Diffstat (limited to 'contrib')
-rw-r--r--  contrib/llvm/LICENSE.TXT | 2
-rw-r--r--  contrib/llvm/include/llvm-c/Comdat.h | 75
-rw-r--r--  contrib/llvm/include/llvm-c/Core.h | 369
-rw-r--r--  contrib/llvm/include/llvm-c/DataTypes.h | 90
-rw-r--r--  contrib/llvm/include/llvm-c/DebugInfo.h | 924
-rw-r--r--  contrib/llvm/include/llvm-c/Disassembler.h | 147
-rw-r--r--  contrib/llvm/include/llvm-c/DisassemblerTypes.h | 160
-rw-r--r--  contrib/llvm/include/llvm-c/ExecutionEngine.h | 7
-rw-r--r--  contrib/llvm/include/llvm-c/Initialization.h | 1
-rw-r--r--  contrib/llvm/include/llvm-c/OrcBindings.h | 59
-rw-r--r--  contrib/llvm/include/llvm-c/Support.h | 2
-rw-r--r--  contrib/llvm/include/llvm-c/TargetMachine.h | 12
-rw-r--r--  contrib/llvm/include/llvm-c/Transforms/InstCombine.h | 43
-rw-r--r--  contrib/llvm/include/llvm-c/Transforms/Scalar.h | 12
-rw-r--r--  contrib/llvm/include/llvm-c/Transforms/Utils.h | 50
-rw-r--r--  contrib/llvm/include/llvm-c/Transforms/Vectorize.h | 3
-rw-r--r--  contrib/llvm/include/llvm-c/Types.h | 19
-rw-r--r--  contrib/llvm/include/llvm-c/lto.h | 36
-rw-r--r--  contrib/llvm/include/llvm/ADT/APFloat.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/APInt.h | 352
-rw-r--r--  contrib/llvm/include/llvm/ADT/APSInt.h | 6
-rw-r--r--  contrib/llvm/include/llvm/ADT/Any.h | 150
-rw-r--r--  contrib/llvm/include/llvm/ADT/ArrayRef.h | 30
-rw-r--r--  contrib/llvm/include/llvm/ADT/BitVector.h | 11
-rw-r--r--  contrib/llvm/include/llvm/ADT/CachedHashString.h | 1
-rw-r--r--  contrib/llvm/include/llvm/ADT/DenseMapInfo.h | 7
-rw-r--r--  contrib/llvm/include/llvm/ADT/DepthFirstIterator.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/EpochTracker.h | 13
-rw-r--r--  contrib/llvm/include/llvm/ADT/FunctionExtras.h | 293
-rw-r--r--  contrib/llvm/include/llvm/ADT/GraphTraits.h | 20
-rw-r--r--  contrib/llvm/include/llvm/ADT/Hashing.h | 62
-rw-r--r--  contrib/llvm/include/llvm/ADT/ImmutableList.h | 4
-rw-r--r--  contrib/llvm/include/llvm/ADT/ImmutableMap.h | 5
-rw-r--r--  contrib/llvm/include/llvm/ADT/ImmutableSet.h | 4
-rw-r--r--  contrib/llvm/include/llvm/ADT/MapVector.h | 32
-rw-r--r--  contrib/llvm/include/llvm/ADT/None.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/Optional.h | 176
-rw-r--r--  contrib/llvm/include/llvm/ADT/PackedVector.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/PointerUnion.h | 6
-rw-r--r--  contrib/llvm/include/llvm/ADT/SCCIterator.h | 10
-rw-r--r--  contrib/llvm/include/llvm/ADT/STLExtras.h | 215
-rw-r--r--  contrib/llvm/include/llvm/ADT/ScopeExit.h | 16
-rw-r--r--  contrib/llvm/include/llvm/ADT/SetVector.h | 56
-rw-r--r--  contrib/llvm/include/llvm/ADT/SmallPtrSet.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/SmallSet.h | 118
-rw-r--r--  contrib/llvm/include/llvm/ADT/SmallVector.h | 241
-rw-r--r--  contrib/llvm/include/llvm/ADT/SparseMultiSet.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/SparseSet.h | 3
-rw-r--r--  contrib/llvm/include/llvm/ADT/Statistic.h | 59
-rw-r--r--  contrib/llvm/include/llvm/ADT/StringExtras.h | 55
-rw-r--r--  contrib/llvm/include/llvm/ADT/StringMap.h | 24
-rw-r--r--  contrib/llvm/include/llvm/ADT/StringRef.h | 42
-rw-r--r--  contrib/llvm/include/llvm/ADT/StringSwitch.h | 188
-rw-r--r--  contrib/llvm/include/llvm/ADT/TinyPtrVector.h | 6
-rw-r--r--  contrib/llvm/include/llvm/ADT/Triple.h | 30
-rw-r--r--  contrib/llvm/include/llvm/ADT/UniqueVector.h | 8
-rw-r--r--  contrib/llvm/include/llvm/ADT/VariadicFunction.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/edit_distance.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ADT/ilist.h | 25
-rw-r--r--  contrib/llvm/include/llvm/ADT/ilist_node.h | 8
-rw-r--r--  contrib/llvm/include/llvm/ADT/ilist_node_options.h | 1
-rw-r--r--  contrib/llvm/include/llvm/ADT/iterator.h | 10
-rw-r--r--  contrib/llvm/include/llvm/ADT/iterator_range.h | 11
-rw-r--r--  contrib/llvm/include/llvm/Analysis/AliasAnalysis.h | 51
-rw-r--r--  contrib/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Analysis/AliasSetTracker.h | 35
-rw-r--r--  contrib/llvm/include/llvm/Analysis/AssumptionCache.h | 36
-rw-r--r--  contrib/llvm/include/llvm/Analysis/BasicAliasAnalysis.h | 47
-rw-r--r--  contrib/llvm/include/llvm/Analysis/BlockFrequencyInfo.h | 16
-rw-r--r--  contrib/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h | 139
-rw-r--r--  contrib/llvm/include/llvm/Analysis/BranchProbabilityInfo.h | 30
-rw-r--r--  contrib/llvm/include/llvm/Analysis/CFG.h | 73
-rw-r--r--  contrib/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Analysis/CGSCCPassManager.h | 72
-rw-r--r--  contrib/llvm/include/llvm/Analysis/CallGraph.h | 89
-rw-r--r--  contrib/llvm/include/llvm/Analysis/CaptureTracking.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Analysis/CodeMetrics.h | 34
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ConstantFolding.h | 22
-rw-r--r--  contrib/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Analysis/DemandedBits.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Analysis/DependenceAnalysis.h | 15
-rw-r--r--  contrib/llvm/include/llvm/Analysis/DivergenceAnalysis.h | 15
-rw-r--r--  contrib/llvm/include/llvm/Analysis/DominanceFrontier.h | 9
-rw-r--r--  contrib/llvm/include/llvm/Analysis/DominanceFrontierImpl.h | 1
-rw-r--r--  contrib/llvm/include/llvm/Analysis/EHPersonalities.h | 29
-rw-r--r--  contrib/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Analysis/InlineCost.h | 22
-rw-r--r--  contrib/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LazyBlockFrequencyInfo.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LazyValueInfo.h | 9
-rw-r--r--  contrib/llvm/include/llvm/Analysis/Lint.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 186
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LoopAnalysisManager.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LoopInfo.h | 50
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LoopInfoImpl.h | 84
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LoopIterator.h | 19
-rw-r--r--  contrib/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Analysis/MemoryBuiltins.h | 20
-rw-r--r--  contrib/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Analysis/MemoryLocation.h | 20
-rw-r--r--  contrib/llvm/include/llvm/Analysis/MemorySSA.h | 185
-rw-r--r--  contrib/llvm/include/llvm/Analysis/MemorySSAUpdater.h | 89
-rw-r--r--  contrib/llvm/include/llvm/Analysis/MustExecute.h | 64
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ObjCARCAliasAnalysis.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h | 27
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ObjCARCInstKind.h | 22
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ObjectUtils.h | 42
-rw-r--r--  contrib/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h | 16
-rw-r--r--  contrib/llvm/include/llvm/Analysis/OrderedBasicBlock.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Analysis/PHITransAddr.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Analysis/Passes.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Analysis/PhiValues.h | 143
-rw-r--r--  contrib/llvm/include/llvm/Analysis/PostDominators.h | 15
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ProfileSummaryInfo.h | 36
-rw-r--r--  contrib/llvm/include/llvm/Analysis/PtrUseVisitor.h | 44
-rw-r--r--  contrib/llvm/include/llvm/Analysis/RegionInfo.h | 127
-rw-r--r--  contrib/llvm/include/llvm/Analysis/RegionInfoImpl.h | 3
-rw-r--r--  contrib/llvm/include/llvm/Analysis/RegionIterator.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Analysis/RegionPass.h | 14
-rw-r--r--  contrib/llvm/include/llvm/Analysis/RegionPrinter.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ScalarEvolution.h | 133
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h | 3
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Analysis/SparsePropagation.h | 21
-rw-r--r--  contrib/llvm/include/llvm/Analysis/SyntheticCountsUtils.h | 52
-rw-r--r--  contrib/llvm/include/llvm/Analysis/TargetLibraryInfo.def | 63
-rw-r--r--  contrib/llvm/include/llvm/Analysis/TargetTransformInfo.h | 238
-rw-r--r--  contrib/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 27
-rw-r--r--  contrib/llvm/include/llvm/Analysis/TypeMetadataUtils.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Analysis/Utils/Local.h | 91
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ValueLattice.h | 125
-rw-r--r--  contrib/llvm/include/llvm/Analysis/ValueTracking.h | 70
-rw-r--r--  contrib/llvm/include/llvm/Analysis/VectorUtils.h | 28
-rw-r--r--  contrib/llvm/include/llvm/AsmParser/Parser.h | 106
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/COFF.h | 19
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/Dwarf.def | 54
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/Dwarf.h | 115
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/DynamicTags.def | 216
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/ELF.h | 355
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def | 14
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/WebAssembly.def | 8
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/MachO.h | 4
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/Magic.h | 7
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/Wasm.h | 154
-rw-r--r--  contrib/llvm/include/llvm/BinaryFormat/WasmRelocs.def | 2
-rw-r--r--  contrib/llvm/include/llvm/Bitcode/BitcodeWriter.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Bitcode/BitcodeWriterPass.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Bitcode/BitstreamReader.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Bitcode/BitstreamWriter.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Bitcode/LLVMBitCodes.h | 18
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/AccelTable.h | 434
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/Analysis.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/AsmPrinter.h | 70
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/AtomicExpandUtils.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h | 107
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/CalcSpillWeights.h | 12
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/CallingConvLower.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/CommandFlags.inc (renamed from contrib/llvm/include/llvm/CodeGen/CommandFlags.def) | 34
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/CostTable.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/DIE.h | 8
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ExecutionDepsFix.h | 230
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ExecutionDomainFix.h | 213
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/FastISel.h | 156
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h | 13
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GCStrategy.h | 6
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h | 6
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h | 43
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 44
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h | 48
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h | 134
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h | 133
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h | 122
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h | 280
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h | 45
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h | 14
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h | 821
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h | 338
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h | 545
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 25
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ISDOpcodes.h | 18
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LatencyPriorityQueue.h | 9
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h | 8
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LiveInterval.h | 12
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LiveIntervalUnion.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LiveIntervals.h | 6
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LivePhysRegs.h | 17
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LiveRangeEdit.h | 16
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LiveRegMatrix.h | 7
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LiveRegUnits.h | 33
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/LoopTraversal.h | 116
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MIRPrinter.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MIRYamlMapping.h | 24
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineBasicBlock.h | 50
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineConstantPool.h | 6
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineDominators.h | 16
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineFrameInfo.h | 38
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineFunction.h | 44
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineInstr.h | 99
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineInstrBuilder.h | 25
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineLoopInfo.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineMemOperand.h | 5
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineOperand.h | 98
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h | 8
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineOutliner.h | 226
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 81
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineSSAUpdater.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MachineScheduler.h | 37
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/MacroFusion.h | 6
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/PBQP/Graph.h | 62
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/PBQP/Math.h | 58
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/PBQP/Solution.h | 8
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/PBQPRAConstraint.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ParallelCG.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/Passes.h | 33
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h | 118
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/RegAllocPBQP.h | 18
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/RegisterPressure.h | 8
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/RegisterScavenging.h | 6
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/RegisterUsageInfo.h | 22
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h | 4
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ScheduleDAG.h | 50
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 8
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ScheduleDFS.h | 34
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/SelectionDAG.h | 169
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/SelectionDAGISel.h | 24
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 163
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/SlotIndexes.h | 8
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/StackMaps.h | 24
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/StackProtector.h | 30
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetCallingConv.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetFrameLowering.h | 12
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h | 141
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetLowering.h | 241
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h | 29
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetOpcodes.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h | 23
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetRegisterInfo.h | 33
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetSchedule.h | 58
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 33
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ValueTypes.h | 2
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/ValueTypes.td | 7
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/VirtRegMap.h | 28
-rw-r--r--  contrib/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h | 80
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h | 24
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/CodeView.h | 6
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def | 448
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewTypes.def | 4
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h | 16
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h | 17
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h | 11
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h | 31
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h | 6
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DIContext.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h | 524
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h | 68
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h | 8
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h | 44
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h | 7
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h | 1
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h | 278
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h | 118
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h | 3
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h | 45
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h | 60
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h | 38
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h | 14
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 63
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h | 278
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h | 4
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h | 20
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h | 241
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 58
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h | 9
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h | 50
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h | 40
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h | 40
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAInjectedSource.h | 38
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h | 22
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASectionContrib.h | 55
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASession.h | 26
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASupport.h | 10
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAUtils.h | 31
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h | 42
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h | 23
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSectionContrib.h | 50
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSession.h | 27
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h | 1
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h | 13
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h | 5
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h | 308
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h | 28
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h | 6
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h | 37
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h | 22
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h | 25
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h | 34
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h | 13
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawConstants.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawTypes.h | 44
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h | 4
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h | 1
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h | 7
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h | 6
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h | 2
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h | 4
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h | 3
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h | 48
-rw-r--r--  contrib/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Demangle/Demangle.h | 74
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h | 31
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/JITEventListener.h | 13
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/JITSymbol.h | 131
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h | 346
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h | 86
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/Core.h | 779
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h | 160
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h | 20
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h | 90
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h | 71
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h | 166
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h | 143
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/Layer.h | 129
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h | 97
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h | 211
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h | 11
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h | 71
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h | 36
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h | 16
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h | 125
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h | 83
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/RPCUtils.h | 12
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h | 341
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h | 59
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h | 63
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/RTDyldMemoryManager.h | 8
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h | 13
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h | 14
-rw-r--r--  contrib/llvm/include/llvm/ExecutionEngine/SectionMemoryManager.h | 10
-rw-r--r--  contrib/llvm/include/llvm/FuzzMutate/FuzzerCLI.h | 6
-rw-r--r--  contrib/llvm/include/llvm/FuzzMutate/OpDescriptor.h | 6
-rw-r--r--  contrib/llvm/include/llvm/IR/Attributes.h | 240
-rw-r--r--  contrib/llvm/include/llvm/IR/Attributes.td | 11
-rw-r--r--  contrib/llvm/include/llvm/IR/AutoUpgrade.h | 8
-rw-r--r--  contrib/llvm/include/llvm/IR/BasicBlock.h | 119
-rw-r--r--  contrib/llvm/include/llvm/IR/CFG.h | 6
-rw-r--r--  contrib/llvm/include/llvm/IR/CallSite.h | 3
-rw-r--r--  contrib/llvm/include/llvm/IR/CallingConv.h | 12
-rw-r--r--  contrib/llvm/include/llvm/IR/Comdat.h | 6
-rw-r--r--  contrib/llvm/include/llvm/IR/Constant.h | 24
-rw-r--r--  contrib/llvm/include/llvm/IR/ConstantRange.h | 2
-rw-r--r--  contrib/llvm/include/llvm/IR/Constants.h | 108
-rw-r--r--  contrib/llvm/include/llvm/IR/DIBuilder.h | 113
-rw-r--r--  contrib/llvm/include/llvm/IR/DataLayout.h | 92
-rw-r--r--  contrib/llvm/include/llvm/IR/DebugInfo.h | 27
-rw-r--r--  contrib/llvm/include/llvm/IR/DebugInfoFlags.def | 7
-rw-r--r--  contrib/llvm/include/llvm/IR/DebugInfoMetadata.h | 430
-rw-r--r--  contrib/llvm/include/llvm/IR/DebugLoc.h | 22
-rw-r--r--  contrib/llvm/include/llvm/IR/DerivedTypes.h | 12
-rw-r--r--  contrib/llvm/include/llvm/IR/DiagnosticHandler.h | 2
-rw-r--r--  contrib/llvm/include/llvm/IR/DiagnosticInfo.h | 29
-rw-r--r--  contrib/llvm/include/llvm/IR/DiagnosticPrinter.h | 4
-rw-r--r--  contrib/llvm/include/llvm/IR/DomTreeUpdater.h | 259
-rw-r--r--  contrib/llvm/include/llvm/IR/Dominators.h | 126
-rw-r--r--  contrib/llvm/include/llvm/IR/Function.h | 190
-rw-r--r--  contrib/llvm/include/llvm/IR/GlobalObject.h | 12
-rw-r--r--  contrib/llvm/include/llvm/IR/GlobalValue.h | 34
-rw-r--r--  contrib/llvm/include/llvm/IR/GlobalVariable.h | 13
-rw-r--r--  contrib/llvm/include/llvm/IR/IRBuilder.h | 805
-rw-r--r--  contrib/llvm/include/llvm/IR/IRPrintingPasses.h | 14
-rw-r--r--  contrib/llvm/include/llvm/IR/InstVisitor.h | 4
-rw-r--r--  contrib/llvm/include/llvm/IR/InstrTypes.h | 270
-rw-r--r--  contrib/llvm/include/llvm/IR/Instruction.h | 25
-rw-r--r--  contrib/llvm/include/llvm/IR/Instructions.h | 1290
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicInst.h | 244
-rw-r--r--  contrib/llvm/include/llvm/IR/Intrinsics.h | 4
-rw-r--r--  contrib/llvm/include/llvm/IR/Intrinsics.td | 57
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsAArch64.td | 26
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 775
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsARM.td | 58
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsHexagon.td | 129
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsNVVM.td | 189
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td | 23
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 51
-rw-r--r--  contrib/llvm/include/llvm/IR/IntrinsicsX86.td | 2295
-rw-r--r--  contrib/llvm/include/llvm/IR/LLVMContext.h | 40
-rw-r--r--  contrib/llvm/include/llvm/IR/LegacyPassManagers.h | 9
-rw-r--r--  contrib/llvm/include/llvm/IR/MDBuilder.h | 55
-rw-r--r--  contrib/llvm/include/llvm/IR/Mangler.h | 3
-rw-r--r--  contrib/llvm/include/llvm/IR/Metadata.def | 1
-rw-r--r--  contrib/llvm/include/llvm/IR/Metadata.h | 146
-rw-r--r--  contrib/llvm/include/llvm/IR/Module.h | 49
-rw-r--r--  contrib/llvm/include/llvm/IR/ModuleSummaryIndex.h | 453
-rw-r--r--  contrib/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h | 27
-rw-r--r--  contrib/llvm/include/llvm/IR/Operator.h | 31
-rw-r--r--  contrib/llvm/include/llvm/IR/OptBisect.h | 48
-rw-r--r--  contrib/llvm/include/llvm/IR/PassManager.h | 106
-rw-r--r--  contrib/llvm/include/llvm/IR/PassManagerInternal.h | 38
-rw-r--r--  contrib/llvm/include/llvm/IR/PatternMatch.h | 758
-rw-r--r--  contrib/llvm/include/llvm/IR/ProfileSummary.h | 6
-rw-r--r--  contrib/llvm/include/llvm/IR/RuntimeLibcalls.def (renamed from contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.def) | 32
-rw-r--r--  contrib/llvm/include/llvm/IR/Statepoint.h | 4
-rw-r--r--  contrib/llvm/include/llvm/IR/TrackingMDRef.h | 8
-rw-r--r--  contrib/llvm/include/llvm/IR/Type.h | 19
-rw-r--r--  contrib/llvm/include/llvm/IR/Use.h | 14
-rw-r--r--  contrib/llvm/include/llvm/IR/UseListOrder.h | 2
-rw-r--r--  contrib/llvm/include/llvm/IR/User.h | 40
-rw-r--r--  contrib/llvm/include/llvm/IR/Value.h | 101
-rw-r--r--  contrib/llvm/include/llvm/IR/ValueHandle.h | 28
-rw-r--r--  contrib/llvm/include/llvm/IR/ValueMap.h | 4
-rw-r--r--  contrib/llvm/include/llvm/IR/ValueSymbolTable.h | 24
-rw-r--r--  contrib/llvm/include/llvm/IR/Verifier.h | 6
-rw-r--r--  contrib/llvm/include/llvm/IRReader/IRReader.h | 9
-rw-r--r--  contrib/llvm/include/llvm/InitializePasses.h | 43
-rw-r--r--  contrib/llvm/include/llvm/LTO/Caching.h | 7
-rw-r--r--  contrib/llvm/include/llvm/LTO/Config.h | 11
-rw-r--r--  contrib/llvm/include/llvm/LTO/LTO.h | 17
-rw-r--r--  contrib/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h | 27
-rw-r--r--  contrib/llvm/include/llvm/LinkAllPasses.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Linker/Linker.h | 2
-rw-r--r--  contrib/llvm/include/llvm/MC/MCAsmBackend.h | 43
-rw-r--r--  contrib/llvm/include/llvm/MC/MCAsmInfo.h | 20
-rw-r--r--  contrib/llvm/include/llvm/MC/MCAsmLayout.h | 20
-rw-r--r--  contrib/llvm/include/llvm/MC/MCAsmMacro.h | 122
-rw-r--r--  contrib/llvm/include/llvm/MC/MCAssembler.h | 57
-rw-r--r--  contrib/llvm/include/llvm/MC/MCCodePadder.h | 3
-rw-r--r--  contrib/llvm/include/llvm/MC/MCCodeView.h | 18
-rw-r--r--  contrib/llvm/include/llvm/MC/MCContext.h | 60
-rw-r--r--  contrib/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h | 2
-rw-r--r--  contrib/llvm/include/llvm/MC/MCDisassembler/MCRelocationInfo.h | 4
-rw-r--r--  contrib/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h | 8
-rw-r--r--  contrib/llvm/include/llvm/MC/MCDwarf.h | 188
-rw-r--r--  contrib/llvm/include/llvm/MC/MCELFObjectWriter.h | 15
-rw-r--r--  contrib/llvm/include/llvm/MC/MCELFStreamer.h | 15
-rw-r--r--  contrib/llvm/include/llvm/MC/MCExpr.h | 41
-rw-r--r--  contrib/llvm/include/llvm/MC/MCFixup.h | 60
-rw-r--r--  contrib/llvm/include/llvm/MC/MCFixupKindInfo.h | 2
-rw-r--r--  contrib/llvm/include/llvm/MC/MCFragment.h | 122
-rw-r--r--  contrib/llvm/include/llvm/MC/MCInst.h | 12
-rw-r--r--  contrib/llvm/include/llvm/MC/MCInstBuilder.h | 14
-rw-r--r--  contrib/llvm/include/llvm/MC/MCInstPrinter.h | 15
-rw-r--r--  contrib/llvm/include/llvm/MC/MCInstrAnalysis.h | 29
-rw-r--r--  contrib/llvm/include/llvm/MC/MCInstrDesc.h | 145
-rw-r--r--  contrib/llvm/include/llvm/MC/MCInstrInfo.h | 8
-rw-r--r--  contrib/llvm/include/llvm/MC/MCInstrItineraries.h | 38
-rw-r--r--  contrib/llvm/include/llvm/MC/MCLabel.h | 12
-rw-r--r--  contrib/llvm/include/llvm/MC/MCMachObjectWriter.h | 17
-rw-r--r--  contrib/llvm/include/llvm/MC/MCObjectFileInfo.h | 36
-rw-r--r--  contrib/llvm/include/llvm/MC/MCObjectStreamer.h | 27
-rw-r--r--  contrib/llvm/include/llvm/MC/MCObjectWriter.h | 116
-rw-r--r--  contrib/llvm/include/llvm/MC/MCParser/MCAsmLexer.h | 111
-rw-r--r--  contrib/llvm/include/llvm/MC/MCParser/MCAsmParser.h | 50
-rw-r--r--  contrib/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h | 4
-rw-r--r--  contrib/llvm/include/llvm/MC/MCParser/MCAsmParserUtils.h | 2
-rw-r--r--  contrib/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h | 63
-rw-r--r--  contrib/llvm/include/llvm/MC/MCRegisterInfo.h | 60
-rw-r--r--  contrib/llvm/include/llvm/MC/MCSchedule.h | 188
-rw-r--r--  contrib/llvm/include/llvm/MC/MCSection.h | 8
-rw-r--r--  contrib/llvm/include/llvm/MC/MCSectionWasm.h | 14
-rw-r--r--  contrib/llvm/include/llvm/MC/MCStreamer.h | 231
-rw-r--r--  contrib/llvm/include/llvm/MC/MCSubtargetInfo.h | 52
-rw-r--r--  contrib/llvm/include/llvm/MC/MCSymbol.h | 45
-rw-r--r--  contrib/llvm/include/llvm/MC/MCSymbolMachO.h | 4
-rw-r--r--  contrib/llvm/include/llvm/MC/MCSymbolWasm.h | 28
-rw-r--r--  contrib/llvm/include/llvm/MC/MCTargetOptions.h | 1
-rw-r--r--  contrib/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.inc (renamed from contrib/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.def) | 0
-rw-r--r--  contrib/llvm/include/llvm/MC/MCValue.h | 8
-rw-r--r--  contrib/llvm/include/llvm/MC/MCWasmObjectWriter.h | 15
-rw-r--r--  contrib/llvm/include/llvm/MC/MCWasmStreamer.h | 12
-rw-r--r--  contrib/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h | 11
-rw-r--r--  contrib/llvm/include/llvm/MC/MCWinCOFFStreamer.h | 7
-rw-r--r--  contrib/llvm/include/llvm/MC/StringTableBuilder.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Object/Archive.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Object/Binary.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Object/COFF.h | 21
-rw-r--r--  contrib/llvm/include/llvm/Object/COFFImportFile.h | 7
-rw-r--r--  contrib/llvm/include/llvm/Object/CVDebugRecord.h (renamed from contrib/llvm/include/llvm/DebugInfo/CodeView/CVDebugRecord.h) | 4
-rw-r--r--  contrib/llvm/include/llvm/Object/Decompressor.h | 16
-rw-r--r--  contrib/llvm/include/llvm/Object/ELF.h | 121
-rw-r--r--  contrib/llvm/include/llvm/Object/ELFObjectFile.h | 109
-rw-r--r--  contrib/llvm/include/llvm/Object/ELFTypes.h | 185
-rw-r--r--  contrib/llvm/include/llvm/Object/IRObjectFile.h | 15
-rw-r--r--  contrib/llvm/include/llvm/Object/MachO.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Object/MachOUniversal.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Object/ModuleSymbolTable.h | 9
-rw-r--r--  contrib/llvm/include/llvm/Object/ObjectFile.h | 27
-rw-r--r--  contrib/llvm/include/llvm/Object/RelocVisitor.h | 26
-rw-r--r--  contrib/llvm/include/llvm/Object/Wasm.h | 137
-rw-r--r--  contrib/llvm/include/llvm/Object/WasmTraits.h | 63
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/COFFYAML.h | 1
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h | 6
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h | 6
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/DWARFEmitter.h | 7
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/DWARFYAML.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/ELFYAML.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/MachOYAML.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/WasmYAML.h | 53
-rw-r--r--  contrib/llvm/include/llvm/ObjectYAML/YAML.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Option/Arg.h | 28
-rw-r--r--  contrib/llvm/include/llvm/Option/ArgList.h | 7
-rw-r--r--  contrib/llvm/include/llvm/Option/OptTable.h | 46
-rw-r--r--  contrib/llvm/include/llvm/Option/Option.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Pass.h | 6
-rw-r--r--  contrib/llvm/include/llvm/PassAnalysisSupport.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Passes/PassBuilder.h | 58
-rw-r--r--  contrib/llvm/include/llvm/Passes/PassPlugin.h | 114
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h | 3
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h | 14
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h | 8
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/GCOV.h | 5
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/InstrProf.h | 62
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/InstrProfData.inc | 22
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/InstrProfReader.h | 4
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/ProfileCommon.h | 2
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/SampleProf.h | 44
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/SampleProfReader.h | 140
-rw-r--r--  contrib/llvm/include/llvm/ProfileData/SampleProfWriter.h | 46
-rw-r--r--  contrib/llvm/include/llvm/Support/AArch64TargetParser.def | 11
-rw-r--r--  contrib/llvm/include/llvm/Support/AMDGPUKernelDescriptor.h | 139
-rw-r--r--  contrib/llvm/include/llvm/Support/AMDGPUMetadata.h | 238
-rw-r--r--  contrib/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h | 185
-rw-r--r--  contrib/llvm/include/llvm/Support/ARMTargetParser.def | 8
-rw-r--r--  contrib/llvm/include/llvm/Support/AlignOf.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Support/Allocator.h | 46
-rw-r--r--  contrib/llvm/include/llvm/Support/AtomicOrdering.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/BinaryByteStream.h | 24
-rw-r--r--  contrib/llvm/include/llvm/Support/BinaryStream.h | 18
-rw-r--r--  contrib/llvm/include/llvm/Support/BinaryStreamArray.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/BinaryStreamReader.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/BinaryStreamRef.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Support/BinaryStreamWriter.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Support/BlockFrequency.h | 14
-rw-r--r--  contrib/llvm/include/llvm/Support/BranchProbability.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Support/CachePruning.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Support/Casting.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/CheckedArithmetic.h | 104
-rw-r--r--  contrib/llvm/include/llvm/Support/CodeGenCoverage.h | 7
-rw-r--r--  contrib/llvm/include/llvm/Support/CommandLine.h | 28
-rw-r--r--  contrib/llvm/include/llvm/Support/Compiler.h | 79
-rw-r--r--  contrib/llvm/include/llvm/Support/ConvertUTF.h | 16
-rw-r--r--  contrib/llvm/include/llvm/Support/CrashRecoveryContext.h | 92
-rw-r--r--  contrib/llvm/include/llvm/Support/DJB.h | 33
-rw-r--r--  contrib/llvm/include/llvm/Support/DataExtractor.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Support/DataTypes.h | 17
-rw-r--r--  contrib/llvm/include/llvm/Support/Debug.h | 23
-rw-r--r--  contrib/llvm/include/llvm/Support/DebugCounter.h | 56
-rw-r--r--  contrib/llvm/include/llvm/Support/DynamicLibrary.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Support/Endian.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/EndianStream.h | 54
-rw-r--r--  contrib/llvm/include/llvm/Support/Errc.h | 1
-rw-r--r--  contrib/llvm/include/llvm/Support/Errno.h | 5
-rw-r--r--  contrib/llvm/include/llvm/Support/Error.h | 61
-rw-r--r--  contrib/llvm/include/llvm/Support/ErrorHandling.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Support/ErrorOr.h | 18
-rw-r--r--  contrib/llvm/include/llvm/Support/FileOutputBuffer.h | 16
-rw-r--r--  contrib/llvm/include/llvm/Support/FileSystem.h | 422
-rw-r--r--  contrib/llvm/include/llvm/Support/FormatAdapters.h | 18
-rw-r--r--  contrib/llvm/include/llvm/Support/FormatVariadic.h | 13
-rw-r--r--  contrib/llvm/include/llvm/Support/FormatVariadicDetails.h | 57
-rw-r--r--  contrib/llvm/include/llvm/Support/GenericDomTree.h | 47
-rw-r--r--  contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h | 346
-rw-r--r--  contrib/llvm/include/llvm/Support/GraphWriter.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/Host.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Support/InitLLVM.h | 46
-rw-r--r--  contrib/llvm/include/llvm/Support/JSON.h | 704
-rw-r--r--  contrib/llvm/include/llvm/Support/JamCRC.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/KnownBits.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/LEB128.h | 17
-rw-r--r--  contrib/llvm/include/llvm/Support/LineIterator.h | 18
-rw-r--r--  contrib/llvm/include/llvm/Support/LockFileManager.h | 31
-rw-r--r--  contrib/llvm/include/llvm/Support/LowLevelTypeImpl.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/MD5.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Support/MachineValueType.h (renamed from contrib/llvm/include/llvm/CodeGen/MachineValueType.h) | 13
-rw-r--r--  contrib/llvm/include/llvm/Support/MathExtras.h | 58
-rw-r--r--  contrib/llvm/include/llvm/Support/MemAlloc.h | 49
-rw-r--r--  contrib/llvm/include/llvm/Support/Memory.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Support/MemoryBuffer.h | 79
-rw-r--r--  contrib/llvm/include/llvm/Support/MipsABIFlags.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Support/Mutex.h | 13
-rw-r--r--  contrib/llvm/include/llvm/Support/MutexGuard.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/OnDiskHashTable.h | 47
-rw-r--r--  contrib/llvm/include/llvm/Support/Options.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Support/Parallel.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Support/Path.h | 83
-rw-r--r--  contrib/llvm/include/llvm/Support/PointerLikeTypeTraits.h | 34
-rw-r--r--  contrib/llvm/include/llvm/Support/Process.h | 17
-rw-r--r--  contrib/llvm/include/llvm/Support/Program.h | 64
-rw-r--r--  contrib/llvm/include/llvm/Support/RWMutex.h | 14
-rw-r--r--  contrib/llvm/include/llvm/Support/Regex.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Support/SMLoc.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Support/SaveAndRestore.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Support/ScaledNumber.h | 82
-rw-r--r--  contrib/llvm/include/llvm/Support/ScopedPrinter.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Support/Signals.h | 20
-rw-r--r--  contrib/llvm/include/llvm/Support/SmallVectorMemoryBuffer.h (renamed from contrib/llvm/include/llvm/ExecutionEngine/ObjectMemoryBuffer.h) | 23
-rw-r--r--  contrib/llvm/include/llvm/Support/SourceMgr.h | 37
-rw-r--r--  contrib/llvm/include/llvm/Support/StringSaver.h | 28
-rw-r--r--  contrib/llvm/include/llvm/Support/SystemUtils.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/TargetOpcodes.def (renamed from contrib/llvm/include/llvm/CodeGen/TargetOpcodes.def) | 31
-rw-r--r--  contrib/llvm/include/llvm/Support/TargetParser.h | 15
-rw-r--r--  contrib/llvm/include/llvm/Support/TargetRegistry.h | 93
-rw-r--r--  contrib/llvm/include/llvm/Support/TaskQueue.h | 139
-rw-r--r--  contrib/llvm/include/llvm/Support/ThreadLocal.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/ThreadPool.h | 1
-rw-r--r--  contrib/llvm/include/llvm/Support/Threading.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Support/Timer.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Support/ToolOutputFile.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Support/TrailingObjects.h | 16
-rw-r--r--  contrib/llvm/include/llvm/Support/Unicode.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Support/UnicodeCharRanges.h | 26
-rw-r--r--  contrib/llvm/include/llvm/Support/UniqueLock.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/VersionTuple.h | 154
-rw-r--r--  contrib/llvm/include/llvm/Support/Win64EH.h | 14
-rw-r--r--  contrib/llvm/include/llvm/Support/WithColor.h | 67
-rw-r--r--  contrib/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h (renamed from contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h) | 13
-rw-r--r--  contrib/llvm/include/llvm/Support/X86TargetParser.def | 2
-rw-r--r--  contrib/llvm/include/llvm/Support/YAMLParser.h | 80
-rw-r--r--  contrib/llvm/include/llvm/Support/YAMLTraits.h | 18
-rw-r--r--  contrib/llvm/include/llvm/Support/raw_ostream.h | 23
-rw-r--r--  contrib/llvm/include/llvm/Support/type_traits.h | 47
-rw-r--r--  contrib/llvm/include/llvm/TableGen/Record.h | 632
-rw-r--r--  contrib/llvm/include/llvm/TableGen/SearchableTable.td | 131
-rw-r--r--  contrib/llvm/include/llvm/Target/CodeGenCWrappers.h (renamed from contrib/llvm/include/llvm/Support/CodeGenCWrappers.h) | 9
-rw-r--r--  contrib/llvm/include/llvm/Target/GenericOpcodes.td | 27
-rw-r--r--  contrib/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td | 21
-rw-r--r--  contrib/llvm/include/llvm/Target/GlobalISel/Target.td | 13
-rw-r--r--  contrib/llvm/include/llvm/Target/Target.td | 51
-rw-r--r--  contrib/llvm/include/llvm/Target/TargetInstrPredicate.td | 197
-rw-r--r--  contrib/llvm/include/llvm/Target/TargetItinerary.td | 6
-rw-r--r--  contrib/llvm/include/llvm/Target/TargetLoweringObjectFile.h (renamed from contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFile.h) | 16
-rw-r--r--  contrib/llvm/include/llvm/Target/TargetMachine.h | 50
-rw-r--r--  contrib/llvm/include/llvm/Target/TargetOptions.h | 28
-rw-r--r--  contrib/llvm/include/llvm/Target/TargetSchedule.td | 138
-rw-r--r--  contrib/llvm/include/llvm/Target/TargetSelectionDAG.td | 42
-rw-r--r--  contrib/llvm/include/llvm/Testing/Support/Error.h | 64
-rw-r--r--  contrib/llvm/include/llvm/Testing/Support/SupportHelpers.h | 20
-rw-r--r--  contrib/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h | 41
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO.h | 23
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/FunctionImport.h | 38
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/Inliner.h | 5
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/SampleProfile.h (renamed from contrib/llvm/include/llvm/Transforms/SampleProfile.h) | 6
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/SyntheticCountsPropagation.h | 19
-rw-r--r--  contrib/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Transforms/InstCombine/InstCombine.h | 19
-rw-r--r--  contrib/llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h | 5
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Instrumentation.h | 7
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h | 31
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h (renamed from contrib/llvm/include/llvm/Transforms/GCOVProfiler.h) | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h (renamed from contrib/llvm/include/llvm/Transforms/InstrProfiling.h) | 5
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h (renamed from contrib/llvm/include/llvm/Transforms/PGOInstrumentation.h) | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar.h | 141
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/GVN.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/GVNExpression.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/InductiveRangeCheckElimination.h | 31
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/InstSimplifyPass.h | 46
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/JumpThreading.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h | 35
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/NewGVN.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h | 13
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/SCCP.h | 5
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/SROA.h | 16
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h | 31
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils.h | 118
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h | 6
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h | 48
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/Cloning.h | 13
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/CodeExtractor.h | 26
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/Evaluator.h | 13
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/IntegerDivision.h | 12
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/Local.h | 134
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h | 40
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/LoopSimplify.h | 2
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/LoopUtils.h | 65
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/LoopVersioning.h | 44
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/ModuleUtils.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/OrderedInstructions.h | 11
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/PredicateInfo.h | 14
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h | 4
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/SSAUpdater.h | 30
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h | 92
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h | 10
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/SimplifyInstructions.h | 31
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h | 19
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Utils/UnrollLoop.h | 45
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Vectorize.h | 54
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h | 482
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h | 8
-rw-r--r--  contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h | 18
-rw-r--r--  contrib/llvm/include/llvm/XRay/XRayRecord.h | 3
-rw-r--r--  contrib/llvm/include/llvm/XRay/YAMLXRayRecord.h | 2
-rw-r--r--  contrib/llvm/include/llvm/module.modulemap | 24
-rw-r--r--  contrib/llvm/lib/Analysis/AliasAnalysis.cpp | 174
-rw-r--r--  contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp | 77
-rw-r--r--  contrib/llvm/lib/Analysis/AliasAnalysisSummary.h | 2
-rw-r--r--  contrib/llvm/lib/Analysis/AliasSetTracker.cpp | 42
-rw-r--r--  contrib/llvm/lib/Analysis/Analysis.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp | 145
-rw-r--r--  contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp | 107
-rw-r--r--  contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp | 213
-rw-r--r--  contrib/llvm/lib/Analysis/CFGPrinter.cpp | 11
-rw-r--r--  contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp | 19
-rw-r--r--  contrib/llvm/lib/Analysis/CFLGraph.h | 67
-rw-r--r--  contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp | 5
-rw-r--r--  contrib/llvm/lib/Analysis/CGSCCPassManager.cpp | 43
-rw-r--r--  contrib/llvm/lib/Analysis/CallGraph.cpp | 5
-rw-r--r--  contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp | 62
-rw-r--r--  contrib/llvm/lib/Analysis/CaptureTracking.cpp | 55
-rw-r--r--  contrib/llvm/lib/Analysis/CodeMetrics.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/ConstantFolding.cpp | 149
-rw-r--r--  contrib/llvm/lib/Analysis/Delinearization.cpp | 12
-rw-r--r--  contrib/llvm/lib/Analysis/DemandedBits.cpp | 8
-rw-r--r--  contrib/llvm/lib/Analysis/DependenceAnalysis.cpp | 732
-rw-r--r--  contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp | 21
-rw-r--r--  contrib/llvm/lib/Analysis/DominanceFrontier.cpp | 1
-rw-r--r--  contrib/llvm/lib/Analysis/EHPersonalities.cpp | 30
-rw-r--r--  contrib/llvm/lib/Analysis/GlobalsModRef.cpp | 6
-rw-r--r--  contrib/llvm/lib/Analysis/IVUsers.cpp | 18
-rw-r--r--  contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp | 10
-rw-r--r--  contrib/llvm/lib/Analysis/InlineCost.cpp | 132
-rw-r--r--  contrib/llvm/lib/Analysis/InstructionSimplify.cpp | 453
-rw-r--r--  contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp | 16
-rw-r--r--  contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp | 5
-rw-r--r--  contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp | 5
-rw-r--r--  contrib/llvm/lib/Analysis/LazyCallGraph.cpp | 39
-rw-r--r--  contrib/llvm/lib/Analysis/LazyValueInfo.cpp | 121
-rw-r--r--  contrib/llvm/lib/Analysis/Lint.cpp | 14
-rw-r--r--  contrib/llvm/lib/Analysis/Loads.cpp | 12
-rw-r--r--  contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp | 345
-rw-r--r--  contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/LoopInfo.cpp | 64
-rw-r--r--  contrib/llvm/lib/Analysis/LoopPass.cpp | 33
-rw-r--r--  contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/MemoryBuiltins.cpp | 77
-rw-r--r--  contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 67
-rw-r--r--  contrib/llvm/lib/Analysis/MemoryLocation.cpp | 24
-rw-r--r--  contrib/llvm/lib/Analysis/MemorySSA.cpp | 349
-rw-r--r--  contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp | 229
-rw-r--r--  contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 62
-rw-r--r--  contrib/llvm/lib/Analysis/MustExecute.cpp | 269
-rw-r--r--  contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp | 19
-rw-r--r--  contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp | 4
-rw-r--r--  contrib/llvm/lib/Analysis/PHITransAddr.cpp | 1
-rw-r--r--  contrib/llvm/lib/Analysis/PhiValues.cpp | 196
-rw-r--r--  contrib/llvm/lib/Analysis/PostDominators.cpp | 16
-rw-r--r--  contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp | 22
-rw-r--r--  contrib/llvm/lib/Analysis/RegionInfo.cpp | 3
-rw-r--r--  contrib/llvm/lib/Analysis/RegionPass.cpp | 15
-rw-r--r--  contrib/llvm/lib/Analysis/ScalarEvolution.cpp | 1409
-rw-r--r--  contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp | 61
-rw-r--r--  contrib/llvm/lib/Analysis/StratifiedSets.h | 20
-rw-r--r--  contrib/llvm/lib/Analysis/SyntheticCountsUtils.cpp | 113
-rw-r--r--  contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp | 199
-rw-r--r--  contrib/llvm/lib/Analysis/TargetTransformInfo.cpp | 171
-rw-r--r--  contrib/llvm/lib/Analysis/Trace.cpp | 1
-rw-r--r--  contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 313
-rw-r--r--  contrib/llvm/lib/Analysis/ValueTracking.cpp | 431
-rw-r--r--  contrib/llvm/lib/Analysis/VectorUtils.cpp | 20
-rw-r--r--  contrib/llvm/lib/AsmParser/LLLexer.cpp | 132
-rw-r--r--  contrib/llvm/lib/AsmParser/LLLexer.h | 13
-rw-r--r--  contrib/llvm/lib/AsmParser/LLParser.cpp | 1596
-rw-r--r--  contrib/llvm/lib/AsmParser/LLParser.h | 158
-rw-r--r--  contrib/llvm/lib/AsmParser/LLToken.h | 72
-rw-r--r--  contrib/llvm/lib/AsmParser/Parser.cpp | 114
-rw-r--r--  contrib/llvm/lib/BinaryFormat/Dwarf.cpp | 147
-rw-r--r--  contrib/llvm/lib/BinaryFormat/Magic.cpp | 20
-rw-r--r--  contrib/llvm/lib/BinaryFormat/Wasm.cpp | 34
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 177
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 74
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/ValueList.cpp | 6
-rw-r--r--  contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp | 6
-rw-r--r--  contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 212
-rw-r--r--  contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 8
-rw-r--r--  contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp | 7
-rw-r--r--  contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp | 164
-rw-r--r--  contrib/llvm/lib/CodeGen/AllocationOrder.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/Analysis.cpp | 56
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp | 5
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 721
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 234
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 34
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h | 18
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 3
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h | 24
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 477
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h | 76
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp | 132
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp | 24
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h | 38
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp | 54
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 30
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h | 8
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h | 17
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h | 8
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp | 293
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h | 261
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 189
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 31
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 665
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 121
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 29
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h | 3
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp | 44
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h | 54
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp | 29
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h | 9
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 231
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 40
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp | 152
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp | 10
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp | 45
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h | 54
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchFolding.cpp | 264
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchFolding.h | 10
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchRelaxation.cpp | 184
-rw-r--r--  contrib/llvm/lib/CodeGen/BreakFalseDeps.cpp | 271
-rw-r--r--  contrib/llvm/lib/CodeGen/CFIInstrInserter.cpp | 326
-rw-r--r--  contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/CodeGen.cpp | 13
-rw-r--r--  contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp | 885
-rw-r--r--  contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp | 30
-rw-r--r--  contrib/llvm/lib/CodeGen/DFAPacketizer.cpp | 28
-rw-r--r--  contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp | 35
-rw-r--r--  contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp | 99
-rw-r--r--  contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp | 755
-rw-r--r--  contrib/llvm/lib/CodeGen/ExecutionDomainFix.cpp | 473
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandMemCmp.cpp | 35
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp | 31
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandReductions.cpp | 15
-rw-r--r--  contrib/llvm/lib/CodeGen/FaultMaps.cpp | 22
-rw-r--r--  contrib/llvm/lib/CodeGen/FuncletLayout.cpp | 5
-rw-r--r--  contrib/llvm/lib/CodeGen/GCRootLowering.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 3
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/Combiner.cpp | 81
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 41
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 606
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp | 106
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp | 44
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp | 101
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp | 51
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp | 11
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 643
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 245
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp | 23
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 638
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp | 67
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp | 1
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 41
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp | 99
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalMerge.cpp | 75
-rw-r--r--  contrib/llvm/lib/CodeGen/IfConversion.cpp | 75
-rw-r--r--  contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp | 39
-rw-r--r--  contrib/llvm/lib/CodeGen/InlineSpiller.cpp | 90
-rw-r--r--  contrib/llvm/lib/CodeGen/InterferenceCache.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp | 18
-rw-r--r--  contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp | 1
-rw-r--r--  contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp | 38
-rw-r--r--  contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/LexicalScopes.cpp | 1
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugValues.cpp | 261
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp | 93
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveInterval.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveIntervals.cpp | 143
-rw-r--r--  contrib/llvm/lib/CodeGen/LivePhysRegs.cpp | 15
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp | 21
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeCalc.h | 11
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp | 44
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp | 52
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRegUnits.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveVariables.cpp | 5
-rw-r--r--  contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 42
-rw-r--r--  contrib/llvm/lib/CodeGen/LoopTraversal.cpp | 77
-rw-r--r--  contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp | 407
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp | 65
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MILexer.h | 15
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp | 150
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIParser.h | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 34
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRPrinter.cpp | 220
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp | 231
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp | 247
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCSE.cpp | 47
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCombiner.cpp | 187
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp | 227
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineDominators.cpp | 35
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp | 9
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunction.cpp | 19
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineInstr.cpp | 335
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineLICM.cpp | 272
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp | 1
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineOperand.cpp | 335
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp | 3
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineOutliner.cpp | 600
-rw-r--r--  contrib/llvm/lib/CodeGen/MachinePipeliner.cpp | 689
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp | 3
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp | 84
-rw-r--r--contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp2
-rw-r--r--contrib/llvm/lib/CodeGen/MachineScheduler.cpp452
-rw-r--r--contrib/llvm/lib/CodeGen/MachineSink.cpp376
-rw-r--r--contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp58
-rw-r--r--contrib/llvm/lib/CodeGen/MachineVerifier.cpp179
-rw-r--r--contrib/llvm/lib/CodeGen/MacroFusion.cpp36
-rw-r--r--contrib/llvm/lib/CodeGen/OptimizePHIs.cpp2
-rw-r--r--contrib/llvm/lib/CodeGen/PHIElimination.cpp31
-rw-r--r--contrib/llvm/lib/CodeGen/ParallelCG.cpp6
-rw-r--r--contrib/llvm/lib/CodeGen/PatchableFunction.cpp1
-rw-r--r--contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp81
-rw-r--r--contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp45
-rw-r--r--contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp18
-rw-r--r--contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp242
-rw-r--r--contrib/llvm/lib/CodeGen/ReachingDefAnalysis.cpp195
-rw-r--r--contrib/llvm/lib/CodeGen/RegAllocBase.cpp12
-rw-r--r--contrib/llvm/lib/CodeGen/RegAllocBasic.cpp13
-rw-r--r--contrib/llvm/lib/CodeGen/RegAllocFast.cpp110
-rw-r--r--contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp336
-rw-r--r--contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp75
-rw-r--r--contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp124
-rw-r--r--contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp95
-rw-r--r--contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp11
-rw-r--r--contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp424
-rw-r--r--contrib/llvm/lib/CodeGen/RegisterPressure.cpp11
-rw-r--r--contrib/llvm/lib/CodeGen/RegisterScavenging.cpp33
-rw-r--r--contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp30
-rw-r--r--contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp36
-rw-r--r--contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp18
-rw-r--r--contrib/llvm/lib/CodeGen/SafeStack.cpp118
-rw-r--r--contrib/llvm/lib/CodeGen/SafeStackColoring.cpp17
-rw-r--r--contrib/llvm/lib/CodeGen/SafeStackLayout.cpp29
-rw-r--r--contrib/llvm/lib/CodeGen/SafeStackLayout.h4
-rw-r--r--contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp3
-rw-r--r--contrib/llvm/lib/CodeGen/ScheduleDAG.cpp1
-rw-r--r--contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp78
-rw-r--r--contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp2
-rw-r--r--contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp17
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp3372
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp233
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp80
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp74
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h3
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp337
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp56
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp139
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp268
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h125
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp5
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp207
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp188
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h47
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp51
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp193
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp36
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h4
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp12
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp837
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp43
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp911
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h46
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp80
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp632
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp2
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp2
-rw-r--r--contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp918
-rw-r--r--contrib/llvm/lib/CodeGen/ShrinkWrap.cpp171
-rw-r--r--contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp35
-rw-r--r--contrib/llvm/lib/CodeGen/SlotIndexes.cpp15
-rw-r--r--contrib/llvm/lib/CodeGen/SpillPlacement.cpp2
-rw-r--r--contrib/llvm/lib/CodeGen/SplitKit.cpp173
-rw-r--r--contrib/llvm/lib/CodeGen/SplitKit.h2
-rw-r--r--contrib/llvm/lib/CodeGen/StackColoring.cpp115
-rw-r--r--contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp24
-rw-r--r--contrib/llvm/lib/CodeGen/StackMaps.cpp32
-rw-r--r--contrib/llvm/lib/CodeGen/StackProtector.cpp79
-rw-r--r--contrib/llvm/lib/CodeGen/StackSlotColoring.cpp116
-rw-r--r--contrib/llvm/lib/CodeGen/TailDuplication.cpp59
-rw-r--r--contrib/llvm/lib/CodeGen/TailDuplicator.cpp38
-rw-r--r--contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp31
-rw-r--r--contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp14
-rw-r--r--contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp65
-rw-r--r--contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp268
-rw-r--r--contrib/llvm/lib/CodeGen/TargetPassConfig.cpp104
-rw-r--r--contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp73
-rw-r--r--contrib/llvm/lib/CodeGen/TargetSchedule.cpp120
-rw-r--r--contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp25
-rw-r--r--contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp86
-rw-r--r--contrib/llvm/lib/CodeGen/ValueTypes.cpp (renamed from contrib/llvm/lib/IR/ValueTypes.cpp)9
-rw-r--r--contrib/llvm/lib/CodeGen/VirtRegMap.cpp73
-rw-r--r--contrib/llvm/lib/CodeGen/WasmEHPrepare.cpp374
-rw-r--r--contrib/llvm/lib/CodeGen/WinEHPrepare.cpp63
-rw-r--r--contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp20
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp2
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp8
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp34
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp37
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/RecordName.cpp31
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp1
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp16
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp11
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp4
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp15
-rw-r--r--contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp179
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp26
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp803
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp29
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp7
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp267
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp69
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp13
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp2
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp550
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp387
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp25
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp5
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp2
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp25
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp205
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp165
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp5
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp210
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp109
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp21
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp671
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp643
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp43
-rw-r--r--contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h52
-rw-r--r--contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp116
-rw-r--r--contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp14
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumInjectedSources.cpp52
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSectionContribs.cpp54
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/DIA/DIAInjectedSource.cpp63
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp108
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp126
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp123
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp2
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp4
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp7
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp28
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp53
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp51
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp250
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp31
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp20
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp147
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp54
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp44
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp17
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp260
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp3
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp92
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp4
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp15
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp6
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp102
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp51
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp15
-rw-r--r--contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp19
-rw-r--r--contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp20
-rw-r--r--contrib/llvm/lib/Demangle/Compiler.h93
-rw-r--r--contrib/llvm/lib/Demangle/ItaniumDemangle.cpp8767
-rw-r--r--contrib/llvm/lib/Demangle/MicrosoftDemangle.cpp1684
-rw-r--r--contrib/llvm/lib/Demangle/StringView.h97
-rw-r--r--contrib/llvm/lib/Demangle/Utility.h188
-rw-r--r--contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp38
-rw-r--r--contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp26
-rw-r--r--contrib/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp6
-rw-r--r--contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp15
-rw-r--r--contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c1
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp12
-rw-r--r--contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp19
-rw-r--r--contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h20
-rw-r--r--contrib/llvm/lib/ExecutionEngine/MCJIT/ObjectBuffer.h48
-rw-r--r--contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp55
-rw-r--r--contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp9
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp343
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/Core.cpp1690
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp154
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp44
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp34
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp107
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp134
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/Layer.cpp106
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/Legacy.cpp68
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/NullResolver.cpp16
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp34
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp48
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h382
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp25
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp9
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h224
-rw-r--r--contrib/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp177
-rw-r--r--contrib/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp497
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp50
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp2
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp380
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp27
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp164
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h58
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp16
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h45
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h63
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h122
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp44
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h4
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h103
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h32
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h12
-rw-r--r--contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h2
-rw-r--r--contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp2
-rw-r--r--contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp2
-rw-r--r--contrib/llvm/lib/FuzzMutate/FuzzerCLI.cpp44
-rw-r--r--contrib/llvm/lib/FuzzMutate/IRMutator.cpp13
-rw-r--r--contrib/llvm/lib/IR/AsmWriter.cpp835
-rw-r--r--contrib/llvm/lib/IR/AttributeImpl.h18
-rw-r--r--contrib/llvm/lib/IR/Attributes.cpp190
-rw-r--r--contrib/llvm/lib/IR/AutoUpgrade.cpp1504
-rw-r--r--contrib/llvm/lib/IR/BasicBlock.cpp24
-rw-r--r--contrib/llvm/lib/IR/Comdat.cpp58
-rw-r--r--contrib/llvm/lib/IR/ConstantFold.cpp54
-rw-r--r--contrib/llvm/lib/IR/ConstantRange.cpp73
-rw-r--r--contrib/llvm/lib/IR/Constants.cpp191
-rw-r--r--contrib/llvm/lib/IR/ConstantsContext.h4
-rw-r--r--contrib/llvm/lib/IR/Core.cpp341
-rw-r--r--contrib/llvm/lib/IR/DIBuilder.cpp136
-rw-r--r--contrib/llvm/lib/IR/DataLayout.cpp84
-rw-r--r--contrib/llvm/lib/IR/DebugInfo.cpp694
-rw-r--r--contrib/llvm/lib/IR/DebugInfoMetadata.cpp234
-rw-r--r--contrib/llvm/lib/IR/DebugLoc.cpp15
-rw-r--r--contrib/llvm/lib/IR/DiagnosticHandler.cpp2
-rw-r--r--contrib/llvm/lib/IR/DiagnosticInfo.cpp6
-rw-r--r--contrib/llvm/lib/IR/DomTreeUpdater.cpp534
-rw-r--r--contrib/llvm/lib/IR/Dominators.cpp245
-rw-r--r--contrib/llvm/lib/IR/Function.cpp122
-rw-r--r--contrib/llvm/lib/IR/Globals.cpp18
-rw-r--r--contrib/llvm/lib/IR/IRBuilder.cpp224
-rw-r--r--contrib/llvm/lib/IR/IRPrintingPasses.cpp14
-rw-r--r--contrib/llvm/lib/IR/Instruction.cpp16
-rw-r--r--contrib/llvm/lib/IR/Instructions.cpp431
-rw-r--r--contrib/llvm/lib/IR/IntrinsicInst.cpp12
-rw-r--r--contrib/llvm/lib/IR/LLVMContext.cpp8
-rw-r--r--contrib/llvm/lib/IR/LLVMContextImpl.cpp24
-rw-r--r--contrib/llvm/lib/IR/LLVMContextImpl.h152
-rw-r--r--contrib/llvm/lib/IR/LegacyPassManager.cpp94
-rw-r--r--contrib/llvm/lib/IR/MDBuilder.cpp45
-rw-r--r--contrib/llvm/lib/IR/Mangler.cpp22
-rw-r--r--contrib/llvm/lib/IR/Metadata.cpp83
-rw-r--r--contrib/llvm/lib/IR/Module.cpp16
-rw-r--r--contrib/llvm/lib/IR/ModuleSummaryIndex.cpp278
-rw-r--r--contrib/llvm/lib/IR/Operator.cpp4
-rw-r--r--contrib/llvm/lib/IR/OptBisect.cpp37
-rw-r--r--contrib/llvm/lib/IR/Pass.cpp15
-rw-r--r--contrib/llvm/lib/IR/ProfileSummary.cpp36
-rw-r--r--contrib/llvm/lib/IR/SafepointIRVerifier.cpp247
-rw-r--r--contrib/llvm/lib/IR/Type.cpp24
-rw-r--r--contrib/llvm/lib/IR/TypeFinder.cpp50
-rw-r--r--contrib/llvm/lib/IR/Value.cpp61
-rw-r--r--contrib/llvm/lib/IR/ValueSymbolTable.cpp8
-rw-r--r--contrib/llvm/lib/IR/Verifier.cpp326
-rw-r--r--contrib/llvm/lib/IRReader/IRReader.cpp13
-rw-r--r--contrib/llvm/lib/LTO/Caching.cpp38
-rw-r--r--contrib/llvm/lib/LTO/LTO.cpp158
-rw-r--r--contrib/llvm/lib/LTO/LTOBackend.cpp119
-rw-r--r--contrib/llvm/lib/LTO/LTOCodeGenerator.cpp2
-rw-r--r--contrib/llvm/lib/LTO/LTOModule.cpp27
-rw-r--r--contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp150
-rw-r--r--contrib/llvm/lib/Linker/IRMover.cpp57
-rw-r--r--contrib/llvm/lib/MC/ELFObjectWriter.cpp1014
-rw-r--r--contrib/llvm/lib/MC/MCAsmBackend.cpp57
-rw-r--r--contrib/llvm/lib/MC/MCAsmInfo.cpp11
-rw-r--r--contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp19
-rw-r--r--contrib/llvm/lib/MC/MCAsmMacro.cpp42
-rw-r--r--contrib/llvm/lib/MC/MCAsmStreamer.cpp329
-rw-r--r--contrib/llvm/lib/MC/MCAssembler.cpp270
-rw-r--r--contrib/llvm/lib/MC/MCCodeView.cpp17
-rw-r--r--contrib/llvm/lib/MC/MCContext.cpp69
-rw-r--r--contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp10
-rw-r--r--contrib/llvm/lib/MC/MCDwarf.cpp265
-rw-r--r--contrib/llvm/lib/MC/MCELFStreamer.cpp105
-rw-r--r--contrib/llvm/lib/MC/MCExpr.cpp65
-rw-r--r--contrib/llvm/lib/MC/MCFragment.cpp44
-rw-r--r--contrib/llvm/lib/MC/MCInst.cpp19
-rw-r--r--contrib/llvm/lib/MC/MCInstrAnalysis.cpp9
-rw-r--r--contrib/llvm/lib/MC/MCLabel.cpp1
-rw-r--r--contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp2
-rw-r--r--contrib/llvm/lib/MC/MCMachOStreamer.cpp30
-rw-r--r--contrib/llvm/lib/MC/MCNullStreamer.cpp3
-rw-r--r--contrib/llvm/lib/MC/MCObjectFileInfo.cpp91
-rw-r--r--contrib/llvm/lib/MC/MCObjectStreamer.cpp133
-rw-r--r--contrib/llvm/lib/MC/MCParser/AsmParser.cpp367
-rw-r--r--contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp52
-rw-r--r--contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp12
-rw-r--r--contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp100
-rw-r--r--contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp93
-rw-r--r--contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp2
-rw-r--r--contrib/llvm/lib/MC/MCSchedule.cpp118
-rw-r--r--contrib/llvm/lib/MC/MCSection.cpp1
-rw-r--r--contrib/llvm/lib/MC/MCSectionCOFF.cpp25
-rw-r--r--contrib/llvm/lib/MC/MCSectionELF.cpp4
-rw-r--r--contrib/llvm/lib/MC/MCStreamer.cpp93
-rw-r--r--contrib/llvm/lib/MC/MCSubtargetInfo.cpp5
-rw-r--r--contrib/llvm/lib/MC/MCSymbol.cpp1
-rw-r--r--contrib/llvm/lib/MC/MCValue.cpp1
-rw-r--r--contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp3
-rw-r--r--contrib/llvm/lib/MC/MCWasmStreamer.cpp36
-rw-r--r--contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp40
-rw-r--r--contrib/llvm/lib/MC/MachObjectWriter.cpp275
-rw-r--r--contrib/llvm/lib/MC/StringTableBuilder.cpp2
-rw-r--r--contrib/llvm/lib/MC/SubtargetFeature.cpp1
-rw-r--r--contrib/llvm/lib/MC/WasmObjectWriter.cpp1201
-rw-r--r--contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp195
-rw-r--r--contrib/llvm/lib/Object/Archive.cpp12
-rw-r--r--contrib/llvm/lib/Object/ArchiveWriter.cpp33
-rw-r--r--contrib/llvm/lib/Object/Binary.cpp3
-rw-r--r--contrib/llvm/lib/Object/COFFImportFile.cpp33
-rw-r--r--contrib/llvm/lib/Object/COFFModuleDefinition.cpp13
-rw-r--r--contrib/llvm/lib/Object/COFFObjectFile.cpp40
-rw-r--r--contrib/llvm/lib/Object/ELF.cpp273
-rw-r--r--contrib/llvm/lib/Object/ELFObjectFile.cpp16
-rw-r--r--contrib/llvm/lib/Object/IRSymtab.cpp18
-rw-r--r--contrib/llvm/lib/Object/MachOObjectFile.cpp84
-rw-r--r--contrib/llvm/lib/Object/ModuleSymbolTable.cpp81
-rw-r--r--contrib/llvm/lib/Object/Object.cpp2
-rw-r--r--contrib/llvm/lib/Object/ObjectFile.cpp1
-rw-r--r--contrib/llvm/lib/Object/RecordStreamer.cpp13
-rw-r--r--contrib/llvm/lib/Object/RecordStreamer.h16
-rw-r--r--contrib/llvm/lib/Object/SymbolSize.cpp4
-rw-r--r--contrib/llvm/lib/Object/SymbolicFile.cpp1
-rw-r--r--contrib/llvm/lib/Object/WasmObjectFile.cpp981
-rw-r--r--contrib/llvm/lib/Object/WindowsResource.cpp6
-rw-r--r--contrib/llvm/lib/ObjectYAML/COFFYAML.cpp8
-rw-r--r--contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp51
-rw-r--r--contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypeHashing.cpp14
-rw-r--r--contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp27
-rw-r--r--contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp56
-rw-r--r--contrib/llvm/lib/ObjectYAML/DWARFVisitor.h2
-rw-r--r--contrib/llvm/lib/ObjectYAML/ELFYAML.cpp41
-rw-r--r--contrib/llvm/lib/ObjectYAML/WasmYAML.cpp59
-rw-r--r--contrib/llvm/lib/Option/Arg.cpp1
-rw-r--r--contrib/llvm/lib/Option/ArgList.cpp1
-rw-r--r--contrib/llvm/lib/Option/OptTable.cpp81
-rw-r--r--contrib/llvm/lib/Option/Option.cpp1
-rw-r--r--contrib/llvm/lib/Passes/PassBuilder.cpp138
-rw-r--r--contrib/llvm/lib/Passes/PassPlugin.cpp52
-rw-r--r--contrib/llvm/lib/Passes/PassRegistry.def13
-rw-r--r--contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp33
-rw-r--r--contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp4
-rw-r--r--contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp6
-rw-r--r--contrib/llvm/lib/ProfileData/GCOV.cpp3
-rw-r--r--contrib/llvm/lib/ProfileData/InstrProf.cpp58
-rw-r--r--contrib/llvm/lib/ProfileData/InstrProfReader.cpp33
-rw-r--r--contrib/llvm/lib/ProfileData/InstrProfWriter.cpp15
-rw-r--r--contrib/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp6
-rw-r--r--contrib/llvm/lib/ProfileData/SampleProf.cpp32
-rw-r--r--contrib/llvm/lib/ProfileData/SampleProfReader.cpp165
-rw-r--r--contrib/llvm/lib/ProfileData/SampleProfWriter.cpp82
-rw-r--r--contrib/llvm/lib/Support/AMDGPUMetadata.cpp2
-rw-r--r--contrib/llvm/lib/Support/APFloat.cpp15
-rw-r--r--contrib/llvm/lib/Support/APInt.cpp188
-rw-r--r--contrib/llvm/lib/Support/ARMAttributeParser.cpp1
-rw-r--r--contrib/llvm/lib/Support/BinaryStreamRef.cpp2
-rw-r--r--contrib/llvm/lib/Support/BranchProbability.cpp1
-rw-r--r--contrib/llvm/lib/Support/COM.cpp4
-rw-r--r--contrib/llvm/lib/Support/CachePruning.cpp28
-rw-r--r--contrib/llvm/lib/Support/Chrono.cpp4
-rw-r--r--contrib/llvm/lib/Support/CodeGenCoverage.cpp13
-rw-r--r--contrib/llvm/lib/Support/CommandLine.cpp48
-rw-r--r--contrib/llvm/lib/Support/CrashRecoveryContext.cpp10
-rw-r--r--contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp124
-rw-r--r--contrib/llvm/lib/Support/DJB.cpp86
-rw-r--r--contrib/llvm/lib/Support/Debug.cpp7
-rw-r--r--contrib/llvm/lib/Support/DebugCounter.cpp21
-rw-r--r--contrib/llvm/lib/Support/DynamicLibrary.cpp6
-rw-r--r--contrib/llvm/lib/Support/Error.cpp4
-rw-r--r--contrib/llvm/lib/Support/ErrorHandling.cpp35
-rw-r--r--contrib/llvm/lib/Support/FileOutputBuffer.cpp43
-rw-r--r--contrib/llvm/lib/Support/FoldingSet.cpp7
-rw-r--r--contrib/llvm/lib/Support/GraphWriter.cpp70
-rw-r--r--contrib/llvm/lib/Support/Host.cpp57
-rw-r--r--contrib/llvm/lib/Support/InitLLVM.cpp52
-rw-r--r--contrib/llvm/lib/Support/JSON.cpp693
-rw-r--r--contrib/llvm/lib/Support/Locale.cpp5
-rw-r--r--contrib/llvm/lib/Support/LockFileManager.cpp95
-rw-r--r--contrib/llvm/lib/Support/MD5.cpp4
-rw-r--r--contrib/llvm/lib/Support/ManagedStatic.cpp3
-rw-r--r--contrib/llvm/lib/Support/Memory.cpp4
-rw-r--r--contrib/llvm/lib/Support/MemoryBuffer.cpp89
-rw-r--r--contrib/llvm/lib/Support/Mutex.cpp11
-rw-r--r--contrib/llvm/lib/Support/NativeFormatting.cpp2
-rw-r--r--contrib/llvm/lib/Support/Parallel.cpp25
-rw-r--r--contrib/llvm/lib/Support/Path.cpp192
-rw-r--r--contrib/llvm/lib/Support/PrettyStackTrace.cpp8
-rw-r--r--contrib/llvm/lib/Support/Process.cpp4
-rw-r--r--contrib/llvm/lib/Support/Program.cpp29
-rw-r--r--contrib/llvm/lib/Support/RWMutex.cpp7
-rw-r--r--contrib/llvm/lib/Support/RandomNumberGenerator.cpp10
-rw-r--r--contrib/llvm/lib/Support/Regex.cpp15
-rw-r--r--contrib/llvm/lib/Support/SHA1.cpp2
-rw-r--r--contrib/llvm/lib/Support/Signals.cpp90
-rw-r--r--contrib/llvm/lib/Support/SmallPtrSet.cpp20
-rw-r--r--contrib/llvm/lib/Support/SmallVector.cpp50
-rw-r--r--contrib/llvm/lib/Support/SourceMgr.cpp126
-rw-r--r--contrib/llvm/lib/Support/Statistic.cpp87
-rw-r--r--contrib/llvm/lib/Support/StringExtras.cpp27
-rw-r--r--contrib/llvm/lib/Support/StringMap.cpp54
-rw-r--r--contrib/llvm/lib/Support/StringSaver.cpp7
-rw-r--r--contrib/llvm/lib/Support/TarWriter.cpp4
-rw-r--r--contrib/llvm/lib/Support/TargetParser.cpp40
-rw-r--r--contrib/llvm/lib/Support/ThreadLocal.cpp6
-rw-r--r--contrib/llvm/lib/Support/Threading.cpp4
-rw-r--r--contrib/llvm/lib/Support/Timer.cpp35
-rw-r--r--contrib/llvm/lib/Support/Triple.cpp22
-rw-r--r--contrib/llvm/lib/Support/Twine.cpp1
-rw-r--r--contrib/llvm/lib/Support/UnicodeCaseFold.cpp742
-rw-r--r--contrib/llvm/lib/Support/Unix/Host.inc2
-rw-r--r--contrib/llvm/lib/Support/Unix/Memory.inc12
-rw-r--r--contrib/llvm/lib/Support/Unix/Path.inc175
-rw-r--r--contrib/llvm/lib/Support/Unix/Process.inc31
-rw-r--r--contrib/llvm/lib/Support/Unix/Program.inc68
-rw-r--r--contrib/llvm/lib/Support/Unix/Signals.inc302
-rw-r--r--contrib/llvm/lib/Support/Unix/ThreadLocal.inc2
-rw-r--r--contrib/llvm/lib/Support/Unix/Threading.inc17
-rw-r--r--contrib/llvm/lib/Support/Unix/Unix.h2
-rw-r--r--contrib/llvm/lib/Support/Unix/Watchdog.inc2
-rw-r--r--contrib/llvm/lib/Support/VersionTuple.cpp110
-rw-r--r--contrib/llvm/lib/Support/Watchdog.cpp4
-rw-r--r--contrib/llvm/lib/Support/Windows/DynamicLibrary.inc1
-rw-r--r--contrib/llvm/lib/Support/Windows/Host.inc2
-rw-r--r--contrib/llvm/lib/Support/Windows/Path.inc379
-rw-r--r--contrib/llvm/lib/Support/Windows/Process.inc120
-rw-r--r--contrib/llvm/lib/Support/Windows/Program.inc215
-rw-r--r--contrib/llvm/lib/Support/Windows/RWMutex.inc4
-rw-r--r--contrib/llvm/lib/Support/Windows/Signals.inc49
-rw-r--r--contrib/llvm/lib/Support/Windows/WindowsSupport.h16
-rw-r--r--contrib/llvm/lib/Support/WithColor.cpp90
-rw-r--r--contrib/llvm/lib/Support/YAMLParser.cpp114
-rw-r--r--contrib/llvm/lib/Support/YAMLTraits.cpp43
-rw-r--r--contrib/llvm/lib/Support/circular_raw_ostream.cpp2
-rw-r--r--contrib/llvm/lib/Support/raw_ostream.cpp97
-rw-r--r--contrib/llvm/lib/Support/regcomp.c7
-rw-r--r--contrib/llvm/lib/Support/regengine.inc2
-rw-r--r--contrib/llvm/lib/Support/xxhash.cpp6
-rw-r--r--contrib/llvm/lib/TableGen/Error.cpp9
-rw-r--r--contrib/llvm/lib/TableGen/JSONBackend.cpp189
-rw-r--r--contrib/llvm/lib/TableGen/Main.cpp2
-rw-r--r--contrib/llvm/lib/TableGen/Record.cpp1400
-rw-r--r--contrib/llvm/lib/TableGen/TGLexer.cpp60
-rw-r--r--contrib/llvm/lib/TableGen/TGLexer.h23
-rw-r--r--contrib/llvm/lib/TableGen/TGParser.cpp1586
-rw-r--r--contrib/llvm/lib/TableGen/TGParser.h89
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64.td86
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp25
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp69
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp6
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp33
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp21
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td19
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp40
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp29
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp30
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp93
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp25
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp245
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp19
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp25
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp303
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h2
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp162
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp1504
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h68
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td7
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td813
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp745
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h186
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td740
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp148
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp635
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp222
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp15
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h31
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp357
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp24
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp64
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp75
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp59
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h9
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td330
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp2
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td963
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td12
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td (renamed from contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td)50
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td860
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td3
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td96
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td3
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td3
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td72
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp14
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp19
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h33
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td278
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp22
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h2
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp190
-rw-r--r--contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h5
-rw-r--r--contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp2084
-rw-r--r--contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp182
-rw-r--r--contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp4
-rw-r--r--contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp232
-rw-r--r--contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h30
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h64
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp76
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp15
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp12
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h2
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp15
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h8
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp40
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp2
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h3
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp44
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h15
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp63
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp29
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp9
-rw-r--r--contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h2
-rw-r--r--contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td4018
-rw-r--r--contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp29
-rw-r--r--contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h37
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPU.h27
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPU.td275
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp9
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp108
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp3
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp580
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h100
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp122
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td40
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp404
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td60
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h4
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUGISel.td138
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def76
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp (renamed from contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp)163
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h (renamed from contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h)17
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp236
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp545
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h46
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp6
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp89
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h48
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td11
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp315
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h52
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td174
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp8
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h4
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td3
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp164
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp83
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp3
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp264
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp270
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp107
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h46
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp395
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp24
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h50
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h12
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp3
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp81
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp397
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h55
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp102
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp350
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h19
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp16
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h10
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td1
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp11
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td77
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp220
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h794
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp101
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h26
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp291
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h82
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp44
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp8
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp338
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h12
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp808
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td528
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/DSInstructions.td48
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp107
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td3
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td28
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp11
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h4
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNILPSched.cpp27
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp114
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp51
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNProcessors.td20
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp32
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h11
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp122
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h6
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp155
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h15
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp28
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp12
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp37
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h9
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp36
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h24
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp288
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h34
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp43
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp27
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp34
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td1020
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Processors.td12
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600.td54
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp133
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.h46
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp33
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp134
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Defines.h4
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp49
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp64
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp377
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td6
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp443
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h34
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Instructions.td108
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td67
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp106
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp (renamed from contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp)14
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp31
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp52
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600Processors.td56
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp65
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h15
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp38
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp9
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIDefines.h54
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp114
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp7
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp72
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp398
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp142
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h12
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp2302
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h38
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp499
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp703
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td29
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp307
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h80
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td333
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIInstructions.td165
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp95
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp51
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp7
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp88
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h118
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp162
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp1025
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp28
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp14
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp164
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIProgramInfo.h77
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp125
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h19
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td28
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SISchedule.td2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp37
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp19
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SMInstructions.td271
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td99
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp6
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp10
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp231
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h110
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp75
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h24
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h1
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td74
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td72
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td57
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td173
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td64
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCAsmPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCBranchFinalize.cpp16
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCFrameLowering.cpp62
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCISelLowering.cpp6
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCISelLowering.h1
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCInstrFormats.td2
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCInstrInfo.cpp14
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCInstrInfo.td58
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCMCInstLower.cpp2
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCMCInstLower.h2
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h5
-rw-r--r--contrib/llvm/lib/Target/ARC/ARCRegisterInfo.cpp32
-rw-r--r--contrib/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp28
-rw-r--r--contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp7
-rw-r--r--contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.h2
-rw-r--r--contrib/llvm/lib/Target/ARC/MCTargetDesc/ARCInfo.h2
-rw-r--r--contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp22
-rw-r--r--contrib/llvm/lib/Target/ARM/ARM.h7
-rw-r--r--contrib/llvm/lib/Target/ARM/ARM.td65
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp50
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h4
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp245
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h5
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp8
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h3
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp11
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallingConv.h3
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallingConv.td19
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp750
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp134
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp1
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp252
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFastISel.cpp11
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp116
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFrameLowering.h2
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp4
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp420
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp852
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelLowering.h20
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrFormats.td9
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp28
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.h7
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.td62
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrNEON.td320
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrThumb.td19
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td48
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrVFP.td235
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp128
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp184
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp36
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp65
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp672
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp68
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td2
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td12
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleA57.td41
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleA9.td16
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleR52.td45
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td37
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp2
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSubtarget.h41
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp49
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetMachine.h10
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp3
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h5
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp30
-rw-r--r--contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp532
-rw-r--r--contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp30
-rw-r--r--contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp16
-rw-r--r--contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h2
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp132
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h27
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h9
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h10
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h8
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h18
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp8
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp19
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp2
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp50
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h33
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp10
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp7
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp15
-rw-r--r--contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp22
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp19
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp4
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp21
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp14
-rw-r--r--contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp22
-rw-r--r--contrib/llvm/lib/Target/AVR/AVR.h2
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp5
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp13
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.td18
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h5
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp2
-rw-r--r--contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp4
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp17
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h16
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp6
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp6
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h13
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp6
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h5
-rw-r--r--contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp8
-rw-r--r--contrib/llvm/lib/Target/BPF/BPF.h5
-rw-r--r--contrib/llvm/lib/Target/BPF/BPF.td6
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFCallingConv.td20
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp92
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp257
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelLowering.h18
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp93
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.h5
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.td174
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFMIPeephole.cpp284
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h2
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp43
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h36
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp3
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFSubtarget.h13
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp31
-rw-r--r--contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp45
-rw-r--r--contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp1
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp59
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp8
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h4
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp45
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp4
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h6
-rw-r--r--contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp83
-rw-r--r--contrib/llvm/lib/Target/Hexagon/BitTracker.cpp24
-rw-r--r--contrib/llvm/lib/Target/Hexagon/BitTracker.h12
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp60
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Hexagon.h2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Hexagon.td70
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp97
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h16
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp23
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp16
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp19
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td134
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp89
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp190
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp80
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp14
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td12
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td1209
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td292
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td1
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp111
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp68
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp20
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp66
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGatherPacketize.cpp2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp25
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp14
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp18
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp74
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp54
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h17
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp508
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h12
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp252
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp1937
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h104
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp1333
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp387
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h26
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td11
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp130
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp469
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h52
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td154
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp52
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp254
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td809
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td497
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td96
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp65
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h10
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td143
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp15
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp120
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp22
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp139
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h64
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp23
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp26
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp252
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h82
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVExtract.cpp166
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp85
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp40
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp17
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp70
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp8
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp19
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp1133
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h45
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp28
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp38
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp31
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h6
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp33
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h6
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp13
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp13
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h7
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp6
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp5
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp4
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp10
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFLiveness.h4
-rw-r--r--contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp8
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp2
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp7
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp22
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h1
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td2
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp6
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h1
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td4
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp2
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp12
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp28
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp7
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp1
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp6
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h5
-rw-r--r--contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp1
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp12
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp12
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp813
-rw-r--r--contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp115
-rw-r--r--contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp5
-rw-r--r--contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h1
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h6
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp123
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h19
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp59
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp16
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h10
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h10
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp14
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp37
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h4
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h2
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp11
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h6
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp12
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp66
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td164
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td386
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td8
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td13
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td350
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td75
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td941
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp297
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips.h19
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips.td15
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp8
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp6
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp35
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp11
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h3
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td25
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td30
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td62
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td424
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td27
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp31
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsBranchExpansion.cpp (renamed from contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp)480
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp441
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCallLowering.h86
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCondMov.td287
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp99
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td12
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td6
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp76
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td2
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td79
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsExpandPseudo.cpp702
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsFastISel.cpp70
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsFrameLowering.h4
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp163
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp13
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h28
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp711
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelLowering.h38
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrFPU.td323
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrFormats.td35
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp165
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.h4
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.td1052
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp184
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp41
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.h29
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp81
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMCInstLower.h4
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td5
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td136
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td3
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp38
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp4
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp12
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOs16.cpp16
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp100
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.h43
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterBanks.td (renamed from contrib/llvm/lib/Target/Hexagon/HexagonDepDecoders.h)8
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp16
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h4
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp17
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h1
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp8
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h28
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp95
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h34
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp148
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h9
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp7
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSchedule.td48
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td9
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td178
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp88
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSubtarget.h44
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp69
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetMachine.h2
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h24
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp26
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h11
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp12
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp94
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h46
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTX.td16
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp275
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h52
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp4
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp8
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h2
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp60
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp711
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h5
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp234
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp45
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h8
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td32
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td788
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp12
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSection.h45
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h28
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp24
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h3
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h61
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h20
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp2
-rw-r--r--contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp15
-rw-r--r--contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h8
-rw-r--r--contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp7
-rw-r--r--contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h10
-rw-r--r--contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp2
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp5
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2ISelLowering.cpp29
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td66
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.cpp11
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.h4
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td39
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2TargetObjectFile.h2
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2TargetStreamer.h2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp30
-rw-r--r--contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp73
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp48
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp30
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp15
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp12
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h14
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp10
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h3
-rw-r--r--contrib/llvm/lib/Target/PowerPC/P9InstrResources.td962
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPC.td40
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp127
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp121
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp81
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td114
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp48
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp148
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp54
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h4
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp12
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp558
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp583
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h31
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td298
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td56
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td72
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp663
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h36
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td386
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td38
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td1209
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td809
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp27
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp14
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp164
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h198
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h15
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp8
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp217
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp54
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td37
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSchedule.td3
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleE500.td274
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td440
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td251
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp47
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h5
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp4
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp12
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h4
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp13
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp12
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp144
-rw-r--r--contrib/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp318
-rw-r--r--contrib/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp18
-rw-r--r--contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp20
-rw-r--r--contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h19
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp217
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp42
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp42
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h27
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h12
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp87
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp28
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h3
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp22
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h6
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp32
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h37
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCV.h4
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCV.td10
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp66
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVCallingConv.td37
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp178
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.h8
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp162
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp883
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h42
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td4
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp392
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.h38
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td195
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfoA.td20
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfoC.td382
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfoD.td115
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfoF.td93
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrInfoM.td15
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp11
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h55
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp286
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp51
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.h14
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td24
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVSubtarget.h2
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp17
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp19
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h25
-rw-r--r--contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp60
-rw-r--r--contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp4
-rwxr-xr-xcontrib/llvm/lib/Target/Sparc/LeonFeatures.td8
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp35
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp10
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h6
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp11
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp21
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h2
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h7
-rw-r--r--contrib/llvm/lib/Target/Sparc/Sparc.td3
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp8
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp48
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td13
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td8
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h2
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp1
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcSubtarget.h2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp24
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp7
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h5
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZ.h16
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZ.td1
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp129
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h17
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td9
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp164
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp6
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td1
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp92
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h5
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp144
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h13
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp437
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp1149
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h48
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td6
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td207
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp51
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td190
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp4
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp53
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h11
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZOperands.td54
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZOperators.td268
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp16
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h8
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td8
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td92
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td1698
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td1847
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td1284
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td1322
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp35
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp52
-rw-r--r--contrib/llvm/lib/Target/TargetMachine.cpp47
-rw-r--r--contrib/llvm/lib/Target/TargetMachineC.cpp24
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp561
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp128
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp31
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h4
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp113
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp68
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp39
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h9
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp14
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp9
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h168
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp105
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h31
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp39
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/README.txt54
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssembly.h30
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssembly.td25
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp144
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp7
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp65
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h1
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp13
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp46
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp15
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp197
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h170
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp40
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp42
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp5
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp20
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp49
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp18
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp39
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h6
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td561
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td120
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td175
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td315
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td31
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td16
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td197
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp7
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td126
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td28
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td433
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp383
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp11
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp57
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp5
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp87
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h28
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp13
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp6
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp26
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp7
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp43
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp22
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp45
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h4
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td16
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp8
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp1409
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp57
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp13
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp8
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h6
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp81
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp9
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h8
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp159
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h47
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt54
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp479
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h88
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp35
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp97
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h29
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp134
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h17
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp684
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h10
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp142
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h38
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp131
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h15
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp204
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h110
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp11
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h1
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp62
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h75
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp304
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h20
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp11
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp8
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp10
-rw-r--r--contrib/llvm/lib/Target/X86/ShadowCallStack.cpp326
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp217
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h74
-rw-r--r--contrib/llvm/lib/Target/X86/X86.h14
-rw-r--r--contrib/llvm/lib/Target/X86/X86.td220
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp148
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.h5
-rw-r--r--contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp732
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp2
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallLowering.cpp45
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallingConv.td40
-rw-r--r--contrib/llvm/lib/Target/X86/X86CmovConversion.cpp20
-rw-r--r--contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp14
-rwxr-xr-xcontrib/llvm/lib/Target/X86/X86EvexToVex.cpp117
-rw-r--r--contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp109
-rw-r--r--contrib/llvm/lib/Target/X86/X86FastISel.cpp111
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp25
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp77
-rw-r--r--contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp249
-rw-r--r--contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp75
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.cpp230
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.h6
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp969
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.cpp8086
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.h128
-rw-r--r--contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp121
-rw-r--r--contrib/llvm/lib/Target/X86/X86Instr3DNow.td141
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrAVX512.td7492
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrArithmetic.td521
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td68
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCompiler.td460
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrControl.td262
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrExtension.td100
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA.td398
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp406
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h302
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFPStack.td258
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp5412
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFoldTables.h85
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFormats.td533
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td111
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.cpp5008
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.h111
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.td1411
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrMMX.td528
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrMPX.td78
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSGX.td6
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSSE.td5244
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSVM.td36
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td589
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSystem.td596
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrVMX.td44
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td283
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrXOP.td309
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp336
-rw-r--r--contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp22
-rw-r--r--contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h840
-rw-r--r--contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp52
-rw-r--r--contrib/llvm/lib/Target/X86/X86MCInstLower.cpp724
-rw-r--r--contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h4
-rw-r--r--contrib/llvm/lib/Target/X86/X86MacroFusion.cpp3
-rw-r--r--contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp31
-rw-r--r--contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp26
-rw-r--r--contrib/llvm/lib/Target/X86/X86PfmCounters.td77
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp36
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp9
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.td147
-rw-r--r--contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp11
-rwxr-xr-xcontrib/llvm/lib/Target/X86/X86SchedBroadwell.td3501
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedHaswell.td3828
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedPredicates.td49
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td2748
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td3625
-rwxr-xr-xcontrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td6250
-rw-r--r--contrib/llvm/lib/Target/X86/X86Schedule.td1064
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleAtom.td1417
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td1057
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleSLM.td461
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td1222
-rw-r--r--contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp2247
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.cpp165
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.h301
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.cpp104
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp97
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.h23
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp310
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h1
-rw-r--r--contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp8
-rw-r--r--contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp26
-rw-r--r--contrib/llvm/lib/Target/X86/X86WinEHState.cpp12
-rw-r--r--contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp4
-rw-r--r--contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h2
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp6
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp4
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp6
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp4
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp2
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h2
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp18
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h2
-rw-r--r--contrib/llvm/lib/Testing/Support/Error.cpp11
-rw-r--r--contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp2
-rw-r--r--contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp6
-rw-r--r--contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td6
-rw-r--r--contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp257
-rw-r--r--contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h121
-rw-r--r--contrib/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp418
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp48
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp51
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp133
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h1
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp26
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp11
-rw-r--r--contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp5
-rw-r--r--contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp44
-rw-r--r--contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/IPO/BlockExtractor.cpp176
-rw-r--r--contrib/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp30
-rw-r--r--contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp3
-rw-r--r--contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp70
-rw-r--r--contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp7
-rw-r--r--contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp248
-rw-r--r--contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp456
-rw-r--r--contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp397
-rw-r--r--contrib/llvm/lib/Transforms/IPO/IPO.cpp3
-rw-r--r--contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/IPO/Inliner.cpp95
-rw-r--r--contrib/llvm/lib/Transforms/IPO/Internalize.cpp6
-rw-r--r--contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp153
-rw-r--r--contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp433
-rw-r--r--contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp172
-rw-r--r--contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp82
-rw-r--r--contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp52
-rw-r--r--contrib/llvm/lib/Transforms/IPO/PruneEH.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/IPO/SCCP.cpp58
-rw-r--r--contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp327
-rw-r--r--contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp140
-rw-r--r--contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp139
-rw-r--r--contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp274
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp428
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp534
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp824
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp422
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp341
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h165
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp48
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp1246
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp23
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp588
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp85
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp339
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineTables.td11
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp213
-rw-r--r--contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp607
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp139
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp44
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h16
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/CGProfile.cpp100
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp137
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp4
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp303
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp511
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp30
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp46
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp851
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp83
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp74
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp23
-rw-r--r--contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp8
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h2
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h2
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h24
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp177
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp15
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp202
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp15
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h6
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp45
-rw-r--r--contrib/llvm/lib/Transforms/ObjCARC/PtrState.h6
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ADCE.cpp29
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp90
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/BDCE.cpp4
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp295
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp117
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp173
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/DCE.cpp5
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp277
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp335
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp13
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GVN.cpp133
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp33
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp35
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp207
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp53
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp589
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp36
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp (renamed from contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp)96
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp522
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LICM.cpp210
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp14
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp25
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp127
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp133
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp208
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp574
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp82
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp211
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp360
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp619
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp19
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp24
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp394
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp447
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp149
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp83
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp148
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp101
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp295
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp84
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp18
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp483
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp109
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp3
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp90
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SCCP.cpp507
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SROA.cpp509
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Scalar.cpp30
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp45
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp1485
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Sink.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp61
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp10
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp163
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp8
-rw-r--r--contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp10
-rw-r--r--contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp90
-rw-r--r--contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp191
-rw-r--r--contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp4
-rw-r--r--contrib/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp69
-rw-r--r--contrib/llvm/lib/Transforms/Utils/CloneModule.cpp36
-rw-r--r--contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp227
-rw-r--r--contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp23
-rw-r--r--contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp6
-rw-r--r--contrib/llvm/lib/Transforms/Utils/Evaluator.cpp258
-rw-r--r--contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp5
-rw-r--r--contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp13
-rw-r--r--contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp33
-rw-r--r--contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp7
-rw-r--r--contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp99
-rw-r--r--contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LCSSA.cpp26
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp27
-rw-r--r--contrib/llvm/lib/Transforms/Utils/Local.cpp850
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp645
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp57
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp195
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp785
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp182
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp43
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp161
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp11
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp8
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp14
-rw-r--r--contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp82
-rw-r--r--contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp33
-rw-r--r--contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp90
-rw-r--r--contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp78
-rw-r--r--contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp191
-rw-r--r--contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp307
-rw-r--r--contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp166
-rw-r--r--contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp380
-rw-r--r--contrib/llvm/lib/Transforms/Utils/SplitModule.cpp28
-rw-r--r--contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp4
-rw-r--r--contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Utils/Utils.cpp13
-rw-r--r--contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp298
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp1072
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h282
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp2763
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp1879
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h131
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlan.cpp37
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlan.h201
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanBuilder.h61
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp336
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h55
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp73
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h36
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanValue.h35
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp133
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.h44
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp4
-rw-r--r--contrib/llvm/lib/XRay/Trace.cpp78
-rw-r--r--contrib/llvm/tools/bugpoint/BugDriver.cpp28
-rw-r--r--contrib/llvm/tools/bugpoint/BugDriver.h84
-rw-r--r--contrib/llvm/tools/bugpoint/CrashDebugger.cpp342
-rw-r--r--contrib/llvm/tools/bugpoint/ExecutionDriver.cpp49
-rw-r--r--contrib/llvm/tools/bugpoint/ExtractFunction.cpp39
-rw-r--r--contrib/llvm/tools/bugpoint/FindBugs.cpp8
-rw-r--r--contrib/llvm/tools/bugpoint/Miscompilation.cpp132
-rw-r--r--contrib/llvm/tools/bugpoint/OptimizerDriver.cpp54
-rw-r--r--contrib/llvm/tools/bugpoint/ToolRunner.cpp231
-rw-r--r--contrib/llvm/tools/bugpoint/bugpoint.cpp8
-rw-r--r--contrib/llvm/tools/llc/llc.cpp96
-rw-r--r--contrib/llvm/tools/lli/OrcLazyJIT.cpp166
-rw-r--r--contrib/llvm/tools/lli/OrcLazyJIT.h201
-rw-r--r--contrib/llvm/tools/lli/RemoteJITUtils.h4
-rw-r--r--contrib/llvm/tools/lli/lli.cpp219
-rw-r--r--contrib/llvm/tools/llvm-ar/llvm-ar.cpp298
-rw-r--r--contrib/llvm/tools/llvm-as/llvm-as.cpp51
-rw-r--r--contrib/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp85
-rw-r--r--contrib/llvm/tools/llvm-cov/CodeCoverage.cpp137
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageExporter.h52
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageExporterJson.cpp578
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageExporterJson.h112
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageFilters.cpp12
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageFilters.h33
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageReport.cpp94
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageReport.h15
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageSummaryInfo.h42
-rw-r--r--contrib/llvm/tools/llvm-cov/CoverageViewOptions.h16
-rw-r--r--contrib/llvm/tools/llvm-cov/RenderingSupport.h4
-rw-r--r--contrib/llvm/tools/llvm-cov/SourceCoverageView.cpp3
-rw-r--r--contrib/llvm/tools/llvm-cov/SourceCoverageView.h64
-rw-r--r--contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp90
-rw-r--r--contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.h4
-rw-r--r--contrib/llvm/tools/llvm-cov/SourceCoverageViewText.cpp4
-rw-r--r--contrib/llvm/tools/llvm-cov/SourceCoverageViewText.h4
-rw-r--r--contrib/llvm/tools/llvm-cov/TestingSupport.cpp3
-rw-r--r--contrib/llvm/tools/llvm-cov/llvm-cov.cpp22
-rw-r--r--contrib/llvm/tools/llvm-cxxdump/Error.cpp2
-rw-r--r--contrib/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp8
-rw-r--r--contrib/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp6
-rw-r--r--contrib/llvm/tools/llvm-diff/DifferenceEngine.cpp37
-rw-r--r--contrib/llvm/tools/llvm-dis/llvm-dis.cpp50
-rw-r--r--contrib/llvm/tools/llvm-dwarfdump/Statistics.cpp33
-rw-r--r--contrib/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp96
-rw-r--r--contrib/llvm/tools/llvm-extract/llvm-extract.cpp49
-rw-r--r--contrib/llvm/tools/llvm-link/llvm-link.cpp43
-rw-r--r--contrib/llvm/tools/llvm-lto/llvm-lto.cpp33
-rw-r--r--contrib/llvm/tools/llvm-lto2/llvm-lto2.cpp18
-rw-r--r--contrib/llvm/tools/llvm-mc/llvm-mc.cpp261
-rw-r--r--contrib/llvm/tools/llvm-mca/CodeRegion.cpp66
-rw-r--r--contrib/llvm/tools/llvm-mca/CodeRegion.h131
-rw-r--r--contrib/llvm/tools/llvm-mca/Context.cpp63
-rw-r--r--contrib/llvm/tools/llvm-mca/Context.h68
-rw-r--r--contrib/llvm/tools/llvm-mca/DispatchStage.cpp149
-rw-r--r--contrib/llvm/tools/llvm-mca/DispatchStage.h106
-rw-r--r--contrib/llvm/tools/llvm-mca/DispatchStatistics.cpp71
-rw-r--r--contrib/llvm/tools/llvm-mca/DispatchStatistics.h84
-rw-r--r--contrib/llvm/tools/llvm-mca/ExecuteStage.cpp210
-rw-r--r--contrib/llvm/tools/llvm-mca/ExecuteStage.h67
-rw-r--r--contrib/llvm/tools/llvm-mca/FetchStage.cpp46
-rw-r--r--contrib/llvm/tools/llvm-mca/FetchStage.h45
-rw-r--r--contrib/llvm/tools/llvm-mca/HWEventListener.cpp21
-rw-r--r--contrib/llvm/tools/llvm-mca/HWEventListener.h141
-rw-r--r--contrib/llvm/tools/llvm-mca/HardwareUnit.cpp23
-rw-r--r--contrib/llvm/tools/llvm-mca/HardwareUnit.h31
-rw-r--r--contrib/llvm/tools/llvm-mca/InstrBuilder.cpp465
-rw-r--r--contrib/llvm/tools/llvm-mca/InstrBuilder.h85
-rw-r--r--contrib/llvm/tools/llvm-mca/Instruction.cpp177
-rw-r--r--contrib/llvm/tools/llvm-mca/Instruction.h427
-rw-r--r--contrib/llvm/tools/llvm-mca/InstructionInfoView.cpp91
-rw-r--r--contrib/llvm/tools/llvm-mca/InstructionInfoView.h66
-rw-r--r--contrib/llvm/tools/llvm-mca/InstructionTables.cpp70
-rw-r--r--contrib/llvm/tools/llvm-mca/InstructionTables.h43
-rw-r--r--contrib/llvm/tools/llvm-mca/LSUnit.cpp148
-rw-r--r--contrib/llvm/tools/llvm-mca/LSUnit.h147
-rw-r--r--contrib/llvm/tools/llvm-mca/Pipeline.cpp99
-rw-r--r--contrib/llvm/tools/llvm-mca/Pipeline.h79
-rw-r--r--contrib/llvm/tools/llvm-mca/PipelinePrinter.cpp26
-rw-r--r--contrib/llvm/tools/llvm-mca/PipelinePrinter.h52
-rw-r--r--contrib/llvm/tools/llvm-mca/README.txt865
-rw-r--r--contrib/llvm/tools/llvm-mca/RegisterFile.cpp343
-rw-r--r--contrib/llvm/tools/llvm-mca/RegisterFile.h172
-rw-r--r--contrib/llvm/tools/llvm-mca/RegisterFileStatistics.cpp107
-rw-r--r--contrib/llvm/tools/llvm-mca/RegisterFileStatistics.h67
-rw-r--r--contrib/llvm/tools/llvm-mca/ResourcePressureView.cpp171
-rw-r--r--contrib/llvm/tools/llvm-mca/ResourcePressureView.h109
-rw-r--r--contrib/llvm/tools/llvm-mca/RetireControlUnit.cpp87
-rw-r--r--contrib/llvm/tools/llvm-mca/RetireControlUnit.h98
-rw-r--r--contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.cpp49
-rw-r--r--contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.h60
-rw-r--r--contrib/llvm/tools/llvm-mca/RetireStage.cpp55
-rw-r--r--contrib/llvm/tools/llvm-mca/RetireStage.h48
-rw-r--r--contrib/llvm/tools/llvm-mca/Scheduler.cpp403
-rw-r--r--contrib/llvm/tools/llvm-mca/Scheduler.h515
-rw-r--r--contrib/llvm/tools/llvm-mca/SchedulerStatistics.cpp94
-rw-r--r--contrib/llvm/tools/llvm-mca/SchedulerStatistics.h91
-rw-r--r--contrib/llvm/tools/llvm-mca/SourceMgr.h63
-rw-r--r--contrib/llvm/tools/llvm-mca/Stage.cpp27
-rw-r--r--contrib/llvm/tools/llvm-mca/Stage.h76
-rw-r--r--contrib/llvm/tools/llvm-mca/SummaryView.cpp85
-rw-r--r--contrib/llvm/tools/llvm-mca/SummaryView.h76
-rw-r--r--contrib/llvm/tools/llvm-mca/Support.cpp79
-rw-r--r--contrib/llvm/tools/llvm-mca/Support.h58
-rw-r--r--contrib/llvm/tools/llvm-mca/TimelineView.cpp240
-rw-r--r--contrib/llvm/tools/llvm-mca/TimelineView.h189
-rw-r--r--contrib/llvm/tools/llvm-mca/View.cpp20
-rw-r--r--contrib/llvm/tools/llvm-mca/View.h32
-rw-r--r--contrib/llvm/tools/llvm-mca/llvm-mca.cpp552
-rw-r--r--contrib/llvm/tools/llvm-modextract/llvm-modextract.cpp2
-rw-r--r--contrib/llvm/tools/llvm-nm/llvm-nm.cpp125
-rw-r--r--contrib/llvm/tools/llvm-objcopy/ObjcopyOpts.td99
-rw-r--r--contrib/llvm/tools/llvm-objcopy/Object.cpp1118
-rw-r--r--contrib/llvm/tools/llvm-objcopy/Object.h508
-rw-r--r--contrib/llvm/tools/llvm-objcopy/StripOpts.td49
-rw-r--r--contrib/llvm/tools/llvm-objcopy/llvm-objcopy.cpp675
-rw-r--r--contrib/llvm/tools/llvm-objcopy/llvm-objcopy.h5
-rw-r--r--contrib/llvm/tools/llvm-objdump/COFFDump.cpp4
-rw-r--r--contrib/llvm/tools/llvm-objdump/ELFDump.cpp91
-rw-r--r--contrib/llvm/tools/llvm-objdump/MachODump.cpp407
-rw-r--r--contrib/llvm/tools/llvm-objdump/WasmDump.cpp2
-rw-r--r--contrib/llvm/tools/llvm-objdump/llvm-objdump.cpp1073
-rw-r--r--contrib/llvm/tools/llvm-objdump/llvm-objdump.h7
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/Analyze.cpp2
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/Diff.cpp644
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/Diff.h45
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/DiffPrinter.cpp147
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/DiffPrinter.h172
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp389
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.h4
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp469
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h68
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/InputFile.cpp44
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/InputFile.h11
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp13
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.h10
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp25
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp9
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp7
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp95
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp9
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp5
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp3
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp3
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp6
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/StreamUtil.cpp81
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/StreamUtil.h15
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp478
-rw-r--r--contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.h25
-rw-r--r--contrib/llvm/tools/llvm-profdata/llvm-profdata.cpp44
-rw-r--r--contrib/llvm/tools/llvm-readobj/ARMEHABIPrinter.h13
-rw-r--r--contrib/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp2
-rw-r--r--contrib/llvm/tools/llvm-readobj/COFFDumper.cpp29
-rw-r--r--contrib/llvm/tools/llvm-readobj/COFFImportDumper.cpp40
-rw-r--r--contrib/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h245
-rw-r--r--contrib/llvm/tools/llvm-readobj/ELFDumper.cpp734
-rw-r--r--contrib/llvm/tools/llvm-readobj/MachODumper.cpp9
-rw-r--r--contrib/llvm/tools/llvm-readobj/ObjDumper.cpp122
-rw-r--r--contrib/llvm/tools/llvm-readobj/ObjDumper.h13
-rw-r--r--contrib/llvm/tools/llvm-readobj/StackMapPrinter.h49
-rw-r--r--contrib/llvm/tools/llvm-readobj/WasmDumper.cpp40
-rw-r--r--contrib/llvm/tools/llvm-readobj/llvm-readobj.cpp102
-rw-r--r--contrib/llvm/tools/llvm-readobj/llvm-readobj.h1
-rw-r--r--contrib/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp11
-rw-r--r--contrib/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp26
-rw-r--r--contrib/llvm/tools/llvm-xray/func-id-helper.cpp (renamed from contrib/llvm/tools/llvm-xray/func-id-helper.cc)10
-rw-r--r--contrib/llvm/tools/llvm-xray/func-id-helper.h2
-rw-r--r--contrib/llvm/tools/llvm-xray/llvm-xray.cpp (renamed from contrib/llvm/tools/llvm-xray/llvm-xray.cc)2
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-account.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-account.cc)145
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-account.h8
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-color-helper.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-color-helper.cc)2
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-converter.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-converter.cc)55
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-extract.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-extract.cc)2
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-graph-diff.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-graph-diff.cc)2
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-graph.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-graph.cc)2
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-graph.h2
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-registry.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-registry.cc)2
-rw-r--r--contrib/llvm/tools/llvm-xray/xray-stacks.cpp (renamed from contrib/llvm/tools/llvm-xray/xray-stacks.cc)2
-rw-r--r--contrib/llvm/tools/opt/BreakpointPrinter.cpp2
-rw-r--r--contrib/llvm/tools/opt/BreakpointPrinter.h2
-rw-r--r--contrib/llvm/tools/opt/Debugify.cpp357
-rw-r--r--contrib/llvm/tools/opt/Debugify.h75
-rw-r--r--contrib/llvm/tools/opt/NewPMDriver.cpp53
-rw-r--r--contrib/llvm/tools/opt/NewPMDriver.h7
-rw-r--r--contrib/llvm/tools/opt/PassPrinters.cpp2
-rw-r--r--contrib/llvm/tools/opt/PassPrinters.h5
-rw-r--r--contrib/llvm/tools/opt/opt.cpp158
-rw-r--r--contrib/llvm/utils/TableGen/AsmMatcherEmitter.cpp317
-rw-r--r--contrib/llvm/utils/TableGen/AsmWriterEmitter.cpp24
-rw-r--r--contrib/llvm/utils/TableGen/CTagsEmitter.cpp2
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenDAGPatterns.cpp1043
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenDAGPatterns.h158
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenInstruction.cpp21
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenInstruction.h9
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenIntrinsics.h2
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenMapTable.cpp7
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenRegisters.cpp225
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenRegisters.h30
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenSchedule.cpp834
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenSchedule.h79
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenTarget.cpp65
-rw-r--r--contrib/llvm/utils/TableGen/CodeGenTarget.h30
-rw-r--r--contrib/llvm/utils/TableGen/DAGISelEmitter.cpp28
-rw-r--r--contrib/llvm/utils/TableGen/DAGISelMatcher.h2
-rw-r--r--contrib/llvm/utils/TableGen/DAGISelMatcherGen.cpp78
-rw-r--r--contrib/llvm/utils/TableGen/DAGISelMatcherOpt.cpp15
-rw-r--r--contrib/llvm/utils/TableGen/DFAPacketizerEmitter.cpp143
-rw-r--r--contrib/llvm/utils/TableGen/DisassemblerEmitter.cpp13
-rw-r--r--contrib/llvm/utils/TableGen/FastISelEmitter.cpp49
-rw-r--r--contrib/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp163
-rw-r--r--contrib/llvm/utils/TableGen/GlobalISelEmitter.cpp1793
-rw-r--r--contrib/llvm/utils/TableGen/InfoByHwMode.cpp4
-rw-r--r--contrib/llvm/utils/TableGen/InfoByHwMode.h2
-rw-r--r--contrib/llvm/utils/TableGen/InstrDocsEmitter.cpp1
-rw-r--r--contrib/llvm/utils/TableGen/InstrInfoEmitter.cpp101
-rw-r--r--contrib/llvm/utils/TableGen/IntrinsicEmitter.cpp56
-rw-r--r--contrib/llvm/utils/TableGen/PredicateExpander.cpp262
-rw-r--r--contrib/llvm/utils/TableGen/PredicateExpander.h86
-rw-r--r--contrib/llvm/utils/TableGen/PseudoLoweringEmitter.cpp9
-rw-r--r--contrib/llvm/utils/TableGen/RISCVCompressInstEmitter.cpp810
-rw-r--r--contrib/llvm/utils/TableGen/RegisterBankEmitter.cpp6
-rw-r--r--contrib/llvm/utils/TableGen/RegisterInfoEmitter.cpp12
-rw-r--r--contrib/llvm/utils/TableGen/SearchableTableEmitter.cpp842
-rw-r--r--contrib/llvm/utils/TableGen/SubtargetEmitter.cpp509
-rw-r--r--contrib/llvm/utils/TableGen/SubtargetFeatureInfo.cpp1
-rw-r--r--contrib/llvm/utils/TableGen/SubtargetFeatureInfo.h8
-rw-r--r--contrib/llvm/utils/TableGen/TableGen.cpp40
-rw-r--r--contrib/llvm/utils/TableGen/TableGenBackends.h6
-rw-r--r--contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp116
-rw-r--r--contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h30
-rw-r--r--contrib/llvm/utils/TableGen/X86DisassemblerShared.h6
-rw-r--r--contrib/llvm/utils/TableGen/X86DisassemblerTables.cpp39
-rw-r--r--contrib/llvm/utils/TableGen/X86DisassemblerTables.h3
-rw-r--r--contrib/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp208
-rw-r--r--contrib/llvm/utils/TableGen/X86FoldTablesEmitter.cpp45
-rw-r--r--contrib/llvm/utils/TableGen/X86RecognizableInstr.cpp248
-rw-r--r--contrib/llvm/utils/TableGen/X86RecognizableInstr.h20
2861 files changed, 246738 insertions, 132098 deletions
diff --git a/contrib/llvm/LICENSE.TXT b/contrib/llvm/LICENSE.TXT
index ff63f2b6aae3..461398bab7a7 100644
--- a/contrib/llvm/LICENSE.TXT
+++ b/contrib/llvm/LICENSE.TXT
@@ -4,7 +4,7 @@ LLVM Release License
University of Illinois/NCSA
Open Source License
-Copyright (c) 2003-2017 University of Illinois at Urbana-Champaign.
+Copyright (c) 2003-2018 University of Illinois at Urbana-Champaign.
All rights reserved.
Developed by:
diff --git a/contrib/llvm/include/llvm-c/Comdat.h b/contrib/llvm/include/llvm-c/Comdat.h
new file mode 100644
index 000000000000..499996d68a53
--- /dev/null
+++ b/contrib/llvm/include/llvm-c/Comdat.h
@@ -0,0 +1,75 @@
+/*===-- llvm-c/Comdat.h - Module Comdat C Interface -------------*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file defines the C interface to COMDAT. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_COMDAT_H
+#define LLVM_C_COMDAT_H
+
+#include "llvm-c/Types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ LLVMAnyComdatSelectionKind, ///< The linker may choose any COMDAT.
+ LLVMExactMatchComdatSelectionKind, ///< The data referenced by the COMDAT must
+ ///< be the same.
+ LLVMLargestComdatSelectionKind, ///< The linker will choose the largest
+ ///< COMDAT.
+ LLVMNoDuplicatesComdatSelectionKind, ///< No other Module may specify this
+ ///< COMDAT.
+ LLVMSameSizeComdatSelectionKind ///< The data referenced by the COMDAT must be
+ ///< the same size.
+} LLVMComdatSelectionKind;
+
+/**
+ * Return the Comdat in the module with the specified name. It is created
+ * if it doesn't already exist.
+ *
+ * @see llvm::Module::getOrInsertComdat()
+ */
+LLVMComdatRef LLVMGetOrInsertComdat(LLVMModuleRef M, const char *Name);
+
+/**
+ * Get the Comdat assigned to the given global object.
+ *
+ * @see llvm::GlobalObject::getComdat()
+ */
+LLVMComdatRef LLVMGetComdat(LLVMValueRef V);
+
+/**
+ * Assign the Comdat to the given global object.
+ *
+ * @see llvm::GlobalObject::setComdat()
+ */
+void LLVMSetComdat(LLVMValueRef V, LLVMComdatRef C);
+
+/**
+ * Get the conflict resolution selection kind for the Comdat.
+ *
+ * @see llvm::Comdat::getSelectionKind()
+ */
+LLVMComdatSelectionKind LLVMGetComdatSelectionKind(LLVMComdatRef C);
+
+/**
+ * Set the conflict resolution selection kind for the Comdat.
+ *
+ * @see llvm::Comdat::setSelectionKind()
+ */
+void LLVMSetComdatSelectionKind(LLVMComdatRef C, LLVMComdatSelectionKind Kind);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
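A minimal usage sketch for the comdat bindings above, assuming the long-standing module and global creation calls from Core.h (LLVMModuleCreateWithName, LLVMAddGlobal, LLVMSetInitializer); the symbol name is purely illustrative:

#include "llvm-c/Core.h"
#include "llvm-c/Comdat.h"

/* Illustrative only: place a global definition in a COMDAT group that keeps
   the largest definition seen by the linker. */
static void addComdatGlobal(void) {
  LLVMModuleRef M = LLVMModuleCreateWithName("demo");
  LLVMValueRef G = LLVMAddGlobal(M, LLVMInt32Type(), "shared_counter");
  LLVMSetInitializer(G, LLVMConstInt(LLVMInt32Type(), 0, 0));

  LLVMComdatRef C = LLVMGetOrInsertComdat(M, "shared_counter");
  LLVMSetComdatSelectionKind(C, LLVMLargestComdatSelectionKind);
  LLVMSetComdat(G, C);               /* attach the COMDAT group to the global */

  LLVMDisposeModule(M);
}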
diff --git a/contrib/llvm/include/llvm-c/Core.h b/contrib/llvm/include/llvm-c/Core.h
index 8238c09f9dd0..6792219f8730 100644
--- a/contrib/llvm/include/llvm-c/Core.h
+++ b/contrib/llvm/include/llvm-c/Core.h
@@ -187,19 +187,60 @@ typedef enum {
} LLVMVisibility;
typedef enum {
+ LLVMNoUnnamedAddr, /**< Address of the GV is significant. */
+ LLVMLocalUnnamedAddr, /**< Address of the GV is locally insignificant. */
+ LLVMGlobalUnnamedAddr /**< Address of the GV is globally insignificant. */
+} LLVMUnnamedAddr;
+
+typedef enum {
LLVMDefaultStorageClass = 0,
LLVMDLLImportStorageClass = 1, /**< Function to be imported from DLL. */
LLVMDLLExportStorageClass = 2 /**< Function to be accessible from DLL. */
} LLVMDLLStorageClass;
typedef enum {
- LLVMCCallConv = 0,
- LLVMFastCallConv = 8,
- LLVMColdCallConv = 9,
- LLVMWebKitJSCallConv = 12,
- LLVMAnyRegCallConv = 13,
- LLVMX86StdcallCallConv = 64,
- LLVMX86FastcallCallConv = 65
+ LLVMCCallConv = 0,
+ LLVMFastCallConv = 8,
+ LLVMColdCallConv = 9,
+ LLVMGHCCallConv = 10,
+ LLVMHiPECallConv = 11,
+ LLVMWebKitJSCallConv = 12,
+ LLVMAnyRegCallConv = 13,
+ LLVMPreserveMostCallConv = 14,
+ LLVMPreserveAllCallConv = 15,
+ LLVMSwiftCallConv = 16,
+ LLVMCXXFASTTLSCallConv = 17,
+ LLVMX86StdcallCallConv = 64,
+ LLVMX86FastcallCallConv = 65,
+ LLVMARMAPCSCallConv = 66,
+ LLVMARMAAPCSCallConv = 67,
+ LLVMARMAAPCSVFPCallConv = 68,
+ LLVMMSP430INTRCallConv = 69,
+ LLVMX86ThisCallCallConv = 70,
+ LLVMPTXKernelCallConv = 71,
+ LLVMPTXDeviceCallConv = 72,
+ LLVMSPIRFUNCCallConv = 75,
+ LLVMSPIRKERNELCallConv = 76,
+ LLVMIntelOCLBICallConv = 77,
+ LLVMX8664SysVCallConv = 78,
+ LLVMWin64CallConv = 79,
+ LLVMX86VectorCallCallConv = 80,
+ LLVMHHVMCallConv = 81,
+ LLVMHHVMCCallConv = 82,
+ LLVMX86INTRCallConv = 83,
+ LLVMAVRINTRCallConv = 84,
+ LLVMAVRSIGNALCallConv = 85,
+ LLVMAVRBUILTINCallConv = 86,
+ LLVMAMDGPUVSCallConv = 87,
+ LLVMAMDGPUGSCallConv = 88,
+ LLVMAMDGPUPSCallConv = 89,
+ LLVMAMDGPUCSCallConv = 90,
+ LLVMAMDGPUKERNELCallConv = 91,
+ LLVMX86RegCallCallConv = 92,
+ LLVMAMDGPUHSCallConv = 93,
+ LLVMMSP430BUILTINCallConv = 94,
+ LLVMAMDGPULSCallConv = 95,
+ LLVMAMDGPUESCallConv = 96
} LLVMCallConv;
typedef enum {
@@ -335,6 +376,62 @@ typedef enum {
LLVMDSNote
} LLVMDiagnosticSeverity;
+typedef enum {
+ LLVMInlineAsmDialectATT,
+ LLVMInlineAsmDialectIntel
+} LLVMInlineAsmDialect;
+
+typedef enum {
+ /**
+ * Emits an error if two values disagree, otherwise the resulting value is
+ * that of the operands.
+ *
+ * @see Module::ModFlagBehavior::Error
+ */
+ LLVMModuleFlagBehaviorError,
+ /**
+ * Emits a warning if two values disagree. The result value will be the
+ * operand for the flag from the first module being linked.
+ *
+ * @see Module::ModFlagBehavior::Warning
+ */
+ LLVMModuleFlagBehaviorWarning,
+ /**
+ * Adds a requirement that another module flag be present and have a
+ * specified value after linking is performed. The value must be a metadata
+ * pair, where the first element of the pair is the ID of the module flag
+ * to be restricted, and the second element of the pair is the value the
+ * module flag should be restricted to. This behavior can be used to
+ * restrict the allowable results (via triggering of an error) of linking
+ * IDs with the **Override** behavior.
+ *
+ * @see Module::ModFlagBehavior::Require
+ */
+ LLVMModuleFlagBehaviorRequire,
+ /**
+ * Uses the specified value, regardless of the behavior or value of the
+ * other module. If both modules specify **Override**, but the values
+ * differ, an error will be emitted.
+ *
+ * @see Module::ModFlagBehavior::Override
+ */
+ LLVMModuleFlagBehaviorOverride,
+ /**
+ * Appends the two values, which are required to be metadata nodes.
+ *
+ * @see Module::ModFlagBehavior::Append
+ */
+ LLVMModuleFlagBehaviorAppend,
+ /**
+ * Appends the two values, which are required to be metadata
+ * nodes. However, duplicate entries in the second list are dropped
+ * during the append operation.
+ *
+ * @see Module::ModFlagBehavior::AppendUnique
+ */
+ LLVMModuleFlagBehaviorAppendUnique,
+} LLVMModuleFlagBehavior;
+
/**
* Attribute index are either LLVMAttributeReturnIndex,
* LLVMAttributeFunctionIndex or a parameter number from 1 to N.
@@ -566,6 +663,27 @@ const char *LLVMGetModuleIdentifier(LLVMModuleRef M, size_t *Len);
void LLVMSetModuleIdentifier(LLVMModuleRef M, const char *Ident, size_t Len);
/**
+ * Obtain the module's original source file name.
+ *
+ * @param M Module to obtain the name of
+ * @param Len Out parameter which holds the length of the returned string
+ * @return The original source file name of M
+ * @see Module::getSourceFileName()
+ */
+const char *LLVMGetSourceFileName(LLVMModuleRef M, size_t *Len);
+
+/**
+ * Set the original source file name of a module to a string Name with length
+ * Len.
+ *
+ * @param M The module to set the source file name of
+ * @param Name The string to set M's source file name to
+ * @param Len Length of Name
+ * @see Module::setSourceFileName()
+ */
+void LLVMSetSourceFileName(LLVMModuleRef M, const char *Name, size_t Len);
+
+/**
* Obtain the data layout for a module.
*
* @see Module::getDataLayoutStr()
@@ -599,6 +717,64 @@ const char *LLVMGetTarget(LLVMModuleRef M);
void LLVMSetTarget(LLVMModuleRef M, const char *Triple);
/**
+ * Returns the module flags as an array of flag-key-value triples. The caller
+ * is responsible for freeing this array by calling
+ * \c LLVMDisposeModuleFlagsMetadata.
+ *
+ * @see Module::getModuleFlagsMetadata()
+ */
+LLVMModuleFlagEntry *LLVMCopyModuleFlagsMetadata(LLVMModuleRef M, size_t *Len);
+
+/**
+ * Destroys module flags metadata entries.
+ */
+void LLVMDisposeModuleFlagsMetadata(LLVMModuleFlagEntry *Entries);
+
+/**
+ * Returns the flag behavior for a module flag entry at a specific index.
+ *
+ * @see Module::ModuleFlagEntry::Behavior
+ */
+LLVMModuleFlagBehavior
+LLVMModuleFlagEntriesGetFlagBehavior(LLVMModuleFlagEntry *Entries,
+ unsigned Index);
+
+/**
+ * Returns the key for a module flag entry at a specific index.
+ *
+ * @see Module::ModuleFlagEntry::Key
+ */
+const char *LLVMModuleFlagEntriesGetKey(LLVMModuleFlagEntry *Entries,
+ unsigned Index, size_t *Len);
+
+/**
+ * Returns the metadata for a module flag entry at a specific index.
+ *
+ * @see Module::ModuleFlagEntry::Val
+ */
+LLVMMetadataRef LLVMModuleFlagEntriesGetMetadata(LLVMModuleFlagEntry *Entries,
+ unsigned Index);
+
+/**
+ * Returns the value of the module flag with the given key, or NULL if no
+ * flag with that key exists.
+ *
+ * @see Module::getModuleFlag()
+ */
+LLVMMetadataRef LLVMGetModuleFlag(LLVMModuleRef M,
+ const char *Key, size_t KeyLen);
+
+/**
+ * Add a module-level flag to the module-level flags metadata if it doesn't
+ * already exist.
+ *
+ * @see Module::addModuleFlag()
+ */
+void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior,
+ const char *Key, size_t KeyLen,
+ LLVMMetadataRef Val);
+
+/**
* Dump a representation of a module to stderr.
*
* @see Module::dump()
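A short sketch of how these flag accessors compose, assuming M is a module obtained elsewhere; the "Dwarf Version" key is only an example of a commonly present flag:

#include <stdio.h>
#include <string.h>
#include "llvm-c/Core.h"

/* Illustrative only: list every module flag, then look one up by key. */
static void dumpModuleFlags(LLVMModuleRef M) {
  size_t NumEntries = 0;
  LLVMModuleFlagEntry *Entries = LLVMCopyModuleFlagsMetadata(M, &NumEntries);
  for (size_t I = 0; I < NumEntries; ++I) {
    size_t KeyLen = 0;
    const char *Key = LLVMModuleFlagEntriesGetKey(Entries, (unsigned)I, &KeyLen);
    LLVMModuleFlagBehavior B =
        LLVMModuleFlagEntriesGetFlagBehavior(Entries, (unsigned)I);
    printf("flag %.*s (behavior %d)\n", (int)KeyLen, Key, (int)B);
  }
  LLVMDisposeModuleFlagsMetadata(Entries);

  /* Direct lookup; returns NULL when the key is absent. */
  LLVMMetadataRef Dwarf = LLVMGetModuleFlag(M, "Dwarf Version",
                                            strlen("Dwarf Version"));
  (void)Dwarf;
}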
@@ -623,11 +799,36 @@ LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
char *LLVMPrintModuleToString(LLVMModuleRef M);
/**
+ * Get inline assembly for a module.
+ *
+ * @see Module::getModuleInlineAsm()
+ */
+const char *LLVMGetModuleInlineAsm(LLVMModuleRef M, size_t *Len);
+
+/**
* Set inline assembly for a module.
*
* @see Module::setModuleInlineAsm()
*/
-void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm);
+void LLVMSetModuleInlineAsm2(LLVMModuleRef M, const char *Asm, size_t Len);
+
+/**
+ * Append inline assembly to a module.
+ *
+ * @see Module::appendModuleInlineAsm()
+ */
+void LLVMAppendModuleInlineAsm(LLVMModuleRef M, const char *Asm, size_t Len);
+
+/**
+ * Create the specified uniqued inline asm string.
+ *
+ * @see InlineAsm::get()
+ */
+LLVMValueRef LLVMGetInlineAsm(LLVMTypeRef Ty,
+ char *AsmString, size_t AsmStringSize,
+ char *Constraints, size_t ConstraintsSize,
+ LLVMBool HasSideEffects, LLVMBool IsAlignStack,
+ LLVMInlineAsmDialect Dialect);
/**
* Obtain the context to which this module is associated.
@@ -718,6 +919,9 @@ LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn);
*/
LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn);
+/** Deprecated: Use LLVMSetModuleInlineAsm2 instead. */
+void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm);
+
/**
* @}
*/
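A sketch of the length-aware module inline-asm setters declared above, with the assembly text purely illustrative and M assumed to come from elsewhere:

#include <string.h>
#include "llvm-c/Core.h"

static void setModuleAsm(LLVMModuleRef M) {
  const char *Asm = ".globl marker\nmarker:\n";
  LLVMSetModuleInlineAsm2(M, Asm, strlen(Asm));    /* replaces any existing asm */
  LLVMAppendModuleInlineAsm(M, Asm, strlen(Asm));  /* appends another copy */

  size_t Len = 0;
  const char *Current = LLVMGetModuleInlineAsm(M, &Len);
  (void)Current; (void)Len;                        /* Len excludes the NUL */
}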
@@ -1292,14 +1496,14 @@ LLVMValueKind LLVMGetValueKind(LLVMValueRef Val);
*
* @see llvm::Value::getName()
*/
-const char *LLVMGetValueName(LLVMValueRef Val);
+const char *LLVMGetValueName2(LLVMValueRef Val, size_t *Length);
/**
* Set the string name of a value.
*
* @see llvm::Value::setName()
*/
-void LLVMSetValueName(LLVMValueRef Val, const char *Name);
+void LLVMSetValueName2(LLVMValueRef Val, const char *Name, size_t NameLen);
/**
* Dump a representation of a value to stderr.
@@ -1351,6 +1555,11 @@ LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DECLARE_VALUE_CAST)
LLVMValueRef LLVMIsAMDNode(LLVMValueRef Val);
LLVMValueRef LLVMIsAMDString(LLVMValueRef Val);
+/** Deprecated: Use LLVMGetValueName2 instead. */
+const char *LLVMGetValueName(LLVMValueRef Val);
+/** Deprecated: Use LLVMSetValueName2 instead. */
+void LLVMSetValueName(LLVMValueRef Val, const char *Name);
+
/**
* @}
*/
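The length-returning name accessors avoid a separate strlen on the caller's side; a minimal sketch, assuming Val is a value created elsewhere:

#include <stdio.h>
#include <string.h>
#include "llvm-c/Core.h"

static void renameValue(LLVMValueRef Val) {
  size_t NameLen = 0;
  const char *Name = LLVMGetValueName2(Val, &NameLen);
  printf("old name: %.*s\n", (int)NameLen, Name);
  LLVMSetValueName2(Val, "renamed", strlen("renamed"));
}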
@@ -1793,10 +2002,12 @@ LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList,
LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant,
LLVMValueRef ElementValueConstant,
unsigned *IdxList, unsigned NumIdx);
+LLVMValueRef LLVMBlockAddress(LLVMValueRef F, LLVMBasicBlockRef BB);
+
+/** Deprecated: Use LLVMGetInlineAsm instead. */
LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty,
const char *AsmString, const char *Constraints,
LLVMBool HasSideEffects, LLVMBool IsAlignStack);
-LLVMValueRef LLVMBlockAddress(LLVMValueRef F, LLVMBasicBlockRef BB);
/**
* @}
@@ -1823,7 +2034,12 @@ LLVMVisibility LLVMGetVisibility(LLVMValueRef Global);
void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz);
LLVMDLLStorageClass LLVMGetDLLStorageClass(LLVMValueRef Global);
void LLVMSetDLLStorageClass(LLVMValueRef Global, LLVMDLLStorageClass Class);
+LLVMUnnamedAddr LLVMGetUnnamedAddress(LLVMValueRef Global);
+void LLVMSetUnnamedAddress(LLVMValueRef Global, LLVMUnnamedAddr UnnamedAddr);
+
+/** Deprecated: Use LLVMGetUnnamedAddress instead. */
LLVMBool LLVMHasUnnamedAddr(LLVMValueRef Global);
+/** Deprecated: Use LLVMSetUnnamedAddress instead. */
void LLVMSetUnnamedAddr(LLVMValueRef Global, LLVMBool HasUnnamedAddr);
/**
@@ -1902,6 +2118,56 @@ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
const char *Name);
/**
+ * Obtain a GlobalAlias value from a Module by its name.
+ *
+ * The returned value corresponds to a llvm::GlobalAlias value.
+ *
+ * @see llvm::Module::getNamedAlias()
+ */
+LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M,
+ const char *Name, size_t NameLen);
+
+/**
+ * Obtain an iterator to the first GlobalAlias in a Module.
+ *
+ * @see llvm::Module::alias_begin()
+ */
+LLVMValueRef LLVMGetFirstGlobalAlias(LLVMModuleRef M);
+
+/**
+ * Obtain an iterator to the last GlobalAlias in a Module.
+ *
+ * @see llvm::Module::alias_end()
+ */
+LLVMValueRef LLVMGetLastGlobalAlias(LLVMModuleRef M);
+
+/**
+ * Advance a GlobalAlias iterator to the next GlobalAlias.
+ *
+ * Returns NULL if the iterator was already at the end and there are no more
+ * global aliases.
+ */
+LLVMValueRef LLVMGetNextGlobalAlias(LLVMValueRef GA);
+
+/**
+ * Decrement a GlobalAlias iterator to the previous GlobalAlias.
+ *
+ * Returns NULL if the iterator was already at the beginning and there are
+ * no previous global aliases.
+ */
+LLVMValueRef LLVMGetPreviousGlobalAlias(LLVMValueRef GA);
+
+/**
+ * Retrieve the target value of an alias.
+ */
+LLVMValueRef LLVMAliasGetAliasee(LLVMValueRef Alias);
+
+/**
+ * Set the target value of an alias.
+ */
+void LLVMAliasSetAliasee(LLVMValueRef Alias, LLVMValueRef Aliasee);
+
+/**
* @}
*/
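A sketch of walking the alias list with the iterators above, assuming M comes from elsewhere and using the long-standing LLVMPrintValueToString/LLVMDisposeMessage helpers from Core.h:

#include <stdio.h>
#include "llvm-c/Core.h"

static void visitAliases(LLVMModuleRef M) {
  for (LLVMValueRef GA = LLVMGetFirstGlobalAlias(M); GA;
       GA = LLVMGetNextGlobalAlias(GA)) {
    LLVMValueRef Target = LLVMAliasGetAliasee(GA);
    char *Str = LLVMPrintValueToString(Target);  /* textual form of the aliasee */
    printf("alias target: %s\n", Str);
    LLVMDisposeMessage(Str);
  }
}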
@@ -2523,11 +2789,12 @@ LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst);
/**
* Obtain the argument count for a call instruction.
*
- * This expects an LLVMValueRef that corresponds to a llvm::CallInst or
- * llvm::InvokeInst.
+ * This expects an LLVMValueRef that corresponds to a llvm::CallInst,
+ * llvm::InvokeInst, or llvm::FuncletPadInst.
*
* @see llvm::CallInst::getNumArgOperands()
* @see llvm::InvokeInst::getNumArgOperands()
+ * @see llvm::FuncletPadInst::getNumArgOperands()
*/
unsigned LLVMGetNumArgOperands(LLVMValueRef Instr);
@@ -2612,9 +2879,12 @@ LLVMBasicBlockRef LLVMGetNormalDest(LLVMValueRef InvokeInst);
/**
* Return the unwind destination basic block.
*
- * This only works on llvm::InvokeInst instructions.
+ * Works on llvm::InvokeInst, llvm::CleanupReturnInst, and
+ * llvm::CatchSwitchInst instructions.
*
* @see llvm::InvokeInst::getUnwindDest()
+ * @see llvm::CleanupReturnInst::getUnwindDest()
+ * @see llvm::CatchSwitchInst::getUnwindDest()
*/
LLVMBasicBlockRef LLVMGetUnwindDest(LLVMValueRef InvokeInst);
@@ -2630,9 +2900,12 @@ void LLVMSetNormalDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
/**
* Set the unwind destination basic block.
*
- * This only works on llvm::InvokeInst instructions.
+ * Works on llvm::InvokeInst, llvm::CleanupReturnInst, and
+ * llvm::CatchSwitchInst instructions.
*
* @see llvm::InvokeInst::setUnwindDest()
+ * @see llvm::CleanupReturnInst::setUnwindDest()
+ * @see llvm::CatchSwitchInst::setUnwindDest()
*/
void LLVMSetUnwindDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
@@ -2861,11 +3134,26 @@ LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef, LLVMValueRef Fn,
LLVMValueRef *Args, unsigned NumArgs,
LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
const char *Name);
+LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef);
+
+/* Exception Handling */
+LLVMValueRef LLVMBuildResume(LLVMBuilderRef B, LLVMValueRef Exn);
LLVMValueRef LLVMBuildLandingPad(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef PersFn, unsigned NumClauses,
const char *Name);
-LLVMValueRef LLVMBuildResume(LLVMBuilderRef B, LLVMValueRef Exn);
-LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef);
+LLVMValueRef LLVMBuildCleanupRet(LLVMBuilderRef B, LLVMValueRef CatchPad,
+ LLVMBasicBlockRef BB);
+LLVMValueRef LLVMBuildCatchRet(LLVMBuilderRef B, LLVMValueRef CatchPad,
+ LLVMBasicBlockRef BB);
+LLVMValueRef LLVMBuildCatchPad(LLVMBuilderRef B, LLVMValueRef ParentPad,
+ LLVMValueRef *Args, unsigned NumArgs,
+ const char *Name);
+LLVMValueRef LLVMBuildCleanupPad(LLVMBuilderRef B, LLVMValueRef ParentPad,
+ LLVMValueRef *Args, unsigned NumArgs,
+ const char *Name);
+LLVMValueRef LLVMBuildCatchSwitch(LLVMBuilderRef B, LLVMValueRef ParentPad,
+ LLVMBasicBlockRef UnwindBB,
+ unsigned NumHandlers, const char *Name);
/* Add a case to the switch instruction */
void LLVMAddCase(LLVMValueRef Switch, LLVMValueRef OnVal,
@@ -2889,6 +3177,51 @@ LLVMBool LLVMIsCleanup(LLVMValueRef LandingPad);
/* Set the 'cleanup' flag in the landingpad instruction */
void LLVMSetCleanup(LLVMValueRef LandingPad, LLVMBool Val);
+/* Add a destination to the catchswitch instruction */
+void LLVMAddHandler(LLVMValueRef CatchSwitch, LLVMBasicBlockRef Dest);
+
+/* Get the number of handlers on the catchswitch instruction */
+unsigned LLVMGetNumHandlers(LLVMValueRef CatchSwitch);
+
+/**
+ * Obtain the basic blocks acting as handlers for a catchswitch instruction.
+ *
+ * The Handlers parameter should point to a pre-allocated array of
+ * LLVMBasicBlockRefs at least LLVMGetNumHandlers() large. On return, the
+ * first LLVMGetNumHandlers() entries in the array will be populated
+ * with LLVMBasicBlockRef instances.
+ *
+ * @param CatchSwitch The catchswitch instruction to operate on.
+ * @param Handlers Memory address of an array to be filled with basic blocks.
+ */
+void LLVMGetHandlers(LLVMValueRef CatchSwitch, LLVMBasicBlockRef *Handlers);
+
+/* Funclets */
+
+/* Get a funcletpad argument at the given index. */
+LLVMValueRef LLVMGetArgOperand(LLVMValueRef Funclet, unsigned i);
+
+/* Set a funcletpad argument at the given index. */
+void LLVMSetArgOperand(LLVMValueRef Funclet, unsigned i, LLVMValueRef value);
+
+/**
+ * Get the parent catchswitch instruction of a catchpad instruction.
+ *
+ * This only works on llvm::CatchPadInst instructions.
+ *
+ * @see llvm::CatchPadInst::getCatchSwitch()
+ */
+LLVMValueRef LLVMGetParentCatchSwitch(LLVMValueRef CatchPad);
+
+/**
+ * Set the parent catchswitch instruction of a catchpad instruction.
+ *
+ * This only works on llvm::CatchPadInst instructions.
+ *
+ * @see llvm::CatchPadInst::setCatchSwitch()
+ */
+void LLVMSetParentCatchSwitch(LLVMValueRef CatchPad, LLVMValueRef CatchSwitch);
+
/* Arithmetic */
LLVMValueRef LLVMBuildAdd(LLVMBuilderRef, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name);
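A sketch of the two-call pattern for querying catchswitch handlers described above; CatchSwitch is assumed to be a catchswitch instruction obtained elsewhere:

#include <stdlib.h>
#include "llvm-c/Core.h"

static void collectHandlers(LLVMValueRef CatchSwitch) {
  unsigned N = LLVMGetNumHandlers(CatchSwitch);
  LLVMBasicBlockRef *Handlers = malloc(N * sizeof(LLVMBasicBlockRef));
  LLVMGetHandlers(CatchSwitch, Handlers);  /* fills the first N entries */
  /* ... inspect Handlers[0..N-1] ... */
  free(Handlers);
}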
@@ -3186,7 +3519,7 @@ LLVMBool LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM);
@see llvm::FunctionPassManager::run(Function&) */
LLVMBool LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F);
-/** Finalizes all of the function passes scheduled in in the function pass
+/** Finalizes all of the function passes scheduled in the function pass
manager. Returns 1 if any of the passes modified the module, 0 otherwise.
@see llvm::FunctionPassManager::doFinalization */
LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM);
diff --git a/contrib/llvm/include/llvm-c/DataTypes.h b/contrib/llvm/include/llvm-c/DataTypes.h
new file mode 100644
index 000000000000..7081c83ffc2b
--- /dev/null
+++ b/contrib/llvm/include/llvm-c/DataTypes.h
@@ -0,0 +1,90 @@
+/*===-- include/llvm-c/DataTypes.h - Define fixed size types ------*- C -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file contains definitions to figure out the size of _HOST_ data types.*|
+|* This file is important because different host OS's define different macros,*|
+|* which makes portability tough. This file exports the following *|
+|* definitions: *|
+|* *|
+|* [u]int(32|64)_t : typedefs for signed and unsigned 32/64 bit system types*|
+|* [U]INT(8|16|32|64)_(MIN|MAX) : Constants for the min and max values. *|
+|* *|
+|* No library is required when using these functions. *|
+|* *|
+|*===----------------------------------------------------------------------===*/
+
+/* Please leave this file C-compatible. */
+
+#ifndef LLVM_C_DATATYPES_H
+#define LLVM_C_DATATYPES_H
+
+#ifdef __cplusplus
+#include <cmath>
+#else
+#include <math.h>
+#endif
+
+#include <inttypes.h>
+#include <stdint.h>
+
+#ifndef _MSC_VER
+
+#if !defined(UINT32_MAX)
+# error "The standard header <cstdint> is not C++11 compliant. Must #define "\
+ "__STDC_LIMIT_MACROS before #including llvm-c/DataTypes.h"
+#endif
+
+#if !defined(UINT32_C)
+# error "The standard header <cstdint> is not C++11 compliant. Must #define "\
+ "__STDC_CONSTANT_MACROS before #including llvm-c/DataTypes.h"
+#endif
+
+/* Note that <inttypes.h> includes <stdint.h>, if this is a C99 system. */
+#include <sys/types.h>
+
+#ifdef _AIX
+// GCC is strict about defining large constants: they must have LL modifier.
+#undef INT64_MAX
+#undef INT64_MIN
+#endif
+
+#else /* _MSC_VER */
+#ifdef __cplusplus
+#include <cstddef>
+#include <cstdlib>
+#else
+#include <stddef.h>
+#include <stdlib.h>
+#endif
+#include <sys/types.h>
+
+#if defined(_WIN64)
+typedef signed __int64 ssize_t;
+#else
+typedef signed int ssize_t;
+#endif /* _WIN64 */
+
+#endif /* _MSC_VER */
+
+/* Set defaults for constants which we cannot find. */
+#if !defined(INT64_MAX)
+# define INT64_MAX 9223372036854775807LL
+#endif
+#if !defined(INT64_MIN)
+# define INT64_MIN ((-INT64_MAX)-1)
+#endif
+#if !defined(UINT64_MAX)
+# define UINT64_MAX 0xffffffffffffffffULL
+#endif
+
+#ifndef HUGE_VALF
+#define HUGE_VALF (float)HUGE_VAL
+#endif
+
+#endif /* LLVM_C_DATATYPES_H */
diff --git a/contrib/llvm/include/llvm-c/DebugInfo.h b/contrib/llvm/include/llvm-c/DebugInfo.h
index d17c690be4da..cee6755f1874 100644
--- a/contrib/llvm/include/llvm-c/DebugInfo.h
+++ b/contrib/llvm/include/llvm-c/DebugInfo.h
@@ -52,6 +52,11 @@ typedef enum {
LLVMDIFlagBitField = 1 << 19,
LLVMDIFlagNoReturn = 1 << 20,
LLVMDIFlagMainSubprogram = 1 << 21,
+ LLVMDIFlagTypePassByValue = 1 << 22,
+ LLVMDIFlagTypePassByReference = 1 << 23,
+ LLVMDIFlagFixedEnum = 1 << 24,
+ LLVMDIFlagThunk = 1 << 25,
+ LLVMDIFlagTrivial = 1 << 26,
LLVMDIFlagIndirectVirtualBase = (1 << 2) | (1 << 5),
LLVMDIFlagAccessibility = LLVMDIFlagPrivate | LLVMDIFlagProtected |
LLVMDIFlagPublic,
@@ -120,6 +125,11 @@ typedef enum {
} LLVMDWARFEmissionKind;
/**
+ * An LLVM DWARF type encoding.
+ */
+typedef unsigned LLVMDWARFTypeEncoding;
+
+/**
* The current debug metadata version number.
*/
unsigned LLVMDebugMetadataVersion(void);
@@ -211,6 +221,158 @@ LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename,
size_t DirectoryLen);
/**
+ * Creates a new descriptor for a module with the specified parent scope.
+ * \param Builder The \c DIBuilder.
+ * \param ParentScope The parent scope containing this module declaration.
+ * \param Name Module name.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param ConfigMacros A space-separated shell-quoted list of -D macro
+ definitions as they would appear on a command line.
+ * \param ConfigMacrosLen The length of the C string passed to \c ConfigMacros.
+ * \param IncludePath The path to the module map file.
+ * \param IncludePathLen The length of the C string passed to \c IncludePath.
+ * \param ISysRoot The Clang system root (value of -isysroot).
+ * \param ISysRootLen The length of the C string passed to \c ISysRoot.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateModule(LLVMDIBuilderRef Builder, LLVMMetadataRef ParentScope,
+ const char *Name, size_t NameLen,
+ const char *ConfigMacros, size_t ConfigMacrosLen,
+ const char *IncludePath, size_t IncludePathLen,
+ const char *ISysRoot, size_t ISysRootLen);
+
+/**
+ * Creates a new descriptor for a namespace with the specified parent scope.
+ * \param Builder The \c DIBuilder.
+ * \param ParentScope The parent scope containing this module declaration.
+ * \param Name NameSpace name.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param ExportSymbols Whether or not the namespace exports symbols, e.g.
+ * this is true of C++ inline namespaces.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateNameSpace(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef ParentScope,
+ const char *Name, size_t NameLen,
+ LLVMBool ExportSymbols);
+
+/**
+ * Create a new descriptor for the specified subprogram.
+ * \param Builder The \c DIBuilder.
+ * \param Scope Function scope.
+ * \param Name Function name.
+ * \param NameLen Length of function name.
+ * \param LinkageName Mangled function name.
+ * \param LinkageNameLen Length of linkage name.
+ * \param File File where this variable is defined.
+ * \param LineNo Line number.
+ * \param Ty Function type.
+ * \param IsLocalToUnit True if this function is not externally visible.
+ * \param IsDefinition True if this is a function definition.
+ * \param ScopeLine Set to the beginning of the scope this starts
+ * \param Flags E.g.: \c LLVMDIFlagLValueReference. These flags are
+ * used to emit dwarf attributes.
+ * \param IsOptimized True if optimization is ON.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateFunction(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, const char *LinkageName, size_t LinkageNameLen,
+ LLVMMetadataRef File, unsigned LineNo, LLVMMetadataRef Ty,
+ LLVMBool IsLocalToUnit, LLVMBool IsDefinition,
+ unsigned ScopeLine, LLVMDIFlags Flags, LLVMBool IsOptimized);
+
+/**
+ * Create a descriptor for a lexical block with the specified parent context.
+ * \param Builder The \c DIBuilder.
+ * \param Scope Parent lexical block.
+ * \param File Source file.
+ * \param Line The line in the source file.
+ * \param Column The column in the source file.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateLexicalBlock(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope,
+ LLVMMetadataRef File, unsigned Line, unsigned Column);
+
+/**
+ * Create a descriptor for a lexical block with a new file attached.
+ * \param Builder The \c DIBuilder.
+ * \param Scope Lexical block.
+ * \param File Source file.
+ * \param Discriminator DWARF path discriminator value.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef File,
+ unsigned Discriminator);
+
+/**
+ * Create a descriptor for an imported namespace. Suitable for e.g. C++
+ * using declarations.
+ * \param Builder The \c DIBuilder.
+ * \param Scope The scope this module is imported into
+ * \param File File where the declaration is located.
+ * \param Line Line number of the declaration.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedModuleFromNamespace(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef NS,
+ LLVMMetadataRef File,
+ unsigned Line);
+
+/**
+ * Create a descriptor for an imported module that aliases another
+ * imported entity descriptor.
+ * \param Builder The \c DIBuilder.
+ * \param Scope The scope this module is imported into
+ * \param ImportedEntity Previous imported entity to alias.
+ * \param File File where the declaration is located.
+ * \param Line Line number of the declaration.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedModuleFromAlias(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef ImportedEntity,
+ LLVMMetadataRef File,
+ unsigned Line);
+
+/**
+ * Create a descriptor for an imported module.
+ * \param Builder The \c DIBuilder.
+ * \param Scope The scope this module is imported into
+ * \param M The module being imported here
+ * \param File File where the declaration is located.
+ * \param Line Line number of the declaration.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedModuleFromModule(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef M,
+ LLVMMetadataRef File,
+ unsigned Line);
+
+/**
+ * Create a descriptor for an imported function, type, or variable. Suitable
+ * for e.g. FORTRAN-style USE declarations.
+ * \param Builder The DIBuilder.
+ * \param Scope The scope this module is imported into.
+ * \param Decl The declaration (or definition) of a function, type,
+ or variable.
+ * \param File File where the declaration is located.
+ * \param Line Line number of the declaration.
+ * \param Name A name that uniquely identifies this imported declaration.
+ * \param NameLen The length of the C string passed to \c Name.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedDeclaration(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef Decl,
+ LLVMMetadataRef File,
+ unsigned Line,
+ const char *Name, size_t NameLen);
+
+/**
* Creates a new DebugLocation that describes a source location.
* \param Line The line in the source file.
* \param Column The column in the source file.
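A sketch of how these creation routines chain together, assuming the DIBuilder itself comes from the LLVMCreateDIBuilder entry point declared elsewhere in this header, that the result is attached to a compile unit separately, and that LLVMDIFlagZero from the flag enum above denotes "no flags"; all names are illustrative:

#include <string.h>
#include "llvm-c/DebugInfo.h"

static LLVMMetadataRef describeMain(LLVMDIBuilderRef DIB) {
  LLVMMetadataRef File =
      LLVMDIBuilderCreateFile(DIB, "demo.c", strlen("demo.c"), ".", strlen("."));

  /* A void() subroutine type: slot 0 is the return type, NULL meaning void. */
  LLVMMetadataRef RetAndParams[] = { NULL };
  LLVMMetadataRef FnTy = LLVMDIBuilderCreateSubroutineType(
      DIB, File, RetAndParams, 1, LLVMDIFlagZero);

  return LLVMDIBuilderCreateFunction(
      DIB, /*Scope=*/File, "main", strlen("main"),
      /*LinkageName=*/"main", strlen("main"), File, /*LineNo=*/1, FnTy,
      /*IsLocalToUnit=*/0, /*IsDefinition=*/1, /*ScopeLine=*/1,
      LLVMDIFlagZero, /*IsOptimized=*/0);
}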
@@ -225,6 +387,768 @@ LLVMDIBuilderCreateDebugLocation(LLVMContextRef Ctx, unsigned Line,
unsigned Column, LLVMMetadataRef Scope,
LLVMMetadataRef InlinedAt);
+/**
+ * Get the line number of this debug location.
+ * \param Location The debug location.
+ *
+ * @see DILocation::getLine()
+ */
+unsigned LLVMDILocationGetLine(LLVMMetadataRef Location);
+
+/**
+ * Get the column number of this debug location.
+ * \param Location The debug location.
+ *
+ * @see DILocation::getColumn()
+ */
+unsigned LLVMDILocationGetColumn(LLVMMetadataRef Location);
+
+/**
+ * Get the local scope associated with this debug location.
+ * \param Location The debug location.
+ *
+ * @see DILocation::getScope()
+ */
+LLVMMetadataRef LLVMDILocationGetScope(LLVMMetadataRef Location);
+
+/**
+ * Create a type array.
+ * \param Builder The DIBuilder.
+ * \param Data The type elements.
+ * \param NumElements Number of type elements.
+ */
+LLVMMetadataRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef *Data,
+ size_t NumElements);
+
+/**
+ * Create subroutine type.
+ * \param Builder The DIBuilder.
+ * \param File The file in which the subroutine resides.
+ * \param ParameterTypes An array of subroutine parameter types. This
+ * includes return type at 0th index.
+ * \param NumParameterTypes The number of parameter types in \c ParameterTypes
+ * \param Flags E.g.: \c LLVMDIFlagLValueReference.
+ * These flags are used to emit dwarf attributes.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef File,
+ LLVMMetadataRef *ParameterTypes,
+ unsigned NumParameterTypes,
+ LLVMDIFlags Flags);
+
+/**
+ * Create debugging information entry for an enumeration.
+ * \param Builder The DIBuilder.
+ * \param Scope Scope in which this enumeration is defined.
+ * \param Name Enumeration name.
+ * \param NameLen Length of enumeration name.
+ * \param File File where this member is defined.
+ * \param LineNumber Line number.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param Elements Enumeration elements.
+ * \param NumElements Number of enumeration elements.
+ * \param ClassTy Underlying type of a C++11/ObjC fixed enum.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateEnumerationType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, LLVMMetadataRef *Elements,
+ unsigned NumElements, LLVMMetadataRef ClassTy);
+
+/**
+ * Create debugging information entry for a union.
+ * \param Builder The DIBuilder.
+ * \param Scope Scope in which this union is defined.
+ * \param Name Union name.
+ * \param NameLen Length of union name.
+ * \param File File where this member is defined.
+ * \param LineNumber Line number.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param Flags Flags to encode member attribute, e.g. private
+ * \param Elements Union elements.
+ * \param NumElements Number of union elements.
+ * \param RunTimeLang Optional parameter, Objective-C runtime version.
+ * \param UniqueId A unique identifier for the union.
+ * \param UniqueIdLen Length of unique identifier.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateUnionType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef *Elements, unsigned NumElements, unsigned RunTimeLang,
+ const char *UniqueId, size_t UniqueIdLen);
+
+
+/**
+ * Create debugging information entry for an array.
+ * \param Builder The DIBuilder.
+ * \param Size Array size.
+ * \param AlignInBits Alignment.
+ * \param Ty Element type.
+ * \param Subscripts Subscripts.
+ * \param NumSubscripts Number of subscripts.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef Builder, uint64_t Size,
+ uint32_t AlignInBits, LLVMMetadataRef Ty,
+ LLVMMetadataRef *Subscripts,
+ unsigned NumSubscripts);
+
+/**
+ * Create debugging information entry for a vector type.
+ * \param Builder The DIBuilder.
+ * \param Size Vector size.
+ * \param AlignInBits Alignment.
+ * \param Ty Element type.
+ * \param Subscripts Subscripts.
+ * \param NumSubscripts Number of subscripts.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateVectorType(LLVMDIBuilderRef Builder, uint64_t Size,
+ uint32_t AlignInBits, LLVMMetadataRef Ty,
+ LLVMMetadataRef *Subscripts,
+ unsigned NumSubscripts);
+
+/**
+ * Create a DWARF unspecified type.
+ * \param Builder The DIBuilder.
+ * \param Name The unspecified type's name.
+ * \param NameLen Length of type name.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateUnspecifiedType(LLVMDIBuilderRef Builder, const char *Name,
+ size_t NameLen);
+
+/**
+ * Create debugging information entry for a basic
+ * type.
+ * \param Builder The DIBuilder.
+ * \param Name Type name.
+ * \param NameLen Length of type name.
+ * \param SizeInBits Size of the type.
+ * \param Encoding DWARF encoding code, e.g. \c LLVMDWARFTypeEncoding_float.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef Builder, const char *Name,
+ size_t NameLen, uint64_t SizeInBits,
+ LLVMDWARFTypeEncoding Encoding);
+
+/**
+ * Create debugging information entry for a pointer.
+ * \param Builder The DIBuilder.
+ * \param PointeeTy Type pointed by this pointer.
+ * \param SizeInBits Size.
+ * \param AlignInBits Alignment. (optional, pass 0 to ignore)
+ * \param AddressSpace DWARF address space. (optional, pass 0 to ignore)
+ * \param Name Pointer type name. (optional)
+ * \param NameLen Length of pointer type name. (optional)
+ */
+LLVMMetadataRef LLVMDIBuilderCreatePointerType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef PointeeTy,
+ uint64_t SizeInBits, uint32_t AlignInBits, unsigned AddressSpace,
+ const char *Name, size_t NameLen);
+
+/**
+ * Create debugging information entry for a struct.
+ * \param Builder The DIBuilder.
+ * \param Scope Scope in which this struct is defined.
+ * \param Name Struct name.
+ * \param NameLen Struct name length.
+ * \param File File where this member is defined.
+ * \param LineNumber Line number.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param Flags Flags to encode member attribute, e.g. private
+ * \param Elements Struct elements.
+ * \param NumElements Number of struct elements.
+ * \param RunTimeLang Optional parameter, Objective-C runtime version.
+ * \param VTableHolder The object containing the vtable for the struct.
+ * \param UniqueId A unique identifier for the struct.
+ * \param UniqueIdLen Length of the unique identifier for the struct.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateStructType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef DerivedFrom, LLVMMetadataRef *Elements,
+ unsigned NumElements, unsigned RunTimeLang, LLVMMetadataRef VTableHolder,
+ const char *UniqueId, size_t UniqueIdLen);
+
+/**
+ * Create debugging information entry for a member.
+ * \param Builder The DIBuilder.
+ * \param Scope Member scope.
+ * \param Name Member name.
+ * \param NameLen Length of member name.
+ * \param File File where this member is defined.
+ * \param LineNo Line number.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param OffsetInBits Member offset.
+ * \param Flags Flags to encode member attribute, e.g. private
+ * \param Ty Parent type.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateMemberType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNo,
+ uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
+ LLVMDIFlags Flags, LLVMMetadataRef Ty);
+
+/**
+ * Create debugging information entry for a
+ * C++ static data member.
+ * \param Builder The DIBuilder.
+ * \param Scope Member scope.
+ * \param Name Member name.
+ * \param NameLen Length of member name.
+ * \param File File where this member is declared.
+ * \param LineNumber Line number.
+ * \param Type Type of the static member.
+ * \param Flags Flags to encode member attribute, e.g. private.
+ * \param ConstantVal Const initializer of the member.
+ * \param AlignInBits Member alignment.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateStaticMemberType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ LLVMMetadataRef Type, LLVMDIFlags Flags, LLVMValueRef ConstantVal,
+ uint32_t AlignInBits);
+
+/**
+ * Create debugging information entry for a pointer to member.
+ * \param Builder The DIBuilder.
+ * \param PointeeType Type pointed to by this pointer.
+ * \param ClassType The class type whose members this pointer points to.
+ * \param SizeInBits Size.
+ * \param AlignInBits Alignment.
+ * \param Flags Flags.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateMemberPointerType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef PointeeType,
+ LLVMMetadataRef ClassType,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ LLVMDIFlags Flags);
+/**
+ * Create debugging information entry for Objective-C instance variable.
+ * \param Builder The DIBuilder.
+ * \param Name Member name.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param File File where this member is defined.
+ * \param LineNo Line number.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param OffsetInBits Member offset.
+ * \param Flags Flags to encode member attribute, e.g. private
+ * \param Ty Parent type.
+ * \param PropertyNode Property associated with this ivar.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateObjCIVar(LLVMDIBuilderRef Builder,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNo,
+ uint64_t SizeInBits, uint32_t AlignInBits,
+ uint64_t OffsetInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef Ty, LLVMMetadataRef PropertyNode);
+
+/**
+ * Create debugging information entry for Objective-C property.
+ * \param Builder The DIBuilder.
+ * \param Name Property name.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param File File where this property is defined.
+ * \param LineNo Line number.
+ * \param GetterName Name of the Objective C property getter selector.
+ * \param GetterNameLen The length of the C string passed to \c GetterName.
+ * \param SetterName Name of the Objective C property setter selector.
+ * \param SetterNameLen The length of the C string passed to \c SetterName.
+ * \param PropertyAttributes Objective C property attributes.
+ * \param Ty Type.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateObjCProperty(LLVMDIBuilderRef Builder,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNo,
+ const char *GetterName, size_t GetterNameLen,
+ const char *SetterName, size_t SetterNameLen,
+ unsigned PropertyAttributes,
+ LLVMMetadataRef Ty);
+
+/**
+ * Create a uniqued DIType* clone with FlagObjectPointer and FlagArtificial set.
+ * \param Builder The DIBuilder.
+ * \param Type The underlying type to which this pointer points.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Type);
+
+/**
+ * Create debugging information entry for a qualified
+ * type, e.g. 'const int'.
+ * \param Builder The DIBuilder.
+ * \param Tag Tag identifying type,
+ * e.g. LLVMDWARFTypeQualifier_volatile_type
+ * \param Type Base Type.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateQualifiedType(LLVMDIBuilderRef Builder, unsigned Tag,
+ LLVMMetadataRef Type);
+
+/**
+ * Create debugging information entry for a c++
+ * style reference or rvalue reference type.
+ * \param Builder The DIBuilder.
+ * \param Tag Tag identifying type,
+ * \param Type Base Type.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateReferenceType(LLVMDIBuilderRef Builder, unsigned Tag,
+ LLVMMetadataRef Type);
+
+/**
+ * Create C++11 nullptr type.
+ * \param Builder The DIBuilder.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateNullPtrType(LLVMDIBuilderRef Builder);
+
+/**
+ * Create debugging information entry for a typedef.
+ * \param Builder The DIBuilder.
+ * \param Type Original type.
+ * \param Name Typedef name.
+ * \param File File where this type is defined.
+ * \param LineNo Line number.
+ * \param Scope The surrounding context for the typedef.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Builder, LLVMMetadataRef Type,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNo,
+ LLVMMetadataRef Scope);
+
+/**
+ * Create debugging information entry to establish inheritance relationship
+ * between two types.
+ * \param Builder The DIBuilder.
+ * \param Ty Original type.
+ * \param BaseTy Base type. Ty inherits from base.
+ * \param BaseOffset Base offset.
+ * \param VBPtrOffset Virtual base pointer offset.
+ * \param Flags Flags to describe inheritance attribute, e.g. private
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateInheritance(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Ty, LLVMMetadataRef BaseTy,
+ uint64_t BaseOffset, uint32_t VBPtrOffset,
+ LLVMDIFlags Flags);
+
+/**
+ * Create a permanent forward-declared type.
+ * \param Builder The DIBuilder.
+ * \param Tag A unique tag for this type.
+ * \param Name Type name.
+ * \param NameLen Length of type name.
+ * \param Scope Type scope.
+ * \param File File where this type is defined.
+ * \param Line Line number where this type is defined.
+ * \param RuntimeLang Indicates runtime version for languages like
+ * Objective-C.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param UniqueIdentifier A unique identifier for the type.
+ * \param UniqueIdentifierLen Length of the unique identifier.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateForwardDecl(
+ LLVMDIBuilderRef Builder, unsigned Tag, const char *Name,
+ size_t NameLen, LLVMMetadataRef Scope, LLVMMetadataRef File, unsigned Line,
+ unsigned RuntimeLang, uint64_t SizeInBits, uint32_t AlignInBits,
+ const char *UniqueIdentifier, size_t UniqueIdentifierLen);
+
+/**
+ * Create a temporary forward-declared type.
+ * \param Builder The DIBuilder.
+ * \param Tag A unique tag for this type.
+ * \param Name Type name.
+ * \param NameLen Length of type name.
+ * \param Scope Type scope.
+ * \param File File where this type is defined.
+ * \param Line Line number where this type is defined.
+ * \param RuntimeLang Indicates runtime version for languages like
+ * Objective-C.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param Flags Flags.
+ * \param UniqueIdentifier A unique identifier for the type.
+ * \param UniqueIdentifierLen Length of the unique identifier.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateReplaceableCompositeType(
+ LLVMDIBuilderRef Builder, unsigned Tag, const char *Name,
+ size_t NameLen, LLVMMetadataRef Scope, LLVMMetadataRef File, unsigned Line,
+ unsigned RuntimeLang, uint64_t SizeInBits, uint32_t AlignInBits,
+ LLVMDIFlags Flags, const char *UniqueIdentifier,
+ size_t UniqueIdentifierLen);
+
+/**
+ * Create debugging information entry for a bit field member.
+ * \param Builder The DIBuilder.
+ * \param Scope Member scope.
+ * \param Name Member name.
+ * \param NameLen Length of member name.
+ * \param File File where this member is defined.
+ * \param LineNumber Line number.
+ * \param SizeInBits Member size.
+ * \param OffsetInBits Member offset.
+ * \param StorageOffsetInBits Member storage offset.
+ * \param Flags Flags to encode member attribute.
+ * \param Type Parent type.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateBitFieldMemberType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t OffsetInBits,
+ uint64_t StorageOffsetInBits,
+ LLVMDIFlags Flags, LLVMMetadataRef Type);
+
+/**
+ * Create debugging information entry for a class.
+ * \param Builder The DIBuilder.
+ * \param Scope Scope in which this class is defined.
+ * \param Name Class name.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param File File where this member is defined.
+ * \param LineNumber Line number.
+ * \param SizeInBits Member size.
+ * \param AlignInBits Member alignment.
+ * \param OffsetInBits Member offset.
+ * \param Flags Flags to encode member attribute, e.g. private.
+ * \param DerivedFrom Debug info of the base class of this type.
+ * \param Elements Class members.
+ * \param NumElements Number of class elements.
+ * \param VTableHolder Debug info of the base class that contains vtable
+ * for this type. This is used in
+ * DW_AT_containing_type. See DWARF documentation
+ * for more info.
+ * \param TemplateParamsNode Template type parameters.
+ * \param UniqueIdentifier A unique identifier for the type.
+ * \param UniqueIdentifierLen Length of the unique identifier.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateClassType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope, const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNumber, uint64_t SizeInBits,
+ uint32_t AlignInBits, uint64_t OffsetInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef DerivedFrom,
+ LLVMMetadataRef *Elements, unsigned NumElements,
+ LLVMMetadataRef VTableHolder, LLVMMetadataRef TemplateParamsNode,
+ const char *UniqueIdentifier, size_t UniqueIdentifierLen);
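A sketch of the class creator under simplifying assumptions: no members, base class, vtable holder or template parameters are supplied, the unique identifier is a made-up mangled name, and LLVMDIFlagZero is the zero value of the flags enum earlier in this header. A real front end would pass Elements built with the member creators above.

#include <llvm-c/DebugInfo.h>
#include <string.h>

static LLVMMetadataRef emitEmptyClass(LLVMDIBuilderRef DIB,
                                      LLVMMetadataRef Scope,
                                      LLVMMetadataRef File) {
  const char *Name = "Widget";
  const char *Id = "_ZTS6Widget"; /* placeholder unique identifier */
  return LLVMDIBuilderCreateClassType(
      DIB, Scope, Name, strlen(Name), File, /*LineNumber=*/3,
      /*SizeInBits=*/64, /*AlignInBits=*/32, /*OffsetInBits=*/0,
      LLVMDIFlagZero, /*DerivedFrom=*/NULL,
      /*Elements=*/NULL, /*NumElements=*/0,
      /*VTableHolder=*/NULL, /*TemplateParamsNode=*/NULL, Id, strlen(Id));
}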
+
+/**
+ * Create a uniqued DIType* clone with FlagArtificial set.
+ * \param Builder The DIBuilder.
+ * \param Type The underlying type.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateArtificialType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Type);
+
+/**
+ * Get the name of this DIType.
+ * \param DType The DIType.
+ * \param Length The length of the returned string.
+ *
+ * @see DIType::getName()
+ */
+const char *LLVMDITypeGetName(LLVMMetadataRef DType, size_t *Length);
+
+/**
+ * Get the size of this DIType in bits.
+ * \param DType The DIType.
+ *
+ * @see DIType::getSizeInBits()
+ */
+uint64_t LLVMDITypeGetSizeInBits(LLVMMetadataRef DType);
+
+/**
+ * Get the offset of this DIType in bits.
+ * \param DType The DIType.
+ *
+ * @see DIType::getOffsetInBits()
+ */
+uint64_t LLVMDITypeGetOffsetInBits(LLVMMetadataRef DType);
+
+/**
+ * Get the alignment of this DIType in bits.
+ * \param DType The DIType.
+ *
+ * @see DIType::getAlignInBits()
+ */
+uint32_t LLVMDITypeGetAlignInBits(LLVMMetadataRef DType);
+
+/**
+ * Get the source line where this DIType is declared.
+ * \param DType The DIType.
+ *
+ * @see DIType::getLine()
+ */
+unsigned LLVMDITypeGetLine(LLVMMetadataRef DType);
+
+/**
+ * Get the flags associated with this DIType.
+ * \param DType The DIType.
+ *
+ * @see DIType::getFlags()
+ */
+LLVMDIFlags LLVMDITypeGetFlags(LLVMMetadataRef DType);
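Taken together, these accessors make it easy to inspect a type node returned by any of the creators above; a small sketch, where DType is any DIType metadata reference:

#include <llvm-c/DebugInfo.h>
#include <stdio.h>

static void dumpDIType(LLVMMetadataRef DType) {
  size_t Len = 0;
  const char *Name = LLVMDITypeGetName(DType, &Len);
  printf("%.*s: %llu bits, align %u, line %u, flags 0x%x\n", (int)Len, Name,
         (unsigned long long)LLVMDITypeGetSizeInBits(DType),
         LLVMDITypeGetAlignInBits(DType), LLVMDITypeGetLine(DType),
         (unsigned)LLVMDITypeGetFlags(DType));
}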
+
+/**
+ * Create a descriptor for a value range.
+ * \param Builder The DIBuilder.
+ * \param LowerBound Lower bound of the subrange, e.g. 0 for C, 1 for Fortran.
+ * \param Count Count of elements in the subrange.
+ */
+LLVMMetadataRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef Builder,
+ int64_t LowerBound,
+ int64_t Count);
+
+/**
+ * Create an array of DI Nodes.
+ * \param Builder The DIBuilder.
+ * \param Data The DI Node elements.
+ * \param NumElements Number of DI Node elements.
+ */
+LLVMMetadataRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef *Data,
+ size_t NumElements);
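A short sketch of how the two calls pair up: the subrange describes one dimension of a 16-element array, and GetOrCreateArray packs DI nodes into a single tuple node of the form expected wherever an element list is needed. DIB is an existing DIBuilder.

#include <llvm-c/DebugInfo.h>

static LLVMMetadataRef makeOneDimension(LLVMDIBuilderRef DIB) {
  /* Dimension for 'int a[16]': lower bound 0, 16 elements. */
  LLVMMetadataRef Sub =
      LLVMDIBuilderGetOrCreateSubrange(DIB, /*LowerBound=*/0, /*Count=*/16);
  return LLVMDIBuilderGetOrCreateArray(DIB, &Sub, 1);
}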
+
+/**
+ * Create a new descriptor for the specified variable which has a complex
+ * address expression for its address.
+ * \param Builder The DIBuilder.
+ * \param Addr An array of complex address operations.
+ * \param Length Length of the address operation array.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Builder,
+ int64_t *Addr, size_t Length);
+
+/**
+ * Create a new descriptor for the specified variable that does not have an
+ * address, but does have a constant value.
+ * \param Builder The DIBuilder.
+ * \param Value The constant value.
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateConstantValueExpression(LLVMDIBuilderRef Builder,
+ int64_t Value);
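As a sketch, two descriptors that come up constantly: an empty expression for a value that lives directly in its storage, and a constant-value expression for a variable folded down to a literal. DIB is assumed to exist.

#include <llvm-c/DebugInfo.h>
#include <stddef.h>

static void makeExpressions(LLVMDIBuilderRef DIB) {
  /* No operations: the variable's value is exactly its storage/value. */
  LLVMMetadataRef Direct = LLVMDIBuilderCreateExpression(DIB, NULL, 0);
  /* The variable was optimized away; its value is always 42. */
  LLVMMetadataRef Folded = LLVMDIBuilderCreateConstantValueExpression(DIB, 42);
  (void)Direct;
  (void)Folded;
}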
+
+/**
+ * Create a new descriptor for the specified variable.
+ * \param Builder The DIBuilder.
+ * \param Scope Variable scope.
+ * \param Name Name of the variable.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param Linkage Mangled name of the variable.
+ * \param LinkLen The length of the C string passed to \c Linkage.
+ * \param File File where this variable is defined.
+ * \param LineNo Line number.
+ * \param Ty Variable Type.
+ * \param LocalToUnit Boolean flag indicating whether this variable is
+ *                    externally visible or not.
+ * \param Expr The location of the global relative to the attached
+ * GlobalVariable.
+ * \param Decl Reference to the corresponding declaration.
+ * \param AlignInBits Variable alignment (or 0 if no alignment attribute was
+ *                    specified).
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateGlobalVariableExpression(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ const char *Name, size_t NameLen,
+ const char *Linkage, size_t LinkLen,
+ LLVMMetadataRef File,
+ unsigned LineNo,
+ LLVMMetadataRef Ty,
+ LLVMBool LocalToUnit,
+ LLVMMetadataRef Expr,
+ LLVMMetadataRef Decl,
+ uint32_t AlignInBits);
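A sketch of describing a file-scope global named counter of type Ty; DIB, Scope, File and Ty are assumed to exist, the empty expression marks the value as living in the attached GlobalVariable, and LocalToUnit is set because the global is internal to the unit.

#include <llvm-c/DebugInfo.h>
#include <string.h>

static LLVMMetadataRef describeGlobal(LLVMDIBuilderRef DIB,
                                      LLVMMetadataRef Scope,
                                      LLVMMetadataRef File,
                                      LLVMMetadataRef Ty) {
  const char *Name = "counter";
  LLVMMetadataRef Expr = LLVMDIBuilderCreateExpression(DIB, NULL, 0);
  return LLVMDIBuilderCreateGlobalVariableExpression(
      DIB, Scope, Name, strlen(Name), /*Linkage=*/Name, strlen(Name), File,
      /*LineNo=*/7, Ty, /*LocalToUnit=*/1, Expr, /*Decl=*/NULL,
      /*AlignInBits=*/0);
}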
+/**
+ * Create a new temporary \c MDNode. Suitable for use in constructing cyclic
+ * \c MDNode structures. A temporary \c MDNode is not uniqued, may be RAUW'd,
+ * and must be manually deleted with \c LLVMDisposeTemporaryMDNode.
+ * \param Ctx The context in which to construct the temporary node.
+ * \param Data The metadata elements.
+ * \param NumElements Number of metadata elements.
+ */
+LLVMMetadataRef LLVMTemporaryMDNode(LLVMContextRef Ctx, LLVMMetadataRef *Data,
+ size_t NumElements);
+
+/**
+ * Deallocate a temporary node.
+ *
+ * Calls \c replaceAllUsesWith(nullptr) before deleting, so any remaining
+ * references will be reset.
+ * \param TempNode The temporary metadata node.
+ */
+void LLVMDisposeTemporaryMDNode(LLVMMetadataRef TempNode);
+
+/**
+ * Replace all uses of temporary metadata.
+ * \param TempTargetMetadata The temporary metadata node.
+ * \param Replacement The replacement metadata node.
+ */
+void LLVMMetadataReplaceAllUsesWith(LLVMMetadataRef TempTargetMetadata,
+ LLVMMetadataRef Replacement);
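These three calls implement the usual cyclic-construction pattern; a sketch under the assumption that Ctx and the finished node come from the surrounding front end:

#include <llvm-c/DebugInfo.h>
#include <stddef.h>

static void finishCycle(LLVMContextRef Ctx, LLVMMetadataRef FinalNode) {
  /* Hand out Temp as a placeholder operand while FinalNode is being built. */
  LLVMMetadataRef Temp = LLVMTemporaryMDNode(Ctx, NULL, 0);
  /* ... build FinalNode, possibly referencing Temp ... */
  LLVMMetadataReplaceAllUsesWith(Temp, FinalNode);
  /* Temp must not be touched after the replacement. A temporary that is
     abandoned without ever being replaced is released explicitly instead: */
  LLVMMetadataRef Unused = LLVMTemporaryMDNode(Ctx, NULL, 0);
  LLVMDisposeTemporaryMDNode(Unused);
}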
+
+/**
+ * Create a new descriptor for the specified global variable that is temporary
+ * and meant to be RAUWed.
+ * \param Builder The DIBuilder.
+ * \param Scope Variable scope.
+ * \param Name Name of the variable.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param Linkage Mangled name of the variable.
+ * \param LnkLen The length of the C string passed to \c Linkage.
+ * \param File File where this variable is defined.
+ * \param LineNo Line number.
+ * \param Ty Variable Type.
+ * \param LocalToUnit Boolean flag indicating whether this variable is
+ *                    externally visible or not.
+ * \param Decl Reference to the corresponding declaration.
+ * \param AlignInBits Variable alignment (or 0 if no alignment attribute was
+ *                    specified).
+ */
+LLVMMetadataRef
+LLVMDIBuilderCreateTempGlobalVariableFwdDecl(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ const char *Name, size_t NameLen,
+ const char *Linkage, size_t LnkLen,
+ LLVMMetadataRef File,
+ unsigned LineNo,
+ LLVMMetadataRef Ty,
+ LLVMBool LocalToUnit,
+ LLVMMetadataRef Decl,
+ uint32_t AlignInBits);
+
+/**
+ * Insert a new llvm.dbg.declare intrinsic call before the given instruction.
+ * \param Builder The DIBuilder.
+ * \param Storage The storage of the variable to declare.
+ * \param VarInfo The variable's debug info descriptor.
+ * \param Expr A complex location expression for the variable.
+ * \param DebugLoc Debug info location.
+ * \param Instr Instruction acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDeclareBefore(
+ LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+
+/**
+ * Insert a new llvm.dbg.declare intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder The DIBuilder.
+ * \param Storage The storage of the variable to declare.
+ * \param VarInfo The variable's debug info descriptor.
+ * \param Expr A complex location expression for the variable.
+ * \param DebugLoc Debug info location.
+ * \param Block Basic block acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
+ LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+
+/**
+ * Insert a new llvm.dbg.value intrinsic call before the given instruction.
+ * \param Builder The DIBuilder.
+ * \param Val The value of the variable.
+ * \param VarInfo The variable's debug info descriptor.
+ * \param Expr A complex location expression for the variable.
+ * \param DebugLoc Debug info location.
+ * \param Instr Instruction acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
+ LLVMValueRef Val,
+ LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr,
+ LLVMMetadataRef DebugLoc,
+ LLVMValueRef Instr);
+
+/**
+ * Insert a new llvm.dbg.value intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder The DIBuilder.
+ * \param Val The value of the variable.
+ * \param VarInfo The variable's debug info descriptor.
+ * \param Expr A complex location expression for the variable.
+ * \param DebugLoc Debug info location.
+ * \param Block Basic block acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(LLVMDIBuilderRef Builder,
+ LLVMValueRef Val,
+ LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr,
+ LLVMMetadataRef DebugLoc,
+ LLVMBasicBlockRef Block);
+
+/**
+ * Create a new descriptor for a local auto variable.
+ * \param Builder The DIBuilder.
+ * \param Scope The local scope the variable is declared in.
+ * \param Name Variable name.
+ * \param NameLen Length of variable name.
+ * \param File File where this variable is defined.
+ * \param LineNo Line number.
+ * \param Ty Metadata describing the type of the variable.
+ * \param AlwaysPreserve If true, this descriptor will survive optimizations.
+ * \param Flags Flags.
+ * \param AlignInBits Variable alignment.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateAutoVariable(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNo, LLVMMetadataRef Ty,
+ LLVMBool AlwaysPreserve, LLVMDIFlags Flags, uint32_t AlignInBits);
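A sketch tying this to the declare intrinsic above: describe a local variable, then attach a dbg.declare for its alloca at the end of the entry block. FnScope, File, IntTy, Alloca, DbgLoc and Entry are assumed to come from the surrounding code generator, and LLVMDIFlagZero is the zero value of the flags enum.

#include <llvm-c/DebugInfo.h>
#include <string.h>

static void declareLocal(LLVMDIBuilderRef DIB, LLVMMetadataRef FnScope,
                         LLVMMetadataRef File, LLVMMetadataRef IntTy,
                         LLVMValueRef Alloca, LLVMMetadataRef DbgLoc,
                         LLVMBasicBlockRef Entry) {
  const char *Name = "tmp";
  LLVMMetadataRef Var = LLVMDIBuilderCreateAutoVariable(
      DIB, FnScope, Name, strlen(Name), File, /*LineNo=*/21, IntTy,
      /*AlwaysPreserve=*/1, LLVMDIFlagZero, /*AlignInBits=*/0);
  LLVMMetadataRef Expr = LLVMDIBuilderCreateExpression(DIB, NULL, 0);
  LLVMDIBuilderInsertDeclareAtEnd(DIB, Alloca, Var, Expr, DbgLoc, Entry);
}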
+
+/**
+ * Create a new descriptor for a function parameter variable.
+ * \param Builder The DIBuilder.
+ * \param Scope The local scope the variable is declared in.
+ * \param Name Variable name.
+ * \param NameLen Length of variable name.
+ * \param ArgNo Unique argument number for this variable; starts at 1.
+ * \param File File where this variable is defined.
+ * \param LineNo Line number.
+ * \param Ty Metadata describing the type of the variable.
+ * \param AlwaysPreserve If true, this descriptor will survive optimizations.
+ * \param Flags Flags.
+ */
+LLVMMetadataRef LLVMDIBuilderCreateParameterVariable(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, unsigned ArgNo, LLVMMetadataRef File, unsigned LineNo,
+ LLVMMetadataRef Ty, LLVMBool AlwaysPreserve, LLVMDIFlags Flags);
+
+/**
+ * Get the metadata of the subprogram attached to a function.
+ *
+ * @see llvm::Function::getSubprogram()
+ */
+LLVMMetadataRef LLVMGetSubprogram(LLVMValueRef Func);
+
+/**
+ * Set the subprogram attached to a function.
+ *
+ * @see llvm::Function::setSubprogram()
+ */
+void LLVMSetSubprogram(LLVMValueRef Func, LLVMMetadataRef SP);
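A small sketch: carry the subprogram attachment over from one function to another, for example after cloning; both values are assumed to be functions in the same module.

#include <llvm-c/DebugInfo.h>

static void copySubprogram(LLVMValueRef From, LLVMValueRef To) {
  LLVMMetadataRef SP = LLVMGetSubprogram(From);
  if (SP)
    LLVMSetSubprogram(To, SP);
}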
+
#ifdef __cplusplus
} /* end extern "C" */
#endif
diff --git a/contrib/llvm/include/llvm-c/Disassembler.h b/contrib/llvm/include/llvm-c/Disassembler.h
index d6f92e505d3c..5e80b95848cf 100644
--- a/contrib/llvm/include/llvm-c/Disassembler.h
+++ b/contrib/llvm/include/llvm-c/Disassembler.h
@@ -15,12 +15,7 @@
#ifndef LLVM_C_DISASSEMBLER_H
#define LLVM_C_DISASSEMBLER_H
-#include "llvm/Support/DataTypes.h"
-#ifdef __cplusplus
-#include <cstddef>
-#else
-#include <stddef.h>
-#endif
+#include "llvm-c/DisassemblerTypes.h"
/**
* @defgroup LLVMCDisassembler Disassembler
@@ -29,146 +24,6 @@
* @{
*/
-/**
- * An opaque reference to a disassembler context.
- */
-typedef void *LLVMDisasmContextRef;
-
-/**
- * The type for the operand information call back function. This is called to
- * get the symbolic information for an operand of an instruction. Typically
- * this is from the relocation information, symbol table, etc. That block of
- * information is saved when the disassembler context is created and passed to
- * the call back in the DisInfo parameter. The instruction containing operand
- * is at the PC parameter. For some instruction sets, there can be more than
- * one operand with symbolic information. To determine the symbolic operand
- * information for each operand, the bytes for the specific operand in the
- * instruction are specified by the Offset parameter and its byte widith is the
- * size parameter. For instructions sets with fixed widths and one symbolic
- * operand per instruction, the Offset parameter will be zero and Size parameter
- * will be the instruction width. The information is returned in TagBuf and is
- * Triple specific with its specific information defined by the value of
- * TagType for that Triple. If symbolic information is returned the function
- * returns 1, otherwise it returns 0.
- */
-typedef int (*LLVMOpInfoCallback)(void *DisInfo, uint64_t PC,
- uint64_t Offset, uint64_t Size,
- int TagType, void *TagBuf);
-
-/**
- * The initial support in LLVM MC for the most general form of a relocatable
- * expression is "AddSymbol - SubtractSymbol + Offset". For some Darwin targets
- * this full form is encoded in the relocation information so that AddSymbol and
- * SubtractSymbol can be link edited independent of each other. Many other
- * platforms only allow a relocatable expression of the form AddSymbol + Offset
- * to be encoded.
- *
- * The LLVMOpInfoCallback() for the TagType value of 1 uses the struct
- * LLVMOpInfo1. The value of the relocatable expression for the operand,
- * including any PC adjustment, is passed in to the call back in the Value
- * field. The symbolic information about the operand is returned using all
- * the fields of the structure with the Offset of the relocatable expression
- * returned in the Value field. It is possible that some symbols in the
- * relocatable expression were assembly temporary symbols, for example
- * "Ldata - LpicBase + constant", and only the Values of the symbols without
- * symbol names are present in the relocation information. The VariantKind
- * type is one of the Target specific #defines below and is used to print
- * operands like "_foo@GOT", ":lower16:_foo", etc.
- */
-struct LLVMOpInfoSymbol1 {
- uint64_t Present; /* 1 if this symbol is present */
- const char *Name; /* symbol name if not NULL */
- uint64_t Value; /* symbol value if name is NULL */
-};
-
-struct LLVMOpInfo1 {
- struct LLVMOpInfoSymbol1 AddSymbol;
- struct LLVMOpInfoSymbol1 SubtractSymbol;
- uint64_t Value;
- uint64_t VariantKind;
-};
-
-/**
- * The operand VariantKinds for symbolic disassembly.
- */
-#define LLVMDisassembler_VariantKind_None 0 /* all targets */
-
-/**
- * The ARM target VariantKinds.
- */
-#define LLVMDisassembler_VariantKind_ARM_HI16 1 /* :upper16: */
-#define LLVMDisassembler_VariantKind_ARM_LO16 2 /* :lower16: */
-
-/**
- * The ARM64 target VariantKinds.
- */
-#define LLVMDisassembler_VariantKind_ARM64_PAGE 1 /* @page */
-#define LLVMDisassembler_VariantKind_ARM64_PAGEOFF 2 /* @pageoff */
-#define LLVMDisassembler_VariantKind_ARM64_GOTPAGE 3 /* @gotpage */
-#define LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF 4 /* @gotpageoff */
-#define LLVMDisassembler_VariantKind_ARM64_TLVP 5 /* @tvlppage */
-#define LLVMDisassembler_VariantKind_ARM64_TLVOFF 6 /* @tvlppageoff */
-
-/**
- * The type for the symbol lookup function. This may be called by the
- * disassembler for things like adding a comment for a PC plus a constant
- * offset load instruction to use a symbol name instead of a load address value.
- * It is passed the block information is saved when the disassembler context is
- * created and the ReferenceValue to look up as a symbol. If no symbol is found
- * for the ReferenceValue NULL is returned. The ReferenceType of the
- * instruction is passed indirectly as is the PC of the instruction in
- * ReferencePC. If the output reference can be determined its type is returned
- * indirectly in ReferenceType along with ReferenceName if any, or that is set
- * to NULL.
- */
-typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
- uint64_t ReferenceValue,
- uint64_t *ReferenceType,
- uint64_t ReferencePC,
- const char **ReferenceName);
-/**
- * The reference types on input and output.
- */
-/* No input reference type or no output reference type. */
-#define LLVMDisassembler_ReferenceType_InOut_None 0
-
-/* The input reference is from a branch instruction. */
-#define LLVMDisassembler_ReferenceType_In_Branch 1
-/* The input reference is from a PC relative load instruction. */
-#define LLVMDisassembler_ReferenceType_In_PCrel_Load 2
-
-/* The input reference is from an ARM64::ADRP instruction. */
-#define LLVMDisassembler_ReferenceType_In_ARM64_ADRP 0x100000001
-/* The input reference is from an ARM64::ADDXri instruction. */
-#define LLVMDisassembler_ReferenceType_In_ARM64_ADDXri 0x100000002
-/* The input reference is from an ARM64::LDRXui instruction. */
-#define LLVMDisassembler_ReferenceType_In_ARM64_LDRXui 0x100000003
-/* The input reference is from an ARM64::LDRXl instruction. */
-#define LLVMDisassembler_ReferenceType_In_ARM64_LDRXl 0x100000004
-/* The input reference is from an ARM64::ADR instruction. */
-#define LLVMDisassembler_ReferenceType_In_ARM64_ADR 0x100000005
-
-/* The output reference is to as symbol stub. */
-#define LLVMDisassembler_ReferenceType_Out_SymbolStub 1
-/* The output reference is to a symbol address in a literal pool. */
-#define LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr 2
-/* The output reference is to a cstring address in a literal pool. */
-#define LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr 3
-
-/* The output reference is to a Objective-C CoreFoundation string. */
-#define LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref 4
-/* The output reference is to a Objective-C message. */
-#define LLVMDisassembler_ReferenceType_Out_Objc_Message 5
-/* The output reference is to a Objective-C message ref. */
-#define LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref 6
-/* The output reference is to a Objective-C selector ref. */
-#define LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref 7
-/* The output reference is to a Objective-C class ref. */
-#define LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref 8
-
-/* The output reference is to a C++ symbol name. */
-#define LLVMDisassembler_ReferenceType_DeMangled_Name 9
-
#ifdef __cplusplus
extern "C" {
#endif /* !defined(__cplusplus) */
diff --git a/contrib/llvm/include/llvm-c/DisassemblerTypes.h b/contrib/llvm/include/llvm-c/DisassemblerTypes.h
new file mode 100644
index 000000000000..e8754ac77055
--- /dev/null
+++ b/contrib/llvm/include/llvm-c/DisassemblerTypes.h
@@ -0,0 +1,160 @@
+/*===-- llvm-c/DisassemblerTypes.h --------------------------------*- C -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_DISASSEMBLER_TYPES_H
+#define LLVM_DISASSEMBLER_TYPES_H
+
+#include "llvm-c/DataTypes.h"
+#ifdef __cplusplus
+#include <cstddef>
+#else
+#include <stddef.h>
+#endif
+
+/**
+ * An opaque reference to a disassembler context.
+ */
+typedef void *LLVMDisasmContextRef;
+
+/**
+ * The type for the operand information call back function. This is called to
+ * get the symbolic information for an operand of an instruction. Typically
+ * this is from the relocation information, symbol table, etc. That block of
+ * information is saved when the disassembler context is created and passed to
+ * the call back in the DisInfo parameter. The instruction containing operand
+ * is at the PC parameter. For some instruction sets, there can be more than
+ * one operand with symbolic information. To determine the symbolic operand
+ * information for each operand, the bytes for the specific operand in the
+ * instruction are specified by the Offset parameter and its byte widith is the
+ * size parameter. For instructions sets with fixed widths and one symbolic
+ * operand per instruction, the Offset parameter will be zero and Size parameter
+ * will be the instruction width. The information is returned in TagBuf and is
+ * Triple specific with its specific information defined by the value of
+ * TagType for that Triple. If symbolic information is returned the function
+ * returns 1, otherwise it returns 0.
+ */
+typedef int (*LLVMOpInfoCallback)(void *DisInfo, uint64_t PC,
+ uint64_t Offset, uint64_t Size,
+ int TagType, void *TagBuf);
+
+/**
+ * The initial support in LLVM MC for the most general form of a relocatable
+ * expression is "AddSymbol - SubtractSymbol + Offset". For some Darwin targets
+ * this full form is encoded in the relocation information so that AddSymbol and
+ * SubtractSymbol can be link edited independent of each other. Many other
+ * platforms only allow a relocatable expression of the form AddSymbol + Offset
+ * to be encoded.
+ *
+ * The LLVMOpInfoCallback() for the TagType value of 1 uses the struct
+ * LLVMOpInfo1. The value of the relocatable expression for the operand,
+ * including any PC adjustment, is passed in to the call back in the Value
+ * field. The symbolic information about the operand is returned using all
+ * the fields of the structure with the Offset of the relocatable expression
+ * returned in the Value field. It is possible that some symbols in the
+ * relocatable expression were assembly temporary symbols, for example
+ * "Ldata - LpicBase + constant", and only the Values of the symbols without
+ * symbol names are present in the relocation information. The VariantKind
+ * type is one of the Target specific #defines below and is used to print
+ * operands like "_foo@GOT", ":lower16:_foo", etc.
+ */
+struct LLVMOpInfoSymbol1 {
+ uint64_t Present; /* 1 if this symbol is present */
+ const char *Name; /* symbol name if not NULL */
+ uint64_t Value; /* symbol value if name is NULL */
+};
+
+struct LLVMOpInfo1 {
+ struct LLVMOpInfoSymbol1 AddSymbol;
+ struct LLVMOpInfoSymbol1 SubtractSymbol;
+ uint64_t Value;
+ uint64_t VariantKind;
+};
+
+/**
+ * The operand VariantKinds for symbolic disassembly.
+ */
+#define LLVMDisassembler_VariantKind_None 0 /* all targets */
+
+/**
+ * The ARM target VariantKinds.
+ */
+#define LLVMDisassembler_VariantKind_ARM_HI16 1 /* :upper16: */
+#define LLVMDisassembler_VariantKind_ARM_LO16 2 /* :lower16: */
+
+/**
+ * The ARM64 target VariantKinds.
+ */
+#define LLVMDisassembler_VariantKind_ARM64_PAGE 1 /* @page */
+#define LLVMDisassembler_VariantKind_ARM64_PAGEOFF 2 /* @pageoff */
+#define LLVMDisassembler_VariantKind_ARM64_GOTPAGE 3 /* @gotpage */
+#define LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF 4 /* @gotpageoff */
+#define LLVMDisassembler_VariantKind_ARM64_TLVP 5 /* @tvlppage */
+#define LLVMDisassembler_VariantKind_ARM64_TLVOFF 6 /* @tvlppageoff */
+
+/**
+ * The type for the symbol lookup function. This may be called by the
+ * disassembler for things like adding a comment for a PC plus a constant
+ * offset load instruction to use a symbol name instead of a load address value.
+ * It is passed the block of information that was saved when the disassembler
+ * context was created and the ReferenceValue to look up as a symbol. If no
+ * symbol is found for the ReferenceValue, NULL is returned. The ReferenceType
+ * of the
+ * instruction is passed indirectly as is the PC of the instruction in
+ * ReferencePC. If the output reference can be determined its type is returned
+ * indirectly in ReferenceType along with ReferenceName if any, or that is set
+ * to NULL.
+ */
+typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
+ uint64_t ReferenceValue,
+ uint64_t *ReferenceType,
+ uint64_t ReferencePC,
+ const char **ReferenceName);
+/**
+ * The reference types on input and output.
+ */
+/* No input reference type or no output reference type. */
+#define LLVMDisassembler_ReferenceType_InOut_None 0
+
+/* The input reference is from a branch instruction. */
+#define LLVMDisassembler_ReferenceType_In_Branch 1
+/* The input reference is from a PC relative load instruction. */
+#define LLVMDisassembler_ReferenceType_In_PCrel_Load 2
+
+/* The input reference is from an ARM64::ADRP instruction. */
+#define LLVMDisassembler_ReferenceType_In_ARM64_ADRP 0x100000001
+/* The input reference is from an ARM64::ADDXri instruction. */
+#define LLVMDisassembler_ReferenceType_In_ARM64_ADDXri 0x100000002
+/* The input reference is from an ARM64::LDRXui instruction. */
+#define LLVMDisassembler_ReferenceType_In_ARM64_LDRXui 0x100000003
+/* The input reference is from an ARM64::LDRXl instruction. */
+#define LLVMDisassembler_ReferenceType_In_ARM64_LDRXl 0x100000004
+/* The input reference is from an ARM64::ADR instruction. */
+#define LLVMDisassembler_ReferenceType_In_ARM64_ADR 0x100000005
+
+/* The output reference is to a symbol stub. */
+#define LLVMDisassembler_ReferenceType_Out_SymbolStub 1
+/* The output reference is to a symbol address in a literal pool. */
+#define LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr 2
+/* The output reference is to a cstring address in a literal pool. */
+#define LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr 3
+
+/* The output reference is to an Objective-C CoreFoundation string. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref 4
+/* The output reference is to an Objective-C message. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Message 5
+/* The output reference is to an Objective-C message ref. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref 6
+/* The output reference is to an Objective-C selector ref. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref 7
+/* The output reference is to an Objective-C class ref. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref 8
+
+/* The output reference is to a C++ symbol name. */
+#define LLVMDisassembler_ReferenceType_DeMangled_Name 9
+
+#endif
diff --git a/contrib/llvm/include/llvm-c/ExecutionEngine.h b/contrib/llvm/include/llvm-c/ExecutionEngine.h
index 51830fe139c6..49ae6fee45f0 100644
--- a/contrib/llvm/include/llvm-c/ExecutionEngine.h
+++ b/contrib/llvm/include/llvm-c/ExecutionEngine.h
@@ -182,6 +182,13 @@ LLVMMCJITMemoryManagerRef LLVMCreateSimpleMCJITMemoryManager(
void LLVMDisposeMCJITMemoryManager(LLVMMCJITMemoryManagerRef MM);
+/*===-- JIT Event Listener functions -------------------------------------===*/
+
+LLVMJITEventListenerRef LLVMCreateGDBRegistrationListener(void);
+LLVMJITEventListenerRef LLVMCreateIntelJITEventListener(void);
+LLVMJITEventListenerRef LLVMCreateOprofileJITEventListener(void);
+LLVMJITEventListenerRef LLVMCreatePerfJITEventListener(void);
+
/**
* @}
*/
diff --git a/contrib/llvm/include/llvm-c/Initialization.h b/contrib/llvm/include/llvm-c/Initialization.h
index 90c8396f7ad3..e45eafb139f2 100644
--- a/contrib/llvm/include/llvm-c/Initialization.h
+++ b/contrib/llvm/include/llvm-c/Initialization.h
@@ -37,6 +37,7 @@ void LLVMInitializeScalarOpts(LLVMPassRegistryRef R);
void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R);
void LLVMInitializeVectorization(LLVMPassRegistryRef R);
void LLVMInitializeInstCombine(LLVMPassRegistryRef R);
+void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R);
void LLVMInitializeIPO(LLVMPassRegistryRef R);
void LLVMInitializeInstrumentation(LLVMPassRegistryRef R);
void LLVMInitializeAnalysis(LLVMPassRegistryRef R);
diff --git a/contrib/llvm/include/llvm-c/OrcBindings.h b/contrib/llvm/include/llvm-c/OrcBindings.h
index abb3ac6a7f03..9497f0d40776 100644
--- a/contrib/llvm/include/llvm-c/OrcBindings.h
+++ b/contrib/llvm/include/llvm-c/OrcBindings.h
@@ -29,9 +29,8 @@
extern "C" {
#endif
-typedef struct LLVMOpaqueSharedModule *LLVMSharedModuleRef;
typedef struct LLVMOrcOpaqueJITStack *LLVMOrcJITStackRef;
-typedef uint32_t LLVMOrcModuleHandle;
+typedef uint64_t LLVMOrcModuleHandle;
typedef uint64_t LLVMOrcTargetAddress;
typedef uint64_t (*LLVMOrcSymbolResolverFn)(const char *Name, void *LookupCtx);
typedef uint64_t (*LLVMOrcLazyCompileCallbackFn)(LLVMOrcJITStackRef JITStack,
@@ -40,33 +39,6 @@ typedef uint64_t (*LLVMOrcLazyCompileCallbackFn)(LLVMOrcJITStackRef JITStack,
typedef enum { LLVMOrcErrSuccess = 0, LLVMOrcErrGeneric } LLVMOrcErrorCode;
/**
- * Turn an LLVMModuleRef into an LLVMSharedModuleRef.
- *
- * The JIT uses shared ownership for LLVM modules, since it is generally
- * difficult to know when the JIT will be finished with a module (and the JIT
- * has no way of knowing when a user may be finished with one).
- *
- * Calling this method with an LLVMModuleRef creates a shared-pointer to the
- * module, and returns a reference to this shared pointer.
- *
- * The shared module should be disposed when finished with by calling
- * LLVMOrcDisposeSharedModule (not LLVMDisposeModule). The Module will be
- * deleted when the last shared pointer owner relinquishes it.
- */
-
-LLVMSharedModuleRef LLVMOrcMakeSharedModule(LLVMModuleRef Mod);
-
-/**
- * Dispose of a shared module.
- *
- * The module should not be accessed after this call. The module will be
- * deleted once all clients (including the JIT itself) have released their
- * shared pointers.
- */
-
-void LLVMOrcDisposeSharedModuleRef(LLVMSharedModuleRef SharedMod);
-
-/**
* Create an ORC JIT stack.
*
* The client owns the resulting stack, and must call OrcDisposeInstance(...)
@@ -125,8 +97,7 @@ LLVMOrcErrorCode LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack,
*/
LLVMOrcErrorCode
LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack,
- LLVMOrcModuleHandle *RetHandle,
- LLVMSharedModuleRef Mod,
+ LLVMOrcModuleHandle *RetHandle, LLVMModuleRef Mod,
LLVMOrcSymbolResolverFn SymbolResolver,
void *SymbolResolverCtx);
@@ -135,8 +106,7 @@ LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack,
*/
LLVMOrcErrorCode
LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack,
- LLVMOrcModuleHandle *RetHandle,
- LLVMSharedModuleRef Mod,
+ LLVMOrcModuleHandle *RetHandle, LLVMModuleRef Mod,
LLVMOrcSymbolResolverFn SymbolResolver,
void *SymbolResolverCtx);
@@ -171,10 +141,33 @@ LLVMOrcErrorCode LLVMOrcGetSymbolAddress(LLVMOrcJITStackRef JITStack,
const char *SymbolName);
/**
+ * Get symbol address from JIT instance, searching only the specified
+ * handle.
+ */
+LLVMOrcErrorCode LLVMOrcGetSymbolAddressIn(LLVMOrcJITStackRef JITStack,
+ LLVMOrcTargetAddress *RetAddr,
+ LLVMOrcModuleHandle H,
+ const char *SymbolName);
+
+/**
* Dispose of an ORC JIT stack.
*/
LLVMOrcErrorCode LLVMOrcDisposeInstance(LLVMOrcJITStackRef JITStack);
+/**
+ * Register a JIT Event Listener.
+ *
+ * A NULL listener is ignored.
+ */
+void LLVMOrcRegisterJITEventListener(LLVMOrcJITStackRef JITStack, LLVMJITEventListenerRef L);
+
+/**
+ * Unregister a JIT Event Listener.
+ *
+ * A NULL listener is ignored.
+ */
+void LLVMOrcUnregisterJITEventListener(LLVMOrcJITStackRef JITStack, LLVMJITEventListenerRef L);
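Combined with the listener constructors added to ExecutionEngine.h above, a sketch of wiring the GDB registration listener into an existing ORC stack:

#include <llvm-c/ExecutionEngine.h>
#include <llvm-c/OrcBindings.h>

static void attachGdbListener(LLVMOrcJITStackRef JITStack) {
  LLVMJITEventListenerRef GDB = LLVMCreateGDBRegistrationListener();
  LLVMOrcRegisterJITEventListener(JITStack, GDB);
  /* ... add modules and run JITed code ... */
  LLVMOrcUnregisterJITEventListener(JITStack, GDB);
}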
+
#ifdef __cplusplus
}
#endif /* extern "C" */
diff --git a/contrib/llvm/include/llvm-c/Support.h b/contrib/llvm/include/llvm-c/Support.h
index 6de184ccab49..37d5d72ff5dc 100644
--- a/contrib/llvm/include/llvm-c/Support.h
+++ b/contrib/llvm/include/llvm-c/Support.h
@@ -14,8 +14,8 @@
#ifndef LLVM_C_SUPPORT_H
#define LLVM_C_SUPPORT_H
+#include "llvm-c/DataTypes.h"
#include "llvm-c/Types.h"
-#include "llvm/Support/DataTypes.h"
#ifdef __cplusplus
extern "C" {
diff --git a/contrib/llvm/include/llvm-c/TargetMachine.h b/contrib/llvm/include/llvm-c/TargetMachine.h
index f4f7f7698c45..7f672b5d10d6 100644
--- a/contrib/llvm/include/llvm-c/TargetMachine.h
+++ b/contrib/llvm/include/llvm-c/TargetMachine.h
@@ -137,6 +137,18 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T, LLVMModuleR
disposed with LLVMDisposeMessage. */
char* LLVMGetDefaultTargetTriple(void);
+/** Normalize a target triple. The result needs to be disposed with
+ LLVMDisposeMessage. */
+char* LLVMNormalizeTargetTriple(const char* triple);
+
+/** Get the host CPU as a string. The result needs to be disposed with
+ LLVMDisposeMessage. */
+char* LLVMGetHostCPUName(void);
+
+/** Get the host CPU's features as a string. The result needs to be disposed
+ with LLVMDisposeMessage. */
+char* LLVMGetHostCPUFeatures(void);
+
/** Adds the target-specific analysis passes to the pass manager. */
void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM);
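A quick sketch of the new host queries alongside the existing triple helpers; every returned string is caller-owned and goes back through LLVMDisposeMessage from Core.h.

#include <llvm-c/Core.h>
#include <llvm-c/TargetMachine.h>
#include <stdio.h>

static void printHostInfo(void) {
  char *Triple = LLVMGetDefaultTargetTriple();
  char *Norm = LLVMNormalizeTargetTriple(Triple);
  char *CPU = LLVMGetHostCPUName();
  char *Features = LLVMGetHostCPUFeatures();
  printf("triple=%s normalized=%s cpu=%s features=%s\n", Triple, Norm, CPU,
         Features);
  LLVMDisposeMessage(Features);
  LLVMDisposeMessage(CPU);
  LLVMDisposeMessage(Norm);
  LLVMDisposeMessage(Triple);
}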
diff --git a/contrib/llvm/include/llvm-c/Transforms/InstCombine.h b/contrib/llvm/include/llvm-c/Transforms/InstCombine.h
new file mode 100644
index 000000000000..e1c1572d53dc
--- /dev/null
+++ b/contrib/llvm/include/llvm-c/Transforms/InstCombine.h
@@ -0,0 +1,43 @@
+/*===-- InstCombine.h - Instruction Combining C Interface -------*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This header declares the C interface to libLLVMInstCombine.a, which *|
+|* combines instructions to form fewer, simple IR instructions. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_TRANSFORMS_INSTCOMBINE_H
+#define LLVM_C_TRANSFORMS_INSTCOMBINE_H
+
+#include "llvm-c/Types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup LLVMCTransformsInstCombine Instruction Combining transformations
+ * @ingroup LLVMCTransforms
+ *
+ * @{
+ */
+
+/** See llvm::createInstructionCombiningPass function. */
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM);
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif /* defined(__cplusplus) */
+
+#endif
+
diff --git a/contrib/llvm/include/llvm-c/Transforms/Scalar.h b/contrib/llvm/include/llvm-c/Transforms/Scalar.h
index 8991e0904849..f55cdce86be9 100644
--- a/contrib/llvm/include/llvm-c/Transforms/Scalar.h
+++ b/contrib/llvm/include/llvm-c/Transforms/Scalar.h
@@ -35,6 +35,9 @@ extern "C" {
/** See llvm::createAggressiveDCEPass function. */
void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM);
+/** See llvm::createAggressiveInstCombinerPass function. */
+void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM);
+
/** See llvm::createBitTrackingDCEPass function. */
void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM);
@@ -86,6 +89,9 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM);
/** See llvm::createLoopUnrollPass function. */
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);
+/** See llvm::createLoopUnrollAndJamPass function. */
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM);
+
/** See llvm::createLoopUnswitchPass function. */
void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM);
@@ -95,12 +101,6 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM);
/** See llvm::createPartiallyInlineLibCallsPass function. */
void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM);
-/** See llvm::createLowerSwitchPass function. */
-void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM);
-
-/** See llvm::createPromoteMemoryToRegisterPass function. */
-void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM);
-
/** See llvm::createReassociatePass function. */
void LLVMAddReassociatePass(LLVMPassManagerRef PM);
diff --git a/contrib/llvm/include/llvm-c/Transforms/Utils.h b/contrib/llvm/include/llvm-c/Transforms/Utils.h
new file mode 100644
index 000000000000..f171f7fbbe3e
--- /dev/null
+++ b/contrib/llvm/include/llvm-c/Transforms/Utils.h
@@ -0,0 +1,50 @@
+/*===-- Utils.h - Transformation Utils Library C Interface ------*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This header declares the C interface to libLLVMTransformUtils.a, which *|
+|* implements various transformation utilities of the LLVM IR. *|
+|* *|
+|* Many exotic languages can interoperate with C code but have a harder time *|
+|* with C++ due to name mangling. So in addition to C, this interface enables *|
+|* tools written in such languages. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_TRANSFORMS_UTILS_H
+#define LLVM_C_TRANSFORMS_UTILS_H
+
+#include "llvm-c/Types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup LLVMCTransformsUtils Transformation Utilities
+ * @ingroup LLVMCTransforms
+ *
+ * @{
+ */
+
+/** See llvm::createLowerSwitchPass function. */
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM);
+
+/** See llvm::createPromoteMemoryToRegisterPass function. */
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM);
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif /* defined(__cplusplus) */
+
+#endif
+
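A sketch of the relocated wrappers in use, together with the instruction-combining pass from the new InstCombine.h header above; the pass manager helpers are the long-standing ones from Core.h and M is an existing module.

#include <llvm-c/Core.h>
#include <llvm-c/Transforms/InstCombine.h>
#include <llvm-c/Transforms/Utils.h>

static void runCleanup(LLVMModuleRef M) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddPromoteMemoryToRegisterPass(PM); /* mem2reg */
  LLVMAddInstructionCombiningPass(PM);    /* instcombine */
  LLVMRunPassManager(PM, M);
  LLVMDisposePassManager(PM);
}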
diff --git a/contrib/llvm/include/llvm-c/Transforms/Vectorize.h b/contrib/llvm/include/llvm-c/Transforms/Vectorize.h
index cf8306aee762..e3f9961acfb1 100644
--- a/contrib/llvm/include/llvm-c/Transforms/Vectorize.h
+++ b/contrib/llvm/include/llvm-c/Transforms/Vectorize.h
@@ -33,9 +33,6 @@ extern "C" {
* @{
*/
-/** DEPRECATED - Use LLVMAddSLPVectorizePass */
-void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
-
/** See llvm::createLoopVectorizePass function. */
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM);
diff --git a/contrib/llvm/include/llvm-c/Types.h b/contrib/llvm/include/llvm-c/Types.h
index d63ea4de933d..4a33542e86cc 100644
--- a/contrib/llvm/include/llvm-c/Types.h
+++ b/contrib/llvm/include/llvm-c/Types.h
@@ -7,14 +7,14 @@
|* *|
|*===----------------------------------------------------------------------===*|
|* *|
-|* This file defines types used by the the C interface to LLVM. *|
+|* This file defines types used by the C interface to LLVM. *|
|* *|
\*===----------------------------------------------------------------------===*/
#ifndef LLVM_C_TYPES_H
#define LLVM_C_TYPES_H
-#include "llvm/Support/DataTypes.h"
+#include "llvm-c/DataTypes.h"
#ifdef __cplusplus
extern "C" {
@@ -135,6 +135,21 @@ typedef struct LLVMOpaqueAttributeRef *LLVMAttributeRef;
typedef struct LLVMOpaqueDiagnosticInfo *LLVMDiagnosticInfoRef;
/**
+ * @see llvm::Comdat
+ */
+typedef struct LLVMComdat *LLVMComdatRef;
+
+/**
+ * @see llvm::Module::ModuleFlagEntry
+ */
+typedef struct LLVMOpaqueModuleFlagEntry LLVMModuleFlagEntry;
+
+/**
+ * @see llvm::JITEventListener
+ */
+typedef struct LLVMOpaqueJITEventListener *LLVMJITEventListenerRef;
+
+/**
* @}
*/
diff --git a/contrib/llvm/include/llvm-c/lto.h b/contrib/llvm/include/llvm-c/lto.h
index 55f3e46c45ed..1acd610f70ac 100644
--- a/contrib/llvm/include/llvm-c/lto.h
+++ b/contrib/llvm/include/llvm-c/lto.h
@@ -44,7 +44,7 @@ typedef bool lto_bool_t;
* @{
*/
-#define LTO_API_VERSION 21
+#define LTO_API_VERSION 22
/**
* \since prior to LTO_API_VERSION=3
@@ -190,7 +190,7 @@ lto_module_create_from_memory_with_path(const void* mem, size_t length,
const char *path);
/**
- * \brief Loads an object file in its own context.
+ * Loads an object file in its own context.
*
* Loads an object file in its own LLVMContext. This function call is
* thread-safe. However, modules created this way should not be merged into an
@@ -205,7 +205,7 @@ lto_module_create_in_local_context(const void *mem, size_t length,
const char *path);
/**
- * \brief Loads an object file in the codegen context.
+ * Loads an object file in the codegen context.
*
* Loads an object file into the same context as \c cg. The module is safe to
* add using \a lto_codegen_add_module().
@@ -345,7 +345,7 @@ extern lto_code_gen_t
lto_codegen_create(void);
/**
- * \brief Instantiate a code generator in its own context.
+ * Instantiate a code generator in its own context.
*
* Instantiates a code generator in its own context. Modules added via \a
* lto_codegen_add_module() must have all been created in the same context,
@@ -539,7 +539,7 @@ lto_codegen_set_should_internalize(lto_code_gen_t cg,
lto_bool_t ShouldInternalize);
/**
- * \brief Set whether to embed uselists in bitcode.
+ * Set whether to embed uselists in bitcode.
*
* Sets whether \a lto_codegen_write_merged_modules() should embed uselists in
* output bitcode. This should be turned on for all -save-temps output.
@@ -784,7 +784,7 @@ extern void thinlto_codegen_set_cache_dir(thinlto_code_gen_t cg,
/**
* Sets the cache pruning interval (in seconds). A negative value disables the
* pruning. An unspecified default value will be applied, and a value of 0 will
- * be ignored.
+ * force pruning to occur.
*
* \since LTO_API_VERSION=18
*/
@@ -793,7 +793,7 @@ extern void thinlto_codegen_set_cache_pruning_interval(thinlto_code_gen_t cg,
/**
* Sets the maximum cache size that can be persistent across build, in terms of
- * percentage of the available space on the the disk. Set to 100 to indicate
+ * percentage of the available space on the disk. Set to 100 to indicate
* no limit, 50 to indicate that the cache size will not be left over half the
* available space. A value over 100 will be reduced to 100, a value of 0 will
* be ignored. An unspecified default value will be applied.
@@ -817,6 +817,28 @@ extern void thinlto_codegen_set_cache_entry_expiration(thinlto_code_gen_t cg,
unsigned expiration);
/**
+ * Sets the maximum size of the cache directory (in bytes). A value over the
+ * amount of available space on the disk will be reduced to the amount of
+ * available space. An unspecified default value will be applied. A value of 0
+ * will be ignored.
+ *
+ * \since LTO_API_VERSION=22
+ */
+extern void thinlto_codegen_set_cache_size_bytes(thinlto_code_gen_t cg,
+ unsigned max_size_bytes);
+
+/**
+ * Sets the maximum number of files in the cache directory. An unspecified
+ * default value will be applied. A value of 0 will be ignored.
+ *
+ * \since LTO_API_VERSION=22
+ */
+extern void thinlto_codegen_set_cache_size_files(thinlto_code_gen_t cg,
+ unsigned max_size_files);
+
+
+
+/**
* @} // endgroup LLVMCTLTO_CACHING
*/
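Putting the new knobs next to the existing cache controls, a sketch that caps a ThinLTO cache directory; cg is an existing thinlto_code_gen_t and the path and limits are arbitrary example values.

#include <llvm-c/lto.h>

static void configureCache(thinlto_code_gen_t cg) {
  thinlto_codegen_set_cache_dir(cg, "/tmp/thinlto-cache");
  thinlto_codegen_set_cache_pruning_interval(cg, 1200); /* at most every 20 min */
  thinlto_codegen_set_cache_size_bytes(cg, 1u << 30);   /* ~1 GiB */
  thinlto_codegen_set_cache_size_files(cg, 10000);
}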
diff --git a/contrib/llvm/include/llvm/ADT/APFloat.h b/contrib/llvm/include/llvm/ADT/APFloat.h
index 6c0b6ae78ae3..5c59af4c04ba 100644
--- a/contrib/llvm/include/llvm/ADT/APFloat.h
+++ b/contrib/llvm/include/llvm/ADT/APFloat.h
@@ -1215,7 +1215,7 @@ inline APFloat abs(APFloat X) {
return X;
}
-/// \brief Returns the negated value of the argument.
+/// Returns the negated value of the argument.
inline APFloat neg(APFloat X) {
X.changeSign();
return X;
diff --git a/contrib/llvm/include/llvm/ADT/APInt.h b/contrib/llvm/include/llvm/ADT/APInt.h
index c81363cc16b7..6bf6b22fb010 100644
--- a/contrib/llvm/include/llvm/ADT/APInt.h
+++ b/contrib/llvm/include/llvm/ADT/APInt.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a class to represent arbitrary precision
+/// This file implements a class to represent arbitrary precision
/// integral constant values and operations on them.
///
//===----------------------------------------------------------------------===//
@@ -40,7 +40,7 @@ inline APInt operator-(APInt);
// APInt Class
//===----------------------------------------------------------------------===//
-/// \brief Class for arbitrary precision integers.
+/// Class for arbitrary precision integers.
///
/// APInt is a functional replacement for common case unsigned integer type like
/// "unsigned", "unsigned long" or "uint64_t", but also allows non-byte-width
@@ -78,6 +78,12 @@ public:
APINT_BITS_PER_WORD = APINT_WORD_SIZE * CHAR_BIT
};
+ enum class Rounding {
+ DOWN,
+ TOWARD_ZERO,
+ UP,
+ };
+
static const WordType WORD_MAX = ~WordType(0);
private:
@@ -94,7 +100,7 @@ private:
friend class APSInt;
- /// \brief Fast internal constructor
+ /// Fast internal constructor
///
/// This constructor is used only internally for speed of construction of
/// temporaries. It is unsafe for general use so it is not public.
@@ -102,19 +108,19 @@ private:
U.pVal = val;
}
- /// \brief Determine if this APInt just has one word to store value.
+ /// Determine if this APInt just has one word to store value.
///
/// \returns true if the number of bits <= 64, false otherwise.
bool isSingleWord() const { return BitWidth <= APINT_BITS_PER_WORD; }
- /// \brief Determine which word a bit is in.
+ /// Determine which word a bit is in.
///
/// \returns the word position for the specified bit position.
static unsigned whichWord(unsigned bitPosition) {
return bitPosition / APINT_BITS_PER_WORD;
}
- /// \brief Determine which bit in a word a bit is in.
+ /// Determine which bit in a word a bit is in.
///
/// \returns the bit position in a word for the specified bit position
/// in the APInt.
@@ -122,7 +128,7 @@ private:
return bitPosition % APINT_BITS_PER_WORD;
}
- /// \brief Get a single bit mask.
+ /// Get a single bit mask.
///
/// \returns a uint64_t with only bit at "whichBit(bitPosition)" set
/// This method generates and returns a uint64_t (word) mask for a single
@@ -132,7 +138,7 @@ private:
return 1ULL << whichBit(bitPosition);
}
- /// \brief Clear unused high order bits
+ /// Clear unused high order bits
///
/// This method is used internally to clear the top "N" bits in the high order
/// word that are not used by the APInt. This is needed after the most
@@ -151,7 +157,7 @@ private:
return *this;
}
- /// \brief Get the word corresponding to a bit position
+ /// Get the word corresponding to a bit position
/// \returns the corresponding word for the specified bit position.
uint64_t getWord(unsigned bitPosition) const {
return isSingleWord() ? U.VAL : U.pVal[whichWord(bitPosition)];
@@ -162,7 +168,7 @@ private:
/// value of any bits upon return. Caller should populate the bits after.
void reallocate(unsigned NewBitWidth);
- /// \brief Convert a char array into an APInt
+ /// Convert a char array into an APInt
///
/// \param radix 2, 8, 10, 16, or 36
/// Converts a string into a number. The string must be non-empty
@@ -176,7 +182,7 @@ private:
/// result to hold the input.
void fromString(unsigned numBits, StringRef str, uint8_t radix);
- /// \brief An internal division function for dividing APInts.
+ /// An internal division function for dividing APInts.
///
/// This is used by the toString method to divide by the radix. It simply
/// provides a more convenient form of divide for internal use since KnuthDiv
@@ -258,7 +264,7 @@ public:
/// \name Constructors
/// @{
- /// \brief Create a new APInt of numBits width, initialized as val.
+ /// Create a new APInt of numBits width, initialized as val.
///
/// If isSigned is true then val is treated as if it were a signed value
/// (i.e. as an int64_t) and the appropriate sign extension to the bit width
@@ -279,7 +285,7 @@ public:
}
}
- /// \brief Construct an APInt of numBits width, initialized as bigVal[].
+ /// Construct an APInt of numBits width, initialized as bigVal[].
///
/// Note that bigVal.size() can be smaller or larger than the corresponding
/// bit width but any extraneous bits will be dropped.
@@ -297,7 +303,7 @@ public:
/// constructor.
APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]);
- /// \brief Construct an APInt from a string representation.
+ /// Construct an APInt from a string representation.
///
/// This constructor interprets the string \p str in the given radix. The
/// interpretation stops when the first character that is not suitable for the
@@ -311,7 +317,7 @@ public:
APInt(unsigned numBits, StringRef str, uint8_t radix);
/// Simply makes *this a copy of that.
- /// @brief Copy Constructor.
+ /// Copy Constructor.
APInt(const APInt &that) : BitWidth(that.BitWidth) {
if (isSingleWord())
U.VAL = that.U.VAL;
@@ -319,26 +325,26 @@ public:
initSlowCase(that);
}
- /// \brief Move Constructor.
+ /// Move Constructor.
APInt(APInt &&that) : BitWidth(that.BitWidth) {
memcpy(&U, &that.U, sizeof(U));
that.BitWidth = 0;
}
- /// \brief Destructor.
+ /// Destructor.
~APInt() {
if (needsCleanup())
delete[] U.pVal;
}
- /// \brief Default constructor that creates an uninteresting APInt
+ /// Default constructor that creates an uninteresting APInt
/// representing a 1-bit zero value.
///
/// This is useful for object deserialization (pair this with the static
/// method Read).
explicit APInt() : BitWidth(1) { U.VAL = 0; }
- /// \brief Returns whether this instance allocated memory.
+ /// Returns whether this instance allocated memory.
bool needsCleanup() const { return !isSingleWord(); }
/// Used to insert APInt objects, or objects that contain APInt objects, into
@@ -349,33 +355,33 @@ public:
/// \name Value Tests
/// @{
- /// \brief Determine sign of this APInt.
+ /// Determine sign of this APInt.
///
/// This tests the high bit of this APInt to determine if it is set.
///
/// \returns true if this APInt is negative, false otherwise
bool isNegative() const { return (*this)[BitWidth - 1]; }
- /// \brief Determine if this APInt Value is non-negative (>= 0)
+ /// Determine if this APInt Value is non-negative (>= 0)
///
/// This tests the high bit of the APInt to determine if it is unset.
bool isNonNegative() const { return !isNegative(); }
- /// \brief Determine if sign bit of this APInt is set.
+ /// Determine if sign bit of this APInt is set.
///
/// This tests the high bit of this APInt to determine if it is set.
///
/// \returns true if this APInt has its sign bit set, false otherwise.
bool isSignBitSet() const { return (*this)[BitWidth-1]; }
- /// \brief Determine if sign bit of this APInt is clear.
+ /// Determine if sign bit of this APInt is clear.
///
/// This tests the high bit of this APInt to determine if it is clear.
///
/// \returns true if this APInt has its sign bit clear, false otherwise.
bool isSignBitClear() const { return !isSignBitSet(); }
- /// \brief Determine if this APInt Value is positive.
+ /// Determine if this APInt Value is positive.
///
/// This tests if the value of this APInt is positive (> 0). Note
/// that 0 is not a positive value.
@@ -383,7 +389,7 @@ public:
/// \returns true if this APInt is positive.
bool isStrictlyPositive() const { return isNonNegative() && !isNullValue(); }
- /// \brief Determine if all bits are set
+ /// Determine if all bits are set
///
/// This checks to see if the value has all bits of the APInt are set or not.
bool isAllOnesValue() const {
@@ -392,13 +398,13 @@ public:
return countTrailingOnesSlowCase() == BitWidth;
}
- /// \brief Determine if all bits are clear
+ /// Determine if all bits are clear
///
/// This checks to see if the value has all bits of the APInt are clear or
/// not.
bool isNullValue() const { return !*this; }
- /// \brief Determine if this is a value of 1.
+ /// Determine if this is a value of 1.
///
/// This checks to see if the value of this APInt is one.
bool isOneValue() const {
@@ -407,13 +413,13 @@ public:
return countLeadingZerosSlowCase() == BitWidth - 1;
}
- /// \brief Determine if this is the largest unsigned value.
+ /// Determine if this is the largest unsigned value.
///
/// This checks to see if the value of this APInt is the maximum unsigned
/// value for the APInt's bit width.
bool isMaxValue() const { return isAllOnesValue(); }
- /// \brief Determine if this is the largest signed value.
+ /// Determine if this is the largest signed value.
///
/// This checks to see if the value of this APInt is the maximum signed
/// value for the APInt's bit width.
@@ -423,13 +429,13 @@ public:
return !isNegative() && countTrailingOnesSlowCase() == BitWidth - 1;
}
- /// \brief Determine if this is the smallest unsigned value.
+ /// Determine if this is the smallest unsigned value.
///
/// This checks to see if the value of this APInt is the minimum unsigned
/// value for the APInt's bit width.
bool isMinValue() const { return isNullValue(); }
- /// \brief Determine if this is the smallest signed value.
+ /// Determine if this is the smallest signed value.
///
/// This checks to see if the value of this APInt is the minimum signed
/// value for the APInt's bit width.
@@ -439,19 +445,19 @@ public:
return isNegative() && countTrailingZerosSlowCase() == BitWidth - 1;
}
- /// \brief Check if this APInt has an N-bits unsigned integer value.
+ /// Check if this APInt has an N-bits unsigned integer value.
bool isIntN(unsigned N) const {
assert(N && "N == 0 ???");
return getActiveBits() <= N;
}
- /// \brief Check if this APInt has an N-bits signed integer value.
+ /// Check if this APInt has an N-bits signed integer value.
bool isSignedIntN(unsigned N) const {
assert(N && "N == 0 ???");
return getMinSignedBits() <= N;
}
- /// \brief Check if this APInt's value is a power of two greater than zero.
+ /// Check if this APInt's value is a power of two greater than zero.
///
/// \returns true if the argument APInt value is a power of two > 0.
bool isPowerOf2() const {
@@ -460,12 +466,12 @@ public:
return countPopulationSlowCase() == 1;
}
- /// \brief Check if the APInt's value is returned by getSignMask.
+ /// Check if the APInt's value is returned by getSignMask.
///
/// \returns true if this is the value returned by getSignMask.
bool isSignMask() const { return isMinSignedValue(); }
- /// \brief Convert APInt to a boolean value.
+ /// Convert APInt to a boolean value.
///
/// This converts the APInt to a boolean value as a test against zero.
bool getBoolValue() const { return !!*this; }
@@ -476,7 +482,7 @@ public:
return ugt(Limit) ? Limit : getZExtValue();
}
- /// \brief Check if the APInt consists of a repeated bit pattern.
+ /// Check if the APInt consists of a repeated bit pattern.
///
/// e.g. 0x01010101 satisfies isSplat(8).
/// \param SplatSizeInBits The size of the pattern in bits. Must divide bit
@@ -505,7 +511,7 @@ public:
return (Ones > 0) && ((Ones + countLeadingZerosSlowCase()) == BitWidth);
}
- /// \brief Return true if this APInt value contains a sequence of ones with
+ /// Return true if this APInt value contains a sequence of ones with
/// the remainder zero.
bool isShiftedMask() const {
if (isSingleWord())
@@ -519,29 +525,29 @@ public:
/// \name Value Generators
/// @{
- /// \brief Gets maximum unsigned value of APInt for specific bit width.
+ /// Gets maximum unsigned value of APInt for specific bit width.
static APInt getMaxValue(unsigned numBits) {
return getAllOnesValue(numBits);
}
- /// \brief Gets maximum signed value of APInt for a specific bit width.
+ /// Gets maximum signed value of APInt for a specific bit width.
static APInt getSignedMaxValue(unsigned numBits) {
APInt API = getAllOnesValue(numBits);
API.clearBit(numBits - 1);
return API;
}
- /// \brief Gets minimum unsigned value of APInt for a specific bit width.
+ /// Gets minimum unsigned value of APInt for a specific bit width.
static APInt getMinValue(unsigned numBits) { return APInt(numBits, 0); }
- /// \brief Gets minimum signed value of APInt for a specific bit width.
+ /// Gets minimum signed value of APInt for a specific bit width.
static APInt getSignedMinValue(unsigned numBits) {
APInt API(numBits, 0);
API.setBit(numBits - 1);
return API;
}
- /// \brief Get the SignMask for a specific bit width.
+ /// Get the SignMask for a specific bit width.
///
/// This is just a wrapper function of getSignedMinValue(), and it helps code
/// readability when we want to get a SignMask.
@@ -549,19 +555,19 @@ public:
return getSignedMinValue(BitWidth);
}
- /// \brief Get the all-ones value.
+ /// Get the all-ones value.
///
/// \returns the all-ones value for an APInt of the specified bit-width.
static APInt getAllOnesValue(unsigned numBits) {
return APInt(numBits, WORD_MAX, true);
}
- /// \brief Get the '0' value.
+ /// Get the '0' value.
///
/// \returns the '0' value for an APInt of the specified bit-width.
static APInt getNullValue(unsigned numBits) { return APInt(numBits, 0); }
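// A minimal sketch of how the static generators above behave, for a 16-bit width:
APInt Max  = APInt::getMaxValue(16);        // 0xFFFF (all bits set)
APInt SMax = APInt::getSignedMaxValue(16);  // 0x7FFF (all bits set, sign bit cleared)
APInt Min  = APInt::getMinValue(16);        // 0x0000
APInt SMin = APInt::getSignedMinValue(16);  // 0x8000 (only the sign bit set)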
- /// \brief Compute an APInt containing numBits highbits from this APInt.
+ /// Compute an APInt containing numBits highbits from this APInt.
///
/// Get an APInt with the same BitWidth as this APInt, just zero mask
/// the low bits and right shift to the least significant bit.
@@ -569,7 +575,7 @@ public:
/// \returns the high "numBits" bits of this APInt.
APInt getHiBits(unsigned numBits) const;
- /// \brief Compute an APInt containing numBits lowbits from this APInt.
+ /// Compute an APInt containing numBits lowbits from this APInt.
///
/// Get an APInt with the same BitWidth as this APInt, just zero mask
/// the high bits.
@@ -577,14 +583,14 @@ public:
/// \returns the low "numBits" bits of this APInt.
APInt getLoBits(unsigned numBits) const;
- /// \brief Return an APInt with exactly one bit set in the result.
+ /// Return an APInt with exactly one bit set in the result.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo) {
APInt Res(numBits, 0);
Res.setBit(BitNo);
return Res;
}
- /// \brief Get a value with a block of bits set.
+ /// Get a value with a block of bits set.
///
/// Constructs an APInt value that has a contiguous range of bits set. The
/// bits from loBit (inclusive) to hiBit (exclusive) will be set. All other
@@ -603,7 +609,7 @@ public:
return Res;
}
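// Sketch of the inclusive/exclusive convention above: bits 4..11 end up set.
APInt Mask = APInt::getBitsSet(32, /*loBit=*/4, /*hiBit=*/12);  // 0x00000FF0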
- /// \brief Get a value with upper bits starting at loBit set.
+ /// Get a value with upper bits starting at loBit set.
///
/// Constructs an APInt value that has a contiguous range of bits set. The
/// bits from loBit (inclusive) to numBits (exclusive) will be set. All other
@@ -620,7 +626,7 @@ public:
return Res;
}
- /// \brief Get a value with high bits set
+ /// Get a value with high bits set
///
/// Constructs an APInt value that has the top hiBitsSet bits set.
///
@@ -632,7 +638,7 @@ public:
return Res;
}
- /// \brief Get a value with low bits set
+ /// Get a value with low bits set
///
/// Constructs an APInt value that has the bottom loBitsSet bits set.
///
@@ -644,10 +650,10 @@ public:
return Res;
}
- /// \brief Return a value containing V broadcasted over NewLen bits.
+ /// Return a value containing V broadcasted over NewLen bits.
static APInt getSplat(unsigned NewLen, const APInt &V);
- /// \brief Determine if two APInts have the same value, after zero-extending
+ /// Determine if two APInts have the same value, after zero-extending
/// one of them (if needed!) to ensure that the bit-widths match.
static bool isSameValue(const APInt &I1, const APInt &I2) {
if (I1.getBitWidth() == I2.getBitWidth())
@@ -659,7 +665,7 @@ public:
return I1.zext(I2.getBitWidth()) == I2;
}
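// Sketch: widths may differ; the narrower operand is zero-extended as above.
APInt A(16, 42);
APInt B(32, 42);
bool Same = APInt::isSameValue(A, B);  // true (operator==, by contrast, expects equal widths)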
- /// \brief Overload to compute a hash_code for an APInt value.
+ /// Overload to compute a hash_code for an APInt value.
friend hash_code hash_value(const APInt &Arg);
/// This function returns a pointer to the internal storage of the APInt.
@@ -675,7 +681,7 @@ public:
/// \name Unary Operators
/// @{
- /// \brief Postfix increment operator.
+ /// Postfix increment operator.
///
/// Increments *this by 1.
///
@@ -686,12 +692,12 @@ public:
return API;
}
- /// \brief Prefix increment operator.
+ /// Prefix increment operator.
///
/// \returns *this incremented by one
APInt &operator++();
- /// \brief Postfix decrement operator.
+ /// Postfix decrement operator.
///
/// Decrements *this by 1.
///
@@ -702,12 +708,12 @@ public:
return API;
}
- /// \brief Prefix decrement operator.
+ /// Prefix decrement operator.
///
/// \returns *this decremented by one.
APInt &operator--();
- /// \brief Logical negation operator.
+ /// Logical negation operator.
///
/// Performs a logical negation operation on this APInt.
///
@@ -722,7 +728,7 @@ public:
/// \name Assignment Operators
/// @{
- /// \brief Copy assignment operator.
+ /// Copy assignment operator.
///
/// \returns *this after assignment of RHS.
APInt &operator=(const APInt &RHS) {
@@ -737,8 +743,13 @@ public:
return *this;
}
- /// @brief Move assignment operator.
+ /// Move assignment operator.
APInt &operator=(APInt &&that) {
+#ifdef _MSC_VER
+ // The MSVC std::shuffle implementation still does self-assignment.
+ if (this == &that)
+ return *this;
+#endif
assert(this != &that && "Self-move not supported");
if (!isSingleWord())
delete[] U.pVal;
@@ -753,7 +764,7 @@ public:
return *this;
}
- /// \brief Assignment operator.
+ /// Assignment operator.
///
/// The RHS value is assigned to *this. If the significant bits in RHS exceed
/// the bit width, the excess bits are truncated. If the bit width is larger
@@ -771,7 +782,7 @@ public:
return *this;
}
- /// \brief Bitwise AND assignment operator.
+ /// Bitwise AND assignment operator.
///
/// Performs a bitwise AND operation on this APInt and RHS. The result is
/// assigned to *this.
@@ -786,7 +797,7 @@ public:
return *this;
}
- /// \brief Bitwise AND assignment operator.
+ /// Bitwise AND assignment operator.
///
/// Performs a bitwise AND operation on this APInt and RHS. RHS is
/// logically zero-extended or truncated to match the bit-width of
@@ -801,7 +812,7 @@ public:
return *this;
}
- /// \brief Bitwise OR assignment operator.
+ /// Bitwise OR assignment operator.
///
/// Performs a bitwise OR operation on this APInt and RHS. The result is
/// assigned to *this.
@@ -816,7 +827,7 @@ public:
return *this;
}
- /// \brief Bitwise OR assignment operator.
+ /// Bitwise OR assignment operator.
///
/// Performs a bitwise OR operation on this APInt and RHS. RHS is
/// logically zero-extended or truncated to match the bit-width of
@@ -831,7 +842,7 @@ public:
return *this;
}
- /// \brief Bitwise XOR assignment operator.
+ /// Bitwise XOR assignment operator.
///
/// Performs a bitwise XOR operation on this APInt and RHS. The result is
/// assigned to *this.
@@ -846,7 +857,7 @@ public:
return *this;
}
- /// \brief Bitwise XOR assignment operator.
+ /// Bitwise XOR assignment operator.
///
/// Performs a bitwise XOR operation on this APInt and RHS. RHS is
/// logically zero-extended or truncated to match the bit-width of
@@ -861,7 +872,7 @@ public:
return *this;
}
- /// \brief Multiplication assignment operator.
+ /// Multiplication assignment operator.
///
/// Multiplies this APInt by RHS and assigns the result to *this.
///
@@ -869,7 +880,7 @@ public:
APInt &operator*=(const APInt &RHS);
APInt &operator*=(uint64_t RHS);
- /// \brief Addition assignment operator.
+ /// Addition assignment operator.
///
/// Adds RHS to *this and assigns the result to *this.
///
@@ -877,7 +888,7 @@ public:
APInt &operator+=(const APInt &RHS);
APInt &operator+=(uint64_t RHS);
- /// \brief Subtraction assignment operator.
+ /// Subtraction assignment operator.
///
/// Subtracts RHS from *this and assigns the result to *this.
///
@@ -885,7 +896,7 @@ public:
APInt &operator-=(const APInt &RHS);
APInt &operator-=(uint64_t RHS);
- /// \brief Left-shift assignment function.
+ /// Left-shift assignment function.
///
/// Shifts *this left by shiftAmt and assigns the result to *this.
///
@@ -903,7 +914,7 @@ public:
return *this;
}
- /// \brief Left-shift assignment function.
+ /// Left-shift assignment function.
///
/// Shifts *this left by shiftAmt and assigns the result to *this.
///
@@ -914,22 +925,22 @@ public:
/// \name Binary Operators
/// @{
- /// \brief Multiplication operator.
+ /// Multiplication operator.
///
/// Multiplies this APInt by RHS and returns the result.
APInt operator*(const APInt &RHS) const;
- /// \brief Left logical shift operator.
+ /// Left logical shift operator.
///
/// Shifts this APInt left by \p Bits and returns the result.
APInt operator<<(unsigned Bits) const { return shl(Bits); }
- /// \brief Left logical shift operator.
+ /// Left logical shift operator.
///
/// Shifts this APInt left by \p Bits and returns the result.
APInt operator<<(const APInt &Bits) const { return shl(Bits); }
- /// \brief Arithmetic right-shift function.
+ /// Arithmetic right-shift function.
///
/// Arithmetic right-shift this APInt by shiftAmt.
APInt ashr(unsigned ShiftAmt) const {
@@ -953,7 +964,7 @@ public:
ashrSlowCase(ShiftAmt);
}
- /// \brief Logical right-shift function.
+ /// Logical right-shift function.
///
/// Logical right-shift this APInt by shiftAmt.
APInt lshr(unsigned shiftAmt) const {
@@ -975,7 +986,7 @@ public:
lshrSlowCase(ShiftAmt);
}
- /// \brief Left-shift function.
+ /// Left-shift function.
///
/// Left-shift this APInt by shiftAmt.
APInt shl(unsigned shiftAmt) const {
@@ -984,13 +995,13 @@ public:
return R;
}
- /// \brief Rotate left by rotateAmt.
+ /// Rotate left by rotateAmt.
APInt rotl(unsigned rotateAmt) const;
- /// \brief Rotate right by rotateAmt.
+ /// Rotate right by rotateAmt.
APInt rotr(unsigned rotateAmt) const;
- /// \brief Arithmetic right-shift function.
+ /// Arithmetic right-shift function.
///
/// Arithmetic right-shift this APInt by shiftAmt.
APInt ashr(const APInt &ShiftAmt) const {
@@ -1002,7 +1013,7 @@ public:
/// Arithmetic right-shift this APInt by shiftAmt in place.
void ashrInPlace(const APInt &shiftAmt);
- /// \brief Logical right-shift function.
+ /// Logical right-shift function.
///
/// Logical right-shift this APInt by shiftAmt.
APInt lshr(const APInt &ShiftAmt) const {
@@ -1014,7 +1025,7 @@ public:
/// Logical right-shift this APInt by ShiftAmt in place.
void lshrInPlace(const APInt &ShiftAmt);
- /// \brief Left-shift function.
+ /// Left-shift function.
///
/// Left-shift this APInt by shiftAmt.
APInt shl(const APInt &ShiftAmt) const {
@@ -1023,28 +1034,31 @@ public:
return R;
}
- /// \brief Rotate left by rotateAmt.
+ /// Rotate left by rotateAmt.
APInt rotl(const APInt &rotateAmt) const;
- /// \brief Rotate right by rotateAmt.
+ /// Rotate right by rotateAmt.
APInt rotr(const APInt &rotateAmt) const;
- /// \brief Unsigned division operation.
+ /// Unsigned division operation.
///
/// Perform an unsigned divide operation on this APInt by RHS. Both this and
/// RHS are treated as unsigned quantities for purposes of this division.
///
- /// \returns a new APInt value containing the division result
+ /// \returns a new APInt value containing the division result, rounded towards
+ /// zero.
APInt udiv(const APInt &RHS) const;
APInt udiv(uint64_t RHS) const;
- /// \brief Signed division function for APInt.
+ /// Signed division function for APInt.
///
/// Signed divide this APInt by APInt RHS.
+ ///
+ /// The result is rounded towards zero.
APInt sdiv(const APInt &RHS) const;
APInt sdiv(int64_t RHS) const;
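// Sketch of the rounding noted above: both udiv and sdiv round towards zero.
APInt N(8, -7, /*isSigned=*/true);        // 0xF9
APInt Q = N.sdiv(2);                      // -3 (0xFD), not -4
APInt R = APInt(8, 7).udiv(APInt(8, 2));  // 3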
- /// \brief Unsigned remainder operation.
+ /// Unsigned remainder operation.
///
/// Perform an unsigned remainder operation on this APInt with RHS being the
/// divisor. Both this and RHS are treated as unsigned quantities for purposes
@@ -1056,13 +1070,13 @@ public:
APInt urem(const APInt &RHS) const;
uint64_t urem(uint64_t RHS) const;
- /// \brief Function for signed remainder operation.
+ /// Function for signed remainder operation.
///
/// Signed remainder operation on APInt.
APInt srem(const APInt &RHS) const;
int64_t srem(int64_t RHS) const;
- /// \brief Dual division/remainder interface.
+ /// Dual division/remainder interface.
///
/// Sometimes it is convenient to divide two APInt values and obtain both the
/// quotient and remainder. This function does both operations in the same
@@ -1090,7 +1104,7 @@ public:
APInt sshl_ov(const APInt &Amt, bool &Overflow) const;
APInt ushl_ov(const APInt &Amt, bool &Overflow) const;
- /// \brief Array-indexing support.
+ /// Array-indexing support.
///
/// \returns the bit value at bitPosition
bool operator[](unsigned bitPosition) const {
@@ -1102,7 +1116,7 @@ public:
/// \name Comparison Operators
/// @{
- /// \brief Equality operator.
+ /// Equality operator.
///
/// Compares this APInt with RHS for the validity of the equality
/// relationship.
@@ -1113,7 +1127,7 @@ public:
return EqualSlowCase(RHS);
}
- /// \brief Equality operator.
+ /// Equality operator.
///
/// Compares this APInt with a uint64_t for the validity of the equality
/// relationship.
@@ -1123,7 +1137,7 @@ public:
return (isSingleWord() || getActiveBits() <= 64) && getZExtValue() == Val;
}
- /// \brief Equality comparison.
+ /// Equality comparison.
///
/// Compares this APInt with RHS for the validity of the equality
/// relationship.
@@ -1131,7 +1145,7 @@ public:
/// \returns true if *this == Val
bool eq(const APInt &RHS) const { return (*this) == RHS; }
- /// \brief Inequality operator.
+ /// Inequality operator.
///
/// Compares this APInt with RHS for the validity of the inequality
/// relationship.
@@ -1139,7 +1153,7 @@ public:
/// \returns true if *this != Val
bool operator!=(const APInt &RHS) const { return !((*this) == RHS); }
- /// \brief Inequality operator.
+ /// Inequality operator.
///
/// Compares this APInt with a uint64_t for the validity of the inequality
/// relationship.
@@ -1147,7 +1161,7 @@ public:
/// \returns true if *this != Val
bool operator!=(uint64_t Val) const { return !((*this) == Val); }
- /// \brief Inequality comparison
+ /// Inequality comparison
///
/// Compares this APInt with RHS for the validity of the inequality
/// relationship.
@@ -1155,7 +1169,7 @@ public:
/// \returns true if *this != Val
bool ne(const APInt &RHS) const { return !((*this) == RHS); }
- /// \brief Unsigned less than comparison
+ /// Unsigned less than comparison
///
/// Regards both *this and RHS as unsigned quantities and compares them for
/// the validity of the less-than relationship.
@@ -1163,7 +1177,7 @@ public:
/// \returns true if *this < RHS when both are considered unsigned.
bool ult(const APInt &RHS) const { return compare(RHS) < 0; }
- /// \brief Unsigned less than comparison
+ /// Unsigned less than comparison
///
/// Regards *this as an unsigned quantity and compares it with RHS for
/// the validity of the less-than relationship.
@@ -1174,7 +1188,7 @@ public:
return (isSingleWord() || getActiveBits() <= 64) && getZExtValue() < RHS;
}
- /// \brief Signed less than comparison
+ /// Signed less than comparison
///
/// Regards both *this and RHS as signed quantities and compares them for
/// validity of the less-than relationship.
@@ -1182,7 +1196,7 @@ public:
/// \returns true if *this < RHS when both are considered signed.
bool slt(const APInt &RHS) const { return compareSigned(RHS) < 0; }
- /// \brief Signed less than comparison
+ /// Signed less than comparison
///
/// Regards *this as a signed quantity and compares it with RHS for
/// the validity of the less-than relationship.
@@ -1193,7 +1207,7 @@ public:
: getSExtValue() < RHS;
}
- /// \brief Unsigned less or equal comparison
+ /// Unsigned less or equal comparison
///
/// Regards both *this and RHS as unsigned quantities and compares them for
/// validity of the less-or-equal relationship.
@@ -1201,7 +1215,7 @@ public:
/// \returns true if *this <= RHS when both are considered unsigned.
bool ule(const APInt &RHS) const { return compare(RHS) <= 0; }
- /// \brief Unsigned less or equal comparison
+ /// Unsigned less or equal comparison
///
/// Regards *this as an unsigned quantity and compares it with RHS for
/// the validity of the less-or-equal relationship.
@@ -1209,7 +1223,7 @@ public:
/// \returns true if *this <= RHS when considered unsigned.
bool ule(uint64_t RHS) const { return !ugt(RHS); }
- /// \brief Signed less or equal comparison
+ /// Signed less or equal comparison
///
/// Regards both *this and RHS as signed quantities and compares them for
/// validity of the less-or-equal relationship.
@@ -1217,7 +1231,7 @@ public:
/// \returns true if *this <= RHS when both are considered signed.
bool sle(const APInt &RHS) const { return compareSigned(RHS) <= 0; }
- /// \brief Signed less or equal comparison
+ /// Signed less or equal comparison
///
/// Regards *this as a signed quantity and compares it with RHS for the
/// validity of the less-or-equal relationship.
@@ -1225,7 +1239,7 @@ public:
/// \returns true if *this <= RHS when considered signed.
bool sle(uint64_t RHS) const { return !sgt(RHS); }
- /// \brief Unsigned greather than comparison
+ /// Unsigned greater than comparison
///
/// Regards both *this and RHS as unsigned quantities and compares them for
/// the validity of the greater-than relationship.
@@ -1233,7 +1247,7 @@ public:
/// \returns true if *this > RHS when both are considered unsigned.
bool ugt(const APInt &RHS) const { return !ule(RHS); }
- /// \brief Unsigned greater than comparison
+ /// Unsigned greater than comparison
///
/// Regards *this as an unsigned quantity and compares it with RHS for
/// the validity of the greater-than relationship.
@@ -1244,7 +1258,7 @@ public:
return (!isSingleWord() && getActiveBits() > 64) || getZExtValue() > RHS;
}
- /// \brief Signed greather than comparison
+ /// Signed greater than comparison
///
/// Regards both *this and RHS as signed quantities and compares them for the
/// validity of the greater-than relationship.
@@ -1252,7 +1266,7 @@ public:
/// \returns true if *this > RHS when both are considered signed.
bool sgt(const APInt &RHS) const { return !sle(RHS); }
- /// \brief Signed greater than comparison
+ /// Signed greater than comparison
///
/// Regards *this as a signed quantity and compares it with RHS for
/// the validity of the greater-than relationship.
@@ -1263,7 +1277,7 @@ public:
: getSExtValue() > RHS;
}
- /// \brief Unsigned greater or equal comparison
+ /// Unsigned greater or equal comparison
///
/// Regards both *this and RHS as unsigned quantities and compares them for
/// validity of the greater-or-equal relationship.
@@ -1271,7 +1285,7 @@ public:
/// \returns true if *this >= RHS when both are considered unsigned.
bool uge(const APInt &RHS) const { return !ult(RHS); }
- /// \brief Unsigned greater or equal comparison
+ /// Unsigned greater or equal comparison
///
/// Regards *this as an unsigned quantity and compares it with RHS for
/// the validity of the greater-or-equal relationship.
@@ -1279,7 +1293,7 @@ public:
/// \returns true if *this >= RHS when considered unsigned.
bool uge(uint64_t RHS) const { return !ult(RHS); }
- /// \brief Signed greather or equal comparison
+ /// Signed greater or equal comparison
///
/// Regards both *this and RHS as signed quantities and compares them for
/// validity of the greater-or-equal relationship.
@@ -1287,7 +1301,7 @@ public:
/// \returns true if *this >= RHS when both are considered signed.
bool sge(const APInt &RHS) const { return !slt(RHS); }
- /// \brief Signed greater or equal comparison
+ /// Signed greater or equal comparison
///
/// Regards *this as a signed quantity and compares it with RHS for
/// the validity of the greater-or-equal relationship.
@@ -1316,13 +1330,13 @@ public:
/// \name Resizing Operators
/// @{
- /// \brief Truncate to new width.
+ /// Truncate to new width.
///
/// Truncate the APInt to a specified width. It is an error to specify a width
/// that is greater than or equal to the current width.
APInt trunc(unsigned width) const;
- /// \brief Sign extend to a new width.
+ /// Sign extend to a new width.
///
/// This operation sign extends the APInt to a new width. If the high order
/// bit is set, the fill on the left will be done with 1 bits, otherwise zero.
@@ -1330,32 +1344,32 @@ public:
/// current width.
APInt sext(unsigned width) const;
- /// \brief Zero extend to a new width.
+ /// Zero extend to a new width.
///
/// This operation zero extends the APInt to a new width. The high order bits
/// are filled with 0 bits. It is an error to specify a width that is less
/// than or equal to the current width.
APInt zext(unsigned width) const;
- /// \brief Sign extend or truncate to width
+ /// Sign extend or truncate to width
///
/// Make this APInt have the bit width given by \p width. The value is sign
/// extended, truncated, or left alone to make it that width.
APInt sextOrTrunc(unsigned width) const;
- /// \brief Zero extend or truncate to width
+ /// Zero extend or truncate to width
///
/// Make this APInt have the bit width given by \p width. The value is zero
/// extended, truncated, or left alone to make it that width.
APInt zextOrTrunc(unsigned width) const;
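// Sketch of the resizing behaviour described above:
APInt V(32, 0x12345678);
APInt T = V.trunc(16);        // 0x5678
APInt Z = V.zext(64);         // 0x0000000012345678
APInt W = V.zextOrTrunc(16);  // truncates here, since 16 < 32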
- /// \brief Sign extend or truncate to width
+ /// Sign extend or truncate to width
///
/// Make this APInt have the bit width given by \p width. The value is sign
/// extended, or left alone to make it that width.
APInt sextOrSelf(unsigned width) const;
- /// \brief Zero extend or truncate to width
+ /// Zero extend or truncate to width
///
/// Make this APInt have the bit width given by \p width. The value is zero
/// extended, or left alone to make it that width.
@@ -1365,7 +1379,7 @@ public:
/// \name Bit Manipulation Operators
/// @{
- /// \brief Set every bit to 1.
+ /// Set every bit to 1.
void setAllBits() {
if (isSingleWord())
U.VAL = WORD_MAX;
@@ -1376,7 +1390,7 @@ public:
clearUnusedBits();
}
- /// \brief Set a given bit to 1.
+ /// Set a given bit to 1.
///
/// Set the bit at the position given by "bitPosition" to 1.
void setBit(unsigned BitPosition) {
@@ -1427,7 +1441,7 @@ public:
return setBits(BitWidth - hiBits, BitWidth);
}
- /// \brief Set every bit to 0.
+ /// Set every bit to 0.
void clearAllBits() {
if (isSingleWord())
U.VAL = 0;
@@ -1435,7 +1449,7 @@ public:
memset(U.pVal, 0, getNumWords() * APINT_WORD_SIZE);
}
- /// \brief Set a given bit to 0.
+ /// Set a given bit to 0.
///
/// Set the bit at the position given by "bitPosition" to 0.
void clearBit(unsigned BitPosition) {
@@ -1452,7 +1466,7 @@ public:
clearBit(BitWidth - 1);
}
- /// \brief Toggle every bit to its opposite value.
+ /// Toggle every bit to its opposite value.
void flipAllBits() {
if (isSingleWord()) {
U.VAL ^= WORD_MAX;
@@ -1462,7 +1476,7 @@ public:
}
}
- /// \brief Toggles a given bit to its opposite value.
+ /// Toggles a given bit to its opposite value.
///
/// Toggle the bit at the position given by "bitPosition" to its opposite
/// value.
@@ -1484,17 +1498,17 @@ public:
/// \name Value Characterization Functions
/// @{
- /// \brief Return the number of bits in the APInt.
+ /// Return the number of bits in the APInt.
unsigned getBitWidth() const { return BitWidth; }
- /// \brief Get the number of words.
+ /// Get the number of words.
///
/// Here one word's bitwidth equals that of uint64_t.
///
/// \returns the number of words to hold the integer value of this APInt.
unsigned getNumWords() const { return getNumWords(BitWidth); }
- /// \brief Get the number of words.
+ /// Get the number of words.
///
/// *NOTE* Here one word's bitwidth equals that of uint64_t.
///
@@ -1504,14 +1518,14 @@ public:
return ((uint64_t)BitWidth + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD;
}
- /// \brief Compute the number of active bits in the value
+ /// Compute the number of active bits in the value
///
/// This function returns the number of active bits which is defined as the
/// bit width minus the number of leading zeros. This is used in several
/// computations to see how "wide" the value is.
unsigned getActiveBits() const { return BitWidth - countLeadingZeros(); }
- /// \brief Compute the number of active words in the value of this APInt.
+ /// Compute the number of active words in the value of this APInt.
///
/// This is used in conjunction with getActiveData to extract the raw value of
/// the APInt.
@@ -1520,7 +1534,7 @@ public:
return numActiveBits ? whichWord(numActiveBits - 1) + 1 : 1;
}
- /// \brief Get the minimum bit size for this signed APInt
+ /// Get the minimum bit size for this signed APInt
///
/// Computes the minimum bit width for this APInt while considering it to be a
/// signed (and probably negative) value. If the value is not negative, this
@@ -1534,7 +1548,7 @@ public:
return getActiveBits() + 1;
}
- /// \brief Get zero extended value
+ /// Get zero extended value
///
/// This method attempts to return the value of this APInt as a zero extended
/// uint64_t. The bitwidth must be <= 64 or the value must fit within a
@@ -1546,7 +1560,7 @@ public:
return U.pVal[0];
}
- /// \brief Get sign extended value
+ /// Get sign extended value
///
/// This method attempts to return the value of this APInt as a sign extended
/// int64_t. The bit width must be <= 64 or the value must fit within an
@@ -1558,13 +1572,13 @@ public:
return int64_t(U.pVal[0]);
}
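// Sketch of the width constraints above: the active bits must fit, not the bit width.
uint64_t X = APInt(128, 300).getZExtValue();                   // fine: only 9 active bits
int64_t  Y = APInt(8, -5, /*isSigned=*/true).getSExtValue();   // -5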
- /// \brief Get bits required for string value.
+ /// Get bits required for string value.
///
/// This method determines how many bits are required to hold the APInt
/// equivalent of the string given by \p str.
static unsigned getBitsNeeded(StringRef str, uint8_t radix);
- /// \brief The APInt version of the countLeadingZeros functions in
+ /// The APInt version of the countLeadingZeros functions in
/// MathExtras.h.
///
/// It counts the number of zeros from the most significant bit to the first
@@ -1580,7 +1594,7 @@ public:
return countLeadingZerosSlowCase();
}
- /// \brief Count the number of leading one bits.
+ /// Count the number of leading one bits.
///
/// This function is an APInt version of the countLeadingOnes
/// functions in MathExtras.h. It counts the number of ones from the most
@@ -1600,7 +1614,7 @@ public:
return isNegative() ? countLeadingOnes() : countLeadingZeros();
}
- /// \brief Count the number of trailing zero bits.
+ /// Count the number of trailing zero bits.
///
/// This function is an APInt version of the countTrailingZeros
/// functions in MathExtras.h. It counts the number of zeros from the least
@@ -1614,7 +1628,7 @@ public:
return countTrailingZerosSlowCase();
}
- /// \brief Count the number of trailing one bits.
+ /// Count the number of trailing one bits.
///
/// This function is an APInt version of the countTrailingOnes
/// functions in MathExtras.h. It counts the number of ones from the least
@@ -1628,7 +1642,7 @@ public:
return countTrailingOnesSlowCase();
}
- /// \brief Count the number of bits set.
+ /// Count the number of bits set.
///
/// This function is an APInt version of the countPopulation functions
/// in MathExtras.h. It counts the number of 1 bits in the APInt value.
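// Sketch of the counting helpers above, for a 16-bit value:
APInt V(16, 0x00F0);
unsigned LZ  = V.countLeadingZeros();   // 8
unsigned TZ  = V.countTrailingZeros();  // 4
unsigned Pop = V.countPopulation();     // 4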
@@ -1662,7 +1676,7 @@ public:
toString(Str, Radix, true, false);
}
- /// \brief Return the APInt as a std::string.
+ /// Return the APInt as a std::string.
///
/// Note that this is an inefficient method. It is better to pass in a
/// SmallVector/SmallString to the methods above to avoid thrashing the heap
@@ -1676,16 +1690,16 @@ public:
/// Value.
APInt reverseBits() const;
- /// \brief Converts this APInt to a double value.
+ /// Converts this APInt to a double value.
double roundToDouble(bool isSigned) const;
- /// \brief Converts this unsigned APInt to a double value.
+ /// Converts this unsigned APInt to a double value.
double roundToDouble() const { return roundToDouble(false); }
- /// \brief Converts this signed APInt to a double value.
+ /// Converts this signed APInt to a double value.
double signedRoundToDouble() const { return roundToDouble(true); }
- /// \brief Converts APInt bits to a double
+ /// Converts APInt bits to a double
///
/// The conversion does not do a translation from integer to double, it just
/// re-interprets the bits as a double. Note that it is valid to do this on
@@ -1694,7 +1708,7 @@ public:
return BitsToDouble(getWord(0));
}
- /// \brief Converts APInt bits to a double
+ /// Converts APInt bits to a double
///
/// The conversion does not do a translation from integer to float, it just
/// re-interprets the bits as a float. Note that it is valid to do this on
@@ -1703,7 +1717,7 @@ public:
return BitsToFloat(getWord(0));
}
- /// \brief Converts a double to APInt bits.
+ /// Converts a double to APInt bits.
///
/// The conversion does not do a translation from double to integer, it just
/// re-interprets the bits of the double.
@@ -1711,7 +1725,7 @@ public:
return APInt(sizeof(double) * CHAR_BIT, DoubleToBits(V));
}
- /// \brief Converts a float to APInt bits.
+ /// Converts a float to APInt bits.
///
/// The conversion does not do a translation from float to integer, it just
/// re-interprets the bits of the float.
@@ -1770,10 +1784,10 @@ public:
return logBase2();
}
- /// \brief Compute the square root
+ /// Compute the square root
APInt sqrt() const;
- /// \brief Get the absolute value;
+ /// Get the absolute value.
///
/// If *this is < 0 then return -(*this), otherwise return *this.
APInt abs() const {
@@ -1924,7 +1938,7 @@ public:
/// Set the least significant BITS and clear the rest.
static void tcSetLeastSignificantBits(WordType *, unsigned, unsigned bits);
- /// \brief debug method
+ /// debug method
void dump() const;
/// @}
@@ -1947,7 +1961,7 @@ inline bool operator==(uint64_t V1, const APInt &V2) { return V2 == V1; }
inline bool operator!=(uint64_t V1, const APInt &V2) { return V2 != V1; }
-/// \brief Unary bitwise complement operator.
+/// Unary bitwise complement operator.
///
/// \returns an APInt that is the bitwise complement of \p v.
inline APInt operator~(APInt v) {
@@ -2080,27 +2094,27 @@ inline APInt operator*(uint64_t LHS, APInt b) {
namespace APIntOps {
-/// \brief Determine the smaller of two APInts considered to be signed.
+/// Determine the smaller of two APInts considered to be signed.
inline const APInt &smin(const APInt &A, const APInt &B) {
return A.slt(B) ? A : B;
}
-/// \brief Determine the larger of two APInts considered to be signed.
+/// Determine the larger of two APInts considered to be signed.
inline const APInt &smax(const APInt &A, const APInt &B) {
return A.sgt(B) ? A : B;
}
-/// \brief Determine the smaller of two APInts considered to be signed.
+/// Determine the smaller of two APInts considered to be unsigned.
inline const APInt &umin(const APInt &A, const APInt &B) {
return A.ult(B) ? A : B;
}
-/// \brief Determine the larger of two APInts considered to be unsigned.
+/// Determine the larger of two APInts considered to be unsigned.
inline const APInt &umax(const APInt &A, const APInt &B) {
return A.ugt(B) ? A : B;
}
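// Sketch: the signed/unsigned distinction above matters once the sign bit is set.
APInt A(8, 0xFF);                            // 255 unsigned, -1 signed
APInt B(8, 1);
const APInt &SMin = APIntOps::smin(A, B);    // A: -1 < 1 when signed
const APInt &UMax = APIntOps::umax(A, B);    // A: 255 > 1 when unsigned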
-/// \brief Compute GCD of two unsigned APInt values.
+/// Compute GCD of two unsigned APInt values.
///
/// This function returns the greatest common divisor of the two APInt values
/// using Stein's algorithm.
@@ -2108,44 +2122,50 @@ inline const APInt &umax(const APInt &A, const APInt &B) {
/// \returns the greatest common divisor of A and B.
APInt GreatestCommonDivisor(APInt A, APInt B);
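// Sketch: the algorithm choice aside, the result is the ordinary unsigned GCD.
APInt G = APIntOps::GreatestCommonDivisor(APInt(32, 12), APInt(32, 18));  // 6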
-/// \brief Converts the given APInt to a double value.
+/// Converts the given APInt to a double value.
///
/// Treats the APInt as an unsigned value for conversion purposes.
inline double RoundAPIntToDouble(const APInt &APIVal) {
return APIVal.roundToDouble();
}
-/// \brief Converts the given APInt to a double value.
+/// Converts the given APInt to a double value.
///
/// Treats the APInt as a signed value for conversion purposes.
inline double RoundSignedAPIntToDouble(const APInt &APIVal) {
return APIVal.signedRoundToDouble();
}
-/// \brief Converts the given APInt to a float vlalue.
+/// Converts the given APInt to a float value.
inline float RoundAPIntToFloat(const APInt &APIVal) {
return float(RoundAPIntToDouble(APIVal));
}
-/// \brief Converts the given APInt to a float value.
+/// Converts the given APInt to a float value.
///
/// Treats the APInt as a signed value for conversion purposes.
inline float RoundSignedAPIntToFloat(const APInt &APIVal) {
return float(APIVal.signedRoundToDouble());
}
-/// \brief Converts the given double value into a APInt.
+/// Converts the given double value into an APInt.
///
/// This function converts a double value to an APInt value.
APInt RoundDoubleToAPInt(double Double, unsigned width);
-/// \brief Converts a float value into a APInt.
+/// Converts a float value into an APInt.
///
/// Converts a float value into an APInt value.
inline APInt RoundFloatToAPInt(float Float, unsigned width) {
return RoundDoubleToAPInt(double(Float), width);
}
+/// Return A divided by B (treating both as unsigned), rounded by the given
+/// rounding mode.
+APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM);
+
+/// Return A divided by B (treating both as signed), rounded by the given
+/// rounding mode.
+APInt RoundingSDiv(const APInt &A, const APInt &B, APInt::Rounding RM);
+
} // End of APIntOps namespace
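// Hedged sketch of the new rounding divisions above, assuming the APInt::Rounding
// enumeration (declared elsewhere in this header, not shown in this hunk) provides
// UP, DOWN and TOWARD_ZERO:
APInt Q = APIntOps::RoundingUDiv(APInt(32, 7), APInt(32, 2), APInt::Rounding::UP);  // 4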
// See friend declaration above. This additional declaration is required in
diff --git a/contrib/llvm/include/llvm/ADT/APSInt.h b/contrib/llvm/include/llvm/ADT/APSInt.h
index dabbf3314bd0..7ee2c4c62fce 100644
--- a/contrib/llvm/include/llvm/ADT/APSInt.h
+++ b/contrib/llvm/include/llvm/ADT/APSInt.h
@@ -72,7 +72,7 @@ public:
}
using APInt::toString;
- /// \brief Get the correctly-extended \c int64_t value.
+ /// Get the correctly-extended \c int64_t value.
int64_t getExtValue() const {
assert(getMinSignedBits() <= 64 && "Too many bits for int64_t");
return isSigned() ? getSExtValue() : getZExtValue();
@@ -279,13 +279,13 @@ public:
: APInt::getSignedMinValue(numBits), Unsigned);
}
- /// \brief Determine if two APSInts have the same value, zero- or
+ /// Determine if two APSInts have the same value, zero- or
/// sign-extending as needed.
static bool isSameValue(const APSInt &I1, const APSInt &I2) {
return !compareValues(I1, I2);
}
- /// \brief Compare underlying values of two numbers.
+ /// Compare underlying values of two numbers.
static int compareValues(const APSInt &I1, const APSInt &I2) {
if (I1.getBitWidth() == I2.getBitWidth() && I1.isSigned() == I2.isSigned())
return I1.IsUnsigned ? I1.compare(I2) : I1.compareSigned(I2);
diff --git a/contrib/llvm/include/llvm/ADT/Any.h b/contrib/llvm/include/llvm/ADT/Any.h
new file mode 100644
index 000000000000..c64c39987542
--- /dev/null
+++ b/contrib/llvm/include/llvm/ADT/Any.h
@@ -0,0 +1,150 @@
+//===- Any.h - Generic type erased holder of any type -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Any, a non-template class modeled in the spirit of
+// std::any. The idea is to provide a type-safe replacement for C's void*.
+// It can hold a value of any copy-constructible, copy-assignable type.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_ANY_H
+#define LLVM_ADT_ANY_H
+
+#include "llvm/ADT/STLExtras.h"
+
+#include <cassert>
+#include <memory>
+#include <type_traits>
+
+namespace llvm {
+
+class Any {
+ template <typename T> struct TypeId { static const char Id; };
+
+ struct StorageBase {
+ virtual ~StorageBase() = default;
+ virtual std::unique_ptr<StorageBase> clone() const = 0;
+ virtual const void *id() const = 0;
+ };
+
+ template <typename T> struct StorageImpl : public StorageBase {
+ explicit StorageImpl(const T &Value) : Value(Value) {}
+
+ explicit StorageImpl(T &&Value) : Value(std::move(Value)) {}
+
+ std::unique_ptr<StorageBase> clone() const override {
+ return llvm::make_unique<StorageImpl<T>>(Value);
+ }
+
+ const void *id() const override { return &TypeId<T>::Id; }
+
+ T Value;
+
+ private:
+ StorageImpl &operator=(const StorageImpl &Other) = delete;
+ StorageImpl(const StorageImpl &Other) = delete;
+ };
+
+public:
+ Any() = default;
+
+ Any(const Any &Other)
+ : Storage(Other.Storage ? Other.Storage->clone() : nullptr) {}
+
+ // When T is Any or T is not copy-constructible we need to explicitly disable
+ // the forwarding constructor so that the copy constructor gets selected
+ // instead.
+ template <
+ typename T,
+ typename std::enable_if<
+ llvm::conjunction<
+ llvm::negation<std::is_same<typename std::decay<T>::type, Any>>,
+ std::is_copy_constructible<typename std::decay<T>::type>>::value,
+ int>::type = 0>
+ Any(T &&Value) {
+ using U = typename std::decay<T>::type;
+ Storage = llvm::make_unique<StorageImpl<U>>(std::forward<T>(Value));
+ }
+
+ Any(Any &&Other) : Storage(std::move(Other.Storage)) {}
+
+ Any &swap(Any &Other) {
+ std::swap(Storage, Other.Storage);
+ return *this;
+ }
+
+ Any &operator=(Any Other) {
+ Storage = std::move(Other.Storage);
+ return *this;
+ }
+
+ bool hasValue() const { return !!Storage; }
+
+ void reset() { Storage.reset(); }
+
+private:
+ template <class T> friend T any_cast(const Any &Value);
+ template <class T> friend T any_cast(Any &Value);
+ template <class T> friend T any_cast(Any &&Value);
+ template <class T> friend const T *any_cast(const Any *Value);
+ template <class T> friend T *any_cast(Any *Value);
+ template <typename T> friend bool any_isa(const Any &Value);
+
+ std::unique_ptr<StorageBase> Storage;
+};
+
+template <typename T> const char Any::TypeId<T>::Id = 0;
+
+
+template <typename T> bool any_isa(const Any &Value) {
+ if (!Value.Storage)
+ return false;
+ using U =
+ typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+ return Value.Storage->id() == &Any::TypeId<U>::Id;
+}
+
+template <class T> T any_cast(const Any &Value) {
+ using U =
+ typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+ return static_cast<T>(*any_cast<U>(&Value));
+}
+
+template <class T> T any_cast(Any &Value) {
+ using U =
+ typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+ return static_cast<T>(*any_cast<U>(&Value));
+}
+
+template <class T> T any_cast(Any &&Value) {
+ using U =
+ typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+ return static_cast<T>(std::move(*any_cast<U>(&Value)));
+}
+
+template <class T> const T *any_cast(const Any *Value) {
+ using U =
+ typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+ assert(Value && any_isa<T>(*Value) && "Bad any cast!");
+ if (!Value || !any_isa<U>(*Value))
+ return nullptr;
+ return &static_cast<Any::StorageImpl<U> &>(*Value->Storage).Value;
+}
+
+template <class T> T *any_cast(Any *Value) {
+ using U = typename std::decay<T>::type;
+ assert(Value && any_isa<U>(*Value) && "Bad any cast!");
+ if (!Value || !any_isa<U>(*Value))
+ return nullptr;
+ return &static_cast<Any::StorageImpl<U> &>(*Value->Storage).Value;
+}
+
+} // end namespace llvm
+
+#endif // LLVM_ADT_ANY_H
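// A minimal usage sketch of the new header above, based on the any_cast/any_isa
// templates it defines:
llvm::Any Holder = 3.14;                      // stores a double by value
if (llvm::any_isa<double>(Holder))
  double D = llvm::any_cast<double>(Holder);  // 3.14
llvm::Any Copy = Holder;                      // deep-copies via StorageBase::clone()
double *P = llvm::any_cast<double>(&Copy);    // pointer form: nullptr on type mismatch
                                              // (asserts first in builds with assertions)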
diff --git a/contrib/llvm/include/llvm/ADT/ArrayRef.h b/contrib/llvm/include/llvm/ADT/ArrayRef.h
index 5f7a769ddac4..9cb25b09c6cb 100644
--- a/contrib/llvm/include/llvm/ADT/ArrayRef.h
+++ b/contrib/llvm/include/llvm/ADT/ArrayRef.h
@@ -184,51 +184,51 @@ namespace llvm {
/// slice(n) - Chop off the first N elements of the array.
ArrayRef<T> slice(size_t N) const { return slice(N, size() - N); }
- /// \brief Drop the first \p N elements of the array.
+ /// Drop the first \p N elements of the array.
ArrayRef<T> drop_front(size_t N = 1) const {
assert(size() >= N && "Dropping more elements than exist");
return slice(N, size() - N);
}
- /// \brief Drop the last \p N elements of the array.
+ /// Drop the last \p N elements of the array.
ArrayRef<T> drop_back(size_t N = 1) const {
assert(size() >= N && "Dropping more elements than exist");
return slice(0, size() - N);
}
- /// \brief Return a copy of *this with the first N elements satisfying the
+ /// Return a copy of *this with the first N elements satisfying the
/// given predicate removed.
template <class PredicateT> ArrayRef<T> drop_while(PredicateT Pred) const {
return ArrayRef<T>(find_if_not(*this, Pred), end());
}
- /// \brief Return a copy of *this with the first N elements not satisfying
+ /// Return a copy of *this with the first N elements not satisfying
/// the given predicate removed.
template <class PredicateT> ArrayRef<T> drop_until(PredicateT Pred) const {
return ArrayRef<T>(find_if(*this, Pred), end());
}
- /// \brief Return a copy of *this with only the first \p N elements.
+ /// Return a copy of *this with only the first \p N elements.
ArrayRef<T> take_front(size_t N = 1) const {
if (N >= size())
return *this;
return drop_back(size() - N);
}
- /// \brief Return a copy of *this with only the last \p N elements.
+ /// Return a copy of *this with only the last \p N elements.
ArrayRef<T> take_back(size_t N = 1) const {
if (N >= size())
return *this;
return drop_front(size() - N);
}
- /// \brief Return the first N elements of this Array that satisfy the given
+ /// Return the first N elements of this Array that satisfy the given
/// predicate.
template <class PredicateT> ArrayRef<T> take_while(PredicateT Pred) const {
return ArrayRef<T>(begin(), find_if_not(*this, Pred));
}
- /// \brief Return the first N elements of this Array that don't satisfy the
+ /// Return the first N elements of this Array that don't satisfy the
/// given predicate.
template <class PredicateT> ArrayRef<T> take_until(PredicateT Pred) const {
return ArrayRef<T>(begin(), find_if(*this, Pred));
@@ -358,7 +358,7 @@ namespace llvm {
return slice(N, this->size() - N);
}
- /// \brief Drop the first \p N elements of the array.
+ /// Drop the first \p N elements of the array.
MutableArrayRef<T> drop_front(size_t N = 1) const {
assert(this->size() >= N && "Dropping more elements than exist");
return slice(N, this->size() - N);
@@ -369,42 +369,42 @@ namespace llvm {
return slice(0, this->size() - N);
}
- /// \brief Return a copy of *this with the first N elements satisfying the
+ /// Return a copy of *this with the first N elements satisfying the
/// given predicate removed.
template <class PredicateT>
MutableArrayRef<T> drop_while(PredicateT Pred) const {
return MutableArrayRef<T>(find_if_not(*this, Pred), end());
}
- /// \brief Return a copy of *this with the first N elements not satisfying
+ /// Return a copy of *this with the first N elements not satisfying
/// the given predicate removed.
template <class PredicateT>
MutableArrayRef<T> drop_until(PredicateT Pred) const {
return MutableArrayRef<T>(find_if(*this, Pred), end());
}
- /// \brief Return a copy of *this with only the first \p N elements.
+ /// Return a copy of *this with only the first \p N elements.
MutableArrayRef<T> take_front(size_t N = 1) const {
if (N >= this->size())
return *this;
return drop_back(this->size() - N);
}
- /// \brief Return a copy of *this with only the last \p N elements.
+ /// Return a copy of *this with only the last \p N elements.
MutableArrayRef<T> take_back(size_t N = 1) const {
if (N >= this->size())
return *this;
return drop_front(this->size() - N);
}
- /// \brief Return the first N elements of this Array that satisfy the given
+ /// Return the first N elements of this Array that satisfy the given
/// predicate.
template <class PredicateT>
MutableArrayRef<T> take_while(PredicateT Pred) const {
return MutableArrayRef<T>(begin(), find_if_not(*this, Pred));
}
- /// \brief Return the first N elements of this Array that don't satisfy the
+ /// Return the first N elements of this Array that don't satisfy the
/// given predicate.
template <class PredicateT>
MutableArrayRef<T> take_until(PredicateT Pred) const {
diff --git a/contrib/llvm/include/llvm/ADT/BitVector.h b/contrib/llvm/include/llvm/ADT/BitVector.h
index 99147fec4d4c..438c7d84c581 100644
--- a/contrib/llvm/include/llvm/ADT/BitVector.h
+++ b/contrib/llvm/include/llvm/ADT/BitVector.h
@@ -779,7 +779,7 @@ public:
}
private:
- /// \brief Perform a logical left shift of \p Count words by moving everything
+ /// Perform a logical left shift of \p Count words by moving everything
/// \p Count words to the right in memory.
///
/// While confusing, words are stored from least significant at Bits[0] to
@@ -810,7 +810,7 @@ private:
clear_unused_bits();
}
- /// \brief Perform a logical right shift of \p Count words by moving those
+ /// Perform a logical right shift of \p Count words by moving those
/// words to the left in memory. See wordShl for more information.
///
void wordShr(uint32_t Count) {
@@ -828,7 +828,8 @@ private:
}
MutableArrayRef<BitWord> allocate(size_t NumWords) {
- BitWord *RawBits = (BitWord *)std::malloc(NumWords * sizeof(BitWord));
+ BitWord *RawBits = static_cast<BitWord *>(
+ safe_malloc(NumWords * sizeof(BitWord)));
return MutableArrayRef<BitWord>(RawBits, NumWords);
}
@@ -867,8 +868,8 @@ private:
void grow(unsigned NewSize) {
size_t NewCapacity = std::max<size_t>(NumBitWords(NewSize), Bits.size() * 2);
assert(NewCapacity > 0 && "realloc-ing zero space");
- BitWord *NewBits =
- (BitWord *)std::realloc(Bits.data(), NewCapacity * sizeof(BitWord));
+ BitWord *NewBits = static_cast<BitWord *>(
+ safe_realloc(Bits.data(), NewCapacity * sizeof(BitWord)));
Bits = MutableArrayRef<BitWord>(NewBits, NewCapacity);
clear_unused_bits();
}
diff --git a/contrib/llvm/include/llvm/ADT/CachedHashString.h b/contrib/llvm/include/llvm/ADT/CachedHashString.h
index a56a6213a073..d8f0e7afdd49 100644
--- a/contrib/llvm/include/llvm/ADT/CachedHashString.h
+++ b/contrib/llvm/include/llvm/ADT/CachedHashString.h
@@ -43,6 +43,7 @@ public:
}
StringRef val() const { return StringRef(P, Size); }
+ const char *data() const { return P; }
uint32_t size() const { return Size; }
uint32_t hash() const { return Hash; }
};
diff --git a/contrib/llvm/include/llvm/ADT/DenseMapInfo.h b/contrib/llvm/include/llvm/ADT/DenseMapInfo.h
index a96904c7dbbf..5d12b424fb37 100644
--- a/contrib/llvm/include/llvm/ADT/DenseMapInfo.h
+++ b/contrib/llvm/include/llvm/ADT/DenseMapInfo.h
@@ -262,6 +262,13 @@ template <typename T> struct DenseMapInfo<ArrayRef<T>> {
}
};
+template <> struct DenseMapInfo<hash_code> {
+ static inline hash_code getEmptyKey() { return hash_code(-1); }
+ static inline hash_code getTombstoneKey() { return hash_code(-2); }
+ static unsigned getHashValue(hash_code val) { return val; }
+ static bool isEqual(hash_code LHS, hash_code RHS) { return LHS == RHS; }
+};
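// Sketch: this specialization lets hash_code key a DenseMap/DenseSet directly
// (hash_code(-1) and hash_code(-2) are reserved above as the empty/tombstone keys).
llvm::DenseSet<llvm::hash_code> Seen;           // requires llvm/ADT/DenseSet.h
Seen.insert(llvm::hash_value(42));              // any type with a hash_value overload works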
+
} // end namespace llvm
#endif // LLVM_ADT_DENSEMAPINFO_H
diff --git a/contrib/llvm/include/llvm/ADT/DepthFirstIterator.h b/contrib/llvm/include/llvm/ADT/DepthFirstIterator.h
index e964d7fa2391..1f3766d3c9de 100644
--- a/contrib/llvm/include/llvm/ADT/DepthFirstIterator.h
+++ b/contrib/llvm/include/llvm/ADT/DepthFirstIterator.h
@@ -177,7 +177,7 @@ public:
return *this;
}
- /// \brief Skips all children of the current node and traverses to next node
+ /// Skips all children of the current node and traverses to next node
///
/// Note: This function takes care of incrementing the iterator. If you
/// always increment and call this function, you risk walking off the end.
diff --git a/contrib/llvm/include/llvm/ADT/EpochTracker.h b/contrib/llvm/include/llvm/ADT/EpochTracker.h
index db39ba4e0c50..49ef192364e8 100644
--- a/contrib/llvm/include/llvm/ADT/EpochTracker.h
+++ b/contrib/llvm/include/llvm/ADT/EpochTracker.h
@@ -17,7 +17,6 @@
#define LLVM_ADT_EPOCH_TRACKER_H
#include "llvm/Config/abi-breaking.h"
-#include "llvm/Config/llvm-config.h"
#include <cstdint>
@@ -25,7 +24,7 @@ namespace llvm {
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
-/// \brief A base class for data structure classes wishing to make iterators
+/// A base class for data structure classes wishing to make iterators
/// ("handles") pointing into themselves fail-fast. When building without
/// asserts, this class is empty and does nothing.
///
@@ -40,15 +39,15 @@ class DebugEpochBase {
public:
DebugEpochBase() : Epoch(0) {}
- /// \brief Calling incrementEpoch invalidates all handles pointing into the
+ /// Calling incrementEpoch invalidates all handles pointing into the
/// calling instance.
void incrementEpoch() { ++Epoch; }
- /// \brief The destructor calls incrementEpoch to make use-after-free bugs
+ /// The destructor calls incrementEpoch to make use-after-free bugs
/// more likely to crash deterministically.
~DebugEpochBase() { incrementEpoch(); }
- /// \brief A base class for iterator classes ("handles") that wish to poll for
+ /// A base class for iterator classes ("handles") that wish to poll for
/// iterator invalidating modifications in the underlying data structure.
/// When LLVM is built without asserts, this class is empty and does nothing.
///
@@ -66,12 +65,12 @@ public:
explicit HandleBase(const DebugEpochBase *Parent)
: EpochAddress(&Parent->Epoch), EpochAtCreation(Parent->Epoch) {}
- /// \brief Returns true if the DebugEpochBase this Handle is linked to has
+ /// Returns true if the DebugEpochBase this Handle is linked to has
/// not called incrementEpoch on itself since the creation of this
/// HandleBase instance.
bool isHandleInSync() const { return *EpochAddress == EpochAtCreation; }
- /// \brief Returns a pointer to the epoch word stored in the data structure
+ /// Returns a pointer to the epoch word stored in the data structure
/// this handle points into. Can be used to check if two iterators point
/// into the same data structure.
const void *getEpochAddress() const { return EpochAddress; }
diff --git a/contrib/llvm/include/llvm/ADT/FunctionExtras.h b/contrib/llvm/include/llvm/ADT/FunctionExtras.h
new file mode 100644
index 000000000000..2b75dc6ac219
--- /dev/null
+++ b/contrib/llvm/include/llvm/ADT/FunctionExtras.h
@@ -0,0 +1,293 @@
+//===- FunctionExtras.h - Function type erasure utilities -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides a collection of function (or more generally, callable)
+/// type erasure utilities supplementing those provided by the standard library
+/// in `<functional>`.
+///
+/// It provides `unique_function`, which works like `std::function` but supports
+/// move-only callable objects.
+///
+/// Future plans:
+/// - Add a `function` that provides const, volatile, and ref-qualified support,
+/// which doesn't work with `std::function`.
+/// - Provide support for specifying multiple signatures to type erase callable
+/// objects with an overload set, such as those produced by generic lambdas.
+/// - Expand to include a copyable utility that directly replaces std::function
+/// but brings the above improvements.
+///
+/// Note that LLVM's utilities are greatly simplified by not supporting
+/// allocators.
+///
+/// If the standard library ever begins to provide comparable facilities we can
+/// consider switching to those.
+///
+//===----------------------------------------------------------------------===//
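// A usage sketch of what the comment above promises, assuming the constructors and
// call operator defined further down in this file:
struct Counter {
  std::unique_ptr<int> N{new int(0)};  // move-only member: std::function could not hold this
  int operator()() { return ++*N; }
};
unique_function<int()> Tick = Counter();          // the callable is moved into inline storage
int First = Tick();                               // 1
unique_function<int()> Stolen = std::move(Tick);  // moves, never copies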
+
+#ifndef LLVM_ADT_FUNCTION_EXTRAS_H
+#define LLVM_ADT_FUNCTION_EXTRAS_H
+
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/Support/type_traits.h"
+#include <memory>
+
+namespace llvm {
+
+template <typename FunctionT> class unique_function;
+
+template <typename ReturnT, typename... ParamTs>
+class unique_function<ReturnT(ParamTs...)> {
+ static constexpr size_t InlineStorageSize = sizeof(void *) * 3;
+
+ // MSVC has a bug and ICEs if we give it a particular dependent value
+ // expression as part of the `std::conditional` below. To work around this,
+ // we build that into a template struct's constexpr bool.
+ template <typename T> struct IsSizeLessThanThresholdT {
+ static constexpr bool value = sizeof(T) <= (2 * sizeof(void *));
+ };
+
+ // Provide a type function to map parameters that won't observe extra copies
+ // or moves and which are small enough to likely pass in register to values
+ // and all other types to l-value reference types. We use this to compute the
+ // types used in our erased call utility to minimize copies and moves unless
+ // doing so would force things unnecessarily into memory.
+ //
+ // The heuristic used is related to common ABI register passing conventions.
+ // It doesn't have to be exact though, and in one way it is more strict
+ // because we want to still be able to observe either moves *or* copies.
+ template <typename T>
+ using AdjustedParamT = typename std::conditional<
+ !std::is_reference<T>::value &&
+ llvm::is_trivially_copy_constructible<T>::value &&
+ llvm::is_trivially_move_constructible<T>::value &&
+ IsSizeLessThanThresholdT<T>::value,
+ T, T &>::type;
+
+ // The type of the erased function pointer we use as a callback to dispatch to
+ // the stored callable when it is trivial to move and destroy.
+ using CallPtrT = ReturnT (*)(void *CallableAddr,
+ AdjustedParamT<ParamTs>... Params);
+ using MovePtrT = void (*)(void *LHSCallableAddr, void *RHSCallableAddr);
+ using DestroyPtrT = void (*)(void *CallableAddr);
+
+ /// A struct to hold a single trivial callback with sufficient alignment for
+ /// our bitpacking.
+ struct alignas(8) TrivialCallback {
+ CallPtrT CallPtr;
+ };
+
+ /// A struct we use to aggregate three callbacks when we need the full set of
+ /// operations.
+ struct alignas(8) NonTrivialCallbacks {
+ CallPtrT CallPtr;
+ MovePtrT MovePtr;
+ DestroyPtrT DestroyPtr;
+ };
+
+ // Create a pointer union between either a pointer to a static trivial call
+ // pointer in a struct or a pointer to a static struct of the call, move, and
+ // destroy pointers.
+ using CallbackPointerUnionT =
+ PointerUnion<TrivialCallback *, NonTrivialCallbacks *>;
+
+ // The main storage buffer. This will either have a pointer to out-of-line
+ // storage or an inline buffer storing the callable.
+ union StorageUnionT {
+ // For out-of-line storage we keep a pointer to the underlying storage and
+ // the size. This is enough to deallocate the memory.
+ struct OutOfLineStorageT {
+ void *StoragePtr;
+ size_t Size;
+ size_t Alignment;
+ } OutOfLineStorage;
+ static_assert(
+ sizeof(OutOfLineStorageT) <= InlineStorageSize,
+ "Should always use all of the out-of-line storage for inline storage!");
+
+ // For in-line storage, we just provide an aligned character buffer. We
+ // provide three pointers worth of storage here.
+ typename std::aligned_storage<InlineStorageSize, alignof(void *)>::type
+ InlineStorage;
+ } StorageUnion;
+
+ // A compressed pointer to either our dispatching callback or our table of
+ // dispatching callbacks and the flag for whether the callable itself is
+ // stored inline or not.
+ PointerIntPair<CallbackPointerUnionT, 1, bool> CallbackAndInlineFlag;
+
+ bool isInlineStorage() const { return CallbackAndInlineFlag.getInt(); }
+
+ bool isTrivialCallback() const {
+ return CallbackAndInlineFlag.getPointer().template is<TrivialCallback *>();
+ }
+
+ CallPtrT getTrivialCallback() const {
+ return CallbackAndInlineFlag.getPointer().template get<TrivialCallback *>()->CallPtr;
+ }
+
+ NonTrivialCallbacks *getNonTrivialCallbacks() const {
+ return CallbackAndInlineFlag.getPointer()
+ .template get<NonTrivialCallbacks *>();
+ }
+
+ void *getInlineStorage() { return &StorageUnion.InlineStorage; }
+
+ void *getOutOfLineStorage() {
+ return StorageUnion.OutOfLineStorage.StoragePtr;
+ }
+ size_t getOutOfLineStorageSize() const {
+ return StorageUnion.OutOfLineStorage.Size;
+ }
+ size_t getOutOfLineStorageAlignment() const {
+ return StorageUnion.OutOfLineStorage.Alignment;
+ }
+
+ void setOutOfLineStorage(void *Ptr, size_t Size, size_t Alignment) {
+ StorageUnion.OutOfLineStorage = {Ptr, Size, Alignment};
+ }
+
+ template <typename CallableT>
+ static ReturnT CallImpl(void *CallableAddr, AdjustedParamT<ParamTs>... Params) {
+ return (*reinterpret_cast<CallableT *>(CallableAddr))(
+ std::forward<ParamTs>(Params)...);
+ }
+
+ template <typename CallableT>
+ static void MoveImpl(void *LHSCallableAddr, void *RHSCallableAddr) noexcept {
+ new (LHSCallableAddr)
+ CallableT(std::move(*reinterpret_cast<CallableT *>(RHSCallableAddr)));
+ }
+
+ template <typename CallableT>
+ static void DestroyImpl(void *CallableAddr) noexcept {
+ reinterpret_cast<CallableT *>(CallableAddr)->~CallableT();
+ }
+
+public:
+ unique_function() = default;
+ unique_function(std::nullptr_t /*null_callable*/) {}
+
+ ~unique_function() {
+ if (!CallbackAndInlineFlag.getPointer())
+ return;
+
+ // Cache this value so we don't re-check it after type-erased operations.
+ bool IsInlineStorage = isInlineStorage();
+
+ if (!isTrivialCallback())
+ getNonTrivialCallbacks()->DestroyPtr(
+ IsInlineStorage ? getInlineStorage() : getOutOfLineStorage());
+
+ if (!IsInlineStorage)
+ deallocate_buffer(getOutOfLineStorage(), getOutOfLineStorageSize(),
+ getOutOfLineStorageAlignment());
+ }
+
+ unique_function(unique_function &&RHS) noexcept {
+ // Copy the callback and inline flag.
+ CallbackAndInlineFlag = RHS.CallbackAndInlineFlag;
+
+ // If the RHS is empty, just copying the above is sufficient.
+ if (!RHS)
+ return;
+
+ if (!isInlineStorage()) {
+ // The out-of-line case is easiest to move.
+ StorageUnion.OutOfLineStorage = RHS.StorageUnion.OutOfLineStorage;
+ } else if (isTrivialCallback()) {
+ // Move is trivial, just memcpy the bytes across.
+ memcpy(getInlineStorage(), RHS.getInlineStorage(), InlineStorageSize);
+ } else {
+ // Non-trivial move, so dispatch to a type-erased implementation.
+ getNonTrivialCallbacks()->MovePtr(getInlineStorage(),
+ RHS.getInlineStorage());
+ }
+
+ // Clear the old callback and inline flag to get back to as-if-null.
+ RHS.CallbackAndInlineFlag = {};
+
+#ifndef NDEBUG
+ // In debug builds, we also scribble across the rest of the storage.
+ memset(RHS.getInlineStorage(), 0xAD, InlineStorageSize);
+#endif
+ }
+
+ unique_function &operator=(unique_function &&RHS) noexcept {
+ if (this == &RHS)
+ return *this;
+
+ // Because we don't try to provide any exception safety guarantees we can
+ // implement move assignment very simply by first destroying the current
+ // object and then move-constructing over top of it.
+ this->~unique_function();
+ new (this) unique_function(std::move(RHS));
+ return *this;
+ }
+
+ template <typename CallableT> unique_function(CallableT Callable) {
+ bool IsInlineStorage = true;
+ void *CallableAddr = getInlineStorage();
+ if (sizeof(CallableT) > InlineStorageSize ||
+ alignof(CallableT) > alignof(decltype(StorageUnion.InlineStorage))) {
+ IsInlineStorage = false;
+ // Allocate out-of-line storage. FIXME: Use an explicit alignment
+ // parameter in C++17 mode.
+ auto Size = sizeof(CallableT);
+ auto Alignment = alignof(CallableT);
+ CallableAddr = allocate_buffer(Size, Alignment);
+ setOutOfLineStorage(CallableAddr, Size, Alignment);
+ }
+
+ // Now move into the storage.
+ new (CallableAddr) CallableT(std::move(Callable));
+
+ // See if we can create a trivial callback. We need the callable to be
+ // trivially moved and trivially destroyed so that we don't have to store
+ // type erased callbacks for those operations.
+ //
+ // FIXME: We should use constexpr if here and below to avoid instantiating
+ // the non-trivial static objects when unnecessary. While the linker should
+ // remove them, it is still wasteful.
+ if (llvm::is_trivially_move_constructible<CallableT>::value &&
+ std::is_trivially_destructible<CallableT>::value) {
+ // We need to create a nicely aligned object. We use a static variable
+ // for this because it is a trivial struct.
+ static TrivialCallback Callback = { &CallImpl<CallableT> };
+
+ CallbackAndInlineFlag = {&Callback, IsInlineStorage};
+ return;
+ }
+
+ // Otherwise, we need to point at an object that contains all the different
+ // type erased behaviors needed. Create a static instance of the struct type
+ // here and then use a pointer to that.
+ static NonTrivialCallbacks Callbacks = {
+ &CallImpl<CallableT>, &MoveImpl<CallableT>, &DestroyImpl<CallableT>};
+
+ CallbackAndInlineFlag = {&Callbacks, IsInlineStorage};
+ }
+
+ ReturnT operator()(ParamTs... Params) {
+ void *CallableAddr =
+ isInlineStorage() ? getInlineStorage() : getOutOfLineStorage();
+
+ return (isTrivialCallback()
+ ? getTrivialCallback()
+ : getNonTrivialCallbacks()->CallPtr)(CallableAddr, Params...);
+ }
+
+ explicit operator bool() const {
+ return (bool)CallbackAndInlineFlag.getPointer();
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_ADT_FUNCTION_H
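As a quick illustration of the unique_function interface added above (a sketch, not part of the merged sources; it assumes the header is usable as llvm/ADT/FunctionExtras.h and a C++14 init-capture), the key property is that move-only callables can be stored:

// Illustrative sketch only.
#include "llvm/ADT/FunctionExtras.h"
#include <memory>
#include <utility>

int runOnce() {
  auto Payload = std::make_unique<int>(42);
  // The lambda owns a std::unique_ptr, so it is move-only and could not be
  // held by a std::function.
  llvm::unique_function<int()> Fn = [P = std::move(Payload)] { return *P; };

  // unique_function itself is movable but not copyable, matching the class
  // definition above.
  llvm::unique_function<int()> Fn2 = std::move(Fn);
  return Fn2(); // 42
}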
diff --git a/contrib/llvm/include/llvm/ADT/GraphTraits.h b/contrib/llvm/include/llvm/ADT/GraphTraits.h
index 225d9eb847f0..27c647f4bbbd 100644
--- a/contrib/llvm/include/llvm/ADT/GraphTraits.h
+++ b/contrib/llvm/include/llvm/ADT/GraphTraits.h
@@ -47,6 +47,19 @@ struct GraphTraits {
// static nodes_iterator nodes_end (GraphType *G)
// nodes_iterator/begin/end - Allow iteration over all nodes in the graph
+ // typedef EdgeRef - Type of Edge token in the graph, which should
+ // be cheap to copy.
  // typedef ChildEdgeIteratorType - Type used to iterate over child edges in
  //                             the graph; dereferences to an EdgeRef.
+
+ // static ChildEdgeIteratorType child_edge_begin(NodeRef)
+ // static ChildEdgeIteratorType child_edge_end(NodeRef)
  //     Return iterators that point to the beginning and end of the
  //     edge list for the given graph node.
+ //
+ // static NodeRef edge_dest(EdgeRef)
+ // Return the destination node of an edge.
+
// static unsigned size (GraphType *G)
// Return total number of nodes in the graph
@@ -111,6 +124,13 @@ inverse_children(const typename GraphTraits<GraphType>::NodeRef &G) {
GraphTraits<Inverse<GraphType>>::child_end(G));
}
+template <class GraphType>
+iterator_range<typename GraphTraits<GraphType>::ChildEdgeIteratorType>
+children_edges(const typename GraphTraits<GraphType>::NodeRef &G) {
+ return make_range(GraphTraits<GraphType>::child_edge_begin(G),
+ GraphTraits<GraphType>::child_edge_end(G));
+}
+
} // end namespace llvm
#endif // LLVM_ADT_GRAPHTRAITS_H
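A sketch of how the new edge hooks might be provided and consumed (hypothetical node/edge types, not from the diff; only the members that children_edges and edge_dest need are shown):

// Illustrative sketch only.
#include "llvm/ADT/GraphTraits.h"
#include <vector>

struct MyNode;
struct MyEdge { MyNode *Dest; };
struct MyNode { std::vector<MyEdge> Edges; };

namespace llvm {
template <> struct GraphTraits<MyNode *> {
  using NodeRef = MyNode *;
  using EdgeRef = const MyEdge &;
  using ChildEdgeIteratorType = std::vector<MyEdge>::const_iterator;

  static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
    return N->Edges.begin();
  }
  static ChildEdgeIteratorType child_edge_end(NodeRef N) {
    return N->Edges.end();
  }
  static NodeRef edge_dest(EdgeRef E) { return E.Dest; }
};
} // namespace llvm

int countChildren(MyNode *N) {
  int Count = 0;
  // children_edges uses the child_edge_begin/end hooks declared above.
  for (const MyEdge &E : llvm::children_edges<MyNode *>(N))
    if (llvm::GraphTraits<MyNode *>::edge_dest(E))
      ++Count;
  return Count;
}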
diff --git a/contrib/llvm/include/llvm/ADT/Hashing.h b/contrib/llvm/include/llvm/ADT/Hashing.h
index c3b574102f69..9f830baa4243 100644
--- a/contrib/llvm/include/llvm/ADT/Hashing.h
+++ b/contrib/llvm/include/llvm/ADT/Hashing.h
@@ -57,7 +57,7 @@
namespace llvm {
-/// \brief An opaque object representing a hash code.
+/// An opaque object representing a hash code.
///
/// This object represents the result of hashing some entity. It is intended to
/// be used to implement hashtables or other hashing-based data structures.
@@ -73,14 +73,14 @@ class hash_code {
size_t value;
public:
- /// \brief Default construct a hash_code.
+ /// Default construct a hash_code.
/// Note that this leaves the value uninitialized.
hash_code() = default;
- /// \brief Form a hash code directly from a numerical value.
+ /// Form a hash code directly from a numerical value.
hash_code(size_t value) : value(value) {}
- /// \brief Convert the hash code to its numerical value for use.
+ /// Convert the hash code to its numerical value for use.
/*explicit*/ operator size_t() const { return value; }
friend bool operator==(const hash_code &lhs, const hash_code &rhs) {
@@ -90,11 +90,11 @@ public:
return lhs.value != rhs.value;
}
- /// \brief Allow a hash_code to be directly run through hash_value.
+ /// Allow a hash_code to be directly run through hash_value.
friend size_t hash_value(const hash_code &code) { return code.value; }
};
-/// \brief Compute a hash_code for any integer value.
+/// Compute a hash_code for any integer value.
///
/// Note that this function is intended to compute the same hash_code for
/// a particular value without regard to the pre-promotion type. This is in
@@ -105,21 +105,21 @@ template <typename T>
typename std::enable_if<is_integral_or_enum<T>::value, hash_code>::type
hash_value(T value);
-/// \brief Compute a hash_code for a pointer's address.
+/// Compute a hash_code for a pointer's address.
///
/// N.B.: This hashes the *address*. Not the value and not the type.
template <typename T> hash_code hash_value(const T *ptr);
-/// \brief Compute a hash_code for a pair of objects.
+/// Compute a hash_code for a pair of objects.
template <typename T, typename U>
hash_code hash_value(const std::pair<T, U> &arg);
-/// \brief Compute a hash_code for a standard string.
+/// Compute a hash_code for a standard string.
template <typename T>
hash_code hash_value(const std::basic_string<T> &arg);
-/// \brief Override the execution seed with a fixed value.
+/// Override the execution seed with a fixed value.
///
/// This hashing library uses a per-execution seed designed to change on each
/// run with high probability in order to ensure that the hash codes are not
@@ -164,7 +164,7 @@ static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
static const uint64_t k3 = 0xc949d7c7509e6557ULL;
-/// \brief Bitwise right rotate.
+/// Bitwise right rotate.
/// Normally this will compile to a single instruction, especially if the
/// shift is a manifest constant.
inline uint64_t rotate(uint64_t val, size_t shift) {
@@ -254,13 +254,13 @@ inline uint64_t hash_short(const char *s, size_t length, uint64_t seed) {
return k2 ^ seed;
}
-/// \brief The intermediate state used during hashing.
+/// The intermediate state used during hashing.
/// Currently, the algorithm for computing hash codes is based on CityHash and
/// keeps 56 bytes of arbitrary state.
struct hash_state {
uint64_t h0, h1, h2, h3, h4, h5, h6;
- /// \brief Create a new hash_state structure and initialize it based on the
+ /// Create a new hash_state structure and initialize it based on the
/// seed and the first 64-byte chunk.
/// This effectively performs the initial mix.
static hash_state create(const char *s, uint64_t seed) {
@@ -272,7 +272,7 @@ struct hash_state {
return state;
}
- /// \brief Mix 32-bytes from the input sequence into the 16-bytes of 'a'
+ /// Mix 32-bytes from the input sequence into the 16-bytes of 'a'
/// and 'b', including whatever is already in 'a' and 'b'.
static void mix_32_bytes(const char *s, uint64_t &a, uint64_t &b) {
a += fetch64(s);
@@ -284,7 +284,7 @@ struct hash_state {
a += c;
}
- /// \brief Mix in a 64-byte buffer of data.
+ /// Mix in a 64-byte buffer of data.
/// We mix all 64 bytes even when the chunk length is smaller, but we
/// record the actual length.
void mix(const char *s) {
@@ -302,7 +302,7 @@ struct hash_state {
std::swap(h2, h0);
}
- /// \brief Compute the final 64-bit hash code value based on the current
+ /// Compute the final 64-bit hash code value based on the current
/// state and the length of bytes hashed.
uint64_t finalize(size_t length) {
return hash_16_bytes(hash_16_bytes(h3, h5) + shift_mix(h1) * k1 + h2,
@@ -311,7 +311,7 @@ struct hash_state {
};
-/// \brief A global, fixed seed-override variable.
+/// A global, fixed seed-override variable.
///
/// This variable can be set using the \see llvm::set_fixed_execution_seed
/// function. See that function for details. Do not, under any circumstances,
@@ -332,7 +332,7 @@ inline size_t get_execution_seed() {
}
-/// \brief Trait to indicate whether a type's bits can be hashed directly.
+/// Trait to indicate whether a type's bits can be hashed directly.
///
/// A type trait which is true if we want to combine values for hashing by
/// reading the underlying data. It is false if values of this type must
@@ -359,14 +359,14 @@ template <typename T, typename U> struct is_hashable_data<std::pair<T, U> >
(sizeof(T) + sizeof(U)) ==
sizeof(std::pair<T, U>))> {};
-/// \brief Helper to get the hashable data representation for a type.
+/// Helper to get the hashable data representation for a type.
/// This variant is enabled when the type itself can be used.
template <typename T>
typename std::enable_if<is_hashable_data<T>::value, T>::type
get_hashable_data(const T &value) {
return value;
}
-/// \brief Helper to get the hashable data representation for a type.
+/// Helper to get the hashable data representation for a type.
/// This variant is enabled when we must first call hash_value and use the
/// result as our data.
template <typename T>
@@ -376,7 +376,7 @@ get_hashable_data(const T &value) {
return hash_value(value);
}
-/// \brief Helper to store data from a value into a buffer and advance the
+/// Helper to store data from a value into a buffer and advance the
/// pointer into that buffer.
///
/// This routine first checks whether there is enough space in the provided
@@ -395,7 +395,7 @@ bool store_and_advance(char *&buffer_ptr, char *buffer_end, const T& value,
return true;
}
-/// \brief Implement the combining of integral values into a hash_code.
+/// Implement the combining of integral values into a hash_code.
///
/// This overload is selected when the value type of the iterator is
/// integral. Rather than computing a hash_code for each object and then
@@ -435,7 +435,7 @@ hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) {
return state.finalize(length);
}
-/// \brief Implement the combining of integral values into a hash_code.
+/// Implement the combining of integral values into a hash_code.
///
/// This overload is selected when the value type of the iterator is integral
/// and when the input iterator is actually a pointer. Rather than computing
@@ -470,7 +470,7 @@ hash_combine_range_impl(ValueT *first, ValueT *last) {
} // namespace hashing
-/// \brief Compute a hash_code for a sequence of values.
+/// Compute a hash_code for a sequence of values.
///
/// This hashes a sequence of values. It produces the same hash_code as
/// 'hash_combine(a, b, c, ...)', but can run over arbitrary sized sequences
@@ -486,7 +486,7 @@ hash_code hash_combine_range(InputIteratorT first, InputIteratorT last) {
namespace hashing {
namespace detail {
-/// \brief Helper class to manage the recursive combining of hash_combine
+/// Helper class to manage the recursive combining of hash_combine
/// arguments.
///
/// This class exists to manage the state and various calls involved in the
@@ -499,14 +499,14 @@ struct hash_combine_recursive_helper {
const size_t seed;
public:
- /// \brief Construct a recursive hash combining helper.
+ /// Construct a recursive hash combining helper.
///
/// This sets up the state for a recursive hash combine, including getting
/// the seed and buffer setup.
hash_combine_recursive_helper()
: seed(get_execution_seed()) {}
- /// \brief Combine one chunk of data into the current in-flight hash.
+ /// Combine one chunk of data into the current in-flight hash.
///
/// This merges one chunk of data into the hash. First it tries to buffer
/// the data. If the buffer is full, it hashes the buffer into its
@@ -547,7 +547,7 @@ public:
return buffer_ptr;
}
- /// \brief Recursive, variadic combining method.
+ /// Recursive, variadic combining method.
///
/// This function recurses through each argument, combining that argument
/// into a single hash.
@@ -560,7 +560,7 @@ public:
return combine(length, buffer_ptr, buffer_end, args...);
}
- /// \brief Base case for recursive, variadic combining.
+ /// Base case for recursive, variadic combining.
///
/// The base case when combining arguments recursively is reached when all
/// arguments have been handled. It flushes the remaining buffer and
@@ -588,7 +588,7 @@ public:
} // namespace detail
} // namespace hashing
-/// \brief Combine values into a single hash_code.
+/// Combine values into a single hash_code.
///
/// This routine accepts a varying number of arguments of any type. It will
/// attempt to combine them into a single hash_code. For user-defined types it
@@ -610,7 +610,7 @@ template <typename ...Ts> hash_code hash_combine(const Ts &...args) {
namespace hashing {
namespace detail {
-/// \brief Helper to hash the value of a single integer.
+/// Helper to hash the value of a single integer.
///
/// Overloads for smaller integer types are not provided to ensure consistent
/// behavior in the presence of integral promotions. Essentially,
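The Hashing.h hunks above only drop the \brief tags, but for reference, a small sketch of the hash_combine family they document (illustrative use only):

// Illustrative sketch only.
#include "llvm/ADT/Hashing.h"
#include <string>
#include <vector>

llvm::hash_code hashRecord(int Id, const std::string &Name,
                           const std::vector<int> &Data) {
  // hash_combine mixes heterogeneous values; hash_combine_range folds a
  // sequence. Both return an opaque hash_code convertible to size_t.
  return llvm::hash_combine(
      Id, Name, llvm::hash_combine_range(Data.begin(), Data.end()));
}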
diff --git a/contrib/llvm/include/llvm/ADT/ImmutableList.h b/contrib/llvm/include/llvm/ADT/ImmutableList.h
index 60d63e09d426..1f5e9813798d 100644
--- a/contrib/llvm/include/llvm/ADT/ImmutableList.h
+++ b/contrib/llvm/include/llvm/ADT/ImmutableList.h
@@ -166,7 +166,7 @@ public:
if (ownsAllocator()) delete &getAllocator();
}
- ImmutableList<T> concat(const T& Head, ImmutableList<T> Tail) {
+ LLVM_NODISCARD ImmutableList<T> concat(const T &Head, ImmutableList<T> Tail) {
// Profile the new list to see if it already exists in our cache.
FoldingSetNodeID ID;
void* InsertPos;
@@ -188,7 +188,7 @@ public:
return L;
}
- ImmutableList<T> add(const T& D, ImmutableList<T> L) {
+ LLVM_NODISCARD ImmutableList<T> add(const T& D, ImmutableList<T> L) {
return concat(D, L);
}
diff --git a/contrib/llvm/include/llvm/ADT/ImmutableMap.h b/contrib/llvm/include/llvm/ADT/ImmutableMap.h
index 10d1e1f0139b..cbc27ff17ccf 100644
--- a/contrib/llvm/include/llvm/ADT/ImmutableMap.h
+++ b/contrib/llvm/include/llvm/ADT/ImmutableMap.h
@@ -114,12 +114,13 @@ public:
ImmutableMap getEmptyMap() { return ImmutableMap(F.getEmptyTree()); }
- ImmutableMap add(ImmutableMap Old, key_type_ref K, data_type_ref D) {
+ LLVM_NODISCARD ImmutableMap add(ImmutableMap Old, key_type_ref K,
+ data_type_ref D) {
TreeTy *T = F.add(Old.Root, std::pair<key_type,data_type>(K,D));
return ImmutableMap(Canonicalize ? F.getCanonicalTree(T): T);
}
- ImmutableMap remove(ImmutableMap Old, key_type_ref K) {
+ LLVM_NODISCARD ImmutableMap remove(ImmutableMap Old, key_type_ref K) {
TreeTy *T = F.remove(Old.Root,K);
return ImmutableMap(Canonicalize ? F.getCanonicalTree(T): T);
}
diff --git a/contrib/llvm/include/llvm/ADT/ImmutableSet.h b/contrib/llvm/include/llvm/ADT/ImmutableSet.h
index 9d580c5a3d41..b1d5f4ac42e4 100644
--- a/contrib/llvm/include/llvm/ADT/ImmutableSet.h
+++ b/contrib/llvm/include/llvm/ADT/ImmutableSet.h
@@ -1017,7 +1017,7 @@ public:
/// of this operation is logarithmic in the size of the original set.
/// The memory allocated to represent the set is released when the
/// factory object that created the set is destroyed.
- ImmutableSet add(ImmutableSet Old, value_type_ref V) {
+ LLVM_NODISCARD ImmutableSet add(ImmutableSet Old, value_type_ref V) {
TreeTy *NewT = F.add(Old.Root, V);
return ImmutableSet(Canonicalize ? F.getCanonicalTree(NewT) : NewT);
}
@@ -1029,7 +1029,7 @@ public:
/// of this operation is logarithmic in the size of the original set.
/// The memory allocated to represent the set is released when the
/// factory object that created the set is destroyed.
- ImmutableSet remove(ImmutableSet Old, value_type_ref V) {
+ LLVM_NODISCARD ImmutableSet remove(ImmutableSet Old, value_type_ref V) {
TreeTy *NewT = F.remove(Old.Root, V);
return ImmutableSet(Canonicalize ? F.getCanonicalTree(NewT) : NewT);
}
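A sketch of why LLVM_NODISCARD is useful on these factory methods (illustrative, not from the diff): the functional-style containers return a new value rather than mutating in place, so a discarded result is almost always a bug:

// Illustrative sketch only.
#include "llvm/ADT/ImmutableSet.h"

void example(llvm::ImmutableSet<int>::Factory &F, llvm::ImmutableSet<int> S) {
  S = F.add(S, 1);   // OK: the returned set is captured.
  // F.add(S, 2);    // Would now warn: the result of add() is discarded.
}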
diff --git a/contrib/llvm/include/llvm/ADT/MapVector.h b/contrib/llvm/include/llvm/ADT/MapVector.h
index 3d78f4b203c8..47b4987f210a 100644
--- a/contrib/llvm/include/llvm/ADT/MapVector.h
+++ b/contrib/llvm/include/llvm/ADT/MapVector.h
@@ -36,13 +36,17 @@ template<typename KeyT, typename ValueT,
typename MapType = DenseMap<KeyT, unsigned>,
typename VectorType = std::vector<std::pair<KeyT, ValueT>>>
class MapVector {
- using value_type = typename VectorType::value_type;
- using size_type = typename VectorType::size_type;
-
MapType Map;
VectorType Vector;
+ static_assert(
+ std::is_integral<typename MapType::mapped_type>::value,
+ "The mapped_type of the specified Map must be an integral type");
+
public:
+ using value_type = typename VectorType::value_type;
+ using size_type = typename VectorType::size_type;
+
using iterator = typename VectorType::iterator;
using const_iterator = typename VectorType::const_iterator;
using reverse_iterator = typename VectorType::reverse_iterator;
@@ -93,9 +97,9 @@ public:
}
ValueT &operator[](const KeyT &Key) {
- std::pair<KeyT, unsigned> Pair = std::make_pair(Key, 0);
+ std::pair<KeyT, typename MapType::mapped_type> Pair = std::make_pair(Key, 0);
std::pair<typename MapType::iterator, bool> Result = Map.insert(Pair);
- unsigned &I = Result.first->second;
+ auto &I = Result.first->second;
if (Result.second) {
Vector.push_back(std::make_pair(Key, ValueT()));
I = Vector.size() - 1;
@@ -112,9 +116,9 @@ public:
}
std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &KV) {
- std::pair<KeyT, unsigned> Pair = std::make_pair(KV.first, 0);
+ std::pair<KeyT, typename MapType::mapped_type> Pair = std::make_pair(KV.first, 0);
std::pair<typename MapType::iterator, bool> Result = Map.insert(Pair);
- unsigned &I = Result.first->second;
+ auto &I = Result.first->second;
if (Result.second) {
Vector.push_back(std::make_pair(KV.first, KV.second));
I = Vector.size() - 1;
@@ -125,9 +129,9 @@ public:
std::pair<iterator, bool> insert(std::pair<KeyT, ValueT> &&KV) {
// Copy KV.first into the map, then move it into the vector.
- std::pair<KeyT, unsigned> Pair = std::make_pair(KV.first, 0);
+ std::pair<KeyT, typename MapType::mapped_type> Pair = std::make_pair(KV.first, 0);
std::pair<typename MapType::iterator, bool> Result = Map.insert(Pair);
- unsigned &I = Result.first->second;
+ auto &I = Result.first->second;
if (Result.second) {
Vector.push_back(std::move(KV));
I = Vector.size() - 1;
@@ -153,14 +157,14 @@ public:
(Vector.begin() + Pos->second);
}
- /// \brief Remove the last element from the vector.
+ /// Remove the last element from the vector.
void pop_back() {
typename MapType::iterator Pos = Map.find(Vector.back().first);
Map.erase(Pos);
Vector.pop_back();
}
- /// \brief Remove the element given by Iterator.
+ /// Remove the element given by Iterator.
///
/// Returns an iterator to the element following the one which was removed,
/// which may be end().
@@ -183,7 +187,7 @@ public:
return Next;
}
- /// \brief Remove all elements with the key value Key.
+ /// Remove all elements with the key value Key.
///
/// Returns the number of elements removed.
size_type erase(const KeyT &Key) {
@@ -194,7 +198,7 @@ public:
return 1;
}
- /// \brief Remove the elements that match the predicate.
+ /// Remove the elements that match the predicate.
///
/// Erase all elements that match \c Pred in a single pass. Takes linear
/// time.
@@ -223,7 +227,7 @@ void MapVector<KeyT, ValueT, MapType, VectorType>::remove_if(Function Pred) {
Vector.erase(O, Vector.end());
}
-/// \brief A MapVector that performs no allocations if smaller than a certain
+/// A MapVector that performs no allocations if smaller than a certain
/// size.
template <typename KeyT, typename ValueT, unsigned N>
struct SmallMapVector
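The hunks above generalize the index type stored in the map and expose value_type/size_type publicly; the observable behaviour is unchanged, as this small sketch (illustrative only) relies on:

// Illustrative sketch only.
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/StringRef.h"

int countNames() {
  llvm::MapVector<llvm::StringRef, int> Counts;
  for (llvm::StringRef Name : {"a", "b", "a", "c"})
    ++Counts[Name]; // operator[] inserts on first use, as in the hunk above.

  int Total = 0;
  // Iteration visits "a", "b", "c" in first-insertion order.
  for (const auto &KV : Counts)
    Total += KV.second;
  return Total; // 4
}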
diff --git a/contrib/llvm/include/llvm/ADT/None.h b/contrib/llvm/include/llvm/ADT/None.h
index c7a99c61994e..4b6bc1e005b5 100644
--- a/contrib/llvm/include/llvm/ADT/None.h
+++ b/contrib/llvm/include/llvm/ADT/None.h
@@ -17,7 +17,7 @@
#define LLVM_ADT_NONE_H
namespace llvm {
-/// \brief A simple null object to allow implicit construction of Optional<T>
+/// A simple null object to allow implicit construction of Optional<T>
/// and similar types without having to spell out the specialization's name.
// (constant value 1 in an attempt to workaround MSVC build issue... )
enum class NoneType { None = 1 };
diff --git a/contrib/llvm/include/llvm/ADT/Optional.h b/contrib/llvm/include/llvm/ADT/Optional.h
index 2811d5c1e21b..353e5d0ec9df 100644
--- a/contrib/llvm/include/llvm/ADT/Optional.h
+++ b/contrib/llvm/include/llvm/ADT/Optional.h
@@ -27,124 +27,164 @@
namespace llvm {
-template <typename T> class Optional {
+namespace optional_detail {
+/// Storage for any type.
+template <typename T, bool IsPodLike> struct OptionalStorage {
AlignedCharArrayUnion<T> storage;
bool hasVal = false;
-public:
- using value_type = T;
-
- Optional(NoneType) {}
- explicit Optional() {}
-
- Optional(const T &y) : hasVal(true) { new (storage.buffer) T(y); }
+ OptionalStorage() = default;
- Optional(const Optional &O) : hasVal(O.hasVal) {
+ OptionalStorage(const T &y) : hasVal(true) { new (storage.buffer) T(y); }
+ OptionalStorage(const OptionalStorage &O) : hasVal(O.hasVal) {
if (hasVal)
- new (storage.buffer) T(*O);
+ new (storage.buffer) T(*O.getPointer());
}
-
- Optional(T &&y) : hasVal(true) { new (storage.buffer) T(std::forward<T>(y)); }
-
- Optional(Optional<T> &&O) : hasVal(O) {
- if (O) {
- new (storage.buffer) T(std::move(*O));
- O.reset();
+ OptionalStorage(T &&y) : hasVal(true) {
+ new (storage.buffer) T(std::forward<T>(y));
+ }
+ OptionalStorage(OptionalStorage &&O) : hasVal(O.hasVal) {
+ if (O.hasVal) {
+ new (storage.buffer) T(std::move(*O.getPointer()));
}
}
- ~Optional() { reset(); }
-
- Optional &operator=(T &&y) {
+ OptionalStorage &operator=(T &&y) {
if (hasVal)
- **this = std::move(y);
+ *getPointer() = std::move(y);
else {
new (storage.buffer) T(std::move(y));
hasVal = true;
}
return *this;
}
-
- Optional &operator=(Optional &&O) {
- if (!O)
+ OptionalStorage &operator=(OptionalStorage &&O) {
+ if (!O.hasVal)
reset();
else {
- *this = std::move(*O);
- O.reset();
+ *this = std::move(*O.getPointer());
}
return *this;
}
- /// Create a new object by constructing it in place with the given arguments.
- template <typename... ArgTypes> void emplace(ArgTypes &&... Args) {
- reset();
- hasVal = true;
- new (storage.buffer) T(std::forward<ArgTypes>(Args)...);
- }
-
- static inline Optional create(const T *y) {
- return y ? Optional(*y) : Optional();
- }
-
// FIXME: these assignments (& the equivalent const T&/const Optional& ctors)
// could be made more efficient by passing by value, possibly unifying them
// with the rvalue versions above - but this could place a different set of
// requirements (notably: the existence of a default ctor) when implemented
// in that way. Careful SFINAE to avoid such pitfalls would be required.
- Optional &operator=(const T &y) {
+ OptionalStorage &operator=(const T &y) {
if (hasVal)
- **this = y;
+ *getPointer() = y;
else {
new (storage.buffer) T(y);
hasVal = true;
}
return *this;
}
-
- Optional &operator=(const Optional &O) {
- if (!O)
+ OptionalStorage &operator=(const OptionalStorage &O) {
+ if (!O.hasVal)
reset();
else
- *this = *O;
+ *this = *O.getPointer();
return *this;
}
+ ~OptionalStorage() { reset(); }
+
void reset() {
if (hasVal) {
- (**this).~T();
+ (*getPointer()).~T();
hasVal = false;
}
}
- const T *getPointer() const {
- assert(hasVal);
- return reinterpret_cast<const T *>(storage.buffer);
- }
T *getPointer() {
assert(hasVal);
return reinterpret_cast<T *>(storage.buffer);
}
- const T &getValue() const LLVM_LVALUE_FUNCTION {
+ const T *getPointer() const {
assert(hasVal);
- return *getPointer();
+ return reinterpret_cast<const T *>(storage.buffer);
}
- T &getValue() LLVM_LVALUE_FUNCTION {
- assert(hasVal);
- return *getPointer();
+};
+
+#if !defined(__GNUC__) || defined(__clang__) // GCC up to GCC7 miscompiles this.
+/// Storage for trivially copyable types only.
+template <typename T> struct OptionalStorage<T, true> {
+ AlignedCharArrayUnion<T> storage;
+ bool hasVal = false;
+
+ OptionalStorage() = default;
+
+ OptionalStorage(const T &y) : hasVal(true) { new (storage.buffer) T(y); }
+ OptionalStorage &operator=(const T &y) {
+ *reinterpret_cast<T *>(storage.buffer) = y;
+ hasVal = true;
+ return *this;
}
- explicit operator bool() const { return hasVal; }
- bool hasValue() const { return hasVal; }
- const T *operator->() const { return getPointer(); }
- T *operator->() { return getPointer(); }
- const T &operator*() const LLVM_LVALUE_FUNCTION {
- assert(hasVal);
- return *getPointer();
+ void reset() { hasVal = false; }
+};
+#endif
+} // namespace optional_detail
+
+template <typename T> class Optional {
+ optional_detail::OptionalStorage<T, isPodLike<T>::value> Storage;
+
+public:
+ using value_type = T;
+
+ constexpr Optional() {}
+ constexpr Optional(NoneType) {}
+
+ Optional(const T &y) : Storage(y) {}
+ Optional(const Optional &O) = default;
+
+ Optional(T &&y) : Storage(std::forward<T>(y)) {}
+ Optional(Optional &&O) = default;
+
+ Optional &operator=(T &&y) {
+ Storage = std::move(y);
+ return *this;
}
- T &operator*() LLVM_LVALUE_FUNCTION {
- assert(hasVal);
- return *getPointer();
+ Optional &operator=(Optional &&O) = default;
+
+ /// Create a new object by constructing it in place with the given arguments.
+ template <typename... ArgTypes> void emplace(ArgTypes &&... Args) {
+ reset();
+ Storage.hasVal = true;
+ new (getPointer()) T(std::forward<ArgTypes>(Args)...);
+ }
+
+ static inline Optional create(const T *y) {
+ return y ? Optional(*y) : Optional();
+ }
+
+ Optional &operator=(const T &y) {
+ Storage = y;
+ return *this;
+ }
+ Optional &operator=(const Optional &O) = default;
+
+ void reset() { Storage.reset(); }
+
+ const T *getPointer() const {
+ assert(Storage.hasVal);
+ return reinterpret_cast<const T *>(Storage.storage.buffer);
+ }
+ T *getPointer() {
+ assert(Storage.hasVal);
+ return reinterpret_cast<T *>(Storage.storage.buffer);
}
+ const T &getValue() const LLVM_LVALUE_FUNCTION { return *getPointer(); }
+ T &getValue() LLVM_LVALUE_FUNCTION { return *getPointer(); }
+
+ explicit operator bool() const { return Storage.hasVal; }
+ bool hasValue() const { return Storage.hasVal; }
+ const T *operator->() const { return getPointer(); }
+ T *operator->() { return getPointer(); }
+ const T &operator*() const LLVM_LVALUE_FUNCTION { return *getPointer(); }
+ T &operator*() LLVM_LVALUE_FUNCTION { return *getPointer(); }
template <typename U>
constexpr T getValueOr(U &&value) const LLVM_LVALUE_FUNCTION {
@@ -152,14 +192,8 @@ public:
}
#if LLVM_HAS_RVALUE_REFERENCE_THIS
- T &&getValue() && {
- assert(hasVal);
- return std::move(*getPointer());
- }
- T &&operator*() && {
- assert(hasVal);
- return std::move(*getPointer());
- }
+ T &&getValue() && { return std::move(*getPointer()); }
+ T &&operator*() && { return std::move(*getPointer()); }
template <typename U>
T getValueOr(U &&value) && {
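The Optional refactor above is internal (storage is split into trivially-copyable and general variants); the public surface is unchanged, as this small sketch (illustrative only) relies on:

// Illustrative sketch only.
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"

llvm::Optional<int> parseDigit(llvm::StringRef S) {
  if (S.size() == 1 && S[0] >= '0' && S[0] <= '9')
    return S[0] - '0';
  return llvm::None; // NoneType still converts implicitly.
}

int digitOrZero(llvm::StringRef S) {
  // getValueOr and operator bool are untouched by the storage refactor.
  return parseDigit(S).getValueOr(0);
}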
diff --git a/contrib/llvm/include/llvm/ADT/PackedVector.h b/contrib/llvm/include/llvm/ADT/PackedVector.h
index 95adc2926813..3d53c49536d0 100644
--- a/contrib/llvm/include/llvm/ADT/PackedVector.h
+++ b/contrib/llvm/include/llvm/ADT/PackedVector.h
@@ -65,7 +65,7 @@ protected:
}
};
-/// \brief Store a vector of values using a specific number of bits for each
+/// Store a vector of values using a specific number of bits for each
/// value. Both signed and unsigned types can be used, e.g
/// @code
/// PackedVector<signed, 2> vec;
diff --git a/contrib/llvm/include/llvm/ADT/PointerUnion.h b/contrib/llvm/include/llvm/ADT/PointerUnion.h
index 4276859e9254..315e58336cba 100644
--- a/contrib/llvm/include/llvm/ADT/PointerUnion.h
+++ b/contrib/llvm/include/llvm/ADT/PointerUnion.h
@@ -346,6 +346,12 @@ struct PointerLikeTypeTraits<PointerUnion3<PT1, PT2, PT3>> {
};
};
+template <typename PT1, typename PT2, typename PT3>
+bool operator<(PointerUnion3<PT1, PT2, PT3> lhs,
+ PointerUnion3<PT1, PT2, PT3> rhs) {
+ return lhs.getOpaqueValue() < rhs.getOpaqueValue();
+}
+
/// A pointer union of four pointer types. See documentation for PointerUnion
/// for usage.
template <typename PT1, typename PT2, typename PT3, typename PT4>
diff --git a/contrib/llvm/include/llvm/ADT/SCCIterator.h b/contrib/llvm/include/llvm/ADT/SCCIterator.h
index 784a58dc002f..ab1dc4613be0 100644
--- a/contrib/llvm/include/llvm/ADT/SCCIterator.h
+++ b/contrib/llvm/include/llvm/ADT/SCCIterator.h
@@ -33,7 +33,7 @@
namespace llvm {
-/// \brief Enumerate the SCCs of a directed graph in reverse topological order
+/// Enumerate the SCCs of a directed graph in reverse topological order
/// of the SCC DAG.
///
/// This is implemented using Tarjan's DFS algorithm using an internal stack to
@@ -104,7 +104,7 @@ public:
}
static scc_iterator end(const GraphT &) { return scc_iterator(); }
- /// \brief Direct loop termination test which is more efficient than
+ /// Direct loop termination test which is more efficient than
/// comparison with \c end().
bool isAtEnd() const {
assert(!CurrentSCC.empty() || VisitStack.empty());
@@ -125,7 +125,7 @@ public:
return CurrentSCC;
}
- /// \brief Test if the current SCC has a loop.
+ /// Test if the current SCC has a loop.
///
/// If the SCC has more than one node, this is trivially true. If not, it may
/// still contain a loop if the node has an edge back to itself.
@@ -222,12 +222,12 @@ bool scc_iterator<GraphT, GT>::hasLoop() const {
return false;
}
-/// \brief Construct the begin iterator for a deduced graph type T.
+/// Construct the begin iterator for a deduced graph type T.
template <class T> scc_iterator<T> scc_begin(const T &G) {
return scc_iterator<T>::begin(G);
}
-/// \brief Construct the end iterator for a deduced graph type T.
+/// Construct the end iterator for a deduced graph type T.
template <class T> scc_iterator<T> scc_end(const T &G) {
return scc_iterator<T>::end(G);
}
diff --git a/contrib/llvm/include/llvm/ADT/STLExtras.h b/contrib/llvm/include/llvm/ADT/STLExtras.h
index bcd992b4a716..94365dd9ced1 100644
--- a/contrib/llvm/include/llvm/ADT/STLExtras.h
+++ b/contrib/llvm/include/llvm/ADT/STLExtras.h
@@ -36,6 +36,10 @@
#include <type_traits>
#include <utility>
+#ifdef EXPENSIVE_CHECKS
+#include <random> // for std::mt19937
+#endif
+
namespace llvm {
// Only used by compiler if both template types are the same. Useful when
@@ -54,6 +58,19 @@ using ValueOfRange = typename std::remove_reference<decltype(
} // end namespace detail
//===----------------------------------------------------------------------===//
+// Extra additions to <type_traits>
+//===----------------------------------------------------------------------===//
+
+template <typename T>
+struct negation : std::integral_constant<bool, !bool(T::value)> {};
+
+template <typename...> struct conjunction : std::true_type {};
+template <typename B1> struct conjunction<B1> : B1 {};
+template <typename B1, typename... Bn>
+struct conjunction<B1, Bn...>
+ : std::conditional<bool(B1::value), conjunction<Bn...>, B1>::type {};
+
+//===----------------------------------------------------------------------===//
// Extra additions to <functional>
//===----------------------------------------------------------------------===//
@@ -101,6 +118,7 @@ class function_ref<Ret(Params...)> {
public:
function_ref() = default;
+ function_ref(std::nullptr_t) {}
template <typename Callable>
function_ref(Callable &&callable,
@@ -266,60 +284,121 @@ auto reverse(
/// auto R = make_filter_range(A, [](int N) { return N % 2 == 1; });
/// // R contains { 1, 3 }.
/// \endcode
-template <typename WrappedIteratorT, typename PredicateT>
-class filter_iterator
+///
+/// Note: filter_iterator_base implements support for forward iteration.
+/// filter_iterator_impl exists to provide support for bidirectional iteration,
+/// conditional on whether the wrapped iterator supports it.
+template <typename WrappedIteratorT, typename PredicateT, typename IterTag>
+class filter_iterator_base
: public iterator_adaptor_base<
- filter_iterator<WrappedIteratorT, PredicateT>, WrappedIteratorT,
+ filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>,
+ WrappedIteratorT,
typename std::common_type<
- std::forward_iterator_tag,
- typename std::iterator_traits<
- WrappedIteratorT>::iterator_category>::type> {
+ IterTag, typename std::iterator_traits<
+ WrappedIteratorT>::iterator_category>::type> {
using BaseT = iterator_adaptor_base<
- filter_iterator<WrappedIteratorT, PredicateT>, WrappedIteratorT,
+ filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>,
+ WrappedIteratorT,
typename std::common_type<
- std::forward_iterator_tag,
- typename std::iterator_traits<WrappedIteratorT>::iterator_category>::
- type>;
+ IterTag, typename std::iterator_traits<
+ WrappedIteratorT>::iterator_category>::type>;
- struct PayloadType {
- WrappedIteratorT End;
- PredicateT Pred;
- };
-
- Optional<PayloadType> Payload;
+protected:
+ WrappedIteratorT End;
+ PredicateT Pred;
void findNextValid() {
- assert(Payload && "Payload should be engaged when findNextValid is called");
- while (this->I != Payload->End && !Payload->Pred(*this->I))
+ while (this->I != End && !Pred(*this->I))
BaseT::operator++();
}
- // Construct the begin iterator. The begin iterator requires to know where end
- // is, so that it can properly stop when it hits end.
- filter_iterator(WrappedIteratorT Begin, WrappedIteratorT End, PredicateT Pred)
- : BaseT(std::move(Begin)),
- Payload(PayloadType{std::move(End), std::move(Pred)}) {
+ // Construct the iterator. The begin iterator needs to know where the end
+ // is, so that it can properly stop when it gets there. The end iterator only
+ // needs the predicate to support bidirectional iteration.
+ filter_iterator_base(WrappedIteratorT Begin, WrappedIteratorT End,
+ PredicateT Pred)
+ : BaseT(Begin), End(End), Pred(Pred) {
findNextValid();
}
- // Construct the end iterator. It's not incrementable, so Payload doesn't
- // have to be engaged.
- filter_iterator(WrappedIteratorT End) : BaseT(End) {}
-
public:
using BaseT::operator++;
- filter_iterator &operator++() {
+ filter_iterator_base &operator++() {
BaseT::operator++();
findNextValid();
return *this;
}
+};
+
+/// Specialization of filter_iterator_base for forward iteration only.
+template <typename WrappedIteratorT, typename PredicateT,
+ typename IterTag = std::forward_iterator_tag>
+class filter_iterator_impl
+ : public filter_iterator_base<WrappedIteratorT, PredicateT, IterTag> {
+ using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>;
+
+public:
+ filter_iterator_impl(WrappedIteratorT Begin, WrappedIteratorT End,
+ PredicateT Pred)
+ : BaseT(Begin, End, Pred) {}
+};
+
+/// Specialization of filter_iterator_base for bidirectional iteration.
+template <typename WrappedIteratorT, typename PredicateT>
+class filter_iterator_impl<WrappedIteratorT, PredicateT,
+ std::bidirectional_iterator_tag>
+ : public filter_iterator_base<WrappedIteratorT, PredicateT,
+ std::bidirectional_iterator_tag> {
+ using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT,
+ std::bidirectional_iterator_tag>;
+ void findPrevValid() {
+ while (!this->Pred(*this->I))
+ BaseT::operator--();
+ }
+
+public:
+ using BaseT::operator--;
+
+ filter_iterator_impl(WrappedIteratorT Begin, WrappedIteratorT End,
+ PredicateT Pred)
+ : BaseT(Begin, End, Pred) {}
+
+ filter_iterator_impl &operator--() {
+ BaseT::operator--();
+ findPrevValid();
+ return *this;
+ }
+};
+
+namespace detail {
- template <typename RT, typename PT>
- friend iterator_range<filter_iterator<detail::IterOfRange<RT>, PT>>
- make_filter_range(RT &&, PT);
+template <bool is_bidirectional> struct fwd_or_bidi_tag_impl {
+ using type = std::forward_iterator_tag;
};
+template <> struct fwd_or_bidi_tag_impl<true> {
+ using type = std::bidirectional_iterator_tag;
+};
+
+/// Helper which sets its type member to forward_iterator_tag if the category
+/// of \p IterT does not derive from bidirectional_iterator_tag, and to
+/// bidirectional_iterator_tag otherwise.
+template <typename IterT> struct fwd_or_bidi_tag {
+ using type = typename fwd_or_bidi_tag_impl<std::is_base_of<
+ std::bidirectional_iterator_tag,
+ typename std::iterator_traits<IterT>::iterator_category>::value>::type;
+};
+
+} // namespace detail
+
+/// Defines filter_iterator to a suitable specialization of
+/// filter_iterator_impl, based on the underlying iterator's category.
+template <typename WrappedIteratorT, typename PredicateT>
+using filter_iterator = filter_iterator_impl<
+ WrappedIteratorT, PredicateT,
+ typename detail::fwd_or_bidi_tag<WrappedIteratorT>::type>;
+
/// Convenience function that takes a range of elements and a predicate,
/// and return a new filter_iterator range.
///
@@ -332,10 +411,11 @@ iterator_range<filter_iterator<detail::IterOfRange<RangeT>, PredicateT>>
make_filter_range(RangeT &&Range, PredicateT Pred) {
using FilterIteratorT =
filter_iterator<detail::IterOfRange<RangeT>, PredicateT>;
- return make_range(FilterIteratorT(std::begin(std::forward<RangeT>(Range)),
- std::end(std::forward<RangeT>(Range)),
- std::move(Pred)),
- FilterIteratorT(std::end(std::forward<RangeT>(Range))));
+ return make_range(
+ FilterIteratorT(std::begin(std::forward<RangeT>(Range)),
+ std::end(std::forward<RangeT>(Range)), Pred),
+ FilterIteratorT(std::end(std::forward<RangeT>(Range)),
+ std::end(std::forward<RangeT>(Range)), Pred));
}
// forward declarations required by zip_shortest/zip_first
@@ -644,7 +724,7 @@ detail::concat_range<ValueT, RangeTs...> concat(RangeTs &&... Ranges) {
// Extra additions to <utility>
//===----------------------------------------------------------------------===//
-/// \brief Function object to check whether the first component of a std::pair
+/// Function object to check whether the first component of a std::pair
/// compares less than the first component of another std::pair.
struct less_first {
template <typename T> bool operator()(const T &lhs, const T &rhs) const {
@@ -652,7 +732,7 @@ struct less_first {
}
};
-/// \brief Function object to check whether the second component of a std::pair
+/// Function object to check whether the second component of a std::pair
/// compares less than the second component of another std::pair.
struct less_second {
template <typename T> bool operator()(const T &lhs, const T &rhs) const {
@@ -662,14 +742,14 @@ struct less_second {
// A subset of N3658. More stuff can be added as-needed.
-/// \brief Represents a compile-time sequence of integers.
+/// Represents a compile-time sequence of integers.
template <class T, T... I> struct integer_sequence {
using value_type = T;
static constexpr size_t size() { return sizeof...(I); }
};
-/// \brief Alias for the common case of a sequence of size_ts.
+/// Alias for the common case of a sequence of size_ts.
template <size_t... I>
struct index_sequence : integer_sequence<std::size_t, I...> {};
@@ -678,7 +758,7 @@ struct build_index_impl : build_index_impl<N - 1, N - 1, I...> {};
template <std::size_t... I>
struct build_index_impl<0, I...> : index_sequence<I...> {};
-/// \brief Creates a compile-time integer sequence for a parameter pack.
+/// Creates a compile-time integer sequence for a parameter pack.
template <class... Ts>
struct index_sequence_for : build_index_impl<sizeof...(Ts)> {};
@@ -687,7 +767,7 @@ struct index_sequence_for : build_index_impl<sizeof...(Ts)> {};
template <int N> struct rank : rank<N - 1> {};
template <> struct rank<0> {};
-/// \brief traits class for checking whether type T is one of any of the given
+/// traits class for checking whether type T is one of any of the given
/// types in the variadic list.
template <typename T, typename... Ts> struct is_one_of {
static const bool value = false;
@@ -699,7 +779,7 @@ struct is_one_of<T, U, Ts...> {
std::is_same<T, U>::value || is_one_of<T, Ts...>::value;
};
-/// \brief traits class for checking whether type T is a base class for all
+/// traits class for checking whether type T is a base class for all
/// the given types in the variadic list.
template <typename T, typename... Ts> struct are_base_of {
static const bool value = true;
@@ -761,6 +841,10 @@ inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
// behavior with an empty sequence.
auto NElts = End - Start;
if (NElts <= 1) return;
+#ifdef EXPENSIVE_CHECKS
+ std::mt19937 Generator(std::random_device{}());
+ std::shuffle(Start, End, Generator);
+#endif
qsort(&*Start, NElts, sizeof(*Start), get_array_pod_sort_comparator(*Start));
}
@@ -774,10 +858,34 @@ inline void array_pod_sort(
// behavior with an empty sequence.
auto NElts = End - Start;
if (NElts <= 1) return;
+#ifdef EXPENSIVE_CHECKS
+ std::mt19937 Generator(std::random_device{}());
+ std::shuffle(Start, End, Generator);
+#endif
qsort(&*Start, NElts, sizeof(*Start),
reinterpret_cast<int (*)(const void *, const void *)>(Compare));
}
+// Provide wrappers to std::sort which shuffle the elements before sorting
+// to help uncover non-deterministic behavior (PR35135).
+template <typename IteratorTy>
+inline void sort(IteratorTy Start, IteratorTy End) {
+#ifdef EXPENSIVE_CHECKS
+ std::mt19937 Generator(std::random_device{}());
+ std::shuffle(Start, End, Generator);
+#endif
+ std::sort(Start, End);
+}
+
+template <typename IteratorTy, typename Compare>
+inline void sort(IteratorTy Start, IteratorTy End, Compare Comp) {
+#ifdef EXPENSIVE_CHECKS
+ std::mt19937 Generator(std::random_device{}());
+ std::shuffle(Start, End, Generator);
+#endif
+ std::sort(Start, End, Comp);
+}
+
//===----------------------------------------------------------------------===//
// Extra additions to <algorithm>
//===----------------------------------------------------------------------===//
@@ -861,6 +969,11 @@ OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P) {
return std::copy_if(adl_begin(Range), adl_end(Range), Out, P);
}
+template <typename R, typename OutputIt>
+OutputIt copy(R &&Range, OutputIt Out) {
+ return std::copy(adl_begin(Range), adl_end(Range), Out);
+}
+
/// Wrapper function around std::find to detect if an element exists
/// in a container.
template <typename R, typename E>
@@ -905,7 +1018,7 @@ auto lower_bound(R &&Range, ForwardIt I) -> decltype(adl_begin(Range)) {
return std::lower_bound(adl_begin(Range), adl_end(Range), I);
}
-/// \brief Given a range of type R, iterate the entire range and return a
+/// Given a range of type R, iterate the entire range and return a
/// SmallVector with elements of the vector. This is useful, for example,
/// when you want to iterate a range and then sort the results.
template <unsigned Size, typename R>
@@ -926,13 +1039,25 @@ void erase_if(Container &C, UnaryPredicate P) {
C.erase(remove_if(C, P), C.end());
}
+/// Get the size of a range. This is a wrapper function around std::distance
+/// which is only enabled when the operation is O(1).
+template <typename R>
+auto size(R &&Range, typename std::enable_if<
+ std::is_same<typename std::iterator_traits<decltype(
+ Range.begin())>::iterator_category,
+ std::random_access_iterator_tag>::value,
+ void>::type * = nullptr)
+ -> decltype(std::distance(Range.begin(), Range.end())) {
+ return std::distance(Range.begin(), Range.end());
+}
+
//===----------------------------------------------------------------------===//
// Extra additions to <memory>
//===----------------------------------------------------------------------===//
// Implement make_unique according to N3656.
-/// \brief Constructs a `new T()` with the given args and returns a
+/// Constructs a `new T()` with the given args and returns a
/// `unique_ptr<T>` which owns the object.
///
/// Example:
@@ -945,7 +1070,7 @@ make_unique(Args &&... args) {
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
-/// \brief Constructs a `new T[n]` with the given args and returns a
+/// Constructs a `new T[n]` with the given args and returns a
/// `unique_ptr<T[]>` which owns the object.
///
/// \param n size of the new array.
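A sketch tying together a few of the STLExtras additions above (illustrative only): llvm::sort wraps std::sort (shuffling first under EXPENSIVE_CHECKS to expose comparator nondeterminism), and because filter_iterator is now bidirectional when the wrapped iterator is, a filtered range can be walked in reverse:

// Illustrative sketch only.
#include "llvm/ADT/STLExtras.h"
#include <vector>

int sumOddReversed(std::vector<int> V) {
  llvm::sort(V.begin(), V.end());

  auto Odds = llvm::make_filter_range(V, [](int N) { return N % 2 == 1; });
  int Sum = 0;
  // reverse() needs bidirectional iterators, which the filtered range now
  // provides when the underlying iterator supports them.
  for (int N : llvm::reverse(Odds))
    Sum += N;
  return Sum;
}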
diff --git a/contrib/llvm/include/llvm/ADT/ScopeExit.h b/contrib/llvm/include/llvm/ADT/ScopeExit.h
index 4e64352c77df..bd13755fa999 100644
--- a/contrib/llvm/include/llvm/ADT/ScopeExit.h
+++ b/contrib/llvm/include/llvm/ADT/ScopeExit.h
@@ -25,14 +25,26 @@ namespace detail {
template <typename Callable> class scope_exit {
Callable ExitFunction;
+ bool Engaged = true; // False once moved-from or release()d.
public:
template <typename Fp>
explicit scope_exit(Fp &&F) : ExitFunction(std::forward<Fp>(F)) {}
- scope_exit(scope_exit &&Rhs) : ExitFunction(std::move(Rhs.ExitFunction)) {}
+ scope_exit(scope_exit &&Rhs)
+ : ExitFunction(std::move(Rhs.ExitFunction)), Engaged(Rhs.Engaged) {
+ Rhs.release();
+ }
+ scope_exit(const scope_exit &) = delete;
+ scope_exit &operator=(scope_exit &&) = delete;
+ scope_exit &operator=(const scope_exit &) = delete;
- ~scope_exit() { ExitFunction(); }
+ void release() { Engaged = false; }
+
+ ~scope_exit() {
+ if (Engaged)
+ ExitFunction();
+ }
};
} // end namespace detail
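A sketch of the new release() hook (illustrative; make_scope_exit is the factory defined later in this header, outside the hunk): it disengages the cleanup, e.g. on the success path of an otherwise early-returning function:

// Illustrative sketch only.
#include "llvm/ADT/ScopeExit.h"
#include <cstdio>

FILE *openChecked(const char *Path) {
  FILE *F = std::fopen(Path, "r");
  if (!F)
    return nullptr;
  auto Cleanup = llvm::make_scope_exit([&] { std::fclose(F); });

  // ... validation that may return early, letting Cleanup close F ...

  Cleanup.release(); // Success: keep F open; the destructor now does nothing.
  return F;
}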
diff --git a/contrib/llvm/include/llvm/ADT/SetVector.h b/contrib/llvm/include/llvm/ADT/SetVector.h
index 04ed52fc543f..3d6781041320 100644
--- a/contrib/llvm/include/llvm/ADT/SetVector.h
+++ b/contrib/llvm/include/llvm/ADT/SetVector.h
@@ -31,7 +31,7 @@
namespace llvm {
-/// \brief A vector that has set insertion semantics.
+/// A vector that has set insertion semantics.
///
/// This adapter class provides a way to keep a set of things that also has the
/// property of a deterministic iteration order. The order of iteration is the
@@ -52,10 +52,10 @@ public:
using const_reverse_iterator = typename vector_type::const_reverse_iterator;
using size_type = typename vector_type::size_type;
- /// \brief Construct an empty SetVector
+ /// Construct an empty SetVector
SetVector() = default;
- /// \brief Initialize a SetVector with a range of elements
+ /// Initialize a SetVector with a range of elements
template<typename It>
SetVector(It Start, It End) {
insert(Start, End);
@@ -69,75 +69,75 @@ public:
return std::move(vector_);
}
- /// \brief Determine if the SetVector is empty or not.
+ /// Determine if the SetVector is empty or not.
bool empty() const {
return vector_.empty();
}
- /// \brief Determine the number of elements in the SetVector.
+ /// Determine the number of elements in the SetVector.
size_type size() const {
return vector_.size();
}
- /// \brief Get an iterator to the beginning of the SetVector.
+ /// Get an iterator to the beginning of the SetVector.
iterator begin() {
return vector_.begin();
}
- /// \brief Get a const_iterator to the beginning of the SetVector.
+ /// Get a const_iterator to the beginning of the SetVector.
const_iterator begin() const {
return vector_.begin();
}
- /// \brief Get an iterator to the end of the SetVector.
+ /// Get an iterator to the end of the SetVector.
iterator end() {
return vector_.end();
}
- /// \brief Get a const_iterator to the end of the SetVector.
+ /// Get a const_iterator to the end of the SetVector.
const_iterator end() const {
return vector_.end();
}
- /// \brief Get an reverse_iterator to the end of the SetVector.
+ /// Get an reverse_iterator to the end of the SetVector.
reverse_iterator rbegin() {
return vector_.rbegin();
}
- /// \brief Get a const_reverse_iterator to the end of the SetVector.
+ /// Get a const_reverse_iterator to the end of the SetVector.
const_reverse_iterator rbegin() const {
return vector_.rbegin();
}
- /// \brief Get a reverse_iterator to the beginning of the SetVector.
+ /// Get a reverse_iterator to the beginning of the SetVector.
reverse_iterator rend() {
return vector_.rend();
}
- /// \brief Get a const_reverse_iterator to the beginning of the SetVector.
+ /// Get a const_reverse_iterator to the beginning of the SetVector.
const_reverse_iterator rend() const {
return vector_.rend();
}
- /// \brief Return the first element of the SetVector.
+ /// Return the first element of the SetVector.
const T &front() const {
assert(!empty() && "Cannot call front() on empty SetVector!");
return vector_.front();
}
- /// \brief Return the last element of the SetVector.
+ /// Return the last element of the SetVector.
const T &back() const {
assert(!empty() && "Cannot call back() on empty SetVector!");
return vector_.back();
}
- /// \brief Index into the SetVector.
+ /// Index into the SetVector.
const_reference operator[](size_type n) const {
assert(n < vector_.size() && "SetVector access out of range!");
return vector_[n];
}
- /// \brief Insert a new element into the SetVector.
+ /// Insert a new element into the SetVector.
/// \returns true if the element was inserted into the SetVector.
bool insert(const value_type &X) {
bool result = set_.insert(X).second;
@@ -146,7 +146,7 @@ public:
return result;
}
- /// \brief Insert a range of elements into the SetVector.
+ /// Insert a range of elements into the SetVector.
template<typename It>
void insert(It Start, It End) {
for (; Start != End; ++Start)
@@ -154,7 +154,7 @@ public:
vector_.push_back(*Start);
}
- /// \brief Remove an item from the set vector.
+ /// Remove an item from the set vector.
bool remove(const value_type& X) {
if (set_.erase(X)) {
typename vector_type::iterator I = find(vector_, X);
@@ -183,7 +183,7 @@ public:
return vector_.erase(NI);
}
- /// \brief Remove items from the set vector based on a predicate function.
+ /// Remove items from the set vector based on a predicate function.
///
/// This is intended to be equivalent to the following code, if we could
/// write it:
@@ -206,19 +206,19 @@ public:
return true;
}
- /// \brief Count the number of elements of a given key in the SetVector.
+ /// Count the number of elements of a given key in the SetVector.
/// \returns 0 if the element is not in the SetVector, 1 if it is.
size_type count(const key_type &key) const {
return set_.count(key);
}
- /// \brief Completely clear the SetVector
+ /// Completely clear the SetVector
void clear() {
set_.clear();
vector_.clear();
}
- /// \brief Remove the last element of the SetVector.
+ /// Remove the last element of the SetVector.
void pop_back() {
assert(!empty() && "Cannot remove an element from an empty SetVector!");
set_.erase(back());
@@ -239,7 +239,7 @@ public:
return vector_ != that.vector_;
}
- /// \brief Compute This := This u S, return whether 'This' changed.
+ /// Compute This := This u S, return whether 'This' changed.
/// TODO: We should be able to use set_union from SetOperations.h, but
/// SetVector interface is inconsistent with DenseSet.
template <class STy>
@@ -254,7 +254,7 @@ public:
return Changed;
}
- /// \brief Compute This := This - B
+ /// Compute This := This - B
/// TODO: We should be able to use set_subtract from SetOperations.h, but
/// SetVector interface is inconsistent with DenseSet.
template <class STy>
@@ -265,7 +265,7 @@ public:
}
private:
- /// \brief A wrapper predicate designed for use with std::remove_if.
+ /// A wrapper predicate designed for use with std::remove_if.
///
/// This predicate wraps a predicate suitable for use with std::remove_if to
/// call set_.erase(x) on each element which is slated for removal.
@@ -292,7 +292,7 @@ private:
vector_type vector_; ///< The vector.
};
-/// \brief A SetVector that performs no allocations if smaller than
+/// A SetVector that performs no allocations if smaller than
/// a certain size.
template <typename T, unsigned N>
class SmallSetVector
@@ -300,7 +300,7 @@ class SmallSetVector
public:
SmallSetVector() = default;
- /// \brief Initialize a SmallSetVector with a range of elements
+ /// Initialize a SmallSetVector with a range of elements
template<typename It>
SmallSetVector(It Start, It End) {
this->insert(Start, End);
diff --git a/contrib/llvm/include/llvm/ADT/SmallPtrSet.h b/contrib/llvm/include/llvm/ADT/SmallPtrSet.h
index 78ea613af693..db08e40257ba 100644
--- a/contrib/llvm/include/llvm/ADT/SmallPtrSet.h
+++ b/contrib/llvm/include/llvm/ADT/SmallPtrSet.h
@@ -335,7 +335,7 @@ struct RoundUpToPowerOfTwo {
enum { Val = RoundUpToPowerOfTwoH<N, (N&(N-1)) == 0>::Val };
};
-/// \brief A templated base class for \c SmallPtrSet which provides the
+/// A templated base class for \c SmallPtrSet which provides the
/// typesafe interface that is common across all small sizes.
///
/// This is particularly useful for passing around between interface boundaries
diff --git a/contrib/llvm/include/llvm/ADT/SmallSet.h b/contrib/llvm/include/llvm/ADT/SmallSet.h
index d52d0f07f9a6..5d84627714bc 100644
--- a/contrib/llvm/include/llvm/ADT/SmallSet.h
+++ b/contrib/llvm/include/llvm/ADT/SmallSet.h
@@ -17,21 +17,120 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/type_traits.h"
#include <cstddef>
#include <functional>
#include <set>
+#include <type_traits>
#include <utility>
namespace llvm {
+/// SmallSetIterator - This class implements a const_iterator for SmallSet by
+/// delegating to the underlying SmallVector or Set iterators.
+template <typename T, unsigned N, typename C>
+class SmallSetIterator
+ : public iterator_facade_base<SmallSetIterator<T, N, C>,
+ std::forward_iterator_tag, T> {
+private:
+ using SetIterTy = typename std::set<T, C>::const_iterator;
+ using VecIterTy = typename SmallVector<T, N>::const_iterator;
+ using SelfTy = SmallSetIterator<T, N, C>;
+
+ /// Iterators to the parts of the SmallSet containing the data. They are set
+ /// depending on isSmall.
+ union {
+ SetIterTy SetIter;
+ VecIterTy VecIter;
+ };
+
+ bool isSmall;
+
+public:
+ SmallSetIterator(SetIterTy SetIter) : SetIter(SetIter), isSmall(false) {}
+
+ SmallSetIterator(VecIterTy VecIter) : VecIter(VecIter), isSmall(true) {}
+
+ // Spell out destructor, copy/move constructor and assignment operators for
+ // MSVC STL, where set<T>::const_iterator is not trivially copy constructible.
+ ~SmallSetIterator() {
+ if (isSmall)
+ VecIter.~VecIterTy();
+ else
+ SetIter.~SetIterTy();
+ }
+
+ SmallSetIterator(const SmallSetIterator &Other) : isSmall(Other.isSmall) {
+ if (isSmall)
+ VecIter = Other.VecIter;
+ else
+ // Use placement new, to make sure SetIter is properly constructed, even
+ // if it is not trivially copy-able (e.g. in MSVC).
+ new (&SetIter) SetIterTy(Other.SetIter);
+ }
+
+ SmallSetIterator(SmallSetIterator &&Other) : isSmall(Other.isSmall) {
+ if (isSmall)
+ VecIter = std::move(Other.VecIter);
+ else
+ // Use placement new, to make sure SetIter is properly constructed, even
+ // if it is not trivially copy-able (e.g. in MSVC).
+ new (&SetIter) SetIterTy(std::move(Other.SetIter));
+ }
+
+ SmallSetIterator& operator=(const SmallSetIterator& Other) {
+ // Call destructor for SetIter, so it gets properly destroyed if it is
+ // not trivially destructible in case we are setting VecIter.
+ if (!isSmall)
+ SetIter.~SetIterTy();
+
+ isSmall = Other.isSmall;
+ if (isSmall)
+ VecIter = Other.VecIter;
+ else
+ new (&SetIter) SetIterTy(Other.SetIter);
+ return *this;
+ }
+
+ SmallSetIterator& operator=(SmallSetIterator&& Other) {
+ // Call destructor for SetIter, so it gets properly destroyed if it is
+ // not trivially destructible in case we are setting VecIter.
+ if (!isSmall)
+ SetIter.~SetIterTy();
+
+ isSmall = Other.isSmall;
+ if (isSmall)
+ VecIter = std::move(Other.VecIter);
+ else
+ new (&SetIter) SetIterTy(std::move(Other.SetIter));
+ return *this;
+ }
+
+ bool operator==(const SmallSetIterator &RHS) const {
+ if (isSmall != RHS.isSmall)
+ return false;
+ if (isSmall)
+ return VecIter == RHS.VecIter;
+ return SetIter == RHS.SetIter;
+ }
+
+ SmallSetIterator &operator++() { // Preincrement
+ if (isSmall)
+ VecIter++;
+ else
+ SetIter++;
+ return *this;
+ }
+
+ const T &operator*() const { return isSmall ? *VecIter : *SetIter; }
+};
+
/// SmallSet - This maintains a set of unique values, optimizing for the case
/// when the set is small (less than N). In this case, the set can be
/// maintained with no mallocs. If the set gets large, we expand to using an
/// std::set to maintain reasonable lookup times.
-///
-/// Note that this set does not provide a way to iterate over members in the
-/// set.
template <typename T, unsigned N, typename C = std::less<T>>
class SmallSet {
/// Use a SmallVector to hold the elements here (even though it will never
@@ -50,6 +149,7 @@ class SmallSet {
public:
using size_type = size_t;
+ using const_iterator = SmallSetIterator<T, N, C>;
SmallSet() = default;
@@ -121,6 +221,18 @@ public:
Set.clear();
}
+ const_iterator begin() const {
+ if (isSmall())
+ return {Vector.begin()};
+ return {Set.begin()};
+ }
+
+ const_iterator end() const {
+ if (isSmall())
+ return {Vector.end()};
+ return {Set.end()};
+ }
+
private:
bool isSmall() const { return Set.empty(); }
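With SmallSetIterator in place, SmallSet finally exposes begin()/end(), so the old "no way to iterate over members" caveat is gone. A minimal usage sketch (helper name and values are illustrative); note that iteration order is insertion order while the set is small and sorted order once it has spilled into std::set:

#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/raw_ostream.h"

void dumpSet() {
  llvm::SmallSet<int, 4> S;
  for (int V : {3, 1, 4, 1, 5})
    S.insert(V);                 // duplicates are dropped
  for (int V : S)                // begin()/end() work in both modes
    llvm::outs() << V << ' ';
  llvm::outs() << '\n';
}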
diff --git a/contrib/llvm/include/llvm/ADT/SmallVector.h b/contrib/llvm/include/llvm/ADT/SmallVector.h
index a9ac98d1ad4c..acb4426b4f45 100644
--- a/contrib/llvm/include/llvm/ADT/SmallVector.h
+++ b/contrib/llvm/include/llvm/ADT/SmallVector.h
@@ -18,6 +18,7 @@
#include "llvm/Support/AlignOf.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemAlloc.h"
#include "llvm/Support/type_traits.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
@@ -37,28 +38,42 @@ namespace llvm {
/// This is all the non-templated stuff common to all SmallVectors.
class SmallVectorBase {
protected:
- void *BeginX, *EndX, *CapacityX;
+ void *BeginX;
+ unsigned Size = 0, Capacity;
-protected:
- SmallVectorBase(void *FirstEl, size_t Size)
- : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {}
+ SmallVectorBase() = delete;
+ SmallVectorBase(void *FirstEl, size_t Capacity)
+ : BeginX(FirstEl), Capacity(Capacity) {}
/// This is an implementation of the grow() method which only works
/// on POD-like data types and is out of line to reduce code duplication.
- void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize);
+ void grow_pod(void *FirstEl, size_t MinCapacity, size_t TSize);
public:
- /// This returns size()*sizeof(T).
- size_t size_in_bytes() const {
- return size_t((char*)EndX - (char*)BeginX);
- }
+ size_t size() const { return Size; }
+ size_t capacity() const { return Capacity; }
+
+ LLVM_NODISCARD bool empty() const { return !Size; }
- /// capacity_in_bytes - This returns capacity()*sizeof(T).
- size_t capacity_in_bytes() const {
- return size_t((char*)CapacityX - (char*)BeginX);
+ /// Set the array size to \p N, which the current array must have enough
+ /// capacity for.
+ ///
+ /// This does not construct or destroy any elements in the vector.
+ ///
+ /// Clients can use this in conjunction with capacity() to write past the end
+ /// of the buffer when they know that more elements are available, and only
+ /// update the size later. This avoids the cost of value initializing elements
+ /// which will only be overwritten.
+ void set_size(size_t Size) {
+ assert(Size <= capacity());
+ this->Size = Size;
}
+};
- LLVM_NODISCARD bool empty() const { return BeginX == EndX; }
+/// Figure out the offset of the first element.
+template <class T, typename = void> struct SmallVectorAlignmentAndSize {
+ AlignedCharArrayUnion<SmallVectorBase> Base;
+ AlignedCharArrayUnion<T> FirstEl;
};
/// This is the part of SmallVectorTemplateBase which does not depend on whether
@@ -66,36 +81,34 @@ public:
/// to avoid unnecessarily requiring T to be complete.
template <typename T, typename = void>
class SmallVectorTemplateCommon : public SmallVectorBase {
-private:
- template <typename, unsigned> friend struct SmallVectorStorage;
-
- // Allocate raw space for N elements of type T. If T has a ctor or dtor, we
- // don't want it to be automatically run, so we need to represent the space as
- // something else. Use an array of char of sufficient alignment.
- using U = AlignedCharArrayUnion<T>;
- U FirstEl;
+ /// Find the address of the first element. For this pointer math to be valid
+ /// with small-size of 0 for T with lots of alignment, it's important that
+ /// SmallVectorStorage is properly-aligned even for small-size of 0.
+ void *getFirstEl() const {
+ return const_cast<void *>(reinterpret_cast<const void *>(
+ reinterpret_cast<const char *>(this) +
+ offsetof(SmallVectorAlignmentAndSize<T>, FirstEl)));
+ }
// Space after 'FirstEl' is clobbered, do not add any instance vars after it.
protected:
- SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {}
+ SmallVectorTemplateCommon(size_t Size)
+ : SmallVectorBase(getFirstEl(), Size) {}
- void grow_pod(size_t MinSizeInBytes, size_t TSize) {
- SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize);
+ void grow_pod(size_t MinCapacity, size_t TSize) {
+ SmallVectorBase::grow_pod(getFirstEl(), MinCapacity, TSize);
}
/// Return true if this is a smallvector which has not had dynamic
/// memory allocated for it.
- bool isSmall() const {
- return BeginX == static_cast<const void*>(&FirstEl);
- }
+ bool isSmall() const { return BeginX == getFirstEl(); }
/// Put this vector in a state of being small.
void resetToSmall() {
- BeginX = EndX = CapacityX = &FirstEl;
+ BeginX = getFirstEl();
+ Size = Capacity = 0; // FIXME: Setting Capacity to 0 is suspect.
}
- void setEnd(T *P) { this->EndX = P; }
-
public:
using size_type = size_t;
using difference_type = ptrdiff_t;
@@ -117,27 +130,20 @@ public:
LLVM_ATTRIBUTE_ALWAYS_INLINE
const_iterator begin() const { return (const_iterator)this->BeginX; }
LLVM_ATTRIBUTE_ALWAYS_INLINE
- iterator end() { return (iterator)this->EndX; }
+ iterator end() { return begin() + size(); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
- const_iterator end() const { return (const_iterator)this->EndX; }
-
-protected:
- iterator capacity_ptr() { return (iterator)this->CapacityX; }
- const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;}
+ const_iterator end() const { return begin() + size(); }
-public:
// reverse iterator creation methods.
reverse_iterator rbegin() { return reverse_iterator(end()); }
const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
reverse_iterator rend() { return reverse_iterator(begin()); }
const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
- LLVM_ATTRIBUTE_ALWAYS_INLINE
- size_type size() const { return end()-begin(); }
+ size_type size_in_bytes() const { return size() * sizeof(T); }
size_type max_size() const { return size_type(-1) / sizeof(T); }
- /// Return the total number of elements in the currently allocated buffer.
- size_t capacity() const { return capacity_ptr() - begin(); }
+ size_t capacity_in_bytes() const { return capacity() * sizeof(T); }
/// Return a pointer to the vector's buffer, even if empty().
pointer data() { return pointer(begin()); }
@@ -210,21 +216,21 @@ protected:
public:
void push_back(const T &Elt) {
- if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ if (LLVM_UNLIKELY(this->size() >= this->capacity()))
this->grow();
::new ((void*) this->end()) T(Elt);
- this->setEnd(this->end()+1);
+ this->set_size(this->size() + 1);
}
void push_back(T &&Elt) {
- if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ if (LLVM_UNLIKELY(this->size() >= this->capacity()))
this->grow();
::new ((void*) this->end()) T(::std::move(Elt));
- this->setEnd(this->end()+1);
+ this->set_size(this->size() + 1);
}
void pop_back() {
- this->setEnd(this->end()-1);
+ this->set_size(this->size() - 1);
this->end()->~T();
}
};
@@ -232,15 +238,13 @@ public:
// Define this out-of-line to dissuade the C++ compiler from inlining it.
template <typename T, bool isPodLike>
void SmallVectorTemplateBase<T, isPodLike>::grow(size_t MinSize) {
- size_t CurCapacity = this->capacity();
- size_t CurSize = this->size();
+ if (MinSize > UINT32_MAX)
+ report_bad_alloc_error("SmallVector capacity overflow during allocation");
+
// Always grow, even from zero.
- size_t NewCapacity = size_t(NextPowerOf2(CurCapacity+2));
- if (NewCapacity < MinSize)
- NewCapacity = MinSize;
- T *NewElts = static_cast<T*>(malloc(NewCapacity*sizeof(T)));
- if (NewElts == nullptr)
- report_bad_alloc_error("Allocation of SmallVector element failed.");
+ size_t NewCapacity = size_t(NextPowerOf2(this->capacity() + 2));
+ NewCapacity = std::min(std::max(NewCapacity, MinSize), size_t(UINT32_MAX));
+ T *NewElts = static_cast<T*>(llvm::safe_malloc(NewCapacity*sizeof(T)));
// Move the elements over.
this->uninitialized_move(this->begin(), this->end(), NewElts);
@@ -252,9 +256,8 @@ void SmallVectorTemplateBase<T, isPodLike>::grow(size_t MinSize) {
if (!this->isSmall())
free(this->begin());
- this->setEnd(NewElts+CurSize);
this->BeginX = NewElts;
- this->CapacityX = this->begin()+NewCapacity;
+ this->Capacity = NewCapacity;
}
@@ -301,21 +304,17 @@ protected:
/// Double the size of the allocated memory, guaranteeing space for at
/// least one more element or MinSize if specified.
- void grow(size_t MinSize = 0) {
- this->grow_pod(MinSize*sizeof(T), sizeof(T));
- }
+ void grow(size_t MinSize = 0) { this->grow_pod(MinSize, sizeof(T)); }
public:
void push_back(const T &Elt) {
- if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ if (LLVM_UNLIKELY(this->size() >= this->capacity()))
this->grow();
memcpy(this->end(), &Elt, sizeof(T));
- this->setEnd(this->end()+1);
+ this->set_size(this->size() + 1);
}
- void pop_back() {
- this->setEnd(this->end()-1);
- }
+ void pop_back() { this->set_size(this->size() - 1); }
};
/// This class consists of common code factored out of the SmallVector class to
@@ -332,16 +331,13 @@ public:
protected:
// Default ctor - Initialize to empty.
explicit SmallVectorImpl(unsigned N)
- : SmallVectorTemplateBase<T, isPodLike<T>::value>(N*sizeof(T)) {
- }
+ : SmallVectorTemplateBase<T, isPodLike<T>::value>(N) {}
public:
SmallVectorImpl(const SmallVectorImpl &) = delete;
~SmallVectorImpl() {
- // Destroy the constructed elements in the vector.
- this->destroy_range(this->begin(), this->end());
-
+ // Subclass has already destructed this vector's elements.
// If this wasn't grown from the inline copy, deallocate the old space.
if (!this->isSmall())
free(this->begin());
@@ -349,31 +345,31 @@ public:
void clear() {
this->destroy_range(this->begin(), this->end());
- this->EndX = this->BeginX;
+ this->Size = 0;
}
void resize(size_type N) {
if (N < this->size()) {
this->destroy_range(this->begin()+N, this->end());
- this->setEnd(this->begin()+N);
+ this->set_size(N);
} else if (N > this->size()) {
if (this->capacity() < N)
this->grow(N);
for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
new (&*I) T();
- this->setEnd(this->begin()+N);
+ this->set_size(N);
}
}
void resize(size_type N, const T &NV) {
if (N < this->size()) {
this->destroy_range(this->begin()+N, this->end());
- this->setEnd(this->begin()+N);
+ this->set_size(N);
} else if (N > this->size()) {
if (this->capacity() < N)
this->grow(N);
std::uninitialized_fill(this->end(), this->begin()+N, NV);
- this->setEnd(this->begin()+N);
+ this->set_size(N);
}
}
@@ -398,23 +394,23 @@ public:
void append(in_iter in_start, in_iter in_end) {
size_type NumInputs = std::distance(in_start, in_end);
// Grow allocated space if needed.
- if (NumInputs > size_type(this->capacity_ptr()-this->end()))
+ if (NumInputs > this->capacity() - this->size())
this->grow(this->size()+NumInputs);
// Copy the new elements over.
this->uninitialized_copy(in_start, in_end, this->end());
- this->setEnd(this->end() + NumInputs);
+ this->set_size(this->size() + NumInputs);
}
/// Add the specified range to the end of the SmallVector.
void append(size_type NumInputs, const T &Elt) {
// Grow allocated space if needed.
- if (NumInputs > size_type(this->capacity_ptr()-this->end()))
+ if (NumInputs > this->capacity() - this->size())
this->grow(this->size()+NumInputs);
// Copy the new elements over.
std::uninitialized_fill_n(this->end(), NumInputs, Elt);
- this->setEnd(this->end() + NumInputs);
+ this->set_size(this->size() + NumInputs);
}
void append(std::initializer_list<T> IL) {
@@ -428,7 +424,7 @@ public:
clear();
if (this->capacity() < NumElts)
this->grow(NumElts);
- this->setEnd(this->begin()+NumElts);
+ this->set_size(NumElts);
std::uninitialized_fill(this->begin(), this->end(), Elt);
}
@@ -475,7 +471,7 @@ public:
iterator I = std::move(E, this->end(), S);
// Drop the last elts.
this->destroy_range(I, this->end());
- this->setEnd(I);
+ this->set_size(I - this->begin());
return(N);
}
@@ -488,7 +484,7 @@ public:
assert(I >= this->begin() && "Insertion iterator is out of bounds.");
assert(I <= this->end() && "Inserting past the end of the vector.");
- if (this->EndX >= this->CapacityX) {
+ if (this->size() >= this->capacity()) {
size_t EltNo = I-this->begin();
this->grow();
I = this->begin()+EltNo;
@@ -497,12 +493,12 @@ public:
::new ((void*) this->end()) T(::std::move(this->back()));
// Push everything else over.
std::move_backward(I, this->end()-1, this->end());
- this->setEnd(this->end()+1);
+ this->set_size(this->size() + 1);
// If we just moved the element we're inserting, be sure to update
// the reference.
T *EltPtr = &Elt;
- if (I <= EltPtr && EltPtr < this->EndX)
+ if (I <= EltPtr && EltPtr < this->end())
++EltPtr;
*I = ::std::move(*EltPtr);
@@ -518,7 +514,7 @@ public:
assert(I >= this->begin() && "Insertion iterator is out of bounds.");
assert(I <= this->end() && "Inserting past the end of the vector.");
- if (this->EndX >= this->CapacityX) {
+ if (this->size() >= this->capacity()) {
size_t EltNo = I-this->begin();
this->grow();
I = this->begin()+EltNo;
@@ -526,12 +522,12 @@ public:
::new ((void*) this->end()) T(std::move(this->back()));
// Push everything else over.
std::move_backward(I, this->end()-1, this->end());
- this->setEnd(this->end()+1);
+ this->set_size(this->size() + 1);
// If we just moved the element we're inserting, be sure to update
// the reference.
const T *EltPtr = &Elt;
- if (I <= EltPtr && EltPtr < this->EndX)
+ if (I <= EltPtr && EltPtr < this->end())
++EltPtr;
*I = *EltPtr;
@@ -577,7 +573,7 @@ public:
// Move over the elements that we're about to overwrite.
T *OldEnd = this->end();
- this->setEnd(this->end() + NumToInsert);
+ this->set_size(this->size() + NumToInsert);
size_t NumOverwritten = OldEnd-I;
this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
@@ -634,7 +630,7 @@ public:
// Move over the elements that we're about to overwrite.
T *OldEnd = this->end();
- this->setEnd(this->end() + NumToInsert);
+ this->set_size(this->size() + NumToInsert);
size_t NumOverwritten = OldEnd-I;
this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
@@ -654,10 +650,10 @@ public:
}
template <typename... ArgTypes> void emplace_back(ArgTypes &&... Args) {
- if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ if (LLVM_UNLIKELY(this->size() >= this->capacity()))
this->grow();
::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
- this->setEnd(this->end() + 1);
+ this->set_size(this->size() + 1);
}
SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
@@ -676,20 +672,6 @@ public:
return std::lexicographical_compare(this->begin(), this->end(),
RHS.begin(), RHS.end());
}
-
- /// Set the array size to \p N, which the current array must have enough
- /// capacity for.
- ///
- /// This does not construct or destroy any elements in the vector.
- ///
- /// Clients can use this in conjunction with capacity() to write past the end
- /// of the buffer when they know that more elements are available, and only
- /// update the size later. This avoids the cost of value initializing elements
- /// which will only be overwritten.
- void set_size(size_type N) {
- assert(N <= this->capacity());
- this->setEnd(this->begin() + N);
- }
};
template <typename T>
@@ -699,8 +681,8 @@ void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
// We can only avoid copying elements if neither vector is small.
if (!this->isSmall() && !RHS.isSmall()) {
std::swap(this->BeginX, RHS.BeginX);
- std::swap(this->EndX, RHS.EndX);
- std::swap(this->CapacityX, RHS.CapacityX);
+ std::swap(this->Size, RHS.Size);
+ std::swap(this->Capacity, RHS.Capacity);
return;
}
if (RHS.size() > this->capacity())
@@ -718,15 +700,15 @@ void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
if (this->size() > RHS.size()) {
size_t EltDiff = this->size() - RHS.size();
this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
- RHS.setEnd(RHS.end()+EltDiff);
+ RHS.set_size(RHS.size() + EltDiff);
this->destroy_range(this->begin()+NumShared, this->end());
- this->setEnd(this->begin()+NumShared);
+ this->set_size(NumShared);
} else if (RHS.size() > this->size()) {
size_t EltDiff = RHS.size() - this->size();
this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
- this->setEnd(this->end() + EltDiff);
+ this->set_size(this->size() + EltDiff);
this->destroy_range(RHS.begin()+NumShared, RHS.end());
- RHS.setEnd(RHS.begin()+NumShared);
+ RHS.set_size(NumShared);
}
}
@@ -752,7 +734,7 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::
this->destroy_range(NewEnd, this->end());
// Trim.
- this->setEnd(NewEnd);
+ this->set_size(RHSSize);
return *this;
}
@@ -762,7 +744,7 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::
if (this->capacity() < RHSSize) {
// Destroy current elements.
this->destroy_range(this->begin(), this->end());
- this->setEnd(this->begin());
+ this->set_size(0);
CurSize = 0;
this->grow(RHSSize);
} else if (CurSize) {
@@ -775,7 +757,7 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::
this->begin()+CurSize);
// Set end.
- this->setEnd(this->begin()+RHSSize);
+ this->set_size(RHSSize);
return *this;
}
@@ -789,8 +771,8 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
this->destroy_range(this->begin(), this->end());
if (!this->isSmall()) free(this->begin());
this->BeginX = RHS.BeginX;
- this->EndX = RHS.EndX;
- this->CapacityX = RHS.CapacityX;
+ this->Size = RHS.Size;
+ this->Capacity = RHS.Capacity;
RHS.resetToSmall();
return *this;
}
@@ -807,7 +789,7 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
// Destroy excess elements and trim the bounds.
this->destroy_range(NewEnd, this->end());
- this->setEnd(NewEnd);
+ this->set_size(RHSSize);
// Clear the RHS.
RHS.clear();
@@ -822,7 +804,7 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
if (this->capacity() < RHSSize) {
// Destroy current elements.
this->destroy_range(this->begin(), this->end());
- this->setEnd(this->begin());
+ this->set_size(0);
CurSize = 0;
this->grow(RHSSize);
} else if (CurSize) {
@@ -835,22 +817,23 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
this->begin()+CurSize);
// Set end.
- this->setEnd(this->begin()+RHSSize);
+ this->set_size(RHSSize);
RHS.clear();
return *this;
}
-/// Storage for the SmallVector elements which aren't contained in
-/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1'
-/// element is in the base class. This is specialized for the N=1 and N=0 cases
+/// Storage for the SmallVector elements. This is specialized for the N=0 case
/// to avoid allocating unnecessary storage.
template <typename T, unsigned N>
struct SmallVectorStorage {
- typename SmallVectorTemplateCommon<T>::U InlineElts[N - 1];
+ AlignedCharArrayUnion<T> InlineElts[N];
};
-template <typename T> struct SmallVectorStorage<T, 1> {};
-template <typename T> struct SmallVectorStorage<T, 0> {};
+
+/// We need the storage to be properly aligned even for small-size of 0 so that
+/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is
+/// well-defined.
+template <typename T> struct alignas(alignof(T)) SmallVectorStorage<T, 0> {};
/// This is a 'vector' (really, a variable-sized array), optimized
/// for the case when the array is small. It contains some number of elements
@@ -861,13 +844,15 @@ template <typename T> struct SmallVectorStorage<T, 0> {};
/// Note that this does not attempt to be exception safe.
///
template <typename T, unsigned N>
-class SmallVector : public SmallVectorImpl<T> {
- /// Inline space for elements which aren't stored in the base class.
- SmallVectorStorage<T, N> Storage;
-
+class SmallVector : public SmallVectorImpl<T>, SmallVectorStorage<T, N> {
public:
SmallVector() : SmallVectorImpl<T>(N) {}
+ ~SmallVector() {
+ // Destroy the constructed elements in the vector.
+ this->destroy_range(this->begin(), this->end());
+ }
+
explicit SmallVector(size_t Size, const T &Value = T())
: SmallVectorImpl<T>(N) {
this->assign(Size, Value);
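The relocated set_size() is the public face of the write-past-the-end idiom its comment describes: reserve capacity, write raw data into the tail, then publish the new size without value-initializing anything first. A small sketch of that pattern (the helper is illustrative):

#include "llvm/ADT/SmallVector.h"
#include <cstring>

// Append N raw bytes without value-initializing the new tail first.
void appendBytes(llvm::SmallVector<char, 32> &Buf, const char *Src, size_t N) {
  size_t OldSize = Buf.size();
  Buf.reserve(OldSize + N);                    // guarantee the capacity
  std::memcpy(Buf.begin() + OldSize, Src, N);  // write into the spare tail
  Buf.set_size(OldSize + N);                   // then publish the size
}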
diff --git a/contrib/llvm/include/llvm/ADT/SparseMultiSet.h b/contrib/llvm/include/llvm/ADT/SparseMultiSet.h
index c91e0d70f65a..3c8637621510 100644
--- a/contrib/llvm/include/llvm/ADT/SparseMultiSet.h
+++ b/contrib/llvm/include/llvm/ADT/SparseMultiSet.h
@@ -211,7 +211,7 @@ public:
// The Sparse array doesn't actually need to be initialized, so malloc
// would be enough here, but that will cause tools like valgrind to
// complain about branching on uninitialized data.
- Sparse = reinterpret_cast<SparseT*>(calloc(U, sizeof(SparseT)));
+ Sparse = static_cast<SparseT*>(safe_calloc(U, sizeof(SparseT)));
Universe = U;
}
diff --git a/contrib/llvm/include/llvm/ADT/SparseSet.h b/contrib/llvm/include/llvm/ADT/SparseSet.h
index 25ade8831922..74cc6dab8c74 100644
--- a/contrib/llvm/include/llvm/ADT/SparseSet.h
+++ b/contrib/llvm/include/llvm/ADT/SparseSet.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Allocator.h"
#include <cassert>
#include <cstdint>
#include <cstdlib>
@@ -163,7 +164,7 @@ public:
// The Sparse array doesn't actually need to be initialized, so malloc
// would be enough here, but that will cause tools like valgrind to
// complain about branching on uninitialized data.
- Sparse = reinterpret_cast<SparseT*>(calloc(U, sizeof(SparseT)));
+ Sparse = static_cast<SparseT*>(safe_calloc(U, sizeof(SparseT)));
Universe = U;
}
diff --git a/contrib/llvm/include/llvm/ADT/Statistic.h b/contrib/llvm/include/llvm/ADT/Statistic.h
index d5ebba409c3d..90c2eefceb6c 100644
--- a/contrib/llvm/include/llvm/ADT/Statistic.h
+++ b/contrib/llvm/include/llvm/ADT/Statistic.h
@@ -26,15 +26,24 @@
#ifndef LLVM_ADT_STATISTIC_H
#define LLVM_ADT_STATISTIC_H
-#include "llvm/Support/Atomic.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
#include <atomic>
#include <memory>
+#include <vector>
+
+// Determine whether statistics should be enabled. We must do it here rather
+// than in CMake because multi-config generators cannot determine this at
+// configure time.
+#if !defined(NDEBUG) || LLVM_FORCE_ENABLE_STATS
+#define LLVM_ENABLE_STATS 1
+#endif
namespace llvm {
class raw_ostream;
class raw_fd_ostream;
+class StringRef;
class Statistic {
public:
@@ -42,7 +51,7 @@ public:
const char *Name;
const char *Desc;
std::atomic<unsigned> Value;
- bool Initialized;
+ std::atomic<bool> Initialized;
unsigned getValue() const { return Value.load(std::memory_order_relaxed); }
const char *getDebugType() const { return DebugType; }
@@ -61,7 +70,7 @@ public:
// Allow use of this class as the value itself.
operator unsigned() const { return getValue(); }
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
+#if LLVM_ENABLE_STATS
const Statistic &operator=(unsigned Val) {
Value.store(Val, std::memory_order_relaxed);
return init();
@@ -143,14 +152,12 @@ public:
void updateMax(unsigned V) {}
-#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
+#endif // LLVM_ENABLE_STATS
protected:
Statistic &init() {
- bool tmp = Initialized;
- sys::MemoryFence();
- if (!tmp) RegisterStatistic();
- TsanHappensAfter(this);
+ if (!Initialized.load(std::memory_order_acquire))
+ RegisterStatistic();
return *this;
}
@@ -160,21 +167,21 @@ protected:
// STATISTIC - A macro to make definition of statistics really simple. This
// automatically passes the DEBUG_TYPE of the file into the statistic.
#define STATISTIC(VARNAME, DESC) \
- static llvm::Statistic VARNAME = {DEBUG_TYPE, #VARNAME, DESC, {0}, false}
+ static llvm::Statistic VARNAME = {DEBUG_TYPE, #VARNAME, DESC, {0}, {false}}
-/// \brief Enable the collection and printing of statistics.
+/// Enable the collection and printing of statistics.
void EnableStatistics(bool PrintOnExit = true);
-/// \brief Check if statistics are enabled.
+/// Check if statistics are enabled.
bool AreStatisticsEnabled();
-/// \brief Return a file stream to print our output on.
+/// Return a file stream to print our output on.
std::unique_ptr<raw_fd_ostream> CreateInfoOutputFile();
-/// \brief Print statistics to the file returned by CreateInfoOutputFile().
+/// Print statistics to the file returned by CreateInfoOutputFile().
void PrintStatistics();
-/// \brief Print statistics to the given output stream.
+/// Print statistics to the given output stream.
void PrintStatistics(raw_ostream &OS);
/// Print statistics in JSON format. This does include all global timers (\see
@@ -183,6 +190,30 @@ void PrintStatistics(raw_ostream &OS);
/// PrintStatisticsJSON().
void PrintStatisticsJSON(raw_ostream &OS);
+/// Get the statistics. This can be used to look up the value of
+/// statistics without needing to parse JSON.
+///
+/// This function does not prevent statistics being updated by other threads
+/// during its execution. It will return the value at the point that it is
+/// read. However, it will prevent new statistics from registering until it
+/// completes.
+const std::vector<std::pair<StringRef, unsigned>> GetStatistics();
+
+/// Reset the statistics. This can be used to zero and de-register the
+/// statistics in order to measure a compilation.
+///
+/// When this function begins to call destructors prior to returning, all
+/// statistics will be zero and unregistered. However, that might not remain the
+/// case by the time this function finishes returning. Whether updates from
+/// other threads are lost or merely deferred until the function returns is
+/// timing sensitive.
+///
+/// Callers who intend to use this to measure statistics for a single
+/// compilation should ensure that no compilations are in progress at the point
+/// this function is called and that only one compilation executes until calling
+/// GetStatistics().
+void ResetStatistics();
+
} // end namespace llvm
#endif // LLVM_ADT_STATISTIC_H
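GetStatistics() and ResetStatistics() are aimed at drivers that run a compilation in-process and want the counters programmatically instead of parsing the JSON output. A rough sketch of the intended call sequence, assuming a build with statistics enabled:

#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

void runAndReport() {
  llvm::EnableStatistics(/*PrintOnExit=*/false);
  llvm::ResetStatistics();                 // zero and de-register old counters
  // ... run exactly one compilation here ...
  for (const auto &S : llvm::GetStatistics())
    llvm::outs() << S.first << " = " << S.second << '\n';
}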
diff --git a/contrib/llvm/include/llvm/ADT/StringExtras.h b/contrib/llvm/include/llvm/ADT/StringExtras.h
index 60652f8c55c5..71b0e7527cb7 100644
--- a/contrib/llvm/include/llvm/ADT/StringExtras.h
+++ b/contrib/llvm/include/llvm/ADT/StringExtras.h
@@ -39,6 +39,16 @@ inline char hexdigit(unsigned X, bool LowerCase = false) {
return X < 10 ? '0' + X : HexChar + X - 10;
}
+/// Given an array of c-style strings terminated by a null pointer, construct
+/// a vector of StringRefs representing the same strings without the
+/// terminating null pointer.
+inline std::vector<StringRef> toStringRefArray(const char *const *Strings) {
+ std::vector<StringRef> Result;
+ while (*Strings)
+ Result.push_back(*Strings++);
+ return Result;
+}
+
/// Construct a string ref from a boolean.
inline StringRef toStringRef(bool B) { return StringRef(B ? "true" : "false"); }
@@ -78,6 +88,26 @@ inline bool isAlpha(char C) {
/// lowercase letter as classified by "C" locale.
inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
+/// Checks whether character \p C is valid ASCII (high bit is zero).
+inline bool isASCII(char C) { return static_cast<unsigned char>(C) <= 127; }
+
+/// Checks whether all characters in S are ASCII.
+inline bool isASCII(llvm::StringRef S) {
+ for (char C : S)
+ if (LLVM_UNLIKELY(!isASCII(C)))
+ return false;
+ return true;
+}
+
+/// Checks whether character \p C is printable.
+///
+/// Locale-independent version of the C standard library isprint whose results
+/// may differ on different platforms.
+inline bool isPrint(char C) {
+ unsigned char UC = static_cast<unsigned char>(C);
+ return (0x20 <= UC) && (UC <= 0x7E);
+}
+
/// Returns the corresponding lowercase character if \p x is uppercase.
inline char toLower(char x) {
if (x >= 'A' && x <= 'Z')
@@ -157,7 +187,7 @@ inline std::string fromHex(StringRef Input) {
return Output;
}
-/// \brief Convert the string \p S to an integer of the specified type using
+/// Convert the string \p S to an integer of the specified type using
/// the radix \p Base. If \p Base is 0, auto-detects the radix.
/// Returns true if the number was successfully converted, false otherwise.
template <typename N> bool to_integer(StringRef S, N &Num, unsigned Base = 0) {
@@ -232,19 +262,6 @@ void SplitString(StringRef Source,
SmallVectorImpl<StringRef> &OutFragments,
StringRef Delimiters = " \t\n\v\f\r");
-/// HashString - Hash function for strings.
-///
-/// This is the Bernstein hash function.
-//
-// FIXME: Investigate whether a modified bernstein hash function performs
-// better: http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-// X*33+c -> X*33^c
-inline unsigned HashString(StringRef Str, unsigned Result = 0) {
- for (StringRef::size_type i = 0, e = Str.size(); i != e; ++i)
- Result = Result * 33 + (unsigned char)Str[i];
- return Result;
-}
-
/// Returns the English suffix for an ordinal integer (-st, -nd, -rd, -th).
inline StringRef getOrdinalSuffix(unsigned Val) {
// It is critically important that we do this perfectly for
@@ -264,9 +281,13 @@ inline StringRef getOrdinalSuffix(unsigned Val) {
}
}
-/// PrintEscapedString - Print each character of the specified string, escaping
-/// it if it is not printable or if it is an escape char.
-void PrintEscapedString(StringRef Name, raw_ostream &Out);
+/// Print each character of the specified string, escaping it if it is not
+/// printable or if it is an escape char.
+void printEscapedString(StringRef Name, raw_ostream &Out);
+
+/// Print each character of the specified string, escaping HTML special
+/// characters.
+void printHTMLEscaped(StringRef String, raw_ostream &Out);
/// printLowerCase - Print each character as lowercase if it is uppercase.
void printLowerCase(StringRef String, raw_ostream &Out);
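toStringRefArray() is a small convenience for the common null-terminated argv/envp shape, and isPrint()/isASCII() give locale-independent character classification. A short sketch combining them (the helper name is illustrative):

#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/raw_ostream.h"

void dumpArgs(const char *const *Argv) {   // Argv must end with a null pointer
  for (llvm::StringRef Arg : llvm::toStringRefArray(Argv)) {
    for (char C : Arg)
      llvm::outs() << (llvm::isPrint(C) ? C : '?');
    llvm::outs() << '\n';
  }
}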
diff --git a/contrib/llvm/include/llvm/ADT/StringMap.h b/contrib/llvm/include/llvm/ADT/StringMap.h
index 6c2830b44914..a9f83d3f5091 100644
--- a/contrib/llvm/include/llvm/ADT/StringMap.h
+++ b/contrib/llvm/include/llvm/ADT/StringMap.h
@@ -37,12 +37,12 @@ template<typename ValueTy> class StringMapKeyIterator;
/// StringMapEntryBase - Shared base class of StringMapEntry instances.
class StringMapEntryBase {
- unsigned StrLen;
+ size_t StrLen;
public:
- explicit StringMapEntryBase(unsigned Len) : StrLen(Len) {}
+ explicit StringMapEntryBase(size_t Len) : StrLen(Len) {}
- unsigned getKeyLength() const { return StrLen; }
+ size_t getKeyLength() const { return StrLen; }
};
/// StringMapImpl - This is the base class of StringMap that is shared among
@@ -127,10 +127,10 @@ class StringMapEntry : public StringMapEntryBase {
public:
ValueTy second;
- explicit StringMapEntry(unsigned strLen)
+ explicit StringMapEntry(size_t strLen)
: StringMapEntryBase(strLen), second() {}
template <typename... InitTy>
- StringMapEntry(unsigned strLen, InitTy &&... InitVals)
+ StringMapEntry(size_t strLen, InitTy &&... InitVals)
: StringMapEntryBase(strLen), second(std::forward<InitTy>(InitVals)...) {}
StringMapEntry(StringMapEntry &E) = delete;
@@ -155,19 +155,16 @@ public:
template <typename AllocatorTy, typename... InitTy>
static StringMapEntry *Create(StringRef Key, AllocatorTy &Allocator,
InitTy &&... InitVals) {
- unsigned KeyLength = Key.size();
+ size_t KeyLength = Key.size();
// Allocate a new item with space for the string at the end and a null
// terminator.
- unsigned AllocSize = static_cast<unsigned>(sizeof(StringMapEntry))+
- KeyLength+1;
- unsigned Alignment = alignof(StringMapEntry);
+ size_t AllocSize = sizeof(StringMapEntry) + KeyLength + 1;
+ size_t Alignment = alignof(StringMapEntry);
StringMapEntry *NewItem =
static_cast<StringMapEntry*>(Allocator.Allocate(AllocSize,Alignment));
-
- if (NewItem == nullptr)
- report_bad_alloc_error("Allocation of StringMap entry failed.");
+ assert(NewItem && "Unhandled out-of-memory");
// Construct the value.
new (NewItem) StringMapEntry(KeyLength, std::forward<InitTy>(InitVals)...);
@@ -203,8 +200,7 @@ public:
template<typename AllocatorTy>
void Destroy(AllocatorTy &Allocator) {
// Free memory referenced by the item.
- unsigned AllocSize =
- static_cast<unsigned>(sizeof(StringMapEntry)) + getKeyLength() + 1;
+ size_t AllocSize = sizeof(StringMapEntry) + getKeyLength() + 1;
this->~StringMapEntry();
Allocator.Deallocate(static_cast<void *>(this), AllocSize);
}
diff --git a/contrib/llvm/include/llvm/ADT/StringRef.h b/contrib/llvm/include/llvm/ADT/StringRef.h
index f6c93a858db1..a5ba5b59b5a3 100644
--- a/contrib/llvm/include/llvm/ADT/StringRef.h
+++ b/contrib/llvm/include/llvm/ADT/StringRef.h
@@ -201,7 +201,7 @@ namespace llvm {
LLVM_NODISCARD
int compare_numeric(StringRef RHS) const;
- /// \brief Determine the edit distance between this string and another
+ /// Determine the edit distance between this string and another
/// string.
///
/// \param Other the string to compare this string against.
@@ -725,10 +725,7 @@ namespace llvm {
/// \returns The split substrings.
LLVM_NODISCARD
std::pair<StringRef, StringRef> split(char Separator) const {
- size_t Idx = find(Separator);
- if (Idx == npos)
- return std::make_pair(*this, StringRef());
- return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
+ return split(StringRef(&Separator, 1));
}
/// Split into two substrings around the first occurrence of a separator
@@ -749,6 +746,24 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx + Separator.size(), npos));
}
+ /// Split into two substrings around the last occurrence of a separator
+ /// string.
+ ///
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
+ /// such that (*this == LHS + Separator + RHS) is true and RHS is
+ /// minimal. If \p Separator is not in the string, then the result is a
+ /// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
+ ///
+ /// \param Separator - The string to split on.
+ /// \return - The split substrings.
+ LLVM_NODISCARD
+ std::pair<StringRef, StringRef> rsplit(StringRef Separator) const {
+ size_t Idx = rfind(Separator);
+ if (Idx == npos)
+ return std::make_pair(*this, StringRef());
+ return std::make_pair(slice(0, Idx), slice(Idx + Separator.size(), npos));
+ }
+
/// Split into substrings around the occurrences of a separator string.
///
/// Each substring is stored in \p A. If \p MaxSplit is >= 0, at most
@@ -796,10 +811,7 @@ namespace llvm {
/// \return - The split substrings.
LLVM_NODISCARD
std::pair<StringRef, StringRef> rsplit(char Separator) const {
- size_t Idx = rfind(Separator);
- if (Idx == npos)
- return std::make_pair(*this, StringRef());
- return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
+ return rsplit(StringRef(&Separator, 1));
}
/// Return string with consecutive \p Char characters starting from the
@@ -855,6 +867,10 @@ namespace llvm {
/// constexpr StringLiteral S("test");
///
class StringLiteral : public StringRef {
+ private:
+ constexpr StringLiteral(const char *Str, size_t N) : StringRef(Str, N) {
+ }
+
public:
template <size_t N>
constexpr StringLiteral(const char (&Str)[N])
@@ -867,6 +883,12 @@ namespace llvm {
#endif
: StringRef(Str, N - 1) {
}
+
+ // Explicit construction for strings like "foo\0bar".
+ template <size_t N>
+ static constexpr StringLiteral withInnerNUL(const char (&Str)[N]) {
+ return StringLiteral(Str, N - 1);
+ }
};
/// @name StringRef Comparison Operators
@@ -902,7 +924,7 @@ namespace llvm {
/// @}
- /// \brief Compute a hash_code for a StringRef.
+ /// Compute a hash_code for a StringRef.
LLVM_NODISCARD
hash_code hash_value(StringRef S);
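The new rsplit(StringRef) overload mirrors split(StringRef) but cuts at the last occurrence of the separator, and StringLiteral::withInnerNUL() is the explicit escape hatch for literals that intentionally contain a NUL. Two small sketches:

#include "llvm/ADT/StringRef.h"

// "archive.tar.gz" -> ("archive.tar", "gz"); no separator -> (whole, "").
std::pair<llvm::StringRef, llvm::StringRef> splitExtension(llvm::StringRef Name) {
  return Name.rsplit(".");
}

// Keeps the embedded NUL: the resulting length is 7, not 3.
constexpr llvm::StringLiteral TwoWords =
    llvm::StringLiteral::withInnerNUL("foo\0bar");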
diff --git a/contrib/llvm/include/llvm/ADT/StringSwitch.h b/contrib/llvm/include/llvm/ADT/StringSwitch.h
index 75577b7738ba..b7860b98ce5d 100644
--- a/contrib/llvm/include/llvm/ADT/StringSwitch.h
+++ b/contrib/llvm/include/llvm/ADT/StringSwitch.h
@@ -20,7 +20,7 @@
namespace llvm {
-/// \brief A switch()-like statement whose cases are string literals.
+/// A switch()-like statement whose cases are string literals.
///
/// The StringSwitch class is a simple form of a switch() statement that
/// determines whether the given string matches one of the given string
@@ -41,216 +41,176 @@ namespace llvm {
/// \endcode
template<typename T, typename R = T>
class StringSwitch {
- /// \brief The string we are matching.
- StringRef Str;
+ /// The string we are matching.
+ const StringRef Str;
- /// \brief The pointer to the result of this switch statement, once known,
+ /// The pointer to the result of this switch statement, once known,
/// null before that.
- const T *Result;
+ Optional<T> Result;
public:
LLVM_ATTRIBUTE_ALWAYS_INLINE
explicit StringSwitch(StringRef S)
- : Str(S), Result(nullptr) { }
+ : Str(S), Result() { }
// StringSwitch is not copyable.
StringSwitch(const StringSwitch &) = delete;
+
+ // StringSwitch is not assignable due to 'Str' being 'const'.
void operator=(const StringSwitch &) = delete;
+ void operator=(StringSwitch &&other) = delete;
- StringSwitch(StringSwitch &&other) {
- *this = std::move(other);
- }
- StringSwitch &operator=(StringSwitch &&other) {
- Str = other.Str;
- Result = other.Result;
- return *this;
- }
+ StringSwitch(StringSwitch &&other)
+ : Str(other.Str), Result(std::move(other.Result)) { }
~StringSwitch() = default;
// Case-sensitive case matchers
- template<unsigned N>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch& Case(const char (&S)[N], const T& Value) {
- assert(N);
- if (!Result && N-1 == Str.size() &&
- (N == 1 || std::memcmp(S, Str.data(), N-1) == 0)) {
- Result = &Value;
+ StringSwitch &Case(StringLiteral S, T Value) {
+ if (!Result && Str == S) {
+ Result = std::move(Value);
}
return *this;
}
- template<unsigned N>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch& EndsWith(const char (&S)[N], const T &Value) {
- assert(N);
- if (!Result && Str.size() >= N-1 &&
- (N == 1 || std::memcmp(S, Str.data() + Str.size() + 1 - N, N-1) == 0)) {
- Result = &Value;
+ StringSwitch& EndsWith(StringLiteral S, T Value) {
+ if (!Result && Str.endswith(S)) {
+ Result = std::move(Value);
}
return *this;
}
- template<unsigned N>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch& StartsWith(const char (&S)[N], const T &Value) {
- assert(N);
- if (!Result && Str.size() >= N-1 &&
- (N == 1 || std::memcmp(S, Str.data(), N-1) == 0)) {
- Result = &Value;
+ StringSwitch& StartsWith(StringLiteral S, T Value) {
+ if (!Result && Str.startswith(S)) {
+ Result = std::move(Value);
}
return *this;
}
- template<unsigned N0, unsigned N1>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const T& Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) {
return Case(S0, Value).Case(S1, Value);
}
- template<unsigned N0, unsigned N1, unsigned N2>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const T& Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ T Value) {
return Case(S0, Value).Cases(S1, S2, Value);
}
- template<unsigned N0, unsigned N1, unsigned N2, unsigned N3>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const char (&S3)[N3],
- const T& Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, T Value) {
return Case(S0, Value).Cases(S1, S2, S3, Value);
}
- template<unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const char (&S3)[N3],
- const char (&S4)[N4], const T& Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, StringLiteral S4, T Value) {
return Case(S0, Value).Cases(S1, S2, S3, S4, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
- unsigned N5>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const char (&S3)[N3],
- const char (&S4)[N4], const char (&S5)[N5],
- const T &Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, StringLiteral S4, StringLiteral S5,
+ T Value) {
return Case(S0, Value).Cases(S1, S2, S3, S4, S5, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
- unsigned N5, unsigned N6>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const char (&S3)[N3],
- const char (&S4)[N4], const char (&S5)[N5],
- const char (&S6)[N6], const T &Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, StringLiteral S4, StringLiteral S5,
+ StringLiteral S6, T Value) {
return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
- unsigned N5, unsigned N6, unsigned N7>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const char (&S3)[N3],
- const char (&S4)[N4], const char (&S5)[N5],
- const char (&S6)[N6], const char (&S7)[N7],
- const T &Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, StringLiteral S4, StringLiteral S5,
+ StringLiteral S6, StringLiteral S7, T Value) {
return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, S7, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
- unsigned N5, unsigned N6, unsigned N7, unsigned N8>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const char (&S3)[N3],
- const char (&S4)[N4], const char (&S5)[N5],
- const char (&S6)[N6], const char (&S7)[N7],
- const char (&S8)[N8], const T &Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, StringLiteral S4, StringLiteral S5,
+ StringLiteral S6, StringLiteral S7, StringLiteral S8,
+ T Value) {
return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, S7, S8, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
- unsigned N5, unsigned N6, unsigned N7, unsigned N8, unsigned N9>
LLVM_ATTRIBUTE_ALWAYS_INLINE
- StringSwitch &Cases(const char (&S0)[N0], const char (&S1)[N1],
- const char (&S2)[N2], const char (&S3)[N3],
- const char (&S4)[N4], const char (&S5)[N5],
- const char (&S6)[N6], const char (&S7)[N7],
- const char (&S8)[N8], const char (&S9)[N9],
- const T &Value) {
+ StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, StringLiteral S4, StringLiteral S5,
+ StringLiteral S6, StringLiteral S7, StringLiteral S8,
+ StringLiteral S9, T Value) {
return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, S7, S8, S9, Value);
}
// Case-insensitive case matchers.
- template <unsigned N>
- LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch &CaseLower(const char (&S)[N],
- const T &Value) {
- if (!Result && Str.equals_lower(StringRef(S, N - 1)))
- Result = &Value;
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch &CaseLower(StringLiteral S, T Value) {
+ if (!Result && Str.equals_lower(S))
+ Result = std::move(Value);
return *this;
}
- template <unsigned N>
- LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch &EndsWithLower(const char (&S)[N],
- const T &Value) {
- if (!Result && Str.endswith_lower(StringRef(S, N - 1)))
- Result = &Value;
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch &EndsWithLower(StringLiteral S, T Value) {
+ if (!Result && Str.endswith_lower(S))
+ Result = Value;
return *this;
}
- template <unsigned N>
- LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch &StartsWithLower(const char (&S)[N],
- const T &Value) {
- if (!Result && Str.startswith_lower(StringRef(S, N - 1)))
- Result = &Value;
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch &StartsWithLower(StringLiteral S, T Value) {
+ if (!Result && Str.startswith_lower(S))
+ Result = std::move(Value);
return *this;
}
- template <unsigned N0, unsigned N1>
- LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch &
- CasesLower(const char (&S0)[N0], const char (&S1)[N1], const T &Value) {
+
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) {
return CaseLower(S0, Value).CaseLower(S1, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2>
- LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch &
- CasesLower(const char (&S0)[N0], const char (&S1)[N1], const char (&S2)[N2],
- const T &Value) {
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ T Value) {
return CaseLower(S0, Value).CasesLower(S1, S2, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2, unsigned N3>
- LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch &
- CasesLower(const char (&S0)[N0], const char (&S1)[N1], const char (&S2)[N2],
- const char (&S3)[N3], const T &Value) {
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, T Value) {
return CaseLower(S0, Value).CasesLower(S1, S2, S3, Value);
}
- template <unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4>
- LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch &
- CasesLower(const char (&S0)[N0], const char (&S1)[N1], const char (&S2)[N2],
- const char (&S3)[N3], const char (&S4)[N4], const T &Value) {
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
+ StringLiteral S3, StringLiteral S4, T Value) {
return CaseLower(S0, Value).CasesLower(S1, S2, S3, S4, Value);
}
+ LLVM_NODISCARD
LLVM_ATTRIBUTE_ALWAYS_INLINE
- R Default(const T &Value) const {
+ R Default(T Value) {
if (Result)
- return *Result;
+ return std::move(*Result);
return Value;
}
+ LLVM_NODISCARD
LLVM_ATTRIBUTE_ALWAYS_INLINE
- operator R() const {
+ operator R() {
assert(Result && "Fell off the end of a string-switch");
- return *Result;
+ return std::move(*Result);
}
};
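The rewritten matchers take StringLiteral directly and stash the chosen value in an Optional<T>, so case values no longer have to outlive the switch as const references. The usage pattern itself is unchanged; a short sketch:

#include "llvm/ADT/StringSwitch.h"

enum class Color { Red, Green, Blue, Unknown };

Color parseColor(llvm::StringRef S) {
  return llvm::StringSwitch<Color>(S)
      .Case("red", Color::Red)
      .Case("green", Color::Green)
      .CaseLower("BLUE", Color::Blue)    // case-insensitive match
      .Default(Color::Unknown);
}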
diff --git a/contrib/llvm/include/llvm/ADT/TinyPtrVector.h b/contrib/llvm/include/llvm/ADT/TinyPtrVector.h
index 73573d65e2b3..1b8e9aa658c3 100644
--- a/contrib/llvm/include/llvm/ADT/TinyPtrVector.h
+++ b/contrib/llvm/include/llvm/ADT/TinyPtrVector.h
@@ -108,6 +108,12 @@ public:
return *this;
}
+ TinyPtrVector(std::initializer_list<EltTy> IL)
+ : Val(IL.size() == 0
+ ? PtrUnion()
+ : IL.size() == 1 ? PtrUnion(*IL.begin())
+ : PtrUnion(new VecTy(IL.begin(), IL.end()))) {}
+
/// Constructor from an ArrayRef.
///
/// This also is a constructor for individual array elements due to the single
diff --git a/contrib/llvm/include/llvm/ADT/Triple.h b/contrib/llvm/include/llvm/ADT/Triple.h
index 74fc8eb8ccbf..c95b16dd4e8c 100644
--- a/contrib/llvm/include/llvm/ADT/Triple.h
+++ b/contrib/llvm/include/llvm/ADT/Triple.h
@@ -101,6 +101,7 @@ public:
enum SubArchType {
NoSubArch,
+ ARMSubArch_v8_4a,
ARMSubArch_v8_3a,
ARMSubArch_v8_2a,
ARMSubArch_v8_1a,
@@ -144,7 +145,8 @@ public:
AMD,
Mesa,
SUSE,
- LastVendorType = SUSE
+ OpenEmbedded,
+ LastVendorType = OpenEmbedded
};
enum OSType {
UnknownOS,
@@ -202,9 +204,7 @@ public:
MSVC,
Itanium,
Cygnus,
- AMDOpenCL,
CoreCLR,
- OpenCL,
Simulator, // Simulator variants of other systems, e.g., Apple's iOS
LastEnvironmentType = Simulator
};
@@ -660,9 +660,29 @@ public:
return getArch() == Triple::aarch64 || getArch() == Triple::aarch64_be;
}
- /// Tests wether the target supports comdat
+ /// Tests whether the target is MIPS 32-bit (little and big endian).
+ bool isMIPS32() const {
+ return getArch() == Triple::mips || getArch() == Triple::mipsel;
+ }
+
+ /// Tests whether the target is MIPS 64-bit (little and big endian).
+ bool isMIPS64() const {
+ return getArch() == Triple::mips64 || getArch() == Triple::mips64el;
+ }
+
+ /// Tests whether the target is MIPS (little and big endian, 32- or 64-bit).
+ bool isMIPS() const {
+ return isMIPS32() || isMIPS64();
+ }
+
+ /// Tests whether the target supports comdat
bool supportsCOMDAT() const {
- return !isOSBinFormatMachO() && !isOSBinFormatWasm();
+ return !isOSBinFormatMachO();
+ }
+
+ /// Tests whether the target uses emulated TLS as default.
+ bool hasDefaultEmulatedTLS() const {
+ return isAndroid() || isOSOpenBSD() || isWindowsCygwinEnvironment();
}
/// @}
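The new Triple predicates collapse checks that callers previously spelled out arch by arch and OS by OS; a brief sketch of how they read at a call site (the helper is illustrative):

#include "llvm/ADT/Triple.h"
#include "llvm/Support/raw_ostream.h"

void describeTarget(const llvm::Triple &TT) {
  if (TT.isMIPS())                    // mips, mipsel, mips64 or mips64el
    llvm::outs() << (TT.isMIPS64() ? "64-bit" : "32-bit") << " MIPS\n";
  if (TT.supportsCOMDAT())            // now everything except Mach-O
    llvm::outs() << "comdat supported\n";
  if (TT.hasDefaultEmulatedTLS())     // Android, OpenBSD, Cygwin
    llvm::outs() << "emulated TLS by default\n";
}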
diff --git a/contrib/llvm/include/llvm/ADT/UniqueVector.h b/contrib/llvm/include/llvm/ADT/UniqueVector.h
index b17fb2392baf..c86bedd07687 100644
--- a/contrib/llvm/include/llvm/ADT/UniqueVector.h
+++ b/contrib/llvm/include/llvm/ADT/UniqueVector.h
@@ -72,16 +72,16 @@ public:
return Vector[ID - 1];
}
- /// \brief Return an iterator to the start of the vector.
+ /// Return an iterator to the start of the vector.
iterator begin() { return Vector.begin(); }
- /// \brief Return an iterator to the start of the vector.
+ /// Return an iterator to the start of the vector.
const_iterator begin() const { return Vector.begin(); }
- /// \brief Return an iterator to the end of the vector.
+ /// Return an iterator to the end of the vector.
iterator end() { return Vector.end(); }
- /// \brief Return an iterator to the end of the vector.
+ /// Return an iterator to the end of the vector.
const_iterator end() const { return Vector.end(); }
/// size - Returns the number of entries in the vector.
diff --git a/contrib/llvm/include/llvm/ADT/VariadicFunction.h b/contrib/llvm/include/llvm/ADT/VariadicFunction.h
index 403130c623eb..9028abe4c72c 100644
--- a/contrib/llvm/include/llvm/ADT/VariadicFunction.h
+++ b/contrib/llvm/include/llvm/ADT/VariadicFunction.h
@@ -53,7 +53,7 @@ namespace llvm {
#define LLVM_COMMA_JOIN31(x) LLVM_COMMA_JOIN30(x), x ## 30
#define LLVM_COMMA_JOIN32(x) LLVM_COMMA_JOIN31(x), x ## 31
-/// \brief Class which can simulate a type-safe variadic function.
+/// Class which can simulate a type-safe variadic function.
///
/// The VariadicFunction class template makes it easy to define
/// type-safe variadic functions where all arguments have the same
diff --git a/contrib/llvm/include/llvm/ADT/edit_distance.h b/contrib/llvm/include/llvm/ADT/edit_distance.h
index 06a01b18a9fb..b2e8ec5c3f6d 100644
--- a/contrib/llvm/include/llvm/ADT/edit_distance.h
+++ b/contrib/llvm/include/llvm/ADT/edit_distance.h
@@ -22,7 +22,7 @@
namespace llvm {
-/// \brief Determine the edit distance between two sequences.
+/// Determine the edit distance between two sequences.
///
/// \param FromArray the first sequence to compare.
///
diff --git a/contrib/llvm/include/llvm/ADT/ilist.h b/contrib/llvm/include/llvm/ADT/ilist.h
index a788f811e4c6..00bb6d528175 100644
--- a/contrib/llvm/include/llvm/ADT/ilist.h
+++ b/contrib/llvm/include/llvm/ADT/ilist.h
@@ -84,21 +84,11 @@ template <typename NodeTy>
struct ilist_node_traits : ilist_alloc_traits<NodeTy>,
ilist_callback_traits<NodeTy> {};
-/// Default template traits for intrusive list.
-///
-/// By inheriting from this, you can easily use default implementations for all
-/// common operations.
-///
-/// TODO: Remove this customization point. Specializing ilist_traits is
-/// already fully general.
-template <typename NodeTy>
-struct ilist_default_traits : public ilist_node_traits<NodeTy> {};
-
/// Template traits for intrusive list.
///
/// Customize callbacks and allocation semantics.
template <typename NodeTy>
-struct ilist_traits : public ilist_default_traits<NodeTy> {};
+struct ilist_traits : public ilist_node_traits<NodeTy> {};
/// Const traits should never be instantiated.
template <typename Ty> struct ilist_traits<const Ty> {};
@@ -178,9 +168,6 @@ template <class IntrusiveListT, class TraitsT>
class iplist_impl : public TraitsT, IntrusiveListT {
typedef IntrusiveListT base_list_type;
-protected:
- typedef iplist_impl iplist_impl_type;
-
public:
typedef typename base_list_type::pointer pointer;
typedef typename base_list_type::const_pointer const_pointer;
@@ -369,26 +356,26 @@ public:
using base_list_type::sort;
- /// \brief Get the previous node, or \c nullptr for the list head.
+ /// Get the previous node, or \c nullptr for the list head.
pointer getPrevNode(reference N) const {
auto I = N.getIterator();
if (I == begin())
return nullptr;
return &*std::prev(I);
}
- /// \brief Get the previous node, or \c nullptr for the list head.
+ /// Get the previous node, or \c nullptr for the list head.
const_pointer getPrevNode(const_reference N) const {
return getPrevNode(const_cast<reference >(N));
}
- /// \brief Get the next node, or \c nullptr for the list tail.
+ /// Get the next node, or \c nullptr for the list tail.
pointer getNextNode(reference N) const {
auto Next = std::next(N.getIterator());
if (Next == end())
return nullptr;
return &*Next;
}
- /// \brief Get the next node, or \c nullptr for the list tail.
+ /// Get the next node, or \c nullptr for the list tail.
const_pointer getNextNode(const_reference N) const {
return getNextNode(const_cast<reference >(N));
}
@@ -402,7 +389,7 @@ public:
template <class T, class... Options>
class iplist
: public iplist_impl<simple_ilist<T, Options...>, ilist_traits<T>> {
- typedef typename iplist::iplist_impl_type iplist_impl_type;
+ using iplist_impl_type = typename iplist::iplist_impl;
public:
iplist() = default;
diff --git a/contrib/llvm/include/llvm/ADT/ilist_node.h b/contrib/llvm/include/llvm/ADT/ilist_node.h
index 3362611697cb..dd0e6b4ec2b9 100644
--- a/contrib/llvm/include/llvm/ADT/ilist_node.h
+++ b/contrib/llvm/include/llvm/ADT/ilist_node.h
@@ -271,7 +271,7 @@ private:
public:
/// @name Adjacent Node Accessors
/// @{
- /// \brief Get the previous node, or \c nullptr for the list head.
+ /// Get the previous node, or \c nullptr for the list head.
NodeTy *getPrevNode() {
// Should be separated to a reused function, but then we couldn't use auto
// (and would need the type of the list).
@@ -280,12 +280,12 @@ public:
return List.getPrevNode(*static_cast<NodeTy *>(this));
}
- /// \brief Get the previous node, or \c nullptr for the list head.
+ /// Get the previous node, or \c nullptr for the list head.
const NodeTy *getPrevNode() const {
return const_cast<ilist_node_with_parent *>(this)->getPrevNode();
}
- /// \brief Get the next node, or \c nullptr for the list tail.
+ /// Get the next node, or \c nullptr for the list tail.
NodeTy *getNextNode() {
// Should be separated to a reused function, but then we couldn't use auto
// (and would need the type of the list).
@@ -294,7 +294,7 @@ public:
return List.getNextNode(*static_cast<NodeTy *>(this));
}
- /// \brief Get the next node, or \c nullptr for the list tail.
+ /// Get the next node, or \c nullptr for the list tail.
const NodeTy *getNextNode() const {
return const_cast<ilist_node_with_parent *>(this)->getNextNode();
}
diff --git a/contrib/llvm/include/llvm/ADT/ilist_node_options.h b/contrib/llvm/include/llvm/ADT/ilist_node_options.h
index c33df1eeb819..7ff4005f6757 100644
--- a/contrib/llvm/include/llvm/ADT/ilist_node_options.h
+++ b/contrib/llvm/include/llvm/ADT/ilist_node_options.h
@@ -11,7 +11,6 @@
#define LLVM_ADT_ILIST_NODE_OPTIONS_H
#include "llvm/Config/abi-breaking.h"
-#include "llvm/Config/llvm-config.h"
#include <type_traits>
diff --git a/contrib/llvm/include/llvm/ADT/iterator.h b/contrib/llvm/include/llvm/ADT/iterator.h
index 711f8f221620..549c5221173d 100644
--- a/contrib/llvm/include/llvm/ADT/iterator.h
+++ b/contrib/llvm/include/llvm/ADT/iterator.h
@@ -19,7 +19,7 @@
namespace llvm {
-/// \brief CRTP base class which implements the entire standard iterator facade
+/// CRTP base class which implements the entire standard iterator facade
/// in terms of a minimal subset of the interface.
///
/// Use this when it is reasonable to implement most of the iterator
@@ -183,7 +183,7 @@ public:
}
};
-/// \brief CRTP base class for adapting an iterator to a different type.
+/// CRTP base class for adapting an iterator to a different type.
///
/// This class can be used through CRTP to adapt one iterator into another.
/// Typically this is done through providing in the derived class a custom \c
@@ -274,7 +274,7 @@ public:
ReferenceT operator*() const { return *I; }
};
-/// \brief An iterator type that allows iterating over the pointees via some
+/// An iterator type that allows iterating over the pointees via some
/// other iterator.
///
/// The typical usage of this is to expose a type that iterates over Ts, but
@@ -288,7 +288,7 @@ template <typename WrappedIteratorT,
decltype(**std::declval<WrappedIteratorT>())>::type>
struct pointee_iterator
: iterator_adaptor_base<
- pointee_iterator<WrappedIteratorT>, WrappedIteratorT,
+ pointee_iterator<WrappedIteratorT, T>, WrappedIteratorT,
typename std::iterator_traits<WrappedIteratorT>::iterator_category,
T> {
pointee_iterator() = default;
@@ -311,7 +311,7 @@ make_pointee_range(RangeT &&Range) {
template <typename WrappedIteratorT,
typename T = decltype(&*std::declval<WrappedIteratorT>())>
class pointer_iterator
- : public iterator_adaptor_base<pointer_iterator<WrappedIteratorT>,
+ : public iterator_adaptor_base<pointer_iterator<WrappedIteratorT, T>,
WrappedIteratorT, T> {
mutable T Ptr;
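
The two iterator.h hunks above correct the CRTP base of pointee_iterator and pointer_iterator to forward the explicit element type T. A minimal usage sketch of pointee_iterator via make_pointee_range (sumPointees is a hypothetical helper, not part of the patch):

// Sketch, assuming the llvm namespace from ADT/iterator.h; sumPointees is
// illustrative only.
#include "llvm/ADT/iterator.h"
#include <memory>
#include <vector>

int sumPointees(const std::vector<std::unique_ptr<int>> &Values) {
  int Sum = 0;
  // make_pointee_range wraps the container's iterators in pointee_iterator,
  // so each loop element is the pointed-to int, not the unique_ptr.
  for (const int &V : llvm::make_pointee_range(Values))
    Sum += V;
  return Sum;
}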
diff --git a/contrib/llvm/include/llvm/ADT/iterator_range.h b/contrib/llvm/include/llvm/ADT/iterator_range.h
index 3cbf6198eb60..2ba12866ecf3 100644
--- a/contrib/llvm/include/llvm/ADT/iterator_range.h
+++ b/contrib/llvm/include/llvm/ADT/iterator_range.h
@@ -24,7 +24,7 @@
namespace llvm {
-/// \brief A range adaptor for a pair of iterators.
+/// A range adaptor for a pair of iterators.
///
/// This just wraps two iterators into a range-compatible interface. Nothing
/// fancy at all.
@@ -47,7 +47,7 @@ public:
IteratorT end() const { return end_iterator; }
};
-/// \brief Convenience function for iterating over sub-ranges.
+/// Convenience function for iterating over sub-ranges.
///
/// This provides a bit of syntactic sugar to make using sub-ranges
/// in for loops a bit easier. Analogous to std::make_pair().
@@ -59,9 +59,10 @@ template <typename T> iterator_range<T> make_range(std::pair<T, T> p) {
return iterator_range<T>(std::move(p.first), std::move(p.second));
}
-template<typename T>
-iterator_range<decltype(begin(std::declval<T>()))> drop_begin(T &&t, int n) {
- return make_range(std::next(begin(t), n), end(t));
+template <typename T>
+iterator_range<decltype(adl_begin(std::declval<T>()))> drop_begin(T &&t,
+ int n) {
+ return make_range(std::next(adl_begin(t), n), adl_end(t));
}
}
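
The drop_begin hunk above switches from unqualified begin/end to adl_begin/adl_end, so the helper also works for range types whose begin/end are found through argument-dependent lookup. A small usage sketch with an LLVM container (sumTail is a hypothetical helper):

// Sketch: drop_begin(Range, N) yields a view of Range with the first N
// elements skipped. sumTail is illustrative only.
#include "llvm/ADT/STLExtras.h" // provides adl_begin/adl_end used by drop_begin
#include "llvm/ADT/SmallVector.h"

int sumTail(const llvm::SmallVectorImpl<int> &Xs) {
  int Sum = 0;
  for (int X : llvm::drop_begin(Xs, 1)) // skips Xs[0]
    Sum += X;
  return Sum;
}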
diff --git a/contrib/llvm/include/llvm/Analysis/AliasAnalysis.h b/contrib/llvm/include/llvm/Analysis/AliasAnalysis.h
index 362096b08e13..be3496bbd955 100644
--- a/contrib/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -76,7 +76,7 @@ class Value;
///
/// See docs/AliasAnalysis.html for more information on the specific meanings
/// of these values.
-enum AliasResult {
+enum AliasResult : uint8_t {
/// The two locations do not alias at all.
///
/// This value is arranged to convert to false, while all other values
@@ -91,13 +91,16 @@ enum AliasResult {
MustAlias,
};
+/// << operator for AliasResult.
+raw_ostream &operator<<(raw_ostream &OS, AliasResult AR);
+
/// Flags indicating whether a memory access modifies or references memory.
///
/// This is no access at all, a modification, a reference, or both
/// a modification and a reference. These are specifically structured such that
/// they form a three bit matrix and bit-tests for 'mod' or 'ref' or 'must'
/// work with any of the possible values.
-enum class ModRefInfo {
+enum class ModRefInfo : uint8_t {
/// Must is provided for completeness, but no routines will return only
/// Must today. See definition of Must below.
Must = 0,
@@ -325,8 +328,8 @@ public:
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
/// A convenience wrapper around the primary \c alias interface.
- AliasResult alias(const Value *V1, uint64_t V1Size, const Value *V2,
- uint64_t V2Size) {
+ AliasResult alias(const Value *V1, LocationSize V1Size, const Value *V2,
+ LocationSize V2Size) {
return alias(MemoryLocation(V1, V1Size), MemoryLocation(V2, V2Size));
}
@@ -343,8 +346,8 @@ public:
}
/// A convenience wrapper around the \c isNoAlias helper interface.
- bool isNoAlias(const Value *V1, uint64_t V1Size, const Value *V2,
- uint64_t V2Size) {
+ bool isNoAlias(const Value *V1, LocationSize V1Size, const Value *V2,
+ LocationSize V2Size) {
return isNoAlias(MemoryLocation(V1, V1Size), MemoryLocation(V2, V2Size));
}
@@ -501,7 +504,7 @@ public:
/// getModRefInfo (for call sites) - A convenience wrapper.
ModRefInfo getModRefInfo(ImmutableCallSite CS, const Value *P,
- uint64_t Size) {
+ LocationSize Size) {
return getModRefInfo(CS, MemoryLocation(P, Size));
}
@@ -512,7 +515,8 @@ public:
}
/// getModRefInfo (for calls) - A convenience wrapper.
- ModRefInfo getModRefInfo(const CallInst *C, const Value *P, uint64_t Size) {
+ ModRefInfo getModRefInfo(const CallInst *C, const Value *P,
+ LocationSize Size) {
return getModRefInfo(C, MemoryLocation(P, Size));
}
@@ -523,7 +527,8 @@ public:
}
/// getModRefInfo (for invokes) - A convenience wrapper.
- ModRefInfo getModRefInfo(const InvokeInst *I, const Value *P, uint64_t Size) {
+ ModRefInfo getModRefInfo(const InvokeInst *I, const Value *P,
+ LocationSize Size) {
return getModRefInfo(I, MemoryLocation(P, Size));
}
@@ -532,7 +537,8 @@ public:
ModRefInfo getModRefInfo(const LoadInst *L, const MemoryLocation &Loc);
/// getModRefInfo (for loads) - A convenience wrapper.
- ModRefInfo getModRefInfo(const LoadInst *L, const Value *P, uint64_t Size) {
+ ModRefInfo getModRefInfo(const LoadInst *L, const Value *P,
+ LocationSize Size) {
return getModRefInfo(L, MemoryLocation(P, Size));
}
@@ -541,7 +547,8 @@ public:
ModRefInfo getModRefInfo(const StoreInst *S, const MemoryLocation &Loc);
/// getModRefInfo (for stores) - A convenience wrapper.
- ModRefInfo getModRefInfo(const StoreInst *S, const Value *P, uint64_t Size) {
+ ModRefInfo getModRefInfo(const StoreInst *S, const Value *P,
+ LocationSize Size) {
return getModRefInfo(S, MemoryLocation(P, Size));
}
@@ -550,7 +557,8 @@ public:
ModRefInfo getModRefInfo(const FenceInst *S, const MemoryLocation &Loc);
/// getModRefInfo (for fences) - A convenience wrapper.
- ModRefInfo getModRefInfo(const FenceInst *S, const Value *P, uint64_t Size) {
+ ModRefInfo getModRefInfo(const FenceInst *S, const Value *P,
+ LocationSize Size) {
return getModRefInfo(S, MemoryLocation(P, Size));
}
@@ -580,7 +588,8 @@ public:
ModRefInfo getModRefInfo(const VAArgInst *I, const MemoryLocation &Loc);
/// getModRefInfo (for va_args) - A convenience wrapper.
- ModRefInfo getModRefInfo(const VAArgInst *I, const Value *P, uint64_t Size) {
+ ModRefInfo getModRefInfo(const VAArgInst *I, const Value *P,
+ LocationSize Size) {
return getModRefInfo(I, MemoryLocation(P, Size));
}
@@ -590,7 +599,7 @@ public:
/// getModRefInfo (for catchpads) - A convenience wrapper.
ModRefInfo getModRefInfo(const CatchPadInst *I, const Value *P,
- uint64_t Size) {
+ LocationSize Size) {
return getModRefInfo(I, MemoryLocation(P, Size));
}
@@ -600,7 +609,7 @@ public:
/// getModRefInfo (for catchrets) - A convenience wrapper.
ModRefInfo getModRefInfo(const CatchReturnInst *I, const Value *P,
- uint64_t Size) {
+ LocationSize Size) {
return getModRefInfo(I, MemoryLocation(P, Size));
}
@@ -646,7 +655,7 @@ public:
/// A convenience wrapper for constructing the memory location.
ModRefInfo getModRefInfo(const Instruction *I, const Value *P,
- uint64_t Size) {
+ LocationSize Size) {
return getModRefInfo(I, MemoryLocation(P, Size));
}
@@ -659,7 +668,7 @@ public:
/// http://llvm.org/docs/AliasAnalysis.html#ModRefInfo
ModRefInfo getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2);
- /// \brief Return information about whether a particular call site modifies
+ /// Return information about whether a particular call site modifies
/// or reads the specified memory location \p MemLoc before instruction \p I
/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up
/// instruction ordering queries inside the BasicBlock containing \p I.
@@ -669,9 +678,9 @@ public:
const MemoryLocation &MemLoc, DominatorTree *DT,
OrderedBasicBlock *OBB = nullptr);
- /// \brief A convenience wrapper to synthesize a memory location.
+ /// A convenience wrapper to synthesize a memory location.
ModRefInfo callCapturesBefore(const Instruction *I, const Value *P,
- uint64_t Size, DominatorTree *DT,
+ LocationSize Size, DominatorTree *DT,
OrderedBasicBlock *OBB = nullptr) {
return callCapturesBefore(I, MemoryLocation(P, Size), DT, OBB);
}
@@ -687,7 +696,7 @@ public:
/// A convenience wrapper synthesizing a memory location.
bool canBasicBlockModify(const BasicBlock &BB, const Value *P,
- uint64_t Size) {
+ LocationSize Size) {
return canBasicBlockModify(BB, MemoryLocation(P, Size));
}
@@ -702,7 +711,7 @@ public:
/// A convenience wrapper synthesizing a memory location.
bool canInstructionRangeModRef(const Instruction &I1, const Instruction &I2,
- const Value *Ptr, uint64_t Size,
+ const Value *Ptr, LocationSize Size,
const ModRefInfo Mode) {
return canInstructionRangeModRef(I1, I2, MemoryLocation(Ptr, Size), Mode);
}
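
The AliasAnalysis.h hunks above thread LocationSize through all of the convenience wrappers in place of a raw uint64_t. A hedged sketch of a caller after the change (mayOverlap is a hypothetical helper; it assumes this revision's LocationSize still accepts a plain byte count):

// Sketch: query whether two 4-byte accesses may overlap.
#include "llvm/Analysis/AliasAnalysis.h"

bool mayOverlap(llvm::AAResults &AA, const llvm::Value *P1,
                const llvm::Value *P2) {
  // The byte counts convert to LocationSize (and then MemoryLocation)
  // inside the convenience wrapper.
  return AA.alias(P1, 4, P2, 4) != llvm::NoAlias;
}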
diff --git a/contrib/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h b/contrib/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h
index cd2f631a01f4..0941814a56c3 100644
--- a/contrib/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h
+++ b/contrib/llvm/include/llvm/Analysis/AliasAnalysisEvaluator.h
@@ -56,7 +56,7 @@ public:
}
~AAEvaluator();
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
private:
diff --git a/contrib/llvm/include/llvm/Analysis/AliasSetTracker.h b/contrib/llvm/include/llvm/Analysis/AliasSetTracker.h
index 7da3ebabb8a3..c9680ff40d1e 100644
--- a/contrib/llvm/include/llvm/Analysis/AliasSetTracker.h
+++ b/contrib/llvm/include/llvm/Analysis/AliasSetTracker.h
@@ -37,8 +37,8 @@ namespace llvm {
class AliasSetTracker;
class BasicBlock;
class LoadInst;
-class MemSetInst;
-class MemTransferInst;
+class AnyMemSetInst;
+class AnyMemTransferInst;
class raw_ostream;
class StoreInst;
class VAArgInst;
@@ -52,7 +52,7 @@ class AliasSet : public ilist_node<AliasSet> {
PointerRec **PrevInList = nullptr;
PointerRec *NextInList = nullptr;
AliasSet *AS = nullptr;
- uint64_t Size = 0;
+ LocationSize Size = 0;
AAMDNodes AAInfo;
public:
@@ -69,7 +69,7 @@ class AliasSet : public ilist_node<AliasSet> {
return &NextInList;
}
- bool updateSizeAndAAInfo(uint64_t NewSize, const AAMDNodes &NewAAInfo) {
+ bool updateSizeAndAAInfo(LocationSize NewSize, const AAMDNodes &NewAAInfo) {
bool SizeChanged = false;
if (NewSize > Size) {
Size = NewSize;
@@ -91,7 +91,7 @@ class AliasSet : public ilist_node<AliasSet> {
return SizeChanged;
}
- uint64_t getSize() const { return Size; }
+ LocationSize getSize() const { return Size; }
/// Return the AAInfo, or null if there is no information or conflicting
/// information.
@@ -247,7 +247,7 @@ public:
value_type *operator->() const { return &operator*(); }
Value *getPointer() const { return CurNode->getValue(); }
- uint64_t getSize() const { return CurNode->getSize(); }
+ LocationSize getSize() const { return CurNode->getSize(); }
AAMDNodes getAAInfo() const { return CurNode->getAAInfo(); }
iterator& operator++() { // Preincrement
@@ -287,9 +287,8 @@ private:
void removeFromTracker(AliasSetTracker &AST);
- void addPointer(AliasSetTracker &AST, PointerRec &Entry, uint64_t Size,
- const AAMDNodes &AAInfo,
- bool KnownMustAlias = false);
+ void addPointer(AliasSetTracker &AST, PointerRec &Entry, LocationSize Size,
+ const AAMDNodes &AAInfo, bool KnownMustAlias = false);
void addUnknownInst(Instruction *I, AliasAnalysis &AA);
void removeUnknownInst(AliasSetTracker &AST, Instruction *I) {
@@ -309,8 +308,8 @@ private:
public:
/// Return true if the specified pointer "may" (or must) alias one of the
/// members in the set.
- bool aliasesPointer(const Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo,
- AliasAnalysis &AA) const;
+ bool aliasesPointer(const Value *Ptr, LocationSize Size,
+ const AAMDNodes &AAInfo, AliasAnalysis &AA) const;
bool aliasesUnknownInst(const Instruction *Inst, AliasAnalysis &AA) const;
};
@@ -364,12 +363,12 @@ public:
/// These methods return true if inserting the instruction resulted in the
/// addition of a new alias set (i.e., the pointer did not alias anything).
///
- void add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo); // Add a loc.
+ void add(Value *Ptr, LocationSize Size, const AAMDNodes &AAInfo); // Add a loc
void add(LoadInst *LI);
void add(StoreInst *SI);
void add(VAArgInst *VAAI);
- void add(MemSetInst *MSI);
- void add(MemTransferInst *MTI);
+ void add(AnyMemSetInst *MSI);
+ void add(AnyMemTransferInst *MTI);
void add(Instruction *I); // Dispatch to one of the other add methods...
void add(BasicBlock &BB); // Add all instructions in basic block
void add(const AliasSetTracker &AST); // Add alias relations from another AST
@@ -384,12 +383,12 @@ public:
/// argument is non-null, this method sets the value to true if a new alias
/// set is created to contain the pointer (because the pointer didn't alias
/// anything).
- AliasSet &getAliasSetForPointer(Value *P, uint64_t Size,
+ AliasSet &getAliasSetForPointer(Value *P, LocationSize Size,
const AAMDNodes &AAInfo);
/// Return the alias set containing the location specified if one exists,
/// otherwise return null.
- AliasSet *getAliasSetForPointerIfExists(const Value *P, uint64_t Size,
+ AliasSet *getAliasSetForPointerIfExists(const Value *P, LocationSize Size,
const AAMDNodes &AAInfo) {
return mergeAliasSetsForPointer(P, Size, AAInfo);
}
@@ -446,9 +445,9 @@ private:
return *Entry;
}
- AliasSet &addPointer(Value *P, uint64_t Size, const AAMDNodes &AAInfo,
+ AliasSet &addPointer(Value *P, LocationSize Size, const AAMDNodes &AAInfo,
AliasSet::AccessLattice E);
- AliasSet *mergeAliasSetsForPointer(const Value *Ptr, uint64_t Size,
+ AliasSet *mergeAliasSetsForPointer(const Value *Ptr, LocationSize Size,
const AAMDNodes &AAInfo);
/// Merge all alias sets into a single set that is considered to alias any
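
The AliasSetTracker.h hunks above thread LocationSize through the pointer-record plumbing and switch the memory-intrinsic overloads to AnyMemSetInst/AnyMemTransferInst. A minimal sketch of driving the tracker (dumpAliasSets is a hypothetical helper):

// Sketch: populate an AliasSetTracker from a function and print the sets.
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"

void dumpAliasSets(llvm::Function &F, llvm::AliasAnalysis &AA) {
  llvm::AliasSetTracker Tracker(AA);
  for (llvm::BasicBlock &BB : F)
    Tracker.add(BB); // dispatches to the per-instruction add() overloads
  for (const llvm::AliasSet &AS : Tracker)
    AS.print(llvm::errs());
}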
diff --git a/contrib/llvm/include/llvm/Analysis/AssumptionCache.h b/contrib/llvm/include/llvm/Analysis/AssumptionCache.h
index c965e62a0216..46538b1fa86f 100644
--- a/contrib/llvm/include/llvm/Analysis/AssumptionCache.h
+++ b/contrib/llvm/include/llvm/Analysis/AssumptionCache.h
@@ -32,20 +32,20 @@ class Function;
class raw_ostream;
class Value;
-/// \brief A cache of @llvm.assume calls within a function.
+/// A cache of \@llvm.assume calls within a function.
///
/// This cache provides fast lookup of assumptions within a function by caching
/// them and amortizing the cost of scanning for them across all queries. Passes
/// that create new assumptions are required to call registerAssumption() to
-/// register any new @llvm.assume calls that they create. Deletions of
-/// @llvm.assume calls do not require special handling.
+/// register any new \@llvm.assume calls that they create. Deletions of
+/// \@llvm.assume calls do not require special handling.
class AssumptionCache {
- /// \brief The function for which this cache is handling assumptions.
+ /// The function for which this cache is handling assumptions.
///
/// We track this to lazily populate our assumptions.
Function &F;
- /// \brief Vector of weak value handles to calls of the @llvm.assume
+ /// Vector of weak value handles to calls of the \@llvm.assume
/// intrinsic.
SmallVector<WeakTrackingVH, 4> AssumeHandles;
@@ -64,7 +64,7 @@ class AssumptionCache {
friend AffectedValueCallbackVH;
- /// \brief A map of values about which an assumption might be providing
+ /// A map of values about which an assumption might be providing
/// information to the relevant set of assumptions.
using AffectedValuesMap =
DenseMap<AffectedValueCallbackVH, SmallVector<WeakTrackingVH, 1>,
@@ -77,17 +77,17 @@ class AssumptionCache {
/// Copy affected values in the cache for OV to be affected values for NV.
void copyAffectedValuesInCache(Value *OV, Value *NV);
- /// \brief Flag tracking whether we have scanned the function yet.
+ /// Flag tracking whether we have scanned the function yet.
///
/// We want to be as lazy about this as possible, and so we scan the function
/// at the last moment.
bool Scanned = false;
- /// \brief Scan the function for assumptions and add them to the cache.
+ /// Scan the function for assumptions and add them to the cache.
void scanFunction();
public:
- /// \brief Construct an AssumptionCache from a function by scanning all of
+ /// Construct an AssumptionCache from a function by scanning all of
/// its instructions.
AssumptionCache(Function &F) : F(F) {}
@@ -98,17 +98,17 @@ public:
return false;
}
- /// \brief Add an @llvm.assume intrinsic to this function's cache.
+ /// Add an \@llvm.assume intrinsic to this function's cache.
///
/// The call passed in must be an instruction within this function and must
/// not already be in the cache.
void registerAssumption(CallInst *CI);
- /// \brief Update the cache of values being affected by this assumption (i.e.
+ /// Update the cache of values being affected by this assumption (i.e.
/// the values about which this assumption provides information).
void updateAffectedValues(CallInst *CI);
- /// \brief Clear the cache of @llvm.assume intrinsics for a function.
+ /// Clear the cache of \@llvm.assume intrinsics for a function.
///
/// It will be re-scanned the next time it is requested.
void clear() {
@@ -117,7 +117,7 @@ public:
Scanned = false;
}
- /// \brief Access the list of assumption handles currently tracked for this
+ /// Access the list of assumption handles currently tracked for this
/// function.
///
/// Note that these produce weak handles that may be null. The caller must
@@ -131,7 +131,7 @@ public:
return AssumeHandles;
}
- /// \brief Access the list of assumptions which affect this value.
+ /// Access the list of assumptions which affect this value.
MutableArrayRef<WeakTrackingVH> assumptionsFor(const Value *V) {
if (!Scanned)
scanFunction();
@@ -144,7 +144,7 @@ public:
}
};
-/// \brief A function analysis which provides an \c AssumptionCache.
+/// A function analysis which provides an \c AssumptionCache.
///
/// This analysis is intended for use with the new pass manager and will vend
/// assumption caches for a given function.
@@ -161,7 +161,7 @@ public:
}
};
-/// \brief Printer pass for the \c AssumptionAnalysis results.
+/// Printer pass for the \c AssumptionAnalysis results.
class AssumptionPrinterPass : public PassInfoMixin<AssumptionPrinterPass> {
raw_ostream &OS;
@@ -171,7 +171,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief An immutable pass that tracks lazily created \c AssumptionCache
+/// An immutable pass that tracks lazily created \c AssumptionCache
/// objects.
///
/// This is essentially a workaround for the legacy pass manager's weaknesses
@@ -203,7 +203,7 @@ class AssumptionCacheTracker : public ImmutablePass {
FunctionCallsMap AssumptionCaches;
public:
- /// \brief Get the cached assumptions for a function.
+ /// Get the cached assumptions for a function.
///
/// If no assumptions are cached, this will scan the function. Otherwise, the
/// existing cache will be returned.
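
The AssumptionCache.h changes above are doc-comment cleanups, but the contract they describe is worth illustrating: a pass that creates a new @llvm.assume call must register it with the cache. A hedged sketch (assumeNonNull is a hypothetical helper):

// Sketch: emit @llvm.assume(%p != null) and register it, as the class
// comment requires. assumeNonNull is illustrative only.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

void assumeNonNull(llvm::Instruction *InsertPt, llvm::Value *Ptr,
                   llvm::AssumptionCache &AC) {
  llvm::IRBuilder<> B(InsertPt);
  llvm::Value *Cond =
      B.CreateICmpNE(Ptr, llvm::Constant::getNullValue(Ptr->getType()));
  llvm::CallInst *Assume = B.CreateAssumption(Cond);
  AC.registerAssumption(Assume); // keep the cache consistent with the new call
}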
diff --git a/contrib/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/contrib/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
index 42e5e9714071..fa81539a9d6f 100644
--- a/contrib/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
@@ -55,26 +55,27 @@ class BasicAAResult : public AAResultBase<BasicAAResult> {
friend AAResultBase<BasicAAResult>;
const DataLayout &DL;
+ const Function &F;
const TargetLibraryInfo &TLI;
AssumptionCache &AC;
DominatorTree *DT;
LoopInfo *LI;
public:
- BasicAAResult(const DataLayout &DL, const TargetLibraryInfo &TLI,
- AssumptionCache &AC, DominatorTree *DT = nullptr,
- LoopInfo *LI = nullptr)
- : AAResultBase(), DL(DL), TLI(TLI), AC(AC), DT(DT), LI(LI) {}
+ BasicAAResult(const DataLayout &DL, const Function &F,
+ const TargetLibraryInfo &TLI, AssumptionCache &AC,
+ DominatorTree *DT = nullptr, LoopInfo *LI = nullptr)
+ : AAResultBase(), DL(DL), F(F), TLI(TLI), AC(AC), DT(DT), LI(LI) {}
BasicAAResult(const BasicAAResult &Arg)
- : AAResultBase(Arg), DL(Arg.DL), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT),
- LI(Arg.LI) {}
- BasicAAResult(BasicAAResult &&Arg)
- : AAResultBase(std::move(Arg)), DL(Arg.DL), TLI(Arg.TLI), AC(Arg.AC),
+ : AAResultBase(Arg), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI), AC(Arg.AC),
DT(Arg.DT), LI(Arg.LI) {}
+ BasicAAResult(BasicAAResult &&Arg)
+ : AAResultBase(std::move(Arg)), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI),
+ AC(Arg.AC), DT(Arg.DT), LI(Arg.LI) {}
/// Handle invalidation events in the new pass manager.
- bool invalidate(Function &F, const PreservedAnalyses &PA,
+ bool invalidate(Function &Fn, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv);
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
@@ -94,7 +95,7 @@ public:
/// Returns the behavior when calling the given function. For use when the
/// call site is not known.
- FunctionModRefBehavior getModRefBehavior(const Function *F);
+ FunctionModRefBehavior getModRefBehavior(const Function *Fn);
private:
// A linear transformation of a Value; this class represents ZExt(SExt(V,
@@ -171,9 +172,9 @@ private:
static bool isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
- uint64_t ObjectAccessSize);
+ LocationSize ObjectAccessSize);
- /// \brief A Heuristic for aliasGEP that searches for a constant offset
+ /// A Heuristic for aliasGEP that searches for a constant offset
/// between the variables.
///
/// GetLinearExpression has some limitations, as generally zext(%x + 1)
@@ -183,31 +184,33 @@ private:
/// the addition overflows.
bool
constantOffsetHeuristic(const SmallVectorImpl<VariableGEPIndex> &VarIndices,
- uint64_t V1Size, uint64_t V2Size, int64_t BaseOffset,
- AssumptionCache *AC, DominatorTree *DT);
+ LocationSize V1Size, LocationSize V2Size,
+ int64_t BaseOffset, AssumptionCache *AC,
+ DominatorTree *DT);
bool isValueEqualInPotentialCycles(const Value *V1, const Value *V2);
void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
const SmallVectorImpl<VariableGEPIndex> &Src);
- AliasResult aliasGEP(const GEPOperator *V1, uint64_t V1Size,
+ AliasResult aliasGEP(const GEPOperator *V1, LocationSize V1Size,
const AAMDNodes &V1AAInfo, const Value *V2,
- uint64_t V2Size, const AAMDNodes &V2AAInfo,
+ LocationSize V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderlyingV1, const Value *UnderlyingV2);
- AliasResult aliasPHI(const PHINode *PN, uint64_t PNSize,
+ AliasResult aliasPHI(const PHINode *PN, LocationSize PNSize,
const AAMDNodes &PNAAInfo, const Value *V2,
- uint64_t V2Size, const AAMDNodes &V2AAInfo,
+ LocationSize V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderV2);
- AliasResult aliasSelect(const SelectInst *SI, uint64_t SISize,
+ AliasResult aliasSelect(const SelectInst *SI, LocationSize SISize,
const AAMDNodes &SIAAInfo, const Value *V2,
- uint64_t V2Size, const AAMDNodes &V2AAInfo,
+ LocationSize V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderV2);
- AliasResult aliasCheck(const Value *V1, uint64_t V1Size, AAMDNodes V1AATag,
- const Value *V2, uint64_t V2Size, AAMDNodes V2AATag,
+ AliasResult aliasCheck(const Value *V1, LocationSize V1Size,
+ AAMDNodes V1AATag, const Value *V2,
+ LocationSize V2Size, AAMDNodes V2AATag,
const Value *O1 = nullptr, const Value *O2 = nullptr);
};
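
The BasicAliasAnalysis.h hunk above adds the analyzed Function to BasicAAResult's constructor and to its copy/move constructors. A hedged construction sketch (makeBasicAA is hypothetical; in practice the legacy or new pass manager builds this result for you):

// Sketch: directly constructing a BasicAAResult with the new signature.
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

llvm::BasicAAResult makeBasicAA(llvm::Function &F,
                                const llvm::TargetLibraryInfo &TLI,
                                llvm::AssumptionCache &AC) {
  const llvm::DataLayout &DL = F.getParent()->getDataLayout();
  return llvm::BasicAAResult(DL, F, TLI, AC); // DT and LI default to nullptr
}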
diff --git a/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfo.h b/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfo.h
index 89370cbeeea1..ca12db6208b8 100644
--- a/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfo.h
@@ -65,17 +65,17 @@ public:
/// floating points.
BlockFrequency getBlockFreq(const BasicBlock *BB) const;
- /// \brief Returns the estimated profile count of \p BB.
+ /// Returns the estimated profile count of \p BB.
/// This computes the relative block frequency of \p BB and multiplies it by
/// the enclosing function's count (if available) and returns the value.
Optional<uint64_t> getBlockProfileCount(const BasicBlock *BB) const;
- /// \brief Returns the estimated profile count of \p Freq.
+ /// Returns the estimated profile count of \p Freq.
/// This uses the frequency \p Freq and multiplies it by
/// the enclosing function's count (if available) and returns the value.
Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
- /// \brief Returns true if \p BB is an irreducible loop header
+ /// Returns true if \p BB is an irreducible loop header
/// block. Otherwise false.
bool isIrrLoopHeader(const BasicBlock *BB);
@@ -105,7 +105,7 @@ public:
void print(raw_ostream &OS) const;
};
-/// \brief Analysis pass which computes \c BlockFrequencyInfo.
+/// Analysis pass which computes \c BlockFrequencyInfo.
class BlockFrequencyAnalysis
: public AnalysisInfoMixin<BlockFrequencyAnalysis> {
friend AnalysisInfoMixin<BlockFrequencyAnalysis>;
@@ -113,14 +113,14 @@ class BlockFrequencyAnalysis
static AnalysisKey Key;
public:
- /// \brief Provide the result type for this analysis pass.
+ /// Provide the result type for this analysis pass.
using Result = BlockFrequencyInfo;
- /// \brief Run the analysis pass over a function and produce BFI.
+ /// Run the analysis pass over a function and produce BFI.
Result run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Printer pass for the \c BlockFrequencyInfo results.
+/// Printer pass for the \c BlockFrequencyInfo results.
class BlockFrequencyPrinterPass
: public PassInfoMixin<BlockFrequencyPrinterPass> {
raw_ostream &OS;
@@ -131,7 +131,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Legacy analysis pass which computes \c BlockFrequencyInfo.
+/// Legacy analysis pass which computes \c BlockFrequencyInfo.
class BlockFrequencyInfoWrapperPass : public FunctionPass {
BlockFrequencyInfo BFI;
diff --git a/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 40c40b80bc89..25b2efd33c98 100644
--- a/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/contrib/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -66,7 +66,7 @@ struct IrreducibleGraph;
// This is part of a workaround for a GCC 4.7 crash on lambdas.
template <class BT> struct BlockEdgesAdder;
-/// \brief Mass of a block.
+/// Mass of a block.
///
/// This class implements a sort of fixed-point fraction always between 0.0 and
/// 1.0. getMass() == std::numeric_limits<uint64_t>::max() indicates a value of
@@ -100,7 +100,7 @@ public:
bool operator!() const { return isEmpty(); }
- /// \brief Add another mass.
+ /// Add another mass.
///
/// Adds another mass, saturating at \a isFull() rather than overflowing.
BlockMass &operator+=(BlockMass X) {
@@ -109,7 +109,7 @@ public:
return *this;
}
- /// \brief Subtract another mass.
+ /// Subtract another mass.
///
/// Subtracts another mass, saturating at \a isEmpty() rather than
/// underflowing.
@@ -131,7 +131,7 @@ public:
bool operator<(BlockMass X) const { return Mass < X.Mass; }
bool operator>(BlockMass X) const { return Mass > X.Mass; }
- /// \brief Convert to scaled number.
+ /// Convert to scaled number.
///
/// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty()
/// gives slightly above 0.0.
@@ -164,7 +164,7 @@ template <> struct isPodLike<bfi_detail::BlockMass> {
static const bool value = true;
};
-/// \brief Base class for BlockFrequencyInfoImpl
+/// Base class for BlockFrequencyInfoImpl
///
/// BlockFrequencyInfoImplBase has supporting data structures and some
/// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on
@@ -177,7 +177,7 @@ public:
using Scaled64 = ScaledNumber<uint64_t>;
using BlockMass = bfi_detail::BlockMass;
- /// \brief Representative of a block.
+ /// Representative of a block.
///
/// This is a simple wrapper around an index into the reverse-post-order
/// traversal of the blocks.
@@ -206,13 +206,13 @@ public:
}
};
- /// \brief Stats about a block itself.
+ /// Stats about a block itself.
struct FrequencyData {
Scaled64 Scaled;
uint64_t Integer;
};
- /// \brief Data about a loop.
+ /// Data about a loop.
///
/// Contains the data necessary to represent a loop as a pseudo-node once it's
/// packaged.
@@ -270,7 +270,7 @@ public:
}
};
- /// \brief Index of loop information.
+ /// Index of loop information.
struct WorkingData {
BlockNode Node; ///< This node.
LoopData *Loop = nullptr; ///< The loop this block is inside.
@@ -293,7 +293,7 @@ public:
return Loop->Parent->Parent;
}
- /// \brief Resolve a node to its representative.
+ /// Resolve a node to its representative.
///
/// Get the node currently representing Node, which could be a containing
/// loop.
@@ -320,7 +320,7 @@ public:
return L;
}
- /// \brief Get the appropriate mass for a node.
+ /// Get the appropriate mass for a node.
///
/// Get appropriate mass for Node. If Node is a loop-header (whose loop
/// has been packaged), returns the mass of its pseudo-node. If it's a
@@ -333,19 +333,19 @@ public:
return Loop->Parent->Mass;
}
- /// \brief Has ContainingLoop been packaged up?
+ /// Has ContainingLoop been packaged up?
bool isPackaged() const { return getResolvedNode() != Node; }
- /// \brief Has Loop been packaged up?
+ /// Has Loop been packaged up?
bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
- /// \brief Has Loop been packaged up twice?
+ /// Has Loop been packaged up twice?
bool isADoublePackage() const {
return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
}
};
- /// \brief Unscaled probability weight.
+ /// Unscaled probability weight.
///
/// Probability weight for an edge in the graph (including the
/// successor/target node).
@@ -369,7 +369,7 @@ public:
: Type(Type), TargetNode(TargetNode), Amount(Amount) {}
};
- /// \brief Distribution of unscaled probability weight.
+ /// Distribution of unscaled probability weight.
///
/// Distribution of unscaled probability weight to a set of successors.
///
@@ -398,7 +398,7 @@ public:
add(Node, Amount, Weight::Backedge);
}
- /// \brief Normalize the distribution.
+ /// Normalize the distribution.
///
/// Combines multiple edges to the same \a Weight::TargetNode and scales
/// down so that \a Total fits into 32-bits.
@@ -413,26 +413,26 @@ public:
void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
};
- /// \brief Data about each block. This is used downstream.
+ /// Data about each block. This is used downstream.
std::vector<FrequencyData> Freqs;
- /// \brief Whether each block is an irreducible loop header.
+ /// Whether each block is an irreducible loop header.
/// This is used downstream.
SparseBitVector<> IsIrrLoopHeader;
- /// \brief Loop data: see initializeLoops().
+ /// Loop data: see initializeLoops().
std::vector<WorkingData> Working;
- /// \brief Indexed information about loops.
+ /// Indexed information about loops.
std::list<LoopData> Loops;
- /// \brief Virtual destructor.
+ /// Virtual destructor.
///
/// Need a virtual destructor to mask the compiler warning about
/// getBlockName().
virtual ~BlockFrequencyInfoImplBase() = default;
- /// \brief Add all edges out of a packaged loop to the distribution.
+ /// Add all edges out of a packaged loop to the distribution.
///
/// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
/// successor edge.
@@ -441,7 +441,7 @@ public:
bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
Distribution &Dist);
- /// \brief Add an edge to the distribution.
+ /// Add an edge to the distribution.
///
/// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
/// edge is local/exit/backedge is in the context of LoopHead. Otherwise,
@@ -457,7 +457,7 @@ public:
return *Working[Head.Index].Loop;
}
- /// \brief Analyze irreducible SCCs.
+ /// Analyze irreducible SCCs.
///
/// Separate irreducible SCCs from \c G, which is an explicit graph of \c
/// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
@@ -468,7 +468,7 @@ public:
analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
std::list<LoopData>::iterator Insert);
- /// \brief Update a loop after packaging irreducible SCCs inside of it.
+ /// Update a loop after packaging irreducible SCCs inside of it.
///
/// Update \c OuterLoop. Before finding irreducible control flow, it was
/// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
@@ -476,7 +476,7 @@ public:
/// up need to be removed from \a OuterLoop::Nodes.
void updateLoopWithIrreducible(LoopData &OuterLoop);
- /// \brief Distribute mass according to a distribution.
+ /// Distribute mass according to a distribution.
///
/// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
/// backedges and exits are stored in its entry in Loops.
@@ -485,7 +485,7 @@ public:
void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
Distribution &Dist);
- /// \brief Compute the loop scale for a loop.
+ /// Compute the loop scale for a loop.
void computeLoopScale(LoopData &Loop);
/// Adjust the mass of all headers in an irreducible loop.
@@ -500,19 +500,19 @@ public:
void distributeIrrLoopHeaderMass(Distribution &Dist);
- /// \brief Package up a loop.
+ /// Package up a loop.
void packageLoop(LoopData &Loop);
- /// \brief Unwrap loops.
+ /// Unwrap loops.
void unwrapLoops();
- /// \brief Finalize frequency metrics.
+ /// Finalize frequency metrics.
///
/// Calculates final frequencies and cleans up no-longer-needed data
/// structures.
void finalizeMetrics();
- /// \brief Clear all memory.
+ /// Clear all memory.
void clear();
virtual std::string getBlockName(const BlockNode &Node) const;
@@ -560,7 +560,7 @@ template <> struct TypeMap<MachineBasicBlock> {
using LoopInfoT = MachineLoopInfo;
};
-/// \brief Get the name of a MachineBasicBlock.
+/// Get the name of a MachineBasicBlock.
///
/// Get the name of a MachineBasicBlock. It's templated so that including from
/// CodeGen is unnecessary (that would be a layering issue).
@@ -574,13 +574,13 @@ template <class BlockT> std::string getBlockName(const BlockT *BB) {
return (MachineName + "[" + BB->getName() + "]").str();
return MachineName.str();
}
-/// \brief Get the name of a BasicBlock.
+/// Get the name of a BasicBlock.
template <> inline std::string getBlockName(const BasicBlock *BB) {
assert(BB && "Unexpected nullptr");
return BB->getName().str();
}
-/// \brief Graph of irreducible control flow.
+/// Graph of irreducible control flow.
///
/// This graph is used for determining the SCCs in a loop (or top-level
/// function) that has irreducible control flow.
@@ -619,7 +619,7 @@ struct IrreducibleGraph {
std::vector<IrrNode> Nodes;
SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
- /// \brief Construct an explicit graph containing irreducible control flow.
+ /// Construct an explicit graph containing irreducible control flow.
///
/// Construct an explicit graph of the control flow in \c OuterLoop (or the
/// top-level function, if \c OuterLoop is \c nullptr). Uses \c
@@ -687,7 +687,7 @@ void IrreducibleGraph::addEdges(const BlockNode &Node,
} // end namespace bfi_detail
-/// \brief Shared implementation for block frequency analysis.
+/// Shared implementation for block frequency analysis.
///
/// This is a shared implementation of BlockFrequencyInfo and
/// MachineBlockFrequencyInfo, and calculates the relative frequencies of
@@ -878,12 +878,12 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
return RPOT[Node.Index];
}
- /// \brief Run (and save) a post-order traversal.
+ /// Run (and save) a post-order traversal.
///
/// Saves a reverse post-order traversal of all the nodes in \a F.
void initializeRPOT();
- /// \brief Initialize loop data.
+ /// Initialize loop data.
///
/// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from
/// each block to the deepest loop it's in, but we need the inverse. For each
@@ -892,7 +892,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
/// the loop that are not in sub-loops.
void initializeLoops();
- /// \brief Propagate to a block's successors.
+ /// Propagate to a block's successors.
///
/// In the context of distributing mass through \c OuterLoop, divide the mass
/// currently assigned to \c Node between its successors.
@@ -900,7 +900,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
/// \return \c true unless there's an irreducible backedge.
bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
- /// \brief Compute mass in a particular loop.
+ /// Compute mass in a particular loop.
///
/// Assign mass to \c Loop's header, and then for each block in \c Loop in
/// reverse post-order, distribute mass to its successors. Only visits nodes
@@ -910,7 +910,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
/// \return \c true unless there's an irreducible backedge.
bool computeMassInLoop(LoopData &Loop);
- /// \brief Try to compute mass in the top-level function.
+ /// Try to compute mass in the top-level function.
///
/// Assign mass to the entry block, and then for each block in reverse
/// post-order, distribute mass to its successors. Skips nodes that have
@@ -920,7 +920,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
/// \return \c true unless there's an irreducible backedge.
bool tryToComputeMassInFunction();
- /// \brief Compute mass in (and package up) irreducible SCCs.
+ /// Compute mass in (and package up) irreducible SCCs.
///
/// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
/// of \c Insert), and call \a computeMassInLoop() on each of them.
@@ -935,7 +935,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
void computeIrreducibleMass(LoopData *OuterLoop,
std::list<LoopData>::iterator Insert);
- /// \brief Compute mass in all loops.
+ /// Compute mass in all loops.
///
/// For each loop bottom-up, call \a computeMassInLoop().
///
@@ -946,7 +946,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
/// \post \a computeMassInLoop() has returned \c true for every loop.
void computeMassInLoops();
- /// \brief Compute mass in the top-level function.
+ /// Compute mass in the top-level function.
///
/// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
/// compute mass in the top-level function.
@@ -994,7 +994,7 @@ public:
const BranchProbabilityInfoT &getBPI() const { return *BPI; }
- /// \brief Print the frequencies for the current function.
+ /// Print the frequencies for the current function.
///
/// Prints the frequencies for the blocks in the current function.
///
@@ -1030,8 +1030,9 @@ void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
Nodes.clear();
// Initialize.
- DEBUG(dbgs() << "\nblock-frequency: " << F.getName() << "\n================="
- << std::string(F.getName().size(), '=') << "\n");
+ LLVM_DEBUG(dbgs() << "\nblock-frequency: " << F.getName()
+ << "\n================="
+ << std::string(F.getName().size(), '=') << "\n");
initializeRPOT();
initializeLoops();
@@ -1067,10 +1068,11 @@ template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
"More nodes in function than Block Frequency Info supports");
- DEBUG(dbgs() << "reverse-post-order-traversal\n");
+ LLVM_DEBUG(dbgs() << "reverse-post-order-traversal\n");
for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
BlockNode Node = getNode(I);
- DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
+ LLVM_DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node)
+ << "\n");
Nodes[*I] = Node;
}
@@ -1081,7 +1083,7 @@ template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
}
template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
- DEBUG(dbgs() << "loop-detection\n");
+ LLVM_DEBUG(dbgs() << "loop-detection\n");
if (LI->empty())
return;
@@ -1099,7 +1101,7 @@ template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
Loops.emplace_back(Parent, Header);
Working[Header.Index].Loop = &Loops.back();
- DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
+ LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
for (const LoopT *L : *Loop)
Q.emplace_back(L, &Loops.back());
@@ -1128,8 +1130,8 @@ template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
Working[Index].Loop = HeaderData.Loop;
HeaderData.Loop->Nodes.push_back(Index);
- DEBUG(dbgs() << " - loop = " << getBlockName(Header)
- << ": member = " << getBlockName(Index) << "\n");
+ LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header)
+ << ": member = " << getBlockName(Index) << "\n");
}
}
@@ -1150,10 +1152,10 @@ template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
template <class BT>
bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
// Compute mass in loop.
- DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
+ LLVM_DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
if (Loop.isIrreducible()) {
- DEBUG(dbgs() << "isIrreducible = true\n");
+ LLVM_DEBUG(dbgs() << "isIrreducible = true\n");
Distribution Dist;
unsigned NumHeadersWithWeight = 0;
Optional<uint64_t> MinHeaderWeight;
@@ -1165,14 +1167,14 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
IsIrrLoopHeader.set(Loop.Nodes[H].Index);
Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
if (!HeaderWeight) {
- DEBUG(dbgs() << "Missing irr loop header metadata on "
- << getBlockName(HeaderNode) << "\n");
+ LLVM_DEBUG(dbgs() << "Missing irr loop header metadata on "
+ << getBlockName(HeaderNode) << "\n");
HeadersWithoutWeight.insert(H);
continue;
}
- DEBUG(dbgs() << getBlockName(HeaderNode)
- << " has irr loop header weight " << HeaderWeight.getValue()
- << "\n");
+ LLVM_DEBUG(dbgs() << getBlockName(HeaderNode)
+ << " has irr loop header weight "
+ << HeaderWeight.getValue() << "\n");
NumHeadersWithWeight++;
uint64_t HeaderWeightValue = HeaderWeight.getValue();
if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight)
@@ -1194,8 +1196,8 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() &&
"Shouldn't have a weight metadata");
uint64_t MinWeight = MinHeaderWeight.getValue();
- DEBUG(dbgs() << "Giving weight " << MinWeight
- << " to " << getBlockName(HeaderNode) << "\n");
+ LLVM_DEBUG(dbgs() << "Giving weight " << MinWeight << " to "
+ << getBlockName(HeaderNode) << "\n");
if (MinWeight)
Dist.addLocal(HeaderNode, MinWeight);
}
@@ -1224,7 +1226,7 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
template <class BT>
bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
// Compute mass in function.
- DEBUG(dbgs() << "compute-mass-in-function\n");
+ LLVM_DEBUG(dbgs() << "compute-mass-in-function\n");
assert(!Working.empty() && "no blocks in function");
assert(!Working[0].isLoopHeader() && "entry block is a loop header");
@@ -1276,9 +1278,10 @@ template <class BT> struct BlockEdgesAdder {
template <class BT>
void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
- DEBUG(dbgs() << "analyze-irreducible-in-";
- if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n";
- else dbgs() << "function\n");
+ LLVM_DEBUG(dbgs() << "analyze-irreducible-in-";
+ if (OuterLoop) dbgs()
+ << "loop: " << getLoopName(*OuterLoop) << "\n";
+ else dbgs() << "function\n");
using namespace bfi_detail;
@@ -1304,7 +1307,7 @@ template <class BT>
bool
BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
const BlockNode &Node) {
- DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
+ LLVM_DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
// Calculate probability for successors.
Distribution Dist;
if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
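
Most of the remaining BlockFrequencyInfoImpl.h churn is the mechanical DEBUG -> LLVM_DEBUG rename. For reference, a minimal sketch of the new macro (the pass name and function are illustrative):

// Sketch: LLVM_DEBUG only emits output in assertion-enabled builds and when
// -debug (or -debug-only=my-pass) is passed; otherwise it compiles away.
#define DEBUG_TYPE "my-pass" // hypothetical debug type
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

static void reportBlockCount(unsigned NumBlocks) {
  LLVM_DEBUG(llvm::dbgs() << "visited " << NumBlocks << " blocks\n");
}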
diff --git a/contrib/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/contrib/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
index 417b64978811..45277db46090 100644
--- a/contrib/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -38,7 +38,7 @@ class raw_ostream;
class TargetLibraryInfo;
class Value;
-/// \brief Analysis providing branch probability information.
+/// Analysis providing branch probability information.
///
/// This is a function analysis which provides information on the relative
/// probabilities of each "edge" in the function's CFG where such an edge is
@@ -79,7 +79,7 @@ public:
void print(raw_ostream &OS) const;
- /// \brief Get an edge's probability, relative to other out-edges of the Src.
+ /// Get an edge's probability, relative to other out-edges of the Src.
///
/// This routine provides access to the fractional probability between zero
/// (0%) and one (100%) of this edge executing, relative to other edges
@@ -88,7 +88,7 @@ public:
BranchProbability getEdgeProbability(const BasicBlock *Src,
unsigned IndexInSuccessors) const;
- /// \brief Get the probability of going from Src to Dst.
+ /// Get the probability of going from Src to Dst.
///
/// It returns the sum of all probabilities for edges from Src to Dst.
BranchProbability getEdgeProbability(const BasicBlock *Src,
@@ -97,19 +97,19 @@ public:
BranchProbability getEdgeProbability(const BasicBlock *Src,
succ_const_iterator Dst) const;
- /// \brief Test if an edge is hot relative to other out-edges of the Src.
+ /// Test if an edge is hot relative to other out-edges of the Src.
///
/// Check whether this edge out of the source block is 'hot'. We define hot
/// as having a relative probability >= 80%.
bool isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const;
- /// \brief Retrieve the hot successor of a block if one exists.
+ /// Retrieve the hot successor of a block if one exists.
///
/// Given a basic block, look through its successors and if one exists for
/// which \see isEdgeHot would return true, return that successor block.
const BasicBlock *getHotSucc(const BasicBlock *BB) const;
- /// \brief Print an edge's probability.
+ /// Print an edge's probability.
///
/// Retrieves an edge's probability similarly to \see getEdgeProbability, but
/// then prints that probability to the provided stream. That stream is then
@@ -117,7 +117,7 @@ public:
raw_ostream &printEdgeProbability(raw_ostream &OS, const BasicBlock *Src,
const BasicBlock *Dst) const;
- /// \brief Set the raw edge probability for the given edge.
+ /// Set the raw edge probability for the given edge.
///
/// This allows a pass to explicitly set the edge probability for an edge. It
/// can be used when updating the CFG to update and preserve the branch
@@ -179,13 +179,13 @@ private:
DenseMap<Edge, BranchProbability> Probs;
- /// \brief Track the last function we run over for printing.
+ /// Track the last function we run over for printing.
const Function *LastF;
- /// \brief Track the set of blocks directly succeeded by a returning block.
+ /// Track the set of blocks directly succeeded by a returning block.
SmallPtrSet<const BasicBlock *, 16> PostDominatedByUnreachable;
- /// \brief Track the set of blocks that always lead to a cold call.
+ /// Track the set of blocks that always lead to a cold call.
SmallPtrSet<const BasicBlock *, 16> PostDominatedByColdCall;
void updatePostDominatedByUnreachable(const BasicBlock *BB);
@@ -201,7 +201,7 @@ private:
bool calcInvokeHeuristics(const BasicBlock *BB);
};
-/// \brief Analysis pass which computes \c BranchProbabilityInfo.
+/// Analysis pass which computes \c BranchProbabilityInfo.
class BranchProbabilityAnalysis
: public AnalysisInfoMixin<BranchProbabilityAnalysis> {
friend AnalysisInfoMixin<BranchProbabilityAnalysis>;
@@ -209,14 +209,14 @@ class BranchProbabilityAnalysis
static AnalysisKey Key;
public:
- /// \brief Provide the result type for this analysis pass.
+ /// Provide the result type for this analysis pass.
using Result = BranchProbabilityInfo;
- /// \brief Run the analysis pass over a function and produce BPI.
+ /// Run the analysis pass over a function and produce BPI.
BranchProbabilityInfo run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Printer pass for the \c BranchProbabilityAnalysis results.
+/// Printer pass for the \c BranchProbabilityAnalysis results.
class BranchProbabilityPrinterPass
: public PassInfoMixin<BranchProbabilityPrinterPass> {
raw_ostream &OS;
@@ -227,7 +227,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Legacy analysis pass which computes \c BranchProbabilityInfo.
+/// Legacy analysis pass which computes \c BranchProbabilityInfo.
class BranchProbabilityInfoWrapperPass : public FunctionPass {
BranchProbabilityInfo BPI;
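
The BranchProbabilityInfo.h hunks above are doc-comment cleanups; the API they describe can be exercised as in this sketch (printProbs is a hypothetical helper):

// Sketch: print the probability of every out-edge of a block.
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/Support/raw_ostream.h"

void printProbs(const llvm::BasicBlock *BB,
                const llvm::BranchProbabilityInfo &BPI) {
  for (const llvm::BasicBlock *Succ : llvm::successors(BB))
    BPI.printEdgeProbability(llvm::errs(), BB, Succ);
}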
diff --git a/contrib/llvm/include/llvm/Analysis/CFG.h b/contrib/llvm/include/llvm/Analysis/CFG.h
index eab64176f0d7..cccdd1637411 100644
--- a/contrib/llvm/include/llvm/Analysis/CFG.h
+++ b/contrib/llvm/include/llvm/Analysis/CFG.h
@@ -49,7 +49,7 @@ unsigned GetSuccessorNumber(const BasicBlock *BB, const BasicBlock *Succ);
bool isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
bool AllowIdenticalEdges = false);
-/// \brief Determine whether instruction 'To' is reachable from 'From',
+/// Determine whether instruction 'To' is reachable from 'From',
/// returning true if uncertain.
///
/// Determine whether there is a path from From to To within a single function.
@@ -68,7 +68,7 @@ bool isPotentiallyReachable(const Instruction *From, const Instruction *To,
const DominatorTree *DT = nullptr,
const LoopInfo *LI = nullptr);
-/// \brief Determine whether block 'To' is reachable from 'From', returning
+/// Determine whether block 'To' is reachable from 'From', returning
/// true if uncertain.
///
/// Determine whether there is a path from From to To within a single function.
@@ -78,7 +78,7 @@ bool isPotentiallyReachable(const BasicBlock *From, const BasicBlock *To,
const DominatorTree *DT = nullptr,
const LoopInfo *LI = nullptr);
-/// \brief Determine whether there is at least one path from a block in
+/// Determine whether there is at least one path from a block in
/// 'Worklist' to 'StopBB', returning true if uncertain.
///
/// Determine whether there is a path from at least one block in Worklist to
@@ -89,6 +89,73 @@ bool isPotentiallyReachableFromMany(SmallVectorImpl<BasicBlock *> &Worklist,
BasicBlock *StopBB,
const DominatorTree *DT = nullptr,
const LoopInfo *LI = nullptr);
+
+/// Return true if the control flow in \p RPOTraversal is irreducible.
+///
+/// This is a generic implementation to detect CFG irreducibility based on loop
+/// info analysis. It can be used for any kind of CFG (Loop, MachineLoop,
+/// Function, MachineFunction, etc.) by providing an RPO traversal (\p
+/// RPOTraversal) and the loop info analysis (\p LI) of the CFG. This utility
+/// function is only recommended when loop info analysis is available. If loop
+/// info analysis isn't available, please, don't compute it explicitly for this
+/// purpose. There are more efficient ways to detect CFG irreducibility that
+/// don't require recomputing loop info analysis (e.g., T1/T2 or Tarjan's
+/// algorithm).
+///
+/// Requirements:
+/// 1) GraphTraits must be implemented for NodeT type. It is used to access
+/// NodeT successors.
+// 2) \p RPOTraversal must be a valid reverse post-order traversal of the
+/// target CFG with begin()/end() iterator interfaces.
+/// 3) \p LI must be a valid LoopInfoBase that contains up-to-date loop
+/// analysis information of the CFG.
+///
+/// This algorithm uses the information about reducible loop back-edges already
+/// computed in \p LI. When a back-edge is found during the RPO traversal, the
+/// algorithm checks whether the back-edge is one of the reducible back-edges in
+/// loop info. If it isn't, the CFG is irreducible. For example, for the CFG
+/// below (canonical irreducible graph) loop info won't contain any loop, so the
+/// algorithm will return that the CFG is irreducible when checking the B <-
+/// -> C back-edge.
+///
+/// (A->B, A->C, B->C, C->B, C->D)
+/// A
+/// / \
+/// B<- ->C
+/// |
+/// D
+///
+template <class NodeT, class RPOTraversalT, class LoopInfoT,
+ class GT = GraphTraits<NodeT>>
+bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI) {
+ /// Check whether the edge (\p Src, \p Dst) is a reducible loop backedge
+ /// according to LI. I.e., check if there exists a loop that contains Src and
+ /// where Dst is the loop header.
+ auto isProperBackedge = [&](NodeT Src, NodeT Dst) {
+ for (const auto *Lp = LI.getLoopFor(Src); Lp; Lp = Lp->getParentLoop()) {
+ if (Lp->getHeader() == Dst)
+ return true;
+ }
+ return false;
+ };
+
+ SmallPtrSet<NodeT, 32> Visited;
+ for (NodeT Node : RPOTraversal) {
+ Visited.insert(Node);
+ for (NodeT Succ : make_range(GT::child_begin(Node), GT::child_end(Node))) {
+ // Succ hasn't been visited yet
+ if (!Visited.count(Succ))
+ continue;
+ // We already visited Succ, thus Node->Succ must be a backedge. Check that
+ // the head matches what we have in the loop information. Otherwise, we
+ // have an irreducible graph.
+ if (!isProperBackedge(Node, Succ))
+ return true;
+ }
+ }
+
+ return false;
+}
} // End llvm namespace
#endif
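
A usage sketch of the new containsIrreducibleCFG helper on an IR Function (hasIrreducibleCFG and the explicit template argument are illustrative, not part of the patch):

// Sketch: detect irreducible control flow in a Function using an RPO
// traversal and existing LoopInfo.
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"

bool hasIrreducibleCFG(llvm::Function &F, const llvm::LoopInfo &LI) {
  llvm::ReversePostOrderTraversal<llvm::Function *> RPOT(&F);
  // NodeT is passed explicitly; the remaining template parameters are deduced.
  return llvm::containsIrreducibleCFG<const llvm::BasicBlock *>(RPOT, LI);
}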
diff --git a/contrib/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h b/contrib/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h
index 6239d5309581..8ae72553ab94 100644
--- a/contrib/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h
@@ -56,7 +56,7 @@ public:
/// Evict the given function from cache
void evict(const Function *Fn);
- /// \brief Get the alias summary for the given function
+ /// Get the alias summary for the given function
/// Return nullptr if the summary is not found or not available
const cflaa::AliasSummary *getAliasSummary(const Function &);
@@ -64,19 +64,19 @@ public:
AliasResult alias(const MemoryLocation &, const MemoryLocation &);
private:
- /// \brief Ensures that the given function is available in the cache.
+ /// Ensures that the given function is available in the cache.
/// Returns the appropriate entry from the cache.
const Optional<FunctionInfo> &ensureCached(const Function &);
- /// \brief Inserts the given Function into the cache.
+ /// Inserts the given Function into the cache.
void scan(const Function &);
- /// \brief Build summary for a given function
+ /// Build summary for a given function
FunctionInfo buildInfoFrom(const Function &);
const TargetLibraryInfo &TLI;
- /// \brief Cached mapping of Functions to their StratifiedSets.
+ /// Cached mapping of Functions to their StratifiedSets.
/// If a function's sets are currently being built, it is marked
/// in the cache as an Optional without a value. This way, if we
/// have any kind of recursion, it is discernable from a function
diff --git a/contrib/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h b/contrib/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h
index ee9e29046af8..09e366f11e18 100644
--- a/contrib/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/CFLSteensAliasAnalysis.h
@@ -55,16 +55,16 @@ public:
return false;
}
- /// \brief Inserts the given Function into the cache.
+ /// Inserts the given Function into the cache.
void scan(Function *Fn);
void evict(Function *Fn);
- /// \brief Ensures that the given function is available in the cache.
+ /// Ensures that the given function is available in the cache.
/// Returns the appropriate entry from the cache.
const Optional<FunctionInfo> &ensureCached(Function *Fn);
- /// \brief Get the alias summary for the given function
+ /// Get the alias summary for the given function
/// Return nullptr if the summary is not found or not available
const cflaa::AliasSummary *getAliasSummary(Function &Fn);
@@ -92,7 +92,7 @@ public:
private:
const TargetLibraryInfo &TLI;
- /// \brief Cached mapping of Functions to their StratifiedSets.
+ /// Cached mapping of Functions to their StratifiedSets.
/// If a function's sets are currently being built, it is marked
/// in the cache as an Optional without a value. This way, if we
/// have any kind of recursion, it is discernable from a function
diff --git a/contrib/llvm/include/llvm/Analysis/CGSCCPassManager.h b/contrib/llvm/include/llvm/Analysis/CGSCCPassManager.h
index 8123cbad22ff..5e83ea2a6e2b 100644
--- a/contrib/llvm/include/llvm/Analysis/CGSCCPassManager.h
+++ b/contrib/llvm/include/llvm/Analysis/CGSCCPassManager.h
@@ -119,7 +119,7 @@ extern template class AllAnalysesOn<LazyCallGraph::SCC>;
extern template class AnalysisManager<LazyCallGraph::SCC, LazyCallGraph &>;
-/// \brief The CGSCC analysis manager.
+/// The CGSCC analysis manager.
///
/// See the documentation for the AnalysisManager template for detail
/// documentation. This type serves as a convenient way to refer to this
@@ -140,7 +140,7 @@ PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,
extern template class PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager,
LazyCallGraph &, CGSCCUpdateResult &>;
-/// \brief The CGSCC pass manager.
+/// The CGSCC pass manager.
///
/// See the documentation for the PassManager template for details. It runs
/// a sequence of SCC passes over each SCC that the manager is run over. This
@@ -175,10 +175,10 @@ public:
explicit Result(CGSCCAnalysisManager &InnerAM, LazyCallGraph &G)
: InnerAM(&InnerAM), G(&G) {}
- /// \brief Accessor for the analysis manager.
+ /// Accessor for the analysis manager.
CGSCCAnalysisManager &getManager() { return *InnerAM; }
- /// \brief Handler for invalidation of the Module.
+ /// Handler for invalidation of the Module.
///
/// If the proxy analysis itself is preserved, then we assume that the set of
/// SCCs in the Module hasn't changed. Thus any pointers to SCCs in the
@@ -302,7 +302,7 @@ struct CGSCCUpdateResult {
&InlinedInternalEdges;
};
-/// \brief The core module pass which does a post-order walk of the SCCs and
+/// The core module pass which does a post-order walk of the SCCs and
/// runs a CGSCC pass over each one.
///
/// Designed to allow composition of a CGSCCPass(Manager) and
@@ -338,7 +338,7 @@ public:
return *this;
}
- /// \brief Runs the CGSCC pass across every SCC in the module.
+ /// Runs the CGSCC pass across every SCC in the module.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
// Setup the CGSCC analysis manager from its proxy.
CGSCCAnalysisManager &CGAM =
@@ -387,17 +387,17 @@ public:
do {
LazyCallGraph::RefSCC *RC = RCWorklist.pop_back_val();
if (InvalidRefSCCSet.count(RC)) {
- DEBUG(dbgs() << "Skipping an invalid RefSCC...\n");
+ LLVM_DEBUG(dbgs() << "Skipping an invalid RefSCC...\n");
continue;
}
assert(CWorklist.empty() &&
"Should always start with an empty SCC worklist");
- DEBUG(dbgs() << "Running an SCC pass across the RefSCC: " << *RC
- << "\n");
+ LLVM_DEBUG(dbgs() << "Running an SCC pass across the RefSCC: " << *RC
+ << "\n");
- // Push the initial SCCs in reverse post-order as we'll pop off the the
+ // Push the initial SCCs in reverse post-order as we'll pop off the
// back and so see this in post-order.
for (LazyCallGraph::SCC &C : llvm::reverse(*RC))
CWorklist.insert(&C);
@@ -409,12 +409,13 @@ public:
// other RefSCCs should be queued above, so we just need to skip both
// scenarios here.
if (InvalidSCCSet.count(C)) {
- DEBUG(dbgs() << "Skipping an invalid SCC...\n");
+ LLVM_DEBUG(dbgs() << "Skipping an invalid SCC...\n");
continue;
}
if (&C->getOuterRefSCC() != RC) {
- DEBUG(dbgs() << "Skipping an SCC that is now part of some other "
- "RefSCC...\n");
+ LLVM_DEBUG(dbgs()
+ << "Skipping an SCC that is now part of some other "
+ "RefSCC...\n");
continue;
}
@@ -436,7 +437,8 @@ public:
// If the CGSCC pass wasn't able to provide a valid updated SCC,
// the current SCC may simply need to be skipped if invalid.
if (UR.InvalidatedSCCs.count(C)) {
- DEBUG(dbgs() << "Skipping invalidated root or island SCC!\n");
+ LLVM_DEBUG(dbgs()
+ << "Skipping invalidated root or island SCC!\n");
break;
}
// Check that we didn't miss any update scenario.
@@ -464,9 +466,10 @@ public:
// FIXME: If we ever start having RefSCC passes, we'll want to
// iterate there too.
if (UR.UpdatedC)
- DEBUG(dbgs() << "Re-running SCC passes after a refinement of the "
- "current SCC: "
- << *UR.UpdatedC << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Re-running SCC passes after a refinement of the "
+ "current SCC: "
+ << *UR.UpdatedC << "\n");
// Note that both `C` and `RC` may at this point refer to deleted,
// invalid SCC and RefSCCs respectively. But we will short circuit
@@ -494,7 +497,7 @@ private:
CGSCCPassT Pass;
};
-/// \brief A function to deduce a function pass type and wrap it in the
+/// A function to deduce a function pass type and wrap it in the
/// templated adaptor.
template <typename CGSCCPassT>
ModuleToPostOrderCGSCCPassAdaptor<CGSCCPassT>
@@ -517,7 +520,7 @@ public:
public:
explicit Result(FunctionAnalysisManager &FAM) : FAM(&FAM) {}
- /// \brief Accessor for the analysis manager.
+ /// Accessor for the analysis manager.
FunctionAnalysisManager &getManager() { return *FAM; }
bool invalidate(LazyCallGraph::SCC &C, const PreservedAnalyses &PA,
@@ -552,7 +555,7 @@ LazyCallGraph::SCC &updateCGAndAnalysisManagerForFunctionPass(
LazyCallGraph &G, LazyCallGraph::SCC &C, LazyCallGraph::Node &N,
CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR);
-/// \brief Adaptor that maps from a SCC to its functions.
+/// Adaptor that maps from a SCC to its functions.
///
/// Designed to allow composition of a FunctionPass(Manager) and
/// a CGSCCPassManager. Note that if this pass is constructed with a pointer
@@ -585,7 +588,7 @@ public:
return *this;
}
- /// \brief Runs the function pass across every function in the module.
+ /// Runs the function pass across every function in the module.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
LazyCallGraph &CG, CGSCCUpdateResult &UR) {
// Setup the function analysis manager from its proxy.
@@ -601,7 +604,8 @@ public:
// a pointer we can overwrite.
LazyCallGraph::SCC *CurrentC = &C;
- DEBUG(dbgs() << "Running function passes across an SCC: " << C << "\n");
+ LLVM_DEBUG(dbgs() << "Running function passes across an SCC: " << C
+ << "\n");
PreservedAnalyses PA = PreservedAnalyses::all();
for (LazyCallGraph::Node *N : Nodes) {
@@ -652,7 +656,7 @@ private:
FunctionPassT Pass;
};
-/// \brief A function to deduce a function pass type and wrap it in the
+/// A function to deduce a function pass type and wrap it in the
/// templated adaptor.
template <typename FunctionPassT>
CGSCCToFunctionPassAdaptor<FunctionPassT>
@@ -757,9 +761,9 @@ public:
if (!F)
return false;
- DEBUG(dbgs() << "Found devirutalized call from "
- << CS.getParent()->getParent()->getName() << " to "
- << F->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Found devirutalized call from "
+ << CS.getParent()->getParent()->getName() << " to "
+ << F->getName() << "\n");
// We now have a direct call where previously we had an indirect call,
// so iterate to process this devirtualization site.
@@ -793,16 +797,18 @@ public:
// Otherwise, if we've already hit our max, we're done.
if (Iteration >= MaxIterations) {
- DEBUG(dbgs() << "Found another devirtualization after hitting the max "
- "number of repetitions ("
- << MaxIterations << ") on SCC: " << *C << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Found another devirtualization after hitting the max "
+ "number of repetitions ("
+ << MaxIterations << ") on SCC: " << *C << "\n");
PA.intersect(std::move(PassPA));
break;
}
- DEBUG(dbgs()
- << "Repeating an SCC pass after finding a devirtualization in: "
- << *C << "\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Repeating an SCC pass after finding a devirtualization in: " << *C
+ << "\n");
// Move over the new call counts in preparation for iterating.
CallCounts = std::move(NewCallCounts);
@@ -824,7 +830,7 @@ private:
int MaxIterations;
};
-/// \brief A function to deduce a function pass type and wrap it in the
+/// A function to deduce a function pass type and wrap it in the
/// templated adaptor.
template <typename PassT>
DevirtSCCRepeatedPass<PassT> createDevirtSCCRepeatedPass(PassT Pass,
diff --git a/contrib/llvm/include/llvm/Analysis/CallGraph.h b/contrib/llvm/include/llvm/Analysis/CallGraph.h
index c5687def3ebe..f109cf2fac4d 100644
--- a/contrib/llvm/include/llvm/Analysis/CallGraph.h
+++ b/contrib/llvm/include/llvm/Analysis/CallGraph.h
@@ -66,7 +66,7 @@ class CallGraphNode;
class Module;
class raw_ostream;
-/// \brief The basic data container for the call graph of a \c Module of IR.
+/// The basic data container for the call graph of a \c Module of IR.
///
/// This class exposes the interface to the call graph for a module of IR.
///
@@ -77,25 +77,25 @@ class CallGraph {
using FunctionMapTy =
std::map<const Function *, std::unique_ptr<CallGraphNode>>;
- /// \brief A map from \c Function* to \c CallGraphNode*.
+ /// A map from \c Function* to \c CallGraphNode*.
FunctionMapTy FunctionMap;
- /// \brief This node has edges to all external functions and those internal
+ /// This node has edges to all external functions and those internal
/// functions that have their address taken.
CallGraphNode *ExternalCallingNode;
- /// \brief This node has edges to it from all functions making indirect calls
+ /// This node has edges to it from all functions making indirect calls
/// or calling an external function.
std::unique_ptr<CallGraphNode> CallsExternalNode;
- /// \brief Replace the function represented by this node by another.
+ /// Replace the function represented by this node by another.
///
/// This does not rescan the body of the function, so it is suitable when
/// splicing the body of one function to another while also updating all
/// callers from the old function to the new.
void spliceFunction(const Function *From, const Function *To);
- /// \brief Add a function to the call graph, and link the node to all of the
+ /// Add a function to the call graph, and link the node to all of the
/// functions that it calls.
void addToCallGraph(Function *F);
@@ -110,7 +110,7 @@ public:
using iterator = FunctionMapTy::iterator;
using const_iterator = FunctionMapTy::const_iterator;
- /// \brief Returns the module the call graph corresponds to.
+ /// Returns the module the call graph corresponds to.
Module &getModule() const { return M; }
inline iterator begin() { return FunctionMap.begin(); }
@@ -118,21 +118,21 @@ public:
inline const_iterator begin() const { return FunctionMap.begin(); }
inline const_iterator end() const { return FunctionMap.end(); }
- /// \brief Returns the call graph node for the provided function.
+ /// Returns the call graph node for the provided function.
inline const CallGraphNode *operator[](const Function *F) const {
const_iterator I = FunctionMap.find(F);
assert(I != FunctionMap.end() && "Function not in callgraph!");
return I->second.get();
}
- /// \brief Returns the call graph node for the provided function.
+ /// Returns the call graph node for the provided function.
inline CallGraphNode *operator[](const Function *F) {
const_iterator I = FunctionMap.find(F);
assert(I != FunctionMap.end() && "Function not in callgraph!");
return I->second.get();
}
- /// \brief Returns the \c CallGraphNode which is used to represent
+ /// Returns the \c CallGraphNode which is used to represent
/// undetermined calls into the callgraph.
CallGraphNode *getExternalCallingNode() const { return ExternalCallingNode; }
@@ -145,7 +145,7 @@ public:
// modified.
//
- /// \brief Unlink the function from this module, returning it.
+ /// Unlink the function from this module, returning it.
///
/// Because this removes the function from the module, the call graph node is
/// destroyed. This is only valid if the function does not call any other
@@ -153,25 +153,25 @@ public:
/// this is to dropAllReferences before calling this.
Function *removeFunctionFromModule(CallGraphNode *CGN);
- /// \brief Similar to operator[], but this will insert a new CallGraphNode for
+ /// Similar to operator[], but this will insert a new CallGraphNode for
/// \c F if one does not already exist.
CallGraphNode *getOrInsertFunction(const Function *F);
};
-/// \brief A node in the call graph for a module.
+/// A node in the call graph for a module.
///
/// Typically represents a function in the call graph. There are also special
/// "null" nodes used to represent theoretical entries in the call graph.
class CallGraphNode {
public:
- /// \brief A pair of the calling instruction (a call or invoke)
+ /// A pair of the calling instruction (a call or invoke)
/// and the call graph node being called.
using CallRecord = std::pair<WeakTrackingVH, CallGraphNode *>;
public:
using CalledFunctionsVector = std::vector<CallRecord>;
- /// \brief Creates a node for the specified function.
+ /// Creates a node for the specified function.
inline CallGraphNode(Function *F) : F(F) {}
CallGraphNode(const CallGraphNode &) = delete;
@@ -184,7 +184,7 @@ public:
using iterator = std::vector<CallRecord>::iterator;
using const_iterator = std::vector<CallRecord>::const_iterator;
- /// \brief Returns the function that this call graph node represents.
+ /// Returns the function that this call graph node represents.
Function *getFunction() const { return F; }
inline iterator begin() { return CalledFunctions.begin(); }
@@ -194,17 +194,17 @@ public:
inline bool empty() const { return CalledFunctions.empty(); }
inline unsigned size() const { return (unsigned)CalledFunctions.size(); }
- /// \brief Returns the number of other CallGraphNodes in this CallGraph that
+ /// Returns the number of other CallGraphNodes in this CallGraph that
/// reference this node in their callee list.
unsigned getNumReferences() const { return NumReferences; }
- /// \brief Returns the i'th called function.
+ /// Returns the i'th called function.
CallGraphNode *operator[](unsigned i) const {
assert(i < CalledFunctions.size() && "Invalid index");
return CalledFunctions[i].second;
}
- /// \brief Print out this call graph node.
+ /// Print out this call graph node.
void dump() const;
void print(raw_ostream &OS) const;
@@ -213,7 +213,7 @@ public:
// modified
//
- /// \brief Removes all edges from this CallGraphNode to any functions it
+ /// Removes all edges from this CallGraphNode to any functions it
/// calls.
void removeAllCalledFunctions() {
while (!CalledFunctions.empty()) {
@@ -222,14 +222,14 @@ public:
}
}
- /// \brief Moves all the callee information from N to this node.
+ /// Moves all the callee information from N to this node.
void stealCalledFunctionsFrom(CallGraphNode *N) {
assert(CalledFunctions.empty() &&
"Cannot steal callsite information if I already have some");
std::swap(CalledFunctions, N->CalledFunctions);
}
- /// \brief Adds a function to the list of functions called by this one.
+ /// Adds a function to the list of functions called by this one.
void addCalledFunction(CallSite CS, CallGraphNode *M) {
assert(!CS.getInstruction() || !CS.getCalledFunction() ||
!CS.getCalledFunction()->isIntrinsic() ||
@@ -244,23 +244,23 @@ public:
CalledFunctions.pop_back();
}
- /// \brief Removes the edge in the node for the specified call site.
+ /// Removes the edge in the node for the specified call site.
///
/// Note that this method takes linear time, so it should be used sparingly.
void removeCallEdgeFor(CallSite CS);
- /// \brief Removes all call edges from this node to the specified callee
+ /// Removes all call edges from this node to the specified callee
/// function.
///
/// This takes more time to execute than removeCallEdgeTo, so it should not
/// be used unless necessary.
void removeAnyCallEdgeTo(CallGraphNode *Callee);
- /// \brief Removes one edge associated with a null callsite from this node to
+ /// Removes one edge associated with a null callsite from this node to
/// the specified callee function.
void removeOneAbstractEdgeTo(CallGraphNode *Callee);
- /// \brief Replaces the edge in the node for the specified call site with a
+ /// Replaces the edge in the node for the specified call site with a
/// new one.
///
/// Note that this method takes linear time, so it should be used sparingly.
@@ -273,18 +273,18 @@ private:
std::vector<CallRecord> CalledFunctions;
- /// \brief The number of times that this CallGraphNode occurs in the
+ /// The number of times that this CallGraphNode occurs in the
/// CalledFunctions array of this or other CallGraphNodes.
unsigned NumReferences = 0;
void DropRef() { --NumReferences; }
void AddRef() { ++NumReferences; }
- /// \brief A special function that should only be used by the CallGraph class.
+ /// A special function that should only be used by the CallGraph class.
void allReferencesDropped() { NumReferences = 0; }
};
-/// \brief An analysis pass to compute the \c CallGraph for a \c Module.
+/// An analysis pass to compute the \c CallGraph for a \c Module.
///
/// This class implements the concept of an analysis pass used by the \c
/// ModuleAnalysisManager to run an analysis over a module and cache the
@@ -295,16 +295,16 @@ class CallGraphAnalysis : public AnalysisInfoMixin<CallGraphAnalysis> {
static AnalysisKey Key;
public:
- /// \brief A formulaic type to inform clients of the result type.
+ /// A formulaic type to inform clients of the result type.
using Result = CallGraph;
- /// \brief Compute the \c CallGraph for the module \c M.
+ /// Compute the \c CallGraph for the module \c M.
///
/// The real work here is done in the \c CallGraph constructor.
CallGraph run(Module &M, ModuleAnalysisManager &) { return CallGraph(M); }
};
-/// \brief Printer pass for the \c CallGraphAnalysis results.
+/// Printer pass for the \c CallGraphAnalysis results.
class CallGraphPrinterPass : public PassInfoMixin<CallGraphPrinterPass> {
raw_ostream &OS;
@@ -314,7 +314,7 @@ public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
-/// \brief The \c ModulePass which wraps up a \c CallGraph and the logic to
+/// The \c ModulePass which wraps up a \c CallGraph and the logic to
/// build it.
///
/// This class exposes both the interface to the call graph container and the
@@ -330,7 +330,7 @@ public:
CallGraphWrapperPass();
~CallGraphWrapperPass() override;
- /// \brief The internal \c CallGraph around which the rest of this interface
+ /// The internal \c CallGraph around which the rest of this interface
/// is wrapped.
const CallGraph &getCallGraph() const { return *G; }
CallGraph &getCallGraph() { return *G; }
@@ -338,7 +338,7 @@ public:
using iterator = CallGraph::iterator;
using const_iterator = CallGraph::const_iterator;
- /// \brief Returns the module the call graph corresponds to.
+ /// Returns the module the call graph corresponds to.
Module &getModule() const { return G->getModule(); }
inline iterator begin() { return G->begin(); }
@@ -346,15 +346,15 @@ public:
inline const_iterator begin() const { return G->begin(); }
inline const_iterator end() const { return G->end(); }
- /// \brief Returns the call graph node for the provided function.
+ /// Returns the call graph node for the provided function.
inline const CallGraphNode *operator[](const Function *F) const {
return (*G)[F];
}
- /// \brief Returns the call graph node for the provided function.
+ /// Returns the call graph node for the provided function.
inline CallGraphNode *operator[](const Function *F) { return (*G)[F]; }
- /// \brief Returns the \c CallGraphNode which is used to represent
+ /// Returns the \c CallGraphNode which is used to represent
/// undetermined calls into the callgraph.
CallGraphNode *getExternalCallingNode() const {
return G->getExternalCallingNode();
@@ -369,7 +369,7 @@ public:
// modified.
//
- /// \brief Unlink the function from this module, returning it.
+ /// Unlink the function from this module, returning it.
///
/// Because this removes the function from the module, the call graph node is
/// destroyed. This is only valid if the function does not call any other
@@ -379,7 +379,7 @@ public:
return G->removeFunctionFromModule(CGN);
}
- /// \brief Similar to operator[], but this will insert a new CallGraphNode for
+ /// Similar to operator[], but this will insert a new CallGraphNode for
/// \c F if one does not already exist.
CallGraphNode *getOrInsertFunction(const Function *F) {
return G->getOrInsertFunction(F);
@@ -426,12 +426,14 @@ template <> struct GraphTraits<CallGraphNode *> {
template <> struct GraphTraits<const CallGraphNode *> {
using NodeRef = const CallGraphNode *;
using CGNPairTy = CallGraphNode::CallRecord;
+ using EdgeRef = const CallGraphNode::CallRecord &;
static NodeRef getEntryNode(const CallGraphNode *CGN) { return CGN; }
static const CallGraphNode *CGNGetValue(CGNPairTy P) { return P.second; }
using ChildIteratorType =
mapped_iterator<CallGraphNode::const_iterator, decltype(&CGNGetValue)>;
+ using ChildEdgeIteratorType = CallGraphNode::const_iterator;
static ChildIteratorType child_begin(NodeRef N) {
return ChildIteratorType(N->begin(), &CGNGetValue);
@@ -440,6 +442,13 @@ template <> struct GraphTraits<const CallGraphNode *> {
static ChildIteratorType child_end(NodeRef N) {
return ChildIteratorType(N->end(), &CGNGetValue);
}
+
+ static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
+ return N->begin();
+ }
+ static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); }
+
+ static NodeRef edge_dest(EdgeRef E) { return E.second; }
};
template <>
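A rough sketch of how the new edge-iteration hooks on GraphTraits<const CallGraphNode *> can be used; the helper below and its setup are assumptions for illustration, not part of this change.

    #include "llvm/Analysis/CallGraph.h"
    #include "llvm/Support/raw_ostream.h"

    // Print the callees reachable through the outgoing call edges of F's node.
    static void listCallees(const llvm::CallGraph &CG, const llvm::Function *F) {
      using GT = llvm::GraphTraits<const llvm::CallGraphNode *>;
      const llvm::CallGraphNode *N = CG[F];
      for (auto EI = GT::child_edge_begin(N), EE = GT::child_edge_end(N);
           EI != EE; ++EI)
        // edge_dest maps a CallRecord edge to the callee node; the callee
        // function may be null for the special external nodes.
        if (const llvm::Function *Callee = GT::edge_dest(*EI)->getFunction())
          llvm::errs() << Callee->getName() << "\n";
    }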
diff --git a/contrib/llvm/include/llvm/Analysis/CaptureTracking.h b/contrib/llvm/include/llvm/Analysis/CaptureTracking.h
index 8d2c095d8585..7a869a51233a 100644
--- a/contrib/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/contrib/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -46,7 +46,7 @@ namespace llvm {
/// to speed up capture-tracker queries.
bool PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
bool StoreCaptures, const Instruction *I,
- DominatorTree *DT, bool IncludeI = false,
+ const DominatorTree *DT, bool IncludeI = false,
OrderedBasicBlock *OBB = nullptr);
/// This callback is used in conjunction with PointerMayBeCaptured. In
diff --git a/contrib/llvm/include/llvm/Analysis/CodeMetrics.h b/contrib/llvm/include/llvm/Analysis/CodeMetrics.h
index 9e861ac18825..752902238522 100644
--- a/contrib/llvm/include/llvm/Analysis/CodeMetrics.h
+++ b/contrib/llvm/include/llvm/Analysis/CodeMetrics.h
@@ -29,7 +29,7 @@ class DataLayout;
class TargetTransformInfo;
class Value;
-/// \brief Check whether a call will lower to something small.
+/// Check whether a call will lower to something small.
///
/// This test checks whether this callsite will lower to something
/// significantly cheaper than a traditional call, often a single
@@ -37,64 +37,64 @@ class Value;
/// return true, so will this function.
bool callIsSmall(ImmutableCallSite CS);
-/// \brief Utility to calculate the size and a few similar metrics for a set
+/// Utility to calculate the size and a few similar metrics for a set
/// of basic blocks.
struct CodeMetrics {
- /// \brief True if this function contains a call to setjmp or other functions
+ /// True if this function contains a call to setjmp or other functions
/// with attribute "returns twice" without having the attribute itself.
bool exposesReturnsTwice = false;
- /// \brief True if this function calls itself.
+ /// True if this function calls itself.
bool isRecursive = false;
- /// \brief True if this function cannot be duplicated.
+ /// True if this function cannot be duplicated.
///
/// True if this function contains one or more indirect branches, or it contains
/// one or more 'noduplicate' instructions.
bool notDuplicatable = false;
- /// \brief True if this function contains a call to a convergent function.
+ /// True if this function contains a call to a convergent function.
bool convergent = false;
- /// \brief True if this function calls alloca (in the C sense).
+ /// True if this function calls alloca (in the C sense).
bool usesDynamicAlloca = false;
- /// \brief Number of instructions in the analyzed blocks.
+ /// Number of instructions in the analyzed blocks.
unsigned NumInsts = false;
- /// \brief Number of analyzed blocks.
+ /// Number of analyzed blocks.
unsigned NumBlocks = false;
- /// \brief Keeps track of basic block code size estimates.
+ /// Keeps track of basic block code size estimates.
DenseMap<const BasicBlock *, unsigned> NumBBInsts;
- /// \brief Keep track of the number of calls to 'big' functions.
+ /// Keep track of the number of calls to 'big' functions.
unsigned NumCalls = false;
- /// \brief The number of calls to internal functions with a single caller.
+ /// The number of calls to internal functions with a single caller.
///
/// These are likely targets for future inlining, likely exposed by
/// interleaved devirtualization.
unsigned NumInlineCandidates = 0;
- /// \brief How many instructions produce vector values.
+ /// How many instructions produce vector values.
///
/// The inliner is more aggressive with inlining vector kernels.
unsigned NumVectorInsts = 0;
- /// \brief How many 'ret' instructions the blocks contain.
+ /// How many 'ret' instructions the blocks contain.
unsigned NumRets = 0;
- /// \brief Add information about a block to the current state.
+ /// Add information about a block to the current state.
void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value*> &EphValues);
- /// \brief Collect a loop's ephemeral values (those used only by an assume
+ /// Collect a loop's ephemeral values (those used only by an assume
/// or similar intrinsics in the loop).
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC,
SmallPtrSetImpl<const Value *> &EphValues);
- /// \brief Collect a functions's ephemeral values (those used only by an
+ /// Collect a function's ephemeral values (those used only by an
/// assume or similar intrinsics in the function).
static void collectEphemeralValues(const Function *L, AssumptionCache *AC,
SmallPtrSetImpl<const Value *> &EphValues);
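A small sketch of how these CodeMetrics pieces are typically combined; the loop L, AssumptionCache AC and TargetTransformInfo TTI are assumed to come from the usual analyses and are not part of this change.

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/CodeMetrics.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/TargetTransformInfo.h"

    // Estimate the instruction count of a loop, ignoring ephemeral values.
    static unsigned estimateLoopSize(const llvm::Loop *L,
                                     llvm::AssumptionCache *AC,
                                     const llvm::TargetTransformInfo &TTI) {
      llvm::SmallPtrSet<const llvm::Value *, 32> EphValues;
      llvm::CodeMetrics::collectEphemeralValues(L, AC, EphValues);

      llvm::CodeMetrics Metrics;
      for (llvm::BasicBlock *BB : L->blocks())
        Metrics.analyzeBasicBlock(BB, TTI, EphValues);
      return Metrics.NumInsts;
    }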
diff --git a/contrib/llvm/include/llvm/Analysis/ConstantFolding.h b/contrib/llvm/include/llvm/Analysis/ConstantFolding.h
index 6d4eef412525..192c1abddcd2 100644
--- a/contrib/llvm/include/llvm/Analysis/ConstantFolding.h
+++ b/contrib/llvm/include/llvm/Analysis/ConstantFolding.h
@@ -73,19 +73,19 @@ ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS,
Constant *RHS, const DataLayout &DL,
const TargetLibraryInfo *TLI = nullptr);
-/// \brief Attempt to constant fold a binary operation with the specified
+/// Attempt to constant fold a binary operation with the specified
/// operands. If it fails, it returns a constant expression of the specified
/// operands.
Constant *ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS,
Constant *RHS, const DataLayout &DL);
-/// \brief Attempt to constant fold a select instruction with the specified
+/// Attempt to constant fold a select instruction with the specified
/// operands. The constant result is returned if successful; if not, null is
/// returned.
Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1,
Constant *V2);
-/// \brief Attempt to constant fold a cast with the specified operand. If it
+/// Attempt to constant fold a cast with the specified operand. If it
/// fails, it returns a constant expression of the specified operand.
Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy,
const DataLayout &DL);
@@ -96,25 +96,25 @@ Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy,
Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
ArrayRef<unsigned> Idxs);
-/// \brief Attempt to constant fold an extractvalue instruction with the
+/// Attempt to constant fold an extractvalue instruction with the
/// specified operands and indices. The constant result is returned if
/// successful; if not, null is returned.
Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
ArrayRef<unsigned> Idxs);
-/// \brief Attempt to constant fold an insertelement instruction with the
+/// Attempt to constant fold an insertelement instruction with the
/// specified operands and indices. The constant result is returned if
/// successful; if not, null is returned.
Constant *ConstantFoldInsertElementInstruction(Constant *Val,
Constant *Elt,
Constant *Idx);
-/// \brief Attempt to constant fold an extractelement instruction with the
+/// Attempt to constant fold an extractelement instruction with the
/// specified operands and indices. The constant result is returned if
/// successful; if not, null is returned.
Constant *ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx);
-/// \brief Attempt to constant fold a shufflevector instruction with the
+/// Attempt to constant fold a shufflevector instruction with the
/// specified operands and indices. The constant result is returned if
/// successful; if not, null is returned.
Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
@@ -147,7 +147,13 @@ Constant *ConstantFoldCall(ImmutableCallSite CS, Function *F,
ArrayRef<Constant *> Operands,
const TargetLibraryInfo *TLI = nullptr);
-/// \brief Check whether the given call has no side-effects.
+/// ConstantFoldLoadThroughBitcast - try to cast a constant to the destination
+/// type, returning null if unsuccessful. Can cast pointer to pointer or
+/// pointer to integer and vice versa if their sizes are equal.
+Constant *ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
+ const DataLayout &DL);
+
+/// Check whether the given call has no side-effects.
/// Specifically checks for math routines which sometimes set errno.
bool isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI);
}
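For the newly exported ConstantFoldLoadThroughBitcast, a hedged usage sketch; only the declaration above comes from this change, the wrapper name and call site are illustrative.

    #include "llvm/Analysis/ConstantFolding.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"

    // Fold a constant seen through a bitcast to DestTy, if the sizes allow it.
    static llvm::Constant *tryFoldThroughBitcast(llvm::Constant *C,
                                                 llvm::Type *DestTy,
                                                 const llvm::DataLayout &DL) {
      // Returns null when the cast cannot be folded (e.g. pointer/integer
      // sizes do not match).
      return llvm::ConstantFoldLoadThroughBitcast(C, DestTy, DL);
    }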
diff --git a/contrib/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h b/contrib/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
index 39f9c39c34e1..b7447a0547d5 100644
--- a/contrib/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
+++ b/contrib/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
@@ -20,7 +20,7 @@
namespace llvm {
-/// \brief Default traits class for extracting a graph from an analysis pass.
+/// Default traits class for extracting a graph from an analysis pass.
///
/// This assumes that 'GraphT' is 'AnalysisT *' and so just passes it through.
template <typename AnalysisT, typename GraphT = AnalysisT *>
@@ -36,7 +36,7 @@ public:
DOTGraphTraitsViewer(StringRef GraphName, char &ID)
: FunctionPass(ID), Name(GraphName) {}
- /// @brief Return true if this function should be processed.
+ /// Return true if this function should be processed.
///
/// An implementation of this class may override this function to indicate that
/// only certain functions should be viewed.
@@ -78,7 +78,7 @@ public:
DOTGraphTraitsPrinter(StringRef GraphName, char &ID)
: FunctionPass(ID), Name(GraphName) {}
- /// @brief Return true if this function should be processed.
+ /// Return true if this function should be processed.
///
/// An implementation of this class may override this function to indicate that
/// only certain functions should be printed.
diff --git a/contrib/llvm/include/llvm/Analysis/DemandedBits.h b/contrib/llvm/include/llvm/Analysis/DemandedBits.h
index ab8668256ba2..d4384609762d 100644
--- a/contrib/llvm/include/llvm/Analysis/DemandedBits.h
+++ b/contrib/llvm/include/llvm/Analysis/DemandedBits.h
@@ -96,15 +96,15 @@ class DemandedBitsAnalysis : public AnalysisInfoMixin<DemandedBitsAnalysis> {
static AnalysisKey Key;
public:
- /// \brief Provide the result type for this analysis pass.
+ /// Provide the result type for this analysis pass.
using Result = DemandedBits;
- /// \brief Run the analysis pass over a function and produce demanded bits
+ /// Run the analysis pass over a function and produce demanded bits
/// information.
DemandedBits run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Printer pass for DemandedBits
+/// Printer pass for DemandedBits
class DemandedBitsPrinterPass : public PassInfoMixin<DemandedBitsPrinterPass> {
raw_ostream &OS;
diff --git a/contrib/llvm/include/llvm/Analysis/DependenceAnalysis.h b/contrib/llvm/include/llvm/Analysis/DependenceAnalysis.h
index 90f33b8c42e5..c8ec737a2cb9 100644
--- a/contrib/llvm/include/llvm/Analysis/DependenceAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/DependenceAnalysis.h
@@ -557,6 +557,17 @@ template <typename T> class ArrayRef;
const SCEV *X,
const SCEV *Y) const;
+ /// isKnownLessThan - Compare to see if S is less than Size.
+ /// Another wrapper for isKnownNegative(S - max(Size, 1)), with extra
+ /// checking when S is an AddRec so that the less-than relation can be
+ /// proved from the loop bounds.
+ bool isKnownLessThan(const SCEV *S, const SCEV *Size) const;
+
+ /// isKnownNonNegative - Compare to see if S is known not to be negative.
+ /// Uses the fact that S comes from Ptr, which may be an inbounds GEP,
+ /// proving that no wrapping is going on.
+ bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const;
+
/// collectUpperBound - All subscripts are the same type (on my machine,
/// an i64). The loop bound may be a smaller type. collectUpperBound
/// finds the bound, if available, and zero extends it to the Type T.
@@ -914,7 +925,7 @@ template <typename T> class ArrayRef;
SmallVectorImpl<Subscript> &Pair);
}; // class DependenceInfo
- /// \brief AnalysisPass to compute dependence information in a function
+ /// AnalysisPass to compute dependence information in a function
class DependenceAnalysis : public AnalysisInfoMixin<DependenceAnalysis> {
public:
typedef DependenceInfo Result;
@@ -925,7 +936,7 @@ template <typename T> class ArrayRef;
friend struct AnalysisInfoMixin<DependenceAnalysis>;
}; // class DependenceAnalysis
- /// \brief Legacy pass manager pass to access dependence information
+ /// Legacy pass manager pass to access dependence information
class DependenceAnalysisWrapperPass : public FunctionPass {
public:
static char ID; // Class identification, replacement for typeinfo
diff --git a/contrib/llvm/include/llvm/Analysis/DivergenceAnalysis.h b/contrib/llvm/include/llvm/Analysis/DivergenceAnalysis.h
index aa2de571ba1b..328c8645d3c0 100644
--- a/contrib/llvm/include/llvm/Analysis/DivergenceAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/DivergenceAnalysis.h
@@ -13,6 +13,8 @@
// better decisions.
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Function.h"
@@ -35,14 +37,25 @@ public:
// Print all divergent branches in the function.
void print(raw_ostream &OS, const Module *) const override;
- // Returns true if V is divergent.
+ // Returns true if V is divergent at its definition.
+ //
+ // Even if this function returns false, V may still be divergent when used
+ // in a different basic block.
bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
// Returns true if V is uniform/non-divergent.
+ //
+ // Even if this function returns true, V may still be divergent when used
+ // in a different basic block.
bool isUniform(const Value *V) const { return !isDivergent(V); }
+ // Keep the analysis results up to date by removing an erased value.
+ void removeValue(const Value *V) { DivergentValues.erase(V); }
+
private:
// Stores all divergent values.
DenseSet<const Value *> DivergentValues;
};
} // End llvm namespace
+
+#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
\ No newline at end of file
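A sketch of how a client of the (legacy pass manager) DivergenceAnalysis might consult these results; the traversal and reporting below are illustrative and not part of this change.

    #include "llvm/Analysis/DivergenceAnalysis.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/raw_ostream.h"

    // Report conditional branches whose condition is divergent at its
    // definition (per the caveat above, uses elsewhere may still diverge).
    static void reportDivergentBranches(llvm::Function &F,
                                        const llvm::DivergenceAnalysis &DA) {
      for (llvm::BasicBlock &BB : F)
        if (auto *BI = llvm::dyn_cast<llvm::BranchInst>(BB.getTerminator()))
          if (BI->isConditional() && DA.isDivergent(BI->getCondition()))
            llvm::errs() << "divergent branch in " << BB.getName() << "\n";
    }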
diff --git a/contrib/llvm/include/llvm/Analysis/DominanceFrontier.h b/contrib/llvm/include/llvm/Analysis/DominanceFrontier.h
index a304dff18c79..d94c420d7177 100644
--- a/contrib/llvm/include/llvm/Analysis/DominanceFrontier.h
+++ b/contrib/llvm/include/llvm/Analysis/DominanceFrontier.h
@@ -19,6 +19,7 @@
#define LLVM_ANALYSIS_DOMINANCEFRONTIER_H
#include "llvm/ADT/GraphTraits.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/GenericDomTree.h"
@@ -179,7 +180,7 @@ extern template class DominanceFrontierBase<BasicBlock, false>;
extern template class DominanceFrontierBase<BasicBlock, true>;
extern template class ForwardDominanceFrontierBase<BasicBlock>;
-/// \brief Analysis pass which computes a \c DominanceFrontier.
+/// Analysis pass which computes a \c DominanceFrontier.
class DominanceFrontierAnalysis
: public AnalysisInfoMixin<DominanceFrontierAnalysis> {
friend AnalysisInfoMixin<DominanceFrontierAnalysis>;
@@ -187,14 +188,14 @@ class DominanceFrontierAnalysis
static AnalysisKey Key;
public:
- /// \brief Provide the result type for this analysis pass.
+ /// Provide the result type for this analysis pass.
using Result = DominanceFrontier;
- /// \brief Run the analysis pass over a function and produce a dominator tree.
+ /// Run the analysis pass over a function and produce a dominator tree.
DominanceFrontier run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Printer pass for the \c DominanceFrontier.
+/// Printer pass for the \c DominanceFrontier.
class DominanceFrontierPrinterPass
: public PassInfoMixin<DominanceFrontierPrinterPass> {
raw_ostream &OS;
diff --git a/contrib/llvm/include/llvm/Analysis/DominanceFrontierImpl.h b/contrib/llvm/include/llvm/Analysis/DominanceFrontierImpl.h
index dffb2e02b621..99224c0bf131 100644
--- a/contrib/llvm/include/llvm/Analysis/DominanceFrontierImpl.h
+++ b/contrib/llvm/include/llvm/Analysis/DominanceFrontierImpl.h
@@ -21,6 +21,7 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/include/llvm/Analysis/EHPersonalities.h b/contrib/llvm/include/llvm/Analysis/EHPersonalities.h
index 2c45ab4693e6..fe0e65b828ca 100644
--- a/contrib/llvm/include/llvm/Analysis/EHPersonalities.h
+++ b/contrib/llvm/include/llvm/Analysis/EHPersonalities.h
@@ -32,10 +32,11 @@ enum class EHPersonality {
MSVC_Win64SEH,
MSVC_CXX,
CoreCLR,
- Rust
+ Rust,
+ Wasm_CXX
};
-/// \brief See if the given exception handling personality function is one
+/// See if the given exception handling personality function is one
/// that we understand. If so, return a description of it; otherwise return
/// Unknown.
EHPersonality classifyEHPersonality(const Value *Pers);
@@ -44,7 +45,7 @@ StringRef getEHPersonalityName(EHPersonality Pers);
EHPersonality getDefaultEHPersonality(const Triple &T);
-/// \brief Returns true if this personality function catches asynchronous
+/// Returns true if this personality function catches asynchronous
/// exceptions.
inline bool isAsynchronousEHPersonality(EHPersonality Pers) {
// The two SEH personality functions can catch asynch exceptions. We assume
@@ -59,7 +60,7 @@ inline bool isAsynchronousEHPersonality(EHPersonality Pers) {
llvm_unreachable("invalid enum");
}
-/// \brief Returns true if this is a personality function that invokes
+/// Returns true if this is a personality function that invokes
/// handler funclets (which must return to it).
inline bool isFuncletEHPersonality(EHPersonality Pers) {
switch (Pers) {
@@ -74,7 +75,23 @@ inline bool isFuncletEHPersonality(EHPersonality Pers) {
llvm_unreachable("invalid enum");
}
-/// \brief Return true if this personality may be safely removed if there
+/// Returns true if this personality uses scope-style EH IR instructions:
+/// catchswitch, catchpad/ret, and cleanuppad/ret.
+inline bool isScopedEHPersonality(EHPersonality Pers) {
+ switch (Pers) {
+ case EHPersonality::MSVC_CXX:
+ case EHPersonality::MSVC_X86SEH:
+ case EHPersonality::MSVC_Win64SEH:
+ case EHPersonality::CoreCLR:
+ case EHPersonality::Wasm_CXX:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("invalid enum");
+}
+
+/// Return true if this personality may be safely removed if there
/// are no invoke instructions remaining in the current function.
inline bool isNoOpWithoutInvoke(EHPersonality Pers) {
switch (Pers) {
@@ -91,7 +108,7 @@ bool canSimplifyInvokeNoUnwind(const Function *F);
typedef TinyPtrVector<BasicBlock *> ColorVector;
-/// \brief If an EH funclet personality is in use (see isFuncletEHPersonality),
+/// If an EH funclet personality is in use (see isFuncletEHPersonality),
/// this will recompute which blocks are in which funclet. It is possible that
/// some blocks are in multiple funclets. Consider this analysis to be
/// expensive.
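A usage sketch for the new isScopedEHPersonality predicate; the helper below is illustrative and assumes F carries a personality function.

    #include "llvm/Analysis/EHPersonalities.h"
    #include "llvm/IR/Function.h"

    // True if F's personality uses the scope-style EH instructions
    // (catchswitch, catchpad/ret, cleanuppad/ret).
    static bool usesScopedEH(const llvm::Function &F) {
      if (!F.hasPersonalityFn())
        return false;
      llvm::EHPersonality Pers =
          llvm::classifyEHPersonality(F.getPersonalityFn());
      return llvm::isScopedEHPersonality(Pers);
    }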
diff --git a/contrib/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h b/contrib/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h
index 8b1c10139de8..be3a28424cf5 100644
--- a/contrib/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h
@@ -48,7 +48,7 @@ private:
public:
ICallPromotionAnalysis();
- /// \brief Returns reference to array of InstrProfValueData for the given
+ /// Returns reference to array of InstrProfValueData for the given
/// instruction \p I.
///
/// The \p NumVals, \p TotalCount and \p NumCandidates
diff --git a/contrib/llvm/include/llvm/Analysis/InlineCost.h b/contrib/llvm/include/llvm/Analysis/InlineCost.h
index 985f3880ed3a..8c412057fb81 100644
--- a/contrib/llvm/include/llvm/Analysis/InlineCost.h
+++ b/contrib/llvm/include/llvm/Analysis/InlineCost.h
@@ -52,7 +52,7 @@ const int NoreturnPenalty = 10000;
const unsigned TotalAllocaSizeRecursiveCaller = 1024;
}
-/// \brief Represents the cost of inlining a function.
+/// Represents the cost of inlining a function.
///
/// This supports special values for functions which should "always" or
/// "never" be inlined. Otherwise, the cost represents a unitless amount;
@@ -68,10 +68,10 @@ class InlineCost {
NeverInlineCost = INT_MAX
};
- /// \brief The estimated cost of inlining this callsite.
+ /// The estimated cost of inlining this callsite.
const int Cost;
- /// \brief The adjusted threshold against which this cost was computed.
+ /// The adjusted threshold against which this cost was computed.
const int Threshold;
// Trivial constructor, interesting logic in the factory functions below.
@@ -90,7 +90,7 @@ public:
return InlineCost(NeverInlineCost, 0);
}
- /// \brief Test whether the inline cost is low enough for inlining.
+ /// Test whether the inline cost is low enough for inlining.
explicit operator bool() const {
return Cost < Threshold;
}
@@ -99,20 +99,20 @@ public:
bool isNever() const { return Cost == NeverInlineCost; }
bool isVariable() const { return !isAlways() && !isNever(); }
- /// \brief Get the inline cost estimate.
+ /// Get the inline cost estimate.
/// It is an error to call this on an "always" or "never" InlineCost.
int getCost() const {
assert(isVariable() && "Invalid access of InlineCost");
return Cost;
}
- /// \brief Get the threshold against which the cost was computed
+ /// Get the threshold against which the cost was computed
int getThreshold() const {
assert(isVariable() && "Invalid access of InlineCost");
return Threshold;
}
- /// \brief Get the cost delta from the threshold for inlining.
+ /// Get the cost delta from the threshold for inlining.
/// Only valid if the cost is of the variable kind. Returns a negative
/// value if the cost is too high to inline.
int getCostDelta() const { return Threshold - getCost(); }
@@ -170,7 +170,7 @@ InlineParams getInlineParams(int Threshold);
/// line options. If -inline-threshold option is not explicitly passed,
/// the default threshold is computed from \p OptLevel and \p SizeOptLevel.
/// An \p OptLevel value above 3 is considered an aggressive optimization mode.
-/// \p SizeOptLevel of 1 corresponds to the the -Os flag and 2 corresponds to
+/// \p SizeOptLevel of 1 corresponds to the -Os flag and 2 corresponds to
/// the -Oz flag.
InlineParams getInlineParams(unsigned OptLevel, unsigned SizeOptLevel);
@@ -178,7 +178,7 @@ InlineParams getInlineParams(unsigned OptLevel, unsigned SizeOptLevel);
/// and the call/return instruction.
int getCallsiteCost(CallSite CS, const DataLayout &DL);
-/// \brief Get an InlineCost object representing the cost of inlining this
+/// Get an InlineCost object representing the cost of inlining this
/// callsite.
///
/// Note that a default threshold is passed into this function. This threshold
@@ -195,7 +195,7 @@ InlineCost getInlineCost(
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE = nullptr);
-/// \brief Get an InlineCost with the callee explicitly specified.
+/// Get an InlineCost with the callee explicitly specified.
/// This allows you to calculate the cost of inlining a function via a
/// pointer. This behaves exactly as the version with no explicit callee
/// parameter in all other respects.
@@ -207,7 +207,7 @@ getInlineCost(CallSite CS, Function *Callee, const InlineParams &Params,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE);
-/// \brief Minimal filter to detect invalid constructs for inlining.
+/// Minimal filter to detect invalid constructs for inlining.
bool isInlineViable(Function &Callee);
}
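As a sketch of how the cost object is typically interpreted by a caller of one of the getInlineCost overloads (illustrative, not part of this change):

    #include "llvm/Analysis/InlineCost.h"

    // Decide whether to inline based on an already-computed InlineCost.
    static bool shouldInline(const llvm::InlineCost &IC) {
      if (IC.isAlways())
        return true;
      if (IC.isNever())
        return false;
      // For the "variable" case the boolean conversion compares Cost against
      // the threshold, and getCostDelta() reports how much headroom is left.
      return static_cast<bool>(IC);
    }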
diff --git a/contrib/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h b/contrib/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h
index edaf4e9025bc..6b1950733246 100644
--- a/contrib/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h
+++ b/contrib/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-/// \brief Compute iterated dominance frontiers using a linear time algorithm.
+/// Compute iterated dominance frontiers using a linear time algorithm.
///
/// The algorithm used here is based on:
///
@@ -32,7 +32,7 @@
namespace llvm {
-/// \brief Determine the iterated dominance frontier, given a set of defining
+/// Determine the iterated dominance frontier, given a set of defining
/// blocks, and optionally, a set of live-in blocks.
///
/// In turn, the results can be used to place phi nodes.
@@ -48,7 +48,7 @@ class IDFCalculator {
IDFCalculator(DominatorTreeBase<BasicBlock, IsPostDom> &DT)
: DT(DT), useLiveIn(false) {}
- /// \brief Give the IDF calculator the set of blocks in which the value is
+ /// Give the IDF calculator the set of blocks in which the value is
/// defined. This is equivalent to the set of starting blocks it should be
/// calculating the IDF for (though later gets pruned based on liveness).
///
@@ -57,7 +57,7 @@ class IDFCalculator {
DefBlocks = &Blocks;
}
- /// \brief Give the IDF calculator the set of blocks in which the value is
+ /// Give the IDF calculator the set of blocks in which the value is
/// live on entry to the block. This is used to prune the IDF calculation to
/// not include blocks where any phi insertion would be dead.
///
@@ -68,14 +68,14 @@ class IDFCalculator {
useLiveIn = true;
}
- /// \brief Reset the live-in block set to be empty, and tell the IDF
+ /// Reset the live-in block set to be empty, and tell the IDF
/// calculator to not use liveness anymore.
void resetLiveInBlocks() {
LiveInBlocks = nullptr;
useLiveIn = false;
}
- /// \brief Calculate iterated dominance frontiers
+ /// Calculate iterated dominance frontiers
///
/// This uses the linear-time phi algorithm based on DJ-graphs mentioned in
/// the file-level comment. It performs DF->IDF pruning using the live-in
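A sketch of the typical IDFCalculator calling sequence; the defining-blocks set is an assumption, and IDFCalculator<false> corresponds to the forward (non-post-dominator) case.

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/IteratedDominanceFrontier.h"
    #include "llvm/IR/Dominators.h"

    // Compute the blocks where PHI nodes would be needed for defs in DefBlocks.
    static void computePhiBlocks(
        llvm::DominatorTree &DT,
        llvm::SmallPtrSetImpl<llvm::BasicBlock *> &DefBlocks,
        llvm::SmallVectorImpl<llvm::BasicBlock *> &PHIBlocks) {
      llvm::IDFCalculator<false> IDF(DT);
      IDF.setDefiningBlocks(DefBlocks);
      IDF.calculate(PHIBlocks);
    }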
diff --git a/contrib/llvm/include/llvm/Analysis/LazyBlockFrequencyInfo.h b/contrib/llvm/include/llvm/Analysis/LazyBlockFrequencyInfo.h
index 71ce0842f6a9..d1afb63d7e08 100644
--- a/contrib/llvm/include/llvm/Analysis/LazyBlockFrequencyInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/LazyBlockFrequencyInfo.h
@@ -75,7 +75,7 @@ private:
const LoopInfoT *LI;
};
-/// \brief This is an alternative analysis pass to
+/// This is an alternative analysis pass to
/// BlockFrequencyInfoWrapperPass. The difference is that with this pass the
/// block frequencies are not computed when the analysis pass is executed but
/// rather when the BFI result is explicitly requested by the analysis client.
@@ -109,10 +109,10 @@ public:
LazyBlockFrequencyInfoPass();
- /// \brief Compute and return the block frequencies.
+ /// Compute and return the block frequencies.
BlockFrequencyInfo &getBFI() { return LBFI.getCalculated(); }
- /// \brief Compute and return the block frequencies.
+ /// Compute and return the block frequencies.
const BlockFrequencyInfo &getBFI() const { return LBFI.getCalculated(); }
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -126,7 +126,7 @@ public:
void print(raw_ostream &OS, const Module *M) const override;
};
-/// \brief Helper for client passes to initialize dependent passes for LBFI.
+/// Helper for client passes to initialize dependent passes for LBFI.
void initializeLazyBFIPassPass(PassRegistry &Registry);
}
#endif
diff --git a/contrib/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h b/contrib/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h
index e1d404b1ada2..9e6bcfedcbb9 100644
--- a/contrib/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h
@@ -26,7 +26,7 @@ class Function;
class LoopInfo;
class TargetLibraryInfo;
-/// \brief This is an alternative analysis pass to
+/// This is an alternative analysis pass to
/// BranchProbabilityInfoWrapperPass. The difference is that with this pass the
/// branch probabilities are not computed when the analysis pass is executed but
/// rather when the BPI result is explicitly requested by the analysis client.
@@ -89,10 +89,10 @@ public:
LazyBranchProbabilityInfoPass();
- /// \brief Compute and return the branch probabilities.
+ /// Compute and return the branch probabilities.
BranchProbabilityInfo &getBPI() { return LBPI->getCalculated(); }
- /// \brief Compute and return the branch probabilities.
+ /// Compute and return the branch probabilities.
const BranchProbabilityInfo &getBPI() const { return LBPI->getCalculated(); }
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -106,10 +106,10 @@ public:
void print(raw_ostream &OS, const Module *M) const override;
};
-/// \brief Helper for client passes to initialize dependent passes for LBPI.
+/// Helper for client passes to initialize dependent passes for LBPI.
void initializeLazyBPIPassPass(PassRegistry &Registry);
-/// \brief Simple trait class that provides a mapping between BPI passes and the
+/// Simple trait class that provides a mapping between BPI passes and the
/// corresponding BPInfo.
template <typename PassT> struct BPIPassTrait {
static PassT &getBPI(PassT *P) { return *P; }
diff --git a/contrib/llvm/include/llvm/Analysis/LazyValueInfo.h b/contrib/llvm/include/llvm/Analysis/LazyValueInfo.h
index 787c88cc6ec1..1a4fdb591427 100644
--- a/contrib/llvm/include/llvm/Analysis/LazyValueInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/LazyValueInfo.h
@@ -113,6 +113,13 @@ public:
/// in LVI, so we need to pass it here as an argument.
void printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS);
+ /// Disables use of the DominatorTree within LVI.
+ void disableDT();
+
+ /// Enables use of the DominatorTree within LVI. Does nothing if the class
+ /// instance was initialized without a DT pointer.
+ void enableDT();
+
// For old PM pass. Delete once LazyValueInfoWrapperPass is gone.
void releaseMemory();
@@ -121,7 +128,7 @@ public:
FunctionAnalysisManager::Invalidator &Inv);
};
-/// \brief Analysis to compute lazy value information.
+/// Analysis to compute lazy value information.
class LazyValueAnalysis : public AnalysisInfoMixin<LazyValueAnalysis> {
public:
typedef LazyValueInfo Result;
diff --git a/contrib/llvm/include/llvm/Analysis/Lint.h b/contrib/llvm/include/llvm/Analysis/Lint.h
index 7c88b137ec3b..db5919fd91c7 100644
--- a/contrib/llvm/include/llvm/Analysis/Lint.h
+++ b/contrib/llvm/include/llvm/Analysis/Lint.h
@@ -26,12 +26,12 @@ class FunctionPass;
class Module;
class Function;
-/// @brief Create a lint pass.
+/// Create a lint pass.
///
/// Check a module or function.
FunctionPass *createLintPass();
-/// @brief Check a module.
+/// Check a module.
///
/// This should only be used for debugging, because it plays games with
/// PassManagers and stuff.
diff --git a/contrib/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/contrib/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 28154c873b70..0f3f2be9aeb4 100644
--- a/contrib/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -38,25 +38,25 @@ class SCEVUnionPredicate;
class LoopAccessInfo;
class OptimizationRemarkEmitter;
-/// \brief Collection of parameters shared beetween the Loop Vectorizer and the
+/// Collection of parameters shared between the Loop Vectorizer and the
/// Loop Access Analysis.
struct VectorizerParams {
- /// \brief Maximum SIMD width.
+ /// Maximum SIMD width.
static const unsigned MaxVectorWidth;
- /// \brief VF as overridden by the user.
+ /// VF as overridden by the user.
static unsigned VectorizationFactor;
- /// \brief Interleave factor as overridden by the user.
+ /// Interleave factor as overridden by the user.
static unsigned VectorizationInterleave;
- /// \brief True if force-vector-interleave was specified by the user.
+ /// True if force-vector-interleave was specified by the user.
static bool isInterleaveForced();
- /// \\brief When performing memory disambiguation checks at runtime do not
+ /// When performing memory disambiguation checks at runtime do not
/// make more than this number of comparisons.
static unsigned RuntimeMemoryCheckThreshold;
};
-/// \brief Checks memory dependences among accesses to the same underlying
+/// Checks memory dependences among accesses to the same underlying
/// object to determine whether vectorization is legal or not (and at
/// which vectorization factor).
///
@@ -94,12 +94,12 @@ class MemoryDepChecker {
public:
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
- /// \brief Set of potential dependent memory accesses.
+ /// Set of potential dependent memory accesses.
typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
- /// \brief Dependece between memory access instructions.
+ /// Dependence between memory access instructions.
struct Dependence {
- /// \brief The type of the dependence.
+ /// The type of the dependence.
enum DepType {
// No dependence.
NoDep,
@@ -127,36 +127,36 @@ public:
BackwardVectorizableButPreventsForwarding
};
- /// \brief String version of the types.
+ /// String version of the types.
static const char *DepName[];
- /// \brief Index of the source of the dependence in the InstMap vector.
+ /// Index of the source of the dependence in the InstMap vector.
unsigned Source;
- /// \brief Index of the destination of the dependence in the InstMap vector.
+ /// Index of the destination of the dependence in the InstMap vector.
unsigned Destination;
- /// \brief The type of the dependence.
+ /// The type of the dependence.
DepType Type;
Dependence(unsigned Source, unsigned Destination, DepType Type)
: Source(Source), Destination(Destination), Type(Type) {}
- /// \brief Return the source instruction of the dependence.
+ /// Return the source instruction of the dependence.
Instruction *getSource(const LoopAccessInfo &LAI) const;
- /// \brief Return the destination instruction of the dependence.
+ /// Return the destination instruction of the dependence.
Instruction *getDestination(const LoopAccessInfo &LAI) const;
- /// \brief Dependence types that don't prevent vectorization.
+ /// Dependence types that don't prevent vectorization.
static bool isSafeForVectorization(DepType Type);
- /// \brief Lexically forward dependence.
+ /// Lexically forward dependence.
bool isForward() const;
- /// \brief Lexically backward dependence.
+ /// Lexically backward dependence.
bool isBackward() const;
- /// \brief May be a lexically backward dependence type (includes Unknown).
+ /// May be a lexically backward dependence type (includes Unknown).
bool isPossiblyBackward() const;
- /// \brief Print the dependence. \p Instr is used to map the instruction
+ /// Print the dependence. \p Instrs is used to map the instruction
/// indices to instructions.
void print(raw_ostream &OS, unsigned Depth,
const SmallVectorImpl<Instruction *> &Instrs) const;
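For orientation, a consumer of this interface might scan the recorded dependences once the analysis has run. This is only a sketch (not part of the patch), assuming LAI is the LoopAccessInfo for the loop and DepChecker is its MemoryDepChecker:
// Sketch: reject vectorization on the first dependence that
// isSafeForVectorization() rules out.
bool Safe = true;
if (const auto *Deps = DepChecker.getDependences())
  for (const MemoryDepChecker::Dependence &D : *Deps)
    if (!MemoryDepChecker::Dependence::isSafeForVectorization(D.Type)) {
      // D.getSource(LAI) and D.getDestination(LAI) identify the two
      // offending memory instructions.
      Safe = false;
      break;
    }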
@@ -167,7 +167,7 @@ public:
ShouldRetryWithRuntimeCheck(false), SafeForVectorization(true),
RecordDependences(true) {}
- /// \brief Register the location (instructions are given increasing numbers)
+ /// Register the location (instructions are given increasing numbers)
/// of a write access.
void addAccess(StoreInst *SI) {
Value *Ptr = SI->getPointerOperand();
@@ -176,7 +176,7 @@ public:
++AccessIdx;
}
- /// \brief Register the location (instructions are given increasing numbers)
+ /// Register the location (instructions are given increasing numbers)
/// of a read access.
void addAccess(LoadInst *LI) {
Value *Ptr = LI->getPointerOperand();
@@ -185,29 +185,29 @@ public:
++AccessIdx;
}
- /// \brief Check whether the dependencies between the accesses are safe.
+ /// Check whether the dependencies between the accesses are safe.
///
/// Only checks sets with elements in \p CheckDeps.
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
const ValueToValueMap &Strides);
- /// \brief No memory dependence was encountered that would inhibit
+ /// No memory dependence was encountered that would inhibit
/// vectorization.
bool isSafeForVectorization() const { return SafeForVectorization; }
- /// \brief The maximum number of bytes of a vector register we can vectorize
+ /// The maximum number of bytes of a vector register we can vectorize
/// the accesses safely with.
uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
- /// \brief Return the number of elements that are safe to operate on
+ /// Return the number of elements that are safe to operate on
/// simultaneously, multiplied by the size of the element in bits.
uint64_t getMaxSafeRegisterWidth() const { return MaxSafeRegisterWidth; }
- /// \brief In same cases when the dependency check fails we can still
+ /// In some cases when the dependency check fails we can still
/// vectorize the loop with a dynamic array access check.
bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
- /// \brief Returns the memory dependences. If null is returned we exceeded
+ /// Returns the memory dependences. If null is returned we exceeded
/// the MaxDependences threshold and this information is not
/// available.
const SmallVectorImpl<Dependence> *getDependences() const {
@@ -216,13 +216,13 @@ public:
void clearDependences() { Dependences.clear(); }
- /// \brief The vector of memory access instructions. The indices are used as
+ /// The vector of memory access instructions. The indices are used as
/// instruction identifiers in the Dependence class.
const SmallVectorImpl<Instruction *> &getMemoryInstructions() const {
return InstMap;
}
- /// \brief Generate a mapping between the memory instructions and their
+ /// Generate a mapping between the memory instructions and their
/// indices according to program order.
DenseMap<Instruction *, unsigned> generateInstructionOrderMap() const {
DenseMap<Instruction *, unsigned> OrderMap;
@@ -233,7 +233,7 @@ public:
return OrderMap;
}
- /// \brief Find the set of instructions that read or write via \p Ptr.
+ /// Find the set of instructions that read or write via \p Ptr.
SmallVector<Instruction *, 4> getInstructionsForAccess(Value *Ptr,
bool isWrite) const;
@@ -247,42 +247,42 @@ private:
PredicatedScalarEvolution &PSE;
const Loop *InnermostLoop;
- /// \brief Maps access locations (ptr, read/write) to program order.
+ /// Maps access locations (ptr, read/write) to program order.
DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
- /// \brief Memory access instructions in program order.
+ /// Memory access instructions in program order.
SmallVector<Instruction *, 16> InstMap;
- /// \brief The program order index to be used for the next instruction.
+ /// The program order index to be used for the next instruction.
unsigned AccessIdx;
// We can access this many bytes in parallel safely.
uint64_t MaxSafeDepDistBytes;
- /// \brief Number of elements (from consecutive iterations) that are safe to
+ /// Number of elements (from consecutive iterations) that are safe to
/// operate on simultaneously, multiplied by the size of the element in bits.
/// The size of the element is taken from the memory access that is most
/// restrictive.
uint64_t MaxSafeRegisterWidth;
- /// \brief If we see a non-constant dependence distance we can still try to
+ /// If we see a non-constant dependence distance we can still try to
/// vectorize this loop with runtime checks.
bool ShouldRetryWithRuntimeCheck;
- /// \brief No memory dependence was encountered that would inhibit
+ /// No memory dependence was encountered that would inhibit
/// vectorization.
bool SafeForVectorization;
- //// \brief True if Dependences reflects the dependences in the
+ /// True if Dependences reflects the dependences in the
/// loop. If false we exceeded MaxDependences and
/// Dependences is invalid.
bool RecordDependences;
- /// \brief Memory dependences collected during the analysis. Only valid if
+ /// Memory dependences collected during the analysis. Only valid if
/// RecordDependences is true.
SmallVector<Dependence, 8> Dependences;
- /// \brief Check whether there is a plausible dependence between the two
+ /// Check whether there is a plausible dependence between the two
/// accesses.
///
/// Access \p A must happen before \p B in program order. The two indices
@@ -298,7 +298,7 @@ private:
const MemAccessInfo &B, unsigned BIdx,
const ValueToValueMap &Strides);
- /// \brief Check whether the data dependence could prevent store-load
+ /// Check whether the data dependence could prevent store-load
/// forwarding.
///
/// \return false if we shouldn't vectorize at all or avoid larger
@@ -306,7 +306,7 @@ private:
bool couldPreventStoreLoadForward(uint64_t Distance, uint64_t TypeByteSize);
};
-/// \brief Holds information about the memory runtime legality checks to verify
+/// Holds information about the memory runtime legality checks to verify
/// that a group of pointers do not overlap.
class RuntimePointerChecking {
public:
@@ -355,13 +355,13 @@ public:
unsigned ASId, const ValueToValueMap &Strides,
PredicatedScalarEvolution &PSE);
- /// \brief No run-time memory checking is necessary.
+ /// No run-time memory checking is necessary.
bool empty() const { return Pointers.empty(); }
/// A grouping of pointers. A single memcheck is required between
/// two groups.
struct CheckingPtrGroup {
- /// \brief Create a new pointer checking group containing a single
+ /// Create a new pointer checking group containing a single
/// pointer, with index \p Index in RtCheck.
CheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
: RtCheck(RtCheck), High(RtCheck.Pointers[Index].End),
@@ -369,7 +369,7 @@ public:
Members.push_back(Index);
}
- /// \brief Tries to add the pointer recorded in RtCheck at index
+ /// Tries to add the pointer recorded in RtCheck at index
/// \p Index to this pointer checking group. We can only add a pointer
/// to a checking group if we will still be able to get
/// the upper and lower bounds of the check. Returns true in case
@@ -390,7 +390,7 @@ public:
SmallVector<unsigned, 2> Members;
};
- /// \brief A memcheck which made up of a pair of grouped pointers.
+ /// A memcheck which is made up of a pair of grouped pointers.
///
/// These *have* to be const for now, since checks are generated from
/// CheckingPtrGroups in LAI::addRuntimeChecks which is a const member
@@ -399,24 +399,24 @@ public:
typedef std::pair<const CheckingPtrGroup *, const CheckingPtrGroup *>
PointerCheck;
- /// \brief Generate the checks and store it. This also performs the grouping
+ /// Generate the checks and store them. This also performs the grouping
/// of pointers to reduce the number of memchecks necessary.
void generateChecks(MemoryDepChecker::DepCandidates &DepCands,
bool UseDependencies);
- /// \brief Returns the checks that generateChecks created.
+ /// Returns the checks that generateChecks created.
const SmallVector<PointerCheck, 4> &getChecks() const { return Checks; }
- /// \brief Decide if we need to add a check between two groups of pointers,
+ /// Decide if we need to add a check between two groups of pointers,
/// according to needsChecking.
bool needsChecking(const CheckingPtrGroup &M,
const CheckingPtrGroup &N) const;
- /// \brief Returns the number of run-time checks required according to
+ /// Returns the number of run-time checks required according to
/// needsChecking.
unsigned getNumberOfChecks() const { return Checks.size(); }
- /// \brief Print the list run-time memory checks necessary.
+ /// Print the list of run-time memory checks necessary.
void print(raw_ostream &OS, unsigned Depth = 0) const;
/// Print \p Checks.
@@ -432,7 +432,7 @@ public:
/// Holds a partitioning of pointers into "check groups".
SmallVector<CheckingPtrGroup, 2> CheckingGroups;
- /// \brief Check if pointers are in the same partition
+ /// Check if pointers are in the same partition
///
/// \p PtrToPartition contains the partition number for pointers (-1 if the
/// pointer belongs to multiple partitions).
@@ -440,17 +440,17 @@ public:
arePointersInSamePartition(const SmallVectorImpl<int> &PtrToPartition,
unsigned PtrIdx1, unsigned PtrIdx2);
- /// \brief Decide whether we need to issue a run-time check for pointer at
+ /// Decide whether we need to issue a run-time check for pointer at
/// index \p I and \p J to prove their independence.
bool needsChecking(unsigned I, unsigned J) const;
- /// \brief Return PointerInfo for pointer at index \p PtrIdx.
+ /// Return PointerInfo for pointer at index \p PtrIdx.
const PointerInfo &getPointerInfo(unsigned PtrIdx) const {
return Pointers[PtrIdx];
}
private:
- /// \brief Groups pointers such that a single memcheck is required
+ /// Groups pointers such that a single memcheck is required
/// between two different groups. This will clear the CheckingGroups vector
/// and re-compute it. We will only group dependencies if \p UseDependencies
/// is true, otherwise we will create a separate group for each pointer.
@@ -464,12 +464,12 @@ private:
/// Holds a pointer to the ScalarEvolution analysis.
ScalarEvolution *SE;
- /// \brief Set of run-time checks required to establish independence of
+ /// Set of run-time checks required to establish independence of
/// otherwise may-aliasing pointers in the loop.
SmallVector<PointerCheck, 4> Checks;
};
-/// \brief Drive the analysis of memory accesses in the loop
+/// Drive the analysis of memory accesses in the loop
///
/// This class is responsible for analyzing the memory accesses of a loop. It
/// collects the accesses and then its main helper the AccessAnalysis class
@@ -503,7 +503,7 @@ public:
return PtrRtChecking.get();
}
- /// \brief Number of memchecks required to prove independence of otherwise
+ /// Number of memchecks required to prove independence of otherwise
/// may-alias pointers.
unsigned getNumRuntimePointerChecks() const {
return PtrRtChecking->getNumberOfChecks();
@@ -521,7 +521,7 @@ public:
unsigned getNumStores() const { return NumStores; }
unsigned getNumLoads() const { return NumLoads;}
- /// \brief Add code that checks at runtime if the accessed arrays overlap.
+ /// Add code that checks at runtime if the accessed arrays overlap.
///
/// Returns a pair of instructions where the first element is the first
/// instruction generated, possibly as part of a sequence of instructions, and the
@@ -529,7 +529,7 @@ public:
std::pair<Instruction *, Instruction *>
addRuntimeChecks(Instruction *Loc) const;
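As a hedged usage sketch (not from the patch), a transform that has proven the loop vectorizable except for possible overlaps could emit the checks in the preheader; Preheader and LAI are assumed names:
  // Sketch: materialize the runtime overlap checks before entering the loop.
  std::pair<Instruction *, Instruction *> Bounds =
      LAI.addRuntimeChecks(Preheader->getTerminator());
  // Bounds.first .. Bounds.second bracket the generated check sequence.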
- /// \brief Generete the instructions for the checks in \p PointerChecks.
+ /// Generate the instructions for the checks in \p PointerChecks.
///
/// Returns a pair of instructions where the first element is the first
/// instruction generated, possibly as part of a sequence of instructions, and the
@@ -539,32 +539,32 @@ public:
const SmallVectorImpl<RuntimePointerChecking::PointerCheck>
&PointerChecks) const;
- /// \brief The diagnostics report generated for the analysis. E.g. why we
+ /// The diagnostics report generated for the analysis. E.g. why we
/// couldn't analyze the loop.
const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
- /// \brief the Memory Dependence Checker which can determine the
+ /// The Memory Dependence Checker which can determine the
/// loop-independent and loop-carried dependences between memory accesses.
const MemoryDepChecker &getDepChecker() const { return *DepChecker; }
- /// \brief Return the list of instructions that use \p Ptr to read or write
+ /// Return the list of instructions that use \p Ptr to read or write
/// memory.
SmallVector<Instruction *, 4> getInstructionsForAccess(Value *Ptr,
bool isWrite) const {
return DepChecker->getInstructionsForAccess(Ptr, isWrite);
}
- /// \brief If an access has a symbolic strides, this maps the pointer value to
+ /// If an access has a symbolic strides, this maps the pointer value to
/// the stride symbol.
const ValueToValueMap &getSymbolicStrides() const { return SymbolicStrides; }
- /// \brief Pointer has a symbolic stride.
+ /// Pointer has a symbolic stride.
bool hasStride(Value *V) const { return StrideSet.count(V); }
- /// \brief Print the information about the memory accesses in the loop.
+ /// Print the information about the memory accesses in the loop.
void print(raw_ostream &OS, unsigned Depth = 0) const;
- /// \brief Checks existence of store to invariant address inside loop.
+ /// Checks for the existence of a store to an invariant address inside the loop.
/// If the loop has any store to an invariant address, then it returns true,
/// else returns false.
bool hasStoreToLoopInvariantAddress() const {
@@ -579,15 +579,15 @@ public:
const PredicatedScalarEvolution &getPSE() const { return *PSE; }
private:
- /// \brief Analyze the loop.
+ /// Analyze the loop.
void analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
const TargetLibraryInfo *TLI, DominatorTree *DT);
- /// \brief Check if the structure of the loop allows it to be analyzed by this
+ /// Check if the structure of the loop allows it to be analyzed by this
/// pass.
bool canAnalyzeLoop();
- /// \brief Save the analysis remark.
+ /// Save the analysis remark.
///
/// LAA does not directly emit the remarks. Instead it stores them so that the
/// client can retrieve and present them as its own analysis
@@ -595,7 +595,7 @@ private:
OptimizationRemarkAnalysis &recordAnalysis(StringRef RemarkName,
Instruction *Instr = nullptr);
- /// \brief Collect memory access with loop invariant strides.
+ /// Collect memory accesses with loop invariant strides.
///
/// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
/// invariant.
@@ -607,7 +607,7 @@ private:
/// at runtime. Using std::unique_ptr to make using move ctor simpler.
std::unique_ptr<RuntimePointerChecking> PtrRtChecking;
- /// \brief the Memory Dependence Checker which can determine the
+ /// The Memory Dependence Checker which can determine the
/// loop-independent and loop-carried dependences between memory accesses.
std::unique_ptr<MemoryDepChecker> DepChecker;
@@ -618,28 +618,28 @@ private:
uint64_t MaxSafeDepDistBytes;
- /// \brief Cache the result of analyzeLoop.
+ /// Cache the result of analyzeLoop.
bool CanVecMem;
- /// \brief Indicator for storing to uniform addresses.
+ /// Indicator for storing to uniform addresses.
/// If a loop has a write to a loop invariant address then it should be true.
bool StoreToLoopInvariantAddress;
- /// \brief The diagnostics report generated for the analysis. E.g. why we
+ /// The diagnostics report generated for the analysis. E.g. why we
/// couldn't analyze the loop.
std::unique_ptr<OptimizationRemarkAnalysis> Report;
- /// \brief If an access has a symbolic strides, this maps the pointer value to
+ /// If an access has a symbolic strides, this maps the pointer value to
/// the stride symbol.
ValueToValueMap SymbolicStrides;
- /// \brief Set of symbolic strides values.
+ /// Set of symbolic strides values.
SmallPtrSet<Value *, 8> StrideSet;
};
Value *stripIntegerCast(Value *V);
-/// \brief Return the SCEV corresponding to a pointer with the symbolic stride
+/// Return the SCEV corresponding to a pointer with the symbolic stride
/// replaced with constant one, assuming the SCEV predicate associated with
/// \p PSE is true.
///
@@ -653,7 +653,7 @@ const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
const ValueToValueMap &PtrToStride,
Value *Ptr, Value *OrigPtr = nullptr);
-/// \brief If the pointer has a constant stride return it in units of its
+/// If the pointer has a constant stride return it in units of its
/// element size. Otherwise return zero.
///
/// Ensure that it does not wrap in the address space, assuming the predicate
@@ -667,12 +667,26 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap = ValueToValueMap(),
bool Assume = false, bool ShouldCheckWrap = true);
-/// \brief Returns true if the memory operations \p A and \p B are consecutive.
+/// Attempt to sort the pointers in \p VL and return the sorted indices
+/// in \p SortedIndices, if reordering is required.
+///
+/// Returns 'true' if sorting is legal, otherwise returns 'false'.
+///
+/// For example, for a given \p VL of memory accesses in program order, a[i+4],
+/// a[i+0], a[i+1] and a[i+7], this function computes the sorted order a[i+0],
+/// a[i+1], a[i+4], a[i+7] and saves the mask for the actual memory accesses in
+/// program order in \p SortedIndices as <1,2,0,3>.
+bool sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+ ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices);
+
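A minimal usage sketch (assumed caller, not part of this change) that recovers the sorted order of a bundle of pointer operands; VL, DL and SE are assumed to come from the enclosing pass:
// Sketch: VL holds the pointer operands of a group of scalar accesses in
// program order; DL and SE are the function's DataLayout and ScalarEvolution.
SmallVector<unsigned, 4> SortedIndices;
if (sortPtrAccesses(VL, DL, SE, SortedIndices)) {
  // If reordering was required, SortedIndices[I] names the position in VL
  // of the I-th access in sorted (address) order.
}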
+/// Returns true if the memory operations \p A and \p B are consecutive.
/// This is a simple API that does not depend on the analysis pass.
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
ScalarEvolution &SE, bool CheckType = true);
-/// \brief This analysis provides dependence information for the memory accesses
+/// This analysis provides dependence information for the memory accesses
/// of a loop.
///
/// It runs the analysis for a loop on demand. This can be initiated by
@@ -691,7 +705,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
- /// \brief Query the result of the loop access information for the loop \p L.
+ /// Query the result of the loop access information for the loop \p L.
///
/// If there is no cached result available run the analysis.
const LoopAccessInfo &getInfo(Loop *L);
@@ -701,11 +715,11 @@ public:
LoopAccessInfoMap.clear();
}
- /// \brief Print the result of the analysis when invoked with -analyze.
+ /// Print the result of the analysis when invoked with -analyze.
void print(raw_ostream &OS, const Module *M = nullptr) const override;
private:
- /// \brief The cache.
+ /// The cache.
DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
// The used analysis passes.
@@ -716,7 +730,7 @@ private:
LoopInfo *LI;
};
-/// \brief This analysis provides dependence information for the memory
+/// This analysis provides dependence information for the memory
/// accesses of a loop.
///
/// It runs the analysis for a loop on demand. This can be initiated by
diff --git a/contrib/llvm/include/llvm/Analysis/LoopAnalysisManager.h b/contrib/llvm/include/llvm/Analysis/LoopAnalysisManager.h
index 417ee979ce97..00e562c4f31f 100644
--- a/contrib/llvm/include/llvm/Analysis/LoopAnalysisManager.h
+++ b/contrib/llvm/include/llvm/Analysis/LoopAnalysisManager.h
@@ -69,7 +69,7 @@ extern cl::opt<bool> EnableMSSALoopDependency;
extern template class AllAnalysesOn<Loop>;
extern template class AnalysisManager<Loop, LoopStandardAnalysisResults &>;
-/// \brief The loop analysis manager.
+/// The loop analysis manager.
///
/// See the documentation for the AnalysisManager template for detail
/// documentation. This typedef serves as a convenient way to refer to this
diff --git a/contrib/llvm/include/llvm/Analysis/LoopInfo.h b/contrib/llvm/include/llvm/Analysis/LoopInfo.h
index 28afc39727fa..30b29d66a1d1 100644
--- a/contrib/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/LoopInfo.h
@@ -178,6 +178,12 @@ public:
return DenseBlockSet;
}
+ /// Return a direct, immutable handle to the blocks set.
+ const SmallPtrSetImpl<const BlockT *> &getBlocksSet() const {
+ assert(!isInvalid() && "Loop not in a valid state!");
+ return DenseBlockSet;
+ }
+
/// Return true if this loop is no longer valid. The only valid use of this
/// helper is "assert(L.isInvalid())" or equivalent, since IsInvalid is set to
/// true by the destructor. In other words, if this accessor returns true,
@@ -255,6 +261,20 @@ public:
/// Otherwise return null.
BlockT *getExitBlock() const;
+ /// Return true if no exit block for the loop has a predecessor that is
+ /// outside the loop.
+ bool hasDedicatedExits() const;
+
+ /// Return all unique successor blocks of this loop.
+ /// These are the blocks _outside of the current loop_ which are branched to.
+ /// This assumes that loop exits are in canonical form, i.e. all exits are
+ /// dedicated exits.
+ void getUniqueExitBlocks(SmallVectorImpl<BlockT *> &ExitBlocks) const;
+
+ /// If getUniqueExitBlocks would return exactly one block, return that block.
+ /// Otherwise return null.
+ BlockT *getUniqueExitBlock() const;
+
/// Edge type.
typedef std::pair<const BlockT *, const BlockT *> Edge;
@@ -438,7 +458,7 @@ extern template class LoopBase<BasicBlock, Loop>;
/// in the CFG are necessarily loops.
class Loop : public LoopBase<BasicBlock, Loop> {
public:
- /// \brief A range representing the start and end location of a loop.
+ /// A range representing the start and end location of a loop.
class LocRange {
DebugLoc Start;
DebugLoc End;
@@ -452,7 +472,7 @@ public:
const DebugLoc &getStart() const { return Start; }
const DebugLoc &getEnd() const { return End; }
- /// \brief Check for null.
+ /// Check for null.
///
explicit operator bool() const { return Start && End; }
};
@@ -527,7 +547,7 @@ public:
///
/// If this loop contains the same llvm.loop metadata on each branch to the
/// header then the node is returned. If any latch instruction does not
- /// contain llvm.loop or or if multiple latches contain different nodes then
+ /// contain llvm.loop or if multiple latches contain different nodes then
/// 0 is returned.
MDNode *getLoopID() const;
/// Set the llvm.loop loop id metadata for this loop.
@@ -547,20 +567,6 @@ public:
/// unrolling pass is run more than once (which it generally is).
void setLoopAlreadyUnrolled();
- /// Return true if no exit block for the loop has a predecessor that is
- /// outside the loop.
- bool hasDedicatedExits() const;
-
- /// Return all unique successor blocks of this loop.
- /// These are the blocks _outside of the current loop_ which are branched to.
- /// This assumes that loop exits are in canonical form, i.e. all exits are
- /// dedicated exits.
- void getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const;
-
- /// If getUniqueExitBlocks would return exactly one block, return that block.
- /// Otherwise return null.
- BasicBlock *getUniqueExitBlock() const;
-
void dump() const;
void dumpVerbose() const;
@@ -929,7 +935,7 @@ template <> struct GraphTraits<Loop *> {
static ChildIteratorType child_end(NodeRef N) { return N->end(); }
};
-/// \brief Analysis pass that exposes the \c LoopInfo for a function.
+/// Analysis pass that exposes the \c LoopInfo for a function.
class LoopAnalysis : public AnalysisInfoMixin<LoopAnalysis> {
friend AnalysisInfoMixin<LoopAnalysis>;
static AnalysisKey Key;
@@ -940,7 +946,7 @@ public:
LoopInfo run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Printer pass for the \c LoopAnalysis results.
+/// Printer pass for the \c LoopAnalysis results.
class LoopPrinterPass : public PassInfoMixin<LoopPrinterPass> {
raw_ostream &OS;
@@ -949,12 +955,12 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Verifier pass for the \c LoopAnalysis results.
+/// Verifier pass for the \c LoopAnalysis results.
struct LoopVerifierPass : public PassInfoMixin<LoopVerifierPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief The legacy pass manager's analysis pass to compute loop information.
+/// The legacy pass manager's analysis pass to compute loop information.
class LoopInfoWrapperPass : public FunctionPass {
LoopInfo LI;
@@ -968,7 +974,7 @@ public:
LoopInfo &getLoopInfo() { return LI; }
const LoopInfo &getLoopInfo() const { return LI; }
- /// \brief Calculate the natural loop information for a given function.
+ /// Calculate the natural loop information for a given function.
bool runOnFunction(Function &F) override;
void verifyAnalysis() const override;
diff --git a/contrib/llvm/include/llvm/Analysis/LoopInfoImpl.h b/contrib/llvm/include/llvm/Analysis/LoopInfoImpl.h
index b3a16b5369f7..941389858868 100644
--- a/contrib/llvm/include/llvm/Analysis/LoopInfoImpl.h
+++ b/contrib/llvm/include/llvm/Analysis/LoopInfoImpl.h
@@ -82,6 +82,74 @@ BlockT *LoopBase<BlockT, LoopT>::getExitBlock() const {
return nullptr;
}
+template <class BlockT, class LoopT>
+bool LoopBase<BlockT, LoopT>::hasDedicatedExits() const {
+ // Each predecessor of each exit block of a normal loop is contained
+ // within the loop.
+ SmallVector<BlockT *, 4> ExitBlocks;
+ getExitBlocks(ExitBlocks);
+ for (BlockT *EB : ExitBlocks)
+ for (BlockT *Predecessor : children<Inverse<BlockT *>>(EB))
+ if (!contains(Predecessor))
+ return false;
+ // All the requirements are met.
+ return true;
+}
+
+template <class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::getUniqueExitBlocks(
+ SmallVectorImpl<BlockT *> &ExitBlocks) const {
+ typedef GraphTraits<BlockT *> BlockTraits;
+ typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits;
+
+ assert(hasDedicatedExits() &&
+ "getUniqueExitBlocks assumes the loop has canonical form exits!");
+
+ SmallVector<BlockT *, 32> SwitchExitBlocks;
+ for (BlockT *Block : this->blocks()) {
+ SwitchExitBlocks.clear();
+ for (BlockT *Successor : children<BlockT *>(Block)) {
+ // If block is inside the loop then it is not an exit block.
+ if (contains(Successor))
+ continue;
+
+ BlockT *FirstPred = *InvBlockTraits::child_begin(Successor);
+
+ // Only insert the exit block into the output ExitBlocks vector if the
+ // current basic block is this exit block's first predecessor. This ensures
+ // that the same exit block is not inserted twice into the ExitBlocks vector.
+ if (Block != FirstPred)
+ continue;
+
+ // If a terminator has more than two successors, for example SwitchInst,
+ // then it is possible that there are multiple edges from the current block
+ // to one exit block.
+ if (std::distance(BlockTraits::child_begin(Block),
+ BlockTraits::child_end(Block)) <= 2) {
+ ExitBlocks.push_back(Successor);
+ continue;
+ }
+
+ // In case of multiple edges from the current block to the exit block,
+ // collect only one edge in ExitBlocks. Use SwitchExitBlocks to keep track
+ // of duplicate edges.
+ if (!is_contained(SwitchExitBlocks, Successor)) {
+ SwitchExitBlocks.push_back(Successor);
+ ExitBlocks.push_back(Successor);
+ }
+ }
+ }
+}
+
+template <class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getUniqueExitBlock() const {
+ SmallVector<BlockT *, 8> UniqueExitBlocks;
+ getUniqueExitBlocks(UniqueExitBlocks);
+ if (UniqueExitBlocks.size() == 1)
+ return UniqueExitBlocks[0];
+ return nullptr;
+}
+
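These helpers are now available on the generic LoopBase template; a brief sketch of the intended call pattern (assuming L is a Loop * whose exits are in canonical, LoopSimplify form):
// Sketch: the unique-exit queries assume dedicated exits.
if (L->hasDedicatedExits()) {
  if (BasicBlock *UniqueExit = L->getUniqueExitBlock()) {
    // Exactly one block outside the loop is reachable from inside it.
  }
}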
/// getExitEdges - Return all pairs of (_inside_block_,_outside_block_).
template <class BlockT, class LoopT>
void LoopBase<BlockT, LoopT>::getExitEdges(
@@ -572,8 +640,8 @@ void LoopInfoBase<BlockT, LoopT>::print(raw_ostream &OS) const {
template <typename T>
bool compareVectors(std::vector<T> &BB1, std::vector<T> &BB2) {
- std::sort(BB1.begin(), BB1.end());
- std::sort(BB2.begin(), BB2.end());
+ llvm::sort(BB1.begin(), BB1.end());
+ llvm::sort(BB2.begin(), BB2.end());
return BB1 == BB2;
}
@@ -617,6 +685,15 @@ static void compareLoops(const LoopT *L, const LoopT *OtherL,
std::vector<BlockT *> OtherBBs = OtherL->getBlocks();
assert(compareVectors(BBs, OtherBBs) &&
"Mismatched basic blocks in the loops!");
+
+ const SmallPtrSetImpl<const BlockT *> &BlocksSet = L->getBlocksSet();
+ const SmallPtrSetImpl<const BlockT *> &OtherBlocksSet = OtherL->getBlocksSet();
+ assert(BlocksSet.size() == OtherBlocksSet.size() &&
+ std::all_of(BlocksSet.begin(), BlocksSet.end(),
+ [&OtherBlocksSet](const BlockT *BB) {
+ return OtherBlocksSet.count(BB);
+ }) &&
+ "Mismatched basic blocks in BlocksSets!");
}
#endif
@@ -636,6 +713,9 @@ void LoopInfoBase<BlockT, LoopT>::verify(
LoopT *L = Entry.second;
assert(Loops.count(L) && "orphaned loop");
assert(L->contains(BB) && "orphaned block");
+ for (LoopT *ChildLoop : *L)
+ assert(!ChildLoop->contains(BB) &&
+ "BBMap should point to the innermost loop containing BB");
}
// Recompute LoopInfo to verify loops structure.
diff --git a/contrib/llvm/include/llvm/Analysis/LoopIterator.h b/contrib/llvm/include/llvm/Analysis/LoopIterator.h
index 461f74351821..91c54b23029b 100644
--- a/contrib/llvm/include/llvm/Analysis/LoopIterator.h
+++ b/contrib/llvm/include/llvm/Analysis/LoopIterator.h
@@ -168,6 +168,25 @@ public:
}
};
+/// Wrapper class to LoopBlocksDFS that provides a standard begin()/end()
+/// interface for the DFS reverse post-order traversal of blocks in a loop body.
+class LoopBlocksRPO {
+private:
+ LoopBlocksDFS DFS;
+
+public:
+ LoopBlocksRPO(Loop *Container) : DFS(Container) {}
+
+ /// Traverse the loop blocks and store the DFS result.
+ void perform(LoopInfo *LI) {
+ DFS.perform(LI);
+ }
+
+ /// Reverse iterate over the cached postorder blocks.
+ LoopBlocksDFS::RPOIterator begin() const { return DFS.beginRPO(); }
+ LoopBlocksDFS::RPOIterator end() const { return DFS.endRPO(); }
+};
+
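A short sketch of how the new wrapper is expected to be used (L and LI are assumed to be the loop and its LoopInfo):
// Sketch: visit the body of L in reverse post-order.
LoopBlocksRPO RPOT(L);
RPOT.perform(LI);
for (BasicBlock *BB : RPOT) {
  // Every in-loop predecessor of BB (ignoring back edges) has already been
  // visited by the time we get here.
}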
/// Specialize po_iterator_storage to record postorder numbers.
template<> class po_iterator_storage<LoopBlocksTraversal, true> {
LoopBlocksTraversal &LBT;
diff --git a/contrib/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h b/contrib/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
index 80f3e5fdcd43..f45bf0b223b8 100644
--- a/contrib/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
+++ b/contrib/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
@@ -57,7 +57,7 @@ public:
using Base::visit;
private:
- /// \brief A cache of pointer bases and constant-folded offsets corresponding
+ /// A cache of pointer bases and constant-folded offsets corresponding
/// to GEP (or derived from GEP) instructions.
///
/// In order to find the base pointer one needs to perform non-trivial
@@ -65,11 +65,11 @@ private:
/// results saved.
DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses;
- /// \brief SCEV expression corresponding to number of currently simulated
+ /// SCEV expression corresponding to the number of the currently simulated
/// iteration.
const SCEV *IterationNumber;
- /// \brief A Value->Constant map for keeping values that we managed to
+ /// A Value->Constant map for keeping values that we managed to
/// constant-fold on the given iteration.
///
/// While we walk the loop instructions, we build up and maintain a mapping
diff --git a/contrib/llvm/include/llvm/Analysis/MemoryBuiltins.h b/contrib/llvm/include/llvm/Analysis/MemoryBuiltins.h
index 7d53e34938b7..5418128f16ef 100644
--- a/contrib/llvm/include/llvm/Analysis/MemoryBuiltins.h
+++ b/contrib/llvm/include/llvm/Analysis/MemoryBuiltins.h
@@ -53,33 +53,33 @@ class Type;
class UndefValue;
class Value;
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
bool isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false);
-/// \brief Tests if a value is a call or invoke to a function that returns a
+/// Tests if a value is a call or invoke to a function that returns a
/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
bool isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false);
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory (such as malloc).
bool isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false);
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false);
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates memory similar to malloc or calloc.
bool isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false);
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
bool isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false);
@@ -170,14 +170,14 @@ struct ObjectSizeOpts {
bool NullIsUnknownSize = false;
};
-/// \brief Compute the size of the object pointed by Ptr. Returns true and the
+/// Compute the size of the object pointed to by Ptr. Returns true and the
/// object size in Size if successful, and false otherwise. In this context, by
/// object we mean the region of memory starting at Ptr to the end of the
/// underlying object pointed to by Ptr.
bool getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
const TargetLibraryInfo *TLI, ObjectSizeOpts Opts = {});
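For orientation, a minimal (assumed) call site looks like the following; Ptr, DL and TLI are taken from the surrounding pass, with TLI a TargetLibraryInfo object:
// Sketch: query the statically known size of the object Ptr points into.
uint64_t Size;
if (getObjectSize(Ptr, Size, DL, &TLI))
  ; // Size now holds the byte size of the underlying object.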
-/// Try to turn a call to @llvm.objectsize into an integer value of the given
+/// Try to turn a call to \@llvm.objectsize into an integer value of the given
/// Type. Returns null on failure.
/// If MustSucceed is true, this function will not return null, and may return
/// conservative values governed by the second argument of the call to
@@ -189,7 +189,7 @@ ConstantInt *lowerObjectSizeCall(IntrinsicInst *ObjectSize,
using SizeOffsetType = std::pair<APInt, APInt>;
-/// \brief Evaluate the size and offset of an object pointed to by a Value*
+/// Evaluate the size and offset of an object pointed to by a Value*
/// statically. Fails if size or offset are not known at compile time.
class ObjectSizeOffsetVisitor
: public InstVisitor<ObjectSizeOffsetVisitor, SizeOffsetType> {
@@ -248,7 +248,7 @@ private:
using SizeOffsetEvalType = std::pair<Value *, Value *>;
-/// \brief Evaluate the size and offset of an object pointed to by a Value*.
+/// Evaluate the size and offset of an object pointed to by a Value*.
/// May create code to compute the result at run-time.
class ObjectSizeOffsetEvaluator
: public InstVisitor<ObjectSizeOffsetEvaluator, SizeOffsetEvalType> {
diff --git a/contrib/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h b/contrib/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h
index c2974525a6ff..1c6ec98dfedc 100644
--- a/contrib/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -26,6 +26,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
@@ -302,7 +303,7 @@ private:
/// The maximum size of the dereferences of the pointer.
///
/// May be UnknownSize if the sizes are unknown.
- uint64_t Size = MemoryLocation::UnknownSize;
+ LocationSize Size = MemoryLocation::UnknownSize;
/// The AA tags associated with dereferences of the pointer.
///
/// The members may be null if there are no tags or conflicting tags.
@@ -314,7 +315,10 @@ private:
/// Cache storing single nonlocal def for the instruction.
/// It is set when a nonlocal def would be found in a function returning only
/// local dependencies.
- DenseMap<Instruction *, NonLocalDepResult> NonLocalDefsCache;
+ DenseMap<AssertingVH<const Value>, NonLocalDepResult> NonLocalDefsCache;
+ using ReverseNonLocalDefsCacheTy =
+ DenseMap<Instruction *, SmallPtrSet<const Value*, 4>>;
+ ReverseNonLocalDefsCacheTy ReverseNonLocalDefsCache;
/// This map stores the cached results of doing a pointer lookup at the
/// bottom of a block.
diff --git a/contrib/llvm/include/llvm/Analysis/MemoryLocation.h b/contrib/llvm/include/llvm/Analysis/MemoryLocation.h
index c1080742e83a..6b680000312c 100644
--- a/contrib/llvm/include/llvm/Analysis/MemoryLocation.h
+++ b/contrib/llvm/include/llvm/Analysis/MemoryLocation.h
@@ -27,8 +27,16 @@ class LoadInst;
class StoreInst;
class MemTransferInst;
class MemIntrinsic;
+class AtomicMemTransferInst;
+class AtomicMemIntrinsic;
+class AnyMemTransferInst;
+class AnyMemIntrinsic;
class TargetLibraryInfo;
+// Represents the size of a MemoryLocation. Logically, it's an
+// Optional<uint64_t>, with a special UnknownSize value from `MemoryLocation`.
+using LocationSize = uint64_t;
+
/// Representation for a specific memory location.
///
/// This abstraction can be used to represent a specific location in memory.
@@ -55,7 +63,7 @@ public:
/// virtual address space, because there are restrictions on stepping out of
/// one object and into another. See
/// http://llvm.org/docs/LangRef.html#pointeraliasing
- uint64_t Size;
+ LocationSize Size;
/// The metadata nodes which describes the aliasing of the location (each
/// member is null if that kind of information is unavailable).
@@ -90,17 +98,21 @@ public:
/// Return a location representing the source of a memory transfer.
static MemoryLocation getForSource(const MemTransferInst *MTI);
+ static MemoryLocation getForSource(const AtomicMemTransferInst *MTI);
+ static MemoryLocation getForSource(const AnyMemTransferInst *MTI);
/// Return a location representing the destination of a memory set or
/// transfer.
static MemoryLocation getForDest(const MemIntrinsic *MI);
+ static MemoryLocation getForDest(const AtomicMemIntrinsic *MI);
+ static MemoryLocation getForDest(const AnyMemIntrinsic *MI);
/// Return a location representing a particular argument of a call.
static MemoryLocation getForArgument(ImmutableCallSite CS, unsigned ArgIdx,
const TargetLibraryInfo &TLI);
explicit MemoryLocation(const Value *Ptr = nullptr,
- uint64_t Size = UnknownSize,
+ LocationSize Size = UnknownSize,
const AAMDNodes &AATags = AAMDNodes())
: Ptr(Ptr), Size(Size), AATags(AATags) {}
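A small sketch of constructing locations with the new LocationSize typedef; PtrOperand is an assumed Value *:
  // Sketch: an 8-byte access at PtrOperand, and a copy with unknown extent.
  MemoryLocation Loc(PtrOperand, /*Size=*/8);
  MemoryLocation Widened = Loc.getWithNewSize(MemoryLocation::UnknownSize);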
@@ -110,7 +122,7 @@ public:
return Copy;
}
- MemoryLocation getWithNewSize(uint64_t NewSize) const {
+ MemoryLocation getWithNewSize(LocationSize NewSize) const {
MemoryLocation Copy(*this);
Copy.Size = NewSize;
return Copy;
@@ -137,7 +149,7 @@ template <> struct DenseMapInfo<MemoryLocation> {
}
static unsigned getHashValue(const MemoryLocation &Val) {
return DenseMapInfo<const Value *>::getHashValue(Val.Ptr) ^
- DenseMapInfo<uint64_t>::getHashValue(Val.Size) ^
+ DenseMapInfo<LocationSize>::getHashValue(Val.Size) ^
DenseMapInfo<AAMDNodes>::getHashValue(Val.AATags);
}
static bool isEqual(const MemoryLocation &LHS, const MemoryLocation &RHS) {
diff --git a/contrib/llvm/include/llvm/Analysis/MemorySSA.h b/contrib/llvm/include/llvm/Analysis/MemorySSA.h
index d19f08453ee6..d445e4430e5c 100644
--- a/contrib/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/contrib/llvm/include/llvm/Analysis/MemorySSA.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This file exposes an interface to building/using memory SSA to
+/// This file exposes an interface to building/using memory SSA to
/// walk memory instructions using a use/def graph.
///
/// Memory SSA class builds an SSA form that links together memory access
@@ -93,6 +93,7 @@
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
@@ -118,10 +119,10 @@ struct DefsOnlyTag {};
} // end namespace MSSAHelpers
-enum {
+enum : unsigned {
// Used to signify what the default invalid ID is for MemoryAccess's
// getID()
- INVALID_MEMORYACCESS_ID = 0
+ INVALID_MEMORYACCESS_ID = -1U
};
template <class T> class memoryaccess_def_iterator_base;
@@ -129,7 +130,7 @@ using memoryaccess_def_iterator = memoryaccess_def_iterator_base<MemoryAccess>;
using const_memoryaccess_def_iterator =
memoryaccess_def_iterator_base<const MemoryAccess>;
-// \brief The base for all memory accesses. All memory accesses in a block are
+// The base for all memory accesses. All memory accesses in a block are
// linked together using an intrusive list.
class MemoryAccess
: public DerivedUser,
@@ -158,11 +159,11 @@ public:
void print(raw_ostream &OS) const;
void dump() const;
- /// \brief The user iterators for a memory access
+ /// The user iterators for a memory access
using iterator = user_iterator;
using const_iterator = const_user_iterator;
- /// \brief This iterator walks over all of the defs in a given
+ /// This iterator walks over all of the defs in a given
/// MemoryAccess. For MemoryPhi nodes, this walks arguments. For
/// MemoryUse/MemoryDef, this walks the defining access.
memoryaccess_def_iterator defs_begin();
@@ -170,7 +171,7 @@ public:
memoryaccess_def_iterator defs_end();
const_memoryaccess_def_iterator defs_end() const;
- /// \brief Get the iterators for the all access list and the defs only list
+ /// Get the iterators for the all access list and the defs only list
/// We default to the all access list.
AllAccessType::self_iterator getIterator() {
return this->AllAccessType::getIterator();
@@ -204,11 +205,11 @@ protected:
friend class MemoryUse;
friend class MemoryUseOrDef;
- /// \brief Used by MemorySSA to change the block of a MemoryAccess when it is
+ /// Used by MemorySSA to change the block of a MemoryAccess when it is
/// moved.
void setBlock(BasicBlock *BB) { Block = BB; }
- /// \brief Used for debugging and tracking things about MemoryAccesses.
+ /// Used for debugging and tracking things about MemoryAccesses.
/// Guaranteed unique among MemoryAccesses, no guarantees otherwise.
inline unsigned getID() const;
@@ -217,16 +218,24 @@ protected:
: DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue),
Block(BB) {}
+ // Use deleteValue() to delete a generic MemoryAccess.
+ ~MemoryAccess() = default;
+
private:
BasicBlock *Block;
};
+template <>
+struct ilist_alloc_traits<MemoryAccess> {
+ static void deleteNode(MemoryAccess *MA) { MA->deleteValue(); }
+};
+
inline raw_ostream &operator<<(raw_ostream &OS, const MemoryAccess &MA) {
MA.print(OS);
return OS;
}
-/// \brief Class that has the common methods + fields of memory uses/defs. It's
+/// Class that has the common methods + fields of memory uses/defs. It's
/// a little awkward to have, but there are many cases where we want either a
/// use or def, and there are many cases where uses are needed (defs aren't
/// acceptable), and vice-versa.
@@ -239,10 +248,10 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess);
- /// \brief Get the instruction that this MemoryUse represents.
- Instruction *getMemoryInst() const { return MemoryInst; }
+ /// Get the instruction that this MemoryUse represents.
+ Instruction *getMemoryInst() const { return MemoryInstruction; }
- /// \brief Get the access that produces the memory state used by this Use.
+ /// Get the access that produces the memory state used by this Use.
MemoryAccess *getDefiningAccess() const { return getOperand(0); }
static bool classof(const Value *MA) {
@@ -255,7 +264,13 @@ public:
inline MemoryAccess *getOptimized() const;
inline void setOptimized(MemoryAccess *);
- /// \brief Reset the ID of what this MemoryUse was optimized to, causing it to
+ // Retrieve AliasResult type of the optimized access. Ideally this would be
+ // returned by the caching walker and may go away in the future.
+ Optional<AliasResult> getOptimizedAccessType() const {
+ return OptimizedAccessAlias;
+ }
+
+ /// Reset the ID of what this MemoryUse was optimized to, causing it to
/// be rewalked by the walker if necessary.
/// This really should only be called by tests.
inline void resetOptimized();
@@ -266,20 +281,31 @@ protected:
MemoryUseOrDef(LLVMContext &C, MemoryAccess *DMA, unsigned Vty,
DeleteValueTy DeleteValue, Instruction *MI, BasicBlock *BB)
- : MemoryAccess(C, Vty, DeleteValue, BB, 1), MemoryInst(MI) {
+ : MemoryAccess(C, Vty, DeleteValue, BB, 1), MemoryInstruction(MI),
+ OptimizedAccessAlias(MayAlias) {
setDefiningAccess(DMA);
}
- void setDefiningAccess(MemoryAccess *DMA, bool Optimized = false) {
+ // Use deleteValue() to delete a generic MemoryUseOrDef.
+ ~MemoryUseOrDef() = default;
+
+ void setOptimizedAccessType(Optional<AliasResult> AR) {
+ OptimizedAccessAlias = AR;
+ }
+
+ void setDefiningAccess(MemoryAccess *DMA, bool Optimized = false,
+ Optional<AliasResult> AR = MayAlias) {
if (!Optimized) {
setOperand(0, DMA);
return;
}
setOptimized(DMA);
+ setOptimizedAccessType(AR);
}
private:
- Instruction *MemoryInst;
+ Instruction *MemoryInstruction;
+ Optional<AliasResult> OptimizedAccessAlias;
};
template <>
@@ -287,7 +313,7 @@ struct OperandTraits<MemoryUseOrDef>
: public FixedNumOperandTraits<MemoryUseOrDef, 1> {};
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUseOrDef, MemoryAccess)
-/// \brief Represents read-only accesses to memory
+/// Represents read-only accesses to memory
///
/// In particular, the set of Instructions that will be represented by
/// MemoryUse's is exactly the set of Instructions for which
@@ -331,14 +357,14 @@ protected:
private:
static void deleteMe(DerivedUser *Self);
- unsigned int OptimizedID = 0;
+ unsigned OptimizedID = INVALID_MEMORYACCESS_ID;
};
template <>
struct OperandTraits<MemoryUse> : public FixedNumOperandTraits<MemoryUse, 1> {};
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUse, MemoryAccess)
-/// \brief Represents a read-write access to memory, whether it is a must-alias,
+/// Represents a read-write access to memory, whether it is a must-alias,
/// or a may-alias.
///
/// In particular, the set of Instructions that will be represented by
@@ -369,7 +395,9 @@ public:
OptimizedID = getDefiningAccess()->getID();
}
- MemoryAccess *getOptimized() const { return Optimized; }
+ MemoryAccess *getOptimized() const {
+ return cast_or_null<MemoryAccess>(Optimized);
+ }
bool isOptimized() const {
return getOptimized() && getDefiningAccess() &&
@@ -388,15 +416,15 @@ private:
static void deleteMe(DerivedUser *Self);
const unsigned ID;
- MemoryAccess *Optimized = nullptr;
- unsigned int OptimizedID = INVALID_MEMORYACCESS_ID;
+ unsigned OptimizedID = INVALID_MEMORYACCESS_ID;
+ WeakVH Optimized;
};
template <>
struct OperandTraits<MemoryDef> : public FixedNumOperandTraits<MemoryDef, 1> {};
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryDef, MemoryAccess)
-/// \brief Represents phi nodes for memory accesses.
+/// Represents phi nodes for memory accesses.
///
/// These have the same semantic as regular phi nodes, with the exception that
/// only one phi will ever exist in a given basic block.
@@ -476,10 +504,10 @@ public:
const_op_range incoming_values() const { return operands(); }
- /// \brief Return the number of incoming edges
+ /// Return the number of incoming edges
unsigned getNumIncomingValues() const { return getNumOperands(); }
- /// \brief Return incoming value number x
+ /// Return incoming value number x
MemoryAccess *getIncomingValue(unsigned I) const { return getOperand(I); }
void setIncomingValue(unsigned I, MemoryAccess *V) {
assert(V && "PHI node got a null value!");
@@ -489,17 +517,17 @@ public:
static unsigned getOperandNumForIncomingValue(unsigned I) { return I; }
static unsigned getIncomingValueNumForOperand(unsigned I) { return I; }
- /// \brief Return incoming basic block number @p i.
+ /// Return incoming basic block number @p i.
BasicBlock *getIncomingBlock(unsigned I) const { return block_begin()[I]; }
- /// \brief Return incoming basic block corresponding
+ /// Return incoming basic block corresponding
/// to an operand of the PHI.
BasicBlock *getIncomingBlock(const Use &U) const {
assert(this == U.getUser() && "Iterator doesn't point to PHI's Uses?");
return getIncomingBlock(unsigned(&U - op_begin()));
}
- /// \brief Return incoming basic block corresponding
+ /// Return incoming basic block corresponding
/// to value use iterator.
BasicBlock *getIncomingBlock(MemoryAccess::const_user_iterator I) const {
return getIncomingBlock(I.getUse());
@@ -510,7 +538,7 @@ public:
block_begin()[I] = BB;
}
- /// \brief Add an incoming value to the end of the PHI list
+ /// Add an incoming value to the end of the PHI list
void addIncoming(MemoryAccess *V, BasicBlock *BB) {
if (getNumOperands() == ReservedSpace)
growOperands(); // Get more space!
@@ -520,7 +548,7 @@ public:
setIncomingBlock(getNumOperands() - 1, BB);
}
- /// \brief Return the first index of the specified basic
+ /// Return the first index of the specified basic
/// block in the value list for this PHI. Returns -1 if no instance.
int getBasicBlockIndex(const BasicBlock *BB) const {
for (unsigned I = 0, E = getNumOperands(); I != E; ++I)
@@ -529,12 +557,53 @@ public:
return -1;
}
- Value *getIncomingValueForBlock(const BasicBlock *BB) const {
+ MemoryAccess *getIncomingValueForBlock(const BasicBlock *BB) const {
int Idx = getBasicBlockIndex(BB);
assert(Idx >= 0 && "Invalid basic block argument!");
return getIncomingValue(Idx);
}
+ // After deleting incoming position I, the order of incoming may be changed.
+ void unorderedDeleteIncoming(unsigned I) {
+ unsigned E = getNumOperands();
+ assert(I < E && "Cannot remove out of bounds Phi entry.");
+ // MemoryPhi must have at least two incoming values, otherwise the MemoryPhi
+ // itself should be deleted.
+ assert(E >= 2 && "Cannot only remove incoming values in MemoryPhis with "
+ "at least 2 values.");
+ setIncomingValue(I, getIncomingValue(E - 1));
+ setIncomingBlock(I, block_begin()[E - 1]);
+ setOperand(E - 1, nullptr);
+ block_begin()[E - 1] = nullptr;
+ setNumHungOffUseOperands(getNumOperands() - 1);
+ }
+
+ // After deleting entries that satisfy Pred, remaining entries may have
+ // changed order.
+ template <typename Fn> void unorderedDeleteIncomingIf(Fn &&Pred) {
+ for (unsigned I = 0, E = getNumOperands(); I != E; ++I)
+ if (Pred(getIncomingValue(I), getIncomingBlock(I))) {
+ unorderedDeleteIncoming(I);
+ E = getNumOperands();
+ --I;
+ }
+ assert(getNumOperands() >= 1 &&
+ "Cannot remove all incoming blocks in a MemoryPhi.");
+ }
+
+ // After deleting incoming block BB, the incoming blocks order may be changed.
+ void unorderedDeleteIncomingBlock(const BasicBlock *BB) {
+ unorderedDeleteIncomingIf(
+ [&](const MemoryAccess *, const BasicBlock *B) { return BB == B; });
+ }
+
+ // After deleting incoming memory access MA, the incoming accesses order may
+ // be changed.
+ void unorderedDeleteIncomingValue(const MemoryAccess *MA) {
+ unorderedDeleteIncomingIf(
+ [&](const MemoryAccess *M, const BasicBlock *) { return MA == M; });
+ }
+
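As a hedged sketch of the intended use, an updater that removes a CFG edge into the phi's block could drop the matching entry like this (MPhi and RemovedPred are assumed names):
  // Sketch: remove the incoming entry for a predecessor that went away.
  MPhi->unorderedDeleteIncomingBlock(RemovedPred);
  // Remaining entries may have been reordered; the phi must still keep at
  // least one incoming value, otherwise it should be erased instead.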
static bool classof(const Value *V) {
return V->getValueID() == MemoryPhiVal;
}
@@ -546,7 +615,7 @@ public:
protected:
friend class MemorySSA;
- /// \brief this is more complicated than the generic
+ /// This is more complicated than the generic
/// User::allocHungoffUses, because we have to allocate Uses for the incoming
/// values and pointers to the incoming blocks, all in one allocation.
void allocHungoffUses(unsigned N) {
@@ -558,7 +627,7 @@ private:
const unsigned ID;
unsigned ReservedSpace;
- /// \brief This grows the operand list in response to a push_back style of
+ /// This grows the operand list in response to a push_back style of
/// operation. This grows the number of ops by 1.5 times.
void growOperands() {
unsigned E = getNumOperands();
@@ -607,7 +676,7 @@ inline void MemoryUseOrDef::resetOptimized() {
template <> struct OperandTraits<MemoryPhi> : public HungoffOperandTraits<2> {};
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess)
-/// \brief Encapsulates MemorySSA, including all data associated with memory
+/// Encapsulates MemorySSA, including all data associated with memory
/// accesses.
class MemorySSA {
public:
@@ -616,7 +685,7 @@ public:
MemorySSAWalker *getWalker();
- /// \brief Given a memory Mod/Ref'ing instruction, get the MemorySSA
+ /// Given a memory Mod/Ref'ing instruction, get the MemorySSA
/// access associated with it. If passed a basic block gets the memory phi
/// node that exists for that block, if there is one. Otherwise, this will get
/// a MemoryUseOrDef.
@@ -626,7 +695,7 @@ public:
void dump() const;
void print(raw_ostream &) const;
- /// \brief Return true if \p MA represents the live on entry value
+ /// Return true if \p MA represents the live on entry value
///
/// Loads and stores from pointer arguments and other global values may be
/// defined by memory operations that do not occur in the current function, so
@@ -650,14 +719,14 @@ public:
using DefsList =
simple_ilist<MemoryAccess, ilist_tag<MSSAHelpers::DefsOnlyTag>>;
- /// \brief Return the list of MemoryAccess's for a given basic block.
+ /// Return the list of MemoryAccess's for a given basic block.
///
/// This list is not modifiable by the user.
const AccessList *getBlockAccesses(const BasicBlock *BB) const {
return getWritableBlockAccesses(BB);
}
- /// \brief Return the list of MemoryDef's and MemoryPhi's for a given basic
+ /// Return the list of MemoryDef's and MemoryPhi's for a given basic
/// block.
///
/// This list is not modifiable by the user.
@@ -665,19 +734,19 @@ public:
return getWritableBlockDefs(BB);
}
- /// \brief Given two memory accesses in the same basic block, determine
+ /// Given two memory accesses in the same basic block, determine
/// whether MemoryAccess \p A dominates MemoryAccess \p B.
bool locallyDominates(const MemoryAccess *A, const MemoryAccess *B) const;
- /// \brief Given two memory accesses in potentially different blocks,
+ /// Given two memory accesses in potentially different blocks,
/// determine whether MemoryAccess \p A dominates MemoryAccess \p B.
bool dominates(const MemoryAccess *A, const MemoryAccess *B) const;
- /// \brief Given a MemoryAccess and a Use, determine whether MemoryAccess \p A
+ /// Given a MemoryAccess and a Use, determine whether MemoryAccess \p A
/// dominates Use \p B.
bool dominates(const MemoryAccess *A, const Use &B) const;
- /// \brief Verify that MemorySSA is self consistent (IE definitions dominate
+ /// Verify that MemorySSA is self consistent (IE definitions dominate
/// all uses, uses appear in the right places). This is used by unit tests.
void verifyMemorySSA() const;
@@ -694,6 +763,7 @@ protected:
void verifyDefUses(Function &F) const;
void verifyDomination(Function &F) const;
void verifyOrdering(Function &F) const;
+ void verifyDominationNumbers(const Function &F) const;
// This is used by the use optimizer and updater.
AccessList *getWritableBlockAccesses(const BasicBlock *BB) const {
@@ -712,7 +782,7 @@ protected:
// relies on the updater to fixup what it breaks, so it is not public.
void moveTo(MemoryUseOrDef *What, BasicBlock *BB, AccessList::iterator Where);
- void moveTo(MemoryUseOrDef *What, BasicBlock *BB, InsertionPlace Point);
+ void moveTo(MemoryAccess *What, BasicBlock *BB, InsertionPlace Point);
// Rename the dominator tree branch rooted at BB.
void renamePass(BasicBlock *BB, MemoryAccess *IncomingVal,
@@ -748,8 +818,7 @@ private:
MemoryPhi *createMemoryPhi(BasicBlock *BB);
MemoryUseOrDef *createNewAccess(Instruction *);
MemoryAccess *findDominatingDef(BasicBlock *, enum InsertionPlace);
- void placePHINodes(const SmallPtrSetImpl<BasicBlock *> &,
- const DenseMap<const BasicBlock *, unsigned int> &);
+ void placePHINodes(const SmallPtrSetImpl<BasicBlock *> &);
MemoryAccess *renameBlock(BasicBlock *, MemoryAccess *, bool);
void renameSuccessorPhis(BasicBlock *, MemoryAccess *, bool);
void renamePass(DomTreeNode *, MemoryAccess *IncomingVal,
@@ -773,7 +842,7 @@ private:
// corresponding list is empty.
AccessMap PerBlockAccesses;
DefsMap PerBlockDefs;
- std::unique_ptr<MemoryAccess> LiveOnEntryDef;
+ std::unique_ptr<MemoryAccess, ValueDeleter> LiveOnEntryDef;
// Domination mappings
// Note that the numbering is local to a block, even though the map is
@@ -831,7 +900,7 @@ public:
Result run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Printer pass for \c MemorySSA.
+/// Printer pass for \c MemorySSA.
class MemorySSAPrinterPass : public PassInfoMixin<MemorySSAPrinterPass> {
raw_ostream &OS;
@@ -841,12 +910,12 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Verifier pass for \c MemorySSA.
+/// Verifier pass for \c MemorySSA.
struct MemorySSAVerifierPass : PassInfoMixin<MemorySSAVerifierPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Legacy analysis pass which computes \c MemorySSA.
+/// Legacy analysis pass which computes \c MemorySSA.
class MemorySSAWrapperPass : public FunctionPass {
public:
MemorySSAWrapperPass();
@@ -867,7 +936,7 @@ private:
std::unique_ptr<MemorySSA> MSSA;
};
-/// \brief This is the generic walker interface for walkers of MemorySSA.
+/// This is the generic walker interface for walkers of MemorySSA.
/// Walkers are used to be able to further disambiguate the def-use chains
/// MemorySSA gives you, or otherwise produce better info than MemorySSA gives
/// you.
@@ -885,7 +954,7 @@ public:
using MemoryAccessSet = SmallVector<MemoryAccess *, 8>;
- /// \brief Given a memory Mod/Ref/ModRef'ing instruction, calling this
+ /// Given a memory Mod/Ref/ModRef'ing instruction, calling this
/// will give you the nearest dominating MemoryAccess that Mod's the location
/// the instruction accesses (by skipping any def which AA can prove does not
/// alias the location(s) accessed by the instruction given).
@@ -917,7 +986,7 @@ public:
/// but takes a MemoryAccess instead of an Instruction.
virtual MemoryAccess *getClobberingMemoryAccess(MemoryAccess *) = 0;
- /// \brief Given a potentially clobbering memory access and a new location,
+ /// Given a potentially clobbering memory access and a new location,
/// calling this will give you the nearest dominating clobbering MemoryAccess
/// (by skipping non-aliasing def links).
///
@@ -931,7 +1000,7 @@ public:
virtual MemoryAccess *getClobberingMemoryAccess(MemoryAccess *,
const MemoryLocation &) = 0;
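As an illustration of the walker interface (the names `MSSA` and `LoadI` are assumptions, not part of the patch):

MemorySSAWalker *Walker = MSSA.getWalker();
MemoryAccess *Clobber = Walker->getClobberingMemoryAccess(&LoadI);
if (MSSA.isLiveOnEntryDef(Clobber)) {
  // No MemoryDef inside the function is known to clobber what LoadI reads.
}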
- /// \brief Given a memory access, invalidate anything this walker knows about
+ /// Given a memory access, invalidate anything this walker knows about
/// that access.
/// This API is used by walkers that store information to perform basic cache
/// invalidation. This will be called by MemorySSA at appropriate times for
@@ -946,7 +1015,7 @@ protected:
MemorySSA *MSSA;
};
-/// \brief A MemorySSAWalker that does no alias queries, or anything else. It
+/// A MemorySSAWalker that does no alias queries, or anything else. It
/// simply returns the links as they were constructed by the builder.
class DoNothingMemorySSAWalker final : public MemorySSAWalker {
public:
@@ -962,7 +1031,7 @@ public:
using MemoryAccessPair = std::pair<MemoryAccess *, MemoryLocation>;
using ConstMemoryAccessPair = std::pair<const MemoryAccess *, MemoryLocation>;
-/// \brief Iterator base class used to implement const and non-const iterators
+/// Iterator base class used to implement const and non-const iterators
/// over the defining accesses of a MemoryAccess.
template <class T>
class memoryaccess_def_iterator_base
@@ -1035,7 +1104,7 @@ inline const_memoryaccess_def_iterator MemoryAccess::defs_end() const {
return const_memoryaccess_def_iterator();
}
-/// \brief GraphTraits for a MemoryAccess, which walks defs in the normal case,
+/// GraphTraits for a MemoryAccess, which walks defs in the normal case,
/// and uses in the inverse case.
template <> struct GraphTraits<MemoryAccess *> {
using NodeRef = MemoryAccess *;
@@ -1055,7 +1124,7 @@ template <> struct GraphTraits<Inverse<MemoryAccess *>> {
static ChildIteratorType child_end(NodeRef N) { return N->user_end(); }
};
-/// \brief Provide an iterator that walks defs, giving both the memory access,
+/// Provide an iterator that walks defs, giving both the memory access,
/// and the current pointer location, updating the pointer location as it
/// changes due to phi node translation.
///
diff --git a/contrib/llvm/include/llvm/Analysis/MemorySSAUpdater.h b/contrib/llvm/include/llvm/Analysis/MemorySSAUpdater.h
index b36b2f01dac6..38f08c1eebdc 100644
--- a/contrib/llvm/include/llvm/Analysis/MemorySSAUpdater.h
+++ b/contrib/llvm/include/llvm/Analysis/MemorySSAUpdater.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief An automatic updater for MemorySSA that handles arbitrary insertion,
+// An automatic updater for MemorySSA that handles arbitrary insertion,
// deletion, and moves. It performs phi insertion where necessary, and
// automatically updates the MemorySSA IR to be correct.
// While updating loads or removing instructions is often easy enough to not
@@ -33,6 +33,7 @@
#define LLVM_ANALYSIS_MEMORYSSAUPDATER_H
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/BasicBlock.h"
@@ -43,6 +44,7 @@
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
@@ -58,8 +60,13 @@ class raw_ostream;
class MemorySSAUpdater {
private:
MemorySSA *MSSA;
- SmallVector<MemoryPhi *, 8> InsertedPHIs;
+
+ /// We use WeakVH rather than a costly deletion to deal with dangling pointers.
+ /// MemoryPhis are created eagerly and sometimes get zapped shortly afterwards.
+ SmallVector<WeakVH, 16> InsertedPHIs;
+
SmallPtrSet<BasicBlock *, 8> VisitedBlocks;
+ SmallSet<AssertingVH<MemoryPhi>, 8> NonOptPhis;
public:
MemorySSAUpdater(MemorySSA *MSSA) : MSSA(MSSA) {}
@@ -86,6 +93,45 @@ public:
void moveAfter(MemoryUseOrDef *What, MemoryUseOrDef *Where);
void moveToPlace(MemoryUseOrDef *What, BasicBlock *BB,
MemorySSA::InsertionPlace Where);
+ /// The original block was spliced into `From` and `To`.
+ /// Move all accesses from `From` to `To` starting at instruction `Start`.
+ /// `To` is a newly created block, so it is empty of MemorySSA::MemoryAccesses.
+ /// CFG edges are already updated, so successors of `To` that have MemoryPhi
+ /// nodes need their incoming block updated.

+ /// |------| |------|
+ /// | From | | From |
+ /// | | |------|
+ /// | | ||
+ /// | | => \/
+ /// | | |------| <- Start
+ /// | | | To |
+ /// |------| |------|
+ void moveAllAfterSpliceBlocks(BasicBlock *From, BasicBlock *To,
+ Instruction *Start);
+ /// `From` block was merged into `To`. All instructions were moved and
+ /// `From` is an empty block with successor edges; `From` is about to be
+ /// deleted. Move all accesses from `From` to `To` starting at instruction
+ /// `Start`. `To` may have multiple successors, `From` has a single
+ /// predecessor. `From` may have successors with MPhi nodes, replace their
+ /// incoming block with `To`.
+ /// |------| |------|
+ /// | To | | To |
+ /// |------| | |
+ /// || => | |
+ /// \/ | |
+ /// |------| | | <- Start
+ /// | From | | |
+ /// |------| |------|
+ void moveAllAfterMergeBlocks(BasicBlock *From, BasicBlock *To,
+ Instruction *Start);
+ /// BasicBlock Old had New, an empty BasicBlock, added directly before it,
+ /// and the predecessors in Preds that used to point to Old, now point to
+ /// New. If New is the only predecessor, move Old's Phi, if present, to New.
+ /// Otherwise, add a new Phi in New with appropriate incoming values, and
+ /// update the incoming values in Old's Phi node too, if present.
+ void
+ wireOldPredecessorsToNewImmediatePredecessor(BasicBlock *Old, BasicBlock *New,
+ ArrayRef<BasicBlock *> Preds);
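A hedged sketch of how the splice helper might be driven; it assumes `From`, `SplitPt` and `MSSA` exist in the caller and that splitBasicBlock is the mechanism used to create `To`:

// Everything from SplitPt onwards moves into a fresh block; splitBasicBlock
// also rewires the CFG edges before MemorySSA is updated.
BasicBlock *To = From->splitBasicBlock(SplitPt, From->getName() + ".split");
MemorySSAUpdater Updater(&MSSA);
Updater.moveAllAfterSpliceBlocks(From, To, SplitPt);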
// The below are utility functions. Other than creation of accesses to pass
// to insertDef, and removeAccess to remove accesses, you should generally
@@ -93,7 +139,7 @@ public:
// the edge cases right, and the above calls already operate in near-optimal
// time bounds.
- /// \brief Create a MemoryAccess in MemorySSA at a specified point in a block,
+ /// Create a MemoryAccess in MemorySSA at a specified point in a block,
/// with a specified clobbering definition.
///
/// Returns the new MemoryAccess.
@@ -110,7 +156,7 @@ public:
const BasicBlock *BB,
MemorySSA::InsertionPlace Point);
- /// \brief Create a MemoryAccess in MemorySSA before or after an existing
+ /// Create a MemoryAccess in MemorySSA before or after an existing
/// MemoryAccess.
///
/// Returns the new MemoryAccess.
@@ -127,7 +173,7 @@ public:
MemoryAccess *Definition,
MemoryAccess *InsertPt);
- /// \brief Remove a MemoryAccess from MemorySSA, including updating all
+ /// Remove a MemoryAccess from MemorySSA, including updating all
/// definitions and uses.
/// This should be called when a memory instruction that has a MemoryAccess
/// associated with it is erased from the program. For example, if a store or
@@ -135,18 +181,45 @@ public:
/// on the MemoryAccess for that store/load.
void removeMemoryAccess(MemoryAccess *);
+ /// Remove MemoryAccess for a given instruction, if a MemoryAccess exists.
+ /// This should be called when an instruction (load/store) is deleted from
+ /// the program.
+ void removeMemoryAccess(const Instruction *I) {
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(I))
+ removeMemoryAccess(MA);
+ }
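Typical call pattern for this convenience overload (sketch only; `DeadStore` is an assumed dead instruction):

MemorySSAUpdater Updater(&MSSA);
Updater.removeMemoryAccess(DeadStore); // drop the MemoryDef and fix its uses
DeadStore->eraseFromParent();          // only then erase the instruction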
+
+ /// Remove all MemoryAccesses in a set of BasicBlocks about to be deleted.
+ /// The assumption we make here: all uses of deleted defs and phis must either
+ /// occur in blocks about to be deleted (and thus will be deleted as well), or
+ /// they occur in phis that will simply lose an incoming value.
+ /// Deleted blocks still have successor info, but their predecessor edges and
+ /// Phi nodes may already be updated. Instructions in DeadBlocks should be
+ /// deleted after this call.
+ void removeBlocks(const SmallPtrSetImpl<BasicBlock *> &DeadBlocks);
+
+ /// Get a handle on the underlying MemorySSA.
+ MemorySSA* getMemorySSA() const { return MSSA; }
+
private:
// Move What before Where in the MemorySSA IR.
template <class WhereType>
void moveTo(MemoryUseOrDef *What, BasicBlock *BB, WhereType Where);
+ // Move all memory accesses from `From` to `To` starting at `Start`.
+ // Restrictions apply, see public wrappers of this method.
+ void moveAllAccesses(BasicBlock *From, BasicBlock *To, Instruction *Start);
MemoryAccess *getPreviousDef(MemoryAccess *);
MemoryAccess *getPreviousDefInBlock(MemoryAccess *);
- MemoryAccess *getPreviousDefFromEnd(BasicBlock *);
- MemoryAccess *getPreviousDefRecursive(BasicBlock *);
+ MemoryAccess *
+ getPreviousDefFromEnd(BasicBlock *,
+ DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> &);
+ MemoryAccess *
+ getPreviousDefRecursive(BasicBlock *,
+ DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> &);
MemoryAccess *recursePhi(MemoryAccess *Phi);
template <class RangeType>
MemoryAccess *tryRemoveTrivialPhi(MemoryPhi *Phi, RangeType &Operands);
- void fixupDefs(const SmallVectorImpl<MemoryAccess *> &);
+ void fixupDefs(const SmallVectorImpl<WeakVH> &);
};
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/Analysis/MustExecute.h b/contrib/llvm/include/llvm/Analysis/MustExecute.h
new file mode 100644
index 000000000000..8daf156567cd
--- /dev/null
+++ b/contrib/llvm/include/llvm/Analysis/MustExecute.h
@@ -0,0 +1,64 @@
+//===- MustExecute.h - Is an instruction known to execute--------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Contains a collection of routines for determining if a given instruction is
+/// guaranteed to execute if a given point in control flow is reached. The most
+/// common example is an instruction within a loop being provably executed if we
+/// branch to the header of its containing loop.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_MUSTEXECUTE_H
+#define LLVM_ANALYSIS_MUSTEXECUTE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+
+namespace llvm {
+
+class Instruction;
+class DominatorTree;
+class Loop;
+
+/// Captures loop safety information.
+/// It keeps information about whether the loop or its header may throw an
+/// exception or otherwise exit abnormally on any iteration that might actually
+/// execute at runtime. The primary way to consume this information is via
+/// isGuaranteedToExecute below, but some callers bail out or fall back to
+/// alternate reasoning if a loop contains any implicit control flow.
+struct LoopSafetyInfo {
+ bool MayThrow = false; // The current loop contains an instruction which
+ // may throw.
+ bool HeaderMayThrow = false; // Same as previous, but specific to loop header
+ // Used to update funclet bundle operands.
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+
+ LoopSafetyInfo() = default;
+};
+
+/// Computes safety information for a loop, checking the loop body and header
+/// for the possibility that an instruction may throw. It takes a LoopSafetyInfo
+/// and a loop as arguments and updates the safety information in the
+/// LoopSafetyInfo argument.
+/// Note: This is defined to clear and reinitialize an already initialized
+/// LoopSafetyInfo. Some callers rely on this fact.
+void computeLoopSafetyInfo(LoopSafetyInfo *, Loop *);
+
+/// Returns true if the instruction in a loop is guaranteed to execute at least
+/// once (under the assumption that the loop is entered).
+bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo);
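A small sketch of the intended call sequence, assuming `CurLoop`, `DT` and `Inst` come from the caller:

LoopSafetyInfo SafetyInfo;
computeLoopSafetyInfo(&SafetyInfo, CurLoop);
if (isGuaranteedToExecute(*Inst, &DT, CurLoop, &SafetyInfo)) {
  // Inst runs whenever the loop is entered, so transforms such as hoisting
  // may rely on its effects being observed on every executed iteration.
}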
+
+}
+
+#endif
diff --git a/contrib/llvm/include/llvm/Analysis/ObjCARCAliasAnalysis.h b/contrib/llvm/include/llvm/Analysis/ObjCARCAliasAnalysis.h
index db524ff64ecd..559c77c30811 100644
--- a/contrib/llvm/include/llvm/Analysis/ObjCARCAliasAnalysis.h
+++ b/contrib/llvm/include/llvm/Analysis/ObjCARCAliasAnalysis.h
@@ -29,7 +29,7 @@
namespace llvm {
namespace objcarc {
-/// \brief This is a simple alias analysis implementation that uses knowledge
+/// This is a simple alias analysis implementation that uses knowledge
/// of ARC constructs to answer queries.
///
/// TODO: This class could be generalized to know about other ObjC-specific
diff --git a/contrib/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h b/contrib/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
index e80412a30564..07beb0bb60a3 100644
--- a/contrib/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
+++ b/contrib/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
@@ -34,6 +34,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
namespace llvm {
@@ -43,10 +44,10 @@ class raw_ostream;
namespace llvm {
namespace objcarc {
-/// \brief A handy option to enable/disable all ARC Optimizations.
+/// A handy option to enable/disable all ARC Optimizations.
extern bool EnableARCOpts;
-/// \brief Test if the given module looks interesting to run ARC optimization
+/// Test if the given module looks interesting to run ARC optimization
/// on.
inline bool ModuleHasARC(const Module &M) {
return
@@ -71,7 +72,7 @@ inline bool ModuleHasARC(const Module &M) {
M.getNamedValue("clang.arc.use");
}
-/// \brief This is a wrapper around getUnderlyingObject which also knows how to
+/// This is a wrapper around getUnderlyingObject which also knows how to
/// look through objc_retain and objc_autorelease calls, which we know to return
/// their argument verbatim.
inline const Value *GetUnderlyingObjCPtr(const Value *V,
@@ -86,6 +87,18 @@ inline const Value *GetUnderlyingObjCPtr(const Value *V,
return V;
}
+/// A wrapper around GetUnderlyingObjCPtr that memoizes its results.
+inline const Value *
+GetUnderlyingObjCPtrCached(const Value *V, const DataLayout &DL,
+ DenseMap<const Value *, WeakTrackingVH> &Cache) {
+ if (auto InCache = Cache.lookup(V))
+ return InCache;
+
+ const Value *Computed = GetUnderlyingObjCPtr(V, DL);
+ Cache[V] = const_cast<Value *>(Computed);
+ return Computed;
+}
+
/// The RCIdentity root of a value \p V is a dominating value U for which
/// retaining or releasing U is equivalent to retaining or releasing V. In other
/// words, ARC operations on \p V are equivalent to ARC operations on \p U.
@@ -119,7 +132,7 @@ inline Value *GetRCIdentityRoot(Value *V) {
return const_cast<Value *>(GetRCIdentityRoot((const Value *)V));
}
-/// \brief Assuming the given instruction is one of the special calls such as
+/// Assuming the given instruction is one of the special calls such as
/// objc_retain or objc_release, return the RCIdentity root of the argument of
/// the call.
inline Value *GetArgRCIdentityRoot(Value *Inst) {
@@ -136,7 +149,7 @@ inline bool IsNoopInstruction(const Instruction *I) {
cast<GetElementPtrInst>(I)->hasAllZeroIndices());
}
-/// \brief Test whether the given value is possible a retainable object pointer.
+/// Test whether the given value is possibly a retainable object pointer.
inline bool IsPotentialRetainableObjPtr(const Value *Op) {
// Pointers to static or stack storage are not valid retainable object
// pointers.
@@ -181,7 +194,7 @@ inline bool IsPotentialRetainableObjPtr(const Value *Op,
return true;
}
-/// \brief Helper for GetARCInstKind. Determines what kind of construct CS
+/// Helper for GetARCInstKind. Determines what kind of construct CS
/// is.
inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) {
for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
@@ -192,7 +205,7 @@ inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) {
return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call;
}
-/// \brief Return true if this value refers to a distinct and identifiable
+/// Return true if this value refers to a distinct and identifiable
/// object.
///
/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses
diff --git a/contrib/llvm/include/llvm/Analysis/ObjCARCInstKind.h b/contrib/llvm/include/llvm/Analysis/ObjCARCInstKind.h
index 02ff03578238..0b92d8b48356 100644
--- a/contrib/llvm/include/llvm/Analysis/ObjCARCInstKind.h
+++ b/contrib/llvm/include/llvm/Analysis/ObjCARCInstKind.h
@@ -18,7 +18,7 @@ namespace objcarc {
/// \enum ARCInstKind
///
-/// \brief Equivalence classes of instructions in the ARC Model.
+/// Equivalence classes of instructions in the ARC Model.
///
/// Since we do not have "instructions" to represent ARC concepts in LLVM IR,
/// we instead operate on equivalence classes of instructions.
@@ -57,32 +57,32 @@ enum class ARCInstKind {
raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class);
-/// \brief Test if the given class is a kind of user.
+/// Test if the given class is a kind of user.
bool IsUser(ARCInstKind Class);
-/// \brief Test if the given class is objc_retain or equivalent.
+/// Test if the given class is objc_retain or equivalent.
bool IsRetain(ARCInstKind Class);
-/// \brief Test if the given class is objc_autorelease or equivalent.
+/// Test if the given class is objc_autorelease or equivalent.
bool IsAutorelease(ARCInstKind Class);
-/// \brief Test if the given class represents instructions which return their
+/// Test if the given class represents instructions which return their
/// argument verbatim.
bool IsForwarding(ARCInstKind Class);
-/// \brief Test if the given class represents instructions which do nothing if
+/// Test if the given class represents instructions which do nothing if
/// passed a null pointer.
bool IsNoopOnNull(ARCInstKind Class);
-/// \brief Test if the given class represents instructions which are always safe
+/// Test if the given class represents instructions which are always safe
/// to mark with the "tail" keyword.
bool IsAlwaysTail(ARCInstKind Class);
-/// \brief Test if the given class represents instructions which are never safe
+/// Test if the given class represents instructions which are never safe
/// to mark with the "tail" keyword.
bool IsNeverTail(ARCInstKind Class);
-/// \brief Test if the given class represents instructions which are always safe
+/// Test if the given class represents instructions which are always safe
/// to mark with the nounwind attribute.
bool IsNoThrow(ARCInstKind Class);
@@ -90,11 +90,11 @@ bool IsNoThrow(ARCInstKind Class);
/// autoreleasepool pop.
bool CanInterruptRV(ARCInstKind Class);
-/// \brief Determine if F is one of the special known Functions. If it isn't,
+/// Determine if F is one of the special known Functions. If it isn't,
/// return ARCInstKind::CallOrUser.
ARCInstKind GetFunctionClass(const Function *F);
-/// \brief Determine which objc runtime call instruction class V belongs to.
+/// Determine which objc runtime call instruction class V belongs to.
///
/// This is similar to GetARCInstKind except that it only detects objc
/// runtime calls. This allows it to be faster.
diff --git a/contrib/llvm/include/llvm/Analysis/ObjectUtils.h b/contrib/llvm/include/llvm/Analysis/ObjectUtils.h
deleted file mode 100644
index 2ad3b1717009..000000000000
--- a/contrib/llvm/include/llvm/Analysis/ObjectUtils.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//===- Analysis/ObjectUtils.h - analysis utils for object files -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_OBJECT_UTILS_H
-#define LLVM_ANALYSIS_OBJECT_UTILS_H
-
-#include "llvm/IR/GlobalVariable.h"
-
-namespace llvm {
-
-/// True if GV can be left out of the object symbol table. This is the case
-/// for linkonce_odr values whose address is not significant. While legal, it is
-/// not normally profitable to omit them from the .o symbol table. Using this
-/// analysis makes sense when the information can be passed down to the linker
-/// or we are in LTO.
-inline bool canBeOmittedFromSymbolTable(const GlobalValue *GV) {
- if (!GV->hasLinkOnceODRLinkage())
- return false;
-
- // We assume that anyone who sets global unnamed_addr on a non-constant knows
- // what they're doing.
- if (GV->hasGlobalUnnamedAddr())
- return true;
-
- // If it is a non constant variable, it needs to be uniqued across shared
- // objects.
- if (auto *Var = dyn_cast<GlobalVariable>(GV))
- if (!Var->isConstant())
- return false;
-
- return GV->hasAtLeastLocalUnnamedAddr();
-}
-
-}
-
-#endif
diff --git a/contrib/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h b/contrib/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h
index 26f32acdcda5..fa838696e2f8 100644
--- a/contrib/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h
+++ b/contrib/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h
@@ -40,7 +40,7 @@ public:
OptimizationRemarkEmitter(const Function *F, BlockFrequencyInfo *BFI)
: F(F), BFI(BFI) {}
- /// \brief This variant can be used to generate ORE on demand (without the
+ /// This variant can be used to generate ORE on demand (without the
/// analysis pass).
///
/// Note that this ctor has a very different cost depending on whether
@@ -66,11 +66,11 @@ public:
bool invalidate(Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv);
- /// \brief Output the remark via the diagnostic handler and to the
+ /// Output the remark via the diagnostic handler and to the
/// optimization record file.
void emit(DiagnosticInfoOptimizationBase &OptDiag);
- /// \brief Take a lambda that returns a remark which will be emitted. Second
+ /// Take a lambda that returns a remark which will be emitted. Second
/// argument is only used to restrict this to functions.
template <typename T>
void emit(T RemarkBuilder, decltype(RemarkBuilder()) * = nullptr) {
@@ -85,7 +85,7 @@ public:
}
}
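For illustration, the lambda form keeps remark construction off the hot path when remarks are disabled; the pass name and remark text below are made up:

ORE.emit([&]() {
  return OptimizationRemarkMissed("my-pass", "NotVectorized", &I)
         << "loop not vectorized: " << ore::NV("Reason", "unsafe dependence");
});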
- /// \brief Whether we allow for extra compile-time budget to perform more
+ /// Whether we allow for extra compile-time budget to perform more
/// analysis to produce fewer false positives.
///
/// This is useful when reporting missed optimizations. In this case we can
@@ -112,7 +112,7 @@ private:
/// Similar but use value from \p OptDiag and update hotness there.
void computeHotness(DiagnosticInfoIROptimization &OptDiag);
- /// \brief Only allow verbose messages if we know we're filtering by hotness
+ /// Only allow verbose messages if we know we're filtering by hotness
/// (BFI is only set in this case).
bool shouldEmitVerbose() { return BFI != nullptr; }
@@ -120,7 +120,7 @@ private:
void operator=(const OptimizationRemarkEmitter &) = delete;
};
-/// \brief Add a small namespace to avoid name clashes with the classes used in
+/// Add a small namespace to avoid name clashes with the classes used in
/// the streaming interface. We want these to be short for better
/// write/readability.
namespace ore {
@@ -158,10 +158,10 @@ class OptimizationRemarkEmitterAnalysis
static AnalysisKey Key;
public:
- /// \brief Provide the result typedef for this analysis pass.
+ /// Provide the result typedef for this analysis pass.
typedef OptimizationRemarkEmitter Result;
- /// \brief Run the analysis pass over a function and produce BFI.
+ /// Run the analysis pass over a function and produce an OptimizationRemarkEmitter.
Result run(Function &F, FunctionAnalysisManager &AM);
};
}
diff --git a/contrib/llvm/include/llvm/Analysis/OrderedBasicBlock.h b/contrib/llvm/include/llvm/Analysis/OrderedBasicBlock.h
index 2e716af1f60d..0776aa626005 100644
--- a/contrib/llvm/include/llvm/Analysis/OrderedBasicBlock.h
+++ b/contrib/llvm/include/llvm/Analysis/OrderedBasicBlock.h
@@ -33,28 +33,28 @@ class BasicBlock;
class OrderedBasicBlock {
private:
- /// \brief Map a instruction to its position in a BasicBlock.
+ /// Map an instruction to its position in a BasicBlock.
SmallDenseMap<const Instruction *, unsigned, 32> NumberedInsts;
- /// \brief Keep track of last instruction inserted into \p NumberedInsts.
+ /// Keep track of last instruction inserted into \p NumberedInsts.
/// It speeds up queries for uncached instructions by providing a start point
/// for new queries in OrderedBasicBlock::comesBefore.
BasicBlock::const_iterator LastInstFound;
- /// \brief The position/number to tag the next instruction to be found.
+ /// The position/number to tag the next instruction to be found.
unsigned NextInstPos;
- /// \brief The source BasicBlock to map.
+ /// The source BasicBlock to map.
const BasicBlock *BB;
- /// \brief Given no cached results, find if \p A comes before \p B in \p BB.
+ /// Given no cached results, find if \p A comes before \p B in \p BB.
/// Cache and number instructions while walking \p BB.
bool comesBefore(const Instruction *A, const Instruction *B);
public:
OrderedBasicBlock(const BasicBlock *BasicB);
- /// \brief Find out whether \p A dominates \p B, meaning whether \p A
+ /// Find out whether \p A dominates \p B, meaning whether \p A
/// comes before \p B in \p BB. This is a simplification that considers
/// cached instruction positions and ignores other basic blocks, being
/// only relevant to compare relative instruction positions inside \p BB.
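An illustrative query, with `BB`, `A` and `B` assumed to be a block and two of its instructions:

OrderedBasicBlock OBB(BB);
if (OBB.dominates(A, B)) {
  // A appears before B in BB; repeated queries reuse the cached numbering.
}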
diff --git a/contrib/llvm/include/llvm/Analysis/PHITransAddr.h b/contrib/llvm/include/llvm/Analysis/PHITransAddr.h
index f0f34f3a51f5..0a335b6be6c7 100644
--- a/contrib/llvm/include/llvm/Analysis/PHITransAddr.h
+++ b/contrib/llvm/include/llvm/Analysis/PHITransAddr.h
@@ -43,7 +43,7 @@ class PHITransAddr {
/// TLI - The target library info if known, otherwise null.
const TargetLibraryInfo *TLI;
- /// A cache of @llvm.assume calls used by SimplifyInstruction.
+ /// A cache of \@llvm.assume calls used by SimplifyInstruction.
AssumptionCache *AC;
/// InstInputs - The inputs for our symbolic address.
diff --git a/contrib/llvm/include/llvm/Analysis/Passes.h b/contrib/llvm/include/llvm/Analysis/Passes.h
index 6d8f14fa32f9..09b28a0b0884 100644
--- a/contrib/llvm/include/llvm/Analysis/Passes.h
+++ b/contrib/llvm/include/llvm/Analysis/Passes.h
@@ -96,6 +96,14 @@ namespace llvm {
//
FunctionPass *createMemDerefPrinter();
+ //===--------------------------------------------------------------------===//
+ //
+ // createMustExecutePrinter - This pass collects information about which
+ // instructions within a loop are guaranteed to execute if the loop header is
+ // entered and prints it with -analyze.
+ //
+ FunctionPass *createMustExecutePrinter();
+
}
#endif
diff --git a/contrib/llvm/include/llvm/Analysis/PhiValues.h b/contrib/llvm/include/llvm/Analysis/PhiValues.h
new file mode 100644
index 000000000000..6607b329c04f
--- /dev/null
+++ b/contrib/llvm/include/llvm/Analysis/PhiValues.h
@@ -0,0 +1,143 @@
+//===- PhiValues.h - Phi Value Analysis -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PhiValues class, and associated passes, which can be
+// used to find the underlying values of the phis in a function, i.e. the
+// non-phi values that can be found by traversing the phi graph.
+//
+// This information is computed lazily and cached. If new phis are added to the
+// function they are handled correctly, but if an existing phi has its operands
+// modified PhiValues has to be notified by calling invalidateValue.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PHIVALUES_H
+#define LLVM_ANALYSIS_PHIVALUES_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class Use;
+class Value;
+class PHINode;
+class Function;
+
+/// Class for calculating and caching the underlying values of phis in a
+/// function.
+///
+/// Initially the PhiValues is empty, and gets incrementally populated whenever
+/// it is queried.
+class PhiValues {
+public:
+ using ValueSet = SmallPtrSet<Value *, 4>;
+
+ /// Construct an empty PhiValues.
+ PhiValues(const Function &F) : F(F) {}
+
+ /// Get the underlying values of a phi.
+ ///
+ /// This returns the cached value if PN has previously been processed,
+ /// otherwise it processes it first.
+ const ValueSet &getValuesForPhi(const PHINode *PN);
+
+ /// Notify PhiValues that the cached information using V is no longer valid.
+ ///
+ /// Whenever a phi has its operands modified the cached values for that phi
+ /// (and the phis that use that phi) become invalid. A user of PhiValues has
+ /// to notify it of this by calling invalidateValue on either the operand or
+ /// the phi, which will then clear the relevant cached information.
+ void invalidateValue(const Value *V);
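A usage sketch under the assumption that `F`, `PN` and `NewV` are supplied by the caller:

PhiValues PV(F);
for (Value *V : PV.getValuesForPhi(PN)) {
  // V is a non-phi value reachable through PN's transitive incoming values.
}
PN->setIncomingValue(0, NewV); // after rewriting an operand...
PV.invalidateValue(PN);        // ...drop the now-stale cached sets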
+
+ /// Free the memory used by this class.
+ void releaseMemory();
+
+ /// Print out the values currently in the cache.
+ void print(raw_ostream &OS) const;
+
+ /// Handle invalidation events in the new pass manager.
+ bool invalidate(Function &, const PreservedAnalyses &,
+ FunctionAnalysisManager::Invalidator &);
+
+private:
+ using PhiSet = SmallPtrSet<const PHINode *, 4>;
+ using ConstValueSet = SmallPtrSet<const Value *, 4>;
+
+ /// The next depth number to be used by processPhi.
+ unsigned int NextDepthNumber = 1;
+
+ /// Depth numbers of phis. Phis with the same depth number are part of the
+ /// same strongly connected component.
+ DenseMap<const PHINode *, unsigned int> DepthMap;
+
+ /// Non-phi values reachable from each component.
+ DenseMap<unsigned int, ValueSet> NonPhiReachableMap;
+
+ /// All values reachable from each component.
+ DenseMap<unsigned int, ConstValueSet> ReachableMap;
+
+ /// The function that the PhiValues is for.
+ const Function &F;
+
+ /// Process a phi so that its entries in the depth and reachable maps are
+ /// fully populated.
+ void processPhi(const PHINode *PN, SmallVector<const PHINode *, 8> &Stack);
+};
+
+/// The analysis pass which yields a PhiValues
+///
+/// The analysis does nothing by itself, and just returns an empty PhiValues
+/// which will get filled in as it's used.
+class PhiValuesAnalysis : public AnalysisInfoMixin<PhiValuesAnalysis> {
+ friend AnalysisInfoMixin<PhiValuesAnalysis>;
+ static AnalysisKey Key;
+
+public:
+ using Result = PhiValues;
+ PhiValues run(Function &F, FunctionAnalysisManager &);
+};
+
+/// A pass for printing the PhiValues for a function.
+///
+/// This pass doesn't print whatever information the PhiValues happens to hold,
+/// but instead first uses the PhiValues to analyze all the phis in the function
+/// so the complete information is printed.
+class PhiValuesPrinterPass : public PassInfoMixin<PhiValuesPrinterPass> {
+ raw_ostream &OS;
+
+public:
+ explicit PhiValuesPrinterPass(raw_ostream &OS) : OS(OS) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Wrapper pass for the legacy pass manager
+class PhiValuesWrapperPass : public FunctionPass {
+ std::unique_ptr<PhiValues> Result;
+
+public:
+ static char ID;
+ PhiValuesWrapperPass();
+
+ PhiValues &getResult() { return *Result; }
+ const PhiValues &getResult() const { return *Result; }
+
+ bool runOnFunction(Function &F) override;
+ void releaseMemory() override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/include/llvm/Analysis/PostDominators.h b/contrib/llvm/include/llvm/Analysis/PostDominators.h
index 381e65539c4e..f2dc8d135d71 100644
--- a/contrib/llvm/include/llvm/Analysis/PostDominators.h
+++ b/contrib/llvm/include/llvm/Analysis/PostDominators.h
@@ -26,15 +26,18 @@ class raw_ostream;
/// PostDominatorTree Class - Concrete subclass of DominatorTree that is used to
/// compute the post-dominator tree.
-struct PostDominatorTree : public PostDomTreeBase<BasicBlock> {
+class PostDominatorTree : public PostDomTreeBase<BasicBlock> {
+public:
using Base = PostDomTreeBase<BasicBlock>;
+ PostDominatorTree() = default;
+ explicit PostDominatorTree(Function &F) { recalculate(F); }
/// Handle invalidation explicitly.
bool invalidate(Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &);
};
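A short sketch of the new stand-alone construction path (`F`, `BB` and `ExitingBB` are assumptions):

PostDominatorTree PDT(F); // uses the explicit constructor added here
if (PDT.dominates(ExitingBB, BB)) {
  // Every path from BB to the function's exit passes through ExitingBB.
}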
-/// \brief Analysis pass which computes a \c PostDominatorTree.
+/// Analysis pass which computes a \c PostDominatorTree.
class PostDominatorTreeAnalysis
: public AnalysisInfoMixin<PostDominatorTreeAnalysis> {
friend AnalysisInfoMixin<PostDominatorTreeAnalysis>;
@@ -42,15 +45,15 @@ class PostDominatorTreeAnalysis
static AnalysisKey Key;
public:
- /// \brief Provide the result type for this analysis pass.
+ /// Provide the result type for this analysis pass.
using Result = PostDominatorTree;
- /// \brief Run the analysis pass over a function and produce a post dominator
+ /// Run the analysis pass over a function and produce a post dominator
/// tree.
PostDominatorTree run(Function &F, FunctionAnalysisManager &);
};
-/// \brief Printer pass for the \c PostDominatorTree.
+/// Printer pass for the \c PostDominatorTree.
class PostDominatorTreePrinterPass
: public PassInfoMixin<PostDominatorTreePrinterPass> {
raw_ostream &OS;
@@ -75,6 +78,8 @@ struct PostDominatorTreeWrapperPass : public FunctionPass {
bool runOnFunction(Function &F) override;
+ void verifyAnalysis() const override;
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
diff --git a/contrib/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/contrib/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
index 293033458429..58b67e74ba51 100644
--- a/contrib/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -31,7 +31,7 @@ class BasicBlock;
class BlockFrequencyInfo;
class CallSite;
class ProfileSummary;
-/// \brief Analysis providing profile information.
+/// Analysis providing profile information.
///
/// This is an immutable analysis pass that provides ability to query global
/// (program-level) profile information. The main APIs are isHotCount and
@@ -59,16 +59,16 @@ public:
ProfileSummaryInfo(ProfileSummaryInfo &&Arg)
: M(Arg.M), Summary(std::move(Arg.Summary)) {}
- /// \brief Returns true if profile summary is available.
+ /// Returns true if profile summary is available.
bool hasProfileSummary() { return computeSummary(); }
- /// \brief Returns true if module \c M has sample profile.
+ /// Returns true if module \c M has sample profile.
bool hasSampleProfile() {
return hasProfileSummary() &&
Summary->getKind() == ProfileSummary::PSK_Sample;
}
- /// \brief Returns true if module \c M has instrumentation profile.
+ /// Returns true if module \c M has instrumentation profile.
bool hasInstrumentationProfile() {
return hasProfileSummary() &&
Summary->getKind() == ProfileSummary::PSK_Instr;
@@ -90,31 +90,37 @@ public:
BlockFrequencyInfo *BFI);
/// Returns true if the working set size of the code is considered huge.
bool hasHugeWorkingSetSize();
- /// \brief Returns true if \p F has hot function entry.
+ /// Returns true if \p F has hot function entry.
bool isFunctionEntryHot(const Function *F);
/// Returns true if \p F contains hot code.
bool isFunctionHotInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
- /// \brief Returns true if \p F has cold function entry.
+ /// Returns true if \p F has cold function entry.
bool isFunctionEntryCold(const Function *F);
/// Returns true if \p F contains only cold code.
bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
- /// \brief Returns true if \p F is a hot function.
+ /// Returns true if count \p C is considered hot.
bool isHotCount(uint64_t C);
- /// \brief Returns true if count \p C is considered cold.
+ /// Returns true if count \p C is considered cold.
bool isColdCount(uint64_t C);
- /// \brief Returns true if BasicBlock \p B is considered hot.
+ /// Returns true if BasicBlock \p B is considered hot.
bool isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI);
- /// \brief Returns true if BasicBlock \p B is considered cold.
+ /// Returns true if BasicBlock \p B is considered cold.
bool isColdBB(const BasicBlock *B, BlockFrequencyInfo *BFI);
- /// \brief Returns true if CallSite \p CS is considered hot.
+ /// Returns true if CallSite \p CS is considered hot.
bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI);
- /// \brief Returns true if Callsite \p CS is considered cold.
+ /// Returns true if Callsite \p CS is considered cold.
bool isColdCallSite(const CallSite &CS, BlockFrequencyInfo *BFI);
- /// \brief Returns HotCountThreshold if set.
+ /// Returns HotCountThreshold if set. Recompute HotCountThreshold
+ /// if not set.
+ uint64_t getOrCompHotCountThreshold();
+ /// Returns ColdCountThreshold if set. Recompute ColdCountThreshold
+ /// if not set.
+ uint64_t getOrCompColdCountThreshold();
+ /// Returns HotCountThreshold if set.
uint64_t getHotCountThreshold() {
return HotCountThreshold ? HotCountThreshold.getValue() : 0;
}
- /// \brief Returns ColdCountThreshold if set.
+ /// Returns ColdCountThreshold if set.
uint64_t getColdCountThreshold() {
return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
}
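A hedged sketch of how clients typically consult these predicates (`PSI`, `F`, `CS` and `BFI` are assumed to be available):

if (PSI.hasProfileSummary() && PSI.isFunctionEntryHot(&F)) {
  // F's entry count is above the profile summary's hot threshold.
}
if (PSI.isColdCallSite(CS, &BFI)) {
  // The call site's scaled count is below the cold threshold; prefer size.
}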
@@ -152,7 +158,7 @@ private:
static AnalysisKey Key;
};
-/// \brief Printer pass that uses \c ProfileSummaryAnalysis.
+/// Printer pass that uses \c ProfileSummaryAnalysis.
class ProfileSummaryPrinterPass
: public PassInfoMixin<ProfileSummaryPrinterPass> {
raw_ostream &OS;
diff --git a/contrib/llvm/include/llvm/Analysis/PtrUseVisitor.h b/contrib/llvm/include/llvm/Analysis/PtrUseVisitor.h
index 9f156a1a6029..b34b25c75040 100644
--- a/contrib/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/contrib/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -47,7 +47,7 @@ namespace llvm {
namespace detail {
-/// \brief Implementation of non-dependent functionality for \c PtrUseVisitor.
+/// Implementation of non-dependent functionality for \c PtrUseVisitor.
///
/// See \c PtrUseVisitor for the public interface and detailed comments about
/// usage. This class is just a helper base class which is not templated and
@@ -55,7 +55,7 @@ namespace detail {
/// PtrUseVisitor.
class PtrUseVisitorBase {
public:
- /// \brief This class provides information about the result of a visit.
+ /// This class provides information about the result of a visit.
///
/// After walking all the users (recursively) of a pointer, the basic
/// infrastructure records some commonly useful information such as escape
@@ -64,7 +64,7 @@ public:
public:
PtrInfo() : AbortedInfo(nullptr, false), EscapedInfo(nullptr, false) {}
- /// \brief Reset the pointer info, clearing all state.
+ /// Reset the pointer info, clearing all state.
void reset() {
AbortedInfo.setPointer(nullptr);
AbortedInfo.setInt(false);
@@ -72,37 +72,37 @@ public:
EscapedInfo.setInt(false);
}
- /// \brief Did we abort the visit early?
+ /// Did we abort the visit early?
bool isAborted() const { return AbortedInfo.getInt(); }
- /// \brief Is the pointer escaped at some point?
+ /// Is the pointer escaped at some point?
bool isEscaped() const { return EscapedInfo.getInt(); }
- /// \brief Get the instruction causing the visit to abort.
+ /// Get the instruction causing the visit to abort.
/// \returns a pointer to the instruction causing the abort if one is
/// available; otherwise returns null.
Instruction *getAbortingInst() const { return AbortedInfo.getPointer(); }
- /// \brief Get the instruction causing the pointer to escape.
+ /// Get the instruction causing the pointer to escape.
/// \returns a pointer to the instruction which escapes the pointer if one
/// is available; otherwise returns null.
Instruction *getEscapingInst() const { return EscapedInfo.getPointer(); }
- /// \brief Mark the visit as aborted. Intended for use in a void return.
+ /// Mark the visit as aborted. Intended for use in a void return.
/// \param I The instruction which caused the visit to abort, if available.
void setAborted(Instruction *I = nullptr) {
AbortedInfo.setInt(true);
AbortedInfo.setPointer(I);
}
- /// \brief Mark the pointer as escaped. Intended for use in a void return.
+ /// Mark the pointer as escaped. Intended for use in a void return.
/// \param I The instruction which escapes the pointer, if available.
void setEscaped(Instruction *I = nullptr) {
EscapedInfo.setInt(true);
EscapedInfo.setPointer(I);
}
- /// \brief Mark the pointer as escaped, and the visit as aborted. Intended
+ /// Mark the pointer as escaped, and the visit as aborted. Intended
/// for use in a void return.
/// \param I The instruction which both escapes the pointer and aborts the
/// visit, if available.
@@ -121,10 +121,10 @@ protected:
/// \name Visitation infrastructure
/// @{
- /// \brief The info collected about the pointer being visited thus far.
+ /// The info collected about the pointer being visited thus far.
PtrInfo PI;
- /// \brief A struct of the data needed to visit a particular use.
+ /// A struct of the data needed to visit a particular use.
///
/// This is used to maintain a worklist of to-visit uses, which makes the
/// visit iterative rather than recursive.
@@ -135,10 +135,10 @@ protected:
APInt Offset;
};
- /// \brief The worklist of to-visit uses.
+ /// The worklist of to-visit uses.
SmallVector<UseToVisit, 8> Worklist;
- /// \brief A set of visited uses to break cycles in unreachable code.
+ /// A set of visited uses to break cycles in unreachable code.
SmallPtrSet<Use *, 8> VisitedUses;
/// @}
@@ -147,14 +147,14 @@ protected:
/// This state is reset for each instruction visited.
/// @{
- /// \brief The use currently being visited.
+ /// The use currently being visited.
Use *U;
- /// \brief True if we have a known constant offset for the use currently
+ /// True if we have a known constant offset for the use currently
/// being visited.
bool IsOffsetKnown;
- /// \brief The constant offset of the use if that is known.
+ /// The constant offset of the use if that is known.
APInt Offset;
/// @}
@@ -163,13 +163,13 @@ protected:
/// class, we can't create instances directly of this class.
PtrUseVisitorBase(const DataLayout &DL) : DL(DL) {}
- /// \brief Enqueue the users of this instruction in the visit worklist.
+ /// Enqueue the users of this instruction in the visit worklist.
///
/// This will visit the users with the same offset of the current visit
/// (including an unknown offset if that is the current state).
void enqueueUsers(Instruction &I);
- /// \brief Walk the operands of a GEP and adjust the offset as appropriate.
+ /// Walk the operands of a GEP and adjust the offset as appropriate.
///
/// This routine does the heavy lifting of the pointer walk by computing
/// offsets and looking through GEPs.
@@ -178,7 +178,7 @@ protected:
} // end namespace detail
-/// \brief A base class for visitors over the uses of a pointer value.
+/// A base class for visitors over the uses of a pointer value.
///
/// Once constructed, a user can call \c visit on a pointer value, and this
/// will walk its uses and visit each instruction using an InstVisitor. It also
@@ -216,7 +216,7 @@ public:
"Must pass the derived type to this template!");
}
- /// \brief Recursively visit the uses of the given pointer.
+ /// Recursively visit the uses of the given pointer.
/// \returns An info struct about the pointer. See \c PtrInfo for details.
PtrInfo visitPtr(Instruction &I) {
// This must be a pointer type. Get an integer type suitable to hold
@@ -275,7 +275,7 @@ protected:
enqueueUsers(GEPI);
}
- // No-op intrinsics which we know don't escape the pointer to to logic in
+ // No-op intrinsics which we know don't escape the pointer to logic in
// some other function.
void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) {}
void visitMemIntrinsic(MemIntrinsic &I) {}
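A minimal derived visitor, sketched under the assumption that `AI` is an AllocaInst in the function; escapes and aborts are still tracked by the base class and reported through the returned info struct:

struct LoadFinder : public PtrUseVisitor<LoadFinder> {
  bool SawLoad = false;
  LoadFinder(const DataLayout &DL) : PtrUseVisitor<LoadFinder>(DL) {}
  void visitLoadInst(LoadInst &LI) { SawLoad = true; }
};

LoadFinder Finder(AI.getModule()->getDataLayout());
auto Info = Finder.visitPtr(AI);
if (!Info.isEscaped() && !Finder.SawLoad) {
  // The allocation is never read and its address never escapes.
}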
diff --git a/contrib/llvm/include/llvm/Analysis/RegionInfo.h b/contrib/llvm/include/llvm/Analysis/RegionInfo.h
index 719622359949..27f6cc197927 100644
--- a/contrib/llvm/include/llvm/Analysis/RegionInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/RegionInfo.h
@@ -42,6 +42,7 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
@@ -62,7 +63,7 @@ class DominanceFrontier;
class DominatorTree;
class Loop;
class LoopInfo;
-struct PostDominatorTree;
+class PostDominatorTree;
class Region;
template <class RegionTr> class RegionBase;
class RegionInfo;
@@ -102,7 +103,7 @@ struct RegionTraits<Function> {
}
};
-/// @brief Marker class to iterate over the elements of a Region in flat mode.
+/// Marker class to iterate over the elements of a Region in flat mode.
///
/// The class is used to either iterate in Flat mode or by not using it to not
/// iterate in Flat mode. During a Flat mode iteration all Regions are entered
@@ -112,7 +113,7 @@ struct RegionTraits<Function> {
template <class GraphType>
class FlatIt {};
-/// @brief A RegionNode represents a subregion or a BasicBlock that is part of a
+/// A RegionNode represents a subregion or a BasicBlock that is part of a
/// Region.
template <class Tr>
class RegionNodeBase {
@@ -135,12 +136,12 @@ private:
/// RegionNode.
PointerIntPair<BlockT *, 1, bool> entry;
- /// @brief The parent Region of this RegionNode.
+ /// The parent Region of this RegionNode.
/// @see getParent()
RegionT *parent;
protected:
- /// @brief Create a RegionNode.
+ /// Create a RegionNode.
///
/// @param Parent The parent of this RegionNode.
/// @param Entry The entry BasicBlock of the RegionNode. If this
@@ -156,7 +157,7 @@ public:
RegionNodeBase(const RegionNodeBase &) = delete;
RegionNodeBase &operator=(const RegionNodeBase &) = delete;
- /// @brief Get the parent Region of this RegionNode.
+ /// Get the parent Region of this RegionNode.
///
/// The parent Region is the Region this RegionNode belongs to. If for
/// example a BasicBlock is element of two Regions, there exist two
@@ -166,7 +167,7 @@ public:
/// @return Get the parent Region of this RegionNode.
inline RegionT *getParent() const { return parent; }
- /// @brief Get the entry BasicBlock of this RegionNode.
+ /// Get the entry BasicBlock of this RegionNode.
///
/// If this RegionNode represents a BasicBlock this is just the BasicBlock
/// itself, otherwise we return the entry BasicBlock of the Subregion
@@ -174,7 +175,7 @@ public:
/// @return The entry BasicBlock of this RegionNode.
inline BlockT *getEntry() const { return entry.getPointer(); }
- /// @brief Get the content of this RegionNode.
+ /// Get the content of this RegionNode.
///
/// This can be either a BasicBlock or a subregion. Before calling getNodeAs()
/// check the type of the content with the isSubRegion() function call.
@@ -182,7 +183,7 @@ public:
/// @return The content of this RegionNode.
template <class T> inline T *getNodeAs() const;
- /// @brief Is this RegionNode a subregion?
+ /// Is this RegionNode a subregion?
///
/// @return True if it contains a subregion. False if it contains a
/// BasicBlock.
@@ -190,7 +191,7 @@ public:
};
//===----------------------------------------------------------------------===//
-/// @brief A single entry single exit Region.
+/// A single entry single exit Region.
///
/// A Region is a connected subgraph of a control flow graph that has exactly
/// two connections to the remaining graph. It can be used to analyze or
@@ -301,7 +302,7 @@ class RegionBase : public RegionNodeBase<Tr> {
void verifyRegionNest() const;
public:
- /// @brief Create a new region.
+ /// Create a new region.
///
/// @param Entry The entry basic block of the region.
/// @param Exit The exit basic block of the region.
@@ -318,25 +319,25 @@ public:
/// Delete the Region and all its subregions.
~RegionBase();
- /// @brief Get the entry BasicBlock of the Region.
+ /// Get the entry BasicBlock of the Region.
/// @return The entry BasicBlock of the region.
BlockT *getEntry() const {
return RegionNodeBase<Tr>::getEntry();
}
- /// @brief Replace the entry basic block of the region with the new basic
+ /// Replace the entry basic block of the region with the new basic
/// block.
///
/// @param BB The new entry basic block of the region.
void replaceEntry(BlockT *BB);
- /// @brief Replace the exit basic block of the region with the new basic
+ /// Replace the exit basic block of the region with the new basic
/// block.
///
/// @param BB The new exit basic block of the region.
void replaceExit(BlockT *BB);
- /// @brief Recursively replace the entry basic block of the region.
+ /// Recursively replace the entry basic block of the region.
///
/// This function replaces the entry basic block with a new basic block. It
/// also updates all child regions that have the same entry basic block as
@@ -345,7 +346,7 @@ public:
/// @param NewEntry The new entry basic block.
void replaceEntryRecursive(BlockT *NewEntry);
- /// @brief Recursively replace the exit basic block of the region.
+ /// Recursively replace the exit basic block of the region.
///
/// This function replaces the exit basic block with a new basic block. It
/// also updates all child regions that have the same exit basic block as
@@ -354,38 +355,38 @@ public:
/// @param NewExit The new exit basic block.
void replaceExitRecursive(BlockT *NewExit);
- /// @brief Get the exit BasicBlock of the Region.
+ /// Get the exit BasicBlock of the Region.
/// @return The exit BasicBlock of the Region, NULL if this is the TopLevel
/// Region.
BlockT *getExit() const { return exit; }
- /// @brief Get the parent of the Region.
+ /// Get the parent of the Region.
/// @return The parent of the Region or NULL if this is a top level
/// Region.
RegionT *getParent() const {
return RegionNodeBase<Tr>::getParent();
}
- /// @brief Get the RegionNode representing the current Region.
+ /// Get the RegionNode representing the current Region.
/// @return The RegionNode representing the current Region.
RegionNodeT *getNode() const {
return const_cast<RegionNodeT *>(
reinterpret_cast<const RegionNodeT *>(this));
}
- /// @brief Get the nesting level of this Region.
+ /// Get the nesting level of this Region.
///
/// A toplevel Region has depth 0.
///
/// @return The depth of the region.
unsigned getDepth() const;
- /// @brief Check if a Region is the TopLevel region.
+ /// Check if a Region is the TopLevel region.
///
/// The toplevel region represents the whole function.
bool isTopLevelRegion() const { return exit == nullptr; }
- /// @brief Return a new (non-canonical) region, that is obtained by joining
+ /// Return a new (non-canonical) region, that is obtained by joining
/// this region with its predecessors.
///
/// @return A region also starting at getEntry(), but reaching to the next
@@ -393,43 +394,43 @@ public:
/// NULL if such a basic block does not exist.
RegionT *getExpandedRegion() const;
- /// @brief Return the first block of this region's single entry edge,
+ /// Return the first block of this region's single entry edge,
/// if it exists.
///
/// @return The BasicBlock starting this region's single entry edge,
/// else NULL.
BlockT *getEnteringBlock() const;
- /// @brief Return the first block of this region's single exit edge,
+ /// Return the first block of this region's single exit edge,
/// if it exists.
///
/// @return The BasicBlock starting this region's single exit edge,
/// else NULL.
BlockT *getExitingBlock() const;
- /// @brief Collect all blocks of this region's single exit edge, if existing.
+ /// Collect all blocks of this region's single exit edge, if such an edge exists.
///
/// @return True if this region contains all the predecessors of the exit.
bool getExitingBlocks(SmallVectorImpl<BlockT *> &Exitings) const;
- /// @brief Is this a simple region?
+ /// Is this a simple region?
///
/// A region is simple if it has exactly one exit and one entry edge.
///
/// @return True if the Region is simple.
bool isSimple() const;
- /// @brief Returns the name of the Region.
+ /// Returns the name of the Region.
/// @return The Name of the Region.
std::string getNameStr() const;
- /// @brief Return the RegionInfo object, that belongs to this Region.
+ /// Return the RegionInfo object, that belongs to this Region.
RegionInfoT *getRegionInfo() const { return RI; }
/// PrintStyle - Print region in difference ways.
enum PrintStyle { PrintNone, PrintBB, PrintRN };
- /// @brief Print the region.
+ /// Print the region.
///
/// @param OS The output stream the Region is printed to.
/// @param printTree Print also the tree of subregions.
@@ -438,17 +439,17 @@ public:
PrintStyle Style = PrintNone) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// @brief Print the region to stderr.
+ /// Print the region to stderr.
void dump() const;
#endif
- /// @brief Check if the region contains a BasicBlock.
+ /// Check if the region contains a BasicBlock.
///
/// @param BB The BasicBlock that might be contained in this Region.
/// @return True if the block is contained in the region otherwise false.
bool contains(const BlockT *BB) const;
- /// @brief Check if the region contains another region.
+ /// Check if the region contains another region.
///
/// @param SubRegion The region that might be contained in this Region.
/// @return True if SubRegion is contained in the region otherwise false.
@@ -462,14 +463,14 @@ public:
SubRegion->getExit() == getExit());
}
- /// @brief Check if the region contains an Instruction.
+ /// Check if the region contains an Instruction.
///
/// @param Inst The Instruction that might be contained in this region.
/// @return True if the Instruction is contained in the region otherwise
/// false.
bool contains(const InstT *Inst) const { return contains(Inst->getParent()); }
- /// @brief Check if the region contains a loop.
+ /// Check if the region contains a loop.
///
/// @param L The loop that might be contained in this region.
/// @return True if the loop is contained in the region otherwise false.
@@ -478,7 +479,7 @@ public:
/// In that case true is returned.
bool contains(const LoopT *L) const;
- /// @brief Get the outermost loop in the region that contains a loop.
+ /// Get the outermost loop in the region that contains a loop.
///
/// Find for a Loop L the outermost loop OuterL that is a parent loop of L
/// and is itself contained in the region.
@@ -488,7 +489,7 @@ public:
/// exist or if the region describes the whole function.
LoopT *outermostLoopInRegion(LoopT *L) const;
- /// @brief Get the outermost loop in the region that contains a basic block.
+ /// Get the outermost loop in the region that contains a basic block.
///
/// Find for a basic block BB the outermost loop L that contains BB and is
/// itself contained in the region.
@@ -499,13 +500,13 @@ public:
/// exist or if the region describes the whole function.
LoopT *outermostLoopInRegion(LoopInfoT *LI, BlockT *BB) const;
- /// @brief Get the subregion that starts at a BasicBlock
+ /// Get the subregion that starts at a BasicBlock
///
/// @param BB The BasicBlock the subregion should start.
/// @return The Subregion if available, otherwise NULL.
RegionT *getSubRegionNode(BlockT *BB) const;
- /// @brief Get the RegionNode for a BasicBlock
+ /// Get the RegionNode for a BasicBlock
///
/// @param BB The BasicBlock at which the RegionNode should start.
/// @return If available, the RegionNode that represents the subregion
@@ -513,38 +514,38 @@ public:
/// representing BB.
RegionNodeT *getNode(BlockT *BB) const;
- /// @brief Get the BasicBlock RegionNode for a BasicBlock
+ /// Get the BasicBlock RegionNode for a BasicBlock
///
/// @param BB The BasicBlock for which the RegionNode is requested.
/// @return The RegionNode representing the BB.
RegionNodeT *getBBNode(BlockT *BB) const;
- /// @brief Add a new subregion to this Region.
+ /// Add a new subregion to this Region.
///
/// @param SubRegion The new subregion that will be added.
/// @param moveChildren Move the children of this region, that are also
/// contained in SubRegion into SubRegion.
void addSubRegion(RegionT *SubRegion, bool moveChildren = false);
- /// @brief Remove a subregion from this Region.
+ /// Remove a subregion from this Region.
///
/// The subregion is not deleted, as it will probably be inserted into another
/// region.
/// @param SubRegion The SubRegion that will be removed.
RegionT *removeSubRegion(RegionT *SubRegion);
- /// @brief Move all direct child nodes of this Region to another Region.
+ /// Move all direct child nodes of this Region to another Region.
///
/// @param To The Region the child nodes will be transferred to.
void transferChildrenTo(RegionT *To);
- /// @brief Verify if the region is a correct region.
+ /// Verify if the region is a correct region.
///
/// Check if this is a correctly built Region. This is an expensive check, as
/// the complete CFG of the Region will be walked.
void verifyRegion() const;
- /// @brief Clear the cache for BB RegionNodes.
+ /// Clear the cache for BB RegionNodes.
///
/// After calling this function the BasicBlock RegionNodes will be stored at
/// different memory locations. RegionNodes obtained before this function is
@@ -620,12 +621,12 @@ public:
using block_range = iterator_range<block_iterator>;
using const_block_range = iterator_range<const_block_iterator>;
- /// @brief Returns a range view of the basic blocks in the region.
+ /// Returns a range view of the basic blocks in the region.
inline block_range blocks() {
return block_range(block_begin(), block_end());
}
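The block_range view above allows a plain range-for over a region's basic blocks; a minimal sketch, assuming a Region obtained from RegionInfo:

    #include "llvm/Analysis/RegionInfo.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/Support/raw_ostream.h"

    // Hypothetical helper: print the name of every basic block in the region.
    static void printRegionBlockNames(llvm::Region &R, llvm::raw_ostream &OS) {
      for (llvm::BasicBlock *BB : R.blocks())
        OS << BB->getName() << "\n";
    }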
- /// @brief Returns a range view of the basic blocks in the region.
+ /// Returns a range view of the basic blocks in the region.
///
/// This is the 'const' version of the range view.
inline const_block_range blocks() const {
@@ -667,7 +668,7 @@ template <class Tr>
inline raw_ostream &operator<<(raw_ostream &OS, const RegionNodeBase<Tr> &Node);
//===----------------------------------------------------------------------===//
-/// @brief Analysis that detects all canonical Regions.
+/// Analysis that detects all canonical Regions.
///
/// The RegionInfo pass detects all canonical regions in a function. The Regions
/// are connected using the parent relation. This builds a Program Structure
@@ -725,7 +726,7 @@ class RegionInfoBase {
BBtoRegionMap BBtoRegion;
protected:
- /// \brief Update references to a RegionInfoT held by the RegionT managed here
+ /// Update references to a RegionInfoT held by the RegionT managed here
///
/// This is a post-move helper. Regions hold references to the owning
/// RegionInfo object. After a move these need to be fixed.
@@ -739,7 +740,7 @@ protected:
}
private:
- /// \brief Wipe this region tree's state without releasing any resources.
+ /// Wipe this region tree's state without releasing any resources.
///
/// This is essentially a post-move helper only. It leaves the object in an
/// assignable and destroyable state, but otherwise invalid.
@@ -811,40 +812,40 @@ public:
void releaseMemory();
- /// @brief Get the smallest region that contains a BasicBlock.
+ /// Get the smallest region that contains a BasicBlock.
///
/// @param BB The basic block.
/// @return The smallest region, that contains BB or NULL, if there is no
/// region containing BB.
RegionT *getRegionFor(BlockT *BB) const;
- /// @brief Set the smallest region that surrounds a basic block.
+ /// Set the smallest region that surrounds a basic block.
///
/// @param BB The basic block surrounded by a region.
/// @param R The smallest region that surrounds BB.
void setRegionFor(BlockT *BB, RegionT *R);
- /// @brief A shortcut for getRegionFor().
+ /// A shortcut for getRegionFor().
///
/// @param BB The basic block.
/// @return The smallest region, that contains BB or NULL, if there is no
/// region containing BB.
RegionT *operator[](BlockT *BB) const;
- /// @brief Return the exit of the maximal refined region, that starts at a
+ /// Return the exit of the maximal refined region, that starts at a
/// BasicBlock.
///
/// @param BB The BasicBlock at which the refined region starts.
BlockT *getMaxRegionExit(BlockT *BB) const;
- /// @brief Find the smallest region that contains two regions.
+ /// Find the smallest region that contains two regions.
///
/// @param A The first region.
/// @param B The second region.
/// @return The smallest region containing A and B.
RegionT *getCommonRegion(RegionT *A, RegionT *B) const;
- /// @brief Find the smallest region that contains two basic blocks.
+ /// Find the smallest region that contains two basic blocks.
///
/// @param A The first basic block.
/// @param B The second basic block.
@@ -853,13 +854,13 @@ public:
return getCommonRegion(getRegionFor(A), getRegionFor(B));
}
- /// @brief Find the smallest region that contains a set of regions.
+ /// Find the smallest region that contains a set of regions.
///
/// @param Regions A vector of regions.
/// @return The smallest region that contains all regions in Regions.
RegionT *getCommonRegion(SmallVectorImpl<RegionT *> &Regions) const;
- /// @brief Find the smallest region that contains a set of basic blocks.
+ /// Find the smallest region that contains a set of basic blocks.
///
/// @param BBs A vector of basic blocks.
/// @return The smallest region that contains all basic blocks in BBS.
@@ -867,7 +868,7 @@ public:
RegionT *getTopLevelRegion() const { return TopLevelRegion; }
- /// @brief Clear the Node Cache for all Regions.
+ /// Clear the Node Cache for all Regions.
///
/// @see Region::clearNodeCache()
void clearNodeCache() {
@@ -930,12 +931,12 @@ public:
DominanceFrontier *DF);
#ifndef NDEBUG
- /// @brief Opens a viewer to show the GraphViz visualization of the regions.
+ /// Opens a viewer to show the GraphViz visualization of the regions.
///
/// Useful during debugging as an alternative to dump().
void view();
- /// @brief Opens a viewer to show the GraphViz visualization of this region
+ /// Opens a viewer to show the GraphViz visualization of this region
/// without instructions in the BasicBlocks.
///
/// Useful during debugging as an alternative to dump().
@@ -967,7 +968,7 @@ public:
//@}
};
-/// \brief Analysis pass that exposes the \c RegionInfo for a function.
+/// Analysis pass that exposes the \c RegionInfo for a function.
class RegionInfoAnalysis : public AnalysisInfoMixin<RegionInfoAnalysis> {
friend AnalysisInfoMixin<RegionInfoAnalysis>;
@@ -979,7 +980,7 @@ public:
RegionInfo run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Printer pass for the \c RegionInfo.
+/// Printer pass for the \c RegionInfo.
class RegionInfoPrinterPass : public PassInfoMixin<RegionInfoPrinterPass> {
raw_ostream &OS;
@@ -989,7 +990,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Verifier pass for the \c RegionInfo.
+/// Verifier pass for the \c RegionInfo.
struct RegionInfoVerifierPass : PassInfoMixin<RegionInfoVerifierPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/contrib/llvm/include/llvm/Analysis/RegionInfoImpl.h b/contrib/llvm/include/llvm/Analysis/RegionInfoImpl.h
index eb6baac2d5e4..5904214aa925 100644
--- a/contrib/llvm/include/llvm/Analysis/RegionInfoImpl.h
+++ b/contrib/llvm/include/llvm/Analysis/RegionInfoImpl.h
@@ -22,6 +22,7 @@
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -674,7 +675,7 @@ typename Tr::RegionT *RegionInfoBase<Tr>::createRegion(BlockT *entry,
#ifdef EXPENSIVE_CHECKS
region->verifyRegion();
#else
- DEBUG(region->verifyRegion());
+ LLVM_DEBUG(region->verifyRegion());
#endif
updateStatistics(region);
diff --git a/contrib/llvm/include/llvm/Analysis/RegionIterator.h b/contrib/llvm/include/llvm/Analysis/RegionIterator.h
index 4f823cc82210..4fd92fcde20b 100644
--- a/contrib/llvm/include/llvm/Analysis/RegionIterator.h
+++ b/contrib/llvm/include/llvm/Analysis/RegionIterator.h
@@ -26,7 +26,7 @@ namespace llvm {
class BasicBlock;
//===----------------------------------------------------------------------===//
-/// @brief Hierarchical RegionNode successor iterator.
+/// Hierarchical RegionNode successor iterator.
///
/// This iterator iterates over all successors of a RegionNode.
///
@@ -102,7 +102,7 @@ public:
using Self = RNSuccIterator<NodeRef, BlockT, RegionT>;
using value_type = typename super::value_type;
- /// @brief Create begin iterator of a RegionNode.
+ /// Create begin iterator of a RegionNode.
inline RNSuccIterator(NodeRef node)
: Node(node, node->isSubRegion() ? ItRgBegin : ItBB),
BItor(BlockTraits::child_begin(node->getEntry())) {
@@ -115,7 +115,7 @@ public:
advanceRegionSucc();
}
- /// @brief Create an end iterator.
+ /// Create an end iterator.
inline RNSuccIterator(NodeRef node, bool)
: Node(node, node->isSubRegion() ? ItRgEnd : ItBB),
BItor(BlockTraits::child_end(node->getEntry())) {}
@@ -158,7 +158,7 @@ public:
};
//===----------------------------------------------------------------------===//
-/// @brief Flat RegionNode iterator.
+/// Flat RegionNode iterator.
///
/// The Flat Region iterator will iterate over all BasicBlock RegionNodes that
/// are contained in the Region and its subregions. This is close to a virtual
@@ -177,7 +177,7 @@ public:
using Self = RNSuccIterator<FlatIt<NodeRef>, BlockT, RegionT>;
using value_type = typename super::value_type;
- /// @brief Create the iterator from a RegionNode.
+ /// Create the iterator from a RegionNode.
///
/// Note that the incoming node must be a bb node, otherwise it will trigger
/// an assertion when we try to get a BasicBlock.
@@ -193,7 +193,7 @@ public:
++Itor;
}
- /// @brief Create an end iterator
+ /// Create an end iterator
inline RNSuccIterator(NodeRef node, bool)
: Node(node), Itor(BlockTraits::child_end(node->getEntry())) {
assert(!Node->isSubRegion() &&
diff --git a/contrib/llvm/include/llvm/Analysis/RegionPass.h b/contrib/llvm/include/llvm/Analysis/RegionPass.h
index 515b362e5407..b3da91c89cbd 100644
--- a/contrib/llvm/include/llvm/Analysis/RegionPass.h
+++ b/contrib/llvm/include/llvm/Analysis/RegionPass.h
@@ -28,7 +28,7 @@ class RGPassManager;
class Function;
//===----------------------------------------------------------------------===//
-/// @brief A pass that runs on each Region in a function.
+/// A pass that runs on each Region in a function.
///
/// RegionPass is managed by RGPassManager.
class RegionPass : public Pass {
@@ -39,7 +39,7 @@ public:
/// @name To be implemented by every RegionPass
///
//@{
- /// @brief Run the pass on a specific Region
+ /// Run the pass on a specific Region
///
/// Accessing regions not contained in the current region is not allowed.
///
@@ -49,7 +49,7 @@ public:
/// @return True if the pass modifies this Region.
virtual bool runOnRegion(Region *R, RGPassManager &RGM) = 0;
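For orientation, a minimal sketch of a concrete RegionPass; the pass name is illustrative and the usual registration boilerplate is omitted:

    #include "llvm/Analysis/RegionInfo.h"
    #include "llvm/Analysis/RegionPass.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    namespace {
    // Hypothetical example pass: print the name of every region it visits.
    struct PrintRegionNames : public RegionPass {
      static char ID;
      PrintRegionNames() : RegionPass(ID) {}

      bool runOnRegion(Region *R, RGPassManager &RGM) override {
        errs() << "region: " << R->getNameStr() << "\n";
        return false; // the region is not modified
      }
    };
    } // end anonymous namespace

    char PrintRegionNames::ID = 0;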
- /// @brief Get a pass to print the LLVM IR in the region.
+ /// Get a pass to print the LLVM IR in the region.
///
/// @param O The output stream to print the Region.
/// @param Banner The banner to separate different printed passes.
@@ -85,7 +85,7 @@ protected:
bool skipRegion(Region &R) const;
};
-/// @brief The pass manager to schedule RegionPasses.
+/// The pass manager to schedule RegionPasses.
class RGPassManager : public FunctionPass, public PMDataManager {
std::deque<Region*> RQ;
bool skipThisRegion;
@@ -97,7 +97,7 @@ public:
static char ID;
explicit RGPassManager();
- /// @brief Execute all of the passes scheduled for execution.
+ /// Execute all of the passes scheduled for execution.
///
/// @return True if any of the passes modifies the function.
bool runOnFunction(Function &F) override;
@@ -111,10 +111,10 @@ public:
PMDataManager *getAsPMDataManager() override { return this; }
Pass *getAsPass() override { return this; }
- /// @brief Print passes managed by this manager.
+ /// Print passes managed by this manager.
void dumpPassStructure(unsigned Offset) override;
- /// @brief Get passes contained by this manager.
+ /// Get passes contained by this manager.
Pass *getContainedPass(unsigned N) {
assert(N < PassVector.size() && "Pass number out of range!");
Pass *FP = static_cast<Pass *>(PassVector[N]);
diff --git a/contrib/llvm/include/llvm/Analysis/RegionPrinter.h b/contrib/llvm/include/llvm/Analysis/RegionPrinter.h
index 8f0035cfd8e6..e132eaea5674 100644
--- a/contrib/llvm/include/llvm/Analysis/RegionPrinter.h
+++ b/contrib/llvm/include/llvm/Analysis/RegionPrinter.h
@@ -26,7 +26,7 @@ namespace llvm {
FunctionPass *createRegionOnlyPrinterPass();
#ifndef NDEBUG
- /// @brief Open a viewer to display the GraphViz visualization of the analysis
+ /// Open a viewer to display the GraphViz visualization of the analysis
/// result.
///
/// Practical to call in the debugger.
@@ -35,7 +35,7 @@ namespace llvm {
/// @param RI The analysis to display.
void viewRegion(llvm::RegionInfo *RI);
- /// @brief Analyze the regions of a function and open its GraphViz
+ /// Analyze the regions of a function and open its GraphViz
/// visualization in a viewer.
///
/// Useful to call in the debugger.
@@ -46,7 +46,7 @@ namespace llvm {
/// @param F Function to analyze.
void viewRegion(const llvm::Function *F);
- /// @brief Open a viewer to display the GraphViz visualization of the analysis
+ /// Open a viewer to display the GraphViz visualization of the analysis
/// result.
///
/// Useful to call in the debugger.
@@ -55,7 +55,7 @@ namespace llvm {
/// @param RI The analysis to display.
void viewRegionOnly(llvm::RegionInfo *RI);
- /// @brief Analyze the regions of a function and open its GraphViz
+ /// Analyze the regions of a function and open its GraphViz
/// visualization in a viewer.
///
/// Useful to call in the debugger.
diff --git a/contrib/llvm/include/llvm/Analysis/ScalarEvolution.h b/contrib/llvm/include/llvm/Analysis/ScalarEvolution.h
index 21b72f3e13c2..89918e3c205b 100644
--- a/contrib/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/contrib/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -587,7 +587,9 @@ public:
const SCEV *getUMaxExpr(const SCEV *LHS, const SCEV *RHS);
const SCEV *getUMaxExpr(SmallVectorImpl<const SCEV *> &Operands);
const SCEV *getSMinExpr(const SCEV *LHS, const SCEV *RHS);
+ const SCEV *getSMinExpr(SmallVectorImpl<const SCEV *> &Operands);
const SCEV *getUMinExpr(const SCEV *LHS, const SCEV *RHS);
+ const SCEV *getUMinExpr(SmallVectorImpl<const SCEV *> &Operands);
const SCEV *getUnknown(Value *V);
const SCEV *getCouldNotCompute();
@@ -650,6 +652,10 @@ public:
/// then perform a umin operation with them.
const SCEV *getUMinFromMismatchedTypes(const SCEV *LHS, const SCEV *RHS);
+ /// Promote the operands to the wider of the types using zero-extension, and
+ /// then perform a umin operation with them. N-ary function.
+ const SCEV *getUMinFromMismatchedTypes(SmallVectorImpl<const SCEV *> &Ops);
+
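A short sketch of the new n-ary overload; SE is assumed to be a ScalarEvolution reference and CountA/CountB/CountC previously computed SCEVs available at the call site (all names illustrative):

    // Zero-extend all operands to the widest type, then take their unsigned
    // minimum, e.g. to combine exit counts of differing bit widths.
    llvm::SmallVector<const llvm::SCEV *, 4> Ops = {CountA, CountB, CountC};
    const llvm::SCEV *MinCount = SE.getUMinFromMismatchedTypes(Ops);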
/// Transitively follow the chain of pointer-type operands until reaching a
/// SCEV that does not have a single pointer operand. This returns a
/// SCEVUnknown pointer for well-formed pointer-type expressions, but corner
@@ -678,7 +684,7 @@ public:
const SCEV *LHS, const SCEV *RHS);
/// Test whether the backedge of the loop is protected by a conditional
- /// between LHS and RHS. This is used to to eliminate casts.
+ /// between LHS and RHS. This is used to eliminate casts.
bool isLoopBackedgeGuardedByCond(const Loop *L, ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS);
@@ -764,6 +770,12 @@ public:
/// loop bodies.
void forgetLoop(const Loop *L);
+ // This method invokes forgetLoop for the outermost loop of the given loop
+ // \p L, making ScalarEvolution forget about this whole subtree. This needs to
+ // be done whenever we make a transform that may affect the parameters of the
+ // outer loop, such as exit counts for branches.
+ void forgetTopmostLoop(const Loop *L);
+
/// This method should be called by the client when it has changed a value
/// in a way that may affect its value, or which may disconnect it from a
/// def-use chain linking it to a loop.
@@ -829,11 +841,56 @@ public:
/// Test if the given expression is known to be non-zero.
bool isKnownNonZero(const SCEV *S);
+ /// Splits SCEV expression \p S into two SCEVs. One of them is obtained from
+ /// \p S by substitution of all AddRec sub-expression related to loop \p L
+ /// with initial value of that SCEV. The second is obtained from \p S by
+ /// substitution of all AddRec sub-expressions related to loop \p L with post
+ /// increment of this AddRec in the loop \p L. In both cases all other AddRec
+ /// sub-expressions (not related to \p L) remain the same.
+ /// If \p S contains a non-invariant unknown SCEV, the function returns the
+ /// CouldNotCompute SCEV in both members of the pair.
+ /// For example, for SCEV S={0, +, 1}<L1> + {0, +, 1}<L2> and loop L=L1
+ /// the function returns pair:
+ /// first = {0, +, 1}<L2>
+ /// second = {1, +, 1}<L1> + {0, +, 1}<L2>
+ /// We can see that for the first AddRec sub-expression it was replaced with
+ /// 0 (initial value) for the first element and to {1, +, 1}<L1> (post
+ /// increment value) for the second one. In both cases AddRec expression
+ /// related to L2 remains the same.
+ std::pair<const SCEV *, const SCEV *> SplitIntoInitAndPostInc(const Loop *L,
+ const SCEV *S);
+
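A brief usage sketch of SplitIntoInitAndPostInc, assuming SE, L and S (a ScalarEvolution reference, a Loop pointer and a SCEV pointer) are available at the call site:

    // First element: S with L's AddRecs replaced by their start values.
    // Second element: S with L's AddRecs replaced by their post-increment form.
    // Both are CouldNotCompute if S contains a non-invariant SCEVUnknown.
    std::pair<const llvm::SCEV *, const llvm::SCEV *> Split =
        SE.SplitIntoInitAndPostInc(L, S);
    const llvm::SCEV *InitValue = Split.first;
    const llvm::SCEV *PostIncValue = Split.second;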
+ /// We'd like to check the predicate on every iteration of the most dominated
+ /// loop among the loops used in LHS and RHS.
+ /// To do this we use the following list of steps:
+ /// 1. Collect the set S of all loops on which either LHS or RHS depends.
+ /// 2. If S is non-empty
+ /// a. Let PD be the element of S which is dominated by all other elements.
+ /// b. Let E(LHS) be the value of LHS on entry of PD.
+ /// To get E(LHS), we should just take LHS and replace all AddRecs that are
+ /// attached to PD with their entry values.
+ /// Define E(RHS) in the same way.
+ /// c. Let B(LHS) be the value of LHS on the backedge of PD.
+ /// To get B(LHS), we should just take LHS and replace all AddRecs that are
+ /// attached to PD with their backedge values.
+ /// Define B(RHS) in the same way.
+ /// d. Note that E(LHS) and E(RHS) are automatically available on entry of PD,
+ /// so we can assert on that.
+ /// e. Return true if isLoopEntryGuardedByCond(Pred, E(LHS), E(RHS)) &&
+ /// isLoopBackedgeGuardedByCond(Pred, B(LHS), B(RHS))
+ bool isKnownViaInduction(ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS);
+
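At the call site the step list above collapses into a single query; a minimal sketch, with SE, LHS and RHS assumed to exist:

    // Ask whether LHS <=u RHS can be proven by induction over the loops that
    // the two expressions depend on.
    if (SE.isKnownViaInduction(llvm::ICmpInst::ICMP_ULE, LHS, RHS)) {
      // The predicate holds on every iteration of the most dominated loop.
    }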
/// Test if the given expression is known to satisfy the condition described
/// by Pred, LHS, and RHS.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS);
+ /// Test if the condition described by Pred, LHS, RHS is known to be true on
+ /// every iteration of the loop of the recurrency LHS.
+ bool isKnownOnEveryIteration(ICmpInst::Predicate Pred,
+ const SCEVAddRecExpr *LHS, const SCEV *RHS);
+
/// Return true if, for all loop invariant X, the predicate "LHS `Pred` X"
/// is monotonically increasing or decreasing. In the former case set
/// `Increasing` to true and in the latter case set `Increasing` to false.
@@ -1040,7 +1097,7 @@ private:
/// The target library information for the target we are targeting.
TargetLibraryInfo &TLI;
- /// The tracker for @llvm.assume intrinsics in this function.
+ /// The tracker for \@llvm.assume intrinsics in this function.
AssumptionCache &AC;
/// The dominator tree.
@@ -1094,6 +1151,12 @@ private:
/// Mark predicate values currently being processed by isImpliedCond.
SmallPtrSet<Value *, 6> PendingLoopPredicates;
+ /// Mark SCEVUnknown Phis currently being processed by getRangeRef.
+ SmallPtrSet<const PHINode *, 6> PendingPhiRanges;
+
+ // Mark SCEVUnknown Phis currently being processed by isImpliedViaMerge.
+ SmallPtrSet<const PHINode *, 6> PendingMerges;
+
/// Set to true by isLoopBackedgeGuardedByCond when we're walking the set of
/// conditions dominating the backedge of a loop.
bool WalkingBEDominatingConds = false;
@@ -1240,7 +1303,7 @@ private:
/// If we allowed SCEV predicates to be generated when populating this
/// vector, this information can contain them and therefore a
/// SCEVPredicate argument should be added to getExact.
- const SCEV *getExact(ScalarEvolution *SE,
+ const SCEV *getExact(const Loop *L, ScalarEvolution *SE,
SCEVUnionPredicate *Predicates = nullptr) const;
/// Return the number of times this loop exit may fall through to the back
@@ -1424,8 +1487,7 @@ private:
bool AllowPredicates = false);
/// Compute the number of times the backedge of the specified loop will
- /// execute if its exit condition were a conditional branch of ExitCond,
- /// TBB, and FBB.
+ /// execute if its exit condition were a conditional branch of ExitCond.
///
/// \p ControlsExit is true if ExitCond directly controls the exit
/// branch. In this case, we can assume that the loop exits only if the
@@ -1435,15 +1497,14 @@ private:
/// If \p AllowPredicates is set, this call will try to use a minimal set of
/// SCEV predicates in order to return an exact answer.
ExitLimit computeExitLimitFromCond(const Loop *L, Value *ExitCond,
- BasicBlock *TBB, BasicBlock *FBB,
- bool ControlsExit,
+ bool ExitIfTrue, bool ControlsExit,
bool AllowPredicates = false);
// Helper functions for computeExitLimitFromCond to avoid exponential time
// complexity.
class ExitLimitCache {
- // It may look like we need to key on the whole (L, TBB, FBB, ControlsExit,
+ // It may look like we need to key on the whole (L, ExitIfTrue, ControlsExit,
// AllowPredicates) tuple, but recursive calls to
// computeExitLimitFromCondCached from computeExitLimitFromCondImpl only
// vary in the \c ExitCond and \c ControlsExit parameters. We remember the
@@ -1451,43 +1512,39 @@ private:
SmallDenseMap<PointerIntPair<Value *, 1>, ExitLimit> TripCountMap;
const Loop *L;
- BasicBlock *TBB;
- BasicBlock *FBB;
+ bool ExitIfTrue;
bool AllowPredicates;
public:
- ExitLimitCache(const Loop *L, BasicBlock *TBB, BasicBlock *FBB,
- bool AllowPredicates)
- : L(L), TBB(TBB), FBB(FBB), AllowPredicates(AllowPredicates) {}
+ ExitLimitCache(const Loop *L, bool ExitIfTrue, bool AllowPredicates)
+ : L(L), ExitIfTrue(ExitIfTrue), AllowPredicates(AllowPredicates) {}
- Optional<ExitLimit> find(const Loop *L, Value *ExitCond, BasicBlock *TBB,
- BasicBlock *FBB, bool ControlsExit,
- bool AllowPredicates);
+ Optional<ExitLimit> find(const Loop *L, Value *ExitCond, bool ExitIfTrue,
+ bool ControlsExit, bool AllowPredicates);
- void insert(const Loop *L, Value *ExitCond, BasicBlock *TBB,
- BasicBlock *FBB, bool ControlsExit, bool AllowPredicates,
- const ExitLimit &EL);
+ void insert(const Loop *L, Value *ExitCond, bool ExitIfTrue,
+ bool ControlsExit, bool AllowPredicates, const ExitLimit &EL);
};
using ExitLimitCacheTy = ExitLimitCache;
ExitLimit computeExitLimitFromCondCached(ExitLimitCacheTy &Cache,
const Loop *L, Value *ExitCond,
- BasicBlock *TBB, BasicBlock *FBB,
+ bool ExitIfTrue,
bool ControlsExit,
bool AllowPredicates);
ExitLimit computeExitLimitFromCondImpl(ExitLimitCacheTy &Cache, const Loop *L,
- Value *ExitCond, BasicBlock *TBB,
- BasicBlock *FBB, bool ControlsExit,
+ Value *ExitCond, bool ExitIfTrue,
+ bool ControlsExit,
bool AllowPredicates);
/// Compute the number of times the backedge of the specified loop will
/// execute if its exit condition were a conditional branch of the ICmpInst
- /// ExitCond, TBB, and FBB. If AllowPredicates is set, this call will try
+ /// ExitCond and ExitIfTrue. If AllowPredicates is set, this call will try
/// to use a minimal set of SCEV predicates in order to return an exact
/// answer.
ExitLimit computeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond,
- BasicBlock *TBB, BasicBlock *FBB,
+ bool ExitIfTrue,
bool IsSubExpr,
bool AllowPredicates = false);
@@ -1591,8 +1648,8 @@ private:
/// Test whether the condition described by Pred, LHS, and RHS is true.
/// Use only simple non-recursive types of checks, such as range analysis etc.
- bool isKnownViaSimpleReasoning(ICmpInst::Predicate Pred,
- const SCEV *LHS, const SCEV *RHS);
+ bool isKnownViaNonRecursiveReasoning(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS);
/// Test whether the condition described by Pred, LHS, and RHS is true
/// whenever the condition described by Pred, FoundLHS, and FoundRHS is
@@ -1625,6 +1682,18 @@ private:
const SCEV *FoundLHS,
const SCEV *FoundRHS);
+ /// Test whether the condition described by Pred, LHS, and RHS is true
+ /// whenever the condition described by Pred, FoundLHS, and FoundRHS is
+ /// true.
+ ///
+ /// This routine tries to figure out the predicate for Phis which are SCEVUnknown
+ /// if it is true for every possible incoming value from their respective
+ /// basic blocks.
+ bool isImpliedViaMerge(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS,
+ const SCEV *FoundLHS, const SCEV *FoundRHS,
+ unsigned Depth);
+
/// If we know that the specified Phi is in the header of its containing
/// loop, we know the loop executes a constant number of times, and the PHI
/// node is just a recurrence involving constants, fold it.
@@ -1764,10 +1833,22 @@ private:
const SCEV *getOrCreateMulExpr(SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags);
+ /// Return x if \p Val is f(x) where f is a 1-1 function.
+ const SCEV *stripInjectiveFunctions(const SCEV *Val) const;
+
+ /// Find all of the loops transitively used in \p S, and fill \p LoopsUsed.
+ /// A loop is considered "used" by an expression if it contains
+ /// an add rec on said loop.
+ void getUsedLoops(const SCEV *S, SmallPtrSetImpl<const Loop *> &LoopsUsed);
+
/// Find all of the loops transitively used in \p S, and update \c LoopUsers
/// accordingly.
void addToLoopUseLists(const SCEV *S);
+ /// Try to match the pattern generated by getURemExpr(A, B). If successful,
+ /// assign A and B to LHS and RHS, respectively.
+ bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
+
FoldingSet<SCEV> UniqueSCEVs;
FoldingSet<SCEVPredicate> UniquePreds;
BumpPtrAllocator SCEVAllocator;
diff --git a/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h b/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
index 3df04e98bd24..58d42680d6bc 100644
--- a/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -321,7 +321,7 @@ namespace llvm {
/// Arrange for there to be a cast of V to Ty at IP, reusing an existing
/// cast if a suitable one exists, moving an existing cast if a suitable one
- /// exists but isn't in the right place, or or creating a new one.
+ /// exists but isn't in the right place, or creating a new one.
Value *ReuseOrCreateCast(Value *V, Type *Ty,
Instruction::CastOps Op,
BasicBlock::iterator IP);
@@ -335,6 +335,7 @@ namespace llvm {
Value *expandAddToGEP(const SCEV *const *op_begin,
const SCEV *const *op_end,
PointerType *PTy, Type *Ty, Value *V);
+ Value *expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty, Value *V);
/// Find a previous Value in ExprValueMap for expand.
ScalarEvolution::ValueOffsetPair
diff --git a/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h b/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
index acf83455cdcd..42e76094eb2b 100644
--- a/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/contrib/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -350,9 +350,7 @@ class Type;
/// Return an expression representing the value of this expression
/// one iteration of the loop ahead.
- const SCEVAddRecExpr *getPostIncExpr(ScalarEvolution &SE) const {
- return cast<SCEVAddRecExpr>(SE.getAddExpr(this, getStepRecurrence(SE)));
- }
+ const SCEVAddRecExpr *getPostIncExpr(ScalarEvolution &SE) const;
/// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const SCEV *S) {
diff --git a/contrib/llvm/include/llvm/Analysis/SparsePropagation.h b/contrib/llvm/include/llvm/Analysis/SparsePropagation.h
index 1b8df03b3a1b..defcf96afb25 100644
--- a/contrib/llvm/include/llvm/Analysis/SparsePropagation.h
+++ b/contrib/llvm/include/llvm/Analysis/SparsePropagation.h
@@ -238,7 +238,7 @@ SparseSolver<LatticeKey, LatticeVal, KeyInfo>::getValueState(LatticeKey Key) {
// If this value is untracked, don't add it to the map.
if (LV == LatticeFunc->getUntrackedVal())
return LV;
- return ValueState[Key] = LV;
+ return ValueState[Key] = std::move(LV);
}
template <class LatticeKey, class LatticeVal, class KeyInfo>
@@ -250,7 +250,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::UpdateState(LatticeKey Key,
// Update the state of the given LatticeKey and add its corresponding LLVM
// value to the work list.
- ValueState[Key] = LV;
+ ValueState[Key] = std::move(LV);
if (Value *V = KeyInfo::getValueFromLatticeKey(Key))
ValueWorkList.push_back(V);
}
@@ -260,7 +260,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::MarkBlockExecutable(
BasicBlock *BB) {
if (!BBExecutable.insert(BB).second)
return;
- DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n");
BBWorkList.push_back(BB); // Add the block to the work list!
}
@@ -270,8 +270,8 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::markEdgeExecutable(
if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
return; // This edge is already known to be executable!
- DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() << " -> "
- << Dest->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << "\n");
if (BBExecutable.count(Dest)) {
// The destination is already executable, but we just made an edge
@@ -318,7 +318,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::getFeasibleSuccessors(
Constant *C =
dyn_cast_or_null<Constant>(LatticeFunc->GetValueFromLatticeVal(
- BCValue, BI->getCondition()->getType()));
+ std::move(BCValue), BI->getCondition()->getType()));
if (!C || !isa<ConstantInt>(C)) {
// Non-constant values can go either way.
Succs[0] = Succs[1] = true;
@@ -360,7 +360,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::getFeasibleSuccessors(
return;
Constant *C = dyn_cast_or_null<Constant>(LatticeFunc->GetValueFromLatticeVal(
- SCValue, SI.getCondition()->getType()));
+ std::move(SCValue), SI.getCondition()->getType()));
if (!C || !isa<ConstantInt>(C)) {
// All destinations are executable!
Succs.assign(TI.getNumSuccessors(), true);
@@ -408,7 +408,8 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::visitPHINode(PHINode &PN) {
LatticeFunc->ComputeInstructionState(PN, ChangedValues, *this);
for (auto &ChangedValue : ChangedValues)
if (ChangedValue.second != LatticeFunc->getUntrackedVal())
- UpdateState(ChangedValue.first, ChangedValue.second);
+ UpdateState(std::move(ChangedValue.first),
+ std::move(ChangedValue.second));
return;
}
@@ -477,7 +478,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::Solve() {
Value *V = ValueWorkList.back();
ValueWorkList.pop_back();
- DEBUG(dbgs() << "\nPopped off V-WL: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "\nPopped off V-WL: " << *V << "\n");
// "V" got into the work list because it made a transition. See if any
// users are both live and in need of updating.
@@ -492,7 +493,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::Solve() {
BasicBlock *BB = BBWorkList.back();
BBWorkList.pop_back();
- DEBUG(dbgs() << "\nPopped off BBWL: " << *BB);
+ LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB);
// Notify all instructions in this basic block that they are newly
// executable.
diff --git a/contrib/llvm/include/llvm/Analysis/SyntheticCountsUtils.h b/contrib/llvm/include/llvm/Analysis/SyntheticCountsUtils.h
new file mode 100644
index 000000000000..87f4a0100b38
--- /dev/null
+++ b/contrib/llvm/include/llvm/Analysis/SyntheticCountsUtils.h
@@ -0,0 +1,52 @@
+//===- SyntheticCountsUtils.h - utilities for count propagation--*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utilities for synthetic counts propagation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_SYNTHETIC_COUNTS_UTILS_H
+#define LLVM_ANALYSIS_SYNTHETIC_COUNTS_UTILS_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/ScaledNumber.h"
+
+namespace llvm {
+
+class CallGraph;
+class Function;
+
+/// Class with methods to propagate synthetic entry counts.
+///
+/// This class is templated on the type of the call graph and designed to work
+/// with the traditional per-module callgraph and the summary callgraphs used in
+/// ThinLTO. This contains only static methods and alias templates.
+template <typename CallGraphType> class SyntheticCountsUtils {
+public:
+ using Scaled64 = ScaledNumber<uint64_t>;
+ using CGT = GraphTraits<CallGraphType>;
+ using NodeRef = typename CGT::NodeRef;
+ using EdgeRef = typename CGT::EdgeRef;
+ using SccTy = std::vector<NodeRef>;
+
+ using GetRelBBFreqTy = function_ref<Optional<Scaled64>(EdgeRef)>;
+ using GetCountTy = function_ref<uint64_t(NodeRef)>;
+ using AddCountTy = function_ref<void(NodeRef, uint64_t)>;
+
+ static void propagate(const CallGraphType &CG, GetRelBBFreqTy GetRelBBFreq,
+ GetCountTy GetCount, AddCountTy AddCount);
+
+private:
+ static void propagateFromSCC(const SccTy &SCC, GetRelBBFreqTy GetRelBBFreq,
+ GetCountTy GetCount, AddCountTy AddCount);
+};
+} // namespace llvm
+
+#endif
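To make the callback-based interface of the new header concrete, here is a hedged sketch of wiring the three function_refs over the regular per-module call graph. The Counts map, the driver name and the constant relative frequency are all illustrative assumptions; the in-tree user is the synthetic-counts propagation pass, which does something similar with scaled counts.

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/Optional.h"
    #include "llvm/Analysis/CallGraph.h"
    #include "llvm/Analysis/SyntheticCountsUtils.h"

    using namespace llvm;

    // Hypothetical driver: propagate synthetic entry counts over a CallGraph.
    static void propagateCounts(const CallGraph &CG,
                                DenseMap<const Function *, uint64_t> &Counts) {
      using Utils = SyntheticCountsUtils<const CallGraph *>;

      auto GetCount = [&](Utils::NodeRef N) -> uint64_t {
        // Note: external call-graph nodes have a null Function; a real
        // implementation would skip them.
        return Counts.lookup(N->getFunction());
      };
      auto AddCount = [&](Utils::NodeRef N, uint64_t C) {
        Counts[N->getFunction()] += C;
      };
      // Relative block frequency of the call edge; None means "unknown".
      // A constant 1.0 is used here purely as a placeholder.
      auto GetRelFreq = [](Utils::EdgeRef) -> Optional<Utils::Scaled64> {
        return Utils::Scaled64(1, 0);
      };

      Utils::propagate(&CG, GetRelFreq, GetCount, AddCount);
    }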
diff --git a/contrib/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/contrib/llvm/include/llvm/Analysis/TargetLibraryInfo.def
index a461ed813b9b..f94debba9c52 100644
--- a/contrib/llvm/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/contrib/llvm/include/llvm/Analysis/TargetLibraryInfo.def
@@ -119,6 +119,12 @@ TLI_DEFINE_STRING_INTERNAL("_ZdaPv")
/// void operator delete[](void*, nothrow);
TLI_DEFINE_ENUM_INTERNAL(ZdaPvRKSt9nothrow_t)
TLI_DEFINE_STRING_INTERNAL("_ZdaPvRKSt9nothrow_t")
+/// void operator delete[](void*, align_val_t);
+TLI_DEFINE_ENUM_INTERNAL(ZdaPvSt11align_val_t)
+TLI_DEFINE_STRING_INTERNAL("_ZdaPvSt11align_val_t")
+/// void operator delete[](void*, align_val_t, nothrow)
+TLI_DEFINE_ENUM_INTERNAL(ZdaPvSt11align_val_tRKSt9nothrow_t)
+TLI_DEFINE_STRING_INTERNAL("_ZdaPvSt11align_val_tRKSt9nothrow_t")
/// void operator delete[](void*, unsigned int);
TLI_DEFINE_ENUM_INTERNAL(ZdaPvj)
TLI_DEFINE_STRING_INTERNAL("_ZdaPvj")
@@ -131,6 +137,12 @@ TLI_DEFINE_STRING_INTERNAL("_ZdlPv")
/// void operator delete(void*, nothrow);
TLI_DEFINE_ENUM_INTERNAL(ZdlPvRKSt9nothrow_t)
TLI_DEFINE_STRING_INTERNAL("_ZdlPvRKSt9nothrow_t")
+/// void operator delete(void*, align_val_t)
+TLI_DEFINE_ENUM_INTERNAL(ZdlPvSt11align_val_t)
+TLI_DEFINE_STRING_INTERNAL("_ZdlPvSt11align_val_t")
+/// void operator delete(void*, align_val_t, nothrow)
+TLI_DEFINE_ENUM_INTERNAL(ZdlPvSt11align_val_tRKSt9nothrow_t)
+TLI_DEFINE_STRING_INTERNAL("_ZdlPvSt11align_val_tRKSt9nothrow_t")
/// void operator delete(void*, unsigned int);
TLI_DEFINE_ENUM_INTERNAL(ZdlPvj)
TLI_DEFINE_STRING_INTERNAL("_ZdlPvj")
@@ -143,24 +155,48 @@ TLI_DEFINE_STRING_INTERNAL("_Znaj")
/// void *new[](unsigned int, nothrow);
TLI_DEFINE_ENUM_INTERNAL(ZnajRKSt9nothrow_t)
TLI_DEFINE_STRING_INTERNAL("_ZnajRKSt9nothrow_t")
+/// void *new[](unsigned int, align_val_t)
+TLI_DEFINE_ENUM_INTERNAL(ZnajSt11align_val_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnajSt11align_val_t")
+/// void *new[](unsigned int, align_val_t, nothrow)
+TLI_DEFINE_ENUM_INTERNAL(ZnajSt11align_val_tRKSt9nothrow_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnajSt11align_val_tRKSt9nothrow_t")
/// void *new[](unsigned long);
TLI_DEFINE_ENUM_INTERNAL(Znam)
TLI_DEFINE_STRING_INTERNAL("_Znam")
/// void *new[](unsigned long, nothrow);
TLI_DEFINE_ENUM_INTERNAL(ZnamRKSt9nothrow_t)
TLI_DEFINE_STRING_INTERNAL("_ZnamRKSt9nothrow_t")
+/// void *new[](unsigned long, align_val_t)
+TLI_DEFINE_ENUM_INTERNAL(ZnamSt11align_val_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnamSt11align_val_t")
+/// void *new[](unsigned long, align_val_t, nothrow)
+TLI_DEFINE_ENUM_INTERNAL(ZnamSt11align_val_tRKSt9nothrow_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnamSt11align_val_tRKSt9nothrow_t")
/// void *new(unsigned int);
TLI_DEFINE_ENUM_INTERNAL(Znwj)
TLI_DEFINE_STRING_INTERNAL("_Znwj")
/// void *new(unsigned int, nothrow);
TLI_DEFINE_ENUM_INTERNAL(ZnwjRKSt9nothrow_t)
TLI_DEFINE_STRING_INTERNAL("_ZnwjRKSt9nothrow_t")
+/// void *new(unsigned int, align_val_t)
+TLI_DEFINE_ENUM_INTERNAL(ZnwjSt11align_val_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnwjSt11align_val_t")
+/// void *new(unsigned int, align_val_t, nothrow)
+TLI_DEFINE_ENUM_INTERNAL(ZnwjSt11align_val_tRKSt9nothrow_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnwjSt11align_val_tRKSt9nothrow_t")
/// void *new(unsigned long);
TLI_DEFINE_ENUM_INTERNAL(Znwm)
TLI_DEFINE_STRING_INTERNAL("_Znwm")
/// void *new(unsigned long, nothrow);
TLI_DEFINE_ENUM_INTERNAL(ZnwmRKSt9nothrow_t)
TLI_DEFINE_STRING_INTERNAL("_ZnwmRKSt9nothrow_t")
+/// void *new(unsigned long, align_val_t)
+TLI_DEFINE_ENUM_INTERNAL(ZnwmSt11align_val_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnwmSt11align_val_t")
+/// void *new(unsigned long, align_val_t, nothrow)
+TLI_DEFINE_ENUM_INTERNAL(ZnwmSt11align_val_tRKSt9nothrow_t)
+TLI_DEFINE_STRING_INTERNAL("_ZnwmSt11align_val_tRKSt9nothrow_t")
/// double __acos_finite(double x);
TLI_DEFINE_ENUM_INTERNAL(acos_finite)
TLI_DEFINE_STRING_INTERNAL("__acos_finite")
@@ -601,12 +637,18 @@ TLI_DEFINE_STRING_INTERNAL("ffsll")
/// int fgetc(FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(fgetc)
TLI_DEFINE_STRING_INTERNAL("fgetc")
+/// int fgetc_unlocked(FILE *stream);
+TLI_DEFINE_ENUM_INTERNAL(fgetc_unlocked)
+TLI_DEFINE_STRING_INTERNAL("fgetc_unlocked")
/// int fgetpos(FILE *stream, fpos_t *pos);
TLI_DEFINE_ENUM_INTERNAL(fgetpos)
TLI_DEFINE_STRING_INTERNAL("fgetpos")
/// char *fgets(char *s, int n, FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(fgets)
TLI_DEFINE_STRING_INTERNAL("fgets")
+/// char *fgets_unlocked(char *s, int n, FILE *stream);
+TLI_DEFINE_ENUM_INTERNAL(fgets_unlocked)
+TLI_DEFINE_STRING_INTERNAL("fgets_unlocked")
/// int fileno(FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(fileno)
TLI_DEFINE_STRING_INTERNAL("fileno")
@@ -673,12 +715,21 @@ TLI_DEFINE_STRING_INTERNAL("fprintf")
/// int fputc(int c, FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(fputc)
TLI_DEFINE_STRING_INTERNAL("fputc")
+/// int fputc_unlocked(int c, FILE *stream);
+TLI_DEFINE_ENUM_INTERNAL(fputc_unlocked)
+TLI_DEFINE_STRING_INTERNAL("fputc_unlocked")
/// int fputs(const char *s, FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(fputs)
TLI_DEFINE_STRING_INTERNAL("fputs")
+/// int fputs_unlocked(const char *s, FILE *stream);
+TLI_DEFINE_ENUM_INTERNAL(fputs_unlocked)
+TLI_DEFINE_STRING_INTERNAL("fputs_unlocked")
/// size_t fread(void *ptr, size_t size, size_t nitems, FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(fread)
TLI_DEFINE_STRING_INTERNAL("fread")
+/// size_t fread_unlocked(void *ptr, size_t size, size_t nitems, FILE *stream);
+TLI_DEFINE_ENUM_INTERNAL(fread_unlocked)
+TLI_DEFINE_STRING_INTERNAL("fread_unlocked")
/// void free(void *ptr);
TLI_DEFINE_ENUM_INTERNAL(free)
TLI_DEFINE_STRING_INTERNAL("free")
@@ -736,6 +787,9 @@ TLI_DEFINE_STRING_INTERNAL("funlockfile")
/// size_t fwrite(const void *ptr, size_t size, size_t nitems, FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(fwrite)
TLI_DEFINE_STRING_INTERNAL("fwrite")
+/// size_t fwrite_unlocked(const void *ptr, size_t size, size_t nitems, FILE *stream);
+TLI_DEFINE_ENUM_INTERNAL(fwrite_unlocked)
+TLI_DEFINE_STRING_INTERNAL("fwrite_unlocked")
/// int getc(FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(getc)
TLI_DEFINE_STRING_INTERNAL("getc")
@@ -745,6 +799,9 @@ TLI_DEFINE_STRING_INTERNAL("getc_unlocked")
/// int getchar(void);
TLI_DEFINE_ENUM_INTERNAL(getchar)
TLI_DEFINE_STRING_INTERNAL("getchar")
+/// int getchar_unlocked(void);
+TLI_DEFINE_ENUM_INTERNAL(getchar_unlocked)
+TLI_DEFINE_STRING_INTERNAL("getchar_unlocked")
/// char *getenv(const char *name);
TLI_DEFINE_ENUM_INTERNAL(getenv)
TLI_DEFINE_STRING_INTERNAL("getenv")
@@ -950,9 +1007,15 @@ TLI_DEFINE_STRING_INTERNAL("printf")
/// int putc(int c, FILE *stream);
TLI_DEFINE_ENUM_INTERNAL(putc)
TLI_DEFINE_STRING_INTERNAL("putc")
+/// int putc_unlocked(int c, FILE *stream);
+TLI_DEFINE_ENUM_INTERNAL(putc_unlocked)
+TLI_DEFINE_STRING_INTERNAL("putc_unlocked")
/// int putchar(int c);
TLI_DEFINE_ENUM_INTERNAL(putchar)
TLI_DEFINE_STRING_INTERNAL("putchar")
+/// int putchar_unlocked(int c);
+TLI_DEFINE_ENUM_INTERNAL(putchar_unlocked)
+TLI_DEFINE_STRING_INTERNAL("putchar_unlocked")
/// int puts(const char *s);
TLI_DEFINE_ENUM_INTERNAL(puts)
TLI_DEFINE_STRING_INTERNAL("puts")
diff --git a/contrib/llvm/include/llvm/Analysis/TargetTransformInfo.h b/contrib/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c20f20cfbe4d..59657cca40f5 100644
--- a/contrib/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/contrib/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -49,7 +49,7 @@ class Type;
class User;
class Value;
-/// \brief Information about a load/store intrinsic defined by the target.
+/// Information about a load/store intrinsic defined by the target.
struct MemIntrinsicInfo {
/// This is the pointer that the intrinsic is loading from or storing to.
/// If this is non-null, then analysis/optimization passes can assume that
@@ -73,18 +73,18 @@ struct MemIntrinsicInfo {
}
};
-/// \brief This pass provides access to the codegen interfaces that are needed
+/// This pass provides access to the codegen interfaces that are needed
/// for IR-level transformations.
class TargetTransformInfo {
public:
- /// \brief Construct a TTI object using a type implementing the \c Concept
+ /// Construct a TTI object using a type implementing the \c Concept
/// API below.
///
/// This is used by targets to construct a TTI wrapping their target-specific
/// implementation that encodes appropriate costs for their target.
template <typename T> TargetTransformInfo(T Impl);
- /// \brief Construct a baseline TTI object using a minimal implementation of
+ /// Construct a baseline TTI object using a minimal implementation of
/// the \c Concept API below.
///
/// The TTI implementation will reflect the information in the DataLayout
@@ -99,7 +99,7 @@ public:
// out-of-line.
~TargetTransformInfo();
- /// \brief Handle the invalidation of this information.
+ /// Handle the invalidation of this information.
///
/// When used as a result of \c TargetIRAnalysis this method will be called
/// when the function this was computed for changes. When it returns false,
@@ -114,7 +114,7 @@ public:
/// \name Generic Target Information
/// @{
- /// \brief The kind of cost model.
+ /// The kind of cost model.
///
/// There are several different cost models that can be customized by the
/// target. The normalization of each cost model may be target specific.
@@ -124,7 +124,7 @@ public:
TCK_CodeSize ///< Instruction code size.
};
- /// \brief Query the cost of a specified instruction.
+ /// Query the cost of a specified instruction.
///
/// Clients should use this interface to query the cost of an existing
/// instruction. The instruction must have a valid parent (basic block).
@@ -145,7 +145,7 @@ public:
llvm_unreachable("Unknown instruction cost kind");
}
- /// \brief Underlying constants for 'cost' values in this interface.
+ /// Underlying constants for 'cost' values in this interface.
///
/// Many APIs in this interface return a cost. This enum defines the
/// fundamental values that should be used to interpret (and produce) those
@@ -169,7 +169,7 @@ public:
TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
};
- /// \brief Estimate the cost of a specific operation when lowered.
+ /// Estimate the cost of a specific operation when lowered.
///
/// Note that this is designed to work on an arbitrary synthetic opcode, and
/// thus work for hypothetical queries before an instruction has even been
@@ -185,7 +185,7 @@ public:
/// comments for a detailed explanation of the cost values.
int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy = nullptr) const;
- /// \brief Estimate the cost of a GEP operation when lowered.
+ /// Estimate the cost of a GEP operation when lowered.
///
/// The contract for this function is the same as \c getOperationCost except
/// that it supports an interface that provides extra information specific to
@@ -193,14 +193,14 @@ public:
int getGEPCost(Type *PointeeType, const Value *Ptr,
ArrayRef<const Value *> Operands) const;
- /// \brief Estimate the cost of a EXT operation when lowered.
+ /// Estimate the cost of a EXT operation when lowered.
///
/// The contract for this function is the same as \c getOperationCost except
/// that it supports an interface that provides extra information specific to
/// the EXT operation.
int getExtCost(const Instruction *I, const Value *Src) const;
- /// \brief Estimate the cost of a function call when lowered.
+ /// Estimate the cost of a function call when lowered.
///
/// The contract for this is the same as \c getOperationCost except that it
/// supports an interface that provides extra information specific to call
@@ -211,13 +211,13 @@ public:
/// The latter is only interesting for varargs function types.
int getCallCost(FunctionType *FTy, int NumArgs = -1) const;
- /// \brief Estimate the cost of calling a specific function when lowered.
+ /// Estimate the cost of calling a specific function when lowered.
///
/// This overload adds the ability to reason about the particular function
/// being called in the event it is a library call with special lowering.
int getCallCost(const Function *F, int NumArgs = -1) const;
- /// \brief Estimate the cost of calling a specific function when lowered.
+ /// Estimate the cost of calling a specific function when lowered.
///
/// This overload allows specifying a set of candidate argument values.
int getCallCost(const Function *F, ArrayRef<const Value *> Arguments) const;
@@ -230,13 +230,13 @@ public:
/// individual classes of instructions would be better.
unsigned getInliningThresholdMultiplier() const;
- /// \brief Estimate the cost of an intrinsic when lowered.
+ /// Estimate the cost of an intrinsic when lowered.
///
/// Mirrors the \c getCallCost method but uses an intrinsic identifier.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys) const;
- /// \brief Estimate the cost of an intrinsic when lowered.
+ /// Estimate the cost of an intrinsic when lowered.
///
/// Mirrors the \c getCallCost method but uses an intrinsic identifier.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
@@ -248,7 +248,7 @@ public:
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
unsigned &JTSize) const;
- /// \brief Estimate the cost of a given IR user when lowered.
+ /// Estimate the cost of a given IR user when lowered.
///
/// This can estimate the cost of either a ConstantExpr or Instruction when
/// lowered. It has two primary advantages over the \c getOperationCost and
@@ -271,7 +271,7 @@ public:
/// comments for a detailed explanation of the cost values.
int getUserCost(const User *U, ArrayRef<const Value *> Operands) const;
- /// \brief This is a helper function which calls the two-argument getUserCost
+ /// This is a helper function which calls the two-argument getUserCost
/// with \p Operands which are the current operands U has.
int getUserCost(const User *U) const {
SmallVector<const Value *, 4> Operands(U->value_op_begin(),
@@ -279,14 +279,14 @@ public:
return getUserCost(U, Operands);
}
- /// \brief Return true if branch divergence exists.
+ /// Return true if branch divergence exists.
///
/// Branch divergence has a significantly negative impact on GPU performance
/// when threads in the same wavefront take different paths due to conditional
/// branches.
bool hasBranchDivergence() const;
- /// \brief Returns whether V is a source of divergence.
+ /// Returns whether V is a source of divergence.
///
/// This function provides the target-dependent information for
/// the target-independent DivergenceAnalysis. DivergenceAnalysis first
@@ -294,7 +294,7 @@ public:
/// starting with the sources of divergence.
bool isSourceOfDivergence(const Value *V) const;
- // \brief Returns true for the target specific
+ // Returns true for the target specific
// set of operations which produce a uniform result
// even when taking non-uniform arguments
bool isAlwaysUniform(const Value *V) const;
@@ -308,7 +308,7 @@ public:
/// compared to the same memory location accessed through a pointer with a
/// different address space.
//
- /// This is for for targets with different pointer representations which can
+ /// This is for targets with different pointer representations which can
/// be converted with the addrspacecast instruction. If a pointer is converted
/// to this address space, optimizations should attempt to replace the access
/// with the source address space.
@@ -317,7 +317,7 @@ public:
/// optimize away.
unsigned getFlatAddressSpace() const;
- /// \brief Test whether calls to a function lower to actual program function
+ /// Test whether calls to a function lower to actual program function
/// calls.
///
/// The idea is to test whether the program is likely to require a 'call'
@@ -422,9 +422,16 @@ public:
bool AllowPeeling;
/// Allow unrolling of all the iterations of the runtime loop remainder.
bool UnrollRemainder;
+ /// Allow unroll and jam. Used to enable unroll and jam for the target.
+ bool UnrollAndJam;
+ /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
+ /// value above is used during unroll and jam for the outer loop size.
+ /// This value is used in the same manner to limit the size of the inner
+ /// loop.
+ unsigned UnrollAndJamInnerLoopThreshold;
};
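A hedged sketch of how a target's unrolling hook might opt in to the two new fields (the threshold value is illustrative, not taken from any in-tree target):

    // Inside a target's getUnrollingPreferences(Loop *, ScalarEvolution &,
    // TargetTransformInfo::UnrollingPreferences &UP) implementation:
    UP.UnrollAndJam = true;                  // opt in to unroll and jam
    UP.UnrollAndJamInnerLoopThreshold = 60;  // size budget for the jammed inner loop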
- /// \brief Get target-customized preferences for the generic loop unrolling
+ /// Get target-customized preferences for the generic loop unrolling
/// transformation. The caller will initialize UP with the current
/// target-independent defaults.
void getUnrollingPreferences(Loop *L, ScalarEvolution &,
@@ -435,7 +442,7 @@ public:
/// \name Scalar Target Information
/// @{
- /// \brief Flags indicating the kind of support for population count.
+ /// Flags indicating the kind of support for population count.
///
/// Compared to the SW implementation, HW support is supposed to
/// significantly boost the performance when the population is dense, and it
@@ -445,18 +452,18 @@ public:
/// considered as "Slow".
enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
- /// \brief Return true if the specified immediate is legal add immediate, that
+ /// Return true if the specified immediate is legal add immediate, that
/// is the target has add instructions which can add a register with the
/// immediate without having to materialize the immediate into a register.
bool isLegalAddImmediate(int64_t Imm) const;
- /// \brief Return true if the specified immediate is legal icmp immediate,
+ /// Return true if the specified immediate is legal icmp immediate,
/// that is the target has icmp instructions which can compare a register
/// against the immediate without having to materialize the immediate into a
/// register.
bool isLegalICmpImmediate(int64_t Imm) const;
- /// \brief Return true if the addressing mode represented by AM is legal for
+ /// Return true if the addressing mode represented by AM is legal for
/// this target, for a load/store of the specified type.
/// The type may be VoidTy, in which case only return true if the addressing
/// mode is legal for a load/store of any legal type.
@@ -467,16 +474,25 @@ public:
unsigned AddrSpace = 0,
Instruction *I = nullptr) const;
- /// \brief Return true if LSR cost of C1 is lower than C2.
+ /// Return true if LSR cost of C1 is lower than C2.
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2) const;
- /// \brief Return true if the target supports masked load/store
+ /// Return true if the target can fuse a compare and branch.
+ /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
+ /// calculation for the instructions in a loop.
+ bool canMacroFuseCmp() const;
+
+ /// \return True if LSR should make efforts to create/preserve post-inc
+ /// addressing mode expressions.
+ bool shouldFavorPostInc() const;
+
+ /// Return true if the target supports masked load/store
/// AVX2 and AVX-512 targets allow masks for consecutive load and store
bool isLegalMaskedStore(Type *DataType) const;
bool isLegalMaskedLoad(Type *DataType) const;
- /// \brief Return true if the target supports masked gather/scatter
+ /// Return true if the target supports masked gather/scatter
/// AVX-512 fully supports gather and scatter for vectors with 32 and 64
/// bits scalar type.
bool isLegalMaskedScatter(Type *DataType) const;
@@ -499,7 +515,7 @@ public:
/// Return true if target doesn't mind addresses in vectors.
bool prefersVectorizedAddressing() const;
- /// \brief Return the cost of the scaling factor used in the addressing
+ /// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
@@ -509,38 +525,44 @@ public:
bool HasBaseReg, int64_t Scale,
unsigned AddrSpace = 0) const;
- /// \brief Return true if the loop strength reduce pass should make
+ /// Return true if the loop strength reduce pass should make
/// Instruction* based TTI queries to isLegalAddressingMode(). This is
/// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
/// immediate offset and no index register.
bool LSRWithInstrQueries() const;
- /// \brief Return true if it's free to truncate a value of type Ty1 to type
+ /// Return true if it's free to truncate a value of type Ty1 to type
/// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to i16
/// by referencing its sub-register AX.
bool isTruncateFree(Type *Ty1, Type *Ty2) const;
- /// \brief Return true if it is profitable to hoist instruction in the
+ /// Return true if it is profitable to hoist instruction in the
/// then/else to before if.
bool isProfitableToHoist(Instruction *I) const;
- /// \brief Return true if this type is legal.
+ bool useAA() const;
+
+ /// Return true if this type is legal.
bool isTypeLegal(Type *Ty) const;
- /// \brief Returns the target's jmp_buf alignment in bytes.
+ /// Returns the target's jmp_buf alignment in bytes.
unsigned getJumpBufAlignment() const;
- /// \brief Returns the target's jmp_buf size in bytes.
+ /// Returns the target's jmp_buf size in bytes.
unsigned getJumpBufSize() const;
- /// \brief Return true if switches should be turned into lookup tables for the
+ /// Return true if switches should be turned into lookup tables for the
/// target.
bool shouldBuildLookupTables() const;
- /// \brief Return true if switches should be turned into lookup tables
+ /// Return true if switches should be turned into lookup tables
/// containing this constant value for the target.
bool shouldBuildLookupTablesForConstant(Constant *C) const;
+ /// Return true if the input function (which is cold at all call sites)
+ /// should use the coldcc calling convention.
+ bool useColdCCForColdCall(Function &F) const;
+
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
@@ -551,10 +573,10 @@ public:
/// the scalarization cost of a load/store.
bool supportsEfficientVectorElementLoadStore() const;
- /// \brief Don't restrict interleaved unrolling to small loops.
+ /// Don't restrict interleaved unrolling to small loops.
bool enableAggressiveInterleaving(bool LoopHasReductions) const;
- /// \brief If not nullptr, enable inline expansion of memcmp. IsZeroCmp is
+ /// If not nullptr, enable inline expansion of memcmp. IsZeroCmp is
/// true if this is the expansion of memcmp(p1, p2, s) == 0.
struct MemCmpExpansionOptions {
// The list of available load sizes (in bytes), sorted in decreasing order.
@@ -562,10 +584,10 @@ public:
};
const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const;
- /// \brief Enable matching of interleaved access groups.
+ /// Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;
- /// \brief Indicate that it is potentially unsafe to automatically vectorize
+ /// Indicate that it is potentially unsafe to automatically vectorize
/// floating-point operations because the semantics of vector and scalar
/// floating-point operations may differ. For example, ARM NEON v7 SIMD math
/// does not support IEEE-754 denormal numbers, while depending on the
@@ -574,16 +596,16 @@ public:
/// operations, shuffles, or casts.
bool isFPVectorizationPotentiallyUnsafe() const;
- /// \brief Determine if the target supports unaligned memory accesses.
+ /// Determine if the target supports unaligned memory accesses.
bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
unsigned BitWidth, unsigned AddressSpace = 0,
unsigned Alignment = 1,
bool *Fast = nullptr) const;
- /// \brief Return hardware support for population count.
+ /// Return hardware support for population count.
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
- /// \brief Return true if the hardware has a fast square-root instruction.
+ /// Return true if the hardware has a fast square-root instruction.
bool haveFastSqrt(Type *Ty) const;
/// Return true if it is faster to check if a floating-point value is NaN
@@ -592,15 +614,15 @@ public:
/// generally as cheap as checking for ordered/unordered.
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
- /// \brief Return the expected cost of supporting the floating point operation
+ /// Return the expected cost of supporting the floating point operation
/// of the specified type.
int getFPOpCost(Type *Ty) const;
- /// \brief Return the expected cost of materializing for the given integer
+ /// Return the expected cost of materializing for the given integer
/// immediate of the specified type.
int getIntImmCost(const APInt &Imm, Type *Ty) const;
- /// \brief Return the expected cost of materialization for the given integer
+ /// Return the expected cost of materialization for the given integer
/// immediate of the specified type for a given instruction. The cost can be
/// zero if the immediate can be folded into the specified instruction.
int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
@@ -608,7 +630,7 @@ public:
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty) const;
- /// \brief Return the expected cost for the given integer when optimising
+ /// Return the expected cost for the given integer when optimising
/// for size. This is different than the other integer immediate cost
/// functions in that it is subtarget agnostic. This is useful when you e.g.
/// target one ISA such as Aarch32 but smaller encodings could be possible
@@ -622,11 +644,14 @@ public:
/// \name Vector Target Information
/// @{
- /// \brief The various kinds of shuffle patterns for vector queries.
+ /// The various kinds of shuffle patterns for vector queries.
enum ShuffleKind {
SK_Broadcast, ///< Broadcast element 0 to all other elements.
SK_Reverse, ///< Reverse the order of the vector.
- SK_Alternate, ///< Choose alternate elements from vector.
+ SK_Select, ///< Selects elements from the corresponding lane of
+ ///< either source operand. This is equivalent to a
+ ///< vector select with a constant condition operand.
+ SK_Transpose, ///< Transpose two vectors.
SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset.
SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one
@@ -635,7 +660,7 @@ public:
///< shuffle mask.
};
- /// \brief Additional information about an operand's possible values.
+ /// Additional information about an operand's possible values.
enum OperandValueKind {
OK_AnyValue, // Operand can have any value.
OK_UniformValue, // Operand is uniform (splat of a value).
@@ -643,7 +668,7 @@ public:
OK_NonUniformConstantValue // Operand is a non uniform constant value.
};
- /// \brief Additional properties of an operand's values.
+ /// Additional properties of an operand's values.
enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
/// \return The number of scalar or vector registers that the target has.
@@ -657,6 +682,19 @@ public:
/// \return The width of the smallest vector register type.
unsigned getMinVectorRegisterBitWidth() const;
+ /// \return True if the vectorization factor should be chosen to
+ /// make the vector of the smallest element type match the size of a
+ /// vector register. For wider element types, this could result in
+ /// creating vectors that span multiple vector registers.
+ /// If false, the vectorization factor will be chosen based on the
+ /// size of the widest element type.
+ bool shouldMaximizeVectorBandwidth(bool OptSize) const;
+
+ /// \return The minimum vectorization factor for types of given element
+ /// bit width, or 0 if there is no minimum VF. The returned value only
+ /// applies when shouldMaximizeVectorBandwidth returns true.
+ unsigned getMinimumVF(unsigned ElemWidth) const;
+
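A minimal sketch of how a vectorizer-style client might combine these two hooks when choosing a vectorization factor. The helper and its width parameters are assumptions; only the two TTI calls are from this interface.

    #include <algorithm>
    #include "llvm/Analysis/TargetTransformInfo.h"

    // Illustrative only: the register/element widths are assumed to be
    // computed by the caller.
    static unsigned chooseVF(const llvm::TargetTransformInfo &TTI, bool OptForSize,
                             unsigned WidestRegBits, unsigned SmallestTypeBits,
                             unsigned WidestTypeBits) {
      // Default: size the vector so the widest element type fills one register.
      unsigned VF = WidestRegBits / WidestTypeBits;
      if (TTI.shouldMaximizeVectorBandwidth(OptForSize)) {
        // Size for the smallest element type; wider types then span registers.
        VF = WidestRegBits / SmallestTypeBits;
        // Respect the target's minimum VF for this element width, if any.
        VF = std::max(VF, TTI.getMinimumVF(SmallestTypeBits));
      }
      return VF;
    }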
/// \return True if it should be considered for address type promotion.
/// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
/// profitable without finding other extensions fed by the same input.
@@ -701,10 +739,20 @@ public:
/// and the number of execution units in the CPU.
unsigned getMaxInterleaveFactor(unsigned VF) const;
- /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
- /// \p Args is an optional argument which holds the instruction operands
- /// values so the TTI can analyize those values searching for special
- /// cases\optimizations based on those values.
+ /// This is an approximation of reciprocal throughput of a math/logic op.
+ /// A higher cost indicates less expected throughput.
+ /// From Agner Fog's guides, reciprocal throughput is "the average number of
+ /// clock cycles per instruction when the instructions are not part of a
+ /// limiting dependency chain."
+ /// Therefore, costs should be scaled to account for multiple execution units
+ /// on the target that can process this type of instruction. For example, if
+ /// there are 5 scalar integer units and 2 vector integer units that can
+ /// calculate an 'add' in a single cycle, this model should indicate that the
+ /// cost of the vector add instruction is 2.5 times the cost of the scalar
+ /// add instruction.
+ /// \p Args is an optional argument which holds the instruction operands
+ /// values so the TTI can analyze those values searching for special
+ /// cases or optimizations based on those values.
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
OperandValueKind Opd2Info = OK_AnyValue,
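A small sketch of querying this hook to compare scalar and vector 'add' costs under the reciprocal-throughput model described above. The wrapper function, its types, and the profitability rule are illustrative assumptions; only getArithmeticInstrCost comes from this interface.

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instruction.h"

    // ScalarTy and VecTy are assumed, e.g. i32 and <4 x i32> with VF = 4.
    static bool vectorAddLooksProfitable(const llvm::TargetTransformInfo &TTI,
                                         llvm::Type *ScalarTy, llvm::Type *VecTy,
                                         unsigned VF) {
      int ScalarCost = TTI.getArithmeticInstrCost(llvm::Instruction::Add, ScalarTy);
      int VectorCost = TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy);
      // One vector add replaces VF scalar adds; compare throughput costs.
      return VectorCost < ScalarCost * (int)VF;
    }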
@@ -773,7 +821,7 @@ public:
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace) const;
- /// \brief Calculate the cost of performing a vector reduction.
+ /// Calculate the cost of performing a vector reduction.
///
/// This is the cost of reducing the vector value of type \p Ty to a scalar
/// value using the operation denoted by \p Opcode. The form of the reduction
@@ -867,6 +915,21 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
+ /// The type of load/store indexing.
+ enum MemIndexedMode {
+ MIM_Unindexed, ///< No indexing.
+ MIM_PreInc, ///< Pre-incrementing.
+ MIM_PreDec, ///< Pre-decrementing.
+ MIM_PostInc, ///< Post-incrementing.
+ MIM_PostDec ///< Post-decrementing.
+ };
+
+ /// \returns True if the specified indexed load for the given type is legal.
+ bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const;
+
+ /// \returns True if the specified indexed store for the given type is legal.
+ bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const;
+
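A brief sketch of how a client might consult these hooks before forming an indexed access; the wrapper is a hypothetical helper, only the enum and the query are from this interface.

    #include "llvm/Analysis/TargetTransformInfo.h"

    // Illustrative helper: is a post-increment indexed load of Ty legal here?
    static bool canUsePostIncLoad(const llvm::TargetTransformInfo &TTI,
                                  llvm::Type *Ty) {
      return TTI.isIndexedLoadLegal(llvm::TargetTransformInfo::MIM_PostInc, Ty);
    }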
/// \returns The bitwidth of the largest vector type that should be used to
/// load/store in the given address space.
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
@@ -918,19 +981,19 @@ public:
/// @}
private:
- /// \brief Estimate the latency of specified instruction.
+ /// Estimate the latency of specified instruction.
/// Returns 1 as the default value.
int getInstructionLatency(const Instruction *I) const;
- /// \brief Returns the expected throughput cost of the instruction.
+ /// Returns the expected throughput cost of the instruction.
/// Returns -1 if the cost is unknown.
int getInstructionThroughput(const Instruction *I) const;
- /// \brief The abstract base class used to type erase specific TTI
+ /// The abstract base class used to type erase specific TTI
/// implementations.
class Concept;
- /// \brief The template model for the base class which wraps a concrete
+ /// The template model for the base class which wraps a concrete
/// implementation in a type erased interface.
template <typename T> class Model;
@@ -974,6 +1037,8 @@ public:
Instruction *I) = 0;
virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2) = 0;
+ virtual bool canMacroFuseCmp() = 0;
+ virtual bool shouldFavorPostInc() const = 0;
virtual bool isLegalMaskedStore(Type *DataType) = 0;
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@@ -987,11 +1052,13 @@ public:
virtual bool LSRWithInstrQueries() = 0;
virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
virtual bool isProfitableToHoist(Instruction *I) = 0;
+ virtual bool useAA() = 0;
virtual bool isTypeLegal(Type *Ty) = 0;
virtual unsigned getJumpBufAlignment() = 0;
virtual unsigned getJumpBufSize() = 0;
virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
+ virtual bool useColdCCForColdCall(Function &F) = 0;
virtual unsigned
getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
@@ -1021,6 +1088,8 @@ public:
virtual unsigned getNumberOfRegisters(bool Vector) = 0;
virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
virtual unsigned getMinVectorRegisterBitWidth() = 0;
+ virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
+ virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
virtual bool shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
virtual unsigned getCacheLineSize() = 0;
@@ -1088,6 +1157,8 @@ public:
unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;
virtual bool areInlineCompatible(const Function *Caller,
const Function *Callee) const = 0;
+ virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;
+ virtual bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const = 0;
virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
@@ -1192,6 +1263,12 @@ public:
TargetTransformInfo::LSRCost &C2) override {
return Impl.isLSRCostLess(C1, C2);
}
+ bool canMacroFuseCmp() override {
+ return Impl.canMacroFuseCmp();
+ }
+ bool shouldFavorPostInc() const override {
+ return Impl.shouldFavorPostInc();
+ }
bool isLegalMaskedStore(Type *DataType) override {
return Impl.isLegalMaskedStore(DataType);
}
@@ -1228,6 +1305,7 @@ public:
bool isProfitableToHoist(Instruction *I) override {
return Impl.isProfitableToHoist(I);
}
+ bool useAA() override { return Impl.useAA(); }
bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
unsigned getJumpBufAlignment() override { return Impl.getJumpBufAlignment(); }
unsigned getJumpBufSize() override { return Impl.getJumpBufSize(); }
@@ -1237,6 +1315,10 @@ public:
bool shouldBuildLookupTablesForConstant(Constant *C) override {
return Impl.shouldBuildLookupTablesForConstant(C);
}
+ bool useColdCCForColdCall(Function &F) override {
+ return Impl.useColdCCForColdCall(F);
+ }
+
unsigned getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) override {
return Impl.getScalarizationOverhead(Ty, Insert, Extract);
@@ -1304,6 +1386,12 @@ public:
unsigned getMinVectorRegisterBitWidth() override {
return Impl.getMinVectorRegisterBitWidth();
}
+ bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
+ return Impl.shouldMaximizeVectorBandwidth(OptSize);
+ }
+ unsigned getMinimumVF(unsigned ElemWidth) const override {
+ return Impl.getMinimumVF(ElemWidth);
+ }
bool shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
return Impl.shouldConsiderAddressTypePromotion(
@@ -1442,6 +1530,12 @@ public:
const Function *Callee) const override {
return Impl.areInlineCompatible(Caller, Callee);
}
+ bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const override {
+ return Impl.isIndexedLoadLegal(Mode, Ty, getDataLayout());
+ }
+ bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const override {
+ return Impl.isIndexedStoreLegal(Mode, Ty, getDataLayout());
+ }
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override {
return Impl.getLoadStoreVecRegBitWidth(AddrSpace);
}
@@ -1489,7 +1583,7 @@ template <typename T>
TargetTransformInfo::TargetTransformInfo(T Impl)
: TTIImpl(new Model<T>(Impl)) {}
-/// \brief Analysis pass providing the \c TargetTransformInfo.
+/// Analysis pass providing the \c TargetTransformInfo.
///
/// The core idea of the TargetIRAnalysis is to expose an interface through
/// which LLVM targets can analyze and provide information about the middle
@@ -1504,13 +1598,13 @@ class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> {
public:
typedef TargetTransformInfo Result;
- /// \brief Default construct a target IR analysis.
+ /// Default construct a target IR analysis.
///
/// This will use the module's datalayout to construct a baseline
/// conservative TTI result.
TargetIRAnalysis();
- /// \brief Construct an IR analysis pass around a target-provide callback.
+ /// Construct an IR analysis pass around a target-provide callback.
///
/// The callback will be called with a particular function for which the TTI
/// is needed and must return a TTI object for that function.
@@ -1536,7 +1630,7 @@ private:
friend AnalysisInfoMixin<TargetIRAnalysis>;
static AnalysisKey Key;
- /// \brief The callback used to produce a result.
+ /// The callback used to produce a result.
///
/// We use a completely opaque callback so that targets can provide whatever
/// mechanism they desire for constructing the TTI for a given function.
@@ -1548,11 +1642,11 @@ private:
/// the external TargetMachine, and that reference needs to never dangle.
std::function<Result(const Function &)> TTICallback;
- /// \brief Helper function used as the callback in the default constructor.
+ /// Helper function used as the callback in the default constructor.
static Result getDefaultTTI(const Function &F);
};
-/// \brief Wrapper pass for TargetTransformInfo.
+/// Wrapper pass for TargetTransformInfo.
///
/// This pass can be constructed from a TTI object which it stores internally
/// and is queried by passes.
@@ -1565,7 +1659,7 @@ class TargetTransformInfoWrapperPass : public ImmutablePass {
public:
static char ID;
- /// \brief We must provide a default constructor for the pass but it should
+ /// We must provide a default constructor for the pass but it should
/// never be used.
///
/// Use the constructor below or call one of the creation routines.
@@ -1576,7 +1670,7 @@ public:
TargetTransformInfo &getTTI(const Function &F);
};
-/// \brief Create an analysis pass wrapper around a TTI object.
+/// Create an analysis pass wrapper around a TTI object.
///
/// This analysis pass just holds the TTI instance and makes it available to
/// clients.
diff --git a/contrib/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/contrib/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4c37402278ef..e14e2bd44034 100644
--- a/contrib/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/contrib/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -27,7 +27,7 @@
namespace llvm {
-/// \brief Base class for use as a mix-in that aids implementing
+/// Base class for use as a mix-in that aids implementing
/// a TargetTransformInfo-compatible class.
class TargetTransformInfoImplBase {
protected:
@@ -155,6 +155,7 @@ public:
case Intrinsic::sideeffect:
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
+ case Intrinsic::dbg_label:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::lifetime_start:
@@ -246,6 +247,10 @@ public:
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+ bool canMacroFuseCmp() { return false; }
+
+ bool shouldFavorPostInc() const { return false; }
+
bool isLegalMaskedStore(Type *DataType) { return false; }
bool isLegalMaskedLoad(Type *DataType) { return false; }
@@ -275,6 +280,8 @@ public:
bool isProfitableToHoist(Instruction *I) { return true; }
+ bool useAA() { return false; }
+
bool isTypeLegal(Type *Ty) { return false; }
unsigned getJumpBufAlignment() { return 0; }
@@ -284,6 +291,8 @@ public:
bool shouldBuildLookupTables() { return true; }
bool shouldBuildLookupTablesForConstant(Constant *C) { return true; }
+ bool useColdCCForColdCall(Function &F) { return false; }
+
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
return 0;
}
@@ -343,6 +352,10 @@ public:
unsigned getMinVectorRegisterBitWidth() { return 128; }
+ bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
+
+ unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
+
bool
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader) {
@@ -507,6 +520,16 @@ public:
Callee->getFnAttribute("target-features"));
}
+ bool isIndexedLoadLegal(TTI::MemIndexedMode Mode, Type *Ty,
+ const DataLayout &DL) const {
+ return false;
+ }
+
+ bool isIndexedStoreLegal(TTI::MemIndexedMode Mode, Type *Ty,
+ const DataLayout &DL) const {
+ return false;
+ }
+
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { return 128; }
bool isLegalToVectorizeLoad(LoadInst *LI) const { return true; }
@@ -629,7 +652,7 @@ protected:
}
};
-/// \brief CRTP base class for use as a mix-in that aids implementing
+/// CRTP base class for use as a mix-in that aids implementing
/// a TargetTransformInfo-compatible class.
template <typename T>
class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
diff --git a/contrib/llvm/include/llvm/Analysis/TypeMetadataUtils.h b/contrib/llvm/include/llvm/Analysis/TypeMetadataUtils.h
index 422e153a5a78..6764563f6830 100644
--- a/contrib/llvm/include/llvm/Analysis/TypeMetadataUtils.h
+++ b/contrib/llvm/include/llvm/Analysis/TypeMetadataUtils.h
@@ -35,13 +35,13 @@ struct DevirtCallSite {
CallSite CS;
};
-/// Given a call to the intrinsic @llvm.type.test, find all devirtualizable
+/// Given a call to the intrinsic \@llvm.type.test, find all devirtualizable
/// call sites based on the call and return them in DevirtCalls.
void findDevirtualizableCallsForTypeTest(
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI);
-/// Given a call to the intrinsic @llvm.type.checked.load, find all
+/// Given a call to the intrinsic \@llvm.type.checked.load, find all
/// devirtualizable call sites based on the call and return them in DevirtCalls.
void findDevirtualizableCallsForTypeCheckedLoad(
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
diff --git a/contrib/llvm/include/llvm/Analysis/Utils/Local.h b/contrib/llvm/include/llvm/Analysis/Utils/Local.h
new file mode 100644
index 000000000000..b4141bbff28d
--- /dev/null
+++ b/contrib/llvm/include/llvm/Analysis/Utils/Local.h
@@ -0,0 +1,91 @@
+//===- Local.h - Functions to perform local transformations -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_UTILS_LOCAL_H
+#define LLVM_ANALYSIS_UTILS_LOCAL_H
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+
+namespace llvm {
+
+/// Given a getelementptr instruction/constantexpr, emit the code necessary to
+/// compute the offset from the base pointer (without adding in the base
+/// pointer). Return the result as a signed integer of intptr size.
+/// When NoAssumptions is true, no assumptions about index computation not
+/// overflowing are made.
+template <typename IRBuilderTy>
+Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP,
+ bool NoAssumptions = false) {
+ GEPOperator *GEPOp = cast<GEPOperator>(GEP);
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
+ Value *Result = Constant::getNullValue(IntPtrTy);
+
+ // If the GEP is inbounds, we know that none of the addressing operations will
+ // overflow in an unsigned sense.
+ bool isInBounds = GEPOp->isInBounds() && !NoAssumptions;
+
+ // Build a mask for high order bits.
+ unsigned IntPtrWidth = IntPtrTy->getScalarType()->getIntegerBitWidth();
+ uint64_t PtrSizeMask =
+ std::numeric_limits<uint64_t>::max() >> (64 - IntPtrWidth);
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
+ ++i, ++GTI) {
+ Value *Op = *i;
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
+ if (Constant *OpC = dyn_cast<Constant>(Op)) {
+ if (OpC->isZeroValue())
+ continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ if (OpC->getType()->isVectorTy())
+ OpC = OpC->getSplatValue();
+
+ uint64_t OpValue = cast<ConstantInt>(OpC)->getZExtValue();
+ Size = DL.getStructLayout(STy)->getElementOffset(OpValue);
+
+ if (Size)
+ Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size),
+ GEP->getName()+".offs");
+ continue;
+ }
+
+ Constant *Scale = ConstantInt::get(IntPtrTy, Size);
+ Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
+ Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/);
+ // Emit an add instruction.
+ Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs");
+ continue;
+ }
+ // Convert to correct type.
+ if (Op->getType() != IntPtrTy)
+ Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c");
+ if (Size != 1) {
+ // We'll let instcombine(mul) convert this to a shl if possible.
+ Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size),
+ GEP->getName()+".idx", isInBounds /*NUW*/);
+ }
+
+ // Emit an add instruction.
+ Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs");
+ }
+ return Result;
+}
+
+}
+
+#endif // LLVM_ANALYSIS_UTILS_LOCAL_H
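A short usage sketch for the helper above, under the assumption that a pass is visiting a GetElementPtrInst and wants its byte offset materialized as IR; the wrapper function is hypothetical.

    #include "llvm/Analysis/Utils/Local.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"

    // Illustrative helper: materialize the byte offset of a GEP, inserting the
    // offset arithmetic right before the GEP itself.
    static llvm::Value *materializeGEPOffset(llvm::GetElementPtrInst *GEP,
                                             const llvm::DataLayout &DL) {
      llvm::IRBuilder<> Builder(GEP); // new instructions go before the GEP
      return llvm::EmitGEPOffset(&Builder, DL, GEP);
    }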
diff --git a/contrib/llvm/include/llvm/Analysis/ValueLattice.h b/contrib/llvm/include/llvm/Analysis/ValueLattice.h
index 18a43aafa8ca..0744ca617e48 100644
--- a/contrib/llvm/include/llvm/Analysis/ValueLattice.h
+++ b/contrib/llvm/include/llvm/Analysis/ValueLattice.h
@@ -49,14 +49,73 @@ class ValueLatticeElement {
overdefined
};
- /// Val: This stores the current lattice value along with the Constant* for
- /// the constant if this is a 'constant' or 'notconstant' value.
ValueLatticeElementTy Tag;
- Constant *Val;
- ConstantRange Range;
+
+ /// The union either stores a pointer to a constant or a constant range,
+ /// associated to the lattice element. We have to ensure that Range is
+ /// initialized or destroyed when changing state to or from constantrange.
+ union {
+ Constant *ConstVal;
+ ConstantRange Range;
+ };
public:
- ValueLatticeElement() : Tag(undefined), Val(nullptr), Range(1, true) {}
+ // Const and Range are initialized on-demand.
+ ValueLatticeElement() : Tag(undefined) {}
+
+ /// Custom destructor to ensure Range is properly destroyed, when the object
+ /// is deallocated.
+ ~ValueLatticeElement() {
+ switch (Tag) {
+ case overdefined:
+ case undefined:
+ case constant:
+ case notconstant:
+ break;
+ case constantrange:
+ Range.~ConstantRange();
+ break;
+ };
+ }
+
+ /// Custom copy constructor, to ensure Range gets initialized when
+ /// copying a constant range lattice element.
+ ValueLatticeElement(const ValueLatticeElement &Other) : Tag(undefined) {
+ *this = Other;
+ }
+
+ /// Custom assignment operator, to ensure Range gets initialized when
+ /// assigning a constant range lattice element.
+ ValueLatticeElement &operator=(const ValueLatticeElement &Other) {
+ // If we change the state of this from constant range to non constant range,
+ // destroy Range.
+ if (isConstantRange() && !Other.isConstantRange())
+ Range.~ConstantRange();
+
+ // If we change the state of this from a valid ConstVal to another state
+ // without a valid ConstVal, zero the pointer.
+ if ((isConstant() || isNotConstant()) && !Other.isConstant() &&
+ !Other.isNotConstant())
+ ConstVal = nullptr;
+
+ switch (Other.Tag) {
+ case constantrange:
+ if (!isConstantRange())
+ new (&Range) ConstantRange(Other.Range);
+ else
+ Range = Other.Range;
+ break;
+ case constant:
+ case notconstant:
+ ConstVal = Other.ConstVal;
+ break;
+ case overdefined:
+ case undefined:
+ break;
+ }
+ Tag = Other.Tag;
+ return *this;
+ }
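A short sketch of why the custom copy/assignment matters: copying a constantrange element must placement-new the ConstantRange member in the destination, which a default memberwise copy of the union could not do. The getRange factory is assumed from elsewhere in this header; the function itself is illustrative.

    #include <cassert>
    #include "llvm/Analysis/ValueLattice.h"

    static void copyRangeElement() {
      llvm::ValueLatticeElement A = llvm::ValueLatticeElement::getRange(
          llvm::ConstantRange(llvm::APInt(32, 0), llvm::APInt(32, 10)));
      llvm::ValueLatticeElement B;
      B = A; // the custom operator= above constructs B's Range in place
      assert(B.isConstantRange() && B.getConstantRange() == A.getConstantRange());
    }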
static ValueLatticeElement get(Constant *C) {
ValueLatticeElement Res;
@@ -89,12 +148,12 @@ public:
Constant *getConstant() const {
assert(isConstant() && "Cannot get the constant of a non-constant!");
- return Val;
+ return ConstVal;
}
Constant *getNotConstant() const {
assert(isNotConstant() && "Cannot get the constant of a non-notconstant!");
- return Val;
+ return ConstVal;
}
const ConstantRange &getConstantRange() const {
@@ -104,10 +163,10 @@ public:
}
Optional<APInt> asConstantInteger() const {
- if (isConstant() && isa<ConstantInt>(Val)) {
- return cast<ConstantInt>(Val)->getValue();
- } else if (isConstantRange() && Range.isSingleElement()) {
- return *Range.getSingleElement();
+ if (isConstant() && isa<ConstantInt>(getConstant())) {
+ return cast<ConstantInt>(getConstant())->getValue();
+ } else if (isConstantRange() && getConstantRange().isSingleElement()) {
+ return *getConstantRange().getSingleElement();
}
return None;
}
@@ -116,6 +175,10 @@ private:
void markOverdefined() {
if (isOverdefined())
return;
+ if (isConstant() || isNotConstant())
+ ConstVal = nullptr;
+ if (isConstantRange())
+ Range.~ConstantRange();
Tag = overdefined;
}
@@ -132,7 +195,7 @@ private:
"Marking constant with different value");
assert(isUndefined());
Tag = constant;
- Val = V;
+ ConstVal = V;
}
void markNotConstant(Constant *V) {
@@ -150,7 +213,7 @@ private:
"Marking !constant with different value");
assert(isUndefined() || isConstant());
Tag = notconstant;
- Val = V;
+ ConstVal = V;
}
void markConstantRange(ConstantRange NewR) {
@@ -168,7 +231,7 @@ private:
markOverdefined();
else {
Tag = constantrange;
- Range = std::move(NewR);
+ new (&Range) ConstantRange(std::move(NewR));
}
}
@@ -189,14 +252,14 @@ public:
}
if (isConstant()) {
- if (RHS.isConstant() && Val == RHS.Val)
+ if (RHS.isConstant() && getConstant() == RHS.getConstant())
return false;
markOverdefined();
return true;
}
if (isNotConstant()) {
- if (RHS.isNotConstant() && Val == RHS.Val)
+ if (RHS.isNotConstant() && getNotConstant() == RHS.getNotConstant())
return false;
markOverdefined();
return true;
@@ -209,9 +272,11 @@ public:
markOverdefined();
return true;
}
- ConstantRange NewR = Range.unionWith(RHS.getConstantRange());
+ ConstantRange NewR = getConstantRange().unionWith(RHS.getConstantRange());
if (NewR.isFullSet())
markOverdefined();
+ else if (NewR == getConstantRange())
+ return false;
else
markConstantRange(std::move(NewR));
return true;
@@ -223,24 +288,32 @@ public:
return cast<ConstantInt>(getConstant());
}
- bool satisfiesPredicate(CmpInst::Predicate Pred,
- const ValueLatticeElement &Other) const {
- // TODO: share with LVI getPredicateResult.
-
+ /// Compares this symbolic value with Other using Pred and returns either
+ /// true, false or undef constants, or nullptr if the comparison cannot be
+ /// evaluated.
+ Constant *getCompare(CmpInst::Predicate Pred, Type *Ty,
+ const ValueLatticeElement &Other) const {
if (isUndefined() || Other.isUndefined())
- return true;
+ return UndefValue::get(Ty);
- if (isConstant() && Other.isConstant() && Pred == CmpInst::FCMP_OEQ)
- return getConstant() == Other.getConstant();
+ if (isConstant() && Other.isConstant())
+ return ConstantExpr::getCompare(Pred, getConstant(), Other.getConstant());
// Integer constants are represented as ConstantRanges with single
// elements.
if (!isConstantRange() || !Other.isConstantRange())
- return false;
+ return nullptr;
const auto &CR = getConstantRange();
const auto &OtherCR = Other.getConstantRange();
- return ConstantRange::makeSatisfyingICmpRegion(Pred, OtherCR).contains(CR);
+ if (ConstantRange::makeSatisfyingICmpRegion(Pred, OtherCR).contains(CR))
+ return ConstantInt::getTrue(Ty);
+ if (ConstantRange::makeSatisfyingICmpRegion(
+ CmpInst::getInversePredicate(Pred), OtherCR)
+ .contains(CR))
+ return ConstantInt::getFalse(Ty);
+
+ return nullptr;
}
};
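A minimal sketch of how a client might use the new getCompare interface to fold a comparison whose operands have known lattice values; the wrapper is hypothetical, only getCompare is from this class.

    #include "llvm/Analysis/ValueLattice.h"

    // Returns a true/false/undef Constant, or nullptr if nothing can be decided.
    static llvm::Constant *tryFoldCmp(llvm::CmpInst::Predicate Pred, llvm::Type *Ty,
                                      const llvm::ValueLatticeElement &L,
                                      const llvm::ValueLatticeElement &R) {
      return L.getCompare(Pred, Ty, R);
    }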
diff --git a/contrib/llvm/include/llvm/Analysis/ValueTracking.h b/contrib/llvm/include/llvm/Analysis/ValueTracking.h
index 1fdb3cff5372..e6a219a8045b 100644
--- a/contrib/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/contrib/llvm/include/llvm/Analysis/ValueTracking.h
@@ -101,6 +101,12 @@ class Value;
const Instruction *CxtI = nullptr,
const DominatorTree *DT = nullptr);
+ /// Return true if the two given values are negation.
+ /// Currently can recoginze Value pair:
+ /// 1: <X, Y> if X = sub (0, Y) or Y = sub (0, X)
+ /// 2: <X, Y> if X = sub (A, B) and Y = sub (B, A)
+ bool isKnownNegation(const Value *X, const Value *Y, bool NeedNSW = false);
+
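A brief illustrative check built on the declaration above; X and Y are assumed to be values a pass is inspecting, e.g. X = sub A, B and Y = sub B, A. With NeedNSW=true the matched subs must also carry the 'nsw' flag.

    #include "llvm/Analysis/ValueTracking.h"

    static bool isNegationPair(const llvm::Value *X, const llvm::Value *Y) {
      return llvm::isKnownNegation(X, Y, /*NeedNSW=*/false);
    }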
/// Returns true if the given value is known to be non-negative.
bool isKnownNonNegative(const Value *V, const DataLayout &DL,
unsigned Depth = 0,
@@ -276,6 +282,22 @@ class Value;
/// pointer, return 'len+1'. If we can't, return 0.
uint64_t GetStringLength(const Value *V, unsigned CharSize = 8);
+ /// This function returns the call's pointer argument that is considered the
+ /// same by aliasing rules. You CAN'T use it to replace one value with another.
+ const Value *getArgumentAliasingToReturnedPointer(ImmutableCallSite CS);
+ inline Value *getArgumentAliasingToReturnedPointer(CallSite CS) {
+ return const_cast<Value *>(
+ getArgumentAliasingToReturnedPointer(ImmutableCallSite(CS)));
+ }
+
+ // {launder,strip}.invariant.group returns a pointer that aliases its argument,
+ // and it only captures the pointer by returning it.
+ // These intrinsics are not marked as nocapture, because returning is
+ // considered a capture. The arguments are not marked as returned either,
+ // because that would make them useless.
+ bool isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+ ImmutableCallSite CS);
+
/// This method strips off any GEP address adjustments and pointer casts from
/// the specified value, returning the original object being addressed. Note
/// that the returned value has pointer type if the specified value does. If
@@ -288,7 +310,7 @@ class Value;
return GetUnderlyingObject(const_cast<Value *>(V), DL, MaxLookup);
}
- /// \brief This method is similar to GetUnderlyingObject except that it can
+ /// This method is similar to GetUnderlyingObject except that it can
/// look through phi and select instructions and return multiple objects.
///
/// If LoopInfo is passed, loop phis are further analyzed. If a pointer
@@ -384,6 +406,11 @@ class Value;
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT);
+ OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT);
OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
const Value *RHS,
const DataLayout &DL,
@@ -401,6 +428,16 @@ class Value;
AssumptionCache *AC = nullptr,
const Instruction *CxtI = nullptr,
const DominatorTree *DT = nullptr);
+ OverflowResult computeOverflowForUnsignedSub(const Value *LHS, const Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT);
+ OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT);
/// Returns true if the arithmetic part of the \p II 's result is
/// used only along the paths control dependent on the computation
@@ -423,6 +460,13 @@ class Value;
/// though division by zero might cause undefined behavior.
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I);
+ /// Returns true if this block does not contain a potential implicit exit.
+ /// This is equivalent to saying that all instructions within the basic block
+ /// are guaranteed to transfer execution to their successor within the basic
+ /// block. This has the same assumptions w.r.t. undefined behavior as the
+ /// instruction variant of this function.
+ bool isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB);
+
/// Return true if this function can prove that the instruction I
/// is executed for every iteration of the loop L.
///
@@ -454,7 +498,7 @@ class Value;
/// the parent of I.
bool programUndefinedIfFullPoison(const Instruction *PoisonI);
- /// \brief Specific patterns of select instructions we can match.
+ /// Specific patterns of select instructions we can match.
enum SelectPatternFlavor {
SPF_UNKNOWN = 0,
SPF_SMIN, /// Signed minimum
@@ -467,7 +511,7 @@ class Value;
SPF_NABS /// Negated absolute value
};
- /// \brief Behavior when a floating point min/max is given one NaN and one
+ /// Behavior when a floating point min/max is given one NaN and one
/// non-NaN as input.
enum SelectPatternNaNBehavior {
SPNB_NA = 0, /// NaN behavior not applicable.
@@ -486,15 +530,18 @@ class Value;
/// fcmp; select, does the fcmp have to be
/// ordered?
- /// \brief Return true if \p SPF is a min or a max pattern.
+ /// Return true if \p SPF is a min or a max pattern.
static bool isMinOrMax(SelectPatternFlavor SPF) {
- return !(SPF == SPF_UNKNOWN || SPF == SPF_ABS || SPF == SPF_NABS);
+ return SPF != SPF_UNKNOWN && SPF != SPF_ABS && SPF != SPF_NABS;
}
};
/// Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind
/// and providing the out parameter results if we successfully match.
///
+ /// For ABS/NABS, LHS will be set to the input to the abs idiom. RHS will be
+ /// the negation instruction from the idiom.
+ ///
/// If CastOp is not nullptr, also match MIN/MAX idioms where the type does
/// not match that of the original select. If this is the case, the cast
/// operation (one of Trunc,SExt,Zext) that must be done to transform the
@@ -521,6 +568,19 @@ class Value;
return Result;
}
+ /// Return the canonical comparison predicate for the specified
+ /// minimum/maximum flavor.
+ CmpInst::Predicate getMinMaxPred(SelectPatternFlavor SPF,
+ bool Ordered = false);
+
+ /// Return the inverse minimum/maximum flavor of the specified flavor.
+ /// For example, signed minimum is the inverse of signed maximum.
+ SelectPatternFlavor getInverseMinMaxFlavor(SelectPatternFlavor SPF);
+
+ /// Return the canonical inverse comparison predicate for the specified
+ /// minimum/maximum flavor.
+ CmpInst::Predicate getInverseMinMaxPred(SelectPatternFlavor SPF);
+
/// Return true if RHS is known to be implied true by LHS. Return false if
/// RHS is known to be implied false by LHS. Otherwise, return None if no
/// implication can be made.
diff --git a/contrib/llvm/include/llvm/Analysis/VectorUtils.h b/contrib/llvm/include/llvm/Analysis/VectorUtils.h
index 6315e8408f05..9fde36d61091 100644
--- a/contrib/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/contrib/llvm/include/llvm/Analysis/VectorUtils.h
@@ -33,50 +33,50 @@ namespace Intrinsic {
enum ID : unsigned;
}
-/// \brief Identify if the intrinsic is trivially vectorizable.
+/// Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all
/// scalars for the scalar form of the intrinsic and all vectors for
/// the vector form of the intrinsic.
bool isTriviallyVectorizable(Intrinsic::ID ID);
-/// \brief Identifies if the intrinsic has a scalar operand. It checks for
+/// Identifies if the intrinsic has a scalar operand. It checks for
/// ctlz, cttz and powi special intrinsics whose argument is scalar.
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx);
-/// \brief Returns intrinsic ID for call.
+/// Returns intrinsic ID for call.
/// For the input call instruction it finds mapping intrinsic and returns
/// its intrinsic ID; if no mapping is found, it returns not_intrinsic.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
const TargetLibraryInfo *TLI);
-/// \brief Find the operand of the GEP that should be checked for consecutive
+/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);
-/// \brief If the argument is a GEP, then returns the operand identified by
+/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
-/// \brief If a value has only one user that is a CastInst, return it.
+/// If a value has only one user that is a CastInst, return it.
Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);
-/// \brief Get the stride of a pointer access in a loop. Looks for symbolic
+/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
-/// \brief Given a vector and an element number, see if the scalar value is
+/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
Value *findScalarElement(Value *V, unsigned EltNo);
-/// \brief Get splat value if the input is a splat vector or return nullptr.
+/// Get splat value if the input is a splat vector or return nullptr.
/// The value may be extracted from a splat constants vector or from
/// a sequence of instructions that broadcast a single value into a vector.
const Value *getSplatValue(const Value *V);
-/// \brief Compute a map of integer instructions to their minimum legal type
+/// Compute a map of integer instructions to their minimum legal type
/// size.
///
/// C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int
@@ -124,7 +124,7 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
-/// \brief Create an interleave shuffle mask.
+/// Create an interleave shuffle mask.
///
/// This function creates a shuffle mask for interleaving \p NumVecs vectors of
/// vectorization factor \p VF into a single wide vector. The mask is of the
@@ -138,7 +138,7 @@ Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
Constant *createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
unsigned NumVecs);
-/// \brief Create a stride shuffle mask.
+/// Create a stride shuffle mask.
///
/// This function creates a shuffle mask whose elements begin at \p Start and
/// are incremented by \p Stride. The mask can be used to deinterleave an
@@ -153,7 +153,7 @@ Constant *createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
Constant *createStrideMask(IRBuilder<> &Builder, unsigned Start,
unsigned Stride, unsigned VF);
-/// \brief Create a sequential shuffle mask.
+/// Create a sequential shuffle mask.
///
/// This function creates a shuffle mask whose elements are sequential and begin
/// at \p Start. The mask contains \p NumInts integers and is padded with \p
@@ -167,7 +167,7 @@ Constant *createStrideMask(IRBuilder<> &Builder, unsigned Start,
Constant *createSequentialMask(IRBuilder<> &Builder, unsigned Start,
unsigned NumInts, unsigned NumUndefs);
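A small illustrative call of the declaration above, assuming Builder is positioned at a valid insertion point; the wrapper function itself is hypothetical.

    #include "llvm/Analysis/VectorUtils.h"
    #include "llvm/IR/IRBuilder.h"

    // Builds the mask <0, 1, 2, 3, undef, undef>.
    static llvm::Constant *makeSequentialMask(llvm::IRBuilder<> &Builder) {
      return llvm::createSequentialMask(Builder, /*Start=*/0, /*NumInts=*/4,
                                        /*NumUndefs=*/2);
    }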
-/// \brief Concatenate a list of vectors.
+/// Concatenate a list of vectors.
///
/// This function generates code that concatenates the vectors in \p Vecs into a
/// single large vector. The number of vectors should be greater than one, and
diff --git a/contrib/llvm/include/llvm/AsmParser/Parser.h b/contrib/llvm/include/llvm/AsmParser/Parser.h
index 5f02e488e5b1..285a7c022a24 100644
--- a/contrib/llvm/include/llvm/AsmParser/Parser.h
+++ b/contrib/llvm/include/llvm/AsmParser/Parser.h
@@ -21,50 +21,95 @@ namespace llvm {
class Constant;
class LLVMContext;
class Module;
+class ModuleSummaryIndex;
struct SlotMapping;
class SMDiagnostic;
class Type;
-/// This function is the main interface to the LLVM Assembly Parser. It parses
+/// This function is a main interface to the LLVM Assembly Parser. It parses
/// an ASCII file that (presumably) contains LLVM Assembly code. It returns a
/// Module (intermediate representation) with the corresponding features. Note
/// that this does not verify that the generated Module is valid, so you should
/// run the verifier after parsing the file to check that it is okay.
-/// \brief Parse LLVM Assembly from a file
+/// Parse LLVM Assembly from a file
/// \param Filename The name of the file to parse
-/// \param Error Error result info.
+/// \param Err Error result info.
/// \param Context Context in which to allocate globals info.
/// \param Slots The optional slot mapping that will be initialized during
/// parsing.
/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
/// This option should only be set to false by llvm-as
/// for use inside the LLVM testsuite!
+/// \param DataLayoutString Override datalayout in the llvm assembly.
std::unique_ptr<Module>
-parseAssemblyFile(StringRef Filename, SMDiagnostic &Error, LLVMContext &Context,
- SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true);
+parseAssemblyFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
+ SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
/// The function is a secondary interface to the LLVM Assembly Parser. It parses
/// an ASCII string that (presumably) contains LLVM Assembly code. It returns a
/// Module (intermediate representation) with the corresponding features. Note
/// that this does not verify that the generated Module is valid, so you should
/// run the verifier after parsing the file to check that it is okay.
-/// \brief Parse LLVM Assembly from a string
+/// Parse LLVM Assembly from a string
/// \param AsmString The string containing assembly
-/// \param Error Error result info.
+/// \param Err Error result info.
/// \param Context Context in which to allocate globals info.
/// \param Slots The optional slot mapping that will be initialized during
/// parsing.
/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
/// This option should only be set to false by llvm-as
/// for use inside the LLVM testsuite!
+/// \param DataLayoutString Override datalayout in the llvm assembly.
std::unique_ptr<Module> parseAssemblyString(StringRef AsmString,
- SMDiagnostic &Error,
+ SMDiagnostic &Err,
LLVMContext &Context,
SlotMapping *Slots = nullptr,
- bool UpgradeDebugInfo = true);
+ bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
+
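A short usage sketch of the extended signature above, showing the new DataLayoutString override; the IR text and datalayout string are placeholders, and the wrapper function is hypothetical.

    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/SourceMgr.h"

    static std::unique_ptr<llvm::Module> parseWithLayout(llvm::LLVMContext &Ctx) {
      llvm::SMDiagnostic Err;
      return llvm::parseAssemblyString("define void @f() { ret void }", Err, Ctx,
                                       /*Slots=*/nullptr, /*UpgradeDebugInfo=*/true,
                                       /*DataLayoutString=*/"e-m:e-i64:64-n32:64");
    }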
+/// Holds the Module and ModuleSummaryIndex returned by the interfaces
+/// that parse both.
+struct ParsedModuleAndIndex {
+ std::unique_ptr<Module> Mod;
+ std::unique_ptr<ModuleSummaryIndex> Index;
+};
+
+/// This function is a main interface to the LLVM Assembly Parser. It parses
+/// an ASCII file that (presumably) contains LLVM Assembly code, including
+/// a module summary. It returns a Module (intermediate representation) and
+/// a ModuleSummaryIndex with the corresponding features. Note that this does
+/// not verify that the generated Module or Index are valid, so you should
+/// run the verifier after parsing the file to check that they are okay.
+/// Parse LLVM Assembly from a file
+/// \param Filename The name of the file to parse
+/// \param Err Error result info.
+/// \param Context Context in which to allocate globals info.
+/// \param Slots The optional slot mapping that will be initialized during
+/// parsing.
+/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
+/// This option should only be set to false by llvm-as
+/// for use inside the LLVM testsuite!
+/// \param DataLayoutString Override datalayout in the llvm assembly.
+ParsedModuleAndIndex
+parseAssemblyFileWithIndex(StringRef Filename, SMDiagnostic &Err,
+ LLVMContext &Context, SlotMapping *Slots = nullptr,
+ bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
+
+/// This function is a main interface to the LLVM Assembly Parser. It parses
+/// an ASCII file that (presumably) contains LLVM Assembly code for a module
+/// summary. It returns a ModuleSummaryIndex with the corresponding features.
+/// Note that this does not verify that the generated Index is valid, so you
+/// should run the verifier after parsing the file to check that it is okay.
+/// Parse LLVM Assembly Index from a file
+/// \param Filename The name of the file to parse
+/// \param Err Error result info.
+std::unique_ptr<ModuleSummaryIndex>
+parseSummaryIndexAssemblyFile(StringRef Filename, SMDiagnostic &Err);
/// parseAssemblyFile and parseAssemblyString are wrappers around this function.
-/// \brief Parse LLVM Assembly from a MemoryBuffer.
+/// Parse LLVM Assembly from a MemoryBuffer.
/// \param F The MemoryBuffer containing assembly
/// \param Err Error result info.
/// \param Slots The optional slot mapping that will be initialized during
@@ -72,10 +117,40 @@ std::unique_ptr<Module> parseAssemblyString(StringRef AsmString,
/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
/// This option should only be set to false by llvm-as
/// for use inside the LLVM testsuite!
+/// \param DataLayoutString Override datalayout in the llvm assembly.
std::unique_ptr<Module> parseAssembly(MemoryBufferRef F, SMDiagnostic &Err,
LLVMContext &Context,
SlotMapping *Slots = nullptr,
- bool UpgradeDebugInfo = true);
+ bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
+
+/// Parse LLVM Assembly including the summary index from a MemoryBuffer.
+///
+/// \param F The MemoryBuffer containing assembly with summary
+/// \param Err Error result info.
+/// \param Slots The optional slot mapping that will be initialized during
+/// parsing.
+/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
+/// This option should only be set to false by llvm-as
+/// for use inside the LLVM testsuite!
+/// \param DataLayoutString Override datalayout in the llvm assembly.
+///
+/// parseAssemblyFileWithIndex is a wrapper around this function.
+ParsedModuleAndIndex parseAssemblyWithIndex(MemoryBufferRef F,
+ SMDiagnostic &Err,
+ LLVMContext &Context,
+ SlotMapping *Slots = nullptr,
+ bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
+
+/// Parse LLVM Assembly for summary index from a MemoryBuffer.
+///
+/// \param F The MemoryBuffer containing assembly with summary
+/// \param Err Error result info.
+///
+/// parseSummaryIndexAssemblyFile is a wrapper around this function.
+std::unique_ptr<ModuleSummaryIndex>
+parseSummaryIndexAssembly(MemoryBufferRef F, SMDiagnostic &Err);
/// This function is the low-level interface to the LLVM Assembly Parser.
/// This is kept as an independent function instead of being inlined into
@@ -84,6 +159,7 @@ std::unique_ptr<Module> parseAssembly(MemoryBufferRef F, SMDiagnostic &Err,
///
/// \param F The MemoryBuffer containing assembly
/// \param M The module to add data to.
+/// \param Index The index to add data to.
/// \param Err Error result info.
/// \param Slots The optional slot mapping that will be initialized during
/// parsing.
@@ -91,9 +167,11 @@ std::unique_ptr<Module> parseAssembly(MemoryBufferRef F, SMDiagnostic &Err,
/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
/// This option should only be set to false by llvm-as
/// for use inside the LLVM testsuite!
-bool parseAssemblyInto(MemoryBufferRef F, Module &M, SMDiagnostic &Err,
- SlotMapping *Slots = nullptr,
- bool UpgradeDebugInfo = true);
+/// \param DataLayoutString Override datalayout in the llvm assembly.
+bool parseAssemblyInto(MemoryBufferRef F, Module *M, ModuleSummaryIndex *Index,
+ SMDiagnostic &Err, SlotMapping *Slots = nullptr,
+ bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
/// Parse a type and a constant value in the given string.
///
diff --git a/contrib/llvm/include/llvm/BinaryFormat/COFF.h b/contrib/llvm/include/llvm/BinaryFormat/COFF.h
index a55c544dfe90..7b973c03cc80 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/COFF.h
+++ b/contrib/llvm/include/llvm/BinaryFormat/COFF.h
@@ -110,6 +110,9 @@ enum MachineTypes : unsigned {
IMAGE_FILE_MACHINE_POWERPC = 0x1F0,
IMAGE_FILE_MACHINE_POWERPCFP = 0x1F1,
IMAGE_FILE_MACHINE_R4000 = 0x166,
+ IMAGE_FILE_MACHINE_RISCV32 = 0x5032,
+ IMAGE_FILE_MACHINE_RISCV64 = 0x5064,
+ IMAGE_FILE_MACHINE_RISCV128 = 0x5128,
IMAGE_FILE_MACHINE_SH3 = 0x1A2,
IMAGE_FILE_MACHINE_SH3DSP = 0x1A3,
IMAGE_FILE_MACHINE_SH4 = 0x1A6,
@@ -460,7 +463,7 @@ union Auxiliary {
AuxiliarySectionDefinition SectionDefinition;
};
-/// @brief The Import Directory Table.
+/// The Import Directory Table.
///
/// There is a single array of these and one entry per imported DLL.
struct ImportDirectoryTableEntry {
@@ -471,7 +474,7 @@ struct ImportDirectoryTableEntry {
uint32_t ImportAddressTableRVA;
};
-/// @brief The PE32 Import Lookup Table.
+/// The PE32 Import Lookup Table.
///
/// There is an array of these for each imported DLL. It represents either
/// the ordinal to import from the target DLL, or a name to lookup and import
@@ -482,32 +485,32 @@ struct ImportDirectoryTableEntry {
struct ImportLookupTableEntry32 {
uint32_t data;
- /// @brief Is this entry specified by ordinal, or name?
+ /// Is this entry specified by ordinal, or name?
bool isOrdinal() const { return data & 0x80000000; }
- /// @brief Get the ordinal value of this entry. isOrdinal must be true.
+ /// Get the ordinal value of this entry. isOrdinal must be true.
uint16_t getOrdinal() const {
assert(isOrdinal() && "ILT entry is not an ordinal!");
return data & 0xFFFF;
}
- /// @brief Set the ordinal value and set isOrdinal to true.
+ /// Set the ordinal value and set isOrdinal to true.
void setOrdinal(uint16_t o) {
data = o;
data |= 0x80000000;
}
- /// @brief Get the Hint/Name entry RVA. isOrdinal must be false.
+ /// Get the Hint/Name entry RVA. isOrdinal must be false.
uint32_t getHintNameRVA() const {
assert(!isOrdinal() && "ILT entry is not a Hint/Name RVA!");
return data;
}
- /// @brief Set the Hint/Name entry RVA and set isOrdinal to false.
+ /// Set the Hint/Name entry RVA and set isOrdinal to false.
void setHintNameRVA(uint32_t rva) { data = rva; }
};
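A brief sketch of the ordinal encoding described above: build an import lookup entry by ordinal and read it back through the accessors. The helper function is hypothetical.

    #include <cassert>
    #include "llvm/BinaryFormat/COFF.h"

    static llvm::COFF::ImportLookupTableEntry32 makeOrdinalEntry(uint16_t Ordinal) {
      llvm::COFF::ImportLookupTableEntry32 Entry;
      Entry.setOrdinal(Ordinal); // sets bit 31 and stores the ordinal
      assert(Entry.isOrdinal() && Entry.getOrdinal() == Ordinal);
      return Entry;
    }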
-/// @brief The DOS compatible header at the front of all PEs.
+/// The DOS compatible header at the front of all PEs.
struct DOSHeader {
uint16_t Magic;
uint16_t UsedBytesInTheLastPage;
diff --git a/contrib/llvm/include/llvm/BinaryFormat/Dwarf.def b/contrib/llvm/include/llvm/BinaryFormat/Dwarf.def
index 3ade3ea0d338..57e259615d0c 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/Dwarf.def
+++ b/contrib/llvm/include/llvm/BinaryFormat/Dwarf.def
@@ -12,15 +12,15 @@
//===----------------------------------------------------------------------===//
// TODO: Add other DW-based macros.
-#if !(defined HANDLE_DW_TAG || defined HANDLE_DW_AT || \
- defined HANDLE_DW_FORM || defined HANDLE_DW_OP || \
- defined HANDLE_DW_LANG || defined HANDLE_DW_ATE || \
- defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED || \
- defined HANDLE_DW_CC || defined HANDLE_DW_LNS || \
- defined HANDLE_DW_LNE || defined HANDLE_DW_LNCT || \
- defined HANDLE_DW_MACRO || defined HANDLE_DW_RLE || \
- defined HANDLE_DW_CFA || defined HANDLE_DW_APPLE_PROPERTY || \
- defined HANDLE_DW_UT || defined HANDLE_DWARF_SECTION)
+#if !( \
+ defined HANDLE_DW_TAG || defined HANDLE_DW_AT || defined HANDLE_DW_FORM || \
+ defined HANDLE_DW_OP || defined HANDLE_DW_LANG || defined HANDLE_DW_ATE || \
+ defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED || \
+ defined HANDLE_DW_CC || defined HANDLE_DW_LNS || defined HANDLE_DW_LNE || \
+ defined HANDLE_DW_LNCT || defined HANDLE_DW_MACRO || \
+ defined HANDLE_DW_RLE || defined HANDLE_DW_CFA || \
+ defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT || \
+ defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX)
#error "Missing macro definition of HANDLE_DW*"
#endif
@@ -96,6 +96,10 @@
#define HANDLE_DWARF_SECTION(ENUM_NAME, ELF_NAME, CMDLINE_NAME)
#endif
+#ifndef HANDLE_DW_IDX
+#define HANDLE_DW_IDX(ID, NAME)
+#endif
+
HANDLE_DW_TAG(0x0000, null, 2, DWARF)
HANDLE_DW_TAG(0x0001, array_type, 2, DWARF)
HANDLE_DW_TAG(0x0002, class_type, 2, DWARF)
@@ -704,6 +708,7 @@ HANDLE_DW_CC(0x03, nocall)
HANDLE_DW_CC(0x04, pass_by_reference)
HANDLE_DW_CC(0x05, pass_by_value)
// Vendor extensions:
+HANDLE_DW_CC(0x40, GNU_renesas_sh)
HANDLE_DW_CC(0x41, GNU_borland_fastcall_i386)
HANDLE_DW_CC(0xb0, BORLAND_safecall)
HANDLE_DW_CC(0xb1, BORLAND_stdcall)
@@ -713,6 +718,22 @@ HANDLE_DW_CC(0xb4, BORLAND_msreturn)
HANDLE_DW_CC(0xb5, BORLAND_thiscall)
HANDLE_DW_CC(0xb6, BORLAND_fastcall)
HANDLE_DW_CC(0xc0, LLVM_vectorcall)
+HANDLE_DW_CC(0xc1, LLVM_Win64)
+HANDLE_DW_CC(0xc2, LLVM_X86_64SysV)
+HANDLE_DW_CC(0xc3, LLVM_AAPCS)
+HANDLE_DW_CC(0xc4, LLVM_AAPCS_VFP)
+HANDLE_DW_CC(0xc5, LLVM_IntelOclBicc)
+HANDLE_DW_CC(0xc6, LLVM_SpirFunction)
+HANDLE_DW_CC(0xc7, LLVM_OpenCLKernel)
+HANDLE_DW_CC(0xc8, LLVM_Swift)
+HANDLE_DW_CC(0xc9, LLVM_PreserveMost)
+HANDLE_DW_CC(0xca, LLVM_PreserveAll)
+HANDLE_DW_CC(0xcb, LLVM_X86RegCall)
+// From GCC source code (include/dwarf2.h): This DW_CC_ value is not currently
+// generated by any toolchain. It is used internally to GDB to indicate OpenCL C
+// functions that have been compiled with the IBM XL C for OpenCL compiler and use
+// a non-platform calling convention for passing OpenCL C vector types.
+HANDLE_DW_CC(0xff, GDB_IBM_OpenCL)
// Line Number Extended Opcode Encodings
HANDLE_DW_LNE(0x01, end_sequence)
@@ -743,6 +764,9 @@ HANDLE_DW_LNCT(0x02, directory_index)
HANDLE_DW_LNCT(0x03, timestamp)
HANDLE_DW_LNCT(0x04, size)
HANDLE_DW_LNCT(0x05, MD5)
+// A vendor extension until http://dwarfstd.org/ShowIssue.php?issue=180201.1 is
+// accepted and incorporated into the next DWARF standard.
+HANDLE_DW_LNCT(0x2001, LLVM_source)
// DWARF v5 Macro information.
HANDLE_DW_MACRO(0x01, define)
@@ -836,14 +860,17 @@ HANDLE_DWARF_SECTION(DebugAranges, ".debug_aranges", "debug-aranges")
HANDLE_DWARF_SECTION(DebugInfo, ".debug_info", "debug-info")
HANDLE_DWARF_SECTION(DebugTypes, ".debug_types", "debug-types")
HANDLE_DWARF_SECTION(DebugLine, ".debug_line", "debug-line")
+HANDLE_DWARF_SECTION(DebugLineStr, ".debug_line_str", "debug-line-str")
HANDLE_DWARF_SECTION(DebugLoc, ".debug_loc", "debug-loc")
HANDLE_DWARF_SECTION(DebugFrame, ".debug_frame", "debug-frame")
HANDLE_DWARF_SECTION(DebugMacro, ".debug_macro", "debug-macro")
-HANDLE_DWARF_SECTION(DebugRanges, ".debug_ranges", "debug-ranges")
+HANDLE_DWARF_SECTION(DebugNames, ".debug_names", "debug-names")
HANDLE_DWARF_SECTION(DebugPubnames, ".debug_pubnames", "debug-pubnames")
HANDLE_DWARF_SECTION(DebugPubtypes, ".debug_pubtypes", "debug-pubtypes")
HANDLE_DWARF_SECTION(DebugGnuPubnames, ".debug_gnu_pubnames", "debug-gnu-pubnames")
HANDLE_DWARF_SECTION(DebugGnuPubtypes, ".debug_gnu_pubtypes", "debug-gnu-pubtypes")
+HANDLE_DWARF_SECTION(DebugRanges, ".debug_ranges", "debug-ranges")
+HANDLE_DWARF_SECTION(DebugRnglists, ".debug_rnglists", "debug-rnglists")
HANDLE_DWARF_SECTION(DebugStr, ".debug_str", "debug-str")
HANDLE_DWARF_SECTION(DebugStrOffsets, ".debug_str_offsets", "debug-str-offsets")
HANDLE_DWARF_SECTION(DebugCUIndex, ".debug_cu_index", "debug-cu-index")
@@ -855,6 +882,12 @@ HANDLE_DWARF_SECTION(AppleNamespaces, ".apple_namespaces", "apple-namespaces")
HANDLE_DWARF_SECTION(AppleObjC, ".apple_objc", "apple-objc")
HANDLE_DWARF_SECTION(GdbIndex, ".gdb_index", "gdb-index")
+HANDLE_DW_IDX(0x01, compile_unit)
+HANDLE_DW_IDX(0x02, type_unit)
+HANDLE_DW_IDX(0x03, die_offset)
+HANDLE_DW_IDX(0x04, parent)
+HANDLE_DW_IDX(0x05, type_hash)
+
#undef HANDLE_DW_TAG
#undef HANDLE_DW_AT
@@ -874,3 +907,4 @@ HANDLE_DWARF_SECTION(GdbIndex, ".gdb_index", "gdb-index")
#undef HANDLE_DW_APPLE_PROPERTY
#undef HANDLE_DW_UT
#undef HANDLE_DWARF_SECTION
+#undef HANDLE_DW_IDX
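The new HANDLE_DW_IDX entries follow the same X-macro pattern as the rest of Dwarf.def: a consumer defines the macro, includes the .def file, and lets the preprocessor expand one line per constant. A rough, illustrative sketch of the two typical expansions (mirroring the enum Index and IndexString() declarations that the Dwarf.h hunks further down add; the code below is a standalone example, not part of the patch):

#include "llvm/ADT/StringRef.h"

// Expansion 1: build the DW_IDX_* enum from the .def file.
enum Index {
#define HANDLE_DW_IDX(ID, NAME) DW_IDX_##NAME = ID,
#include "llvm/BinaryFormat/Dwarf.def"
  DW_IDX_lo_user = 0x2000,
  DW_IDX_hi_user = 0x3fff
};

// Expansion 2: a stringification switch generated from the same list.
llvm::StringRef IndexString(unsigned Idx) {
  switch (Idx) {
  default:
    return llvm::StringRef();
#define HANDLE_DW_IDX(ID, NAME)                                                \
  case DW_IDX_##NAME:                                                          \
    return "DW_IDX_" #NAME;
#include "llvm/BinaryFormat/Dwarf.def"
  }
}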
diff --git a/contrib/llvm/include/llvm/BinaryFormat/Dwarf.h b/contrib/llvm/include/llvm/BinaryFormat/Dwarf.h
index a0e5367b412c..9036f405eaea 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/Dwarf.h
+++ b/contrib/llvm/include/llvm/BinaryFormat/Dwarf.h
@@ -20,8 +20,12 @@
#ifndef LLVM_BINARYFORMAT_DWARF_H
#define LLVM_BINARYFORMAT_DWARF_H
+#include "llvm/ADT/Optional.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadicDetails.h"
namespace llvm {
class StringRef;
@@ -57,6 +61,9 @@ enum LLVMConstants : uint32_t {
DWARF_VENDOR_MIPS = 6
};
+/// Constants that define the DWARF format as 32 or 64 bit.
+enum DwarfFormat : uint8_t { DWARF32, DWARF64 };
+
/// Special ID values that distinguish a CIE from a FDE in DWARF CFI.
/// Not inside an enum because a 64-bit value is needed.
/// @{
@@ -125,7 +132,7 @@ enum LocationAtom {
DW_OP_LLVM_fragment = 0x1000 ///< Only used in LLVM metadata.
};
-enum TypeKind {
+enum TypeKind : uint8_t {
#define HANDLE_DW_ATE(ID, NAME, VERSION, VENDOR) DW_ATE_##NAME = ID,
#include "llvm/BinaryFormat/Dwarf.def"
DW_ATE_lo_user = 0x80,
@@ -325,6 +332,13 @@ enum UnitType : unsigned char {
DW_UT_hi_user = 0xff
};
+enum Index {
+#define HANDLE_DW_IDX(ID, NAME) DW_IDX_##NAME = ID,
+#include "llvm/BinaryFormat/Dwarf.def"
+ DW_IDX_lo_user = 0x2000,
+ DW_IDX_hi_user = 0x3fff
+};
+
inline bool isUnitType(uint8_t UnitType) {
switch (UnitType) {
case DW_UT_compile:
@@ -354,13 +368,16 @@ inline bool isUnitType(dwarf::Tag T) {
// Constants for the DWARF v5 Accelerator Table Proposal
enum AcceleratorTable {
// Data layout descriptors.
- DW_ATOM_null = 0u, // Marker as the end of a list of atoms.
+ DW_ATOM_null = 0u, /// Marker as the end of a list of atoms.
DW_ATOM_die_offset = 1u, // DIE offset in the debug_info section.
DW_ATOM_cu_offset = 2u, // Offset of the compile unit header that contains the
// item in question.
DW_ATOM_die_tag = 3u, // A tag entry.
DW_ATOM_type_flags = 4u, // Set of flags for a type.
+ DW_ATOM_type_type_flags = 5u, // Dsymutil type extension.
+ DW_ATOM_qual_name_hash = 6u, // Dsymutil qualified hash extension.
+
// DW_ATOM_type_flags values.
// Always set for C++, only set for ObjC if this is the @implementation for a
@@ -390,8 +407,8 @@ enum GDBIndexEntryLinkage { GIEL_EXTERNAL, GIEL_STATIC };
/// \defgroup DwarfConstantsDumping Dwarf constants dumping functions
///
/// All these functions map their argument's value back to the
-/// corresponding enumerator name or return nullptr if the value isn't
-/// known.
+/// corresponding enumerator name or return an empty StringRef if the value
+/// isn't known.
///
/// @{
StringRef TagString(unsigned Tag);
@@ -410,16 +427,17 @@ StringRef CaseString(unsigned Case);
StringRef ConventionString(unsigned Convention);
StringRef InlineCodeString(unsigned Code);
StringRef ArrayOrderString(unsigned Order);
-StringRef DiscriminantString(unsigned Discriminant);
StringRef LNStandardString(unsigned Standard);
StringRef LNExtendedString(unsigned Encoding);
StringRef MacinfoString(unsigned Encoding);
+StringRef RangeListEncodingString(unsigned Encoding);
StringRef CallFrameString(unsigned Encoding);
StringRef ApplePropertyString(unsigned);
StringRef UnitTypeString(unsigned);
StringRef AtomTypeString(unsigned Atom);
StringRef GDBIndexEntryKindString(GDBIndexEntryKind Kind);
StringRef GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage);
+StringRef IndexString(unsigned Idx);
/// @}
/// \defgroup DwarfConstantsParsing Dwarf constants parsing functions
@@ -471,6 +489,49 @@ unsigned AttributeEncodingVendor(TypeKind E);
unsigned LanguageVendor(SourceLanguage L);
/// @}
+/// A helper struct providing information about the byte size of DW_FORM
+/// values that vary in size depending on the DWARF version, address byte
+/// size, or DWARF32/DWARF64.
+struct FormParams {
+ uint16_t Version;
+ uint8_t AddrSize;
+ DwarfFormat Format;
+
+ /// The definition of the size of form DW_FORM_ref_addr depends on the
+ /// version. In DWARF v2 it's the size of an address; after that, it's the
+ /// size of a reference.
+ uint8_t getRefAddrByteSize() const {
+ if (Version == 2)
+ return AddrSize;
+ return getDwarfOffsetByteSize();
+ }
+
+ /// The size of a reference is determined by the DWARF 32/64-bit format.
+ uint8_t getDwarfOffsetByteSize() const {
+ switch (Format) {
+ case DwarfFormat::DWARF32:
+ return 4;
+ case DwarfFormat::DWARF64:
+ return 8;
+ }
+ llvm_unreachable("Invalid Format value");
+ }
+
+ explicit operator bool() const { return Version && AddrSize; }
+};
+
+/// Get the fixed byte size for a given form.
+///
+/// If the form has a fixed byte size, then an Optional with a value will be
+/// returned. If the form is always encoded using a variable length storage
+/// format (ULEB or SLEB numbers or blocks) then None will be returned.
+///
+/// \param Form DWARF form to get the fixed byte size for.
+/// \param Params DWARF parameters to help interpret forms.
+/// \returns Optional<uint8_t> value with the fixed byte size or None if
+/// \p Form doesn't have a fixed byte size.
+Optional<uint8_t> getFixedFormByteSize(dwarf::Form Form, FormParams Params);
+
/// Tells whether the specified form is defined in the specified version,
/// or is an extension if extensions are allowed.
bool isValidFormForVersion(Form F, unsigned Version, bool ExtensionsOk = true);
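A short sketch of how the new FormParams helper and getFixedFormByteSize() are meant to be used when sizing forms. The version and address-size values below are illustrative, and the function name is made up for the example:

#include "llvm/BinaryFormat/Dwarf.h"
#include <cassert>

void formSizeExample() {
  using namespace llvm::dwarf;

  FormParams ParamsV2{/*Version=*/2, /*AddrSize=*/8, DwarfFormat::DWARF32};
  FormParams ParamsV5{/*Version=*/5, /*AddrSize=*/8, DwarfFormat::DWARF64};

  // DW_FORM_ref_addr is address-sized in DWARF v2 and offset-sized afterwards.
  assert(ParamsV2.getRefAddrByteSize() == 8);
  assert(ParamsV5.getRefAddrByteSize() == 8); // DWARF64 offsets are 8 bytes.
  assert(ParamsV5.getDwarfOffsetByteSize() == 8);

  // Fixed-size forms report a size; ULEB/SLEB-encoded forms return None.
  llvm::Optional<uint8_t> FixedSize = getFixedFormByteSize(DW_FORM_data4, ParamsV5);
  assert(FixedSize && *FixedSize == 4);
  assert(!getFixedFormByteSize(DW_FORM_udata, ParamsV5));
}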
@@ -479,6 +540,10 @@ bool isValidFormForVersion(Form F, unsigned Version, bool ExtensionsOk = true);
/// for attribute Attr.
StringRef AttributeValueString(uint16_t Attr, unsigned Val);
+/// Returns the symbolic string representing Val when used as a value
+/// for atom Atom.
+StringRef AtomValueString(uint16_t Atom, unsigned Val);
+
/// Describes an entry of the various gnu_pub* debug sections.
///
/// The gnu_pub* kind looks like:
@@ -514,14 +579,46 @@ private:
};
};
-/// Constants that define the DWARF format as 32 or 64 bit.
-enum DwarfFormat : uint8_t { DWARF32, DWARF64 };
+template <typename Enum> struct EnumTraits : public std::false_type {};
+
+template <> struct EnumTraits<Attribute> : public std::true_type {
+ static constexpr char Type[3] = "AT";
+ static constexpr StringRef (*StringFn)(unsigned) = &AttributeString;
+};
-/// The Bernstein hash function used by the accelerator tables.
-uint32_t djbHash(StringRef Buffer);
+template <> struct EnumTraits<Form> : public std::true_type {
+ static constexpr char Type[5] = "FORM";
+ static constexpr StringRef (*StringFn)(unsigned) = &FormEncodingString;
+};
+
+template <> struct EnumTraits<Index> : public std::true_type {
+ static constexpr char Type[4] = "IDX";
+ static constexpr StringRef (*StringFn)(unsigned) = &IndexString;
+};
+template <> struct EnumTraits<Tag> : public std::true_type {
+ static constexpr char Type[4] = "TAG";
+ static constexpr StringRef (*StringFn)(unsigned) = &TagString;
+};
} // End of namespace dwarf
+/// Dwarf constants format_provider
+///
+/// Specialization of the format_provider template for dwarf enums. Unlike the
+/// dumping functions above, these format unknown enumerator values as
+/// DW_TYPE_unknown_1234 (e.g. DW_TAG_unknown_ffff).
+template <typename Enum>
+struct format_provider<
+ Enum, typename std::enable_if<dwarf::EnumTraits<Enum>::value>::type> {
+ static void format(const Enum &E, raw_ostream &OS, StringRef Style) {
+ StringRef Str = dwarf::EnumTraits<Enum>::StringFn(E);
+ if (Str.empty()) {
+ OS << "DW_" << dwarf::EnumTraits<Enum>::Type << "_unknown_"
+ << llvm::format("%x", E);
+ } else
+ OS << Str;
+ }
+};
} // End of namespace llvm
#endif
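The EnumTraits/format_provider machinery added above lets DWARF enums be passed straight to formatv(). A minimal usage sketch; the surrounding function is illustrative only:

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"

void printTagAndForm(llvm::dwarf::Tag T, llvm::dwarf::Form F) {
  // Known values print their enumerator name (e.g. DW_TAG_compile_unit);
  // unknown values fall back to DW_TAG_unknown_<hex> / DW_FORM_unknown_<hex>.
  llvm::outs() << llvm::formatv("tag={0} form={1}", T, F) << "\n";
}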
diff --git a/contrib/llvm/include/llvm/BinaryFormat/DynamicTags.def b/contrib/llvm/include/llvm/BinaryFormat/DynamicTags.def
new file mode 100644
index 000000000000..2e15cc30fca7
--- /dev/null
+++ b/contrib/llvm/include/llvm/BinaryFormat/DynamicTags.def
@@ -0,0 +1,216 @@
+#ifndef DYNAMIC_TAG
+#error "DYNAMIC_TAG must be defined"
+#endif
+
+// Add separate macros for the architecture-specific tags and the markers
+// such as DT_HIOS, etc. to allow using this file in other contexts.
+// For example we can use it to generate a stringification switch statement.
+
+#ifndef HEXAGON_DYNAMIC_TAG
+#define HEXAGON_DYNAMIC_TAG(name, value) DYNAMIC_TAG(name, value)
+#define HEXAGON_DYNAMIC_TAG_DEFINED
+#endif
+
+#ifndef MIPS_DYNAMIC_TAG
+#define MIPS_DYNAMIC_TAG(name, value) DYNAMIC_TAG(name, value)
+#define MIPS_DYNAMIC_TAG_DEFINED
+#endif
+
+#ifndef PPC64_DYNAMIC_TAG
+#define PPC64_DYNAMIC_TAG(name, value) DYNAMIC_TAG(name, value)
+#define PPC64_DYNAMIC_TAG_DEFINED
+#endif
+
+#ifndef DYNAMIC_TAG_MARKER
+#define DYNAMIC_TAG_MARKER(name, value) DYNAMIC_TAG(name, value)
+#define DYNAMIC_TAG_MARKER_DEFINED
+#endif
+
+DYNAMIC_TAG(NULL, 0) // Marks end of dynamic array.
+DYNAMIC_TAG(NEEDED, 1) // String table offset of needed library.
+DYNAMIC_TAG(PLTRELSZ, 2) // Size of relocation entries in PLT.
+DYNAMIC_TAG(PLTGOT, 3) // Address associated with linkage table.
+DYNAMIC_TAG(HASH, 4) // Address of symbolic hash table.
+DYNAMIC_TAG(STRTAB, 5) // Address of dynamic string table.
+DYNAMIC_TAG(SYMTAB, 6) // Address of dynamic symbol table.
+DYNAMIC_TAG(RELA, 7) // Address of relocation table (Rela entries).
+DYNAMIC_TAG(RELASZ, 8) // Size of Rela relocation table.
+DYNAMIC_TAG(RELAENT, 9) // Size of a Rela relocation entry.
+DYNAMIC_TAG(STRSZ, 10) // Total size of the string table.
+DYNAMIC_TAG(SYMENT, 11) // Size of a symbol table entry.
+DYNAMIC_TAG(INIT, 12) // Address of initialization function.
+DYNAMIC_TAG(FINI, 13) // Address of termination function.
+DYNAMIC_TAG(SONAME, 14) // String table offset of a shared object's name.
+DYNAMIC_TAG(RPATH, 15) // String table offset of library search path.
+DYNAMIC_TAG(SYMBOLIC, 16) // Changes symbol resolution algorithm.
+DYNAMIC_TAG(REL, 17) // Address of relocation table (Rel entries).
+DYNAMIC_TAG(RELSZ, 18) // Size of Rel relocation table.
+DYNAMIC_TAG(RELENT, 19) // Size of a Rel relocation entry.
+DYNAMIC_TAG(PLTREL, 20) // Type of relocation entry used for linking.
+DYNAMIC_TAG(DEBUG, 21) // Reserved for debugger.
+DYNAMIC_TAG(TEXTREL, 22) // Relocations exist for non-writable segments.
+DYNAMIC_TAG(JMPREL, 23) // Address of relocations associated with PLT.
+DYNAMIC_TAG(BIND_NOW, 24) // Process all relocations before execution.
+DYNAMIC_TAG(INIT_ARRAY, 25) // Pointer to array of initialization functions.
+DYNAMIC_TAG(FINI_ARRAY, 26) // Pointer to array of termination functions.
+DYNAMIC_TAG(INIT_ARRAYSZ, 27) // Size of DT_INIT_ARRAY.
+DYNAMIC_TAG(FINI_ARRAYSZ, 28) // Size of DT_FINI_ARRAY.
+DYNAMIC_TAG(RUNPATH, 29) // String table offset of lib search path.
+DYNAMIC_TAG(FLAGS, 30) // Flags.
+DYNAMIC_TAG_MARKER(ENCODING, 32) // Values from here to DT_LOOS follow the rules
+ // for the interpretation of the d_un union.
+
+DYNAMIC_TAG(PREINIT_ARRAY, 32) // Pointer to array of preinit functions.
+DYNAMIC_TAG(PREINIT_ARRAYSZ, 33) // Size of the DT_PREINIT_ARRAY array.
+
+DYNAMIC_TAG(SYMTAB_SHNDX, 34) // Address of the SHT_SYMTAB_SHNDX section.
+
+// Experimental support for SHT_RELR sections. For details, see proposal
+// at https://groups.google.com/forum/#!topic/generic-abi/bX460iggiKg
+DYNAMIC_TAG(RELRSZ, 35) // Size of Relr relocation table.
+DYNAMIC_TAG(RELR, 36) // Address of relocation table (Relr entries).
+DYNAMIC_TAG(RELRENT, 37) // Size of a Relr relocation entry.
+
+DYNAMIC_TAG_MARKER(LOOS, 0x60000000) // Start of environment specific tags.
+DYNAMIC_TAG_MARKER(HIOS, 0x6FFFFFFF) // End of environment specific tags.
+DYNAMIC_TAG_MARKER(LOPROC, 0x70000000) // Start of processor specific tags.
+DYNAMIC_TAG_MARKER(HIPROC, 0x7FFFFFFF) // End of processor specific tags.
+
+// Android packed relocation section tags.
+// https://android.googlesource.com/platform/bionic/+/6f12bfece5dcc01325e0abba56a46b1bcf991c69/tools/relocation_packer/src/elf_file.cc#31
+DYNAMIC_TAG(ANDROID_REL, 0x6000000F)
+DYNAMIC_TAG(ANDROID_RELSZ, 0x60000010)
+DYNAMIC_TAG(ANDROID_RELA, 0x60000011)
+DYNAMIC_TAG(ANDROID_RELASZ, 0x60000012)
+
+// Android's experimental support for SHT_RELR sections.
+// https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#253
+DYNAMIC_TAG(ANDROID_RELR, 0x6FFFE000) // Address of relocation table (Relr entries).
+DYNAMIC_TAG(ANDROID_RELRSZ, 0x6FFFE001) // Size of Relr relocation table.
+DYNAMIC_TAG(ANDROID_RELRENT, 0x6FFFE003) // Size of a Relr relocation entry.
+
+DYNAMIC_TAG(GNU_HASH, 0x6FFFFEF5) // Reference to the GNU hash table.
+DYNAMIC_TAG(TLSDESC_PLT, 0x6FFFFEF6) // Location of PLT entry for TLS
+ // descriptor resolver calls.
+DYNAMIC_TAG(TLSDESC_GOT, 0x6FFFFEF7) // Location of GOT entry used by TLS
+ // descriptor resolver PLT entry.
+DYNAMIC_TAG(RELACOUNT, 0x6FFFFFF9) // ELF32_Rela count.
+DYNAMIC_TAG(RELCOUNT, 0x6FFFFFFA) // ELF32_Rel count.
+
+DYNAMIC_TAG(FLAGS_1, 0X6FFFFFFB) // Flags_1.
+
+DYNAMIC_TAG(VERSYM, 0x6FFFFFF0) // The address of .gnu.version section.
+DYNAMIC_TAG(VERDEF, 0X6FFFFFFC) // The address of the version definition
+ // table.
+DYNAMIC_TAG(VERDEFNUM, 0X6FFFFFFD) // The number of entries in DT_VERDEF.
+DYNAMIC_TAG(VERNEED, 0X6FFFFFFE) // The address of the version dependency
+ // table.
+DYNAMIC_TAG(VERNEEDNUM, 0X6FFFFFFF) // The number of entries in DT_VERNEED.
+
+// Hexagon specific dynamic table entries
+HEXAGON_DYNAMIC_TAG(HEXAGON_SYMSZ, 0x70000000)
+HEXAGON_DYNAMIC_TAG(HEXAGON_VER, 0x70000001)
+HEXAGON_DYNAMIC_TAG(HEXAGON_PLT, 0x70000002)
+
+// Mips specific dynamic table entry tags.
+
+MIPS_DYNAMIC_TAG(MIPS_RLD_VERSION, 0x70000001) // 32 bit version number for
+ // runtime linker interface.
+MIPS_DYNAMIC_TAG(MIPS_TIME_STAMP, 0x70000002) // Time stamp.
+MIPS_DYNAMIC_TAG(MIPS_ICHECKSUM, 0x70000003) // Checksum of external strings
+ // and common sizes.
+MIPS_DYNAMIC_TAG(MIPS_IVERSION, 0x70000004) // Index of version string
+ // in string table.
+MIPS_DYNAMIC_TAG(MIPS_FLAGS, 0x70000005) // 32 bits of flags.
+MIPS_DYNAMIC_TAG(MIPS_BASE_ADDRESS, 0x70000006) // Base address of the segment.
+MIPS_DYNAMIC_TAG(MIPS_MSYM, 0x70000007) // Address of .msym section.
+MIPS_DYNAMIC_TAG(MIPS_CONFLICT, 0x70000008) // Address of .conflict section.
+MIPS_DYNAMIC_TAG(MIPS_LIBLIST, 0x70000009) // Address of .liblist section.
+MIPS_DYNAMIC_TAG(MIPS_LOCAL_GOTNO, 0x7000000a) // Number of local global offset
+ // table entries.
+MIPS_DYNAMIC_TAG(MIPS_CONFLICTNO, 0x7000000b) // Number of entries
+ // in the .conflict section.
+MIPS_DYNAMIC_TAG(MIPS_LIBLISTNO, 0x70000010) // Number of entries
+ // in the .liblist section.
+MIPS_DYNAMIC_TAG(MIPS_SYMTABNO, 0x70000011) // Number of entries
+ // in the .dynsym section.
+MIPS_DYNAMIC_TAG(MIPS_UNREFEXTNO, 0x70000012) // Index of first external dynamic
+ // symbol not referenced locally.
+MIPS_DYNAMIC_TAG(MIPS_GOTSYM, 0x70000013) // Index of first dynamic symbol
+ // in global offset table.
+MIPS_DYNAMIC_TAG(MIPS_HIPAGENO, 0x70000014) // Number of page table entries
+ // in global offset table.
+MIPS_DYNAMIC_TAG(MIPS_RLD_MAP, 0x70000016) // Address of run time loader map
+ // used for debugging.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_CLASS, 0x70000017) // Delta C++ class definition.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_CLASS_NO, 0x70000018) // Number of entries
+ // in DT_MIPS_DELTA_CLASS.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_INSTANCE, 0x70000019) // Delta C++ class instances.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_INSTANCE_NO, 0x7000001A) // Number of entries
+ // in DT_MIPS_DELTA_INSTANCE.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_RELOC, 0x7000001B) // Delta relocations.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_RELOC_NO, 0x7000001C) // Number of entries
+ // in DT_MIPS_DELTA_RELOC.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_SYM, 0x7000001D) // Delta symbols that Delta
+ // relocations refer to.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_SYM_NO, 0x7000001E) // Number of entries
+ // in DT_MIPS_DELTA_SYM.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_CLASSSYM, 0x70000020) // Delta symbols that hold
+ // class declarations.
+MIPS_DYNAMIC_TAG(MIPS_DELTA_CLASSSYM_NO, 0x70000021) // Number of entries
+ // in DT_MIPS_DELTA_CLASSSYM.
+
+MIPS_DYNAMIC_TAG(MIPS_CXX_FLAGS, 0x70000022) // Flags indicating information
+ // about C++ flavor.
+MIPS_DYNAMIC_TAG(MIPS_PIXIE_INIT, 0x70000023) // Pixie information.
+MIPS_DYNAMIC_TAG(MIPS_SYMBOL_LIB, 0x70000024) // Address of .MIPS.symlib
+MIPS_DYNAMIC_TAG(MIPS_LOCALPAGE_GOTIDX, 0x70000025) // The GOT index of the first PTE
+ // for a segment
+MIPS_DYNAMIC_TAG(MIPS_LOCAL_GOTIDX, 0x70000026) // The GOT index of the first PTE
+ // for a local symbol
+MIPS_DYNAMIC_TAG(MIPS_HIDDEN_GOTIDX, 0x70000027) // The GOT index of the first PTE
+ // for a hidden symbol
+MIPS_DYNAMIC_TAG(MIPS_PROTECTED_GOTIDX, 0x70000028) // The GOT index of the first PTE
+ // for a protected symbol
+MIPS_DYNAMIC_TAG(MIPS_OPTIONS, 0x70000029) // Address of `.MIPS.options'.
+MIPS_DYNAMIC_TAG(MIPS_INTERFACE, 0x7000002A) // Address of `.interface'.
+MIPS_DYNAMIC_TAG(MIPS_DYNSTR_ALIGN, 0x7000002B) // Unknown.
+MIPS_DYNAMIC_TAG(MIPS_INTERFACE_SIZE, 0x7000002C) // Size of the .interface section.
+MIPS_DYNAMIC_TAG(MIPS_RLD_TEXT_RESOLVE_ADDR, 0x7000002D) // Size of rld_text_resolve
+ // function stored in the GOT.
+MIPS_DYNAMIC_TAG(MIPS_PERF_SUFFIX, 0x7000002E) // Default suffix of DSO to be added
+ // by rld on dlopen() calls.
+MIPS_DYNAMIC_TAG(MIPS_COMPACT_SIZE, 0x7000002F) // Size of compact relocation
+ // section (O32).
+MIPS_DYNAMIC_TAG(MIPS_GP_VALUE, 0x70000030) // GP value for auxiliary GOTs.
+MIPS_DYNAMIC_TAG(MIPS_AUX_DYNAMIC, 0x70000031) // Address of auxiliary .dynamic.
+MIPS_DYNAMIC_TAG(MIPS_PLTGOT, 0x70000032) // Address of the base of the PLTGOT.
+MIPS_DYNAMIC_TAG(MIPS_RWPLT, 0x70000034) // Points to the base
+ // of a writable PLT.
+MIPS_DYNAMIC_TAG(MIPS_RLD_MAP_REL, 0x70000035) // Relative offset of run time loader
+ // map, used for debugging.
+
+// PPC64 specific dynamic table entries.
+PPC64_DYNAMIC_TAG(PPC64_GLINK, 0x70000000) // Address of 32 bytes before the
+ // first glink lazy resolver stub.
+
+// Sun machine-independent extensions.
+DYNAMIC_TAG(AUXILIARY, 0x7FFFFFFD) // Shared object to load before self
+DYNAMIC_TAG(FILTER, 0x7FFFFFFF) // Shared object to get values from
+
+
+#ifdef DYNAMIC_TAG_MARKER_DEFINED
+#undef DYNAMIC_TAG_MARKER
+#undef DYNAMIC_TAG_MARKER_DEFINED
+#endif
+#ifdef MIPS_DYNAMIC_TAG_DEFINED
+#undef MIPS_DYNAMIC_TAG
+#undef MIPS_DYNAMIC_TAG_DEFINED
+#endif
+#ifdef HEXAGON_DYNAMIC_TAG_DEFINED
+#undef HEXAGON_DYNAMIC_TAG
+#undef HEXAGON_DYNAMIC_TAG_DEFINED
+#endif
+#ifdef PPC64_DYNAMIC_TAG_DEFINED
+#undef PPC64_DYNAMIC_TAG
+#undef PPC64_DYNAMIC_TAG_DEFINED
+#endif
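The header comment in the new file suggests using it to generate a stringification switch. A rough sketch of that use is below; markers are skipped and only the MIPS family is expanded, because the marker and per-architecture values overlap numerically, and the dynamicTagName helper is made up for the example:

#include <cstdint>
#include <string>

std::string dynamicTagName(uint64_t Tag) {
  switch (Tag) {
#define DYNAMIC_TAG(name, value)                                               \
  case value:                                                                  \
    return "DT_" #name;
#define DYNAMIC_TAG_MARKER(name, value) // markers alias real tag values
#define HEXAGON_DYNAMIC_TAG(name, value)
#define PPC64_DYNAMIC_TAG(name, value)
#define MIPS_DYNAMIC_TAG(name, value) DYNAMIC_TAG(name, value)
#include "llvm/BinaryFormat/DynamicTags.def"
#undef DYNAMIC_TAG
#undef DYNAMIC_TAG_MARKER
#undef HEXAGON_DYNAMIC_TAG
#undef PPC64_DYNAMIC_TAG
#undef MIPS_DYNAMIC_TAG
  default:
    return "DT_unknown";
  }
}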
diff --git a/contrib/llvm/include/llvm/BinaryFormat/ELF.h b/contrib/llvm/include/llvm/BinaryFormat/ELF.h
index c902972d93bd..0f3f1939ce68 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/contrib/llvm/include/llvm/BinaryFormat/ELF.h
@@ -312,11 +312,6 @@ enum {
EM_RISCV = 243, // RISC-V
EM_LANAI = 244, // Lanai 32-bit processor
EM_BPF = 247, // Linux kernel bpf virtual machine
-
- // A request has been made to the maintainer of the official registry for
- // such numbers for an official value for WebAssembly. As soon as one is
- // allocated, this enum will be updated to use it.
- EM_WEBASSEMBLY = 0x4157, // WebAssembly architecture
};
// Object file classes.
@@ -644,18 +639,78 @@ enum {
#include "ELFRelocs/Sparc.def"
};
-// ELF Relocation types for WebAssembly
-enum {
-#include "ELFRelocs/WebAssembly.def"
-};
-
// AMDGPU specific e_flags.
enum : unsigned {
- // AMDGPU machine architectures.
- EF_AMDGPU_ARCH_NONE = 0x00000000, // None/unknown.
- EF_AMDGPU_ARCH_R600 = 0x00000001, // AMD HD2XXX-HD6XXX GPUs.
- EF_AMDGPU_ARCH_GCN = 0x00000002, // AMD GCN GFX6+ GPUs.
- EF_AMDGPU_ARCH = 0x0000000f // EF_AMDGPU_ARCH_XXX selection mask.
+ // Processor selection mask for EF_AMDGPU_MACH_* values.
+ EF_AMDGPU_MACH = 0x0ff,
+
+ // Not specified processor.
+ EF_AMDGPU_MACH_NONE = 0x000,
+
+ // R600-based processors.
+
+ // Radeon HD 2000/3000 Series (R600).
+ EF_AMDGPU_MACH_R600_R600 = 0x001,
+ EF_AMDGPU_MACH_R600_R630 = 0x002,
+ EF_AMDGPU_MACH_R600_RS880 = 0x003,
+ EF_AMDGPU_MACH_R600_RV670 = 0x004,
+ // Radeon HD 4000 Series (R700).
+ EF_AMDGPU_MACH_R600_RV710 = 0x005,
+ EF_AMDGPU_MACH_R600_RV730 = 0x006,
+ EF_AMDGPU_MACH_R600_RV770 = 0x007,
+ // Radeon HD 5000 Series (Evergreen).
+ EF_AMDGPU_MACH_R600_CEDAR = 0x008,
+ EF_AMDGPU_MACH_R600_CYPRESS = 0x009,
+ EF_AMDGPU_MACH_R600_JUNIPER = 0x00a,
+ EF_AMDGPU_MACH_R600_REDWOOD = 0x00b,
+ EF_AMDGPU_MACH_R600_SUMO = 0x00c,
+ // Radeon HD 6000 Series (Northern Islands).
+ EF_AMDGPU_MACH_R600_BARTS = 0x00d,
+ EF_AMDGPU_MACH_R600_CAICOS = 0x00e,
+ EF_AMDGPU_MACH_R600_CAYMAN = 0x00f,
+ EF_AMDGPU_MACH_R600_TURKS = 0x010,
+
+ // Reserved for R600-based processors.
+ EF_AMDGPU_MACH_R600_RESERVED_FIRST = 0x011,
+ EF_AMDGPU_MACH_R600_RESERVED_LAST = 0x01f,
+
+ // First/last R600-based processors.
+ EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600,
+ EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS,
+
+ // AMDGCN-based processors.
+
+ // AMDGCN GFX6.
+ EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
+ EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
+ // AMDGCN GFX7.
+ EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
+ EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
+ EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
+ EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
+ EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
+ // AMDGCN GFX8.
+ EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
+ EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
+ EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
+ EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
+ // AMDGCN GFX9.
+ EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
+ EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
+ EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
+ EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+
+ // Reserved for AMDGCN-based processors.
+ EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
+ EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x030,
+
+ // First/last AMDGCN-based processors.
+ EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
+ EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX906,
+
+ // Indicates if the xnack target feature is enabled for all code contained in
+ // the object.
+ EF_AMDGPU_XNACK = 0x100,
};
// ELF Relocation types for AMDGPU
@@ -714,36 +769,46 @@ enum {
// Section types.
enum : unsigned {
- SHT_NULL = 0, // No associated section (inactive entry).
- SHT_PROGBITS = 1, // Program-defined contents.
- SHT_SYMTAB = 2, // Symbol table.
- SHT_STRTAB = 3, // String table.
- SHT_RELA = 4, // Relocation entries; explicit addends.
- SHT_HASH = 5, // Symbol hash table.
- SHT_DYNAMIC = 6, // Information for dynamic linking.
- SHT_NOTE = 7, // Information about the file.
- SHT_NOBITS = 8, // Data occupies no space in the file.
- SHT_REL = 9, // Relocation entries; no explicit addends.
- SHT_SHLIB = 10, // Reserved.
- SHT_DYNSYM = 11, // Symbol table.
- SHT_INIT_ARRAY = 14, // Pointers to initialization functions.
- SHT_FINI_ARRAY = 15, // Pointers to termination functions.
- SHT_PREINIT_ARRAY = 16, // Pointers to pre-init functions.
- SHT_GROUP = 17, // Section group.
- SHT_SYMTAB_SHNDX = 18, // Indices for SHN_XINDEX entries.
- SHT_LOOS = 0x60000000, // Lowest operating system-specific type.
+ SHT_NULL = 0, // No associated section (inactive entry).
+ SHT_PROGBITS = 1, // Program-defined contents.
+ SHT_SYMTAB = 2, // Symbol table.
+ SHT_STRTAB = 3, // String table.
+ SHT_RELA = 4, // Relocation entries; explicit addends.
+ SHT_HASH = 5, // Symbol hash table.
+ SHT_DYNAMIC = 6, // Information for dynamic linking.
+ SHT_NOTE = 7, // Information about the file.
+ SHT_NOBITS = 8, // Data occupies no space in the file.
+ SHT_REL = 9, // Relocation entries; no explicit addends.
+ SHT_SHLIB = 10, // Reserved.
+ SHT_DYNSYM = 11, // Symbol table.
+ SHT_INIT_ARRAY = 14, // Pointers to initialization functions.
+ SHT_FINI_ARRAY = 15, // Pointers to termination functions.
+ SHT_PREINIT_ARRAY = 16, // Pointers to pre-init functions.
+ SHT_GROUP = 17, // Section group.
+ SHT_SYMTAB_SHNDX = 18, // Indices for SHN_XINDEX entries.
+ // Experimental support for SHT_RELR sections. For details, see proposal
+ // at https://groups.google.com/forum/#!topic/generic-abi/bX460iggiKg
+ SHT_RELR = 19, // Relocation entries; only offsets.
+ SHT_LOOS = 0x60000000, // Lowest operating system-specific type.
// Android packed relocation section types.
// https://android.googlesource.com/platform/bionic/+/6f12bfece5dcc01325e0abba56a46b1bcf991c69/tools/relocation_packer/src/elf_file.cc#37
SHT_ANDROID_REL = 0x60000001,
SHT_ANDROID_RELA = 0x60000002,
- SHT_LLVM_ODRTAB = 0x6fff4c00, // LLVM ODR table.
- SHT_GNU_ATTRIBUTES = 0x6ffffff5, // Object attributes.
- SHT_GNU_HASH = 0x6ffffff6, // GNU-style hash table.
- SHT_GNU_verdef = 0x6ffffffd, // GNU version definitions.
- SHT_GNU_verneed = 0x6ffffffe, // GNU version references.
- SHT_GNU_versym = 0x6fffffff, // GNU symbol versions table.
- SHT_HIOS = 0x6fffffff, // Highest operating system-specific type.
- SHT_LOPROC = 0x70000000, // Lowest processor arch-specific type.
+ SHT_LLVM_ODRTAB = 0x6fff4c00, // LLVM ODR table.
+ SHT_LLVM_LINKER_OPTIONS = 0x6fff4c01, // LLVM Linker Options.
+ SHT_LLVM_CALL_GRAPH_PROFILE = 0x6fff4c02, // LLVM Call Graph Profile.
+ SHT_LLVM_ADDRSIG = 0x6fff4c03, // List of address-significant symbols
+ // for safe ICF.
+ // Android's experimental support for SHT_RELR sections.
+ // https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#512
+ SHT_ANDROID_RELR = 0x6fffff00, // Relocation entries; only offsets.
+ SHT_GNU_ATTRIBUTES = 0x6ffffff5, // Object attributes.
+ SHT_GNU_HASH = 0x6ffffff6, // GNU-style hash table.
+ SHT_GNU_verdef = 0x6ffffffd, // GNU version definitions.
+ SHT_GNU_verneed = 0x6ffffffe, // GNU version references.
+ SHT_GNU_versym = 0x6fffffff, // GNU symbol versions table.
+ SHT_HIOS = 0x6fffffff, // Highest operating system-specific type.
+ SHT_LOPROC = 0x70000000, // Lowest processor arch-specific type.
// Fixme: All this is duplicated in MCSectionELF. Why??
// Exception Index table
SHT_ARM_EXIDX = 0x70000001U,
@@ -753,18 +818,18 @@ enum : unsigned {
SHT_ARM_ATTRIBUTES = 0x70000003U,
SHT_ARM_DEBUGOVERLAY = 0x70000004U,
SHT_ARM_OVERLAYSECTION = 0x70000005U,
- SHT_HEX_ORDERED = 0x70000000, // Link editor is to sort the entries in
- // this section based on their sizes
- SHT_X86_64_UNWIND = 0x70000001, // Unwind information
+ SHT_HEX_ORDERED = 0x70000000, // Link editor is to sort the entries in
+ // this section based on their sizes
+ SHT_X86_64_UNWIND = 0x70000001, // Unwind information
- SHT_MIPS_REGINFO = 0x70000006, // Register usage information
- SHT_MIPS_OPTIONS = 0x7000000d, // General options
- SHT_MIPS_DWARF = 0x7000001e, // DWARF debugging section.
- SHT_MIPS_ABIFLAGS = 0x7000002a, // ABI information.
+ SHT_MIPS_REGINFO = 0x70000006, // Register usage information
+ SHT_MIPS_OPTIONS = 0x7000000d, // General options
+ SHT_MIPS_DWARF = 0x7000001e, // DWARF debugging section.
+ SHT_MIPS_ABIFLAGS = 0x7000002a, // ABI information.
- SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type.
- SHT_LOUSER = 0x80000000, // Lowest type reserved for applications.
- SHT_HIUSER = 0xffffffff // Highest type reserved for applications.
+ SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type.
+ SHT_LOUSER = 0x80000000, // Lowest type reserved for applications.
+ SHT_HIUSER = 0xffffffff // Highest type reserved for applications.
};
// Section flags.
@@ -1000,6 +1065,9 @@ struct Elf32_Rela {
}
};
+// Relocation entry without explicit addend or info (relative relocations only).
+typedef Elf32_Word Elf32_Relr; // offset/bitmap for relative relocations
+
// Relocation entry, without explicit addend.
struct Elf64_Rel {
Elf64_Addr r_offset; // Location (file byte offset, or program virtual addr).
@@ -1033,6 +1101,9 @@ struct Elf64_Rela {
}
};
+// Relocation entry without explicit addend or info (relative relocations only).
+typedef Elf64_Xword Elf64_Relr; // offset/bitmap for relative relocations
+
// Program header for ELF32.
struct Elf32_Phdr {
Elf32_Word p_type; // Type of segment
@@ -1096,9 +1167,6 @@ enum {
PT_MIPS_RTPROC = 0x70000001, // Runtime procedure table.
PT_MIPS_OPTIONS = 0x70000002, // Options segment.
PT_MIPS_ABIFLAGS = 0x70000003, // Abiflags segment.
-
- // WebAssembly program header types.
- PT_WEBASSEMBLY_FUNCTIONS = PT_LOPROC + 0, // Function definitions.
};
// Segment flag bits.
@@ -1130,154 +1198,9 @@ struct Elf64_Dyn {
// Dynamic table entry tags.
enum {
- DT_NULL = 0, // Marks end of dynamic array.
- DT_NEEDED = 1, // String table offset of needed library.
- DT_PLTRELSZ = 2, // Size of relocation entries in PLT.
- DT_PLTGOT = 3, // Address associated with linkage table.
- DT_HASH = 4, // Address of symbolic hash table.
- DT_STRTAB = 5, // Address of dynamic string table.
- DT_SYMTAB = 6, // Address of dynamic symbol table.
- DT_RELA = 7, // Address of relocation table (Rela entries).
- DT_RELASZ = 8, // Size of Rela relocation table.
- DT_RELAENT = 9, // Size of a Rela relocation entry.
- DT_STRSZ = 10, // Total size of the string table.
- DT_SYMENT = 11, // Size of a symbol table entry.
- DT_INIT = 12, // Address of initialization function.
- DT_FINI = 13, // Address of termination function.
- DT_SONAME = 14, // String table offset of a shared objects name.
- DT_RPATH = 15, // String table offset of library search path.
- DT_SYMBOLIC = 16, // Changes symbol resolution algorithm.
- DT_REL = 17, // Address of relocation table (Rel entries).
- DT_RELSZ = 18, // Size of Rel relocation table.
- DT_RELENT = 19, // Size of a Rel relocation entry.
- DT_PLTREL = 20, // Type of relocation entry used for linking.
- DT_DEBUG = 21, // Reserved for debugger.
- DT_TEXTREL = 22, // Relocations exist for non-writable segments.
- DT_JMPREL = 23, // Address of relocations associated with PLT.
- DT_BIND_NOW = 24, // Process all relocations before execution.
- DT_INIT_ARRAY = 25, // Pointer to array of initialization functions.
- DT_FINI_ARRAY = 26, // Pointer to array of termination functions.
- DT_INIT_ARRAYSZ = 27, // Size of DT_INIT_ARRAY.
- DT_FINI_ARRAYSZ = 28, // Size of DT_FINI_ARRAY.
- DT_RUNPATH = 29, // String table offset of lib search path.
- DT_FLAGS = 30, // Flags.
- DT_ENCODING = 32, // Values from here to DT_LOOS follow the rules
- // for the interpretation of the d_un union.
-
- DT_PREINIT_ARRAY = 32, // Pointer to array of preinit functions.
- DT_PREINIT_ARRAYSZ = 33, // Size of the DT_PREINIT_ARRAY array.
-
- DT_LOOS = 0x60000000, // Start of environment specific tags.
- DT_HIOS = 0x6FFFFFFF, // End of environment specific tags.
- DT_LOPROC = 0x70000000, // Start of processor specific tags.
- DT_HIPROC = 0x7FFFFFFF, // End of processor specific tags.
-
- // Android packed relocation section tags.
- // https://android.googlesource.com/platform/bionic/+/6f12bfece5dcc01325e0abba56a46b1bcf991c69/tools/relocation_packer/src/elf_file.cc#31
- DT_ANDROID_REL = 0x6000000F,
- DT_ANDROID_RELSZ = 0x60000010,
- DT_ANDROID_RELA = 0x60000011,
- DT_ANDROID_RELASZ = 0x60000012,
-
- DT_GNU_HASH = 0x6FFFFEF5, // Reference to the GNU hash table.
- DT_TLSDESC_PLT =
- 0x6FFFFEF6, // Location of PLT entry for TLS descriptor resolver calls.
- DT_TLSDESC_GOT = 0x6FFFFEF7, // Location of GOT entry used by TLS descriptor
- // resolver PLT entry.
- DT_RELACOUNT = 0x6FFFFFF9, // ELF32_Rela count.
- DT_RELCOUNT = 0x6FFFFFFA, // ELF32_Rel count.
-
- DT_FLAGS_1 = 0X6FFFFFFB, // Flags_1.
- DT_VERSYM = 0x6FFFFFF0, // The address of .gnu.version section.
- DT_VERDEF = 0X6FFFFFFC, // The address of the version definition table.
- DT_VERDEFNUM = 0X6FFFFFFD, // The number of entries in DT_VERDEF.
- DT_VERNEED = 0X6FFFFFFE, // The address of the version Dependency table.
- DT_VERNEEDNUM = 0X6FFFFFFF, // The number of entries in DT_VERNEED.
-
- // Hexagon specific dynamic table entries
- DT_HEXAGON_SYMSZ = 0x70000000,
- DT_HEXAGON_VER = 0x70000001,
- DT_HEXAGON_PLT = 0x70000002,
-
- // Mips specific dynamic table entry tags.
- DT_MIPS_RLD_VERSION = 0x70000001, // 32 bit version number for runtime
- // linker interface.
- DT_MIPS_TIME_STAMP = 0x70000002, // Time stamp.
- DT_MIPS_ICHECKSUM = 0x70000003, // Checksum of external strings
- // and common sizes.
- DT_MIPS_IVERSION = 0x70000004, // Index of version string
- // in string table.
- DT_MIPS_FLAGS = 0x70000005, // 32 bits of flags.
- DT_MIPS_BASE_ADDRESS = 0x70000006, // Base address of the segment.
- DT_MIPS_MSYM = 0x70000007, // Address of .msym section.
- DT_MIPS_CONFLICT = 0x70000008, // Address of .conflict section.
- DT_MIPS_LIBLIST = 0x70000009, // Address of .liblist section.
- DT_MIPS_LOCAL_GOTNO = 0x7000000a, // Number of local global offset
- // table entries.
- DT_MIPS_CONFLICTNO = 0x7000000b, // Number of entries
- // in the .conflict section.
- DT_MIPS_LIBLISTNO = 0x70000010, // Number of entries
- // in the .liblist section.
- DT_MIPS_SYMTABNO = 0x70000011, // Number of entries
- // in the .dynsym section.
- DT_MIPS_UNREFEXTNO = 0x70000012, // Index of first external dynamic symbol
- // not referenced locally.
- DT_MIPS_GOTSYM = 0x70000013, // Index of first dynamic symbol
- // in global offset table.
- DT_MIPS_HIPAGENO = 0x70000014, // Number of page table entries
- // in global offset table.
- DT_MIPS_RLD_MAP = 0x70000016, // Address of run time loader map,
- // used for debugging.
- DT_MIPS_DELTA_CLASS = 0x70000017, // Delta C++ class definition.
- DT_MIPS_DELTA_CLASS_NO = 0x70000018, // Number of entries
- // in DT_MIPS_DELTA_CLASS.
- DT_MIPS_DELTA_INSTANCE = 0x70000019, // Delta C++ class instances.
- DT_MIPS_DELTA_INSTANCE_NO = 0x7000001A, // Number of entries
- // in DT_MIPS_DELTA_INSTANCE.
- DT_MIPS_DELTA_RELOC = 0x7000001B, // Delta relocations.
- DT_MIPS_DELTA_RELOC_NO = 0x7000001C, // Number of entries
- // in DT_MIPS_DELTA_RELOC.
- DT_MIPS_DELTA_SYM = 0x7000001D, // Delta symbols that Delta
- // relocations refer to.
- DT_MIPS_DELTA_SYM_NO = 0x7000001E, // Number of entries
- // in DT_MIPS_DELTA_SYM.
- DT_MIPS_DELTA_CLASSSYM = 0x70000020, // Delta symbols that hold
- // class declarations.
- DT_MIPS_DELTA_CLASSSYM_NO = 0x70000021, // Number of entries
- // in DT_MIPS_DELTA_CLASSSYM.
- DT_MIPS_CXX_FLAGS = 0x70000022, // Flags indicating information
- // about C++ flavor.
- DT_MIPS_PIXIE_INIT = 0x70000023, // Pixie information.
- DT_MIPS_SYMBOL_LIB = 0x70000024, // Address of .MIPS.symlib
- DT_MIPS_LOCALPAGE_GOTIDX = 0x70000025, // The GOT index of the first PTE
- // for a segment
- DT_MIPS_LOCAL_GOTIDX = 0x70000026, // The GOT index of the first PTE
- // for a local symbol
- DT_MIPS_HIDDEN_GOTIDX = 0x70000027, // The GOT index of the first PTE
- // for a hidden symbol
- DT_MIPS_PROTECTED_GOTIDX = 0x70000028, // The GOT index of the first PTE
- // for a protected symbol
- DT_MIPS_OPTIONS = 0x70000029, // Address of `.MIPS.options'.
- DT_MIPS_INTERFACE = 0x7000002A, // Address of `.interface'.
- DT_MIPS_DYNSTR_ALIGN = 0x7000002B, // Unknown.
- DT_MIPS_INTERFACE_SIZE = 0x7000002C, // Size of the .interface section.
- DT_MIPS_RLD_TEXT_RESOLVE_ADDR = 0x7000002D, // Size of rld_text_resolve
- // function stored in the GOT.
- DT_MIPS_PERF_SUFFIX = 0x7000002E, // Default suffix of DSO to be added
- // by rld on dlopen() calls.
- DT_MIPS_COMPACT_SIZE = 0x7000002F, // Size of compact relocation
- // section (O32).
- DT_MIPS_GP_VALUE = 0x70000030, // GP value for auxiliary GOTs.
- DT_MIPS_AUX_DYNAMIC = 0x70000031, // Address of auxiliary .dynamic.
- DT_MIPS_PLTGOT = 0x70000032, // Address of the base of the PLTGOT.
- DT_MIPS_RWPLT = 0x70000034, // Points to the base
- // of a writable PLT.
- DT_MIPS_RLD_MAP_REL = 0x70000035, // Relative offset of run time loader
- // map, used for debugging.
-
- // Sun machine-independent extensions.
- DT_AUXILIARY = 0x7FFFFFFD, // Shared object to load before self
- DT_FILTER = 0x7FFFFFFF // Shared object to get values from
+#define DYNAMIC_TAG(name, value) DT_##name = value,
+#include "DynamicTags.def"
+#undef DYNAMIC_TAG
};
// DT_FLAGS values.
@@ -1380,6 +1303,20 @@ enum {
NT_GNU_HWCAP = 2,
NT_GNU_BUILD_ID = 3,
NT_GNU_GOLD_VERSION = 4,
+ NT_GNU_PROPERTY_TYPE_0 = 5,
+};
+
+// Property types used in GNU_PROPERTY_TYPE_0 notes.
+enum : unsigned {
+ GNU_PROPERTY_STACK_SIZE = 1,
+ GNU_PROPERTY_NO_COPY_ON_PROTECTED = 2,
+ GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002
+};
+
+// CET properties
+enum {
+ GNU_PROPERTY_X86_FEATURE_1_IBT = 1 << 0,
+ GNU_PROPERTY_X86_FEATURE_1_SHSTK = 1 << 1
};
// AMDGPU specific notes.
@@ -1423,6 +1360,20 @@ struct Elf64_Chdr {
Elf64_Xword ch_addralign;
};
+// Note header for ELF32.
+struct Elf32_Nhdr {
+ Elf32_Word n_namesz;
+ Elf32_Word n_descsz;
+ Elf32_Word n_type;
+};
+
+// Note header for ELF64.
+struct Elf64_Nhdr {
+ Elf64_Word n_namesz;
+ Elf64_Word n_descsz;
+ Elf64_Word n_type;
+};
+
// Legal values for ch_type field of compressed section header.
enum {
ELFCOMPRESS_ZLIB = 1, // ZLIB/DEFLATE algorithm.
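The new SHT_RELR/DT_RELR machinery stores relative relocations as a compressed list of Elf_Relr words (see the Elf32_Relr/Elf64_Relr typedefs above). A decoding sketch based on the encoding described in the generic-abi proposal linked in the comments, written for 64-bit words and purely for illustration:

#include <cstdint>
#include <vector>

std::vector<uint64_t> decodeRelrs(const std::vector<uint64_t> &Relrs) {
  std::vector<uint64_t> Offsets;
  uint64_t Base = 0;
  for (uint64_t Entry : Relrs) {
    if ((Entry & 1) == 0) {
      // Even entry: a plain offset that takes a relative relocation.
      Offsets.push_back(Entry);
      Base = Entry + sizeof(uint64_t);
    } else {
      // Odd entry: a bitmap; bit i of (Entry >> 1) marks Base + i * wordsize.
      uint64_t Offset = Base;
      for (uint64_t Bits = Entry >> 1; Bits != 0; Bits >>= 1) {
        if (Bits & 1)
          Offsets.push_back(Offset);
        Offset += sizeof(uint64_t);
      }
      // Each bitmap entry covers 63 word-sized slots.
      Base += 63 * sizeof(uint64_t);
    }
  }
  return Offsets;
}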
diff --git a/contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def b/contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def
index 3a47c5a07574..8c5b482f0511 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def
+++ b/contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def
@@ -89,6 +89,13 @@
#undef R_PPC64_DTPREL16_HIGHESTA
#undef R_PPC64_TLSGD
#undef R_PPC64_TLSLD
+#undef R_PPC64_ADDR16_HIGH
+#undef R_PPC64_ADDR16_HIGHA
+#undef R_PPC64_TPREL16_HIGH
+#undef R_PPC64_TPREL16_HIGHA
+#undef R_PPC64_DTPREL16_HIGH
+#undef R_PPC64_DTPREL16_HIGHA
+#undef R_PPC64_IRELATIVE
#undef R_PPC64_REL16
#undef R_PPC64_REL16_LO
#undef R_PPC64_REL16_HI
@@ -175,6 +182,13 @@ ELF_RELOC(R_PPC64_DTPREL16_HIGHEST, 105)
ELF_RELOC(R_PPC64_DTPREL16_HIGHESTA, 106)
ELF_RELOC(R_PPC64_TLSGD, 107)
ELF_RELOC(R_PPC64_TLSLD, 108)
+ELF_RELOC(R_PPC64_ADDR16_HIGH, 110)
+ELF_RELOC(R_PPC64_ADDR16_HIGHA, 111)
+ELF_RELOC(R_PPC64_TPREL16_HIGH, 112)
+ELF_RELOC(R_PPC64_TPREL16_HIGHA, 113)
+ELF_RELOC(R_PPC64_DTPREL16_HIGH, 114)
+ELF_RELOC(R_PPC64_DTPREL16_HIGHA, 115)
+ELF_RELOC(R_PPC64_IRELATIVE, 248)
ELF_RELOC(R_PPC64_REL16, 249)
ELF_RELOC(R_PPC64_REL16_LO, 250)
ELF_RELOC(R_PPC64_REL16_HI, 251)
diff --git a/contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/WebAssembly.def b/contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/WebAssembly.def
deleted file mode 100644
index 9a34349efb96..000000000000
--- a/contrib/llvm/include/llvm/BinaryFormat/ELFRelocs/WebAssembly.def
+++ /dev/null
@@ -1,8 +0,0 @@
-
-#ifndef ELF_RELOC
-#error "ELF_RELOC must be defined"
-#endif
-
-ELF_RELOC(R_WEBASSEMBLY_NONE, 0)
-ELF_RELOC(R_WEBASSEMBLY_DATA, 1)
-ELF_RELOC(R_WEBASSEMBLY_FUNCTION, 2)
diff --git a/contrib/llvm/include/llvm/BinaryFormat/MachO.h b/contrib/llvm/include/llvm/BinaryFormat/MachO.h
index 060fbe162ad2..c5294c76ebf7 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/MachO.h
+++ b/contrib/llvm/include/llvm/BinaryFormat/MachO.h
@@ -1973,9 +1973,11 @@ const uint32_t PPC_THREAD_STATE_COUNT =
// Define a union of all load command structs
#define LOAD_COMMAND_STRUCT(LCStruct) LCStruct LCStruct##_data;
-union macho_load_command {
+LLVM_PACKED_START
+union alignas(4) macho_load_command {
#include "llvm/BinaryFormat/MachO.def"
};
+LLVM_PACKED_END
} // end namespace MachO
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/BinaryFormat/Magic.h b/contrib/llvm/include/llvm/BinaryFormat/Magic.h
index c0e23db5e1ae..04801f810be3 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/Magic.h
+++ b/contrib/llvm/include/llvm/BinaryFormat/Magic.h
@@ -45,7 +45,8 @@ struct file_magic {
coff_import_library, ///< COFF import library
pecoff_executable, ///< PECOFF executable file
windows_resource, ///< Windows compiled resource file (.res)
- wasm_object ///< WebAssembly Object file
+ wasm_object, ///< WebAssembly Object file
+ pdb, ///< Windows PDB debug info file
};
bool is_object() const { return V != unknown; }
@@ -58,10 +59,10 @@ private:
Impl V = unknown;
};
-/// @brief Identify the type of a binary file based on how magical it is.
+/// Identify the type of a binary file based on how magical it is.
file_magic identify_magic(StringRef magic);
-/// @brief Get and identify \a path's type based on its content.
+/// Get and identify \a path's type based on its content.
///
/// @param path Input path.
/// @param result Set to the type of file, or file_magic::unknown.
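A small sketch of dispatching on identify_magic(), including the newly added pdb kind; the helper function is illustrative only:

#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Support/MemoryBuffer.h"

bool looksLikePDB(const llvm::MemoryBuffer &Buffer) {
  // identify_magic() sniffs the leading bytes and returns a file_magic value.
  llvm::file_magic Magic = llvm::identify_magic(Buffer.getBuffer());
  return Magic == llvm::file_magic::pdb;
}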
diff --git a/contrib/llvm/include/llvm/BinaryFormat/Wasm.h b/contrib/llvm/include/llvm/BinaryFormat/Wasm.h
index 57a0b441821b..fa5448dacec4 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/Wasm.h
+++ b/contrib/llvm/include/llvm/BinaryFormat/Wasm.h
@@ -24,6 +24,8 @@ namespace wasm {
const char WasmMagic[] = {'\0', 'a', 's', 'm'};
// Wasm binary format version
const uint32_t WasmVersion = 0x1;
+// Wasm linking metadata version
+const uint32_t WasmMetadataVersion = 0x1;
// Wasm uses a 64k page size
const uint32_t WasmPageSize = 65536;
@@ -33,24 +35,24 @@ struct WasmObjectHeader {
};
struct WasmSignature {
- std::vector<int32_t> ParamTypes;
- int32_t ReturnType;
+ std::vector<uint8_t> ParamTypes;
+ uint8_t ReturnType;
};
struct WasmExport {
StringRef Name;
- uint32_t Kind;
+ uint8_t Kind;
uint32_t Index;
};
struct WasmLimits {
- uint32_t Flags;
+ uint8_t Flags;
uint32_t Initial;
uint32_t Maximum;
};
struct WasmTable {
- int32_t ElemType;
+ uint8_t ElemType;
WasmLimits Limits;
};
@@ -65,43 +67,55 @@ struct WasmInitExpr {
} Value;
};
-struct WasmGlobal {
- int32_t Type;
+struct WasmGlobalType {
+ uint8_t Type;
bool Mutable;
+};
+
+struct WasmGlobal {
+ uint32_t Index;
+ WasmGlobalType Type;
WasmInitExpr InitExpr;
+ StringRef SymbolName; // from the "linking" section
};
struct WasmImport {
StringRef Module;
StringRef Field;
- uint32_t Kind;
+ uint8_t Kind;
union {
uint32_t SigIndex;
- WasmGlobal Global;
+ WasmGlobalType Global;
WasmTable Table;
WasmLimits Memory;
};
};
struct WasmLocalDecl {
- int32_t Type;
+ uint8_t Type;
uint32_t Count;
};
struct WasmFunction {
+ uint32_t Index;
std::vector<WasmLocalDecl> Locals;
ArrayRef<uint8_t> Body;
uint32_t CodeSectionOffset;
uint32_t Size;
+ uint32_t CodeOffset; // start of Locals and Body
+ StringRef SymbolName; // from the "linking" section
+ StringRef DebugName; // from the "name" section
+ uint32_t Comdat; // from the "comdat info" section
};
struct WasmDataSegment {
uint32_t MemoryIndex;
WasmInitExpr Offset;
ArrayRef<uint8_t> Content;
- StringRef Name;
+ StringRef Name; // from the "segment info" section
uint32_t Alignment;
uint32_t Flags;
+ uint32_t Comdat; // from the "comdat info" section
};
struct WasmElemSegment {
@@ -110,21 +124,50 @@ struct WasmElemSegment {
std::vector<uint32_t> Functions;
};
+// Represents the location of a Wasm data symbol within a WasmDataSegment, as
+// the index of the segment, and the offset and size within the segment.
+struct WasmDataReference {
+ uint32_t Segment;
+ uint32_t Offset;
+ uint32_t Size;
+};
+
struct WasmRelocation {
- uint32_t Type; // The type of the relocation.
- uint32_t Index; // Index into function to global index space.
+ uint8_t Type; // The type of the relocation.
+ uint32_t Index; // Index into either symbol or type index space.
uint64_t Offset; // Offset from the start of the section.
int64_t Addend; // A value to add to the symbol.
};
struct WasmInitFunc {
uint32_t Priority;
- uint32_t FunctionIndex;
+ uint32_t Symbol;
+};
+
+struct WasmSymbolInfo {
+ StringRef Name;
+ uint8_t Kind;
+ uint32_t Flags;
+ StringRef Module; // For undefined symbols the module name of the import
+ union {
+ // For function or global symbols, the index in function or global index
+ // space.
+ uint32_t ElementIndex;
+ // For data symbols, the address of the data relative to the segment.
+ WasmDataReference DataRef;
+ };
+};
+
+struct WasmFunctionName {
+ uint32_t Index;
+ StringRef Name;
};
struct WasmLinkingData {
- uint32_t DataSize;
+ uint32_t Version;
std::vector<WasmInitFunc> InitFunctions;
+ std::vector<StringRef> Comdats;
+ std::vector<WasmSymbolInfo> SymbolTable;
};
enum : unsigned {
@@ -143,14 +186,15 @@ enum : unsigned {
};
// Type immediate encodings used in various contexts.
-enum {
- WASM_TYPE_I32 = -0x01,
- WASM_TYPE_I64 = -0x02,
- WASM_TYPE_F32 = -0x03,
- WASM_TYPE_F64 = -0x04,
- WASM_TYPE_ANYFUNC = -0x10,
- WASM_TYPE_FUNC = -0x20,
- WASM_TYPE_NORESULT = -0x40, // for blocks with no result values
+enum : unsigned {
+ WASM_TYPE_I32 = 0x7F,
+ WASM_TYPE_I64 = 0x7E,
+ WASM_TYPE_F32 = 0x7D,
+ WASM_TYPE_F64 = 0x7C,
+ WASM_TYPE_ANYFUNC = 0x70,
+ WASM_TYPE_EXCEPT_REF = 0x68,
+ WASM_TYPE_FUNC = 0x60,
+ WASM_TYPE_NORESULT = 0x40, // for blocks with no result values
};
// Kinds of externals (for imports and exports).
@@ -172,11 +216,6 @@ enum : unsigned {
};
enum : unsigned {
- WASM_NAMES_FUNCTION = 0x1,
- WASM_NAMES_LOCAL = 0x2,
-};
-
-enum : unsigned {
WASM_LIMITS_FLAG_HAS_MAX = 0x1,
};
@@ -186,24 +225,46 @@ enum class ValType {
I64 = WASM_TYPE_I64,
F32 = WASM_TYPE_F32,
F64 = WASM_TYPE_F64,
+ EXCEPT_REF = WASM_TYPE_EXCEPT_REF,
+};
+
+// Kind codes used in the custom "name" section
+enum : unsigned {
+ WASM_NAMES_FUNCTION = 0x1,
+ WASM_NAMES_LOCAL = 0x2,
};
-// Linking metadata kinds.
+// Kind codes used in the custom "linking" section
enum : unsigned {
- WASM_SYMBOL_INFO = 0x2,
- WASM_DATA_SIZE = 0x3,
WASM_SEGMENT_INFO = 0x5,
WASM_INIT_FUNCS = 0x6,
+ WASM_COMDAT_INFO = 0x7,
+ WASM_SYMBOL_TABLE = 0x8,
+};
+
+// Kind codes used in the custom "linking" section in the WASM_COMDAT_INFO
+enum : unsigned {
+ WASM_COMDAT_DATA = 0x0,
+ WASM_COMDAT_FUNCTION = 0x1,
+};
+
+// Kind codes used in the custom "linking" section in the WASM_SYMBOL_TABLE
+enum WasmSymbolType : unsigned {
+ WASM_SYMBOL_TYPE_FUNCTION = 0x0,
+ WASM_SYMBOL_TYPE_DATA = 0x1,
+ WASM_SYMBOL_TYPE_GLOBAL = 0x2,
+ WASM_SYMBOL_TYPE_SECTION = 0x3,
};
const unsigned WASM_SYMBOL_BINDING_MASK = 0x3;
-const unsigned WASM_SYMBOL_VISIBILITY_MASK = 0x4;
+const unsigned WASM_SYMBOL_VISIBILITY_MASK = 0xc;
const unsigned WASM_SYMBOL_BINDING_GLOBAL = 0x0;
const unsigned WASM_SYMBOL_BINDING_WEAK = 0x1;
const unsigned WASM_SYMBOL_BINDING_LOCAL = 0x2;
const unsigned WASM_SYMBOL_VISIBILITY_DEFAULT = 0x0;
const unsigned WASM_SYMBOL_VISIBILITY_HIDDEN = 0x4;
+const unsigned WASM_SYMBOL_UNDEFINED = 0x10;
#define WASM_RELOC(name, value) name = value,
@@ -213,18 +274,25 @@ enum : unsigned {
#undef WASM_RELOC
-struct Global {
- ValType Type;
- bool Mutable;
+// Useful comparison operators
+inline bool operator==(const WasmSignature &LHS, const WasmSignature &RHS) {
+ return LHS.ReturnType == RHS.ReturnType && LHS.ParamTypes == RHS.ParamTypes;
+}
- // The initial value for this global is either the value of an imported
- // global, in which case InitialModule and InitialName specify the global
- // import, or a value, in which case InitialModule is empty and InitialValue
- // holds the value.
- StringRef InitialModule;
- StringRef InitialName;
- uint64_t InitialValue;
-};
+inline bool operator!=(const WasmSignature &LHS, const WasmSignature &RHS) {
+ return !(LHS == RHS);
+}
+
+inline bool operator==(const WasmGlobalType &LHS, const WasmGlobalType &RHS) {
+ return LHS.Type == RHS.Type && LHS.Mutable == RHS.Mutable;
+}
+
+inline bool operator!=(const WasmGlobalType &LHS, const WasmGlobalType &RHS) {
+ return !(LHS == RHS);
+}
+
+std::string toString(wasm::WasmSymbolType type);
+std::string relocTypetoString(uint32_t type);
} // end namespace wasm
} // end namespace llvm
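A short sketch of decoding the per-symbol Flags word in the new WasmSymbolInfo using the mask constants above; the wrapper struct and function name are made up for the example:

#include "llvm/BinaryFormat/Wasm.h"

struct DecodedWasmSymbolFlags {
  bool IsWeak;
  bool IsLocal;
  bool IsHidden;
  bool IsUndefined;
};

DecodedWasmSymbolFlags decodeFlags(const llvm::wasm::WasmSymbolInfo &Sym) {
  using namespace llvm::wasm;
  DecodedWasmSymbolFlags D;
  // Binding lives in the low two bits, visibility in the next two.
  D.IsWeak = (Sym.Flags & WASM_SYMBOL_BINDING_MASK) == WASM_SYMBOL_BINDING_WEAK;
  D.IsLocal = (Sym.Flags & WASM_SYMBOL_BINDING_MASK) == WASM_SYMBOL_BINDING_LOCAL;
  D.IsHidden =
      (Sym.Flags & WASM_SYMBOL_VISIBILITY_MASK) == WASM_SYMBOL_VISIBILITY_HIDDEN;
  D.IsUndefined = (Sym.Flags & WASM_SYMBOL_UNDEFINED) != 0;
  return D;
}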
diff --git a/contrib/llvm/include/llvm/BinaryFormat/WasmRelocs.def b/contrib/llvm/include/llvm/BinaryFormat/WasmRelocs.def
index d6f0e42b33bf..8ffd51e483f3 100644
--- a/contrib/llvm/include/llvm/BinaryFormat/WasmRelocs.def
+++ b/contrib/llvm/include/llvm/BinaryFormat/WasmRelocs.def
@@ -11,3 +11,5 @@ WASM_RELOC(R_WEBASSEMBLY_MEMORY_ADDR_SLEB, 4)
WASM_RELOC(R_WEBASSEMBLY_MEMORY_ADDR_I32, 5)
WASM_RELOC(R_WEBASSEMBLY_TYPE_INDEX_LEB, 6)
WASM_RELOC(R_WEBASSEMBLY_GLOBAL_INDEX_LEB, 7)
+WASM_RELOC(R_WEBASSEMBLY_FUNCTION_OFFSET_I32, 8)
+WASM_RELOC(R_WEBASSEMBLY_SECTION_OFFSET_I32, 9)
diff --git a/contrib/llvm/include/llvm/Bitcode/BitcodeWriter.h b/contrib/llvm/include/llvm/Bitcode/BitcodeWriter.h
index c78077525c8b..0010cf6c0544 100644
--- a/contrib/llvm/include/llvm/Bitcode/BitcodeWriter.h
+++ b/contrib/llvm/include/llvm/Bitcode/BitcodeWriter.h
@@ -86,7 +86,7 @@ class raw_ostream;
/// Can be used to produce the same module hash for a minimized bitcode
/// used just for the thin link as in the regular full bitcode that will
/// be used in the backend.
- void writeModule(const Module *M, bool ShouldPreserveUseListOrder = false,
+ void writeModule(const Module &M, bool ShouldPreserveUseListOrder = false,
const ModuleSummaryIndex *Index = nullptr,
bool GenerateHash = false, ModuleHash *ModHash = nullptr);
@@ -97,7 +97,7 @@ class raw_ostream;
///
/// ModHash is for use in ThinLTO incremental build, generated while the
/// IR bitcode file writing.
- void writeThinLinkBitcode(const Module *M, const ModuleSummaryIndex &Index,
+ void writeThinLinkBitcode(const Module &M, const ModuleSummaryIndex &Index,
const ModuleHash &ModHash);
void writeIndex(
@@ -105,7 +105,7 @@ class raw_ostream;
const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex);
};
- /// \brief Write the specified module to the specified raw output stream.
+ /// Write the specified module to the specified raw output stream.
///
/// For streams where it matters, the given stream should be in "binary"
/// mode.
@@ -126,7 +126,7 @@ class raw_ostream;
/// Can be used to produce the same module hash for a minimized bitcode
/// used just for the thin link as in the regular full bitcode that will
/// be used in the backend.
- void WriteBitcodeToFile(const Module *M, raw_ostream &Out,
+ void WriteBitcodeToFile(const Module &M, raw_ostream &Out,
bool ShouldPreserveUseListOrder = false,
const ModuleSummaryIndex *Index = nullptr,
bool GenerateHash = false,
@@ -139,7 +139,7 @@ class raw_ostream;
///
/// ModHash is for use in ThinLTO incremental build, generated while the IR
/// bitcode file writing.
- void WriteThinLinkBitcodeToFile(const Module *M, raw_ostream &Out,
+ void WriteThinLinkBitcodeToFile(const Module &M, raw_ostream &Out,
const ModuleSummaryIndex &Index,
const ModuleHash &ModHash);
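A minimal sketch of the updated interface, which now takes the module by reference rather than by pointer; the path handling is illustrative, and sys::fs::F_None is the open-flag spelling of this LLVM vintage:

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"

bool writeModuleToDisk(const llvm::Module &M, llvm::StringRef Path) {
  std::error_code EC;
  llvm::raw_fd_ostream OS(Path, EC, llvm::sys::fs::F_None);
  if (EC)
    return false;
  llvm::WriteBitcodeToFile(M, OS); // previously WriteBitcodeToFile(&M, OS)
  return true;
}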
diff --git a/contrib/llvm/include/llvm/Bitcode/BitcodeWriterPass.h b/contrib/llvm/include/llvm/Bitcode/BitcodeWriterPass.h
index 9ac6fba16b96..05044c9ae11c 100644
--- a/contrib/llvm/include/llvm/Bitcode/BitcodeWriterPass.h
+++ b/contrib/llvm/include/llvm/Bitcode/BitcodeWriterPass.h
@@ -21,9 +21,10 @@
namespace llvm {
class Module;
class ModulePass;
+class Pass;
class raw_ostream;
-/// \brief Create and return a pass that writes the module to the specified
+/// Create and return a pass that writes the module to the specified
/// ostream. Note that this pass is designed for use with the legacy pass
/// manager.
///
@@ -40,7 +41,10 @@ ModulePass *createBitcodeWriterPass(raw_ostream &Str,
bool EmitSummaryIndex = false,
bool EmitModuleHash = false);
-/// \brief Pass for writing a module of IR out to a bitcode file.
+/// Check whether a pass is a BitcodeWriterPass.
+bool isBitcodeWriterPass(Pass *P);
+
+/// Pass for writing a module of IR out to a bitcode file.
///
/// Note that this is intended for use with the new pass manager. To construct
/// a pass for the legacy pass manager, use the function above.
@@ -51,7 +55,7 @@ class BitcodeWriterPass : public PassInfoMixin<BitcodeWriterPass> {
bool EmitModuleHash;
public:
- /// \brief Construct a bitcode writer pass around a particular output stream.
+ /// Construct a bitcode writer pass around a particular output stream.
///
/// If \c ShouldPreserveUseListOrder, encode use-list order so it can be
/// reproduced when deserialized.
@@ -65,7 +69,7 @@ public:
: OS(OS), ShouldPreserveUseListOrder(ShouldPreserveUseListOrder),
EmitSummaryIndex(EmitSummaryIndex), EmitModuleHash(EmitModuleHash) {}
- /// \brief Run the bitcode writer pass, and output the module to the selected
+ /// Run the bitcode writer pass, and output the module to the selected
/// output stream.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
};
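A sketch of scheduling the writer under the new pass manager, as the comments above describe; the helper function is illustrative only:

#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"

void addBitcodeWriter(llvm::ModulePassManager &MPM, llvm::raw_ostream &OS) {
  // The pass streams the module out when the pipeline reaches it.
  MPM.addPass(llvm::BitcodeWriterPass(OS));
}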
diff --git a/contrib/llvm/include/llvm/Bitcode/BitstreamReader.h b/contrib/llvm/include/llvm/Bitcode/BitstreamReader.h
index b484fa2efbfb..72e7619d9e1c 100644
--- a/contrib/llvm/include/llvm/Bitcode/BitstreamReader.h
+++ b/contrib/llvm/include/llvm/Bitcode/BitstreamReader.h
@@ -429,7 +429,7 @@ public:
// don't care what code widths are used inside of it.
ReadVBR(bitc::CodeLenWidth);
SkipToFourByteBoundary();
- unsigned NumFourBytes = Read(bitc::BlockSizeWidth);
+ size_t NumFourBytes = Read(bitc::BlockSizeWidth);
// Check that the block wasn't partially defined, and that the offset isn't
// bogus.
diff --git a/contrib/llvm/include/llvm/Bitcode/BitstreamWriter.h b/contrib/llvm/include/llvm/Bitcode/BitstreamWriter.h
index e276db5f92f6..c854769e0622 100644
--- a/contrib/llvm/include/llvm/Bitcode/BitstreamWriter.h
+++ b/contrib/llvm/include/llvm/Bitcode/BitstreamWriter.h
@@ -90,10 +90,10 @@ public:
assert(BlockScope.empty() && CurAbbrevs.empty() && "Block imbalance");
}
- /// \brief Retrieve the current position in the stream, in bits.
+ /// Retrieve the current position in the stream, in bits.
uint64_t GetCurrentBitNo() const { return GetBufferOffset() * 8 + CurBit; }
- /// \brief Retrieve the number of bits currently used to encode an abbrev ID.
+ /// Retrieve the number of bits currently used to encode an abbrev ID.
unsigned GetAbbrevIDWidth() const { return CurCodeSize; }
//===--------------------------------------------------------------------===//
diff --git a/contrib/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/contrib/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 01419d7ae2bf..6723cf42dd2c 100644
--- a/contrib/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/contrib/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -256,6 +256,18 @@ enum GlobalValueSummarySymtabCodes {
// strings in strtab.
// [n * name]
FS_CFI_FUNCTION_DECLS = 18,
+ // Per-module summary that also adds relative block frequency to callee info.
+ // PERMODULE_RELBF: [valueid, flags, instcount, numrefs,
+ // numrefs x valueid,
+ // n x (valueid, relblockfreq)]
+ FS_PERMODULE_RELBF = 19,
+ // Index-wide flags
+ FS_FLAGS = 20,
+ // Maps type identifier to summary information for that type identifier.
+ // TYPE_ID: [typeid, kind, bitwidth, align, size, bitmask, inlinebits,
+ // n x (typeid, kind, name, numrba,
+ // numrba x (numarg, numarg x arg, kind, info, byte, bit))]
+ FS_TYPE_ID = 21,
};
enum MetadataCodes {
@@ -272,7 +284,7 @@ enum MetadataCodes {
METADATA_ATTACHMENT = 11, // [m x [value, [n x [id, mdnode]]]
METADATA_GENERIC_DEBUG = 12, // [distinct, tag, vers, header, n x md num]
METADATA_SUBRANGE = 13, // [distinct, count, lo]
- METADATA_ENUMERATOR = 14, // [distinct, value, name]
+ METADATA_ENUMERATOR = 14, // [isUnsigned|distinct, value, name]
METADATA_BASIC_TYPE = 15, // [distinct, tag, name, size, align, enc]
METADATA_FILE = 16, // [distinct, filename, directory, checksumkind, checksum]
METADATA_DERIVED_TYPE = 17, // [distinct, ...]
@@ -298,6 +310,7 @@ enum MetadataCodes {
METADATA_GLOBAL_VAR_EXPR = 37, // [distinct, var, expr]
METADATA_INDEX_OFFSET = 38, // [offset]
METADATA_INDEX = 39, // [bitpos]
+ METADATA_LABEL = 40, // [distinct, scope, name, file, line]
};
// The constants block (CONSTANTS_BLOCK_ID) describes emission for each
@@ -575,6 +588,9 @@ enum AttributeKindCodes {
ATTR_KIND_SPECULATABLE = 53,
ATTR_KIND_STRICT_FP = 54,
ATTR_KIND_SANITIZE_HWADDRESS = 55,
+ ATTR_KIND_NOCF_CHECK = 56,
+ ATTR_KIND_OPT_FOR_FUZZING = 57,
+ ATTR_KIND_SHADOWCALLSTACK = 58,
};
enum ComdatSelectionKindCodes {
diff --git a/contrib/llvm/include/llvm/CodeGen/AccelTable.h b/contrib/llvm/include/llvm/CodeGen/AccelTable.h
new file mode 100644
index 000000000000..13928582f2dd
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/AccelTable.h
@@ -0,0 +1,434 @@
+//==- include/llvm/CodeGen/AccelTable.h - Accelerator Tables -----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing accelerator tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_DWARFACCELTABLE_H
+#define LLVM_CODEGEN_DWARFACCELTABLE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/CodeGen/DwarfStringPoolEntry.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DJB.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+/// The DWARF and Apple accelerator tables are an indirect hash table optimized
+/// for null lookup rather than access to known data. The Apple accelerator
+/// tables are a precursor of the newer DWARF v5 accelerator tables. Both
+/// formats share common design ideas.
+///
+/// The Apple accelerator tables are output into an on-disk format that looks
+/// like this:
+///
+/// .------------------.
+/// | HEADER |
+/// |------------------|
+/// | BUCKETS |
+/// |------------------|
+/// | HASHES |
+/// |------------------|
+/// | OFFSETS |
+/// |------------------|
+/// | DATA |
+/// `------------------'
+///
+/// The header contains a magic number, version, type of hash function,
+/// the number of buckets, total number of hashes, and room for a special struct
+/// of data and the length of that struct.
+///
+/// The buckets contain an index (e.g. 6) into the hashes array. The hashes
+/// section contains all of the 32-bit hash values in contiguous memory, and the
+/// offsets contain the offset into the data area for the particular hash.
+///
+/// For a lookup example, we could hash a function name and take it modulo the
+/// number of buckets giving us our bucket. From there we take the bucket value
+/// as an index into the hashes table and look at each successive hash as long
+/// as the hash value is still the same modulo result (bucket value) as earlier.
+/// If we have a match we look at that same entry in the offsets table and grab
+/// the offset in the data for our final match.
+///
+/// The DWARF v5 accelerator table consists of zero or more name indices that
+/// are output into an on-disk format that looks like this:
+///
+/// .------------------.
+/// | HEADER |
+/// |------------------|
+/// | CU LIST |
+/// |------------------|
+/// | LOCAL TU LIST |
+/// |------------------|
+/// | FOREIGN TU LIST |
+/// |------------------|
+/// | HASH TABLE |
+/// |------------------|
+/// | NAME TABLE |
+/// |------------------|
+/// | ABBREV TABLE |
+/// |------------------|
+/// | ENTRY POOL |
+/// `------------------'
+///
+/// For the full documentation please refer to the DWARF 5 standard.
+///
+///
+/// This file defines the class template AccelTable, which represents an
+/// abstract view of an Accelerator table, without any notion of an on-disk
+/// layout. This class is parameterized by an entry type, which should derive
+/// from AccelTableData. This is the type of individual entries in the table,
+/// and it should store the data necessary to emit them. AppleAccelTableData is
+/// the base class for Apple Accelerator Table entries, which have a uniform
+/// structure based on a sequence of Atoms. There are different sub-classes
+/// derived from AppleAccelTable, which differ in the set of Atoms and how they
+/// obtain their values.
+///
+/// An Apple Accelerator Table can be serialized by calling the
+/// emitAppleAccelTable function.
+///
+/// TODO: Add DWARF v5 emission code.
+
+namespace llvm {
+
+class AsmPrinter;
+class DwarfCompileUnit;
+class DwarfDebug;
+
+/// Interface to which the different types of accelerator table data have to
+/// conform. It serves as a base class for different values of the template
+/// argument of the AccelTable class template.
+class AccelTableData {
+public:
+ virtual ~AccelTableData() = default;
+
+ bool operator<(const AccelTableData &Other) const {
+ return order() < Other.order();
+ }
+
+ // Subclasses should implement:
+ // static uint32_t hash(StringRef Name);
+
+#ifndef NDEBUG
+ virtual void print(raw_ostream &OS) const = 0;
+#endif
+protected:
+ virtual uint64_t order() const = 0;
+};
+
+/// A base class holding non-template-dependent functionality of the AccelTable
+/// class. Clients should not use this class directly but rather instantiate
+/// AccelTable with a type derived from AccelTableData.
+class AccelTableBase {
+public:
+ using HashFn = uint32_t(StringRef);
+
+ /// Represents a group of entries with identical name (and hence, hash value).
+ struct HashData {
+ DwarfStringPoolEntryRef Name;
+ uint32_t HashValue;
+ std::vector<AccelTableData *> Values;
+ MCSymbol *Sym;
+
+ HashData(DwarfStringPoolEntryRef Name, HashFn *Hash)
+ : Name(Name), HashValue(Hash(Name.getString())) {}
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+ };
+ using HashList = std::vector<HashData *>;
+ using BucketList = std::vector<HashList>;
+
+protected:
+ /// Allocator for HashData and Values.
+ BumpPtrAllocator Allocator;
+
+ using StringEntries = StringMap<HashData, BumpPtrAllocator &>;
+ StringEntries Entries;
+
+ HashFn *Hash;
+ uint32_t BucketCount;
+ uint32_t UniqueHashCount;
+
+ HashList Hashes;
+ BucketList Buckets;
+
+ void computeBucketCount();
+
+ AccelTableBase(HashFn *Hash) : Entries(Allocator), Hash(Hash) {}
+
+public:
+ void finalize(AsmPrinter *Asm, StringRef Prefix);
+ ArrayRef<HashList> getBuckets() const { return Buckets; }
+ uint32_t getBucketCount() const { return BucketCount; }
+ uint32_t getUniqueHashCount() const { return UniqueHashCount; }
+ uint32_t getUniqueNameCount() const { return Entries.size(); }
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+
+ AccelTableBase(const AccelTableBase &) = delete;
+ void operator=(const AccelTableBase &) = delete;
+};
+
+/// This class holds an abstract representation of an Accelerator Table,
+/// consisting of a sequence of buckets, each bucket containing a sequence of
+/// HashData entries. The class is parameterized by the type of entries it
+/// holds. The type template parameter also defines the hash function to use for
+/// hashing names.
+template <typename DataT> class AccelTable : public AccelTableBase {
+public:
+ AccelTable() : AccelTableBase(DataT::hash) {}
+
+ template <typename... Types>
+ void addName(DwarfStringPoolEntryRef Name, Types &&... Args);
+};
+
+template <typename AccelTableDataT>
+template <typename... Types>
+void AccelTable<AccelTableDataT>::addName(DwarfStringPoolEntryRef Name,
+ Types &&... Args) {
+ assert(Buckets.empty() && "Already finalized!");
+ // If the string is in the list already then add this DIE to the list;
+ // otherwise add a new one.
+ auto Iter = Entries.try_emplace(Name.getString(), Name, Hash).first;
+ assert(Iter->second.Name == Name);
+ Iter->second.Values.push_back(
+ new (Allocator) AccelTableDataT(std::forward<Types>(Args)...));
+}
+
+/// A base class for different implementations of Data classes for Apple
+/// Accelerator Tables. The columns in the table are defined by the static Atoms
+/// variable defined on the subclasses.
+class AppleAccelTableData : public AccelTableData {
+public:
+ /// An Atom defines the form of the data in an Apple accelerator table.
+ /// Conceptually it is a column in the accelerator consisting of a type and a
+ /// specification of the form of its data.
+ struct Atom {
+ /// Atom Type.
+ const uint16_t Type;
+ /// DWARF Form.
+ const uint16_t Form;
+
+ constexpr Atom(uint16_t Type, uint16_t Form) : Type(Type), Form(Form) {}
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+ };
+ // Subclasses should define:
+ // static constexpr Atom Atoms[];
+
+ virtual void emit(AsmPrinter *Asm) const = 0;
+
+ static uint32_t hash(StringRef Buffer) { return djbHash(Buffer); }
+};
+
+/// The Data class implementation for DWARF v5 accelerator table. Unlike the
+/// Apple Data classes, this class is just a DIE wrapper, and does not know how
+/// to serialize itself. The complete serialization logic is in the
+/// emitDWARF5AccelTable function.
+class DWARF5AccelTableData : public AccelTableData {
+public:
+ static uint32_t hash(StringRef Name) { return caseFoldingDjbHash(Name); }
+
+ DWARF5AccelTableData(const DIE &Die) : Die(Die) {}
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const override;
+#endif
+
+ const DIE &getDie() const { return Die; }
+ uint64_t getDieOffset() const { return Die.getOffset(); }
+ unsigned getDieTag() const { return Die.getTag(); }
+
+protected:
+ const DIE &Die;
+
+ uint64_t order() const override { return Die.getOffset(); }
+};
+
+class DWARF5AccelTableStaticData : public AccelTableData {
+public:
+ static uint32_t hash(StringRef Name) { return caseFoldingDjbHash(Name); }
+
+ DWARF5AccelTableStaticData(uint64_t DieOffset, unsigned DieTag,
+ unsigned CUIndex)
+ : DieOffset(DieOffset), DieTag(DieTag), CUIndex(CUIndex) {}
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const override;
+#endif
+
+ uint64_t getDieOffset() const { return DieOffset; }
+ unsigned getDieTag() const { return DieTag; }
+ unsigned getCUIndex() const { return CUIndex; }
+
+protected:
+ uint64_t DieOffset;
+ unsigned DieTag;
+ unsigned CUIndex;
+
+ uint64_t order() const override { return DieOffset; }
+};
+
+void emitAppleAccelTableImpl(AsmPrinter *Asm, AccelTableBase &Contents,
+ StringRef Prefix, const MCSymbol *SecBegin,
+ ArrayRef<AppleAccelTableData::Atom> Atoms);
+
+/// Emit an Apple Accelerator Table consisting of entries in the specified
+/// AccelTable. The DataT template parameter should be derived from
+/// AppleAccelTableData.
+template <typename DataT>
+void emitAppleAccelTable(AsmPrinter *Asm, AccelTable<DataT> &Contents,
+ StringRef Prefix, const MCSymbol *SecBegin) {
+ static_assert(std::is_convertible<DataT *, AppleAccelTableData *>::value, "");
+ emitAppleAccelTableImpl(Asm, Contents, Prefix, SecBegin, DataT::Atoms);
+}
+
+void emitDWARF5AccelTable(AsmPrinter *Asm,
+ AccelTable<DWARF5AccelTableData> &Contents,
+ const DwarfDebug &DD,
+ ArrayRef<std::unique_ptr<DwarfCompileUnit>> CUs);
+
+void emitDWARF5AccelTable(
+ AsmPrinter *Asm, AccelTable<DWARF5AccelTableStaticData> &Contents,
+ ArrayRef<MCSymbol *> CUs,
+ llvm::function_ref<unsigned(const DWARF5AccelTableStaticData &)>
+ getCUIndexForEntry);
+
+/// Accelerator table data implementation for simple Apple accelerator tables
+/// with just a DIE reference.
+class AppleAccelTableOffsetData : public AppleAccelTableData {
+public:
+ AppleAccelTableOffsetData(const DIE &D) : Die(D) {}
+
+ void emit(AsmPrinter *Asm) const override;
+
+#ifndef _MSC_VER
+ // The line below is rejected by older versions (TBD) of MSVC.
+ static constexpr Atom Atoms[] = {
+ Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)};
+#else
+ // FIXME: Erase this path once the minimum MSVC version has been bumped.
+ static const SmallVector<Atom, 4> Atoms;
+#endif
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const override;
+#endif
+protected:
+ uint64_t order() const override { return Die.getOffset(); }
+
+ const DIE &Die;
+};
+
+/// Accelerator table data implementation for Apple type accelerator tables.
+class AppleAccelTableTypeData : public AppleAccelTableOffsetData {
+public:
+ AppleAccelTableTypeData(const DIE &D) : AppleAccelTableOffsetData(D) {}
+
+ void emit(AsmPrinter *Asm) const override;
+
+#ifndef _MSC_VER
+ // The line below is rejected by older versions (TBD) of MSVC.
+ static constexpr Atom Atoms[] = {
+ Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4),
+ Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2),
+ Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)};
+#else
+ // FIXME: Erase this path once the minimum MSVC version has been bumped.
+ static const SmallVector<Atom, 4> Atoms;
+#endif
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const override;
+#endif
+};
+
+/// Accelerator table data implementation for simple Apple accelerator tables
+/// with a DIE offset but no actual DIE pointer.
+class AppleAccelTableStaticOffsetData : public AppleAccelTableData {
+public:
+ AppleAccelTableStaticOffsetData(uint32_t Offset) : Offset(Offset) {}
+
+ void emit(AsmPrinter *Asm) const override;
+
+#ifndef _MSC_VER
+ // The line below is rejected by older versions (TBD) of MSVC.
+ static constexpr Atom Atoms[] = {
+ Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)};
+#else
+ // FIXME: Erase this path once the minimum MSVC version has been bumped.
+ static const SmallVector<Atom, 4> Atoms;
+#endif
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const override;
+#endif
+protected:
+ uint64_t order() const override { return Offset; }
+
+ uint32_t Offset;
+};
+
+/// Accelerator table data implementation for type accelerator tables with
+/// a DIE offset but no actual DIE pointer.
+class AppleAccelTableStaticTypeData : public AppleAccelTableStaticOffsetData {
+public:
+ AppleAccelTableStaticTypeData(uint32_t Offset, uint16_t Tag,
+ bool ObjCClassIsImplementation,
+ uint32_t QualifiedNameHash)
+ : AppleAccelTableStaticOffsetData(Offset),
+ QualifiedNameHash(QualifiedNameHash), Tag(Tag),
+ ObjCClassIsImplementation(ObjCClassIsImplementation) {}
+
+ void emit(AsmPrinter *Asm) const override;
+
+#ifndef _MSC_VER
+ // The line below is rejected by older versions (TBD) of MSVC.
+ static constexpr Atom Atoms[] = {
+ Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4),
+ Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2),
+ Atom(5, dwarf::DW_FORM_data1), Atom(6, dwarf::DW_FORM_data4)};
+#else
+ // FIXME: Erase this path once the minimum MSVC version has been bumped.
+ static const SmallVector<Atom, 4> Atoms;
+#endif
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const override;
+#endif
+protected:
+ uint64_t order() const override { return Offset; }
+
+ uint32_t QualifiedNameHash;
+ uint16_t Tag;
+ bool ObjCClassIsImplementation;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_DWARFACCELTABLE_H
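The header above only declares the table-building side; lookup is done by consumers (debuggers) following the bucket/hash/offset scheme described in the file comment. A minimal sketch of how an emitter might populate and serialize an Apple names table with this API — the Asm, SecBegin, and name/DIE pairs are assumed to be supplied by the caller, and this is illustrative rather than code from the import:

    void buildAppleNamesTable(
        AsmPrinter *Asm, const MCSymbol *SecBegin,
        ArrayRef<std::pair<DwarfStringPoolEntryRef, const DIE *>> Named) {
      AccelTable<AppleAccelTableOffsetData> Table;
      // Each addName() call either creates a new HashData entry for the string
      // or appends another DIE to an existing entry with the same name.
      for (const auto &P : Named)
        Table.addName(P.first, *P.second);
      // Bucket layout, the hash and offset arrays, and the data area are
      // produced by the emitter declared above; DataT::Atoms describes the
      // per-entry columns.
      emitAppleAccelTable(Asm, Table, "Names", SecBegin);
    }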
diff --git a/contrib/llvm/include/llvm/CodeGen/Analysis.h b/contrib/llvm/include/llvm/CodeGen/Analysis.h
index ba88f1f78fb8..d77aee66ed76 100644
--- a/contrib/llvm/include/llvm/CodeGen/Analysis.h
+++ b/contrib/llvm/include/llvm/CodeGen/Analysis.h
@@ -36,7 +36,7 @@ class SDValue;
class SelectionDAG;
struct EVT;
-/// \brief Compute the linearized index of a member in a nested
+/// Compute the linearized index of a member in a nested
/// aggregate/struct/array.
///
/// Given an LLVM IR aggregate type and a sequence of insertvalue or
@@ -124,7 +124,7 @@ bool returnTypeIsEligibleForTailCall(const Function *F, const Instruction *I,
const TargetLoweringBase &TLI);
DenseMap<const MachineBasicBlock *, int>
-getFuncletMembership(const MachineFunction &MF);
+getEHScopeMembership(const MachineFunction &MF);
} // End llvm namespace
diff --git a/contrib/llvm/include/llvm/CodeGen/AsmPrinter.h b/contrib/llvm/include/llvm/CodeGen/AsmPrinter.h
index b8944a668000..b6056380916c 100644
--- a/contrib/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/contrib/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -50,6 +50,7 @@ class GlobalValue;
class GlobalVariable;
class MachineBasicBlock;
class MachineConstantPoolValue;
+class MachineDominatorTree;
class MachineFunction;
class MachineInstr;
class MachineJumpTableInfo;
@@ -92,11 +93,17 @@ public:
std::unique_ptr<MCStreamer> OutStreamer;
/// The current machine function.
- const MachineFunction *MF = nullptr;
+ MachineFunction *MF = nullptr;
/// This is a pointer to the current MachineModuleInfo.
MachineModuleInfo *MMI = nullptr;
+ /// This is a pointer to the current MachineDominatorTree.
+ MachineDominatorTree *MDT = nullptr;
+
+ /// This is a pointer to the current MachineLoopInfo.
+ MachineLoopInfo *MLI = nullptr;
+
/// Optimization remark emitter.
MachineOptimizationRemarkEmitter *ORE;
@@ -130,9 +137,6 @@ private:
static char ID;
- /// If VerboseAsm is set, a pointer to the loop info for this function.
- MachineLoopInfo *LI = nullptr;
-
struct HandlerInfo {
AsmPrinterHandler *Handler;
const char *TimerName;
@@ -161,6 +165,12 @@ public:
};
private:
+ /// If generated on the fly, this owns the instance.
+ std::unique_ptr<MachineDominatorTree> OwnedMDT;
+
+ /// If generated on the fly, this owns the instance.
+ std::unique_ptr<MachineLoopInfo> OwnedMLI;
+
/// Structure for generating diagnostics for inline assembly. Only initialised
/// when necessary.
mutable std::unique_ptr<SrcMgrDiagInfo> DiagInfo;
@@ -191,6 +201,10 @@ public:
/// Return a unique ID for the current function.
unsigned getFunctionNumber() const;
+ /// Return the symbol for the function's pseudo stack, if the stack frame is
+ /// not register based.
+ virtual const MCSymbol *getFunctionFrameSymbol() const { return nullptr; }
+
MCSymbol *getFunctionBegin() const { return CurrentFnBegin; }
MCSymbol *getFunctionEnd() const { return CurrentFnEnd; }
MCSymbol *getCurExceptionSym();
@@ -228,6 +242,7 @@ public:
TAIL_CALL = 2,
LOG_ARGS_ENTER = 3,
CUSTOM_EVENT = 4,
+ TYPED_EVENT = 5,
};
// The table will contain these structs that point to the sled, the function
@@ -327,15 +342,15 @@ public:
/// global value is specified, and if that global has an explicit alignment
/// requested, it will override the alignment request if required for
/// correctness.
- void EmitAlignment(unsigned NumBits, const GlobalObject *GO = nullptr) const;
+ void EmitAlignment(unsigned NumBits, const GlobalObject *GV = nullptr) const;
/// Lower the specified LLVM Constant to an MCExpr.
virtual const MCExpr *lowerConstant(const Constant *CV);
- /// \brief Print a general LLVM constant to the .s file.
+ /// Print a general LLVM constant to the .s file.
void EmitGlobalConstant(const DataLayout &DL, const Constant *CV);
- /// \brief Unnamed constant global variables solely contaning a pointer to
+ /// Unnamed constant global variables solely containing a pointer to
/// another global variable act like a global variable "proxy", or GOT
/// equivalents, i.e., they are only used to hold the address of the latter. One
/// optimization is to replace accesses to these proxies by using the GOT
@@ -345,7 +360,7 @@ public:
/// accesses to GOT entries.
void computeGlobalGOTEquivs(Module &M);
- /// \brief Constant expressions using GOT equivalent globals may not be
+ /// Constant expressions using GOT equivalent globals may not be
/// eligible for PC relative GOT entry conversion, in such cases we need to
/// emit the proxies we previously omitted in EmitGlobalVariable.
void emitGlobalGOTEquivs();
@@ -444,13 +459,16 @@ public:
void printOffset(int64_t Offset, raw_ostream &OS) const;
/// Emit a byte directive and value.
- void EmitInt8(int Value) const;
+ void emitInt8(int Value) const;
/// Emit a short directive and value.
- void EmitInt16(int Value) const;
+ void emitInt16(int Value) const;
/// Emit a long directive and value.
- void EmitInt32(int Value) const;
+ void emitInt32(int Value) const;
+
+ /// Emit a long long directive and value.
+ void emitInt64(uint64_t Value) const;
/// Emit something like ".long Hi-Lo" where the size in bytes of the directive
/// is specified by Size and Hi/Lo specify the labels. This implicitly uses
@@ -458,6 +476,10 @@ public:
void EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
unsigned Size) const;
+ /// Emit something like ".uleb128 Hi-Lo".
+ void EmitLabelDifferenceAsULEB128(const MCSymbol *Hi,
+ const MCSymbol *Lo) const;
+
/// Emit something like ".long Label+Offset" where the size in bytes of the
/// directive is specified by Size and Label specifies the label. This
/// implicitly uses .set if it is available.
@@ -471,6 +493,9 @@ public:
EmitLabelPlusOffset(Label, 0, Size, IsSectionRelative);
}
+ /// Emit something like ".long Label + Offset".
+ void EmitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const;
+
//===------------------------------------------------------------------===//
// Dwarf Emission Helper Routines
//===------------------------------------------------------------------===//
@@ -481,11 +506,6 @@ public:
/// Emit the specified unsigned leb128 value.
void EmitULEB128(uint64_t Value, const char *Desc = nullptr) const;
- /// Emit the specified unsigned leb128 value padded to a specific number
- /// bytes
- void EmitPaddedULEB128(uint64_t Value, unsigned PadTo,
- const char *Desc = nullptr) const;
-
/// Emit a .byte 42 directive that corresponds to an encoding. If verbose
/// assembly output is enabled, we output comments describing the encoding.
/// Desc is a string saying what the encoding is specifying (e.g. "LSDA").
@@ -508,7 +528,12 @@ public:
/// When possible, emit a DwarfStringPool section offset without any
/// relocations, and without using the symbol. Otherwise, defers to \a
/// emitDwarfSymbolReference().
- void emitDwarfStringOffset(DwarfStringPoolEntryRef S) const;
+ void emitDwarfStringOffset(DwarfStringPoolEntry S) const;
+
+ /// Emit the 4-byte offset of a string from the start of its section.
+ void emitDwarfStringOffset(DwarfStringPoolEntryRef S) const {
+ emitDwarfStringOffset(S.getEntry());
+ }
/// Get the value for DW_AT_APPLE_isa. Zero if no isa encoding specified.
virtual unsigned getISAEncoding() { return 0; }
@@ -523,10 +548,10 @@ public:
// Dwarf Lowering Routines
//===------------------------------------------------------------------===//
- /// \brief Emit frame instruction to describe the layout of the frame.
+ /// Emit frame instruction to describe the layout of the frame.
void emitCFIInstruction(const MCCFIInstruction &Inst) const;
- /// \brief Emit Dwarf abbreviation table.
+ /// Emit Dwarf abbreviation table.
template <typename T> void emitDwarfAbbrevs(const T &Abbrevs) const {
// For each abbreviation.
for (const auto &Abbrev : Abbrevs)
@@ -538,7 +563,7 @@ public:
void emitDwarfAbbrev(const DIEAbbrev &Abbrev) const;
- /// \brief Recursively emit Dwarf DIE tree.
+ /// Recursively emit Dwarf DIE tree.
void emitDwarfDIE(const DIE &Die) const;
//===------------------------------------------------------------------===//
@@ -625,10 +650,9 @@ private:
void EmitXXStructorList(const DataLayout &DL, const Constant *List,
bool isCtor);
- GCMetadataPrinter *GetOrCreateGCPrinter(GCStrategy &C);
+ GCMetadataPrinter *GetOrCreateGCPrinter(GCStrategy &S);
/// Emit GlobalAlias or GlobalIFunc.
- void emitGlobalIndirectSymbol(Module &M,
- const GlobalIndirectSymbol& GIS);
+ void emitGlobalIndirectSymbol(Module &M, const GlobalIndirectSymbol &GIS);
void setupCodePaddingContext(const MachineBasicBlock &MBB,
MCCodePaddingContext &Context) const;
};
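The lower-cased integer helpers and the new ULEB128 label-difference helper are the building blocks for hand-rolled, DWARF-style tables. A small illustrative sketch — the Begin/End symbols and the field layout are assumed, not part of this header:

    void emitToySectionHeader(AsmPrinter &AP, const MCSymbol *Begin,
                              const MCSymbol *End) {
      AP.emitInt16(5);                             // version field
      AP.emitInt8(8);                              // address size
      AP.emitInt32(0);                             // reserved word
      AP.EmitLabelDifference(End, Begin, 4);       // fixed 4-byte length of the region
      AP.EmitLabelDifferenceAsULEB128(End, Begin); // same length, ULEB128-encoded
    }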
diff --git a/contrib/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/contrib/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
index 1f9c96b18e1b..b1adf66e7ff4 100644
--- a/contrib/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
+++ b/contrib/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
@@ -26,7 +26,7 @@ using CreateCmpXchgInstFun =
function_ref<void(IRBuilder<> &, Value *, Value *, Value *, AtomicOrdering,
Value *&, Value *&)>;
-/// \brief Expand an atomic RMW instruction into a loop utilizing
+/// Expand an atomic RMW instruction into a loop utilizing
/// cmpxchg. You'll want to make sure your target machine likes cmpxchg
/// instructions in the first place and that there isn't another, better,
/// transformation available (for example AArch32/AArch64 have linked loads).
@@ -58,7 +58,7 @@ using CreateCmpXchgInstFun =
/// [...]
///
/// Returns true if the containing function was modified.
-bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun Factory);
+bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg);
} // end namespace llvm
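A sketch of a callback matching CreateCmpXchgInstFun, assuming the conventional parameter order (builder, address, currently loaded value, new value, ordering, out-success, out-new-loaded); real targets may wrap the cmpxchg with extra bitcasts or address-space handling:

    static void createCmpXchgSketch(IRBuilder<> &Builder, Value *Addr,
                                    Value *Loaded, Value *NewVal,
                                    AtomicOrdering MemOpOrder, Value *&Success,
                                    Value *&NewLoaded) {
      // Emit the cmpxchg that the expanded loop retries on failure.
      Value *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
    }
    // Used as: expandAtomicRMWToCmpXchg(AI, createCmpXchgSketch);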
diff --git a/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 526ddb1b9706..f76a2426377a 100644
--- a/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -26,7 +26,6 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -47,6 +46,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
@@ -65,7 +65,7 @@ class TargetMachine;
extern cl::opt<unsigned> PartialUnrollingThreshold;
-/// \brief Base class which can be used to help build a TTI implementation.
+/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
@@ -101,16 +101,32 @@ private:
return Cost;
}
- /// \brief Local query method delegates up to T which *must* implement this!
+ /// Local query method delegates up to T which *must* implement this!
const TargetSubtargetInfo *getST() const {
return static_cast<const T *>(this)->getST();
}
- /// \brief Local query method delegates up to T which *must* implement this!
+ /// Local query method delegates up to T which *must* implement this!
const TargetLoweringBase *getTLI() const {
return static_cast<const T *>(this)->getTLI();
}
+ static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
+ switch (M) {
+ case TTI::MIM_Unindexed:
+ return ISD::UNINDEXED;
+ case TTI::MIM_PreInc:
+ return ISD::PRE_INC;
+ case TTI::MIM_PreDec:
+ return ISD::PRE_DEC;
+ case TTI::MIM_PostInc:
+ return ISD::POST_INC;
+ case TTI::MIM_PostDec:
+ return ISD::POST_DEC;
+ }
+ llvm_unreachable("Unexpected MemIndexedMode");
+ }
+
protected:
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
: BaseT(DL) {}
@@ -157,6 +173,18 @@ public:
return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
}
+ bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
+ const DataLayout &DL) const {
+ EVT VT = getTLI()->getValueType(DL, Ty);
+ return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
+ }
+
+ bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
+ const DataLayout &DL) const {
+ EVT VT = getTLI()->getValueType(DL, Ty);
+ return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
+ }
+
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}
@@ -179,6 +207,8 @@ public:
return getTLI()->isProfitableToHoist(I);
}
+ bool useAA() const { return getST()->useAA(); }
+
bool isTypeLegal(Type *Ty) {
EVT VT = getTLI()->getValueType(DL, Ty);
return getTLI()->isTypeLegal(VT);
@@ -240,7 +270,7 @@ public:
bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
// Early exit if both a jump table and bit test are not allowed.
- if (N < 1 || (!IsJTAllowed && DL.getPointerSizeInBits() < N))
+ if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
return N;
APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
@@ -254,7 +284,7 @@ public:
}
// Check if suitable for a bit test
- if (N <= DL.getPointerSizeInBits()) {
+ if (N <= DL.getIndexSizeInBits(0u)) {
SmallPtrSet<const BasicBlock *, 4> Dests;
for (auto I : SI.cases())
Dests.insert(I.getCaseSuccessor());
@@ -523,11 +553,15 @@ public:
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc ||
- Kind == TTI::SK_PermuteSingleSrc) {
+ switch (Kind) {
+ case TTI::SK_Select:
+ case TTI::SK_Transpose:
+ case TTI::SK_PermuteSingleSrc:
+ case TTI::SK_PermuteTwoSrc:
return getPermuteShuffleOverhead(Tp);
+ default:
+ return 1;
}
- return 1;
}
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -614,7 +648,7 @@ public:
}
// If we are legalizing by splitting, query the concrete TTI for the cost
- // of casting the original vector twice. We also need to factor int the
+ // of casting the original vector twice. We also need to factor in the
// cost of the split itself. Count that as 1, to be consistent with
// TLI->getTypeLegalizationCost().
if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
@@ -916,6 +950,20 @@ public:
RetTy, Args[0], VarMask,
Alignment);
}
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
}
}
@@ -1039,6 +1087,39 @@ public:
case Intrinsic::masked_load:
return static_cast<T *>(this)
->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
+ case Intrinsic::experimental_vector_reduce_add:
+ return static_cast<T *>(this)->getArithmeticReductionCost(
+ Instruction::Add, Tys[0], /*IsPairwiseForm=*/false);
+ case Intrinsic::experimental_vector_reduce_mul:
+ return static_cast<T *>(this)->getArithmeticReductionCost(
+ Instruction::Mul, Tys[0], /*IsPairwiseForm=*/false);
+ case Intrinsic::experimental_vector_reduce_and:
+ return static_cast<T *>(this)->getArithmeticReductionCost(
+ Instruction::And, Tys[0], /*IsPairwiseForm=*/false);
+ case Intrinsic::experimental_vector_reduce_or:
+ return static_cast<T *>(this)->getArithmeticReductionCost(
+ Instruction::Or, Tys[0], /*IsPairwiseForm=*/false);
+ case Intrinsic::experimental_vector_reduce_xor:
+ return static_cast<T *>(this)->getArithmeticReductionCost(
+ Instruction::Xor, Tys[0], /*IsPairwiseForm=*/false);
+ case Intrinsic::experimental_vector_reduce_fadd:
+ return static_cast<T *>(this)->getArithmeticReductionCost(
+ Instruction::FAdd, Tys[0], /*IsPairwiseForm=*/false);
+ case Intrinsic::experimental_vector_reduce_fmul:
+ return static_cast<T *>(this)->getArithmeticReductionCost(
+ Instruction::FMul, Tys[0], /*IsPairwiseForm=*/false);
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return static_cast<T *>(this)->getMinMaxReductionCost(
+ Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
+ /*IsSigned=*/true);
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ return static_cast<T *>(this)->getMinMaxReductionCost(
+ Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
+ /*IsSigned=*/false);
case Intrinsic::ctpop:
ISDs.push_back(ISD::CTPOP);
// In case of legalization use TCC_Expensive. This is cheaper than a
@@ -1123,7 +1204,7 @@ public:
return SingleCallCost;
}
- /// \brief Compute a cost of the given call instruction.
+ /// Compute a cost of the given call instruction.
///
/// Compute the cost of calling function F with return type RetTy and
/// argument types Tys. F might be nullptr, in this case the cost of an
@@ -1284,7 +1365,7 @@ public:
/// @}
};
-/// \brief Concrete BasicTTIImpl that can be used if no further customization
+/// Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
using BaseT = BasicTTIImplBase<BasicTTIImpl>;
@@ -1298,7 +1379,7 @@ class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
const TargetLoweringBase *getTLI() const { return TLI; }
public:
- explicit BasicTTIImpl(const TargetMachine *ST, const Function &F);
+ explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};
} // end namespace llvm
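BasicTTIImplBase is a CRTP mixin: the concrete target class passes itself as the template argument and supplies the getST()/getTLI() hooks the base class "delegates up" to. A schematic target implementation, with every MyTarget* name a placeholder for a real backend's types:

    class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
      using BaseT = BasicTTIImplBase<MyTargetTTIImpl>;
      friend BaseT;

      const MyTargetSubtarget *ST;        // placeholder subtarget type
      const MyTargetTargetLowering *TLI;  // placeholder lowering type

      // The hooks the base class reaches via static_cast<const T *>(this).
      const TargetSubtargetInfo *getST() const { return ST; }
      const TargetLoweringBase *getTLI() const { return TLI; }

    public:
      explicit MyTargetTTIImpl(const MyTargetTargetMachine *TM, const Function &F)
          : BaseT(TM, F.getParent()->getDataLayout()),
            ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
    };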
diff --git a/contrib/llvm/include/llvm/CodeGen/CalcSpillWeights.h b/contrib/llvm/include/llvm/CodeGen/CalcSpillWeights.h
index d9e8206408a7..f85767f1fc11 100644
--- a/contrib/llvm/include/llvm/CodeGen/CalcSpillWeights.h
+++ b/contrib/llvm/include/llvm/CodeGen/CalcSpillWeights.h
@@ -22,7 +22,7 @@ class MachineFunction;
class MachineLoopInfo;
class VirtRegMap;
- /// \brief Normalize the spill weight of a live interval
+ /// Normalize the spill weight of a live interval
///
/// The spill weight of a live interval is computed as:
///
@@ -42,7 +42,7 @@ class VirtRegMap;
return UseDefFreq / (Size + 25*SlotIndex::InstrDist);
}
- /// \brief Calculate auxiliary information for a virtual register such as its
+ /// Calculate auxiliary information for a virtual register such as its
/// spill weight and allocation hint.
class VirtRegAuxInfo {
public:
@@ -64,10 +64,10 @@ class VirtRegMap;
NormalizingFn norm = normalizeSpillWeight)
: MF(mf), LIS(lis), VRM(vrm), Loops(loops), MBFI(mbfi), normalize(norm) {}
- /// \brief (re)compute li's spill weight and allocation hint.
+ /// (re)compute li's spill weight and allocation hint.
void calculateSpillWeightAndHint(LiveInterval &li);
- /// \brief Compute future expected spill weight of a split artifact of li
+ /// Compute future expected spill weight of a split artifact of li
/// that will span between start and end slot indexes.
/// \param li The live interval to be split.
/// \param start The expected beginning of the split artifact. Instructions
@@ -78,7 +78,7 @@ class VirtRegMap;
/// negative weight for unspillable li.
float futureWeight(LiveInterval &li, SlotIndex start, SlotIndex end);
- /// \brief Helper function for weight calculations.
+ /// Helper function for weight calculations.
/// (Re)compute li's spill weight and allocation hint, or, for non null
/// start and end - compute future expected spill weight of a split
/// artifact of li that will span between start and end slot indexes.
@@ -94,7 +94,7 @@ class VirtRegMap;
SlotIndex *end = nullptr);
};
- /// \brief Compute spill weights and allocation hints for all virtual register
+ /// Compute spill weights and allocation hints for all virtual register
/// live intervals.
void calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineFunction &MF,
VirtRegMap *VRM,
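The normalization above divides the accumulated use/def frequency by the interval size plus a fixed 25-instruction term, so very short intervals do not receive outsized weights. Purely as an illustration of that formula (the numbers are made up):

    // With UseDefFreq = 300.0 and Size = 4 * SlotIndex::InstrDist, the weight is
    //   300.0 / (4 * InstrDist + 25 * InstrDist) = 300.0 / (29 * InstrDist),
    // so the fixed 25*InstrDist term dominates for short intervals and fades
    // out as the interval grows.
    float toyNormalizedWeight(float UseDefFreq, unsigned Size) {
      return UseDefFreq / (Size + 25 * SlotIndex::InstrDist);
    }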
diff --git a/contrib/llvm/include/llvm/CodeGen/CallingConvLower.h b/contrib/llvm/include/llvm/CodeGen/CallingConvLower.h
index d30a27328c01..efcf80ba0b4e 100644
--- a/contrib/llvm/include/llvm/CodeGen/CallingConvLower.h
+++ b/contrib/llvm/include/llvm/CodeGen/CallingConvLower.h
@@ -304,7 +304,7 @@ public:
/// CheckReturn - Analyze the return values of a function, returning
/// true if the return can be performed without sret-demotion, and
/// false otherwise.
- bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+ bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
CCAssignFn Fn);
/// AnalyzeCallOperands - Analyze the outgoing arguments to a call,
diff --git a/contrib/llvm/include/llvm/CodeGen/CommandFlags.def b/contrib/llvm/include/llvm/CodeGen/CommandFlags.inc
index fe96033a9c61..7d2d167289e0 100644
--- a/contrib/llvm/include/llvm/CodeGen/CommandFlags.def
+++ b/contrib/llvm/include/llvm/CodeGen/CommandFlags.inc
@@ -17,7 +17,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
-#include "llvm/MC/MCTargetOptionsCommandFlags.def"
+#include "llvm/MC/MCTargetOptionsCommandFlags.inc"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -98,7 +98,9 @@ static cl::opt<llvm::ExceptionHandling> ExceptionModel(
clEnumValN(ExceptionHandling::SjLj, "sjlj", "SjLj exception handling"),
clEnumValN(ExceptionHandling::ARM, "arm", "ARM EHABI exceptions"),
clEnumValN(ExceptionHandling::WinEH, "wineh",
- "Windows exception model")));
+ "Windows exception model"),
+ clEnumValN(ExceptionHandling::Wasm, "wasm",
+ "WebAssembly exception handling")));
static cl::opt<TargetMachine::CodeGenFileType> FileType(
"filetype", cl::init(TargetMachine::CGFT_AssemblyFile),
@@ -259,6 +261,10 @@ static cl::opt<bool> EnableStackSizeSection(
"stack-size-section",
cl::desc("Emit a section containing stack size metadata"), cl::init(false));
+static cl::opt<bool>
+ EnableAddrsig("addrsig", cl::desc("Emit an address-significance table"),
+ cl::init(false));
+
// Common utility function tightly tied to the options listed here. Initializes
// a TargetOptions object with CodeGen flags and returns it.
static TargetOptions InitTargetOptionsFromCodeGenFlags() {
@@ -284,8 +290,10 @@ static TargetOptions InitTargetOptionsFromCodeGenFlags() {
Options.FunctionSections = FunctionSections;
Options.UniqueSectionNames = UniqueSectionNames;
Options.EmulatedTLS = EmulatedTLS;
+ Options.ExplicitEmulatedTLS = EmulatedTLS.getNumOccurrences() > 0;
Options.ExceptionModel = ExceptionModel;
Options.EmitStackSizeSection = EnableStackSizeSection;
+ Options.EmitAddrsig = EnableAddrsig;
Options.MCOptions = InitMCTargetOptionsFromFlags();
@@ -326,7 +334,27 @@ LLVM_ATTRIBUTE_UNUSED static std::string getFeaturesStr() {
return Features.getString();
}
-/// \brief Set function attributes of functions in Module M based on CPU,
+LLVM_ATTRIBUTE_UNUSED static std::vector<std::string> getFeatureList() {
+ SubtargetFeatures Features;
+
+ // If the user asked for the 'native' CPU, we need to autodetect features.
+ // This is necessary for x86 where the CPU might not support all the
+ // features that the autodetected CPU name lists in the target. For example,
+ // not all Sandybridge processors support AVX.
+ if (MCPU == "native") {
+ StringMap<bool> HostFeatures;
+ if (sys::getHostCPUFeatures(HostFeatures))
+ for (auto &F : HostFeatures)
+ Features.AddFeature(F.first(), F.second);
+ }
+
+ for (unsigned i = 0; i != MAttrs.size(); ++i)
+ Features.AddFeature(MAttrs[i]);
+
+ return Features.getFeatures();
+}
+
+/// Set function attributes of functions in Module M based on CPU,
/// Features, and command line flags.
LLVM_ATTRIBUTE_UNUSED static void
setFunctionAttributes(StringRef CPU, StringRef Features, Module &M) {
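A sketch of how a tool that includes this .inc typically strings the helpers together after cl::ParseCommandLineOptions — TheModule is an assumed llvm::Module owned by the caller, and the comments note what each call picks up:

    static void applyCodeGenFlagsSketch(Module &TheModule) {
      // Collect backend options from the flags above (-addrsig, -exception-model, ...).
      TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
      std::string FS = getFeaturesStr();                  // "+avx,..." style joined string
      std::vector<std::string> FList = getFeatureList();  // same data as a list, for APIs that take one
      // Stamp target-cpu / target-features attributes (real tools usually
      // resolve a "native" CPU string first).
      setFunctionAttributes(MCPU, FS, TheModule);
      (void)Options;
      (void)FList;
    }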
diff --git a/contrib/llvm/include/llvm/CodeGen/CostTable.h b/contrib/llvm/include/llvm/CodeGen/CostTable.h
index 5a6368c5a0f8..48ad76971520 100644
--- a/contrib/llvm/include/llvm/CodeGen/CostTable.h
+++ b/contrib/llvm/include/llvm/CodeGen/CostTable.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Cost tables and simple lookup functions
+/// Cost tables and simple lookup functions
///
//===----------------------------------------------------------------------===//
@@ -17,7 +17,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
diff --git a/contrib/llvm/include/llvm/CodeGen/DIE.h b/contrib/llvm/include/llvm/CodeGen/DIE.h
index f809fc97fe59..7d486b1df56d 100644
--- a/contrib/llvm/include/llvm/CodeGen/DIE.h
+++ b/contrib/llvm/include/llvm/CodeGen/DIE.h
@@ -136,7 +136,7 @@ class DIEAbbrevSet {
/// The bump allocator to use when creating DIEAbbrev objects in the uniqued
/// storage container.
BumpPtrAllocator &Alloc;
- /// \brief FoldingSet that uniques the abbreviations.
+ /// FoldingSet that uniques the abbreviations.
FoldingSet<DIEAbbrev> AbbreviationsSet;
/// A list of all the unique abbreviations in use.
std::vector<DIEAbbrev *> Abbreviations;
@@ -190,7 +190,7 @@ public:
uint64_t getValue() const { return Integer; }
void setValue(uint64_t Val) { Integer = Val; }
- void EmitValue(const AsmPrinter *AP, dwarf::Form Form) const;
+ void EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const;
unsigned SizeOf(const AsmPrinter *AP, dwarf::Form Form) const;
void print(raw_ostream &O) const;
@@ -868,7 +868,7 @@ public:
return dwarf::DW_FORM_block;
}
- void EmitValue(const AsmPrinter *AP, dwarf::Form Form) const;
+ void EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const;
unsigned SizeOf(const AsmPrinter *AP, dwarf::Form Form) const;
void print(raw_ostream &O) const;
@@ -899,7 +899,7 @@ public:
return dwarf::DW_FORM_block;
}
- void EmitValue(const AsmPrinter *AP, dwarf::Form Form) const;
+ void EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const;
unsigned SizeOf(const AsmPrinter *AP, dwarf::Form Form) const;
void print(raw_ostream &O) const;
diff --git a/contrib/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h b/contrib/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h
index fc2b5ddd2d2c..e6c0483cfc35 100644
--- a/contrib/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h
+++ b/contrib/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h
@@ -41,6 +41,8 @@ public:
unsigned getOffset() const { return I->second.Offset; }
unsigned getIndex() const { return I->second.Index; }
StringRef getString() const { return I->first(); }
+ /// Return the entire string pool entry for convenience.
+ DwarfStringPoolEntry getEntry() const { return I->getValue(); }
bool operator==(const DwarfStringPoolEntryRef &X) const { return I == X.I; }
bool operator!=(const DwarfStringPoolEntryRef &X) const { return I != X.I; }
diff --git a/contrib/llvm/include/llvm/CodeGen/ExecutionDepsFix.h b/contrib/llvm/include/llvm/CodeGen/ExecutionDepsFix.h
deleted file mode 100644
index f4db8b7322da..000000000000
--- a/contrib/llvm/include/llvm/CodeGen/ExecutionDepsFix.h
+++ /dev/null
@@ -1,230 +0,0 @@
-//==- llvm/CodeGen/ExecutionDepsFix.h - Execution Dependency Fix -*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Execution Dependency Fix pass.
-///
-/// Some X86 SSE instructions like mov, and, or, xor are available in different
-/// variants for different operand types. These variant instructions are
-/// equivalent, but on Nehalem and newer cpus there is extra latency
-/// transferring data between integer and floating point domains. ARM cores
-/// have similar issues when they are configured with both VFP and NEON
-/// pipelines.
-///
-/// This pass changes the variant instructions to minimize domain crossings.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_EXECUTIONDEPSFIX_H
-#define LLVM_CODEGEN_EXECUTIONDEPSFIX_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-#include <limits>
-#include <utility>
-#include <vector>
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineInstr;
-class TargetInstrInfo;
-
-/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
-/// of execution domains.
-///
-/// An open DomainValue represents a set of instructions that can still switch
-/// execution domain. Multiple registers may refer to the same open
-/// DomainValue - they will eventually be collapsed to the same execution
-/// domain.
-///
-/// A collapsed DomainValue represents a single register that has been forced
-/// into one of more execution domains. There is a separate collapsed
-/// DomainValue for each register, but it may contain multiple execution
-/// domains. A register value is initially created in a single execution
-/// domain, but if we were forced to pay the penalty of a domain crossing, we
-/// keep track of the fact that the register is now available in multiple
-/// domains.
-struct DomainValue {
- // Basic reference counting.
- unsigned Refs = 0;
-
- // Bitmask of available domains. For an open DomainValue, it is the still
- // possible domains for collapsing. For a collapsed DomainValue it is the
- // domains where the register is available for free.
- unsigned AvailableDomains;
-
- // Pointer to the next DomainValue in a chain. When two DomainValues are
- // merged, Victim.Next is set to point to Victor, so old DomainValue
- // references can be updated by following the chain.
- DomainValue *Next;
-
- // Twiddleable instructions using or defining these registers.
- SmallVector<MachineInstr*, 8> Instrs;
-
- DomainValue() { clear(); }
-
- // A collapsed DomainValue has no instructions to twiddle - it simply keeps
- // track of the domains where the registers are already available.
- bool isCollapsed() const { return Instrs.empty(); }
-
- // Is domain available?
- bool hasDomain(unsigned domain) const {
- assert(domain <
- static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
- "undefined behavior");
- return AvailableDomains & (1u << domain);
- }
-
- // Mark domain as available.
- void addDomain(unsigned domain) {
- AvailableDomains |= 1u << domain;
- }
-
- // Restrict to a single domain available.
- void setSingleDomain(unsigned domain) {
- AvailableDomains = 1u << domain;
- }
-
- // Return bitmask of domains that are available and in mask.
- unsigned getCommonDomains(unsigned mask) const {
- return AvailableDomains & mask;
- }
-
- // First domain available.
- unsigned getFirstDomain() const {
- return countTrailingZeros(AvailableDomains);
- }
-
- // Clear this DomainValue and point to next which has all its data.
- void clear() {
- AvailableDomains = 0;
- Next = nullptr;
- Instrs.clear();
- }
-};
-
-/// Information about a live register.
-struct LiveReg {
- /// Value currently in this register, or NULL when no value is being tracked.
- /// This counts as a DomainValue reference.
- DomainValue *Value;
-
- /// Instruction that defined this register, relative to the beginning of the
- /// current basic block. When a LiveReg is used to represent a live-out
- /// register, this value is relative to the end of the basic block, so it
- /// will be a negative number.
- int Def;
-};
-
-class ExecutionDepsFix : public MachineFunctionPass {
- SpecificBumpPtrAllocator<DomainValue> Allocator;
- SmallVector<DomainValue*,16> Avail;
-
- const TargetRegisterClass *const RC;
- MachineFunction *MF;
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- RegisterClassInfo RegClassInfo;
- std::vector<SmallVector<int, 1>> AliasMap;
- const unsigned NumRegs;
- LiveReg *LiveRegs;
- struct MBBInfo {
- // Keeps clearance and domain information for all registers. Note that this
- // is different from the usual definition notion of liveness. The CPU
- // doesn't care whether or not we consider a register killed.
- LiveReg *OutRegs = nullptr;
-
- // Whether we have gotten to this block in primary processing yet.
- bool PrimaryCompleted = false;
-
- // The number of predecessors for which primary processing has completed
- unsigned IncomingProcessed = 0;
-
- // The value of `IncomingProcessed` at the start of primary processing
- unsigned PrimaryIncoming = 0;
-
- // The number of predecessors for which all processing steps are done.
- unsigned IncomingCompleted = 0;
-
- MBBInfo() = default;
- };
- using MBBInfoMap = DenseMap<MachineBasicBlock *, MBBInfo>;
- MBBInfoMap MBBInfos;
-
- /// List of undefined register reads in this block in forward order.
- std::vector<std::pair<MachineInstr *, unsigned>> UndefReads;
-
- /// Storage for register unit liveness.
- LivePhysRegs LiveRegSet;
-
- /// Current instruction number.
- /// The first instruction in each basic block is 0.
- int CurInstr;
-
-public:
- ExecutionDepsFix(char &PassID, const TargetRegisterClass &RC)
- : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-
-private:
- iterator_range<SmallVectorImpl<int>::const_iterator>
- regIndices(unsigned Reg) const;
- // DomainValue allocation.
- DomainValue *alloc(int domain = -1);
- DomainValue *retain(DomainValue *DV) {
- if (DV) ++DV->Refs;
- return DV;
- }
- void release(DomainValue*);
- DomainValue *resolve(DomainValue*&);
-
- // LiveRegs manipulations.
- void setLiveReg(int rx, DomainValue *DV);
- void kill(int rx);
- void force(int rx, unsigned domain);
- void collapse(DomainValue *dv, unsigned domain);
- bool merge(DomainValue *A, DomainValue *B);
-
- void enterBasicBlock(MachineBasicBlock*);
- void leaveBasicBlock(MachineBasicBlock*);
- bool isBlockDone(MachineBasicBlock *);
- void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass);
- bool visitInstr(MachineInstr *);
- void processDefs(MachineInstr *, bool breakDependency, bool Kill);
- void visitSoftInstr(MachineInstr*, unsigned mask);
- void visitHardInstr(MachineInstr*, unsigned domain);
- bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref);
- bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
- void processUndefReads(MachineBasicBlock*);
-};
-
-} // end namepsace llvm
-
-#endif // LLVM_CODEGEN_EXECUTIONDEPSFIX_H
diff --git a/contrib/llvm/include/llvm/CodeGen/ExecutionDomainFix.h b/contrib/llvm/include/llvm/CodeGen/ExecutionDomainFix.h
new file mode 100644
index 000000000000..338c214dd073
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/ExecutionDomainFix.h
@@ -0,0 +1,213 @@
+//==-- llvm/CodeGen/ExecutionDomainFix.h - Execution Domain Fix -*- C++ -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Execution Domain Fix pass.
+///
+/// Some X86 SSE instructions like mov, and, or, xor are available in different
+/// variants for different operand types. These variant instructions are
+/// equivalent, but on Nehalem and newer cpus there is extra latency
+/// transferring data between integer and floating point domains. ARM cores
+/// have similar issues when they are configured with both VFP and NEON
+/// pipelines.
+///
+/// This pass changes the variant instructions to minimize domain crossings.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EXECUTIONDOMAINFIX_H
+#define LLVM_CODEGEN_EXECUTIONDOMAINFIX_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LoopTraversal.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineInstr;
+class TargetInstrInfo;
+
+/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
+/// of execution domains.
+///
+/// An open DomainValue represents a set of instructions that can still switch
+/// execution domain. Multiple registers may refer to the same open
+/// DomainValue - they will eventually be collapsed to the same execution
+/// domain.
+///
+/// A collapsed DomainValue represents a single register that has been forced
+/// into one or more execution domains. There is a separate collapsed
+/// DomainValue for each register, but it may contain multiple execution
+/// domains. A register value is initially created in a single execution
+/// domain, but if we were forced to pay the penalty of a domain crossing, we
+/// keep track of the fact that the register is now available in multiple
+/// domains.
+struct DomainValue {
+ /// Basic reference counting.
+ unsigned Refs = 0;
+
+ /// Bitmask of available domains. For an open DomainValue, it is the still
+ /// possible domains for collapsing. For a collapsed DomainValue it is the
+ /// domains where the register is available for free.
+ unsigned AvailableDomains;
+
+ /// Pointer to the next DomainValue in a chain. When two DomainValues are
+ /// merged, Victim.Next is set to point to Victor, so old DomainValue
+ /// references can be updated by following the chain.
+ DomainValue *Next;
+
+ /// Twiddleable instructions using or defining these registers.
+ SmallVector<MachineInstr *, 8> Instrs;
+
+ DomainValue() { clear(); }
+
+ /// A collapsed DomainValue has no instructions to twiddle - it simply keeps
+ /// track of the domains where the registers are already available.
+ bool isCollapsed() const { return Instrs.empty(); }
+
+ /// Is domain available?
+ bool hasDomain(unsigned domain) const {
+ assert(domain <
+ static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+ "undefined behavior");
+ return AvailableDomains & (1u << domain);
+ }
+
+ /// Mark domain as available.
+ void addDomain(unsigned domain) { AvailableDomains |= 1u << domain; }
+
+ /// Restrict the set of available domains to a single domain.
+ void setSingleDomain(unsigned domain) { AvailableDomains = 1u << domain; }
+
+ /// Return bitmask of domains that are available and in mask.
+ unsigned getCommonDomains(unsigned mask) const {
+ return AvailableDomains & mask;
+ }
+
+ /// First domain available.
+ unsigned getFirstDomain() const {
+ return countTrailingZeros(AvailableDomains);
+ }
+
+ /// Clear this DomainValue and point to next which has all its data.
+ void clear() {
+ AvailableDomains = 0;
+ Next = nullptr;
+ Instrs.clear();
+ }
+};
+
+class ExecutionDomainFix : public MachineFunctionPass {
+ SpecificBumpPtrAllocator<DomainValue> Allocator;
+ SmallVector<DomainValue *, 16> Avail;
+
+ const TargetRegisterClass *const RC;
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ std::vector<SmallVector<int, 1>> AliasMap;
+ const unsigned NumRegs;
+ /// Value currently in each register, or NULL when no value is being tracked.
+ /// This counts as a DomainValue reference.
+ using LiveRegsDVInfo = std::vector<DomainValue *>;
+ LiveRegsDVInfo LiveRegs;
+ /// Keeps domain information for all registers. Note that this
+ /// is different from the usual definition of liveness: the CPU
+ /// doesn't care whether or not we consider a register killed.
+ using OutRegsInfoMap = SmallVector<LiveRegsDVInfo, 4>;
+ OutRegsInfoMap MBBOutRegsInfos;
+
+ ReachingDefAnalysis *RDA;
+
+public:
+ ExecutionDomainFix(char &PassID, const TargetRegisterClass &RC)
+ : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<ReachingDefAnalysis>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ /// Translate TRI register number to a list of indices into our smaller tables
+ /// of interesting registers.
+ iterator_range<SmallVectorImpl<int>::const_iterator>
+ regIndices(unsigned Reg) const;
+
+ /// DomainValue allocation.
+ DomainValue *alloc(int domain = -1);
+
+ /// Add reference to DV.
+ DomainValue *retain(DomainValue *DV) {
+ if (DV)
+ ++DV->Refs;
+ return DV;
+ }
+
+ /// Release a reference to DV. When the last reference is released,
+ /// collapse if needed.
+ void release(DomainValue *);
+
+ /// Follow the chain of dead DomainValues until a live DomainValue is reached.
+ /// Update the referenced pointer when necessary.
+ DomainValue *resolve(DomainValue *&);
+
+ /// Set LiveRegs[rx] = dv, updating reference counts.
+ void setLiveReg(int rx, DomainValue *DV);
+
+ /// Kill register rx, recycle or collapse any DomainValue.
+ void kill(int rx);
+
+ /// Force register rx into domain.
+ void force(int rx, unsigned domain);
+
+ /// Collapse an open DomainValue into the given domain. If there are multiple
+ /// registers using dv, they each get a unique collapsed DomainValue.
+ void collapse(DomainValue *dv, unsigned domain);
+
+ /// All instructions and registers in B are moved to A, and B is released.
+ bool merge(DomainValue *A, DomainValue *B);
+
+ /// Set up LiveRegs by merging predecessor live-out values.
+ void enterBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB);
+
+ /// Update live-out values.
+ void leaveBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB);
+
+ /// Process the given basic block.
+ void processBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB);
+
+ /// Visit the given instruction.
+ bool visitInstr(MachineInstr *);
+
+ /// Update def-ages for registers defined by MI.
+ /// If Kill is set, also kill off DomainValues clobbered by the defs.
+ void processDefs(MachineInstr *, bool Kill);
+
+ /// A soft instruction can be changed to work in other domains given by mask.
+ void visitSoftInstr(MachineInstr *, unsigned mask);
+
+ /// A hard instruction only works in one domain. All input registers will be
+ /// forced into that domain.
+ void visitHardInstr(MachineInstr *, unsigned domain);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_EXECUTIONDOMAINFIX_H
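
The retain/release/resolve trio above describes a small reference-counting scheme over DomainValues: every live register slot owns one reference, and values that have died forward to a live successor. A minimal standalone sketch of that discipline follows; the DVNode name is invented and the allocator recycling is elided, so this is an illustration, not the pass itself.

struct DVNode {
  unsigned Refs = 0;      // one reference per live register slot holding this value
  DVNode *Next = nullptr; // forwarding pointer installed when this value dies
};

DVNode *retain(DVNode *DV) {
  if (DV)
    ++DV->Refs;
  return DV;
}

void release(DVNode *DV) {
  while (DV && --DV->Refs == 0) {
    DVNode *Next = DV->Next;
    // Recycle DV into the pass's SpecificBumpPtrAllocator free list here.
    DV = Next;
  }
}

DVNode *resolve(DVNode *&DVRef) {
  DVNode *DV = DVRef;
  if (!DV || !DV->Next)
    return DV;
  while (DV->Next)          // skip the chain of dead values
    DV = DV->Next;
  retain(DV);               // the caller's slot now references the live value
  release(DVRef);           // and drops its reference to the dead one
  DVRef = DV;
  return DV;
}
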
diff --git a/contrib/llvm/include/llvm/CodeGen/FastISel.h b/contrib/llvm/include/llvm/CodeGen/FastISel.h
index 85bb826dcb8c..865d8a88b8cc 100644
--- a/contrib/llvm/include/llvm/CodeGen/FastISel.h
+++ b/contrib/llvm/include/llvm/CodeGen/FastISel.h
@@ -19,7 +19,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
@@ -28,6 +27,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cstdint>
#include <utility>
@@ -61,7 +61,7 @@ class Type;
class User;
class Value;
-/// \brief This is a fast-path instruction selection class that generates poor
+/// This is a fast-path instruction selection class that generates poor
/// code and doesn't support illegal types or non-trivial lowering, but runs
/// quickly.
class FastISel {
@@ -78,7 +78,7 @@ public:
bool IsReturnValueUsed : 1;
bool IsPatchPoint : 1;
- // \brief IsTailCall Should be modified by implementations of FastLowerCall
+ // IsTailCall should be modified by implementations of fastLowerCall
// that perform tail call conversions.
bool IsTailCall = false;
@@ -215,67 +215,74 @@ protected:
const TargetLibraryInfo *LibInfo;
bool SkipTargetIndependentISel;
- /// \brief The position of the last instruction for materializing constants
+ /// The position of the last instruction for materializing constants
/// for use in the current block. It resets to EmitStartPt when it makes sense
/// (for example, it's usually profitable to avoid function calls between the
/// definition and the use)
MachineInstr *LastLocalValue;
- /// \brief The top most instruction in the current block that is allowed for
+ /// The top most instruction in the current block that is allowed for
/// emitting local variables. LastLocalValue resets to EmitStartPt when it
/// makes sense (for example, on function calls)
MachineInstr *EmitStartPt;
+ /// Last local value flush point. On a subsequent flush, no local value will
+ /// sink past this point.
+ MachineBasicBlock::iterator LastFlushPoint;
+
public:
virtual ~FastISel();
- /// \brief Return the position of the last instruction emitted for
+ /// Return the position of the last instruction emitted for
/// materializing constants for use in the current block.
MachineInstr *getLastLocalValue() { return LastLocalValue; }
- /// \brief Update the position of the last instruction emitted for
+ /// Update the position of the last instruction emitted for
/// materializing constants for use in the current block.
void setLastLocalValue(MachineInstr *I) {
EmitStartPt = I;
LastLocalValue = I;
}
- /// \brief Set the current block to which generated machine instructions will
- /// be appended, and clear the local CSE map.
+ /// Set the current block to which generated machine instructions will
+ /// be appended.
void startNewBlock();
- /// \brief Return current debug location information.
+ /// Flush the local value map and sink local values if possible.
+ void finishBasicBlock();
+
+ /// Return current debug location information.
DebugLoc getCurDebugLoc() const { return DbgLoc; }
- /// \brief Do "fast" instruction selection for function arguments and append
+ /// Do "fast" instruction selection for function arguments and append
/// the machine instructions to the current block. Returns true when
/// successful.
bool lowerArguments();
- /// \brief Do "fast" instruction selection for the given LLVM IR instruction
+ /// Do "fast" instruction selection for the given LLVM IR instruction
/// and append the generated machine instructions to the current block.
/// Returns true if selection was successful.
bool selectInstruction(const Instruction *I);
- /// \brief Do "fast" instruction selection for the given LLVM IR operator
+ /// Do "fast" instruction selection for the given LLVM IR operator
/// (Instruction or ConstantExpr), and append generated machine instructions
/// to the current block. Return true if selection was successful.
bool selectOperator(const User *I, unsigned Opcode);
- /// \brief Create a virtual register and arrange for it to be assigned the
+ /// Create a virtual register and arrange for it to be assigned the
/// value for the given LLVM value.
unsigned getRegForValue(const Value *V);
- /// \brief Look up the value to see if its value is already cached in a
+ /// Look up the value to see if it is already cached in a
/// register. It may be defined by instructions across blocks or defined
/// locally.
unsigned lookUpRegForValue(const Value *V);
- /// \brief This is a wrapper around getRegForValue that also takes care of
+ /// This is a wrapper around getRegForValue that also takes care of
/// truncating or sign-extending the given getelementptr index value.
- std::pair<unsigned, bool> getRegForGEPIndex(const Value *V);
+ std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx);
- /// \brief We're checking to see if we can fold \p LI into \p FoldInst. Note
+ /// We're checking to see if we can fold \p LI into \p FoldInst. Note
/// that we could have a sequence where multiple LLVM IR instructions are
/// folded into the same machineinstr. For example we could have:
///
@@ -289,7 +296,7 @@ public:
/// If we succeed folding, return true.
bool tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst);
- /// \brief The specified machine instr operand is a vreg, and that vreg is
+ /// The specified machine instr operand is a vreg, and that vreg is
/// being provided by the specified load instruction. If possible, try to
/// fold the load as an operand to the instruction, returning true if
/// possible.
@@ -300,11 +307,11 @@ public:
return false;
}
- /// \brief Reset InsertPt to prepare for inserting instructions into the
+ /// Reset InsertPt to prepare for inserting instructions into the
/// current block.
void recomputeInsertPt();
- /// \brief Remove all dead instructions between the I and E.
+ /// Remove all dead instructions between the I and E.
void removeDeadCode(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator E);
@@ -313,11 +320,11 @@ public:
DebugLoc DL;
};
- /// \brief Prepare InsertPt to begin inserting instructions into the local
+ /// Prepare InsertPt to begin inserting instructions into the local
/// value area and return the old insert position.
SavePoint enterLocalValueArea();
- /// \brief Reset InsertPt to the given old insert position.
+ /// Reset InsertPt to the given old insert position.
void leaveLocalValueArea(SavePoint Old);
protected:
@@ -325,45 +332,45 @@ protected:
const TargetLibraryInfo *LibInfo,
bool SkipTargetIndependentISel = false);
- /// \brief This method is called by target-independent code when the normal
+ /// This method is called by target-independent code when the normal
/// FastISel process fails to select an instruction. This gives targets a
/// chance to emit code for anything that doesn't fit into FastISel's
/// framework. It returns true if it was successful.
virtual bool fastSelectInstruction(const Instruction *I) = 0;
- /// \brief This method is called by target-independent code to do target-
+ /// This method is called by target-independent code to do target-
/// specific argument lowering. It returns true if it was successful.
virtual bool fastLowerArguments();
- /// \brief This method is called by target-independent code to do target-
+ /// This method is called by target-independent code to do target-
/// specific call lowering. It returns true if it was successful.
virtual bool fastLowerCall(CallLoweringInfo &CLI);
- /// \brief This method is called by target-independent code to do target-
+ /// This method is called by target-independent code to do target-
/// specific intrinsic lowering. It returns true if it was successful.
virtual bool fastLowerIntrinsicCall(const IntrinsicInst *II);
- /// \brief This method is called by target-independent code to request that an
+ /// This method is called by target-independent code to request that an
/// instruction with the given type and opcode be emitted.
virtual unsigned fastEmit_(MVT VT, MVT RetVT, unsigned Opcode);
- /// \brief This method is called by target-independent code to request that an
+ /// This method is called by target-independent code to request that an
/// instruction with the given type, opcode, and register operand be emitted.
virtual unsigned fastEmit_r(MVT VT, MVT RetVT, unsigned Opcode, unsigned Op0,
bool Op0IsKill);
- /// \brief This method is called by target-independent code to request that an
+ /// This method is called by target-independent code to request that an
/// instruction with the given type, opcode, and register operands be emitted.
virtual unsigned fastEmit_rr(MVT VT, MVT RetVT, unsigned Opcode, unsigned Op0,
bool Op0IsKill, unsigned Op1, bool Op1IsKill);
- /// \brief This method is called by target-independent code to request that an
+ /// This method is called by target-independent code to request that an
/// instruction with the given type, opcode, and register and immediate
/// operands be emitted.
virtual unsigned fastEmit_ri(MVT VT, MVT RetVT, unsigned Opcode, unsigned Op0,
bool Op0IsKill, uint64_t Imm);
- /// \brief This method is a wrapper of fastEmit_ri.
+ /// This method is a wrapper of fastEmit_ri.
///
/// It first tries to emit an instruction with an immediate operand using
/// fastEmit_ri. If that fails, it materializes the immediate into a register
@@ -371,89 +378,89 @@ protected:
unsigned fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0, bool Op0IsKill,
uint64_t Imm, MVT ImmType);
- /// \brief This method is called by target-independent code to request that an
+ /// This method is called by target-independent code to request that an
/// instruction with the given type, opcode, and immediate operand be emitted.
virtual unsigned fastEmit_i(MVT VT, MVT RetVT, unsigned Opcode, uint64_t Imm);
- /// \brief This method is called by target-independent code to request that an
+ /// This method is called by target-independent code to request that an
/// instruction with the given type, opcode, and floating-point immediate
/// operand be emitted.
virtual unsigned fastEmit_f(MVT VT, MVT RetVT, unsigned Opcode,
const ConstantFP *FPImm);
- /// \brief Emit a MachineInstr with no operands and a result register in the
+ /// Emit a MachineInstr with no operands and a result register in the
/// given register class.
unsigned fastEmitInst_(unsigned MachineInstOpcode,
const TargetRegisterClass *RC);
- /// \brief Emit a MachineInstr with one register operand and a result register
+ /// Emit a MachineInstr with one register operand and a result register
/// in the given register class.
unsigned fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill);
- /// \brief Emit a MachineInstr with two register operands and a result
+ /// Emit a MachineInstr with two register operands and a result
/// register in the given register class.
unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, unsigned Op1, bool Op1IsKill);
- /// \brief Emit a MachineInstr with three register operands and a result
+ /// Emit a MachineInstr with three register operands and a result
/// register in the given register class.
unsigned fastEmitInst_rrr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, unsigned Op1, bool Op1IsKill,
unsigned Op2, bool Op2IsKill);
- /// \brief Emit a MachineInstr with a register operand, an immediate, and a
+ /// Emit a MachineInstr with a register operand, an immediate, and a
/// result register in the given register class.
unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, uint64_t Imm);
- /// \brief Emit a MachineInstr with one register operand and two immediate
+ /// Emit a MachineInstr with one register operand and two immediate
/// operands.
unsigned fastEmitInst_rii(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, uint64_t Imm1, uint64_t Imm2);
- /// \brief Emit a MachineInstr with a floating point immediate, and a result
+ /// Emit a MachineInstr with a floating point immediate, and a result
/// register in the given register class.
unsigned fastEmitInst_f(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
const ConstantFP *FPImm);
- /// \brief Emit a MachineInstr with two register operands, an immediate, and a
+ /// Emit a MachineInstr with two register operands, an immediate, and a
/// result register in the given register class.
unsigned fastEmitInst_rri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, unsigned Op1, bool Op1IsKill,
uint64_t Imm);
- /// \brief Emit a MachineInstr with a single immediate operand, and a result
+ /// Emit a MachineInstr with a single immediate operand, and a result
/// register in the given register class.
- unsigned fastEmitInst_i(unsigned MachineInstrOpcode,
+ unsigned fastEmitInst_i(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, uint64_t Imm);
- /// \brief Emit a MachineInstr for an extract_subreg from a specified index of
+ /// Emit a MachineInstr for an extract_subreg from a specified index of
/// a superregister to a specified type.
unsigned fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill,
uint32_t Idx);
- /// \brief Emit MachineInstrs to compute the value of Op with all but the
+ /// Emit MachineInstrs to compute the value of Op with all but the
/// least significant bit set to zero.
unsigned fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill);
- /// \brief Emit an unconditional branch to the given block, unless it is the
+ /// Emit an unconditional branch to the given block, unless it is the
/// immediate (fall-through) successor, and update the CFG.
- void fastEmitBranch(MachineBasicBlock *MBB, const DebugLoc &DL);
+ void fastEmitBranch(MachineBasicBlock *MSucc, const DebugLoc &DbgLoc);
/// Emit an unconditional branch to \p FalseMBB, obtains the branch weight
/// and adds TrueMBB and FalseMBB to the successor list.
void finishCondBranch(const BasicBlock *BranchBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB);
- /// \brief Update the value map to include the new mapping for this
+ /// Update the value map to include the new mapping for this
/// instruction, or insert an extra copy to get the result in a previous
/// determined register.
///
@@ -464,26 +471,26 @@ protected:
unsigned createResultReg(const TargetRegisterClass *RC);
- /// \brief Try to constrain Op so that it is usable by argument OpNum of the
+ /// Try to constrain Op so that it is usable by argument OpNum of the
/// provided MCInstrDesc. If this fails, create a new virtual register in the
/// correct class and COPY the value there.
unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
unsigned OpNum);
- /// \brief Emit a constant in a register using target-specific logic, such as
+ /// Emit a constant in a register using target-specific logic, such as
/// constant pool loads.
virtual unsigned fastMaterializeConstant(const Constant *C) { return 0; }
- /// \brief Emit an alloca address in a register using target-specific logic.
+ /// Emit an alloca address in a register using target-specific logic.
virtual unsigned fastMaterializeAlloca(const AllocaInst *C) { return 0; }
- /// \brief Emit the floating-point constant +0.0 in a register using target-
+ /// Emit the floating-point constant +0.0 in a register using target-
/// specific logic.
virtual unsigned fastMaterializeFloatZero(const ConstantFP *CF) {
return 0;
}
- /// \brief Check if \c Add is an add that can be safely folded into \c GEP.
+ /// Check if \c Add is an add that can be safely folded into \c GEP.
///
/// \c Add can be folded into \c GEP if:
/// - \c Add is an add,
@@ -492,16 +499,16 @@ protected:
/// - \c Add has a constant operand.
bool canFoldAddIntoGEP(const User *GEP, const Value *Add);
- /// \brief Test whether the given value has exactly one use.
+ /// Test whether the given value has exactly one use.
bool hasTrivialKill(const Value *V);
- /// \brief Create a machine mem operand from the given instruction.
+ /// Create a machine mem operand from the given instruction.
MachineMemOperand *createMachineMemOperandFor(const Instruction *I) const;
CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) const;
bool lowerCallTo(const CallInst *CI, MCSymbol *Symbol, unsigned NumArgs);
- bool lowerCallTo(const CallInst *CI, const char *SymbolName,
+ bool lowerCallTo(const CallInst *CI, const char *SymName,
unsigned NumArgs);
bool lowerCallTo(CallLoweringInfo &CLI);
@@ -518,23 +525,24 @@ protected:
}
bool lowerCall(const CallInst *I);
- /// \brief Select and emit code for a binary operator instruction, which has
+ /// Select and emit code for a binary operator instruction, which has
/// an opcode which directly corresponds to the given ISD opcode.
bool selectBinaryOp(const User *I, unsigned ISDOpcode);
bool selectFNeg(const User *I);
bool selectGetElementPtr(const User *I);
bool selectStackmap(const CallInst *I);
bool selectPatchpoint(const CallInst *I);
- bool selectCall(const User *Call);
+ bool selectCall(const User *I);
bool selectIntrinsicCall(const IntrinsicInst *II);
bool selectBitCast(const User *I);
bool selectCast(const User *I, unsigned Opcode);
- bool selectExtractValue(const User *I);
+ bool selectExtractValue(const User *U);
bool selectInsertValue(const User *I);
bool selectXRayCustomEvent(const CallInst *II);
+ bool selectXRayTypedEvent(const CallInst *II);
private:
- /// \brief Handle PHI nodes in successor blocks.
+ /// Handle PHI nodes in successor blocks.
///
/// Emit code to ensure constants are copied into registers when needed.
/// Remember the virtual registers that need to be added to the Machine PHI
@@ -543,27 +551,41 @@ private:
/// correspond to a different MBB than the end.
bool handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
- /// \brief Helper for materializeRegForValue to materialize a constant in a
+ /// Helper for materializeRegForValue to materialize a constant in a
/// target-independent way.
unsigned materializeConstant(const Value *V, MVT VT);
- /// \brief Helper for getRegForVale. This function is called when the value
+ /// Helper for getRegForValue. This function is called when the value
/// isn't already available in a register and must be materialized with new
/// instructions.
unsigned materializeRegForValue(const Value *V, MVT VT);
- /// \brief Clears LocalValueMap and moves the area for the new local variables
+ /// Clears LocalValueMap and moves the area for the new local variables
/// to the beginning of the block. It helps to avoid spilling cached variables
/// across heavy instructions like calls.
void flushLocalValueMap();
- /// \brief Removes dead local value instructions after SavedLastLocalvalue.
+ /// Removes dead local value instructions after SavedLastLocalValue.
void removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue);
- /// \brief Insertion point before trying to select the current instruction.
+ struct InstOrderMap {
+ DenseMap<MachineInstr *, unsigned> Orders;
+ MachineInstr *FirstTerminator = nullptr;
+ unsigned FirstTerminatorOrder = std::numeric_limits<unsigned>::max();
+
+ void initialize(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator LastFlushPoint);
+ };
+
+ /// Sinks the local value materialization instruction LocalMI to its first use
+ /// in the basic block, or deletes it if it is not used.
+ void sinkLocalValueMaterialization(MachineInstr &LocalMI, unsigned DefReg,
+ InstOrderMap &OrderMap);
+
+ /// Insertion point before trying to select the current instruction.
MachineBasicBlock::iterator SavedInsertPt;
- /// \brief Add a stackmap or patchpoint intrinsic call's live variable
+ /// Add a stackmap or patchpoint intrinsic call's live variable
/// operands to a stackmap or patchpoint machine instruction.
bool addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
const CallInst *CI, unsigned StartIdx);
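
As a rough illustration of the interface declared above, a target's FastISel subclass typically overrides fastSelectInstruction, obtains vregs with getRegForValue, emits through the fastEmitInst_* helpers, and records the result with updateValueMap. The sketch below is hypothetical: MyTargetFastISel, MYTGT::ADDrr and MYTGT::GPRRegClass are placeholders, not part of this patch.

class MyTargetFastISel final : public FastISel {
public:
  MyTargetFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo)
      : FastISel(FuncInfo, LibInfo) {}

  bool fastSelectInstruction(const Instruction *I) override {
    // Hypothetical target: only integer adds are handled here; everything else
    // falls back to SelectionDAG by returning false.
    const auto *BO = dyn_cast<BinaryOperator>(I);
    if (!BO || BO->getOpcode() != Instruction::Add)
      return false;
    unsigned Op0 = getRegForValue(BO->getOperand(0));
    unsigned Op1 = getRegForValue(BO->getOperand(1));
    if (!Op0 || !Op1)
      return false;
    unsigned ResultReg =
        fastEmitInst_rr(MYTGT::ADDrr, &MYTGT::GPRRegClass,
                        Op0, hasTrivialKill(BO->getOperand(0)),
                        Op1, hasTrivialKill(BO->getOperand(1)));
    updateValueMap(BO, ResultReg); // make the result visible to later uses
    return true;
  }
};
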
diff --git a/contrib/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/contrib/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 3b39d87ffb4a..2da00b7d61ab 100644
--- a/contrib/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -118,6 +118,17 @@ public:
/// cross-basic-block values.
DenseMap<const Value *, unsigned> ValueMap;
+ /// The VirtReg2Value map is needed by the Divergence Analysis driven
+ /// instruction selection. It is the inverse of ValueMap, computed lazily,
+ /// on demand. It is used to get the Value corresponding to a live-in
+ /// virtual register and is queried from
+ /// TargetLoweringInfo::isSDNodeSourceOfDivergence.
+ DenseMap<unsigned, const Value*> VirtReg2Value;
+
+ /// This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence
+ /// to get the Value corresponding to the live-in virtual register.
+ const Value * getValueFromVirtualReg(unsigned Vreg);
+
/// Track virtual registers created for exception pointers.
DenseMap<const Value *, unsigned> CatchPadExceptionPointers;
@@ -167,6 +178,8 @@ public:
/// RegFixups - Registers which need to be replaced after isel is done.
DenseMap<unsigned, unsigned> RegFixups;
+ DenseSet<unsigned> RegsWithFixups;
+
/// StatepointStackSlots - A list of temporary stack slots (frame indices)
/// used to spill values at a statepoint. We store them here to enable
/// reuse of the same stack slots across different statepoints in different
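
The comment above describes VirtReg2Value as a lazily built inverse of ValueMap. One plausible shape of getValueFromVirtualReg under that description (the committed implementation may differ) is:

const Value *FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) {
  if (VirtReg2Value.empty()) {
    // First query: invert the existing Value -> vreg mapping.
    for (const auto &P : ValueMap)
      VirtReg2Value[P.second] = P.first;
  }
  auto It = VirtReg2Value.find(Vreg);
  return It == VirtReg2Value.end() ? nullptr : It->second;
}
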
diff --git a/contrib/llvm/include/llvm/CodeGen/GCStrategy.h b/contrib/llvm/include/llvm/CodeGen/GCStrategy.h
index 16168e785f81..91604fd2df87 100644
--- a/contrib/llvm/include/llvm/CodeGen/GCStrategy.h
+++ b/contrib/llvm/include/llvm/CodeGen/GCStrategy.h
@@ -105,12 +105,12 @@ public:
/// By default, write barriers are replaced with simple store
/// instructions. If true, you must provide a custom pass to lower
- /// calls to @llvm.gcwrite.
+ /// calls to \@llvm.gcwrite.
bool customWriteBarrier() const { return CustomWriteBarriers; }
/// By default, read barriers are replaced with simple load
/// instructions. If true, you must provide a custom pass to lower
- /// calls to @llvm.gcread.
+ /// calls to \@llvm.gcread.
bool customReadBarrier() const { return CustomReadBarriers; }
/// Returns true if this strategy is expecting the use of gc.statepoints,
@@ -147,7 +147,7 @@ public:
/// By default, roots are left for the code generator so it can generate a
/// stack map. If true, you must provide a custom pass to lower
- /// calls to @llvm.gcroot.
+ /// calls to \@llvm.gcroot.
bool customRoots() const { return CustomRoots; }
/// If set, gcroot intrinsics should initialize their allocas to null
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index ba84d76de164..58eb412d8c24 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -17,11 +17,11 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include <cstdint>
#include <functional>
@@ -123,7 +123,7 @@ protected:
}
template <typename FuncInfoTy>
- void setArgFlags(ArgInfo &Arg, unsigned OpNum, const DataLayout &DL,
+ void setArgFlags(ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL,
const FuncInfoTy &FuncInfo) const;
/// Invoke Handler::assignArg on each of the given \p Args and then use
@@ -131,7 +131,7 @@ protected:
///
/// \return True if everything has succeeded, false otherwise.
bool handleAssignments(MachineIRBuilder &MIRBuilder, ArrayRef<ArgInfo> Args,
- ValueHandler &Callback) const;
+ ValueHandler &Handler) const;
public:
CallLowering(const TargetLowering *TLI) : TLI(TLI) {}
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
new file mode 100644
index 000000000000..36a33deb4a64
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
@@ -0,0 +1,43 @@
+//== ----- llvm/CodeGen/GlobalISel/Combiner.h --------------------- == //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// This contains common code to drive combines. Combiner passes will need to
+/// set up a CombinerInfo and call combineMachineInstrs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_GLOBALISEL_COMBINER_H
+#define LLVM_CODEGEN_GLOBALISEL_COMBINER_H
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+class MachineRegisterInfo;
+class CombinerInfo;
+class TargetPassConfig;
+class MachineFunction;
+
+class Combiner {
+public:
+ Combiner(CombinerInfo &CombinerInfo, const TargetPassConfig *TPC);
+
+ bool combineMachineInstrs(MachineFunction &MF);
+
+protected:
+ CombinerInfo &CInfo;
+
+ MachineRegisterInfo *MRI = nullptr;
+ const TargetPassConfig *TPC;
+ MachineIRBuilder Builder;
+};
+
+} // End namespace llvm.
+
+#endif // LLVM_CODEGEN_GLOBALISEL_COMBINER_H
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
new file mode 100644
index 000000000000..5d5b8398452c
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -0,0 +1,44 @@
+//== llvm/CodeGen/GlobalISel/CombinerHelper.h -------------- -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--------------------------------------------------------------------===//
+//
+/// This contains common combine transformations that may be used in a combine
+/// pass, or by the target elsewhere.
+/// Targets can pick individual opcode transformations from the helper or use
+/// tryCombine, which invokes all transformations. All of the transformations
+/// return true if the MachineInstr changed and false otherwise.
+//
+//===--------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H
+#define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H
+
+namespace llvm {
+
+class MachineIRBuilder;
+class MachineRegisterInfo;
+class MachineInstr;
+
+class CombinerHelper {
+ MachineIRBuilder &Builder;
+ MachineRegisterInfo &MRI;
+
+public:
+ CombinerHelper(MachineIRBuilder &B);
+
+ /// If \p MI is COPY, try to combine it.
+ /// Returns true if MI changed.
+ bool tryCombineCopy(MachineInstr &MI);
+
+ /// Try to transform \p MI by using all of the above
+ /// combine functions. Returns true if changed.
+ bool tryCombine(MachineInstr &MI);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
new file mode 100644
index 000000000000..1d248547adbf
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
@@ -0,0 +1,48 @@
+//===- llvm/CodeGen/GlobalISel/CombinerInfo.h ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Interface for targets to specify which operations are combined, how, and when.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_GLOBALISEL_COMBINER_INFO_H
+#define LLVM_CODEGEN_GLOBALISEL_COMBINER_INFO_H
+
+#include <cassert>
+namespace llvm {
+
+class LegalizerInfo;
+class MachineInstr;
+class MachineIRBuilder;
+class MachineRegisterInfo;
+// Contains information relevant to enabling/disabling various combines for a
+// pass.
+class CombinerInfo {
+public:
+ CombinerInfo(bool AllowIllegalOps, bool ShouldLegalizeIllegal,
+ LegalizerInfo *LInfo)
+ : IllegalOpsAllowed(AllowIllegalOps),
+ LegalizeIllegalOps(ShouldLegalizeIllegal), LInfo(LInfo) {
+ assert(((AllowIllegalOps || !LegalizeIllegalOps) || LInfo) &&
+ "Expecting legalizerInfo when illegalops not allowed");
+ }
+ virtual ~CombinerInfo() = default;
+ /// If \p IllegalOpsAllowed is false, the CombinerHelper will make use of
+ /// the legalizerInfo to check for legality before each transformation.
+ bool IllegalOpsAllowed; // TODO: Make use of this.
+
+ /// If \p LegalizeIllegalOps is true, the Combiner will also legalize the
+ /// illegal ops that are created.
+ bool LegalizeIllegalOps; // TODO: Make use of this.
+ const LegalizerInfo *LInfo;
+ virtual bool combine(MachineInstr &MI, MachineIRBuilder &B) const = 0;
+};
+} // namespace llvm
+
+#endif
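
Taken together, the three headers above are wired up roughly as follows: a target derives a CombinerInfo whose combine() delegates to CombinerHelper, and a machine-function pass hands that object to Combiner::combineMachineInstrs. A hypothetical sketch, where MyTargetCombinerInfo and the pass boilerplate are invented for illustration:

class MyTargetCombinerInfo : public CombinerInfo {
public:
  MyTargetCombinerInfo()
      : CombinerInfo(/*AllowIllegalOps=*/true, /*ShouldLegalizeIllegal=*/false,
                     /*LInfo=*/nullptr) {}

  bool combine(MachineInstr &MI, MachineIRBuilder &B) const override {
    CombinerHelper Helper(B);
    return Helper.tryCombine(MI); // run every generic combine on MI
  }
};

// Inside some MachineFunctionPass::runOnMachineFunction(MachineFunction &MF):
//   MyTargetCombinerInfo CInfo;
//   Combiner C(CInfo, &getAnalysis<TargetPassConfig>());
//   bool Changed = C.combineMachineInstrs(MF);
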
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h
new file mode 100644
index 000000000000..8d61f9a68279
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h
@@ -0,0 +1,134 @@
+//===-- llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a version of MachineIRBuilder which does trivial
+/// constant folding.
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+
+namespace llvm {
+
+static Optional<APInt> ConstantFoldBinOp(unsigned Opcode, const unsigned Op1,
+ const unsigned Op2,
+ const MachineRegisterInfo &MRI) {
+ auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI);
+ auto MaybeOp2Cst = getConstantVRegVal(Op2, MRI);
+ if (MaybeOp1Cst && MaybeOp2Cst) {
+ LLT Ty = MRI.getType(Op1);
+ APInt C1(Ty.getSizeInBits(), *MaybeOp1Cst, true);
+ APInt C2(Ty.getSizeInBits(), *MaybeOp2Cst, true);
+ switch (Opcode) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ return C1 + C2;
+ case TargetOpcode::G_AND:
+ return C1 & C2;
+ case TargetOpcode::G_ASHR:
+ return C1.ashr(C2);
+ case TargetOpcode::G_LSHR:
+ return C1.lshr(C2);
+ case TargetOpcode::G_MUL:
+ return C1 * C2;
+ case TargetOpcode::G_OR:
+ return C1 | C2;
+ case TargetOpcode::G_SHL:
+ return C1 << C2;
+ case TargetOpcode::G_SUB:
+ return C1 - C2;
+ case TargetOpcode::G_XOR:
+ return C1 ^ C2;
+ case TargetOpcode::G_UDIV:
+ if (!C2.getBoolValue())
+ break;
+ return C1.udiv(C2);
+ case TargetOpcode::G_SDIV:
+ if (!C2.getBoolValue())
+ break;
+ return C1.sdiv(C2);
+ case TargetOpcode::G_UREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.urem(C2);
+ case TargetOpcode::G_SREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.srem(C2);
+ }
+ }
+ return None;
+}
+
+/// An MIRBuilder which does trivial constant folding of binary ops.
+/// Calls to buildInstr will also try to constant fold binary ops.
+class ConstantFoldingMIRBuilder
+ : public FoldableInstructionsBuilder<ConstantFoldingMIRBuilder> {
+public:
+ // Pull in base class constructors.
+ using FoldableInstructionsBuilder<
+ ConstantFoldingMIRBuilder>::FoldableInstructionsBuilder;
+ // Unhide buildInstr
+ using FoldableInstructionsBuilder<ConstantFoldingMIRBuilder>::buildInstr;
+
+ // Implement buildBinaryOp required by FoldableInstructionsBuilder which
+ // tries to constant fold.
+ MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Dst,
+ unsigned Src0, unsigned Src1) {
+ validateBinaryOp(Dst, Src0, Src1);
+ auto MaybeCst = ConstantFoldBinOp(Opcode, Src0, Src1, getMF().getRegInfo());
+ if (MaybeCst)
+ return buildConstant(Dst, MaybeCst->getSExtValue());
+ return buildInstr(Opcode).addDef(Dst).addUse(Src0).addUse(Src1);
+ }
+
+ template <typename DstTy, typename UseArg1Ty, typename UseArg2Ty>
+ MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty, UseArg1Ty &&Arg1,
+ UseArg2Ty &&Arg2) {
+ unsigned Dst = getDestFromArg(Ty);
+ return buildInstr(Opc, Dst, getRegFromArg(std::forward<UseArg1Ty>(Arg1)),
+ getRegFromArg(std::forward<UseArg2Ty>(Arg2)));
+ }
+
+ // Try to provide an overload for buildInstr for binary ops in order to
+ // constant fold.
+ MachineInstrBuilder buildInstr(unsigned Opc, unsigned Dst, unsigned Src0,
+ unsigned Src1) {
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_XOR:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_SREM: {
+ return buildBinaryOp(Opc, Dst, Src0, Src1);
+ }
+ }
+ return buildInstr(Opc).addDef(Dst).addUse(Src0).addUse(Src1);
+ }
+
+ // Fallback implementation of buildInstr.
+ template <typename DstTy, typename... UseArgsTy>
+ MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty,
+ UseArgsTy &&... Args) {
+ auto MIB = buildInstr(Opc).addDef(getDestFromArg(Ty));
+ addUsesFromArgs(MIB, std::forward<UseArgsTy>(Args)...);
+ return MIB;
+ }
+};
+} // namespace llvm
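
A small illustration of the behaviour described above, assuming the inherited constructors accept a MachineFunction and that an insertion point has been set; the registers and the 32-bit type are made up for the example. Because both operands are G_CONSTANTs, the buildInstr overload routes through buildBinaryOp and emits a G_CONSTANT instead of a G_ADD.

void buildFoldedAdd(MachineFunction &MF) {
  ConstantFoldingMIRBuilder B(MF); // assumed constructor; insertion-point setup elided
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LLT S32 = LLT::scalar(32);
  unsigned C2 = MRI.createGenericVirtualRegister(S32);
  unsigned C3 = MRI.createGenericVirtualRegister(S32);
  unsigned Dst = MRI.createGenericVirtualRegister(S32);
  B.buildConstant(C2, 2);
  B.buildConstant(C3, 3);
  B.buildInstr(TargetOpcode::G_ADD, Dst, C2, C3); // folds to G_CONSTANT i32 5
}
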
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 7061c014d9b7..f3553966fcdf 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Types.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/IR/Intrinsics.h"
#include <memory>
#include <utility>
@@ -63,9 +64,83 @@ private:
/// Interface used to lower everything related to calls.
const CallLowering *CLI;
- /// Mapping of the values of the current LLVM IR function
- /// to the related virtual registers.
- ValueToVReg ValToVReg;
+ /// This class contains the mapping between the Values to vreg related data.
+ class ValueToVRegInfo {
+ public:
+ ValueToVRegInfo() = default;
+
+ using VRegListT = SmallVector<unsigned, 1>;
+ using OffsetListT = SmallVector<uint64_t, 1>;
+
+ using const_vreg_iterator =
+ DenseMap<const Value *, VRegListT *>::const_iterator;
+ using const_offset_iterator =
+ DenseMap<const Value *, OffsetListT *>::const_iterator;
+
+ inline const_vreg_iterator vregs_end() const { return ValToVRegs.end(); }
+
+ VRegListT *getVRegs(const Value &V) {
+ auto It = ValToVRegs.find(&V);
+ if (It != ValToVRegs.end())
+ return It->second;
+
+ return insertVRegs(V);
+ }
+
+ OffsetListT *getOffsets(const Value &V) {
+ auto It = TypeToOffsets.find(V.getType());
+ if (It != TypeToOffsets.end())
+ return It->second;
+
+ return insertOffsets(V);
+ }
+
+ const_vreg_iterator findVRegs(const Value &V) const {
+ return ValToVRegs.find(&V);
+ }
+
+ bool contains(const Value &V) const {
+ return ValToVRegs.find(&V) != ValToVRegs.end();
+ }
+
+ void reset() {
+ ValToVRegs.clear();
+ TypeToOffsets.clear();
+ VRegAlloc.DestroyAll();
+ OffsetAlloc.DestroyAll();
+ }
+
+ private:
+ VRegListT *insertVRegs(const Value &V) {
+ assert(ValToVRegs.find(&V) == ValToVRegs.end() && "Value already exists");
+
+ // We use placement new with our fast allocator since we never try to free
+ // the vectors until translation is finished.
+ auto *VRegList = new (VRegAlloc.Allocate()) VRegListT();
+ ValToVRegs[&V] = VRegList;
+ return VRegList;
+ }
+
+ OffsetListT *insertOffsets(const Value &V) {
+ assert(TypeToOffsets.find(V.getType()) == TypeToOffsets.end() &&
+ "Type already exists");
+
+ auto *OffsetList = new (OffsetAlloc.Allocate()) OffsetListT();
+ TypeToOffsets[V.getType()] = OffsetList;
+ return OffsetList;
+ }
+ SpecificBumpPtrAllocator<VRegListT> VRegAlloc;
+ SpecificBumpPtrAllocator<OffsetListT> OffsetAlloc;
+
+ // We store pointers to vectors here since references may be invalidated
+ // while we hold them if we stored the vectors directly.
+ DenseMap<const Value *, VRegListT*> ValToVRegs;
+ DenseMap<const Type *, OffsetListT*> TypeToOffsets;
+ };
+
+ /// Mapping of the values of the current LLVM IR function to the related
+ /// virtual registers and offsets.
+ ValueToVRegInfo VMap;
// N.b. it's not completely obvious that this will be sufficient for every
// LLVM IR construct (with "invoke" being the obvious candidate to mess up our
@@ -82,7 +157,8 @@ private:
// List of stubbed PHI instructions, for values and basic blocks to be filled
// in once all MachineBasicBlocks have been created.
- SmallVector<std::pair<const PHINode *, MachineInstr *>, 4> PendingPHIs;
+ SmallVector<std::pair<const PHINode *, SmallVector<MachineInstr *, 1>>, 4>
+ PendingPHIs;
/// Record of what frame index has been allocated to specified allocas for
/// this function.
@@ -99,7 +175,7 @@ private:
/// The general algorithm is:
/// 1. Look for a virtual register for each operand or
/// create one.
- /// 2 Update the ValToVReg accordingly.
+ /// 2 Update the VMap accordingly.
/// 2.alt. For constant arguments, if they are compile time constants,
/// produce an immediate in the right operand and do not touch
/// ValToReg. Actually we will go with a virtual register for each
@@ -134,7 +210,7 @@ private:
/// Translate an LLVM string intrinsic (memcpy, memset, ...).
bool translateMemfunc(const CallInst &CI, MachineIRBuilder &MIRBuilder,
- unsigned Intrinsic);
+ unsigned ID);
void getStackGuard(unsigned DstReg, MachineIRBuilder &MIRBuilder);
@@ -146,6 +222,19 @@ private:
bool translateInlineAsm(const CallInst &CI, MachineIRBuilder &MIRBuilder);
+ // FIXME: temporary function to expose previous interface to call lowering
+ // until it is refactored.
+ /// Combines all component registers of \p V into a single scalar with size
+ /// "max(Offsets) + last size".
+ unsigned packRegs(const Value &V, MachineIRBuilder &MIRBuilder);
+
+ void unpackRegs(const Value &V, unsigned Src, MachineIRBuilder &MIRBuilder);
+
+ /// Returns true if the value should be split into multiple LLTs.
+ /// If \p Offsets is given then the split type's offsets will be stored in it.
+ bool valueIsSplit(const Value &V,
+ SmallVectorImpl<uint64_t> *Offsets = nullptr);
+
/// Translate call instruction.
/// \pre \p U is a call instruction.
bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
@@ -310,6 +399,9 @@ private:
bool translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder);
+ bool translateAtomicCmpXchg(const User &U, MachineIRBuilder &MIRBuilder);
+ bool translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder);
+
// Stubs to keep the compiler happy while we implement the rest of the
// translation.
bool translateResume(const User &U, MachineIRBuilder &MIRBuilder) {
@@ -327,14 +419,8 @@ private:
bool translateFence(const User &U, MachineIRBuilder &MIRBuilder) {
return false;
}
- bool translateAtomicCmpXchg(const User &U, MachineIRBuilder &MIRBuilder) {
- return false;
- }
- bool translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder) {
- return false;
- }
bool translateAddrSpaceCast(const User &U, MachineIRBuilder &MIRBuilder) {
- return false;
+ return translateCast(TargetOpcode::G_ADDRSPACE_CAST, U, MIRBuilder);
}
bool translateCleanupPad(const User &U, MachineIRBuilder &MIRBuilder) {
return false;
@@ -381,9 +467,24 @@ private:
// * Clear the different maps.
void finalizeFunction();
- /// Get the VReg that represents \p Val.
- /// If such VReg does not exist, it is created.
- unsigned getOrCreateVReg(const Value &Val);
+ /// Get the VRegs that represent \p Val.
+ /// Non-aggregate types have just one corresponding VReg and the list can be
+ /// used as a single "unsigned". Aggregates get flattened. If such VRegs do
+ /// not exist, they are created.
+ ArrayRef<unsigned> getOrCreateVRegs(const Value &Val);
+
+ unsigned getOrCreateVReg(const Value &Val) {
+ auto Regs = getOrCreateVRegs(Val);
+ if (Regs.empty())
+ return 0;
+ assert(Regs.size() == 1 &&
+ "attempt to get single VReg for aggregate or void");
+ return Regs[0];
+ }
+
+ /// Allocate some vregs and offsets in the VMap. Then populate just the
+ /// offsets while leaving the vregs empty.
+ ValueToVRegInfo::VRegListT &allocateVRegs(const Value &Val);
/// Get the frame index that represents \p Val.
/// If such VReg does not exist, it is created.
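
The packRegs comment above combines all component registers of a value into a single scalar of size "max(Offsets) + last size". A tiny worked form of that rule, with invented numbers: parts at bit offsets {0, 32, 96} with sizes {32, 64, 16} pack into a 96 + 16 = 112 bit scalar. A sketch of the size computation, assuming ascending offsets:

uint64_t packedSizeInBits(ArrayRef<uint64_t> Offsets, ArrayRef<uint64_t> Sizes) {
  assert(!Offsets.empty() && Offsets.size() == Sizes.size());
  // Offsets are ascending, so the last one is the maximum.
  return Offsets.back() + Sizes.back();
}
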
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
index 4264a866b6c0..471def7f45a3 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CodeGenCoverage.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
#include <bitset>
#include <cstddef>
#include <cstdint>
@@ -31,7 +32,6 @@ namespace llvm {
class APInt;
class APFloat;
-class LLT;
class MachineInstr;
class MachineInstrBuilder;
class MachineFunction;
@@ -81,6 +81,23 @@ enum {
/// failed match.
GIM_Try,
+ /// Switch over the opcode on the specified instruction
+ /// - InsnID - Instruction ID
+ /// - LowerBound - numerically minimum opcode supported
+ /// - UpperBound - numerically maximum + 1 opcode supported
+ /// - Default - failure jump target
+ /// - JumpTable... - (UpperBound - LowerBound) (at least 2) jump targets
+ GIM_SwitchOpcode,
+
+ /// Switch over the LLT on the specified instruction operand
+ /// - InsnID - Instruction ID
+ /// - OpIdx - Operand index
+ /// - LowerBound - numerically minimum Type ID supported
+ /// - UpperBound - numerically maximum + 1 Type ID supported
+ /// - Default - failure jump target
+ /// - JumpTable... - (UpperBound - LowerBound) (at least 2) jump targets
+ GIM_SwitchType,
+
/// Record the specified instruction
/// - NewInsnID - Instruction ID to define
/// - InsnID - Instruction ID
@@ -117,6 +134,23 @@ enum {
GIM_CheckAtomicOrdering,
GIM_CheckAtomicOrderingOrStrongerThan,
GIM_CheckAtomicOrderingWeakerThan,
+ /// Check the size of the memory access for the given machine memory operand.
+ /// - InsnID - Instruction ID
+ /// - MMOIdx - MMO index
+ /// - Size - The size in bytes of the memory access
+ GIM_CheckMemorySizeEqualTo,
+ /// Check the size of the memory access for the given machine memory operand
+ /// against the size of an operand.
+ /// - InsnID - Instruction ID
+ /// - MMOIdx - MMO index
+ /// - OpIdx - The operand index to compare the MMO against
+ GIM_CheckMemorySizeEqualToLLT,
+ GIM_CheckMemorySizeLessThanLLT,
+ GIM_CheckMemorySizeGreaterThanLLT,
+ /// Check a generic C++ instruction predicate
+ /// - InsnID - Instruction ID
+ /// - PredicateID - The ID of the predicate function to call
+ GIM_CheckCxxInsnPredicate,
/// Check the type for the specified operand
/// - InsnID - Instruction ID
@@ -133,12 +167,14 @@ enum {
/// - OpIdx - Operand index
/// - Expected register bank (specified as a register class)
GIM_CheckRegBankForClass,
+
/// Check the operand matches a complex predicate
/// - InsnID - Instruction ID
/// - OpIdx - Operand index
/// - RendererID - The renderer to hold the result
/// - Complex predicate ID
GIM_CheckComplexPattern,
+
/// Check the operand is a specific integer
/// - InsnID - Instruction ID
/// - OpIdx - Operand index
@@ -155,6 +191,7 @@ enum {
/// - OpIdx - Operand index
/// - Expected Intrinsic ID
GIM_CheckIntrinsicID,
+
/// Check the specified operand is an MBB
/// - InsnID - Instruction ID
/// - OpIdx - Operand index
@@ -183,6 +220,7 @@ enum {
/// - OldInsnID - Instruction ID to mutate
/// - NewOpcode - The new opcode to use
GIR_MutateOpcode,
+
/// Build a new instruction
/// - InsnID - Instruction ID to define
/// - Opcode - The new opcode to use
@@ -193,6 +231,7 @@ enum {
/// - OldInsnID - Instruction ID to copy from
/// - OpIdx - The operand to copy
GIR_Copy,
+
/// Copy an operand to the specified instruction or add a zero register if the
/// operand is a zero immediate.
/// - NewInsnID - Instruction ID to modify
@@ -206,6 +245,7 @@ enum {
/// - OpIdx - The operand to copy
/// - SubRegIdx - The subregister to copy
GIR_CopySubReg,
+
/// Add an implicit register def to the specified instruction
/// - InsnID - Instruction ID to modify
/// - RegNum - The register to add
@@ -218,10 +258,13 @@ enum {
/// - InsnID - Instruction ID to modify
/// - RegNum - The register to add
GIR_AddRegister,
- /// Add a a temporary register to the specified instruction
+
+ /// Add a temporary register to the specified instruction
/// - InsnID - Instruction ID to modify
/// - TempRegID - The temporary register ID to add
+ /// - TempRegFlags - The register flags to set
GIR_AddTempRegister,
+
/// Add an immediate to the specified instruction
/// - InsnID - Instruction ID to modify
/// - Imm - The immediate to add
@@ -230,11 +273,17 @@ enum {
/// - InsnID - Instruction ID to modify
/// - RendererID - The renderer to call
GIR_ComplexRenderer,
+
/// Render sub-operands of complex operands to the specified instruction
/// - InsnID - Instruction ID to modify
/// - RendererID - The renderer to call
/// - RenderOpID - The suboperand to render.
GIR_ComplexSubOperandRenderer,
+ /// Render operands to the specified instruction using a custom function
+ /// - InsnID - Instruction ID to modify
+ /// - OldInsnID - Instruction ID to get the matched operand from
+ /// - RendererFnID - Custom renderer function to call
+ GIR_CustomRenderer,
/// Render a G_CONSTANT operator as a sign-extended immediate.
/// - NewInsnID - Instruction ID to modify
@@ -242,24 +291,34 @@ enum {
/// The operand index is implicitly 1.
GIR_CopyConstantAsSImm,
+ /// Render a G_FCONSTANT operator as a sign-extended immediate.
+ /// - NewInsnID - Instruction ID to modify
+ /// - OldInsnID - Instruction ID to copy from
+ /// The operand index is implicitly 1.
+ GIR_CopyFConstantAsFPImm,
+
/// Constrain an instruction operand to a register class.
/// - InsnID - Instruction ID to modify
/// - OpIdx - Operand index
/// - RCEnum - Register class enumeration value
GIR_ConstrainOperandRC,
+
/// Constrain an instructions operands according to the instruction
/// description.
/// - InsnID - Instruction ID to modify
GIR_ConstrainSelectedInstOperands,
+
/// Merge all memory operands into instruction.
/// - InsnID - Instruction ID to modify
/// - MergeInsnID... - One or more Instruction ID to merge into the result.
/// - GIU_MergeMemOperands_EndOfList - Terminates the list of instructions to
/// merge.
GIR_MergeMemOperands,
+
/// Erase from parent.
/// - InsnID - Instruction ID to erase
GIR_EraseFromParent,
+
/// Create a new temporary register that's not constrained.
/// - TempRegID - The temporary register ID to initialize.
/// - Expected type
@@ -271,6 +330,9 @@ enum {
/// Increment the rule coverage counter.
/// - RuleID - The ID of the rule that was covered.
GIR_Coverage,
+
+ /// Keeps track of the number of GI opcodes. Must be the last entry.
+ GIU_NumOpcodes,
};
enum {
@@ -311,11 +373,27 @@ protected:
};
public:
- template <class PredicateBitset, class ComplexMatcherMemFn>
- struct MatcherInfoTy {
+ template <class PredicateBitset, class ComplexMatcherMemFn,
+ class CustomRendererFn>
+ struct ISelInfoTy {
+ ISelInfoTy(const LLT *TypeObjects, size_t NumTypeObjects,
+ const PredicateBitset *FeatureBitsets,
+ const ComplexMatcherMemFn *ComplexPredicates,
+ const CustomRendererFn *CustomRenderers)
+ : TypeObjects(TypeObjects),
+ FeatureBitsets(FeatureBitsets),
+ ComplexPredicates(ComplexPredicates),
+ CustomRenderers(CustomRenderers) {
+
+ for (size_t I = 0; I < NumTypeObjects; ++I)
+ TypeIDMap[TypeObjects[I]] = I;
+ }
const LLT *TypeObjects;
const PredicateBitset *FeatureBitsets;
const ComplexMatcherMemFn *ComplexPredicates;
+ const CustomRendererFn *CustomRenderers;
+
+ SmallDenseMap<LLT, unsigned, 64> TypeIDMap;
};
protected:
@@ -324,23 +402,35 @@ protected:
/// Execute a given matcher table and return true if the match was successful
/// and false otherwise.
template <class TgtInstructionSelector, class PredicateBitset,
- class ComplexMatcherMemFn>
+ class ComplexMatcherMemFn, class CustomRendererFn>
bool executeMatchTable(
TgtInstructionSelector &ISel, NewMIVector &OutMIs, MatcherState &State,
- const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> &MatcherInfo,
+ const ISelInfoTy<PredicateBitset, ComplexMatcherMemFn, CustomRendererFn>
+ &ISelInfo,
const int64_t *MatchTable, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI, const PredicateBitset &AvailableFeatures,
CodeGenCoverage &CoverageInfo) const;
+ virtual const int64_t *getMatchTable() const {
+ llvm_unreachable("Should have been overridden by tablegen if used");
+ }
+
virtual bool testImmPredicate_I64(unsigned, int64_t) const {
- llvm_unreachable("Subclasses must override this to use tablegen");
+ llvm_unreachable(
+ "Subclasses must override this with a tablegen-erated function");
}
virtual bool testImmPredicate_APInt(unsigned, const APInt &) const {
- llvm_unreachable("Subclasses must override this to use tablegen");
+ llvm_unreachable(
+ "Subclasses must override this with a tablegen-erated function");
}
virtual bool testImmPredicate_APFloat(unsigned, const APFloat &) const {
- llvm_unreachable("Subclasses must override this to use tablegen");
+ llvm_unreachable(
+ "Subclasses must override this with a tablegen-erated function");
+ }
+ virtual bool testMIPredicate_MI(unsigned, const MachineInstr &) const {
+ llvm_unreachable(
+ "Subclasses must override this with a tablegen-erated function");
}
/// Constrain a register operand of an instruction \p I to a specified
@@ -353,20 +443,6 @@ protected:
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) const;
- /// Mutate the newly-selected instruction \p I to constrain its (possibly
- /// generic) virtual register operands to the instruction's register class.
- /// This could involve inserting COPYs before (for uses) or after (for defs).
- /// This requires the number of operands to match the instruction description.
- /// \returns whether operand regclass constraining succeeded.
- ///
- // FIXME: Not all instructions have the same number of operands. We should
- // probably expose a constrain helper per operand and let the target selector
- // constrain individual registers, like fast-isel.
- bool constrainSelectedInstRegOperands(MachineInstr &I,
- const TargetInstrInfo &TII,
- const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) const;
-
bool isOperandImmEqual(const MachineOperand &MO, int64_t Value,
const MachineRegisterInfo &MRI) const;
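
The GIM_SwitchOpcode record documented above is laid out as InsnID, LowerBound, UpperBound, Default, followed by (UpperBound - LowerBound) jump targets. A standalone sketch of how such a record is decoded; the real interpreter is executeMatchTable in the next file, and this function is only an illustration of the table layout.

int64_t decodeSwitchOpcode(const int64_t *MatchTable, uint64_t Idx,
                           int64_t Opcode) {
  Idx++;                                  // InsnID: which recorded instruction
  int64_t LowerBound = MatchTable[Idx++]; // first opcode covered by the table
  int64_t UpperBound = MatchTable[Idx++]; // one past the last covered opcode
  int64_t Default = MatchTable[Idx++];    // jump target when nothing matches
  if (Opcode < LowerBound || UpperBound <= Opcode)
    return Default;
  int64_t Target = MatchTable[Idx + (Opcode - LowerBound)];
  return Target ? Target : Default;       // a zero slot means "no rule here"
}
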
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
index bf834cf8f5e3..2003a79f6b20 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -40,19 +41,22 @@ enum {
GIPFP_I64_Invalid = 0,
GIPFP_APInt_Invalid = 0,
GIPFP_APFloat_Invalid = 0,
+ GIPFP_MI_Invalid = 0,
};
template <class TgtInstructionSelector, class PredicateBitset,
- class ComplexMatcherMemFn>
+ class ComplexMatcherMemFn, class CustomRendererFn>
bool InstructionSelector::executeMatchTable(
TgtInstructionSelector &ISel, NewMIVector &OutMIs, MatcherState &State,
- const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> &MatcherInfo,
+ const ISelInfoTy<PredicateBitset, ComplexMatcherMemFn, CustomRendererFn>
+ &ISelInfo,
const int64_t *MatchTable, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI, const PredicateBitset &AvailableFeatures,
CodeGenCoverage &CoverageInfo) const {
+
uint64_t CurrentIdx = 0;
- SmallVector<uint64_t, 8> OnFailResumeAt;
+ SmallVector<uint64_t, 4> OnFailResumeAt;
enum RejectAction { RejectAndGiveUp, RejectAndResume };
auto handleReject = [&]() -> RejectAction {
@@ -60,8 +64,7 @@ bool InstructionSelector::executeMatchTable(
dbgs() << CurrentIdx << ": Rejected\n");
if (OnFailResumeAt.empty())
return RejectAndGiveUp;
- CurrentIdx = OnFailResumeAt.back();
- OnFailResumeAt.pop_back();
+ CurrentIdx = OnFailResumeAt.pop_back_val();
DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
dbgs() << CurrentIdx << ": Resume at " << CurrentIdx << " ("
<< OnFailResumeAt.size() << " try-blocks remain)\n");
@@ -70,7 +73,8 @@ bool InstructionSelector::executeMatchTable(
while (true) {
assert(CurrentIdx != ~0u && "Invalid MatchTable index");
- switch (MatchTable[CurrentIdx++]) {
+ int64_t MatcherOpcode = MatchTable[CurrentIdx++];
+ switch (MatcherOpcode) {
case GIM_Try: {
DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
dbgs() << CurrentIdx << ": Begin try-block\n");
@@ -124,8 +128,8 @@ bool InstructionSelector::executeMatchTable(
dbgs() << CurrentIdx
<< ": GIM_CheckFeatures(ExpectedBitsetID="
<< ExpectedBitsetID << ")\n");
- if ((AvailableFeatures & MatcherInfo.FeatureBitsets[ExpectedBitsetID]) !=
- MatcherInfo.FeatureBitsets[ExpectedBitsetID]) {
+ if ((AvailableFeatures & ISelInfo.FeatureBitsets[ExpectedBitsetID]) !=
+ ISelInfo.FeatureBitsets[ExpectedBitsetID]) {
if (handleReject() == RejectAndGiveUp)
return false;
}
@@ -136,12 +140,13 @@ bool InstructionSelector::executeMatchTable(
int64_t InsnID = MatchTable[CurrentIdx++];
int64_t Expected = MatchTable[CurrentIdx++];
+ assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
unsigned Opcode = State.MIs[InsnID]->getOpcode();
+
DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
dbgs() << CurrentIdx << ": GIM_CheckOpcode(MIs[" << InsnID
<< "], ExpectedOpcode=" << Expected
<< ") // Got=" << Opcode << "\n");
- assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
if (Opcode != Expected) {
if (handleReject() == RejectAndGiveUp)
return false;
@@ -149,6 +154,77 @@ bool InstructionSelector::executeMatchTable(
break;
}
+ case GIM_SwitchOpcode: {
+ int64_t InsnID = MatchTable[CurrentIdx++];
+ int64_t LowerBound = MatchTable[CurrentIdx++];
+ int64_t UpperBound = MatchTable[CurrentIdx++];
+ int64_t Default = MatchTable[CurrentIdx++];
+
+ assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+ const int64_t Opcode = State.MIs[InsnID]->getOpcode();
+
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), {
+ dbgs() << CurrentIdx << ": GIM_SwitchOpcode(MIs[" << InsnID << "], ["
+ << LowerBound << ", " << UpperBound << "), Default=" << Default
+ << ", JumpTable...) // Got=" << Opcode << "\n";
+ });
+ if (Opcode < LowerBound || UpperBound <= Opcode) {
+ CurrentIdx = Default;
+ break;
+ }
+ CurrentIdx = MatchTable[CurrentIdx + (Opcode - LowerBound)];
+ if (!CurrentIdx) {
+ CurrentIdx = Default;
+ break;
+ }
+ OnFailResumeAt.push_back(Default);
+ break;
+ }
+
+ case GIM_SwitchType: {
+ int64_t InsnID = MatchTable[CurrentIdx++];
+ int64_t OpIdx = MatchTable[CurrentIdx++];
+ int64_t LowerBound = MatchTable[CurrentIdx++];
+ int64_t UpperBound = MatchTable[CurrentIdx++];
+ int64_t Default = MatchTable[CurrentIdx++];
+
+ assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+ MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx);
+
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), {
+ dbgs() << CurrentIdx << ": GIM_SwitchType(MIs[" << InsnID
+ << "]->getOperand(" << OpIdx << "), [" << LowerBound << ", "
+ << UpperBound << "), Default=" << Default
+ << ", JumpTable...) // Got=";
+ if (!MO.isReg())
+ dbgs() << "Not a VReg\n";
+ else
+ dbgs() << MRI.getType(MO.getReg()) << "\n";
+ });
+ if (!MO.isReg()) {
+ CurrentIdx = Default;
+ break;
+ }
+ const LLT Ty = MRI.getType(MO.getReg());
+ const auto TyI = ISelInfo.TypeIDMap.find(Ty);
+ if (TyI == ISelInfo.TypeIDMap.end()) {
+ CurrentIdx = Default;
+ break;
+ }
+ const int64_t TypeID = TyI->second;
+ if (TypeID < LowerBound || UpperBound <= TypeID) {
+ CurrentIdx = Default;
+ break;
+ }
+ CurrentIdx = MatchTable[CurrentIdx + (TypeID - LowerBound)];
+ if (!CurrentIdx) {
+ CurrentIdx = Default;
+ break;
+ }
+ OnFailResumeAt.push_back(Default);
+ break;
+ }
+
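Both switch opcodes above use the same jump-table scheme: bounds-check the key (an opcode or a type ID) against [LowerBound, UpperBound), read the target index from a dense table, fall back to Default on a miss, and record Default as the resume point for later rejects. A minimal stand-alone sketch of that dispatch logic, with hypothetical names (not part of this diff), is:

  // Illustrative sketch only: dense jump-table dispatch in the style of
  // GIM_SwitchOpcode / GIM_SwitchType.
  #include <cstdint>
  #include <vector>

  int64_t dispatch(const std::vector<int64_t> &Table, size_t TableStart,
                   int64_t Key, int64_t LowerBound, int64_t UpperBound,
                   int64_t Default, std::vector<int64_t> &OnFailResumeAt) {
    if (Key < LowerBound || UpperBound <= Key)
      return Default;                  // Key outside the dense range.
    int64_t Target = Table[TableStart + (Key - LowerBound)];
    if (!Target)
      return Default;                  // Hole in the table: no case for Key.
    OnFailResumeAt.push_back(Default); // A later reject resumes at Default.
    return Target;
  }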
case GIM_CheckNumOperands: {
int64_t InsnID = MatchTable[CurrentIdx++];
int64_t Expected = MatchTable[CurrentIdx++];
@@ -194,7 +270,8 @@ bool InstructionSelector::executeMatchTable(
<< CurrentIdx << ": GIM_CheckAPIntImmPredicate(MIs["
<< InsnID << "], Predicate=" << Predicate << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
- assert(State.MIs[InsnID]->getOpcode() && "Expected G_CONSTANT");
+ assert(State.MIs[InsnID]->getOpcode() == TargetOpcode::G_CONSTANT &&
+ "Expected G_CONSTANT");
assert(Predicate > GIPFP_APInt_Invalid && "Expected a valid predicate");
APInt Value;
if (State.MIs[InsnID]->getOperand(1).isCImm())
@@ -226,6 +303,21 @@ bool InstructionSelector::executeMatchTable(
return false;
break;
}
+ case GIM_CheckCxxInsnPredicate: {
+ int64_t InsnID = MatchTable[CurrentIdx++];
+ int64_t Predicate = MatchTable[CurrentIdx++];
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
+ dbgs()
+ << CurrentIdx << ": GIM_CheckCxxPredicate(MIs["
+ << InsnID << "], Predicate=" << Predicate << ")\n");
+ assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+ assert(Predicate > GIPFP_MI_Invalid && "Expected a valid predicate");
+
+ if (!testMIPredicate_MI(Predicate, *State.MIs[InsnID]))
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ break;
+ }
case GIM_CheckAtomicOrdering: {
int64_t InsnID = MatchTable[CurrentIdx++];
AtomicOrdering Ordering = (AtomicOrdering)MatchTable[CurrentIdx++];
@@ -233,7 +325,6 @@ bool InstructionSelector::executeMatchTable(
dbgs() << CurrentIdx << ": GIM_CheckAtomicOrdering(MIs["
<< InsnID << "], " << (uint64_t)Ordering << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
-
if (!State.MIs[InsnID]->hasOneMemOperand())
if (handleReject() == RejectAndGiveUp)
return false;
@@ -252,7 +343,6 @@ bool InstructionSelector::executeMatchTable(
<< ": GIM_CheckAtomicOrderingOrStrongerThan(MIs["
<< InsnID << "], " << (uint64_t)Ordering << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
-
if (!State.MIs[InsnID]->hasOneMemOperand())
if (handleReject() == RejectAndGiveUp)
return false;
@@ -271,7 +361,6 @@ bool InstructionSelector::executeMatchTable(
<< ": GIM_CheckAtomicOrderingWeakerThan(MIs["
<< InsnID << "], " << (uint64_t)Ordering << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
-
if (!State.MIs[InsnID]->hasOneMemOperand())
if (handleReject() == RejectAndGiveUp)
return false;
@@ -282,6 +371,87 @@ bool InstructionSelector::executeMatchTable(
return false;
break;
}
+ case GIM_CheckMemorySizeEqualTo: {
+ int64_t InsnID = MatchTable[CurrentIdx++];
+ int64_t MMOIdx = MatchTable[CurrentIdx++];
+ uint64_t Size = MatchTable[CurrentIdx++];
+
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
+ dbgs() << CurrentIdx
+ << ": GIM_CheckMemorySizeEqual(MIs[" << InsnID
+ << "]->memoperands() + " << MMOIdx
+ << ", Size=" << Size << ")\n");
+ assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+
+ if (State.MIs[InsnID]->getNumMemOperands() <= MMOIdx) {
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ break;
+ }
+
+ MachineMemOperand *MMO = *(State.MIs[InsnID]->memoperands_begin() + MMOIdx);
+
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
+ dbgs() << MMO->getSize() << " bytes vs " << Size
+ << " bytes\n");
+ if (MMO->getSize() != Size)
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+
+ break;
+ }
+ case GIM_CheckMemorySizeEqualToLLT:
+ case GIM_CheckMemorySizeLessThanLLT:
+ case GIM_CheckMemorySizeGreaterThanLLT: {
+ int64_t InsnID = MatchTable[CurrentIdx++];
+ int64_t MMOIdx = MatchTable[CurrentIdx++];
+ int64_t OpIdx = MatchTable[CurrentIdx++];
+
+ DEBUG_WITH_TYPE(
+ TgtInstructionSelector::getName(),
+ dbgs() << CurrentIdx << ": GIM_CheckMemorySize"
+ << (MatcherOpcode == GIM_CheckMemorySizeEqualToLLT
+ ? "EqualTo"
+ : MatcherOpcode == GIM_CheckMemorySizeGreaterThanLLT
+ ? "GreaterThan"
+ : "LessThan")
+ << "LLT(MIs[" << InsnID << "]->memoperands() + " << MMOIdx
+ << ", OpIdx=" << OpIdx << ")\n");
+ assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+
+ MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx);
+ if (!MO.isReg()) {
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
+ dbgs() << CurrentIdx << ": Not a register\n");
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ break;
+ }
+
+ if (State.MIs[InsnID]->getNumMemOperands() <= MMOIdx) {
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ break;
+ }
+
+ MachineMemOperand *MMO = *(State.MIs[InsnID]->memoperands_begin() + MMOIdx);
+
+ unsigned Size = MRI.getType(MO.getReg()).getSizeInBits();
+ if (MatcherOpcode == GIM_CheckMemorySizeEqualToLLT &&
+ MMO->getSize() * 8 != Size) {
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ } else if (MatcherOpcode == GIM_CheckMemorySizeLessThanLLT &&
+ MMO->getSize() * 8 >= Size) {
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ } else if (MatcherOpcode == GIM_CheckMemorySizeGreaterThanLLT &&
+ MMO->getSize() * 8 <= Size)
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+
+ break;
+ }
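The three opcodes above compare a memory operand's size, stored in bytes, against a register's LLT, measured in bits, hence the MMO->getSize() * 8 scaling. A small hedged sketch of just that comparison, with illustrative names:

  // Illustrative only: relate an MMO byte size to an LLT bit width the way
  // GIM_CheckMemorySize{EqualTo,LessThan,GreaterThan}LLT do above.
  #include <cstdint>

  enum class MemSizeCmp { EqualTo, LessThan, GreaterThan };

  bool memSizeMatchesLLT(uint64_t MemSizeInBytes, unsigned TypeSizeInBits,
                         MemSizeCmp Kind) {
    const uint64_t MemSizeInBits = MemSizeInBytes * 8;
    switch (Kind) {
    case MemSizeCmp::EqualTo:     return MemSizeInBits == TypeSizeInBits;
    case MemSizeCmp::LessThan:    return MemSizeInBits <  TypeSizeInBits;
    case MemSizeCmp::GreaterThan: return MemSizeInBits >  TypeSizeInBits;
    }
    return false;
  }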
case GIM_CheckType: {
int64_t InsnID = MatchTable[CurrentIdx++];
int64_t OpIdx = MatchTable[CurrentIdx++];
@@ -291,8 +461,9 @@ bool InstructionSelector::executeMatchTable(
<< "]->getOperand(" << OpIdx
<< "), TypeID=" << TypeID << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
- if (MRI.getType(State.MIs[InsnID]->getOperand(OpIdx).getReg()) !=
- MatcherInfo.TypeObjects[TypeID]) {
+ MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx);
+ if (!MO.isReg() ||
+ MRI.getType(MO.getReg()) != ISelInfo.TypeObjects[TypeID]) {
if (handleReject() == RejectAndGiveUp)
return false;
}
@@ -308,7 +479,6 @@ bool InstructionSelector::executeMatchTable(
<< InsnID << "]->getOperand(" << OpIdx
<< "), SizeInBits=" << SizeInBits << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
-
// iPTR must be looked up in the target.
if (SizeInBits == 0) {
MachineFunction *MF = State.MIs[InsnID]->getParent()->getParent();
@@ -317,11 +487,15 @@ bool InstructionSelector::executeMatchTable(
assert(SizeInBits != 0 && "Pointer size must be known");
- const LLT &Ty = MRI.getType(State.MIs[InsnID]->getOperand(OpIdx).getReg());
- if (!Ty.isPointer() || Ty.getSizeInBits() != SizeInBits) {
- if (handleReject() == RejectAndGiveUp)
- return false;
- }
+ MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx);
+ if (MO.isReg()) {
+ const LLT &Ty = MRI.getType(MO.getReg());
+ if (!Ty.isPointer() || Ty.getSizeInBits() != SizeInBits)
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ } else if (handleReject() == RejectAndGiveUp)
+ return false;
+
break;
}
case GIM_CheckRegBankForClass: {
@@ -333,9 +507,10 @@ bool InstructionSelector::executeMatchTable(
<< InsnID << "]->getOperand(" << OpIdx
<< "), RCEnum=" << RCEnum << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
- if (&RBI.getRegBankFromRegClass(*TRI.getRegClass(RCEnum)) !=
- RBI.getRegBank(State.MIs[InsnID]->getOperand(OpIdx).getReg(), MRI,
- TRI)) {
+ MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx);
+ if (!MO.isReg() ||
+ &RBI.getRegBankFromRegClass(*TRI.getRegClass(RCEnum)) !=
+ RBI.getRegBank(MO.getReg(), MRI, TRI)) {
if (handleReject() == RejectAndGiveUp)
return false;
}
@@ -356,7 +531,7 @@ bool InstructionSelector::executeMatchTable(
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
// FIXME: Use std::invoke() when it's available.
ComplexRendererFns Renderer =
- (ISel.*MatcherInfo.ComplexPredicates[ComplexPredicateID])(
+ (ISel.*ISelInfo.ComplexPredicates[ComplexPredicateID])(
State.MIs[InsnID]->getOperand(OpIdx));
if (Renderer.hasValue())
State.Renderers[RendererID] = Renderer.getValue();
@@ -375,16 +550,19 @@ bool InstructionSelector::executeMatchTable(
<< InsnID << "]->getOperand(" << OpIdx
<< "), Value=" << Value << ")\n");
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+ MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx);
+ if (MO.isReg()) {
+ // isOperandImmEqual() will sign-extend to 64-bits, so should we.
+ LLT Ty = MRI.getType(MO.getReg());
+ Value = SignExtend64(Value, Ty.getSizeInBits());
- // isOperandImmEqual() will sign-extend to 64-bits, so should we.
- LLT Ty = MRI.getType(State.MIs[InsnID]->getOperand(OpIdx).getReg());
- Value = SignExtend64(Value, Ty.getSizeInBits());
+ if (!isOperandImmEqual(MO, Value, MRI)) {
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ }
+ } else if (handleReject() == RejectAndGiveUp)
+ return false;
- if (!isOperandImmEqual(State.MIs[InsnID]->getOperand(OpIdx), Value,
- MRI)) {
- if (handleReject() == RejectAndGiveUp)
- return false;
- }
break;
}
@@ -467,7 +645,7 @@ bool InstructionSelector::executeMatchTable(
}
case GIM_Reject:
DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
- dbgs() << CurrentIdx << ": GIM_Reject");
+ dbgs() << CurrentIdx << ": GIM_Reject\n");
if (handleReject() == RejectAndGiveUp)
return false;
break;
@@ -649,6 +827,36 @@ bool InstructionSelector::executeMatchTable(
break;
}
+ // TODO: Needs a test case once we have a pattern that uses this.
+ case GIR_CopyFConstantAsFPImm: {
+ int64_t NewInsnID = MatchTable[CurrentIdx++];
+ int64_t OldInsnID = MatchTable[CurrentIdx++];
+ assert(OutMIs[NewInsnID] && "Attempted to add to undefined instruction");
+ assert(State.MIs[OldInsnID]->getOpcode() == TargetOpcode::G_FCONSTANT && "Expected G_FCONSTANT");
+ if (State.MIs[OldInsnID]->getOperand(1).isFPImm())
+ OutMIs[NewInsnID].addFPImm(
+ State.MIs[OldInsnID]->getOperand(1).getFPImm());
+ else
+ llvm_unreachable("Expected FPImm operand");
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
+ dbgs() << CurrentIdx << ": GIR_CopyFPConstantAsFPImm(OutMIs["
+ << NewInsnID << "], MIs[" << OldInsnID << "])\n");
+ break;
+ }
+
+ case GIR_CustomRenderer: {
+ int64_t InsnID = MatchTable[CurrentIdx++];
+ int64_t OldInsnID = MatchTable[CurrentIdx++];
+ int64_t RendererFnID = MatchTable[CurrentIdx++];
+ assert(OutMIs[InsnID] && "Attempted to add to undefined instruction");
+ DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
+ dbgs() << CurrentIdx << ": GIR_CustomRenderer(OutMIs["
+ << InsnID << "], MIs[" << OldInsnID << "], "
+ << RendererFnID << ")\n");
+ (ISel.*ISelInfo.CustomRenderers[RendererFnID])(OutMIs[InsnID],
+ *State.MIs[OldInsnID]);
+ break;
+ }
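GIM_CheckComplexPattern and GIR_CustomRenderer both dispatch through tables of pointers-to-member-function on the selector, invoked with the (ISel.*Fn)(...) syntax. A self-contained sketch of that idiom, with hypothetical names:

  // Illustrative sketch of the pointer-to-member dispatch used above.
  #include <array>
  #include <iostream>

  struct Selector {
    void renderAsImm(int V)  { std::cout << "imm "  << V << "\n"; }
    void renderAsAddr(int V) { std::cout << "addr " << V << "\n"; }
  };

  using RendererFn = void (Selector::*)(int);

  int main() {
    // Table indexed by a renderer ID, analogous to ISelInfo.CustomRenderers.
    const std::array<RendererFn, 2> Renderers = {&Selector::renderAsImm,
                                                 &Selector::renderAsAddr};
    Selector ISel;
    const int RendererFnID = 1;
    (ISel.*Renderers[RendererFnID])(42); // Prints "addr 42".
  }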
case GIR_ConstrainOperandRC: {
int64_t InsnID = MatchTable[CurrentIdx++];
int64_t OpIdx = MatchTable[CurrentIdx++];
@@ -710,7 +918,7 @@ bool InstructionSelector::executeMatchTable(
int64_t TypeID = MatchTable[CurrentIdx++];
State.TempRegisters[TempRegID] =
- MRI.createGenericVirtualRegister(MatcherInfo.TypeObjects[TypeID]);
+ MRI.createGenericVirtualRegister(ISelInfo.TypeObjects[TypeID]);
DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
dbgs() << CurrentIdx << ": TempRegs[" << TempRegID
<< "] = GIR_MakeTempReg(" << TypeID << ")\n");
@@ -729,7 +937,7 @@ bool InstructionSelector::executeMatchTable(
case GIR_Done:
DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
- dbgs() << CurrentIdx << ": GIR_Done");
+ dbgs() << CurrentIdx << ": GIR_Done\n");
return true;
default:
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index e7945ff5bf4f..873587651efd 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -38,7 +38,7 @@ public:
return false;
if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
MI.getOperand(1).getReg(), MRI)) {
- DEBUG(dbgs() << ".. Combine MI: " << MI;);
+ LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = DefMI->getOperand(1).getReg();
Builder.setInstr(MI);
@@ -59,10 +59,10 @@ public:
MI.getOperand(1).getReg(), MRI)) {
unsigned DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (isInstUnsupported(TargetOpcode::G_AND, DstTy) ||
- isInstUnsupported(TargetOpcode::G_CONSTANT, DstTy))
+ if (isInstUnsupported({TargetOpcode::G_AND, {DstTy}}) ||
+ isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
return false;
- DEBUG(dbgs() << ".. Combine MI: " << MI;);
+ LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
Builder.setInstr(MI);
unsigned ZExtSrc = MI.getOperand(1).getReg();
LLT ZExtSrcTy = MRI.getType(ZExtSrc);
@@ -87,11 +87,11 @@ public:
MI.getOperand(1).getReg(), MRI)) {
unsigned DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (isInstUnsupported(TargetOpcode::G_SHL, DstTy) ||
- isInstUnsupported(TargetOpcode::G_ASHR, DstTy) ||
- isInstUnsupported(TargetOpcode::G_CONSTANT, DstTy))
+ if (isInstUnsupported({TargetOpcode::G_SHL, {DstTy}}) ||
+ isInstUnsupported({TargetOpcode::G_ASHR, {DstTy}}) ||
+ isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
return false;
- DEBUG(dbgs() << ".. Combine MI: " << MI;);
+ LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
Builder.setInstr(MI);
unsigned SExtSrc = MI.getOperand(1).getReg();
LLT SExtSrcTy = MRI.getType(SExtSrc);
@@ -121,9 +121,9 @@ public:
MI.getOperand(1).getReg(), MRI)) {
unsigned DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (isInstUnsupported(TargetOpcode::G_IMPLICIT_DEF, DstTy))
+ if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}}))
return false;
- DEBUG(dbgs() << ".. Combine EXT(IMPLICIT_DEF) " << MI;);
+ LLVM_DEBUG(dbgs() << ".. Combine EXT(IMPLICIT_DEF) " << MI;);
Builder.setInstr(MI);
Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg);
markInstAndDefDead(MI, *DefMI, DeadInsts);
@@ -139,9 +139,9 @@ public:
return false;
unsigned NumDefs = MI.getNumOperands() - 1;
- unsigned SrcReg = MI.getOperand(NumDefs).getReg();
- MachineInstr *MergeI = MRI.getVRegDef(SrcReg);
- if (!MergeI || (MergeI->getOpcode() != TargetOpcode::G_MERGE_VALUES))
+ MachineInstr *MergeI = getOpcodeDef(TargetOpcode::G_MERGE_VALUES,
+ MI.getOperand(NumDefs).getReg(), MRI);
+ if (!MergeI)
return false;
const unsigned NumMergeRegs = MergeI->getNumOperands() - 1;
@@ -253,11 +253,8 @@ private:
// and as a result, %3, %2, %1 are dead.
MachineInstr *PrevMI = &MI;
while (PrevMI != &DefMI) {
- // If we're dealing with G_UNMERGE_VALUES, tryCombineMerges doesn't really try
- // to fold copies in between and we can ignore them here.
- if (PrevMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES)
- break;
- unsigned PrevRegSrc = PrevMI->getOperand(1).getReg();
+ unsigned PrevRegSrc =
+ PrevMI->getOperand(PrevMI->getNumOperands() - 1).getReg();
MachineInstr *TmpDef = MRI.getVRegDef(PrevRegSrc);
if (MRI.hasOneUse(PrevRegSrc)) {
if (TmpDef != &DefMI) {
@@ -269,18 +266,16 @@ private:
break;
PrevMI = TmpDef;
}
- if ((PrevMI == &DefMI ||
- DefMI.getOpcode() == TargetOpcode::G_MERGE_VALUES) &&
- MRI.hasOneUse(DefMI.getOperand(0).getReg()))
+ if (PrevMI == &DefMI && MRI.hasOneUse(DefMI.getOperand(0).getReg()))
DeadInsts.push_back(&DefMI);
}
/// Checks if the target legalizer info has specified anything about the
/// instruction, or if unsupported.
- bool isInstUnsupported(unsigned Opcode, const LLT &DstTy) const {
- auto Action = LI.getAction({Opcode, 0, DstTy});
- return Action.first == LegalizerInfo::LegalizeAction::Unsupported ||
- Action.first == LegalizerInfo::LegalizeAction::NotFound;
+ bool isInstUnsupported(const LegalityQuery &Query) const {
+ using namespace LegalizeActions;
+ auto Step = LI.getAction(Query);
+ return Step.Action == Unsupported || Step.Action == NotFound;
}
};
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 8bd8a9dcd0e2..d122e67b87b8 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -93,12 +93,24 @@ public:
const LegalizerInfo &getLegalizerInfo() const { return LI; }
private:
+ /// Legalize a single operand \p OpIdx of the machine instruction \p MI as a
+ /// Use by extending the operand's type to \p WideTy using the specified \p
+ /// ExtOpcode for the extension instruction, and replacing the vreg of the
+ /// operand in place.
+ void widenScalarSrc(MachineInstr &MI, LLT WideTy, unsigned OpIdx,
+ unsigned ExtOpcode);
+
+ /// Legalize a single operand \p OpIdx of the machine instruction \p MI as a
+ /// Def by extending the operand's type to \p WideTy and truncating it back
+ /// with the \p TruncOpcode, and replacing the vreg of the operand in place.
+ void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx = 0,
+ unsigned TruncOpcode = TargetOpcode::G_TRUNC);
/// Helper function to split a wide generic register into bitwise blocks with
/// the given Type (which implies the number of blocks needed). The generic
/// registers created are appended to Ops, starting at bit 0 of Reg.
void extractParts(unsigned Reg, LLT Ty, int NumParts,
- SmallVectorImpl<unsigned> &Ops);
+ SmallVectorImpl<unsigned> &VRegs);
MachineRegisterInfo &MRI;
const LegalizerInfo &LI;
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index b6735d538b37..713d72eb4c9b 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -19,8 +19,11 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/LowLevelTypeImpl.h"
#include <cassert>
#include <cstdint>
@@ -30,9 +33,67 @@
namespace llvm {
+extern cl::opt<bool> DisableGISelLegalityCheck;
+
class MachineInstr;
class MachineIRBuilder;
class MachineRegisterInfo;
+class MCInstrInfo;
+
+namespace LegalizeActions {
+enum LegalizeAction : std::uint8_t {
+ /// The operation is expected to be selectable directly by the target, and
+ /// no transformation is necessary.
+ Legal,
+
+ /// The operation should be synthesized from multiple instructions acting on
+ /// a narrower scalar base-type. For example a 64-bit add might be
+ /// implemented in terms of 32-bit add-with-carry.
+ NarrowScalar,
+
+ /// The operation should be implemented in terms of a wider scalar
+ /// base-type. For example a <2 x s8> add could be implemented as a <2
+ /// x s32> add (ignoring the high bits).
+ WidenScalar,
+
+ /// The (vector) operation should be implemented by splitting it into
+ /// sub-vectors where the operation is legal. For example a <8 x s64> add
+ /// might be implemented as 4 separate <2 x s64> adds.
+ FewerElements,
+
+ /// The (vector) operation should be implemented by widening the input
+ /// vector and ignoring the lanes added by doing so. For example <2 x i8> is
+ /// rarely legal, but you might perform an <8 x i8> and then only look at
+ /// the first two results.
+ MoreElements,
+
+ /// The operation itself must be expressed in terms of simpler actions on
+ /// this target. E.g. a SREM replaced by an SDIV and subtraction.
+ Lower,
+
+ /// The operation should be implemented as a call to some kind of runtime
+ /// support library. For example this usually happens on machines that don't
+ /// support floating-point operations natively.
+ Libcall,
+
+ /// The target wants to do something special with this combination of
+ /// operand and type. A callback will be issued when it is needed.
+ Custom,
+
+ /// This operation is completely unsupported on the target. A programming
+ /// error has occurred.
+ Unsupported,
+
+ /// Sentinel value for when no action was found in the specified table.
+ NotFound,
+
+ /// Fall back onto the old rules.
+ /// TODO: Remove this once we've migrated
+ UseLegacyRules,
+};
+} // end namespace LegalizeActions
+
+using LegalizeActions::LegalizeAction;
/// Legalization is decided based on an instruction's opcode, which type slot
/// we're considering, and what the existing type is. These aspects are gathered
@@ -51,64 +112,642 @@ struct InstrAspect {
}
};
-class LegalizerInfo {
-public:
- enum LegalizeAction : std::uint8_t {
- /// The operation is expected to be selectable directly by the target, and
- /// no transformation is necessary.
- Legal,
-
- /// The operation should be synthesized from multiple instructions acting on
- /// a narrower scalar base-type. For example a 64-bit add might be
- /// implemented in terms of 32-bit add-with-carry.
- NarrowScalar,
-
- /// The operation should be implemented in terms of a wider scalar
- /// base-type. For example a <2 x s8> add could be implemented as a <2
- /// x s32> add (ignoring the high bits).
- WidenScalar,
-
- /// The (vector) operation should be implemented by splitting it into
- /// sub-vectors where the operation is legal. For example a <8 x s64> add
- /// might be implemented as 4 separate <2 x s64> adds.
- FewerElements,
-
- /// The (vector) operation should be implemented by widening the input
- /// vector and ignoring the lanes added by doing so. For example <2 x i8> is
- /// rarely legal, but you might perform an <8 x i8> and then only look at
- /// the first two results.
- MoreElements,
-
- /// The operation itself must be expressed in terms of simpler actions on
- /// this target. E.g. a SREM replaced by an SDIV and subtraction.
- Lower,
-
- /// The operation should be implemented as a call to some kind of runtime
- /// support library. For example this usually happens on machines that don't
- /// support floating-point operations natively.
- Libcall,
-
- /// The target wants to do something special with this combination of
- /// operand and type. A callback will be issued when it is needed.
- Custom,
-
- /// This operation is completely unsupported on the target. A programming
- /// error has occurred.
- Unsupported,
-
- /// Sentinel value for when no action was found in the specified table.
- NotFound,
+/// The LegalityQuery object bundles together all the information that's needed
+/// to decide whether a given operation is legal or not.
+/// For efficiency, it doesn't make a copy of Types, so care must be taken not
+/// to free the underlying type storage before the query has been used.
+struct LegalityQuery {
+ unsigned Opcode;
+ ArrayRef<LLT> Types;
+
+ struct MemDesc {
+ uint64_t Size;
+ AtomicOrdering Ordering;
};
+ /// Operations which require memory can use this to place requirements on the
+ /// memory type for each MMO.
+ ArrayRef<MemDesc> MMODescrs;
+
+ constexpr LegalityQuery(unsigned Opcode, const ArrayRef<LLT> Types,
+ const ArrayRef<MemDesc> MMODescrs)
+ : Opcode(Opcode), Types(Types), MMODescrs(MMODescrs) {}
+ constexpr LegalityQuery(unsigned Opcode, const ArrayRef<LLT> Types)
+ : LegalityQuery(Opcode, Types, {}) {}
+
+ raw_ostream &print(raw_ostream &OS) const;
+};
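As a hedged usage sketch (the helper name below is ours, not from this header): because Types is an ArrayRef rather than a copy, a braced type list is only valid for the full expression it appears in, which is exactly the one-shot pattern shown here.

  // Sketch: ask a LegalizerInfo what to do with a G_AND on a given type.
  #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"

  bool andIsDirectlyLegal(const llvm::LegalizerInfo &LI, llvm::LLT Ty) {
    // The temporary {Ty} list lives until the end of this full expression,
    // so handing it to LegalityQuery by reference is safe here.
    llvm::LegalizeActionStep Step =
        LI.getAction({llvm::TargetOpcode::G_AND, {Ty}});
    return Step.Action == llvm::LegalizeActions::Legal;
  }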
+
+/// The result of a query. It either indicates a final answer of Legal or
+/// Unsupported or describes an action that must be taken to make an operation
+/// more legal.
+struct LegalizeActionStep {
+ /// The action to take or the final answer.
+ LegalizeAction Action;
+ /// If describing an action, the type index to change. Otherwise zero.
+ unsigned TypeIdx;
+ /// If describing an action, the new type for TypeIdx. Otherwise LLT{}.
+ LLT NewType;
+
+ LegalizeActionStep(LegalizeAction Action, unsigned TypeIdx,
+ const LLT &NewType)
+ : Action(Action), TypeIdx(TypeIdx), NewType(NewType) {}
+
+ bool operator==(const LegalizeActionStep &RHS) const {
+ return std::tie(Action, TypeIdx, NewType) ==
+ std::tie(RHS.Action, RHS.TypeIdx, RHS.NewType);
+ }
+};
+
+using LegalityPredicate = std::function<bool (const LegalityQuery &)>;
+using LegalizeMutation =
+ std::function<std::pair<unsigned, LLT>(const LegalityQuery &)>;
+
+namespace LegalityPredicates {
+struct TypePairAndMemSize {
+ LLT Type0;
+ LLT Type1;
+ uint64_t MemSize;
+
+ bool operator==(const TypePairAndMemSize &Other) const {
+ return Type0 == Other.Type0 && Type1 == Other.Type1 &&
+ MemSize == Other.MemSize;
+ }
+};
+
+/// True iff P0 and P1 are true.
+template<typename Predicate>
+Predicate all(Predicate P0, Predicate P1) {
+ return [=](const LegalityQuery &Query) {
+ return P0(Query) && P1(Query);
+ };
+}
+/// True iff all given predicates are true.
+template<typename Predicate, typename... Args>
+Predicate all(Predicate P0, Predicate P1, Args... args) {
+ return all(all(P0, P1), args...);
+}
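all() folds any number of predicates into a single LegalityPredicate. A brief hedged sketch composing it with two of the factory predicates declared just below (the wrapper name is illustrative):

  #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"

  // True only when type index 0 is exactly s64 and type index 1 is any scalar.
  inline llvm::LegalityPredicate s64WithScalarIdx1() {
    using namespace llvm;
    using namespace llvm::LegalityPredicates;
    return all(typeIs(0, LLT::scalar(64)), isScalar(1));
  }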
+/// True iff the given type index is the specified type.
+LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit);
+/// True iff the given type index is one of the specified types.
+LegalityPredicate typeInSet(unsigned TypeIdx,
+ std::initializer_list<LLT> TypesInit);
+/// True iff the given types for the given pair of type indexes are one of the
+/// specified type pairs.
+LegalityPredicate
+typePairInSet(unsigned TypeIdx0, unsigned TypeIdx1,
+ std::initializer_list<std::pair<LLT, LLT>> TypesInit);
+/// True iff the types at the given pair of type indexes, together with the
+/// memory size at the given MMO index, match one of the specified tuples.
+LegalityPredicate typePairAndMemSizeInSet(
+ unsigned TypeIdx0, unsigned TypeIdx1, unsigned MMOIdx,
+ std::initializer_list<TypePairAndMemSize> TypesAndMemSizeInit);
+/// True iff the specified type index is a scalar.
+LegalityPredicate isScalar(unsigned TypeIdx);
+/// True iff the specified type index is a scalar that's narrower than the given
+/// size.
+LegalityPredicate narrowerThan(unsigned TypeIdx, unsigned Size);
+/// True iff the specified type index is a scalar that's wider than the given
+/// size.
+LegalityPredicate widerThan(unsigned TypeIdx, unsigned Size);
+/// True iff the specified type index is a scalar whose size is not a power of
+/// 2.
+LegalityPredicate sizeNotPow2(unsigned TypeIdx);
+/// True iff the specified MMO index has a size that is not a power of 2
+LegalityPredicate memSizeInBytesNotPow2(unsigned MMOIdx);
+/// True iff the specified type index is a vector whose element count is not a
+/// power of 2.
+LegalityPredicate numElementsNotPow2(unsigned TypeIdx);
+/// True iff the specified MMO index has an atomic ordering that is Ordering or
+/// stronger.
+LegalityPredicate atomicOrderingAtLeastOrStrongerThan(unsigned MMOIdx,
+ AtomicOrdering Ordering);
+} // end namespace LegalityPredicates
+
+namespace LegalizeMutations {
+/// Select this specific type for the given type index.
+LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty);
+/// Keep the same type as the given type index.
+LegalizeMutation changeTo(unsigned TypeIdx, unsigned FromTypeIdx);
+/// Widen the type for the given type index to the next power of 2.
+LegalizeMutation widenScalarToNextPow2(unsigned TypeIdx, unsigned Min = 0);
+/// Add more elements to the type for the given type index to the next power of
+/// 2.
+LegalizeMutation moreElementsToNextPow2(unsigned TypeIdx, unsigned Min = 0);
+} // end namespace LegalizeMutations
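A LegalizeMutation answers "which type index changes, and to what"; LegalizeRule below pairs one with a predicate and an action. A hedged sketch of assembling such a rule by hand (normally the LegalizeRuleSet builders do this for you):

  #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"

  // When type index 0 is a scalar whose size is not a power of 2 (e.g. s17),
  // widen it to the next power of 2, but to at least 32 bits.
  inline llvm::LegalizeRule widenOddScalars() {
    using namespace llvm;
    return LegalizeRule(LegalityPredicates::sizeNotPow2(0),
                        LegalizeAction::WidenScalar,
                        LegalizeMutations::widenScalarToNextPow2(0, /*Min=*/32));
  }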
+
+/// A single rule in a legalizer info ruleset.
+/// The specified action is chosen when the predicate is true. Where appropriate
+/// for the action (e.g. for WidenScalar) the new type is selected using the
+/// given mutator.
+class LegalizeRule {
+ LegalityPredicate Predicate;
+ LegalizeAction Action;
+ LegalizeMutation Mutation;
+
+public:
+ LegalizeRule(LegalityPredicate Predicate, LegalizeAction Action,
+ LegalizeMutation Mutation = nullptr)
+ : Predicate(Predicate), Action(Action), Mutation(Mutation) {}
+
+ /// Test whether the LegalityQuery matches.
+ bool match(const LegalityQuery &Query) const {
+ return Predicate(Query);
+ }
+
+ LegalizeAction getAction() const { return Action; }
+
+ /// Determine the change to make.
+ std::pair<unsigned, LLT> determineMutation(const LegalityQuery &Query) const {
+ if (Mutation)
+ return Mutation(Query);
+ return std::make_pair(0, LLT{});
+ }
+};
+
+class LegalizeRuleSet {
+ /// When non-zero, the opcode we are an alias of
+ unsigned AliasOf;
+ /// If true, there is another opcode that aliases this one
+ bool IsAliasedByAnother;
+ SmallVector<LegalizeRule, 2> Rules;
+
+#ifndef NDEBUG
+ /// If bit I is set, this rule set contains a rule that may handle (predicate
+ /// or perform an action upon (or both)) the type index I. The uncertainty
+ /// comes from free-form rules executing user-provided lambda functions. We
+ /// conservatively assume such rules do the right thing and cover all type
+ /// indices. The bitset is intentionally 1 bit wider than it absolutely needs
+ /// to be to distinguish such cases from the cases where all type indices are
+ /// individually handled.
+ SmallBitVector TypeIdxsCovered{MCOI::OPERAND_LAST_GENERIC -
+ MCOI::OPERAND_FIRST_GENERIC + 2};
+#endif
+
+ unsigned typeIdx(unsigned TypeIdx) {
+ assert(TypeIdx <=
+ (MCOI::OPERAND_LAST_GENERIC - MCOI::OPERAND_FIRST_GENERIC) &&
+ "Type Index is out of bounds");
+#ifndef NDEBUG
+ TypeIdxsCovered.set(TypeIdx);
+#endif
+ return TypeIdx;
+ }
+ void markAllTypeIdxsAsCovered() {
+#ifndef NDEBUG
+ TypeIdxsCovered.set();
+#endif
+ }
+
+ void add(const LegalizeRule &Rule) {
+ assert(AliasOf == 0 &&
+ "RuleSet is aliased, change the representative opcode instead");
+ Rules.push_back(Rule);
+ }
+
+ static bool always(const LegalityQuery &) { return true; }
+
+ /// Use the given action when the predicate is true.
+ /// Action should not be an action that requires mutation.
+ LegalizeRuleSet &actionIf(LegalizeAction Action,
+ LegalityPredicate Predicate) {
+ add({Predicate, Action});
+ return *this;
+ }
+ /// Use the given action when the predicate is true.
+ /// Action should be an action that requires mutation.
+ LegalizeRuleSet &actionIf(LegalizeAction Action, LegalityPredicate Predicate,
+ LegalizeMutation Mutation) {
+ add({Predicate, Action, Mutation});
+ return *this;
+ }
+ /// Use the given action when type index 0 is any type in the given list.
+ /// Action should not be an action that requires mutation.
+ LegalizeRuleSet &actionFor(LegalizeAction Action,
+ std::initializer_list<LLT> Types) {
+ using namespace LegalityPredicates;
+ return actionIf(Action, typeInSet(typeIdx(0), Types));
+ }
+ /// Use the given action when type index 0 is any type in the given list.
+ /// Action should be an action that requires mutation.
+ LegalizeRuleSet &actionFor(LegalizeAction Action,
+ std::initializer_list<LLT> Types,
+ LegalizeMutation Mutation) {
+ using namespace LegalityPredicates;
+ return actionIf(Action, typeInSet(typeIdx(0), Types), Mutation);
+ }
+ /// Use the given action when type indexes 0 and 1 are any type pair in the
+ /// given list.
+ /// Action should not be an action that requires mutation.
+ LegalizeRuleSet &actionFor(LegalizeAction Action,
+ std::initializer_list<std::pair<LLT, LLT>> Types) {
+ using namespace LegalityPredicates;
+ return actionIf(Action, typePairInSet(typeIdx(0), typeIdx(1), Types));
+ }
+ /// Use the given action when type indexes 0 and 1 are any type pair in the
+ /// given list.
+ /// Action should be an action that requires mutation.
+ LegalizeRuleSet &actionFor(LegalizeAction Action,
+ std::initializer_list<std::pair<LLT, LLT>> Types,
+ LegalizeMutation Mutation) {
+ using namespace LegalityPredicates;
+ return actionIf(Action, typePairInSet(typeIdx(0), typeIdx(1), Types),
+ Mutation);
+ }
+ /// Use the given action when type indexes 0 and 1 are both in the given list.
+ /// That is, the type pair is in the cartesian product of the list.
+ /// Action should not be an action that requires mutation.
+ LegalizeRuleSet &actionForCartesianProduct(LegalizeAction Action,
+ std::initializer_list<LLT> Types) {
+ using namespace LegalityPredicates;
+ return actionIf(Action, all(typeInSet(typeIdx(0), Types),
+ typeInSet(typeIdx(1), Types)));
+ }
+ /// Use the given action when type indexes 0 and 1 are both in their
+ /// respective lists.
+ /// That is, the type pair is in the cartesian product of the lists
+ /// Action should not be an action that requires mutation.
+ LegalizeRuleSet &
+ actionForCartesianProduct(LegalizeAction Action,
+ std::initializer_list<LLT> Types0,
+ std::initializer_list<LLT> Types1) {
+ using namespace LegalityPredicates;
+ return actionIf(Action, all(typeInSet(typeIdx(0), Types0),
+ typeInSet(typeIdx(1), Types1)));
+ }
+ /// Use the given action when type indexes 0, 1, and 2 are all in their
+ /// respective lists.
+ /// That is, the type triple is in the cartesian product of the lists
+ /// Action should not be an action that requires mutation.
+ LegalizeRuleSet &actionForCartesianProduct(
+ LegalizeAction Action, std::initializer_list<LLT> Types0,
+ std::initializer_list<LLT> Types1, std::initializer_list<LLT> Types2) {
+ using namespace LegalityPredicates;
+ return actionIf(Action, all(typeInSet(typeIdx(0), Types0),
+ all(typeInSet(typeIdx(1), Types1),
+ typeInSet(typeIdx(2), Types2))));
+ }
+
+public:
+ LegalizeRuleSet() : AliasOf(0), IsAliasedByAnother(false), Rules() {}
+
+ bool isAliasedByAnother() { return IsAliasedByAnother; }
+ void setIsAliasedByAnother() { IsAliasedByAnother = true; }
+ void aliasTo(unsigned Opcode) {
+ assert((AliasOf == 0 || AliasOf == Opcode) &&
+ "Opcode is already aliased to another opcode");
+ assert(Rules.empty() && "Aliasing will discard rules");
+ AliasOf = Opcode;
+ }
+ unsigned getAlias() const { return AliasOf; }
+
+ /// The instruction is legal if predicate is true.
+ LegalizeRuleSet &legalIf(LegalityPredicate Predicate) {
+ // We have no choice but to conservatively assume that the free-form
+ // user-provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::Legal, Predicate);
+ }
+ /// The instruction is legal when type index 0 is any type in the given list.
+ LegalizeRuleSet &legalFor(std::initializer_list<LLT> Types) {
+ return actionFor(LegalizeAction::Legal, Types);
+ }
+ /// The instruction is legal when type indexes 0 and 1 are any type pair in the
+ /// given list.
+ LegalizeRuleSet &legalFor(std::initializer_list<std::pair<LLT, LLT>> Types) {
+ return actionFor(LegalizeAction::Legal, Types);
+ }
+ /// The instruction is legal when type indexes 0 and 1 along with the memory
+ /// size is any type and size tuple in the given list.
+ LegalizeRuleSet &legalForTypesWithMemSize(
+ std::initializer_list<LegalityPredicates::TypePairAndMemSize>
+ TypesAndMemSize) {
+ return actionIf(LegalizeAction::Legal,
+ LegalityPredicates::typePairAndMemSizeInSet(
+ typeIdx(0), typeIdx(1), /*MMOIdx*/ 0, TypesAndMemSize));
+ }
+ /// The instruction is legal when type indexes 0 and 1 are both in the given
+ /// list. That is, the type pair is in the cartesian product of the list.
+ LegalizeRuleSet &legalForCartesianProduct(std::initializer_list<LLT> Types) {
+ return actionForCartesianProduct(LegalizeAction::Legal, Types);
+ }
+ /// The instruction is legal when type indexes 0 and 1 are both in their
+ /// respective lists.
+ LegalizeRuleSet &legalForCartesianProduct(std::initializer_list<LLT> Types0,
+ std::initializer_list<LLT> Types1) {
+ return actionForCartesianProduct(LegalizeAction::Legal, Types0, Types1);
+ }
+
+ /// The instruction is lowered.
+ LegalizeRuleSet &lower() {
+ using namespace LegalizeMutations;
+ // We have no choice but to conservatively assume that predicate-less lowering
+ // properly handles all type indices by design:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::Lower, always);
+ }
+ /// The instruction is lowered if predicate is true. Keep type index 0 as the
+ /// same type.
+ LegalizeRuleSet &lowerIf(LegalityPredicate Predicate) {
+ using namespace LegalizeMutations;
+ // We have no choice but to conservatively assume that lowering with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::Lower, Predicate);
+ }
+ /// The instruction is lowered if predicate is true.
+ LegalizeRuleSet &lowerIf(LegalityPredicate Predicate,
+ LegalizeMutation Mutation) {
+ // We have no choice but to conservatively assume that lowering with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::Lower, Predicate, Mutation);
+ }
+ /// The instruction is lowered when type index 0 is any type in the given
+ /// list. Keep type index 0 as the same type.
+ LegalizeRuleSet &lowerFor(std::initializer_list<LLT> Types) {
+ return actionFor(LegalizeAction::Lower, Types,
+ LegalizeMutations::changeTo(0, 0));
+ }
+ /// The instruction is lowered when type index 0 is any type in the given
+ /// list.
+ LegalizeRuleSet &lowerFor(std::initializer_list<LLT> Types,
+ LegalizeMutation Mutation) {
+ return actionFor(LegalizeAction::Lower, Types, Mutation);
+ }
+ /// The instruction is lowered when type indexes 0 and 1 are any type pair in
+ /// the given list. Keep type index 0 as the same type.
+ LegalizeRuleSet &lowerFor(std::initializer_list<std::pair<LLT, LLT>> Types) {
+ return actionFor(LegalizeAction::Lower, Types,
+ LegalizeMutations::changeTo(0, 0));
+ }
+ /// The instruction is lowered when type indexes 0 and 1 are any type pair in
+ /// the given list.
+ LegalizeRuleSet &lowerFor(std::initializer_list<std::pair<LLT, LLT>> Types,
+ LegalizeMutation Mutation) {
+ return actionFor(LegalizeAction::Lower, Types, Mutation);
+ }
+ /// The instruction is lowered when type indexes 0 and 1 are both in their
+ /// respective lists.
+ LegalizeRuleSet &lowerForCartesianProduct(std::initializer_list<LLT> Types0,
+ std::initializer_list<LLT> Types1) {
+ using namespace LegalityPredicates;
+ return actionForCartesianProduct(LegalizeAction::Lower, Types0, Types1);
+ }
+ /// The instruction is lowered when type indexes 0, 1, and 2 are all in
+ /// their respective lists.
+ LegalizeRuleSet &lowerForCartesianProduct(std::initializer_list<LLT> Types0,
+ std::initializer_list<LLT> Types1,
+ std::initializer_list<LLT> Types2) {
+ using namespace LegalityPredicates;
+ return actionForCartesianProduct(LegalizeAction::Lower, Types0, Types1,
+ Types2);
+ }
+
+ /// Like legalIf, but for the Libcall action.
+ LegalizeRuleSet &libcallIf(LegalityPredicate Predicate) {
+ // We have no choice but to conservatively assume that a libcall with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::Libcall, Predicate);
+ }
+ LegalizeRuleSet &libcallFor(std::initializer_list<LLT> Types) {
+ return actionFor(LegalizeAction::Libcall, Types);
+ }
+ LegalizeRuleSet &
+ libcallFor(std::initializer_list<std::pair<LLT, LLT>> Types) {
+ return actionFor(LegalizeAction::Libcall, Types);
+ }
+ LegalizeRuleSet &
+ libcallForCartesianProduct(std::initializer_list<LLT> Types) {
+ return actionForCartesianProduct(LegalizeAction::Libcall, Types);
+ }
+ LegalizeRuleSet &
+ libcallForCartesianProduct(std::initializer_list<LLT> Types0,
+ std::initializer_list<LLT> Types1) {
+ return actionForCartesianProduct(LegalizeAction::Libcall, Types0, Types1);
+ }
+
+ /// Widen the scalar to the one selected by the mutation if the predicate is
+ /// true.
+ LegalizeRuleSet &widenScalarIf(LegalityPredicate Predicate,
+ LegalizeMutation Mutation) {
+ // We have no choice but to conservatively assume that an action with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::WidenScalar, Predicate, Mutation);
+ }
+ /// Narrow the scalar to the one selected by the mutation if the predicate is
+ /// true.
+ LegalizeRuleSet &narrowScalarIf(LegalityPredicate Predicate,
+ LegalizeMutation Mutation) {
+ // We have no choice but to conservatively assume that an action with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::NarrowScalar, Predicate, Mutation);
+ }
+
+ /// Add more elements to reach the type selected by the mutation if the
+ /// predicate is true.
+ LegalizeRuleSet &moreElementsIf(LegalityPredicate Predicate,
+ LegalizeMutation Mutation) {
+ // We have no choice but to conservatively assume that an action with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::MoreElements, Predicate, Mutation);
+ }
+ /// Remove elements to reach the type selected by the mutation if the
+ /// predicate is true.
+ LegalizeRuleSet &fewerElementsIf(LegalityPredicate Predicate,
+ LegalizeMutation Mutation) {
+ // We have no choice but to conservatively assume that an action with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::FewerElements, Predicate, Mutation);
+ }
+
+ /// The instruction is unsupported.
+ LegalizeRuleSet &unsupported() {
+ return actionIf(LegalizeAction::Unsupported, always);
+ }
+ LegalizeRuleSet &unsupportedIf(LegalityPredicate Predicate) {
+ return actionIf(LegalizeAction::Unsupported, Predicate);
+ }
+ LegalizeRuleSet &unsupportedIfMemSizeNotPow2() {
+ return actionIf(LegalizeAction::Unsupported,
+ LegalityPredicates::memSizeInBytesNotPow2(0));
+ }
+
+ LegalizeRuleSet &customIf(LegalityPredicate Predicate) {
+ // We have no choice but to conservatively assume that a custom action with a
+ // free-form user provided Predicate properly handles all type indices:
+ markAllTypeIdxsAsCovered();
+ return actionIf(LegalizeAction::Custom, Predicate);
+ }
+ LegalizeRuleSet &customFor(std::initializer_list<LLT> Types) {
+ return actionFor(LegalizeAction::Custom, Types);
+ }
+ LegalizeRuleSet &customForCartesianProduct(std::initializer_list<LLT> Types) {
+ return actionForCartesianProduct(LegalizeAction::Custom, Types);
+ }
+ LegalizeRuleSet &
+ customForCartesianProduct(std::initializer_list<LLT> Types0,
+ std::initializer_list<LLT> Types1) {
+ return actionForCartesianProduct(LegalizeAction::Custom, Types0, Types1);
+ }
+
+ /// Widen the scalar to the next power of two that is at least MinSize.
+ /// No effect if the type is not a scalar or is a power of two.
+ LegalizeRuleSet &widenScalarToNextPow2(unsigned TypeIdx,
+ unsigned MinSize = 0) {
+ using namespace LegalityPredicates;
+ return actionIf(LegalizeAction::WidenScalar, sizeNotPow2(typeIdx(TypeIdx)),
+ LegalizeMutations::widenScalarToNextPow2(TypeIdx, MinSize));
+ }
+
+ LegalizeRuleSet &narrowScalar(unsigned TypeIdx, LegalizeMutation Mutation) {
+ using namespace LegalityPredicates;
+ return actionIf(LegalizeAction::NarrowScalar, isScalar(typeIdx(TypeIdx)),
+ Mutation);
+ }
+
+ /// Ensure the scalar is at least as wide as Ty.
+ LegalizeRuleSet &minScalar(unsigned TypeIdx, const LLT &Ty) {
+ using namespace LegalityPredicates;
+ using namespace LegalizeMutations;
+ return actionIf(LegalizeAction::WidenScalar,
+ narrowerThan(TypeIdx, Ty.getSizeInBits()),
+ changeTo(typeIdx(TypeIdx), Ty));
+ }
+
+ /// Ensure the scalar is at most as wide as Ty.
+ LegalizeRuleSet &maxScalar(unsigned TypeIdx, const LLT &Ty) {
+ using namespace LegalityPredicates;
+ using namespace LegalizeMutations;
+ return actionIf(LegalizeAction::NarrowScalar,
+ widerThan(TypeIdx, Ty.getSizeInBits()),
+ changeTo(typeIdx(TypeIdx), Ty));
+ }
+
+ /// Conditionally limit the maximum size of the scalar.
+ /// For example, when the maximum size of one type depends on the size of
+ /// another such as extracting N bits from an M bit container.
+ LegalizeRuleSet &maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx,
+ const LLT &Ty) {
+ using namespace LegalityPredicates;
+ using namespace LegalizeMutations;
+ return actionIf(LegalizeAction::NarrowScalar,
+ [=](const LegalityQuery &Query) {
+ // Note: widerThan returns a predicate, so it must be applied to Query.
+ return widerThan(TypeIdx, Ty.getSizeInBits())(Query) &&
+ Predicate(Query);
+ },
+ changeTo(typeIdx(TypeIdx), Ty));
+ }
+
+ /// Limit the range of scalar sizes to MinTy and MaxTy.
+ LegalizeRuleSet &clampScalar(unsigned TypeIdx, const LLT &MinTy,
+ const LLT &MaxTy) {
+ assert(MinTy.isScalar() && MaxTy.isScalar() && "Expected scalar types");
+ return minScalar(TypeIdx, MinTy).maxScalar(TypeIdx, MaxTy);
+ }
+
+ /// Add more elements to the vector to reach the next power of two.
+ /// No effect if the type is not a vector or the element count is a power of
+ /// two.
+ LegalizeRuleSet &moreElementsToNextPow2(unsigned TypeIdx) {
+ using namespace LegalityPredicates;
+ return actionIf(LegalizeAction::MoreElements,
+ numElementsNotPow2(typeIdx(TypeIdx)),
+ LegalizeMutations::moreElementsToNextPow2(TypeIdx));
+ }
+
+ /// Limit the number of elements in EltTy vectors to at least MinElements.
+ LegalizeRuleSet &clampMinNumElements(unsigned TypeIdx, const LLT &EltTy,
+ unsigned MinElements) {
+ // Mark the type index as covered:
+ typeIdx(TypeIdx);
+ return actionIf(
+ LegalizeAction::MoreElements,
+ [=](const LegalityQuery &Query) {
+ LLT VecTy = Query.Types[TypeIdx];
+ return VecTy.isVector() && VecTy.getElementType() == EltTy &&
+ VecTy.getNumElements() < MinElements;
+ },
+ [=](const LegalityQuery &Query) {
+ LLT VecTy = Query.Types[TypeIdx];
+ return std::make_pair(
+ TypeIdx, LLT::vector(MinElements, VecTy.getScalarSizeInBits()));
+ });
+ }
+ /// Limit the number of elements in EltTy vectors to at most MaxElements.
+ LegalizeRuleSet &clampMaxNumElements(unsigned TypeIdx, const LLT &EltTy,
+ unsigned MaxElements) {
+ // Mark the type index as covered:
+ typeIdx(TypeIdx);
+ return actionIf(
+ LegalizeAction::FewerElements,
+ [=](const LegalityQuery &Query) {
+ LLT VecTy = Query.Types[TypeIdx];
+ return VecTy.isVector() && VecTy.getElementType() == EltTy &&
+ VecTy.getNumElements() > MaxElements;
+ },
+ [=](const LegalityQuery &Query) {
+ LLT VecTy = Query.Types[TypeIdx];
+ return std::make_pair(
+ TypeIdx, LLT::vector(MaxElements, VecTy.getScalarSizeInBits()));
+ });
+ }
+ /// Limit the number of elements for the given vectors to at least MinTy's
+ /// number of elements and at most MaxTy's number of elements.
+ ///
+ /// No effect if the type is not a vector or does not have the same element
+ /// type as the constraints.
+ /// The element type of MinTy and MaxTy must match.
+ LegalizeRuleSet &clampNumElements(unsigned TypeIdx, const LLT &MinTy,
+ const LLT &MaxTy) {
+ assert(MinTy.getElementType() == MaxTy.getElementType() &&
+ "Expected element types to agree");
+
+ const LLT &EltTy = MinTy.getElementType();
+ return clampMinNumElements(TypeIdx, EltTy, MinTy.getNumElements())
+ .clampMaxNumElements(TypeIdx, EltTy, MaxTy.getNumElements());
+ }
+
+ /// Fallback on the previous implementation. This should only be used while
+ /// porting a rule.
+ LegalizeRuleSet &fallback() {
+ add({always, LegalizeAction::UseLegacyRules});
+ return *this;
+ }
+
+ /// Check if there is no type index which is obviously not handled by the
+ /// LegalizeRuleSet in any way at all.
+ /// \pre Type indices of the opcode form a dense [0, \p NumTypeIdxs) set.
+ bool verifyTypeIdxsCoverage(unsigned NumTypeIdxs) const;
+
+ /// Apply the ruleset to the given LegalityQuery.
+ LegalizeActionStep apply(const LegalityQuery &Query) const;
+};
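Taken together, a target's LegalizerInfo subclass typically chains these builders in its constructor. A hedged sketch of that style (the opcode and type choices here are illustrative, not taken from any in-tree target):

  // Sketch of code inside a hypothetical MyTargetLegalizerInfo constructor,
  // i.e. inside a subclass of llvm::LegalizerInfo.
  using namespace llvm;
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);

  getActionDefinitionsBuilder(TargetOpcode::G_ADD)
      .legalFor({s32, s64})          // natively supported widths
      .clampScalar(0, s32, s64)      // extend or narrow everything else
      .widenScalarToNextPow2(0);     // then round odd sizes up to a power of 2

  getActionDefinitionsBuilder(
      {TargetOpcode::G_AND, TargetOpcode::G_OR, TargetOpcode::G_XOR})
      .legalFor({s32, s64})
      .clampScalar(0, s32, s64);

  computeTables();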
+
+class LegalizerInfo {
+public:
LegalizerInfo();
virtual ~LegalizerInfo() = default;
+ unsigned getOpcodeIdxForOpcode(unsigned Opcode) const;
+ unsigned getActionDefinitionsIdx(unsigned Opcode) const;
+
/// Compute any ancillary tables needed to quickly decide how an operation
/// should be handled. This must be called after all "set*Action" methods but
/// before any query is made or incorrect results may be returned.
void computeTables();
+ /// Perform simple self-diagnostic and assert if there is anything obviously
+ /// wrong with the actions set up.
+ void verify(const MCInstrInfo &MII) const;
+
static bool needsLegalizingToDifferentSize(const LegalizeAction Action) {
+ using namespace LegalizeActions;
switch (Action) {
case NarrowScalar:
case WidenScalar:
@@ -121,8 +760,8 @@ public:
}
}
- typedef std::pair<uint16_t, LegalizeAction> SizeAndAction;
- typedef std::vector<SizeAndAction> SizeAndActionsVec;
+ using SizeAndAction = std::pair<uint16_t, LegalizeAction>;
+ using SizeAndActionsVec = std::vector<SizeAndAction>;
using SizeChangeStrategy =
std::function<SizeAndActionsVec(const SizeAndActionsVec &v)>;
@@ -186,8 +825,9 @@ public:
/// and Unsupported for all other scalar types T.
static SizeAndActionsVec
unsupportedForDifferentSizes(const SizeAndActionsVec &v) {
+ using namespace LegalizeActions;
return increaseToLargerTypesAndDecreaseToLargest(v, Unsupported,
- Unsupported);
+ Unsupported);
}
/// A SizeChangeStrategy for the common case where legalization for a
@@ -196,32 +836,36 @@ public:
/// largest legal type.
static SizeAndActionsVec
widenToLargerTypesAndNarrowToLargest(const SizeAndActionsVec &v) {
+ using namespace LegalizeActions;
assert(v.size() > 0 &&
"At least one size that can be legalized towards is needed"
" for this SizeChangeStrategy");
return increaseToLargerTypesAndDecreaseToLargest(v, WidenScalar,
- NarrowScalar);
+ NarrowScalar);
}
static SizeAndActionsVec
widenToLargerTypesUnsupportedOtherwise(const SizeAndActionsVec &v) {
+ using namespace LegalizeActions;
return increaseToLargerTypesAndDecreaseToLargest(v, WidenScalar,
- Unsupported);
+ Unsupported);
}
static SizeAndActionsVec
narrowToSmallerAndUnsupportedIfTooSmall(const SizeAndActionsVec &v) {
+ using namespace LegalizeActions;
return decreaseToSmallerTypesAndIncreaseToSmallest(v, NarrowScalar,
- Unsupported);
+ Unsupported);
}
static SizeAndActionsVec
narrowToSmallerAndWidenToSmallest(const SizeAndActionsVec &v) {
+ using namespace LegalizeActions;
assert(v.size() > 0 &&
"At least one size that can be legalized towards is needed"
" for this SizeChangeStrategy");
return decreaseToSmallerTypesAndIncreaseToSmallest(v, NarrowScalar,
- WidenScalar);
+ WidenScalar);
}
/// A SizeChangeStrategy for the common case where legalization for a
@@ -244,8 +888,9 @@ public:
/// (FewerElements, vector(4,32)).
static SizeAndActionsVec
moreToWiderTypesAndLessToWidest(const SizeAndActionsVec &v) {
+ using namespace LegalizeActions;
return increaseToLargerTypesAndDecreaseToLargest(v, MoreElements,
- FewerElements);
+ FewerElements);
}
/// Helper function to implement many typical SizeChangeStrategy functions.
@@ -259,22 +904,46 @@ public:
LegalizeAction DecreaseAction,
LegalizeAction IncreaseAction);
- /// Determine what action should be taken to legalize the given generic
- /// instruction opcode, type-index and type. Requires computeTables to have
- /// been called.
+ /// Get the action definitions for the given opcode. Use this to run a
+ /// LegalityQuery through the definitions.
+ const LegalizeRuleSet &getActionDefinitions(unsigned Opcode) const;
+
+ /// Get the action definition builder for the given opcode. Use this to define
+ /// the action definitions.
///
- /// \returns a pair consisting of the kind of legalization that should be
- /// performed and the destination type.
- std::pair<LegalizeAction, LLT> getAction(const InstrAspect &Aspect) const;
+ /// It is an error to request an opcode that has already been requested by the
+ /// multiple-opcode variant.
+ LegalizeRuleSet &getActionDefinitionsBuilder(unsigned Opcode);
+
+ /// Get the action definition builder for the given set of opcodes. Use this
+ /// to define the action definitions for multiple opcodes at once. The first
+ /// opcode given will be considered the representative opcode and will hold
+ /// the definitions whereas the other opcodes will be configured to refer to
+ /// the representative opcode. This lowers memory requirements and very
+ /// slightly improves performance.
+ ///
+ /// It would be very easy to introduce unexpected side-effects as a result of
+ /// this aliasing if it were permitted to request different but intersecting
+ /// sets of opcodes but that is difficult to keep track of. It is therefore an
+ /// error to request the same opcode twice using this API, to request an
+ /// opcode that already has definitions, or to use the single-opcode API on an
+ /// opcode that has already been requested by this API.
+ LegalizeRuleSet &
+ getActionDefinitionsBuilder(std::initializer_list<unsigned> Opcodes);
+ void aliasActionDefinitions(unsigned OpcodeTo, unsigned OpcodeFrom);
+
+ /// Determine what action should be taken to legalize the described
+ /// instruction. Requires computeTables to have been called.
+ ///
+ /// \returns a description of the next legalization step to perform.
+ LegalizeActionStep getAction(const LegalityQuery &Query) const;
/// Determine what action should be taken to legalize the given generic
/// instruction.
///
- /// \returns a tuple consisting of the LegalizeAction that should be
- /// performed, the type-index it should be performed on and the destination
- /// type.
- std::tuple<LegalizeAction, unsigned, LLT>
- getAction(const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+ /// \returns a description of the next legalization step to perform.
+ LegalizeActionStep getAction(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const;
bool isLegal(const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
@@ -283,6 +952,15 @@ public:
MachineIRBuilder &MIRBuilder) const;
private:
+ /// Determine what action should be taken to legalize the given generic
+ /// instruction opcode, type-index and type. Requires computeTables to have
+ /// been called.
+ ///
+ /// \returns a pair consisting of the kind of legalization that should be
+ /// performed and the destination type.
+ std::pair<LegalizeAction, LLT>
+ getAspectAction(const InstrAspect &Aspect) const;
+
/// The SizeAndActionsVec is a representation mapping between all natural
/// numbers and an Action. The natural number represents the bit size of
/// the InstrAspect. For example, for a target with native support for 32-bit
@@ -350,6 +1028,7 @@ private:
/// A partial SizeAndActionsVec potentially doesn't cover all bit sizes,
/// i.e. it's OK if it doesn't start from size 1.
static void checkPartialSizeAndActionsVector(const SizeAndActionsVec& v) {
+ using namespace LegalizeActions;
#ifndef NDEBUG
// The sizes should be in increasing order
int prev_size = -1;
@@ -441,7 +1120,7 @@ private:
static const int LastOp = TargetOpcode::PRE_ISEL_GENERIC_OPCODE_END;
// Data structures used temporarily during construction of legality data:
- typedef DenseMap<LLT, LegalizeAction> TypeMap;
+ using TypeMap = DenseMap<LLT, LegalizeAction>;
SmallVector<TypeMap, 1> SpecifiedActions[LastOp - FirstOp + 1];
SmallVector<SizeChangeStrategy, 1>
ScalarSizeChangeStrategies[LastOp - FirstOp + 1];
@@ -456,8 +1135,16 @@ private:
AddrSpace2PointerActions[LastOp - FirstOp + 1];
std::unordered_map<uint16_t, SmallVector<SizeAndActionsVec, 1>>
NumElements2Actions[LastOp - FirstOp + 1];
+
+ LegalizeRuleSet RulesForOpcode[LastOp - FirstOp + 1];
};
+#ifndef NDEBUG
+/// Checks that MIR is fully legal; returns an illegal instruction if it is
+/// not, or nullptr otherwise.
+const MachineInstr *machineFunctionIsIllegal(const MachineFunction &MF);
+#endif
+
} // end namespace llvm.
#endif // LLVM_CODEGEN_GLOBALISEL_LEGALIZERINFO_H
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
index 0a46eb9e7840..1e2d4763e5e1 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
@@ -70,6 +70,8 @@ public:
.set(MachineFunctionProperties::Property::RegBankSelected);
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
bool runOnMachineFunction(MachineFunction &MF) override;
};
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
new file mode 100644
index 000000000000..f77f9a8df7ee
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -0,0 +1,338 @@
+//== ----- llvm/CodeGen/GlobalISel/MIPatternMatch.h --------------------- == //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Contains matchers for matching SSA Machine Instructions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_GMIR_PATTERNMATCH_H
+#define LLVM_GMIR_PATTERNMATCH_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+namespace MIPatternMatch {
+
+template <typename Reg, typename Pattern>
+bool mi_match(Reg R, MachineRegisterInfo &MRI, Pattern &&P) {
+ return P.match(MRI, R);
+}
+
+// TODO: Extend for N use.
+template <typename SubPatternT> struct OneUse_match {
+ SubPatternT SubPat;
+ OneUse_match(const SubPatternT &SP) : SubPat(SP) {}
+
+ template <typename OpTy>
+ bool match(const MachineRegisterInfo &MRI, unsigned Reg) {
+ return MRI.hasOneUse(Reg) && SubPat.match(MRI, Reg);
+ }
+};
+
+template <typename SubPat>
+inline OneUse_match<SubPat> m_OneUse(const SubPat &SP) {
+ return SP;
+}
+
+struct ConstantMatch {
+ int64_t &CR;
+ ConstantMatch(int64_t &C) : CR(C) {}
+ bool match(const MachineRegisterInfo &MRI, unsigned Reg) {
+ if (auto MaybeCst = getConstantVRegVal(Reg, MRI)) {
+ CR = *MaybeCst;
+ return true;
+ }
+ return false;
+ }
+};
+
+inline ConstantMatch m_ICst(int64_t &Cst) { return ConstantMatch(Cst); }
+
+// TODO: Rework this for different kinds of MachineOperand.
+// Currently assumes the Src for a match is a register.
+// We might want to support taking in some MachineOperands and call getReg on
+// that.
+
+struct operand_type_match {
+ bool match(const MachineRegisterInfo &MRI, unsigned Reg) { return true; }
+ bool match(const MachineRegisterInfo &MRI, MachineOperand *MO) {
+ return MO->isReg();
+ }
+};
+
+inline operand_type_match m_Reg() { return operand_type_match(); }
+
+/// Matching combinators.
+template <typename... Preds> struct And {
+ template <typename MatchSrc>
+ bool match(MachineRegisterInfo &MRI, MatchSrc &&src) {
+ return true;
+ }
+};
+
+template <typename Pred, typename... Preds>
+struct And<Pred, Preds...> : And<Preds...> {
+ Pred P;
+ And(Pred &&p, Preds &&... preds)
+ : And<Preds...>(std::forward<Preds>(preds)...), P(std::forward<Pred>(p)) {
+ }
+ template <typename MatchSrc>
+ bool match(MachineRegisterInfo &MRI, MatchSrc &&src) {
+ return P.match(MRI, src) && And<Preds...>::match(MRI, src);
+ }
+};
+
+template <typename... Preds> struct Or {
+ template <typename MatchSrc>
+ bool match(MachineRegisterInfo &MRI, MatchSrc &&src) {
+ return false;
+ }
+};
+
+template <typename Pred, typename... Preds>
+struct Or<Pred, Preds...> : Or<Preds...> {
+ Pred P;
+ Or(Pred &&p, Preds &&... preds)
+ : Or<Preds...>(std::forward<Preds>(preds)...), P(std::forward<Pred>(p)) {}
+ template <typename MatchSrc>
+ bool match(MachineRegisterInfo &MRI, MatchSrc &&src) {
+ return P.match(MRI, src) || Or<Preds...>::match(MRI, src);
+ }
+};
+
+template <typename... Preds> And<Preds...> m_all_of(Preds &&... preds) {
+ return And<Preds...>(std::forward<Preds>(preds)...);
+}
+
+template <typename... Preds> Or<Preds...> m_any_of(Preds &&... preds) {
+ return Or<Preds...>(std::forward<Preds>(preds)...);
+}
+
+template <typename BindTy> struct bind_helper {
+ static bool bind(const MachineRegisterInfo &MRI, BindTy &VR, BindTy &V) {
+ VR = V;
+ return true;
+ }
+};
+
+template <> struct bind_helper<MachineInstr *> {
+ static bool bind(const MachineRegisterInfo &MRI, MachineInstr *&MI,
+ unsigned Reg) {
+ MI = MRI.getVRegDef(Reg);
+ if (MI)
+ return true;
+ return false;
+ }
+};
+
+template <> struct bind_helper<LLT> {
+ static bool bind(const MachineRegisterInfo &MRI, LLT &Ty, unsigned Reg) {
+ Ty = MRI.getType(Reg);
+ if (Ty.isValid())
+ return true;
+ return false;
+ }
+};
+
+template <> struct bind_helper<const ConstantFP *> {
+ static bool bind(const MachineRegisterInfo &MRI, const ConstantFP *&F,
+ unsigned Reg) {
+ F = getConstantFPVRegVal(Reg, MRI);
+ if (F)
+ return true;
+ return false;
+ }
+};
+
+template <typename Class> struct bind_ty {
+ Class &VR;
+
+ bind_ty(Class &V) : VR(V) {}
+
+ template <typename ITy> bool match(const MachineRegisterInfo &MRI, ITy &&V) {
+ return bind_helper<Class>::bind(MRI, VR, V);
+ }
+};
+
+inline bind_ty<unsigned> m_Reg(unsigned &R) { return R; }
+inline bind_ty<MachineInstr *> m_MInstr(MachineInstr *&MI) { return MI; }
+inline bind_ty<LLT> m_Type(LLT &Ty) { return Ty; }
+
+// Helper for matching G_FCONSTANT
+inline bind_ty<const ConstantFP *> m_GFCst(const ConstantFP *&C) { return C; }
+
+// General helper for all the binary generic MI such as G_ADD/G_SUB etc
+template <typename LHS_P, typename RHS_P, unsigned Opcode,
+ bool Commutable = false>
+struct BinaryOp_match {
+ LHS_P L;
+ RHS_P R;
+
+ BinaryOp_match(const LHS_P &LHS, const RHS_P &RHS) : L(LHS), R(RHS) {}
+ template <typename OpTy> bool match(MachineRegisterInfo &MRI, OpTy &&Op) {
+ MachineInstr *TmpMI;
+ if (mi_match(Op, MRI, m_MInstr(TmpMI))) {
+ if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 3) {
+ return (L.match(MRI, TmpMI->getOperand(1).getReg()) &&
+ R.match(MRI, TmpMI->getOperand(2).getReg())) ||
+ (Commutable && (R.match(MRI, TmpMI->getOperand(1).getReg()) &&
+ L.match(MRI, TmpMI->getOperand(2).getReg())));
+ }
+ }
+ return false;
+ }
+};
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_ADD, true>
+m_GAdd(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_ADD, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SUB> m_GSub(const LHS &L,
+ const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_SUB>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_MUL, true>
+m_GMul(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_MUL, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>
+m_GFAdd(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FMUL, true>
+m_GFMul(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_FMUL, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FSUB, false>
+m_GFSub(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_FSUB, false>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_AND, true>
+m_GAnd(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_AND, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_OR, true> m_GOr(const LHS &L,
+ const RHS &R) {
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_OR, true>(L, R);
+}
+
+// Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
+template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
+ SrcTy L;
+
+ UnaryOp_match(const SrcTy &LHS) : L(LHS) {}
+ template <typename OpTy> bool match(MachineRegisterInfo &MRI, OpTy &&Op) {
+ MachineInstr *TmpMI;
+ if (mi_match(Op, MRI, m_MInstr(TmpMI))) {
+ if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 2) {
+ return L.match(MRI, TmpMI->getOperand(1).getReg());
+ }
+ }
+ return false;
+ }
+};
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_ANYEXT>
+m_GAnyExt(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_ANYEXT>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_SEXT> m_GSExt(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_SEXT>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_ZEXT> m_GZExt(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_ZEXT>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FPEXT> m_GFPExt(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_FPEXT>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_TRUNC> m_GTrunc(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_TRUNC>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_BITCAST>
+m_GBitcast(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_BITCAST>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_PTRTOINT>
+m_GPtrToInt(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_PTRTOINT>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_INTTOPTR>
+m_GIntToPtr(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_INTTOPTR>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FPTRUNC>
+m_GFPTrunc(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_FPTRUNC>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FABS> m_GFabs(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_FABS>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FNEG> m_GFNeg(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_FNEG>(Src);
+}
+
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::COPY> m_Copy(SrcTy &&Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
+}
+
+// Helper for checking if a Reg is of specific type.
+struct CheckType {
+ LLT Ty;
+ CheckType(const LLT &Ty) : Ty(Ty) {}
+
+ bool match(MachineRegisterInfo &MRI, unsigned Reg) {
+ return MRI.getType(Reg) == Ty;
+ }
+};
+
+inline CheckType m_SpecificType(LLT Ty) { return Ty; }
+
+} // end namespace MIPatternMatch
+} // namespace llvm
+
+#endif
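The new header above is a small pattern-matching DSL over generic MIR, in the spirit of llvm/IR/PatternMatch.h. A minimal usage sketch, assuming MRI is the current function's MachineRegisterInfo and Reg a virtual register; Src and Cst are illustrative names bound on a successful match:

using namespace llvm;
using namespace llvm::MIPatternMatch;

unsigned Src;   // non-constant addend, bound on success
int64_t Cst;    // constant addend, bound on success
// Match: Reg is defined by a 32-bit G_ADD of a register and an integer
// constant (G_ADD is marked commutable, so the constant may be either operand).
if (mi_match(Reg, MRI,
             m_all_of(m_SpecificType(LLT::scalar(32)),
                      m_GAdd(m_Reg(Src), m_ICst(Cst))))) {
  // ... fold or combine using Src and Cst ...
}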
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index aa875c11d86f..983a4e680d5c 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -23,7 +23,6 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugLoc.h"
-#include <queue>
namespace llvm {
@@ -32,11 +31,10 @@ class MachineFunction;
class MachineInstr;
class TargetInstrInfo;
-/// Helper class to build MachineInstr.
-/// It keeps internally the insertion point and debug location for all
-/// the new instructions we want to create.
-/// This information can be modify via the related setters.
-class MachineIRBuilder {
+/// Class which stores all the state required in a MachineIRBuilder.
+/// Since MachineIRBuilders will only store state in this object, it allows
+/// transferring BuilderState between different kinds of MachineIRBuilders.
+struct MachineIRBuilderState {
/// MachineFunction under construction.
MachineFunction *MF;
/// Information used to access the description of the opcodes.
@@ -53,15 +51,23 @@ class MachineIRBuilder {
/// @}
std::function<void(MachineInstr *)> InsertedInstr;
+};
+
+/// Helper class to build MachineInstr.
+/// It keeps internally the insertion point and debug location for all
+/// the new instructions we want to create.
+/// This information can be modified via the related setters.
+class MachineIRBuilderBase {
+ MachineIRBuilderState State;
const TargetInstrInfo &getTII() {
- assert(TII && "TargetInstrInfo is not set");
- return *TII;
+ assert(State.TII && "TargetInstrInfo is not set");
+ return *State.TII;
}
void validateTruncExt(unsigned Dst, unsigned Src, bool IsExtend);
- MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Res, unsigned Op0, unsigned Op1);
+protected:
unsigned getDestFromArg(unsigned Reg) { return Reg; }
unsigned getDestFromArg(LLT Ty) {
return getMF().getRegInfo().createGenericVirtualRegister(Ty);
@@ -89,30 +95,41 @@ class MachineIRBuilder {
return MIB->getOperand(0).getReg();
}
+ void validateBinaryOp(unsigned Res, unsigned Op0, unsigned Op1);
+
public:
/// Some constructors for easy use.
- MachineIRBuilder() = default;
- MachineIRBuilder(MachineFunction &MF) { setMF(MF); }
- MachineIRBuilder(MachineInstr &MI) : MachineIRBuilder(*MI.getMF()) {
+ MachineIRBuilderBase() = default;
+ MachineIRBuilderBase(MachineFunction &MF) { setMF(MF); }
+ MachineIRBuilderBase(MachineInstr &MI) : MachineIRBuilderBase(*MI.getMF()) {
setInstr(MI);
}
+ MachineIRBuilderBase(const MachineIRBuilderState &BState) : State(BState) {}
+
/// Getter for the function we currently build.
MachineFunction &getMF() {
- assert(MF && "MachineFunction is not set");
- return *MF;
+ assert(State.MF && "MachineFunction is not set");
+ return *State.MF;
}
+ /// Getter for DebugLoc
+ const DebugLoc &getDL() { return State.DL; }
+
+ /// Getter for MRI
+ MachineRegisterInfo *getMRI() { return State.MRI; }
+
+ /// Getter for the State
+ MachineIRBuilderState &getState() { return State; }
+
/// Getter for the basic block we currently build.
MachineBasicBlock &getMBB() {
- assert(MBB && "MachineBasicBlock is not set");
- return *MBB;
+ assert(State.MBB && "MachineBasicBlock is not set");
+ return *State.MBB;
}
/// Current insertion point for new instructions.
- MachineBasicBlock::iterator getInsertPt() {
- return II;
- }
+ MachineBasicBlock::iterator getInsertPt() { return State.II; }
/// Set the insertion point before the specified position.
/// \pre MBB must be in getMF().
@@ -137,15 +154,16 @@ public:
/// \name Control where instructions we create are recorded (typically for
/// visiting again later during legalization).
/// @{
+ void recordInsertion(MachineInstr *InsertedInstr) const;
void recordInsertions(std::function<void(MachineInstr *)> InsertedInstr);
void stopRecordingInsertions();
/// @}
/// Set the debug location to \p DL for all the next build instructions.
- void setDebugLoc(const DebugLoc &DL) { this->DL = DL; }
+ void setDebugLoc(const DebugLoc &DL) { this->State.DL = DL; }
/// Get the current instruction's debug location.
- DebugLoc getDebugLoc() { return DL; }
+ DebugLoc getDebugLoc() { return State.DL; }
/// Build and insert <empty> = \p Opcode <empty>.
/// The insertion point is the one set by the last call of either
@@ -156,20 +174,6 @@ public:
/// \return a MachineInstrBuilder for the newly created instruction.
MachineInstrBuilder buildInstr(unsigned Opcode);
- /// DAG like Generic method for building arbitrary instructions as above.
- /// \Opc opcode for the instruction.
- /// \Ty Either LLT/TargetRegisterClass/unsigned types for Dst
- /// \Args Variadic list of uses of types(unsigned/MachineInstrBuilder)
- /// Uses of type MachineInstrBuilder will perform
- /// getOperand(0).getReg() to convert to register.
- template <typename DstTy, typename... UseArgsTy>
- MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty,
- UseArgsTy &&... Args) {
- auto MIB = buildInstr(Opc).addDef(getDestFromArg(Ty));
- addUsesFromArgs(MIB, std::forward<UseArgsTy>(Args)...);
- return MIB;
- }
-
/// Build but don't insert <empty> = \p Opcode <empty>.
///
/// \pre setMF, setBasicBlock or setMI must have been called.
@@ -227,49 +231,6 @@ public:
/// \return a MachineInstrBuilder for the newly created instruction.
MachineInstrBuilder buildGlobalValue(unsigned Res, const GlobalValue *GV);
- /// Build and insert \p Res = G_ADD \p Op0, \p Op1
- ///
- /// G_ADD sets \p Res to the sum of integer parameters \p Op0 and \p Op1,
- /// truncated to their width.
- ///
- /// \pre setBasicBlock or setMI must have been called.
- /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
- /// with the same (scalar or vector) type).
- ///
- /// \return a MachineInstrBuilder for the newly created instruction.
- MachineInstrBuilder buildAdd(unsigned Res, unsigned Op0,
- unsigned Op1);
- template <typename DstTy, typename... UseArgsTy>
- MachineInstrBuilder buildAdd(DstTy &&Ty, UseArgsTy &&... UseArgs) {
- unsigned Res = getDestFromArg(Ty);
- return buildAdd(Res, (getRegFromArg(UseArgs))...);
- }
-
- /// Build and insert \p Res = G_SUB \p Op0, \p Op1
- ///
- /// G_SUB sets \p Res to the sum of integer parameters \p Op0 and \p Op1,
- /// truncated to their width.
- ///
- /// \pre setBasicBlock or setMI must have been called.
- /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
- /// with the same (scalar or vector) type).
- ///
- /// \return a MachineInstrBuilder for the newly created instruction.
- MachineInstrBuilder buildSub(unsigned Res, unsigned Op0,
- unsigned Op1);
-
- /// Build and insert \p Res = G_MUL \p Op0, \p Op1
- ///
- /// G_MUL sets \p Res to the sum of integer parameters \p Op0 and \p Op1,
- /// truncated to their width.
- ///
- /// \pre setBasicBlock or setMI must have been called.
- /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
- /// with the same (scalar or vector) type).
- ///
- /// \return a MachineInstrBuilder for the newly created instruction.
- MachineInstrBuilder buildMul(unsigned Res, unsigned Op0,
- unsigned Op1);
/// Build and insert \p Res = G_GEP \p Op0, \p Op1
///
@@ -338,34 +299,6 @@ public:
MachineInstrBuilder buildUAdde(unsigned Res, unsigned CarryOut, unsigned Op0,
unsigned Op1, unsigned CarryIn);
- /// Build and insert \p Res = G_AND \p Op0, \p Op1
- ///
- /// G_AND sets \p Res to the bitwise and of integer parameters \p Op0 and \p
- /// Op1.
- ///
- /// \pre setBasicBlock or setMI must have been called.
- /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
- /// with the same (scalar or vector) type).
- ///
- /// \return a MachineInstrBuilder for the newly created instruction.
- template <typename DstTy, typename... UseArgsTy>
- MachineInstrBuilder buildAnd(DstTy &&Dst, UseArgsTy &&... UseArgs) {
- return buildAnd(getDestFromArg(Dst), getRegFromArg(UseArgs)...);
- }
- MachineInstrBuilder buildAnd(unsigned Res, unsigned Op0,
- unsigned Op1);
-
- /// Build and insert \p Res = G_OR \p Op0, \p Op1
- ///
- /// G_OR sets \p Res to the bitwise or of integer parameters \p Op0 and \p
- /// Op1.
- ///
- /// \pre setBasicBlock or setMI must have been called.
- /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
- /// with the same (scalar or vector) type).
- ///
- /// \return a MachineInstrBuilder for the newly created instruction.
- MachineInstrBuilder buildOr(unsigned Res, unsigned Op0, unsigned Op1);
/// Build and insert \p Res = G_ANYEXT \p Op0
///
@@ -399,6 +332,10 @@ public:
/// \pre \p Op must be smaller than \p Res
///
/// \return The newly created instruction.
+ template <typename DstType, typename ArgType>
+ MachineInstrBuilder buildSExt(DstType &&Res, ArgType &&Arg) {
+ return buildSExt(getDestFromArg(Res), getRegFromArg(Arg));
+ }
MachineInstrBuilder buildSExt(unsigned Res, unsigned Op);
/// Build and insert \p Res = G_ZEXT \p Op
@@ -413,6 +350,10 @@ public:
/// \pre \p Op must be smaller than \p Res
///
/// \return The newly created instruction.
+ template <typename DstType, typename ArgType>
+ MachineInstrBuilder buildZExt(DstType &&Res, ArgType &&Arg) {
+ return buildZExt(getDestFromArg(Res), getRegFromArg(Arg));
+ }
MachineInstrBuilder buildZExt(unsigned Res, unsigned Op);
/// Build and insert \p Res = G_SEXT \p Op, \p Res = G_TRUNC \p Op, or
@@ -423,6 +364,10 @@ public:
/// \pre \p Op must be a generic virtual register with scalar or vector type.
///
/// \return The newly created instruction.
+ template <typename DstTy, typename UseArgTy>
+ MachineInstrBuilder buildSExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) {
+ return buildSExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use));
+ }
MachineInstrBuilder buildSExtOrTrunc(unsigned Res, unsigned Op);
/// Build and insert \p Res = G_ZEXT \p Op, \p Res = G_TRUNC \p Op, or
@@ -433,6 +378,10 @@ public:
/// \pre \p Op must be a generic virtual register with scalar or vector type.
///
/// \return The newly created instruction.
+ template <typename DstTy, typename UseArgTy>
+ MachineInstrBuilder buildZExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) {
+ return buildZExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use));
+ }
MachineInstrBuilder buildZExtOrTrunc(unsigned Res, unsigned Op);
// Build and insert \p Res = G_ANYEXT \p Op, \p Res = G_TRUNC \p Op, or
@@ -462,6 +411,10 @@ public:
unsigned Op);
/// Build and insert an appropriate cast between two registers of equal size.
+ template <typename DstType, typename ArgType>
+ MachineInstrBuilder buildCast(DstType &&Res, ArgType &&Arg) {
+ return buildCast(getDestFromArg(Res), getRegFromArg(Arg));
+ }
MachineInstrBuilder buildCast(unsigned Dst, unsigned Src);
/// Build and insert G_BR \p Dest
@@ -471,7 +424,7 @@ public:
/// \pre setBasicBlock or setMI must have been called.
///
/// \return a MachineInstrBuilder for the newly created instruction.
- MachineInstrBuilder buildBr(MachineBasicBlock &BB);
+ MachineInstrBuilder buildBr(MachineBasicBlock &Dest);
/// Build and insert G_BRCOND \p Tst, \p Dest
///
@@ -485,7 +438,7 @@ public:
/// depend on bit 0 (for now).
///
/// \return The newly created instruction.
- MachineInstrBuilder buildBrCond(unsigned Tst, MachineBasicBlock &BB);
+ MachineInstrBuilder buildBrCond(unsigned Tst, MachineBasicBlock &Dest);
/// Build and insert G_BRINDIRECT \p Tgt
///
@@ -532,8 +485,18 @@ public:
/// \pre \p Res must be a generic virtual register with scalar type.
///
/// \return The newly created instruction.
+ template <typename DstType>
+ MachineInstrBuilder buildFConstant(DstType &&Res, const ConstantFP &Val) {
+ return buildFConstant(getDestFromArg(Res), Val);
+ }
MachineInstrBuilder buildFConstant(unsigned Res, const ConstantFP &Val);
+ template <typename DstType>
+ MachineInstrBuilder buildFConstant(DstType &&Res, double Val) {
+ return buildFConstant(getDestFromArg(Res), Val);
+ }
+ MachineInstrBuilder buildFConstant(unsigned Res, double Val);
+
/// Build and insert \p Res = COPY Op
///
/// Register-to-register COPY sets \p Res to \p Op.
@@ -559,6 +522,18 @@ public:
MachineInstrBuilder buildLoad(unsigned Res, unsigned Addr,
MachineMemOperand &MMO);
+ /// Build and insert `Res = <opcode> Addr, MMO`.
+ ///
+ /// Loads the value stored at \p Addr. Puts the result in \p Res.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildLoadInstr(unsigned Opcode, unsigned Res,
+ unsigned Addr, MachineMemOperand &MMO);
+
/// Build and insert `G_STORE Val, Addr, MMO`.
///
/// Stores the value \p Val to \p Addr.
@@ -580,7 +555,10 @@ public:
MachineInstrBuilder buildExtract(unsigned Res, unsigned Src, uint64_t Index);
/// Build and insert \p Res = IMPLICIT_DEF.
- MachineInstrBuilder buildUndef(unsigned Dst);
+ template <typename DstType> MachineInstrBuilder buildUndef(DstType &&Res) {
+ return buildUndef(getDestFromArg(Res));
+ }
+ MachineInstrBuilder buildUndef(unsigned Res);
/// Build and insert instructions to put \p Ops together at the specified \p
/// Indices to form a larger register.
@@ -649,6 +627,10 @@ public:
/// \pre \p Res must be smaller than \p Op
///
/// \return The newly created instruction.
+ template <typename DstType, typename SrcType>
+ MachineInstrBuilder buildFPTrunc(DstType &&Res, SrcType &&Src) {
+ return buildFPTrunc(getDestFromArg(Res), getRegFromArg(Src));
+ }
MachineInstrBuilder buildFPTrunc(unsigned Res, unsigned Op);
/// Build and insert \p Res = G_TRUNC \p Op
@@ -735,7 +717,28 @@ public:
MachineInstrBuilder buildExtractVectorElement(unsigned Res, unsigned Val,
unsigned Idx);
- /// Build and insert `OldValRes = G_ATOMIC_CMPXCHG Addr, CmpVal, NewVal,
+ /// Build and insert `OldValRes<def>, SuccessRes<def> =
+ /// G_ATOMIC_CMPXCHG_WITH_SUCCESS Addr, CmpVal, NewVal, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with \p NewVal if it is currently
+ /// \p CmpVal, otherwise leaves it unchanged. Puts the original value from
+ /// \p Addr in \p OldValRes, along with an s1 indicating whether it was replaced.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register of scalar type.
+ /// \pre \p SuccessRes must be a generic virtual register of scalar type. It
+ /// will be assigned 0 on failure and 1 on success.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, \p CmpVal, and \p NewVal must be generic virtual
+ /// registers of the same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder
+ buildAtomicCmpXchgWithSuccess(unsigned OldValRes, unsigned SuccessRes,
+ unsigned Addr, unsigned CmpVal, unsigned NewVal,
+ MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMIC_CMPXCHG Addr, CmpVal, NewVal,
/// MMO`.
///
/// Atomically replace the value at \p Addr with \p NewVal if it is currently
@@ -752,6 +755,328 @@ public:
MachineInstrBuilder buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr,
unsigned CmpVal, unsigned NewVal,
MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_<Opcode> Addr, Val, MMO`.
+ ///
+ /// Atomically read-modify-update the value at \p Addr with \p Val. Puts the
+ /// original value from \p Addr in \p OldValRes. The modification is
+ /// determined by the opcode.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMW(unsigned Opcode, unsigned OldValRes,
+ unsigned Addr, unsigned Val,
+ MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_XCHG Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with \p Val. Puts the original
+ /// value from \p Addr in \p OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWXchg(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_ADD Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the addition of \p Val and
+ /// the original value. Puts the original value from \p Addr in \p OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWAdd(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_SUB Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the subtraction of \p Val from
+ /// the original value. Puts the original value from \p Addr in \p OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWSub(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_AND Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the bitwise and of \p Val and
+ /// the original value. Puts the original value from \p Addr in \p OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWAnd(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_NAND Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the bitwise nand of \p Val
+ /// and the original value. Puts the original value from \p Addr in \p
+ /// OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWNand(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_OR Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the bitwise or of \p Val and
+ /// the original value. Puts the original value from \p Addr in \p OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWOr(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_XOR Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the bitwise xor of \p Val and
+ /// the original value. Puts the original value from \p Addr in \p OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWXor(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_MAX Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the signed maximum of \p
+ /// Val and the original value. Puts the original value from \p Addr in \p
+ /// OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWMax(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_MIN Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the signed minimum of \p
+ /// Val and the original value. Puts the original value from \p Addr in \p
+ /// OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWMin(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_UMAX Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the unsigned maximum of \p
+ /// Val and the original value. Puts the original value from \p Addr in \p
+ /// OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWUmax(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+
+ /// Build and insert `OldValRes<def> = G_ATOMICRMW_UMIN Addr, Val, MMO`.
+ ///
+ /// Atomically replace the value at \p Addr with the unsigned minimum of \p
+ /// Val and the original value. Puts the original value from \p Addr in \p
+ /// OldValRes.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p OldValRes must be a generic virtual register.
+ /// \pre \p Addr must be a generic virtual register with pointer type.
+ /// \pre \p OldValRes, and \p Val must be generic virtual registers of the
+ /// same type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildAtomicRMWUmin(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO);
+};
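As a brief sketch of the atomic builders declared above (hypothetical translation code, not part of the header; MF, MRI, Builder, Addr, Val, CmpVal and NewVal are assumed to exist in the caller):

MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
    4 /*size in bytes*/, 4 /*alignment*/);
unsigned OldVal = MRI.createGenericVirtualRegister(LLT::scalar(32));
// OldVal receives the previous contents of *Addr; *Addr becomes *Addr + Val.
Builder.buildAtomicRMWAdd(OldVal, Addr, Val, *MMO);
// Compare-and-swap variant that also produces an s1 success flag.
unsigned Prev = MRI.createGenericVirtualRegister(LLT::scalar(32));
unsigned Success = MRI.createGenericVirtualRegister(LLT::scalar(1));
Builder.buildAtomicCmpXchgWithSuccess(Prev, Success, Addr, CmpVal, NewVal, *MMO);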
+
+/// A CRTP class that contains methods for building instructions that can
+/// be constant folded. MachineIRBuilders that want to inherit from this will
+/// need to implement buildBinaryOp (for constant folding binary ops).
+/// Alternatively, they can implement buildInstr(Opc, Dst, Uses...) to perform
+/// additional folding for Opc.
+template <typename Base>
+class FoldableInstructionsBuilder : public MachineIRBuilderBase {
+ Base &base() { return static_cast<Base &>(*this); }
+
+public:
+ using MachineIRBuilderBase::MachineIRBuilderBase;
+ /// Build and insert \p Res = G_ADD \p Op0, \p Op1
+ ///
+ /// G_ADD sets \p Res to the sum of integer parameters \p Op0 and \p Op1,
+ /// truncated to their width.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
+ /// with the same (scalar or vector) type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+
+ MachineInstrBuilder buildAdd(unsigned Dst, unsigned Src0, unsigned Src1) {
+ return base().buildBinaryOp(TargetOpcode::G_ADD, Dst, Src0, Src1);
+ }
+ template <typename DstTy, typename... UseArgsTy>
+ MachineInstrBuilder buildAdd(DstTy &&Ty, UseArgsTy &&... UseArgs) {
+ unsigned Res = base().getDestFromArg(Ty);
+ return base().buildAdd(Res, (base().getRegFromArg(UseArgs))...);
+ }
+
+ /// Build and insert \p Res = G_SUB \p Op0, \p Op1
+ ///
+ /// G_SUB sets \p Res to the difference of integer parameters \p Op0 and \p Op1,
+ /// truncated to their width.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
+ /// with the same (scalar or vector) type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+
+ MachineInstrBuilder buildSub(unsigned Dst, unsigned Src0, unsigned Src1) {
+ return base().buildBinaryOp(TargetOpcode::G_SUB, Dst, Src0, Src1);
+ }
+ template <typename DstTy, typename... UseArgsTy>
+ MachineInstrBuilder buildSub(DstTy &&Ty, UseArgsTy &&... UseArgs) {
+ unsigned Res = base().getDestFromArg(Ty);
+ return base().buildSub(Res, (base().getRegFromArg(UseArgs))...);
+ }
+
+ /// Build and insert \p Res = G_MUL \p Op0, \p Op1
+ ///
+ /// G_MUL sets \p Res to the product of integer parameters \p Op0 and \p Op1,
+ /// truncated to their width.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
+ /// with the same (scalar or vector) type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildMul(unsigned Dst, unsigned Src0, unsigned Src1) {
+ return base().buildBinaryOp(TargetOpcode::G_MUL, Dst, Src0, Src1);
+ }
+ template <typename DstTy, typename... UseArgsTy>
+ MachineInstrBuilder buildMul(DstTy &&Ty, UseArgsTy &&... UseArgs) {
+ unsigned Res = base().getDestFromArg(Ty);
+ return base().buildMul(Res, (base().getRegFromArg(UseArgs))...);
+ }
+
+ /// Build and insert \p Res = G_AND \p Op0, \p Op1
+ ///
+ /// G_AND sets \p Res to the bitwise and of integer parameters \p Op0 and \p
+ /// Op1.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
+ /// with the same (scalar or vector) type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+
+ MachineInstrBuilder buildAnd(unsigned Dst, unsigned Src0, unsigned Src1) {
+ return base().buildBinaryOp(TargetOpcode::G_AND, Dst, Src0, Src1);
+ }
+ template <typename DstTy, typename... UseArgsTy>
+ MachineInstrBuilder buildAnd(DstTy &&Ty, UseArgsTy &&... UseArgs) {
+ unsigned Res = base().getDestFromArg(Ty);
+ return base().buildAnd(Res, (base().getRegFromArg(UseArgs))...);
+ }
+
+ /// Build and insert \p Res = G_OR \p Op0, \p Op1
+ ///
+ /// G_OR sets \p Res to the bitwise or of integer parameters \p Op0 and \p
+ /// Op1.
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
+ /// with the same (scalar or vector) type.
+ ///
+ /// \return a MachineInstrBuilder for the newly created instruction.
+ MachineInstrBuilder buildOr(unsigned Dst, unsigned Src0, unsigned Src1) {
+ return base().buildBinaryOp(TargetOpcode::G_OR, Dst, Src0, Src1);
+ }
+ template <typename DstTy, typename... UseArgsTy>
+ MachineInstrBuilder buildOr(DstTy &&Ty, UseArgsTy &&... UseArgs) {
+ unsigned Res = base().getDestFromArg(Ty);
+ return base().buildOr(Res, (base().getRegFromArg(UseArgs))...);
+ }
+};
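To illustrate the CRTP hook described above, here is a hedged sketch of a builder whose buildBinaryOp folds a G_ADD of two known constants at build time. The class is hypothetical; buildConstant and getConstantVRegVal are assumed to be available from the base builder and GlobalISel/Utils.h respectively.

class ConstantFoldingIRBuilder
    : public FoldableInstructionsBuilder<ConstantFoldingIRBuilder> {
public:
  using FoldableInstructionsBuilder<
      ConstantFoldingIRBuilder>::FoldableInstructionsBuilder;

  MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Dst,
                                    unsigned Src0, unsigned Src1) {
    if (Opcode == TargetOpcode::G_ADD) {
      auto C0 = getConstantVRegVal(Src0, *getMRI());
      auto C1 = getConstantVRegVal(Src1, *getMRI());
      if (C0 && C1)
        return buildConstant(Dst, *C0 + *C1);  // fold instead of emitting G_ADD
    }
    // Fall back to emitting the binary operation unchanged.
    return buildInstr(Opcode).addDef(Dst).addUse(Src0).addUse(Src1);
  }
};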
+
+class MachineIRBuilder : public FoldableInstructionsBuilder<MachineIRBuilder> {
+public:
+ using FoldableInstructionsBuilder<
+ MachineIRBuilder>::FoldableInstructionsBuilder;
+ MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Dst,
+ unsigned Src0, unsigned Src1) {
+ validateBinaryOp(Dst, Src0, Src1);
+ return buildInstr(Opcode).addDef(Dst).addUse(Src0).addUse(Src1);
+ }
+ using FoldableInstructionsBuilder<MachineIRBuilder>::buildInstr;
+ /// DAG-like generic method for building arbitrary instructions as above.
+ /// \p Opc Opcode for the instruction.
+ /// \p Ty Either LLT/TargetRegisterClass/unsigned type for the destination.
+ /// \p Args Variadic list of uses of types (unsigned/MachineInstrBuilder).
+ /// Uses of type MachineInstrBuilder will perform
+ /// getOperand(0).getReg() to convert to register.
+ template <typename DstTy, typename... UseArgsTy>
+ MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty,
+ UseArgsTy &&... Args) {
+ auto MIB = buildInstr(Opc).addDef(getDestFromArg(Ty));
+ addUsesFromArgs(MIB, std::forward<UseArgsTy>(Args)...);
+ return MIB;
+ }
};
} // End namespace llvm.
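A short usage sketch of the builder interface after this split (MI, SrcReg0 and SrcReg1 are assumptions; the builder inserts before MI and reuses its debug location):

MachineIRBuilder MIRBuilder(MI);
LLT S32 = LLT::scalar(32);
// Templated forms accept an LLT destination (a fresh generic vreg is created)
// and either registers or MachineInstrBuilders as uses.
auto Undef = MIRBuilder.buildUndef(S32);
auto Sum   = MIRBuilder.buildAdd(S32, SrcReg0, SrcReg1);
// The DAG-like generic form works for any opcode.
auto Diff  = MIRBuilder.buildInstr(TargetOpcode::G_SUB, S32, Sum, Undef);
unsigned DiffReg = Diff->getOperand(0).getReg();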
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
index 676955c33fe9..c53ae416e60b 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
@@ -22,7 +22,7 @@
/// of an instruction should live. It asks the target which banks may be
/// used for each operand of the instruction and what is the cost. Then,
/// it chooses the solution which minimize the cost of the instruction plus
-/// the cost of any move that may be needed to to the values into the right
+/// the cost of any move that may be needed to get the values into the right
/// register bank.
/// In other words, the cost for an instruction on a register bank RegBank
/// is: Cost of I on RegBank plus the sum of the cost for bringing the
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h
index 5d758423f4e7..d5612e17393c 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h
@@ -42,7 +42,7 @@ private:
public:
RegisterBank(unsigned ID, const char *Name, unsigned Size,
- const uint32_t *ContainedRegClasses, unsigned NumRegClasses);
+ const uint32_t *CoveredClasses, unsigned NumRegClasses);
/// Get the identifier of this register bank.
unsigned getID() const { return ID; }
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h
index 02868b220984..82fd7eddb68a 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h
@@ -622,6 +622,8 @@ public:
/// \pre \p Reg is a virtual register that either has a bank or a class.
/// \returns The constrained register class, or nullptr if there is none.
/// \note This is a generic variant of MachineRegisterInfo::constrainRegClass
+ /// \note Use MachineRegisterInfo::constrainRegAttrs instead for any non-isel
+ /// purpose, including non-select passes of GlobalISel
static const TargetRegisterClass *
constrainGenericRegister(unsigned Reg, const TargetRegisterClass &RC,
MachineRegisterInfo &MRI);
diff --git a/contrib/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/contrib/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index 5864c15cc8eb..51e3a2732972 100644
--- a/contrib/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/contrib/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -19,8 +19,10 @@
namespace llvm {
+class AnalysisUsage;
class MachineFunction;
class MachineInstr;
+class MachineOperand;
class MachineOptimizationRemarkEmitter;
class MachineOptimizationRemarkMissed;
class MachineRegisterInfo;
@@ -32,6 +34,7 @@ class TargetRegisterInfo;
class TargetRegisterClass;
class Twine;
class ConstantFP;
+class APFloat;
/// Try to constrain Reg to the specified register class. If this fails,
/// create a new virtual register in the correct class and insert a COPY before
@@ -57,8 +60,21 @@ unsigned constrainOperandRegClass(const MachineFunction &MF,
const TargetInstrInfo &TII,
const RegisterBankInfo &RBI,
MachineInstr &InsertPt, const MCInstrDesc &II,
- unsigned Reg, unsigned OpIdx);
+ const MachineOperand &RegMO, unsigned OpIdx);
+/// Mutate the newly-selected instruction \p I to constrain its (possibly
+/// generic) virtual register operands to the instruction's register class.
+/// This could involve inserting COPYs before (for uses) or after (for defs).
+/// This requires the number of operands to match the instruction description.
+/// \returns whether operand regclass constraining succeeded.
+///
+// FIXME: Not all instructions have the same number of operands. We should
+// probably expose a constrain helper per operand and let the target selector
+// constrain individual registers, like fast-isel.
+bool constrainSelectedInstRegOperands(MachineInstr &I,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI);
/// Check whether an instruction \p MI is dead: it only defines dead virtual
/// registers, and doesn't have other side effects.
bool isTriviallyDead(const MachineInstr &MI, const MachineRegisterInfo &MRI);
@@ -85,5 +101,12 @@ const ConstantFP* getConstantFPVRegVal(unsigned VReg,
MachineInstr *getOpcodeDef(unsigned Opcode, unsigned Reg,
const MachineRegisterInfo &MRI);
+/// Returns an APFloat from Val converted to the appropriate size.
+APFloat getAPFloatFromSize(double Val, unsigned Size);
+
+/// Modify analysis usage so it preserves passes required for the SelectionDAG
+/// fallback.
+void getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU);
+
} // End namespace llvm.
#endif
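Two hedged sketches of the helpers added above; the surrounding functions are hypothetical fragments, and TII, TRI, RBI and AU are the usual objects available inside a target's instruction selector or a MachineFunctionPass:

// (1) Constrain the register operands of a freshly selected instruction.
static bool finishSelection(MachineInstr &I, const TargetInstrInfo &TII,
                            const TargetRegisterInfo &TRI,
                            const RegisterBankInfo &RBI) {
  // I has already been rewritten to a target opcode at this point.
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// (2) Fragment of a hypothetical GlobalISel pass keeping the SelectionDAG
//     fallback path usable.
void MyGISelPass::getAnalysisUsage(AnalysisUsage &AU) const {
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}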
diff --git a/contrib/llvm/include/llvm/CodeGen/ISDOpcodes.h b/contrib/llvm/include/llvm/CodeGen/ISDOpcodes.h
index d256849be9af..80bd796d5374 100644
--- a/contrib/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/contrib/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -377,6 +377,8 @@ namespace ISD {
/// When the 1st operand is a vector, the shift amount must be in the same
/// type. (TLI.getShiftAmountTy() will return the same type when the input
/// type is a vector.)
+ /// For rotates, the shift amount is treated as an unsigned amount modulo
+ /// the element size of the first operand.
SHL, SRA, SRL, ROTL, ROTR,
/// Byte Swap and Counting operators.
@@ -412,19 +414,11 @@ namespace ISD {
/// then the result type must also be a vector type.
SETCC,
- /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, and
- /// op #2 is a *carry value*. This operator checks the result of
- /// "LHS - RHS - Carry", and can be used to compare two wide integers:
- /// (setcce lhshi rhshi (subc lhslo rhslo) cc). Only valid for integers.
- /// FIXME: This node is deprecated in favor of SETCCCARRY.
- /// It is kept around for now to provide a smooth transition path
- /// toward the use of SETCCCARRY and will eventually be removed.
- SETCCE,
-
/// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but
/// op #2 is a boolean indicating if there is an incoming carry. This
/// operator checks the result of "LHS - RHS - Carry", and can be used to
- /// compare two wide integers: (setcce lhshi rhshi (subc lhslo rhslo) cc).
+ /// compare two wide integers:
+ /// (setcccarry lhshi rhshi (subcarry lhslo rhslo) cc).
/// Only valid for integers.
SETCCCARRY,
@@ -495,7 +489,8 @@ namespace ISD {
ZERO_EXTEND_VECTOR_INREG,
/// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned
- /// integer.
+ /// integer. These have the same semantics as fptosi and fptoui in IR. If
+ /// the FP value cannot fit in the integer type, the results are undefined.
FP_TO_SINT,
FP_TO_UINT,
@@ -779,6 +774,7 @@ namespace ISD {
ATOMIC_LOAD_ADD,
ATOMIC_LOAD_SUB,
ATOMIC_LOAD_AND,
+ ATOMIC_LOAD_CLR,
ATOMIC_LOAD_OR,
ATOMIC_LOAD_XOR,
ATOMIC_LOAD_NAND,
diff --git a/contrib/llvm/include/llvm/CodeGen/LatencyPriorityQueue.h b/contrib/llvm/include/llvm/CodeGen/LatencyPriorityQueue.h
index 988e6d6cb3a3..9b8d83ce77ca 100644
--- a/contrib/llvm/include/llvm/CodeGen/LatencyPriorityQueue.h
+++ b/contrib/llvm/include/llvm/CodeGen/LatencyPriorityQueue.h
@@ -17,6 +17,7 @@
#define LLVM_CODEGEN_LATENCYPRIORITYQUEUE_H
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Config/llvm-config.h"
namespace llvm {
class LatencyPriorityQueue;
@@ -26,7 +27,7 @@ namespace llvm {
LatencyPriorityQueue *PQ;
explicit latency_sort(LatencyPriorityQueue *pq) : PQ(pq) {}
- bool operator()(const SUnit* left, const SUnit* right) const;
+ bool operator()(const SUnit* LHS, const SUnit* RHS) const;
};
class LatencyPriorityQueue : public SchedulingPriorityQueue {
@@ -83,11 +84,15 @@ namespace llvm {
void remove(SUnit *SU) override;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override;
+#endif
+
// scheduledNode - As nodes are scheduled, we look to see if there are any
// successor nodes that have a single unscheduled predecessor. If so, that
// single predecessor has a higher priority, since scheduling it will make
// the node available.
- void scheduledNode(SUnit *Node) override;
+ void scheduledNode(SUnit *SU) override;
private:
void AdjustPriorityOfUnscheduledPreds(SUnit *SU);
diff --git a/contrib/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h b/contrib/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h
index 848ee1dc0dc6..221f16a03f16 100644
--- a/contrib/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h
@@ -23,7 +23,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
namespace llvm {
-/// \brief This is an alternative analysis pass to MachineBlockFrequencyInfo.
+/// This is an alternative analysis pass to MachineBlockFrequencyInfo.
/// The difference is that with this pass, the block frequencies are not
/// computed when the analysis pass is executed but rather when the BFI result
/// is explicitly requested by the analysis client.
@@ -49,7 +49,7 @@ private:
/// The function.
MachineFunction *MF = nullptr;
- /// \brief Calculate MBFI and all other analyses that's not available and
+ /// Calculate MBFI and all other analyses that are not available and
/// required by BFI.
MachineBlockFrequencyInfo &calculateIfNotAvailable() const;
@@ -58,10 +58,10 @@ public:
LazyMachineBlockFrequencyInfoPass();
- /// \brief Compute and return the block frequencies.
+ /// Compute and return the block frequencies.
MachineBlockFrequencyInfo &getBFI() { return calculateIfNotAvailable(); }
- /// \brief Compute and return the block frequencies.
+ /// Compute and return the block frequencies.
const MachineBlockFrequencyInfo &getBFI() const {
return calculateIfNotAvailable();
}
diff --git a/contrib/llvm/include/llvm/CodeGen/LiveInterval.h b/contrib/llvm/include/llvm/CodeGen/LiveInterval.h
index f4fa872c7f5b..cdf9ad2588cf 100644
--- a/contrib/llvm/include/llvm/CodeGen/LiveInterval.h
+++ b/contrib/llvm/include/llvm/CodeGen/LiveInterval.h
@@ -326,7 +326,7 @@ namespace llvm {
/// createDeadDef - Make sure the range has a value defined at Def.
/// If one already exists, return it. Otherwise allocate a new value and
/// add liveness for a dead def.
- VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator &VNInfoAllocator);
+ VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator &VNIAlloc);
/// Create a def of value @p VNI. Return @p VNI. If there already exists
/// a definition at VNI->def, the value defined there must be @p VNI.
@@ -454,7 +454,7 @@ namespace llvm {
/// overlapsFrom - Return true if the intersection of the two live ranges
/// is not empty. The specified iterator is a hint that we can begin
/// scanning the Other range starting at I.
- bool overlapsFrom(const LiveRange &Other, const_iterator I) const;
+ bool overlapsFrom(const LiveRange &Other, const_iterator StartPos) const;
/// Returns true if all segments of the @p Other live range are completely
/// covered by this live range.
@@ -482,7 +482,7 @@ namespace llvm {
/// @p Use, return {nullptr, false}. If there is an "undef" before @p Use,
/// return {nullptr, true}.
std::pair<VNInfo*,bool> extendInBlock(ArrayRef<SlotIndex> Undefs,
- SlotIndex StartIdx, SlotIndex Use);
+ SlotIndex StartIdx, SlotIndex Kill);
/// Simplified version of the above "extendInBlock", which assumes that
/// no register lanes are undefined by <def,read-undef> operands.
@@ -609,7 +609,7 @@ namespace llvm {
void print(raw_ostream &OS) const;
void dump() const;
- /// \brief Walk the range and assert if any invariants fail to hold.
+ /// Walk the range and assert if any invariants fail to hold.
///
/// Note that this is a no-op when asserts are disabled.
#ifdef NDEBUG
@@ -791,7 +791,7 @@ namespace llvm {
/// L00E0 and L0010 and the L000F lane into L0007 and L0008. The Mod
/// function will be applied to the L0010 and L0008 subranges.
void refineSubRanges(BumpPtrAllocator &Allocator, LaneBitmask LaneMask,
- std::function<void(LiveInterval::SubRange&)> Mod);
+ std::function<void(LiveInterval::SubRange&)> Apply);
bool operator<(const LiveInterval& other) const {
const SlotIndex &thisIndex = beginIndex();
@@ -802,7 +802,7 @@ namespace llvm {
void print(raw_ostream &OS) const;
void dump() const;
- /// \brief Walks the interval and assert if any invariants fail to hold.
+ /// Walks the interval and assert if any invariants fail to hold.
///
/// Note that this is a no-op when asserts are disabled.
#ifdef NDEBUG
diff --git a/contrib/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/contrib/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
index b922e543c856..9e2799bd4414 100644
--- a/contrib/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/contrib/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -154,7 +154,7 @@ public:
unsigned MaxInterferingRegs = std::numeric_limits<unsigned>::max());
// Was this virtual register visited during collectInterferingVRegs?
- bool isSeenInterference(LiveInterval *VReg) const;
+ bool isSeenInterference(LiveInterval *VirtReg) const;
// Did collectInterferingVRegs collect all interferences?
bool seenAllInterferences() const { return SeenAllInterferences; }
diff --git a/contrib/llvm/include/llvm/CodeGen/LiveIntervals.h b/contrib/llvm/include/llvm/CodeGen/LiveIntervals.h
index 1150f3c1c47b..291a07a712cb 100644
--- a/contrib/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/contrib/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -105,7 +105,7 @@ class VirtRegMap;
/// Calculate the spill weight to assign to a single instruction.
static float getSpillWeight(bool isDef, bool isUse,
const MachineBlockFrequencyInfo *MBFI,
- const MachineInstr &Instr);
+ const MachineInstr &MI);
/// Calculate the spill weight to assign to a single instruction.
static float getSpillWeight(bool isDef, bool isUse,
@@ -462,6 +462,10 @@ class VirtRegMap;
void computeRegUnitRange(LiveRange&, unsigned Unit);
void computeVirtRegInterval(LiveInterval&);
+ using ShrinkToUsesWorkList = SmallVector<std::pair<SlotIndex, VNInfo*>, 16>;
+ void extendSegmentsToUses(LiveRange &Segments,
+ ShrinkToUsesWorkList &WorkList, unsigned Reg,
+ LaneBitmask LaneMask);
/// Helper function for repairIntervalsInRange(), walks backwards and
/// creates/modifies live segments in \p LR to match the operands found.
diff --git a/contrib/llvm/include/llvm/CodeGen/LivePhysRegs.h b/contrib/llvm/include/llvm/CodeGen/LivePhysRegs.h
index f9aab0d09e1f..301a45066b4c 100644
--- a/contrib/llvm/include/llvm/CodeGen/LivePhysRegs.h
+++ b/contrib/llvm/include/llvm/CodeGen/LivePhysRegs.h
@@ -44,7 +44,7 @@ class MachineOperand;
class MachineRegisterInfo;
class raw_ostream;
-/// \brief A set of physical registers with utility functions to track liveness
+/// A set of physical registers with utility functions to track liveness
/// when walking backward/forward through a basic block.
class LivePhysRegs {
const TargetRegisterInfo *TRI = nullptr;
@@ -84,7 +84,7 @@ public:
LiveRegs.insert(*SubRegs);
}
- /// \brief Removes a physical register, all its sub-registers, and all its
+ /// Removes a physical register, all its sub-registers, and all its
/// super-registers from the set.
void removeReg(unsigned Reg) {
assert(TRI && "LivePhysRegs is not initialized.");
@@ -98,7 +98,7 @@ public:
SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> *Clobbers =
nullptr);
- /// \brief Returns true if register \p Reg is contained in the set. This also
+ /// Returns true if register \p Reg is contained in the set. This also
/// works if only the super register of \p Reg has been defined, because
/// addReg() always adds all sub-registers to the set as well.
/// Note: Returns false if just some sub registers are live, use available()
@@ -155,7 +155,7 @@ public:
void dump() const;
private:
- /// \brief Adds live-in registers from basic block \p MBB, taking associated
+ /// Adds live-in registers from basic block \p MBB, taking associated
/// lane masks into consideration.
void addBlockLiveIns(const MachineBasicBlock &MBB);
@@ -169,7 +169,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const LivePhysRegs& LR) {
return OS;
}
-/// \brief Computes registers live-in to \p MBB assuming all of its successors
+/// Computes registers live-in to \p MBB assuming all of its successors
/// live-in lists are up-to-date. Puts the result into the given LivePhysReg
/// instance \p LiveRegs.
void computeLiveIns(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB);
@@ -185,6 +185,13 @@ void addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs);
void computeAndAddLiveIns(LivePhysRegs &LiveRegs,
MachineBasicBlock &MBB);
+/// Convenience function for recomputing the live-ins for \p MBB.
+static inline void recomputeLiveIns(MachineBasicBlock &MBB) {
+ LivePhysRegs LPR;
+ MBB.clearLiveIns();
+ computeAndAddLiveIns(LPR, MBB);
+}
+
} // end namespace llvm
#endif // LLVM_CODEGEN_LIVEPHYSREGS_H
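A brief sketch of the convenience helper above, assuming MF is the current MachineFunction and OldMBB a block whose control flow was just rewritten; the successors' live-in lists must already be up to date:

MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(OldMBB.getBasicBlock());
MF.insert(std::next(OldMBB.getIterator()), NewMBB);
// ... move instructions into NewMBB and fix up terminators/successor edges ...
recomputeLiveIns(*NewMBB);
recomputeLiveIns(OldMBB);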
diff --git a/contrib/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/contrib/llvm/include/llvm/CodeGen/LiveRangeEdit.h
index 84bccde0caa2..53830297c525 100644
--- a/contrib/llvm/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/contrib/llvm/include/llvm/CodeGen/LiveRangeEdit.h
@@ -117,10 +117,13 @@ private:
/// registers are created.
void MRI_NoteNewVirtualRegister(unsigned VReg) override;
- /// \brief Check if MachineOperand \p MO is a last use/kill either in the
+ /// Check if MachineOperand \p MO is a last use/kill either in the
/// main live range of \p LI or in one of the matching subregister ranges.
bool useIsKill(const LiveInterval &LI, const MachineOperand &MO) const;
+ /// Create a new empty interval based on OldReg.
+ LiveInterval &createEmptyIntervalFrom(unsigned OldReg, bool createSubRanges);
+
public:
/// Create a LiveRangeEdit for breaking down parent into smaller pieces.
/// @param parent The register being spilled or split.
@@ -174,16 +177,13 @@ public:
return makeArrayRef(NewRegs).slice(FirstNew);
}
- /// createEmptyIntervalFrom - Create a new empty interval based on OldReg.
- LiveInterval &createEmptyIntervalFrom(unsigned OldReg);
-
/// createFrom - Create a new virtual register based on OldReg.
unsigned createFrom(unsigned OldReg);
/// create - Create a new register with the same class and original slot as
/// parent.
LiveInterval &createEmptyInterval() {
- return createEmptyIntervalFrom(getReg());
+ return createEmptyIntervalFrom(getReg(), true);
}
unsigned create() { return createFrom(getReg()); }
@@ -233,12 +233,6 @@ public:
return Rematted.count(ParentVNI);
}
- void markDeadRemat(MachineInstr *inst) {
- // DeadRemats is an optional field.
- if (DeadRemats)
- DeadRemats->insert(inst);
- }
-
/// eraseVirtReg - Notify the delegate that Reg is no longer in use, and try
/// to erase it from LIS.
void eraseVirtReg(unsigned Reg);
diff --git a/contrib/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/contrib/llvm/include/llvm/CodeGen/LiveRegMatrix.h
index fa6827f6b1f9..f62a55c73085 100644
--- a/contrib/llvm/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/contrib/llvm/include/llvm/CodeGen/LiveRegMatrix.h
@@ -107,6 +107,13 @@ public:
/// with the highest enum value is returned.
InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg);
+ /// Check for interference in the segment [Start, End) that may prevent
+ /// assignment to PhysReg. If this function returns true, there is
+ /// interference in the segment [Start, End) of some other interval already
+ /// assigned to PhysReg. If this function returns false, PhysReg is free at
+ /// the segment [Start, End).
+ bool checkInterference(SlotIndex Start, SlotIndex End, unsigned PhysReg);
+
/// Assign VirtReg to PhysReg.
/// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
/// update VirtRegMap. The live range is expected to be available in PhysReg.
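[Editor's note] The new segment-based checkInterference() overload lets an allocator probe a candidate physical register over an explicit [Start, End) range. A hedged sketch of the intended use; the LiveRegMatrix and slot indexes are assumed to come from the enclosing register-allocation pass:

#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/SlotIndexes.h"
using namespace llvm;

// Returns true when no interval already assigned to PhysReg overlaps the
// segment [Start, End), i.e. PhysReg is free for that range.
static bool isPhysRegFreeOver(LiveRegMatrix &Matrix, SlotIndex Start,
                              SlotIndex End, unsigned PhysReg) {
  return !Matrix.checkInterference(Start, End, PhysReg);
}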
diff --git a/contrib/llvm/include/llvm/CodeGen/LiveRegUnits.h b/contrib/llvm/include/llvm/CodeGen/LiveRegUnits.h
index dc4956da9637..249545906e01 100644
--- a/contrib/llvm/include/llvm/CodeGen/LiveRegUnits.h
+++ b/contrib/llvm/include/llvm/CodeGen/LiveRegUnits.h
@@ -16,6 +16,7 @@
#define LLVM_CODEGEN_LIVEREGUNITS_H
#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -40,6 +41,36 @@ public:
init(TRI);
}
+ /// For a machine instruction \p MI, adds all register units it uses to
+ /// \p UsedRegUnits and all register units it defines or clobbers to \p ModifiedRegUnits. This is
+ /// useful when walking over a range of instructions to track registers
+ /// used or defined separately.
+ static void accumulateUsedDefed(const MachineInstr &MI,
+ LiveRegUnits &ModifiedRegUnits,
+ LiveRegUnits &UsedRegUnits,
+ const TargetRegisterInfo *TRI) {
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isRegMask())
+ ModifiedRegUnits.addRegsInMask(O->getRegMask());
+ if (!O->isReg())
+ continue;
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (O->isDef()) {
+ // Some architectures (e.g. AArch64 XZR/WZR) have registers that are
+ // constant and may be used as destinations to indicate the generated
+ // value is discarded. No need to track such case as a def.
+ if (!TRI->isConstantPhysReg(Reg))
+ ModifiedRegUnits.addReg(Reg);
+ } else {
+ assert(O->isUse() && "Reg operand not a def and not a use");
+ UsedRegUnits.addReg(Reg);
+ }
+ }
+ return;
+ }
+
/// Initialize and clear the set.
void init(const TargetRegisterInfo &TRI) {
this->TRI = &TRI;
@@ -59,7 +90,7 @@ public:
Units.set(*Unit);
}
- /// \brief Adds register units covered by physical register \p Reg that are
+ /// Adds register units covered by physical register \p Reg that are
/// part of the lanemask \p Mask.
void addRegMasked(unsigned Reg, LaneBitmask Mask) {
for (MCRegUnitMaskIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
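[Editor's note] The static accumulateUsedDefed() helper above is meant to be driven over a range of instructions, keeping uses and defs/clobbers in separate sets. A sketch under that assumption; the iterator range and TRI are supplied by a hypothetical caller:

#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;

static void collectUsedAndModified(MachineBasicBlock::iterator Begin,
                                   MachineBasicBlock::iterator End,
                                   const TargetRegisterInfo *TRI) {
  LiveRegUnits ModifiedRegUnits(*TRI), UsedRegUnits(*TRI);
  for (auto I = Begin; I != End; ++I)
    LiveRegUnits::accumulateUsedDefed(*I, ModifiedRegUnits, UsedRegUnits, TRI);
  // Both sets can now be queried separately by the caller.
}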
diff --git a/contrib/llvm/include/llvm/CodeGen/LoopTraversal.h b/contrib/llvm/include/llvm/CodeGen/LoopTraversal.h
new file mode 100644
index 000000000000..750da0143c0d
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/LoopTraversal.h
@@ -0,0 +1,116 @@
+//==------ llvm/CodeGen/LoopTraversal.h - Loop Traversal -*- C++ -*---------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Loop Traversal logic.
+///
+/// This class provides the basic blocks traversal order used by passes like
+/// ReachingDefAnalysis and ExecutionDomainFix.
+/// It identifies basic blocks that are part of loops and should to be visited
+/// twice and returns efficient traversal order for all the blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LOOPTRAVERSAL_H
+#define LLVM_CODEGEN_LOOPTRAVERSAL_H
+
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineFunction;
+
+/// This class provides the basic blocks traversal order used by passes like
+/// ReachingDefAnalysis and ExecutionDomainFix.
+/// It identifies basic blocks that are part of loops and should be visited
+/// twice and returns efficient traversal order for all the blocks.
+///
+/// We want to visit every instruction in every basic block in order to update
+/// its execution domain or collect clearance information. However, for the
+/// clearance calculation, we need to know clearances from all predecessors
+/// (including any backedges), therefore we need to visit some blocks twice.
+/// As an example, consider the following loop.
+///
+///
+/// PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
+/// ^ |
+/// +----------------------------------+
+///
+/// The iteration order this pass will return is as follows:
+/// Optimized: PH A B C A' B' C' D
+///
+/// The basic block order is constructed as follows:
+/// Once we finish processing some block, we update the counters in MBBInfos
+/// and re-process any successors that are now 'done'.
+/// We call a block that is ready for its final round of processing `done`
+/// (isBlockDone), e.g. when all predecessor information is known.
+///
+/// Note that a naive traversal order would be to do two complete passes over
+/// all basic blocks/instructions, the first for recording clearances, the
+/// second for updating clearance based on backedges.
+/// However, for functions without backedges, or functions with a lot of
+/// straight-line code, and a small loop, that would be a lot of unnecessary
+/// work (since only the BBs that are part of the loop require two passes).
+///
+/// E.g., the naive iteration order for the above example is as follows:
+/// Naive: PH A B C D A' B' C' D'
+///
+/// In the optimized approach we avoid processing D twice, because we
+/// can entirely process the predecessors before getting to D.
+class LoopTraversal {
+private:
+ struct MBBInfo {
+ /// Whether we have gotten to this block in primary processing yet.
+ bool PrimaryCompleted = false;
+
+ /// The number of predecessors for which primary processing has completed
+ unsigned IncomingProcessed = 0;
+
+ /// The value of `IncomingProcessed` at the start of primary processing
+ unsigned PrimaryIncoming = 0;
+
+ /// The number of predecessors for which all processing steps are done.
+ unsigned IncomingCompleted = 0;
+
+ MBBInfo() = default;
+ };
+ using MBBInfoMap = SmallVector<MBBInfo, 4>;
+ /// Helps keep track of whether we processed this block and all its predecessors.
+ MBBInfoMap MBBInfos;
+
+public:
+ struct TraversedMBBInfo {
+ /// The basic block.
+ MachineBasicBlock *MBB = nullptr;
+
+ /// True if this is the first time we process the basic block.
+ bool PrimaryPass = true;
+
+ /// True if the block is ready for its final round of processing.
+ bool IsDone = true;
+
+ TraversedMBBInfo(MachineBasicBlock *BB = nullptr, bool Primary = true,
+ bool Done = true)
+ : MBB(BB), PrimaryPass(Primary), IsDone(Done) {}
+ };
+ LoopTraversal() {}
+
+ /// Identifies basic blocks that are part of loops and should be
+ /// visited twice and returns efficient traversal order for all the blocks.
+ typedef SmallVector<TraversedMBBInfo, 4> TraversalOrder;
+ TraversalOrder traverse(MachineFunction &MF);
+
+private:
+ /// Returns true if the block is ready for its final round of processing.
+ bool isBlockDone(MachineBasicBlock *MBB);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_LOOPTRAVERSAL_H
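[Editor's note] A hedged sketch of how a consumer such as ReachingDefAnalysis or ExecutionDomainFix is described to drive this interface; processing each traversed block is left as a hypothetical callback:

#include "llvm/CodeGen/LoopTraversal.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

static void visitInTraversalOrder(MachineFunction &MF) {
  LoopTraversal Traversal;
  LoopTraversal::TraversalOrder Order = Traversal.traverse(MF);
  for (const LoopTraversal::TraversedMBBInfo &Info : Order) {
    // Info.PrimaryPass: first visit of the block.
    // Info.IsDone: all predecessor information is known, so this is the
    // final round of processing for Info.MBB.
    (void)Info; // A real pass would process Info.MBB here.
  }
}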
diff --git a/contrib/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h b/contrib/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h
index b631a8c0122a..e199a1f69ad7 100644
--- a/contrib/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h
+++ b/contrib/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h
@@ -45,7 +45,7 @@ public:
/// \returns nullptr if a parsing error occurred.
std::unique_ptr<Module> parseIRModule();
- /// \brief Parses MachineFunctions in the MIR file and add them to the given
+ /// Parses MachineFunctions in the MIR file and add them to the given
/// MachineModuleInfo \p MMI.
///
/// \returns true if an error occurred.
diff --git a/contrib/llvm/include/llvm/CodeGen/MIRPrinter.h b/contrib/llvm/include/llvm/CodeGen/MIRPrinter.h
index c73adc3f2b11..078c4b2f6072 100644
--- a/contrib/llvm/include/llvm/CodeGen/MIRPrinter.h
+++ b/contrib/llvm/include/llvm/CodeGen/MIRPrinter.h
@@ -38,7 +38,7 @@ void printMIR(raw_ostream &OS, const MachineFunction &MF);
/// this function and the parser will use this function to construct a list if
/// it is missing.
void guessSuccessors(const MachineBasicBlock &MBB,
- SmallVectorImpl<MachineBasicBlock*> &Successors,
+ SmallVectorImpl<MachineBasicBlock*> &Result,
bool &IsFallthrough);
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/contrib/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index ba40e522e261..7f46406c4789 100644
--- a/contrib/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/contrib/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -258,11 +258,11 @@ template <> struct MappingTraits<MachineStackObject> {
YamlIO.mapOptional("callee-saved-restored", Object.CalleeSavedRestored,
true);
YamlIO.mapOptional("local-offset", Object.LocalOffset, Optional<int64_t>());
- YamlIO.mapOptional("di-variable", Object.DebugVar,
+ YamlIO.mapOptional("debug-info-variable", Object.DebugVar,
StringValue()); // Don't print it out when it's empty.
- YamlIO.mapOptional("di-expression", Object.DebugExpr,
+ YamlIO.mapOptional("debug-info-expression", Object.DebugExpr,
StringValue()); // Don't print it out when it's empty.
- YamlIO.mapOptional("di-location", Object.DebugLoc,
+ YamlIO.mapOptional("debug-info-location", Object.DebugLoc,
StringValue()); // Don't print it out when it's empty.
}
@@ -283,6 +283,9 @@ struct FixedMachineStackObject {
bool IsAliased = false;
StringValue CalleeSavedRegister;
bool CalleeSavedRestored = true;
+ StringValue DebugVar;
+ StringValue DebugExpr;
+ StringValue DebugLoc;
bool operator==(const FixedMachineStackObject &Other) const {
return ID == Other.ID && Type == Other.Type && Offset == Other.Offset &&
@@ -290,7 +293,9 @@ struct FixedMachineStackObject {
StackID == Other.StackID &&
IsImmutable == Other.IsImmutable && IsAliased == Other.IsAliased &&
CalleeSavedRegister == Other.CalleeSavedRegister &&
- CalleeSavedRestored == Other.CalleeSavedRestored;
+ CalleeSavedRestored == Other.CalleeSavedRestored &&
+ DebugVar == Other.DebugVar && DebugExpr == Other.DebugExpr
+ && DebugLoc == Other.DebugLoc;
}
};
@@ -321,6 +326,12 @@ template <> struct MappingTraits<FixedMachineStackObject> {
StringValue()); // Don't print it out when it's empty.
YamlIO.mapOptional("callee-saved-restored", Object.CalleeSavedRestored,
true);
+ YamlIO.mapOptional("debug-info-variable", Object.DebugVar,
+ StringValue()); // Don't print it out when it's empty.
+ YamlIO.mapOptional("debug-info-expression", Object.DebugExpr,
+ StringValue()); // Don't print it out when it's empty.
+ YamlIO.mapOptional("debug-info-location", Object.DebugLoc,
+ StringValue()); // Don't print it out when it's empty.
}
static const bool flow = true;
@@ -417,6 +428,7 @@ struct MachineFrameInfo {
bool HasOpaqueSPAdjustment = false;
bool HasVAStart = false;
bool HasMustTailInVarArgFunc = false;
+ unsigned LocalFrameSize = 0;
StringValue SavePoint;
StringValue RestorePoint;
@@ -434,6 +446,7 @@ struct MachineFrameInfo {
HasOpaqueSPAdjustment == Other.HasOpaqueSPAdjustment &&
HasVAStart == Other.HasVAStart &&
HasMustTailInVarArgFunc == Other.HasMustTailInVarArgFunc &&
+ LocalFrameSize == Other.LocalFrameSize &&
SavePoint == Other.SavePoint && RestorePoint == Other.RestorePoint;
}
};
@@ -457,6 +470,7 @@ template <> struct MappingTraits<MachineFrameInfo> {
YamlIO.mapOptional("hasVAStart", MFI.HasVAStart, false);
YamlIO.mapOptional("hasMustTailInVarArgFunc", MFI.HasMustTailInVarArgFunc,
false);
+ YamlIO.mapOptional("localFrameSize", MFI.LocalFrameSize, (unsigned)0);
YamlIO.mapOptional("savePoint", MFI.SavePoint,
StringValue()); // Don't print it out when it's empty.
YamlIO.mapOptional("restorePoint", MFI.RestorePoint,
@@ -472,6 +486,7 @@ struct MachineFunction {
bool Legalized = false;
bool RegBankSelected = false;
bool Selected = false;
+ bool FailedISel = false;
// Register information
bool TracksRegLiveness = false;
std::vector<VirtualRegisterDefinition> VirtualRegisters;
@@ -495,6 +510,7 @@ template <> struct MappingTraits<MachineFunction> {
YamlIO.mapOptional("legalized", MF.Legalized, false);
YamlIO.mapOptional("regBankSelected", MF.RegBankSelected, false);
YamlIO.mapOptional("selected", MF.Selected, false);
+ YamlIO.mapOptional("failedISel", MF.FailedISel, false);
YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false);
YamlIO.mapOptional("registers", MF.VirtualRegisters,
std::vector<VirtualRegisterDefinition>());
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/contrib/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 89210e16629e..ace33efd8713 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -58,7 +58,7 @@ private:
public:
void addNodeToList(MachineInstr *N);
void removeNodeFromList(MachineInstr *N);
- void transferNodesFromList(ilist_traits &OldList, instr_iterator First,
+ void transferNodesFromList(ilist_traits &FromList, instr_iterator First,
instr_iterator Last);
void deleteNode(MachineInstr *MI);
};
@@ -115,13 +115,18 @@ private:
/// branch.
bool AddressTaken = false;
+ /// Indicate that this basic block is the entry block of an EH scope, i.e.,
+ /// the block that used to have a catchpad or cleanuppad instruction in the
+ /// LLVM IR.
+ bool IsEHScopeEntry = false;
+
/// Indicate that this basic block is the entry block of an EH funclet.
bool IsEHFuncletEntry = false;
/// Indicate that this basic block is the entry block of a cleanup funclet.
bool IsCleanupFuncletEntry = false;
- /// \brief since getSymbol is a relatively heavy-weight operation, the symbol
+ /// since getSymbol is a relatively heavy-weight operation, the symbol
/// is only computed once and is cached.
mutable MCSymbol *CachedMCSymbol = nullptr;
@@ -225,6 +230,14 @@ public:
return make_range(getFirstTerminator(), end());
}
+ /// Returns a range that iterates over the phis in the basic block.
+ inline iterator_range<iterator> phis() {
+ return make_range(begin(), getFirstNonPHI());
+ }
+ inline iterator_range<const_iterator> phis() const {
+ return const_cast<MachineBasicBlock *>(this)->phis();
+ }
+
// Machine-CFG iterators
using pred_iterator = std::vector<MachineBasicBlock *>::iterator;
using const_pred_iterator = std::vector<MachineBasicBlock *>::const_iterator;
@@ -367,6 +380,14 @@ public:
bool hasEHPadSuccessor() const;
+ /// Returns true if this is the entry block of an EH scope, i.e., the block
+ /// that used to have a catchpad or cleanuppad instruction in the LLVM IR.
+ bool isEHScopeEntry() const { return IsEHScopeEntry; }
+
+ /// Indicates if this is the entry block of an EH scope, i.e., the block that
+ /// used to have a catchpad or cleanuppad instruction in the LLVM IR.
+ void setIsEHScopeEntry(bool V = true) { IsEHScopeEntry = V; }
+
/// Returns true if this is the entry block of an EH funclet.
bool isEHFuncletEntry() const { return IsEHFuncletEntry; }
@@ -456,6 +477,11 @@ public:
/// probabilities may need to be normalized.
void copySuccessor(MachineBasicBlock *Orig, succ_iterator I);
+ /// Split the old successor into old plus new and updates the probability
+ /// info.
+ void splitSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New,
+ bool NormalizeSuccProbs = false);
+
/// Transfers all the successors from MBB to this machine basic block (i.e.,
/// copies all the successors FromMBB and remove all the successors from
/// FromMBB).
@@ -553,7 +579,7 @@ public:
/// Check if the edge between this block and the given successor \p
/// Succ, can be split. If this returns true a subsequent call to
/// SplitCriticalEdge is guaranteed to return a valid basic block if
- /// no changes occured in the meantime.
+ /// no changes occurred in the meantime.
bool canSplitCriticalEdge(const MachineBasicBlock *Succ) const;
void pop_front() { Insts.pop_front(); }
@@ -692,12 +718,19 @@ public:
bool IsCond);
/// Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE
- /// instructions. Return UnknownLoc if there is none.
+ /// and DBG_LABEL instructions. Return UnknownLoc if there is none.
DebugLoc findDebugLoc(instr_iterator MBBI);
DebugLoc findDebugLoc(iterator MBBI) {
return findDebugLoc(MBBI.getInstrIterator());
}
+ /// Find the previous valid DebugLoc preceding MBBI, skipping any DBG_VALUE
+ /// instructions. Return UnknownLoc if there is none.
+ DebugLoc findPrevDebugLoc(instr_iterator MBBI);
+ DebugLoc findPrevDebugLoc(iterator MBBI) {
+ return findPrevDebugLoc(MBBI.getInstrIterator());
+ }
+
/// Find and return the merged DebugLoc of the branch instructions of the
/// block. Return UnknownLoc if there is none.
DebugLoc findBranchDebugLoc();
@@ -724,9 +757,10 @@ public:
// Debugging methods.
void dump() const;
- void print(raw_ostream &OS, const SlotIndexes* = nullptr) const;
+ void print(raw_ostream &OS, const SlotIndexes * = nullptr,
+ bool IsStandalone = true) const;
void print(raw_ostream &OS, ModuleSlotTracker &MST,
- const SlotIndexes* = nullptr) const;
+ const SlotIndexes * = nullptr, bool IsStandalone = true) const;
// Printing method used by LoopInfo.
void printAsOperand(raw_ostream &OS, bool PrintType = true) const;
@@ -881,7 +915,7 @@ public:
/// const_instr_iterator} and the respective reverse iterators.
template<typename IterT>
inline IterT skipDebugInstructionsForward(IterT It, IterT End) {
- while (It != End && It->isDebugValue())
+ while (It != End && It->isDebugInstr())
It++;
return It;
}
@@ -892,7 +926,7 @@ inline IterT skipDebugInstructionsForward(IterT It, IterT End) {
/// const_instr_iterator} and the respective reverse iterators.
template<class IterT>
inline IterT skipDebugInstructionsBackward(IterT It, IterT Begin) {
- while (It != Begin && It->isDebugValue())
+ while (It != Begin && It->isDebugInstr())
It--;
return It;
}
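[Editor's note] Among the MachineBasicBlock additions above, the phis() range is the most self-contained. A small sketch that assumes nothing beyond the new accessor:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

static unsigned countPhis(MachineBasicBlock &MBB) {
  unsigned NumPHIs = 0;
  // Iterates from begin() up to getFirstNonPHI(), per the new phis() range.
  for (MachineInstr &PHI : MBB.phis()) {
    (void)PHI;
    ++NumPHIs;
  }
  return NumPHIs;
}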
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineConstantPool.h b/contrib/llvm/include/llvm/CodeGen/MachineConstantPool.h
index 1705a0f7e59b..b0b5420a884b 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineConstantPool.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineConstantPool.h
@@ -63,7 +63,7 @@ inline raw_ostream &operator<<(raw_ostream &OS,
/// This class is a data container for one entry in a MachineConstantPool.
/// It contains a pointer to the value and an offset from the start of
/// the constant pool.
-/// @brief An entry in a MachineConstantPool
+/// An entry in a MachineConstantPool
class MachineConstantPoolEntry {
public:
/// The constant itself.
@@ -117,7 +117,7 @@ public:
/// the use of MO_ConstantPoolIndex values. When emitting assembly or machine
/// code, these virtual address references are converted to refer to the
/// address of the function constant pool values.
-/// @brief The machine constant pool.
+/// The machine constant pool.
class MachineConstantPool {
unsigned PoolAlignment; ///< The alignment for the pool.
std::vector<MachineConstantPoolEntry> Constants; ///< The pool of constants.
@@ -128,7 +128,7 @@ class MachineConstantPool {
const DataLayout &getDataLayout() const { return DL; }
public:
- /// @brief The only constructor.
+ /// The only constructor.
explicit MachineConstantPool(const DataLayout &DL)
: PoolAlignment(1), DL(DL) {}
~MachineConstantPool();
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h b/contrib/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h
index ffbcc62bfa36..75d75bc3669a 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h
@@ -37,9 +37,9 @@ public:
MachineDominanceFrontier();
- DominanceFrontierBase<MachineBasicBlock, false> &getBase() { return Base; }
+ ForwardDominanceFrontierBase<MachineBasicBlock> &getBase() { return Base; }
- const SmallVectorImpl<MachineBasicBlock *> &getRoots() const {
+ const SmallVectorImpl<MachineBasicBlock *> &getRoots() const {
return Base.getRoots();
}
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineDominators.h b/contrib/llvm/include/llvm/CodeGen/MachineDominators.h
index 98fdb51aae2f..e3d3d169db97 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineDominators.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineDominators.h
@@ -45,7 +45,7 @@ using MachineDomTreeNode = DomTreeNodeBase<MachineBasicBlock>;
/// compute a normal dominator tree.
///
class MachineDominatorTree : public MachineFunctionPass {
- /// \brief Helper structure used to hold all the basic blocks
+ /// Helper structure used to hold all the basic blocks
/// involved in the split of a critical edge.
struct CriticalEdge {
MachineBasicBlock *FromBB;
@@ -53,12 +53,12 @@ class MachineDominatorTree : public MachineFunctionPass {
MachineBasicBlock *NewBB;
};
- /// \brief Pile up all the critical edges to be split.
+ /// Pile up all the critical edges to be split.
/// The splitting of a critical edge is local and thus, it is possible
/// to apply several of those changes at the same time.
mutable SmallVector<CriticalEdge, 32> CriticalEdgesToSplit;
- /// \brief Remember all the basic blocks that are inserted during
+ /// Remember all the basic blocks that are inserted during
/// edge splitting.
/// Invariant: NewBBs == all the basic blocks contained in the NewBB
/// field of all the elements of CriticalEdgesToSplit.
@@ -69,7 +69,7 @@ class MachineDominatorTree : public MachineFunctionPass {
/// The DominatorTreeBase that is used to compute a normal dominator tree
std::unique_ptr<DomTreeBase<MachineBasicBlock>> DT;
- /// \brief Apply all the recorded critical edges to the DT.
+ /// Apply all the recorded critical edges to the DT.
/// This updates the underlying DT information in a way that uses
/// the fast query path of DT as much as possible.
///
@@ -228,7 +228,7 @@ public:
void print(raw_ostream &OS, const Module*) const override;
- /// \brief Record that the critical edge (FromBB, ToBB) has been
+ /// Record that the critical edge (FromBB, ToBB) has been
/// split with NewBB.
/// This is best to use this method instead of directly update the
/// underlying information, because this helps mitigating the
@@ -249,12 +249,6 @@ public:
"A basic block inserted via edge splitting cannot appear twice");
CriticalEdgesToSplit.push_back({FromBB, ToBB, NewBB});
}
-
- /// \brief Verify the correctness of the domtree by re-computing it.
- ///
- /// This should only be used for debugging as it aborts the program if the
- /// verification fails.
- void verifyDomTree() const;
};
//===-------------------------------------
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/contrib/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index f887517217e1..2d6081f3577d 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -85,9 +85,23 @@ public:
/// stack offsets of the object, eliminating all MO_FrameIndex operands from
/// the program.
///
-/// @brief Abstract Stack Frame Information
+/// Abstract Stack Frame Information
class MachineFrameInfo {
+public:
+ /// Stack Smashing Protection (SSP) rules require that vulnerable stack
+ /// allocations are located close to the stack protector.
+ enum SSPLayoutKind {
+ SSPLK_None, ///< Did not trigger a stack protector. No effect on data
+ ///< layout.
+ SSPLK_LargeArray, ///< Array or nested array >= SSP-buffer-size. Closest
+ ///< to the stack protector.
+ SSPLK_SmallArray, ///< Array or nested array < SSP-buffer-size. 2nd closest
+ ///< to the stack protector.
+ SSPLK_AddrOf ///< The address of this allocation is exposed and
+ ///< triggered protection. 3rd closest to the protector.
+ };
+private:
// Represent a single object allocated on the stack.
struct StackObject {
// The offset of this object from the stack pointer on entry to
@@ -123,6 +137,9 @@ class MachineFrameInfo {
/// necessarily reside in the same contiguous memory block as other stack
/// objects. Objects with differing stack IDs should not be merged or
/// substituted for each other.
+ //
+ /// It is assumed a target uses consecutive, increasing stack IDs starting
+ /// from 1.
uint8_t StackID;
/// If this stack object is originated from an Alloca instruction
@@ -145,12 +162,15 @@ class MachineFrameInfo {
/// If true, the object has been zero-extended.
bool isSExt = false;
+ uint8_t SSPLayout;
+
StackObject(uint64_t Size, unsigned Alignment, int64_t SPOffset,
bool IsImmutable, bool IsSpillSlot, const AllocaInst *Alloca,
bool IsAliased, uint8_t StackID = 0)
: SPOffset(SPOffset), Size(Size), Alignment(Alignment),
isImmutable(IsImmutable), isSpillSlot(IsSpillSlot),
- StackID(StackID), Alloca(Alloca), isAliased(IsAliased) {}
+ StackID(StackID), Alloca(Alloca), isAliased(IsAliased),
+ SSPLayout(SSPLK_None) {}
};
/// The alignment of the stack.
@@ -485,6 +505,20 @@ public:
Objects[ObjectIdx+NumFixedObjects].SPOffset = SPOffset;
}
+ SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const {
+ assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+ "Invalid Object Idx!");
+ return (SSPLayoutKind)Objects[ObjectIdx+NumFixedObjects].SSPLayout;
+ }
+
+ void setObjectSSPLayout(int ObjectIdx, SSPLayoutKind Kind) {
+ assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+ "Invalid Object Idx!");
+ assert(!isDeadObjectIndex(ObjectIdx) &&
+ "Setting SSP layout for a dead object?");
+ Objects[ObjectIdx+NumFixedObjects].SSPLayout = Kind;
+ }
+
/// Return the number of bytes that must be allocated to hold
/// all of the fixed size frame objects. This is only valid after
/// Prolog/Epilog code insertion has finalized the stack frame layout.
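[Editor's note] A minimal sketch of the new per-object SSP layout accessors declared above; the frame index is assumed to refer to a live (non-dead) stack object:

#include "llvm/CodeGen/MachineFrameInfo.h"
using namespace llvm;

static void tagLargeArray(MachineFrameInfo &MFI, int FrameIdx) {
  // Record that this object should sit closest to the stack protector.
  MFI.setObjectSSPLayout(FrameIdx, MachineFrameInfo::SSPLK_LargeArray);
  MachineFrameInfo::SSPLayoutKind K = MFI.getObjectSSPLayout(FrameIdx);
  (void)K;
}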
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineFunction.h b/contrib/llvm/include/llvm/CodeGen/MachineFunction.h
index 7d8b7ebe8d62..e8a4d529faac 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -73,6 +73,7 @@ class SlotIndexes;
class TargetMachine;
class TargetRegisterClass;
class TargetSubtargetInfo;
+struct WasmEHFuncInfo;
struct WinEHFuncInfo;
template <> struct ilist_alloc_traits<MachineBasicBlock> {
@@ -80,8 +81,8 @@ template <> struct ilist_alloc_traits<MachineBasicBlock> {
};
template <> struct ilist_callback_traits<MachineBasicBlock> {
- void addNodeToList(MachineBasicBlock* MBB);
- void removeNodeFromList(MachineBasicBlock* MBB);
+ void addNodeToList(MachineBasicBlock* N);
+ void removeNodeFromList(MachineBasicBlock* N);
template <class Iterator>
void transferNodesFromList(ilist_callback_traits &OldList, Iterator, Iterator) {
@@ -96,7 +97,7 @@ template <> struct ilist_callback_traits<MachineBasicBlock> {
struct MachineFunctionInfo {
virtual ~MachineFunctionInfo();
- /// \brief Factory function: default behavior is to call new using the
+ /// Factory function: default behavior is to call new using the
/// supplied allocator.
///
/// This function can be overridden in a derive class.
@@ -245,6 +246,10 @@ class MachineFunction {
// Keep track of jump tables for switch instructions
MachineJumpTableInfo *JumpTableInfo;
+ // Keeps track of Wasm exception handling related data. This will be null for
+ // functions that aren't using a wasm EH personality.
+ WasmEHFuncInfo *WasmEHInfo = nullptr;
+
// Keeps track of Windows exception handling related data. This will be null
// for functions that aren't using a funclet-based EH personality.
WinEHFuncInfo *WinEHInfo = nullptr;
@@ -319,6 +324,7 @@ class MachineFunction {
bool CallsEHReturn = false;
bool CallsUnwindInit = false;
+ bool HasEHScopes = false;
bool HasEHFunclets = false;
/// List of C++ TypeInfo used.
@@ -349,17 +355,18 @@ public:
struct VariableDbgInfo {
const DILocalVariable *Var;
const DIExpression *Expr;
- unsigned Slot;
+ // The Slot can be negative for fixed stack objects.
+ int Slot;
const DILocation *Loc;
VariableDbgInfo(const DILocalVariable *Var, const DIExpression *Expr,
- unsigned Slot, const DILocation *Loc)
+ int Slot, const DILocation *Loc)
: Var(Var), Expr(Expr), Slot(Slot), Loc(Loc) {}
};
using VariableDbgInfoMapTy = SmallVector<VariableDbgInfo, 4>;
VariableDbgInfoMapTy VariableDbgInfos;
- MachineFunction(const Function &F, const TargetMachine &TM,
+ MachineFunction(const Function &F, const TargetMachine &Target,
const TargetSubtargetInfo &STI, unsigned FunctionNum,
MachineModuleInfo &MMI);
MachineFunction(const MachineFunction &) = delete;
@@ -430,6 +437,12 @@ public:
MachineConstantPool *getConstantPool() { return ConstantPool; }
const MachineConstantPool *getConstantPool() const { return ConstantPool; }
+ /// getWasmEHFuncInfo - Return information about how the current function uses
+ /// Wasm exception handling. Returns null for functions that don't use wasm
+ /// exception handling.
+ const WasmEHFuncInfo *getWasmEHFuncInfo() const { return WasmEHInfo; }
+ WasmEHFuncInfo *getWasmEHFuncInfo() { return WasmEHInfo; }
+
/// getWinEHFuncInfo - Return information about how the current function uses
/// Windows exception handling. Returns null for functions that don't use
/// funclets for exception handling.
@@ -609,7 +622,7 @@ public:
//===--------------------------------------------------------------------===//
// Internal functions used to automatically number MachineBasicBlocks
- /// \brief Adds the MBB to the internal numbering. Returns the unique number
+ /// Adds the MBB to the internal numbering. Returns the unique number
/// assigned to the MBB.
unsigned addToMBBNumbering(MachineBasicBlock *MBB) {
MBBNumbering.push_back(MBB);
@@ -695,14 +708,8 @@ public:
OperandRecycler.deallocate(Cap, Array);
}
- /// \brief Allocate and initialize a register mask with @p NumRegister bits.
- uint32_t *allocateRegisterMask(unsigned NumRegister) {
- unsigned Size = (NumRegister + 31) / 32;
- uint32_t *Mask = Allocator.Allocate<uint32_t>(Size);
- for (unsigned i = 0; i != Size; ++i)
- Mask[i] = 0;
- return Mask;
- }
+ /// Allocate and initialize a register mask with @p NumRegister bits.
+ uint32_t *allocateRegMask();
/// allocateMemRefsArray - Allocate an array to hold MachineMemOperand
/// pointers. This array is owned by the MachineFunction.
@@ -759,6 +766,9 @@ public:
bool callsUnwindInit() const { return CallsUnwindInit; }
void setCallsUnwindInit(bool b) { CallsUnwindInit = b; }
+ bool hasEHScopes() const { return HasEHScopes; }
+ void setHasEHScopes(bool V) { HasEHScopes = V; }
+
bool hasEHFunclets() const { return HasEHFunclets; }
void setHasEHFunclets(bool V) { HasEHFunclets = V; }
@@ -793,7 +803,7 @@ public:
void addCleanup(MachineBasicBlock *LandingPad);
void addSEHCatchHandler(MachineBasicBlock *LandingPad, const Function *Filter,
- const BlockAddress *RecoverLabel);
+ const BlockAddress *RecoverBA);
void addSEHCleanupHandler(MachineBasicBlock *LandingPad,
const Function *Cleanup);
@@ -860,7 +870,7 @@ public:
/// Collect information used to emit debugging information of a variable.
void setVariableDbgInfo(const DILocalVariable *Var, const DIExpression *Expr,
- unsigned Slot, const DILocation *Loc) {
+ int Slot, const DILocation *Loc) {
VariableDbgInfos.emplace_back(Var, Expr, Slot, Loc);
}
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineInstr.h b/contrib/llvm/include/llvm/CodeGen/MachineInstr.h
index 3c1c1bb14f42..88e13cdf4138 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -80,7 +80,21 @@ public:
FrameDestroy = 1 << 1, // Instruction is used as a part of
// function frame destruction code.
BundledPred = 1 << 2, // Instruction has bundled predecessors.
- BundledSucc = 1 << 3 // Instruction has bundled successors.
+ BundledSucc = 1 << 3, // Instruction has bundled successors.
+ FmNoNans = 1 << 4, // Instruction does not support Fast
+ // math nan values.
+ FmNoInfs = 1 << 5, // Instruction does not support Fast
+ // math infinity values.
+ FmNsz = 1 << 6, // Instruction is not required to retain
+ // signed zero values.
+ FmArcp = 1 << 7, // Instruction supports Fast math
+ // reciprocal approximations.
+ FmContract = 1 << 8, // Instruction supports Fast math
+ // contraction operations like fma.
+ FmAfn = 1 << 9, // Instruction may map to Fast math
+ // intrinsic approximation.
+ FmReassoc = 1 << 10 // Instruction supports Fast math
+ // reassociation of operand order.
};
private:
@@ -93,7 +107,7 @@ private:
using OperandCapacity = ArrayRecycler<MachineOperand>::Capacity;
OperandCapacity CapOperands; // Capacity of the Operands array.
- uint8_t Flags = 0; // Various bits of additional
+ uint16_t Flags = 0; // Various bits of additional
// information about machine
// instruction.
@@ -127,7 +141,7 @@ private:
/// This constructor create a MachineInstr and add the implicit operands.
/// It reserves space for number of operands specified by
/// MCInstrDesc. An explicit DebugLoc is supplied.
- MachineInstr(MachineFunction &, const MCInstrDesc &MCID, DebugLoc dl,
+ MachineInstr(MachineFunction &, const MCInstrDesc &tid, DebugLoc dl,
bool NoImp = false);
// MachineInstrs are pool-allocated and owned by MachineFunction.
@@ -175,7 +189,7 @@ public:
}
/// Return the MI flags bitvector.
- uint8_t getFlags() const {
+ uint16_t getFlags() const {
return Flags;
}
@@ -186,7 +200,7 @@ public:
/// Set a MI flag.
void setFlag(MIFlag Flag) {
- Flags |= (uint8_t)Flag;
+ Flags |= (uint16_t)Flag;
}
void setFlags(unsigned flags) {
@@ -197,7 +211,7 @@ public:
/// clearFlag - Clear a MI flag.
void clearFlag(MIFlag Flag) {
- Flags &= ~((uint8_t)Flag);
+ Flags &= ~((uint16_t)Flag);
}
/// Return true if MI is in a bundle (but not the first MI in a bundle).
@@ -278,6 +292,10 @@ public:
/// this DBG_VALUE instruction.
const DIExpression *getDebugExpression() const;
+ /// Return the debug label referenced by
+ /// this DBG_LABEL instruction.
+ const DILabel *getDebugLabel() const;
+
/// Emit an error referring to the source location of this instruction.
/// This should only be used for inline assembly that is somehow
/// impossible to compile. Other errors should have been handled much
@@ -304,6 +322,11 @@ public:
return Operands[i];
}
+ /// Returns the total number of definitions.
+ unsigned getNumDefs() const {
+ return getNumExplicitDefs() + MCID->getNumImplicitDefs();
+ }
+
/// Return true if operand \p OpIdx is a subregister index.
bool isOperandSubregIdx(unsigned OpIdx) const {
assert(getOperand(OpIdx).getType() == MachineOperand::MO_Immediate &&
@@ -322,6 +345,9 @@ public:
/// Returns the number of non-implicit operands.
unsigned getNumExplicitOperands() const;
+ /// Returns the number of non-implicit definitions.
+ unsigned getNumExplicitDefs() const;
+
/// iterator/begin/end - Iterate over all operands of a machine instruction.
using mop_iterator = MachineOperand *;
using const_mop_iterator = const MachineOperand *;
@@ -356,31 +382,29 @@ public:
/// Implicit definition are not included!
iterator_range<mop_iterator> defs() {
return make_range(operands_begin(),
- operands_begin() + getDesc().getNumDefs());
+ operands_begin() + getNumExplicitDefs());
}
/// \copydoc defs()
iterator_range<const_mop_iterator> defs() const {
return make_range(operands_begin(),
- operands_begin() + getDesc().getNumDefs());
+ operands_begin() + getNumExplicitDefs());
}
/// Returns a range that includes all operands that are register uses.
/// This may include unrelated operands which are not register uses.
iterator_range<mop_iterator> uses() {
- return make_range(operands_begin() + getDesc().getNumDefs(),
- operands_end());
+ return make_range(operands_begin() + getNumExplicitDefs(), operands_end());
}
/// \copydoc uses()
iterator_range<const_mop_iterator> uses() const {
- return make_range(operands_begin() + getDesc().getNumDefs(),
- operands_end());
+ return make_range(operands_begin() + getNumExplicitDefs(), operands_end());
}
iterator_range<mop_iterator> explicit_uses() {
- return make_range(operands_begin() + getDesc().getNumDefs(),
- operands_begin() + getNumExplicitOperands() );
+ return make_range(operands_begin() + getNumExplicitDefs(),
+ operands_begin() + getNumExplicitOperands());
}
iterator_range<const_mop_iterator> explicit_uses() const {
- return make_range(operands_begin() + getDesc().getNumDefs(),
- operands_begin() + getNumExplicitOperands() );
+ return make_range(operands_begin() + getNumExplicitDefs(),
+ operands_begin() + getNumExplicitOperands());
}
/// Returns the number of the operand iterator \p I points to.
@@ -391,7 +415,7 @@ public:
/// Access to memory operands of the instruction
mmo_iterator memoperands_begin() const { return MemRefs; }
mmo_iterator memoperands_end() const { return MemRefs + NumMemRefs; }
- /// Return true if we don't have any memory operands which described the the
+ /// Return true if we don't have any memory operands which described the
/// memory access done by this instruction. If this is true, calling code
/// must be conservative.
bool memoperands_empty() const { return NumMemRefs == 0; }
@@ -529,6 +553,12 @@ public:
return hasProperty(MCID::MoveImm, Type);
}
+ /// Return true if this instruction is a register move.
+ /// (including moving values from subreg to reg)
+ bool isMoveReg(QueryType Type = IgnoreBundle) const {
+ return hasProperty(MCID::MoveReg, Type);
+ }
+
/// Return true if this instruction is a bitcast instruction.
bool isBitcast(QueryType Type = IgnoreBundle) const {
return hasProperty(MCID::Bitcast, Type);
@@ -576,7 +606,7 @@ public:
return hasProperty(MCID::FoldableAsLoad, Type);
}
- /// \brief Return true if this instruction behaves
+ /// Return true if this instruction behaves
/// the same way as the generic REG_SEQUENCE instructions.
/// E.g., on ARM,
/// dX VMOVDRR rY, rZ
@@ -590,7 +620,7 @@ public:
return hasProperty(MCID::RegSequence, Type);
}
- /// \brief Return true if this instruction behaves
+ /// Return true if this instruction behaves
/// the same way as the generic EXTRACT_SUBREG instructions.
/// E.g., on ARM,
/// rX, rY VMOVRRD dZ
@@ -605,7 +635,7 @@ public:
return hasProperty(MCID::ExtractSubreg, Type);
}
- /// \brief Return true if this instruction behaves
+ /// Return true if this instruction behaves
/// the same way as the generic INSERT_SUBREG instructions.
/// E.g., on ARM,
/// dX = VSETLNi32 dY, rZ, Imm
@@ -817,6 +847,8 @@ public:
bool isPosition() const { return isLabel() || isCFIInstruction(); }
bool isDebugValue() const { return getOpcode() == TargetOpcode::DBG_VALUE; }
+ bool isDebugLabel() const { return getOpcode() == TargetOpcode::DBG_LABEL; }
+ bool isDebugInstr() const { return isDebugValue() || isDebugLabel(); }
/// A DBG_VALUE is indirect iff the first operand is a register and
/// the second operand is an immediate.
@@ -893,6 +925,9 @@ public:
case TargetOpcode::EH_LABEL:
case TargetOpcode::GC_LABEL:
case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::DBG_LABEL:
+ case TargetOpcode::LIFETIME_START:
+ case TargetOpcode::LIFETIME_END:
return true;
}
}
@@ -1047,7 +1082,7 @@ public:
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) const;
- /// \brief Applies the constraints (def/use) implied by this MI on \p Reg to
+ /// Applies the constraints (def/use) implied by this MI on \p Reg to
/// the given \p CurRC.
/// If \p ExploreBundle is set and MI is part of a bundle, all the
/// instructions inside the bundle will be taken into account. In other words,
@@ -1064,7 +1099,7 @@ public:
const TargetInstrInfo *TII, const TargetRegisterInfo *TRI,
bool ExploreBundle = false) const;
- /// \brief Applies the constraints (def/use) implied by the \p OpIdx operand
+ /// Applies the constraints (def/use) implied by the \p OpIdx operand
/// to the given \p CurRC.
///
/// Returns the register class that satisfies both \p CurRC and the
@@ -1233,15 +1268,20 @@ public:
bool hasComplexRegisterTies() const;
/// Print this MI to \p OS.
+ /// Don't print information that can be inferred from other instructions if
+ /// \p IsStandalone is false. It is usually true when only a fragment of the
+ /// function is printed.
/// Only print the defs and the opcode if \p SkipOpers is true.
/// Otherwise, also print operands if \p SkipDebugLoc is true.
/// Otherwise, also print the debug loc, with a terminating newline.
/// \p TII is used to print the opcode name. If it's not present, but the
/// MI is in a function, the opcode will be printed using the function's TII.
- void print(raw_ostream &OS, bool SkipOpers = false, bool SkipDebugLoc = false,
+ void print(raw_ostream &OS, bool IsStandalone = true, bool SkipOpers = false,
+ bool SkipDebugLoc = false, bool AddNewLine = true,
const TargetInstrInfo *TII = nullptr) const;
- void print(raw_ostream &OS, ModuleSlotTracker &MST, bool SkipOpers = false,
- bool SkipDebugLoc = false,
+ void print(raw_ostream &OS, ModuleSlotTracker &MST, bool IsStandalone = true,
+ bool SkipOpers = false, bool SkipDebugLoc = false,
+ bool AddNewLine = true,
const TargetInstrInfo *TII = nullptr) const;
void dump() const;
/// @}
@@ -1281,7 +1321,7 @@ public:
/// Erase an operand from an instruction, leaving it with one
/// fewer operand than it started with.
- void RemoveOperand(unsigned i);
+ void RemoveOperand(unsigned OpNo);
/// Add a MachineMemOperand to the machine instruction.
/// This function should be used only occasionally. The setMemRefs function
@@ -1311,6 +1351,11 @@ public:
/// modify the memrefs of the this MachineInstr.
std::pair<mmo_iterator, unsigned> mergeMemRefsWith(const MachineInstr& Other);
+ /// Return the MIFlags which represent both MachineInstrs. This
+ /// should be used when merging two MachineInstrs into one. This routine does
+ /// not modify the MIFlags of this MachineInstr.
+ uint16_t mergeFlagsWith(const MachineInstr& Other) const;
+
/// Clear this MachineInstr's memory reference descriptor list. This resets
/// the memrefs to their most conservative state. This should be used only
/// as a last resort since it greatly pessimizes our knowledge of the memory
@@ -1351,7 +1396,7 @@ private:
/// Slow path for hasProperty when we're dealing with a bundle.
bool hasPropertyInBundle(unsigned Mask, QueryType Type) const;
- /// \brief Implements the logic of getRegClassConstraintEffectForVReg for the
+ /// Implements the logic of getRegClassConstraintEffectForVReg for the
/// this MI and the given operand index \p OpIdx.
/// If the related operand does not constrain Reg, this returns CurRC.
const TargetRegisterClass *getRegClassConstraintEffectForVRegImpl(
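[Editor's note] The MIFlag additions widen the flag word to 16 bits and introduce per-instruction fast-math flags. A minimal sketch of setting and reading them through the flag interface shown above; the instruction is assumed to come from the caller:

#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

static void markFastMath(MachineInstr &MI) {
  MI.setFlag(MachineInstr::FmContract); // allow fma-style contraction
  MI.setFlag(MachineInstr::FmReassoc);  // allow reassociation of operands
  uint16_t Flags = MI.getFlags();       // flag storage is now 16 bits wide
  (void)Flags;
}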
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/contrib/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index e4f3976ec950..665608755741 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -20,6 +20,7 @@
#define LLVM_CODEGEN_MACHINEINSTRBUILDER_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -219,6 +220,9 @@ public:
assert((MI->isDebugValue() ? static_cast<bool>(MI->getDebugVariable())
: true) &&
"first MDNode argument of a DBG_VALUE not a variable");
+ assert((MI->isDebugLabel() ? static_cast<bool>(MI->getDebugLabel())
+ : true) &&
+ "first MDNode argument of a DBG_LABEL not a label");
return *this;
}
@@ -283,6 +287,12 @@ public:
MI->copyImplicitOps(*MF, OtherMI);
return *this;
}
+
+ bool constrainAllUses(const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) const {
+ return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
+ }
};
/// Builder interface. Specify how to create the initial instruction itself.
@@ -408,6 +418,13 @@ MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL,
const MDNode *Expr);
/// This version of the builder builds a DBG_VALUE intrinsic
+/// for a MachineOperand.
+MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL,
+ const MCInstrDesc &MCID, bool IsIndirect,
+ MachineOperand &MO, const MDNode *Variable,
+ const MDNode *Expr);
+
+/// This version of the builder builds a DBG_VALUE intrinsic
/// for either a value in a register or a register-indirect
/// address and inserts it at position I.
MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
@@ -416,6 +433,14 @@ MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
unsigned Reg, const MDNode *Variable,
const MDNode *Expr);
+/// This version of the builder builds a DBG_VALUE intrinsic
+/// for a machine operand and inserts it at position I.
+MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ const MCInstrDesc &MCID, bool IsIndirect,
+ MachineOperand &MO, const MDNode *Variable,
+ const MDNode *Expr);
+
/// Clone a DBG_VALUE whose value has been spilled to FrameIndex.
MachineInstr *buildDbgValueForSpill(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
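[Editor's note] A hedged sketch of the new BuildMI() overload that builds a DBG_VALUE from an arbitrary MachineOperand; TII, the variable, the expression, and the header path for TargetOpcode are assumptions of this illustration, not part of the patch:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

static MachineInstrBuilder emitDbgValueFor(MachineFunction &MF,
                                           const DebugLoc &DL,
                                           const TargetInstrInfo &TII,
                                           MachineOperand &MO,
                                           const MDNode *Variable,
                                           const MDNode *Expr) {
  // Uses the MachineOperand-taking overload declared in the hunk above.
  return BuildMI(MF, DL, TII.get(TargetOpcode::DBG_VALUE),
                 /*IsIndirect=*/false, MO, Variable, Expr);
}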
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/contrib/llvm/include/llvm/CodeGen/MachineLoopInfo.h
index 104655e45524..917fb90380f5 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -54,7 +54,7 @@ public:
/// that contains the header.
MachineBasicBlock *getBottomBlock();
- /// \brief Find the block that contains the loop control variable and the
+ /// Find the block that contains the loop control variable and the
/// loop test. This will return the latch block if it's one of the exiting
/// blocks. Otherwise, return the exiting block. Return 'null' when
/// multiple exiting blocks are present.
@@ -97,7 +97,7 @@ public:
LoopInfoBase<MachineBasicBlock, MachineLoop>& getBase() { return LI; }
- /// \brief Find the block that either is the loop preheader, or could
+ /// Find the block that either is the loop preheader, or could
/// speculatively be used as the preheader. This is e.g. useful to place
/// loop setup code. Code that cannot be speculated should not be placed
/// here. SpeculativePreheader is controlling whether it also tries to
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineMemOperand.h b/contrib/llvm/include/llvm/CodeGen/MachineMemOperand.h
index c5b204a79f04..078ef7ca510c 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -184,7 +184,7 @@ public:
/// atomic operations the atomic ordering requirements when store does not
/// occur must also be specified.
MachineMemOperand(MachinePointerInfo PtrInfo, Flags flags, uint64_t s,
- unsigned base_alignment,
+ uint64_t a,
const AAMDNodes &AAInfo = AAMDNodes(),
const MDNode *Ranges = nullptr,
SyncScope::ID SSID = SyncScope::System,
@@ -295,6 +295,9 @@ public:
/// @{
void print(raw_ostream &OS) const;
void print(raw_ostream &OS, ModuleSlotTracker &MST) const;
+ void print(raw_ostream &OS, ModuleSlotTracker &MST,
+ SmallVectorImpl<StringRef> &SSNs, const LLVMContext &Context,
+ const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const;
/// @}
friend bool operator==(const MachineMemOperand &LHS,
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineOperand.h b/contrib/llvm/include/llvm/CodeGen/MachineOperand.h
index 4be7942c2c64..53e8889d118a 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineOperand.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineOperand.h
@@ -74,7 +74,7 @@ public:
private:
/// OpKind - Specify what kind of operand this is. This discriminates the
/// union.
- MachineOperandType OpKind : 8;
+ unsigned OpKind : 8;
/// Subregister number for MO_Register. A value of 0 indicates the
/// MO_Register has no subReg.
@@ -85,17 +85,17 @@ private:
/// TiedTo - Non-zero when this register operand is tied to another register
/// operand. The encoding of this field is described in the block comment
/// before MachineInstr::tieOperands().
- unsigned char TiedTo : 4;
+ unsigned TiedTo : 4;
/// IsDef - True if this is a def, false if this is a use of the register.
/// This is only valid on register operands.
///
- bool IsDef : 1;
+ unsigned IsDef : 1;
/// IsImp - True if this is an implicit def or use, false if it is explicit.
/// This is only valid on register opderands.
///
- bool IsImp : 1;
+ unsigned IsImp : 1;
/// IsDeadOrKill
/// For uses: IsKill - True if this instruction is the last use of the
@@ -103,14 +103,10 @@ private:
/// For defs: IsDead - True if this register is never used by a subsequent
/// instruction.
/// This is only valid on register operands.
- bool IsDeadOrKill : 1;
+ unsigned IsDeadOrKill : 1;
- /// IsRenamable - True if this register may be renamed, i.e. it does not
- /// generate a value that is somehow read in a way that is not represented by
- /// the Machine IR (e.g. to meet an ABI or ISA requirement). This is only
- /// valid on physical register operands. Virtual registers are assumed to
- /// always be renamable regardless of the value of this field.
- bool IsRenamable : 1;
+ /// See isRenamable().
+ unsigned IsRenamable : 1;
/// IsUndef - True if this register operand reads an "undef" value, i.e. the
/// read value doesn't matter. This flag can be set on both use and def
@@ -129,7 +125,7 @@ private:
/// Any register can be used for %2, and its value doesn't matter, but
/// the two operands must be the same register.
///
- bool IsUndef : 1;
+ unsigned IsUndef : 1;
/// IsInternalRead - True if this operand reads a value that was defined
/// inside the same instruction or bundle. This flag can be set on both use
@@ -140,16 +136,16 @@ private:
/// When this flag is set, the instruction bundle must contain at least one
/// other def of the register. If multiple instructions in the bundle define
/// the register, the meaning is target-defined.
- bool IsInternalRead : 1;
+ unsigned IsInternalRead : 1;
/// IsEarlyClobber - True if this MO_Register 'def' operand is written to
/// by the MachineInstr before all input registers are read. This is used to
/// model the GCC inline asm '&' constraint modifier.
- bool IsEarlyClobber : 1;
+ unsigned IsEarlyClobber : 1;
/// IsDebug - True if this MO_Register 'use' operand is in a debug pseudo,
/// not a real instruction. Such uses should be ignored during codegen.
- bool IsDebug : 1;
+ unsigned IsDebug : 1;
/// SmallContents - This really should be part of the Contents union, but
/// lives out here so we can get a better packed struct.
@@ -198,7 +194,19 @@ private:
} Contents;
explicit MachineOperand(MachineOperandType K)
- : OpKind(K), SubReg_TargetFlags(0), ParentMI(nullptr) {}
+ : OpKind(K), SubReg_TargetFlags(0), ParentMI(nullptr) {
+ // Assert that the layout is what we expect. It's easy to grow this object.
+ static_assert(alignof(MachineOperand) <= alignof(int64_t),
+ "MachineOperand shouldn't be more than 8 byte aligned");
+ static_assert(sizeof(Contents) <= 2 * sizeof(void *),
+ "Contents should be at most two pointers");
+ static_assert(sizeof(MachineOperand) <=
+ alignTo<alignof(int64_t)>(2 * sizeof(unsigned) +
+ 3 * sizeof(void *)),
+ "MachineOperand too big. Should be Kind, SmallContents, "
+ "ParentMI, and Contents");
+ }
+
public:
/// getType - Returns the MachineOperandType for this operand.
///
@@ -238,7 +246,7 @@ public:
/// MO_Immediate operands can also be subreg indices. If it's the case, the
/// subreg index name will be printed. MachineInstr::isOperandSubregIdx can be
/// called to check this.
- static void printSubregIdx(raw_ostream &OS, uint64_t Index,
+ static void printSubRegIdx(raw_ostream &OS, uint64_t Index,
const TargetRegisterInfo *TRI);
/// Print operand target flags.
@@ -270,6 +278,9 @@ public:
/// \param PrintDef - whether we want to print `def` on an operand which
/// isDef. Sometimes, if the operand is printed before '=', we don't print
/// `def`.
+ /// \param IsStandalone - whether we want a verbose output of the MO. This
+ /// prints extra information that can be easily inferred when printing the
+ /// whole function, but not when printing only a fragment of it.
/// \param ShouldPrintRegisterTies - whether we want to print register ties.
/// Sometimes they are easily determined by the instruction's descriptor
/// (MachineInstr::hasComplexRegiterTies can determine if it's needed).
@@ -280,10 +291,16 @@ public:
/// information from it's parent.
/// \param IntrinsicInfo - same as \p TRI.
void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint,
- bool PrintDef, bool ShouldPrintRegisterTies,
+ bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies,
unsigned TiedOperandIdx, const TargetRegisterInfo *TRI,
const TargetIntrinsicInfo *IntrinsicInfo) const;
+ /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level
+ /// type to be printed the same way the full version of print(...) does it.
+ void print(raw_ostream &os, LLT TypeToPrint,
+ const TargetRegisterInfo *TRI = nullptr,
+ const TargetIntrinsicInfo *IntrinsicInfo = nullptr) const;
+
void dump() const;
//===--------------------------------------------------------------------===//
@@ -369,6 +386,35 @@ public:
return IsUndef;
}
+ /// isRenamable - Returns true if this register may be renamed, i.e. it does
+ /// not generate a value that is somehow read in a way that is not represented
+ /// by the Machine IR (e.g. to meet an ABI or ISA requirement). This is only
+ /// valid on physical register operands. Virtual registers are assumed to
+ /// always be renamable regardless of the value of this field.
+ ///
+ /// Operands that are renamable can freely be changed to any other register
+ /// that is a member of the register class returned by
+ /// MI->getRegClassConstraint().
+ ///
+ /// isRenamable can return false for several different reasons:
+ ///
+ /// - ABI constraints (since liveness is not always precisely modeled). We
+ /// conservatively handle these cases by setting all physical register
+ /// operands that didn't start out as virtual regs to not be renamable.
+ /// Also any physical register operands created after register allocation or
+ /// whose register is changed after register allocation will not be
+ /// renamable. This state is tracked in the MachineOperand::IsRenamable
+ /// bit.
+ ///
+ /// - Opcode/target constraints: for opcodes that have complex register class
+ /// requirements (e.g. that depend on other operands/instructions), we set
+ /// hasExtraSrcRegAllocReq/hasExtraDstRegAllocReq in the machine opcode
+ /// description. Operands belonging to instructions with opcodes that are
+ /// marked hasExtraSrcRegAllocReq/hasExtraDstRegAllocReq return false from
+ /// isRenamable(). Additionally, the AllowRegisterRenaming target property
+ /// prevents any operands from being marked renamable for targets that don't
+ /// have detailed opcode hasExtraSrcRegAllocReq/hasExtraDstRegAllocReq
+ /// values.
bool isRenamable() const;
bool isInternalRead() const {
@@ -458,10 +504,6 @@ public:
void setIsRenamable(bool Val = true);
- /// Set IsRenamable to true if there are no extra register allocation
- /// requirements placed on this operand by the parent instruction's opcode.
- void setIsRenamableIfNoExtraRegAllocReq();
-
void setIsInternalRead(bool Val = true) {
assert(isReg() && "Wrong MachineOperand mutator");
IsInternalRead = Val;
@@ -574,6 +616,11 @@ public:
return Contents.RegMask;
}
+ /// Returns number of elements needed for a regmask array.
+ static unsigned getRegMaskSize(unsigned NumRegs) {
+ return (NumRegs + 31) / 32;
+ }
+
/// getRegLiveOut - Returns a bit mask of live-out registers.
const uint32_t *getRegLiveOut() const {
assert(isRegLiveOut() && "Wrong MachineOperand accessor");
@@ -594,6 +641,11 @@ public:
Contents.ImmVal = immVal;
}
+ void setCImm(const ConstantInt *CI) {
+ assert(isCImm() && "Wrong MachineOperand mutator");
+ Contents.CI = CI;
+ }
+
void setFPImm(const ConstantFP *CFP) {
assert(isFPImm() && "Wrong MachineOperand mutator");
Contents.CFP = CFP;
@@ -641,7 +693,7 @@ public:
/// should stay in sync with the hash_value overload below.
bool isIdenticalTo(const MachineOperand &Other) const;
- /// \brief MachineOperand hash_value overload.
+ /// MachineOperand hash_value overload.
///
/// Note that this includes the same information in the hash that
/// isIdenticalTo uses for comparison. It is thus suited for use in hash
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h b/contrib/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
index 2fdefbed37ce..a7ce870400c2 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
@@ -24,7 +24,7 @@ class MachineBasicBlock;
class MachineBlockFrequencyInfo;
class MachineInstr;
-/// \brief Common features for diagnostics dealing with optimization remarks
+/// Common features for diagnostics dealing with optimization remarks
/// that are used by machine passes.
class DiagnosticInfoMIROptimization : public DiagnosticInfoOptimizationBase {
public:
@@ -151,7 +151,7 @@ public:
/// Emit an optimization remark.
void emit(DiagnosticInfoOptimizationBase &OptDiag);
- /// \brief Whether we allow for extra compile-time budget to perform more
+ /// Whether we allow for extra compile-time budget to perform more
/// analysis to be more informative.
///
/// This is useful to enable additional missed optimizations to be reported
@@ -164,7 +164,7 @@ public:
.getDiagHandlerPtr()->isAnyRemarkEnabled(PassName));
}
- /// \brief Take a lambda that returns a remark which will be emitted. Second
+ /// Take a lambda that returns a remark which will be emitted. Second
/// argument is only used to restrict this to functions.
template <typename T>
void emit(T RemarkBuilder, decltype(RemarkBuilder()) * = nullptr) {
@@ -192,7 +192,7 @@ private:
/// Similar but use value from \p OptDiag and update hotness there.
void computeHotness(DiagnosticInfoMIROptimization &Remark);
- /// \brief Only allow verbose messages if we know we're filtering by hotness
+ /// Only allow verbose messages if we know we're filtering by hotness
/// (BFI is only set in this case).
bool shouldEmitVerbose() { return MBFI != nullptr; }
};
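A minimal sketch of the lambda-based emit described above, assuming an in-scope emitter ORE and a MachineInstr MI; the pass and remark names are illustrative only:

    ORE.emit([&]() {
      return MachineOptimizationRemarkMissed("my-pass", "NotHoisted",
                                             MI.getDebugLoc(), MI.getParent())
             << "instruction was not hoisted";
    });

The lambda keeps the cost of building the remark off the hot path: it is only invoked when remarks for the pass are actually enabled.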
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineOutliner.h b/contrib/llvm/include/llvm/CodeGen/MachineOutliner.h
new file mode 100644
index 000000000000..4249a99a891b
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -0,0 +1,226 @@
+//===---- MachineOutliner.h - Outliner data structures ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Contains all data structures shared between the outliner implemented in
+/// MachineOutliner.cpp and target implementations of the outliner.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MACHINEOUTLINER_H
+#define LLVM_MACHINEOUTLINER_H
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+namespace llvm {
+namespace outliner {
+
+/// Represents how an instruction should be mapped by the outliner.
+/// \p Legal instructions are those which are safe to outline.
+/// \p LegalTerminator instructions are safe to outline, but only as the
+/// last instruction in a sequence.
+/// \p Illegal instructions are those which cannot be outlined.
+/// \p Invisible instructions are instructions which can be outlined, but
+/// shouldn't actually impact the outlining result.
+enum InstrType { Legal, LegalTerminator, Illegal, Invisible };
+
+/// An individual sequence of instructions to be replaced with a call to
+/// an outlined function.
+struct Candidate {
+private:
+ /// The start index of this \p Candidate in the instruction list.
+ unsigned StartIdx;
+
+ /// The number of instructions in this \p Candidate.
+ unsigned Len;
+
+ // The first instruction in this \p Candidate.
+ MachineBasicBlock::iterator FirstInst;
+
+ // The last instruction in this \p Candidate.
+ MachineBasicBlock::iterator LastInst;
+
+ // The basic block that contains this Candidate.
+ MachineBasicBlock *MBB;
+
+ /// Cost of calling an outlined function from this point as defined by the
+ /// target.
+ unsigned CallOverhead;
+
+public:
+ /// The index of this \p Candidate's \p OutlinedFunction in the list of
+ /// \p OutlinedFunctions.
+ unsigned FunctionIdx;
+
+ /// Set to false if the candidate overlapped with another candidate.
+ bool InCandidateList = true;
+
+ /// Identifier denoting the instructions to emit to call an outlined function
+ /// from this point. Defined by the target.
+ unsigned CallConstructionID;
+
+ /// Contains physical register liveness information for the MBB containing
+ /// this \p Candidate.
+ ///
+ /// This is optionally used by the target to calculate more fine-grained
+ /// cost model information.
+ LiveRegUnits LRU;
+
+ /// Return the number of instructions in this Candidate.
+ unsigned getLength() const { return Len; }
+
+ /// Return the start index of this candidate.
+ unsigned getStartIdx() const { return StartIdx; }
+
+ /// Return the end index of this candidate.
+ unsigned getEndIdx() const { return StartIdx + Len - 1; }
+
+ /// Set the CallConstructionID and CallOverhead of this candidate to CID and
+ /// CO respectively.
+ void setCallInfo(unsigned CID, unsigned CO) {
+ CallConstructionID = CID;
+ CallOverhead = CO;
+ }
+
+ /// Returns the call overhead of this candidate if it is in the list.
+ unsigned getCallOverhead() const {
+ return InCandidateList ? CallOverhead : 0;
+ }
+
+ MachineBasicBlock::iterator &front() { return FirstInst; }
+ MachineBasicBlock::iterator &back() { return LastInst; }
+ MachineFunction *getMF() const { return MBB->getParent(); }
+ MachineBasicBlock *getMBB() const { return MBB; }
+
+ /// The number of bytes that would be saved by outlining every
+ /// candidate of this type.
+ ///
+ /// This is a fixed value which is not updated during the candidate pruning
+ /// process. It is only used for deciding which candidate to keep if two
+ /// candidates overlap. The true benefit is stored in the OutlinedFunction
+ /// for some given candidate.
+ unsigned Benefit = 0;
+
+ Candidate(unsigned StartIdx, unsigned Len,
+ MachineBasicBlock::iterator &FirstInst,
+ MachineBasicBlock::iterator &LastInst, MachineBasicBlock *MBB,
+ unsigned FunctionIdx)
+ : StartIdx(StartIdx), Len(Len), FirstInst(FirstInst), LastInst(LastInst),
+ MBB(MBB), FunctionIdx(FunctionIdx) {}
+ Candidate() {}
+
+ /// Used to ensure that \p Candidates are outlined in an order that
+ /// preserves the start and end indices of other \p Candidates.
+ bool operator<(const Candidate &RHS) const {
+ return getStartIdx() > RHS.getStartIdx();
+ }
+
+ /// Compute the registers that are live across this Candidate.
+ /// Used by targets that need this information for cost model calculation.
+ /// If a target does not need this information, then this should not be
+ /// called.
+ void initLRU(const TargetRegisterInfo &TRI) {
+ assert(MBB->getParent()->getRegInfo().tracksLiveness() &&
+ "Candidate's Machine Function must track liveness");
+ LRU.init(TRI);
+ LRU.addLiveOuts(*MBB);
+
+ // Compute liveness from the end of the block up to the beginning of the
+ // outlining candidate.
+ std::for_each(MBB->rbegin(), (MachineBasicBlock::reverse_iterator)front(),
+ [this](MachineInstr &MI) { LRU.stepBackward(MI); });
+ }
+};
+
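A target that wants this liveness information could use a candidate roughly as follows; ScratchReg is a placeholder physical register and TRI an in-scope TargetRegisterInfo, so this is a sketch rather than required usage:

    C.initLRU(TRI);                      // requires liveness tracking in the MachineFunction
    if (C.LRU.available(ScratchReg)) {
      // A cheaper call sequence that clobbers ScratchReg could be selected here.
    }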
+/// The information necessary to create an outlined function for some
+/// class of candidate.
+struct OutlinedFunction {
+
+private:
+ /// The number of candidates for this \p OutlinedFunction.
+ unsigned OccurrenceCount = 0;
+
+public:
+ std::vector<std::shared_ptr<Candidate>> Candidates;
+
+ /// The actual outlined function created.
+ /// This is initialized after we go through and create the actual function.
+ MachineFunction *MF = nullptr;
+
+ /// A number assigned to this function which appears at the end of its name.
+ unsigned Name;
+
+ /// The sequence of integers corresponding to the instructions in this
+ /// function.
+ std::vector<unsigned> Sequence;
+
+ /// Represents the size of a sequence in bytes. (Some instructions vary
+ /// widely in size, so just counting the instructions isn't very useful.)
+ unsigned SequenceSize;
+
+ /// Target-defined overhead of constructing a frame for this function.
+ unsigned FrameOverhead;
+
+ /// Target-defined identifier for constructing a frame for this function.
+ unsigned FrameConstructionID;
+
+ /// Return the number of candidates for this \p OutlinedFunction.
+ unsigned getOccurrenceCount() { return OccurrenceCount; }
+
+ /// Decrement the occurrence count of this OutlinedFunction and return the
+ /// new count.
+ unsigned decrement() {
+ assert(OccurrenceCount > 0 && "Can't decrement an empty function!");
+ OccurrenceCount--;
+ return getOccurrenceCount();
+ }
+
+ /// Return the number of bytes it would take to outline this
+ /// function.
+ unsigned getOutliningCost() {
+ unsigned CallOverhead = 0;
+ for (std::shared_ptr<Candidate> &C : Candidates)
+ CallOverhead += C->getCallOverhead();
+ return CallOverhead + SequenceSize + FrameOverhead;
+ }
+
+ /// Return the size in bytes of the unoutlined sequences.
+ unsigned getNotOutlinedCost() { return OccurrenceCount * SequenceSize; }
+
+ /// Return the number of bytes that would be saved by outlining
+ /// this function.
+ unsigned getBenefit() {
+ unsigned NotOutlinedCost = getNotOutlinedCost();
+ unsigned OutlinedCost = getOutliningCost();
+ return (NotOutlinedCost < OutlinedCost) ? 0
+ : NotOutlinedCost - OutlinedCost;
+ }
+
+ OutlinedFunction(std::vector<Candidate> &Cands,
+ unsigned SequenceSize, unsigned FrameOverhead,
+ unsigned FrameConstructionID)
+ : SequenceSize(SequenceSize), FrameOverhead(FrameOverhead),
+ FrameConstructionID(FrameConstructionID) {
+ OccurrenceCount = Cands.size();
+ for (Candidate &C : Cands)
+ Candidates.push_back(std::make_shared<outliner::Candidate>(C));
+
+ unsigned B = getBenefit();
+ for (std::shared_ptr<Candidate> &C : Candidates)
+ C->Benefit = B;
+ }
+
+ OutlinedFunction() {}
+};
+} // namespace outliner
+} // namespace llvm
+
+#endif
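To make the cost model above concrete with made-up numbers: for three candidates whose call overhead is 1 byte each, a 12-byte sequence and a 4-byte frame, getNotOutlinedCost() is 3 * 12 = 36 bytes, getOutliningCost() is (1 + 1 + 1) + 12 + 4 = 19 bytes, and getBenefit() is 36 - 19 = 17 bytes, so outlining this class of candidates would be profitable.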
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/contrib/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 3be94f802170..5bf4a49c8b3b 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/LowLevelType.h"
@@ -75,6 +76,13 @@ private:
VirtReg2IndexFunctor>
VRegInfo;
+ /// Map for recovering vreg name from vreg number.
+ /// This map is used by the MIR Printer.
+ IndexedMap<std::string, VirtReg2IndexFunctor> VReg2Name;
+
+ /// StringSet that is used to unique vreg names.
+ StringSet<> VRegNames;
+
/// The flag is true upon \p UpdatedCSRs initialization
/// and false otherwise.
bool IsUpdatedCSRsInitialized;
@@ -128,9 +136,9 @@ private:
/// started.
BitVector ReservedRegs;
- using VRegToTypeMap = DenseMap<unsigned, LLT>;
- /// Map generic virtual registers to their actual size.
- mutable std::unique_ptr<VRegToTypeMap> VRegToType;
+ using VRegToTypeMap = IndexedMap<LLT, VirtReg2IndexFunctor>;
+ /// Map generic virtual registers to their low-level type.
+ VRegToTypeMap VRegToType;
/// Keep track of the physical registers that are live in to the function.
/// Live in values are typically arguments in registers. LiveIn values are
@@ -418,6 +426,20 @@ public:
/// specified register (it may be live-in).
bool def_empty(unsigned RegNo) const { return def_begin(RegNo) == def_end(); }
+ StringRef getVRegName(unsigned Reg) const {
+ return VReg2Name.inBounds(Reg) ? StringRef(VReg2Name[Reg]) : "";
+ }
+
+ void insertVRegByName(StringRef Name, unsigned Reg) {
+ assert((Name.empty() || VRegNames.find(Name) == VRegNames.end()) &&
+ "Named VRegs Must be Unique.");
+ if (!Name.empty()) {
+ VRegNames.insert(Name);
+ VReg2Name.grow(Reg);
+ VReg2Name[Reg] = Name.str();
+ }
+ }
+
/// Return true if there is exactly one operand defining the specified
/// register.
bool hasOneDef(unsigned RegNo) const {
@@ -548,12 +570,16 @@ public:
/// except that it also changes any definitions of the register as well.
///
/// Note that it is usually necessary to first constrain ToReg's register
- /// class to match the FromReg constraints using:
+ /// class and register bank to match the FromReg constraints using one of the
+ /// methods:
///
/// constrainRegClass(ToReg, getRegClass(FromReg))
+ /// constrainRegAttrs(ToReg, FromReg)
+ /// RegisterBankInfo::constrainGenericRegister(ToReg,
+ /// *MRI.getRegClass(FromReg), MRI)
///
- /// That function will return NULL if the virtual registers have incompatible
- /// constraints.
+ /// These functions will return a falsy result if the virtual registers have
+ /// incompatible constraints.
///
/// Note that if ToReg is a physical register the function will replace and
/// apply sub registers to ToReg in order to obtain a final/proper physical
@@ -653,10 +679,30 @@ public:
/// new register class, or NULL if no such class exists.
/// This should only be used when the constraint is known to be trivial, like
/// GR32 -> GR32_NOSP. Beware of increasing register pressure.
+ ///
+ /// \note Assumes that the register has a register class assigned.
+ /// Use RegisterBankInfo::constrainGenericRegister in GlobalISel's
+ /// InstructionSelect pass and constrainRegAttrs in every other pass,
+ /// including non-select passes of GlobalISel, instead.
const TargetRegisterClass *constrainRegClass(unsigned Reg,
const TargetRegisterClass *RC,
unsigned MinNumRegs = 0);
+ /// Constrain the register class or the register bank of the virtual register
+ /// \p Reg to be a common subclass and a common bank of both registers
+ /// provided respectively. Do nothing if any of the attributes (classes,
+ /// banks, or low-level types) of the registers are deemed incompatible, or if
+ /// the resulting register will have a class smaller than before and of size
+ /// less than \p MinNumRegs. Return true if such register attributes exist,
+ /// false otherwise.
+ ///
+ /// \note Assumes that each register has either a low-level type or a class
+ /// assigned, but not both. Use this method instead of constrainRegClass and
+ /// RegisterBankInfo::constrainGenericRegister everywhere but SelectionDAG
+ /// ISel / FastISel and GlobalISel's InstructionSelect pass respectively.
+ bool constrainRegAttrs(unsigned Reg, unsigned ConstrainingReg,
+ unsigned MinNumRegs = 0);
+
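A minimal sketch of the intended use, assuming an in-scope MachineRegisterInfo MRI and two virtual registers being folded together:

    // Coalesce FromReg into ToReg only when their class/bank/type attributes are
    // compatible; otherwise leave the code unchanged.
    if (MRI.constrainRegAttrs(ToReg, FromReg))
      MRI.replaceRegWith(FromReg, ToReg);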
/// recomputeRegClass - Try to find a legal super-class of Reg's register
/// class that still satisfies the constraints from the instructions using
/// Reg. Returns true if Reg was upgraded.
@@ -668,26 +714,23 @@ public:
/// createVirtualRegister - Create and return a new virtual register in the
/// function with the specified register class.
- unsigned createVirtualRegister(const TargetRegisterClass *RegClass);
+ unsigned createVirtualRegister(const TargetRegisterClass *RegClass,
+ StringRef Name = "");
- /// Accessor for VRegToType. This accessor should only be used
- /// by global-isel related work.
- VRegToTypeMap &getVRegToType() const {
- if (!VRegToType)
- VRegToType.reset(new VRegToTypeMap);
- return *VRegToType.get();
- }
-
- /// Get the low-level type of \p VReg or LLT{} if VReg is not a generic
+ /// Get the low-level type of \p Reg or LLT{} if Reg is not a generic
/// (target independent) virtual register.
- LLT getType(unsigned VReg) const;
+ LLT getType(unsigned Reg) const {
+ if (TargetRegisterInfo::isVirtualRegister(Reg) && VRegToType.inBounds(Reg))
+ return VRegToType[Reg];
+ return LLT{};
+ }
/// Set the low-level type of \p VReg to \p Ty.
void setType(unsigned VReg, LLT Ty);
/// Create and return a new generic virtual register with low-level
/// type \p Ty.
- unsigned createGenericVirtualRegister(LLT Ty);
+ unsigned createGenericVirtualRegister(LLT Ty, StringRef Name = "");
/// Remove all types associated to virtual registers (after instruction
/// selection and constraining of all generic virtual registers).
@@ -698,7 +741,7 @@ public:
/// temporarily while constructing machine instructions. Most operations are
/// undefined on an incomplete register until one of setRegClass(),
/// setRegBank() or setSize() has been called on it.
- unsigned createIncompleteVirtualRegister();
+ unsigned createIncompleteVirtualRegister(StringRef Name = "");
/// getNumVirtRegs - Return the number of virtual registers created.
unsigned getNumVirtRegs() const { return VRegInfo.size(); }
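As a sketch of the new optional name parameter (the register class, LLT and names below are illustrative), the name only affects how the virtual register is printed, e.g. by the MIR printer:

    unsigned Ptr = MRI.createGenericVirtualRegister(LLT::pointer(0, 64), "ptr");
    unsigned Idx = MRI.createVirtualRegister(&MyRC, "idx");  // MyRC: some TargetRegisterClass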
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineSSAUpdater.h b/contrib/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
index b5ea2080444d..5e91246b402c 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
@@ -56,7 +56,7 @@ public:
/// MachineSSAUpdater constructor. If InsertedPHIs is specified, it will be
/// filled in with all PHI Nodes created by rewriting.
explicit MachineSSAUpdater(MachineFunction &MF,
- SmallVectorImpl<MachineInstr*> *InsertedPHIs = nullptr);
+ SmallVectorImpl<MachineInstr*> *NewPHI = nullptr);
MachineSSAUpdater(const MachineSSAUpdater &) = delete;
MachineSSAUpdater &operator=(const MachineSSAUpdater &) = delete;
~MachineSSAUpdater();
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineScheduler.h b/contrib/llvm/include/llvm/CodeGen/MachineScheduler.h
index e327881de13a..85ffa4eda2b8 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/contrib/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -237,7 +237,7 @@ public:
/// be scheduled at the bottom.
virtual SUnit *pickNode(bool &IsTopNode) = 0;
- /// \brief Scheduler callback to notify that a new subtree is scheduled.
+ /// Scheduler callback to notify that a new subtree is scheduled.
virtual void scheduleTree(unsigned SubtreeID) {}
/// Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an
@@ -318,11 +318,11 @@ public:
Mutations.push_back(std::move(Mutation));
}
- /// \brief True if an edge can be added from PredSU to SuccSU without creating
+ /// True if an edge can be added from PredSU to SuccSU without creating
/// a cycle.
bool canAddEdge(SUnit *SuccSU, SUnit *PredSU);
- /// \brief Add a DAG edge to the given SU with the given predecessor
+ /// Add a DAG edge to the given SU with the given predecessor
/// dependence data.
///
/// \returns true if the edge may be added without creating a cycle OR if an
@@ -374,7 +374,7 @@ protected:
/// Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues.
void placeDebugValues();
- /// \brief dump the scheduled Sequence.
+ /// dump the scheduled Sequence.
void dumpSchedule() const;
// Lesser helpers...
@@ -445,7 +445,7 @@ public:
/// Return true if this DAG supports VReg liveness and RegPressure.
bool hasVRegLiveness() const override { return true; }
- /// \brief Return true if register pressure tracking is enabled.
+ /// Return true if register pressure tracking is enabled.
bool isTrackingPressure() const { return ShouldTrackPressure; }
/// Get current register pressure for the top scheduled instructions.
@@ -897,6 +897,28 @@ protected:
#endif
};
+// Utility functions used by heuristics in tryCandidate().
+bool tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason);
+bool tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason);
+bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone);
+bool tryPressure(const PressureChange &TryP,
+ const PressureChange &CandP,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF);
+unsigned getWeakLeft(const SUnit *SU, bool isTop);
+int biasPhysRegCopy(const SUnit *SU, bool isTop);
+
/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
class GenericScheduler : public GenericSchedulerBase {
@@ -963,9 +985,8 @@ protected:
const RegPressureTracker &RPTracker,
RegPressureTracker &TempTracker);
- void tryCandidate(SchedCandidate &Cand,
- SchedCandidate &TryCand,
- SchedBoundary *Zone);
+ virtual void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
SUnit *pickNodeBidirectional(bool &IsTopNode);
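Because tryCandidate is now virtual and the helpers above are exported, a target strategy can layer an extra heuristic on top of the generic ones. The following is only a sketch; computeMyMetric is a hypothetical target function:

    class MyTargetScheduler : public GenericScheduler {
    public:
      using GenericScheduler::GenericScheduler;
      void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
                        SchedBoundary *Zone) const override {
        GenericScheduler::tryCandidate(Cand, TryCand, Zone);
        if (TryCand.Reason != NoCand)
          return;                                  // the generic heuristics already decided
        // Hypothetical extra tie-breaker: prefer the candidate with the smaller metric.
        tryLess(computeMyMetric(TryCand.SU), computeMyMetric(Cand.SU),
                TryCand, Cand, NodeOrder);
      }
    };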
diff --git a/contrib/llvm/include/llvm/CodeGen/MacroFusion.h b/contrib/llvm/include/llvm/CodeGen/MacroFusion.h
index dc105fdc68fd..a77226ddaf33 100644
--- a/contrib/llvm/include/llvm/CodeGen/MacroFusion.h
+++ b/contrib/llvm/include/llvm/CodeGen/MacroFusion.h
@@ -25,7 +25,7 @@ class ScheduleDAGMutation;
class TargetInstrInfo;
class TargetSubtargetInfo;
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
using ShouldSchedulePredTy = std::function<bool(const TargetInstrInfo &TII,
@@ -33,13 +33,13 @@ using ShouldSchedulePredTy = std::function<bool(const TargetInstrInfo &TII,
const MachineInstr *FirstMI,
const MachineInstr &SecondMI)>;
-/// \brief Create a DAG scheduling mutation to pair instructions back to back
+/// Create a DAG scheduling mutation to pair instructions back to back
/// for instructions that benefit according to the target-specific
/// shouldScheduleAdjacent predicate function.
std::unique_ptr<ScheduleDAGMutation>
createMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent);
-/// \brief Create a DAG scheduling mutation to pair branch instructions with one
+/// Create a DAG scheduling mutation to pair branch instructions with one
/// of their predecessors back to back for instructions that benefit according
/// to the target-specific shouldScheduleAdjacent predicate function.
std::unique_ptr<ScheduleDAGMutation>
diff --git a/contrib/llvm/include/llvm/CodeGen/PBQP/Graph.h b/contrib/llvm/include/llvm/CodeGen/PBQP/Graph.h
index e94878ced10d..a6d88b057dcb 100644
--- a/contrib/llvm/include/llvm/CodeGen/PBQP/Graph.h
+++ b/contrib/llvm/include/llvm/CodeGen/PBQP/Graph.h
@@ -29,12 +29,12 @@ namespace PBQP {
using NodeId = unsigned;
using EdgeId = unsigned;
- /// @brief Returns a value representing an invalid (non-existent) node.
+ /// Returns a value representing an invalid (non-existent) node.
static NodeId invalidNodeId() {
return std::numeric_limits<NodeId>::max();
}
- /// @brief Returns a value representing an invalid (non-existent) edge.
+ /// Returns a value representing an invalid (non-existent) edge.
static EdgeId invalidEdgeId() {
return std::numeric_limits<EdgeId>::max();
}
@@ -338,19 +338,19 @@ namespace PBQP {
const NodeEntry &NE;
};
- /// @brief Construct an empty PBQP graph.
+ /// Construct an empty PBQP graph.
Graph() = default;
- /// @brief Construct an empty PBQP graph with the given graph metadata.
+ /// Construct an empty PBQP graph with the given graph metadata.
Graph(GraphMetadata Metadata) : Metadata(std::move(Metadata)) {}
- /// @brief Get a reference to the graph metadata.
+ /// Get a reference to the graph metadata.
GraphMetadata& getMetadata() { return Metadata; }
- /// @brief Get a const-reference to the graph metadata.
+ /// Get a const-reference to the graph metadata.
const GraphMetadata& getMetadata() const { return Metadata; }
- /// @brief Lock this graph to the given solver instance in preparation
+ /// Lock this graph to the given solver instance in preparation
/// for running the solver. This method will call solver.handleAddNode for
/// each node in the graph, and handleAddEdge for each edge, to give the
/// solver an opportunity to set up any required metadata.
@@ -363,13 +363,13 @@ namespace PBQP {
Solver->handleAddEdge(EId);
}
- /// @brief Release from solver instance.
+ /// Release from solver instance.
void unsetSolver() {
assert(Solver && "Solver not set.");
Solver = nullptr;
}
- /// @brief Add a node with the given costs.
+ /// Add a node with the given costs.
/// @param Costs Cost vector for the new node.
/// @return Node iterator for the added node.
template <typename OtherVectorT>
@@ -382,7 +382,7 @@ namespace PBQP {
return NId;
}
- /// @brief Add a node bypassing the cost allocator.
+ /// Add a node bypassing the cost allocator.
/// @param Costs Cost vector ptr for the new node (must be convertible to
/// VectorPtr).
/// @return Node iterator for the added node.
@@ -401,7 +401,7 @@ namespace PBQP {
return NId;
}
- /// @brief Add an edge between the given nodes with the given costs.
+ /// Add an edge between the given nodes with the given costs.
/// @param N1Id First node.
/// @param N2Id Second node.
/// @param Costs Cost matrix for new edge.
@@ -419,7 +419,7 @@ namespace PBQP {
return EId;
}
- /// @brief Add an edge bypassing the cost allocator.
+ /// Add an edge bypassing the cost allocator.
/// @param N1Id First node.
/// @param N2Id Second node.
/// @param Costs Cost matrix for new edge.
@@ -444,7 +444,7 @@ namespace PBQP {
return EId;
}
- /// @brief Returns true if the graph is empty.
+ /// Returns true if the graph is empty.
bool empty() const { return NodeIdSet(*this).empty(); }
NodeIdSet nodeIds() const { return NodeIdSet(*this); }
@@ -452,15 +452,15 @@ namespace PBQP {
AdjEdgeIdSet adjEdgeIds(NodeId NId) { return AdjEdgeIdSet(getNode(NId)); }
- /// @brief Get the number of nodes in the graph.
+ /// Get the number of nodes in the graph.
/// @return Number of nodes in the graph.
unsigned getNumNodes() const { return NodeIdSet(*this).size(); }
- /// @brief Get the number of edges in the graph.
+ /// Get the number of edges in the graph.
/// @return Number of edges in the graph.
unsigned getNumEdges() const { return EdgeIdSet(*this).size(); }
- /// @brief Set a node's cost vector.
+ /// Set a node's cost vector.
/// @param NId Node to update.
/// @param Costs New costs to set.
template <typename OtherVectorT>
@@ -471,7 +471,7 @@ namespace PBQP {
getNode(NId).Costs = AllocatedCosts;
}
- /// @brief Get a VectorPtr to a node's cost vector. Rarely useful - use
+ /// Get a VectorPtr to a node's cost vector. Rarely useful - use
/// getNodeCosts where possible.
/// @param NId Node id.
/// @return VectorPtr to node cost vector.
@@ -483,7 +483,7 @@ namespace PBQP {
return getNode(NId).Costs;
}
- /// @brief Get a node's cost vector.
+ /// Get a node's cost vector.
/// @param NId Node id.
/// @return Node cost vector.
const Vector& getNodeCosts(NodeId NId) const {
@@ -502,7 +502,7 @@ namespace PBQP {
return getNode(NId).getAdjEdgeIds().size();
}
- /// @brief Update an edge's cost matrix.
+ /// Update an edge's cost matrix.
/// @param EId Edge id.
/// @param Costs New cost matrix.
template <typename OtherMatrixT>
@@ -513,7 +513,7 @@ namespace PBQP {
getEdge(EId).Costs = AllocatedCosts;
}
- /// @brief Get a MatrixPtr to a node's cost matrix. Rarely useful - use
+ /// Get a MatrixPtr to a node's cost matrix. Rarely useful - use
/// getEdgeCosts where possible.
/// @param EId Edge id.
/// @return MatrixPtr to edge cost matrix.
@@ -525,7 +525,7 @@ namespace PBQP {
return getEdge(EId).Costs;
}
- /// @brief Get an edge's cost matrix.
+ /// Get an edge's cost matrix.
/// @param EId Edge id.
/// @return Edge cost matrix.
const Matrix& getEdgeCosts(EdgeId EId) const {
@@ -540,21 +540,21 @@ namespace PBQP {
return getEdge(EId).Metadata;
}
- /// @brief Get the first node connected to this edge.
+ /// Get the first node connected to this edge.
/// @param EId Edge id.
/// @return The first node connected to the given edge.
NodeId getEdgeNode1Id(EdgeId EId) const {
return getEdge(EId).getN1Id();
}
- /// @brief Get the second node connected to this edge.
+ /// Get the second node connected to this edge.
/// @param EId Edge id.
/// @return The second node connected to the given edge.
NodeId getEdgeNode2Id(EdgeId EId) const {
return getEdge(EId).getN2Id();
}
- /// @brief Get the "other" node connected to this edge.
+ /// Get the "other" node connected to this edge.
/// @param EId Edge id.
/// @param NId Node id for the "given" node.
/// @return The iterator for the "other" node connected to this edge.
@@ -566,7 +566,7 @@ namespace PBQP {
return E.getN1Id();
}
- /// @brief Get the edge connecting two nodes.
+ /// Get the edge connecting two nodes.
/// @param N1Id First node id.
/// @param N2Id Second node id.
/// @return An id for edge (N1Id, N2Id) if such an edge exists,
@@ -581,7 +581,7 @@ namespace PBQP {
return invalidEdgeId();
}
- /// @brief Remove a node from the graph.
+ /// Remove a node from the graph.
/// @param NId Node id.
void removeNode(NodeId NId) {
if (Solver)
@@ -598,7 +598,7 @@ namespace PBQP {
FreeNodeIds.push_back(NId);
}
- /// @brief Disconnect an edge from the given node.
+ /// Disconnect an edge from the given node.
///
/// Removes the given edge from the adjacency list of the given node.
/// This operation leaves the edge in an 'asymmetric' state: It will no
@@ -631,14 +631,14 @@ namespace PBQP {
E.disconnectFrom(*this, NId);
}
- /// @brief Convenience method to disconnect all neighbours from the given
+ /// Convenience method to disconnect all neighbours from the given
/// node.
void disconnectAllNeighborsFromNode(NodeId NId) {
for (auto AEId : adjEdgeIds(NId))
disconnectEdge(AEId, getEdgeOtherNodeId(AEId, NId));
}
- /// @brief Re-attach an edge to its nodes.
+ /// Re-attach an edge to its nodes.
///
/// Adds an edge that had been previously disconnected back into the
/// adjacency set of the nodes that the edge connects.
@@ -649,7 +649,7 @@ namespace PBQP {
Solver->handleReconnectEdge(EId, NId);
}
- /// @brief Remove an edge from the graph.
+ /// Remove an edge from the graph.
/// @param EId Edge id.
void removeEdge(EdgeId EId) {
if (Solver)
@@ -660,7 +660,7 @@ namespace PBQP {
Edges[EId].invalidate();
}
- /// @brief Remove all nodes and edges from the graph.
+ /// Remove all nodes and edges from the graph.
void clear() {
Nodes.clear();
FreeNodeIds.clear();
diff --git a/contrib/llvm/include/llvm/CodeGen/PBQP/Math.h b/contrib/llvm/include/llvm/CodeGen/PBQP/Math.h
index ba405e816d10..d1432a3053c4 100644
--- a/contrib/llvm/include/llvm/CodeGen/PBQP/Math.h
+++ b/contrib/llvm/include/llvm/CodeGen/PBQP/Math.h
@@ -22,34 +22,34 @@ namespace PBQP {
using PBQPNum = float;
-/// \brief PBQP Vector class.
+/// PBQP Vector class.
class Vector {
friend hash_code hash_value(const Vector &);
public:
- /// \brief Construct a PBQP vector of the given size.
+ /// Construct a PBQP vector of the given size.
explicit Vector(unsigned Length)
: Length(Length), Data(llvm::make_unique<PBQPNum []>(Length)) {}
- /// \brief Construct a PBQP vector with initializer.
+ /// Construct a PBQP vector with initializer.
Vector(unsigned Length, PBQPNum InitVal)
: Length(Length), Data(llvm::make_unique<PBQPNum []>(Length)) {
std::fill(Data.get(), Data.get() + Length, InitVal);
}
- /// \brief Copy construct a PBQP vector.
+ /// Copy construct a PBQP vector.
Vector(const Vector &V)
: Length(V.Length), Data(llvm::make_unique<PBQPNum []>(Length)) {
std::copy(V.Data.get(), V.Data.get() + Length, Data.get());
}
- /// \brief Move construct a PBQP vector.
+ /// Move construct a PBQP vector.
Vector(Vector &&V)
: Length(V.Length), Data(std::move(V.Data)) {
V.Length = 0;
}
- /// \brief Comparison operator.
+ /// Comparison operator.
bool operator==(const Vector &V) const {
assert(Length != 0 && Data && "Invalid vector");
if (Length != V.Length)
@@ -57,27 +57,27 @@ public:
return std::equal(Data.get(), Data.get() + Length, V.Data.get());
}
- /// \brief Return the length of the vector
+ /// Return the length of the vector
unsigned getLength() const {
assert(Length != 0 && Data && "Invalid vector");
return Length;
}
- /// \brief Element access.
+ /// Element access.
PBQPNum& operator[](unsigned Index) {
assert(Length != 0 && Data && "Invalid vector");
assert(Index < Length && "Vector element access out of bounds.");
return Data[Index];
}
- /// \brief Const element access.
+ /// Const element access.
const PBQPNum& operator[](unsigned Index) const {
assert(Length != 0 && Data && "Invalid vector");
assert(Index < Length && "Vector element access out of bounds.");
return Data[Index];
}
- /// \brief Add another vector to this one.
+ /// Add another vector to this one.
Vector& operator+=(const Vector &V) {
assert(Length != 0 && Data && "Invalid vector");
assert(Length == V.Length && "Vector length mismatch.");
@@ -86,7 +86,7 @@ public:
return *this;
}
- /// \brief Returns the index of the minimum value in this vector
+ /// Returns the index of the minimum value in this vector
unsigned minIndex() const {
assert(Length != 0 && Data && "Invalid vector");
return std::min_element(Data.get(), Data.get() + Length) - Data.get();
@@ -97,14 +97,14 @@ private:
std::unique_ptr<PBQPNum []> Data;
};
-/// \brief Return a hash_value for the given vector.
+/// Return a hash_value for the given vector.
inline hash_code hash_value(const Vector &V) {
unsigned *VBegin = reinterpret_cast<unsigned*>(V.Data.get());
unsigned *VEnd = reinterpret_cast<unsigned*>(V.Data.get() + V.Length);
return hash_combine(V.Length, hash_combine_range(VBegin, VEnd));
}
-/// \brief Output a textual representation of the given vector on the given
+/// Output a textual representation of the given vector on the given
/// output stream.
template <typename OStream>
OStream& operator<<(OStream &OS, const Vector &V) {
@@ -118,18 +118,18 @@ OStream& operator<<(OStream &OS, const Vector &V) {
return OS;
}
-/// \brief PBQP Matrix class
+/// PBQP Matrix class
class Matrix {
private:
friend hash_code hash_value(const Matrix &);
public:
- /// \brief Construct a PBQP Matrix with the given dimensions.
+ /// Construct a PBQP Matrix with the given dimensions.
Matrix(unsigned Rows, unsigned Cols) :
Rows(Rows), Cols(Cols), Data(llvm::make_unique<PBQPNum []>(Rows * Cols)) {
}
- /// \brief Construct a PBQP Matrix with the given dimensions and initial
+ /// Construct a PBQP Matrix with the given dimensions and initial
/// value.
Matrix(unsigned Rows, unsigned Cols, PBQPNum InitVal)
: Rows(Rows), Cols(Cols),
@@ -137,20 +137,20 @@ public:
std::fill(Data.get(), Data.get() + (Rows * Cols), InitVal);
}
- /// \brief Copy construct a PBQP matrix.
+ /// Copy construct a PBQP matrix.
Matrix(const Matrix &M)
: Rows(M.Rows), Cols(M.Cols),
Data(llvm::make_unique<PBQPNum []>(Rows * Cols)) {
std::copy(M.Data.get(), M.Data.get() + (Rows * Cols), Data.get());
}
- /// \brief Move construct a PBQP matrix.
+ /// Move construct a PBQP matrix.
Matrix(Matrix &&M)
: Rows(M.Rows), Cols(M.Cols), Data(std::move(M.Data)) {
M.Rows = M.Cols = 0;
}
- /// \brief Comparison operator.
+ /// Comparison operator.
bool operator==(const Matrix &M) const {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
if (Rows != M.Rows || Cols != M.Cols)
@@ -158,33 +158,33 @@ public:
return std::equal(Data.get(), Data.get() + (Rows * Cols), M.Data.get());
}
- /// \brief Return the number of rows in this matrix.
+ /// Return the number of rows in this matrix.
unsigned getRows() const {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
return Rows;
}
- /// \brief Return the number of cols in this matrix.
+ /// Return the number of cols in this matrix.
unsigned getCols() const {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
return Cols;
}
- /// \brief Matrix element access.
+ /// Matrix element access.
PBQPNum* operator[](unsigned R) {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
assert(R < Rows && "Row out of bounds.");
return Data.get() + (R * Cols);
}
- /// \brief Matrix element access.
+ /// Matrix element access.
const PBQPNum* operator[](unsigned R) const {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
assert(R < Rows && "Row out of bounds.");
return Data.get() + (R * Cols);
}
- /// \brief Returns the given row as a vector.
+ /// Returns the given row as a vector.
Vector getRowAsVector(unsigned R) const {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
Vector V(Cols);
@@ -193,7 +193,7 @@ public:
return V;
}
- /// \brief Returns the given column as a vector.
+ /// Returns the given column as a vector.
Vector getColAsVector(unsigned C) const {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
Vector V(Rows);
@@ -202,7 +202,7 @@ public:
return V;
}
- /// \brief Matrix transpose.
+ /// Matrix transpose.
Matrix transpose() const {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
Matrix M(Cols, Rows);
@@ -212,7 +212,7 @@ public:
return M;
}
- /// \brief Add the given matrix to this one.
+ /// Add the given matrix to this one.
Matrix& operator+=(const Matrix &M) {
assert(Rows != 0 && Cols != 0 && Data && "Invalid matrix");
assert(Rows == M.Rows && Cols == M.Cols &&
@@ -234,7 +234,7 @@ private:
std::unique_ptr<PBQPNum []> Data;
};
-/// \brief Return a hash_code for the given matrix.
+/// Return a hash_code for the given matrix.
inline hash_code hash_value(const Matrix &M) {
unsigned *MBegin = reinterpret_cast<unsigned*>(M.Data.get());
unsigned *MEnd =
@@ -242,7 +242,7 @@ inline hash_code hash_value(const Matrix &M) {
return hash_combine(M.Rows, M.Cols, hash_combine_range(MBegin, MEnd));
}
-/// \brief Output a textual representation of the given matrix on the given
+/// Output a textual representation of the given matrix on the given
/// output stream.
template <typename OStream>
OStream& operator<<(OStream &OS, const Matrix &M) {
diff --git a/contrib/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h b/contrib/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h
index 8aeb51936760..21b99027970d 100644
--- a/contrib/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h
+++ b/contrib/llvm/include/llvm/CodeGen/PBQP/ReductionRules.h
@@ -23,7 +23,7 @@
namespace llvm {
namespace PBQP {
- /// \brief Reduce a node of degree one.
+ /// Reduce a node of degree one.
///
/// Propagate costs from the given node, which must be of degree one, to its
/// neighbor. Notify the problem domain.
@@ -166,7 +166,7 @@ namespace PBQP {
}
#endif
- // \brief Find a solution to a fully reduced graph by backpropagation.
+ // Find a solution to a fully reduced graph by backpropagation.
//
// Given a graph and a reduction order, pop each node from the reduction
// order and greedily compute a minimum solution based on the node costs, and
diff --git a/contrib/llvm/include/llvm/CodeGen/PBQP/Solution.h b/contrib/llvm/include/llvm/CodeGen/PBQP/Solution.h
index 6a247277fdfa..4d4379fbc2c2 100644
--- a/contrib/llvm/include/llvm/CodeGen/PBQP/Solution.h
+++ b/contrib/llvm/include/llvm/CodeGen/PBQP/Solution.h
@@ -21,7 +21,7 @@
namespace llvm {
namespace PBQP {
- /// \brief Represents a solution to a PBQP problem.
+ /// Represents a solution to a PBQP problem.
///
/// To get the selection for each node in the problem use the getSelection method.
class Solution {
@@ -30,17 +30,17 @@ namespace PBQP {
SelectionsMap selections;
public:
- /// \brief Initialise an empty solution.
+ /// Initialise an empty solution.
Solution() = default;
- /// \brief Set the selection for a given node.
+ /// Set the selection for a given node.
/// @param nodeId Node id.
/// @param selection Selection for nodeId.
void setSelection(GraphBase::NodeId nodeId, unsigned selection) {
selections[nodeId] = selection;
}
- /// \brief Get a node's selection.
+ /// Get a node's selection.
/// @param nodeId Node id.
/// @return The selection for nodeId;
unsigned getSelection(GraphBase::NodeId nodeId) const {
diff --git a/contrib/llvm/include/llvm/CodeGen/PBQPRAConstraint.h b/contrib/llvm/include/llvm/CodeGen/PBQPRAConstraint.h
index 269b7a7b3a35..995467dc56d8 100644
--- a/contrib/llvm/include/llvm/CodeGen/PBQPRAConstraint.h
+++ b/contrib/llvm/include/llvm/CodeGen/PBQPRAConstraint.h
@@ -33,7 +33,7 @@ class PBQPRAGraph;
using PBQPRAGraph = PBQP::RegAlloc::PBQPRAGraph;
-/// @brief Abstract base for classes implementing PBQP register allocation
+/// Abstract base for classes implementing PBQP register allocation
/// constraints (e.g. Spill-costs, interference, coalescing).
class PBQPRAConstraint {
public:
@@ -44,7 +44,7 @@ private:
virtual void anchor();
};
-/// @brief PBQP register allocation constraint composer.
+/// PBQP register allocation constraint composer.
///
/// Constraints added to this list will be applied, in the order that they are
/// added, to the PBQP graph.
diff --git a/contrib/llvm/include/llvm/CodeGen/ParallelCG.h b/contrib/llvm/include/llvm/CodeGen/ParallelCG.h
index 14ef0ec408ba..dbf09ea31e20 100644
--- a/contrib/llvm/include/llvm/CodeGen/ParallelCG.h
+++ b/contrib/llvm/include/llvm/CodeGen/ParallelCG.h
@@ -40,7 +40,7 @@ std::unique_ptr<Module>
splitCodeGen(std::unique_ptr<Module> M, ArrayRef<raw_pwrite_stream *> OSs,
ArrayRef<llvm::raw_pwrite_stream *> BCOSs,
const std::function<std::unique_ptr<TargetMachine>()> &TMFactory,
- TargetMachine::CodeGenFileType FT = TargetMachine::CGFT_ObjectFile,
+ TargetMachine::CodeGenFileType FileType = TargetMachine::CGFT_ObjectFile,
bool PreserveLocals = false);
} // namespace llvm
diff --git a/contrib/llvm/include/llvm/CodeGen/Passes.h b/contrib/llvm/include/llvm/CodeGen/Passes.h
index 064526b1efa7..cb12b14f4435 100644
--- a/contrib/llvm/include/llvm/CodeGen/Passes.h
+++ b/contrib/llvm/include/llvm/CodeGen/Passes.h
@@ -154,6 +154,9 @@ namespace llvm {
/// This pass adds dead/undef flags after analyzing subregister lanes.
extern char &DetectDeadLanesID;
+ /// This pass performs post-RA machine sinking for COPY instructions.
+ extern char &PostRAMachineSinkingID;
+
/// FastRegisterAllocation Pass - This pass register allocates as fast as
/// possible. It is best suited for debug code where live ranges are short.
///
@@ -212,6 +215,10 @@ namespace llvm {
/// into tails of their predecessors.
extern char &TailDuplicateID;
+ /// Duplicate blocks with unconditional branches into tails of their
+ /// predecessors. Variant that works before register allocation.
+ extern char &EarlyTailDuplicateID;
+
/// MachineTraceMetrics - This pass computes critical path and CPU resource
/// usage in an ensemble of traces.
extern char &MachineTraceMetricsID;
@@ -269,9 +276,13 @@ namespace llvm {
/// memory operations.
extern char &ImplicitNullChecksID;
- /// MachineLICM - This pass performs LICM on machine instructions.
+ /// This pass performs loop invariant code motion on machine instructions.
extern char &MachineLICMID;
+ /// This pass performs loop invariant code motion on machine instructions.
+ /// This variant works before register allocation. \see MachineLICMID.
+ extern char &EarlyMachineLICMID;
+
/// MachineSinking - This pass performs sinking on machine instructions.
extern char &MachineSinkingID;
@@ -290,7 +301,7 @@ namespace llvm {
/// StackSlotColoring - This pass performs stack slot coloring.
extern char &StackSlotColoringID;
- /// \brief This pass lays out funclets contiguously.
+ /// This pass lays out funclets contiguously.
extern char &FuncletLayoutID;
/// This pass inserts the XRay instrumentation sleds if they are supported by
@@ -300,7 +311,7 @@ namespace llvm {
/// This pass inserts FEntry calls
extern char &FEntryInserterID;
- /// \brief This pass implements the "patchable-function" attribute.
+ /// This pass implements the "patchable-function" attribute.
extern char &PatchableFunctionID;
/// createStackProtectorPass - This pass adds stack protectors to functions.
@@ -318,13 +329,17 @@ namespace llvm {
/// createWinEHPass - Prepares personality functions used by MSVC on Windows,
/// in addition to the Itanium LSDA based personalities.
- FunctionPass *createWinEHPass();
+ FunctionPass *createWinEHPass(bool DemoteCatchSwitchPHIOnly = false);
/// createSjLjEHPreparePass - This pass adapts exception handling code to use
/// the GCC-style builtin setjmp/longjmp (sjlj) to handling EH control flow.
///
FunctionPass *createSjLjEHPreparePass();
+ /// createWasmEHPass - This pass adapts exception handling code to use
+ /// WebAssembly's exception handling scheme.
+ FunctionPass *createWasmEHPass();
+
/// LocalStackSlotAllocation - This pass assigns local frame indices to stack
/// slots relative to one another and allocates base registers to access them
/// when it is estimated by the target to be out of range of normal frame
@@ -369,7 +384,7 @@ namespace llvm {
///
ModulePass *createLowerEmuTLSPass();
- /// This pass lowers the @llvm.load.relative intrinsic to instructions.
+ /// This pass lowers the \@llvm.load.relative intrinsic to instructions.
/// This is unsafe to do earlier because a pass may combine the constant
/// initializer into the load, which may result in an overflowing evaluation.
ModulePass *createPreISelIntrinsicLoweringPass();
@@ -408,7 +423,7 @@ namespace llvm {
/// This pass performs outlining on machine instructions directly before
/// printing assembly.
- ModulePass *createMachineOutlinerPass(bool OutlineFromLinkOnceODRs = false);
+ ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true);
/// This pass expands the experimental reduction intrinsics into sequences of
/// shuffles.
@@ -417,9 +432,15 @@ namespace llvm {
// This pass expands memcmp() to load/stores.
FunctionPass *createExpandMemCmpPass();
+ /// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp
+ FunctionPass *createBreakFalseDeps();
+
// This pass expands indirectbr instructions.
FunctionPass *createIndirectBrExpandPass();
+ /// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp
+ FunctionPass *createCFIInstrInserter();
+
} // End llvm namespace
#endif
diff --git a/contrib/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/contrib/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
new file mode 100644
index 000000000000..b21b745c8fd1
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
@@ -0,0 +1,118 @@
+//==--- llvm/CodeGen/ReachingDefAnalysis.h - Reaching Def Analysis -*- C++ -*---==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Reaching Defs Analysis pass.
+///
+/// This pass tracks, for each instruction, the "closest" reaching def of
+/// a given register. It is used by BreakFalseDeps (for clearance calculation)
+/// and ExecutionDomainFix (for arbitrating conflicting domains).
+///
+/// Note that this is different from the usual definition notion of liveness.
+/// The CPU doesn't care whether or not we consider a register killed.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REACHINGDEFSANALYSIS_H
+#define LLVM_CODEGEN_REACHINGDEFSANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LoopTraversal.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineInstr;
+
+/// This class provides the reaching def analysis.
+class ReachingDefAnalysis : public MachineFunctionPass {
+private:
+ MachineFunction *MF;
+ const TargetRegisterInfo *TRI;
+ unsigned NumRegUnits;
+ /// Instruction that defined each register, relative to the beginning of the
+ /// current basic block. When a LiveRegsDefInfo is used to represent a
+ /// live-out register, this value is relative to the end of the basic block,
+ /// so it will be a negative number.
+ using LiveRegsDefInfo = std::vector<int>;
+ LiveRegsDefInfo LiveRegs;
+
+ /// Keeps clearance information for all registers. Note that this
+ /// is different from the usual definition notion of liveness. The CPU
+ /// doesn't care whether or not we consider a register killed.
+ using OutRegsInfoMap = SmallVector<LiveRegsDefInfo, 4>;
+ OutRegsInfoMap MBBOutRegsInfos;
+
+ /// Current instruction number.
+ /// The first instruction in each basic block is 0.
+ int CurInstr;
+
+ /// Maps instructions to their instruction Ids, relative to the beginning of
+ /// their basic blocks.
+ DenseMap<MachineInstr *, int> InstIds;
+
+ /// All reaching defs of a given RegUnit for a given MBB.
+ using MBBRegUnitDefs = SmallVector<int, 1>;
+ /// All reaching defs of all reg units for a given MBB.
+ using MBBDefsInfo = std::vector<MBBRegUnitDefs>;
+ /// All reaching defs of all reg units for all MBBs.
+ using MBBReachingDefsInfo = SmallVector<MBBDefsInfo, 4>;
+ MBBReachingDefsInfo MBBReachingDefs;
+
+ /// Default values are 'nothing happened a long time ago'.
+ const int ReachingDefDefaultVal = -(1 << 20);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ ReachingDefAnalysis() : MachineFunctionPass(ID) {
+ initializeReachingDefAnalysisPass(*PassRegistry::getPassRegistry());
+ }
+ void releaseMemory() override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ /// Provides the instruction id of the closest reaching def instruction of
+ /// PhysReg that reaches MI, relative to the beginning of MI's basic block.
+ int getReachingDef(MachineInstr *MI, int PhysReg);
+
+ /// Provides the clearance - the number of instructions since the closest
+ /// reaching def instruction of PhysReg that reaches MI.
+ int getClearance(MachineInstr *MI, MCPhysReg PhysReg);
+
+private:
+ /// Set up LiveRegs by merging predecessor live-out values.
+ void enterBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB);
+
+ /// Update live-out values.
+ void leaveBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB);
+
+ /// Process the given basic block.
+ void processBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB);
+
+ /// Update def-ages for registers defined by MI.
+ /// Also break dependencies on partial defs and undef uses.
+ void processDefs(MachineInstr *);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_REACHINGDEFSANALYSIS_H
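A usage sketch from the point of view of a consumer such as BreakFalseDeps; the threshold and the dependency-breaking helper are placeholders, not part of this header:

    // Inside some MachineFunctionPass that declared a dependency on ReachingDefAnalysis:
    ReachingDefAnalysis &RDA = getAnalysis<ReachingDefAnalysis>();
    if (RDA.getClearance(&MI, PhysReg) < ClearanceThreshold)
      insertDependencyBreakingInstr(MI, PhysReg);  // hypothetical helper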
diff --git a/contrib/llvm/include/llvm/CodeGen/RegAllocPBQP.h b/contrib/llvm/include/llvm/CodeGen/RegAllocPBQP.h
index 5b342863eb50..ba9763077d09 100644
--- a/contrib/llvm/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/contrib/llvm/include/llvm/CodeGen/RegAllocPBQP.h
@@ -43,10 +43,10 @@ class raw_ostream;
namespace PBQP {
namespace RegAlloc {
-/// @brief Spill option index.
+/// Spill option index.
inline unsigned getSpillOptionIdx() { return 0; }
-/// \brief Metadata to speed allocatability test.
+/// Metadata to speed allocatability test.
///
/// Keeps track of the number of infinities in each row and column.
class MatrixMetadata {
@@ -89,7 +89,7 @@ private:
std::unique_ptr<bool[]> UnsafeCols;
};
-/// \brief Holds a vector of the allowed physical regs for a vreg.
+/// Holds a vector of the allowed physical regs for a vreg.
class AllowedRegVector {
friend hash_code hash_value(const AllowedRegVector &);
@@ -127,7 +127,7 @@ inline hash_code hash_value(const AllowedRegVector &OptRegs) {
hash_combine_range(OStart, OEnd));
}
-/// \brief Holds graph-level metadata relevant to PBQP RA problems.
+/// Holds graph-level metadata relevant to PBQP RA problems.
class GraphMetadata {
private:
using AllowedRegVecPool = ValuePool<AllowedRegVector>;
@@ -164,7 +164,7 @@ private:
AllowedRegVecPool AllowedRegVecs;
};
-/// \brief Holds solver state and other metadata relevant to each PBQP RA node.
+/// Holds solver state and other metadata relevant to each PBQP RA node.
class NodeMetadata {
public:
using AllowedRegVector = RegAlloc::AllowedRegVector;
@@ -505,14 +505,14 @@ private:
public:
PBQPRAGraph(GraphMetadata Metadata) : BaseT(std::move(Metadata)) {}
- /// @brief Dump this graph to dbgs().
+ /// Dump this graph to dbgs().
void dump() const;
- /// @brief Dump this graph to an output stream.
+ /// Dump this graph to an output stream.
/// @param OS Output stream to print on.
void dump(raw_ostream &OS) const;
- /// @brief Print a representation of this graph in DOT format.
+ /// Print a representation of this graph in DOT format.
/// @param OS Output stream to print on.
void printDot(raw_ostream &OS) const;
};
@@ -527,7 +527,7 @@ inline Solution solve(PBQPRAGraph& G) {
} // end namespace RegAlloc
} // end namespace PBQP
-/// @brief Create a PBQP register allocator instance.
+/// Create a PBQP register allocator instance.
FunctionPass *
createPBQPRegisterAllocator(char *customPassID = nullptr);
diff --git a/contrib/llvm/include/llvm/CodeGen/RegisterPressure.h b/contrib/llvm/include/llvm/CodeGen/RegisterPressure.h
index 2b14b78d621d..79054b9e33b7 100644
--- a/contrib/llvm/include/llvm/CodeGen/RegisterPressure.h
+++ b/contrib/llvm/include/llvm/CodeGen/RegisterPressure.h
@@ -171,10 +171,10 @@ class RegisterOperands {
public:
/// List of virtual registers and register units read by the instruction.
SmallVector<RegisterMaskPair, 8> Uses;
- /// \brief List of virtual registers and register units defined by the
+ /// List of virtual registers and register units defined by the
/// instruction which are not dead.
SmallVector<RegisterMaskPair, 8> Defs;
- /// \brief List of virtual registers and register units defined by the
+ /// List of virtual registers and register units defined by the
/// instruction but dead.
SmallVector<RegisterMaskPair, 8> DeadDefs;
@@ -219,7 +219,7 @@ public:
return const_cast<PressureDiffs*>(this)->operator[](Idx);
}
- /// \brief Record pressure difference induced by the given operand list to
+ /// Record pressure difference induced by the given operand list to
/// node with index \p Idx.
void addInstruction(unsigned Idx, const RegisterOperands &RegOpers,
const MachineRegisterInfo &MRI);
@@ -546,7 +546,7 @@ protected:
/// Add Reg to the live in set and increase max pressure.
void discoverLiveIn(RegisterMaskPair Pair);
- /// \brief Get the SlotIndex for the first nondebug instruction including or
+ /// Get the SlotIndex for the first nondebug instruction including or
/// after the current position.
SlotIndex getCurrSlot() const;
diff --git a/contrib/llvm/include/llvm/CodeGen/RegisterScavenging.h b/contrib/llvm/include/llvm/CodeGen/RegisterScavenging.h
index 489c72b81a98..b6bd028a8cac 100644
--- a/contrib/llvm/include/llvm/CodeGen/RegisterScavenging.h
+++ b/contrib/llvm/include/llvm/CodeGen/RegisterScavenging.h
@@ -127,7 +127,7 @@ public:
/// Find an unused register of the specified register class.
/// Return 0 if none is found.
- unsigned FindUnusedReg(const TargetRegisterClass *RegClass) const;
+ unsigned FindUnusedReg(const TargetRegisterClass *RC) const;
/// Add a scavenging frame index.
void addScavengingFrameIndex(int FI) {
@@ -158,7 +158,7 @@ public:
/// Returns the scavenged register.
/// This is deprecated as it depends on the quality of the kill flags being
/// present; Use scavengeRegisterBackwards() instead!
- unsigned scavengeRegister(const TargetRegisterClass *RegClass,
+ unsigned scavengeRegister(const TargetRegisterClass *RC,
MachineBasicBlock::iterator I, int SPAdj);
unsigned scavengeRegister(const TargetRegisterClass *RegClass, int SPAdj) {
return scavengeRegister(RegClass, MBBI, SPAdj);
@@ -218,7 +218,7 @@ private:
/// Spill a register after position \p After and reload it before position
/// \p UseMI.
ScavengedInfo &spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj,
- MachineBasicBlock::iterator After,
+ MachineBasicBlock::iterator Before,
MachineBasicBlock::iterator &UseMI);
};
diff --git a/contrib/llvm/include/llvm/CodeGen/RegisterUsageInfo.h b/contrib/llvm/include/llvm/CodeGen/RegisterUsageInfo.h
index eabadd8d784a..efd175eeed30 100644
--- a/contrib/llvm/include/llvm/CodeGen/RegisterUsageInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/RegisterUsageInfo.h
@@ -19,6 +19,7 @@
#ifndef LLVM_CODEGEN_PHYSICALREGISTERUSAGEINFO_H
#define LLVM_CODEGEN_PHYSICALREGISTERUSAGEINFO_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
@@ -31,8 +32,6 @@ class Function;
class TargetMachine;
class PhysicalRegisterUsageInfo : public ImmutablePass {
- virtual void anchor();
-
public:
static char ID;
@@ -41,25 +40,20 @@ public:
initializePhysicalRegisterUsageInfoPass(Registry);
}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- /// To set TargetMachine *, which is used to print
- /// analysis when command line option -print-regusage is used.
- void setTargetMachine(const TargetMachine *TM_) { TM = TM_; }
+ /// Set TargetMachine which is used to print analysis.
+ void setTargetMachine(const TargetMachine &TM);
bool doInitialization(Module &M) override;
bool doFinalization(Module &M) override;
/// To store RegMask for given Function *.
- void storeUpdateRegUsageInfo(const Function *FP,
- std::vector<uint32_t> RegMask);
+ void storeUpdateRegUsageInfo(const Function &FP,
+ ArrayRef<uint32_t> RegMask);
- /// To query stored RegMask for given Function *, it will return nullptr if
- /// function is not known.
- const std::vector<uint32_t> *getRegUsageInfo(const Function *FP);
+ /// To query the stored RegMask for a given Function; returns an empty
+ /// array if the function is not known.
+ ArrayRef<uint32_t> getRegUsageInfo(const Function &FP);
void print(raw_ostream &OS, const Module *M = nullptr) const override;
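A hedged usage sketch of the ArrayRef-based interface above, as it might appear in a machine pass that consumes this analysis; the callee pointer CalledFn and the MachineInstrBuilder MIB are assumptions for illustration:

// Query the interprocedurally computed clobber mask for a known callee.
PhysicalRegisterUsageInfo &PRUI = getAnalysis<PhysicalRegisterUsageInfo>();
ArrayRef<uint32_t> Mask = PRUI.getRegUsageInfo(*CalledFn);
if (!Mask.empty())
  MIB.addRegMask(Mask.data());  // empty means the function is not known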
diff --git a/contrib/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h b/contrib/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
index 03166ccdfe38..8d582ee298b6 100644
--- a/contrib/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
+++ b/contrib/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
@@ -32,7 +32,7 @@ namespace llvm {
ResourcePriorityQueue *PQ;
explicit resource_sort(ResourcePriorityQueue *pq) : PQ(pq) {}
- bool operator()(const SUnit* left, const SUnit* right) const;
+ bool operator()(const SUnit* LHS, const SUnit* RHS) const;
};
class ResourcePriorityQueue : public SchedulingPriorityQueue {
@@ -121,7 +121,7 @@ namespace llvm {
void remove(SUnit *SU) override;
/// scheduledNode - Main resource tracking point.
- void scheduledNode(SUnit *Node) override;
+ void scheduledNode(SUnit *SU) override;
bool isResourceAvailable(SUnit *SU);
void reserveResources(SUnit *SU);
diff --git a/contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
index 016bef1702c4..28567a1ce437 100644
--- a/contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
+++ b/contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -29,7 +29,7 @@ namespace RTLIB {
///
enum Libcall {
#define HANDLE_LIBCALL(code, name) code,
- #include "RuntimeLibcalls.def"
+ #include "llvm/IR/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
};
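The enum above is produced with the usual X-macro pattern: the includer defines HANDLE_LIBCALL, expands the .def file, and undefines the macro. The same file can be expanded again to build parallel tables; a small illustrative sketch (not part of this change):

// Parallel table of libcall names, indexed by RTLIB::Libcall.
static const char *const LibcallNames[] = {
#define HANDLE_LIBCALL(code, name) name,
#include "llvm/IR/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
};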
diff --git a/contrib/llvm/include/llvm/CodeGen/ScheduleDAG.h b/contrib/llvm/include/llvm/CodeGen/ScheduleDAG.h
index f3f2f05b877d..5e7837834ec8 100644
--- a/contrib/llvm/include/llvm/CodeGen/ScheduleDAG.h
+++ b/contrib/llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -76,7 +76,7 @@ class TargetRegisterInfo;
};
private:
- /// \brief A pointer to the depending/depended-on SUnit, and an enum
+ /// A pointer to the depending/depended-on SUnit, and an enum
/// indicating the kind of the dependency.
PointerIntPair<SUnit *, 2, Kind> Dep;
@@ -137,7 +137,7 @@ class TargetRegisterInfo;
return !operator==(Other);
}
- /// \brief Returns the latency value for this edge, which roughly means the
+ /// Returns the latency value for this edge, which roughly means the
/// minimum number of cycles that must elapse between the predecessor and
/// the successor, given that they have this edge between them.
unsigned getLatency() const {
@@ -163,7 +163,7 @@ class TargetRegisterInfo;
return getKind() != Data;
}
- /// \brief Tests if this is an Order dependence between two memory accesses
+ /// Tests if this is an Order dependence between two memory accesses
/// where both sides of the dependence access memory in non-volatile and
/// fully modeled ways.
bool isNormalMemory() const {
@@ -181,7 +181,7 @@ class TargetRegisterInfo;
return (isNormalMemory() || isBarrier());
}
- /// \brief Tests if this is an Order dependence that is marked as
+ /// Tests if this is an Order dependence that is marked as
/// "must alias", meaning that the SUnits at either end of the edge have a
/// memory dependence on a known memory location.
bool isMustAlias() const {
@@ -196,13 +196,13 @@ class TargetRegisterInfo;
return getKind() == Order && Contents.OrdKind >= Weak;
}
- /// \brief Tests if this is an Order dependence that is marked as
+ /// Tests if this is an Order dependence that is marked as
/// "artificial", meaning it isn't necessary for correctness.
bool isArtificial() const {
return getKind() == Order && Contents.OrdKind == Artificial;
}
- /// \brief Tests if this is an Order dependence that is marked as "cluster",
+ /// Tests if this is an Order dependence that is marked as "cluster",
/// meaning it is artificial and wants to be adjacent.
bool isCluster() const {
return getKind() == Order && Contents.OrdKind == Cluster;
@@ -308,7 +308,7 @@ class TargetRegisterInfo;
nullptr; ///< Is a special copy node if != nullptr.
const TargetRegisterClass *CopySrcRC = nullptr;
- /// \brief Constructs an SUnit for pre-regalloc scheduling to represent an
+ /// Constructs an SUnit for pre-regalloc scheduling to represent an
/// SDNode and any nodes flagged to it.
SUnit(SDNode *node, unsigned nodenum)
: Node(node), NodeNum(nodenum), isVRegCycle(false), isCall(false),
@@ -319,7 +319,7 @@ class TargetRegisterInfo;
isUnbuffered(false), hasReservedResource(false), isDepthCurrent(false),
isHeightCurrent(false) {}
- /// \brief Constructs an SUnit for post-regalloc scheduling to represent a
+ /// Constructs an SUnit for post-regalloc scheduling to represent a
/// MachineInstr.
SUnit(MachineInstr *instr, unsigned nodenum)
: Instr(instr), NodeNum(nodenum), isVRegCycle(false), isCall(false),
@@ -330,7 +330,7 @@ class TargetRegisterInfo;
isUnbuffered(false), hasReservedResource(false), isDepthCurrent(false),
isHeightCurrent(false) {}
- /// \brief Constructs a placeholder SUnit.
+ /// Constructs a placeholder SUnit.
SUnit()
: isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
isCommutable(false), hasPhysRegUses(false), hasPhysRegDefs(false),
@@ -339,7 +339,7 @@ class TargetRegisterInfo;
isCloned(false), isUnbuffered(false), hasReservedResource(false),
isDepthCurrent(false), isHeightCurrent(false) {}
- /// \brief Boundary nodes are placeholders for the boundary of the
+ /// Boundary nodes are placeholders for the boundary of the
/// scheduling region.
///
/// BoundaryNodes can have DAG edges, including Data edges, but they do not
@@ -362,7 +362,7 @@ class TargetRegisterInfo;
return Node;
}
- /// \brief Returns true if this SUnit refers to a machine instruction as
+ /// Returns true if this SUnit refers to a machine instruction as
/// opposed to an SDNode.
bool isInstr() const { return Instr; }
@@ -384,7 +384,7 @@ class TargetRegisterInfo;
/// It also adds the current node as a successor of the specified node.
bool addPred(const SDep &D, bool Required = true);
- /// \brief Adds a barrier edge to SU by calling addPred(), with latency 0
+ /// Adds a barrier edge to SU by calling addPred(), with latency 0
/// generally or latency 1 for a store followed by a load.
bool addPredBarrier(SUnit *SU) {
SDep Dep(SU, SDep::Barrier);
@@ -406,7 +406,7 @@ class TargetRegisterInfo;
return Depth;
}
- /// \brief Returns the height of this node, which is the length of the
+ /// Returns the height of this node, which is the length of the
/// maximum path down to any node which has no successors.
unsigned getHeight() const {
if (!isHeightCurrent)
@@ -414,21 +414,21 @@ class TargetRegisterInfo;
return Height;
}
- /// \brief If NewDepth is greater than this node's depth value, sets it to
+ /// If NewDepth is greater than this node's depth value, sets it to
/// be the new depth value. This also recursively marks successor nodes
/// dirty.
void setDepthToAtLeast(unsigned NewDepth);
- /// \brief If NewDepth is greater than this node's depth value, set it to be
+ /// If NewDepth is greater than this node's depth value, set it to be
/// the new height value. This also recursively marks predecessor nodes
/// dirty.
void setHeightToAtLeast(unsigned NewHeight);
- /// \brief Sets a flag in this node to indicate that its stored Depth value
+ /// Sets a flag in this node to indicate that its stored Depth value
/// will require recomputation the next time getDepth() is called.
void setDepthDirty();
- /// \brief Sets a flag in this node to indicate that its stored Height value
+ /// Sets a flag in this node to indicate that its stored Height value
/// will require recomputation the next time getHeight() is called.
void setHeightDirty();
@@ -455,15 +455,15 @@ class TargetRegisterInfo;
return NumSuccsLeft == 0;
}
- /// \brief Orders this node's predecessor edges such that the critical path
+ /// Orders this node's predecessor edges such that the critical path
/// edge occurs first.
void biasCriticalPath();
void dump(const ScheduleDAG *G) const;
void dumpAll(const ScheduleDAG *G) const;
raw_ostream &print(raw_ostream &O,
- const SUnit *N = nullptr,
- const SUnit *X = nullptr) const;
+ const SUnit *Entry = nullptr,
+ const SUnit *Exit = nullptr) const;
raw_ostream &print(raw_ostream &O, const ScheduleDAG *G) const;
private:
@@ -497,7 +497,7 @@ class TargetRegisterInfo;
//===--------------------------------------------------------------------===//
- /// \brief This interface is used to plug different priorities computation
+ /// This interface is used to plug different priorities computation
/// algorithms into the list scheduler. It implements the interface of a
/// standard priority queue, where nodes are inserted in arbitrary order and
/// returned in priority order. The computation of the priority and the
@@ -609,7 +609,7 @@ class TargetRegisterInfo;
virtual void addCustomGraphFeatures(GraphWriter<ScheduleDAG*> &) const {}
#ifndef NDEBUG
- /// \brief Verifies that all SUnits were scheduled and that their state is
+ /// Verifies that all SUnits were scheduled and that their state is
/// consistent. Returns the number of scheduled SUnits.
unsigned VerifyScheduledDAG(bool isBottomUp);
#endif
@@ -708,7 +708,7 @@ class TargetRegisterInfo;
/// method.
void DFS(const SUnit *SU, int UpperBound, bool& HasLoop);
- /// \brief Reassigns topological indexes for the nodes in the DAG to
+ /// Reassigns topological indexes for the nodes in the DAG to
/// preserve the topological ordering.
void Shift(BitVector& Visited, int LowerBound, int UpperBound);
@@ -735,11 +735,11 @@ class TargetRegisterInfo;
/// Returns true if addPred(TargetSU, SU) creates a cycle.
bool WillCreateCycle(SUnit *TargetSU, SUnit *SU);
- /// \brief Updates the topological ordering to accommodate an edge to be
+ /// Updates the topological ordering to accommodate an edge to be
/// added from SUnit \p X to SUnit \p Y.
void AddPred(SUnit *Y, SUnit *X);
- /// \brief Updates the topological ordering to accommodate an an edge to be
+ /// Updates the topological ordering to accommodate an edge to be
/// removed from the specified node \p N from the predecessors of the
/// current node \p M.
void RemovePred(SUnit *M, SUnit *N);
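A hedged sketch of how the topological-ordering helpers above are typically paired with SUnit::addPred when a scheduler inserts an artificial edge; the Topo object and the PredSU/SuccSU names are assumptions:

// Add an artificial ordering edge PredSU -> SuccSU only if the DAG stays acyclic.
if (!Topo.WillCreateCycle(SuccSU, PredSU)) {
  SuccSU->addPred(SDep(PredSU, SDep::Artificial));
  Topo.AddPred(SuccSU, PredSU);  // keep the topological ordering consistent
}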
diff --git a/contrib/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/contrib/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 14882205584e..520a23846f6e 100644
--- a/contrib/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/contrib/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -190,7 +190,7 @@ namespace llvm {
using SUList = std::list<SUnit *>;
protected:
- /// \brief A map from ValueType to SUList, used during DAG construction, as
+ /// A map from ValueType to SUList, used during DAG construction, as
/// a means of remembering which SUs depend on which memory locations.
class Value2SUsMap;
@@ -201,7 +201,7 @@ namespace llvm {
void reduceHugeMemNodeMaps(Value2SUsMap &stores,
Value2SUsMap &loads, unsigned N);
- /// \brief Adds a chain edge between SUa and SUb, but only if both
+ /// Adds a chain edge between SUa and SUb, but only if both
/// AliasAnalysis and Target fail to deny the dependency.
void addChainDependency(SUnit *SUa, SUnit *SUb,
unsigned Latency = 0);
@@ -286,7 +286,7 @@ namespace llvm {
/// Cleans up after scheduling in the given block.
virtual void finishBlock();
- /// \brief Initialize the DAG and common scheduler state for a new
+ /// Initialize the DAG and common scheduler state for a new
/// scheduling region. This does not actually create the DAG, only clears
/// it. The scheduling driver may call BuildSchedGraph multiple times per
/// scheduling region.
@@ -308,7 +308,7 @@ namespace llvm {
LiveIntervals *LIS = nullptr,
bool TrackLaneMasks = false);
- /// \brief Adds dependencies from instructions in the current list of
+ /// Adds dependencies from instructions in the current list of
/// instructions being scheduled to scheduling barrier. We want to make sure
/// instructions which define registers that are either used by the
/// terminator or are live-out are properly scheduled. This is especially
diff --git a/contrib/llvm/include/llvm/CodeGen/ScheduleDFS.h b/contrib/llvm/include/llvm/CodeGen/ScheduleDFS.h
index d6a8c791392c..3ecc033ac35a 100644
--- a/contrib/llvm/include/llvm/CodeGen/ScheduleDFS.h
+++ b/contrib/llvm/include/llvm/CodeGen/ScheduleDFS.h
@@ -25,7 +25,7 @@ namespace llvm {
class raw_ostream;
-/// \brief Represent the ILP of the subDAG rooted at a DAG node.
+/// Represent the ILP of the subDAG rooted at a DAG node.
///
/// ILPValues summarize the DAG subtree rooted at each node. ILPValues are
/// valid for all nodes regardless of their subtree membership.
@@ -62,13 +62,13 @@ struct ILPValue {
void dump() const;
};
-/// \brief Compute the values of each DAG node for various metrics during DFS.
+/// Compute the values of each DAG node for various metrics during DFS.
class SchedDFSResult {
friend class SchedDFSImpl;
static const unsigned InvalidSubtreeID = ~0u;
- /// \brief Per-SUnit data computed during DFS for various metrics.
+ /// Per-SUnit data computed during DFS for various metrics.
///
/// A node's SubtreeID is set to itself when it is visited to indicate that it
/// is the root of a subtree. Later it is set to its parent to indicate an
@@ -81,7 +81,7 @@ class SchedDFSResult {
NodeData() = default;
};
- /// \brief Per-Subtree data computed during DFS.
+ /// Per-Subtree data computed during DFS.
struct TreeData {
unsigned ParentTreeID = InvalidSubtreeID;
unsigned SubInstrCount = 0;
@@ -89,7 +89,7 @@ class SchedDFSResult {
TreeData() = default;
};
- /// \brief Record a connection between subtrees and the connection level.
+ /// Record a connection between subtrees and the connection level.
struct Connection {
unsigned TreeID;
unsigned Level;
@@ -117,15 +117,15 @@ public:
SchedDFSResult(bool IsBU, unsigned lim)
: IsBottomUp(IsBU), SubtreeLimit(lim) {}
- /// \brief Get the node cutoff before subtrees are considered significant.
+ /// Get the node cutoff before subtrees are considered significant.
unsigned getSubtreeLimit() const { return SubtreeLimit; }
- /// \brief Return true if this DFSResult is uninitialized.
+ /// Return true if this DFSResult is uninitialized.
///
/// resize() initializes DFSResult, while compute() populates it.
bool empty() const { return DFSNodeData.empty(); }
- /// \brief Clear the results.
+ /// Clear the results.
void clear() {
DFSNodeData.clear();
DFSTreeData.clear();
@@ -133,37 +133,37 @@ public:
SubtreeConnectLevels.clear();
}
- /// \brief Initialize the result data with the size of the DAG.
+ /// Initialize the result data with the size of the DAG.
void resize(unsigned NumSUnits) {
DFSNodeData.resize(NumSUnits);
}
- /// \brief Compute various metrics for the DAG with given roots.
+ /// Compute various metrics for the DAG with given roots.
void compute(ArrayRef<SUnit> SUnits);
- /// \brief Get the number of instructions in the given subtree and its
+ /// Get the number of instructions in the given subtree and its
/// children.
unsigned getNumInstrs(const SUnit *SU) const {
return DFSNodeData[SU->NodeNum].InstrCount;
}
- /// \brief Get the number of instructions in the given subtree not including
+ /// Get the number of instructions in the given subtree not including
/// children.
unsigned getNumSubInstrs(unsigned SubtreeID) const {
return DFSTreeData[SubtreeID].SubInstrCount;
}
- /// \brief Get the ILP value for a DAG node.
+ /// Get the ILP value for a DAG node.
///
/// A leaf node has an ILP of 1/1.
ILPValue getILP(const SUnit *SU) const {
return ILPValue(DFSNodeData[SU->NodeNum].InstrCount, 1 + SU->getDepth());
}
- /// \brief The number of subtrees detected in this DAG.
+ /// The number of subtrees detected in this DAG.
unsigned getNumSubtrees() const { return SubtreeConnectLevels.size(); }
- /// \brief Get the ID of the subtree the given DAG node belongs to.
+ /// Get the ID of the subtree the given DAG node belongs to.
///
/// For convenience, if DFSResults have not been computed yet, give everything
/// tree ID 0.
@@ -174,7 +174,7 @@ public:
return DFSNodeData[SU->NodeNum].SubtreeID;
}
- /// \brief Get the connection level of a subtree.
+ /// Get the connection level of a subtree.
///
/// For bottom-up trees, the connection level is the latency depth (in cycles)
/// of the deepest connection to another subtree.
@@ -182,7 +182,7 @@ public:
return SubtreeConnectLevels[SubtreeID];
}
- /// \brief Scheduler callback to update SubtreeConnectLevels when a tree is
+ /// Scheduler callback to update SubtreeConnectLevels when a tree is
/// initially scheduled.
void scheduleTree(unsigned SubtreeID);
};
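For orientation, a minimal sketch of driving SchedDFSResult from a scheduler that owns a DAG of SUnits; the DAG pointer and the subtree limit of 8 are assumptions:

SchedDFSResult DFS(/*IsBU=*/true, /*lim=*/8);
DFS.resize(DAG->SUnits.size());
DFS.compute(DAG->SUnits);
// A leaf has ILP 1/1; deeper nodes trade instruction count against depth.
ILPValue ILP = DFS.getILP(&DAG->SUnits.front());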
diff --git a/contrib/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h b/contrib/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
index 466ab532030c..3f75d108f282 100644
--- a/contrib/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
+++ b/contrib/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
@@ -106,7 +106,7 @@ class ScoreboardHazardRecognizer : public ScheduleHazardRecognizer {
Scoreboard RequiredScoreboard;
public:
- ScoreboardHazardRecognizer(const InstrItineraryData *ItinData,
+ ScoreboardHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG,
const char *ParentDebugType = "");
diff --git a/contrib/llvm/include/llvm/CodeGen/SelectionDAG.h b/contrib/llvm/include/llvm/CodeGen/SelectionDAG.h
index 6a5c2db34bb1..888f9425ff90 100644
--- a/contrib/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/contrib/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -28,11 +28,12 @@
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DebugLoc.h"
@@ -44,6 +45,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/RecyclingAllocator.h"
#include <algorithm>
#include <cassert>
@@ -71,8 +73,10 @@ class MachineConstantPoolValue;
class MCSymbol;
class OptimizationRemarkEmitter;
class SDDbgValue;
+class SDDbgLabel;
class SelectionDAG;
class SelectionDAGTargetInfo;
+class TargetLibraryInfo;
class TargetLowering;
class TargetMachine;
class TargetSubtargetInfo;
@@ -145,6 +149,7 @@ class SDDbgInfo {
BumpPtrAllocator Alloc;
SmallVector<SDDbgValue*, 32> DbgValues;
SmallVector<SDDbgValue*, 32> ByvalParmDbgValues;
+ SmallVector<SDDbgLabel*, 4> DbgLabels;
using DbgValMapType = DenseMap<const SDNode *, SmallVector<SDDbgValue *, 2>>;
DbgValMapType DbgValMap;
@@ -161,7 +166,11 @@ public:
DbgValMap[Node].push_back(V);
}
- /// \brief Invalidate all DbgValues attached to the node and remove
+ void add(SDDbgLabel *L) {
+ DbgLabels.push_back(L);
+ }
+
+ /// Invalidate all DbgValues attached to the node and remove
/// it from the Node-to-DbgValues map.
void erase(const SDNode *Node);
@@ -169,13 +178,14 @@ public:
DbgValMap.clear();
DbgValues.clear();
ByvalParmDbgValues.clear();
+ DbgLabels.clear();
Alloc.Reset();
}
BumpPtrAllocator &getAlloc() { return Alloc; }
bool empty() const {
- return DbgValues.empty() && ByvalParmDbgValues.empty();
+ return DbgValues.empty() && ByvalParmDbgValues.empty() && DbgLabels.empty();
}
ArrayRef<SDDbgValue*> getSDDbgValues(const SDNode *Node) {
@@ -186,11 +196,14 @@ public:
}
using DbgIterator = SmallVectorImpl<SDDbgValue*>::iterator;
+ using DbgLabelIterator = SmallVectorImpl<SDDbgLabel*>::iterator;
DbgIterator DbgBegin() { return DbgValues.begin(); }
DbgIterator DbgEnd() { return DbgValues.end(); }
DbgIterator ByvalParmDbgBegin() { return ByvalParmDbgValues.begin(); }
DbgIterator ByvalParmDbgEnd() { return ByvalParmDbgValues.end(); }
+ DbgLabelIterator DbgLabelBegin() { return DbgLabels.begin(); }
+ DbgLabelIterator DbgLabelEnd() { return DbgLabels.end(); }
};
void checkForCycles(const SelectionDAG *DAG, bool force = false);
@@ -210,11 +223,15 @@ class SelectionDAG {
const TargetMachine &TM;
const SelectionDAGTargetInfo *TSI = nullptr;
const TargetLowering *TLI = nullptr;
+ const TargetLibraryInfo *LibInfo = nullptr;
MachineFunction *MF;
Pass *SDAGISelPass = nullptr;
LLVMContext *Context;
CodeGenOpt::Level OptLevel;
+ DivergenceAnalysis * DA = nullptr;
+ FunctionLoweringInfo * FLI = nullptr;
+
/// The function-level optimization remark emitter. Used to emit remarks
/// whenever manipulating the DAG.
OptimizationRemarkEmitter *ORE;
@@ -248,7 +265,7 @@ class SelectionDAG {
/// Pool allocation for misc. objects that are created once per SelectionDAG.
BumpPtrAllocator Allocator;
- /// Tracks dbg_value information through SDISel.
+ /// Tracks dbg_value and dbg_label information through SDISel.
SDDbgInfo *DbgInfo;
uint16_t NextPersistentId = 0;
@@ -344,19 +361,7 @@ private:
.getRawSubclassData();
}
- void createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
- assert(!Node->OperandList && "Node already has operands");
- SDUse *Ops = OperandRecycler.allocate(
- ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
-
- for (unsigned I = 0; I != Vals.size(); ++I) {
- Ops[I].setUser(Node);
- Ops[I].setInitial(Vals[I]);
- }
- Node->NumOperands = Vals.size();
- Node->OperandList = Ops;
- checkForCycles(Node);
- }
+ void createOperands(SDNode *Node, ArrayRef<SDValue> Vals);
void removeOperands(SDNode *Node) {
if (!Node->OperandList)
@@ -367,7 +372,7 @@ private:
Node->NumOperands = 0;
Node->OperandList = nullptr;
}
-
+ void CreateTopologicalOrder(std::vector<SDNode*>& Order);
public:
explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level);
SelectionDAG(const SelectionDAG &) = delete;
@@ -376,7 +381,12 @@ public:
/// Prepare this SelectionDAG to process code in the given MachineFunction.
void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
- Pass *PassPtr);
+ Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+ DivergenceAnalysis * Divergence);
+
+ void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) {
+ FLI = FuncInfo;
+ }
/// Clear state and free memory necessary to make this
/// SelectionDAG ready to process a new block.
@@ -389,6 +399,7 @@ public:
const TargetMachine &getTarget() const { return TM; }
const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
+ const TargetLibraryInfo &getLibInfo() const { return *LibInfo; }
const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; }
LLVMContext *getContext() const {return Context; }
OptimizationRemarkEmitter &getORE() const { return *ORE; }
@@ -460,6 +471,8 @@ public:
return Root;
}
+ void VerifyDAGDiverence();
+
/// This iterates over the nodes in the SelectionDAG, folding
/// certain types of nodes together, or eliminating superfluous nodes. The
/// Level argument controls whether Combine is allowed to produce nodes and
@@ -483,7 +496,7 @@ public:
/// the graph.
void Legalize();
- /// \brief Transforms a SelectionDAG node and any operands to it into a node
+ /// Transforms a SelectionDAG node and any operands to it into a node
/// that is compatible with the target instruction selector, as indicated by
/// the TargetLowering object.
///
@@ -534,7 +547,7 @@ public:
//===--------------------------------------------------------------------===//
// Node creation methods.
- /// \brief Create a ConstantSDNode wrapping a constant value.
+ /// Create a ConstantSDNode wrapping a constant value.
/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
///
/// If only legal types can be produced, this does the necessary
@@ -567,9 +580,13 @@ public:
bool isOpaque = false) {
return getConstant(Val, DL, VT, true, isOpaque);
}
+
+ /// Create a true or false constant of type \p VT using the target's
+ /// BooleanContent for type \p OpVT.
+ SDValue getBoolConstant(bool V, const SDLoc &DL, EVT VT, EVT OpVT);
/// @}
- /// \brief Create a ConstantFPSDNode wrapping a constant value.
+ /// Create a ConstantFPSDNode wrapping a constant value.
/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
///
/// If only legal types can be produced, this does the necessary
@@ -581,7 +598,7 @@ public:
bool isTarget = false);
SDValue getConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT,
bool isTarget = false);
- SDValue getConstantFP(const ConstantFP &CF, const SDLoc &DL, EVT VT,
+ SDValue getConstantFP(const ConstantFP &V, const SDLoc &DL, EVT VT,
bool isTarget = false);
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT) {
return getConstantFP(Val, DL, VT, true);
@@ -741,7 +758,7 @@ public:
return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
}
- /// \brief Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
+ /// Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
/// the shuffle node in input but with swapped operands.
///
/// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
@@ -765,7 +782,7 @@ public:
/// Return the expression required to zero extend the Op
/// value assuming it was the smaller SrcTy value.
- SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT SrcTy);
+ SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);
/// Return an operation which will any-extend the low lanes of the operand
/// into the specified vector type. For example,
@@ -793,10 +810,10 @@ public:
/// Create a bitwise NOT operation as (XOR Val, -1).
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT);
- /// \brief Create a logical NOT operation as (XOR Val, BooleanOne).
+ /// Create a logical NOT operation as (XOR Val, BooleanOne).
SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT);
- /// \brief Create an add instruction with appropriate flags when used for
+ /// Create an add instruction with appropriate flags when used for
/// addressing some offset of an object. i.e. if a load is split into multiple
/// components, create an add nuw from the base pointer to the offset.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, int64_t Offset) {
@@ -862,17 +879,18 @@ public:
ArrayRef<SDValue> Ops, const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, ArrayRef<EVT> ResultTys,
ArrayRef<SDValue> Ops);
- SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs,
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops);
// Specialize based on number of operands.
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT);
- SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N,
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
- SDValue N2, SDValue N3);
+ SDValue N2, SDValue N3,
+ const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, SDValue N3, SDValue N4);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
@@ -880,15 +898,15 @@ public:
// Specialize again based on number of operands for nodes with a VTList
// rather than a single VT.
- SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs);
- SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N);
- SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList);
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N);
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2);
- SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2, SDValue N3);
- SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2, SDValue N3, SDValue N4);
- SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2, SDValue N3, SDValue N4, SDValue N5);
/// Compute a TokenFactor to force all the incoming stack arguments to be
@@ -910,6 +928,23 @@ public:
SDValue Size, unsigned Align, bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo);
+ SDValue getAtomicMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ unsigned DstAlign, SDValue Src, unsigned SrcAlign,
+ SDValue Size, Type *SizeTy, unsigned ElemSz,
+ bool isTailCall, MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo);
+
+ SDValue getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ unsigned DstAlign, SDValue Src, unsigned SrcAlign,
+ SDValue Size, Type *SizeTy, unsigned ElemSz,
+ bool isTailCall, MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo);
+
+ SDValue getAtomicMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ unsigned DstAlign, SDValue Value, SDValue Size,
+ Type *SizeTy, unsigned ElemSz, bool isTailCall,
+ MachinePointerInfo DstPtrInfo);
+
/// Helper function to make it easier to build SetCC's if you just
/// have an ISD::CondCode instead of an SDValue.
///
@@ -1050,12 +1085,12 @@ public:
MachineMemOperand *MMO);
SDValue
getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
- MachinePointerInfo PtrInfo, EVT TVT, unsigned Alignment = 0,
+ MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
- SDValue Ptr, EVT TVT, MachineMemOperand *MMO);
- SDValue getIndexedStore(SDValue OrigStoe, const SDLoc &dl, SDValue Base,
+ SDValue Ptr, EVT SVT, MachineMemOperand *MMO);
+ SDValue getIndexedStore(SDValue OrigStore, const SDLoc &dl, SDValue Base,
SDValue Offset, ISD::MemIndexedMode AM);
/// Returns sum of the base pointer and offset.
@@ -1121,28 +1156,31 @@ public:
SDValue Op3, SDValue Op4, SDValue Op5);
SDNode *UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops);
+ // Propagates the change in divergence to users
+ void updateDivergence(SDNode * N);
+
/// These are used for target selectors to *mutate* the
/// specified node to have the specified return type, Target opcode, and
/// operands. Note that target opcodes are stored as
/// ~TargetOpcode in the node opcode field. The resultant node is returned.
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, SDValue Op1);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT,
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT);
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1);
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
SDValue Op1, SDValue Op2);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT,
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
SDValue Op1, SDValue Op2, SDValue Op3);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT,
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
ArrayRef<SDValue> Ops);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2);
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
EVT VT2, ArrayRef<SDValue> Ops);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
EVT VT2, SDValue Op1);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
EVT VT2, SDValue Op1, SDValue Op2);
- SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, SDVTList VTs,
+ SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs,
ArrayRef<SDValue> Ops);
/// This *mutates* the specified node to have the specified
@@ -1197,7 +1235,7 @@ public:
SDValue Operand, SDValue Subreg);
/// Get the specified node if it's already available, or else return NULL.
- SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef<SDValue> Ops,
+ SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef<SDValue> Ops,
const SDNodeFlags Flags = SDNodeFlags());
/// Creates a SDDbgValue node.
@@ -1212,8 +1250,16 @@ public:
/// Creates a FrameIndex SDDbgValue node.
SDDbgValue *getFrameIndexDbgValue(DIVariable *Var, DIExpression *Expr,
- unsigned FI, const DebugLoc &DL,
- unsigned O);
+ unsigned FI, bool IsIndirect,
+ const DebugLoc &DL, unsigned O);
+
+ /// Creates a VReg SDDbgValue node.
+ SDDbgValue *getVRegDbgValue(DIVariable *Var, DIExpression *Expr,
+ unsigned VReg, bool IsIndirect,
+ const DebugLoc &DL, unsigned O);
+
+ /// Creates a SDDbgLabel node.
+ SDDbgLabel *getDbgLabel(DILabel *Label, const DebugLoc &DL, unsigned O);
/// Transfer debug values from one node to another, while optionally
/// generating fragment expressions for split-up values. If \p InvalidateDbg
@@ -1245,7 +1291,7 @@ public:
/// to be given new uses. These new uses of From are left in place, and
/// not automatically transferred to To.
///
- void ReplaceAllUsesWith(SDValue From, SDValue Op);
+ void ReplaceAllUsesWith(SDValue From, SDValue To);
void ReplaceAllUsesWith(SDNode *From, SDNode *To);
void ReplaceAllUsesWith(SDNode *From, const SDValue *To);
@@ -1296,6 +1342,9 @@ public:
/// value is produced by SD.
void AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter);
+ /// Add a dbg_label SDNode.
+ void AddDbgLabel(SDDbgLabel *DB);
+
/// Get the debug values which reference the given SDNode.
ArrayRef<SDDbgValue*> GetDbgValues(const SDNode* SD) {
return DbgInfo->getSDDbgValues(SD);
@@ -1317,6 +1366,13 @@ public:
return DbgInfo->ByvalParmDbgEnd();
}
+ SDDbgInfo::DbgLabelIterator DbgLabelBegin() {
+ return DbgInfo->DbgLabelBegin();
+ }
+ SDDbgInfo::DbgLabelIterator DbgLabelEnd() {
+ return DbgInfo->DbgLabelEnd();
+ }
+
/// To be invoked on an SDNode that is slated to be erased. This
/// function mirrors \c llvm::salvageDebugInfo.
void salvageDebugInfo(SDNode &N);
@@ -1431,8 +1487,11 @@ public:
/// Test whether the given SDValue is known to never be NaN.
bool isKnownNeverNaN(SDValue Op) const;
- /// Test whether the given SDValue is known to never be positive or negative
- /// zero.
+ /// Test whether the given floating point SDValue is known to never be
+ /// positive or negative zero.
+ bool isKnownNeverZeroFloat(SDValue Op) const;
+
+ /// Test whether the given SDValue is known to contain non-zero value(s).
bool isKnownNeverZero(SDValue Op) const;
/// Test whether two SDValues are known to compare equal. This
diff --git a/contrib/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/contrib/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index de6849a1eae1..86df0af7303f 100644
--- a/contrib/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/contrib/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -110,6 +110,11 @@ public:
CodeGenOpt::Level OptLevel,
bool IgnoreChains = false);
+ static void InvalidateNodeId(SDNode *N);
+ static int getUninvalidatedNodeId(SDNode *N);
+
+ static void EnforceNodeIdInvariant(SDNode *N);
+
// Opcodes used by the DAG state machine:
enum BuiltinOpcodes {
OPC_Scope,
@@ -199,23 +204,28 @@ protected:
/// of the new node T.
void ReplaceUses(SDValue F, SDValue T) {
CurDAG->ReplaceAllUsesOfValueWith(F, T);
+ EnforceNodeIdInvariant(T.getNode());
}
/// ReplaceUses - replace all uses of the old nodes F with the use
/// of the new nodes T.
void ReplaceUses(const SDValue *F, const SDValue *T, unsigned Num) {
CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num);
+ for (unsigned i = 0; i < Num; ++i)
+ EnforceNodeIdInvariant(T[i].getNode());
}
/// ReplaceUses - replace all uses of the old node F with the use
/// of the new node T.
void ReplaceUses(SDNode *F, SDNode *T) {
CurDAG->ReplaceAllUsesWith(F, T);
+ EnforceNodeIdInvariant(T);
}
/// Replace all uses of \c F with \c T, then remove \c F from the DAG.
void ReplaceNode(SDNode *F, SDNode *T) {
CurDAG->ReplaceAllUsesWith(F, T);
+ EnforceNodeIdInvariant(T);
CurDAG->RemoveDeadNode(F);
}
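A hedged fragment of what a target's Select() might do with this pattern; the machine opcode MyTarget::ADDri and the operand shape are invented for illustration, and getMachineNode is the usual SelectionDAG helper for creating target nodes:

// Build the selected machine node, then splice it in. ReplaceNode also
// re-establishes the NodeId invariant on the replacement node.
SDNode *New = CurDAG->getMachineNode(MyTarget::ADDri, SDLoc(N),
                                     N->getValueType(0),
                                     N->getOperand(0), N->getOperand(1));
ReplaceNode(N, New);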
@@ -270,7 +280,7 @@ public:
void SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
unsigned TableSize);
- /// \brief Return true if complex patterns for this target can mutate the
+ /// Return true if complex patterns for this target can mutate the
/// DAG.
virtual bool ComplexPatternFuncMutatesDAG() const {
return false;
@@ -282,14 +292,14 @@ private:
// Calls to these functions are generated by tblgen.
void Select_INLINEASM(SDNode *N);
- void Select_READ_REGISTER(SDNode *N);
- void Select_WRITE_REGISTER(SDNode *N);
+ void Select_READ_REGISTER(SDNode *Op);
+ void Select_WRITE_REGISTER(SDNode *Op);
void Select_UNDEF(SDNode *N);
void CannotYetSelect(SDNode *N);
private:
void DoInstructionSelection();
- SDNode *MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTs,
+ SDNode *MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
ArrayRef<SDValue> Ops, unsigned EmitNodeInfo);
SDNode *MutateStrictFPToFP(SDNode *Node, unsigned NewOpc);
@@ -299,10 +309,10 @@ private:
/// instruction selected, false if no code should be emitted for it.
bool PrepareEHLandingPad();
- /// \brief Perform instruction selection on all basic blocks in the function.
+ /// Perform instruction selection on all basic blocks in the function.
void SelectAllBasicBlocks(const Function &Fn);
- /// \brief Perform instruction selection on a single basic block, for
+ /// Perform instruction selection on a single basic block, for
/// instructions between \p Begin and \p End. \p HadTailCall will be set
/// to true if a call in the block was translated as a tail call.
void SelectBasicBlock(BasicBlock::const_iterator Begin,
@@ -312,7 +322,7 @@ private:
void CodeGenAndEmitDAG();
- /// \brief Generate instructions for lowering the incoming arguments of the
+ /// Generate instructions for lowering the incoming arguments of the
/// given function.
void LowerArguments(const Function &F);
diff --git a/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 522c2f1b2cb2..1af22185d366 100644
--- a/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/contrib/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -31,17 +31,18 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
#include "llvm/Support/AlignOf.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <climits>
@@ -189,8 +190,10 @@ public:
inline bool isUndef() const;
inline unsigned getMachineOpcode() const;
inline const DebugLoc &getDebugLoc() const;
- inline void dump(const SelectionDAG *G = nullptr) const;
- inline void dumpr(const SelectionDAG *G = nullptr) const;
+ inline void dump() const;
+ inline void dump(const SelectionDAG *G) const;
+ inline void dumpr() const;
+ inline void dumpr(const SelectionDAG *G) const;
/// Return true if this operand (which must be a chain) reaches the
/// specified operand without crossing any side-effecting instructions.
@@ -357,21 +360,34 @@ private:
bool NoUnsignedWrap : 1;
bool NoSignedWrap : 1;
bool Exact : 1;
- bool UnsafeAlgebra : 1;
bool NoNaNs : 1;
bool NoInfs : 1;
bool NoSignedZeros : 1;
bool AllowReciprocal : 1;
bool VectorReduction : 1;
bool AllowContract : 1;
+ bool ApproximateFuncs : 1;
+ bool AllowReassociation : 1;
public:
/// Default constructor turns off all optimization flags.
SDNodeFlags()
: AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false),
- Exact(false), UnsafeAlgebra(false), NoNaNs(false), NoInfs(false),
+ Exact(false), NoNaNs(false), NoInfs(false),
NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false),
- AllowContract(false) {}
+ AllowContract(false), ApproximateFuncs(false),
+ AllowReassociation(false) {}
+
+ /// Propagate the fast-math-flags from an IR FPMathOperator.
+ void copyFMF(const FPMathOperator &FPMO) {
+ setNoNaNs(FPMO.hasNoNaNs());
+ setNoInfs(FPMO.hasNoInfs());
+ setNoSignedZeros(FPMO.hasNoSignedZeros());
+ setAllowReciprocal(FPMO.hasAllowReciprocal());
+ setAllowContract(FPMO.hasAllowContract());
+ setApproximateFuncs(FPMO.hasApproxFunc());
+ setAllowReassociation(FPMO.hasAllowReassoc());
+ }
/// Sets the state of the flags to the defined state.
void setDefined() { AnyDefined = true; }
@@ -391,10 +407,6 @@ public:
setDefined();
Exact = b;
}
- void setUnsafeAlgebra(bool b) {
- setDefined();
- UnsafeAlgebra = b;
- }
void setNoNaNs(bool b) {
setDefined();
NoNaNs = b;
@@ -419,18 +431,32 @@ public:
setDefined();
AllowContract = b;
}
+ void setApproximateFuncs(bool b) {
+ setDefined();
+ ApproximateFuncs = b;
+ }
+ void setAllowReassociation(bool b) {
+ setDefined();
+ AllowReassociation = b;
+ }
// These are accessors for each flag.
bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
bool hasNoSignedWrap() const { return NoSignedWrap; }
bool hasExact() const { return Exact; }
- bool hasUnsafeAlgebra() const { return UnsafeAlgebra; }
bool hasNoNaNs() const { return NoNaNs; }
bool hasNoInfs() const { return NoInfs; }
bool hasNoSignedZeros() const { return NoSignedZeros; }
bool hasAllowReciprocal() const { return AllowReciprocal; }
bool hasVectorReduction() const { return VectorReduction; }
bool hasAllowContract() const { return AllowContract; }
+ bool hasApproximateFuncs() const { return ApproximateFuncs; }
+ bool hasAllowReassociation() const { return AllowReassociation; }
+
+ bool isFast() const {
+ return NoSignedZeros && AllowReciprocal && NoNaNs && NoInfs &&
+ AllowContract && ApproximateFuncs && AllowReassociation;
+ }
/// Clear any flags in this flag set that aren't also set in Flags.
/// If the given Flags are undefined then don't do anything.
@@ -440,13 +466,14 @@ public:
NoUnsignedWrap &= Flags.NoUnsignedWrap;
NoSignedWrap &= Flags.NoSignedWrap;
Exact &= Flags.Exact;
- UnsafeAlgebra &= Flags.UnsafeAlgebra;
NoNaNs &= Flags.NoNaNs;
NoInfs &= Flags.NoInfs;
NoSignedZeros &= Flags.NoSignedZeros;
AllowReciprocal &= Flags.AllowReciprocal;
VectorReduction &= Flags.VectorReduction;
AllowContract &= Flags.AllowContract;
+ ApproximateFuncs &= Flags.ApproximateFuncs;
+ AllowReassociation &= Flags.AllowReassociation;
}
};
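A minimal sketch of how a lowering routine might use the new copyFMF() hook when creating a node; the IR instruction I, the DAG reference, and the operands are assumptions:

// Mirror the IR-level fast-math flags onto the DAG node being built.
SDNodeFlags Flags;
if (const auto *FPOp = dyn_cast<FPMathOperator>(&I))
  Flags.copyFMF(*FPOp);
SDValue Res = DAG.getNode(ISD::FADD, DL, VT, LHS, RHS, Flags);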
@@ -466,11 +493,13 @@ protected:
friend class SDNode;
friend class MemIntrinsicSDNode;
friend class MemSDNode;
+ friend class SelectionDAG;
uint16_t HasDebugValue : 1;
uint16_t IsMemIntrinsic : 1;
+ uint16_t IsDivergent : 1;
};
- enum { NumSDNodeBits = 2 };
+ enum { NumSDNodeBits = 3 };
class ConstantSDNodeBitfields {
friend class ConstantSDNode;
@@ -540,7 +569,7 @@ protected:
static_assert(sizeof(ConstantSDNodeBitfields) <= 2, "field too wide");
static_assert(sizeof(MemSDNodeBitfields) <= 2, "field too wide");
static_assert(sizeof(LSBaseSDNodeBitfields) <= 2, "field too wide");
- static_assert(sizeof(LoadSDNodeBitfields) <= 4, "field too wide");
+ static_assert(sizeof(LoadSDNodeBitfields) <= 2, "field too wide");
static_assert(sizeof(StoreSDNodeBitfields) <= 2, "field too wide");
private:
@@ -662,6 +691,8 @@ public:
bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; }
void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; }
+ bool isDivergent() const { return SDNodeBits.IsDivergent; }
+
/// Return true if there are no uses of this node.
bool use_empty() const { return UseList == nullptr; }
@@ -796,16 +827,44 @@ public:
/// searches to be performed in parallel, caching of results across
/// queries and incremental addition to Worklist. Stops early if N is
/// found but will resume. Remember to clear Visited and Worklists
- /// if DAG changes.
+ /// if DAG changes. MaxSteps gives a maximum number of nodes to visit before
+ /// giving up. The TopologicalPrune flag signals that positive NodeIds are
+ /// topologically ordered (Operands have strictly smaller node id) and search
+ /// can be pruned leveraging this.
static bool hasPredecessorHelper(const SDNode *N,
SmallPtrSetImpl<const SDNode *> &Visited,
SmallVectorImpl<const SDNode *> &Worklist,
- unsigned int MaxSteps = 0) {
+ unsigned int MaxSteps = 0,
+ bool TopologicalPrune = false) {
+ SmallVector<const SDNode *, 8> DeferredNodes;
if (Visited.count(N))
return true;
+
+ // Node Ids are assigned in three places: as a topological
+ // ordering (> 0), during legalization (values set to 0), and for
+ // new nodes (set to -1). If N has a topological id then we know
+ // that all nodes with smaller ids cannot be successors and need
+ // not be checked. Filter out all nodes that cannot be matches; we
+ // add them back to the worklist before exiting in case of
+ // multiple calls. Note that during selection the topological id
+ // may be violated if a node's predecessor is selected before it.
+ // We mark this at selection by negating the id of unselected
+ // successors and restrict topological pruning to positive ids.
+
+ int NId = N->getNodeId();
+ // If we Invalidated the Id, reconstruct original NId.
+ if (NId < -1)
+ NId = -(NId + 1);
+
+ bool Found = false;
while (!Worklist.empty()) {
const SDNode *M = Worklist.pop_back_val();
- bool Found = false;
+ int MId = M->getNodeId();
+ if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) &&
+ (MId > 0) && (MId < NId)) {
+ DeferredNodes.push_back(M);
+ continue;
+ }
for (const SDValue &OpV : M->op_values()) {
SDNode *Op = OpV.getNode();
if (Visited.insert(Op).second)
@@ -814,11 +873,16 @@ public:
Found = true;
}
if (Found)
- return true;
+ break;
if (MaxSteps != 0 && Visited.size() >= MaxSteps)
- return false;
+ break;
}
- return false;
+ // Push deferred nodes back on worklist.
+ Worklist.append(DeferredNodes.begin(), DeferredNodes.end());
+ // If we bailed out early, conservatively report the predecessor as found.
+ if (MaxSteps != 0 && Visited.size() >= MaxSteps)
+ return true;
+ return Found;
}
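A hedged sketch of a typical call site for the extended helper; Candidate and N are assumptions. Note that with a nonzero MaxSteps the answer is conservative (bailing out reports "found"), and TopologicalPrune is only safe when positive NodeIds are topologically ordered:

SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Worklist.push_back(Candidate.getNode());
if (SDNode::hasPredecessorHelper(N, Visited, Worklist, /*MaxSteps=*/8192,
                                 /*TopologicalPrune=*/true))
  return SDValue();  // folding here could create a cycle; bail out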
/// Return true if all the users of N are contained in Nodes.
@@ -884,6 +948,7 @@ public:
const SDNodeFlags getFlags() const { return Flags; }
void setFlags(SDNodeFlags NewFlags) { Flags = NewFlags; }
+ bool isFast() { return Flags.isFast(); }
/// Clear any flags in this node that aren't also set in Flags.
/// If Flags is not in a defined state then this has no effect.
@@ -1089,10 +1154,18 @@ inline const DebugLoc &SDValue::getDebugLoc() const {
return Node->getDebugLoc();
}
+inline void SDValue::dump() const {
+ return Node->dump();
+}
+
inline void SDValue::dump(const SelectionDAG *G) const {
return Node->dump(G);
}
+inline void SDValue::dumpr() const {
+ return Node->dumpr();
+}
+
inline void SDValue::dumpr(const SelectionDAG *G) const {
return Node->dumpr(G);
}
@@ -1173,7 +1246,7 @@ protected:
public:
MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl, SDVTList VTs,
- EVT MemoryVT, MachineMemOperand *MMO);
+ EVT memvt, MachineMemOperand *MMO);
bool readMem() const { return MMO->isLoad(); }
bool writeMem() const { return MMO->isStore(); }
@@ -1190,7 +1263,8 @@ public:
/// encoding of the volatile flag, as well as bits used by subclasses. This
/// function should only be used to compute a FoldingSetNodeID value.
/// The HasDebugValue bit is masked out because CSE map needs to match
- /// nodes with debug info with nodes without debug info.
+ /// nodes with debug info with nodes without debug info. The same applies to
+ /// the isDivergent bit.
unsigned getRawSubclassData() const {
uint16_t Data;
union {
@@ -1199,6 +1273,7 @@ public:
};
memcpy(&RawSDNodeBits, &this->RawSDNodeBits, sizeof(this->RawSDNodeBits));
SDNodeBits.HasDebugValue = 0;
+ SDNodeBits.IsDivergent = false;
memcpy(&Data, &RawSDNodeBits, sizeof(RawSDNodeBits));
return Data;
}
@@ -1267,6 +1342,7 @@ public:
N->getOpcode() == ISD::ATOMIC_LOAD_ADD ||
N->getOpcode() == ISD::ATOMIC_LOAD_SUB ||
N->getOpcode() == ISD::ATOMIC_LOAD_AND ||
+ N->getOpcode() == ISD::ATOMIC_LOAD_CLR ||
N->getOpcode() == ISD::ATOMIC_LOAD_OR ||
N->getOpcode() == ISD::ATOMIC_LOAD_XOR ||
N->getOpcode() == ISD::ATOMIC_LOAD_NAND ||
@@ -1318,6 +1394,7 @@ public:
N->getOpcode() == ISD::ATOMIC_LOAD_ADD ||
N->getOpcode() == ISD::ATOMIC_LOAD_SUB ||
N->getOpcode() == ISD::ATOMIC_LOAD_AND ||
+ N->getOpcode() == ISD::ATOMIC_LOAD_CLR ||
N->getOpcode() == ISD::ATOMIC_LOAD_OR ||
N->getOpcode() == ISD::ATOMIC_LOAD_XOR ||
N->getOpcode() == ISD::ATOMIC_LOAD_NAND ||
@@ -1421,9 +1498,8 @@ class ConstantSDNode : public SDNode {
const ConstantInt *Value;
- ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val,
- const DebugLoc &DL, EVT VT)
- : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DL,
+ ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val, EVT VT)
+ : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DebugLoc(),
getSDVTList(VT)),
Value(val) {
ConstantSDNodeBits.IsOpaque = isOpaque;
@@ -1459,10 +1535,9 @@ class ConstantFPSDNode : public SDNode {
const ConstantFP *Value;
- ConstantFPSDNode(bool isTarget, const ConstantFP *val, const DebugLoc &DL,
- EVT VT)
- : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0, DL,
- getSDVTList(VT)),
+ ConstantFPSDNode(bool isTarget, const ConstantFP *val, EVT VT)
+ : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0,
+ DebugLoc(), getSDVTList(VT)),
Value(val) {}
public:
@@ -1519,10 +1594,10 @@ bool isOneConstant(SDValue V);
bool isBitwiseNot(SDValue V);
/// Returns the SDNode if it is a constant splat BuildVector or constant int.
-ConstantSDNode *isConstOrConstSplat(SDValue V);
+ConstantSDNode *isConstOrConstSplat(SDValue N);
/// Returns the SDNode if it is a constant splat BuildVector or constant float.
-ConstantFPSDNode *isConstOrConstSplatFP(SDValue V);
+ConstantFPSDNode *isConstOrConstSplatFP(SDValue N);
class GlobalAddressSDNode : public SDNode {
friend class SelectionDAG;
@@ -1533,7 +1608,7 @@ class GlobalAddressSDNode : public SDNode {
GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL,
const GlobalValue *GA, EVT VT, int64_t o,
- unsigned char TargetFlags);
+ unsigned char TF);
public:
const GlobalValue *getGlobal() const { return TheGlobal; }
@@ -1714,13 +1789,13 @@ public:
unsigned MinSplatBits = 0,
bool isBigEndian = false) const;
- /// \brief Returns the splatted value or a null value if this is not a splat.
+ /// Returns the splatted value or a null value if this is not a splat.
///
/// If passed a non-null UndefElements bitvector, it will resize it to match
/// the vector width and set the bits where elements are undef.
SDValue getSplatValue(BitVector *UndefElements = nullptr) const;
- /// \brief Returns the splatted constant or null if this is not a constant
+ /// Returns the splatted constant or null if this is not a constant
/// splat.
///
/// If passed a non-null UndefElements bitvector, it will resize it to match
@@ -1728,7 +1803,7 @@ public:
ConstantSDNode *
getConstantSplatNode(BitVector *UndefElements = nullptr) const;
- /// \brief Returns the splatted constant FP or null if this is not a constant
+ /// Returns the splatted constant FP or null if this is not a constant
/// FP splat.
///
/// If passed a non-null UndefElements bitvector, it will resize it to match
@@ -1736,7 +1811,7 @@ public:
ConstantFPSDNode *
getConstantFPSplatNode(BitVector *UndefElements = nullptr) const;
- /// \brief If this is a constant FP splat and the splatted constant FP is an
+ /// If this is a constant FP splat and the splatted constant FP is an
/// exact power or 2, return the log base 2 integer value. Otherwise,
/// return -1.
///
@@ -2120,13 +2195,14 @@ public:
: MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {}
// In the both nodes address is Op1, mask is Op2:
- // MaskedGatherSDNode (Chain, src0, mask, base, index), src0 is a passthru value
- // MaskedScatterSDNode (Chain, value, mask, base, index)
+ // MaskedGatherSDNode (Chain, passthru, mask, base, index, scale)
+ // MaskedScatterSDNode (Chain, value, mask, base, index, scale)
// Mask is a vector of i1 elements
const SDValue &getBasePtr() const { return getOperand(3); }
const SDValue &getIndex() const { return getOperand(4); }
const SDValue &getMask() const { return getOperand(2); }
const SDValue &getValue() const { return getOperand(1); }
+ const SDValue &getScale() const { return getOperand(5); }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::MGATHER ||
@@ -2329,6 +2405,17 @@ namespace ISD {
cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
}
+ /// Attempt to match a unary predicate against a scalar/splat constant or
+ /// every element of a constant BUILD_VECTOR.
+ bool matchUnaryPredicate(SDValue Op,
+ std::function<bool(ConstantSDNode *)> Match);
+
+ /// Attempt to match a binary predicate against a pair of scalar/splat
+ /// constants or every element of a pair of constant BUILD_VECTORs.
+ bool matchBinaryPredicate(
+ SDValue LHS, SDValue RHS,
+ std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match);
+
} // end namespace ISD
} // end namespace llvm
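A short, hypothetical example of the new ISD::matchUnaryPredicate helper, checking a shift amount that may be a scalar constant or a constant BUILD_VECTOR; ShAmt and VT are assumptions:

// Accept the fold only if every constant shift amount is in range for the type.
auto IsInRange = [&](ConstantSDNode *C) {
  return C->getAPIntValue().ult(VT.getScalarSizeInBits());
};
if (ISD::matchUnaryPredicate(ShAmt, IsInRange)) {
  // safe to fold
}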
diff --git a/contrib/llvm/include/llvm/CodeGen/SlotIndexes.h b/contrib/llvm/include/llvm/CodeGen/SlotIndexes.h
index 3a91e363f923..334267d9828b 100644
--- a/contrib/llvm/include/llvm/CodeGen/SlotIndexes.h
+++ b/contrib/llvm/include/llvm/CodeGen/SlotIndexes.h
@@ -578,9 +578,9 @@ class raw_ostream;
assert(!MI.isInsideBundle() &&
"Instructions inside bundles should use bundle start's slot.");
assert(mi2iMap.find(&MI) == mi2iMap.end() && "Instr already indexed.");
- // Numbering DBG_VALUE instructions could cause code generation to be
+ // Numbering debug instructions could cause code generation to be
// affected by debug information.
- assert(!MI.isDebugValue() && "Cannot number DBG_VALUE instructions.");
+ assert(!MI.isDebugInstr() && "Cannot number debug instructions.");
assert(MI.getParent() != nullptr && "Instr must be added to function.");
@@ -674,10 +674,10 @@ class raw_ostream;
idx2MBBMap.push_back(IdxMBBPair(startIdx, mbb));
renumberIndexes(newItr);
- std::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare());
+ llvm::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare());
}
- /// \brief Free the resources that were required to maintain a SlotIndex.
+ /// Free the resources that were required to maintain a SlotIndex.
///
/// Once an index is no longer needed (for instance because the instruction
/// at that index has been moved), the resources required to maintain the
diff --git a/contrib/llvm/include/llvm/CodeGen/StackMaps.h b/contrib/llvm/include/llvm/CodeGen/StackMaps.h
index 4407114d2741..3c9850265737 100644
--- a/contrib/llvm/include/llvm/CodeGen/StackMaps.h
+++ b/contrib/llvm/include/llvm/CodeGen/StackMaps.h
@@ -29,7 +29,7 @@ class MCStreamer;
class raw_ostream;
class TargetRegisterInfo;
-/// \brief MI-level stackmap operands.
+/// MI-level stackmap operands.
///
/// MI stackmap operations take the form:
/// <id>, <numBytes>, live args...
@@ -60,7 +60,7 @@ public:
}
};
-/// \brief MI-level patchpoint operands.
+/// MI-level patchpoint operands.
///
/// MI patchpoint operations take the form:
/// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
@@ -137,7 +137,7 @@ public:
return getVarIdx();
}
- /// \brief Get the next scratch register operand index.
+ /// Get the next scratch register operand index.
unsigned getNextScratchIdx(unsigned StartIdx = 0) const;
};
@@ -236,15 +236,15 @@ public:
FnInfos.clear();
}
- /// \brief Generate a stackmap record for a stackmap instruction.
+ /// Generate a stackmap record for a stackmap instruction.
///
/// MI must be a raw STACKMAP, not a PATCHPOINT.
void recordStackMap(const MachineInstr &MI);
- /// \brief Generate a stackmap record for a patchpoint instruction.
+ /// Generate a stackmap record for a patchpoint instruction.
void recordPatchPoint(const MachineInstr &MI);
- /// \brief Generate a stackmap record for a statepoint instruction.
+ /// Generate a stackmap record for a statepoint instruction.
void recordStatepoint(const MachineInstr &MI);
/// If there is any stack map data, create a stack map section and serialize
@@ -293,11 +293,11 @@ private:
MachineInstr::const_mop_iterator MOE, LocationVec &Locs,
LiveOutVec &LiveOuts) const;
- /// \brief Create a live-out register record for the given register @p Reg.
+ /// Create a live-out register record for the given register @p Reg.
LiveOutReg createLiveOutReg(unsigned Reg,
const TargetRegisterInfo *TRI) const;
- /// \brief Parse the register live-out mask and return a vector of live-out
+ /// Parse the register live-out mask and return a vector of live-out
/// registers that need to be recorded in the stackmap.
LiveOutVec parseRegisterLiveOutMask(const uint32_t *Mask) const;
@@ -311,16 +311,16 @@ private:
MachineInstr::const_mop_iterator MOE,
bool recordResult = false);
- /// \brief Emit the stackmap header.
+ /// Emit the stackmap header.
void emitStackmapHeader(MCStreamer &OS);
- /// \brief Emit the function frame record for each function.
+ /// Emit the function frame record for each function.
void emitFunctionFrameRecords(MCStreamer &OS);
- /// \brief Emit the constant pool.
+ /// Emit the constant pool.
void emitConstantPoolEntries(MCStreamer &OS);
- /// \brief Emit the callsite info for each stackmap/patchpoint intrinsic call.
+ /// Emit the callsite info for each stackmap/patchpoint intrinsic call.
void emitCallsiteEntries(MCStreamer &OS);
void print(raw_ostream &OS);
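A hedged sketch of how the patchpoint operand layout described above can be consumed; only accessors visible in this header are used, and the helper itself is illustrative rather than part of the patch.

// Count the live values recorded for a PATCHPOINT call site.
#include <cassert>
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

static unsigned countPatchPointLiveValues(const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::PATCHPOINT && "expected PATCHPOINT");
  PatchPointOpers Opers(&MI);
  // Everything from getVarIdx() onwards is a live value that ends up in the
  // stack map record for this call site.
  return MI.getNumOperands() - Opers.getVarIdx();
}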
diff --git a/contrib/llvm/include/llvm/CodeGen/StackProtector.h b/contrib/llvm/include/llvm/CodeGen/StackProtector.h
index 72de212d0df9..a506ac636a17 100644
--- a/contrib/llvm/include/llvm/CodeGen/StackProtector.h
+++ b/contrib/llvm/include/llvm/CodeGen/StackProtector.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
@@ -35,24 +36,11 @@ class TargetMachine;
class Type;
class StackProtector : public FunctionPass {
-public:
- /// SSPLayoutKind. Stack Smashing Protection (SSP) rules require that
- /// vulnerable stack allocations are located close the stack protector.
- enum SSPLayoutKind {
- SSPLK_None, ///< Did not trigger a stack protector. No effect on data
- ///< layout.
- SSPLK_LargeArray, ///< Array or nested array >= SSP-buffer-size. Closest
- ///< to the stack protector.
- SSPLK_SmallArray, ///< Array or nested array < SSP-buffer-size. 2nd closest
- ///< to the stack protector.
- SSPLK_AddrOf ///< The address of this allocation is exposed and
- ///< triggered protection. 3rd closest to the protector.
- };
-
+private:
/// A mapping of AllocaInsts to their required SSP layout.
- using SSPLayoutMap = ValueMap<const AllocaInst *, SSPLayoutKind>;
+ using SSPLayoutMap = DenseMap<const AllocaInst *,
+ MachineFrameInfo::SSPLayoutKind>;
-private:
const TargetMachine *TM = nullptr;
/// TLI - Keep a pointer of a TargetLowering to consult for determining
@@ -70,7 +58,7 @@ private:
/// AllocaInst triggers a stack protector.
SSPLayoutMap Layout;
- /// \brief The minimum size of buffers that will receive stack smashing
+ /// The minimum size of buffers that will receive stack smashing
/// protection when -fstack-protection is used.
unsigned SSPBufferSize = 0;
@@ -107,7 +95,7 @@ private:
bool ContainsProtectableArray(Type *Ty, bool &IsLarge, bool Strong = false,
bool InStruct = false) const;
- /// \brief Check whether a stack allocation has its address taken.
+ /// Check whether a stack allocation has its address taken.
bool HasAddressTaken(const Instruction *AI);
/// RequiresStackProtector - Check whether or not this function needs a
@@ -123,14 +111,12 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
- SSPLayoutKind getSSPLayout(const AllocaInst *AI) const;
-
// Return true if StackProtector is supposed to be handled by SelectionDAG.
bool shouldEmitSDCheck(const BasicBlock &BB) const;
- void adjustForColoring(const AllocaInst *From, const AllocaInst *To);
-
bool runOnFunction(Function &Fn) override;
+
+ void copyToMachineFrameInfo(MachineFrameInfo &MFI) const;
};
} // end namespace llvm
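With the SSP layout kinds now living on MachineFrameInfo, late passes can query the frame info directly once copyToMachineFrameInfo() has run. A hedged sketch follows; getObjectSSPLayout() is assumed to be the corresponding MachineFrameInfo accessor, and the helper is illustrative, not part of the patch.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

// True if the given frame object was classified as needing protection.
static bool frameObjectNeedsProtection(const MachineFunction &MF, int FI) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getObjectSSPLayout(FI) != MachineFrameInfo::SSPLK_None;
}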
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetCallingConv.h b/contrib/llvm/include/llvm/CodeGen/TargetCallingConv.h
index 8646a15599cb..7d138f585171 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -14,8 +14,8 @@
#ifndef LLVM_CODEGEN_TARGETCALLINGCONV_H
#define LLVM_CODEGEN_TARGETCALLINGCONV_H
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <climits>
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/contrib/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 61f1cf07bcf2..f8effee998e3 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -158,6 +158,10 @@ public:
return false;
}
+ /// Returns true if the target can safely skip saving callee-saved registers
+ /// for noreturn nounwind functions.
+ virtual bool enableCalleeSaveSkip(const MachineFunction &MF) const;
+
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
virtual void emitPrologue(MachineFunction &MF,
@@ -341,6 +345,14 @@ public:
return false;
return true;
}
+
+ /// Return initial CFA offset value i.e. the one valid at the beginning of the
+ /// function (before any stack operations).
+ virtual int getInitialCFAOffset(const MachineFunction &MF) const;
+
+ /// Return initial CFA register value i.e. the one valid at the beginning of
+ /// the function (before any stack operations).
+ virtual unsigned getInitialCFARegister(const MachineFunction &MF) const;
};
} // End llvm namespace
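A hedged usage sketch for the two new CFA hooks added above: the pair describes the CFA as "register + offset" at function entry, before the prologue has touched the stack. The helper name is illustrative and not part of the patch.

#include <utility>
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"

using namespace llvm;

// Return the (register, offset) pair describing the entry-state CFA.
static std::pair<unsigned, int> getEntryCFA(const MachineFunction &MF) {
  const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
  return {TFL->getInitialCFARegister(MF), TFL->getInitialCFAOffset(MF)};
}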
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 57dee3bb44b3..b5bc561d834c 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -18,12 +18,14 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/None.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOutliner.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/BranchProbability.h"
@@ -79,7 +81,7 @@ public:
/// Given a machine instruction descriptor, returns the register
/// class constraint for OpNum, or NULL.
- const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
+ const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
const TargetRegisterInfo *TRI,
const MachineFunction &MF) const;
@@ -225,6 +227,17 @@ public:
return 0;
}
+ /// Optional extension of isLoadFromStackSlot that returns the number of
+ /// bytes loaded from the stack. This must be implemented if a backend
+ /// supports partial stack slot spills/loads to further disambiguate
+ /// what the load does.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ MemBytes = 0;
+ return isLoadFromStackSlot(MI, FrameIndex);
+ }
+
/// Check for post-frame ptr elimination stack locations as well.
/// This uses a heuristic so it isn't reliable for correctness.
virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
@@ -252,6 +265,17 @@ public:
return 0;
}
+ /// Optional extension of isStoreToStackSlot that returns the number of
+ /// bytes stored to the stack. This must be implemented if a backend
+ /// supports partial stack slot spills/loads to further disambiguate
+ /// what the store does.
+ virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ MemBytes = 0;
+ return isStoreToStackSlot(MI, FrameIndex);
+ }
+
/// Check for post-frame ptr elimination stack locations as well.
/// This uses a heuristic, so it isn't reliable for correctness.
virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
@@ -325,7 +349,7 @@ public:
unsigned SubIdx, const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const;
- /// \brief Clones instruction or the whole instruction bundle \p Orig and
+ /// Clones instruction or the whole instruction bundle \p Orig and
/// insert into \p MBB before \p InsertBefore. The target may update operands
/// that are required to be unique.
///
@@ -635,8 +659,8 @@ public:
return true;
}
- /// Generate code to reduce the loop iteration by one and check if the loop is
- /// finished. Return the value/register of the the new loop count. We need
+ /// Generate code to reduce the loop iteration by one and check if the loop
+ /// is finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar,
@@ -822,6 +846,15 @@ public:
llvm_unreachable("Target didn't implement TargetInstrInfo::copyPhysReg!");
}
+ /// If the specified machine instruction is an instruction that moves/copies
+ /// a value from one register to another, return true along with the
+ /// \p Source and \p Destination machine operands.
+ virtual bool isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Source,
+ const MachineOperand *&Destination) const {
+ return false;
+ }
+
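A hedged caller sketch for the new isCopyInstr() hook above (not part of the patch; the helper name is illustrative): collect simple register-to-register moves in a block.

#include <utility>
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

static void collectCopies(const MachineBasicBlock &MBB,
                          const TargetInstrInfo &TII,
                          SmallVectorImpl<std::pair<unsigned, unsigned>> &Copies) {
  for (const MachineInstr &MI : MBB) {
    const MachineOperand *Src = nullptr, *Dst = nullptr;
    if (TII.isCopyInstr(MI, Src, Dst) && Src->isReg() && Dst->isReg())
      Copies.emplace_back(Dst->getReg(), Src->getReg()); // (dest, source)
  }
}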
/// Store the specified register of the given register class to the specified
/// stack frame index. The store instruction is to be added to the given
/// machine basic block before the specified machine instruction. If isKill
@@ -876,7 +909,7 @@ public:
/// The new instruction is inserted before MI, and the client is responsible
/// for removing the old instruction.
MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
- int FrameIndex,
+ int FI,
LiveIntervals *LIS = nullptr) const;
/// Same as the previous version except it allows folding of any load and
@@ -928,13 +961,13 @@ public:
/// \param InsInstrs - Vector of new instructions that implement P
/// \param DelInstrs - Old instructions, including Root, that could be
/// replaced by InsInstr
- /// \param InstrIdxForVirtReg - map of virtual register to instruction in
+ /// \param InstIdxForVirtReg - map of virtual register to instruction in
/// InsInstr that defines it
virtual void genAlternativeCodeSequence(
MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
+ DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const;
/// Attempt to reassociate \P Root and \P Prev according to \P Pattern to
/// reduce critical path length.
@@ -983,7 +1016,7 @@ protected:
return nullptr;
}
- /// \brief Target-dependent implementation of getRegSequenceInputs.
+ /// Target-dependent implementation of getRegSequenceInputs.
///
/// \returns true if it is possible to build the equivalent
/// REG_SEQUENCE inputs with the pair \p MI, \p DefIdx. False otherwise.
@@ -997,7 +1030,7 @@ protected:
return false;
}
- /// \brief Target-dependent implementation of getExtractSubregInputs.
+ /// Target-dependent implementation of getExtractSubregInputs.
///
/// \returns true if it is possible to build the equivalent
/// EXTRACT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
@@ -1011,7 +1044,7 @@ protected:
return false;
}
- /// \brief Target-dependent implementation of getInsertSubregInputs.
+ /// Target-dependent implementation of getInsertSubregInputs.
///
/// \returns true if it is possible to build the equivalent
/// INSERT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
@@ -1433,7 +1466,7 @@ public:
return 0;
}
- /// \brief Return the minimum clearance before an instruction that reads an
+ /// Return the minimum clearance before an instruction that reads an
/// unused register.
///
/// For example, AVX instructions may copy part of a register operand into
@@ -1500,7 +1533,7 @@ public:
return false;
}
- /// \brief Return the value to use for the MachineCSE's LookAheadLimit,
+ /// Return the value to use for the MachineCSE's LookAheadLimit,
/// which is a heuristic used for CSE'ing phys reg defs.
virtual unsigned getMachineCSELookAheadLimit() const {
// The default lookahead is small to prevent unprofitable quadratic
@@ -1569,64 +1602,32 @@ public:
return false;
}
- /// \brief Describes the number of instructions that it will take to call and
- /// construct a frame for a given outlining candidate.
- struct MachineOutlinerInfo {
- /// Number of instructions to call an outlined function for this candidate.
- unsigned CallOverhead;
-
- /// \brief Number of instructions to construct an outlined function frame
- /// for this candidate.
- unsigned FrameOverhead;
-
- /// \brief Represents the specific instructions that must be emitted to
- /// construct a call to this candidate.
- unsigned CallConstructionID;
-
- /// \brief Represents the specific instructions that must be emitted to
- /// construct a frame for this candidate's outlined function.
- unsigned FrameConstructionID;
-
- MachineOutlinerInfo() {}
- MachineOutlinerInfo(unsigned CallOverhead, unsigned FrameOverhead,
- unsigned CallConstructionID,
- unsigned FrameConstructionID)
- : CallOverhead(CallOverhead), FrameOverhead(FrameOverhead),
- CallConstructionID(CallConstructionID),
- FrameConstructionID(FrameConstructionID) {}
- };
-
- /// \brief Returns a \p MachineOutlinerInfo struct containing target-specific
+ /// Returns a \p outliner::OutlinedFunction struct containing target-specific
/// information for a set of outlining candidates.
- virtual MachineOutlinerInfo getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const {
+ virtual outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
llvm_unreachable(
- "Target didn't implement TargetInstrInfo::getOutliningOverhead!");
+ "Target didn't implement TargetInstrInfo::getOutliningCandidateInfo!");
}
- /// Represents how an instruction should be mapped by the outliner.
- /// \p Legal instructions are those which are safe to outline.
- /// \p Illegal instructions are those which cannot be outlined.
- /// \p Invisible instructions are instructions which can be outlined, but
- /// shouldn't actually impact the outlining result.
- enum MachineOutlinerInstrType { Legal, Illegal, Invisible };
-
/// Returns how or if \p MI should be outlined.
- virtual MachineOutlinerInstrType getOutliningType(MachineInstr &MI) const {
+ virtual outliner::InstrType
+ getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const {
llvm_unreachable(
"Target didn't implement TargetInstrInfo::getOutliningType!");
}
- /// Insert a custom epilogue for outlined functions.
- /// This may be empty, in which case no epilogue or return statement will be
- /// emitted.
- virtual void insertOutlinerEpilogue(MachineBasicBlock &MBB,
- MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {
+ /// Returns target-defined flags defining properties of the MBB for
+ /// the outliner.
+ virtual unsigned getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const {
+ return 0x0;
+ }
+
+ /// Insert a custom frame for outlined functions.
+ virtual void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const {
llvm_unreachable(
- "Target didn't implement TargetInstrInfo::insertOutlinerEpilogue!");
+ "Target didn't implement TargetInstrInfo::buildOutlinedFrame!");
}
/// Insert a call to an outlined function into the program.
@@ -1635,20 +1636,11 @@ public:
virtual MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {
+ const outliner::Candidate &C) const {
llvm_unreachable(
"Target didn't implement TargetInstrInfo::insertOutlinedCall!");
}
- /// Insert a custom prologue for outlined functions.
- /// This may be empty, in which case no prologue will be emitted.
- virtual void insertOutlinerPrologue(MachineBasicBlock &MBB,
- MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {
- llvm_unreachable(
- "Target didn't implement TargetInstrInfo::insertOutlinerPrologue!");
- }
-
/// Return true if the function can safely be outlined from.
/// A function \p MF is considered safe for outlining if an outlined function
/// produced from instructions in F will produce a program which produces the
@@ -1659,13 +1651,18 @@ public:
"TargetInstrInfo::isFunctionSafeToOutlineFrom!");
}
+ /// Return true if the function should be outlined from by default.
+ virtual bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const {
+ return false;
+ }
+
private:
unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
unsigned CatchRetOpcode;
unsigned ReturnOpcode;
};
-/// \brief Provide DenseMapInfo for TargetInstrInfo::RegSubRegPair.
+/// Provide DenseMapInfo for TargetInstrInfo::RegSubRegPair.
template <> struct DenseMapInfo<TargetInstrInfo::RegSubRegPair> {
using RegInfo = DenseMapInfo<unsigned>;
@@ -1679,7 +1676,7 @@ template <> struct DenseMapInfo<TargetInstrInfo::RegSubRegPair> {
RegInfo::getTombstoneKey());
}
- /// \brief Reuse getHashValue implementation from
+ /// Reuse getHashValue implementation from
/// std::pair<unsigned, unsigned>.
static unsigned getHashValue(const TargetInstrInfo::RegSubRegPair &Val) {
std::pair<unsigned, unsigned> PairVal = std::make_pair(Val.Reg, Val.SubReg);
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetLowering.h b/contrib/llvm/include/llvm/CodeGen/TargetLowering.h
index cea8472caa35..d5ff71cf9ac2 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -29,9 +29,9 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -52,6 +52,7 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -222,7 +223,7 @@ public:
virtual ~TargetLoweringBase() = default;
protected:
- /// \brief Initialize all of the actions to default values.
+ /// Initialize all of the actions to default values.
void initActions();
public:
@@ -253,7 +254,8 @@ public:
/// A documentation for this function would be nice...
virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;
- EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const;
+ EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
+ bool LegalTypes = true) const;
/// Returns the type to be used for the index operand of:
/// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
@@ -421,17 +423,17 @@ public:
return true;
}
- /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
+ /// Return true if it is cheap to speculate a call to intrinsic cttz.
virtual bool isCheapToSpeculateCttz() const {
return false;
}
- /// \brief Return true if it is cheap to speculate a call to intrinsic ctlz.
+ /// Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual bool isCheapToSpeculateCtlz() const {
return false;
}
- /// \brief Return true if ctlz instruction is fast.
+ /// Return true if ctlz instruction is fast.
virtual bool isCtlzFast() const {
return false;
}
@@ -444,13 +446,13 @@ public:
return false;
}
- /// \brief Return true if it is cheaper to split the store of a merged int val
+ /// Return true if it is cheaper to split the store of a merged int val
/// from a pair of smaller values into multiple stores.
virtual bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const {
return false;
}
- /// \brief Return if the target supports combining a
+ /// Return if the target supports combining a
/// chain like:
/// \code
/// %andResult = and %val1, #mask
@@ -507,7 +509,30 @@ public:
return hasAndNotCompare(X);
}
- /// \brief Return true if the target wants to use the optimization that
+ /// There are two ways to clear extreme bits (either low or high):
+ /// Mask: x & (-1 << y) (the instcombine canonical form)
+ /// Shifts: x >> y << y
+ /// Return true if the variant with 2 shifts is preferred.
+ /// Return false if there is no preference.
+ virtual bool preferShiftsToClearExtremeBits(SDValue X) const {
+ // By default, let's assume that no one prefers shifts.
+ return false;
+ }
+
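A plain C++ illustration (not from the patch) of the equivalence the hook above chooses between: both forms clear the low Y bits of X, and the hook only states which shape the DAG combiner should prefer for a given target.

#include <cassert>
#include <cstdint>

static uint32_t clearLowBitsMask(uint32_t X, unsigned Y) {
  return X & (~0u << Y);      // instcombine-canonical "mask" form
}

static uint32_t clearLowBitsShifts(uint32_t X, unsigned Y) {
  return (X >> Y) << Y;       // two-shift form some targets prefer
}

int main() {
  for (unsigned Y = 0; Y < 32; ++Y)
    assert(clearLowBitsMask(0xDEADBEEFu, Y) == clearLowBitsShifts(0xDEADBEEFu, Y));
  return 0;
}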
+ /// Should we transform the IR-optimal check for whether a given truncation
+ /// down into KeptBits would be truncating or not:
+ /// (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
+ /// Into its more traditional form:
+ /// ((%x << C) a>> C) dstcond %x
+ /// Return true if we should transform.
+ /// Return false if there is no preference.
+ virtual bool shouldTransformSignedTruncationCheck(EVT XVT,
+ unsigned KeptBits) const {
+ // By default, assume no target wants this transform.
+ return false;
+ }
+
+ /// Return true if the target wants to use the optimization that
/// turns ext(promotableInst1(...(promotableInstN(load)))) into
/// promotedInst1(...(promotedInstN(ext(load)))).
bool enableExtLdPromotion() const { return EnableExtLdPromotion; }
@@ -746,10 +771,10 @@ public:
/// operations don't trap except for integer divide and remainder.
virtual bool canOpTrap(unsigned Op, EVT VT) const;
- /// Similar to isShuffleMaskLegal. This is used by Targets can use this to
- /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to replace
- /// a VAND with a constant pool entry.
- virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+ /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
+ /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
+ /// constant pool entry.
+ virtual bool isVectorClearMaskLegal(ArrayRef<int> /*Mask*/,
EVT /*VT*/) const {
return false;
}
@@ -765,6 +790,39 @@ public:
return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op];
}
+ LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const {
+ unsigned EqOpc;
+ switch (Op) {
+ default: llvm_unreachable("Unexpected FP pseudo-opcode");
+ case ISD::STRICT_FADD: EqOpc = ISD::FADD; break;
+ case ISD::STRICT_FSUB: EqOpc = ISD::FSUB; break;
+ case ISD::STRICT_FMUL: EqOpc = ISD::FMUL; break;
+ case ISD::STRICT_FDIV: EqOpc = ISD::FDIV; break;
+ case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break;
+ case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break;
+ case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break;
+ case ISD::STRICT_FMA: EqOpc = ISD::FMA; break;
+ case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break;
+ case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break;
+ case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break;
+ case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break;
+ case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break;
+ case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break;
+ case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
+ case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
+ case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
+ }
+
+ auto Action = getOperationAction(EqOpc, VT);
+
+ // We don't currently handle Custom or Promote for strict FP pseudo-ops.
+ // For now, we just expand for those cases.
+ if (Action != Legal)
+ Action = Expand;
+
+ return Action;
+ }
+
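A hedged caller sketch for getStrictFPOperationAction() above (illustrative only, not part of the patch): ask whether a constrained FP node will be kept or must be expanded to its non-strict equivalent for a given type.

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"

using namespace llvm;

static bool strictFPWillBeExpanded(const TargetLoweringBase &TLI, unsigned Opc,
                                   EVT VT) {
  // Anything that is not Legal is expanded for now, per the hook above.
  return TLI.getStrictFPOperationAction(Opc, VT) == TargetLoweringBase::Expand;
}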
/// Return true if the specified operation is legal on this target or can be
/// made legal with custom lowering. This is used to help guide high-level
/// lowering decisions.
@@ -812,7 +870,7 @@ public:
bool rangeFitsInWord(const APInt &Low, const APInt &High,
const DataLayout &DL) const {
// FIXME: Using the pointer type doesn't seem ideal.
- uint64_t BW = DL.getPointerSizeInBits();
+ uint64_t BW = DL.getIndexSizeInBits(0u);
uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
return Range <= BW;
}
@@ -820,7 +878,7 @@ public:
/// Return true if lowering to a jump table is suitable for a set of case
/// clusters which may contain \p NumCases cases, \p Range range of values.
/// FIXME: This function check the maximum table size and density, but the
- /// minimum size is not checked. It would be nice if the the minimum size is
+ /// minimum size is not checked. It would be nice if the minimum size is
/// also combined within this function. Currently, the minimum size check is
/// performed in findJumpTable() in SelectionDAGBuilder and
/// getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
@@ -986,9 +1044,14 @@ public:
/// Return true if the specified condition code is legal on this target.
bool isCondCodeLegal(ISD::CondCode CC, MVT VT) const {
- return
- getCondCodeAction(CC, VT) == Legal ||
- getCondCodeAction(CC, VT) == Custom;
+ return getCondCodeAction(CC, VT) == Legal;
+ }
+
+ /// Return true if the specified condition code is legal or custom on this
+ /// target.
+ bool isCondCodeLegalOrCustom(ISD::CondCode CC, MVT VT) const {
+ return getCondCodeAction(CC, VT) == Legal ||
+ getCondCodeAction(CC, VT) == Custom;
}
/// If the action for this operation is to promote, this method returns the
@@ -1110,10 +1173,6 @@ public:
/// Certain combinations of ABIs, Targets and features require that types
/// are legal for some operations and not for other operations.
/// For MIPS all vector types must be passed through the integer register set.
- virtual MVT getRegisterTypeForCallingConv(MVT VT) const {
- return getRegisterType(VT);
- }
-
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
EVT VT) const {
return getRegisterType(Context, VT);
@@ -1172,7 +1231,7 @@ public:
return getPointerTy(DL).getSizeInBits();
}
- /// \brief Get maximum # of store operations permitted for llvm.memset
+ /// Get maximum # of store operations permitted for llvm.memset
///
/// This function returns the maximum number of store operations permitted
/// to replace a call to llvm.memset. The value is set by the target at the
@@ -1182,7 +1241,7 @@ public:
return OptSize ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset;
}
- /// \brief Get maximum # of store operations permitted for llvm.memcpy
+ /// Get maximum # of store operations permitted for llvm.memcpy
///
/// This function returns the maximum number of store operations permitted
/// to replace a call to llvm.memcpy. The value is set by the target at the
@@ -1192,6 +1251,15 @@ public:
return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
}
+ /// \brief Get maximum # of store operations to be glued together
+ ///
+ /// This function returns the maximum number of store operations permitted
+ /// to glue together during lowering of llvm.memcpy. The value is set by
+ /// the target at the performance threshold for such a replacement.
+ virtual unsigned getMaxGluedStoresPerMemcpy() const {
+ return MaxGluedStoresPerMemcpy;
+ }
+
/// Get maximum # of load operations permitted for memcmp
///
/// This function returns the maximum number of load operations permitted
@@ -1202,7 +1270,19 @@ public:
return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
}
- /// \brief Get maximum # of store operations permitted for llvm.memmove
+ /// For memcmp expansion when the memcmp result is only compared equal or
+ /// not-equal to 0, allow up to this number of load pairs per block. As an
+ /// example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
+ /// a0 = load2bytes &a[0]
+ /// b0 = load2bytes &b[0]
+ /// a2 = load1byte &a[2]
+ /// b2 = load1byte &b[2]
+ /// r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
+ virtual unsigned getMemcmpEqZeroLoadsPerBlock() const {
+ return 1;
+ }
+
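A standalone C++ illustration (not from the patch) of the expansion the comment above describes: memcmp(a, b, 3) == 0 becomes one 2-byte and one 1-byte load pair combined with XOR/OR in a single block. The helper name is illustrative.

#include <cassert>
#include <cstdint>
#include <cstring>

static bool memcmp3EqZeroExpanded(const unsigned char *A, const unsigned char *B) {
  uint16_t A0, B0;
  std::memcpy(&A0, A, 2);          // a0 = load2bytes &a[0]
  std::memcpy(&B0, B, 2);          // b0 = load2bytes &b[0]
  uint8_t A2 = A[2], B2 = B[2];    // 1-byte tail loads
  return ((A0 ^ B0) | (A2 ^ B2)) == 0;
}

int main() {
  unsigned char X[3] = {1, 2, 3}, Y[3] = {1, 2, 3}, Z[3] = {1, 2, 4};
  assert(memcmp3EqZeroExpanded(X, Y) == (std::memcmp(X, Y, 3) == 0));
  assert(memcmp3EqZeroExpanded(X, Z) == (std::memcmp(X, Z, 3) == 0));
  return 0;
}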
+ /// Get maximum # of store operations permitted for llvm.memmove
///
/// This function returns the maximum number of store operations permitted
/// to replace a call to llvm.memmove. The value is set by the target at the
@@ -1212,7 +1292,7 @@ public:
return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove;
}
- /// \brief Determine if the target supports unaligned memory accesses.
+ /// Determine if the target supports unaligned memory accesses.
///
/// This function returns true if the target allows unaligned memory accesses
/// of the specified type in the given address space. If true, it also returns
@@ -1350,7 +1430,7 @@ public:
/// If the target has a standard location for the stack protector guard,
/// returns the address of that location. Otherwise, returns nullptr.
/// DEPRECATED: please override useLoadStackGuardNode and customize
- /// LOAD_STACK_GUARD, or customize @llvm.stackguard().
+ /// LOAD_STACK_GUARD, or customize \@llvm.stackguard().
virtual Value *getIRStackGuard(IRBuilder<> &IRB) const;
/// Inserts necessary declarations for SSP (stack protection) purpose.
@@ -1905,7 +1985,7 @@ public:
Type *Ty, unsigned AddrSpace,
Instruction *I = nullptr) const;
- /// \brief Return the cost of the scaling factor used in the addressing mode
+ /// Return the cost of the scaling factor used in the addressing mode
/// represented by AM for this target, for a load/store of the specified type.
///
/// If the AM is supported, the return value must be >= 0.
@@ -2098,11 +2178,14 @@ public:
return false;
}
- /// \brief Get the maximum supported factor for interleaved memory accesses.
+ /// Return true if the target has a vector blend instruction.
+ virtual bool hasVectorBlend() const { return false; }
+
+ /// Get the maximum supported factor for interleaved memory accesses.
/// Default to be the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
- /// \brief Lower an interleaved load to target specific intrinsics. Return
+ /// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
/// \p LI is the vector load instruction.
@@ -2116,7 +2199,7 @@ public:
return false;
}
- /// \brief Lower an interleaved store to target specific intrinsics. Return
+ /// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
/// \p SI is the vector store instruction.
@@ -2189,7 +2272,7 @@ public:
return false;
}
- /// \brief Return true if it is beneficial to convert a load of a constant to
+ /// Return true if it is beneficial to convert a load of a constant to
/// just the constant itself.
/// On some targets it might be more efficient to use a combination of
/// arithmetic instructions to materialize the constant instead of loading it
@@ -2214,6 +2297,11 @@ public:
return false;
}
+ // Return true if CodeGenPrepare should consider splitting a large offset of a
+ // GEP so that the GEP fits into the addressing mode and can be sunk into the
+ // same blocks as its users.
+ virtual bool shouldConsiderGEPOffsetSplit() const { return false; }
+
//===--------------------------------------------------------------------===//
// Runtime Library hooks
//
@@ -2453,7 +2541,7 @@ protected:
/// expected to be merged.
unsigned GatherAllAliasesMaxDepth;
- /// \brief Specify maximum number of store instructions per memset call.
+ /// Specify maximum number of store instructions per memset call.
///
/// When lowering \@llvm.memset this field specifies the maximum number of
/// store operations that may be substituted for the call to memset. Targets
@@ -2469,7 +2557,7 @@ protected:
/// to memset, used for functions with OptSize attribute.
unsigned MaxStoresPerMemsetOptSize;
- /// \brief Specify maximum bytes of store instructions per memcpy call.
+ /// Specify maximum bytes of store instructions per memcpy call.
///
/// When lowering \@llvm.memcpy this field specifies the maximum number of
/// store operations that may be substituted for a call to memcpy. Targets
@@ -2482,13 +2570,21 @@ protected:
/// constant size.
unsigned MaxStoresPerMemcpy;
+
+ /// \brief Specify max number of store instructions to glue in inlined memcpy.
+ ///
+ /// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number
+ /// of store instructions to keep together. This helps in pairing and
+ /// vectorization later on.
+ unsigned MaxGluedStoresPerMemcpy = 0;
+
/// Maximum number of store operations that may be substituted for a call to
/// memcpy, used for functions with OptSize attribute.
unsigned MaxStoresPerMemcpyOptSize;
unsigned MaxLoadsPerMemcmp;
unsigned MaxLoadsPerMemcmpOptSize;
- /// \brief Specify maximum bytes of store instructions per memmove call.
+ /// Specify maximum bytes of store instructions per memmove call.
///
/// When lowering \@llvm.memmove this field specifies the maximum number of
/// store instructions that may be substituted for a call to memmove. Targets
@@ -2520,6 +2616,16 @@ protected:
/// sequence of memory operands that is recognized by PrologEpilogInserter.
MachineBasicBlock *emitPatchPoint(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+
+ /// Replace/modify the XRay custom event operands with target-dependent
+ /// details.
+ MachineBasicBlock *emitXRayCustomEvent(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Replace/modify the XRay typed event operands with target-dependent
+ /// details.
+ MachineBasicBlock *emitXRayTypedEvent(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
};
/// This class defines information used to lower LLVM code to legal SelectionDAG
@@ -2539,6 +2645,16 @@ public:
bool isPositionIndependent() const;
+ virtual bool isSDNodeSourceOfDivergence(const SDNode *N,
+ FunctionLoweringInfo *FLI,
+ DivergenceAnalysis *DA) const {
+ return false;
+ }
+
+ virtual bool isSDNodeAlwaysUniform(const SDNode *N) const {
+ return false;
+ }
+
/// Returns true by value, base pointer and offset pointer and addressing mode
/// by reference if the node's address can be legally represented as
/// pre-indexed load / store address.
@@ -2690,6 +2806,30 @@ public:
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
DAGCombinerInfo &DCI) const;
+ /// Look at Vector Op. At this point, we know that only the DemandedElts
+ /// elements of the result of Op are ever used downstream. If we can use
+ /// this information to simplify Op, create a new simplified DAG node and
+ /// return true, storing the original and new nodes in TLO.
+ /// Otherwise, analyze the expression and return a mask of KnownUndef and
+ /// KnownZero elements for the expression (used to simplify the caller).
+ /// The KnownUndef/Zero elements may only be accurate for those bits
+ /// in the DemandedEltMask.
+ /// \p AssumeSingleUse When this parameter is true, this function will
+ /// attempt to simplify \p Op even if there are multiple uses.
+ /// Callers are responsible for correctly updating the DAG based on the
+ /// results of this function, because simply replacing TLO.Old
+ /// with TLO.New will be incorrect when this parameter is true and TLO.Old
+ /// has multiple uses.
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask,
+ APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth = 0,
+ bool AssumeSingleUse = false) const;
+
+ /// Helper wrapper around SimplifyDemandedVectorElts
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
+ APInt &KnownUndef, APInt &KnownZero,
+ DAGCombinerInfo &DCI) const;
+
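A hedged sketch of how a target combine might use the wrapper above (not from the patch; the helper name and parameters are illustrative): when only one lane of a vector operand is actually used, let the generic code prune the rest.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

static bool simplifyToUsedLane(SDValue Vec, unsigned UsedLane,
                               const TargetLowering &TLI,
                               TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = Vec.getValueType();
  if (!VT.isVector())
    return false;
  // Only UsedLane feeds this use; everything else is undemanded.
  APInt DemandedElts = APInt::getOneBitSet(VT.getVectorNumElements(), UsedLane);
  APInt KnownUndef, KnownZero;
  return TLI.SimplifyDemandedVectorElts(Vec, DemandedElts, KnownUndef,
                                        KnownZero, DCI);
}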
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
/// argument allows us to only collect the known bits that are shared by the
@@ -2718,6 +2858,15 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const;
+ /// Attempt to simplify any target nodes based on the demanded vector
+ /// elements, returning true on success. Otherwise, analyze the expression and
+ /// return a mask of KnownUndef and KnownZero elements for the expression
+ /// (used to simplify the caller). The KnownUndef/Zero elements may only be
+ /// accurate for those bits in the DemandedElts mask.
+ virtual bool SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
+ APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
+
struct DAGCombinerInfo {
void *DC; // The DAG Combiner object.
CombineLevel Level;
@@ -2731,7 +2880,7 @@ public:
bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; }
bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; }
- bool isAfterLegalizeVectorOps() const {
+ bool isAfterLegalizeDAG() const {
return Level == AfterLegalizeDAG;
}
CombineLevel getDAGCombineLevel() { return Level; }
@@ -2753,12 +2902,8 @@ public:
/// from getBooleanContents().
bool isConstFalseVal(const SDNode *N) const;
- /// Return a constant of type VT that contains a true value that respects
- /// getBooleanContents()
- SDValue getConstTrueVal(SelectionDAG &DAG, EVT VT, const SDLoc &DL) const;
-
/// Return if \p N is a True value when extended to \p VT.
- bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool Signed) const;
+ bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool SExt) const;
/// Try to simplify a setcc built with the specified operands and cc. If it is
/// unable to simplify it, return a null SDValue.
@@ -3479,7 +3624,7 @@ public:
/// bounds the returned pointer is unspecified, but will be within the vector
/// bounds.
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
- SDValue Idx) const;
+ SDValue Index) const;
//===--------------------------------------------------------------------===//
// Instruction Emitting Hooks
@@ -3518,6 +3663,13 @@ public:
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
SelectionDAG &DAG) const;
+ /// Expands a target-specific indirect branch for the case of jump table
+ /// expansion.
+ virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
+ SelectionDAG &DAG) const {
+ return DAG.getNode(ISD::BRIND, dl, MVT::Other, Value, Addr);
+ }
+
// seteq(x, 0) -> truncate(srl(ctlz(zext(x)), log2(#bits)))
// If we're comparing for equality to zero and isCtlzFast is true, expose the
// fact that this can be implemented as a ctlz/srl pair, so that the dag
@@ -3528,6 +3680,11 @@ private:
SDValue simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, DAGCombinerInfo &DCI,
const SDLoc &DL) const;
+
+ SDValue optimizeSetCCOfSignedTruncationCheck(EVT SCCVT, SDValue N0,
+ SDValue N1, ISD::CondCode Cond,
+ DAGCombinerInfo &DCI,
+ const SDLoc &DL) const;
};
/// Given an LLVM IR type and return type attributes, compute the return value
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 69de9f8cb35d..f5c7fc824ab4 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -15,9 +15,9 @@
#ifndef LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H
#define LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
@@ -36,16 +36,18 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile {
protected:
MCSymbolRefExpr::VariantKind PLTRelativeVariantKind =
MCSymbolRefExpr::VK_None;
+ const TargetMachine *TM;
public:
TargetLoweringObjectFileELF() = default;
~TargetLoweringObjectFileELF() override = default;
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
/// Emit Obj-C garbage collection and linker options.
- void emitModuleMetadata(MCStreamer &Streamer, Module &M,
- const TargetMachine &TM) const override;
+ void emitModuleMetadata(MCStreamer &Streamer, Module &M) const override;
- void emitPersonalityValue(MCStreamer &Streamer, const DataLayout &TM,
+ void emitPersonalityValue(MCStreamer &Streamer, const DataLayout &DL,
const MCSymbol *Sym) const override;
/// Given a constant with the SectionKind, return a section that it should be
@@ -98,8 +100,7 @@ public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
/// Emit the module flags that specify the garbage collection information.
- void emitModuleMetadata(MCStreamer &Streamer, Module &M,
- const TargetMachine &TM) const override;
+ void emitModuleMetadata(MCStreamer &Streamer, Module &M) const override;
MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
const TargetMachine &TM) const override;
@@ -153,8 +154,7 @@ public:
const TargetMachine &TM) const override;
/// Emit Obj-C garbage collection and linker options.
- void emitModuleMetadata(MCStreamer &Streamer, Module &M,
- const TargetMachine &TM) const override;
+ void emitModuleMetadata(MCStreamer &Streamer, Module &M) const override;
MCSection *getStaticCtorSection(unsigned Priority,
const MCSymbol *KeySym) const override;
@@ -163,6 +163,19 @@ public:
void emitLinkerFlagsForGlobal(raw_ostream &OS,
const GlobalValue *GV) const override;
+
+ void emitLinkerFlagsForUsed(raw_ostream &OS,
+ const GlobalValue *GV) const override;
+
+ const MCExpr *lowerRelativeReference(const GlobalValue *LHS,
+ const GlobalValue *RHS,
+ const TargetMachine &TM) const override;
+
+ /// Given a mergeable constant with the specified size and relocation
+ /// information, return a section that it should be placed in.
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const override;
};
class TargetLoweringObjectFileWasm : public TargetLoweringObjectFile {
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetOpcodes.h b/contrib/llvm/include/llvm/CodeGen/TargetOpcodes.h
index 3ca31a970944..d0d959c4ae11 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetOpcodes.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetOpcodes.h
@@ -22,7 +22,7 @@ namespace TargetOpcode {
enum {
#define HANDLE_TARGET_OPCODE(OPC) OPC,
#define HANDLE_TARGET_OPCODE_MARKER(IDENT, OPC) IDENT = OPC,
-#include "llvm/CodeGen/TargetOpcodes.def"
+#include "llvm/Support/TargetOpcodes.def"
};
} // end namespace TargetOpcode
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h b/contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h
index da9841a0586e..5918c524d11c 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -84,20 +84,6 @@ template <> struct isPodLike<IdentifyingPassPtr> {
/// This is an ImmutablePass solely for the purpose of exposing CodeGen options
/// to the internals of other CodeGen passes.
class TargetPassConfig : public ImmutablePass {
-public:
- /// Pseudo Pass IDs. These are defined within TargetPassConfig because they
- /// are unregistered pass IDs. They are only useful for use with
- /// TargetPassConfig APIs to identify multiple occurrences of the same pass.
- ///
-
- /// EarlyTailDuplicate - A clone of the TailDuplicate pass that runs early
- /// during codegen, on SSA form.
- static char EarlyTailDuplicateID;
-
- /// PostRAMachineLICM - A clone of the LICM pass that runs during late machine
- /// optimization after regalloc.
- static char PostRAMachineLICMID;
-
private:
PassManagerBase *PM = nullptr;
AnalysisID StartBefore = nullptr;
@@ -218,9 +204,6 @@ public:
/// Return true if the optimized regalloc pipeline is enabled.
bool getOptimizeRegAlloc() const;
- /// Return true if shrink wrapping is enabled.
- bool getEnableShrinkWrap() const;
-
/// Return true if the default global register allocator is in use and
/// has not been overridden on the command line with '-regalloc=...'
bool usingDefaultRegAlloc() const;
@@ -229,7 +212,7 @@ public:
/// representation to the MI representation.
/// Adds IR based lowering and target specific optimization passes and finally
/// the core instruction selection passes.
- /// \returns true if an error occured, false otherwise.
+ /// \returns true if an error occurred, false otherwise.
bool addISelPasses();
/// Add common target configurable passes that perform LLVM IR to IR
@@ -320,10 +303,6 @@ public:
/// verification is enabled.
void addVerifyPass(const std::string &Banner);
- /// Check whether or not GlobalISel should be enabled by default.
- /// Fallback/abort behavior is controlled via other methods.
- virtual bool isGlobalISelEnabled() const;
-
/// Check whether or not GlobalISel should abort on error.
/// When this is disabled, GlobalISel will fall back on SDISel instead of
/// erroring out.
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/contrib/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 81907538fb0b..538a5845466c 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -21,11 +21,11 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Printable.h"
#include <cassert>
@@ -238,12 +238,12 @@ private:
protected:
TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
- regclass_iterator RegClassBegin,
- regclass_iterator RegClassEnd,
+ regclass_iterator RCB,
+ regclass_iterator RCE,
const char *const *SRINames,
const LaneBitmask *SRILaneMasks,
LaneBitmask CoveringLanes,
- const RegClassInfo *const RSI,
+ const RegClassInfo *const RCIs,
unsigned Mode = 0);
virtual ~TargetRegisterInfo();
@@ -444,6 +444,13 @@ public:
return false;
}
+ /// Returns the original SrcReg unless it is the target of a copy-like
+ /// operation, in which case we chain backwards through all such operations
+ /// to the ultimate source register. If a physical register is encountered,
+ /// we stop the search.
+ virtual unsigned lookThruCopyLike(unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const;
+
/// Return a null-terminated list of all of the callee-saved registers on
/// this target. The register should be in the order of desired callee-save
/// stack frame offset. The first register is closest to the incoming stack
@@ -752,6 +759,9 @@ public:
virtual const RegClassWeight &getRegClassWeight(
const TargetRegisterClass *RC) const = 0;
+ /// Returns size in bits of a phys/virtual/generic register.
+ unsigned getRegSizeInBits(unsigned Reg, const MachineRegisterInfo &MRI) const;
+
/// Get the weight in units of pressure for this register unit.
virtual unsigned getRegUnitWeight(unsigned RegUnit) const = 0;
@@ -961,7 +971,7 @@ public:
//===--------------------------------------------------------------------===//
/// Subtarget Hooks
- /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
+ /// SrcRC and DstRC will be morphed into NewRC if this returns true.
virtual bool shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
@@ -985,6 +995,12 @@ public:
/// of the set as well.
bool checkAllSuperRegsMarked(const BitVector &RegisterSet,
ArrayRef<MCPhysReg> Exceptions = ArrayRef<MCPhysReg>()) const;
+
+ virtual const TargetRegisterClass *
+ getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const {
+ return nullptr;
+ }
};
//===----------------------------------------------------------------------===//
@@ -1151,7 +1167,8 @@ struct VirtReg2IndexFunctor {
///
/// Usage: OS << printReg(Reg, TRI, SubRegIdx) << '\n';
Printable printReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr,
- unsigned SubRegIdx = 0);
+ unsigned SubIdx = 0,
+ const MachineRegisterInfo *MRI = nullptr);
/// Create Printable object to print register units on a \ref raw_ostream.
///
@@ -1163,11 +1180,11 @@ Printable printReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr,
/// Usage: OS << printRegUnit(Unit, TRI) << '\n';
Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI);
-/// \brief Create Printable object to print virtual registers and physical
+/// Create Printable object to print virtual registers and physical
/// registers on a \ref raw_ostream.
Printable printVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *TRI);
-/// \brief Create Printable object to print register classes or register banks
+/// Create Printable object to print register classes or register banks
/// on a \ref raw_ostream.
Printable printRegClassOrBank(unsigned Reg, const MachineRegisterInfo &RegInfo,
const TargetRegisterInfo *TRI);
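A hedged usage sketch for the new lookThruCopyLike() hook (illustrative, not part of the patch): compare two registers after looking through any chain of copy-like definitions; the walk stops at the first physical register it meets.

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

static bool sameUnderlyingReg(unsigned RegA, unsigned RegB,
                              const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI) {
  return TRI.lookThruCopyLike(RegA, &MRI) == TRI.lookThruCopyLike(RegB, &MRI);
}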
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetSchedule.h b/contrib/llvm/include/llvm/CodeGen/TargetSchedule.h
index 1044f0bd27e6..6173925e23a1 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetSchedule.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetSchedule.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCSchedule.h"
@@ -45,24 +46,23 @@ class TargetSchedModel {
public:
TargetSchedModel() : SchedModel(MCSchedModel::GetDefaultSchedModel()) {}
- /// \brief Initialize the machine model for instruction scheduling.
+ /// Initialize the machine model for instruction scheduling.
///
/// The machine model API keeps a copy of the top-level MCSchedModel table
/// indices and may query TargetSubtargetInfo and TargetInstrInfo to resolve
/// dynamic properties.
- void init(const MCSchedModel &sm, const TargetSubtargetInfo *sti,
- const TargetInstrInfo *tii);
+ void init(const TargetSubtargetInfo *TSInfo);
/// Return the MCSchedClassDesc for this instruction.
const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const;
- /// \brief TargetSubtargetInfo getter.
+ /// TargetSubtargetInfo getter.
const TargetSubtargetInfo *getSubtargetInfo() const { return STI; }
- /// \brief TargetInstrInfo getter.
+ /// TargetInstrInfo getter.
const TargetInstrInfo *getInstrInfo() const { return TII; }
- /// \brief Return true if this machine model includes an instruction-level
+ /// Return true if this machine model includes an instruction-level
/// scheduling model.
///
/// This is more detailed than the coarse-grained IssueWidth and default
@@ -71,7 +71,7 @@ public:
const MCSchedModel *getMCSchedModel() const { return &SchedModel; }
- /// \brief Return true if this machine model includes cycle-to-cycle itinerary
+ /// Return true if this machine model includes cycle-to-cycle itinerary
/// data.
///
/// This models scheduling at each stage in the processor pipeline.
@@ -83,35 +83,35 @@ public:
return nullptr;
}
- /// \brief Return true if this machine model includes an instruction-level
+ /// Return true if this machine model includes an instruction-level
/// scheduling model or cycle-to-cycle itinerary data.
bool hasInstrSchedModelOrItineraries() const {
return hasInstrSchedModel() || hasInstrItineraries();
}
- /// \brief Identify the processor corresponding to the current subtarget.
+ /// Identify the processor corresponding to the current subtarget.
unsigned getProcessorID() const { return SchedModel.getProcessorID(); }
- /// \brief Maximum number of micro-ops that may be scheduled per cycle.
+ /// Maximum number of micro-ops that may be scheduled per cycle.
unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
- /// \brief Return true if new group must begin.
+ /// Return true if new group must begin.
bool mustBeginGroup(const MachineInstr *MI,
const MCSchedClassDesc *SC = nullptr) const;
- /// \brief Return true if current group must end.
+ /// Return true if current group must end.
bool mustEndGroup(const MachineInstr *MI,
const MCSchedClassDesc *SC = nullptr) const;
- /// \brief Return the number of issue slots required for this MI.
+ /// Return the number of issue slots required for this MI.
unsigned getNumMicroOps(const MachineInstr *MI,
const MCSchedClassDesc *SC = nullptr) const;
- /// \brief Get the number of kinds of resources for this target.
+ /// Get the number of kinds of resources for this target.
unsigned getNumProcResourceKinds() const {
return SchedModel.getNumProcResourceKinds();
}
- /// \brief Get a processor resource by ID for convenience.
+ /// Get a processor resource by ID for convenience.
const MCProcResourceDesc *getProcResource(unsigned PIdx) const {
return SchedModel.getProcResource(PIdx);
}
@@ -126,7 +126,7 @@ public:
using ProcResIter = const MCWriteProcResEntry *;
- // \brief Get an iterator into the processor resources consumed by this
+ // Get an iterator into the processor resources consumed by this
// scheduling class.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const {
// The subtarget holds a single resource table for all processors.
@@ -136,34 +136,34 @@ public:
return STI->getWriteProcResEnd(SC);
}
- /// \brief Multiply the number of units consumed for a resource by this factor
+ /// Multiply the number of units consumed for a resource by this factor
/// to normalize it relative to other resources.
unsigned getResourceFactor(unsigned ResIdx) const {
return ResourceFactors[ResIdx];
}
- /// \brief Multiply number of micro-ops by this factor to normalize it
+ /// Multiply number of micro-ops by this factor to normalize it
/// relative to other resources.
unsigned getMicroOpFactor() const {
return MicroOpFactor;
}
- /// \brief Multiply cycle count by this factor to normalize it relative to
+ /// Multiply cycle count by this factor to normalize it relative to
/// other resources. This is the number of resource units per cycle.
unsigned getLatencyFactor() const {
return ResourceLCM;
}
- /// \brief Number of micro-ops that may be buffered for OOO execution.
+ /// Number of micro-ops that may be buffered for OOO execution.
unsigned getMicroOpBufferSize() const { return SchedModel.MicroOpBufferSize; }
- /// \brief Number of resource units that may be buffered for OOO execution.
+ /// Number of resource units that may be buffered for OOO execution.
/// \return The buffer size in resource units or -1 for unlimited.
int getResourceBufferSize(unsigned PIdx) const {
return SchedModel.getProcResource(PIdx)->BufferSize;
}
- /// \brief Compute operand latency based on the available machine model.
+ /// Compute operand latency based on the available machine model.
///
/// Compute and return the latency of the given data dependent def and use
/// when the operand indices are already known. UseMI may be NULL for an
@@ -172,7 +172,7 @@ public:
const MachineInstr *UseMI, unsigned UseOperIdx)
const;
- /// \brief Compute the instruction latency based on the available machine
+ /// Compute the instruction latency based on the available machine
/// model.
///
/// Compute and return the expected latency of this instruction independent of
@@ -185,18 +185,20 @@ public:
/// if converter after moving it to TargetSchedModel).
unsigned computeInstrLatency(const MachineInstr *MI,
bool UseDefaultDefLatency = true) const;
+ unsigned computeInstrLatency(const MCInst &Inst) const;
unsigned computeInstrLatency(unsigned Opcode) const;
- /// \brief Output dependency latency of a pair of defs of the same register.
+ /// Output dependency latency of a pair of defs of the same register.
///
/// This is typically one cycle.
- unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx,
+ unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
const MachineInstr *DepMI) const;
- /// \brief Compute the reciprocal throughput of the given instruction.
- Optional<double> computeInstrRThroughput(const MachineInstr *MI) const;
- Optional<double> computeInstrRThroughput(unsigned Opcode) const;
+ /// Compute the reciprocal throughput of the given instruction.
+ double computeReciprocalThroughput(const MachineInstr *MI) const;
+ double computeReciprocalThroughput(const MCInst &MI) const;
+ double computeReciprocalThroughput(unsigned Opcode) const;
};
} // end namespace llvm
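The renamed throughput API above now returns a plain double and gains MCInst overloads. A minimal sketch of how a caller might query it, assuming an already-initialized TargetSchedModel named SchedModel and a MachineInstr pointer MI (both names are placeholders, not part of this change):

    // Expected latency of MI, using default def latencies where the model has no data.
    unsigned Latency = SchedModel.computeInstrLatency(MI, /*UseDefaultDefLatency=*/true);

    // Reciprocal throughput is now a plain double rather than Optional<double>.
    double RThroughput = SchedModel.computeReciprocalThroughput(MI);

    // Micro-op count, scaled by the model's factor so it is comparable to other resources.
    unsigned ScaledUOps = SchedModel.getNumMicroOps(MI) * SchedModel.getMicroOpFactor();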
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 9d99cba347ce..227e591f5a7d 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -144,7 +144,7 @@ public:
return 0;
}
- /// \brief True if the subtarget should run MachineScheduler after aggressive
+ /// True if the subtarget should run MachineScheduler after aggressive
/// coalescing.
///
/// This currently replaces the SelectionDAG scheduler with the "source" order
@@ -152,14 +152,14 @@ public:
/// TargetLowering preference). It does not yet disable the postRA scheduler.
virtual bool enableMachineScheduler() const;
- /// \brief Support printing of [latency:throughput] comment in output .S file.
+ /// Support printing of [latency:throughput] comment in output .S file.
virtual bool supportPrintSchedInfo() const { return false; }
- /// \brief True if the machine scheduler should disable the TLI preference
+ /// True if the machine scheduler should disable the TLI preference
/// for preRA scheduling with the source level scheduler.
virtual bool enableMachineSchedDefaultSched() const { return true; }
- /// \brief True if the subtarget should enable joining global copies.
+ /// True if the subtarget should enable joining global copies.
///
/// By default this is enabled if the machine scheduler is enabled, but
/// can be overridden.
@@ -171,13 +171,13 @@ public:
/// which is the preferred way to influence this.
virtual bool enablePostRAScheduler() const;
- /// \brief True if the subtarget should run the atomic expansion pass.
+ /// True if the subtarget should run the atomic expansion pass.
virtual bool enableAtomicExpand() const;
/// True if the subtarget should run the indirectbr expansion pass.
virtual bool enableIndirectBrExpand() const;
- /// \brief Override generic scheduling policy within a region.
+ /// Override generic scheduling policy within a region.
///
/// This is a convenient way for targets that don't provide any custom
/// scheduling heuristics (no custom MachineSchedStrategy) to make
@@ -185,7 +185,7 @@ public:
virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {}
- // \brief Perform target specific adjustments to the latency of a schedule
+ // Perform target specific adjustments to the latency of a schedule
// dependency.
virtual void adjustSchedDependency(SUnit *def, SUnit *use, SDep &dep) const {}
@@ -200,13 +200,13 @@ public:
return CriticalPathRCs.clear();
}
- // \brief Provide an ordered list of schedule DAG mutations for the post-RA
+ // Provide an ordered list of schedule DAG mutations for the post-RA
// scheduler.
virtual void getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
}
- // \brief Provide an ordered list of schedule DAG mutations for the machine
+ // Provide an ordered list of schedule DAG mutations for the machine
// pipeliner.
virtual void getSMSMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
@@ -218,25 +218,25 @@ public:
return CodeGenOpt::Default;
}
- /// \brief True if the subtarget should run the local reassignment
+ /// True if the subtarget should run the local reassignment
/// heuristic of the register allocator.
/// This heuristic may be compile time intensive, \p OptLevel provides
/// a finer grain to tune the register allocator.
virtual bool enableRALocalReassignment(CodeGenOpt::Level OptLevel) const;
- /// \brief True if the subtarget should consider the cost of local intervals
+ /// True if the subtarget should consider the cost of local intervals
/// created by a split candidate when choosing the best split candidate. This
/// heuristic may be compile time intensive.
virtual bool enableAdvancedRASplitCost() const;
- /// \brief Enable use of alias analysis during code generation (during MI
+ /// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
virtual bool useAA() const;
- /// \brief Enable the use of the early if conversion pass.
+ /// Enable the use of the early if conversion pass.
virtual bool enableEarlyIfConversion() const { return false; }
- /// \brief Return PBQPConstraint(s) for the target.
+ /// Return PBQPConstraint(s) for the target.
///
/// Override to provide custom PBQP constraints.
virtual std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const {
@@ -249,8 +249,11 @@ public:
virtual bool enableSubRegLiveness() const { return false; }
/// Returns string representation of scheduler comment
- std::string getSchedInfoStr(const MachineInstr &MI) const override;
+ std::string getSchedInfoStr(const MachineInstr &MI) const;
std::string getSchedInfoStr(MCInst const &MCI) const override;
+
+ /// This is called after a .mir file was loaded.
+ virtual void mirFileLoaded(MachineFunction &MF) const;
};
} // end namespace llvm
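As a sketch of how a target consumes these hooks, a hypothetical subtarget might override the scheduler toggle and the new mirFileLoaded callback as below; MyTargetSubtarget is illustrative only, and the constructor plus the usual target plumbing are omitted:

    class MyTargetSubtarget : public TargetSubtargetInfo {
    public:
      // Opt into the MachineScheduler after aggressive coalescing.
      bool enableMachineScheduler() const override { return true; }

      // Called after a .mir function has been parsed; refresh cached state here.
      void mirFileLoaded(MachineFunction &MF) const override {}
    };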
diff --git a/contrib/llvm/include/llvm/CodeGen/ValueTypes.h b/contrib/llvm/include/llvm/CodeGen/ValueTypes.h
index 40d501edde10..d2ef4a94f8e2 100644
--- a/contrib/llvm/include/llvm/CodeGen/ValueTypes.h
+++ b/contrib/llvm/include/llvm/CodeGen/ValueTypes.h
@@ -16,8 +16,8 @@
#ifndef LLVM_CODEGEN_VALUETYPES_H
#define LLVM_CODEGEN_VALUETYPES_H
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
diff --git a/contrib/llvm/include/llvm/CodeGen/ValueTypes.td b/contrib/llvm/include/llvm/CodeGen/ValueTypes.td
index 73c7fb4ce4b3..0abb4ece1d14 100644
--- a/contrib/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/contrib/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -8,8 +8,8 @@
//===----------------------------------------------------------------------===//
//
// Value types - These values correspond to the register types defined in the
-// ValueTypes.h file. If you update anything here, you must update it there as
-// well!
+// MachineValueTypes.h file. If you update anything here, you must update it
+// there as well!
//
//===----------------------------------------------------------------------===//
@@ -69,7 +69,7 @@ def v4i32 : ValueType<128, 43>; // 4 x i32 vector value
def v8i32 : ValueType<256, 44>; // 8 x i32 vector value
def v16i32 : ValueType<512, 45>; // 16 x i32 vector value
def v32i32 : ValueType<1024,46>; // 32 x i32 vector value
-def v64i32 : ValueType<2048,47>; // 32 x i32 vector value
+def v64i32 : ValueType<2048,47>; // 64 x i32 vector value
def v1i64 : ValueType<64 , 48>; // 1 x i64 vector value
def v2i64 : ValueType<128, 49>; // 2 x i64 vector value
@@ -145,6 +145,7 @@ def x86mmx : ValueType<64 , 109>; // X86 MMX value
def FlagVT : ValueType<0 , 110>; // Pre-RA sched glue
def isVoid : ValueType<0 , 111>; // Produces no value
def untyped: ValueType<8 , 112>; // Produces an untyped value
+def ExceptRef: ValueType<0, 113>; // WebAssembly's except_ref type
def token : ValueType<0 , 248>; // TokenTy
def MetadataVT: ValueType<0, 249>; // Metadata
diff --git a/contrib/llvm/include/llvm/CodeGen/VirtRegMap.h b/contrib/llvm/include/llvm/CodeGen/VirtRegMap.h
index 3b06f0393114..6a8e50a7e5f5 100644
--- a/contrib/llvm/include/llvm/CodeGen/VirtRegMap.h
+++ b/contrib/llvm/include/llvm/CodeGen/VirtRegMap.h
@@ -90,24 +90,24 @@ class TargetInstrInfo;
void grow();
- /// @brief returns true if the specified virtual register is
+ /// returns true if the specified virtual register is
/// mapped to a physical register
bool hasPhys(unsigned virtReg) const {
return getPhys(virtReg) != NO_PHYS_REG;
}
- /// @brief returns the physical register mapped to the specified
+ /// returns the physical register mapped to the specified
/// virtual register
unsigned getPhys(unsigned virtReg) const {
assert(TargetRegisterInfo::isVirtualRegister(virtReg));
return Virt2PhysMap[virtReg];
}
- /// @brief creates a mapping for the specified virtual register to
+ /// creates a mapping for the specified virtual register to
/// the specified physical register
void assignVirt2Phys(unsigned virtReg, MCPhysReg physReg);
- /// @brief clears the specified virtual register's, physical
+ /// clears the specified virtual register's, physical
/// register mapping
void clearVirt(unsigned virtReg) {
assert(TargetRegisterInfo::isVirtualRegister(virtReg));
@@ -116,26 +116,26 @@ class TargetInstrInfo;
Virt2PhysMap[virtReg] = NO_PHYS_REG;
}
- /// @brief clears all virtual to physical register mappings
+ /// clears all virtual to physical register mappings
void clearAllVirt() {
Virt2PhysMap.clear();
grow();
}
- /// @brief returns true if VirtReg is assigned to its preferred physreg.
+ /// returns true if VirtReg is assigned to its preferred physreg.
bool hasPreferredPhys(unsigned VirtReg);
- /// @brief returns true if VirtReg has a known preferred register.
+ /// returns true if VirtReg has a known preferred register.
/// This returns false if VirtReg has a preference that is a virtual
/// register that hasn't been assigned yet.
bool hasKnownPreference(unsigned VirtReg);
- /// @brief records virtReg is a split live interval from SReg.
+ /// records virtReg is a split live interval from SReg.
void setIsSplitFromReg(unsigned virtReg, unsigned SReg) {
Virt2SplitMap[virtReg] = SReg;
}
- /// @brief returns the live interval virtReg is split from.
+ /// returns the live interval virtReg is split from.
unsigned getPreSplitReg(unsigned virtReg) const {
return Virt2SplitMap[virtReg];
}
@@ -149,7 +149,7 @@ class TargetInstrInfo;
return Orig ? Orig : VirtReg;
}
- /// @brief returns true if the specified virtual register is not
+ /// returns true if the specified virtual register is not
/// mapped to a stack slot or rematerialized.
bool isAssignedReg(unsigned virtReg) const {
if (getStackSlot(virtReg) == NO_STACK_SLOT)
@@ -159,20 +159,20 @@ class TargetInstrInfo;
return (Virt2SplitMap[virtReg] && Virt2PhysMap[virtReg] != NO_PHYS_REG);
}
- /// @brief returns the stack slot mapped to the specified virtual
+ /// returns the stack slot mapped to the specified virtual
/// register
int getStackSlot(unsigned virtReg) const {
assert(TargetRegisterInfo::isVirtualRegister(virtReg));
return Virt2StackSlotMap[virtReg];
}
- /// @brief create a mapping for the specifed virtual register to
+ /// create a mapping for the specified virtual register to
/// the next available stack slot
int assignVirt2StackSlot(unsigned virtReg);
- /// @brief create a mapping for the specified virtual register to
+ /// create a mapping for the specified virtual register to
/// the specified stack slot
- void assignVirt2StackSlot(unsigned virtReg, int frameIndex);
+ void assignVirt2StackSlot(unsigned virtReg, int SS);
void print(raw_ostream &OS, const Module* M = nullptr) const override;
void dump() const;
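A brief sketch of the mapping calls above as a register allocator might use them; VRM, VirtReg, PhysReg and SpilledVirtReg are placeholders for an initialized VirtRegMap, a virtual register, a chosen MCPhysReg and a live range selected for spilling:

    // Record the physical register chosen for a virtual register.
    if (!VRM.hasPhys(VirtReg))
      VRM.assignVirt2Phys(VirtReg, PhysReg);

    // For a spilled live range, reserve the next available stack slot instead.
    int FI = VRM.assignVirt2StackSlot(SpilledVirtReg);
    (void)FI;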
diff --git a/contrib/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h b/contrib/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h
new file mode 100644
index 000000000000..3ad6760d8813
--- /dev/null
+++ b/contrib/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h
@@ -0,0 +1,80 @@
+//===--- llvm/CodeGen/WasmEHFuncInfo.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for Wasm exception handling schemes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_WASMEHFUNCINFO_H
+#define LLVM_CODEGEN_WASMEHFUNCINFO_H
+
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/BasicBlock.h"
+
+namespace llvm {
+
+using BBOrMBB = PointerUnion<const BasicBlock *, MachineBasicBlock *>;
+
+struct WasmEHFuncInfo {
+ // When there is an entry <A, B>, if an exception is not caught by A, it
+ // should next unwind to the EH pad B.
+ DenseMap<BBOrMBB, BBOrMBB> EHPadUnwindMap;
+ // For entry <A, B>, A is a BB with an instruction that may throw
+ // (invoke/cleanupret in LLVM IR, call/rethrow in the backend) and B is an EH
+ // pad that A unwinds to.
+ DenseMap<BBOrMBB, BBOrMBB> ThrowUnwindMap;
+
+ // Helper functions
+ const BasicBlock *getEHPadUnwindDest(const BasicBlock *BB) const {
+ return EHPadUnwindMap.lookup(BB).get<const BasicBlock *>();
+ }
+ void setEHPadUnwindDest(const BasicBlock *BB, const BasicBlock *Dest) {
+ EHPadUnwindMap[BB] = Dest;
+ }
+ const BasicBlock *getThrowUnwindDest(BasicBlock *BB) const {
+ return ThrowUnwindMap.lookup(BB).get<const BasicBlock *>();
+ }
+ void setThrowUnwindDest(const BasicBlock *BB, const BasicBlock *Dest) {
+ ThrowUnwindMap[BB] = Dest;
+ }
+ bool hasEHPadUnwindDest(const BasicBlock *BB) const {
+ return EHPadUnwindMap.count(BB);
+ }
+ bool hasThrowUnwindDest(const BasicBlock *BB) const {
+ return ThrowUnwindMap.count(BB);
+ }
+
+ MachineBasicBlock *getEHPadUnwindDest(MachineBasicBlock *MBB) const {
+ return EHPadUnwindMap.lookup(MBB).get<MachineBasicBlock *>();
+ }
+ void setEHPadUnwindDest(MachineBasicBlock *MBB, MachineBasicBlock *Dest) {
+ EHPadUnwindMap[MBB] = Dest;
+ }
+ MachineBasicBlock *getThrowUnwindDest(MachineBasicBlock *MBB) const {
+ return ThrowUnwindMap.lookup(MBB).get<MachineBasicBlock *>();
+ }
+ void setThrowUnwindDest(MachineBasicBlock *MBB, MachineBasicBlock *Dest) {
+ ThrowUnwindMap[MBB] = Dest;
+ }
+ bool hasEHPadUnwindDest(MachineBasicBlock *MBB) const {
+ return EHPadUnwindMap.count(MBB);
+ }
+ bool hasThrowUnwindDest(MachineBasicBlock *MBB) const {
+ return ThrowUnwindMap.count(MBB);
+ }
+};
+
+// Analyze the IR in the given function to build WasmEHFuncInfo.
+void calculateWasmEHInfo(const Function *F, WasmEHFuncInfo &EHInfo);
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_WASMEHFUNCINFO_H
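A hedged sketch of how the new structure is filled and queried; F is a placeholder const Function pointer, EHPad and InvokeBB placeholder BasicBlock pointers, and the usual llvm using-directives are assumed:

    WasmEHFuncInfo EHInfo;
    calculateWasmEHInfo(F, EHInfo);             // derive both maps from the IR

    // Record that a throwing block unwinds to a particular EH pad.
    EHInfo.setThrowUnwindDest(InvokeBB, EHPad);

    // If EHPad does not catch, where does the exception unwind next?
    if (EHInfo.hasEHPadUnwindDest(EHPad)) {
      const BasicBlock *NextPad = EHInfo.getEHPadUnwindDest(EHPad);
      (void)NextPad;
    }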
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
index 9f3a753ad1ae..9dbeb438f4ae 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
@@ -61,6 +61,30 @@ template <typename Kind> struct RemappedRecord {
SmallVector<std::pair<uint32_t, TypeIndex>, 8> Mappings;
};
+template <typename Record, typename Func>
+Error forEachCodeViewRecord(ArrayRef<uint8_t> StreamBuffer, Func F) {
+ while (!StreamBuffer.empty()) {
+ if (StreamBuffer.size() < sizeof(RecordPrefix))
+ return make_error<CodeViewError>(cv_error_code::corrupt_record);
+
+ const RecordPrefix *Prefix =
+ reinterpret_cast<const RecordPrefix *>(StreamBuffer.data());
+
+ size_t RealLen = Prefix->RecordLen + 2;
+ if (StreamBuffer.size() < RealLen)
+ return make_error<CodeViewError>(cv_error_code::corrupt_record);
+
+ ArrayRef<uint8_t> Data = StreamBuffer.take_front(RealLen);
+ StreamBuffer = StreamBuffer.drop_front(RealLen);
+
+ Record R(static_cast<decltype(Record::Type)>((uint16_t)Prefix->RecordKind),
+ Data);
+ if (auto EC = F(R))
+ return EC;
+ }
+ return Error::success();
+}
+
/// Read a complete record from a stream at a random offset.
template <typename Kind>
inline Expected<CVRecord<Kind>> readCVRecordFromStream(BinaryStreamRef Stream,
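The new forEachCodeViewRecord helper walks a raw record buffer and hands each record to a callback. A sketch, assuming the usual llvm and llvm::codeview using-directives and a placeholder Buffer of type ArrayRef<uint8_t> holding .debug$T contents:

    Error E = forEachCodeViewRecord<CVType>(
        Buffer, [](const CVType &Record) -> Error {
          // Record.kind() and Record.data() describe one variable-length record.
          return Error::success();
        });
    if (E)
      consumeError(std::move(E));  // or propagate the corrupt_record error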
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
index df55e181364c..b765ba1abb4d 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
@@ -20,7 +20,7 @@ class TypeCollection;
class TypeVisitorCallbacks;
enum VisitorDataSource {
- VDS_BytesPresent, // The record bytes are passed into the the visitation
+ VDS_BytesPresent, // The record bytes are passed into the visitation
// function. The algorithm should first deserialize them
// before passing them on through the pipeline.
VDS_BytesExternal // The record bytes are not present, and it is the
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeView.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeView.h
index 1a4f510c24ab..4ce9f68cffd9 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeView.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeView.h
@@ -22,8 +22,8 @@
namespace llvm {
namespace codeview {
-/// Distinguishes individual records in .debug$T section or PDB type stream. The
-/// documentation and headers talk about this as the "leaf" type.
+/// Distinguishes individual records in .debug$T or .debug$P section or PDB type
+/// stream. The documentation and headers talk about this as the "leaf" type.
enum class TypeRecordKind : uint16_t {
#define TYPE_RECORD(lf_ename, value, name) name = value,
#include "CodeViewTypes.def"
@@ -531,7 +531,7 @@ enum LineFlags : uint16_t {
LF_HaveColumns = 1, // CV_LINES_HAVE_COLUMNS
};
-/// Data in the the SUBSEC_FRAMEDATA subection.
+/// Data in the SUBSEC_FRAMEDATA subsection.
struct FrameData {
support::ulittle32_t RvaStart;
support::ulittle32_t CodeSize;
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def b/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def
index 3f0660294866..6da8893bd61a 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def
@@ -15,254 +15,254 @@
#define CV_REGISTER(name, value)
#endif
-// This currently only contains the "register subset shraed by all processor
+// This currently only contains the "register subset shared by all processor
// types" (ERR etc.) and the x86 registers.
-CV_REGISTER(ERR, 30000)
-CV_REGISTER(TEB, 30001)
-CV_REGISTER(TIMER, 30002)
-CV_REGISTER(EFAD1, 30003)
-CV_REGISTER(EFAD2, 30004)
-CV_REGISTER(EFAD3, 30005)
-CV_REGISTER(VFRAME, 30006)
-CV_REGISTER(HANDLE, 30007)
-CV_REGISTER(PARAMS, 30008)
-CV_REGISTER(LOCALS, 30009)
-CV_REGISTER(TID, 30010)
-CV_REGISTER(ENV, 30011)
-CV_REGISTER(CMDLN, 30012)
+CV_REGISTER(CVRegERR, 30000)
+CV_REGISTER(CVRegTEB, 30001)
+CV_REGISTER(CVRegTIMER, 30002)
+CV_REGISTER(CVRegEFAD1, 30003)
+CV_REGISTER(CVRegEFAD2, 30004)
+CV_REGISTER(CVRegEFAD3, 30005)
+CV_REGISTER(CVRegVFRAME, 30006)
+CV_REGISTER(CVRegHANDLE, 30007)
+CV_REGISTER(CVRegPARAMS, 30008)
+CV_REGISTER(CVRegLOCALS, 30009)
+CV_REGISTER(CVRegTID, 30010)
+CV_REGISTER(CVRegENV, 30011)
+CV_REGISTER(CVRegCMDLN, 30012)
-CV_REGISTER(NONE, 0)
-CV_REGISTER(AL, 1)
-CV_REGISTER(CL, 2)
-CV_REGISTER(DL, 3)
-CV_REGISTER(BL, 4)
-CV_REGISTER(AH, 5)
-CV_REGISTER(CH, 6)
-CV_REGISTER(DH, 7)
-CV_REGISTER(BH, 8)
-CV_REGISTER(AX, 9)
-CV_REGISTER(CX, 10)
-CV_REGISTER(DX, 11)
-CV_REGISTER(BX, 12)
-CV_REGISTER(SP, 13)
-CV_REGISTER(BP, 14)
-CV_REGISTER(SI, 15)
-CV_REGISTER(DI, 16)
-CV_REGISTER(EAX, 17)
-CV_REGISTER(ECX, 18)
-CV_REGISTER(EDX, 19)
-CV_REGISTER(EBX, 20)
-CV_REGISTER(ESP, 21)
-CV_REGISTER(EBP, 22)
-CV_REGISTER(ESI, 23)
-CV_REGISTER(EDI, 24)
-CV_REGISTER(ES, 25)
-CV_REGISTER(CS, 26)
-CV_REGISTER(SS, 27)
-CV_REGISTER(DS, 28)
-CV_REGISTER(FS, 29)
-CV_REGISTER(GS, 30)
-CV_REGISTER(IP, 31)
-CV_REGISTER(FLAGS, 32)
-CV_REGISTER(EIP, 33)
-CV_REGISTER(EFLAGS, 34)
-CV_REGISTER(TEMP, 40)
-CV_REGISTER(TEMPH, 41)
-CV_REGISTER(QUOTE, 42)
-CV_REGISTER(PCDR3, 43)
-CV_REGISTER(PCDR4, 44)
-CV_REGISTER(PCDR5, 45)
-CV_REGISTER(PCDR6, 46)
-CV_REGISTER(PCDR7, 47)
-CV_REGISTER(CR0, 80)
-CV_REGISTER(CR1, 81)
-CV_REGISTER(CR2, 82)
-CV_REGISTER(CR3, 83)
-CV_REGISTER(CR4, 84)
-CV_REGISTER(DR0, 90)
-CV_REGISTER(DR1, 91)
-CV_REGISTER(DR2, 92)
-CV_REGISTER(DR3, 93)
-CV_REGISTER(DR4, 94)
-CV_REGISTER(DR5, 95)
-CV_REGISTER(DR6, 96)
-CV_REGISTER(DR7, 97)
-CV_REGISTER(GDTR, 110)
-CV_REGISTER(GDTL, 111)
-CV_REGISTER(IDTR, 112)
-CV_REGISTER(IDTL, 113)
-CV_REGISTER(LDTR, 114)
-CV_REGISTER(TR, 115)
+CV_REGISTER(CVRegNONE, 0)
+CV_REGISTER(CVRegAL, 1)
+CV_REGISTER(CVRegCL, 2)
+CV_REGISTER(CVRegDL, 3)
+CV_REGISTER(CVRegBL, 4)
+CV_REGISTER(CVRegAH, 5)
+CV_REGISTER(CVRegCH, 6)
+CV_REGISTER(CVRegDH, 7)
+CV_REGISTER(CVRegBH, 8)
+CV_REGISTER(CVRegAX, 9)
+CV_REGISTER(CVRegCX, 10)
+CV_REGISTER(CVRegDX, 11)
+CV_REGISTER(CVRegBX, 12)
+CV_REGISTER(CVRegSP, 13)
+CV_REGISTER(CVRegBP, 14)
+CV_REGISTER(CVRegSI, 15)
+CV_REGISTER(CVRegDI, 16)
+CV_REGISTER(CVRegEAX, 17)
+CV_REGISTER(CVRegECX, 18)
+CV_REGISTER(CVRegEDX, 19)
+CV_REGISTER(CVRegEBX, 20)
+CV_REGISTER(CVRegESP, 21)
+CV_REGISTER(CVRegEBP, 22)
+CV_REGISTER(CVRegESI, 23)
+CV_REGISTER(CVRegEDI, 24)
+CV_REGISTER(CVRegES, 25)
+CV_REGISTER(CVRegCS, 26)
+CV_REGISTER(CVRegSS, 27)
+CV_REGISTER(CVRegDS, 28)
+CV_REGISTER(CVRegFS, 29)
+CV_REGISTER(CVRegGS, 30)
+CV_REGISTER(CVRegIP, 31)
+CV_REGISTER(CVRegFLAGS, 32)
+CV_REGISTER(CVRegEIP, 33)
+CV_REGISTER(CVRegEFLAGS, 34)
+CV_REGISTER(CVRegTEMP, 40)
+CV_REGISTER(CVRegTEMPH, 41)
+CV_REGISTER(CVRegQUOTE, 42)
+CV_REGISTER(CVRegPCDR3, 43)
+CV_REGISTER(CVRegPCDR4, 44)
+CV_REGISTER(CVRegPCDR5, 45)
+CV_REGISTER(CVRegPCDR6, 46)
+CV_REGISTER(CVRegPCDR7, 47)
+CV_REGISTER(CVRegCR0, 80)
+CV_REGISTER(CVRegCR1, 81)
+CV_REGISTER(CVRegCR2, 82)
+CV_REGISTER(CVRegCR3, 83)
+CV_REGISTER(CVRegCR4, 84)
+CV_REGISTER(CVRegDR0, 90)
+CV_REGISTER(CVRegDR1, 91)
+CV_REGISTER(CVRegDR2, 92)
+CV_REGISTER(CVRegDR3, 93)
+CV_REGISTER(CVRegDR4, 94)
+CV_REGISTER(CVRegDR5, 95)
+CV_REGISTER(CVRegDR6, 96)
+CV_REGISTER(CVRegDR7, 97)
+CV_REGISTER(CVRegGDTR, 110)
+CV_REGISTER(CVRegGDTL, 111)
+CV_REGISTER(CVRegIDTR, 112)
+CV_REGISTER(CVRegIDTL, 113)
+CV_REGISTER(CVRegLDTR, 114)
+CV_REGISTER(CVRegTR, 115)
-CV_REGISTER(PSEUDO1, 116)
-CV_REGISTER(PSEUDO2, 117)
-CV_REGISTER(PSEUDO3, 118)
-CV_REGISTER(PSEUDO4, 119)
-CV_REGISTER(PSEUDO5, 120)
-CV_REGISTER(PSEUDO6, 121)
-CV_REGISTER(PSEUDO7, 122)
-CV_REGISTER(PSEUDO8, 123)
-CV_REGISTER(PSEUDO9, 124)
+CV_REGISTER(CVRegPSEUDO1, 116)
+CV_REGISTER(CVRegPSEUDO2, 117)
+CV_REGISTER(CVRegPSEUDO3, 118)
+CV_REGISTER(CVRegPSEUDO4, 119)
+CV_REGISTER(CVRegPSEUDO5, 120)
+CV_REGISTER(CVRegPSEUDO6, 121)
+CV_REGISTER(CVRegPSEUDO7, 122)
+CV_REGISTER(CVRegPSEUDO8, 123)
+CV_REGISTER(CVRegPSEUDO9, 124)
-CV_REGISTER(ST0, 128)
-CV_REGISTER(ST1, 129)
-CV_REGISTER(ST2, 130)
-CV_REGISTER(ST3, 131)
-CV_REGISTER(ST4, 132)
-CV_REGISTER(ST5, 133)
-CV_REGISTER(ST6, 134)
-CV_REGISTER(ST7, 135)
-CV_REGISTER(CTRL, 136)
-CV_REGISTER(STAT, 137)
-CV_REGISTER(TAG, 138)
-CV_REGISTER(FPIP, 139)
-CV_REGISTER(FPCS, 140)
-CV_REGISTER(FPDO, 141)
-CV_REGISTER(FPDS, 142)
-CV_REGISTER(ISEM, 143)
-CV_REGISTER(FPEIP, 144)
-CV_REGISTER(FPEDO, 145)
+CV_REGISTER(CVRegST0, 128)
+CV_REGISTER(CVRegST1, 129)
+CV_REGISTER(CVRegST2, 130)
+CV_REGISTER(CVRegST3, 131)
+CV_REGISTER(CVRegST4, 132)
+CV_REGISTER(CVRegST5, 133)
+CV_REGISTER(CVRegST6, 134)
+CV_REGISTER(CVRegST7, 135)
+CV_REGISTER(CVRegCTRL, 136)
+CV_REGISTER(CVRegSTAT, 137)
+CV_REGISTER(CVRegTAG, 138)
+CV_REGISTER(CVRegFPIP, 139)
+CV_REGISTER(CVRegFPCS, 140)
+CV_REGISTER(CVRegFPDO, 141)
+CV_REGISTER(CVRegFPDS, 142)
+CV_REGISTER(CVRegISEM, 143)
+CV_REGISTER(CVRegFPEIP, 144)
+CV_REGISTER(CVRegFPEDO, 145)
-CV_REGISTER(MM0, 146)
-CV_REGISTER(MM1, 147)
-CV_REGISTER(MM2, 148)
-CV_REGISTER(MM3, 149)
-CV_REGISTER(MM4, 150)
-CV_REGISTER(MM5, 151)
-CV_REGISTER(MM6, 152)
-CV_REGISTER(MM7, 153)
+CV_REGISTER(CVRegMM0, 146)
+CV_REGISTER(CVRegMM1, 147)
+CV_REGISTER(CVRegMM2, 148)
+CV_REGISTER(CVRegMM3, 149)
+CV_REGISTER(CVRegMM4, 150)
+CV_REGISTER(CVRegMM5, 151)
+CV_REGISTER(CVRegMM6, 152)
+CV_REGISTER(CVRegMM7, 153)
-CV_REGISTER(XMM0, 154)
-CV_REGISTER(XMM1, 155)
-CV_REGISTER(XMM2, 156)
-CV_REGISTER(XMM3, 157)
-CV_REGISTER(XMM4, 158)
-CV_REGISTER(XMM5, 159)
-CV_REGISTER(XMM6, 160)
-CV_REGISTER(XMM7, 161)
+CV_REGISTER(CVRegXMM0, 154)
+CV_REGISTER(CVRegXMM1, 155)
+CV_REGISTER(CVRegXMM2, 156)
+CV_REGISTER(CVRegXMM3, 157)
+CV_REGISTER(CVRegXMM4, 158)
+CV_REGISTER(CVRegXMM5, 159)
+CV_REGISTER(CVRegXMM6, 160)
+CV_REGISTER(CVRegXMM7, 161)
-CV_REGISTER(MXCSR, 211)
+CV_REGISTER(CVRegMXCSR, 211)
-CV_REGISTER(EDXEAX, 212)
+CV_REGISTER(CVRegEDXEAX, 212)
-CV_REGISTER(EMM0L, 220)
-CV_REGISTER(EMM1L, 221)
-CV_REGISTER(EMM2L, 222)
-CV_REGISTER(EMM3L, 223)
-CV_REGISTER(EMM4L, 224)
-CV_REGISTER(EMM5L, 225)
-CV_REGISTER(EMM6L, 226)
-CV_REGISTER(EMM7L, 227)
+CV_REGISTER(CVRegEMM0L, 220)
+CV_REGISTER(CVRegEMM1L, 221)
+CV_REGISTER(CVRegEMM2L, 222)
+CV_REGISTER(CVRegEMM3L, 223)
+CV_REGISTER(CVRegEMM4L, 224)
+CV_REGISTER(CVRegEMM5L, 225)
+CV_REGISTER(CVRegEMM6L, 226)
+CV_REGISTER(CVRegEMM7L, 227)
-CV_REGISTER(EMM0H, 228)
-CV_REGISTER(EMM1H, 229)
-CV_REGISTER(EMM2H, 230)
-CV_REGISTER(EMM3H, 231)
-CV_REGISTER(EMM4H, 232)
-CV_REGISTER(EMM5H, 233)
-CV_REGISTER(EMM6H, 234)
-CV_REGISTER(EMM7H, 235)
+CV_REGISTER(CVRegEMM0H, 228)
+CV_REGISTER(CVRegEMM1H, 229)
+CV_REGISTER(CVRegEMM2H, 230)
+CV_REGISTER(CVRegEMM3H, 231)
+CV_REGISTER(CVRegEMM4H, 232)
+CV_REGISTER(CVRegEMM5H, 233)
+CV_REGISTER(CVRegEMM6H, 234)
+CV_REGISTER(CVRegEMM7H, 235)
-CV_REGISTER(MM00, 236)
-CV_REGISTER(MM01, 237)
-CV_REGISTER(MM10, 238)
-CV_REGISTER(MM11, 239)
-CV_REGISTER(MM20, 240)
-CV_REGISTER(MM21, 241)
-CV_REGISTER(MM30, 242)
-CV_REGISTER(MM31, 243)
-CV_REGISTER(MM40, 244)
-CV_REGISTER(MM41, 245)
-CV_REGISTER(MM50, 246)
-CV_REGISTER(MM51, 247)
-CV_REGISTER(MM60, 248)
-CV_REGISTER(MM61, 249)
-CV_REGISTER(MM70, 250)
-CV_REGISTER(MM71, 251)
+CV_REGISTER(CVRegMM00, 236)
+CV_REGISTER(CVRegMM01, 237)
+CV_REGISTER(CVRegMM10, 238)
+CV_REGISTER(CVRegMM11, 239)
+CV_REGISTER(CVRegMM20, 240)
+CV_REGISTER(CVRegMM21, 241)
+CV_REGISTER(CVRegMM30, 242)
+CV_REGISTER(CVRegMM31, 243)
+CV_REGISTER(CVRegMM40, 244)
+CV_REGISTER(CVRegMM41, 245)
+CV_REGISTER(CVRegMM50, 246)
+CV_REGISTER(CVRegMM51, 247)
+CV_REGISTER(CVRegMM60, 248)
+CV_REGISTER(CVRegMM61, 249)
+CV_REGISTER(CVRegMM70, 250)
+CV_REGISTER(CVRegMM71, 251)
-CV_REGISTER(BND0, 396)
-CV_REGISTER(BND1, 397)
-CV_REGISTER(BND2, 398)
+CV_REGISTER(CVRegBND0, 396)
+CV_REGISTER(CVRegBND1, 397)
+CV_REGISTER(CVRegBND2, 398)
-CV_REGISTER(XMM8, 252)
-CV_REGISTER(XMM9, 253)
-CV_REGISTER(XMM10, 254)
-CV_REGISTER(XMM11, 255)
-CV_REGISTER(XMM12, 256)
-CV_REGISTER(XMM13, 257)
-CV_REGISTER(XMM14, 258)
-CV_REGISTER(XMM15, 259)
+CV_REGISTER(CVRegXMM8, 252)
+CV_REGISTER(CVRegXMM9, 253)
+CV_REGISTER(CVRegXMM10, 254)
+CV_REGISTER(CVRegXMM11, 255)
+CV_REGISTER(CVRegXMM12, 256)
+CV_REGISTER(CVRegXMM13, 257)
+CV_REGISTER(CVRegXMM14, 258)
+CV_REGISTER(CVRegXMM15, 259)
-CV_REGISTER(SIL, 324)
-CV_REGISTER(DIL, 325)
-CV_REGISTER(BPL, 326)
-CV_REGISTER(SPL, 327)
+CV_REGISTER(CVRegSIL, 324)
+CV_REGISTER(CVRegDIL, 325)
+CV_REGISTER(CVRegBPL, 326)
+CV_REGISTER(CVRegSPL, 327)
-CV_REGISTER(RAX, 328)
-CV_REGISTER(RBX, 329)
-CV_REGISTER(RCX, 330)
-CV_REGISTER(RDX, 331)
-CV_REGISTER(RSI, 332)
-CV_REGISTER(RDI, 333)
-CV_REGISTER(RBP, 334)
-CV_REGISTER(RSP, 335)
+CV_REGISTER(CVRegRAX, 328)
+CV_REGISTER(CVRegRBX, 329)
+CV_REGISTER(CVRegRCX, 330)
+CV_REGISTER(CVRegRDX, 331)
+CV_REGISTER(CVRegRSI, 332)
+CV_REGISTER(CVRegRDI, 333)
+CV_REGISTER(CVRegRBP, 334)
+CV_REGISTER(CVRegRSP, 335)
-CV_REGISTER(R8, 336)
-CV_REGISTER(R9, 337)
-CV_REGISTER(R10, 338)
-CV_REGISTER(R11, 339)
-CV_REGISTER(R12, 340)
-CV_REGISTER(R13, 341)
-CV_REGISTER(R14, 342)
-CV_REGISTER(R15, 343)
+CV_REGISTER(CVRegR8, 336)
+CV_REGISTER(CVRegR9, 337)
+CV_REGISTER(CVRegR10, 338)
+CV_REGISTER(CVRegR11, 339)
+CV_REGISTER(CVRegR12, 340)
+CV_REGISTER(CVRegR13, 341)
+CV_REGISTER(CVRegR14, 342)
+CV_REGISTER(CVRegR15, 343)
-CV_REGISTER(R8B, 344)
-CV_REGISTER(R9B, 345)
-CV_REGISTER(R10B, 346)
-CV_REGISTER(R11B, 347)
-CV_REGISTER(R12B, 348)
-CV_REGISTER(R13B, 349)
-CV_REGISTER(R14B, 350)
-CV_REGISTER(R15B, 351)
+CV_REGISTER(CVRegR8B, 344)
+CV_REGISTER(CVRegR9B, 345)
+CV_REGISTER(CVRegR10B, 346)
+CV_REGISTER(CVRegR11B, 347)
+CV_REGISTER(CVRegR12B, 348)
+CV_REGISTER(CVRegR13B, 349)
+CV_REGISTER(CVRegR14B, 350)
+CV_REGISTER(CVRegR15B, 351)
-CV_REGISTER(R8W, 352)
-CV_REGISTER(R9W, 353)
-CV_REGISTER(R10W, 354)
-CV_REGISTER(R11W, 355)
-CV_REGISTER(R12W, 356)
-CV_REGISTER(R13W, 357)
-CV_REGISTER(R14W, 358)
-CV_REGISTER(R15W, 359)
+CV_REGISTER(CVRegR8W, 352)
+CV_REGISTER(CVRegR9W, 353)
+CV_REGISTER(CVRegR10W, 354)
+CV_REGISTER(CVRegR11W, 355)
+CV_REGISTER(CVRegR12W, 356)
+CV_REGISTER(CVRegR13W, 357)
+CV_REGISTER(CVRegR14W, 358)
+CV_REGISTER(CVRegR15W, 359)
-CV_REGISTER(R8D, 360)
-CV_REGISTER(R9D, 361)
-CV_REGISTER(R10D, 362)
-CV_REGISTER(R11D, 363)
-CV_REGISTER(R12D, 364)
-CV_REGISTER(R13D, 365)
-CV_REGISTER(R14D, 366)
-CV_REGISTER(R15D, 367)
+CV_REGISTER(CVRegR8D, 360)
+CV_REGISTER(CVRegR9D, 361)
+CV_REGISTER(CVRegR10D, 362)
+CV_REGISTER(CVRegR11D, 363)
+CV_REGISTER(CVRegR12D, 364)
+CV_REGISTER(CVRegR13D, 365)
+CV_REGISTER(CVRegR14D, 366)
+CV_REGISTER(CVRegR15D, 367)
// cvconst.h defines both CV_REG_YMM0 (252) and CV_AMD64_YMM0 (368). Keep the
// original prefix to distinguish them.
-CV_REGISTER(AMD64_YMM0, 368)
-CV_REGISTER(AMD64_YMM1, 369)
-CV_REGISTER(AMD64_YMM2, 370)
-CV_REGISTER(AMD64_YMM3, 371)
-CV_REGISTER(AMD64_YMM4, 372)
-CV_REGISTER(AMD64_YMM5, 373)
-CV_REGISTER(AMD64_YMM6, 374)
-CV_REGISTER(AMD64_YMM7, 375)
-CV_REGISTER(AMD64_YMM8, 376)
-CV_REGISTER(AMD64_YMM9, 377)
-CV_REGISTER(AMD64_YMM10, 378)
-CV_REGISTER(AMD64_YMM11, 379)
-CV_REGISTER(AMD64_YMM12, 380)
-CV_REGISTER(AMD64_YMM13, 381)
-CV_REGISTER(AMD64_YMM14, 382)
-CV_REGISTER(AMD64_YMM15, 383)
+CV_REGISTER(CVRegAMD64_YMM0, 368)
+CV_REGISTER(CVRegAMD64_YMM1, 369)
+CV_REGISTER(CVRegAMD64_YMM2, 370)
+CV_REGISTER(CVRegAMD64_YMM3, 371)
+CV_REGISTER(CVRegAMD64_YMM4, 372)
+CV_REGISTER(CVRegAMD64_YMM5, 373)
+CV_REGISTER(CVRegAMD64_YMM6, 374)
+CV_REGISTER(CVRegAMD64_YMM7, 375)
+CV_REGISTER(CVRegAMD64_YMM8, 376)
+CV_REGISTER(CVRegAMD64_YMM9, 377)
+CV_REGISTER(CVRegAMD64_YMM10, 378)
+CV_REGISTER(CVRegAMD64_YMM11, 379)
+CV_REGISTER(CVRegAMD64_YMM12, 380)
+CV_REGISTER(CVRegAMD64_YMM13, 381)
+CV_REGISTER(CVRegAMD64_YMM14, 382)
+CV_REGISTER(CVRegAMD64_YMM15, 383)
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewTypes.def b/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewTypes.def
index 69ce9606a670..e9a479dba496 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewTypes.def
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/CodeViewTypes.def
@@ -87,6 +87,8 @@ TYPE_RECORD(LF_UDT_MOD_SRC_LINE, 0x1607, UdtModSourceLine)
TYPE_RECORD(LF_METHODLIST, 0x1206, MethodOverloadList)
+TYPE_RECORD(LF_PRECOMP, 0x1509, Precomp)
+TYPE_RECORD(LF_ENDPRECOMP, 0x0014, EndPrecomp)
// 16 bit type records.
CV_TYPE(LF_MODIFIER_16t, 0x0001)
@@ -106,7 +108,6 @@ CV_TYPE(LF_NOTTRAN, 0x0010)
CV_TYPE(LF_DIMARRAY_16t, 0x0011)
CV_TYPE(LF_VFTPATH_16t, 0x0012)
CV_TYPE(LF_PRECOMP_16t, 0x0013)
-CV_TYPE(LF_ENDPRECOMP, 0x0014)
CV_TYPE(LF_OEM_16t, 0x0015)
CV_TYPE(LF_TYPESERVER_ST, 0x0016)
@@ -181,7 +182,6 @@ CV_TYPE(LF_MANAGED_ST, 0x140f)
CV_TYPE(LF_ST_MAX, 0x1500)
CV_TYPE(LF_TYPESERVER, 0x1501)
CV_TYPE(LF_DIMARRAY, 0x1508)
-CV_TYPE(LF_PRECOMP, 0x1509)
CV_TYPE(LF_ALIAS, 0x150a)
CV_TYPE(LF_DEFARG, 0x150b)
CV_TYPE(LF_FRIENDFCN, 0x150c)
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h
index 7f0f10e4fbfa..bebc960223cc 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h
@@ -10,6 +10,7 @@
#ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGSTRINGTABLESUBSECTION_H
#define LLVM_DEBUGINFO_CODEVIEW_DEBUGSTRINGTABLESUBSECTION_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
@@ -66,19 +67,26 @@ public:
uint32_t insert(StringRef S);
// Return the ID for string S. Assumes S exists in the table.
- uint32_t getStringId(StringRef S) const;
+ uint32_t getIdForString(StringRef S) const;
+
+ StringRef getStringForId(uint32_t Id) const;
uint32_t calculateSerializedSize() const override;
Error commit(BinaryStreamWriter &Writer) const override;
uint32_t size() const;
- StringMap<uint32_t>::const_iterator begin() const { return Strings.begin(); }
+ StringMap<uint32_t>::const_iterator begin() const {
+ return StringToId.begin();
+ }
+
+ StringMap<uint32_t>::const_iterator end() const { return StringToId.end(); }
- StringMap<uint32_t>::const_iterator end() const { return Strings.end(); }
+ std::vector<uint32_t> sortedIds() const;
private:
- StringMap<uint32_t> Strings;
+ DenseMap<uint32_t, StringRef> IdToString;
+ StringMap<uint32_t> StringToId;
uint32_t StringSize = 1;
};
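A short sketch of the renamed lookup interface; StrTab is a placeholder DebugStringTableSubsection:

    uint32_t Id = StrTab.insert("foo.cpp");            // add (or re-find) a string
    uint32_t Same = StrTab.getIdForString("foo.cpp");  // renamed reverse lookup
    StringRef S = StrTab.getStringForId(Id);           // new forward lookup by id
    (void)Same; (void)S;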
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h
index d8ac3343c15f..c4704168ed34 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h
@@ -69,9 +69,22 @@ public:
ArrayRef<ArrayRef<uint8_t>> records() const;
ArrayRef<GloballyHashedType> hashes() const;
- using CreateRecord = llvm::function_ref<ArrayRef<uint8_t>()>;
+ template <typename CreateFunc>
+ TypeIndex insertRecordAs(GloballyHashedType Hash, size_t RecordSize,
+ CreateFunc Create) {
+ auto Result = HashedRecords.try_emplace(Hash, nextTypeIndex());
+
+ if (LLVM_UNLIKELY(Result.second)) {
+ uint8_t *Stable = RecordStorage.Allocate<uint8_t>(RecordSize);
+ MutableArrayRef<uint8_t> Data(Stable, RecordSize);
+ SeenRecords.push_back(Create(Data));
+ SeenHashes.push_back(Hash);
+ }
+
+ // Update the caller's copy of Record to point to a stable copy.
+ return Result.first->second;
+ }
- TypeIndex insertRecordAs(GloballyHashedType Hash, CreateRecord Create);
TypeIndex insertRecordBytes(ArrayRef<uint8_t> Data);
TypeIndex insertRecord(ContinuationRecordBuilder &Builder);
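The inlined insertRecordAs above allocates stable storage of RecordSize bytes and only invokes the callback for a hash it has not seen before; the callback must return the serialized record it wrote into that storage. A sketch with placeholder names TypeTable, Hash and RecordData:

    TypeIndex TI = TypeTable.insertRecordAs(
        Hash, RecordData.size(), [&](MutableArrayRef<uint8_t> Storage) {
          std::memcpy(Storage.data(), RecordData.data(), RecordData.size());
          return ArrayRef<uint8_t>(Storage);  // stable copy now owned by the builder
        });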
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
index 16d78692c839..383f7dd9fb6a 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
@@ -26,7 +26,7 @@
namespace llvm {
namespace codeview {
-/// \brief Provides amortized O(1) random access to a CodeView type stream.
+/// Provides amortized O(1) random access to a CodeView type stream.
/// Normally to access a type from a type stream, you must know its byte
/// offset into the type stream, because type records are variable-lengthed.
/// However, this is not the way we prefer to access them. For example, given
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h
index 741337533701..1f732d29a538 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h
@@ -58,7 +58,10 @@ struct LocallyHashedType {
}
};
-enum class GlobalTypeHashAlg : uint16_t { SHA1 = 0 };
+enum class GlobalTypeHashAlg : uint16_t {
+ SHA1 = 0, // standard 20-byte SHA1 hash
+ SHA1_8 // last 8-bytes of standard SHA1 hash
+};
/// A globally hashed type represents a hash value that is sufficient to
/// uniquely identify a record across multiple type streams or type sequences.
@@ -77,10 +80,10 @@ struct GloballyHashedType {
GloballyHashedType(StringRef H)
: GloballyHashedType(ArrayRef<uint8_t>(H.bytes_begin(), H.bytes_end())) {}
GloballyHashedType(ArrayRef<uint8_t> H) {
- assert(H.size() == 20);
- ::memcpy(Hash.data(), H.data(), 20);
+ assert(H.size() == 8);
+ ::memcpy(Hash.data(), H.data(), 8);
}
- std::array<uint8_t, 20> Hash;
+ std::array<uint8_t, 8> Hash;
/// Given a sequence of bytes representing a record, compute a global hash for
/// this record. Due to the nature of global hashes incorporating the hashes
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
index 508bdd395f74..61ebdf878ce7 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -330,6 +330,10 @@ public:
return !!(Attrs & uint32_t(PointerOptions::Unaligned));
}
+ bool isRestrict() const {
+ return !!(Attrs & uint32_t(PointerOptions::Restrict));
+ }
+
TypeIndex ReferentType;
uint32_t Attrs;
Optional<MemberPointerInfo> MemberInfo;
@@ -892,6 +896,33 @@ public:
TypeIndex ContinuationIndex;
};
+// LF_PRECOMP
+class PrecompRecord : public TypeRecord {
+public:
+ PrecompRecord() = default;
+ explicit PrecompRecord(TypeRecordKind Kind) : TypeRecord(Kind) {}
+
+ uint32_t getStartTypeIndex() const { return StartTypeIndex; }
+ uint32_t getTypesCount() const { return TypesCount; }
+ uint32_t getSignature() const { return Signature; }
+ StringRef getPrecompFilePath() const { return PrecompFilePath; }
+
+ uint32_t StartTypeIndex;
+ uint32_t TypesCount;
+ uint32_t Signature;
+ StringRef PrecompFilePath;
+};
+
+// LF_ENDPRECOMP
+class EndPrecompRecord : public TypeRecord {
+public:
+ EndPrecompRecord() = default;
+ explicit EndPrecompRecord(TypeRecordKind Kind) : TypeRecord(Kind) {}
+
+ uint32_t getSignature() const { return Signature; }
+
+ uint32_t Signature;
+};
} // end namespace codeview
} // end namespace llvm
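A brief sketch of consuming the two new records once deserialized; PR and EPR are placeholder PrecompRecord and EndPrecompRecord values:

    StringRef PCHPath = PR.getPrecompFilePath();      // object providing the precompiled types
    uint32_t Expected = PR.getSignature();
    bool Matches = (Expected == EPR.getSignature());  // expected to match the provider's LF_ENDPRECOMP
    (void)PCHPath; (void)Matches;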
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
index 59e216abcb11..583740d2eb4b 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
+++ b/contrib/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
@@ -23,7 +23,7 @@ struct GloballyHashedType;
class GlobalTypeTableBuilder;
class MergingTypeTableBuilder;
-/// \brief Merge one set of type records into another. This method assumes
+/// Merge one set of type records into another. This method assumes
/// that all records are type records, and there are no Id records present.
///
/// \param Dest The table to store the re-written type records into.
@@ -40,7 +40,7 @@ Error mergeTypeRecords(MergingTypeTableBuilder &Dest,
SmallVectorImpl<TypeIndex> &SourceToDest,
const CVTypeArray &Types);
-/// \brief Merge one set of id records into another. This method assumes
+/// Merge one set of id records into another. This method assumes
/// that all records are id records, and there are no Type records present.
/// However, since Id records can refer back to Type records, this method
/// assumes that the referenced type records have also been merged into
@@ -65,7 +65,7 @@ Error mergeIdRecords(MergingTypeTableBuilder &Dest, ArrayRef<TypeIndex> Types,
SmallVectorImpl<TypeIndex> &SourceToDest,
const CVTypeArray &Ids);
-/// \brief Merge a unified set of type and id records, splitting them into
+/// Merge a unified set of type and id records, splitting them into
/// separate output streams.
///
/// \param DestIds The table to store the re-written id records into.
diff --git a/contrib/llvm/include/llvm/DebugInfo/DIContext.h b/contrib/llvm/include/llvm/DebugInfo/DIContext.h
index abace9378607..f89eb34fdd77 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DIContext.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DIContext.h
@@ -31,6 +31,7 @@ namespace llvm {
struct DILineInfo {
std::string FileName;
std::string FunctionName;
+ Optional<StringRef> Source;
uint32_t Line = 0;
uint32_t Column = 0;
uint32_t StartLine = 0;
@@ -159,6 +160,7 @@ struct DIDumpOptions {
bool ShowForm = false;
bool SummarizeTypes = false;
bool Verbose = false;
+ bool DisplayRawContents = false;
/// Return default option set for printing a single DIE without children.
static DIDumpOptions getForSingleDIE() {
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
index 0bade10f6201..1d448728338f 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
@@ -10,6 +10,7 @@
#ifndef LLVM_DEBUGINFO_DWARFACCELERATORTABLE_H
#define LLVM_DEBUGINFO_DWARFACCELERATORTABLE_H
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
@@ -20,18 +21,76 @@
namespace llvm {
class raw_ostream;
+class ScopedPrinter;
+
+/// The accelerator tables are designed to allow efficient random access
+/// (using a symbol name as a key) into debug info by providing an index of the
+/// debug info DIEs. This class implements the common functionality of Apple and
+/// DWARF 5 accelerator tables.
+/// TODO: Generalize the rest of the AppleAcceleratorTable interface and move it
+/// to this class.
+class DWARFAcceleratorTable {
+protected:
+ DWARFDataExtractor AccelSection;
+ DataExtractor StringSection;
+
+public:
+ /// An abstract class representing a single entry in the accelerator tables.
+ class Entry {
+ protected:
+ SmallVector<DWARFFormValue, 3> Values;
+
+ Entry() = default;
+
+ // Make these protected so only (final) subclasses can be copied around.
+ Entry(const Entry &) = default;
+ Entry(Entry &&) = default;
+ Entry &operator=(const Entry &) = default;
+ Entry &operator=(Entry &&) = default;
+ ~Entry() = default;
+
+
+ public:
+ /// Returns the Offset of the Compilation Unit associated with this
+ /// Accelerator Entry or None if the Compilation Unit offset is not recorded
+ /// in this Accelerator Entry.
+ virtual Optional<uint64_t> getCUOffset() const = 0;
+
+ /// Returns the Tag of the Debug Info Entry associated with this
+ /// Accelerator Entry or None if the Tag is not recorded in this
+ /// Accelerator Entry.
+ virtual Optional<dwarf::Tag> getTag() const = 0;
+
+ /// Returns the raw values of fields in the Accelerator Entry. In general,
+ /// these can only be interpreted with the help of the metadata in the
+ /// owning Accelerator Table.
+ ArrayRef<DWARFFormValue> getValues() const { return Values; }
+ };
+
+ DWARFAcceleratorTable(const DWARFDataExtractor &AccelSection,
+ DataExtractor StringSection)
+ : AccelSection(AccelSection), StringSection(StringSection) {}
+ virtual ~DWARFAcceleratorTable();
+
+ virtual llvm::Error extract() = 0;
+ virtual void dump(raw_ostream &OS) const = 0;
+
+ DWARFAcceleratorTable(const DWARFAcceleratorTable &) = delete;
+ void operator=(const DWARFAcceleratorTable &) = delete;
+};
/// This implements the Apple accelerator table format, a precursor of the
/// DWARF 5 accelerator table format.
-/// TODO: Factor out a common base class for both formats.
-class DWARFAcceleratorTable {
+class AppleAcceleratorTable : public DWARFAcceleratorTable {
struct Header {
uint32_t Magic;
uint16_t Version;
uint16_t HashFunction;
- uint32_t NumBuckets;
- uint32_t NumHashes;
+ uint32_t BucketCount;
+ uint32_t HashCount;
uint32_t HeaderDataLength;
+
+ void dump(ScopedPrinter &W) const;
};
struct HeaderData {
@@ -40,22 +99,51 @@ class DWARFAcceleratorTable {
uint32_t DIEOffsetBase;
SmallVector<std::pair<AtomType, Form>, 3> Atoms;
+
+ Optional<uint64_t> extractOffset(Optional<DWARFFormValue> Value) const;
};
struct Header Hdr;
struct HeaderData HdrData;
- DWARFDataExtractor AccelSection;
- DataExtractor StringSection;
bool IsValid = false;
+ /// Returns true if we should continue scanning for entries or false if we've
+ /// reached the last (sentinel) entry or encountered a parsing error.
+ bool dumpName(ScopedPrinter &W, SmallVectorImpl<DWARFFormValue> &AtomForms,
+ uint32_t *DataOffset) const;
+
public:
- /// An iterator for the entries associated with one key. Each entry can have
- /// multiple DWARFFormValues.
- class ValueIterator : public std::iterator<std::input_iterator_tag,
- ArrayRef<DWARFFormValue>> {
- const DWARFAcceleratorTable *AccelTable = nullptr;
- SmallVector<DWARFFormValue, 3> AtomForms; ///< The decoded data entry.
+ /// Apple-specific implementation of an Accelerator Entry.
+ class Entry final : public DWARFAcceleratorTable::Entry {
+ const HeaderData *HdrData = nullptr;
+
+ Entry(const HeaderData &Data);
+ Entry() = default;
+
+ void extract(const AppleAcceleratorTable &AccelTable, uint32_t *Offset);
+
+ public:
+ Optional<uint64_t> getCUOffset() const override;
+
+ /// Returns the Section Offset of the Debug Info Entry associated with this
+ /// Accelerator Entry or None if the DIE offset is not recorded in this
+ /// Accelerator Entry. The returned offset is relative to the start of the
+ /// Section containing the DIE.
+ Optional<uint64_t> getDIESectionOffset() const;
+ Optional<dwarf::Tag> getTag() const override;
+
+ /// Returns the value of the Atom in this Accelerator Entry, if the Entry
+ /// contains such Atom.
+ Optional<DWARFFormValue> lookup(HeaderData::AtomType Atom) const;
+
+ friend class AppleAcceleratorTable;
+ friend class ValueIterator;
+ };
+
+ class ValueIterator : public std::iterator<std::input_iterator_tag, Entry> {
+ const AppleAcceleratorTable *AccelTable = nullptr;
+ Entry Current; ///< The current entry.
unsigned DataOffset = 0; ///< Offset into the section.
unsigned Data = 0; ///< Current data entry.
unsigned NumData = 0; ///< Number of data entries.
@@ -64,13 +152,11 @@ public:
void Next();
public:
/// Construct a new iterator for the entries at \p DataOffset.
- ValueIterator(const DWARFAcceleratorTable &AccelTable, unsigned DataOffset);
+ ValueIterator(const AppleAcceleratorTable &AccelTable, unsigned DataOffset);
/// End marker.
ValueIterator() = default;
- const ArrayRef<DWARFFormValue> operator*() const {
- return AtomForms;
- }
+ const Entry &operator*() const { return Current; }
ValueIterator &operator++() { Next(); return *this; }
ValueIterator operator++(int) {
ValueIterator I = *this;
@@ -85,16 +171,18 @@ public:
}
};
-
- DWARFAcceleratorTable(const DWARFDataExtractor &AccelSection,
+ AppleAcceleratorTable(const DWARFDataExtractor &AccelSection,
DataExtractor StringSection)
- : AccelSection(AccelSection), StringSection(StringSection) {}
+ : DWARFAcceleratorTable(AccelSection, StringSection) {}
- llvm::Error extract();
+ llvm::Error extract() override;
uint32_t getNumBuckets();
uint32_t getNumHashes();
uint32_t getSizeHdr();
uint32_t getHeaderDataLength();
+
+ /// Return the Atom description, which can be used to interpret the raw values
+ /// of the Accelerator Entries in this table.
ArrayRef<std::pair<HeaderData::AtomType, HeaderData::Form>> getAtomsDesc();
bool validateForms();
@@ -107,10 +195,404 @@ public:
/// related to the input hash data offset.
/// DieTag is the tag of the DIE
std::pair<uint32_t, dwarf::Tag> readAtoms(uint32_t &HashDataOffset);
- void dump(raw_ostream &OS) const;
+ void dump(raw_ostream &OS) const override;
+
+ /// Look up all entries in the accelerator table matching \c Key.
+ iterator_range<ValueIterator> equal_range(StringRef Key) const;
+};
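A sketch of a name lookup through the refactored Apple table; AccelTable is a placeholder AppleAcceleratorTable on which extract() has already succeeded, and the usual llvm using-directives are assumed:

    for (const AppleAcceleratorTable::Entry &E : AccelTable.equal_range("main")) {
      Optional<uint64_t> DIEOffset = E.getDIESectionOffset();  // DIE offset, if recorded
      Optional<dwarf::Tag> Tag = E.getTag();                   // DIE tag, if recorded
      (void)DIEOffset; (void)Tag;
    }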
+
+/// .debug_names section consists of one or more units. Each unit starts with a
+/// header, which is followed by a list of compilation units, local and foreign
+/// type units.
+///
+/// These may be followed by an (optional) hash lookup table, which consists of
+/// an array of buckets and hashes similar to the apple tables above. The only
+/// difference is that the hashes array is 1-based, and consequently an empty
+/// bucket is denoted by 0 and not UINT32_MAX.
+///
+/// Next is the name table, which consists of an array of names and array of
+/// entry offsets. This is different from the apple tables, which store names
+/// next to the actual entries.
+///
+/// The structure of the entries is described by an abbreviations table, which
+/// comes after the name table. Unlike the apple tables, which have a uniform
+/// entry structure described in the header, each .debug_names entry may have
+/// different index attributes (DW_IDX_???) attached to it.
+///
+/// The last segment consists of a list of entries, which is a 0-terminated list
+/// referenced by the name table and interpreted with the help of the
+/// abbreviation table.
+class DWARFDebugNames : public DWARFAcceleratorTable {
+ /// The fixed-size part of a Dwarf 5 Name Index header
+ struct HeaderPOD {
+ uint32_t UnitLength;
+ uint16_t Version;
+ uint16_t Padding;
+ uint32_t CompUnitCount;
+ uint32_t LocalTypeUnitCount;
+ uint32_t ForeignTypeUnitCount;
+ uint32_t BucketCount;
+ uint32_t NameCount;
+ uint32_t AbbrevTableSize;
+ uint32_t AugmentationStringSize;
+ };
+
+public:
+ class NameIndex;
+ class NameIterator;
+ class ValueIterator;
+
+ /// Dwarf 5 Name Index header.
+ struct Header : public HeaderPOD {
+ SmallString<8> AugmentationString;
+
+ Error extract(const DWARFDataExtractor &AS, uint32_t *Offset);
+ void dump(ScopedPrinter &W) const;
+ };
+
+ /// Index attribute and its encoding.
+ struct AttributeEncoding {
+ dwarf::Index Index;
+ dwarf::Form Form;
+
+ constexpr AttributeEncoding(dwarf::Index Index, dwarf::Form Form)
+ : Index(Index), Form(Form) {}
+
+ friend bool operator==(const AttributeEncoding &LHS,
+ const AttributeEncoding &RHS) {
+ return LHS.Index == RHS.Index && LHS.Form == RHS.Form;
+ }
+ };
+
+ /// Abbreviation describing the encoding of Name Index entries.
+ struct Abbrev {
+ uint32_t Code; ///< Abbreviation code
+ dwarf::Tag Tag; ///< Dwarf Tag of the described entity.
+ std::vector<AttributeEncoding> Attributes; ///< List of index attributes.
+
+ Abbrev(uint32_t Code, dwarf::Tag Tag,
+ std::vector<AttributeEncoding> Attributes)
+ : Code(Code), Tag(Tag), Attributes(std::move(Attributes)) {}
+
+ void dump(ScopedPrinter &W) const;
+ };
+
+ /// DWARF v5-specific implementation of an Accelerator Entry.
+ class Entry final : public DWARFAcceleratorTable::Entry {
+ const NameIndex *NameIdx;
+ const Abbrev *Abbr;
+
+ Entry(const NameIndex &NameIdx, const Abbrev &Abbr);
+
+ public:
+ Optional<uint64_t> getCUOffset() const override;
+ Optional<dwarf::Tag> getTag() const override { return tag(); }
+
+ /// Returns the Index into the Compilation Unit list of the owning Name
+ /// Index or None if this Accelerator Entry does not have an associated
+ /// Compilation Unit. It is up to the user to verify that the returned Index
+ /// is valid in the owning NameIndex (or use getCUOffset(), which will
+ /// handle that check itself). Note that entries in NameIndexes which index
+ /// just a single Compilation Unit are implicitly associated with that unit,
+ /// so this function will return 0 even without an explicit
+ /// DW_IDX_compile_unit attribute.
+ Optional<uint64_t> getCUIndex() const;
+
+ /// .debug_names-specific getter, which always succeeds (DWARF v5 index
+ /// entries always have a tag).
+ dwarf::Tag tag() const { return Abbr->Tag; }
+
+ /// Returns the Offset of the DIE within the containing CU or TU.
+ Optional<uint64_t> getDIEUnitOffset() const;
+
+ /// Return the Abbreviation that can be used to interpret the raw values of
+ /// this Accelerator Entry.
+ const Abbrev &getAbbrev() const { return *Abbr; }
+
+ /// Returns the value of the Index Attribute in this Accelerator Entry, if
+ /// the Entry contains such Attribute.
+ Optional<DWARFFormValue> lookup(dwarf::Index Index) const;
+
+ void dump(ScopedPrinter &W) const;
+
+ friend class NameIndex;
+ friend class ValueIterator;
+ };
+
+ /// Error returned by NameIndex::getEntry to report it has reached the end of
+ /// the entry list.
+ class SentinelError : public ErrorInfo<SentinelError> {
+ public:
+ static char ID;
+
+ void log(raw_ostream &OS) const override { OS << "Sentinel"; }
+ std::error_code convertToErrorCode() const override;
+ };
+
+private:
+ /// DenseMapInfo for struct Abbrev.
+ struct AbbrevMapInfo {
+ static Abbrev getEmptyKey();
+ static Abbrev getTombstoneKey();
+ static unsigned getHashValue(uint32_t Code) {
+ return DenseMapInfo<uint32_t>::getHashValue(Code);
+ }
+ static unsigned getHashValue(const Abbrev &Abbr) {
+ return getHashValue(Abbr.Code);
+ }
+ static bool isEqual(uint32_t LHS, const Abbrev &RHS) {
+ return LHS == RHS.Code;
+ }
+ static bool isEqual(const Abbrev &LHS, const Abbrev &RHS) {
+ return LHS.Code == RHS.Code;
+ }
+ };
+
+public:
+ /// A single entry in the Name Table (Dwarf 5 sect. 6.1.1.4.6) of the Name
+ /// Index.
+ class NameTableEntry {
+ DataExtractor StrData;
+
+ uint32_t Index;
+ uint32_t StringOffset;
+ uint32_t EntryOffset;
+
+ public:
+ NameTableEntry(const DataExtractor &StrData, uint32_t Index,
+ uint32_t StringOffset, uint32_t EntryOffset)
+ : StrData(StrData), Index(Index), StringOffset(StringOffset),
+ EntryOffset(EntryOffset) {}
+
+ /// Return the index of this name in the parent Name Index.
+ uint32_t getIndex() const { return Index; }
+
+ /// Returns the offset of the name of the described entities.
+ uint32_t getStringOffset() const { return StringOffset; }
+
+ /// Return the string referenced by this name table entry or nullptr if the
+ /// string offset is not valid.
+ const char *getString() const {
+ uint32_t Off = StringOffset;
+ return StrData.getCStr(&Off);
+ }
+
+ /// Returns the offset of the first Entry in the list.
+ uint32_t getEntryOffset() const { return EntryOffset; }
+ };
+
+ /// Represents a single accelerator table within the Dwarf 5 .debug_names
+ /// section.
+ class NameIndex {
+ DenseSet<Abbrev, AbbrevMapInfo> Abbrevs;
+ struct Header Hdr;
+ const DWARFDebugNames &Section;
+
+ // Base of the whole unit and of various important tables, as offsets from
+ // the start of the section.
+ uint32_t Base;
+ uint32_t CUsBase;
+ uint32_t BucketsBase;
+ uint32_t HashesBase;
+ uint32_t StringOffsetsBase;
+ uint32_t EntryOffsetsBase;
+ uint32_t EntriesBase;
+
+ void dumpCUs(ScopedPrinter &W) const;
+ void dumpLocalTUs(ScopedPrinter &W) const;
+ void dumpForeignTUs(ScopedPrinter &W) const;
+ void dumpAbbreviations(ScopedPrinter &W) const;
+ bool dumpEntry(ScopedPrinter &W, uint32_t *Offset) const;
+ void dumpName(ScopedPrinter &W, const NameTableEntry &NTE,
+ Optional<uint32_t> Hash) const;
+ void dumpBucket(ScopedPrinter &W, uint32_t Bucket) const;
+
+ Expected<AttributeEncoding> extractAttributeEncoding(uint32_t *Offset);
+
+ Expected<std::vector<AttributeEncoding>>
+ extractAttributeEncodings(uint32_t *Offset);
+
+ Expected<Abbrev> extractAbbrev(uint32_t *Offset);
+
+ public:
+ NameIndex(const DWARFDebugNames &Section, uint32_t Base)
+ : Section(Section), Base(Base) {}
+
+ /// Reads offset of compilation unit CU. CU is 0-based.
+ uint32_t getCUOffset(uint32_t CU) const;
+ uint32_t getCUCount() const { return Hdr.CompUnitCount; }
+
+ /// Reads offset of local type unit TU, TU is 0-based.
+ uint32_t getLocalTUOffset(uint32_t TU) const;
+ uint32_t getLocalTUCount() const { return Hdr.LocalTypeUnitCount; }
+
+ /// Reads signature of foreign type unit TU. TU is 0-based.
+ uint64_t getForeignTUSignature(uint32_t TU) const;
+ uint32_t getForeignTUCount() const { return Hdr.ForeignTypeUnitCount; }
+
+ /// Reads an entry in the Bucket Array for the given Bucket. The returned
+ /// value is a (1-based) index into the Names, StringOffsets and
+ /// EntryOffsets arrays. The input Bucket index is 0-based.
+ uint32_t getBucketArrayEntry(uint32_t Bucket) const;
+ uint32_t getBucketCount() const { return Hdr.BucketCount; }
+
+ /// Reads an entry in the Hash Array for the given Index. The input Index
+ /// is 1-based.
+ uint32_t getHashArrayEntry(uint32_t Index) const;
+
+ /// Reads an entry in the Name Table for the given Index. The Name Table
+ /// consists of two arrays -- String Offsets and Entry Offsets. The returned
+ /// offsets are relative to the starts of respective sections. Input Index
+ /// is 1-based.
+ NameTableEntry getNameTableEntry(uint32_t Index) const;
+
+ uint32_t getNameCount() const { return Hdr.NameCount; }
+
+ const DenseSet<Abbrev, AbbrevMapInfo> &getAbbrevs() const {
+ return Abbrevs;
+ }
+
+ Expected<Entry> getEntry(uint32_t *Offset) const;
+
+ /// Look up all entries in this Name Index matching \c Key.
+ iterator_range<ValueIterator> equal_range(StringRef Key) const;
+
+ NameIterator begin() const { return NameIterator(this, 1); }
+ NameIterator end() const { return NameIterator(this, getNameCount() + 1); }
+
+ llvm::Error extract();
+ uint32_t getUnitOffset() const { return Base; }
+ uint32_t getNextUnitOffset() const { return Base + 4 + Hdr.UnitLength; }
+ void dump(ScopedPrinter &W) const;
+
+ friend class DWARFDebugNames;
+ };
+
+ class ValueIterator : public std::iterator<std::input_iterator_tag, Entry> {
+
+ /// The Name Index we are currently iterating through. The implementation
+ /// relies on the fact that this can also be used as an iterator into the
+ /// "NameIndices" vector in the Accelerator section.
+ const NameIndex *CurrentIndex = nullptr;
+
+ /// Whether this is a local iterator (searches in CurrentIndex only) or not
+ /// (searches all name indices).
+ bool IsLocal;
+
+ Optional<Entry> CurrentEntry;
+ unsigned DataOffset = 0; ///< Offset into the section.
+ std::string Key; ///< The Key we are searching for.
+ Optional<uint32_t> Hash; ///< Hash of Key, if it has been computed.
+
+ bool getEntryAtCurrentOffset();
+ Optional<uint32_t> findEntryOffsetInCurrentIndex();
+ bool findInCurrentIndex();
+ void searchFromStartOfCurrentIndex();
+ void next();
+
+ /// Set the iterator to the "end" state.
+ void setEnd() { *this = ValueIterator(); }
+
+ public:
+ /// Create a "begin" iterator for looping over all entries in the
+ /// accelerator table matching Key. The iterator will run through all Name
+ /// Indexes in the section in sequence.
+ ValueIterator(const DWARFDebugNames &AccelTable, StringRef Key);
+
+ /// Create a "begin" iterator for looping over all entries in a specific
+ /// Name Index. Other indices in the section will not be visited.
+ ValueIterator(const NameIndex &NI, StringRef Key);
+
+ /// End marker.
+ ValueIterator() = default;
+
+ const Entry &operator*() const { return *CurrentEntry; }
+ ValueIterator &operator++() {
+ next();
+ return *this;
+ }
+ ValueIterator operator++(int) {
+ ValueIterator I = *this;
+ next();
+ return I;
+ }
+
+ friend bool operator==(const ValueIterator &A, const ValueIterator &B) {
+ return A.CurrentIndex == B.CurrentIndex && A.DataOffset == B.DataOffset;
+ }
+ friend bool operator!=(const ValueIterator &A, const ValueIterator &B) {
+ return !(A == B);
+ }
+ };
+
+ class NameIterator {
+
+ /// The Name Index we are iterating through.
+ const NameIndex *CurrentIndex;
+
+ /// The current name in the Name Index.
+ uint32_t CurrentName;
+
+ void next() {
+ assert(CurrentName <= CurrentIndex->getNameCount());
+ ++CurrentName;
+ }
+
+ public:
+ using iterator_category = std::input_iterator_tag;
+ using value_type = NameTableEntry;
+ using difference_type = uint32_t;
+ using pointer = NameTableEntry *;
+ using reference = NameTableEntry; // We return entries by value.
+
+ /// Creates an iterator whose initial position is name CurrentName in
+ /// CurrentIndex.
+ NameIterator(const NameIndex *CurrentIndex, uint32_t CurrentName)
+ : CurrentIndex(CurrentIndex), CurrentName(CurrentName) {}
+
+ NameTableEntry operator*() const {
+ return CurrentIndex->getNameTableEntry(CurrentName);
+ }
+ NameIterator &operator++() {
+ next();
+ return *this;
+ }
+ NameIterator operator++(int) {
+ NameIterator I = *this;
+ next();
+ return I;
+ }
+
+ friend bool operator==(const NameIterator &A, const NameIterator &B) {
+ return A.CurrentIndex == B.CurrentIndex && A.CurrentName == B.CurrentName;
+ }
+ friend bool operator!=(const NameIterator &A, const NameIterator &B) {
+ return !(A == B);
+ }
+ };
+
+private:
+ SmallVector<NameIndex, 0> NameIndices;
+ DenseMap<uint32_t, const NameIndex *> CUToNameIndex;
+
+public:
+ DWARFDebugNames(const DWARFDataExtractor &AccelSection,
+ DataExtractor StringSection)
+ : DWARFAcceleratorTable(AccelSection, StringSection) {}
+
+ llvm::Error extract() override;
+ void dump(raw_ostream &OS) const override;
/// Look up all entries in the accelerator table matching \c Key.
iterator_range<ValueIterator> equal_range(StringRef Key) const;
+
+ using const_iterator = SmallVector<NameIndex, 0>::const_iterator;
+ const_iterator begin() const { return NameIndices.begin(); }
+ const_iterator end() const { return NameIndices.end(); }
+
+ /// Return the Name Index covering the compile unit at CUOffset, or nullptr if
+ /// there is no Name Index covering that unit.
+ const NameIndex *getCUNameIndex(uint32_t CUOffset);
};
} // end namespace llvm
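As a rough usage sketch of the new .debug_names interface above (DCtx and the looked-up name "main" are assumptions, not part of the patch): equal_range() searches every NameIndex in the section, while the per-index NameIterator pair exposes the raw name table of a single index.

#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"

void visitDebugNames(llvm::DWARFContext &DCtx) {
  const llvm::DWARFDebugNames &Names = DCtx.getDebugNames();

  // Cross-index lookup: equal_range() walks every NameIndex in the section.
  for (const llvm::DWARFDebugNames::Entry &E : Names.equal_range("main"))
    (void)E; // each E describes one DIE whose name matches "main"

  // Per-index traversal of the raw name tables (indices are 1-based inside).
  for (const llvm::DWARFDebugNames::NameIndex &NI : Names)
    for (llvm::DWARFDebugNames::NameTableEntry NTE : NI)
      (void)NTE; // carries string/entry offsets into their tables
}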
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h
new file mode 100644
index 000000000000..5a7df5c353e8
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h
@@ -0,0 +1,68 @@
+//===- DWARFAddressRange.h --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFADDRESSRANGE_H
+#define LLVM_DEBUGINFO_DWARF_DWARFADDRESSRANGE_H
+
+#include "llvm/DebugInfo/DIContext.h"
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+struct DWARFAddressRange {
+ uint64_t LowPC;
+ uint64_t HighPC;
+ uint64_t SectionIndex;
+
+ DWARFAddressRange() = default;
+
+ /// Used for unit testing.
+ DWARFAddressRange(uint64_t LowPC, uint64_t HighPC, uint64_t SectionIndex = 0)
+ : LowPC(LowPC), HighPC(HighPC), SectionIndex(SectionIndex) {}
+
+  /// Returns true if LowPC is less than or equal to HighPC. This accounts
+  /// for dead-stripped ranges.
+ bool valid() const { return LowPC <= HighPC; }
+
+ /// Returns true if [LowPC, HighPC) intersects with [RHS.LowPC, RHS.HighPC).
+ bool intersects(const DWARFAddressRange &RHS) const {
+ assert(valid() && RHS.valid());
+ // Empty ranges can't intersect.
+ if (LowPC == HighPC || RHS.LowPC == RHS.HighPC)
+ return false;
+ return LowPC < RHS.HighPC && RHS.LowPC < HighPC;
+ }
+
+ /// Returns true if [LowPC, HighPC) fully contains [RHS.LowPC, RHS.HighPC).
+ bool contains(const DWARFAddressRange &RHS) const {
+ assert(valid() && RHS.valid());
+ return LowPC <= RHS.LowPC && RHS.HighPC <= HighPC;
+ }
+
+ void dump(raw_ostream &OS, uint32_t AddressSize,
+ DIDumpOptions DumpOpts = {}) const;
+};
+
+static inline bool operator<(const DWARFAddressRange &LHS,
+ const DWARFAddressRange &RHS) {
+ return std::tie(LHS.LowPC, LHS.HighPC) < std::tie(RHS.LowPC, RHS.HighPC);
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const DWARFAddressRange &R);
+
+/// DWARFAddressRangesVector - represents a set of absolute address ranges.
+using DWARFAddressRangesVector = std::vector<DWARFAddressRange>;
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFADDRESSRANGE_H
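The half-open [LowPC, HighPC) semantics above are easy to get wrong, so here is a small illustration (the addresses are made up): an empty range is still valid() -- that is what the dead-stripped case looks like -- but it never intersects anything.

#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
#include <cassert>

void rangeSemanticsDemo() {
  llvm::DWARFAddressRange A(0x1000, 0x2000);
  llvm::DWARFAddressRange B(0x1800, 0x1900);
  llvm::DWARFAddressRange Dead(0x0, 0x0); // dead-stripped: empty but valid()

  assert(A.valid() && B.valid() && Dead.valid());
  assert(A.intersects(B) && A.contains(B)); // B lies entirely inside A
  assert(!A.intersects(Dead));              // empty ranges never intersect
}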
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
index a18adf87bf8e..c219ca75e640 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
@@ -18,13 +18,13 @@ namespace llvm {
class DWARFCompileUnit : public DWARFUnit {
public:
DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section,
+ const DWARFUnitHeader &Header,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
StringRef SS, const DWARFSection &SOS,
const DWARFSection *AOS, const DWARFSection &LS, bool LE,
- bool IsDWO, const DWARFUnitSectionBase &UnitSection,
- const DWARFUnitIndex::Entry *Entry)
- : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
- UnitSection, Entry) {}
+ bool IsDWO, const DWARFUnitSectionBase &UnitSection)
+ : DWARFUnit(Context, Section, Header, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
+ UnitSection) {}
// VTable anchor.
~DWARFCompileUnit() override;
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 2ddbc4b91ba2..fe7430c9f04c 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -34,6 +34,7 @@
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Host.h"
#include <cstdint>
@@ -43,7 +44,6 @@
namespace llvm {
-class DataExtractor;
class MCRegisterInfo;
class MemoryBuffer;
class raw_ostream;
@@ -69,10 +69,11 @@ class DWARFContext : public DIContext {
std::unique_ptr<DWARFDebugFrame> DebugFrame;
std::unique_ptr<DWARFDebugFrame> EHFrame;
std::unique_ptr<DWARFDebugMacro> Macro;
- std::unique_ptr<DWARFAcceleratorTable> AppleNames;
- std::unique_ptr<DWARFAcceleratorTable> AppleTypes;
- std::unique_ptr<DWARFAcceleratorTable> AppleNamespaces;
- std::unique_ptr<DWARFAcceleratorTable> AppleObjC;
+ std::unique_ptr<DWARFDebugNames> Names;
+ std::unique_ptr<AppleAcceleratorTable> AppleNames;
+ std::unique_ptr<AppleAcceleratorTable> AppleTypes;
+ std::unique_ptr<AppleAcceleratorTable> AppleNamespaces;
+ std::unique_ptr<AppleAcceleratorTable> AppleObjC;
DWARFUnitSection<DWARFCompileUnit> DWOCUs;
std::deque<DWARFUnitSection<DWARFTypeUnit>> DWOTUs;
@@ -204,6 +205,9 @@ public:
DWARFCompileUnit *getDWOCompileUnitForHash(uint64_t Hash);
+ /// Return the compile unit that includes an offset (relative to .debug_info).
+ DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset);
+
/// Get a DIE given an exact offset.
DWARFDie getDIEForOffset(uint32_t Offset);
@@ -243,19 +247,36 @@ public:
const DWARFDebugMacro *getDebugMacro();
/// Get a reference to the parsed accelerator table object.
- const DWARFAcceleratorTable &getAppleNames();
+ const DWARFDebugNames &getDebugNames();
/// Get a reference to the parsed accelerator table object.
- const DWARFAcceleratorTable &getAppleTypes();
+ const AppleAcceleratorTable &getAppleNames();
/// Get a reference to the parsed accelerator table object.
- const DWARFAcceleratorTable &getAppleNamespaces();
+ const AppleAcceleratorTable &getAppleTypes();
/// Get a reference to the parsed accelerator table object.
- const DWARFAcceleratorTable &getAppleObjC();
+ const AppleAcceleratorTable &getAppleNamespaces();
+
+ /// Get a reference to the parsed accelerator table object.
+ const AppleAcceleratorTable &getAppleObjC();
+
+ /// Get a pointer to a parsed line table corresponding to a compile unit.
+ /// Report any parsing issues as warnings on stderr.
+ const DWARFDebugLine::LineTable *getLineTableForUnit(DWARFUnit *U);
/// Get a pointer to a parsed line table corresponding to a compile unit.
- const DWARFDebugLine::LineTable *getLineTableForUnit(DWARFUnit *cu);
+ /// Report any recoverable parsing problems using the callback.
+ Expected<const DWARFDebugLine::LineTable *>
+ getLineTableForUnit(DWARFUnit *U,
+ std::function<void(Error)> RecoverableErrorCallback);
+
+ DataExtractor getStringExtractor() const {
+ return DataExtractor(DObj->getStringSection(), false, 0);
+ }
+ DataExtractor getLineStringExtractor() const {
+ return DataExtractor(DObj->getLineStringSection(), false, 0);
+ }
/// Wraps the returned DIEs for a given address.
struct DIEsForAddress {
@@ -303,9 +324,6 @@ public:
Error loadRegisterInfo(const object::ObjectFile &Obj);
private:
- /// Return the compile unit that includes an offset (relative to .debug_info).
- DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset);
-
/// Return the compile unit which contains instruction with provided
/// address.
DWARFCompileUnit *getCompileUnitForAddress(uint64_t Address);
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h
index a379d9c85b38..10e146b70ec7 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h
@@ -44,6 +44,13 @@ public:
uint64_t getRelocatedAddress(uint32_t *Off, uint64_t *SecIx = nullptr) const {
return getRelocatedValue(getAddressSize(), Off, SecIx);
}
+
+  /// Extracts a DWARF-encoded pointer at \p Offset using \p Encoding.
+  /// Some encodings apply a PC-relative adjustment; for those values,
+  /// \p AbsPosOffset, which should reflect the absolute address of this
+  /// pointer, is used to fix them up.
+ Optional<uint64_t> getEncodedPointer(uint32_t *Offset, uint8_t Encoding,
+ uint64_t AbsPosOffset = 0) const;
};
} // end namespace llvm
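A hedged sketch of how getEncodedPointer() might be used while walking .eh_frame: the extractor, current offset and section load address are all assumptions here, and the DW_EH_PE_* constants are the ones defined in llvm/BinaryFormat/Dwarf.h.

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"

// Sketch: decode one PC-relative, signed 4-byte pointer from .eh_frame.
// EHFrameAddress stands for the section's load address (an assumption).
llvm::Optional<uint64_t> readEHPointer(const llvm::DWARFDataExtractor &Data,
                                       uint32_t *Off,
                                       uint64_t EHFrameAddress) {
  uint8_t Enc = llvm::dwarf::DW_EH_PE_pcrel | llvm::dwarf::DW_EH_PE_sdata4;
  return Data.getEncodedPointer(Off, Enc,
                                /*AbsPosOffset=*/EHFrameAddress + *Off);
}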
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
index dfbbb95076e8..ab46fac39f7c 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
@@ -43,6 +43,7 @@ public:
uint64_t Length;
uint64_t getEndAddress() const { return Address + Length; }
+ void dump(raw_ostream &OS, uint32_t AddressSize) const;
};
private:
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
index a711fb295444..ff1c7fb38389 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
@@ -10,40 +10,290 @@
#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGFRAME_H
#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGFRAME_H
-#include "llvm/Support/DataExtractor.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
+#include "llvm/Support/Error.h"
#include <memory>
#include <vector>
namespace llvm {
-class FrameEntry;
class raw_ostream;
-/// \brief A parsed .debug_frame or .eh_frame section
-///
+namespace dwarf {
+
+/// Represent a sequence of Call Frame Information instructions that, when read
+/// in order, construct a table mapping PC to frame state. This can also be
+/// referred to as "CFI rules" in DWARF literature to avoid confusion with
+/// computer programs in the broader sense, and in this context each instruction
+/// would be a rule to establish the mapping. Refer to pg. 172 in the DWARF5
+/// manual, "6.4.1 Structure of Call Frame Information".
+class CFIProgram {
+public:
+ typedef SmallVector<uint64_t, 2> Operands;
+
+ /// An instruction consists of a DWARF CFI opcode and an optional sequence of
+ /// operands. If it refers to an expression, then this expression has its own
+ /// sequence of operations and operands handled separately by DWARFExpression.
+ struct Instruction {
+ Instruction(uint8_t Opcode) : Opcode(Opcode) {}
+
+ uint8_t Opcode;
+ Operands Ops;
+ // Associated DWARF expression in case this instruction refers to one
+ Optional<DWARFExpression> Expression;
+ };
+
+ using InstrList = std::vector<Instruction>;
+ using iterator = InstrList::iterator;
+ using const_iterator = InstrList::const_iterator;
+
+ iterator begin() { return Instructions.begin(); }
+ const_iterator begin() const { return Instructions.begin(); }
+ iterator end() { return Instructions.end(); }
+ const_iterator end() const { return Instructions.end(); }
+
+ unsigned size() const { return (unsigned)Instructions.size(); }
+ bool empty() const { return Instructions.empty(); }
+
+ CFIProgram(uint64_t CodeAlignmentFactor, int64_t DataAlignmentFactor)
+ : CodeAlignmentFactor(CodeAlignmentFactor),
+ DataAlignmentFactor(DataAlignmentFactor) {}
+
+ /// Parse and store a sequence of CFI instructions from Data,
+ /// starting at *Offset and ending at EndOffset. *Offset is updated
+ /// to EndOffset upon successful parsing, or indicates the offset
+ /// where a problem occurred in case an error is returned.
+ Error parse(DataExtractor Data, uint32_t *Offset, uint32_t EndOffset);
+
+ void dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH,
+ unsigned IndentLevel = 1) const;
+
+private:
+ std::vector<Instruction> Instructions;
+ const uint64_t CodeAlignmentFactor;
+ const int64_t DataAlignmentFactor;
+
+ /// Convenience method to add a new instruction with the given opcode.
+ void addInstruction(uint8_t Opcode) {
+ Instructions.push_back(Instruction(Opcode));
+ }
+
+ /// Add a new single-operand instruction.
+ void addInstruction(uint8_t Opcode, uint64_t Operand1) {
+ Instructions.push_back(Instruction(Opcode));
+ Instructions.back().Ops.push_back(Operand1);
+ }
+
+ /// Add a new instruction that has two operands.
+ void addInstruction(uint8_t Opcode, uint64_t Operand1, uint64_t Operand2) {
+ Instructions.push_back(Instruction(Opcode));
+ Instructions.back().Ops.push_back(Operand1);
+ Instructions.back().Ops.push_back(Operand2);
+ }
+
+  /// Types of operands to CFI instructions.
+  /// In DWARF, this type is implicitly tied to a CFI instruction opcode and
+  /// thus doesn't need to be explicitly written to the file (it is not a
+  /// DWARF encoding). The relationship of instructions to operand types can
+  /// be obtained from getOperandTypes() and is only used to simplify
+  /// instruction printing.
+ enum OperandType {
+ OT_Unset,
+ OT_None,
+ OT_Address,
+ OT_Offset,
+ OT_FactoredCodeOffset,
+ OT_SignedFactDataOffset,
+ OT_UnsignedFactDataOffset,
+ OT_Register,
+ OT_Expression
+ };
+
+ /// Retrieve the array describing the types of operands according to the enum
+ /// above. This is indexed by opcode.
+ static ArrayRef<OperandType[2]> getOperandTypes();
+
+ /// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
+ void printOperand(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH,
+ const Instruction &Instr, unsigned OperandIdx,
+ uint64_t Operand) const;
+};
+
+/// An entry in either debug_frame or eh_frame. This entry can be a CIE or an
+/// FDE.
+class FrameEntry {
+public:
+ enum FrameKind { FK_CIE, FK_FDE };
+
+ FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length, uint64_t CodeAlign,
+ int64_t DataAlign)
+ : Kind(K), Offset(Offset), Length(Length), CFIs(CodeAlign, DataAlign) {}
+
+ virtual ~FrameEntry() {}
+
+ FrameKind getKind() const { return Kind; }
+ uint64_t getOffset() const { return Offset; }
+ uint64_t getLength() const { return Length; }
+ const CFIProgram &cfis() const { return CFIs; }
+ CFIProgram &cfis() { return CFIs; }
+
+ /// Dump the instructions in this CFI fragment
+ virtual void dump(raw_ostream &OS, const MCRegisterInfo *MRI,
+ bool IsEH) const = 0;
+
+protected:
+ const FrameKind Kind;
+
+ /// Offset of this entry in the section.
+ const uint64_t Offset;
+
+ /// Entry length as specified in DWARF.
+ const uint64_t Length;
+
+ CFIProgram CFIs;
+};
+
+/// DWARF Common Information Entry (CIE)
+class CIE : public FrameEntry {
+public:
+ // CIEs (and FDEs) are simply container classes, so the only sensible way to
+ // create them is by providing the full parsed contents in the constructor.
+ CIE(uint64_t Offset, uint64_t Length, uint8_t Version,
+ SmallString<8> Augmentation, uint8_t AddressSize,
+ uint8_t SegmentDescriptorSize, uint64_t CodeAlignmentFactor,
+ int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister,
+ SmallString<8> AugmentationData, uint32_t FDEPointerEncoding,
+ uint32_t LSDAPointerEncoding, Optional<uint64_t> Personality,
+ Optional<uint32_t> PersonalityEnc)
+ : FrameEntry(FK_CIE, Offset, Length, CodeAlignmentFactor,
+ DataAlignmentFactor),
+ Version(Version), Augmentation(std::move(Augmentation)),
+ AddressSize(AddressSize), SegmentDescriptorSize(SegmentDescriptorSize),
+ CodeAlignmentFactor(CodeAlignmentFactor),
+ DataAlignmentFactor(DataAlignmentFactor),
+ ReturnAddressRegister(ReturnAddressRegister),
+ AugmentationData(std::move(AugmentationData)),
+ FDEPointerEncoding(FDEPointerEncoding),
+ LSDAPointerEncoding(LSDAPointerEncoding), Personality(Personality),
+ PersonalityEnc(PersonalityEnc) {}
+
+ static bool classof(const FrameEntry *FE) { return FE->getKind() == FK_CIE; }
+
+ StringRef getAugmentationString() const { return Augmentation; }
+ uint64_t getCodeAlignmentFactor() const { return CodeAlignmentFactor; }
+ int64_t getDataAlignmentFactor() const { return DataAlignmentFactor; }
+ uint8_t getVersion() const { return Version; }
+ uint64_t getReturnAddressRegister() const { return ReturnAddressRegister; }
+ Optional<uint64_t> getPersonalityAddress() const { return Personality; }
+ Optional<uint32_t> getPersonalityEncoding() const { return PersonalityEnc; }
+
+ uint32_t getFDEPointerEncoding() const { return FDEPointerEncoding; }
+
+ uint32_t getLSDAPointerEncoding() const { return LSDAPointerEncoding; }
+
+ void dump(raw_ostream &OS, const MCRegisterInfo *MRI,
+ bool IsEH) const override;
+
+private:
+ /// The following fields are defined in section 6.4.1 of the DWARF standard v4
+ const uint8_t Version;
+ const SmallString<8> Augmentation;
+ const uint8_t AddressSize;
+ const uint8_t SegmentDescriptorSize;
+ const uint64_t CodeAlignmentFactor;
+ const int64_t DataAlignmentFactor;
+ const uint64_t ReturnAddressRegister;
+
+ // The following are used when the CIE represents an EH frame entry.
+ const SmallString<8> AugmentationData;
+ const uint32_t FDEPointerEncoding;
+ const uint32_t LSDAPointerEncoding;
+ const Optional<uint64_t> Personality;
+ const Optional<uint32_t> PersonalityEnc;
+};
+
+/// DWARF Frame Description Entry (FDE)
+class FDE : public FrameEntry {
+public:
+  // Each FDE has a CIE it's "linked to". Our FDE is constructed with an
+  // offset to that CIE (provided by parsing the FDE header). The CIE itself
+  // is obtained lazily once it's actually required.
+ FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset,
+ uint64_t InitialLocation, uint64_t AddressRange, CIE *Cie,
+ Optional<uint64_t> LSDAAddress)
+ : FrameEntry(FK_FDE, Offset, Length,
+ Cie ? Cie->getCodeAlignmentFactor() : 0,
+ Cie ? Cie->getDataAlignmentFactor() : 0),
+ LinkedCIEOffset(LinkedCIEOffset), InitialLocation(InitialLocation),
+ AddressRange(AddressRange), LinkedCIE(Cie), LSDAAddress(LSDAAddress) {}
+
+ ~FDE() override = default;
+
+ const CIE *getLinkedCIE() const { return LinkedCIE; }
+ uint64_t getInitialLocation() const { return InitialLocation; }
+ uint64_t getAddressRange() const { return AddressRange; }
+ Optional<uint64_t> getLSDAAddress() const { return LSDAAddress; }
+
+ void dump(raw_ostream &OS, const MCRegisterInfo *MRI,
+ bool IsEH) const override;
+
+ static bool classof(const FrameEntry *FE) { return FE->getKind() == FK_FDE; }
+
+private:
+ /// The following fields are defined in section 6.4.1 of the DWARF standard v3
+ const uint64_t LinkedCIEOffset;
+ const uint64_t InitialLocation;
+ const uint64_t AddressRange;
+ const CIE *LinkedCIE;
+ const Optional<uint64_t> LSDAAddress;
+};
+
+} // end namespace dwarf
+
+/// A parsed .debug_frame or .eh_frame section
class DWARFDebugFrame {
// True if this is parsing an eh_frame section.
- bool IsEH;
+ const bool IsEH;
+  // Non-zero for valid pointer values coming out of .eh_frame.
+ const uint64_t EHFrameAddress;
+
+ std::vector<std::unique_ptr<dwarf::FrameEntry>> Entries;
+ using iterator = pointee_iterator<decltype(Entries)::const_iterator>;
+
+ /// Return the entry at the given offset or nullptr.
+ dwarf::FrameEntry *getEntryAtOffset(uint64_t Offset) const;
public:
- DWARFDebugFrame(bool IsEH);
+  // If IsEH is true, assume it is a .eh_frame section. Otherwise, it is a
+  // .debug_frame section. EHFrameAddress should be non-zero for correct
+  // parsing of .eh_frame addresses when they use a PC-relative encoding.
+ DWARFDebugFrame(bool IsEH = false, uint64_t EHFrameAddress = 0);
~DWARFDebugFrame();
/// Dump the section data into the given stream.
- void dump(raw_ostream &OS, Optional<uint64_t> Offset) const;
+ void dump(raw_ostream &OS, const MCRegisterInfo *MRI,
+ Optional<uint64_t> Offset) const;
- /// \brief Parse the section from raw data.
- /// data is assumed to be pointing to the beginning of the section.
- void parse(DataExtractor Data);
+ /// Parse the section from raw data. \p Data is assumed to contain the whole
+ /// frame section contents to be parsed.
+ void parse(DWARFDataExtractor Data);
/// Return whether the section has any entries.
bool empty() const { return Entries.empty(); }
- /// Return the entry at the given offset or nullptr.
- FrameEntry *getEntryAtOffset(uint64_t Offset) const;
+ /// DWARF Frame entries accessors
+ iterator begin() const { return Entries.begin(); }
+ iterator end() const { return Entries.end(); }
+ iterator_range<iterator> entries() const {
+ return iterator_range<iterator>(Entries.begin(), Entries.end());
+ }
-private:
- std::vector<std::unique_ptr<FrameEntry>> Entries;
+ uint64_t getEHFrameAddress() const { return EHFrameAddress; }
};
} // end namespace llvm
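A sketch of consuming the refactored frame-entry hierarchy above; DF stands for an already-parsed DWARFDebugFrame, MRI for a possibly-null MCRegisterInfo pointer, and IsEH for whether the data came from .eh_frame -- all assumptions of this example.

#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

void walkFrameEntries(const llvm::DWARFDebugFrame &DF,
                      const llvm::MCRegisterInfo *MRI, bool IsEH) {
  for (const llvm::dwarf::FrameEntry &E : DF.entries()) {
    if (const auto *Cie = llvm::dyn_cast<llvm::dwarf::CIE>(&E))
      (void)Cie->getCodeAlignmentFactor(); // CIE-specific accessors
    else if (const auto *Fde = llvm::dyn_cast<llvm::dwarf::FDE>(&E))
      (void)Fde->getInitialLocation();     // FDE-specific accessors
    E.dump(llvm::outs(), MRI, IsEH);       // virtual dump works on either
  }
}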
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index de8ad4e5ef3c..5b2af34bbcf5 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -10,11 +10,14 @@
#ifndef LLVM_DEBUGINFO_DWARFDEBUGLINE_H
#define LLVM_DEBUGINFO_DWARFDEBUGLINE_H
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
#include "llvm/Support/MD5.h"
#include <cstdint>
#include <map>
@@ -31,11 +34,30 @@ public:
struct FileNameEntry {
FileNameEntry() = default;
- StringRef Name;
+ DWARFFormValue Name;
uint64_t DirIdx = 0;
uint64_t ModTime = 0;
uint64_t Length = 0;
MD5::MD5Result Checksum;
+ DWARFFormValue Source;
+ };
+
+ /// Tracks which optional content types are present in a DWARF file name
+ /// entry format.
+ struct ContentTypeTracker {
+ ContentTypeTracker() = default;
+
+ /// Whether filename entries provide a modification timestamp.
+ bool HasModTime = false;
+ /// Whether filename entries provide a file size.
+ bool HasLength = false;
+ /// For v5, whether filename entries provide an MD5 checksum.
+ bool HasMD5 = false;
+ /// For v5, whether filename entries provide source text.
+ bool HasSource = false;
+
+ /// Update tracked content types with \p ContentType.
+ void trackContentType(dwarf::LineNumberEntryFormat ContentType);
};
struct Prologue {
@@ -47,7 +69,7 @@ public:
/// Version, address size (starting in v5), and DWARF32/64 format; these
/// parameters affect interpretation of forms (used in the directory and
/// file tables starting with v5).
- DWARFFormParams FormParams;
+ dwarf::FormParams FormParams;
/// The number of bytes following the prologue_length field to the beginning
/// of the first byte of the statement program itself.
uint64_t PrologueLength;
@@ -68,13 +90,13 @@ public:
uint8_t LineRange;
/// The number assigned to the first special opcode.
uint8_t OpcodeBase;
- /// For v5, whether filename entries provide an MD5 checksum.
- bool HasMD5;
+ /// This tracks which optional file format content types are present.
+ ContentTypeTracker ContentTypes;
std::vector<uint8_t> StandardOpcodeLengths;
- std::vector<StringRef> IncludeDirectories;
+ std::vector<DWARFFormValue> IncludeDirectories;
std::vector<FileNameEntry> FileNames;
- const DWARFFormParams getFormParams() const { return FormParams; }
+ const dwarf::FormParams getFormParams() const { return FormParams; }
uint16_t getVersion() const { return FormParams.Version; }
uint8_t getAddressSize() const { return FormParams.AddrSize; }
bool isDWARF64() const { return FormParams.Format == dwarf::DWARF64; }
@@ -83,6 +105,8 @@ public:
uint32_t sizeofPrologueLength() const { return isDWARF64() ? 8 : 4; }
+ bool totalLengthIsValid() const;
+
/// Length of the prologue in bytes.
uint32_t getLength() const {
return PrologueLength + sizeofTotalLength() + sizeof(getVersion()) +
@@ -99,9 +123,9 @@ public:
}
void clear();
- void dump(raw_ostream &OS) const;
- bool parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
- const DWARFUnit *U = nullptr);
+ void dump(raw_ostream &OS, DIDumpOptions DumpOptions) const;
+ Error parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
+ const DWARFContext &Ctx, const DWARFUnit *U = nullptr);
};
/// Standard .debug_line state machine structure.
@@ -219,12 +243,14 @@ public:
DILineInfoSpecifier::FileLineInfoKind Kind,
DILineInfo &Result) const;
- void dump(raw_ostream &OS) const;
+ void dump(raw_ostream &OS, DIDumpOptions DumpOptions) const;
void clear();
/// Parse prologue and all rows.
- bool parse(DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
- const DWARFUnit *U, raw_ostream *OS = nullptr);
+ Error parse(DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
+ const DWARFContext &Ctx, const DWARFUnit *U,
+ std::function<void(Error)> RecoverableErrorCallback = warn,
+ raw_ostream *OS = nullptr);
using RowVector = std::vector<Row>;
using RowIter = RowVector::const_iterator;
@@ -238,11 +264,75 @@ public:
private:
uint32_t findRowInSeq(const DWARFDebugLine::Sequence &Seq,
uint64_t Address) const;
+ Optional<StringRef>
+ getSourceByIndex(uint64_t FileIndex,
+ DILineInfoSpecifier::FileLineInfoKind Kind) const;
};
const LineTable *getLineTable(uint32_t Offset) const;
- const LineTable *getOrParseLineTable(DWARFDataExtractor &DebugLineData,
- uint32_t Offset, const DWARFUnit *U);
+ Expected<const LineTable *> getOrParseLineTable(
+ DWARFDataExtractor &DebugLineData, uint32_t Offset,
+ const DWARFContext &Ctx, const DWARFUnit *U,
+ std::function<void(Error)> RecoverableErrorCallback = warn);
+
+ /// Helper to allow for parsing of an entire .debug_line section in sequence.
+ class SectionParser {
+ public:
+ using cu_range = DWARFUnitSection<DWARFCompileUnit>::iterator_range;
+ using tu_range =
+ iterator_range<std::deque<DWARFUnitSection<DWARFTypeUnit>>::iterator>;
+ using LineToUnitMap = std::map<uint64_t, DWARFUnit *>;
+
+ SectionParser(DWARFDataExtractor &Data, const DWARFContext &C, cu_range CUs,
+ tu_range TUs);
+
+ /// Get the next line table from the section. Report any issues via the
+ /// callbacks.
+ ///
+ /// \param RecoverableErrorCallback - any issues that don't prevent further
+ /// parsing of the table will be reported through this callback.
+ /// \param UnrecoverableErrorCallback - any issues that prevent further
+ /// parsing of the table will be reported through this callback.
+ /// \param OS - if not null, the parser will print information about the
+ /// table as it parses it.
+ LineTable
+ parseNext(function_ref<void(Error)> RecoverableErrorCallback = warn,
+ function_ref<void(Error)> UnrecoverableErrorCallback = warn,
+ raw_ostream *OS = nullptr);
+
+ /// Skip the current line table and go to the following line table (if
+ /// present) immediately.
+ ///
+ /// \param ErrorCallback - report any prologue parsing issues via this
+ /// callback.
+ void skip(function_ref<void(Error)> ErrorCallback = warn);
+
+ /// Indicates if the parser has parsed as much as possible.
+ ///
+ /// \note Certain problems with the line table structure might mean that
+ /// parsing stops before the end of the section is reached.
+ bool done() const { return Done; }
+
+ /// Get the offset the parser has reached.
+ uint32_t getOffset() const { return Offset; }
+
+ private:
+ DWARFUnit *prepareToParse(uint32_t Offset);
+ void moveToNextTable(uint32_t OldOffset, const Prologue &P);
+
+ LineToUnitMap LineToUnit;
+
+ DWARFDataExtractor &DebugLineData;
+ const DWARFContext &Context;
+ uint32_t Offset = 0;
+ bool Done = false;
+ };
+
+ /// Helper function for DWARFDebugLine parse functions, to report issues
+ /// identified during parsing.
+ ///
+ /// \param Err The Error to report.
+ static void warn(Error Err);
private:
struct ParsingState {
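How the new SectionParser is meant to be driven is easiest to see in a small sketch; Parser here is an assumption, standing for a SectionParser already constructed from a DWARFDataExtractor over .debug_line, the owning DWARFContext, and its compile-/type-unit ranges.

#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"

void consumeLineTables(llvm::DWARFDebugLine::SectionParser &Parser) {
  while (!Parser.done()) {
    llvm::DWARFDebugLine::LineTable LT =
        Parser.parseNext(llvm::DWARFDebugLine::warn,  // recoverable issues
                         llvm::DWARFDebugLine::warn); // unrecoverable issues
    (void)LT; // rows/sequences of the table that was just parsed
    // Parser.getOffset() now points at the next table (or the stop point).
  }
}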
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
index a6d319a90457..9a73745fb6b4 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
@@ -42,7 +42,8 @@ public:
SmallVector<Entry, 2> Entries;
/// Dump this list on OS.
void dump(raw_ostream &OS, bool IsLittleEndian, unsigned AddressSize,
- const MCRegisterInfo *MRI, unsigned Indent) const;
+ const MCRegisterInfo *MRI, uint64_t BaseAddress,
+ unsigned Indent) const;
};
private:
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
index 761871dc6255..cae4804e61d3 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
@@ -32,7 +32,7 @@ public:
/// The name of the object as given by the DW_AT_name attribute of the
/// referenced DIE.
- const char *Name;
+ StringRef Name;
};
/// Each table consists of sets of variable length entries. Each set describes
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index f9ec96366a53..ce7436d9faa3 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -10,8 +10,8 @@
#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
+#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
-#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include <cassert>
#include <cstdint>
#include <vector>
@@ -21,47 +21,6 @@ namespace llvm {
struct BaseAddress;
class raw_ostream;
-struct DWARFAddressRange {
- uint64_t LowPC;
- uint64_t HighPC;
- uint64_t SectionIndex;
-
- DWARFAddressRange() = default;
-
- /// Used for unit testing.
- DWARFAddressRange(uint64_t LowPC, uint64_t HighPC, uint64_t SectionIndex = 0)
- : LowPC(LowPC), HighPC(HighPC), SectionIndex(SectionIndex) {}
-
- /// Returns true if LowPC is smaller or equal to HighPC. This accounts for
- /// dead-stripped ranges.
- bool valid() const { return LowPC <= HighPC; }
-
- /// Returns true if [LowPC, HighPC) intersects with [RHS.LowPC, RHS.HighPC).
- bool intersects(const DWARFAddressRange &RHS) const {
- // Empty ranges can't intersect.
- if (LowPC == HighPC || RHS.LowPC == RHS.HighPC)
- return false;
- return (LowPC < RHS.HighPC) && (HighPC > RHS.LowPC);
- }
-
- /// Returns true if [LowPC, HighPC) fully contains [RHS.LowPC, RHS.HighPC).
- bool contains(const DWARFAddressRange &RHS) const {
- if (LowPC <= RHS.LowPC && RHS.LowPC <= HighPC)
- return LowPC <= RHS.HighPC && RHS.HighPC <= HighPC;
- return false;
- }
-};
-
-static inline bool operator<(const DWARFAddressRange &LHS,
- const DWARFAddressRange &RHS) {
- return std::tie(LHS.LowPC, LHS.HighPC) < std::tie(RHS.LowPC, RHS.HighPC);
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const DWARFAddressRange &R);
-
-/// DWARFAddressRangesVector - represents a set of absolute address ranges.
-using DWARFAddressRangesVector = std::vector<DWARFAddressRange>;
-
class DWARFDebugRangeList {
public:
struct RangeListEntry {
@@ -112,7 +71,7 @@ public:
void clear();
void dump(raw_ostream &OS) const;
- bool extract(const DWARFDataExtractor &data, uint32_t *offset_ptr);
+ Error extract(const DWARFDataExtractor &data, uint32_t *offset_ptr);
const std::vector<RangeListEntry> &getEntries() { return Entries; }
/// getAbsoluteRanges - Returns absolute address ranges defined by this range
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
new file mode 100644
index 000000000000..e2e8ab5ed219
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
@@ -0,0 +1,60 @@
+//===- DWARFDebugRnglists.h -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGRNGLISTS_H
+#define LLVM_DEBUGINFO_DWARFDEBUGRNGLISTS_H
+
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFListTable.h"
+#include <cstdint>
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+class Error;
+class raw_ostream;
+
+/// A class representing a single range list entry.
+struct RangeListEntry : public DWARFListEntryBase {
+ /// The values making up the range list entry. Most represent a range with
+ /// a start and end address or a start address and a length. Others are
+ /// single value base addresses or end-of-list with no values. The unneeded
+ /// values are semantically undefined, but initialized to 0.
+ uint64_t Value0;
+ uint64_t Value1;
+
+ Error extract(DWARFDataExtractor Data, uint32_t End, uint32_t *OffsetPtr);
+ void dump(raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
+ uint64_t &CurrentBase, DIDumpOptions DumpOpts) const;
+ bool isSentinel() const { return EntryKind == dwarf::DW_RLE_end_of_list; }
+};
+
+/// A class representing a single rangelist.
+class DWARFDebugRnglist : public DWARFListType<RangeListEntry> {
+public:
+ /// Build a DWARFAddressRangesVector from a rangelist.
+ DWARFAddressRangesVector
+ getAbsoluteRanges(llvm::Optional<BaseAddress> BaseAddr) const;
+};
+
+class DWARFDebugRnglistTable : public DWARFListTableBase<DWARFDebugRnglist> {
+public:
+ DWARFDebugRnglistTable()
+ : DWARFListTableBase(/* SectionName = */ ".debug_rnglists",
+ /* HeaderString = */ "ranges:",
+ /* ListTypeString = */ "range") {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARFDEBUGRNGLISTS_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
index 75fc5995c5b2..6e6b57cbcbd4 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -16,9 +16,9 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
#include "llvm/DebugInfo/DWARF/DWARFAttribute.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -104,12 +104,24 @@ public:
/// invalid DWARFDie instance if it doesn't.
DWARFDie getSibling() const;
+ /// Get the previous sibling of this DIE object.
+ ///
+ /// \returns a valid DWARFDie instance if this object has a sibling or an
+ /// invalid DWARFDie instance if it doesn't.
+ DWARFDie getPreviousSibling() const;
+
/// Get the first child of this DIE object.
///
/// \returns a valid DWARFDie instance if this object has children or an
/// invalid DWARFDie instance if it doesn't.
DWARFDie getFirstChild() const;
+ /// Get the last child of this DIE object.
+ ///
+  /// \returns a valid DWARFDie instance for the terminating NULL DIE if this
+  /// object has children, or an invalid DWARFDie instance if it doesn't.
+ DWARFDie getLastChild() const;
+
/// Dump the DIE and all of its attributes to the supplied stream.
///
/// \param OS the stream to use for output.
@@ -207,7 +219,7 @@ public:
///
/// \returns a address range vector that might be empty if no address range
/// information is available.
- DWARFAddressRangesVector getAddressRanges() const;
+ Expected<DWARFAddressRangesVector> getAddressRanges() const;
/// Get all address ranges for any DW_TAG_subprogram DIEs in this DIE or any
/// of its children.
@@ -288,6 +300,7 @@ public:
explicit attribute_iterator(DWARFDie D, bool End);
attribute_iterator &operator++();
+ attribute_iterator &operator--();
explicit operator bool() const { return AttrValue.isValid(); }
const DWARFAttribute &operator*() const { return AttrValue; }
bool operator==(const attribute_iterator &X) const { return Index == X.Index; }
@@ -306,26 +319,23 @@ inline bool operator<(const DWARFDie &LHS, const DWARFDie &RHS) {
return LHS.getOffset() < RHS.getOffset();
}
-class DWARFDie::iterator : public iterator_facade_base<iterator,
- std::forward_iterator_tag,
- const DWARFDie> {
+class DWARFDie::iterator
+ : public iterator_facade_base<iterator, std::bidirectional_iterator_tag,
+ const DWARFDie> {
DWARFDie Die;
- void skipNull() {
- if (Die && Die.isNULL())
- Die = DWARFDie();
- }
public:
iterator() = default;
explicit iterator(DWARFDie D) : Die(D) {
- // If we start out with only a Null DIE then invalidate.
- skipNull();
}
iterator &operator++() {
Die = Die.getSibling();
- // Don't include the NULL die when iterating.
- skipNull();
+ return *this;
+ }
+
+ iterator &operator--() {
+ Die = Die.getPreviousSibling();
return *this;
}
@@ -341,7 +351,7 @@ inline DWARFDie::iterator DWARFDie::begin() const {
}
inline DWARFDie::iterator DWARFDie::end() const {
- return iterator();
+ return iterator(getLastChild());
}
inline iterator_range<DWARFDie::iterator> DWARFDie::children() const {
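With end() now anchored at the terminating NULL DIE and operator-- available, the children of a DIE can be walked in either direction. A short sketch, assuming Die is a valid DWARFDie:

#include "llvm/DebugInfo/DWARF/DWARFDie.h"

void walkChildrenBothWays(llvm::DWARFDie Die) {
  // Forward traversal is unchanged; the NULL terminator acts as end().
  for (llvm::DWARFDie Child : Die.children())
    (void)Child.getTag();

  // Backward traversal is newly possible with the bidirectional iterator.
  auto Children = Die.children();
  for (auto It = Children.end(); It != Children.begin();) {
    --It;
    (void)(*It).getTag();
  }
}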
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
index dcd486f3fb13..3fad68a9b48b 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
@@ -93,12 +93,13 @@ public:
/// An iterator to go through the expression operations.
class iterator
- : public iterator_facade_base<iterator, std::forward_iterator_tag, Operation> {
+ : public iterator_facade_base<iterator, std::forward_iterator_tag,
+ Operation> {
friend class DWARFExpression;
- DWARFExpression *Expr;
+ const DWARFExpression *Expr;
uint32_t Offset;
Operation Op;
- iterator(DWARFExpression *Expr, uint32_t Offset)
+ iterator(const DWARFExpression *Expr, uint32_t Offset)
: Expr(Expr), Offset(Offset) {
Op.Error =
Offset >= Expr->Data.getData().size() ||
@@ -127,10 +128,11 @@ public:
assert(AddressSize == 8 || AddressSize == 4);
}
- iterator begin() { return iterator(this, 0); }
- iterator end() { return iterator(this, Data.getData().size()); }
+ iterator begin() const { return iterator(this, 0); }
+ iterator end() const { return iterator(this, Data.getData().size()); }
- void print(raw_ostream &OS, const MCRegisterInfo *RegInfo);
+ void print(raw_ostream &OS, const MCRegisterInfo *RegInfo,
+ bool IsEH = false) const;
private:
DataExtractor Data;
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index d32053519ec4..1b5f71c946f9 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -20,38 +20,10 @@
namespace llvm {
+class DWARFContext;
class DWARFUnit;
class raw_ostream;
-/// A helper struct for DWARFFormValue methods, providing information that
-/// allows it to know the byte size of DW_FORM values that vary in size
-/// depending on the DWARF version, address byte size, or DWARF32/DWARF64.
-struct DWARFFormParams {
- uint16_t Version;
- uint8_t AddrSize;
- dwarf::DwarfFormat Format;
-
- /// The definition of the size of form DW_FORM_ref_addr depends on the
- /// version. In DWARF v2 it's the size of an address; after that, it's the
- /// size of a reference.
- uint8_t getRefAddrByteSize() const {
- if (Version == 2)
- return AddrSize;
- return getDwarfOffsetByteSize();
- }
-
- /// The size of a reference is determined by the DWARF 32/64-bit format.
- uint8_t getDwarfOffsetByteSize() const {
- switch (Format) {
- case dwarf::DwarfFormat::DWARF32:
- return 4;
- case dwarf::DwarfFormat::DWARF64:
- return 8;
- }
- llvm_unreachable("Invalid Format value");
- }
-};
-
class DWARFFormValue {
public:
enum FormClass {
@@ -83,7 +55,7 @@ private:
dwarf::Form Form; /// Form for this value.
ValueType Value; /// Contains all data for the form.
const DWARFUnit *U = nullptr; /// Remember the DWARFUnit at extract time.
-
+ const DWARFContext *C = nullptr; /// Context for extract time.
public:
DWARFFormValue(dwarf::Form F = dwarf::Form(0)) : Form(F) {}
@@ -106,10 +78,17 @@ public:
/// Extracts a value in \p Data at offset \p *OffsetPtr. The information
/// in \p FormParams is needed to interpret some forms. The optional
- /// \p Unit allows extracting information if the form refers to other
- /// sections (e.g., .debug_str).
+ /// \p Context and \p Unit allows extracting information if the form refers
+ /// to other sections (e.g., .debug_str).
bool extractValue(const DWARFDataExtractor &Data, uint32_t *OffsetPtr,
- DWARFFormParams FormParams, const DWARFUnit *U = nullptr);
+ dwarf::FormParams FormParams,
+ const DWARFContext *Context = nullptr,
+ const DWARFUnit *Unit = nullptr);
+
+ bool extractValue(const DWARFDataExtractor &Data, uint32_t *OffsetPtr,
+ dwarf::FormParams FormParams, const DWARFUnit *U) {
+ return extractValue(Data, OffsetPtr, FormParams, nullptr, U);
+ }
bool isInlinedCStr() const {
return Value.data != nullptr && Value.data == (const uint8_t *)Value.cstr;
@@ -127,19 +106,6 @@ public:
Optional<uint64_t> getAsCStringOffset() const;
Optional<uint64_t> getAsReferenceUVal() const;
- /// Get the fixed byte size for a given form.
- ///
- /// If the form has a fixed byte size, then an Optional with a value will be
- /// returned. If the form is always encoded using a variable length storage
- /// format (ULEB or SLEB numbers or blocks) then None will be returned.
- ///
- /// \param Form DWARF form to get the fixed byte size for.
- /// \param FormParams DWARF parameters to help interpret forms.
- /// \returns Optional<uint8_t> value with the fixed byte size or None if
- /// \p Form doesn't have a fixed byte size.
- static Optional<uint8_t> getFixedByteSize(dwarf::Form Form,
- const DWARFFormParams FormParams);
-
/// Skip a form's value in \p DebugInfoData at the offset specified by
/// \p OffsetPtr.
///
@@ -150,7 +116,7 @@ public:
/// \param Params DWARF parameters to help interpret forms.
/// \returns true on success, false if the form was not skipped.
bool skipValue(DataExtractor DebugInfoData, uint32_t *OffsetPtr,
- const DWARFFormParams Params) const {
+ const dwarf::FormParams Params) const {
return DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, Params);
}
@@ -165,7 +131,8 @@ public:
/// \param FormParams DWARF parameters to help interpret forms.
/// \returns true on success, false if the form was not skipped.
static bool skipValue(dwarf::Form Form, DataExtractor DebugInfoData,
- uint32_t *OffsetPtr, const DWARFFormParams FormParams);
+ uint32_t *OffsetPtr,
+ const dwarf::FormParams FormParams);
private:
void dumpString(raw_ostream &OS) const;
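A small sketch of the new context-aware extraction overload; Data, FP and DCtx are assumptions for a section extractor, the unit's dwarf::FormParams and a DWARFContext, and DW_FORM_strp is just an example form.

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"

void extractOneString(const llvm::DWARFDataExtractor &Data,
                      llvm::dwarf::FormParams FP, llvm::DWARFContext &DCtx) {
  llvm::DWARFFormValue FV(llvm::dwarf::DW_FORM_strp);
  uint32_t Off = 0;
  if (FV.extractValue(Data, &Off, FP, &DCtx))
    if (llvm::Optional<const char *> Str = FV.getAsCString())
      (void)*Str; // string resolved through the context's string section
}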
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h
new file mode 100644
index 000000000000..ab12f3bc08b0
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h
@@ -0,0 +1,278 @@
+//===- DWARFListTable.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFLISTTABLE_H
+#define LLVM_DEBUGINFO_DWARFLISTTABLE_H
+
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+/// A base class for DWARF list entries, such as range or location list
+/// entries.
+struct DWARFListEntryBase {
+ /// The offset at which the entry is located in the section.
+ uint32_t Offset;
+ /// The DWARF encoding (DW_RLE_* or DW_LLE_*).
+ uint8_t EntryKind;
+ /// The index of the section this entry belongs to.
+ uint64_t SectionIndex;
+};
+
+/// A base class for lists of entries that are extracted from a particular
+/// section, such as range lists or location lists.
+template <typename ListEntryType> class DWARFListType {
+ using EntryType = ListEntryType;
+ using ListEntries = std::vector<EntryType>;
+
+protected:
+ ListEntries Entries;
+
+public:
+  // FIXME: We need to consolidate the various versions of "createError"
+ // that are used in the DWARF consumer. Until then, this is a workaround.
+ Error createError(const char *, const char *, uint32_t);
+
+ const ListEntries &getEntries() const { return Entries; }
+ bool empty() const { return Entries.empty(); }
+ void clear() { Entries.clear(); }
+ Error extract(DWARFDataExtractor Data, uint32_t HeaderOffset, uint32_t End,
+ uint32_t *OffsetPtr, StringRef SectionName,
+ StringRef ListStringName);
+};
+
+/// A class representing the header of a list table such as the range list
+/// table in the .debug_rnglists section.
+class DWARFListTableHeader {
+ struct Header {
+ /// The total length of the entries for this table, not including the length
+ /// field itself.
+ uint32_t Length = 0;
+ /// The DWARF version number.
+ uint16_t Version;
+ /// The size in bytes of an address on the target architecture. For
+ /// segmented addressing, this is the size of the offset portion of the
+ /// address.
+ uint8_t AddrSize;
+ /// The size in bytes of a segment selector on the target architecture.
+ /// If the target system uses a flat address space, this value is 0.
+ uint8_t SegSize;
+ /// The number of offsets that follow the header before the range lists.
+ uint32_t OffsetEntryCount;
+ };
+
+ Header HeaderData;
+ /// The offset table, which contains offsets to the individual list entries.
+ /// It is used by forms such as DW_FORM_rnglistx.
+ /// FIXME: Generate the table and use the appropriate forms.
+ std::vector<uint32_t> Offsets;
+ /// The table's format, either DWARF32 or DWARF64.
+ dwarf::DwarfFormat Format;
+ /// The offset at which the header (and hence the table) is located within
+ /// its section.
+ uint32_t HeaderOffset;
+ /// The name of the section the list is located in.
+ StringRef SectionName;
+ /// A characterization of the list for dumping purposes, e.g. "range" or
+ /// "location".
+ StringRef ListTypeString;
+
+public:
+ DWARFListTableHeader(StringRef SectionName, StringRef ListTypeString)
+ : SectionName(SectionName), ListTypeString(ListTypeString) {}
+
+ void clear() {
+ HeaderData = {};
+ Offsets.clear();
+ }
+ uint32_t getHeaderOffset() const { return HeaderOffset; }
+ uint8_t getAddrSize() const { return HeaderData.AddrSize; }
+ uint32_t getLength() const { return HeaderData.Length; }
+ StringRef getSectionName() const { return SectionName; }
+ StringRef getListTypeString() const { return ListTypeString; }
+ dwarf::DwarfFormat getFormat() const { return Format; }
+
+ void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const;
+ Optional<uint32_t> getOffsetEntry(uint32_t Index) const {
+ if (Index < Offsets.size())
+ return Offsets[Index];
+ return None;
+ }
+
+ /// Extract the table header and the array of offsets.
+ Error extract(DWARFDataExtractor Data, uint32_t *OffsetPtr);
+
+ /// Returns the length of the table, including the length field, or 0 if the
+ /// length has not been determined (e.g. because the table has not yet been
+ /// parsed, or there was a problem in parsing).
+ uint32_t length() const;
+};
+
+/// A class representing a table of lists as specified in the DWARF v5
+/// standard for location lists and range lists. The table consists of a header
+/// followed by an array of offsets into a DWARF section, followed by zero or
+/// more list entries. The list entries are kept in a map where the keys are
+/// the lists' section offsets.
+template <typename DWARFListType> class DWARFListTableBase {
+ DWARFListTableHeader Header;
+ /// A mapping between file offsets and lists. It is used to find a particular
+ /// list based on an offset (obtained from DW_AT_ranges, for example).
+ std::map<uint32_t, DWARFListType> ListMap;
+ /// This string is displayed as a heading before the list is dumped
+ /// (e.g. "ranges:").
+ StringRef HeaderString;
+
+protected:
+ DWARFListTableBase(StringRef SectionName, StringRef HeaderString,
+ StringRef ListTypeString)
+ : Header(SectionName, ListTypeString), HeaderString(HeaderString) {}
+
+public:
+ void clear() {
+ Header.clear();
+ ListMap.clear();
+ }
+ /// Extract the table header and the array of offsets.
+ Error extractHeaderAndOffsets(DWARFDataExtractor Data, uint32_t *OffsetPtr) {
+ return Header.extract(Data, OffsetPtr);
+ }
+ /// Extract an entire table, including all list entries.
+ Error extract(DWARFDataExtractor Data, uint32_t *OffsetPtr);
+ /// Look up a list based on a given offset. Extract it and enter it into the
+ /// list map if necessary.
+ Expected<DWARFListType> findList(DWARFDataExtractor Data, uint32_t Offset);
+
+ uint32_t getHeaderOffset() const { return Header.getHeaderOffset(); }
+ uint8_t getAddrSize() const { return Header.getAddrSize(); }
+
+ void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const;
+
+ /// Return the contents of the offset entry designated by a given index.
+ Optional<uint32_t> getOffsetEntry(uint32_t Index) const {
+ return Header.getOffsetEntry(Index);
+ }
+ /// Return the size of the table header including the length but not including
+ /// the offsets. This is dependent on the table format, which is unambiguously
+ /// derived from parsing the table.
+ uint8_t getHeaderSize() const {
+ switch (Header.getFormat()) {
+ case dwarf::DwarfFormat::DWARF32:
+ return 12;
+ case dwarf::DwarfFormat::DWARF64:
+ return 20;
+ }
+    llvm_unreachable("Invalid DWARF format (expected DWARF32 or DWARF64)");
+ }
+
+ uint32_t length() { return Header.length(); }
+};
+
+template <typename DWARFListType>
+Error DWARFListTableBase<DWARFListType>::extract(DWARFDataExtractor Data,
+ uint32_t *OffsetPtr) {
+ clear();
+ if (Error E = extractHeaderAndOffsets(Data, OffsetPtr))
+ return E;
+
+ Data.setAddressSize(Header.getAddrSize());
+ uint32_t End = getHeaderOffset() + Header.length();
+ while (*OffsetPtr < End) {
+ DWARFListType CurrentList;
+ uint32_t Off = *OffsetPtr;
+ if (Error E = CurrentList.extract(Data, getHeaderOffset(), End, OffsetPtr,
+ Header.getSectionName(),
+ Header.getListTypeString()))
+ return E;
+ ListMap[Off] = CurrentList;
+ }
+
+ assert(*OffsetPtr == End &&
+ "mismatch between expected length of table and length "
+ "of extracted data");
+ return Error::success();
+}
+
+template <typename ListEntryType>
+Error DWARFListType<ListEntryType>::extract(DWARFDataExtractor Data,
+ uint32_t HeaderOffset, uint32_t End,
+ uint32_t *OffsetPtr,
+ StringRef SectionName,
+ StringRef ListTypeString) {
+ if (*OffsetPtr < HeaderOffset || *OffsetPtr >= End)
+ return createError("invalid %s list offset 0x%" PRIx32,
+ ListTypeString.data(), *OffsetPtr);
+ Entries.clear();
+ while (*OffsetPtr < End) {
+ ListEntryType Entry;
+ if (Error E = Entry.extract(Data, End, OffsetPtr))
+ return E;
+ Entries.push_back(Entry);
+ if (Entry.isSentinel())
+ return Error::success();
+ }
+ return createError("no end of list marker detected at end of %s table "
+ "starting at offset 0x%" PRIx32,
+ SectionName.data(), HeaderOffset);
+}
+
+template <typename DWARFListType>
+void DWARFListTableBase<DWARFListType>::dump(raw_ostream &OS,
+ DIDumpOptions DumpOpts) const {
+ Header.dump(OS, DumpOpts);
+ OS << HeaderString << "\n";
+
+ // Determine the length of the longest encoding string we have in the table,
+ // so we can align the output properly. We only need this in verbose mode.
+ size_t MaxEncodingStringLength = 0;
+ if (DumpOpts.Verbose) {
+ for (const auto &List : ListMap)
+ for (const auto &Entry : List.second.getEntries())
+ MaxEncodingStringLength =
+ std::max(MaxEncodingStringLength,
+ dwarf::RangeListEncodingString(Entry.EntryKind).size());
+ }
+
+ uint64_t CurrentBase = 0;
+ for (const auto &List : ListMap)
+ for (const auto &Entry : List.second.getEntries())
+ Entry.dump(OS, getAddrSize(), MaxEncodingStringLength, CurrentBase,
+ DumpOpts);
+}
+
+template <typename DWARFListType>
+Expected<DWARFListType>
+DWARFListTableBase<DWARFListType>::findList(DWARFDataExtractor Data,
+ uint32_t Offset) {
+ auto Entry = ListMap.find(Offset);
+ if (Entry != ListMap.end())
+ return Entry->second;
+
+ // Extract the list from the section and enter it into the list map.
+ DWARFListType List;
+ uint32_t End = getHeaderOffset() + Header.length();
+ uint32_t StartingOffset = Offset;
+ if (Error E =
+ List.extract(Data, getHeaderOffset(), End, &Offset,
+ Header.getSectionName(), Header.getListTypeString()))
+ return std::move(E);
+ ListMap[StartingOffset] = List;
+ return List;
+}
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARFLISTTABLE_H
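To make the interaction of the list-table pieces concrete, here is a hedged sketch that extracts a DWARF v5 range list table and resolves one list. Data is assumed to be a DWARFDataExtractor over .debug_rnglists, ListOffset an offset obtained from something like DW_AT_ranges, and error handling is reduced to consumeError for brevity.

#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
#include "llvm/Support/Error.h"

void resolveOneRangeList(llvm::DWARFDataExtractor Data, uint32_t ListOffset) {
  llvm::DWARFDebugRnglistTable Table;
  uint32_t Offset = 0;
  if (llvm::Error E = Table.extract(Data, &Offset)) {
    llvm::consumeError(std::move(E)); // real code would report this
  } else if (auto ListOrErr = Table.findList(Data, ListOffset)) {
    // No unit base address is supplied here; DW_RLE_base_address entries
    // inside the list still take effect while converting.
    llvm::DWARFAddressRangesVector Ranges =
        ListOrErr->getAbsoluteRanges(llvm::None);
    (void)Ranges;
  } else {
    llvm::consumeError(ListOrErr.takeError());
  }
}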
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h
index 167eb2da5ba0..6e8f370f4aea 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h
@@ -42,8 +42,10 @@ public:
virtual StringRef getDebugFrameSection() const { return ""; }
virtual StringRef getEHFrameSection() const { return ""; }
virtual const DWARFSection &getLineSection() const { return Dummy; }
+ virtual StringRef getLineStringSection() const { return ""; }
virtual StringRef getStringSection() const { return ""; }
virtual const DWARFSection &getRangeSection() const { return Dummy; }
+ virtual const DWARFSection &getRnglistsSection() const { return Dummy; }
virtual StringRef getMacinfoSection() const { return ""; }
virtual StringRef getPubNamesSection() const { return ""; }
virtual StringRef getPubTypesSection() const { return ""; }
@@ -61,12 +63,14 @@ public:
return Dummy;
}
virtual const DWARFSection &getRangeDWOSection() const { return Dummy; }
+ virtual const DWARFSection &getRnglistsDWOSection() const { return Dummy; }
virtual const DWARFSection &getAddrSection() const { return Dummy; }
virtual const DWARFSection &getAppleNamesSection() const { return Dummy; }
virtual const DWARFSection &getAppleTypesSection() const { return Dummy; }
virtual const DWARFSection &getAppleNamespacesSection() const {
return Dummy;
}
+ virtual const DWARFSection &getDebugNamesSection() const { return Dummy; }
virtual const DWARFSection &getAppleObjCSection() const { return Dummy; }
virtual StringRef getCUIndexSection() const { return ""; }
virtual StringRef getGdbIndexSection() const { return ""; }
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
index a7842454f435..cb5a78ee3dbf 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
@@ -24,29 +24,21 @@ struct DWARFSection;
class raw_ostream;
class DWARFTypeUnit : public DWARFUnit {
-private:
- uint64_t TypeHash;
- uint32_t TypeOffset;
-
public:
DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
+ const DWARFUnitHeader &Header,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
StringRef SS, const DWARFSection &SOS, const DWARFSection *AOS,
const DWARFSection &LS, bool LE, bool IsDWO,
- const DWARFUnitSectionBase &UnitSection,
- const DWARFUnitIndex::Entry *Entry)
- : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
- UnitSection, Entry) {}
+ const DWARFUnitSectionBase &UnitSection)
+ : DWARFUnit(Context, Section, Header, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
+ UnitSection) {}
- uint32_t getHeaderSize() const override {
- return DWARFUnit::getHeaderSize() + 12;
- }
+ uint64_t getTypeHash() const { return getHeader().getTypeHash(); }
+ uint32_t getTypeOffset() const { return getHeader().getTypeOffset(); }
void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {});
static const DWARFSectionKind Section = DW_SECT_TYPES;
-
-protected:
- bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) override;
};
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index 3cec58383f87..988a7958184c 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -18,6 +18,7 @@
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
@@ -40,6 +41,66 @@ class DWARFContext;
class DWARFDebugAbbrev;
class DWARFUnit;
+/// Base class describing the header of any kind of "unit." Some information
+/// is specific to certain unit types. We separate this class out so we can
+/// parse the header before deciding what specific kind of unit to construct.
+class DWARFUnitHeader {
+ // Offset within section.
+ uint32_t Offset = 0;
+ // Version, address size, and DWARF format.
+ dwarf::FormParams FormParams;
+ uint32_t Length = 0;
+ uint64_t AbbrOffset = 0;
+
+ // For DWO units only.
+ const DWARFUnitIndex::Entry *IndexEntry = nullptr;
+
+ // For type units only.
+ uint64_t TypeHash = 0;
+ uint32_t TypeOffset = 0;
+
+ // For v5 split or skeleton compile units only.
+ Optional<uint64_t> DWOId;
+
+ // Unit type as parsed, or derived from the section kind.
+ uint8_t UnitType = 0;
+
+ // Size as parsed. uint8_t for compactness.
+ uint8_t Size = 0;
+
+public:
+ /// Parse a unit header from \p debug_info starting at \p offset_ptr.
+ bool extract(DWARFContext &Context, const DWARFDataExtractor &debug_info,
+ uint32_t *offset_ptr, DWARFSectionKind Kind = DW_SECT_INFO,
+ const DWARFUnitIndex *Index = nullptr);
+ uint32_t getOffset() const { return Offset; }
+ const dwarf::FormParams &getFormParams() const { return FormParams; }
+ uint16_t getVersion() const { return FormParams.Version; }
+ dwarf::DwarfFormat getFormat() const { return FormParams.Format; }
+ uint8_t getAddressByteSize() const { return FormParams.AddrSize; }
+ uint8_t getRefAddrByteSize() const { return FormParams.getRefAddrByteSize(); }
+ uint8_t getDwarfOffsetByteSize() const {
+ return FormParams.getDwarfOffsetByteSize();
+ }
+ uint32_t getLength() const { return Length; }
+ uint64_t getAbbrOffset() const { return AbbrOffset; }
+ Optional<uint64_t> getDWOId() const { return DWOId; }
+ void setDWOId(uint64_t Id) {
+ assert((!DWOId || *DWOId == Id) && "setting DWOId to a different value");
+ DWOId = Id;
+ }
+ const DWARFUnitIndex::Entry *getIndexEntry() const { return IndexEntry; }
+ uint64_t getTypeHash() const { return TypeHash; }
+ uint32_t getTypeOffset() const { return TypeOffset; }
+ uint8_t getUnitType() const { return UnitType; }
+ bool isTypeUnit() const {
+ return UnitType == dwarf::DW_UT_type || UnitType == dwarf::DW_UT_split_type;
+ }
+ uint8_t getSize() const { return Size; }
+ // FIXME: Support DWARF64.
+ uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
+};
+
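// A minimal illustrative sketch of the pattern this class enables: walk a
// .debug_info section, parsing one header at a time before deciding which
// unit kind to construct. getNextUnitOffset() adds 4 to Length because the
// DWARF32 unit_length field does not count its own size. Ctx and DebugInfo
// are placeholders for an existing context and section extractor.
static void sketchWalkUnitHeaders(llvm::DWARFContext &Ctx,
                                  const llvm::DWARFDataExtractor &DebugInfo) {
  uint32_t Offset = 0;
  while (DebugInfo.isValidOffset(Offset)) {
    llvm::DWARFUnitHeader Header;
    if (!Header.extract(Ctx, DebugInfo, &Offset))
      break;                              // malformed header; stop walking
    // Header.isTypeUnit(), getVersion(), etc. can now steer which DWARFUnit
    // subclass to build, as parseImpl() does further down in this header.
    Offset = Header.getNextUnitOffset();
  }
}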
/// Base class for all DWARFUnitSection classes. This provides the
/// functionality common to all unit types.
class DWARFUnitSectionBase {
@@ -56,7 +117,8 @@ public:
protected:
~DWARFUnitSectionBase() = default;
- virtual void parseImpl(DWARFContext &Context, const DWARFSection &Section,
+ virtual void parseImpl(DWARFContext &Context, const DWARFObject &Obj,
+ const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
StringRef SS, const DWARFSection &SOS,
const DWARFSection *AOS, const DWARFSection &LS,
@@ -116,14 +178,14 @@ public:
}
private:
- void parseImpl(DWARFContext &Context, const DWARFSection &Section,
- const DWARFDebugAbbrev *DA, const DWARFSection *RS,
- StringRef SS, const DWARFSection &SOS, const DWARFSection *AOS,
- const DWARFSection &LS, bool LE, bool IsDWO,
- bool Lazy) override {
+ void parseImpl(DWARFContext &Context, const DWARFObject &Obj,
+ const DWARFSection &Section, const DWARFDebugAbbrev *DA,
+ const DWARFSection *RS, StringRef SS, const DWARFSection &SOS,
+ const DWARFSection *AOS, const DWARFSection &LS, bool LE,
+ bool IsDWO, bool Lazy) override {
if (Parsed)
return;
- DataExtractor Data(Section.Data, LE, 0);
+ DWARFDataExtractor Data(Obj, Section, LE, 0);
if (!Parser) {
const DWARFUnitIndex *Index = nullptr;
if (IsDWO)
@@ -132,11 +194,12 @@ private:
&LS](uint32_t Offset) -> std::unique_ptr<UnitType> {
if (!Data.isValidOffset(Offset))
return nullptr;
- auto U = llvm::make_unique<UnitType>(
- Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO, *this,
- Index ? Index->getFromOffset(Offset) : nullptr);
- if (!U->extract(Data, &Offset))
+ DWARFUnitHeader Header;
+ if (!Header.extract(Context, Data, &Offset, UnitType::Section, Index))
return nullptr;
+ auto U = llvm::make_unique<UnitType>(
+ Context, Section, Header, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
+ *this);
return U;
};
}
@@ -168,9 +231,10 @@ struct BaseAddress {
/// Represents a unit's contribution to the string offsets table.
struct StrOffsetsContributionDescriptor {
uint64_t Base = 0;
+ /// The contribution size not including the header.
uint64_t Size = 0;
/// Format and version.
- DWARFFormParams FormParams = {0, 0, dwarf::DwarfFormat::DWARF32};
+ dwarf::FormParams FormParams = {0, 0, dwarf::DwarfFormat::DWARF32};
StrOffsetsContributionDescriptor(uint64_t Base, uint64_t Size,
uint8_t Version, dwarf::DwarfFormat Format)
@@ -193,6 +257,7 @@ class DWARFUnit {
/// Section containing this DWARFUnit.
const DWARFSection &InfoSection;
+ DWARFUnitHeader Header;
const DWARFDebugAbbrev *Abbrev;
const DWARFSection *RangeSection;
uint32_t RangeSectionBase;
@@ -205,63 +270,28 @@ class DWARFUnit {
bool isDWO;
const DWARFUnitSectionBase &UnitSection;
- // Version, address size, and DWARF format.
- DWARFFormParams FormParams;
/// Start, length, and DWARF format of the unit's contribution to the string
/// offsets table (DWARF v5).
Optional<StrOffsetsContributionDescriptor> StringOffsetsTableContribution;
- uint32_t Offset;
- uint32_t Length;
+ /// A table of range lists (DWARF v5 and later).
+ Optional<DWARFDebugRnglistTable> RngListTable;
+
mutable const DWARFAbbreviationDeclarationSet *Abbrevs;
- uint64_t AbbrOffset;
- uint8_t UnitType;
llvm::Optional<BaseAddress> BaseAddr;
/// The compile unit debug information entry items.
std::vector<DWARFDebugInfoEntry> DieArray;
- /// The vector of inlined subroutine DIEs that we can map directly to from
- /// their subprogram below.
- std::vector<DWARFDie> InlinedSubroutineDIEs;
-
- /// A type representing a subprogram DIE and a map (built using a sorted
- /// vector) into that subprogram's inlined subroutine DIEs.
- struct SubprogramDIEAddrInfo {
- DWARFDie SubprogramDIE;
-
- uint64_t SubprogramBasePC;
-
- /// A vector sorted to allow mapping from a relative PC to the inlined
- /// subroutine DIE with the most specific address range covering that PC.
- ///
- /// The PCs are relative to the `SubprogramBasePC`.
- ///
- /// The vector is sorted in ascending order of the first int which
- /// represents the relative PC for an interval in the map. The second int
- /// represents the index into the `InlinedSubroutineDIEs` vector of the DIE
- /// that interval maps to. An index of '-1` indicates an empty mapping. The
- /// interval covered is from the `.first` relative PC to the next entry's
- /// `.first` relative PC.
- std::vector<std::pair<uint32_t, int32_t>> InlinedSubroutineDIEAddrMap;
- };
-
- /// Vector of the subprogram DIEs and their subroutine address maps.
- std::vector<SubprogramDIEAddrInfo> SubprogramDIEAddrInfos;
-
- /// A vector sorted to allow mapping from a PC to the subprogram DIE (and
- /// associated addr map) index. Subprograms with overlapping PC ranges aren't
- /// supported here. Nothing will crash, but the mapping may be inaccurate.
- /// This vector may also contain "empty" ranges marked by an address with
- /// a DIE index of '-1'.
- std::vector<std::pair<uint64_t, int64_t>> SubprogramDIEAddrMap;
+ /// Map from range's start address to end address and corresponding DIE.
+ /// IntervalMap does not support range removal; as a result, we use
+ /// std::map::upper_bound for address range lookup.
+ std::map<uint64_t, std::pair<uint64_t, DWARFDie>> AddrDieMap;
using die_iterator_range =
iterator_range<std::vector<DWARFDebugInfoEntry>::iterator>;
std::shared_ptr<DWARFUnit> DWO;
- const DWARFUnitIndex::Entry *IndexEntry;
-
uint32_t getDIEIndex(const DWARFDebugInfoEntry *Die) {
auto First = DieArray.data();
assert(Die >= First && Die < First + DieArray.size());
@@ -269,10 +299,10 @@ class DWARFUnit {
}
protected:
- virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr);
+ const DWARFUnitHeader &getHeader() const { return Header; }
- /// Size in bytes of the unit header.
- virtual uint32_t getHeaderSize() const { return getVersion() <= 4 ? 11 : 12; }
+ /// Size in bytes of the parsed unit header.
+ uint32_t getHeaderSize() const { return Header.getSize(); }
/// Find the unit's contribution to the string offsets table and determine its
/// length and form. The given offset is expected to be derived from the unit
@@ -291,16 +321,28 @@ protected:
public:
DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
+ const DWARFUnitHeader &Header,
const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS,
const DWARFSection &SOS, const DWARFSection *AOS,
const DWARFSection &LS, bool LE, bool IsDWO,
- const DWARFUnitSectionBase &UnitSection,
- const DWARFUnitIndex::Entry *IndexEntry = nullptr);
+ const DWARFUnitSectionBase &UnitSection);
virtual ~DWARFUnit();
DWARFContext& getContext() const { return Context; }
-
+ uint32_t getOffset() const { return Header.getOffset(); }
+ const dwarf::FormParams &getFormParams() const {
+ return Header.getFormParams();
+ }
+ uint16_t getVersion() const { return Header.getVersion(); }
+ uint8_t getAddressByteSize() const { return Header.getAddressByteSize(); }
+ uint8_t getRefAddrByteSize() const { return Header.getRefAddrByteSize(); }
+ uint8_t getDwarfOffsetByteSize() const {
+ return Header.getDwarfOffsetByteSize();
+ }
+ uint32_t getLength() const { return Header.getLength(); }
+ uint8_t getUnitType() const { return Header.getUnitType(); }
+ uint32_t getNextUnitOffset() const { return Header.getNextUnitOffset(); }
const DWARFSection &getLineSection() const { return LineSection; }
StringRef getStringSection() const { return StringSection; }
const DWARFSection &getStringOffsetSection() const {
@@ -312,6 +354,9 @@ public:
AddrOffsetSectionBase = Base;
}
+ /// Recursively update address to Die map.
+ void updateAddressDieMap(DWARFDie Die);
+
void setRangesSection(const DWARFSection *RS, uint32_t Base) {
RangeSection = RS;
RangeSectionBase = Base;
@@ -326,31 +371,18 @@ public:
return DataExtractor(StringSection, false, 0);
}
-
- bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
-
- /// extractRangeList - extracts the range list referenced by this compile
- /// unit from .debug_ranges section. Returns true on success.
- /// Requires that compile unit is already extracted.
- bool extractRangeList(uint32_t RangeListOffset,
- DWARFDebugRangeList &RangeList) const;
+ /// Extract the range list referenced by this compile unit from the
+ /// .debug_ranges section. If the extraction is unsuccessful, an error
+ /// is returned. Successful extraction requires that the compile unit
+ /// has already been extracted.
+ Error extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const;
void clear();
- uint32_t getOffset() const { return Offset; }
- uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
- uint32_t getLength() const { return Length; }
const Optional<StrOffsetsContributionDescriptor> &
getStringOffsetsTableContribution() const {
return StringOffsetsTableContribution;
}
- const DWARFFormParams &getFormParams() const { return FormParams; }
- uint16_t getVersion() const { return FormParams.Version; }
- dwarf::DwarfFormat getFormat() const { return FormParams.Format; }
- uint8_t getAddressByteSize() const { return FormParams.AddrSize; }
- uint8_t getRefAddrByteSize() const { return FormParams.getRefAddrByteSize(); }
- uint8_t getDwarfOffsetByteSize() const {
- return FormParams.getDwarfOffsetByteSize();
- }
uint8_t getDwarfStringOffsetsByteSize() const {
assert(StringOffsetsTableContribution);
@@ -364,8 +396,6 @@ public:
const DWARFAbbreviationDeclarationSet *getAbbreviations() const;
- uint8_t getUnitType() const { return UnitType; }
-
static bool isMatchingUnitTypeAndTag(uint8_t UnitType, dwarf::Tag Tag) {
switch (UnitType) {
case dwarf::DW_UT_compile:
@@ -383,7 +413,7 @@ public:
return false;
}
- /// \brief Return the number of bytes for the header of a unit of
+ /// Return the number of bytes for the header of a unit of
/// UnitType type.
///
/// This function must be called with a valid unit type which in
@@ -403,9 +433,7 @@ public:
llvm_unreachable("Invalid UnitType.");
}
- llvm::Optional<BaseAddress> getBaseAddress() const { return BaseAddr; }
-
- void setBaseAddress(BaseAddress BaseAddr) { this->BaseAddr = BaseAddr; }
+ llvm::Optional<BaseAddress> getBaseAddress();
DWARFDie getUnitDIE(bool ExtractUnitDIEOnly = true) {
extractDIEsIfNeeded(ExtractUnitDIEOnly);
@@ -415,7 +443,29 @@ public:
}
const char *getCompilationDir();
- Optional<uint64_t> getDWOId();
+ Optional<uint64_t> getDWOId() {
+ extractDIEsIfNeeded(/*CUDieOnly*/ true);
+ return getHeader().getDWOId();
+ }
+ void setDWOId(uint64_t NewID) { Header.setDWOId(NewID); }
+
+ /// Return a vector of address ranges resulting from a (possibly encoded)
+ /// range list starting at a given offset in the appropriate ranges section.
+ Expected<DWARFAddressRangesVector> findRnglistFromOffset(uint32_t Offset);
+
+ /// Return a vector of address ranges retrieved from an encoded range
+ /// list whose offset is found via a table lookup given an index (DWARF v5
+ /// and later).
+ Expected<DWARFAddressRangesVector> findRnglistFromIndex(uint32_t Index);
+
+ /// Return a rangelist's offset based on an index. The index designates
+ /// an entry in the rangelist table's offset array and is supplied by
+ /// DW_FORM_rnglistx.
+ Optional<uint32_t> getRnglistOffset(uint32_t Index) {
+ if (RngListTable)
+ return RngListTable->getOffsetEntry(Index);
+ return None;
+ }
void collectAddressRanges(DWARFAddressRangesVector &CURanges);
@@ -433,14 +483,14 @@ public:
/// getUnitSection - Return the DWARFUnitSection containing this unit.
const DWARFUnitSectionBase &getUnitSection() const { return UnitSection; }
- /// \brief Returns the number of DIEs in the unit. Parses the unit
+ /// Returns the number of DIEs in the unit. Parses the unit
/// if necessary.
unsigned getNumDIEs() {
extractDIEsIfNeeded(false);
return DieArray.size();
}
- /// \brief Return the index of a DIE inside the unit's DIE vector.
+ /// Return the index of a DIE inside the unit's DIE vector.
///
/// It is illegal to call this method with a DIE that hasn't been
/// created by this unit. In other words, it's illegal to call this
@@ -450,7 +500,7 @@ public:
return getDIEIndex(D.getDebugInfoEntry());
}
- /// \brief Return the DIE object at the given index.
+ /// Return the DIE object at the given index.
DWARFDie getDIEAtIndex(unsigned Index) {
assert(Index < DieArray.size());
return DWARFDie(this, &DieArray[Index]);
@@ -458,9 +508,11 @@ public:
DWARFDie getParent(const DWARFDebugInfoEntry *Die);
DWARFDie getSibling(const DWARFDebugInfoEntry *Die);
+ DWARFDie getPreviousSibling(const DWARFDebugInfoEntry *Die);
DWARFDie getFirstChild(const DWARFDebugInfoEntry *Die);
+ DWARFDie getLastChild(const DWARFDebugInfoEntry *Die);
- /// \brief Return the DIE object for a given offset inside the
+ /// Return the DIE object for a given offset inside the
/// unit's DIE vector.
///
/// The unit needs to have its DIEs extracted for this method to work.
@@ -478,7 +530,7 @@ public:
}
uint32_t getLineTableOffset() const {
- if (IndexEntry)
+ if (auto IndexEntry = Header.getIndexEntry())
if (const auto *Contrib = IndexEntry->getOffset(DW_SECT_LINE))
return Contrib->Offset;
return 0;
@@ -491,7 +543,9 @@ public:
private:
/// Size in bytes of the .debug_info data associated with this compile unit.
- size_t getDebugInfoSize() const { return Length + 4 - getHeaderSize(); }
+ size_t getDebugInfoSize() const {
+ return Header.getLength() + 4 - getHeaderSize();
+ }
/// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
/// hasn't already been done. Returns the number of DIEs parsed at this call.
@@ -507,9 +561,6 @@ private:
/// parseDWO - Parses .dwo file for current compile unit. Returns true if
/// it was actually constructed.
bool parseDWO();
-
- void buildSubprogramDIEAddrMap();
- void buildInlinedSubroutineDIEAddrMap(SubprogramDIEAddrInfo &SPInfo);
};
} // end namespace llvm
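// A self-contained illustrative sketch of the lookup pattern behind the
// AddrDieMap member above: ranges are keyed by their start address, and
// std::map::upper_bound finds the single candidate range that can cover a
// queried address. DieT stands in for DWARFDie.
#include <cstdint>
#include <map>
#include <utility>

template <typename DieT>
DieT lookupByAddress(const std::map<uint64_t, std::pair<uint64_t, DieT>> &Map,
                     uint64_t Address) {
  auto It = Map.upper_bound(Address); // first range starting after Address
  if (It == Map.begin())
    return DieT();                    // Address precedes every recorded range
  --It;                               // predecessor is the only candidate
  if (Address < It->second.first)     // second.first is the exclusive end
    return It->second.second;
  return DieT();                      // Address falls in a gap between ranges
}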
diff --git a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 0d920abe3231..a829510a219d 100644
--- a/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/contrib/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -11,7 +11,8 @@
#define LLVM_DEBUGINFO_DWARF_DWARFVERIFIER_H
#include "llvm/DebugInfo/DIContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include <cstdint>
@@ -24,7 +25,7 @@ struct DWARFAttribute;
class DWARFContext;
class DWARFDie;
class DWARFUnit;
-class DWARFAcceleratorTable;
+class DWARFCompileUnit;
class DWARFDataExtractor;
class DWARFDebugAbbrev;
class DataExtractor;
@@ -109,7 +110,7 @@ private:
/// \param Abbrev Pointer to the abbreviations section we are verifying
/// Abbrev can be a pointer to either .debug_abbrev or debug_abbrev.dwo.
///
- /// \returns The number of errors that occured during verification.
+ /// \returns The number of errors that occurred during verification.
unsigned verifyAbbrevSection(const DWARFDebugAbbrev *Abbrev);
/// Verifies the header of a unit in the .debug_info section.
@@ -151,14 +152,14 @@ private:
/// type of the unit DIE.
///
/// \returns true if the content is verified successfully, false otherwise.
- bool verifyUnitContents(DWARFUnit Unit, uint8_t UnitType = 0);
+ bool verifyUnitContents(DWARFUnit &Unit, uint8_t UnitType = 0);
/// Verify that all Die ranges are valid.
///
/// This function currently checks for:
/// - cases in which lowPC >= highPC
///
- /// \returns Number of errors that occured during verification.
+ /// \returns Number of errors that occurred during verification.
unsigned verifyDieRanges(const DWARFDie &Die, DieRangeInfo &ParentRI);
/// Verifies the attribute's DWARF attribute and its value.
@@ -170,7 +171,7 @@ private:
/// \param Die The DWARF DIE that owns the attribute value
/// \param AttrValue The DWARF attribute value to check
///
- /// \returns NumErrors The number of errors occured during verification of
+ /// \returns NumErrors The number of errors occurred during verification of
/// attributes' values in a .debug_info section unit
unsigned verifyDebugInfoAttribute(const DWARFDie &Die,
DWARFAttribute &AttrValue);
@@ -185,7 +186,7 @@ private:
/// \param Die The DWARF DIE that owns the attribute value
/// \param AttrValue The DWARF attribute value to check
///
- /// \returns NumErrors The number of errors occured during verification of
+ /// \returns NumErrors The number of errors occurred during verification of
/// attributes' forms in a .debug_info section unit
unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue);
@@ -197,11 +198,11 @@ private:
/// around, that it doesn't create invalid references by failing to relocate
/// CU relative and absolute references.
///
- /// \returns NumErrors The number of errors occured during verification of
+ /// \returns NumErrors The number of errors occurred during verification of
/// references for the .debug_info section
unsigned verifyDebugInfoReferences();
- /// Verify the the DW_AT_stmt_list encoding and value and ensure that no
+ /// Verify the DW_AT_stmt_list encoding and value and ensure that no
/// compile units that have the same DW_AT_stmt_list value.
void verifyDebugLineStmtOffsets();
@@ -228,9 +229,42 @@ private:
/// \param StrData pointer to the string section
/// \param SectionName the name of the table we're verifying
///
- /// \returns The number of errors occured during verification
- unsigned verifyAccelTable(const DWARFSection *AccelSection,
- DataExtractor *StrData, const char *SectionName);
+ /// \returns The number of errors occurred during verification
+ unsigned verifyAppleAccelTable(const DWARFSection *AccelSection,
+ DataExtractor *StrData,
+ const char *SectionName);
+
+ unsigned verifyDebugNamesCULists(const DWARFDebugNames &AccelTable);
+ unsigned verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI,
+ const DataExtractor &StrData);
+ unsigned verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI);
+ unsigned verifyNameIndexAttribute(const DWARFDebugNames::NameIndex &NI,
+ const DWARFDebugNames::Abbrev &Abbr,
+ DWARFDebugNames::AttributeEncoding AttrEnc);
+ unsigned verifyNameIndexEntries(const DWARFDebugNames::NameIndex &NI,
+ const DWARFDebugNames::NameTableEntry &NTE);
+ unsigned verifyNameIndexCompleteness(const DWARFDie &Die,
+ const DWARFDebugNames::NameIndex &NI);
+
+ /// Verify that the DWARF v5 accelerator table is valid.
+ ///
+ /// This function currently checks that:
+ /// - Headers of individual Name Indices fit into the section and can be parsed.
+ /// - Abbreviation tables can be parsed and contain valid index attributes
+ /// with correct form encodings.
+ /// - The CU lists reference existing compile units.
+ /// - The buckets have a valid index, or they are empty.
+ /// - All names are reachable via the hash table (they have the correct hash,
+ /// and the hash is in the correct bucket).
+ /// - Information in the index entries is complete (all required entries are
+ /// present) and consistent with the debug_info section DIEs.
+ ///
+ /// \param AccelSection section containing the acceleration table
+ /// \param StrData string section
+ ///
+ /// \returns The number of errors occurred during verification
+ unsigned verifyDebugNames(const DWARFSection &AccelSection,
+ const DataExtractor &StrData);
public:
DWARFVerifier(raw_ostream &S, DWARFContext &D,
diff --git a/contrib/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h b/contrib/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h
index 19e5c31b3076..3de98c4ecba8 100644
--- a/contrib/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h
+++ b/contrib/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h
@@ -20,11 +20,13 @@
#include <vector>
namespace llvm {
+class FileBufferByteStream;
+class WritableBinaryStream;
namespace msf {
class MSFBuilder {
public:
- /// \brief Create a new `MSFBuilder`.
+ /// Create a new `MSFBuilder`.
///
/// \param BlockSize The internal block size used by the PDB file. See
/// isValidBlockSize() for a list of valid block sizes.
@@ -109,7 +111,10 @@ public:
/// Finalize the layout and build the headers and structures that describe the
/// MSF layout and can be written directly to the MSF file.
- Expected<MSFLayout> build();
+ Expected<MSFLayout> generateLayout();
+
+ /// Write the MSF layout to the underlying file.
+ Expected<FileBufferByteStream> commit(StringRef Path, MSFLayout &Layout);
BumpPtrAllocator &getAllocator() { return Allocator; }
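// A minimal illustrative sketch of the split introduced above: layout
// generation is now separate from writing, so a caller can inspect the
// MSFLayout before committing it. Builder and OutPath are placeholders, and a
// real caller keeps the returned FileBufferByteStream around until the data
// has been flushed to disk.
static llvm::Error sketchWriteMsf(llvm::msf::MSFBuilder &Builder,
                                  llvm::StringRef OutPath) {
  auto Layout = Builder.generateLayout();          // plan blocks and streams
  if (!Layout)
    return Layout.takeError();
  auto Buffer = Builder.commit(OutPath, *Layout);  // write the planned layout
  if (!Buffer)
    return Buffer.takeError();
  return llvm::Error::success();
}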
diff --git a/contrib/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h b/contrib/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h
index f28415d4e603..2db2b71df4a7 100644
--- a/contrib/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h
+++ b/contrib/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h
@@ -52,6 +52,16 @@ struct SuperBlock {
struct MSFLayout {
MSFLayout() = default;
+ uint32_t mainFpmBlock() const {
+ assert(SB->FreeBlockMapBlock == 1 || SB->FreeBlockMapBlock == 2);
+ return SB->FreeBlockMapBlock;
+ }
+
+ uint32_t alternateFpmBlock() const {
+ // If mainFpmBlock is 1, this is 2. If mainFpmBlock is 2, this is 1.
+ return 3U - mainFpmBlock();
+ }
+
const SuperBlock *SB = nullptr;
BitVector FreePageMap;
ArrayRef<support::ulittle32_t> DirectoryBlocks;
@@ -59,7 +69,7 @@ struct MSFLayout {
std::vector<ArrayRef<support::ulittle32_t>> StreamMap;
};
-/// \brief Describes the layout of a stream in an MSF layout. A "stream" here
+/// Describes the layout of a stream in an MSF layout. A "stream" here
/// is defined as any logical unit of data which may be arranged inside the MSF
/// file as a sequence of (possibly discontiguous) blocks. When we want to read
/// from a particular MSF Stream, we fill out a stream layout structure and the
@@ -71,7 +81,7 @@ public:
std::vector<support::ulittle32_t> Blocks;
};
-/// \brief Determine the layout of the FPM stream, given the MSF layout. An FPM
+/// Determine the layout of the FPM stream, given the MSF layout. An FPM
/// stream spans 1 or more blocks, each at equally spaced intervals throughout
/// the file.
MSFStreamLayout getFpmStreamLayout(const MSFLayout &Msf,
@@ -108,14 +118,40 @@ inline uint32_t getFpmIntervalLength(const MSFLayout &L) {
return L.SB->BlockSize;
}
-inline uint32_t getNumFpmIntervals(const MSFLayout &L,
- bool IncludeUnusedFpmData = false) {
- if (IncludeUnusedFpmData)
- return divideCeil(L.SB->NumBlocks, L.SB->BlockSize);
+/// Given an MSF with the specified block size and number of blocks, determine
+/// how many pieces the specified Fpm is split into.
+/// \p BlockSize - the block size of the MSF
+/// \p NumBlocks - the total number of blocks in the MSF
+/// \p IncludeUnusedFpmData - When true, this will count every block that is
+/// both in the file and matches the form of an FPM block, even if some of
+/// those FPM blocks are unused (a single FPM block can describe the
+/// allocation status of up to 32,767 blocks, although one appears only
+/// every 4,096 blocks). So there are 8x as many blocks that match the
+/// form as there are blocks that are necessary to describe the allocation
+/// status of the file. When this parameter is false, these extraneous
+/// trailing blocks are not counted.
+inline uint32_t getNumFpmIntervals(uint32_t BlockSize, uint32_t NumBlocks,
+ bool IncludeUnusedFpmData, int FpmNumber) {
+ assert(FpmNumber == 1 || FpmNumber == 2);
+ if (IncludeUnusedFpmData) {
+ // This calculation determines how many times a number of the form
+ // BlockSize * k + N appears in the range [0, NumBlocks). We only need to
+ // do this when unused data is included, since the number of blocks dwarfs
+ // the number of fpm blocks.
+ return divideCeil(NumBlocks - FpmNumber, BlockSize);
+ }
// We want the minimum number of intervals required, where each interval can
// represent BlockSize * 8 blocks.
- return divideCeil(L.SB->NumBlocks, 8 * L.SB->BlockSize);
+ return divideCeil(NumBlocks, 8 * BlockSize);
+}
+
+inline uint32_t getNumFpmIntervals(const MSFLayout &L,
+ bool IncludeUnusedFpmData = false,
+ bool AltFpm = false) {
+ return getNumFpmIntervals(L.SB->BlockSize, L.SB->NumBlocks,
+ IncludeUnusedFpmData,
+ AltFpm ? L.alternateFpmBlock() : L.mainFpmBlock());
}
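// Worked example (illustrative only) for the helpers above, assuming
// BlockSize = 4096, NumBlocks = 10000, and the main FPM at block 1:
//   - IncludeUnusedFpmData = true counts every block of the form
//     4096 * k + 1 in [0, 10000): divideCeil(10000 - 1, 4096) = 3 intervals.
//   - IncludeUnusedFpmData = false counts only what is needed to describe
//     10000 blocks at 8 * 4096 = 32768 blocks per FPM block:
//     divideCeil(10000, 32768) = 1 interval.
//   - alternateFpmBlock() is simply 3 - mainFpmBlock(), i.e. block 2 here.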
Error validateSuperBlock(const SuperBlock &SB);
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h
new file mode 100644
index 000000000000..39490a4b2209
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h
@@ -0,0 +1,40 @@
+//==- DIAEnumInjectedSources.h - DIA Injected Sources Enumerator -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMINJECTEDSOURCES_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMINJECTEDSOURCES_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
+
+namespace llvm {
+namespace pdb {
+class DIASession;
+
+class DIAEnumInjectedSources : public IPDBEnumChildren<IPDBInjectedSource> {
+public:
+ explicit DIAEnumInjectedSources(
+ const DIASession &PDBSession,
+ CComPtr<IDiaEnumInjectedSources> DiaEnumerator);
+
+ uint32_t getChildCount() const override;
+ ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+ ChildTypePtr getNext() override;
+ void reset() override;
+ DIAEnumInjectedSources *clone() const override;
+
+private:
+ const DIASession &Session;
+ CComPtr<IDiaEnumInjectedSources> Enumerator;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_DIA_DIAENUMINJECTEDSOURCES_H
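// A minimal usage sketch (illustrative only; Session stands for an already
// opened pdb::IPDBSession) of enumerating injected sources through the
// interface this class implements:
//
//   auto Sources = Session->getInjectedSources();
//   while (auto IS = Sources->getNext())
//     llvm::outs() << IS->getFileName() << ", " << IS->getCodeByteSize()
//                  << " bytes\n";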
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h
new file mode 100644
index 000000000000..52c9563b5d5f
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h
@@ -0,0 +1,40 @@
+//==- DIAEnumSectionContribs.h --------------------------------- -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMSECTIONCONTRIBS_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMSECTIONCONTRIBS_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBSectionContrib.h"
+
+namespace llvm {
+namespace pdb {
+class DIASession;
+
+class DIAEnumSectionContribs : public IPDBEnumChildren<IPDBSectionContrib> {
+public:
+ explicit DIAEnumSectionContribs(
+ const DIASession &PDBSession,
+ CComPtr<IDiaEnumSectionContribs> DiaEnumerator);
+
+ uint32_t getChildCount() const override;
+ ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+ ChildTypePtr getNext() override;
+ void reset() override;
+ DIAEnumSectionContribs *clone() const override;
+
+private:
+ const DIASession &Session;
+ CComPtr<IDiaEnumSectionContribs> Enumerator;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_DIA_DIAENUMSECTIONCONTRIBS_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAInjectedSource.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAInjectedSource.h
new file mode 100644
index 000000000000..635508da84ea
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAInjectedSource.h
@@ -0,0 +1,38 @@
+//===- DIAInjectedSource.h - DIA impl for IPDBInjectedSource ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAINJECTEDSOURCE_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAINJECTEDSOURCE_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
+
+namespace llvm {
+namespace pdb {
+class DIASession;
+
+class DIAInjectedSource : public IPDBInjectedSource {
+public:
+ explicit DIAInjectedSource(CComPtr<IDiaInjectedSource> DiaSourceFile);
+
+ uint32_t getCrc32() const override;
+ uint64_t getCodeByteSize() const override;
+ std::string getFileName() const override;
+ std::string getObjectFileName() const override;
+ std::string getVirtualFileName() const override;
+ PDB_SourceCompression getCompression() const override;
+ std::string getCode() const override;
+
+private:
+ CComPtr<IDiaInjectedSource> SourceFile;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_DIA_DIAINJECTEDSOURCE_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h
index 2d6c44905ce0..dfb35647055a 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h
@@ -30,10 +30,31 @@ public:
findChildren(PDB_SymType Type, StringRef Name,
PDB_NameSearchFlags Flags) const override;
std::unique_ptr<IPDBEnumSymbols>
+ findChildrenByAddr(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags,
+ uint32_t Section, uint32_t Offset) const override;
+ std::unique_ptr<IPDBEnumSymbols>
+ findChildrenByVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
+ uint64_t VA) const override;
+ std::unique_ptr<IPDBEnumSymbols>
findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
uint32_t RVA) const override;
+
+ std::unique_ptr<IPDBEnumSymbols>
+ findInlineFramesByAddr(uint32_t Section, uint32_t Offset) const override;
std::unique_ptr<IPDBEnumSymbols>
findInlineFramesByRVA(uint32_t RVA) const override;
+ std::unique_ptr<IPDBEnumSymbols>
+ findInlineFramesByVA(uint64_t VA) const override;
+
+ std::unique_ptr<IPDBEnumLineNumbers> findInlineeLines() const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByAddr(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByVA(uint64_t VA, uint32_t Length) const override;
void getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const override;
void getFrontEndVersion(VersionInfo &Version) const override;
@@ -82,6 +103,7 @@ public:
uint32_t getSizeInUdt() const override;
uint32_t getSlot() const override;
std::string getSourceFileName() const override;
+ std::unique_ptr<IPDBLineNumber> getSrcLineOnTypeDefn() const override;
uint32_t getStride() const override;
uint32_t getSubTypeId() const override;
std::string getSymbolsFileName() const override;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASectionContrib.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASectionContrib.h
new file mode 100644
index 000000000000..4688f1f91a89
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASectionContrib.h
@@ -0,0 +1,55 @@
+//===- DIASectionContrib.h - DIA Impl. of IPDBSectionContrib ------ C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIASECTIONCONTRIB_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIASECTIONCONTRIB_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBSectionContrib.h"
+
+namespace llvm {
+namespace pdb {
+class DIASession;
+
+class DIASectionContrib : public IPDBSectionContrib {
+public:
+ explicit DIASectionContrib(const DIASession &PDBSession,
+ CComPtr<IDiaSectionContrib> DiaSection);
+
+ std::unique_ptr<PDBSymbolCompiland> getCompiland() const override;
+ uint32_t getAddressSection() const override;
+ uint32_t getAddressOffset() const override;
+ uint32_t getRelativeVirtualAddress() const override;
+ uint64_t getVirtualAddress() const override;
+ uint32_t getLength() const override;
+ bool isNotPaged() const override;
+ bool hasCode() const override;
+ bool hasCode16Bit() const override;
+ bool hasInitializedData() const override;
+ bool hasUninitializedData() const override;
+ bool isRemoved() const override;
+ bool hasComdat() const override;
+ bool isDiscardable() const override;
+ bool isNotCached() const override;
+ bool isShared() const override;
+ bool isExecutable() const override;
+ bool isReadable() const override;
+ bool isWritable() const override;
+ uint32_t getDataCrc32() const override;
+ uint32_t getRelocationsCrc32() const override;
+ uint32_t getCompilandId() const override;
+
+private:
+ const DIASession &Session;
+ CComPtr<IDiaSectionContrib> Section;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_DIA_DIASECTIONCONTRIB_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASession.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASession.h
index 66bd7a7e9c4e..a63659439389 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASession.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASession.h
@@ -30,18 +30,33 @@ public:
std::unique_ptr<IPDBSession> &Session);
uint64_t getLoadAddress() const override;
- void setLoadAddress(uint64_t Address) override;
+ bool setLoadAddress(uint64_t Address) override;
std::unique_ptr<PDBSymbolExe> getGlobalScope() override;
std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const override;
+ bool addressForVA(uint64_t VA, uint32_t &Section,
+ uint32_t &Offset) const override;
+ bool addressForRVA(uint32_t RVA, uint32_t &Section,
+ uint32_t &Offset) const override;
+
std::unique_ptr<PDBSymbol>
findSymbolByAddress(uint64_t Address, PDB_SymType Type) const override;
+ std::unique_ptr<PDBSymbol> findSymbolByRVA(uint32_t RVA,
+ PDB_SymType Type) const override;
+ std::unique_ptr<PDBSymbol>
+ findSymbolBySectOffset(uint32_t Section, uint32_t Offset,
+ PDB_SymType Type) const override;
std::unique_ptr<IPDBEnumLineNumbers>
findLineNumbers(const PDBSymbolCompiland &Compiland,
const IPDBSourceFile &File) const override;
std::unique_ptr<IPDBEnumLineNumbers>
findLineNumbersByAddress(uint64_t Address, uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findLineNumbersByRVA(uint32_t RVA, uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findLineNumbersBySectOffset(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const override;
std::unique_ptr<IPDBEnumSourceFiles>
findSourceFiles(const PDBSymbolCompiland *Compiland, llvm::StringRef Pattern,
@@ -65,9 +80,14 @@ public:
std::unique_ptr<IPDBEnumDataStreams> getDebugStreams() const override;
std::unique_ptr<IPDBEnumTables> getEnumTables() const override;
+
+ std::unique_ptr<IPDBEnumInjectedSources> getInjectedSources() const override;
+
+ std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
+
private:
CComPtr<IDiaSession> Session;
};
-}
-}
+} // namespace pdb
+} // namespace llvm
#endif
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASupport.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASupport.h
index 3b4a348289df..92ebc04ae5a4 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASupport.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIASupport.h
@@ -22,14 +22,6 @@
#define NOMINMAX
#endif
-// llvm/Support/Debug.h unconditionally #defines DEBUG as a macro.
-// DIA headers #define it if it is not already defined, so we have
-// an order of includes problem. The real fix is to make LLVM use
-// something less generic than DEBUG, such as LLVM_DEBUG(), but it's
-// fairly prevalent. So for now, we save the definition state and
-// restore it.
-#pragma push_macro("DEBUG")
-
// atlbase.h has to come before windows.h
#include <atlbase.h>
#include <windows.h>
@@ -39,6 +31,4 @@
#include <dia2.h>
#include <diacreate.h>
-#pragma pop_macro("DEBUG")
-
#endif // LLVM_DEBUGINFO_PDB_DIA_DIASUPPORT_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAUtils.h b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAUtils.h
new file mode 100644
index 000000000000..aa843e05de70
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/DIA/DIAUtils.h
@@ -0,0 +1,31 @@
+//===- DIAUtils.h - Utility functions for working with DIA ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAUTILS_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAUTILS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/ConvertUTF.h"
+
+template <typename Obj>
+std::string invokeBstrMethod(Obj &Object,
+ HRESULT (__stdcall Obj::*Func)(BSTR *)) {
+ CComBSTR Str16;
+ HRESULT Result = (Object.*Func)(&Str16);
+ if (S_OK != Result)
+ return std::string();
+
+ std::string Str8;
+ llvm::ArrayRef<char> StrBytes(reinterpret_cast<char *>(Str16.m_str),
+ Str16.ByteLength());
+ llvm::convertUTF16ToUTF8String(StrBytes, Str8);
+ return Str8;
+}
+
+#endif // LLVM_DEBUGINFO_PDB_DIA_DIAUTILS_H
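// A minimal usage sketch (illustrative only): invokeBstrMethod wraps the
// common DIA pattern of calling a COM getter that fills in a BSTR, converting
// the UTF-16 result to UTF-8 and returning an empty string on any failing
// HRESULT. Assuming some DIA interface IFoo with HRESULT get_name(BSTR *)
// (a hypothetical name), a call site looks like:
//
//   CComPtr<IFoo> Foo = ...;
//   std::string Name = invokeBstrMethod(*Foo, &IFoo::get_name);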
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h
new file mode 100644
index 000000000000..e75d64af92bb
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h
@@ -0,0 +1,42 @@
+//===- IPDBInjectedSource.h - base class for PDB injected file --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBINJECTEDSOURCE_H
+#define LLVM_DEBUGINFO_PDB_IPDBINJECTEDSOURCE_H
+
+#include "PDBTypes.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+#include <string>
+
+namespace llvm {
+class raw_ostream;
+
+namespace pdb {
+
+/// IPDBInjectedSource defines an interface used to represent source files
+/// which were injected directly into the PDB file during the compilation
+/// process. This is used, for example, to add natvis files to a PDB, but
+/// in theory could be used to add arbitrary source code.
+class IPDBInjectedSource {
+public:
+ virtual ~IPDBInjectedSource();
+
+ virtual uint32_t getCrc32() const = 0;
+ virtual uint64_t getCodeByteSize() const = 0;
+ virtual std::string getFileName() const = 0;
+ virtual std::string getObjectFileName() const = 0;
+ virtual std::string getVirtualFileName() const = 0;
+ virtual PDB_SourceCompression getCompression() const = 0;
+ virtual std::string getCode() const = 0;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_IPDBINJECTEDSOURCE_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
index 18b9423378a0..bcb2eaa35630 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
@@ -42,10 +42,31 @@ public:
findChildren(PDB_SymType Type, StringRef Name,
PDB_NameSearchFlags Flags) const = 0;
virtual std::unique_ptr<IPDBEnumSymbols>
+ findChildrenByAddr(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags,
+ uint32_t Section, uint32_t Offset) const = 0;
+ virtual std::unique_ptr<IPDBEnumSymbols>
+ findChildrenByVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
+ uint64_t VA) const = 0;
+ virtual std::unique_ptr<IPDBEnumSymbols>
findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
uint32_t RVA) const = 0;
+
+ virtual std::unique_ptr<IPDBEnumSymbols>
+ findInlineFramesByAddr(uint32_t Section, uint32_t Offset) const = 0;
virtual std::unique_ptr<IPDBEnumSymbols>
findInlineFramesByRVA(uint32_t RVA) const = 0;
+ virtual std::unique_ptr<IPDBEnumSymbols>
+ findInlineFramesByVA(uint64_t VA) const = 0;
+
+ virtual std::unique_ptr<IPDBEnumLineNumbers> findInlineeLines() const = 0;
+ virtual std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByAddr(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const = 0;
+ virtual std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const = 0;
+ virtual std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByVA(uint64_t VA, uint32_t Length) const = 0;
virtual void getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const = 0;
virtual void getBackEndVersion(VersionInfo &Version) const = 0;
@@ -94,6 +115,8 @@ public:
virtual uint32_t getSizeInUdt() const = 0;
virtual uint32_t getSlot() const = 0;
virtual std::string getSourceFileName() const = 0;
+ virtual std::unique_ptr<IPDBLineNumber>
+ getSrcLineOnTypeDefn() const = 0;
virtual uint32_t getStride() const = 0;
virtual uint32_t getSubTypeId() const = 0;
virtual std::string getSymbolsFileName() const = 0;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSectionContrib.h b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSectionContrib.h
new file mode 100644
index 000000000000..4fda62404672
--- /dev/null
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSectionContrib.h
@@ -0,0 +1,50 @@
+//==- IPDBSectionContrib.h - Interfaces for PDB SectionContribs --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBSECTIONCONTRIB_H
+#define LLVM_DEBUGINFO_PDB_IPDBSECTIONCONTRIB_H
+
+#include "PDBTypes.h"
+
+namespace llvm {
+namespace pdb {
+
+/// IPDBSectionContrib defines an interface used to represent section
+/// contributions whose information is stored in the PDB.
+class IPDBSectionContrib {
+public:
+ virtual ~IPDBSectionContrib();
+
+ virtual std::unique_ptr<PDBSymbolCompiland> getCompiland() const = 0;
+ virtual uint32_t getAddressSection() const = 0;
+ virtual uint32_t getAddressOffset() const = 0;
+ virtual uint32_t getRelativeVirtualAddress() const = 0;
+ virtual uint64_t getVirtualAddress() const = 0;
+ virtual uint32_t getLength() const = 0;
+ virtual bool isNotPaged() const = 0;
+ virtual bool hasCode() const = 0;
+ virtual bool hasCode16Bit() const = 0;
+ virtual bool hasInitializedData() const = 0;
+ virtual bool hasUninitializedData() const = 0;
+ virtual bool isRemoved() const = 0;
+ virtual bool hasComdat() const = 0;
+ virtual bool isDiscardable() const = 0;
+ virtual bool isNotCached() const = 0;
+ virtual bool isShared() const = 0;
+ virtual bool isExecutable() const = 0;
+ virtual bool isReadable() const = 0;
+ virtual bool isWritable() const = 0;
+ virtual uint32_t getDataCrc32() const = 0;
+ virtual uint32_t getRelocationsCrc32() const = 0;
+ virtual uint32_t getCompilandId() const = 0;
+};
+}
+}
+
+#endif // LLVM_DEBUGINFO_PDB_IPDBSECTIONCONTRIB_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSession.h b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSession.h
index 6291289de5bf..88ec517bc4a5 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSession.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/IPDBSession.h
@@ -28,10 +28,15 @@ public:
virtual ~IPDBSession();
virtual uint64_t getLoadAddress() const = 0;
- virtual void setLoadAddress(uint64_t Address) = 0;
+ virtual bool setLoadAddress(uint64_t Address) = 0;
virtual std::unique_ptr<PDBSymbolExe> getGlobalScope() = 0;
virtual std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const = 0;
+ virtual bool addressForVA(uint64_t VA, uint32_t &Section,
+ uint32_t &Offset) const = 0;
+ virtual bool addressForRVA(uint32_t RVA, uint32_t &Section,
+ uint32_t &Offset) const = 0;
+
template <typename T>
std::unique_ptr<T> getConcreteSymbolById(uint32_t SymbolId) const {
return unique_dyn_cast_or_null<T>(getSymbolById(SymbolId));
@@ -39,12 +44,22 @@ public:
virtual std::unique_ptr<PDBSymbol>
findSymbolByAddress(uint64_t Address, PDB_SymType Type) const = 0;
+ virtual std::unique_ptr<PDBSymbol>
+ findSymbolByRVA(uint32_t RVA, PDB_SymType Type) const = 0;
+ virtual std::unique_ptr<PDBSymbol>
+ findSymbolBySectOffset(uint32_t Sect, uint32_t Offset,
+ PDB_SymType Type) const = 0;
virtual std::unique_ptr<IPDBEnumLineNumbers>
findLineNumbers(const PDBSymbolCompiland &Compiland,
const IPDBSourceFile &File) const = 0;
virtual std::unique_ptr<IPDBEnumLineNumbers>
findLineNumbersByAddress(uint64_t Address, uint32_t Length) const = 0;
+ virtual std::unique_ptr<IPDBEnumLineNumbers>
+ findLineNumbersByRVA(uint32_t RVA, uint32_t Length) const = 0;
+ virtual std::unique_ptr<IPDBEnumLineNumbers>
+ findLineNumbersBySectOffset(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const = 0;
virtual std::unique_ptr<IPDBEnumSourceFiles>
findSourceFiles(const PDBSymbolCompiland *Compiland, llvm::StringRef Pattern,
@@ -69,8 +84,14 @@ public:
virtual std::unique_ptr<IPDBEnumDataStreams> getDebugStreams() const = 0;
virtual std::unique_ptr<IPDBEnumTables> getEnumTables() const = 0;
+
+ virtual std::unique_ptr<IPDBEnumInjectedSources>
+ getInjectedSources() const = 0;
+
+ virtual std::unique_ptr<IPDBEnumSectionContribs>
+ getSectionContribs() const = 0;
};
-}
-}
+} // namespace pdb
+} // namespace llvm
#endif
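// A minimal usage sketch (illustrative only) of the new address plumbing:
// addressForVA()/addressForRVA() translate an absolute or image-relative
// address into a section:offset pair, which findSymbolBySectOffset() can then
// resolve. Session, ImageBase and SomeVA are placeholders.
//
//   if (Session->setLoadAddress(ImageBase)) {       // required for VA queries
//     uint32_t Sect = 0, Off = 0;
//     if (Session->addressForVA(SomeVA, Sect, Off))
//       auto Sym = Session->findSymbolBySectOffset(Sect, Off,
//                                                  PDB_SymType::Function);
//   }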
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h
index 8200f51e3da9..9eef4041d0a1 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h
@@ -47,6 +47,8 @@ public:
uint32_t getRecordLength() const;
+ const SectionContrib &getSectionContrib() const;
+
private:
StringRef ModuleName;
StringRef ObjFileName;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
index c918a5d5e976..ce4d07917755 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
@@ -49,6 +49,7 @@ public:
void setPdbFilePathNI(uint32_t NI);
void setObjFileName(StringRef Name);
+ void setFirstSectionContrib(const SectionContrib &SC);
void addSymbol(codeview::CVSymbol Symbol);
void
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h
index 4be113f28d6f..280615bdb507 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStream.h
@@ -38,9 +38,9 @@ class DbiStream {
friend class DbiStreamBuilder;
public:
- DbiStream(PDBFile &File, std::unique_ptr<msf::MappedBlockStream> Stream);
+ explicit DbiStream(std::unique_ptr<BinaryStream> Stream);
~DbiStream();
- Error reload();
+ Error reload(PDBFile *Pdb);
PdbRaw_DbiVer getDbiVersion() const;
uint32_t getAge() const;
@@ -63,6 +63,8 @@ public:
PDB_Machine getMachineType() const;
+ const DbiStreamHeader *getHeader() const { return Header; }
+
BinarySubstreamRef getSectionContributionData() const;
BinarySubstreamRef getSecMapSubstreamData() const;
BinarySubstreamRef getModiSubstreamData() const;
@@ -87,12 +89,11 @@ public:
private:
Error initializeSectionContributionData();
- Error initializeSectionHeadersData();
+ Error initializeSectionHeadersData(PDBFile *Pdb);
Error initializeSectionMapData();
- Error initializeFpoRecords();
+ Error initializeFpoRecords(PDBFile *Pdb);
- PDBFile &Pdb;
- std::unique_ptr<msf::MappedBlockStream> Stream;
+ std::unique_ptr<BinaryStream> Stream;
PDBStringTable ECNames;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
index ad4a0d1bcb6b..51befcdac775 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
@@ -12,6 +12,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringSet.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/Support/Error.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
@@ -46,10 +47,12 @@ public:
void setVersionHeader(PdbRaw_DbiVer V);
void setAge(uint32_t A);
void setBuildNumber(uint16_t B);
+ void setBuildNumber(uint8_t Major, uint8_t Minor);
void setPdbDllVersion(uint16_t V);
void setPdbDllRbld(uint16_t R);
void setFlags(uint16_t F);
void setMachineType(PDB_Machine M);
+ void setMachineType(COFF::MachineTypes M);
void setSectionMap(ArrayRef<SecMapEntry> SecMap);
// Add given bytes as a new stream.
@@ -121,7 +124,7 @@ private:
MutableBinaryByteStream FileInfoBuffer;
std::vector<SectionContrib> SectionContribs;
ArrayRef<SecMapEntry> SectionMap;
- llvm::SmallVector<DebugStream, (int)DbgHeaderType::Max> DbgStreams;
+ std::array<Optional<DebugStream>, (int)DbgHeaderType::Max> DbgStreams;
};
}
}
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h
index 05c70c4f2175..34cc6179688b 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h
@@ -12,6 +12,9 @@
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/iterator.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include <cstdint>
@@ -26,80 +29,303 @@ class BinaryStreamWriter;
namespace pdb {
-class HashTableIterator;
+Error readSparseBitVector(BinaryStreamReader &Stream, SparseBitVector<> &V);
+Error writeSparseBitVector(BinaryStreamWriter &Writer, SparseBitVector<> &Vec);
+template <typename ValueT, typename TraitsT> class HashTable;
+
+template <typename ValueT, typename TraitsT>
+class HashTableIterator
+ : public iterator_facade_base<HashTableIterator<ValueT, TraitsT>,
+ std::forward_iterator_tag,
+ std::pair<uint32_t, ValueT>> {
+ friend HashTable<ValueT, TraitsT>;
+
+ HashTableIterator(const HashTable<ValueT, TraitsT> &Map, uint32_t Index,
+ bool IsEnd)
+ : Map(&Map), Index(Index), IsEnd(IsEnd) {}
+
+public:
+ HashTableIterator(const HashTable<ValueT, TraitsT> &Map) : Map(&Map) {
+ int I = Map.Present.find_first();
+ if (I == -1) {
+ Index = 0;
+ IsEnd = true;
+ } else {
+ Index = static_cast<uint32_t>(I);
+ IsEnd = false;
+ }
+ }
+
+ HashTableIterator &operator=(const HashTableIterator &R) {
+ Map = R.Map;
+ return *this;
+ }
+ bool operator==(const HashTableIterator &R) const {
+ if (IsEnd && R.IsEnd)
+ return true;
+ if (IsEnd != R.IsEnd)
+ return false;
+
+ return (Map == R.Map) && (Index == R.Index);
+ }
+ const std::pair<uint32_t, ValueT> &operator*() const {
+ assert(Map->Present.test(Index));
+ return Map->Buckets[Index];
+ }
+ HashTableIterator &operator++() {
+ while (Index < Map->Buckets.size()) {
+ ++Index;
+ if (Map->Present.test(Index))
+ return *this;
+ }
+
+ IsEnd = true;
+ return *this;
+ }
+
+private:
+ bool isEnd() const { return IsEnd; }
+ uint32_t index() const { return Index; }
+
+ const HashTable<ValueT, TraitsT> *Map;
+ uint32_t Index;
+ bool IsEnd;
+};
+
+template <typename T> struct PdbHashTraits {};
+
+template <> struct PdbHashTraits<uint32_t> {
+ uint32_t hashLookupKey(uint32_t N) const { return N; }
+ uint32_t storageKeyToLookupKey(uint32_t N) const { return N; }
+ uint32_t lookupKeyToStorageKey(uint32_t N) { return N; }
+};
+
+template <typename ValueT, typename TraitsT = PdbHashTraits<ValueT>>
class HashTable {
- friend class HashTableIterator;
+ using iterator = HashTableIterator<ValueT, TraitsT>;
+ friend iterator;
struct Header {
support::ulittle32_t Size;
support::ulittle32_t Capacity;
};
- using BucketList = std::vector<std::pair<uint32_t, uint32_t>>;
+ using BucketList = std::vector<std::pair<uint32_t, ValueT>>;
public:
- HashTable();
- explicit HashTable(uint32_t Capacity);
+ HashTable() { Buckets.resize(8); }
+
+ explicit HashTable(TraitsT Traits) : HashTable(8, std::move(Traits)) {}
+ HashTable(uint32_t Capacity, TraitsT Traits) : Traits(Traits) {
+ Buckets.resize(Capacity);
+ }
+
+ Error load(BinaryStreamReader &Stream) {
+ const Header *H;
+ if (auto EC = Stream.readObject(H))
+ return EC;
+ if (H->Capacity == 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid Hash Table Capacity");
+ if (H->Size > maxLoad(H->Capacity))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid Hash Table Size");
+
+ Buckets.resize(H->Capacity);
+
+ if (auto EC = readSparseBitVector(Stream, Present))
+ return EC;
+ if (Present.count() != H->Size)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Present bit vector does not match size!");
+
+ if (auto EC = readSparseBitVector(Stream, Deleted))
+ return EC;
+ if (Present.intersects(Deleted))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+                                "Present bit vector intersects deleted!");
- Error load(BinaryStreamReader &Stream);
+ for (uint32_t P : Present) {
+ if (auto EC = Stream.readInteger(Buckets[P].first))
+ return EC;
+ const ValueT *Value;
+ if (auto EC = Stream.readObject(Value))
+ return EC;
+ Buckets[P].second = *Value;
+ }
- uint32_t calculateSerializedLength() const;
- Error commit(BinaryStreamWriter &Writer) const;
+ return Error::success();
+ }
- void clear();
+ uint32_t calculateSerializedLength() const {
+ uint32_t Size = sizeof(Header);
- uint32_t capacity() const;
- uint32_t size() const;
+ constexpr int BitsPerWord = 8 * sizeof(uint32_t);
- HashTableIterator begin() const;
- HashTableIterator end() const;
- HashTableIterator find(uint32_t K);
+ int NumBitsP = Present.find_last() + 1;
+ int NumBitsD = Deleted.find_last() + 1;
- void set(uint32_t K, uint32_t V);
- void remove(uint32_t K);
- uint32_t get(uint32_t K);
+ uint32_t NumWordsP = alignTo(NumBitsP, BitsPerWord) / BitsPerWord;
+ uint32_t NumWordsD = alignTo(NumBitsD, BitsPerWord) / BitsPerWord;
+
+ // Present bit set number of words (4 bytes), followed by that many actual
+ // words (4 bytes each).
+ Size += sizeof(uint32_t);
+ Size += NumWordsP * sizeof(uint32_t);
+
+ // Deleted bit set number of words (4 bytes), followed by that many actual
+ // words (4 bytes each).
+ Size += sizeof(uint32_t);
+ Size += NumWordsD * sizeof(uint32_t);
+
+ // One (Key, ValueT) pair for each entry Present.
+ Size += (sizeof(uint32_t) + sizeof(ValueT)) * size();
+
+ return Size;
+ }
+
+ Error commit(BinaryStreamWriter &Writer) const {
+ Header H;
+ H.Size = size();
+ H.Capacity = capacity();
+ if (auto EC = Writer.writeObject(H))
+ return EC;
+
+ if (auto EC = writeSparseBitVector(Writer, Present))
+ return EC;
+
+ if (auto EC = writeSparseBitVector(Writer, Deleted))
+ return EC;
+
+ for (const auto &Entry : *this) {
+ if (auto EC = Writer.writeInteger(Entry.first))
+ return EC;
+ if (auto EC = Writer.writeObject(Entry.second))
+ return EC;
+ }
+ return Error::success();
+ }
+
+ void clear() {
+ Buckets.resize(8);
+ Present.clear();
+ Deleted.clear();
+ }
+
+ bool empty() const { return size() == 0; }
+ uint32_t capacity() const { return Buckets.size(); }
+ uint32_t size() const { return Present.count(); }
+
+ iterator begin() const { return iterator(*this); }
+ iterator end() const { return iterator(*this, 0, true); }
+
+  /// Find the entry with the specified lookup key, using the traits to supply
+  /// the hash function and the storage/lookup key conversions.
+ template <typename Key> iterator find_as(const Key &K) const {
+ uint32_t H = Traits.hashLookupKey(K) % capacity();
+ uint32_t I = H;
+ Optional<uint32_t> FirstUnused;
+ do {
+ if (isPresent(I)) {
+ if (Traits.storageKeyToLookupKey(Buckets[I].first) == K)
+ return iterator(*this, I, false);
+ } else {
+ if (!FirstUnused)
+ FirstUnused = I;
+        // Insertion is done via linear probing from the slot hint, and a new
+        // entry is placed at the first empty / deleted location.  Therefore,
+        // if we are probing and find a location that is neither present nor
+        // deleted, then nothing can ever have been inserted at this location,
+        // and thus it is not possible for a matching value to occur later.
+ if (!isDeleted(I))
+ break;
+ }
+ I = (I + 1) % capacity();
+ } while (I != H);
+
+ // The only way FirstUnused would not be set is if every single entry in the
+ // table were Present. But this would violate the load factor constraints
+ // that we impose, so it should never happen.
+ assert(FirstUnused);
+ return iterator(*this, *FirstUnused, true);
+ }
+
+ /// Set the entry using a key type that the specified Traits can convert
+ /// from a real key to an internal key.
+ template <typename Key> bool set_as(const Key &K, ValueT V) {
+ return set_as_internal(K, std::move(V), None);
+ }
+
+ template <typename Key> ValueT get(const Key &K) const {
+ auto Iter = find_as(K);
+ assert(Iter != end());
+ return (*Iter).second;
+ }
protected:
bool isPresent(uint32_t K) const { return Present.test(K); }
bool isDeleted(uint32_t K) const { return Deleted.test(K); }
+ TraitsT Traits;
BucketList Buckets;
mutable SparseBitVector<> Present;
mutable SparseBitVector<> Deleted;
private:
- static uint32_t maxLoad(uint32_t capacity);
- void grow();
+ /// Set the entry using a key type that the specified Traits can convert
+ /// from a real key to an internal key.
+ template <typename Key>
+ bool set_as_internal(const Key &K, ValueT V, Optional<uint32_t> InternalKey) {
+ auto Entry = find_as(K);
+ if (Entry != end()) {
+ assert(isPresent(Entry.index()));
+ assert(Traits.storageKeyToLookupKey(Buckets[Entry.index()].first) == K);
+ // We're updating, no need to do anything special.
+ Buckets[Entry.index()].second = V;
+ return false;
+ }
- static Error readSparseBitVector(BinaryStreamReader &Stream,
- SparseBitVector<> &V);
- static Error writeSparseBitVector(BinaryStreamWriter &Writer,
- SparseBitVector<> &Vec);
-};
+ auto &B = Buckets[Entry.index()];
+ assert(!isPresent(Entry.index()));
+ assert(Entry.isEnd());
+ B.first = InternalKey ? *InternalKey : Traits.lookupKeyToStorageKey(K);
+ B.second = V;
+ Present.set(Entry.index());
+ Deleted.reset(Entry.index());
-class HashTableIterator
- : public iterator_facade_base<HashTableIterator, std::forward_iterator_tag,
- std::pair<uint32_t, uint32_t>> {
- friend class HashTable;
+ grow();
- HashTableIterator(const HashTable &Map, uint32_t Index, bool IsEnd);
+ assert((find_as(K)) != end());
+ return true;
+ }
-public:
- HashTableIterator(const HashTable &Map);
+ static uint32_t maxLoad(uint32_t capacity) { return capacity * 2 / 3 + 1; }
- HashTableIterator &operator=(const HashTableIterator &R);
- bool operator==(const HashTableIterator &R) const;
- const std::pair<uint32_t, uint32_t> &operator*() const;
- HashTableIterator &operator++();
+ void grow() {
+ uint32_t S = size();
+ uint32_t MaxLoad = maxLoad(capacity());
+ if (S < maxLoad(capacity()))
+ return;
+ assert(capacity() != UINT32_MAX && "Can't grow Hash table!");
-private:
- bool isEnd() const { return IsEnd; }
- uint32_t index() const { return Index; }
+ uint32_t NewCapacity = (capacity() <= INT32_MAX) ? MaxLoad * 2 : UINT32_MAX;
- const HashTable *Map;
- uint32_t Index;
- bool IsEnd;
+ // Growing requires rebuilding the table and re-hashing every item. Make a
+ // copy with a larger capacity, insert everything into the copy, then swap
+ // it in.
+ HashTable NewMap(NewCapacity, Traits);
+ for (auto I : Present) {
+ auto LookupKey = Traits.storageKeyToLookupKey(Buckets[I].first);
+ NewMap.set_as_internal(LookupKey, Buckets[I].second, Buckets[I].first);
+ }
+
+ Buckets.swap(NewMap.Buckets);
+ std::swap(Present, NewMap.Present);
+ std::swap(Deleted, NewMap.Deleted);
+ assert(capacity() == NewCapacity);
+ assert(size() == S);
+ }
};
} // end namespace pdb
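// Illustrative sketch (not part of this change): exercising the now-templated
// HashTable with the trivial PdbHashTraits<uint32_t> specialization declared
// above, where keys hash to themselves.
#include "llvm/DebugInfo/PDB/Native/HashTable.h"

uint32_t sumHashTableValues() {
  llvm::pdb::HashTable<uint32_t> Table; // TraitsT defaults to PdbHashTraits<uint32_t>
  Table.set_as(1u, 10u);                // insert key 1 -> 10, returns true
  Table.set_as(2u, 20u);                // insert key 2 -> 20
  Table.set_as(1u, 15u);                // update key 1 in place, returns false
  uint32_t Sum = 0;
  for (const auto &Entry : Table)       // iterates only Present buckets
    Sum += Entry.second;
  return Sum + Table.get(2u);           // get() asserts the key is present
}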
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h
index fb8271cb5ebc..8c52b042f289 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStream.h
@@ -30,12 +30,14 @@ class InfoStream {
friend class InfoStreamBuilder;
public:
- InfoStream(std::unique_ptr<msf::MappedBlockStream> Stream);
+ InfoStream(std::unique_ptr<BinaryStream> Stream);
Error reload();
uint32_t getStreamSize() const;
+ const InfoStreamHeader *getHeader() const { return Header; }
+
bool containsIdStream() const;
PdbRaw_ImplVer getVersion() const;
uint32_t getSignature() const;
@@ -50,29 +52,13 @@ public:
BinarySubstreamRef getNamedStreamsBuffer() const;
- uint32_t getNamedStreamIndex(llvm::StringRef Name) const;
- iterator_range<StringMapConstIterator<uint32_t>> named_streams() const;
+ Expected<uint32_t> getNamedStreamIndex(llvm::StringRef Name) const;
+ StringMap<uint32_t> named_streams() const;
private:
- std::unique_ptr<msf::MappedBlockStream> Stream;
-
- // PDB file format version. We only support VC70. See the enumeration
- // `PdbRaw_ImplVer` for the other possible values.
- uint32_t Version;
-
- // A 32-bit signature unique across all PDBs. This is generated with
- // a call to time() when the PDB is written, but obviously this is not
- // universally unique.
- uint32_t Signature;
-
- // The number of times the PDB has been written. Might also be used to
- // ensure that the PDB matches the executable.
- uint32_t Age;
+ std::unique_ptr<BinaryStream> Stream;
- // Due to the aforementioned limitations with `Signature`, this is a new
- // signature present on VC70 and higher PDBs which is guaranteed to be
- // universally unique.
- codeview::GUID Guid;
+ const InfoStreamHeader *Header;
BinarySubstreamRef SubNamedStreams;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
index c6cb0e221e70..419e8ada06f7 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
@@ -40,6 +40,10 @@ public:
void setGuid(codeview::GUID G);
void addFeature(PdbRaw_FeatureSig Sig);
+ uint32_t getAge() const { return Age; }
+ codeview::GUID getGuid() const { return Guid; }
+ Optional<uint32_t> getSignature() const { return Signature; }
+
uint32_t finalize();
Error finalizeMsfLayout();
@@ -52,8 +56,8 @@ private:
std::vector<PdbRaw_FeatureSig> Features;
PdbRaw_ImplVer Ver;
- uint32_t Sig;
uint32_t Age;
+ Optional<uint32_t> Signature;
codeview::GUID Guid;
NamedStreamMap &NamedStreams;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
index 17a82b7ce12d..01b8f1b5da56 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
@@ -25,32 +25,45 @@ class BinaryStreamWriter;
namespace pdb {
+class NamedStreamMap;
+
+struct NamedStreamMapTraits {
+ NamedStreamMap *NS;
+
+ explicit NamedStreamMapTraits(NamedStreamMap &NS);
+ uint16_t hashLookupKey(StringRef S) const;
+ StringRef storageKeyToLookupKey(uint32_t Offset) const;
+ uint32_t lookupKeyToStorageKey(StringRef S);
+};
+
class NamedStreamMap {
friend class NamedStreamMapBuilder;
- struct FinalizationInfo {
- uint32_t StringDataBytes = 0;
- uint32_t SerializedLength = 0;
- };
-
public:
NamedStreamMap();
Error load(BinaryStreamReader &Stream);
Error commit(BinaryStreamWriter &Writer) const;
- uint32_t finalize();
+ uint32_t calculateSerializedLength() const;
uint32_t size() const;
bool get(StringRef Stream, uint32_t &StreamNo) const;
void set(StringRef Stream, uint32_t StreamNo);
- void remove(StringRef Stream);
- const StringMap<uint32_t> &getStringMap() const { return Mapping; }
- iterator_range<StringMapConstIterator<uint32_t>> entries() const;
+
+ uint32_t appendStringData(StringRef S);
+ StringRef getString(uint32_t Offset) const;
+ uint32_t hashString(uint32_t Offset) const;
+
+ StringMap<uint32_t> entries() const;
private:
- Optional<FinalizationInfo> FinalizedInfo;
- HashTable FinalizedHashTable;
- StringMap<uint32_t> Mapping;
+ NamedStreamMapTraits HashTraits;
+ /// Closed hash table from Offset -> StreamNumber, where Offset is the offset
+ /// of the stream name in NamesBuffer.
+ HashTable<support::ulittle32_t, NamedStreamMapTraits> OffsetIndexMap;
+
+ /// Buffer of string data.
+ std::vector<char> NamesBuffer;
};
} // end namespace pdb
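// Illustrative sketch (not part of this change): the NamedStreamMap surface
// kept by this change still maps stream names to stream indices, but the
// string data is now owned by the map itself rather than a StringMap.
#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"

uint32_t lookupNamesStream(llvm::pdb::NamedStreamMap &NSM) {
  NSM.set("/names", 5);            // associate the "/names" stream with index 5
  uint32_t StreamNo = 0;
  if (NSM.get("/names", StreamNo)) // true when the name is present
    return StreamNo;
  return 0;
}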
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h
index 931b93fb7266..5b70ecfa2056 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h
@@ -35,10 +35,31 @@ public:
findChildren(PDB_SymType Type, StringRef Name,
PDB_NameSearchFlags Flags) const override;
std::unique_ptr<IPDBEnumSymbols>
+ findChildrenByAddr(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags,
+ uint32_t Section, uint32_t Offset) const override;
+ std::unique_ptr<IPDBEnumSymbols>
+ findChildrenByVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
+ uint64_t VA) const override;
+ std::unique_ptr<IPDBEnumSymbols>
findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
uint32_t RVA) const override;
+
+ std::unique_ptr<IPDBEnumSymbols>
+ findInlineFramesByAddr(uint32_t Section, uint32_t Offset) const override;
std::unique_ptr<IPDBEnumSymbols>
findInlineFramesByRVA(uint32_t RVA) const override;
+ std::unique_ptr<IPDBEnumSymbols>
+ findInlineFramesByVA(uint64_t VA) const override;
+
+ std::unique_ptr<IPDBEnumLineNumbers> findInlineeLines() const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByAddr(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findInlineeLinesByVA(uint64_t VA, uint32_t Length) const override;
void getDataBytes(SmallVector<uint8_t, 32> &Bytes) const override;
void getFrontEndVersion(VersionInfo &Version) const override;
@@ -87,6 +108,7 @@ public:
uint32_t getSizeInUdt() const override;
uint32_t getSlot() const override;
std::string getSourceFileName() const override;
+ std::unique_ptr<IPDBLineNumber> getSrcLineOnTypeDefn() const override;
uint32_t getStride() const override;
uint32_t getSubTypeId() const override;
std::string getSymbolsFileName() const override;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h
index 2e68ced46bfe..aff7ef2f8f21 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h
@@ -49,18 +49,33 @@ public:
SymIndexId findSymbolByTypeIndex(codeview::TypeIndex TI);
uint64_t getLoadAddress() const override;
- void setLoadAddress(uint64_t Address) override;
+ bool setLoadAddress(uint64_t Address) override;
std::unique_ptr<PDBSymbolExe> getGlobalScope() override;
std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const override;
+ bool addressForVA(uint64_t VA, uint32_t &Section,
+ uint32_t &Offset) const override;
+ bool addressForRVA(uint32_t RVA, uint32_t &Section,
+ uint32_t &Offset) const override;
+
std::unique_ptr<PDBSymbol>
findSymbolByAddress(uint64_t Address, PDB_SymType Type) const override;
+ std::unique_ptr<PDBSymbol> findSymbolByRVA(uint32_t RVA,
+ PDB_SymType Type) const override;
+ std::unique_ptr<PDBSymbol>
+ findSymbolBySectOffset(uint32_t Sect, uint32_t Offset,
+ PDB_SymType Type) const override;
std::unique_ptr<IPDBEnumLineNumbers>
findLineNumbers(const PDBSymbolCompiland &Compiland,
const IPDBSourceFile &File) const override;
std::unique_ptr<IPDBEnumLineNumbers>
findLineNumbersByAddress(uint64_t Address, uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findLineNumbersByRVA(uint32_t RVA, uint32_t Length) const override;
+ std::unique_ptr<IPDBEnumLineNumbers>
+ findLineNumbersBySectOffset(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const override;
std::unique_ptr<IPDBEnumSourceFiles>
findSourceFiles(const PDBSymbolCompiland *Compiland, llvm::StringRef Pattern,
@@ -85,6 +100,10 @@ public:
std::unique_ptr<IPDBEnumTables> getEnumTables() const override;
+ std::unique_ptr<IPDBEnumInjectedSources> getInjectedSources() const override;
+
+ std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
+
PDBFile &getPDBFile() { return *Pdb; }
const PDBFile &getPDBFile() const { return *Pdb; }
@@ -94,7 +113,7 @@ private:
std::vector<std::unique_ptr<NativeRawSymbol>> SymbolCache;
DenseMap<codeview::TypeIndex, SymIndexId> TypeIndexToSymbolId;
};
-}
-}
+} // namespace pdb
+} // namespace llvm
#endif
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
index 7ed164bee9ee..7f9c4cf9fa83 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
@@ -17,9 +17,11 @@
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
#include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
#include <memory>
#include <vector>
@@ -54,12 +56,34 @@ public:
Error commit(StringRef Filename);
Expected<uint32_t> getNamedStreamIndex(StringRef Name) const;
- Error addNamedStream(StringRef Name, uint32_t Size);
+ Error addNamedStream(StringRef Name, StringRef Data);
+ void addInjectedSource(StringRef Name, std::unique_ptr<MemoryBuffer> Buffer);
private:
- Expected<msf::MSFLayout> finalizeMsfLayout();
+ struct InjectedSourceDescriptor {
+ // The full name of the stream that contains the contents of this injected
+ // source. This is built as a concatenation of the literal "/src/files"
+ // plus the "vname".
+ std::string StreamName;
+
+    // The string table index of the file name, exactly as specified by the
+    // user.
+ uint32_t NameIndex;
+
+ // The string table index of the "vname" of the file. As far as we
+ // understand, this is the same as the name, except it is lowercased and
+ // forward slashes are converted to backslashes.
+ uint32_t VNameIndex;
+ std::unique_ptr<MemoryBuffer> Content;
+ };
+
+ Error finalizeMsfLayout();
+ Expected<uint32_t> allocateNamedStream(StringRef Name, uint32_t Size);
void commitFpm(WritableBinaryStream &MsfBuffer, const msf::MSFLayout &Layout);
+ void commitInjectedSources(WritableBinaryStream &MsfBuffer,
+ const msf::MSFLayout &Layout);
+ void commitSrcHeaderBlock(WritableBinaryStream &MsfBuffer,
+ const msf::MSFLayout &Layout);
BumpPtrAllocator &Allocator;
@@ -71,7 +95,13 @@ private:
std::unique_ptr<TpiStreamBuilder> Ipi;
PDBStringTableBuilder Strings;
+ StringTableHashTraits InjectedSourceHashTraits;
+ HashTable<SrcHeaderBlockEntry, StringTableHashTraits> InjectedSourceTable;
+
+ SmallVector<InjectedSourceDescriptor, 2> InjectedSources;
+
NamedStreamMap NamedStreams;
+ DenseMap<uint32_t, std::string> NamedStreamData;
};
}
}
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h
index b57707ee7923..0f81c18eafe6 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h
@@ -31,6 +31,16 @@ struct MSFLayout;
namespace pdb {
class PDBFileBuilder;
+class PDBStringTableBuilder;
+
+struct StringTableHashTraits {
+ PDBStringTableBuilder *Table;
+
+ explicit StringTableHashTraits(PDBStringTableBuilder &Table);
+ uint32_t hashLookupKey(StringRef S) const;
+ StringRef storageKeyToLookupKey(uint32_t Offset) const;
+ uint32_t lookupKeyToStorageKey(StringRef S);
+};
class PDBStringTableBuilder {
public:
@@ -38,6 +48,9 @@ public:
// Returns the ID for S.
uint32_t insert(StringRef S);
+ uint32_t getIdForString(StringRef S) const;
+ StringRef getStringForId(uint32_t Id) const;
+
uint32_t calculateSerializedSize() const;
Error commit(BinaryStreamWriter &Writer) const;
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawConstants.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawConstants.h
index bb1d097b5123..fbbd3318d958 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawConstants.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawConstants.h
@@ -32,6 +32,8 @@ enum PdbRaw_ImplVer : uint32_t {
PdbImplVC140 = 20140508,
};
+enum class PdbRaw_SrcHeaderBlockVer : uint32_t { SrcVerOne = 19980827 };
+
enum class PdbRaw_FeatureSig : uint32_t {
VC110 = PdbImplVC110,
VC140 = PdbImplVC140,
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawTypes.h
index 8cc083685265..19f592d562e4 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawTypes.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/RawTypes.h
@@ -112,6 +112,8 @@ struct DbiBuildNo {
static const uint16_t BuildMajorMask = 0x7F00;
static const uint16_t BuildMajorShift = 8;
+
+ static const uint16_t NewVersionFormatMask = 0x8000;
};
/// The fixed size header that appears at the beginning of the DBI Stream.
@@ -175,18 +177,6 @@ struct DbiStreamHeader {
};
static_assert(sizeof(DbiStreamHeader) == 64, "Invalid DbiStreamHeader size!");
-struct SectionContribEntry {
- support::ulittle16_t Section;
- char Padding1[2];
- support::little32_t Offset;
- support::little32_t Size;
- support::ulittle32_t Characteristics;
- support::ulittle16_t ModuleIndex;
- char Padding2[2];
- support::ulittle32_t DataCrc;
- support::ulittle32_t RelocCrc;
-};
-
/// The header preceding the File Info Substream of the DBI stream.
struct FileInfoSubstreamHeader {
/// Total # of modules, should match number of records in the ModuleInfo
@@ -228,7 +218,7 @@ struct ModuleInfoHeader {
support::ulittle32_t Mod;
/// First section contribution of this module.
- SectionContribEntry SC;
+ SectionContrib SC;
/// See ModInfoFlags definition.
support::ulittle16_t Flags;
@@ -328,6 +318,34 @@ struct PDBStringTableHeader {
const uint32_t PDBStringTableSignature = 0xEFFEEFFE;
+/// The header preceding the /src/headerblock stream.
+struct SrcHeaderBlockHeader {
+ support::ulittle32_t Version; // PdbRaw_SrcHeaderBlockVer enumeration.
+ support::ulittle32_t Size; // Size of entire stream.
+ uint64_t FileTime; // Time stamp (Windows FILETIME format).
+ support::ulittle32_t Age; // Age
+ uint8_t Padding[44]; // Pad to 64 bytes.
+};
+static_assert(sizeof(SrcHeaderBlockHeader) == 64, "Incorrect struct size!");
+
+/// A single file record entry within the /src/headerblock stream.
+struct SrcHeaderBlockEntry {
+ support::ulittle32_t Size; // Record Length.
+ support::ulittle32_t Version; // PdbRaw_SrcHeaderBlockVer enumeration.
+ support::ulittle32_t CRC; // CRC of the original file contents.
+ support::ulittle32_t FileSize; // Size of original source file.
+ support::ulittle32_t FileNI; // String table index of file name.
+ support::ulittle32_t ObjNI; // String table index of object name.
+ support::ulittle32_t VFileNI; // String table index of virtual file name.
+ uint8_t Compression; // PDB_SourceCompression enumeration.
+ uint8_t IsVirtual; // Is this a virtual file (injected)?
+ short Padding; // Pad to 4 bytes.
+ char Reserved[8];
+};
+
+constexpr int I = sizeof(SrcHeaderBlockEntry);
+static_assert(sizeof(SrcHeaderBlockEntry) == 40, "Incorrect struct size!");
+
} // namespace pdb
} // namespace llvm
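// Illustrative sketch (not part of this change): validating a
// SrcHeaderBlockHeader read from a stream, following the readObject /
// RawError idiom used elsewhere in this patch.
#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
#include "llvm/DebugInfo/PDB/Native/RawError.h"
#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"

llvm::Error readSrcHeader(llvm::BinaryStreamReader &Reader,
                          const llvm::pdb::SrcHeaderBlockHeader *&Header) {
  using namespace llvm::pdb;
  if (auto EC = Reader.readObject(Header))
    return EC;
  if (Header->Version !=
      static_cast<uint32_t>(PdbRaw_SrcHeaderBlockVer::SrcVerOne))
    return llvm::make_error<RawError>(raw_error_code::corrupt_file,
                                      "Unsupported /src/headerblock version");
  return llvm::Error::success();
}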
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index d3475205a6c2..b77939929ecf 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -51,7 +51,7 @@ public:
uint32_t getNumHashBuckets() const;
FixedStreamArray<support::ulittle32_t> getHashValues() const;
FixedStreamArray<codeview::TypeIndexOffset> getTypeIndexOffsets() const;
- HashTable &getHashAdjusters();
+ HashTable<support::ulittle32_t> &getHashAdjusters();
codeview::CVTypeRange types(bool *HadError) const;
const codeview::CVTypeArray &typeArray() const { return TypeRecords; }
@@ -75,7 +75,7 @@ private:
std::unique_ptr<BinaryStream> HashStream;
FixedStreamArray<support::ulittle32_t> HashValues;
FixedStreamArray<codeview::TypeIndexOffset> TypeIndexOffsets;
- HashTable HashAdjusters;
+ HashTable<support::ulittle32_t> HashAdjusters;
const TpiStreamHeader *Header;
};
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h
index 778121c8eb79..3c9a19801f89 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h
@@ -34,6 +34,8 @@ raw_ostream &operator<<(raw_ostream &OS, const PDB_SymType &Tag);
raw_ostream &operator<<(raw_ostream &OS, const PDB_MemberAccess &Access);
raw_ostream &operator<<(raw_ostream &OS, const PDB_UdtType &Type);
raw_ostream &operator<<(raw_ostream &OS, const PDB_Machine &Machine);
+raw_ostream &operator<<(raw_ostream &OS,
+ const PDB_SourceCompression &Compression);
raw_ostream &operator<<(raw_ostream &OS, const Variant &Value);
raw_ostream &operator<<(raw_ostream &OS, const VersionInfo &Version);
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
index 26788017cf32..9549089c7eb4 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
@@ -34,6 +34,7 @@ public:
FORWARD_SYMBOL_METHOD(getName)
std::string getSourceFileName() const;
+ std::string getSourceFileFullPath() const;
};
}
}
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
index ad4285df4d44..76b14bf17784 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
@@ -10,6 +10,7 @@
#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
#define LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
+#include "IPDBLineNumber.h"
#include "PDBSymbol.h"
#include "PDBTypes.h"
@@ -53,9 +54,11 @@ public:
FORWARD_SYMBOL_METHOD(getValue)
FORWARD_SYMBOL_METHOD(getVirtualAddress)
FORWARD_SYMBOL_METHOD(isVolatileType)
-};
+ std::unique_ptr<IPDBEnumLineNumbers> getLineNumbers() const;
+ uint32_t getCompilandId() const;
+};
+} // namespace pdb
} // namespace llvm
-}
#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
index c2f02ea6f126..05d585d25763 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
@@ -10,6 +10,7 @@
#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
#define LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
+#include "IPDBLineNumber.h"
#include "PDBSymbol.h"
#include "PDBSymbolTypeFunctionSig.h"
#include "PDBTypes.h"
@@ -38,7 +39,9 @@ public:
FORWARD_SYMBOL_METHOD(getAddressSection)
FORWARD_SYMBOL_ID_METHOD(getClassParent)
FORWARD_SYMBOL_METHOD(isCompilerGenerated)
+ FORWARD_SYMBOL_METHOD(isConstructorVirtualBase)
FORWARD_SYMBOL_METHOD(isConstType)
+ FORWARD_SYMBOL_METHOD(isCxxReturnUdt)
FORWARD_SYMBOL_METHOD(hasCustomCallingConvention)
FORWARD_SYMBOL_METHOD(hasFarReturn)
FORWARD_SYMBOL_METHOD(hasAlloca)
@@ -76,6 +79,9 @@ public:
FORWARD_SYMBOL_METHOD(getVirtualAddress)
FORWARD_SYMBOL_METHOD(getVirtualBaseOffset)
FORWARD_SYMBOL_METHOD(isVolatileType)
+
+ std::unique_ptr<IPDBEnumLineNumbers> getLineNumbers() const;
+ uint32_t getCompilandId() const;
};
} // namespace llvm
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
index c5ae3c51162c..ddbe7e58f183 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
@@ -10,6 +10,7 @@
#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
+#include "IPDBLineNumber.h"
#include "PDBSymbol.h"
#include "PDBSymbolTypeBuiltin.h"
#include "PDBTypes.h"
@@ -38,6 +39,7 @@ public:
FORWARD_SYMBOL_METHOD(getLength)
FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
FORWARD_SYMBOL_METHOD(getName)
+ FORWARD_SYMBOL_METHOD(getSrcLineOnTypeDefn)
FORWARD_SYMBOL_METHOD(isNested)
FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
FORWARD_SYMBOL_METHOD(isPacked)
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
index 8de54e70701d..abd4cf5effa2 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
@@ -31,6 +31,8 @@ public:
void dumpRight(PDBSymDumper &Dumper) const override;
void dumpArgList(raw_ostream &OS) const;
+ bool isCVarArgs() const;
+
FORWARD_SYMBOL_METHOD(getCallingConvention)
FORWARD_SYMBOL_ID_METHOD(getClassParent)
FORWARD_SYMBOL_ID_METHOD(getUnmodifiedType)
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
index c502d4e77afe..7612ebac31dd 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
@@ -32,7 +32,11 @@ public:
FORWARD_SYMBOL_METHOD(getLength)
FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
FORWARD_SYMBOL_METHOD(isReference)
+ FORWARD_SYMBOL_METHOD(isRValueReference)
+ FORWARD_SYMBOL_METHOD(isPointerToDataMember)
+ FORWARD_SYMBOL_METHOD(isPointerToMemberFunction)
FORWARD_SYMBOL_ID_METHOD_WITH_NAME(getType, getPointeeType)
+ FORWARD_SYMBOL_METHOD(isRestrictedType)
FORWARD_SYMBOL_METHOD(isUnalignedType)
FORWARD_SYMBOL_METHOD(isVolatileType)
};
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
index e9e7fe8c9865..e259b6dca3d5 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
@@ -10,6 +10,7 @@
#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
+#include "IPDBLineNumber.h"
#include "IPDBSession.h"
#include "PDBSymbol.h"
#include "PDBSymbolTypeBaseClass.h"
@@ -45,6 +46,7 @@ public:
FORWARD_SYMBOL_METHOD(getLength)
FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
FORWARD_SYMBOL_METHOD(getName)
+ FORWARD_SYMBOL_METHOD(getSrcLineOnTypeDefn)
FORWARD_SYMBOL_METHOD(isNested)
FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
FORWARD_SYMBOL_METHOD(isPacked)
@@ -53,6 +55,7 @@ public:
FORWARD_SYMBOL_METHOD(isUnalignedType)
FORWARD_SYMBOL_ID_METHOD(getVirtualTableShape)
FORWARD_SYMBOL_METHOD(isVolatileType)
+ FORWARD_SYMBOL_METHOD(getAccess)
};
}
} // namespace llvm
diff --git a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
index a6c6da37d1cc..da6cb1d26771 100644
--- a/contrib/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
+++ b/contrib/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -23,7 +23,9 @@ namespace llvm {
namespace pdb {
class IPDBDataStream;
+class IPDBInjectedSource;
class IPDBLineNumber;
+class IPDBSectionContrib;
class IPDBSourceFile;
class IPDBTable;
class PDBSymDumper;
@@ -65,6 +67,8 @@ using IPDBEnumSourceFiles = IPDBEnumChildren<IPDBSourceFile>;
using IPDBEnumDataStreams = IPDBEnumChildren<IPDBDataStream>;
using IPDBEnumLineNumbers = IPDBEnumChildren<IPDBLineNumber>;
using IPDBEnumTables = IPDBEnumChildren<IPDBTable>;
+using IPDBEnumInjectedSources = IPDBEnumChildren<IPDBInjectedSource>;
+using IPDBEnumSectionContribs = IPDBEnumChildren<IPDBSectionContrib>;
/// Specifies which PDB reader implementation is to be used. Only a value
/// of PDB_ReaderType::DIA is currently supported, but Native is in the works.
@@ -96,13 +100,18 @@ enum PDB_NameSearchFlags {
NS_CaseInsensitive = 0x2,
NS_FileNameExtMatch = 0x4,
NS_Regex = 0x8,
- NS_UndecoratedName = 0x10
+ NS_UndecoratedName = 0x10,
+
+ // For backward compatibility.
+ NS_CaseInFileNameExt = NS_CaseInsensitive | NS_FileNameExtMatch,
+ NS_CaseRegex = NS_Regex | NS_CaseSensitive,
+ NS_CaseInRex = NS_Regex | NS_CaseInsensitive
};
/// Specifies the hash algorithm that a source file from a PDB was hashed with.
/// This corresponds to the CV_SourceChksum_t enumeration and are documented
/// here: https://msdn.microsoft.com/en-us/library/e96az21x.aspx
-enum class PDB_Checksum { None = 0, MD5 = 1, SHA1 = 2 };
+enum class PDB_Checksum { None = 0, MD5 = 1, SHA1 = 2, SHA256 = 3 };
/// These values correspond to the CV_CPU_TYPE_e enumeration, and are documented
/// here: https://msdn.microsoft.com/en-us/library/b2fc64ek.aspx
@@ -133,6 +142,13 @@ enum class PDB_Machine {
WceMipsV2 = 0x169
};
+enum class PDB_SourceCompression {
+ None,
+ RunLengthEncoded,
+ Huffman,
+ LZ,
+};
+
/// These values correspond to the CV_call_e enumeration, and are documented
/// at the following locations:
/// https://msdn.microsoft.com/en-us/library/b2fc64ek.aspx
@@ -209,6 +225,7 @@ enum class PDB_LocType {
IlRel,
MetaData,
Constant,
+ RegRelAliasIndir,
Max
};
@@ -218,11 +235,24 @@ enum class PDB_UdtType { Struct, Class, Union, Interface };
/// These values correspond to the StackFrameTypeEnum enumeration, and are
/// documented here: https://msdn.microsoft.com/en-us/library/bc5207xw.aspx.
-enum class PDB_StackFrameType { FPO, KernelTrap, KernelTSS, EBP, FrameData };
+enum class PDB_StackFrameType : uint16_t {
+ FPO,
+ KernelTrap,
+ KernelTSS,
+ EBP,
+ FrameData,
+ Unknown = 0xffff
+};
-/// These values correspond to the StackFrameTypeEnum enumeration, and are
-/// documented here: https://msdn.microsoft.com/en-us/library/bc5207xw.aspx.
-enum class PDB_MemoryType { Code, Data, Stack, HeapCode };
+/// These values correspond to the MemoryTypeEnum enumeration, and are
+/// documented here: https://msdn.microsoft.com/en-us/library/ms165609.aspx.
+enum class PDB_MemoryType : uint16_t {
+ Code,
+ Data,
+ Stack,
+ HeapCode,
+ Any = 0xffff
+};
/// These values correspond to the Basictype enumeration, and are documented
/// here: https://msdn.microsoft.com/en-us/library/4szdtzc3.aspx
@@ -244,13 +274,15 @@ enum class PDB_BuiltinType {
Complex = 28,
Bitfield = 29,
BSTR = 30,
- HResult = 31
+ HResult = 31,
+ Char16 = 32,
+ Char32 = 33
};
/// These values correspond to the flags that can be combined to control the
/// return of an undecorated name for a C++ decorated name, and are documented
/// here: https://msdn.microsoft.com/en-us/library/kszfk0fs.aspx
-enum PDB_UndnameFlags: uint32_t {
+enum PDB_UndnameFlags : uint32_t {
Undname_Complete = 0x0,
Undname_NoLeadingUnderscores = 0x1,
Undname_NoMsKeywords = 0x2,
diff --git a/contrib/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/contrib/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
index 6480aef109c6..289148f569db 100644
--- a/contrib/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
+++ b/contrib/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
@@ -90,11 +90,11 @@ private:
const ObjectFile *Obj,
const std::string &ArchName);
- /// \brief Returns pair of pointers to object and debug object.
+ /// Returns pair of pointers to object and debug object.
Expected<ObjectPair> getOrCreateObjectPair(const std::string &Path,
const std::string &ArchName);
- /// \brief Return a pointer to object file at specified path, for a specified
+ /// Return a pointer to object file at specified path, for a specified
/// architecture (e.g. if path refers to a Mach-O universal binary, only one
/// object file from it will be returned).
Expected<ObjectFile *> getOrCreateObject(const std::string &Path,
@@ -102,14 +102,14 @@ private:
std::map<std::string, std::unique_ptr<SymbolizableModule>> Modules;
- /// \brief Contains cached results of getOrCreateObjectPair().
+ /// Contains cached results of getOrCreateObjectPair().
std::map<std::pair<std::string, std::string>, ObjectPair>
ObjectPairForPathArch;
- /// \brief Contains parsed binary for each path, or parsing error.
+ /// Contains parsed binary for each path, or parsing error.
std::map<std::string, OwningBinary<Binary>> BinaryForPath;
- /// \brief Parsed object file for path/architecture pair, where "path" refers
+ /// Parsed object file for path/architecture pair, where "path" refers
/// to Mach-O universal binary.
std::map<std::pair<std::string, std::string>, std::unique_ptr<ObjectFile>>
ObjectForUBPathAndArch;
diff --git a/contrib/llvm/include/llvm/Demangle/Demangle.h b/contrib/llvm/include/llvm/Demangle/Demangle.h
index d2eb56b39f9b..df7753f23b87 100644
--- a/contrib/llvm/include/llvm/Demangle/Demangle.h
+++ b/contrib/llvm/include/llvm/Demangle/Demangle.h
@@ -16,13 +16,73 @@ namespace llvm {
/// The mangled_name is demangled into buf and returned. If the buffer is not
/// large enough, realloc is used to expand it.
///
-/// The *status will be set to
-/// unknown_error: -4
-/// invalid_args: -3
-/// invalid_mangled_name: -2
-/// memory_alloc_failure: -1
-/// success: 0
+/// The *status will be set to a value from the following enumeration
+enum : int {
+ demangle_unknown_error = -4,
+ demangle_invalid_args = -3,
+ demangle_invalid_mangled_name = -2,
+ demangle_memory_alloc_failure = -1,
+ demangle_success = 0,
+};
char *itaniumDemangle(const char *mangled_name, char *buf, size_t *n,
int *status);
-}
+char *microsoftDemangle(const char *mangled_name, char *buf, size_t *n,
+ int *status);
+
+/// "Partial" demangler. This supports demangling a string into an AST
+/// (typically an intermediate stage in itaniumDemangle) and querying certain
+/// properties or partially printing the demangled name.
+struct ItaniumPartialDemangler {
+ ItaniumPartialDemangler();
+
+ ItaniumPartialDemangler(ItaniumPartialDemangler &&Other);
+ ItaniumPartialDemangler &operator=(ItaniumPartialDemangler &&Other);
+
+ /// Demangle into an AST. Subsequent calls to the rest of the member functions
+ /// implicitly operate on the AST this produces.
+ /// \return true on error, false otherwise
+ bool partialDemangle(const char *MangledName);
+
+  /// Print the entire demangled name into Buf. Buf and N behave like the
+ /// second and third parameters to itaniumDemangle.
+ char *finishDemangle(char *Buf, size_t *N) const;
+
+ /// Get the base name of a function. This doesn't include trailing template
+ /// arguments, ie for "a::b<int>" this function returns "b".
+ char *getFunctionBaseName(char *Buf, size_t *N) const;
+
+ /// Get the context name for a function. For "a::b::c", this function returns
+ /// "a::b".
+ char *getFunctionDeclContextName(char *Buf, size_t *N) const;
+
+ /// Get the entire name of this function.
+ char *getFunctionName(char *Buf, size_t *N) const;
+
+ /// Get the parameters for this function.
+ char *getFunctionParameters(char *Buf, size_t *N) const;
+ char *getFunctionReturnType(char *Buf, size_t *N) const;
+
+  /// If this function has any cv or reference qualifiers. These imply that
+ /// the function is a non-static member function.
+ bool hasFunctionQualifiers() const;
+
+ /// If this symbol describes a constructor or destructor.
+ bool isCtorOrDtor() const;
+
+ /// If this symbol describes a function.
+ bool isFunction() const;
+
+ /// If this symbol describes a variable.
+ bool isData() const;
+
+ /// If this symbol is a <special-name>. These are generally implicitly
+ /// generated by the implementation, such as vtables and typeinfo names.
+ bool isSpecialName() const;
+
+ ~ItaniumPartialDemangler();
+private:
+ void *RootNode;
+ void *Context;
+};
+} // namespace llvm
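// Illustrative sketch (not part of this change): using the partial demangler
// declared above. The returned buffer follows the itaniumDemangle convention
// (heap-allocated, caller frees). "_ZN1a1bIiEEvv" is assumed to be the
// Itanium mangling of "void a::b<int>()".
#include "llvm/Demangle/Demangle.h"
#include <cstdio>
#include <cstdlib>

int main() {
  llvm::ItaniumPartialDemangler D;
  if (D.partialDemangle("_ZN1a1bIiEEvv")) // returns true on error
    return 1;
  char *Base = D.getFunctionBaseName(nullptr, nullptr); // yields "b"
  std::printf("base: %s, function: %d\n", Base, int(D.isFunction()));
  std::free(Base);
  return 0;
}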
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h b/contrib/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h
index 77c23b46d320..b61cb24fa5fb 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -60,7 +60,7 @@ class ObjectFile;
} // end namespace object
-/// \brief Helper class for helping synchronize access to the global address map
+/// Helper class for helping synchronize access to the global address map
/// table. Access to this class should be serialized under a mutex.
class ExecutionEngineState {
public:
@@ -86,7 +86,7 @@ public:
return GlobalAddressReverseMap;
}
- /// \brief Erase an entry from the mapping table.
+ /// Erase an entry from the mapping table.
///
/// \returns The address that \p ToUnmap was mapped to.
uint64_t RemoveMapping(StringRef Name);
@@ -94,7 +94,7 @@ public:
using FunctionCreator = std::function<void *(const std::string &)>;
-/// \brief Abstract interface for implementation execution of LLVM modules,
+/// Abstract interface for implementation execution of LLVM modules,
/// designed to support both interpreter and just-in-time (JIT) compiler
/// implementations.
class ExecutionEngine {
@@ -137,17 +137,15 @@ protected:
virtual char *getMemoryForGV(const GlobalVariable *GV);
static ExecutionEngine *(*MCJITCtor)(
- std::unique_ptr<Module> M,
- std::string *ErrorStr,
- std::shared_ptr<MCJITMemoryManager> MM,
- std::shared_ptr<JITSymbolResolver> SR,
- std::unique_ptr<TargetMachine> TM);
+ std::unique_ptr<Module> M, std::string *ErrorStr,
+ std::shared_ptr<MCJITMemoryManager> MM,
+ std::shared_ptr<LegacyJITSymbolResolver> SR,
+ std::unique_ptr<TargetMachine> TM);
static ExecutionEngine *(*OrcMCJITReplacementCtor)(
- std::string *ErrorStr,
- std::shared_ptr<MCJITMemoryManager> MM,
- std::shared_ptr<JITSymbolResolver> SR,
- std::unique_ptr<TargetMachine> TM);
+ std::string *ErrorStr, std::shared_ptr<MCJITMemoryManager> MM,
+ std::shared_ptr<LegacyJITSymbolResolver> SR,
+ std::unique_ptr<TargetMachine> TM);
static ExecutionEngine *(*InterpCtor)(std::unique_ptr<Module> M,
std::string *ErrorStr);
@@ -532,7 +530,7 @@ private:
std::string *ErrorStr;
CodeGenOpt::Level OptLevel;
std::shared_ptr<MCJITMemoryManager> MemMgr;
- std::shared_ptr<JITSymbolResolver> Resolver;
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver;
TargetOptions Options;
Optional<Reloc::Model> RelocModel;
Optional<CodeModel::Model> CMModel;
@@ -571,8 +569,7 @@ public:
EngineBuilder&
setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM);
- EngineBuilder&
- setSymbolResolver(std::unique_ptr<JITSymbolResolver> SR);
+ EngineBuilder &setSymbolResolver(std::unique_ptr<LegacyJITSymbolResolver> SR);
/// setErrorStr - Set the error string to write to on error. This option
/// defaults to NULL.
@@ -637,7 +634,7 @@ public:
return *this;
}
- // \brief Use OrcMCJITReplacement instead of MCJIT. Off by default.
+ // Use OrcMCJITReplacement instead of MCJIT. Off by default.
void setUseOrcMCJITReplacement(bool UseOrcMCJITReplacement) {
this->UseOrcMCJITReplacement = UseOrcMCJITReplacement;
}
@@ -645,7 +642,7 @@ public:
void setEmulatedTLS(bool EmulatedTLS) {
this->EmulatedTLS = EmulatedTLS;
}
-
+
TargetMachine *selectTarget();
/// selectTarget - Pick a target either via -march or by guessing the native
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/JITEventListener.h b/contrib/llvm/include/llvm/ExecutionEngine/JITEventListener.h
index ff7840f00a44..1ce772ccde95 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/JITEventListener.h
@@ -15,9 +15,11 @@
#ifndef LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H
#define LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H
+#include "llvm-c/ExecutionEngine.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/CBindingWrapping.h"
#include <cstdint>
#include <vector>
@@ -115,10 +117,21 @@ public:
}
#endif // USE_OPROFILE
+#if LLVM_USE_PERF
+ static JITEventListener *createPerfJITEventListener();
+#else
+ static JITEventListener *createPerfJITEventListener()
+ {
+ return nullptr;
+ }
+#endif // USE_PERF
+
private:
virtual void anchor();
};
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(JITEventListener, LLVMJITEventListenerRef)
+
} // end namespace llvm
#endif // LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/JITSymbol.h b/contrib/llvm/include/llvm/ExecutionEngine/JITSymbol.h
index 933b3ea8e13d..53037c3dbc72 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/JITSymbol.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/JITSymbol.h
@@ -19,8 +19,11 @@
#include <cstddef>
#include <cstdint>
#include <functional>
+#include <map>
+#include <set>
#include <string>
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
namespace llvm {
@@ -33,10 +36,10 @@ class BasicSymbolRef;
} // end namespace object
-/// @brief Represents an address in the target process's address space.
+/// Represents an address in the target process's address space.
using JITTargetAddress = uint64_t;
-/// @brief Flags for symbols in the JIT.
+/// Flags for symbols in the JIT.
class JITSymbolFlags {
public:
using UnderlyingType = uint8_t;
@@ -48,55 +51,74 @@ public:
Weak = 1U << 1,
Common = 1U << 2,
Absolute = 1U << 3,
- Exported = 1U << 4
+ Exported = 1U << 4,
+ Lazy = 1U << 5,
+ Materializing = 1U << 6
};
- /// @brief Default-construct a JITSymbolFlags instance.
+ static JITSymbolFlags stripTransientFlags(JITSymbolFlags Orig) {
+ return static_cast<FlagNames>(Orig.Flags & ~Lazy & ~Materializing);
+ }
+
+ /// Default-construct a JITSymbolFlags instance.
JITSymbolFlags() = default;
- /// @brief Construct a JITSymbolFlags instance from the given flags.
+ /// Construct a JITSymbolFlags instance from the given flags.
JITSymbolFlags(FlagNames Flags) : Flags(Flags) {}
- /// @brief Construct a JITSymbolFlags instance from the given flags and target
+ /// Construct a JITSymbolFlags instance from the given flags and target
/// flags.
JITSymbolFlags(FlagNames Flags, TargetFlagsType TargetFlags)
: Flags(Flags), TargetFlags(TargetFlags) {}
- /// @brief Return true if there was an error retrieving this symbol.
+ /// Return true if there was an error retrieving this symbol.
bool hasError() const {
return (Flags & HasError) == HasError;
}
- /// @brief Returns true if the Weak flag is set.
+ /// Returns true if this is a lazy symbol.
+ /// This flag is used internally by the JIT APIs to track
+ /// materialization states.
+ bool isLazy() const { return Flags & Lazy; }
+
+ /// Returns true if this symbol is in the process of being
+ /// materialized.
+ bool isMaterializing() const { return Flags & Materializing; }
+
+ /// Returns true if this symbol is fully materialized.
+ /// (i.e. neither lazy, nor materializing).
+ bool isMaterialized() const { return !(Flags & (Lazy | Materializing)); }
+
+ /// Returns true if the Weak flag is set.
bool isWeak() const {
return (Flags & Weak) == Weak;
}
- /// @brief Returns true if the Common flag is set.
+ /// Returns true if the Common flag is set.
bool isCommon() const {
return (Flags & Common) == Common;
}
- /// @brief Returns true if the symbol isn't weak or common.
- bool isStrongDefinition() const {
+ /// Returns true if the symbol isn't weak or common.
+ bool isStrong() const {
return !isWeak() && !isCommon();
}
- /// @brief Returns true if the Exported flag is set.
+ /// Returns true if the Exported flag is set.
bool isExported() const {
return (Flags & Exported) == Exported;
}
- /// @brief Implicitly convert to the underlying flags type.
+ /// Implicitly convert to the underlying flags type.
operator UnderlyingType&() { return Flags; }
- /// @brief Implicitly convert to the underlying flags type.
+ /// Implicitly convert to the underlying flags type.
operator const UnderlyingType&() const { return Flags; }
- /// @brief Return a reference to the target-specific flags.
+ /// Return a reference to the target-specific flags.
TargetFlagsType& getTargetFlags() { return TargetFlags; }
- /// @brief Return a reference to the target-specific flags.
+ /// Return a reference to the target-specific flags.
const TargetFlagsType& getTargetFlags() const { return TargetFlags; }
/// Construct a JITSymbolFlags value based on the flags of the given global
@@ -112,7 +134,7 @@ private:
TargetFlagsType TargetFlags = 0;
};
-/// @brief ARM-specific JIT symbol flags.
+/// ARM-specific JIT symbol flags.
/// FIXME: This should be moved into a target-specific header.
class ARMJITSymbolFlags {
public:
@@ -131,54 +153,59 @@ private:
JITSymbolFlags::TargetFlagsType Flags = 0;
};
-/// @brief Represents a symbol that has been evaluated to an address already.
+/// Represents a symbol that has been evaluated to an address already.
class JITEvaluatedSymbol {
public:
- /// @brief Create a 'null' symbol.
+ JITEvaluatedSymbol() = default;
+
+ /// Create a 'null' symbol.
JITEvaluatedSymbol(std::nullptr_t) {}
- /// @brief Create a symbol for the given address and flags.
+ /// Create a symbol for the given address and flags.
JITEvaluatedSymbol(JITTargetAddress Address, JITSymbolFlags Flags)
: Address(Address), Flags(Flags) {}
- /// @brief An evaluated symbol converts to 'true' if its address is non-zero.
+ /// An evaluated symbol converts to 'true' if its address is non-zero.
explicit operator bool() const { return Address != 0; }
- /// @brief Return the address of this symbol.
+ /// Return the address of this symbol.
JITTargetAddress getAddress() const { return Address; }
- /// @brief Return the flags for this symbol.
+ /// Return the flags for this symbol.
JITSymbolFlags getFlags() const { return Flags; }
+ /// Set the flags for this symbol.
+ void setFlags(JITSymbolFlags Flags) { this->Flags = std::move(Flags); }
+
private:
JITTargetAddress Address = 0;
JITSymbolFlags Flags;
};
-/// @brief Represents a symbol in the JIT.
+/// Represents a symbol in the JIT.
class JITSymbol {
public:
using GetAddressFtor = std::function<Expected<JITTargetAddress>()>;
- /// @brief Create a 'null' symbol, used to represent a "symbol not found"
+ /// Create a 'null' symbol, used to represent a "symbol not found"
/// result from a successful (non-erroneous) lookup.
JITSymbol(std::nullptr_t)
: CachedAddr(0) {}
- /// @brief Create a JITSymbol representing an error in the symbol lookup
+ /// Create a JITSymbol representing an error in the symbol lookup
/// process (e.g. a network failure during a remote lookup).
JITSymbol(Error Err)
: Err(std::move(Err)), Flags(JITSymbolFlags::HasError) {}
- /// @brief Create a symbol for a definition with a known address.
+ /// Create a symbol for a definition with a known address.
JITSymbol(JITTargetAddress Addr, JITSymbolFlags Flags)
: CachedAddr(Addr), Flags(Flags) {}
- /// @brief Construct a JITSymbol from a JITEvaluatedSymbol.
+ /// Construct a JITSymbol from a JITEvaluatedSymbol.
JITSymbol(JITEvaluatedSymbol Sym)
: CachedAddr(Sym.getAddress()), Flags(Sym.getFlags()) {}
- /// @brief Create a symbol for a definition that doesn't have a known address
+ /// Create a symbol for a definition that doesn't have a known address
/// yet.
/// @param GetAddress A functor to materialize a definition (fixing the
/// address) on demand.
@@ -218,19 +245,19 @@ public:
CachedAddr.~JITTargetAddress();
}
- /// @brief Returns true if the symbol exists, false otherwise.
+ /// Returns true if the symbol exists, false otherwise.
explicit operator bool() const {
return !Flags.hasError() && (CachedAddr || GetAddress);
}
- /// @brief Move the error field value out of this JITSymbol.
+ /// Move the error field value out of this JITSymbol.
Error takeError() {
if (Flags.hasError())
return std::move(Err);
return Error::success();
}
- /// @brief Get the address of the symbol in the target address space. Returns
+ /// Get the address of the symbol in the target address space. Returns
/// '0' if the symbol does not exist.
Expected<JITTargetAddress> getAddress() {
assert(!Flags.hasError() && "getAddress called on error value");
@@ -256,11 +283,49 @@ private:
JITSymbolFlags Flags;
};
-/// \brief Symbol resolution.
+/// Symbol resolution interface.
+///
+/// Allows symbol flags and addresses to be looked up by name.
+/// Symbol queries are done in bulk (i.e. you request resolution of a set of
+/// symbols, rather than a single one) to reduce IPC overhead in the case of
+/// remote JITing, and expose opportunities for parallel compilation.
class JITSymbolResolver {
public:
+ using LookupSet = std::set<StringRef>;
+ using LookupResult = std::map<StringRef, JITEvaluatedSymbol>;
+ using LookupFlagsResult = std::map<StringRef, JITSymbolFlags>;
+
virtual ~JITSymbolResolver() = default;
+ /// Returns the fully resolved address and flags for each of the given
+ /// symbols.
+ ///
+ /// This method will return an error if any of the given symbols can not be
+ /// resolved, or if the resolution process itself triggers an error.
+ virtual Expected<LookupResult> lookup(const LookupSet &Symbols) = 0;
+
+ /// Returns the symbol flags for each of the given symbols.
+ ///
+ /// This method does NOT return an error if any of the given symbols is
+ /// missing. Instead, that symbol will be left out of the result map.
+ virtual Expected<LookupFlagsResult> lookupFlags(const LookupSet &Symbols) = 0;
+
+private:
+ virtual void anchor();
+};
+
+/// Legacy symbol resolution interface.
+class LegacyJITSymbolResolver : public JITSymbolResolver {
+public:
+  /// Performs lookup by calling, for each symbol, findSymbolInLogicalDylib
+  /// first and, if that fails, findSymbol.
+ Expected<LookupResult> lookup(const LookupSet &Symbols) final;
+
+ /// Performs flags lookup by calling findSymbolInLogicalDylib and
+ /// returning the flags value for that symbol.
+ Expected<LookupFlagsResult> lookupFlags(const LookupSet &Symbols) final;
+
/// This method returns the address of the specified symbol if it exists
/// within the logical dynamic library represented by this JITSymbolResolver.
/// Unlike findSymbol, queries through this interface should return addresses
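// Illustrative sketch (not part of this change): a trivial implementation of
// the new bulk-query JITSymbolResolver interface, backed by a plain map of
// known addresses. lookup() fails on any missing symbol, while lookupFlags()
// simply omits missing symbols, matching the documentation above.
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/Support/Error.h"
#include <map>

class MapResolver : public llvm::JITSymbolResolver {
public:
  llvm::Expected<LookupResult> lookup(const LookupSet &Symbols) override {
    LookupResult Result;
    for (llvm::StringRef Name : Symbols) {
      auto I = Known.find(Name);
      if (I == Known.end())
        return llvm::make_error<llvm::StringError>(
            "unresolved symbol: " + Name.str(),
            llvm::inconvertibleErrorCode());
      Result[Name] = llvm::JITEvaluatedSymbol(I->second,
                                              llvm::JITSymbolFlags::Exported);
    }
    return Result;
  }

  llvm::Expected<LookupFlagsResult>
  lookupFlags(const LookupSet &Symbols) override {
    LookupFlagsResult Result;
    for (llvm::StringRef Name : Symbols)
      if (Known.count(Name))
        Result[Name] = llvm::JITSymbolFlags::Exported;
    return Result;
  }

  std::map<llvm::StringRef, llvm::JITTargetAddress> Known;
};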
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
index a961992c2147..8bd21a0e3dd6 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
@@ -22,6 +22,8 @@
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
+#include "llvm/ExecutionEngine/Orc/Legacy.h"
#include "llvm/ExecutionEngine/Orc/OrcError.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/IR/Attributes.h"
@@ -56,7 +58,47 @@ class Value;
namespace orc {
-/// @brief Compile-on-demand layer.
+class ExtractingIRMaterializationUnit;
+
+class CompileOnDemandLayer2 : public IRLayer {
+ friend class ExtractingIRMaterializationUnit;
+
+public:
+ /// Builder for IndirectStubsManagers.
+ using IndirectStubsManagerBuilder =
+ std::function<std::unique_ptr<IndirectStubsManager>()>;
+
+ using GetAvailableContextFunction = std::function<LLVMContext &()>;
+
+ CompileOnDemandLayer2(ExecutionSession &ES, IRLayer &BaseLayer,
+ JITCompileCallbackManager &CCMgr,
+ IndirectStubsManagerBuilder BuildIndirectStubsManager,
+ GetAvailableContextFunction GetAvailableContext);
+
+ Error add(VSO &V, VModuleKey K, std::unique_ptr<Module> M) override;
+
+ void emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<Module> M) override;
+
+private:
+ using StubManagersMap =
+ std::map<const VSO *, std::unique_ptr<IndirectStubsManager>>;
+
+ IndirectStubsManager &getStubsManager(const VSO &V);
+
+ void emitExtractedFunctionsModule(MaterializationResponsibility R,
+ std::unique_ptr<Module> M);
+
+ mutable std::mutex CODLayerMutex;
+
+ IRLayer &BaseLayer;
+ JITCompileCallbackManager &CCMgr;
+ IndirectStubsManagerBuilder BuildIndirectStubsManager;
+ StubManagersMap StubsMgrs;
+ GetAvailableContextFunction GetAvailableContext;
+};
+
+/// Compile-on-demand layer.
///
/// When a module is added to this layer a stub is created for each of its
/// function definitions. The stubs and other global values are immediately
@@ -85,8 +127,6 @@ private:
return LambdaMaterializer<MaterializerFtor>(std::move(M));
}
- using BaseLayerModuleHandleT = typename BaseLayerT::ModuleHandleT;
-
// Provide type-erasure for the Modules and MemoryManagers.
template <typename ResourceT>
class ResourceOwner {
@@ -138,18 +178,22 @@ private:
};
struct LogicalDylib {
- using SymbolResolverFtor = std::function<JITSymbol(const std::string&)>;
-
struct SourceModuleEntry {
- std::shared_ptr<Module> SourceMod;
+ std::unique_ptr<Module> SourceMod;
std::set<Function*> StubsToClone;
};
using SourceModulesList = std::vector<SourceModuleEntry>;
using SourceModuleHandle = typename SourceModulesList::size_type;
- SourceModuleHandle
- addSourceModule(std::shared_ptr<Module> M) {
+ LogicalDylib() = default;
+
+ LogicalDylib(VModuleKey K, std::shared_ptr<SymbolResolver> BackingResolver,
+ std::unique_ptr<IndirectStubsMgrT> StubsMgr)
+ : K(std::move(K)), BackingResolver(std::move(BackingResolver)),
+ StubsMgr(std::move(StubsMgr)) {}
+
+ SourceModuleHandle addSourceModule(std::unique_ptr<Module> M) {
SourceModuleHandle H = SourceModules.size();
SourceModules.push_back(SourceModuleEntry());
SourceModules.back().SourceMod = std::move(M);
@@ -168,8 +212,8 @@ private:
bool ExportedSymbolsOnly) {
if (auto Sym = StubsMgr->findStub(Name, ExportedSymbolsOnly))
return Sym;
- for (auto BLH : BaseLayerHandles)
- if (auto Sym = BaseLayer.findSymbolIn(BLH, Name, ExportedSymbolsOnly))
+ for (auto BLK : BaseLayerVModuleKeys)
+ if (auto Sym = BaseLayer.findSymbolIn(BLK, Name, ExportedSymbolsOnly))
return Sym;
else if (auto Err = Sym.takeError())
return std::move(Err);
@@ -177,91 +221,94 @@ private:
}
Error removeModulesFromBaseLayer(BaseLayerT &BaseLayer) {
- for (auto &BLH : BaseLayerHandles)
- if (auto Err = BaseLayer.removeModule(BLH))
+ for (auto &BLK : BaseLayerVModuleKeys)
+ if (auto Err = BaseLayer.removeModule(BLK))
return Err;
return Error::success();
}
- std::shared_ptr<JITSymbolResolver> ExternalSymbolResolver;
+ VModuleKey K;
+ std::shared_ptr<SymbolResolver> BackingResolver;
std::unique_ptr<IndirectStubsMgrT> StubsMgr;
StaticGlobalRenamer StaticRenamer;
SourceModulesList SourceModules;
- std::vector<BaseLayerModuleHandleT> BaseLayerHandles;
+ std::vector<VModuleKey> BaseLayerVModuleKeys;
};
- using LogicalDylibList = std::list<LogicalDylib>;
-
public:
- /// @brief Handle to loaded module.
- using ModuleHandleT = typename LogicalDylibList::iterator;
-
- /// @brief Module partitioning functor.
+ /// Module partitioning functor.
using PartitioningFtor = std::function<std::set<Function*>(Function&)>;
- /// @brief Builder for IndirectStubsManagers.
+ /// Builder for IndirectStubsManagers.
using IndirectStubsManagerBuilderT =
std::function<std::unique_ptr<IndirectStubsMgrT>()>;
- /// @brief Construct a compile-on-demand layer instance.
- CompileOnDemandLayer(BaseLayerT &BaseLayer, PartitioningFtor Partition,
+ using SymbolResolverGetter =
+ std::function<std::shared_ptr<SymbolResolver>(VModuleKey K)>;
+
+ using SymbolResolverSetter =
+ std::function<void(VModuleKey K, std::shared_ptr<SymbolResolver> R)>;
+
+ /// Construct a compile-on-demand layer instance.
+ CompileOnDemandLayer(ExecutionSession &ES, BaseLayerT &BaseLayer,
+ SymbolResolverGetter GetSymbolResolver,
+ SymbolResolverSetter SetSymbolResolver,
+ PartitioningFtor Partition,
CompileCallbackMgrT &CallbackMgr,
IndirectStubsManagerBuilderT CreateIndirectStubsManager,
bool CloneStubsIntoPartitions = true)
- : BaseLayer(BaseLayer), Partition(std::move(Partition)),
- CompileCallbackMgr(CallbackMgr),
+ : ES(ES), BaseLayer(BaseLayer),
+ GetSymbolResolver(std::move(GetSymbolResolver)),
+ SetSymbolResolver(std::move(SetSymbolResolver)),
+ Partition(std::move(Partition)), CompileCallbackMgr(CallbackMgr),
CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)),
CloneStubsIntoPartitions(CloneStubsIntoPartitions) {}
~CompileOnDemandLayer() {
// FIXME: Report error on log.
while (!LogicalDylibs.empty())
- consumeError(removeModule(LogicalDylibs.begin()));
+ consumeError(removeModule(LogicalDylibs.begin()->first));
}
- /// @brief Add a module to the compile-on-demand layer.
- Expected<ModuleHandleT>
- addModule(std::shared_ptr<Module> M,
- std::shared_ptr<JITSymbolResolver> Resolver) {
+ /// Add a module to the compile-on-demand layer.
+ Error addModule(VModuleKey K, std::unique_ptr<Module> M) {
- LogicalDylibs.push_back(LogicalDylib());
- auto &LD = LogicalDylibs.back();
- LD.ExternalSymbolResolver = std::move(Resolver);
- LD.StubsMgr = CreateIndirectStubsManager();
+ assert(!LogicalDylibs.count(K) && "VModuleKey K already in use");
+ auto I = LogicalDylibs.insert(
+ LogicalDylibs.end(),
+ std::make_pair(K, LogicalDylib(K, GetSymbolResolver(K),
+ CreateIndirectStubsManager())));
- // Process each of the modules in this module set.
- if (auto Err = addLogicalModule(LD, std::move(M)))
- return std::move(Err);
-
- return std::prev(LogicalDylibs.end());
+ return addLogicalModule(I->second, std::move(M));
}
- /// @brief Add extra modules to an existing logical module.
- Error addExtraModule(ModuleHandleT H, std::shared_ptr<Module> M) {
- return addLogicalModule(*H, std::move(M));
+ /// Add extra modules to an existing logical module.
+ Error addExtraModule(VModuleKey K, std::unique_ptr<Module> M) {
+ return addLogicalModule(LogicalDylibs[K], std::move(M));
}
- /// @brief Remove the module represented by the given handle.
+ /// Remove the module represented by the given key.
///
/// This will remove all modules in the layers below that were derived from
- /// the module represented by H.
- Error removeModule(ModuleHandleT H) {
- auto Err = H->removeModulesFromBaseLayer(BaseLayer);
- LogicalDylibs.erase(H);
+ /// the module represented by K.
+ Error removeModule(VModuleKey K) {
+ auto I = LogicalDylibs.find(K);
+ assert(I != LogicalDylibs.end() && "VModuleKey K not valid here");
+ auto Err = I->second.removeModulesFromBaseLayer(BaseLayer);
+ LogicalDylibs.erase(I);
return Err;
}
- /// @brief Search for the given named symbol.
+ /// Search for the given named symbol.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it exists.
JITSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) {
- for (auto LDI = LogicalDylibs.begin(), LDE = LogicalDylibs.end();
- LDI != LDE; ++LDI) {
- if (auto Sym = LDI->StubsMgr->findStub(Name, ExportedSymbolsOnly))
+ for (auto &KV : LogicalDylibs) {
+ if (auto Sym = KV.second.StubsMgr->findStub(Name, ExportedSymbolsOnly))
return Sym;
- if (auto Sym = findSymbolIn(LDI, Name, ExportedSymbolsOnly))
+ if (auto Sym = findSymbolIn(KV.first, Name, ExportedSymbolsOnly))
return Sym;
else if (auto Err = Sym.takeError())
return std::move(Err);
@@ -269,14 +316,15 @@ public:
return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
}
- /// @brief Get the address of a symbol provided by this layer, or some layer
+ /// Get the address of a symbol provided by this layer, or some layer
/// below this one.
- JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name,
+ JITSymbol findSymbolIn(VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) {
- return H->findSymbol(BaseLayer, Name, ExportedSymbolsOnly);
+ assert(LogicalDylibs.count(K) && "VModuleKey K is not valid here");
+ return LogicalDylibs[K].findSymbol(BaseLayer, Name, ExportedSymbolsOnly);
}
- /// @brief Update the stub for the given function to point at FnBodyAddr.
+ /// Update the stub for the given function to point at FnBodyAddr.
/// This can be used to support re-optimization.
/// @return true if the function exists and the stub is updated, false
/// otherwise.
@@ -302,15 +350,14 @@ public:
}
private:
-
- Error addLogicalModule(LogicalDylib &LD, std::shared_ptr<Module> SrcMPtr) {
+ Error addLogicalModule(LogicalDylib &LD, std::unique_ptr<Module> SrcMPtr) {
// Rename all static functions / globals to $static.X :
// This will unique the names across all modules in the logical dylib,
// simplifying symbol lookup.
LD.StaticRenamer.rename(*SrcMPtr);
- // Bump the linkage and rename any anonymous/privote members in SrcM to
+ // Bump the linkage and rename any anonymous/private members in SrcM to
// ensure that everything will resolve properly after we partition SrcM.
makeAllSymbolsExternallyAccessible(*SrcMPtr);
@@ -343,22 +390,21 @@ private:
// Create a callback, associate it with the stub for the function,
// and set the compile action to compile the partition containing the
// function.
- if (auto CCInfoOrErr = CompileCallbackMgr.getCompileCallback()) {
- auto &CCInfo = *CCInfoOrErr;
+ auto CompileAction = [this, &LD, LMId, &F]() -> JITTargetAddress {
+ if (auto FnImplAddrOrErr = this->extractAndCompile(LD, LMId, F))
+ return *FnImplAddrOrErr;
+ else {
+ // FIXME: Report error, return to 'abort' or something similar.
+ consumeError(FnImplAddrOrErr.takeError());
+ return 0;
+ }
+ };
+ if (auto CCAddr =
+ CompileCallbackMgr.getCompileCallback(std::move(CompileAction)))
StubInits[MangledName] =
- std::make_pair(CCInfo.getAddress(),
- JITSymbolFlags::fromGlobalValue(F));
- CCInfo.setCompileAction([this, &LD, LMId, &F]() -> JITTargetAddress {
- if (auto FnImplAddrOrErr = this->extractAndCompile(LD, LMId, F))
- return *FnImplAddrOrErr;
- else {
- // FIXME: Report error, return to 'abort' or something similar.
- consumeError(FnImplAddrOrErr.takeError());
- return 0;
- }
- });
- } else
- return CCInfoOrErr.takeError();
+ std::make_pair(*CCAddr, JITSymbolFlags::fromGlobalValue(F));
+ else
+ return CCAddr.takeError();
}
if (auto Err = LD.StubsMgr->createStubs(StubInits))
@@ -396,9 +442,8 @@ private:
// Initializers may refer to functions declared (but not defined) in this
// module. Build a materializer to clone decls on demand.
- Error MaterializerErrors = Error::success();
auto Materializer = createLambdaMaterializer(
- [&LD, &GVsM, &MaterializerErrors](Value *V) -> Value* {
+ [&LD, &GVsM](Value *V) -> Value* {
if (auto *F = dyn_cast<Function>(V)) {
// Decls in the original module just get cloned.
if (F->isDeclaration())
@@ -410,18 +455,8 @@ private:
const DataLayout &DL = GVsM->getDataLayout();
std::string FName = mangle(F->getName(), DL);
unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(F->getType());
- JITTargetAddress StubAddr = 0;
-
- // Get the address for the stub. If we encounter an error while
- // doing so, stash it in the MaterializerErrors variable and use a
- // null address as a placeholder.
- if (auto StubSym = LD.StubsMgr->findStub(FName, false)) {
- if (auto StubAddrOrErr = StubSym.getAddress())
- StubAddr = *StubAddrOrErr;
- else
- MaterializerErrors = joinErrors(std::move(MaterializerErrors),
- StubAddrOrErr.takeError());
- }
+ JITTargetAddress StubAddr =
+ LD.StubsMgr->findStub(FName, false).getAddress();
ConstantInt *StubAddrCI =
ConstantInt::get(GVsM->getContext(), APInt(PtrBitWidth, StubAddr));
@@ -450,29 +485,58 @@ private:
NewA->setAliasee(cast<Constant>(Init));
}
- if (MaterializerErrors)
- return MaterializerErrors;
-
// Build a resolver for the globals module and add it to the base layer.
- auto GVsResolver = createLambdaResolver(
- [this, &LD](const std::string &Name) -> JITSymbol {
- if (auto Sym = LD.StubsMgr->findStub(Name, false))
- return Sym;
- if (auto Sym = LD.findSymbol(BaseLayer, Name, false))
- return Sym;
- else if (auto Err = Sym.takeError())
- return std::move(Err);
- return LD.ExternalSymbolResolver->findSymbolInLogicalDylib(Name);
+ auto LegacyLookup = [this, &LD](const std::string &Name) -> JITSymbol {
+ if (auto Sym = LD.StubsMgr->findStub(Name, false))
+ return Sym;
+
+ if (auto Sym = LD.findSymbol(BaseLayer, Name, false))
+ return Sym;
+ else if (auto Err = Sym.takeError())
+ return std::move(Err);
+
+ return nullptr;
+ };
+
+ auto GVsResolver = createSymbolResolver(
+ [&LD, LegacyLookup](const SymbolNameSet &Symbols) {
+ auto SymbolFlags = lookupFlagsWithLegacyFn(Symbols, LegacyLookup);
+
+ if (!SymbolFlags) {
+ logAllUnhandledErrors(SymbolFlags.takeError(), errs(),
+ "CODLayer/GVsResolver flags lookup failed: ");
+ return SymbolFlagsMap();
+ }
+
+ if (SymbolFlags->size() == Symbols.size())
+ return *SymbolFlags;
+
+ SymbolNameSet NotFoundViaLegacyLookup;
+ for (auto &S : Symbols)
+ if (!SymbolFlags->count(S))
+ NotFoundViaLegacyLookup.insert(S);
+ auto SymbolFlags2 =
+ LD.BackingResolver->lookupFlags(NotFoundViaLegacyLookup);
+
+ for (auto &KV : SymbolFlags2)
+ (*SymbolFlags)[KV.first] = std::move(KV.second);
+
+ return *SymbolFlags;
},
- [&LD](const std::string &Name) {
- return LD.ExternalSymbolResolver->findSymbol(Name);
+ [this, &LD,
+ LegacyLookup](std::shared_ptr<AsynchronousSymbolQuery> Query,
+ SymbolNameSet Symbols) {
+ auto NotFoundViaLegacyLookup =
+ lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup);
+ return LD.BackingResolver->lookup(Query, NotFoundViaLegacyLookup);
});
- if (auto GVsHOrErr =
- BaseLayer.addModule(std::move(GVsM), std::move(GVsResolver)))
- LD.BaseLayerHandles.push_back(*GVsHOrErr);
- else
- return GVsHOrErr.takeError();
+ SetSymbolResolver(LD.K, std::move(GVsResolver));
+
+ if (auto Err = BaseLayer.addModule(LD.K, std::move(GVsM)))
+ return Err;
+
+ LD.BaseLayerVModuleKeys.push_back(LD.K);
return Error::success();
}
@@ -501,11 +565,11 @@ private:
JITTargetAddress CalledAddr = 0;
auto Part = Partition(F);
- if (auto PartHOrErr = emitPartition(LD, LMId, Part)) {
- auto &PartH = *PartHOrErr;
+ if (auto PartKeyOrErr = emitPartition(LD, LMId, Part)) {
+ auto &PartKey = *PartKeyOrErr;
for (auto *SubF : Part) {
std::string FnName = mangle(SubF->getName(), SrcM.getDataLayout());
- if (auto FnBodySym = BaseLayer.findSymbolIn(PartH, FnName, false)) {
+ if (auto FnBodySym = BaseLayer.findSymbolIn(PartKey, FnName, false)) {
if (auto FnBodyAddrOrErr = FnBodySym.getAddress()) {
JITTargetAddress FnBodyAddr = *FnBodyAddrOrErr;
@@ -526,15 +590,15 @@ private:
llvm_unreachable("Function not emitted for partition");
}
- LD.BaseLayerHandles.push_back(PartH);
+ LD.BaseLayerVModuleKeys.push_back(PartKey);
} else
- return PartHOrErr.takeError();
+ return PartKeyOrErr.takeError();
return CalledAddr;
}
template <typename PartitionT>
- Expected<BaseLayerModuleHandleT>
+ Expected<VModuleKey>
emitPartition(LogicalDylib &LD,
typename LogicalDylib::SourceModuleHandle LMId,
const PartitionT &Part) {
@@ -596,28 +660,62 @@ private:
for (auto *F : Part)
moveFunctionBody(*F, VMap, &Materializer);
+ auto K = ES.allocateVModule();
+
+ auto LegacyLookup = [this, &LD](const std::string &Name) -> JITSymbol {
+ return LD.findSymbol(BaseLayer, Name, false);
+ };
+
// Create memory manager and symbol resolver.
- auto Resolver = createLambdaResolver(
- [this, &LD](const std::string &Name) -> JITSymbol {
- if (auto Sym = LD.findSymbol(BaseLayer, Name, false))
- return Sym;
- else if (auto Err = Sym.takeError())
- return std::move(Err);
- return LD.ExternalSymbolResolver->findSymbolInLogicalDylib(Name);
+ auto Resolver = createSymbolResolver(
+ [&LD, LegacyLookup](const SymbolNameSet &Symbols) {
+ auto SymbolFlags = lookupFlagsWithLegacyFn(Symbols, LegacyLookup);
+ if (!SymbolFlags) {
+ logAllUnhandledErrors(SymbolFlags.takeError(), errs(),
+ "CODLayer/SubResolver flags lookup failed: ");
+ return SymbolFlagsMap();
+ }
+
+ if (SymbolFlags->size() == Symbols.size())
+ return *SymbolFlags;
+
+ SymbolNameSet NotFoundViaLegacyLookup;
+ for (auto &S : Symbols)
+ if (!SymbolFlags->count(S))
+ NotFoundViaLegacyLookup.insert(S);
+
+ auto SymbolFlags2 =
+ LD.BackingResolver->lookupFlags(NotFoundViaLegacyLookup);
+
+ for (auto &KV : SymbolFlags2)
+ (*SymbolFlags)[KV.first] = std::move(KV.second);
+
+ return *SymbolFlags;
},
- [&LD](const std::string &Name) {
- return LD.ExternalSymbolResolver->findSymbol(Name);
+ [this, &LD, LegacyLookup](std::shared_ptr<AsynchronousSymbolQuery> Q,
+ SymbolNameSet Symbols) {
+ auto NotFoundViaLegacyLookup =
+ lookupWithLegacyFn(ES, *Q, Symbols, LegacyLookup);
+ return LD.BackingResolver->lookup(Q,
+ std::move(NotFoundViaLegacyLookup));
});
+ SetSymbolResolver(K, std::move(Resolver));
+
+ if (auto Err = BaseLayer.addModule(std::move(K), std::move(M)))
+ return std::move(Err);
- return BaseLayer.addModule(std::move(M), std::move(Resolver));
+ return K;
}
+ ExecutionSession &ES;
BaseLayerT &BaseLayer;
+ SymbolResolverGetter GetSymbolResolver;
+ SymbolResolverSetter SetSymbolResolver;
PartitioningFtor Partition;
CompileCallbackMgrT &CompileCallbackMgr;
IndirectStubsManagerBuilderT CreateIndirectStubsManager;
- LogicalDylibList LogicalDylibs;
+ std::map<VModuleKey, LogicalDylib> LogicalDylibs;
bool CloneStubsIntoPartitions;
};
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h
index b9f7d6accc30..213a59124c85 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h
@@ -16,13 +16,14 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ExecutionEngine/ObjectCache.h"
-#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
@@ -35,45 +36,51 @@ class Module;
namespace orc {
-/// @brief Simple compile functor: Takes a single IR module and returns an
-/// ObjectFile.
+/// Simple compile functor: Takes a single IR module and returns a buffer
+/// containing the corresponding compiled object.
+/// This compiler supports a single compilation thread and LLVMContext only.
+/// For multithreaded compilation, use MultiThreadedSimpleCompiler below.
class SimpleCompiler {
public:
+ using CompileResult = std::unique_ptr<MemoryBuffer>;
- using CompileResult = object::OwningBinary<object::ObjectFile>;
-
- /// @brief Construct a simple compile functor with the given target.
+ /// Construct a simple compile functor with the given target.
SimpleCompiler(TargetMachine &TM, ObjectCache *ObjCache = nullptr)
: TM(TM), ObjCache(ObjCache) {}
- /// @brief Set an ObjectCache to query before compiling.
+ /// Set an ObjectCache to query before compiling.
void setObjectCache(ObjectCache *NewCache) { ObjCache = NewCache; }
- /// @brief Compile a Module to an ObjectFile.
+  /// Compile a Module, returning the compiled object as a MemoryBuffer.
CompileResult operator()(Module &M) {
CompileResult CachedObject = tryToLoadFromObjectCache(M);
- if (CachedObject.getBinary())
+ if (CachedObject)
return CachedObject;
SmallVector<char, 0> ObjBufferSV;
- raw_svector_ostream ObjStream(ObjBufferSV);
-
- legacy::PassManager PM;
- MCContext *Ctx;
- if (TM.addPassesToEmitMC(PM, Ctx, ObjStream))
- llvm_unreachable("Target does not support MC emission.");
- PM.run(M);
- std::unique_ptr<MemoryBuffer> ObjBuffer(
- new ObjectMemoryBuffer(std::move(ObjBufferSV)));
- Expected<std::unique_ptr<object::ObjectFile>> Obj =
+
+ {
+ raw_svector_ostream ObjStream(ObjBufferSV);
+
+ legacy::PassManager PM;
+ MCContext *Ctx;
+ if (TM.addPassesToEmitMC(PM, Ctx, ObjStream))
+ llvm_unreachable("Target does not support MC emission.");
+ PM.run(M);
+ }
+
+ auto ObjBuffer =
+ llvm::make_unique<SmallVectorMemoryBuffer>(std::move(ObjBufferSV));
+ auto Obj =
object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef());
+
if (Obj) {
notifyObjectCompiled(M, *ObjBuffer);
- return CompileResult(std::move(*Obj), std::move(ObjBuffer));
+ return std::move(ObjBuffer);
}
+
// TODO: Actually report errors helpfully.
consumeError(Obj.takeError());
- return CompileResult(nullptr, nullptr);
+ return nullptr;
}
private:
@@ -82,19 +89,7 @@ private:
if (!ObjCache)
return CompileResult();
- std::unique_ptr<MemoryBuffer> ObjBuffer = ObjCache->getObject(&M);
- if (!ObjBuffer)
- return CompileResult();
-
- Expected<std::unique_ptr<object::ObjectFile>> Obj =
- object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef());
- if (!Obj) {
- // TODO: Actually report errors helpfully.
- consumeError(Obj.takeError());
- return CompileResult();
- }
-
- return CompileResult(std::move(*Obj), std::move(ObjBuffer));
+ return ObjCache->getObject(&M);
}
void notifyObjectCompiled(const Module &M, const MemoryBuffer &ObjBuffer) {
@@ -106,6 +101,29 @@ private:
ObjectCache *ObjCache = nullptr;
};
+/// A thread-safe version of SimpleCompiler.
+///
+/// This class creates a new TargetMachine and SimpleCompiler instance for each
+/// compile.
+class MultiThreadedSimpleCompiler {
+public:
+ MultiThreadedSimpleCompiler(JITTargetMachineBuilder JTMB,
+ ObjectCache *ObjCache = nullptr)
+ : JTMB(std::move(JTMB)), ObjCache(ObjCache) {}
+
+ void setObjectCache(ObjectCache *ObjCache) { this->ObjCache = ObjCache; }
+
+ std::unique_ptr<MemoryBuffer> operator()(Module &M) {
+ auto TM = cantFail(JTMB.createTargetMachine());
+ SimpleCompiler C(*TM, ObjCache);
+ return C(M);
+ }
+
+private:
+ JITTargetMachineBuilder JTMB;
+ ObjectCache *ObjCache = nullptr;
+};
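A hedged usage sketch for the two compilers above, assuming an llvm::Module M already exists and that host detection succeeds (cantFail is used only to keep the example short):

auto JTMB = cantFail(JITTargetMachineBuilder::detectHost());
MultiThreadedSimpleCompiler Compile(std::move(JTMB));

// Safe to call from multiple threads: each call builds a fresh TargetMachine
// and a private SimpleCompiler, then returns the compiled object buffer.
std::unique_ptr<MemoryBuffer> Obj = Compile(M);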
+
} // end namespace orc
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/Core.h
new file mode 100644
index 000000000000..fd03687cfc21
--- /dev/null
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -0,0 +1,779 @@
+//===------ Core.h -- Core ORC APIs (Layer, JITDylib, etc.) -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains core ORC APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_CORE_H
+#define LLVM_EXECUTIONENGINE_ORC_CORE_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/IR/Module.h"
+
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+// Forward declare some classes.
+class AsynchronousSymbolQuery;
+class ExecutionSession;
+class MaterializationUnit;
+class MaterializationResponsibility;
+class VSO;
+
+/// VModuleKey provides a unique identifier (allocated and managed by
+/// ExecutionSessions) for a module added to the JIT.
+using VModuleKey = uint64_t;
+
+/// A set of symbol names (represented by SymbolStringPtrs for
+/// efficiency).
+using SymbolNameSet = std::set<SymbolStringPtr>;
+
+/// Render a SymbolNameSet to an ostream.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols);
+
+/// A map from symbol names (as SymbolStringPtrs) to JITSymbols
+/// (address/flags pairs).
+using SymbolMap = std::map<SymbolStringPtr, JITEvaluatedSymbol>;
+
+/// Render a SymbolMap to an ostream.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols);
+
+/// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags.
+using SymbolFlagsMap = std::map<SymbolStringPtr, JITSymbolFlags>;
+
+/// Render a SymbolFlagsMap to an ostream.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &Symbols);
+
+/// A map from VSOs to sets of symbols in those VSOs. Used to represent the
+/// dependencies of symbols that are being materialized.
+using SymbolDependenceMap = std::map<VSO *, SymbolNameSet>;
+
+/// Render a SymbolDependenceMap.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps);
+
+/// A list of VSO pointers.
+using VSOList = std::vector<VSO *>;
+
+/// Render a VSOList.
+raw_ostream &operator<<(raw_ostream &OS, const VSOList &VSOs);
+
+/// Callback to notify client that symbols have been resolved.
+using SymbolsResolvedCallback = std::function<void(Expected<SymbolMap>)>;
+
+/// Callback to notify client that symbols are ready for execution.
+using SymbolsReadyCallback = std::function<void(Error)>;
+
+/// Callback to register the dependencies for a given query.
+using RegisterDependenciesFunction =
+ std::function<void(const SymbolDependenceMap &)>;
+
+/// This can be used as the value for a RegisterDependenciesFunction if there
+/// are no dependants to register with.
+extern RegisterDependenciesFunction NoDependenciesToRegister;
+
+/// Used to notify a VSO that the given set of symbols failed to materialize.
+class FailedToMaterialize : public ErrorInfo<FailedToMaterialize> {
+public:
+ static char ID;
+
+ FailedToMaterialize(SymbolNameSet Symbols);
+ std::error_code convertToErrorCode() const override;
+ void log(raw_ostream &OS) const override;
+ const SymbolNameSet &getSymbols() const { return Symbols; }
+
+private:
+ SymbolNameSet Symbols;
+};
+
+/// Used to notify clients when symbols cannot be found during a lookup.
+class SymbolsNotFound : public ErrorInfo<SymbolsNotFound> {
+public:
+ static char ID;
+
+ SymbolsNotFound(SymbolNameSet Symbols);
+ std::error_code convertToErrorCode() const override;
+ void log(raw_ostream &OS) const override;
+ const SymbolNameSet &getSymbols() const { return Symbols; }
+
+private:
+ SymbolNameSet Symbols;
+};
+
+/// Tracks responsibility for materialization, and mediates interactions between
+/// MaterializationUnits and VSOs.
+///
+/// An instance of this class is passed to MaterializationUnits when their
+/// materialize method is called. It allows MaterializationUnits to resolve and
+/// finalize symbols, or abandon materialization by notifying any unmaterialized
+/// symbols of an error.
+class MaterializationResponsibility {
+ friend class MaterializationUnit;
+public:
+ MaterializationResponsibility(MaterializationResponsibility &&) = default;
+ MaterializationResponsibility &
+ operator=(MaterializationResponsibility &&) = default;
+
+ /// Destruct a MaterializationResponsibility instance. In debug mode
+ /// this asserts that all symbols being tracked have been either
+ /// finalized or notified of an error.
+ ~MaterializationResponsibility();
+
+ /// Returns the target VSO that these symbols are being materialized
+ /// into.
+ VSO &getTargetVSO() const { return V; }
+
+ /// Returns the symbol flags map for this responsibility instance.
+ SymbolFlagsMap getSymbols() { return SymbolFlags; }
+
+ /// Returns the names of any symbols covered by this
+ /// MaterializationResponsibility object that have queries pending. This
+ /// information can be used to return responsibility for unrequested symbols
+ /// back to the VSO via the delegate method.
+ SymbolNameSet getRequestedSymbols();
+
+ /// Resolves the given symbols. Individual calls to this method may
+ /// resolve a subset of the symbols, but all symbols must have been
+ /// resolved prior to calling finalize.
+ void resolve(const SymbolMap &Symbols);
+
+ /// Finalizes all symbols tracked by this instance.
+ void finalize();
+
+ /// Adds new symbols to the VSO and this responsibility instance.
+ /// VSO entries start out in the materializing state.
+ ///
+ /// This method can be used by materialization units that want to add
+ /// additional symbols at materialization time (e.g. stubs, compile
+ /// callbacks, metadata).
+ Error defineMaterializing(const SymbolFlagsMap &SymbolFlags);
+
+ /// Notify all unfinalized symbols that an error has occurred.
+  /// This will remove all symbols covered by this MaterializationResponsibility
+ /// from V, and send an error to any queries waiting on these symbols.
+ void failMaterialization();
+
+ /// Transfers responsibility to the given MaterializationUnit for all
+ /// symbols defined by that MaterializationUnit. This allows
+ /// materializers to break up work based on run-time information (e.g.
+ /// by introspecting which symbols have actually been looked up and
+ /// materializing only those).
+ void replace(std::unique_ptr<MaterializationUnit> MU);
+
+ /// Delegates responsibility for the given symbols to the returned
+ /// materialization responsibility. Useful for breaking up work between
+ /// threads, or different kinds of materialization processes.
+ MaterializationResponsibility delegate(const SymbolNameSet &Symbols);
+
+ void addDependencies(const SymbolStringPtr &Name,
+ const SymbolDependenceMap &Dependencies);
+
+ /// Add dependencies that apply to all symbols covered by this instance.
+ void addDependenciesForAll(const SymbolDependenceMap &Dependencies);
+
+private:
+ /// Create a MaterializationResponsibility for the given VSO and
+ /// initial symbols.
+ MaterializationResponsibility(VSO &V, SymbolFlagsMap SymbolFlags);
+
+ VSO &V;
+ SymbolFlagsMap SymbolFlags;
+};
+
+/// A MaterializationUnit represents a set of symbol definitions that can
+/// be materialized as a group, or individually discarded (when
+/// overriding definitions are encountered).
+///
+/// MaterializationUnits are used when providing lazy definitions of symbols to
+/// VSOs. The VSO will call materialize when the address of a symbol is
+/// requested via the lookup method. The VSO will call discard if a stronger
+/// definition is added or already present.
+class MaterializationUnit {
+public:
+  MaterializationUnit(SymbolFlagsMap InitialSymbolFlags)
+      : SymbolFlags(std::move(InitialSymbolFlags)) {}
+
+ virtual ~MaterializationUnit() {}
+
+ /// Return the set of symbols that this source provides.
+ const SymbolFlagsMap &getSymbols() const { return SymbolFlags; }
+
+ /// Called by materialization dispatchers (see
+ /// ExecutionSession::DispatchMaterializationFunction) to trigger
+ /// materialization of this MaterializationUnit.
+ void doMaterialize(VSO &V) {
+ materialize(MaterializationResponsibility(V, std::move(SymbolFlags)));
+ }
+
+ /// Called by VSOs to notify MaterializationUnits that the given symbol has
+ /// been overridden.
+ void doDiscard(const VSO &V, SymbolStringPtr Name) {
+ SymbolFlags.erase(Name);
+ discard(V, std::move(Name));
+ }
+
+protected:
+ SymbolFlagsMap SymbolFlags;
+
+private:
+ virtual void anchor();
+
+ /// Implementations of this method should materialize all symbols
+  /// in the materialization unit, except for those that have been
+ /// previously discarded.
+ virtual void materialize(MaterializationResponsibility R) = 0;
+
+ /// Implementations of this method should discard the given symbol
+ /// from the source (e.g. if the source is an LLVM IR Module and the
+ /// symbol is a function, delete the function body or mark it available
+ /// externally).
+ virtual void discard(const VSO &V, SymbolStringPtr Name) = 0;
+};
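As an illustrative sketch of the materialize/discard protocol (this mirrors what the absoluteSymbols helper below provides; the class name is hypothetical and error handling is elided):

class ConstantSymbolMaterializationUnit : public MaterializationUnit {
public:
  ConstantSymbolMaterializationUnit(SymbolStringPtr Name, JITEvaluatedSymbol Sym)
      : MaterializationUnit(SymbolFlagsMap({{Name, Sym.getFlags()}})),
        Name(std::move(Name)), Sym(Sym) {}

private:
  void materialize(MaterializationResponsibility R) override {
    // Resolve the one symbol we are responsible for, then mark it ready.
    R.resolve(SymbolMap({{Name, Sym}}));
    R.finalize();
  }

  void discard(const VSO &V, SymbolStringPtr DiscardedName) override {
    // Nothing to release for a constant definition.
  }

  SymbolStringPtr Name;
  JITEvaluatedSymbol Sym;
};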
+
+using MaterializationUnitList =
+ std::vector<std::unique_ptr<MaterializationUnit>>;
+
+/// A MaterializationUnit implementation for pre-existing absolute symbols.
+///
+/// All symbols will be resolved and marked ready as soon as the unit is
+/// materialized.
+class AbsoluteSymbolsMaterializationUnit : public MaterializationUnit {
+public:
+ AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols);
+
+private:
+ void materialize(MaterializationResponsibility R) override;
+ void discard(const VSO &V, SymbolStringPtr Name) override;
+ static SymbolFlagsMap extractFlags(const SymbolMap &Symbols);
+
+ SymbolMap Symbols;
+};
+
+/// Create an AbsoluteSymbolsMaterializationUnit with the given symbols.
+/// Useful for inserting absolute symbols into a VSO. E.g.:
+/// \code{.cpp}
+/// VSO &V = ...;
+/// SymbolStringPtr Foo = ...;
+/// JITEvaluatedSymbol FooSym = ...;
+/// if (auto Err = V.define(absoluteSymbols({{Foo, FooSym}})))
+/// return Err;
+/// \endcode
+///
+inline std::unique_ptr<AbsoluteSymbolsMaterializationUnit>
+absoluteSymbols(SymbolMap Symbols) {
+ return llvm::make_unique<AbsoluteSymbolsMaterializationUnit>(
+ std::move(Symbols));
+}
+
+struct SymbolAliasMapEntry {
+ SymbolAliasMapEntry() = default;
+ SymbolAliasMapEntry(SymbolStringPtr Aliasee, JITSymbolFlags AliasFlags)
+ : Aliasee(std::move(Aliasee)), AliasFlags(AliasFlags) {}
+
+ SymbolStringPtr Aliasee;
+ JITSymbolFlags AliasFlags;
+};
+
+/// A map of Symbols to (Symbol, Flags) pairs.
+using SymbolAliasMap = std::map<SymbolStringPtr, SymbolAliasMapEntry>;
+
+/// A materialization unit for symbol aliases. Allows existing symbols to be
+/// aliased with alternate flags.
+class ReExportsMaterializationUnit : public MaterializationUnit {
+public:
+ /// SourceVSO is allowed to be nullptr, in which case the source VSO is
+ /// taken to be whatever VSO these definitions are materialized in. This
+ /// is useful for defining aliases within a VSO.
+ ///
+ /// Note: Care must be taken that no sets of aliases form a cycle, as such
+ /// a cycle will result in a deadlock when any symbol in the cycle is
+ /// resolved.
+ ReExportsMaterializationUnit(VSO *SourceVSO, SymbolAliasMap Aliases);
+
+private:
+ void materialize(MaterializationResponsibility R) override;
+ void discard(const VSO &V, SymbolStringPtr Name) override;
+ static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases);
+
+ VSO *SourceVSO = nullptr;
+ SymbolAliasMap Aliases;
+};
+
+/// Create a ReExportsMaterializationUnit with the given aliases.
+/// Useful for defining symbol aliases. E.g., given a VSO V containing symbols
+/// "foo" and "bar", we can define aliases "baz" (for "foo") and "qux" (for
+/// "bar") with:
+/// \code{.cpp}
+/// SymbolStringPtr Baz = ...;
+/// SymbolStringPtr Qux = ...;
+/// if (auto Err = V.define(symbolAliases({
+/// {Baz, { Foo, JITSymbolFlags::Exported }},
+///        {Qux, { Bar, JITSymbolFlags::Weak }}})))
+/// return Err;
+/// \endcode
+inline std::unique_ptr<ReExportsMaterializationUnit>
+symbolAliases(SymbolAliasMap Aliases) {
+ return llvm::make_unique<ReExportsMaterializationUnit>(nullptr,
+ std::move(Aliases));
+}
+
+/// Create a materialization unit for re-exporting symbols from another VSO
+/// with alternative names/flags.
+inline std::unique_ptr<ReExportsMaterializationUnit>
+reexports(VSO &SourceV, SymbolAliasMap Aliases) {
+ return llvm::make_unique<ReExportsMaterializationUnit>(&SourceV,
+ std::move(Aliases));
+}
+
+/// Build a SymbolAliasMap for the common case where you want to re-export
+/// symbols from another VSO with the same linkage/flags.
+Expected<SymbolAliasMap>
+buildSimpleReexportsAliasMap(VSO &SourceV, const SymbolNameSet &Symbols);
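A hedged sketch tying the re-export helpers together: publish two symbols from SrcV into DstV under their original names and flags (Foo and Bar are assumed to have been interned earlier):

Error reexportFooAndBar(VSO &DstV, VSO &SrcV, SymbolStringPtr Foo,
                        SymbolStringPtr Bar) {
  auto AliasMap = buildSimpleReexportsAliasMap(SrcV, SymbolNameSet({Foo, Bar}));
  if (!AliasMap)
    return AliasMap.takeError();
  // DstV now re-exports Foo and Bar; lookups in DstV defer to SrcV.
  return DstV.define(reexports(SrcV, std::move(*AliasMap)));
}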
+
+/// Base utilities for ExecutionSession.
+class ExecutionSessionBase {
+ // FIXME: Remove this when we remove the old ORC layers.
+ friend class VSO;
+
+public:
+ /// For reporting errors.
+ using ErrorReporter = std::function<void(Error)>;
+
+ /// For dispatching MaterializationUnit::materialize calls.
+ using DispatchMaterializationFunction =
+ std::function<void(VSO &V, std::unique_ptr<MaterializationUnit> MU)>;
+
+ /// Construct an ExecutionSessionBase.
+ ///
+ /// SymbolStringPools may be shared between ExecutionSessions.
+ ExecutionSessionBase(std::shared_ptr<SymbolStringPool> SSP = nullptr)
+ : SSP(SSP ? std::move(SSP) : std::make_shared<SymbolStringPool>()) {}
+
+ /// Returns the SymbolStringPool for this ExecutionSession.
+ SymbolStringPool &getSymbolStringPool() const { return *SSP; }
+
+ /// Run the given lambda with the session mutex locked.
+ template <typename Func> auto runSessionLocked(Func &&F) -> decltype(F()) {
+ std::lock_guard<std::recursive_mutex> Lock(SessionMutex);
+ return F();
+ }
+
+ /// Set the error reporter function.
+ ExecutionSessionBase &setErrorReporter(ErrorReporter ReportError) {
+ this->ReportError = std::move(ReportError);
+ return *this;
+ }
+
+ /// Set the materialization dispatch function.
+ ExecutionSessionBase &setDispatchMaterialization(
+ DispatchMaterializationFunction DispatchMaterialization) {
+ this->DispatchMaterialization = std::move(DispatchMaterialization);
+ return *this;
+ }
+
+  /// Report an error for this execution session.
+ ///
+ /// Unhandled errors can be sent here to log them.
+ void reportError(Error Err) { ReportError(std::move(Err)); }
+
+ /// Allocate a module key for a new module to add to the JIT.
+ VModuleKey allocateVModule() { return ++LastKey; }
+
+ /// Return a module key to the ExecutionSession so that it can be
+ /// re-used. This should only be done once all resources associated
+ /// with the original key have been released.
+ void releaseVModule(VModuleKey Key) { /* FIXME: Recycle keys */
+ }
+
+ void legacyFailQuery(AsynchronousSymbolQuery &Q, Error Err);
+
+ using LegacyAsyncLookupFunction = std::function<SymbolNameSet(
+ std::shared_ptr<AsynchronousSymbolQuery> Q, SymbolNameSet Names)>;
+
+ /// A legacy lookup function for JITSymbolResolverAdapter.
+ /// Do not use -- this will be removed soon.
+ Expected<SymbolMap>
+ legacyLookup(ExecutionSessionBase &ES, LegacyAsyncLookupFunction AsyncLookup,
+               SymbolNameSet Names, bool WaitUntilReady,
+ RegisterDependenciesFunction RegisterDependencies);
+
+ /// Search the given VSO list for the given symbols.
+  ///
+ /// The OnResolve callback will be called once all requested symbols are
+ /// resolved, or if an error occurs prior to resolution.
+ ///
+ /// The OnReady callback will be called once all requested symbols are ready,
+ /// or if an error occurs after resolution but before all symbols are ready.
+ ///
+ /// If all symbols are found, the RegisterDependencies function will be called
+ /// while the session lock is held. This gives clients a chance to register
+  /// dependencies on the queried symbols for any symbols they are
+ /// materializing (if a MaterializationResponsibility instance is present,
+ /// this can be implemented by calling
+ /// MaterializationResponsibility::addDependencies). If there are no
+  /// dependent symbols for this query (e.g. it is being made by a top level
+ /// client to get an address to call) then the value NoDependenciesToRegister
+ /// can be used.
+ void lookup(const VSOList &VSOs, const SymbolNameSet &Symbols,
+ SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
+ RegisterDependenciesFunction RegisterDependencies);
+
+ /// Blocking version of lookup above. Returns the resolved symbol map.
+ /// If WaitUntilReady is true (the default), will not return until all
+ /// requested symbols are ready (or an error occurs). If WaitUntilReady is
+ /// false, will return as soon as all requested symbols are resolved,
+ /// or an error occurs. If WaitUntilReady is false and an error occurs
+ /// after resolution, the function will return a success value, but the
+  /// error will be reported via reportError.
+ Expected<SymbolMap> lookup(const VSOList &VSOs, const SymbolNameSet &Symbols,
+ RegisterDependenciesFunction RegisterDependencies,
+ bool WaitUntilReady = true);
+
+ /// Materialize the given unit.
+ void dispatchMaterialization(VSO &V,
+ std::unique_ptr<MaterializationUnit> MU) {
+ DispatchMaterialization(V, std::move(MU));
+ }
+
+private:
+ static void logErrorsToStdErr(Error Err) {
+ logAllUnhandledErrors(std::move(Err), errs(), "JIT session error: ");
+ }
+
+ static void
+ materializeOnCurrentThread(VSO &V, std::unique_ptr<MaterializationUnit> MU) {
+ MU->doMaterialize(V);
+ }
+
+ void runOutstandingMUs();
+
+ mutable std::recursive_mutex SessionMutex;
+ std::shared_ptr<SymbolStringPool> SSP;
+ VModuleKey LastKey = 0;
+ ErrorReporter ReportError = logErrorsToStdErr;
+ DispatchMaterializationFunction DispatchMaterialization =
+ materializeOnCurrentThread;
+
+ // FIXME: Remove this (and runOutstandingMUs) once the linking layer works
+ // with callbacks from asynchronous queries.
+ mutable std::recursive_mutex OutstandingMUsMutex;
+ std::vector<std::pair<VSO *, std::unique_ptr<MaterializationUnit>>>
+ OutstandingMUs;
+};
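A hedged sketch of the asynchronous lookup form described above; the VSO and the interned name are assumed to exist, and failures are simply routed to reportError:

void lookupFooAsync(ExecutionSessionBase &ES, VSO &V, SymbolStringPtr Foo) {
  SymbolsResolvedCallback OnResolve = [&ES](Expected<SymbolMap> Result) {
    if (!Result)
      return ES.reportError(Result.takeError());
    // Addresses are known here, but the symbols may not yet be safe to run.
  };
  SymbolsReadyCallback OnReady = [&ES](Error Err) {
    if (Err)
      ES.reportError(std::move(Err));
    // All requested symbols are now safe to execute.
  };
  ES.lookup(VSOList({&V}), SymbolNameSet({Foo}), std::move(OnResolve),
            std::move(OnReady), NoDependenciesToRegister);
}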
+
+/// A symbol query that returns results via a callback when results are
+/// ready.
+///
+/// Makes a callback when all symbols are available.
+class AsynchronousSymbolQuery {
+ friend class ExecutionSessionBase;
+ friend class VSO;
+
+public:
+
+ /// Create a query for the given symbols, notify-resolved and
+ /// notify-ready callbacks.
+ AsynchronousSymbolQuery(const SymbolNameSet &Symbols,
+ SymbolsResolvedCallback NotifySymbolsResolved,
+ SymbolsReadyCallback NotifySymbolsReady);
+
+ /// Set the resolved symbol information for the given symbol name.
+ void resolve(const SymbolStringPtr &Name, JITEvaluatedSymbol Sym);
+
+ /// Returns true if all symbols covered by this query have been
+ /// resolved.
+ bool isFullyResolved() const { return NotYetResolvedCount == 0; }
+
+ /// Call the NotifySymbolsResolved callback.
+ ///
+ /// This should only be called if all symbols covered by the query have been
+ /// resolved.
+ void handleFullyResolved();
+
+ /// Notify the query that a requested symbol is ready for execution.
+ void notifySymbolReady();
+
+ /// Returns true if all symbols covered by this query are ready.
+ bool isFullyReady() const { return NotYetReadyCount == 0; }
+
+ /// Calls the NotifySymbolsReady callback.
+ ///
+ /// This should only be called if all symbols covered by this query are ready.
+ void handleFullyReady();
+
+private:
+ void addQueryDependence(VSO &V, SymbolStringPtr Name);
+
+ void removeQueryDependence(VSO &V, const SymbolStringPtr &Name);
+
+ bool canStillFail();
+
+ void handleFailed(Error Err);
+
+ void detach();
+
+ SymbolsResolvedCallback NotifySymbolsResolved;
+ SymbolsReadyCallback NotifySymbolsReady;
+ SymbolDependenceMap QueryRegistrations;
+ SymbolMap ResolvedSymbols;
+ size_t NotYetResolvedCount;
+ size_t NotYetReadyCount;
+};
+
+/// A symbol table that supports asynchronous symbol queries.
+///
+/// Represents a virtual shared object. Instances can not be copied or moved, so
+/// their addresses may be used as keys for resource management.
+/// VSO state changes must be made via an ExecutionSession to guarantee that
+/// they are synchronized with respect to other VSO operations.
+class VSO {
+ friend class AsynchronousSymbolQuery;
+ friend class ExecutionSession;
+ friend class ExecutionSessionBase;
+ friend class MaterializationResponsibility;
+public:
+ using FallbackDefinitionGeneratorFunction =
+ std::function<SymbolNameSet(VSO &Parent, const SymbolNameSet &Names)>;
+
+ using AsynchronousSymbolQuerySet =
+ std::set<std::shared_ptr<AsynchronousSymbolQuery>>;
+
+ VSO(const VSO &) = delete;
+ VSO &operator=(const VSO &) = delete;
+ VSO(VSO &&) = delete;
+ VSO &operator=(VSO &&) = delete;
+
+ /// Get the name for this VSO.
+ const std::string &getName() const { return VSOName; }
+
+ /// Get a reference to the ExecutionSession for this VSO.
+ ExecutionSessionBase &getExecutionSession() const { return ES; }
+
+  /// Set a fallback definition generator. If set, lookup and lookupFlags will
+ /// pass the unresolved symbols set to the fallback definition generator,
+ /// allowing it to add a new definition to the VSO.
+ void setFallbackDefinitionGenerator(
+ FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator) {
+ this->FallbackDefinitionGenerator = std::move(FallbackDefinitionGenerator);
+ }
+
+  /// Set the search order to be used when fixing up definitions in this VSO.
+ /// This will replace the previous search order, and apply to any symbol
+ /// resolutions made for definitions in this VSO after the call to
+ /// setSearchOrder (even if the definition itself was added before the
+ /// call).
+ ///
+  /// If SearchThisVSOFirst is set (the default), then this VSO will
+ /// add itself to the beginning of the SearchOrder (Clients should *not*
+ /// put this VSO in the list in this case, to avoid redundant lookups).
+ ///
+ /// If SearchThisVSOFirst is false then the search order will be used as
+ /// given. The main motivation for this feature is to support deliberate
+ /// shadowing of symbols in this VSO by a facade VSO. For example, the
+ /// facade may resolve function names to stubs, and the stubs may compile
+ /// lazily by looking up symbols in this dylib. Adding the facade dylib
+ /// as the first in the search order (instead of this dylib) ensures that
+ /// definitions within this dylib resolve to the lazy-compiling stubs,
+ /// rather than immediately materializing the definitions in this dylib.
+ void setSearchOrder(VSOList NewSearchOrder, bool SearchThisVSOFirst = true);
+
+ /// Add the given VSO to the search order for definitions in this VSO.
+ void addToSearchOrder(VSO &V);
+
+ /// Replace OldV with NewV in the search order if OldV is present. Otherwise
+ /// this operation is a no-op.
+ void replaceInSearchOrder(VSO &OldV, VSO &NewV);
+
+ /// Remove the given VSO from the search order for this VSO if it is
+ /// present. Otherwise this operation is a no-op.
+ void removeFromSearchOrder(VSO &V);
+
+ /// Do something with the search order (run under the session lock).
+ template <typename Func>
+ auto withSearchOrderDo(Func &&F)
+ -> decltype(F(std::declval<const VSOList &>())) {
+ return ES.runSessionLocked([&]() { return F(SearchOrder); });
+ }
+
+ /// Define all symbols provided by the materialization unit to be part
+  /// of this VSO.
+ template <typename UniquePtrToMaterializationUnit>
+ typename std::enable_if<
+ std::is_convertible<
+ typename std::decay<UniquePtrToMaterializationUnit>::type,
+ std::unique_ptr<MaterializationUnit>>::value,
+ Error>::type
+ define(UniquePtrToMaterializationUnit &&MU) {
+ return ES.runSessionLocked([&, this]() -> Error {
+ assert(MU && "Can't define with a null MU");
+
+ if (auto Err = defineImpl(*MU))
+ return Err;
+
+      // defineImpl succeeded.
+ auto UMI = std::make_shared<UnmaterializedInfo>(std::move(MU));
+ for (auto &KV : UMI->MU->getSymbols())
+ UnmaterializedInfos[KV.first] = UMI;
+
+ return Error::success();
+ });
+ }
+
+  /// Search this VSO for the given symbol names. Returns the flags for each
+  /// symbol found; names that could not be found are omitted from the result.
+ SymbolFlagsMap lookupFlags(const SymbolNameSet &Names);
+
+ /// Dump current VSO state to OS.
+ void dump(raw_ostream &OS);
+
+ /// FIXME: Remove this when we remove the old ORC layers.
+  /// Search this VSO for the symbols in Names. Results
+ /// (once they become available) will be returned via the given Query.
+ ///
+ /// If any symbol is not found then the unresolved symbols will be returned,
+ /// and the query will not be applied. The Query is not failed and can be
+ /// re-used in a subsequent lookup once the symbols have been added, or
+ /// manually failed.
+ SymbolNameSet legacyLookup(std::shared_ptr<AsynchronousSymbolQuery> Q,
+ SymbolNameSet Names);
+
+private:
+ using AsynchronousSymbolQueryList =
+ std::vector<std::shared_ptr<AsynchronousSymbolQuery>>;
+
+ struct UnmaterializedInfo {
+ UnmaterializedInfo(std::unique_ptr<MaterializationUnit> MU)
+ : MU(std::move(MU)) {}
+
+ std::unique_ptr<MaterializationUnit> MU;
+ };
+
+ using UnmaterializedInfosMap =
+ std::map<SymbolStringPtr, std::shared_ptr<UnmaterializedInfo>>;
+
+ struct MaterializingInfo {
+ AsynchronousSymbolQueryList PendingQueries;
+ SymbolDependenceMap Dependants;
+ SymbolDependenceMap UnfinalizedDependencies;
+ bool IsFinalized = false;
+ };
+
+ using MaterializingInfosMap = std::map<SymbolStringPtr, MaterializingInfo>;
+
+ using LookupImplActionFlags = enum {
+ None = 0,
+ NotifyFullyResolved = 1 << 0U,
+ NotifyFullyReady = 1 << 1U,
+ LLVM_MARK_AS_BITMASK_ENUM(NotifyFullyReady)
+ };
+
+ VSO(ExecutionSessionBase &ES, std::string Name);
+
+ Error defineImpl(MaterializationUnit &MU);
+
+ SymbolNameSet lookupFlagsImpl(SymbolFlagsMap &Flags,
+ const SymbolNameSet &Names);
+
+ void lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
+ SymbolNameSet &Unresolved, MaterializationUnitList &MUs);
+
+ void lodgeQueryImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
+ SymbolNameSet &Unresolved, MaterializationUnitList &MUs);
+
+ LookupImplActionFlags
+ lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
+ std::vector<std::unique_ptr<MaterializationUnit>> &MUs,
+ SymbolNameSet &Unresolved);
+
+ void detachQueryHelper(AsynchronousSymbolQuery &Q,
+ const SymbolNameSet &QuerySymbols);
+
+ void transferFinalizedNodeDependencies(MaterializingInfo &DependantMI,
+ const SymbolStringPtr &DependantName,
+ MaterializingInfo &FinalizedMI);
+
+ Error defineMaterializing(const SymbolFlagsMap &SymbolFlags);
+
+ void replace(std::unique_ptr<MaterializationUnit> MU);
+
+ SymbolNameSet getRequestedSymbols(const SymbolFlagsMap &SymbolFlags);
+
+ void addDependencies(const SymbolStringPtr &Name,
+ const SymbolDependenceMap &Dependants);
+
+ void resolve(const SymbolMap &Resolved);
+
+ void finalize(const SymbolFlagsMap &Finalized);
+
+ void notifyFailed(const SymbolNameSet &FailedSymbols);
+
+ ExecutionSessionBase &ES;
+ std::string VSOName;
+ SymbolMap Symbols;
+ UnmaterializedInfosMap UnmaterializedInfos;
+ MaterializingInfosMap MaterializingInfos;
+ FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator;
+ VSOList SearchOrder;
+};
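A short sketch of the search-order behaviour described in setSearchOrder above, assuming the VSOs involved were created elsewhere:

// MainV searches its own definitions first, then falls back to UtilsV.
MainV.setSearchOrder(VSOList({&UtilsV})); // MainV is implicitly prepended.

// Facade arrangement: definitions in ImplV resolve through StubsV first, so
// callers reach lazy-compiling stubs instead of forcing immediate compilation.
ImplV.setSearchOrder(VSOList({&StubsV, &ImplV}), /*SearchThisVSOFirst=*/false);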
+
+/// An ExecutionSession represents a running JIT program.
+class ExecutionSession : public ExecutionSessionBase {
+public:
+ using ErrorReporter = std::function<void(Error)>;
+
+ using DispatchMaterializationFunction =
+ std::function<void(VSO &V, std::unique_ptr<MaterializationUnit> MU)>;
+
+  /// Construct an ExecutionSession.
+ ///
+ /// SymbolStringPools may be shared between ExecutionSessions.
+ ExecutionSession(std::shared_ptr<SymbolStringPool> SSP = nullptr)
+ : ExecutionSessionBase(std::move(SSP)) {}
+
+ /// Add a new VSO to this ExecutionSession.
+ VSO &createVSO(std::string Name);
+
+private:
+ std::vector<std::unique_ptr<VSO>> VSOs;
+};
+
+/// Look up the given names in the given VSOs.
+/// VSOs will be searched in order and no VSO pointer may be null.
+/// All symbols must be found within the given VSOs or an error
+/// will be returned.
+Expected<SymbolMap> lookup(const VSOList &VSOs, SymbolNameSet Names);
+
+/// Look up a symbol by searching a list of VSOs.
+Expected<JITEvaluatedSymbol> lookup(const VSOList &VSOs, SymbolStringPtr Name);
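An end-to-end hedged sketch using the blocking helper: create a VSO, define one absolute symbol, and look it up. The name "foo" and the address 0x1000 are placeholders, and the intern call on the session's SymbolStringPool is assumed:

Expected<JITTargetAddress> defineAndFindFoo() {
  ExecutionSession ES;
  VSO &V = ES.createVSO("main");

  SymbolStringPtr Foo = ES.getSymbolStringPool().intern("foo");
  JITEvaluatedSymbol FooSym(0x1000, JITSymbolFlags::Exported);

  if (auto Err = V.define(absoluteSymbols({{Foo, FooSym}})))
    return std::move(Err);

  auto FooResult = lookup(VSOList({&V}), Foo);
  if (!FooResult)
    return FooResult.takeError();
  return FooResult->getAddress();
}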
+
+/// Mangles symbol names then uniques them in the context of an
+/// ExecutionSession.
+class MangleAndInterner {
+public:
+ MangleAndInterner(ExecutionSessionBase &ES, const DataLayout &DL);
+ SymbolStringPtr operator()(StringRef Name);
+
+private:
+ ExecutionSessionBase &ES;
+ const DataLayout &DL;
+};
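A brief usage sketch, assuming a DataLayout DL taken from the target Module or TargetMachine:

ExecutionSession ES;
MangleAndInterner Mangle(ES, DL);

// "main" is mangled according to DL (e.g. a leading '_' on MachO) and then
// uniqued in the session's SymbolStringPool.
SymbolStringPtr MainName = Mangle("main");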
+
+} // End namespace orc
+} // End namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_CORE_H
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
index d9b45c6a1e29..e27f6e1e2cd6 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
@@ -17,13 +17,16 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/ExecutionEngine/Orc/OrcError.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cstdint>
#include <string>
-#include <vector>
#include <utility>
+#include <vector>
namespace llvm {
@@ -31,18 +34,58 @@ class ConstantArray;
class GlobalVariable;
class Function;
class Module;
+class TargetMachine;
class Value;
namespace orc {
-/// @brief This iterator provides a convenient way to iterate over the elements
+/// A utility class for building TargetMachines for JITs.
+class JITTargetMachineBuilder {
+public:
+ JITTargetMachineBuilder(Triple TT);
+ static Expected<JITTargetMachineBuilder> detectHost();
+ Expected<std::unique_ptr<TargetMachine>> createTargetMachine();
+
+ JITTargetMachineBuilder &setArch(std::string Arch) {
+ this->Arch = std::move(Arch);
+ return *this;
+ }
+ JITTargetMachineBuilder &setCPU(std::string CPU) {
+ this->CPU = std::move(CPU);
+ return *this;
+ }
+ JITTargetMachineBuilder &setRelocationModel(Optional<Reloc::Model> RM) {
+ this->RM = std::move(RM);
+ return *this;
+ }
+ JITTargetMachineBuilder &setCodeModel(Optional<CodeModel::Model> CM) {
+ this->CM = std::move(CM);
+ return *this;
+ }
+ JITTargetMachineBuilder &
+ addFeatures(const std::vector<std::string> &FeatureVec);
+ SubtargetFeatures &getFeatures() { return Features; }
+ TargetOptions &getOptions() { return Options; }
+
+private:
+ Triple TT;
+ std::string Arch;
+ std::string CPU;
+ SubtargetFeatures Features;
+ TargetOptions Options;
+ Optional<Reloc::Model> RM;
+ Optional<CodeModel::Model> CM;
+ CodeGenOpt::Level OptLevel = CodeGenOpt::Default;
+};
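A hedged sketch of building a host TargetMachine with this utility; the CPU name and feature string are placeholders:

Expected<std::unique_ptr<TargetMachine>> buildHostTM() {
  auto JTMB = JITTargetMachineBuilder::detectHost();
  if (!JTMB)
    return JTMB.takeError();
  JTMB->setCPU("generic");         // placeholder CPU name
  JTMB->addFeatures({"+sse4.2"});  // placeholder feature string
  return JTMB->createTargetMachine();
}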
+
+/// This iterator provides a convenient way to iterate over the elements
/// of an llvm.global_ctors/llvm.global_dtors instance.
///
/// The easiest way to get hold of instances of this class is to use the
/// getConstructors/getDestructors functions.
class CtorDtorIterator {
public:
- /// @brief Accessor for an element of the global_ctors/global_dtors array.
+ /// Accessor for an element of the global_ctors/global_dtors array.
///
/// This class provides a read-only view of the element with any casts on
/// the function stripped away.
@@ -55,23 +98,23 @@ public:
Value *Data;
};
- /// @brief Construct an iterator instance. If End is true then this iterator
+ /// Construct an iterator instance. If End is true then this iterator
/// acts as the end of the range, otherwise it is the beginning.
CtorDtorIterator(const GlobalVariable *GV, bool End);
- /// @brief Test iterators for equality.
+ /// Test iterators for equality.
bool operator==(const CtorDtorIterator &Other) const;
- /// @brief Test iterators for inequality.
+ /// Test iterators for inequality.
bool operator!=(const CtorDtorIterator &Other) const;
- /// @brief Pre-increment iterator.
+ /// Pre-increment iterator.
CtorDtorIterator& operator++();
- /// @brief Post-increment iterator.
+ /// Post-increment iterator.
CtorDtorIterator operator++(int);
- /// @brief Dereference iterator. The resulting value provides a read-only view
+ /// Dereference iterator. The resulting value provides a read-only view
/// of this element of the global_ctors/global_dtors list.
Element operator*() const;
@@ -80,32 +123,31 @@ private:
unsigned I;
};
-/// @brief Create an iterator range over the entries of the llvm.global_ctors
+/// Create an iterator range over the entries of the llvm.global_ctors
/// array.
iterator_range<CtorDtorIterator> getConstructors(const Module &M);
-/// @brief Create an iterator range over the entries of the llvm.global_ctors
+/// Create an iterator range over the entries of the llvm.global_ctors
/// array.
iterator_range<CtorDtorIterator> getDestructors(const Module &M);
-/// @brief Convenience class for recording constructor/destructor names for
+/// Convenience class for recording constructor/destructor names for
/// later execution.
template <typename JITLayerT>
class CtorDtorRunner {
public:
- /// @brief Construct a CtorDtorRunner for the given range using the given
+ /// Construct a CtorDtorRunner for the given range using the given
/// name mangling function.
- CtorDtorRunner(std::vector<std::string> CtorDtorNames,
- typename JITLayerT::ModuleHandleT H)
- : CtorDtorNames(std::move(CtorDtorNames)), H(H) {}
+ CtorDtorRunner(std::vector<std::string> CtorDtorNames, VModuleKey K)
+ : CtorDtorNames(std::move(CtorDtorNames)), K(K) {}
- /// @brief Run the recorded constructors/destructors through the given JIT
+ /// Run the recorded constructors/destructors through the given JIT
/// layer.
Error runViaLayer(JITLayerT &JITLayer) const {
using CtorDtorTy = void (*)();
- for (const auto &CtorDtorName : CtorDtorNames)
- if (auto CtorDtorSym = JITLayer.findSymbolIn(H, CtorDtorName, false)) {
+ for (const auto &CtorDtorName : CtorDtorNames) {
+ if (auto CtorDtorSym = JITLayer.findSymbolIn(K, CtorDtorName, false)) {
if (auto AddrOrErr = CtorDtorSym.getAddress()) {
CtorDtorTy CtorDtor =
reinterpret_cast<CtorDtorTy>(static_cast<uintptr_t>(*AddrOrErr));
@@ -118,15 +160,30 @@ public:
else
return make_error<JITSymbolNotFound>(CtorDtorName);
}
+ }
return Error::success();
}
private:
std::vector<std::string> CtorDtorNames;
- typename JITLayerT::ModuleHandleT H;
+ orc::VModuleKey K;
};
-/// @brief Support class for static dtor execution. For hosted (in-process) JITs
+class CtorDtorRunner2 {
+public:
+ CtorDtorRunner2(VSO &V) : V(V) {}
+ void add(iterator_range<CtorDtorIterator> CtorDtors);
+ Error run();
+
+private:
+ using CtorDtorList = std::vector<SymbolStringPtr>;
+ using CtorDtorPriorityMap = std::map<unsigned, CtorDtorList>;
+
+ VSO &V;
+ CtorDtorPriorityMap CtorDtorsByPriority;
+};
+
+/// Support class for static dtor execution. For hosted (in-process) JITs
/// only!
///
/// If a __cxa_atexit function isn't found, C++ programs that use static
@@ -141,7 +198,26 @@ private:
/// the client determines that destructors should be run (generally at JIT
/// teardown or after a return from main), the runDestructors method should be
/// called.
-class LocalCXXRuntimeOverrides {
+class LocalCXXRuntimeOverridesBase {
+public:
+ /// Run any destructors recorded by the overridden __cxa_atexit function
+ /// (CXAAtExitOverride).
+ void runDestructors();
+
+protected:
+ template <typename PtrTy> JITTargetAddress toTargetAddress(PtrTy *P) {
+ return static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(P));
+ }
+
+ using DestructorPtr = void (*)(void *);
+ using CXXDestructorDataPair = std::pair<DestructorPtr, void *>;
+ using CXXDestructorDataPairList = std::vector<CXXDestructorDataPair>;
+ CXXDestructorDataPairList DSOHandleOverride;
+ static int CXAAtExitOverride(DestructorPtr Destructor, void *Arg,
+ void *DSOHandle);
+};
+
+class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase {
public:
/// Create a runtime-overrides class.
template <typename MangleFtorT>
@@ -158,32 +234,38 @@ public:
return nullptr;
}
- /// Run any destructors recorded by the overriden __cxa_atexit function
- /// (CXAAtExitOverride).
- void runDestructors();
-
private:
- template <typename PtrTy>
- JITTargetAddress toTargetAddress(PtrTy* P) {
- return static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(P));
- }
-
void addOverride(const std::string &Name, JITTargetAddress Addr) {
CXXRuntimeOverrides.insert(std::make_pair(Name, Addr));
}
StringMap<JITTargetAddress> CXXRuntimeOverrides;
+};
- using DestructorPtr = void (*)(void *);
- using CXXDestructorDataPair = std::pair<DestructorPtr, void *>;
- using CXXDestructorDataPairList = std::vector<CXXDestructorDataPair>;
- CXXDestructorDataPairList DSOHandleOverride;
- static int CXAAtExitOverride(DestructorPtr Destructor, void *Arg,
- void *DSOHandle);
+class LocalCXXRuntimeOverrides2 : public LocalCXXRuntimeOverridesBase {
+public:
+ Error enable(VSO &V, MangleAndInterner &Mangler);
};
-} // end namespace orc
+/// A utility class to expose symbols found via dlsym to the JIT.
+///
+/// If an instance of this class is attached to a VSO as a fallback definition
+/// generator, then any symbol found in the given DynamicLibrary that passes
+/// the 'Allow' predicate will be added to the VSO.
+class DynamicLibraryFallbackGenerator {
+public:
+ using SymbolPredicate = std::function<bool(SymbolStringPtr)>;
+ DynamicLibraryFallbackGenerator(sys::DynamicLibrary Dylib,
+ const DataLayout &DL, SymbolPredicate Allow);
+ SymbolNameSet operator()(VSO &V, const SymbolNameSet &Names);
+private:
+ sys::DynamicLibrary Dylib;
+ SymbolPredicate Allow;
+ char GlobalPrefix;
+};
+
+} // end namespace orc
} // end namespace llvm
#endif // LLVM_EXECUTIONENGINE_ORC_EXECUTIONUTILS_H
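A minimal usage sketch for the new CtorDtorRunner2, based only on the declarations above (it is not part of the patch; the VSO and Module are assumed to be supplied by the embedding JIT):

#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/IR/Module.h"

// Record M's llvm.global_ctors entries against V and run them. M is assumed
// to have already been added to the JIT, so the constructor symbols can be
// found in V when run() performs its lookups.
llvm::Error runModuleConstructors(llvm::orc::VSO &V, llvm::Module &M) {
  llvm::orc::CtorDtorRunner2 CtorRunner(V);
  CtorRunner.add(llvm::orc::getConstructors(M));
  return CtorRunner.run();
}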
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h
index 8a48c36f4141..a8a88d7cb2d2 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h
@@ -27,7 +27,7 @@ class JITSymbolResolver;
namespace orc {
-/// @brief Global mapping layer.
+/// Global mapping layer.
///
/// This layer overrides the findSymbol method to first search a local symbol
/// table that the client can define. It can be used to inject new symbol
@@ -38,13 +38,13 @@ template <typename BaseLayerT>
class GlobalMappingLayer {
public:
- /// @brief Handle to an added module.
+ /// Handle to an added module.
using ModuleHandleT = typename BaseLayerT::ModuleHandleT;
- /// @brief Construct an GlobalMappingLayer with the given BaseLayer
+ /// Construct a GlobalMappingLayer with the given BaseLayer.
GlobalMappingLayer(BaseLayerT &BaseLayer) : BaseLayer(BaseLayer) {}
- /// @brief Add the given module to the JIT.
+ /// Add the given module to the JIT.
/// @return A handle for the added modules.
Expected<ModuleHandleT>
addModule(std::shared_ptr<Module> M,
@@ -52,20 +52,20 @@ public:
return BaseLayer.addModule(std::move(M), std::move(Resolver));
}
- /// @brief Remove the module set associated with the handle H.
+ /// Remove the module set associated with the handle H.
Error removeModule(ModuleHandleT H) { return BaseLayer.removeModule(H); }
- /// @brief Manually set the address to return for the given symbol.
+ /// Manually set the address to return for the given symbol.
void setGlobalMapping(const std::string &Name, JITTargetAddress Addr) {
SymbolTable[Name] = Addr;
}
- /// @brief Remove the given symbol from the global mapping.
+ /// Remove the given symbol from the global mapping.
void eraseGlobalMapping(const std::string &Name) {
SymbolTable.erase(Name);
}
- /// @brief Search for the given named symbol.
+ /// Search for the given named symbol.
///
/// This method will first search the local symbol table, returning
/// any symbol found there. If the symbol is not found in the local
@@ -81,7 +81,7 @@ public:
return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
}
- /// @brief Get the address of the given symbol in the context of the of the
+ /// Get the address of the given symbol in the context of the
/// module represented by the handle H. This call is forwarded to the
/// base layer's implementation.
/// @param H The handle for the module to search in.
@@ -94,7 +94,7 @@ public:
return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly);
}
- /// @brief Immediately emit and finalize the module set represented by the
+ /// Immediately emit and finalize the module set represented by the
/// given handle.
/// @param H Handle for module set to emit/finalize.
Error emitAndFinalize(ModuleHandleT H) {
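A sketch of how the local symbol table is meant to be used, based on the methods shown above; the base layer type, symbol name and address are illustrative assumptions, not part of this header:

#include "llvm/ExecutionEngine/Orc/GlobalMappingLayer.h"

// Wrap an existing base layer and pre-seed the local symbol table so that
// lookups of "__host_hook" resolve to HookAddr before the base layer is
// consulted.
template <typename BaseLayerT>
llvm::JITSymbol findWithOverride(BaseLayerT &BaseLayer,
                                 llvm::JITTargetAddress HookAddr) {
  llvm::orc::GlobalMappingLayer<BaseLayerT> Mapping(BaseLayer);
  Mapping.setGlobalMapping("__host_hook", HookAddr);
  return Mapping.findSymbol("__host_hook", /*ExportedSymbolsOnly=*/false);
}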
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
index fadd334bed0f..ad6481548d59 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
@@ -16,7 +16,9 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
#include <memory>
#include <string>
@@ -26,7 +28,30 @@ class Module;
namespace orc {
-/// @brief Eager IR compiling layer.
+class IRCompileLayer2 : public IRLayer {
+public:
+ using CompileFunction =
+ std::function<Expected<std::unique_ptr<MemoryBuffer>>(Module &)>;
+
+ using NotifyCompiledFunction =
+ std::function<void(VModuleKey K, std::unique_ptr<Module>)>;
+
+ IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
+ CompileFunction Compile);
+
+ void setNotifyCompiled(NotifyCompiledFunction NotifyCompiled);
+
+ void emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<Module> M) override;
+
+private:
+ mutable std::mutex IRLayerMutex;
+ ObjectLayer &BaseLayer;
+ CompileFunction Compile;
+ NotifyCompiledFunction NotifyCompiled = NotifyCompiledFunction();
+};
+
+/// Eager IR compiling layer.
///
/// This layer immediately compiles each IR module added via addModule to an
/// object file and adds this module file to the layer below, which must
@@ -34,36 +59,40 @@ namespace orc {
template <typename BaseLayerT, typename CompileFtor>
class IRCompileLayer {
public:
+ /// Callback type for notifications when modules are compiled.
+ using NotifyCompiledCallback =
+ std::function<void(VModuleKey K, std::unique_ptr<Module>)>;
- /// @brief Handle to a compiled module.
- using ModuleHandleT = typename BaseLayerT::ObjHandleT;
-
- /// @brief Construct an IRCompileLayer with the given BaseLayer, which must
+ /// Construct an IRCompileLayer with the given BaseLayer, which must
/// implement the ObjectLayer concept.
- IRCompileLayer(BaseLayerT &BaseLayer, CompileFtor Compile)
- : BaseLayer(BaseLayer), Compile(std::move(Compile)) {}
+ IRCompileLayer(
+ BaseLayerT &BaseLayer, CompileFtor Compile,
+ NotifyCompiledCallback NotifyCompiled = NotifyCompiledCallback())
+ : BaseLayer(BaseLayer), Compile(std::move(Compile)),
+ NotifyCompiled(std::move(NotifyCompiled)) {}
- /// @brief Get a reference to the compiler functor.
+ /// Get a reference to the compiler functor.
CompileFtor& getCompiler() { return Compile; }
- /// @brief Compile the module, and add the resulting object to the base layer
- /// along with the given memory manager and symbol resolver.
- ///
- /// @return A handle for the added module.
- Expected<ModuleHandleT>
- addModule(std::shared_ptr<Module> M,
- std::shared_ptr<JITSymbolResolver> Resolver) {
- using CompileResult = decltype(Compile(*M));
- auto Obj = std::make_shared<CompileResult>(Compile(*M));
- return BaseLayer.addObject(std::move(Obj), std::move(Resolver));
+ /// (Re)set the NotifyCompiled callback.
+ void setNotifyCompiled(NotifyCompiledCallback NotifyCompiled) {
+ this->NotifyCompiled = std::move(NotifyCompiled);
}
- /// @brief Remove the module associated with the handle H.
- Error removeModule(ModuleHandleT H) {
- return BaseLayer.removeObject(H);
+ /// Compile the module, and add the resulting object to the base layer
+ /// along with the given memory manager and symbol resolver.
+ Error addModule(VModuleKey K, std::unique_ptr<Module> M) {
+ if (auto Err = BaseLayer.addObject(std::move(K), Compile(*M)))
+ return Err;
+ if (NotifyCompiled)
+ NotifyCompiled(std::move(K), std::move(M));
+ return Error::success();
}
- /// @brief Search for the given named symbol.
+ /// Remove the module associated with the VModuleKey K.
+ Error removeModule(VModuleKey K) { return BaseLayer.removeObject(K); }
+
+ /// Search for the given named symbol.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it exists.
@@ -71,29 +100,28 @@ public:
return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
}
- /// @brief Get the address of the given symbol in compiled module represented
+ /// Get the address of the given symbol in the compiled module represented
/// by the handle H. This call is forwarded to the base layer's
/// implementation.
- /// @param H The handle for the module to search in.
+ /// @param K The VModuleKey for the module to search in.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it is found in the
/// given module.
- JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name,
+ JITSymbol findSymbolIn(VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) {
- return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly);
+ return BaseLayer.findSymbolIn(K, Name, ExportedSymbolsOnly);
}
- /// @brief Immediately emit and finalize the module represented by the given
+ /// Immediately emit and finalize the module represented by the given
/// handle.
- /// @param H Handle for module to emit/finalize.
- Error emitAndFinalize(ModuleHandleT H) {
- return BaseLayer.emitAndFinalize(H);
- }
+ /// @param K The VModuleKey for the module to emit/finalize.
+ Error emitAndFinalize(VModuleKey K) { return BaseLayer.emitAndFinalize(K); }
private:
BaseLayerT &BaseLayer;
CompileFtor Compile;
+ NotifyCompiledCallback NotifyCompiled;
};
} // end namespace orc
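A sketch of the new NotifyCompiled hook on the template IRCompileLayer; only the callback signature comes from this header, while the layer and compiler functor types stand in for whatever the client already uses:

#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

// Install a callback that takes ownership of each Module after it has been
// compiled and handed to the base layer.
template <typename BaseLayerT, typename CompileFtorT>
void logCompiledModules(llvm::orc::IRCompileLayer<BaseLayerT, CompileFtorT> &CL) {
  CL.setNotifyCompiled(
      [](llvm::orc::VModuleKey K, std::unique_ptr<llvm::Module> M) {
        llvm::errs() << "compiled " << M->getModuleIdentifier() << "\n";
        (void)K; // key identifying the module within the JIT
      });
}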
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
index 476061afda59..266a0f45b3e4 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
@@ -15,6 +15,7 @@
#define LLVM_EXECUTIONENGINE_ORC_IRTRANSFORMLAYER_H
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
#include <memory>
#include <string>
@@ -22,7 +23,32 @@ namespace llvm {
class Module;
namespace orc {
-/// @brief IR mutating layer.
+class IRTransformLayer2 : public IRLayer {
+public:
+
+ using TransformFunction =
+ std::function<Expected<std::unique_ptr<Module>>(std::unique_ptr<Module>)>;
+
+ IRTransformLayer2(ExecutionSession &ES, IRLayer &BaseLayer,
+ TransformFunction Transform = identityTransform);
+
+ void setTransform(TransformFunction Transform) {
+ this->Transform = std::move(Transform);
+ }
+
+ void emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<Module> M) override;
+
+ static std::unique_ptr<Module> identityTransform(std::unique_ptr<Module> M) {
+ return M;
+ }
+
+private:
+ IRLayer &BaseLayer;
+ TransformFunction Transform;
+};
+
+/// IR mutating layer.
///
/// This layer applies a user supplied transform to each module that is added,
/// then adds the transformed module to the layer below.
@@ -30,28 +56,23 @@ template <typename BaseLayerT, typename TransformFtor>
class IRTransformLayer {
public:
- /// @brief Handle to a set of added modules.
- using ModuleHandleT = typename BaseLayerT::ModuleHandleT;
-
- /// @brief Construct an IRTransformLayer with the given BaseLayer
+ /// Construct an IRTransformLayer with the given BaseLayer
IRTransformLayer(BaseLayerT &BaseLayer,
TransformFtor Transform = TransformFtor())
: BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
- /// @brief Apply the transform functor to the module, then add the module to
+ /// Apply the transform functor to the module, then add the module to
/// the layer below, along with the memory manager and symbol resolver.
///
/// @return A handle for the added modules.
- Expected<ModuleHandleT>
- addModule(std::shared_ptr<Module> M,
- std::shared_ptr<JITSymbolResolver> Resolver) {
- return BaseLayer.addModule(Transform(std::move(M)), std::move(Resolver));
+ Error addModule(VModuleKey K, std::unique_ptr<Module> M) {
+ return BaseLayer.addModule(std::move(K), Transform(std::move(M)));
}
- /// @brief Remove the module associated with the handle H.
- Error removeModule(ModuleHandleT H) { return BaseLayer.removeModule(H); }
+ /// Remove the module associated with the VModuleKey K.
+ Error removeModule(VModuleKey K) { return BaseLayer.removeModule(K); }
- /// @brief Search for the given named symbol.
+ /// Search for the given named symbol.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it exists.
@@ -59,30 +80,28 @@ public:
return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
}
- /// @brief Get the address of the given symbol in the context of the module
- /// represented by the handle H. This call is forwarded to the base
+ /// Get the address of the given symbol in the context of the module
+ /// represented by the VModuleKey K. This call is forwarded to the base
/// layer's implementation.
- /// @param H The handle for the module to search in.
+ /// @param K The VModuleKey for the module to search in.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it is found in the
/// given module.
- JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name,
+ JITSymbol findSymbolIn(VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) {
- return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly);
+ return BaseLayer.findSymbolIn(K, Name, ExportedSymbolsOnly);
}
- /// @brief Immediately emit and finalize the module represented by the given
- /// handle.
- /// @param H Handle for module to emit/finalize.
- Error emitAndFinalize(ModuleHandleT H) {
- return BaseLayer.emitAndFinalize(H);
- }
+ /// Immediately emit and finalize the module represented by the given
+ /// VModuleKey.
+ /// @param K The VModuleKey for the module to emit/finalize.
+ Error emitAndFinalize(VModuleKey K) { return BaseLayer.emitAndFinalize(K); }
- /// @brief Access the transform functor directly.
+ /// Access the transform functor directly.
TransformFtor& getTransform() { return Transform; }
- /// @brief Access the mumate functor directly.
+ /// Access the transform functor directly (const version).
const TransformFtor& getTransform() const { return Transform; }
private:
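After this change the transform functor receives and returns std::unique_ptr<Module>; a sketch of one possible functor follows (the dump-to-stderr behaviour is illustrative only, not prescribed by the patch):

#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

// Print each module before it is passed to the base layer, then hand it on
// unchanged. Suitable as the TransformFtor argument of IRTransformLayer.
inline std::unique_ptr<llvm::Module>
dumpModuleTransform(std::unique_ptr<llvm::Module> M) {
  M->print(llvm::errs(), /*AAW=*/nullptr);
  return M;
}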
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
index 029b86a6d2ca..8b0b3fdb7df4 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Memory.h"
#include "llvm/Support/Process.h"
@@ -46,98 +47,29 @@ class Value;
namespace orc {
-/// @brief Target-independent base class for compile callback management.
+/// Target-independent base class for compile callback management.
class JITCompileCallbackManager {
public:
- using CompileFtor = std::function<JITTargetAddress()>;
-
- /// @brief Handle to a newly created compile callback. Can be used to get an
- /// IR constant representing the address of the trampoline, and to set
- /// the compile action for the callback.
- class CompileCallbackInfo {
- public:
- CompileCallbackInfo(JITTargetAddress Addr, CompileFtor &Compile)
- : Addr(Addr), Compile(Compile) {}
-
- JITTargetAddress getAddress() const { return Addr; }
- void setCompileAction(CompileFtor Compile) {
- this->Compile = std::move(Compile);
- }
-
- private:
- JITTargetAddress Addr;
- CompileFtor &Compile;
- };
-
- /// @brief Construct a JITCompileCallbackManager.
+ using CompileFunction = std::function<JITTargetAddress()>;
+
+ /// Construct a JITCompileCallbackManager.
/// @param ErrorHandlerAddress The address of an error handler in the target
/// process to be used if a compile callback fails.
- JITCompileCallbackManager(JITTargetAddress ErrorHandlerAddress)
- : ErrorHandlerAddress(ErrorHandlerAddress) {}
+ JITCompileCallbackManager(ExecutionSession &ES,
+ JITTargetAddress ErrorHandlerAddress)
+ : ES(ES), CallbacksVSO(ES.createVSO("<Callbacks>")),
+ ErrorHandlerAddress(ErrorHandlerAddress) {}
virtual ~JITCompileCallbackManager() = default;
- /// @brief Execute the callback for the given trampoline id. Called by the JIT
- /// to compile functions on demand.
- JITTargetAddress executeCompileCallback(JITTargetAddress TrampolineAddr) {
- auto I = ActiveTrampolines.find(TrampolineAddr);
- // FIXME: Also raise an error in the Orc error-handler when we finally have
- // one.
- if (I == ActiveTrampolines.end())
- return ErrorHandlerAddress;
-
- // Found a callback handler. Yank this trampoline out of the active list and
- // put it back in the available trampolines list, then try to run the
- // handler's compile and update actions.
- // Moving the trampoline ID back to the available list first means there's
- // at
- // least one available trampoline if the compile action triggers a request
- // for
- // a new one.
- auto Compile = std::move(I->second);
- ActiveTrampolines.erase(I);
- AvailableTrampolines.push_back(TrampolineAddr);
-
- if (auto Addr = Compile())
- return Addr;
-
- return ErrorHandlerAddress;
- }
-
- /// @brief Reserve a compile callback.
- Expected<CompileCallbackInfo> getCompileCallback() {
- if (auto TrampolineAddrOrErr = getAvailableTrampolineAddr()) {
- const auto &TrampolineAddr = *TrampolineAddrOrErr;
- auto &Compile = this->ActiveTrampolines[TrampolineAddr];
- return CompileCallbackInfo(TrampolineAddr, Compile);
- } else
- return TrampolineAddrOrErr.takeError();
- }
-
- /// @brief Get a CompileCallbackInfo for an existing callback.
- CompileCallbackInfo getCompileCallbackInfo(JITTargetAddress TrampolineAddr) {
- auto I = ActiveTrampolines.find(TrampolineAddr);
- assert(I != ActiveTrampolines.end() && "Not an active trampoline.");
- return CompileCallbackInfo(I->first, I->second);
- }
+ /// Reserve a compile callback.
+ Expected<JITTargetAddress> getCompileCallback(CompileFunction Compile);
- /// @brief Release a compile callback.
- ///
- /// Note: Callbacks are auto-released after they execute. This method should
- /// only be called to manually release a callback that is not going to
- /// execute.
- void releaseCompileCallback(JITTargetAddress TrampolineAddr) {
- auto I = ActiveTrampolines.find(TrampolineAddr);
- assert(I != ActiveTrampolines.end() && "Not an active trampoline.");
- ActiveTrampolines.erase(I);
- AvailableTrampolines.push_back(TrampolineAddr);
- }
+ /// Execute the callback for the given trampoline id. Called by the JIT
+ /// to compile functions on demand.
+ JITTargetAddress executeCompileCallback(JITTargetAddress TrampolineAddr);
protected:
- JITTargetAddress ErrorHandlerAddress;
-
- using TrampolineMapT = std::map<JITTargetAddress, CompileFtor>;
- TrampolineMapT ActiveTrampolines;
std::vector<JITTargetAddress> AvailableTrampolines;
private:
@@ -156,17 +88,25 @@ private:
virtual Error grow() = 0;
virtual void anchor();
+
+ std::mutex CCMgrMutex;
+ ExecutionSession &ES;
+ VSO &CallbacksVSO;
+ JITTargetAddress ErrorHandlerAddress;
+ std::map<JITTargetAddress, SymbolStringPtr> AddrToSymbol;
+ size_t NextCallbackId = 0;
};
-/// @brief Manage compile callbacks for in-process JITs.
+/// Manage compile callbacks for in-process JITs.
template <typename TargetT>
class LocalJITCompileCallbackManager : public JITCompileCallbackManager {
public:
- /// @brief Construct a InProcessJITCompileCallbackManager.
+ /// Construct a LocalJITCompileCallbackManager.
/// @param ErrorHandlerAddress The address of an error handler in the target
/// process to be used if a compile callback fails.
- LocalJITCompileCallbackManager(JITTargetAddress ErrorHandlerAddress)
- : JITCompileCallbackManager(ErrorHandlerAddress) {
+ LocalJITCompileCallbackManager(ExecutionSession &ES,
+ JITTargetAddress ErrorHandlerAddress)
+ : JITCompileCallbackManager(ES, ErrorHandlerAddress) {
/// Set up the resolver block.
std::error_code EC;
ResolverBlock = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
@@ -229,38 +169,38 @@ private:
std::vector<sys::OwningMemoryBlock> TrampolineBlocks;
};
-/// @brief Base class for managing collections of named indirect stubs.
+/// Base class for managing collections of named indirect stubs.
class IndirectStubsManager {
public:
- /// @brief Map type for initializing the manager. See init.
+ /// Map type for initializing the manager. See init.
using StubInitsMap = StringMap<std::pair<JITTargetAddress, JITSymbolFlags>>;
virtual ~IndirectStubsManager() = default;
- /// @brief Create a single stub with the given name, target address and flags.
+ /// Create a single stub with the given name, target address and flags.
virtual Error createStub(StringRef StubName, JITTargetAddress StubAddr,
JITSymbolFlags StubFlags) = 0;
- /// @brief Create StubInits.size() stubs with the given names, target
+ /// Create StubInits.size() stubs with the given names, target
/// addresses, and flags.
virtual Error createStubs(const StubInitsMap &StubInits) = 0;
- /// @brief Find the stub with the given name. If ExportedStubsOnly is true,
+ /// Find the stub with the given name. If ExportedStubsOnly is true,
/// this will only return a result if the stub's flags indicate that it
/// is exported.
- virtual JITSymbol findStub(StringRef Name, bool ExportedStubsOnly) = 0;
+ virtual JITEvaluatedSymbol findStub(StringRef Name, bool ExportedStubsOnly) = 0;
- /// @brief Find the implementation-pointer for the stub.
- virtual JITSymbol findPointer(StringRef Name) = 0;
+ /// Find the implementation-pointer for the stub.
+ virtual JITEvaluatedSymbol findPointer(StringRef Name) = 0;
- /// @brief Change the value of the implementation pointer for the stub.
+ /// Change the value of the implementation pointer for the stub.
virtual Error updatePointer(StringRef Name, JITTargetAddress NewAddr) = 0;
private:
virtual void anchor();
};
-/// @brief IndirectStubsManager implementation for the host architecture, e.g.
+/// IndirectStubsManager implementation for the host architecture, e.g.
/// OrcX86_64. (See OrcArchitectureSupport.h).
template <typename TargetT>
class LocalIndirectStubsManager : public IndirectStubsManager {
@@ -286,7 +226,7 @@ public:
return Error::success();
}
- JITSymbol findStub(StringRef Name, bool ExportedStubsOnly) override {
+ JITEvaluatedSymbol findStub(StringRef Name, bool ExportedStubsOnly) override {
auto I = StubIndexes.find(Name);
if (I == StubIndexes.end())
return nullptr;
@@ -295,13 +235,13 @@ public:
assert(StubAddr && "Missing stub address");
auto StubTargetAddr =
static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(StubAddr));
- auto StubSymbol = JITSymbol(StubTargetAddr, I->second.second);
+ auto StubSymbol = JITEvaluatedSymbol(StubTargetAddr, I->second.second);
if (ExportedStubsOnly && !StubSymbol.getFlags().isExported())
return nullptr;
return StubSymbol;
}
- JITSymbol findPointer(StringRef Name) override {
+ JITEvaluatedSymbol findPointer(StringRef Name) override {
auto I = StubIndexes.find(Name);
if (I == StubIndexes.end())
return nullptr;
@@ -310,7 +250,7 @@ public:
assert(PtrAddr && "Missing pointer address");
auto PtrTargetAddr =
static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(PtrAddr));
- return JITSymbol(PtrTargetAddr, I->second.second);
+ return JITEvaluatedSymbol(PtrTargetAddr, I->second.second);
}
Error updatePointer(StringRef Name, JITTargetAddress NewAddr) override {
@@ -354,45 +294,45 @@ private:
StringMap<std::pair<StubKey, JITSymbolFlags>> StubIndexes;
};
-/// @brief Create a local compile callback manager.
+/// Create a local compile callback manager.
///
/// The given target triple will determine the ABI, and the given
/// ErrorHandlerAddress will be used by the resulting compile callback
/// manager if a compile callback fails.
std::unique_ptr<JITCompileCallbackManager>
-createLocalCompileCallbackManager(const Triple &T,
+createLocalCompileCallbackManager(const Triple &T, ExecutionSession &ES,
JITTargetAddress ErrorHandlerAddress);
-/// @brief Create a local indriect stubs manager builder.
+/// Create a local indirect stubs manager builder.
///
/// The given target triple will determine the ABI.
std::function<std::unique_ptr<IndirectStubsManager>()>
createLocalIndirectStubsManagerBuilder(const Triple &T);
-/// @brief Build a function pointer of FunctionType with the given constant
+/// Build a function pointer of FunctionType with the given constant
/// address.
///
/// Usage example: Turn a trampoline address into a function pointer constant
/// for use in a stub.
Constant *createIRTypedAddress(FunctionType &FT, JITTargetAddress Addr);
-/// @brief Create a function pointer with the given type, name, and initializer
+/// Create a function pointer with the given type, name, and initializer
/// in the given Module.
GlobalVariable *createImplPointer(PointerType &PT, Module &M, const Twine &Name,
Constant *Initializer);
-/// @brief Turn a function declaration into a stub function that makes an
+/// Turn a function declaration into a stub function that makes an
/// indirect call using the given function pointer.
void makeStub(Function &F, Value &ImplPointer);
-/// @brief Raise linkage types and rename as necessary to ensure that all
+/// Raise linkage types and rename as necessary to ensure that all
/// symbols are accessible for other modules.
///
/// This should be called before partitioning a module to ensure that the
/// partitions retain access to each other's symbols.
void makeAllSymbolsExternallyAccessible(Module &M);
-/// @brief Clone a function declaration into a new module.
+/// Clone a function declaration into a new module.
///
/// This function can be used as the first step towards creating a callback
/// stub (see makeStub), or moving a function body (see moveFunctionBody).
@@ -407,7 +347,7 @@ void makeAllSymbolsExternallyAccessible(Module &M);
Function *cloneFunctionDecl(Module &Dst, const Function &F,
ValueToValueMapTy *VMap = nullptr);
-/// @brief Move the body of function 'F' to a cloned function declaration in a
+/// Move the body of function 'F' to a cloned function declaration in a
/// different module (See related cloneFunctionDecl).
///
/// If the target function declaration is not supplied via the NewF parameter
@@ -419,11 +359,11 @@ void moveFunctionBody(Function &OrigF, ValueToValueMapTy &VMap,
ValueMaterializer *Materializer = nullptr,
Function *NewF = nullptr);
-/// @brief Clone a global variable declaration into a new module.
+/// Clone a global variable declaration into a new module.
GlobalVariable *cloneGlobalVariableDecl(Module &Dst, const GlobalVariable &GV,
ValueToValueMapTy *VMap = nullptr);
-/// @brief Move global variable GV from its parent module to cloned global
+/// Move global variable GV from its parent module to cloned global
/// declaration in a different module.
///
/// If the target global declaration is not supplied via the NewGV parameter
@@ -436,11 +376,11 @@ void moveGlobalVariableInitializer(GlobalVariable &OrigGV,
ValueMaterializer *Materializer = nullptr,
GlobalVariable *NewGV = nullptr);
-/// @brief Clone a global alias declaration into a new module.
+/// Clone a global alias declaration into a new module.
GlobalAlias *cloneGlobalAliasDecl(Module &Dst, const GlobalAlias &OrigA,
ValueToValueMapTy &VMap);
-/// @brief Clone module flags metadata into the destination module.
+/// Clone module flags metadata into the destination module.
void cloneModuleFlagsMetadata(Module &Dst, const Module &Src,
ValueToValueMapTy &VMap);
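A sketch of how the per-target helpers above are obtained after this change; the ExecutionSession and error-handler address are assumed to come from the surrounding JIT:

#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
#include "llvm/Support/Host.h"

// createLocalCompileCallbackManager now needs the ExecutionSession in
// addition to the error-handler address.
void buildIndirectionHelpers(llvm::orc::ExecutionSession &ES,
                             llvm::JITTargetAddress ErrorHandlerAddr) {
  llvm::Triple T(llvm::sys::getProcessTriple());
  auto CCMgr = llvm::orc::createLocalCompileCallbackManager(T, ES,
                                                            ErrorHandlerAddr);
  auto ISMBuilder = llvm::orc::createLocalIndirectStubsManagerBuilder(T);
  auto StubsMgr = ISMBuilder(); // one IndirectStubsManager per JIT'd library
  (void)CCMgr;
  (void)StubsMgr;
}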
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h
new file mode 100644
index 000000000000..df655bd82006
--- /dev/null
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -0,0 +1,143 @@
+//===----- LLJIT.h -- An ORC-based JIT for compiling LLVM IR ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// An ORC-based JIT for compiling LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_LLJIT_H
+#define LLVM_EXECUTIONENGINE_ORC_LLJIT_H
+
+#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
+#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+namespace orc {
+
+/// A pre-fabricated ORC JIT stack that can serve as an alternative to MCJIT.
+class LLJIT {
+public:
+ /// Create an LLJIT instance.
+ static Expected<std::unique_ptr<LLJIT>>
+ Create(std::unique_ptr<ExecutionSession> ES,
+ std::unique_ptr<TargetMachine> TM, DataLayout DL);
+
+ /// Returns a reference to the ExecutionSession for this JIT instance.
+ ExecutionSession &getExecutionSession() { return *ES; }
+
+ /// Returns a reference to the VSO representing the JIT'd main program.
+ VSO &getMainVSO() { return Main; }
+
+ /// Convenience method for defining an absolute symbol.
+ Error defineAbsolute(StringRef Name, JITEvaluatedSymbol Address);
+
+ /// Adds an IR module to the given VSO.
+ Error addIRModule(VSO &V, std::unique_ptr<Module> M);
+
+ /// Adds an IR module to the Main VSO.
+ Error addIRModule(std::unique_ptr<Module> M) {
+ return addIRModule(Main, std::move(M));
+ }
+
+ /// Look up a symbol in VSO V by the symbol's linker-mangled name (to look up
+ /// symbols based on their IR name use the lookup function instead).
+ Expected<JITEvaluatedSymbol> lookupLinkerMangled(VSO &V, StringRef Name);
+
+ /// Look up a symbol in the main VSO by the symbol's linker-mangled name (to
+ /// look up symbols based on their IR name use the lookup function instead).
+ Expected<JITEvaluatedSymbol> lookupLinkerMangled(StringRef Name) {
+ return lookupLinkerMangled(Main, Name);
+ }
+
+ /// Look up a symbol in VSO V based on its IR symbol name.
+ Expected<JITEvaluatedSymbol> lookup(VSO &V, StringRef UnmangledName) {
+ return lookupLinkerMangled(V, mangle(UnmangledName));
+ }
+
+ /// Look up a symbol in the main VSO based on its IR symbol name.
+ Expected<JITEvaluatedSymbol> lookup(StringRef UnmangledName) {
+ return lookup(Main, UnmangledName);
+ }
+
+ /// Runs all not-yet-run static constructors.
+ Error runConstructors() { return CtorRunner.run(); }
+
+ /// Runs all not-yet-run static destructors.
+ Error runDestructors() { return DtorRunner.run(); }
+
+protected:
+ LLJIT(std::unique_ptr<ExecutionSession> ES, std::unique_ptr<TargetMachine> TM,
+ DataLayout DL);
+
+ std::shared_ptr<RuntimeDyld::MemoryManager> getMemoryManager(VModuleKey K);
+
+ std::string mangle(StringRef UnmangledName);
+
+ Error applyDataLayout(Module &M);
+
+ void recordCtorDtors(Module &M);
+
+ std::unique_ptr<ExecutionSession> ES;
+ VSO &Main;
+
+ std::unique_ptr<TargetMachine> TM;
+ DataLayout DL;
+
+ RTDyldObjectLinkingLayer2 ObjLinkingLayer;
+ IRCompileLayer2 CompileLayer;
+
+ CtorDtorRunner2 CtorRunner, DtorRunner;
+};
+
+/// An extended version of LLJIT that supports lazy function-at-a-time
+/// compilation of LLVM IR.
+class LLLazyJIT : public LLJIT {
+public:
+ /// Create an LLLazyJIT instance.
+ static Expected<std::unique_ptr<LLLazyJIT>>
+ Create(std::unique_ptr<ExecutionSession> ES,
+ std::unique_ptr<TargetMachine> TM, DataLayout DL, LLVMContext &Ctx);
+
+ /// Set an IR transform (e.g. pass manager pipeline) to run on each function
+ /// when it is compiled.
+ void setLazyCompileTransform(IRTransformLayer2::TransformFunction Transform) {
+ TransformLayer.setTransform(std::move(Transform));
+ }
+
+ /// Add a module to be lazily compiled to VSO V.
+ Error addLazyIRModule(VSO &V, std::unique_ptr<Module> M);
+
+ /// Add a module to be lazily compiled to the main VSO.
+ Error addLazyIRModule(std::unique_ptr<Module> M) {
+ return addLazyIRModule(Main, std::move(M));
+ }
+
+private:
+ LLLazyJIT(std::unique_ptr<ExecutionSession> ES,
+ std::unique_ptr<TargetMachine> TM, DataLayout DL, LLVMContext &Ctx,
+ std::unique_ptr<JITCompileCallbackManager> CCMgr,
+ std::function<std::unique_ptr<IndirectStubsManager>()> ISMBuilder);
+
+ std::unique_ptr<JITCompileCallbackManager> CCMgr;
+ std::function<std::unique_ptr<IndirectStubsManager>()> ISMBuilder;
+
+ IRTransformLayer2 TransformLayer;
+ CompileOnDemandLayer2 CODLayer;
+};
+
+} // End namespace orc
+} // End namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_LLJIT_H
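A sketch of the intended end-to-end use of LLJIT, using only the methods declared above; how the ExecutionSession, TargetMachine and DataLayout are created is left to the client and assumed here:

#include "llvm/ExecutionEngine/Orc/LLJIT.h"
#include <cstdint>

llvm::Expected<int> jitAndRunMain(std::unique_ptr<llvm::orc::ExecutionSession> ES,
                                  std::unique_ptr<llvm::TargetMachine> TM,
                                  llvm::DataLayout DL,
                                  std::unique_ptr<llvm::Module> M) {
  auto J = llvm::orc::LLJIT::Create(std::move(ES), std::move(TM), std::move(DL));
  if (!J)
    return J.takeError();

  if (auto Err = (*J)->addIRModule(std::move(M)))
    return std::move(Err);
  if (auto Err = (*J)->runConstructors())
    return std::move(Err);

  // lookup() takes the IR name and applies the DataLayout mangling itself.
  auto MainSym = (*J)->lookup("main");
  if (!MainSym)
    return MainSym.takeError();

  auto *MainFn =
      reinterpret_cast<int (*)()>(static_cast<uintptr_t>(MainSym->getAddress()));
  int Result = MainFn();

  if (auto Err = (*J)->runDestructors())
    return std::move(Err);
  return Result;
}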
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h
index 228392ae0d4a..7b6f3d2f92ab 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h
@@ -23,7 +23,7 @@ namespace llvm {
namespace orc {
template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
-class LambdaResolver : public JITSymbolResolver {
+class LambdaResolver : public LegacyJITSymbolResolver {
public:
LambdaResolver(DylibLookupFtorT DylibLookupFtor,
ExternalLookupFtorT ExternalLookupFtor)
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/Layer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/Layer.h
new file mode 100644
index 000000000000..91bd4fb83e6f
--- /dev/null
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/Layer.h
@@ -0,0 +1,129 @@
+//===---------------- Layer.h -- Layer interfaces --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Layer interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_LAYER_H
+#define LLVM_EXECUTIONENGINE_ORC_LAYER_H
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/IR/Module.h"
+
+namespace llvm {
+namespace orc {
+
+/// Interface for layers that accept LLVM IR.
+class IRLayer {
+public:
+ IRLayer(ExecutionSession &ES);
+ virtual ~IRLayer();
+
+ /// Returns the ExecutionSession for this layer.
+ ExecutionSession &getExecutionSession() { return ES; }
+
+ /// Adds a MaterializationUnit representing the given IR to the given VSO.
+ virtual Error add(VSO &V, VModuleKey K, std::unique_ptr<Module> M);
+
+ /// Emit should materialize the given IR.
+ virtual void emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<Module> M) = 0;
+
+private:
+ ExecutionSession &ES;
+};
+
+/// IRMaterializationUnit is a convenient base class for MaterializationUnits
+/// wrapping LLVM IR. Represents materialization responsibility for all symbols
+/// in the given module. If symbols are overridden by other definitions, then
+/// their linkage is changed to available-externally.
+class IRMaterializationUnit : public MaterializationUnit {
+public:
+ using SymbolNameToDefinitionMap = std::map<SymbolStringPtr, GlobalValue *>;
+
+ /// Create an IRMaterializationUnit. Scans the module to build the
+ /// SymbolFlags and SymbolToDefinition maps.
+ IRMaterializationUnit(ExecutionSession &ES, std::unique_ptr<Module> M);
+
+ /// Create an IRMaterializationUnit from a module, and pre-existing
+ /// SymbolFlags and SymbolToDefinition maps. The maps must provide
+ /// entries for each definition in M.
+ /// This constructor is useful for delegating work from one
+ /// IRMaterializationUnit to another.
+ IRMaterializationUnit(std::unique_ptr<Module> M, SymbolFlagsMap SymbolFlags,
+ SymbolNameToDefinitionMap SymbolToDefinition);
+
+protected:
+ std::unique_ptr<Module> M;
+ SymbolNameToDefinitionMap SymbolToDefinition;
+
+private:
+ void discard(const VSO &V, SymbolStringPtr Name) override;
+};
+
+/// MaterializationUnit that materializes modules by calling the 'emit' method
+/// on the given IRLayer.
+class BasicIRLayerMaterializationUnit : public IRMaterializationUnit {
+public:
+ BasicIRLayerMaterializationUnit(IRLayer &L, VModuleKey K,
+ std::unique_ptr<Module> M);
+private:
+
+ void materialize(MaterializationResponsibility R) override;
+
+ IRLayer &L;
+ VModuleKey K;
+};
+
+/// Interface for Layers that accept object files.
+class ObjectLayer {
+public:
+ ObjectLayer(ExecutionSession &ES);
+ virtual ~ObjectLayer();
+
+ /// Returns the execution session for this layer.
+ ExecutionSession &getExecutionSession() { return ES; }
+
+ /// Adds a MaterializationUnit representing the given object file to the given VSO.
+ virtual Error add(VSO &V, VModuleKey K, std::unique_ptr<MemoryBuffer> O);
+
+ /// Emit should materialize the given object file.
+ virtual void emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<MemoryBuffer> O) = 0;
+
+private:
+ ExecutionSession &ES;
+};
+
+/// Materializes the given object file (represented by a MemoryBuffer
+/// instance) by calling 'emit' on the given ObjectLayer.
+class BasicObjectLayerMaterializationUnit : public MaterializationUnit {
+public:
+
+ /// The MemoryBuffer should represent a valid object file.
+ /// If there is any chance that the file is invalid, it should be validated
+ /// prior to constructing a BasicObjectLayerMaterializationUnit.
+ BasicObjectLayerMaterializationUnit(ObjectLayer &L, VModuleKey K,
+ std::unique_ptr<MemoryBuffer> O);
+
+private:
+ void materialize(MaterializationResponsibility R) override;
+ void discard(const VSO &V, SymbolStringPtr Name) override;
+
+ ObjectLayer &L;
+ VModuleKey K;
+ std::unique_ptr<MemoryBuffer> O;
+};
+
+} // End namespace orc
+} // End namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_LAYER_H
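A sketch of the client-facing side of these interfaces; how the VModuleKey is allocated (e.g. by the ExecutionSession) is an assumption about the surrounding Core APIs:

#include "llvm/ExecutionEngine/Orc/Layer.h"

// Hand a Module to a VSO through an IRLayer. Per the comments above, add()
// registers a MaterializationUnit for M with V, so nothing is compiled until
// one of the module's symbols is actually looked up in V.
llvm::Error addModuleToVSO(llvm::orc::IRLayer &Layer, llvm::orc::VSO &V,
                           llvm::orc::VModuleKey K,
                           std::unique_ptr<llvm::Module> M) {
  return Layer.add(V, K, std::move(M));
}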
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
index b7e462e85d9d..46761b0ca7e1 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
@@ -32,23 +33,18 @@
namespace llvm {
namespace orc {
-/// @brief Lazy-emitting IR layer.
+/// Lazy-emitting IR layer.
///
/// This layer accepts LLVM IR Modules (via addModule), but does not
/// immediately emit them to the layer below. Instead, emission to the base layer
/// is deferred until the first time the client requests the address (via
/// JITSymbol::getAddress) for a symbol contained in this layer.
template <typename BaseLayerT> class LazyEmittingLayer {
-public:
-
- using BaseLayerHandleT = typename BaseLayerT::ModuleHandleT;
-
private:
class EmissionDeferredModule {
public:
- EmissionDeferredModule(std::shared_ptr<Module> M,
- std::shared_ptr<JITSymbolResolver> Resolver)
- : M(std::move(M)), Resolver(std::move(Resolver)) {}
+ EmissionDeferredModule(VModuleKey K, std::unique_ptr<Module> M)
+ : K(std::move(K)), M(std::move(M)) {}
JITSymbol find(StringRef Name, bool ExportedSymbolsOnly, BaseLayerT &B) {
switch (EmitState) {
@@ -65,13 +61,11 @@ private:
return 0;
else if (this->EmitState == NotEmitted) {
this->EmitState = Emitting;
- if (auto HandleOrErr = this->emitToBaseLayer(B))
- Handle = std::move(*HandleOrErr);
- else
- return HandleOrErr.takeError();
+ if (auto Err = this->emitToBaseLayer(B))
+ return std::move(Err);
this->EmitState = Emitted;
}
- if (auto Sym = B.findSymbolIn(Handle, PName, ExportedSymbolsOnly))
+ if (auto Sym = B.findSymbolIn(K, PName, ExportedSymbolsOnly))
return Sym.getAddress();
else if (auto Err = Sym.takeError())
return std::move(Err);
@@ -89,13 +83,13 @@ private:
// RuntimeDyld that did the lookup), so just return a nullptr here.
return nullptr;
case Emitted:
- return B.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
+ return B.findSymbolIn(K, Name, ExportedSymbolsOnly);
}
llvm_unreachable("Invalid emit-state.");
}
Error removeModuleFromBaseLayer(BaseLayerT& BaseLayer) {
- return EmitState != NotEmitted ? BaseLayer.removeModule(Handle)
+ return EmitState != NotEmitted ? BaseLayer.removeModule(K)
: Error::success();
}
@@ -104,10 +98,10 @@ private:
"Cannot emitAndFinalize while already emitting");
if (EmitState == NotEmitted) {
EmitState = Emitting;
- Handle = emitToBaseLayer(BaseLayer);
+ emitToBaseLayer(BaseLayer);
EmitState = Emitted;
}
- BaseLayer.emitAndFinalize(Handle);
+ BaseLayer.emitAndFinalize(K);
}
private:
@@ -135,11 +129,11 @@ private:
return buildMangledSymbols(Name, ExportedSymbolsOnly);
}
- Expected<BaseLayerHandleT> emitToBaseLayer(BaseLayerT &BaseLayer) {
+ Error emitToBaseLayer(BaseLayerT &BaseLayer) {
// We don't need the mangled names set any more: Once we've emitted this
// to the base layer we'll just look for symbols there.
MangledSymbols.reset();
- return BaseLayer.addModule(std::move(M), std::move(Resolver));
+ return BaseLayer.addModule(std::move(K), std::move(M));
}
// If the mangled name of the given GlobalValue matches the given search
@@ -192,46 +186,40 @@ private:
}
enum { NotEmitted, Emitting, Emitted } EmitState = NotEmitted;
- BaseLayerHandleT Handle;
- std::shared_ptr<Module> M;
- std::shared_ptr<JITSymbolResolver> Resolver;
+ VModuleKey K;
+ std::unique_ptr<Module> M;
mutable std::unique_ptr<StringMap<const GlobalValue*>> MangledSymbols;
};
- using ModuleListT = std::list<std::unique_ptr<EmissionDeferredModule>>;
-
BaseLayerT &BaseLayer;
- ModuleListT ModuleList;
+ std::map<VModuleKey, std::unique_ptr<EmissionDeferredModule>> ModuleMap;
public:
- /// @brief Handle to a loaded module.
- using ModuleHandleT = typename ModuleListT::iterator;
-
- /// @brief Construct a lazy emitting layer.
+ /// Construct a lazy emitting layer.
LazyEmittingLayer(BaseLayerT &BaseLayer) : BaseLayer(BaseLayer) {}
- /// @brief Add the given module to the lazy emitting layer.
- Expected<ModuleHandleT>
- addModule(std::shared_ptr<Module> M,
- std::shared_ptr<JITSymbolResolver> Resolver) {
- return ModuleList.insert(
- ModuleList.end(),
- llvm::make_unique<EmissionDeferredModule>(std::move(M),
- std::move(Resolver)));
+ /// Add the given module to the lazy emitting layer.
+ Error addModule(VModuleKey K, std::unique_ptr<Module> M) {
+ assert(!ModuleMap.count(K) && "VModuleKey K already in use");
+ ModuleMap[K] =
+ llvm::make_unique<EmissionDeferredModule>(std::move(K), std::move(M));
+ return Error::success();
}
- /// @brief Remove the module represented by the given handle.
+ /// Remove the module represented by the given handle.
///
/// This method will free the memory associated with the given module, both
/// in this layer, and the base layer.
- Error removeModule(ModuleHandleT H) {
- Error Err = (*H)->removeModuleFromBaseLayer(BaseLayer);
- ModuleList.erase(H);
- return Err;
+ Error removeModule(VModuleKey K) {
+ auto I = ModuleMap.find(K);
+ assert(I != ModuleMap.end() && "VModuleKey K not valid here");
+ auto EDM = std::move(I->second);
+ ModuleMap.erase(I);
+ return EDM->removeModuleFromBaseLayer(BaseLayer);
}
- /// @brief Search for the given named symbol.
+ /// Search for the given named symbol.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it exists.
@@ -243,26 +231,27 @@ public:
// If not found then search the deferred modules. If any of these contain a
// definition of 'Name' then they will return a JITSymbol that will emit
// the corresponding module when the symbol address is requested.
- for (auto &DeferredMod : ModuleList)
- if (auto Symbol = DeferredMod->find(Name, ExportedSymbolsOnly, BaseLayer))
+ for (auto &KV : ModuleMap)
+ if (auto Symbol = KV.second->find(Name, ExportedSymbolsOnly, BaseLayer))
return Symbol;
// If no definition found anywhere return a null symbol.
return nullptr;
}
- /// @brief Get the address of the given symbol in the context of the of
- /// compiled modules represented by the handle H.
- JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name,
+ /// Get the address of the given symbol in the context of the
+ /// compiled modules represented by the key K.
+ JITSymbol findSymbolIn(VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) {
- return (*H)->find(Name, ExportedSymbolsOnly, BaseLayer);
+ assert(ModuleMap.count(K) && "VModuleKey K not valid here");
+ return ModuleMap[K]->find(Name, ExportedSymbolsOnly, BaseLayer);
}
- /// @brief Immediately emit and finalize the module represented by the given
- /// handle.
- /// @param H Handle for module to emit/finalize.
- Error emitAndFinalize(ModuleHandleT H) {
- return (*H)->emitAndFinalize(BaseLayer);
+ /// Immediately emit and finalize the module represented by the given
+ /// key.
+ Error emitAndFinalize(VModuleKey K) {
+ assert(ModuleMap.count(K) && "VModuleKey K not valid here");
+ return ModuleMap[K]->emitAndFinalize(BaseLayer);
}
};
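A sketch of the updated, key-based interface of LazyEmittingLayer; the base layer type stands in for whatever the client already stacks underneath:

#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"

template <typename BaseLayerT>
llvm::Error addDeferred(llvm::orc::LazyEmittingLayer<BaseLayerT> &LazyLayer,
                        llvm::orc::VModuleKey K,
                        std::unique_ptr<llvm::Module> M) {
  // The module is recorded against K but is only passed down (and compiled)
  // when a symbol it defines is first requested via findSymbol/findSymbolIn.
  return LazyLayer.addModule(std::move(K), std::move(M));
}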
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h
new file mode 100644
index 000000000000..52c8c162ff0b
--- /dev/null
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h
@@ -0,0 +1,211 @@
+//===--- Legacy.h -- Adapters for ExecutionEngine API interop ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains core ORC APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_LEGACY_H
+#define LLVM_EXECUTIONENGINE_ORC_LEGACY_H
+
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+
+namespace llvm {
+namespace orc {
+
+/// SymbolResolver is a composable interface for looking up symbol flags
+/// and addresses using the AsynchronousSymbolQuery type. It will
+/// eventually replace the LegacyJITSymbolResolver interface as the
+/// standard ORC symbol resolver type.
+///
+/// FIXME: SymbolResolvers should go away and be replaced with VSOs with
+/// definition generators.
+class SymbolResolver {
+public:
+ virtual ~SymbolResolver() = default;
+
+ /// Returns the flags for each symbol in Symbols that can be found,
+ /// along with the set of symbol that could not be found.
+ virtual SymbolFlagsMap lookupFlags(const SymbolNameSet &Symbols) = 0;
+
+ /// For each symbol in Symbols that can be found, assigns that symbol's
+ /// value in Query. Returns the set of symbols that could not be found.
+ virtual SymbolNameSet lookup(std::shared_ptr<AsynchronousSymbolQuery> Query,
+ SymbolNameSet Symbols) = 0;
+
+private:
+ virtual void anchor();
+};
+
+/// Implements SymbolResolver with a pair of supplied function objects
+/// for convenience. See createSymbolResolver.
+template <typename LookupFlagsFn, typename LookupFn>
+class LambdaSymbolResolver final : public SymbolResolver {
+public:
+ template <typename LookupFlagsFnRef, typename LookupFnRef>
+ LambdaSymbolResolver(LookupFlagsFnRef &&LookupFlags, LookupFnRef &&Lookup)
+ : LookupFlags(std::forward<LookupFlagsFnRef>(LookupFlags)),
+ Lookup(std::forward<LookupFnRef>(Lookup)) {}
+
+ SymbolFlagsMap lookupFlags(const SymbolNameSet &Symbols) final {
+ return LookupFlags(Symbols);
+ }
+
+ SymbolNameSet lookup(std::shared_ptr<AsynchronousSymbolQuery> Query,
+ SymbolNameSet Symbols) final {
+ return Lookup(std::move(Query), std::move(Symbols));
+ }
+
+private:
+ LookupFlagsFn LookupFlags;
+ LookupFn Lookup;
+};
+
+/// Creates a SymbolResolver implementation from the pair of supplied
+/// function objects.
+template <typename LookupFlagsFn, typename LookupFn>
+std::unique_ptr<LambdaSymbolResolver<
+ typename std::remove_cv<
+ typename std::remove_reference<LookupFlagsFn>::type>::type,
+ typename std::remove_cv<
+ typename std::remove_reference<LookupFn>::type>::type>>
+createSymbolResolver(LookupFlagsFn &&LookupFlags, LookupFn &&Lookup) {
+ using LambdaSymbolResolverImpl = LambdaSymbolResolver<
+ typename std::remove_cv<
+ typename std::remove_reference<LookupFlagsFn>::type>::type,
+ typename std::remove_cv<
+ typename std::remove_reference<LookupFn>::type>::type>;
+ return llvm::make_unique<LambdaSymbolResolverImpl>(
+ std::forward<LookupFlagsFn>(LookupFlags), std::forward<LookupFn>(Lookup));
+}
+
+class JITSymbolResolverAdapter : public JITSymbolResolver {
+public:
+ JITSymbolResolverAdapter(ExecutionSession &ES, SymbolResolver &R,
+ MaterializationResponsibility *MR);
+ Expected<LookupFlagsResult> lookupFlags(const LookupSet &Symbols) override;
+ Expected<LookupResult> lookup(const LookupSet &Symbols) override;
+
+private:
+ ExecutionSession &ES;
+ std::set<SymbolStringPtr> ResolvedStrings;
+ SymbolResolver &R;
+ MaterializationResponsibility *MR;
+};
+
+/// Use the given legacy-style FindSymbol function (i.e. a function that
+/// takes a const std::string& or StringRef and returns a JITSymbol) to
+/// find the flags for each symbol in Symbols and store their flags in
+/// SymbolFlags. If any JITSymbol returned by FindSymbol is in an error
+/// state, the function returns immediately with that error; otherwise it
+/// returns the set of symbols not found.
+///
+/// Useful for implementing lookupFlags bodies that query legacy resolvers.
+template <typename FindSymbolFn>
+Expected<SymbolFlagsMap> lookupFlagsWithLegacyFn(const SymbolNameSet &Symbols,
+ FindSymbolFn FindSymbol) {
+ SymbolFlagsMap SymbolFlags;
+
+ for (auto &S : Symbols) {
+ if (JITSymbol Sym = FindSymbol(*S))
+ SymbolFlags[S] = Sym.getFlags();
+ else if (auto Err = Sym.takeError())
+ return std::move(Err);
+ }
+
+ return SymbolFlags;
+}
+
+/// Use the given legacy-style FindSymbol function (i.e. a function that
+/// takes a const std::string& or StringRef and returns a JITSymbol) to
+/// find the address and flags for each symbol in Symbols and store the
+/// result in Query. If any JITSymbol returned by FindSymbol is in an
+/// error state, then Query.notifyFailed(...) is called with that error and the
+/// function returns immediately. On success, returns the set of symbols
+/// not found.
+///
+/// Useful for implementing lookup bodies that query legacy resolvers.
+template <typename FindSymbolFn>
+SymbolNameSet
+lookupWithLegacyFn(ExecutionSession &ES, AsynchronousSymbolQuery &Query,
+ const SymbolNameSet &Symbols, FindSymbolFn FindSymbol) {
+ SymbolNameSet SymbolsNotFound;
+ bool NewSymbolsResolved = false;
+
+ for (auto &S : Symbols) {
+ if (JITSymbol Sym = FindSymbol(*S)) {
+ if (auto Addr = Sym.getAddress()) {
+ Query.resolve(S, JITEvaluatedSymbol(*Addr, Sym.getFlags()));
+ Query.notifySymbolReady();
+ NewSymbolsResolved = true;
+ } else {
+ ES.legacyFailQuery(Query, Addr.takeError());
+ return SymbolNameSet();
+ }
+ } else if (auto Err = Sym.takeError()) {
+ ES.legacyFailQuery(Query, std::move(Err));
+ return SymbolNameSet();
+ } else
+ SymbolsNotFound.insert(S);
+ }
+
+ if (NewSymbolsResolved && Query.isFullyResolved())
+ Query.handleFullyResolved();
+
+ if (NewSymbolsResolved && Query.isFullyReady())
+ Query.handleFullyReady();
+
+ return SymbolsNotFound;
+}
+
+/// An ORC SymbolResolver implementation that uses a legacy
+/// findSymbol-like function to perform lookup.
+template <typename LegacyLookupFn>
+class LegacyLookupFnResolver final : public SymbolResolver {
+public:
+ using ErrorReporter = std::function<void(Error)>;
+
+ LegacyLookupFnResolver(ExecutionSession &ES, LegacyLookupFn LegacyLookup,
+ ErrorReporter ReportError)
+ : ES(ES), LegacyLookup(std::move(LegacyLookup)),
+ ReportError(std::move(ReportError)) {}
+
+ SymbolFlagsMap lookupFlags(const SymbolNameSet &Symbols) final {
+ if (auto SymbolFlags = lookupFlagsWithLegacyFn(Symbols, LegacyLookup))
+ return std::move(*SymbolFlags);
+ else {
+ ReportError(SymbolFlags.takeError());
+ return SymbolFlagsMap();
+ }
+ }
+
+ SymbolNameSet lookup(std::shared_ptr<AsynchronousSymbolQuery> Query,
+ SymbolNameSet Symbols) final {
+ return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup);
+ }
+
+private:
+ ExecutionSession &ES;
+ LegacyLookupFn LegacyLookup;
+ ErrorReporter ReportError;
+};
+
+template <typename LegacyLookupFn>
+std::shared_ptr<LegacyLookupFnResolver<LegacyLookupFn>>
+createLegacyLookupResolver(ExecutionSession &ES, LegacyLookupFn LegacyLookup,
+ std::function<void(Error)> ErrorReporter) {
+ return std::make_shared<LegacyLookupFnResolver<LegacyLookupFn>>(
+ ES, std::move(LegacyLookup), std::move(ErrorReporter));
+}
+
+} // End namespace orc
+} // End namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_LEGACY_H
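A sketch of adapting an existing findSymbol-style lookup to the new SymbolResolver interface via createLegacyLookupResolver; the std::function wrapper for the legacy lookup is an assumption made for illustration:

#include "llvm/ExecutionEngine/Orc/Legacy.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <functional>

// Wrap a name-in/JITSymbol-out legacy lookup (for example, a call into an
// existing layer's findSymbol) as an orc::SymbolResolver.
inline std::shared_ptr<llvm::orc::SymbolResolver>
wrapLegacyLookup(llvm::orc::ExecutionSession &ES,
                 std::function<llvm::JITSymbol(llvm::StringRef)> LegacyFind) {
  return llvm::orc::createLegacyLookupResolver(
      ES,
      [LegacyFind](llvm::StringRef Name) -> llvm::JITSymbol {
        return LegacyFind(Name);
      },
      [](llvm::Error Err) {
        llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(),
                                    "legacy lookup failed: ");
      });
}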
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h
index 957b94912b3f..3dd3cfe05b8d 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h
@@ -15,14 +15,23 @@
#ifndef LLVM_EXECUTIONENGINE_ORC_NULLRESOLVER_H
#define LLVM_EXECUTIONENGINE_ORC_NULLRESOLVER_H
+#include "llvm/ExecutionEngine/Orc/Legacy.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
namespace llvm {
namespace orc {
+class NullResolver : public SymbolResolver {
+public:
+ SymbolFlagsMap lookupFlags(const SymbolNameSet &Symbols) override;
+
+ SymbolNameSet lookup(std::shared_ptr<AsynchronousSymbolQuery> Query,
+ SymbolNameSet Symbols) override;
+};
+
/// SymbolResolver implementation that rejects all resolution requests.
/// Useful for clients that have no cross-object fixups.
-class NullResolver : public JITSymbolResolver {
+class NullLegacyResolver : public LegacyJITSymbolResolver {
public:
JITSymbol findSymbol(const std::string &Name) final;
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
index cb47e7520b1a..c6b43a9c8ed6 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
@@ -15,6 +15,7 @@
#define LLVM_EXECUTIONENGINE_ORC_OBJECTTRANSFORMLAYER_H
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
#include <algorithm>
#include <memory>
#include <string>
@@ -22,7 +23,24 @@
namespace llvm {
namespace orc {
-/// @brief Object mutating layer.
+class ObjectTransformLayer2 : public ObjectLayer {
+public:
+ using TransformFunction =
+ std::function<Expected<std::unique_ptr<MemoryBuffer>>(
+ std::unique_ptr<MemoryBuffer>)>;
+
+ ObjectTransformLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
+ TransformFunction Transform);
+
+ void emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<MemoryBuffer> O) override;
+
+private:
+ ObjectLayer &BaseLayer;
+ TransformFunction Transform;
+};
+
+/// Object mutating layer.
///
/// This layer accepts sets of ObjectFiles (via addObject). It
/// immediately applies the user supplied functor to each object, then adds
@@ -30,29 +48,24 @@ namespace orc {
template <typename BaseLayerT, typename TransformFtor>
class ObjectTransformLayer {
public:
- /// @brief Handle to a set of added objects.
- using ObjHandleT = typename BaseLayerT::ObjHandleT;
-
- /// @brief Construct an ObjectTransformLayer with the given BaseLayer
+ /// Construct an ObjectTransformLayer with the given BaseLayer
ObjectTransformLayer(BaseLayerT &BaseLayer,
TransformFtor Transform = TransformFtor())
: BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
- /// @brief Apply the transform functor to each object in the object set, then
+ /// Apply the transform functor to each object in the object set, then
/// add the resulting set of objects to the base layer, along with the
/// memory manager and symbol resolver.
///
/// @return A handle for the added objects.
- template <typename ObjectPtr>
- Expected<ObjHandleT> addObject(ObjectPtr Obj,
- std::shared_ptr<JITSymbolResolver> Resolver) {
- return BaseLayer.addObject(Transform(std::move(Obj)), std::move(Resolver));
+ template <typename ObjectPtr> Error addObject(VModuleKey K, ObjectPtr Obj) {
+ return BaseLayer.addObject(std::move(K), Transform(std::move(Obj)));
}
- /// @brief Remove the object set associated with the handle H.
- Error removeObject(ObjHandleT H) { return BaseLayer.removeObject(H); }
+ /// Remove the object set associated with the VModuleKey K.
+ Error removeObject(VModuleKey K) { return BaseLayer.removeObject(K); }
- /// @brief Search for the given named symbol.
+ /// Search for the given named symbol.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it exists.
@@ -60,36 +73,34 @@ public:
return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
}
- /// @brief Get the address of the given symbol in the context of the set of
- /// objects represented by the handle H. This call is forwarded to the
- /// base layer's implementation.
- /// @param H The handle for the object set to search in.
+ /// Get the address of the given symbol in the context of the set of
+ /// objects represented by the VModuleKey K. This call is forwarded to
+ /// the base layer's implementation.
+ /// @param K The VModuleKey associated with the object set to search in.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it is found in the
/// given object set.
- JITSymbol findSymbolIn(ObjHandleT H, const std::string &Name,
+ JITSymbol findSymbolIn(VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) {
- return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly);
+ return BaseLayer.findSymbolIn(K, Name, ExportedSymbolsOnly);
}
- /// @brief Immediately emit and finalize the object set represented by the
- /// given handle.
- /// @param H Handle for object set to emit/finalize.
- Error emitAndFinalize(ObjHandleT H) {
- return BaseLayer.emitAndFinalize(H);
- }
+ /// Immediately emit and finalize the object set represented by the
+ /// given VModuleKey K.
+ Error emitAndFinalize(VModuleKey K) { return BaseLayer.emitAndFinalize(K); }
- /// @brief Map section addresses for the objects associated with the handle H.
- void mapSectionAddress(ObjHandleT H, const void *LocalAddress,
+ /// Map section addresses for the objects associated with the
+ /// VModuleKey K.
+ void mapSectionAddress(VModuleKey K, const void *LocalAddress,
JITTargetAddress TargetAddr) {
- BaseLayer.mapSectionAddress(H, LocalAddress, TargetAddr);
+ BaseLayer.mapSectionAddress(K, LocalAddress, TargetAddr);
}
- /// @brief Access the transform functor directly.
+ /// Access the transform functor directly.
TransformFtor &getTransform() { return Transform; }
- /// @brief Access the mumate functor directly.
+ /// Access the mutate functor directly.
const TransformFtor &getTransform() const { return Transform; }
private:
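
A hedged sketch of how the new ObjectTransformLayer2 introduced above could be wired up; BaseObjLayer is an assumed base ObjectLayer and the transform simply passes the buffer through, where a real client might instrument or rewrite it.

#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
#include "llvm/Support/MemoryBuffer.h"

using namespace llvm;
using namespace llvm::orc;

// Identity transform: receives the object buffer and returns it unchanged.
static Expected<std::unique_ptr<MemoryBuffer>>
identityTransform(std::unique_ptr<MemoryBuffer> Obj) {
  return std::move(Obj);
}

void buildTransformLayer(ExecutionSession &ES, ObjectLayer &BaseObjLayer) {
  // Objects emitted through TransformLayer are run through identityTransform
  // before being handed to BaseObjLayer.
  ObjectTransformLayer2 TransformLayer(ES, BaseObjLayer, identityTransform);
  (void)TransformLayer; // would be handed to a JIT stack in real use
}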
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h
index e1b55649b9f2..581c598aff62 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h
@@ -71,7 +71,7 @@ public:
}
};
-/// @brief Provide information about stub blocks generated by the
+/// Provide information about stub blocks generated by the
/// makeIndirectStubsBlock function.
template <unsigned StubSizeVal> class GenericIndirectStubsInfo {
public:
@@ -92,16 +92,16 @@ public:
return *this;
}
- /// @brief Number of stubs in this block.
+ /// Number of stubs in this block.
unsigned getNumStubs() const { return NumStubs; }
- /// @brief Get a pointer to the stub at the given index, which must be in
+ /// Get a pointer to the stub at the given index, which must be in
/// the range 0 .. getNumStubs() - 1.
void *getStub(unsigned Idx) const {
return static_cast<char *>(StubsMem.base()) + Idx * StubSize;
}
- /// @brief Get a pointer to the implementation-pointer at the given index,
+ /// Get a pointer to the implementation-pointer at the given index,
/// which must be in the range 0 .. getNumStubs() - 1.
void **getPtr(unsigned Idx) const {
char *PtrsBase = static_cast<char *>(StubsMem.base()) + NumStubs * StubSize;
@@ -124,18 +124,18 @@ public:
using JITReentryFn = JITTargetAddress (*)(void *CallbackMgr,
void *TrampolineId);
- /// @brief Write the resolver code into the given memory. The user is be
+ /// Write the resolver code into the given memory. The user is
/// responsible for allocating the memory and setting permissions.
static void writeResolverCode(uint8_t *ResolveMem, JITReentryFn Reentry,
void *CallbackMgr);
- /// @brief Write the requsted number of trampolines into the given memory,
+ /// Write the requested number of trampolines into the given memory,
/// which must be big enough to hold 1 pointer, plus NumTrampolines
/// trampolines.
static void writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
unsigned NumTrampolines);
- /// @brief Emit at least MinStubs worth of indirect call stubs, rounded out to
+ /// Emit at least MinStubs worth of indirect call stubs, rounded out to
/// the nearest page size.
///
/// E.g. Asking for 4 stubs on x86-64, where stubs are 8-bytes, with 4k
@@ -145,7 +145,7 @@ public:
unsigned MinStubs, void *InitialPtrVal);
};
-/// @brief X86_64 code that's common to all ABIs.
+/// X86_64 code that's common to all ABIs.
///
/// X86_64 supports lazy JITing.
class OrcX86_64_Base {
@@ -155,13 +155,13 @@ public:
using IndirectStubsInfo = GenericIndirectStubsInfo<8>;
- /// @brief Write the requsted number of trampolines into the given memory,
+ /// Write the requested number of trampolines into the given memory,
/// which must be big enough to hold 1 pointer, plus NumTrampolines
/// trampolines.
static void writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
unsigned NumTrampolines);
- /// @brief Emit at least MinStubs worth of indirect call stubs, rounded out to
+ /// Emit at least MinStubs worth of indirect call stubs, rounded out to
/// the nearest page size.
///
/// E.g. Asking for 4 stubs on x86-64, where stubs are 8-bytes, with 4k
@@ -171,7 +171,7 @@ public:
unsigned MinStubs, void *InitialPtrVal);
};
-/// @brief X86_64 support for SysV ABI (Linux, MacOSX).
+/// X86_64 support for SysV ABI (Linux, MacOSX).
///
/// X86_64_SysV supports lazy JITing.
class OrcX86_64_SysV : public OrcX86_64_Base {
@@ -181,13 +181,13 @@ public:
using JITReentryFn = JITTargetAddress (*)(void *CallbackMgr,
void *TrampolineId);
- /// @brief Write the resolver code into the given memory. The user is be
+ /// Write the resolver code into the given memory. The user is
/// responsible for allocating the memory and setting permissions.
static void writeResolverCode(uint8_t *ResolveMem, JITReentryFn Reentry,
void *CallbackMgr);
};
-/// @brief X86_64 support for Win32.
+/// X86_64 support for Win32.
///
/// X86_64_Win32 supports lazy JITing.
class OrcX86_64_Win32 : public OrcX86_64_Base {
@@ -197,13 +197,13 @@ public:
using JITReentryFn = JITTargetAddress (*)(void *CallbackMgr,
void *TrampolineId);
- /// @brief Write the resolver code into the given memory. The user is be
+ /// Write the resolver code into the given memory. The user is
/// responsible for allocating the memory and setting permissions.
static void writeResolverCode(uint8_t *ResolveMem, JITReentryFn Reentry,
void *CallbackMgr);
};
-/// @brief I386 support.
+/// I386 support.
///
/// I386 supports lazy JITing.
class OrcI386 {
@@ -217,18 +217,18 @@ public:
using JITReentryFn = JITTargetAddress (*)(void *CallbackMgr,
void *TrampolineId);
- /// @brief Write the resolver code into the given memory. The user is be
+ /// Write the resolver code into the given memory. The user is
/// responsible for allocating the memory and setting permissions.
static void writeResolverCode(uint8_t *ResolveMem, JITReentryFn Reentry,
void *CallbackMgr);
- /// @brief Write the requsted number of trampolines into the given memory,
+ /// Write the requested number of trampolines into the given memory,
/// which must be big enough to hold 1 pointer, plus NumTrampolines
/// trampolines.
static void writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
unsigned NumTrampolines);
- /// @brief Emit at least MinStubs worth of indirect call stubs, rounded out to
+ /// Emit at least MinStubs worth of indirect call stubs, rounded out to
/// the nearest page size.
///
/// E.g. Asking for 4 stubs on i386, where stubs are 8-bytes, with 4k
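The page-rounding rule described in the comments above, worked as a small constexpr sketch; the 4 KiB page size and 8-byte stub size are the values quoted in the x86-64 example, not fixed constants of the API.

#include <cstdint>

// emitIndirectStubsBlock rounds the request up to whole pages, so asking for
// 4 stubs with 8-byte stubs and 4096-byte pages yields a 512-stub block.
constexpr unsigned PageSize = 4096;
constexpr unsigned StubSize = 8;
constexpr unsigned MinStubs = 4;
constexpr unsigned Pages =
    (MinStubs * StubSize + PageSize - 1) / PageSize; // rounds up to 1 page
constexpr unsigned NumStubsEmitted = Pages * PageSize / StubSize; // 512
static_assert(NumStubsEmitted == 512, "one 4 KiB page of 8-byte stubs");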
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h
index e1ac87075ac0..dc60e8d74e97 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h
@@ -22,7 +22,9 @@ namespace orc {
enum class OrcErrorCode : int {
// RPC Errors
- JITSymbolNotFound = 1,
+ UnknownORCError = 1,
+ DuplicateDefinition,
+ JITSymbolNotFound,
RemoteAllocatorDoesNotExist,
RemoteAllocatorIdAlreadyInUse,
RemoteMProtectAddrUnrecognized,
@@ -39,6 +41,18 @@ enum class OrcErrorCode : int {
std::error_code orcError(OrcErrorCode ErrCode);
+class DuplicateDefinition : public ErrorInfo<DuplicateDefinition> {
+public:
+ static char ID;
+
+ DuplicateDefinition(std::string SymbolName);
+ std::error_code convertToErrorCode() const override;
+ void log(raw_ostream &OS) const override;
+ const std::string &getSymbolName() const;
+private:
+ std::string SymbolName;
+};
+
class JITSymbolNotFound : public ErrorInfo<JITSymbolNotFound> {
public:
static char ID;
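
A hedged sketch of producing and consuming the new DuplicateDefinition error declared above; defineSymbol and reportDefinitionError are made-up helpers, while make_error, handleErrors and logAllUnhandledErrors are standard LLVM Error APIs.

#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/Orc/OrcError.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::orc;

// Hypothetical producer: reject a second definition of the same name.
static Error defineSymbol(StringRef Name, bool AlreadyDefined) {
  if (AlreadyDefined)
    return make_error<DuplicateDefinition>(Name.str());
  return Error::success();
}

static void reportDefinitionError(Error Err) {
  // Handle DuplicateDefinition specifically, log anything else generically.
  Err = handleErrors(std::move(Err), [](const DuplicateDefinition &D) {
    errs() << "duplicate symbol: " << D.getSymbolName() << "\n";
  });
  logAllUnhandledErrors(std::move(Err), errs(), "orc error: ");
}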
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
index 7179e5ff66fd..739e5ba47c12 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
@@ -63,7 +63,7 @@ public:
public:
~RemoteRTDyldMemoryManager() {
Client.destroyRemoteAllocator(Id);
- DEBUG(dbgs() << "Destroyed remote allocator " << Id << "\n");
+ LLVM_DEBUG(dbgs() << "Destroyed remote allocator " << Id << "\n");
}
RemoteRTDyldMemoryManager(const RemoteRTDyldMemoryManager &) = delete;
@@ -79,9 +79,9 @@ public:
Unmapped.back().CodeAllocs.emplace_back(Size, Alignment);
uint8_t *Alloc = reinterpret_cast<uint8_t *>(
Unmapped.back().CodeAllocs.back().getLocalAddress());
- DEBUG(dbgs() << "Allocator " << Id << " allocated code for "
- << SectionName << ": " << Alloc << " (" << Size
- << " bytes, alignment " << Alignment << ")\n");
+ LLVM_DEBUG(dbgs() << "Allocator " << Id << " allocated code for "
+ << SectionName << ": " << Alloc << " (" << Size
+ << " bytes, alignment " << Alignment << ")\n");
return Alloc;
}
@@ -92,18 +92,18 @@ public:
Unmapped.back().RODataAllocs.emplace_back(Size, Alignment);
uint8_t *Alloc = reinterpret_cast<uint8_t *>(
Unmapped.back().RODataAllocs.back().getLocalAddress());
- DEBUG(dbgs() << "Allocator " << Id << " allocated ro-data for "
- << SectionName << ": " << Alloc << " (" << Size
- << " bytes, alignment " << Alignment << ")\n");
+ LLVM_DEBUG(dbgs() << "Allocator " << Id << " allocated ro-data for "
+ << SectionName << ": " << Alloc << " (" << Size
+ << " bytes, alignment " << Alignment << ")\n");
return Alloc;
} // else...
Unmapped.back().RWDataAllocs.emplace_back(Size, Alignment);
uint8_t *Alloc = reinterpret_cast<uint8_t *>(
Unmapped.back().RWDataAllocs.back().getLocalAddress());
- DEBUG(dbgs() << "Allocator " << Id << " allocated rw-data for "
- << SectionName << ": " << Alloc << " (" << Size
- << " bytes, alignment " << Alignment << ")\n");
+ LLVM_DEBUG(dbgs() << "Allocator " << Id << " allocated rw-data for "
+ << SectionName << ": " << Alloc << " (" << Size
+ << " bytes, alignment " << Alignment << ")\n");
return Alloc;
}
@@ -113,36 +113,36 @@ public:
uint32_t RWDataAlign) override {
Unmapped.push_back(ObjectAllocs());
- DEBUG(dbgs() << "Allocator " << Id << " reserved:\n");
+ LLVM_DEBUG(dbgs() << "Allocator " << Id << " reserved:\n");
if (CodeSize != 0) {
Unmapped.back().RemoteCodeAddr =
Client.reserveMem(Id, CodeSize, CodeAlign);
- DEBUG(dbgs() << " code: "
- << format("0x%016x", Unmapped.back().RemoteCodeAddr)
- << " (" << CodeSize << " bytes, alignment " << CodeAlign
- << ")\n");
+ LLVM_DEBUG(dbgs() << " code: "
+ << format("0x%016x", Unmapped.back().RemoteCodeAddr)
+ << " (" << CodeSize << " bytes, alignment "
+ << CodeAlign << ")\n");
}
if (RODataSize != 0) {
Unmapped.back().RemoteRODataAddr =
Client.reserveMem(Id, RODataSize, RODataAlign);
- DEBUG(dbgs() << " ro-data: "
- << format("0x%016x", Unmapped.back().RemoteRODataAddr)
- << " (" << RODataSize << " bytes, alignment "
- << RODataAlign << ")\n");
+ LLVM_DEBUG(dbgs() << " ro-data: "
+ << format("0x%016x", Unmapped.back().RemoteRODataAddr)
+ << " (" << RODataSize << " bytes, alignment "
+ << RODataAlign << ")\n");
}
if (RWDataSize != 0) {
Unmapped.back().RemoteRWDataAddr =
Client.reserveMem(Id, RWDataSize, RWDataAlign);
- DEBUG(dbgs() << " rw-data: "
- << format("0x%016x", Unmapped.back().RemoteRWDataAddr)
- << " (" << RWDataSize << " bytes, alignment "
- << RWDataAlign << ")\n");
+ LLVM_DEBUG(dbgs() << " rw-data: "
+ << format("0x%016x", Unmapped.back().RemoteRWDataAddr)
+ << " (" << RWDataSize << " bytes, alignment "
+ << RWDataAlign << ")\n");
}
}
@@ -162,7 +162,7 @@ public:
void notifyObjectLoaded(RuntimeDyld &Dyld,
const object::ObjectFile &Obj) override {
- DEBUG(dbgs() << "Allocator " << Id << " applied mappings:\n");
+ LLVM_DEBUG(dbgs() << "Allocator " << Id << " applied mappings:\n");
for (auto &ObjAllocs : Unmapped) {
mapAllocsToRemoteAddrs(Dyld, ObjAllocs.CodeAllocs,
ObjAllocs.RemoteCodeAddr);
@@ -176,7 +176,7 @@ public:
}
bool finalizeMemory(std::string *ErrMsg = nullptr) override {
- DEBUG(dbgs() << "Allocator " << Id << " finalizing:\n");
+ LLVM_DEBUG(dbgs() << "Allocator " << Id << " finalizing:\n");
for (auto &ObjAllocs : Unfinalized) {
if (copyAndProtect(ObjAllocs.CodeAllocs, ObjAllocs.RemoteCodeAddr,
@@ -261,7 +261,7 @@ public:
RemoteRTDyldMemoryManager(OrcRemoteTargetClient &Client,
ResourceIdMgr::ResourceId Id)
: Client(Client), Id(Id) {
- DEBUG(dbgs() << "Created remote allocator " << Id << "\n");
+ LLVM_DEBUG(dbgs() << "Created remote allocator " << Id << "\n");
}
// Maps all allocations in Allocs to aligned blocks
@@ -270,8 +270,9 @@ public:
for (auto &Alloc : Allocs) {
NextAddr = alignTo(NextAddr, Alloc.getAlign());
Dyld.mapSectionAddress(Alloc.getLocalAddress(), NextAddr);
- DEBUG(dbgs() << " " << static_cast<void *>(Alloc.getLocalAddress())
- << " -> " << format("0x%016x", NextAddr) << "\n");
+ LLVM_DEBUG(dbgs() << " "
+ << static_cast<void *>(Alloc.getLocalAddress())
+ << " -> " << format("0x%016x", NextAddr) << "\n");
Alloc.setRemoteAddress(NextAddr);
// Only advance NextAddr if it was non-null to begin with,
@@ -290,22 +291,23 @@ public:
assert(!Allocs.empty() && "No sections in allocated segment");
for (auto &Alloc : Allocs) {
- DEBUG(dbgs() << " copying section: "
- << static_cast<void *>(Alloc.getLocalAddress()) << " -> "
- << format("0x%016x", Alloc.getRemoteAddress()) << " ("
- << Alloc.getSize() << " bytes)\n";);
+ LLVM_DEBUG(dbgs() << " copying section: "
+ << static_cast<void *>(Alloc.getLocalAddress())
+ << " -> "
+ << format("0x%016x", Alloc.getRemoteAddress())
+ << " (" << Alloc.getSize() << " bytes)\n";);
if (Client.writeMem(Alloc.getRemoteAddress(), Alloc.getLocalAddress(),
Alloc.getSize()))
return true;
}
- DEBUG(dbgs() << " setting "
- << (Permissions & sys::Memory::MF_READ ? 'R' : '-')
- << (Permissions & sys::Memory::MF_WRITE ? 'W' : '-')
- << (Permissions & sys::Memory::MF_EXEC ? 'X' : '-')
- << " permissions on block: "
- << format("0x%016x", RemoteSegmentAddr) << "\n");
+ LLVM_DEBUG(dbgs() << " setting "
+ << (Permissions & sys::Memory::MF_READ ? 'R' : '-')
+ << (Permissions & sys::Memory::MF_WRITE ? 'W' : '-')
+ << (Permissions & sys::Memory::MF_EXEC ? 'X' : '-')
+ << " permissions on block: "
+ << format("0x%016x", RemoteSegmentAddr) << "\n");
if (Client.setProtections(Id, RemoteSegmentAddr, Permissions))
return true;
}
@@ -356,25 +358,25 @@ public:
return Error::success();
}
- JITSymbol findStub(StringRef Name, bool ExportedStubsOnly) override {
+ JITEvaluatedSymbol findStub(StringRef Name, bool ExportedStubsOnly) override {
auto I = StubIndexes.find(Name);
if (I == StubIndexes.end())
return nullptr;
auto Key = I->second.first;
auto Flags = I->second.second;
- auto StubSymbol = JITSymbol(getStubAddr(Key), Flags);
+ auto StubSymbol = JITEvaluatedSymbol(getStubAddr(Key), Flags);
if (ExportedStubsOnly && !StubSymbol.getFlags().isExported())
return nullptr;
return StubSymbol;
}
- JITSymbol findPointer(StringRef Name) override {
+ JITEvaluatedSymbol findPointer(StringRef Name) override {
auto I = StubIndexes.find(Name);
if (I == StubIndexes.end())
return nullptr;
auto Key = I->second.first;
auto Flags = I->second.second;
- return JITSymbol(getPtrAddr(Key), Flags);
+ return JITEvaluatedSymbol(getPtrAddr(Key), Flags);
}
Error updatePointer(StringRef Name, JITTargetAddress NewAddr) override {
@@ -449,8 +451,9 @@ public:
class RemoteCompileCallbackManager : public JITCompileCallbackManager {
public:
RemoteCompileCallbackManager(OrcRemoteTargetClient &Client,
+ ExecutionSession &ES,
JITTargetAddress ErrorHandlerAddress)
- : JITCompileCallbackManager(ErrorHandlerAddress), Client(Client) {}
+ : JITCompileCallbackManager(ES, ErrorHandlerAddress), Client(Client) {}
private:
Error grow() override {
@@ -475,10 +478,10 @@ public:
/// Channel is the ChannelT instance to communicate on. It is assumed that
/// the channel is ready to be read from and written to.
static Expected<std::unique_ptr<OrcRemoteTargetClient>>
- Create(rpc::RawByteChannel &Channel, std::function<void(Error)> ReportError) {
+ Create(rpc::RawByteChannel &Channel, ExecutionSession &ES) {
Error Err = Error::success();
auto Client = std::unique_ptr<OrcRemoteTargetClient>(
- new OrcRemoteTargetClient(Channel, std::move(ReportError), Err));
+ new OrcRemoteTargetClient(Channel, ES, Err));
if (Err)
return std::move(Err);
return std::move(Client);
@@ -487,7 +490,8 @@ public:
/// Call the int(void) function at the given address in the target and return
/// its result.
Expected<int> callIntVoid(JITTargetAddress Addr) {
- DEBUG(dbgs() << "Calling int(*)(void) " << format("0x%016x", Addr) << "\n");
+ LLVM_DEBUG(dbgs() << "Calling int(*)(void) " << format("0x%016x", Addr)
+ << "\n");
return callB<exec::CallIntVoid>(Addr);
}
@@ -495,16 +499,16 @@ public:
/// return its result.
Expected<int> callMain(JITTargetAddress Addr,
const std::vector<std::string> &Args) {
- DEBUG(dbgs() << "Calling int(*)(int, char*[]) " << format("0x%016x", Addr)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Calling int(*)(int, char*[]) "
+ << format("0x%016x", Addr) << "\n");
return callB<exec::CallMain>(Addr, Args);
}
/// Call the void() function at the given address in the target and wait for
/// it to finish.
Error callVoidVoid(JITTargetAddress Addr) {
- DEBUG(dbgs() << "Calling void(*)(void) " << format("0x%016x", Addr)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Calling void(*)(void) " << format("0x%016x", Addr)
+ << "\n");
return callB<exec::CallVoidVoid>(Addr);
}
@@ -531,12 +535,14 @@ public:
Expected<RemoteCompileCallbackManager &>
enableCompileCallbacks(JITTargetAddress ErrorHandlerAddress) {
+ assert(!CallbackManager && "CallbackManager already obtained");
+
// Emit the resolver block on the JIT server.
if (auto Err = callB<stubs::EmitResolverBlock>())
return std::move(Err);
// Create the callback manager.
- CallbackManager.emplace(*this, ErrorHandlerAddress);
+ CallbackManager.emplace(*this, ES, ErrorHandlerAddress);
RemoteCompileCallbackManager &Mgr = *CallbackManager;
return Mgr;
}
@@ -554,10 +560,10 @@ public:
Error terminateSession() { return callB<utils::TerminateSession>(); }
private:
- OrcRemoteTargetClient(rpc::RawByteChannel &Channel,
- std::function<void(Error)> ReportError, Error &Err)
+ OrcRemoteTargetClient(rpc::RawByteChannel &Channel, ExecutionSession &ES,
+ Error &Err)
: rpc::SingleThreadedRPCEndpoint<rpc::RawByteChannel>(Channel, true),
- ReportError(std::move(ReportError)) {
+ ES(ES) {
ErrorAsOutParameter EAO(&Err);
addHandler<utils::RequestCompile>(
@@ -577,7 +583,7 @@ private:
void deregisterEHFrames(JITTargetAddress Addr, uint32_t Size) {
if (auto Err = callB<eh::RegisterEHFrames>(Addr, Size))
- ReportError(std::move(Err));
+ ES.reportError(std::move(Err));
}
void destroyRemoteAllocator(ResourceIdMgr::ResourceId Id) {
@@ -592,7 +598,7 @@ private:
void destroyIndirectStubsManager(ResourceIdMgr::ResourceId Id) {
IndirectStubOwnerIds.release(Id);
if (auto Err = callB<stubs::DestroyIndirectStubsOwner>(Id))
- ReportError(std::move(Err));
+ ES.reportError(std::move(Err));
}
Expected<std::tuple<JITTargetAddress, JITTargetAddress, uint32_t>>
@@ -625,7 +631,7 @@ private:
if (auto AddrOrErr = callB<mem::ReserveMem>(Id, Size, Align))
return *AddrOrErr;
else {
- ReportError(AddrOrErr.takeError());
+ ES.reportError(AddrOrErr.takeError());
return 0;
}
}
@@ -633,7 +639,7 @@ private:
bool setProtections(ResourceIdMgr::ResourceId Id,
JITTargetAddress RemoteSegAddr, unsigned ProtFlags) {
if (auto Err = callB<mem::SetProtections>(Id, RemoteSegAddr, ProtFlags)) {
- ReportError(std::move(Err));
+ ES.reportError(std::move(Err));
return true;
} else
return false;
@@ -641,7 +647,7 @@ private:
bool writeMem(JITTargetAddress Addr, const char *Src, uint64_t Size) {
if (auto Err = callB<mem::WriteMem>(DirectBufferWriter(Src, Addr, Size))) {
- ReportError(std::move(Err));
+ ES.reportError(std::move(Err));
return true;
} else
return false;
@@ -653,6 +659,7 @@ private:
static Error doNothing() { return Error::success(); }
+ ExecutionSession &ES;
std::function<void(Error)> ReportError;
std::string RemoteTargetTriple;
uint32_t RemotePointerSize = 0;
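
The DEBUG to LLVM_DEBUG rename throughout this file is mechanical; a minimal sketch of the new spelling follows, assuming a DEBUG_TYPE string chosen purely for illustration.

#define DEBUG_TYPE "orc-remote"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

void traceCall(uint64_t Addr) {
  // Emitted only in asserts-enabled builds when -debug (or
  // -debug-only=orc-remote) is passed; compiled out otherwise.
  LLVM_DEBUG(llvm::dbgs() << "Calling " << llvm::format("0x%016x", Addr)
                          << "\n");
}
#undef DEBUG_TYPE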
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
index cf419d33004c..acbc1682fa5d 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
@@ -161,9 +161,9 @@ private:
IntVoidFnTy Fn =
reinterpret_cast<IntVoidFnTy>(static_cast<uintptr_t>(Addr));
- DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
+ LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
int Result = Fn();
- DEBUG(dbgs() << " Result = " << Result << "\n");
+ LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
}
@@ -180,15 +180,13 @@ private:
for (auto &Arg : Args)
ArgV[Idx++] = Arg.c_str();
ArgV[ArgC] = 0;
- DEBUG(
- for (int Idx = 0; Idx < ArgC; ++Idx) {
- llvm::dbgs() << "Arg " << Idx << ": " << ArgV[Idx] << "\n";
- }
- );
+ LLVM_DEBUG(for (int Idx = 0; Idx < ArgC; ++Idx) {
+ llvm::dbgs() << "Arg " << Idx << ": " << ArgV[Idx] << "\n";
+ });
- DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
+ LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
int Result = Fn(ArgC, ArgV.get());
- DEBUG(dbgs() << " Result = " << Result << "\n");
+ LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
}
@@ -199,9 +197,9 @@ private:
VoidVoidFnTy Fn =
reinterpret_cast<VoidVoidFnTy>(static_cast<uintptr_t>(Addr));
- DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
+ LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
Fn();
- DEBUG(dbgs() << " Complete.\n");
+ LLVM_DEBUG(dbgs() << " Complete.\n");
return Error::success();
}
@@ -211,7 +209,7 @@ private:
if (I != Allocators.end())
return errorCodeToError(
orcError(OrcErrorCode::RemoteAllocatorIdAlreadyInUse));
- DEBUG(dbgs() << " Created allocator " << Id << "\n");
+ LLVM_DEBUG(dbgs() << " Created allocator " << Id << "\n");
Allocators[Id] = Allocator();
return Error::success();
}
@@ -221,15 +219,16 @@ private:
if (I != IndirectStubsOwners.end())
return errorCodeToError(
orcError(OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse));
- DEBUG(dbgs() << " Create indirect stubs owner " << Id << "\n");
+ LLVM_DEBUG(dbgs() << " Create indirect stubs owner " << Id << "\n");
IndirectStubsOwners[Id] = ISBlockOwnerList();
return Error::success();
}
Error handleDeregisterEHFrames(JITTargetAddress TAddr, uint32_t Size) {
uint8_t *Addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(TAddr));
- DEBUG(dbgs() << " Registering EH frames at " << format("0x%016x", TAddr)
- << ", Size = " << Size << " bytes\n");
+ LLVM_DEBUG(dbgs() << " Registering EH frames at "
+ << format("0x%016x", TAddr) << ", Size = " << Size
+ << " bytes\n");
EHFramesDeregister(Addr, Size);
return Error::success();
}
@@ -240,7 +239,7 @@ private:
return errorCodeToError(
orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
Allocators.erase(I);
- DEBUG(dbgs() << " Destroyed allocator " << Id << "\n");
+ LLVM_DEBUG(dbgs() << " Destroyed allocator " << Id << "\n");
return Error::success();
}
@@ -256,8 +255,8 @@ private:
Expected<std::tuple<JITTargetAddress, JITTargetAddress, uint32_t>>
handleEmitIndirectStubs(ResourceIdMgr::ResourceId Id,
uint32_t NumStubsRequired) {
- DEBUG(dbgs() << " ISMgr " << Id << " request " << NumStubsRequired
- << " stubs.\n");
+ LLVM_DEBUG(dbgs() << " ISMgr " << Id << " request " << NumStubsRequired
+ << " stubs.\n");
auto StubOwnerItr = IndirectStubsOwners.find(Id);
if (StubOwnerItr == IndirectStubsOwners.end())
@@ -328,8 +327,8 @@ private:
Expected<JITTargetAddress> handleGetSymbolAddress(const std::string &Name) {
JITTargetAddress Addr = SymbolLookup(Name);
- DEBUG(dbgs() << " Symbol '" << Name << "' = " << format("0x%016x", Addr)
- << "\n");
+ LLVM_DEBUG(dbgs() << " Symbol '" << Name
+ << "' = " << format("0x%016x", Addr) << "\n");
return Addr;
}
@@ -340,12 +339,13 @@ private:
uint32_t PageSize = sys::Process::getPageSize();
uint32_t TrampolineSize = TargetT::TrampolineSize;
uint32_t IndirectStubSize = TargetT::IndirectStubsInfo::StubSize;
- DEBUG(dbgs() << " Remote info:\n"
- << " triple = '" << ProcessTriple << "'\n"
- << " pointer size = " << PointerSize << "\n"
- << " page size = " << PageSize << "\n"
- << " trampoline size = " << TrampolineSize << "\n"
- << " indirect stub size = " << IndirectStubSize << "\n");
+ LLVM_DEBUG(dbgs() << " Remote info:\n"
+ << " triple = '" << ProcessTriple << "'\n"
+ << " pointer size = " << PointerSize << "\n"
+ << " page size = " << PageSize << "\n"
+ << " trampoline size = " << TrampolineSize << "\n"
+ << " indirect stub size = " << IndirectStubSize
+ << "\n");
return std::make_tuple(ProcessTriple, PointerSize, PageSize, TrampolineSize,
IndirectStubSize);
}
@@ -354,8 +354,8 @@ private:
uint64_t Size) {
uint8_t *Src = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(RSrc));
- DEBUG(dbgs() << " Reading " << Size << " bytes from "
- << format("0x%016x", RSrc) << "\n");
+ LLVM_DEBUG(dbgs() << " Reading " << Size << " bytes from "
+ << format("0x%016x", RSrc) << "\n");
std::vector<uint8_t> Buffer;
Buffer.resize(Size);
@@ -367,8 +367,9 @@ private:
Error handleRegisterEHFrames(JITTargetAddress TAddr, uint32_t Size) {
uint8_t *Addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(TAddr));
- DEBUG(dbgs() << " Registering EH frames at " << format("0x%016x", TAddr)
- << ", Size = " << Size << " bytes\n");
+ LLVM_DEBUG(dbgs() << " Registering EH frames at "
+ << format("0x%016x", TAddr) << ", Size = " << Size
+ << " bytes\n");
EHFramesRegister(Addr, Size);
return Error::success();
}
@@ -384,8 +385,9 @@ private:
if (auto Err = Allocator.allocate(LocalAllocAddr, Size, Align))
return std::move(Err);
- DEBUG(dbgs() << " Allocator " << Id << " reserved " << LocalAllocAddr
- << " (" << Size << " bytes, alignment " << Align << ")\n");
+ LLVM_DEBUG(dbgs() << " Allocator " << Id << " reserved " << LocalAllocAddr
+ << " (" << Size << " bytes, alignment " << Align
+ << ")\n");
JITTargetAddress AllocAddr = static_cast<JITTargetAddress>(
reinterpret_cast<uintptr_t>(LocalAllocAddr));
@@ -401,10 +403,11 @@ private:
orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
auto &Allocator = I->second;
void *LocalAddr = reinterpret_cast<void *>(static_cast<uintptr_t>(Addr));
- DEBUG(dbgs() << " Allocator " << Id << " set permissions on " << LocalAddr
- << " to " << (Flags & sys::Memory::MF_READ ? 'R' : '-')
- << (Flags & sys::Memory::MF_WRITE ? 'W' : '-')
- << (Flags & sys::Memory::MF_EXEC ? 'X' : '-') << "\n");
+ LLVM_DEBUG(dbgs() << " Allocator " << Id << " set permissions on "
+ << LocalAddr << " to "
+ << (Flags & sys::Memory::MF_READ ? 'R' : '-')
+ << (Flags & sys::Memory::MF_WRITE ? 'W' : '-')
+ << (Flags & sys::Memory::MF_EXEC ? 'X' : '-') << "\n");
return Allocator.setProtections(LocalAddr, Flags);
}
@@ -414,14 +417,14 @@ private:
}
Error handleWriteMem(DirectBufferWriter DBW) {
- DEBUG(dbgs() << " Writing " << DBW.getSize() << " bytes to "
- << format("0x%016x", DBW.getDst()) << "\n");
+ LLVM_DEBUG(dbgs() << " Writing " << DBW.getSize() << " bytes to "
+ << format("0x%016x", DBW.getDst()) << "\n");
return Error::success();
}
Error handleWritePtr(JITTargetAddress Addr, JITTargetAddress PtrVal) {
- DEBUG(dbgs() << " Writing pointer *" << format("0x%016x", Addr) << " = "
- << format("0x%016x", PtrVal) << "\n");
+ LLVM_DEBUG(dbgs() << " Writing pointer *" << format("0x%016x", Addr)
+ << " = " << format("0x%016x", PtrVal) << "\n");
uintptr_t *Ptr =
reinterpret_cast<uintptr_t *>(static_cast<uintptr_t>(Addr));
*Ptr = static_cast<uintptr_t>(PtrVal);
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/RPCUtils.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/RPCUtils.h
index c278cb176853..47bd90bb1bad 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/RPCUtils.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/RPCUtils.h
@@ -1631,7 +1631,7 @@ RPCAsyncDispatch<RPCEndpointT, Func> rpcAsyncDispatch(RPCEndpointT &Endpoint) {
return RPCAsyncDispatch<RPCEndpointT, Func>(Endpoint);
}
-/// \brief Allows a set of asynchrounous calls to be dispatched, and then
+/// Allows a set of asynchronous calls to be dispatched, and then
/// waited on as a group.
class ParallelCallGroup {
public:
@@ -1640,7 +1640,7 @@ public:
ParallelCallGroup(const ParallelCallGroup &) = delete;
ParallelCallGroup &operator=(const ParallelCallGroup &) = delete;
- /// \brief Make as asynchronous call.
+ /// Make an asynchronous call.
template <typename AsyncDispatcher, typename HandlerT, typename... ArgTs>
Error call(const AsyncDispatcher &AsyncDispatch, HandlerT Handler,
const ArgTs &... Args) {
@@ -1669,7 +1669,7 @@ public:
return AsyncDispatch(std::move(WrappedHandler), Args...);
}
- /// \brief Blocks until all calls have been completed and their return value
+ /// Blocks until all calls have been completed and their return value
/// handlers run.
void wait() {
std::unique_lock<std::mutex> Lock(M);
@@ -1683,21 +1683,21 @@ private:
uint32_t NumOutstandingCalls = 0;
};
-/// @brief Convenience class for grouping RPC Functions into APIs that can be
+/// Convenience class for grouping RPC Functions into APIs that can be
/// negotiated as a block.
///
template <typename... Funcs>
class APICalls {
public:
- /// @brief Test whether this API contains Function F.
+ /// Test whether this API contains Function F.
template <typename F>
class Contains {
public:
static const bool value = false;
};
- /// @brief Negotiate all functions in this API.
+ /// Negotiate all functions in this API.
template <typename RPCEndpoint>
static Error negotiate(RPCEndpoint &R) {
return Error::success();
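
A hedged sketch of the ParallelCallGroup usage pattern documented above; SomeRPCFunc and EndpointT are placeholders for a real rpc::Function and endpoint, and the handler signature assumes the function returns an int32_t result.

#include "llvm/ExecutionEngine/Orc/RPCUtils.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::orc::rpc;

// Dispatch the same call twice asynchronously, then wait for both handlers.
template <typename SomeRPCFunc, typename EndpointT>
Error callTwoInParallel(EndpointT &Endpoint) {
  ParallelCallGroup PCG;
  auto Dispatch = rpcAsyncDispatch<SomeRPCFunc>(Endpoint);
  auto Handler = [](Expected<int32_t> R) -> Error {
    if (!R)
      return R.takeError();
    errs() << "remote returned " << *R << "\n";
    return Error::success();
  };
  if (auto Err = PCG.call(Dispatch, Handler, 1))
    return Err;
  if (auto Err = PCG.call(Dispatch, Handler, 2))
    return Err;
  PCG.wait(); // block until both return-value handlers have run
  return Error::success();
}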
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
index 246c57341f35..48b3f7a58ed7 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
@@ -18,6 +18,9 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
+#include "llvm/ExecutionEngine/Orc/Legacy.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
@@ -33,15 +36,62 @@
namespace llvm {
namespace orc {
-class RTDyldObjectLinkingLayerBase {
+class RTDyldObjectLinkingLayer2 : public ObjectLayer {
public:
+ /// Functor for receiving object-loaded notifications.
+ using NotifyLoadedFunction =
+ std::function<void(VModuleKey, const object::ObjectFile &Obj,
+ const RuntimeDyld::LoadedObjectInfo &)>;
+
+ /// Functor for receiving finalization notifications.
+ using NotifyFinalizedFunction = std::function<void(VModuleKey)>;
- using ObjectPtr =
- std::shared_ptr<object::OwningBinary<object::ObjectFile>>;
+ using GetMemoryManagerFunction =
+ std::function<std::shared_ptr<RuntimeDyld::MemoryManager>(VModuleKey)>;
+
+ /// Construct an ObjectLinkingLayer with the given NotifyLoaded,
+ /// and NotifyFinalized functors.
+ RTDyldObjectLinkingLayer2(
+ ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager,
+ NotifyLoadedFunction NotifyLoaded = NotifyLoadedFunction(),
+ NotifyFinalizedFunction NotifyFinalized = NotifyFinalizedFunction());
+
+ /// Emit the object.
+ void emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<MemoryBuffer> O) override;
+
+ /// Map section addresses for the object associated with the
+ /// VModuleKey K.
+ void mapSectionAddress(VModuleKey K, const void *LocalAddress,
+ JITTargetAddress TargetAddr) const;
+
+ /// Set the 'ProcessAllSections' flag.
+ ///
+ /// If set to true, all sections in each object file will be allocated using
+ /// the memory manager, rather than just the sections required for execution.
+ ///
+ /// This is kludgy, and may be removed in the future.
+ void setProcessAllSections(bool ProcessAllSections) {
+ this->ProcessAllSections = ProcessAllSections;
+ }
+
+private:
+ mutable std::mutex RTDyldLayerMutex;
+ GetMemoryManagerFunction GetMemoryManager;
+ NotifyLoadedFunction NotifyLoaded;
+ NotifyFinalizedFunction NotifyFinalized;
+ bool ProcessAllSections;
+ std::map<VModuleKey, RuntimeDyld *> ActiveRTDylds;
+ std::map<VModuleKey, std::shared_ptr<RuntimeDyld::MemoryManager>> MemMgrs;
+};
+
+class RTDyldObjectLinkingLayerBase {
+public:
+ using ObjectPtr = std::unique_ptr<MemoryBuffer>;
protected:
- /// @brief Holds an object to be allocated/linked as a unit in the JIT.
+ /// Holds an object to be allocated/linked as a unit in the JIT.
///
/// An instance of this class will be created for each object added
/// via JITObjectLayer::addObject. Deleting the instance (via
@@ -55,7 +105,7 @@ protected:
void operator=(const LinkedObject&) = delete;
virtual ~LinkedObject() = default;
- virtual void finalize() = 0;
+ virtual Error finalize() = 0;
virtual JITSymbol::GetAddressFtor
getSymbolMaterializer(std::string Name) = 0;
@@ -79,15 +129,9 @@ protected:
StringMap<JITEvaluatedSymbol> SymbolTable;
bool Finalized = false;
};
-
- using LinkedObjectListT = std::list<std::unique_ptr<LinkedObject>>;
-
-public:
- /// @brief Handle to a loaded object.
- using ObjHandleT = LinkedObjectListT::iterator;
};
-/// @brief Bare bones object linking layer.
+/// Bare bones object linking layer.
///
/// This class is intended to be used as the base layer for a JIT. It allows
/// object files to be loaded into memory, linked, and the addresses of their
@@ -98,67 +142,93 @@ public:
using RTDyldObjectLinkingLayerBase::ObjectPtr;
- /// @brief Functor for receiving object-loaded notifications.
+ /// Functor for receiving object-loaded notifications.
using NotifyLoadedFtor =
- std::function<void(ObjHandleT, const ObjectPtr &Obj,
- const RuntimeDyld::LoadedObjectInfo &)>;
+ std::function<void(VModuleKey, const object::ObjectFile &Obj,
+ const RuntimeDyld::LoadedObjectInfo &)>;
- /// @brief Functor for receiving finalization notifications.
- using NotifyFinalizedFtor = std::function<void(ObjHandleT)>;
+ /// Functor for receiving finalization notifications.
+ using NotifyFinalizedFtor =
+ std::function<void(VModuleKey, const object::ObjectFile &Obj,
+ const RuntimeDyld::LoadedObjectInfo &)>;
-private:
+ /// Functor for receiving deallocation notifications.
+ using NotifyFreedFtor = std::function<void(VModuleKey, const object::ObjectFile &Obj)>;
+private:
+ using OwnedObject = object::OwningBinary<object::ObjectFile>;
- template <typename MemoryManagerPtrT, typename SymbolResolverPtrT,
- typename FinalizerFtor>
+ template <typename MemoryManagerPtrT>
class ConcreteLinkedObject : public LinkedObject {
public:
- ConcreteLinkedObject(ObjectPtr Obj, MemoryManagerPtrT MemMgr,
- SymbolResolverPtrT Resolver,
- FinalizerFtor Finalizer,
+ ConcreteLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K,
+ OwnedObject Obj, MemoryManagerPtrT MemMgr,
+ std::shared_ptr<SymbolResolver> Resolver,
bool ProcessAllSections)
- : MemMgr(std::move(MemMgr)),
- PFC(llvm::make_unique<PreFinalizeContents>(std::move(Obj),
- std::move(Resolver),
- std::move(Finalizer),
- ProcessAllSections)) {
+ : K(std::move(K)),
+ Parent(Parent),
+ MemMgr(std::move(MemMgr)),
+ PFC(llvm::make_unique<PreFinalizeContents>(
+ std::move(Obj), std::move(Resolver),
+ ProcessAllSections)) {
buildInitialSymbolTable(PFC->Obj);
}
~ConcreteLinkedObject() override {
- MemMgr->deregisterEHFrames();
- }
+ if (this->Parent.NotifyFreed)
+ this->Parent.NotifyFreed(K, *ObjForNotify.getBinary());
- void setHandle(ObjHandleT H) {
- PFC->Handle = H;
+ MemMgr->deregisterEHFrames();
}
- void finalize() override {
+ Error finalize() override {
assert(PFC && "mapSectionAddress called on finalized LinkedObject");
- RuntimeDyld RTDyld(*MemMgr, *PFC->Resolver);
- RTDyld.setProcessAllSections(PFC->ProcessAllSections);
- PFC->RTDyld = &RTDyld;
+ JITSymbolResolverAdapter ResolverAdapter(Parent.ES, *PFC->Resolver,
+ nullptr);
+ PFC->RTDyld = llvm::make_unique<RuntimeDyld>(*MemMgr, ResolverAdapter);
+ PFC->RTDyld->setProcessAllSections(PFC->ProcessAllSections);
+
+ Finalized = true;
+
+ std::unique_ptr<RuntimeDyld::LoadedObjectInfo> Info =
+ PFC->RTDyld->loadObject(*PFC->Obj.getBinary());
+
+ // Copy the symbol table out of the RuntimeDyld instance.
+ {
+ auto SymTab = PFC->RTDyld->getSymbolTable();
+ for (auto &KV : SymTab)
+ SymbolTable[KV.first] = KV.second;
+ }
+
+ if (Parent.NotifyLoaded)
+ Parent.NotifyLoaded(K, *PFC->Obj.getBinary(), *Info);
+
+ PFC->RTDyld->finalizeWithMemoryManagerLocking();
+
+ if (PFC->RTDyld->hasError())
+ return make_error<StringError>(PFC->RTDyld->getErrorString(),
+ inconvertibleErrorCode());
- this->Finalized = true;
- PFC->Finalizer(PFC->Handle, RTDyld, std::move(PFC->Obj),
- [&]() {
- this->updateSymbolTable(RTDyld);
- });
+ if (Parent.NotifyFinalized)
+ Parent.NotifyFinalized(K, *PFC->Obj.getBinary(), *Info);
// Release resources.
+ if (this->Parent.NotifyFreed)
+ ObjForNotify = std::move(PFC->Obj); // needed for callback
PFC = nullptr;
+ return Error::success();
}
JITSymbol::GetAddressFtor getSymbolMaterializer(std::string Name) override {
- return
- [this, Name]() {
- // The symbol may be materialized between the creation of this lambda
- // and its execution, so we need to double check.
- if (!this->Finalized)
- this->finalize();
- return this->getSymbol(Name, false).getAddress();
- };
+ return [this, Name]() -> Expected<JITTargetAddress> {
+ // The symbol may be materialized between the creation of this lambda
+ // and its execution, so we need to double check.
+ if (!this->Finalized)
+ if (auto Err = this->finalize())
+ return std::move(Err);
+ return this->getSymbol(Name, false).getAddress();
+ };
}
void mapSectionAddress(const void *LocalAddress,
@@ -169,9 +239,8 @@ private:
}
private:
-
- void buildInitialSymbolTable(const ObjectPtr &Obj) {
- for (auto &Symbol : Obj->getBinary()->symbols()) {
+ void buildInitialSymbolTable(const OwnedObject &Obj) {
+ for (auto &Symbol : Obj.getBinary()->symbols()) {
if (Symbol.getFlags() & object::SymbolRef::SF_Undefined)
continue;
Expected<StringRef> SymbolName = Symbol.getName();
@@ -186,65 +255,64 @@ private:
}
}
- void updateSymbolTable(const RuntimeDyld &RTDyld) {
- for (auto &SymEntry : SymbolTable)
- SymEntry.second = RTDyld.getSymbol(SymEntry.first());
- }
-
// Contains the information needed prior to finalization: the object files,
// memory manager, resolver, and flags needed for RuntimeDyld.
struct PreFinalizeContents {
- PreFinalizeContents(ObjectPtr Obj, SymbolResolverPtrT Resolver,
- FinalizerFtor Finalizer, bool ProcessAllSections)
- : Obj(std::move(Obj)), Resolver(std::move(Resolver)),
- Finalizer(std::move(Finalizer)),
- ProcessAllSections(ProcessAllSections) {}
-
- ObjectPtr Obj;
- SymbolResolverPtrT Resolver;
- FinalizerFtor Finalizer;
+ PreFinalizeContents(OwnedObject Obj,
+ std::shared_ptr<SymbolResolver> Resolver,
+ bool ProcessAllSections)
+ : Obj(std::move(Obj)),
+ Resolver(std::move(Resolver)),
+ ProcessAllSections(ProcessAllSections) {}
+
+ OwnedObject Obj;
+ std::shared_ptr<SymbolResolver> Resolver;
bool ProcessAllSections;
- ObjHandleT Handle;
- RuntimeDyld *RTDyld;
+ std::unique_ptr<RuntimeDyld> RTDyld;
};
+ VModuleKey K;
+ RTDyldObjectLinkingLayer &Parent;
MemoryManagerPtrT MemMgr;
+ OwnedObject ObjForNotify;
std::unique_ptr<PreFinalizeContents> PFC;
};
- template <typename MemoryManagerPtrT, typename SymbolResolverPtrT,
- typename FinalizerFtor>
- std::unique_ptr<
- ConcreteLinkedObject<MemoryManagerPtrT, SymbolResolverPtrT, FinalizerFtor>>
- createLinkedObject(ObjectPtr Obj, MemoryManagerPtrT MemMgr,
- SymbolResolverPtrT Resolver,
- FinalizerFtor Finalizer,
+ template <typename MemoryManagerPtrT>
+ std::unique_ptr<ConcreteLinkedObject<MemoryManagerPtrT>>
+ createLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K,
+ OwnedObject Obj, MemoryManagerPtrT MemMgr,
+ std::shared_ptr<SymbolResolver> Resolver,
bool ProcessAllSections) {
- using LOS = ConcreteLinkedObject<MemoryManagerPtrT, SymbolResolverPtrT,
- FinalizerFtor>;
- return llvm::make_unique<LOS>(std::move(Obj), std::move(MemMgr),
- std::move(Resolver), std::move(Finalizer),
+ using LOS = ConcreteLinkedObject<MemoryManagerPtrT>;
+ return llvm::make_unique<LOS>(Parent, std::move(K), std::move(Obj),
+ std::move(MemMgr), std::move(Resolver),
ProcessAllSections);
}
public:
+ struct Resources {
+ std::shared_ptr<RuntimeDyld::MemoryManager> MemMgr;
+ std::shared_ptr<SymbolResolver> Resolver;
+ };
- /// @brief Functor for creating memory managers.
- using MemoryManagerGetter =
- std::function<std::shared_ptr<RuntimeDyld::MemoryManager>()>;
+ using ResourcesGetter = std::function<Resources(VModuleKey)>;
- /// @brief Construct an ObjectLinkingLayer with the given NotifyLoaded,
+ /// Construct an ObjectLinkingLayer with the given NotifyLoaded,
/// and NotifyFinalized functors.
RTDyldObjectLinkingLayer(
- MemoryManagerGetter GetMemMgr,
+ ExecutionSession &ES, ResourcesGetter GetResources,
NotifyLoadedFtor NotifyLoaded = NotifyLoadedFtor(),
- NotifyFinalizedFtor NotifyFinalized = NotifyFinalizedFtor())
- : GetMemMgr(GetMemMgr),
+ NotifyFinalizedFtor NotifyFinalized = NotifyFinalizedFtor(),
+ NotifyFreedFtor NotifyFreed = NotifyFreedFtor())
+ : ES(ES), GetResources(std::move(GetResources)),
NotifyLoaded(std::move(NotifyLoaded)),
NotifyFinalized(std::move(NotifyFinalized)),
- ProcessAllSections(false) {}
+ NotifyFreed(std::move(NotifyFreed)),
+ ProcessAllSections(false) {
+ }
- /// @brief Set the 'ProcessAllSections' flag.
+ /// Set the 'ProcessAllSections' flag.
///
/// If set to true, all sections in each object file will be allocated using
/// the memory manager, rather than just the sections required for execution.
@@ -254,44 +322,26 @@ public:
this->ProcessAllSections = ProcessAllSections;
}
- /// @brief Add an object to the JIT.
- ///
- /// @return A handle that can be used to refer to the loaded object (for
- /// symbol searching, finalization, freeing memory, etc.).
- Expected<ObjHandleT> addObject(ObjectPtr Obj,
- std::shared_ptr<JITSymbolResolver> Resolver) {
- auto Finalizer = [&](ObjHandleT H, RuntimeDyld &RTDyld,
- const ObjectPtr &ObjToLoad,
- std::function<void()> LOSHandleLoad) {
- std::unique_ptr<RuntimeDyld::LoadedObjectInfo> Info =
- RTDyld.loadObject(*ObjToLoad->getBinary());
-
- LOSHandleLoad();
-
- if (this->NotifyLoaded)
- this->NotifyLoaded(H, ObjToLoad, *Info);
+ /// Add an object to the JIT.
+ Error addObject(VModuleKey K, ObjectPtr ObjBuffer) {
- RTDyld.finalizeWithMemoryManagerLocking();
+ auto Obj =
+ object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef());
+ if (!Obj)
+ return Obj.takeError();
- if (this->NotifyFinalized)
- this->NotifyFinalized(H);
- };
+ assert(!LinkedObjects.count(K) && "VModuleKey already in use");
- auto LO =
- createLinkedObject(std::move(Obj), GetMemMgr(),
- std::move(Resolver), std::move(Finalizer),
- ProcessAllSections);
- // LOS is an owning-ptr. Keep a non-owning one so that we can set the handle
- // below.
- auto *LOPtr = LO.get();
+ auto R = GetResources(K);
- ObjHandleT Handle = LinkedObjList.insert(LinkedObjList.end(), std::move(LO));
- LOPtr->setHandle(Handle);
+ LinkedObjects[K] = createLinkedObject(
+ *this, K, OwnedObject(std::move(*Obj), std::move(ObjBuffer)),
+ std::move(R.MemMgr), std::move(R.Resolver), ProcessAllSections);
- return Handle;
+ return Error::success();
}
- /// @brief Remove the object associated with handle H.
+ /// Remove the object associated with VModuleKey K.
///
/// All memory allocated for the object will be freed, and the sections and
/// symbols it provided will no longer be available. No attempt is made to
@@ -299,57 +349,64 @@ public:
/// indirectly) will result in undefined behavior. If dependence tracking is
/// required to detect or resolve such issues it should be added at a higher
/// layer.
- Error removeObject(ObjHandleT H) {
+ Error removeObject(VModuleKey K) {
+ assert(LinkedObjects.count(K) && "VModuleKey not associated with object");
// How do we invalidate the symbols in H?
- LinkedObjList.erase(H);
+ LinkedObjects.erase(K);
return Error::success();
}
- /// @brief Search for the given named symbol.
+ /// Search for the given named symbol.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it exists.
JITSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) {
- for (auto I = LinkedObjList.begin(), E = LinkedObjList.end(); I != E;
- ++I)
- if (auto Symbol = findSymbolIn(I, Name, ExportedSymbolsOnly))
- return Symbol;
+ for (auto &KV : LinkedObjects)
+ if (auto Sym = KV.second->getSymbol(Name, ExportedSymbolsOnly))
+ return Sym;
+ else if (auto Err = Sym.takeError())
+ return std::move(Err);
return nullptr;
}
- /// @brief Search for the given named symbol in the context of the loaded
- /// object represented by the handle H.
- /// @param H The handle for the object to search in.
+ /// Search for the given named symbol in the context of the loaded
+ /// object represented by the VModuleKey K.
+ /// @param K The VModuleKey for the object to search in.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it is found in the
/// given object.
- JITSymbol findSymbolIn(ObjHandleT H, StringRef Name,
+ JITSymbol findSymbolIn(VModuleKey K, StringRef Name,
bool ExportedSymbolsOnly) {
- return (*H)->getSymbol(Name, ExportedSymbolsOnly);
+ assert(LinkedObjects.count(K) && "VModuleKey not associated with object");
+ return LinkedObjects[K]->getSymbol(Name, ExportedSymbolsOnly);
}
- /// @brief Map section addresses for the object associated with the handle H.
- void mapSectionAddress(ObjHandleT H, const void *LocalAddress,
+ /// Map section addresses for the object associated with the
+ /// VModuleKey K.
+ void mapSectionAddress(VModuleKey K, const void *LocalAddress,
JITTargetAddress TargetAddr) {
- (*H)->mapSectionAddress(LocalAddress, TargetAddr);
+ assert(LinkedObjects.count(K) && "VModuleKey not associated with object");
+ LinkedObjects[K]->mapSectionAddress(LocalAddress, TargetAddr);
}
- /// @brief Immediately emit and finalize the object represented by the given
- /// handle.
- /// @param H Handle for object to emit/finalize.
- Error emitAndFinalize(ObjHandleT H) {
- (*H)->finalize();
- return Error::success();
+ /// Immediately emit and finalize the object represented by the given
+ /// VModuleKey.
+ /// @param K VModuleKey for object to emit/finalize.
+ Error emitAndFinalize(VModuleKey K) {
+ assert(LinkedObjects.count(K) && "VModuleKey not associated with object");
+ return LinkedObjects[K]->finalize();
}
private:
+ ExecutionSession &ES;
- LinkedObjectListT LinkedObjList;
- MemoryManagerGetter GetMemMgr;
+ std::map<VModuleKey, std::unique_ptr<LinkedObject>> LinkedObjects;
+ ResourcesGetter GetResources;
NotifyLoadedFtor NotifyLoaded;
NotifyFinalizedFtor NotifyFinalized;
+ NotifyFreedFtor NotifyFreed;
bool ProcessAllSections = false;
};
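
A hedged construction sketch for the reworked legacy RTDyldObjectLinkingLayer above: resources (memory manager and resolver) are now requested per VModuleKey. The per-key resolver map, the SectionMemoryManager choice, and allocateVModule as the key source are assumptions for illustration.

#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/Support/MemoryBuffer.h"
#include <map>
#include <memory>

using namespace llvm;
using namespace llvm::orc;

void addOneObject(
    ExecutionSession &ES,
    std::map<VModuleKey, std::shared_ptr<SymbolResolver>> &Resolvers,
    std::unique_ptr<MemoryBuffer> ObjBuffer) {
  // The ResourcesGetter is invoked once per added module key.
  RTDyldObjectLinkingLayer ObjLayer(
      ES, [&](VModuleKey K) -> RTDyldObjectLinkingLayer::Resources {
        return {std::make_shared<SectionMemoryManager>(), Resolvers[K]};
      });

  VModuleKey K = ES.allocateVModule();
  if (auto Err = ObjLayer.addObject(K, std::move(ObjBuffer)))
    ES.reportError(std::move(Err));
}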
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h
index 17255954a99f..955e77607a18 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h
@@ -306,8 +306,7 @@ public:
using ObjHandleT = RemoteObjectLayerAPI::ObjHandleT;
using RemoteSymbol = RemoteObjectLayerAPI::RemoteSymbol;
- using ObjectPtr =
- std::shared_ptr<object::OwningBinary<object::ObjectFile>>;
+ using ObjectPtr = std::unique_ptr<MemoryBuffer>;
/// Create a RemoteObjectClientLayer that communicates with a
/// RemoteObjectServerLayer instance via the given RPCEndpoint.
@@ -323,15 +322,15 @@ public:
*this, &ThisT::lookupInLogicalDylib);
}
- /// @brief Add an object to the JIT.
+ /// Add an object to the JIT.
///
/// @return A handle that can be used to refer to the loaded object (for
/// symbol searching, finalization, freeing memory, etc.).
Expected<ObjHandleT>
- addObject(ObjectPtr Object, std::shared_ptr<JITSymbolResolver> Resolver) {
- StringRef ObjBuffer = Object->getBinary()->getData();
+ addObject(ObjectPtr ObjBuffer,
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver) {
if (auto HandleOrErr =
- this->Remote.template callB<AddObject>(ObjBuffer)) {
+ this->Remote.template callB<AddObject>(ObjBuffer->getBuffer())) {
auto &Handle = *HandleOrErr;
// FIXME: Return an error for this:
assert(!Resolvers.count(Handle) && "Handle already in use?");
@@ -341,26 +340,26 @@ public:
return HandleOrErr.takeError();
}
- /// @brief Remove the given object from the JIT.
+ /// Remove the given object from the JIT.
Error removeObject(ObjHandleT H) {
return this->Remote.template callB<RemoveObject>(H);
}
- /// @brief Search for the given named symbol.
+ /// Search for the given named symbol.
JITSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) {
return remoteToJITSymbol(
this->Remote.template callB<FindSymbol>(Name,
ExportedSymbolsOnly));
}
- /// @brief Search for the given named symbol within the given context.
+ /// Search for the given named symbol within the given context.
JITSymbol findSymbolIn(ObjHandleT H, StringRef Name, bool ExportedSymbolsOnly) {
return remoteToJITSymbol(
this->Remote.template callB<FindSymbolIn>(H, Name,
ExportedSymbolsOnly));
}
- /// @brief Immediately emit and finalize the object with the given handle.
+ /// Immediately emit and finalize the object with the given handle.
Error emitAndFinalize(ObjHandleT H) {
return this->Remote.template callB<EmitAndFinalize>(H);
}
@@ -386,7 +385,8 @@ private:
}
std::map<remote::ResourceIdMgr::ResourceId,
- std::shared_ptr<JITSymbolResolver>> Resolvers;
+ std::shared_ptr<LegacyJITSymbolResolver>>
+ Resolvers;
};
/// RemoteObjectServerLayer acts as a server and handling RPC calls for the
@@ -459,30 +459,21 @@ private:
Expected<ObjHandleT> addObject(std::string ObjBuffer) {
auto Buffer = llvm::make_unique<StringMemoryBuffer>(std::move(ObjBuffer));
- if (auto ObjectOrErr =
- object::ObjectFile::createObjectFile(Buffer->getMemBufferRef())) {
- auto Object =
- std::make_shared<object::OwningBinary<object::ObjectFile>>(
- std::move(*ObjectOrErr), std::move(Buffer));
-
- auto Id = HandleIdMgr.getNext();
- assert(!BaseLayerHandles.count(Id) && "Id already in use?");
-
- auto Resolver =
- createLambdaResolver(
- [this, Id](const std::string &Name) { return lookup(Id, Name); },
- [this, Id](const std::string &Name) {
- return lookupInLogicalDylib(Id, Name);
- });
-
- if (auto HandleOrErr =
- BaseLayer.addObject(std::move(Object), std::move(Resolver))) {
- BaseLayerHandles[Id] = std::move(*HandleOrErr);
- return Id;
- } else
- return teeLog(HandleOrErr.takeError());
+ auto Id = HandleIdMgr.getNext();
+ assert(!BaseLayerHandles.count(Id) && "Id already in use?");
+
+ auto Resolver = createLambdaResolver(
+ [this, Id](const std::string &Name) { return lookup(Id, Name); },
+ [this, Id](const std::string &Name) {
+ return lookupInLogicalDylib(Id, Name);
+ });
+
+ if (auto HandleOrErr =
+ BaseLayer.addObject(std::move(Buffer), std::move(Resolver))) {
+ BaseLayerHandles[Id] = std::move(*HandleOrErr);
+ return Id;
} else
- return teeLog(ObjectOrErr.takeError());
+ return teeLog(HandleOrErr.takeError());
}
Error removeObject(ObjHandleT H) {
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h b/contrib/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
index b01fbd44bacd..4c45cfd199dd 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
@@ -23,29 +23,36 @@ namespace orc {
class SymbolStringPtr;
-/// @brief String pool for symbol names used by the JIT.
+/// String pool for symbol names used by the JIT.
class SymbolStringPool {
friend class SymbolStringPtr;
public:
- /// @brief Create a symbol string pointer from the given string.
+ /// Destroy a SymbolStringPool.
+ ~SymbolStringPool();
+
+ /// Create a symbol string pointer from the given string.
SymbolStringPtr intern(StringRef S);
- /// @brief Remove from the pool any entries that are no longer referenced.
+ /// Remove from the pool any entries that are no longer referenced.
void clearDeadEntries();
- /// @brief Returns true if the pool is empty.
+ /// Returns true if the pool is empty.
bool empty() const;
private:
- using RefCountType = std::atomic<uint64_t>;
+ using RefCountType = std::atomic<size_t>;
using PoolMap = StringMap<RefCountType>;
using PoolMapEntry = StringMapEntry<RefCountType>;
mutable std::mutex PoolMutex;
PoolMap Pool;
};
-/// @brief Pointer to a pooled string representing a symbol name.
+/// Pointer to a pooled string representing a symbol name.
class SymbolStringPtr {
friend class SymbolStringPool;
+ friend bool operator==(const SymbolStringPtr &LHS,
+ const SymbolStringPtr &RHS);
+ friend bool operator<(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS);
+
public:
SymbolStringPtr() = default;
SymbolStringPtr(const SymbolStringPtr &Other)
@@ -80,17 +87,7 @@ public:
--S->getValue();
}
- bool operator==(const SymbolStringPtr &Other) const {
- return S == Other.S;
- }
-
- bool operator!=(const SymbolStringPtr &Other) const {
- return !(*this == Other);
- }
-
- bool operator<(const SymbolStringPtr &Other) const {
- return S->getValue() < Other.S->getValue();
- }
+ StringRef operator*() const { return S->first(); }
private:
@@ -103,25 +100,39 @@ private:
SymbolStringPool::PoolMapEntry *S = nullptr;
};
+inline bool operator==(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS) {
+ return LHS.S == RHS.S;
+}
+
+inline bool operator!=(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS) {
+ return !(LHS == RHS);
+}
+
+inline bool operator<(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS) {
+ return LHS.S < RHS.S;
+}
+
+inline SymbolStringPool::~SymbolStringPool() {
+#ifndef NDEBUG
+ clearDeadEntries();
+ assert(Pool.empty() && "Dangling references at pool destruction time");
+#endif // NDEBUG
+}
+
inline SymbolStringPtr SymbolStringPool::intern(StringRef S) {
std::lock_guard<std::mutex> Lock(PoolMutex);
- auto I = Pool.find(S);
- if (I != Pool.end())
- return SymbolStringPtr(&*I);
-
+ PoolMap::iterator I;
bool Added;
std::tie(I, Added) = Pool.try_emplace(S, 0);
- assert(Added && "Insert should always succeed here");
return SymbolStringPtr(&*I);
}
inline void SymbolStringPool::clearDeadEntries() {
std::lock_guard<std::mutex> Lock(PoolMutex);
for (auto I = Pool.begin(), E = Pool.end(); I != E;) {
- auto Tmp = std::next(I);
- if (I->second == 0)
- Pool.erase(I);
- I = Tmp;
+ auto Tmp = I++;
+ if (Tmp->second == 0)
+ Pool.erase(Tmp);
}
}
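
For illustration, a minimal usage sketch of the pool API shown above. It assumes only the declarations in this header plus <cassert>; the function name is hypothetical, not part of the change:

#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
#include <cassert>

using namespace llvm::orc;

void symbolPoolDemo() {
  SymbolStringPool Pool;
  {
    SymbolStringPtr A = Pool.intern("main");
    SymbolStringPtr B = Pool.intern("main"); // same pool entry, refcount bumped
    assert(A == B && "interned copies compare equal via the new free operator==");
    llvm::StringRef Name = *A;               // operator* exposes the pooled string
    (void)Name;
  }                                          // A and B released; entry refcount hits zero
  Pool.clearDeadEntries();                   // drops the now-unreferenced entry
  assert(Pool.empty());
}                                            // ~SymbolStringPool asserts (debug builds) that nothing dangles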
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/contrib/llvm/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
index 0c1862c5c3ea..23d651f6d1b6 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
@@ -47,6 +47,9 @@ public:
/// newly loaded object.
virtual void notifyObjectLoaded(ExecutionEngine *EE,
const object::ObjectFile &) {}
+
+private:
+ void anchor() override;
};
// RuntimeDyld clients often want to handle the memory management of
@@ -56,7 +59,7 @@ public:
// FIXME: As the RuntimeDyld fills out, additional routines will be needed
// for the varying types of objects to be allocated.
class RTDyldMemoryManager : public MCJITMemoryManager,
- public JITSymbolResolver {
+ public LegacyJITSymbolResolver {
public:
RTDyldMemoryManager() = default;
RTDyldMemoryManager(const RTDyldMemoryManager&) = delete;
@@ -142,6 +145,9 @@ protected:
};
typedef std::vector<EHFrame> EHFrameInfos;
EHFrameInfos EHFrames;
+
+private:
+ void anchor() override;
};
// Create wrappers for C Binding types (see CBindingWrapping.h).
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h b/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h
index 56aa04ce694a..5dd5add1bb39 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -65,7 +65,7 @@ protected:
void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
public:
- /// \brief Information about the loaded object.
+ /// Information about the loaded object.
class LoadedObjectInfo : public llvm::LoadedObjectInfo {
friend class RuntimeDyldImpl;
@@ -88,7 +88,7 @@ public:
ObjSectionToIDMap ObjSecToIDMap;
};
- /// \brief Memory Management.
+ /// Memory Management.
class MemoryManager {
friend class RuntimeDyld;
@@ -170,7 +170,7 @@ public:
bool FinalizationLocked = false;
};
- /// \brief Construct a RuntimeDyld instance.
+ /// Construct a RuntimeDyld instance.
RuntimeDyld(MemoryManager &MemMgr, JITSymbolResolver &Resolver);
RuntimeDyld(const RuntimeDyld &) = delete;
RuntimeDyld &operator=(const RuntimeDyld &) = delete;
@@ -189,6 +189,13 @@ public:
/// This address is the one used for relocation.
JITEvaluatedSymbol getSymbol(StringRef Name) const;
+ /// Returns a copy of the symbol table. This can be used by on-finalized
+ /// callbacks to extract the symbol table before throwing away the
+ /// RuntimeDyld instance. Because the map keys (StringRefs) are backed by
+ /// strings inside the RuntimeDyld instance, the map should be processed
+ /// before the RuntimeDyld instance is discarded.
+ std::map<StringRef, JITEvaluatedSymbol> getSymbolTable() const;
+
/// Resolve the relocations for all symbols we currently know about.
void resolveRelocations();
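
A short sketch of the pattern the new getSymbolTable() comment describes: deep-copying the table into owned strings before the RuntimeDyld instance goes away. The helper name is illustrative; only getSymbolTable() comes from this header:

#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include <map>
#include <string>

std::map<std::string, llvm::JITEvaluatedSymbol>
copySymbolTable(llvm::RuntimeDyld &Dyld) {
  std::map<std::string, llvm::JITEvaluatedSymbol> Owned;
  for (const auto &KV : Dyld.getSymbolTable())
    Owned.emplace(KV.first.str(), KV.second); // StringRef keys point into Dyld, so copy them
  return Owned;                               // safe to use after Dyld is discarded
}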
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h b/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
index de89f405af4c..13fc5fd5a3e7 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
@@ -27,7 +27,7 @@ class RuntimeDyld;
class RuntimeDyldCheckerImpl;
class raw_ostream;
-/// \brief RuntimeDyld invariant checker for verifying that RuntimeDyld has
+/// RuntimeDyld invariant checker for verifying that RuntimeDyld has
/// correctly applied relocations.
///
/// The RuntimeDyldChecker class evaluates expressions against an attached
@@ -74,22 +74,22 @@ public:
MCInstPrinter *InstPrinter, raw_ostream &ErrStream);
~RuntimeDyldChecker();
- // \brief Get the associated RTDyld instance.
+ // Get the associated RTDyld instance.
RuntimeDyld& getRTDyld();
- // \brief Get the associated RTDyld instance.
+ // Get the associated RTDyld instance.
const RuntimeDyld& getRTDyld() const;
- /// \brief Check a single expression against the attached RuntimeDyld
+ /// Check a single expression against the attached RuntimeDyld
/// instance.
bool check(StringRef CheckExpr) const;
- /// \brief Scan the given memory buffer for lines beginning with the string
+ /// Scan the given memory buffer for lines beginning with the string
/// in RulePrefix. The remainder of the line is passed to the check
/// method to be evaluated as an expression.
bool checkAllRulesInBuffer(StringRef RulePrefix, MemoryBuffer *MemBuf) const;
- /// \brief Returns the address of the requested section (or an error message
+ /// Returns the address of the requested section (or an error message
/// in the second element of the pair if the address cannot be found).
///
/// if 'LocalAddress' is true, this returns the address of the section
@@ -99,7 +99,7 @@ public:
StringRef SectionName,
bool LocalAddress);
- /// \brief If there is a section at the given local address, return its load
+ /// If there is a section at the given local address, return its load
/// address, otherwise return none.
Optional<uint64_t> getSectionLoadAddress(void *LocalAddress) const;
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/SectionMemoryManager.h b/contrib/llvm/include/llvm/ExecutionEngine/SectionMemoryManager.h
index d76e37113c66..3cf131c27778 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/SectionMemoryManager.h
+++ b/contrib/llvm/include/llvm/ExecutionEngine/SectionMemoryManager.h
@@ -111,7 +111,7 @@ public:
void operator=(const SectionMemoryManager &) = delete;
~SectionMemoryManager() override;
- /// \brief Allocates a memory block of (at least) the given size suitable for
+ /// Allocates a memory block of (at least) the given size suitable for
/// executable code.
///
/// The value of \p Alignment must be a power of two. If \p Alignment is zero
@@ -120,7 +120,7 @@ public:
unsigned SectionID,
StringRef SectionName) override;
- /// \brief Allocates a memory block of (at least) the given size suitable for
+ /// Allocates a memory block of (at least) the given size suitable for
/// executable code.
///
/// The value of \p Alignment must be a power of two. If \p Alignment is zero
@@ -129,7 +129,7 @@ public:
unsigned SectionID, StringRef SectionName,
bool isReadOnly) override;
- /// \brief Update section-specific memory permissions and other attributes.
+ /// Update section-specific memory permissions and other attributes.
///
/// This method is called when object loading is complete and section page
/// permissions can be applied. It is up to the memory manager implementation
@@ -142,7 +142,7 @@ public:
/// \returns true if an error occurred, false otherwise.
bool finalizeMemory(std::string *ErrMsg = nullptr) override;
- /// \brief Invalidate instruction cache for code sections.
+ /// Invalidate instruction cache for code sections.
///
/// Some platforms with separate data cache and instruction cache require
/// explicit cache flush, otherwise JIT code manipulations (like resolved
@@ -182,6 +182,8 @@ private:
std::error_code applyMemoryGroupPermissions(MemoryGroup &MemGroup,
unsigned Permissions);
+ void anchor() override;
+
MemoryGroup CodeMem;
MemoryGroup RWDataMem;
MemoryGroup RODataMem;
diff --git a/contrib/llvm/include/llvm/FuzzMutate/FuzzerCLI.h b/contrib/llvm/include/llvm/FuzzMutate/FuzzerCLI.h
index a775fdfb603e..3333e96db166 100644
--- a/contrib/llvm/include/llvm/FuzzMutate/FuzzerCLI.h
+++ b/contrib/llvm/include/llvm/FuzzMutate/FuzzerCLI.h
@@ -68,6 +68,12 @@ std::unique_ptr<Module> parseModule(const uint8_t *Data, size_t Size,
/// returns 0 and leaves Dest unchanged.
size_t writeModule(const Module &M, uint8_t *Dest, size_t MaxSize);
+/// Try to parse module and verify it. May output verification errors to the
+/// errs().
+/// \return New module or nullptr in case of error.
+std::unique_ptr<Module> parseAndVerify(const uint8_t *Data, size_t Size,
+ LLVMContext &Context);
+
} // end llvm namespace
#endif // LLVM_FUZZMUTATE_FUZZER_CLI_H
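
As a sketch, the new parseAndVerify helper would typically sit at the top of a libFuzzer entry point. LLVMFuzzerTestOneInput is the standard libFuzzer signature; everything beyond the declaration above is assumed for illustration:

#include "llvm/FuzzMutate/FuzzerCLI.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <cstdint>
#include <memory>

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
  llvm::LLVMContext Context;
  // Returns nullptr on parse/verifier failure; diagnostics go to errs() per the comment above.
  std::unique_ptr<llvm::Module> M = llvm::parseAndVerify(Data, Size, Context);
  if (!M)
    return 0;
  // ... exercise the mutator or pass under test on *M ...
  return 0;
}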
diff --git a/contrib/llvm/include/llvm/FuzzMutate/OpDescriptor.h b/contrib/llvm/include/llvm/FuzzMutate/OpDescriptor.h
index e9f8bf09a79b..dd30fda99bea 100644
--- a/contrib/llvm/include/llvm/FuzzMutate/OpDescriptor.h
+++ b/contrib/llvm/include/llvm/FuzzMutate/OpDescriptor.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include <functional>
@@ -128,7 +129,7 @@ static inline SourcePred anyFloatType() {
static inline SourcePred anyPtrType() {
auto Pred = [](ArrayRef<Value *>, const Value *V) {
- return V->getType()->isPointerTy();
+ return V->getType()->isPointerTy() && !V->isSwiftError();
};
auto Make = [](ArrayRef<Value *>, ArrayRef<Type *> Ts) {
std::vector<Constant *> Result;
@@ -142,6 +143,9 @@ static inline SourcePred anyPtrType() {
static inline SourcePred sizedPtrType() {
auto Pred = [](ArrayRef<Value *>, const Value *V) {
+ if (V->isSwiftError())
+ return false;
+
if (const auto *PtrT = dyn_cast<PointerType>(V->getType()))
return PtrT->getElementType()->isSized();
return false;
diff --git a/contrib/llvm/include/llvm/IR/Attributes.h b/contrib/llvm/include/llvm/IR/Attributes.h
index a05a01073049..5aaaaf3c396b 100644
--- a/contrib/llvm/include/llvm/IR/Attributes.h
+++ b/contrib/llvm/include/llvm/IR/Attributes.h
@@ -6,11 +6,11 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-///
+//
/// \file
-/// \brief This file contains the simple types necessary to represent the
+/// This file contains the simple types necessary to represent the
/// attributes associated with functions and their calls.
-///
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_IR_ATTRIBUTES_H
@@ -22,6 +22,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/PointerLikeTypeTraits.h"
#include <bitset>
#include <cassert>
@@ -35,7 +36,6 @@ namespace llvm {
class AttrBuilder;
class AttributeImpl;
class AttributeListImpl;
-class AttributeList;
class AttributeSetNode;
template<typename T> struct DenseMapInfo;
class Function;
@@ -44,7 +44,7 @@ class Type;
//===----------------------------------------------------------------------===//
/// \class
-/// \brief Functions, function parameters, and return types can have attributes
+/// Functions, function parameters, and return types can have attributes
/// to indicate how they should be treated by optimizations and code
/// generation. This class represents one of those attributes. It's light-weight
/// and should be passed around by-value.
@@ -71,7 +71,7 @@ public:
// IR-Level Attributes
None, ///< No attributes have been set
#define GET_ATTR_ENUM
- #include "llvm/IR/Attributes.gen"
+ #include "llvm/IR/Attributes.inc"
EndAttrKinds ///< Sentinal value useful for loops
};
@@ -87,12 +87,12 @@ public:
// Attribute Construction
//===--------------------------------------------------------------------===//
- /// \brief Return a uniquified Attribute object.
+ /// Return a uniquified Attribute object.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val = 0);
static Attribute get(LLVMContext &Context, StringRef Kind,
StringRef Val = StringRef());
- /// \brief Return a uniquified Attribute object that has the specific
+ /// Return a uniquified Attribute object that has the specific
/// alignment set.
static Attribute getWithAlignment(LLVMContext &Context, uint64_t Align);
static Attribute getWithStackAlignment(LLVMContext &Context, uint64_t Align);
@@ -108,51 +108,51 @@ public:
// Attribute Accessors
//===--------------------------------------------------------------------===//
- /// \brief Return true if the attribute is an Attribute::AttrKind type.
+ /// Return true if the attribute is an Attribute::AttrKind type.
bool isEnumAttribute() const;
- /// \brief Return true if the attribute is an integer attribute.
+ /// Return true if the attribute is an integer attribute.
bool isIntAttribute() const;
- /// \brief Return true if the attribute is a string (target-dependent)
+ /// Return true if the attribute is a string (target-dependent)
/// attribute.
bool isStringAttribute() const;
- /// \brief Return true if the attribute is present.
+ /// Return true if the attribute is present.
bool hasAttribute(AttrKind Val) const;
- /// \brief Return true if the target-dependent attribute is present.
+ /// Return true if the target-dependent attribute is present.
bool hasAttribute(StringRef Val) const;
- /// \brief Return the attribute's kind as an enum (Attribute::AttrKind). This
+ /// Return the attribute's kind as an enum (Attribute::AttrKind). This
/// requires the attribute to be an enum or integer attribute.
Attribute::AttrKind getKindAsEnum() const;
- /// \brief Return the attribute's value as an integer. This requires that the
+ /// Return the attribute's value as an integer. This requires that the
/// attribute be an integer attribute.
uint64_t getValueAsInt() const;
- /// \brief Return the attribute's kind as a string. This requires the
+ /// Return the attribute's kind as a string. This requires the
/// attribute to be a string attribute.
StringRef getKindAsString() const;
- /// \brief Return the attribute's value as a string. This requires the
+ /// Return the attribute's value as a string. This requires the
/// attribute to be a string attribute.
StringRef getValueAsString() const;
- /// \brief Returns the alignment field of an attribute as a byte alignment
+ /// Returns the alignment field of an attribute as a byte alignment
/// value.
unsigned getAlignment() const;
- /// \brief Returns the stack alignment field of an attribute as a byte
+ /// Returns the stack alignment field of an attribute as a byte
/// alignment value.
unsigned getStackAlignment() const;
- /// \brief Returns the number of dereferenceable bytes from the
+ /// Returns the number of dereferenceable bytes from the
/// dereferenceable attribute.
uint64_t getDereferenceableBytes() const;
- /// \brief Returns the number of dereferenceable_or_null bytes from the
+ /// Returns the number of dereferenceable_or_null bytes from the
/// dereferenceable_or_null attribute.
uint64_t getDereferenceableOrNullBytes() const;
@@ -160,27 +160,27 @@ public:
/// if not known).
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
- /// \brief The Attribute is converted to a string of equivalent mnemonic. This
+ /// The Attribute is converted to a string of equivalent mnemonic. This
/// is, presumably, for writing out the mnemonics for the assembly writer.
std::string getAsString(bool InAttrGrp = false) const;
- /// \brief Equality and non-equality operators.
+ /// Equality and non-equality operators.
bool operator==(Attribute A) const { return pImpl == A.pImpl; }
bool operator!=(Attribute A) const { return pImpl != A.pImpl; }
- /// \brief Less-than operator. Useful for sorting the attributes list.
+ /// Less-than operator. Useful for sorting the attributes list.
bool operator<(Attribute A) const;
void Profile(FoldingSetNodeID &ID) const {
ID.AddPointer(pImpl);
}
- /// \brief Return a raw pointer that uniquely identifies this attribute.
+ /// Return a raw pointer that uniquely identifies this attribute.
void *getRawPointer() const {
return pImpl;
}
- /// \brief Get an attribute from a raw pointer created by getRawPointer.
+ /// Get an attribute from a raw pointer created by getRawPointer.
static Attribute fromRawPointer(void *RawPtr) {
return Attribute(reinterpret_cast<AttributeImpl*>(RawPtr));
}
@@ -203,6 +203,9 @@ inline Attribute unwrap(LLVMAttributeRef Attr) {
/// copy. Adding and removing enum attributes is intended to be fast, but adding
/// and removing string or integer attributes involves a FoldingSet lookup.
class AttributeSet {
+ friend AttributeListImpl;
+ template <typename Ty> friend struct DenseMapInfo;
+
// TODO: Extract AvailableAttrs from AttributeSetNode and store them here.
// This will allow an efficient implementation of addAttribute and
// removeAttribute for enum attrs.
@@ -210,9 +213,6 @@ class AttributeSet {
/// Private implementation pointer.
AttributeSetNode *SetNode = nullptr;
- friend AttributeListImpl;
- template <typename Ty> friend struct DenseMapInfo;
-
private:
explicit AttributeSet(AttributeSetNode *ASN) : SetNode(ASN) {}
@@ -290,16 +290,16 @@ public:
//===----------------------------------------------------------------------===//
/// \class
-/// \brief Provide DenseMapInfo for AttributeSet.
+/// Provide DenseMapInfo for AttributeSet.
template <> struct DenseMapInfo<AttributeSet> {
- static inline AttributeSet getEmptyKey() {
- uintptr_t Val = static_cast<uintptr_t>(-1);
+ static AttributeSet getEmptyKey() {
+ auto Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<void *>::NumLowBitsAvailable;
return AttributeSet(reinterpret_cast<AttributeSetNode *>(Val));
}
- static inline AttributeSet getTombstoneKey() {
- uintptr_t Val = static_cast<uintptr_t>(-2);
+ static AttributeSet getTombstoneKey() {
+ auto Val = static_cast<uintptr_t>(-2);
Val <<= PointerLikeTypeTraits<void *>::NumLowBitsAvailable;
return AttributeSet(reinterpret_cast<AttributeSetNode *>(Val));
}
@@ -314,7 +314,7 @@ template <> struct DenseMapInfo<AttributeSet> {
//===----------------------------------------------------------------------===//
/// \class
-/// \brief This class holds the attributes for a function, its return value, and
+/// This class holds the attributes for a function, its return value, and
/// its parameters. You access the attributes for each of them via an index into
/// the AttributeList object. The function attributes are at index
/// `AttributeList::FunctionIndex', the return value is at index
@@ -333,21 +333,20 @@ private:
friend class AttributeListImpl;
friend class AttributeSet;
friend class AttributeSetNode;
-
template <typename Ty> friend struct DenseMapInfo;
- /// \brief The attributes that we are managing. This can be null to represent
+ /// The attributes that we are managing. This can be null to represent
/// the empty attributes list.
AttributeListImpl *pImpl = nullptr;
public:
- /// \brief Create an AttributeList with the specified parameters in it.
+ /// Create an AttributeList with the specified parameters in it.
static AttributeList get(LLVMContext &C,
ArrayRef<std::pair<unsigned, Attribute>> Attrs);
static AttributeList get(LLVMContext &C,
ArrayRef<std::pair<unsigned, AttributeSet>> Attrs);
- /// \brief Create an AttributeList from attribute sets for a function, its
+ /// Create an AttributeList from attribute sets for a function, its
/// return value, and all of its arguments.
static AttributeList get(LLVMContext &C, AttributeSet FnAttrs,
AttributeSet RetAttrs,
@@ -365,7 +364,7 @@ public:
// AttributeList Construction and Mutation
//===--------------------------------------------------------------------===//
- /// \brief Return an AttributeList with the specified parameters in it.
+ /// Return an AttributeList with the specified parameters in it.
static AttributeList get(LLVMContext &C, ArrayRef<AttributeList> Attrs);
static AttributeList get(LLVMContext &C, unsigned Index,
ArrayRef<Attribute::AttrKind> Kinds);
@@ -374,12 +373,12 @@ public:
static AttributeList get(LLVMContext &C, unsigned Index,
const AttrBuilder &B);
- /// \brief Add an attribute to the attribute set at the given index.
+ /// Add an attribute to the attribute set at the given index.
/// Returns a new list because attribute lists are immutable.
AttributeList addAttribute(LLVMContext &C, unsigned Index,
Attribute::AttrKind Kind) const;
- /// \brief Add an attribute to the attribute set at the given index.
+ /// Add an attribute to the attribute set at the given index.
/// Returns a new list because attribute lists are immutable.
AttributeList addAttribute(LLVMContext &C, unsigned Index, StringRef Kind,
StringRef Value = StringRef()) const;
@@ -388,7 +387,7 @@ public:
/// Returns a new list because attribute lists are immutable.
AttributeList addAttribute(LLVMContext &C, unsigned Index, Attribute A) const;
- /// \brief Add attributes to the attribute set at the given index.
+ /// Add attributes to the attribute set at the given index.
/// Returns a new list because attribute lists are immutable.
AttributeList addAttributes(LLVMContext &C, unsigned Index,
const AttrBuilder &B) const;
@@ -420,70 +419,70 @@ public:
return addAttributes(C, ArgNo + FirstArgIndex, B);
}
- /// \brief Remove the specified attribute at the specified index from this
+ /// Remove the specified attribute at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeAttribute(LLVMContext &C, unsigned Index,
Attribute::AttrKind Kind) const;
- /// \brief Remove the specified attribute at the specified index from this
+ /// Remove the specified attribute at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeAttribute(LLVMContext &C, unsigned Index,
StringRef Kind) const;
- /// \brief Remove the specified attributes at the specified index from this
+ /// Remove the specified attributes at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeAttributes(LLVMContext &C, unsigned Index,
const AttrBuilder &AttrsToRemove) const;
- /// \brief Remove all attributes at the specified index from this
+ /// Remove all attributes at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeAttributes(LLVMContext &C, unsigned Index) const;
- /// \brief Remove the specified attribute at the specified arg index from this
+ /// Remove the specified attribute at the specified arg index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo,
Attribute::AttrKind Kind) const {
return removeAttribute(C, ArgNo + FirstArgIndex, Kind);
}
- /// \brief Remove the specified attribute at the specified arg index from this
+ /// Remove the specified attribute at the specified arg index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo,
StringRef Kind) const {
return removeAttribute(C, ArgNo + FirstArgIndex, Kind);
}
- /// \brief Remove the specified attribute at the specified arg index from this
+ /// Remove the specified attribute at the specified arg index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo,
const AttrBuilder &AttrsToRemove) const {
return removeAttributes(C, ArgNo + FirstArgIndex, AttrsToRemove);
}
- /// \brief Remove all attributes at the specified arg index from this
+ /// Remove all attributes at the specified arg index from this
/// attribute list. Returns a new list because attribute lists are immutable.
AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo) const {
return removeAttributes(C, ArgNo + FirstArgIndex);
}
- /// \Brief Add the dereferenceable attribute to the attribute set at the given
+ /// \brief Add the dereferenceable attribute to the attribute set at the given
/// index. Returns a new list because attribute lists are immutable.
AttributeList addDereferenceableAttr(LLVMContext &C, unsigned Index,
uint64_t Bytes) const;
- /// \Brief Add the dereferenceable attribute to the attribute set at the given
+ /// \brief Add the dereferenceable attribute to the attribute set at the given
/// arg index. Returns a new list because attribute lists are immutable.
AttributeList addDereferenceableParamAttr(LLVMContext &C, unsigned ArgNo,
uint64_t Bytes) const {
return addDereferenceableAttr(C, ArgNo + FirstArgIndex, Bytes);
}
- /// \brief Add the dereferenceable_or_null attribute to the attribute set at
+ /// Add the dereferenceable_or_null attribute to the attribute set at
/// the given index. Returns a new list because attribute lists are immutable.
AttributeList addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
uint64_t Bytes) const;
- /// \brief Add the dereferenceable_or_null attribute to the attribute set at
+ /// Add the dereferenceable_or_null attribute to the attribute set at
/// the given arg index. Returns a new list because attribute lists are
/// immutable.
AttributeList addDereferenceableOrNullParamAttr(LLVMContext &C,
@@ -510,102 +509,102 @@ public:
// AttributeList Accessors
//===--------------------------------------------------------------------===//
- /// \brief Retrieve the LLVM context.
+ /// Retrieve the LLVM context.
LLVMContext &getContext() const;
- /// \brief The attributes for the specified index are returned.
+ /// The attributes for the specified index are returned.
AttributeSet getAttributes(unsigned Index) const;
- /// \brief The attributes for the argument or parameter at the given index are
+ /// The attributes for the argument or parameter at the given index are
/// returned.
AttributeSet getParamAttributes(unsigned ArgNo) const;
- /// \brief The attributes for the ret value are returned.
+ /// The attributes for the ret value are returned.
AttributeSet getRetAttributes() const;
- /// \brief The function attributes are returned.
+ /// The function attributes are returned.
AttributeSet getFnAttributes() const;
- /// \brief Return true if the attribute exists at the given index.
+ /// Return true if the attribute exists at the given index.
bool hasAttribute(unsigned Index, Attribute::AttrKind Kind) const;
- /// \brief Return true if the attribute exists at the given index.
+ /// Return true if the attribute exists at the given index.
bool hasAttribute(unsigned Index, StringRef Kind) const;
- /// \brief Return true if attribute exists at the given index.
+ /// Return true if attribute exists at the given index.
bool hasAttributes(unsigned Index) const;
- /// \brief Return true if the attribute exists for the given argument
+ /// Return true if the attribute exists for the given argument
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
return hasAttribute(ArgNo + FirstArgIndex, Kind);
}
- /// \brief Return true if the attribute exists for the given argument
+ /// Return true if the attribute exists for the given argument
bool hasParamAttr(unsigned ArgNo, StringRef Kind) const {
return hasAttribute(ArgNo + FirstArgIndex, Kind);
}
- /// \brief Return true if attributes exists for the given argument
+ /// Return true if attributes exists for the given argument
bool hasParamAttrs(unsigned ArgNo) const {
return hasAttributes(ArgNo + FirstArgIndex);
}
- /// \brief Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
+ /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
/// may be faster.
bool hasFnAttribute(Attribute::AttrKind Kind) const;
- /// \brief Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
+ /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
/// may be faster.
bool hasFnAttribute(StringRef Kind) const;
- /// \brief Equivalent to hasAttribute(ArgNo + FirstArgIndex, Kind).
+ /// Equivalent to hasAttribute(ArgNo + FirstArgIndex, Kind).
bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;
- /// \brief Return true if the specified attribute is set for at least one
+ /// Return true if the specified attribute is set for at least one
/// parameter or for the return value. If Index is not nullptr, the index
/// of a parameter with the specified attribute is provided.
bool hasAttrSomewhere(Attribute::AttrKind Kind,
unsigned *Index = nullptr) const;
- /// \brief Return the attribute object that exists at the given index.
+ /// Return the attribute object that exists at the given index.
Attribute getAttribute(unsigned Index, Attribute::AttrKind Kind) const;
- /// \brief Return the attribute object that exists at the given index.
+ /// Return the attribute object that exists at the given index.
Attribute getAttribute(unsigned Index, StringRef Kind) const;
- /// \brief Return the attribute object that exists at the arg index.
+ /// Return the attribute object that exists at the arg index.
Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
return getAttribute(ArgNo + FirstArgIndex, Kind);
}
- /// \brief Return the attribute object that exists at the given index.
+ /// Return the attribute object that exists at the given index.
Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const {
return getAttribute(ArgNo + FirstArgIndex, Kind);
}
- /// \brief Return the alignment of the return value.
+ /// Return the alignment of the return value.
unsigned getRetAlignment() const;
- /// \brief Return the alignment for the specified function parameter.
+ /// Return the alignment for the specified function parameter.
unsigned getParamAlignment(unsigned ArgNo) const;
- /// \brief Get the stack alignment.
+ /// Get the stack alignment.
unsigned getStackAlignment(unsigned Index) const;
- /// \brief Get the number of dereferenceable bytes (or zero if unknown).
+ /// Get the number of dereferenceable bytes (or zero if unknown).
uint64_t getDereferenceableBytes(unsigned Index) const;
- /// \brief Get the number of dereferenceable bytes (or zero if unknown) of an
+ /// Get the number of dereferenceable bytes (or zero if unknown) of an
/// arg.
uint64_t getParamDereferenceableBytes(unsigned ArgNo) const {
return getDereferenceableBytes(ArgNo + FirstArgIndex);
}
- /// \brief Get the number of dereferenceable_or_null bytes (or zero if
+ /// Get the number of dereferenceable_or_null bytes (or zero if
/// unknown).
uint64_t getDereferenceableOrNullBytes(unsigned Index) const;
- /// \brief Get the number of dereferenceable_or_null bytes (or zero if
+ /// Get the number of dereferenceable_or_null bytes (or zero if
/// unknown) of an arg.
uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const {
return getDereferenceableOrNullBytes(ArgNo + FirstArgIndex);
@@ -615,7 +614,7 @@ public:
std::pair<unsigned, Optional<unsigned>>
getAllocSizeArgs(unsigned Index) const;
- /// \brief Return the attributes at the index as a string.
+ /// Return the attributes at the index as a string.
std::string getAsString(unsigned Index, bool InAttrGrp = false) const;
//===--------------------------------------------------------------------===//
@@ -637,12 +636,12 @@ public:
bool operator==(const AttributeList &RHS) const { return pImpl == RHS.pImpl; }
bool operator!=(const AttributeList &RHS) const { return pImpl != RHS.pImpl; }
- /// \brief Return a raw pointer that uniquely identifies this attribute list.
+ /// Return a raw pointer that uniquely identifies this attribute list.
void *getRawPointer() const {
return pImpl;
}
- /// \brief Return true if there are no attributes.
+ /// Return true if there are no attributes.
bool isEmpty() const { return pImpl == nullptr; }
void dump() const;
@@ -650,16 +649,16 @@ public:
//===----------------------------------------------------------------------===//
/// \class
-/// \brief Provide DenseMapInfo for AttributeList.
+/// Provide DenseMapInfo for AttributeList.
template <> struct DenseMapInfo<AttributeList> {
- static inline AttributeList getEmptyKey() {
- uintptr_t Val = static_cast<uintptr_t>(-1);
+ static AttributeList getEmptyKey() {
+ auto Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<void*>::NumLowBitsAvailable;
return AttributeList(reinterpret_cast<AttributeListImpl *>(Val));
}
- static inline AttributeList getTombstoneKey() {
- uintptr_t Val = static_cast<uintptr_t>(-2);
+ static AttributeList getTombstoneKey() {
+ auto Val = static_cast<uintptr_t>(-2);
Val <<= PointerLikeTypeTraits<void*>::NumLowBitsAvailable;
return AttributeList(reinterpret_cast<AttributeListImpl *>(Val));
}
@@ -676,7 +675,7 @@ template <> struct DenseMapInfo<AttributeList> {
//===----------------------------------------------------------------------===//
/// \class
-/// \brief This class is used in conjunction with the Attribute::get method to
+/// This class is used in conjunction with the Attribute::get method to
/// create an Attribute object. The object itself is uniquified. The Builder's
/// value, however, is not. So this can be used as a quick way to test for
/// equality, presence of attributes, etc.
@@ -691,73 +690,75 @@ class AttrBuilder {
public:
AttrBuilder() = default;
+
AttrBuilder(const Attribute &A) {
addAttribute(A);
}
+
AttrBuilder(AttributeList AS, unsigned Idx);
AttrBuilder(AttributeSet AS);
void clear();
- /// \brief Add an attribute to the builder.
+ /// Add an attribute to the builder.
AttrBuilder &addAttribute(Attribute::AttrKind Val);
- /// \brief Add the Attribute object to the builder.
+ /// Add the Attribute object to the builder.
AttrBuilder &addAttribute(Attribute A);
- /// \brief Add the target-dependent attribute to the builder.
+ /// Add the target-dependent attribute to the builder.
AttrBuilder &addAttribute(StringRef A, StringRef V = StringRef());
- /// \brief Remove an attribute from the builder.
+ /// Remove an attribute from the builder.
AttrBuilder &removeAttribute(Attribute::AttrKind Val);
- /// \brief Remove the attributes from the builder.
+ /// Remove the attributes from the builder.
AttrBuilder &removeAttributes(AttributeList A, uint64_t WithoutIndex);
- /// \brief Remove the target-dependent attribute to the builder.
+ /// Remove the target-dependent attribute to the builder.
AttrBuilder &removeAttribute(StringRef A);
- /// \brief Add the attributes from the builder.
+ /// Add the attributes from the builder.
AttrBuilder &merge(const AttrBuilder &B);
- /// \brief Remove the attributes from the builder.
+ /// Remove the attributes from the builder.
AttrBuilder &remove(const AttrBuilder &B);
- /// \brief Return true if the builder has any attribute that's in the
+ /// Return true if the builder has any attribute that's in the
/// specified builder.
bool overlaps(const AttrBuilder &B) const;
- /// \brief Return true if the builder has the specified attribute.
+ /// Return true if the builder has the specified attribute.
bool contains(Attribute::AttrKind A) const {
assert((unsigned)A < Attribute::EndAttrKinds && "Attribute out of range!");
return Attrs[A];
}
- /// \brief Return true if the builder has the specified target-dependent
+ /// Return true if the builder has the specified target-dependent
/// attribute.
bool contains(StringRef A) const;
- /// \brief Return true if the builder has IR-level attributes.
+ /// Return true if the builder has IR-level attributes.
bool hasAttributes() const;
- /// \brief Return true if the builder has any attribute that's in the
+ /// Return true if the builder has any attribute that's in the
/// specified attribute.
bool hasAttributes(AttributeList A, uint64_t Index) const;
- /// \brief Return true if the builder has an alignment attribute.
+ /// Return true if the builder has an alignment attribute.
bool hasAlignmentAttr() const;
- /// \brief Retrieve the alignment attribute, if it exists.
+ /// Retrieve the alignment attribute, if it exists.
uint64_t getAlignment() const { return Alignment; }
- /// \brief Retrieve the stack alignment attribute, if it exists.
+ /// Retrieve the stack alignment attribute, if it exists.
uint64_t getStackAlignment() const { return StackAlignment; }
- /// \brief Retrieve the number of dereferenceable bytes, if the
+ /// Retrieve the number of dereferenceable bytes, if the
/// dereferenceable attribute exists (zero is returned otherwise).
uint64_t getDereferenceableBytes() const { return DerefBytes; }
- /// \brief Retrieve the number of dereferenceable_or_null bytes, if the
+ /// Retrieve the number of dereferenceable_or_null bytes, if the
/// dereferenceable_or_null attribute exists (zero is returned otherwise).
uint64_t getDereferenceableOrNullBytes() const { return DerefOrNullBytes; }
@@ -765,19 +766,19 @@ public:
/// doesn't exist, pair(0, 0) is returned.
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
- /// \brief This turns an int alignment (which must be a power of 2) into the
+ /// This turns an int alignment (which must be a power of 2) into the
/// form used internally in Attribute.
AttrBuilder &addAlignmentAttr(unsigned Align);
- /// \brief This turns an int stack alignment (which must be a power of 2) into
+ /// This turns an int stack alignment (which must be a power of 2) into
/// the form used internally in Attribute.
AttrBuilder &addStackAlignmentAttr(unsigned Align);
- /// \brief This turns the number of dereferenceable bytes into the form used
+ /// This turns the number of dereferenceable bytes into the form used
/// internally in Attribute.
AttrBuilder &addDereferenceableAttr(uint64_t Bytes);
- /// \brief This turns the number of dereferenceable_or_null bytes into the
+ /// This turns the number of dereferenceable_or_null bytes into the
/// form used internally in Attribute.
AttrBuilder &addDereferenceableOrNullAttr(uint64_t Bytes);
@@ -789,7 +790,7 @@ public:
/// Attribute.getIntValue().
AttrBuilder &addAllocSizeAttrFromRawRepr(uint64_t RawAllocSizeRepr);
- /// \brief Return true if the builder contains no target-independent
+ /// Return true if the builder contains no target-independent
/// attributes.
bool empty() const { return Attrs.none(); }
@@ -800,18 +801,19 @@ public:
using td_range = iterator_range<td_iterator>;
using td_const_range = iterator_range<td_const_iterator>;
- td_iterator td_begin() { return TargetDepAttrs.begin(); }
- td_iterator td_end() { return TargetDepAttrs.end(); }
+ td_iterator td_begin() { return TargetDepAttrs.begin(); }
+ td_iterator td_end() { return TargetDepAttrs.end(); }
td_const_iterator td_begin() const { return TargetDepAttrs.begin(); }
- td_const_iterator td_end() const { return TargetDepAttrs.end(); }
+ td_const_iterator td_end() const { return TargetDepAttrs.end(); }
td_range td_attrs() { return td_range(td_begin(), td_end()); }
+
td_const_range td_attrs() const {
return td_const_range(td_begin(), td_end());
}
- bool td_empty() const { return TargetDepAttrs.empty(); }
+ bool td_empty() const { return TargetDepAttrs.empty(); }
bool operator==(const AttrBuilder &B);
bool operator!=(const AttrBuilder &B) {
@@ -821,14 +823,14 @@ public:
namespace AttributeFuncs {
-/// \brief Which attributes cannot be applied to a type.
+/// Which attributes cannot be applied to a type.
AttrBuilder typeIncompatible(Type *Ty);
/// \returns Return true if the two functions have compatible target-independent
/// attributes for inlining purposes.
bool areInlineCompatible(const Function &Caller, const Function &Callee);
-/// \brief Merge caller's and callee's attributes.
+/// Merge caller's and callee's attributes.
void mergeAttributesForInlining(Function &Caller, const Function &Callee);
} // end namespace AttributeFuncs
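
For illustration, the usual AttrBuilder / AttributeList round trip using only calls declared above; the function being annotated and the argument index are assumptions:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

void markFirstParamNonNull(llvm::Function &F) {
  llvm::LLVMContext &Ctx = F.getContext();
  llvm::AttrBuilder B;
  B.addAttribute(llvm::Attribute::NonNull);
  B.addAlignmentAttr(8); // must be a power of two, per addAlignmentAttr above
  // Attribute lists are immutable: addParamAttributes returns a fresh list.
  F.setAttributes(F.getAttributes().addParamAttributes(Ctx, /*ArgNo=*/0, B));
}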
diff --git a/contrib/llvm/include/llvm/IR/Attributes.td b/contrib/llvm/include/llvm/IR/Attributes.td
index ebe5c1985875..1019f867aab0 100644
--- a/contrib/llvm/include/llvm/IR/Attributes.td
+++ b/contrib/llvm/include/llvm/IR/Attributes.td
@@ -106,9 +106,15 @@ def NoRedZone : EnumAttr<"noredzone">;
/// Mark the function as not returning.
def NoReturn : EnumAttr<"noreturn">;
+/// Disable Indirect Branch Tracking.
+def NoCfCheck : EnumAttr<"nocf_check">;
+
/// Function doesn't unwind stack.
def NoUnwind : EnumAttr<"nounwind">;
+/// Select optimizations for best fuzzing signal.
+def OptForFuzzing : EnumAttr<"optforfuzzing">;
+
/// opt_size.
def OptimizeForSize : EnumAttr<"optsize">;
@@ -130,6 +136,9 @@ def ReturnsTwice : EnumAttr<"returns_twice">;
/// Safe Stack protection.
def SafeStack : EnumAttr<"safestack">;
+/// Shadow Call Stack protection.
+def ShadowCallStack : EnumAttr<"shadowcallstack">;
+
/// Sign extended before/after call.
def SExt : EnumAttr<"signext">;
@@ -205,6 +214,7 @@ def : CompatRule<"isEqual<SanitizeThreadAttr>">;
def : CompatRule<"isEqual<SanitizeMemoryAttr>">;
def : CompatRule<"isEqual<SanitizeHWAddressAttr>">;
def : CompatRule<"isEqual<SafeStackAttr>">;
+def : CompatRule<"isEqual<ShadowCallStackAttr>">;
class MergeRule<string F> {
// The name of the function called to merge the attributes of the caller and
@@ -225,3 +235,4 @@ def : MergeRule<"setOR<ProfileSampleAccurateAttr>">;
def : MergeRule<"adjustCallerSSPLevel">;
def : MergeRule<"adjustCallerStackProbes">;
def : MergeRule<"adjustCallerStackProbeSize">;
+def : MergeRule<"adjustMinLegalVectorWidth">;
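
These EnumAttr definitions surface as generated Attribute::AttrKind values, so, as a sketch (assuming a build whose generated enum includes them), they can be applied like any other function attribute:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

void applyHardening(llvm::Function &F) {
  F.addFnAttr(llvm::Attribute::ShadowCallStack); // shadowcallstack
  F.addFnAttr(llvm::Attribute::NoCfCheck);       // nocf_check
}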
diff --git a/contrib/llvm/include/llvm/IR/AutoUpgrade.h b/contrib/llvm/include/llvm/IR/AutoUpgrade.h
index 3f406f0cf196..8cf574c6a138 100644
--- a/contrib/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/contrib/llvm/include/llvm/IR/AutoUpgrade.h
@@ -37,6 +37,10 @@ namespace llvm {
/// intrinsic function with a call to the specified new function.
void UpgradeIntrinsicCall(CallInst *CI, Function *NewFn);
+ // This upgrades the comment for objc retain release markers in inline asm
+ // calls
+ void UpgradeInlineAsmString(std::string *AsmStr);
+
/// This is an auto-upgrade hook for any old intrinsic function syntaxes
/// which need to have both the function updated as well as all calls updated
/// to the new function. This should only be run in a post-processing fashion
@@ -51,6 +55,10 @@ namespace llvm {
/// module is modified.
bool UpgradeModuleFlags(Module &M);
+ /// This checks for objc retain release marker which should be upgraded. It
+ /// returns true if module is modified.
+ bool UpgradeRetainReleaseMarker(Module &M);
+
void UpgradeSectionAttributes(Module &M);
/// If the given TBAA tag uses the scalar TBAA format, create a new node
diff --git a/contrib/llvm/include/llvm/IR/BasicBlock.h b/contrib/llvm/include/llvm/IR/BasicBlock.h
index 77cfc9776df0..1ee19975af75 100644
--- a/contrib/llvm/include/llvm/IR/BasicBlock.h
+++ b/contrib/llvm/include/llvm/IR/BasicBlock.h
@@ -41,7 +41,7 @@ class PHINode;
class TerminatorInst;
class ValueSymbolTable;
-/// \brief LLVM Basic Block Representation
+/// LLVM Basic Block Representation
///
/// This represents a single basic block in LLVM. A basic block is simply a
/// container of instructions that execute sequentially. Basic blocks are Values
@@ -70,7 +70,7 @@ private:
void setParent(Function *parent);
- /// \brief Constructor.
+ /// Constructor.
///
/// If the function parameter is specified, the basic block is automatically
/// inserted at either the end of the function (if InsertBefore is null), or
@@ -84,7 +84,7 @@ public:
BasicBlock &operator=(const BasicBlock &) = delete;
~BasicBlock();
- /// \brief Get the context in which this basic block lives.
+ /// Get the context in which this basic block lives.
LLVMContext &getContext() const;
/// Instruction iterators...
@@ -93,7 +93,7 @@ public:
using reverse_iterator = InstListType::reverse_iterator;
using const_reverse_iterator = InstListType::const_reverse_iterator;
- /// \brief Creates a new BasicBlock.
+ /// Creates a new BasicBlock.
///
/// If the Parent parameter is specified, the basic block is automatically
/// inserted at either the end of the function (if InsertBefore is 0), or
@@ -104,12 +104,12 @@ public:
return new BasicBlock(Context, Name, Parent, InsertBefore);
}
- /// \brief Return the enclosing method, or null if none.
+ /// Return the enclosing method, or null if none.
const Function *getParent() const { return Parent; }
Function *getParent() { return Parent; }
- /// \brief Return the module owning the function this basic block belongs to,
- /// or nullptr it the function does not have a module.
+ /// Return the module owning the function this basic block belongs to, or
+ /// nullptr if the function does not have a module.
///
/// Note: this is undefined behavior if the block does not have a parent.
const Module *getModule() const;
@@ -118,34 +118,34 @@ public:
static_cast<const BasicBlock *>(this)->getModule());
}
- /// \brief Returns the terminator instruction if the block is well formed or
- /// null if the block is not well formed.
+ /// Returns the terminator instruction if the block is well formed or null
+ /// if the block is not well formed.
const TerminatorInst *getTerminator() const LLVM_READONLY;
TerminatorInst *getTerminator() {
return const_cast<TerminatorInst *>(
static_cast<const BasicBlock *>(this)->getTerminator());
}
- /// \brief Returns the call instruction calling @llvm.experimental.deoptimize
- /// prior to the terminating return instruction of this basic block, if such a
- /// call is present. Otherwise, returns null.
+ /// Returns the call instruction calling \@llvm.experimental.deoptimize
+ /// prior to the terminating return instruction of this basic block, if such
+ /// a call is present. Otherwise, returns null.
const CallInst *getTerminatingDeoptimizeCall() const;
CallInst *getTerminatingDeoptimizeCall() {
return const_cast<CallInst *>(
static_cast<const BasicBlock *>(this)->getTerminatingDeoptimizeCall());
}
- /// \brief Returns the call instruction marked 'musttail' prior to the
- /// terminating return instruction of this basic block, if such a call is
- /// present. Otherwise, returns null.
+ /// Returns the call instruction marked 'musttail' prior to the terminating
+ /// return instruction of this basic block, if such a call is present.
+ /// Otherwise, returns null.
const CallInst *getTerminatingMustTailCall() const;
CallInst *getTerminatingMustTailCall() {
return const_cast<CallInst *>(
static_cast<const BasicBlock *>(this)->getTerminatingMustTailCall());
}
- /// \brief Returns a pointer to the first instruction in this block that is
- /// not a PHINode instruction.
+ /// Returns a pointer to the first instruction in this block that is not a
+ /// PHINode instruction.
///
/// When adding instructions to the beginning of the basic block, they should
/// be added before the returned value, not before the first instruction,
@@ -156,23 +156,23 @@ public:
static_cast<const BasicBlock *>(this)->getFirstNonPHI());
}
- /// \brief Returns a pointer to the first instruction in this block that is not
- /// a PHINode or a debug intrinsic.
+ /// Returns a pointer to the first instruction in this block that is not a
+ /// PHINode or a debug intrinsic.
const Instruction* getFirstNonPHIOrDbg() const;
Instruction* getFirstNonPHIOrDbg() {
return const_cast<Instruction *>(
static_cast<const BasicBlock *>(this)->getFirstNonPHIOrDbg());
}
- /// \brief Returns a pointer to the first instruction in this block that is not
- /// a PHINode, a debug intrinsic, or a lifetime intrinsic.
+ /// Returns a pointer to the first instruction in this block that is not a
+ /// PHINode, a debug intrinsic, or a lifetime intrinsic.
const Instruction* getFirstNonPHIOrDbgOrLifetime() const;
Instruction* getFirstNonPHIOrDbgOrLifetime() {
return const_cast<Instruction *>(
static_cast<const BasicBlock *>(this)->getFirstNonPHIOrDbgOrLifetime());
}
- /// \brief Returns an iterator to the first instruction in this block that is
+ /// Returns an iterator to the first instruction in this block that is
/// suitable for inserting a non-PHI instruction.
///
/// In particular, it skips all PHIs and LandingPad instructions.
@@ -182,23 +182,35 @@ public:
->getFirstInsertionPt().getNonConst();
}
- /// \brief Unlink 'this' from the containing function, but do not delete it.
+ /// Return a const iterator range over the instructions in the block, skipping
+ /// any debug instructions.
+ iterator_range<filter_iterator<BasicBlock::const_iterator,
+ std::function<bool(const Instruction &)>>>
+ instructionsWithoutDebug() const;
+
+ /// Return an iterator range over the instructions in the block, skipping any
+ /// debug instructions.
+ iterator_range<filter_iterator<BasicBlock::iterator,
+ std::function<bool(Instruction &)>>>
+ instructionsWithoutDebug();
+
+ /// Unlink 'this' from the containing function, but do not delete it.
void removeFromParent();
- /// \brief Unlink 'this' from the containing function and delete it.
+ /// Unlink 'this' from the containing function and delete it.
///
// \returns an iterator pointing to the element after the erased one.
SymbolTableList<BasicBlock>::iterator eraseFromParent();
- /// \brief Unlink this basic block from its current function and insert it
- /// into the function that \p MovePos lives in, right before \p MovePos.
+ /// Unlink this basic block from its current function and insert it into
+ /// the function that \p MovePos lives in, right before \p MovePos.
void moveBefore(BasicBlock *MovePos);
- /// \brief Unlink this basic block from its current function and insert it
+ /// Unlink this basic block from its current function and insert it
/// right after \p MovePos in the function \p MovePos lives in.
void moveAfter(BasicBlock *MovePos);
- /// \brief Insert unlinked basic block into a function.
+ /// Insert unlinked basic block into a function.
///
/// Inserts an unlinked basic block into \c Parent. If \c InsertBefore is
/// provided, inserts before that basic block, otherwise inserts at the end.
@@ -206,7 +218,7 @@ public:
/// \pre \a getParent() is \c nullptr.
void insertInto(Function *Parent, BasicBlock *InsertBefore = nullptr);
- /// \brief Return the predecessor of this block if it has a single predecessor
+ /// Return the predecessor of this block if it has a single predecessor
/// block. Otherwise return a null pointer.
const BasicBlock *getSinglePredecessor() const;
BasicBlock *getSinglePredecessor() {
@@ -214,7 +226,7 @@ public:
static_cast<const BasicBlock *>(this)->getSinglePredecessor());
}
- /// \brief Return the predecessor of this block if it has a unique predecessor
+ /// Return the predecessor of this block if it has a unique predecessor
/// block. Otherwise return a null pointer.
///
/// Note that unique predecessor doesn't mean single edge, there can be
@@ -226,7 +238,7 @@ public:
static_cast<const BasicBlock *>(this)->getUniquePredecessor());
}
- /// \brief Return the successor of this block if it has a single successor.
+ /// Return the successor of this block if it has a single successor.
/// Otherwise return a null pointer.
///
/// This method is analogous to getSinglePredecessor above.
@@ -236,7 +248,7 @@ public:
static_cast<const BasicBlock *>(this)->getSingleSuccessor());
}
- /// \brief Return the successor of this block if it has a unique successor.
+ /// Return the successor of this block if it has a unique successor.
/// Otherwise return a null pointer.
///
/// This method is analogous to getUniquePredecessor above.
@@ -310,28 +322,28 @@ public:
}
iterator_range<phi_iterator> phis();
- /// \brief Return the underlying instruction list container.
+ /// Return the underlying instruction list container.
///
/// Currently you need to access the underlying instruction list container
/// directly if you want to modify it.
const InstListType &getInstList() const { return InstList; }
InstListType &getInstList() { return InstList; }
- /// \brief Returns a pointer to a member of the instruction list.
+ /// Returns a pointer to a member of the instruction list.
static InstListType BasicBlock::*getSublistAccess(Instruction*) {
return &BasicBlock::InstList;
}
- /// \brief Returns a pointer to the symbol table if one exists.
+ /// Returns a pointer to the symbol table if one exists.
ValueSymbolTable *getValueSymbolTable();
- /// \brief Methods for support type inquiry through isa, cast, and dyn_cast.
+ /// Methods for support type inquiry through isa, cast, and dyn_cast.
static bool classof(const Value *V) {
return V->getValueID() == Value::BasicBlockVal;
}
- /// \brief Cause all subinstructions to "let go" of all the references that
- /// said subinstructions are maintaining.
+ /// Cause all subinstructions to "let go" of all the references that said
+ /// subinstructions are maintaining.
///
/// This allows one to 'delete' a whole class at a time, even though there may
/// be circular references... first all references are dropped, and all use
@@ -340,8 +352,8 @@ public:
/// except operator delete.
void dropAllReferences();
- /// \brief Notify the BasicBlock that the predecessor \p Pred is no longer
- /// able to reach it.
+ /// Notify the BasicBlock that the predecessor \p Pred is no longer able to
+ /// reach it.
///
/// This is actually not used to update the Predecessor list, but is actually
/// used to update the PHI nodes that reside in the block. Note that this
@@ -350,8 +362,7 @@ public:
bool canSplitPredecessors() const;
- /// \brief Split the basic block into two basic blocks at the specified
- /// instruction.
+ /// Split the basic block into two basic blocks at the specified instruction.
///
/// Note that all instructions BEFORE the specified iterator stay as part of
/// the original basic block, an unconditional branch is added to the original
@@ -371,37 +382,37 @@ public:
return splitBasicBlock(I->getIterator(), BBName);
}
- /// \brief Returns true if there are any uses of this basic block other than
+ /// Returns true if there are any uses of this basic block other than
/// direct branches, switches, etc. to it.
bool hasAddressTaken() const { return getSubclassDataFromValue() != 0; }
- /// \brief Update all phi nodes in this basic block's successors to refer to
- /// basic block \p New instead of to it.
+ /// Update all phi nodes in this basic block's successors to refer to basic
+ /// block \p New instead of to it.
void replaceSuccessorsPhiUsesWith(BasicBlock *New);
- /// \brief Return true if this basic block is an exception handling block.
+ /// Return true if this basic block is an exception handling block.
bool isEHPad() const { return getFirstNonPHI()->isEHPad(); }
- /// \brief Return true if this basic block is a landing pad.
+ /// Return true if this basic block is a landing pad.
///
/// Being a ``landing pad'' means that the basic block is the destination of
/// the 'unwind' edge of an invoke instruction.
bool isLandingPad() const;
- /// \brief Return the landingpad instruction associated with the landing pad.
+ /// Return the landingpad instruction associated with the landing pad.
const LandingPadInst *getLandingPadInst() const;
LandingPadInst *getLandingPadInst() {
return const_cast<LandingPadInst *>(
static_cast<const BasicBlock *>(this)->getLandingPadInst());
}
- /// \brief Return true if it is legal to hoist instructions into this block.
+ /// Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() const;
Optional<uint64_t> getIrrLoopHeaderWeight() const;
private:
- /// \brief Increment the internal refcount of the number of BlockAddresses
+ /// Increment the internal refcount of the number of BlockAddresses
/// referencing this BasicBlock by \p Amt.
///
/// This is almost always 0, sometimes one possibly, but almost never 2, and
@@ -412,8 +423,8 @@ private:
"Refcount wrap-around");
}
- /// \brief Shadow Value::setValueSubclassData with a private forwarding method
- /// so that any future subclasses cannot accidentally use it.
+ /// Shadow Value::setValueSubclassData with a private forwarding method so
+ /// that any future subclasses cannot accidentally use it.
void setValueSubclassData(unsigned short D) {
Value::setValueSubclassData(D);
}
@@ -422,6 +433,10 @@ private:
// Create wrappers for C Binding types (see CBindingWrapping.h).
DEFINE_SIMPLE_CONVERSION_FUNCTIONS(BasicBlock, LLVMBasicBlockRef)
+/// Advance \p It while it points to a debug instruction and return the result.
+/// This assumes that \p It is not at the end of a block.
+BasicBlock::iterator skipDebugIntrinsics(BasicBlock::iterator It);
+
} // end namespace llvm
#endif // LLVM_IR_BASICBLOCK_H
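A minimal usage sketch of the skipDebugIntrinsics helper added above (BB is an assumed non-empty BasicBlock*, and the code is assumed to be inside namespace llvm or suitably qualified):

    BasicBlock::iterator It = skipDebugIntrinsics(BB->begin());
    Instruction &FirstRealInst = *It;  // first non-debug instruction; the helper
                                       // requires that BB->begin() != BB->end()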
diff --git a/contrib/llvm/include/llvm/IR/CFG.h b/contrib/llvm/include/llvm/IR/CFG.h
index e259e42e1ce4..f4988e7f1fec 100644
--- a/contrib/llvm/include/llvm/IR/CFG.h
+++ b/contrib/llvm/include/llvm/IR/CFG.h
@@ -107,6 +107,9 @@ inline const_pred_iterator pred_end(const BasicBlock *BB) {
inline bool pred_empty(const BasicBlock *BB) {
return pred_begin(BB) == pred_end(BB);
}
+inline unsigned pred_size(const BasicBlock *BB) {
+ return std::distance(pred_begin(BB), pred_end(BB));
+}
inline pred_range predecessors(BasicBlock *BB) {
return pred_range(pred_begin(BB), pred_end(BB));
}
@@ -140,6 +143,9 @@ inline succ_const_iterator succ_end(const BasicBlock *BB) {
inline bool succ_empty(const BasicBlock *BB) {
return succ_begin(BB) == succ_end(BB);
}
+inline unsigned succ_size(const BasicBlock *BB) {
+ return std::distance(succ_begin(BB), succ_end(BB));
+}
inline succ_range successors(BasicBlock *BB) {
return succ_range(succ_begin(BB), succ_end(BB));
}
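pred_size and succ_size round out the existing pred_empty/succ_empty helpers. A rough sketch of how a pass might use them (BB is an assumed non-null BasicBlock*):

    if (pred_size(BB) == 1 && succ_size(BB) == 1) {
      // BB has exactly one incoming and one outgoing CFG edge.
    }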
diff --git a/contrib/llvm/include/llvm/IR/CallSite.h b/contrib/llvm/include/llvm/IR/CallSite.h
index 5b10da8f2aee..2162ccb982b0 100644
--- a/contrib/llvm/include/llvm/IR/CallSite.h
+++ b/contrib/llvm/include/llvm/IR/CallSite.h
@@ -637,7 +637,8 @@ public:
if (hasRetAttr(Attribute::NonNull))
return true;
else if (getDereferenceableBytes(AttributeList::ReturnIndex) > 0 &&
- getType()->getPointerAddressSpace() == 0)
+ !NullPointerIsDefined(getCaller(),
+ getType()->getPointerAddressSpace()))
return true;
return false;
diff --git a/contrib/llvm/include/llvm/IR/CallingConv.h b/contrib/llvm/include/llvm/IR/CallingConv.h
index 84fe836adc35..b9c02d7ed424 100644
--- a/contrib/llvm/include/llvm/IR/CallingConv.h
+++ b/contrib/llvm/include/llvm/IR/CallingConv.h
@@ -26,7 +26,7 @@ namespace CallingConv {
/// A set of enums which specify the assigned numeric values for known llvm
/// calling conventions.
- /// @brief LLVM Calling Convention Representation
+ /// LLVM Calling Convention Representation
enum {
/// C - The default llvm calling convention, compatible with C. This
/// convention is the only calling convention that supports varargs calls.
@@ -139,11 +139,11 @@ namespace CallingConv {
/// Intel_OCL_BI - Calling conventions for Intel OpenCL built-ins
Intel_OCL_BI = 77,
- /// \brief The C convention as specified in the x86-64 supplement to the
+ /// The C convention as specified in the x86-64 supplement to the
/// System V ABI, used on most non-Windows systems.
X86_64_SysV = 78,
- /// \brief The C convention as implemented on Windows/x86-64 and
+ /// The C convention as implemented on Windows/x86-64 and
/// AArch64. This convention differs from the more common
/// \c X86_64_SysV convention in a number of ways, most notably in
/// that XMM registers used to pass arguments are shadowed by GPRs,
@@ -153,17 +153,17 @@ namespace CallingConv {
/// registers to variadic functions.
Win64 = 79,
- /// \brief MSVC calling convention that passes vectors and vector aggregates
+ /// MSVC calling convention that passes vectors and vector aggregates
/// in SSE registers.
X86_VectorCall = 80,
- /// \brief Calling convention used by HipHop Virtual Machine (HHVM) to
+ /// Calling convention used by HipHop Virtual Machine (HHVM) to
/// perform calls to and from translation cache, and for calling PHP
/// functions.
/// HHVM calling convention supports tail/sibling call elimination.
HHVM = 81,
- /// \brief HHVM calling convention for invoking C/C++ helpers.
+ /// HHVM calling convention for invoking C/C++ helpers.
HHVM_C = 82,
/// X86_INTR - x86 hardware interrupt context. Callee may take one or two
diff --git a/contrib/llvm/include/llvm/IR/Comdat.h b/contrib/llvm/include/llvm/IR/Comdat.h
index fa87093ca50a..555121e928f7 100644
--- a/contrib/llvm/include/llvm/IR/Comdat.h
+++ b/contrib/llvm/include/llvm/IR/Comdat.h
@@ -16,6 +16,9 @@
#ifndef LLVM_IR_COMDAT_H
#define LLVM_IR_COMDAT_H
+#include "llvm-c/Types.h"
+#include "llvm/Support/CBindingWrapping.h"
+
namespace llvm {
class raw_ostream;
@@ -55,6 +58,9 @@ private:
SelectionKind SK = Any;
};
+// Create wrappers for C Binding types (see CBindingWrapping.h).
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(Comdat, LLVMComdatRef)
+
inline raw_ostream &operator<<(raw_ostream &OS, const Comdat &C) {
C.print(OS);
return OS;
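DEFINE_SIMPLE_CONVERSION_FUNCTIONS expands to the usual wrap()/unwrap() pair between the C++ type and its C handle; a rough sketch of what becomes available (Ref is an assumed LLVMComdatRef):

    Comdat *C = unwrap(Ref);         // LLVMComdatRef -> Comdat*
    LLVMComdatRef Ref2 = wrap(C);    // Comdat*       -> LLVMComdatRef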
diff --git a/contrib/llvm/include/llvm/IR/Constant.h b/contrib/llvm/include/llvm/IR/Constant.h
index 0c94b58a3112..5fdf0ea00f00 100644
--- a/contrib/llvm/include/llvm/IR/Constant.h
+++ b/contrib/llvm/include/llvm/IR/Constant.h
@@ -38,7 +38,7 @@ class APInt;
/// structurally equivalent constants will always have the same address.
/// Constants are created on demand as needed and never deleted: thus clients
/// don't have to worry about the lifetime of the objects.
-/// @brief LLVM Constant Representation
+/// LLVM Constant Representation
class Constant : public User {
protected:
Constant(Type *ty, ValueTy vty, Use *Ops, unsigned NumOps)
@@ -71,6 +71,26 @@ public:
/// Return true if the value is the smallest signed value.
bool isMinSignedValue() const;
+ /// Return true if this is a finite and non-zero floating-point scalar
+ /// constant or a vector constant with all finite and non-zero elements.
+ bool isFiniteNonZeroFP() const;
+
+ /// Return true if this is a normal (as opposed to denormal) floating-point
+ /// scalar constant or a vector constant with all normal elements.
+ bool isNormalFP() const;
+
+ /// Return true if this scalar has an exact multiplicative inverse or this
+ /// vector has an exact multiplicative inverse for each element in the vector.
+ bool hasExactInverseFP() const;
+
+ /// Return true if this is a floating-point NaN constant or a vector
+ /// floating-point constant with all NaN elements.
+ bool isNaN() const;
+
+ /// Return true if this is a vector constant that includes any undefined
+ /// elements.
+ bool containsUndefElement() const;
+
/// Return true if evaluation of this constant could trap. This is true for
/// things like constant expressions that could divide by zero.
bool canTrap() const;
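The new floating-point predicates let a transform query scalar and vector constants uniformly. A hedged sketch, assuming C is a Constant* that appears as the divisor of an fdiv:

    if (C->isFiniteNonZeroFP() && C->hasExactInverseFP()) {
      // 1/C is exactly representable, so dividing by C can become multiplying by 1/C.
      Constant *One = ConstantFP::get(C->getType(), 1.0);
      Constant *Recip = ConstantExpr::getFDiv(One, C);
    }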
@@ -137,7 +157,7 @@ public:
/// @returns the value for an integer or vector of integer constant of the
/// given type that has all its bits set to true.
- /// @brief Get the all ones value
+ /// Get the all ones value
static Constant *getAllOnesValue(Type* Ty);
/// Return the value for an integer or pointer constant, or a vector thereof,
diff --git a/contrib/llvm/include/llvm/IR/ConstantRange.h b/contrib/llvm/include/llvm/IR/ConstantRange.h
index 6889e2658244..1adda3269abc 100644
--- a/contrib/llvm/include/llvm/IR/ConstantRange.h
+++ b/contrib/llvm/include/llvm/IR/ConstantRange.h
@@ -54,7 +54,7 @@ public:
/// Initialize a range to hold the single specified value.
ConstantRange(APInt Value);
- /// @brief Initialize a range of values explicitly. This will assert out if
+ /// Initialize a range of values explicitly. This will assert out if
/// Lower==Upper and Lower != Min or Max value for its type. It will also
/// assert out if the two APInt's are not the same bit width.
ConstantRange(APInt Lower, APInt Upper);
diff --git a/contrib/llvm/include/llvm/IR/Constants.h b/contrib/llvm/include/llvm/IR/Constants.h
index 0094fd54992a..f9d5ebc560c7 100644
--- a/contrib/llvm/include/llvm/IR/Constants.h
+++ b/contrib/llvm/include/llvm/IR/Constants.h
@@ -80,7 +80,7 @@ public:
//===----------------------------------------------------------------------===//
/// This is the shared class of boolean and integer constants. This class
/// represents both boolean and integral constants.
-/// @brief Class for constant integers.
+/// Class for constant integers.
class ConstantInt final : public ConstantData {
friend class Constant;
@@ -107,7 +107,7 @@ public:
/// to fit the type, unless isSigned is true, in which case the value will
/// be interpreted as a 64-bit signed integer and sign-extended to fit
/// the type.
- /// @brief Get a ConstantInt for a specific value.
+ /// Get a ConstantInt for a specific value.
static ConstantInt *get(IntegerType *Ty, uint64_t V,
bool isSigned = false);
@@ -115,7 +115,7 @@ public:
/// value V will be canonicalized to an unsigned APInt. Accessing it with
/// either getSExtValue() or getZExtValue() will yield a correctly sized and
/// signed value for the type Ty.
- /// @brief Get a ConstantInt for a specific signed value.
+ /// Get a ConstantInt for a specific signed value.
static ConstantInt *getSigned(IntegerType *Ty, int64_t V);
static Constant *getSigned(Type *Ty, int64_t V);
@@ -134,7 +134,7 @@ public:
/// Return the constant as an APInt value reference. This allows clients to
/// obtain a full-precision copy of the value.
- /// @brief Return the constant's value.
+ /// Return the constant's value.
inline const APInt &getValue() const {
return Val;
}
@@ -145,7 +145,7 @@ public:
/// Return the constant as a 64-bit unsigned integer value after it
/// has been zero extended as appropriate for the type of this constant. Note
/// that this method can assert if the value does not fit in 64 bits.
- /// @brief Return the zero extended value.
+ /// Return the zero extended value.
inline uint64_t getZExtValue() const {
return Val.getZExtValue();
}
@@ -153,7 +153,7 @@ public:
/// Return the constant as a 64-bit integer value after it has been sign
/// extended as appropriate for the type of this constant. Note that
/// this method can assert if the value does not fit in 64 bits.
- /// @brief Return the sign extended value.
+ /// Return the sign extended value.
inline int64_t getSExtValue() const {
return Val.getSExtValue();
}
@@ -161,7 +161,7 @@ public:
/// A helper method that can be used to determine if the constant contained
/// within is equal to a constant. This only works for very small values,
/// because this is all that can be represented with all types.
- /// @brief Determine if this constant's value is same as an unsigned char.
+ /// Determine if this constant's value is same as an unsigned char.
bool equalsInt(uint64_t V) const {
return Val == V;
}
@@ -181,7 +181,7 @@ public:
/// the signed version avoids callers having to convert a signed quantity
/// to the appropriate unsigned type before calling the method.
/// @returns true if V is a valid value for type Ty
- /// @brief Determine if the value is in range for the given type.
+ /// Determine if the value is in range for the given type.
static bool isValueValidForType(Type *Ty, uint64_t V);
static bool isValueValidForType(Type *Ty, int64_t V);
@@ -197,7 +197,7 @@ public:
/// This is just a convenience method to make client code smaller for a
/// common case. It also correctly performs the comparison without the
/// potential for an assertion from getZExtValue().
- /// @brief Determine if the value is one.
+ /// Determine if the value is one.
bool isOne() const {
return Val.isOneValue();
}
@@ -205,7 +205,7 @@ public:
/// This function will return true iff every bit in this constant is set
/// to true.
/// @returns true iff this constant's bits are all set to true.
- /// @brief Determine if the value is all ones.
+ /// Determine if the value is all ones.
bool isMinusOne() const {
return Val.isAllOnesValue();
}
@@ -214,7 +214,7 @@ public:
/// value that may be represented by the constant's type.
/// @returns true iff this is the largest value that may be represented
/// by this type.
- /// @brief Determine if the value is maximal.
+ /// Determine if the value is maximal.
bool isMaxValue(bool isSigned) const {
if (isSigned)
return Val.isMaxSignedValue();
@@ -226,7 +226,7 @@ public:
/// value that may be represented by this constant's type.
/// @returns true if this is the smallest value that may be represented by
/// this type.
- /// @brief Determine if the value is minimal.
+ /// Determine if the value is minimal.
bool isMinValue(bool isSigned) const {
if (isSigned)
return Val.isMinSignedValue();
@@ -238,7 +238,7 @@ public:
/// active bits bigger than 64 bits or a value greater than the given uint64_t
/// value.
/// @returns true iff this constant is greater or equal to the given number.
- /// @brief Determine if the value is greater or equal to the given number.
+ /// Determine if the value is greater or equal to the given number.
bool uge(uint64_t Num) const {
return Val.uge(Num);
}
@@ -247,12 +247,12 @@ public:
/// return it, otherwise return the limit value. This causes the value
/// to saturate to the limit.
/// @returns the min of the value of the constant and the specified value
- /// @brief Get the constant's value with a saturation limit
+ /// Get the constant's value with a saturation limit
uint64_t getLimitedValue(uint64_t Limit = ~0ULL) const {
return Val.getLimitedValue(Limit);
}
- /// @brief Methods to support type inquiry through isa, cast, and dyn_cast.
+ /// Methods to support type inquiry through isa, cast, and dyn_cast.
static bool classof(const Value *V) {
return V->getValueID() == ConstantIntVal;
}
@@ -283,6 +283,11 @@ public:
/// for simple constant values like 2.0/1.0 etc, that are known-valid both as
/// host double and as the target format.
static Constant *get(Type* Ty, double V);
+
+ /// If Ty is a vector type, return a Constant with a splat of the given
+ /// value. Otherwise return a ConstantFP for the given value.
+ static Constant *get(Type *Ty, const APFloat &V);
+
static Constant *get(Type* Ty, StringRef Str);
static ConstantFP *get(LLVMContext &Context, const APFloat &V);
static Constant *getNaN(Type *Ty, bool Negative = false, unsigned type = 0);
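The new APFloat overload mirrors the existing double overload and splats across vector types. Sketch (Ctx is an assumed LLVMContext):

    Type *V4F = VectorType::get(Type::getFloatTy(Ctx), 4);
    Constant *Splat = ConstantFP::get(V4F, APFloat(2.5f));
    // Splat is a <4 x float> with every element equal to 2.5.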
@@ -687,15 +692,33 @@ class ConstantDataArray final : public ConstantDataSequential {
public:
ConstantDataArray(const ConstantDataArray &) = delete;
- /// get() constructors - Return a constant with array type with an element
+ /// get() constructor - Return a constant with array type with an element
/// count and element type matching the ArrayRef passed in. Note that this
/// can return a ConstantAggregateZero object.
- static Constant *get(LLVMContext &Context, ArrayRef<uint8_t> Elts);
- static Constant *get(LLVMContext &Context, ArrayRef<uint16_t> Elts);
- static Constant *get(LLVMContext &Context, ArrayRef<uint32_t> Elts);
- static Constant *get(LLVMContext &Context, ArrayRef<uint64_t> Elts);
- static Constant *get(LLVMContext &Context, ArrayRef<float> Elts);
- static Constant *get(LLVMContext &Context, ArrayRef<double> Elts);
+ template <typename ElementTy>
+ static Constant *get(LLVMContext &Context, ArrayRef<ElementTy> Elts) {
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getRaw(StringRef(Data, Elts.size() * sizeof(ElementTy)), Elts.size(),
+ Type::getScalarTy<ElementTy>(Context));
+ }
+
+ /// get() constructor - ArrayTy needs to be compatible with
+ /// ArrayRef<ElementTy>. Calls get(LLVMContext, ArrayRef<ElementTy>).
+ template <typename ArrayTy>
+ static Constant *get(LLVMContext &Context, ArrayTy &Elts) {
+ return ConstantDataArray::get(Context, makeArrayRef(Elts));
+ }
+
+ /// get() constructor - Return a constant with array type with an element
+ /// count and element type matching the NumElements and ElementTy parameters
+ /// passed in. Note that this can return a ConstantAggregateZero object.
+ /// ElementTy needs to be one of i8/i16/i32/i64/float/double. Data is the
+ /// buffer containing the elements. Be careful to make sure Data uses the
+ /// right endianness, the buffer will be used as-is.
+ static Constant *getRaw(StringRef Data, uint64_t NumElements, Type *ElementTy) {
+ Type *Ty = ArrayType::get(ElementTy, NumElements);
+ return getImpl(Data, Ty);
+ }
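The six element-typed overloads collapse into one template, and getRaw() accepts a pre-encoded byte buffer. A sketch of both, with Ctx an assumed LLVMContext:

    uint32_t Vals[] = {1, 2, 3, 4};
    Constant *A = ConstantDataArray::get(Ctx, makeArrayRef(Vals));  // [4 x i32]

    // Same array from raw bytes; the caller is responsible for endianness.
    StringRef Bytes(reinterpret_cast<const char *>(Vals), sizeof(Vals));
    Constant *B = ConstantDataArray::getRaw(Bytes, 4, Type::getInt32Ty(Ctx));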
/// getFP() constructors - Return a constant with array type with an element
/// count and element type of float with precision matching the number of
@@ -802,7 +825,7 @@ public:
/// Return the ConstantTokenNone.
static ConstantTokenNone *get(LLVMContext &Context);
- /// @brief Methods to support type inquiry through isa, cast, and dyn_cast.
+ /// Methods to support type inquiry through isa, cast, and dyn_cast.
static bool classof(const Value *V) {
return V->getValueID() == ConstantTokenNoneVal;
}
@@ -995,10 +1018,15 @@ public:
return getLShr(C1, C2, true);
}
- /// Return the identity for the given binary operation,
- /// i.e. a constant C such that X op C = X and C op X = X for every X. It
- /// returns null if the operator doesn't have an identity.
- static Constant *getBinOpIdentity(unsigned Opcode, Type *Ty);
+ /// Return the identity constant for a binary opcode.
+ /// The identity constant C is defined as X op C = X and C op X = X for every
+ /// X when the binary operation is commutative. If the binop is not
+ /// commutative, callers can acquire the operand 1 identity constant by
+ /// setting AllowRHSConstant to true. For example, any shift has a zero
+ /// identity constant for operand 1: X shift 0 = X.
+ /// Return nullptr if the operator does not have an identity constant.
+ static Constant *getBinOpIdentity(unsigned Opcode, Type *Ty,
+ bool AllowRHSConstant = false);
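Per the comment above, a non-commutative operator only has an operand-1 identity, which is now opt-in. A sketch of the difference (Int32Ty is an assumed 32-bit integer type):

    // shl is not commutative: 0 shl X != X, but X shl 0 == X.
    Constant *NoId = ConstantExpr::getBinOpIdentity(Instruction::Shl, Int32Ty);        // nullptr
    Constant *Zero = ConstantExpr::getBinOpIdentity(Instruction::Shl, Int32Ty, true);  // i32 0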
/// Return the absorbing element for the given binary
/// operation, i.e. a constant C such that X op C = C and C op X = C for
@@ -1009,7 +1037,7 @@ public:
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
- /// \brief Convenience function for getting a Cast operation.
+ /// Convenience function for getting a Cast operation.
///
/// \param ops The opcode for the conversion
/// \param C The constant to be converted
@@ -1018,62 +1046,62 @@ public:
static Constant *getCast(unsigned ops, Constant *C, Type *Ty,
bool OnlyIfReduced = false);
- // @brief Create a ZExt or BitCast cast constant expression
+ // Create a ZExt or BitCast cast constant expression
static Constant *getZExtOrBitCast(
Constant *C, ///< The constant to zext or bitcast
Type *Ty ///< The type to zext or bitcast C to
);
- // @brief Create a SExt or BitCast cast constant expression
+ // Create a SExt or BitCast cast constant expression
static Constant *getSExtOrBitCast(
Constant *C, ///< The constant to sext or bitcast
Type *Ty ///< The type to sext or bitcast C to
);
- // @brief Create a Trunc or BitCast cast constant expression
+ // Create a Trunc or BitCast cast constant expression
static Constant *getTruncOrBitCast(
Constant *C, ///< The constant to trunc or bitcast
Type *Ty ///< The type to trunc or bitcast C to
);
- /// @brief Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant
+ /// Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant
/// expression.
static Constant *getPointerCast(
Constant *C, ///< The pointer value to be casted (operand 0)
Type *Ty ///< The type to which cast should be made
);
- /// @brief Create a BitCast or AddrSpaceCast for a pointer type depending on
+ /// Create a BitCast or AddrSpaceCast for a pointer type depending on
/// the address space.
static Constant *getPointerBitCastOrAddrSpaceCast(
Constant *C, ///< The constant to addrspacecast or bitcast
Type *Ty ///< The type to bitcast or addrspacecast C to
);
- /// @brief Create a ZExt, Bitcast or Trunc for integer -> integer casts
+ /// Create a ZExt, Bitcast or Trunc for integer -> integer casts
static Constant *getIntegerCast(
Constant *C, ///< The integer constant to be casted
Type *Ty, ///< The integer type to cast to
bool isSigned ///< Whether C should be treated as signed or not
);
- /// @brief Create a FPExt, Bitcast or FPTrunc for fp -> fp casts
+ /// Create a FPExt, Bitcast or FPTrunc for fp -> fp casts
static Constant *getFPCast(
Constant *C, ///< The integer constant to be casted
Type *Ty ///< The integer type to cast to
);
- /// @brief Return true if this is a convert constant expression
+ /// Return true if this is a convert constant expression
bool isCast() const;
- /// @brief Return true if this is a compare constant expression
+ /// Return true if this is a compare constant expression
bool isCompare() const;
- /// @brief Return true if this is an insertvalue or extractvalue expression,
+ /// Return true if this is an insertvalue or extractvalue expression,
/// and the getIndices() method may be used.
bool hasIndices() const;
- /// @brief Return true if this is a getelementptr expression and all
+ /// Return true if this is a getelementptr expression and all
/// the index operands are compile-time known integers within the
/// corresponding notional static array extents. Note that this is
/// not equivalent to, a subset of, or a superset of the "inbounds"
@@ -1093,7 +1121,7 @@ public:
static Constant *get(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags = 0, Type *OnlyIfReducedTy = nullptr);
- /// \brief Return an ICmp or FCmp comparison operator constant expression.
+ /// Return an ICmp or FCmp comparison operator constant expression.
///
/// \param OnlyIfReduced see \a getWithOperands() docs.
static Constant *getCompare(unsigned short pred, Constant *C1, Constant *C2,
diff --git a/contrib/llvm/include/llvm/IR/DIBuilder.h b/contrib/llvm/include/llvm/IR/DIBuilder.h
index 3c2074dfe788..06c9421ec1d6 100644
--- a/contrib/llvm/include/llvm/IR/DIBuilder.h
+++ b/contrib/llvm/include/llvm/IR/DIBuilder.h
@@ -18,7 +18,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -46,6 +46,7 @@ namespace llvm {
DICompileUnit *CUNode; ///< The one compile unit created by this DIBuilder.
Function *DeclareFn; ///< llvm.dbg.declare
Function *ValueFn; ///< llvm.dbg.value
+ Function *LabelFn; ///< llvm.dbg.label
SmallVector<Metadata *, 4> AllEnumTypes;
/// Track the RetainTypes, since they can be updated later on.
@@ -69,6 +70,9 @@ namespace llvm {
/// copy.
DenseMap<MDNode *, SmallVector<TrackingMDNodeRef, 1>> PreservedVariables;
+ /// Each subprogram's preserved labels.
+ DenseMap<MDNode *, SmallVector<TrackingMDNodeRef, 1>> PreservedLabels;
+
/// Create a temporary.
///
/// Create an \a temporary node and track it in \a UnresolvedNodes.
@@ -79,6 +83,10 @@ namespace llvm {
DIExpression *Expr, const DILocation *DL,
BasicBlock *InsertBB, Instruction *InsertBefore);
+ /// Internal helper for insertLabel.
+ Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
+ BasicBlock *InsertBB, Instruction *InsertBefore);
+
/// Internal helper for insertDbgValueIntrinsic.
Instruction *
insertDbgValueIntrinsic(llvm::Value *Val, DILocalVariable *VarInfo,
@@ -90,7 +98,10 @@ namespace llvm {
///
/// If \c AllowUnresolved, collect unresolved nodes attached to the module
/// in order to resolve cycles during \a finalize().
- explicit DIBuilder(Module &M, bool AllowUnresolved = true);
+ ///
+ /// If \p CU is given a value other than nullptr, then set \p CUNode to CU.
+ explicit DIBuilder(Module &M, bool AllowUnresolved = true,
+ DICompileUnit *CU = nullptr);
DIBuilder(const DIBuilder &) = delete;
DIBuilder &operator=(const DIBuilder &) = delete;
@@ -138,11 +149,13 @@ namespace llvm {
/// Create a file descriptor to hold debugging information for a file.
/// \param Filename File name.
/// \param Directory Directory.
- /// \param CSKind Checksum kind (e.g. CSK_None, CSK_MD5, CSK_SHA1, etc.).
- /// \param Checksum Checksum data.
- DIFile *createFile(StringRef Filename, StringRef Directory,
- DIFile::ChecksumKind CSKind = DIFile::CSK_None,
- StringRef Checksum = StringRef());
+ /// \param Checksum Optional checksum kind (e.g. CSK_MD5, CSK_SHA1, etc.)
+ /// and value.
+ /// \param Source Optional source text.
+ DIFile *
+ createFile(StringRef Filename, StringRef Directory,
+ Optional<DIFile::ChecksumInfo<StringRef>> Checksum = None,
+ Optional<StringRef> Source = None);
/// Create debugging information entry for a macro.
/// \param Parent Macro parent (could be nullptr).
@@ -163,7 +176,7 @@ namespace llvm {
DIFile *File);
/// Create a single enumerator value.
- DIEnumerator *createEnumerator(StringRef Name, int64_t Val);
+ DIEnumerator *createEnumerator(StringRef Name, int64_t Val, bool IsUnsigned = false);
/// Create a DWARF unspecified type.
DIBasicType *createUnspecifiedType(StringRef Name);
@@ -232,10 +245,11 @@ namespace llvm {
/// \param Ty Original type.
/// \param BaseTy Base type. Ty inherits from base.
/// \param BaseOffset Base offset.
+ /// \param VBPtrOffset Virtual base pointer offset.
/// \param Flags Flags to describe inheritance attribute,
/// e.g. private
DIDerivedType *createInheritance(DIType *Ty, DIType *BaseTy,
- uint64_t BaseOffset,
+ uint64_t BaseOffset, uint32_t VBPtrOffset,
DINode::DIFlags Flags);
/// Create debugging information entry for a member.
@@ -255,6 +269,27 @@ namespace llvm {
uint64_t OffsetInBits,
DINode::DIFlags Flags, DIType *Ty);
+ /// Create debugging information entry for a variant. A variant
+ /// normally should be a member of a variant part.
+ /// \param Scope Member scope.
+ /// \param Name Member name.
+ /// \param File File where this member is defined.
+ /// \param LineNo Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param OffsetInBits Member offset.
+ /// \param Flags Flags to encode member attribute, e.g. private
+ /// \param Discriminant The discriminant for this branch; null for
+ /// the default branch
+ /// \param Ty Parent type.
+ DIDerivedType *createVariantMemberType(DIScope *Scope, StringRef Name,
+ DIFile *File, unsigned LineNo,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ uint64_t OffsetInBits,
+ Constant *Discriminant,
+ DINode::DIFlags Flags, DIType *Ty);
+
/// Create debugging information entry for a bit field member.
/// \param Scope Member scope.
/// \param Name Member name.
@@ -376,6 +411,27 @@ namespace llvm {
unsigned RunTimeLang = 0,
StringRef UniqueIdentifier = "");
+ /// Create debugging information entry for a variant part. A
+ /// variant part normally has a discriminator (though this is not
+ /// required) and a number of variant children.
+ /// \param Scope Scope in which this union is defined.
+ /// \param Name Union name.
+ /// \param File File where this member is defined.
+ /// \param LineNumber Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param Flags Flags to encode member attribute, e.g. private
+ /// \param Discriminator Discriminant member
+ /// \param Elements Variant elements.
+ /// \param UniqueIdentifier A unique identifier for the union.
+ DICompositeType *createVariantPart(DIScope *Scope, StringRef Name,
+ DIFile *File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits,
+ DINode::DIFlags Flags,
+ DIDerivedType *Discriminator,
+ DINodeArray Elements,
+ StringRef UniqueIdentifier = "");
+
/// Create debugging information for template
/// type parameter.
/// \param Scope Scope in which this type is defined.
@@ -442,10 +498,11 @@ namespace llvm {
/// \param Elements Enumeration elements.
/// \param UnderlyingType Underlying type of a C++11/ObjC fixed enum.
/// \param UniqueIdentifier A unique identifier for the enum.
+ /// \param IsFixed Boolean flag indicating whether this is a C++11/ObjC fixed enum.
DICompositeType *createEnumerationType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint32_t AlignInBits, DINodeArray Elements,
- DIType *UnderlyingType, StringRef UniqueIdentifier = "");
+ DIType *UnderlyingType, StringRef UniqueIdentifier = "", bool IsFixed = false);
/// Create subroutine type.
/// \param ParameterTypes An array of subroutine parameter types. This
@@ -458,12 +515,15 @@ namespace llvm {
DINode::DIFlags Flags = DINode::FlagZero,
unsigned CC = 0);
- /// Create a new DIType* with "artificial" flag set.
- DIType *createArtificialType(DIType *Ty);
+ /// Create a distinct clone of \p SP with FlagArtificial set.
+ static DISubprogram *createArtificialSubprogram(DISubprogram *SP);
+
+ /// Create a uniqued clone of \p Ty with FlagArtificial set.
+ static DIType *createArtificialType(DIType *Ty);
- /// Create a new DIType* with the "object pointer"
- /// flag set.
- DIType *createObjectPointerType(DIType *Ty);
+ /// Create a uniqued clone of \p Ty with FlagObjectPointer and
+ /// FlagArtificial set.
+ static DIType *createObjectPointerType(DIType *Ty);
/// Create a permanent forward-declared type.
DICompositeType *createForwardDecl(unsigned Tag, StringRef Name,
@@ -500,6 +560,7 @@ namespace llvm {
/// Create a descriptor for a value range. This
/// implicitly uniques the values returned.
DISubrange *getOrCreateSubrange(int64_t Lo, int64_t Count);
+ DISubrange *getOrCreateSubrange(int64_t Lo, Metadata *CountNode);
/// Create a new descriptor for the specified variable.
/// \param Context Variable scope.
@@ -542,6 +603,14 @@ namespace llvm {
DINode::DIFlags Flags = DINode::FlagZero,
uint32_t AlignInBits = 0);
+ /// Create a new descriptor for a label.
+ ///
+ /// \c Scope must be a \a DILocalScope, and thus its scope chain eventually
+ /// leads to a \a DISubprogram.
+ DILabel *
+ createLabel(DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNo,
+ bool AlwaysPreserve = false);
+
/// Create a new descriptor for a parameter variable.
///
/// \c Scope must be a \a DILocalScope, and thus its scope chain eventually
@@ -733,6 +802,20 @@ namespace llvm {
DIExpression *Expr, const DILocation *DL,
Instruction *InsertBefore);
+ /// Insert a new llvm.dbg.label intrinsic call.
+ /// \param LabelInfo Label's debug info descriptor.
+ /// \param DL Debug info location.
+ /// \param InsertBefore Location for the new intrinsic.
+ Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
+ Instruction *InsertBefore);
+
+ /// Insert a new llvm.dbg.label intrinsic call.
+ /// \param LabelInfo Label's debug info descriptor.
+ /// \param DL Debug info location.
+ /// \param InsertAtEnd Location for the new intrinsic.
+ Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
+ BasicBlock *InsertAtEnd);
+
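A hedged sketch of how the new label support fits together, assuming DIB is a DIBuilder, Scope is a DILocalScope*, and File, Ctx, and InsertBefore already exist (the line/column values are illustrative):

    DILabel *L = DIB.createLabel(Scope, "retry", File, /*LineNo=*/42);
    DIB.insertLabel(L, DILocation::get(Ctx, 42, 1, Scope), InsertBefore);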
/// Insert a new llvm.dbg.value intrinsic call.
/// \param Val llvm::Value of the variable
/// \param VarInfo Variable's debug info descriptor.
diff --git a/contrib/llvm/include/llvm/IR/DataLayout.h b/contrib/llvm/include/llvm/IR/DataLayout.h
index a6c71a5a2c3e..d796a65e6129 100644
--- a/contrib/llvm/include/llvm/IR/DataLayout.h
+++ b/contrib/llvm/include/llvm/IR/DataLayout.h
@@ -61,7 +61,7 @@ enum AlignTypeEnum {
// sunk down to an FTTI element that is queried rather than a global
// preference.
-/// \brief Layout alignment element.
+/// Layout alignment element.
///
/// Stores the alignment data associated with a given alignment type (integer,
/// vector, float) and type bit width.
@@ -69,7 +69,7 @@ enum AlignTypeEnum {
/// \note The unusual order of elements in the structure attempts to reduce
/// padding and make the structure slightly more cache friendly.
struct LayoutAlignElem {
- /// \brief Alignment type from \c AlignTypeEnum
+ /// Alignment type from \c AlignTypeEnum
unsigned AlignType : 8;
unsigned TypeBitWidth : 24;
unsigned ABIAlign : 16;
@@ -81,7 +81,7 @@ struct LayoutAlignElem {
bool operator==(const LayoutAlignElem &rhs) const;
};
-/// \brief Layout pointer alignment element.
+/// Layout pointer alignment element.
///
/// Stores the alignment data associated with a given pointer and address space.
///
@@ -92,15 +92,17 @@ struct PointerAlignElem {
unsigned PrefAlign;
uint32_t TypeByteWidth;
uint32_t AddressSpace;
+ uint32_t IndexWidth;
/// Initializer
static PointerAlignElem get(uint32_t AddressSpace, unsigned ABIAlign,
- unsigned PrefAlign, uint32_t TypeByteWidth);
+ unsigned PrefAlign, uint32_t TypeByteWidth,
+ uint32_t IndexWidth);
bool operator==(const PointerAlignElem &rhs) const;
};
-/// \brief A parsed version of the target data layout string in and methods for
+/// A parsed version of the target data layout string and methods for
/// querying it.
///
/// The target data layout string is specified *by the target* - a frontend
@@ -113,6 +115,7 @@ private:
unsigned AllocaAddrSpace;
unsigned StackNaturalAlign;
+ unsigned ProgramAddrSpace;
enum ManglingModeT {
MM_None,
@@ -126,7 +129,7 @@ private:
SmallVector<unsigned char, 8> LegalIntWidths;
- /// \brief Primitive type alignment data. This is sorted by type and bit
+ /// Primitive type alignment data. This is sorted by type and bit
/// width during construction.
using AlignmentsTy = SmallVector<LayoutAlignElem, 16>;
AlignmentsTy Alignments;
@@ -140,7 +143,7 @@ private:
AlignmentsTy::iterator
findAlignmentLowerBound(AlignTypeEnum AlignType, uint32_t BitWidth);
- /// \brief The string representation used to create this DataLayout
+ /// The string representation used to create this DataLayout
std::string StringRepresentation;
using PointersTy = SmallVector<PointerAlignElem, 8>;
@@ -165,7 +168,8 @@ private:
unsigned getAlignmentInfo(AlignTypeEnum align_type, uint32_t bit_width,
bool ABIAlign, Type *Ty) const;
void setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
- unsigned PrefAlign, uint32_t TypeByteWidth);
+ unsigned PrefAlign, uint32_t TypeByteWidth,
+ uint32_t IndexWidth);
/// Internal helper method that returns requested alignment for type.
unsigned getAlignment(Type *Ty, bool abi_or_pref) const;
@@ -196,6 +200,7 @@ public:
BigEndian = DL.isBigEndian();
AllocaAddrSpace = DL.AllocaAddrSpace;
StackNaturalAlign = DL.StackNaturalAlign;
+ ProgramAddrSpace = DL.ProgramAddrSpace;
ManglingMode = DL.ManglingMode;
LegalIntWidths = DL.LegalIntWidths;
Alignments = DL.Alignments;
@@ -216,7 +221,7 @@ public:
bool isLittleEndian() const { return !BigEndian; }
bool isBigEndian() const { return BigEndian; }
- /// \brief Returns the string representation of the DataLayout.
+ /// Returns the string representation of the DataLayout.
///
/// This representation is in the same format accepted by the string
/// constructor above. This should not be used to compare two DataLayout as
@@ -225,10 +230,10 @@ public:
return StringRepresentation;
}
- /// \brief Test if the DataLayout was constructed from an empty string.
+ /// Test if the DataLayout was constructed from an empty string.
bool isDefault() const { return StringRepresentation.empty(); }
- /// \brief Returns true if the specified type is known to be a native integer
+ /// Returns true if the specified type is known to be a native integer
/// type supported by the CPU.
///
/// For example, i64 is not native on most 32-bit CPUs and i37 is not native
@@ -252,10 +257,18 @@ public:
unsigned getStackAlignment() const { return StackNaturalAlign; }
unsigned getAllocaAddrSpace() const { return AllocaAddrSpace; }
+ unsigned getProgramAddressSpace() const { return ProgramAddrSpace; }
+
bool hasMicrosoftFastStdCallMangling() const {
return ManglingMode == MM_WinCOFFX86;
}
+ /// Returns true if symbols with leading question marks should not receive IR
+ /// mangling. True for Windows mangling modes.
+ bool doNotMangleLeadingQuestionMark() const {
+ return ManglingMode == MM_WinCOFF || ManglingMode == MM_WinCOFFX86;
+ }
+
bool hasLinkerPrivateGlobalPrefix() const { return ManglingMode == MM_MachO; }
StringRef getLinkerPrivateGlobalPrefix() const {
@@ -296,7 +309,7 @@ public:
static const char *getManglingComponent(const Triple &T);
- /// \brief Returns true if the specified type fits in a native integer type
+ /// Returns true if the specified type fits in a native integer type
/// supported by the CPU.
///
/// For example, if the CPU only supports i32 as a native integer type, then
@@ -321,6 +334,9 @@ public:
/// the backends/clients are updated.
unsigned getPointerSize(unsigned AS = 0) const;
+ // Index size used for address calculation.
+ unsigned getIndexSize(unsigned AS) const;
+
/// Return the address spaces containing non-integral pointers. Pointers in
/// this address space don't have a well-defined bitwise representation.
ArrayRef<unsigned> getNonIntegralAddressSpaces() const {
@@ -345,6 +361,11 @@ public:
return getPointerSize(AS) * 8;
}
+ /// Size in bits of index used for address calculation in getelementptr.
+ unsigned getIndexSizeInBits(unsigned AS) const {
+ return getIndexSize(AS) * 8;
+ }
+
/// Layout pointer size, in bits, based on the type. If this function is
/// called with a pointer type, then the type size of the pointer is returned.
/// If this function is called with a vector of pointers, then the type size
@@ -352,6 +373,10 @@ public:
/// vector of pointers.
unsigned getPointerTypeSizeInBits(Type *) const;
+ /// Layout size of the index used in GEP calculation.
+ /// The function should be called with pointer or vector of pointers type.
+ unsigned getIndexTypeSizeInBits(Type *Ty) const;
+
unsigned getPointerTypeSize(Type *Ty) const {
return getPointerTypeSizeInBits(Ty) / 8;
}
@@ -373,13 +398,13 @@ public:
/// [*] The alloc size depends on the alignment, and thus on the target.
/// These values are for x86-32 linux.
- /// \brief Returns the number of bits necessary to hold the specified type.
+ /// Returns the number of bits necessary to hold the specified type.
///
/// For example, returns 36 for i36 and 80 for x86_fp80. The type passed must
/// have a size (Type::isSized() must return true).
uint64_t getTypeSizeInBits(Type *Ty) const;
- /// \brief Returns the maximum number of bytes that may be overwritten by
+ /// Returns the maximum number of bytes that may be overwritten by
/// storing the specified type.
///
/// For example, returns 5 for i36 and 10 for x86_fp80.
@@ -387,7 +412,7 @@ public:
return (getTypeSizeInBits(Ty) + 7) / 8;
}
- /// \brief Returns the maximum number of bits that may be overwritten by
+ /// Returns the maximum number of bits that may be overwritten by
/// storing the specified type; always a multiple of 8.
///
/// For example, returns 40 for i36 and 80 for x86_fp80.
@@ -395,7 +420,7 @@ public:
return 8 * getTypeStoreSize(Ty);
}
- /// \brief Returns the offset in bytes between successive objects of the
+ /// Returns the offset in bytes between successive objects of the
/// specified type, including alignment padding.
///
/// This is the amount that alloca reserves for this type. For example,
@@ -405,7 +430,7 @@ public:
return alignTo(getTypeStoreSize(Ty), getABITypeAlignment(Ty));
}
- /// \brief Returns the offset in bits between successive objects of the
+ /// Returns the offset in bits between successive objects of the
/// specified type, including alignment padding; always a multiple of 8.
///
/// This is the amount that alloca reserves for this type. For example,
@@ -414,64 +439,69 @@ public:
return 8 * getTypeAllocSize(Ty);
}
- /// \brief Returns the minimum ABI-required alignment for the specified type.
+ /// Returns the minimum ABI-required alignment for the specified type.
unsigned getABITypeAlignment(Type *Ty) const;
- /// \brief Returns the minimum ABI-required alignment for an integer type of
+ /// Returns the minimum ABI-required alignment for an integer type of
/// the specified bitwidth.
unsigned getABIIntegerTypeAlignment(unsigned BitWidth) const;
- /// \brief Returns the preferred stack/global alignment for the specified
+ /// Returns the preferred stack/global alignment for the specified
/// type.
///
/// This is always at least as good as the ABI alignment.
unsigned getPrefTypeAlignment(Type *Ty) const;
- /// \brief Returns the preferred alignment for the specified type, returned as
+ /// Returns the preferred alignment for the specified type, returned as
/// log2 of the value (a shift amount).
unsigned getPreferredTypeAlignmentShift(Type *Ty) const;
- /// \brief Returns an integer type with size at least as big as that of a
+ /// Returns an integer type with size at least as big as that of a
/// pointer in the given address space.
IntegerType *getIntPtrType(LLVMContext &C, unsigned AddressSpace = 0) const;
- /// \brief Returns an integer (vector of integer) type with size at least as
+ /// Returns an integer (vector of integer) type with size at least as
/// big as that of a pointer of the given pointer (vector of pointer) type.
Type *getIntPtrType(Type *) const;
- /// \brief Returns the smallest integer type with size at least as big as
+ /// Returns the smallest integer type with size at least as big as
/// Width bits.
Type *getSmallestLegalIntType(LLVMContext &C, unsigned Width = 0) const;
- /// \brief Returns the largest legal integer type, or null if none are set.
+ /// Returns the largest legal integer type, or null if none are set.
Type *getLargestLegalIntType(LLVMContext &C) const {
unsigned LargestSize = getLargestLegalIntTypeSizeInBits();
return (LargestSize == 0) ? nullptr : Type::getIntNTy(C, LargestSize);
}
- /// \brief Returns the size of largest legal integer type size, or 0 if none
+ /// Returns the size of largest legal integer type size, or 0 if none
/// are set.
unsigned getLargestLegalIntTypeSizeInBits() const;
- /// \brief Returns the offset from the beginning of the type for the specified
+ /// Returns the type of a GEP index.
+ /// If it was not specified explicitly, it will be the integer type of the
+ /// pointer width - IntPtrType.
+ Type *getIndexType(Type *PtrTy) const;
+
+ /// Returns the offset from the beginning of the type for the specified
/// indices.
///
/// Note that this takes the element type, not the pointer type.
/// This is used to implement getelementptr.
int64_t getIndexedOffsetInType(Type *ElemTy, ArrayRef<Value *> Indices) const;
- /// \brief Returns a StructLayout object, indicating the alignment of the
+ /// Returns a StructLayout object, indicating the alignment of the
/// struct, its size, and the offsets of its fields.
///
/// Note that this information is lazily cached.
const StructLayout *getStructLayout(StructType *Ty) const;
- /// \brief Returns the preferred alignment of the specified global.
+ /// Returns the preferred alignment of the specified global.
///
/// This includes an explicitly requested alignment (if the global has one).
unsigned getPreferredAlignment(const GlobalVariable *GV) const;
- /// \brief Returns the preferred alignment of the specified global, returned
+ /// Returns the preferred alignment of the specified global, returned
/// in log form.
///
/// This includes an explicitly requested alignment (if the global has one).
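The index width can be narrower than the pointer width (e.g. fat or tagged pointers), so GEP offset arithmetic should be sized by it rather than by the pointer. A sketch, with DL an assumed DataLayout and PtrTy a pointer type:

    unsigned IdxBits = DL.getIndexTypeSizeInBits(PtrTy);  // from the pointer's index-width component
    Type *IdxTy = DL.getIndexType(PtrTy);                 // defaults to the pointer-width integer type
    APInt Offset(IdxBits, 0);                             // accumulator for GEP offset math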
@@ -506,7 +536,7 @@ public:
/// NB: Padding in nested element is not taken into account.
bool hasPadding() const { return IsPadded; }
- /// \brief Given a valid byte offset into the structure, returns the structure
+ /// Given a valid byte offset into the structure, returns the structure
/// index that contains it.
unsigned getElementContainingOffset(uint64_t Offset) const;
diff --git a/contrib/llvm/include/llvm/IR/DebugInfo.h b/contrib/llvm/include/llvm/IR/DebugInfo.h
index 1d8e7e2855fd..01178af3c9ff 100644
--- a/contrib/llvm/include/llvm/IR/DebugInfo.h
+++ b/contrib/llvm/include/llvm/IR/DebugInfo.h
@@ -28,10 +28,10 @@ class DbgDeclareInst;
class DbgValueInst;
class Module;
-/// \brief Find subprogram that is enclosing this scope.
+/// Find subprogram that is enclosing this scope.
DISubprogram *getDISubprogram(const MDNode *Scope);
-/// \brief Strip debug info in the module if it exists.
+/// Strip debug info in the module if it exists.
///
/// To do this, we remove all calls to the debugger intrinsics and any named
/// metadata for debugging. We also remove debug locations for instructions.
@@ -51,10 +51,10 @@ bool stripDebugInfo(Function &F);
/// All debug type metadata nodes are unreachable and garbage collected.
bool stripNonLineTableDebugInfo(Module &M);
-/// \brief Return Debug Info Metadata Version by checking module flags.
+/// Return Debug Info Metadata Version by checking module flags.
unsigned getDebugMetadataVersionFromModule(const Module &M);
-/// \brief Utility to find all debug info in a module.
+/// Utility to find all debug info in a module.
///
/// DebugInfoFinder tries to list all debug info MDNodes used in a module. To
/// list debug info MDNodes used by an instruction, DebugInfoFinder uses
@@ -64,30 +64,33 @@ unsigned getDebugMetadataVersionFromModule(const Module &M);
/// used by the CUs.
class DebugInfoFinder {
public:
- /// \brief Process entire module and collect debug info anchors.
+ /// Process entire module and collect debug info anchors.
void processModule(const Module &M);
+ /// Process a single instruction and collect debug info anchors.
+ void processInstruction(const Module &M, const Instruction &I);
- /// \brief Process DbgDeclareInst.
+ /// Process DbgDeclareInst.
void processDeclare(const Module &M, const DbgDeclareInst *DDI);
- /// \brief Process DbgValueInst.
+ /// Process DbgValueInst.
void processValue(const Module &M, const DbgValueInst *DVI);
- /// \brief Process debug info location.
+ /// Process debug info location.
void processLocation(const Module &M, const DILocation *Loc);
- /// \brief Clear all lists.
+ /// Clear all lists.
void reset();
private:
void InitializeTypeMap(const Module &M);
- void processType(DIType *DT);
- void processSubprogram(DISubprogram *SP);
+ void processCompileUnit(DICompileUnit *CU);
void processScope(DIScope *Scope);
+ void processSubprogram(DISubprogram *SP);
+ void processType(DIType *DT);
bool addCompileUnit(DICompileUnit *CU);
bool addGlobalVariable(DIGlobalVariableExpression *DIG);
+ bool addScope(DIScope *Scope);
bool addSubprogram(DISubprogram *SP);
bool addType(DIType *DT);
- bool addScope(DIScope *Scope);
public:
using compile_unit_iterator =
diff --git a/contrib/llvm/include/llvm/IR/DebugInfoFlags.def b/contrib/llvm/include/llvm/IR/DebugInfoFlags.def
index 7ea6346998fe..b1f5fac64232 100644
--- a/contrib/llvm/include/llvm/IR/DebugInfoFlags.def
+++ b/contrib/llvm/include/llvm/IR/DebugInfoFlags.def
@@ -43,6 +43,11 @@ HANDLE_DI_FLAG((1 << 18), IntroducedVirtual)
HANDLE_DI_FLAG((1 << 19), BitField)
HANDLE_DI_FLAG((1 << 20), NoReturn)
HANDLE_DI_FLAG((1 << 21), MainSubprogram)
+HANDLE_DI_FLAG((1 << 22), TypePassByValue)
+HANDLE_DI_FLAG((1 << 23), TypePassByReference)
+HANDLE_DI_FLAG((1 << 24), FixedEnum)
+HANDLE_DI_FLAG((1 << 25), Thunk)
+HANDLE_DI_FLAG((1 << 26), Trivial)
// To avoid needing a dedicated value for IndirectVirtualBase, we use
// the bitwise or of Virtual and FwdDecl, which does not otherwise
@@ -52,7 +57,7 @@ HANDLE_DI_FLAG((1 << 2) | (1 << 5), IndirectVirtualBase)
#ifdef DI_FLAG_LARGEST_NEEDED
// intended to be used with ADT/BitmaskEnum.h
// NOTE: always must be equal to largest flag, check this when adding new flag
-HANDLE_DI_FLAG((1 << 21), Largest)
+HANDLE_DI_FLAG((1 << 26), Largest)
#undef DI_FLAG_LARGEST_NEEDED
#endif
diff --git a/contrib/llvm/include/llvm/IR/DebugInfoMetadata.h b/contrib/llvm/include/llvm/IR/DebugInfoMetadata.h
index 75b0c43b6512..820746851104 100644
--- a/contrib/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/contrib/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -18,11 +18,13 @@
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
#include <cassert>
@@ -230,6 +232,7 @@ public:
case DITemplateValueParameterKind:
case DIGlobalVariableKind:
case DILocalVariableKind:
+ case DILabelKind:
case DIObjCPropertyKind:
case DIImportedEntityKind:
case DIModuleKind:
@@ -332,31 +335,53 @@ class DISubrange : public DINode {
friend class LLVMContextImpl;
friend class MDNode;
- int64_t Count;
int64_t LowerBound;
- DISubrange(LLVMContext &C, StorageType Storage, int64_t Count,
- int64_t LowerBound)
- : DINode(C, DISubrangeKind, Storage, dwarf::DW_TAG_subrange_type, None),
- Count(Count), LowerBound(LowerBound) {}
+ DISubrange(LLVMContext &C, StorageType Storage, Metadata *Node,
+ int64_t LowerBound, ArrayRef<Metadata *> Ops)
+ : DINode(C, DISubrangeKind, Storage, dwarf::DW_TAG_subrange_type, Ops),
+ LowerBound(LowerBound) {}
+
~DISubrange() = default;
static DISubrange *getImpl(LLVMContext &Context, int64_t Count,
int64_t LowerBound, StorageType Storage,
bool ShouldCreate = true);
+ static DISubrange *getImpl(LLVMContext &Context, Metadata *CountNode,
+ int64_t LowerBound, StorageType Storage,
+ bool ShouldCreate = true);
+
TempDISubrange cloneImpl() const {
- return getTemporary(getContext(), getCount(), getLowerBound());
+ return getTemporary(getContext(), getRawCountNode(), getLowerBound());
}
public:
DEFINE_MDNODE_GET(DISubrange, (int64_t Count, int64_t LowerBound = 0),
(Count, LowerBound))
+ DEFINE_MDNODE_GET(DISubrange, (Metadata *CountNode, int64_t LowerBound = 0),
+ (CountNode, LowerBound))
+
TempDISubrange clone() const { return cloneImpl(); }
int64_t getLowerBound() const { return LowerBound; }
- int64_t getCount() const { return Count; }
+
+ Metadata *getRawCountNode() const {
+ return getOperand(0).get();
+ }
+
+ typedef PointerUnion<ConstantInt*, DIVariable*> CountType;
+
+ CountType getCount() const {
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(getRawCountNode()))
+ return CountType(cast<ConstantInt>(MD->getValue()));
+
+ if (auto *DV = dyn_cast<DIVariable>(getRawCountNode()))
+ return CountType(DV);
+
+ return CountType();
+ }
static bool classof(const Metadata *MD) {
return MD->getMetadataID() == DISubrangeKind;
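The count operand can now be a DIVariable, which is how runtime (VLA) extents are described. A sketch of reading it back, assuming Subrange is a DISubrange*:

    DISubrange::CountType Count = Subrange->getCount();   // PointerUnion<ConstantInt*, DIVariable*>
    if (auto *CI = Count.dyn_cast<ConstantInt *>()) {
      uint64_t Extent = CI->getZExtValue();                // compile-time array bound
      (void)Extent;
    } else if (auto *DV = Count.dyn_cast<DIVariable *>()) {
      // Runtime bound, described by the variable DV.
      (void)DV;
    }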
@@ -372,36 +397,38 @@ class DIEnumerator : public DINode {
friend class MDNode;
int64_t Value;
-
DIEnumerator(LLVMContext &C, StorageType Storage, int64_t Value,
- ArrayRef<Metadata *> Ops)
+ bool IsUnsigned, ArrayRef<Metadata *> Ops)
: DINode(C, DIEnumeratorKind, Storage, dwarf::DW_TAG_enumerator, Ops),
- Value(Value) {}
+ Value(Value) {
+ SubclassData32 = IsUnsigned;
+ }
~DIEnumerator() = default;
static DIEnumerator *getImpl(LLVMContext &Context, int64_t Value,
- StringRef Name, StorageType Storage,
- bool ShouldCreate = true) {
- return getImpl(Context, Value, getCanonicalMDString(Context, Name), Storage,
- ShouldCreate);
+ bool IsUnsigned, StringRef Name,
+ StorageType Storage, bool ShouldCreate = true) {
+ return getImpl(Context, Value, IsUnsigned,
+ getCanonicalMDString(Context, Name), Storage, ShouldCreate);
}
static DIEnumerator *getImpl(LLVMContext &Context, int64_t Value,
- MDString *Name, StorageType Storage,
- bool ShouldCreate = true);
+ bool IsUnsigned, MDString *Name,
+ StorageType Storage, bool ShouldCreate = true);
TempDIEnumerator cloneImpl() const {
- return getTemporary(getContext(), getValue(), getName());
+ return getTemporary(getContext(), getValue(), isUnsigned(), getName());
}
public:
- DEFINE_MDNODE_GET(DIEnumerator, (int64_t Value, StringRef Name),
- (Value, Name))
- DEFINE_MDNODE_GET(DIEnumerator, (int64_t Value, MDString *Name),
- (Value, Name))
+ DEFINE_MDNODE_GET(DIEnumerator, (int64_t Value, bool IsUnsigned, StringRef Name),
+ (Value, IsUnsigned, Name))
+ DEFINE_MDNODE_GET(DIEnumerator, (int64_t Value, bool IsUnsigned, MDString *Name),
+ (Value, IsUnsigned, Name))
TempDIEnumerator clone() const { return cloneImpl(); }
int64_t getValue() const { return Value; }
+ bool isUnsigned() const { return SubclassData32; }
StringRef getName() const { return getStringOperand(0); }
MDString *getRawName() const { return getOperandAs<MDString>(0); }
@@ -429,6 +456,7 @@ public:
inline StringRef getFilename() const;
inline StringRef getDirectory() const;
+ inline Optional<StringRef> getSource() const;
StringRef getName() const;
DIScopeRef getScope() const;
@@ -473,63 +501,103 @@ class DIFile : public DIScope {
friend class MDNode;
public:
- // These values must be explictly set, as they end up in the final object
- // file.
+ /// Which algorithm (e.g. MD5) a checksum was generated with.
+ ///
+ /// The encoding is explicit because it is used directly in Bitcode. The
+ /// value 0 is reserved to indicate the absence of a checksum in Bitcode.
enum ChecksumKind {
- CSK_None = 0,
+ // The first variant was originally CSK_None, encoded as 0. The new
+ // internal representation removes the need for this by wrapping the
+ // ChecksumInfo in an Optional, but to preserve Bitcode compatibility the 0
+ // encoding is reserved.
CSK_MD5 = 1,
CSK_SHA1 = 2,
CSK_Last = CSK_SHA1 // Should be last enumeration.
};
+ /// A single checksum, represented by a \a Kind and a \a Value (a string).
+ template <typename T>
+ struct ChecksumInfo {
+ /// The kind of checksum which \a Value encodes.
+ ChecksumKind Kind;
+ /// The string value of the checksum.
+ T Value;
+
+ ChecksumInfo(ChecksumKind Kind, T Value) : Kind(Kind), Value(Value) { }
+ ~ChecksumInfo() = default;
+ bool operator==(const ChecksumInfo<T> &X) const {
+ return Kind == X.Kind && Value == X.Value;
+ }
+ bool operator!=(const ChecksumInfo<T> &X) const { return !(*this == X); }
+ StringRef getKindAsString() const { return getChecksumKindAsString(Kind); }
+ };
+
private:
- ChecksumKind CSKind;
+ Optional<ChecksumInfo<MDString *>> Checksum;
+ Optional<MDString *> Source;
- DIFile(LLVMContext &C, StorageType Storage, ChecksumKind CSK,
+ DIFile(LLVMContext &C, StorageType Storage,
+ Optional<ChecksumInfo<MDString *>> CS, Optional<MDString *> Src,
ArrayRef<Metadata *> Ops)
: DIScope(C, DIFileKind, Storage, dwarf::DW_TAG_file_type, Ops),
- CSKind(CSK) {}
+ Checksum(CS), Source(Src) {}
~DIFile() = default;
static DIFile *getImpl(LLVMContext &Context, StringRef Filename,
- StringRef Directory, ChecksumKind CSK, StringRef CS,
+ StringRef Directory,
+ Optional<ChecksumInfo<StringRef>> CS,
+ Optional<StringRef> Source,
StorageType Storage, bool ShouldCreate = true) {
+ Optional<ChecksumInfo<MDString *>> MDChecksum;
+ if (CS)
+ MDChecksum.emplace(CS->Kind, getCanonicalMDString(Context, CS->Value));
return getImpl(Context, getCanonicalMDString(Context, Filename),
- getCanonicalMDString(Context, Directory), CSK,
- getCanonicalMDString(Context, CS), Storage, ShouldCreate);
+ getCanonicalMDString(Context, Directory), MDChecksum,
+ Source ? Optional<MDString *>(getCanonicalMDString(Context, *Source)) : None,
+ Storage, ShouldCreate);
}
static DIFile *getImpl(LLVMContext &Context, MDString *Filename,
- MDString *Directory, ChecksumKind CSK, MDString *CS,
- StorageType Storage, bool ShouldCreate = true);
+ MDString *Directory,
+ Optional<ChecksumInfo<MDString *>> CS,
+ Optional<MDString *> Source, StorageType Storage,
+ bool ShouldCreate = true);
TempDIFile cloneImpl() const {
return getTemporary(getContext(), getFilename(), getDirectory(),
- getChecksumKind(), getChecksum());
+ getChecksum(), getSource());
}
public:
DEFINE_MDNODE_GET(DIFile, (StringRef Filename, StringRef Directory,
- ChecksumKind CSK = CSK_None,
- StringRef CS = StringRef()),
- (Filename, Directory, CSK, CS))
+ Optional<ChecksumInfo<StringRef>> CS = None,
+ Optional<StringRef> Source = None),
+ (Filename, Directory, CS, Source))
DEFINE_MDNODE_GET(DIFile, (MDString * Filename, MDString *Directory,
- ChecksumKind CSK = CSK_None,
- MDString *CS = nullptr),
- (Filename, Directory, CSK, CS))
+ Optional<ChecksumInfo<MDString *>> CS = None,
+ Optional<MDString *> Source = None),
+ (Filename, Directory, CS, Source))
TempDIFile clone() const { return cloneImpl(); }
StringRef getFilename() const { return getStringOperand(0); }
StringRef getDirectory() const { return getStringOperand(1); }
- StringRef getChecksum() const { return getStringOperand(2); }
- ChecksumKind getChecksumKind() const { return CSKind; }
- StringRef getChecksumKindAsString() const;
+ Optional<ChecksumInfo<StringRef>> getChecksum() const {
+ Optional<ChecksumInfo<StringRef>> StringRefChecksum;
+ if (Checksum)
+ StringRefChecksum.emplace(Checksum->Kind, Checksum->Value->getString());
+ return StringRefChecksum;
+ }
+ Optional<StringRef> getSource() const {
+ return Source ? Optional<StringRef>((*Source)->getString()) : None;
+ }
MDString *getRawFilename() const { return getOperandAs<MDString>(0); }
MDString *getRawDirectory() const { return getOperandAs<MDString>(1); }
- MDString *getRawChecksum() const { return getOperandAs<MDString>(2); }
+ Optional<ChecksumInfo<MDString *>> getRawChecksum() const { return Checksum; }
+ Optional<MDString *> getRawSource() const { return Source; }
- static ChecksumKind getChecksumKind(StringRef CSKindStr);
+ static StringRef getChecksumKindAsString(ChecksumKind CSKind);
+ static Optional<ChecksumKind> getChecksumKind(StringRef CSKindStr);
static bool classof(const Metadata *MD) {
return MD->getMetadataID() == DIFileKind;
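Checksums are now optional and a DIFile can carry embedded source text. A sketch using the factory directly (Ctx is an assumed LLVMContext; the checksum string and source are illustrative):

    DIFile::ChecksumInfo<StringRef> CS(DIFile::CSK_MD5,
                                       "d41d8cd98f00b204e9800998ecf8427e");
    DIFile *F = DIFile::get(Ctx, "a.c", "/src", CS, StringRef("int x;\n"));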
@@ -548,6 +616,12 @@ StringRef DIScope::getDirectory() const {
return "";
}
+Optional<StringRef> DIScope::getSource() const {
+ if (auto *F = getFile())
+ return F->getSource();
+ return None;
+}
+
/// Base class for types.
///
/// TODO: Remove the hardcoded name and context, since many types don't use
@@ -605,9 +679,11 @@ public:
Metadata *getRawScope() const { return getOperand(1); }
MDString *getRawName() const { return getOperandAs<MDString>(2); }
- void setFlags(DIFlags NewFlags) {
- assert(!isUniqued() && "Cannot set flags on uniqued nodes");
- Flags = NewFlags;
+ /// Returns a new temporary DIType with updated Flags
+ TempDIType cloneWithFlags(DIFlags NewFlags) const {
+ auto NewTy = clone();
+ NewTy->Flags = NewFlags;
+ return NewTy;
}
bool isPrivate() const {
@@ -633,6 +709,10 @@ public:
bool isStaticMember() const { return getFlags() & FlagStaticMember; }
bool isLValueReference() const { return getFlags() & FlagLValueReference; }
bool isRValueReference() const { return getFlags() & FlagRValueReference; }
+ bool isTypePassByValue() const { return getFlags() & FlagTypePassByValue; }
+ bool isTypePassByReference() const {
+ return getFlags() & FlagTypePassByReference;
+ }
static bool classof(const Metadata *MD) {
switch (MD->getMetadataID()) {
@@ -698,6 +778,12 @@ public:
unsigned getEncoding() const { return Encoding; }
+ enum class Signedness { Signed, Unsigned };
+
+ /// Return the signedness of this type, or None if this type is neither
+ /// signed nor unsigned.
+ Optional<Signedness> getSignedness() const;
+
static bool classof(const Metadata *MD) {
return MD->getMetadataID() == DIBasicTypeKind;
}
@@ -713,7 +799,7 @@ class DIDerivedType : public DIType {
friend class LLVMContextImpl;
friend class MDNode;
- /// \brief The DWARF address space of the memory pointed to or referenced by a
+ /// The DWARF address space of the memory pointed to or referenced by a
/// pointer or reference type respectively.
Optional<unsigned> DWARFAddressSpace;
@@ -788,7 +874,8 @@ public:
/// Get extra data associated with this derived type.
///
/// Class type for pointer-to-members, objective-c property node for ivars,
- /// or global constant wrapper for static members.
+ /// global constant wrapper for static members, or virtual base pointer offset
+ /// for inheritance.
///
/// TODO: Separate out types that need this extra operand: pointer-to-member
/// types and member fields (static members and ivars).
@@ -806,6 +893,14 @@ public:
return dyn_cast_or_null<DIObjCProperty>(getExtraData());
}
+ uint32_t getVBPtrOffset() const {
+ assert(getTag() == dwarf::DW_TAG_inheritance);
+ if (auto *CM = cast_or_null<ConstantAsMetadata>(getExtraData()))
+ if (auto *CI = dyn_cast_or_null<ConstantInt>(CM->getValue()))
+ return static_cast<uint32_t>(CI->getZExtValue());
+ return 0;
+ }
+
Constant *getStorageOffsetInBits() const {
assert(getTag() == dwarf::DW_TAG_member && isBitField());
if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData()))
@@ -819,6 +914,12 @@ public:
return C->getValue();
return nullptr;
}
+ Constant *getDiscriminantValue() const {
+ assert(getTag() == dwarf::DW_TAG_member && !isStaticMember());
+ if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData()))
+ return C->getValue();
+ return nullptr;
+ }
/// @}
static bool classof(const Metadata *MD) {
@@ -861,12 +962,13 @@ class DICompositeType : public DIType {
uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
DIFlags Flags, DINodeArray Elements, unsigned RuntimeLang,
DITypeRef VTableHolder, DITemplateParameterArray TemplateParams,
- StringRef Identifier, StorageType Storage, bool ShouldCreate = true) {
+ StringRef Identifier, DIDerivedType *Discriminator,
+ StorageType Storage, bool ShouldCreate = true) {
return getImpl(
Context, Tag, getCanonicalMDString(Context, Name), File, Line, Scope,
BaseType, SizeInBits, AlignInBits, OffsetInBits, Flags, Elements.get(),
RuntimeLang, VTableHolder, TemplateParams.get(),
- getCanonicalMDString(Context, Identifier), Storage, ShouldCreate);
+ getCanonicalMDString(Context, Identifier), Discriminator, Storage, ShouldCreate);
}
static DICompositeType *
getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
@@ -874,14 +976,15 @@ class DICompositeType : public DIType {
uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
DIFlags Flags, Metadata *Elements, unsigned RuntimeLang,
Metadata *VTableHolder, Metadata *TemplateParams,
- MDString *Identifier, StorageType Storage, bool ShouldCreate = true);
+ MDString *Identifier, Metadata *Discriminator,
+ StorageType Storage, bool ShouldCreate = true);
TempDICompositeType cloneImpl() const {
return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
getScope(), getBaseType(), getSizeInBits(),
getAlignInBits(), getOffsetInBits(), getFlags(),
getElements(), getRuntimeLang(), getVTableHolder(),
- getTemplateParams(), getIdentifier());
+ getTemplateParams(), getIdentifier(), getDiscriminator());
}
public:
@@ -892,10 +995,10 @@ public:
DIFlags Flags, DINodeArray Elements, unsigned RuntimeLang,
DITypeRef VTableHolder,
DITemplateParameterArray TemplateParams = nullptr,
- StringRef Identifier = ""),
+ StringRef Identifier = "", DIDerivedType *Discriminator = nullptr),
(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, Identifier))
+ VTableHolder, TemplateParams, Identifier, Discriminator))
DEFINE_MDNODE_GET(DICompositeType,
(unsigned Tag, MDString *Name, Metadata *File,
unsigned Line, Metadata *Scope, Metadata *BaseType,
@@ -903,10 +1006,11 @@ public:
uint64_t OffsetInBits, DIFlags Flags, Metadata *Elements,
unsigned RuntimeLang, Metadata *VTableHolder,
Metadata *TemplateParams = nullptr,
- MDString *Identifier = nullptr),
+ MDString *Identifier = nullptr,
+ Metadata *Discriminator = nullptr),
(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, Identifier))
+ VTableHolder, TemplateParams, Identifier, Discriminator))
TempDICompositeType clone() const { return cloneImpl(); }
@@ -923,7 +1027,7 @@ public:
Metadata *BaseType, uint64_t SizeInBits, uint32_t AlignInBits,
uint64_t OffsetInBits, DIFlags Flags, Metadata *Elements,
unsigned RuntimeLang, Metadata *VTableHolder,
- Metadata *TemplateParams);
+ Metadata *TemplateParams, Metadata *Discriminator);
static DICompositeType *getODRTypeIfExists(LLVMContext &Context,
MDString &Identifier);
@@ -942,7 +1046,7 @@ public:
Metadata *BaseType, uint64_t SizeInBits, uint32_t AlignInBits,
uint64_t OffsetInBits, DIFlags Flags, Metadata *Elements,
unsigned RuntimeLang, Metadata *VTableHolder,
- Metadata *TemplateParams);
+ Metadata *TemplateParams, Metadata *Discriminator);
DITypeRef getBaseType() const { return DITypeRef(getRawBaseType()); }
DINodeArray getElements() const {
@@ -960,6 +1064,8 @@ public:
Metadata *getRawVTableHolder() const { return getOperand(5); }
Metadata *getRawTemplateParams() const { return getOperand(6); }
MDString *getRawIdentifier() const { return getOperandAs<MDString>(7); }
+ Metadata *getRawDiscriminator() const { return getOperand(8); }
+ DIDerivedType *getDiscriminator() const { return getOperandAs<DIDerivedType>(8); }
/// Replace operands.
///
@@ -1060,7 +1166,7 @@ public:
};
static Optional<DebugEmissionKind> getEmissionKind(StringRef Str);
- static const char *EmissionKindString(DebugEmissionKind EK);
+ static const char *emissionKindString(DebugEmissionKind EK);
private:
unsigned SourceLanguage;
@@ -1337,6 +1443,7 @@ public:
DIFile *getFile() const { return getScope()->getFile(); }
StringRef getFilename() const { return getScope()->getFilename(); }
StringRef getDirectory() const { return getScope()->getDirectory(); }
+ Optional<StringRef> getSource() const { return getScope()->getSource(); }
/// Get the scope where this is inlined.
///
@@ -1380,7 +1487,7 @@ public:
///
/// The above 3 components are encoded into a 32bit unsigned integer in
/// order. If the lowest bit is 1, the current component is empty, and the
- /// next component will start in the next bit. Otherwise, the the current
+ /// next component will start in the next bit. Otherwise, the current
/// component is non-empty, and its content starts in the next bit. The
/// length of each component is either 5 bits or 12 bits: if the 7th bit
/// is 0, bits 2~6 (5 bits) are used to represent the component; if the
@@ -1408,26 +1515,25 @@ public:
/// discriminator.
inline const DILocation *cloneWithDuplicationFactor(unsigned DF) const;
+ enum { NoGeneratedLocation = false, WithGeneratedLocation = true };
+
/// When two instructions are combined into a single instruction we also
/// need to combine the original locations into a single location.
///
/// When the locations are the same we can use either location. When they
- /// differ, we need a third location which is distinct from either. If
- /// they have the same file/line but have a different discriminator we
- /// could create a location with a new discriminator. If they are from
- /// different files/lines the location is ambiguous and can't be
- /// represented in a single line entry. In this case, no location
- /// should be set, unless the merged instruction is a call, which we will
- /// set the merged debug location as line 0 of the nearest common scope
- /// where 2 locations are inlined from. This only applies to Instruction;
- /// for MachineInstruction, as it is post-inline, we will treat the call
- /// instruction the same way as other instructions.
+ /// differ, we need a third location which is distinct from either. If they
+ /// have the same file/line but have a different discriminator we could
+ /// create a location with a new discriminator. If they are from different
+ /// files/lines the location is ambiguous and can't be represented in a line
+ /// entry. In this case, if \p GenerateLocation is true, we will set the
+ /// merged debug location as line 0 of the nearest common scope where the two
+ /// locations are inlined from.
///
- /// \p ForInst: The Instruction the merged DILocation is for. If the
- /// Instruction is unavailable or non-existent, use nullptr.
+ /// \p GenerateLocation: Whether the merged location can be generated when
+ /// \p LocA and \p LocB differ.
static const DILocation *
getMergedLocation(const DILocation *LocA, const DILocation *LocB,
- const Instruction *ForInst = nullptr);
+ bool GenerateLocation = NoGeneratedLocation);
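A short usage sketch of the new flag (the surrounding helper is hypothetical): a caller that merges the locations of two instructions being combined into a call opts in to the generated line-0 location.

const DILocation *mergeLocationsForCall(const DILocation *LocA,
                                        const DILocation *LocB) {
  // With WithGeneratedLocation, differing file/line pairs produce a line-0
  // location in the nearest common scope instead of dropping the location.
  return DILocation::getMergedLocation(LocA, LocB,
                                       DILocation::WithGeneratedLocation);
}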
/// Returns the base discriminator for a given encoded discriminator \p D.
static unsigned getBaseDiscriminatorFromDiscriminator(unsigned D) {
@@ -1521,13 +1627,13 @@ class DISubprogram : public DILocalScope {
unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags,
bool IsOptimized, DICompileUnit *Unit,
DITemplateParameterArray TemplateParams, DISubprogram *Declaration,
- DILocalVariableArray Variables, DITypeArray ThrownTypes,
+ DINodeArray RetainedNodes, DITypeArray ThrownTypes,
StorageType Storage, bool ShouldCreate = true) {
return getImpl(Context, Scope, getCanonicalMDString(Context, Name),
getCanonicalMDString(Context, LinkageName), File, Line, Type,
IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
- Unit, TemplateParams.get(), Declaration, Variables.get(),
+ Unit, TemplateParams.get(), Declaration, RetainedNodes.get(),
ThrownTypes.get(), Storage, ShouldCreate);
}
static DISubprogram *
@@ -1536,7 +1642,7 @@ class DISubprogram : public DILocalScope {
bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
int ThisAdjustment, DIFlags Flags, bool IsOptimized, Metadata *Unit,
- Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables,
+ Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes,
Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate = true);
TempDISubprogram cloneImpl() const {
@@ -1545,7 +1651,7 @@ class DISubprogram : public DILocalScope {
isDefinition(), getScopeLine(), getContainingType(),
getVirtuality(), getVirtualIndex(), getThisAdjustment(),
getFlags(), isOptimized(), getUnit(),
- getTemplateParams(), getDeclaration(), getVariables(),
+ getTemplateParams(), getDeclaration(), getRetainedNodes(),
getThrownTypes());
}
@@ -1559,12 +1665,12 @@ public:
bool IsOptimized, DICompileUnit *Unit,
DITemplateParameterArray TemplateParams = nullptr,
DISubprogram *Declaration = nullptr,
- DILocalVariableArray Variables = nullptr,
+ DINodeArray RetainedNodes = nullptr,
DITypeArray ThrownTypes = nullptr),
(Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
IsDefinition, ScopeLine, ContainingType, Virtuality,
VirtualIndex, ThisAdjustment, Flags, IsOptimized, Unit,
- TemplateParams, Declaration, Variables, ThrownTypes))
+ TemplateParams, Declaration, RetainedNodes, ThrownTypes))
DEFINE_MDNODE_GET(
DISubprogram,
(Metadata * Scope, MDString *Name, MDString *LinkageName, Metadata *File,
@@ -1572,15 +1678,22 @@ public:
unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality,
unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags,
bool IsOptimized, Metadata *Unit, Metadata *TemplateParams = nullptr,
- Metadata *Declaration = nullptr, Metadata *Variables = nullptr,
+ Metadata *Declaration = nullptr, Metadata *RetainedNodes = nullptr,
Metadata *ThrownTypes = nullptr),
(Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
ScopeLine, ContainingType, Virtuality, VirtualIndex, ThisAdjustment,
- Flags, IsOptimized, Unit, TemplateParams, Declaration, Variables,
+ Flags, IsOptimized, Unit, TemplateParams, Declaration, RetainedNodes,
ThrownTypes))
TempDISubprogram clone() const { return cloneImpl(); }
+ /// Returns a new temporary DISubprogram with updated Flags
+ TempDISubprogram cloneWithFlags(DIFlags NewFlags) const {
+ auto NewSP = clone();
+ NewSP->Flags = NewFlags;
+ return NewSP;
+ }
+
public:
unsigned getLine() const { return Line; }
unsigned getVirtuality() const { return Virtuality; }
@@ -1623,6 +1736,11 @@ public:
/// Return true if this subprogram is C++11 noreturn or C11 _Noreturn
bool isNoReturn() const { return getFlags() & FlagNoReturn; }
+ // Check if this routine is a compiler-generated thunk.
+ //
+ // Returns true if this subprogram is a thunk generated by the compiler.
+ bool isThunk() const { return getFlags() & FlagThunk; }
+
DIScopeRef getScope() const { return DIScopeRef(getRawScope()); }
StringRef getName() const { return getStringOperand(2); }
@@ -1645,8 +1763,8 @@ public:
DISubprogram *getDeclaration() const {
return cast_or_null<DISubprogram>(getRawDeclaration());
}
- DILocalVariableArray getVariables() const {
- return cast_or_null<MDTuple>(getRawVariables());
+ DINodeArray getRetainedNodes() const {
+ return cast_or_null<MDTuple>(getRawRetainedNodes());
}
DITypeArray getThrownTypes() const {
return cast_or_null<MDTuple>(getRawThrownTypes());
@@ -1658,7 +1776,7 @@ public:
Metadata *getRawType() const { return getOperand(4); }
Metadata *getRawUnit() const { return getOperand(5); }
Metadata *getRawDeclaration() const { return getOperand(6); }
- Metadata *getRawVariables() const { return getOperand(7); }
+ Metadata *getRawRetainedNodes() const { return getOperand(7); }
Metadata *getRawContainingType() const {
return getNumOperands() > 8 ? getOperandAs<Metadata>(8) : nullptr;
}
@@ -2094,6 +2212,14 @@ public:
/// Determines the size of the variable's type.
Optional<uint64_t> getSizeInBits() const;
+ /// Return the signedness of this variable's type, or None if this type is
+ /// neither signed nor unsigned.
+ Optional<DIBasicType::Signedness> getSignedness() const {
+ if (auto *BT = dyn_cast<DIBasicType>(getType().resolve()))
+ return BT->getSignedness();
+ return None;
+ }
+
StringRef getFilename() const {
if (auto *F = getFile())
return F->getFilename();
@@ -2106,6 +2232,12 @@ public:
return "";
}
+ Optional<StringRef> getSource() const {
+ if (auto *F = getFile())
+ return F->getSource();
+ return None;
+ }
+
Metadata *getRawScope() const { return getOperand(0); }
MDString *getRawName() const { return getOperandAs<MDString>(1); }
Metadata *getRawFile() const { return getOperand(2); }
@@ -2194,6 +2326,11 @@ public:
///
/// Return the number of elements in the operand (1 + args).
unsigned getSize() const;
+
+ /// Append the elements of this operand to \p V.
+ void appendToVector(SmallVectorImpl<uint64_t> &V) const {
+ V.append(get(), get() + getSize());
+ }
};
/// An iterator for expression operands.
@@ -2297,10 +2434,29 @@ public:
/// Prepend \p DIExpr with a deref and offset operation and optionally turn it
/// into a stack value.
- static DIExpression *prepend(const DIExpression *DIExpr, bool DerefBefore,
+ static DIExpression *prepend(const DIExpression *Expr, bool DerefBefore,
int64_t Offset = 0, bool DerefAfter = false,
bool StackValue = false);
+ /// Prepend \p Expr with the given opcodes and optionally turn it into a
+ /// stack value.
+ static DIExpression *prependOpcodes(const DIExpression *Expr,
+ SmallVectorImpl<uint64_t> &Ops,
+ bool StackValue = false);
+
+ /// Append the opcodes \p Ops to \p Expr. Unlike \ref appendToStack, the
+ /// returned expression is a stack value only if \p Expr is a stack value.
+ /// If \p Expr describes a fragment, the returned expression will describe
+ /// the same fragment.
+ static DIExpression *append(const DIExpression *Expr, ArrayRef<uint64_t> Ops);
+
+ /// Convert \p Expr into a stack value if it isn't one already by appending
+ /// DW_OP_deref if needed, and appending \p Ops to the resulting expression.
+ /// If \p Expr describes a fragment, the returned expression will describe
+ /// the same fragment.
+ static DIExpression *appendToStack(const DIExpression *Expr,
+ ArrayRef<uint64_t> Ops);
+
/// Create a DIExpression to describe one part of an aggregate variable that
/// is fragmented across multiple Values. The DW_OP_LLVM_fragment operation
/// will be appended to the elements of \c Expr. If \c Expr already contains
@@ -2314,6 +2470,32 @@ public:
static Optional<DIExpression *>
createFragmentExpression(const DIExpression *Expr, unsigned OffsetInBits,
unsigned SizeInBits);
+
+ /// Determine the relative position of the fragments described by this
+ /// DIExpression and \p Other.
+ /// Returns -1 if this is entirely before Other, 0 if this and Other overlap,
+ /// 1 if this is entirely after Other.
+ int fragmentCmp(const DIExpression *Other) const {
+ auto Fragment1 = *getFragmentInfo();
+ auto Fragment2 = *Other->getFragmentInfo();
+ unsigned l1 = Fragment1.OffsetInBits;
+ unsigned l2 = Fragment2.OffsetInBits;
+ unsigned r1 = l1 + Fragment1.SizeInBits;
+ unsigned r2 = l2 + Fragment2.SizeInBits;
+ if (r1 <= l2)
+ return -1;
+ else if (r2 <= l1)
+ return 1;
+ else
+ return 0;
+ }
+
+ /// Check if fragments overlap between this DIExpression and \p Other.
+ bool fragmentsOverlap(const DIExpression *Other) const {
+ if (!isFragment() || !Other->isFragment())
+ return true;
+ return fragmentCmp(Other) == 0;
+ }
};
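A hedged sketch of the new DIExpression helpers; Expr and Other are assumed to be existing expressions and the opcodes are illustrative.

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

void tweakExpression(const DIExpression *Expr, const DIExpression *Other) {
  // Prepend a deref and a +8 offset, and force the result to be a stack value.
  DIExpression *Shifted = DIExpression::prepend(
      Expr, /*DerefBefore=*/true, /*Offset=*/8,
      /*DerefAfter=*/false, /*StackValue=*/true);

  // Append raw DWARF opcodes; the result is a stack value only if Expr was.
  uint64_t Ops[] = {dwarf::DW_OP_plus_uconst, 4};
  DIExpression *Extended = DIExpression::append(Expr, Ops);

  // Fragment overlap can be queried directly; non-fragments conservatively
  // report an overlap.
  bool Overlap = Expr->fragmentsOverlap(Other);
  (void)Shifted; (void)Extended; (void)Overlap;
}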
/// Global variables.
@@ -2476,6 +2658,76 @@ public:
}
};
+/// Label.
+///
+class DILabel : public DINode {
+ friend class LLVMContextImpl;
+ friend class MDNode;
+
+ unsigned Line;
+
+ DILabel(LLVMContext &C, StorageType Storage, unsigned Line,
+ ArrayRef<Metadata *> Ops)
+ : DINode(C, DILabelKind, Storage, dwarf::DW_TAG_label, Ops), Line(Line) {}
+ ~DILabel() = default;
+
+ static DILabel *getImpl(LLVMContext &Context, DIScope *Scope,
+ StringRef Name, DIFile *File, unsigned Line,
+ StorageType Storage,
+ bool ShouldCreate = true) {
+ return getImpl(Context, Scope, getCanonicalMDString(Context, Name), File,
+ Line, Storage, ShouldCreate);
+ }
+ static DILabel *getImpl(LLVMContext &Context, Metadata *Scope,
+ MDString *Name, Metadata *File, unsigned Line,
+ StorageType Storage,
+ bool ShouldCreate = true);
+
+ TempDILabel cloneImpl() const {
+ return getTemporary(getContext(), getScope(), getName(), getFile(),
+ getLine());
+ }
+
+public:
+ DEFINE_MDNODE_GET(DILabel,
+ (DILocalScope * Scope, StringRef Name, DIFile *File,
+ unsigned Line),
+ (Scope, Name, File, Line))
+ DEFINE_MDNODE_GET(DILabel,
+ (Metadata * Scope, MDString *Name, Metadata *File,
+ unsigned Line),
+ (Scope, Name, File, Line))
+
+ TempDILabel clone() const { return cloneImpl(); }
+
+ /// Get the local scope for this label.
+ ///
+ /// Labels must be defined in a local scope.
+ DILocalScope *getScope() const {
+ return cast_or_null<DILocalScope>(getRawScope());
+ }
+ unsigned getLine() const { return Line; }
+ StringRef getName() const { return getStringOperand(1); }
+ DIFile *getFile() const { return cast_or_null<DIFile>(getRawFile()); }
+
+ Metadata *getRawScope() const { return getOperand(0); }
+ MDString *getRawName() const { return getOperandAs<MDString>(1); }
+ Metadata *getRawFile() const { return getOperand(2); }
+
+ /// Check that a location is valid for this label.
+ ///
+ /// Check that \c DL exists, is in the same subprogram, and has the same
+ /// inlined-at location as \c this. (Otherwise, it's not a valid attachment
+ /// to a \a DbgInfoIntrinsic.)
+ bool isValidLocationForIntrinsic(const DILocation *DL) const {
+ return DL && getScope()->getSubprogram() == DL->getScope()->getSubprogram();
+ }
+
+ static bool classof(const Metadata *MD) {
+ return MD->getMetadataID() == DILabelKind;
+ }
+};
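A minimal sketch of creating the new DILabel node (the scope, name, file, and line are illustrative); like local variables, labels must be created in a local scope.

DILabel *makeLabel(LLVMContext &Ctx, DILocalScope *Scope, DIFile *File) {
  // DILabel::get is generated by the DEFINE_MDNODE_GET declarations above.
  return DILabel::get(Ctx, Scope, "retry", File, /*Line=*/42);
}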
+
class DIObjCProperty : public DINode {
friend class LLVMContextImpl;
friend class MDNode;
@@ -2547,6 +2799,12 @@ public:
return "";
}
+ Optional<StringRef> getSource() const {
+ if (auto *F = getFile())
+ return F->getSource();
+ return None;
+ }
+
MDString *getRawName() const { return getOperandAs<MDString>(0); }
Metadata *getRawFile() const { return getOperand(1); }
MDString *getRawGetterName() const { return getOperandAs<MDString>(2); }
diff --git a/contrib/llvm/include/llvm/IR/DebugLoc.h b/contrib/llvm/include/llvm/IR/DebugLoc.h
index eef1212abc4b..9f619ffc5c4d 100644
--- a/contrib/llvm/include/llvm/IR/DebugLoc.h
+++ b/contrib/llvm/include/llvm/IR/DebugLoc.h
@@ -24,7 +24,7 @@ namespace llvm {
class raw_ostream;
class DILocation;
- /// \brief A debug info location.
+ /// A debug info location.
///
/// This class is a wrapper around a tracking reference to an \a DILocation
/// pointer.
@@ -37,10 +37,10 @@ namespace llvm {
public:
DebugLoc() = default;
- /// \brief Construct from an \a DILocation.
+ /// Construct from an \a DILocation.
DebugLoc(const DILocation *L);
- /// \brief Construct from an \a MDNode.
+ /// Construct from an \a MDNode.
///
/// Note: if \c N is not an \a DILocation, a verifier check will fail, and
/// accessors will crash. However, construction from other nodes is
@@ -48,7 +48,7 @@ namespace llvm {
/// IR.
explicit DebugLoc(const MDNode *N);
- /// \brief Get the underlying \a DILocation.
+ /// Get the underlying \a DILocation.
///
/// \pre !*this or \c isa<DILocation>(getAsMDNode()).
/// @{
@@ -58,7 +58,7 @@ namespace llvm {
DILocation &operator*() const { return *get(); }
/// @}
- /// \brief Check for null.
+ /// Check for null.
///
/// Check for null in a way that is safe with broken debug info. Unlike
/// the conversion to \c DILocation, this doesn't require that \c Loc is of
@@ -66,10 +66,10 @@ namespace llvm {
/// \a Instruction::hasMetadata().
explicit operator bool() const { return Loc; }
- /// \brief Check whether this has a trivial destructor.
+ /// Check whether this has a trivial destructor.
bool hasTrivialDestructor() const { return Loc.hasTrivialDestructor(); }
- /// \brief Create a new DebugLoc.
+ /// Create a new DebugLoc.
///
/// Create a new DebugLoc at the specified line/col and scope/inline. This
/// forwards to \a DILocation::get().
@@ -95,12 +95,12 @@ namespace llvm {
MDNode *getScope() const;
DILocation *getInlinedAt() const;
- /// \brief Get the fully inlined-at scope for a DebugLoc.
+ /// Get the fully inlined-at scope for a DebugLoc.
///
/// Gets the inlined-at scope for a DebugLoc.
MDNode *getInlinedAtScope() const;
- /// \brief Find the debug info location for the start of the function.
+ /// Find the debug info location for the start of the function.
///
/// Walk up the scope chain of given debug loc and find line number info
/// for the function.
@@ -109,7 +109,7 @@ namespace llvm {
/// find the subprogram, and then DILocation::get().
DebugLoc getFnDebugLoc() const;
- /// \brief Return \c this as a bar \a MDNode.
+ /// Return \c this as a bar \a MDNode.
MDNode *getAsMDNode() const { return Loc; }
bool operator==(const DebugLoc &DL) const { return Loc == DL.Loc; }
@@ -117,7 +117,7 @@ namespace llvm {
void dump() const;
- /// \brief prints source location /path/to/file.exe:line:col @[inlined at]
+ /// prints source location /path/to/file.exe:line:col @[inlined at]
void print(raw_ostream &OS) const;
};
diff --git a/contrib/llvm/include/llvm/IR/DerivedTypes.h b/contrib/llvm/include/llvm/IR/DerivedTypes.h
index 6e5e085873ab..9526d6287d2f 100644
--- a/contrib/llvm/include/llvm/IR/DerivedTypes.h
+++ b/contrib/llvm/include/llvm/IR/DerivedTypes.h
@@ -36,7 +36,7 @@ class LLVMContext;
/// Class to represent integer types. Note that this class is also used to
/// represent the built-in integer types: Int1Ty, Int8Ty, Int16Ty, Int32Ty and
/// Int64Ty.
-/// @brief Integer representation type
+/// Integer representation type
class IntegerType : public Type {
friend class LLVMContextImpl;
@@ -59,10 +59,10 @@ public:
/// If an IntegerType with the same NumBits value was previously instantiated,
/// that instance will be returned. Otherwise a new one will be created. Only
/// one instance with a given NumBits value is ever created.
- /// @brief Get or create an IntegerType instance.
+ /// Get or create an IntegerType instance.
static IntegerType *get(LLVMContext &C, unsigned NumBits);
- /// @brief Get the number of bits in this IntegerType
+ /// Get the number of bits in this IntegerType
unsigned getBitWidth() const { return getSubclassData(); }
/// Return a bitmask with ones set for all of the bits that can be set by an
@@ -79,13 +79,13 @@ public:
/// For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
/// @returns a bit mask with ones set for all the bits of this type.
- /// @brief Get a bit mask for this type.
+ /// Get a bit mask for this type.
APInt getMask() const;
/// This method determines if the width of this IntegerType is a power-of-2
/// in terms of 8 bit bytes.
/// @returns true if this is a power-of-2 byte width.
- /// @brief Is this a power-of-2 byte-width IntegerType ?
+ /// Is this a power-of-2 byte-width IntegerType ?
bool isPowerOf2ByteWidth() const;
/// Methods for support type inquiry through isa, cast, and dyn_cast.
@@ -193,7 +193,7 @@ public:
/// StructType::create() forms.
///
/// Independent of what kind of struct you have, the body of a struct type are
-/// laid out in memory consequtively with the elements directly one after the
+/// laid out in memory consecutively with the elements directly one after the
/// other (if the struct is packed) or (if not packed) with padding between the
/// elements as defined by DataLayout (which is required to match what the code
/// generator for a target expects).
diff --git a/contrib/llvm/include/llvm/IR/DiagnosticHandler.h b/contrib/llvm/include/llvm/IR/DiagnosticHandler.h
index 9256d4850df1..51873bea3d41 100644
--- a/contrib/llvm/include/llvm/IR/DiagnosticHandler.h
+++ b/contrib/llvm/include/llvm/IR/DiagnosticHandler.h
@@ -18,7 +18,7 @@
namespace llvm {
class DiagnosticInfo;
-/// \brief This is the base class for diagnostic handling in LLVM.
+/// This is the base class for diagnostic handling in LLVM.
/// The handleDiagnostics method must be overridden by the subclasses to handle
/// diagnostics. The *RemarkEnabled methods can be overridden to control
/// which remarks are enabled.
diff --git a/contrib/llvm/include/llvm/IR/DiagnosticInfo.h b/contrib/llvm/include/llvm/IR/DiagnosticInfo.h
index 020b67d6b711..81d4ae84bf01 100644
--- a/contrib/llvm/include/llvm/IR/DiagnosticInfo.h
+++ b/contrib/llvm/include/llvm/IR/DiagnosticInfo.h
@@ -39,7 +39,7 @@ class LLVMContext;
class Module;
class SMDiagnostic;
-/// \brief Defines the different supported severity of a diagnostic.
+/// Defines the different supported severity of a diagnostic.
enum DiagnosticSeverity : char {
DS_Error,
DS_Warning,
@@ -49,7 +49,7 @@ enum DiagnosticSeverity : char {
DS_Note
};
-/// \brief Defines the different supported kind of a diagnostic.
+/// Defines the different supported kind of a diagnostic.
/// This enum should be extended with a new ID for each added concrete subclass.
enum DiagnosticKind {
DK_InlineAsm,
@@ -79,7 +79,7 @@ enum DiagnosticKind {
DK_FirstPluginKind
};
-/// \brief Get the next available kind ID for a plugin diagnostic.
+/// Get the next available kind ID for a plugin diagnostic.
/// Each time this function is called, it returns a different number.
/// Therefore, a plugin that wants to "identify" its own classes
/// with a dynamic identifier, just has to use this method to get a new ID
@@ -89,7 +89,7 @@ enum DiagnosticKind {
/// DiagnosticKind values.
int getNextAvailablePluginDiagnosticKind();
-/// \brief This is the base abstract class for diagnostic reporting in
+/// This is the base abstract class for diagnostic reporting in
/// the backend.
/// The print method must be overloaded by the subclasses to print a
/// user-friendly message in the client of the backend (let us call it a
@@ -389,20 +389,20 @@ private:
DiagnosticLocation Loc;
};
-/// \brief Common features for diagnostics dealing with optimization remarks
+/// Common features for diagnostics dealing with optimization remarks
/// that are used by both IR and MIR passes.
class DiagnosticInfoOptimizationBase : public DiagnosticInfoWithLocationBase {
public:
- /// \brief Used to set IsVerbose via the stream interface.
+ /// Used to set IsVerbose via the stream interface.
struct setIsVerbose {};
- /// \brief When an instance of this is inserted into the stream, the arguments
+ /// When an instance of this is inserted into the stream, the arguments
/// following will not appear in the remark printed in the compiler output
/// (-Rpass) but only in the optimization record file
/// (-fsave-optimization-record).
struct setExtraArgs {};
- /// \brief Used in the streaming interface as the general argument type. It
+ /// Used in the streaming interface as the general argument type. It
/// internally converts everything into a key-value pair.
struct Argument {
std::string Key;
@@ -415,6 +415,7 @@ public:
Argument(StringRef Key, const Type *T);
Argument(StringRef Key, StringRef S);
Argument(StringRef Key, int N);
+ Argument(StringRef Key, float N);
Argument(StringRef Key, long N);
Argument(StringRef Key, long long N);
Argument(StringRef Key, unsigned N);
@@ -503,7 +504,7 @@ protected:
/// The remark is expected to be noisy.
bool IsVerbose = false;
- /// \brief If positive, the index of the first argument that only appear in
+ /// If positive, the index of the first argument that only appears in
/// the optimization records and not in the remark printed in the compiler
/// output.
int FirstExtraArgIndex = -1;
@@ -586,7 +587,7 @@ operator<<(RemarkT &R,
return R;
}
-/// \brief Common features for diagnostics dealing with optimization remarks
+/// Common features for diagnostics dealing with optimization remarks
/// that are used by IR passes.
class DiagnosticInfoIROptimization : public DiagnosticInfoOptimizationBase {
public:
@@ -608,7 +609,7 @@ public:
Loc),
CodeRegion(CodeRegion) {}
- /// \brief This is ctor variant allows a pass to build an optimization remark
+ /// This ctor variant allows a pass to build an optimization remark
/// from an existing remark.
///
/// This is useful when a transformation pass (e.g LV) wants to emit a remark
@@ -711,7 +712,7 @@ public:
const DiagnosticLocation &Loc,
const Value *CodeRegion);
- /// \brief Same as above but \p Inst is used to derive code region and debug
+ /// Same as above but \p Inst is used to derive code region and debug
/// location.
OptimizationRemarkMissed(const char *PassName, StringRef RemarkName,
const Instruction *Inst);
@@ -752,7 +753,7 @@ public:
const DiagnosticLocation &Loc,
const Value *CodeRegion);
- /// \brief This is ctor variant allows a pass to build an optimization remark
+ /// This ctor variant allows a pass to build an optimization remark
/// from an existing remark.
///
/// This is useful when a transformation pass (e.g LV) wants to emit a remark
@@ -763,7 +764,7 @@ public:
const OptimizationRemarkAnalysis &Orig)
: DiagnosticInfoIROptimization(PassName, Prepend, Orig) {}
- /// \brief Same as above but \p Inst is used to derive code region and debug
+ /// Same as above but \p Inst is used to derive code region and debug
/// location.
OptimizationRemarkAnalysis(const char *PassName, StringRef RemarkName,
const Instruction *Inst);
diff --git a/contrib/llvm/include/llvm/IR/DiagnosticPrinter.h b/contrib/llvm/include/llvm/IR/DiagnosticPrinter.h
index 59c83291affa..25c47cdd1a12 100644
--- a/contrib/llvm/include/llvm/IR/DiagnosticPrinter.h
+++ b/contrib/llvm/include/llvm/IR/DiagnosticPrinter.h
@@ -28,7 +28,7 @@ class StringRef;
class Twine;
class Value;
-/// \brief Interface for custom diagnostic printing.
+/// Interface for custom diagnostic printing.
class DiagnosticPrinter {
public:
virtual ~DiagnosticPrinter() = default;
@@ -58,7 +58,7 @@ public:
virtual DiagnosticPrinter &operator<<(const SMDiagnostic &Diag) = 0;
};
-/// \brief Basic diagnostic printer that uses an underlying raw_ostream.
+/// Basic diagnostic printer that uses an underlying raw_ostream.
class DiagnosticPrinterRawOStream : public DiagnosticPrinter {
protected:
raw_ostream &Stream;
diff --git a/contrib/llvm/include/llvm/IR/DomTreeUpdater.h b/contrib/llvm/include/llvm/IR/DomTreeUpdater.h
new file mode 100644
index 000000000000..81ba670ac0f5
--- /dev/null
+++ b/contrib/llvm/include/llvm/IR/DomTreeUpdater.h
@@ -0,0 +1,259 @@
+//===- DomTreeUpdater.h - DomTree/Post DomTree Updater ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DomTreeUpdater class, which provides a uniform way to
+// update dominator tree related data structures.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DOMTREEUPDATER_H
+#define LLVM_DOMTREEUPDATER_H
+
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/GenericDomTree.h"
+#include <functional>
+#include <vector>
+
+namespace llvm {
+class DomTreeUpdater {
+public:
+ enum class UpdateStrategy : unsigned char { Eager = 0, Lazy = 1 };
+
+ explicit DomTreeUpdater(UpdateStrategy Strategy_) : Strategy(Strategy_) {}
+ DomTreeUpdater(DominatorTree &DT_, UpdateStrategy Strategy_)
+ : DT(&DT_), Strategy(Strategy_) {}
+ DomTreeUpdater(DominatorTree *DT_, UpdateStrategy Strategy_)
+ : DT(DT_), Strategy(Strategy_) {}
+ DomTreeUpdater(PostDominatorTree &PDT_, UpdateStrategy Strategy_)
+ : PDT(&PDT_), Strategy(Strategy_) {}
+ DomTreeUpdater(PostDominatorTree *PDT_, UpdateStrategy Strategy_)
+ : PDT(PDT_), Strategy(Strategy_) {}
+ DomTreeUpdater(DominatorTree &DT_, PostDominatorTree &PDT_,
+ UpdateStrategy Strategy_)
+ : DT(&DT_), PDT(&PDT_), Strategy(Strategy_) {}
+ DomTreeUpdater(DominatorTree *DT_, PostDominatorTree *PDT_,
+ UpdateStrategy Strategy_)
+ : DT(DT_), PDT(PDT_), Strategy(Strategy_) {}
+
+ ~DomTreeUpdater() { flush(); }
+
+ /// Returns true if the current strategy is Lazy.
+ bool isLazy() const { return Strategy == UpdateStrategy::Lazy; };
+
+ /// Returns true if the current strategy is Eager.
+ bool isEager() const { return Strategy == UpdateStrategy::Eager; };
+
+ /// Returns true if it holds a DominatorTree.
+ bool hasDomTree() const { return DT != nullptr; }
+
+ /// Returns true if it holds a PostDominatorTree.
+ bool hasPostDomTree() const { return PDT != nullptr; }
+
+ /// Returns true if there is a BasicBlock awaiting deletion.
+ /// The deletion will only happen at a flush event, once all available
+ /// trees are up-to-date.
+ /// Returns false under Eager UpdateStrategy.
+ bool hasPendingDeletedBB() const { return !DeletedBBs.empty(); }
+
+ /// Returns true if DelBB is awaiting deletion.
+ /// Returns false under Eager UpdateStrategy.
+ bool isBBPendingDeletion(BasicBlock *DelBB) const;
+
+ /// Returns true if either of DT or PDT is valid and the tree has at
+ /// least one update pending. If DT or PDT is nullptr it is treated
+ /// as having no pending updates. This function does not check
+ /// whether there is a BasicBlock awaiting deletion.
+ /// Returns false under Eager UpdateStrategy.
+ bool hasPendingUpdates() const;
+
+ /// Returns true if there are DominatorTree updates queued.
+ /// Returns false under Eager UpdateStrategy or DT is nullptr.
+ bool hasPendingDomTreeUpdates() const;
+
+ /// Returns true if there are PostDominatorTree updates queued.
+ /// Returns false under Eager UpdateStrategy or PDT is nullptr.
+ bool hasPendingPostDomTreeUpdates() const;
+
+ /// Apply updates on all available trees. Under Eager UpdateStrategy with
+ /// ForceRemoveDuplicates enabled or under Lazy UpdateStrategy, it will
+ /// discard duplicated updates and self-dominance updates. If both DT and PDT
+ /// are nullptrs, this function discards all updates. The Eager Strategy
+ /// applies the updates immediately while the Lazy Strategy queues the
+ /// updates. It is required for the state of the LLVM IR to be updated
+ /// *before* applying the Updates because the internal update routine will
+ /// analyze the current state of the relationship between a pair of (From, To)
+ /// BasicBlocks to determine whether a single update needs to be discarded.
+ void applyUpdates(ArrayRef<DominatorTree::UpdateType> Updates,
+ bool ForceRemoveDuplicates = false);
+
+ /// Notify all available trees on an edge insertion. If both DT and PDT are
+ /// nullptrs, this function discards the update. Under either Strategy,
+ /// self-dominance update will be removed. The Eager Strategy applies
+ /// the update immediately while the Lazy Strategy queues the update.
+ /// It is recommended to only use this method when you have exactly one
+ /// insertion (and no deletions). It is recommended to use applyUpdates() in
+ /// all other cases. This function has to be called *after* making the update
+ /// on the actual CFG. An internal function checks whether the edge exists in
+ /// the CFG in DEBUG mode.
+ void insertEdge(BasicBlock *From, BasicBlock *To);
+
+ /// Notify all available trees on an edge insertion.
+ /// Under either Strategy, the following updates will be discarded silently:
+ /// 1. Invalid - Inserting an edge that does not exist in the CFG.
+ /// 2. Self-dominance update.
+ /// 3. Both DT and PDT are nullptrs.
+ /// The Eager Strategy applies the update immediately while the Lazy Strategy
+ /// queues the update. It is recommended to only use this method when you have
+ /// exactly one insertion (and no deletions) and want to discard an invalid
+ /// update.
+ void insertEdgeRelaxed(BasicBlock *From, BasicBlock *To);
+
+ /// Notify all available trees on an edge deletion. If both DT and PDT are
+ /// nullptrs, this function discards the update. Under either Strategy,
+ /// self-dominance update will be removed. The Eager Strategy applies
+ /// the update immediately while the Lazy Strategy queues the update.
+ /// It is recommended to only use this method when you have exactly one
+ /// deletion (and no insertions). It is recommended to use applyUpdates() in
+ /// all other cases. This function has to be called *after* making the update
+ /// on the actual CFG. An internal function checks that the edge no longer
+ /// exists in the CFG in DEBUG mode.
+ void deleteEdge(BasicBlock *From, BasicBlock *To);
+
+ /// Notify all available trees on an edge deletion.
+ /// Under either Strategy, the following updates will be discarded silently:
+ /// 1. Invalid - Deleting an edge that still exists in the CFG.
+ /// 2. Self-dominance update.
+ /// 3. Both DT and PDT are nullptrs.
+ /// The Eager Strategy applies the update immediately while the Lazy Strategy
+ /// queues the update. It is recommended to only use this method when you have
+ /// exactly one deletion (and no insertions) and want to discard an invalid
+ /// update.
+ void deleteEdgeRelaxed(BasicBlock *From, BasicBlock *To);
+
+ /// Delete DelBB. DelBB will be removed from its Parent and
+ /// erased from available trees if it exists and finally get deleted.
+ /// Under Eager UpdateStrategy, DelBB will be processed immediately.
+ /// Under Lazy UpdateStrategy, DelBB will be queued until a flush event and
+ /// all available trees are up-to-date. Assert if any instruction of DelBB is
+ /// modified while awaiting deletion. When both DT and PDT are nullptrs, DelBB
+ /// will be queued until flush() is called.
+ void deleteBB(BasicBlock *DelBB);
+
+ /// Delete DelBB. DelBB will be removed from its Parent and
+ /// erased from available trees if it exists. Then the callback will
+ /// be called. Finally, DelBB will be deleted.
+ /// Under Eager UpdateStrategy, DelBB will be processed immediately.
+ /// Under Lazy UpdateStrategy, DelBB will be queued until a flush event and
+ /// all available trees are up-to-date. Assert if any instruction of DelBB is
+ /// modified while awaiting deletion. Multiple callbacks can be queued for one
+ /// DelBB under Lazy UpdateStrategy.
+ void callbackDeleteBB(BasicBlock *DelBB,
+ std::function<void(BasicBlock *)> Callback);
+
+ /// Recalculate all available trees.
+ /// Under Lazy Strategy, available trees will only be recalculated if there
+ /// are pending updates or there is a BasicBlock awaiting deletion. Returns true
+ /// if at least one tree is recalculated.
+ bool recalculate(Function &F);
+
+ /// Flush DomTree updates and return DomTree.
+ /// It also flushes out-of-date updates applied by all available trees
+ /// and flushes deleted BBs if both trees are up-to-date.
+ /// It must only be called when it has a DomTree.
+ DominatorTree &getDomTree();
+
+ /// Flush PostDomTree updates and return PostDomTree.
+ /// It also flushes out-of-date updates applied by all available trees
+ /// and flushes deleted BBs if both trees are up-to-date.
+ /// It must only be called when it has a PostDomTree.
+ PostDominatorTree &getPostDomTree();
+
+ /// Apply all pending updates to available trees and flush all BasicBlocks
+ /// awaiting deletion.
+ /// Does nothing under Eager UpdateStrategy.
+ void flush();
+
+ /// Debug method to help view the internal state of this class.
+ LLVM_DUMP_METHOD void dump() const;
+
+private:
+ class CallBackOnDeletion final : public CallbackVH {
+ public:
+ CallBackOnDeletion(BasicBlock *V,
+ std::function<void(BasicBlock *)> Callback)
+ : CallbackVH(V), DelBB(V), Callback_(Callback) {}
+
+ private:
+ BasicBlock *DelBB = nullptr;
+ std::function<void(BasicBlock *)> Callback_;
+
+ void deleted() override {
+ Callback_(DelBB);
+ CallbackVH::deleted();
+ }
+ };
+
+ SmallVector<DominatorTree::UpdateType, 16> PendUpdates;
+ size_t PendDTUpdateIndex = 0;
+ size_t PendPDTUpdateIndex = 0;
+ DominatorTree *DT = nullptr;
+ PostDominatorTree *PDT = nullptr;
+ const UpdateStrategy Strategy;
+ SmallPtrSet<BasicBlock *, 8> DeletedBBs;
+ std::vector<CallBackOnDeletion> Callbacks;
+ bool IsRecalculatingDomTree = false;
+ bool IsRecalculatingPostDomTree = false;
+
+ /// First removes all the instructions of DelBB, then makes sure DelBB has a
+ /// valid terminator instruction. This is necessary because DelBB must stay
+ /// inside its parent Function while awaiting deletion under the Lazy
+ /// UpdateStrategy, and other routines would otherwise assert that the state
+ /// of the IR is inconsistent. Asserts if DelBB is nullptr or has predecessors.
+ void validateDeleteBB(BasicBlock *DelBB);
+
+ /// Returns true if at least one BasicBlock is deleted.
+ bool forceFlushDeletedBB();
+
+ /// Deduplicate and remove unnecessary updates (no-ops) when using Lazy
+ /// UpdateStrategy. Returns true if the update is queued.
+ bool applyLazyUpdate(DominatorTree::UpdateKind Kind, BasicBlock *From,
+ BasicBlock *To);
+
+ /// Helper function to apply all pending DomTree updates.
+ void applyDomTreeUpdates();
+
+ /// Helper function to apply all pending PostDomTree updates.
+ void applyPostDomTreeUpdates();
+
+ /// Helper function to flush deleted BasicBlocks if all available
+ /// trees are up-to-date.
+ void tryFlushDeletedBB();
+
+ /// Drop all updates applied by all available trees and delete BasicBlocks if
+ /// all available trees are up-to-date.
+ void dropOutOfDateUpdates();
+
+ /// Erase the BasicBlock node that has been unlinked from its Function
+ /// from the DomTree and PostDomTree.
+ void eraseDelBBNode(BasicBlock *DelBB);
+
+ /// Returns true if the update appears in the LLVM IR.
+ /// It is used to check whether an update is valid in
+ /// insertEdge/deleteEdge or is unnecessary in the batch update.
+ bool isUpdateValid(DominatorTree::UpdateType Update) const;
+
+ /// Returns true if the update is a self-dominance update.
+ bool isSelfDominance(DominatorTree::UpdateType Update) const;
+};
+} // namespace llvm
+
+#endif // LLVM_DOMTREEUPDATER_H
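A hedged usage sketch of the new class under the Lazy strategy; the helper, BB, and Succ are hypothetical. Note the ordering requirement: the CFG is edited first, and the updater is told about the change afterwards.

#include "llvm/IR/DomTreeUpdater.h"
using namespace llvm;

void removeDeadBlock(DominatorTree &DT, PostDominatorTree &PDT,
                     BasicBlock *BB, BasicBlock *Succ) {
  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy);
  // ...the edge BB -> Succ has already been removed from the IR here...
  DTU.applyUpdates({{DominatorTree::Delete, BB, Succ}});
  DTU.deleteBB(BB);                        // queued until a flush event
  DominatorTree &Fresh = DTU.getDomTree(); // flushes pending updates
  (void)Fresh;
}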
diff --git a/contrib/llvm/include/llvm/IR/Dominators.h b/contrib/llvm/include/llvm/IR/Dominators.h
index 6ad99e516fba..f9e992b0ef0c 100644
--- a/contrib/llvm/include/llvm/IR/Dominators.h
+++ b/contrib/llvm/include/llvm/IR/Dominators.h
@@ -63,8 +63,10 @@ extern template void DeleteEdge<BBPostDomTree>(BBPostDomTree &DT,
extern template void ApplyUpdates<BBDomTree>(BBDomTree &DT, BBUpdates);
extern template void ApplyUpdates<BBPostDomTree>(BBPostDomTree &DT, BBUpdates);
-extern template bool Verify<BBDomTree>(const BBDomTree &DT);
-extern template bool Verify<BBPostDomTree>(const BBPostDomTree &DT);
+extern template bool Verify<BBDomTree>(const BBDomTree &DT,
+ BBDomTree::VerificationLevel VL);
+extern template bool Verify<BBPostDomTree>(const BBPostDomTree &DT,
+ BBPostDomTree::VerificationLevel VL);
} // namespace DomTreeBuilder
using DomTreeNode = DomTreeNodeBase<BasicBlock>;
@@ -119,7 +121,7 @@ template <> struct DenseMapInfo<BasicBlockEdge> {
}
};
-/// \brief Concrete subclass of DominatorTreeBase that is used to compute a
+/// Concrete subclass of DominatorTreeBase that is used to compute a
/// normal dominator tree.
///
/// Definition: A block is said to be forward statically reachable if there is
@@ -148,19 +150,10 @@ class DominatorTree : public DominatorTreeBase<BasicBlock, false> {
bool invalidate(Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &);
- /// \brief Returns *false* if the other dominator tree matches this dominator
- /// tree.
- inline bool compare(const DominatorTree &Other) const {
- const DomTreeNode *R = getRootNode();
- const DomTreeNode *OtherR = Other.getRootNode();
- return !R || !OtherR || R->getBlock() != OtherR->getBlock() ||
- Base::compare(Other);
- }
-
// Ensure base-class overloads are visible.
using Base::dominates;
- /// \brief Return true if Def dominates a use in User.
+ /// Return true if Def dominates a use in User.
///
/// This performs the special checks necessary if Def and User are in the same
/// basic block. Note that Def doesn't dominate a use in Def itself!
@@ -178,15 +171,9 @@ class DominatorTree : public DominatorTreeBase<BasicBlock, false> {
// Ensure base class overloads are visible.
using Base::isReachableFromEntry;
- /// \brief Provide an overload for a Use.
+ /// Provide an overload for a Use.
bool isReachableFromEntry(const Use &U) const;
- /// \brief Verify the correctness of the domtree by re-computing it.
- ///
- /// This should only be used for debugging as it aborts the program if the
- /// verification fails.
- void verifyDomTree() const;
-
// Pop up a GraphViz/gv window with the Dominator Tree rendered using `dot`.
void viewGraph(const Twine &Name, const Twine &Title);
void viewGraph();
@@ -234,20 +221,20 @@ template <> struct GraphTraits<DominatorTree*>
}
};
-/// \brief Analysis pass which computes a \c DominatorTree.
+/// Analysis pass which computes a \c DominatorTree.
class DominatorTreeAnalysis : public AnalysisInfoMixin<DominatorTreeAnalysis> {
friend AnalysisInfoMixin<DominatorTreeAnalysis>;
static AnalysisKey Key;
public:
- /// \brief Provide the result typedef for this analysis pass.
+ /// Provide the result typedef for this analysis pass.
using Result = DominatorTree;
- /// \brief Run the analysis pass over a function and produce a dominator tree.
+ /// Run the analysis pass over a function and produce a dominator tree.
DominatorTree run(Function &F, FunctionAnalysisManager &);
};
-/// \brief Printer pass for the \c DominatorTree.
+/// Printer pass for the \c DominatorTree.
class DominatorTreePrinterPass
: public PassInfoMixin<DominatorTreePrinterPass> {
raw_ostream &OS;
@@ -258,12 +245,12 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Verifier pass for the \c DominatorTree.
+/// Verifier pass for the \c DominatorTree.
struct DominatorTreeVerifierPass : PassInfoMixin<DominatorTreeVerifierPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Legacy analysis pass which computes a \c DominatorTree.
+/// Legacy analysis pass which computes a \c DominatorTree.
class DominatorTreeWrapperPass : public FunctionPass {
DominatorTree DT;
@@ -290,6 +277,93 @@ public:
void print(raw_ostream &OS, const Module *M = nullptr) const override;
};
+//===-------------------------------------
+/// Class to defer updates to a DominatorTree.
+///
+/// Definition: Applying updates to every edge insertion and deletion is
+/// expensive and not necessary. When one needs the DominatorTree for analysis
+/// they can request a flush() to perform a larger batch update. This has the
+/// advantage of the DominatorTree inspecting the set of updates to find
+/// duplicates or unnecessary subtree updates.
+///
+/// The scope of DeferredDominance operates at a Function level.
+///
+/// It is not necessary for the user to scrub the updates for duplicates or
+/// updates that point to the same block (Delete, BB_A, BB_A). Performance
+/// can be gained if the caller attempts to batch updates before submitting
+/// to applyUpdates(ArrayRef) in cases where duplicate edge requests will
+/// occur.
+///
+/// It is required for the state of the LLVM IR to be applied *before*
+/// submitting updates. The update routines must analyze the current state
+/// between a pair of (From, To) basic blocks to determine if the update
+/// needs to be queued.
+/// Example (good):
+/// TerminatorInstructionBB->removeFromParent();
+/// DDT->deleteEdge(BB, Successor);
+/// Example (bad):
+/// DDT->deleteEdge(BB, Successor);
+/// TerminatorInstructionBB->removeFromParent();
+class DeferredDominance {
+public:
+ DeferredDominance(DominatorTree &DT_) : DT(DT_) {}
+
+ /// Queues multiple updates and discards duplicates.
+ void applyUpdates(ArrayRef<DominatorTree::UpdateType> Updates);
+
+ /// Helper method for a single edge insertion. It's almost always
+ /// better to batch updates and call applyUpdates to quickly remove duplicate
+ /// edges. This is best used when there is only a single insertion needed to
+ /// update Dominators.
+ void insertEdge(BasicBlock *From, BasicBlock *To);
+
+ /// Helper method for a single edge deletion. It's almost always better
+ /// to batch updates and call applyUpdates to quickly remove duplicate edges.
+ /// This is best used when there is only a single deletion needed to update
+ /// Dominators.
+ void deleteEdge(BasicBlock *From, BasicBlock *To);
+
+ /// Delays the deletion of a basic block until a flush() event.
+ void deleteBB(BasicBlock *DelBB);
+
+ /// Returns true if DelBB is awaiting deletion at a flush() event.
+ bool pendingDeletedBB(BasicBlock *DelBB);
+
+ /// Returns true if pending DT updates are queued for a flush() event.
+ bool pending();
+
+ /// Flushes all pending updates and block deletions. Returns a
+ /// correct DominatorTree reference to be used by the caller for analysis.
+ DominatorTree &flush();
+
+ /// Drops all internal state and forces a (slow) recalculation of the
+ /// DominatorTree based on the current state of the LLVM IR in F. This should
+ /// only be used in corner cases such as the Entry block of F being deleted.
+ void recalculate(Function &F);
+
+ /// Debug method to help view the state of pending updates.
+ LLVM_DUMP_METHOD void dump() const;
+
+private:
+ DominatorTree &DT;
+ SmallVector<DominatorTree::UpdateType, 16> PendUpdates;
+ SmallPtrSet<BasicBlock *, 8> DeletedBBs;
+
+ /// Apply an update (Kind, From, To) to the internal queued updates. The
+ /// update is only added when determined to be necessary. Checks for
+ /// self-domination, unnecessary updates, duplicate requests, and balanced
+ /// pairs of requests are all performed. Returns true if the update is
+ /// queued and false if it is discarded.
+ bool applyUpdate(DominatorTree::UpdateKind Kind, BasicBlock *From,
+ BasicBlock *To);
+
+ /// Performs all pending basic block deletions. We have to defer the deletion
+ /// of these blocks until after the DominatorTree updates are applied. The
+ /// internal workings of the DominatorTree code expect every update's From
+ /// and To blocks to exist and to be a member of the same Function.
+ bool flushDelBB();
+};
+
} // end namespace llvm
#endif // LLVM_IR_DOMINATORS_H
diff --git a/contrib/llvm/include/llvm/IR/Function.h b/contrib/llvm/include/llvm/IR/Function.h
index def842f5fcee..c8d6b0776fbf 100644
--- a/contrib/llvm/include/llvm/IR/Function.h
+++ b/contrib/llvm/include/llvm/IR/Function.h
@@ -141,6 +141,11 @@ public:
// Provide fast operand accessors.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+ /// Returns the number of non-debug IR instructions in this function.
+ /// This is equivalent to the sum of the sizes of each basic block contained
+ /// within this function.
+ unsigned getInstructionCount();
+
/// Returns the FunctionType for me.
FunctionType *getFunctionType() const {
return cast<FunctionType>(getValueType());
@@ -181,7 +186,7 @@ public:
static Intrinsic::ID lookupIntrinsicID(StringRef Name);
- /// \brief Recalculate the ID for this function if it is an Intrinsic defined
+ /// Recalculate the ID for this function if it is an Intrinsic defined
/// in llvm/Intrinsics.h. Sets the intrinsic ID to Intrinsic::not_intrinsic
/// if the name of this function does not match an intrinsic in that header.
/// Note, this method does not need to be called directly, as it is called
@@ -201,53 +206,86 @@ public:
setValueSubclassData((getSubclassDataFromValue() & 0xc00f) | (ID << 4));
}
- /// @brief Return the attribute list for this Function.
+ /// Return the attribute list for this Function.
AttributeList getAttributes() const { return AttributeSets; }
- /// @brief Set the attribute list for this Function.
+ /// Set the attribute list for this Function.
void setAttributes(AttributeList Attrs) { AttributeSets = Attrs; }
- /// @brief Add function attributes to this function.
+ /// Add function attributes to this function.
void addFnAttr(Attribute::AttrKind Kind) {
addAttribute(AttributeList::FunctionIndex, Kind);
}
- /// @brief Add function attributes to this function.
+ /// Add function attributes to this function.
void addFnAttr(StringRef Kind, StringRef Val = StringRef()) {
addAttribute(AttributeList::FunctionIndex,
Attribute::get(getContext(), Kind, Val));
}
- /// @brief Add function attributes to this function.
+ /// Add function attributes to this function.
void addFnAttr(Attribute Attr) {
addAttribute(AttributeList::FunctionIndex, Attr);
}
- /// @brief Remove function attributes from this function.
+ /// Remove function attributes from this function.
void removeFnAttr(Attribute::AttrKind Kind) {
removeAttribute(AttributeList::FunctionIndex, Kind);
}
- /// @brief Remove function attribute from this function.
+ /// Remove function attribute from this function.
void removeFnAttr(StringRef Kind) {
setAttributes(getAttributes().removeAttribute(
getContext(), AttributeList::FunctionIndex, Kind));
}
- /// \brief Set the entry count for this function.
+ enum ProfileCountType { PCT_Invalid, PCT_Real, PCT_Synthetic };
+
+ /// Class to represent profile counts.
+ ///
+ /// This class represents both real and synthetic profile counts.
+ class ProfileCount {
+ private:
+ uint64_t Count;
+ ProfileCountType PCT;
+ static ProfileCount Invalid;
+
+ public:
+ ProfileCount() : Count(-1), PCT(PCT_Invalid) {}
+ ProfileCount(uint64_t Count, ProfileCountType PCT)
+ : Count(Count), PCT(PCT) {}
+ bool hasValue() const { return PCT != PCT_Invalid; }
+ uint64_t getCount() const { return Count; }
+ ProfileCountType getType() const { return PCT; }
+ bool isSynthetic() const { return PCT == PCT_Synthetic; }
+ explicit operator bool() { return hasValue(); }
+ bool operator!() const { return !hasValue(); }
+ // Update the count retaining the same profile count type.
+ ProfileCount &setCount(uint64_t C) {
+ Count = C;
+ return *this;
+ }
+ static ProfileCount getInvalid() { return ProfileCount(-1, PCT_Invalid); }
+ };
+
+ /// Set the entry count for this function.
///
/// Entry count is the number of times this function was executed based on
- /// pgo data. \p Imports points to a set of GUIDs that needs to be imported
- /// by the function for sample PGO, to enable the same inlines as the
- /// profiled optimized binary.
- void setEntryCount(uint64_t Count,
+ /// pgo data. \p Imports points to a set of GUIDs that needs to
+ /// be imported by the function for sample PGO, to enable the same inlines as
+ /// the profiled optimized binary.
+ void setEntryCount(ProfileCount Count,
const DenseSet<GlobalValue::GUID> *Imports = nullptr);
- /// \brief Get the entry count for this function.
+ /// A convenience wrapper for setting entry count
+ void setEntryCount(uint64_t Count, ProfileCountType Type = PCT_Real,
+ const DenseSet<GlobalValue::GUID> *Imports = nullptr);
+
+ /// Get the entry count for this function.
///
/// Entry count is the number of times the function was executed based on
/// pgo data.
- Optional<uint64_t> getEntryCount() const;
+ ProfileCount getEntryCount() const;
/// Return true if the function is annotated with profile data.
///
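As a rough, hedged illustration of the entry-count API added above (the ProfileCount class and the setEntryCount/getEntryCount overloads): the caller code below is hypothetical, assumes an already-constructed llvm::Function *F, and is not part of this change.

    // Sketch only: F is a hypothetical, already-constructed llvm::Function*.
    #include "llvm/IR/Function.h"
    using namespace llvm;

    void annotateEntryCount(Function *F, uint64_t RawCount, bool Synthetic) {
      // The convenience overload defaults to PCT_Real when no type is given.
      F->setEntryCount(RawCount,
                       Synthetic ? Function::PCT_Synthetic : Function::PCT_Real);

      Function::ProfileCount PC = F->getEntryCount();
      if (PC.hasValue() && !PC.isSynthetic()) {
        uint64_t Count = PC.getCount(); // real (instrumented/sample) count
        (void)Count;
      }
    }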
@@ -265,27 +303,27 @@ public:
/// Get the section prefix for this function.
Optional<StringRef> getSectionPrefix() const;
- /// @brief Return true if the function has the attribute.
+ /// Return true if the function has the attribute.
bool hasFnAttribute(Attribute::AttrKind Kind) const {
return AttributeSets.hasFnAttribute(Kind);
}
- /// @brief Return true if the function has the attribute.
+ /// Return true if the function has the attribute.
bool hasFnAttribute(StringRef Kind) const {
return AttributeSets.hasFnAttribute(Kind);
}
- /// @brief Return the attribute for the given attribute kind.
+ /// Return the attribute for the given attribute kind.
Attribute getFnAttribute(Attribute::AttrKind Kind) const {
return getAttribute(AttributeList::FunctionIndex, Kind);
}
- /// @brief Return the attribute for the given attribute kind.
+ /// Return the attribute for the given attribute kind.
Attribute getFnAttribute(StringRef Kind) const {
return getAttribute(AttributeList::FunctionIndex, Kind);
}
- /// \brief Return the stack alignment for the function.
+ /// Return the stack alignment for the function.
unsigned getFnStackAlignment() const {
if (!hasFnAttribute(Attribute::StackAlignment))
return 0;
@@ -301,110 +339,110 @@ public:
void setGC(std::string Str);
void clearGC();
- /// @brief adds the attribute to the list of attributes.
+ /// adds the attribute to the list of attributes.
void addAttribute(unsigned i, Attribute::AttrKind Kind);
- /// @brief adds the attribute to the list of attributes.
+ /// adds the attribute to the list of attributes.
void addAttribute(unsigned i, Attribute Attr);
- /// @brief adds the attributes to the list of attributes.
+ /// adds the attributes to the list of attributes.
void addAttributes(unsigned i, const AttrBuilder &Attrs);
- /// @brief adds the attribute to the list of attributes for the given arg.
+ /// adds the attribute to the list of attributes for the given arg.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
- /// @brief adds the attribute to the list of attributes for the given arg.
+ /// adds the attribute to the list of attributes for the given arg.
void addParamAttr(unsigned ArgNo, Attribute Attr);
- /// @brief adds the attributes to the list of attributes for the given arg.
+ /// adds the attributes to the list of attributes for the given arg.
void addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs);
- /// @brief removes the attribute from the list of attributes.
+ /// removes the attribute from the list of attributes.
void removeAttribute(unsigned i, Attribute::AttrKind Kind);
- /// @brief removes the attribute from the list of attributes.
+ /// removes the attribute from the list of attributes.
void removeAttribute(unsigned i, StringRef Kind);
- /// @brief removes the attributes from the list of attributes.
+ /// removes the attributes from the list of attributes.
void removeAttributes(unsigned i, const AttrBuilder &Attrs);
- /// @brief removes the attribute from the list of attributes.
+ /// removes the attribute from the list of attributes.
void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
- /// @brief removes the attribute from the list of attributes.
+ /// removes the attribute from the list of attributes.
void removeParamAttr(unsigned ArgNo, StringRef Kind);
- /// @brief removes the attribute from the list of attributes.
+ /// removes the attribute from the list of attributes.
void removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs);
- /// @brief check if an attributes is in the list of attributes.
+ /// check if an attribute is in the list of attributes.
bool hasAttribute(unsigned i, Attribute::AttrKind Kind) const {
return getAttributes().hasAttribute(i, Kind);
}
- /// @brief check if an attributes is in the list of attributes.
+ /// check if an attribute is in the list of attributes.
bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const {
return getAttributes().hasParamAttribute(ArgNo, Kind);
}
- /// @brief gets the attribute from the list of attributes.
+ /// gets the attribute from the list of attributes.
Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const {
return AttributeSets.getAttribute(i, Kind);
}
- /// @brief gets the attribute from the list of attributes.
+ /// gets the attribute from the list of attributes.
Attribute getAttribute(unsigned i, StringRef Kind) const {
return AttributeSets.getAttribute(i, Kind);
}
- /// @brief adds the dereferenceable attribute to the list of attributes.
+ /// adds the dereferenceable attribute to the list of attributes.
void addDereferenceableAttr(unsigned i, uint64_t Bytes);
- /// @brief adds the dereferenceable attribute to the list of attributes for
+ /// adds the dereferenceable attribute to the list of attributes for
/// the given arg.
void addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes);
- /// @brief adds the dereferenceable_or_null attribute to the list of
+ /// adds the dereferenceable_or_null attribute to the list of
/// attributes.
void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes);
- /// @brief adds the dereferenceable_or_null attribute to the list of
+ /// adds the dereferenceable_or_null attribute to the list of
/// attributes for the given arg.
void addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes);
- /// @brief Extract the alignment for a call or parameter (0=unknown).
+ /// Extract the alignment for a call or parameter (0=unknown).
unsigned getParamAlignment(unsigned ArgNo) const {
return AttributeSets.getParamAlignment(ArgNo);
}
- /// @brief Extract the number of dereferenceable bytes for a call or
+ /// Extract the number of dereferenceable bytes for a call or
/// parameter (0=unknown).
/// @param i AttributeList index, referring to a return value or argument.
uint64_t getDereferenceableBytes(unsigned i) const {
return AttributeSets.getDereferenceableBytes(i);
}
- /// @brief Extract the number of dereferenceable bytes for a parameter.
+ /// Extract the number of dereferenceable bytes for a parameter.
/// @param ArgNo Index of an argument, with 0 being the first function arg.
uint64_t getParamDereferenceableBytes(unsigned ArgNo) const {
return AttributeSets.getParamDereferenceableBytes(ArgNo);
}
- /// @brief Extract the number of dereferenceable_or_null bytes for a call or
+ /// Extract the number of dereferenceable_or_null bytes for a call or
/// parameter (0=unknown).
/// @param i AttributeList index, referring to a return value or argument.
uint64_t getDereferenceableOrNullBytes(unsigned i) const {
return AttributeSets.getDereferenceableOrNullBytes(i);
}
- /// @brief Extract the number of dereferenceable_or_null bytes for a
+ /// Extract the number of dereferenceable_or_null bytes for a
/// parameter.
/// @param ArgNo AttributeList ArgNo, referring to an argument.
uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const {
return AttributeSets.getParamDereferenceableOrNullBytes(ArgNo);
}
- /// @brief Determine if the function does not access memory.
+ /// Determine if the function does not access memory.
bool doesNotAccessMemory() const {
return hasFnAttribute(Attribute::ReadNone);
}
@@ -412,7 +450,7 @@ public:
addFnAttr(Attribute::ReadNone);
}
- /// @brief Determine if the function does not access or only reads memory.
+ /// Determine if the function does not access or only reads memory.
bool onlyReadsMemory() const {
return doesNotAccessMemory() || hasFnAttribute(Attribute::ReadOnly);
}
@@ -420,7 +458,7 @@ public:
addFnAttr(Attribute::ReadOnly);
}
- /// @brief Determine if the function does not access or only writes memory.
+ /// Determine if the function does not access or only writes memory.
bool doesNotReadMemory() const {
return doesNotAccessMemory() || hasFnAttribute(Attribute::WriteOnly);
}
@@ -428,14 +466,14 @@ public:
addFnAttr(Attribute::WriteOnly);
}
- /// @brief Determine if the call can access memmory only using pointers based
+ /// Determine if the call can access memory only using pointers based
/// on its arguments.
bool onlyAccessesArgMemory() const {
return hasFnAttribute(Attribute::ArgMemOnly);
}
void setOnlyAccessesArgMemory() { addFnAttr(Attribute::ArgMemOnly); }
- /// @brief Determine if the function may only access memory that is
+ /// Determine if the function may only access memory that is
/// inaccessible from the IR.
bool onlyAccessesInaccessibleMemory() const {
return hasFnAttribute(Attribute::InaccessibleMemOnly);
@@ -444,7 +482,7 @@ public:
addFnAttr(Attribute::InaccessibleMemOnly);
}
- /// @brief Determine if the function may only access memory that is
+ /// Determine if the function may only access memory that is
/// either inaccessible from the IR or pointed to by its arguments.
bool onlyAccessesInaccessibleMemOrArgMem() const {
return hasFnAttribute(Attribute::InaccessibleMemOrArgMemOnly);
@@ -453,7 +491,7 @@ public:
addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
}
- /// @brief Determine if the function cannot return.
+ /// Determine if the function cannot return.
bool doesNotReturn() const {
return hasFnAttribute(Attribute::NoReturn);
}
@@ -461,7 +499,10 @@ public:
addFnAttr(Attribute::NoReturn);
}
- /// @brief Determine if the function cannot unwind.
+ /// Determine if the function should not perform indirect branch tracking.
+ bool doesNoCfCheck() const { return hasFnAttribute(Attribute::NoCfCheck); }
+
+ /// Determine if the function cannot unwind.
bool doesNotThrow() const {
return hasFnAttribute(Attribute::NoUnwind);
}
@@ -469,7 +510,7 @@ public:
addFnAttr(Attribute::NoUnwind);
}
- /// @brief Determine if the call cannot be duplicated.
+ /// Determine if the call cannot be duplicated.
bool cannotDuplicate() const {
return hasFnAttribute(Attribute::NoDuplicate);
}
@@ -477,7 +518,7 @@ public:
addFnAttr(Attribute::NoDuplicate);
}
- /// @brief Determine if the call is convergent.
+ /// Determine if the call is convergent.
bool isConvergent() const {
return hasFnAttribute(Attribute::Convergent);
}
@@ -488,7 +529,7 @@ public:
removeFnAttr(Attribute::Convergent);
}
- /// @brief Determine if the call has sideeffects.
+ /// Determine if the call has sideeffects.
bool isSpeculatable() const {
return hasFnAttribute(Attribute::Speculatable);
}
@@ -505,7 +546,7 @@ public:
addFnAttr(Attribute::NoRecurse);
}
- /// @brief True if the ABI mandates (or the user requested) that this
+ /// True if the ABI mandates (or the user requested) that this
/// function be in an unwind table.
bool hasUWTable() const {
return hasFnAttribute(Attribute::UWTable);
@@ -514,19 +555,19 @@ public:
addFnAttr(Attribute::UWTable);
}
- /// @brief True if this function needs an unwind table.
+ /// True if this function needs an unwind table.
bool needsUnwindTableEntry() const {
return hasUWTable() || !doesNotThrow();
}
- /// @brief Determine if the function returns a structure through first
+ /// Determine if the function returns a structure through first
/// or second pointer argument.
bool hasStructRetAttr() const {
return AttributeSets.hasParamAttribute(0, Attribute::StructRet) ||
AttributeSets.hasParamAttribute(1, Attribute::StructRet);
}
- /// @brief Determine if the parameter or return value is marked with NoAlias
+ /// Determine if the parameter or return value is marked with NoAlias
/// attribute.
bool returnDoesNotAlias() const {
return AttributeSets.hasAttribute(AttributeList::ReturnIndex,
@@ -643,30 +684,30 @@ public:
size_t arg_size() const { return NumArgs; }
bool arg_empty() const { return arg_size() == 0; }
- /// \brief Check whether this function has a personality function.
+ /// Check whether this function has a personality function.
bool hasPersonalityFn() const {
return getSubclassDataFromValue() & (1<<3);
}
- /// \brief Get the personality function associated with this function.
+ /// Get the personality function associated with this function.
Constant *getPersonalityFn() const;
void setPersonalityFn(Constant *Fn);
- /// \brief Check whether this function has prefix data.
+ /// Check whether this function has prefix data.
bool hasPrefixData() const {
return getSubclassDataFromValue() & (1<<1);
}
- /// \brief Get the prefix data associated with this function.
+ /// Get the prefix data associated with this function.
Constant *getPrefixData() const;
void setPrefixData(Constant *PrefixData);
- /// \brief Check whether this function has prologue data.
+ /// Check whether this function has prologue data.
bool hasPrologueData() const {
return getSubclassDataFromValue() & (1<<2);
}
- /// \brief Get the prologue data associated with this function.
+ /// Get the prologue data associated with this function.
Constant *getPrologueData() const;
void setPrologueData(Constant *PrologueData);
@@ -726,12 +767,12 @@ public:
/// setjmp or other function that gcc recognizes as "returning twice".
bool callsFunctionThatReturnsTwice() const;
- /// \brief Set the attached subprogram.
+ /// Set the attached subprogram.
///
/// Calls \a setMetadata() with \a LLVMContext::MD_dbg.
void setSubprogram(DISubprogram *SP);
- /// \brief Get the attached subprogram.
+ /// Get the attached subprogram.
///
/// Calls \a getMetadata() with \a LLVMContext::MD_dbg and casts the result
/// to \a DISubprogram.
@@ -740,6 +781,12 @@ public:
/// Returns true if we should emit debug info for profiling.
bool isDebugInfoForProfiling() const;
+ /// Check if null pointer dereferencing is considered undefined behavior for
+ /// the function.
+ /// Return value: false => null pointer dereference is undefined.
+ /// Return value: true => null pointer dereference is not undefined.
+ bool nullPointerIsDefined() const;
+
private:
void allocHungoffUselist();
template<int Idx> void setHungoffOperand(Constant *C);
@@ -752,6 +799,13 @@ private:
void setValueSubclassDataBit(unsigned Bit, bool On);
};
+/// Check whether null pointer dereferencing is considered undefined behavior
+/// for a given function or an address space.
+/// Null pointer access in non-zero address space is not considered undefined.
+/// Return value: false => null pointer dereference is undefined.
+/// Return value: true => null pointer dereference is not undefined.
+bool NullPointerIsDefined(const Function *F, unsigned AS = 0);
+
template <>
struct OperandTraits<Function> : public HungoffOperandTraits<3> {};
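A short, hedged sketch of how a transform might use the null-pointer queries added to this header; the helper name and the LoadInst parameter are illustrative assumptions, not part of the diff.

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Returns true when dereferencing null through this load would be UB,
    // i.e. the pointer may be assumed non-null by optimizations.
    static bool loadOfNullIsUB(const LoadInst &LI) {
      // NullPointerIsDefined() answers the opposite question: true means a
      // null dereference is *not* undefined (e.g. non-zero address space).
      return !NullPointerIsDefined(LI.getFunction(),
                                   LI.getPointerAddressSpace());
    }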
diff --git a/contrib/llvm/include/llvm/IR/GlobalObject.h b/contrib/llvm/include/llvm/IR/GlobalObject.h
index 278b193567f1..1fd3568100c2 100644
--- a/contrib/llvm/include/llvm/IR/GlobalObject.h
+++ b/contrib/llvm/include/llvm/IR/GlobalObject.h
@@ -105,6 +105,14 @@ public:
/// Check if this has any metadata.
bool hasMetadata() const { return hasMetadataHashEntry(); }
+ /// Check if this has any metadata of the given kind.
+ bool hasMetadata(unsigned KindID) const {
+ return getMetadata(KindID) != nullptr;
+ }
+ bool hasMetadata(StringRef Kind) const {
+ return getMetadata(Kind) != nullptr;
+ }
+
/// Get the current metadata attachments for the given kind, if any.
///
/// These functions require that the function have at most a single attachment
@@ -143,7 +151,9 @@ public:
getAllMetadata(SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs) const;
/// Erase all metadata attachments with the given kind.
- void eraseMetadata(unsigned KindID);
+ ///
+ /// \returns true if any metadata was removed.
+ bool eraseMetadata(unsigned KindID);
/// Copy metadata from Src, adjusting offsets by Offset.
void copyMetadata(const GlobalObject *Src, unsigned Offset);
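For illustration only, a hedged sketch of the kind-specific hasMetadata() queries and the bool-returning eraseMetadata() introduced above; the helper function and its use of MD_section_prefix are assumptions, not part of this change.

    #include "llvm/IR/GlobalObject.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    static void dropSectionPrefix(GlobalObject *GO) {
      // hasMetadata(KindID) is shorthand for getMetadata(KindID) != nullptr.
      if (GO->hasMetadata(LLVMContext::MD_section_prefix)) {
        // eraseMetadata() now reports whether an attachment was removed.
        bool Erased = GO->eraseMetadata(LLVMContext::MD_section_prefix);
        (void)Erased;
      }
    }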
diff --git a/contrib/llvm/include/llvm/IR/GlobalValue.h b/contrib/llvm/include/llvm/IR/GlobalValue.h
index 1793de7887fc..9d9f4f65a6b5 100644
--- a/contrib/llvm/include/llvm/IR/GlobalValue.h
+++ b/contrib/llvm/include/llvm/IR/GlobalValue.h
@@ -44,7 +44,7 @@ namespace Intrinsic {
class GlobalValue : public Constant {
public:
- /// @brief An enumeration for the kinds of linkage for global values.
+ /// An enumeration for the kinds of linkage for global values.
enum LinkageTypes {
ExternalLinkage = 0,///< Externally visible function
AvailableExternallyLinkage, ///< Available for inspection, not emission.
@@ -59,14 +59,14 @@ public:
CommonLinkage ///< Tentative definitions.
};
- /// @brief An enumeration for the kinds of visibility of global values.
+ /// An enumeration for the kinds of visibility of global values.
enum VisibilityTypes {
DefaultVisibility = 0, ///< The GV is visible
HiddenVisibility, ///< The GV is hidden
ProtectedVisibility ///< The GV is protected
};
- /// @brief Storage classes of global values for PE targets.
+ /// Storage classes of global values for PE targets.
enum DLLStorageClassTypes {
DefaultStorageClass = 0,
DLLImportStorageClass = 1, ///< Function to be imported from DLL
@@ -77,11 +77,12 @@ protected:
GlobalValue(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
LinkageTypes Linkage, const Twine &Name, unsigned AddressSpace)
: Constant(PointerType::get(Ty, AddressSpace), VTy, Ops, NumOps),
- ValueType(Ty), Linkage(Linkage), Visibility(DefaultVisibility),
+ ValueType(Ty), Visibility(DefaultVisibility),
UnnamedAddrVal(unsigned(UnnamedAddr::None)),
DllStorageClass(DefaultStorageClass), ThreadLocal(NotThreadLocal),
- HasLLVMReservedName(false), IsDSOLocal(false),
- IntID((Intrinsic::ID)0U), Parent(nullptr) {
+ HasLLVMReservedName(false), IsDSOLocal(false), IntID((Intrinsic::ID)0U),
+ Parent(nullptr) {
+ setLinkage(Linkage);
setName(Name);
}
@@ -109,12 +110,12 @@ protected:
unsigned IsDSOLocal : 1;
private:
- friend class Constant;
-
// Give subclasses access to what otherwise would be wasted padding.
// (17 + 4 + 2 + 2 + 2 + 3 + 1 + 1) == 32.
unsigned SubClassData : GlobalValueSubClassDataBits;
+ friend class Constant;
+
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -142,8 +143,14 @@ private:
llvm_unreachable("Fully covered switch above!");
}
+ void maybeSetDsoLocal() {
+ if (hasLocalLinkage() ||
+ (!hasDefaultVisibility() && !hasExternalWeakLinkage()))
+ setDSOLocal(true);
+ }
+
protected:
- /// \brief The intrinsic ID for this subclass (which must be a Function).
+ /// The intrinsic ID for this subclass (which must be a Function).
///
/// This member is defined by this class, but not used for anything.
/// Subclasses can use it to store their intrinsic ID, if they have one.
@@ -232,6 +239,7 @@ public:
assert((!hasLocalLinkage() || V == DefaultVisibility) &&
"local linkage requires default visibility");
Visibility = V;
+ maybeSetDsoLocal();
}
/// If the value is "Thread Local", its value isn't shared by the threads.
@@ -437,6 +445,7 @@ public:
if (isLocalLinkage(LT))
Visibility = DefaultVisibility;
Linkage = LT;
+ maybeSetDsoLocal();
}
LinkageTypes getLinkage() const { return LinkageTypes(Linkage); }
@@ -563,6 +572,13 @@ public:
V->getValueID() == Value::GlobalAliasVal ||
V->getValueID() == Value::GlobalIFuncVal;
}
+
+ /// True if GV can be left out of the object symbol table. This is the case
+ /// for linkonce_odr values whose address is not significant. While legal, it
+ /// is not normally profitable to omit them from the .o symbol table. Using
+ /// this analysis makes sense when the information can be passed down to the
+ /// linker or we are in LTO.
+ bool canBeOmittedFromSymbolTable() const;
};
} // end namespace llvm
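A hedged sketch of the dso_local inference added via maybeSetDsoLocal() and of canBeOmittedFromSymbolTable(); the helper functions are hypothetical and assume GV has neither local linkage nor extern_weak linkage.

    #include <cassert>
    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/GlobalVariable.h"
    using namespace llvm;

    static void markHidden(GlobalVariable *GV) {
      // setVisibility() now calls maybeSetDsoLocal(): a non-default visibility
      // on a value that is not extern_weak also marks it dso_local.
      GV->setVisibility(GlobalValue::HiddenVisibility);
      assert(GV->isDSOLocal() && "non-default visibility implies dso_local");
    }

    static bool mayDropFromSymtab(const GlobalValue &GV) {
      // True for e.g. linkonce_odr values whose address is not significant;
      // mainly useful when the result can be passed to the linker or in LTO.
      return GV.canBeOmittedFromSymbolTable();
    }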
diff --git a/contrib/llvm/include/llvm/IR/GlobalVariable.h b/contrib/llvm/include/llvm/IR/GlobalVariable.h
index 34ace6f2b4f4..03b9ec46ebb4 100644
--- a/contrib/llvm/include/llvm/IR/GlobalVariable.h
+++ b/contrib/llvm/include/llvm/IR/GlobalVariable.h
@@ -68,9 +68,6 @@ public:
~GlobalVariable() {
dropAllReferences();
-
- // FIXME: needed by operator delete
- setGlobalVariableNumOperands(1);
}
// allocate space for exactly one operand
@@ -78,6 +75,16 @@ public:
return User::operator new(s, 1);
}
+ // delete space for exactly one operand as created in the corresponding new operator
+ void operator delete(void *ptr){
+ assert(ptr != nullptr && "must not be nullptr");
+ User *Obj = static_cast<User *>(ptr);
+ // Number of operands can be set to 0 after construction and initialization. Make sure
+ // that number of operands is reset to 1, as this is needed in User::operator delete
+ Obj->setGlobalVariableNumOperands(1);
+ User::operator delete(Obj);
+ }
+
/// Provide fast operand accessors
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
diff --git a/contrib/llvm/include/llvm/IR/IRBuilder.h b/contrib/llvm/include/llvm/IR/IRBuilder.h
index e687ca689d46..70641ba25d2e 100644
--- a/contrib/llvm/include/llvm/IR/IRBuilder.h
+++ b/contrib/llvm/include/llvm/IR/IRBuilder.h
@@ -1,4 +1,4 @@
-//===---- llvm/IRBuilder.h - Builder for LLVM Instructions ------*- C++ -*-===//
+//===- llvm/IRBuilder.h - Builder for LLVM Instructions ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -42,20 +42,19 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Casting.h"
-#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
+#include <utility>
namespace llvm {
class APInt;
class MDNode;
-class Module;
class Use;
-/// \brief This provides the default implementation of the IRBuilder
+/// This provides the default implementation of the IRBuilder
/// 'InsertHelper' method that is called whenever an instruction is created by
/// IRBuilder and needs to be inserted.
///
@@ -86,7 +85,7 @@ protected:
}
};
-/// \brief Common base class shared among various IRBuilders.
+/// Common base class shared among various IRBuilders.
class IRBuilderBase {
DebugLoc CurDbgLocation;
@@ -112,7 +111,7 @@ public:
// Builder configuration methods
//===--------------------------------------------------------------------===//
- /// \brief Clear the insertion point: created instructions will not be
+ /// Clear the insertion point: created instructions will not be
/// inserted into a block.
void ClearInsertionPoint() {
BB = nullptr;
@@ -123,14 +122,14 @@ public:
BasicBlock::iterator GetInsertPoint() const { return InsertPt; }
LLVMContext &getContext() const { return Context; }
- /// \brief This specifies that created instructions should be appended to the
+ /// This specifies that created instructions should be appended to the
/// end of the specified block.
void SetInsertPoint(BasicBlock *TheBB) {
BB = TheBB;
InsertPt = BB->end();
}
- /// \brief This specifies that created instructions should be inserted before
+ /// This specifies that created instructions should be inserted before
/// the specified instruction.
void SetInsertPoint(Instruction *I) {
BB = I->getParent();
@@ -139,7 +138,7 @@ public:
SetCurrentDebugLocation(I->getDebugLoc());
}
- /// \brief This specifies that created instructions should be inserted at the
+ /// This specifies that created instructions should be inserted at the
/// specified point.
void SetInsertPoint(BasicBlock *TheBB, BasicBlock::iterator IP) {
BB = TheBB;
@@ -148,20 +147,20 @@ public:
SetCurrentDebugLocation(IP->getDebugLoc());
}
- /// \brief Set location information used by debugging information.
+ /// Set location information used by debugging information.
void SetCurrentDebugLocation(DebugLoc L) { CurDbgLocation = std::move(L); }
- /// \brief Get location information used by debugging information.
+ /// Get location information used by debugging information.
const DebugLoc &getCurrentDebugLocation() const { return CurDbgLocation; }
- /// \brief If this builder has a current debug location, set it on the
+ /// If this builder has a current debug location, set it on the
/// specified instruction.
void SetInstDebugLocation(Instruction *I) const {
if (CurDbgLocation)
I->setDebugLoc(CurDbgLocation);
}
- /// \brief Get the return type of the current function that we're emitting
+ /// Get the return type of the current function that we're emitting
/// into.
Type *getCurrentFunctionReturnType() const;
@@ -171,33 +170,33 @@ public:
BasicBlock::iterator Point;
public:
- /// \brief Creates a new insertion point which doesn't point to anything.
+ /// Creates a new insertion point which doesn't point to anything.
InsertPoint() = default;
- /// \brief Creates a new insertion point at the given location.
+ /// Creates a new insertion point at the given location.
InsertPoint(BasicBlock *InsertBlock, BasicBlock::iterator InsertPoint)
- : Block(InsertBlock), Point(InsertPoint) {}
+ : Block(InsertBlock), Point(InsertPoint) {}
- /// \brief Returns true if this insert point is set.
+ /// Returns true if this insert point is set.
bool isSet() const { return (Block != nullptr); }
BasicBlock *getBlock() const { return Block; }
BasicBlock::iterator getPoint() const { return Point; }
};
- /// \brief Returns the current insert point.
+ /// Returns the current insert point.
InsertPoint saveIP() const {
return InsertPoint(GetInsertBlock(), GetInsertPoint());
}
- /// \brief Returns the current insert point, clearing it in the process.
+ /// Returns the current insert point, clearing it in the process.
InsertPoint saveAndClearIP() {
InsertPoint IP(GetInsertBlock(), GetInsertPoint());
ClearInsertionPoint();
return IP;
}
- /// \brief Sets the current insert point to a previously-saved location.
+ /// Sets the current insert point to a previously-saved location.
void restoreIP(InsertPoint IP) {
if (IP.isSet())
SetInsertPoint(IP.getBlock(), IP.getPoint());
@@ -205,26 +204,26 @@ public:
ClearInsertionPoint();
}
- /// \brief Get the floating point math metadata being used.
+ /// Get the floating point math metadata being used.
MDNode *getDefaultFPMathTag() const { return DefaultFPMathTag; }
- /// \brief Get the flags to be applied to created floating point ops
+ /// Get the flags to be applied to created floating point ops
FastMathFlags getFastMathFlags() const { return FMF; }
- /// \brief Clear the fast-math flags.
+ /// Clear the fast-math flags.
void clearFastMathFlags() { FMF.clear(); }
- /// \brief Set the floating point math metadata to be used.
+ /// Set the floating point math metadata to be used.
void setDefaultFPMathTag(MDNode *FPMathTag) { DefaultFPMathTag = FPMathTag; }
- /// \brief Set the fast-math flags to be used with generated fp-math operators
+ /// Set the fast-math flags to be used with generated fp-math operators
void setFastMathFlags(FastMathFlags NewFMF) { FMF = NewFMF; }
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
- // \brief RAII object that stores the current insertion point and restores it
+ // RAII object that stores the current insertion point and restores it
// when the object is destroyed. This includes the debug location.
class InsertPointGuard {
IRBuilderBase &Builder;
@@ -246,7 +245,7 @@ public:
}
};
- // \brief RAII object that stores the current fast math settings and restores
+ // RAII object that stores the current fast math settings and restores
// them when the object is destroyed.
class FastMathFlagGuard {
IRBuilderBase &Builder;
@@ -270,7 +269,7 @@ public:
// Miscellaneous creation methods.
//===--------------------------------------------------------------------===//
- /// \brief Make a new global variable with initializer type i8*
+ /// Make a new global variable with initializer type i8*
///
/// Make a new global variable with an initializer that has array of i8 type
/// filled in with the null terminated string value specified. The new global
@@ -279,48 +278,48 @@ public:
GlobalVariable *CreateGlobalString(StringRef Str, const Twine &Name = "",
unsigned AddressSpace = 0);
- /// \brief Get a constant value representing either true or false.
+ /// Get a constant value representing either true or false.
ConstantInt *getInt1(bool V) {
return ConstantInt::get(getInt1Ty(), V);
}
- /// \brief Get the constant value for i1 true.
+ /// Get the constant value for i1 true.
ConstantInt *getTrue() {
return ConstantInt::getTrue(Context);
}
- /// \brief Get the constant value for i1 false.
+ /// Get the constant value for i1 false.
ConstantInt *getFalse() {
return ConstantInt::getFalse(Context);
}
- /// \brief Get a constant 8-bit value.
+ /// Get a constant 8-bit value.
ConstantInt *getInt8(uint8_t C) {
return ConstantInt::get(getInt8Ty(), C);
}
- /// \brief Get a constant 16-bit value.
+ /// Get a constant 16-bit value.
ConstantInt *getInt16(uint16_t C) {
return ConstantInt::get(getInt16Ty(), C);
}
- /// \brief Get a constant 32-bit value.
+ /// Get a constant 32-bit value.
ConstantInt *getInt32(uint32_t C) {
return ConstantInt::get(getInt32Ty(), C);
}
- /// \brief Get a constant 64-bit value.
+ /// Get a constant 64-bit value.
ConstantInt *getInt64(uint64_t C) {
return ConstantInt::get(getInt64Ty(), C);
}
- /// \brief Get a constant N-bit value, zero extended or truncated from
+ /// Get a constant N-bit value, zero extended or truncated from
/// a 64-bit value.
ConstantInt *getIntN(unsigned N, uint64_t C) {
return ConstantInt::get(getIntNTy(N), C);
}
- /// \brief Get a constant integer value.
+ /// Get a constant integer value.
ConstantInt *getInt(const APInt &AI) {
return ConstantInt::get(Context, AI);
}
@@ -329,65 +328,65 @@ public:
// Type creation methods
//===--------------------------------------------------------------------===//
- /// \brief Fetch the type representing a single bit
+ /// Fetch the type representing a single bit
IntegerType *getInt1Ty() {
return Type::getInt1Ty(Context);
}
- /// \brief Fetch the type representing an 8-bit integer.
+ /// Fetch the type representing an 8-bit integer.
IntegerType *getInt8Ty() {
return Type::getInt8Ty(Context);
}
- /// \brief Fetch the type representing a 16-bit integer.
+ /// Fetch the type representing a 16-bit integer.
IntegerType *getInt16Ty() {
return Type::getInt16Ty(Context);
}
- /// \brief Fetch the type representing a 32-bit integer.
+ /// Fetch the type representing a 32-bit integer.
IntegerType *getInt32Ty() {
return Type::getInt32Ty(Context);
}
- /// \brief Fetch the type representing a 64-bit integer.
+ /// Fetch the type representing a 64-bit integer.
IntegerType *getInt64Ty() {
return Type::getInt64Ty(Context);
}
- /// \brief Fetch the type representing a 128-bit integer.
+ /// Fetch the type representing a 128-bit integer.
IntegerType *getInt128Ty() { return Type::getInt128Ty(Context); }
- /// \brief Fetch the type representing an N-bit integer.
+ /// Fetch the type representing an N-bit integer.
IntegerType *getIntNTy(unsigned N) {
return Type::getIntNTy(Context, N);
}
- /// \brief Fetch the type representing a 16-bit floating point value.
+ /// Fetch the type representing a 16-bit floating point value.
Type *getHalfTy() {
return Type::getHalfTy(Context);
}
- /// \brief Fetch the type representing a 32-bit floating point value.
+ /// Fetch the type representing a 32-bit floating point value.
Type *getFloatTy() {
return Type::getFloatTy(Context);
}
- /// \brief Fetch the type representing a 64-bit floating point value.
+ /// Fetch the type representing a 64-bit floating point value.
Type *getDoubleTy() {
return Type::getDoubleTy(Context);
}
- /// \brief Fetch the type representing void.
+ /// Fetch the type representing void.
Type *getVoidTy() {
return Type::getVoidTy(Context);
}
- /// \brief Fetch the type representing a pointer to an 8-bit integer value.
+ /// Fetch the type representing a pointer to an 8-bit integer value.
PointerType *getInt8PtrTy(unsigned AddrSpace = 0) {
return Type::getInt8PtrTy(Context, AddrSpace);
}
- /// \brief Fetch the type representing a pointer to an integer value.
+ /// Fetch the type representing a pointer to an integer value.
IntegerType *getIntPtrTy(const DataLayout &DL, unsigned AddrSpace = 0) {
return DL.getIntPtrType(Context, AddrSpace);
}
@@ -396,7 +395,7 @@ public:
// Intrinsic creation methods
//===--------------------------------------------------------------------===//
- /// \brief Create and insert a memset to the specified pointer and the
+ /// Create and insert a memset to the specified pointer and the
/// specified value.
///
/// If the pointer isn't an i8*, it will be converted. If a TBAA tag is
@@ -415,27 +414,54 @@ public:
MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
- /// \brief Create and insert a memcpy between the specified pointers.
+ /// Create and insert an element unordered-atomic memset of the region of
+ /// memory starting at the given pointer to the given value.
+ ///
+ /// If the pointer isn't an i8*, it will be converted. If a TBAA tag is
+ /// specified, it will be added to the instruction. Likewise with alias.scope
+ /// and noalias tags.
+ CallInst *CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val,
+ uint64_t Size, unsigned Align,
+ uint32_t ElementSize,
+ MDNode *TBAATag = nullptr,
+ MDNode *ScopeTag = nullptr,
+ MDNode *NoAliasTag = nullptr) {
+ return CreateElementUnorderedAtomicMemSet(Ptr, Val, getInt64(Size), Align,
+ ElementSize, TBAATag, ScopeTag,
+ NoAliasTag);
+ }
+
+ CallInst *CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val,
+ Value *Size, unsigned Align,
+ uint32_t ElementSize,
+ MDNode *TBAATag = nullptr,
+ MDNode *ScopeTag = nullptr,
+ MDNode *NoAliasTag = nullptr);
+
+ /// Create and insert a memcpy between the specified pointers.
///
/// If the pointers aren't i8*, they will be converted. If a TBAA tag is
/// specified, it will be added to the instruction. Likewise with alias.scope
/// and noalias tags.
- CallInst *CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align,
+ CallInst *CreateMemCpy(Value *Dst, unsigned DstAlign, Value *Src,
+ unsigned SrcAlign, uint64_t Size,
bool isVolatile = false, MDNode *TBAATag = nullptr,
MDNode *TBAAStructTag = nullptr,
MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr) {
- return CreateMemCpy(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag,
- TBAAStructTag, ScopeTag, NoAliasTag);
+ return CreateMemCpy(Dst, DstAlign, Src, SrcAlign, getInt64(Size),
+ isVolatile, TBAATag, TBAAStructTag, ScopeTag,
+ NoAliasTag);
}
- CallInst *CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
+ CallInst *CreateMemCpy(Value *Dst, unsigned DstAlign, Value *Src,
+ unsigned SrcAlign, Value *Size,
bool isVolatile = false, MDNode *TBAATag = nullptr,
MDNode *TBAAStructTag = nullptr,
MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
- /// \brief Create and insert an element unordered-atomic memcpy between the
+ /// Create and insert an element unordered-atomic memcpy between the
/// specified pointers.
///
/// DstAlign/SrcAlign are the alignments of the Dst/Src pointers, respectively.
@@ -459,70 +485,95 @@ public:
MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
- /// \brief Create and insert a memmove between the specified
+ /// Create and insert a memmove between the specified
/// pointers.
///
/// If the pointers aren't i8*, they will be converted. If a TBAA tag is
/// specified, it will be added to the instruction. Likewise with alias.scope
/// and noalias tags.
- CallInst *CreateMemMove(Value *Dst, Value *Src, uint64_t Size, unsigned Align,
- bool isVolatile = false, MDNode *TBAATag = nullptr,
- MDNode *ScopeTag = nullptr,
+ CallInst *CreateMemMove(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign,
+ uint64_t Size, bool isVolatile = false,
+ MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr) {
- return CreateMemMove(Dst, Src, getInt64(Size), Align, isVolatile,
+ return CreateMemMove(Dst, DstAlign, Src, SrcAlign, getInt64(Size), isVolatile,
TBAATag, ScopeTag, NoAliasTag);
}
- CallInst *CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
- bool isVolatile = false, MDNode *TBAATag = nullptr,
+ CallInst *CreateMemMove(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign,
+ Value *Size, bool isVolatile = false, MDNode *TBAATag = nullptr,
MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
- /// \brief Create a vector fadd reduction intrinsic of the source vector.
+ /// \brief Create and insert an element unordered-atomic memmove between the
+ /// specified pointers.
+ ///
+ /// DstAlign/SrcAlign are the alignments of the Dst/Src pointers,
+ /// respectively.
+ ///
+ /// If the pointers aren't i8*, they will be converted. If a TBAA tag is
+ /// specified, it will be added to the instruction. Likewise with alias.scope
+ /// and noalias tags.
+ CallInst *CreateElementUnorderedAtomicMemMove(
+ Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign,
+ uint64_t Size, uint32_t ElementSize, MDNode *TBAATag = nullptr,
+ MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr,
+ MDNode *NoAliasTag = nullptr) {
+ return CreateElementUnorderedAtomicMemMove(
+ Dst, DstAlign, Src, SrcAlign, getInt64(Size), ElementSize, TBAATag,
+ TBAAStructTag, ScopeTag, NoAliasTag);
+ }
+
+ CallInst *CreateElementUnorderedAtomicMemMove(
+ Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size,
+ uint32_t ElementSize, MDNode *TBAATag = nullptr,
+ MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr,
+ MDNode *NoAliasTag = nullptr);
+
+ /// Create a vector fadd reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value for ordered reductions.
CallInst *CreateFAddReduce(Value *Acc, Value *Src);
- /// \brief Create a vector fmul reduction intrinsic of the source vector.
+ /// Create a vector fmul reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value for ordered reductions.
CallInst *CreateFMulReduce(Value *Acc, Value *Src);
- /// \brief Create a vector int add reduction intrinsic of the source vector.
+ /// Create a vector int add reduction intrinsic of the source vector.
CallInst *CreateAddReduce(Value *Src);
- /// \brief Create a vector int mul reduction intrinsic of the source vector.
+ /// Create a vector int mul reduction intrinsic of the source vector.
CallInst *CreateMulReduce(Value *Src);
- /// \brief Create a vector int AND reduction intrinsic of the source vector.
+ /// Create a vector int AND reduction intrinsic of the source vector.
CallInst *CreateAndReduce(Value *Src);
- /// \brief Create a vector int OR reduction intrinsic of the source vector.
+ /// Create a vector int OR reduction intrinsic of the source vector.
CallInst *CreateOrReduce(Value *Src);
- /// \brief Create a vector int XOR reduction intrinsic of the source vector.
+ /// Create a vector int XOR reduction intrinsic of the source vector.
CallInst *CreateXorReduce(Value *Src);
- /// \brief Create a vector integer max reduction intrinsic of the source
+ /// Create a vector integer max reduction intrinsic of the source
/// vector.
CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false);
- /// \brief Create a vector integer min reduction intrinsic of the source
+ /// Create a vector integer min reduction intrinsic of the source
/// vector.
CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false);
- /// \brief Create a vector float max reduction intrinsic of the source
+ /// Create a vector float max reduction intrinsic of the source
/// vector.
CallInst *CreateFPMaxReduce(Value *Src, bool NoNaN = false);
- /// \brief Create a vector float min reduction intrinsic of the source
+ /// Create a vector float min reduction intrinsic of the source
/// vector.
CallInst *CreateFPMinReduce(Value *Src, bool NoNaN = false);
- /// \brief Create a lifetime.start intrinsic.
+ /// Create a lifetime.start intrinsic.
///
/// If the pointer isn't i8* it will be converted.
CallInst *CreateLifetimeStart(Value *Ptr, ConstantInt *Size = nullptr);
- /// \brief Create a lifetime.end intrinsic.
+ /// Create a lifetime.end intrinsic.
///
/// If the pointer isn't i8* it will be converted.
CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr);
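To illustrate the signature change above (separate DstAlign/SrcAlign parameters instead of a single alignment), a hedged sketch of caller code; the IRBuilder and the Dst/Src values are assumed to exist and are not part of this diff.

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    static void emitCopies(IRBuilder<> &B, Value *Dst, Value *Src) {
      // Previously: CreateMemCpy(Dst, Src, Size, Align).  The destination and
      // source alignments are now passed independently.
      B.CreateMemCpy(Dst, /*DstAlign=*/4, Src, /*SrcAlign=*/1, /*Size=*/64);
      B.CreateMemMove(Dst, /*DstAlign=*/8, Src, /*SrcAlign=*/8, /*Size=*/128);
      // Also new in this change: element-wise unordered-atomic memset.
      B.CreateElementUnorderedAtomicMemSet(Dst, B.getInt8(0), /*Size=*/64,
                                           /*Align=*/4, /*ElementSize=*/4);
    }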
@@ -532,29 +583,29 @@ public:
/// If the pointer isn't i8* it will be converted.
CallInst *CreateInvariantStart(Value *Ptr, ConstantInt *Size = nullptr);
- /// \brief Create a call to Masked Load intrinsic
+ /// Create a call to Masked Load intrinsic
CallInst *CreateMaskedLoad(Value *Ptr, unsigned Align, Value *Mask,
Value *PassThru = nullptr, const Twine &Name = "");
- /// \brief Create a call to Masked Store intrinsic
+ /// Create a call to Masked Store intrinsic
CallInst *CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align,
Value *Mask);
- /// \brief Create a call to Masked Gather intrinsic
+ /// Create a call to Masked Gather intrinsic
CallInst *CreateMaskedGather(Value *Ptrs, unsigned Align,
Value *Mask = nullptr,
Value *PassThru = nullptr,
const Twine& Name = "");
- /// \brief Create a call to Masked Scatter intrinsic
+ /// Create a call to Masked Scatter intrinsic
CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, unsigned Align,
Value *Mask = nullptr);
- /// \brief Create an assume intrinsic call that allows the optimizer to
+ /// Create an assume intrinsic call that allows the optimizer to
/// assume that the provided condition will be true.
CallInst *CreateAssumption(Value *Cond);
- /// \brief Create a call to the experimental.gc.statepoint intrinsic to
+ /// Create a call to the experimental.gc.statepoint intrinsic to
/// start a new statepoint sequence.
CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes,
Value *ActualCallee,
@@ -563,7 +614,7 @@ public:
ArrayRef<Value *> GCArgs,
const Twine &Name = "");
- /// \brief Create a call to the experimental.gc.statepoint intrinsic to
+ /// Create a call to the experimental.gc.statepoint intrinsic to
/// start a new statepoint sequence.
CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes,
Value *ActualCallee, uint32_t Flags,
@@ -573,16 +624,16 @@ public:
ArrayRef<Value *> GCArgs,
const Twine &Name = "");
- // \brief Conveninence function for the common case when CallArgs are filled
- // in using makeArrayRef(CS.arg_begin(), CS.arg_end()); Use needs to be
- // .get()'ed to get the Value pointer.
+ /// Convenience function for the common case when CallArgs are filled
+ /// in using makeArrayRef(CS.arg_begin(), CS.arg_end()); Use needs to be
+ /// .get()'ed to get the Value pointer.
CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes,
Value *ActualCallee, ArrayRef<Use> CallArgs,
ArrayRef<Value *> DeoptArgs,
ArrayRef<Value *> GCArgs,
const Twine &Name = "");
- /// brief Create an invoke to the experimental.gc.statepoint intrinsic to
+ /// Create an invoke to the experimental.gc.statepoint intrinsic to
/// start a new statepoint sequence.
InvokeInst *
CreateGCStatepointInvoke(uint64_t ID, uint32_t NumPatchBytes,
@@ -591,7 +642,7 @@ public:
ArrayRef<Value *> DeoptArgs,
ArrayRef<Value *> GCArgs, const Twine &Name = "");
- /// brief Create an invoke to the experimental.gc.statepoint intrinsic to
+ /// Create an invoke to the experimental.gc.statepoint intrinsic to
/// start a new statepoint sequence.
InvokeInst *CreateGCStatepointInvoke(
uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
@@ -610,13 +661,13 @@ public:
ArrayRef<Value *> DeoptArgs,
ArrayRef<Value *> GCArgs, const Twine &Name = "");
- /// \brief Create a call to the experimental.gc.result intrinsic to extract
+ /// Create a call to the experimental.gc.result intrinsic to extract
/// the result from a call wrapped in a statepoint.
CallInst *CreateGCResult(Instruction *Statepoint,
Type *ResultType,
const Twine &Name = "");
- /// \brief Create a call to the experimental.gc.relocate intrinsics to
+ /// Create a call to the experimental.gc.relocate intrinsics to
/// project the relocated value of one pointer from the statepoint.
CallInst *CreateGCRelocate(Instruction *Statepoint,
int BaseOffset,
@@ -630,6 +681,18 @@ public:
Value *LHS, Value *RHS,
const Twine &Name = "");
+ /// Create a call to intrinsic \p ID with no operands.
+ CallInst *CreateIntrinsic(Intrinsic::ID ID,
+ Instruction *FMFSource = nullptr,
+ const Twine &Name = "");
+
+ /// Create a call to intrinsic \p ID with 1 or more operands assuming the
+ /// intrinsic and all operands have the same type. If \p FMFSource is
+ /// provided, copy fast-math-flags from that instruction to the intrinsic.
+ CallInst *CreateIntrinsic(Intrinsic::ID ID, ArrayRef<Value *> Args,
+ Instruction *FMFSource = nullptr,
+ const Twine &Name = "");
+
/// Create call to the minnum intrinsic.
CallInst *CreateMinNum(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS, Name);
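A hedged sketch of the new CreateIntrinsic() overload that takes operands; the wrapper function, the choice of Intrinsic::sqrt, and the FMFSource argument are illustrative assumptions, not part of this diff.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    static Value *emitSqrt(IRBuilder<> &B, Value *X, Instruction *FMFSource) {
      // The intrinsic overload is resolved from the operand type of X; if
      // FMFSource is non-null, its fast-math flags are copied to the call.
      return B.CreateIntrinsic(Intrinsic::sqrt, {X}, FMFSource, "sqrt");
    }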
@@ -637,11 +700,11 @@ public:
/// Create call to the maxnum intrinsic.
CallInst *CreateMaxNum(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS, Name);
+ return CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS, Name);
}
private:
- /// \brief Create a call to a masked intrinsic with given Id.
+ /// Create a call to a masked intrinsic with given Id.
CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef<Value *> Ops,
ArrayRef<Type *> OverloadedTypes,
const Twine &Name = "");
@@ -649,7 +712,7 @@ private:
Value *getCastedInt8PtrValue(Value *Ptr);
};
-/// \brief This provides a uniform API for creating instructions and inserting
+/// This provides a uniform API for creating instructions and inserting
/// them into a basic block: either at the end of a BasicBlock, or at a specific
/// iterator location in a block.
///
@@ -677,7 +740,7 @@ public:
explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = nullptr,
ArrayRef<OperandBundleDef> OpBundles = None)
- : IRBuilderBase(C, FPMathTag, OpBundles), Folder() {}
+ : IRBuilderBase(C, FPMathTag, OpBundles) {}
explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = nullptr,
ArrayRef<OperandBundleDef> OpBundles = None)
@@ -687,13 +750,13 @@ public:
explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = nullptr,
ArrayRef<OperandBundleDef> OpBundles = None)
- : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder() {
+ : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles) {
SetInsertPoint(TheBB);
}
explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = nullptr,
ArrayRef<OperandBundleDef> OpBundles = None)
- : IRBuilderBase(IP->getContext(), FPMathTag, OpBundles), Folder() {
+ : IRBuilderBase(IP->getContext(), FPMathTag, OpBundles) {
SetInsertPoint(IP);
}
@@ -707,14 +770,14 @@ public:
IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP,
MDNode *FPMathTag = nullptr,
ArrayRef<OperandBundleDef> OpBundles = None)
- : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder() {
+ : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles) {
SetInsertPoint(TheBB, IP);
}
- /// \brief Get the constant folder being used.
+ /// Get the constant folder being used.
const T &getFolder() { return Folder; }
- /// \brief Insert and return the specified instruction.
+ /// Insert and return the specified instruction.
template<typename InstTy>
InstTy *Insert(InstTy *I, const Twine &Name = "") const {
this->InsertHelper(I, Name, BB, InsertPt);
@@ -722,7 +785,7 @@ public:
return I;
}
- /// \brief No-op overload to handle constants.
+ /// No-op overload to handle constants.
Constant *Insert(Constant *C, const Twine& = "") const {
return C;
}
@@ -732,7 +795,7 @@ public:
//===--------------------------------------------------------------------===//
private:
- /// \brief Helper to add branch weight and unpredictable metadata onto an
+ /// Helper to add branch weight and unpredictable metadata onto an
/// instruction.
/// \returns The annotated instruction.
template <typename InstTy>
@@ -745,17 +808,17 @@ private:
}
public:
- /// \brief Create a 'ret void' instruction.
+ /// Create a 'ret void' instruction.
ReturnInst *CreateRetVoid() {
return Insert(ReturnInst::Create(Context));
}
- /// \brief Create a 'ret <val>' instruction.
+ /// Create a 'ret <val>' instruction.
ReturnInst *CreateRet(Value *V) {
return Insert(ReturnInst::Create(Context, V));
}
- /// \brief Create a sequence of N insertvalue instructions,
+ /// Create a sequence of N insertvalue instructions,
/// with one Value from the retVals array each, that build a aggregate
/// return value one value at a time, and a ret instruction to return
/// the resulting aggregate value.
@@ -769,12 +832,12 @@ public:
return Insert(ReturnInst::Create(Context, V));
}
- /// \brief Create an unconditional 'br label X' instruction.
+ /// Create an unconditional 'br label X' instruction.
BranchInst *CreateBr(BasicBlock *Dest) {
return Insert(BranchInst::Create(Dest));
}
- /// \brief Create a conditional 'br Cond, TrueDest, FalseDest'
+ /// Create a conditional 'br Cond, TrueDest, FalseDest'
/// instruction.
BranchInst *CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False,
MDNode *BranchWeights = nullptr,
@@ -783,7 +846,7 @@ public:
BranchWeights, Unpredictable));
}
- /// \brief Create a conditional 'br Cond, TrueDest, FalseDest'
+ /// Create a conditional 'br Cond, TrueDest, FalseDest'
/// instruction. Copy branch meta data if available.
BranchInst *CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False,
Instruction *MDSrc) {
@@ -796,7 +859,7 @@ public:
return Insert(Br);
}
- /// \brief Create a switch instruction with the specified value, default dest,
+ /// Create a switch instruction with the specified value, default dest,
/// and with a hint for the number of cases that will be added (for efficient
/// allocation).
SwitchInst *CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases = 10,
@@ -806,14 +869,14 @@ public:
BranchWeights, Unpredictable));
}
- /// \brief Create an indirect branch instruction with the specified address
+ /// Create an indirect branch instruction with the specified address
/// operand, with an optional hint for the number of destinations that will be
/// added (for efficient allocation).
IndirectBrInst *CreateIndirectBr(Value *Addr, unsigned NumDests = 10) {
return Insert(IndirectBrInst::Create(Addr, NumDests));
}
- /// \brief Create an invoke instruction.
+ /// Create an invoke instruction.
InvokeInst *CreateInvoke(Value *Callee, BasicBlock *NormalDest,
BasicBlock *UnwindDest,
ArrayRef<Value *> Args = None,
@@ -878,150 +941,128 @@ private:
return BO;
}
- Instruction *AddFPMathAttributes(Instruction *I,
- MDNode *FPMathTag,
- FastMathFlags FMF) const {
- if (!FPMathTag)
- FPMathTag = DefaultFPMathTag;
- if (FPMathTag)
- I->setMetadata(LLVMContext::MD_fpmath, FPMathTag);
+ Instruction *setFPAttrs(Instruction *I, MDNode *FPMD,
+ FastMathFlags FMF) const {
+ if (!FPMD)
+ FPMD = DefaultFPMathTag;
+ if (FPMD)
+ I->setMetadata(LLVMContext::MD_fpmath, FPMD);
I->setFastMathFlags(FMF);
return I;
}
+ Value *foldConstant(Instruction::BinaryOps Opc, Value *L,
+ Value *R, const Twine &Name = nullptr) const {
+ auto *LC = dyn_cast<Constant>(L);
+ auto *RC = dyn_cast<Constant>(R);
+ return (LC && RC) ? Insert(Folder.CreateBinOp(Opc, LC, RC), Name) : nullptr;
+ }
+
public:
Value *CreateAdd(Value *LHS, Value *RHS, const Twine &Name = "",
bool HasNUW = false, bool HasNSW = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateAdd(LC, RC, HasNUW, HasNSW), Name);
return CreateInsertNUWNSWBinOp(Instruction::Add, LHS, RHS, Name,
HasNUW, HasNSW);
}
+
Value *CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateAdd(LHS, RHS, Name, false, true);
}
+
Value *CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateAdd(LHS, RHS, Name, true, false);
}
- Value *CreateFAdd(Value *LHS, Value *RHS, const Twine &Name = "",
- MDNode *FPMathTag = nullptr) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateFAdd(LC, RC), Name);
- return Insert(AddFPMathAttributes(BinaryOperator::CreateFAdd(LHS, RHS),
- FPMathTag, FMF), Name);
- }
+
Value *CreateSub(Value *LHS, Value *RHS, const Twine &Name = "",
bool HasNUW = false, bool HasNSW = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateSub(LC, RC, HasNUW, HasNSW), Name);
return CreateInsertNUWNSWBinOp(Instruction::Sub, LHS, RHS, Name,
HasNUW, HasNSW);
}
+
Value *CreateNSWSub(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateSub(LHS, RHS, Name, false, true);
}
+
Value *CreateNUWSub(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateSub(LHS, RHS, Name, true, false);
}
- Value *CreateFSub(Value *LHS, Value *RHS, const Twine &Name = "",
- MDNode *FPMathTag = nullptr) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateFSub(LC, RC), Name);
- return Insert(AddFPMathAttributes(BinaryOperator::CreateFSub(LHS, RHS),
- FPMathTag, FMF), Name);
- }
+
Value *CreateMul(Value *LHS, Value *RHS, const Twine &Name = "",
bool HasNUW = false, bool HasNSW = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateMul(LC, RC, HasNUW, HasNSW), Name);
return CreateInsertNUWNSWBinOp(Instruction::Mul, LHS, RHS, Name,
HasNUW, HasNSW);
}
+
Value *CreateNSWMul(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateMul(LHS, RHS, Name, false, true);
}
+
Value *CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateMul(LHS, RHS, Name, true, false);
}
- Value *CreateFMul(Value *LHS, Value *RHS, const Twine &Name = "",
- MDNode *FPMathTag = nullptr) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateFMul(LC, RC), Name);
- return Insert(AddFPMathAttributes(BinaryOperator::CreateFMul(LHS, RHS),
- FPMathTag, FMF), Name);
- }
+
Value *CreateUDiv(Value *LHS, Value *RHS, const Twine &Name = "",
bool isExact = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateUDiv(LC, RC, isExact), Name);
if (!isExact)
return Insert(BinaryOperator::CreateUDiv(LHS, RHS), Name);
return Insert(BinaryOperator::CreateExactUDiv(LHS, RHS), Name);
}
+
Value *CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateUDiv(LHS, RHS, Name, true);
}
+
Value *CreateSDiv(Value *LHS, Value *RHS, const Twine &Name = "",
bool isExact = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateSDiv(LC, RC, isExact), Name);
if (!isExact)
return Insert(BinaryOperator::CreateSDiv(LHS, RHS), Name);
return Insert(BinaryOperator::CreateExactSDiv(LHS, RHS), Name);
}
+
Value *CreateExactSDiv(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateSDiv(LHS, RHS, Name, true);
}
- Value *CreateFDiv(Value *LHS, Value *RHS, const Twine &Name = "",
- MDNode *FPMathTag = nullptr) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateFDiv(LC, RC), Name);
- return Insert(AddFPMathAttributes(BinaryOperator::CreateFDiv(LHS, RHS),
- FPMathTag, FMF), Name);
- }
+
Value *CreateURem(Value *LHS, Value *RHS, const Twine &Name = "") {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateURem(LC, RC), Name);
+ if (Value *V = foldConstant(Instruction::URem, LHS, RHS, Name)) return V;
return Insert(BinaryOperator::CreateURem(LHS, RHS), Name);
}
+
Value *CreateSRem(Value *LHS, Value *RHS, const Twine &Name = "") {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateSRem(LC, RC), Name);
+ if (Value *V = foldConstant(Instruction::SRem, LHS, RHS, Name)) return V;
return Insert(BinaryOperator::CreateSRem(LHS, RHS), Name);
}
- Value *CreateFRem(Value *LHS, Value *RHS, const Twine &Name = "",
- MDNode *FPMathTag = nullptr) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateFRem(LC, RC), Name);
- return Insert(AddFPMathAttributes(BinaryOperator::CreateFRem(LHS, RHS),
- FPMathTag, FMF), Name);
- }
Value *CreateShl(Value *LHS, Value *RHS, const Twine &Name = "",
bool HasNUW = false, bool HasNSW = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateShl(LC, RC, HasNUW, HasNSW), Name);
return CreateInsertNUWNSWBinOp(Instruction::Shl, LHS, RHS, Name,
HasNUW, HasNSW);
}
+
Value *CreateShl(Value *LHS, const APInt &RHS, const Twine &Name = "",
bool HasNUW = false, bool HasNSW = false) {
return CreateShl(LHS, ConstantInt::get(LHS->getType(), RHS), Name,
HasNUW, HasNSW);
}
+
Value *CreateShl(Value *LHS, uint64_t RHS, const Twine &Name = "",
bool HasNUW = false, bool HasNSW = false) {
return CreateShl(LHS, ConstantInt::get(LHS->getType(), RHS), Name,
@@ -1030,17 +1071,19 @@ public:
Value *CreateLShr(Value *LHS, Value *RHS, const Twine &Name = "",
bool isExact = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateLShr(LC, RC, isExact), Name);
if (!isExact)
return Insert(BinaryOperator::CreateLShr(LHS, RHS), Name);
return Insert(BinaryOperator::CreateExactLShr(LHS, RHS), Name);
}
+
Value *CreateLShr(Value *LHS, const APInt &RHS, const Twine &Name = "",
bool isExact = false) {
return CreateLShr(LHS, ConstantInt::get(LHS->getType(), RHS), Name, isExact);
}
+
Value *CreateLShr(Value *LHS, uint64_t RHS, const Twine &Name = "",
bool isExact = false) {
return CreateLShr(LHS, ConstantInt::get(LHS->getType(), RHS), Name, isExact);
@@ -1048,103 +1091,196 @@ public:
Value *CreateAShr(Value *LHS, Value *RHS, const Twine &Name = "",
bool isExact = false) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateAShr(LC, RC, isExact), Name);
if (!isExact)
return Insert(BinaryOperator::CreateAShr(LHS, RHS), Name);
return Insert(BinaryOperator::CreateExactAShr(LHS, RHS), Name);
}
+
Value *CreateAShr(Value *LHS, const APInt &RHS, const Twine &Name = "",
bool isExact = false) {
return CreateAShr(LHS, ConstantInt::get(LHS->getType(), RHS), Name, isExact);
}
+
Value *CreateAShr(Value *LHS, uint64_t RHS, const Twine &Name = "",
bool isExact = false) {
return CreateAShr(LHS, ConstantInt::get(LHS->getType(), RHS), Name, isExact);
}
Value *CreateAnd(Value *LHS, Value *RHS, const Twine &Name = "") {
- if (Constant *RC = dyn_cast<Constant>(RHS)) {
+ if (auto *RC = dyn_cast<Constant>(RHS)) {
if (isa<ConstantInt>(RC) && cast<ConstantInt>(RC)->isMinusOne())
return LHS; // LHS & -1 -> LHS
- if (Constant *LC = dyn_cast<Constant>(LHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
return Insert(Folder.CreateAnd(LC, RC), Name);
}
return Insert(BinaryOperator::CreateAnd(LHS, RHS), Name);
}
+
Value *CreateAnd(Value *LHS, const APInt &RHS, const Twine &Name = "") {
return CreateAnd(LHS, ConstantInt::get(LHS->getType(), RHS), Name);
}
+
Value *CreateAnd(Value *LHS, uint64_t RHS, const Twine &Name = "") {
return CreateAnd(LHS, ConstantInt::get(LHS->getType(), RHS), Name);
}
Value *CreateOr(Value *LHS, Value *RHS, const Twine &Name = "") {
- if (Constant *RC = dyn_cast<Constant>(RHS)) {
+ if (auto *RC = dyn_cast<Constant>(RHS)) {
if (RC->isNullValue())
return LHS; // LHS | 0 -> LHS
- if (Constant *LC = dyn_cast<Constant>(LHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
return Insert(Folder.CreateOr(LC, RC), Name);
}
return Insert(BinaryOperator::CreateOr(LHS, RHS), Name);
}
+
Value *CreateOr(Value *LHS, const APInt &RHS, const Twine &Name = "") {
return CreateOr(LHS, ConstantInt::get(LHS->getType(), RHS), Name);
}
+
Value *CreateOr(Value *LHS, uint64_t RHS, const Twine &Name = "") {
return CreateOr(LHS, ConstantInt::get(LHS->getType(), RHS), Name);
}
Value *CreateXor(Value *LHS, Value *RHS, const Twine &Name = "") {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateXor(LC, RC), Name);
+ if (Value *V = foldConstant(Instruction::Xor, LHS, RHS, Name)) return V;
return Insert(BinaryOperator::CreateXor(LHS, RHS), Name);
}
+
Value *CreateXor(Value *LHS, const APInt &RHS, const Twine &Name = "") {
return CreateXor(LHS, ConstantInt::get(LHS->getType(), RHS), Name);
}
+
Value *CreateXor(Value *LHS, uint64_t RHS, const Twine &Name = "") {
return CreateXor(LHS, ConstantInt::get(LHS->getType(), RHS), Name);
}
+ Value *CreateFAdd(Value *L, Value *R, const Twine &Name = "",
+ MDNode *FPMD = nullptr) {
+ if (Value *V = foldConstant(Instruction::FAdd, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), FPMD, FMF);
+ return Insert(I, Name);
+ }
+
+ /// Copy fast-math-flags from an instruction rather than using the builder's
+ /// default FMF.
+ Value *CreateFAddFMF(Value *L, Value *R, Instruction *FMFSource,
+ const Twine &Name = "") {
+ if (Value *V = foldConstant(Instruction::FAdd, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), nullptr,
+ FMFSource->getFastMathFlags());
+ return Insert(I, Name);
+ }
+
+ Value *CreateFSub(Value *L, Value *R, const Twine &Name = "",
+ MDNode *FPMD = nullptr) {
+ if (Value *V = foldConstant(Instruction::FSub, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFSub(L, R), FPMD, FMF);
+ return Insert(I, Name);
+ }
+
+ /// Copy fast-math-flags from an instruction rather than using the builder's
+ /// default FMF.
+ Value *CreateFSubFMF(Value *L, Value *R, Instruction *FMFSource,
+ const Twine &Name = "") {
+ if (Value *V = foldConstant(Instruction::FSub, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFSub(L, R), nullptr,
+ FMFSource->getFastMathFlags());
+ return Insert(I, Name);
+ }
+
+ Value *CreateFMul(Value *L, Value *R, const Twine &Name = "",
+ MDNode *FPMD = nullptr) {
+ if (Value *V = foldConstant(Instruction::FMul, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFMul(L, R), FPMD, FMF);
+ return Insert(I, Name);
+ }
+
+ /// Copy fast-math-flags from an instruction rather than using the builder's
+ /// default FMF.
+ Value *CreateFMulFMF(Value *L, Value *R, Instruction *FMFSource,
+ const Twine &Name = "") {
+ if (Value *V = foldConstant(Instruction::FMul, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFMul(L, R), nullptr,
+ FMFSource->getFastMathFlags());
+ return Insert(I, Name);
+ }
+
+ Value *CreateFDiv(Value *L, Value *R, const Twine &Name = "",
+ MDNode *FPMD = nullptr) {
+ if (Value *V = foldConstant(Instruction::FDiv, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFDiv(L, R), FPMD, FMF);
+ return Insert(I, Name);
+ }
+
+ /// Copy fast-math-flags from an instruction rather than using the builder's
+ /// default FMF.
+ Value *CreateFDivFMF(Value *L, Value *R, Instruction *FMFSource,
+ const Twine &Name = "") {
+ if (Value *V = foldConstant(Instruction::FDiv, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFDiv(L, R), nullptr,
+ FMFSource->getFastMathFlags());
+ return Insert(I, Name);
+ }
+
+ Value *CreateFRem(Value *L, Value *R, const Twine &Name = "",
+ MDNode *FPMD = nullptr) {
+ if (Value *V = foldConstant(Instruction::FRem, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFRem(L, R), FPMD, FMF);
+ return Insert(I, Name);
+ }
+
+ /// Copy fast-math-flags from an instruction rather than using the builder's
+ /// default FMF.
+ Value *CreateFRemFMF(Value *L, Value *R, Instruction *FMFSource,
+ const Twine &Name = "") {
+ if (Value *V = foldConstant(Instruction::FRem, L, R, Name)) return V;
+ Instruction *I = setFPAttrs(BinaryOperator::CreateFRem(L, R), nullptr,
+ FMFSource->getFastMathFlags());
+ return Insert(I, Name);
+ }
+
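// NOTE (editor): illustrative sketch, not part of this diff. It shows how the
// new *FMF overloads added above take fast-math flags from an existing
// instruction instead of the builder's defaults. An IRBuilder<> `B` positioned
// inside a function and a floating-point instruction `FMFSource` are assumed;
// all names here are hypothetical.
//
//   #include "llvm/IR/IRBuilder.h"
//   using namespace llvm;
//
//   static Value *emitScaledSum(IRBuilder<> &B, Value *X, Value *Y,
//                               Instruction *FMFSource) {
//     // Both results inherit FMFSource's flags (e.g. 'fast'), regardless of
//     // whatever B.setFastMathFlags() was previously given.
//     Value *Sum = B.CreateFAddFMF(X, Y, FMFSource, "sum");
//     return B.CreateFMulFMF(Sum, X, FMFSource, "scaled");
//   }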
Value *CreateBinOp(Instruction::BinaryOps Opc,
Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
- return Insert(Folder.CreateBinOp(Opc, LC, RC), Name);
+ if (Value *V = foldConstant(Opc, LHS, RHS, Name)) return V;
Instruction *BinOp = BinaryOperator::Create(Opc, LHS, RHS);
if (isa<FPMathOperator>(BinOp))
- BinOp = AddFPMathAttributes(BinOp, FPMathTag, FMF);
+ BinOp = setFPAttrs(BinOp, FPMathTag, FMF);
return Insert(BinOp, Name);
}
Value *CreateNeg(Value *V, const Twine &Name = "",
bool HasNUW = false, bool HasNSW = false) {
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateNeg(VC, HasNUW, HasNSW), Name);
BinaryOperator *BO = Insert(BinaryOperator::CreateNeg(V), Name);
if (HasNUW) BO->setHasNoUnsignedWrap();
if (HasNSW) BO->setHasNoSignedWrap();
return BO;
}
+
Value *CreateNSWNeg(Value *V, const Twine &Name = "") {
return CreateNeg(V, Name, false, true);
}
+
Value *CreateNUWNeg(Value *V, const Twine &Name = "") {
return CreateNeg(V, Name, true, false);
}
+
Value *CreateFNeg(Value *V, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateFNeg(VC), Name);
- return Insert(AddFPMathAttributes(BinaryOperator::CreateFNeg(V),
- FPMathTag, FMF), Name);
+ return Insert(setFPAttrs(BinaryOperator::CreateFNeg(V), FPMathTag, FMF),
+ Name);
}
+
Value *CreateNot(Value *V, const Twine &Name = "") {
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateNot(VC), Name);
return Insert(BinaryOperator::CreateNot(V), Name);
}
@@ -1163,26 +1299,32 @@ public:
const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
return Insert(new AllocaInst(Ty, DL.getAllocaAddrSpace(), ArraySize), Name);
}
- // \brief Provided to resolve 'CreateLoad(Ptr, "...")' correctly, instead of
- // converting the string to 'bool' for the isVolatile parameter.
+
+ /// Provided to resolve 'CreateLoad(Ptr, "...")' correctly, instead of
+ /// converting the string to 'bool' for the isVolatile parameter.
LoadInst *CreateLoad(Value *Ptr, const char *Name) {
return Insert(new LoadInst(Ptr), Name);
}
+
LoadInst *CreateLoad(Value *Ptr, const Twine &Name = "") {
return Insert(new LoadInst(Ptr), Name);
}
+
LoadInst *CreateLoad(Type *Ty, Value *Ptr, const Twine &Name = "") {
return Insert(new LoadInst(Ty, Ptr), Name);
}
+
LoadInst *CreateLoad(Value *Ptr, bool isVolatile, const Twine &Name = "") {
return Insert(new LoadInst(Ptr, nullptr, isVolatile), Name);
}
+
StoreInst *CreateStore(Value *Val, Value *Ptr, bool isVolatile = false) {
return Insert(new StoreInst(Val, Ptr, isVolatile));
}
- // \brief Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")'
- // correctly, instead of converting the string to 'bool' for the isVolatile
- // parameter.
+
+ /// Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")'
+ /// correctly, instead of converting the string to 'bool' for the isVolatile
+ /// parameter.
LoadInst *CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name) {
LoadInst *LI = CreateLoad(Ptr, Name);
LI->setAlignment(Align);
@@ -1200,17 +1342,20 @@ public:
LI->setAlignment(Align);
return LI;
}
+
StoreInst *CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align,
bool isVolatile = false) {
StoreInst *SI = CreateStore(Val, Ptr, isVolatile);
SI->setAlignment(Align);
return SI;
}
+
FenceInst *CreateFence(AtomicOrdering Ordering,
SyncScope::ID SSID = SyncScope::System,
const Twine &Name = "") {
return Insert(new FenceInst(Context, Ordering, SSID), Name);
}
+
AtomicCmpXchgInst *
CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New,
AtomicOrdering SuccessOrdering,
@@ -1219,18 +1364,21 @@ public:
return Insert(new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering,
FailureOrdering, SSID));
}
+
AtomicRMWInst *CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val,
AtomicOrdering Ordering,
SyncScope::ID SSID = SyncScope::System) {
return Insert(new AtomicRMWInst(Op, Ptr, Val, Ordering, SSID));
}
+
Value *CreateGEP(Value *Ptr, ArrayRef<Value *> IdxList,
const Twine &Name = "") {
return CreateGEP(nullptr, Ptr, IdxList, Name);
}
+
Value *CreateGEP(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
const Twine &Name = "") {
- if (Constant *PC = dyn_cast<Constant>(Ptr)) {
+ if (auto *PC = dyn_cast<Constant>(Ptr)) {
// Every index must be constant.
size_t i, e;
for (i = 0, e = IdxList.size(); i != e; ++i)
@@ -1241,13 +1389,15 @@ public:
}
return Insert(GetElementPtrInst::Create(Ty, Ptr, IdxList), Name);
}
+
Value *CreateInBoundsGEP(Value *Ptr, ArrayRef<Value *> IdxList,
const Twine &Name = "") {
return CreateInBoundsGEP(nullptr, Ptr, IdxList, Name);
}
+
Value *CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
const Twine &Name = "") {
- if (Constant *PC = dyn_cast<Constant>(Ptr)) {
+ if (auto *PC = dyn_cast<Constant>(Ptr)) {
// Every index must be constant.
size_t i, e;
for (i = 0, e = IdxList.size(); i != e; ++i)
@@ -1259,43 +1409,50 @@ public:
}
return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, IdxList), Name);
}
+
Value *CreateGEP(Value *Ptr, Value *Idx, const Twine &Name = "") {
return CreateGEP(nullptr, Ptr, Idx, Name);
}
+
Value *CreateGEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = "") {
- if (Constant *PC = dyn_cast<Constant>(Ptr))
- if (Constant *IC = dyn_cast<Constant>(Idx))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
+ if (auto *IC = dyn_cast<Constant>(Idx))
return Insert(Folder.CreateGetElementPtr(Ty, PC, IC), Name);
return Insert(GetElementPtrInst::Create(Ty, Ptr, Idx), Name);
}
+
Value *CreateInBoundsGEP(Type *Ty, Value *Ptr, Value *Idx,
const Twine &Name = "") {
- if (Constant *PC = dyn_cast<Constant>(Ptr))
- if (Constant *IC = dyn_cast<Constant>(Idx))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
+ if (auto *IC = dyn_cast<Constant>(Idx))
return Insert(Folder.CreateInBoundsGetElementPtr(Ty, PC, IC), Name);
return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, Idx), Name);
}
+
Value *CreateConstGEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = "") {
return CreateConstGEP1_32(nullptr, Ptr, Idx0, Name);
}
+
Value *CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0,
const Twine &Name = "") {
Value *Idx = ConstantInt::get(Type::getInt32Ty(Context), Idx0);
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateGetElementPtr(Ty, PC, Idx), Name);
return Insert(GetElementPtrInst::Create(Ty, Ptr, Idx), Name);
}
+
Value *CreateConstInBoundsGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0,
const Twine &Name = "") {
Value *Idx = ConstantInt::get(Type::getInt32Ty(Context), Idx0);
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateInBoundsGetElementPtr(Ty, PC, Idx), Name);
return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, Idx), Name);
}
+
Value *CreateConstGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1,
const Twine &Name = "") {
Value *Idxs[] = {
@@ -1303,11 +1460,12 @@ public:
ConstantInt::get(Type::getInt32Ty(Context), Idx1)
};
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateGetElementPtr(Ty, PC, Idxs), Name);
return Insert(GetElementPtrInst::Create(Ty, Ptr, Idxs), Name);
}
+
Value *CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0,
unsigned Idx1, const Twine &Name = "") {
Value *Idxs[] = {
@@ -1315,28 +1473,31 @@ public:
ConstantInt::get(Type::getInt32Ty(Context), Idx1)
};
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateInBoundsGetElementPtr(Ty, PC, Idxs), Name);
return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, Idxs), Name);
}
+
Value *CreateConstGEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = "") {
Value *Idx = ConstantInt::get(Type::getInt64Ty(Context), Idx0);
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateGetElementPtr(nullptr, PC, Idx), Name);
return Insert(GetElementPtrInst::Create(nullptr, Ptr, Idx), Name);
}
+
Value *CreateConstInBoundsGEP1_64(Value *Ptr, uint64_t Idx0,
const Twine &Name = "") {
Value *Idx = ConstantInt::get(Type::getInt64Ty(Context), Idx0);
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateInBoundsGetElementPtr(nullptr, PC, Idx), Name);
return Insert(GetElementPtrInst::CreateInBounds(nullptr, Ptr, Idx), Name);
}
+
Value *CreateConstGEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1,
const Twine &Name = "") {
Value *Idxs[] = {
@@ -1344,11 +1505,12 @@ public:
ConstantInt::get(Type::getInt64Ty(Context), Idx1)
};
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateGetElementPtr(nullptr, PC, Idxs), Name);
return Insert(GetElementPtrInst::Create(nullptr, Ptr, Idxs), Name);
}
+
Value *CreateConstInBoundsGEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1,
const Twine &Name = "") {
Value *Idxs[] = {
@@ -1356,25 +1518,31 @@ public:
ConstantInt::get(Type::getInt64Ty(Context), Idx1)
};
- if (Constant *PC = dyn_cast<Constant>(Ptr))
+ if (auto *PC = dyn_cast<Constant>(Ptr))
return Insert(Folder.CreateInBoundsGetElementPtr(nullptr, PC, Idxs),
Name);
return Insert(GetElementPtrInst::CreateInBounds(nullptr, Ptr, Idxs), Name);
}
+
Value *CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx,
const Twine &Name = "") {
return CreateConstInBoundsGEP2_32(Ty, Ptr, 0, Idx, Name);
}
- /// \brief Same as CreateGlobalString, but return a pointer with "i8*" type
+ Value *CreateStructGEP(Value *Ptr, unsigned Idx, const Twine &Name = "") {
+ return CreateConstInBoundsGEP2_32(nullptr, Ptr, 0, Idx, Name);
+ }
+
+ /// Same as CreateGlobalString, but return a pointer with "i8*" type
/// instead of a pointer to array of i8.
- Value *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "",
- unsigned AddressSpace = 0) {
- GlobalVariable *gv = CreateGlobalString(Str, Name, AddressSpace);
- Value *zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
- Value *Args[] = { zero, zero };
- return CreateInBoundsGEP(gv->getValueType(), gv, Args, Name);
+ Constant *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "",
+ unsigned AddressSpace = 0) {
+ GlobalVariable *GV = CreateGlobalString(Str, Name, AddressSpace);
+ Constant *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Constant *Indices[] = {Zero, Zero};
+ return ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV,
+ Indices);
}
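// NOTE (editor): illustrative sketch, not part of this diff. With this change
// the decayed string pointer is a Constant*, so it can be used in constant
// initializers as well as passed to calls. Assumes an IRBuilder<> `B` already
// positioned inside a function (CreateGlobalString needs the enclosing
// module); names are hypothetical.
//
//   #include "llvm/IR/IRBuilder.h"
//   using namespace llvm;
//
//   static Constant *emitGreeting(IRBuilder<> &B) {
//     // Creates a private global holding "hello\n" and returns it as an i8*.
//     return B.CreateGlobalStringPtr("hello\n", "greeting");
//   }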
//===--------------------------------------------------------------------===//
@@ -1384,13 +1552,16 @@ public:
Value *CreateTrunc(Value *V, Type *DestTy, const Twine &Name = "") {
return CreateCast(Instruction::Trunc, V, DestTy, Name);
}
+
Value *CreateZExt(Value *V, Type *DestTy, const Twine &Name = "") {
return CreateCast(Instruction::ZExt, V, DestTy, Name);
}
+
Value *CreateSExt(Value *V, Type *DestTy, const Twine &Name = "") {
return CreateCast(Instruction::SExt, V, DestTy, Name);
}
- /// \brief Create a ZExt or Trunc from the integer value V to DestTy. Return
+
+ /// Create a ZExt or Trunc from the integer value V to DestTy. Return
/// the value untouched if the type of V is already DestTy.
Value *CreateZExtOrTrunc(Value *V, Type *DestTy,
const Twine &Name = "") {
@@ -1404,7 +1575,8 @@ public:
return CreateTrunc(V, DestTy, Name);
return V;
}
- /// \brief Create a SExt or Trunc from the integer value V to DestTy. Return
+
+ /// Create a SExt or Trunc from the integer value V to DestTy. Return
/// the value untouched if the type of V is already DestTy.
Value *CreateSExtOrTrunc(Value *V, Type *DestTy,
const Twine &Name = "") {
@@ -1418,78 +1590,93 @@ public:
return CreateTrunc(V, DestTy, Name);
return V;
}
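// NOTE (editor): illustrative sketch, not part of this diff. It exercises the
// two helpers documented above, which pick zext/sext, trunc, or nothing based
// on the relative bit widths, so callers never have to compare widths
// themselves. Assumes an IRBuilder<> `B` in a function; names are hypothetical.
//
//   #include "llvm/IR/IRBuilder.h"
//   using namespace llvm;
//
//   static Value *toI32(IRBuilder<> &B, Value *V) {
//     // zext if V is narrower than i32, trunc if wider, V unchanged if i32.
//     return B.CreateZExtOrTrunc(V, B.getInt32Ty(), "as.i32");
//   }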
+
Value *CreateFPToUI(Value *V, Type *DestTy, const Twine &Name = ""){
return CreateCast(Instruction::FPToUI, V, DestTy, Name);
}
+
Value *CreateFPToSI(Value *V, Type *DestTy, const Twine &Name = ""){
return CreateCast(Instruction::FPToSI, V, DestTy, Name);
}
+
Value *CreateUIToFP(Value *V, Type *DestTy, const Twine &Name = ""){
return CreateCast(Instruction::UIToFP, V, DestTy, Name);
}
+
Value *CreateSIToFP(Value *V, Type *DestTy, const Twine &Name = ""){
return CreateCast(Instruction::SIToFP, V, DestTy, Name);
}
+
Value *CreateFPTrunc(Value *V, Type *DestTy,
const Twine &Name = "") {
return CreateCast(Instruction::FPTrunc, V, DestTy, Name);
}
+
Value *CreateFPExt(Value *V, Type *DestTy, const Twine &Name = "") {
return CreateCast(Instruction::FPExt, V, DestTy, Name);
}
+
Value *CreatePtrToInt(Value *V, Type *DestTy,
const Twine &Name = "") {
return CreateCast(Instruction::PtrToInt, V, DestTy, Name);
}
+
Value *CreateIntToPtr(Value *V, Type *DestTy,
const Twine &Name = "") {
return CreateCast(Instruction::IntToPtr, V, DestTy, Name);
}
+
Value *CreateBitCast(Value *V, Type *DestTy,
const Twine &Name = "") {
return CreateCast(Instruction::BitCast, V, DestTy, Name);
}
+
Value *CreateAddrSpaceCast(Value *V, Type *DestTy,
const Twine &Name = "") {
return CreateCast(Instruction::AddrSpaceCast, V, DestTy, Name);
}
+
Value *CreateZExtOrBitCast(Value *V, Type *DestTy,
const Twine &Name = "") {
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateZExtOrBitCast(VC, DestTy), Name);
return Insert(CastInst::CreateZExtOrBitCast(V, DestTy), Name);
}
+
Value *CreateSExtOrBitCast(Value *V, Type *DestTy,
const Twine &Name = "") {
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateSExtOrBitCast(VC, DestTy), Name);
return Insert(CastInst::CreateSExtOrBitCast(V, DestTy), Name);
}
+
Value *CreateTruncOrBitCast(Value *V, Type *DestTy,
const Twine &Name = "") {
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateTruncOrBitCast(VC, DestTy), Name);
return Insert(CastInst::CreateTruncOrBitCast(V, DestTy), Name);
}
+
Value *CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy,
const Twine &Name = "") {
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateCast(Op, VC, DestTy), Name);
return Insert(CastInst::Create(Op, V, DestTy), Name);
}
+
Value *CreatePointerCast(Value *V, Type *DestTy,
const Twine &Name = "") {
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreatePointerCast(VC, DestTy), Name);
return Insert(CastInst::CreatePointerCast(V, DestTy), Name);
}
@@ -1499,7 +1686,7 @@ public:
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V)) {
+ if (auto *VC = dyn_cast<Constant>(V)) {
return Insert(Folder.CreatePointerBitCastOrAddrSpaceCast(VC, DestTy),
Name);
}
@@ -1512,7 +1699,7 @@ public:
const Twine &Name = "") {
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateIntCast(VC, DestTy, isSigned), Name);
return Insert(CastInst::CreateIntegerCast(V, DestTy, isSigned), Name);
}
@@ -1529,16 +1716,15 @@ public:
return CreateBitCast(V, DestTy, Name);
}
-public:
Value *CreateFPCast(Value *V, Type *DestTy, const Twine &Name = "") {
if (V->getType() == DestTy)
return V;
- if (Constant *VC = dyn_cast<Constant>(V))
+ if (auto *VC = dyn_cast<Constant>(V))
return Insert(Folder.CreateFPCast(VC, DestTy), Name);
return Insert(CastInst::CreateFPCast(V, DestTy), Name);
}
- // \brief Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a
+ // Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a
// compile time error, instead of converting the string to bool for the
// isSigned parameter.
Value *CreateIntCast(Value *, Type *, const char *) = delete;
@@ -1550,30 +1736,39 @@ public:
Value *CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_EQ, LHS, RHS, Name);
}
+
Value *CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_NE, LHS, RHS, Name);
}
+
Value *CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_UGT, LHS, RHS, Name);
}
+
Value *CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_UGE, LHS, RHS, Name);
}
+
Value *CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_ULT, LHS, RHS, Name);
}
+
Value *CreateICmpULE(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_ULE, LHS, RHS, Name);
}
+
Value *CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_SGT, LHS, RHS, Name);
}
+
Value *CreateICmpSGE(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_SGE, LHS, RHS, Name);
}
+
Value *CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_SLT, LHS, RHS, Name);
}
+
Value *CreateICmpSLE(Value *LHS, Value *RHS, const Twine &Name = "") {
return CreateICmp(ICmpInst::ICMP_SLE, LHS, RHS, Name);
}
@@ -1582,54 +1777,67 @@ public:
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_OEQ, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpOGT(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_OGT, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_OGE, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_OLT, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpOLE(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_OLE, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpONE(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_ONE, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpORD(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_ORD, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpUNO(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_UNO, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpUEQ(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_UEQ, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpUGT(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_UGT, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpUGE(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_UGE, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpULT(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_ULT, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpULE(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_ULE, LHS, RHS, Name, FPMathTag);
}
+
Value *CreateFCmpUNE(Value *LHS, Value *RHS, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
return CreateFCmp(FCmpInst::FCMP_UNE, LHS, RHS, Name, FPMathTag);
@@ -1637,18 +1845,18 @@ public:
Value *CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS,
const Twine &Name = "") {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateICmp(P, LC, RC), Name);
return Insert(new ICmpInst(P, LHS, RHS), Name);
}
+
Value *CreateFCmp(CmpInst::Predicate P, Value *LHS, Value *RHS,
const Twine &Name = "", MDNode *FPMathTag = nullptr) {
- if (Constant *LC = dyn_cast<Constant>(LHS))
- if (Constant *RC = dyn_cast<Constant>(RHS))
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateFCmp(P, LC, RC), Name);
- return Insert(AddFPMathAttributes(new FCmpInst(P, LHS, RHS),
- FPMathTag, FMF), Name);
+ return Insert(setFPAttrs(new FCmpInst(P, LHS, RHS), FPMathTag, FMF), Name);
}
//===--------------------------------------------------------------------===//
@@ -1662,8 +1870,8 @@ public:
CallInst *CreateCall(Value *Callee, ArrayRef<Value *> Args = None,
const Twine &Name = "", MDNode *FPMathTag = nullptr) {
- PointerType *PTy = cast<PointerType>(Callee->getType());
- FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ auto *PTy = cast<PointerType>(Callee->getType());
+ auto *FTy = cast<FunctionType>(PTy->getElementType());
return CreateCall(FTy, Callee, Args, Name, FPMathTag);
}
@@ -1672,7 +1880,7 @@ public:
MDNode *FPMathTag = nullptr) {
CallInst *CI = CallInst::Create(FTy, Callee, Args, DefaultOperandBundles);
if (isa<FPMathOperator>(CI))
- CI = cast<CallInst>(AddFPMathAttributes(CI, FPMathTag, FMF));
+ CI = cast<CallInst>(setFPAttrs(CI, FPMathTag, FMF));
return Insert(CI, Name);
}
@@ -1681,7 +1889,7 @@ public:
const Twine &Name = "", MDNode *FPMathTag = nullptr) {
CallInst *CI = CallInst::Create(Callee, Args, OpBundles);
if (isa<FPMathOperator>(CI))
- CI = cast<CallInst>(AddFPMathAttributes(CI, FPMathTag, FMF));
+ CI = cast<CallInst>(setFPAttrs(CI, FPMathTag, FMF));
return Insert(CI, Name);
}
@@ -1692,9 +1900,9 @@ public:
Value *CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name = "", Instruction *MDFrom = nullptr) {
- if (Constant *CC = dyn_cast<Constant>(C))
- if (Constant *TC = dyn_cast<Constant>(True))
- if (Constant *FC = dyn_cast<Constant>(False))
+ if (auto *CC = dyn_cast<Constant>(C))
+ if (auto *TC = dyn_cast<Constant>(True))
+ if (auto *FC = dyn_cast<Constant>(False))
return Insert(Folder.CreateSelect(CC, TC, FC), Name);
SelectInst *Sel = SelectInst::Create(C, True, False);
@@ -1712,8 +1920,8 @@ public:
Value *CreateExtractElement(Value *Vec, Value *Idx,
const Twine &Name = "") {
- if (Constant *VC = dyn_cast<Constant>(Vec))
- if (Constant *IC = dyn_cast<Constant>(Idx))
+ if (auto *VC = dyn_cast<Constant>(Vec))
+ if (auto *IC = dyn_cast<Constant>(Idx))
return Insert(Folder.CreateExtractElement(VC, IC), Name);
return Insert(ExtractElementInst::Create(Vec, Idx), Name);
}
@@ -1725,9 +1933,9 @@ public:
Value *CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx,
const Twine &Name = "") {
- if (Constant *VC = dyn_cast<Constant>(Vec))
- if (Constant *NC = dyn_cast<Constant>(NewElt))
- if (Constant *IC = dyn_cast<Constant>(Idx))
+ if (auto *VC = dyn_cast<Constant>(Vec))
+ if (auto *NC = dyn_cast<Constant>(NewElt))
+ if (auto *IC = dyn_cast<Constant>(Idx))
return Insert(Folder.CreateInsertElement(VC, NC, IC), Name);
return Insert(InsertElementInst::Create(Vec, NewElt, Idx), Name);
}
@@ -1739,9 +1947,9 @@ public:
Value *CreateShuffleVector(Value *V1, Value *V2, Value *Mask,
const Twine &Name = "") {
- if (Constant *V1C = dyn_cast<Constant>(V1))
- if (Constant *V2C = dyn_cast<Constant>(V2))
- if (Constant *MC = dyn_cast<Constant>(Mask))
+ if (auto *V1C = dyn_cast<Constant>(V1))
+ if (auto *V2C = dyn_cast<Constant>(V2))
+ if (auto *MC = dyn_cast<Constant>(Mask))
return Insert(Folder.CreateShuffleVector(V1C, V2C, MC), Name);
return Insert(new ShuffleVectorInst(V1, V2, Mask), Name);
}
@@ -1755,7 +1963,7 @@ public:
Value *CreateExtractValue(Value *Agg,
ArrayRef<unsigned> Idxs,
const Twine &Name = "") {
- if (Constant *AggC = dyn_cast<Constant>(Agg))
+ if (auto *AggC = dyn_cast<Constant>(Agg))
return Insert(Folder.CreateExtractValue(AggC, Idxs), Name);
return Insert(ExtractValueInst::Create(Agg, Idxs), Name);
}
@@ -1763,8 +1971,8 @@ public:
Value *CreateInsertValue(Value *Agg, Value *Val,
ArrayRef<unsigned> Idxs,
const Twine &Name = "") {
- if (Constant *AggC = dyn_cast<Constant>(Agg))
- if (Constant *ValC = dyn_cast<Constant>(Val))
+ if (auto *AggC = dyn_cast<Constant>(Agg))
+ if (auto *ValC = dyn_cast<Constant>(Val))
return Insert(Folder.CreateInsertValue(AggC, ValC, Idxs), Name);
return Insert(InsertValueInst::Create(Agg, Val, Idxs), Name);
}
@@ -1778,19 +1986,19 @@ public:
// Utility creation methods
//===--------------------------------------------------------------------===//
- /// \brief Return an i1 value testing if \p Arg is null.
+ /// Return an i1 value testing if \p Arg is null.
Value *CreateIsNull(Value *Arg, const Twine &Name = "") {
return CreateICmpEQ(Arg, Constant::getNullValue(Arg->getType()),
Name);
}
- /// \brief Return an i1 value testing if \p Arg is not null.
+ /// Return an i1 value testing if \p Arg is not null.
Value *CreateIsNotNull(Value *Arg, const Twine &Name = "") {
return CreateICmpNE(Arg, Constant::getNullValue(Arg->getType()),
Name);
}
- /// \brief Return the i64 difference between two pointer values, dividing out
+ /// Return the i64 difference between two pointer values, dividing out
/// the size of the pointed-to objects.
///
/// This is intended to implement C-style pointer subtraction. As such, the
@@ -1799,7 +2007,7 @@ public:
Value *CreatePtrDiff(Value *LHS, Value *RHS, const Twine &Name = "") {
assert(LHS->getType() == RHS->getType() &&
"Pointer subtraction operand types must match!");
- PointerType *ArgType = cast<PointerType>(LHS->getType());
+ auto *ArgType = cast<PointerType>(LHS->getType());
Value *LHS_int = CreatePtrToInt(LHS, Type::getInt64Ty(Context));
Value *RHS_int = CreatePtrToInt(RHS, Type::getInt64Ty(Context));
Value *Difference = CreateSub(LHS_int, RHS_int);
@@ -1808,35 +2016,62 @@ public:
Name);
}
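// NOTE (editor): illustrative sketch, not part of this diff. CreatePtrDiff
// implements C-style pointer subtraction: the byte difference divided by the
// pointee size, as described above. Assumes an IRBuilder<> `B` in a function
// and two pointers of the same type; names are hypothetical.
//
//   #include "llvm/IR/IRBuilder.h"
//   using namespace llvm;
//
//   static Value *elementsBetween(IRBuilder<> &B, Value *End, Value *Begin) {
//     // Yields an i64 count of elements between Begin and End.
//     return B.CreatePtrDiff(End, Begin, "count");
//   }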
- /// \brief Create an invariant.group.barrier intrinsic call, that stops
- /// optimizer to propagate equality using invariant.group metadata.
- /// If Ptr type is different from pointer to i8, it's casted to pointer to i8
- /// in the same address space before call and casted back to Ptr type after
- /// call.
- Value *CreateInvariantGroupBarrier(Value *Ptr) {
+ /// Create a launder.invariant.group intrinsic call. If Ptr's type is not
+ /// i8*, it is cast to i8* in the same address space before the call and
+ /// cast back to Ptr's type after the call.
+ Value *CreateLaunderInvariantGroup(Value *Ptr) {
assert(isa<PointerType>(Ptr->getType()) &&
- "invariant.group.barrier only applies to pointers.");
+ "launder.invariant.group only applies to pointers.");
+ // FIXME: we could potentially avoid casts to/from i8*.
auto *PtrType = Ptr->getType();
auto *Int8PtrTy = getInt8PtrTy(PtrType->getPointerAddressSpace());
if (PtrType != Int8PtrTy)
Ptr = CreateBitCast(Ptr, Int8PtrTy);
Module *M = BB->getParent()->getParent();
- Function *FnInvariantGroupBarrier = Intrinsic::getDeclaration(
- M, Intrinsic::invariant_group_barrier, {Int8PtrTy});
+ Function *FnLaunderInvariantGroup = Intrinsic::getDeclaration(
+ M, Intrinsic::launder_invariant_group, {Int8PtrTy});
- assert(FnInvariantGroupBarrier->getReturnType() == Int8PtrTy &&
- FnInvariantGroupBarrier->getFunctionType()->getParamType(0) ==
+ assert(FnLaunderInvariantGroup->getReturnType() == Int8PtrTy &&
+ FnLaunderInvariantGroup->getFunctionType()->getParamType(0) ==
Int8PtrTy &&
- "InvariantGroupBarrier should take and return the same type");
+ "LaunderInvariantGroup should take and return the same type");
- CallInst *Fn = CreateCall(FnInvariantGroupBarrier, {Ptr});
+ CallInst *Fn = CreateCall(FnLaunderInvariantGroup, {Ptr});
if (PtrType != Int8PtrTy)
return CreateBitCast(Fn, PtrType);
return Fn;
}
- /// \brief Return a vector value that contains \arg V broadcasted to \p
+ /// Create a strip.invariant.group intrinsic call. If Ptr's type is not
+ /// i8*, it is cast to i8* in the same address space before the call and
+ /// cast back to Ptr's type after the call.
+ Value *CreateStripInvariantGroup(Value *Ptr) {
+ assert(isa<PointerType>(Ptr->getType()) &&
+ "strip.invariant.group only applies to pointers.");
+
+ // FIXME: we could potentially avoid casts to/from i8*.
+ auto *PtrType = Ptr->getType();
+ auto *Int8PtrTy = getInt8PtrTy(PtrType->getPointerAddressSpace());
+ if (PtrType != Int8PtrTy)
+ Ptr = CreateBitCast(Ptr, Int8PtrTy);
+ Module *M = BB->getParent()->getParent();
+ Function *FnStripInvariantGroup = Intrinsic::getDeclaration(
+ M, Intrinsic::strip_invariant_group, {Int8PtrTy});
+
+ assert(FnStripInvariantGroup->getReturnType() == Int8PtrTy &&
+ FnStripInvariantGroup->getFunctionType()->getParamType(0) ==
+ Int8PtrTy &&
+ "StripInvariantGroup should take and return the same type");
+
+ CallInst *Fn = CreateCall(FnStripInvariantGroup, {Ptr});
+
+ if (PtrType != Int8PtrTy)
+ return CreateBitCast(Fn, PtrType);
+ return Fn;
+ }
+
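// NOTE (editor): illustrative sketch, not part of this diff. It uses the two
// wrappers added above, which hide the i8* casts described in their comments.
// Assumes an IRBuilder<> `B` in a function; names are hypothetical.
//
//   #include "llvm/IR/IRBuilder.h"
//   using namespace llvm;
//
//   static Value *reloadThroughBarrier(IRBuilder<> &B, Value *Ptr) {
//     // Launder the pointer so invariant.group metadata from earlier loads
//     // does not let the optimizer forward a stale value through this load.
//     Value *Laundered = B.CreateLaunderInvariantGroup(Ptr);
//     return B.CreateLoad(Laundered, "reloaded");
//   }
//
//   static Value *compareIdentity(IRBuilder<> &B, Value *P, Value *Q) {
//     // Strip invariant.group information before comparing raw addresses.
//     return B.CreateICmpEQ(B.CreateStripInvariantGroup(P),
//                           B.CreateStripInvariantGroup(Q), "same.addr");
//   }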
+ /// Return a vector value that contains \arg V broadcast to \p
/// NumElts elements.
Value *CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name = "") {
assert(NumElts > 0 && "Cannot splat to an empty vector!");
@@ -1852,11 +2087,11 @@ public:
return CreateShuffleVector(V, Undef, Zeros, Name + ".splat");
}
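// NOTE (editor): illustrative sketch, not part of this diff. CreateVectorSplat
// broadcasts a scalar via insertelement + shufflevector, as the body above
// shows. Assumes an IRBuilder<> `B` in a function; names are hypothetical.
//
//   #include "llvm/IR/IRBuilder.h"
//   using namespace llvm;
//
//   static Value *splat4(IRBuilder<> &B, Value *Scalar) {
//     // <4 x T> with every lane equal to Scalar; lanes are named "v.splat".
//     return B.CreateVectorSplat(4, Scalar, "v");
//   }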
- /// \brief Return a value that has been extracted from a larger integer type.
+ /// Return a value that has been extracted from a larger integer type.
Value *CreateExtractInteger(const DataLayout &DL, Value *From,
IntegerType *ExtractedTy, uint64_t Offset,
const Twine &Name) {
- IntegerType *IntTy = cast<IntegerType>(From->getType());
+ auto *IntTy = cast<IntegerType>(From->getType());
assert(DL.getTypeStoreSize(ExtractedTy) + Offset <=
DL.getTypeStoreSize(IntTy) &&
"Element extends past full value");
@@ -1877,7 +2112,7 @@ public:
}
private:
- /// \brief Helper function that creates an assume intrinsic call that
+ /// Helper function that creates an assume intrinsic call that
/// represents an alignment assumption on the provided Ptr, Mask, Type
/// and Offset.
CallInst *CreateAlignmentAssumptionHelper(const DataLayout &DL,
@@ -1888,7 +2123,7 @@ private:
if (OffsetValue) {
bool IsOffsetZero = false;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OffsetValue))
+ if (const auto *CI = dyn_cast<ConstantInt>(OffsetValue))
IsOffsetZero = CI->isZero();
if (!IsOffsetZero) {
@@ -1906,7 +2141,7 @@ private:
}
public:
- /// \brief Create an assume intrinsic call that represents an alignment
+ /// Create an assume intrinsic call that represents an alignment
/// assumption on the provided pointer.
///
/// An optional offset can be provided, and if it is provided, the offset
@@ -1917,15 +2152,15 @@ public:
Value *OffsetValue = nullptr) {
assert(isa<PointerType>(PtrValue->getType()) &&
"trying to create an alignment assumption on a non-pointer?");
- PointerType *PtrTy = cast<PointerType>(PtrValue->getType());
+ auto *PtrTy = cast<PointerType>(PtrValue->getType());
Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace());
Value *Mask = ConstantInt::get(IntPtrTy, Alignment > 0 ? Alignment - 1 : 0);
return CreateAlignmentAssumptionHelper(DL, PtrValue, Mask, IntPtrTy,
OffsetValue);
}
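// NOTE (editor): illustrative sketch, not part of this diff. The overload
// above emits an llvm.assume describing a fixed alignment of the pointer,
// which later passes can exploit. Assumes an IRBuilder<> `B` in a function and
// the module's DataLayout `DL`; names are hypothetical.
//
//   #include "llvm/IR/DataLayout.h"
//   #include "llvm/IR/IRBuilder.h"
//   using namespace llvm;
//
//   static void assumeAligned16(IRBuilder<> &B, const DataLayout &DL,
//                               Value *Ptr) {
//     // Roughly __builtin_assume_aligned(Ptr, 16) at the IR level.
//     B.CreateAlignmentAssumption(DL, Ptr, /*Alignment=*/16);
//   }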
- //
- /// \brief Create an assume intrinsic call that represents an alignment
+
+ /// Create an assume intrinsic call that represents an alignment
/// assumption on the provided pointer.
///
/// An optional offset can be provided, and if it is provided, the offset
@@ -1939,7 +2174,7 @@ public:
Value *OffsetValue = nullptr) {
assert(isa<PointerType>(PtrValue->getType()) &&
"trying to create an alignment assumption on a non-pointer?");
- PointerType *PtrTy = cast<PointerType>(PtrValue->getType());
+ auto *PtrTy = cast<PointerType>(PtrValue->getType());
Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace());
if (Alignment->getType() != IntPtrTy)
diff --git a/contrib/llvm/include/llvm/IR/IRPrintingPasses.h b/contrib/llvm/include/llvm/IR/IRPrintingPasses.h
index 0825e0696cac..e4ac5d4d88a3 100644
--- a/contrib/llvm/include/llvm/IR/IRPrintingPasses.h
+++ b/contrib/llvm/include/llvm/IR/IRPrintingPasses.h
@@ -23,6 +23,7 @@
#include <string>
namespace llvm {
+class Pass;
class BasicBlockPass;
class Function;
class FunctionPass;
@@ -32,18 +33,18 @@ class PreservedAnalyses;
class raw_ostream;
template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
-/// \brief Create and return a pass that writes the module to the specified
+/// Create and return a pass that writes the module to the specified
/// \c raw_ostream.
ModulePass *createPrintModulePass(raw_ostream &OS,
const std::string &Banner = "",
bool ShouldPreserveUseListOrder = false);
-/// \brief Create and return a pass that prints functions to the specified
+/// Create and return a pass that prints functions to the specified
/// \c raw_ostream as they are processed.
FunctionPass *createPrintFunctionPass(raw_ostream &OS,
const std::string &Banner = "");
-/// \brief Create and return a pass that writes the BB to the specified
+/// Create and return a pass that writes the BB to the specified
/// \c raw_ostream.
BasicBlockPass *createPrintBasicBlockPass(raw_ostream &OS,
const std::string &Banner = "");
@@ -54,7 +55,10 @@ BasicBlockPass *createPrintBasicBlockPass(raw_ostream &OS,
/// non-printable characters in it.
void printLLVMNameWithoutPrefix(raw_ostream &OS, StringRef Name);
-/// \brief Pass for printing a Module as LLVM's text IR assembly.
+/// Return true if a pass is for IR printing.
+bool isIRPrintingPass(Pass *P);
+
+/// Pass for printing a Module as LLVM's text IR assembly.
///
/// Note: This pass is for use with the new pass manager. Use the create...Pass
/// functions above to create passes for use with the legacy pass manager.
@@ -73,7 +77,7 @@ public:
static StringRef name() { return "PrintModulePass"; }
};
-/// \brief Pass for printing a Function as LLVM's text IR assembly.
+/// Pass for printing a Function as LLVM's text IR assembly.
///
/// Note: This pass is for use with the new pass manager. Use the create...Pass
/// functions above to create passes for use with the legacy pass manager.
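// NOTE (editor): illustrative sketch, not part of this diff. It combines the
// legacy createPrintModulePass entry point with the newly declared
// isIRPrintingPass() query. Names (PM, M, dumpModule) are hypothetical.
//
//   #include "llvm/IR/IRPrintingPasses.h"
//   #include "llvm/IR/LegacyPassManager.h"
//   #include "llvm/IR/Module.h"
//   #include "llvm/Pass.h"
//   #include "llvm/Support/raw_ostream.h"
//   using namespace llvm;
//
//   static void dumpModule(Module &M) {
//     legacy::PassManager PM;
//     ModulePass *Printer = createPrintModulePass(errs(), "; module dump");
//     // The new query lets instrumentation recognize printer passes.
//     if (isIRPrintingPass(Printer))
//       PM.add(Printer);
//     PM.run(M);
//   }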
diff --git a/contrib/llvm/include/llvm/IR/InstVisitor.h b/contrib/llvm/include/llvm/IR/InstVisitor.h
index 55579819fd34..65074025a083 100644
--- a/contrib/llvm/include/llvm/IR/InstVisitor.h
+++ b/contrib/llvm/include/llvm/IR/InstVisitor.h
@@ -32,7 +32,7 @@ namespace llvm {
visit##CLASS_TO_VISIT(static_cast<CLASS_TO_VISIT&>(I))
-/// @brief Base class for instruction visitors
+/// Base class for instruction visitors
///
/// Instruction visitors are used when you want to perform different actions
/// for different kinds of instructions without having to use lots of casts
@@ -213,6 +213,7 @@ public:
// Handle the special intrinsic instruction classes.
RetTy visitDbgDeclareInst(DbgDeclareInst &I) { DELEGATE(DbgInfoIntrinsic);}
RetTy visitDbgValueInst(DbgValueInst &I) { DELEGATE(DbgInfoIntrinsic);}
+ RetTy visitDbgLabelInst(DbgLabelInst &I) { DELEGATE(DbgInfoIntrinsic);}
RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { DELEGATE(IntrinsicInst); }
RetTy visitMemSetInst(MemSetInst &I) { DELEGATE(MemIntrinsic); }
RetTy visitMemCpyInst(MemCpyInst &I) { DELEGATE(MemTransferInst); }
@@ -272,6 +273,7 @@ private:
default: DELEGATE(IntrinsicInst);
case Intrinsic::dbg_declare: DELEGATE(DbgDeclareInst);
case Intrinsic::dbg_value: DELEGATE(DbgValueInst);
+ case Intrinsic::dbg_label: DELEGATE(DbgLabelInst);
case Intrinsic::memcpy: DELEGATE(MemCpyInst);
case Intrinsic::memmove: DELEGATE(MemMoveInst);
case Intrinsic::memset: DELEGATE(MemSetInst);
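// NOTE (editor): illustrative sketch, not part of this diff. A visitor that
// reacts to llvm.dbg.label, which the new delegation above now routes to
// visitDbgLabelInst. Names are hypothetical.
//
//   #include "llvm/IR/Function.h"
//   #include "llvm/IR/InstVisitor.h"
//   #include "llvm/IR/IntrinsicInst.h"
//   using namespace llvm;
//
//   struct DbgLabelCounter : InstVisitor<DbgLabelCounter> {
//     unsigned NumLabels = 0;
//     void visitDbgLabelInst(DbgLabelInst &) { ++NumLabels; }
//   };
//
//   static unsigned countDbgLabels(Function &F) {
//     DbgLabelCounter C;
//     C.visit(F); // walks every instruction in F
//     return C.NumLabels;
//   }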
diff --git a/contrib/llvm/include/llvm/IR/InstrTypes.h b/contrib/llvm/include/llvm/IR/InstrTypes.h
index 871f702f95f2..ad0012048ac9 100644
--- a/contrib/llvm/include/llvm/IR/InstrTypes.h
+++ b/contrib/llvm/include/llvm/IR/InstrTypes.h
@@ -25,6 +25,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
@@ -80,7 +81,7 @@ public:
return isa<Instruction>(V) && classof(cast<Instruction>(V));
}
- // \brief Returns true if this terminator relates to exception handling.
+ // Returns true if this terminator relates to exception handling.
bool isExceptional() const {
switch (getOpcode()) {
case Instruction::CatchSwitch:
@@ -117,7 +118,7 @@ public:
return idx < TermInst->getNumSuccessors();
}
- /// \brief Proxy object to allow write access in operator[]
+ /// Proxy object to allow write access in operator[]
class SuccessorProxy {
Self it;
@@ -391,6 +392,37 @@ public:
return BO;
}
+ static BinaryOperator *CreateFAddFMF(Value *V1, Value *V2,
+ BinaryOperator *FMFSource,
+ const Twine &Name = "") {
+ return CreateWithCopiedFlags(Instruction::FAdd, V1, V2, FMFSource, Name);
+ }
+ static BinaryOperator *CreateFSubFMF(Value *V1, Value *V2,
+ BinaryOperator *FMFSource,
+ const Twine &Name = "") {
+ return CreateWithCopiedFlags(Instruction::FSub, V1, V2, FMFSource, Name);
+ }
+ static BinaryOperator *CreateFMulFMF(Value *V1, Value *V2,
+ BinaryOperator *FMFSource,
+ const Twine &Name = "") {
+ return CreateWithCopiedFlags(Instruction::FMul, V1, V2, FMFSource, Name);
+ }
+ static BinaryOperator *CreateFDivFMF(Value *V1, Value *V2,
+ BinaryOperator *FMFSource,
+ const Twine &Name = "") {
+ return CreateWithCopiedFlags(Instruction::FDiv, V1, V2, FMFSource, Name);
+ }
+ static BinaryOperator *CreateFRemFMF(Value *V1, Value *V2,
+ BinaryOperator *FMFSource,
+ const Twine &Name = "") {
+ return CreateWithCopiedFlags(Instruction::FRem, V1, V2, FMFSource, Name);
+ }
+ static BinaryOperator *CreateFNegFMF(Value *Op, BinaryOperator *FMFSource,
+ const Twine &Name = "") {
+ Value *Zero = ConstantFP::getNegativeZero(Op->getType());
+ return CreateWithCopiedFlags(Instruction::FSub, Zero, Op, FMFSource);
+ }
+
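// NOTE (editor): illustrative sketch, not part of this diff. The static *FMF
// helpers added above build a free-standing (not yet inserted) instruction and
// copy FMFSource's fast-math flags onto it; the caller inserts it into a
// block. Names are hypothetical.
//
//   #include "llvm/IR/InstrTypes.h"
//   using namespace llvm;
//
//   static BinaryOperator *makeFNegLike(Value *Op, BinaryOperator *FMFSource) {
//     // (-0.0 - Op) carrying FMFSource's fast-math flags.
//     return BinaryOperator::CreateFNegFMF(Op, FMFSource);
//   }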
static BinaryOperator *CreateNSW(BinaryOps Opc, Value *V1, Value *V2,
const Twine &Name = "") {
BinaryOperator *BO = Create(Opc, V1, V2, Name);
@@ -556,16 +588,16 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BinaryOperator, Value)
/// can be performed with code like:
///
/// if (isa<CastInst>(Instr)) { ... }
-/// @brief Base class of casting instructions.
+/// Base class of casting instructions.
class CastInst : public UnaryInstruction {
protected:
- /// @brief Constructor with insert-before-instruction semantics for subclasses
+ /// Constructor with insert-before-instruction semantics for subclasses
CastInst(Type *Ty, unsigned iType, Value *S,
const Twine &NameStr = "", Instruction *InsertBefore = nullptr)
: UnaryInstruction(Ty, iType, S, InsertBefore) {
setName(NameStr);
}
- /// @brief Constructor with insert-at-end-of-block semantics for subclasses
+ /// Constructor with insert-at-end-of-block semantics for subclasses
CastInst(Type *Ty, unsigned iType, Value *S,
const Twine &NameStr, BasicBlock *InsertAtEnd)
: UnaryInstruction(Ty, iType, S, InsertAtEnd) {
@@ -578,7 +610,7 @@ public:
/// CastOps category (Instruction::isCast(opcode) returns true). This
/// constructor has insert-before-instruction semantics to automatically
/// insert the new CastInst before InsertBefore (if it is non-null).
- /// @brief Construct any of the CastInst subclasses
+ /// Construct any of the CastInst subclasses
static CastInst *Create(
Instruction::CastOps, ///< The opcode of the cast instruction
Value *S, ///< The value to be casted (operand 0)
@@ -591,7 +623,7 @@ public:
/// CastOps category. This constructor has insert-at-end-of-block semantics
/// to automatically insert the new CastInst at the end of InsertAtEnd (if
/// its non-null).
- /// @brief Construct any of the CastInst subclasses
+ /// Construct any of the CastInst subclasses
static CastInst *Create(
Instruction::CastOps, ///< The opcode for the cast instruction
Value *S, ///< The value to be casted (operand 0)
@@ -600,7 +632,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Create a ZExt or BitCast cast instruction
+ /// Create a ZExt or BitCast cast instruction
static CastInst *CreateZExtOrBitCast(
Value *S, ///< The value to be casted (operand 0)
Type *Ty, ///< The type to which cast should be made
@@ -608,7 +640,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create a ZExt or BitCast cast instruction
+ /// Create a ZExt or BitCast cast instruction
static CastInst *CreateZExtOrBitCast(
Value *S, ///< The value to be casted (operand 0)
Type *Ty, ///< The type to which operand is casted
@@ -616,7 +648,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Create a SExt or BitCast cast instruction
+ /// Create a SExt or BitCast cast instruction
static CastInst *CreateSExtOrBitCast(
Value *S, ///< The value to be casted (operand 0)
Type *Ty, ///< The type to which cast should be made
@@ -624,7 +656,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create a SExt or BitCast cast instruction
+ /// Create a SExt or BitCast cast instruction
static CastInst *CreateSExtOrBitCast(
Value *S, ///< The value to be casted (operand 0)
Type *Ty, ///< The type to which operand is casted
@@ -632,7 +664,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Create a BitCast AddrSpaceCast, or a PtrToInt cast instruction.
+ /// Create a BitCast, AddrSpaceCast, or a PtrToInt cast instruction.
static CastInst *CreatePointerCast(
Value *S, ///< The pointer value to be casted (operand 0)
Type *Ty, ///< The type to which operand is casted
@@ -640,7 +672,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
+ /// Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
static CastInst *CreatePointerCast(
Value *S, ///< The pointer value to be casted (operand 0)
Type *Ty, ///< The type to which cast should be made
@@ -648,7 +680,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create a BitCast or an AddrSpaceCast cast instruction.
+ /// Create a BitCast or an AddrSpaceCast cast instruction.
static CastInst *CreatePointerBitCastOrAddrSpaceCast(
Value *S, ///< The pointer value to be casted (operand 0)
Type *Ty, ///< The type to which operand is casted
@@ -656,7 +688,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Create a BitCast or an AddrSpaceCast cast instruction.
+ /// Create a BitCast or an AddrSpaceCast cast instruction.
static CastInst *CreatePointerBitCastOrAddrSpaceCast(
Value *S, ///< The pointer value to be casted (operand 0)
Type *Ty, ///< The type to which cast should be made
@@ -664,7 +696,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create a BitCast, a PtrToInt, or an IntToPTr cast instruction.
+ /// Create a BitCast, a PtrToInt, or an IntToPtr cast instruction.
///
/// If the value is a pointer type and the destination an integer type,
/// creates a PtrToInt cast. If the value is an integer type and the
@@ -677,7 +709,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create a ZExt, BitCast, or Trunc for int -> int casts.
+ /// Create a ZExt, BitCast, or Trunc for int -> int casts.
static CastInst *CreateIntegerCast(
Value *S, ///< The pointer value to be casted (operand 0)
Type *Ty, ///< The type to which cast should be made
@@ -686,7 +718,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create a ZExt, BitCast, or Trunc for int -> int casts.
+ /// Create a ZExt, BitCast, or Trunc for int -> int casts.
static CastInst *CreateIntegerCast(
Value *S, ///< The integer value to be casted (operand 0)
Type *Ty, ///< The integer type to which operand is casted
@@ -695,7 +727,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Create an FPExt, BitCast, or FPTrunc for fp -> fp casts
+ /// Create an FPExt, BitCast, or FPTrunc for fp -> fp casts
static CastInst *CreateFPCast(
Value *S, ///< The floating point value to be casted
Type *Ty, ///< The floating point type to cast to
@@ -703,7 +735,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create an FPExt, BitCast, or FPTrunc for fp -> fp casts
+ /// Create an FPExt, BitCast, or FPTrunc for fp -> fp casts
static CastInst *CreateFPCast(
Value *S, ///< The floating point value to be casted
Type *Ty, ///< The floating point type to cast to
@@ -711,7 +743,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Create a Trunc or BitCast cast instruction
+ /// Create a Trunc or BitCast cast instruction
static CastInst *CreateTruncOrBitCast(
Value *S, ///< The value to be casted (operand 0)
Type *Ty, ///< The type to which cast should be made
@@ -719,7 +751,7 @@ public:
Instruction *InsertBefore = nullptr ///< Place to insert the instruction
);
- /// @brief Create a Trunc or BitCast cast instruction
+ /// Create a Trunc or BitCast cast instruction
static CastInst *CreateTruncOrBitCast(
Value *S, ///< The value to be casted (operand 0)
Type *Ty, ///< The type to which operand is casted
@@ -727,19 +759,19 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Check whether it is valid to call getCastOpcode for these types.
+ /// Check whether it is valid to call getCastOpcode for these types.
static bool isCastable(
Type *SrcTy, ///< The Type from which the value should be cast.
Type *DestTy ///< The Type to which the value should be cast.
);
- /// @brief Check whether a bitcast between these types is valid
+ /// Check whether a bitcast between these types is valid
static bool isBitCastable(
Type *SrcTy, ///< The Type from which the value should be cast.
Type *DestTy ///< The Type to which the value should be cast.
);
- /// @brief Check whether a bitcast, inttoptr, or ptrtoint cast between these
+ /// Check whether a bitcast, inttoptr, or ptrtoint cast between these
/// types is valid and a no-op.
///
/// This ensures that any pointer<->integer cast has enough bits in the
@@ -751,7 +783,7 @@ public:
/// Returns the opcode necessary to cast Val into Ty using usual casting
/// rules.
- /// @brief Infer the opcode for cast operand and type
+ /// Infer the opcode for cast operand and type
static Instruction::CastOps getCastOpcode(
const Value *Val, ///< The value to cast
bool SrcIsSigned, ///< Whether to treat the source as signed
@@ -763,14 +795,14 @@ public:
/// only deals with integer source and destination types. To simplify that
/// logic, this method is provided.
/// @returns true iff the cast has only integral typed operand and dest type.
- /// @brief Determine if this is an integer-only cast.
+ /// Determine if this is an integer-only cast.
bool isIntegerCast() const;
/// A lossless cast is one that does not alter the basic value. It implies
/// a no-op cast but is more stringent, preventing things like int->float,
/// long->double, or int->ptr.
/// @returns true iff the cast is lossless.
- /// @brief Determine if this is a lossless cast.
+ /// Determine if this is a lossless cast.
bool isLosslessCast() const;
/// A no-op cast is one that can be effected without changing any bits.
@@ -779,7 +811,7 @@ public:
/// involving Integer and Pointer types. They are no-op casts if the integer
/// is the same size as the pointer. However, pointer size varies with
/// platform.
- /// @brief Determine if the described cast is a no-op cast.
+ /// Determine if the described cast is a no-op cast.
static bool isNoopCast(
Instruction::CastOps Opcode, ///< Opcode of cast
Type *SrcTy, ///< SrcTy of cast
@@ -787,7 +819,7 @@ public:
const DataLayout &DL ///< DataLayout to get the Int Ptr type from.
);
- /// @brief Determine if this cast is a no-op cast.
+ /// Determine if this cast is a no-op cast.
///
/// \param DL is the DataLayout to determine pointer size.
bool isNoopCast(const DataLayout &DL) const;
@@ -797,7 +829,7 @@ public:
/// @returns 0 if the CastInst pair can't be eliminated, otherwise
/// returns Instruction::CastOps value for a cast that can replace
/// the pair, casting SrcTy to DstTy.
- /// @brief Determine if a cast pair is eliminable
+ /// Determine if a cast pair is eliminable
static unsigned isEliminableCastPair(
Instruction::CastOps firstOpcode, ///< Opcode of first cast
Instruction::CastOps secondOpcode, ///< Opcode of second cast
@@ -809,23 +841,23 @@ public:
Type *DstIntPtrTy ///< Integer type corresponding to Ptr DstTy, or null
);
- /// @brief Return the opcode of this CastInst
+ /// Return the opcode of this CastInst
Instruction::CastOps getOpcode() const {
return Instruction::CastOps(Instruction::getOpcode());
}
- /// @brief Return the source type, as a convenience
+ /// Return the source type, as a convenience
Type* getSrcTy() const { return getOperand(0)->getType(); }
- /// @brief Return the destination type, as a convenience
+ /// Return the destination type, as a convenience
Type* getDestTy() const { return getType(); }
/// This method can be used to determine if a cast from S to DstTy using
/// Opcode op is valid or not.
/// @returns true iff the proposed cast is valid.
- /// @brief Determine if a cast is valid without creating one.
+ /// Determine if a cast is valid without creating one.
static bool castIsValid(Instruction::CastOps op, Value *S, Type *DstTy);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
+ /// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return I->isCast();
}
@@ -839,7 +871,7 @@ public:
//===----------------------------------------------------------------------===//
/// This class is the base class for the comparison instructions.
-/// @brief Abstract base class of comparison instructions.
+/// Abstract base class of comparison instructions.
class CmpInst : public Instruction {
public:
/// This enumeration lists the possible predicates for CmpInst subclasses.
@@ -905,7 +937,7 @@ public:
/// the two operands. Optionally (if InstBefore is specified) insert the
/// instruction into a BasicBlock right before the specified instruction.
/// The specified Instruction is allowed to be a dereferenced end iterator.
- /// @brief Create a CmpInst
+ /// Create a CmpInst
static CmpInst *Create(OtherOps Op,
Predicate predicate, Value *S1,
Value *S2, const Twine &Name = "",
@@ -914,21 +946,21 @@ public:
/// Construct a compare instruction, given the opcode, the predicate and the
/// two operands. Also automatically insert this instruction to the end of
/// the BasicBlock specified.
- /// @brief Create a CmpInst
+ /// Create a CmpInst
static CmpInst *Create(OtherOps Op, Predicate predicate, Value *S1,
Value *S2, const Twine &Name, BasicBlock *InsertAtEnd);
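// Usage sketch (illustrative, not part of the patch): building a signed
// "A < B" comparison through the generic factory; ICmpInst::Create could be
// used equivalently.
static CmpInst *emitSignedLessThan(Value *A, Value *B,
                                   Instruction *InsertBefore) {
  return CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_SLT, A, B, "cmp",
                         InsertBefore);
}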
- /// @brief Get the opcode casted to the right type
+ /// Get the opcode casted to the right type
OtherOps getOpcode() const {
return static_cast<OtherOps>(Instruction::getOpcode());
}
- /// @brief Return the predicate for this instruction.
+ /// Return the predicate for this instruction.
Predicate getPredicate() const {
return Predicate(getSubclassDataFromInstruction());
}
- /// @brief Set the predicate for this instruction to the specified value.
+ /// Set the predicate for this instruction to the specified value.
void setPredicate(Predicate P) { setInstructionSubclassData(P); }
static bool isFPPredicate(Predicate P) {
@@ -947,7 +979,7 @@ public:
/// For example, EQ -> NE, UGT -> ULE, SLT -> SGE,
/// OEQ -> UNE, UGT -> OLE, OLT -> UGE, etc.
/// @returns the inverse predicate for the instruction's current predicate.
- /// @brief Return the inverse of the instruction's predicate.
+ /// Return the inverse of the instruction's predicate.
Predicate getInversePredicate() const {
return getInversePredicate(getPredicate());
}
@@ -955,7 +987,7 @@ public:
/// For example, EQ -> NE, UGT -> ULE, SLT -> SGE,
/// OEQ -> UNE, UGT -> OLE, OLT -> UGE, etc.
/// @returns the inverse predicate for predicate provided in \p pred.
- /// @brief Return the inverse of a given predicate
+ /// Return the inverse of a given predicate
static Predicate getInversePredicate(Predicate pred);
/// For example, EQ->EQ, SLE->SGE, ULT->UGT,
@@ -963,81 +995,109 @@ public:
/// @returns the predicate that would be the result of exchanging the two
/// operands of the CmpInst instruction without changing the result
/// produced.
- /// @brief Return the predicate as if the operands were swapped
+ /// Return the predicate as if the operands were swapped
Predicate getSwappedPredicate() const {
return getSwappedPredicate(getPredicate());
}
/// This is a static version that you can use without an instruction
/// available.
- /// @brief Return the predicate as if the operands were swapped.
+ /// Return the predicate as if the operands were swapped.
static Predicate getSwappedPredicate(Predicate pred);
- /// @brief Provide more efficient getOperand methods.
+ /// For a predicate of kind "is X or equal to 0" returns the predicate "is X".
+ /// For a predicate of kind "is X" returns the predicate "is X or equal to 0".
+ /// Does not support other kinds of predicates.
+ /// @returns the predicate with the "or equal to 0" part removed if it was
+ /// present, or added if it was not.
+ /// Return the flipped strictness of the predicate.
+ Predicate getFlippedStrictnessPredicate() const {
+ return getFlippedStrictnessPredicate(getPredicate());
+ }
+
+ /// This is a static version that you can use without an instruction
+ /// available.
+ /// Return the flipped strictness of the predicate.
+ static Predicate getFlippedStrictnessPredicate(Predicate pred);
+
+ /// For example, SGT -> SGE, SLT -> SLE, ULT -> ULE, UGT -> UGE.
+ /// Returns the non-strict version of strict comparisons.
+ Predicate getNonStrictPredicate() const {
+ return getNonStrictPredicate(getPredicate());
+ }
+
+ /// This is a static version that you can use without an instruction
+ /// available.
+ /// @returns the non-strict version of comparison provided in \p pred.
+ /// If \p pred is not a strict comparison predicate, returns \p pred.
+ /// Returns the non-strict version of strict comparisons.
+ static Predicate getNonStrictPredicate(Predicate pred);
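// Usage sketch of the predicate helpers above (illustrative, not part of the
// patch); the expected results follow from the documented behaviour.
static void predicateHelperExamples() {
  CmpInst::Predicate P = CmpInst::ICMP_SGT;
  CmpInst::Predicate Inv = CmpInst::getInversePredicate(P);   // ICMP_SLE
  CmpInst::Predicate Swp = CmpInst::getSwappedPredicate(P);   // ICMP_SLT
  CmpInst::Predicate NS = CmpInst::getNonStrictPredicate(P);  // ICMP_SGE
  (void)Inv; (void)Swp; (void)NS;
}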
+
+ /// Provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
/// This is just a convenience that dispatches to the subclasses.
- /// @brief Swap the operands and adjust predicate accordingly to retain
+ /// Swap the operands and adjust predicate accordingly to retain
/// the same comparison.
void swapOperands();
/// This is just a convenience that dispatches to the subclasses.
- /// @brief Determine if this CmpInst is commutative.
+ /// Determine if this CmpInst is commutative.
bool isCommutative() const;
/// This is just a convenience that dispatches to the subclasses.
- /// @brief Determine if this is an equals/not equals predicate.
+ /// Determine if this is an equals/not equals predicate.
bool isEquality() const;
/// @returns true if the comparison is signed, false otherwise.
- /// @brief Determine if this instruction is using a signed comparison.
+ /// Determine if this instruction is using a signed comparison.
bool isSigned() const {
return isSigned(getPredicate());
}
/// @returns true if the comparison is unsigned, false otherwise.
- /// @brief Determine if this instruction is using an unsigned comparison.
+ /// Determine if this instruction is using an unsigned comparison.
bool isUnsigned() const {
return isUnsigned(getPredicate());
}
/// For example, ULT->SLT, ULE->SLE, UGT->SGT, UGE->SGE, SLT->Failed assert
/// @returns the signed version of the unsigned predicate pred.
- /// @brief return the signed version of a predicate
+ /// Return the signed version of a predicate.
static Predicate getSignedPredicate(Predicate pred);
/// For example, ULT->SLT, ULE->SLE, UGT->SGT, UGE->SGE, SLT->Failed assert
/// @returns the signed version of the predicate for this instruction (which
/// has to be an unsigned predicate).
- /// @brief return the signed version of a predicate
+ /// Return the signed version of a predicate.
Predicate getSignedPredicate() {
return getSignedPredicate(getPredicate());
}
/// This is just a convenience.
- /// @brief Determine if this is true when both operands are the same.
+ /// Determine if this is true when both operands are the same.
bool isTrueWhenEqual() const {
return isTrueWhenEqual(getPredicate());
}
/// This is just a convenience.
- /// @brief Determine if this is false when both operands are the same.
+ /// Determine if this is false when both operands are the same.
bool isFalseWhenEqual() const {
return isFalseWhenEqual(getPredicate());
}
/// @returns true if the predicate is unsigned, false otherwise.
- /// @brief Determine if the predicate is an unsigned operation.
+ /// Determine if the predicate is an unsigned operation.
static bool isUnsigned(Predicate predicate);
/// @returns true if the predicate is signed, false otherwise.
- /// @brief Determine if the predicate is an signed operation.
+ /// Determine if the predicate is a signed operation.
static bool isSigned(Predicate predicate);
- /// @brief Determine if the predicate is an ordered operation.
+ /// Determine if the predicate is an ordered operation.
static bool isOrdered(Predicate predicate);
- /// @brief Determine if the predicate is an unordered operation.
+ /// Determine if the predicate is an unordered operation.
static bool isUnordered(Predicate predicate);
/// Determine if the predicate is true when comparing a value with itself.
@@ -1054,7 +1114,7 @@ public:
/// operands.
static bool isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
+ /// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ICmp ||
I->getOpcode() == Instruction::FCmp;
@@ -1063,7 +1123,7 @@ public:
return isa<Instruction>(V) && classof(cast<Instruction>(V));
}
- /// @brief Create a result type for fcmp/icmp
+ /// Create a result type for fcmp/icmp
static Type* makeCmpResultType(Type* opnd_type) {
if (VectorType* vt = dyn_cast<VectorType>(opnd_type)) {
return VectorType::get(Type::getInt1Ty(opnd_type->getContext()),
@@ -1121,7 +1181,7 @@ public:
/// Convenience accessors
- /// \brief Return the outer EH-pad this funclet is nested within.
+ /// Return the outer EH-pad this funclet is nested within.
///
/// Note: This returns the associated CatchSwitchInst if this FuncletPadInst
/// is a CatchPadInst.
@@ -1157,7 +1217,7 @@ struct OperandTraits<FuncletPadInst>
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(FuncletPadInst, Value)
-/// \brief A lightweight accessor for an operand bundle meant to be passed
+/// A lightweight accessor for an operand bundle meant to be passed
/// around by value.
struct OperandBundleUse {
ArrayRef<Use> Inputs;
@@ -1166,7 +1226,7 @@ struct OperandBundleUse {
explicit OperandBundleUse(StringMapEntry<uint32_t> *Tag, ArrayRef<Use> Inputs)
: Inputs(Inputs), Tag(Tag) {}
- /// \brief Return true if the operand at index \p Idx in this operand bundle
+ /// Return true if the operand at index \p Idx in this operand bundle
/// has the attribute A.
bool operandHasAttr(unsigned Idx, Attribute::AttrKind A) const {
if (isDeoptOperandBundle())
@@ -1177,12 +1237,12 @@ struct OperandBundleUse {
return false;
}
- /// \brief Return the tag of this operand bundle as a string.
+ /// Return the tag of this operand bundle as a string.
StringRef getTagName() const {
return Tag->getKey();
}
- /// \brief Return the tag of this operand bundle as an integer.
+ /// Return the tag of this operand bundle as an integer.
///
/// Operand bundle tags are interned by LLVMContextImpl::getOrInsertBundleTag,
/// and this function returns the unique integer getOrInsertBundleTag
@@ -1191,22 +1251,22 @@ struct OperandBundleUse {
return Tag->getValue();
}
- /// \brief Return true if this is a "deopt" operand bundle.
+ /// Return true if this is a "deopt" operand bundle.
bool isDeoptOperandBundle() const {
return getTagID() == LLVMContext::OB_deopt;
}
- /// \brief Return true if this is a "funclet" operand bundle.
+ /// Return true if this is a "funclet" operand bundle.
bool isFuncletOperandBundle() const {
return getTagID() == LLVMContext::OB_funclet;
}
private:
- /// \brief Pointer to an entry in LLVMContextImpl::getOrInsertBundleTag.
+ /// Pointer to an entry in LLVMContextImpl::getOrInsertBundleTag.
StringMapEntry<uint32_t> *Tag;
};
-/// \brief A container for an operand bundle being viewed as a set of values
+/// A container for an operand bundle being viewed as a set of values
/// rather than a set of uses.
///
/// Unlike OperandBundleUse, OperandBundleDefT owns the memory it carries, and
@@ -1241,7 +1301,7 @@ public:
using OperandBundleDef = OperandBundleDefT<Value *>;
using ConstOperandBundleDef = OperandBundleDefT<const Value *>;
-/// \brief A mixin to add operand bundle functionality to llvm instruction
+/// A mixin to add operand bundle functionality to llvm instruction
/// classes.
///
/// OperandBundleUser uses the descriptor area co-allocated with the host User
@@ -1289,21 +1349,21 @@ using ConstOperandBundleDef = OperandBundleDefT<const Value *>;
/// Currently operand bundle users with hung-off operands are not supported.
template <typename InstrTy, typename OpIteratorTy> class OperandBundleUser {
public:
- /// \brief Return the number of operand bundles associated with this User.
+ /// Return the number of operand bundles associated with this User.
unsigned getNumOperandBundles() const {
return std::distance(bundle_op_info_begin(), bundle_op_info_end());
}
- /// \brief Return true if this User has any operand bundles.
+ /// Return true if this User has any operand bundles.
bool hasOperandBundles() const { return getNumOperandBundles() != 0; }
- /// \brief Return the index of the first bundle operand in the Use array.
+ /// Return the index of the first bundle operand in the Use array.
unsigned getBundleOperandsStartIndex() const {
assert(hasOperandBundles() && "Don't call otherwise!");
return bundle_op_info_begin()->Begin;
}
- /// \brief Return the index of the last bundle operand in the Use array.
+ /// Return the index of the last bundle operand in the Use array.
unsigned getBundleOperandsEndIndex() const {
assert(hasOperandBundles() && "Don't call otherwise!");
return bundle_op_info_end()[-1].End;
@@ -1315,7 +1375,7 @@ public:
Idx < getBundleOperandsEndIndex();
}
- /// \brief Return the total number operands (not operand bundles) used by
+ /// Return the total number operands (not operand bundles) used by
/// every operand bundle in this OperandBundleUser.
unsigned getNumTotalBundleOperands() const {
if (!hasOperandBundles())
@@ -1328,13 +1388,13 @@ public:
return End - Begin;
}
- /// \brief Return the operand bundle at a specific index.
+ /// Return the operand bundle at a specific index.
OperandBundleUse getOperandBundleAt(unsigned Index) const {
assert(Index < getNumOperandBundles() && "Index out of bounds!");
return operandBundleFromBundleOpInfo(*(bundle_op_info_begin() + Index));
}
- /// \brief Return the number of operand bundles with the tag Name attached to
+ /// Return the number of operand bundles with the tag Name attached to
/// this instruction.
unsigned countOperandBundlesOfType(StringRef Name) const {
unsigned Count = 0;
@@ -1345,7 +1405,7 @@ public:
return Count;
}
- /// \brief Return the number of operand bundles with the tag ID attached to
+ /// Return the number of operand bundles with the tag ID attached to
/// this instruction.
unsigned countOperandBundlesOfType(uint32_t ID) const {
unsigned Count = 0;
@@ -1356,7 +1416,7 @@ public:
return Count;
}
- /// \brief Return an operand bundle by name, if present.
+ /// Return an operand bundle by name, if present.
///
/// It is an error to call this for operand bundle types that may have
/// multiple instances of them on the same instruction.
@@ -1372,7 +1432,7 @@ public:
return None;
}
- /// \brief Return an operand bundle by tag ID, if present.
+ /// Return an operand bundle by tag ID, if present.
///
/// It is an error to call this for operand bundle types that may have
/// multiple instances of them on the same instruction.
@@ -1388,7 +1448,7 @@ public:
return None;
}
- /// \brief Return the list of operand bundles attached to this instruction as
+ /// Return the list of operand bundles attached to this instruction as
/// a vector of OperandBundleDefs.
///
/// This function copies the OperandBundleUse instances associated with this
@@ -1400,7 +1460,7 @@ public:
Defs.emplace_back(getOperandBundleAt(i));
}
- /// \brief Return the operand bundle for the operand at index OpIdx.
+ /// Return the operand bundle for the operand at index OpIdx.
///
/// It is an error to call this with an OpIdx that does not correspond to a
/// bundle operand.
@@ -1408,7 +1468,7 @@ public:
return operandBundleFromBundleOpInfo(getBundleOpInfoForOperand(OpIdx));
}
- /// \brief Return true if this operand bundle user has operand bundles that
+ /// Return true if this operand bundle user has operand bundles that
/// may read from the heap.
bool hasReadingOperandBundles() const {
// Implementation note: this is a conservative implementation of operand
@@ -1417,7 +1477,7 @@ public:
return hasOperandBundles();
}
- /// \brief Return true if this operand bundle user has operand bundles that
+ /// Return true if this operand bundle user has operand bundles that
/// may write to the heap.
bool hasClobberingOperandBundles() const {
for (auto &BOI : bundle_op_infos()) {
@@ -1433,7 +1493,7 @@ public:
return false;
}
- /// \brief Return true if the bundle operand at index \p OpIdx has the
+ /// Return true if the bundle operand at index \p OpIdx has the
/// attribute \p A.
bool bundleOperandHasAttr(unsigned OpIdx, Attribute::AttrKind A) const {
auto &BOI = getBundleOpInfoForOperand(OpIdx);
@@ -1441,7 +1501,7 @@ public:
return OBU.operandHasAttr(OpIdx - BOI.Begin, A);
}
- /// \brief Return true if \p Other has the same sequence of operand bundle
+ /// Return true if \p Other has the same sequence of operand bundle
/// tags with the same number of operands on each one of them as this
/// OperandBundleUser.
bool hasIdenticalOperandBundleSchema(
@@ -1453,7 +1513,7 @@ public:
Other.bundle_op_info_begin());
}
- /// \brief Return true if this operand bundle user contains operand bundles
+ /// Return true if this operand bundle user contains operand bundles
/// with tags other than those specified in \p IDs.
bool hasOperandBundlesOtherThan(ArrayRef<uint32_t> IDs) const {
for (unsigned i = 0, e = getNumOperandBundles(); i != e; ++i) {
@@ -1465,7 +1525,7 @@ public:
}
protected:
- /// \brief Is the function attribute S disallowed by some operand bundle on
+ /// Is the function attribute S disallowed by some operand bundle on
/// this operand bundle user?
bool isFnAttrDisallowedByOpBundle(StringRef S) const {
// Operand bundles only possibly disallow readnone, readonly and argmemonly
@@ -1473,7 +1533,7 @@ protected:
return false;
}
- /// \brief Is the function attribute A disallowed by some operand bundle on
+ /// Is the function attribute A disallowed by some operand bundle on
/// this operand bundle user?
bool isFnAttrDisallowedByOpBundle(Attribute::AttrKind A) const {
switch (A) {
@@ -1499,18 +1559,18 @@ protected:
llvm_unreachable("switch has a default case!");
}
- /// \brief Used to keep track of an operand bundle. See the main comment on
+ /// Used to keep track of an operand bundle. See the main comment on
/// OperandBundleUser above.
struct BundleOpInfo {
- /// \brief The operand bundle tag, interned by
+ /// The operand bundle tag, interned by
/// LLVMContextImpl::getOrInsertBundleTag.
StringMapEntry<uint32_t> *Tag;
- /// \brief The index in the Use& vector where operands for this operand
+ /// The index in the Use& vector where operands for this operand
/// bundle starts.
uint32_t Begin;
- /// \brief The index in the Use& vector where operands for this operand
+ /// The index in the Use& vector where operands for this operand
/// bundle ends.
uint32_t End;
@@ -1519,7 +1579,7 @@ protected:
}
};
- /// \brief Simple helper function to map a BundleOpInfo to an
+ /// Simple helper function to map a BundleOpInfo to an
/// OperandBundleUse.
OperandBundleUse
operandBundleFromBundleOpInfo(const BundleOpInfo &BOI) const {
@@ -1531,7 +1591,7 @@ protected:
using bundle_op_iterator = BundleOpInfo *;
using const_bundle_op_iterator = const BundleOpInfo *;
- /// \brief Return the start of the list of BundleOpInfo instances associated
+ /// Return the start of the list of BundleOpInfo instances associated
/// with this OperandBundleUser.
bundle_op_iterator bundle_op_info_begin() {
if (!static_cast<InstrTy *>(this)->hasDescriptor())
@@ -1541,7 +1601,7 @@ protected:
return reinterpret_cast<bundle_op_iterator>(BytesBegin);
}
- /// \brief Return the start of the list of BundleOpInfo instances associated
+ /// Return the start of the list of BundleOpInfo instances associated
/// with this OperandBundleUser.
const_bundle_op_iterator bundle_op_info_begin() const {
auto *NonConstThis =
@@ -1549,7 +1609,7 @@ protected:
return NonConstThis->bundle_op_info_begin();
}
- /// \brief Return the end of the list of BundleOpInfo instances associated
+ /// Return the end of the list of BundleOpInfo instances associated
/// with this OperandBundleUser.
bundle_op_iterator bundle_op_info_end() {
if (!static_cast<InstrTy *>(this)->hasDescriptor())
@@ -1559,7 +1619,7 @@ protected:
return reinterpret_cast<bundle_op_iterator>(BytesEnd);
}
- /// \brief Return the end of the list of BundleOpInfo instances associated
+ /// Return the end of the list of BundleOpInfo instances associated
/// with this OperandBundleUser.
const_bundle_op_iterator bundle_op_info_end() const {
auto *NonConstThis =
@@ -1567,17 +1627,17 @@ protected:
return NonConstThis->bundle_op_info_end();
}
- /// \brief Return the range [\p bundle_op_info_begin, \p bundle_op_info_end).
+ /// Return the range [\p bundle_op_info_begin, \p bundle_op_info_end).
iterator_range<bundle_op_iterator> bundle_op_infos() {
return make_range(bundle_op_info_begin(), bundle_op_info_end());
}
- /// \brief Return the range [\p bundle_op_info_begin, \p bundle_op_info_end).
+ /// Return the range [\p bundle_op_info_begin, \p bundle_op_info_end).
iterator_range<const_bundle_op_iterator> bundle_op_infos() const {
return make_range(bundle_op_info_begin(), bundle_op_info_end());
}
- /// \brief Populate the BundleOpInfo instances and the Use& vector from \p
+ /// Populate the BundleOpInfo instances and the Use& vector from \p
/// Bundles. Return the op_iterator pointing to the Use& one past the last
/// bundle operand use.
///
@@ -1608,7 +1668,7 @@ protected:
return It;
}
- /// \brief Return the BundleOpInfo for the operand at index OpIdx.
+ /// Return the BundleOpInfo for the operand at index OpIdx.
///
/// It is an error to call this with an OpIdx that does not correspond to a
/// bundle operand.
@@ -1620,7 +1680,7 @@ protected:
llvm_unreachable("Did not find operand bundle for operand!");
}
- /// \brief Return the total number of values used in \p Bundles.
+ /// Return the total number of values used in \p Bundles.
static unsigned CountBundleInputs(ArrayRef<OperandBundleDef> Bundles) {
unsigned Total = 0;
for (auto &B : Bundles)
diff --git a/contrib/llvm/include/llvm/IR/Instruction.h b/contrib/llvm/include/llvm/IR/Instruction.h
index 6af9cbfae5de..a3bf25056ee5 100644
--- a/contrib/llvm/include/llvm/IR/Instruction.h
+++ b/contrib/llvm/include/llvm/IR/Instruction.h
@@ -128,6 +128,7 @@ public:
const char *getOpcodeName() const { return getOpcodeName(getOpcode()); }
bool isTerminator() const { return isTerminator(getOpcode()); }
bool isBinaryOp() const { return isBinaryOp(getOpcode()); }
+ bool isIntDivRem() const { return isIntDivRem(getOpcode()); }
bool isShift() { return isShift(getOpcode()); }
bool isCast() const { return isCast(getOpcode()); }
bool isFuncletPad() const { return isFuncletPad(getOpcode()); }
@@ -142,6 +143,10 @@ public:
return Opcode >= BinaryOpsBegin && Opcode < BinaryOpsEnd;
}
+ static inline bool isIntDivRem(unsigned Opcode) {
+ return Opcode == UDiv || Opcode == SDiv || Opcode == URem || Opcode == SRem;
+ }
+
/// Determine if the Opcode is one of the shift instructions.
static inline bool isShift(unsigned Opcode) {
return Opcode >= Shl && Opcode <= AShr;
@@ -284,7 +289,7 @@ public:
/// Return the debug location for this node as a DebugLoc.
const DebugLoc &getDebugLoc() const { return DbgLoc; }
- /// Set or clear the nsw flag on this instruction, which must be an operator
+ /// Set or clear the nuw flag on this instruction, which must be an operator
/// which supports this flag. See LangRef.html for the meaning of this flag.
void setHasNoUnsignedWrap(bool b = true);
@@ -535,6 +540,14 @@ public:
/// matters, isSafeToSpeculativelyExecute may be more appropriate.
bool mayHaveSideEffects() const { return mayWriteToMemory() || mayThrow(); }
+ /// Return true if the instruction can be removed if the result is unused.
+ ///
+ /// When constant folding some instructions cannot be removed even if their
+ /// results are unused. Specifically terminator instructions and calls that
+ /// may have side effects cannot be removed without semantically changing the
+ /// generated program.
+ bool isSafeToRemove() const;
+
/// Return true if the instruction is a variety of EH-block.
bool isEHPad() const {
switch (getOpcode()) {
@@ -548,6 +561,14 @@ public:
}
}
+ /// Return a pointer to the next non-debug instruction in the same basic
+ /// block as 'this', or nullptr if no such instruction exists.
+ const Instruction *getNextNonDebugInstruction() const;
+ Instruction *getNextNonDebugInstruction() {
+ return const_cast<Instruction *>(
+ static_cast<const Instruction *>(this)->getNextNonDebugInstruction());
+ }
+
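// Usage sketch combining the two additions above (illustrative, not part of
// the patch): walk forward from an instruction, skipping llvm.dbg.*
// intrinsics, and count integer division/remainder operations.
static unsigned countIntDivRemFrom(const Instruction *I) {
  unsigned N = 0;
  for (const Instruction *Cur = I; Cur;
       Cur = Cur->getNextNonDebugInstruction())
    if (Cur->isIntDivRem())
      ++N;
  return N;
}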
/// Create a copy of 'this' instruction that is identical in all ways except
/// the following:
/// * The instruction has no parent
@@ -582,7 +603,7 @@ public:
/// be identical.
/// @returns true if the specified instruction is the same operation as
/// the current one.
- /// @brief Determine if one instruction is the same operation as another.
+ /// Determine if one instruction is the same operation as another.
bool isSameOperationAs(const Instruction *I, unsigned flags = 0) const;
/// Return true if there are any uses of this instruction in blocks other than
diff --git a/contrib/llvm/include/llvm/IR/Instructions.h b/contrib/llvm/include/llvm/IR/Instructions.h
index c1122d137f24..a2cb84a071f2 100644
--- a/contrib/llvm/include/llvm/IR/Instructions.h
+++ b/contrib/llvm/include/llvm/IR/Instructions.h
@@ -98,6 +98,10 @@ public:
return cast<PointerType>(Instruction::getType());
}
+ /// Get allocation size in bits. Returns None if size can't be determined,
+ /// e.g. in case of a VLA.
+ Optional<uint64_t> getAllocationSizeInBits(const DataLayout &DL) const;
+
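// Usage sketch of the new query (illustrative, not part of the patch; the
// helper name is made up): None is returned for variable-length allocas, so
// the result must be checked before use.
static bool allocaFitsInRegister(const AllocaInst *AI, const DataLayout &DL) {
  if (Optional<uint64_t> Bits = AI->getAllocationSizeInBits(DL))
    return *Bits <= 64;
  return false; // VLA: size unknown at compile time
}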
/// Return the type that is being allocated by the instruction.
Type *getAllocatedType() const { return AllocatedType; }
/// for use only in special circumstances that need to generically
@@ -1346,224 +1350,71 @@ public:
}
};
-//===----------------------------------------------------------------------===//
-/// This class represents a function call, abstracting a target
-/// machine's calling convention. This class uses low bit of the SubClassData
-/// field to indicate whether or not this is a tail call. The rest of the bits
-/// hold the calling convention of the call.
-///
-class CallInst : public Instruction,
- public OperandBundleUser<CallInst, User::op_iterator> {
- friend class OperandBundleUser<CallInst, User::op_iterator>;
-
- AttributeList Attrs; ///< parameter attributes for call
- FunctionType *FTy;
-
- CallInst(const CallInst &CI);
-
- /// Construct a CallInst given a range of arguments.
- /// Construct a CallInst from a range of arguments
- inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
- Instruction *InsertBefore);
+class CallInst;
+class InvokeInst;
- inline CallInst(Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
- Instruction *InsertBefore)
- : CallInst(cast<FunctionType>(
- cast<PointerType>(Func->getType())->getElementType()),
- Func, Args, Bundles, NameStr, InsertBefore) {}
-
- inline CallInst(Value *Func, ArrayRef<Value *> Args, const Twine &NameStr,
- Instruction *InsertBefore)
- : CallInst(Func, Args, None, NameStr, InsertBefore) {}
-
- /// Construct a CallInst given a range of arguments.
- /// Construct a CallInst from a range of arguments
- inline CallInst(Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
- BasicBlock *InsertAtEnd);
-
- explicit CallInst(Value *F, const Twine &NameStr,
- Instruction *InsertBefore);
-
- CallInst(Value *F, const Twine &NameStr, BasicBlock *InsertAtEnd);
-
- void init(Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
- init(cast<FunctionType>(
- cast<PointerType>(Func->getType())->getElementType()),
- Func, Args, Bundles, NameStr);
- }
- void init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
- void init(Value *Func, const Twine &NameStr);
+template <class T> struct CallBaseParent { using type = Instruction; };
- bool hasDescriptor() const { return HasDescriptor; }
+template <> struct CallBaseParent<InvokeInst> { using type = TerminatorInst; };
+//===----------------------------------------------------------------------===//
+/// Base class for all callable instructions (InvokeInst and CallInst)
+/// Holds everything related to calling a function, abstracting from the base
+/// type @p BaseInstTy and the concrete instruction @p InstTy
+///
+template <class InstTy>
+class CallBase : public CallBaseParent<InstTy>::type,
+ public OperandBundleUser<InstTy, User::op_iterator> {
protected:
- // Note: Instruction needs to be a friend here to call cloneImpl.
- friend class Instruction;
-
- CallInst *cloneImpl() const;
-
-public:
- static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles = None,
- const Twine &NameStr = "",
- Instruction *InsertBefore = nullptr) {
- return Create(cast<FunctionType>(
- cast<PointerType>(Func->getType())->getElementType()),
- Func, Args, Bundles, NameStr, InsertBefore);
- }
-
- static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
- const Twine &NameStr,
- Instruction *InsertBefore = nullptr) {
- return Create(cast<FunctionType>(
- cast<PointerType>(Func->getType())->getElementType()),
- Func, Args, None, NameStr, InsertBefore);
- }
-
- static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
- const Twine &NameStr,
- Instruction *InsertBefore = nullptr) {
- return new (unsigned(Args.size() + 1))
- CallInst(Ty, Func, Args, None, NameStr, InsertBefore);
- }
-
- static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles = None,
- const Twine &NameStr = "",
- Instruction *InsertBefore = nullptr) {
- const unsigned TotalOps =
- unsigned(Args.size()) + CountBundleInputs(Bundles) + 1;
- const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
-
- return new (TotalOps, DescriptorBytes)
- CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore);
- }
+ AttributeList Attrs; ///< parameter attributes for callable
+ FunctionType *FTy;
+ using BaseInstTy = typename CallBaseParent<InstTy>::type;
- static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles,
- const Twine &NameStr, BasicBlock *InsertAtEnd) {
- const unsigned TotalOps =
- unsigned(Args.size()) + CountBundleInputs(Bundles) + 1;
- const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+ template <class... ArgsTy>
+ CallBase(AttributeList const &A, FunctionType *FT, ArgsTy &&... Args)
+ : BaseInstTy(std::forward<ArgsTy>(Args)...), Attrs(A), FTy(FT) {}
+ bool hasDescriptor() const { return Value::HasDescriptor; }
- return new (TotalOps, DescriptorBytes)
- CallInst(Func, Args, Bundles, NameStr, InsertAtEnd);
- }
+ using BaseInstTy::BaseInstTy;
- static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
- const Twine &NameStr, BasicBlock *InsertAtEnd) {
- return new (unsigned(Args.size() + 1))
- CallInst(Func, Args, None, NameStr, InsertAtEnd);
- }
+ using OperandBundleUser<InstTy,
+ User::op_iterator>::isFnAttrDisallowedByOpBundle;
+ using OperandBundleUser<InstTy, User::op_iterator>::getNumTotalBundleOperands;
+ using OperandBundleUser<InstTy, User::op_iterator>::bundleOperandHasAttr;
+ using Instruction::getSubclassDataFromInstruction;
+ using Instruction::setInstructionSubclassData;
- static CallInst *Create(Value *F, const Twine &NameStr = "",
- Instruction *InsertBefore = nullptr) {
- return new(1) CallInst(F, NameStr, InsertBefore);
- }
+public:
+ using Instruction::getContext;
+ using OperandBundleUser<InstTy, User::op_iterator>::hasOperandBundles;
+ using OperandBundleUser<InstTy,
+ User::op_iterator>::getBundleOperandsStartIndex;
- static CallInst *Create(Value *F, const Twine &NameStr,
- BasicBlock *InsertAtEnd) {
- return new(1) CallInst(F, NameStr, InsertAtEnd);
+ static bool classof(const Instruction *I) {
+ llvm_unreachable(
+ "CallBase is not meant to be used as part of the classof hierarchy");
}
- /// Create a clone of \p CI with a different set of operand bundles and
- /// insert it before \p InsertPt.
+public:
+ /// Return the parameter attributes for this call.
///
- /// The returned call instruction is identical \p CI in every way except that
- /// the operand bundles for the new instruction are set to the operand bundles
- /// in \p Bundles.
- static CallInst *Create(CallInst *CI, ArrayRef<OperandBundleDef> Bundles,
- Instruction *InsertPt = nullptr);
+ AttributeList getAttributes() const { return Attrs; }
- /// Generate the IR for a call to malloc:
- /// 1. Compute the malloc call's argument as the specified type's size,
- /// possibly multiplied by the array size if the array size is not
- /// constant 1.
- /// 2. Call malloc with that argument.
- /// 3. Bitcast the result of the malloc call to the specified type.
- static Instruction *CreateMalloc(Instruction *InsertBefore,
- Type *IntPtrTy, Type *AllocTy,
- Value *AllocSize, Value *ArraySize = nullptr,
- Function* MallocF = nullptr,
- const Twine &Name = "");
- static Instruction *CreateMalloc(BasicBlock *InsertAtEnd,
- Type *IntPtrTy, Type *AllocTy,
- Value *AllocSize, Value *ArraySize = nullptr,
- Function* MallocF = nullptr,
- const Twine &Name = "");
- static Instruction *CreateMalloc(Instruction *InsertBefore,
- Type *IntPtrTy, Type *AllocTy,
- Value *AllocSize, Value *ArraySize = nullptr,
- ArrayRef<OperandBundleDef> Bundles = None,
- Function* MallocF = nullptr,
- const Twine &Name = "");
- static Instruction *CreateMalloc(BasicBlock *InsertAtEnd,
- Type *IntPtrTy, Type *AllocTy,
- Value *AllocSize, Value *ArraySize = nullptr,
- ArrayRef<OperandBundleDef> Bundles = None,
- Function* MallocF = nullptr,
- const Twine &Name = "");
- /// Generate the IR for a call to the builtin free function.
- static Instruction *CreateFree(Value *Source,
- Instruction *InsertBefore);
- static Instruction *CreateFree(Value *Source,
- BasicBlock *InsertAtEnd);
- static Instruction *CreateFree(Value *Source,
- ArrayRef<OperandBundleDef> Bundles,
- Instruction *InsertBefore);
- static Instruction *CreateFree(Value *Source,
- ArrayRef<OperandBundleDef> Bundles,
- BasicBlock *InsertAtEnd);
+ /// Set the parameter attributes for this call.
+ ///
+ void setAttributes(AttributeList A) { Attrs = A; }
FunctionType *getFunctionType() const { return FTy; }
void mutateFunctionType(FunctionType *FTy) {
- mutateType(FTy->getReturnType());
+ Value::mutateType(FTy->getReturnType());
this->FTy = FTy;
}
- // Note that 'musttail' implies 'tail'.
- enum TailCallKind { TCK_None = 0, TCK_Tail = 1, TCK_MustTail = 2,
- TCK_NoTail = 3 };
- TailCallKind getTailCallKind() const {
- return TailCallKind(getSubclassDataFromInstruction() & 3);
- }
-
- bool isTailCall() const {
- unsigned Kind = getSubclassDataFromInstruction() & 3;
- return Kind == TCK_Tail || Kind == TCK_MustTail;
- }
-
- bool isMustTailCall() const {
- return (getSubclassDataFromInstruction() & 3) == TCK_MustTail;
- }
-
- bool isNoTailCall() const {
- return (getSubclassDataFromInstruction() & 3) == TCK_NoTail;
- }
-
- void setTailCall(bool isTC = true) {
- setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
- unsigned(isTC ? TCK_Tail : TCK_None));
- }
-
- void setTailCallKind(TailCallKind TCK) {
- setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
- unsigned(TCK));
- }
-
- /// Provide fast operand accessors
- DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
-
/// Return the number of call arguments.
///
unsigned getNumArgOperands() const {
- return getNumOperands() - getNumTotalBundleOperands() - 1;
+ return getNumOperands() - getNumTotalBundleOperands() - InstTy::ArgOffset;
}
/// getArgOperand/setArgOperand - Return/set the i-th call argument.
@@ -1578,46 +1429,112 @@ public:
}
/// Return the iterator pointing to the beginning of the argument list.
- op_iterator arg_begin() { return op_begin(); }
+ User::op_iterator arg_begin() { return op_begin(); }
/// Return the iterator pointing to the end of the argument list.
- op_iterator arg_end() {
+ User::op_iterator arg_end() {
// [ call args ], [ operand bundles ], callee
- return op_end() - getNumTotalBundleOperands() - 1;
+ return op_end() - getNumTotalBundleOperands() - InstTy::ArgOffset;
}
/// Iteration adapter for range-for loops.
- iterator_range<op_iterator> arg_operands() {
+ iterator_range<User::op_iterator> arg_operands() {
return make_range(arg_begin(), arg_end());
}
/// Return the iterator pointing to the beginning of the argument list.
- const_op_iterator arg_begin() const { return op_begin(); }
+ User::const_op_iterator arg_begin() const { return op_begin(); }
/// Return the iterator pointing to the end of the argument list.
- const_op_iterator arg_end() const {
+ User::const_op_iterator arg_end() const {
// [ call args ], [ operand bundles ], callee
- return op_end() - getNumTotalBundleOperands() - 1;
+ return op_end() - getNumTotalBundleOperands() - InstTy::ArgOffset;
}
/// Iteration adapter for range-for loops.
- iterator_range<const_op_iterator> arg_operands() const {
+ iterator_range<User::const_op_iterator> arg_operands() const {
return make_range(arg_begin(), arg_end());
}
/// Wrappers for getting the \c Use of a call argument.
const Use &getArgOperandUse(unsigned i) const {
assert(i < getNumArgOperands() && "Out of bounds!");
- return getOperandUse(i);
+ return User::getOperandUse(i);
}
Use &getArgOperandUse(unsigned i) {
assert(i < getNumArgOperands() && "Out of bounds!");
- return getOperandUse(i);
+ return User::getOperandUse(i);
}
/// If one of the arguments has the 'returned' attribute, return its
/// operand value. Otherwise, return nullptr.
- Value *getReturnedArgOperand() const;
+ Value *getReturnedArgOperand() const {
+ unsigned Index;
+
+ if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ return getArgOperand(Index - AttributeList::FirstArgIndex);
+ if (const Function *F = getCalledFunction())
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
+ Index)
+ return getArgOperand(Index - AttributeList::FirstArgIndex);
+
+ return nullptr;
+ }
+
+ User::op_iterator op_begin() {
+ return OperandTraits<CallBase>::op_begin(this);
+ }
+
+ User::const_op_iterator op_begin() const {
+ return OperandTraits<CallBase>::op_begin(const_cast<CallBase *>(this));
+ }
+
+ User::op_iterator op_end() { return OperandTraits<CallBase>::op_end(this); }
+
+ User::const_op_iterator op_end() const {
+ return OperandTraits<CallBase>::op_end(const_cast<CallBase *>(this));
+ }
+
+ Value *getOperand(unsigned i_nocapture) const {
+ assert(i_nocapture < OperandTraits<CallBase>::operands(this) &&
+ "getOperand() out of range!");
+ return cast_or_null<Value>(OperandTraits<CallBase>::op_begin(
+ const_cast<CallBase *>(this))[i_nocapture]
+ .get());
+ }
+
+ void setOperand(unsigned i_nocapture, Value *Val_nocapture) {
+ assert(i_nocapture < OperandTraits<CallBase>::operands(this) &&
+ "setOperand() out of range!");
+ OperandTraits<CallBase>::op_begin(this)[i_nocapture] = Val_nocapture;
+ }
+
+ unsigned getNumOperands() const {
+ return OperandTraits<CallBase>::operands(this);
+ }
+ template <int Idx_nocapture> Use &Op() {
+ return User::OpFrom<Idx_nocapture>(this);
+ }
+ template <int Idx_nocapture> const Use &Op() const {
+ return User::OpFrom<Idx_nocapture>(this);
+ }
+
+ /// Return the function called, or null if this is an
+ /// indirect function invocation.
+ ///
+ Function *getCalledFunction() const {
+ return dyn_cast<Function>(Op<-InstTy::ArgOffset>());
+ }
+
+ /// Determine whether this call has the given attribute.
+ bool hasFnAttr(Attribute::AttrKind Kind) const {
+ assert(Kind != Attribute::NoBuiltin &&
+ "Use CallBase::isNoBuiltin() to check for Attribute::NoBuiltin");
+ return hasFnAttrImpl(Kind);
+ }
+
+ /// Determine whether this call has the given attribute.
+ bool hasFnAttr(StringRef Kind) const { return hasFnAttrImpl(Kind); }
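// Usage sketch (illustrative, not part of the patch; requires
// llvm/Support/raw_ostream.h): querying call-site properties through the
// shared CallBase interface, which works identically for InvokeInst.
static void describeCallSite(const CallInst *CI) {
  if (const Function *Callee = CI->getCalledFunction())
    errs() << "direct call to " << Callee->getName() << "\n";
  if (CI->hasFnAttr(Attribute::ReadOnly))
    errs() << "  only reads memory\n";
  for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I)
    if (CI->paramHasAttr(I, Attribute::ByVal))
      errs() << "  argument " << I << " is passed byval\n";
}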
/// getCallingConv/setCallingConv - Get or set the calling convention of this
/// function call.
@@ -1631,62 +1548,103 @@ public:
(ID << 2));
}
- /// Return the parameter attributes for this call.
- ///
- AttributeList getAttributes() const { return Attrs; }
-
- /// Set the parameter attributes for this call.
- ///
- void setAttributes(AttributeList A) { Attrs = A; }
/// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute::AttrKind Kind);
+ void addAttribute(unsigned i, Attribute::AttrKind Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+ }
/// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute Attr);
+ void addAttribute(unsigned i, Attribute Attr) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addAttribute(getContext(), i, Attr);
+ setAttributes(PAL);
+ }
/// Adds the attribute to the indicated argument
- void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
+ void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+ }
/// Adds the attribute to the indicated argument
- void addParamAttr(unsigned ArgNo, Attribute Attr);
+ void addParamAttr(unsigned ArgNo, Attribute Attr) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
+ setAttributes(PAL);
+ }
/// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, Attribute::AttrKind Kind);
+ void removeAttribute(unsigned i, Attribute::AttrKind Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+ }
/// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, StringRef Kind);
+ void removeAttribute(unsigned i, StringRef Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+ }
/// Removes the attribute from the given argument
- void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
+ void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+ }
/// Removes the attribute from the given argument
- void removeParamAttr(unsigned ArgNo, StringRef Kind);
+ void removeParamAttr(unsigned ArgNo, StringRef Kind) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+ }
/// adds the dereferenceable attribute to the list of attributes.
- void addDereferenceableAttr(unsigned i, uint64_t Bytes);
+ void addDereferenceableAttr(unsigned i, uint64_t Bytes) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
+ setAttributes(PAL);
+ }
/// adds the dereferenceable_or_null attribute to the list of
/// attributes.
- void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes);
-
- /// Determine whether this call has the given attribute.
- bool hasFnAttr(Attribute::AttrKind Kind) const {
- assert(Kind != Attribute::NoBuiltin &&
- "Use CallInst::isNoBuiltin() to check for Attribute::NoBuiltin");
- return hasFnAttrImpl(Kind);
- }
-
- /// Determine whether this call has the given attribute.
- bool hasFnAttr(StringRef Kind) const {
- return hasFnAttrImpl(Kind);
+ void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
+ setAttributes(PAL);
}
/// Determine whether the return value has the given attribute.
- bool hasRetAttr(Attribute::AttrKind Kind) const;
+ bool hasRetAttr(Attribute::AttrKind Kind) const {
+ if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
+ return true;
+
+ // Look at the callee, if available.
+ if (const Function *F = getCalledFunction())
+ return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
+ return false;
+ }
/// Determine whether the argument or parameter has the given attribute.
- bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const;
+ bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
+ assert(ArgNo < getNumArgOperands() && "Param index out of bounds!");
+
+ if (Attrs.hasParamAttribute(ArgNo, Kind))
+ return true;
+ if (const Function *F = getCalledFunction())
+ return F->getAttributes().hasParamAttribute(ArgNo, Kind);
+ return false;
+ }
/// Get the attribute of a given kind at a position.
Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const {
@@ -1709,7 +1667,6 @@ public:
assert(ArgNo < getNumArgOperands() && "Out of bounds");
return getAttributes().getParamAttr(ArgNo, Kind);
}
-
/// Return true if the data operand at index \p i has the attribute \p
/// A.
///
@@ -1723,7 +1680,28 @@ public:
/// \p i in [1, arg_size + 1) -> argument number (\p i - 1)
/// \p i in [arg_size + 1, data_operand_size + 1) -> bundle operand at index
/// (\p i - 1) in the operand list.
- bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const;
+ bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const {
+ // There are getNumOperands() - (InstTy::ArgOffset - 1) data operands.
+ // The last operand is the callee.
+ assert(i < (getNumOperands() - InstTy::ArgOffset + 1) &&
+ "Data operand index out of bounds!");
+
+ // The attribute A can either be directly specified, if the operand in
+ // question is a call argument; or be indirectly implied by the kind of its
+ // containing operand bundle, if the operand is a bundle operand.
+
+ if (i == AttributeList::ReturnIndex)
+ return hasRetAttr(Kind);
+
+ // FIXME: Avoid these i - 1 calculations and update the API to use
+ // zero-based indices.
+ if (i < (getNumArgOperands() + 1))
+ return paramHasAttr(i - 1, Kind);
+
+ assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
+ "Must be either a call argument or an operand bundle!");
+ return bundleOperandHasAttr(i - 1, Kind);
+ }
/// Extract the alignment of the return value.
unsigned getRetAlignment() const { return Attrs.getRetAlignment(); }
@@ -1745,7 +1723,7 @@ public:
return Attrs.getDereferenceableOrNullBytes(i);
}
- /// @brief Determine if the return value is marked with NoAlias attribute.
+ /// Determine if the return value is marked with NoAlias attribute.
bool returnDoesNotAlias() const {
return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
}
@@ -1765,15 +1743,6 @@ public:
void setIsNoInline() {
addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
}
-
- /// Return true if the call can return twice
- bool canReturnTwice() const {
- return hasFnAttr(Attribute::ReturnsTwice);
- }
- void setCanReturnTwice() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice);
- }
-
/// Determine if the call does not access memory.
bool doesNotAccessMemory() const {
return hasFnAttr(Attribute::ReadNone);
@@ -1798,7 +1767,7 @@ public:
addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly);
}
- /// @brief Determine if the call can access memmory only using pointers based
+ /// Determine if the call can access memory only using pointers based
/// on its arguments.
bool onlyAccessesArgMemory() const {
return hasFnAttr(Attribute::ArgMemOnly);
@@ -1807,7 +1776,7 @@ public:
addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly);
}
- /// @brief Determine if the function may only access memory that is
+ /// Determine if the function may only access memory that is
/// inaccessible from the IR.
bool onlyAccessesInaccessibleMemory() const {
return hasFnAttr(Attribute::InaccessibleMemOnly);
@@ -1816,7 +1785,7 @@ public:
addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly);
}
- /// @brief Determine if the function may only access memory that is
+ /// Determine if the function may only access memory that is
/// either inaccessible from the IR or pointed to by its arguments.
bool onlyAccessesInaccessibleMemOrArgMem() const {
return hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
@@ -1824,26 +1793,28 @@ public:
void setOnlyAccessesInaccessibleMemOrArgMem() {
addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOrArgMemOnly);
}
-
/// Determine if the call cannot return.
bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
void setDoesNotReturn() {
addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
}
+ /// Determine if the call should not perform indirect branch tracking.
+ bool doesNoCfCheck() const { return hasFnAttr(Attribute::NoCfCheck); }
+
/// Determine if the call cannot unwind.
bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
void setDoesNotThrow() {
addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
}
- /// Determine if the call cannot be duplicated.
+ /// Determine if the call or invoke cannot be duplicated.
bool cannotDuplicate() const {return hasFnAttr(Attribute::NoDuplicate); }
void setCannotDuplicate() {
addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate);
}
- /// Determine if the call is convergent
+ /// Determine if the call or invoke is convergent
bool isConvergent() const { return hasFnAttr(Attribute::Convergent); }
void setConvergent() {
addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
@@ -1866,18 +1837,10 @@ public:
bool hasByValArgument() const {
return Attrs.hasAttrSomewhere(Attribute::ByVal);
}
-
- /// Return the function called, or null if this is an
- /// indirect function invocation.
- ///
- Function *getCalledFunction() const {
- return dyn_cast<Function>(Op<-1>());
- }
-
/// Get a pointer to the function that is invoked by this
/// instruction.
- const Value *getCalledValue() const { return Op<-1>(); }
- Value *getCalledValue() { return Op<-1>(); }
+ const Value *getCalledValue() const { return Op<-InstTy::ArgOffset>(); }
+ Value *getCalledValue() { return Op<-InstTy::ArgOffset>(); }
/// Set the function called.
void setCalledFunction(Value* Fn) {
@@ -1889,23 +1852,10 @@ public:
this->FTy = FTy;
assert(FTy == cast<FunctionType>(
cast<PointerType>(Fn->getType())->getElementType()));
- Op<-1>() = Fn;
+ Op<-InstTy::ArgOffset>() = Fn;
}
- /// Check if this call is an inline asm statement.
- bool isInlineAsm() const {
- return isa<InlineAsm>(Op<-1>());
- }
-
- // Methods for support type inquiry through isa, cast, and dyn_cast:
- static bool classof(const Instruction *I) {
- return I->getOpcode() == Instruction::Call;
- }
- static bool classof(const Value *V) {
- return isa<Instruction>(V) && classof(cast<Instruction>(V));
- }
-
-private:
+protected:
template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind))
return true;
@@ -1920,7 +1870,227 @@ private:
Kind);
return false;
}
+};
+
+//===----------------------------------------------------------------------===//
+/// This class represents a function call, abstracting a target
+/// machine's calling convention. This class uses low bit of the SubClassData
+/// field to indicate whether or not this is a tail call. The rest of the bits
+/// hold the calling convention of the call.
+///
+class CallInst : public CallBase<CallInst> {
+ friend class OperandBundleUser<CallInst, User::op_iterator>;
+
+ CallInst(const CallInst &CI);
+
+ /// Construct a CallInst given a range of arguments.
+ /// Construct a CallInst from a range of arguments
+ inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
+ Instruction *InsertBefore);
+
+ inline CallInst(Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
+ Instruction *InsertBefore)
+ : CallInst(cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType()),
+ Func, Args, Bundles, NameStr, InsertBefore) {}
+
+ inline CallInst(Value *Func, ArrayRef<Value *> Args, const Twine &NameStr,
+ Instruction *InsertBefore)
+ : CallInst(Func, Args, None, NameStr, InsertBefore) {}
+
+ /// Construct a CallInst given a range of arguments.
+ /// Construct a CallInst from a range of arguments
+ inline CallInst(Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
+ BasicBlock *InsertAtEnd);
+
+ explicit CallInst(Value *F, const Twine &NameStr, Instruction *InsertBefore);
+
+ CallInst(Value *F, const Twine &NameStr, BasicBlock *InsertAtEnd);
+
+ void init(Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
+ init(cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType()),
+ Func, Args, Bundles, NameStr);
+ }
+ void init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
+ void init(Value *Func, const Twine &NameStr);
+
+protected:
+ // Note: Instruction needs to be a friend here to call cloneImpl.
+ friend class Instruction;
+
+ CallInst *cloneImpl() const;
+
+public:
+ static constexpr int ArgOffset = 1;
+
+ static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles = None,
+ const Twine &NameStr = "",
+ Instruction *InsertBefore = nullptr) {
+ return Create(cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType()),
+ Func, Args, Bundles, NameStr, InsertBefore);
+ }
+
+ static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
+ const Twine &NameStr,
+ Instruction *InsertBefore = nullptr) {
+ return Create(cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType()),
+ Func, Args, None, NameStr, InsertBefore);
+ }
+
+ static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
+ const Twine &NameStr,
+ Instruction *InsertBefore = nullptr) {
+ return new (unsigned(Args.size() + 1))
+ CallInst(Ty, Func, Args, None, NameStr, InsertBefore);
+ }
+
+ static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles = None,
+ const Twine &NameStr = "",
+ Instruction *InsertBefore = nullptr) {
+ const unsigned TotalOps =
+ unsigned(Args.size()) + CountBundleInputs(Bundles) + 1;
+ const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+
+ return new (TotalOps, DescriptorBytes)
+ CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore);
+ }
+
+ static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> Bundles,
+ const Twine &NameStr, BasicBlock *InsertAtEnd) {
+ const unsigned TotalOps =
+ unsigned(Args.size()) + CountBundleInputs(Bundles) + 1;
+ const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+
+ return new (TotalOps, DescriptorBytes)
+ CallInst(Func, Args, Bundles, NameStr, InsertAtEnd);
+ }
+
+ static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
+ const Twine &NameStr, BasicBlock *InsertAtEnd) {
+ return new (unsigned(Args.size() + 1))
+ CallInst(Func, Args, None, NameStr, InsertAtEnd);
+ }
+
+ static CallInst *Create(Value *F, const Twine &NameStr = "",
+ Instruction *InsertBefore = nullptr) {
+ return new (1) CallInst(F, NameStr, InsertBefore);
+ }
+
+ static CallInst *Create(Value *F, const Twine &NameStr,
+ BasicBlock *InsertAtEnd) {
+ return new (1) CallInst(F, NameStr, InsertAtEnd);
+ }
+
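
An editorial usage sketch (not part of the diff; Callee, Args and InsertPt are placeholders): the overloads that take an explicit FunctionType skip the pointer-element-type lookup that the Value-only overloads above perform internally.

  FunctionType *FTy = cast<FunctionType>(
      cast<PointerType>(Callee->getType())->getElementType());
  CallInst *CI = CallInst::Create(FTy, Callee, Args, "call", InsertPt);
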
+ /// Create a clone of \p CI with a different set of operand bundles and
+ /// insert it before \p InsertPt.
+ ///
+ /// The returned call instruction is identical to \p CI in every way except that
+ /// the operand bundles for the new instruction are set to the operand bundles
+ /// in \p Bundles.
+ static CallInst *Create(CallInst *CI, ArrayRef<OperandBundleDef> Bundles,
+ Instruction *InsertPt = nullptr);
+
+ /// Generate the IR for a call to malloc:
+ /// 1. Compute the malloc call's argument as the specified type's size,
+ /// possibly multiplied by the array size if the array size is not
+ /// constant 1.
+ /// 2. Call malloc with that argument.
+ /// 3. Bitcast the result of the malloc call to the specified type.
+ static Instruction *CreateMalloc(Instruction *InsertBefore, Type *IntPtrTy,
+ Type *AllocTy, Value *AllocSize,
+ Value *ArraySize = nullptr,
+ Function *MallocF = nullptr,
+ const Twine &Name = "");
+ static Instruction *CreateMalloc(BasicBlock *InsertAtEnd, Type *IntPtrTy,
+ Type *AllocTy, Value *AllocSize,
+ Value *ArraySize = nullptr,
+ Function *MallocF = nullptr,
+ const Twine &Name = "");
+ static Instruction *CreateMalloc(Instruction *InsertBefore, Type *IntPtrTy,
+ Type *AllocTy, Value *AllocSize,
+ Value *ArraySize = nullptr,
+ ArrayRef<OperandBundleDef> Bundles = None,
+ Function *MallocF = nullptr,
+ const Twine &Name = "");
+ static Instruction *CreateMalloc(BasicBlock *InsertAtEnd, Type *IntPtrTy,
+ Type *AllocTy, Value *AllocSize,
+ Value *ArraySize = nullptr,
+ ArrayRef<OperandBundleDef> Bundles = None,
+ Function *MallocF = nullptr,
+ const Twine &Name = "");
+ /// Generate the IR for a call to the builtin free function.
+ static Instruction *CreateFree(Value *Source, Instruction *InsertBefore);
+ static Instruction *CreateFree(Value *Source, BasicBlock *InsertAtEnd);
+ static Instruction *CreateFree(Value *Source,
+ ArrayRef<OperandBundleDef> Bundles,
+ Instruction *InsertBefore);
+ static Instruction *CreateFree(Value *Source,
+ ArrayRef<OperandBundleDef> Bundles,
+ BasicBlock *InsertAtEnd);
+
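
CreateMalloc follows the three steps listed in the comment above: compute the allocation size, call malloc, then bitcast the result to the requested type. A hedged editorial sketch of a caller, where DL, Ctx, N and InsertPt are assumed placeholders:

  Type *IntPtrTy = DL.getIntPtrType(Ctx);               // e.g. i64
  Type *AllocTy  = Type::getInt32Ty(Ctx);               // element type
  Value *AllocSize =
      ConstantInt::get(IntPtrTy, DL.getTypeAllocSize(AllocTy));
  Instruction *Malloc =
      CallInst::CreateMalloc(InsertPt, IntPtrTy, AllocTy, AllocSize,
                             /*ArraySize=*/N, /*MallocF=*/nullptr, "p");
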
+ // Note that 'musttail' implies 'tail'.
+ enum TailCallKind {
+ TCK_None = 0,
+ TCK_Tail = 1,
+ TCK_MustTail = 2,
+ TCK_NoTail = 3
+ };
+ TailCallKind getTailCallKind() const {
+ return TailCallKind(getSubclassDataFromInstruction() & 3);
+ }
+
+ bool isTailCall() const {
+ unsigned Kind = getSubclassDataFromInstruction() & 3;
+ return Kind == TCK_Tail || Kind == TCK_MustTail;
+ }
+
+ bool isMustTailCall() const {
+ return (getSubclassDataFromInstruction() & 3) == TCK_MustTail;
+ }
+
+ bool isNoTailCall() const {
+ return (getSubclassDataFromInstruction() & 3) == TCK_NoTail;
+ }
+
+ void setTailCall(bool isTC = true) {
+ setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
+ unsigned(isTC ? TCK_Tail : TCK_None));
+ }
+
+ void setTailCallKind(TailCallKind TCK) {
+ setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
+ unsigned(TCK));
+ }
+
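
The tail-call kind is packed into the low two bits of the instruction's subclass data, so callers go through the accessors above rather than touching the bits directly. A short editorial sketch (Callee, Args and InsertPt are placeholders):

  CallInst *CI = CallInst::Create(Callee, Args, "call", InsertPt);
  CI->setTailCallKind(CallInst::TCK_MustTail);  // stores TCK_MustTail (2) in the low bits
  assert(CI->isMustTailCall() && CI->isTailCall());
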
+ /// Return true if the call can return twice.
+ bool canReturnTwice() const { return hasFnAttr(Attribute::ReturnsTwice); }
+ void setCanReturnTwice() {
+ addAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice);
+ }
+
+ /// Check if this call is an inline asm statement.
+ bool isInlineAsm() const { return isa<InlineAsm>(Op<-1>()); }
+
+ // Methods for support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const Instruction *I) {
+ return I->getOpcode() == Instruction::Call;
+ }
+ static bool classof(const Value *V) {
+ return isa<Instruction>(V) && classof(cast<Instruction>(V));
+ }
+private:
// Shadow Instruction::setInstructionSubclassData with a private forwarding
// method so that subclasses cannot accidentally use it.
void setInstructionSubclassData(unsigned short D) {
@@ -1929,17 +2099,19 @@ private:
};
template <>
-struct OperandTraits<CallInst> : public VariadicOperandTraits<CallInst, 1> {
-};
+struct OperandTraits<CallBase<CallInst>>
+ : public VariadicOperandTraits<CallBase<CallInst>, 1> {};
CallInst::CallInst(Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
BasicBlock *InsertAtEnd)
- : Instruction(
- cast<FunctionType>(cast<PointerType>(Func->getType())
- ->getElementType())->getReturnType(),
- Instruction::Call, OperandTraits<CallInst>::op_end(this) -
- (Args.size() + CountBundleInputs(Bundles) + 1),
+ : CallBase<CallInst>(
+ cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType())
+ ->getReturnType(),
+ Instruction::Call,
+ OperandTraits<CallBase<CallInst>>::op_end(this) -
+ (Args.size() + CountBundleInputs(Bundles) + 1),
unsigned(Args.size() + CountBundleInputs(Bundles) + 1), InsertAtEnd) {
init(Func, Args, Bundles, NameStr);
}
@@ -1947,19 +2119,14 @@ CallInst::CallInst(Value *Func, ArrayRef<Value *> Args,
CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
Instruction *InsertBefore)
- : Instruction(Ty->getReturnType(), Instruction::Call,
- OperandTraits<CallInst>::op_end(this) -
- (Args.size() + CountBundleInputs(Bundles) + 1),
- unsigned(Args.size() + CountBundleInputs(Bundles) + 1),
- InsertBefore) {
+ : CallBase<CallInst>(Ty->getReturnType(), Instruction::Call,
+ OperandTraits<CallBase<CallInst>>::op_end(this) -
+ (Args.size() + CountBundleInputs(Bundles) + 1),
+ unsigned(Args.size() + CountBundleInputs(Bundles) + 1),
+ InsertBefore) {
init(Ty, Func, Args, Bundles, NameStr);
}
-// Note: if you get compile errors about private methods then
-// please update your code to use the high-level operand
-// interfaces. See line 943 above.
-DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallInst, Value)
-
//===----------------------------------------------------------------------===//
// SelectInst Class
//===----------------------------------------------------------------------===//
@@ -2263,7 +2430,7 @@ public:
/// Return the shuffle mask value for the specified element of the mask.
/// Return -1 if the element is undef.
- static int getMaskValue(Constant *Mask, unsigned Elt);
+ static int getMaskValue(const Constant *Mask, unsigned Elt);
/// Return the shuffle mask value of this instruction for the given element
/// index. Return -1 if the element is undef.
@@ -2273,7 +2440,8 @@ public:
/// Convert the input shuffle mask operand to a vector of integers. Undefined
/// elements of the mask are returned as -1.
- static void getShuffleMask(Constant *Mask, SmallVectorImpl<int> &Result);
+ static void getShuffleMask(const Constant *Mask,
+ SmallVectorImpl<int> &Result);
/// Return the mask for this instruction as a vector of integers. Undefined
/// elements of the mask are returned as -1.
@@ -2287,6 +2455,176 @@ public:
return Mask;
}
+ /// Return true if this shuffle returns a vector with a different number of
+ /// elements than its source elements.
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2>
+ bool changesLength() const {
+ unsigned NumSourceElts = Op<0>()->getType()->getVectorNumElements();
+ unsigned NumMaskElts = getMask()->getType()->getVectorNumElements();
+ return NumSourceElts != NumMaskElts;
+ }
+
+ /// Return true if this shuffle mask chooses elements from exactly one source
+ /// vector.
+ /// Example: <7,5,undef,7>
+ /// This assumes that vector operands are the same length as the mask.
+ static bool isSingleSourceMask(ArrayRef<int> Mask);
+ static bool isSingleSourceMask(const Constant *Mask) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isSingleSourceMask(MaskAsInts);
+ }
+
+ /// Return true if this shuffle chooses elements from exactly one source
+ /// vector without changing the length of that vector.
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <3,0,undef,3>
+ /// TODO: Optionally allow length-changing shuffles.
+ bool isSingleSource() const {
+ return !changesLength() && isSingleSourceMask(getMask());
+ }
+
+ /// Return true if this shuffle mask chooses elements from exactly one source
+ /// vector without lane crossings. A shuffle using this mask is not
+ /// necessarily a no-op because it may change the number of elements from its
+ /// input vectors or it may provide demanded bits knowledge via undef lanes.
+ /// Example: <undef,undef,2,3>
+ static bool isIdentityMask(ArrayRef<int> Mask);
+ static bool isIdentityMask(const Constant *Mask) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isIdentityMask(MaskAsInts);
+ }
+
+ /// Return true if this shuffle mask chooses elements from exactly one source
+ /// vector without lane crossings and does not change the number of elements
+ /// from its input vectors.
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <4,undef,6,undef>
+ /// TODO: Optionally allow length-changing shuffles.
+ bool isIdentity() const {
+ return !changesLength() && isIdentityMask(getShuffleMask());
+ }
+
+ /// Return true if this shuffle mask chooses elements from its source vectors
+ /// without lane crossings. A shuffle using this mask would be
+ /// equivalent to a vector select with a constant condition operand.
+ /// Example: <4,1,6,undef>
+ /// This returns false if the mask does not choose from both input vectors.
+ /// In that case, the shuffle is better classified as an identity shuffle.
+ /// This assumes that vector operands are the same length as the mask
+ /// (a length-changing shuffle can never be equivalent to a vector select).
+ static bool isSelectMask(ArrayRef<int> Mask);
+ static bool isSelectMask(const Constant *Mask) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isSelectMask(MaskAsInts);
+ }
+
+ /// Return true if this shuffle chooses elements from its source vectors
+ /// without lane crossings and all operands have the same number of elements.
+ /// In other words, this shuffle is equivalent to a vector select with a
+ /// constant condition operand.
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <undef,1,6,3>
+ /// This returns false if the mask does not choose from both input vectors.
+ /// In that case, the shuffle is better classified as an identity shuffle.
+ /// TODO: Optionally allow length-changing shuffles.
+ bool isSelect() const {
+ return !changesLength() && isSelectMask(getMask());
+ }
+
+ /// Return true if this shuffle mask swaps the order of elements from exactly
+ /// one source vector.
+ /// Example: <7,6,undef,4>
+ /// This assumes that vector operands are the same length as the mask.
+ static bool isReverseMask(ArrayRef<int> Mask);
+ static bool isReverseMask(const Constant *Mask) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isReverseMask(MaskAsInts);
+ }
+
+ /// Return true if this shuffle swaps the order of elements from exactly
+ /// one source vector.
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <3,undef,1,undef>
+ /// TODO: Optionally allow length-changing shuffles.
+ bool isReverse() const {
+ return !changesLength() && isReverseMask(getMask());
+ }
+
+ /// Return true if this shuffle mask chooses all elements with the same value
+ /// as the first element of exactly one source vector.
+ /// Example: <4,undef,undef,4>
+ /// This assumes that vector operands are the same length as the mask.
+ static bool isZeroEltSplatMask(ArrayRef<int> Mask);
+ static bool isZeroEltSplatMask(const Constant *Mask) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isZeroEltSplatMask(MaskAsInts);
+ }
+
+ /// Return true if all elements of this shuffle are the same value as the
+ /// first element of exactly one source vector without changing the length
+ /// of that vector.
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <undef,0,undef,0>
+ /// TODO: Optionally allow length-changing shuffles.
+ /// TODO: Optionally allow splats from other elements.
+ bool isZeroEltSplat() const {
+ return !changesLength() && isZeroEltSplatMask(getMask());
+ }
+
+ /// Return true if this shuffle mask is a transpose mask.
+ /// Transpose vector masks transpose a 2xn matrix. They read corresponding
+ /// even- or odd-numbered vector elements from two n-dimensional source
+ /// vectors and write each result into consecutive elements of an
+ /// n-dimensional destination vector. Two shuffles are necessary to complete
+ /// the transpose, one for the even elements and another for the odd elements.
+ /// This description closely follows how the TRN1 and TRN2 AArch64
+ /// instructions operate.
+ ///
+ /// For example, a simple 2x2 matrix can be transposed with:
+ ///
+ /// ; Original matrix
+ /// m0 = < a, b >
+ /// m1 = < c, d >
+ ///
+ /// ; Transposed matrix
+ /// t0 = < a, c > = shufflevector m0, m1, < 0, 2 >
+ /// t1 = < b, d > = shufflevector m0, m1, < 1, 3 >
+ ///
+ /// For matrices having greater than n columns, the resulting nx2 transposed
+ /// matrix is stored in two result vectors such that one vector contains
+ /// interleaved elements from all the even-numbered rows and the other vector
+ /// contains interleaved elements from all the odd-numbered rows. For example,
+ /// a 2x4 matrix can be transposed with:
+ ///
+ /// ; Original matrix
+ /// m0 = < a, b, c, d >
+ /// m1 = < e, f, g, h >
+ ///
+ /// ; Transposed matrix
+ /// t0 = < a, e, c, g > = shufflevector m0, m1 < 0, 4, 2, 6 >
+ /// t1 = < b, f, d, h > = shufflevector m0, m1 < 1, 5, 3, 7 >
+ static bool isTransposeMask(ArrayRef<int> Mask);
+ static bool isTransposeMask(const Constant *Mask) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isTransposeMask(MaskAsInts);
+ }
+
+ /// Return true if this shuffle transposes the elements of its inputs without
+ /// changing the length of the vectors. This operation may also be known as a
+ /// merge or interleave. See the description for isTransposeMask() for the
+ /// exact specification.
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <0,4,2,6>
+ bool isTranspose() const {
+ return !changesLength() && isTransposeMask(getMask());
+ }
+
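
The static mask predicates can classify a bare mask before any instruction exists, which is how a combine might test a mask it is about to build. A brief editorial sketch with an illustrative mask value:

  SmallVector<int, 4> Mask = {0, 4, 2, 6};               // <0,4,2,6>
  assert(ShuffleVectorInst::isTransposeMask(Mask));      // even-lane transpose
  assert(!ShuffleVectorInst::isSingleSourceMask(Mask));  // reads both sources
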
/// Change values in a shuffle permute mask assuming the two vector operands
/// of length InVecNumElts have swapped position.
static void commuteShuffleMask(MutableArrayRef<int> Mask,
@@ -3547,13 +3885,9 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(IndirectBrInst, Value)
/// Invoke instruction. The SubclassData field is used to hold the
/// calling convention of the call.
///
-class InvokeInst : public TerminatorInst,
- public OperandBundleUser<InvokeInst, User::op_iterator> {
+class InvokeInst : public CallBase<InvokeInst> {
friend class OperandBundleUser<InvokeInst, User::op_iterator>;
- AttributeList Attrs;
- FunctionType *FTy;
-
InvokeInst(const InvokeInst &BI);
/// Construct an InvokeInst given a range of arguments.
@@ -3580,7 +3914,6 @@ class InvokeInst : public TerminatorInst,
unsigned Values, const Twine &NameStr,
BasicBlock *InsertAtEnd);
- bool hasDescriptor() const { return HasDescriptor; }
void init(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
@@ -3601,6 +3934,7 @@ protected:
InvokeInst *cloneImpl() const;
public:
+ static constexpr int ArgOffset = 3;
static InvokeInst *Create(Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
const Twine &NameStr,
@@ -3674,299 +4008,15 @@ public:
static InvokeInst *Create(InvokeInst *II, ArrayRef<OperandBundleDef> Bundles,
Instruction *InsertPt = nullptr);
- /// Provide fast operand accessors
- DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
-
- FunctionType *getFunctionType() const { return FTy; }
-
- void mutateFunctionType(FunctionType *FTy) {
- mutateType(FTy->getReturnType());
- this->FTy = FTy;
- }
-
- /// Return the number of invoke arguments.
- ///
- unsigned getNumArgOperands() const {
- return getNumOperands() - getNumTotalBundleOperands() - 3;
- }
-
- /// getArgOperand/setArgOperand - Return/set the i-th invoke argument.
- ///
- Value *getArgOperand(unsigned i) const {
- assert(i < getNumArgOperands() && "Out of bounds!");
- return getOperand(i);
- }
- void setArgOperand(unsigned i, Value *v) {
- assert(i < getNumArgOperands() && "Out of bounds!");
- setOperand(i, v);
- }
-
- /// Return the iterator pointing to the beginning of the argument list.
- op_iterator arg_begin() { return op_begin(); }
-
- /// Return the iterator pointing to the end of the argument list.
- op_iterator arg_end() {
- // [ invoke args ], [ operand bundles ], normal dest, unwind dest, callee
- return op_end() - getNumTotalBundleOperands() - 3;
- }
-
- /// Iteration adapter for range-for loops.
- iterator_range<op_iterator> arg_operands() {
- return make_range(arg_begin(), arg_end());
- }
-
- /// Return the iterator pointing to the beginning of the argument list.
- const_op_iterator arg_begin() const { return op_begin(); }
-
- /// Return the iterator pointing to the end of the argument list.
- const_op_iterator arg_end() const {
- // [ invoke args ], [ operand bundles ], normal dest, unwind dest, callee
- return op_end() - getNumTotalBundleOperands() - 3;
- }
-
- /// Iteration adapter for range-for loops.
- iterator_range<const_op_iterator> arg_operands() const {
- return make_range(arg_begin(), arg_end());
- }
-
- /// Wrappers for getting the \c Use of a invoke argument.
- const Use &getArgOperandUse(unsigned i) const {
- assert(i < getNumArgOperands() && "Out of bounds!");
- return getOperandUse(i);
- }
- Use &getArgOperandUse(unsigned i) {
- assert(i < getNumArgOperands() && "Out of bounds!");
- return getOperandUse(i);
- }
-
- /// If one of the arguments has the 'returned' attribute, return its
- /// operand value. Otherwise, return nullptr.
- Value *getReturnedArgOperand() const;
-
- /// getCallingConv/setCallingConv - Get or set the calling convention of this
- /// function call.
- CallingConv::ID getCallingConv() const {
- return static_cast<CallingConv::ID>(getSubclassDataFromInstruction());
- }
- void setCallingConv(CallingConv::ID CC) {
- auto ID = static_cast<unsigned>(CC);
- assert(!(ID & ~CallingConv::MaxID) && "Unsupported calling convention");
- setInstructionSubclassData(ID);
- }
-
- /// Return the parameter attributes for this invoke.
- ///
- AttributeList getAttributes() const { return Attrs; }
-
- /// Set the parameter attributes for this invoke.
- ///
- void setAttributes(AttributeList A) { Attrs = A; }
-
- /// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute::AttrKind Kind);
-
- /// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute Attr);
-
- /// Adds the attribute to the indicated argument
- void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
-
- /// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, Attribute::AttrKind Kind);
-
- /// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, StringRef Kind);
-
- /// Removes the attribute from the given argument
- void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
-
- /// adds the dereferenceable attribute to the list of attributes.
- void addDereferenceableAttr(unsigned i, uint64_t Bytes);
-
- /// adds the dereferenceable_or_null attribute to the list of
- /// attributes.
- void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes);
-
- /// Determine whether this call has the given attribute.
- bool hasFnAttr(Attribute::AttrKind Kind) const {
- assert(Kind != Attribute::NoBuiltin &&
- "Use CallInst::isNoBuiltin() to check for Attribute::NoBuiltin");
- return hasFnAttrImpl(Kind);
- }
-
- /// Determine whether this call has the given attribute.
- bool hasFnAttr(StringRef Kind) const {
- return hasFnAttrImpl(Kind);
- }
-
- /// Determine whether the return value has the given attribute.
- bool hasRetAttr(Attribute::AttrKind Kind) const;
-
- /// Determine whether the argument or parameter has the given attribute.
- bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const;
-
- /// Get the attribute of a given kind at a position.
- Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const {
- return getAttributes().getAttribute(i, Kind);
- }
-
- /// Get the attribute of a given kind at a position.
- Attribute getAttribute(unsigned i, StringRef Kind) const {
- return getAttributes().getAttribute(i, Kind);
- }
-
- /// Return true if the data operand at index \p i has the attribute \p
- /// A.
- ///
- /// Data operands include invoke arguments and values used in operand bundles,
- /// but does not include the invokee operand, or the two successor blocks.
- /// This routine dispatches to the underlying AttributeList or the
- /// OperandBundleUser as appropriate.
- ///
- /// The index \p i is interpreted as
- ///
- /// \p i == Attribute::ReturnIndex -> the return value
- /// \p i in [1, arg_size + 1) -> argument number (\p i - 1)
- /// \p i in [arg_size + 1, data_operand_size + 1) -> bundle operand at index
- /// (\p i - 1) in the operand list.
- bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const;
-
- /// Extract the alignment of the return value.
- unsigned getRetAlignment() const { return Attrs.getRetAlignment(); }
-
- /// Extract the alignment for a call or parameter (0=unknown).
- unsigned getParamAlignment(unsigned ArgNo) const {
- return Attrs.getParamAlignment(ArgNo);
- }
-
- /// Extract the number of dereferenceable bytes for a call or
- /// parameter (0=unknown).
- uint64_t getDereferenceableBytes(unsigned i) const {
- return Attrs.getDereferenceableBytes(i);
- }
-
- /// Extract the number of dereferenceable_or_null bytes for a call or
- /// parameter (0=unknown).
- uint64_t getDereferenceableOrNullBytes(unsigned i) const {
- return Attrs.getDereferenceableOrNullBytes(i);
- }
-
- /// @brief Determine if the return value is marked with NoAlias attribute.
- bool returnDoesNotAlias() const {
- return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
- }
-
- /// Return true if the call should not be treated as a call to a
- /// builtin.
- bool isNoBuiltin() const {
- // We assert in hasFnAttr if one passes in Attribute::NoBuiltin, so we have
- // to check it by hand.
- return hasFnAttrImpl(Attribute::NoBuiltin) &&
- !hasFnAttrImpl(Attribute::Builtin);
- }
-
- /// Determine if the call requires strict floating point semantics.
- bool isStrictFP() const { return hasFnAttr(Attribute::StrictFP); }
-
- /// Return true if the call should not be inlined.
- bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
- void setIsNoInline() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
- }
-
- /// Determine if the call does not access memory.
- bool doesNotAccessMemory() const {
- return hasFnAttr(Attribute::ReadNone);
- }
- void setDoesNotAccessMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
- }
-
- /// Determine if the call does not access or only reads memory.
- bool onlyReadsMemory() const {
- return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
- }
- void setOnlyReadsMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
- }
-
- /// Determine if the call does not access or only writes memory.
- bool doesNotReadMemory() const {
- return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly);
- }
- void setDoesNotReadMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly);
- }
-
- /// @brief Determine if the call access memmory only using it's pointer
- /// arguments.
- bool onlyAccessesArgMemory() const {
- return hasFnAttr(Attribute::ArgMemOnly);
- }
- void setOnlyAccessesArgMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly);
- }
-
- /// @brief Determine if the function may only access memory that is
- /// inaccessible from the IR.
- bool onlyAccessesInaccessibleMemory() const {
- return hasFnAttr(Attribute::InaccessibleMemOnly);
- }
- void setOnlyAccessesInaccessibleMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly);
- }
-
- /// @brief Determine if the function may only access memory that is
- /// either inaccessible from the IR or pointed to by its arguments.
- bool onlyAccessesInaccessibleMemOrArgMem() const {
- return hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
- }
- void setOnlyAccessesInaccessibleMemOrArgMem() {
- addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOrArgMemOnly);
- }
-
- /// Determine if the call cannot return.
- bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
- void setDoesNotReturn() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
- }
+ /// Determine if the call should not perform indirect branch tracking.
+ bool doesNoCfCheck() const { return hasFnAttr(Attribute::NoCfCheck); }
/// Determine if the call cannot unwind.
bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
void setDoesNotThrow() {
addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
}
-
- /// Determine if the invoke cannot be duplicated.
- bool cannotDuplicate() const {return hasFnAttr(Attribute::NoDuplicate); }
- void setCannotDuplicate() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate);
- }
-
- /// Determine if the invoke is convergent
- bool isConvergent() const { return hasFnAttr(Attribute::Convergent); }
- void setConvergent() {
- addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
- }
- void setNotConvergent() {
- removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
- }
-
- /// Determine if the call returns a structure through first
- /// pointer argument.
- bool hasStructRetAttr() const {
- if (getNumArgOperands() == 0)
- return false;
-
- // Be friendly and also check the callee.
- return paramHasAttr(0, Attribute::StructRet);
- }
-
- /// Determine if any call argument is an aggregate passed by value.
- bool hasByValArgument() const {
- return Attrs.hasAttrSomewhere(Attribute::ByVal);
- }
-
+
/// Return the function called, or null if this is an
/// indirect function invocation.
///
@@ -4031,20 +4081,6 @@ public:
}
private:
- template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
- if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind))
- return true;
-
- // Operand bundles override attributes on the called function, but don't
- // override attributes directly present on the invoke instruction.
- if (isFnAttrDisallowedByOpBundle(Kind))
- return false;
-
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(AttributeList::FunctionIndex,
- Kind);
- return false;
- }
// Shadow Instruction::setInstructionSubclassData with a private forwarding
// method so that subclasses cannot accidentally use it.
@@ -4054,16 +4090,17 @@ private:
};
template <>
-struct OperandTraits<InvokeInst> : public VariadicOperandTraits<InvokeInst, 3> {
-};
+struct OperandTraits<CallBase<InvokeInst>>
+ : public VariadicOperandTraits<CallBase<InvokeInst>, 3> {};
InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, unsigned Values,
const Twine &NameStr, Instruction *InsertBefore)
- : TerminatorInst(Ty->getReturnType(), Instruction::Invoke,
- OperandTraits<InvokeInst>::op_end(this) - Values, Values,
- InsertBefore) {
+ : CallBase<InvokeInst>(Ty->getReturnType(), Instruction::Invoke,
+ OperandTraits<CallBase<InvokeInst>>::op_end(this) -
+ Values,
+ Values, InsertBefore) {
init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr);
}
@@ -4071,15 +4108,16 @@ InvokeInst::InvokeInst(Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, unsigned Values,
const Twine &NameStr, BasicBlock *InsertAtEnd)
- : TerminatorInst(
- cast<FunctionType>(cast<PointerType>(Func->getType())
- ->getElementType())->getReturnType(),
- Instruction::Invoke, OperandTraits<InvokeInst>::op_end(this) - Values,
- Values, InsertAtEnd) {
+ : CallBase<InvokeInst>(
+ cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType())
+ ->getReturnType(),
+ Instruction::Invoke,
+ OperandTraits<CallBase<InvokeInst>>::op_end(this) - Values, Values,
+ InsertAtEnd) {
init(Func, IfNormal, IfException, Args, Bundles, NameStr);
}
-DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InvokeInst, Value)
//===----------------------------------------------------------------------===//
// ResumeInst Class
@@ -5190,6 +5228,26 @@ public:
}
};
+/// A helper function that returns the pointer operand of a load or store
+/// instruction. Returns nullptr if the value is neither a load nor a store.
+inline Value *getLoadStorePointerOperand(Value *V) {
+ if (auto *Load = dyn_cast<LoadInst>(V))
+ return Load->getPointerOperand();
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return Store->getPointerOperand();
+ return nullptr;
+}
+
+/// A helper function that returns the pointer operand of a load, store or
+/// GEP instruction. Returns nullptr if the value is not a load, store, or GEP.
+inline Value *getPointerOperand(Value *V) {
+ if (auto *Ptr = getLoadStorePointerOperand(V))
+ return Ptr;
+ if (auto *Gep = dyn_cast<GetElementPtrInst>(V))
+ return Gep->getPointerOperand();
+ return nullptr;
+}
+
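
A quick editorial sketch of the new helpers in use (not part of the diff); I is a placeholder Instruction pointer:

  if (Value *Ptr = getPointerOperand(I)) {
    // I is a load, store or GEP and Ptr is the address it uses;
    // getLoadStorePointerOperand(I) would additionally exclude GEPs.
    (void)Ptr;
  }
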
} // end namespace llvm
#endif // LLVM_IR_INSTRUCTIONS_H
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicInst.h b/contrib/llvm/include/llvm/IR/IntrinsicInst.h
index 2ca0a24cbae1..6650afcca7fb 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/contrib/llvm/include/llvm/IR/IntrinsicInst.h
@@ -93,6 +93,10 @@ namespace llvm {
return cast<MetadataAsValue>(getArgOperand(2))->getMetadata();
}
+ /// Get the size (in bits) of the variable, or fragment of the variable that
+ /// is described.
+ Optional<uint64_t> getFragmentSizeInBits() const;
+
/// \name Casting methods
/// @{
static bool classof(const IntrinsicInst *I) {
@@ -100,6 +104,7 @@ namespace llvm {
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
case Intrinsic::dbg_addr:
+ case Intrinsic::dbg_label:
return true;
default: return false;
}
@@ -159,6 +164,32 @@ namespace llvm {
/// @}
};
+ /// This represents the llvm.dbg.label instruction.
+ class DbgLabelInst : public DbgInfoIntrinsic {
+ public:
+ DILabel *getLabel() const {
+ return cast<DILabel>(getRawVariable());
+ }
+
+ Metadata *getRawVariable() const {
+ return cast<MetadataAsValue>(getArgOperand(0))->getMetadata();
+ }
+
+ Metadata *getRawExpression() const {
+ return nullptr;
+ }
+
+ /// Methods for support type inquiry through isa, cast, and dyn_cast:
+ /// @{
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::dbg_label;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+ /// @}
+ };
+
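
An editorial recognition sketch for the new wrapper (not part of the diff); I is a placeholder instruction:

  if (auto *DLI = dyn_cast<DbgLabelInst>(I)) {
    DILabel *L = DLI->getLabel();  // the DILabel metadata being marked
    (void)L;
  }
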
/// This is the common base class for constrained floating point intrinsics.
class ConstrainedFPIntrinsic : public IntrinsicInst {
public:
@@ -243,6 +274,8 @@ namespace llvm {
return cast<PointerType>(getRawDest()->getType())->getAddressSpace();
}
+ unsigned getDestAlignment() const { return getParamAlignment(ARG_DEST); }
+
/// Set the specified arguments of the instruction.
void setDest(Value *Ptr) {
assert(getRawDest()->getType() == Ptr->getType() &&
@@ -250,6 +283,13 @@ namespace llvm {
setArgOperand(ARG_DEST, Ptr);
}
+ void setDestAlignment(unsigned Align) {
+ removeParamAttr(ARG_DEST, Attribute::Alignment);
+ if (Align > 0)
+ addParamAttr(ARG_DEST,
+ Attribute::getWithAlignment(getContext(), Align));
+ }
+
void setLength(Value *L) {
assert(getLength()->getType() == L->getType() &&
"setLength called with value of wrong type!");
@@ -257,6 +297,71 @@ namespace llvm {
}
};
+ /// Common base class for all memory transfer intrinsics. Simply provides
+ /// common methods.
+ template <class BaseCL> class MemTransferBase : public BaseCL {
+ private:
+ enum { ARG_SOURCE = 1 };
+
+ public:
+ /// Return the arguments to the instruction.
+ Value *getRawSource() const {
+ return const_cast<Value *>(BaseCL::getArgOperand(ARG_SOURCE));
+ }
+ const Use &getRawSourceUse() const {
+ return BaseCL::getArgOperandUse(ARG_SOURCE);
+ }
+ Use &getRawSourceUse() { return BaseCL::getArgOperandUse(ARG_SOURCE); }
+
+ /// This is just like getRawSource, but it strips off any cast
+ /// instructions that feed it, giving the original input. The returned
+ /// value is guaranteed to be a pointer.
+ Value *getSource() const { return getRawSource()->stripPointerCasts(); }
+
+ unsigned getSourceAddressSpace() const {
+ return cast<PointerType>(getRawSource()->getType())->getAddressSpace();
+ }
+
+ unsigned getSourceAlignment() const {
+ return BaseCL::getParamAlignment(ARG_SOURCE);
+ }
+
+ void setSource(Value *Ptr) {
+ assert(getRawSource()->getType() == Ptr->getType() &&
+ "setSource called with pointer of wrong type!");
+ BaseCL::setArgOperand(ARG_SOURCE, Ptr);
+ }
+
+ void setSourceAlignment(unsigned Align) {
+ BaseCL::removeParamAttr(ARG_SOURCE, Attribute::Alignment);
+ if (Align > 0)
+ BaseCL::addParamAttr(ARG_SOURCE, Attribute::getWithAlignment(
+ BaseCL::getContext(), Align));
+ }
+ };
+
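
Since alignment is now carried as a parameter attribute rather than an explicit argument, passes adjust it through these accessors. A hedged editorial sketch, where MTI is an assumed MemTransferInst pointer:

  unsigned SrcAlign = MTI->getSourceAlignment();  // 0 means unknown
  if (SrcAlign != 0 && SrcAlign < 16)
    MTI->setSourceAlignment(16);                  // rewrites the align attribute
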
+ /// Common base class for all memset intrinsics. Simply provides
+ /// common methods.
+ template <class BaseCL> class MemSetBase : public BaseCL {
+ private:
+ enum { ARG_VALUE = 1 };
+
+ public:
+ Value *getValue() const {
+ return const_cast<Value *>(BaseCL::getArgOperand(ARG_VALUE));
+ }
+ const Use &getValueUse() const {
+ return BaseCL::getArgOperandUse(ARG_VALUE);
+ }
+ Use &getValueUse() { return BaseCL::getArgOperandUse(ARG_VALUE); }
+
+ void setValue(Value *Val) {
+ assert(getValue()->getType() == Val->getType() &&
+ "setValue called with value of wrong type!");
+ BaseCL::setArgOperand(ARG_VALUE, Val);
+ }
+ };
+
// The common base class for the atomic memset/memmove/memcpy intrinsics
// i.e. llvm.element.unordered.atomic.memset/memcpy/memmove
class AtomicMemIntrinsic : public MemIntrinsicBase<AtomicMemIntrinsic> {
@@ -299,23 +404,8 @@ namespace llvm {
/// This class represents atomic memset intrinsic
// i.e. llvm.element.unordered.atomic.memset
- class AtomicMemSetInst : public AtomicMemIntrinsic {
- private:
- enum { ARG_VALUE = 1 };
-
+ class AtomicMemSetInst : public MemSetBase<AtomicMemIntrinsic> {
public:
- Value *getValue() const {
- return const_cast<Value *>(getArgOperand(ARG_VALUE));
- }
- const Use &getValueUse() const { return getArgOperandUse(ARG_VALUE); }
- Use &getValueUse() { return getArgOperandUse(ARG_VALUE); }
-
- void setValue(Value *Val) {
- assert(getValue()->getType() == Val->getType() &&
- "setValue called with value of wrong type!");
- setArgOperand(ARG_VALUE, Val);
- }
-
static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memset_element_unordered_atomic;
}
@@ -326,33 +416,8 @@ namespace llvm {
// This class wraps the atomic memcpy/memmove intrinsics
// i.e. llvm.element.unordered.atomic.memcpy/memmove
- class AtomicMemTransferInst : public AtomicMemIntrinsic {
- private:
- enum { ARG_SOURCE = 1 };
-
+ class AtomicMemTransferInst : public MemTransferBase<AtomicMemIntrinsic> {
public:
- /// Return the arguments to the instruction.
- Value *getRawSource() const {
- return const_cast<Value *>(getArgOperand(ARG_SOURCE));
- }
- const Use &getRawSourceUse() const { return getArgOperandUse(ARG_SOURCE); }
- Use &getRawSourceUse() { return getArgOperandUse(ARG_SOURCE); }
-
- /// This is just like getRawSource, but it strips off any cast
- /// instructions that feed it, giving the original input. The returned
- /// value is guaranteed to be a pointer.
- Value *getSource() const { return getRawSource()->stripPointerCasts(); }
-
- unsigned getSourceAddressSpace() const {
- return cast<PointerType>(getRawSource()->getType())->getAddressSpace();
- }
-
- void setSource(Value *Ptr) {
- assert(getRawSource()->getType() == Ptr->getType() &&
- "setSource called with pointer of wrong type!");
- setArgOperand(ARG_SOURCE, Ptr);
- }
-
static bool classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
case Intrinsic::memcpy_element_unordered_atomic:
@@ -394,17 +459,9 @@ namespace llvm {
/// This is the common base class for memset/memcpy/memmove.
class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
private:
- enum { ARG_ALIGN = 3, ARG_VOLATILE = 4 };
+ enum { ARG_VOLATILE = 3 };
public:
- ConstantInt *getAlignmentCst() const {
- return cast<ConstantInt>(const_cast<Value *>(getArgOperand(ARG_ALIGN)));
- }
-
- unsigned getAlignment() const {
- return getAlignmentCst()->getZExtValue();
- }
-
ConstantInt *getVolatileCst() const {
return cast<ConstantInt>(
const_cast<Value *>(getArgOperand(ARG_VOLATILE)));
@@ -414,14 +471,8 @@ namespace llvm {
return !getVolatileCst()->isZero();
}
- void setAlignment(Constant *A) { setArgOperand(ARG_ALIGN, A); }
-
void setVolatile(Constant *V) { setArgOperand(ARG_VOLATILE, V); }
- Type *getAlignmentType() const {
- return getArgOperand(ARG_ALIGN)->getType();
- }
-
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
@@ -438,19 +489,8 @@ namespace llvm {
};
/// This class wraps the llvm.memset intrinsic.
- class MemSetInst : public MemIntrinsic {
+ class MemSetInst : public MemSetBase<MemIntrinsic> {
public:
- /// Return the arguments to the instruction.
- Value *getValue() const { return const_cast<Value*>(getArgOperand(1)); }
- const Use &getValueUse() const { return getArgOperandUse(1); }
- Use &getValueUse() { return getArgOperandUse(1); }
-
- void setValue(Value *Val) {
- assert(getValue()->getType() == Val->getType() &&
- "setValue called with value of wrong type!");
- setArgOperand(1, Val);
- }
-
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memset;
@@ -461,28 +501,8 @@ namespace llvm {
};
/// This class wraps the llvm.memcpy/memmove intrinsics.
- class MemTransferInst : public MemIntrinsic {
+ class MemTransferInst : public MemTransferBase<MemIntrinsic> {
public:
- /// Return the arguments to the instruction.
- Value *getRawSource() const { return const_cast<Value*>(getArgOperand(1)); }
- const Use &getRawSourceUse() const { return getArgOperandUse(1); }
- Use &getRawSourceUse() { return getArgOperandUse(1); }
-
- /// This is just like getRawSource, but it strips off any cast
- /// instructions that feed it, giving the original input. The returned
- /// value is guaranteed to be a pointer.
- Value *getSource() const { return getRawSource()->stripPointerCasts(); }
-
- unsigned getSourceAddressSpace() const {
- return cast<PointerType>(getRawSource()->getType())->getAddressSpace();
- }
-
- void setSource(Value *Ptr) {
- assert(getRawSource()->getType() == Ptr->getType() &&
- "setSource called with pointer of wrong type!");
- setArgOperand(1, Ptr);
- }
-
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memcpy ||
@@ -551,23 +571,8 @@ namespace llvm {
/// This class represents any memset intrinsic
// i.e. llvm.element.unordered.atomic.memset
// and llvm.memset
- class AnyMemSetInst : public AnyMemIntrinsic {
- private:
- enum { ARG_VALUE = 1 };
-
+ class AnyMemSetInst : public MemSetBase<AnyMemIntrinsic> {
public:
- Value *getValue() const {
- return const_cast<Value *>(getArgOperand(ARG_VALUE));
- }
- const Use &getValueUse() const { return getArgOperandUse(ARG_VALUE); }
- Use &getValueUse() { return getArgOperandUse(ARG_VALUE); }
-
- void setValue(Value *Val) {
- assert(getValue()->getType() == Val->getType() &&
- "setValue called with value of wrong type!");
- setArgOperand(ARG_VALUE, Val);
- }
-
static bool classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
case Intrinsic::memset:
@@ -585,33 +590,8 @@ namespace llvm {
// This class wraps any memcpy/memmove intrinsics
// i.e. llvm.element.unordered.atomic.memcpy/memmove
// and llvm.memcpy/memmove
- class AnyMemTransferInst : public AnyMemIntrinsic {
- private:
- enum { ARG_SOURCE = 1 };
-
+ class AnyMemTransferInst : public MemTransferBase<AnyMemIntrinsic> {
public:
- /// Return the arguments to the instruction.
- Value *getRawSource() const {
- return const_cast<Value *>(getArgOperand(ARG_SOURCE));
- }
- const Use &getRawSourceUse() const { return getArgOperandUse(ARG_SOURCE); }
- Use &getRawSourceUse() { return getArgOperandUse(ARG_SOURCE); }
-
- /// This is just like getRawSource, but it strips off any cast
- /// instructions that feed it, giving the original input. The returned
- /// value is guaranteed to be a pointer.
- Value *getSource() const { return getRawSource()->stripPointerCasts(); }
-
- unsigned getSourceAddressSpace() const {
- return cast<PointerType>(getRawSource()->getType())->getAddressSpace();
- }
-
- void setSource(Value *Ptr) {
- assert(getRawSource()->getType() == Ptr->getType() &&
- "setSource called with pointer of wrong type!");
- setArgOperand(ARG_SOURCE, Ptr);
- }
-
static bool classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
case Intrinsic::memcpy:
diff --git a/contrib/llvm/include/llvm/IR/Intrinsics.h b/contrib/llvm/include/llvm/IR/Intrinsics.h
index fc79da7ae0e6..e1e17f983ff8 100644
--- a/contrib/llvm/include/llvm/IR/Intrinsics.h
+++ b/contrib/llvm/include/llvm/IR/Intrinsics.h
@@ -39,7 +39,7 @@ namespace Intrinsic {
// Get the intrinsic enums generated from Intrinsics.td
#define GET_INTRINSIC_ENUM_VALUES
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicEnums.inc"
#undef GET_INTRINSIC_ENUM_VALUES
, num_intrinsics
};
@@ -97,7 +97,7 @@ namespace Intrinsic {
/// intrinsic. This is returned by getIntrinsicInfoTableEntries.
struct IITDescriptor {
enum IITDescriptorKind {
- Void, VarArg, MMX, Token, Metadata, Half, Float, Double,
+ Void, VarArg, MMX, Token, Metadata, Half, Float, Double, Quad,
Integer, Vector, Pointer, Struct,
Argument, ExtendArgument, TruncArgument, HalfVecArgument,
SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfAnyPtrsToElt
diff --git a/contrib/llvm/include/llvm/IR/Intrinsics.td b/contrib/llvm/include/llvm/IR/Intrinsics.td
index a2a1f26292ce..64455573ff19 100644
--- a/contrib/llvm/include/llvm/IR/Intrinsics.td
+++ b/contrib/llvm/include/llvm/IR/Intrinsics.td
@@ -117,6 +117,7 @@ def IntrHasSideEffects : IntrinsicProperty;
class LLVMType<ValueType vt> {
ValueType VT = vt;
+ int isAny = 0;
}
class LLVMQualPointerType<LLVMType elty, int addrspace>
@@ -131,6 +132,8 @@ class LLVMPointerType<LLVMType elty>
class LLVMAnyPointerType<LLVMType elty>
: LLVMType<iPTRAny>{
LLVMType ElTy = elty;
+
+ let isAny = 1;
}
// Match the type of another intrinsic parameter. Number is an index into the
@@ -163,10 +166,12 @@ class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;
class LLVMHalfElementsVectorType<int num> : LLVMMatchType<num>;
def llvm_void_ty : LLVMType<isVoid>;
-def llvm_any_ty : LLVMType<Any>;
-def llvm_anyint_ty : LLVMType<iAny>;
-def llvm_anyfloat_ty : LLVMType<fAny>;
-def llvm_anyvector_ty : LLVMType<vAny>;
+let isAny = 1 in {
+ def llvm_any_ty : LLVMType<Any>;
+ def llvm_anyint_ty : LLVMType<iAny>;
+ def llvm_anyfloat_ty : LLVMType<fAny>;
+ def llvm_anyvector_ty : LLVMType<vAny>;
+}
def llvm_i1_ty : LLVMType<i1>;
def llvm_i8_ty : LLVMType<i8>;
def llvm_i16_ty : LLVMType<i16>;
@@ -249,7 +254,6 @@ def llvm_v8f64_ty : LLVMType<v8f64>; // 8 x double
def llvm_vararg_ty : LLVMType<isVoid>; // this means vararg here
-
//===----------------------------------------------------------------------===//
// Intrinsic Definitions.
//===----------------------------------------------------------------------===//
@@ -390,17 +394,17 @@ def int_instrprof_value_profile : Intrinsic<[],
def int_memcpy : Intrinsic<[],
[llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty,
- llvm_i32_ty, llvm_i1_ty],
+ llvm_i1_ty],
[IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
WriteOnly<0>, ReadOnly<1>]>;
def int_memmove : Intrinsic<[],
[llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty,
- llvm_i32_ty, llvm_i1_ty],
+ llvm_i1_ty],
[IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
ReadOnly<1>]>;
def int_memset : Intrinsic<[],
[llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty,
- llvm_i32_ty, llvm_i1_ty],
+ llvm_i1_ty],
[IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>;
// FIXME: Add version of these floating point intrinsics which allow non-default
@@ -573,6 +577,10 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
def int_ctlz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>;
def int_cttz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>;
def int_bitreverse : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>;
+ def int_fshl : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
+ def int_fshr : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
}
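
The funnel-shift intrinsics declared above take three operands of matching width. As a reference for the intended semantics (an editorial sketch, not taken from this diff), a 32-bit fshl behaves like the following plain C++; fshr is the mirror image that returns the low half:

  #include <cstdint>
  uint32_t fshl32(uint32_t a, uint32_t b, uint32_t c) {
    c %= 32;                                    // shift amount is taken mod the bit width
    return c ? (a << c) | (b >> (32 - c)) : a;  // high 32 bits of (a:b) << c
  }
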
//===------------------------ Debugger Intrinsics -------------------------===//
@@ -595,6 +603,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
[llvm_metadata_ty,
llvm_metadata_ty,
llvm_metadata_ty]>;
+ def int_dbg_label : Intrinsic<[],
+ [llvm_metadata_ty]>;
}
//===------------------ Exception Handling Intrinsics----------------------===//
@@ -706,16 +716,26 @@ def int_invariant_end : Intrinsic<[],
llvm_anyptr_ty],
[IntrArgMemOnly, NoCapture<2>]>;
-// invariant.group.barrier can't be marked with 'readnone' (IntrNoMem),
+// launder.invariant.group can't be marked with 'readnone' (IntrNoMem),
// because it would cause CSE of two barriers with the same argument.
-// Readonly and argmemonly says that barrier only reads its argument and
-// it can be CSE only if memory didn't change between 2 barriers call,
-// which is valid.
+// Inaccessiblememonly says that the barrier does not read its argument but
+// may change state that is not accessible from this module. This allows DSE
+// through the barrier, because the barrier does not read the value after a
+// store. Even though the barrier does not modify any memory, it cannot be
+// marked readonly, because that would allow CSE of two barriers with a
+// store in between.
// The argument also can't be marked with 'returned' attribute, because
// it would remove barrier.
-def int_invariant_group_barrier : Intrinsic<[llvm_anyptr_ty],
+// Note that it is still experimental, which means that its semantics
+// might change in the future.
+def int_launder_invariant_group : Intrinsic<[llvm_anyptr_ty],
[LLVMMatchType<0>],
- [IntrReadMem, IntrArgMemOnly]>;
+ [IntrInaccessibleMemOnly, IntrSpeculatable]>;
+
+
+def int_strip_invariant_group : Intrinsic<[llvm_anyptr_ty],
+ [LLVMMatchType<0>],
+ [IntrSpeculatable, IntrNoMem]>;
//===------------------------ Stackmap Intrinsics -------------------------===//
//
@@ -768,6 +788,7 @@ def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty],
def int_coro_end : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i1_ty], []>;
def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
+def int_coro_noop : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
def int_coro_size : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
def int_coro_save : Intrinsic<[llvm_token_ty], [llvm_ptr_ty], []>;
@@ -874,6 +895,10 @@ def int_type_checked_load : Intrinsic<[llvm_ptr_ty, llvm_i1_ty],
[llvm_ptr_ty, llvm_i32_ty, llvm_metadata_ty],
[IntrNoMem]>;
+// Create a branch funnel that implements an indirect call to a limited set of
+// callees. This needs to be a musttail call.
+def int_icall_branch_funnel : Intrinsic<[], [llvm_vararg_ty], []>;
+
def int_load_relative: Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty],
[IntrReadMem, IntrArgMemOnly]>;
@@ -883,6 +908,10 @@ def int_load_relative: Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty],
// Takes a pointer to a string and the length of the string.
def int_xray_customevent : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
[NoCapture<0>, ReadOnly<0>, IntrWriteMem]>;
+// Typed event logging for x-ray.
+// Takes a numeric type tag, a pointer to a string and the length of the string.
+def int_xray_typedevent : Intrinsic<[], [llvm_i16_ty, llvm_ptr_ty, llvm_i32_ty],
+ [NoCapture<1>, ReadOnly<1>, IntrWriteMem]>;
//===----------------------------------------------------------------------===//
//===------ Memory intrinsics with element-wise atomicity guarantees ------===//
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsAArch64.td b/contrib/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 65c9aaab975d..688e863c1afe 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -146,6 +146,14 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
class AdvSIMD_CvtFPToFx_Intrinsic
: Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
[IntrNoMem]>;
+
+ class AdvSIMD_1Arg_Intrinsic
+ : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
+ class AdvSIMD_Dot_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+ [IntrNoMem]>;
}
// Arithmetic ops
@@ -244,7 +252,7 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
// Vector Max
def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic;
def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic;
- def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic;
+ def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic;
def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic;
// Vector Max Across Lanes
@@ -256,7 +264,7 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
// Vector Min
def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic;
def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic;
- def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic;
+ def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic;
def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic;
// Vector Min/Max Number
@@ -354,7 +362,7 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic;
// Vector Absolute Value
- def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic;
+ def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic;
// Vector Saturating Absolute Value
def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic;
@@ -412,6 +420,10 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
// Scalar FP Inexact Narrowing
def int_aarch64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty],
[IntrNoMem]>;
+
+ // v8.2-A Dot Product
+ def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
+ def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;
}
let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
@@ -572,6 +584,14 @@ def int_aarch64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic;
def int_aarch64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic;
let TargetPrefix = "aarch64" in {
+ class FPCR_Get_Intrinsic
+ : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;
+}
+
+// FPCR
+def int_aarch64_get_fpcr : FPCR_Get_Intrinsic;
+
+let TargetPrefix = "aarch64" in {
class Crypto_AES_DataKey_Intrinsic
: Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 3397fa41db1b..8555db01645f 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -17,6 +17,13 @@ class AMDGPUReadPreloadRegisterIntrinsic
class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
: Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, GCCBuiltin<name>;
+// Used to tag image and resource intrinsics with information used to generate
+// mem operands.
+class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = 0> {
+ int RsrcArg = rsrcarg;
+ bit IsImage = isimage;
+}
+
let TargetPrefix = "r600" in {
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
@@ -69,6 +76,59 @@ def int_r600_cube : Intrinsic<
[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;
+def int_r600_store_stream_output : Intrinsic<
+ [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
+ llvm_v4f32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y,
+ llvm_i32_ty, // offset_z,
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
+
+class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
+ llvm_v4i32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y,
+ llvm_i32_ty, // offset_z,
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
+
+def int_r600_store_swizzle :
+ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+def int_r600_tex : TextureIntrinsicFloatInput;
+def int_r600_texc : TextureIntrinsicFloatInput;
+def int_r600_txl : TextureIntrinsicFloatInput;
+def int_r600_txlc : TextureIntrinsicFloatInput;
+def int_r600_txb : TextureIntrinsicFloatInput;
+def int_r600_txbc : TextureIntrinsicFloatInput;
+def int_r600_txf : TextureIntrinsicInt32Input;
+def int_r600_txq : TextureIntrinsicInt32Input;
+def int_r600_ddx : TextureIntrinsicFloatInput;
+def int_r600_ddy : TextureIntrinsicFloatInput;
+
+def int_r600_dot4 : Intrinsic<[llvm_float_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_r600_kill : Intrinsic<[], [llvm_float_ty], []>;
+
} // End TargetPrefix = "r600"
let TargetPrefix = "amdgcn" in {
@@ -83,22 +143,22 @@ defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
def int_amdgcn_dispatch_ptr :
GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
- Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+ Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
[IntrNoMem, IntrSpeculatable]>;
def int_amdgcn_queue_ptr :
GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
- Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+ Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
[IntrNoMem, IntrSpeculatable]>;
def int_amdgcn_kernarg_segment_ptr :
GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
- Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+ Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
[IntrNoMem, IntrSpeculatable]>;
def int_amdgcn_implicitarg_ptr :
GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
- Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+ Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
[IntrNoMem, IntrSpeculatable]>;
def int_amdgcn_groupstaticsize :
@@ -111,7 +171,7 @@ def int_amdgcn_dispatch_id :
def int_amdgcn_implicit_buffer_ptr :
GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
- Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+ Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
[IntrNoMem, IntrSpeculatable]>;
// Set EXEC to the 64-bit value given.
@@ -300,6 +360,12 @@ def int_amdgcn_sffbh :
[IntrNoMem, IntrSpeculatable]
>;
+// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
+def int_amdgcn_fmad_ftz :
+ Intrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]
+>;
// Fields should mirror atomicrmw
class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
@@ -315,165 +381,414 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
-class AMDGPUImageLoad<bit NoMem = 0> : Intrinsic <
- [llvm_anyfloat_ty], // vdata(VGPR)
- [llvm_anyint_ty, // vaddr(VGPR)
- llvm_anyint_ty, // rsrc(SGPR)
- llvm_i32_ty, // dmask(imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty, // slc(imm)
- llvm_i1_ty, // lwe(imm)
- llvm_i1_ty], // da(imm)
- !if(NoMem, [IntrNoMem], [IntrReadMem]), "",
- !if(NoMem, [], [SDNPMemOperand])>;
+class AMDGPULDSF32Intrin<string clang_builtin> :
+ GCCBuiltin<clang_builtin>,
+ Intrinsic<[llvm_float_ty],
+ [LLVMQualPointerType<llvm_float_ty, 3>,
+ llvm_float_ty,
+ llvm_i32_ty, // ordering
+ llvm_i32_ty, // scope
+ llvm_i1_ty], // isVolatile
+ [IntrArgMemOnly, NoCapture<0>]
+>;
-def int_amdgcn_image_load : AMDGPUImageLoad;
-def int_amdgcn_image_load_mip : AMDGPUImageLoad;
-def int_amdgcn_image_getresinfo : AMDGPUImageLoad<1>;
+def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
+def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
+def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
-class AMDGPUImageStore : Intrinsic <
- [],
- [llvm_anyfloat_ty, // vdata(VGPR)
- llvm_anyint_ty, // vaddr(VGPR)
- llvm_anyint_ty, // rsrc(SGPR)
- llvm_i32_ty, // dmask(imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty, // slc(imm)
- llvm_i1_ty, // lwe(imm)
- llvm_i1_ty], // da(imm)
- [IntrWriteMem], "", [SDNPMemOperand]>;
-
-def int_amdgcn_image_store : AMDGPUImageStore;
-def int_amdgcn_image_store_mip : AMDGPUImageStore;
-
-class AMDGPUImageSample<bit NoMem = 0> : Intrinsic <
- [llvm_anyfloat_ty], // vdata(VGPR)
- [llvm_anyfloat_ty, // vaddr(VGPR)
- llvm_anyint_ty, // rsrc(SGPR)
- llvm_v4i32_ty, // sampler(SGPR)
- llvm_i32_ty, // dmask(imm)
- llvm_i1_ty, // unorm(imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty, // slc(imm)
- llvm_i1_ty, // lwe(imm)
- llvm_i1_ty], // da(imm)
- !if(NoMem, [IntrNoMem], [IntrReadMem]), "",
- !if(NoMem, [], [SDNPMemOperand])>;
-
-// Basic sample
-def int_amdgcn_image_sample : AMDGPUImageSample;
-def int_amdgcn_image_sample_cl : AMDGPUImageSample;
-def int_amdgcn_image_sample_d : AMDGPUImageSample;
-def int_amdgcn_image_sample_d_cl : AMDGPUImageSample;
-def int_amdgcn_image_sample_l : AMDGPUImageSample;
-def int_amdgcn_image_sample_b : AMDGPUImageSample;
-def int_amdgcn_image_sample_b_cl : AMDGPUImageSample;
-def int_amdgcn_image_sample_lz : AMDGPUImageSample;
-def int_amdgcn_image_sample_cd : AMDGPUImageSample;
-def int_amdgcn_image_sample_cd_cl : AMDGPUImageSample;
-
-// Sample with comparison
-def int_amdgcn_image_sample_c : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_cl : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_d : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_d_cl : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_l : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_b : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_b_cl : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_lz : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_cd : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_cd_cl : AMDGPUImageSample;
-
-// Sample with offsets
-def int_amdgcn_image_sample_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_d_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_d_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_l_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_b_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_b_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_lz_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_cd_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_cd_cl_o : AMDGPUImageSample;
-
-// Sample with comparison and offsets
-def int_amdgcn_image_sample_c_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_d_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_d_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_l_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_b_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_b_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_lz_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_cd_o : AMDGPUImageSample;
-def int_amdgcn_image_sample_c_cd_cl_o : AMDGPUImageSample;
-
-// Basic gather4
-def int_amdgcn_image_gather4 : AMDGPUImageSample;
-def int_amdgcn_image_gather4_cl : AMDGPUImageSample;
-def int_amdgcn_image_gather4_l : AMDGPUImageSample;
-def int_amdgcn_image_gather4_b : AMDGPUImageSample;
-def int_amdgcn_image_gather4_b_cl : AMDGPUImageSample;
-def int_amdgcn_image_gather4_lz : AMDGPUImageSample;
-
-// Gather4 with comparison
-def int_amdgcn_image_gather4_c : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_cl : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_l : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_b : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_b_cl : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_lz : AMDGPUImageSample;
-
-// Gather4 with offsets
-def int_amdgcn_image_gather4_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_l_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_b_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_b_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_lz_o : AMDGPUImageSample;
-
-// Gather4 with comparison and offsets
-def int_amdgcn_image_gather4_c_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_l_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_b_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_b_cl_o : AMDGPUImageSample;
-def int_amdgcn_image_gather4_c_lz_o : AMDGPUImageSample;
-
-def int_amdgcn_image_getlod : AMDGPUImageSample<1>;
-
-class AMDGPUImageAtomic : Intrinsic <
- [llvm_i32_ty],
- [llvm_i32_ty, // vdata(VGPR)
- llvm_anyint_ty, // vaddr(VGPR)
- llvm_v8i32_ty, // rsrc(SGPR)
- llvm_i1_ty, // r128(imm)
- llvm_i1_ty, // da(imm)
- llvm_i1_ty], // slc(imm)
- [], "", [SDNPMemOperand]>;
-
-def int_amdgcn_image_atomic_swap : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_add : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_sub : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_smin : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_umin : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_smax : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_umax : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_and : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_or : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_xor : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_inc : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_dec : AMDGPUImageAtomic;
-def int_amdgcn_image_atomic_cmpswap : Intrinsic <
- [llvm_i32_ty],
- [llvm_i32_ty, // src(VGPR)
- llvm_i32_ty, // cmp(VGPR)
- llvm_anyint_ty, // vaddr(VGPR)
- llvm_v8i32_ty, // rsrc(SGPR)
- llvm_i1_ty, // r128(imm)
- llvm_i1_ty, // da(imm)
- llvm_i1_ty], // slc(imm)
- [], "", [SDNPMemOperand]>;
+} // TargetPrefix = "amdgcn"
+
+// New-style image intrinsics
+
+//////////////////////////////////////////////////////////////////////////
+// Dimension-aware image intrinsics framework
+//////////////////////////////////////////////////////////////////////////
+
+// Helper class to represent (type, name) combinations of arguments. The
+// argument names are explanatory and used as DAG operand names for codegen
+// pattern matching.
+class AMDGPUArg<LLVMType ty, string name> {
+ LLVMType Type = ty;
+ string Name = name;
+}
+
+// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
+class makeArgList<list<string> names, LLVMType basety> {
+ list<AMDGPUArg> ret =
+ !listconcat([AMDGPUArg<basety, names[0]>],
+ !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
+}
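As a worked expansion of the helper above (illustrative only), a two-element name list yields one concretely typed argument followed by match-type references:

// makeArgList<["s", "t"], llvm_anyfloat_ty>.ret
//   = [AMDGPUArg<llvm_anyfloat_ty, "s">, AMDGPUArg<LLVMMatchType<0>, "t">]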
+
+// Return arglist with its LLVMMatchType references shifted by 'shift'.
+class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
+ list<AMDGPUArg> ret =
+ !foreach(arg, arglist,
+ !if(!isa<LLVMMatchType>(arg.Type),
+ AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
+ arg.Name>,
+ arg));
+}
+
+// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
+// accordingly, and shifted by an additional 'shift'.
+class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
+ list<AMDGPUArg> ret =
+ !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
+ !listconcat(
+ lhs,
+ arglistmatchshift<rhs,
+ !add(shift, !foldl(0, lhs, a, b,
+ !add(a, b.Type.isAny)))>.ret));
+}
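A rough worked example of the concatenation: when the left-hand list already contains one overloaded ("any") type, the match references in the right-hand list are shifted so they keep pointing at their own leading overloaded type:

// arglistconcat<[[AMDGPUArg<llvm_anyfloat_ty, "bias">],
//                [AMDGPUArg<llvm_anyfloat_ty, "s">, AMDGPUArg<LLVMMatchType<0>, "t">]]>.ret
//   = [bias : llvm_anyfloat_ty, s : llvm_anyfloat_ty, t : LLVMMatchType<1>]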
+
+// Represent texture/image types / dimensionality.
+class AMDGPUDimProps<string name, list<string> coord_names, list<string> slice_names> {
+ AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
+ string Name = name; // e.g. "2darraymsaa"
+ bit DA = 0; // DA bit in MIMG encoding
+
+ list<AMDGPUArg> CoordSliceArgs =
+ makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
+ list<AMDGPUArg> CoordSliceIntArgs =
+ makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
+ list<AMDGPUArg> GradientArgs =
+ makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
+ !foreach(name, coord_names, "d" # name # "dv")),
+ llvm_anyfloat_ty>.ret;
+
+ bits<8> NumCoords = !size(CoordSliceArgs);
+ bits<8> NumGradients = !size(GradientArgs);
+}
+
+def AMDGPUDim1D : AMDGPUDimProps<"1d", ["s"], []>;
+def AMDGPUDim2D : AMDGPUDimProps<"2d", ["s", "t"], []>;
+def AMDGPUDim3D : AMDGPUDimProps<"3d", ["s", "t", "r"], []>;
+let DA = 1 in {
+ def AMDGPUDimCube : AMDGPUDimProps<"cube", ["s", "t"], ["face"]>;
+ def AMDGPUDim1DArray : AMDGPUDimProps<"1darray", ["s"], ["slice"]>;
+ def AMDGPUDim2DArray : AMDGPUDimProps<"2darray", ["s", "t"], ["slice"]>;
+}
+def AMDGPUDim2DMsaa : AMDGPUDimProps<"2dmsaa", ["s", "t"], ["fragid"]>;
+let DA = 1 in {
+ def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<"2darraymsaa", ["s", "t"], ["slice", "fragid"]>;
+}
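As a concrete reading of the definitions above, the 2darray dimension carries three address components and four gradient components, roughly:

// AMDGPUDim2DArray.CoordSliceArgs = [s : llvm_anyfloat_ty, t : LLVMMatchType<0>, slice : LLVMMatchType<0>]
// AMDGPUDim2DArray.GradientArgs   = [dsdh, dtdh, dsdv, dtdv]   (NumCoords = 3, NumGradients = 4)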
+
+def AMDGPUDims {
+ list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
+ AMDGPUDimCube, AMDGPUDim1DArray,
+ AMDGPUDim2DArray];
+ list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
+ list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
+}
+
+// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
+class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
+ string UpperCaseMod = ucmod;
+ string LowerCaseMod = lcmod;
+
+ // {offset} {bias} {z-compare}
+ list<AMDGPUArg> ExtraAddrArgs = extra_addr;
+ bit Gradients = 0;
+
+ // Name of the {lod} or {clamp} argument that is appended to the coordinates,
+ // if any.
+ string LodOrClamp = "";
+}
+
+// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
+// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
+defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
+ multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
+ list<AMDGPUArg> extra_addr> {
+ def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
+ def NAME#lcmod#_o : AMDGPUSampleVariant<
+ ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
+ }
+
+ multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
+ list<AMDGPUArg> extra_addr> {
+ defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
+ defm NAME : AMDGPUSampleHelper_Offset<
+ "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
+ }
+
+ multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
+ list<AMDGPUArg> extra_addr> {
+ defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
+ let LodOrClamp = "clamp" in
+ defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
+ }
+
+ defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
+ defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
+ defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
+ "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
+ let LodOrClamp = "lod" in
+ defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
+ defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
+ }
+
+ let Gradients = 1 in {
+ defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
+ defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
+ }
+}
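To make the variant encoding concrete, each generated record name spells out which extra address arguments precede the coordinates; a few representative expansions:

// AMDGPUSample       : no extra address arguments
// AMDGPUSample_c_o   : ExtraAddrArgs = [offset : llvm_i32_ty, zcompare : llvm_float_ty]
// AMDGPUSample_b_cl  : ExtraAddrArgs = [bias : llvm_anyfloat_ty], LodOrClamp = "clamp"
// AMDGPUSample_d     : Gradients = 1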
+
+// Helper class to capture the profile of a dimension-aware image intrinsic.
+// This information is used to generate the intrinsic's type and to inform
+// codegen pattern matching.
+class AMDGPUDimProfile<string opmod,
+ AMDGPUDimProps dim> {
+ AMDGPUDimProps Dim = dim;
+ string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod
+
+ // These are intended to be overwritten by subclasses
+ bit IsSample = 0;
+ bit IsAtomic = 0;
+ list<LLVMType> RetTypes = [];
+ list<AMDGPUArg> DataArgs = [];
+ list<AMDGPUArg> ExtraAddrArgs = [];
+ bit Gradients = 0;
+ string LodClampMip = "";
+
+ int NumRetAndDataAnyTypes =
+ !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
+ !add(a, b.isAny));
+
+ list<AMDGPUArg> AddrArgs =
+ arglistconcat<[ExtraAddrArgs,
+ !if(Gradients, dim.GradientArgs, []),
+ !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
+ !if(!eq(LodClampMip, ""),
+ []<AMDGPUArg>,
+ [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
+ NumRetAndDataAnyTypes>.ret;
+ list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
+ list<AMDGPUArg> AddrDefaultArgs =
+ !foreach(arg, AddrArgs,
+ AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
+ !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
+ arg.Name>);
+ list<AMDGPUArg> AddrA16Args =
+ !foreach(arg, AddrArgs,
+ AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
+ !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
+ arg.Name>);
+}
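A worked example of how AddrArgs is assembled for a compare-sample on a 2D image (extra address arguments, then gradients if any, then coordinates, then the optional lod/clamp/mip):

// AMDGPUDimSampleProfile<"SAMPLE_C", AMDGPUDim2D, AMDGPUSample_c>.AddrArgs
//   = [zcompare : llvm_float_ty, s : llvm_anyfloat_ty, t : LLVMMatchType<1>]
// (the match index is 1 because the overloaded return type occupies index 0)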
+
+class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
+ let IsSample = base.IsSample;
+ let IsAtomic = base.IsAtomic;
+ let RetTypes = base.RetTypes;
+ let DataArgs = base.DataArgs;
+ let ExtraAddrArgs = base.ExtraAddrArgs;
+ let Gradients = base.Gradients;
+ let LodClampMip = base.LodClampMip;
+}
+
+class AMDGPUDimSampleProfile<string opmod,
+ AMDGPUDimProps dim,
+ AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
+ let IsSample = 1;
+ let RetTypes = [llvm_anyfloat_ty];
+ let ExtraAddrArgs = sample.ExtraAddrArgs;
+ let Gradients = sample.Gradients;
+ let LodClampMip = sample.LodOrClamp;
+}
+
+class AMDGPUDimNoSampleProfile<string opmod,
+ AMDGPUDimProps dim,
+ list<LLVMType> retty,
+ list<AMDGPUArg> dataargs,
+ bit Mip = 0> : AMDGPUDimProfile<opmod, dim> {
+ let RetTypes = retty;
+ let DataArgs = dataargs;
+ let LodClampMip = !if(Mip, "mip", "");
+}
+
+class AMDGPUDimAtomicProfile<string opmod,
+ AMDGPUDimProps dim,
+ list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
+ let RetTypes = [llvm_anyint_ty];
+ let DataArgs = dataargs;
+ let IsAtomic = 1;
+}
+
+class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RESINFO", dim> {
+ let RetTypes = [llvm_anyfloat_ty];
+ let DataArgs = [];
+ let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
+ let LodClampMip = "mip";
+}
+
+// All dimension-aware intrinsics are derived from this class.
+class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
+ list<IntrinsicProperty> props,
+ list<SDNodeProperty> sdnodeprops> : Intrinsic<
+ P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return
+ !listconcat(
+ !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
+ !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm)
+ P_.AddrTypes, // vaddr(VGPR)
+ [llvm_v8i32_ty], // rsrc(SGPR)
+ !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR)
+ llvm_i1_ty], []), // unorm(imm)
+ [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
+ llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+ props, "", sdnodeprops>,
+ AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
+ !if(P_.IsAtomic, 0, 1)), 1> {
+ AMDGPUDimProfile P = P_;
+
+ AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);
+
+ let TargetPrefix = "amdgcn";
+}
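Putting the pieces together, a generated sample intrinsic ends up with the operand order dmask, address, rsrc, sampler, unorm, texfailctrl, cachepolicy. As a sketch (the exact overload suffixes depend on the chosen return and coordinate types), the plain 2D sample would declare something like:

// llvm.amdgcn.image.sample.2d.v4f32.f32(i32 dmask, float s, float t,
//     <8 x i32> rsrc, <4 x i32> samp, i1 unorm,
//     i32 texfailctrl, i32 cachepolicy) -> <4 x float>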
+
+// Marker class for intrinsics with a DMask that determines the returned
+// channels.
+class AMDGPUImageDMaskIntrinsic;
+
+defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
+
+ //////////////////////////////////////////////////////////////////////////
+ // Load and store intrinsics
+ //////////////////////////////////////////////////////////////////////////
+ multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
+ list<LLVMType> retty,
+ list<AMDGPUArg> dataargs,
+ list<IntrinsicProperty> props,
+ list<SDNodeProperty> sdnodeprops,
+ bit Mip = 0> {
+ foreach dim = AMDGPUDims.NoMsaa in {
+ def !strconcat(NAME, "_", dim.Name)
+ : AMDGPUImageDimIntrinsic<
+ AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
+ props, sdnodeprops>;
+ }
+ }
+
+ multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
+ list<LLVMType> retty,
+ list<AMDGPUArg> dataargs,
+ list<IntrinsicProperty> props,
+ list<SDNodeProperty> sdnodeprops,
+ bit Mip = 0> {
+ foreach dim = AMDGPUDims.All in {
+ def !strconcat(NAME, "_", dim.Name)
+ : AMDGPUImageDimIntrinsic<
+ AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
+ props, sdnodeprops>;
+ }
+ }
+
+ defm int_amdgcn_image_load
+ : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
+ [SDNPMemOperand]>,
+ AMDGPUImageDMaskIntrinsic;
+ defm int_amdgcn_image_load_mip
+ : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
+ [IntrReadMem], [SDNPMemOperand], 1>,
+ AMDGPUImageDMaskIntrinsic;
+
+ defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
+ "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
+ [IntrWriteMem], [SDNPMemOperand]>;
+ defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
+ "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
+ [IntrWriteMem], [SDNPMemOperand], 1>;
+
+ //////////////////////////////////////////////////////////////////////////
+ // sample and getlod intrinsics
+ //////////////////////////////////////////////////////////////////////////
+ multiclass AMDGPUImageDimSampleDims<string opmod,
+ AMDGPUSampleVariant sample,
+ bit NoMem = 0> {
+ foreach dim = AMDGPUDims.NoMsaa in {
+ def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
+ AMDGPUDimSampleProfile<opmod, dim, sample>,
+ !if(NoMem, [IntrNoMem], [IntrReadMem]),
+ !if(NoMem, [], [SDNPMemOperand])>;
+ }
+ }
+
+ foreach sample = AMDGPUSampleVariants in {
+ defm int_amdgcn_image_sample # sample.LowerCaseMod
+ : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
+ AMDGPUImageDMaskIntrinsic;
+ }
+
+ defm int_amdgcn_image_getlod
+ : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
+ AMDGPUImageDMaskIntrinsic;
+
+ //////////////////////////////////////////////////////////////////////////
+ // getresinfo intrinsics
+ //////////////////////////////////////////////////////////////////////////
+ foreach dim = AMDGPUDims.All in {
+ def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
+ : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
+ AMDGPUImageDMaskIntrinsic;
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ // gather4 intrinsics
+ //////////////////////////////////////////////////////////////////////////
+ foreach sample = AMDGPUSampleVariantsNoGradients in {
+ foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
+ def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
+ AMDGPUImageDimIntrinsic<
+ AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
+ [IntrReadMem], [SDNPMemOperand]>;
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// atomic intrinsics
+//////////////////////////////////////////////////////////////////////////
+defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
+ multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs> {
+ foreach dim = AMDGPUDims.All in {
+ def !strconcat(NAME, "_", dim.Name)
+ : AMDGPUImageDimIntrinsic<
+ AMDGPUDimAtomicProfile<opmod, dim, dataargs>,
+ [], [SDNPMemOperand]>;
+ }
+ }
+
+ multiclass AMDGPUImageDimAtomic<string opmod> {
+ defm "" : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">]>;
+ }
+
+ defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
+ defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
+ defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
+ defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
+ defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
+ defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
+ defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
+ defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
+ defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
+ defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
+
+ // TODO: INC/DEC are weird: they seem to have a vdata argument in hardware,
+ // even though it clearly shouldn't be needed
+ defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
+ defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;
+
+ defm int_amdgcn_image_atomic_cmpswap :
+ AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
+ AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
+}
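For the atomic definitions above, the data operands are overloaded together with the result; for example, the 2D cmpswap variant takes roughly:

// int_amdgcn_image_atomic_cmpswap_2d :
//   ret  : llvm_anyint_ty
//   args : [src : LLVMMatchType<0>, cmp : LLVMMatchType<0>, s/t (integer coords),
//           rsrc : llvm_v8i32_ty, texfailctrl : llvm_i32_ty, cachepolicy : llvm_i32_ty]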
+
+//////////////////////////////////////////////////////////////////////////
+// Buffer intrinsics
+//////////////////////////////////////////////////////////////////////////
+
+let TargetPrefix = "amdgcn" in {
+
+defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
class AMDGPUBufferLoad : Intrinsic <
[llvm_anyfloat_ty],
@@ -482,7 +797,8 @@ class AMDGPUBufferLoad : Intrinsic <
llvm_i32_ty, // offset(SGPR/VGPR/imm)
llvm_i1_ty, // glc(imm)
llvm_i1_ty], // slc(imm)
- [IntrReadMem], "", [SDNPMemOperand]>;
+ [IntrReadMem], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;
@@ -494,7 +810,8 @@ class AMDGPUBufferStore : Intrinsic <
llvm_i32_ty, // offset(SGPR/VGPR/imm)
llvm_i1_ty, // glc(imm)
llvm_i1_ty], // slc(imm)
- [IntrWriteMem], "", [SDNPMemOperand]>;
+ [IntrWriteMem], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
def int_amdgcn_buffer_store : AMDGPUBufferStore;
@@ -509,7 +826,8 @@ def int_amdgcn_tbuffer_load : Intrinsic <
llvm_i32_ty, // nfmt(imm)
llvm_i1_ty, // glc(imm)
llvm_i1_ty], // slc(imm)
- [IntrReadMem], "", [SDNPMemOperand]>;
+ [IntrReadMem], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<0>;
def int_amdgcn_tbuffer_store : Intrinsic <
[],
@@ -523,7 +841,8 @@ def int_amdgcn_tbuffer_store : Intrinsic <
llvm_i32_ty, // nfmt(imm)
llvm_i1_ty, // glc(imm)
llvm_i1_ty], // slc(imm)
- [IntrWriteMem], "", [SDNPMemOperand]>;
+ [IntrWriteMem], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1>;
class AMDGPUBufferAtomic : Intrinsic <
[llvm_i32_ty],
@@ -532,7 +851,8 @@ class AMDGPUBufferAtomic : Intrinsic <
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(SGPR/VGPR/imm)
llvm_i1_ty], // slc(imm)
- [], "", [SDNPMemOperand]>;
+ [], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
@@ -551,7 +871,10 @@ def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(SGPR/VGPR/imm)
llvm_i1_ty], // slc(imm)
- [], "", [SDNPMemOperand]>;
+ [], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<2, 0>;
+
+} // defset AMDGPUBufferIntrinsics
// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
@@ -753,6 +1076,19 @@ def int_amdgcn_readlane :
GCCBuiltin<"__builtin_amdgcn_readlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
+// The value to write and lane select arguments must be uniform across the
+// currently active threads of the current wave. Otherwise, the result is
+// undefined.
+def int_amdgcn_writelane :
+ GCCBuiltin<"__builtin_amdgcn_writelane">,
+ Intrinsic<[llvm_i32_ty], [
+ llvm_i32_ty, // uniform value to write: returned by the selected lane
+ llvm_i32_ty, // uniform lane select
+ llvm_i32_ty // returned by all lanes other than the selected one
+ ],
+ [IntrNoMem, IntrConvergent]
+>;
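A brief sketch of the per-lane behaviour described in the comments above (illustrative, not normative):

// result(lane) = (lane == arg1) ? arg0 : arg2
// i.e. the selected lane returns the written value; every other active lane
// returns its own third operand.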
+
def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
@@ -851,6 +1187,109 @@ def int_amdgcn_ds_bpermute :
GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
+//===----------------------------------------------------------------------===//
+// Deep learning intrinsics.
+//===----------------------------------------------------------------------===//
+
+// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
+def int_amdgcn_fdot2 :
+ GCCBuiltin<"__builtin_amdgcn_fdot2">,
+ Intrinsic<
+ [llvm_float_ty], // %r
+ [
+ llvm_v2f16_ty, // %a
+ llvm_v2f16_ty, // %b
+ llvm_float_ty // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
+
+// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
+def int_amdgcn_sdot2 :
+ GCCBuiltin<"__builtin_amdgcn_sdot2">,
+ Intrinsic<
+ [llvm_i32_ty], // %r
+ [
+ llvm_v2i16_ty, // %a
+ llvm_v2i16_ty, // %b
+ llvm_i32_ty // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
+
+// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
+def int_amdgcn_udot2 :
+ GCCBuiltin<"__builtin_amdgcn_udot2">,
+ Intrinsic<
+ [llvm_i32_ty], // %r
+ [
+ llvm_v2i16_ty, // %a
+ llvm_v2i16_ty, // %b
+ llvm_i32_ty // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
+
+// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
+def int_amdgcn_sdot4 :
+ GCCBuiltin<"__builtin_amdgcn_sdot4">,
+ Intrinsic<
+ [llvm_i32_ty], // %r
+ [
+ llvm_i32_ty, // %a
+ llvm_i32_ty, // %b
+ llvm_i32_ty // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
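A short numeric reading of the formula (hypothetical values, assuming %a[0] is the least-significant byte):

//   %a = 0x04030201, %b = 0x01010101, so the signed bytes of %a are 1, 2, 3, 4
//   %r = 1*1 + 2*1 + 3*1 + 4*1 + %c = 10 + %c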
+
+// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
+def int_amdgcn_udot4 :
+ GCCBuiltin<"__builtin_amdgcn_udot4">,
+ Intrinsic<
+ [llvm_i32_ty], // %r
+ [
+ llvm_i32_ty, // %a
+ llvm_i32_ty, // %b
+ llvm_i32_ty // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
+
+// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
+// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
+def int_amdgcn_sdot8 :
+ GCCBuiltin<"__builtin_amdgcn_sdot8">,
+ Intrinsic<
+ [llvm_i32_ty], // %r
+ [
+ llvm_i32_ty, // %a
+ llvm_i32_ty, // %b
+ llvm_i32_ty // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
+
+// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
+// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
+def int_amdgcn_udot8 :
+ GCCBuiltin<"__builtin_amdgcn_udot8">,
+ Intrinsic<
+ [llvm_i32_ty], // %r
+ [
+ llvm_i32_ty, // %a
+ llvm_i32_ty, // %b
+ llvm_i32_ty // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsARM.td b/contrib/llvm/include/llvm/IR/IntrinsicsARM.td
index fe3861301689..f25d2f1dbb5d 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -369,6 +369,10 @@ class Neon_3Arg_Long_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMTruncatedType<0>, LLVMTruncatedType<0>],
[IntrNoMem]>;
+
+class Neon_1FloatArg_Intrinsic
+ : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
class Neon_CvtFxToFP_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
class Neon_CvtFPToFx_Intrinsic
@@ -591,8 +595,8 @@ def int_arm_neon_vtbx2 : Neon_Tbl4Arg_Intrinsic;
def int_arm_neon_vtbx3 : Neon_Tbl5Arg_Intrinsic;
def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic;
-// Vector Rounding
-def int_arm_neon_vrintn : Neon_1Arg_Intrinsic;
+// Vector and Scalar Rounding.
+def int_arm_neon_vrintn : Neon_1FloatArg_Intrinsic;
def int_arm_neon_vrintx : Neon_1Arg_Intrinsic;
def int_arm_neon_vrinta : Neon_1Arg_Intrinsic;
def int_arm_neon_vrintz : Neon_1Arg_Intrinsic;
@@ -616,6 +620,18 @@ def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
[llvm_anyptr_ty, llvm_i32_ty],
[IntrReadMem, IntrArgMemOnly]>;
+def int_arm_neon_vld1x2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>],
+ [IntrReadMem, IntrArgMemOnly]>;
+def int_arm_neon_vld1x3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>],
+ [IntrReadMem, IntrArgMemOnly]>;
+def int_arm_neon_vld1x4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>],
+ [IntrReadMem, IntrArgMemOnly]>;
+
// Vector load N-element structure to one lane.
// Source operands are: the address, the N input vectors (since only one
// lane is assigned), the lane number, and the alignment.
@@ -636,6 +652,20 @@ def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, llvm_i32_ty,
llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
+// Vector load N-element structure to all lanes.
+// Source operands are the address and alignment.
+def int_arm_neon_vld2dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly]>;
+def int_arm_neon_vld3dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly]>;
+def int_arm_neon_vld4dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly]>;
+
// Interleaving vector stores from N-element structures.
// Source operands are: the address, the N vectors, and the alignment.
def int_arm_neon_vst1 : Intrinsic<[],
@@ -655,6 +685,20 @@ def int_arm_neon_vst4 : Intrinsic<[],
LLVMMatchType<1>, llvm_i32_ty],
[IntrArgMemOnly]>;
+def int_arm_neon_vst1x2 : Intrinsic<[],
+ [llvm_anyptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<1>],
+ [IntrArgMemOnly, NoCapture<0>]>;
+def int_arm_neon_vst1x3 : Intrinsic<[],
+ [llvm_anyptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<1>, LLVMMatchType<1>],
+ [IntrArgMemOnly, NoCapture<0>]>;
+def int_arm_neon_vst1x4 : Intrinsic<[],
+ [llvm_anyptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<1>, LLVMMatchType<1>,
+ LLVMMatchType<1>],
+ [IntrArgMemOnly, NoCapture<0>]>;
+
// Vector store N-element structure from one lane.
// Source operands are: the address, the N vectors, the lane number, and
// the alignment.
@@ -713,4 +757,14 @@ def int_arm_neon_sha256h: SHA_3Arg_v4i32_Intrinsic;
def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic;
def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic;
+// Armv8.2-A dot product instructions
+class Neon_Dot_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty,
+ LLVMMatchType<1>],
+ [IntrNoMem]>;
+def int_arm_neon_udot : Neon_Dot_Intrinsic;
+def int_arm_neon_sdot : Neon_Dot_Intrinsic;
+
+
} // end TargetPrefix
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsHexagon.td b/contrib/llvm/include/llvm/IR/IntrinsicsHexagon.td
index 5c96702bca76..25f4215d68a8 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsHexagon.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsHexagon.td
@@ -21,6 +21,13 @@ let TargetPrefix = "hexagon" in {
list<IntrinsicProperty> properties>
: GCCBuiltin<!strconcat("__builtin_", GCCIntSuffix)>,
Intrinsic<ret_types, param_types, properties>;
+
+ /// Hexagon_NonGCC_Intrinsic - Base class for bitcode convertible Hexagon
+ /// intrinsics.
+ class Hexagon_NonGCC_Intrinsic<list<LLVMType> ret_types,
+ list<LLVMType> param_types,
+ list<IntrinsicProperty> properties>
+ : Intrinsic<ret_types, param_types, properties>;
}
//===----------------------------------------------------------------------===//
@@ -424,13 +431,13 @@ class Hexagon_mem_memsisi_Intrinsic<string GCCIntSuffix>
: Hexagon_Intrinsic<GCCIntSuffix,
[llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty,
llvm_i32_ty],
- [IntrArgMemOnly]>;
+ [IntrWriteMem]>;
class Hexagon_mem_memdisi_Intrinsic<string GCCIntSuffix>
: Hexagon_Intrinsic<GCCIntSuffix,
[llvm_ptr_ty], [llvm_ptr_ty, llvm_i64_ty,
llvm_i32_ty],
- [IntrArgMemOnly]>;
+ [IntrWriteMem]>;
class Hexagon_mem_memmemsisi_Intrinsic<string GCCIntSuffix>
: Hexagon_Intrinsic<GCCIntSuffix,
@@ -442,13 +449,13 @@ class Hexagon_mem_memsisisi_Intrinsic<string GCCIntSuffix>
: Hexagon_Intrinsic<GCCIntSuffix,
[llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty],
- [IntrArgMemOnly]>;
+ [IntrWriteMem]>;
class Hexagon_mem_memdisisi_Intrinsic<string GCCIntSuffix>
: Hexagon_Intrinsic<GCCIntSuffix,
[llvm_ptr_ty], [llvm_ptr_ty, llvm_i64_ty,
llvm_i32_ty, llvm_i32_ty],
- [IntrArgMemOnly]>;
+ [IntrWriteMem]>;
class Hexagon_v256_v256v256_Intrinsic<string GCCIntSuffix>
: Hexagon_Intrinsic<GCCIntSuffix,
@@ -636,41 +643,6 @@ class Hexagon_df_dfdfdfqi_Intrinsic<string GCCIntSuffix>
// This one below will not be auto-generated,
// so make sure you don't overwrite this one.
//
-// BUILTIN_INFO(SI_to_SXTHI_asrh,SI_ftype_SI,1)
-//
-def int_hexagon_SI_to_SXTHI_asrh :
-Hexagon_si_si_Intrinsic<"SI_to_SXTHI_asrh">;
-//
-// BUILTIN_INFO_NONCONST(brev_ldd,PTR_ftype_PTRPTRSI,3)
-//
-def int_hexagon_brev_ldd :
-Hexagon_mem_memmemsi_Intrinsic<"brev_ldd">;
-//
-// BUILTIN_INFO_NONCONST(brev_ldw,PTR_ftype_PTRPTRSI,3)
-//
-def int_hexagon_brev_ldw :
-Hexagon_mem_memmemsi_Intrinsic<"brev_ldw">;
-//
-// BUILTIN_INFO_NONCONST(brev_ldh,PTR_ftype_PTRPTRSI,3)
-//
-def int_hexagon_brev_ldh :
-Hexagon_mem_memmemsi_Intrinsic<"brev_ldh">;
-//
-// BUILTIN_INFO_NONCONST(brev_lduh,PTR_ftype_PTRPTRSI,3)
-//
-def int_hexagon_brev_lduh :
-Hexagon_mem_memmemsi_Intrinsic<"brev_lduh">;
-//
-// BUILTIN_INFO_NONCONST(brev_ldb,PTR_ftype_PTRPTRSI,3)
-//
-def int_hexagon_brev_ldb :
-Hexagon_mem_memmemsi_Intrinsic<"brev_ldb">;
-//
-// BUILTIN_INFO_NONCONST(brev_ldub,PTR_ftype_PTRPTRSI,3)
-//
-def int_hexagon_brev_ldub :
-Hexagon_mem_memmemsi_Intrinsic<"brev_ldub">;
-//
// BUILTIN_INFO_NONCONST(circ_ldd,PTR_ftype_PTRPTRSISI,4)
//
def int_hexagon_circ_ldd :
@@ -702,31 +674,6 @@ def int_hexagon_circ_ldub :
Hexagon_mem_memmemsisi_Intrinsic<"circ_ldub">;
//
-// BUILTIN_INFO_NONCONST(brev_stb,PTR_ftype_PTRSISI,3)
-//
-def int_hexagon_brev_stb :
-Hexagon_mem_memsisi_Intrinsic<"brev_stb">;
-//
-// BUILTIN_INFO_NONCONST(brev_sthhi,PTR_ftype_PTRSISI,3)
-//
-def int_hexagon_brev_sthhi :
-Hexagon_mem_memsisi_Intrinsic<"brev_sthhi">;
-//
-// BUILTIN_INFO_NONCONST(brev_sth,PTR_ftype_PTRSISI,3)
-//
-def int_hexagon_brev_sth :
-Hexagon_mem_memsisi_Intrinsic<"brev_sth">;
-//
-// BUILTIN_INFO_NONCONST(brev_stw,PTR_ftype_PTRSISI,3)
-//
-def int_hexagon_brev_stw :
-Hexagon_mem_memsisi_Intrinsic<"brev_stw">;
-//
-// BUILTIN_INFO_NONCONST(brev_std,PTR_ftype_PTRSISI,3)
-//
-def int_hexagon_brev_std :
-Hexagon_mem_memdisi_Intrinsic<"brev_std">;
-//
// BUILTIN_INFO_NONCONST(circ_std,PTR_ftype_PTRDISISI,4)
//
def int_hexagon_circ_std :
@@ -9300,6 +9247,60 @@ Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentq_128B">;
def int_hexagon_V6_vmaskedstorentnq_128B :
Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentnq_128B">;
+multiclass Hexagon_custom_circ_ld_Intrinsic<LLVMType ElTy> {
+ def NAME#_pci : Hexagon_NonGCC_Intrinsic<
+ [ElTy, llvm_ptr_ty],
+ [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty],
+ [IntrArgMemOnly, NoCapture<3>]>;
+ def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
+ [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty],
+ [IntrArgMemOnly, NoCapture<2>]>;
+}
+
+defm int_hexagon_L2_loadrub : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadrb : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadruh : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadrh : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadri : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadrd : Hexagon_custom_circ_ld_Intrinsic<llvm_i64_ty>;
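As a reading aid, each defm above expands to a _pci and a _pcr record whose types follow directly from the multiclass; the word-sized load, for instance, becomes roughly:

// int_hexagon_L2_loadri_pci : rets [llvm_i32_ty, llvm_ptr_ty],
//                             args [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty]
// int_hexagon_L2_loadri_pcr : rets [llvm_i32_ty, llvm_ptr_ty],
//                             args [llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty]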
+
+multiclass Hexagon_custom_circ_st_Intrinsic<LLVMType ElTy> {
+ def NAME#_pci : Hexagon_NonGCC_Intrinsic<
+ [llvm_ptr_ty],
+ [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
+ [IntrArgMemOnly, NoCapture<4>]>;
+ def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
+ [llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
+ [IntrArgMemOnly, NoCapture<3>]>;
+}
+
+defm int_hexagon_S2_storerb : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storerh : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storerf : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storeri : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storerd : Hexagon_custom_circ_st_Intrinsic<llvm_i64_ty>;
+
+// The front-end emits the intrinsic call with only two arguments. The third
+// argument from the builtin is already used by the front-end to write to
+// memory by generating a store.
+class Hexagon_custom_brev_ld_Intrinsic<LLVMType ElTy>
+ : Hexagon_NonGCC_Intrinsic<
+ [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadMem]>;
+
+def int_hexagon_L2_loadrub_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadrb_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadruh_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadrh_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadri_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadrd_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i64_ty>;
+
+def int_hexagon_S2_storerb_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stb">;
+def int_hexagon_S2_storerh_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sth">;
+def int_hexagon_S2_storerf_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sthhi">;
+def int_hexagon_S2_storeri_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stw">;
+def int_hexagon_S2_storerd_pbr : Hexagon_mem_memdisi_Intrinsic<"brev_std">;
+
///
/// HexagonV62 intrinsics
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsNVVM.td b/contrib/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 73622ce9303f..7f694f68969e 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -3884,96 +3884,100 @@ def int_nvvm_match_all_sync_i64p :
//
// WMMA.LOAD
-class NVVM_WMMA_LD_ALSTS<string Abc, string Layout, string Space,
- string Type, LLVMType regty, int WithStride>
+class NVVM_WMMA_LD_GALSTS<string Geometry, string Abc, string Layout,
+ string Type, LLVMType regty, int WithStride>
: Intrinsic<!if(!eq(Abc#Type,"cf16"),
[regty, regty, regty, regty],
[regty, regty, regty, regty,
regty, regty, regty, regty]),
- !if(WithStride, [llvm_ptr_ty, llvm_i32_ty], [llvm_ptr_ty]),
- [], // Properties must be set during instantiation.
- "llvm.nvvm.wmma.load."#Abc#".sync."#Layout#".m16n16k16"
- #Space
- #!if(WithStride,".stride","")
- #"."#Type>;
-
-multiclass NVVM_WMMA_LD_ALST<string Abc, string Layout, string Space,
- string Type, LLVMType regty> {
- def _stride: NVVM_WMMA_LD_ALSTS<Abc, Layout, Space, Type, regty, 1>;
- def NAME : NVVM_WMMA_LD_ALSTS<Abc, Layout, Space, Type, regty, 0>;
+ !if(WithStride, [llvm_anyptr_ty, llvm_i32_ty], [llvm_anyptr_ty]),
+ [IntrReadMem, IntrArgMemOnly, ReadOnly<0>, NoCapture<0>],
+ "llvm.nvvm.wmma."
+ # Geometry
+ # ".load"
+ # "." # Abc
+ # "." # Layout
+ # !if(WithStride, ".stride", "")
+ # "." # Type>;
+
+multiclass NVVM_WMMA_LD_GALT<string Geometry, string Abc, string Layout,
+ string Type, LLVMType regty> {
+ def _stride: NVVM_WMMA_LD_GALSTS<Geometry, Abc, Layout, Type, regty, 1>;
+ def NAME : NVVM_WMMA_LD_GALSTS<Geometry, Abc, Layout, Type, regty, 0>;
}
-multiclass NVVM_WMMA_LD_ALT<string Abc, string Layout,
- string Type, LLVMType regty> {
- defm _global: NVVM_WMMA_LD_ALST<Abc, Layout, ".global", Type, regty>;
- defm _shared: NVVM_WMMA_LD_ALST<Abc, Layout, ".shared", Type, regty>;
- defm NAME: NVVM_WMMA_LD_ALST<Abc, Layout, "", Type, regty>;
+multiclass NVVM_WMMA_LD_GAT<string Geometry, string Abc,
+ string Type, LLVMType regty> {
+ defm _row: NVVM_WMMA_LD_GALT<Geometry, Abc, "row", Type, regty>;
+ defm _col: NVVM_WMMA_LD_GALT<Geometry, Abc, "col", Type, regty>;
}
-multiclass NVVM_WMMA_LD_AT<string Abc, string Type, LLVMType regty> {
- defm _row: NVVM_WMMA_LD_ALT<Abc, "row", Type, regty>;
- defm _col: NVVM_WMMA_LD_ALT<Abc, "col", Type, regty>;
+multiclass NVVM_WMMA_LD_G<string Geometry> {
+ defm _a_f16: NVVM_WMMA_LD_GAT<Geometry, "a", "f16", llvm_v2f16_ty>;
+ defm _b_f16: NVVM_WMMA_LD_GAT<Geometry, "b", "f16", llvm_v2f16_ty>;
+ defm _c_f16: NVVM_WMMA_LD_GAT<Geometry, "c", "f16", llvm_v2f16_ty>;
+ defm _c_f32: NVVM_WMMA_LD_GAT<Geometry, "c", "f32", llvm_float_ty>;
}
-// For some reason ReadOnly<N> and NoCapture<N> confuses tblgen if they are
-// passed to Intrinsic<> form inside of a multiclass. Setting them globally
-// outside of the multiclass works.
-let IntrProperties = [IntrReadMem, IntrArgMemOnly,
- ReadOnly<0>, NoCapture<0>] in {
- defm int_nvvm_wmma_load_a_f16: NVVM_WMMA_LD_AT<"a", "f16", llvm_v2f16_ty>;
- defm int_nvvm_wmma_load_b_f16: NVVM_WMMA_LD_AT<"b", "f16", llvm_v2f16_ty>;
- defm int_nvvm_wmma_load_c_f16: NVVM_WMMA_LD_AT<"c", "f16", llvm_v2f16_ty>;
- defm int_nvvm_wmma_load_c_f32: NVVM_WMMA_LD_AT<"c", "f32", llvm_float_ty>;
+multiclass NVVM_WMMA_LD {
+ defm _m32n8k16_load: NVVM_WMMA_LD_G<"m32n8k16">;
+ defm _m16n16k16_load: NVVM_WMMA_LD_G<"m16n16k16">;
+ defm _m8n32k16_load: NVVM_WMMA_LD_G<"m8n32k16">;
}
+defm int_nvvm_wmma: NVVM_WMMA_LD;
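To make the new geometry-aware naming scheme concrete, one leaf of the multiclass chain above expands roughly to:

// int_nvvm_wmma_m16n16k16_load_a_f16_row_stride
//   name   : "llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16"
//   rets   : 8 x llvm_v2f16_ty
//   params : [llvm_anyptr_ty, llvm_i32_ty]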
+
// WMMA.STORE.D
-class NVVM_WMMA_STD_LSTS<string Layout, string Space,
- string Type, LLVMType regty, int WithStride,
- // This is only used to create a typed empty array we
- // need to pass to !if below.
- list<LLVMType>Empty=[]>
+class NVVM_WMMA_STD_GLSTS<string Geometry, string Layout,
+ string Type, LLVMType regty, int WithStride,
+ // This is only used to create a typed empty array we
+ // need to pass to !if below.
+ list<LLVMType>Empty=[]>
: Intrinsic<[],
!listconcat(
- [llvm_ptr_ty],
+ [llvm_anyptr_ty],
!if(!eq(Type,"f16"),
[regty, regty, regty, regty],
[regty, regty, regty, regty,
regty, regty, regty, regty]),
!if(WithStride, [llvm_i32_ty], Empty)),
- [], // Properties must be set during instantiation.
- "llvm.nvvm.wmma.store.d.sync."#Layout
- #".m16n16k16"#Space
- #!if(WithStride,".stride","")
- #"."#Type>;
-
-multiclass NVVM_WMMA_STD_LST<string Layout, string Space,
- string Type, LLVMType regty> {
- def _stride: NVVM_WMMA_STD_LSTS<Layout, Space, Type, regty, 1>;
- def NAME: NVVM_WMMA_STD_LSTS<Layout, Space, Type, regty, 0>;
+ [IntrWriteMem, IntrArgMemOnly, WriteOnly<0>, NoCapture<0>],
+ "llvm.nvvm.wmma."
+ # Geometry
+ # ".store.d"
+ # "." # Layout
+ # !if(WithStride, ".stride", "")
+ # "." # Type>;
+
+multiclass NVVM_WMMA_STD_GLT<string Geometry, string Layout,
+ string Type, LLVMType regty> {
+ def _stride: NVVM_WMMA_STD_GLSTS<Geometry, Layout, Type, regty, 1>;
+ def NAME: NVVM_WMMA_STD_GLSTS<Geometry, Layout, Type, regty, 0>;
}
-multiclass NVVM_WMMA_STD_LT<string Layout, string Type, LLVMType regty> {
- defm _global: NVVM_WMMA_STD_LST<Layout, ".global", Type, regty>;
- defm _shared: NVVM_WMMA_STD_LST<Layout, ".shared", Type, regty>;
- defm NAME: NVVM_WMMA_STD_LST<Layout, "", Type, regty>;
+multiclass NVVM_WMMA_STD_GT<string Geometry, string Type, LLVMType regty> {
+ defm _row: NVVM_WMMA_STD_GLT<Geometry, "row", Type, regty>;
+ defm _col: NVVM_WMMA_STD_GLT<Geometry, "col", Type, regty>;
}
-
-multiclass NVVM_WMMA_STD_T<string Type, LLVMType regty> {
- defm _row: NVVM_WMMA_STD_LT<"row", Type, regty>;
- defm _col: NVVM_WMMA_STD_LT<"col", Type, regty>;
+multiclass NVVM_WMMA_STD_G<string Geometry> {
+ defm _d_f16: NVVM_WMMA_STD_GT<Geometry, "f16", llvm_v2f16_ty>;
+ defm _d_f32: NVVM_WMMA_STD_GT<Geometry, "f32", llvm_float_ty>;
}
-let IntrProperties = [IntrWriteMem, IntrArgMemOnly,
- WriteOnly<0>, NoCapture<0>] in {
- defm int_nvvm_wmma_store_d_f16: NVVM_WMMA_STD_T<"f16", llvm_v2f16_ty>;
- defm int_nvvm_wmma_store_d_f32: NVVM_WMMA_STD_T<"f32", llvm_float_ty>;
+multiclass NVVM_WMMA_STD {
+ defm _m32n8k16_store: NVVM_WMMA_STD_G<"m32n8k16">;
+ defm _m16n16k16_store: NVVM_WMMA_STD_G<"m16n16k16">;
+ defm _m8n32k16_store: NVVM_WMMA_STD_G<"m8n32k16">;
}
+defm int_nvvm_wmma: NVVM_WMMA_STD;
+
// WMMA.MMA
-class NVVM_WMMA_MMA_ABDCS<string ALayout, string BLayout,
- string DType, LLVMType d_regty,
- string CType, LLVMType c_regty,
- string Satfinite = "">
+class NVVM_WMMA_MMA_GABDCS<string Geometry,
+ string ALayout, string BLayout,
+ string DType, LLVMType d_regty,
+ string CType, LLVMType c_regty,
+ string Satfinite = "">
: Intrinsic<!if(!eq(DType,"f16"),
[d_regty, d_regty, d_regty, d_regty],
[d_regty, d_regty, d_regty, d_regty,
@@ -3990,39 +3994,54 @@ class NVVM_WMMA_MMA_ABDCS<string ALayout, string BLayout,
[c_regty, c_regty, c_regty, c_regty,
c_regty, c_regty, c_regty, c_regty])),
[IntrNoMem],
- "llvm.nvvm.wmma.mma.sync."#ALayout#"."#BLayout
- #".m16n16k16."#DType#"."#CType#Satfinite>;
-
-multiclass NVVM_WMMA_MMA_ABDC<string ALayout, string BLayout,
- string DType, LLVMType d_regty,
- string CType, LLVMType c_regty> {
- def NAME : NVVM_WMMA_MMA_ABDCS<ALayout, BLayout,
- DType, d_regty,
- CType, c_regty>;
- def _satfinite: NVVM_WMMA_MMA_ABDCS<ALayout, BLayout,
- DType, d_regty,
- CType, c_regty,".satfinite">;
+ "llvm.nvvm.wmma."
+ # Geometry
+ # ".mma"
+ # "." # ALayout
+ # "." # BLayout
+ # "." # DType
+ # "." # CType
+ # Satfinite> {
}
-multiclass NVVM_WMMA_MMA_ABD<string ALayout, string BLayout,
+multiclass NVVM_WMMA_MMA_GABDC<string Geometry, string ALayout, string BLayout,
+ string DType, LLVMType d_regty,
+ string CType, LLVMType c_regty> {
+ def NAME : NVVM_WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
+ DType, d_regty, CType, c_regty>;
+ def _satfinite: NVVM_WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
+ DType, d_regty, CType, c_regty,".satfinite">;
+}
+
+multiclass NVVM_WMMA_MMA_GABD<string Geometry, string ALayout, string BLayout,
string DType, LLVMType d_regty> {
- defm _f16: NVVM_WMMA_MMA_ABDC<ALayout, BLayout, DType, d_regty,
+ defm _f16: NVVM_WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_regty,
"f16", llvm_v2f16_ty>;
- defm _f32: NVVM_WMMA_MMA_ABDC<ALayout, BLayout, DType, d_regty,
+ defm _f32: NVVM_WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_regty,
"f32", llvm_float_ty>;
}
-multiclass NVVM_WMMA_MMA_AB<string ALayout, string BLayout> {
- defm _f16: NVVM_WMMA_MMA_ABD<ALayout, BLayout, "f16", llvm_v2f16_ty>;
- defm _f32: NVVM_WMMA_MMA_ABD<ALayout, BLayout, "f32", llvm_float_ty>;
+multiclass NVVM_WMMA_MMA_GAB<string Geometry, string ALayout, string BLayout> {
+ defm _f16: NVVM_WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f16", llvm_v2f16_ty>;
+ defm _f32: NVVM_WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f32", llvm_float_ty>;
+}
+
+multiclass NVVM_WMMA_MMA_GA<string Geometry, string ALayout> {
+ defm _col: NVVM_WMMA_MMA_GAB<Geometry, ALayout, "col">;
+ defm _row: NVVM_WMMA_MMA_GAB<Geometry, ALayout, "row">;
+}
+
+multiclass NVVM_WMMA_MMA_G<string Geometry> {
+ defm _col: NVVM_WMMA_MMA_GA<Geometry, "col">;
+ defm _row: NVVM_WMMA_MMA_GA<Geometry, "row">;
}
-multiclass NVVM_WMMA_MMA_A<string ALayout> {
- defm _col: NVVM_WMMA_MMA_AB<ALayout, "col">;
- defm _row: NVVM_WMMA_MMA_AB<ALayout, "row">;
+multiclass NVVM_WMMA_MMA {
+ defm _m32n8k16_mma : NVVM_WMMA_MMA_G<"m32n8k16">;
+ defm _m16n16k16_mma : NVVM_WMMA_MMA_G<"m16n16k16">;
+ defm _m8n32k16_mma : NVVM_WMMA_MMA_G<"m8n32k16">;
}
-defm int_nvvm_wmma_mma_sync_col: NVVM_WMMA_MMA_A<"col">;
-defm int_nvvm_wmma_mma_sync_row: NVVM_WMMA_MMA_A<"row">;
+defm int_nvvm_wmma : NVVM_WMMA_MMA;
} // let TargetPrefix = "nvvm"
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index a302d5726aa3..c4e753af25ca 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -61,6 +61,29 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
def int_ppc_bpermd : GCCBuiltin<"__builtin_bpermd">,
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
+
+ def int_ppc_truncf128_round_to_odd
+ : GCCBuiltin<"__builtin_truncf128_round_to_odd">,
+ Intrinsic <[llvm_double_ty], [llvm_f128_ty], [IntrNoMem]>;
+ def int_ppc_sqrtf128_round_to_odd
+ : GCCBuiltin<"__builtin_sqrtf128_round_to_odd">,
+ Intrinsic <[llvm_f128_ty], [llvm_f128_ty], [IntrNoMem]>;
+ def int_ppc_addf128_round_to_odd
+ : GCCBuiltin<"__builtin_addf128_round_to_odd">,
+ Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
+ def int_ppc_subf128_round_to_odd
+ : GCCBuiltin<"__builtin_subf128_round_to_odd">,
+ Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
+ def int_ppc_mulf128_round_to_odd
+ : GCCBuiltin<"__builtin_mulf128_round_to_odd">,
+ Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
+ def int_ppc_divf128_round_to_odd
+ : GCCBuiltin<"__builtin_divf128_round_to_odd">,
+ Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
+ def int_ppc_fmaf128_round_to_odd
+ : GCCBuiltin<"__builtin_fmaf128_round_to_odd">,
+ Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
+
}
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/contrib/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 640ef627bc46..7afc755a1e37 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -8,19 +8,60 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines all of the WebAssembly-specific intrinsics.
+/// This file defines all of the WebAssembly-specific intrinsics.
///
//===----------------------------------------------------------------------===//
let TargetPrefix = "wasm" in { // All intrinsics start with "llvm.wasm.".
-// Note that current_memory is not IntrNoMem because it must be sequenced with
-// respect to grow_memory calls.
+// Query the current memory size, and increase the current memory size.
+// Note that memory.size is not IntrNoMem because it must be sequenced with
+// respect to memory.grow calls.
+def int_wasm_memory_size : Intrinsic<[llvm_anyint_ty],
+ [llvm_i32_ty],
+ [IntrReadMem]>;
+def int_wasm_memory_grow : Intrinsic<[llvm_anyint_ty],
+ [llvm_i32_ty, LLVMMatchType<0>],
+ []>;
+
+// These are the old names.
+def int_wasm_mem_size : Intrinsic<[llvm_anyint_ty],
+ [llvm_i32_ty],
+ [IntrReadMem]>;
+def int_wasm_mem_grow : Intrinsic<[llvm_anyint_ty],
+ [llvm_i32_ty, LLVMMatchType<0>],
+ []>;
+
+// These are the old old names. They also lack the immediate field.
def int_wasm_current_memory : Intrinsic<[llvm_anyint_ty], [], [IntrReadMem]>;
def int_wasm_grow_memory : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], []>;
+//===----------------------------------------------------------------------===//
// Exception handling intrinsics
-def int_wasm_throw: Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], [Throws]>;
-def int_wasm_rethrow: Intrinsic<[], [], [Throws]>;
+//===----------------------------------------------------------------------===//
+
+// throw / rethrow
+def int_wasm_throw : Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty],
+ [Throws, IntrNoReturn]>;
+def int_wasm_rethrow : Intrinsic<[], [], [Throws, IntrNoReturn]>;
+
+// Since wasm does not use landingpad instructions, these intrinsics return the
+// exception pointer and selector values until we lower them in WasmEHPrepare.
+def int_wasm_get_exception : Intrinsic<[llvm_ptr_ty], [llvm_token_ty],
+ [IntrHasSideEffects]>;
+def int_wasm_get_ehselector : Intrinsic<[llvm_i32_ty], [llvm_token_ty],
+ [IntrHasSideEffects]>;
+
+// wasm.catch returns the pointer to the exception object caught by the wasm
+// 'catch' instruction.
+def int_wasm_catch : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty],
+ [IntrHasSideEffects]>;
+
+// WebAssembly EH must maintain the landingpads in the order assigned to them
+// by the WasmEHPrepare pass in order to generate the landingpad table in
+// EHStreamer. This intrinsic is used to assign those indices in WasmEHPrepare.
+def int_wasm_landingpad_index: Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+// Returns LSDA address of the current function.
+def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
}
diff --git a/contrib/llvm/include/llvm/IR/IntrinsicsX86.td b/contrib/llvm/include/llvm/IR/IntrinsicsX86.td
index 7c000e2b1dc7..905afc130d8f 100644
--- a/contrib/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/contrib/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -63,6 +63,12 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>;
}
+// Read processor ID.
+let TargetPrefix = "x86" in {
+ def int_x86_rdpid : GCCBuiltin<"__builtin_ia32_rdpid">,
+ Intrinsic<[llvm_i32_ty], [], []>;
+}
+
//===----------------------------------------------------------------------===//
// CET SS
let TargetPrefix = "x86" in {
@@ -174,12 +180,6 @@ let TargetPrefix = "x86" in {
// Arithmetic ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_sse_sqrt_ps : GCCBuiltin<"__builtin_ia32_sqrtps">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
- [IntrNoMem]>;
def int_x86_sse_rcp_ss : GCCBuiltin<"__builtin_ia32_rcpss">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
[IntrNoMem]>;
@@ -211,6 +211,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse_cmp_ss : GCCBuiltin<"__builtin_ia32_cmpss">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+ // NOTE: This comparison intrinsic is not used by clang as long as the
+ // distinction in signaling behaviour is not implemented.
def int_x86_sse_cmp_ps :
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
@@ -263,12 +265,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse_cvttss2si64 : GCCBuiltin<"__builtin_ia32_cvttss2si64">,
Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
- def int_x86_sse_cvtsi2ss : // TODO: Remove this intrinsic.
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
- llvm_i32_ty], [IntrNoMem]>;
- def int_x86_sse_cvtsi642ss : // TODO: Remove this intrinsic.
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
- llvm_i64_ty], [IntrNoMem]>;
def int_x86_sse_cvtps2pi : GCCBuiltin<"__builtin_ia32_cvtps2pi">,
Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
@@ -304,12 +300,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// FP arithmetic ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse2_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_sse2_sqrt_pd : GCCBuiltin<"__builtin_ia32_sqrtpd">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
- [IntrNoMem]>;
def int_x86_sse2_min_sd : GCCBuiltin<"__builtin_ia32_minsd">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
llvm_v2f64_ty], [IntrNoMem]>;
@@ -329,6 +319,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse2_cmp_sd : GCCBuiltin<"__builtin_ia32_cmpsd">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
+ // NOTE: This comparison intrinsic is not used by clang as long as the
+ // distinction in signaling behaviour is not implemented.
def int_x86_sse2_cmp_pd :
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
@@ -402,9 +394,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw128">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem, Commutative]>;
- def int_x86_sse2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq128">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty,
- llvm_v4i32_ty], [IntrNoMem, Commutative]>;
def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem, Commutative]>;
@@ -468,8 +457,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Conversion ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse2_cvtdq2ps : GCCBuiltin<"__builtin_ia32_cvtdq2ps">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
def int_x86_sse2_cvtpd2dq : GCCBuiltin<"__builtin_ia32_cvtpd2dq">,
Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvttpd2dq : GCCBuiltin<"__builtin_ia32_cvttpd2dq">,
@@ -488,18 +475,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_cvttsd2si64">,
Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
- def int_x86_sse2_cvtsi2sd : // TODO: Remove this intrinsic.
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem]>;
- def int_x86_sse2_cvtsi642sd : // TODO: Remove this intrinsic.
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2ss : GCCBuiltin<"__builtin_ia32_cvtsd2ss">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
llvm_v2f64_ty], [IntrNoMem]>;
- def int_x86_sse2_cvtss2sd : // TODO: Remove this intrinsic.
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse_cvtpd2pi : GCCBuiltin<"__builtin_ia32_cvtpd2pi">,
Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse_cvttpd2pi: GCCBuiltin<"__builtin_ia32_cvttpd2pi">,
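The conversion intrinsics dropped in the hunks above (cvtdq2ps, cvtsi2ss/cvtsi642ss, cvtsd2ss's TODO-marked siblings, cvtss2sd) are the ones clang can express with ordinary IR conversions. A rough sketch, with illustrative operand names rather than code from this commit:

  ; llvm.x86.sse2.cvtdq2ps becomes a plain signed int-to-float conversion
  %f = sitofp <4 x i32> %x to <4 x float>

  ; the scalar forms (e.g. cvtsi2ss) become a scalar convert inserted into element 0
  %lo  = sitofp i32 %i to float
  %res = insertelement <4 x float> %v, float %lo, i32 0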
@@ -797,13 +775,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[IntrNoMem]>;
}
-// Vector multiply
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem, Commutative]>;
-}
-
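The pmuludq/pmuldq intrinsics removed here (the SSE2 and SSE4.1 forms, with the AVX2 ones further down) are widening 32x32->64 multiplies that the backend can pattern-match from generic IR instead. A hedged sketch of the unsigned case, assuming illustrative names:

  ; even-element unsigned widening multiply (the kind of pattern matched to PMULUDQ)
  %am = and <2 x i64> %a, <i64 4294967295, i64 4294967295>
  %bm = and <2 x i64> %b, <i64 4294967295, i64 4294967295>
  %r  = mul <2 x i64> %am, %bm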
// Vector insert
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse41_insertps : GCCBuiltin<"__builtin_ia32_insertps128">,
@@ -982,11 +953,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
llvm_v8f32_ty], [IntrNoMem]>;
- def int_x86_avx_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
- def int_x86_avx_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
-
def int_x86_avx_rsqrt_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrtps256">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
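The dedicated sqrt intrinsics removed above (sse2.sqrt.sd/pd and the 256-bit AVX forms) are covered by the target-independent llvm.sqrt intrinsic. A minimal sketch with assumed operands:

  %pd = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %ps = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %y)
  ; the scalar sqrtsd/sqrtss forms use extractelement + @llvm.sqrt.f64/f32 + insertelement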
@@ -1033,325 +999,99 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
GCCBuiltin<"__builtin_ia32_vpermilvarps256">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpermi2var_d_128 :
- GCCBuiltin<"__builtin_ia32_vpermi2vard128_mask">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_d_256 :
- GCCBuiltin<"__builtin_ia32_vpermi2vard256_mask">,
- Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_d_512 :
- GCCBuiltin<"__builtin_ia32_vpermi2vard512_mask">,
- Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_hi_128 :
- GCCBuiltin<"__builtin_ia32_vpermi2varhi128_mask">,
- Intrinsic<[llvm_v8i16_ty],
- [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_hi_256 :
- GCCBuiltin<"__builtin_ia32_vpermi2varhi256_mask">,
- Intrinsic<[llvm_v16i16_ty],
- [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_hi_512 :
- GCCBuiltin<"__builtin_ia32_vpermi2varhi512_mask">,
- Intrinsic<[llvm_v32i16_ty],
- [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_pd_128 :
- GCCBuiltin<"__builtin_ia32_vpermi2varpd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_pd_256 :
- GCCBuiltin<"__builtin_ia32_vpermi2varpd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_pd_512 :
- GCCBuiltin<"__builtin_ia32_vpermi2varpd512_mask">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_ps_128 :
- GCCBuiltin<"__builtin_ia32_vpermi2varps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_ps_256 :
- GCCBuiltin<"__builtin_ia32_vpermi2varps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_ps_512 :
- GCCBuiltin<"__builtin_ia32_vpermi2varps512_mask">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_q_128 :
- GCCBuiltin<"__builtin_ia32_vpermi2varq128_mask">,
- Intrinsic<[llvm_v2i64_ty],
- [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_q_256 :
- GCCBuiltin<"__builtin_ia32_vpermi2varq256_mask">,
- Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermi2var_q_512 :
- GCCBuiltin<"__builtin_ia32_vpermi2varq512_mask">,
- Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_d_512:
- GCCBuiltin<"__builtin_ia32_vpermt2vard512_mask">,
- Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_q_512:
- GCCBuiltin<"__builtin_ia32_vpermt2varq512_mask">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_ps_512:
- GCCBuiltin<"__builtin_ia32_vpermt2varps512_mask">,
- Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty,
- llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_pd_512:
- GCCBuiltin<"__builtin_ia32_vpermt2varpd512_mask">,
- Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty,
- llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_d_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2vard128_mask">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_d_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2vard128_maskz">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_d_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2vard256_mask">,
- Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_d_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2vard256_maskz">,
- Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_d_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2vard512_maskz">,
- Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_d_128 :
+ GCCBuiltin<"__builtin_ia32_vpermi2vard128">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpermt2var_hi_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varhi128_mask">,
- Intrinsic<[llvm_v8i16_ty],
- [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_hi_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varhi128_maskz">,
- Intrinsic<[llvm_v8i16_ty],
- [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_hi_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varhi256_mask">,
- Intrinsic<[llvm_v16i16_ty],
- [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_hi_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varhi256_maskz">,
- Intrinsic<[llvm_v16i16_ty],
- [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_hi_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2varhi512_mask">,
- Intrinsic<[llvm_v32i16_ty],
- [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_hi_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2varhi512_maskz">,
- Intrinsic<[llvm_v32i16_ty],
- [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_pd_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varpd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2i64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_pd_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varpd128_maskz">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2i64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_pd_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varpd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4i64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_pd_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varpd256_maskz">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4i64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_pd_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2varpd512_maskz">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8i64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_ps_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4i32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_ps_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varps128_maskz">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4i32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_ps_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8i32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_ps_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varps256_maskz">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8i32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_ps_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2varps512_maskz">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_q_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varq128_mask">,
- Intrinsic<[llvm_v2i64_ty],
- [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vpermt2var_q_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varq128_maskz">,
- Intrinsic<[llvm_v2i64_ty],
- [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vpermt2var_q_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varq256_mask">,
- Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_d_256 :
+ GCCBuiltin<"__builtin_ia32_vpermi2vard256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpermt2var_q_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varq256_maskz">,
- Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_d_512 :
+ GCCBuiltin<"__builtin_ia32_vpermi2vard512">,
+ Intrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_maskz_vpermt2var_q_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2varq512_maskz">,
- Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_hi_128 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varhi128">,
+ Intrinsic<[llvm_v8i16_ty],
+ [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpermi2var_qi_128 :
- GCCBuiltin<"__builtin_ia32_vpermi2varqi128_mask">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
- llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_hi_256 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varhi256">,
+ Intrinsic<[llvm_v16i16_ty],
+ [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_mask_vpermt2var_qi_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varqi128_mask">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
- llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_hi_512 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varhi512">,
+ Intrinsic<[llvm_v32i16_ty],
+ [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_maskz_vpermt2var_qi_128 :
- GCCBuiltin<"__builtin_ia32_vpermt2varqi128_maskz">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
- llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_pd_128 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varpd128">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2i64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_vpermi2var_pd_256 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varpd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4i64_ty, llvm_v4f64_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_vpermi2var_pd_512 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varpd512">,
+ Intrinsic<[llvm_v8f64_ty],
+ [llvm_v8f64_ty, llvm_v8i64_ty, llvm_v8f64_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_vpermi2var_ps_128 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varps128">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_vpermi2var_ps_256 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8f32_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_vpermi2var_ps_512 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varps512">,
+ Intrinsic<[llvm_v16f32_ty],
+ [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16f32_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_mask_vpermi2var_qi_256 :
- GCCBuiltin<"__builtin_ia32_vpermi2varqi256_mask">,
- Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
- llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_q_128 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varq128">,
+ Intrinsic<[llvm_v2i64_ty],
+ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpermt2var_qi_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varqi256_mask">,
- Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
- llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_q_256 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varq256">,
+ Intrinsic<[llvm_v4i64_ty],
+ [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpermt2var_qi_256 :
- GCCBuiltin<"__builtin_ia32_vpermt2varqi256_maskz">,
- Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
- llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_q_512 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varq512">,
+ Intrinsic<[llvm_v8i64_ty],
+ [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpermi2var_qi_512 :
- GCCBuiltin<"__builtin_ia32_vpermi2varqi512_mask">,
- Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
- llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_qi_128 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varqi128">,
+ Intrinsic<[llvm_v16i8_ty],
+ [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpermt2var_qi_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2varqi512_mask">,
- Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
- llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_qi_256 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varqi256">,
+ Intrinsic<[llvm_v32i8_ty],
+ [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpermt2var_qi_512 :
- GCCBuiltin<"__builtin_ia32_vpermt2varqi512_maskz">,
- Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
- llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
- [IntrNoMem]>;
+ def int_x86_avx512_vpermi2var_qi_512 :
+ GCCBuiltin<"__builtin_ia32_vpermi2varqi512">,
+ Intrinsic<[llvm_v64i8_ty],
+ [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpermilvar_pd_512 :
GCCBuiltin<"__builtin_ia32_vpermilvarpd512">,
@@ -1450,8 +1190,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Vector convert
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx_cvtdq2_ps_256 : GCCBuiltin<"__builtin_ia32_cvtdq2ps256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_v8i32_ty], [IntrNoMem]>;
def int_x86_avx_cvt_pd2_ps_256 : GCCBuiltin<"__builtin_ia32_cvtpd2ps256">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
@@ -1512,29 +1250,23 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty,
llvm_v4i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_fpclass_pd_128 :
- GCCBuiltin<"__builtin_ia32_fpclasspd128_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i8_ty],
+ def int_x86_avx512_fpclass_pd_128 :
+ Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_fpclass_pd_256 :
- GCCBuiltin<"__builtin_ia32_fpclasspd256_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_i8_ty],
+ def int_x86_avx512_fpclass_pd_256 :
+ Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_fpclass_pd_512 :
- GCCBuiltin<"__builtin_ia32_fpclasspd512_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_i8_ty],
+ def int_x86_avx512_fpclass_pd_512 :
+ Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_fpclass_ps_128 :
- GCCBuiltin<"__builtin_ia32_fpclassps128_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i8_ty],
+ def int_x86_avx512_fpclass_ps_128 :
+ Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_fpclass_ps_256 :
- GCCBuiltin<"__builtin_ia32_fpclassps256_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_i8_ty],
+ def int_x86_avx512_fpclass_ps_256 :
+ Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_fpclass_ps_512 :
- GCCBuiltin<"__builtin_ia32_fpclassps512_mask">,
- Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_i16_ty],
+ def int_x86_avx512_fpclass_ps_512 :
+ Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_fpclass_sd :
GCCBuiltin<"__builtin_ia32_fpclasssd_mask">,
@@ -1600,11 +1332,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
GCCBuiltin<"__builtin_ia32_maskstoreps256">,
Intrinsic<[], [llvm_ptr_ty,
llvm_v8i32_ty, llvm_v8f32_ty], [IntrArgMemOnly]>;
-
- def int_x86_avx512_mask_store_ss :
- GCCBuiltin<"__builtin_ia32_storess_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrArgMemOnly]>;
}
// BITALG bits shuffle
@@ -1661,12 +1388,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
- def int_x86_avx2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq256">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem, Commutative]>;
- def int_x86_avx2_pmul_dq : GCCBuiltin<"__builtin_ia32_pmuldq256">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
@@ -1870,15 +1591,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
- def int_x86_avx512_mask_pmul_hr_sw_128 : GCCBuiltin<"__builtin_ia32_pmulhrsw128_mask">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
- llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmul_hr_sw_256 : GCCBuiltin<"__builtin_ia32_pmulhrsw256_mask">,
- Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
- llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmul_hr_sw_512 : GCCBuiltin<"__builtin_ia32_pmulhrsw512_mask">,
- Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
- llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_pmul_hr_sw_512 : GCCBuiltin<"__builtin_ia32_pmulhrsw512">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+ llvm_v32i16_ty], [IntrNoMem, Commutative]>;
}
// Vector blend
@@ -2025,81 +1740,81 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_prorv_d_128 : GCCBuiltin<"__builtin_ia32_prorvd128_mask">,
+ def int_x86_avx512_prorv_d_128 : GCCBuiltin<"__builtin_ia32_prorvd128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prorv_d_256 : GCCBuiltin<"__builtin_ia32_prorvd256_mask">,
+ llvm_v4i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prorv_d_256 : GCCBuiltin<"__builtin_ia32_prorvd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prorv_d_512 : GCCBuiltin<"__builtin_ia32_prorvd512_mask">,
+ llvm_v8i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prorv_d_512 : GCCBuiltin<"__builtin_ia32_prorvd512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prorv_q_128 : GCCBuiltin<"__builtin_ia32_prorvq128_mask">,
+ llvm_v16i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prorv_q_128 : GCCBuiltin<"__builtin_ia32_prorvq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prorv_q_256 : GCCBuiltin<"__builtin_ia32_prorvq256_mask">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_prorv_q_256 : GCCBuiltin<"__builtin_ia32_prorvq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prorv_q_512 : GCCBuiltin<"__builtin_ia32_prorvq512_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_prorv_q_512 : GCCBuiltin<"__builtin_ia32_prorvq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ llvm_v8i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prol_d_128 : GCCBuiltin<"__builtin_ia32_prold128_mask">,
+ def int_x86_avx512_prol_d_128 : GCCBuiltin<"__builtin_ia32_prold128">,
Intrinsic<[llvm_v4i32_ty] , [llvm_v4i32_ty,
- llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prol_d_256 : GCCBuiltin<"__builtin_ia32_prold256_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prol_d_256 : GCCBuiltin<"__builtin_ia32_prold256">,
Intrinsic<[llvm_v8i32_ty] , [llvm_v8i32_ty,
- llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prol_d_512 : GCCBuiltin<"__builtin_ia32_prold512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prol_d_512 : GCCBuiltin<"__builtin_ia32_prold512">,
Intrinsic<[llvm_v16i32_ty] , [llvm_v16i32_ty,
- llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prol_q_128 : GCCBuiltin<"__builtin_ia32_prolq128_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prol_q_128 : GCCBuiltin<"__builtin_ia32_prolq128">,
Intrinsic<[llvm_v2i64_ty] , [llvm_v2i64_ty,
- llvm_i32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prol_q_256 : GCCBuiltin<"__builtin_ia32_prolq256_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prol_q_256 : GCCBuiltin<"__builtin_ia32_prolq256">,
Intrinsic<[llvm_v4i64_ty] , [llvm_v4i64_ty,
- llvm_i32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prol_q_512 : GCCBuiltin<"__builtin_ia32_prolq512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prol_q_512 : GCCBuiltin<"__builtin_ia32_prolq512">,
Intrinsic<[llvm_v8i64_ty] , [llvm_v8i64_ty,
- llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prolv_d_128 : GCCBuiltin<"__builtin_ia32_prolvd128_mask">,
+ def int_x86_avx512_prolv_d_128 : GCCBuiltin<"__builtin_ia32_prolvd128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prolv_d_256 : GCCBuiltin<"__builtin_ia32_prolvd256_mask">,
+ llvm_v4i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prolv_d_256 : GCCBuiltin<"__builtin_ia32_prolvd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prolv_d_512 : GCCBuiltin<"__builtin_ia32_prolvd512_mask">,
+ llvm_v8i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prolv_d_512 : GCCBuiltin<"__builtin_ia32_prolvd512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prolv_q_128 : GCCBuiltin<"__builtin_ia32_prolvq128_mask">,
+ llvm_v16i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_prolv_q_128 : GCCBuiltin<"__builtin_ia32_prolvq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prolv_q_256 : GCCBuiltin<"__builtin_ia32_prolvq256_mask">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_prolv_q_256 : GCCBuiltin<"__builtin_ia32_prolvq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_prolv_q_512 : GCCBuiltin<"__builtin_ia32_prolvq512_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_prolv_q_512 : GCCBuiltin<"__builtin_ia32_prolvq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pror_d_128 : GCCBuiltin<"__builtin_ia32_prord128_mask">,
+ llvm_v8i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_pror_d_128 : GCCBuiltin<"__builtin_ia32_prord128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
- llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pror_d_256 : GCCBuiltin<"__builtin_ia32_prord256_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_pror_d_256 : GCCBuiltin<"__builtin_ia32_prord256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
- llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pror_d_512 : GCCBuiltin<"__builtin_ia32_prord512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_pror_d_512 : GCCBuiltin<"__builtin_ia32_prord512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
- llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pror_q_128 : GCCBuiltin<"__builtin_ia32_prorq128_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_pror_q_128 : GCCBuiltin<"__builtin_ia32_prorq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
- llvm_i32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pror_q_256 : GCCBuiltin<"__builtin_ia32_prorq256_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_pror_q_256 : GCCBuiltin<"__builtin_ia32_prorq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
- llvm_i32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pror_q_512 : GCCBuiltin<"__builtin_ia32_prorq512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_pror_q_512 : GCCBuiltin<"__builtin_ia32_prorq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
- llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ llvm_i32_ty], [IntrNoMem]>;
}
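As with the permutes, the rotate intrinsics lose their mask and pass-through operands; zero-masking (the old _maskz builtins) becomes a select against zeroinitializer. Rough sketch, assumed operands:

  %rot = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x, <16 x i32> %amt)
  %k   = bitcast i16 %mask to <16 x i1>
  %r   = select <16 x i1> %k, <16 x i32> %rot, <16 x i32> zeroinitializer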
@@ -2188,754 +1903,115 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// FMA3 and FMA4
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_fma_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma4_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmadd_ps : GCCBuiltin<"__builtin_ia32_vfmaddps">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmadd_pd : GCCBuiltin<"__builtin_ia32_vfmaddpd">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
- [IntrNoMem]>;
-
- def int_x86_fma_vfmsub_ss : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsub_sd : // TODO: remove this intrinsic
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsub_ps : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsub_pd : // TODO: remove this intrinsic
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsub_ps_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsub_pd_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmadd_ss : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmadd_sd : // TODO: remove this intrinsic
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmadd_ps : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmadd_pd : // TODO: remove this intrinsic
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmadd_ps_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmadd_pd_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmsub_ss : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmsub_sd : // TODO: remove this intrinsic
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmsub_ps : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmsub_pd : // TODO: remove this intrinsic
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmsub_ps_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfnmsub_pd_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmaddsub_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps256">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmaddsub_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd256">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsubadd_ps : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsubadd_pd : // TODO: remove this intrinsic
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsubadd_ps_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
- [IntrNoMem]>;
- def int_x86_fma_vfmsubadd_pd_256 : // TODO: remove this intrinsic
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmadd_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmadd_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd128_maskz">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmadd_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmadd_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd256_mask3">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd256_maskz">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmadd_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask">,
+ def int_x86_avx512_vfmadd_pd_512 :
Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmadd_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask3">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddpd512_maskz">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmadd_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask3_vfmadd_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddps128_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddps128_maskz">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmadd_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmadd_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddps256_mask3">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddps256_maskz">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmadd_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddps512_mask">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmadd_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddps512_mask3">,
+ def int_x86_avx512_vfmadd_ps_512 :
Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddps512_maskz">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmaddsub_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
+ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask3_vfmaddsub_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmaddsub_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_maskz">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmaddsub_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmaddsub_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_mask3">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmaddsub_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_maskz">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmaddsub_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmaddsub_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask3">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmaddsub_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_maskz">,
+ // TODO: Can we use 2 vfmadds+shufflevector?
+ def int_x86_avx512_vfmaddsub_pd_512 :
Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmaddsub_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmaddsub_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps128_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmaddsub_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps128_maskz">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmaddsub_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmaddsub_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps256_mask3">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
+ [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_maskz_vfmaddsub_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps256_maskz">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmaddsub_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps512_mask">,
+ def int_x86_avx512_vfmaddsub_ps_512 :
Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmaddsub_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps512_mask3">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmaddsub_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmaddsubps512_maskz">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
-
- def int_x86_avx512_mask_vfmadd_sd :
- GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfmadd_ss :
- GCCBuiltin<"__builtin_ia32_vfmaddss3_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_sd :
- GCCBuiltin<"__builtin_ia32_vfmaddsd3_maskz">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_maskz_vfmadd_ss :
- GCCBuiltin<"__builtin_ia32_vfmaddss3_maskz">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmadd_sd :
- GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmadd_ss :
- GCCBuiltin<"__builtin_ia32_vfmaddss3_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_sd :
- GCCBuiltin<"__builtin_ia32_vfmsubsd3_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_ss :
- GCCBuiltin<"__builtin_ia32_vfmsubss3_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmsubpd256_mask3">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmsubpd512_mask3">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmsubps128_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmsubps256_mask3">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsub_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmsubps512_mask3">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsubadd_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfmsubaddpd128_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsubadd_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfmsubaddpd256_mask3">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsubadd_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfmsubaddpd512_mask3">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsubadd_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfmsubaddps128_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsubadd_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfmsubaddps256_mask3">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfmsubadd_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfmsubaddps512_mask3">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmadd_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfnmaddpd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmadd_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfnmaddpd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmadd_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfnmaddpd512_mask">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmadd_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfnmaddps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmadd_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfnmaddps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmadd_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfnmaddps512_mask">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfnmsub_sd :
- GCCBuiltin<"__builtin_ia32_vfnmsubsd3_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfnmsub_ss :
- GCCBuiltin<"__builtin_ia32_vfnmsubss3_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmsub_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfnmsub_pd_128 :
- GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask3">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmsub_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfnmsubpd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfnmsub_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfnmsubpd256_mask3">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmsub_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfnmsub_pd_512 :
- GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask3">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmsub_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfnmsubps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask3_vfnmsub_ps_128 :
- GCCBuiltin<"__builtin_ia32_vfnmsubps128_mask3">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmsub_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfnmsubps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfnmsub_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfnmsubps256_mask3">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_vfnmsub_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfnmsubps512_mask">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask3_vfnmsub_ps_512 :
- GCCBuiltin<"__builtin_ia32_vfnmsubps512_mask3">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vfmadd_f64 :
+ Intrinsic<[llvm_double_ty],
+ [llvm_double_ty, llvm_double_ty, llvm_double_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx512_vfmadd_f32 :
+ Intrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq128_mask">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq128_maskz">,
+ def int_x86_avx512_vpmadd52h_uq_128 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq128_mask">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq128_maskz">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_128 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq256_mask">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq256_maskz">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq256_mask">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52h_uq_256 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq256_maskz">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_256 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq512_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52h_uq_512 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq512_maskz">,
+ llvm_v8i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_512 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq512_mask">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq512_maskz">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ llvm_v8i64_ty], [IntrNoMem]>;
}
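The plain FMA3/FMA4 packed intrinsics deleted at the top of this hunk map onto the generic llvm.fma intrinsic, and only the 512-bit and scalar definitions that carry a rounding-mode immediate remain target-specific. On the TODO above about expressing vfmaddsub with two vfmadds plus a shufflevector, one plausible expansion is sketched below; this is illustrative, not code from this commit, and fneg is the modern spelling of "fsub -0.0, x":

  ; generic replacement for e.g. llvm.x86.fma.vfmadd.ps
  %r = call <4 x float> @llvm.fma.v4f32(<4 x float> %p, <4 x float> %q, <4 x float> %s)

  ; fmaddsub.pd.512: even lanes subtract, odd lanes add, then blend the two results
  %add  = call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c)
  %negc = fneg <8 x double> %c
  %sub  = call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %negc)
  %alt  = shufflevector <8 x double> %sub, <8 x double> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>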
// VNNI
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx512_mask_vpdpbusd_128 :
- GCCBuiltin<"__builtin_ia32_vpdpbusd128_mask">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpbusd_128 :
- GCCBuiltin<"__builtin_ia32_vpdpbusd128_maskz">,
+ def int_x86_avx512_vpdpbusd_128 :
+ GCCBuiltin<"__builtin_ia32_vpdpbusd128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpbusd_256 :
- GCCBuiltin<"__builtin_ia32_vpdpbusd256_mask">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpbusd_256 :
- GCCBuiltin<"__builtin_ia32_vpdpbusd256_maskz">,
+ llvm_v4i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpbusd_256 :
+ GCCBuiltin<"__builtin_ia32_vpdpbusd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpbusd_512 :
- GCCBuiltin<"__builtin_ia32_vpdpbusd512_mask">,
- Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpbusd_512 :
- GCCBuiltin<"__builtin_ia32_vpdpbusd512_maskz">,
+ llvm_v8i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpbusd_512 :
+ GCCBuiltin<"__builtin_ia32_vpdpbusd512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+ llvm_v16i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpbusds_128 :
- GCCBuiltin<"__builtin_ia32_vpdpbusds128_mask">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpbusds_128 :
- GCCBuiltin<"__builtin_ia32_vpdpbusds128_maskz">,
+ def int_x86_avx512_vpdpbusds_128 :
+ GCCBuiltin<"__builtin_ia32_vpdpbusds128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpbusds_256 :
- GCCBuiltin<"__builtin_ia32_vpdpbusds256_mask">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpbusds_256 :
- GCCBuiltin<"__builtin_ia32_vpdpbusds256_maskz">,
+ llvm_v4i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpbusds_256 :
+ GCCBuiltin<"__builtin_ia32_vpdpbusds256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpbusds_512 :
- GCCBuiltin<"__builtin_ia32_vpdpbusds512_mask">,
- Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpbusds_512 :
- GCCBuiltin<"__builtin_ia32_vpdpbusds512_maskz">,
+ llvm_v8i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpbusds_512 :
+ GCCBuiltin<"__builtin_ia32_vpdpbusds512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+ llvm_v16i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpwssd_128 :
- GCCBuiltin<"__builtin_ia32_vpdpwssd128_mask">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpwssd_128 :
- GCCBuiltin<"__builtin_ia32_vpdpwssd128_maskz">,
+ def int_x86_avx512_vpdpwssd_128 :
+ GCCBuiltin<"__builtin_ia32_vpdpwssd128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpwssd_256 :
- GCCBuiltin<"__builtin_ia32_vpdpwssd256_mask">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpwssd_256 :
- GCCBuiltin<"__builtin_ia32_vpdpwssd256_maskz">,
+ llvm_v4i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpwssd_256 :
+ GCCBuiltin<"__builtin_ia32_vpdpwssd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpwssd_512 :
- GCCBuiltin<"__builtin_ia32_vpdpwssd512_mask">,
- Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpwssd_512 :
- GCCBuiltin<"__builtin_ia32_vpdpwssd512_maskz">,
+ llvm_v8i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpwssd_512 :
+ GCCBuiltin<"__builtin_ia32_vpdpwssd512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+ llvm_v16i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpwssds_128 :
- GCCBuiltin<"__builtin_ia32_vpdpwssds128_mask">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpwssds_128 :
- GCCBuiltin<"__builtin_ia32_vpdpwssds128_maskz">,
+ def int_x86_avx512_vpdpwssds_128 :
+ GCCBuiltin<"__builtin_ia32_vpdpwssds128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpwssds_256 :
- GCCBuiltin<"__builtin_ia32_vpdpwssds256_mask">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpwssds_256 :
- GCCBuiltin<"__builtin_ia32_vpdpwssds256_maskz">,
+ llvm_v4i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpwssds_256 :
+ GCCBuiltin<"__builtin_ia32_vpdpwssds256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpdpwssds_512 :
- GCCBuiltin<"__builtin_ia32_vpdpwssds512_mask">,
- Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpdpwssds_512 :
- GCCBuiltin<"__builtin_ia32_vpdpwssds512_maskz">,
+ llvm_v8i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpdpwssds_512 :
+ GCCBuiltin<"__builtin_ia32_vpdpwssds512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+ llvm_v16i32_ty], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
@@ -3050,62 +2126,62 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
GCCBuiltin<"__builtin_ia32_vpmacsdd">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacsdqh :
GCCBuiltin<"__builtin_ia32_vpmacsdqh">,
Intrinsic<[llvm_v2i64_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacsdql :
GCCBuiltin<"__builtin_ia32_vpmacsdql">,
Intrinsic<[llvm_v2i64_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacssdd :
GCCBuiltin<"__builtin_ia32_vpmacssdd">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacssdqh :
GCCBuiltin<"__builtin_ia32_vpmacssdqh">,
Intrinsic<[llvm_v2i64_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacssdql :
GCCBuiltin<"__builtin_ia32_vpmacssdql">,
Intrinsic<[llvm_v2i64_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacsswd :
GCCBuiltin<"__builtin_ia32_vpmacsswd">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacssww :
GCCBuiltin<"__builtin_ia32_vpmacssww">,
Intrinsic<[llvm_v8i16_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacswd :
GCCBuiltin<"__builtin_ia32_vpmacswd">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacsww :
GCCBuiltin<"__builtin_ia32_vpmacsww">,
Intrinsic<[llvm_v8i16_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmadcsswd :
GCCBuiltin<"__builtin_ia32_vpmadcsswd">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpmadcswd :
GCCBuiltin<"__builtin_ia32_vpmadcswd">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, Commutative]>;
def int_x86_xop_vpperm :
GCCBuiltin<"__builtin_ia32_vpperm">,
Intrinsic<[llvm_v16i8_ty],
@@ -3383,48 +2459,42 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
}
// Permute
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx512_mask_permvar_df_256 : GCCBuiltin<"__builtin_ia32_permvardf256_mask">,
+ def int_x86_avx512_permvar_df_256 : GCCBuiltin<"__builtin_ia32_permvardf256">,
Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
- llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_df_512 : GCCBuiltin<"__builtin_ia32_permvardf512_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_df_512 : GCCBuiltin<"__builtin_ia32_permvardf512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty,
- llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_di_256 : GCCBuiltin<"__builtin_ia32_permvardi256_mask">,
+ llvm_v8i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_di_256 : GCCBuiltin<"__builtin_ia32_permvardi256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_di_512 : GCCBuiltin<"__builtin_ia32_permvardi512_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_di_512 : GCCBuiltin<"__builtin_ia32_permvardi512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_hi_128 : GCCBuiltin<"__builtin_ia32_permvarhi128_mask">,
+ llvm_v8i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_hi_128 : GCCBuiltin<"__builtin_ia32_permvarhi128">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
- llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_hi_256 : GCCBuiltin<"__builtin_ia32_permvarhi256_mask">,
+ llvm_v8i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_hi_256 : GCCBuiltin<"__builtin_ia32_permvarhi256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
- llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_hi_512 : GCCBuiltin<"__builtin_ia32_permvarhi512_mask">,
+ llvm_v16i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_hi_512 : GCCBuiltin<"__builtin_ia32_permvarhi512">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
- llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_qi_128 : GCCBuiltin<"__builtin_ia32_permvarqi128_mask">,
+ llvm_v32i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_qi_128 : GCCBuiltin<"__builtin_ia32_permvarqi128">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
- llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_qi_256 : GCCBuiltin<"__builtin_ia32_permvarqi256_mask">,
+ llvm_v16i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_qi_256 : GCCBuiltin<"__builtin_ia32_permvarqi256">,
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
- llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_qi_512 : GCCBuiltin<"__builtin_ia32_permvarqi512_mask">,
+ llvm_v32i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_qi_512 : GCCBuiltin<"__builtin_ia32_permvarqi512">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
- llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_sf_256 : GCCBuiltin<"__builtin_ia32_permvarsf256_mask">,
- Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
- llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_sf_512 : GCCBuiltin<"__builtin_ia32_permvarsf512_mask">,
+ llvm_v64i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_sf_512 : GCCBuiltin<"__builtin_ia32_permvarsf512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty,
- llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_si_256 : GCCBuiltin<"__builtin_ia32_permvarsi256_mask">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_permvar_si_512 : GCCBuiltin<"__builtin_ia32_permvarsi512_mask">,
+ llvm_v16i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_permvar_si_512 : GCCBuiltin<"__builtin_ia32_permvarsi512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
- llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+ llvm_v16i32_ty], [IntrNoMem]>;
}
// Pack ops.
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
@@ -3717,44 +2787,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
//===----------------------------------------------------------------------===//
// AVX512
-// Mask ops
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- // Mask instructions
- // 16-bit mask
- def int_x86_avx512_kand_w : // TODO: remove this intrinsic
- Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kandn_w : // TODO: remove this intrinsic
- Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_knot_w : // TODO: remove this intrinsic
- Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_kor_w : // TODO: remove this intrinsic
- Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kxor_w : // TODO: remove this intrinsic
- Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
- Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
- Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
- Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
- Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_kortestc_w : GCCBuiltin<"__builtin_ia32_kortestchi">,
- Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
-}
-
// Conversion ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_cvttss2si : GCCBuiltin<"__builtin_ia32_vcvttss2si32">,
@@ -3779,9 +2811,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_cvttsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvttsd2usi64">,
Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtusi2sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd32">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_cvtusi642sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd64">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -3810,35 +2839,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_cvtsi2sd64 : GCCBuiltin<"__builtin_ia32_cvtsi2sd64">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_cvtb2mask_128 : GCCBuiltin<"__builtin_ia32_cvtb2mask128">,
- Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtb2mask_256 : GCCBuiltin<"__builtin_ia32_cvtb2mask256">,
- Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtb2mask_512 : GCCBuiltin<"__builtin_ia32_cvtb2mask512">,
- Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_cvtw2mask_128 : GCCBuiltin<"__builtin_ia32_cvtw2mask128">,
- Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtw2mask_256 : GCCBuiltin<"__builtin_ia32_cvtw2mask256">,
- Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtw2mask_512 : GCCBuiltin<"__builtin_ia32_cvtw2mask512">,
- Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty], [IntrNoMem]>;
-
- def int_x86_avx512_cvtd2mask_128 : GCCBuiltin<"__builtin_ia32_cvtd2mask128">,
- Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtd2mask_256 : GCCBuiltin<"__builtin_ia32_cvtd2mask256">,
- Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtd2mask_512 : GCCBuiltin<"__builtin_ia32_cvtd2mask512">,
- Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty], [IntrNoMem]>;
-
- def int_x86_avx512_cvtq2mask_128 : GCCBuiltin<"__builtin_ia32_cvtq2mask128">,
- Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtq2mask_256 : GCCBuiltin<"__builtin_ia32_cvtq2mask256">,
- Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty], [IntrNoMem]>;
- def int_x86_avx512_cvtq2mask_512 : GCCBuiltin<"__builtin_ia32_cvtq2mask512">,
- Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty], [IntrNoMem]>;
-
}
// Pack ops.
@@ -3859,18 +2859,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Vector convert
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx512_mask_cvtdq2ps_128 :
- GCCBuiltin<"__builtin_ia32_cvtdq2ps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_cvtdq2ps_256 :
- GCCBuiltin<"__builtin_ia32_cvtdq2ps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvtdq2ps_512 :
GCCBuiltin<"__builtin_ia32_cvtdq2ps512_mask">,
Intrinsic<[llvm_v16f32_ty],
@@ -3883,24 +2871,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvtpd2dq_256 :
- GCCBuiltin<"__builtin_ia32_cvtpd2dq256_mask">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvtpd2dq_512 :
GCCBuiltin<"__builtin_ia32_cvtpd2dq512_mask">,
Intrinsic<[llvm_v8i32_ty],
[llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvtpd2ps_256 :
- GCCBuiltin<"__builtin_ia32_cvtpd2ps256_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4f64_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvtpd2ps_512 :
GCCBuiltin<"__builtin_ia32_cvtpd2ps512_mask">,
Intrinsic<[llvm_v8f32_ty],
@@ -3997,18 +2973,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvtps2pd_128 :
- GCCBuiltin<"__builtin_ia32_cvtps2pd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v4f32_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_cvtps2pd_256 :
- GCCBuiltin<"__builtin_ia32_cvtps2pd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f32_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvtps2pd_512 :
GCCBuiltin<"__builtin_ia32_cvtps2pd512_mask">,
Intrinsic<[llvm_v8f64_ty],
@@ -4069,18 +3033,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvtqq2pd_128 :
- GCCBuiltin<"__builtin_ia32_cvtqq2pd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_cvtqq2pd_256 :
- GCCBuiltin<"__builtin_ia32_cvtqq2pd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvtqq2pd_512 :
GCCBuiltin<"__builtin_ia32_cvtqq2pd512_mask">,
Intrinsic<[llvm_v8f64_ty],
@@ -4111,12 +3063,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvttpd2dq_256 :
- GCCBuiltin<"__builtin_ia32_cvttpd2dq256_mask">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvttpd2dq_512 :
GCCBuiltin<"__builtin_ia32_cvttpd2dq512_mask">,
Intrinsic<[llvm_v8i32_ty],
@@ -4177,18 +3123,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvttps2dq_128 :
- GCCBuiltin<"__builtin_ia32_cvttps2dq128_mask">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_cvttps2dq_256 :
- GCCBuiltin<"__builtin_ia32_cvttps2dq256_mask">,
- Intrinsic<[llvm_v8i32_ty],
- [llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvttps2dq_512 :
GCCBuiltin<"__builtin_ia32_cvttps2dq512_mask">,
Intrinsic<[llvm_v16i32_ty],
@@ -4249,36 +3183,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvtudq2ps_128 :
- GCCBuiltin<"__builtin_ia32_cvtudq2ps128_mask">,
- Intrinsic<[llvm_v4f32_ty],
- [llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_cvtudq2ps_256 :
- GCCBuiltin<"__builtin_ia32_cvtudq2ps256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvtudq2ps_512 :
GCCBuiltin<"__builtin_ia32_cvtudq2ps512_mask">,
Intrinsic<[llvm_v16f32_ty],
[llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty],
[IntrNoMem]>;
- def int_x86_avx512_mask_cvtuqq2pd_128 :
- GCCBuiltin<"__builtin_ia32_cvtuqq2pd128_mask">,
- Intrinsic<[llvm_v2f64_ty],
- [llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_cvtuqq2pd_256 :
- GCCBuiltin<"__builtin_ia32_cvtuqq2pd256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
def int_x86_avx512_mask_cvtuqq2pd_512 :
GCCBuiltin<"__builtin_ia32_cvtuqq2pd512_mask">,
Intrinsic<[llvm_v8f64_ty],
@@ -4361,13 +3271,6 @@ def int_x86_avx512_mask_range_ps_512 : GCCBuiltin<"__builtin_ia32_rangeps512_mas
// Vector load with broadcast
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- // TODO: Remove the broadcast intrinsics with no gcc builtin and autoupgrade
- def int_x86_avx512_vbroadcast_ss_512 :
- Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
-
- def int_x86_avx512_vbroadcast_sd_512 :
- Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
-
def int_x86_avx512_broadcastmw_512 :
GCCBuiltin<"__builtin_ia32_broadcastmw512">,
Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>;
@@ -4391,42 +3294,43 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Arithmetic ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx512_mask_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512_mask">,
+ def int_x86_avx512_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round_mask">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
@@ -4514,31 +3418,17 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss_round_mask">,
+ def int_x86_avx512_mask_sqrt_ss :
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd_round_mask">,
+ def int_x86_avx512_mask_sqrt_sd :
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_pd_128 : GCCBuiltin<"__builtin_ia32_sqrtpd128_mask">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256_mask">,
- Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512_mask">,
- Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_ps_128 : GCCBuiltin<"__builtin_ia32_sqrtps128_mask">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256_mask">,
- Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512_mask">,
- Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_sqrt_pd_512 :
+ Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_sqrt_ps_512 :
+ Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_fixupimm_pd_128 :
GCCBuiltin<"__builtin_ia32_fixupimmpd128_mask">,
Intrinsic<[llvm_v2f64_ty],
@@ -4787,148 +3677,105 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
}
// Integer arithmetic ops
let TargetPrefix = "x86" in {
- def int_x86_avx512_mask_padds_b_128 : GCCBuiltin<"__builtin_ia32_paddsb128_mask">,
+ def int_x86_avx512_mask_padds_b_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_padds_b_256 : GCCBuiltin<"__builtin_ia32_paddsb256_mask">,
+ def int_x86_avx512_mask_padds_b_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_padds_w_128 : GCCBuiltin<"__builtin_ia32_paddsw128_mask">,
+ def int_x86_avx512_mask_padds_w_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_padds_w_256 : GCCBuiltin<"__builtin_ia32_paddsw256_mask">,
+ def int_x86_avx512_mask_padds_w_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_paddus_b_128 : GCCBuiltin<"__builtin_ia32_paddusb128_mask">,
+ def int_x86_avx512_mask_paddus_b_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_paddus_b_256 : GCCBuiltin<"__builtin_ia32_paddusb256_mask">,
+ def int_x86_avx512_mask_paddus_b_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_paddus_b_512 : GCCBuiltin<"__builtin_ia32_paddusb512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_paddus_w_128 : GCCBuiltin<"__builtin_ia32_paddusw128_mask">,
+ def int_x86_avx512_mask_paddus_w_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_paddus_w_256 : GCCBuiltin<"__builtin_ia32_paddusw256_mask">,
+ def int_x86_avx512_mask_paddus_w_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_paddus_w_512 : GCCBuiltin<"__builtin_ia32_paddusw512_mask">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubs_b_128 : GCCBuiltin<"__builtin_ia32_psubsb128_mask">,
+ def int_x86_avx512_mask_psubs_b_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubs_b_256 : GCCBuiltin<"__builtin_ia32_psubsb256_mask">,
+ def int_x86_avx512_mask_psubs_b_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubs_w_128 : GCCBuiltin<"__builtin_ia32_psubsw128_mask">,
+ def int_x86_avx512_mask_psubs_w_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubs_w_256 : GCCBuiltin<"__builtin_ia32_psubsw256_mask">,
+ def int_x86_avx512_mask_psubs_w_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubus_b_128 : GCCBuiltin<"__builtin_ia32_psubusb128_mask">,
+ def int_x86_avx512_mask_psubus_b_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubus_b_256 : GCCBuiltin<"__builtin_ia32_psubusb256_mask">,
+ def int_x86_avx512_mask_psubus_b_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_psubus_b_512 : GCCBuiltin<"__builtin_ia32_psubusb512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubus_w_128 : GCCBuiltin<"__builtin_ia32_psubusw128_mask">,
+ def int_x86_avx512_mask_psubus_w_128 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_psubus_w_256 : GCCBuiltin<"__builtin_ia32_psubusw256_mask">,
+ def int_x86_avx512_mask_psubus_w_256 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
- def int_x86_avx512_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512_mask">,
- Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
- llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512_mask">,
- Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
- llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmulhu_w_128 : GCCBuiltin<"__builtin_ia32_pmulhuw128_mask">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
- llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmulhu_w_256 : GCCBuiltin<"__builtin_ia32_pmulhuw256_mask">,
- Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
- llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmulh_w_128 : GCCBuiltin<"__builtin_ia32_pmulhw128_mask">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
- llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmulh_w_256 : GCCBuiltin<"__builtin_ia32_pmulhw256_mask">,
- Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
- llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_pmaddw_d_128 :
- GCCBuiltin<"__builtin_ia32_pmaddwd128_mask">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
- def int_x86_avx512_mask_pmaddw_d_256 :
- GCCBuiltin<"__builtin_ia32_pmaddwd256_mask">,
- Intrinsic<[llvm_v8i32_ty],
- [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v8i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
- def int_x86_avx512_mask_pmaddw_d_512 :
- GCCBuiltin<"__builtin_ia32_pmaddwd512_mask">,
- Intrinsic<[llvm_v16i32_ty],
- [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v16i32_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_mask_pmaddubs_w_128 :
- GCCBuiltin<"__builtin_ia32_pmaddubsw128_mask">,
- Intrinsic<[llvm_v8i16_ty],
- [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v8i16_ty, llvm_i8_ty],
- [IntrNoMem]>;
- def int_x86_avx512_mask_pmaddubs_w_256 :
- GCCBuiltin<"__builtin_ia32_pmaddubsw256_mask">,
- Intrinsic<[llvm_v16i16_ty],
- [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v16i16_ty, llvm_i16_ty],
- [IntrNoMem]>;
- def int_x86_avx512_mask_pmaddubs_w_512 :
- GCCBuiltin<"__builtin_ia32_pmaddubsw512_mask">,
- Intrinsic<[llvm_v32i16_ty],
- [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v32i16_ty, llvm_i32_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_dbpsadbw_128 :
- GCCBuiltin<"__builtin_ia32_dbpsadbw128_mask">,
+ def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+ llvm_v32i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx512_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+ llvm_v32i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx512_pmaddw_d_512 : GCCBuiltin<"__builtin_ia32_pmaddwd512">,
+ Intrinsic<[llvm_v16i32_ty], [llvm_v32i16_ty,
+ llvm_v32i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx512_pmaddubs_w_512 : GCCBuiltin<"__builtin_ia32_pmaddubsw512">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v64i8_ty,
+ llvm_v64i8_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_dbpsadbw_128 :
+ GCCBuiltin<"__builtin_ia32_dbpsadbw128">,
Intrinsic<[llvm_v8i16_ty],
- [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_v8i16_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_dbpsadbw_256 :
- GCCBuiltin<"__builtin_ia32_dbpsadbw256_mask">,
+ def int_x86_avx512_dbpsadbw_256 :
+ GCCBuiltin<"__builtin_ia32_dbpsadbw256">,
Intrinsic<[llvm_v16i16_ty],
- [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty, llvm_v16i16_ty,
- llvm_i16_ty], [IntrNoMem]>;
+ [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_dbpsadbw_512 :
- GCCBuiltin<"__builtin_ia32_dbpsadbw512_mask">,
+ def int_x86_avx512_dbpsadbw_512 :
+ GCCBuiltin<"__builtin_ia32_dbpsadbw512">,
Intrinsic<[llvm_v32i16_ty],
- [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty, llvm_v32i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
+ [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty], [IntrNoMem]>;
}
// Gather and Scatter ops
@@ -5299,31 +4146,6 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_compress_store_ps_512 :
- GCCBuiltin<"__builtin_ia32_compressstoresf512_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty,
- llvm_i16_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_pd_512 :
- GCCBuiltin<"__builtin_ia32_compressstoredf512_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_ps_256 :
- GCCBuiltin<"__builtin_ia32_compressstoresf256_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_pd_256 :
- GCCBuiltin<"__builtin_ia32_compressstoredf256_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_ps_128 :
- GCCBuiltin<"__builtin_ia32_compressstoresf128_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_pd_128 :
- GCCBuiltin<"__builtin_ia32_compressstoredf128_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
-
def int_x86_avx512_mask_compress_d_512 :
GCCBuiltin<"__builtin_ia32_compresssi512_mask">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
@@ -5349,31 +4171,6 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_compress_store_d_512 :
- GCCBuiltin<"__builtin_ia32_compressstoresi512_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty,
- llvm_i16_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_q_512 :
- GCCBuiltin<"__builtin_ia32_compressstoredi512_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_d_256 :
- GCCBuiltin<"__builtin_ia32_compressstoresi256_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_q_256 :
- GCCBuiltin<"__builtin_ia32_compressstoredi256_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_d_128 :
- GCCBuiltin<"__builtin_ia32_compressstoresi128_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_q_128 :
- GCCBuiltin<"__builtin_ia32_compressstoredi128_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
-
def int_x86_avx512_mask_compress_b_512 :
GCCBuiltin<"__builtin_ia32_compressqi512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
@@ -5399,31 +4196,6 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_compress_store_b_512 :
- GCCBuiltin<"__builtin_ia32_compressstoreqi512_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v64i8_ty,
- llvm_i64_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_w_512 :
- GCCBuiltin<"__builtin_ia32_compressstorehi512_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty,
- llvm_i32_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_b_256 :
- GCCBuiltin<"__builtin_ia32_compressstoreqi256_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty,
- llvm_i32_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_w_256 :
- GCCBuiltin<"__builtin_ia32_compressstorehi256_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty,
- llvm_i16_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_b_128 :
- GCCBuiltin<"__builtin_ia32_compressstoreqi128_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v16i8_ty,
- llvm_i16_ty], [IntrArgMemOnly]>;
- def int_x86_avx512_mask_compress_store_w_128 :
- GCCBuiltin<"__builtin_ia32_compressstorehi128_mask">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty,
- llvm_i8_ty], [IntrArgMemOnly]>;
-
// expand
def int_x86_avx512_mask_expand_ps_512 :
GCCBuiltin<"__builtin_ia32_expandsf512_mask">,
@@ -5450,31 +4222,6 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_expand_load_ps_512 :
- GCCBuiltin<"__builtin_ia32_expandloadsf512_mask">,
- Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty,
- llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_pd_512 :
- GCCBuiltin<"__builtin_ia32_expandloaddf512_mask">,
- Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_ps_256 :
- GCCBuiltin<"__builtin_ia32_expandloadsf256_mask">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_pd_256 :
- GCCBuiltin<"__builtin_ia32_expandloaddf256_mask">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_ps_128 :
- GCCBuiltin<"__builtin_ia32_expandloadsf128_mask">,
- Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_pd_128 :
- GCCBuiltin<"__builtin_ia32_expandloaddf128_mask">,
- Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
-
def int_x86_avx512_mask_expand_d_512 :
GCCBuiltin<"__builtin_ia32_expandsi512_mask">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
@@ -5500,31 +4247,6 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_expand_load_d_512 :
- GCCBuiltin<"__builtin_ia32_expandloadsi512_mask">,
- Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty,
- llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_q_512 :
- GCCBuiltin<"__builtin_ia32_expandloaddi512_mask">,
- Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_d_256 :
- GCCBuiltin<"__builtin_ia32_expandloadsi256_mask">,
- Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_q_256 :
- GCCBuiltin<"__builtin_ia32_expandloaddi256_mask">,
- Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_d_128 :
- GCCBuiltin<"__builtin_ia32_expandloadsi128_mask">,
- Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_q_128 :
- GCCBuiltin<"__builtin_ia32_expandloaddi128_mask">,
- Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
-
def int_x86_avx512_mask_expand_b_512 :
GCCBuiltin<"__builtin_ia32_expandqi512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
@@ -5549,130 +4271,87 @@ let TargetPrefix = "x86" in {
GCCBuiltin<"__builtin_ia32_expandhi128_mask">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_expand_load_b_512 :
- GCCBuiltin<"__builtin_ia32_expandloadqi512_mask">,
- Intrinsic<[llvm_v64i8_ty], [llvm_ptr_ty, llvm_v64i8_ty,
- llvm_i64_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_w_512 :
- GCCBuiltin<"__builtin_ia32_expandloadhi512_mask">,
- Intrinsic<[llvm_v32i16_ty], [llvm_ptr_ty, llvm_v32i16_ty,
- llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_b_256 :
- GCCBuiltin<"__builtin_ia32_expandloadqi256_mask">,
- Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty, llvm_v32i8_ty,
- llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_w_256 :
- GCCBuiltin<"__builtin_ia32_expandloadhi256_mask">,
- Intrinsic<[llvm_v16i16_ty], [llvm_ptr_ty, llvm_v16i16_ty,
- llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_b_128 :
- GCCBuiltin<"__builtin_ia32_expandloadqi128_mask">,
- Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_v16i8_ty,
- llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
- def int_x86_avx512_mask_expand_load_w_128 :
- GCCBuiltin<"__builtin_ia32_expandloadhi128_mask">,
- Intrinsic<[llvm_v8i16_ty], [llvm_ptr_ty, llvm_v8i16_ty,
- llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
}
// VBMI2 Concat & Shift
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx512_mask_vpshld_q_512 :
- GCCBuiltin<"__builtin_ia32_vpshldq512_mask">,
+ def int_x86_avx512_vpshld_q_512 :
+ GCCBuiltin<"__builtin_ia32_vpshldq512">,
Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_q_256 :
- GCCBuiltin<"__builtin_ia32_vpshldq256_mask">,
+ [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshld_q_256 :
+ GCCBuiltin<"__builtin_ia32_vpshldq256">,
Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_q_128 :
- GCCBuiltin<"__builtin_ia32_vpshldq128_mask">,
+ [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshld_q_128 :
+ GCCBuiltin<"__builtin_ia32_vpshldq128">,
Intrinsic<[llvm_v2i64_ty],
- [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_d_512 :
- GCCBuiltin<"__builtin_ia32_vpshldd512_mask">,
+ def int_x86_avx512_vpshld_d_512 :
+ GCCBuiltin<"__builtin_ia32_vpshldd512">,
Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
- llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_d_256 :
- GCCBuiltin<"__builtin_ia32_vpshldd256_mask">,
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshld_d_256 :
+ GCCBuiltin<"__builtin_ia32_vpshldd256">,
Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_d_128 :
- GCCBuiltin<"__builtin_ia32_vpshldd128_mask">,
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshld_d_128 :
+ GCCBuiltin<"__builtin_ia32_vpshldd128">,
Intrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_w_512 :
- GCCBuiltin<"__builtin_ia32_vpshldw512_mask">,
+ def int_x86_avx512_vpshld_w_512 :
+ GCCBuiltin<"__builtin_ia32_vpshldw512">,
Intrinsic<[llvm_v32i16_ty],
- [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_w_256 :
- GCCBuiltin<"__builtin_ia32_vpshldw256_mask">,
+ [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshld_w_256 :
+ GCCBuiltin<"__builtin_ia32_vpshldw256">,
Intrinsic<[llvm_v16i16_ty],
- [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
- llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshld_w_128 :
- GCCBuiltin<"__builtin_ia32_vpshldw128_mask">,
+ [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshld_w_128 :
+ GCCBuiltin<"__builtin_ia32_vpshldw128">,
Intrinsic<[llvm_v8i16_ty],
- [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_q_512 :
- GCCBuiltin<"__builtin_ia32_vpshrdq512_mask">,
+ def int_x86_avx512_vpshrd_q_512 :
+ GCCBuiltin<"__builtin_ia32_vpshrdq512">,
Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_q_256 :
- GCCBuiltin<"__builtin_ia32_vpshrdq256_mask">,
+ [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshrd_q_256 :
+ GCCBuiltin<"__builtin_ia32_vpshrdq256">,
Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_q_128 :
- GCCBuiltin<"__builtin_ia32_vpshrdq128_mask">,
+ [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshrd_q_128 :
+ GCCBuiltin<"__builtin_ia32_vpshrdq128">,
Intrinsic<[llvm_v2i64_ty],
- [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_d_512 :
- GCCBuiltin<"__builtin_ia32_vpshrdd512_mask">,
+ def int_x86_avx512_vpshrd_d_512 :
+ GCCBuiltin<"__builtin_ia32_vpshrdd512">,
Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
- llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_d_256 :
- GCCBuiltin<"__builtin_ia32_vpshrdd256_mask">,
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshrd_d_256 :
+ GCCBuiltin<"__builtin_ia32_vpshrdd256">,
Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_d_128 :
- GCCBuiltin<"__builtin_ia32_vpshrdd128_mask">,
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshrd_d_128 :
+ GCCBuiltin<"__builtin_ia32_vpshrdd128">,
Intrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_w_512 :
- GCCBuiltin<"__builtin_ia32_vpshrdw512_mask">,
+ def int_x86_avx512_vpshrd_w_512 :
+ GCCBuiltin<"__builtin_ia32_vpshrdw512">,
Intrinsic<[llvm_v32i16_ty],
- [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
- llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_w_256 :
- GCCBuiltin<"__builtin_ia32_vpshrdw256_mask">,
+ [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshrd_w_256 :
+ GCCBuiltin<"__builtin_ia32_vpshrdw256">,
Intrinsic<[llvm_v16i16_ty],
- [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
- llvm_i16_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpshrd_w_128 :
- GCCBuiltin<"__builtin_ia32_vpshrdw128_mask">,
+ [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpshrd_w_128 :
+ GCCBuiltin<"__builtin_ia32_vpshrdw128">,
Intrinsic<[llvm_v8i16_ty],
- [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vpshldv_w_128 :
GCCBuiltin<"__builtin_ia32_vpshldvw128_mask">,
@@ -5978,7 +4657,6 @@ let TargetPrefix = "x86" in {
[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_pmov_qw_512 :
- GCCBuiltin<"__builtin_ia32_pmovqw512_mask">,
Intrinsic<[llvm_v8i16_ty],
[llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty],
[IntrNoMem]>;
@@ -6037,8 +4715,7 @@ let TargetPrefix = "x86" in {
Intrinsic<[],
[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
[IntrArgMemOnly]>;
- def int_x86_avx512_mask_pmov_qd_256 :
- GCCBuiltin<"__builtin_ia32_pmovqd256_mask">,
+ def int_x86_avx512_mask_pmov_qd_256 : // FIXME: Replace with trunc+select.
Intrinsic<[llvm_v4i32_ty],
[llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
[IntrNoMem]>;
@@ -6067,8 +4744,7 @@ let TargetPrefix = "x86" in {
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
[IntrArgMemOnly]>;
- def int_x86_avx512_mask_pmov_qd_512 :
- GCCBuiltin<"__builtin_ia32_pmovqd512_mask">,
+ def int_x86_avx512_mask_pmov_qd_512 : // FIXME: Replace with trunc+select.
Intrinsic<[llvm_v8i32_ty],
[llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty],
[IntrNoMem]>;
@@ -6158,7 +4834,6 @@ let TargetPrefix = "x86" in {
[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_pmov_db_512 :
- GCCBuiltin<"__builtin_ia32_pmovdb512_mask">,
Intrinsic<[llvm_v16i8_ty],
[llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty],
[IntrNoMem]>;
@@ -6248,7 +4923,6 @@ let TargetPrefix = "x86" in {
[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_pmov_dw_512 :
- GCCBuiltin<"__builtin_ia32_pmovdw512_mask">,
Intrinsic<[llvm_v16i16_ty],
[llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty],
[IntrNoMem]>;
@@ -6307,8 +4981,7 @@ let TargetPrefix = "x86" in {
Intrinsic<[],
[llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty],
[IntrArgMemOnly]>;
- def int_x86_avx512_mask_pmov_wb_256 :
- GCCBuiltin<"__builtin_ia32_pmovwb256_mask">,
+ def int_x86_avx512_mask_pmov_wb_256 : // FIXME: Replace with trunc+select.
Intrinsic<[llvm_v16i8_ty],
[llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty],
[IntrNoMem]>;
@@ -6337,8 +5010,7 @@ let TargetPrefix = "x86" in {
Intrinsic<[],
[llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty],
[IntrArgMemOnly]>;
- def int_x86_avx512_mask_pmov_wb_512 :
- GCCBuiltin<"__builtin_ia32_pmovwb512_mask">,
+ def int_x86_avx512_mask_pmov_wb_512 : // FIXME: Replace with trunc+select.
Intrinsic<[llvm_v32i8_ty],
[llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty],
[IntrNoMem]>;
@@ -6371,105 +5043,66 @@ let TargetPrefix = "x86" in {
// Bitwise ternary logic
let TargetPrefix = "x86" in {
- def int_x86_avx512_mask_pternlog_d_128 :
- GCCBuiltin<"__builtin_ia32_pternlogd128_mask">,
- Intrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_maskz_pternlog_d_128 :
- GCCBuiltin<"__builtin_ia32_pternlogd128_maskz">,
+ def int_x86_avx512_pternlog_d_128 :
+ GCCBuiltin<"__builtin_ia32_pternlogd128">,
Intrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_pternlog_d_256 :
- GCCBuiltin<"__builtin_ia32_pternlogd256_mask">,
- Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_maskz_pternlog_d_256 :
- GCCBuiltin<"__builtin_ia32_pternlogd256_maskz">,
+ def int_x86_avx512_pternlog_d_256 :
+ GCCBuiltin<"__builtin_ia32_pternlogd256">,
Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_pternlog_d_512 :
- GCCBuiltin<"__builtin_ia32_pternlogd512_mask">,
- Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
- llvm_i16_ty], [IntrNoMem]>;
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_maskz_pternlog_d_512 :
- GCCBuiltin<"__builtin_ia32_pternlogd512_maskz">,
+ def int_x86_avx512_pternlog_d_512 :
+ GCCBuiltin<"__builtin_ia32_pternlogd512">,
Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
- llvm_i16_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_pternlog_q_128 :
- GCCBuiltin<"__builtin_ia32_pternlogq128_mask">,
- Intrinsic<[llvm_v2i64_ty],
- [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty,
+ llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_pternlog_q_128 :
- GCCBuiltin<"__builtin_ia32_pternlogq128_maskz">,
+ def int_x86_avx512_pternlog_q_128 :
+ GCCBuiltin<"__builtin_ia32_pternlogq128">,
Intrinsic<[llvm_v2i64_ty],
- [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_pternlog_q_256 :
- GCCBuiltin<"__builtin_ia32_pternlogq256_mask">,
- Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_maskz_pternlog_q_256 :
- GCCBuiltin<"__builtin_ia32_pternlogq256_maskz">,
+ def int_x86_avx512_pternlog_q_256 :
+ GCCBuiltin<"__builtin_ia32_pternlogq256">,
Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_pternlog_q_512 :
- GCCBuiltin<"__builtin_ia32_pternlogq512_mask">,
- Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
+ [IntrNoMem]>;
- def int_x86_avx512_maskz_pternlog_q_512 :
- GCCBuiltin<"__builtin_ia32_pternlogq512_maskz">,
+ def int_x86_avx512_pternlog_q_512 :
+ GCCBuiltin<"__builtin_ia32_pternlogq512">,
Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
- llvm_i8_ty], [IntrNoMem]>;
+ [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty],
+ [IntrNoMem]>;
}
// Misc.
let TargetPrefix = "x86" in {
- def int_x86_avx512_mask_cmp_ps_512 :
- GCCBuiltin<"__builtin_ia32_cmpps512_mask">,
- Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
- llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_cmp_pd_512 :
- GCCBuiltin<"__builtin_ia32_cmppd512_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
- llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_cmp_ps_256 :
- GCCBuiltin<"__builtin_ia32_cmpps256_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
- llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_cmp_pd_256 :
- GCCBuiltin<"__builtin_ia32_cmppd256_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
- llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_cmp_ps_128 :
- GCCBuiltin<"__builtin_ia32_cmpps128_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_cmp_pd_128 :
- GCCBuiltin<"__builtin_ia32_cmppd128_mask">,
- Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ // NOTE: These comparison intrinsics are not used by clang as long as the
+ // distinction in signaling behaviour is not implemented.
+ def int_x86_avx512_cmp_ps_512 :
+ Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_cmp_pd_512 :
+ Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_cmp_ps_256 :
+ Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_cmp_pd_256 :
+ Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_cmp_ps_128 :
+ Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_cmp_pd_128 :
+ Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
def int_x86_avx512_mask_cmp_ss :
GCCBuiltin<"__builtin_ia32_cmpss_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
@@ -6518,3 +5151,65 @@ let TargetPrefix = "x86" in {
def int_x86_clzero : GCCBuiltin<"__builtin_ia32_clzero">,
Intrinsic<[], [llvm_ptr_ty], []>;
}
+
+//===----------------------------------------------------------------------===//
+// Cache write back intrinsics
+
+let TargetPrefix = "x86" in {
+ // Write back and invalidate
+ def int_x86_wbinvd : GCCBuiltin<"__builtin_ia32_wbinvd">,
+ Intrinsic<[], [], []>;
+
+ // Write back no-invalidate
+ def int_x86_wbnoinvd : GCCBuiltin<"__builtin_ia32_wbnoinvd">,
+ Intrinsic<[], [], []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Cache-line demote
+
+let TargetPrefix = "x86" in {
+ def int_x86_cldemote : GCCBuiltin<"__builtin_ia32_cldemote">,
+ Intrinsic<[], [llvm_ptr_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Wait and pause enhancements
+let TargetPrefix = "x86" in {
+ def int_x86_umonitor : GCCBuiltin<"__builtin_ia32_umonitor">,
+ Intrinsic<[], [llvm_ptr_ty], []>;
+ def int_x86_umwait : GCCBuiltin<"__builtin_ia32_umwait">,
+ Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+ def int_x86_tpause : GCCBuiltin<"__builtin_ia32_tpause">,
+ Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Direct Move Instructions
+
+let TargetPrefix = "x86" in {
+ def int_x86_directstore32 : GCCBuiltin<"__builtin_ia32_directstore_u32">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], []>;
+ def int_x86_directstore64 : GCCBuiltin<"__builtin_ia32_directstore_u64">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
+ def int_x86_movdir64b : GCCBuiltin<"__builtin_ia32_movdir64b">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
+// PTWrite - Write data to processor trace packet
+
+let TargetPrefix = "x86" in {
+ def int_x86_ptwrite32 : GCCBuiltin<"__builtin_ia32_ptwrite32">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+ def int_x86_ptwrite64 : GCCBuiltin<"__builtin_ia32_ptwrite64">,
+ Intrinsic<[], [llvm_i64_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID - Invalidate Process-Context Identifier
+
+let TargetPrefix = "x86" in {
+ def int_x86_invpcid : GCCBuiltin<"__builtin_ia32_invpcid">,
+ Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>;
+}
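
The GCCBuiltin<> names above show how these new instruction intrinsics are expected to surface in C and C++ once the matching clang builtins exist. A minimal, hedged sketch of calling a few of them directly; the required target feature flags and any <immintrin.h> wrappers are assumptions and not part of this diff:

    // Hedged sketch: raw builtin calls matching the GCCBuiltin<> strings above.
    // Compiling this assumes a clang that already exposes these builtins and a
    // target with the corresponding features enabled.
    void demo(unsigned *Slot, unsigned Value) {
      __builtin_ia32_directstore_u32(Slot, Value); // MOVDIRI direct store
      __builtin_ia32_cldemote(Slot);               // demote the cache line
      __builtin_ia32_ptwrite32(Value);             // emit a PTWRITE packet
      __builtin_ia32_wbnoinvd();                   // write back without invalidating
    }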
diff --git a/contrib/llvm/include/llvm/IR/LLVMContext.h b/contrib/llvm/include/llvm/IR/LLVMContext.h
index a95634d32c21..ebd445553167 100644
--- a/contrib/llvm/include/llvm/IR/LLVMContext.h
+++ b/contrib/llvm/include/llvm/IR/LLVMContext.h
@@ -31,7 +31,7 @@ class Function;
class Instruction;
class LLVMContextImpl;
class Module;
-class OptBisect;
+class OptPassGate;
template <typename T> class SmallVectorImpl;
class SMDiagnostic;
class StringRef;
@@ -76,7 +76,7 @@ public:
// Pinned metadata names, which always have the same value. This is a
// compile-time performance optimization, not a correctness optimization.
- enum {
+ enum : unsigned {
MD_dbg = 0, // "dbg"
MD_tbaa = 1, // "tbaa"
MD_prof = 2, // "prof"
@@ -108,7 +108,7 @@ public:
/// operand bundle tags that LLVM has special knowledge of are listed here.
/// Additionally, this scheme allows LLVM to efficiently check for specific
/// operand bundle tags without comparing strings.
- enum {
+ enum : unsigned {
OB_deopt = 0, // "deopt"
OB_funclet = 1, // "funclet"
OB_gc_transition = 2, // "gc-transition"
@@ -229,23 +229,23 @@ public:
/// to caller.
std::unique_ptr<DiagnosticHandler> getDiagnosticHandler();
- /// \brief Return if a code hotness metric should be included in optimization
+ /// Return if a code hotness metric should be included in optimization
/// diagnostics.
bool getDiagnosticsHotnessRequested() const;
- /// \brief Set if a code hotness metric should be included in optimization
+ /// Set if a code hotness metric should be included in optimization
/// diagnostics.
void setDiagnosticsHotnessRequested(bool Requested);
- /// \brief Return the minimum hotness value a diagnostic would need in order
+ /// Return the minimum hotness value a diagnostic would need in order
/// to be included in optimization diagnostics. If there is no minimum, this
/// returns None.
uint64_t getDiagnosticsHotnessThreshold() const;
- /// \brief Set the minimum hotness value a diagnostic needs in order to be
+ /// Set the minimum hotness value a diagnostic needs in order to be
/// included in optimization diagnostics.
void setDiagnosticsHotnessThreshold(uint64_t Threshold);
- /// \brief Return the YAML file used by the backend to save optimization
+ /// Return the YAML file used by the backend to save optimization
/// diagnostics. If null, diagnostics are not saved in a file but only
/// emitted via the diagnostic handler.
yaml::Output *getDiagnosticsOutputFile();
@@ -256,11 +256,11 @@ public:
/// set, the handler is invoked for each diagnostic message.
void setDiagnosticsOutputFile(std::unique_ptr<yaml::Output> F);
- /// \brief Get the prefix that should be printed in front of a diagnostic of
+ /// Get the prefix that should be printed in front of a diagnostic of
/// the given \p Severity
static const char *getDiagnosticMessagePrefix(DiagnosticSeverity Severity);
- /// \brief Report a message to the currently installed diagnostic handler.
+ /// Report a message to the currently installed diagnostic handler.
///
/// This function returns, in particular in the case of error reporting
/// (DI.Severity == \a DS_Error), so the caller should leave the compilation
@@ -272,7 +272,7 @@ public:
/// "warning: " for \a DS_Warning, and "note: " for \a DS_Note.
void diagnose(const DiagnosticInfo &DI);
- /// \brief Registers a yield callback with the given context.
+ /// Registers a yield callback with the given context.
///
/// The yield callback function may be called by LLVM to transfer control back
/// to the client that invoked the LLVM compilation. This can be used to yield
@@ -291,7 +291,7 @@ public:
/// control to LLVM. Other LLVM contexts are unaffected by this restriction.
void setYieldCallback(YieldCallbackTy Callback, void *OpaqueHandle);
- /// \brief Calls the yield callback (if applicable).
+ /// Calls the yield callback (if applicable).
///
/// This transfers control of the current thread back to the client, which may
/// suspend the current thread. Only call this method when LLVM doesn't hold
@@ -307,7 +307,7 @@ public:
void emitError(const Instruction *I, const Twine &ErrorStr);
void emitError(const Twine &ErrorStr);
- /// \brief Query for a debug option's value.
+ /// Query for a debug option's value.
///
/// This function returns typed data populated from command line parsing.
template <typename ValT, typename Base, ValT(Base::*Mem)>
@@ -315,9 +315,17 @@ public:
return OptionRegistry::instance().template get<ValT, Base, Mem>();
}
- /// \brief Access the object which manages optimization bisection for failure
- /// analysis.
- OptBisect &getOptBisect();
+ /// Access the object which can disable optional passes and individual
+ /// optimizations at compile time.
+ OptPassGate &getOptPassGate() const;
+
+ /// Set the object which can disable optional passes and individual
+ /// optimizations at compile time.
+ ///
+ /// The lifetime of the object must be guaranteed to extend as long as the
+ /// LLVMContext is used by compilation.
+ void setOptPassGate(OptPassGate&);
+
private:
// Module needs access to the add/removeModule methods.
friend class Module;
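
The getOptPassGate/setOptPassGate pair added above replaces the hard-wired OptBisect accessor with a user-installable gate. A rough sketch of installing one, assuming OptPassGate lives in llvm/IR/OptBisect.h and its virtual callbacks have default implementations, so an empty subclass is enough for illustration:

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/OptBisect.h"   // assumed header that declares OptPassGate
    using namespace llvm;

    struct NullPassGate : OptPassGate {}; // hypothetical no-op gate

    void installGate() {
      static NullPassGate Gate; // must stay alive as long as the context is used
      LLVMContext Ctx;
      Ctx.setOptPassGate(Gate);
      OptPassGate &Active = Ctx.getOptPassGate(); // passes consult this gate
      (void)Active;
    }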
diff --git a/contrib/llvm/include/llvm/IR/LegacyPassManagers.h b/contrib/llvm/include/llvm/IR/LegacyPassManagers.h
index 3dc4a776dba0..90036c6ce248 100644
--- a/contrib/llvm/include/llvm/IR/LegacyPassManagers.h
+++ b/contrib/llvm/include/llvm/IR/LegacyPassManagers.h
@@ -403,6 +403,15 @@ public:
InheritedAnalysis[Index++] = (*I)->getAvailableAnalysis();
}
+ /// Set the initial size of the module if the user has specified that they
+ /// want remarks for size.
+ /// Returns 0 if the remark was not requested.
+ unsigned initSizeRemarkInfo(Module &M);
+
+ /// Emit a remark signifying that the number of IR instructions in the module
+ /// changed.
+ void emitInstrCountChangedRemark(Pass *P, Module &M, unsigned CountBefore);
+
protected:
// Top level manager.
PMTopLevelManager *TPM;
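
The two members added above are intended to bracket a pass run: record the module's instruction count first, then emit a size-info remark with the delta once the pass has changed something. A hedged sketch of that call pattern; the dispatch helper is a stand-in, not an API from this diff:

    #include "llvm/IR/LegacyPassManagers.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Pass.h"
    using namespace llvm;

    // Hedged sketch: PM is any PMDataManager, P the pass about to run on M.
    static bool runWithSizeRemark(PMDataManager &PM, Pass *P, Module &M,
                                  bool (*RunPassOnModule)(Pass *, Module &)) {
      unsigned CountBefore = PM.initSizeRemarkInfo(M); // 0 => remarks not requested
      bool Changed = RunPassOnModule(P, M);
      if (CountBefore != 0 && Changed)
        PM.emitInstrCountChangedRemark(P, M, CountBefore);
      return Changed;
    }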
diff --git a/contrib/llvm/include/llvm/IR/MDBuilder.h b/contrib/llvm/include/llvm/IR/MDBuilder.h
index dff1ca12407f..174616c7ab1d 100644
--- a/contrib/llvm/include/llvm/IR/MDBuilder.h
+++ b/contrib/llvm/include/llvm/IR/MDBuilder.h
@@ -38,17 +38,17 @@ class MDBuilder {
public:
MDBuilder(LLVMContext &context) : Context(context) {}
- /// \brief Return the given string as metadata.
+ /// Return the given string as metadata.
MDString *createString(StringRef Str);
- /// \brief Return the given constant as metadata.
+ /// Return the given constant as metadata.
ConstantAsMetadata *createConstant(Constant *C);
//===------------------------------------------------------------------===//
// FPMath metadata.
//===------------------------------------------------------------------===//
- /// \brief Return metadata with the given settings. The special value 0.0
+ /// Return metadata with the given settings. The special value 0.0
/// for the Accuracy parameter indicates the default (maximal precision)
/// setting.
MDNode *createFPMath(float Accuracy);
@@ -57,19 +57,20 @@ public:
// Prof metadata.
//===------------------------------------------------------------------===//
- /// \brief Return metadata containing two branch weights.
+ /// Return metadata containing two branch weights.
MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight);
- /// \brief Return metadata containing a number of branch weights.
+ /// Return metadata containing a number of branch weights.
MDNode *createBranchWeights(ArrayRef<uint32_t> Weights);
/// Return metadata specifying that a branch or switch is unpredictable.
MDNode *createUnpredictable();
- /// Return metadata containing the entry \p Count for a function, and the
+ /// Return metadata containing the entry \p Count for a function, a boolean
+ /// \p Synthetic indicating whether the counts were synthesized, and the
/// GUIDs stored in \p Imports that need to be imported for sample PGO, to
/// enable the same inlines as the profiled optimized binary
- MDNode *createFunctionEntryCount(uint64_t Count,
+ MDNode *createFunctionEntryCount(uint64_t Count, bool Synthetic,
const DenseSet<GlobalValue::GUID> *Imports);
/// Return metadata containing the section prefix for a function.
@@ -79,17 +80,17 @@ public:
// Range metadata.
//===------------------------------------------------------------------===//
- /// \brief Return metadata describing the range [Lo, Hi).
+ /// Return metadata describing the range [Lo, Hi).
MDNode *createRange(const APInt &Lo, const APInt &Hi);
- /// \brief Return metadata describing the range [Lo, Hi).
+ /// Return metadata describing the range [Lo, Hi).
MDNode *createRange(Constant *Lo, Constant *Hi);
//===------------------------------------------------------------------===//
// Callees metadata.
//===------------------------------------------------------------------===//
- /// \brief Return metadata indicating the possible callees of indirect
+ /// Return metadata indicating the possible callees of indirect
/// calls.
MDNode *createCallees(ArrayRef<Function *> Callees);
@@ -98,28 +99,28 @@ public:
//===------------------------------------------------------------------===//
protected:
- /// \brief Return metadata appropriate for a AA root node (scope or TBAA).
+ /// Return metadata appropriate for an AA root node (scope or TBAA).
/// Each returned node is distinct from all other metadata and will never
/// be identified (uniqued) with anything else.
MDNode *createAnonymousAARoot(StringRef Name = StringRef(),
MDNode *Extra = nullptr);
public:
- /// \brief Return metadata appropriate for a TBAA root node. Each returned
+ /// Return metadata appropriate for a TBAA root node. Each returned
/// node is distinct from all other metadata and will never be identified
/// (uniqued) with anything else.
MDNode *createAnonymousTBAARoot() {
return createAnonymousAARoot();
}
- /// \brief Return metadata appropriate for an alias scope domain node.
+ /// Return metadata appropriate for an alias scope domain node.
/// Each returned node is distinct from all other metadata and will never
/// be identified (uniqued) with anything else.
MDNode *createAnonymousAliasScopeDomain(StringRef Name = StringRef()) {
return createAnonymousAARoot(Name);
}
- /// \brief Return metadata appropriate for an alias scope root node.
+ /// Return metadata appropriate for an alias scope root node.
/// Each returned node is distinct from all other metadata and will never
/// be identified (uniqued) with anything else.
MDNode *createAnonymousAliasScope(MDNode *Domain,
@@ -127,22 +128,22 @@ public:
return createAnonymousAARoot(Name, Domain);
}
- /// \brief Return metadata appropriate for a TBAA root node with the given
+ /// Return metadata appropriate for a TBAA root node with the given
/// name. This may be identified (uniqued) with other roots with the same
/// name.
MDNode *createTBAARoot(StringRef Name);
- /// \brief Return metadata appropriate for an alias scope domain node with
+ /// Return metadata appropriate for an alias scope domain node with
/// the given name. This may be identified (uniqued) with other roots with
/// the same name.
MDNode *createAliasScopeDomain(StringRef Name);
- /// \brief Return metadata appropriate for an alias scope node with
+ /// Return metadata appropriate for an alias scope node with
/// the given name. This may be identified (uniqued) with other scopes with
/// the same name and domain.
MDNode *createAliasScope(StringRef Name, MDNode *Domain);
- /// \brief Return metadata for a non-root TBAA node with the given name,
+ /// Return metadata for a non-root TBAA node with the given name,
/// parent in the TBAA tree, and value for 'pointsToConstantMemory'.
MDNode *createTBAANode(StringRef Name, MDNode *Parent,
bool isConstant = false);
@@ -155,33 +156,33 @@ public:
Offset(Offset), Size(Size), Type(Type) {}
};
- /// \brief Return metadata for a tbaa.struct node with the given
+ /// Return metadata for a tbaa.struct node with the given
/// struct field descriptions.
MDNode *createTBAAStructNode(ArrayRef<TBAAStructField> Fields);
- /// \brief Return metadata for a TBAA struct node in the type DAG
+ /// Return metadata for a TBAA struct node in the type DAG
/// with the given name, a list of pairs (offset, field type in the type DAG).
MDNode *
createTBAAStructTypeNode(StringRef Name,
ArrayRef<std::pair<MDNode *, uint64_t>> Fields);
- /// \brief Return metadata for a TBAA scalar type node with the
+ /// Return metadata for a TBAA scalar type node with the
/// given name, an offset and a parent in the TBAA type DAG.
MDNode *createTBAAScalarTypeNode(StringRef Name, MDNode *Parent,
uint64_t Offset = 0);
- /// \brief Return metadata for a TBAA tag node with the given
+ /// Return metadata for a TBAA tag node with the given
/// base type, access type and offset relative to the base type.
MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
uint64_t Offset, bool IsConstant = false);
- /// \brief Return metadata for a TBAA type node in the TBAA type DAG with the
+ /// Return metadata for a TBAA type node in the TBAA type DAG with the
/// given parent type, size in bytes, type identifier and a list of fields.
MDNode *createTBAATypeNode(MDNode *Parent, uint64_t Size, Metadata *Id,
ArrayRef<TBAAStructField> Fields =
ArrayRef<TBAAStructField>());
- /// \brief Return metadata for a TBAA access tag with the given base type,
+ /// Return metadata for a TBAA access tag with the given base type,
/// final access type, offset of the access relative to the base type, size of
/// the access and flag indicating whether the accessed object can be
/// considered immutable for the purposes of the TBAA analysis.
@@ -189,7 +190,11 @@ public:
uint64_t Offset, uint64_t Size,
bool IsImmutable = false);
- /// \brief Return metadata containing an irreducible loop header weight.
+ /// Return mutable version of the given mutable or immutable TBAA
+ /// access tag.
+ MDNode *createMutableTBAAAccessTag(MDNode *Tag);
+
+ /// Return metadata containing an irreducible loop header weight.
MDNode *createIrrLoopHeaderWeight(uint64_t Weight);
};
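
To show the effect of the createFunctionEntryCount signature change (the new Synthetic flag), a short hedged sketch; attaching the node through MD_prof mirrors what Function::setEntryCount does internally, but this call site is purely illustrative:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // Hedged sketch: annotate F with a (possibly synthesized) entry count.
    void attachEntryCount(Function &F, uint64_t Count, bool Synthetic) {
      MDBuilder MDB(F.getContext());
      MDNode *EntryCount =
          MDB.createFunctionEntryCount(Count, Synthetic, /*Imports=*/nullptr);
      F.setMetadata(LLVMContext::MD_prof, EntryCount);
    }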
diff --git a/contrib/llvm/include/llvm/IR/Mangler.h b/contrib/llvm/include/llvm/IR/Mangler.h
index 56ee21392ccd..0261c00f524c 100644
--- a/contrib/llvm/include/llvm/IR/Mangler.h
+++ b/contrib/llvm/include/llvm/IR/Mangler.h
@@ -50,6 +50,9 @@ public:
void emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV,
const Triple &TT, Mangler &Mangler);
+void emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV,
+ const Triple &T, Mangler &M);
+
} // End llvm namespace
#endif
diff --git a/contrib/llvm/include/llvm/IR/Metadata.def b/contrib/llvm/include/llvm/IR/Metadata.def
index 03cdcab7dc47..70a03f28b488 100644
--- a/contrib/llvm/include/llvm/IR/Metadata.def
+++ b/contrib/llvm/include/llvm/IR/Metadata.def
@@ -108,6 +108,7 @@ HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DITemplateValueParameter)
HANDLE_SPECIALIZED_MDNODE_BRANCH(DIVariable)
HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIGlobalVariable)
HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DILocalVariable)
+HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DILabel)
HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIObjCProperty)
HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIImportedEntity)
HANDLE_SPECIALIZED_MDNODE_BRANCH(DIMacroNode)
diff --git a/contrib/llvm/include/llvm/IR/Metadata.h b/contrib/llvm/include/llvm/IR/Metadata.h
index bc0b87a6c348..9ac97f4224ac 100644
--- a/contrib/llvm/include/llvm/IR/Metadata.h
+++ b/contrib/llvm/include/llvm/IR/Metadata.h
@@ -52,20 +52,20 @@ enum LLVMConstants : uint32_t {
DEBUG_METADATA_VERSION = 3 // Current debug info version number.
};
-/// \brief Root of the metadata hierarchy.
+/// Root of the metadata hierarchy.
///
/// This is a root class for typeless data in the IR.
class Metadata {
friend class ReplaceableMetadataImpl;
- /// \brief RTTI.
+ /// RTTI.
const unsigned char SubclassID;
protected:
- /// \brief Active type of storage.
+ /// Active type of storage.
enum StorageType { Uniqued, Distinct, Temporary };
- /// \brief Storage flag for non-uniqued, otherwise unowned, metadata.
+ /// Storage flag for non-uniqued, otherwise unowned, metadata.
unsigned char Storage;
// TODO: expose remaining bits to subclasses.
@@ -86,7 +86,7 @@ protected:
~Metadata() = default;
- /// \brief Default handling of a changed operand, which asserts.
+ /// Default handling of a changed operand, which asserts.
///
/// If subclasses pass themselves in as owners to a tracking node reference,
/// they must provide an implementation of this method.
@@ -97,7 +97,7 @@ protected:
public:
unsigned getMetadataID() const { return SubclassID; }
- /// \brief User-friendly dump.
+ /// User-friendly dump.
///
/// If \c M is provided, metadata nodes will be numbered canonically;
/// otherwise, pointer addresses are substituted.
@@ -110,7 +110,7 @@ public:
void dump(const Module *M) const;
/// @}
- /// \brief Print.
+ /// Print.
///
/// Prints definition of \c this.
///
@@ -123,7 +123,7 @@ public:
bool IsForDebug = false) const;
/// @}
- /// \brief Print as operand.
+ /// Print as operand.
///
/// Prints reference of \c this.
///
@@ -162,7 +162,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Metadata &MD) {
return OS;
}
-/// \brief Metadata wrapper in the Value hierarchy.
+/// Metadata wrapper in the Value hierarchy.
///
/// A member of the \a Value hierarchy to represent a reference to metadata.
/// This allows, e.g., intrinsics to have metadata as operands.
@@ -177,7 +177,7 @@ class MetadataAsValue : public Value {
MetadataAsValue(Type *Ty, Metadata *MD);
- /// \brief Drop use of metadata (during teardown).
+ /// Drop use of metadata (during teardown).
void dropUse() { MD = nullptr; }
public:
@@ -198,7 +198,7 @@ private:
void untrack();
};
-/// \brief API for tracking metadata references through RAUW and deletion.
+/// API for tracking metadata references through RAUW and deletion.
///
/// Shared API for updating \a Metadata pointers in subclasses that support
/// RAUW.
@@ -207,7 +207,7 @@ private:
/// user-friendly tracking reference.
class MetadataTracking {
public:
- /// \brief Track the reference to metadata.
+ /// Track the reference to metadata.
///
/// Register \c MD with \c *MD, if the subclass supports tracking. If \c *MD
/// gets RAUW'ed, \c MD will be updated to the new address. If \c *MD gets
@@ -220,7 +220,7 @@ public:
return track(&MD, *MD, static_cast<Metadata *>(nullptr));
}
- /// \brief Track the reference to metadata for \a Metadata.
+ /// Track the reference to metadata for \a Metadata.
///
/// As \a track(Metadata*&), but with support for calling back to \c Owner to
/// tell it that its operand changed. This could trigger \c Owner being
@@ -229,7 +229,7 @@ public:
return track(Ref, MD, &Owner);
}
- /// \brief Track the reference to metadata for \a MetadataAsValue.
+ /// Track the reference to metadata for \a MetadataAsValue.
///
/// As \a track(Metadata*&), but with support for calling back to \c Owner to
/// tell it that its operand changed. This could trigger \c Owner being
@@ -238,13 +238,13 @@ public:
return track(Ref, MD, &Owner);
}
- /// \brief Stop tracking a reference to metadata.
+ /// Stop tracking a reference to metadata.
///
/// Stops \c *MD from tracking \c MD.
static void untrack(Metadata *&MD) { untrack(&MD, *MD); }
static void untrack(void *Ref, Metadata &MD);
- /// \brief Move tracking from one reference to another.
+ /// Move tracking from one reference to another.
///
/// Semantically equivalent to \c untrack(MD) followed by \c track(New),
/// except that ownership callbacks are maintained.
@@ -257,19 +257,19 @@ public:
}
static bool retrack(void *Ref, Metadata &MD, void *New);
- /// \brief Check whether metadata is replaceable.
+ /// Check whether metadata is replaceable.
static bool isReplaceable(const Metadata &MD);
using OwnerTy = PointerUnion<MetadataAsValue *, Metadata *>;
private:
- /// \brief Track a reference to metadata for an owner.
+ /// Track a reference to metadata for an owner.
///
/// Generalized version of tracking.
static bool track(void *Ref, Metadata &MD, OwnerTy Owner);
};
-/// \brief Shared implementation of use-lists for replaceable metadata.
+/// Shared implementation of use-lists for replaceable metadata.
///
/// Most metadata cannot be RAUW'ed. This is a shared implementation of
/// use-lists and associated API for the two that support it (\a ValueAsMetadata
@@ -294,12 +294,12 @@ public:
LLVMContext &getContext() const { return Context; }
- /// \brief Replace all uses of this with MD.
+ /// Replace all uses of this with MD.
///
/// Replace all uses of this with \c MD, which is allowed to be null.
void replaceAllUsesWith(Metadata *MD);
- /// \brief Resolve all uses of this.
+ /// Resolve all uses of this.
///
/// Resolve all uses of this, turning off RAUW permanently. If \c
/// ResolveUsers, call \a MDNode::resolve() on any users whose last operand
@@ -326,7 +326,7 @@ private:
static bool isReplaceable(const Metadata &MD);
};
-/// \brief Value wrapper in the Metadata hierarchy.
+/// Value wrapper in the Metadata hierarchy.
///
/// This is a custom value handle that allows other metadata to refer to
/// classes in the Value hierarchy.
@@ -340,7 +340,7 @@ class ValueAsMetadata : public Metadata, ReplaceableMetadataImpl {
Value *V;
- /// \brief Drop users without RAUW (during teardown).
+ /// Drop users without RAUW (during teardown).
void dropUsers() {
ReplaceableMetadataImpl::resolveAllUses(/* ResolveUsers */ false);
}
@@ -382,7 +382,7 @@ public:
static void handleRAUW(Value *From, Value *To);
protected:
- /// \brief Handle collisions after \a Value::replaceAllUsesWith().
+ /// Handle collisions after \a Value::replaceAllUsesWith().
///
/// RAUW isn't supported directly for \a ValueAsMetadata, but if the wrapped
/// \a Value gets RAUW'ed and the target already exists, this is used to
@@ -444,7 +444,7 @@ public:
}
};
-/// \brief Transitional API for extracting constants from Metadata.
+/// Transitional API for extracting constants from Metadata.
///
/// This namespace contains transitional functions for metadata that points to
/// \a Constants.
@@ -520,7 +520,7 @@ template <class V, class M> struct IsValidReference {
} // end namespace detail
-/// \brief Check whether Metadata has a Value.
+/// Check whether Metadata has a Value.
///
/// As an analogue to \a isa(), check whether \c MD has an \a Value inside of
/// type \c X.
@@ -539,7 +539,7 @@ inline
return hasa(&MD);
}
-/// \brief Extract a Value from Metadata.
+/// Extract a Value from Metadata.
///
/// As an analogue to \a cast(), extract the \a Value subclass \c X from \c MD.
template <class X, class Y>
@@ -554,7 +554,7 @@ inline
return extract(&MD);
}
-/// \brief Extract a Value from Metadata, allowing null.
+/// Extract a Value from Metadata, allowing null.
///
/// As an analogue to \a cast_or_null(), extract the \a Value subclass \c X
/// from \c MD, allowing \c MD to be null.
@@ -566,7 +566,7 @@ extract_or_null(Y &&MD) {
return nullptr;
}
-/// \brief Extract a Value from Metadata, if any.
+/// Extract a Value from Metadata, if any.
///
/// As an analogue to \a dyn_cast_or_null(), extract the \a Value subclass \c X
/// from \c MD, return null if \c MD doesn't contain a \a Value or if the \a
@@ -579,7 +579,7 @@ dyn_extract(Y &&MD) {
return nullptr;
}
-/// \brief Extract a Value from Metadata, if any, allowing null.
+/// Extract a Value from Metadata, if any, allowing null.
///
/// As an analogue to \a dyn_cast_or_null(), extract the \a Value subclass \c X
/// from \c MD, return null if \c MD doesn't contain a \a Value or if the \a
@@ -595,7 +595,7 @@ dyn_extract_or_null(Y &&MD) {
} // end namespace mdconst
//===----------------------------------------------------------------------===//
-/// \brief A single uniqued string.
+/// A single uniqued string.
///
/// These are used to efficiently contain a byte sequence for metadata.
/// MDString is always unnamed.
@@ -622,22 +622,22 @@ public:
using iterator = StringRef::iterator;
- /// \brief Pointer to the first byte of the string.
+ /// Pointer to the first byte of the string.
iterator begin() const { return getString().begin(); }
- /// \brief Pointer to one byte past the end of the string.
+ /// Pointer to one byte past the end of the string.
iterator end() const { return getString().end(); }
const unsigned char *bytes_begin() const { return getString().bytes_begin(); }
const unsigned char *bytes_end() const { return getString().bytes_end(); }
- /// \brief Methods for support type inquiry through isa, cast, and dyn_cast.
+ /// Methods for support type inquiry through isa, cast, and dyn_cast.
static bool classof(const Metadata *MD) {
return MD->getMetadataID() == MDStringKind;
}
};
-/// \brief A collection of metadata nodes that might be associated with a
+/// A collection of metadata nodes that might be associated with a
/// memory access used by the alias-analysis infrastructure.
struct AAMDNodes {
explicit AAMDNodes(MDNode *T = nullptr, MDNode *S = nullptr,
@@ -652,16 +652,16 @@ struct AAMDNodes {
explicit operator bool() const { return TBAA || Scope || NoAlias; }
- /// \brief The tag for type-based alias analysis.
+ /// The tag for type-based alias analysis.
MDNode *TBAA;
- /// \brief The tag for alias scope specification (used with noalias).
+ /// The tag for alias scope specification (used with noalias).
MDNode *Scope;
- /// \brief The tag specifying the noalias scope.
+ /// The tag specifying the noalias scope.
MDNode *NoAlias;
- /// \brief Given two sets of AAMDNodes that apply to the same pointer,
+ /// Given two sets of AAMDNodes that apply to the same pointer,
/// give the best AAMDNodes that are compatible with both (i.e. a set of
/// nodes whose allowable aliasing conclusions are a subset of those
/// allowable by both of the inputs). However, for efficiency
@@ -699,7 +699,7 @@ struct DenseMapInfo<AAMDNodes> {
}
};
-/// \brief Tracking metadata reference owned by Metadata.
+/// Tracking metadata reference owned by Metadata.
///
/// Similar to \a TrackingMDRef, but it's expected to be owned by an instance
/// of \a Metadata, which has the option of registering itself for callbacks to
@@ -761,7 +761,7 @@ template <> struct simplify_type<const MDOperand> {
static SimpleType getSimplifiedValue(const MDOperand &MD) { return MD.get(); }
};
-/// \brief Pointer to the context, with optional RAUW support.
+/// Pointer to the context, with optional RAUW support.
///
/// Either a raw (non-null) pointer to the \a LLVMContext, or an owned pointer
/// to \a ReplaceableMetadataImpl (which has a reference to \a LLVMContext).
@@ -785,7 +785,7 @@ public:
operator LLVMContext &() { return getContext(); }
- /// \brief Whether this contains RAUW support.
+ /// Whether this contains RAUW support.
bool hasReplaceableUses() const {
return Ptr.is<ReplaceableMetadataImpl *>();
}
@@ -809,7 +809,7 @@ public:
return getReplaceableUses();
}
- /// \brief Assign RAUW support to this.
+ /// Assign RAUW support to this.
///
/// Make this replaceable, taking ownership of \c ReplaceableUses (which must
/// not be null).
@@ -822,7 +822,7 @@ public:
Ptr = ReplaceableUses.release();
}
- /// \brief Drop RAUW support.
+ /// Drop RAUW support.
///
/// Cede ownership of RAUW support, returning it.
std::unique_ptr<ReplaceableMetadataImpl> takeReplaceableUses() {
@@ -843,7 +843,7 @@ struct TempMDNodeDeleter {
#define HANDLE_MDNODE_BRANCH(CLASS) HANDLE_MDNODE_LEAF(CLASS)
#include "llvm/IR/Metadata.def"
-/// \brief Metadata node.
+/// Metadata node.
///
/// Metadata nodes can be uniqued, like constants, or distinct. Temporary
/// metadata nodes (with full support for RAUW) can be used to delay uniquing
@@ -876,12 +876,12 @@ protected:
void *operator new(size_t Size, unsigned NumOps);
void operator delete(void *Mem);
- /// \brief Required by std, but never called.
+ /// Required by std, but never called.
void operator delete(void *, unsigned) {
llvm_unreachable("Constructor throws?");
}
- /// \brief Required by std, but never called.
+ /// Required by std, but never called.
void operator delete(void *, unsigned, bool) {
llvm_unreachable("Constructor throws?");
}
@@ -910,10 +910,10 @@ public:
static inline TempMDTuple getTemporary(LLVMContext &Context,
ArrayRef<Metadata *> MDs);
- /// \brief Create a (temporary) clone of this.
+ /// Create a (temporary) clone of this.
TempMDNode clone() const;
- /// \brief Deallocate a node created by getTemporary.
+ /// Deallocate a node created by getTemporary.
///
/// Calls \c replaceAllUsesWith(nullptr) before deleting, so any remaining
/// references will be reset.
@@ -921,10 +921,10 @@ public:
LLVMContext &getContext() const { return Context.getContext(); }
- /// \brief Replace a specific operand.
+ /// Replace a specific operand.
void replaceOperandWith(unsigned I, Metadata *New);
- /// \brief Check if node is fully resolved.
+ /// Check if node is fully resolved.
///
/// If \a isTemporary(), this always returns \c false; if \a isDistinct(),
/// this always returns \c true.
@@ -941,7 +941,7 @@ public:
bool isDistinct() const { return Storage == Distinct; }
bool isTemporary() const { return Storage == Temporary; }
- /// \brief RAUW a temporary.
+ /// RAUW a temporary.
///
/// \pre \a isTemporary() must be \c true.
void replaceAllUsesWith(Metadata *MD) {
@@ -950,7 +950,7 @@ public:
Context.getReplaceableUses()->replaceAllUsesWith(MD);
}
- /// \brief Resolve cycles.
+ /// Resolve cycles.
///
/// Once all forward declarations have been resolved, force cycles to be
/// resolved.
@@ -961,7 +961,7 @@ public:
/// Resolve a unique, unresolved node.
void resolve();
- /// \brief Replace a temporary node with a permanent one.
+ /// Replace a temporary node with a permanent one.
///
/// Try to create a uniqued version of \c N -- in place, if possible -- and
/// return it. If \c N cannot be uniqued, return a distinct node instead.
@@ -971,7 +971,7 @@ public:
return cast<T>(N.release()->replaceWithPermanentImpl());
}
- /// \brief Replace a temporary node with a uniqued one.
+ /// Replace a temporary node with a uniqued one.
///
/// Create a uniqued version of \c N -- in place, if possible -- and return
/// it. Takes ownership of the temporary node.
@@ -983,7 +983,7 @@ public:
return cast<T>(N.release()->replaceWithUniquedImpl());
}
- /// \brief Replace a temporary node with a distinct one.
+ /// Replace a temporary node with a distinct one.
///
/// Create a distinct version of \c N -- in place, if possible -- and return
/// it. Takes ownership of the temporary node.
@@ -999,7 +999,7 @@ private:
MDNode *replaceWithDistinctImpl();
protected:
- /// \brief Set an operand.
+ /// Set an operand.
///
/// Sets the operand directly, without worrying about uniquing.
void setOperand(unsigned I, Metadata *New);
@@ -1019,14 +1019,14 @@ private:
void decrementUnresolvedOperandCount();
void countUnresolvedOperands();
- /// \brief Mutate this to be "uniqued".
+ /// Mutate this to be "uniqued".
///
/// Mutate this so that \a isUniqued().
/// \pre \a isTemporary().
/// \pre already added to uniquing set.
void makeUniqued();
- /// \brief Mutate this to be "distinct".
+ /// Mutate this to be "distinct".
///
/// Mutate this so that \a isDistinct().
/// \pre \a isTemporary().
@@ -1069,10 +1069,10 @@ public:
return op_begin()[I];
}
- /// \brief Return number of MDNode operands.
+ /// Return number of MDNode operands.
unsigned getNumOperands() const { return NumOperands; }
- /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
+ /// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Metadata *MD) {
switch (MD->getMetadataID()) {
default:
@@ -1084,10 +1084,10 @@ public:
}
}
- /// \brief Check whether MDNode is a vtable access.
+ /// Check whether MDNode is a vtable access.
bool isTBAAVtableAccess() const;
- /// \brief Methods for metadata merging.
+ /// Methods for metadata merging.
static MDNode *concatenate(MDNode *A, MDNode *B);
static MDNode *intersect(MDNode *A, MDNode *B);
static MDNode *getMostGenericTBAA(MDNode *A, MDNode *B);
@@ -1097,7 +1097,7 @@ public:
static MDNode *getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B);
};
-/// \brief Tuple of metadata.
+/// Tuple of metadata.
///
/// This is the simple \a MDNode arbitrary tuple. Nodes are uniqued by
/// default based on their operands.
@@ -1125,7 +1125,7 @@ class MDTuple : public MDNode {
}
public:
- /// \brief Get the hash, if any.
+ /// Get the hash, if any.
unsigned getHash() const { return SubclassData32; }
static MDTuple *get(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
@@ -1136,14 +1136,14 @@ public:
return getImpl(Context, MDs, Uniqued, /* ShouldCreate */ false);
}
- /// \brief Return a distinct node.
+ /// Return a distinct node.
///
/// Return a distinct node -- i.e., a node that is not uniqued.
static MDTuple *getDistinct(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
return getImpl(Context, MDs, Distinct);
}
- /// \brief Return a temporary node.
+ /// Return a temporary node.
///
/// For use in constructing cyclic MDNode structures. A temporary MDNode is
/// not uniqued, may be RAUW'd, and must be manually deleted with
@@ -1153,7 +1153,7 @@ public:
return TempMDTuple(getImpl(Context, MDs, Temporary));
}
- /// \brief Return a (temporary) clone of this.
+ /// Return a (temporary) clone of this.
TempMDTuple clone() const { return cloneImpl(); }
static bool classof(const Metadata *MD) {
@@ -1182,7 +1182,7 @@ void TempMDNodeDeleter::operator()(MDNode *Node) const {
MDNode::deleteTemporary(Node);
}
-/// \brief Typed iterator through MDNode operands.
+/// Typed iterator through MDNode operands.
///
/// An iterator that transforms an \a MDNode::iterator into an iterator over a
/// particular Metadata subclass.
@@ -1213,7 +1213,7 @@ public:
bool operator!=(const TypedMDOperandIterator &X) const { return I != X.I; }
};
-/// \brief Typed, array-like tuple of metadata.
+/// Typed, array-like tuple of metadata.
///
/// This is a wrapper for \a MDTuple that makes it act like an array holding a
/// particular type of metadata.
@@ -1314,7 +1314,7 @@ public:
};
//===----------------------------------------------------------------------===//
-/// \brief A tuple of MDNodes.
+/// A tuple of MDNodes.
///
/// Despite its name, a NamedMDNode isn't itself an MDNode. NamedMDNodes belong
/// to modules, have names, and contain lists of MDNodes.
@@ -1377,7 +1377,7 @@ public:
NamedMDNode(const NamedMDNode &) = delete;
~NamedMDNode();
- /// \brief Drop all references and remove the node from parent module.
+ /// Drop all references and remove the node from parent module.
void eraseFromParent();
/// Remove all uses and clear node vector.
@@ -1385,7 +1385,7 @@ public:
/// Drop all references to this node's operands.
void clearOperands();
- /// \brief Get the module that holds this named metadata collection.
+ /// Get the module that holds this named metadata collection.
inline Module *getParent() { return Parent; }
inline const Module *getParent() const { return Parent; }
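
The temporary-node machinery documented in this header (getTemporary, replaceOperandWith, deleteTemporary) is what makes self-referential metadata such as loop IDs possible. A minimal hedged sketch of that idiom:

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // Hedged sketch: build a self-referential MDNode via a temporary placeholder.
    MDNode *makeSelfReferentialNode(LLVMContext &Ctx) {
      TempMDTuple Temp = MDNode::getTemporary(Ctx, None); // reserves operand 0
      Metadata *Ops[] = {Temp.get(), MDString::get(Ctx, "example.tag")};
      MDNode *N = MDNode::get(Ctx, Ops);
      N->replaceOperandWith(0, N); // close the cycle; Temp is deleted on scope exit
      return N;
    }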
diff --git a/contrib/llvm/include/llvm/IR/Module.h b/contrib/llvm/include/llvm/IR/Module.h
index 196e32e3615c..a405f7df3efe 100644
--- a/contrib/llvm/include/llvm/IR/Module.h
+++ b/contrib/llvm/include/llvm/IR/Module.h
@@ -59,7 +59,7 @@ class StructType;
/// A module maintains a GlobalValRefMap object that is used to hold all
/// constant references to global variables in the module. When a global
/// variable is destroyed, it should have no entries in the GlobalValueRefMap.
-/// @brief The main container class for the LLVM Intermediate Representation.
+/// The main container class for the LLVM Intermediate Representation.
class Module {
/// @name Types And Enumerations
/// @{
@@ -207,13 +207,18 @@ public:
/// @returns the module identifier as a string
const std::string &getModuleIdentifier() const { return ModuleID; }
+ /// Returns the number of non-debug IR instructions in the module.
+ /// This is equivalent to the sum of the IR instruction counts of each
+ /// function contained in the module.
+ unsigned getInstructionCount();
+
/// Get the module's original source file name. When compiling from
/// bitcode, this is taken from a bitcode record where it was recorded.
/// For other compiles it is the same as the ModuleID, which would
/// contain the source file name.
const std::string &getSourceFileName() const { return SourceFileName; }
- /// \brief Get a short "name" for the module.
+ /// Get a short "name" for the module.
///
/// This is useful for debugging or logging. It is essentially a convenience
/// wrapper around getModuleIdentifier().
@@ -251,9 +256,16 @@ public:
/// versions when the pass does not change.
std::unique_ptr<RandomNumberGenerator> createRNG(const Pass* P) const;
-/// @}
-/// @name Module Level Mutators
-/// @{
+ /// Return true if size-info optimization remark is enabled, false
+ /// otherwise.
+ bool shouldEmitInstrCountChangedRemark() {
+ return getContext().getDiagHandlerPtr()->isAnalysisRemarkEnabled(
+ "size-info");
+ }
+
+ /// @}
+ /// @name Module Level Mutators
+ /// @{
/// Set the module identifier.
void setModuleIdentifier(StringRef ID) { ModuleID = ID; }
@@ -795,14 +807,14 @@ public:
/// @name Utility functions for querying Debug information.
/// @{
- /// \brief Returns the Number of Register ParametersDwarf Version by checking
+ /// Returns the Number of Register Parameters by checking
/// module flags.
unsigned getNumberRegisterParameters() const;
- /// \brief Returns the Dwarf Version by checking module flags.
+ /// Returns the Dwarf Version by checking module flags.
unsigned getDwarfVersion() const;
- /// \brief Returns the CodeView Version by checking module flags.
+ /// Returns the CodeView Version by checking module flags.
/// Returns zero if not present in module.
unsigned getCodeViewFlag() const;
@@ -810,10 +822,10 @@ public:
/// @name Utility functions for querying and setting PIC level
/// @{
- /// \brief Returns the PIC level (small or large model)
+ /// Returns the PIC level (small or large model)
PICLevel::Level getPICLevel() const;
- /// \brief Set the PIC level (small or large model)
+ /// Set the PIC level (small or large model)
void setPICLevel(PICLevel::Level PL);
/// @}
@@ -821,28 +833,35 @@ public:
/// @name Utility functions for querying and setting PIE level
/// @{
- /// \brief Returns the PIE level (small or large model)
+ /// Returns the PIE level (small or large model)
PIELevel::Level getPIELevel() const;
- /// \brief Set the PIE level (small or large model)
+ /// Set the PIE level (small or large model)
void setPIELevel(PIELevel::Level PL);
/// @}
/// @name Utility functions for querying and setting PGO summary
/// @{
- /// \brief Attach profile summary metadata to this module.
+ /// Attach profile summary metadata to this module.
void setProfileSummary(Metadata *M);
- /// \brief Returns profile summary metadata
+ /// Returns profile summary metadata
Metadata *getProfileSummary();
/// @}
+ /// Returns true if PLT should be avoided for RTLib calls.
+ bool getRtLibUseGOT() const;
+
+ /// Set that PLT should be avoided for RTLib calls.
+ void setRtLibUseGOT();
+
+
/// Take ownership of the given memory buffer.
void setOwnedMemoryBuffer(std::unique_ptr<MemoryBuffer> MB);
};
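
getInstructionCount and shouldEmitInstrCountChangedRemark added above give the legacy pass manager what it needs for size-info remarks. A hedged sketch of how a caller might combine them; the transform callback is a stand-in:

    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Hedged sketch: measure the non-debug instruction count around a transform.
    void reportSizeDelta(Module &M, void (*RunSomeTransform)(Module &)) {
      if (!M.shouldEmitInstrCountChangedRemark()) {
        RunSomeTransform(M);
        return;
      }
      unsigned Before = M.getInstructionCount();
      RunSomeTransform(M);
      unsigned After = M.getInstructionCount();
      (void)Before; (void)After; // the pass manager feeds these into the remark
    }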
-/// \brief Given "llvm.used" or "llvm.compiler.used" as a global name, collect
+/// Given "llvm.used" or "llvm.compiler.used" as a global name, collect
/// the initializer elements of that global in Set and return the global itself.
GlobalVariable *collectUsedGlobalVariables(const Module &M,
SmallPtrSetImpl<GlobalValue *> &Set,
diff --git a/contrib/llvm/include/llvm/IR/ModuleSummaryIndex.h b/contrib/llvm/include/llvm/IR/ModuleSummaryIndex.h
index dd7a0db83774..fdf3d4b5f1ce 100644
--- a/contrib/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/contrib/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -25,6 +25,10 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScaledNumber.h"
+#include "llvm/Support/StringSaver.h"
#include <algorithm>
#include <array>
#include <cassert>
@@ -45,7 +49,7 @@ template <typename T> struct MappingTraits;
} // end namespace yaml
-/// \brief Class to accumulate and hold information about a callee.
+/// Class to accumulate and hold information about a callee.
struct CalleeInfo {
enum class HotnessType : uint8_t {
Unknown = 0,
@@ -54,13 +58,44 @@ struct CalleeInfo {
Hot = 3,
Critical = 4
};
- HotnessType Hotness = HotnessType::Unknown;
- CalleeInfo() = default;
- explicit CalleeInfo(HotnessType Hotness) : Hotness(Hotness) {}
+ // The size of the bit-field might need to be adjusted if more values are
+ // added to HotnessType enum.
+ uint32_t Hotness : 3;
+
+ /// The value stored in RelBlockFreq has to be interpreted as the digits of
+ /// a scaled number with a scale of \p -ScaleShift.
+ uint32_t RelBlockFreq : 29;
+ static constexpr int32_t ScaleShift = 8;
+ static constexpr uint64_t MaxRelBlockFreq = (1 << 29) - 1;
+
+ CalleeInfo()
+ : Hotness(static_cast<uint32_t>(HotnessType::Unknown)), RelBlockFreq(0) {}
+ explicit CalleeInfo(HotnessType Hotness, uint64_t RelBF)
+ : Hotness(static_cast<uint32_t>(Hotness)), RelBlockFreq(RelBF) {}
void updateHotness(const HotnessType OtherHotness) {
- Hotness = std::max(Hotness, OtherHotness);
+ Hotness = std::max(Hotness, static_cast<uint32_t>(OtherHotness));
+ }
+
+ HotnessType getHotness() const { return HotnessType(Hotness); }
+
+ /// Update \p RelBlockFreq from \p BlockFreq and \p EntryFreq
+ ///
+ /// BlockFreq is divided by EntryFreq and added to RelBlockFreq. To represent
+ /// fractional values, the result is represented as a fixed point number with
+ /// scale of -ScaleShift.
+ void updateRelBlockFreq(uint64_t BlockFreq, uint64_t EntryFreq) {
+ if (EntryFreq == 0)
+ return;
+ using Scaled64 = ScaledNumber<uint64_t>;
+ Scaled64 Temp(BlockFreq, ScaleShift);
+ Temp /= Scaled64::get(EntryFreq);
+
+ uint64_t Sum =
+ SaturatingAdd<uint64_t>(Temp.toInt<uint64_t>(), RelBlockFreq);
+ Sum = std::min(Sum, uint64_t(MaxRelBlockFreq));
+ RelBlockFreq = static_cast<uint32_t>(Sum);
}
};
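
RelBlockFreq above is a fixed-point value with an implicit scale of 2^-8 (ScaleShift), so a stored value of 256 means one call-site execution per function entry. A small hedged example of what updateRelBlockFreq computes:

    #include "llvm/IR/ModuleSummaryIndex.h"
    #include <cassert>
    using namespace llvm;

    // Hedged sketch: a call site executed 3 times for every 2 function entries.
    void relBlockFreqExample() {
      CalleeInfo CI(CalleeInfo::HotnessType::Unknown, /*RelBF=*/0);
      CI.updateRelBlockFreq(/*BlockFreq=*/3, /*EntryFreq=*/2);
      // 3/2 = 1.5, stored as 1.5 * 2^8 = 384 in the 29-bit RelBlockFreq field.
      assert(CI.RelBlockFreq == 384);
    }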
@@ -69,9 +104,29 @@ class GlobalValueSummary;
using GlobalValueSummaryList = std::vector<std::unique_ptr<GlobalValueSummary>>;
struct GlobalValueSummaryInfo {
- /// The GlobalValue corresponding to this summary. This is only used in
- /// per-module summaries.
- const GlobalValue *GV = nullptr;
+ union NameOrGV {
+ NameOrGV(bool HaveGVs) {
+ if (HaveGVs)
+ GV = nullptr;
+ else
+ Name = "";
+ }
+
+ /// The GlobalValue corresponding to this summary. This is only used in
+ /// per-module summaries and when the IR is available. E.g. when module
+ /// analysis is being run, or when parsing both the IR and the summary
+ /// from assembly.
+ const GlobalValue *GV;
+
+ /// Summary string representation. This StringRef points to BC module
+ /// string table and is valid until module data is stored in memory.
+ /// This is guaranteed to happen until runThinLTOBackend function is
+ /// called, so it is safe to use this field during thin link. This field
+ /// is only valid if summary index was loaded from BC file.
+ StringRef Name;
+ } U;
+
+ GlobalValueSummaryInfo(bool HaveGVs) : U(HaveGVs) {}
/// List of global value summary structures for a particular value held
/// in the GlobalValueMap. Requires a vector in the case of multiple
@@ -91,44 +146,98 @@ using GlobalValueSummaryMapTy =
/// Struct that holds a reference to a particular GUID in a global value
/// summary.
struct ValueInfo {
- const GlobalValueSummaryMapTy::value_type *Ref = nullptr;
+ PointerIntPair<const GlobalValueSummaryMapTy::value_type *, 1, bool>
+ RefAndFlag;
ValueInfo() = default;
- ValueInfo(const GlobalValueSummaryMapTy::value_type *Ref) : Ref(Ref) {}
+ ValueInfo(bool HaveGVs, const GlobalValueSummaryMapTy::value_type *R) {
+ RefAndFlag.setPointer(R);
+ RefAndFlag.setInt(HaveGVs);
+ }
- operator bool() const { return Ref; }
+ operator bool() const { return getRef(); }
- GlobalValue::GUID getGUID() const { return Ref->first; }
- const GlobalValue *getValue() const { return Ref->second.GV; }
+ GlobalValue::GUID getGUID() const { return getRef()->first; }
+ const GlobalValue *getValue() const {
+ assert(haveGVs());
+ return getRef()->second.U.GV;
+ }
ArrayRef<std::unique_ptr<GlobalValueSummary>> getSummaryList() const {
- return Ref->second.SummaryList;
+ return getRef()->second.SummaryList;
+ }
+
+ StringRef name() const {
+ return haveGVs() ? getRef()->second.U.GV->getName()
+ : getRef()->second.U.Name;
+ }
+
+ bool haveGVs() const { return RefAndFlag.getInt(); }
+
+ const GlobalValueSummaryMapTy::value_type *getRef() const {
+ return RefAndFlag.getPointer();
}
+
+ bool isDSOLocal() const;
};
+inline raw_ostream &operator<<(raw_ostream &OS, const ValueInfo &VI) {
+ OS << VI.getGUID();
+ if (!VI.name().empty())
+ OS << " (" << VI.name() << ")";
+ return OS;
+}
+
+inline bool operator==(const ValueInfo &A, const ValueInfo &B) {
+ assert(A.getRef() && B.getRef() &&
+ "Need ValueInfo with non-null Ref for comparison");
+ return A.getRef() == B.getRef();
+}
+
+inline bool operator!=(const ValueInfo &A, const ValueInfo &B) {
+ assert(A.getRef() && B.getRef() &&
+ "Need ValueInfo with non-null Ref for comparison");
+ return A.getRef() != B.getRef();
+}
+
+inline bool operator<(const ValueInfo &A, const ValueInfo &B) {
+ assert(A.getRef() && B.getRef() &&
+ "Need ValueInfo with non-null Ref to compare GUIDs");
+ return A.getGUID() < B.getGUID();
+}
+
template <> struct DenseMapInfo<ValueInfo> {
static inline ValueInfo getEmptyKey() {
- return ValueInfo((GlobalValueSummaryMapTy::value_type *)-1);
+ return ValueInfo(false, (GlobalValueSummaryMapTy::value_type *)-8);
}
static inline ValueInfo getTombstoneKey() {
- return ValueInfo((GlobalValueSummaryMapTy::value_type *)-2);
+ return ValueInfo(false, (GlobalValueSummaryMapTy::value_type *)-16);
+ }
+
+ static inline bool isSpecialKey(ValueInfo V) {
+ return V == getTombstoneKey() || V == getEmptyKey();
}
- static bool isEqual(ValueInfo L, ValueInfo R) { return L.Ref == R.Ref; }
- static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.Ref; }
+ static bool isEqual(ValueInfo L, ValueInfo R) {
+ // We are not supposed to mix ValueInfos with different HaveGVs flags
+ // in the same container.
+ assert(isSpecialKey(L) || isSpecialKey(R) || (L.haveGVs() == R.haveGVs()));
+ return L.getRef() == R.getRef();
+ }
+ static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.getRef(); }
};
-/// \brief Function and variable summary information to aid decisions and
+/// Function and variable summary information to aid decisions and
/// implementation of importing.
class GlobalValueSummary {
public:
- /// \brief Sububclass discriminator (for dyn_cast<> et al.)
+ /// Sububclass discriminator (for dyn_cast<> et al.)
enum SummaryKind : unsigned { AliasKind, FunctionKind, GlobalVarKind };
/// Group flags (Linkage, NotEligibleToImport, etc.) as a bitfield.
struct GVFlags {
- /// \brief The linkage type of the associated global value.
+ /// The linkage type of the associated global value.
///
/// One use is to flag values that have local linkage types and need to
/// have module identifier appended before placing into the combined
@@ -170,7 +279,7 @@ private:
/// GUID includes the module level id in the hash.
GlobalValue::GUID OriginalName = 0;
- /// \brief Path of module IR containing value's definition, used to locate
+ /// Path of module IR containing value's definition, used to locate
/// module during importing.
///
/// This is only used during parsing of the combined index, or when
@@ -185,8 +294,6 @@ private:
/// are listed in the derived FunctionSummary object.
std::vector<ValueInfo> RefEdgeList;
- bool isLive() const { return Flags.Live; }
-
protected:
GlobalValueSummary(SummaryKind K, GVFlags Flags, std::vector<ValueInfo> Refs)
: Kind(K), Flags(Flags), RefEdgeList(std::move(Refs)) {
@@ -199,7 +306,7 @@ public:
/// Returns the hash of the original name, it is identical to the GUID for
/// externally visible symbols, but not for local ones.
- GlobalValue::GUID getOriginalName() { return OriginalName; }
+ GlobalValue::GUID getOriginalName() const { return OriginalName; }
/// Initialize the original name hash in this summary.
void setOriginalName(GlobalValue::GUID Name) { OriginalName = Name; }
@@ -215,7 +322,7 @@ public:
StringRef modulePath() const { return ModulePath; }
/// Get the flags for this GlobalValue (see \p struct GVFlags).
- GVFlags flags() { return Flags; }
+ GVFlags flags() const { return Flags; }
/// Return linkage type recorded for this global value.
GlobalValue::LinkageTypes linkage() const {
@@ -231,6 +338,8 @@ public:
/// Return true if this global value can't be imported.
bool notEligibleToImport() const { return Flags.NotEligibleToImport; }
+ bool isLive() const { return Flags.Live; }
+
void setLive(bool Live) { Flags.Live = Live; }
void setDSOLocal(bool Local) { Flags.DSOLocal = Local; }
@@ -249,11 +358,9 @@ public:
const GlobalValueSummary *getBaseObject() const;
friend class ModuleSummaryIndex;
- friend void computeDeadSymbols(class ModuleSummaryIndex &,
- const DenseSet<GlobalValue::GUID> &);
};
-/// \brief Alias summary information.
+/// Alias summary information.
class AliasSummary : public GlobalValueSummary {
GlobalValueSummary *AliaseeSummary;
// AliaseeGUID is only set and accessed when we are building a combined index
@@ -273,6 +380,8 @@ public:
void setAliasee(GlobalValueSummary *Aliasee) { AliaseeSummary = Aliasee; }
void setAliaseeGUID(GlobalValue::GUID GUID) { AliaseeGUID = GUID; }
+ bool hasAliasee() const { return !!AliaseeSummary; }
+
const GlobalValueSummary &getAliasee() const {
assert(AliaseeSummary && "Unexpected missing aliasee summary");
return *AliaseeSummary;
@@ -300,13 +409,20 @@ inline GlobalValueSummary *GlobalValueSummary::getBaseObject() {
return this;
}
-/// \brief Function summary information to aid decisions and implementation of
+/// Function summary information to aid decisions and implementation of
/// importing.
class FunctionSummary : public GlobalValueSummary {
public:
/// <CalleeValueInfo, CalleeInfo> call edge pair.
using EdgeTy = std::pair<ValueInfo, CalleeInfo>;
+ /// Types for -force-summary-edges-cold debugging option.
+ enum ForceSummaryHotnessType : unsigned {
+ FSHT_None,
+ FSHT_AllNonCritical,
+ FSHT_All
+ };
+
/// An "identifier" for a virtual function. This contains the type identifier
/// represented as a GUID and the offset from the address point to the virtual
/// function pointer, where "address point" is as defined in the Itanium ABI:
@@ -324,6 +440,26 @@ public:
std::vector<uint64_t> Args;
};
+ /// All type identifier related information. Because these fields are
+ /// relatively uncommon we only allocate space for them if necessary.
+ struct TypeIdInfo {
+ /// List of type identifiers used by this function in llvm.type.test
+ /// intrinsics referenced by something other than an llvm.assume intrinsic,
+ /// represented as GUIDs.
+ std::vector<GlobalValue::GUID> TypeTests;
+
+ /// List of virtual calls made by this function using (respectively)
+ /// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics that do
+ /// not have all constant integer arguments.
+ std::vector<VFuncId> TypeTestAssumeVCalls, TypeCheckedLoadVCalls;
+
+ /// List of virtual calls made by this function using (respectively)
+ /// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics with
+ /// all constant integer arguments.
+ std::vector<ConstVCall> TypeTestAssumeConstVCalls,
+ TypeCheckedLoadConstVCalls;
+ };
+
/// Function attribute flags. Used to track if a function accesses memory,
/// recurses or aliases.
struct FFlags {
@@ -333,6 +469,25 @@ public:
unsigned ReturnDoesNotAlias : 1;
};
+ /// Create an empty FunctionSummary (with specified call edges).
+ /// Used to represent external nodes and the dummy root node.
+ static FunctionSummary
+ makeDummyFunctionSummary(std::vector<FunctionSummary::EdgeTy> Edges) {
+ return FunctionSummary(
+ FunctionSummary::GVFlags(
+ GlobalValue::LinkageTypes::AvailableExternallyLinkage,
+ /*NotEligibleToImport=*/true, /*Live=*/true, /*IsLocal=*/false),
+ 0, FunctionSummary::FFlags{}, std::vector<ValueInfo>(),
+ std::move(Edges), std::vector<GlobalValue::GUID>(),
+ std::vector<FunctionSummary::VFuncId>(),
+ std::vector<FunctionSummary::VFuncId>(),
+ std::vector<FunctionSummary::ConstVCall>(),
+ std::vector<FunctionSummary::ConstVCall>());
+ }
+
+ /// A dummy node to reference external functions that aren't in the index.
+ static FunctionSummary ExternalNode;
+
private:
/// Number of instructions (ignoring debug instructions, e.g.) computed
/// during the initial compile step when the summary index is first built.
@@ -345,25 +500,6 @@ private:
/// List of <CalleeValueInfo, CalleeInfo> call edge pairs from this function.
std::vector<EdgeTy> CallGraphEdgeList;
- /// All type identifier related information. Because these fields are
- /// relatively uncommon we only allocate space for them if necessary.
- struct TypeIdInfo {
- /// List of type identifiers used by this function in llvm.type.test
- /// intrinsics other than by an llvm.assume intrinsic, represented as GUIDs.
- std::vector<GlobalValue::GUID> TypeTests;
-
- /// List of virtual calls made by this function using (respectively)
- /// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics that do
- /// not have all constant integer arguments.
- std::vector<VFuncId> TypeTestAssumeVCalls, TypeCheckedLoadVCalls;
-
- /// List of virtual calls made by this function using (respectively)
- /// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics with
- /// all constant integer arguments.
- std::vector<ConstVCall> TypeTestAssumeConstVCalls,
- TypeCheckedLoadConstVCalls;
- };
-
std::unique_ptr<TypeIdInfo> TIdInfo;
public:
@@ -393,7 +529,7 @@ public:
}
/// Get function attribute flags.
- FFlags &fflags() { return FunFlags; }
+ FFlags fflags() const { return FunFlags; }
/// Get the instruction count recorded for this function.
unsigned instCount() const { return InstCount; }
@@ -452,6 +588,10 @@ public:
TIdInfo = llvm::make_unique<TypeIdInfo>();
TIdInfo->TypeTests.push_back(Guid);
}
+
+ const TypeIdInfo *getTypeIdInfo() const { return TIdInfo.get(); }
+
+ friend struct GraphTraits<ValueInfo>;
};
template <> struct DenseMapInfo<FunctionSummary::VFuncId> {
@@ -488,7 +628,7 @@ template <> struct DenseMapInfo<FunctionSummary::ConstVCall> {
}
};
-/// \brief Global variable summary information to aid decisions and
+/// Global variable summary information to aid decisions and
/// implementation of importing.
///
/// Currently this doesn't add anything to the base \p GlobalValueSummary,
@@ -538,8 +678,11 @@ struct TypeTestResolution {
struct WholeProgramDevirtResolution {
enum Kind {
- Indir, ///< Just do a regular virtual call
- SingleImpl, ///< Single implementation devirtualization
+ Indir, ///< Just do a regular virtual call
+ SingleImpl, ///< Single implementation devirtualization
+ BranchFunnel, ///< When retpoline mitigation is enabled, use a branch funnel
+ ///< that is defined in the merged module. Otherwise same as
+ ///< Indir.
} TheKind = Indir;
std::string SingleImplName;
@@ -607,7 +750,6 @@ private:
/// Mapping from type identifiers to summary information for that type
/// identifier.
- // FIXME: Add bitcode read/write support for this field.
std::map<std::string, TypeIdSummary> TypeIdMap;
/// Mapping from original ID to GUID. If original ID can map to multiple
@@ -619,24 +761,111 @@ private:
/// considered live.
bool WithGlobalValueDeadStripping = false;
+ /// Indicates that the distributed backend should skip compilation of the
+ /// module. The flag is supposed to be set by distributed ThinLTO indexing
+ /// when it detects that the module is not needed during the final
+ /// linking. As a result, the distributed backend should just output a
+ /// minimal valid object file.
+ bool SkipModuleByDistributedBackend = false;
+
+ /// If true, we're performing analysis of an IR module, or parsing it along
+ /// with the IR from assembly. The value 'false' means we're reading the
+ /// summary from a BC or YAML source. Affects the type of value stored in the
+ /// NameOrGV union.
+ bool HaveGVs;
+
std::set<std::string> CfiFunctionDefs;
std::set<std::string> CfiFunctionDecls;
+ // Used in cases where we want to record the name of a global, but
+ // don't have the string owned elsewhere (e.g. the Strtab on a module).
+ StringSaver Saver;
+ BumpPtrAllocator Alloc;
+
// YAML I/O support.
friend yaml::MappingTraits<ModuleSummaryIndex>;
GlobalValueSummaryMapTy::value_type *
getOrInsertValuePtr(GlobalValue::GUID GUID) {
- return &*GlobalValueMap.emplace(GUID, GlobalValueSummaryInfo{}).first;
+ return &*GlobalValueMap.emplace(GUID, GlobalValueSummaryInfo(HaveGVs))
+ .first;
}
public:
+ // See HaveGVs variable comment.
+ ModuleSummaryIndex(bool HaveGVs) : HaveGVs(HaveGVs), Saver(Alloc) {}
+
+ bool haveGVs() const { return HaveGVs; }
+
gvsummary_iterator begin() { return GlobalValueMap.begin(); }
const_gvsummary_iterator begin() const { return GlobalValueMap.begin(); }
gvsummary_iterator end() { return GlobalValueMap.end(); }
const_gvsummary_iterator end() const { return GlobalValueMap.end(); }
size_t size() const { return GlobalValueMap.size(); }
+ /// Convenience function for doing a DFS on a ValueInfo. Marks the function in
+ /// the FunctionHasParent map.
+ static void discoverNodes(ValueInfo V,
+ std::map<ValueInfo, bool> &FunctionHasParent) {
+ if (!V.getSummaryList().size())
+ return; // skip external functions that don't have summaries
+
+ // Mark discovered if we haven't yet
+ auto S = FunctionHasParent.emplace(V, false);
+
+ // Stop if we've already discovered this node
+ if (!S.second)
+ return;
+
+ FunctionSummary *F =
+ dyn_cast<FunctionSummary>(V.getSummaryList().front().get());
+ assert(F != nullptr && "Expected FunctionSummary node");
+
+ for (auto &C : F->calls()) {
+ // Insert node if necessary
+ auto S = FunctionHasParent.emplace(C.first, true);
+
+ // Skip nodes that we're sure have parents
+ if (!S.second && S.first->second)
+ continue;
+
+ if (S.second)
+ discoverNodes(C.first, FunctionHasParent);
+ else
+ S.first->second = true;
+ }
+ }
+
+ // Calculate the callgraph root
+ FunctionSummary calculateCallGraphRoot() {
+ // Functions that have a parent will be marked in the FunctionHasParent map.
+ // Once we've marked all functions, the entries in the map that are still
+ // false have no parent (so they're the roots).
+ std::map<ValueInfo, bool> FunctionHasParent;
+
+ for (auto &S : *this) {
+ // Skip external functions
+ if (!S.second.SummaryList.size() ||
+ !isa<FunctionSummary>(S.second.SummaryList.front().get()))
+ continue;
+ discoverNodes(ValueInfo(HaveGVs, &S), FunctionHasParent);
+ }
+
+ std::vector<FunctionSummary::EdgeTy> Edges;
+ // create edges to all roots in the Index
+ for (auto &P : FunctionHasParent) {
+ if (P.second)
+ continue; // skip over non-root nodes
+ Edges.push_back(std::make_pair(P.first, CalleeInfo{}));
+ }
+ if (Edges.empty()) {
+ // Failed to find root - return an empty node
+ return FunctionSummary::makeDummyFunctionSummary({});
+ }
+ auto CallGraphRoot = FunctionSummary::makeDummyFunctionSummary(Edges);
+ return CallGraphRoot;
+ }
+
bool withGlobalValueDeadStripping() const {
return WithGlobalValueDeadStripping;
}
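The discoverNodes/calculateCallGraphRoot pair above marks every summary that is reachable as a callee and then treats the unmarked functions as roots, synthesizing a dummy FunctionSummary whose edges point at those roots. A minimal sketch of the same marking idea on a plain adjacency list (self-contained C++, not the LLVM API; the names are illustrative):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Sketch: find call-graph roots by marking every node that appears as a callee.
    // Nodes left unmarked after the walk have no caller, so they are the roots.
    int main() {
      std::map<std::string, std::vector<std::string>> Calls = {
          {"main", {"parse", "emit"}}, {"parse", {"lex"}}, {"lex", {}}, {"emit", {}}};

      std::map<std::string, bool> HasParent; // node -> has at least one caller?
      for (const auto &N : Calls) {
        HasParent.emplace(N.first, false);   // ensure every node is present
        for (const auto &Callee : N.second)
          HasParent[Callee] = true;          // the callee has a caller
      }

      for (const auto &P : HasParent)
        if (!P.second)
          std::cout << "root: " << P.first << "\n"; // prints "root: main"
      return 0;
    }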
@@ -644,27 +873,54 @@ public:
WithGlobalValueDeadStripping = true;
}
+ bool skipModuleByDistributedBackend() const {
+ return SkipModuleByDistributedBackend;
+ }
+ void setSkipModuleByDistributedBackend() {
+ SkipModuleByDistributedBackend = true;
+ }
+
bool isGlobalValueLive(const GlobalValueSummary *GVS) const {
return !WithGlobalValueDeadStripping || GVS->isLive();
}
bool isGUIDLive(GlobalValue::GUID GUID) const;
+ /// Return a ValueInfo for the index value_type (convenient when iterating
+ /// index).
+ ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const {
+ return ValueInfo(HaveGVs, &R);
+ }
+
/// Return a ValueInfo for GUID if it exists, otherwise return ValueInfo().
ValueInfo getValueInfo(GlobalValue::GUID GUID) const {
auto I = GlobalValueMap.find(GUID);
- return ValueInfo(I == GlobalValueMap.end() ? nullptr : &*I);
+ return ValueInfo(HaveGVs, I == GlobalValueMap.end() ? nullptr : &*I);
}
/// Return a ValueInfo for \p GUID.
ValueInfo getOrInsertValueInfo(GlobalValue::GUID GUID) {
- return ValueInfo(getOrInsertValuePtr(GUID));
+ return ValueInfo(HaveGVs, getOrInsertValuePtr(GUID));
+ }
+
+ // Save a string in the Index. Use before passing Name to
+ // getOrInsertValueInfo when the string isn't owned elsewhere (e.g. on the
+ // module's Strtab).
+ StringRef saveString(std::string String) { return Saver.save(String); }
+
+ /// Return a ValueInfo for \p GUID setting value \p Name.
+ ValueInfo getOrInsertValueInfo(GlobalValue::GUID GUID, StringRef Name) {
+ assert(!HaveGVs);
+ auto VP = getOrInsertValuePtr(GUID);
+ VP->second.U.Name = Name;
+ return ValueInfo(HaveGVs, VP);
}
/// Return a ValueInfo for \p GV and mark it as belonging to GV.
ValueInfo getOrInsertValueInfo(const GlobalValue *GV) {
+ assert(HaveGVs);
auto VP = getOrInsertValuePtr(GV->getGUID());
- VP->second.GV = GV;
- return ValueInfo(VP);
+ VP->second.U.GV = GV;
+ return ValueInfo(HaveGVs, VP);
}
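When the index is built without IR (HaveGVs == false), a name has to be owned by the index itself before it can be attached to a ValueInfo, which is what saveString and the GUID/Name overload of getOrInsertValueInfo above provide. A hedged sketch of that flow (the helper is illustrative; it assumes the usual GUID scheme via GlobalValue::getGUID):

    #include "llvm/IR/ModuleSummaryIndex.h"
    #include <cassert>

    using namespace llvm;

    // Sketch: record an entry by name when no GlobalValue is available,
    // e.g. while reading a summary rather than analyzing IR.
    static ValueInfo addByName(ModuleSummaryIndex &Index, StringRef Name) {
      assert(!Index.haveGVs() && "names are only stored when there are no GVs");
      StringRef Saved = Index.saveString(Name.str()); // index now owns the bytes
      return Index.getOrInsertValueInfo(GlobalValue::getGUID(Saved), Saved);
    }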
/// Return the GUID for \p OriginalId in the OidGuidMap.
@@ -679,6 +935,12 @@ public:
std::set<std::string> &cfiFunctionDecls() { return CfiFunctionDecls; }
const std::set<std::string> &cfiFunctionDecls() const { return CfiFunctionDecls; }
+ /// Add a global value summary for a value.
+ void addGlobalValueSummary(const GlobalValue &GV,
+ std::unique_ptr<GlobalValueSummary> Summary) {
+ addGlobalValueSummary(getOrInsertValueInfo(&GV), std::move(Summary));
+ }
+
/// Add a global value summary for a value of the given name.
void addGlobalValueSummary(StringRef ValueName,
std::unique_ptr<GlobalValueSummary> Summary) {
@@ -692,7 +954,7 @@ public:
addOriginalName(VI.getGUID(), Summary->getOriginalName());
// Here we have a notionally const VI, but the value it points to is owned
// by the non-const *this.
- const_cast<GlobalValueSummaryMapTy::value_type *>(VI.Ref)
+ const_cast<GlobalValueSummaryMapTy::value_type *>(VI.getRef())
->second.SummaryList.push_back(std::move(Summary));
}
@@ -730,8 +992,7 @@ public:
GlobalValueSummary *getGlobalValueSummary(const GlobalValue &GV,
bool PerModuleIndex = true) const {
assert(GV.hasName() && "Can't get GlobalValueSummary for GV with no name");
- return getGlobalValueSummary(GlobalValue::getGUID(GV.getName()),
- PerModuleIndex);
+ return getGlobalValueSummary(GV.getGUID(), PerModuleIndex);
}
/// Returns the first GlobalValueSummary for \p ValueGUID, asserting that
@@ -788,6 +1049,13 @@ public:
return &*ModulePathStringTable.insert({ModPath, {ModId, Hash}}).first;
}
+ /// Return module entry for module with the given \p ModPath.
+ ModuleInfo *getModule(StringRef ModPath) {
+ auto It = ModulePathStringTable.find(ModPath);
+ assert(It != ModulePathStringTable.end() && "Module not registered");
+ return &*It;
+ }
+
/// Check if the given Module has any functions available for exporting
/// in the index. We consider any module present in the ModulePathStringTable
/// to have exported functions.
@@ -814,7 +1082,7 @@ public:
return &I->second;
}
- /// Collect for the given module the list of function it defines
+ /// Collect for the given module the list of functions it defines
/// (GUID -> Summary).
void collectDefinedFunctionsForModule(StringRef ModulePath,
GVSummaryMapTy &GVSummaryMap) const;
@@ -823,6 +1091,65 @@ public:
/// Summary).
void collectDefinedGVSummariesPerModule(
StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries) const;
+
+ /// Print to an output stream.
+ void print(raw_ostream &OS, bool IsForDebug = false) const;
+
+ /// Dump to stderr (for debugging).
+ void dump() const;
+
+ /// Export summary to dot file for GraphViz.
+ void exportToDot(raw_ostream &OS) const;
+
+ /// Print out strongly connected components for debugging.
+ void dumpSCCs(raw_ostream &OS);
+};
+
+/// GraphTraits definition to build SCC for the index
+template <> struct GraphTraits<ValueInfo> {
+ typedef ValueInfo NodeRef;
+
+ static NodeRef valueInfoFromEdge(FunctionSummary::EdgeTy &P) {
+ return P.first;
+ }
+ using ChildIteratorType =
+ mapped_iterator<std::vector<FunctionSummary::EdgeTy>::iterator,
+ decltype(&valueInfoFromEdge)>;
+
+ static NodeRef getEntryNode(ValueInfo V) { return V; }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ if (!N.getSummaryList().size()) // handle external function
+ return ChildIteratorType(
+ FunctionSummary::ExternalNode.CallGraphEdgeList.begin(),
+ &valueInfoFromEdge);
+ FunctionSummary *F =
+ cast<FunctionSummary>(N.getSummaryList().front()->getBaseObject());
+ return ChildIteratorType(F->CallGraphEdgeList.begin(), &valueInfoFromEdge);
+ }
+
+ static ChildIteratorType child_end(NodeRef N) {
+ if (!N.getSummaryList().size()) // handle external function
+ return ChildIteratorType(
+ FunctionSummary::ExternalNode.CallGraphEdgeList.end(),
+ &valueInfoFromEdge);
+ FunctionSummary *F =
+ cast<FunctionSummary>(N.getSummaryList().front()->getBaseObject());
+ return ChildIteratorType(F->CallGraphEdgeList.end(), &valueInfoFromEdge);
+ }
+};
+
+template <>
+struct GraphTraits<ModuleSummaryIndex *> : public GraphTraits<ValueInfo> {
+ static NodeRef getEntryNode(ModuleSummaryIndex *I) {
+ std::unique_ptr<GlobalValueSummary> Root =
+ make_unique<FunctionSummary>(I->calculateCallGraphRoot());
+ GlobalValueSummaryInfo G(I->haveGVs());
+ G.SummaryList.push_back(std::move(Root));
+ static auto P =
+ GlobalValueSummaryMapTy::value_type(GlobalValue::GUID(0), std::move(G));
+ return ValueInfo(I->haveGVs(), &P);
+ }
};
} // end namespace llvm
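With the GraphTraits specializations above (including the dummy root from calculateCallGraphRoot), generic graph algorithms can walk the summary call graph. A hedged sketch of iterating its strongly connected components with LLVM's scc_iterator (assumes the usual headers; the output format is illustrative):

    #include "llvm/ADT/SCCIterator.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Sketch: visit SCCs of the summary call graph. GraphTraits<ModuleSummaryIndex *>
    // supplies the synthetic entry node and the per-function edge iterators.
    static void printSummarySCCs(ModuleSummaryIndex &Index) {
      for (auto I = scc_begin(&Index); !I.isAtEnd(); ++I) {
        errs() << "SCC with " << I->size() << " node(s):\n";
        for (const ValueInfo &V : *I)
          errs() << "  " << V.getGUID() << "\n";
      }
    }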
diff --git a/contrib/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/contrib/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index 4687f2d53e7e..1b339ab32cf1 100644
--- a/contrib/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/contrib/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -98,6 +98,8 @@ template <> struct ScalarEnumerationTraits<WholeProgramDevirtResolution::Kind> {
static void enumeration(IO &io, WholeProgramDevirtResolution::Kind &value) {
io.enumCase(value, "Indir", WholeProgramDevirtResolution::Indir);
io.enumCase(value, "SingleImpl", WholeProgramDevirtResolution::SingleImpl);
+ io.enumCase(value, "BranchFunnel",
+ WholeProgramDevirtResolution::BranchFunnel);
}
};
@@ -136,6 +138,7 @@ template <> struct MappingTraits<TypeIdSummary> {
struct FunctionSummaryYaml {
unsigned Linkage;
bool NotEligibleToImport, Live, IsLocal;
+ std::vector<uint64_t> Refs;
std::vector<uint64_t> TypeTests;
std::vector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
TypeCheckedLoadVCalls;
@@ -178,6 +181,7 @@ template <> struct MappingTraits<FunctionSummaryYaml> {
io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport);
io.mapOptional("Live", summary.Live);
io.mapOptional("Local", summary.IsLocal);
+ io.mapOptional("Refs", summary.Refs);
io.mapOptional("TypeTests", summary.TypeTests);
io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls);
io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls);
@@ -207,13 +211,21 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
io.setError("key not an integer");
return;
}
- auto &Elem = V[KeyInt];
+ if (!V.count(KeyInt))
+ V.emplace(KeyInt, /*IsAnalysis=*/false);
+ auto &Elem = V.find(KeyInt)->second;
for (auto &FSum : FSums) {
+ std::vector<ValueInfo> Refs;
+ for (auto &RefGUID : FSum.Refs) {
+ if (!V.count(RefGUID))
+ V.emplace(RefGUID, /*IsAnalysis=*/false);
+ Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID)));
+ }
Elem.SummaryList.push_back(llvm::make_unique<FunctionSummary>(
GlobalValueSummary::GVFlags(
static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal),
- 0, FunctionSummary::FFlags{}, ArrayRef<ValueInfo>{},
+ 0, FunctionSummary::FFlags{}, Refs,
ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests),
std::move(FSum.TypeTestAssumeVCalls),
std::move(FSum.TypeCheckedLoadVCalls),
@@ -225,15 +237,20 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
for (auto &P : V) {
std::vector<FunctionSummaryYaml> FSums;
for (auto &Sum : P.second.SummaryList) {
- if (auto *FSum = dyn_cast<FunctionSummary>(Sum.get()))
+ if (auto *FSum = dyn_cast<FunctionSummary>(Sum.get())) {
+ std::vector<uint64_t> Refs;
+ for (auto &VI : FSum->refs())
+ Refs.push_back(VI.getGUID());
FSums.push_back(FunctionSummaryYaml{
FSum->flags().Linkage,
static_cast<bool>(FSum->flags().NotEligibleToImport),
static_cast<bool>(FSum->flags().Live),
- static_cast<bool>(FSum->flags().DSOLocal), FSum->type_tests(),
- FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(),
+ static_cast<bool>(FSum->flags().DSOLocal), Refs,
+ FSum->type_tests(), FSum->type_test_assume_vcalls(),
+ FSum->type_checked_load_vcalls(),
FSum->type_test_assume_const_vcalls(),
FSum->type_checked_load_const_vcalls()});
+ }
}
if (!FSums.empty())
io.mapRequired(llvm::utostr(P.first).c_str(), FSums);
diff --git a/contrib/llvm/include/llvm/IR/Operator.h b/contrib/llvm/include/llvm/IR/Operator.h
index 01746e4b6a29..939cec7f4aa4 100644
--- a/contrib/llvm/include/llvm/IR/Operator.h
+++ b/contrib/llvm/include/llvm/IR/Operator.h
@@ -207,17 +207,28 @@ public:
bool isFast() const { return all(); }
/// Flag setters
- void setAllowReassoc() { Flags |= AllowReassoc; }
- void setNoNaNs() { Flags |= NoNaNs; }
- void setNoInfs() { Flags |= NoInfs; }
- void setNoSignedZeros() { Flags |= NoSignedZeros; }
- void setAllowReciprocal() { Flags |= AllowReciprocal; }
- // TODO: Change the other set* functions to take a parameter?
- void setAllowContract(bool B) {
+ void setAllowReassoc(bool B = true) {
+ Flags = (Flags & ~AllowReassoc) | B * AllowReassoc;
+ }
+ void setNoNaNs(bool B = true) {
+ Flags = (Flags & ~NoNaNs) | B * NoNaNs;
+ }
+ void setNoInfs(bool B = true) {
+ Flags = (Flags & ~NoInfs) | B * NoInfs;
+ }
+ void setNoSignedZeros(bool B = true) {
+ Flags = (Flags & ~NoSignedZeros) | B * NoSignedZeros;
+ }
+ void setAllowReciprocal(bool B = true) {
+ Flags = (Flags & ~AllowReciprocal) | B * AllowReciprocal;
+ }
+ void setAllowContract(bool B = true) {
Flags = (Flags & ~AllowContract) | B * AllowContract;
}
- void setApproxFunc() { Flags |= ApproxFunc; }
- void setFast() { set(); }
+ void setApproxFunc(bool B = true) {
+ Flags = (Flags & ~ApproxFunc) | B * ApproxFunc;
+ }
+ void setFast(bool B = true) { B ? set() : clear(); }
void operator&=(const FastMathFlags &OtherFlags) {
Flags &= OtherFlags.Flags;
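Each setter above now takes a defaulted boolean and uses the clear-then-conditionally-set idiom Flags = (Flags & ~Bit) | B * Bit, so the same call can both set and clear a flag. A standalone sketch of that idiom (plain C++, independent of FastMathFlags):

    #include <cassert>

    // Sketch: a boolean parameter sets or clears one bit of a flag word.
    // (Bits & ~Bit) clears the bit; adding B * Bit re-sets it only when B is true.
    struct Flags {
      enum { NoNaNs = 1 << 0, NoInfs = 1 << 1 };
      unsigned Bits = 0;

      void setNoNaNs(bool B = true) { Bits = (Bits & ~unsigned(NoNaNs)) | B * NoNaNs; }
      bool noNaNs() const { return Bits & NoNaNs; }
    };

    int main() {
      Flags F;
      F.setNoNaNs();      // set the bit
      assert(F.noNaNs());
      F.setNoNaNs(false); // clear it with the same call
      assert(!F.noNaNs());
      return 0;
    }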
@@ -507,7 +518,7 @@ public:
});
}
- /// \brief Accumulate the constant address offset of this GEP if possible.
+ /// Accumulate the constant address offset of this GEP if possible.
///
/// This routine accepts an APInt into which it will accumulate the constant
/// offset of this GEP if the GEP is in fact constant. If the GEP is not
diff --git a/contrib/llvm/include/llvm/IR/OptBisect.h b/contrib/llvm/include/llvm/IR/OptBisect.h
index 09e67aa79246..aa24c94c0130 100644
--- a/contrib/llvm/include/llvm/IR/OptBisect.h
+++ b/contrib/llvm/include/llvm/IR/OptBisect.h
@@ -20,14 +20,34 @@
namespace llvm {
class Pass;
+class Module;
+class Function;
+class BasicBlock;
+class Region;
+class Loop;
+class CallGraphSCC;
+
+/// Extensions to this class implement mechanisms to disable passes and
+/// individual optimizations at compile time.
+class OptPassGate {
+public:
+ virtual ~OptPassGate() = default;
+
+ virtual bool shouldRunPass(const Pass *P, const Module &U) { return true; }
+ virtual bool shouldRunPass(const Pass *P, const Function &U) { return true; }
+ virtual bool shouldRunPass(const Pass *P, const BasicBlock &U) { return true; }
+ virtual bool shouldRunPass(const Pass *P, const Region &U) { return true; }
+ virtual bool shouldRunPass(const Pass *P, const Loop &U) { return true; }
+ virtual bool shouldRunPass(const Pass *P, const CallGraphSCC &U) { return true; }
+};
/// This class implements a mechanism to disable passes and individual
/// optimizations at compile time based on a command line option
/// (-opt-bisect-limit) in order to perform a bisecting search for
/// optimization-related problems.
-class OptBisect {
+class OptBisect : public OptPassGate {
public:
- /// \brief Default constructor, initializes the OptBisect state based on the
+ /// Default constructor, initializes the OptBisect state based on the
/// -opt-bisect-limit command line argument.
///
/// By default, bisection is disabled.
@@ -36,20 +56,26 @@ public:
/// through LLVMContext.
OptBisect();
+ virtual ~OptBisect() = default;
+
/// Checks the bisect limit to determine if the specified pass should run.
///
- /// This function will immediate return true if bisection is disabled. If the
- /// bisect limit is set to -1, the function will print a message describing
+ /// These functions immediately return true if bisection is disabled. If the
+ /// bisect limit is set to -1, the functions print a message describing
/// the pass and the bisect number assigned to it and return true. Otherwise,
- /// the function will print a message with the bisect number assigned to the
+ /// the functions print a message with the bisect number assigned to the
/// pass and indicating whether or not the pass will be run and return true if
- /// the bisect limit has not yet been exceded or false if it has.
+ /// the bisect limit has not yet been exceeded or false if it has.
///
- /// Most passes should not call this routine directly. Instead, it is called
- /// through a helper routine provided by the pass base class. For instance,
- /// function passes should call FunctionPass::skipFunction().
- template <class UnitT>
- bool shouldRunPass(const Pass *P, const UnitT &U);
+ /// Most passes should not call these routines directly. Instead, they are
+ /// called through helper routines provided by the pass base classes. For
+ /// instance, function passes should call FunctionPass::skipFunction().
+ bool shouldRunPass(const Pass *P, const Module &U) override;
+ bool shouldRunPass(const Pass *P, const Function &U) override;
+ bool shouldRunPass(const Pass *P, const BasicBlock &U) override;
+ bool shouldRunPass(const Pass *P, const Region &U) override;
+ bool shouldRunPass(const Pass *P, const Loop &U) override;
+ bool shouldRunPass(const Pass *P, const CallGraphSCC &U) override;
private:
bool checkPass(const StringRef PassName, const StringRef TargetDesc);
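OptPassGate replaces the old member template with a set of virtual hooks, so clients other than OptBisect can decide, per IR unit, whether a pass should run. A hedged sketch of a custom gate (the class name is illustrative; how a gate is installed, e.g. on the LLVMContext, depends on the surrounding code and is omitted here):

    #include "llvm/IR/OptBisect.h"
    #include "llvm/Pass.h"

    using namespace llvm;

    // Sketch: a gate that refuses to run any function pass but leaves the
    // remaining overloads at their default of returning true.
    struct SkipFunctionPassesGate : public OptPassGate {
      bool shouldRunPass(const Pass *P, const Function &U) override {
        return false; // never run passes over individual functions
      }
    };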
diff --git a/contrib/llvm/include/llvm/IR/PassManager.h b/contrib/llvm/include/llvm/IR/PassManager.h
index 4f838a719512..a5d4aaf71c0e 100644
--- a/contrib/llvm/include/llvm/IR/PassManager.h
+++ b/contrib/llvm/include/llvm/IR/PassManager.h
@@ -152,17 +152,17 @@ private:
/// ```
class PreservedAnalyses {
public:
- /// \brief Convenience factory function for the empty preserved set.
+ /// Convenience factory function for the empty preserved set.
static PreservedAnalyses none() { return PreservedAnalyses(); }
- /// \brief Construct a special preserved set that preserves all passes.
+ /// Construct a special preserved set that preserves all passes.
static PreservedAnalyses all() {
PreservedAnalyses PA;
PA.PreservedIDs.insert(&AllAnalysesKey);
return PA;
}
- /// \brief Construct a preserved analyses object with a single preserved set.
+ /// Construct a preserved analyses object with a single preserved set.
template <typename AnalysisSetT>
static PreservedAnalyses allInSet() {
PreservedAnalyses PA;
@@ -173,7 +173,7 @@ public:
/// Mark an analysis as preserved.
template <typename AnalysisT> void preserve() { preserve(AnalysisT::ID()); }
- /// \brief Given an analysis's ID, mark the analysis as preserved, adding it
+ /// Given an analysis's ID, mark the analysis as preserved, adding it
/// to the set.
void preserve(AnalysisKey *ID) {
// Clear this ID from the explicit not-preserved set if present.
@@ -218,7 +218,7 @@ public:
NotPreservedAnalysisIDs.insert(ID);
}
- /// \brief Intersect this set with another in place.
+ /// Intersect this set with another in place.
///
/// This is a mutating operation on this preserved set, removing all
/// preserved passes which are not also preserved in the argument.
@@ -240,7 +240,7 @@ public:
PreservedIDs.erase(ID);
}
- /// \brief Intersect this set with a temporary other set in place.
+ /// Intersect this set with a temporary other set in place.
///
/// This is a mutating operation on this preserved set, removing all
/// preserved passes which are not also preserved in the argument.
@@ -402,7 +402,7 @@ struct AnalysisInfoMixin : PassInfoMixin<DerivedT> {
}
};
-/// \brief Manages a sequence of passes over a particular unit of IR.
+/// Manages a sequence of passes over a particular unit of IR.
///
/// A pass manager contains a sequence of passes to run over a particular unit
/// of IR (e.g. Functions, Modules). It is itself a valid pass over that unit of
@@ -420,7 +420,7 @@ template <typename IRUnitT,
class PassManager : public PassInfoMixin<
PassManager<IRUnitT, AnalysisManagerT, ExtraArgTs...>> {
public:
- /// \brief Construct a pass manager.
+ /// Construct a pass manager.
///
/// If \p DebugLogging is true, we'll log our progress to llvm::dbgs().
explicit PassManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}
@@ -439,7 +439,7 @@ public:
return *this;
}
- /// \brief Run all of the passes in this manager over the given unit of IR.
+ /// Run all of the passes in this manager over the given unit of IR.
/// ExtraArgs are passed to each pass.
PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM,
ExtraArgTs... ExtraArgs) {
@@ -496,21 +496,21 @@ private:
std::vector<std::unique_ptr<PassConceptT>> Passes;
- /// \brief Flag indicating whether we should do debug logging.
+ /// Flag indicating whether we should do debug logging.
bool DebugLogging;
};
extern template class PassManager<Module>;
-/// \brief Convenience typedef for a pass manager over modules.
+/// Convenience typedef for a pass manager over modules.
using ModulePassManager = PassManager<Module>;
extern template class PassManager<Function>;
-/// \brief Convenience typedef for a pass manager over functions.
+/// Convenience typedef for a pass manager over functions.
using FunctionPassManager = PassManager<Function>;
-/// \brief A container for analyses that lazily runs them and caches their
+/// A container for analyses that lazily runs them and caches their
/// results.
///
/// This class can manage analyses for any IR unit where the address of the IR
@@ -527,7 +527,7 @@ private:
detail::AnalysisPassConcept<IRUnitT, PreservedAnalyses, Invalidator,
ExtraArgTs...>;
- /// \brief List of analysis pass IDs and associated concept pointers.
+ /// List of analysis pass IDs and associated concept pointers.
///
/// Requires iterators to be valid across appending new entries and arbitrary
/// erases. Provides the analysis ID to enable finding iterators to a given
@@ -536,10 +536,10 @@ private:
using AnalysisResultListT =
std::list<std::pair<AnalysisKey *, std::unique_ptr<ResultConceptT>>>;
- /// \brief Map type from IRUnitT pointer to our custom list type.
+ /// Map type from IRUnitT pointer to our custom list type.
using AnalysisResultListMapT = DenseMap<IRUnitT *, AnalysisResultListT>;
- /// \brief Map type from a pair of analysis ID and IRUnitT pointer to an
+ /// Map type from a pair of analysis ID and IRUnitT pointer to an
/// iterator into a particular result list (which is where the actual analysis
/// result is stored).
using AnalysisResultMapT =
@@ -634,14 +634,14 @@ public:
const AnalysisResultMapT &Results;
};
- /// \brief Construct an empty analysis manager.
+ /// Construct an empty analysis manager.
///
/// If \p DebugLogging is true, we'll log our progress to llvm::dbgs().
AnalysisManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}
AnalysisManager(AnalysisManager &&) = default;
AnalysisManager &operator=(AnalysisManager &&) = default;
- /// \brief Returns true if the analysis manager has an empty results cache.
+ /// Returns true if the analysis manager has an empty results cache.
bool empty() const {
assert(AnalysisResults.empty() == AnalysisResultLists.empty() &&
"The storage and index of analysis results disagree on how many "
@@ -649,7 +649,7 @@ public:
return AnalysisResults.empty();
}
- /// \brief Clear any cached analysis results for a single unit of IR.
+ /// Clear any cached analysis results for a single unit of IR.
///
/// This doesn't invalidate, but instead simply deletes, the relevant results.
/// It is useful when the IR is being removed and we want to clear out all the
@@ -669,7 +669,7 @@ public:
AnalysisResultLists.erase(ResultsListI);
}
- /// \brief Clear all analysis results cached by this AnalysisManager.
+ /// Clear all analysis results cached by this AnalysisManager.
///
/// Like \c clear(IRUnitT&), this doesn't invalidate the results; it simply
/// deletes them. This lets you clean up the AnalysisManager when the set of
@@ -680,7 +680,7 @@ public:
AnalysisResultLists.clear();
}
- /// \brief Get the result of an analysis pass for a given IR unit.
+ /// Get the result of an analysis pass for a given IR unit.
///
/// Runs the analysis if a cached result is not available.
template <typename PassT>
@@ -697,7 +697,7 @@ public:
return static_cast<ResultModelT &>(ResultConcept).Result;
}
- /// \brief Get the cached result of an analysis pass for a given IR unit.
+ /// Get the cached result of an analysis pass for a given IR unit.
///
/// This method never runs the analysis.
///
@@ -718,7 +718,7 @@ public:
return &static_cast<ResultModelT *>(ResultConcept)->Result;
}
- /// \brief Register an analysis pass with the manager.
+ /// Register an analysis pass with the manager.
///
/// The parameter is a callable whose result is an analysis pass. This allows
/// passing in a lambda to construct the analysis.
@@ -752,7 +752,7 @@ public:
return true;
}
- /// \brief Invalidate a specific analysis pass for an IR module.
+ /// Invalidate a specific analysis pass for an IR module.
///
/// Note that the analysis result can disregard invalidation, if it determines
/// it is in fact still valid.
@@ -762,7 +762,7 @@ public:
invalidateImpl(PassT::ID(), IR);
}
- /// \brief Invalidate cached analyses for an IR unit.
+ /// Invalidate cached analyses for an IR unit.
///
/// Walk through all of the analyses pertaining to this unit of IR and
/// invalidate them, unless they are preserved by the PreservedAnalyses set.
@@ -829,7 +829,7 @@ public:
}
private:
- /// \brief Look up a registered analysis pass.
+ /// Look up a registered analysis pass.
PassConceptT &lookUpPass(AnalysisKey *ID) {
typename AnalysisPassMapT::iterator PI = AnalysisPasses.find(ID);
assert(PI != AnalysisPasses.end() &&
@@ -837,7 +837,7 @@ private:
return *PI->second;
}
- /// \brief Look up a registered analysis pass.
+ /// Look up a registered analysis pass.
const PassConceptT &lookUpPass(AnalysisKey *ID) const {
typename AnalysisPassMapT::const_iterator PI = AnalysisPasses.find(ID);
assert(PI != AnalysisPasses.end() &&
@@ -845,7 +845,7 @@ private:
return *PI->second;
}
- /// \brief Get an analysis result, running the pass if necessary.
+ /// Get an analysis result, running the pass if necessary.
ResultConceptT &getResultImpl(AnalysisKey *ID, IRUnitT &IR,
ExtraArgTs... ExtraArgs) {
typename AnalysisResultMapT::iterator RI;
@@ -874,14 +874,14 @@ private:
return *RI->second->second;
}
- /// \brief Get a cached analysis result or return null.
+ /// Get a cached analysis result or return null.
ResultConceptT *getCachedResultImpl(AnalysisKey *ID, IRUnitT &IR) const {
typename AnalysisResultMapT::const_iterator RI =
AnalysisResults.find({ID, &IR});
return RI == AnalysisResults.end() ? nullptr : &*RI->second->second;
}
- /// \brief Invalidate a function pass result.
+ /// Invalidate a function pass result.
void invalidateImpl(AnalysisKey *ID, IRUnitT &IR) {
typename AnalysisResultMapT::iterator RI =
AnalysisResults.find({ID, &IR});
@@ -895,38 +895,38 @@ private:
AnalysisResults.erase(RI);
}
- /// \brief Map type from module analysis pass ID to pass concept pointer.
+ /// Map type from module analysis pass ID to pass concept pointer.
using AnalysisPassMapT =
DenseMap<AnalysisKey *, std::unique_ptr<PassConceptT>>;
- /// \brief Collection of module analysis passes, indexed by ID.
+ /// Collection of module analysis passes, indexed by ID.
AnalysisPassMapT AnalysisPasses;
- /// \brief Map from function to a list of function analysis results.
+ /// Map from function to a list of function analysis results.
///
/// Provides linear time removal of all analysis results for a function and
/// the ultimate storage for a particular cached analysis result.
AnalysisResultListMapT AnalysisResultLists;
- /// \brief Map from an analysis ID and function to a particular cached
+ /// Map from an analysis ID and function to a particular cached
/// analysis result.
AnalysisResultMapT AnalysisResults;
- /// \brief Indicates whether we log to \c llvm::dbgs().
+ /// Indicates whether we log to \c llvm::dbgs().
bool DebugLogging;
};
extern template class AnalysisManager<Module>;
-/// \brief Convenience typedef for the Module analysis manager.
+/// Convenience typedef for the Module analysis manager.
using ModuleAnalysisManager = AnalysisManager<Module>;
extern template class AnalysisManager<Function>;
-/// \brief Convenience typedef for the Function analysis manager.
+/// Convenience typedef for the Function analysis manager.
using FunctionAnalysisManager = AnalysisManager<Function>;
-/// \brief An analysis over an "outer" IR unit that provides access to an
+/// An analysis over an "outer" IR unit that provides access to an
/// analysis manager over an "inner" IR unit. The inner unit must be contained
/// in the outer unit.
///
@@ -977,10 +977,10 @@ public:
return *this;
}
- /// \brief Accessor for the analysis manager.
+ /// Accessor for the analysis manager.
AnalysisManagerT &getManager() { return *InnerAM; }
- /// \brief Handler for invalidation of the outer IR unit, \c IRUnitT.
+ /// Handler for invalidation of the outer IR unit, \c IRUnitT.
///
/// If the proxy analysis itself is not preserved, we assume that the set of
/// inner IR objects contained in IRUnit may have changed. In this case,
@@ -1001,7 +1001,7 @@ public:
explicit InnerAnalysisManagerProxy(AnalysisManagerT &InnerAM)
: InnerAM(&InnerAM) {}
- /// \brief Run the analysis pass and create our proxy result object.
+ /// Run the analysis pass and create our proxy result object.
///
/// This doesn't do any interesting work; it is primarily used to insert our
/// proxy result object into the outer analysis cache so that we can proxy
@@ -1040,7 +1040,7 @@ bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
extern template class InnerAnalysisManagerProxy<FunctionAnalysisManager,
Module>;
-/// \brief An analysis over an "inner" IR unit that provides access to an
+/// An analysis over an "inner" IR unit that provides access to an
/// analysis manager over an "outer" IR unit. The inner unit must be contained
/// in the outer unit.
///
@@ -1063,7 +1063,7 @@ class OuterAnalysisManagerProxy
: public AnalysisInfoMixin<
OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>> {
public:
- /// \brief Result proxy object for \c OuterAnalysisManagerProxy.
+ /// Result proxy object for \c OuterAnalysisManagerProxy.
class Result {
public:
explicit Result(const AnalysisManagerT &AM) : AM(&AM) {}
@@ -1130,7 +1130,7 @@ public:
OuterAnalysisManagerProxy(const AnalysisManagerT &AM) : AM(&AM) {}
- /// \brief Run the analysis pass and create our proxy result object.
+ /// Run the analysis pass and create our proxy result object.
/// Nothing to see here, it just forwards the \c AM reference into the
/// result.
Result run(IRUnitT &, AnalysisManager<IRUnitT, ExtraArgTs...> &,
@@ -1157,7 +1157,7 @@ extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
using ModuleAnalysisManagerFunctionProxy =
OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>;
-/// \brief Trivial adaptor that maps from a module to its functions.
+/// Trivial adaptor that maps from a module to its functions.
///
/// Designed to allow composition of a FunctionPass(Manager) and
/// a ModulePassManager, by running the FunctionPass(Manager) over every
@@ -1187,7 +1187,7 @@ public:
explicit ModuleToFunctionPassAdaptor(FunctionPassT Pass)
: Pass(std::move(Pass)) {}
- /// \brief Runs the function pass across every function in the module.
+ /// Runs the function pass across every function in the module.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
@@ -1223,7 +1223,7 @@ private:
FunctionPassT Pass;
};
-/// \brief A function to deduce a function pass type and wrap it in the
+/// A function to deduce a function pass type and wrap it in the
/// templated adaptor.
template <typename FunctionPassT>
ModuleToFunctionPassAdaptor<FunctionPassT>
@@ -1231,7 +1231,7 @@ createModuleToFunctionPassAdaptor(FunctionPassT Pass) {
return ModuleToFunctionPassAdaptor<FunctionPassT>(std::move(Pass));
}
-/// \brief A utility pass template to force an analysis result to be available.
+/// A utility pass template to force an analysis result to be available.
///
/// If there are extra arguments at the pass's run level there may also be
/// extra arguments to the analysis manager's \c getResult routine. We can't
@@ -1246,7 +1246,7 @@ template <typename AnalysisT, typename IRUnitT,
struct RequireAnalysisPass
: PassInfoMixin<RequireAnalysisPass<AnalysisT, IRUnitT, AnalysisManagerT,
ExtraArgTs...>> {
- /// \brief Run this pass over some unit of IR.
+ /// Run this pass over some unit of IR.
///
/// This pass can be run over any unit of IR and use any analysis manager
/// provided they satisfy the basic API requirements. When this pass is
@@ -1261,12 +1261,12 @@ struct RequireAnalysisPass
}
};
-/// \brief A no-op pass template which simply forces a specific analysis result
+/// A no-op pass template which simply forces a specific analysis result
/// to be invalidated.
template <typename AnalysisT>
struct InvalidateAnalysisPass
: PassInfoMixin<InvalidateAnalysisPass<AnalysisT>> {
- /// \brief Run this pass over some unit of IR.
+ /// Run this pass over some unit of IR.
///
/// This pass can be run over any unit of IR and use any analysis manager,
/// provided they satisfy the basic API requirements. When this pass is
@@ -1280,12 +1280,12 @@ struct InvalidateAnalysisPass
}
};
-/// \brief A utility pass that does nothing, but preserves no analyses.
+/// A utility pass that does nothing, but preserves no analyses.
///
/// Because this preserves no analyses, any analysis passes queried after this
/// pass runs will recompute fresh results.
struct InvalidateAllAnalysesPass : PassInfoMixin<InvalidateAllAnalysesPass> {
- /// \brief Run this pass over some unit of IR.
+ /// Run this pass over some unit of IR.
template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
PreservedAnalyses run(IRUnitT &, AnalysisManagerT &, ExtraArgTs &&...) {
return PreservedAnalyses::none();
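Taken together, the pieces above compose into a pipeline: function passes are collected in a FunctionPassManager, wrapped by createModuleToFunctionPassAdaptor, and run from a ModulePassManager against analysis managers that have been cross-registered. A hedged sketch of that wiring (assumes PassBuilder and SimplifyCFGPass are available in this tree; the chosen pass is just an example):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/Scalar/SimplifyCFG.h"

    using namespace llvm;

    // Sketch: build and run a small new-pass-manager pipeline over a Module.
    static void runPipeline(Module &M) {
      PassBuilder PB;
      LoopAnalysisManager LAM;
      FunctionAnalysisManager FAM;
      CGSCCAnalysisManager CGAM;
      ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); // wire the proxy analyses

      FunctionPassManager FPM;
      FPM.addPass(SimplifyCFGPass());               // any function pass fits here
      ModulePassManager MPM;
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      MPM.run(M, MAM);
    }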
diff --git a/contrib/llvm/include/llvm/IR/PassManagerInternal.h b/contrib/llvm/include/llvm/IR/PassManagerInternal.h
index 9195d4dfa428..16a3258b4121 100644
--- a/contrib/llvm/include/llvm/IR/PassManagerInternal.h
+++ b/contrib/llvm/include/llvm/IR/PassManagerInternal.h
@@ -29,17 +29,17 @@ template <typename IRUnitT> class AllAnalysesOn;
template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
class PreservedAnalyses;
-/// \brief Implementation details of the pass manager interfaces.
+/// Implementation details of the pass manager interfaces.
namespace detail {
-/// \brief Template for the abstract base class used to dispatch
+/// Template for the abstract base class used to dispatch
/// polymorphically over pass objects.
template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
struct PassConcept {
// Boiler plate necessary for the container of derived classes.
virtual ~PassConcept() = default;
- /// \brief The polymorphic API which runs the pass over a given IR entity.
+ /// The polymorphic API which runs the pass over a given IR entity.
///
/// Note that the actual pass object can omit the analysis manager argument if
/// desired. Also note that the analysis manager may be null if there is no
@@ -47,11 +47,11 @@ struct PassConcept {
virtual PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM,
ExtraArgTs... ExtraArgs) = 0;
- /// \brief Polymorphic method to access the name of a pass.
+ /// Polymorphic method to access the name of a pass.
virtual StringRef name() = 0;
};
-/// \brief A template wrapper used to implement the polymorphic API.
+/// A template wrapper used to implement the polymorphic API.
///
/// Can be instantiated for any object which provides a \c run method accepting
/// an \c IRUnitT& and an \c AnalysisManager<IRUnit>&. It requires the pass to
@@ -85,7 +85,7 @@ struct PassModel : PassConcept<IRUnitT, AnalysisManagerT, ExtraArgTs...> {
PassT Pass;
};
-/// \brief Abstract concept of an analysis result.
+/// Abstract concept of an analysis result.
///
/// This concept is parameterized over the IR unit that this result pertains
/// to.
@@ -93,7 +93,7 @@ template <typename IRUnitT, typename PreservedAnalysesT, typename InvalidatorT>
struct AnalysisResultConcept {
virtual ~AnalysisResultConcept() = default;
- /// \brief Method to try and mark a result as invalid.
+ /// Method to try and mark a result as invalid.
///
/// When the outer analysis manager detects a change in some underlying
/// unit of the IR, it will call this method on all of the results cached.
@@ -112,7 +112,7 @@ struct AnalysisResultConcept {
InvalidatorT &Inv) = 0;
};
-/// \brief SFINAE metafunction for computing whether \c ResultT provides an
+/// SFINAE metafunction for computing whether \c ResultT provides an
/// \c invalidate member function.
template <typename IRUnitT, typename ResultT> class ResultHasInvalidateMethod {
using EnabledType = char;
@@ -148,7 +148,7 @@ public:
enum { Value = sizeof(check<ResultT>(rank<2>())) == sizeof(EnabledType) };
};
-/// \brief Wrapper to model the analysis result concept.
+/// Wrapper to model the analysis result concept.
///
/// By default, this will implement the invalidate method with a trivial
/// implementation so that the actual analysis result doesn't need to provide
@@ -160,7 +160,7 @@ template <typename IRUnitT, typename PassT, typename ResultT,
ResultHasInvalidateMethod<IRUnitT, ResultT>::Value>
struct AnalysisResultModel;
-/// \brief Specialization of \c AnalysisResultModel which provides the default
+/// Specialization of \c AnalysisResultModel which provides the default
/// invalidate functionality.
template <typename IRUnitT, typename PassT, typename ResultT,
typename PreservedAnalysesT, typename InvalidatorT>
@@ -184,7 +184,7 @@ struct AnalysisResultModel<IRUnitT, PassT, ResultT, PreservedAnalysesT,
return *this;
}
- /// \brief The model bases invalidation solely on being in the preserved set.
+ /// The model bases invalidation solely on being in the preserved set.
//
// FIXME: We should actually use two different concepts for analysis results
// rather than two different models, and avoid the indirect function call for
@@ -199,7 +199,7 @@ struct AnalysisResultModel<IRUnitT, PassT, ResultT, PreservedAnalysesT,
ResultT Result;
};
-/// \brief Specialization of \c AnalysisResultModel which delegates invalidate
+/// Specialization of \c AnalysisResultModel which delegates invalidate
/// handling to \c ResultT.
template <typename IRUnitT, typename PassT, typename ResultT,
typename PreservedAnalysesT, typename InvalidatorT>
@@ -223,7 +223,7 @@ struct AnalysisResultModel<IRUnitT, PassT, ResultT, PreservedAnalysesT,
return *this;
}
- /// \brief The model delegates to the \c ResultT method.
+ /// The model delegates to the \c ResultT method.
bool invalidate(IRUnitT &IR, const PreservedAnalysesT &PA,
InvalidatorT &Inv) override {
return Result.invalidate(IR, PA, Inv);
@@ -232,7 +232,7 @@ struct AnalysisResultModel<IRUnitT, PassT, ResultT, PreservedAnalysesT,
ResultT Result;
};
-/// \brief Abstract concept of an analysis pass.
+/// Abstract concept of an analysis pass.
///
/// This concept is parameterized over the IR unit that it can run over and
/// produce an analysis result.
@@ -241,7 +241,7 @@ template <typename IRUnitT, typename PreservedAnalysesT, typename InvalidatorT,
struct AnalysisPassConcept {
virtual ~AnalysisPassConcept() = default;
- /// \brief Method to run this analysis over a unit of IR.
+ /// Method to run this analysis over a unit of IR.
/// \returns A unique_ptr to the analysis result object to be queried by
/// users.
virtual std::unique_ptr<
@@ -249,11 +249,11 @@ struct AnalysisPassConcept {
run(IRUnitT &IR, AnalysisManager<IRUnitT, ExtraArgTs...> &AM,
ExtraArgTs... ExtraArgs) = 0;
- /// \brief Polymorphic method to access the name of a pass.
+ /// Polymorphic method to access the name of a pass.
virtual StringRef name() = 0;
};
-/// \brief Wrapper to model the analysis pass concept.
+/// Wrapper to model the analysis pass concept.
///
/// Can wrap any type which implements a suitable \c run method. The method
/// must accept an \c IRUnitT& and an \c AnalysisManager<IRUnitT>& as arguments
@@ -283,7 +283,7 @@ struct AnalysisPassModel : AnalysisPassConcept<IRUnitT, PreservedAnalysesT,
AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
PreservedAnalysesT, InvalidatorT>;
- /// \brief The model delegates to the \c PassT::run method.
+ /// The model delegates to the \c PassT::run method.
///
/// The return is wrapped in an \c AnalysisResultModel.
std::unique_ptr<
@@ -293,7 +293,7 @@ struct AnalysisPassModel : AnalysisPassConcept<IRUnitT, PreservedAnalysesT,
return llvm::make_unique<ResultModelT>(Pass.run(IR, AM, ExtraArgs...));
}
- /// \brief The model delegates to a static \c PassT::name method.
+ /// The model delegates to a static \c PassT::name method.
///
/// The returned string ref must point to constant immutable data!
StringRef name() override { return PassT::name(); }
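ResultHasInvalidateMethod above is a SFINAE metafunction: overload resolution against a ranked helper decides whether ResultT exposes a usable invalidate member, which in turn selects the matching AnalysisResultModel specialization. A standalone sketch of the same detection idiom (generic C++, not tied to the pass-manager types):

    #include <iostream>
    #include <type_traits>
    #include <utility>

    // Sketch: detect at compile time whether T has a member `bool invalidate(int)`.
    template <typename T> class HasInvalidate {
      template <typename U>
      static auto check(int)
          -> decltype(static_cast<bool>(std::declval<U &>().invalidate(0)),
                      std::true_type());
      template <typename> static std::false_type check(...);

    public:
      static constexpr bool value = decltype(check<T>(0))::value;
    };

    struct Plain {};
    struct Smart { bool invalidate(int) { return true; } };

    int main() {
      std::cout << HasInvalidate<Plain>::value << "\n"; // 0
      std::cout << HasInvalidate<Smart>::value << "\n"; // 1
      return 0;
    }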
diff --git a/contrib/llvm/include/llvm/IR/PatternMatch.h b/contrib/llvm/include/llvm/IR/PatternMatch.h
index 245d72fbd16e..af0616cd8221 100644
--- a/contrib/llvm/include/llvm/IR/PatternMatch.h
+++ b/contrib/llvm/include/llvm/IR/PatternMatch.h
@@ -8,10 +8,10 @@
//===----------------------------------------------------------------------===//
//
// This file provides a simple and efficient mechanism for performing general
-// tree-based pattern matches on the LLVM IR. The power of these routines is
+// tree-based pattern matches on the LLVM IR. The power of these routines is
// that it allows you to write concise patterns that are expressive and easy to
-// understand. The other major advantage of this is that it allows you to
-// trivially capture/bind elements in the pattern to variables. For example,
+// understand. The other major advantage of this is that it allows you to
+// trivially capture/bind elements in the pattern to variables. For example,
// you can do something like this:
//
// Value *Exp = ...
@@ -68,26 +68,26 @@ template <typename Class> struct class_match {
template <typename ITy> bool match(ITy *V) { return isa<Class>(V); }
};
-/// \brief Match an arbitrary value and ignore it.
+/// Match an arbitrary value and ignore it.
inline class_match<Value> m_Value() { return class_match<Value>(); }
-/// \brief Match an arbitrary binary operation and ignore it.
+/// Match an arbitrary binary operation and ignore it.
inline class_match<BinaryOperator> m_BinOp() {
return class_match<BinaryOperator>();
}
-/// \brief Matches any compare instruction and ignore it.
+/// Matches any compare instruction and ignore it.
inline class_match<CmpInst> m_Cmp() { return class_match<CmpInst>(); }
-/// \brief Match an arbitrary ConstantInt and ignore it.
+/// Match an arbitrary ConstantInt and ignore it.
inline class_match<ConstantInt> m_ConstantInt() {
return class_match<ConstantInt>();
}
-/// \brief Match an arbitrary undef constant.
+/// Match an arbitrary undef constant.
inline class_match<UndefValue> m_Undef() { return class_match<UndefValue>(); }
-/// \brief Match an arbitrary Constant and ignore it.
+/// Match an arbitrary Constant and ignore it.
inline class_match<Constant> m_Constant() { return class_match<Constant>(); }
/// Matching combinators
@@ -132,89 +132,6 @@ inline match_combine_and<LTy, RTy> m_CombineAnd(const LTy &L, const RTy &R) {
return match_combine_and<LTy, RTy>(L, R);
}
-struct match_zero {
- template <typename ITy> bool match(ITy *V) {
- if (const auto *C = dyn_cast<Constant>(V))
- return C->isNullValue();
- return false;
- }
-};
-
-/// \brief Match an arbitrary zero/null constant. This includes
-/// zero_initializer for vectors and ConstantPointerNull for pointers.
-inline match_zero m_Zero() { return match_zero(); }
-
-struct match_neg_zero {
- template <typename ITy> bool match(ITy *V) {
- if (const auto *C = dyn_cast<Constant>(V))
- return C->isNegativeZeroValue();
- return false;
- }
-};
-
-/// \brief Match an arbitrary zero/null constant. This includes
-/// zero_initializer for vectors and ConstantPointerNull for pointers. For
-/// floating point constants, this will match negative zero but not positive
-/// zero
-inline match_neg_zero m_NegZero() { return match_neg_zero(); }
-
-struct match_any_zero {
- template <typename ITy> bool match(ITy *V) {
- if (const auto *C = dyn_cast<Constant>(V))
- return C->isZeroValue();
- return false;
- }
-};
-
-/// \brief - Match an arbitrary zero/null constant. This includes
-/// zero_initializer for vectors and ConstantPointerNull for pointers. For
-/// floating point constants, this will match negative zero and positive zero
-inline match_any_zero m_AnyZero() { return match_any_zero(); }
-
-struct match_nan {
- template <typename ITy> bool match(ITy *V) {
- if (const auto *C = dyn_cast<ConstantFP>(V))
- return C->isNaN();
- return false;
- }
-};
-
-/// Match an arbitrary NaN constant. This includes quiet and signalling nans.
-inline match_nan m_NaN() { return match_nan(); }
-
-struct match_one {
- template <typename ITy> bool match(ITy *V) {
- if (const auto *C = dyn_cast<Constant>(V))
- return C->isOneValue();
- return false;
- }
-};
-
-/// \brief Match an integer 1 or a vector with all elements equal to 1.
-inline match_one m_One() { return match_one(); }
-
-struct match_all_ones {
- template <typename ITy> bool match(ITy *V) {
- if (const auto *C = dyn_cast<Constant>(V))
- return C->isAllOnesValue();
- return false;
- }
-};
-
-/// \brief Match an integer or vector with all bits set to true.
-inline match_all_ones m_AllOnes() { return match_all_ones(); }
-
-struct match_sign_mask {
- template <typename ITy> bool match(ITy *V) {
- if (const auto *C = dyn_cast<Constant>(V))
- return C->isMinSignedValue();
- return false;
- }
-};
-
-/// \brief Match an integer or vector with only the sign bit(s) set.
-inline match_sign_mask m_SignMask() { return match_sign_mask(); }
-
struct apint_match {
const APInt *&Res;
@@ -255,11 +172,11 @@ struct apfloat_match {
}
};
-/// \brief Match a ConstantInt or splatted ConstantVector, binding the
+/// Match a ConstantInt or splatted ConstantVector, binding the
/// specified pointer to the contained APInt.
inline apint_match m_APInt(const APInt *&Res) { return Res; }
-/// \brief Match a ConstantFP or splatted ConstantVector, binding the
+/// Match a ConstantFP or splatted ConstantVector, binding the
/// specified pointer to the contained APFloat.
inline apfloat_match m_APFloat(const APFloat *&Res) { return Res; }
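m_APInt and m_APFloat bind the matched constant (a scalar or a splatted vector) to a pointer, so the caller can inspect the value after a successful match. A small usage sketch (assumes the PatternMatch header; the helper and the transformation it implies are illustrative):

    #include "llvm/IR/PatternMatch.h"

    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Sketch: recognize `X << C` where C is a constant int (scalar or splat
    // vector) and read the shift amount out of the bound APInt.
    static bool isShiftByConstant(Value *V, uint64_t &AmountOut) {
      Value *X;
      const APInt *C;
      if (match(V, m_Shl(m_Value(X), m_APInt(C)))) {
        AmountOut = C->getZExtValue();
        return true;
      }
      return false;
    }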
@@ -278,26 +195,44 @@ template <int64_t Val> struct constantint_match {
}
};
-/// \brief Match a ConstantInt with a specific value.
+/// Match a ConstantInt with a specific value.
template <int64_t Val> inline constantint_match<Val> m_ConstantInt() {
return constantint_match<Val>();
}
-/// \brief This helper class is used to match scalar and vector constants that
+/// This helper class is used to match scalar and vector integer constants that
/// satisfy a specified predicate.
+/// For vector constants, undefined elements are ignored.
template <typename Predicate> struct cst_pred_ty : public Predicate {
template <typename ITy> bool match(ITy *V) {
if (const auto *CI = dyn_cast<ConstantInt>(V))
return this->isValue(CI->getValue());
- if (V->getType()->isVectorTy())
- if (const auto *C = dyn_cast<Constant>(V))
+ if (V->getType()->isVectorTy()) {
+ if (const auto *C = dyn_cast<Constant>(V)) {
if (const auto *CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue()))
return this->isValue(CI->getValue());
+
+ // Non-splat vector constant: check each element for a match.
+ unsigned NumElts = V->getType()->getVectorNumElements();
+ assert(NumElts != 0 && "Constant vector with no elements?");
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt)
+ return false;
+ if (isa<UndefValue>(Elt))
+ continue;
+ auto *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || !this->isValue(CI->getValue()))
+ return false;
+ }
+ return true;
+ }
+ }
return false;
}
};
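With the per-element loop above, predicate matchers built on cst_pred_ty accept non-splat vector constants as long as every defined element satisfies the predicate, and undefined elements are simply skipped. A hedged sketch of what that enables (the helper is illustrative):

    #include "llvm/IR/PatternMatch.h"

    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Sketch: `X * 1` folds to X even when the constant is a vector such as
    // <i32 1, i32 undef, i32 1>, because undef lanes are ignored by m_One().
    static Value *foldMulByOne(Value *V) {
      Value *X;
      if (match(V, m_Mul(m_Value(X), m_One())))
        return X;
      return nullptr;
    }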
-/// \brief This helper class is used to match scalar and vector constants that
+/// This helper class is used to match scalar and vector constants that
/// satisfy a specified predicate, and bind them to an APInt.
template <typename Predicate> struct api_pred_ty : public Predicate {
const APInt *&Res;
@@ -322,20 +257,202 @@ template <typename Predicate> struct api_pred_ty : public Predicate {
}
};
-struct is_power2 {
- bool isValue(const APInt &C) { return C.isPowerOf2(); }
+/// This helper class is used to match scalar and vector floating-point
+/// constants that satisfy a specified predicate.
+/// For vector constants, undefined elements are ignored.
+template <typename Predicate> struct cstfp_pred_ty : public Predicate {
+ template <typename ITy> bool match(ITy *V) {
+ if (const auto *CF = dyn_cast<ConstantFP>(V))
+ return this->isValue(CF->getValueAPF());
+ if (V->getType()->isVectorTy()) {
+ if (const auto *C = dyn_cast<Constant>(V)) {
+ if (const auto *CF = dyn_cast_or_null<ConstantFP>(C->getSplatValue()))
+ return this->isValue(CF->getValueAPF());
+
+ // Non-splat vector constant: check each element for a match.
+ unsigned NumElts = V->getType()->getVectorNumElements();
+ assert(NumElts != 0 && "Constant vector with no elements?");
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt)
+ return false;
+ if (isa<UndefValue>(Elt))
+ continue;
+ auto *CF = dyn_cast<ConstantFP>(Elt);
+ if (!CF || !this->isValue(CF->getValueAPF()))
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+ }
};
-/// \brief Match an integer or vector power of 2.
-inline cst_pred_ty<is_power2> m_Power2() { return cst_pred_ty<is_power2>(); }
-inline api_pred_ty<is_power2> m_Power2(const APInt *&V) { return V; }
+///////////////////////////////////////////////////////////////////////////////
+//
+// Encapsulate constant value queries for use in templated predicate matchers.
+// This allows checking whether constants match using compound predicates and
+// works with vector constants, possibly with relaxed constraints (for example,
+// ignoring undef values).
+//
+///////////////////////////////////////////////////////////////////////////////
+
+struct is_all_ones {
+ bool isValue(const APInt &C) { return C.isAllOnesValue(); }
+};
+/// Match an integer or vector with all bits set.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_all_ones> m_AllOnes() {
+ return cst_pred_ty<is_all_ones>();
+}
struct is_maxsignedvalue {
bool isValue(const APInt &C) { return C.isMaxSignedValue(); }
};
+/// Match an integer or vector with values having all bits except for the high
+/// bit set (0x7f...).
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_maxsignedvalue> m_MaxSignedValue() {
+ return cst_pred_ty<is_maxsignedvalue>();
+}
+inline api_pred_ty<is_maxsignedvalue> m_MaxSignedValue(const APInt *&V) {
+ return V;
+}
-inline cst_pred_ty<is_maxsignedvalue> m_MaxSignedValue() { return cst_pred_ty<is_maxsignedvalue>(); }
-inline api_pred_ty<is_maxsignedvalue> m_MaxSignedValue(const APInt *&V) { return V; }
+struct is_negative {
+ bool isValue(const APInt &C) { return C.isNegative(); }
+};
+/// Match an integer or vector of negative values.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_negative> m_Negative() {
+ return cst_pred_ty<is_negative>();
+}
+inline api_pred_ty<is_negative> m_Negative(const APInt *&V) {
+ return V;
+}
+
+struct is_nonnegative {
+ bool isValue(const APInt &C) { return C.isNonNegative(); }
+};
+/// Match an integer or vector of nonnegative values.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_nonnegative> m_NonNegative() {
+ return cst_pred_ty<is_nonnegative>();
+}
+inline api_pred_ty<is_nonnegative> m_NonNegative(const APInt *&V) {
+ return V;
+}
+
+struct is_one {
+ bool isValue(const APInt &C) { return C.isOneValue(); }
+};
+/// Match an integer 1 or a vector with all elements equal to 1.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_one> m_One() {
+ return cst_pred_ty<is_one>();
+}
+
+struct is_zero_int {
+ bool isValue(const APInt &C) { return C.isNullValue(); }
+};
+/// Match an integer 0 or a vector with all elements equal to 0.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_zero_int> m_ZeroInt() {
+ return cst_pred_ty<is_zero_int>();
+}
+
+struct is_zero {
+ template <typename ITy> bool match(ITy *V) {
+ auto *C = dyn_cast<Constant>(V);
+ return C && (C->isNullValue() || cst_pred_ty<is_zero_int>().match(C));
+ }
+};
+/// Match any null constant or a vector with all elements equal to 0.
+/// For vectors, this includes constants with undefined elements.
+inline is_zero m_Zero() {
+ return is_zero();
+}
+
+struct is_power2 {
+ bool isValue(const APInt &C) { return C.isPowerOf2(); }
+};
+/// Match an integer or vector power-of-2.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_power2> m_Power2() {
+ return cst_pred_ty<is_power2>();
+}
+inline api_pred_ty<is_power2> m_Power2(const APInt *&V) {
+ return V;
+}
+
+struct is_power2_or_zero {
+ bool isValue(const APInt &C) { return !C || C.isPowerOf2(); }
+};
+/// Match an integer or vector of 0 or power-of-2 values.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_power2_or_zero> m_Power2OrZero() {
+ return cst_pred_ty<is_power2_or_zero>();
+}
+inline api_pred_ty<is_power2_or_zero> m_Power2OrZero(const APInt *&V) {
+ return V;
+}
+
+struct is_sign_mask {
+ bool isValue(const APInt &C) { return C.isSignMask(); }
+};
+/// Match an integer or vector with only the sign bit(s) set.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_sign_mask> m_SignMask() {
+ return cst_pred_ty<is_sign_mask>();
+}
+
+struct is_lowbit_mask {
+ bool isValue(const APInt &C) { return C.isMask(); }
+};
+/// Match an integer or vector with only the low bit(s) set.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_lowbit_mask> m_LowBitMask() {
+ return cst_pred_ty<is_lowbit_mask>();
+}
+
+struct is_nan {
+ bool isValue(const APFloat &C) { return C.isNaN(); }
+};
+/// Match an arbitrary NaN constant. This includes quiet and signaling NaNs.
+/// For vectors, this includes constants with undefined elements.
+inline cstfp_pred_ty<is_nan> m_NaN() {
+ return cstfp_pred_ty<is_nan>();
+}
+
+struct is_any_zero_fp {
+ bool isValue(const APFloat &C) { return C.isZero(); }
+};
+/// Match a floating-point negative zero or positive zero.
+/// For vectors, this includes constants with undefined elements.
+inline cstfp_pred_ty<is_any_zero_fp> m_AnyZeroFP() {
+ return cstfp_pred_ty<is_any_zero_fp>();
+}
+
+struct is_pos_zero_fp {
+ bool isValue(const APFloat &C) { return C.isPosZero(); }
+};
+/// Match a floating-point positive zero.
+/// For vectors, this includes constants with undefined elements.
+inline cstfp_pred_ty<is_pos_zero_fp> m_PosZeroFP() {
+ return cstfp_pred_ty<is_pos_zero_fp>();
+}
+
+struct is_neg_zero_fp {
+ bool isValue(const APFloat &C) { return C.isNegZero(); }
+};
+/// Match a floating-point negative zero.
+/// For vectors, this includes constants with undefined elements.
+inline cstfp_pred_ty<is_neg_zero_fp> m_NegZeroFP() {
+ return cstfp_pred_ty<is_neg_zero_fp>();
+}
+
+///////////////////////////////////////////////////////////////////////////////
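As a usage sketch (not part of the header; assumes the usual "llvm/IR/PatternMatch.h" include and a 'using namespace llvm::PatternMatch;' directive, with Rem and Builder as hypothetical locals), these predicates compose with match() to bind the constant they accept:

    // Fold 'urem X, 2^k' into 'and X, 2^k-1' by binding the power-of-2 divisor.
    const APInt *C;
    Value *X;
    if (match(Rem, m_URem(m_Value(X), m_Power2(C))))
      return Builder.CreateAnd(X, ConstantInt::get(X->getType(), *C - 1));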
template <typename Class> struct bind_ty {
Class *&VR;
@@ -351,25 +468,25 @@ template <typename Class> struct bind_ty {
}
};
-/// \brief Match a value, capturing it if we match.
+/// Match a value, capturing it if we match.
inline bind_ty<Value> m_Value(Value *&V) { return V; }
inline bind_ty<const Value> m_Value(const Value *&V) { return V; }
-/// \brief Match an instruction, capturing it if we match.
+/// Match an instruction, capturing it if we match.
inline bind_ty<Instruction> m_Instruction(Instruction *&I) { return I; }
-/// \brief Match a binary operator, capturing it if we match.
+/// Match a binary operator, capturing it if we match.
inline bind_ty<BinaryOperator> m_BinOp(BinaryOperator *&I) { return I; }
-/// \brief Match a ConstantInt, capturing the value if we match.
+/// Match a ConstantInt, capturing the value if we match.
inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; }
-/// \brief Match a Constant, capturing the value if we match.
+/// Match a Constant, capturing the value if we match.
inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }
-/// \brief Match a ConstantFP, capturing the value if we match.
+/// Match a ConstantFP, capturing the value if we match.
inline bind_ty<ConstantFP> m_ConstantFP(ConstantFP *&C) { return C; }
-/// \brief Match a specified Value*.
+/// Match a specified Value*.
struct specificval_ty {
const Value *Val;
@@ -378,10 +495,26 @@ struct specificval_ty {
template <typename ITy> bool match(ITy *V) { return V == Val; }
};
-/// \brief Match if we have a specific specified value.
+/// Match if we have a specific specified value.
inline specificval_ty m_Specific(const Value *V) { return V; }
-/// \brief Match a specified floating point value or vector of all elements of
+/// Stores a reference to the Value *, not the Value * itself,
+/// thus can be used in commutative matchers.
+template <typename Class> struct deferredval_ty {
+ Class *const &Val;
+
+ deferredval_ty(Class *const &V) : Val(V) {}
+
+ template <typename ITy> bool match(ITy *const V) { return V == Val; }
+};
+
+/// A commutative-friendly version of m_Specific().
+inline deferredval_ty<Value> m_Deferred(Value *const &V) { return V; }
+inline deferredval_ty<const Value> m_Deferred(const Value *const &V) {
+ return V;
+}
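A brief usage sketch (V and X are hypothetical values): unlike m_Specific, m_Deferred reads the captured pointer at match time, so it can refer to a value bound earlier in the same commutative pattern:

    Value *X;
    if (match(V, m_c_And(m_Value(X), m_Deferred(X)))) {
      // V is 'and X, X', with the operands accepted in either order.
    }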
+
+/// Match a specified floating point value or vector of all elements of
/// that value.
struct specific_fpval {
double Val;
@@ -399,11 +532,11 @@ struct specific_fpval {
}
};
-/// \brief Match a specific floating point value or vector with all elements
+/// Match a specific floating point value or vector with all elements
/// equal to the value.
inline specific_fpval m_SpecificFP(double V) { return specific_fpval(V); }
-/// \brief Match a float 1.0 or vector with all elements equal to 1.0.
+/// Match a float 1.0 or vector with all elements equal to 1.0.
inline specific_fpval m_FPOne() { return m_SpecificFP(1.0); }
struct bind_const_intval_ty {
@@ -421,7 +554,7 @@ struct bind_const_intval_ty {
}
};
-/// \brief Match a specified integer value or vector of all elements of that
+/// Match a specified integer value or vector of all elements of that
/// value.
struct specific_intval {
uint64_t Val;
@@ -438,11 +571,11 @@ struct specific_intval {
}
};
-/// \brief Match a specific integer value or vector with all elements equal to
+/// Match a specific integer value or vector with all elements equal to
/// the value.
inline specific_intval m_SpecificInt(uint64_t V) { return specific_intval(V); }
-/// \brief Match a ConstantInt and bind to its value. This does not match
+/// Match a ConstantInt and bind to its value. This does not match
/// ConstantInts wider than 64-bits.
inline bind_const_intval_ty m_ConstantInt(uint64_t &V) { return V; }
@@ -454,13 +587,15 @@ struct AnyBinaryOp_match {
LHS_t L;
RHS_t R;
+ // The evaluation order is always stable, regardless of Commutability.
+ // The LHS is always matched first.
AnyBinaryOp_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
template <typename OpTy> bool match(OpTy *V) {
if (auto *I = dyn_cast<BinaryOperator>(V))
return (L.match(I->getOperand(0)) && R.match(I->getOperand(1))) ||
- (Commutable && R.match(I->getOperand(0)) &&
- L.match(I->getOperand(1)));
+ (Commutable && L.match(I->getOperand(1)) &&
+ R.match(I->getOperand(0)));
return false;
}
};
@@ -480,20 +615,22 @@ struct BinaryOp_match {
LHS_t L;
RHS_t R;
+ // The evaluation order is always stable, regardless of Commutability.
+ // The LHS is always matched first.
BinaryOp_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
template <typename OpTy> bool match(OpTy *V) {
if (V->getValueID() == Value::InstructionVal + Opcode) {
auto *I = cast<BinaryOperator>(V);
return (L.match(I->getOperand(0)) && R.match(I->getOperand(1))) ||
- (Commutable && R.match(I->getOperand(0)) &&
- L.match(I->getOperand(1)));
+ (Commutable && L.match(I->getOperand(1)) &&
+ R.match(I->getOperand(0)));
}
if (auto *CE = dyn_cast<ConstantExpr>(V))
return CE->getOpcode() == Opcode &&
((L.match(CE->getOperand(0)) && R.match(CE->getOperand(1))) ||
- (Commutable && R.match(CE->getOperand(0)) &&
- L.match(CE->getOperand(1))));
+ (Commutable && L.match(CE->getOperand(1)) &&
+ R.match(CE->getOperand(0))));
return false;
}
};
@@ -522,6 +659,13 @@ inline BinaryOp_match<LHS, RHS, Instruction::FSub> m_FSub(const LHS &L,
return BinaryOp_match<LHS, RHS, Instruction::FSub>(L, R);
}
+/// Match 'fneg X' as 'fsub -0.0, X'.
+template <typename RHS>
+inline BinaryOp_match<cstfp_pred_ty<is_neg_zero_fp>, RHS, Instruction::FSub>
+m_FNeg(const RHS &X) {
+ return m_FSub(m_NegZeroFP(), X);
+}
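Usage sketch (V and X are hypothetical): the canonical IR spelling of a floating-point negation is matched and its operand captured:

    Value *X;
    if (match(V, m_FNeg(m_Value(X)))) {
      // V is 'fsub -0.0, X', i.e. a negation of X.
    }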
+
template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, Instruction::Mul> m_Mul(const LHS &L,
const RHS &R) {
@@ -746,35 +890,35 @@ struct is_idiv_op {
}
};
-/// \brief Matches shift operations.
+/// Matches shift operations.
template <typename LHS, typename RHS>
inline BinOpPred_match<LHS, RHS, is_shift_op> m_Shift(const LHS &L,
const RHS &R) {
return BinOpPred_match<LHS, RHS, is_shift_op>(L, R);
}
-/// \brief Matches logical shift operations.
+/// Matches all right-shift operations (lshr and ashr).
template <typename LHS, typename RHS>
inline BinOpPred_match<LHS, RHS, is_right_shift_op> m_Shr(const LHS &L,
const RHS &R) {
return BinOpPred_match<LHS, RHS, is_right_shift_op>(L, R);
}
-/// \brief Matches logical shift operations.
+/// Matches logical shift operations.
template <typename LHS, typename RHS>
inline BinOpPred_match<LHS, RHS, is_logical_shift_op>
m_LogicalShift(const LHS &L, const RHS &R) {
return BinOpPred_match<LHS, RHS, is_logical_shift_op>(L, R);
}
-/// \brief Matches bitwise logic operations.
+/// Matches bitwise logic operations.
template <typename LHS, typename RHS>
inline BinOpPred_match<LHS, RHS, is_bitwiselogic_op>
m_BitwiseLogic(const LHS &L, const RHS &R) {
return BinOpPred_match<LHS, RHS, is_bitwiselogic_op>(L, R);
}
-/// \brief Matches integer division operations.
+/// Matches integer division operations.
template <typename LHS, typename RHS>
inline BinOpPred_match<LHS, RHS, is_idiv_op> m_IDiv(const LHS &L,
const RHS &R) {
@@ -811,14 +955,16 @@ struct CmpClass_match {
LHS_t L;
RHS_t R;
+ // The evaluation order is always stable, regardless of Commutability.
+ // The LHS is always matched first.
CmpClass_match(PredicateTy &Pred, const LHS_t &LHS, const RHS_t &RHS)
: Predicate(Pred), L(LHS), R(RHS) {}
template <typename OpTy> bool match(OpTy *V) {
if (auto *I = dyn_cast<Class>(V))
if ((L.match(I->getOperand(0)) && R.match(I->getOperand(1))) ||
- (Commutable && R.match(I->getOperand(0)) &&
- L.match(I->getOperand(1)))) {
+ (Commutable && L.match(I->getOperand(1)) &&
+ R.match(I->getOperand(0)))) {
Predicate = I->getPredicate();
return true;
}
@@ -871,7 +1017,7 @@ inline SelectClass_match<Cond, LHS, RHS> m_Select(const Cond &C, const LHS &L,
return SelectClass_match<Cond, LHS, RHS>(C, L, R);
}
-/// \brief This matches a select of two constants, e.g.:
+/// This matches a select of two constants, e.g.:
/// m_SelectCst<-1, 0>(m_Value(V))
template <int64_t L, int64_t R, typename Cond>
inline SelectClass_match<Cond, constantint_match<L>, constantint_match<R>>
@@ -880,6 +1026,84 @@ m_SelectCst(const Cond &C) {
}
//===----------------------------------------------------------------------===//
+// Matchers for InsertElementInst classes
+//
+
+template <typename Val_t, typename Elt_t, typename Idx_t>
+struct InsertElementClass_match {
+ Val_t V;
+ Elt_t E;
+ Idx_t I;
+
+ InsertElementClass_match(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
+ : V(Val), E(Elt), I(Idx) {}
+
+ template <typename OpTy> bool match(OpTy *VV) {
+ if (auto *II = dyn_cast<InsertElementInst>(VV))
+ return V.match(II->getOperand(0)) && E.match(II->getOperand(1)) &&
+ I.match(II->getOperand(2));
+ return false;
+ }
+};
+
+template <typename Val_t, typename Elt_t, typename Idx_t>
+inline InsertElementClass_match<Val_t, Elt_t, Idx_t>
+m_InsertElement(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx) {
+ return InsertElementClass_match<Val_t, Elt_t, Idx_t>(Val, Elt, Idx);
+}
+
+//===----------------------------------------------------------------------===//
+// Matchers for ExtractElementInst classes
+//
+
+template <typename Val_t, typename Idx_t> struct ExtractElementClass_match {
+ Val_t V;
+ Idx_t I;
+
+ ExtractElementClass_match(const Val_t &Val, const Idx_t &Idx)
+ : V(Val), I(Idx) {}
+
+ template <typename OpTy> bool match(OpTy *VV) {
+ if (auto *II = dyn_cast<ExtractElementInst>(VV))
+ return V.match(II->getOperand(0)) && I.match(II->getOperand(1));
+ return false;
+ }
+};
+
+template <typename Val_t, typename Idx_t>
+inline ExtractElementClass_match<Val_t, Idx_t>
+m_ExtractElement(const Val_t &Val, const Idx_t &Idx) {
+ return ExtractElementClass_match<Val_t, Idx_t>(Val, Idx);
+}
+
+//===----------------------------------------------------------------------===//
+// Matchers for ShuffleVectorInst classes
+//
+
+template <typename V1_t, typename V2_t, typename Mask_t>
+struct ShuffleVectorClass_match {
+ V1_t V1;
+ V2_t V2;
+ Mask_t M;
+
+ ShuffleVectorClass_match(const V1_t &v1, const V2_t &v2, const Mask_t &m)
+ : V1(v1), V2(v2), M(m) {}
+
+ template <typename OpTy> bool match(OpTy *V) {
+ if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
+ return V1.match(SI->getOperand(0)) && V2.match(SI->getOperand(1)) &&
+ M.match(SI->getOperand(2));
+ return false;
+ }
+};
+
+template <typename V1_t, typename V2_t, typename Mask_t>
+inline ShuffleVectorClass_match<V1_t, V2_t, Mask_t>
+m_ShuffleVector(const V1_t &v1, const V2_t &v2, const Mask_t &m) {
+ return ShuffleVectorClass_match<V1_t, V2_t, Mask_t>(v1, v2, m);
+}
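Usage sketch (I and the bound values are hypothetical): the vector matchers nest like any other pattern, so an insertelement feeding a shufflevector can be decomposed in a single match() call:

    Value *Vec, *Elt, *Idx, *V2, *Mask;
    if (match(I, m_ShuffleVector(
                     m_InsertElement(m_Value(Vec), m_Value(Elt), m_Value(Idx)),
                     m_Value(V2), m_Value(Mask)))) {
      // I is 'shufflevector (insertelement Vec, Elt, Idx), V2, Mask'.
    }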
+
+//===----------------------------------------------------------------------===//
// Matchers for CastInst classes
//
@@ -895,31 +1119,31 @@ template <typename Op_t, unsigned Opcode> struct CastClass_match {
}
};
-/// \brief Matches BitCast.
+/// Matches BitCast.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::BitCast> m_BitCast(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::BitCast>(Op);
}
-/// \brief Matches PtrToInt.
+/// Matches PtrToInt.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::PtrToInt> m_PtrToInt(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::PtrToInt>(Op);
}
-/// \brief Matches Trunc.
+/// Matches Trunc.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::Trunc> m_Trunc(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::Trunc>(Op);
}
-/// \brief Matches SExt.
+/// Matches SExt.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::SExt> m_SExt(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::SExt>(Op);
}
-/// \brief Matches ZExt.
+/// Matches ZExt.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::ZExt> m_ZExt(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::ZExt>(Op);
@@ -932,25 +1156,25 @@ m_ZExtOrSExt(const OpTy &Op) {
return m_CombineOr(m_ZExt(Op), m_SExt(Op));
}
-/// \brief Matches UIToFP.
+/// Matches UIToFP.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::UIToFP> m_UIToFP(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::UIToFP>(Op);
}
-/// \brief Matches SIToFP.
+/// Matches SIToFP.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::SIToFP> m_SIToFP(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::SIToFP>(Op);
}
-/// \brief Matches FPTrunc
+/// Matches FPTrunc
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::FPTrunc> m_FPTrunc(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::FPTrunc>(Op);
}
-/// \brief Matches FPExt
+/// Matches FPExt
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::FPExt> m_FPExt(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::FPExt>(Op);
@@ -976,80 +1200,32 @@ template <typename Op_t> struct LoadClass_match {
template <typename OpTy> inline LoadClass_match<OpTy> m_Load(const OpTy &Op) {
return LoadClass_match<OpTy>(Op);
}
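Usage sketch (I and Ptr are hypothetical): m_Load captures the pointer operand of a load:

    Value *Ptr;
    if (match(I, m_Load(m_Value(Ptr)))) {
      // I is 'load ..., Ptr'.
    }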
+
//===----------------------------------------------------------------------===//
-// Matchers for unary operators
+// Matcher for StoreInst classes
//
-template <typename LHS_t> struct not_match {
- LHS_t L;
-
- not_match(const LHS_t &LHS) : L(LHS) {}
-
- template <typename OpTy> bool match(OpTy *V) {
- if (auto *O = dyn_cast<Operator>(V))
- if (O->getOpcode() == Instruction::Xor) {
- if (isAllOnes(O->getOperand(1)))
- return L.match(O->getOperand(0));
- if (isAllOnes(O->getOperand(0)))
- return L.match(O->getOperand(1));
- }
- return false;
- }
-
-private:
- bool isAllOnes(Value *V) {
- return isa<Constant>(V) && cast<Constant>(V)->isAllOnesValue();
- }
-};
-
-template <typename LHS> inline not_match<LHS> m_Not(const LHS &L) { return L; }
-
-template <typename LHS_t> struct neg_match {
- LHS_t L;
-
- neg_match(const LHS_t &LHS) : L(LHS) {}
-
- template <typename OpTy> bool match(OpTy *V) {
- if (auto *O = dyn_cast<Operator>(V))
- if (O->getOpcode() == Instruction::Sub)
- return matchIfNeg(O->getOperand(0), O->getOperand(1));
- return false;
- }
-
-private:
- bool matchIfNeg(Value *LHS, Value *RHS) {
- return ((isa<ConstantInt>(LHS) && cast<ConstantInt>(LHS)->isZero()) ||
- isa<ConstantAggregateZero>(LHS)) &&
- L.match(RHS);
- }
-};
+template <typename ValueOp_t, typename PointerOp_t> struct StoreClass_match {
+ ValueOp_t ValueOp;
+ PointerOp_t PointerOp;
-/// \brief Match an integer negate.
-template <typename LHS> inline neg_match<LHS> m_Neg(const LHS &L) { return L; }
-
-template <typename LHS_t> struct fneg_match {
- LHS_t L;
-
- fneg_match(const LHS_t &LHS) : L(LHS) {}
+ StoreClass_match(const ValueOp_t &ValueOpMatch,
+ const PointerOp_t &PointerOpMatch) :
+ ValueOp(ValueOpMatch), PointerOp(PointerOpMatch) {}
template <typename OpTy> bool match(OpTy *V) {
- if (auto *O = dyn_cast<Operator>(V))
- if (O->getOpcode() == Instruction::FSub)
- return matchIfFNeg(O->getOperand(0), O->getOperand(1));
- return false;
- }
-
-private:
- bool matchIfFNeg(Value *LHS, Value *RHS) {
- if (const auto *C = dyn_cast<ConstantFP>(LHS))
- return C->isNegativeZeroValue() && L.match(RHS);
+ if (auto *LI = dyn_cast<StoreInst>(V))
+ return ValueOp.match(LI->getValueOperand()) &&
+ PointerOp.match(LI->getPointerOperand());
return false;
}
};
-/// \brief Match a floating point negate.
-template <typename LHS> inline fneg_match<LHS> m_FNeg(const LHS &L) {
- return L;
+/// Matches StoreInst.
+template <typename ValueOpTy, typename PointerOpTy>
+inline StoreClass_match<ValueOpTy, PointerOpTy>
+m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp) {
+ return StoreClass_match<ValueOpTy, PointerOpTy>(ValueOp, PointerOp);
}
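Usage sketch (I, StoredVal, and Ptr are hypothetical): m_Store is the two-operand counterpart, binding the value and pointer operands of a store:

    Value *StoredVal, *Ptr;
    if (match(I, m_Store(m_Value(StoredVal), m_Value(Ptr)))) {
      // I is 'store StoredVal, Ptr'.
    }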
//===----------------------------------------------------------------------===//
@@ -1106,6 +1282,8 @@ struct MaxMin_match {
LHS_t L;
RHS_t R;
+ // The evaluation order is always stable, regardless of Commutability.
+ // The LHS is always matched first.
MaxMin_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
template <typename OpTy> bool match(OpTy *V) {
@@ -1132,60 +1310,60 @@ struct MaxMin_match {
return false;
// It does! Bind the operands.
return (L.match(LHS) && R.match(RHS)) ||
- (Commutable && R.match(LHS) && L.match(RHS));
+ (Commutable && L.match(RHS) && R.match(LHS));
}
};
-/// \brief Helper class for identifying signed max predicates.
+/// Helper class for identifying signed max predicates.
struct smax_pred_ty {
static bool match(ICmpInst::Predicate Pred) {
return Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE;
}
};
-/// \brief Helper class for identifying signed min predicates.
+/// Helper class for identifying signed min predicates.
struct smin_pred_ty {
static bool match(ICmpInst::Predicate Pred) {
return Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_SLE;
}
};
-/// \brief Helper class for identifying unsigned max predicates.
+/// Helper class for identifying unsigned max predicates.
struct umax_pred_ty {
static bool match(ICmpInst::Predicate Pred) {
return Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE;
}
};
-/// \brief Helper class for identifying unsigned min predicates.
+/// Helper class for identifying unsigned min predicates.
struct umin_pred_ty {
static bool match(ICmpInst::Predicate Pred) {
return Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_ULE;
}
};
-/// \brief Helper class for identifying ordered max predicates.
+/// Helper class for identifying ordered max predicates.
struct ofmax_pred_ty {
static bool match(FCmpInst::Predicate Pred) {
return Pred == CmpInst::FCMP_OGT || Pred == CmpInst::FCMP_OGE;
}
};
-/// \brief Helper class for identifying ordered min predicates.
+/// Helper class for identifying ordered min predicates.
struct ofmin_pred_ty {
static bool match(FCmpInst::Predicate Pred) {
return Pred == CmpInst::FCMP_OLT || Pred == CmpInst::FCMP_OLE;
}
};
-/// \brief Helper class for identifying unordered max predicates.
+/// Helper class for identifying unordered max predicates.
struct ufmax_pred_ty {
static bool match(FCmpInst::Predicate Pred) {
return Pred == CmpInst::FCMP_UGT || Pred == CmpInst::FCMP_UGE;
}
};
-/// \brief Helper class for identifying unordered min predicates.
+/// Helper class for identifying unordered min predicates.
struct ufmin_pred_ty {
static bool match(FCmpInst::Predicate Pred) {
return Pred == CmpInst::FCMP_ULT || Pred == CmpInst::FCMP_ULE;
@@ -1216,7 +1394,7 @@ inline MaxMin_match<ICmpInst, LHS, RHS, umin_pred_ty> m_UMin(const LHS &L,
return MaxMin_match<ICmpInst, LHS, RHS, umin_pred_ty>(L, R);
}
-/// \brief Match an 'ordered' floating point maximum function.
+/// Match an 'ordered' floating point maximum function.
/// Floating point has one special value 'NaN'. Therefore, there is no total
/// order. However, if we can ignore the 'NaN' value (for example, because of a
/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'maximum'
@@ -1231,7 +1409,7 @@ inline MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty> m_OrdFMax(const LHS &L,
return MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty>(L, R);
}
-/// \brief Match an 'ordered' floating point minimum function.
+/// Match an 'ordered' floating point minimum function.
/// Floating point has one special value 'NaN'. Therefore, there is no total
/// order. However, if we can ignore the 'NaN' value (for example, because of a
/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'minimum'
@@ -1246,7 +1424,7 @@ inline MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty> m_OrdFMin(const LHS &L,
return MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty>(L, R);
}
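Usage sketch (Sel, A, and B are hypothetical): the ordered min/max matchers recognize the fcmp+select idiom described above:

    Value *A, *B;
    if (match(Sel, m_OrdFMax(m_Value(A), m_Value(B)))) {
      // Sel is 'select (fcmp ogt/oge A, B), A, B', i.e. max(A, B) when NaNs
      // can be ignored.
    }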
-/// \brief Match an 'unordered' floating point maximum function.
+/// Match an 'unordered' floating point maximum function.
/// Floating point has one special value 'NaN'. Therefore, there is no total
/// order. However, if we can ignore the 'NaN' value (for example, because of a
/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'maximum'
@@ -1261,7 +1439,7 @@ m_UnordFMax(const LHS &L, const RHS &R) {
return MaxMin_match<FCmpInst, LHS, RHS, ufmax_pred_ty>(L, R);
}
-/// \brief Match an 'unordered' floating point minimum function.
+/// Match an 'unordered' floating point minimum function.
/// Floating point has one special value 'NaN'. Therefore, there is no total
/// order. However, if we can ignore the 'NaN' value (for example, because of a
/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'minimum'
@@ -1312,7 +1490,7 @@ struct UAddWithOverflow_match {
}
};
-/// \brief Match an icmp instruction checking for unsigned overflow on addition.
+/// Match an icmp instruction checking for unsigned overflow on addition.
///
/// S is matched to the addition whose result is being checked for overflow, and
/// L and R are matched to the LHS and RHS of S.
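Usage sketch (ICmp, A, B, and S are hypothetical): one common form this recognizes is the "result is smaller than an operand" overflow check:

    Value *A, *B, *S;
    if (match(ICmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_Value(S)))) {
      // e.g. ICmp is 'icmp ult (add A, B), A', and S is bound to the add.
    }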
@@ -1334,13 +1512,13 @@ template <typename Opnd_t> struct Argument_match {
}
};
-/// \brief Match an argument.
+/// Match an argument.
template <unsigned OpI, typename Opnd_t>
inline Argument_match<Opnd_t> m_Argument(const Opnd_t &Op) {
return Argument_match<Opnd_t>(OpI, Op);
}
-/// \brief Intrinsic matchers.
+/// Intrinsic matchers.
struct IntrinsicID_match {
unsigned ID;
@@ -1383,7 +1561,7 @@ struct m_Intrinsic_Ty<T0, T1, T2, T3> {
Argument_match<T3>>;
};
-/// \brief Match intrinsic calls like this:
+/// Match intrinsic calls like this:
/// m_Intrinsic<Intrinsic::fabs>(m_Value(X))
template <Intrinsic::ID IntrID> inline IntrinsicID_match m_Intrinsic() {
return IntrinsicID_match(IntrID);
@@ -1424,6 +1602,16 @@ inline typename m_Intrinsic_Ty<Opnd0>::Ty m_BSwap(const Opnd0 &Op0) {
return m_Intrinsic<Intrinsic::bswap>(Op0);
}
+template <typename Opnd0>
+inline typename m_Intrinsic_Ty<Opnd0>::Ty m_FAbs(const Opnd0 &Op0) {
+ return m_Intrinsic<Intrinsic::fabs>(Op0);
+}
+
+template <typename Opnd0>
+inline typename m_Intrinsic_Ty<Opnd0>::Ty m_FCanonicalize(const Opnd0 &Op0) {
+ return m_Intrinsic<Intrinsic::canonicalize>(Op0);
+}
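Usage sketch (V and X are hypothetical): intrinsic matchers compose with the other patterns, so their call arguments can themselves be matched:

    Value *X;
    if (match(V, m_FAbs(m_FNeg(m_Value(X))))) {
      // V is 'call @llvm.fabs(fsub -0.0, X)', i.e. fabs(-X).
    }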
+
template <typename Opnd0, typename Opnd1>
inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty m_FMin(const Opnd0 &Op0,
const Opnd1 &Op1) {
@@ -1436,57 +1624,17 @@ inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty m_FMax(const Opnd0 &Op0,
return m_Intrinsic<Intrinsic::maxnum>(Op0, Op1);
}
-template <typename Opnd_t> struct Signum_match {
- Opnd_t Val;
- Signum_match(const Opnd_t &V) : Val(V) {}
-
- template <typename OpTy> bool match(OpTy *V) {
- unsigned TypeSize = V->getType()->getScalarSizeInBits();
- if (TypeSize == 0)
- return false;
-
- unsigned ShiftWidth = TypeSize - 1;
- Value *OpL = nullptr, *OpR = nullptr;
-
- // This is the representation of signum we match:
- //
- // signum(x) == (x >> 63) | (-x >>u 63)
- //
- // An i1 value is its own signum, so it's correct to match
- //
- // signum(x) == (x >> 0) | (-x >>u 0)
- //
- // for i1 values.
-
- auto LHS = m_AShr(m_Value(OpL), m_SpecificInt(ShiftWidth));
- auto RHS = m_LShr(m_Neg(m_Value(OpR)), m_SpecificInt(ShiftWidth));
- auto Signum = m_Or(LHS, RHS);
-
- return Signum.match(V) && OpL == OpR && Val.match(OpL);
- }
-};
-
-/// \brief Matches a signum pattern.
-///
-/// signum(x) =
-/// x > 0 -> 1
-/// x == 0 -> 0
-/// x < 0 -> -1
-template <typename Val_t> inline Signum_match<Val_t> m_Signum(const Val_t &V) {
- return Signum_match<Val_t>(V);
-}
-
//===----------------------------------------------------------------------===//
// Matchers for two-operands operators with the operators in either order
//
-/// \brief Matches a BinaryOperator with LHS and RHS in either order.
+/// Matches a BinaryOperator with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline AnyBinaryOp_match<LHS, RHS, true> m_c_BinOp(const LHS &L, const RHS &R) {
return AnyBinaryOp_match<LHS, RHS, true>(L, R);
}
-/// \brief Matches an ICmp with a predicate over LHS and RHS in either order.
+/// Matches an ICmp with a predicate over LHS and RHS in either order.
/// Does not swap the predicate.
template <typename LHS, typename RHS>
inline CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>
@@ -1495,41 +1643,55 @@ m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
R);
}
-/// \brief Matches a Add with LHS and RHS in either order.
+/// Matches a Add with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, Instruction::Add, true> m_c_Add(const LHS &L,
const RHS &R) {
return BinaryOp_match<LHS, RHS, Instruction::Add, true>(L, R);
}
-/// \brief Matches a Mul with LHS and RHS in either order.
+/// Matches a Mul with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, Instruction::Mul, true> m_c_Mul(const LHS &L,
const RHS &R) {
return BinaryOp_match<LHS, RHS, Instruction::Mul, true>(L, R);
}
-/// \brief Matches an And with LHS and RHS in either order.
+/// Matches an And with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, Instruction::And, true> m_c_And(const LHS &L,
const RHS &R) {
return BinaryOp_match<LHS, RHS, Instruction::And, true>(L, R);
}
-/// \brief Matches an Or with LHS and RHS in either order.
+/// Matches an Or with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, Instruction::Or, true> m_c_Or(const LHS &L,
const RHS &R) {
return BinaryOp_match<LHS, RHS, Instruction::Or, true>(L, R);
}
-/// \brief Matches an Xor with LHS and RHS in either order.
+/// Matches an Xor with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, Instruction::Xor, true> m_c_Xor(const LHS &L,
const RHS &R) {
return BinaryOp_match<LHS, RHS, Instruction::Xor, true>(L, R);
}
+/// Matches a 'Neg' as 'sub 0, V'.
+template <typename ValTy>
+inline BinaryOp_match<cst_pred_ty<is_zero_int>, ValTy, Instruction::Sub>
+m_Neg(const ValTy &V) {
+ return m_Sub(m_ZeroInt(), V);
+}
+
+/// Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
+template <typename ValTy>
+inline BinaryOp_match<ValTy, cst_pred_ty<is_all_ones>, Instruction::Xor, true>
+m_Not(const ValTy &V) {
+ return m_c_Xor(V, m_AllOnes());
+}
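Usage sketch (V and X are hypothetical): with the constant predicates above, 'neg' and 'not' become ordinary binary-operator patterns:

    Value *X;
    if (match(V, m_Not(m_Value(X)))) {
      // V is 'xor X, -1' (constant on either side).
    }
    if (match(V, m_Neg(m_Value(X)))) {
      // V is 'sub 0, X'.
    }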
+
/// Matches an SMin with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline MaxMin_match<ICmpInst, LHS, RHS, smin_pred_ty, true>
@@ -1555,6 +1717,60 @@ m_c_UMax(const LHS &L, const RHS &R) {
return MaxMin_match<ICmpInst, LHS, RHS, umax_pred_ty, true>(L, R);
}
+/// Matches FAdd with LHS and RHS in either order.
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::FAdd, true>
+m_c_FAdd(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, Instruction::FAdd, true>(L, R);
+}
+
+/// Matches FMul with LHS and RHS in either order.
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::FMul, true>
+m_c_FMul(const LHS &L, const RHS &R) {
+ return BinaryOp_match<LHS, RHS, Instruction::FMul, true>(L, R);
+}
+
+template <typename Opnd_t> struct Signum_match {
+ Opnd_t Val;
+ Signum_match(const Opnd_t &V) : Val(V) {}
+
+ template <typename OpTy> bool match(OpTy *V) {
+ unsigned TypeSize = V->getType()->getScalarSizeInBits();
+ if (TypeSize == 0)
+ return false;
+
+ unsigned ShiftWidth = TypeSize - 1;
+ Value *OpL = nullptr, *OpR = nullptr;
+
+ // This is the representation of signum we match:
+ //
+ // signum(x) == (x >> 63) | (-x >>u 63)
+ //
+ // An i1 value is its own signum, so it's correct to match
+ //
+ // signum(x) == (x >> 0) | (-x >>u 0)
+ //
+ // for i1 values.
+
+ auto LHS = m_AShr(m_Value(OpL), m_SpecificInt(ShiftWidth));
+ auto RHS = m_LShr(m_Neg(m_Value(OpR)), m_SpecificInt(ShiftWidth));
+ auto Signum = m_Or(LHS, RHS);
+
+ return Signum.match(V) && OpL == OpR && Val.match(OpL);
+ }
+};
+
+/// Matches a signum pattern.
+///
+/// signum(x) =
+/// x > 0 -> 1
+/// x == 0 -> 0
+/// x < 0 -> -1
+template <typename Val_t> inline Signum_match<Val_t> m_Signum(const Val_t &V) {
+ return Signum_match<Val_t>(V);
+}
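Usage sketch (V and X are hypothetical): m_Signum hides the shift/or idiom behind a single pattern:

    Value *X;
    if (match(V, m_Signum(m_Value(X)))) {
      // V is '(X >> BW-1) | (-X >>u BW-1)', i.e. signum(X) for integer X.
    }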
+
} // end namespace PatternMatch
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/IR/ProfileSummary.h b/contrib/llvm/include/llvm/IR/ProfileSummary.h
index d85ce8c443ec..e38663770a13 100644
--- a/contrib/llvm/include/llvm/IR/ProfileSummary.h
+++ b/contrib/llvm/include/llvm/IR/ProfileSummary.h
@@ -51,7 +51,7 @@ private:
SummaryEntryVector DetailedSummary;
uint64_t TotalCount, MaxCount, MaxInternalCount, MaxFunctionCount;
uint32_t NumCounts, NumFunctions;
- /// \brief Return detailed summary as metadata.
+ /// Return detailed summary as metadata.
Metadata *getDetailedSummaryMD(LLVMContext &Context);
public:
@@ -67,9 +67,9 @@ public:
NumCounts(NumCounts), NumFunctions(NumFunctions) {}
Kind getKind() const { return PSK; }
- /// \brief Return summary information as metadata.
+ /// Return summary information as metadata.
Metadata *getMD(LLVMContext &Context);
- /// \brief Construct profile summary from metdata.
+ /// Construct profile summary from metadata.
static ProfileSummary *getFromMD(Metadata *MD);
SummaryEntryVector &getDetailedSummary() { return DetailedSummary; }
uint32_t getNumFunctions() { return NumFunctions; }
diff --git a/contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.def b/contrib/llvm/include/llvm/IR/RuntimeLibcalls.def
index 7695e9d782ef..7ed90d959f01 100644
--- a/contrib/llvm/include/llvm/CodeGen/RuntimeLibcalls.def
+++ b/contrib/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -130,26 +130,51 @@ HANDLE_LIBCALL(LOG_F64, "log")
HANDLE_LIBCALL(LOG_F80, "logl")
HANDLE_LIBCALL(LOG_F128, "logl")
HANDLE_LIBCALL(LOG_PPCF128, "logl")
+HANDLE_LIBCALL(LOG_FINITE_F32, "__logf_finite")
+HANDLE_LIBCALL(LOG_FINITE_F64, "__log_finite")
+HANDLE_LIBCALL(LOG_FINITE_F80, "__logl_finite")
+HANDLE_LIBCALL(LOG_FINITE_F128, "__logl_finite")
+HANDLE_LIBCALL(LOG_FINITE_PPCF128, "__logl_finite")
HANDLE_LIBCALL(LOG2_F32, "log2f")
HANDLE_LIBCALL(LOG2_F64, "log2")
HANDLE_LIBCALL(LOG2_F80, "log2l")
HANDLE_LIBCALL(LOG2_F128, "log2l")
HANDLE_LIBCALL(LOG2_PPCF128, "log2l")
+HANDLE_LIBCALL(LOG2_FINITE_F32, "__log2f_finite")
+HANDLE_LIBCALL(LOG2_FINITE_F64, "__log2_finite")
+HANDLE_LIBCALL(LOG2_FINITE_F80, "__log2l_finite")
+HANDLE_LIBCALL(LOG2_FINITE_F128, "__log2l_finite")
+HANDLE_LIBCALL(LOG2_FINITE_PPCF128, "__log2l_finite")
HANDLE_LIBCALL(LOG10_F32, "log10f")
HANDLE_LIBCALL(LOG10_F64, "log10")
HANDLE_LIBCALL(LOG10_F80, "log10l")
HANDLE_LIBCALL(LOG10_F128, "log10l")
HANDLE_LIBCALL(LOG10_PPCF128, "log10l")
+HANDLE_LIBCALL(LOG10_FINITE_F32, "__log10f_finite")
+HANDLE_LIBCALL(LOG10_FINITE_F64, "__log10_finite")
+HANDLE_LIBCALL(LOG10_FINITE_F80, "__log10l_finite")
+HANDLE_LIBCALL(LOG10_FINITE_F128, "__log10l_finite")
+HANDLE_LIBCALL(LOG10_FINITE_PPCF128, "__log10l_finite")
HANDLE_LIBCALL(EXP_F32, "expf")
HANDLE_LIBCALL(EXP_F64, "exp")
HANDLE_LIBCALL(EXP_F80, "expl")
HANDLE_LIBCALL(EXP_F128, "expl")
HANDLE_LIBCALL(EXP_PPCF128, "expl")
+HANDLE_LIBCALL(EXP_FINITE_F32, "__expf_finite")
+HANDLE_LIBCALL(EXP_FINITE_F64, "__exp_finite")
+HANDLE_LIBCALL(EXP_FINITE_F80, "__expl_finite")
+HANDLE_LIBCALL(EXP_FINITE_F128, "__expl_finite")
+HANDLE_LIBCALL(EXP_FINITE_PPCF128, "__expl_finite")
HANDLE_LIBCALL(EXP2_F32, "exp2f")
HANDLE_LIBCALL(EXP2_F64, "exp2")
HANDLE_LIBCALL(EXP2_F80, "exp2l")
HANDLE_LIBCALL(EXP2_F128, "exp2l")
HANDLE_LIBCALL(EXP2_PPCF128, "exp2l")
+HANDLE_LIBCALL(EXP2_FINITE_F32, "__exp2f_finite")
+HANDLE_LIBCALL(EXP2_FINITE_F64, "__exp2_finite")
+HANDLE_LIBCALL(EXP2_FINITE_F80, "__exp2l_finite")
+HANDLE_LIBCALL(EXP2_FINITE_F128, "__exp2l_finite")
+HANDLE_LIBCALL(EXP2_FINITE_PPCF128, "__exp2l_finite")
HANDLE_LIBCALL(SIN_F32, "sinf")
HANDLE_LIBCALL(SIN_F64, "sin")
HANDLE_LIBCALL(SIN_F80, "sinl")
@@ -172,6 +197,11 @@ HANDLE_LIBCALL(POW_F64, "pow")
HANDLE_LIBCALL(POW_F80, "powl")
HANDLE_LIBCALL(POW_F128, "powl")
HANDLE_LIBCALL(POW_PPCF128, "powl")
+HANDLE_LIBCALL(POW_FINITE_F32, "__powf_finite")
+HANDLE_LIBCALL(POW_FINITE_F64, "__pow_finite")
+HANDLE_LIBCALL(POW_FINITE_F80, "__powl_finite")
+HANDLE_LIBCALL(POW_FINITE_F128, "__powl_finite")
+HANDLE_LIBCALL(POW_FINITE_PPCF128, "__powl_finite")
HANDLE_LIBCALL(CEIL_F32, "ceilf")
HANDLE_LIBCALL(CEIL_F64, "ceil")
HANDLE_LIBCALL(CEIL_F80, "ceill")
@@ -221,6 +251,7 @@ HANDLE_LIBCALL(FMAX_PPCF128, "fmaxl")
// Conversion
HANDLE_LIBCALL(FPEXT_F32_PPCF128, "__gcc_stoq")
HANDLE_LIBCALL(FPEXT_F64_PPCF128, "__gcc_dtoq")
+HANDLE_LIBCALL(FPEXT_F80_F128, "__extendxftf2")
HANDLE_LIBCALL(FPEXT_F64_F128, "__extenddftf2")
HANDLE_LIBCALL(FPEXT_F32_F128, "__extendsftf2")
HANDLE_LIBCALL(FPEXT_F32_F64, "__extendsfdf2")
@@ -237,6 +268,7 @@ HANDLE_LIBCALL(FPROUND_PPCF128_F32, "__gcc_qtos")
HANDLE_LIBCALL(FPROUND_F80_F64, "__truncxfdf2")
HANDLE_LIBCALL(FPROUND_F128_F64, "__trunctfdf2")
HANDLE_LIBCALL(FPROUND_PPCF128_F64, "__gcc_qtod")
+HANDLE_LIBCALL(FPROUND_F128_F80, "__trunctfxf2")
HANDLE_LIBCALL(FPTOSINT_F32_I32, "__fixsfsi")
HANDLE_LIBCALL(FPTOSINT_F32_I64, "__fixsfdi")
HANDLE_LIBCALL(FPTOSINT_F32_I128, "__fixsfti")
diff --git a/contrib/llvm/include/llvm/IR/Statepoint.h b/contrib/llvm/include/llvm/IR/Statepoint.h
index ad9537e9762e..c8e905b21a30 100644
--- a/contrib/llvm/include/llvm/IR/Statepoint.h
+++ b/contrib/llvm/include/llvm/IR/Statepoint.h
@@ -196,7 +196,7 @@ public:
return make_range(arg_begin(), arg_end());
}
- /// \brief Return true if the call or the callee has the given attribute.
+ /// Return true if the call or the callee has the given attribute.
bool paramHasAttr(unsigned i, Attribute::AttrKind A) const {
Function *F = getCalledFunction();
return getCallSite().paramHasAttr(i + CallArgsBeginPos, A) ||
@@ -465,7 +465,7 @@ struct StatepointDirectives {
/// AS.
StatepointDirectives parseStatepointDirectivesFromAttrs(AttributeList AS);
-/// Return \c true if the the \p Attr is an attribute that is a statepoint
+/// Return \c true if the \p Attr is an attribute that is a statepoint
/// directive.
bool isStatepointDirectiveAttr(Attribute Attr);
diff --git a/contrib/llvm/include/llvm/IR/TrackingMDRef.h b/contrib/llvm/include/llvm/IR/TrackingMDRef.h
index bdec904ad1e1..084efada221f 100644
--- a/contrib/llvm/include/llvm/IR/TrackingMDRef.h
+++ b/contrib/llvm/include/llvm/IR/TrackingMDRef.h
@@ -20,7 +20,7 @@
namespace llvm {
-/// \brief Tracking metadata reference.
+/// Tracking metadata reference.
///
/// This class behaves like \a TrackingVH, but for metadata.
class TrackingMDRef {
@@ -70,7 +70,7 @@ public:
track();
}
- /// \brief Check whether this has a trivial destructor.
+ /// Check whether this has a trivial destructor.
///
/// If \c MD isn't replaceable, the destructor will be a no-op.
bool hasTrivialDestructor() const {
@@ -100,7 +100,7 @@ private:
}
};
-/// \brief Typed tracking ref.
+/// Typed tracking ref.
///
/// Track references of a particular type. It's useful to use this for \a
/// MDNode and \a ValueAsMetadata.
@@ -135,7 +135,7 @@ public:
void reset() { Ref.reset(); }
void reset(T *MD) { Ref.reset(static_cast<Metadata *>(MD)); }
- /// \brief Check whether this has a trivial destructor.
+ /// Check whether this has a trivial destructor.
bool hasTrivialDestructor() const { return Ref.hasTrivialDestructor(); }
};
diff --git a/contrib/llvm/include/llvm/IR/Type.h b/contrib/llvm/include/llvm/IR/Type.h
index 1574fc334ffc..9c1f99d1b3a2 100644
--- a/contrib/llvm/include/llvm/IR/Type.h
+++ b/contrib/llvm/include/llvm/IR/Type.h
@@ -208,6 +208,9 @@ public:
return getScalarType()->isIntegerTy(BitWidth);
}
+ /// Return true if this is an integer type or a pointer type.
+ bool isIntOrPtrTy() const { return isIntegerTy() || isPointerTy(); }
+
/// True if this is an instance of FunctionType.
bool isFunctionTy() const { return getTypeID() == FunctionTyID; }
@@ -229,7 +232,7 @@ public:
/// Return true if this type could be converted with a lossless BitCast to
/// type 'Ty'. For example, i8* to i32*. BitCasts are valid for types of the
/// same size only where no re-interpretation of the bits is done.
- /// @brief Determine if this type could be losslessly bitcast to Ty
+ /// Determine if this type could be losslessly bitcast to Ty
bool canLosslesslyBitCastTo(Type *Ty) const;
/// Return true if this type is empty, that is, it has no elements or all of
@@ -407,6 +410,20 @@ public:
static IntegerType *getInt32Ty(LLVMContext &C);
static IntegerType *getInt64Ty(LLVMContext &C);
static IntegerType *getInt128Ty(LLVMContext &C);
+ template <typename ScalarTy> static Type *getScalarTy(LLVMContext &C) {
+ int noOfBits = sizeof(ScalarTy) * CHAR_BIT;
+ if (std::is_integral<ScalarTy>::value) {
+ return (Type*) Type::getIntNTy(C, noOfBits);
+ } else if (std::is_floating_point<ScalarTy>::value) {
+ switch (noOfBits) {
+ case 32:
+ return Type::getFloatTy(C);
+ case 64:
+ return Type::getDoubleTy(C);
+ }
+ }
+ llvm_unreachable("Unsupported type in Type::getScalarTy");
+ }
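A usage sketch (Ctx is a hypothetical LLVMContext): getScalarTy maps a host scalar type to the corresponding IR type:

    Type *I32Ty = Type::getScalarTy<int32_t>(Ctx); // yields i32
    Type *F64Ty = Type::getScalarTy<double>(Ctx);  // yields double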
//===--------------------------------------------------------------------===//
// Convenience methods for getting pointer types with one of the above builtin
diff --git a/contrib/llvm/include/llvm/IR/Use.h b/contrib/llvm/include/llvm/IR/Use.h
index 0ac13935c7ce..25c44e0871a9 100644
--- a/contrib/llvm/include/llvm/IR/Use.h
+++ b/contrib/llvm/include/llvm/IR/Use.h
@@ -36,7 +36,7 @@ template <typename> struct simplify_type;
class User;
class Value;
-/// \brief A Use represents the edge between a Value definition and its users.
+/// A Use represents the edge between a Value definition and its users.
///
/// This is notionally a two-dimensional linked list. It supports traversing
/// all of the uses for a particular value definition. It also supports jumping
@@ -57,7 +57,7 @@ class Use {
public:
Use(const Use &U) = delete;
- /// \brief Provide a fast substitute to std::swap<Use>
+ /// Provide a fast substitute to std::swap<Use>
/// that also works with less standard-compliant compilers
void swap(Use &RHS);
@@ -107,7 +107,7 @@ public:
operator Value *() const { return Val; }
Value *get() const { return Val; }
- /// \brief Returns the User that contains this Use.
+ /// Returns the User that contains this Use.
///
/// For an instruction operand, for example, this will return the
/// instruction.
@@ -123,16 +123,16 @@ public:
Use *getNext() const { return Next; }
- /// \brief Return the operand # of this use in its User.
+ /// Return the operand # of this use in its User.
unsigned getOperandNo() const;
- /// \brief Initializes the waymarking tags on an array of Uses.
+ /// Initializes the waymarking tags on an array of Uses.
///
/// This sets up the array of Uses such that getUser() can find the User from
/// any of those Uses.
static Use *initTags(Use *Start, Use *Stop);
- /// \brief Destroys Use operands when the number of operands of
+ /// Destroys Use operands when the number of operands of
/// a User changes.
static void zap(Use *Start, const Use *Stop, bool del = false);
@@ -161,7 +161,7 @@ private:
}
};
-/// \brief Allow clients to treat uses just like values when using
+/// Allow clients to treat uses just like values when using
/// casting operators.
template <> struct simplify_type<Use> {
using SimpleType = Value *;
diff --git a/contrib/llvm/include/llvm/IR/UseListOrder.h b/contrib/llvm/include/llvm/IR/UseListOrder.h
index a8b394fc6302..b6bb0f19a0aa 100644
--- a/contrib/llvm/include/llvm/IR/UseListOrder.h
+++ b/contrib/llvm/include/llvm/IR/UseListOrder.h
@@ -23,7 +23,7 @@ namespace llvm {
class Function;
class Value;
-/// \brief Structure to hold a use-list order.
+/// Structure to hold a use-list order.
struct UseListOrder {
const Value *V = nullptr;
const Function *F = nullptr;
diff --git a/contrib/llvm/include/llvm/IR/User.h b/contrib/llvm/include/llvm/IR/User.h
index 4dfa19cf241f..d6a603ce845d 100644
--- a/contrib/llvm/include/llvm/IR/User.h
+++ b/contrib/llvm/include/llvm/IR/User.h
@@ -36,7 +36,7 @@ namespace llvm {
template <typename T> class ArrayRef;
template <typename T> class MutableArrayRef;
-/// \brief Compile-time customization of User operands.
+/// Compile-time customization of User operands.
///
/// Customizes operand-related allocators and accessors.
template <class>
@@ -81,13 +81,13 @@ protected:
"Error in initializing hung off uses for User");
}
- /// \brief Allocate the array of Uses, followed by a pointer
+ /// Allocate the array of Uses, followed by a pointer
/// (with bottom bit set) to the User.
/// \param IsPhi identifies callers which are phi nodes and which need
/// N BasicBlock* allocated along with N
void allocHungoffUses(unsigned N, bool IsPhi = false);
- /// \brief Grow the number of hung off uses. Note that allocHungoffUses
+ /// Grow the number of hung off uses. Note that allocHungoffUses
/// should be called if there are no uses.
void growHungoffUses(unsigned N, bool IsPhi = false);
@@ -97,15 +97,31 @@ protected:
public:
User(const User &) = delete;
- /// \brief Free memory allocated for User and Use objects.
+ /// Free memory allocated for User and Use objects.
void operator delete(void *Usr);
- /// \brief Placement delete - required by std, but never called.
- void operator delete(void*, unsigned) {
+ /// Placement delete - required by std, called if the ctor throws.
+ void operator delete(void *Usr, unsigned) {
+ // Note: If a subclass manipulates the information which is required to calculate the
+ // Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has
+ // to restore the changed information to the original value, since the dtor of that class
+ // is not called if the ctor fails.
+ User::operator delete(Usr);
+
+#ifndef LLVM_ENABLE_EXCEPTIONS
llvm_unreachable("Constructor throws?");
+#endif
}
- /// \brief Placement delete - required by std, but never called.
- void operator delete(void*, unsigned, bool) {
+ /// Placement delete - required by std, called if the ctor throws.
+ void operator delete(void *Usr, unsigned, bool) {
+ // Note: If a subclass manipulates the information which is required to calculate the
+ // Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has
+ // to restore the changed information to the original value, since the dtor of that class
+ // is not called if the ctor fails.
+ User::operator delete(Usr);
+
+#ifndef LLVM_ENABLE_EXCEPTIONS
llvm_unreachable("Constructor throws?");
+#endif
}
protected:
@@ -194,7 +210,7 @@ public:
NumUserOperands = NumOps;
}
- /// \brief Subclasses with hung off uses need to manage the operand count
+ /// Subclasses with hung off uses need to manage the operand count
/// themselves. In these instances, the operand count isn't used to find the
/// OperandList, so there's no issue in having the operand count change.
void setNumHungOffUseOperands(unsigned NumOps) {
@@ -226,7 +242,7 @@ public:
return const_op_range(op_begin(), op_end());
}
- /// \brief Iterator for directly iterating over the operand Values.
+ /// Iterator for directly iterating over the operand Values.
struct value_op_iterator
: iterator_adaptor_base<value_op_iterator, op_iterator,
std::random_access_iterator_tag, Value *,
@@ -268,7 +284,7 @@ public:
return make_range(value_op_begin(), value_op_end());
}
- /// \brief Drop all references to operands.
+ /// Drop all references to operands.
///
/// This function is in charge of "letting go" of all objects that this User
/// refers to. This allows one to 'delete' a whole class at a time, even
@@ -281,7 +297,7 @@ public:
U.set(nullptr);
}
- /// \brief Replace uses of one Value with another.
+ /// Replace uses of one Value with another.
///
/// Replaces all references to the "From" definition with references to the
/// "To" definition.
diff --git a/contrib/llvm/include/llvm/IR/Value.h b/contrib/llvm/include/llvm/IR/Value.h
index d848fe921868..f396db995ab0 100644
--- a/contrib/llvm/include/llvm/IR/Value.h
+++ b/contrib/llvm/include/llvm/IR/Value.h
@@ -57,7 +57,7 @@ using ValueName = StringMapEntry<Value *>;
// Value Class
//===----------------------------------------------------------------------===//
-/// \brief LLVM Value Representation
+/// LLVM Value Representation
///
/// This is a very important LLVM class. It is the base class of all values
/// computed by a program that may be used as operands to other values. Value is
@@ -83,7 +83,7 @@ class Value {
unsigned char HasValueHandle : 1; // Has a ValueHandle pointing to this?
protected:
- /// \brief Hold subclass data that can be dropped.
+ /// Hold subclass data that can be dropped.
///
/// This member is similar to SubclassData, however it is for holding
/// information which may be used to aid optimization, but which may be
@@ -91,7 +91,7 @@ protected:
unsigned char SubclassOptionalData : 7;
private:
- /// \brief Hold arbitrary subclass data.
+ /// Hold arbitrary subclass data.
///
/// This member is defined by this class, but is not used for anything.
/// Subclasses can use it to hold whatever state they find useful. This
@@ -99,7 +99,7 @@ private:
unsigned short SubclassData;
protected:
- /// \brief The number of operands in the subclass.
+ /// The number of operands in the subclass.
///
/// This member is defined by this class, but not used for anything.
/// Subclasses can use it to store their number of operands, if they have
@@ -173,7 +173,7 @@ private:
bool operator==(const user_iterator_impl &x) const { return UI == x.UI; }
bool operator!=(const user_iterator_impl &x) const { return !operator==(x); }
- /// \brief Returns true if this iterator is equal to user_end() on the value.
+ /// Returns true if this iterator is equal to user_end() on the value.
bool atEnd() const { return *this == user_iterator_impl(); }
user_iterator_impl &operator++() { // Preincrement
@@ -218,17 +218,17 @@ public:
/// Delete a pointer to a generic Value.
void deleteValue();
- /// \brief Support for debugging, callable in GDB: V->dump()
+ /// Support for debugging, callable in GDB: V->dump()
void dump() const;
- /// \brief Implement operator<< on Value.
+ /// Implement operator<< on Value.
/// @{
void print(raw_ostream &O, bool IsForDebug = false) const;
void print(raw_ostream &O, ModuleSlotTracker &MST,
bool IsForDebug = false) const;
/// @}
- /// \brief Print the name of this Value out to the specified raw_ostream.
+ /// Print the name of this Value out to the specified raw_ostream.
///
/// This is useful when you just want to print 'int %reg126', not the
/// instruction that generated it. If you specify a Module for context, then
@@ -241,13 +241,13 @@ public:
ModuleSlotTracker &MST) const;
/// @}
- /// \brief All values are typed, get the type of this value.
+ /// All values are typed, get the type of this value.
Type *getType() const { return VTy; }
- /// \brief All values hold a context through their type.
+ /// All values hold a context through their type.
LLVMContext &getContext() const;
- // \brief All values can potentially be named.
+ // All values can potentially be named.
bool hasName() const { return HasName; }
ValueName *getValueName() const;
void setValueName(ValueName *VN);
@@ -258,35 +258,35 @@ private:
void setNameImpl(const Twine &Name);
public:
- /// \brief Return a constant reference to the value's name.
+ /// Return a constant reference to the value's name.
///
/// This guaranteed to return the same reference as long as the value is not
/// modified. If the value has a name, this does a hashtable lookup, so it's
/// not free.
StringRef getName() const;
- /// \brief Change the name of the value.
+ /// Change the name of the value.
///
/// Choose a new unique name if the provided name is taken.
///
/// \param Name The new name; or "" if the value's name should be removed.
void setName(const Twine &Name);
- /// \brief Transfer the name from V to this value.
+ /// Transfer the name from V to this value.
///
/// After taking V's name, sets V's name to empty.
///
/// \note It is an error to call V->takeName(V).
void takeName(Value *V);
- /// \brief Change all uses of this to point to a new Value.
+ /// Change all uses of this to point to a new Value.
///
/// Go through the uses list for this definition and make each use point to
/// "V" instead of "this". After this completes, 'this's use list is
/// guaranteed to be empty.
void replaceAllUsesWith(Value *V);
- /// \brief Change non-metadata uses of this to point to a new Value.
+ /// Change non-metadata uses of this to point to a new Value.
///
/// Go through the uses list for this definition and make each use point to
/// "V" instead of "this". This function skips metadata entries in the list.
@@ -299,12 +299,6 @@ public:
/// values or constant users.
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB);
- /// replaceUsesExceptBlockAddr - Go through the uses list for this definition
- /// and make each use point to "V" instead of "this" when the use is outside
- /// the block. 'This's use list is expected to have at least one element.
- /// Unlike replaceAllUsesWith this function skips blockaddr uses.
- void replaceUsesExceptBlockAddr(Value *New);
-
//----------------------------------------------------------------------
// Methods for handling the chain of uses of this Value.
//
@@ -411,7 +405,7 @@ public:
return materialized_users();
}
- /// \brief Return true if there is exactly one user of this value.
+ /// Return true if there is exactly one user of this value.
///
/// This is specialized because it is a common request and does not require
/// traversing the whole use list.
@@ -421,27 +415,27 @@ public:
return ++I == E;
}
- /// \brief Return true if this Value has exactly N users.
+ /// Return true if this Value has exactly N users.
bool hasNUses(unsigned N) const;
- /// \brief Return true if this value has N users or more.
+ /// Return true if this value has N users or more.
///
/// This is logically equivalent to getNumUses() >= N.
bool hasNUsesOrMore(unsigned N) const;
- /// \brief Check if this value is used in the specified basic block.
+ /// Check if this value is used in the specified basic block.
bool isUsedInBasicBlock(const BasicBlock *BB) const;
- /// \brief This method computes the number of uses of this Value.
+ /// This method computes the number of uses of this Value.
///
/// This is a linear time operation. Use hasOneUse, hasNUses, or
/// hasNUsesOrMore to check for specific values.
unsigned getNumUses() const;
- /// \brief This method should only be used by the Use class.
+ /// This method should only be used by the Use class.
void addUse(Use &U) { U.addToList(&UseList); }
- /// \brief Concrete subclass of this.
+ /// Concrete subclass of this.
///
/// An enumeration for keeping track of the concrete subclass of Value that
/// is actually instantiated. Values of this enumeration are kept in the
@@ -456,7 +450,7 @@ public:
#include "llvm/IR/Value.def"
};
- /// \brief Return an ID for the concrete type of this object.
+ /// Return an ID for the concrete type of this object.
///
/// This is used to implement the classof checks. This should not be used
/// for any other purpose, as the values may change as LLVM evolves. Also,
@@ -470,36 +464,36 @@ public:
return SubclassID;
}
- /// \brief Return the raw optional flags value contained in this value.
+ /// Return the raw optional flags value contained in this value.
///
/// This should only be used when testing two Values for equivalence.
unsigned getRawSubclassOptionalData() const {
return SubclassOptionalData;
}
- /// \brief Clear the optional flags contained in this value.
+ /// Clear the optional flags contained in this value.
void clearSubclassOptionalData() {
SubclassOptionalData = 0;
}
- /// \brief Check the optional flags for equality.
+ /// Check the optional flags for equality.
bool hasSameSubclassOptionalData(const Value *V) const {
return SubclassOptionalData == V->SubclassOptionalData;
}
- /// \brief Return true if there is a value handle associated with this value.
+ /// Return true if there is a value handle associated with this value.
bool hasValueHandle() const { return HasValueHandle; }
- /// \brief Return true if there is metadata referencing this value.
+ /// Return true if there is metadata referencing this value.
bool isUsedByMetadata() const { return IsUsedByMD; }
- /// \brief Return true if this value is a swifterror value.
+ /// Return true if this value is a swifterror value.
///
/// swifterror values can be either a function argument or an alloca with a
/// swifterror attribute.
bool isSwiftError() const;
- /// \brief Strip off pointer casts, all-zero GEPs, and aliases.
+ /// Strip off pointer casts, all-zero GEPs, and aliases.
///
/// Returns the original uncasted value. If this is called on a non-pointer
/// value, it returns 'this'.
@@ -509,18 +503,19 @@ public:
static_cast<const Value *>(this)->stripPointerCasts());
}
- /// \brief Strip off pointer casts, all-zero GEPs, aliases and barriers.
+ /// Strip off pointer casts, all-zero GEPs, aliases and invariant group
+ /// info.
///
/// Returns the original uncasted value. If this is called on a non-pointer
/// value, it returns 'this'. This function should be used only in
/// Alias analysis.
- const Value *stripPointerCastsAndBarriers() const;
- Value *stripPointerCastsAndBarriers() {
+ const Value *stripPointerCastsAndInvariantGroups() const;
+ Value *stripPointerCastsAndInvariantGroups() {
return const_cast<Value *>(
- static_cast<const Value *>(this)->stripPointerCastsAndBarriers());
+ static_cast<const Value *>(this)->stripPointerCastsAndInvariantGroups());
}
- /// \brief Strip off pointer casts and all-zero GEPs.
+ /// Strip off pointer casts and all-zero GEPs.
///
/// Returns the original uncasted value. If this is called on a non-pointer
/// value, it returns 'this'.
@@ -530,7 +525,7 @@ public:
static_cast<const Value *>(this)->stripPointerCastsNoFollowAliases());
}
- /// \brief Strip off pointer casts and all-constant inbounds GEPs.
+ /// Strip off pointer casts and all-constant inbounds GEPs.
///
/// Returns the original pointer value. If this is called on a non-pointer
/// value, it returns 'this'.
@@ -540,7 +535,7 @@ public:
static_cast<const Value *>(this)->stripInBoundsConstantOffsets());
}
- /// \brief Accumulate offsets from \a stripInBoundsConstantOffsets().
+ /// Accumulate offsets from \a stripInBoundsConstantOffsets().
///
/// Stores the resulting constant offset stripped into the APInt provided.
/// The provided APInt will be extended or truncated as needed to be the
@@ -555,7 +550,7 @@ public:
->stripAndAccumulateInBoundsConstantOffsets(DL, Offset));
}
- /// \brief Strip off pointer casts and inbounds GEPs.
+ /// Strip off pointer casts and inbounds GEPs.
///
/// Returns the original pointer value. If this is called on a non-pointer
/// value, it returns 'this'.
@@ -565,7 +560,7 @@ public:
static_cast<const Value *>(this)->stripInBoundsOffsets());
}
- /// \brief Returns the number of bytes known to be dereferenceable for the
+ /// Returns the number of bytes known to be dereferenceable for the
/// pointer value.
///
/// If CanBeNull is set by this function the pointer can either be null or be
@@ -573,13 +568,13 @@ public:
uint64_t getPointerDereferenceableBytes(const DataLayout &DL,
bool &CanBeNull) const;
- /// \brief Returns an alignment of the pointer value.
+ /// Returns an alignment of the pointer value.
///
/// Returns an alignment which is either specified explicitly, e.g. via
/// align attribute of a function argument, or guaranteed by DataLayout.
unsigned getPointerAlignment(const DataLayout &DL) const;
- /// \brief Translate PHI node to its predecessor from the given basic block.
+ /// Translate PHI node to its predecessor from the given basic block.
///
/// If this value is a PHI node with CurBB as its parent, return the value in
/// the PHI node corresponding to PredBB. If not, return ourself. This is
@@ -592,14 +587,14 @@ public:
static_cast<const Value *>(this)->DoPHITranslation(CurBB, PredBB));
}
- /// \brief The maximum alignment for instructions.
+ /// The maximum alignment for instructions.
///
/// This is the greatest alignment value supported by load, store, and alloca
/// instructions, and global values.
static const unsigned MaxAlignmentExponent = 29;
static const unsigned MaximumAlignment = 1u << MaxAlignmentExponent;
- /// \brief Mutate the type of this Value to be of the specified type.
+ /// Mutate the type of this Value to be of the specified type.
///
/// Note that this is an extremely dangerous operation which can create
/// completely invalid IR very easily. It is strongly recommended that you
@@ -609,17 +604,17 @@ public:
VTy = Ty;
}
- /// \brief Sort the use-list.
+ /// Sort the use-list.
///
/// Sorts the Value's use-list by Cmp using a stable mergesort. Cmp is
/// expected to compare two \a Use references.
template <class Compare> void sortUseList(Compare Cmp);
- /// \brief Reverse the use-list.
+ /// Reverse the use-list.
void reverseUseList();
private:
- /// \brief Merge two lists together.
+ /// Merge two lists together.
///
/// Merges \c L and \c R using \c Cmp. To enable stable sorts, always pushes
/// "equal" items from L before items from R.
diff --git a/contrib/llvm/include/llvm/IR/ValueHandle.h b/contrib/llvm/include/llvm/IR/ValueHandle.h
index b45cc7b6dc02..d94472ce1be1 100644
--- a/contrib/llvm/include/llvm/IR/ValueHandle.h
+++ b/contrib/llvm/include/llvm/IR/ValueHandle.h
@@ -22,7 +22,7 @@
namespace llvm {
-/// \brief This is the common base class of value handles.
+/// This is the common base class of value handles.
///
/// ValueHandle's are smart pointers to Value's that have special behavior when
/// the value is deleted or ReplaceAllUsesWith'd. See the specific handles
@@ -31,7 +31,7 @@ class ValueHandleBase {
friend class Value;
protected:
- /// \brief This indicates what sub class the handle actually is.
+ /// This indicates what sub class the handle actually is.
///
/// This is to avoid having a vtable for the light-weight handle pointers. The
/// fully general Callback version does have a vtable.
@@ -101,10 +101,10 @@ protected:
V != DenseMapInfo<Value *>::getTombstoneKey();
}
- /// \brief Remove this ValueHandle from its current use list.
+ /// Remove this ValueHandle from its current use list.
void RemoveFromUseList();
- /// \brief Clear the underlying pointer without clearing the use list.
+ /// Clear the underlying pointer without clearing the use list.
///
/// This should only be used if a derived class has manually removed the
/// handle from the use list.
@@ -121,20 +121,20 @@ private:
HandleBaseKind getKind() const { return PrevPair.getInt(); }
void setPrevPtr(ValueHandleBase **Ptr) { PrevPair.setPointer(Ptr); }
- /// \brief Add this ValueHandle to the use list for V.
+ /// Add this ValueHandle to the use list for V.
///
/// List is the address of either the head of the list or a Next node within
/// the existing use list.
void AddToExistingUseList(ValueHandleBase **List);
- /// \brief Add this ValueHandle to the use list after Node.
+ /// Add this ValueHandle to the use list after Node.
void AddToExistingUseListAfter(ValueHandleBase *Node);
- /// \brief Add this ValueHandle to the use list for V.
+ /// Add this ValueHandle to the use list for V.
void AddToUseList();
};
-/// \brief A nullable Value handle that is nullable.
+/// A nullable Value handle.
///
/// This is a value handle that points to a value, and nulls itself
/// out if that value is deleted.
@@ -172,7 +172,7 @@ template <> struct simplify_type<const WeakVH> {
static SimpleType getSimplifiedValue(const WeakVH &WVH) { return WVH; }
};
-/// \brief Value handle that is nullable, but tries to track the Value.
+/// Value handle that is nullable, but tries to track the Value.
///
/// This is a value handle that tries hard to point to a Value, even across
/// RAUW operations, but will null itself out if the value is destroyed. This
@@ -219,7 +219,7 @@ template <> struct simplify_type<const WeakTrackingVH> {
}
};
-/// \brief Value handle that asserts if the Value is deleted.
+/// Value handle that asserts if the Value is deleted.
///
/// This is a Value Handle that points to a value and asserts out if the value
/// is destroyed while the handle is still live. This is very useful for
@@ -318,7 +318,7 @@ struct isPodLike<AssertingVH<T>> {
#endif
};
-/// \brief Value handle that tracks a Value across RAUW.
+/// Value handle that tracks a Value across RAUW.
///
/// TrackingVH is designed for situations where a client needs to hold a handle
/// to a Value (or subclass) across some operations which may move that value,
@@ -379,7 +379,7 @@ public:
ValueTy &operator*() const { return *getValPtr(); }
};
-/// \brief Value handle with callbacks on RAUW and destruction.
+/// Value handle with callbacks on RAUW and destruction.
///
/// This is a value handle that allows subclasses to define callbacks that run
/// when the underlying Value has RAUW called on it or is destroyed. This
@@ -405,7 +405,7 @@ public:
return getValPtr();
}
- /// \brief Callback for Value destruction.
+ /// Callback for Value destruction.
///
/// Called when this->getValPtr() is destroyed, inside ~Value(), so you
/// may call any non-virtual Value method on getValPtr(), but no subclass
@@ -418,7 +418,7 @@ public:
/// Value that's being destroyed.
virtual void deleted() { setValPtr(nullptr); }
- /// \brief Callback for Value RAUW.
+ /// Callback for Value RAUW.
///
/// Called when this->getValPtr()->replaceAllUsesWith(new_value) is called,
/// _before_ any of the uses have actually been replaced. If WeakTrackingVH
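
A sketch of a CallbackVH subclass wiring these hooks: deleted() mirrors the default shown above, while the RAUW hook's name (allUsesReplacedWith) is assumed here rather than quoted from this hunk.

#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"

// Sketch: a value handle that logs destruction and RAUW of the tracked Value.
class LoggingVH final : public llvm::CallbackVH {
public:
  explicit LoggingVH(llvm::Value *V) : llvm::CallbackVH(V) {}

  void deleted() override {
    llvm::errs() << "tracked value destroyed\n";
    llvm::CallbackVH::deleted(); // default behaviour: null out the pointer
  }

  void allUsesReplacedWith(llvm::Value *New) override {
    llvm::errs() << "tracked value replaced with: " << *New << "\n";
  }
};
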
diff --git a/contrib/llvm/include/llvm/IR/ValueMap.h b/contrib/llvm/include/llvm/IR/ValueMap.h
index 11d5823ee479..e7e33918a613 100644
--- a/contrib/llvm/include/llvm/IR/ValueMap.h
+++ b/contrib/llvm/include/llvm/IR/ValueMap.h
@@ -106,8 +106,12 @@ public:
: Map(NumInitBuckets), Data() {}
explicit ValueMap(const ExtraData &Data, unsigned NumInitBuckets = 64)
: Map(NumInitBuckets), Data(Data) {}
+ // ValueMap can't be copied or moved, because the callbacks store a pointer
+ // to it.
ValueMap(const ValueMap &) = delete;
+ ValueMap(ValueMap &&) = delete;
ValueMap &operator=(const ValueMap &) = delete;
+ ValueMap &operator=(ValueMap &&) = delete;
bool hasMD() const { return bool(MDMap); }
MDMapT &MD() {
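
A small sketch of the consequence of the deleted copy/move operations above; the key and counter are illustrative.

#include "llvm/IR/ValueMap.h"

// Sketch: ValueMap registers callbacks that capture the map's own address, so
// the map must stay put; use it through a stable reference, never by copy.
void countUse(llvm::ValueMap<llvm::Value *, int> &Counts, llvm::Value *Key) {
  ++Counts[Key];          // fine: the map is used in place
  // auto Copy = Counts;  // would not compile: copy constructor is deleted
}
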
diff --git a/contrib/llvm/include/llvm/IR/ValueSymbolTable.h b/contrib/llvm/include/llvm/IR/ValueSymbolTable.h
index 26cbbfabfc0c..012e717c7470 100644
--- a/contrib/llvm/include/llvm/IR/ValueSymbolTable.h
+++ b/contrib/llvm/include/llvm/IR/ValueSymbolTable.h
@@ -48,13 +48,13 @@ class ValueSymbolTable {
/// @name Types
/// @{
public:
- /// @brief A mapping of names to values.
+ /// A mapping of names to values.
using ValueMap = StringMap<Value*>;
- /// @brief An iterator over a ValueMap.
+ /// An iterator over a ValueMap.
using iterator = ValueMap::iterator;
- /// @brief A const_iterator over a ValueMap.
+ /// A const_iterator over a ValueMap.
using const_iterator = ValueMap::const_iterator;
/// @}
@@ -71,35 +71,35 @@ public:
/// This method finds the value with the given \p Name in the
/// symbol table.
/// @returns the value associated with the \p Name
- /// @brief Lookup a named Value.
+ /// Lookup a named Value.
Value *lookup(StringRef Name) const { return vmap.lookup(Name); }
/// @returns true iff the symbol table is empty
- /// @brief Determine if the symbol table is empty
+ /// Determine if the symbol table is empty
inline bool empty() const { return vmap.empty(); }
- /// @brief The number of name/type pairs is returned.
+ /// The number of name/value pairs is returned.
inline unsigned size() const { return unsigned(vmap.size()); }
/// This function can be used from the debugger to display the
/// content of the symbol table while debugging.
- /// @brief Print out symbol table on stderr
+ /// Print out symbol table on stderr
void dump() const;
/// @}
/// @name Iteration
/// @{
- /// @brief Get an iterator that from the beginning of the symbol table.
+ /// Get an iterator to the beginning of the symbol table.
inline iterator begin() { return vmap.begin(); }
- /// @brief Get a const_iterator that from the beginning of the symbol table.
+ /// Get a const_iterator to the beginning of the symbol table.
inline const_iterator begin() const { return vmap.begin(); }
- /// @brief Get an iterator to the end of the symbol table.
+ /// Get an iterator to the end of the symbol table.
inline iterator end() { return vmap.end(); }
- /// @brief Get a const_iterator to the end of the symbol table.
+ /// Get a const_iterator to the end of the symbol table.
inline const_iterator end() const { return vmap.end(); }
/// @}
@@ -111,7 +111,7 @@ private:
/// This method adds the provided value \p N to the symbol table. The Value
/// must have a name which is used to place the value in the symbol table.
/// If the inserted name conflicts, this renames the value.
- /// @brief Add a named value to the symbol table
+ /// Add a named value to the symbol table
void reinsertValue(Value *V);
/// createValueName - This method attempts to create a value name and insert
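
A usage sketch of the lookup()/begin()/end() interface documented above; the name "retval" is only an example, and the helper is not part of this header.

#include "llvm/IR/Function.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/raw_ostream.h"

// Sketch: look up a named value and walk a function's symbol table.
void dumpLocalNames(llvm::Function &F) {
  llvm::ValueSymbolTable *ST = F.getValueSymbolTable();
  if (!ST)
    return;
  if (llvm::Value *V = ST->lookup("retval"))
    llvm::errs() << "found: " << *V << "\n";
  for (auto &Entry : *ST)
    llvm::errs() << Entry.getKey() << "\n";
}
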
diff --git a/contrib/llvm/include/llvm/IR/Verifier.h b/contrib/llvm/include/llvm/IR/Verifier.h
index bc10f330bc8a..7255132e1e65 100644
--- a/contrib/llvm/include/llvm/IR/Verifier.h
+++ b/contrib/llvm/include/llvm/IR/Verifier.h
@@ -80,7 +80,7 @@ public:
bool visitTBAAMetadata(Instruction &I, const MDNode *MD);
};
-/// \brief Check a function for errors, useful for use when debugging a
+/// Check a function for errors, useful for use when debugging a
/// pass.
///
/// If there are no errors, the function returns false. If an error is found,
@@ -88,7 +88,7 @@ public:
/// returned.
bool verifyFunction(const Function &F, raw_ostream *OS = nullptr);
-/// \brief Check a module for errors.
+/// Check a module for errors.
///
/// If there are no errors, the function returns false. If an error is
/// found, a message describing the error is written to OS (if
@@ -124,7 +124,7 @@ public:
/// "recovered" from by stripping the debug info.
bool verifyModule(bool &BrokenDebugInfo, const Module &M, raw_ostream *OS);
-/// \brief Create a verifier pass.
+/// Create a verifier pass.
///
/// Check a module or function for validity. This is essentially a pass wrapped
/// around the above verifyFunction and verifyModule routines and
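
A minimal sketch of calling the verifier from client code, based on the return convention documented above.

#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

// Sketch: verifyModule() returns false on success, so a true result means the
// IR is broken and a description has been written to the stream.
bool moduleIsValid(llvm::Module &M) {
  return !llvm::verifyModule(M, &llvm::errs());
}
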
diff --git a/contrib/llvm/include/llvm/IRReader/IRReader.h b/contrib/llvm/include/llvm/IRReader/IRReader.h
index f5621647db06..bedde8954fbb 100644
--- a/contrib/llvm/include/llvm/IRReader/IRReader.h
+++ b/contrib/llvm/include/llvm/IRReader/IRReader.h
@@ -15,6 +15,7 @@
#ifndef LLVM_IRREADER_IRREADER_H
#define LLVM_IRREADER_IRREADER_H
+#include "llvm/ADT/StringRef.h"
#include <memory>
namespace llvm {
@@ -40,9 +41,11 @@ getLazyIRFileModule(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
/// This option should only be set to false by llvm-as
/// for use inside the LLVM test suite!
+/// \param DataLayoutString Override the datalayout in the LLVM assembly.
std::unique_ptr<Module> parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
LLVMContext &Context,
- bool UpgradeDebugInfo = true);
+ bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
/// If the given file holds a bitcode image, return a Module for it.
/// Otherwise, attempt to parse it as LLVM Assembly and return a Module
@@ -50,9 +53,11 @@ std::unique_ptr<Module> parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
/// \param UpgradeDebugInfo Run UpgradeDebugInfo, which runs the Verifier.
/// This option should only be set to false by llvm-as
/// for use inside the LLVM test suite!
+/// \param DataLayoutString Override the datalayout in the LLVM assembly.
std::unique_ptr<Module> parseIRFile(StringRef Filename, SMDiagnostic &Err,
LLVMContext &Context,
- bool UpgradeDebugInfo = true);
+ bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "");
}
#endif
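
A sketch of the extended parseIRFile() call; the datalayout string passed here only illustrates the override and is not a value this header prescribes.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include <memory>

// Sketch: parse a file while forcing a specific datalayout onto the module.
std::unique_ptr<llvm::Module> loadIR(llvm::StringRef Path,
                                     llvm::LLVMContext &Ctx) {
  llvm::SMDiagnostic Err;
  return llvm::parseIRFile(Path, Err, Ctx, /*UpgradeDebugInfo=*/true,
                           /*DataLayoutString=*/"e-m:e-i64:64-n32:64");
}
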
diff --git a/contrib/llvm/include/llvm/InitializePasses.h b/contrib/llvm/include/llvm/InitializePasses.h
index 4c79333f5d2e..d67b1d48f274 100644
--- a/contrib/llvm/include/llvm/InitializePasses.h
+++ b/contrib/llvm/include/llvm/InitializePasses.h
@@ -37,6 +37,9 @@ void initializeVectorization(PassRegistry&);
/// Initialize all passes linked into the InstCombine library.
void initializeInstCombine(PassRegistry&);
+/// Initialize all passes linked into the AggressiveInstCombine library.
+void initializeAggressiveInstCombine(PassRegistry&);
+
/// Initialize all passes linked into the IPO library.
void initializeIPO(PassRegistry&);
@@ -64,6 +67,7 @@ void initializeADCELegacyPassPass(PassRegistry&);
void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&);
void initializeAddressSanitizerModulePass(PassRegistry&);
void initializeAddressSanitizerPass(PassRegistry&);
+void initializeAggressiveInstCombinerLegacyPassPass(PassRegistry&);
void initializeAliasSetPrinterPass(PassRegistry&);
void initializeAlignmentFromAssumptionsPass(PassRegistry&);
void initializeAlwaysInlinerLegacyPassPass(PassRegistry&);
@@ -73,34 +77,34 @@ void initializeAtomicExpandPass(PassRegistry&);
void initializeBDCELegacyPassPass(PassRegistry&);
void initializeBarrierNoopPass(PassRegistry&);
void initializeBasicAAWrapperPassPass(PassRegistry&);
-void initializeBlockExtractorPassPass(PassRegistry&);
+void initializeBlockExtractorPass(PassRegistry &);
void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry&);
void initializeBoundsCheckingLegacyPassPass(PassRegistry&);
void initializeBranchFolderPassPass(PassRegistry&);
void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
void initializeBranchRelaxationPass(PassRegistry&);
void initializeBreakCriticalEdgesPass(PassRegistry&);
-void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
+void initializeBreakFalseDepsPass(PassRegistry&);
void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
void initializeCFGPrinterLegacyPassPass(PassRegistry&);
void initializeCFGSimplifyPassPass(PassRegistry&);
void initializeCFGViewerLegacyPassPass(PassRegistry&);
+void initializeCFIInstrInserterPass(PassRegistry&);
void initializeCFLAndersAAWrapperPassPass(PassRegistry&);
void initializeCFLSteensAAWrapperPassPass(PassRegistry&);
void initializeCallGraphDOTPrinterPass(PassRegistry&);
void initializeCallGraphPrinterLegacyPassPass(PassRegistry&);
void initializeCallGraphViewerPass(PassRegistry&);
void initializeCallGraphWrapperPassPass(PassRegistry&);
+void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
+void initializeCalledValuePropagationLegacyPassPass(PassRegistry &);
void initializeCodeGenPreparePass(PassRegistry&);
void initializeConstantHoistingLegacyPassPass(PassRegistry&);
-void initializeCalledValuePropagationLegacyPassPass(PassRegistry &);
void initializeConstantMergeLegacyPassPass(PassRegistry&);
void initializeConstantPropagationPass(PassRegistry&);
void initializeCorrelatedValuePropagationPass(PassRegistry&);
void initializeCostModelAnalysisPass(PassRegistry&);
-void initializeEntryExitInstrumenterPass(PassRegistry&);
-void initializePostInlineEntryExitInstrumenterPass(PassRegistry&);
void initializeCrossDSOCFIPass(PassRegistry&);
void initializeDAEPass(PassRegistry&);
void initializeDAHPass(PassRegistry&);
@@ -114,8 +118,8 @@ void initializeDemandedBitsWrapperPassPass(PassRegistry&);
void initializeDependenceAnalysisPass(PassRegistry&);
void initializeDependenceAnalysisWrapperPassPass(PassRegistry&);
void initializeDetectDeadLanesPass(PassRegistry&);
-void initializeDivergenceAnalysisPass(PassRegistry&);
void initializeDivRemPairsLegacyPassPass(PassRegistry&);
+void initializeDivergenceAnalysisPass(PassRegistry&);
void initializeDomOnlyPrinterPass(PassRegistry&);
void initializeDomOnlyViewerPass(PassRegistry&);
void initializeDomPrinterPass(PassRegistry&);
@@ -126,9 +130,12 @@ void initializeDwarfEHPreparePass(PassRegistry&);
void initializeEarlyCSELegacyPassPass(PassRegistry&);
void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry&);
void initializeEarlyIfConverterPass(PassRegistry&);
+void initializeEarlyMachineLICMPass(PassRegistry&);
+void initializeEarlyTailDuplicatePass(PassRegistry&);
void initializeEdgeBundlesPass(PassRegistry&);
void initializeEfficiencySanitizerPass(PassRegistry&);
void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
+void initializeEntryExitInstrumenterPass(PassRegistry&);
void initializeExpandISelPseudosPass(PassRegistry&);
void initializeExpandMemCmpPassPass(PassRegistry&);
void initializeExpandPostRAPass(PassRegistry&);
@@ -154,21 +161,22 @@ void initializeGlobalOptLegacyPassPass(PassRegistry&);
void initializeGlobalSplitPass(PassRegistry&);
void initializeGlobalsAAWrapperPassPass(PassRegistry&);
void initializeGuardWideningLegacyPassPass(PassRegistry&);
+void initializeHWAddressSanitizerPass(PassRegistry&);
void initializeIPCPPass(PassRegistry&);
void initializeIPSCCPLegacyPassPass(PassRegistry&);
+void initializeIRCELegacyPassPass(PassRegistry&);
void initializeIRTranslatorPass(PassRegistry&);
void initializeIVUsersWrapperPassPass(PassRegistry&);
void initializeIfConverterPass(PassRegistry&);
void initializeImplicitNullChecksPass(PassRegistry&);
void initializeIndVarSimplifyLegacyPassPass(PassRegistry&);
void initializeIndirectBrExpandPassPass(PassRegistry&);
-void initializeInductiveRangeCheckEliminationPass(PassRegistry&);
void initializeInferAddressSpacesPass(PassRegistry&);
void initializeInferFunctionAttrsLegacyPassPass(PassRegistry&);
void initializeInlineCostAnalysisPass(PassRegistry&);
void initializeInstCountPass(PassRegistry&);
void initializeInstNamerPass(PassRegistry&);
-void initializeInstSimplifierPass(PassRegistry&);
+void initializeInstSimplifyLegacyPassPass(PassRegistry &);
void initializeInstrProfilingLegacyPassPass(PassRegistry&);
void initializeInstructionCombiningPassPass(PassRegistry&);
void initializeInstructionSelectPass(PassRegistry&);
@@ -204,6 +212,7 @@ void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&);
void initializeLoopDeletionLegacyPassPass(PassRegistry&);
void initializeLoopDistributeLegacyPass(PassRegistry&);
void initializeLoopExtractorPass(PassRegistry&);
+void initializeLoopGuardWideningLegacyPassPass(PassRegistry&);
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&);
void initializeLoopInfoWrapperPassPass(PassRegistry&);
void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&);
@@ -216,6 +225,7 @@ void initializeLoopRotateLegacyPassPass(PassRegistry&);
void initializeLoopSimplifyCFGLegacyPassPass(PassRegistry&);
void initializeLoopSimplifyPass(PassRegistry&);
void initializeLoopStrengthReducePass(PassRegistry&);
+void initializeLoopUnrollAndJamPass(PassRegistry&);
void initializeLoopUnrollPass(PassRegistry&);
void initializeLoopUnswitchPass(PassRegistry&);
void initializeLoopVectorizePass(PassRegistry&);
@@ -229,6 +239,7 @@ void initializeLowerIntrinsicsPass(PassRegistry&);
void initializeLowerInvokeLegacyPassPass(PassRegistry&);
void initializeLowerSwitchPass(PassRegistry&);
void initializeLowerTypeTestsPass(PassRegistry&);
+void initializeMIRCanonicalizerPass(PassRegistry &);
void initializeMIRPrintingPassPass(PassRegistry&);
void initializeMachineBlockFrequencyInfoPass(PassRegistry&);
void initializeMachineBlockPlacementPass(PassRegistry&);
@@ -265,6 +276,7 @@ void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
void initializeMetaRenamerPass(PassRegistry&);
void initializeModuleDebugInfoPrinterPass(PassRegistry&);
void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&);
+void initializeMustExecutePrinterPass(PassRegistry&);
void initializeNameAnonGlobalLegacyPassPass(PassRegistry&);
void initializeNaryReassociateLegacyPassPass(PassRegistry&);
void initializeNewGVNLegacyPassPass(PassRegistry&);
@@ -286,6 +298,7 @@ void initializePartialInlinerLegacyPassPass(PassRegistry&);
void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry&);
void initializePatchableFunctionPass(PassRegistry&);
void initializePeepholeOptimizerPass(PassRegistry&);
+void initializePhiValuesWrapperPassPass(PassRegistry&);
void initializePhysicalRegisterUsageInfoPass(PassRegistry&);
void initializePlaceBackedgeSafepointsImplPass(PassRegistry&);
void initializePlaceSafepointsPass(PassRegistry&);
@@ -294,9 +307,11 @@ void initializePostDomOnlyViewerPass(PassRegistry&);
void initializePostDomPrinterPass(PassRegistry&);
void initializePostDomViewerPass(PassRegistry&);
void initializePostDominatorTreeWrapperPassPass(PassRegistry&);
+void initializePostInlineEntryExitInstrumenterPass(PassRegistry&);
void initializePostMachineSchedulerPass(PassRegistry&);
void initializePostOrderFunctionAttrsLegacyPassPass(PassRegistry&);
void initializePostRAHazardRecognizerPass(PassRegistry&);
+void initializePostRAMachineSinkingPass(PassRegistry&);
void initializePostRASchedulerPass(PassRegistry&);
void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry&);
void initializePredicateInfoPrinterLegacyPassPass(PassRegistry&);
@@ -308,11 +323,14 @@ void initializeProfileSummaryInfoWrapperPassPass(PassRegistry&);
void initializePromoteLegacyPassPass(PassRegistry&);
void initializePruneEHPass(PassRegistry&);
void initializeRABasicPass(PassRegistry&);
-void initializeRegAllocFastPass(PassRegistry&);
void initializeRAGreedyPass(PassRegistry&);
+void initializeReachingDefAnalysisPass(PassRegistry&);
void initializeReassociateLegacyPassPass(PassRegistry&);
+void initializeRegAllocFastPass(PassRegistry&);
void initializeRegBankSelectPass(PassRegistry&);
void initializeRegToMemPass(PassRegistry&);
+void initializeRegUsageInfoCollectorPass(PassRegistry&);
+void initializeRegUsageInfoPropagationPass(PassRegistry&);
void initializeRegionInfoPassPass(PassRegistry&);
void initializeRegionOnlyPrinterPass(PassRegistry&);
void initializeRegionOnlyViewerPass(PassRegistry&);
@@ -324,12 +342,12 @@ void initializeResetMachineFunctionPass(PassRegistry&);
void initializeReversePostOrderFunctionAttrsLegacyPassPass(PassRegistry&);
void initializeRewriteStatepointsForGCLegacyPassPass(PassRegistry &);
void initializeRewriteSymbolsLegacyPassPass(PassRegistry&);
-void initializeSafepointIRVerifierPass(PassRegistry&);
void initializeSCCPLegacyPassPass(PassRegistry&);
void initializeSCEVAAWrapperPassPass(PassRegistry&);
void initializeSLPVectorizerPass(PassRegistry&);
void initializeSROALegacyPassPass(PassRegistry&);
void initializeSafeStackLegacyPassPass(PassRegistry&);
+void initializeSafepointIRVerifierPass(PassRegistry&);
void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&);
void initializeSanitizerCoverageModulePass(PassRegistry&);
void initializeScalarEvolutionWrapperPassPass(PassRegistry&);
@@ -361,9 +379,8 @@ void initializeStripNonDebugSymbolsPass(PassRegistry&);
void initializeStripNonLineTableDebugInfoPass(PassRegistry&);
void initializeStripSymbolsPass(PassRegistry&);
void initializeStructurizeCFGPass(PassRegistry&);
-void initializeHWAddressSanitizerPass(PassRegistry&);
void initializeTailCallElimPass(PassRegistry&);
-void initializeTailDuplicatePassPass(PassRegistry&);
+void initializeTailDuplicatePass(PassRegistry&);
void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&);
void initializeTargetPassConfigPass(PassRegistry&);
void initializeTargetTransformInfoWrapperPassPass(PassRegistry&);
@@ -377,12 +394,12 @@ void initializeUnreachableMachineBlockElimPass(PassRegistry&);
void initializeVerifierLegacyPassPass(PassRegistry&);
void initializeVirtRegMapPass(PassRegistry&);
void initializeVirtRegRewriterPass(PassRegistry&);
+void initializeWasmEHPreparePass(PassRegistry&);
void initializeWholeProgramDevirtPass(PassRegistry&);
void initializeWinEHPreparePass(PassRegistry&);
void initializeWriteBitcodePassPass(PassRegistry&);
void initializeWriteThinLTOBitcodePass(PassRegistry&);
void initializeXRayInstrumentationPass(PassRegistry&);
-void initializeMIRCanonicalizerPass(PassRegistry &);
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/LTO/Caching.h b/contrib/llvm/include/llvm/LTO/Caching.h
index 25c719a68b92..7201ab31f5b0 100644
--- a/contrib/llvm/include/llvm/LTO/Caching.h
+++ b/contrib/llvm/include/llvm/LTO/Caching.h
@@ -24,13 +24,8 @@ namespace lto {
/// This type defines the callback to add a pre-existing native object file
/// (e.g. in a cache).
///
-/// Path is generally expected to be a valid path for the file at the point when
-/// the AddBufferFn function is called, but clients should prefer to access MB
-/// directly in order to avoid a potential race condition.
-///
/// Buffer callbacks must be thread safe.
-typedef std::function<void(unsigned Task, std::unique_ptr<MemoryBuffer> MB,
- StringRef Path)>
+typedef std::function<void(unsigned Task, std::unique_ptr<MemoryBuffer> MB)>
AddBufferFn;
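
A sketch of a client callback matching the narrowed signature: with the Path parameter gone, the cached object is consumed through the buffer itself. The storage vector is illustrative.

#include "llvm/LTO/Caching.h"
#include "llvm/Support/MemoryBuffer.h"
#include <memory>
#include <utility>
#include <vector>

// Sketch: collect cached native objects as they are reported.
std::vector<std::unique_ptr<llvm::MemoryBuffer>> CachedObjects;

llvm::lto::AddBufferFn AddBuffer =
    [](unsigned Task, std::unique_ptr<llvm::MemoryBuffer> MB) {
      (void)Task; // the task number is unused in this sketch
      CachedObjects.push_back(std::move(MB));
    };
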
/// Create a local file system cache which uses the given cache directory and
diff --git a/contrib/llvm/include/llvm/LTO/Config.h b/contrib/llvm/include/llvm/LTO/Config.h
index 4bd981c090b1..57bba5e34840 100644
--- a/contrib/llvm/include/llvm/LTO/Config.h
+++ b/contrib/llvm/include/llvm/LTO/Config.h
@@ -73,6 +73,14 @@ struct Config {
/// Sample PGO profile path.
std::string SampleProfile;
+ /// The directory to store .dwo files.
+ std::string DwoDir;
+
+ /// The path to write a .dwo file to. This should generally only be used when
+ /// running an individual backend directly via thinBackend(), as otherwise
+ /// all .dwo files will be written to the same path.
+ std::string DwoPath;
+
/// Optimization remarks file path.
std::string RemarksFilename = "";
@@ -82,6 +90,9 @@ struct Config {
/// Whether to emit the pass manager debugging information.
bool DebugPassManager = false;
+ /// Statistics output file path.
+ std::string StatsFile;
+
bool ShouldDiscardValueNames = true;
DiagnosticHandlerFunction DiagHandler;
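
A sketch of filling in the new split-DWARF and statistics fields; the paths are placeholders, not defaults suggested by this header.

#include "llvm/LTO/Config.h"

// Sketch: build an LTO config that writes .dwo files and statistics output.
llvm::lto::Config makeLTOConfig() {
  llvm::lto::Config Conf;
  Conf.DwoDir = "obj/dwo";          // per-module .dwo files land here
  Conf.StatsFile = "obj/lto.stats"; // statistics output path
  return Conf;
}
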
diff --git a/contrib/llvm/include/llvm/LTO/LTO.h b/contrib/llvm/include/llvm/LTO/LTO.h
index 2a2b59847281..7d6beab6b441 100644
--- a/contrib/llvm/include/llvm/LTO/LTO.h
+++ b/contrib/llvm/include/llvm/LTO/LTO.h
@@ -19,7 +19,6 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSet.h"
-#include "llvm/Analysis/ObjectUtils.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/LTO/Config.h"
@@ -210,10 +209,16 @@ ThinBackend createInProcessThinBackend(unsigned ParallelismLevel);
/// appends ".thinlto.bc" and writes the index to that path. If
/// ShouldEmitImportsFiles is true it also writes a list of imported files to a
/// similar path with ".imports" appended instead.
+/// LinkedObjectsFile is an output stream to write the list of object files for
+/// the final ThinLTO linking. It can be nullptr.
+/// OnWrite is a callback which receives the module identifier and notifies the
+/// LTO user that the index file for the module (and optionally the imports
+/// file) was created.
+using IndexWriteCallback = std::function<void(const std::string &)>;
ThinBackend createWriteIndexesThinBackend(std::string OldPrefix,
std::string NewPrefix,
bool ShouldEmitImportsFiles,
- std::string LinkedObjectsFile);
+ raw_fd_ostream *LinkedObjectsFile,
+ IndexWriteCallback OnWrite);
/// This class implements a resolution-based interface to LLVM's LTO
/// functionality. It supports regular LTO, parallel LTO code generation and
@@ -320,6 +325,14 @@ private:
bool UnnamedAddr = true;
+ /// True if the module contains the prevailing definition.
+ bool Prevailing = false;
+
+ /// Returns true if the module contains the prevailing definition and the
+ /// symbol is an IR symbol. For example, when a module-level inline asm block
+ /// is used, a symbol can be prevailing in the module but have no IR name.
+ bool isPrevailingIRSymbol() const { return Prevailing && !IRName.empty(); }
+
/// This field keeps track of the partition number of this global. The
/// regular LTO object is partition 0, while each ThinLTO object has its own
/// partition number from 1 onwards.
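
A sketch of the updated factory call: the object list now goes to an already open stream (or nullptr), and OnWrite reports each module whose index file was written. The empty prefixes are placeholders.

#include "llvm/LTO/LTO.h"
#include "llvm/Support/raw_ostream.h"

// Sketch: create a write-indexes ThinLTO backend with a logging callback.
llvm::lto::ThinBackend makeIndexBackend(llvm::raw_fd_ostream *ObjListOS) {
  llvm::lto::IndexWriteCallback OnWrite = [](const std::string &Module) {
    llvm::errs() << "wrote index for " << Module << "\n";
  };
  return llvm::lto::createWriteIndexesThinBackend(
      /*OldPrefix=*/"", /*NewPrefix=*/"", /*ShouldEmitImportsFiles=*/true,
      ObjListOS, OnWrite);
}
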
diff --git a/contrib/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/contrib/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index d794535700e5..b32a972542c8 100644
--- a/contrib/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/contrib/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -131,7 +131,8 @@ public:
* To avoid filling the disk space, a few knobs are provided:
* - The pruning interval limit the frequency at which the garbage collector
* will try to scan the cache directory to prune it from expired entries.
- * Setting to -1 disable the pruning (default).
+ * Setting to -1 disables the pruning (default). Setting to 0 will force
+ * pruning to occur.
* - The pruning expiration time indicates to the garbage collector how old
* an entry needs to be to be removed.
* - Finally, the garbage collector can be instructed to prune the cache till
@@ -149,10 +150,9 @@ public:
void setCacheDir(std::string Path) { CacheOptions.Path = std::move(Path); }
/// Cache policy: interval (seconds) between two prunes of the cache. Set to a
- /// negative value to disable pruning. A value of 0 will be ignored.
+ /// negative value to disable pruning. A value of 0 will force pruning to
+ /// occur.
void setCachePruningInterval(int Interval) {
- if (Interval == 0)
- return;
if(Interval < 0)
CacheOptions.Policy.Interval.reset();
else
@@ -168,8 +168,8 @@ public:
/**
* Sets the maximum cache size that can be persistent across builds, in terms
- * of percentage of the available space on the the disk. Set to 100 to
- * indicate no limit, 50 to indicate that the cache size will not be left over
+ * of percentage of the available space on the disk. Set to 100 to indicate
+ * no limit, 50 to indicate that the cache size will not grow over
* half the available space. A value over 100 will be reduced to 100, and a
* value of 0 will be ignored.
*
@@ -184,6 +184,21 @@ public:
CacheOptions.Policy.MaxSizePercentageOfAvailableSpace = Percentage;
}
+ /// Cache policy: the maximum size for the cache directory in bytes. A value
+ /// over the amount of available space on the disk will be reduced to the
+ /// amount of available space. A value of 0 will be ignored.
+ void setCacheMaxSizeBytes(unsigned MaxSizeBytes) {
+ if (MaxSizeBytes)
+ CacheOptions.Policy.MaxSizeBytes = MaxSizeBytes;
+ }
+
+ /// Cache policy: the maximum number of files in the cache directory. A value
+ /// of 0 will be ignored.
+ void setCacheMaxSizeFiles(unsigned MaxSizeFiles) {
+ if (MaxSizeFiles)
+ CacheOptions.Policy.MaxSizeFiles = MaxSizeFiles;
+ }
+
/**@}*/
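
A sketch exercising the cache-policy setters shown above; the directory and limits are placeholder values, not recommendations.

#include "llvm/LTO/legacy/ThinLTOCodeGenerator.h"

// Sketch: configure the incremental-build cache for a ThinLTO code generator.
void configureCache(llvm::ThinLTOCodeGenerator &CG) {
  CG.setCacheDir("/tmp/thinlto.cache");
  CG.setCachePruningInterval(1200);  // seconds between prunes; -1 disables, 0 forces
  CG.setCacheMaxSizeBytes(1u << 30); // cap the cache at 1 GiB
  CG.setCacheMaxSizeFiles(100000);   // and at 100k entries
}
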
/// Set the path to a directory in which to save temporaries at various stages of
diff --git a/contrib/llvm/include/llvm/LinkAllPasses.h b/contrib/llvm/include/llvm/LinkAllPasses.h
index 39d1ec6cffb5..bd432c58b613 100644
--- a/contrib/llvm/include/llvm/LinkAllPasses.h
+++ b/contrib/llvm/include/llvm/LinkAllPasses.h
@@ -39,14 +39,18 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/Support/Valgrind.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
#include "llvm/Transforms/ObjCARC.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include "llvm/Transforms/Vectorize.h"
@@ -64,6 +68,7 @@ namespace {
(void) llvm::createAAEvalPass();
(void) llvm::createAggressiveDCEPass();
+ (void) llvm::createAggressiveInstCombinerPass();
(void) llvm::createBitTrackingDCEPass();
(void) llvm::createArgumentPromotionPass();
(void) llvm::createAlignmentFromAssumptionsPass();
@@ -107,10 +112,12 @@ namespace {
(void) llvm::createGlobalOptimizerPass();
(void) llvm::createGlobalsAAWrapperPass();
(void) llvm::createGuardWideningPass();
+ (void) llvm::createLoopGuardWideningPass();
(void) llvm::createIPConstantPropagationPass();
(void) llvm::createIPSCCPPass();
(void) llvm::createInductiveRangeCheckEliminationPass();
(void) llvm::createIndVarSimplifyPass();
+ (void) llvm::createInstSimplifyLegacyPass();
(void) llvm::createInstructionCombiningPass();
(void) llvm::createInternalizePass();
(void) llvm::createLCSSAPass();
@@ -125,6 +132,7 @@ namespace {
(void) llvm::createLoopStrengthReducePass();
(void) llvm::createLoopRerollPass();
(void) llvm::createLoopUnrollPass();
+ (void) llvm::createLoopUnrollAndJamPass();
(void) llvm::createLoopUnswitchPass();
(void) llvm::createLoopVersioningLICMPass();
(void) llvm::createLoopIdiomPass();
@@ -195,7 +203,6 @@ namespace {
(void) llvm::createLowerAtomicPass();
(void) llvm::createCorrelatedValuePropagationPass();
(void) llvm::createMemDepPrinter();
- (void) llvm::createInstructionSimplifierPass();
(void) llvm::createLoopVectorizePass();
(void) llvm::createSLPVectorizerPass();
(void) llvm::createLoadStoreVectorizerPass();
@@ -207,6 +214,7 @@ namespace {
(void) llvm::createRewriteSymbolsPass();
(void) llvm::createStraightLineStrengthReducePass();
(void) llvm::createMemDerefPrinter();
+ (void) llvm::createMustExecutePrinter();
(void) llvm::createFloat2IntPass();
(void) llvm::createEliminateAvailableExternallyPass();
(void) llvm::createScalarizeMaskedMemIntrinPass();
diff --git a/contrib/llvm/include/llvm/Linker/Linker.h b/contrib/llvm/include/llvm/Linker/Linker.h
index 628e0112bd9d..7776c720ec53 100644
--- a/contrib/llvm/include/llvm/Linker/Linker.h
+++ b/contrib/llvm/include/llvm/Linker/Linker.h
@@ -34,7 +34,7 @@ public:
Linker(Module &M);
- /// \brief Link \p Src into the composite.
+ /// Link \p Src into the composite.
///
/// Passing OverrideSymbols as true will have symbols from Src
/// shadow those in the Dest.
diff --git a/contrib/llvm/include/llvm/MC/MCAsmBackend.h b/contrib/llvm/include/llvm/MC/MCAsmBackend.h
index ef2007ff6920..030d3c05aa5a 100644
--- a/contrib/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/contrib/llvm/include/llvm/MC/MCAsmBackend.h
@@ -16,6 +16,7 @@
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFragment.h"
+#include "llvm/Support/Endian.h"
#include <cstdint>
#include <memory>
@@ -29,6 +30,7 @@ struct MCFixupKindInfo;
class MCFragment;
class MCInst;
class MCObjectStreamer;
+class MCObjectTargetWriter;
class MCObjectWriter;
struct MCCodePaddingContext;
class MCRelaxableFragment;
@@ -41,21 +43,31 @@ class MCAsmBackend {
std::unique_ptr<MCCodePadder> CodePadder;
protected: // Can only create subclasses.
- MCAsmBackend();
- MCAsmBackend(std::unique_ptr<MCCodePadder> TargetCodePadder);
+ MCAsmBackend(support::endianness Endian);
public:
MCAsmBackend(const MCAsmBackend &) = delete;
MCAsmBackend &operator=(const MCAsmBackend &) = delete;
virtual ~MCAsmBackend();
+ const support::endianness Endian;
+
/// lifetime management
virtual void reset() {}
/// Create a new MCObjectWriter instance for use by the assembler backend to
/// emit the final object file.
- virtual std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const = 0;
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const;
+
+ /// Create an MCObjectWriter that writes two object files: a .o file which is
+ /// linked into the final program and a .dwo file which is used by debuggers.
+ /// This function is only supported with ELF targets.
+ std::unique_ptr<MCObjectWriter>
+ createDwoObjectWriter(raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS) const;
+
+ virtual std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const = 0;
/// \name Target Fixup Interfaces
/// @{
@@ -80,9 +92,16 @@ public:
/// the offset specified by the fixup and following the fixup kind as
/// appropriate. Errors (such as an out of range fixup value) should be
/// reported via \p Ctx.
+ /// The \p STI is present only for fragments of type MCRelaxableFragment and
+ /// MCDataFragment with hasInstructions() == true.
virtual void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const = 0;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const = 0;
+
+ /// Check whether the given target requires emitting differences of two
+ /// symbols as a set of relocations.
+ virtual bool requiresDiffExpressionRelocations() const { return false; }
/// @}
@@ -92,14 +111,18 @@ public:
/// Check whether the given instruction may need relaxation.
///
/// \param Inst - The instruction to test.
- virtual bool mayNeedRelaxation(const MCInst &Inst) const = 0;
+ /// \param STI - The MCSubtargetInfo in effect when the instruction was
+ /// encoded.
+ virtual bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const = 0;
/// Target specific predicate for whether a given fixup requires the
/// associated instruction to be relaxed.
virtual bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
uint64_t Value,
const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const;
+ const MCAsmLayout &Layout,
+ const bool WasForced) const;
/// Simple predicate for targets where !Resolved implies requiring relaxation
virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -127,7 +150,7 @@ public:
/// target cannot generate such a sequence, it should return an error.
///
/// \return - True on success.
- virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const = 0;
+ virtual bool writeNopData(raw_ostream &OS, uint64_t Count) const = 0;
/// Give backend an opportunity to finish layout after relaxation
virtual void finishLayout(MCAssembler const &Asm,
@@ -136,7 +159,7 @@ public:
/// Handle any target-specific assembler flags. By default, do nothing.
virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {}
- /// \brief Generate the compact unwind encoding for the CFI instructions.
+ /// Generate the compact unwind encoding for the CFI instructions.
virtual uint32_t
generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction>) const {
return 0;
@@ -173,7 +196,7 @@ public:
/// \param PF The fragment to relax.
/// \param Layout Code layout information.
///
- /// \returns true iff any relaxation occured.
+ /// \returns true iff any relaxation occurred.
bool relaxFragment(MCPaddingFragment *PF, MCAsmLayout &Layout);
};
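
A sketch of the logic a writeNopData() override now contains, written as a standalone helper for brevity: padding is emitted directly to the stream instead of through an MCObjectWriter. The single-byte 0x90 NOP is purely illustrative; real backends choose target-appropriate encodings.

#include "llvm/Support/raw_ostream.h"
#include <cstdint>

// Sketch: emit Count bytes of NOP padding to the output stream.
static bool writeNops(llvm::raw_ostream &OS, uint64_t Count) {
  for (uint64_t I = 0; I != Count; ++I)
    OS << '\x90';
  return true;
}
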
diff --git a/contrib/llvm/include/llvm/MC/MCAsmInfo.h b/contrib/llvm/include/llvm/MC/MCAsmInfo.h
index c538c46fc072..120fb8fa7492 100644
--- a/contrib/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/contrib/llvm/include/llvm/MC/MCAsmInfo.h
@@ -84,6 +84,15 @@ protected:
/// directive for emitting thread local BSS Symbols. Default is false.
bool HasMachoTBSSDirective = false;
+ /// True if this is a non-GNU COFF target. The COFF port of the GNU linker
+ /// doesn't handle associative comdats in the way that we would like to use
+ /// them.
+ bool HasCOFFAssociativeComdats = false;
+
+ /// True if this is a non-GNU COFF target. For GNU targets, we don't generate
+ /// constants into comdat sections.
+ bool HasCOFFComdatConstants = false;
+
/// This is the maximum possible length of an instruction, which is needed to
/// compute the size of an inline asm. Defaults to 4.
unsigned MaxInstLength = 4;
@@ -344,6 +353,10 @@ protected:
/// For example, foo(plt) instead of foo@plt. Defaults to false.
bool UseParensForSymbolVariant = false;
+ /// True if the target supports flags in the ".loc" directive, false if only
+ /// location is allowed.
+ bool SupportsExtendedDwarfLocDirective = true;
+
//===--- Prologue State ----------------------------------------------===//
std::vector<MCCFIInstruction> InitialFrameState;
@@ -416,7 +429,7 @@ public:
return nullptr;
}
- /// \brief True if the section is atomized using the symbols in it.
+ /// True if the section is atomized using the symbols in it.
/// This is false if the section is not atomized at all (most ELF sections) or
/// if it is atomized based on its contents (MachO' __TEXT,__cstring for
/// example).
@@ -459,6 +472,8 @@ public:
bool hasMachoZeroFillDirective() const { return HasMachoZeroFillDirective; }
bool hasMachoTBSSDirective() const { return HasMachoTBSSDirective; }
+ bool hasCOFFAssociativeComdats() const { return HasCOFFAssociativeComdats; }
+ bool hasCOFFComdatConstants() const { return HasCOFFComdatConstants; }
unsigned getMaxInstLength() const { return MaxInstLength; }
unsigned getMinInstAlignment() const { return MinInstAlignment; }
bool getDollarIsPC() const { return DollarIsPC; }
@@ -579,6 +594,9 @@ public:
bool doDwarfFDESymbolsUseAbsDiff() const { return DwarfFDESymbolsUseAbsDiff; }
bool useDwarfRegNumForCFI() const { return DwarfRegNumForCFI; }
bool useParensForSymbolVariant() const { return UseParensForSymbolVariant; }
+ bool supportsExtendedDwarfLocDirective() const {
+ return SupportsExtendedDwarfLocDirective;
+ }
void addInitialFrameState(const MCCFIInstruction &Inst) {
InitialFrameState.push_back(Inst);
diff --git a/contrib/llvm/include/llvm/MC/MCAsmLayout.h b/contrib/llvm/include/llvm/MC/MCAsmLayout.h
index 1b20d5b804a4..b711db319302 100644
--- a/contrib/llvm/include/llvm/MC/MCAsmLayout.h
+++ b/contrib/llvm/include/llvm/MC/MCAsmLayout.h
@@ -37,11 +37,11 @@ class MCAsmLayout {
/// lower ordinal will be valid.
mutable DenseMap<const MCSection *, MCFragment *> LastValidFragment;
- /// \brief Make sure that the layout for the given fragment is valid, lazily
+ /// Make sure that the layout for the given fragment is valid, lazily
/// computing it if necessary.
void ensureValid(const MCFragment *F) const;
- /// \brief Is the layout for this fragment valid?
+ /// Is the layout for this fragment valid?
bool isFragmentValid(const MCFragment *F) const;
public:
@@ -50,12 +50,12 @@ public:
/// Get the assembler object this is a layout for.
MCAssembler &getAssembler() const { return Assembler; }
- /// \brief Invalidate the fragments starting with F because it has been
+ /// Invalidate the fragments starting with F because it has been
/// resized. The fragment's size should have already been updated, but
/// its bundle padding will be recomputed.
void invalidateFragmentsFrom(MCFragment *F);
- /// \brief Perform layout for a single fragment, assuming that the previous
+ /// Perform layout for a single fragment, assuming that the previous
/// fragment has already been laid out correctly, and the parent section has
/// been initialized.
void layoutFragment(MCFragment *Fragment);
@@ -72,31 +72,31 @@ public:
/// \name Fragment Layout Data
/// @{
- /// \brief Get the offset of the given fragment inside its containing section.
+ /// Get the offset of the given fragment inside its containing section.
uint64_t getFragmentOffset(const MCFragment *F) const;
/// @}
/// \name Utility Functions
/// @{
- /// \brief Get the address space size of the given section, as it effects
+ /// Get the address space size of the given section, as it affects
/// layout. This may differ from the size reported by \see getSectionSize() by
/// not including section tail padding.
uint64_t getSectionAddressSize(const MCSection *Sec) const;
- /// \brief Get the data size of the given section, as emitted to the object
+ /// Get the data size of the given section, as emitted to the object
/// file. This may include additional padding, or be 0 for virtual sections.
uint64_t getSectionFileSize(const MCSection *Sec) const;
- /// \brief Get the offset of the given symbol, as computed in the current
+ /// Get the offset of the given symbol, as computed in the current
/// layout.
/// \return True on success.
bool getSymbolOffset(const MCSymbol &S, uint64_t &Val) const;
- /// \brief Variant that reports a fatal error if the offset is not computable.
+ /// Variant that reports a fatal error if the offset is not computable.
uint64_t getSymbolOffset(const MCSymbol &S) const;
- /// \brief If this symbol is equivalent to A + Constant, return A.
+ /// If this symbol is equivalent to A + Constant, return A.
const MCSymbol *getBaseSymbol(const MCSymbol &Symbol) const;
/// @}
diff --git a/contrib/llvm/include/llvm/MC/MCAsmMacro.h b/contrib/llvm/include/llvm/MC/MCAsmMacro.h
index dac8d1a80050..09b32c7ea333 100644
--- a/contrib/llvm/include/llvm/MC/MCAsmMacro.h
+++ b/contrib/llvm/include/llvm/MC/MCAsmMacro.h
@@ -10,10 +10,124 @@
#ifndef LLVM_MC_MCASMMACRO_H
#define LLVM_MC_MCASMMACRO_H
-#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SMLoc.h"
+#include <vector>
namespace llvm {
+/// Target independent representation for an assembler token.
+class AsmToken {
+public:
+ enum TokenKind {
+ // Markers
+ Eof, Error,
+
+ // String values.
+ Identifier,
+ String,
+
+ // Integer values.
+ Integer,
+ BigNum, // larger than 64 bits
+
+ // Real values.
+ Real,
+
+ // Comments
+ Comment,
+ HashDirective,
+ // No-value.
+ EndOfStatement,
+ Colon,
+ Space,
+ Plus, Minus, Tilde,
+ Slash, // '/'
+ BackSlash, // '\'
+ LParen, RParen, LBrac, RBrac, LCurly, RCurly,
+ Star, Dot, Comma, Dollar, Equal, EqualEqual,
+
+ Pipe, PipePipe, Caret,
+ Amp, AmpAmp, Exclaim, ExclaimEqual, Percent, Hash,
+ Less, LessEqual, LessLess, LessGreater,
+ Greater, GreaterEqual, GreaterGreater, At,
+
+ // MIPS unary expression operators such as %neg.
+ PercentCall16, PercentCall_Hi, PercentCall_Lo, PercentDtprel_Hi,
+ PercentDtprel_Lo, PercentGot, PercentGot_Disp, PercentGot_Hi, PercentGot_Lo,
+ PercentGot_Ofst, PercentGot_Page, PercentGottprel, PercentGp_Rel, PercentHi,
+ PercentHigher, PercentHighest, PercentLo, PercentNeg, PercentPcrel_Hi,
+ PercentPcrel_Lo, PercentTlsgd, PercentTlsldm, PercentTprel_Hi,
+ PercentTprel_Lo
+ };
+
+private:
+ TokenKind Kind;
+
+ /// A reference to the entire token contents; this is always a pointer into
+ /// a memory buffer owned by the source manager.
+ StringRef Str;
+
+ APInt IntVal;
+
+public:
+ AsmToken() = default;
+ AsmToken(TokenKind Kind, StringRef Str, APInt IntVal)
+ : Kind(Kind), Str(Str), IntVal(std::move(IntVal)) {}
+ AsmToken(TokenKind Kind, StringRef Str, int64_t IntVal = 0)
+ : Kind(Kind), Str(Str), IntVal(64, IntVal, true) {}
+
+ TokenKind getKind() const { return Kind; }
+ bool is(TokenKind K) const { return Kind == K; }
+ bool isNot(TokenKind K) const { return Kind != K; }
+
+ SMLoc getLoc() const;
+ SMLoc getEndLoc() const;
+ SMRange getLocRange() const;
+
+ /// Get the contents of a string token (without quotes).
+ StringRef getStringContents() const {
+ assert(Kind == String && "This token isn't a string!");
+ return Str.slice(1, Str.size() - 1);
+ }
+
+ /// Get the identifier string for the current token, which should be an
+ /// identifier or a string. This gets the portion of the string which should
+ /// be used as the identifier, e.g., it does not include the quotes on
+ /// strings.
+ StringRef getIdentifier() const {
+ if (Kind == Identifier)
+ return getString();
+ return getStringContents();
+ }
+
+ /// Get the string for the current token, this includes all characters (for
+ /// example, the quotes on strings) in the token.
+ ///
+ /// The returned StringRef points into the source manager's memory buffer, and
+ /// is safe to store across calls to Lex().
+ StringRef getString() const { return Str; }
+
+ // FIXME: Don't compute this in advance, it makes every token larger, and is
+ // also not generally what we want (it is nicer for recovery etc. to lex 123br
+ // as a single token, then diagnose as an invalid number).
+ int64_t getIntVal() const {
+ assert(Kind == Integer && "This token isn't an integer!");
+ return IntVal.getZExtValue();
+ }
+
+ APInt getAPIntVal() const {
+ assert((Kind == Integer || Kind == BigNum) &&
+ "This token isn't an integer!");
+ return IntVal;
+ }
+
+ void dump(raw_ostream &OS) const;
+ void dump() const { dump(dbgs()); }
+};
+
struct MCAsmMacroParameter {
StringRef Name;
std::vector<AsmToken> Value;
@@ -21,6 +135,9 @@ struct MCAsmMacroParameter {
bool Vararg = false;
MCAsmMacroParameter() = default;
+
+ void dump() const { dump(dbgs()); }
+ void dump(raw_ostream &OS) const;
};
typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
@@ -32,6 +149,9 @@ struct MCAsmMacro {
public:
MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
: Name(N), Body(B), Parameters(std::move(P)) {}
+
+ void dump() const { dump(dbgs()); }
+ void dump(raw_ostream &OS) const;
};
} // namespace llvm
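
A short sketch of inspecting a token through the accessors defined above; the helper itself is illustrative.

#include "llvm/MC/MCAsmMacro.h"
#include "llvm/Support/raw_ostream.h"

// Sketch: report what kind of token was lexed and its payload.
void describeToken(const llvm::AsmToken &Tok) {
  if (Tok.is(llvm::AsmToken::Integer))
    llvm::errs() << "integer " << Tok.getIntVal() << "\n";
  else if (Tok.is(llvm::AsmToken::Identifier))
    llvm::errs() << "identifier " << Tok.getIdentifier() << "\n";
  else
    llvm::errs() << "other token: " << Tok.getString() << "\n";
}
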
diff --git a/contrib/llvm/include/llvm/MC/MCAssembler.h b/contrib/llvm/include/llvm/MC/MCAssembler.h
index b91b04414021..0f9499d705e4 100644
--- a/contrib/llvm/include/llvm/MC/MCAssembler.h
+++ b/contrib/llvm/include/llvm/MC/MCAssembler.h
@@ -99,11 +99,11 @@ public:
private:
MCContext &Context;
- MCAsmBackend &Backend;
+ std::unique_ptr<MCAsmBackend> Backend;
- MCCodeEmitter &Emitter;
+ std::unique_ptr<MCCodeEmitter> Emitter;
- MCObjectWriter &Writer;
+ std::unique_ptr<MCObjectWriter> Writer;
SectionListType Sections;
@@ -130,7 +130,7 @@ private:
// refactoring too.
mutable SmallPtrSet<const MCSymbol *, 32> ThumbFuncs;
- /// \brief The bundle alignment size currently set in the assembler.
+ /// The bundle alignment size currently set in the assembler.
///
/// By default it's 0, which means bundling is disabled.
unsigned BundleAlignSize;
@@ -162,12 +162,14 @@ private:
/// evaluates to.
/// \param Value [out] On return, the value of the fixup as currently laid
/// out.
+ /// \param WasForced [out] On return, the value in the fixup is set to the
+ /// correct value if WasForced is true, even if evaluateFixup returns false.
/// \return Whether the fixup value was fully resolved. This is true if the
/// \p Value result is fixed, otherwise the value may change due to
/// relocation.
bool evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup,
const MCFragment *DF, MCValue &Target,
- uint64_t &Value) const;
+ uint64_t &Value, bool &WasForced) const;
/// Check whether a fixup can be satisfied, or whether it needs to be relaxed
/// (increased in size, in order to hold its value correctly).
@@ -178,11 +180,11 @@ private:
bool fragmentNeedsRelaxation(const MCRelaxableFragment *IF,
const MCAsmLayout &Layout) const;
- /// \brief Perform one layout iteration and return true if any offsets
+ /// Perform one layout iteration and return true if any offsets
/// were adjusted.
bool layoutOnce(MCAsmLayout &Layout);
- /// \brief Perform one layout iteration of the given section and return true
+ /// Perform one layout iteration of the given section and return true
/// if any offsets were adjusted.
bool layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec);
@@ -214,8 +216,9 @@ public:
// concrete and require clients to pass in a target like object. The other
// option is to make this abstract, and have targets provide concrete
// implementations as we do with AsmParser.
- MCAssembler(MCContext &Context, MCAsmBackend &Backend,
- MCCodeEmitter &Emitter, MCObjectWriter &Writer);
+ MCAssembler(MCContext &Context, std::unique_ptr<MCAsmBackend> Backend,
+ std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> Writer);
MCAssembler(const MCAssembler &) = delete;
MCAssembler &operator=(const MCAssembler &) = delete;
~MCAssembler();
@@ -235,8 +238,8 @@ public:
/// defining a separate atom.
bool isSymbolLinkerVisible(const MCSymbol &SD) const;
- /// Emit the section contents using the given object writer.
- void writeSectionData(const MCSection *Section,
+ /// Emit the section contents to \p OS.
+ void writeSectionData(raw_ostream &OS, const MCSection *Section,
const MCAsmLayout &Layout) const;
/// Check whether a given symbol has been flagged with .thumb_func.
@@ -274,11 +277,17 @@ public:
MCContext &getContext() const { return Context; }
- MCAsmBackend &getBackend() const { return Backend; }
+ MCAsmBackend *getBackendPtr() const { return Backend.get(); }
- MCCodeEmitter &getEmitter() const { return Emitter; }
+ MCCodeEmitter *getEmitterPtr() const { return Emitter.get(); }
- MCObjectWriter &getWriter() const { return Writer; }
+ MCObjectWriter *getWriterPtr() const { return Writer.get(); }
+
+ MCAsmBackend &getBackend() const { return *Backend; }
+
+ MCCodeEmitter &getEmitter() const { return *Emitter; }
+
+ MCObjectWriter &getWriter() const { return *Writer; }
MCDwarfLineTableParams getDWARFLinetableParams() const { return LTParams; }
void setDWARFLinetableParams(MCDwarfLineTableParams P) { LTParams = P; }
@@ -409,6 +418,13 @@ public:
const MCLOHContainer &getLOHContainer() const {
return const_cast<MCAssembler *>(this)->getLOHContainer();
}
+
+ struct CGProfileEntry {
+ const MCSymbolRefExpr *From;
+ const MCSymbolRefExpr *To;
+ uint64_t Count;
+ };
+ std::vector<CGProfileEntry> CGProfile;
/// @}
/// \name Backend Data Access
/// @{
@@ -424,21 +440,22 @@ public:
FileNames.push_back(FileName);
}
- /// \brief Write the necessary bundle padding to the given object writer.
+ /// Write the necessary bundle padding to \p OS.
/// Expects a fragment \p F containing instructions and its size \p FSize.
- void writeFragmentPadding(const MCFragment &F, uint64_t FSize,
- MCObjectWriter *OW) const;
+ void writeFragmentPadding(raw_ostream &OS, const MCEncodedFragment &F,
+ uint64_t FSize) const;
/// @}
void dump() const;
};
-/// \brief Compute the amount of padding required before the fragment \p F to
+/// Compute the amount of padding required before the fragment \p F to
/// obey bundling restrictions, where \p FOffset is the fragment's offset in
/// its section and \p FSize is the fragment's size.
-uint64_t computeBundlePadding(const MCAssembler &Assembler, const MCFragment *F,
- uint64_t FOffset, uint64_t FSize);
+uint64_t computeBundlePadding(const MCAssembler &Assembler,
+ const MCEncodedFragment *F, uint64_t FOffset,
+ uint64_t FSize);
} // end namespace llvm
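
A sketch of a caller adapted to the ownership change above: the backend, emitter and writer are handed over as unique_ptrs, and the reference accessors still dereference the owned objects afterwards.

#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectWriter.h"
#include <memory>
#include <utility>

// Sketch: construct an assembler that owns its backend, emitter and writer.
void buildAssembler(llvm::MCContext &Ctx,
                    std::unique_ptr<llvm::MCAsmBackend> MAB,
                    std::unique_ptr<llvm::MCCodeEmitter> MCE,
                    std::unique_ptr<llvm::MCObjectWriter> MOW) {
  llvm::MCAssembler Asm(Ctx, std::move(MAB), std::move(MCE), std::move(MOW));
  llvm::MCAsmBackend &Backend = Asm.getBackend(); // dereferences the owned ptr
  (void)Backend;
}
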
diff --git a/contrib/llvm/include/llvm/MC/MCCodePadder.h b/contrib/llvm/include/llvm/MC/MCCodePadder.h
index 1e91198597c3..4dde6bf59272 100644
--- a/contrib/llvm/include/llvm/MC/MCCodePadder.h
+++ b/contrib/llvm/include/llvm/MC/MCCodePadder.h
@@ -28,7 +28,6 @@ typedef SmallVector<const MCPaddingFragment *, 8> MCPFRange;
struct MCCodePaddingContext {
bool IsPaddingActive;
- bool IsBasicBlockInsideInnermostLoop;
bool IsBasicBlockReachableViaFallthrough;
bool IsBasicBlockReachableViaBranch;
};
@@ -119,7 +118,7 @@ public:
/// \param Fragment The fragment to relax.
/// \param Layout Code layout information.
///
- /// \returns true iff any relaxation occured.
+ /// \returns true iff any relaxation occurred.
bool relaxFragment(MCPaddingFragment *Fragment, MCAsmLayout &Layout);
};
diff --git a/contrib/llvm/include/llvm/MC/MCCodeView.h b/contrib/llvm/include/llvm/MC/MCCodeView.h
index c8f14515ed34..1d9e3c6698cf 100644
--- a/contrib/llvm/include/llvm/MC/MCCodeView.h
+++ b/contrib/llvm/include/llvm/MC/MCCodeView.h
@@ -27,7 +27,7 @@ class MCObjectStreamer;
class MCStreamer;
class CodeViewContext;
-/// \brief Instances of this class represent the information from a
+/// Instances of this class represent the information from a
/// .cv_loc directive.
class MCCVLoc {
uint32_t FunctionId;
@@ -50,13 +50,13 @@ private: // CodeViewContext manages these
public:
unsigned getFunctionId() const { return FunctionId; }
- /// \brief Get the FileNum of this MCCVLoc.
+ /// Get the FileNum of this MCCVLoc.
unsigned getFileNum() const { return FileNum; }
- /// \brief Get the Line of this MCCVLoc.
+ /// Get the Line of this MCCVLoc.
unsigned getLine() const { return Line; }
- /// \brief Get the Column of this MCCVLoc.
+ /// Get the Column of this MCCVLoc.
unsigned getColumn() const { return Column; }
bool isPrologueEnd() const { return PrologueEnd; }
@@ -64,13 +64,13 @@ public:
void setFunctionId(unsigned FID) { FunctionId = FID; }
- /// \brief Set the FileNum of this MCCVLoc.
+ /// Set the FileNum of this MCCVLoc.
void setFileNum(unsigned fileNum) { FileNum = fileNum; }
- /// \brief Set the Line of this MCCVLoc.
+ /// Set the Line of this MCCVLoc.
void setLine(unsigned line) { Line = line; }
- /// \brief Set the Column of this MCCVLoc.
+ /// Set the Column of this MCCVLoc.
void setColumn(unsigned column) {
assert(column <= UINT16_MAX);
Column = column;
@@ -80,7 +80,7 @@ public:
void setIsStmt(bool IS) { IsStmt = IS; }
};
-/// \brief Instances of this class represent the line information for
+/// Instances of this class represent the line information for
/// the CodeView line table entries. Which is created after a machine
/// instruction is assembled and uses an address from a temporary label
/// created at the current address in the current section and the info from
@@ -201,7 +201,7 @@ public:
bool isValidCVFileNumber(unsigned FileNumber);
- /// \brief Add a line entry.
+ /// Add a line entry.
void addLineEntry(const MCCVLineEntry &LineEntry);
std::vector<MCCVLineEntry> getFunctionLineEntries(unsigned FuncId);
diff --git a/contrib/llvm/include/llvm/MC/MCContext.h b/contrib/llvm/include/llvm/MC/MCContext.h
index 358f67c4db6d..a712e2d95cbc 100644
--- a/contrib/llvm/include/llvm/MC/MCContext.h
+++ b/contrib/llvm/include/llvm/MC/MCContext.h
@@ -11,6 +11,7 @@
#define LLVM_MC_MCCONTEXT_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -24,6 +25,8 @@
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -134,6 +137,9 @@ namespace llvm {
/// The compilation directory to use for DW_AT_comp_dir.
SmallString<128> CompilationDir;
+ /// Prefix replacement map for source file information.
+ std::map<const std::string, const std::string> DebugPrefixMap;
+
/// The main file name if passed in explicitly.
std::string MainFileName;
@@ -269,7 +275,7 @@ namespace llvm {
unsigned UniqueID,
const MCSymbolELF *Associated);
- /// \brief Map of currently defined macros.
+ /// Map of currently defined macros.
StringMap<MCAsmMacro> MacroMap;
public:
@@ -292,6 +298,10 @@ namespace llvm {
CodeViewContext &getCVContext();
+ /// Clear the current cv_loc, if there is one. Avoids lazily creating a
+ /// CodeViewContext if none is needed.
+ void clearCVLocSeen();
+
void setAllowTemporaryLabels(bool Value) { AllowTemporaryLabels = Value; }
void setUseNamesOnTempLabels(bool Value) { UseNamesOnTempLabels = Value; }
@@ -335,7 +345,7 @@ namespace llvm {
/// Gets a symbol that will be defined to the final stack offset of a local
/// variable after codegen.
///
- /// \param Idx - The index of a local variable passed to @llvm.localescape.
+ /// \param Idx - The index of a local variable passed to \@llvm.localescape.
MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName, unsigned Idx);
MCSymbol *getOrCreateParentFrameOffsetSymbol(StringRef FuncName);
@@ -475,25 +485,39 @@ namespace llvm {
/// \name Dwarf Management
/// @{
- /// \brief Get the compilation directory for DW_AT_comp_dir
+ /// Get the compilation directory for DW_AT_comp_dir
/// The compilation directory should be set with \c setCompilationDir before
/// calling this function. If it is unset, an empty string will be returned.
StringRef getCompilationDir() const { return CompilationDir; }
- /// \brief Set the compilation directory for DW_AT_comp_dir
+ /// Set the compilation directory for DW_AT_comp_dir
void setCompilationDir(StringRef S) { CompilationDir = S.str(); }
- /// \brief Get the main file name for use in error messages and debug
+ /// Get the debug prefix map.
+ const std::map<const std::string, const std::string> &
+ getDebugPrefixMap() const {
+ return DebugPrefixMap;
+ }
+
+ /// Add an entry to the debug prefix map.
+ void addDebugPrefixMapEntry(const std::string &From, const std::string &To);
+
+ // Remaps all debug directory paths in-place as per the debug prefix map.
+ void RemapDebugPaths();
+
+ /// Get the main file name for use in error messages and debug
/// info. This can be set to ensure we've got the correct file name
/// after preprocessing or for -save-temps.
const std::string &getMainFileName() const { return MainFileName; }
- /// \brief Set the main file name and override the default.
+ /// Set the main file name and override the default.
void setMainFileName(StringRef S) { MainFileName = S; }
/// Creates an entry in the dwarf file and directory tables.
- unsigned getDwarfFile(StringRef Directory, StringRef FileName,
- unsigned FileNumber, unsigned CUID);
+ Expected<unsigned> getDwarfFile(StringRef Directory, StringRef FileName,
+ unsigned FileNumber,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source, unsigned CUID);
bool isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID = 0);
@@ -532,8 +556,18 @@ namespace llvm {
DwarfCompileUnitID = CUIndex;
}
- void setMCLineTableCompilationDir(unsigned CUID, StringRef CompilationDir) {
- getMCDwarfLineTable(CUID).setCompilationDir(CompilationDir);
+ /// Specifies the "root" file and directory of the compilation unit.
+ /// These are "file 0" and "directory 0" in DWARF v5.
+ void setMCLineTableRootFile(unsigned CUID, StringRef CompilationDir,
+ StringRef Filename, MD5::MD5Result *Checksum,
+ Optional<StringRef> Source) {
+ getMCDwarfLineTable(CUID).setRootFile(CompilationDir, Filename, Checksum,
+ Source);
+ }
+
+ /// Reports whether MD5 checksum usage is consistent (all-or-none).
+ bool isDwarfMD5UsageConsistent(unsigned CUID) const {
+ return getMCDwarfLineTable(CUID).isMD5UsageConsistent();
}
/// Saves the information from the currently parsed dwarf .loc directive
@@ -639,7 +673,7 @@ namespace llvm {
// operator new and delete aren't allowed inside namespaces.
// The throw specifications are mandated by the standard.
-/// \brief Placement new for using the MCContext's allocator.
+/// Placement new for using the MCContext's allocator.
///
/// This placement form of operator new uses the MCContext's allocator for
/// obtaining memory. It is a non-throwing new, which means that it returns
@@ -665,7 +699,7 @@ inline void *operator new(size_t Bytes, llvm::MCContext &C,
size_t Alignment = 8) noexcept {
return C.allocate(Bytes, Alignment);
}
-/// \brief Placement delete companion to the new above.
+/// Placement delete companion to the new above.
///
/// This operator is just a companion to the new above. There is no way of
/// invoking it directly; see the new operator for more details. This operator
@@ -699,7 +733,7 @@ inline void *operator new[](size_t Bytes, llvm::MCContext &C,
return C.allocate(Bytes, Alignment);
}
-/// \brief Placement delete[] companion to the new[] above.
+/// Placement delete[] companion to the new[] above.
///
/// This operator is just a companion to the new[] above. There is no way of
/// invoking it directly; see the new[] operator for more details. This operator
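The new MCContext hooks addDebugPrefixMapEntry/RemapDebugPaths rewrite debug directory paths according to from/to prefix pairs. A hedged sketch of the remapping rule on plain std::string data; the real implementation lives in MCContext.cpp and may differ in details such as ordering and separator handling:

#include <iostream>
#include <map>
#include <string>

// Replace the first matching prefix; later entries do not stack.
static std::string remapPath(const std::string &Path,
                             const std::map<std::string, std::string> &Map) {
  for (const auto &Entry : Map)
    if (Path.compare(0, Entry.first.size(), Entry.first) == 0)
      return Entry.second + Path.substr(Entry.first.size());
  return Path;
}

int main() {
  // Hypothetical mapping, as set up by -fdebug-prefix-map style options.
  std::map<std::string, std::string> PrefixMap = {
      {"/home/build/src", "/usr/src"}};
  std::cout << remapPath("/home/build/src/lib/foo.c", PrefixMap) << "\n";
  // prints /usr/src/lib/foo.c
}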
diff --git a/contrib/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h b/contrib/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h
index bd3e5d4638e5..df909a0dccd3 100644
--- a/contrib/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h
+++ b/contrib/llvm/include/llvm/MC/MCDisassembler/MCExternalSymbolizer.h
@@ -22,7 +22,7 @@
namespace llvm {
-/// \brief Symbolize using user-provided, C API, callbacks.
+/// Symbolize using user-provided, C API, callbacks.
///
/// See llvm-c/Disassembler.h.
class MCExternalSymbolizer : public MCSymbolizer {
diff --git a/contrib/llvm/include/llvm/MC/MCDisassembler/MCRelocationInfo.h b/contrib/llvm/include/llvm/MC/MCDisassembler/MCRelocationInfo.h
index 7836e886c303..6030ae660d38 100644
--- a/contrib/llvm/include/llvm/MC/MCDisassembler/MCRelocationInfo.h
+++ b/contrib/llvm/include/llvm/MC/MCDisassembler/MCRelocationInfo.h
@@ -21,7 +21,7 @@ namespace llvm {
class MCContext;
class MCExpr;
-/// \brief Create MCExprs from relocations found in an object file.
+/// Create MCExprs from relocations found in an object file.
class MCRelocationInfo {
protected:
MCContext &Ctx;
@@ -32,7 +32,7 @@ public:
MCRelocationInfo &operator=(const MCRelocationInfo &) = delete;
virtual ~MCRelocationInfo();
- /// \brief Create an MCExpr for the target-specific \p VariantKind.
+ /// Create an MCExpr for the target-specific \p VariantKind.
/// The VariantKinds are defined in llvm-c/Disassembler.h.
/// Used by MCExternalSymbolizer.
/// \returns If possible, an MCExpr corresponding to VariantKind, else 0.
diff --git a/contrib/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h b/contrib/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h
index d85cf5e066f5..0bfa569474ec 100644
--- a/contrib/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h
+++ b/contrib/llvm/include/llvm/MC/MCDisassembler/MCSymbolizer.h
@@ -27,7 +27,7 @@ class MCContext;
class MCInst;
class raw_ostream;
-/// \brief Symbolize and annotate disassembled instructions.
+/// Symbolize and annotate disassembled instructions.
///
/// For now this mimics the old symbolization logic (from both ARM and x86), that
/// relied on user-provided (C API) callbacks to do the actual symbol lookup in
@@ -42,7 +42,7 @@ protected:
std::unique_ptr<MCRelocationInfo> RelInfo;
public:
- /// \brief Construct an MCSymbolizer, taking ownership of \p RelInfo.
+ /// Construct an MCSymbolizer, taking ownership of \p RelInfo.
MCSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> RelInfo)
: Ctx(Ctx), RelInfo(std::move(RelInfo)) {
}
@@ -51,7 +51,7 @@ public:
MCSymbolizer &operator=(const MCSymbolizer &) = delete;
virtual ~MCSymbolizer();
- /// \brief Try to add a symbolic operand instead of \p Value to the MCInst.
+ /// Try to add a symbolic operand instead of \p Value to the MCInst.
///
/// Instead of having a difficult to read immediate, a symbolic operand would
/// represent this immediate in a more understandable way, for instance as a
@@ -70,7 +70,7 @@ public:
bool IsBranch, uint64_t Offset,
uint64_t InstSize) = 0;
- /// \brief Try to add a comment on the PC-relative load.
+ /// Try to add a comment on the PC-relative load.
/// For instance, in Mach-O, this is used to add annotations to instructions
/// that use C string literals, as found in __cstring.
virtual void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
diff --git a/contrib/llvm/include/llvm/MC/MCDwarf.h b/contrib/llvm/include/llvm/MC/MCDwarf.h
index 88ffa04128e6..785f42d2f9d7 100644
--- a/contrib/llvm/include/llvm/MC/MCDwarf.h
+++ b/contrib/llvm/include/llvm/MC/MCDwarf.h
@@ -16,10 +16,13 @@
#define LLVM_MC_MCDWARF_H
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MD5.h"
#include <cassert>
#include <cstdint>
#include <string>
@@ -31,6 +34,7 @@ namespace llvm {
template <typename T> class ArrayRef;
class MCAsmBackend;
class MCContext;
+class MCDwarfLineStr;
class MCObjectStreamer;
class MCStreamer;
class MCSymbol;
@@ -38,21 +42,28 @@ class raw_ostream;
class SMLoc;
class SourceMgr;
-/// \brief Instances of this class represent the name of the dwarf
+/// Instances of this class represent the name of the dwarf
/// .file directive and its associated dwarf file number in the MC file,
/// and MCDwarfFile's are created and uniqued by the MCContext class where
/// the file number for each is its index into the vector of DwarfFiles (note
/// index 0 is not used and not a valid dwarf file number).
struct MCDwarfFile {
- // \brief The base name of the file without its directory path.
- // The StringRef references memory allocated in the MCContext.
+ // The base name of the file without its directory path.
std::string Name;
- // \brief The index into the list of directory names for this file name.
+ // The index into the list of directory names for this file name.
unsigned DirIndex;
+
+ /// The MD5 checksum, if there is one. Non-owning pointer to data allocated
+ /// in MCContext.
+ MD5::MD5Result *Checksum = nullptr;
+
+ /// The source code of the file. Non-owning reference to data allocated in
+ /// MCContext.
+ Optional<StringRef> Source;
};
-/// \brief Instances of this class represent the information from a
+/// Instances of this class represent the information from a
/// dwarf .loc directive.
class MCDwarfLoc {
uint32_t FileNum;
@@ -84,55 +95,55 @@ private: // MCContext manages these
// for an MCDwarfLoc object.
public:
- /// \brief Get the FileNum of this MCDwarfLoc.
+ /// Get the FileNum of this MCDwarfLoc.
unsigned getFileNum() const { return FileNum; }
- /// \brief Get the Line of this MCDwarfLoc.
+ /// Get the Line of this MCDwarfLoc.
unsigned getLine() const { return Line; }
- /// \brief Get the Column of this MCDwarfLoc.
+ /// Get the Column of this MCDwarfLoc.
unsigned getColumn() const { return Column; }
- /// \brief Get the Flags of this MCDwarfLoc.
+ /// Get the Flags of this MCDwarfLoc.
unsigned getFlags() const { return Flags; }
- /// \brief Get the Isa of this MCDwarfLoc.
+ /// Get the Isa of this MCDwarfLoc.
unsigned getIsa() const { return Isa; }
- /// \brief Get the Discriminator of this MCDwarfLoc.
+ /// Get the Discriminator of this MCDwarfLoc.
unsigned getDiscriminator() const { return Discriminator; }
- /// \brief Set the FileNum of this MCDwarfLoc.
+ /// Set the FileNum of this MCDwarfLoc.
void setFileNum(unsigned fileNum) { FileNum = fileNum; }
- /// \brief Set the Line of this MCDwarfLoc.
+ /// Set the Line of this MCDwarfLoc.
void setLine(unsigned line) { Line = line; }
- /// \brief Set the Column of this MCDwarfLoc.
+ /// Set the Column of this MCDwarfLoc.
void setColumn(unsigned column) {
assert(column <= UINT16_MAX);
Column = column;
}
- /// \brief Set the Flags of this MCDwarfLoc.
+ /// Set the Flags of this MCDwarfLoc.
void setFlags(unsigned flags) {
assert(flags <= UINT8_MAX);
Flags = flags;
}
- /// \brief Set the Isa of this MCDwarfLoc.
+ /// Set the Isa of this MCDwarfLoc.
void setIsa(unsigned isa) {
assert(isa <= UINT8_MAX);
Isa = isa;
}
- /// \brief Set the Discriminator of this MCDwarfLoc.
+ /// Set the Discriminator of this MCDwarfLoc.
void setDiscriminator(unsigned discriminator) {
Discriminator = discriminator;
}
};
-/// \brief Instances of this class represent the line information for
+/// Instances of this class represent the line information for
/// the dwarf line table entries. Which is created after a machine
/// instruction is assembled and uses an address from a temporary label
/// created at the current address in the current section and the info from
@@ -157,13 +168,13 @@ public:
static void Make(MCObjectStreamer *MCOS, MCSection *Section);
};
-/// \brief Instances of this class represent the line information for a compile
+/// Instances of this class represent the line information for a compile
/// unit where machine instructions have been assembled after seeing .loc
/// directives. This is the information used to build the dwarf line
/// table for a section.
class MCLineSection {
public:
- // \brief Add an entry to this MCLineSection's line entries.
+ // Add an entry to this MCLineSection's line entries.
void addLineEntry(const MCDwarfLineEntry &LineEntry, MCSection *Sec) {
MCLineDivisions[Sec].push_back(LineEntry);
}
@@ -202,32 +213,69 @@ struct MCDwarfLineTableHeader {
SmallVector<std::string, 3> MCDwarfDirs;
SmallVector<MCDwarfFile, 3> MCDwarfFiles;
StringMap<unsigned> SourceIdMap;
- StringRef CompilationDir;
+ std::string CompilationDir;
+ MCDwarfFile RootFile;
+ bool HasSource = false;
+private:
+ bool HasAllMD5 = true;
+ bool HasAnyMD5 = false;
+public:
MCDwarfLineTableHeader() = default;
- unsigned getFile(StringRef &Directory, StringRef &FileName,
- unsigned FileNumber = 0);
- std::pair<MCSymbol *, MCSymbol *> Emit(MCStreamer *MCOS,
- MCDwarfLineTableParams Params) const;
+ Expected<unsigned> tryGetFile(StringRef &Directory, StringRef &FileName,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> &Source,
+ unsigned FileNumber = 0);
+ std::pair<MCSymbol *, MCSymbol *>
+ Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
+ Optional<MCDwarfLineStr> &LineStr) const;
std::pair<MCSymbol *, MCSymbol *>
Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
- ArrayRef<char> SpecialOpcodeLengths) const;
+ ArrayRef<char> SpecialOpcodeLengths,
+ Optional<MCDwarfLineStr> &LineStr) const;
+ void resetMD5Usage() {
+ HasAllMD5 = true;
+ HasAnyMD5 = false;
+ }
+ void trackMD5Usage(bool MD5Used) {
+ HasAllMD5 &= MD5Used;
+ HasAnyMD5 |= MD5Used;
+ }
+ bool isMD5UsageConsistent() const {
+ return MCDwarfFiles.empty() || (HasAllMD5 == HasAnyMD5);
+ }
+
+private:
+ void emitV2FileDirTables(MCStreamer *MCOS) const;
+ void emitV5FileDirTables(MCStreamer *MCOS, Optional<MCDwarfLineStr> &LineStr,
+ StringRef CtxCompilationDir) const;
};
class MCDwarfDwoLineTable {
MCDwarfLineTableHeader Header;
public:
- void setCompilationDir(StringRef CompilationDir) {
- Header.CompilationDir = CompilationDir;
- }
-
- unsigned getFile(StringRef Directory, StringRef FileName) {
- return Header.getFile(Directory, FileName);
- }
-
- void Emit(MCStreamer &MCOS, MCDwarfLineTableParams Params) const;
+ void maybeSetRootFile(StringRef Directory, StringRef FileName,
+ MD5::MD5Result *Checksum, Optional<StringRef> Source) {
+ if (!Header.RootFile.Name.empty())
+ return;
+ Header.CompilationDir = Directory;
+ Header.RootFile.Name = FileName;
+ Header.RootFile.DirIndex = 0;
+ Header.RootFile.Checksum = Checksum;
+ Header.RootFile.Source = Source;
+ Header.trackMD5Usage(Checksum);
+ Header.HasSource = Source.hasValue();
+ }
+
+ unsigned getFile(StringRef Directory, StringRef FileName,
+ MD5::MD5Result *Checksum, Optional<StringRef> Source) {
+ return cantFail(Header.tryGetFile(Directory, FileName, Checksum, Source));
+ }
+
+ void Emit(MCStreamer &MCOS, MCDwarfLineTableParams Params,
+ MCSection *Section) const;
};
class MCDwarfLineTable {
@@ -239,10 +287,42 @@ public:
static void Emit(MCObjectStreamer *MCOS, MCDwarfLineTableParams Params);
// This emits the Dwarf file and the line tables for a given Compile Unit.
- void EmitCU(MCObjectStreamer *MCOS, MCDwarfLineTableParams Params) const;
+ void EmitCU(MCObjectStreamer *MCOS, MCDwarfLineTableParams Params,
+ Optional<MCDwarfLineStr> &LineStr) const;
+ Expected<unsigned> tryGetFile(StringRef &Directory, StringRef &FileName,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned FileNumber = 0);
unsigned getFile(StringRef &Directory, StringRef &FileName,
- unsigned FileNumber = 0);
+ MD5::MD5Result *Checksum, Optional<StringRef> &Source,
+ unsigned FileNumber = 0) {
+ return cantFail(tryGetFile(Directory, FileName, Checksum, Source,
+ FileNumber));
+ }
+
+ void setRootFile(StringRef Directory, StringRef FileName,
+ MD5::MD5Result *Checksum, Optional<StringRef> Source) {
+ Header.CompilationDir = Directory;
+ Header.RootFile.Name = FileName;
+ Header.RootFile.DirIndex = 0;
+ Header.RootFile.Checksum = Checksum;
+ Header.RootFile.Source = Source;
+ Header.trackMD5Usage(Checksum);
+ Header.HasSource = Source.hasValue();
+ }
+
+ void resetRootFile() {
+ assert(Header.MCDwarfFiles.empty());
+ Header.RootFile.Name.clear();
+ Header.resetMD5Usage();
+ Header.HasSource = false;
+ }
+
+ bool hasRootFile() const { return !Header.RootFile.Name.empty(); }
+
+ // Report whether MD5 usage has been consistent (all-or-none).
+ bool isMD5UsageConsistent() const { return Header.isMD5UsageConsistent(); }
MCSymbol *getLabel() const {
return Header.Label;
@@ -252,10 +332,6 @@ public:
Header.Label = Label;
}
- void setCompilationDir(StringRef CompilationDir) {
- Header.CompilationDir = CompilationDir;
- }
-
const SmallVectorImpl<std::string> &getMCDwarfDirs() const {
return Header.MCDwarfDirs;
}
@@ -372,41 +448,41 @@ private:
}
public:
- /// \brief .cfi_def_cfa defines a rule for computing CFA as: take address from
+ /// .cfi_def_cfa defines a rule for computing CFA as: take address from
/// Register and add Offset to it.
static MCCFIInstruction createDefCfa(MCSymbol *L, unsigned Register,
int Offset) {
return MCCFIInstruction(OpDefCfa, L, Register, -Offset, "");
}
- /// \brief .cfi_def_cfa_register modifies a rule for computing CFA. From now
+ /// .cfi_def_cfa_register modifies a rule for computing CFA. From now
/// on Register will be used instead of the old one. Offset remains the same.
static MCCFIInstruction createDefCfaRegister(MCSymbol *L, unsigned Register) {
return MCCFIInstruction(OpDefCfaRegister, L, Register, 0, "");
}
- /// \brief .cfi_def_cfa_offset modifies a rule for computing CFA. Register
+ /// .cfi_def_cfa_offset modifies a rule for computing CFA. Register
/// remains the same, but offset is new. Note that it is the absolute offset
/// that will be added to a defined register to the compute CFA address.
static MCCFIInstruction createDefCfaOffset(MCSymbol *L, int Offset) {
return MCCFIInstruction(OpDefCfaOffset, L, 0, -Offset, "");
}
- /// \brief .cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but
+ /// .cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but
/// Offset is a relative value that is added/subtracted from the previous
/// offset.
static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int Adjustment) {
return MCCFIInstruction(OpAdjustCfaOffset, L, 0, Adjustment, "");
}
- /// \brief .cfi_offset Previous value of Register is saved at offset Offset
+ /// .cfi_offset Previous value of Register is saved at offset Offset
/// from CFA.
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register,
int Offset) {
return MCCFIInstruction(OpOffset, L, Register, Offset, "");
}
- /// \brief .cfi_rel_offset Previous value of Register is saved at offset
+ /// .cfi_rel_offset Previous value of Register is saved at offset
/// Offset from the current CFA register. This is transformed to .cfi_offset
/// using the known displacement of the CFA register from the CFA.
static MCCFIInstruction createRelOffset(MCSymbol *L, unsigned Register,
@@ -414,54 +490,54 @@ public:
return MCCFIInstruction(OpRelOffset, L, Register, Offset, "");
}
- /// \brief .cfi_register Previous value of Register1 is saved in
+ /// .cfi_register Previous value of Register1 is saved in
/// register Register2.
static MCCFIInstruction createRegister(MCSymbol *L, unsigned Register1,
unsigned Register2) {
return MCCFIInstruction(OpRegister, L, Register1, Register2);
}
- /// \brief .cfi_window_save SPARC register window is saved.
+ /// .cfi_window_save SPARC register window is saved.
static MCCFIInstruction createWindowSave(MCSymbol *L) {
return MCCFIInstruction(OpWindowSave, L, 0, 0, "");
}
- /// \brief .cfi_restore says that the rule for Register is now the same as it
+ /// .cfi_restore says that the rule for Register is now the same as it
/// was at the beginning of the function, after all initial instructions added
/// by .cfi_startproc were executed.
static MCCFIInstruction createRestore(MCSymbol *L, unsigned Register) {
return MCCFIInstruction(OpRestore, L, Register, 0, "");
}
- /// \brief .cfi_undefined From now on the previous value of Register can't be
+ /// .cfi_undefined From now on the previous value of Register can't be
/// restored anymore.
static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register) {
return MCCFIInstruction(OpUndefined, L, Register, 0, "");
}
- /// \brief .cfi_same_value Current value of Register is the same as in the
+ /// .cfi_same_value Current value of Register is the same as in the
/// previous frame. I.e., no restoration is needed.
static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register) {
return MCCFIInstruction(OpSameValue, L, Register, 0, "");
}
- /// \brief .cfi_remember_state Save all current rules for all registers.
+ /// .cfi_remember_state Save all current rules for all registers.
static MCCFIInstruction createRememberState(MCSymbol *L) {
return MCCFIInstruction(OpRememberState, L, 0, 0, "");
}
- /// \brief .cfi_restore_state Restore the previously saved state.
+ /// .cfi_restore_state Restore the previously saved state.
static MCCFIInstruction createRestoreState(MCSymbol *L) {
return MCCFIInstruction(OpRestoreState, L, 0, 0, "");
}
- /// \brief .cfi_escape Allows the user to add arbitrary bytes to the unwind
+ /// .cfi_escape Allows the user to add arbitrary bytes to the unwind
/// info.
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals) {
return MCCFIInstruction(OpEscape, L, 0, 0, Vals);
}
- /// \brief A special wrapper for .cfi_escape that indicates GNU_ARGS_SIZE
+ /// A special wrapper for .cfi_escape that indicates GNU_ARGS_SIZE
static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int Size) {
return MCCFIInstruction(OpGnuArgsSize, L, 0, Size, "");
}
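DWARF v5 expects the file table to carry MD5 checksums for either all files or none, which is what the new trackMD5Usage/isMD5UsageConsistent bookkeeping above enforces (an empty file table is also treated as consistent). A standalone sketch of the same all-or-none logic outside the MCDwarfLineTableHeader class:

#include <cassert>

struct MD5UsageTracker {
  bool HasAllMD5 = true;
  bool HasAnyMD5 = false;

  void track(bool MD5Used) {
    HasAllMD5 &= MD5Used; // stays true only if every file had a checksum
    HasAnyMD5 |= MD5Used; // becomes true once any file had one
  }
  bool isConsistent() const { return HasAllMD5 == HasAnyMD5; }
};

int main() {
  MD5UsageTracker T;
  T.track(true);
  T.track(true);
  assert(T.isConsistent());  // all files carried checksums

  T.track(false);
  assert(!T.isConsistent()); // mixed usage is rejected
}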
diff --git a/contrib/llvm/include/llvm/MC/MCELFObjectWriter.h b/contrib/llvm/include/llvm/MC/MCELFObjectWriter.h
index fd8d118ccdc5..bff58fef6af9 100644
--- a/contrib/llvm/include/llvm/MC/MCELFObjectWriter.h
+++ b/contrib/llvm/include/llvm/MC/MCELFObjectWriter.h
@@ -12,6 +12,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
@@ -50,7 +51,7 @@ struct ELFRelocationEntry {
void dump() const { print(errs()); }
};
-class MCELFObjectTargetWriter {
+class MCELFObjectTargetWriter : public MCObjectTargetWriter {
const uint8_t OSABI;
const uint16_t EMachine;
const unsigned HasRelocationAddend : 1;
@@ -63,6 +64,11 @@ protected:
public:
virtual ~MCELFObjectTargetWriter() = default;
+ virtual Triple::ObjectFormatType getFormat() const { return Triple::ELF; }
+ static bool classof(const MCObjectTargetWriter *W) {
+ return W->getFormat() == Triple::ELF;
+ }
+
static uint8_t getOSABI(Triple::OSType OSType) {
switch (OSType) {
case Triple::CloudABI:
@@ -132,7 +138,7 @@ public:
}
};
-/// \brief Construct a new ELF writer instance.
+/// Construct a new ELF writer instance.
///
/// \param MOTW - The target specific ELF writer subclass.
/// \param OS - The stream to write to.
@@ -141,6 +147,11 @@ std::unique_ptr<MCObjectWriter>
createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian);
+std::unique_ptr<MCObjectWriter>
+createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+ raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
+ bool IsLittleEndian);
+
} // end namespace llvm
#endif // LLVM_MC_MCELFOBJECTWRITER_H
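Deriving MCELFObjectTargetWriter from MCObjectTargetWriter and giving it getFormat()/classof() plugs it into LLVM-style RTTI, so generic code can test the object format before downcasting. A simplified sketch of that pattern in plain C++, with a hand-written helper standing in for llvm::dyn_cast:

#include <iostream>

struct ObjectTargetWriter {
  enum Format { ELF, MachO, COFF };
  virtual ~ObjectTargetWriter() = default;
  virtual Format getFormat() const = 0;
};

struct ELFTargetWriter : ObjectTargetWriter {
  Format getFormat() const override { return ELF; }
  // LLVM-style RTTI hook consulted by isa<>/dyn_cast<>.
  static bool classof(const ObjectTargetWriter *W) {
    return W->getFormat() == ELF;
  }
};

// Tiny stand-in for llvm::dyn_cast built on classof().
template <typename To, typename From> To *dyn_cast_sketch(From *V) {
  return To::classof(V) ? static_cast<To *>(V) : nullptr;
}

int main() {
  ELFTargetWriter W;
  ObjectTargetWriter *Base = &W;
  if (auto *E = dyn_cast_sketch<ELFTargetWriter>(Base))
    std::cout << "ELF writer, format " << E->getFormat() << "\n";
}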
diff --git a/contrib/llvm/include/llvm/MC/MCELFStreamer.h b/contrib/llvm/include/llvm/MC/MCELFStreamer.h
index 2f23cd64ee03..3797079661e4 100644
--- a/contrib/llvm/include/llvm/MC/MCELFStreamer.h
+++ b/contrib/llvm/include/llvm/MC/MCELFStreamer.h
@@ -24,7 +24,8 @@ class MCInst;
class MCELFStreamer : public MCObjectStreamer {
public:
MCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter);
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter);
~MCELFStreamer() override = default;
@@ -58,7 +59,8 @@ public:
unsigned ByteAlignment) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
- uint64_t Size = 0, unsigned ByteAlignment = 0) override;
+ uint64_t Size = 0, unsigned ByteAlignment = 0,
+ SMLoc L = SMLoc()) override;
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
void EmitValueImpl(const MCExpr *Value, unsigned Size,
@@ -68,6 +70,9 @@ public:
void EmitValueToAlignment(unsigned, int64_t, unsigned, unsigned) override;
+ void emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To, uint64_t Count) override;
+
void FinishImpl() override;
void EmitBundleAlignMode(unsigned AlignPow2) override;
@@ -80,8 +85,10 @@ private:
void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &) override;
void fixSymbolsInTLSFixups(const MCExpr *expr);
+ void finalizeCGProfileEntry(const MCSymbolRefExpr *&S);
+ void finalizeCGProfile();
- /// \brief Merge the content of the fragment \p EF into the fragment \p DF.
+ /// Merge the content of the fragment \p EF into the fragment \p DF.
void mergeFragment(MCDataFragment *, MCDataFragment *);
bool SeenIdent = false;
@@ -93,7 +100,7 @@ private:
MCELFStreamer *createARMELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IsThumb);
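emitCGProfileEntry records a weighted caller-to-callee edge; together with the CGProfile vector added to MCAssembler earlier in this diff, these entries are later lowered into a call-graph-profile section. A sketch of accumulating such edges with plain strings standing in for MCSymbolRefExpr (directive name and weights are illustrative):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct CGProfileEntry { // plain-string stand-in for the MC version
  std::string From, To;
  uint64_t Count;
};

int main() {
  std::vector<CGProfileEntry> CGProfile;
  // Roughly what two ".cg_profile caller, callee, count" directives produce.
  CGProfile.push_back({"main", "foo", 90});
  CGProfile.push_back({"main", "bar", 10});
  for (const auto &E : CGProfile)
    std::cout << E.From << " -> " << E.To << " (" << E.Count << ")\n";
}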
diff --git a/contrib/llvm/include/llvm/MC/MCExpr.h b/contrib/llvm/include/llvm/MC/MCExpr.h
index fcbbe650d26f..c4dfbe949078 100644
--- a/contrib/llvm/include/llvm/MC/MCExpr.h
+++ b/contrib/llvm/include/llvm/MC/MCExpr.h
@@ -31,7 +31,7 @@ class StringRef;
using SectionAddrMap = DenseMap<const MCSection *, uint64_t>;
-/// \brief Base class for the full range of assembler expressions which are
+/// Base class for the full range of assembler expressions which are
/// needed for parsing.
class MCExpr {
public:
@@ -85,7 +85,7 @@ public:
/// \name Expression Evaluation
/// @{
- /// \brief Try to evaluate the expression to an absolute value.
+ /// Try to evaluate the expression to an absolute value.
///
/// \param Res - The absolute value, if evaluation succeeds.
/// \param Layout - The assembler layout object to use for evaluating symbol
@@ -96,11 +96,12 @@ public:
const SectionAddrMap &Addrs) const;
bool evaluateAsAbsolute(int64_t &Res) const;
bool evaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const;
+ bool evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm) const;
bool evaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout) const;
bool evaluateKnownAbsolute(int64_t &Res, const MCAsmLayout &Layout) const;
- /// \brief Try to evaluate the expression to a relocatable value, i.e. an
+ /// Try to evaluate the expression to a relocatable value, i.e. an
/// expression of the fixed form (a - b + constant).
///
/// \param Res - The relocatable value, if evaluation succeeds.
@@ -110,14 +111,14 @@ public:
bool evaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout,
const MCFixup *Fixup) const;
- /// \brief Try to evaluate the expression to the form (a - b + constant) where
+ /// Try to evaluate the expression to the form (a - b + constant) where
/// neither a nor b are variables.
///
/// This is a more aggressive variant of evaluateAsRelocatable. The intended
/// use is for when relocations are not available, like the .size directive.
bool evaluateAsValue(MCValue &Res, const MCAsmLayout &Layout) const;
- /// \brief Find the "associated section" for this expression, which is
+ /// Find the "associated section" for this expression, which is
/// currently defined as the absolute section for constants, or
/// otherwise the section associated with the first defined symbol in the
/// expression.
@@ -131,7 +132,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const MCExpr &E) {
return OS;
}
-//// \brief Represent a constant integer expression.
+/// Represent a constant integer expression.
class MCConstantExpr : public MCExpr {
int64_t Value;
@@ -157,7 +158,7 @@ public:
}
};
-/// \brief Represent a reference to a symbol from inside an expression.
+/// Represent a reference to a symbol from inside an expression.
///
/// A symbol reference in an expression may be a use of a label, a use of an
/// assembler variable (defined constant), or constitute an implicit definition
@@ -217,6 +218,8 @@ public:
VK_PPC_LO, // symbol@l
VK_PPC_HI, // symbol@h
VK_PPC_HA, // symbol@ha
+ VK_PPC_HIGH, // symbol@high
+ VK_PPC_HIGHA, // symbol@higha
VK_PPC_HIGHER, // symbol@higher
VK_PPC_HIGHERA, // symbol@highera
VK_PPC_HIGHEST, // symbol@highest
@@ -233,6 +236,8 @@ public:
VK_PPC_TPREL_LO, // symbol@tprel@l
VK_PPC_TPREL_HI, // symbol@tprel@h
VK_PPC_TPREL_HA, // symbol@tprel@ha
+ VK_PPC_TPREL_HIGH, // symbol@tprel@high
+ VK_PPC_TPREL_HIGHA, // symbol@tprel@higha
VK_PPC_TPREL_HIGHER, // symbol@tprel@higher
VK_PPC_TPREL_HIGHERA, // symbol@tprel@highera
VK_PPC_TPREL_HIGHEST, // symbol@tprel@highest
@@ -240,6 +245,8 @@ public:
VK_PPC_DTPREL_LO, // symbol@dtprel@l
VK_PPC_DTPREL_HI, // symbol@dtprel@h
VK_PPC_DTPREL_HA, // symbol@dtprel@ha
+ VK_PPC_DTPREL_HIGH, // symbol@dtprel@high
+ VK_PPC_DTPREL_HIGHA, // symbol@dtprel@higha
VK_PPC_DTPREL_HIGHER, // symbol@dtprel@higher
VK_PPC_DTPREL_HIGHERA, // symbol@dtprel@highera
VK_PPC_DTPREL_HIGHEST, // symbol@dtprel@highest
@@ -285,6 +292,7 @@ public:
VK_AMDGPU_GOTPCREL32_HI, // symbol@gotpcrel32@hi
VK_AMDGPU_REL32_LO, // symbol@rel32@lo
VK_AMDGPU_REL32_HI, // symbol@rel32@hi
+ VK_AMDGPU_REL64, // symbol@rel64
VK_TPREL,
VK_DTPREL
@@ -346,7 +354,7 @@ public:
}
};
-/// \brief Unary assembler expressions.
+/// Unary assembler expressions.
class MCUnaryExpr : public MCExpr {
public:
enum Opcode {
@@ -390,10 +398,10 @@ public:
/// \name Accessors
/// @{
- /// \brief Get the kind of this unary expression.
+ /// Get the kind of this unary expression.
Opcode getOpcode() const { return Op; }
- /// \brief Get the child of this unary expression.
+ /// Get the child of this unary expression.
const MCExpr *getSubExpr() const { return Expr; }
/// @}
@@ -403,7 +411,7 @@ public:
}
};
-/// \brief Binary assembler expressions.
+/// Binary assembler expressions.
class MCBinaryExpr : public MCExpr {
public:
enum Opcode {
@@ -547,13 +555,13 @@ public:
/// \name Accessors
/// @{
- /// \brief Get the kind of this binary expression.
+ /// Get the kind of this binary expression.
Opcode getOpcode() const { return Op; }
- /// \brief Get the left-hand side expression of the binary operator.
+ /// Get the left-hand side expression of the binary operator.
const MCExpr *getLHS() const { return LHS; }
- /// \brief Get the right-hand side expression of the binary operator.
+ /// Get the right-hand side expression of the binary operator.
const MCExpr *getRHS() const { return RHS; }
/// @}
@@ -563,7 +571,7 @@ public:
}
};
-/// \brief This is an extension point for target-specific MCExpr subclasses to
+/// This is an extension point for target-specific MCExpr subclasses to
/// implement.
///
/// NOTE: All subclasses are required to have trivial destructors because
@@ -580,6 +588,9 @@ public:
virtual bool evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const = 0;
+ // This should be set when assigned expressions are not valid ".set"
+ // expressions, e.g. registers, and must be inlined.
+ virtual bool inlineAssignedExpr() const { return false; }
virtual void visitUsedExpr(MCStreamer& Streamer) const = 0;
virtual MCFragment *findAssociatedFragment() const = 0;
diff --git a/contrib/llvm/include/llvm/MC/MCFixup.h b/contrib/llvm/include/llvm/MC/MCFixup.h
index b83086c327f2..5f301eafc556 100644
--- a/contrib/llvm/include/llvm/MC/MCFixup.h
+++ b/contrib/llvm/include/llvm/MC/MCFixup.h
@@ -19,7 +19,7 @@
namespace llvm {
class MCExpr;
-/// \brief Extensible enumeration to represent the type of a fixup.
+/// Extensible enumeration to represent the type of a fixup.
enum MCFixupKind {
FK_Data_1 = 0, ///< A one-byte fixup.
FK_Data_2, ///< A two-byte fixup.
@@ -41,6 +41,14 @@ enum MCFixupKind {
FK_SecRel_2, ///< A two-byte section relative fixup.
FK_SecRel_4, ///< A four-byte section relative fixup.
FK_SecRel_8, ///< A eight-byte section relative fixup.
+ FK_Data_Add_1, ///< A one-byte add fixup.
+ FK_Data_Add_2, ///< A two-byte add fixup.
+ FK_Data_Add_4, ///< A four-byte add fixup.
+ FK_Data_Add_8, ///< An eight-byte add fixup.
+ FK_Data_Sub_1, ///< A one-byte sub fixup.
+ FK_Data_Sub_2, ///< A two-byte sub fixup.
+ FK_Data_Sub_4, ///< A four-byte sub fixup.
+ FK_Data_Sub_8, ///< An eight-byte sub fixup.
FirstTargetFixupKind = 128,
@@ -49,7 +57,7 @@ enum MCFixupKind {
MaxTargetFixupKind = (1 << 8)
};
-/// \brief Encode information on a single operation to perform on a byte
+/// Encode information on a single operation to perform on a byte
/// sequence (e.g., an encoded instruction) which requires assemble- or run-
/// time patching.
///
@@ -90,6 +98,28 @@ public:
return FI;
}
+ /// Return a fixup corresponding to the add half of an add/sub fixup pair for
+ /// the given Fixup.
+ static MCFixup createAddFor(const MCFixup &Fixup) {
+ MCFixup FI;
+ FI.Value = Fixup.getValue();
+ FI.Offset = Fixup.getOffset();
+ FI.Kind = (unsigned)getAddKindForKind(Fixup.getKind());
+ FI.Loc = Fixup.getLoc();
+ return FI;
+ }
+
+ /// Return a fixup corresponding to the sub half of an add/sub fixup pair for
+ /// the given Fixup.
+ static MCFixup createSubFor(const MCFixup &Fixup) {
+ MCFixup FI;
+ FI.Value = Fixup.getValue();
+ FI.Offset = Fixup.getOffset();
+ FI.Kind = (unsigned)getSubKindForKind(Fixup.getKind());
+ FI.Loc = Fixup.getLoc();
+ return FI;
+ }
+
MCFixupKind getKind() const { return MCFixupKind(Kind); }
uint32_t getOffset() const { return Offset; }
@@ -97,7 +127,7 @@ public:
const MCExpr *getValue() const { return Value; }
- /// \brief Return the generic fixup kind for a value with the given size. It
+ /// Return the generic fixup kind for a value with the given size. It
/// is an error to pass an unsupported size.
static MCFixupKind getKindForSize(unsigned Size, bool isPCRel) {
switch (Size) {
@@ -109,6 +139,30 @@ public:
}
}
+ /// Return the generic fixup kind for an addition with a given size. It
+ /// is an error to pass an unsupported size.
+ static MCFixupKind getAddKindForKind(unsigned Kind) {
+ switch (Kind) {
+ default: llvm_unreachable("Unknown type to convert!");
+ case FK_Data_1: return FK_Data_Add_1;
+ case FK_Data_2: return FK_Data_Add_2;
+ case FK_Data_4: return FK_Data_Add_4;
+ case FK_Data_8: return FK_Data_Add_8;
+ }
+ }
+
+ /// Return the generic fixup kind for a subtraction with a given size. It
+ /// is an error to pass an unsupported size.
+ static MCFixupKind getSubKindForKind(unsigned Kind) {
+ switch (Kind) {
+ default: llvm_unreachable("Unknown type to convert!");
+ case FK_Data_1: return FK_Data_Sub_1;
+ case FK_Data_2: return FK_Data_Sub_2;
+ case FK_Data_4: return FK_Data_Sub_4;
+ case FK_Data_8: return FK_Data_Sub_8;
+ }
+ }
+
SMLoc getLoc() const { return Loc; }
};
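The new FK_Data_Add_*/FK_Data_Sub_* kinds let a backend split one data fixup into an add/sub pair, e.g. when a difference of two symbols cannot be folded at assembly time and must be expressed as paired relocations. A sketch of the kind mapping from the switch statements above, reduced to a standalone enum with only two sizes:

#include <cassert>

enum FixupKind {
  FK_Data_4,
  FK_Data_8,
  FK_Data_Add_4,
  FK_Data_Add_8,
  FK_Data_Sub_4,
  FK_Data_Sub_8,
};

// Mirror of getAddKindForKind/getSubKindForKind for the two sizes kept here.
static FixupKind addKindFor(FixupKind K) {
  return K == FK_Data_4 ? FK_Data_Add_4 : FK_Data_Add_8;
}
static FixupKind subKindFor(FixupKind K) {
  return K == FK_Data_4 ? FK_Data_Sub_4 : FK_Data_Sub_8;
}

int main() {
  // A 4-byte "A - B" data fixup becomes an add fixup for A and a sub fixup
  // for B at the same offset; the writer applies both halves.
  assert(addKindFor(FK_Data_4) == FK_Data_Add_4);
  assert(subKindFor(FK_Data_4) == FK_Data_Sub_4);
}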
diff --git a/contrib/llvm/include/llvm/MC/MCFixupKindInfo.h b/contrib/llvm/include/llvm/MC/MCFixupKindInfo.h
index 58183bd778e6..483abb39403f 100644
--- a/contrib/llvm/include/llvm/MC/MCFixupKindInfo.h
+++ b/contrib/llvm/include/llvm/MC/MCFixupKindInfo.h
@@ -12,7 +12,7 @@
namespace llvm {
-/// \brief Target independent information on a fixup kind.
+/// Target independent information on a fixup kind.
struct MCFixupKindInfo {
enum FixupKindFlags {
/// Is this fixup kind PCrelative? This is used by the assembler backend to
diff --git a/contrib/llvm/include/llvm/MC/MCFragment.h b/contrib/llvm/include/llvm/MC/MCFragment.h
index 85b55e85469a..47b35175fec8 100644
--- a/contrib/llvm/include/llvm/MC/MCFragment.h
+++ b/contrib/llvm/include/llvm/MC/MCFragment.h
@@ -56,18 +56,13 @@ protected:
bool HasInstructions;
private:
- /// \brief Should this fragment be aligned to the end of a bundle?
- bool AlignToBundleEnd;
-
- uint8_t BundlePadding;
-
/// LayoutOrder - The layout order of this fragment.
unsigned LayoutOrder;
/// The data for the section this fragment is in.
MCSection *Parent;
- /// Atom - The atom this fragment is in, as represented by it's defining
+ /// Atom - The atom this fragment is in, as represented by its defining
/// symbol.
const MCSymbol *Atom;
@@ -84,7 +79,7 @@ private:
protected:
MCFragment(FragmentType Kind, bool HasInstructions,
- uint8_t BundlePadding, MCSection *Parent = nullptr);
+ MCSection *Parent = nullptr);
~MCFragment();
@@ -110,26 +105,11 @@ public:
unsigned getLayoutOrder() const { return LayoutOrder; }
void setLayoutOrder(unsigned Value) { LayoutOrder = Value; }
- /// \brief Does this fragment have instructions emitted into it? By default
+ /// Does this fragment have instructions emitted into it? By default
/// this is false, but specific fragment types may set it to true.
bool hasInstructions() const { return HasInstructions; }
- /// \brief Should this fragment be placed at the end of an aligned bundle?
- bool alignToBundleEnd() const { return AlignToBundleEnd; }
- void setAlignToBundleEnd(bool V) { AlignToBundleEnd = V; }
-
- /// \brief Get the padding size that must be inserted before this fragment.
- /// Used for bundling. By default, no padding is inserted.
- /// Note that padding size is restricted to 8 bits. This is an optimization
- /// to reduce the amount of space used for each fragment. In practice, larger
- /// padding should never be required.
- uint8_t getBundlePadding() const { return BundlePadding; }
-
- /// \brief Set the padding size for this fragment. By default it's a no-op,
- /// and only some fragments have a meaningful implementation.
- void setBundlePadding(uint8_t N) { BundlePadding = N; }
-
- /// \brief Return true if given frgment has FT_Dummy type.
+ /// Return true if the given fragment has FT_Dummy type.
bool isDummy() const { return Kind == FT_Dummy; }
void dump() const;
@@ -137,8 +117,7 @@ public:
class MCDummyFragment : public MCFragment {
public:
- explicit MCDummyFragment(MCSection *Sec)
- : MCFragment(FT_Dummy, false, 0, Sec) {}
+ explicit MCDummyFragment(MCSection *Sec) : MCFragment(FT_Dummy, false, Sec) {}
static bool classof(const MCFragment *F) { return F->getKind() == FT_Dummy; }
};
@@ -147,10 +126,19 @@ public:
/// data.
///
class MCEncodedFragment : public MCFragment {
+ /// Should this fragment be aligned to the end of a bundle?
+ bool AlignToBundleEnd = false;
+
+ uint8_t BundlePadding = 0;
+
protected:
MCEncodedFragment(MCFragment::FragmentType FType, bool HasInstructions,
MCSection *Sec)
- : MCFragment(FType, HasInstructions, 0, Sec) {}
+ : MCFragment(FType, HasInstructions, Sec) {}
+
+ /// STI - The MCSubtargetInfo in effect when the instruction was encoded.
+ /// Must be non-null for instructions.
+ const MCSubtargetInfo *STI = nullptr;
public:
static bool classof(const MCFragment *F) {
@@ -164,6 +152,32 @@ public:
return true;
}
}
+
+ /// Should this fragment be placed at the end of an aligned bundle?
+ bool alignToBundleEnd() const { return AlignToBundleEnd; }
+ void setAlignToBundleEnd(bool V) { AlignToBundleEnd = V; }
+
+ /// Get the padding size that must be inserted before this fragment.
+ /// Used for bundling. By default, no padding is inserted.
+ /// Note that padding size is restricted to 8 bits. This is an optimization
+ /// to reduce the amount of space used for each fragment. In practice, larger
+ /// padding should never be required.
+ uint8_t getBundlePadding() const { return BundlePadding; }
+
+ /// Set the padding size for this fragment. By default it's a no-op,
+ /// and only some fragments have a meaningful implementation.
+ void setBundlePadding(uint8_t N) { BundlePadding = N; }
+
+ /// Retrieve the MCSubtargetInfo in effect when the instruction was encoded.
+ /// Guaranteed to be non-null if hasInstructions() == true.
+ const MCSubtargetInfo *getSubtargetInfo() const { return STI; }
+
+ /// Record that the fragment contains instructions with the MCSubtargetInfo in
+ /// effect when the instruction was encoded.
+ void setHasInstructions(const MCSubtargetInfo &STI) {
+ HasInstructions = true;
+ this->STI = &STI;
+ }
};
/// Interface implemented by fragments that contain encoded instructions and/or
@@ -202,6 +216,7 @@ protected:
Sec) {}
public:
+
using const_fixup_iterator = SmallVectorImpl<MCFixup>::const_iterator;
using fixup_iterator = SmallVectorImpl<MCFixup>::iterator;
@@ -228,8 +243,6 @@ public:
MCDataFragment(MCSection *Sec = nullptr)
: MCEncodedFragmentWithFixups<32, 4>(FT_Data, false, Sec) {}
- void setHasInstructions(bool V) { HasInstructions = V; }
-
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Data;
}
@@ -259,20 +272,15 @@ class MCRelaxableFragment : public MCEncodedFragmentWithFixups<8, 1> {
/// Inst - The instruction this is a fragment for.
MCInst Inst;
- /// STI - The MCSubtargetInfo in effect when the instruction was encoded.
- const MCSubtargetInfo &STI;
-
public:
MCRelaxableFragment(const MCInst &Inst, const MCSubtargetInfo &STI,
MCSection *Sec = nullptr)
: MCEncodedFragmentWithFixups(FT_Relaxable, true, Sec),
- Inst(Inst), STI(STI) {}
+ Inst(Inst) { this->STI = &STI; }
const MCInst &getInst() const { return Inst; }
void setInst(const MCInst &Value) { Inst = Value; }
- const MCSubtargetInfo &getSubtargetInfo() { return STI; }
-
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Relaxable;
}
@@ -300,9 +308,8 @@ class MCAlignFragment : public MCFragment {
public:
MCAlignFragment(unsigned Alignment, int64_t Value, unsigned ValueSize,
unsigned MaxBytesToEmit, MCSection *Sec = nullptr)
- : MCFragment(FT_Align, false, 0, Sec), Alignment(Alignment),
- EmitNops(false), Value(Value),
- ValueSize(ValueSize), MaxBytesToEmit(MaxBytesToEmit) {}
+ : MCFragment(FT_Align, false, Sec), Alignment(Alignment), EmitNops(false),
+ Value(Value), ValueSize(ValueSize), MaxBytesToEmit(MaxBytesToEmit) {}
/// \name Accessors
/// @{
@@ -370,7 +377,7 @@ public:
};
MCPaddingFragment(MCSection *Sec = nullptr)
- : MCFragment(FT_Padding, false, 0, Sec), PaddingPoliciesMask(PFK_None),
+ : MCFragment(FT_Padding, false, Sec), PaddingPoliciesMask(PFK_None),
IsInsertionPoint(false), Size(UINT64_C(0)),
InstInfo({false, MCInst(), false, {0}}) {}
@@ -419,22 +426,23 @@ public:
class MCFillFragment : public MCFragment {
/// Value to use for filling bytes.
- uint8_t Value;
-
+ uint64_t Value;
+ uint8_t ValueSize;
/// The number of bytes to insert.
- const MCExpr &Size;
+ const MCExpr &NumValues;
/// Source location of the directive that this fragment was created for.
SMLoc Loc;
public:
- MCFillFragment(uint8_t Value, const MCExpr &Size, SMLoc Loc,
- MCSection *Sec = nullptr)
- : MCFragment(FT_Fill, false, 0, Sec), Value(Value), Size(Size), Loc(Loc) {
- }
+ MCFillFragment(uint64_t Value, uint8_t VSize, const MCExpr &NumValues,
+ SMLoc Loc, MCSection *Sec = nullptr)
+ : MCFragment(FT_Fill, false, Sec), Value(Value), ValueSize(VSize),
+ NumValues(NumValues), Loc(Loc) {}
- uint8_t getValue() const { return Value; }
- const MCExpr &getSize() const { return Size; }
+ uint64_t getValue() const { return Value; }
+ uint8_t getValueSize() const { return ValueSize; }
+ const MCExpr &getNumValues() const { return NumValues; }
SMLoc getLoc() const { return Loc; }
@@ -444,19 +452,19 @@ public:
};
class MCOrgFragment : public MCFragment {
- /// Offset - The offset this fragment should start at.
+ /// The offset this fragment should start at.
const MCExpr *Offset;
- /// Value - Value to use for filling bytes.
+ /// Value to use for filling bytes.
int8_t Value;
- /// Loc - Source location of the directive that this fragment was created for.
+ /// Source location of the directive that this fragment was created for.
SMLoc Loc;
public:
MCOrgFragment(const MCExpr &Offset, int8_t Value, SMLoc Loc,
MCSection *Sec = nullptr)
- : MCFragment(FT_Org, false, 0, Sec), Offset(&Offset), Value(Value), Loc(Loc) {}
+ : MCFragment(FT_Org, false, Sec), Offset(&Offset), Value(Value), Loc(Loc) {}
/// \name Accessors
/// @{
@@ -485,7 +493,7 @@ class MCLEBFragment : public MCFragment {
public:
MCLEBFragment(const MCExpr &Value_, bool IsSigned_, MCSection *Sec = nullptr)
- : MCFragment(FT_LEB, false, 0, Sec), Value(&Value_), IsSigned(IsSigned_) {
+ : MCFragment(FT_LEB, false, Sec), Value(&Value_), IsSigned(IsSigned_) {
Contents.push_back(0);
}
@@ -520,7 +528,7 @@ class MCDwarfLineAddrFragment : public MCFragment {
public:
MCDwarfLineAddrFragment(int64_t LineDelta, const MCExpr &AddrDelta,
MCSection *Sec = nullptr)
- : MCFragment(FT_Dwarf, false, 0, Sec), LineDelta(LineDelta),
+ : MCFragment(FT_Dwarf, false, Sec), LineDelta(LineDelta),
AddrDelta(&AddrDelta) {
Contents.push_back(0);
}
@@ -551,7 +559,7 @@ class MCDwarfCallFrameFragment : public MCFragment {
public:
MCDwarfCallFrameFragment(const MCExpr &AddrDelta, MCSection *Sec = nullptr)
- : MCFragment(FT_DwarfFrame, false, 0, Sec), AddrDelta(&AddrDelta) {
+ : MCFragment(FT_DwarfFrame, false, Sec), AddrDelta(&AddrDelta) {
Contents.push_back(0);
}
@@ -576,7 +584,7 @@ class MCSymbolIdFragment : public MCFragment {
public:
MCSymbolIdFragment(const MCSymbol *Sym, MCSection *Sec = nullptr)
- : MCFragment(FT_SymbolId, false, 0, Sec), Sym(Sym) {}
+ : MCFragment(FT_SymbolId, false, Sec), Sym(Sym) {}
/// \name Accessors
/// @{
@@ -610,7 +618,7 @@ public:
unsigned StartLineNum, const MCSymbol *FnStartSym,
const MCSymbol *FnEndSym,
MCSection *Sec = nullptr)
- : MCFragment(FT_CVInlineLines, false, 0, Sec), SiteFuncId(SiteFuncId),
+ : MCFragment(FT_CVInlineLines, false, Sec), SiteFuncId(SiteFuncId),
StartFileId(StartFileId), StartLineNum(StartLineNum),
FnStartSym(FnStartSym), FnEndSym(FnEndSym) {}
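MCFillFragment now carries a value of up to eight bytes plus a repeat-count expression instead of a single fill byte, matching the richer .fill directive (repeat, size, value). A sketch of how such a fragment could expand to bytes, assuming little-endian emission (the real writer honors the target's endianness):

#include <cstdint>
#include <iostream>
#include <vector>

// Expand NumValues copies of a ValueSize-byte Value into a byte stream.
static std::vector<uint8_t> expandFill(uint64_t Value, uint8_t ValueSize,
                                       uint64_t NumValues) {
  std::vector<uint8_t> Bytes;
  for (uint64_t I = 0; I != NumValues; ++I)
    for (uint8_t B = 0; B != ValueSize; ++B)
      Bytes.push_back(static_cast<uint8_t>(Value >> (8 * B)));
  return Bytes;
}

int main() {
  // .fill 3, 2, 0xABCD  ->  cd ab cd ab cd ab (little-endian)
  for (uint8_t B : expandFill(0xABCD, 2, 3))
    std::cout << std::hex << +B << ' ';
  std::cout << '\n';
}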
diff --git a/contrib/llvm/include/llvm/MC/MCInst.h b/contrib/llvm/include/llvm/MC/MCInst.h
index db28fd0fd6d9..67bb11a70387 100644
--- a/contrib/llvm/include/llvm/MC/MCInst.h
+++ b/contrib/llvm/include/llvm/MC/MCInst.h
@@ -30,7 +30,7 @@ class MCInst;
class MCInstPrinter;
class raw_ostream;
-/// \brief Instances of this class represent operands of the MCInst class.
+/// Instances of this class represent operands of the MCInst class.
/// This is a simple discriminated union.
class MCOperand {
enum MachineOperandType : unsigned char {
@@ -61,13 +61,13 @@ public:
bool isExpr() const { return Kind == kExpr; }
bool isInst() const { return Kind == kInst; }
- /// \brief Returns the register number.
+ /// Returns the register number.
unsigned getReg() const {
assert(isReg() && "This is not a register operand!");
return RegVal;
}
- /// \brief Set the register number.
+ /// Set the register number.
void setReg(unsigned Reg) {
assert(isReg() && "This is not a register operand!");
RegVal = Reg;
@@ -150,11 +150,13 @@ public:
void print(raw_ostream &OS) const;
void dump() const;
+ bool isBareSymbolRef() const;
+ bool evaluateAsConstantImm(int64_t &Imm) const;
};
template <> struct isPodLike<MCOperand> { static const bool value = true; };
-/// \brief Instances of this class represent a single low-level machine
+/// Instances of this class represent a single low-level machine
/// instruction.
class MCInst {
unsigned Opcode = 0;
@@ -201,7 +203,7 @@ public:
void print(raw_ostream &OS) const;
void dump() const;
- /// \brief Dump the MCInst as prettily as possible using the additional MC
+ /// Dump the MCInst as prettily as possible using the additional MC
/// structures, if given. Operators are separated by the \p Separator
/// string.
void dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer = nullptr,
diff --git a/contrib/llvm/include/llvm/MC/MCInstBuilder.h b/contrib/llvm/include/llvm/MC/MCInstBuilder.h
index 30609bdb8b27..c5c4f481e7df 100644
--- a/contrib/llvm/include/llvm/MC/MCInstBuilder.h
+++ b/contrib/llvm/include/llvm/MC/MCInstBuilder.h
@@ -23,42 +23,42 @@ class MCInstBuilder {
MCInst Inst;
public:
- /// \brief Create a new MCInstBuilder for an MCInst with a specific opcode.
+ /// Create a new MCInstBuilder for an MCInst with a specific opcode.
MCInstBuilder(unsigned Opcode) {
Inst.setOpcode(Opcode);
}
- /// \brief Add a new register operand.
+ /// Add a new register operand.
MCInstBuilder &addReg(unsigned Reg) {
Inst.addOperand(MCOperand::createReg(Reg));
return *this;
}
- /// \brief Add a new integer immediate operand.
+ /// Add a new integer immediate operand.
MCInstBuilder &addImm(int64_t Val) {
Inst.addOperand(MCOperand::createImm(Val));
return *this;
}
- /// \brief Add a new floating point immediate operand.
+ /// Add a new floating point immediate operand.
MCInstBuilder &addFPImm(double Val) {
Inst.addOperand(MCOperand::createFPImm(Val));
return *this;
}
- /// \brief Add a new MCExpr operand.
+ /// Add a new MCExpr operand.
MCInstBuilder &addExpr(const MCExpr *Val) {
Inst.addOperand(MCOperand::createExpr(Val));
return *this;
}
- /// \brief Add a new MCInst operand.
+ /// Add a new MCInst operand.
MCInstBuilder &addInst(const MCInst *Val) {
Inst.addOperand(MCOperand::createInst(Val));
return *this;
}
- /// \brief Add an operand.
+ /// Add an operand.
MCInstBuilder &addOperand(const MCOperand &Op) {
Inst.addOperand(Op);
return *this;
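MCInstBuilder is a small fluent wrapper: each add* call appends an operand and returns *this, so an instruction can be assembled in a single expression. A standalone sketch of the same chaining idiom with placeholder Inst/Operand types rather than the real MC classes (the opcode constant is hypothetical):

#include <cstdint>
#include <iostream>
#include <vector>

struct Operand { int64_t Imm; }; // stand-in for MCOperand
struct Inst {
  unsigned Opcode = 0;
  std::vector<Operand> Operands; // stand-in for MCInst
};

class InstBuilder {
  Inst I;

public:
  explicit InstBuilder(unsigned Opcode) { I.Opcode = Opcode; }
  InstBuilder &addImm(int64_t Val) {
    I.Operands.push_back({Val});
    return *this;                // enables chaining
  }
  operator Inst &() { return I; } // hand the built instruction back
};

int main() {
  const unsigned HYPOTHETICAL_ADDI = 42; // real opcodes come from target enums
  Inst MI = InstBuilder(HYPOTHETICAL_ADDI).addImm(1).addImm(2);
  std::cout << MI.Opcode << " with " << MI.Operands.size() << " operands\n";
}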
diff --git a/contrib/llvm/include/llvm/MC/MCInstPrinter.h b/contrib/llvm/include/llvm/MC/MCInstPrinter.h
index 069403074b31..df221e1db0e7 100644
--- a/contrib/llvm/include/llvm/MC/MCInstPrinter.h
+++ b/contrib/llvm/include/llvm/MC/MCInstPrinter.h
@@ -15,7 +15,6 @@
namespace llvm {
-template <typename T> class ArrayRef;
class MCAsmInfo;
class MCInst;
class MCInstrInfo;
@@ -36,13 +35,13 @@ enum Style {
} // end namespace HexStyle
-/// \brief This is an instance of a target assembly language printer that
+/// This is an instance of a target assembly language printer that
/// converts an MCInst to valid target assembly syntax.
class MCInstPrinter {
protected:
- /// \brief A stream that comments can be emitted to if desired. Each comment
+ /// A stream that comments can be emitted to if desired. Each comment
/// must end with a newline. This will be null if verbose assembly emission
- /// is disable.
+ /// is disabled.
raw_ostream *CommentStream = nullptr;
const MCAsmInfo &MAI;
const MCInstrInfo &MII;
@@ -66,18 +65,18 @@ public:
virtual ~MCInstPrinter();
- /// \brief Specify a stream to emit comments to.
+ /// Specify a stream to emit comments to.
void setCommentStream(raw_ostream &OS) { CommentStream = &OS; }
- /// \brief Print the specified MCInst to the specified raw_ostream.
+ /// Print the specified MCInst to the specified raw_ostream.
virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
const MCSubtargetInfo &STI) = 0;
- /// \brief Return the name of the specified opcode enum (e.g. "MOV32ri") or
+ /// Return the name of the specified opcode enum (e.g. "MOV32ri") or
/// empty if we can't resolve it.
StringRef getOpcodeName(unsigned Opcode) const;
- /// \brief Print the assembler register name.
+ /// Print the assembler register name.
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
bool getUseMarkup() const { return UseMarkup; }
diff --git a/contrib/llvm/include/llvm/MC/MCInstrAnalysis.h b/contrib/llvm/include/llvm/MC/MCInstrAnalysis.h
index dd3e1df477b4..484f03b4d854 100644
--- a/contrib/llvm/include/llvm/MC/MCInstrAnalysis.h
+++ b/contrib/llvm/include/llvm/MC/MCInstrAnalysis.h
@@ -22,6 +22,8 @@
namespace llvm {
+class MCRegisterInfo;
+
class MCInstrAnalysis {
protected:
friend class Target;
@@ -60,7 +62,32 @@ public:
return Info->get(Inst.getOpcode()).isTerminator();
}
- /// \brief Given a branch instruction try to get the address the branch
+ /// Returns true if at least one of the register writes performed by
+ /// \param Inst implicitly clears the upper portion of all super-registers.
+ ///
+ /// Example: on X86-64, a write to EAX implicitly clears the upper half of
+ /// RAX. Also (still on x86) an XMM write performed by an AVX 128-bit
+ /// instruction implicitly clears the upper portion of the corresponding
+ /// YMM register.
+ ///
+ /// This method also updates an APInt which is used as mask of register
+ /// writes. There is one bit for every explicit/implicit write performed by
+ /// the instruction. If a write implicitly clears its super-registers, then
+ /// the corresponding bit is set (otherwise, the corresponding bit is cleared).
+ ///
+ /// The first bits in the APInt are related to explicit writes. The remaining
+ /// bits are related to implicit writes. The sequence of writes follows the
+ /// machine operand sequence. For implicit writes, the sequence is defined by
+ /// the MCInstrDesc.
+ ///
+ /// The assumption is that the bit-width of the APInt is correctly set by
+ /// the caller. The default implementation conservatively assumes that none of
+ /// the writes clears the upper portion of a super-register.
+ virtual bool clearsSuperRegisters(const MCRegisterInfo &MRI,
+ const MCInst &Inst,
+ APInt &Writes) const;
+
+ /// Given a branch instruction try to get the address the branch
/// targets. Return true on success, and the address in Target.
virtual bool
evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
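
A minimal sketch of how a client such as a disassembler might size and query the write mask described above; the sizing of the APInt (explicit defs followed by implicit defs) is an assumption based on the comment, and all objects are presumed to come from the registered target:

#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"

bool writesClearSuperRegs(const llvm::MCInstrAnalysis &MCIA,
                          const llvm::MCRegisterInfo &MRI,
                          const llvm::MCInstrInfo &MCII,
                          const llvm::MCInst &Inst) {
  const llvm::MCInstrDesc &Desc = MCII.get(Inst.getOpcode());
  // One bit per explicit def, then one bit per implicit def (see above).
  unsigned NumWrites = Desc.getNumDefs() + Desc.getNumImplicitDefs();
  llvm::APInt Writes(NumWrites ? NumWrites : 1, 0);
  return MCIA.clearsSuperRegisters(MRI, Inst, Writes);
}
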
diff --git a/contrib/llvm/include/llvm/MC/MCInstrDesc.h b/contrib/llvm/include/llvm/MC/MCInstrDesc.h
index ff4c756a66a1..3e000a2210e9 100644
--- a/contrib/llvm/include/llvm/MC/MCInstrDesc.h
+++ b/contrib/llvm/include/llvm/MC/MCInstrDesc.h
@@ -35,12 +35,12 @@ enum OperandConstraint {
EARLY_CLOBBER // Operand is an early clobber register operand
};
-/// \brief These are flags set on operands, but should be considered
+/// These are flags set on operands, but should be considered
/// private, all access should go through the MCOperandInfo accessors.
/// See the accessors for a description of what these are.
enum OperandFlags { LookupPtrRegClass = 0, Predicate, OptionalDef };
-/// \brief Operands are tagged with one of the values of this enum.
+/// Operands are tagged with one of the values of this enum.
enum OperandType {
OPERAND_UNKNOWN = 0,
OPERAND_IMMEDIATE = 1,
@@ -60,42 +60,39 @@ enum OperandType {
OPERAND_FIRST_TARGET = 12,
};
-enum GenericOperandType {
-};
-
}
-/// \brief This holds information about one operand of a machine instruction,
+/// This holds information about one operand of a machine instruction,
/// indicating the register class for register operands, etc.
class MCOperandInfo {
public:
- /// \brief This specifies the register class enumeration of the operand
+ /// This specifies the register class enumeration of the operand
/// if the operand is a register. If isLookupPtrRegClass is set, then this is
/// an index that is passed to TargetRegisterInfo::getPointerRegClass(x) to
/// get a dynamic register class.
int16_t RegClass;
- /// \brief These are flags from the MCOI::OperandFlags enum.
+ /// These are flags from the MCOI::OperandFlags enum.
uint8_t Flags;
- /// \brief Information about the type of the operand.
+ /// Information about the type of the operand.
uint8_t OperandType;
- /// \brief The lower 16 bits are used to specify which constraints are set.
+ /// The lower 16 bits are used to specify which constraints are set.
/// The higher 16 bits are used to specify the value of constraints (4 bits
/// each).
uint32_t Constraints;
- /// \brief Set if this operand is a pointer value and it requires a callback
+ /// Set if this operand is a pointer value and it requires a callback
/// to look up its register class.
bool isLookupPtrRegClass() const {
return Flags & (1 << MCOI::LookupPtrRegClass);
}
- /// \brief Set if this is one of the operands that made up of the predicate
+ /// Set if this is one of the operands that make up the predicate
/// operand that controls an isPredicable() instruction.
bool isPredicate() const { return Flags & (1 << MCOI::Predicate); }
- /// \brief Set if this operand is a optional def.
+ /// Set if this operand is an optional def.
bool isOptionalDef() const { return Flags & (1 << MCOI::OptionalDef); }
bool isGenericType() const {
@@ -114,7 +111,7 @@ public:
//===----------------------------------------------------------------------===//
namespace MCID {
-/// \brief These should be considered private to the implementation of the
+/// These should be considered private to the implementation of the
/// MCInstrDesc class. Clients should use the predicate methods on MCInstrDesc,
/// not use these directly. These all correspond to bitfields in the
/// MCInstrDesc::Flags field.
@@ -130,6 +127,7 @@ enum Flag {
IndirectBranch,
Compare,
MoveImm,
+ MoveReg,
Bitcast,
Select,
DelaySlot,
@@ -151,11 +149,12 @@ enum Flag {
ExtractSubreg,
InsertSubreg,
Convergent,
- Add
+ Add,
+ Trap
};
}
-/// \brief Describe properties that are true of each instruction in the target
+/// Describe properties that are true of each instruction in the target
/// description file. This captures information about side effects, register
/// use and many other things. There is one instance of this struct for each
/// target instruction class, and the MachineInstr class points to this struct
@@ -177,12 +176,12 @@ public:
// deprecated due to a "complex" reason, below.
int64_t DeprecatedFeature;
- // A complex method to determine is a certain is deprecated or not, and return
- // the reason for deprecation.
+ // A complex method to determine if a certain instruction is deprecated or
+ // not, and return the reason for deprecation.
bool (*ComplexDeprecationInfo)(MCInst &, const MCSubtargetInfo &,
std::string &);
- /// \brief Returns the value of the specific constraint if
+ /// Returns the value of the specific constraint if
/// it is set. Returns -1 if it is not set.
int getOperandConstraint(unsigned OpNum,
MCOI::OperandConstraint Constraint) const {
@@ -194,15 +193,15 @@ public:
return -1;
}
- /// \brief Returns true if a certain instruction is deprecated and if so
+ /// Returns true if a certain instruction is deprecated and if so
/// returns the reason in \p Info.
bool getDeprecatedInfo(MCInst &MI, const MCSubtargetInfo &STI,
std::string &Info) const;
- /// \brief Return the opcode number for this descriptor.
+ /// Return the opcode number for this descriptor.
unsigned getOpcode() const { return Opcode; }
- /// \brief Return the number of declared MachineOperands for this
+ /// Return the number of declared MachineOperands for this
/// MachineInstruction. Note that variadic (isVariadic() returns true)
/// instructions may have additional operands at the end of the list, and note
/// that the machine instruction may include implicit register def/uses as
@@ -218,44 +217,50 @@ public:
return make_range(opInfo_begin(), opInfo_end());
}
- /// \brief Return the number of MachineOperands that are register
+ /// Return the number of MachineOperands that are register
/// definitions. Register definitions always occur at the start of the
/// machine operand list. This is the number of "outs" in the .td file,
/// and does not include implicit defs.
unsigned getNumDefs() const { return NumDefs; }
- /// \brief Return flags of this instruction.
+ /// Return flags of this instruction.
uint64_t getFlags() const { return Flags; }
- /// \brief Return true if this instruction can have a variable number of
+ /// Return true if this instruction can have a variable number of
/// operands. In this case, the variable operands will be after the normal
/// operands but before the implicit definitions and uses (if any are
/// present).
bool isVariadic() const { return Flags & (1ULL << MCID::Variadic); }
- /// \brief Set if this instruction has an optional definition, e.g.
+ /// Set if this instruction has an optional definition, e.g.
/// ARM instructions which can set condition code if 's' bit is set.
bool hasOptionalDef() const { return Flags & (1ULL << MCID::HasOptionalDef); }
- /// \brief Return true if this is a pseudo instruction that doesn't
+ /// Return true if this is a pseudo instruction that doesn't
/// correspond to a real machine instruction.
bool isPseudo() const { return Flags & (1ULL << MCID::Pseudo); }
- /// \brief Return true if the instruction is a return.
+ /// Return true if the instruction is a return.
bool isReturn() const { return Flags & (1ULL << MCID::Return); }
- /// \brief Return true if the instruction is an add instruction.
+ /// Return true if the instruction is an add instruction.
bool isAdd() const { return Flags & (1ULL << MCID::Add); }
- /// \brief Return true if the instruction is a call.
+ /// Return true if this instruction is a trap.
+ bool isTrap() const { return Flags & (1ULL << MCID::Trap); }
+
+ /// Return true if the instruction is a register to register move.
+ bool isMoveReg() const { return Flags & (1ULL << MCID::MoveReg); }
+
+ /// Return true if the instruction is a call.
bool isCall() const { return Flags & (1ULL << MCID::Call); }
- /// \brief Returns true if the specified instruction stops control flow
+ /// Returns true if the specified instruction stops control flow
/// from executing the instruction immediately following it. Examples include
/// unconditional branches and return instructions.
bool isBarrier() const { return Flags & (1ULL << MCID::Barrier); }
- /// \brief Returns true if this instruction part of the terminator for
+ /// Returns true if this instruction is part of the terminator for
/// a basic block. Typically this is things like return and branch
/// instructions.
///
@@ -263,17 +268,17 @@ public:
/// but before control flow occurs.
bool isTerminator() const { return Flags & (1ULL << MCID::Terminator); }
- /// \brief Returns true if this is a conditional, unconditional, or
+ /// Returns true if this is a conditional, unconditional, or
/// indirect branch. Predicates below can be used to discriminate between
/// these cases, and the TargetInstrInfo::AnalyzeBranch method can be used to
/// get more information.
bool isBranch() const { return Flags & (1ULL << MCID::Branch); }
- /// \brief Return true if this is an indirect branch, such as a
+ /// Return true if this is an indirect branch, such as a
/// branch through a register.
bool isIndirectBranch() const { return Flags & (1ULL << MCID::IndirectBranch); }
- /// \brief Return true if this is a branch which may fall
+ /// Return true if this is a branch which may fall
/// through to the next instruction or may transfer control flow to some other
/// block. The TargetInstrInfo::AnalyzeBranch method can be used to get more
/// information about this branch.
@@ -281,7 +286,7 @@ public:
return isBranch() & !isBarrier() & !isIndirectBranch();
}
- /// \brief Return true if this is a branch which always
+ /// Return true if this is a branch which always
/// transfers control flow to some other block. The
/// TargetInstrInfo::AnalyzeBranch method can be used to get more information
/// about this branch.
@@ -289,40 +294,40 @@ public:
return isBranch() & isBarrier() & !isIndirectBranch();
}
- /// \brief Return true if this is a branch or an instruction which directly
+ /// Return true if this is a branch or an instruction which directly
/// writes to the program counter. Considered 'may' affect rather than
/// 'does' affect as things like predication are not taken into account.
bool mayAffectControlFlow(const MCInst &MI, const MCRegisterInfo &RI) const;
- /// \brief Return true if this instruction has a predicate operand
+ /// Return true if this instruction has a predicate operand
/// that controls execution. It may be set to 'always', or may be set to other
/// values. There are various methods in TargetInstrInfo that can be used to
/// control and modify the predicate in this instruction.
bool isPredicable() const { return Flags & (1ULL << MCID::Predicable); }
- /// \brief Return true if this instruction is a comparison.
+ /// Return true if this instruction is a comparison.
bool isCompare() const { return Flags & (1ULL << MCID::Compare); }
- /// \brief Return true if this instruction is a move immediate
+ /// Return true if this instruction is a move immediate
/// (including conditional moves) instruction.
bool isMoveImmediate() const { return Flags & (1ULL << MCID::MoveImm); }
- /// \brief Return true if this instruction is a bitcast instruction.
+ /// Return true if this instruction is a bitcast instruction.
bool isBitcast() const { return Flags & (1ULL << MCID::Bitcast); }
- /// \brief Return true if this is a select instruction.
+ /// Return true if this is a select instruction.
bool isSelect() const { return Flags & (1ULL << MCID::Select); }
- /// \brief Return true if this instruction cannot be safely
+ /// Return true if this instruction cannot be safely
/// duplicated. For example, if the instruction has a unique labels attached
/// to it, duplicating it would cause multiple definition errors.
bool isNotDuplicable() const { return Flags & (1ULL << MCID::NotDuplicable); }
- /// \brief Returns true if the specified instruction has a delay slot which
+ /// Returns true if the specified instruction has a delay slot which
/// must be filled by the code generator.
bool hasDelaySlot() const { return Flags & (1ULL << MCID::DelaySlot); }
- /// \brief Return true for instructions that can be folded as memory operands
+ /// Return true for instructions that can be folded as memory operands
/// in other instructions. The most common use for this is instructions that
/// are simple loads from memory that don't modify the loaded value in any
/// way, but it can also be used for instructions that can be expressed as
@@ -331,7 +336,7 @@ public:
/// that return a value in their only virtual register definition.
bool canFoldAsLoad() const { return Flags & (1ULL << MCID::FoldableAsLoad); }
- /// \brief Return true if this instruction behaves
+ /// Return true if this instruction behaves
/// the same way as the generic REG_SEQUENCE instructions.
/// E.g., on ARM,
/// dX VMOVDRR rY, rZ
@@ -343,7 +348,7 @@ public:
/// override accordingly.
bool isRegSequenceLike() const { return Flags & (1ULL << MCID::RegSequence); }
- /// \brief Return true if this instruction behaves
+ /// Return true if this instruction behaves
/// the same way as the generic EXTRACT_SUBREG instructions.
/// E.g., on ARM,
/// rX, rY VMOVRRD dZ
@@ -358,7 +363,7 @@ public:
return Flags & (1ULL << MCID::ExtractSubreg);
}
- /// \brief Return true if this instruction behaves
+ /// Return true if this instruction behaves
/// the same way as the generic INSERT_SUBREG instructions.
/// E.g., on ARM,
/// dX = VSETLNi32 dY, rZ, Imm
@@ -371,7 +376,7 @@ public:
bool isInsertSubregLike() const { return Flags & (1ULL << MCID::InsertSubreg); }
- /// \brief Return true if this instruction is convergent.
+ /// Return true if this instruction is convergent.
///
/// Convergent instructions may not be made control-dependent on any
/// additional values.
@@ -381,18 +386,18 @@ public:
// Side Effect Analysis
//===--------------------------------------------------------------------===//
- /// \brief Return true if this instruction could possibly read memory.
+ /// Return true if this instruction could possibly read memory.
/// Instructions with this flag set are not necessarily simple load
/// instructions, they may load a value and modify it, for example.
bool mayLoad() const { return Flags & (1ULL << MCID::MayLoad); }
- /// \brief Return true if this instruction could possibly modify memory.
+ /// Return true if this instruction could possibly modify memory.
/// Instructions with this flag set are not necessarily simple store
/// instructions, they may store a modified value based on their operands, or
/// may not actually modify anything, for example.
bool mayStore() const { return Flags & (1ULL << MCID::MayStore); }
- /// \brief Return true if this instruction has side
+ /// Return true if this instruction has side
/// effects that are not modeled by other flags. This does not return true
/// for instructions whose effects are captured by:
///
@@ -412,7 +417,7 @@ public:
// Flags that indicate whether an instruction can be modified by a method.
//===--------------------------------------------------------------------===//
- /// \brief Return true if this may be a 2- or 3-address instruction (of the
+ /// Return true if this may be a 2- or 3-address instruction (of the
/// form "X = op Y, Z, ..."), which produces the same result if Y and Z are
/// exchanged. If this flag is set, then the
/// TargetInstrInfo::commuteInstruction method may be used to hack on the
@@ -424,7 +429,7 @@ public:
/// commute them.
bool isCommutable() const { return Flags & (1ULL << MCID::Commutable); }
- /// \brief Return true if this is a 2-address instruction which can be changed
+ /// Return true if this is a 2-address instruction which can be changed
/// into a 3-address instruction if needed. Doing this transformation can be
/// profitable in the register allocator, because it means that the
/// instruction can use a 2-address form if possible, but degrade into a less
@@ -442,7 +447,7 @@ public:
return Flags & (1ULL << MCID::ConvertibleTo3Addr);
}
- /// \brief Return true if this instruction requires custom insertion support
+ /// Return true if this instruction requires custom insertion support
/// when the DAG scheduler is inserting it into a machine basic block. If
/// this is true for the instruction, it basically means that it is a pseudo
/// instruction used at SelectionDAG time that is expanded out into magic code
@@ -454,13 +459,13 @@ public:
return Flags & (1ULL << MCID::UsesCustomInserter);
}
- /// \brief Return true if this instruction requires *adjustment* after
+ /// Return true if this instruction requires *adjustment* after
/// instruction selection by calling a target hook. For example, this can be
/// used to fill in ARM 's' optional operand depending on whether the
/// conditional flag register is used.
bool hasPostISelHook() const { return Flags & (1ULL << MCID::HasPostISelHook); }
- /// \brief Returns true if this instruction is a candidate for remat. This
+ /// Returns true if this instruction is a candidate for remat. This
/// flag is only used in TargetInstrInfo method isTriviallyRematerializable.
///
/// If this flag is set, the isReallyTriviallyReMaterializable()
@@ -470,7 +475,7 @@ public:
return Flags & (1ULL << MCID::Rematerializable);
}
- /// \brief Returns true if this instruction has the same cost (or less) than a
+ /// Returns true if this instruction has the same cost (or less) than a
/// move instruction. This is useful during certain types of optimizations
/// (e.g., remat during two-address conversion or machine licm) where we would
/// like to remat or hoist the instruction, but not if it costs more than
@@ -481,7 +486,7 @@ public:
/// for different subtargets.
bool isAsCheapAsAMove() const { return Flags & (1ULL << MCID::CheapAsAMove); }
- /// \brief Returns true if this instruction source operands have special
+ /// Returns true if this instruction's source operands have special
/// register allocation requirements that are not captured by the operand
/// register classes. e.g. ARM::STRD's two source registers must be an even /
/// odd pair, ARM::STM registers have to be in ascending order. Post-register
@@ -491,7 +496,7 @@ public:
return Flags & (1ULL << MCID::ExtraSrcRegAllocReq);
}
- /// \brief Returns true if this instruction def operands have special register
+ /// Returns true if this instruction's def operands have special register
/// allocation requirements that are not captured by the operand register
/// classes. e.g. ARM::LDRD's two def registers must be an even / odd pair,
/// ARM::LDM registers have to be in ascending order. Post-register
@@ -501,7 +506,7 @@ public:
return Flags & (1ULL << MCID::ExtraDefRegAllocReq);
}
- /// \brief Return a list of registers that are potentially read by any
+ /// Return a list of registers that are potentially read by any
/// instance of this machine instruction. For example, on X86, the "adc"
/// instruction adds two register operands and adds the carry bit in from the
/// flags register. In this case, the instruction is marked as implicitly
@@ -511,7 +516,7 @@ public:
/// This method returns null if the instruction has no implicit uses.
const MCPhysReg *getImplicitUses() const { return ImplicitUses; }
- /// \brief Return the number of implicit uses this instruction has.
+ /// Return the number of implicit uses this instruction has.
unsigned getNumImplicitUses() const {
if (!ImplicitUses)
return 0;
@@ -521,7 +526,7 @@ public:
return i;
}
- /// \brief Return a list of registers that are potentially written by any
+ /// Return a list of registers that are potentially written by any
/// instance of this machine instruction. For example, on X86, many
/// instructions implicitly set the flags register. In this case, they are
/// marked as setting the FLAGS. Likewise, many instructions always deposit
@@ -533,7 +538,7 @@ public:
/// This method returns null if the instruction has no implicit defs.
const MCPhysReg *getImplicitDefs() const { return ImplicitDefs; }
- /// \brief Return the number of implicit defs this instruct has.
+ /// Return the number of implicit defs this instruction has.
unsigned getNumImplicitDefs() const {
if (!ImplicitDefs)
return 0;
@@ -543,7 +548,7 @@ public:
return i;
}
- /// \brief Return true if this instruction implicitly
+ /// Return true if this instruction implicitly
/// uses the specified physical register.
bool hasImplicitUseOfPhysReg(unsigned Reg) const {
if (const MCPhysReg *ImpUses = ImplicitUses)
@@ -553,22 +558,22 @@ public:
return false;
}
- /// \brief Return true if this instruction implicitly
+ /// Return true if this instruction implicitly
/// defines the specified physical register.
bool hasImplicitDefOfPhysReg(unsigned Reg,
const MCRegisterInfo *MRI = nullptr) const;
- /// \brief Return the scheduling class for this instruction. The
+ /// Return the scheduling class for this instruction. The
/// scheduling class is an index into the InstrItineraryData table. This
/// returns zero if there is no known scheduling information for the
/// instruction.
unsigned getSchedClass() const { return SchedClass; }
- /// \brief Return the number of bytes in the encoding of this instruction,
+ /// Return the number of bytes in the encoding of this instruction,
/// or zero if the encoding size cannot be known from the opcode.
unsigned getSize() const { return Size; }
- /// \brief Find the index of the first operand in the
+ /// Find the index of the first operand in the
/// operand list that is used to represent the predicate. It returns -1 if
/// none is found.
int findFirstPredOperandIdx() const {
@@ -580,7 +585,7 @@ public:
return -1;
}
- /// \brief Return true if this instruction defines the specified physical
+ /// Return true if this instruction defines the specified physical
/// register, either explicitly or implicitly.
bool hasDefOfPhysReg(const MCInst &MI, unsigned Reg,
const MCRegisterInfo &RI) const;
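
The new MCID::Trap and MCID::MoveReg flags are queried through the usual predicate pattern on the descriptor. A short hedged example, assuming MCII is the target's MCInstrInfo (the classification logic is illustrative only):

#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/raw_ostream.h"

void classifyInst(const llvm::MCInstrInfo &MCII, const llvm::MCInst &MI) {
  const llvm::MCInstrDesc &Desc = MCII.get(MI.getOpcode());
  if (Desc.isTrap())
    llvm::errs() << "trap\n";                        // new MCID::Trap flag
  else if (Desc.isMoveReg())
    llvm::errs() << "register-to-register move\n";   // new MCID::MoveReg flag
  else if (Desc.isBranch() && !Desc.isBarrier() && !Desc.isIndirectBranch())
    llvm::errs() << "conditional branch\n";          // mirrors isConditionalBranch()
}
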
diff --git a/contrib/llvm/include/llvm/MC/MCInstrInfo.h b/contrib/llvm/include/llvm/MC/MCInstrInfo.h
index 80f1f320b7c2..18da87cf8929 100644
--- a/contrib/llvm/include/llvm/MC/MCInstrInfo.h
+++ b/contrib/llvm/include/llvm/MC/MCInstrInfo.h
@@ -20,7 +20,7 @@
namespace llvm {
//---------------------------------------------------------------------------
-/// \brief Interface to description of machine instruction set.
+/// Interface to description of machine instruction set.
class MCInstrInfo {
const MCInstrDesc *Desc; // Raw array to allow static init'n
const unsigned *InstrNameIndices; // Array for name indices in InstrNameData
@@ -28,7 +28,7 @@ class MCInstrInfo {
unsigned NumOpcodes; // Number of entries in the desc array
public:
- /// \brief Initialize MCInstrInfo, called by TableGen auto-generated routines.
+ /// Initialize MCInstrInfo, called by TableGen auto-generated routines.
/// *DO NOT USE*.
void InitMCInstrInfo(const MCInstrDesc *D, const unsigned *NI, const char *ND,
unsigned NO) {
@@ -40,14 +40,14 @@ public:
unsigned getNumOpcodes() const { return NumOpcodes; }
- /// \brief Return the machine instruction descriptor that corresponds to the
+ /// Return the machine instruction descriptor that corresponds to the
/// specified instruction opcode.
const MCInstrDesc &get(unsigned Opcode) const {
assert(Opcode < NumOpcodes && "Invalid opcode!");
return Desc[Opcode];
}
- /// \brief Returns the name for the instructions with the given opcode.
+ /// Returns the name for the instructions with the given opcode.
StringRef getName(unsigned Opcode) const {
assert(Opcode < NumOpcodes && "Invalid opcode!");
return StringRef(&InstrNameData[InstrNameIndices[Opcode]]);
diff --git a/contrib/llvm/include/llvm/MC/MCInstrItineraries.h b/contrib/llvm/include/llvm/MC/MCInstrItineraries.h
index 4443dd113715..fe81376e0db7 100644
--- a/contrib/llvm/include/llvm/MC/MCInstrItineraries.h
+++ b/contrib/llvm/include/llvm/MC/MCInstrItineraries.h
@@ -67,12 +67,12 @@ struct InstrStage {
int NextCycles_; ///< Number of machine cycles to next stage
ReservationKinds Kind_; ///< Kind of the FU reservation
- /// \brief Returns the number of cycles the stage is occupied.
+ /// Returns the number of cycles the stage is occupied.
unsigned getCycles() const {
return Cycles_;
}
- /// \brief Returns the choice of FUs.
+ /// Returns the choice of FUs.
unsigned getUnits() const {
return Units_;
}
@@ -81,7 +81,7 @@ struct InstrStage {
return Kind_;
}
- /// \brief Returns the number of cycles from the start of this stage to the
+ /// Returns the number of cycles from the start of this stage to the
/// start of the next stage in the itinerary
unsigned getNextCycles() const {
return (NextCycles_ >= 0) ? (unsigned)NextCycles_ : Cycles_;
@@ -94,11 +94,11 @@ struct InstrStage {
/// cycle in which operands are read and written.
///
struct InstrItinerary {
- int NumMicroOps; ///< # of micro-ops, -1 means it's variable
- unsigned FirstStage; ///< Index of first stage in itinerary
- unsigned LastStage; ///< Index of last + 1 stage in itinerary
- unsigned FirstOperandCycle; ///< Index of first operand rd/wr
- unsigned LastOperandCycle; ///< Index of last + 1 operand rd/wr
+ int16_t NumMicroOps; ///< # of micro-ops, -1 means it's variable
+ uint16_t FirstStage; ///< Index of first stage in itinerary
+ uint16_t LastStage; ///< Index of last + 1 stage in itinerary
+ uint16_t FirstOperandCycle; ///< Index of first operand rd/wr
+ uint16_t LastOperandCycle; ///< Index of last + 1 operand rd/wr
};
//===----------------------------------------------------------------------===//
@@ -120,28 +120,28 @@ public:
: SchedModel(SM), Stages(S), OperandCycles(OS), Forwardings(F),
Itineraries(SchedModel.InstrItineraries) {}
- /// \brief Returns true if there are no itineraries.
+ /// Returns true if there are no itineraries.
bool isEmpty() const { return Itineraries == nullptr; }
- /// \brief Returns true if the index is for the end marker itinerary.
+ /// Returns true if the index is for the end marker itinerary.
bool isEndMarker(unsigned ItinClassIndx) const {
- return ((Itineraries[ItinClassIndx].FirstStage == ~0U) &&
- (Itineraries[ItinClassIndx].LastStage == ~0U));
+ return ((Itineraries[ItinClassIndx].FirstStage == UINT16_MAX) &&
+ (Itineraries[ItinClassIndx].LastStage == UINT16_MAX));
}
- /// \brief Return the first stage of the itinerary.
+ /// Return the first stage of the itinerary.
const InstrStage *beginStage(unsigned ItinClassIndx) const {
unsigned StageIdx = Itineraries[ItinClassIndx].FirstStage;
return Stages + StageIdx;
}
- /// \brief Return the last+1 stage of the itinerary.
+ /// Return the last+1 stage of the itinerary.
const InstrStage *endStage(unsigned ItinClassIndx) const {
unsigned StageIdx = Itineraries[ItinClassIndx].LastStage;
return Stages + StageIdx;
}
- /// \brief Return the total stage latency of the given class. The latency is
+ /// Return the total stage latency of the given class. The latency is
/// the maximum completion time for any stage in the itinerary. If no stages
/// exist, it defaults to one cycle.
unsigned getStageLatency(unsigned ItinClassIndx) const {
@@ -160,7 +160,7 @@ public:
return Latency;
}
- /// \brief Return the cycle for the given class and operand. Return -1 if no
+ /// Return the cycle for the given class and operand. Return -1 if no
/// cycle is specified for the operand.
int getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const {
if (isEmpty())
@@ -174,7 +174,7 @@ public:
return (int)OperandCycles[FirstIdx + OperandIdx];
}
- /// \brief Return true if there is a pipeline forwarding between instructions
+ /// Return true if there is a pipeline forwarding between instructions
/// of itinerary classes DefClass and UseClasses so that value produced by an
/// instruction of itinerary class DefClass, operand index DefIdx can be
/// bypassed when it's read by an instruction of itinerary class UseClass,
@@ -197,7 +197,7 @@ public:
Forwardings[FirstUseIdx + UseIdx];
}
- /// \brief Compute and return the use operand latency of a given itinerary
+ /// Compute and return the use operand latency of a given itinerary
/// class and operand index if the value is produced by an instruction of the
/// specified itinerary class and def operand index.
int getOperandLatency(unsigned DefClass, unsigned DefIdx,
@@ -221,7 +221,7 @@ public:
return UseCycle;
}
- /// \brief Return the number of micro-ops that the given class decodes to.
+ /// Return the number of micro-ops that the given class decodes to.
/// Return -1 for classes that require dynamic lookup via TargetInstrInfo.
int getNumMicroOps(unsigned ItinClassIndx) const {
if (isEmpty())
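
Itinerary queries are unchanged by the narrowing of InstrItinerary's fields to 16-bit types; only the end-marker comparison moves from ~0U to UINT16_MAX. A small sketch of a scheduling client, assuming Itins was obtained from MCSubtargetInfo::getInstrItineraryForCPU() and Class is a valid itinerary class index:

#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/raw_ostream.h"

void dumpItineraryClass(const llvm::InstrItineraryData &Itins, unsigned Class) {
  if (Itins.isEmpty())
    return;                                          // target has no itineraries
  unsigned Latency = Itins.getStageLatency(Class);   // max stage completion time
  int MicroOps = Itins.getNumMicroOps(Class);        // -1 means dynamic lookup
  int Def0Cycle = Itins.getOperandCycle(Class, 0);   // -1 if unspecified
  llvm::errs() << "latency=" << Latency << " uops=" << MicroOps
               << " def0=" << Def0Cycle << '\n';
}
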
diff --git a/contrib/llvm/include/llvm/MC/MCLabel.h b/contrib/llvm/include/llvm/MC/MCLabel.h
index b6579fd654ab..aaf70691fc01 100644
--- a/contrib/llvm/include/llvm/MC/MCLabel.h
+++ b/contrib/llvm/include/llvm/MC/MCLabel.h
@@ -18,11 +18,11 @@ namespace llvm {
class raw_ostream;
-/// \brief Instances of this class represent a label name in the MC file,
+/// Instances of this class represent a label name in the MC file,
/// and MCLabel are created and uniqued by the MCContext class. MCLabel
/// should only be constructed for valid instances in the object file.
class MCLabel {
- // \brief The instance number of this Directional Local Label.
+ // The instance number of this Directional Local Label.
unsigned Instance;
private: // MCContext creates and uniques these.
@@ -34,16 +34,16 @@ public:
MCLabel(const MCLabel &) = delete;
MCLabel &operator=(const MCLabel &) = delete;
- /// \brief Get the current instance of this Directional Local Label.
+ /// Get the current instance of this Directional Local Label.
unsigned getInstance() const { return Instance; }
- /// \brief Increment the current instance of this Directional Local Label.
+ /// Increment the current instance of this Directional Local Label.
unsigned incInstance() { return ++Instance; }
- /// \brief Print the value to the stream \p OS.
+ /// Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
- /// \brief Print the value to stderr.
+ /// Print the value to stderr.
void dump() const;
};
diff --git a/contrib/llvm/include/llvm/MC/MCMachObjectWriter.h b/contrib/llvm/include/llvm/MC/MCMachObjectWriter.h
index 594869f74632..22fbeb72a4ec 100644
--- a/contrib/llvm/include/llvm/MC/MCMachObjectWriter.h
+++ b/contrib/llvm/include/llvm/MC/MCMachObjectWriter.h
@@ -26,7 +26,7 @@ namespace llvm {
class MachObjectWriter;
-class MCMachObjectTargetWriter {
+class MCMachObjectTargetWriter : public MCObjectTargetWriter {
const unsigned Is64Bit : 1;
const uint32_t CPUType;
const uint32_t CPUSubtype;
@@ -43,6 +43,11 @@ protected:
public:
virtual ~MCMachObjectTargetWriter();
+ virtual Triple::ObjectFormatType getFormat() const { return Triple::MachO; }
+ static bool classof(const MCObjectTargetWriter *W) {
+ return W->getFormat() == Triple::MachO;
+ }
+
/// \name Lifetime Management
/// @{
@@ -116,11 +121,15 @@ class MachObjectWriter : public MCObjectWriter {
MachSymbolData *findSymbolData(const MCSymbol &Sym);
+ void writeWithPadding(StringRef Str, uint64_t Size);
+
public:
MachObjectWriter(std::unique_ptr<MCMachObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian)
- : MCObjectWriter(OS, IsLittleEndian),
- TargetObjectWriter(std::move(MOTW)) {}
+ : TargetObjectWriter(std::move(MOTW)),
+ W(OS, IsLittleEndian ? support::little : support::big) {}
+
+ support::endian::Writer W;
const MCSymbol &findAliasedSymbol(const MCSymbol &Sym) const;
@@ -260,7 +269,7 @@ public:
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
- void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+ uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
};
/// Construct a new Mach-O writer instance.
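
With getFormat() and classof() in place, MCMachObjectTargetWriter participates in LLVM's isa/dyn_cast machinery on the new MCObjectTargetWriter base. A minimal sketch (the function name is illustrative):

#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/Casting.h"

bool targetsMachO(const llvm::MCObjectTargetWriter &TW) {
  // Equivalent to TW.getFormat() == Triple::MachO, via the classof() hook.
  return llvm::isa<llvm::MCMachObjectTargetWriter>(&TW);
}
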
diff --git a/contrib/llvm/include/llvm/MC/MCObjectFileInfo.h b/contrib/llvm/include/llvm/MC/MCObjectFileInfo.h
index 79bf2b97015f..3a27ef8c8fee 100644
--- a/contrib/llvm/include/llvm/MC/MCObjectFileInfo.h
+++ b/contrib/llvm/include/llvm/MC/MCObjectFileInfo.h
@@ -14,7 +14,9 @@
#ifndef LLVM_MC_MCOBJECTFILEINFO_H
#define LLVM_MC_MCOBJECTFILEINFO_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CodeGen.h"
namespace llvm {
@@ -79,6 +81,7 @@ protected:
MCSection *DwarfAbbrevSection;
MCSection *DwarfInfoSection;
MCSection *DwarfLineSection;
+ MCSection *DwarfLineStrSection;
MCSection *DwarfFrameSection;
MCSection *DwarfPubTypesSection;
const MCSection *DwarfDebugInlineSection;
@@ -91,11 +94,11 @@ protected:
// can be enabled by a compiler flag.
MCSection *DwarfPubNamesSection;
- /// DWARF5 Experimental Debug Info Sections
- /// DwarfAccelNamesSection, DwarfAccelObjCSection,
- /// DwarfAccelNamespaceSection, DwarfAccelTypesSection -
- /// If we use the DWARF accelerated hash tables then we want to emit these
- /// sections.
+ /// Accelerator table sections. DwarfDebugNamesSection is the DWARF v5
+ /// accelerator table, while DwarfAccelNamesSection, DwarfAccelObjCSection,
+ /// DwarfAccelNamespaceSection, DwarfAccelTypesSection are pre-DWARF v5
+ /// extensions.
+ MCSection *DwarfDebugNamesSection;
MCSection *DwarfAccelNamesSection;
MCSection *DwarfAccelObjCSection;
MCSection *DwarfAccelNamespaceSection;
@@ -113,6 +116,11 @@ protected:
/// The DWARF v5 string offset and address table sections.
MCSection *DwarfStrOffSection;
MCSection *DwarfAddrSection;
+ /// The DWARF v5 range list section.
+ MCSection *DwarfRnglistsSection;
+
+ /// The DWARF v5 range list section for fission.
+ MCSection *DwarfRnglistsDWOSection;
// These are for Fission DWP files.
MCSection *DwarfCUIndexSection;
@@ -157,6 +165,7 @@ protected:
/// Section containing metadata on function stack sizes.
MCSection *StackSizesSection;
+ mutable DenseMap<const MCSymbol *, unsigned> StackSizesUniquing;
// ELF specific sections.
MCSection *DataRelROSection;
@@ -182,6 +191,7 @@ protected:
MCSection *ConstTextCoalSection;
MCSection *ConstDataSection;
MCSection *DataCoalSection;
+ MCSection *ConstDataCoalSection;
MCSection *DataCommonSection;
MCSection *DataBSSSection;
MCSection *FourByteConstantSection;
@@ -196,6 +206,7 @@ protected:
MCSection *PDataSection;
MCSection *XDataSection;
MCSection *SXDataSection;
+ MCSection *GFIDsSection;
public:
void InitMCObjectFileInfo(const Triple &TT, bool PIC, MCContext &ctx,
@@ -233,6 +244,7 @@ public:
MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; }
MCSection *getDwarfInfoSection() const { return DwarfInfoSection; }
MCSection *getDwarfLineSection() const { return DwarfLineSection; }
+ MCSection *getDwarfLineStrSection() const { return DwarfLineStrSection; }
MCSection *getDwarfFrameSection() const { return DwarfFrameSection; }
MCSection *getDwarfPubNamesSection() const { return DwarfPubNamesSection; }
MCSection *getDwarfPubTypesSection() const { return DwarfPubTypesSection; }
@@ -249,9 +261,12 @@ public:
MCSection *getDwarfLocSection() const { return DwarfLocSection; }
MCSection *getDwarfARangesSection() const { return DwarfARangesSection; }
MCSection *getDwarfRangesSection() const { return DwarfRangesSection; }
+ MCSection *getDwarfRnglistsSection() const { return DwarfRnglistsSection; }
MCSection *getDwarfMacinfoSection() const { return DwarfMacinfoSection; }
- // DWARF5 Experimental Debug Info Sections
+ MCSection *getDwarfDebugNamesSection() const {
+ return DwarfDebugNamesSection;
+ }
MCSection *getDwarfAccelNamesSection() const {
return DwarfAccelNamesSection;
}
@@ -272,6 +287,9 @@ public:
MCSection *getDwarfStrOffDWOSection() const { return DwarfStrOffDWOSection; }
MCSection *getDwarfStrOffSection() const { return DwarfStrOffSection; }
MCSection *getDwarfAddrSection() const { return DwarfAddrSection; }
+ MCSection *getDwarfRnglistsDWOSection() const {
+ return DwarfRnglistsDWOSection;
+ }
MCSection *getDwarfCUIndexSection() const { return DwarfCUIndexSection; }
MCSection *getDwarfTUIndexSection() const { return DwarfTUIndexSection; }
MCSection *getDwarfSwiftASTSection() const { return DwarfSwiftASTSection; }
@@ -293,7 +311,7 @@ public:
MCSection *getStackMapSection() const { return StackMapSection; }
MCSection *getFaultMapSection() const { return FaultMapSection; }
- MCSection *getStackSizesSection() const { return StackSizesSection; }
+ MCSection *getStackSizesSection(const MCSection &TextSec) const;
// ELF specific sections.
MCSection *getDataRelROSection() const { return DataRelROSection; }
@@ -323,6 +341,9 @@ public:
}
const MCSection *getConstDataSection() const { return ConstDataSection; }
const MCSection *getDataCoalSection() const { return DataCoalSection; }
+ const MCSection *getConstDataCoalSection() const {
+ return ConstDataCoalSection;
+ }
const MCSection *getDataCommonSection() const { return DataCommonSection; }
MCSection *getDataBSSSection() const { return DataBSSSection; }
const MCSection *getFourByteConstantSection() const {
@@ -349,6 +370,7 @@ public:
MCSection *getPDataSection() const { return PDataSection; }
MCSection *getXDataSection() const { return XDataSection; }
MCSection *getSXDataSection() const { return SXDataSection; }
+ MCSection *getGFIDsSection() const { return GFIDsSection; }
MCSection *getEHFrameSection() {
return EHFrameSection;
diff --git a/contrib/llvm/include/llvm/MC/MCObjectStreamer.h b/contrib/llvm/include/llvm/MC/MCObjectStreamer.h
index 43ed00b4a7a7..035206dce939 100644
--- a/contrib/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/contrib/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -26,7 +26,7 @@ class MCAsmBackend;
class raw_ostream;
class raw_pwrite_stream;
-/// \brief Streaming object file generation interface.
+/// Streaming object file generation interface.
///
/// This class provides an implementation of the MCStreamer interface which is
/// suitable for use with the assembler backend. Specific object file formats
@@ -34,9 +34,6 @@ class raw_pwrite_stream;
/// to that file format or custom semantics expected by the object writer
/// implementation.
class MCObjectStreamer : public MCStreamer {
- std::unique_ptr<MCObjectWriter> ObjectWriter;
- std::unique_ptr<MCAsmBackend> TAB;
- std::unique_ptr<MCCodeEmitter> Emitter;
std::unique_ptr<MCAssembler> Assembler;
MCSection::iterator CurInsertionPoint;
bool EmitEHFrame;
@@ -51,7 +48,7 @@ class MCObjectStreamer : public MCStreamer {
protected:
MCObjectStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter);
~MCObjectStreamer();
@@ -76,7 +73,9 @@ public:
/// Get a data fragment to write into, creating a new one if the current
/// fragment is not a data fragment.
- MCDataFragment *getOrCreateDataFragment();
+ /// Optionally a \p STI can be passed in so that a new fragment is created
+ /// if the Subtarget differs from the current fragment.
+ MCDataFragment *getOrCreateDataFragment(const MCSubtargetInfo* STI = nullptr);
MCPaddingFragment *getOrCreatePaddingFragment();
protected:
@@ -91,8 +90,11 @@ protected:
public:
void visitUsedSymbol(const MCSymbol &Sym) override;
- MCAssembler &getAssembler() { return *Assembler; }
+ /// Create a dummy fragment to assign any pending labels.
+ void flushPendingLabels() { flushPendingLabels(nullptr); }
+ MCAssembler &getAssembler() { return *Assembler; }
+ MCAssembler *getAssemblerPtr() override;
/// \name MCStreamer Interface
/// @{
@@ -108,7 +110,7 @@ public:
void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
bool = false) override;
- /// \brief Emit an instruction to a special fragment, because this instruction
+ /// Emit an instruction to a special fragment, because this instruction
/// can change its size during relaxation.
virtual void EmitInstToFragment(const MCInst &Inst, const MCSubtargetInfo &);
@@ -159,7 +161,8 @@ public:
void EmitGPRel32Value(const MCExpr *Value) override;
void EmitGPRel64Value(const MCExpr *Value) override;
bool EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc Loc) override;
+ const MCExpr *Expr, SMLoc Loc,
+ const MCSubtargetInfo &STI) override;
using MCStreamer::emitFill;
void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc = SMLoc()) override;
@@ -167,6 +170,9 @@ public:
SMLoc Loc = SMLoc()) override;
void EmitFileDirective(StringRef Filename) override;
+ void EmitAddrsig() override;
+ void EmitAddrsigSym(const MCSymbol *Sym) override;
+
void FinishImpl() override;
/// Emit the absolute difference between two symbols if possible.
@@ -179,6 +185,9 @@ public:
void emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
unsigned Size) override;
+ void emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi,
+ const MCSymbol *Lo) override;
+
bool mayHaveInstructions(MCSection &Sec) const override;
};
diff --git a/contrib/llvm/include/llvm/MC/MCObjectWriter.h b/contrib/llvm/include/llvm/MC/MCObjectWriter.h
index cd90690fb186..8bae2bf20083 100644
--- a/contrib/llvm/include/llvm/MC/MCObjectWriter.h
+++ b/contrib/llvm/include/llvm/MC/MCObjectWriter.h
@@ -12,6 +12,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
@@ -36,22 +37,9 @@ class MCValue;
/// points. Once assembly is complete, the object writer is given the
/// MCAssembler instance, which contains all the symbol and section data which
/// should be emitted as part of writeObject().
-///
-/// The object writer also contains a number of helper methods for writing
-/// binary data to the output stream.
class MCObjectWriter {
- raw_pwrite_stream *OS;
-
protected:
- unsigned IsLittleEndian : 1;
-
- // Can only create subclasses.
- MCObjectWriter(raw_pwrite_stream &OS, bool IsLittleEndian)
- : OS(&OS), IsLittleEndian(IsLittleEndian) {}
-
- unsigned getInitialOffset() {
- return OS->tell();
- }
+ MCObjectWriter() = default;
public:
MCObjectWriter(const MCObjectWriter &) = delete;
@@ -61,11 +49,6 @@ public:
/// lifetime management
virtual void reset() {}
- bool isLittleEndian() const { return IsLittleEndian; }
-
- raw_pwrite_stream &getStream() { return *OS; }
- void setStream(raw_pwrite_stream &NewOS) { OS = &NewOS; }
-
/// \name High-Level API
/// @{
@@ -109,90 +92,31 @@ public:
bool InSet,
bool IsPCRel) const;
- /// Write the object file.
+ /// Tell the object writer to emit an address-significance table during
+ /// writeObject(). If this function is not called, all symbols are treated as
+ /// address-significant.
+ virtual void emitAddrsigSection() {}
+
+ /// Record the given symbol in the address-significance table to be written
+ /// during writeObject().
+ virtual void addAddrsigSymbol(const MCSymbol *Sym) {}
+
+ /// Write the object file and returns the number of bytes written.
///
/// This routine is called by the assembler after layout and relaxation is
/// complete, fixups have been evaluated and applied, and relocations
/// generated.
- virtual void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) = 0;
+ virtual uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) = 0;
/// @}
- /// \name Binary Output
- /// @{
-
- void write8(uint8_t Value) { *OS << char(Value); }
-
- void writeLE16(uint16_t Value) {
- support::endian::Writer<support::little>(*OS).write(Value);
- }
-
- void writeLE32(uint32_t Value) {
- support::endian::Writer<support::little>(*OS).write(Value);
- }
-
- void writeLE64(uint64_t Value) {
- support::endian::Writer<support::little>(*OS).write(Value);
- }
-
- void writeBE16(uint16_t Value) {
- support::endian::Writer<support::big>(*OS).write(Value);
- }
-
- void writeBE32(uint32_t Value) {
- support::endian::Writer<support::big>(*OS).write(Value);
- }
-
- void writeBE64(uint64_t Value) {
- support::endian::Writer<support::big>(*OS).write(Value);
- }
-
- void write16(uint16_t Value) {
- if (IsLittleEndian)
- writeLE16(Value);
- else
- writeBE16(Value);
- }
-
- void write32(uint32_t Value) {
- if (IsLittleEndian)
- writeLE32(Value);
- else
- writeBE32(Value);
- }
-
- void write64(uint64_t Value) {
- if (IsLittleEndian)
- writeLE64(Value);
- else
- writeBE64(Value);
- }
-
- void WriteZeros(unsigned N) {
- const char Zeros[16] = {0};
-
- for (unsigned i = 0, e = N / 16; i != e; ++i)
- *OS << StringRef(Zeros, 16);
-
- *OS << StringRef(Zeros, N % 16);
- }
-
- void writeBytes(const SmallVectorImpl<char> &ByteVec,
- unsigned ZeroFillSize = 0) {
- writeBytes(StringRef(ByteVec.data(), ByteVec.size()), ZeroFillSize);
- }
-
- void writeBytes(StringRef Str, unsigned ZeroFillSize = 0) {
- // TODO: this version may need to go away once all fragment contents are
- // converted to SmallVector<char, N>
- assert(
- (ZeroFillSize == 0 || Str.size() <= ZeroFillSize) &&
- "data size greater than fill size, unexpected large write will occur");
- *OS << Str;
- if (ZeroFillSize)
- WriteZeros(ZeroFillSize - Str.size());
- }
+};
- /// @}
+/// Base class for classes that define behaviour that is specific to both the
+/// target and the object format.
+class MCObjectTargetWriter {
+public:
+ virtual ~MCObjectTargetWriter() = default;
+ virtual Triple::ObjectFormatType getFormat() const = 0;
};
} // end namespace llvm
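
Since the write8/write16/.../WriteZeros helpers are gone from the base class, concrete writers now emit bytes through a support::endian::Writer member (compare MachObjectWriter::W above). A hedged sketch of what replaces the old calls; the literal values are placeholders:

#include <cstdint>
#include "llvm/Support/EndianStream.h"

void emitHeaderWords(llvm::support::endian::Writer &W) {
  W.write<uint32_t>(0xfeedfacfU);       // endian-aware, was write32()
  W.write<uint16_t>(uint16_t(7));       // endian-aware, was write16()
  W.OS.write("\0\0\0\0\0\0\0\0", 8);    // zero padding, was WriteZeros(8)
}
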
diff --git a/contrib/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/contrib/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
index 7836ece2d688..10550b3370e8 100644
--- a/contrib/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/contrib/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -10,11 +10,9 @@
#ifndef LLVM_MC_MCPARSER_MCASMLEXER_H
#define LLVM_MC_MCPARSER_MCASMLEXER_H
-#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/SMLoc.h"
+#include "llvm/MC/MCAsmMacro.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -23,113 +21,6 @@
namespace llvm {
-/// Target independent representation for an assembler token.
-class AsmToken {
-public:
- enum TokenKind {
- // Markers
- Eof, Error,
-
- // String values.
- Identifier,
- String,
-
- // Integer values.
- Integer,
- BigNum, // larger than 64 bits
-
- // Real values.
- Real,
-
- // Comments
- Comment,
- HashDirective,
- // No-value.
- EndOfStatement,
- Colon,
- Space,
- Plus, Minus, Tilde,
- Slash, // '/'
- BackSlash, // '\'
- LParen, RParen, LBrac, RBrac, LCurly, RCurly,
- Star, Dot, Comma, Dollar, Equal, EqualEqual,
-
- Pipe, PipePipe, Caret,
- Amp, AmpAmp, Exclaim, ExclaimEqual, Percent, Hash,
- Less, LessEqual, LessLess, LessGreater,
- Greater, GreaterEqual, GreaterGreater, At,
-
- // MIPS unary expression operators such as %neg.
- PercentCall16, PercentCall_Hi, PercentCall_Lo, PercentDtprel_Hi,
- PercentDtprel_Lo, PercentGot, PercentGot_Disp, PercentGot_Hi, PercentGot_Lo,
- PercentGot_Ofst, PercentGot_Page, PercentGottprel, PercentGp_Rel, PercentHi,
- PercentHigher, PercentHighest, PercentLo, PercentNeg, PercentPcrel_Hi,
- PercentPcrel_Lo, PercentTlsgd, PercentTlsldm, PercentTprel_Hi,
- PercentTprel_Lo
- };
-
-private:
- TokenKind Kind;
-
- /// A reference to the entire token contents; this is always a pointer into
- /// a memory buffer owned by the source manager.
- StringRef Str;
-
- APInt IntVal;
-
-public:
- AsmToken() = default;
- AsmToken(TokenKind Kind, StringRef Str, APInt IntVal)
- : Kind(Kind), Str(Str), IntVal(std::move(IntVal)) {}
- AsmToken(TokenKind Kind, StringRef Str, int64_t IntVal = 0)
- : Kind(Kind), Str(Str), IntVal(64, IntVal, true) {}
-
- TokenKind getKind() const { return Kind; }
- bool is(TokenKind K) const { return Kind == K; }
- bool isNot(TokenKind K) const { return Kind != K; }
-
- SMLoc getLoc() const;
- SMLoc getEndLoc() const;
- SMRange getLocRange() const;
-
- /// Get the contents of a string token (without quotes).
- StringRef getStringContents() const {
- assert(Kind == String && "This token isn't a string!");
- return Str.slice(1, Str.size() - 1);
- }
-
- /// Get the identifier string for the current token, which should be an
- /// identifier or a string. This gets the portion of the string which should
- /// be used as the identifier, e.g., it does not include the quotes on
- /// strings.
- StringRef getIdentifier() const {
- if (Kind == Identifier)
- return getString();
- return getStringContents();
- }
-
- /// Get the string for the current token, this includes all characters (for
- /// example, the quotes on strings) in the token.
- ///
- /// The returned StringRef points into the source manager's memory buffer, and
- /// is safe to store across calls to Lex().
- StringRef getString() const { return Str; }
-
- // FIXME: Don't compute this in advance, it makes every token larger, and is
- // also not generally what we want (it is nicer for recovery etc. to lex 123br
- // as a single token, then diagnose as an invalid number).
- int64_t getIntVal() const {
- assert(Kind == Integer && "This token isn't an integer!");
- return IntVal.getZExtValue();
- }
-
- APInt getAPIntVal() const {
- assert((Kind == Integer || Kind == BigNum) &&
- "This token isn't an integer!");
- return IntVal;
- }
-};
-
/// A callback class which is notified of each comment in an assembly file as
/// it is lexed.
class AsmCommentConsumer {
diff --git a/contrib/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/contrib/llvm/include/llvm/MC/MCParser/MCAsmParser.h
index 0f79c4777ea9..0d56f36fbae8 100644
--- a/contrib/llvm/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/contrib/llvm/include/llvm/MC/MCParser/MCAsmParser.h
@@ -91,7 +91,7 @@ private:
IdKind Kind;
};
-/// \brief Generic Sema callback for assembly parser.
+/// Generic Sema callback for assembly parser.
class MCAsmParserSemaCallback {
public:
virtual ~MCAsmParserSemaCallback();
@@ -105,7 +105,7 @@ public:
unsigned &Offset) = 0;
};
-/// \brief Generic assembler parser interface, for use by target specific
+/// Generic assembler parser interface, for use by target specific
/// assembly parsers.
class MCAsmParser {
public:
@@ -153,7 +153,7 @@ public:
virtual MCContext &getContext() = 0;
- /// \brief Return the output streamer for the assembler.
+ /// Return the output streamer for the assembler.
virtual MCStreamer &getStreamer() = 0;
MCTargetAsmParser &getTargetParser() const { return *TargetParser; }
@@ -168,13 +168,13 @@ public:
void setEnablePrintSchedInfo(bool Value) { EnablePrintSchedInfo = Value; }
bool shouldPrintSchedInfo() { return EnablePrintSchedInfo; }
- /// \brief Run the parser on the input source buffer.
+ /// Run the parser on the input source buffer.
virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;
virtual void setParsingInlineAsm(bool V) = 0;
virtual bool isParsingInlineAsm() = 0;
- /// \brief Parse MS-style inline assembly.
+ /// Parse MS-style inline assembly.
virtual bool parseMSInlineAsm(
void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
@@ -182,22 +182,22 @@ public:
SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) = 0;
- /// \brief Emit a note at the location \p L, with the message \p Msg.
+ /// Emit a note at the location \p L, with the message \p Msg.
virtual void Note(SMLoc L, const Twine &Msg, SMRange Range = None) = 0;
- /// \brief Emit a warning at the location \p L, with the message \p Msg.
+ /// Emit a warning at the location \p L, with the message \p Msg.
///
/// \return The return value is true, if warnings are fatal.
virtual bool Warning(SMLoc L, const Twine &Msg, SMRange Range = None) = 0;
- /// \brief Return an error at the location \p L, with the message \p Msg. This
+ /// Return an error at the location \p L, with the message \p Msg. This
/// may be modified before being emitted.
///
/// \return The return value is always true, as an idiomatic convenience to
/// clients.
bool Error(SMLoc L, const Twine &Msg, SMRange Range = None);
- /// \brief Emit an error at the location \p L, with the message \p Msg.
+ /// Emit an error at the location \p L, with the message \p Msg.
///
/// \return The return value is always true, as an idiomatic convenience to
/// clients.
@@ -214,21 +214,23 @@ public:
return rv;
}
+ void clearPendingErrors() { PendingErrors.clear(); }
+
bool addErrorSuffix(const Twine &Suffix);
- /// \brief Get the next AsmToken in the stream, possibly handling file
+ /// Get the next AsmToken in the stream, possibly handling file
/// inclusion first.
virtual const AsmToken &Lex() = 0;
- /// \brief Get the current AsmToken from the stream.
+ /// Get the current AsmToken from the stream.
const AsmToken &getTok() const;
- /// \brief Report an error at the current lexer location.
+ /// Report an error at the current lexer location.
bool TokError(const Twine &Msg, SMRange Range = None);
bool parseTokenLoc(SMLoc &Loc);
bool parseToken(AsmToken::TokenKind T, const Twine &Msg = "unexpected token");
- /// \brief Attempt to parse and consume token, returning true on
+ /// Attempt to parse and consume token, returning true on
/// success.
bool parseOptionalToken(AsmToken::TokenKind T);
@@ -241,23 +243,23 @@ public:
bool check(bool P, const Twine &Msg);
bool check(bool P, SMLoc Loc, const Twine &Msg);
- /// \brief Parse an identifier or string (as a quoted identifier) and set \p
+ /// Parse an identifier or string (as a quoted identifier) and set \p
/// Res to the identifier contents.
virtual bool parseIdentifier(StringRef &Res) = 0;
- /// \brief Parse up to the end of statement and return the contents from the
+ /// Parse up to the end of statement and return the contents from the
/// current token until the end of the statement; the current token on exit
/// will be either the EndOfStatement or EOF.
virtual StringRef parseStringToEndOfStatement() = 0;
- /// \brief Parse the current token as a string which may include escaped
+ /// Parse the current token as a string which may include escaped
/// characters and return the string contents.
virtual bool parseEscapedString(std::string &Data) = 0;
- /// \brief Skip to the end of the current statement, for error recovery.
+ /// Skip to the end of the current statement, for error recovery.
virtual void eatToEndOfStatement() = 0;
- /// \brief Parse an arbitrary expression.
+ /// Parse an arbitrary expression.
///
/// \param Res - The value of the expression. The result is undefined
/// on error.
@@ -265,14 +267,14 @@ public:
virtual bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0;
bool parseExpression(const MCExpr *&Res);
- /// \brief Parse a primary expression.
+ /// Parse a primary expression.
///
/// \param Res - The value of the expression. The result is undefined
/// on error.
/// \return - False on success.
virtual bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) = 0;
- /// \brief Parse an arbitrary expression, assuming that an initial '(' has
+ /// Parse an arbitrary expression, assuming that an initial '(' has
/// already been consumed.
///
/// \param Res - The value of the expression. The result is undefined
@@ -280,19 +282,19 @@ public:
/// \return - False on success.
virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0;
- /// \brief Parse an expression which must evaluate to an absolute value.
+ /// Parse an expression which must evaluate to an absolute value.
///
/// \param Res - The value of the absolute expression. The result is undefined
/// on error.
/// \return - False on success.
virtual bool parseAbsoluteExpression(int64_t &Res) = 0;
- /// \brief Ensure that we have a valid section set in the streamer. Otherwise,
+ /// Ensure that we have a valid section set in the streamer. Otherwise,
/// report an error and switch to .text.
/// \return - False on success.
virtual bool checkForValidSection() = 0;
- /// \brief Parse an arbitrary expression of a specified parenthesis depth,
+ /// Parse an arbitrary expression of a specified parenthesis depth,
/// assuming that the initial '(' characters have already been consumed.
///
/// \param ParenDepth - Specifies how many trailing expressions outside the
@@ -304,7 +306,7 @@ public:
SMLoc &EndLoc) = 0;
};
-/// \brief Create an MCAsmParser instance.
+/// Create an MCAsmParser instance.
MCAsmParser *createMCAsmParser(SourceMgr &, MCContext &, MCStreamer &,
const MCAsmInfo &, unsigned CB = 0);
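
The token-level helpers above (parseToken, parseOptionalToken, parseIdentifier, TokError) compose into directive handlers along the following lines; the "name [, expr]" grammar and the function name are illustrative only:

#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCAsmParser.h"

bool parseMyDirective(llvm::MCAsmParser &Parser) {
  llvm::StringRef Name;
  if (Parser.parseIdentifier(Name))
    return Parser.TokError("expected identifier");
  const llvm::MCExpr *Value = nullptr;
  if (Parser.parseOptionalToken(llvm::AsmToken::Comma) &&
      Parser.parseExpression(Value))
    return true;                         // error already emitted
  return Parser.parseToken(llvm::AsmToken::EndOfStatement,
                           "unexpected token in directive");
}
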
diff --git a/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h b/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h
index ffb8d7a4a26a..1a132bceddc5 100644
--- a/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -20,7 +20,7 @@ namespace llvm {
class Twine;
-/// \brief Generic interface for extending the MCAsmParser,
+/// Generic interface for extending the MCAsmParser,
/// which is implemented by target and object file assembly parser
/// implementations.
class MCAsmParserExtension {
@@ -45,7 +45,7 @@ public:
MCAsmParserExtension &operator=(const MCAsmParserExtension &) = delete;
virtual ~MCAsmParserExtension();
- /// \brief Initialize the extension for parsing using the given \p Parser.
+ /// Initialize the extension for parsing using the given \p Parser.
/// The extension should use the AsmParser interfaces to register its
/// parsing routines.
virtual void Initialize(MCAsmParser &Parser);
diff --git a/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserUtils.h b/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserUtils.h
index 84173bb9cb8e..259113bc3860 100644
--- a/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserUtils.h
+++ b/contrib/llvm/include/llvm/MC/MCParser/MCAsmParserUtils.h
@@ -25,7 +25,7 @@ namespace MCParserUtils {
/// On success, returns false and sets the Symbol and Value output parameters.
bool parseAssignmentExpression(StringRef Name, bool allow_redef,
MCAsmParser &Parser, MCSymbol *&Symbol,
- const MCExpr *&Value);
+ const MCExpr *&Value, bool AllowExtendedExpr = false);
} // namespace MCParserUtils
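The new AllowExtendedExpr parameter defaults to false, so existing callers keep the old behaviour. A hedged sketch of a caller opting in, assuming a live MCAsmParser; the wrapper name emitAssignment is illustrative:

    #include "llvm/MC/MCParser/MCAsmParser.h"
    #include "llvm/MC/MCParser/MCAsmParserUtils.h"
    #include "llvm/MC/MCStreamer.h"

    static bool emitAssignment(llvm::StringRef Name, llvm::MCAsmParser &Parser) {
      llvm::MCSymbol *Sym = nullptr;
      const llvm::MCExpr *Value = nullptr;
      if (llvm::MCParserUtils::parseAssignmentExpression(
              Name, /*allow_redef=*/true, Parser, Sym, Value,
              /*AllowExtendedExpr=*/true))
        return true; // error already diagnosed by the parser
      Parser.getStreamer().EmitAssignment(Sym, Value); // record "Name = Value"
      return false;
    }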
diff --git a/contrib/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/contrib/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
index 9f8550c3887c..2d188a6755e1 100644
--- a/contrib/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
+++ b/contrib/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
@@ -14,6 +14,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/SMLoc.h"
@@ -133,6 +134,53 @@ enum OperandMatchResultTy {
MatchOperand_ParseFail // operand matched but had errors
};
+enum class DiagnosticPredicateTy {
+ Match,
+ NearMatch,
+ NoMatch,
+};
+
+// When an operand is parsed, the assembler will try to iterate through a set of
+// possible operand classes that the operand might match and call the
+// corresponding PredicateMethod to determine that.
+//
+// If there are two AsmOperands that would give a specific diagnostic if there
+// is no match, there is currently no mechanism to distinguish which operand is
+// a closer match. The DiagnosticPredicate distinguishes between 'completely
+// no match' and 'near match', so the assembler can decide whether to give a
+// specific diagnostic, or use 'InvalidOperand' and continue to find a
+// 'better matching' diagnostic.
+//
+// For example:
+// opcode opnd0, opnd1, opnd2
+//
+// where:
+// opnd2 could be an 'immediate of range [-8, 7]'
+// opnd2 could be a 'register + shift/extend'.
+//
+// If opnd2 is a valid register, but with a wrong shift/extend suffix, it makes
+// little sense to give a diagnostic that the operand should be an immediate
+// in range [-8, 7].
+//
+// This is a light-weight alternative to the 'NearMissInfo' approach
+// below which collects *all* possible diagnostics. This alternative
+// is optional and fully backward compatible with existing
+// PredicateMethods that return a 'bool' (match or no match).
+struct DiagnosticPredicate {
+ DiagnosticPredicateTy Type;
+
+ explicit DiagnosticPredicate(bool Match)
+ : Type(Match ? DiagnosticPredicateTy::Match
+ : DiagnosticPredicateTy::NearMatch) {}
+ DiagnosticPredicate(DiagnosticPredicateTy T) : Type(T) {}
+ DiagnosticPredicate(const DiagnosticPredicate &) = default;
+
+ operator bool() const { return Type == DiagnosticPredicateTy::Match; }
+ bool isMatch() const { return Type == DiagnosticPredicateTy::Match; }
+ bool isNearMatch() const { return Type == DiagnosticPredicateTy::NearMatch; }
+ bool isNoMatch() const { return Type == DiagnosticPredicateTy::NoMatch; }
+};
+
// When matching of an assembly instruction fails, there may be multiple
// encodings that are close to being a match. It's often ambiguous which one
// the programmer intended to use, so we want to report an error which mentions
@@ -271,6 +319,7 @@ class MCTargetAsmParser : public MCAsmParserExtension {
public:
enum MatchResultTy {
Match_InvalidOperand,
+ Match_InvalidTiedOperand,
Match_MissingFeature,
Match_MnemonicFail,
Match_Success,
@@ -323,6 +372,11 @@ public:
SemaCallback = Callback;
}
+ // Target-specific parsing of assembler-level variable assignment.
+ virtual bool parseAssignmentExpression(const MCExpr *&Res, SMLoc &EndLoc) {
+ return getParser().parseExpression(Res, EndLoc);
+ }
+
virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) = 0;
@@ -400,6 +454,15 @@ public:
virtual void convertToMapAndConstraints(unsigned Kind,
const OperandVector &Operands) = 0;
+ /// Returns whether two registers are equal and is used by the tied-operands
+ /// checks in the AsmMatcher. This method can be overridden to allow e.g. a
+ /// sub- or super-register as the tied operand.
+ virtual bool regsEqual(const MCParsedAsmOperand &Op1,
+ const MCParsedAsmOperand &Op2) const {
+ assert(Op1.isReg() && Op2.isReg() && "Operands not all regs");
+ return Op1.getReg() == Op2.getReg();
+ }
+
// Return whether this parser uses assignment statements with equals tokens
virtual bool equalIsAsmAssignment() { return true; };
// Return whether this start of statement identifier is a label
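The DiagnosticPredicate introduced above lets a PredicateMethod report a near match instead of a bare bool. An illustrative sketch only; DemoOperand and its fields are hypothetical and not part of LLVM:

    #include "llvm/MC/MCParser/MCTargetAsmParser.h"

    struct DemoOperand {
      bool IsReg = false;   // operand was parsed as a register
      bool ShiftOK = false; // its shift/extend suffix was valid

      llvm::DiagnosticPredicate isShiftedReg() const {
        if (!IsReg)
          return llvm::DiagnosticPredicateTy::NoMatch;   // wrong class; keep searching
        if (!ShiftOK)
          return llvm::DiagnosticPredicateTy::NearMatch; // right register, bad suffix
        return llvm::DiagnosticPredicateTy::Match;       // full match
      }
    };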
diff --git a/contrib/llvm/include/llvm/MC/MCRegisterInfo.h b/contrib/llvm/include/llvm/MC/MCRegisterInfo.h
index c57c9ef709da..6edfc30b0aa6 100644
--- a/contrib/llvm/include/llvm/MC/MCRegisterInfo.h
+++ b/contrib/llvm/include/llvm/MC/MCRegisterInfo.h
@@ -240,7 +240,7 @@ public:
friend class MCRegUnitMaskIterator;
friend class MCRegUnitRootIterator;
- /// \brief Initialize MCRegisterInfo, called by TableGen
+ /// Initialize MCRegisterInfo, called by TableGen
/// auto-generated routines. *DO NOT USE*.
void InitMCRegisterInfo(const MCRegisterDesc *D, unsigned NR, unsigned RA,
unsigned PC,
@@ -283,7 +283,7 @@ public:
Dwarf2LRegsSize = 0;
}
- /// \brief Used to initialize LLVM register to Dwarf
+ /// Used to initialize LLVM register to Dwarf
/// register number mapping. Called by TableGen auto-generated routines.
/// *DO NOT USE*.
void mapLLVMRegsToDwarfRegs(const DwarfLLVMRegPair *Map, unsigned Size,
@@ -297,7 +297,7 @@ public:
}
}
- /// \brief Used to initialize Dwarf register to LLVM
+ /// Used to initialize Dwarf register to LLVM
/// register number mapping. Called by TableGen auto-generated routines.
/// *DO NOT USE*.
void mapDwarfRegsToLLVMRegs(const DwarfLLVMRegPair *Map, unsigned Size,
@@ -324,7 +324,7 @@ public:
L2CVRegs[LLVMReg] = CVReg;
}
- /// \brief This method should return the register where the return
+ /// This method should return the register where the return
/// address can be found.
unsigned getRARegister() const {
return RAReg;
@@ -341,86 +341,86 @@ public:
return Desc[RegNo];
}
- /// \brief Provide a get method, equivalent to [], but more useful with a
+ /// Provide a get method, equivalent to [], but more useful with a
/// pointer to this object.
const MCRegisterDesc &get(unsigned RegNo) const {
return operator[](RegNo);
}
- /// \brief Returns the physical register number of sub-register "Index"
+ /// Returns the physical register number of sub-register "Index"
/// for physical register RegNo. Return zero if the sub-register does not
/// exist.
unsigned getSubReg(unsigned Reg, unsigned Idx) const;
- /// \brief Return a super-register of the specified register
+ /// Return a super-register of the specified register
/// Reg so its sub-register of index SubIdx is Reg.
unsigned getMatchingSuperReg(unsigned Reg, unsigned SubIdx,
const MCRegisterClass *RC) const;
- /// \brief For a given register pair, return the sub-register index
+ /// For a given register pair, return the sub-register index
/// if the second register is a sub-register of the first. Return zero
/// otherwise.
unsigned getSubRegIndex(unsigned RegNo, unsigned SubRegNo) const;
- /// \brief Get the size of the bit range covered by a sub-register index.
+ /// Get the size of the bit range covered by a sub-register index.
/// If the index isn't continuous, return the sum of the sizes of its parts.
/// If the index is used to access subregisters of different sizes, return -1.
unsigned getSubRegIdxSize(unsigned Idx) const;
- /// \brief Get the offset of the bit range covered by a sub-register index.
+ /// Get the offset of the bit range covered by a sub-register index.
/// If an Offset doesn't make sense (the index isn't continuous, or is used to
/// access sub-registers at different offsets), return -1.
unsigned getSubRegIdxOffset(unsigned Idx) const;
- /// \brief Return the human-readable symbolic target-specific name for the
+ /// Return the human-readable symbolic target-specific name for the
/// specified physical register.
const char *getName(unsigned RegNo) const {
return RegStrings + get(RegNo).Name;
}
- /// \brief Return the number of registers this target has (useful for
+ /// Return the number of registers this target has (useful for
/// sizing arrays holding per register information)
unsigned getNumRegs() const {
return NumRegs;
}
- /// \brief Return the number of sub-register indices
+ /// Return the number of sub-register indices
/// understood by the target. Index 0 is reserved for the no-op sub-register,
/// while 1 to getNumSubRegIndices() - 1 represent real sub-registers.
unsigned getNumSubRegIndices() const {
return NumSubRegIndices;
}
- /// \brief Return the number of (native) register units in the
+ /// Return the number of (native) register units in the
/// target. Register units are numbered from 0 to getNumRegUnits() - 1. They
/// can be accessed through MCRegUnitIterator defined below.
unsigned getNumRegUnits() const {
return NumRegUnits;
}
- /// \brief Map a target register to an equivalent dwarf register
+ /// Map a target register to an equivalent dwarf register
/// number. Returns -1 if there is no equivalent value. The second
/// parameter allows targets to use different numberings for EH info and
/// debugging info.
int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- /// \brief Map a dwarf register back to a target register.
+ /// Map a dwarf register back to a target register.
int getLLVMRegNum(unsigned RegNum, bool isEH) const;
- /// \brief Map a DWARF EH register back to a target register (same as
+ /// Map a DWARF EH register back to a target register (same as
/// getLLVMRegNum(RegNum, true)) but return -1 if there is no mapping,
/// rather than asserting that there must be one.
int getLLVMRegNumFromEH(unsigned RegNum) const;
- /// \brief Map a target EH register number to an equivalent DWARF register
+ /// Map a target EH register number to an equivalent DWARF register
/// number.
int getDwarfRegNumFromDwarfEHRegNum(unsigned RegNum) const;
- /// \brief Map a target register to an equivalent SEH register
+ /// Map a target register to an equivalent SEH register
/// number. Returns LLVM register number if there is no equivalent value.
int getSEHRegNum(unsigned RegNum) const;
- /// \brief Map a target register to an equivalent CodeView register
+ /// Map a target register to an equivalent CodeView register
/// number.
int getCodeViewRegNum(unsigned RegNum) const;
@@ -434,7 +434,7 @@ public:
return (unsigned)(regclass_end()-regclass_begin());
}
- /// \brief Returns the register class associated with the enumeration
+ /// Returns the register class associated with the enumeration
/// value. See class MCOperandInfo.
const MCRegisterClass& getRegClass(unsigned i) const {
assert(i < getNumRegClasses() && "Register Class ID out of range");
@@ -445,33 +445,33 @@ public:
return RegClassStrings + Class->NameIdx;
}
- /// \brief Returns the encoding for RegNo
+ /// Returns the encoding for RegNo
uint16_t getEncodingValue(unsigned RegNo) const {
assert(RegNo < NumRegs &&
"Attempting to get encoding for invalid register number!");
return RegEncodingTable[RegNo];
}
- /// \brief Returns true if RegB is a sub-register of RegA.
+ /// Returns true if RegB is a sub-register of RegA.
bool isSubRegister(unsigned RegA, unsigned RegB) const {
return isSuperRegister(RegB, RegA);
}
- /// \brief Returns true if RegB is a super-register of RegA.
+ /// Returns true if RegB is a super-register of RegA.
bool isSuperRegister(unsigned RegA, unsigned RegB) const;
- /// \brief Returns true if RegB is a sub-register of RegA or if RegB == RegA.
+ /// Returns true if RegB is a sub-register of RegA or if RegB == RegA.
bool isSubRegisterEq(unsigned RegA, unsigned RegB) const {
return isSuperRegisterEq(RegB, RegA);
}
- /// \brief Returns true if RegB is a super-register of RegA or if
+ /// Returns true if RegB is a super-register of RegA or if
/// RegB == RegA.
bool isSuperRegisterEq(unsigned RegA, unsigned RegB) const {
return RegA == RegB || isSuperRegister(RegA, RegB);
}
- /// \brief Returns true if RegB is a super-register or sub-register of RegA
+ /// Returns true if RegB is a super-register or sub-register of RegA
/// or if RegB == RegA.
bool isSuperOrSubRegisterEq(unsigned RegA, unsigned RegB) const {
return isSubRegisterEq(RegA, RegB) || isSuperRegister(RegA, RegB);
@@ -651,17 +651,17 @@ public:
Reg1 = MCRI->RegUnitRoots[RegUnit][1];
}
- /// \brief Dereference to get the current root register.
+ /// Dereference to get the current root register.
unsigned operator*() const {
return Reg0;
}
- /// \brief Check if the iterator is at the end of the list.
+ /// Check if the iterator is at the end of the list.
bool isValid() const {
return Reg0;
}
- /// \brief Preincrement to move to the next root register.
+ /// Preincrement to move to the next root register.
void operator++() {
assert(isValid() && "Cannot move off the end of the list.");
Reg0 = Reg1;
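The accessors above can be combined to inspect sub-register structure. A small sketch, assuming a fully initialized llvm::MCRegisterInfo and target-provided register and sub-index numbers; the function name describeReg is illustrative:

    #include "llvm/MC/MCRegisterInfo.h"
    #include "llvm/Support/raw_ostream.h"

    static void describeReg(const llvm::MCRegisterInfo &MRI, unsigned Reg,
                            unsigned SubIdx) {
      llvm::outs() << MRI.getName(Reg);
      if (unsigned Sub = MRI.getSubReg(Reg, SubIdx))
        llvm::outs() << " has sub-register " << MRI.getName(Sub) << " ("
                     << MRI.getSubRegIdxSize(SubIdx) << " bits at offset "
                     << MRI.getSubRegIdxOffset(SubIdx) << ")";
      llvm::outs() << "\n";
    }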
diff --git a/contrib/llvm/include/llvm/MC/MCSchedule.h b/contrib/llvm/include/llvm/MC/MCSchedule.h
index a79afe163e6c..f2f1dfb36918 100644
--- a/contrib/llvm/include/llvm/MC/MCSchedule.h
+++ b/contrib/llvm/include/llvm/MC/MCSchedule.h
@@ -15,18 +15,22 @@
#ifndef LLVM_MC_MCSCHEDULE_H
#define LLVM_MC_MCSCHEDULE_H
+#include "llvm/ADT/Optional.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
namespace llvm {
struct InstrItinerary;
+class MCSubtargetInfo;
+class MCInstrInfo;
+class MCInst;
+class InstrItineraryData;
/// Define a kind of processor resource that will be modeled by the scheduler.
struct MCProcResourceDesc {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
const char *Name;
-#endif
unsigned NumUnits; // Number of resource of this kind
unsigned SuperIdx; // Index of the resources kind that contains this kind.
@@ -44,6 +48,11 @@ struct MCProcResourceDesc {
// an out-of-order cpus.
int BufferSize;
+ // If the resource has sub-units, a pointer to the first element of an array
+ // of `NumUnits` elements containing the ProcResourceIdx of the sub-units.
+ // nullptr if the resource does not have sub-units.
+ const unsigned *SubUnitsIdxBegin;
+
bool operator==(const MCProcResourceDesc &Other) const {
return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx
&& BufferSize == Other.BufferSize;
@@ -53,8 +62,8 @@ struct MCProcResourceDesc {
/// Identify one of the processor resource kinds consumed by a particular
/// scheduling class for the specified number of cycles.
struct MCWriteProcResEntry {
- unsigned ProcResourceIdx;
- unsigned Cycles;
+ uint16_t ProcResourceIdx;
+ uint16_t Cycles;
bool operator==(const MCWriteProcResEntry &Other) const {
return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
@@ -67,8 +76,8 @@ struct MCWriteProcResEntry {
/// the WriteResources of this def. When the operand expands to a sequence of
/// writes, this ID is the last write in the sequence.
struct MCWriteLatencyEntry {
- int Cycles;
- unsigned WriteResourceID;
+ int16_t Cycles;
+ uint16_t WriteResourceID;
bool operator==(const MCWriteLatencyEntry &Other) const {
return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
@@ -99,21 +108,21 @@ struct MCReadAdvanceEntry {
///
/// Defined as an aggregate struct for creating tables with initializer lists.
struct MCSchedClassDesc {
- static const unsigned short InvalidNumMicroOps = UINT16_MAX;
- static const unsigned short VariantNumMicroOps = UINT16_MAX - 1;
+ static const unsigned short InvalidNumMicroOps = (1U << 14) - 1;
+ static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
const char* Name;
#endif
- unsigned short NumMicroOps;
- bool BeginGroup;
- bool EndGroup;
- unsigned WriteProcResIdx; // First index into WriteProcResTable.
- unsigned NumWriteProcResEntries;
- unsigned WriteLatencyIdx; // First index into WriteLatencyTable.
- unsigned NumWriteLatencyEntries;
- unsigned ReadAdvanceIdx; // First index into ReadAdvanceTable.
- unsigned NumReadAdvanceEntries;
+ uint16_t NumMicroOps : 14;
+ bool BeginGroup : 1;
+ bool EndGroup : 1;
+ uint16_t WriteProcResIdx; // First index into WriteProcResTable.
+ uint16_t NumWriteProcResEntries;
+ uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
+ uint16_t NumWriteLatencyEntries;
+ uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable.
+ uint16_t NumReadAdvanceEntries;
bool isValid() const {
return NumMicroOps != InvalidNumMicroOps;
@@ -123,6 +132,64 @@ struct MCSchedClassDesc {
}
};
+/// Specify the cost of a register definition in terms of the number of physical
+/// registers allocated at the register renaming stage. For example, AMD Jaguar
+/// natively supports 128-bit data types, and operations on 256-bit registers
+/// (i.e. YMM registers) are internally split into two COPs (complex operations)
+/// and each COP updates a physical register. Basically, on Jaguar, a YMM
+/// register write effectively consumes two physical registers. That means,
+/// the cost of a YMM write in the BtVer2 model is 2.
+struct MCRegisterCostEntry {
+ unsigned RegisterClassID;
+ unsigned Cost;
+};
+
+/// A register file descriptor.
+///
+/// This struct allows describing processor register files. In particular, it
+/// helps describe the size of the register file, as well as the cost of
+/// allocating a register file at the register renaming stage.
+/// FIXME: this struct can be extended to provide information about the number
+/// of read/write ports to the register file. A value of zero for field
+/// 'NumPhysRegs' means: this register file has an unbounded number of physical
+/// registers.
+struct MCRegisterFileDesc {
+ const char *Name;
+ uint16_t NumPhysRegs;
+ uint16_t NumRegisterCostEntries;
+ // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
+ uint16_t RegisterCostEntryIdx;
+};
+
+/// Provide extra details about the machine processor.
+///
+/// This is a collection of "optional" processor information that is not
+/// normally used by the LLVM machine schedulers, but that can be consumed by
+/// external tools like llvm-mca to improve the quality of the performance
+/// analysis.
+struct MCExtraProcessorInfo {
+ // Actual size of the reorder buffer in hardware.
+ unsigned ReorderBufferSize;
+ // Number of instructions retired per cycle.
+ unsigned MaxRetirePerCycle;
+ const MCRegisterFileDesc *RegisterFiles;
+ unsigned NumRegisterFiles;
+ const MCRegisterCostEntry *RegisterCostTable;
+ unsigned NumRegisterCostEntries;
+
+ struct PfmCountersInfo {
+ // An optional name of a performance counter that can be used to measure
+ // cycles.
+ const char *CycleCounter;
+
+ // For each MCProcResourceDesc defined by the processor, an optional list of
+ // names of performance counters that can be used to measure the resource
+ // utilization.
+ const char **IssueCounters;
+ };
+ PfmCountersInfo PfmCounters;
+};
+
/// Machine model for scheduling, bundling, and heuristics.
///
/// The machine model directly provides basic information about the
@@ -133,9 +200,62 @@ struct MCSchedClassDesc {
/// provides a detailed reservation table describing each cycle of instruction
/// execution. Subtargets may define any or all of the above categories of data
/// depending on the type of CPU and selected scheduler.
+///
+/// The machine independent properties defined here are used by the scheduler as
+/// an abstract machine model. A real micro-architecture has a number of
+/// buffers, queues, and stages. Declaring that a given machine-independent
+/// abstract property corresponds to a specific physical property across all
+/// subtargets can't be done. Nonetheless, the abstract model is
+/// useful. Furthermore, subtargets typically extend this model with processor
+/// specific resources to model any hardware features that can be exploited by
+/// scheduling heuristics and aren't sufficiently represented in the abstract.
+///
+/// The abstract pipeline is built around the notion of an "issue point". This
+/// is merely a reference point for counting machine cycles. The physical
+/// machine will have pipeline stages that delay execution. The scheduler does
+/// not model those delays because they are irrelevant as long as they are
+/// consistent. Inaccuracies arise when instructions have different execution
+/// delays relative to each other, in addition to their intrinsic latency. Those
+/// special cases can be handled by TableGen constructs such as ReadAdvance,
+/// which reduces latency when reading data, and ResourceCycles, which consumes
+/// a processor resource when writing data for a number of abstract
+/// cycles.
+///
+/// TODO: One tool currently missing is the ability to add a delay to
+/// ResourceCycles. That would be easy to add and would likely cover all cases
+/// currently handled by the legacy itinerary tables.
+///
+/// A note on out-of-order execution and, more generally, instruction
+/// buffers. Part of the CPU pipeline is always in-order. The issue point, which
+/// is the point of reference for counting cycles, only makes sense as an
+/// in-order part of the pipeline. Other parts of the pipeline are sometimes
+/// falling behind and sometimes catching up. It's only interesting to model
+/// those other, decoupled parts of the pipeline if they may be predictably
+/// resource constrained in a way that the scheduler can exploit.
+///
+/// The LLVM machine model distinguishes between in-order constraints and
+/// out-of-order constraints so that the target's scheduling strategy can apply
+/// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order
+/// resources would not typically be treated as a hard scheduling
+/// constraint. For example, in the GenericScheduler, a delay caused by limited
+/// out-of-order resources is not directly reflected in the number of cycles
+/// that the scheduler sees between issuing an instruction and its dependent
+/// instructions. In other words, out-of-order resources don't directly increase
+/// the latency between pairs of instructions. However, they can still be used
+/// to detect potential bottlenecks across a sequence of instructions and bias
+/// the scheduling heuristics appropriately.
struct MCSchedModel {
// IssueWidth is the maximum number of instructions that may be scheduled in
- // the same per-cycle group.
+ // the same per-cycle group. This is meant to be a hard in-order constraint
+ // (a.k.a. "hazard"). In the GenericScheduler strategy, no more than
+ // IssueWidth micro-ops can ever be scheduled in a particular cycle.
+ //
+ // In practice, IssueWidth is useful to model any bottleneck between the
+ // decoder (after micro-op expansion) and the out-of-order reservation
+ // stations or the decoder bandwidth itself. If the total number of
+ // reservation stations is also a bottleneck, or if any other pipeline stage
+ // has a bandwidth limitation, then that can be naturally modeled by adding an
+ // out-of-order processor resource.
unsigned IssueWidth;
static const unsigned DefaultIssueWidth = 1;
@@ -193,11 +313,21 @@ struct MCSchedModel {
friend class InstrItineraryData;
const InstrItinerary *InstrItineraries;
+ const MCExtraProcessorInfo *ExtraProcessorInfo;
+
+ bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; }
+
unsigned getProcessorID() const { return ProcID; }
/// Does this machine model include instruction-level scheduling.
bool hasInstrSchedModel() const { return SchedClassTable; }
+ const MCExtraProcessorInfo &getExtraProcessorInfo() const {
+ assert(hasExtraProcessorInfo() &&
+ "No extra information available for this model");
+ return *ExtraProcessorInfo;
+ }
+
/// Return true if this machine model includes data for all instructions with a
/// scheduling class (itinerary class or SchedRW list).
bool isComplete() const { return CompleteModel; }
@@ -223,11 +353,31 @@ struct MCSchedModel {
return &SchedClassTable[SchedClassIdx];
}
+ /// Returns the latency value for the scheduling class.
+ static int computeInstrLatency(const MCSubtargetInfo &STI,
+ const MCSchedClassDesc &SCDesc);
+
+ int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
+ int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
+ const MCInst &Inst) const;
+
+ // Returns the reciprocal throughput information from a MCSchedClassDesc.
+ static double
+ getReciprocalThroughput(const MCSubtargetInfo &STI,
+ const MCSchedClassDesc &SCDesc);
+
+ static double
+ getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID);
+
+ double
+ getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
+ const MCInst &Inst) const;
+
/// Returns the default initialized model.
static const MCSchedModel &GetDefaultSchedModel() { return Default; }
static const MCSchedModel Default;
};
-} // End llvm namespace
+} // namespace llvm
#endif
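The new static helpers on MCSchedModel expose per-instruction latency and reciprocal throughput to tools such as llvm-mca. A sketch, assuming a live MCSubtargetInfo with a scheduling model and an already-resolved MCSchedClassDesc; the function name printSchedInfo is illustrative:

    #include "llvm/MC/MCSchedule.h"
    #include "llvm/MC/MCSubtargetInfo.h"
    #include "llvm/Support/raw_ostream.h"

    static void printSchedInfo(const llvm::MCSubtargetInfo &STI,
                               const llvm::MCSchedClassDesc &SCDesc) {
      int Latency = llvm::MCSchedModel::computeInstrLatency(STI, SCDesc);
      double RThroughput = llvm::MCSchedModel::getReciprocalThroughput(STI, SCDesc);
      llvm::outs() << "latency: " << Latency
                   << ", reciprocal throughput: " << RThroughput << "\n";
    }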
diff --git a/contrib/llvm/include/llvm/MC/MCSection.h b/contrib/llvm/include/llvm/MC/MCSection.h
index 2771b1e67eab..ba5c60d3ba58 100644
--- a/contrib/llvm/include/llvm/MC/MCSection.h
+++ b/contrib/llvm/include/llvm/MC/MCSection.h
@@ -40,7 +40,7 @@ class MCSection {
public:
enum SectionVariant { SV_COFF = 0, SV_ELF, SV_MachO, SV_Wasm };
- /// \brief Express the state of bundle locked groups while emitting code.
+ /// Express the state of bundle locked groups while emitting code.
enum BundleLockStateType {
NotBundleLocked,
BundleLocked,
@@ -65,13 +65,13 @@ private:
/// The index of this section in the layout order.
unsigned LayoutOrder;
- /// \brief Keeping track of bundle-locked state.
+ /// Keeping track of bundle-locked state.
BundleLockStateType BundleLockState = NotBundleLocked;
- /// \brief Current nesting depth of bundle_lock directives.
+ /// Current nesting depth of bundle_lock directives.
unsigned BundleLockNestingDepth = 0;
- /// \brief We've seen a bundle_lock directive but not its first instruction
+ /// We've seen a bundle_lock directive but not its first instruction
/// yet.
bool BundleGroupBeforeFirstInst : 1;
diff --git a/contrib/llvm/include/llvm/MC/MCSectionWasm.h b/contrib/llvm/include/llvm/MC/MCSectionWasm.h
index cc467ed9837a..ab4cd7b007ec 100644
--- a/contrib/llvm/include/llvm/MC/MCSectionWasm.h
+++ b/contrib/llvm/include/llvm/MC/MCSectionWasm.h
@@ -26,8 +26,6 @@ class MCSymbol;
/// This represents a section on wasm.
class MCSectionWasm final : public MCSection {
-private:
-
/// This is the name of the section. The referenced memory is owned by
/// TargetLoweringObjectFileWasm's WasmUniqueMap.
StringRef SectionName;
@@ -39,17 +37,17 @@ private:
// The offset of the MC function/data section in the wasm code/data section.
// For data relocations the offset is relative to start of the data payload
// itself and does not include the size of the section header.
- uint64_t SectionOffset;
+ uint64_t SectionOffset = 0;
- // For data sections, this is the offset of the corresponding wasm data
+ // For data sections, this is the index of the corresponding wasm data
// segment
- uint64_t MemoryOffset;
+ uint32_t SegmentIndex = 0;
friend class MCContext;
MCSectionWasm(StringRef Section, SectionKind K, const MCSymbolWasm *group,
unsigned UniqueID, MCSymbol *Begin)
: MCSection(SV_Wasm, K, Begin), SectionName(Section), UniqueID(UniqueID),
- Group(group), SectionOffset(0) {}
+ Group(group) {}
void setSectionName(StringRef Name) { SectionName = Name; }
@@ -79,8 +77,8 @@ public:
uint64_t getSectionOffset() const { return SectionOffset; }
void setSectionOffset(uint64_t Offset) { SectionOffset = Offset; }
- uint32_t getMemoryOffset() const { return MemoryOffset; }
- void setMemoryOffset(uint32_t Offset) { MemoryOffset = Offset; }
+ uint32_t getSegmentIndex() const { return SegmentIndex; }
+ void setSegmentIndex(uint32_t Index) { SegmentIndex = Index; }
static bool classof(const MCSection *S) { return S->getVariant() == SV_Wasm; }
};
diff --git a/contrib/llvm/include/llvm/MC/MCStreamer.h b/contrib/llvm/include/llvm/MC/MCStreamer.h
index 5b564e538bb2..0a5d80c6d778 100644
--- a/contrib/llvm/include/llvm/MC/MCStreamer.h
+++ b/contrib/llvm/include/llvm/MC/MCStreamer.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCDirectives.h"
@@ -23,6 +24,8 @@
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCWinEH.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MD5.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetParser.h"
#include <cassert>
@@ -167,7 +170,7 @@ private:
std::unique_ptr<AssemblerConstantPools> ConstantPools;
};
-/// \brief Streaming machine code generation interface.
+/// Streaming machine code generation interface.
///
/// This interface is intended to provide a programmatic interface that is very
/// similar to the level that an assembler .s file provides. It has callbacks
@@ -194,11 +197,11 @@ class MCStreamer {
/// closed. Otherwise, issue an error and return null.
WinEH::FrameInfo *EnsureValidWinFrameInfo(SMLoc Loc);
- /// \brief Tracks an index to represent the order a symbol was emitted in.
+ /// Tracks an index to represent the order a symbol was emitted in.
/// Zero means we did not emit that symbol.
DenseMap<const MCSymbol *, unsigned> SymbolOrdering;
- /// \brief This is stack of current and previous section values saved by
+ /// This is a stack of current and previous section values saved by
/// PushSection.
SmallVector<std::pair<MCSectionSubPair, MCSectionSubPair>, 4> SectionStack;
@@ -208,6 +211,8 @@ class MCStreamer {
/// requires.
unsigned NextWinCFIID = 0;
+ bool UseAssemblerInfoForParsing;
+
protected:
MCStreamer(MCContext &Ctx);
@@ -244,6 +249,11 @@ public:
MCContext &getContext() const { return Context; }
+ virtual MCAssembler *getAssemblerPtr() { return nullptr; }
+
+ void setUseAssemblerInfoForParsing(bool v) { UseAssemblerInfoForParsing = v; }
+ bool getUseAssemblerInfoForParsing() { return UseAssemblerInfoForParsing; }
+
MCTargetStreamer *getTargetStreamer() {
return TargetStreamer.get();
}
@@ -265,19 +275,19 @@ public:
/// \name Assembly File Formatting.
/// @{
- /// \brief Return true if this streamer supports verbose assembly and if it is
+ /// Return true if this streamer supports verbose assembly and if it is
/// enabled.
virtual bool isVerboseAsm() const { return false; }
- /// \brief Return true if this asm streamer supports emitting unformatted text
+ /// Return true if this asm streamer supports emitting unformatted text
/// to the .s file with EmitRawText.
virtual bool hasRawTextSupport() const { return false; }
- /// \brief Is the integrated assembler required for this streamer to function
+ /// Is the integrated assembler required for this streamer to function
/// correctly?
virtual bool isIntegratedAssemblerRequired() const { return false; }
- /// \brief Add a textual comment.
+ /// Add a textual comment.
///
/// Typically for comments that can be emitted to the generated .s
/// file if applicable as a QoI issue to make the output of the compiler
@@ -292,22 +302,22 @@ public:
/// with a false value.
virtual void AddComment(const Twine &T, bool EOL = true) {}
- /// \brief Return a raw_ostream that comments can be written to. Unlike
+ /// Return a raw_ostream that comments can be written to. Unlike
/// AddComment, you are required to terminate comments with \n if you use this
/// method.
virtual raw_ostream &GetCommentOS();
- /// \brief Print T and prefix it with the comment string (normally #) and
+ /// Print T and prefix it with the comment string (normally #) and
/// optionally a tab. This prints the comment immediately, not at the end of
/// the current line. It is basically a safe version of EmitRawText: since it
/// only prints comments, the object streamer ignores it instead of asserting.
virtual void emitRawComment(const Twine &T, bool TabPrefix = true);
- /// \brief Add explicit comment T. T is required to be a valid
+ /// Add explicit comment T. T is required to be a valid
/// comment in the output and does not need to be escaped.
virtual void addExplicitComment(const Twine &T);
- /// \brief Emit added explicit comments.
+ /// Emit added explicit comments.
virtual void emitExplicitComments();
/// AddBlankLine - Emit a blank line to a .s file to pretty it up.
@@ -318,7 +328,7 @@ public:
/// \name Symbol & Section Management
/// @{
- /// \brief Return the current section that the streamer is emitting code to.
+ /// Return the current section that the streamer is emitting code to.
MCSectionSubPair getCurrentSection() const {
if (!SectionStack.empty())
return SectionStack.back().first;
@@ -326,32 +336,32 @@ public:
}
MCSection *getCurrentSectionOnly() const { return getCurrentSection().first; }
- /// \brief Return the previous section that the streamer is emitting code to.
+ /// Return the previous section that the streamer is emitting code to.
MCSectionSubPair getPreviousSection() const {
if (!SectionStack.empty())
return SectionStack.back().second;
return MCSectionSubPair();
}
- /// \brief Returns an index to represent the order a symbol was emitted in.
+ /// Returns an index to represent the order a symbol was emitted in.
/// (zero if we did not emit that symbol)
unsigned GetSymbolOrder(const MCSymbol *Sym) const {
return SymbolOrdering.lookup(Sym);
}
- /// \brief Update streamer for a new active section.
+ /// Update streamer for a new active section.
///
/// This is called by PopSection and SwitchSection, if the current
/// section changes.
virtual void ChangeSection(MCSection *, const MCExpr *);
- /// \brief Save the current and previous section on the section stack.
+ /// Save the current and previous section on the section stack.
void PushSection() {
SectionStack.push_back(
std::make_pair(getCurrentSection(), getPreviousSection()));
}
- /// \brief Restore the current and previous section from the section stack.
+ /// Restore the current and previous section from the section stack.
/// Calls ChangeSection as needed.
///
/// Returns false if the stack was empty.
@@ -385,7 +395,7 @@ public:
virtual void SwitchSection(MCSection *Section,
const MCExpr *Subsection = nullptr);
- /// \brief Set the current section where code is being emitted to \p Section.
+ /// Set the current section where code is being emitted to \p Section.
/// This is required to update CurSection. This version does not call
/// ChangeSection.
void SwitchSectionNoChange(MCSection *Section,
@@ -397,18 +407,18 @@ public:
SectionStack.back().first = MCSectionSubPair(Section, Subsection);
}
- /// \brief Create the default sections and set the initial one.
+ /// Create the default sections and set the initial one.
virtual void InitSections(bool NoExecStack);
MCSymbol *endSection(MCSection *Section);
- /// \brief Sets the symbol's section.
+ /// Sets the symbol's section.
///
/// Each emitted symbol will be tracked in the ordering table,
/// so we can sort on them later.
void AssignFragment(MCSymbol *Symbol, MCFragment *Fragment);
- /// \brief Emit a label for \p Symbol into the current section.
+ /// Emit a label for \p Symbol into the current section.
///
/// This corresponds to an assembler statement such as:
/// foo:
@@ -422,17 +432,17 @@ public:
virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol);
- /// \brief Note in the output the specified \p Flag.
+ /// Note in the output the specified \p Flag.
virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
- /// \brief Emit the given list \p Options of strings as linker
+ /// Emit the given list \p Options of strings as linker
/// options into the output.
virtual void EmitLinkerOptions(ArrayRef<std::string> Kind) {}
- /// \brief Note in the output the specified region \p Kind.
+ /// Note in the output the specified region \p Kind.
virtual void EmitDataRegion(MCDataRegionType Kind) {}
- /// \brief Specify the Mach-O minimum deployment target version.
+ /// Specify the Mach-O minimum deployment target version.
virtual void EmitVersionMin(MCVersionMinType Type, unsigned Major,
unsigned Minor, unsigned Update) {}
@@ -443,11 +453,11 @@ public:
void EmitVersionForTarget(const Triple &Target);
- /// \brief Note in the output that the specified \p Func is a Thumb mode
+ /// Note in the output that the specified \p Func is a Thumb mode
/// function (ARM target only).
virtual void EmitThumbFunc(MCSymbol *Func);
- /// \brief Emit an assignment of \p Value to \p Symbol.
+ /// Emit an assignment of \p Value to \p Symbol.
///
/// This corresponds to an assembler statement such as:
/// symbol = value
@@ -460,7 +470,7 @@ public:
/// \param Value - The value for the symbol.
virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
- /// \brief Emit an weak reference from \p Alias to \p Symbol.
+ /// Emit a weak reference from \p Alias to \p Symbol.
///
/// This corresponds to an assembler statement such as:
/// .weakref alias, symbol
@@ -469,53 +479,61 @@ public:
/// \param Symbol - The symbol being aliased.
virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol);
- /// \brief Add the given \p Attribute to \p Symbol.
+ /// Add the given \p Attribute to \p Symbol.
virtual bool EmitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) = 0;
- /// \brief Set the \p DescValue for the \p Symbol.
+ /// Set the \p DescValue for the \p Symbol.
///
/// \param Symbol - The symbol to have its n_desc field set.
/// \param DescValue - The value to set into the n_desc field.
virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
- /// \brief Start emitting COFF symbol definition
+ /// Start emitting COFF symbol definition
///
/// \param Symbol - The symbol to have its External & Type fields set.
virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
- /// \brief Emit the storage class of the symbol.
+ /// Emit the storage class of the symbol.
///
/// \param StorageClass - The storage class the symbol should have.
virtual void EmitCOFFSymbolStorageClass(int StorageClass);
- /// \brief Emit the type of the symbol.
+ /// Emit the type of the symbol.
///
/// \param Type - A COFF type identifier (see COFF::SymbolType in X86COFF.h)
virtual void EmitCOFFSymbolType(int Type);
- /// \brief Marks the end of the symbol definition.
+ /// Marks the end of the symbol definition.
virtual void EndCOFFSymbolDef();
virtual void EmitCOFFSafeSEH(MCSymbol const *Symbol);
- /// \brief Emits a COFF section index.
+ /// Emits the symbol table index of a Symbol into the current section.
+ virtual void EmitCOFFSymbolIndex(MCSymbol const *Symbol);
+
+ /// Emits a COFF section index.
///
/// \param Symbol - Symbol the section number relocation should point to.
virtual void EmitCOFFSectionIndex(MCSymbol const *Symbol);
- /// \brief Emits a COFF section relative relocation.
+ /// Emits a COFF section relative relocation.
///
/// \param Symbol - Symbol the section relative relocation should point to.
virtual void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset);
- /// \brief Emit an ELF .size directive.
+ /// Emits a COFF image relative relocation.
+ ///
+ /// \param Symbol - Symbol the image relative relocation should point to.
+ virtual void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset);
+
+ /// Emit an ELF .size directive.
///
/// This corresponds to an assembler statement such as:
/// .size symbol, expression
virtual void emitELFSize(MCSymbol *Symbol, const MCExpr *Value);
- /// \brief Emit an ELF .symver directive.
+ /// Emit an ELF .symver directive.
///
/// This corresponds to an assembler statement such as:
/// .symver _start, foo@@SOME_VERSION
@@ -524,11 +542,11 @@ public:
virtual void emitELFSymverDirective(StringRef AliasName,
const MCSymbol *Aliasee);
- /// \brief Emit a Linker Optimization Hint (LOH) directive.
+ /// Emit a Linker Optimization Hint (LOH) directive.
/// \param Args - Arguments of the LOH.
virtual void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) {}
- /// \brief Emit a common symbol.
+ /// Emit a common symbol.
///
/// \param Symbol - The common symbol to emit.
/// \param Size - The size of the common symbol.
@@ -537,7 +555,7 @@ public:
virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) = 0;
- /// \brief Emit a local common (.lcomm) symbol.
+ /// Emit a local common (.lcomm) symbol.
///
/// \param Symbol - The common symbol to emit.
/// \param Size - The size of the common symbol.
@@ -545,7 +563,7 @@ public:
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment);
- /// \brief Emit the zerofill section and an optional symbol.
+ /// Emit the zerofill section and an optional symbol.
///
/// \param Section - The zerofill section to create and/or to put the symbol
/// \param Symbol - The zerofill symbol to emit, if non-NULL.
@@ -553,9 +571,10 @@ public:
/// \param ByteAlignment - The alignment of the zerofill symbol if
/// non-zero. This must be a power of 2 on some targets.
virtual void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
- uint64_t Size = 0, unsigned ByteAlignment = 0) = 0;
+ uint64_t Size = 0, unsigned ByteAlignment = 0,
+ SMLoc Loc = SMLoc()) = 0;
- /// \brief Emit a thread local bss (.tbss) symbol.
+ /// Emit a thread local bss (.tbss) symbol.
///
/// \param Section - The thread local common section.
/// \param Symbol - The thread local common symbol to emit.
@@ -569,7 +588,7 @@ public:
/// \name Generating Data
/// @{
- /// \brief Emit the bytes in \p Data into the output.
+ /// Emit the bytes in \p Data into the output.
///
/// This is used to implement assembler directives such as .byte, .ascii,
/// etc.
@@ -579,7 +598,7 @@ public:
/// method uses .byte directives instead of .ascii or .asciz for readability.
virtual void EmitBinaryData(StringRef Data);
- /// \brief Emit the expression \p Value into the output as a native
+ /// Emit the expression \p Value into the output as a native
/// integer of the given \p Size bytes.
///
/// This is used to implement assembler directives such as .word, .quad,
@@ -594,7 +613,7 @@ public:
void EmitValue(const MCExpr *Value, unsigned Size, SMLoc Loc = SMLoc());
- /// \brief Special case of EmitValue that avoids the client having
+ /// Special case of EmitValue that avoids the client having
/// to pass in a MCExpr for constant integers.
virtual void EmitIntValue(uint64_t Value, unsigned Size);
@@ -602,70 +621,66 @@ public:
virtual void EmitSLEB128Value(const MCExpr *Value);
- /// \brief Special case of EmitULEB128Value that avoids the client having to
+ /// Special case of EmitULEB128Value that avoids the client having to
/// pass in a MCExpr for constant integers.
void EmitULEB128IntValue(uint64_t Value);
- /// \brief Like EmitULEB128Value but pads the output to specific number of
- /// bytes.
- void EmitPaddedULEB128IntValue(uint64_t Value, unsigned PadTo);
-
- /// \brief Special case of EmitSLEB128Value that avoids the client having to
+ /// Special case of EmitSLEB128Value that avoids the client having to
/// pass in a MCExpr for constant integers.
void EmitSLEB128IntValue(int64_t Value);
- /// \brief Special case of EmitValue that avoids the client having to pass in
+ /// Special case of EmitValue that avoids the client having to pass in
/// a MCExpr for MCSymbols.
void EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
bool IsSectionRelative = false);
- /// \brief Emit the expression \p Value into the output as a dtprel
+ /// Emit the expression \p Value into the output as a dtprel
/// (64-bit DTP relative) value.
///
/// This is used to implement assembler directives such as .dtpreldword on
/// targets that support them.
virtual void EmitDTPRel64Value(const MCExpr *Value);
- /// \brief Emit the expression \p Value into the output as a dtprel
+ /// Emit the expression \p Value into the output as a dtprel
/// (32-bit DTP relative) value.
///
/// This is used to implement assembler directives such as .dtprelword on
/// targets that support them.
virtual void EmitDTPRel32Value(const MCExpr *Value);
- /// \brief Emit the expression \p Value into the output as a tprel
+ /// Emit the expression \p Value into the output as a tprel
/// (64-bit TP relative) value.
///
/// This is used to implement assembler directives such as .tpreldword on
/// targets that support them.
virtual void EmitTPRel64Value(const MCExpr *Value);
- /// \brief Emit the expression \p Value into the output as a tprel
+ /// Emit the expression \p Value into the output as a tprel
/// (32-bit TP relative) value.
///
/// This is used to implement assembler directives such as .tprelword on
/// targets that support them.
virtual void EmitTPRel32Value(const MCExpr *Value);
- /// \brief Emit the expression \p Value into the output as a gprel64 (64-bit
+ /// Emit the expression \p Value into the output as a gprel64 (64-bit
/// GP relative) value.
///
/// This is used to implement assembler directives such as .gpdword on
/// targets that support them.
virtual void EmitGPRel64Value(const MCExpr *Value);
- /// \brief Emit the expression \p Value into the output as a gprel32 (32-bit
+ /// Emit the expression \p Value into the output as a gprel32 (32-bit
/// GP relative) value.
///
/// This is used to implement assembler directives such as .gprel32 on
/// targets that support them.
virtual void EmitGPRel32Value(const MCExpr *Value);
- /// \brief Emit NumBytes bytes worth of the value specified by FillValue.
+ /// Emit NumBytes bytes worth of the value specified by FillValue.
/// This implements directives such as '.space'.
void emitFill(uint64_t NumBytes, uint8_t FillValue);
- /// \brief Emit \p Size bytes worth of the value specified by \p FillValue.
+ /// Emit \p Size bytes worth of the value specified by \p FillValue.
///
/// This is used to implement assembler directives such as .space or .skip.
///
@@ -675,7 +690,7 @@ public:
virtual void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc = SMLoc());
- /// \brief Emit \p NumValues copies of \p Size bytes. Each \p Size bytes is
+ /// Emit \p NumValues copies of \p Size bytes. Each \p Size bytes is
/// taken from the lowest order 4 bytes of \p Expr expression.
///
/// This is used to implement assembler directives such as .fill.
@@ -683,15 +698,14 @@ public:
/// \param NumValues - The number of copies of \p Size bytes to emit.
/// \param Size - The size (in bytes) of each repeated value.
/// \param Expr - The expression from which \p Size bytes are used.
- virtual void emitFill(uint64_t NumValues, int64_t Size, int64_t Expr);
virtual void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
SMLoc Loc = SMLoc());
- /// \brief Emit NumBytes worth of zeros.
+ /// Emit NumBytes worth of zeros.
/// This function properly handles data in virtual sections.
void EmitZeros(uint64_t NumBytes);
- /// \brief Emit some number of copies of \p Value until the byte alignment \p
+ /// Emit some number of copies of \p Value until the byte alignment \p
/// ByteAlignment is reached.
///
/// If the number of bytes needed to emit for the alignment is not a multiple
@@ -712,7 +726,7 @@ public:
unsigned ValueSize = 1,
unsigned MaxBytesToEmit = 0);
- /// \brief Emit nops until the byte alignment \p ByteAlignment is reached.
+ /// Emit nops until the byte alignment \p ByteAlignment is reached.
///
/// This is used to align code where the alignment bytes may be executed. This
/// can emit different bytes for different sizes to optimize execution.
@@ -725,7 +739,7 @@ public:
virtual void EmitCodeAlignment(unsigned ByteAlignment,
unsigned MaxBytesToEmit = 0);
- /// \brief Emit some number of copies of \p Value until the byte offset \p
+ /// Emit some number of copies of \p Value until the byte offset \p
/// Offset is reached.
///
/// This is used to implement assembler directives such as .org.
@@ -744,21 +758,43 @@ public:
/// @}
- /// \brief Switch to a new logical file. This is used to implement the '.file
+ /// Switch to a new logical file. This is used to implement the '.file
/// "foo.c"' assembler directive.
virtual void EmitFileDirective(StringRef Filename);
- /// \brief Emit the "identifiers" directive. This implements the
+ /// Emit the "identifiers" directive. This implements the
/// '.ident "version foo"' assembler directive.
virtual void EmitIdent(StringRef IdentString) {}
- /// \brief Associate a filename with a specified logical file number. This
+ /// Associate a filename with a specified logical file number. This
/// implements the DWARF2 '.file 4 "foo.c"' assembler directive.
- virtual unsigned EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
- StringRef Filename,
- unsigned CUID = 0);
+ unsigned EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+ StringRef Filename,
+ MD5::MD5Result *Checksum = nullptr,
+ Optional<StringRef> Source = None,
+ unsigned CUID = 0) {
+ return cantFail(
+ tryEmitDwarfFileDirective(FileNo, Directory, Filename, Checksum,
+ Source, CUID));
+ }
- /// \brief This implements the DWARF2 '.loc fileno lineno ...' assembler
+ /// Associate a filename with a specified logical file number.
+ /// Also associate a directory, optional checksum, and optional source
+ /// text with the logical file. This implements the DWARF2
+ /// '.file 4 "dir/foo.c"' assembler directive, and the DWARF5
+ /// '.file 4 "dir/foo.c" md5 "..." source "..."' assembler directive.
+ virtual Expected<unsigned> tryEmitDwarfFileDirective(
+ unsigned FileNo, StringRef Directory, StringRef Filename,
+ MD5::MD5Result *Checksum = nullptr, Optional<StringRef> Source = None,
+ unsigned CUID = 0);
+
+ /// Specify the "root" file of the compilation, using the ".file 0" extension.
+ virtual void emitDwarfFile0Directive(StringRef Directory, StringRef Filename,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned CUID = 0);
+
+ /// This implements the DWARF2 '.loc fileno lineno ...' assembler
/// directive.
virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
@@ -772,27 +808,27 @@ public:
ArrayRef<uint8_t> Checksum,
unsigned ChecksumKind);
- /// \brief Introduces a function id for use with .cv_loc.
+ /// Introduces a function id for use with .cv_loc.
virtual bool EmitCVFuncIdDirective(unsigned FunctionId);
- /// \brief Introduces an inline call site id for use with .cv_loc. Includes
+ /// Introduces an inline call site id for use with .cv_loc. Includes
/// extra information for inline line table generation.
virtual bool EmitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc,
unsigned IAFile, unsigned IALine,
unsigned IACol, SMLoc Loc);
- /// \brief This implements the CodeView '.cv_loc' assembler directive.
+ /// This implements the CodeView '.cv_loc' assembler directive.
virtual void EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
unsigned Line, unsigned Column,
bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc);
- /// \brief This implements the CodeView '.cv_linetable' assembler directive.
+ /// This implements the CodeView '.cv_linetable' assembler directive.
virtual void EmitCVLinetableDirective(unsigned FunctionId,
const MCSymbol *FnStart,
const MCSymbol *FnEnd);
- /// \brief This implements the CodeView '.cv_inline_linetable' assembler
+ /// This implements the CodeView '.cv_inline_linetable' assembler
/// directive.
virtual void EmitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
unsigned SourceFileId,
@@ -800,16 +836,16 @@ public:
const MCSymbol *FnStartSym,
const MCSymbol *FnEndSym);
- /// \brief This implements the CodeView '.cv_def_range' assembler
+ /// This implements the CodeView '.cv_def_range' assembler
/// directive.
virtual void EmitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
StringRef FixedSizePortion);
- /// \brief This implements the CodeView '.cv_stringtable' assembler directive.
+ /// This implements the CodeView '.cv_stringtable' assembler directive.
virtual void EmitCVStringTableDirective() {}
- /// \brief This implements the CodeView '.cv_filechecksums' assembler directive.
+ /// This implements the CodeView '.cv_filechecksums' assembler directive.
virtual void EmitCVFileChecksumsDirective() {}
/// This implements the CodeView '.cv_filechecksumoffset' assembler
@@ -825,6 +861,10 @@ public:
virtual void emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
unsigned Size);
+ /// Emit the absolute difference between two symbols encoded with ULEB128.
+ virtual void emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi,
+ const MCSymbol *Lo);
+
virtual MCSymbol *getDwarfLineTableSymbol(unsigned CUID);
virtual void EmitCFISections(bool EH, bool Debug);
void EmitCFIStartProc(bool IsSimple);
@@ -867,6 +907,9 @@ public:
SMLoc Loc = SMLoc());
virtual void EmitWinEHHandlerData(SMLoc Loc = SMLoc());
+ virtual void emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To, uint64_t Count);
+
/// Get the .pdata section used for the given section. Typically the given
/// section is either the main .text section or some other COMDAT .text
/// section, but it may be any section containing code.
@@ -877,41 +920,45 @@ public:
virtual void EmitSyntaxDirective();
- /// \brief Emit a .reloc directive.
+ /// Emit a .reloc directive.
/// Returns true if the relocation could not be emitted because Name is not
/// known.
virtual bool EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc Loc) {
+ const MCExpr *Expr, SMLoc Loc,
+ const MCSubtargetInfo &STI) {
return true;
}
- /// \brief Emit the given \p Instruction into the current section.
+ virtual void EmitAddrsig() {}
+ virtual void EmitAddrsigSym(const MCSymbol *Sym) {}
+
+ /// Emit the given \p Instruction into the current section.
/// If PrintSchedInfo == true then a schedule comment should be added to output
virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
bool PrintSchedInfo = false);
- /// \brief Set the bundle alignment mode from now on in the section.
+ /// Set the bundle alignment mode from now on in the section.
/// The argument is the power of 2 to which the alignment is set. The
/// value 0 means turn the bundle alignment off.
virtual void EmitBundleAlignMode(unsigned AlignPow2);
- /// \brief The following instructions are a bundle-locked group.
+ /// The following instructions are a bundle-locked group.
///
/// \param AlignToEnd - If true, the bundle-locked group will be aligned to
/// the end of a bundle.
virtual void EmitBundleLock(bool AlignToEnd);
- /// \brief Ends a bundle-locked group.
+ /// Ends a bundle-locked group.
virtual void EmitBundleUnlock();
- /// \brief If this file is backed by a assembly streamer, this dumps the
+ /// If this file is backed by an assembly streamer, this dumps the
/// specified string in the output .s file. This capability is indicated by
/// the hasRawTextSupport() predicate. By default this aborts.
void EmitRawText(const Twine &String);
- /// \brief Streamer specific finalization.
+ /// Streamer specific finalization.
virtual void FinishImpl();
- /// \brief Finish emission of machine code.
+ /// Finish emission of machine code.
void Finish();
virtual bool mayHaveInstructions(MCSection &Sec) const { return true; }
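tryEmitDwarfFileDirective makes DWARF file registration failable, while the inline EmitDwarfFileDirective wrapper keeps the old infallible signature via cantFail. A sketch of the failable path, assuming a live MCStreamer; the directory and file names are placeholders:

    #include "llvm/MC/MCStreamer.h"
    #include "llvm/Support/Error.h"

    static void registerDwarfFile(llvm::MCStreamer &Streamer) {
      llvm::Expected<unsigned> FileNoOrErr = Streamer.tryEmitDwarfFileDirective(
          /*FileNo=*/1, "src", "foo.c", /*Checksum=*/nullptr,
          /*Source=*/llvm::None, /*CUID=*/0);
      if (!FileNoOrErr) {
        llvm::consumeError(FileNoOrErr.takeError()); // or surface as a diagnostic
        return;
      }
      unsigned FileNo = *FileNoOrErr; // use in subsequent .loc directives
      (void)FileNo;
    }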
diff --git a/contrib/llvm/include/llvm/MC/MCSubtargetInfo.h b/contrib/llvm/include/llvm/MC/MCSubtargetInfo.h
index dd10881b73a8..b3ce523d9c0c 100644
--- a/contrib/llvm/include/llvm/MC/MCSubtargetInfo.h
+++ b/contrib/llvm/include/llvm/MC/MCSubtargetInfo.h
@@ -27,15 +27,14 @@
namespace llvm {
-class MachineInstr;
class MCInst;
//===----------------------------------------------------------------------===//
///
-/// MCSubtargetInfo - Generic base class for all target subtargets.
+/// Generic base class for all target subtargets.
///
class MCSubtargetInfo {
- Triple TargetTriple; // Target triple
+ Triple TargetTriple;
std::string CPU; // CPU being targeted.
ArrayRef<SubtargetFeatureKV> ProcFeatures; // Processor feature list
ArrayRef<SubtargetFeatureKV> ProcDesc; // Processor descriptions
@@ -49,7 +48,7 @@ class MCSubtargetInfo {
const InstrStage *Stages; // Instruction itinerary stages
const unsigned *OperandCycles; // Itinerary operand cycles
- const unsigned *ForwardingPaths; // Forwarding paths
+ const unsigned *ForwardingPaths;
FeatureBitset FeatureBits; // Feature bits for current CPU + FS
public:
@@ -66,22 +65,10 @@ public:
MCSubtargetInfo &operator=(MCSubtargetInfo &&) = delete;
virtual ~MCSubtargetInfo() = default;
- /// getTargetTriple - Return the target triple string.
const Triple &getTargetTriple() const { return TargetTriple; }
+ StringRef getCPU() const { return CPU; }
- /// getCPU - Return the CPU string.
- StringRef getCPU() const {
- return CPU;
- }
-
- /// getFeatureBits - Return the feature bits.
- ///
- const FeatureBitset& getFeatureBits() const {
- return FeatureBits;
- }
-
- /// setFeatureBits - Set the feature bits.
- ///
+ const FeatureBitset& getFeatureBits() const { return FeatureBits; }
void setFeatureBits(const FeatureBitset &FeatureBits_) {
FeatureBits = FeatureBits_;
}
@@ -102,16 +89,16 @@ public:
/// string.
void setDefaultFeatures(StringRef CPU, StringRef FS);
- /// ToggleFeature - Toggle a feature and returns the re-computed feature
- /// bits. This version does not change the implied bits.
+ /// Toggle a feature and return the re-computed feature bits.
+ /// This version does not change the implied bits.
FeatureBitset ToggleFeature(uint64_t FB);
- /// ToggleFeature - Toggle a feature and returns the re-computed feature
- /// bits. This version does not change the implied bits.
+ /// Toggle a feature and return the re-computed feature bits.
+ /// This version does not change the implied bits.
FeatureBitset ToggleFeature(const FeatureBitset& FB);
- /// ToggleFeature - Toggle a set of features and returns the re-computed
- /// feature bits. This version will also change all implied bits.
+ /// Toggle a set of features and return the re-computed feature bits.
+ /// This version will also change all implied bits.
FeatureBitset ToggleFeature(StringRef FS);
/// Apply a feature flag and return the re-computed feature bits, including
@@ -122,8 +109,7 @@ public:
/// the provided string, ignoring all other features.
bool checkFeatures(StringRef FS) const;
- /// getSchedModelForCPU - Get the machine model of a CPU.
- ///
+ /// Get the machine model of a CPU.
const MCSchedModel &getSchedModelForCPU(StringRef CPU) const;
/// Get the machine model for this subtarget's CPU.
@@ -167,13 +153,19 @@ public:
return 0;
}
- /// getInstrItineraryForCPU - Get scheduling itinerary of a CPU.
- ///
+ /// Get scheduling itinerary of a CPU.
InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
/// Initialize an InstrItineraryData instance.
void initInstrItins(InstrItineraryData &InstrItins) const;
+ /// Resolve a variant scheduling class for the given MCInst and CPU.
+ virtual unsigned
+ resolveVariantSchedClass(unsigned SchedClass, const MCInst *MI,
+ unsigned CPUID) const {
+ return 0;
+ }
+
/// Check whether the CPU string is valid.
bool isCPUStringValid(StringRef CPU) const {
auto Found = std::lower_bound(ProcDesc.begin(), ProcDesc.end(), CPU);
@@ -181,10 +173,6 @@ public:
}
/// Returns a string representation of the scheduler comment
- virtual std::string getSchedInfoStr(const MachineInstr &MI) const {
- return {};
- }
-
virtual std::string getSchedInfoStr(MCInst const &MCI) const {
return {};
}
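A rough sketch of the trimmed MCSubtargetInfo surface above, assuming an existing MCSubtargetInfo &STI, an MCInst MI, a scheduling class SchedClass and a CPUID (the feature string is illustrative only):

  FeatureBitset Bits = STI.ToggleFeature("+sse4.2"); // toggle by name; implied bits also change
  bool Has = STI.checkFeatures("+sse4.2");           // query just this feature
  // Resolve a variant scheduling class; the base implementation returns 0.
  unsigned SC = STI.resolveVariantSchedClass(SchedClass, &MI, CPUID);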
diff --git a/contrib/llvm/include/llvm/MC/MCSymbol.h b/contrib/llvm/include/llvm/MC/MCSymbol.h
index 9b1cc6e7d7e8..4681a1be60c4 100644
--- a/contrib/llvm/include/llvm/MC/MCSymbol.h
+++ b/contrib/llvm/include/llvm/MC/MCSymbol.h
@@ -85,7 +85,7 @@ protected:
/// "Lfoo" or ".foo".
unsigned IsTemporary : 1;
- /// \brief True if this symbol can be redefined.
+ /// True if this symbol can be redefined.
unsigned IsRedefinable : 1;
/// IsUsed - True if this symbol has been used.
@@ -141,7 +141,7 @@ protected:
friend class MCExpr;
friend class MCContext;
- /// \brief The name for a symbol.
+ /// The name for a symbol.
/// MCSymbol contains a uint64_t so is probably aligned to 8. On a 32-bit
/// system, the name is a pointer so isn't going to satisfy the 8 byte
/// alignment of uint64_t. Account for that here.
@@ -168,24 +168,24 @@ protected:
private:
void operator delete(void *);
- /// \brief Placement delete - required by std, but never called.
+ /// Placement delete - required by std, but never called.
void operator delete(void*, unsigned) {
llvm_unreachable("Constructor throws?");
}
- /// \brief Placement delete - required by std, but never called.
+ /// Placement delete - required by std, but never called.
void operator delete(void*, unsigned, bool) {
llvm_unreachable("Constructor throws?");
}
- MCSection *getSectionPtr(bool SetUsed = true) const {
- if (MCFragment *F = getFragment(SetUsed)) {
+ MCSection *getSectionPtr() const {
+ if (MCFragment *F = getFragment()) {
assert(F != AbsolutePseudoFragment);
return F->getParent();
}
return nullptr;
}
- /// \brief Get a reference to the name field. Requires that we have a name
+ /// Get a reference to the name field. Requires that we have a name
const StringMapEntry<bool> *&getNameEntryPtr() {
assert(FragmentAndHasName.getInt() && "Name is required");
NameEntryStorageTy *Name = reinterpret_cast<NameEntryStorageTy *>(this);
@@ -221,13 +221,12 @@ public:
/// isUsed - Check if this is used.
bool isUsed() const { return IsUsed; }
- void setUsed(bool Value) const { IsUsed |= Value; }
- /// \brief Check if this symbol is redefinable.
+ /// Check if this symbol is redefinable.
bool isRedefinable() const { return IsRedefinable; }
- /// \brief Mark this symbol as redefinable.
+ /// Mark this symbol as redefinable.
void setRedefinable(bool Value) { IsRedefinable = Value; }
- /// \brief Prepare this symbol to be redefined.
+ /// Prepare this symbol to be redefined.
void redefineIfPossible() {
if (IsRedefinable) {
if (SymbolContents == SymContentsVariable) {
@@ -246,28 +245,28 @@ public:
/// isDefined - Check if this symbol is defined (i.e., it has an address).
///
/// Defined symbols are either absolute or in some section.
- bool isDefined(bool SetUsed = true) const {
- return getFragment(SetUsed) != nullptr;
- }
+ bool isDefined() const { return !isUndefined(); }
/// isInSection - Check if this symbol is defined in some section (i.e., it
/// is defined but not absolute).
- bool isInSection(bool SetUsed = true) const {
- return isDefined(SetUsed) && !isAbsolute(SetUsed);
+ bool isInSection() const {
+ return isDefined() && !isAbsolute();
}
/// isUndefined - Check if this symbol is undefined (i.e., implicitly defined).
- bool isUndefined(bool SetUsed = true) const { return !isDefined(SetUsed); }
+ bool isUndefined(bool SetUsed = true) const {
+ return getFragment(SetUsed) == nullptr;
+ }
/// isAbsolute - Check if this is an absolute symbol.
- bool isAbsolute(bool SetUsed = true) const {
- return getFragment(SetUsed) == AbsolutePseudoFragment;
+ bool isAbsolute() const {
+ return getFragment() == AbsolutePseudoFragment;
}
/// Get the section associated with a defined, non-absolute symbol.
- MCSection &getSection(bool SetUsed = true) const {
- assert(isInSection(SetUsed) && "Invalid accessor!");
- return *getSectionPtr(SetUsed);
+ MCSection &getSection() const {
+ assert(isInSection() && "Invalid accessor!");
+ return *getSectionPtr();
}
/// Mark the symbol as defined in the fragment \p F.
@@ -317,6 +316,8 @@ public:
Index = Value;
}
+ bool isUnset() const { return SymbolContents == SymContentsUnset; }
+
uint64_t getOffset() const {
assert((SymbolContents == SymContentsUnset ||
SymbolContents == SymContentsOffset) &&
diff --git a/contrib/llvm/include/llvm/MC/MCSymbolMachO.h b/contrib/llvm/include/llvm/MC/MCSymbolMachO.h
index 25220e4a8109..6125c2050976 100644
--- a/contrib/llvm/include/llvm/MC/MCSymbolMachO.h
+++ b/contrib/llvm/include/llvm/MC/MCSymbolMachO.h
@@ -14,7 +14,7 @@
namespace llvm {
class MCSymbolMachO : public MCSymbol {
- /// \brief We store the value for the 'desc' symbol field in the
+ /// We store the value for the 'desc' symbol field in the
/// lowest 16 bits of the implementation defined flags.
enum MachOSymbolFlags : uint16_t { // See <mach-o/nlist.h>.
SF_DescFlagsMask = 0xFFFF,
@@ -104,7 +104,7 @@ public:
setFlags(Value & SF_DescFlagsMask);
}
- /// \brief Get the encoded value of the flags as they will be emitted in to
+ /// Get the encoded value of the flags as they will be emitted into
/// the MachO binary
uint16_t getEncodedFlags(bool EncodeAsAltEntry) const {
uint16_t Flags = getFlags();
diff --git a/contrib/llvm/include/llvm/MC/MCSymbolWasm.h b/contrib/llvm/include/llvm/MC/MCSymbolWasm.h
index 309ebf96d1b0..e043453dc732 100644
--- a/contrib/llvm/include/llvm/MC/MCSymbolWasm.h
+++ b/contrib/llvm/include/llvm/MC/MCSymbolWasm.h
@@ -15,15 +15,17 @@
namespace llvm {
class MCSymbolWasm : public MCSymbol {
-private:
- bool IsFunction = false;
+ wasm::WasmSymbolType Type = wasm::WASM_SYMBOL_TYPE_DATA;
bool IsWeak = false;
bool IsHidden = false;
+ bool IsComdat = false;
std::string ModuleName;
SmallVector<wasm::ValType, 1> Returns;
SmallVector<wasm::ValType, 4> Params;
+ wasm::WasmGlobalType GlobalType;
bool ParamsSet = false;
bool ReturnsSet = false;
+ bool GlobalTypeSet = false;
/// An expression describing how to calculate the size of a symbol. If a
/// symbol has no size this field will be NULL.
@@ -40,8 +42,12 @@ public:
const MCExpr *getSize() const { return SymbolSize; }
void setSize(const MCExpr *SS) { SymbolSize = SS; }
- bool isFunction() const { return IsFunction; }
- void setIsFunction(bool isFunc) { IsFunction = isFunc; }
+ bool isFunction() const { return Type == wasm::WASM_SYMBOL_TYPE_FUNCTION; }
+ bool isData() const { return Type == wasm::WASM_SYMBOL_TYPE_DATA; }
+ bool isGlobal() const { return Type == wasm::WASM_SYMBOL_TYPE_GLOBAL; }
+ bool isSection() const { return Type == wasm::WASM_SYMBOL_TYPE_SECTION; }
+ wasm::WasmSymbolType getType() const { return Type; }
+ void setType(wasm::WasmSymbolType type) { Type = type; }
bool isWeak() const { return IsWeak; }
void setWeak(bool isWeak) { IsWeak = isWeak; }
@@ -49,7 +55,11 @@ public:
bool isHidden() const { return IsHidden; }
void setHidden(bool isHidden) { IsHidden = isHidden; }
+ bool isComdat() const { return IsComdat; }
+ void setComdat(bool isComdat) { IsComdat = isComdat; }
+
const StringRef getModuleName() const { return ModuleName; }
+ void setModuleName(StringRef Name) { ModuleName = Name; }
const SmallVector<wasm::ValType, 1> &getReturns() const {
assert(ReturnsSet);
@@ -70,6 +80,16 @@ public:
ParamsSet = true;
Params = std::move(Pars);
}
+
+ const wasm::WasmGlobalType &getGlobalType() const {
+ assert(GlobalTypeSet);
+ return GlobalType;
+ }
+
+ void setGlobalType(wasm::WasmGlobalType GT) {
+ GlobalTypeSet = true;
+ GlobalType = GT;
+ }
};
} // end namespace llvm
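MCSymbolWasm now records an explicit wasm symbol type, a comdat flag and an optional global type instead of the old IsFunction bit. A hedged sketch of filling in a function symbol (Sym is assumed to be an MCSymbolWasm*; the ValType values are illustrative):

  Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
  Sym->setParams({wasm::ValType::I32, wasm::ValType::I32});
  Sym->setReturns({wasm::ValType::I32});
  Sym->setComdat(true);
  assert(Sym->isFunction() && !Sym->isData());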
diff --git a/contrib/llvm/include/llvm/MC/MCTargetOptions.h b/contrib/llvm/include/llvm/MC/MCTargetOptions.h
index 5509bb3bdc7c..f5d330fbeb22 100644
--- a/contrib/llvm/include/llvm/MC/MCTargetOptions.h
+++ b/contrib/llvm/include/llvm/MC/MCTargetOptions.h
@@ -21,6 +21,7 @@ enum class ExceptionHandling {
SjLj, /// setjmp/longjmp based exceptions
ARM, /// ARM EHABI
WinEH, /// Windows Exception Handling
+ Wasm, /// WebAssembly Exception Handling
};
enum class DebugCompressionType {
diff --git a/contrib/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.def b/contrib/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.inc
index 5172fa44511f..5172fa44511f 100644
--- a/contrib/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.def
+++ b/contrib/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.inc
diff --git a/contrib/llvm/include/llvm/MC/MCValue.h b/contrib/llvm/include/llvm/MC/MCValue.h
index ff223f70303b..11f5082ed3f4 100644
--- a/contrib/llvm/include/llvm/MC/MCValue.h
+++ b/contrib/llvm/include/llvm/MC/MCValue.h
@@ -23,7 +23,7 @@ namespace llvm {
class MCAsmInfo;
class raw_ostream;
-/// \brief This represents an "assembler immediate".
+/// This represents an "assembler immediate".
///
/// In its most general form, this can hold ":Kind:(SymbolA - SymbolB +
/// imm64)". Not all targets supports relocations of this general form, but we
@@ -49,13 +49,13 @@ public:
const MCSymbolRefExpr *getSymB() const { return SymB; }
uint32_t getRefKind() const { return RefKind; }
- /// \brief Is this an absolute (as opposed to relocatable) value.
+ /// Is this an absolute (as opposed to relocatable) value.
bool isAbsolute() const { return !SymA && !SymB; }
- /// \brief Print the value to the stream \p OS.
+ /// Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
- /// \brief Print the value to stderr.
+ /// Print the value to stderr.
void dump() const;
MCSymbolRefExpr::VariantKind getAccessVariant() const;
diff --git a/contrib/llvm/include/llvm/MC/MCWasmObjectWriter.h b/contrib/llvm/include/llvm/MC/MCWasmObjectWriter.h
index a4d5eb857b39..e45030f302ff 100644
--- a/contrib/llvm/include/llvm/MC/MCWasmObjectWriter.h
+++ b/contrib/llvm/include/llvm/MC/MCWasmObjectWriter.h
@@ -10,18 +10,16 @@
#ifndef LLVM_MC_MCWASMOBJECTWRITER_H
#define LLVM_MC_MCWASMOBJECTWRITER_H
-#include "llvm/ADT/Triple.h"
-#include "llvm/BinaryFormat/Wasm.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include <memory>
namespace llvm {
class MCFixup;
-class MCObjectWriter;
class MCValue;
class raw_pwrite_stream;
-class MCWasmObjectTargetWriter {
+class MCWasmObjectTargetWriter : public MCObjectTargetWriter {
const unsigned Is64Bit : 1;
protected:
@@ -30,6 +28,11 @@ protected:
public:
virtual ~MCWasmObjectTargetWriter();
+ virtual Triple::ObjectFormatType getFormat() const { return Triple::Wasm; }
+ static bool classof(const MCObjectTargetWriter *W) {
+ return W->getFormat() == Triple::Wasm;
+ }
+
virtual unsigned getRelocType(const MCValue &Target,
const MCFixup &Fixup) const = 0;
@@ -39,7 +42,7 @@ public:
/// @}
};
-/// \brief Construct a new Wasm writer instance.
+/// Construct a new Wasm writer instance.
///
/// \param MOTW - The target specific Wasm writer subclass.
/// \param OS - The stream to write to.
diff --git a/contrib/llvm/include/llvm/MC/MCWasmStreamer.h b/contrib/llvm/include/llvm/MC/MCWasmStreamer.h
index c0d45451a9ab..01e6a4379287 100644
--- a/contrib/llvm/include/llvm/MC/MCWasmStreamer.h
+++ b/contrib/llvm/include/llvm/MC/MCWasmStreamer.h
@@ -15,6 +15,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/DataTypes.h"
@@ -27,8 +28,10 @@ class raw_ostream;
class MCWasmStreamer : public MCObjectStreamer {
public:
MCWasmStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter)
- : MCObjectStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCObjectStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
SeenIdent(false) {}
~MCWasmStreamer() override;
@@ -57,7 +60,8 @@ public:
unsigned ByteAlignment) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
- uint64_t Size = 0, unsigned ByteAlignment = 0) override;
+ uint64_t Size = 0, unsigned ByteAlignment = 0,
+ SMLoc Loc = SMLoc()) override;
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
void EmitValueImpl(const MCExpr *Value, unsigned Size,
@@ -73,7 +77,7 @@ private:
void EmitInstToFragment(const MCInst &Inst, const MCSubtargetInfo &) override;
void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &) override;
- /// \brief Merge the content of the fragment \p EF into the fragment \p DF.
+ /// Merge the content of the fragment \p EF into the fragment \p DF.
void mergeFragment(MCDataFragment *, MCDataFragment *);
bool SeenIdent;
diff --git a/contrib/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/contrib/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
index 3234bd93cad0..c1d35ea1f6ba 100644
--- a/contrib/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
+++ b/contrib/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
@@ -10,6 +10,7 @@
#ifndef LLVM_MC_MCWINCOFFOBJECTWRITER_H
#define LLVM_MC_MCWINCOFFOBJECTWRITER_H
+#include "llvm/MC/MCObjectWriter.h"
#include <memory>
namespace llvm {
@@ -17,11 +18,10 @@ namespace llvm {
class MCAsmBackend;
class MCContext;
class MCFixup;
-class MCObjectWriter;
class MCValue;
class raw_pwrite_stream;
- class MCWinCOFFObjectTargetWriter {
+ class MCWinCOFFObjectTargetWriter : public MCObjectTargetWriter {
virtual void anchor();
const unsigned Machine;
@@ -32,6 +32,11 @@ class raw_pwrite_stream;
public:
virtual ~MCWinCOFFObjectTargetWriter() = default;
+ virtual Triple::ObjectFormatType getFormat() const { return Triple::COFF; }
+ static bool classof(const MCObjectTargetWriter *W) {
+ return W->getFormat() == Triple::COFF;
+ }
+
unsigned getMachine() const { return Machine; }
virtual unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsCrossSection,
@@ -39,7 +44,7 @@ class raw_pwrite_stream;
virtual bool recordRelocation(const MCFixup &) const { return true; }
};
- /// \brief Construct a new Win COFF writer instance.
+ /// Construct a new Win COFF writer instance.
///
/// \param MOTW - The target specific WinCOFF writer subclass.
/// \param OS - The stream to write to.
diff --git a/contrib/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/contrib/llvm/include/llvm/MC/MCWinCOFFStreamer.h
index a2500c06efa1..0049d04b4b3f 100644
--- a/contrib/llvm/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/contrib/llvm/include/llvm/MC/MCWinCOFFStreamer.h
@@ -28,7 +28,8 @@ class raw_pwrite_stream;
class MCWinCOFFStreamer : public MCObjectStreamer {
public:
MCWinCOFFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS);
+ std::unique_ptr<MCCodeEmitter> CE,
+ std::unique_ptr<MCObjectWriter> OW);
/// state management
void reset() override {
@@ -50,14 +51,16 @@ public:
void EmitCOFFSymbolType(int Type) override;
void EndCOFFSymbolDef() override;
void EmitCOFFSafeSEH(MCSymbol const *Symbol) override;
+ void EmitCOFFSymbolIndex(MCSymbol const *Symbol) override;
void EmitCOFFSectionIndex(MCSymbol const *Symbol) override;
void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
+ void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) override;
+ unsigned ByteAlignment, SMLoc Loc = SMLoc()) override;
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitIdent(StringRef IdentString) override;
diff --git a/contrib/llvm/include/llvm/MC/StringTableBuilder.h b/contrib/llvm/include/llvm/MC/StringTableBuilder.h
index 0df3fcd63723..265260fcee4d 100644
--- a/contrib/llvm/include/llvm/MC/StringTableBuilder.h
+++ b/contrib/llvm/include/llvm/MC/StringTableBuilder.h
@@ -20,10 +20,10 @@ namespace llvm {
class raw_ostream;
-/// \brief Utility for building string tables with deduplicated suffixes.
+/// Utility for building string tables with deduplicated suffixes.
class StringTableBuilder {
public:
- enum Kind { ELF, WinCOFF, MachO, RAW };
+ enum Kind { ELF, WinCOFF, MachO, RAW, DWARF };
private:
DenseMap<CachedHashStringRef, size_t> StringIndexMap;
@@ -39,13 +39,13 @@ public:
StringTableBuilder(Kind K, unsigned Alignment = 1);
~StringTableBuilder();
- /// \brief Add a string to the builder. Returns the position of S in the
+ /// Add a string to the builder. Returns the position of S in the
/// table. The position will be changed if finalize is used.
/// Can only be used before the table is finalized.
size_t add(CachedHashStringRef S);
size_t add(StringRef S) { return add(CachedHashStringRef(S)); }
- /// \brief Analyze the strings and build the final table. No more strings can
+ /// Analyze the strings and build the final table. No more strings can
/// be added after this point.
void finalize();
@@ -53,7 +53,7 @@ public:
/// returned by add will still be valid.
void finalizeInOrder();
- /// \brief Get the offest of a string in the string table. Can only be used
+ /// Get the offset of a string in the string table. Can only be used
/// after the table is finalized.
size_t getOffset(CachedHashStringRef S) const;
size_t getOffset(StringRef S) const {
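The add/finalize/getOffset flow documented above is order-sensitive: positions returned by add() may move until finalize() (or finalizeInOrder()) runs. A minimal sketch, not taken from the patch:

  StringTableBuilder Builder(StringTableBuilder::ELF);
  Builder.add("foobar");
  Builder.add("bar");                    // may be deduplicated as a suffix of "foobar"
  Builder.finalize();                    // no further add() calls allowed
  size_t Off = Builder.getOffset("bar"); // stable offset into the finished table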
diff --git a/contrib/llvm/include/llvm/Object/Archive.h b/contrib/llvm/include/llvm/Object/Archive.h
index 5a1512bb9d36..9ef1e4875191 100644
--- a/contrib/llvm/include/llvm/Object/Archive.h
+++ b/contrib/llvm/include/llvm/Object/Archive.h
@@ -91,9 +91,9 @@ public:
const Archive *Parent;
ArchiveMemberHeader Header;
- /// \brief Includes header but not padding byte.
+ /// Includes header but not padding byte.
StringRef Data;
- /// \brief Offset from Data to the start of the file.
+ /// Offset from Data to the start of the file.
uint16_t StartOfFile;
Expected<bool> isThinMember() const;
diff --git a/contrib/llvm/include/llvm/Object/Binary.h b/contrib/llvm/include/llvm/Object/Binary.h
index 5e93691d1fd2..99745e24b8c8 100644
--- a/contrib/llvm/include/llvm/Object/Binary.h
+++ b/contrib/llvm/include/llvm/Object/Binary.h
@@ -156,7 +156,7 @@ public:
}
};
-/// @brief Create a Binary from Source, autodetecting the file type.
+/// Create a Binary from Source, autodetecting the file type.
///
/// @param Source The data to create the Binary from.
Expected<std::unique_ptr<Binary>> createBinary(MemoryBufferRef Source,
diff --git a/contrib/llvm/include/llvm/Object/COFF.h b/contrib/llvm/include/llvm/Object/COFF.h
index b072dd5ba7d9..6caadea0175b 100644
--- a/contrib/llvm/include/llvm/Object/COFF.h
+++ b/contrib/llvm/include/llvm/Object/COFF.h
@@ -16,9 +16,9 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/BinaryFormat/COFF.h"
-#include "llvm/DebugInfo/CodeView/CVDebugRecord.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Object/Binary.h"
+#include "llvm/Object/CVDebugRecord.h"
#include "llvm/Object/Error.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/BinaryByteStream.h"
@@ -276,6 +276,7 @@ struct coff_symbol_generic {
};
struct coff_aux_section_definition;
+struct coff_aux_weak_external;
class COFFSymbolRef {
public:
@@ -360,6 +361,13 @@ public:
return getAux<coff_aux_section_definition>();
}
+ const coff_aux_weak_external *getWeakExternal() const {
+ if (!getNumberOfAuxSymbols() ||
+ getStorageClass() != COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL)
+ return nullptr;
+ return getAux<coff_aux_weak_external>();
+ }
+
bool isAbsolute() const {
return getSectionNumber() == -1;
}
@@ -452,11 +460,12 @@ struct coff_section {
if (Characteristics & COFF::IMAGE_SCN_TYPE_NO_PAD)
return 1;
- // Bit [20:24] contains section alignment. Both 0 and 1 mean alignment 1.
+ // Bits [20:24] contain the section alignment. 0 means use a default alignment
+ // of 16.
uint32_t Shift = (Characteristics >> 20) & 0xF;
if (Shift > 0)
return 1U << (Shift - 1);
- return 1;
+ return 16;
}
};
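The replacement getAlignment() above decodes bits [20:24] of Characteristics: a shift value of N > 0 means an alignment of 1 << (N - 1), while an all-zero field now yields a default of 16 instead of 1. A standalone restatement of the arithmetic (not part of the patch):

  // IMAGE_SCN_ALIGN_8BYTES is encoded as shift value 4, so 1U << (4 - 1) == 8.
  uint32_t Shift = (Characteristics >> 20) & 0xF;
  uint32_t Align = Shift ? (1U << (Shift - 1)) : 16;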
@@ -927,6 +936,7 @@ public:
uint8_t getBytesInAddress() const override;
StringRef getFileFormatName() const override;
Triple::ArchType getArch() const override;
+ Expected<uint64_t> getStartAddress() const override;
SubtargetFeatures getFeatures() const override { return SubtargetFeatures(); }
import_directory_iterator import_directory_begin() const;
@@ -963,6 +973,8 @@ public:
std::error_code getDataDirectory(uint32_t index,
const data_directory *&Res) const;
std::error_code getSection(int32_t index, const coff_section *&Res) const;
+ std::error_code getSection(StringRef SectionName,
+ const coff_section *&Res) const;
template <typename coff_symbol_type>
std::error_code getSymbol(uint32_t Index,
@@ -1012,8 +1024,7 @@ public:
llvm_unreachable("null symbol table pointer!");
}
- iterator_range<const coff_relocation *>
- getRelocations(const coff_section *Sec) const;
+ ArrayRef<coff_relocation> getRelocations(const coff_section *Sec) const;
std::error_code getSectionName(const coff_section *Sec, StringRef &Res) const;
uint64_t getSectionSize(const coff_section *Sec) const;
diff --git a/contrib/llvm/include/llvm/Object/COFFImportFile.h b/contrib/llvm/include/llvm/Object/COFFImportFile.h
index 4b284de679b3..0a4556ad8884 100644
--- a/contrib/llvm/include/llvm/Object/COFFImportFile.h
+++ b/contrib/llvm/include/llvm/Object/COFFImportFile.h
@@ -74,6 +74,7 @@ struct COFFShortExport {
std::string Name;
std::string ExtName;
std::string SymbolName;
+ std::string AliasTarget;
uint16_t Ordinal = 0;
bool Noname = false;
@@ -81,10 +82,6 @@ struct COFFShortExport {
bool Private = false;
bool Constant = false;
- bool isWeak() {
- return ExtName.size() && ExtName != Name;
- }
-
friend bool operator==(const COFFShortExport &L, const COFFShortExport &R) {
return L.Name == R.Name && L.ExtName == R.ExtName &&
L.Ordinal == R.Ordinal && L.Noname == R.Noname &&
@@ -98,7 +95,7 @@ struct COFFShortExport {
Error writeImportLibrary(StringRef ImportName, StringRef Path,
ArrayRef<COFFShortExport> Exports,
- COFF::MachineTypes Machine, bool MakeWeakAliases);
+ COFF::MachineTypes Machine, bool MinGW);
} // namespace object
} // namespace llvm
diff --git a/contrib/llvm/include/llvm/DebugInfo/CodeView/CVDebugRecord.h b/contrib/llvm/include/llvm/Object/CVDebugRecord.h
index 5a0bb4266ba2..faad72c0df29 100644
--- a/contrib/llvm/include/llvm/DebugInfo/CodeView/CVDebugRecord.h
+++ b/contrib/llvm/include/llvm/Object/CVDebugRecord.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_DEBUGINFO_CODEVIEW_CVDEBUGRECORD_H
-#define LLVM_DEBUGINFO_CODEVIEW_CVDEBUGRECORD_H
+#ifndef LLVM_OBJECT_CVDEBUGRECORD_H
+#define LLVM_OBJECT_CVDEBUGRECORD_H
#include "llvm/Support/Endian.h"
diff --git a/contrib/llvm/include/llvm/Object/Decompressor.h b/contrib/llvm/include/llvm/Object/Decompressor.h
index c8e888d285e4..2a77d2ffbf68 100644
--- a/contrib/llvm/include/llvm/Object/Decompressor.h
+++ b/contrib/llvm/include/llvm/Object/Decompressor.h
@@ -17,10 +17,10 @@
namespace llvm {
namespace object {
-/// @brief Decompressor helps to handle decompression of compressed sections.
+/// Decompressor helps to handle decompression of compressed sections.
class Decompressor {
public:
- /// @brief Create decompressor object.
+ /// Create decompressor object.
/// @param Name Section name.
/// @param Data Section content.
/// @param IsLE Flag determines if Data is in little endian form.
@@ -28,27 +28,27 @@ public:
static Expected<Decompressor> create(StringRef Name, StringRef Data,
bool IsLE, bool Is64Bit);
- /// @brief Resize the buffer and uncompress section data into it.
+ /// Resize the buffer and uncompress section data into it.
/// @param Out Destination buffer.
template <class T> Error resizeAndDecompress(T &Out) {
Out.resize(DecompressedSize);
return decompress({Out.data(), (size_t)DecompressedSize});
}
- /// @brief Uncompress section data to raw buffer provided.
+ /// Uncompress section data to raw buffer provided.
/// @param Buffer Destination buffer.
Error decompress(MutableArrayRef<char> Buffer);
- /// @brief Return memory buffer size required for decompression.
+ /// Return memory buffer size required for decompression.
uint64_t getDecompressedSize() { return DecompressedSize; }
- /// @brief Return true if section is compressed, including gnu-styled case.
+ /// Return true if section is compressed, including gnu-styled case.
static bool isCompressed(const object::SectionRef &Section);
- /// @brief Return true if section is a ELF compressed one.
+ /// Return true if the section is an ELF compressed one.
static bool isCompressedELFSection(uint64_t Flags, StringRef Name);
- /// @brief Return true if section name matches gnu style compressed one.
+ /// Return true if section name matches gnu style compressed one.
static bool isGnuStyle(StringRef Name);
private:
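The Decompressor methods above follow a create-then-decompress pattern. A hedged usage sketch; SectionName, Contents and the surrounding error plumbing are assumptions:

  Expected<Decompressor> D =
      Decompressor::create(SectionName, Contents, /*IsLE=*/true, /*Is64Bit=*/true);
  if (!D)
    return D.takeError();
  SmallString<128> Out;
  if (Error E = D->resizeAndDecompress(Out))
    return E;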
diff --git a/contrib/llvm/include/llvm/Object/ELF.h b/contrib/llvm/include/llvm/Object/ELF.h
index 45c98233dec0..752d468fd25b 100644
--- a/contrib/llvm/include/llvm/Object/ELF.h
+++ b/contrib/llvm/include/llvm/Object/ELF.h
@@ -32,6 +32,7 @@ namespace llvm {
namespace object {
StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type);
+uint32_t getELFRelrRelocationType(uint32_t Machine);
StringRef getELFSectionTypeName(uint32_t Machine, uint32_t Type);
// Subclasses of ELFFile may need this for template instantiation
@@ -60,6 +61,7 @@ public:
using Elf_Phdr = typename ELFT::Phdr;
using Elf_Rel = typename ELFT::Rel;
using Elf_Rela = typename ELFT::Rela;
+ using Elf_Relr = typename ELFT::Relr;
using Elf_Verdef = typename ELFT::Verdef;
using Elf_Verdaux = typename ELFT::Verdaux;
using Elf_Verneed = typename ELFT::Verneed;
@@ -67,11 +69,15 @@ public:
using Elf_Versym = typename ELFT::Versym;
using Elf_Hash = typename ELFT::Hash;
using Elf_GnuHash = typename ELFT::GnuHash;
+ using Elf_Nhdr = typename ELFT::Nhdr;
+ using Elf_Note = typename ELFT::Note;
+ using Elf_Note_Iterator = typename ELFT::NoteIterator;
using Elf_Dyn_Range = typename ELFT::DynRange;
using Elf_Shdr_Range = typename ELFT::ShdrRange;
using Elf_Sym_Range = typename ELFT::SymRange;
using Elf_Rel_Range = typename ELFT::RelRange;
using Elf_Rela_Range = typename ELFT::RelaRange;
+ using Elf_Relr_Range = typename ELFT::RelrRange;
using Elf_Phdr_Range = typename ELFT::PhdrRange;
const uint8_t *base() const {
@@ -107,8 +113,12 @@ public:
StringRef getRelocationTypeName(uint32_t Type) const;
void getRelocationTypeName(uint32_t Type,
SmallVectorImpl<char> &Result) const;
+ uint32_t getRelrRelocationType() const;
- /// \brief Get the symbol for a given relocation.
+ const char *getDynamicTagAsString(unsigned Arch, uint64_t Type) const;
+ const char *getDynamicTagAsString(uint64_t Type) const;
+
+ /// Get the symbol for a given relocation.
Expected<const Elf_Sym *> getRelocationSymbol(const Elf_Rel *Rel,
const Elf_Shdr *SymTab) const;
@@ -126,6 +136,10 @@ public:
Expected<Elf_Shdr_Range> sections() const;
+ Expected<Elf_Dyn_Range> dynamicEntries() const;
+
+ Expected<const uint8_t *> toMappedAddr(uint64_t VAddr) const;
+
Expected<Elf_Sym_Range> symbols(const Elf_Shdr *Sec) const {
if (!Sec)
return makeArrayRef<Elf_Sym>(nullptr, nullptr);
@@ -140,9 +154,15 @@ public:
return getSectionContentsAsArray<Elf_Rel>(Sec);
}
+ Expected<Elf_Relr_Range> relrs(const Elf_Shdr *Sec) const {
+ return getSectionContentsAsArray<Elf_Relr>(Sec);
+ }
+
+ Expected<std::vector<Elf_Rela>> decode_relrs(Elf_Relr_Range relrs) const;
+
Expected<std::vector<Elf_Rela>> android_relas(const Elf_Shdr *Sec) const;
- /// \brief Iterate over program header table.
+ /// Iterate over program header table.
Expected<Elf_Phdr_Range> program_headers() const {
if (getHeader()->e_phnum && getHeader()->e_phentsize != sizeof(Elf_Phdr))
return createError("invalid e_phentsize");
@@ -155,6 +175,73 @@ public:
return makeArrayRef(Begin, Begin + getHeader()->e_phnum);
}
+ /// Get an iterator over notes in a program header.
+ ///
+ /// The program header must be of type \c PT_NOTE.
+ ///
+ /// \param Phdr the program header to iterate over.
+ /// \param Err [out] an error to support fallible iteration, which should
+ /// be checked after iteration ends.
+ Elf_Note_Iterator notes_begin(const Elf_Phdr &Phdr, Error &Err) const {
+ if (Phdr.p_type != ELF::PT_NOTE) {
+ Err = createError("attempt to iterate notes of non-note program header");
+ return Elf_Note_Iterator(Err);
+ }
+ if (Phdr.p_offset + Phdr.p_filesz > getBufSize()) {
+ Err = createError("invalid program header offset/size");
+ return Elf_Note_Iterator(Err);
+ }
+ return Elf_Note_Iterator(base() + Phdr.p_offset, Phdr.p_filesz, Err);
+ }
+
+ /// Get an iterator over notes in a section.
+ ///
+ /// The section must be of type \c SHT_NOTE.
+ ///
+ /// \param Shdr the section to iterate over.
+ /// \param Err [out] an error to support fallible iteration, which should
+ /// be checked after iteration ends.
+ Elf_Note_Iterator notes_begin(const Elf_Shdr &Shdr, Error &Err) const {
+ if (Shdr.sh_type != ELF::SHT_NOTE) {
+ Err = createError("attempt to iterate notes of non-note section");
+ return Elf_Note_Iterator(Err);
+ }
+ if (Shdr.sh_offset + Shdr.sh_size > getBufSize()) {
+ Err = createError("invalid section offset/size");
+ return Elf_Note_Iterator(Err);
+ }
+ return Elf_Note_Iterator(base() + Shdr.sh_offset, Shdr.sh_size, Err);
+ }
+
+ /// Get the end iterator for notes.
+ Elf_Note_Iterator notes_end() const {
+ return Elf_Note_Iterator();
+ }
+
+ /// Get an iterator range over notes of a program header.
+ ///
+ /// The program header must be of type \c PT_NOTE.
+ ///
+ /// \param Phdr the program header to iterate over.
+ /// \param Err [out] an error to support fallible iteration, which should
+ /// be checked after iteration ends.
+ iterator_range<Elf_Note_Iterator> notes(const Elf_Phdr &Phdr,
+ Error &Err) const {
+ return make_range(notes_begin(Phdr, Err), notes_end());
+ }
+
+ /// Get an iterator range over notes of a section.
+ ///
+ /// The section must be of type \c SHT_NOTE.
+ ///
+ /// \param Shdr the section to iterate over.
+ /// \param Err [out] an error to support fallible iteration, which should
+ /// be checked after iteration ends.
+ iterator_range<Elf_Note_Iterator> notes(const Elf_Shdr &Shdr,
+ Error &Err) const {
+ return make_range(notes_begin(Shdr, Err), notes_end());
+ }
+
Expected<StringRef> getSectionStringTable(Elf_Shdr_Range Sections) const;
Expected<uint32_t> getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms,
ArrayRef<Elf_Word> ShndxTable) const;
@@ -165,6 +252,7 @@ public:
Elf_Sym_Range Symtab,
ArrayRef<Elf_Word> ShndxTable) const;
Expected<const Elf_Shdr *> getSection(uint32_t Index) const;
+ Expected<const Elf_Shdr *> getSection(const StringRef SectionName) const;
Expected<const Elf_Sym *> getSymbol(const Elf_Shdr *Sec,
uint32_t Index) const;
@@ -177,10 +265,10 @@ public:
Expected<ArrayRef<uint8_t>> getSectionContents(const Elf_Shdr *Sec) const;
};
-using ELF32LEFile = ELFFile<ELFType<support::little, false>>;
-using ELF64LEFile = ELFFile<ELFType<support::little, true>>;
-using ELF32BEFile = ELFFile<ELFType<support::big, false>>;
-using ELF64BEFile = ELFFile<ELFType<support::big, true>>;
+using ELF32LEFile = ELFFile<ELF32LE>;
+using ELF64LEFile = ELFFile<ELF64LE>;
+using ELF32BEFile = ELFFile<ELF32BE>;
+using ELF64BEFile = ELFFile<ELF64BE>;
template <class ELFT>
inline Expected<const typename ELFT::Shdr *>
@@ -327,6 +415,11 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
}
template <class ELFT>
+uint32_t ELFFile<ELFT>::getRelrRelocationType() const {
+ return getELFRelrRelocationType(getHeader()->e_machine);
+}
+
+template <class ELFT>
Expected<const typename ELFT::Sym *>
ELFFile<ELFT>::getRelocationSymbol(const Elf_Rel *Rel,
const Elf_Shdr *SymTab) const {
@@ -429,6 +522,22 @@ ELFFile<ELFT>::getSection(uint32_t Index) const {
}
template <class ELFT>
+Expected<const typename ELFT::Shdr *>
+ELFFile<ELFT>::getSection(const StringRef SectionName) const {
+ auto TableOrErr = sections();
+ if (!TableOrErr)
+ return TableOrErr.takeError();
+ for (auto &Sec : *TableOrErr) {
+ auto SecNameOrErr = getSectionName(&Sec);
+ if (!SecNameOrErr)
+ return SecNameOrErr.takeError();
+ if (*SecNameOrErr == SectionName)
+ return &Sec;
+ }
+ return createError("invalid section name");
+}
+
+template <class ELFT>
Expected<StringRef>
ELFFile<ELFT>::getStringTable(const Elf_Shdr *Section) const {
if (Section->sh_type != ELF::SHT_STRTAB)
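The notes_begin/notes/notes_end API added to ELFFile above uses fallible iteration: the Error passed in must be checked once the loop finishes. A minimal sketch for a PT_NOTE program header, assuming an ELFFile Obj and an Elf_Phdr Phdr in scope:

  Error Err = Error::success();
  for (auto Note : Obj.notes(Phdr, Err)) {
    StringRef Name = Note.getName();   // note name without the trailing NUL
    uint32_t Type = Note.getType();
    (void)Name; (void)Type;
  }
  if (Err)
    return Err;                        // iteration stopped on a malformed note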
diff --git a/contrib/llvm/include/llvm/Object/ELFObjectFile.h b/contrib/llvm/include/llvm/Object/ELFObjectFile.h
index 40503cb6bb9d..2c0905d545a7 100644
--- a/contrib/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/contrib/llvm/include/llvm/Object/ELFObjectFile.h
@@ -15,6 +15,7 @@
#define LLVM_OBJECT_ELFOBJECTFILE_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
@@ -67,6 +68,9 @@ public:
virtual elf_symbol_iterator_range getDynamicSymbolIterators() const = 0;
+ /// Returns platform-specific object flags, if any.
+ virtual unsigned getPlatformFlags() const = 0;
+
elf_symbol_iterator_range symbols() const;
static bool classof(const Binary *v) { return v->isELF(); }
@@ -77,7 +81,11 @@ public:
SubtargetFeatures getARMFeatures() const;
+ SubtargetFeatures getRISCVFeatures() const;
+
void setARMSubArch(Triple &TheTriple) const override;
+
+ virtual uint16_t getEType() const = 0;
};
class ELFSectionRef : public SectionRef {
@@ -195,19 +203,20 @@ ELFObjectFileBase::symbols() const {
template <class ELFT> class ELFObjectFile : public ELFObjectFileBase {
uint16_t getEMachine() const override;
+ uint16_t getEType() const override;
uint64_t getSymbolSize(DataRefImpl Sym) const override;
public:
LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
- using uintX_t = typename ELFFile<ELFT>::uintX_t;
+ using uintX_t = typename ELFT::uint;
- using Elf_Sym = typename ELFFile<ELFT>::Elf_Sym;
- using Elf_Shdr = typename ELFFile<ELFT>::Elf_Shdr;
- using Elf_Ehdr = typename ELFFile<ELFT>::Elf_Ehdr;
- using Elf_Rel = typename ELFFile<ELFT>::Elf_Rel;
- using Elf_Rela = typename ELFFile<ELFT>::Elf_Rela;
- using Elf_Dyn = typename ELFFile<ELFT>::Elf_Dyn;
+ using Elf_Sym = typename ELFT::Sym;
+ using Elf_Shdr = typename ELFT::Shdr;
+ using Elf_Ehdr = typename ELFT::Ehdr;
+ using Elf_Rel = typename ELFT::Rel;
+ using Elf_Rela = typename ELFT::Rela;
+ using Elf_Dyn = typename ELFT::Dyn;
private:
ELFObjectFile(MemoryBufferRef Object, ELFFile<ELFT> EF,
@@ -251,6 +260,7 @@ protected:
bool isSectionVirtual(DataRefImpl Sec) const override;
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
+ std::vector<SectionRef> dynamic_relocation_sections() const override;
section_iterator getRelocatedSection(DataRefImpl Sec) const override;
void moveRelocationNext(DataRefImpl &Rel) const override;
@@ -265,7 +275,7 @@ protected:
uint64_t getSectionOffset(DataRefImpl Sec) const override;
StringRef getRelocationTypeName(uint32_t Type) const;
- /// \brief Get the relocation section that contains \a Rel.
+ /// Get the relocation section that contains \a Rel.
const Elf_Shdr *getRelSection(DataRefImpl Rel) const {
auto RelSecOrErr = EF.getSection(Rel.d.a);
if (!RelSecOrErr)
@@ -363,11 +373,9 @@ public:
uint8_t getBytesInAddress() const override;
StringRef getFileFormatName() const override;
Triple::ArchType getArch() const override;
+ Expected<uint64_t> getStartAddress() const override;
- std::error_code getPlatformFlags(unsigned &Result) const override {
- Result = EF.getHeader()->e_flags;
- return std::error_code();
- }
+ unsigned getPlatformFlags() const override { return EF.getHeader()->e_flags; }
std::error_code getBuildAttributes(ARMAttributeParser &Attributes) const override {
auto SectionsOrErr = EF.sections();
@@ -404,10 +412,10 @@ public:
bool isRelocatableObject() const override;
};
-using ELF32LEObjectFile = ELFObjectFile<ELFType<support::little, false>>;
-using ELF64LEObjectFile = ELFObjectFile<ELFType<support::little, true>>;
-using ELF32BEObjectFile = ELFObjectFile<ELFType<support::big, false>>;
-using ELF64BEObjectFile = ELFObjectFile<ELFType<support::big, true>>;
+using ELF32LEObjectFile = ELFObjectFile<ELF32LE>;
+using ELF64LEObjectFile = ELFObjectFile<ELF64LE>;
+using ELF32BEObjectFile = ELFObjectFile<ELF32BE>;
+using ELF64BEObjectFile = ELFObjectFile<ELF64BE>;
template <class ELFT>
void ELFObjectFile<ELFT>::moveSymbolNext(DataRefImpl &Sym) const {
@@ -505,6 +513,10 @@ uint16_t ELFObjectFile<ELFT>::getEMachine() const {
return EF.getHeader()->e_machine;
}
+template <class ELFT> uint16_t ELFObjectFile<ELFT>::getEType() const {
+ return EF.getHeader()->e_type;
+}
+
template <class ELFT>
uint64_t ELFObjectFile<ELFT>::getSymbolSize(DataRefImpl Sym) const {
return getSymbol(Sym)->st_size;
@@ -698,8 +710,9 @@ bool ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec) const {
template <class ELFT>
bool ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec) const {
const Elf_Shdr *EShdr = getSection(Sec);
- return EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
- EShdr->sh_type == ELF::SHT_PROGBITS;
+ return EShdr->sh_type == ELF::SHT_PROGBITS &&
+ EShdr->sh_flags & ELF::SHF_ALLOC &&
+ !(EShdr->sh_flags & ELF::SHF_EXECINSTR);
}
template <class ELFT>
@@ -710,6 +723,35 @@ bool ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec) const {
}
template <class ELFT>
+std::vector<SectionRef>
+ELFObjectFile<ELFT>::dynamic_relocation_sections() const {
+ std::vector<SectionRef> Res;
+ std::vector<uintptr_t> Offsets;
+
+ auto SectionsOrErr = EF.sections();
+ if (!SectionsOrErr)
+ return Res;
+
+ for (const Elf_Shdr &Sec : *SectionsOrErr) {
+ if (Sec.sh_type != ELF::SHT_DYNAMIC)
+ continue;
+ Elf_Dyn *Dynamic =
+ reinterpret_cast<Elf_Dyn *>((uintptr_t)base() + Sec.sh_offset);
+ for (; Dynamic->d_tag != ELF::DT_NULL; Dynamic++) {
+ if (Dynamic->d_tag == ELF::DT_REL || Dynamic->d_tag == ELF::DT_RELA ||
+ Dynamic->d_tag == ELF::DT_JMPREL) {
+ Offsets.push_back(Dynamic->d_un.d_val);
+ }
+ }
+ }
+ for (const Elf_Shdr &Sec : *SectionsOrErr) {
+ if (is_contained(Offsets, Sec.sh_offset))
+ Res.emplace_back(toDRI(&Sec), this);
+ }
+ return Res;
+}
+
+template <class ELFT>
bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
return getSection(Sec)->sh_type == ELF::SHT_NOBITS;
}
@@ -790,8 +832,6 @@ ELFObjectFile<ELFT>::getRelocationSymbol(DataRefImpl Rel) const {
template <class ELFT>
uint64_t ELFObjectFile<ELFT>::getRelocationOffset(DataRefImpl Rel) const {
- assert(EF.getHeader()->e_type == ELF::ET_REL &&
- "Only relocatable object files have relocation offsets");
const Elf_Shdr *sec = getRelSection(Rel);
if (sec->sh_type == ELF::SHT_REL)
return getRel(Rel)->r_offset;
@@ -986,8 +1026,6 @@ StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
case ELF::EM_SPARC:
case ELF::EM_SPARC32PLUS:
return "ELF32-sparc";
- case ELF::EM_WEBASSEMBLY:
- return "ELF32-wasm";
case ELF::EM_AMDGPU:
return "ELF32-amdgpu";
default:
@@ -1011,8 +1049,6 @@ StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
return "ELF64-sparc";
case ELF::EM_MIPS:
return "ELF64-mips";
- case ELF::EM_WEBASSEMBLY:
- return "ELF64-wasm";
case ELF::EM_AMDGPU:
return "ELF64-amdgpu";
case ELF::EM_BPF:
@@ -1074,26 +1110,20 @@ template <class ELFT> Triple::ArchType ELFObjectFile<ELFT>::getArch() const {
return IsLittleEndian ? Triple::sparcel : Triple::sparc;
case ELF::EM_SPARCV9:
return Triple::sparcv9;
- case ELF::EM_WEBASSEMBLY:
- switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) {
- case ELF::ELFCLASS32: return Triple::wasm32;
- case ELF::ELFCLASS64: return Triple::wasm64;
- default: return Triple::UnknownArch;
- }
case ELF::EM_AMDGPU: {
if (!IsLittleEndian)
return Triple::UnknownArch;
- unsigned EFlags = EF.getHeader()->e_flags;
- switch (EFlags & ELF::EF_AMDGPU_ARCH) {
- case ELF::EF_AMDGPU_ARCH_R600:
+ unsigned MACH = EF.getHeader()->e_flags & ELF::EF_AMDGPU_MACH;
+ if (MACH >= ELF::EF_AMDGPU_MACH_R600_FIRST &&
+ MACH <= ELF::EF_AMDGPU_MACH_R600_LAST)
return Triple::r600;
- case ELF::EF_AMDGPU_ARCH_GCN:
+ if (MACH >= ELF::EF_AMDGPU_MACH_AMDGCN_FIRST &&
+ MACH <= ELF::EF_AMDGPU_MACH_AMDGCN_LAST)
return Triple::amdgcn;
- default:
- return Triple::UnknownArch;
- }
+
+ return Triple::UnknownArch;
}
case ELF::EM_BPF:
@@ -1105,6 +1135,11 @@ template <class ELFT> Triple::ArchType ELFObjectFile<ELFT>::getArch() const {
}
template <class ELFT>
+Expected<uint64_t> ELFObjectFile<ELFT>::getStartAddress() const {
+ return EF.getHeader()->e_entry;
+}
+
+template <class ELFT>
ELFObjectFileBase::elf_symbol_iterator_range
ELFObjectFile<ELFT>::getDynamicSymbolIterators() const {
return make_range(dynamic_symbol_begin(), dynamic_symbol_end());
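getPlatformFlags() above changes from a std::error_code out-parameter API to a direct accessor on ELFObjectFileBase. Caller-side before/after, as a sketch (Obj is assumed to be an ELFObjectFileBase reference):

  // Before: unsigned Flags; if (std::error_code EC = Obj.getPlatformFlags(Flags)) ...
  // After:
  unsigned Flags = Obj.getPlatformFlags(); // e_flags of the underlying ELF header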
diff --git a/contrib/llvm/include/llvm/Object/ELFTypes.h b/contrib/llvm/include/llvm/Object/ELFTypes.h
index 83b688548fdc..fb386120e34d 100644
--- a/contrib/llvm/include/llvm/Object/ELFTypes.h
+++ b/contrib/llvm/include/llvm/Object/ELFTypes.h
@@ -40,11 +40,15 @@ template <class ELFT> struct Elf_Versym_Impl;
template <class ELFT> struct Elf_Hash_Impl;
template <class ELFT> struct Elf_GnuHash_Impl;
template <class ELFT> struct Elf_Chdr_Impl;
+template <class ELFT> struct Elf_Nhdr_Impl;
+template <class ELFT> class Elf_Note_Impl;
+template <class ELFT> class Elf_Note_Iterator_Impl;
+template <class ELFT> struct Elf_CGProfile_Impl;
template <endianness E, bool Is64> struct ELFType {
private:
template <typename Ty>
- using packed = support::detail::packed_endian_specific_integral<Ty, E, 2>;
+ using packed = support::detail::packed_endian_specific_integral<Ty, E, 1>;
public:
static const endianness TargetEndianness = E;
@@ -58,6 +62,7 @@ public:
using Phdr = Elf_Phdr_Impl<ELFType<E, Is64>>;
using Rel = Elf_Rel_Impl<ELFType<E, Is64>, false>;
using Rela = Elf_Rel_Impl<ELFType<E, Is64>, true>;
+ using Relr = packed<uint>;
using Verdef = Elf_Verdef_Impl<ELFType<E, Is64>>;
using Verdaux = Elf_Verdaux_Impl<ELFType<E, Is64>>;
using Verneed = Elf_Verneed_Impl<ELFType<E, Is64>>;
@@ -66,11 +71,16 @@ public:
using Hash = Elf_Hash_Impl<ELFType<E, Is64>>;
using GnuHash = Elf_GnuHash_Impl<ELFType<E, Is64>>;
using Chdr = Elf_Chdr_Impl<ELFType<E, Is64>>;
+ using Nhdr = Elf_Nhdr_Impl<ELFType<E, Is64>>;
+ using Note = Elf_Note_Impl<ELFType<E, Is64>>;
+ using NoteIterator = Elf_Note_Iterator_Impl<ELFType<E, Is64>>;
+ using CGProfile = Elf_CGProfile_Impl<ELFType<E, Is64>>;
using DynRange = ArrayRef<Dyn>;
using ShdrRange = ArrayRef<Shdr>;
using SymRange = ArrayRef<Sym>;
using RelRange = ArrayRef<Rel>;
using RelaRange = ArrayRef<Rela>;
+ using RelrRange = ArrayRef<Relr>;
using PhdrRange = ArrayRef<Phdr>;
using Half = packed<uint16_t>;
@@ -90,46 +100,7 @@ using ELF64BE = ELFType<support::big, true>;
// Use an alignment of 2 for the typedefs since that is the worst case for
// ELF files in archives.
-// Templates to choose Elf_Addr and Elf_Off depending on is64Bits.
-template <endianness target_endianness> struct ELFDataTypeTypedefHelperCommon {
- using Elf_Half = support::detail::packed_endian_specific_integral<
- uint16_t, target_endianness, 2>;
- using Elf_Word = support::detail::packed_endian_specific_integral<
- uint32_t, target_endianness, 2>;
- using Elf_Sword = support::detail::packed_endian_specific_integral<
- int32_t, target_endianness, 2>;
- using Elf_Xword = support::detail::packed_endian_specific_integral<
- uint64_t, target_endianness, 2>;
- using Elf_Sxword = support::detail::packed_endian_specific_integral<
- int64_t, target_endianness, 2>;
-};
-
-template <class ELFT> struct ELFDataTypeTypedefHelper;
-
-/// ELF 32bit types.
-template <endianness TargetEndianness>
-struct ELFDataTypeTypedefHelper<ELFType<TargetEndianness, false>>
- : ELFDataTypeTypedefHelperCommon<TargetEndianness> {
- using value_type = uint32_t;
- using Elf_Addr = support::detail::packed_endian_specific_integral<
- value_type, TargetEndianness, 2>;
- using Elf_Off = support::detail::packed_endian_specific_integral<
- value_type, TargetEndianness, 2>;
-};
-
-/// ELF 64bit types.
-template <endianness TargetEndianness>
-struct ELFDataTypeTypedefHelper<ELFType<TargetEndianness, true>>
- : ELFDataTypeTypedefHelperCommon<TargetEndianness> {
- using value_type = uint64_t;
- using Elf_Addr = support::detail::packed_endian_specific_integral<
- value_type, TargetEndianness, 2>;
- using Elf_Off = support::detail::packed_endian_specific_integral<
- value_type, TargetEndianness, 2>;
-};
-
// I really don't like doing this, but the alternative is copypasta.
-
#define LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) \
using Elf_Addr = typename ELFT::Addr; \
using Elf_Off = typename ELFT::Off; \
@@ -139,9 +110,9 @@ struct ELFDataTypeTypedefHelper<ELFType<TargetEndianness, true>>
using Elf_Xword = typename ELFT::Xword; \
using Elf_Sxword = typename ELFT::Sxword;
-#define LLD_ELF_COMMA ,
+#define LLVM_ELF_COMMA ,
#define LLVM_ELF_IMPORT_TYPES(E, W) \
- LLVM_ELF_IMPORT_TYPES_ELFT(ELFType<E LLD_ELF_COMMA W>)
+ LLVM_ELF_IMPORT_TYPES_ELFT(ELFType<E LLVM_ELF_COMMA W>)
// Section header.
template <class ELFT> struct Elf_Shdr_Base;
@@ -181,7 +152,7 @@ struct Elf_Shdr_Impl : Elf_Shdr_Base<ELFT> {
using Elf_Shdr_Base<ELFT>::sh_entsize;
using Elf_Shdr_Base<ELFT>::sh_size;
- /// @brief Get the number of entities this section contains if it has any.
+ /// Get the number of entities this section contains if it has any.
unsigned getEntityCount() const {
if (sh_entsize == 0)
return 0;
@@ -590,6 +561,134 @@ struct Elf_Chdr_Impl<ELFType<TargetEndianness, true>> {
Elf_Xword ch_addralign;
};
+/// Note header
+template <class ELFT>
+struct Elf_Nhdr_Impl {
+ LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+ Elf_Word n_namesz;
+ Elf_Word n_descsz;
+ Elf_Word n_type;
+
+ /// The alignment of the name and descriptor.
+ ///
+ /// Implementations differ from the specification here: in practice all
+ /// variants align both the name and descriptor to 4 bytes.
+ static const unsigned int Align = 4;
+
+ /// Get the size of the note, including name, descriptor, and padding.
+ size_t getSize() const {
+ return sizeof(*this) + alignTo<Align>(n_namesz) + alignTo<Align>(n_descsz);
+ }
+};
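getSize() above rounds both the name and the descriptor up to the 4-byte Align. Worked example, not from the patch: with n_namesz = 5 and n_descsz = 6 the note occupies 12 + alignTo<4>(5) + alignTo<4>(6) = 12 + 8 + 8 = 28 bytes:

  // sizeof(Elf_Nhdr_Impl<ELF64LE>) is 12 (three packed 32-bit words).
  size_t Sz = sizeof(Elf_Nhdr_Impl<ELF64LE>) + alignTo<4>(5) + alignTo<4>(6); // == 28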
+
+/// An ELF note.
+///
+/// Wraps a note header, providing methods for accessing the name and
+/// descriptor safely.
+template <class ELFT>
+class Elf_Note_Impl {
+ LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+
+ const Elf_Nhdr_Impl<ELFT> &Nhdr;
+
+ template <class NoteIteratorELFT> friend class Elf_Note_Iterator_Impl;
+
+ Elf_Note_Impl(const Elf_Nhdr_Impl<ELFT> &Nhdr) : Nhdr(Nhdr) {}
+
+public:
+ /// Get the note's name, excluding the terminating null byte.
+ StringRef getName() const {
+ if (!Nhdr.n_namesz)
+ return StringRef();
+ return StringRef(reinterpret_cast<const char *>(&Nhdr) + sizeof(Nhdr),
+ Nhdr.n_namesz - 1);
+ }
+
+ /// Get the note's descriptor.
+ ArrayRef<Elf_Word> getDesc() const {
+ if (!Nhdr.n_descsz)
+ return ArrayRef<Elf_Word>();
+ return ArrayRef<Elf_Word>(
+ reinterpret_cast<const Elf_Word *>(
+ reinterpret_cast<const uint8_t *>(&Nhdr) + sizeof(Nhdr) +
+ alignTo<Elf_Nhdr_Impl<ELFT>::Align>(Nhdr.n_namesz)),
+ Nhdr.n_descsz);
+ }
+
+ /// Get the note's type.
+ Elf_Word getType() const { return Nhdr.n_type; }
+};
+
+template <class ELFT>
+class Elf_Note_Iterator_Impl
+ : std::iterator<std::forward_iterator_tag, Elf_Note_Impl<ELFT>> {
+ // Nhdr being a nullptr marks the end of iteration.
+ const Elf_Nhdr_Impl<ELFT> *Nhdr = nullptr;
+ size_t RemainingSize = 0u;
+ Error *Err = nullptr;
+
+ template <class ELFFileELFT> friend class ELFFile;
+
+ // Stop iteration and indicate an overflow.
+ void stopWithOverflowError() {
+ Nhdr = nullptr;
+ *Err = make_error<StringError>("ELF note overflows container",
+ object_error::parse_failed);
+ }
+
+ // Advance Nhdr by NoteSize bytes, starting from NhdrPos.
+ //
+ // Assumes NoteSize <= RemainingSize. Ensures Nhdr->getSize() <= RemainingSize
+ // upon returning. Handles stopping iteration when reaching the end of the
+ // container, either cleanly or with an overflow error.
+ void advanceNhdr(const uint8_t *NhdrPos, size_t NoteSize) {
+ RemainingSize -= NoteSize;
+ if (RemainingSize == 0u)
+ Nhdr = nullptr;
+ else if (sizeof(*Nhdr) > RemainingSize)
+ stopWithOverflowError();
+ else {
+ Nhdr = reinterpret_cast<const Elf_Nhdr_Impl<ELFT> *>(NhdrPos + NoteSize);
+ if (Nhdr->getSize() > RemainingSize)
+ stopWithOverflowError();
+ }
+ }
+
+ Elf_Note_Iterator_Impl() {}
+ explicit Elf_Note_Iterator_Impl(Error &Err) : Err(&Err) {}
+ Elf_Note_Iterator_Impl(const uint8_t *Start, size_t Size, Error &Err)
+ : RemainingSize(Size), Err(&Err) {
+ assert(Start && "ELF note iterator starting at NULL");
+ advanceNhdr(Start, 0u);
+ }
+
+public:
+ Elf_Note_Iterator_Impl &operator++() {
+ assert(Nhdr && "incremented ELF note end iterator");
+ const uint8_t *NhdrPos = reinterpret_cast<const uint8_t *>(Nhdr);
+ size_t NoteSize = Nhdr->getSize();
+ advanceNhdr(NhdrPos, NoteSize);
+ return *this;
+ }
+ bool operator==(Elf_Note_Iterator_Impl Other) const {
+ return Nhdr == Other.Nhdr;
+ }
+ bool operator!=(Elf_Note_Iterator_Impl Other) const {
+ return !(*this == Other);
+ }
+ Elf_Note_Impl<ELFT> operator*() const {
+ assert(Nhdr && "dereferenced ELF note end iterator");
+ return Elf_Note_Impl<ELFT>(*Nhdr);
+ }
+};
+
+template <class ELFT> struct Elf_CGProfile_Impl {
+ LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+ Elf_Word cgp_from;
+ Elf_Word cgp_to;
+ Elf_Xword cgp_weight;
+};
+
// MIPS .reginfo section
template <class ELFT>
struct Elf_Mips_RegInfo;
diff --git a/contrib/llvm/include/llvm/Object/IRObjectFile.h b/contrib/llvm/include/llvm/Object/IRObjectFile.h
index 6c271b1a1f44..993359b766a1 100644
--- a/contrib/llvm/include/llvm/Object/IRObjectFile.h
+++ b/contrib/llvm/include/llvm/Object/IRObjectFile.h
@@ -50,11 +50,22 @@ public:
return v->isIR();
}
- /// \brief Finds and returns bitcode embedded in the given object file, or an
+ using module_iterator =
+ pointee_iterator<std::vector<std::unique_ptr<Module>>::const_iterator,
+ const Module>;
+
+ module_iterator module_begin() const { return module_iterator(Mods.begin()); }
+ module_iterator module_end() const { return module_iterator(Mods.end()); }
+
+ iterator_range<module_iterator> modules() const {
+ return make_range(module_begin(), module_end());
+ }
+
+ /// Finds and returns bitcode embedded in the given object file, or an
/// error code if not found.
static Expected<MemoryBufferRef> findBitcodeInObject(const ObjectFile &Obj);
- /// \brief Finds and returns bitcode in the given memory buffer (which may
+ /// Finds and returns bitcode in the given memory buffer (which may
/// be either a bitcode file or a native object file with embedded bitcode),
/// or an error code if not found.
static Expected<MemoryBufferRef>
diff --git a/contrib/llvm/include/llvm/Object/MachO.h b/contrib/llvm/include/llvm/Object/MachO.h
index d0cc40da4293..531b3d249035 100644
--- a/contrib/llvm/include/llvm/Object/MachO.h
+++ b/contrib/llvm/include/llvm/Object/MachO.h
@@ -304,6 +304,8 @@ public:
std::error_code getSectionContents(DataRefImpl Sec,
StringRef &Res) const override;
uint64_t getSectionAlignment(DataRefImpl Sec) const override;
+ Expected<SectionRef> getSection(unsigned SectionIndex) const;
+ Expected<SectionRef> getSection(StringRef SectionName) const;
bool isSectionCompressed(DataRefImpl Sec) const override;
bool isSectionText(DataRefImpl Sec) const override;
bool isSectionData(DataRefImpl Sec) const override;
@@ -463,7 +465,7 @@ public:
// In a MachO file, sections have a segment name. This is used in the .o
// files. They have a single segment, but this field specifies which segment
- // a section should be put in in the final object.
+ // a section should be put in the final object.
StringRef getSectionFinalSegmentName(DataRefImpl Sec) const;
// Names are stored as 16 bytes. This returns the raw 16 bytes without
diff --git a/contrib/llvm/include/llvm/Object/MachOUniversal.h b/contrib/llvm/include/llvm/Object/MachOUniversal.h
index 72837d0970c4..9e70b0bc30c0 100644
--- a/contrib/llvm/include/llvm/Object/MachOUniversal.h
+++ b/contrib/llvm/include/llvm/Object/MachOUniversal.h
@@ -34,9 +34,9 @@ class MachOUniversalBinary : public Binary {
public:
class ObjectForArch {
const MachOUniversalBinary *Parent;
- /// \brief Index of object in the universal binary.
+ /// Index of object in the universal binary.
uint32_t Index;
- /// \brief Descriptor of the object.
+ /// Descriptor of the object.
MachO::fat_arch Header;
MachO::fat_arch_64 Header64;
diff --git a/contrib/llvm/include/llvm/Object/ModuleSymbolTable.h b/contrib/llvm/include/llvm/Object/ModuleSymbolTable.h
index 9e9322885388..c3cbc27998e5 100644
--- a/contrib/llvm/include/llvm/Object/ModuleSymbolTable.h
+++ b/contrib/llvm/include/llvm/Object/ModuleSymbolTable.h
@@ -57,6 +57,15 @@ public:
static void CollectAsmSymbols(
const Module &M,
function_ref<void(StringRef, object::BasicSymbolRef::Flags)> AsmSymbol);
+
+ /// Parse inline ASM and collect the symvers directives that are defined in
+ /// the current module.
+ ///
+ /// For each found symbol, call \p AsmSymver with the name of the symbol and
+ /// its alias.
+ static void
+ CollectAsmSymvers(const Module &M,
+ function_ref<void(StringRef, StringRef)> AsmSymver);
};
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/Object/ObjectFile.h b/contrib/llvm/include/llvm/Object/ObjectFile.h
index 079a59468156..02d62e8e4879 100644
--- a/contrib/llvm/include/llvm/Object/ObjectFile.h
+++ b/contrib/llvm/include/llvm/Object/ObjectFile.h
@@ -65,7 +65,7 @@ public:
symbol_iterator getSymbol() const;
uint64_t getType() const;
- /// @brief Get a string that represents the type of this relocation.
+ /// Get a string that represents the type of this relocation.
///
/// This is for display purposes only.
void getTypeName(SmallVectorImpl<char> &Result) const;
@@ -100,7 +100,7 @@ public:
uint64_t getSize() const;
std::error_code getContents(StringRef &Result) const;
- /// @brief Get the alignment of this section as the actual value (not log 2).
+ /// Get the alignment of this section as the actual value (not log 2).
uint64_t getAlignment() const;
bool isCompressed() const;
@@ -154,12 +154,12 @@ public:
/// offset or a virtual address.
uint64_t getValue() const;
- /// @brief Get the alignment of this symbol as the actual value (not log 2).
+ /// Get the alignment of this symbol as the actual value (not log 2).
uint32_t getAlignment() const;
uint64_t getCommonSize() const;
Expected<SymbolRef::Type> getType() const;
- /// @brief Get section this symbol is defined in reference to. Result is
+ /// Get section this symbol is defined in reference to. Result is
/// end_sections() if it is undefined or is an absolute symbol.
Expected<section_iterator> getSection() const;
@@ -262,6 +262,10 @@ public:
return getCommonSymbolSizeImpl(Symb);
}
+ virtual std::vector<SectionRef> dynamic_relocation_sections() const {
+ return std::vector<SectionRef>();
+ }
+
using symbol_iterator_range = iterator_range<symbol_iterator>;
symbol_iterator_range symbols() const {
return symbol_iterator_range(symbol_begin(), symbol_end());
@@ -275,7 +279,7 @@ public:
return section_iterator_range(section_begin(), section_end());
}
- /// @brief The number of bytes used to represent an address in this object
+ /// The number of bytes used to represent an address in this object
/// file format.
virtual uint8_t getBytesInAddress() const = 0;
@@ -283,16 +287,13 @@ public:
virtual Triple::ArchType getArch() const = 0;
virtual SubtargetFeatures getFeatures() const = 0;
virtual void setARMSubArch(Triple &TheTriple) const { }
+ virtual Expected<uint64_t> getStartAddress() const {
+ return errorCodeToError(object_error::parse_failed);
+ };
- /// @brief Create a triple from the data in this object file.
+ /// Create a triple from the data in this object file.
Triple makeTriple() const;
- /// Returns platform-specific object flags, if any.
- virtual std::error_code getPlatformFlags(unsigned &Result) const {
- Result = 0;
- return object_error::invalid_file_type;
- }
-
virtual std::error_code
getBuildAttributes(ARMAttributeParser &Attributes) const {
return std::error_code();
@@ -307,7 +308,7 @@ public:
/// @returns Pointer to ObjectFile subclass to handle this type of object.
/// @param ObjectPath The path to the object file. ObjectPath.isObject must
/// return true.
- /// @brief Create ObjectFile from path.
+ /// Create ObjectFile from path.
static Expected<OwningBinary<ObjectFile>>
createObjectFile(StringRef ObjectPath);
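
A hedged sketch exercising the two new hooks, getStartAddress() and dynamic_relocation_sections(); Path is assumed to name a readable object file, and formats without an entry point hit the default parse_failed error shown above:

```cpp
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

static void inspect(StringRef Path) {
  Expected<OwningBinary<ObjectFile>> BinOrErr =
      ObjectFile::createObjectFile(Path);
  if (!BinOrErr) {
    logAllUnhandledErrors(BinOrErr.takeError(), errs(), "inspect: ");
    return;
  }
  const ObjectFile &Obj = *BinOrErr->getBinary();
  if (Expected<uint64_t> Start = Obj.getStartAddress())
    outs() << "entry point: " << *Start << "\n";
  else
    consumeError(Start.takeError()); // format provides no start address
  outs() << Obj.dynamic_relocation_sections().size()
         << " dynamic relocation section(s)\n";
}
```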
diff --git a/contrib/llvm/include/llvm/Object/RelocVisitor.h b/contrib/llvm/include/llvm/Object/RelocVisitor.h
index 2d0e938f06fd..008e109f6679 100644
--- a/contrib/llvm/include/llvm/Object/RelocVisitor.h
+++ b/contrib/llvm/include/llvm/Object/RelocVisitor.h
@@ -23,6 +23,7 @@
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/Wasm.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>
@@ -31,7 +32,7 @@
namespace llvm {
namespace object {
-/// @brief Base class for object file relocation visitors.
+/// Base class for object file relocation visitors.
class RelocVisitor {
public:
explicit RelocVisitor(const ObjectFile &Obj) : ObjToVisit(Obj) {}
@@ -46,6 +47,8 @@ public:
return visitCOFF(Rel, R, Value);
if (isa<MachOObjectFile>(ObjToVisit))
return visitMachO(Rel, R, Value);
+ if (isa<WasmObjectFile>(ObjToVisit))
+ return visitWasm(Rel, R, Value);
HasError = true;
return 0;
@@ -316,6 +319,27 @@ private:
HasError = true;
return 0;
}
+
+ uint64_t visitWasm(uint32_t Rel, RelocationRef R, uint64_t Value) {
+ if (ObjToVisit.getArch() == Triple::wasm32) {
+ switch (Rel) {
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
+ case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32:
+ case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32:
+ // A wasm section starts at offset 0, so resolve to 0 and ignore Value
+ return 0;
+ }
+ }
+ HasError = true;
+ return 0;
+ }
};
} // end namespace object
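
A small sketch of how a consumer such as a DWARF reader drives this visitor; Obj and R are assumed to come from an already-parsed wasm object file, and the new visitWasm path resolves the listed relocation types to 0:

```cpp
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/RelocVisitor.h"

using namespace llvm;
using namespace llvm::object;

// Resolve one relocation, falling back to 0 if the visitor flags an error.
static uint64_t resolveReloc(const ObjectFile &Obj, const RelocationRef &R,
                             uint64_t SymbolValue) {
  RelocVisitor Visitor(Obj);
  uint64_t Result =
      Visitor.visit(static_cast<uint32_t>(R.getType()), R, SymbolValue);
  return Visitor.error() ? 0 : Result;
}
```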
diff --git a/contrib/llvm/include/llvm/Object/Wasm.h b/contrib/llvm/include/llvm/Object/Wasm.h
index 71951d83f3cc..fd34e45feb62 100644
--- a/contrib/llvm/include/llvm/Object/Wasm.h
+++ b/contrib/llvm/include/llvm/Object/Wasm.h
@@ -21,6 +21,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
@@ -34,61 +35,49 @@ namespace object {
class WasmSymbol {
public:
- enum class SymbolType {
- FUNCTION_IMPORT,
- FUNCTION_EXPORT,
- GLOBAL_IMPORT,
- GLOBAL_EXPORT,
- DEBUG_FUNCTION_NAME,
- };
-
- WasmSymbol(StringRef Name, SymbolType Type, uint32_t Section,
- uint32_t ElementIndex, uint32_t FunctionType = 0)
- : Name(Name), Type(Type), Section(Section), ElementIndex(ElementIndex),
- FunctionType(FunctionType) {}
+ WasmSymbol(const wasm::WasmSymbolInfo &Info,
+ const wasm::WasmSignature *FunctionType,
+ const wasm::WasmGlobalType *GlobalType)
+ : Info(Info), FunctionType(FunctionType), GlobalType(GlobalType) {}
- StringRef Name;
- SymbolType Type;
- uint32_t Section;
- uint32_t Flags = 0;
+ const wasm::WasmSymbolInfo &Info;
+ const wasm::WasmSignature *FunctionType;
+ const wasm::WasmGlobalType *GlobalType;
- // Index into either the function or global index space.
- uint32_t ElementIndex;
-
- // For function, the type index
- uint32_t FunctionType;
+ bool isTypeFunction() const {
+ return Info.Kind == wasm::WASM_SYMBOL_TYPE_FUNCTION;
+ }
- // Symbols can be both exported and imported (in the case of the weakly
- // defined symbol). In this the import index is stored as AltIndex.
- uint32_t AltIndex = 0;
- bool HasAltIndex = false;
+ bool isTypeData() const { return Info.Kind == wasm::WASM_SYMBOL_TYPE_DATA; }
- void setAltIndex(uint32_t Index) {
- HasAltIndex = true;
- AltIndex = Index;
+ bool isTypeGlobal() const {
+ return Info.Kind == wasm::WASM_SYMBOL_TYPE_GLOBAL;
}
- bool isFunction() const {
- return Type == WasmSymbol::SymbolType::FUNCTION_IMPORT ||
- Type == WasmSymbol::SymbolType::FUNCTION_EXPORT ||
- Type == WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME;
+ bool isTypeSection() const {
+ return Info.Kind == wasm::WASM_SYMBOL_TYPE_SECTION;
}
+ bool isDefined() const { return !isUndefined(); }
- bool isWeak() const {
+ bool isUndefined() const {
+ return (Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) != 0;
+ }
+
+ bool isBindingWeak() const {
return getBinding() == wasm::WASM_SYMBOL_BINDING_WEAK;
}
- bool isGlobal() const {
+ bool isBindingGlobal() const {
return getBinding() == wasm::WASM_SYMBOL_BINDING_GLOBAL;
}
- bool isLocal() const {
+ bool isBindingLocal() const {
return getBinding() == wasm::WASM_SYMBOL_BINDING_LOCAL;
}
unsigned getBinding() const {
- return Flags & wasm::WASM_SYMBOL_BINDING_MASK;
+ return Info.Flags & wasm::WASM_SYMBOL_BINDING_MASK;
}
bool isHidden() const {
@@ -96,16 +85,13 @@ public:
}
unsigned getVisibility() const {
- return Flags & wasm::WASM_SYMBOL_VISIBILITY_MASK;
+ return Info.Flags & wasm::WASM_SYMBOL_VISIBILITY_MASK;
}
- void print(raw_ostream &Out) const {
- Out << "Name=" << Name << ", Type=" << static_cast<int>(Type)
- << ", Flags=" << Flags << " ElemIndex=" << ElementIndex;
- }
+ void print(raw_ostream &Out) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+ LLVM_DUMP_METHOD void dump() const;
#endif
};
@@ -144,12 +130,16 @@ public:
ArrayRef<wasm::WasmLimits> memories() const { return Memories; }
ArrayRef<wasm::WasmGlobal> globals() const { return Globals; }
ArrayRef<wasm::WasmExport> exports() const { return Exports; }
+ ArrayRef<WasmSymbol> syms() const { return Symbols; }
const wasm::WasmLinkingData& linkingData() const { return LinkingData; }
uint32_t getNumberOfSymbols() const { return Symbols.size(); }
ArrayRef<wasm::WasmElemSegment> elements() const { return ElemSegments; }
ArrayRef<WasmSegment> dataSegments() const { return DataSegments; }
ArrayRef<wasm::WasmFunction> functions() const { return Functions; }
+ ArrayRef<wasm::WasmFunctionName> debugNames() const { return DebugNames; }
uint32_t startFunction() const { return StartFunction; }
+ uint32_t getNumImportedGlobals() const { return NumImportedGlobals; }
+ uint32_t getNumImportedFunctions() const { return NumImportedFunctions; }
void moveSymbolNext(DataRefImpl &Symb) const override;
@@ -203,39 +193,50 @@ public:
SubtargetFeatures getFeatures() const override;
bool isRelocatableObject() const override;
+ struct ReadContext {
+ const uint8_t *Start;
+ const uint8_t *Ptr;
+ const uint8_t *End;
+ };
+
private:
bool isValidFunctionIndex(uint32_t Index) const;
+ bool isDefinedFunctionIndex(uint32_t Index) const;
+ bool isValidGlobalIndex(uint32_t Index) const;
+ bool isDefinedGlobalIndex(uint32_t Index) const;
+ bool isValidFunctionSymbol(uint32_t Index) const;
+ bool isValidGlobalSymbol(uint32_t Index) const;
+ bool isValidDataSymbol(uint32_t Index) const;
+ bool isValidSectionSymbol(uint32_t Index) const;
+ wasm::WasmFunction &getDefinedFunction(uint32_t Index);
+ wasm::WasmGlobal &getDefinedGlobal(uint32_t Index);
+
const WasmSection &getWasmSection(DataRefImpl Ref) const;
const wasm::WasmRelocation &getWasmRelocation(DataRefImpl Ref) const;
- WasmSection* findCustomSectionByName(StringRef Name);
- WasmSection* findSectionByType(uint32_t Type);
-
const uint8_t *getPtr(size_t Offset) const;
Error parseSection(WasmSection &Sec);
- Error parseCustomSection(WasmSection &Sec, const uint8_t *Ptr,
- const uint8_t *End);
+ Error parseCustomSection(WasmSection &Sec, ReadContext &Ctx);
// Standard section types
- Error parseTypeSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseImportSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseFunctionSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseTableSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseMemorySection(const uint8_t *Ptr, const uint8_t *End);
- Error parseGlobalSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseExportSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseStartSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseElemSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseCodeSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseDataSection(const uint8_t *Ptr, const uint8_t *End);
+ Error parseTypeSection(ReadContext &Ctx);
+ Error parseImportSection(ReadContext &Ctx);
+ Error parseFunctionSection(ReadContext &Ctx);
+ Error parseTableSection(ReadContext &Ctx);
+ Error parseMemorySection(ReadContext &Ctx);
+ Error parseGlobalSection(ReadContext &Ctx);
+ Error parseExportSection(ReadContext &Ctx);
+ Error parseStartSection(ReadContext &Ctx);
+ Error parseElemSection(ReadContext &Ctx);
+ Error parseCodeSection(ReadContext &Ctx);
+ Error parseDataSection(ReadContext &Ctx);
// Custom section types
- Error parseNameSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseLinkingSection(const uint8_t *Ptr, const uint8_t *End);
- Error parseRelocSection(StringRef Name, const uint8_t *Ptr,
- const uint8_t *End);
-
- void populateSymbolTable();
+ Error parseNameSection(ReadContext &Ctx);
+ Error parseLinkingSection(ReadContext &Ctx);
+ Error parseLinkingSectionSymtab(ReadContext &Ctx);
+ Error parseLinkingSectionComdat(ReadContext &Ctx);
+ Error parseRelocSection(StringRef Name, ReadContext &Ctx);
wasm::WasmObjectHeader Header;
std::vector<WasmSection> Sections;
@@ -250,15 +251,15 @@ private:
std::vector<WasmSegment> DataSegments;
std::vector<wasm::WasmFunction> Functions;
std::vector<WasmSymbol> Symbols;
+ std::vector<wasm::WasmFunctionName> DebugNames;
uint32_t StartFunction = -1;
bool HasLinkingSection = false;
wasm::WasmLinkingData LinkingData;
uint32_t NumImportedGlobals = 0;
uint32_t NumImportedFunctions = 0;
- uint32_t ImportSection = 0;
- uint32_t ExportSection = 0;
-
- StringMap<uint32_t> SymbolMap;
+ uint32_t CodeSection = 0;
+ uint32_t DataSection = 0;
+ uint32_t GlobalSection = 0;
};
} // end namespace object
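
With symbol data now carried in wasm::WasmSymbolInfo, the predicates split into type queries (isTypeFunction, isTypeData, ...) and binding queries (isBindingWeak, ...). A brief sketch, assuming Obj is a parsed WasmObjectFile:

```cpp
#include "llvm/Object/Wasm.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

// List every defined function symbol with weak binding.
static void listWeakFunctions(const WasmObjectFile &Obj) {
  for (const WasmSymbol &Sym : Obj.syms())
    if (Sym.isTypeFunction() && Sym.isDefined() && Sym.isBindingWeak())
      outs() << Sym.Info.Name << "\n";
}
```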
diff --git a/contrib/llvm/include/llvm/Object/WasmTraits.h b/contrib/llvm/include/llvm/Object/WasmTraits.h
new file mode 100644
index 000000000000..ebcd00b15227
--- /dev/null
+++ b/contrib/llvm/include/llvm/Object/WasmTraits.h
@@ -0,0 +1,63 @@
+//===- WasmTraits.h - DenseMap traits for the Wasm structures ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides llvm::DenseMapInfo traits for the Wasm structures.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_WASMTRAITS_H
+#define LLVM_OBJECT_WASMTRAITS_H
+
+#include "llvm/ADT/Hashing.h"
+#include "llvm/BinaryFormat/Wasm.h"
+
+namespace llvm {
+
+template <typename T> struct DenseMapInfo;
+
+// Traits for using WasmSignature in a DenseMap.
+template <> struct DenseMapInfo<wasm::WasmSignature> {
+ static wasm::WasmSignature getEmptyKey() {
+ return wasm::WasmSignature{{}, 1};
+ }
+ static wasm::WasmSignature getTombstoneKey() {
+ return wasm::WasmSignature{{}, 2};
+ }
+ static unsigned getHashValue(const wasm::WasmSignature &Sig) {
+ unsigned H = hash_value(Sig.ReturnType);
+ for (int32_t Param : Sig.ParamTypes)
+ H = hash_combine(H, Param);
+ return H;
+ }
+ static bool isEqual(const wasm::WasmSignature &LHS,
+ const wasm::WasmSignature &RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Traits for using WasmGlobalType in a DenseMap
+template <> struct DenseMapInfo<wasm::WasmGlobalType> {
+ static wasm::WasmGlobalType getEmptyKey() {
+ return wasm::WasmGlobalType{1, true};
+ }
+ static wasm::WasmGlobalType getTombstoneKey() {
+ return wasm::WasmGlobalType{2, true};
+ }
+ static unsigned getHashValue(const wasm::WasmGlobalType &GlobalType) {
+ return hash_combine(GlobalType.Type, GlobalType.Mutable);
+ }
+ static bool isEqual(const wasm::WasmGlobalType &LHS,
+ const wasm::WasmGlobalType &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_OBJECT_WASMTRAITS_H
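
These traits make the raw wasm structures usable as DenseMap keys, which is how a writer can de-duplicate function signatures. A minimal sketch; the interning helper and its index scheme are illustrative, not part of LLVM:

```cpp
#include "llvm/ADT/DenseMap.h"
#include "llvm/Object/WasmTraits.h"

using namespace llvm;

// Map each distinct signature to a dense index; a repeated signature reuses
// the index assigned on first insertion.
static unsigned internSignature(DenseMap<wasm::WasmSignature, unsigned> &Table,
                                const wasm::WasmSignature &Sig) {
  auto Inserted = Table.insert(std::make_pair(Sig, Table.size()));
  return Inserted.first->second;
}
```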
diff --git a/contrib/llvm/include/llvm/ObjectYAML/COFFYAML.h b/contrib/llvm/include/llvm/ObjectYAML/COFFYAML.h
index 8794eaa6d59a..78f021fc0386 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/COFFYAML.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/COFFYAML.h
@@ -67,6 +67,7 @@ struct Section {
yaml::BinaryRef SectionData;
std::vector<CodeViewYAML::YAMLDebugSubsection> DebugS;
std::vector<CodeViewYAML::LeafRecord> DebugT;
+ std::vector<CodeViewYAML::LeafRecord> DebugP;
Optional<CodeViewYAML::DebugHSection> DebugH;
std::vector<Relocation> Relocations;
StringRef Name;
diff --git a/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h b/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h
index 4f0d9efb963b..344966fe6891 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h
@@ -32,10 +32,10 @@ namespace CodeViewYAML {
struct GlobalHash {
GlobalHash() = default;
explicit GlobalHash(StringRef S) : Hash(S) {
- assert(S.size() == 20 && "Invalid hash size!");
+ assert(S.size() == 8 && "Invalid hash size!");
}
explicit GlobalHash(ArrayRef<uint8_t> S) : Hash(S) {
- assert(S.size() == 20 && "Invalid hash size!");
+ assert(S.size() == 8 && "Invalid hash size!");
}
yaml::BinaryRef Hash;
};
@@ -47,7 +47,7 @@ struct DebugHSection {
std::vector<GlobalHash> Hashes;
};
-DebugHSection fromDebugH(ArrayRef<uint8_t> DebugT);
+DebugHSection fromDebugH(ArrayRef<uint8_t> DebugH);
ArrayRef<uint8_t> toDebugH(const DebugHSection &DebugH,
BumpPtrAllocator &Alloc);
diff --git a/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
index bc3b5567c2f9..1b1306df4f53 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
@@ -51,8 +51,10 @@ struct LeafRecord {
static Expected<LeafRecord> fromCodeViewRecord(codeview::CVType Type);
};
-std::vector<LeafRecord> fromDebugT(ArrayRef<uint8_t> DebugT);
-ArrayRef<uint8_t> toDebugT(ArrayRef<LeafRecord>, BumpPtrAllocator &Alloc);
+std::vector<LeafRecord> fromDebugT(ArrayRef<uint8_t> DebugTorP,
+ StringRef SectionName);
+ArrayRef<uint8_t> toDebugT(ArrayRef<LeafRecord>, BumpPtrAllocator &Alloc,
+ StringRef SectionName);
} // end namespace CodeViewYAML
diff --git a/contrib/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/contrib/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
index 0d7d8b4efbdf..ce3227421930 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
/// \file
-/// \brief Common declarations for yaml2obj
+/// Common declarations for yaml2obj
//===----------------------------------------------------------------------===//
#ifndef LLVM_OBJECTYAML_DWARFEMITTER_H
@@ -39,11 +39,12 @@ void EmitDebugInfo(raw_ostream &OS, const Data &DI);
void EmitDebugLine(raw_ostream &OS, const Data &DI);
Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
-EmitDebugSections(StringRef YAMLString,
+EmitDebugSections(StringRef YAMLString, bool ApplyFixups = false,
bool IsLittleEndian = sys::IsLittleEndianHost);
+StringMap<std::unique_ptr<MemoryBuffer>>
+EmitDebugSections(llvm::DWARFYAML::Data &DI, bool ApplyFixups);
} // end namespace DWARFYAML
-
} // end namespace llvm
#endif // LLVM_OBJECTYAML_DWARFEMITTER_H
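
The entry point now takes an explicit ApplyFixups flag and gains an overload working on already-parsed DWARFYAML::Data. A hedged sketch of the string-based form, assuming YAMLString holds a valid DWARFYAML document:

```cpp
#include "llvm/ObjectYAML/DWARFEmitter.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void emitSections(StringRef YAMLString) {
  auto SectionsOrErr =
      DWARFYAML::EmitDebugSections(YAMLString, /*ApplyFixups=*/true);
  if (!SectionsOrErr) {
    logAllUnhandledErrors(SectionsOrErr.takeError(), errs(), "yaml2dwarf: ");
    return;
  }
  // Each entry maps a section name to its emitted bytes.
  for (const auto &Section : *SectionsOrErr)
    outs() << Section.getKey() << ": " << Section.getValue()->getBufferSize()
           << " bytes\n";
}
```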
diff --git a/contrib/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/contrib/llvm/include/llvm/ObjectYAML/DWARFYAML.h
index 2162f0fef852..705c88778945 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/DWARFYAML.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares classes for handling the YAML representation
+/// This file declares classes for handling the YAML representation
/// of DWARF Debug Info.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/include/llvm/ObjectYAML/ELFYAML.h b/contrib/llvm/include/llvm/ObjectYAML/ELFYAML.h
index 7ba83967330e..6fc69735f1c7 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares classes for handling the YAML representation
+/// This file declares classes for handling the YAML representation
/// of ELF.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/include/llvm/ObjectYAML/MachOYAML.h b/contrib/llvm/include/llvm/ObjectYAML/MachOYAML.h
index 1fa8f92e516a..cec4f86185f0 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/MachOYAML.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/MachOYAML.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares classes for handling the YAML representation
+/// This file declares classes for handling the YAML representation
/// of Mach-O.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/include/llvm/ObjectYAML/WasmYAML.h b/contrib/llvm/include/llvm/ObjectYAML/WasmYAML.h
index 188ce8e44491..8cd08e520560 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/WasmYAML.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/WasmYAML.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares classes for handling the YAML representation
+/// This file declares classes for handling the YAML representation
/// of wasm binaries.
///
//===----------------------------------------------------------------------===//
@@ -28,15 +28,17 @@ namespace llvm {
namespace WasmYAML {
LLVM_YAML_STRONG_TYPEDEF(uint32_t, SectionType)
-LLVM_YAML_STRONG_TYPEDEF(int32_t, ValueType)
-LLVM_YAML_STRONG_TYPEDEF(int32_t, TableType)
-LLVM_YAML_STRONG_TYPEDEF(int32_t, SignatureForm)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, ValueType)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, TableType)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, SignatureForm)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, ExportKind)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, Opcode)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, RelocType)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, SymbolFlags)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, SymbolKind)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, SegmentFlags)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, LimitFlags)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, ComdatKind)
struct FileHeader {
yaml::Hex32 Version;
@@ -66,6 +68,7 @@ struct ElemSegment {
};
struct Global {
+ uint32_t Index;
ValueType Type;
bool Mutable;
wasm::WasmInitExpr InitExpr;
@@ -89,6 +92,7 @@ struct LocalDecl {
};
struct Function {
+ uint32_t Index;
std::vector<LocalDecl> Locals;
yaml::BinaryRef Body;
};
@@ -127,13 +131,29 @@ struct Signature {
};
struct SymbolInfo {
+ uint32_t Index;
StringRef Name;
+ SymbolKind Kind;
SymbolFlags Flags;
+ union {
+ uint32_t ElementIndex;
+ wasm::WasmDataReference DataRef;
+ };
};
struct InitFunction {
uint32_t Priority;
- uint32_t FunctionIndex;
+ uint32_t Symbol;
+};
+
+struct ComdatEntry {
+ ComdatKind Kind;
+ uint32_t Index;
+};
+
+struct Comdat {
+ StringRef Name;
+ std::vector<ComdatEntry> Entries;
};
struct Section {
@@ -175,10 +195,11 @@ struct LinkingSection : CustomSection {
return C && C->Name == "linking";
}
- uint32_t DataSize;
- std::vector<SymbolInfo> SymbolInfos;
+ uint32_t Version;
+ std::vector<SymbolInfo> SymbolTable;
std::vector<SegmentInfo> SegmentInfos;
std::vector<InitFunction> InitFunctions;
+ std::vector<Comdat> Comdats;
};
struct TypeSection : Section {
@@ -316,6 +337,8 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::NameEntry)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::SegmentInfo)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::SymbolInfo)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::InitFunction)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::ComdatEntry)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Comdat)
namespace llvm {
namespace yaml {
@@ -352,6 +375,10 @@ template <> struct ScalarBitSetTraits<WasmYAML::SymbolFlags> {
static void bitset(IO &IO, WasmYAML::SymbolFlags &Value);
};
+template <> struct ScalarEnumerationTraits<WasmYAML::SymbolKind> {
+ static void enumeration(IO &IO, WasmYAML::SymbolKind &Kind);
+};
+
template <> struct ScalarBitSetTraits<WasmYAML::SegmentFlags> {
static void bitset(IO &IO, WasmYAML::SegmentFlags &Value);
};
@@ -412,6 +439,18 @@ template <> struct MappingTraits<WasmYAML::InitFunction> {
static void mapping(IO &IO, WasmYAML::InitFunction &Init);
};
+template <> struct ScalarEnumerationTraits<WasmYAML::ComdatKind> {
+ static void enumeration(IO &IO, WasmYAML::ComdatKind &Kind);
+};
+
+template <> struct MappingTraits<WasmYAML::ComdatEntry> {
+ static void mapping(IO &IO, WasmYAML::ComdatEntry &ComdatEntry);
+};
+
+template <> struct MappingTraits<WasmYAML::Comdat> {
+ static void mapping(IO &IO, WasmYAML::Comdat &Comdat);
+};
+
template <> struct ScalarEnumerationTraits<WasmYAML::ValueType> {
static void enumeration(IO &IO, WasmYAML::ValueType &Type);
};
diff --git a/contrib/llvm/include/llvm/ObjectYAML/YAML.h b/contrib/llvm/include/llvm/ObjectYAML/YAML.h
index 93266dd67f1a..163cd8dfcf08 100644
--- a/contrib/llvm/include/llvm/ObjectYAML/YAML.h
+++ b/contrib/llvm/include/llvm/ObjectYAML/YAML.h
@@ -21,7 +21,7 @@ class raw_ostream;
namespace yaml {
-/// \brief Specialized YAMLIO scalar type for representing a binary blob.
+/// Specialized YAMLIO scalar type for representing a binary blob.
///
/// A typical use case would be to represent the content of a section in a
/// binary file.
@@ -64,11 +64,11 @@ namespace yaml {
class BinaryRef {
friend bool operator==(const BinaryRef &LHS, const BinaryRef &RHS);
- /// \brief Either raw binary data, or a string of hex bytes (must always
+ /// Either raw binary data, or a string of hex bytes (must always
/// be an even number of characters).
ArrayRef<uint8_t> Data;
- /// \brief Discriminator between the two states of the `Data` member.
+ /// Discriminator between the two states of the `Data` member.
bool DataIsHexString = true;
public:
@@ -77,7 +77,7 @@ public:
BinaryRef(StringRef Data)
: Data(reinterpret_cast<const uint8_t *>(Data.data()), Data.size()) {}
- /// \brief The number of bytes that are represented by this BinaryRef.
+ /// The number of bytes that are represented by this BinaryRef.
/// This is the number of bytes that writeAsBinary() will write.
ArrayRef<uint8_t>::size_type binary_size() const {
if (DataIsHexString)
@@ -85,11 +85,11 @@ public:
return Data.size();
}
- /// \brief Write the contents (regardless of whether it is binary or a
+ /// Write the contents (regardless of whether it is binary or a
/// hex string) as binary to the given raw_ostream.
void writeAsBinary(raw_ostream &OS) const;
- /// \brief Write the contents (regardless of whether it is binary or a
+ /// Write the contents (regardless of whether it is binary or a
/// hex string) as hex to the given raw_ostream.
///
/// For example, a possible output could be `DEADBEEFCAFEBABE`.
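
A small sketch of BinaryRef's two states: constructed from a string it is treated as a hex byte string, so binary_size() reports half the character count, and writeAsHex() re-emits the text form:

```cpp
#include "llvm/ObjectYAML/YAML.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void dumpBlob() {
  // A string constructs the hex-string form: 8 hex characters = 4 bytes.
  yaml::BinaryRef Blob(StringRef("DEADBEEF"));
  outs() << Blob.binary_size() << " bytes: ";
  Blob.writeAsHex(outs());
  outs() << "\n";
}
```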
diff --git a/contrib/llvm/include/llvm/Option/Arg.h b/contrib/llvm/include/llvm/Option/Arg.h
index c519a4a824c5..d0086bb6d611 100644
--- a/contrib/llvm/include/llvm/Option/Arg.h
+++ b/contrib/llvm/include/llvm/Option/Arg.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Defines the llvm::Arg class for parsed arguments.
+/// Defines the llvm::Arg class for parsed arguments.
///
//===----------------------------------------------------------------------===//
@@ -28,35 +28,35 @@ namespace opt {
class ArgList;
-/// \brief A concrete instance of a particular driver option.
+/// A concrete instance of a particular driver option.
///
/// The Arg class encodes just enough information to be able to
/// derive the argument values efficiently.
class Arg {
private:
- /// \brief The option this argument is an instance of.
+ /// The option this argument is an instance of.
const Option Opt;
- /// \brief The argument this argument was derived from (during tool chain
+ /// The argument this argument was derived from (during tool chain
/// argument translation), if any.
const Arg *BaseArg;
- /// \brief How this instance of the option was spelled.
+ /// How this instance of the option was spelled.
StringRef Spelling;
- /// \brief The index at which this argument appears in the containing
+ /// The index at which this argument appears in the containing
/// ArgList.
unsigned Index;
- /// \brief Was this argument used to effect compilation?
+ /// Was this argument used to effect compilation?
///
/// This is used for generating "argument unused" diagnostics.
mutable unsigned Claimed : 1;
- /// \brief Does this argument own its values?
+ /// Does this argument own its values?
mutable unsigned OwnsValues : 1;
- /// \brief The argument values, as C strings.
+ /// The argument values, as C strings.
SmallVector<const char *, 2> Values;
public:
@@ -74,7 +74,7 @@ public:
StringRef getSpelling() const { return Spelling; }
unsigned getIndex() const { return Index; }
- /// \brief Return the base argument which generated this arg.
+ /// Return the base argument which generated this arg.
///
/// This is either the argument itself or the argument it was
/// derived from during tool chain specific argument translation.
@@ -88,7 +88,7 @@ public:
bool isClaimed() const { return getBaseArg().Claimed; }
- /// \brief Set the Arg claimed bit.
+ /// Set the Arg claimed bit.
void claim() const { getBaseArg().Claimed = true; }
unsigned getNumValues() const { return Values.size(); }
@@ -107,10 +107,10 @@ public:
return false;
}
- /// \brief Append the argument onto the given array as strings.
+ /// Append the argument onto the given array as strings.
void render(const ArgList &Args, ArgStringList &Output) const;
- /// \brief Append the argument, render as an input, onto the given
+ /// Append the argument, render as an input, onto the given
/// array as strings.
///
/// The distinction is that some options only render their values
@@ -120,7 +120,7 @@ public:
void print(raw_ostream &O) const;
void dump() const;
- /// \brief Return a formatted version of the argument and
+ /// Return a formatted version of the argument and
/// its values, for debugging and diagnostics.
std::string getAsString(const ArgList &Args) const;
};
diff --git a/contrib/llvm/include/llvm/Option/ArgList.h b/contrib/llvm/include/llvm/Option/ArgList.h
index aaea68bf8e27..687c8cbb02f9 100644
--- a/contrib/llvm/include/llvm/Option/ArgList.h
+++ b/contrib/llvm/include/llvm/Option/ArgList.h
@@ -85,9 +85,6 @@ public:
SkipToNextArg();
}
- // FIXME: This conversion function makes no sense.
- operator const Arg*() { return *Current; }
-
reference operator*() const { return *Current; }
pointer operator->() const { return Current; }
@@ -356,7 +353,7 @@ public:
return MakeArgStringRef(Str.toStringRef(Buf));
}
- /// \brief Create an arg string for (\p LHS + \p RHS), reusing the
+ /// Create an arg string for (\p LHS + \p RHS), reusing the
/// string at \p Index if possible.
const char *GetOrMakeJoinedArgString(unsigned Index, StringRef LHS,
StringRef RHS) const;
@@ -390,6 +387,8 @@ private:
void releaseMemory();
public:
+ InputArgList() : NumInputArgStrings(0) {}
+
InputArgList(const char* const *ArgBegin, const char* const *ArgEnd);
InputArgList(InputArgList &&RHS)
diff --git a/contrib/llvm/include/llvm/Option/OptTable.h b/contrib/llvm/include/llvm/Option/OptTable.h
index 57a6954f4878..743c4772c98c 100644
--- a/contrib/llvm/include/llvm/Option/OptTable.h
+++ b/contrib/llvm/include/llvm/Option/OptTable.h
@@ -29,7 +29,7 @@ class ArgList;
class InputArgList;
class Option;
-/// \brief Provide access to the Option info table.
+/// Provide access to the Option info table.
///
/// The OptTable class provides a layer of indirection which allows Option
/// instances to be created lazily. In the common case, only a few options will
@@ -38,7 +38,7 @@ class Option;
/// parts of the driver still use Option instances where convenient.
class OptTable {
public:
- /// \brief Entry for a single option instance in the option data table.
+ /// Entry for a single option instance in the option data table.
struct Info {
/// A null terminated array of prefix strings to apply to name while
/// matching.
@@ -57,7 +57,7 @@ public:
};
private:
- /// \brief The option information table.
+ /// The option information table.
std::vector<Info> OptionInfos;
bool IgnoreCase;
@@ -86,36 +86,36 @@ protected:
public:
~OptTable();
- /// \brief Return the total number of option classes.
+ /// Return the total number of option classes.
unsigned getNumOptions() const { return OptionInfos.size(); }
- /// \brief Get the given Opt's Option instance, lazily creating it
+ /// Get the given Opt's Option instance, lazily creating it
/// if necessary.
///
/// \return The option, or null for the INVALID option id.
const Option getOption(OptSpecifier Opt) const;
- /// \brief Lookup the name of the given option.
+ /// Lookup the name of the given option.
const char *getOptionName(OptSpecifier id) const {
return getInfo(id).Name;
}
- /// \brief Get the kind of the given option.
+ /// Get the kind of the given option.
unsigned getOptionKind(OptSpecifier id) const {
return getInfo(id).Kind;
}
- /// \brief Get the group id for the given option.
+ /// Get the group id for the given option.
unsigned getOptionGroupID(OptSpecifier id) const {
return getInfo(id).GroupID;
}
- /// \brief Get the help text to use to describe this option.
+ /// Get the help text to use to describe this option.
const char *getOptionHelpText(OptSpecifier id) const {
return getInfo(id).HelpText;
}
- /// \brief Get the meta-variable name to use when describing
+ /// Get the meta-variable name to use when describing
/// this option's values in the help text.
const char *getOptionMetaVar(OptSpecifier id) const {
return getInfo(id).MetaVar;
@@ -143,6 +143,26 @@ public:
std::vector<std::string> findByPrefix(StringRef Cur,
unsigned short DisableFlags) const;
+ /// Find the OptTable option that most closely matches the given string.
+ ///
+ /// \param [in] Option - A string, such as "-stdlibs=l", that represents user
+ /// input of an option that may not exist in the OptTable. Note that the
+ /// string includes prefix dashes "-" as well as values "=l".
+ /// \param [out] NearestString - The nearest option string found in the
+ /// OptTable.
+ /// \param [in] FlagsToInclude - Only find options with any of these flags.
+ /// Zero is the default, which includes all flags.
+ /// \param [in] FlagsToExclude - Don't find options with this flag. Zero
+ /// is the default, and means exclude nothing.
+ /// \param [in] MinimumLength - Don't find options shorter than this length.
+ /// For example, a minimum length of 3 prevents "-x" from being considered
+ /// near to "-S".
+ ///
+ /// \return The edit distance of the nearest string found.
+ unsigned findNearest(StringRef Option, std::string &NearestString,
+ unsigned FlagsToInclude = 0, unsigned FlagsToExclude = 0,
+ unsigned MinimumLength = 4) const;
+
/// Add Values to Option's Values class
///
/// \param [in] Option - Prefix + Name of the flag which Values will be
@@ -154,7 +174,7 @@ public:
/// \return true on success, and false on failure.
bool addValues(const char *Option, const char *Values);
- /// \brief Parse a single argument; returning the new argument and
+ /// Parse a single argument; returning the new argument and
/// updating Index.
///
/// \param [in,out] Index - The current parsing position in the argument
@@ -172,7 +192,7 @@ public:
unsigned FlagsToInclude = 0,
unsigned FlagsToExclude = 0) const;
- /// \brief Parse an list of arguments into an InputArgList.
+ /// Parse a list of arguments into an InputArgList.
///
/// The resulting InputArgList will reference the strings in [\p ArgBegin,
/// \p ArgEnd), and their lifetime should extend past that of the returned
@@ -194,7 +214,7 @@ public:
unsigned &MissingArgCount, unsigned FlagsToInclude = 0,
unsigned FlagsToExclude = 0) const;
- /// \brief Render the help text for an option table.
+ /// Render the help text for an option table.
///
/// \param OS - The stream to write the help text to.
/// \param Name - The name to use in the usage line.
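
The new findNearest hook enables "did you mean" diagnostics for misspelled options. A hedged sketch, assuming Opts is a driver's populated OptTable; the edit-distance cutoff of 2 is illustrative, not part of the API:

```cpp
#include "llvm/Option/OptTable.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::opt;

// Suggest the closest known option for an unrecognized argument.
static void suggestOption(const OptTable &Opts, StringRef UnknownArg) {
  std::string Nearest;
  unsigned Distance = Opts.findNearest(UnknownArg, Nearest);
  if (!Nearest.empty() && Distance <= 2)
    errs() << "unknown argument '" << UnknownArg << "'; did you mean '"
           << Nearest << "'?\n";
}
```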
diff --git a/contrib/llvm/include/llvm/Option/Option.h b/contrib/llvm/include/llvm/Option/Option.h
index d9aebd5b0757..b09f6043b7a9 100644
--- a/contrib/llvm/include/llvm/Option/Option.h
+++ b/contrib/llvm/include/llvm/Option/Option.h
@@ -95,7 +95,7 @@ public:
return OptionClass(Info->Kind);
}
- /// \brief Get the name of this option without any prefix.
+ /// Get the name of this option without any prefix.
StringRef getName() const {
assert(Info && "Must have a valid info!");
return Info->Name;
@@ -113,7 +113,7 @@ public:
return Owner->getOption(Info->AliasID);
}
- /// \brief Get the alias arguments as a \0 separated list.
+ /// Get the alias arguments as a \0 separated list.
/// E.g. ["foo", "bar"] would be returned as "foo\0bar\0".
const char *getAliasArgs() const {
assert(Info && "Must have a valid info!");
@@ -123,13 +123,13 @@ public:
return Info->AliasArgs;
}
- /// \brief Get the default prefix for this option.
+ /// Get the default prefix for this option.
StringRef getPrefix() const {
const char *Prefix = *Info->Prefixes;
return Prefix ? Prefix : StringRef();
}
- /// \brief Get the name of this option with the default prefix.
+ /// Get the name of this option with the default prefix.
std::string getPrefixedName() const {
std::string Ret = getPrefix();
Ret += getName();
diff --git a/contrib/llvm/include/llvm/Pass.h b/contrib/llvm/include/llvm/Pass.h
index a29b3771abb4..d65347d611ea 100644
--- a/contrib/llvm/include/llvm/Pass.h
+++ b/contrib/llvm/include/llvm/Pass.h
@@ -353,18 +353,18 @@ protected:
/// If the user specifies the -time-passes argument on an LLVM tool command line
/// then the value of this boolean will be true, otherwise false.
-/// @brief This is the storage for the -time-passes option.
+/// This is the storage for the -time-passes option.
extern bool TimePassesIsEnabled;
/// isFunctionInPrintList - returns true if a function should be printed via
// debugging options like -print-after-all/-print-before-all.
-// @brief Tells if the function IR should be printed by PrinterPass.
+// Tells if the function IR should be printed by PrinterPass.
extern bool isFunctionInPrintList(StringRef FunctionName);
/// forcePrintModuleIR - returns true if IR printing passes should
// be printing module IR (even for local-pass printers e.g. function-pass)
// to provide more context, as enabled by debugging option -print-module-scope
-// @brief Tells if IR printer should be printing module IR
+// Tells if IR printer should be printing module IR
extern bool forcePrintModuleIR();
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/PassAnalysisSupport.h b/contrib/llvm/include/llvm/PassAnalysisSupport.h
index b109605355bf..118718747659 100644
--- a/contrib/llvm/include/llvm/PassAnalysisSupport.h
+++ b/contrib/llvm/include/llvm/PassAnalysisSupport.h
@@ -174,7 +174,7 @@ public:
AnalysisImpls.push_back(pir);
}
- /// Clear cache that is used to connect a pass to the the analysis (PassInfo).
+ /// Clear cache that is used to connect a pass to the analysis (PassInfo).
void clearAnalysisImpls() {
AnalysisImpls.clear();
}
diff --git a/contrib/llvm/include/llvm/Passes/PassBuilder.h b/contrib/llvm/include/llvm/Passes/PassBuilder.h
index b69988826253..24a93bc76af5 100644
--- a/contrib/llvm/include/llvm/Passes/PassBuilder.h
+++ b/contrib/llvm/include/llvm/Passes/PassBuilder.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include <vector>
@@ -26,6 +27,7 @@ namespace llvm {
class StringRef;
class AAManager;
class TargetMachine;
+class ModuleSummaryIndex;
/// A struct capturing PGO tunables.
struct PGOOptions {
@@ -47,7 +49,7 @@ struct PGOOptions {
bool SamplePGOSupport;
};
-/// \brief This class provides access to building LLVM's passes.
+/// This class provides access to building LLVM's passes.
///
/// Its members provide the baseline state available to passes during their
/// construction. The \c PassRegistry.def file specifies how to construct all
@@ -58,7 +60,7 @@ class PassBuilder {
Optional<PGOOptions> PGOOpt;
public:
- /// \brief A struct to capture parsed pass pipeline names.
+ /// A struct to capture parsed pass pipeline names.
///
/// A pipeline is defined as a series of names, each of which may in itself
/// recursively contain a nested pipeline. A name is either the name of a pass
@@ -71,7 +73,7 @@ public:
std::vector<PipelineElement> InnerPipeline;
};
- /// \brief ThinLTO phase.
+ /// ThinLTO phase.
///
/// This enumerates the LLVM ThinLTO optimization phases.
enum class ThinLTOPhase {
@@ -83,7 +85,7 @@ public:
PostLink
};
- /// \brief LLVM-provided high-level optimization levels.
+ /// LLVM-provided high-level optimization levels.
///
/// This enumerates the LLVM-provided high-level optimization levels. Each
/// level has a specific goal and rationale.
@@ -173,7 +175,7 @@ public:
Optional<PGOOptions> PGOOpt = None)
: TM(TM), PGOOpt(PGOOpt) {}
- /// \brief Cross register the analysis managers through their proxies.
+ /// Cross register the analysis managers through their proxies.
///
/// This is an interface that can be used to cross register each
// AnalysisManager with all the other analysis managers.
@@ -182,7 +184,7 @@ public:
CGSCCAnalysisManager &CGAM,
ModuleAnalysisManager &MAM);
- /// \brief Registers all available module analysis passes.
+ /// Registers all available module analysis passes.
///
/// This is an interface that can be used to populate a \c
/// ModuleAnalysisManager with all registered module analyses. Callers can
@@ -190,7 +192,7 @@ public:
/// pre-register analyses and this will not override those.
void registerModuleAnalyses(ModuleAnalysisManager &MAM);
- /// \brief Registers all available CGSCC analysis passes.
+ /// Registers all available CGSCC analysis passes.
///
/// This is an interface that can be used to populate a \c CGSCCAnalysisManager
/// with all registered CGSCC analyses. Callers can still manually register any
@@ -198,7 +200,7 @@ public:
/// not override those.
void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM);
- /// \brief Registers all available function analysis passes.
+ /// Registers all available function analysis passes.
///
/// This is an interface that can be used to populate a \c
/// FunctionAnalysisManager with all registered function analyses. Callers can
@@ -206,7 +208,7 @@ public:
/// pre-register analyses and this will not override those.
void registerFunctionAnalyses(FunctionAnalysisManager &FAM);
- /// \brief Registers all available loop analysis passes.
+ /// Registers all available loop analysis passes.
///
/// This is an interface that can be used to populate a \c LoopAnalysisManager
/// with all registered loop analyses. Callers can still manually register any
@@ -309,8 +311,9 @@ public:
/// only intended for use when attempting to optimize code. If frontends
/// require some transformations for semantic reasons, they should explicitly
/// build them.
- ModulePassManager buildThinLTODefaultPipeline(OptimizationLevel Level,
- bool DebugLogging = false);
+ ModulePassManager
+ buildThinLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
+ const ModuleSummaryIndex *ImportSummary);
/// Build a pre-link, LTO-targeting default optimization pipeline to a pass
/// manager.
@@ -339,13 +342,14 @@ public:
/// require some transformations for semantic reasons, they should explicitly
/// build them.
ModulePassManager buildLTODefaultPipeline(OptimizationLevel Level,
- bool DebugLogging = false);
+ bool DebugLogging,
+ ModuleSummaryIndex *ExportSummary);
/// Build the default `AAManager` with the default alias analysis pipeline
/// registered.
AAManager buildDefaultAAPipeline();
- /// \brief Parse a textual pass pipeline description into a \c
+ /// Parse a textual pass pipeline description into a \c
/// ModulePassManager.
///
/// The format of the textual pass pipeline description looks something like:
@@ -409,7 +413,7 @@ public:
/// returns false.
bool parseAAPipeline(AAManager &AA, StringRef PipelineText);
- /// \brief Register a callback for a default optimizer pipeline extension
+ /// Register a callback for a default optimizer pipeline extension
/// point
///
/// This extension point allows adding passes that perform peephole
@@ -420,7 +424,7 @@ public:
PeepholeEPCallbacks.push_back(C);
}
- /// \brief Register a callback for a default optimizer pipeline extension
+ /// Register a callback for a default optimizer pipeline extension
/// point
///
/// This extension point allows adding late loop canonicalization and
@@ -434,7 +438,7 @@ public:
LateLoopOptimizationsEPCallbacks.push_back(C);
}
- /// \brief Register a callback for a default optimizer pipeline extension
+ /// Register a callback for a default optimizer pipeline extension
/// point
///
/// This extension point allows adding loop passes to the end of the loop
@@ -444,7 +448,7 @@ public:
LoopOptimizerEndEPCallbacks.push_back(C);
}
- /// \brief Register a callback for a default optimizer pipeline extension
+ /// Register a callback for a default optimizer pipeline extension
/// point
///
/// This extension point allows adding optimization passes after most of the
@@ -454,7 +458,7 @@ public:
ScalarOptimizerLateEPCallbacks.push_back(C);
}
- /// \brief Register a callback for a default optimizer pipeline extension
+ /// Register a callback for a default optimizer pipeline extension
/// point
///
/// This extension point allows adding CallGraphSCC passes at the end of the
@@ -465,7 +469,7 @@ public:
CGSCCOptimizerLateEPCallbacks.push_back(C);
}
- /// \brief Register a callback for a default optimizer pipeline extension
+ /// Register a callback for a default optimizer pipeline extension
/// point
///
/// This extension point allows adding optimization passes before the
@@ -476,7 +480,17 @@ public:
VectorizerStartEPCallbacks.push_back(C);
}
- /// \brief Register a callback for parsing an AliasAnalysis Name to populate
+ /// Register a callback for a default optimizer pipeline extension point.
+ ///
+ /// This extension point allows adding optimization once at the start of the
+ /// pipeline. This does not apply to 'backend' compiles (LTO and ThinLTO
+ /// link-time pipelines).
+ void registerPipelineStartEPCallback(
+ const std::function<void(ModulePassManager &)> &C) {
+ PipelineStartEPCallbacks.push_back(C);
+ }
+
+ /// Register a callback for parsing an AliasAnalysis Name to populate
/// the given AAManager \p AA
void registerParseAACallback(
const std::function<bool(StringRef Name, AAManager &AA)> &C) {
@@ -530,7 +544,7 @@ public:
}
/// @}}
- /// \brief Register a callback for a top-level pipeline entry.
+ /// Register a callback for a top-level pipeline entry.
///
/// If the PassManager type is not given at the top level of the pipeline
/// text, this Callback should be used to determine the appropriate stack of
@@ -589,6 +603,8 @@ private:
SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
VectorizerStartEPCallbacks;
// Module callbacks
+ SmallVector<std::function<void(ModulePassManager &)>, 2>
+ PipelineStartEPCallbacks;
SmallVector<std::function<void(ModuleAnalysisManager &)>, 2>
ModuleAnalysisRegistrationCallbacks;
SmallVector<std::function<bool(StringRef, ModulePassManager &,
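
A minimal sketch of the new pipeline-start extension point; the commented-out pass is a placeholder for whatever module pass a frontend wants to run before the default pipeline:

```cpp
#include "llvm/Passes/PassBuilder.h"

using namespace llvm;

// Queue passes that should run once, before the default (non-LTO) pipelines.
static void registerEarlyPasses(PassBuilder &PB) {
  PB.registerPipelineStartEPCallback([](ModulePassManager &MPM) {
    // MPM.addPass(MyEarlyModulePass()); // hypothetical module pass
  });
}
```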
diff --git a/contrib/llvm/include/llvm/Passes/PassPlugin.h b/contrib/llvm/include/llvm/Passes/PassPlugin.h
new file mode 100644
index 000000000000..af8f11a7a352
--- /dev/null
+++ b/contrib/llvm/include/llvm/Passes/PassPlugin.h
@@ -0,0 +1,114 @@
+//===- llvm/Passes/PassPlugin.h - Public Plugin API -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines the public entry point for new-PM pass plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PASSES_PASSPLUGIN_H
+#define LLVM_PASSES_PASSPLUGIN_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <string>
+
+namespace llvm {
+class PassBuilder;
+
+/// \macro LLVM_PLUGIN_API_VERSION
+/// Identifies the API version understood by this plugin.
+///
+/// When a plugin is loaded, the driver will check its supported plugin version
+/// against that of the plugin. A mismatch is an error. The supported version
+/// will be incremented for ABI-breaking changes to the \c PassPluginLibraryInfo
+/// struct, i.e. when callbacks are added, removed, or reordered.
+#define LLVM_PLUGIN_API_VERSION 1
+
+extern "C" {
+/// Information about the plugin required to load its passes
+///
+/// This struct defines the core interface for pass plugins and is supposed to
+/// be filled out by plugin implementors. LLVM-side users of a plugin are
+/// expected to use the \c PassPlugin class below to interface with it.
+struct PassPluginLibraryInfo {
+ /// The API version understood by this plugin, usually \c
+ /// LLVM_PLUGIN_API_VERSION
+ uint32_t APIVersion;
+ /// A meaningful name of the plugin.
+ const char *PluginName;
+ /// The version of the plugin.
+ const char *PluginVersion;
+
+ /// The callback for registering plugin passes with a \c PassBuilder
+ /// instance
+ void (*RegisterPassBuilderCallbacks)(PassBuilder &);
+};
+}
+
+/// A loaded pass plugin.
+///
+/// An instance of this class wraps a loaded pass plugin and gives access to
+/// its interface defined by the \c PassPluginLibraryInfo it exposes.
+class PassPlugin {
+public:
+ /// Attempts to load a pass plugin from a given file.
+ ///
+ /// \returns Returns an error if either the library cannot be found or loaded,
+ /// there is no public entry point, or the plugin implements the wrong API
+ /// version.
+ static Expected<PassPlugin> Load(const std::string &Filename);
+
+ /// Get the filename of the loaded plugin.
+ StringRef getFilename() const { return Filename; }
+
+ /// Get the plugin name
+ StringRef getPluginName() const { return Info.PluginName; }
+
+ /// Get the plugin version
+ StringRef getPluginVersion() const { return Info.PluginVersion; }
+
+ /// Get the plugin API version
+ uint32_t getAPIVersion() const { return Info.APIVersion; }
+
+ /// Invoke the PassBuilder callback registration
+ void registerPassBuilderCallbacks(PassBuilder &PB) const {
+ Info.RegisterPassBuilderCallbacks(PB);
+ }
+
+private:
+ PassPlugin(const std::string &Filename, const sys::DynamicLibrary &Library)
+ : Filename(Filename), Library(Library), Info() {}
+
+ std::string Filename;
+ sys::DynamicLibrary Library;
+ PassPluginLibraryInfo Info;
+};
+}
+
+/// The public entry point for a pass plugin.
+///
+/// When a plugin is loaded by the driver, it will call this entry point to
+/// obtain information about this plugin and about how to register its passes.
+/// This function needs to be implemented by the plugin, see the example below:
+///
+/// ```
+/// extern "C" ::llvm::PassPluginLibraryInfo LLVM_ATTRIBUTE_WEAK
+/// llvmGetPassPluginInfo() {
+/// return {
+/// LLVM_PLUGIN_API_VERSION, "MyPlugin", "v0.1", [](PassBuilder &PB) { ... }
+/// };
+/// }
+/// ```
+extern "C" ::llvm::PassPluginLibraryInfo LLVM_ATTRIBUTE_WEAK
+llvmGetPassPluginInfo();
+
+#endif /* LLVM_PASSES_PASSPLUGIN_H */
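
The header above shows the plugin-side entry point; the loading side pairs PassPlugin::Load with registerPassBuilderCallbacks. A hedged sketch, where PluginPath is an assumed path to a shared library built against this header:

```cpp
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void loadAndRegister(PassBuilder &PB, const std::string &PluginPath) {
  Expected<PassPlugin> PluginOrErr = PassPlugin::Load(PluginPath);
  if (!PluginOrErr) {
    logAllUnhandledErrors(PluginOrErr.takeError(), errs(), "pass plugin: ");
    return;
  }
  outs() << "loaded " << PluginOrErr->getPluginName() << " "
         << PluginOrErr->getPluginVersion() << "\n";
  PluginOrErr->registerPassBuilderCallbacks(PB);
}
```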
diff --git a/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 5a4098cf666c..1ca56dcaf9c5 100644
--- a/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/StringRef.h"
@@ -506,7 +507,7 @@ public:
/// This is the main interface to get coverage information, using a profile to
/// fill out execution counts.
class CoverageMapping {
- StringSet<> FunctionNames;
+ DenseMap<size_t, DenseSet<size_t>> RecordProvenance;
std::vector<FunctionRecord> Functions;
std::vector<std::pair<std::string, uint64_t>> FuncHashMismatches;
std::vector<std::pair<std::string, uint64_t>> FuncCounterMismatches;
diff --git a/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h b/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
index 633e51565cd2..c88c71a6d6f4 100644
--- a/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
+++ b/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
@@ -32,7 +32,7 @@ namespace coverage {
class CoverageMappingReader;
-/// \brief Coverage mapping information for a single function.
+/// Coverage mapping information for a single function.
struct CoverageMappingRecord {
StringRef FunctionName;
uint64_t FunctionHash;
@@ -41,7 +41,7 @@ struct CoverageMappingRecord {
ArrayRef<CounterMappingRegion> MappingRegions;
};
-/// \brief A file format agnostic iterator over coverage mapping data.
+/// A file format agnostic iterator over coverage mapping data.
class CoverageMappingIterator
: public std::iterator<std::input_iterator_tag, CoverageMappingRecord> {
CoverageMappingReader *Reader;
@@ -101,7 +101,7 @@ public:
CoverageMappingIterator end() { return CoverageMappingIterator(); }
};
-/// \brief Base class for the raw coverage mapping and filenames data readers.
+/// Base class for the raw coverage mapping and filenames data readers.
class RawCoverageReader {
protected:
StringRef Data;
@@ -114,7 +114,7 @@ protected:
Error readString(StringRef &Result);
};
-/// \brief Reader for the raw coverage filenames.
+/// Reader for the raw coverage filenames.
class RawCoverageFilenamesReader : public RawCoverageReader {
std::vector<StringRef> &Filenames;
@@ -128,7 +128,7 @@ public:
Error read();
};
-/// \brief Checks if the given coverage mapping data is exported for
+/// Checks if the given coverage mapping data is exported for
/// an unused function.
class RawCoverageMappingDummyChecker : public RawCoverageReader {
public:
@@ -138,7 +138,7 @@ public:
Expected<bool> isDummy();
};
-/// \brief Reader for the raw coverage mapping data.
+/// Reader for the raw coverage mapping data.
class RawCoverageMappingReader : public RawCoverageReader {
ArrayRef<StringRef> TranslationUnitFilenames;
std::vector<StringRef> &Filenames;
@@ -169,7 +169,7 @@ private:
unsigned InferredFileID, size_t NumFileIDs);
};
-/// \brief Reader for the coverage mapping data that is emitted by the
+/// Reader for the coverage mapping data that is emitted by the
/// frontend and stored in an object file.
class BinaryCoverageReader : public CoverageMappingReader {
public:
diff --git a/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h b/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h
index b6f864ab3de3..86fb1bdf1773 100644
--- a/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h
+++ b/contrib/llvm/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h
@@ -25,7 +25,7 @@ class raw_ostream;
namespace coverage {
-/// \brief Writer of the filenames section for the instrumentation
+/// Writer of the filenames section for the instrumentation
/// based code coverage.
class CoverageFilenamesSectionWriter {
ArrayRef<StringRef> Filenames;
@@ -34,11 +34,11 @@ public:
CoverageFilenamesSectionWriter(ArrayRef<StringRef> Filenames)
: Filenames(Filenames) {}
- /// \brief Write encoded filenames to the given output stream.
+ /// Write encoded filenames to the given output stream.
void write(raw_ostream &OS);
};
-/// \brief Writer for instrumentation based coverage mapping data.
+/// Writer for instrumentation based coverage mapping data.
class CoverageMappingWriter {
ArrayRef<unsigned> VirtualFileMapping;
ArrayRef<CounterExpression> Expressions;
@@ -51,7 +51,7 @@ public:
: VirtualFileMapping(VirtualFileMapping), Expressions(Expressions),
MappingRegions(MappingRegions) {}
- /// \brief Write encoded coverage mapping data to the given output stream.
+ /// Write encoded coverage mapping data to the given output stream.
void write(raw_ostream &OS);
};
diff --git a/contrib/llvm/include/llvm/ProfileData/GCOV.h b/contrib/llvm/include/llvm/ProfileData/GCOV.h
index 497f80b87b26..8500401e44ad 100644
--- a/contrib/llvm/include/llvm/ProfileData/GCOV.h
+++ b/contrib/llvm/include/llvm/ProfileData/GCOV.h
@@ -41,7 +41,7 @@ namespace GCOV {
enum GCOVVersion { V402, V404, V704 };
-/// \brief A struct for passing gcov options between functions.
+/// A struct for passing gcov options between functions.
struct Options {
Options(bool A, bool B, bool C, bool F, bool P, bool U, bool L, bool N)
: AllBlocks(A), BranchInfo(B), BranchCount(C), FuncCoverage(F),
@@ -376,6 +376,7 @@ private:
};
class FileInfo {
+protected:
// It is unlikely--but possible--for multiple functions to be on the same
// line.
// Therefore this typedef allows LineData.Functions to store multiple
@@ -428,7 +429,7 @@ public:
void print(raw_ostream &OS, StringRef MainFilename, StringRef GCNOFile,
StringRef GCDAFile);
-private:
+protected:
std::string getCoveragePath(StringRef Filename, StringRef MainFilename);
std::unique_ptr<raw_ostream> openCoveragePath(StringRef CoveragePath);
void printFunctionSummary(raw_ostream &OS, const FunctionVector &Funcs) const;
diff --git a/contrib/llvm/include/llvm/ProfileData/InstrProf.h b/contrib/llvm/include/llvm/ProfileData/InstrProf.h
index b08b78cd593c..206142b3565a 100644
--- a/contrib/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/contrib/llvm/include/llvm/ProfileData/InstrProf.h
@@ -425,9 +425,20 @@ private:
// A map from function runtime address to function name MD5 hash.
// This map is only populated and used by raw instr profile reader.
AddrHashMap AddrToMD5Map;
+ bool Sorted = false;
+
+ static StringRef getExternalSymbol() {
+ return "** External Symbol **";
+ }
+
+ // If the symtab is created by a series of calls to \c addFuncName, \c
+ // finalizeSymtab needs to be called before looking up function names.
+ // This is required because the underlying map is a vector (for space
+ // efficiency) which needs to be sorted.
+ inline void finalizeSymtab();
public:
- InstrProfSymtab() = default;
+ InstrProfSymtab() = default;
/// Create InstrProfSymtab from an object file section which
/// contains function PGO names. When section may contain raw
@@ -456,21 +467,17 @@ public:
/// \p IterRange. This interface is used by IndexedProfReader.
template <typename NameIterRange> Error create(const NameIterRange &IterRange);
- // If the symtab is created by a series of calls to \c addFuncName, \c
- // finalizeSymtab needs to be called before looking up function names.
- // This is required because the underlying map is a vector (for space
- // efficiency) which needs to be sorted.
- inline void finalizeSymtab();
-
/// Update the symtab by adding \p FuncName to the table. This interface
/// is used by the raw and text profile readers.
Error addFuncName(StringRef FuncName) {
if (FuncName.empty())
return make_error<InstrProfError>(instrprof_error::malformed);
auto Ins = NameTab.insert(FuncName);
- if (Ins.second)
+ if (Ins.second) {
MD5NameMap.push_back(std::make_pair(
IndexedInstrProf::ComputeHash(FuncName), Ins.first->getKey()));
+ Sorted = false;
+ }
return Error::success();
}
@@ -480,7 +487,8 @@ public:
AddrToMD5Map.push_back(std::make_pair(Addr, MD5Val));
}
- AddrHashMap &getAddrHashMap() { return AddrToMD5Map; }
+ /// Return a function's hash, or 0, if the function isn't in this SymTab.
+ uint64_t getFunctionHashFromAddress(uint64_t Address);
/// Return function's PGO name from the function name's symbol
/// address in the object file. If an error occurs, return
@@ -491,6 +499,16 @@ public:
/// If not found, return an empty string.
inline StringRef getFuncName(uint64_t FuncMD5Hash);
+ /// Just like getFuncName, except that it will return a non-empty StringRef
+ /// if the function is external to this symbol table. All such cases
+ /// will be represented using the same StringRef value.
+ inline StringRef getFuncNameOrExternalSymbol(uint64_t FuncMD5Hash);
+
+ /// True if Symbol is the value used to represent external symbols.
+ static bool isExternalSymbol(const StringRef &Symbol) {
+ return Symbol == InstrProfSymtab::getExternalSymbol();
+ }
+
/// Return function from the name's md5 hash. Return nullptr if not found.
inline Function *getFunction(uint64_t FuncMD5Hash);
@@ -524,14 +542,25 @@ Error InstrProfSymtab::create(const NameIterRange &IterRange) {
}
void InstrProfSymtab::finalizeSymtab() {
- std::sort(MD5NameMap.begin(), MD5NameMap.end(), less_first());
- std::sort(MD5FuncMap.begin(), MD5FuncMap.end(), less_first());
- std::sort(AddrToMD5Map.begin(), AddrToMD5Map.end(), less_first());
+ if (Sorted)
+ return;
+ llvm::sort(MD5NameMap.begin(), MD5NameMap.end(), less_first());
+ llvm::sort(MD5FuncMap.begin(), MD5FuncMap.end(), less_first());
+ llvm::sort(AddrToMD5Map.begin(), AddrToMD5Map.end(), less_first());
AddrToMD5Map.erase(std::unique(AddrToMD5Map.begin(), AddrToMD5Map.end()),
AddrToMD5Map.end());
+ Sorted = true;
+}
+
+StringRef InstrProfSymtab::getFuncNameOrExternalSymbol(uint64_t FuncMD5Hash) {
+ StringRef ret = getFuncName(FuncMD5Hash);
+ if (ret.empty())
+ return InstrProfSymtab::getExternalSymbol();
+ return ret;
}
StringRef InstrProfSymtab::getFuncName(uint64_t FuncMD5Hash) {
+ finalizeSymtab();
auto Result =
std::lower_bound(MD5NameMap.begin(), MD5NameMap.end(), FuncMD5Hash,
[](const std::pair<uint64_t, std::string> &LHS,
@@ -542,6 +571,7 @@ StringRef InstrProfSymtab::getFuncName(uint64_t FuncMD5Hash) {
}
Function* InstrProfSymtab::getFunction(uint64_t FuncMD5Hash) {
+ finalizeSymtab();
auto Result =
std::lower_bound(MD5FuncMap.begin(), MD5FuncMap.end(), FuncMD5Hash,
[](const std::pair<uint64_t, Function*> &LHS,
@@ -614,8 +644,6 @@ struct InstrProfRecord {
return *this;
}
- using ValueMapType = std::vector<std::pair<uint64_t, uint64_t>>;
-
/// Return the number of value profile kinds with non-zero number
/// of profile sites.
inline uint32_t getNumValueKinds() const;
@@ -649,7 +677,7 @@ struct InstrProfRecord {
/// Add ValueData for ValueKind at value Site.
void addValueData(uint32_t ValueKind, uint32_t Site,
InstrProfValueData *VData, uint32_t N,
- ValueMapType *ValueMap);
+ InstrProfSymtab *SymTab);
/// Merge the counts in \p Other into this one.
/// Optionally scale merged counts by \p Weight.
@@ -723,7 +751,7 @@ private:
// Map indirect call target name hash to name string.
uint64_t remapValue(uint64_t Value, uint32_t ValueKind,
- ValueMapType *HashKeys);
+ InstrProfSymtab *SymTab);
// Merge Value Profile data from Src record to this record for ValueKind.
// Scale merged value counts by \p Weight.
@@ -993,7 +1021,7 @@ template <> inline uint64_t getMagic<uint32_t>() {
// compiler-rt/lib/profile/InstrProfiling.h.
// It should also match the synthesized type in
// Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters.
-template <class IntPtrT> struct LLVM_ALIGNAS(8) ProfileData {
+template <class IntPtrT> struct alignas(8) ProfileData {
#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name;
#include "llvm/ProfileData/InstrProfData.inc"
};
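
To summarize the InstrProfSymtab changes above: finalizeSymtab() is now private and runs lazily on the first lookup after addFuncName(), and hashes that are not in the table can be mapped to a shared external-symbol marker. A small sketch of the new contract (the function name and the unknown hash are placeholders):

    #include "llvm/ProfileData/InstrProf.h"
    #include "llvm/Support/Error.h"
    #include <cassert>

    void symtabDemo() {
      llvm::InstrProfSymtab Symtab;
      // Inserting marks the table unsorted; no explicit finalizeSymtab() needed.
      if (llvm::Error E = Symtab.addFuncName("foo"))
        llvm::consumeError(std::move(E));
      uint64_t Hash = llvm::IndexedInstrProf::ComputeHash("foo");
      assert(Symtab.getFuncName(Hash) == "foo");     // lookup sorts on demand
      // A hash missing from the table yields the external-symbol marker.
      llvm::StringRef Ext = Symtab.getFuncNameOrExternalSymbol(~0ULL);
      assert(llvm::InstrProfSymtab::isExternalSymbol(Ext));
    }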
diff --git a/contrib/llvm/include/llvm/ProfileData/InstrProfData.inc b/contrib/llvm/include/llvm/ProfileData/InstrProfData.inc
index 6a98dc7b9b85..454620ed997a 100644
--- a/contrib/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/contrib/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -178,7 +178,7 @@ VALUE_PROF_FUNC_PARAM(uint64_t, LargeValue, Type::getInt64Ty(Ctx))
* functions are profiled by the instrumented code. The target addresses are
* written in the raw profile data and converted to target function name's MD5
* hash by the profile reader during deserialization. Typically, this happens
- * when the the raw profile data is read during profile merging.
+ * when the raw profile data is read during profile merging.
*
* For this remapping the ProfData is used. ProfData contains both the function
* name hash and the function address.
@@ -308,14 +308,14 @@ typedef struct ValueProfRecord {
#ifdef __cplusplus
/*!
- * \brief Return the number of value sites.
+ * Return the number of value sites.
*/
uint32_t getNumValueSites() const { return NumValueSites; }
/*!
- * \brief Read data from this record and save it to Record.
+ * Read data from this record and save it to Record.
*/
void deserializeTo(InstrProfRecord &Record,
- InstrProfRecord::ValueMapType *VMap);
+ InstrProfSymtab *SymTab);
/*
* In-place byte swap:
* Do byte swap for this instance. \c Old is the original order before
@@ -393,7 +393,7 @@ typedef struct ValueProfData {
* Read data from this data and save it to \c Record.
*/
void deserializeTo(InstrProfRecord &Record,
- InstrProfRecord::ValueMapType *VMap);
+ InstrProfSymtab *SymTab);
void operator delete(void *ptr) { ::operator delete(ptr); }
#endif
} ValueProfData;
@@ -458,7 +458,7 @@ getValueProfRecordHeaderSize(uint32_t NumValueSites);
#endif
/*!
- * \brief Return the \c ValueProfRecord header size including the
+ * Return the \c ValueProfRecord header size including the
* padding bytes.
*/
INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
@@ -471,7 +471,7 @@ uint32_t getValueProfRecordHeaderSize(uint32_t NumValueSites) {
}
/*!
- * \brief Return the total size of the value profile record including the
+ * Return the total size of the value profile record including the
* header and the value data.
*/
INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
@@ -482,7 +482,7 @@ uint32_t getValueProfRecordSize(uint32_t NumValueSites,
}
/*!
- * \brief Return the pointer to the start of value data array.
+ * Return the pointer to the start of value data array.
*/
INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
InstrProfValueData *getValueProfRecordValueData(ValueProfRecord *This) {
@@ -491,7 +491,7 @@ InstrProfValueData *getValueProfRecordValueData(ValueProfRecord *This) {
}
/*!
- * \brief Return the total number of value data for \c This record.
+ * Return the total number of value data for \c This record.
*/
INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
uint32_t getValueProfRecordNumValueData(ValueProfRecord *This) {
@@ -503,7 +503,7 @@ uint32_t getValueProfRecordNumValueData(ValueProfRecord *This) {
}
/*!
- * \brief Use this method to advance to the next \c This \c ValueProfRecord.
+ * Use this method to advance to the next \c This \c ValueProfRecord.
*/
INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
ValueProfRecord *getValueProfRecordNext(ValueProfRecord *This) {
@@ -514,7 +514,7 @@ ValueProfRecord *getValueProfRecordNext(ValueProfRecord *This) {
}
/*!
- * \brief Return the first \c ValueProfRecord instance.
+ * Return the first \c ValueProfRecord instance.
*/
INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
ValueProfRecord *getFirstValueProfRecord(ValueProfData *This) {
diff --git a/contrib/llvm/include/llvm/ProfileData/InstrProfReader.h b/contrib/llvm/include/llvm/ProfileData/InstrProfReader.h
index aa58ead1eda1..efc22dcd0d9a 100644
--- a/contrib/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/contrib/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -101,7 +101,7 @@ protected:
return make_error<InstrProfError>(Err);
}
- Error error(Error E) { return error(InstrProfError::take(std::move(E))); }
+ Error error(Error &&E) { return error(InstrProfError::take(std::move(E))); }
/// Clear the current error and return a successful one.
Error success() { return error(instrprof_error::success); }
@@ -199,8 +199,6 @@ private:
uint32_t ValueKindLast;
uint32_t CurValueDataSize;
- InstrProfRecord::ValueMapType FunctionPtrToNameMap;
-
public:
RawInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
: DataBuffer(std::move(DataBuffer)) {}
diff --git a/contrib/llvm/include/llvm/ProfileData/ProfileCommon.h b/contrib/llvm/include/llvm/ProfileData/ProfileCommon.h
index 51b065bcdb70..087588f06340 100644
--- a/contrib/llvm/include/llvm/ProfileData/ProfileCommon.h
+++ b/contrib/llvm/include/llvm/ProfileData/ProfileCommon.h
@@ -61,7 +61,7 @@ protected:
void computeDetailedSummary();
public:
- /// \brief A vector of useful cutoff values for detailed summary.
+ /// A vector of useful cutoff values for detailed summary.
static const ArrayRef<uint32_t> DefaultCutoffs;
};
diff --git a/contrib/llvm/include/llvm/ProfileData/SampleProf.h b/contrib/llvm/include/llvm/ProfileData/SampleProf.h
index 641631cc4ec9..0cd6dd2c2c0e 100644
--- a/contrib/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/contrib/llvm/include/llvm/ProfileData/SampleProf.h
@@ -78,11 +78,29 @@ struct is_error_code_enum<llvm::sampleprof_error> : std::true_type {};
namespace llvm {
namespace sampleprof {
-static inline uint64_t SPMagic() {
+enum SampleProfileFormat {
+ SPF_None = 0,
+ SPF_Text = 0x1,
+ SPF_Compact_Binary = 0x2,
+ SPF_GCC = 0x3,
+ SPF_Binary = 0xff
+};
+
+static inline uint64_t SPMagic(SampleProfileFormat Format = SPF_Binary) {
return uint64_t('S') << (64 - 8) | uint64_t('P') << (64 - 16) |
uint64_t('R') << (64 - 24) | uint64_t('O') << (64 - 32) |
uint64_t('F') << (64 - 40) | uint64_t('4') << (64 - 48) |
- uint64_t('2') << (64 - 56) | uint64_t(0xff);
+ uint64_t('2') << (64 - 56) | uint64_t(Format);
+}
+
+// Get the proper representation of a string in the input Format.
+static inline StringRef getRepInFormat(StringRef Name,
+ SampleProfileFormat Format,
+ std::string &GUIDBuf) {
+ if (Name.empty())
+ return Name;
+ GUIDBuf = std::to_string(Function::getGUID(Name));
+ return (Format == SPF_Compact_Binary) ? StringRef(GUIDBuf) : Name;
}
static inline uint64_t SPVersion() { return 103; }
@@ -359,7 +377,7 @@ public:
/// GUID to \p S. Also traverse the BodySamples to add hot CallTarget's GUID
/// to \p S.
void findInlinedFunctions(DenseSet<GlobalValue::GUID> &S, const Module *M,
- uint64_t Threshold) const {
+ uint64_t Threshold, bool isCompact) const {
if (TotalSamples <= Threshold)
return;
S.insert(Function::getGUID(Name));
@@ -370,11 +388,12 @@ public:
if (TS.getValue() > Threshold) {
Function *Callee = M->getFunction(TS.getKey());
if (!Callee || !Callee->getSubprogram())
- S.insert(Function::getGUID(TS.getKey()));
+ S.insert(isCompact ? std::stol(TS.getKey().data())
+ : Function::getGUID(TS.getKey()));
}
for (const auto &CS : CallsiteSamples)
for (const auto &NameFS : CS.second)
- NameFS.second.findInlinedFunctions(S, M, Threshold);
+ NameFS.second.findInlinedFunctions(S, M, Threshold, isCompact);
}
/// Set the name of the function.
@@ -383,6 +402,21 @@ public:
/// Return the function name.
const StringRef &getName() const { return Name; }
+ /// Returns the line offset to the start line of the subprogram.
+ /// We assume that a single function will not exceed 65535 LOC.
+ static unsigned getOffset(const DILocation *DIL);
+
+ /// Get the FunctionSamples of the inline instance where DIL originates
+ /// from.
+ ///
+ /// The FunctionSamples of the instruction (Machine or IR) associated to
+ /// \p DIL is the inlined instance in which that instruction is coming from.
+ /// We traverse the inline stack of that instruction, and match it with the
+ /// tree nodes in the profile.
+ ///
+ /// \returns the FunctionSamples pointer to the inlined instance.
+ const FunctionSamples *findFunctionSamples(const DILocation *DIL) const;
+
private:
/// Mangled name of the function.
StringRef Name;
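
The SampleProf.h hunks above make the low byte of the magic a format tag and add getRepInFormat(), which rewrites a function name to its GUID string for the compact binary format. A short sketch of both (the function name "foo" is a placeholder):

    #include "llvm/IR/Function.h"
    #include "llvm/ProfileData/SampleProf.h"
    #include <cassert>

    void formatDemo() {
      using namespace llvm::sampleprof;
      // Only the low byte differs between the binary encodings.
      assert((SPMagic() & 0xff) == SPF_Binary);                            // 0xff
      assert((SPMagic(SPF_Compact_Binary) & 0xff) == SPF_Compact_Binary);  // 0x02
      // Compact binary profiles key functions by GUID string, not by name.
      std::string GUIDBuf;
      llvm::StringRef Key = getRepInFormat("foo", SPF_Compact_Binary, GUIDBuf);
      assert(Key == GUIDBuf);   // Key points into the caller-provided buffer
    }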
diff --git a/contrib/llvm/include/llvm/ProfileData/SampleProfReader.h b/contrib/llvm/include/llvm/ProfileData/SampleProfReader.h
index 0e9ab2dc60ee..0617b05e8d4f 100644
--- a/contrib/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/contrib/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -235,7 +235,7 @@ class raw_ostream;
namespace sampleprof {
-/// \brief Sample-based profile reader.
+/// Sample-based profile reader.
///
/// Each profile contains sample counts for all the functions
/// executed. Inside each function, statements are annotated with the
@@ -264,105 +264,113 @@ namespace sampleprof {
/// compact and I/O efficient. They can both be used interchangeably.
class SampleProfileReader {
public:
- SampleProfileReader(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
- : Profiles(0), Ctx(C), Buffer(std::move(B)) {}
+ SampleProfileReader(std::unique_ptr<MemoryBuffer> B, LLVMContext &C,
+ SampleProfileFormat Format = SPF_None)
+ : Profiles(0), Ctx(C), Buffer(std::move(B)), Format(Format) {}
virtual ~SampleProfileReader() = default;
- /// \brief Read and validate the file header.
+ /// Read and validate the file header.
virtual std::error_code readHeader() = 0;
- /// \brief Read sample profiles from the associated file.
+ /// Read sample profiles from the associated file.
virtual std::error_code read() = 0;
- /// \brief Print the profile for \p FName on stream \p OS.
+ /// Print the profile for \p FName on stream \p OS.
void dumpFunctionProfile(StringRef FName, raw_ostream &OS = dbgs());
- /// \brief Print all the profiles on stream \p OS.
+ /// Print all the profiles on stream \p OS.
void dump(raw_ostream &OS = dbgs());
- /// \brief Return the samples collected for function \p F.
+ /// Return the samples collected for function \p F.
FunctionSamples *getSamplesFor(const Function &F) {
// The function name may have been updated by adding suffix. In sample
// profile, the function names are all stripped, so we need to strip
// the function name suffix before matching with profile.
- if (Profiles.count(F.getName().split('.').first))
- return &Profiles[(F.getName().split('.').first)];
+ StringRef Fname = F.getName().split('.').first;
+ std::string FGUID;
+ Fname = getRepInFormat(Fname, getFormat(), FGUID);
+ if (Profiles.count(Fname))
+ return &Profiles[Fname];
return nullptr;
}
- /// \brief Return all the profiles.
+ /// Return all the profiles.
StringMap<FunctionSamples> &getProfiles() { return Profiles; }
- /// \brief Report a parse error message.
+ /// Report a parse error message.
void reportError(int64_t LineNumber, Twine Msg) const {
Ctx.diagnose(DiagnosticInfoSampleProfile(Buffer->getBufferIdentifier(),
LineNumber, Msg));
}
- /// \brief Create a sample profile reader appropriate to the file format.
+ /// Create a sample profile reader appropriate to the file format.
static ErrorOr<std::unique_ptr<SampleProfileReader>>
create(const Twine &Filename, LLVMContext &C);
- /// \brief Create a sample profile reader from the supplied memory buffer.
+ /// Create a sample profile reader from the supplied memory buffer.
static ErrorOr<std::unique_ptr<SampleProfileReader>>
create(std::unique_ptr<MemoryBuffer> &B, LLVMContext &C);
- /// \brief Return the profile summary.
+ /// Return the profile summary.
ProfileSummary &getSummary() { return *(Summary.get()); }
+ /// \brief Return the profile format.
+ SampleProfileFormat getFormat() { return Format; }
+
protected:
- /// \brief Map every function to its associated profile.
+ /// Map every function to its associated profile.
///
/// The profile of every function executed at runtime is collected
/// in the structure FunctionSamples. This maps function objects
/// to their corresponding profiles.
StringMap<FunctionSamples> Profiles;
- /// \brief LLVM context used to emit diagnostics.
+ /// LLVM context used to emit diagnostics.
LLVMContext &Ctx;
- /// \brief Memory buffer holding the profile file.
+ /// Memory buffer holding the profile file.
std::unique_ptr<MemoryBuffer> Buffer;
- /// \brief Profile summary information.
+ /// Profile summary information.
std::unique_ptr<ProfileSummary> Summary;
- /// \brief Compute summary for this profile.
+ /// Compute summary for this profile.
void computeSummary();
+
+ /// \brief The format of sample.
+ SampleProfileFormat Format = SPF_None;
};
class SampleProfileReaderText : public SampleProfileReader {
public:
SampleProfileReaderText(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
- : SampleProfileReader(std::move(B), C) {}
+ : SampleProfileReader(std::move(B), C, SPF_Text) {}
- /// \brief Read and validate the file header.
+ /// Read and validate the file header.
std::error_code readHeader() override { return sampleprof_error::success; }
- /// \brief Read sample profiles from the associated file.
+ /// Read sample profiles from the associated file.
std::error_code read() override;
- /// \brief Return true if \p Buffer is in the format supported by this class.
+ /// Return true if \p Buffer is in the format supported by this class.
static bool hasFormat(const MemoryBuffer &Buffer);
};
class SampleProfileReaderBinary : public SampleProfileReader {
public:
- SampleProfileReaderBinary(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
- : SampleProfileReader(std::move(B), C) {}
+ SampleProfileReaderBinary(std::unique_ptr<MemoryBuffer> B, LLVMContext &C,
+ SampleProfileFormat Format = SPF_None)
+ : SampleProfileReader(std::move(B), C, Format) {}
- /// \brief Read and validate the file header.
+ /// Read and validate the file header.
std::error_code readHeader() override;
- /// \brief Read sample profiles from the associated file.
+ /// Read sample profiles from the associated file.
std::error_code read() override;
- /// \brief Return true if \p Buffer is in the format supported by this class.
- static bool hasFormat(const MemoryBuffer &Buffer);
-
protected:
- /// \brief Read a numeric value of type T from the profile.
+ /// Read a numeric value of type T from the profile.
///
/// If an error occurs during decoding, a diagnostic message is emitted and
/// EC is set.
@@ -370,7 +378,7 @@ protected:
/// \returns the read value.
template <typename T> ErrorOr<T> readNumber();
- /// \brief Read a string from the profile.
+ /// Read a string from the profile.
///
/// If an error occurs during decoding, a diagnostic message is emitted and
/// EC is set.
@@ -378,29 +386,68 @@ protected:
/// \returns the read value.
ErrorOr<StringRef> readString();
- /// Read a string indirectly via the name table.
- ErrorOr<StringRef> readStringFromTable();
+ /// Read the string index and check whether it overflows the table.
+ template <typename T> inline ErrorOr<uint32_t> readStringIndex(T &Table);
- /// \brief Return true if we've reached the end of file.
+ /// Return true if we've reached the end of file.
bool at_eof() const { return Data >= End; }
/// Read the contents of the given profile instance.
std::error_code readProfile(FunctionSamples &FProfile);
- /// \brief Points to the current location in the buffer.
+ /// Points to the current location in the buffer.
const uint8_t *Data = nullptr;
- /// \brief Points to the end of the buffer.
+ /// Points to the end of the buffer.
const uint8_t *End = nullptr;
+private:
+ std::error_code readSummaryEntry(std::vector<ProfileSummaryEntry> &Entries);
+ virtual std::error_code verifySPMagic(uint64_t Magic) = 0;
+
+ /// Read profile summary.
+ std::error_code readSummary();
+
+ /// Read the whole name table.
+ virtual std::error_code readNameTable() = 0;
+
+ /// Read a string indirectly via the name table.
+ virtual ErrorOr<StringRef> readStringFromTable() = 0;
+};
+
+class SampleProfileReaderRawBinary : public SampleProfileReaderBinary {
+private:
/// Function name table.
std::vector<StringRef> NameTable;
+ virtual std::error_code verifySPMagic(uint64_t Magic) override;
+ virtual std::error_code readNameTable() override;
+ /// Read a string indirectly via the name table.
+ virtual ErrorOr<StringRef> readStringFromTable() override;
+
+public:
+ SampleProfileReaderRawBinary(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
+ : SampleProfileReaderBinary(std::move(B), C, SPF_Binary) {}
+
+ /// \brief Return true if \p Buffer is in the format supported by this class.
+ static bool hasFormat(const MemoryBuffer &Buffer);
+};
+class SampleProfileReaderCompactBinary : public SampleProfileReaderBinary {
private:
- std::error_code readSummaryEntry(std::vector<ProfileSummaryEntry> &Entries);
+ /// Function name table.
+ std::vector<std::string> NameTable;
+ virtual std::error_code verifySPMagic(uint64_t Magic) override;
+ virtual std::error_code readNameTable() override;
+ /// Read a string indirectly via the name table.
+ virtual ErrorOr<StringRef> readStringFromTable() override;
- /// \brief Read profile summary.
- std::error_code readSummary();
+public:
+ SampleProfileReaderCompactBinary(std::unique_ptr<MemoryBuffer> B,
+ LLVMContext &C)
+ : SampleProfileReaderBinary(std::move(B), C, SPF_Compact_Binary) {}
+
+ /// \brief Return true if \p Buffer is in the format supported by this class.
+ static bool hasFormat(const MemoryBuffer &Buffer);
};
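
With the split into raw and compact binary readers above, the reader now records its format and getSamplesFor() routes lookups through getRepInFormat(). A rough end-to-end sketch (the profile path is a placeholder and error handling is abbreviated):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/ProfileData/SampleProfReader.h"

    void dumpSamplesFor(const llvm::Function &F, llvm::LLVMContext &Ctx) {
      using namespace llvm::sampleprof;
      auto ReaderOrErr = SampleProfileReader::create("perf.llvmprof", Ctx);
      if (!ReaderOrErr)                      // create() already diagnosed
        return;
      std::unique_ptr<SampleProfileReader> Reader = std::move(*ReaderOrErr);
      if (Reader->read())                    // non-zero std::error_code on failure
        return;
      // getSamplesFor() strips any ".suffix" and, for SPF_Compact_Binary,
      // rewrites the key to a GUID string before probing the profile map.
      if (FunctionSamples *FS = Reader->getSamplesFor(F))
        Reader->dumpFunctionProfile(FS->getName());
    }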
using InlineCallStack = SmallVector<FunctionSamples *, 10>;
@@ -421,15 +468,16 @@ enum HistType {
class SampleProfileReaderGCC : public SampleProfileReader {
public:
SampleProfileReaderGCC(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
- : SampleProfileReader(std::move(B), C), GcovBuffer(Buffer.get()) {}
+ : SampleProfileReader(std::move(B), C, SPF_GCC),
+ GcovBuffer(Buffer.get()) {}
- /// \brief Read and validate the file header.
+ /// Read and validate the file header.
std::error_code readHeader() override;
- /// \brief Read sample profiles from the associated file.
+ /// Read sample profiles from the associated file.
std::error_code read() override;
- /// \brief Return true if \p Buffer is in the format supported by this class.
+ /// Return true if \p Buffer is in the format supported by this class.
static bool hasFormat(const MemoryBuffer &Buffer);
protected:
@@ -441,7 +489,7 @@ protected:
template <typename T> ErrorOr<T> readNumber();
ErrorOr<StringRef> readString();
- /// \brief Read the section tag and check that it's the same as \p Expected.
+ /// Read the section tag and check that it's the same as \p Expected.
std::error_code readSectionTag(uint32_t Expected);
/// GCOV buffer containing the profile.
diff --git a/contrib/llvm/include/llvm/ProfileData/SampleProfWriter.h b/contrib/llvm/include/llvm/ProfileData/SampleProfWriter.h
index 86af1038d74e..74dc839ff049 100644
--- a/contrib/llvm/include/llvm/ProfileData/SampleProfWriter.h
+++ b/contrib/llvm/include/llvm/ProfileData/SampleProfWriter.h
@@ -23,14 +23,13 @@
#include <algorithm>
#include <cstdint>
#include <memory>
+#include <set>
#include <system_error>
namespace llvm {
namespace sampleprof {
-enum SampleProfileFormat { SPF_None = 0, SPF_Text, SPF_Binary, SPF_GCC };
-
-/// \brief Sample-based profile writer. Base class.
+/// Sample-based profile writer. Base class.
class SampleProfileWriter {
public:
virtual ~SampleProfileWriter() = default;
@@ -62,21 +61,21 @@ protected:
SampleProfileWriter(std::unique_ptr<raw_ostream> &OS)
: OutputStream(std::move(OS)) {}
- /// \brief Write a file header for the profile file.
+ /// Write a file header for the profile file.
virtual std::error_code
writeHeader(const StringMap<FunctionSamples> &ProfileMap) = 0;
- /// \brief Output stream where to emit the profile to.
+ /// Output stream where to emit the profile to.
std::unique_ptr<raw_ostream> OutputStream;
- /// \brief Profile summary.
+ /// Profile summary.
std::unique_ptr<ProfileSummary> Summary;
- /// \brief Compute summary for this profile.
+ /// Compute summary for this profile.
void computeSummary(const StringMap<FunctionSamples> &ProfileMap);
};
-/// \brief Sample-based profile writer (text format).
+/// Sample-based profile writer (text format).
class SampleProfileWriterText : public SampleProfileWriter {
public:
std::error_code write(const FunctionSamples &S) override;
@@ -101,32 +100,49 @@ private:
SampleProfileFormat Format);
};
-/// \brief Sample-based profile writer (binary format).
+/// Sample-based profile writer (binary format).
class SampleProfileWriterBinary : public SampleProfileWriter {
public:
std::error_code write(const FunctionSamples &S) override;
-
-protected:
SampleProfileWriterBinary(std::unique_ptr<raw_ostream> &OS)
: SampleProfileWriter(OS) {}
- std::error_code
- writeHeader(const StringMap<FunctionSamples> &ProfileMap) override;
+protected:
+ virtual std::error_code writeNameTable() = 0;
+ virtual std::error_code writeMagicIdent() = 0;
+ std::error_code writeHeader(const StringMap<FunctionSamples> &ProfileMap) override;
std::error_code writeSummary();
std::error_code writeNameIdx(StringRef FName);
std::error_code writeBody(const FunctionSamples &S);
+ inline void stablizeNameTable(std::set<StringRef> &V);
+
+ MapVector<StringRef, uint32_t> NameTable;
private:
void addName(StringRef FName);
void addNames(const FunctionSamples &S);
- MapVector<StringRef, uint32_t> NameTable;
-
friend ErrorOr<std::unique_ptr<SampleProfileWriter>>
SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
SampleProfileFormat Format);
};
+class SampleProfileWriterRawBinary : public SampleProfileWriterBinary {
+ using SampleProfileWriterBinary::SampleProfileWriterBinary;
+
+protected:
+ virtual std::error_code writeNameTable() override;
+ virtual std::error_code writeMagicIdent() override;
+};
+
+class SampleProfileWriterCompactBinary : public SampleProfileWriterBinary {
+ using SampleProfileWriterBinary::SampleProfileWriterBinary;
+
+protected:
+ virtual std::error_code writeNameTable() override;
+ virtual std::error_code writeMagicIdent() override;
+};
+
} // end namespace sampleprof
} // end namespace llvm
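
On the writer side, the base class now delegates the magic ident and name table to the raw and compact subclasses, and the existing factory picks the subclass from the format tag. A sketch of emitting one function's record in the compact encoding (the output file name is a placeholder, and only the single-record write shown in this header is used):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ProfileData/SampleProfWriter.h"
    #include "llvm/Support/FileSystem.h"
    #include "llvm/Support/raw_ostream.h"

    std::error_code emitCompact(const llvm::sampleprof::FunctionSamples &S) {
      using namespace llvm::sampleprof;
      std::error_code EC;
      std::unique_ptr<llvm::raw_ostream> OS =
          llvm::make_unique<llvm::raw_fd_ostream>("out.profdata", EC,
                                                  llvm::sys::fs::F_None);
      if (EC)
        return EC;
      auto WriterOrErr = SampleProfileWriter::create(OS, SPF_Compact_Binary);
      if (std::error_code E = WriterOrErr.getError())
        return E;
      return WriterOrErr.get()->write(S);   // serialize this function's samples
    }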
diff --git a/contrib/llvm/include/llvm/Support/AArch64TargetParser.def b/contrib/llvm/include/llvm/Support/AArch64TargetParser.def
index 30c7924ea5f1..6772e5f9b734 100644
--- a/contrib/llvm/include/llvm/Support/AArch64TargetParser.def
+++ b/contrib/llvm/include/llvm/Support/AArch64TargetParser.def
@@ -35,6 +35,11 @@ AARCH64_ARCH("armv8.3-a", ARMV8_3A, "8.3-A", "v8.3a",
(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
AArch64::AEK_RDM | AArch64::AEK_RCPC))
+AARCH64_ARCH("armv8.4-a", ARMV8_4A, "8.4-A", "v8.4a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
+ (AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
+ AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
+ AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD))
#undef AARCH64_ARCH
#ifndef AARCH64_ARCH_EXT_NAME
@@ -47,6 +52,10 @@ AARCH64_ARCH_EXT_NAME("crc", AArch64::AEK_CRC, "+crc", "-crc")
AARCH64_ARCH_EXT_NAME("lse", AArch64::AEK_LSE, "+lse", "-lse")
AARCH64_ARCH_EXT_NAME("rdm", AArch64::AEK_RDM, "+rdm", "-rdm")
AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto")
+AARCH64_ARCH_EXT_NAME("sm4", AArch64::AEK_SM4, "+sm4", "-sm4")
+AARCH64_ARCH_EXT_NAME("sha3", AArch64::AEK_SHA3, "+sha3", "-sha3")
+AARCH64_ARCH_EXT_NAME("sha2", AArch64::AEK_SHA2, "+sha2", "-sha2")
+AARCH64_ARCH_EXT_NAME("aes", AArch64::AEK_AES, "+aes", "-aes")
AARCH64_ARCH_EXT_NAME("dotprod", AArch64::AEK_DOTPROD, "+dotprod","-dotprod")
AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8")
AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon")
@@ -82,6 +91,8 @@ AARCH64_CPU_NAME("exynos-m2", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("exynos-m3", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
+AARCH64_CPU_NAME("exynos-m4", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
+ (AArch64::AEK_CRC))
AARCH64_CPU_NAME("falkor", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC | AArch64::AEK_RDM))
AARCH64_CPU_NAME("saphira", ARMV8_3A, FK_CRYPTO_NEON_FP_ARMV8, false,
diff --git a/contrib/llvm/include/llvm/Support/AMDGPUKernelDescriptor.h b/contrib/llvm/include/llvm/Support/AMDGPUKernelDescriptor.h
deleted file mode 100644
index ce2c0c1c959e..000000000000
--- a/contrib/llvm/include/llvm/Support/AMDGPUKernelDescriptor.h
+++ /dev/null
@@ -1,139 +0,0 @@
-//===--- AMDGPUKernelDescriptor.h -------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief AMDGPU kernel descriptor definitions. For more information, visit
-/// https://llvm.org/docs/AMDGPUUsage.html#kernel-descriptor-for-gfx6-gfx9
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_AMDGPUKERNELDESCRIPTOR_H
-#define LLVM_SUPPORT_AMDGPUKERNELDESCRIPTOR_H
-
-#include <cstdint>
-
-// Creates enumeration entries used for packing bits into integers. Enumeration
-// entries include bit shift amount, bit width, and bit mask.
-#define AMDGPU_BITS_ENUM_ENTRY(name, shift, width) \
- name ## _SHIFT = (shift), \
- name ## _WIDTH = (width), \
- name = (((1 << (width)) - 1) << (shift)) \
-
-// Gets bits for specified bit mask from specified source.
-#define AMDGPU_BITS_GET(src, mask) \
- ((src & mask) >> mask ## _SHIFT) \
-
-// Sets bits for specified bit mask in specified destination.
-#define AMDGPU_BITS_SET(dst, mask, val) \
- dst &= (~(1 << mask ## _SHIFT) & ~mask); \
- dst |= (((val) << mask ## _SHIFT) & mask) \
-
-namespace llvm {
-namespace AMDGPU {
-namespace HSAKD {
-
-/// \brief Floating point rounding modes.
-enum : uint8_t {
- AMDGPU_FLOAT_ROUND_MODE_NEAR_EVEN = 0,
- AMDGPU_FLOAT_ROUND_MODE_PLUS_INFINITY = 1,
- AMDGPU_FLOAT_ROUND_MODE_MINUS_INFINITY = 2,
- AMDGPU_FLOAT_ROUND_MODE_ZERO = 3,
-};
-
-/// \brief Floating point denorm modes.
-enum : uint8_t {
- AMDGPU_FLOAT_DENORM_MODE_FLUSH_SRC_DST = 0,
- AMDGPU_FLOAT_DENORM_MODE_FLUSH_DST = 1,
- AMDGPU_FLOAT_DENORM_MODE_FLUSH_SRC = 2,
- AMDGPU_FLOAT_DENORM_MODE_FLUSH_NONE = 3,
-};
-
-/// \brief System VGPR workitem IDs.
-enum : uint8_t {
- AMDGPU_SYSTEM_VGPR_WORKITEM_ID_X = 0,
- AMDGPU_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1,
- AMDGPU_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2,
- AMDGPU_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3,
-};
-
-/// \brief Compute program resource register one layout.
-enum ComputePgmRsrc1 {
- AMDGPU_BITS_ENUM_ENTRY(GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
- AMDGPU_BITS_ENUM_ENTRY(GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
- AMDGPU_BITS_ENUM_ENTRY(PRIORITY, 10, 2),
- AMDGPU_BITS_ENUM_ENTRY(FLOAT_ROUND_MODE_32, 12, 2),
- AMDGPU_BITS_ENUM_ENTRY(FLOAT_ROUND_MODE_16_64, 14, 2),
- AMDGPU_BITS_ENUM_ENTRY(FLOAT_DENORM_MODE_32, 16, 2),
- AMDGPU_BITS_ENUM_ENTRY(FLOAT_DENORM_MODE_16_64, 18, 2),
- AMDGPU_BITS_ENUM_ENTRY(PRIV, 20, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_DX10_CLAMP, 21, 1),
- AMDGPU_BITS_ENUM_ENTRY(DEBUG_MODE, 22, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_IEEE_MODE, 23, 1),
- AMDGPU_BITS_ENUM_ENTRY(BULKY, 24, 1),
- AMDGPU_BITS_ENUM_ENTRY(CDBG_USER, 25, 1),
- AMDGPU_BITS_ENUM_ENTRY(FP16_OVFL, 26, 1),
- AMDGPU_BITS_ENUM_ENTRY(RESERVED0, 27, 5),
-};
-
-/// \brief Compute program resource register two layout.
-enum ComputePgmRsrc2 {
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_OFFSET, 0, 1),
- AMDGPU_BITS_ENUM_ENTRY(USER_SGPR_COUNT, 1, 5),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_TRAP_HANDLER, 6, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_SGPR_WORKGROUP_INFO, 10, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_VGPR_WORKITEM_ID, 11, 2),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_MEMORY, 14, 1),
- AMDGPU_BITS_ENUM_ENTRY(GRANULATED_LDS_SIZE, 15, 9),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1),
- AMDGPU_BITS_ENUM_ENTRY(ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, 30, 1),
- AMDGPU_BITS_ENUM_ENTRY(RESERVED1, 31, 1),
-};
-
-/// \brief Kernel descriptor layout. This layout should be kept backwards
-/// compatible as it is consumed by the command processor.
-struct KernelDescriptor final {
- uint32_t GroupSegmentFixedSize;
- uint32_t PrivateSegmentFixedSize;
- uint32_t MaxFlatWorkGroupSize;
- uint64_t IsDynamicCallStack : 1;
- uint64_t IsXNACKEnabled : 1;
- uint64_t Reserved0 : 30;
- int64_t KernelCodeEntryByteOffset;
- uint64_t Reserved1[3];
- uint32_t ComputePgmRsrc1;
- uint32_t ComputePgmRsrc2;
- uint64_t EnableSGPRPrivateSegmentBuffer : 1;
- uint64_t EnableSGPRDispatchPtr : 1;
- uint64_t EnableSGPRQueuePtr : 1;
- uint64_t EnableSGPRKernargSegmentPtr : 1;
- uint64_t EnableSGPRDispatchID : 1;
- uint64_t EnableSGPRFlatScratchInit : 1;
- uint64_t EnableSGPRPrivateSegmentSize : 1;
- uint64_t EnableSGPRGridWorkgroupCountX : 1;
- uint64_t EnableSGPRGridWorkgroupCountY : 1;
- uint64_t EnableSGPRGridWorkgroupCountZ : 1;
- uint64_t Reserved2 : 54;
-
- KernelDescriptor() = default;
-};
-
-} // end namespace HSAKD
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_SUPPORT_AMDGPUKERNELDESCRIPTOR_H
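
The deleted header above is superseded by AMDHSAKernelDescriptor.h, added later in this diff with the same SHIFT/WIDTH/mask pattern. As a worked example of that pattern, ENABLE_VGPR_WORKITEM_ID occupies bits [12:11] of COMPUTE_PGM_RSRC2, so its mask is 0x1800; setting it to SYSTEM_VGPR_WORKITEM_ID_X_Y (1) stores 0x800, and reading it back shifts that down to 1. The sketch below assumes the new header from this patch:

    #include "llvm/Support/AMDHSAKernelDescriptor.h"
    #include <cassert>
    #include <cstdint>

    void rsrc2Demo() {
      using namespace llvm::amdhsa;
      uint32_t Rsrc2 = 0;
      // Expands to: Rsrc2 &= ~MASK; Rsrc2 |= (VAL << SHIFT) & MASK;
      AMDHSA_BITS_SET(Rsrc2, COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID,
                      SYSTEM_VGPR_WORKITEM_ID_X_Y);
      assert(Rsrc2 == 0x800u);
      assert(AMDHSA_BITS_GET(Rsrc2,
                             COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID) == 1);
    }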
diff --git a/contrib/llvm/include/llvm/Support/AMDGPUMetadata.h b/contrib/llvm/include/llvm/Support/AMDGPUMetadata.h
index 00039a75c51d..667fb3f3da43 100644
--- a/contrib/llvm/include/llvm/Support/AMDGPUMetadata.h
+++ b/contrib/llvm/include/llvm/Support/AMDGPUMetadata.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU metadata definitions and in-memory representations.
+/// AMDGPU metadata definitions and in-memory representations.
///
//
//===----------------------------------------------------------------------===//
@@ -29,17 +29,17 @@ namespace AMDGPU {
//===----------------------------------------------------------------------===//
namespace HSAMD {
-/// \brief HSA metadata major version.
+/// HSA metadata major version.
constexpr uint32_t VersionMajor = 1;
-/// \brief HSA metadata minor version.
+/// HSA metadata minor version.
constexpr uint32_t VersionMinor = 0;
-/// \brief HSA metadata beginning assembler directive.
+/// HSA metadata beginning assembler directive.
constexpr char AssemblerDirectiveBegin[] = ".amd_amdgpu_hsa_metadata";
-/// \brief HSA metadata ending assembler directive.
+/// HSA metadata ending assembler directive.
constexpr char AssemblerDirectiveEnd[] = ".end_amd_amdgpu_hsa_metadata";
-/// \brief Access qualifiers.
+/// Access qualifiers.
enum class AccessQualifier : uint8_t {
Default = 0,
ReadOnly = 1,
@@ -48,7 +48,7 @@ enum class AccessQualifier : uint8_t {
Unknown = 0xff
};
-/// \brief Address space qualifiers.
+/// Address space qualifiers.
enum class AddressSpaceQualifier : uint8_t {
Private = 0,
Global = 1,
@@ -59,7 +59,7 @@ enum class AddressSpaceQualifier : uint8_t {
Unknown = 0xff
};
-/// \brief Value kinds.
+/// Value kinds.
enum class ValueKind : uint8_t {
ByValue = 0,
GlobalBuffer = 1,
@@ -78,7 +78,7 @@ enum class ValueKind : uint8_t {
Unknown = 0xff
};
-/// \brief Value types.
+/// Value types.
enum class ValueType : uint8_t {
Struct = 0,
I8 = 1,
@@ -106,29 +106,29 @@ namespace Kernel {
namespace Attrs {
namespace Key {
-/// \brief Key for Kernel::Attr::Metadata::mReqdWorkGroupSize.
+/// Key for Kernel::Attr::Metadata::mReqdWorkGroupSize.
constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize";
-/// \brief Key for Kernel::Attr::Metadata::mWorkGroupSizeHint.
+/// Key for Kernel::Attr::Metadata::mWorkGroupSizeHint.
constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint";
-/// \brief Key for Kernel::Attr::Metadata::mVecTypeHint.
+/// Key for Kernel::Attr::Metadata::mVecTypeHint.
constexpr char VecTypeHint[] = "VecTypeHint";
-/// \brief Key for Kernel::Attr::Metadata::mRuntimeHandle.
+/// Key for Kernel::Attr::Metadata::mRuntimeHandle.
constexpr char RuntimeHandle[] = "RuntimeHandle";
} // end namespace Key
-/// \brief In-memory representation of kernel attributes metadata.
+/// In-memory representation of kernel attributes metadata.
struct Metadata final {
- /// \brief 'reqd_work_group_size' attribute. Optional.
+ /// 'reqd_work_group_size' attribute. Optional.
std::vector<uint32_t> mReqdWorkGroupSize = std::vector<uint32_t>();
- /// \brief 'work_group_size_hint' attribute. Optional.
+ /// 'work_group_size_hint' attribute. Optional.
std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>();
- /// \brief 'vec_type_hint' attribute. Optional.
+ /// 'vec_type_hint' attribute. Optional.
std::string mVecTypeHint = std::string();
- /// \brief External symbol created by runtime to store the kernel address
+ /// External symbol created by runtime to store the kernel address
/// for enqueued blocks.
std::string mRuntimeHandle = std::string();
- /// \brief Default constructor.
+ /// Default constructor.
Metadata() = default;
/// \returns True if kernel attributes metadata is empty, false otherwise.
@@ -151,68 +151,68 @@ struct Metadata final {
namespace Arg {
namespace Key {
-/// \brief Key for Kernel::Arg::Metadata::mName.
+/// Key for Kernel::Arg::Metadata::mName.
constexpr char Name[] = "Name";
-/// \brief Key for Kernel::Arg::Metadata::mTypeName.
+/// Key for Kernel::Arg::Metadata::mTypeName.
constexpr char TypeName[] = "TypeName";
-/// \brief Key for Kernel::Arg::Metadata::mSize.
+/// Key for Kernel::Arg::Metadata::mSize.
constexpr char Size[] = "Size";
-/// \brief Key for Kernel::Arg::Metadata::mAlign.
+/// Key for Kernel::Arg::Metadata::mAlign.
constexpr char Align[] = "Align";
-/// \brief Key for Kernel::Arg::Metadata::mValueKind.
+/// Key for Kernel::Arg::Metadata::mValueKind.
constexpr char ValueKind[] = "ValueKind";
-/// \brief Key for Kernel::Arg::Metadata::mValueType.
+/// Key for Kernel::Arg::Metadata::mValueType.
constexpr char ValueType[] = "ValueType";
-/// \brief Key for Kernel::Arg::Metadata::mPointeeAlign.
+/// Key for Kernel::Arg::Metadata::mPointeeAlign.
constexpr char PointeeAlign[] = "PointeeAlign";
-/// \brief Key for Kernel::Arg::Metadata::mAddrSpaceQual.
+/// Key for Kernel::Arg::Metadata::mAddrSpaceQual.
constexpr char AddrSpaceQual[] = "AddrSpaceQual";
-/// \brief Key for Kernel::Arg::Metadata::mAccQual.
+/// Key for Kernel::Arg::Metadata::mAccQual.
constexpr char AccQual[] = "AccQual";
-/// \brief Key for Kernel::Arg::Metadata::mActualAccQual.
+/// Key for Kernel::Arg::Metadata::mActualAccQual.
constexpr char ActualAccQual[] = "ActualAccQual";
-/// \brief Key for Kernel::Arg::Metadata::mIsConst.
+/// Key for Kernel::Arg::Metadata::mIsConst.
constexpr char IsConst[] = "IsConst";
-/// \brief Key for Kernel::Arg::Metadata::mIsRestrict.
+/// Key for Kernel::Arg::Metadata::mIsRestrict.
constexpr char IsRestrict[] = "IsRestrict";
-/// \brief Key for Kernel::Arg::Metadata::mIsVolatile.
+/// Key for Kernel::Arg::Metadata::mIsVolatile.
constexpr char IsVolatile[] = "IsVolatile";
-/// \brief Key for Kernel::Arg::Metadata::mIsPipe.
+/// Key for Kernel::Arg::Metadata::mIsPipe.
constexpr char IsPipe[] = "IsPipe";
} // end namespace Key
-/// \brief In-memory representation of kernel argument metadata.
+/// In-memory representation of kernel argument metadata.
struct Metadata final {
- /// \brief Name. Optional.
+ /// Name. Optional.
std::string mName = std::string();
- /// \brief Type name. Optional.
+ /// Type name. Optional.
std::string mTypeName = std::string();
- /// \brief Size in bytes. Required.
+ /// Size in bytes. Required.
uint32_t mSize = 0;
- /// \brief Alignment in bytes. Required.
+ /// Alignment in bytes. Required.
uint32_t mAlign = 0;
- /// \brief Value kind. Required.
+ /// Value kind. Required.
ValueKind mValueKind = ValueKind::Unknown;
- /// \brief Value type. Required.
+ /// Value type. Required.
ValueType mValueType = ValueType::Unknown;
- /// \brief Pointee alignment in bytes. Optional.
+ /// Pointee alignment in bytes. Optional.
uint32_t mPointeeAlign = 0;
- /// \brief Address space qualifier. Optional.
+ /// Address space qualifier. Optional.
AddressSpaceQualifier mAddrSpaceQual = AddressSpaceQualifier::Unknown;
- /// \brief Access qualifier. Optional.
+ /// Access qualifier. Optional.
AccessQualifier mAccQual = AccessQualifier::Unknown;
- /// \brief Actual access qualifier. Optional.
+ /// Actual access qualifier. Optional.
AccessQualifier mActualAccQual = AccessQualifier::Unknown;
- /// \brief True if 'const' qualifier is specified. Optional.
+ /// True if 'const' qualifier is specified. Optional.
bool mIsConst = false;
- /// \brief True if 'restrict' qualifier is specified. Optional.
+ /// True if 'restrict' qualifier is specified. Optional.
bool mIsRestrict = false;
- /// \brief True if 'volatile' qualifier is specified. Optional.
+ /// True if 'volatile' qualifier is specified. Optional.
bool mIsVolatile = false;
- /// \brief True if 'pipe' qualifier is specified. Optional.
+ /// True if 'pipe' qualifier is specified. Optional.
bool mIsPipe = false;
- /// \brief Default constructor.
+ /// Default constructor.
Metadata() = default;
};
@@ -224,67 +224,67 @@ struct Metadata final {
namespace CodeProps {
namespace Key {
-/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentSize.
+/// Key for Kernel::CodeProps::Metadata::mKernargSegmentSize.
constexpr char KernargSegmentSize[] = "KernargSegmentSize";
-/// \brief Key for Kernel::CodeProps::Metadata::mGroupSegmentFixedSize.
+/// Key for Kernel::CodeProps::Metadata::mGroupSegmentFixedSize.
constexpr char GroupSegmentFixedSize[] = "GroupSegmentFixedSize";
-/// \brief Key for Kernel::CodeProps::Metadata::mPrivateSegmentFixedSize.
+/// Key for Kernel::CodeProps::Metadata::mPrivateSegmentFixedSize.
constexpr char PrivateSegmentFixedSize[] = "PrivateSegmentFixedSize";
-/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentAlign.
+/// Key for Kernel::CodeProps::Metadata::mKernargSegmentAlign.
constexpr char KernargSegmentAlign[] = "KernargSegmentAlign";
-/// \brief Key for Kernel::CodeProps::Metadata::mWavefrontSize.
+/// Key for Kernel::CodeProps::Metadata::mWavefrontSize.
constexpr char WavefrontSize[] = "WavefrontSize";
-/// \brief Key for Kernel::CodeProps::Metadata::mNumSGPRs.
+/// Key for Kernel::CodeProps::Metadata::mNumSGPRs.
constexpr char NumSGPRs[] = "NumSGPRs";
-/// \brief Key for Kernel::CodeProps::Metadata::mNumVGPRs.
+/// Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char NumVGPRs[] = "NumVGPRs";
-/// \brief Key for Kernel::CodeProps::Metadata::mMaxFlatWorkGroupSize.
+/// Key for Kernel::CodeProps::Metadata::mMaxFlatWorkGroupSize.
constexpr char MaxFlatWorkGroupSize[] = "MaxFlatWorkGroupSize";
-/// \brief Key for Kernel::CodeProps::Metadata::mIsDynamicCallStack.
+/// Key for Kernel::CodeProps::Metadata::mIsDynamicCallStack.
constexpr char IsDynamicCallStack[] = "IsDynamicCallStack";
-/// \brief Key for Kernel::CodeProps::Metadata::mIsXNACKEnabled.
+/// Key for Kernel::CodeProps::Metadata::mIsXNACKEnabled.
constexpr char IsXNACKEnabled[] = "IsXNACKEnabled";
-/// \brief Key for Kernel::CodeProps::Metadata::mNumSpilledSGPRs.
+/// Key for Kernel::CodeProps::Metadata::mNumSpilledSGPRs.
constexpr char NumSpilledSGPRs[] = "NumSpilledSGPRs";
-/// \brief Key for Kernel::CodeProps::Metadata::mNumSpilledVGPRs.
+/// Key for Kernel::CodeProps::Metadata::mNumSpilledVGPRs.
constexpr char NumSpilledVGPRs[] = "NumSpilledVGPRs";
} // end namespace Key
-/// \brief In-memory representation of kernel code properties metadata.
+/// In-memory representation of kernel code properties metadata.
struct Metadata final {
- /// \brief Size in bytes of the kernarg segment memory. Kernarg segment memory
+ /// Size in bytes of the kernarg segment memory. Kernarg segment memory
/// holds the values of the arguments to the kernel. Required.
uint64_t mKernargSegmentSize = 0;
- /// \brief Size in bytes of the group segment memory required by a workgroup.
+ /// Size in bytes of the group segment memory required by a workgroup.
/// This value does not include any dynamically allocated group segment memory
/// that may be added when the kernel is dispatched. Required.
uint32_t mGroupSegmentFixedSize = 0;
- /// \brief Size in bytes of the private segment memory required by a workitem.
+ /// Size in bytes of the private segment memory required by a workitem.
/// Private segment memory includes arg, spill and private segments. Required.
uint32_t mPrivateSegmentFixedSize = 0;
- /// \brief Maximum byte alignment of variables used by the kernel in the
+ /// Maximum byte alignment of variables used by the kernel in the
/// kernarg memory segment. Required.
uint32_t mKernargSegmentAlign = 0;
- /// \brief Wavefront size. Required.
+ /// Wavefront size. Required.
uint32_t mWavefrontSize = 0;
- /// \brief Total number of SGPRs used by a wavefront. Optional.
+ /// Total number of SGPRs used by a wavefront. Optional.
uint16_t mNumSGPRs = 0;
- /// \brief Total number of VGPRs used by a workitem. Optional.
+ /// Total number of VGPRs used by a workitem. Optional.
uint16_t mNumVGPRs = 0;
- /// \brief Maximum flat work-group size supported by the kernel. Optional.
+ /// Maximum flat work-group size supported by the kernel. Optional.
uint32_t mMaxFlatWorkGroupSize = 0;
- /// \brief True if the generated machine code is using a dynamically sized
+ /// True if the generated machine code is using a dynamically sized
/// call stack. Optional.
bool mIsDynamicCallStack = false;
- /// \brief True if the generated machine code is capable of supporting XNACK.
+ /// True if the generated machine code is capable of supporting XNACK.
/// Optional.
bool mIsXNACKEnabled = false;
- /// \brief Number of SGPRs spilled by a wavefront. Optional.
+ /// Number of SGPRs spilled by a wavefront. Optional.
uint16_t mNumSpilledSGPRs = 0;
- /// \brief Number of VGPRs spilled by a workitem. Optional.
+ /// Number of VGPRs spilled by a workitem. Optional.
uint16_t mNumSpilledVGPRs = 0;
- /// \brief Default constructor.
+ /// Default constructor.
Metadata() = default;
/// \returns True if kernel code properties metadata is empty, false
@@ -308,40 +308,40 @@ struct Metadata final {
namespace DebugProps {
namespace Key {
-/// \brief Key for Kernel::DebugProps::Metadata::mDebuggerABIVersion.
+/// Key for Kernel::DebugProps::Metadata::mDebuggerABIVersion.
constexpr char DebuggerABIVersion[] = "DebuggerABIVersion";
-/// \brief Key for Kernel::DebugProps::Metadata::mReservedNumVGPRs.
+/// Key for Kernel::DebugProps::Metadata::mReservedNumVGPRs.
constexpr char ReservedNumVGPRs[] = "ReservedNumVGPRs";
-/// \brief Key for Kernel::DebugProps::Metadata::mReservedFirstVGPR.
+/// Key for Kernel::DebugProps::Metadata::mReservedFirstVGPR.
constexpr char ReservedFirstVGPR[] = "ReservedFirstVGPR";
-/// \brief Key for Kernel::DebugProps::Metadata::mPrivateSegmentBufferSGPR.
+/// Key for Kernel::DebugProps::Metadata::mPrivateSegmentBufferSGPR.
constexpr char PrivateSegmentBufferSGPR[] = "PrivateSegmentBufferSGPR";
-/// \brief Key for
+/// Key for
/// Kernel::DebugProps::Metadata::mWavefrontPrivateSegmentOffsetSGPR.
constexpr char WavefrontPrivateSegmentOffsetSGPR[] =
"WavefrontPrivateSegmentOffsetSGPR";
} // end namespace Key
-/// \brief In-memory representation of kernel debug properties metadata.
+/// In-memory representation of kernel debug properties metadata.
struct Metadata final {
- /// \brief Debugger ABI version. Optional.
+ /// Debugger ABI version. Optional.
std::vector<uint32_t> mDebuggerABIVersion = std::vector<uint32_t>();
- /// \brief Consecutive number of VGPRs reserved for debugger use. Must be 0 if
+ /// Consecutive number of VGPRs reserved for debugger use. Must be 0 if
/// mDebuggerABIVersion is not set. Optional.
uint16_t mReservedNumVGPRs = 0;
- /// \brief First fixed VGPR reserved. Must be uint16_t(-1) if
+ /// First fixed VGPR reserved. Must be uint16_t(-1) if
/// mDebuggerABIVersion is not set or mReservedFirstVGPR is 0. Optional.
uint16_t mReservedFirstVGPR = uint16_t(-1);
- /// \brief Fixed SGPR of the first of 4 SGPRs used to hold the scratch V# used
+ /// Fixed SGPR of the first of 4 SGPRs used to hold the scratch V# used
/// for the entire kernel execution. Must be uint16_t(-1) if
/// mDebuggerABIVersion is not set or SGPR not used or not known. Optional.
uint16_t mPrivateSegmentBufferSGPR = uint16_t(-1);
- /// \brief Fixed SGPR used to hold the wave scratch offset for the entire
+ /// Fixed SGPR used to hold the wave scratch offset for the entire
/// kernel execution. Must be uint16_t(-1) if mDebuggerABIVersion is not set
/// or SGPR is not used or not known. Optional.
uint16_t mWavefrontPrivateSegmentOffsetSGPR = uint16_t(-1);
- /// \brief Default constructor.
+ /// Default constructor.
Metadata() = default;
/// \returns True if kernel debug properties metadata is empty, false
@@ -360,75 +360,75 @@ struct Metadata final {
} // end namespace DebugProps
namespace Key {
-/// \brief Key for Kernel::Metadata::mName.
+/// Key for Kernel::Metadata::mName.
constexpr char Name[] = "Name";
-/// \brief Key for Kernel::Metadata::mSymbolName.
+/// Key for Kernel::Metadata::mSymbolName.
constexpr char SymbolName[] = "SymbolName";
-/// \brief Key for Kernel::Metadata::mLanguage.
+/// Key for Kernel::Metadata::mLanguage.
constexpr char Language[] = "Language";
-/// \brief Key for Kernel::Metadata::mLanguageVersion.
+/// Key for Kernel::Metadata::mLanguageVersion.
constexpr char LanguageVersion[] = "LanguageVersion";
-/// \brief Key for Kernel::Metadata::mAttrs.
+/// Key for Kernel::Metadata::mAttrs.
constexpr char Attrs[] = "Attrs";
-/// \brief Key for Kernel::Metadata::mArgs.
+/// Key for Kernel::Metadata::mArgs.
constexpr char Args[] = "Args";
-/// \brief Key for Kernel::Metadata::mCodeProps.
+/// Key for Kernel::Metadata::mCodeProps.
constexpr char CodeProps[] = "CodeProps";
-/// \brief Key for Kernel::Metadata::mDebugProps.
+/// Key for Kernel::Metadata::mDebugProps.
constexpr char DebugProps[] = "DebugProps";
} // end namespace Key
-/// \brief In-memory representation of kernel metadata.
+/// In-memory representation of kernel metadata.
struct Metadata final {
- /// \brief Kernel source name. Required.
+ /// Kernel source name. Required.
std::string mName = std::string();
- /// \brief Kernel descriptor name. Required.
+ /// Kernel descriptor name. Required.
std::string mSymbolName = std::string();
- /// \brief Language. Optional.
+ /// Language. Optional.
std::string mLanguage = std::string();
- /// \brief Language version. Optional.
+ /// Language version. Optional.
std::vector<uint32_t> mLanguageVersion = std::vector<uint32_t>();
- /// \brief Attributes metadata. Optional.
+ /// Attributes metadata. Optional.
Attrs::Metadata mAttrs = Attrs::Metadata();
- /// \brief Arguments metadata. Optional.
+ /// Arguments metadata. Optional.
std::vector<Arg::Metadata> mArgs = std::vector<Arg::Metadata>();
- /// \brief Code properties metadata. Optional.
+ /// Code properties metadata. Optional.
CodeProps::Metadata mCodeProps = CodeProps::Metadata();
- /// \brief Debug properties metadata. Optional.
+ /// Debug properties metadata. Optional.
DebugProps::Metadata mDebugProps = DebugProps::Metadata();
- /// \brief Default constructor.
+ /// Default constructor.
Metadata() = default;
};
} // end namespace Kernel
namespace Key {
-/// \brief Key for HSA::Metadata::mVersion.
+/// Key for HSA::Metadata::mVersion.
constexpr char Version[] = "Version";
-/// \brief Key for HSA::Metadata::mPrintf.
+/// Key for HSA::Metadata::mPrintf.
constexpr char Printf[] = "Printf";
-/// \brief Key for HSA::Metadata::mKernels.
+/// Key for HSA::Metadata::mKernels.
constexpr char Kernels[] = "Kernels";
} // end namespace Key
-/// \brief In-memory representation of HSA metadata.
+/// In-memory representation of HSA metadata.
struct Metadata final {
- /// \brief HSA metadata version. Required.
+ /// HSA metadata version. Required.
std::vector<uint32_t> mVersion = std::vector<uint32_t>();
- /// \brief Printf metadata. Optional.
+ /// Printf metadata. Optional.
std::vector<std::string> mPrintf = std::vector<std::string>();
- /// \brief Kernels metadata. Required.
+ /// Kernels metadata. Required.
std::vector<Kernel::Metadata> mKernels = std::vector<Kernel::Metadata>();
- /// \brief Default constructor.
+ /// Default constructor.
Metadata() = default;
};
-/// \brief Converts \p String to \p HSAMetadata.
+/// Converts \p String to \p HSAMetadata.
std::error_code fromString(std::string String, Metadata &HSAMetadata);
-/// \brief Converts \p HSAMetadata to \p String.
+/// Converts \p HSAMetadata to \p String.
std::error_code toString(Metadata HSAMetadata, std::string &String);
} // end namespace HSAMD
@@ -438,10 +438,10 @@ std::error_code toString(Metadata HSAMetadata, std::string &String);
//===----------------------------------------------------------------------===//
namespace PALMD {
-/// \brief PAL metadata assembler directive.
+/// PAL metadata assembler directive.
constexpr char AssemblerDirective[] = ".amd_amdgpu_pal_metadata";
-/// \brief PAL metadata keys.
+/// PAL metadata keys.
enum Key : uint32_t {
LS_NUM_USED_VGPRS = 0x10000021,
HS_NUM_USED_VGPRS = 0x10000022,
@@ -468,10 +468,10 @@ enum Key : uint32_t {
CS_SCRATCH_SIZE = 0x1000004a
};
-/// \brief PAL metadata represented as a vector.
+/// PAL metadata represented as a vector.
typedef std::vector<uint32_t> Metadata;
-/// \brief Converts \p PALMetadata to \p String.
+/// Converts \p PALMetadata to \p String.
std::error_code toString(const Metadata &PALMetadata, std::string &String);
} // end namespace PALMD
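
The HSAMD changes above are comment-only, but the fromString/toString pair they document is easy to misuse, so a round-trip sketch may help; the kernel and symbol names are placeholders and the metadata is not a complete, valid blob:

    #include "llvm/Support/AMDGPUMetadata.h"

    // Round-trip a minimal HSA metadata object through its textual (YAML) form.
    std::error_code roundTrip(std::string &Out) {
      using namespace llvm::AMDGPU::HSAMD;
      Metadata MD;
      MD.mVersion = {VersionMajor, VersionMinor};   // {1, 0}
      Kernel::Metadata Kern;
      Kern.mName = "my_kernel";
      Kern.mSymbolName = "my_kernel@kd";
      MD.mKernels.push_back(Kern);
      if (std::error_code EC = toString(MD, Out))   // serialize
        return EC;
      Metadata Parsed;
      return fromString(Out, Parsed);               // parse it back
    }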
diff --git a/contrib/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/contrib/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
new file mode 100644
index 000000000000..751699e3a19f
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -0,0 +1,185 @@
+//===--- AMDHSAKernelDescriptor.h -----------------------------*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// AMDHSA kernel descriptor definitions. For more information, visit
+/// https://llvm.org/docs/AMDGPUUsage.html#kernel-descriptor
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_AMDHSAKERNELDESCRIPTOR_H
+#define LLVM_SUPPORT_AMDHSAKERNELDESCRIPTOR_H
+
+#include <cstddef>
+#include <cstdint>
+
+// Gets offset of specified member in specified type.
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE*)0)->MEMBER)
+#endif // offsetof
+
+// Creates enumeration entries used for packing bits into integers. Enumeration
+// entries include bit shift amount, bit width, and bit mask.
+#ifndef AMDHSA_BITS_ENUM_ENTRY
+#define AMDHSA_BITS_ENUM_ENTRY(NAME, SHIFT, WIDTH) \
+ NAME ## _SHIFT = (SHIFT), \
+ NAME ## _WIDTH = (WIDTH), \
+ NAME = (((1 << (WIDTH)) - 1) << (SHIFT))
+#endif // AMDHSA_BITS_ENUM_ENTRY
+
+// Gets bits for specified bit mask from specified source.
+#ifndef AMDHSA_BITS_GET
+#define AMDHSA_BITS_GET(SRC, MSK) ((SRC & MSK) >> MSK ## _SHIFT)
+#endif // AMDHSA_BITS_GET
+
+// Sets bits for specified bit mask in specified destination.
+#ifndef AMDHSA_BITS_SET
+#define AMDHSA_BITS_SET(DST, MSK, VAL) \
+ DST &= ~MSK; \
+ DST |= ((VAL << MSK ## _SHIFT) & MSK)
+#endif // AMDHSA_BITS_SET
+
+namespace llvm {
+namespace amdhsa {
+
+// Floating point rounding modes. Must match hardware definition.
+enum : uint8_t {
+ FLOAT_ROUND_MODE_NEAR_EVEN = 0,
+ FLOAT_ROUND_MODE_PLUS_INFINITY = 1,
+ FLOAT_ROUND_MODE_MINUS_INFINITY = 2,
+ FLOAT_ROUND_MODE_ZERO = 3,
+};
+
+// Floating point denorm modes. Must match hardware definition.
+enum : uint8_t {
+ FLOAT_DENORM_MODE_FLUSH_SRC_DST = 0,
+ FLOAT_DENORM_MODE_FLUSH_DST = 1,
+ FLOAT_DENORM_MODE_FLUSH_SRC = 2,
+ FLOAT_DENORM_MODE_FLUSH_NONE = 3,
+};
+
+// System VGPR workitem IDs. Must match hardware definition.
+enum : uint8_t {
+ SYSTEM_VGPR_WORKITEM_ID_X = 0,
+ SYSTEM_VGPR_WORKITEM_ID_X_Y = 1,
+ SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2,
+ SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3,
+};
+
+// Compute program resource register 1. Must match hardware definition.
+#define COMPUTE_PGM_RSRC1(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+ COMPUTE_PGM_RSRC1(GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
+ COMPUTE_PGM_RSRC1(GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
+ COMPUTE_PGM_RSRC1(PRIORITY, 10, 2),
+ COMPUTE_PGM_RSRC1(FLOAT_ROUND_MODE_32, 12, 2),
+ COMPUTE_PGM_RSRC1(FLOAT_ROUND_MODE_16_64, 14, 2),
+ COMPUTE_PGM_RSRC1(FLOAT_DENORM_MODE_32, 16, 2),
+ COMPUTE_PGM_RSRC1(FLOAT_DENORM_MODE_16_64, 18, 2),
+ COMPUTE_PGM_RSRC1(PRIV, 20, 1),
+ COMPUTE_PGM_RSRC1(ENABLE_DX10_CLAMP, 21, 1),
+ COMPUTE_PGM_RSRC1(DEBUG_MODE, 22, 1),
+ COMPUTE_PGM_RSRC1(ENABLE_IEEE_MODE, 23, 1),
+ COMPUTE_PGM_RSRC1(BULKY, 24, 1),
+ COMPUTE_PGM_RSRC1(CDBG_USER, 25, 1),
+ COMPUTE_PGM_RSRC1(FP16_OVFL, 26, 1), // GFX9+
+ COMPUTE_PGM_RSRC1(RESERVED0, 27, 5),
+};
+#undef COMPUTE_PGM_RSRC1
+
+// Compute program resource register 2. Must match hardware definition.
+#define COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+ COMPUTE_PGM_RSRC2(ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET, 0, 1),
+ COMPUTE_PGM_RSRC2(USER_SGPR_COUNT, 1, 5),
+ COMPUTE_PGM_RSRC2(ENABLE_TRAP_HANDLER, 6, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_INFO, 10, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_VGPR_WORKITEM_ID, 11, 2),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_MEMORY, 14, 1),
+ COMPUTE_PGM_RSRC2(GRANULATED_LDS_SIZE, 15, 9),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1),
+ COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, 30, 1),
+ COMPUTE_PGM_RSRC2(RESERVED0, 31, 1),
+};
+#undef COMPUTE_PGM_RSRC2
+
+// Kernel code properties. Must be kept backwards compatible.
+#define KERNEL_CODE_PROPERTY(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(KERNEL_CODE_PROPERTY_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+ KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1),
+ KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_PTR, 1, 1),
+ KERNEL_CODE_PROPERTY(ENABLE_SGPR_QUEUE_PTR, 2, 1),
+ KERNEL_CODE_PROPERTY(ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1),
+ KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1),
+ KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
+ KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
+ KERNEL_CODE_PROPERTY(RESERVED0, 7, 9),
+};
+#undef KERNEL_CODE_PROPERTY
+
+// Kernel descriptor. Must be kept backwards compatible.
+struct kernel_descriptor_t {
+ uint32_t group_segment_fixed_size;
+ uint32_t private_segment_fixed_size;
+ uint8_t reserved0[8];
+ int64_t kernel_code_entry_byte_offset;
+ uint8_t reserved1[24];
+ uint32_t compute_pgm_rsrc1;
+ uint32_t compute_pgm_rsrc2;
+ uint16_t kernel_code_properties;
+ uint8_t reserved2[6];
+};
+
+static_assert(
+ sizeof(kernel_descriptor_t) == 64,
+ "invalid size for kernel_descriptor_t");
+static_assert(
+ offsetof(kernel_descriptor_t, group_segment_fixed_size) == 0,
+ "invalid offset for group_segment_fixed_size");
+static_assert(
+ offsetof(kernel_descriptor_t, private_segment_fixed_size) == 4,
+ "invalid offset for private_segment_fixed_size");
+static_assert(
+ offsetof(kernel_descriptor_t, reserved0) == 8,
+ "invalid offset for reserved0");
+static_assert(
+ offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) == 16,
+ "invalid offset for kernel_code_entry_byte_offset");
+static_assert(
+ offsetof(kernel_descriptor_t, reserved1) == 24,
+ "invalid offset for reserved1");
+static_assert(
+ offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == 48,
+ "invalid offset for compute_pgm_rsrc1");
+static_assert(
+ offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == 52,
+ "invalid offset for compute_pgm_rsrc2");
+static_assert(
+ offsetof(kernel_descriptor_t, kernel_code_properties) == 56,
+ "invalid offset for kernel_code_properties");
+static_assert(
+ offsetof(kernel_descriptor_t, reserved2) == 58,
+ "invalid offset for reserved2");
+
+} // end namespace amdhsa
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_AMDHSAKERNELDESCRIPTOR_H
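A minimal usage sketch of the bit-field helpers introduced above (not part of the diff; the driver function is illustrative, only the AMDHSA_BITS_* macros and the enum entry names come from the header):

  #include "llvm/Support/AMDHSAKernelDescriptor.h"
  #include <cassert>

  void buildDescriptor() {
    using namespace llvm::amdhsa;
    kernel_descriptor_t KD = {};

    // AMDHSA_BITS_SET clears the field's mask in the destination and then
    // ORs in the shifted value.
    AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, 6);
    AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
                    COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID,
                    SYSTEM_VGPR_WORKITEM_ID_X_Y_Z);

    // AMDHSA_BITS_GET extracts the same field back out.
    assert(AMDHSA_BITS_GET(KD.compute_pgm_rsrc2,
                           COMPUTE_PGM_RSRC2_USER_SGPR_COUNT) == 6);
  }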
diff --git a/contrib/llvm/include/llvm/Support/ARMTargetParser.def b/contrib/llvm/include/llvm/Support/ARMTargetParser.def
index 6c8eff1a8f84..78f5410fb733 100644
--- a/contrib/llvm/include/llvm/Support/ARMTargetParser.def
+++ b/contrib/llvm/include/llvm/Support/ARMTargetParser.def
@@ -101,6 +101,11 @@ ARM_ARCH("armv8.3-a", ARMV8_3A, "8.3-A", "v8.3a",
ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
(ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS))
+ARM_ARCH("armv8.4-a", ARMV8_4A, "8.4-A", "v8.4a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
+ (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+ ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
+ ARM::AEK_DOTPROD))
ARM_ARCH("armv8-r", ARMV8R, "8-R", "v8r", ARMBuildAttrs::CPUArch::v8_R,
FK_NEON_FP_ARMV8,
(ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB |
@@ -130,6 +135,8 @@ ARM_ARCH_EXT_NAME("invalid", ARM::AEK_INVALID, nullptr, nullptr)
ARM_ARCH_EXT_NAME("none", ARM::AEK_NONE, nullptr, nullptr)
ARM_ARCH_EXT_NAME("crc", ARM::AEK_CRC, "+crc", "-crc")
ARM_ARCH_EXT_NAME("crypto", ARM::AEK_CRYPTO, "+crypto","-crypto")
+ARM_ARCH_EXT_NAME("sha2", ARM::AEK_SHA2, "+sha2", "-sha2")
+ARM_ARCH_EXT_NAME("aes", ARM::AEK_AES, "+aes", "-aes")
ARM_ARCH_EXT_NAME("dotprod", ARM::AEK_DOTPROD, "+dotprod","-dotprod")
ARM_ARCH_EXT_NAME("dsp", ARM::AEK_DSP, "+dsp", "-dsp")
ARM_ARCH_EXT_NAME("fp", ARM::AEK_FP, nullptr, nullptr)
@@ -253,6 +260,7 @@ ARM_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
ARM_CPU_NAME("exynos-m1", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
ARM_CPU_NAME("exynos-m2", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
ARM_CPU_NAME("exynos-m3", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
+ARM_CPU_NAME("exynos-m4", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
ARM_CPU_NAME("kryo", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
// Non-standard Arch names.
ARM_CPU_NAME("iwmmxt", IWMMXT, FK_NONE, true, ARM::AEK_NONE)
diff --git a/contrib/llvm/include/llvm/Support/AlignOf.h b/contrib/llvm/include/llvm/Support/AlignOf.h
index abd19afa22f0..9e7a62b85e34 100644
--- a/contrib/llvm/include/llvm/Support/AlignOf.h
+++ b/contrib/llvm/include/llvm/Support/AlignOf.h
@@ -20,7 +20,7 @@
namespace llvm {
/// \struct AlignedCharArray
-/// \brief Helper for building an aligned character array type.
+/// Helper for building an aligned character array type.
///
/// This template is used to explicitly build up a collection of aligned
/// character array types. We have to build these up using a macro and explicit
@@ -34,12 +34,12 @@ namespace llvm {
template<std::size_t Alignment, std::size_t Size>
struct AlignedCharArray {
- LLVM_ALIGNAS(Alignment) char buffer[Size];
+ alignas(Alignment) char buffer[Size];
};
#else // _MSC_VER
-/// \brief Create a type with an aligned char buffer.
+/// Create a type with an aligned char buffer.
template<std::size_t Alignment, std::size_t Size>
struct AlignedCharArray;
@@ -124,7 +124,7 @@ union SizerImpl {
};
} // end namespace detail
-/// \brief This union template exposes a suitably aligned and sized character
+/// This union template exposes a suitably aligned and sized character
/// array member which can hold elements of any of up to ten types.
///
/// These types may be arrays, structs, or any other types. The goal is to
diff --git a/contrib/llvm/include/llvm/Support/Allocator.h b/contrib/llvm/include/llvm/Support/Allocator.h
index a94aa8fb1f2a..184ac491b1f1 100644
--- a/contrib/llvm/include/llvm/Support/Allocator.h
+++ b/contrib/llvm/include/llvm/Support/Allocator.h
@@ -23,7 +23,9 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemAlloc.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -35,7 +37,7 @@
namespace llvm {
-/// \brief CRTP base class providing obvious overloads for the core \c
+/// CRTP base class providing obvious overloads for the core \c
/// Allocate() methods of LLVM-style allocators.
///
/// This base class both documents the full public interface exposed by all
@@ -43,7 +45,7 @@ namespace llvm {
/// set of methods which the derived class must define.
template <typename DerivedT> class AllocatorBase {
public:
- /// \brief Allocate \a Size bytes of \a Alignment aligned memory. This method
+ /// Allocate \a Size bytes of \a Alignment aligned memory. This method
/// must be implemented by \c DerivedT.
void *Allocate(size_t Size, size_t Alignment) {
#ifdef __clang__
@@ -57,7 +59,7 @@ public:
return static_cast<DerivedT *>(this)->Allocate(Size, Alignment);
}
- /// \brief Deallocate \a Ptr to \a Size bytes of memory allocated by this
+ /// Deallocate \a Ptr to \a Size bytes of memory allocated by this
/// allocator.
void Deallocate(const void *Ptr, size_t Size) {
#ifdef __clang__
@@ -74,12 +76,12 @@ public:
// The rest of these methods are helpers that redirect to one of the above
// core methods.
- /// \brief Allocate space for a sequence of objects without constructing them.
+ /// Allocate space for a sequence of objects without constructing them.
template <typename T> T *Allocate(size_t Num = 1) {
return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
}
- /// \brief Deallocate space for a sequence of objects without constructing them.
+ /// Deallocate space for a sequence of objects without constructing them.
template <typename T>
typename std::enable_if<
!std::is_same<typename std::remove_cv<T>::type, void>::value, void>::type
@@ -94,7 +96,7 @@ public:
LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size,
size_t /*Alignment*/) {
- return malloc(Size);
+ return safe_malloc(Size);
}
// Pull in base class overloads.
@@ -119,7 +121,7 @@ void printBumpPtrAllocatorStats(unsigned NumSlabs, size_t BytesAllocated,
} // end namespace detail
-/// \brief Allocate memory in an ever growing pool, as if by bump-pointer.
+/// Allocate memory in an ever growing pool, as if by bump-pointer.
///
/// This isn't strictly a bump-pointer allocator as it uses backing slabs of
/// memory rather than relying on a boundless contiguous heap. However, it has
@@ -187,7 +189,7 @@ public:
return *this;
}
- /// \brief Deallocate all but the current slab and reset the current pointer
+ /// Deallocate all but the current slab and reset the current pointer
/// to the beginning of it, freeing all memory allocated so far.
void Reset() {
// Deallocate all but the first slab, and deallocate all custom-sized slabs.
@@ -207,7 +209,7 @@ public:
Slabs.erase(std::next(Slabs.begin()), Slabs.end());
}
- /// \brief Allocate space at the specified alignment.
+ /// Allocate space at the specified alignment.
LLVM_ATTRIBUTE_RETURNS_NONNULL LLVM_ATTRIBUTE_RETURNS_NOALIAS void *
Allocate(size_t Size, size_t Alignment) {
assert(Alignment > 0 && "0-byte alignnment is not allowed. Use 1 instead.");
@@ -302,30 +304,30 @@ public:
}
private:
- /// \brief The current pointer into the current slab.
+ /// The current pointer into the current slab.
///
/// This points to the next free byte in the slab.
char *CurPtr = nullptr;
- /// \brief The end of the current slab.
+ /// The end of the current slab.
char *End = nullptr;
- /// \brief The slabs allocated so far.
+ /// The slabs allocated so far.
SmallVector<void *, 4> Slabs;
- /// \brief Custom-sized slabs allocated for too-large allocation requests.
+ /// Custom-sized slabs allocated for too-large allocation requests.
SmallVector<std::pair<void *, size_t>, 0> CustomSizedSlabs;
- /// \brief How many bytes we've allocated.
+ /// How many bytes we've allocated.
///
/// Used so that we can compute how much space was wasted.
size_t BytesAllocated = 0;
- /// \brief The number of bytes to put between allocations when running under
+ /// The number of bytes to put between allocations when running under
/// a sanitizer.
size_t RedZoneSize = 1;
- /// \brief The allocator instance we use to get slabs of memory.
+ /// The allocator instance we use to get slabs of memory.
AllocatorT Allocator;
static size_t computeSlabSize(unsigned SlabIdx) {
@@ -336,7 +338,7 @@ private:
return SlabSize * ((size_t)1 << std::min<size_t>(30, SlabIdx / 128));
}
- /// \brief Allocate a new slab and move the bump pointers over into the new
+ /// Allocate a new slab and move the bump pointers over into the new
/// slab, modifying CurPtr and End.
void StartNewSlab() {
size_t AllocatedSlabSize = computeSlabSize(Slabs.size());
@@ -351,7 +353,7 @@ private:
End = ((char *)NewSlab) + AllocatedSlabSize;
}
- /// \brief Deallocate a sequence of slabs.
+ /// Deallocate a sequence of slabs.
void DeallocateSlabs(SmallVectorImpl<void *>::iterator I,
SmallVectorImpl<void *>::iterator E) {
for (; I != E; ++I) {
@@ -361,7 +363,7 @@ private:
}
}
- /// \brief Deallocate all memory for custom sized slabs.
+ /// Deallocate all memory for custom sized slabs.
void DeallocateCustomSizedSlabs() {
for (auto &PtrAndSize : CustomSizedSlabs) {
void *Ptr = PtrAndSize.first;
@@ -373,11 +375,11 @@ private:
template <typename T> friend class SpecificBumpPtrAllocator;
};
-/// \brief The standard BumpPtrAllocator which just uses the default template
+/// The standard BumpPtrAllocator which just uses the default template
/// parameters.
typedef BumpPtrAllocatorImpl<> BumpPtrAllocator;
-/// \brief A BumpPtrAllocator that allows only elements of a specific type to be
+/// A BumpPtrAllocator that allows only elements of a specific type to be
/// allocated.
///
/// This allows calling the destructor in DestroyAll() and when the allocator is
@@ -430,7 +432,7 @@ public:
Allocator.Reset();
}
- /// \brief Allocate space for an array of objects without constructing them.
+ /// Allocate space for an array of objects without constructing them.
T *Allocate(size_t num = 1) { return Allocator.Allocate<T>(num); }
};
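As a quick illustration of the allocator interface documented above (a sketch, not taken from the diff; the Node type and driver function are made up):

  #include "llvm/Support/Allocator.h"
  #include <new>

  struct Node { int Value; Node *Next; };

  void useBumpAllocator() {
    llvm::BumpPtrAllocator Alloc;

    // Uninitialized storage for 16 Nodes; there is no per-object free,
    // everything is released when the allocator is destroyed or Reset().
    Node *Raw = Alloc.Allocate<Node>(16);
    (void)Raw;

    // Placement-new actually constructs an object in bump-allocated memory.
    Node *N = new (Alloc.Allocate<Node>()) Node{42, nullptr};
    (void)N;

    // The typed variant additionally runs destructors via DestroyAll().
    llvm::SpecificBumpPtrAllocator<Node> TypedAlloc;
    Node *M = new (TypedAlloc.Allocate()) Node{1, nullptr};
    (void)M;

    Alloc.Reset(); // keep the first slab, free the rest, rewind the pointer
  }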
diff --git a/contrib/llvm/include/llvm/Support/AtomicOrdering.h b/contrib/llvm/include/llvm/Support/AtomicOrdering.h
index e93b755aa63b..a679ab30243e 100644
--- a/contrib/llvm/include/llvm/Support/AtomicOrdering.h
+++ b/contrib/llvm/include/llvm/Support/AtomicOrdering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Atomic ordering constants.
+/// Atomic ordering constants.
///
/// These values are used by LLVM to represent atomic ordering for C++11's
/// memory model and more, as detailed in docs/Atomics.rst.
diff --git a/contrib/llvm/include/llvm/Support/BinaryByteStream.h b/contrib/llvm/include/llvm/Support/BinaryByteStream.h
index db1ccba1398b..9808d3b72157 100644
--- a/contrib/llvm/include/llvm/Support/BinaryByteStream.h
+++ b/contrib/llvm/include/llvm/Support/BinaryByteStream.h
@@ -25,7 +25,7 @@
namespace llvm {
-/// \brief An implementation of BinaryStream which holds its entire data set
+/// An implementation of BinaryStream which holds its entire data set
/// in a single contiguous buffer. BinaryByteStream guarantees that no read
/// operation will ever incur a copy. Note that BinaryByteStream does not
/// own the underlying buffer.
@@ -69,7 +69,7 @@ protected:
ArrayRef<uint8_t> Data;
};
-/// \brief An implementation of BinaryStream whose data is backed by an llvm
+/// An implementation of BinaryStream whose data is backed by an llvm
/// MemoryBuffer object. MemoryBufferByteStream owns the MemoryBuffer in
/// question. As with BinaryByteStream, reading from a MemoryBufferByteStream
/// will never cause a copy.
@@ -83,7 +83,7 @@ public:
std::unique_ptr<MemoryBuffer> MemBuffer;
};
-/// \brief An implementation of BinaryStream which holds its entire data set
+/// An implementation of BinaryStream which holds its entire data set
/// in a single contiguous buffer. As with BinaryByteStream, the mutable
/// version also guarantees that no read operation will ever incur a copy,
/// and similarly it does not own the underlying buffer.
@@ -131,7 +131,7 @@ private:
BinaryByteStream ImmutableStream;
};
-/// \brief An implementation of WritableBinaryStream which can write at its end
+/// An implementation of WritableBinaryStream which can write at its end
/// causing the underlying data to grow. This class owns the underlying data.
class AppendingBinaryByteStream : public WritableBinaryStream {
std::vector<uint8_t> Data;
@@ -193,7 +193,7 @@ public:
Error commit() override { return Error::success(); }
- /// \brief Return the properties of this stream.
+ /// Return the properties of this stream.
virtual BinaryStreamFlags getFlags() const override {
return BSF_Write | BSF_Append;
}
@@ -201,7 +201,7 @@ public:
MutableArrayRef<uint8_t> data() { return Data; }
};
-/// \brief An implementation of WritableBinaryStream backed by an llvm
+/// An implementation of WritableBinaryStream backed by an llvm
/// FileOutputBuffer.
class FileBufferByteStream : public WritableBinaryStream {
private:
@@ -222,6 +222,12 @@ private:
return Error::success();
}
+ /// Returns a pointer to the start of the buffer.
+ uint8_t *getBufferStart() const { return FileBuffer->getBufferStart(); }
+
+ /// Returns a pointer to the end of the buffer.
+ uint8_t *getBufferEnd() const { return FileBuffer->getBufferEnd(); }
+
private:
std::unique_ptr<FileOutputBuffer> FileBuffer;
};
@@ -253,6 +259,12 @@ public:
Error commit() override { return Impl.commit(); }
+ /// Returns a pointer to the start of the buffer.
+ uint8_t *getBufferStart() const { return Impl.getBufferStart(); }
+
+ /// Returns a pointer to the end of the buffer.
+ uint8_t *getBufferEnd() const { return Impl.getBufferEnd(); }
+
private:
StreamImpl Impl;
};
diff --git a/contrib/llvm/include/llvm/Support/BinaryStream.h b/contrib/llvm/include/llvm/Support/BinaryStream.h
index d69a03eccfdb..7677214e48ee 100644
--- a/contrib/llvm/include/llvm/Support/BinaryStream.h
+++ b/contrib/llvm/include/llvm/Support/BinaryStream.h
@@ -26,7 +26,7 @@ enum BinaryStreamFlags {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ BSF_Append)
};
-/// \brief An interface for accessing data in a stream-like format, but which
+/// An interface for accessing data in a stream-like format, but which
/// discourages copying. Instead of specifying a buffer in which to copy
/// data on a read, the API returns an ArrayRef to data owned by the stream's
/// implementation. Since implementations may not necessarily store data in a
@@ -39,21 +39,21 @@ public:
virtual llvm::support::endianness getEndian() const = 0;
- /// \brief Given an offset into the stream and a number of bytes, attempt to
+ /// Given an offset into the stream and a number of bytes, attempt to
/// read the bytes and set the output ArrayRef to point to data owned by the
/// stream.
virtual Error readBytes(uint32_t Offset, uint32_t Size,
ArrayRef<uint8_t> &Buffer) = 0;
- /// \brief Given an offset into the stream, read as much as possible without
+ /// Given an offset into the stream, read as much as possible without
/// copying any data.
virtual Error readLongestContiguousChunk(uint32_t Offset,
ArrayRef<uint8_t> &Buffer) = 0;
- /// \brief Return the number of bytes of data in this stream.
+ /// Return the number of bytes of data in this stream.
virtual uint32_t getLength() = 0;
- /// \brief Return the properties of this stream.
+ /// Return the properties of this stream.
virtual BinaryStreamFlags getFlags() const { return BSF_None; }
protected:
@@ -66,7 +66,7 @@ protected:
}
};
-/// \brief A BinaryStream which can be read from as well as written to. Note
+/// A BinaryStream which can be read from as well as written to. Note
/// that writing to a BinaryStream always necessitates copying from the input
/// buffer to the stream's backing store. Streams are assumed to be buffered
/// so that to be portable it is necessary to call commit() on the stream when
@@ -75,15 +75,15 @@ class WritableBinaryStream : public BinaryStream {
public:
~WritableBinaryStream() override = default;
- /// \brief Attempt to write the given bytes into the stream at the desired
+ /// Attempt to write the given bytes into the stream at the desired
/// offset. This will always necessitate a copy. Cannot shrink or grow the
/// stream, only writes into existing allocated space.
virtual Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) = 0;
- /// \brief For buffered streams, commits changes to the backing store.
+ /// For buffered streams, commits changes to the backing store.
virtual Error commit() = 0;
- /// \brief Return the properties of this stream.
+ /// Return the properties of this stream.
BinaryStreamFlags getFlags() const override { return BSF_Write; }
protected:
diff --git a/contrib/llvm/include/llvm/Support/BinaryStreamArray.h b/contrib/llvm/include/llvm/Support/BinaryStreamArray.h
index 3f5562ba7519..d1571cb37fc6 100644
--- a/contrib/llvm/include/llvm/Support/BinaryStreamArray.h
+++ b/contrib/llvm/include/llvm/Support/BinaryStreamArray.h
@@ -111,7 +111,7 @@ public:
bool empty() const { return Stream.getLength() == 0; }
- /// \brief given an offset into the array's underlying stream, return an
+ /// Given an offset into the array's underlying stream, return an
/// iterator to the record at that offset. This is considered unsafe
/// since the behavior is undefined if \p Offset does not refer to the
/// beginning of a valid record.
diff --git a/contrib/llvm/include/llvm/Support/BinaryStreamReader.h b/contrib/llvm/include/llvm/Support/BinaryStreamReader.h
index ae5ebb2c3628..fe77b550c453 100644
--- a/contrib/llvm/include/llvm/Support/BinaryStreamReader.h
+++ b/contrib/llvm/include/llvm/Support/BinaryStreamReader.h
@@ -24,7 +24,7 @@
namespace llvm {
-/// \brief Provides read only access to a subclass of `BinaryStream`. Provides
+/// Provides read only access to a subclass of `BinaryStream`. Provides
/// bounds checking and helpers for writing certain common data types such as
/// null-terminated strings, integers in various flavors of endianness, etc.
/// Can be subclassed to provide reading of custom datatypes, although no
diff --git a/contrib/llvm/include/llvm/Support/BinaryStreamRef.h b/contrib/llvm/include/llvm/Support/BinaryStreamRef.h
index 5cf355be6fe9..d8dc1392c01c 100644
--- a/contrib/llvm/include/llvm/Support/BinaryStreamRef.h
+++ b/contrib/llvm/include/llvm/Support/BinaryStreamRef.h
@@ -147,7 +147,7 @@ protected:
Optional<uint32_t> Length;
};
-/// \brief BinaryStreamRef is to BinaryStream what ArrayRef is to an Array. It
+/// BinaryStreamRef is to BinaryStream what ArrayRef is to an Array. It
/// provides copy-semantics and read only access to a "window" of the underlying
/// BinaryStream. Note that BinaryStreamRef is *not* a BinaryStream. That is to
/// say, it does not inherit and override the methods of BinaryStream. In
@@ -266,7 +266,7 @@ public:
/// Convert this WritableBinaryStreamRef to a read-only BinaryStreamRef.
operator BinaryStreamRef() const;
- /// \brief For buffered streams, commits changes to the backing store.
+ /// For buffered streams, commits changes to the backing store.
Error commit();
};
diff --git a/contrib/llvm/include/llvm/Support/BinaryStreamWriter.h b/contrib/llvm/include/llvm/Support/BinaryStreamWriter.h
index a4495a1ce27d..6e8a68a30474 100644
--- a/contrib/llvm/include/llvm/Support/BinaryStreamWriter.h
+++ b/contrib/llvm/include/llvm/Support/BinaryStreamWriter.h
@@ -24,7 +24,7 @@
namespace llvm {
-/// \brief Provides write only access to a subclass of `WritableBinaryStream`.
+/// Provides write only access to a subclass of `WritableBinaryStream`.
/// Provides bounds checking and helpers for writing certain common data types
/// such as null-terminated strings, integers in various flavors of endianness,
/// etc. Can be subclassed to provide reading and writing of custom datatypes,
@@ -56,7 +56,7 @@ public:
/// otherwise returns an appropriate error code.
Error writeBytes(ArrayRef<uint8_t> Buffer);
- /// Write the the integer \p Value to the underlying stream in the
+ /// Write the integer \p Value to the underlying stream in the
/// specified endianness. On success, updates the offset so that
/// subsequent writes occur at the next unwritten position.
///
@@ -80,7 +80,7 @@ public:
return writeInteger<U>(static_cast<U>(Num));
}
- /// Write the the string \p Str to the underlying stream followed by a null
+ /// Write the string \p Str to the underlying stream followed by a null
/// terminator. On success, updates the offset so that subsequent writes
/// occur at the next unwritten position. \p Str need not be null terminated
/// on input.
@@ -89,7 +89,7 @@ public:
/// otherwise returns an appropriate error code.
Error writeCString(StringRef Str);
- /// Write the the string \p Str to the underlying stream without a null
+ /// Write the string \p Str to the underlying stream without a null
/// terminator. On success, updates the offset so that subsequent writes
/// occur at the next unwritten position.
///
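A short sketch of how the writer is typically driven (assumed usage, not part of this change; AppendingBinaryByteStream comes from BinaryByteStream.h above):

  #include "llvm/Support/BinaryByteStream.h"
  #include "llvm/Support/BinaryStreamWriter.h"
  #include <cstdint>

  llvm::Error writeRecord() {
    // Grows as it is written to and owns its backing storage.
    llvm::AppendingBinaryByteStream Stream(llvm::support::little);
    llvm::BinaryStreamWriter Writer(Stream);

    if (llvm::Error E = Writer.writeInteger<uint32_t>(0x12345678))
      return E;
    if (llvm::Error E = Writer.writeCString("hello")) // trailing NUL added
      return E;
    return llvm::Error::success();
  }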
diff --git a/contrib/llvm/include/llvm/Support/BlockFrequency.h b/contrib/llvm/include/llvm/Support/BlockFrequency.h
index 2e75cbdd29c1..4b468f7acb32 100644
--- a/contrib/llvm/include/llvm/Support/BlockFrequency.h
+++ b/contrib/llvm/include/llvm/Support/BlockFrequency.h
@@ -28,32 +28,32 @@ class BlockFrequency {
public:
BlockFrequency(uint64_t Freq = 0) : Frequency(Freq) { }
- /// \brief Returns the maximum possible frequency, the saturation value.
+ /// Returns the maximum possible frequency, the saturation value.
static uint64_t getMaxFrequency() { return -1ULL; }
- /// \brief Returns the frequency as a fixpoint number scaled by the entry
+ /// Returns the frequency as a fixpoint number scaled by the entry
/// frequency.
uint64_t getFrequency() const { return Frequency; }
- /// \brief Multiplies with a branch probability. The computation will never
+ /// Multiplies with a branch probability. The computation will never
/// overflow.
BlockFrequency &operator*=(BranchProbability Prob);
BlockFrequency operator*(BranchProbability Prob) const;
- /// \brief Divide by a non-zero branch probability using saturating
+ /// Divide by a non-zero branch probability using saturating
/// arithmetic.
BlockFrequency &operator/=(BranchProbability Prob);
BlockFrequency operator/(BranchProbability Prob) const;
- /// \brief Adds another block frequency using saturating arithmetic.
+ /// Adds another block frequency using saturating arithmetic.
BlockFrequency &operator+=(BlockFrequency Freq);
BlockFrequency operator+(BlockFrequency Freq) const;
- /// \brief Subtracts another block frequency using saturating arithmetic.
+ /// Subtracts another block frequency using saturating arithmetic.
BlockFrequency &operator-=(BlockFrequency Freq);
BlockFrequency operator-(BlockFrequency Freq) const;
- /// \brief Shift block frequency to the right by count digits saturating to 1.
+ /// Shift block frequency to the right by count digits saturating to 1.
BlockFrequency &operator>>=(const unsigned count);
bool operator<(BlockFrequency RHS) const {
diff --git a/contrib/llvm/include/llvm/Support/BranchProbability.h b/contrib/llvm/include/llvm/Support/BranchProbability.h
index b403d7fbf117..3a88e71c2480 100644
--- a/contrib/llvm/include/llvm/Support/BranchProbability.h
+++ b/contrib/llvm/include/llvm/Support/BranchProbability.h
@@ -73,7 +73,7 @@ public:
void dump() const;
- /// \brief Scale a large integer.
+ /// Scale a large integer.
///
/// Scales \c Num. Guarantees full precision. Returns the floor of the
/// result.
@@ -81,7 +81,7 @@ public:
/// \return \c Num times \c this.
uint64_t scale(uint64_t Num) const;
- /// \brief Scale a large integer by the inverse.
+ /// Scale a large integer by the inverse.
///
/// Scales \c Num by the inverse of \c this. Guarantees full precision.
/// Returns the floor of the result.
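For instance, a sketch of the scaling helpers documented above (the factory getBranchProbability and the method scaleByInverse are assumed from the header rather than shown in this hunk):

  #include "llvm/Support/BranchProbability.h"
  #include <cstdint>

  void scaleExample() {
    // A 3/8 probability.
    llvm::BranchProbability P =
        llvm::BranchProbability::getBranchProbability(3, 8);

    // scale(): full-precision multiply, floor of the result: 1000 * 3/8 = 375.
    uint64_t Taken = P.scale(1000);

    // scaleByInverse(): divide instead; floor(1000 * 8/3) = 2666.
    uint64_t Total = P.scaleByInverse(1000);

    (void)Taken; (void)Total;
  }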
diff --git a/contrib/llvm/include/llvm/Support/CachePruning.h b/contrib/llvm/include/llvm/Support/CachePruning.h
index 327c7df4570f..cf3f8ec67a52 100644
--- a/contrib/llvm/include/llvm/Support/CachePruning.h
+++ b/contrib/llvm/include/llvm/Support/CachePruning.h
@@ -37,7 +37,7 @@ struct CachePruningPolicy {
std::chrono::seconds Expiration = std::chrono::hours(7 * 24); // 1w
/// The maximum size for the cache directory, in terms of percentage of the
- /// available space on the the disk. Set to 100 to indicate no limit, 50 to
+ /// available space on the disk. Set to 100 to indicate no limit, 50 to
/// indicate that the cache size will not be left over half the available disk
/// space. A value over 100 will be reduced to 100. A value of 0 disables the
/// percentage size-based pruning.
@@ -52,9 +52,11 @@ struct CachePruningPolicy {
/// the number of files based pruning.
///
/// This defaults to 1000000 because with that many files there are
- /// diminishing returns on the effectiveness of the cache, and some file
- /// systems have a limit on how many files can be contained in a directory
- /// (notably ext4, which is limited to around 6000000 files).
+ /// diminishing returns on the effectiveness of the cache. Some systems have a
+ /// limit on the total number of files, and some also limit the number of files
+ /// per directory. On Linux ext4, for example, with the default settings (4096
+ /// byte block size and large_dir disabled), the per-directory entry limit is
+ /// 508*510*floor(4096/(40+8)) ~= 20M for an average filename length of 40.
uint64_t MaxSizeFiles = 1000000;
};
@@ -66,7 +68,7 @@ struct CachePruningPolicy {
Expected<CachePruningPolicy> parseCachePruningPolicy(StringRef PolicyStr);
/// Perform pruning using the supplied policy, returns true if pruning
-/// occured, i.e. if Policy.Interval was expired.
+/// occurred, i.e. if Policy.Interval was expired.
///
/// As a safeguard against data loss if the user specifies the wrong directory
/// as their cache directory, this function will ignore files not matching the
diff --git a/contrib/llvm/include/llvm/Support/Casting.h b/contrib/llvm/include/llvm/Support/Casting.h
index baa2a814e9a1..3f21e0f9ebc3 100644
--- a/contrib/llvm/include/llvm/Support/Casting.h
+++ b/contrib/llvm/include/llvm/Support/Casting.h
@@ -60,7 +60,7 @@ struct isa_impl {
}
};
-/// \brief Always allow upcasts, and perform no dynamic check for them.
+/// Always allow upcasts, and perform no dynamic check for them.
template <typename To, typename From>
struct isa_impl<
To, From, typename std::enable_if<std::is_base_of<To, From>::value>::type> {
diff --git a/contrib/llvm/include/llvm/Support/CheckedArithmetic.h b/contrib/llvm/include/llvm/Support/CheckedArithmetic.h
new file mode 100644
index 000000000000..039c374136ff
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/CheckedArithmetic.h
@@ -0,0 +1,104 @@
+//==-- llvm/Support/CheckedArithmetic.h - Safe arithmetical operations *- C++ //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains generic functions for operating on integers which
+// indicate whether the operation has overflowed.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHECKEDARITHMETIC_H
+#define LLVM_SUPPORT_CHECKEDARITHMETIC_H
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Optional.h"
+
+#include <type_traits>
+
+namespace {
+
+/// Utility function to apply a given method of \c APInt \p F to \p LHS and
+/// \p RHS.
+/// \return Empty optional if the operation overflows, or result otherwise.
+template <typename T, typename F>
+typename std::enable_if<std::is_integral<T>::value && sizeof(T) * 8 <= 64,
+ llvm::Optional<T>>::type
+checkedOp(T LHS, T RHS, F Op, bool Signed = true) {
+ llvm::APInt ALHS(/*BitSize=*/sizeof(T) * 8, LHS, Signed);
+ llvm::APInt ARHS(/*BitSize=*/sizeof(T) * 8, RHS, Signed);
+ bool Overflow;
+ llvm::APInt Out = (ALHS.*Op)(ARHS, Overflow);
+ if (Overflow)
+ return llvm::None;
+ return Signed ? Out.getSExtValue() : Out.getZExtValue();
+}
+}
+
+namespace llvm {
+
+/// Add two signed integers \p LHS and \p RHS.
+/// \return Optional of sum if no signed overflow occurred,
+/// \c None otherwise.
+template <typename T>
+typename std::enable_if<std::is_signed<T>::value, llvm::Optional<T>>::type
+checkedAdd(T LHS, T RHS) {
+ return checkedOp(LHS, RHS, &llvm::APInt::sadd_ov);
+}
+
+/// Multiply two signed integers \p LHS and \p RHS.
+/// \return Optional of product if no signed overflow occurred,
+/// \c None otherwise.
+template <typename T>
+typename std::enable_if<std::is_signed<T>::value, llvm::Optional<T>>::type
+checkedMul(T LHS, T RHS) {
+ return checkedOp(LHS, RHS, &llvm::APInt::smul_ov);
+}
+
+/// Multiply A and B, and add C to the resulting product.
+/// \return Optional of result if no signed overflow occurred,
+/// \c None otherwise.
+template <typename T>
+typename std::enable_if<std::is_signed<T>::value, llvm::Optional<T>>::type
+checkedMulAdd(T A, T B, T C) {
+ if (auto Product = checkedMul(A, B))
+ return checkedAdd(*Product, C);
+ return llvm::None;
+}
+
+/// Add two unsigned integers \p LHS and \p RHS.
+/// \return Optional of sum if no unsigned overflow occurred,
+/// \c None otherwise.
+template <typename T>
+typename std::enable_if<std::is_unsigned<T>::value, llvm::Optional<T>>::type
+checkedAddUnsigned(T LHS, T RHS) {
+ return checkedOp(LHS, RHS, &llvm::APInt::uadd_ov, /*Signed=*/false);
+}
+
+/// Multiply two unsigned integers \p LHS and \p RHS.
+/// \return Optional of product if no unsigned overflow occurred,
+/// \c None otherwise.
+template <typename T>
+typename std::enable_if<std::is_unsigned<T>::value, llvm::Optional<T>>::type
+checkedMulUnsigned(T LHS, T RHS) {
+ return checkedOp(LHS, RHS, &llvm::APInt::umul_ov, /*Signed=*/false);
+}
+
+/// Multiply unsigned integers A and B, and add C to the resulting product.
+/// \return Optional of result if no unsigned overflow occurred,
+/// \c None otherwise.
+template <typename T>
+typename std::enable_if<std::is_unsigned<T>::value, llvm::Optional<T>>::type
+checkedMulAddUnsigned(T A, T B, T C) {
+ if (auto Product = checkedMulUnsigned(A, B))
+ return checkedAddUnsigned(*Product, C);
+ return llvm::None;
+}
+
+} // End llvm namespace
+
+#endif
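A small usage sketch of the checked helpers added above (the driver function is illustrative):

  #include "llvm/Support/CheckedArithmetic.h"
  #include <cassert>
  #include <cstdint>
  #include <limits>

  void checkedExamples() {
    // In-range addition: the Optional carries the result.
    llvm::Optional<int32_t> Sum = llvm::checkedAdd<int32_t>(1, 2);
    assert(Sum && *Sum == 3);

    // INT32_MAX + 1 overflows, so None is returned instead of a wrapped value.
    llvm::Optional<int32_t> Ovf =
        llvm::checkedAdd<int32_t>(std::numeric_limits<int32_t>::max(), 1);
    assert(!Ovf);

    // Fused multiply-add for unsigned types: 6*7+3 = 45.
    llvm::Optional<uint32_t> R = llvm::checkedMulAddUnsigned<uint32_t>(6, 7, 3);
    assert(R && *R == 45);
  }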
diff --git a/contrib/llvm/include/llvm/Support/CodeGenCoverage.h b/contrib/llvm/include/llvm/Support/CodeGenCoverage.h
index d5bd837bff28..c863be35b822 100644
--- a/contrib/llvm/include/llvm/Support/CodeGenCoverage.h
+++ b/contrib/llvm/include/llvm/Support/CodeGenCoverage.h
@@ -23,15 +23,18 @@ protected:
BitVector RuleCoverage;
public:
+ using const_covered_iterator = BitVector::const_set_bits_iterator;
+
CodeGenCoverage();
void setCovered(uint64_t RuleID);
- bool isCovered(uint64_t RuleID);
+ bool isCovered(uint64_t RuleID) const;
+ iterator_range<const_covered_iterator> covered() const;
bool parse(MemoryBuffer &Buffer, StringRef BackendName);
bool emit(StringRef FilePrefix, StringRef BackendName) const;
void reset();
};
-} // end namespace llvm
+} // namespace llvm
#endif // ifndef LLVM_SUPPORT_CODEGENCOVERAGE_H
diff --git a/contrib/llvm/include/llvm/Support/CommandLine.h b/contrib/llvm/include/llvm/Support/CommandLine.h
index f043c112861b..799b41fbf8b0 100644
--- a/contrib/llvm/include/llvm/Support/CommandLine.h
+++ b/contrib/llvm/include/llvm/Support/CommandLine.h
@@ -30,6 +30,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <climits>
#include <cstddef>
@@ -94,7 +95,7 @@ void PrintOptionValues();
// Forward declaration - AddLiteralOption needs to be up here to make gcc happy.
class Option;
-/// \brief Adds a new option for parsing and provides the option it refers to.
+/// Adds a new option for parsing and provides the option it refers to.
///
/// \param O pointer to the option
/// \param Name the string name for the option to handle during parsing
@@ -362,7 +363,10 @@ public:
bool MultiArg = false);
// Prints option name followed by message. Always returns true.
- bool error(const Twine &Message, StringRef ArgName = StringRef());
+ bool error(const Twine &Message, StringRef ArgName = StringRef(), raw_ostream &Errs = llvm::errs());
+ bool error(const Twine &Message, raw_ostream &Errs) {
+ return error(Message, StringRef(), Errs);
+ }
inline int getNumOccurrences() const { return NumOccurrences; }
inline void reset() { NumOccurrences = 0; }
@@ -1770,7 +1774,7 @@ void PrintHelpMessage(bool Hidden = false, bool Categorized = false);
// Public interface for accessing registered options.
//
-/// \brief Use this to get a StringMap to all registered named options
+/// Use this to get a StringMap to all registered named options
/// (e.g. -help). Note \p Map Should be an empty StringMap.
///
/// \return A reference to the StringMap used by the cl APIs to parse options.
@@ -1799,7 +1803,7 @@ void PrintHelpMessage(bool Hidden = false, bool Categorized = false);
/// than just handing around a global list.
StringMap<Option *> &getRegisteredOptions(SubCommand &Sub = *TopLevelSubCommand);
-/// \brief Use this to get all registered SubCommands from the provided parser.
+/// Use this to get all registered SubCommands from the provided parser.
///
/// \return A range of all SubCommand pointers registered with the parser.
///
@@ -1825,7 +1829,7 @@ getRegisteredSubcommands();
// Standalone command line processing utilities.
//
-/// \brief Tokenizes a command line that can contain escapes and quotes.
+/// Tokenizes a command line that can contain escapes and quotes.
//
/// The quoting rules match those used by GCC and other tools that use
/// libiberty's buildargv() or expandargv() utilities, and do not match bash.
@@ -1841,7 +1845,7 @@ void TokenizeGNUCommandLine(StringRef Source, StringSaver &Saver,
SmallVectorImpl<const char *> &NewArgv,
bool MarkEOLs = false);
-/// \brief Tokenizes a Windows command line which may contain quotes and escaped
+/// Tokenizes a Windows command line which may contain quotes and escaped
/// quotes.
///
/// See MSDN docs for CommandLineToArgvW for information on the quoting rules.
@@ -1856,7 +1860,7 @@ void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver,
SmallVectorImpl<const char *> &NewArgv,
bool MarkEOLs = false);
-/// \brief String tokenization function type. Should be compatible with either
+/// String tokenization function type. Should be compatible with either
/// Windows or Unix command line tokenizers.
using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver,
SmallVectorImpl<const char *> &NewArgv,
@@ -1889,7 +1893,7 @@ void tokenizeConfigFile(StringRef Source, StringSaver &Saver,
bool readConfigFile(StringRef CfgFileName, StringSaver &Saver,
SmallVectorImpl<const char *> &Argv);
-/// \brief Expand response files on a command line recursively using the given
+/// Expand response files on a command line recursively using the given
/// StringSaver and tokenization strategy. Argv should contain the command line
/// before expansion and will be modified in place. If requested, Argv will
/// also be populated with nullptrs indicating where each response file line
@@ -1909,7 +1913,7 @@ bool ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
SmallVectorImpl<const char *> &Argv,
bool MarkEOLs = false, bool RelativeNames = false);
-/// \brief Mark all options not part of this category as cl::ReallyHidden.
+/// Mark all options not part of this category as cl::ReallyHidden.
///
/// \param Category the category of options to keep displaying
///
@@ -1919,7 +1923,7 @@ bool ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
void HideUnrelatedOptions(cl::OptionCategory &Category,
SubCommand &Sub = *TopLevelSubCommand);
-/// \brief Mark all options not part of the categories as cl::ReallyHidden.
+/// Mark all options not part of the categories as cl::ReallyHidden.
///
/// \param Categories the categories of options to keep displaying.
///
@@ -1929,12 +1933,12 @@ void HideUnrelatedOptions(cl::OptionCategory &Category,
void HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories,
SubCommand &Sub = *TopLevelSubCommand);
-/// \brief Reset all command line options to a state that looks as if they have
+/// Reset all command line options to a state that looks as if they have
/// never appeared on the command line. This is useful for being able to parse
/// a command line multiple times (especially useful for writing tests).
void ResetAllOptionOccurrences();
-/// \brief Reset the command line parser back to its initial state. This
+/// Reset the command line parser back to its initial state. This
/// removes
/// all options, categories, and subcommands and returns the parser to a state
/// where no options are supported.
diff --git a/contrib/llvm/include/llvm/Support/Compiler.h b/contrib/llvm/include/llvm/Support/Compiler.h
index b19e37235df5..4de815fe61d7 100644
--- a/contrib/llvm/include/llvm/Support/Compiler.h
+++ b/contrib/llvm/include/llvm/Support/Compiler.h
@@ -17,6 +17,9 @@
#include "llvm/Config/llvm-config.h"
+#include <new>
+#include <stddef.h>
+
#if defined(_MSC_VER)
#include <sal.h>
#endif
@@ -42,7 +45,7 @@
#endif
/// \macro LLVM_GNUC_PREREQ
-/// \brief Extend the default __GNUC_PREREQ even if glibc's features.h isn't
+/// Extend the default __GNUC_PREREQ even if glibc's features.h isn't
/// available.
#ifndef LLVM_GNUC_PREREQ
# if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
@@ -58,7 +61,7 @@
#endif
/// \macro LLVM_MSC_PREREQ
-/// \brief Is the compiler MSVC of at least the specified version?
+/// Is the compiler MSVC of at least the specified version?
/// The common \param version values to check for are:
/// * 1900: Microsoft Visual Studio 2015 / 14.0
#ifdef _MSC_VER
@@ -73,7 +76,7 @@
#define LLVM_MSC_PREREQ(version) 0
#endif
-/// \brief Does the compiler support ref-qualifiers for *this?
+/// Does the compiler support ref-qualifiers for *this?
///
/// Sadly, this is separate from just rvalue reference support because GCC
/// and MSVC implemented this later than everything else.
@@ -99,7 +102,7 @@
/// functions, making them private to any shared library they are linked into.
/// On PE/COFF targets, library visibility is the default, so this isn't needed.
#if (__has_attribute(visibility) || LLVM_GNUC_PREREQ(4, 0, 0)) && \
- !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32)
+ !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(_WIN32)
#define LLVM_LIBRARY_VISIBILITY __attribute__ ((visibility("hidden")))
#else
#define LLVM_LIBRARY_VISIBILITY
@@ -146,7 +149,7 @@
// FIXME: Provide this for PE/COFF targets.
#if (__has_attribute(weak) || LLVM_GNUC_PREREQ(4, 0, 0)) && \
- (!defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32))
+ (!defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(_WIN32))
#define LLVM_ATTRIBUTE_WEAK __attribute__((__weak__))
#else
#define LLVM_ATTRIBUTE_WEAK
@@ -303,7 +306,7 @@
#endif
/// \macro LLVM_ASSUME_ALIGNED
-/// \brief Returns a pointer with an assumed alignment.
+/// Returns a pointer with an assumed alignment.
#if __has_builtin(__builtin_assume_aligned) || LLVM_GNUC_PREREQ(4, 7, 0)
# define LLVM_ASSUME_ALIGNED(p, a) __builtin_assume_aligned(p, a)
#elif defined(LLVM_BUILTIN_UNREACHABLE)
@@ -315,7 +318,7 @@
#endif
/// \macro LLVM_ALIGNAS
-/// \brief Used to specify a minimum alignment for a structure or variable.
+/// Used to specify a minimum alignment for a structure or variable.
#if __GNUC__ && !__has_feature(cxx_alignas) && !LLVM_GNUC_PREREQ(4, 8, 1)
# define LLVM_ALIGNAS(x) __attribute__((aligned(x)))
#else
@@ -323,7 +326,7 @@
#endif
/// \macro LLVM_PACKED
-/// \brief Used to specify a packed structure.
+/// Used to specify a packed structure.
/// LLVM_PACKED(
/// struct A {
/// int i;
@@ -351,7 +354,7 @@
#endif
/// \macro LLVM_PTR_SIZE
-/// \brief A constant integer equivalent to the value of sizeof(void*).
+/// A constant integer equivalent to the value of sizeof(void*).
/// Generally used in combination with LLVM_ALIGNAS or when doing computation in
/// the preprocessor.
#ifdef __SIZEOF_POINTER__
@@ -367,7 +370,7 @@
#endif
/// \macro LLVM_MEMORY_SANITIZER_BUILD
-/// \brief Whether LLVM itself is built with MemorySanitizer instrumentation.
+/// Whether LLVM itself is built with MemorySanitizer instrumentation.
#if __has_feature(memory_sanitizer)
# define LLVM_MEMORY_SANITIZER_BUILD 1
# include <sanitizer/msan_interface.h>
@@ -378,7 +381,7 @@
#endif
/// \macro LLVM_ADDRESS_SANITIZER_BUILD
-/// \brief Whether LLVM itself is built with AddressSanitizer instrumentation.
+/// Whether LLVM itself is built with AddressSanitizer instrumentation.
#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
# define LLVM_ADDRESS_SANITIZER_BUILD 1
# include <sanitizer/asan_interface.h>
@@ -389,7 +392,7 @@
#endif
/// \macro LLVM_THREAD_SANITIZER_BUILD
-/// \brief Whether LLVM itself is built with ThreadSanitizer instrumentation.
+/// Whether LLVM itself is built with ThreadSanitizer instrumentation.
#if __has_feature(thread_sanitizer) || defined(__SANITIZE_THREAD__)
# define LLVM_THREAD_SANITIZER_BUILD 1
#else
@@ -432,14 +435,14 @@ void AnnotateIgnoreWritesEnd(const char *file, int line);
#endif
/// \macro LLVM_NO_SANITIZE
-/// \brief Disable a particular sanitizer for a function.
+/// Disable a particular sanitizer for a function.
#if __has_attribute(no_sanitize)
#define LLVM_NO_SANITIZE(KIND) __attribute__((no_sanitize(KIND)))
#else
#define LLVM_NO_SANITIZE(KIND)
#endif
-/// \brief Mark debug helper function definitions like dump() that should not be
+/// Mark debug helper function definitions like dump() that should not be
/// stripped from debug builds.
/// Note that you should also surround dump() functions with
/// `#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)` so they do always
@@ -452,7 +455,7 @@ void AnnotateIgnoreWritesEnd(const char *file, int line);
#endif
/// \macro LLVM_PRETTY_FUNCTION
-/// \brief Gets a user-friendly looking function signature for the current scope
+/// Gets a user-friendly looking function signature for the current scope
/// using the best available method on each platform. The exact format of the
/// resulting string is implementation specific and non-portable, so this should
/// only be used, for example, for logging or diagnostics.
@@ -465,7 +468,7 @@ void AnnotateIgnoreWritesEnd(const char *file, int line);
#endif
/// \macro LLVM_THREAD_LOCAL
-/// \brief A thread-local storage specifier which can be used with globals,
+/// A thread-local storage specifier which can be used with globals,
/// extern globals, and static globals.
///
/// This is essentially an extremely restricted analog to C++11's thread_local
@@ -494,7 +497,7 @@ void AnnotateIgnoreWritesEnd(const char *file, int line);
#endif
/// \macro LLVM_ENABLE_EXCEPTIONS
-/// \brief Whether LLVM is built with exception support.
+/// Whether LLVM is built with exception support.
#if __has_feature(cxx_exceptions)
#define LLVM_ENABLE_EXCEPTIONS 1
#elif defined(__GNUC__) && defined(__EXCEPTIONS)
@@ -503,4 +506,46 @@ void AnnotateIgnoreWritesEnd(const char *file, int line);
#define LLVM_ENABLE_EXCEPTIONS 1
#endif
+namespace llvm {
+
+/// Allocate a buffer of memory with the given size and alignment.
+///
+/// When the compiler supports aligned operator new, this will use it to
+/// handle even over-aligned allocations.
+///
+/// However, this doesn't make any attempt to leverage the fancier techniques
+/// like posix_memalign due to portability. It is mostly intended to allow
+/// compatibility with platforms that, after aligned allocation was added, use
+/// reduced default alignment.
+inline void *allocate_buffer(size_t Size, size_t Alignment) {
+ return ::operator new(Size
+#if __cpp_aligned_new
+ ,
+ std::align_val_t(Alignment)
+#endif
+ );
+}
+
+/// Deallocate a buffer of memory with the given size and alignment.
+///
+/// If supported, this will use the sized delete operator. Also, if supported,
+/// this will pass the alignment to the delete operator.
+///
+/// The pointer must have been allocated with the corresponding new operator,
+/// most likely using the above helper.
+inline void deallocate_buffer(void *Ptr, size_t Size, size_t Alignment) {
+ ::operator delete(Ptr
+#if __cpp_sized_deallocation
+ ,
+ Size
+#endif
+#if __cpp_aligned_new
+ ,
+ std::align_val_t(Alignment)
+#endif
+ );
+}
+
+} // End namespace llvm
+
#endif
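For example, the new helpers are meant to be used in matched pairs, roughly like this (sizes and the driver function are illustrative):

  #include "llvm/Support/Compiler.h"

  void bufferRoundTrip() {
    constexpr size_t Size = 256;
    constexpr size_t Alignment = 32;

    // Over-aligned allocation when __cpp_aligned_new is available, plain
    // operator new otherwise.
    void *Buf = llvm::allocate_buffer(Size, Alignment);

    // ... use Buf ...

    // Must be passed the same size and alignment so the sized/aligned
    // operator delete matches the allocation.
    llvm::deallocate_buffer(Buf, Size, Alignment);
  }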
diff --git a/contrib/llvm/include/llvm/Support/ConvertUTF.h b/contrib/llvm/include/llvm/Support/ConvertUTF.h
index 99ae171aeabb..6ae56c2470bb 100644
--- a/contrib/llvm/include/llvm/Support/ConvertUTF.h
+++ b/contrib/llvm/include/llvm/Support/ConvertUTF.h
@@ -92,6 +92,7 @@
#include <cstddef>
#include <string>
+#include <system_error>
// Wrap everything in namespace llvm so that programs can link with llvm and
// their own version of the unicode libraries.
@@ -286,6 +287,21 @@ bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
SmallVectorImpl<UTF16> &DstUTF16);
+#if defined(_WIN32)
+namespace sys {
+namespace windows {
+std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
+/// Convert to UTF16 from the current code page used in the system
+std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
+std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+ SmallVectorImpl<char> &utf8);
+/// Convert from UTF16 to the current code page used in the system
+std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
+ SmallVectorImpl<char> &utf8);
+} // namespace windows
+} // namespace sys
+#endif
+
} /* end namespace llvm */
#endif
diff --git a/contrib/llvm/include/llvm/Support/CrashRecoveryContext.h b/contrib/llvm/include/llvm/Support/CrashRecoveryContext.h
index 6cbc331d2731..7b3fd4f882e4 100644
--- a/contrib/llvm/include/llvm/Support/CrashRecoveryContext.h
+++ b/contrib/llvm/include/llvm/Support/CrashRecoveryContext.h
@@ -15,7 +15,7 @@
namespace llvm {
class CrashRecoveryContextCleanup;
-/// \brief Crash recovery helper object.
+/// Crash recovery helper object.
///
/// This class implements support for running operations in a safe context so
/// that crashes (memory errors, stack overflow, assertion violations) can be
@@ -27,6 +27,7 @@ class CrashRecoveryContextCleanup;
/// CrashRecoveryContext::Enable(), and then executing unsafe operations via a
/// CrashRecoveryContext object. For example:
///
+/// \code
/// void actual_work(void *);
///
/// void foo() {
@@ -38,6 +39,11 @@ class CrashRecoveryContextCleanup;
///
/// ... no crash was detected ...
/// }
+/// \endcode
+///
+/// To assist recovery, the class allows specifying a set of actions that will
+/// be executed in any case, whether a crash occurs or not. These actions may be
+/// used to reclaim resources in the case of a crash.
class CrashRecoveryContext {
void *Impl;
CrashRecoveryContextCleanup *head;
@@ -46,24 +52,27 @@ public:
CrashRecoveryContext() : Impl(nullptr), head(nullptr) {}
~CrashRecoveryContext();
+ /// Register a cleanup handler, which is used when the recovery context is
+ /// finished. The recovery context owns the handler.
void registerCleanup(CrashRecoveryContextCleanup *cleanup);
+
void unregisterCleanup(CrashRecoveryContextCleanup *cleanup);
- /// \brief Enable crash recovery.
+ /// Enable crash recovery.
static void Enable();
- /// \brief Disable crash recovery.
+ /// Disable crash recovery.
static void Disable();
- /// \brief Return the active context, if the code is currently executing in a
+ /// Return the active context, if the code is currently executing in a
/// thread which is in a protected context.
static CrashRecoveryContext *GetCurrent();
- /// \brief Return true if the current thread is recovering from a
- /// crash.
+ /// Return true if the current thread is recovering from a crash.
static bool isRecoveringFromCrash();
- /// \brief Execute the provide callback function (with the given arguments) in
+ /// Execute the provided callback function (with the given arguments) in
/// a protected context.
///
/// \return True if the function completed successfully, and false if the
@@ -75,7 +84,7 @@ public:
return RunSafely([&]() { Fn(UserData); });
}
- /// \brief Execute the provide callback function (with the given arguments) in
+ /// Execute the provided callback function (with the given arguments) in
/// a protected context which is run in another thread (optionally with a
/// requested stack size).
///
@@ -89,11 +98,18 @@ public:
return RunSafelyOnThread([&]() { Fn(UserData); }, RequestedStackSize);
}
- /// \brief Explicitly trigger a crash recovery in the current process, and
+ /// Explicitly trigger a crash recovery in the current process, and
/// return failure from RunSafely(). This function does not return.
void HandleCrash();
};
+/// Abstract base class of cleanup handlers.
+///
+/// Derived classes override the method recoverResources, which does the actual
+/// work of resource recovery.
+///
+/// Cleanup handlers are stored in a double list, which is owned and managed by
+/// a crash recovery context.
class CrashRecoveryContextCleanup {
protected:
CrashRecoveryContext *context;
@@ -115,7 +131,18 @@ private:
CrashRecoveryContextCleanup *prev, *next;
};
-template<typename DERIVED, typename T>
+/// Base class of cleanup handler that controls recovery of resources of the
+/// given type.
+///
+/// \tparam Derived Class that uses this class as a base.
+/// \tparam T Type of controlled resource.
+///
+/// This class serves as a base for its template parameter, as implied by the
+/// Curiously Recurring Template Pattern.
+///
+/// This class factors out creation of a cleanup handler. The latter requires
+/// knowledge of the current recovery context, which is provided by this class.
+template<typename Derived, typename T>
class CrashRecoveryContextCleanupBase : public CrashRecoveryContextCleanup {
protected:
T *resource;
@@ -123,15 +150,20 @@ protected:
: CrashRecoveryContextCleanup(context), resource(resource) {}
public:
- static DERIVED *create(T *x) {
+ /// Creates a cleanup handler.
+ /// \param x Pointer to the resource recovered by this handler.
+ /// \return New handler or null if the method was called outside a recovery
+ /// context.
+ static Derived *create(T *x) {
if (x) {
if (CrashRecoveryContext *context = CrashRecoveryContext::GetCurrent())
- return new DERIVED(context, x);
+ return new Derived(context, x);
}
return nullptr;
}
};
+/// Cleanup handler that reclaims resource by calling destructor on it.
template <typename T>
class CrashRecoveryContextDestructorCleanup : public
CrashRecoveryContextCleanupBase<CrashRecoveryContextDestructorCleanup<T>, T> {
@@ -146,6 +178,7 @@ public:
}
};
+/// Cleanup handler that reclaims resource by calling 'delete' on it.
template <typename T>
class CrashRecoveryContextDeleteCleanup : public
CrashRecoveryContextCleanupBase<CrashRecoveryContextDeleteCleanup<T>, T> {
@@ -157,10 +190,10 @@ public:
void recoverResources() override { delete this->resource; }
};
+/// Cleanup handler that reclaims resource by calling its method 'Release'.
template <typename T>
class CrashRecoveryContextReleaseRefCleanup : public
- CrashRecoveryContextCleanupBase<CrashRecoveryContextReleaseRefCleanup<T>, T>
-{
+ CrashRecoveryContextCleanupBase<CrashRecoveryContextReleaseRefCleanup<T>, T> {
public:
CrashRecoveryContextReleaseRefCleanup(CrashRecoveryContext *context,
T *resource)
@@ -170,6 +203,37 @@ public:
void recoverResources() override { this->resource->Release(); }
};
+/// Helper class for managing resource cleanups.
+///
+/// \tparam T Type of resource being reclaimed.
+/// \tparam Cleanup Class that defines how the resource is reclaimed.
+///
+/// Clients create objects of this type in the code executed in a crash recovery
+/// context to ensure that the resource will be reclaimed even in the case of
+/// a crash. For example:
+///
+/// \code
+/// void actual_work(void *) {
+/// ...
+/// std::unique_ptr<Resource> R(new Resource());
+/// CrashRecoveryContextCleanupRegistrar D(R.get());
+/// ...
+/// }
+///
+/// void foo() {
+/// CrashRecoveryContext CRC;
+///
+/// if (!CRC.RunSafely(actual_work, 0)) {
+/// ... a crash was detected, report error to user ...
+/// }
+/// }
+/// \endcode
+///
+/// If the code of `actual_work` in the example above does not crash, the
+/// destructor of CrashRecoveryContextCleanupRegistrar removes cleanup code from
+/// the current CrashRecoveryContext and the resource is reclaimed by the
+/// destructor of std::unique_ptr. If a crash happens, destructors are not called
+/// and the resource is reclaimed by the cleanup object registered in the recovery
+/// context by the constructor of CrashRecoveryContextCleanupRegistrar.
template <typename T, typename Cleanup = CrashRecoveryContextDeleteCleanup<T> >
class CrashRecoveryContextCleanupRegistrar {
CrashRecoveryContextCleanup *cleanup;
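For orientation, a minimal sketch of how the pieces above fit together: a protected region run through RunSafely(), with a cleanup registrar guarding a heap allocation. `Resource` is a hypothetical type introduced only for the example.

  #include "llvm/Support/CrashRecoveryContext.h"
  #include <memory>

  struct Resource { /* hypothetical resource type */ };

  static void actualWork() {
    std::unique_ptr<Resource> R(new Resource());
    // The registered cleanup deletes R only if a crash prevents the
    // destructors below from running; otherwise its own destructor
    // unregisters it and unique_ptr reclaims the resource as usual.
    llvm::CrashRecoveryContextCleanupRegistrar<Resource> Cleanup(R.get());
    // ... work that might crash ...
  }

  static bool runGuarded() {
    llvm::CrashRecoveryContext::Enable();   // install the crash handlers once
    llvm::CrashRecoveryContext CRC;
    return CRC.RunSafely([] { actualWork(); });   // false if a crash was recovered
  }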
diff --git a/contrib/llvm/include/llvm/Support/DJB.h b/contrib/llvm/include/llvm/Support/DJB.h
new file mode 100644
index 000000000000..e03111473362
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/DJB.h
@@ -0,0 +1,33 @@
+//===-- llvm/Support/DJB.h ---DJB Hash --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for the DJ Bernstein hash function.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_DJB_H
+#define LLVM_SUPPORT_DJB_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+/// The Bernstein hash function used by the DWARF accelerator tables.
+inline uint32_t djbHash(StringRef Buffer, uint32_t H = 5381) {
+ for (unsigned char C : Buffer.bytes())
+ H = (H << 5) + H + C;
+ return H;
+}
+
+/// Computes the Bernstein hash after folding the input according to the DWARF 5
+/// standard case folding rules.
+uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_DJB_H
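As a quick illustration of the function above (a sketch, not part of the header): each byte folds in as H = 33*H + C, so the empty string hashes to the seed 5381.

  #include "llvm/Support/DJB.h"
  #include <cassert>

  void djbHashExample() {
    assert(llvm::djbHash("") == 5381u);               // empty input returns the seed
    assert(llvm::djbHash("a") == 5381u * 33 + 'a');   // one byte: H = 33*H + C
  }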
diff --git a/contrib/llvm/include/llvm/Support/DataExtractor.h b/contrib/llvm/include/llvm/Support/DataExtractor.h
index 31447882a919..3a6ada6c77df 100644
--- a/contrib/llvm/include/llvm/Support/DataExtractor.h
+++ b/contrib/llvm/include/llvm/Support/DataExtractor.h
@@ -51,13 +51,13 @@ public:
DataExtractor(StringRef Data, bool IsLittleEndian, uint8_t AddressSize)
: Data(Data), IsLittleEndian(IsLittleEndian), AddressSize(AddressSize) {}
- /// \brief Get the data pointed to by this extractor.
+ /// Get the data pointed to by this extractor.
StringRef getData() const { return Data; }
- /// \brief Get the endianness for this extractor.
+ /// Get the endianness for this extractor.
bool isLittleEndian() const { return IsLittleEndian; }
- /// \brief Get the address size for this extractor.
+ /// Get the address size for this extractor.
uint8_t getAddressSize() const { return AddressSize; }
- /// \brief Set the address size for this extractor.
+ /// Set the address size for this extractor.
void setAddressSize(uint8_t Size) { AddressSize = Size; }
/// Extract a C string from \a *offset_ptr.
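A small usage sketch of the accessors documented above, assuming the existing getU32() extractor method; the four-byte payload is purely illustrative.

  #include "llvm/Support/DataExtractor.h"

  uint32_t readFirstWord() {
    llvm::DataExtractor DE(llvm::StringRef("\x2a\x00\x00\x00", 4),
                           /*IsLittleEndian=*/true, /*AddressSize=*/4);
    uint32_t Offset = 0;
    return DE.getU32(&Offset);   // 42; Offset advances to 4
  }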
diff --git a/contrib/llvm/include/llvm/Support/DataTypes.h b/contrib/llvm/include/llvm/Support/DataTypes.h
new file mode 100644
index 000000000000..ad60a5b3f300
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/DataTypes.h
@@ -0,0 +1,17 @@
+//===-- llvm/Support/DataTypes.h - Define fixed size types ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Due to layering constraints (Support depends on llvm-c) this is a thin
+// wrapper around the implementation that lives in llvm-c, though most clients
+// can/should think of this as being provided by Support for simplicity (not
+// many clients are aware of their dependency on llvm-c).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/DataTypes.h"
diff --git a/contrib/llvm/include/llvm/Support/Debug.h b/contrib/llvm/include/llvm/Support/Debug.h
index 48e9e1bc167d..980abfb0e8da 100644
--- a/contrib/llvm/include/llvm/Support/Debug.h
+++ b/contrib/llvm/include/llvm/Support/Debug.h
@@ -11,17 +11,18 @@
// code, without it being enabled all of the time, and without having to add
// command line options to enable it.
//
-// In particular, just wrap your code with the DEBUG() macro, and it will be
-// enabled automatically if you specify '-debug' on the command-line.
-// DEBUG() requires the DEBUG_TYPE macro to be defined. Set it to "foo" specify
-// that your debug code belongs to class "foo". Be careful that you only do
-// this after including Debug.h and not around any #include of headers. Headers
-// should define and undef the macro acround the code that needs to use the
-// DEBUG() macro. Then, on the command line, you can specify '-debug-only=foo'
-// to enable JUST the debug information for the foo class.
+// In particular, just wrap your code with the LLVM_DEBUG() macro, and it will
+// be enabled automatically if you specify '-debug' on the command-line.
+// LLVM_DEBUG() requires the DEBUG_TYPE macro to be defined. Set it to "foo"
+// to specify that your debug code belongs to class "foo". Be careful that you
+// only do this after including Debug.h and not around any #include of headers.
+// Headers should define and undef the macro around the code that needs to use
+// the LLVM_DEBUG() macro. Then, on the command line, you can specify
+// '-debug-only=foo' to enable JUST the debug information for the foo class.
//
// When compiling without assertions, the -debug-* options and all code in
-// DEBUG() statements disappears, so it does not affect the runtime of the code.
+// LLVM_DEBUG() statements disappears, so it does not affect the runtime of the
+// code.
//
//===----------------------------------------------------------------------===//
@@ -113,9 +114,9 @@ raw_ostream &dbgs();
// debug build, then the code specified as the option to the macro will be
// executed. Otherwise it will not be. Example:
//
-// DEBUG(dbgs() << "Bitset contains: " << Bitset << "\n");
+// LLVM_DEBUG(dbgs() << "Bitset contains: " << Bitset << "\n");
//
-#define DEBUG(X) DEBUG_WITH_TYPE(DEBUG_TYPE, X)
+#define LLVM_DEBUG(X) DEBUG_WITH_TYPE(DEBUG_TYPE, X)
} // end namespace llvm
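A minimal sketch of the renamed macro in use ("mypass" is an arbitrary example tag): the statement only runs in asserts builds when -debug or -debug-only=mypass is given.

  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"

  #define DEBUG_TYPE "mypass"

  void runMyPass() {
    LLVM_DEBUG(llvm::dbgs() << "mypass: starting transformation\n");
  }

  #undef DEBUG_TYPE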
diff --git a/contrib/llvm/include/llvm/Support/DebugCounter.h b/contrib/llvm/include/llvm/Support/DebugCounter.h
index 52e1bd71a2f2..250fc6bb1f5c 100644
--- a/contrib/llvm/include/llvm/Support/DebugCounter.h
+++ b/contrib/llvm/include/llvm/Support/DebugCounter.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
/// \file
-/// \brief This file provides an implementation of debug counters. Debug
+/// This file provides an implementation of debug counters. Debug
/// counters are a tool that let you narrow down a miscompilation to a specific
/// thing happening.
///
@@ -55,7 +55,7 @@ namespace llvm {
class DebugCounter {
public:
- /// \brief Returns a reference to the singleton instance.
+ /// Returns a reference to the singleton instance.
static DebugCounter &instance();
// Used by the command line option parser to push a new value it parsed.
@@ -77,23 +77,19 @@ public:
auto &Us = instance();
auto Result = Us.Counters.find(CounterName);
if (Result != Us.Counters.end()) {
- auto &CounterPair = Result->second;
- // We only execute while the skip (first) is zero and the count (second)
- // is non-zero.
+ auto &CounterInfo = Result->second;
+ ++CounterInfo.Count;
+
+ // We only execute while Count is greater than Skip, and while
+ // Count does not exceed StopAfter + Skip.
// Negative counters always execute.
- if (CounterPair.first < 0)
+ if (CounterInfo.Skip < 0)
return true;
- if (CounterPair.first != 0) {
- --CounterPair.first;
+ if (CounterInfo.Skip >= CounterInfo.Count)
return false;
- }
- if (CounterPair.second < 0)
- return true;
- if (CounterPair.second != 0) {
- --CounterPair.second;
+ if (CounterInfo.StopAfter < 0)
return true;
- }
- return false;
+ return CounterInfo.StopAfter + CounterInfo.Skip >= CounterInfo.Count;
}
// Didn't find the counter, should we warn?
return true;
@@ -104,21 +100,21 @@ public:
// the command line). This will return true even if those values are
// currently in a state where the counter will always execute.
static bool isCounterSet(unsigned ID) {
- return instance().Counters.count(ID);
+ return instance().Counters[ID].IsSet;
}
- // Return the skip and count for a counter. This only works for set counters.
- static std::pair<int, int> getCounterValue(unsigned ID) {
+ // Return the Count for a counter. This only works for set counters.
+ static int64_t getCounterValue(unsigned ID) {
auto &Us = instance();
auto Result = Us.Counters.find(ID);
assert(Result != Us.Counters.end() && "Asking about a non-set counter");
- return Result->second;
+ return Result->second.Count;
}
- // Set a registered counter to a given value.
- static void setCounterValue(unsigned ID, const std::pair<int, int> &Val) {
+ // Set a registered counter to a given Count value.
+ static void setCounterValue(unsigned ID, int64_t Count) {
auto &Us = instance();
- Us.Counters[ID] = Val;
+ Us.Counters[ID].Count = Count;
}
// Dump or print the current counter set into llvm::dbgs().
@@ -136,7 +132,7 @@ public:
// Return the name and description of the counter with the given ID.
std::pair<std::string, std::string> getCounterInfo(unsigned ID) const {
- return std::make_pair(RegisteredCounters[ID], CounterDesc.lookup(ID));
+ return std::make_pair(RegisteredCounters[ID], Counters.lookup(ID).Desc);
}
// Iterate through the registered counters
@@ -149,11 +145,19 @@ public:
private:
unsigned addCounter(const std::string &Name, const std::string &Desc) {
unsigned Result = RegisteredCounters.insert(Name);
- CounterDesc[Result] = Desc;
+ Counters[Result] = {};
+ Counters[Result].Desc = Desc;
return Result;
}
- DenseMap<unsigned, std::pair<long, long>> Counters;
- DenseMap<unsigned, std::string> CounterDesc;
+ // Struct to store counter info.
+ struct CounterInfo {
+ int64_t Count = 0;
+ int64_t Skip = 0;
+ int64_t StopAfter = -1;
+ bool IsSet = false;
+ std::string Desc;
+ };
+ DenseMap<unsigned, CounterInfo> Counters;
CounterVector RegisteredCounters;
};
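To make the new Skip/StopAfter semantics concrete, here is a hedged sketch using the DEBUG_COUNTER registration macro from this header; with Skip=2 and StopAfter=3 the transformation fires only on calls 3 through 5.

  #include "llvm/Support/DebugCounter.h"
  using namespace llvm;

  DEBUG_COUNTER(MyXform, "my-xform",
                "Controls which applications of my-xform are performed");

  bool tryMyTransform() {
    if (!DebugCounter::shouldExecute(MyXform))
      return false;   // bisected away via the -debug-counter option
    // ... perform the transformation ...
    return true;
  }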
diff --git a/contrib/llvm/include/llvm/Support/DynamicLibrary.h b/contrib/llvm/include/llvm/Support/DynamicLibrary.h
index 469d5dfad062..9563b483f6d5 100644
--- a/contrib/llvm/include/llvm/Support/DynamicLibrary.h
+++ b/contrib/llvm/include/llvm/Support/DynamicLibrary.h
@@ -64,7 +64,7 @@ namespace sys {
/// if the library fails to load.
///
/// It is safe to call this function multiple times for the same library.
- /// @brief Open a dynamic library permanently.
+ /// Open a dynamic library permanently.
static DynamicLibrary getPermanentLibrary(const char *filename,
std::string *errMsg = nullptr);
@@ -110,10 +110,10 @@ namespace sys {
/// search permanently loaded libraries (getPermanentLibrary()) as well
/// as explicitly registered symbols (AddSymbol()).
/// @throws std::string on error.
- /// @brief Search through libraries for address of a symbol
+ /// Search through libraries for address of a symbol
static void *SearchForAddressOfSymbol(const char *symbolName);
- /// @brief Convenience function for C++ophiles.
+ /// Convenience function for C++ophiles.
static void *SearchForAddressOfSymbol(const std::string &symbolName) {
return SearchForAddressOfSymbol(symbolName.c_str());
}
@@ -121,7 +121,7 @@ namespace sys {
/// This functions permanently adds the symbol \p symbolName with the
/// value \p symbolValue. These symbols are searched before any
/// libraries.
- /// @brief Add searchable symbol/value pair.
+ /// Add searchable symbol/value pair.
static void AddSymbol(StringRef symbolName, void *symbolValue);
class HandleSet;
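A hedged sketch of the lookup flow documented above; "libm.so" and "cos" are only illustrative values and error handling is kept minimal.

  #include "llvm/Support/DynamicLibrary.h"
  #include <string>

  double cosViaDynamicLibrary(double X) {
    std::string Err;
    // Load once; subsequent symbol searches cover permanently loaded libraries.
    llvm::sys::DynamicLibrary::getPermanentLibrary("libm.so", &Err);
    void *Sym = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol("cos");
    return Sym ? reinterpret_cast<double (*)(double)>(Sym)(X) : 0.0;
  }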
diff --git a/contrib/llvm/include/llvm/Support/Endian.h b/contrib/llvm/include/llvm/Support/Endian.h
index f50d9b502daf..a4d3f4ff793d 100644
--- a/contrib/llvm/include/llvm/Support/Endian.h
+++ b/contrib/llvm/include/llvm/Support/Endian.h
@@ -34,7 +34,7 @@ enum {aligned = 0, unaligned = 1};
namespace detail {
-/// \brief ::value is either alignment, or alignof(T) if alignment is 0.
+/// ::value is either alignment, or alignof(T) if alignment is 0.
template<class T, int alignment>
struct PickAlignment {
enum { value = alignment == 0 ? alignof(T) : alignment };
diff --git a/contrib/llvm/include/llvm/Support/EndianStream.h b/contrib/llvm/include/llvm/Support/EndianStream.h
index 43ecd4a5c97e..9742e253ad3e 100644
--- a/contrib/llvm/include/llvm/Support/EndianStream.h
+++ b/contrib/llvm/include/llvm/Support/EndianStream.h
@@ -23,44 +23,44 @@ namespace llvm {
namespace support {
namespace endian {
-/// Adapter to write values to a stream in a particular byte order.
-template <endianness endian> struct Writer {
- raw_ostream &OS;
- Writer(raw_ostream &OS) : OS(OS) {}
- template <typename value_type> void write(ArrayRef<value_type> Vals) {
- for (value_type V : Vals)
- write(V);
- }
- template <typename value_type> void write(value_type Val) {
- Val = byte_swap<value_type, endian>(Val);
- OS.write((const char *)&Val, sizeof(value_type));
- }
-};
-template <>
-template <>
-inline void Writer<little>::write<float>(float Val) {
- write(FloatToBits(Val));
+template <typename value_type>
+inline void write(raw_ostream &os, value_type value, endianness endian) {
+ value = byte_swap<value_type>(value, endian);
+ os.write((const char *)&value, sizeof(value_type));
}
template <>
-template <>
-inline void Writer<little>::write<double>(double Val) {
- write(DoubleToBits(Val));
+inline void write<float>(raw_ostream &os, float value, endianness endian) {
+ write(os, FloatToBits(value), endian);
}
template <>
-template <>
-inline void Writer<big>::write<float>(float Val) {
- write(FloatToBits(Val));
+inline void write<double>(raw_ostream &os, double value,
+ endianness endian) {
+ write(os, DoubleToBits(value), endian);
}
-template <>
-template <>
-inline void Writer<big>::write<double>(double Val) {
- write(DoubleToBits(Val));
+template <typename value_type>
+inline void write(raw_ostream &os, ArrayRef<value_type> vals,
+ endianness endian) {
+ for (value_type v : vals)
+ write(os, v, endian);
}
+/// Adapter to write values to a stream in a particular byte order.
+struct Writer {
+ raw_ostream &OS;
+ endianness Endian;
+ Writer(raw_ostream &OS, endianness Endian) : OS(OS), Endian(Endian) {}
+ template <typename value_type> void write(ArrayRef<value_type> Val) {
+ endian::write(OS, Val, Endian);
+ }
+ template <typename value_type> void write(value_type Val) {
+ endian::write(OS, Val, Endian);
+ }
+};
+
} // end namespace endian
} // end namespace support
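A short sketch of the reworked adapter: the byte order is now a constructor argument instead of a template parameter, so a single Writer can be chosen at run time.

  #include "llvm/Support/EndianStream.h"
  #include "llvm/Support/raw_ostream.h"
  #include <cstdint>

  void writeHeader(llvm::raw_ostream &OS, bool LE) {
    llvm::support::endian::Writer W(
        OS, LE ? llvm::support::little : llvm::support::big);
    W.write<uint32_t>(0xdeadbeefu);   // byte-swapped as needed for the chosen order
    W.write<float>(1.0f);             // floats are routed through FloatToBits
  }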
diff --git a/contrib/llvm/include/llvm/Support/Errc.h b/contrib/llvm/include/llvm/Support/Errc.h
index 80bfe2ac2ee5..dce42782a0d3 100644
--- a/contrib/llvm/include/llvm/Support/Errc.h
+++ b/contrib/llvm/include/llvm/Support/Errc.h
@@ -63,6 +63,7 @@ enum class errc {
no_such_process = int(std::errc::no_such_process),
not_a_directory = int(std::errc::not_a_directory),
not_enough_memory = int(std::errc::not_enough_memory),
+ not_supported = int(std::errc::not_supported),
operation_not_permitted = int(std::errc::operation_not_permitted),
permission_denied = int(std::errc::permission_denied),
read_only_file_system = int(std::errc::read_only_file_system),
diff --git a/contrib/llvm/include/llvm/Support/Errno.h b/contrib/llvm/include/llvm/Support/Errno.h
index 35dc1ea7cf84..8069c3639df3 100644
--- a/contrib/llvm/include/llvm/Support/Errno.h
+++ b/contrib/llvm/include/llvm/Support/Errno.h
@@ -34,9 +34,10 @@ template <typename FailT, typename Fun, typename... Args>
inline auto RetryAfterSignal(const FailT &Fail, const Fun &F,
const Args &... As) -> decltype(F(As...)) {
decltype(F(As...)) Res;
- do
+ do {
+ errno = 0;
Res = F(As...);
- while (Res == Fail && errno == EINTR);
+ } while (Res == Fail && errno == EINTR);
return Res;
}
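A usage sketch of the helper above: the call is retried while it fails with EINTR, and (after this change) errno is cleared before every attempt so a stale value cannot cause a spurious retry.

  #include "llvm/Support/Errno.h"
  #include <fcntl.h>

  int openRetryingOnSignal(const char *Path) {
    // Retries ::open as long as it returns -1 with errno == EINTR.
    return llvm::sys::RetryAfterSignal(-1, ::open, Path, O_RDONLY);
  }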
diff --git a/contrib/llvm/include/llvm/Support/Error.h b/contrib/llvm/include/llvm/Support/Error.h
index 8567af392fb0..8015cab45a06 100644
--- a/contrib/llvm/include/llvm/Support/Error.h
+++ b/contrib/llvm/include/llvm/Support/Error.h
@@ -24,6 +24,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -302,6 +303,14 @@ private:
return Tmp;
}
+ friend raw_ostream &operator<<(raw_ostream &OS, const Error &E) {
+ if (auto P = E.getPtr())
+ P->log(OS);
+ else
+ OS << "success";
+ return OS;
+ }
+
ErrorInfoBase *Payload = nullptr;
};
@@ -421,7 +430,7 @@ template <class T> class LLVM_NODISCARD Expected {
static const bool isRef = std::is_reference<T>::value;
- using wrap = ReferenceStorage<typename std::remove_reference<T>::type>;
+ using wrap = std::reference_wrapper<typename std::remove_reference<T>::type>;
using error_type = std::unique_ptr<ErrorInfoBase>;
@@ -505,7 +514,7 @@ public:
getErrorStorage()->~error_type();
}
- /// \brief Return false if there is an error.
+ /// Return false if there is an error.
explicit operator bool() {
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
Unchecked = HasError;
@@ -513,24 +522,24 @@ public:
return !HasError;
}
- /// \brief Returns a reference to the stored T value.
+ /// Returns a reference to the stored T value.
reference get() {
assertIsChecked();
return *getStorage();
}
- /// \brief Returns a const reference to the stored T value.
+ /// Returns a const reference to the stored T value.
const_reference get() const {
assertIsChecked();
return const_cast<Expected<T> *>(this)->get();
}
- /// \brief Check that this Expected<T> is an error of type ErrT.
+ /// Check that this Expected<T> is an error of type ErrT.
template <typename ErrT> bool errorIsA() const {
return HasError && (*getErrorStorage())->template isA<ErrT>();
}
- /// \brief Take ownership of the stored error.
+ /// Take ownership of the stored error.
/// After calling this the Expected<T> is in an indeterminate state that can
/// only be safely destructed. No further calls (besides the destructor) should
/// be made on the Expected<T> value.
@@ -541,25 +550,25 @@ public:
return HasError ? Error(std::move(*getErrorStorage())) : Error::success();
}
- /// \brief Returns a pointer to the stored T value.
+ /// Returns a pointer to the stored T value.
pointer operator->() {
assertIsChecked();
return toPointer(getStorage());
}
- /// \brief Returns a const pointer to the stored T value.
+ /// Returns a const pointer to the stored T value.
const_pointer operator->() const {
assertIsChecked();
return toPointer(getStorage());
}
- /// \brief Returns a reference to the stored T value.
+ /// Returns a reference to the stored T value.
reference operator*() {
assertIsChecked();
return *getStorage();
}
- /// \brief Returns a const reference to the stored T value.
+ /// Returns a const reference to the stored T value.
const_reference operator*() const {
assertIsChecked();
return *getStorage();
@@ -882,16 +891,16 @@ Error handleErrors(Error E, HandlerTs &&... Hs) {
return handleErrorImpl(std::move(Payload), std::forward<HandlerTs>(Hs)...);
}
-/// Behaves the same as handleErrors, except that it requires that all
-/// errors be handled by the given handlers. If any unhandled error remains
-/// after the handlers have run, report_fatal_error() will be called.
+/// Behaves the same as handleErrors, except that by contract all errors
+/// *must* be handled by the given handlers (i.e. there must be no remaining
+/// errors after running the handlers, or llvm_unreachable is called).
template <typename... HandlerTs>
void handleAllErrors(Error E, HandlerTs &&... Handlers) {
cantFail(handleErrors(std::move(E), std::forward<HandlerTs>(Handlers)...));
}
/// Check that E is a non-error, then drop it.
-/// If E is an error report_fatal_error will be called.
+/// If E is an error, llvm_unreachable will be called.
inline void handleAllErrors(Error E) {
cantFail(std::move(E));
}
@@ -963,6 +972,18 @@ inline void consumeError(Error Err) {
handleAllErrors(std::move(Err), [](const ErrorInfoBase &) {});
}
+/// Helper for converting an Error to a bool.
+///
+/// This method returns true if Err is in an error state, or false if it is
+/// in a success state. Puts Err in a checked state in both cases (unlike
+/// Error::operator bool(), which only does this for success states).
+inline bool errorToBool(Error Err) {
+ bool IsError = static_cast<bool>(Err);
+ if (IsError)
+ consumeError(std::move(Err));
+ return IsError;
+}
+
/// Helper for Errors used as out-parameters.
///
/// This helper is for use with the Error-as-out-parameter idiom, where an error
@@ -1101,6 +1122,18 @@ private:
std::error_code EC;
};
+/// Create formatted StringError object.
+template <typename... Ts>
+Error createStringError(std::error_code EC, char const *Fmt,
+ const Ts &... Vals) {
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ Stream << format(Fmt, Vals...);
+ return make_error<StringError>(Stream.str(), EC);
+}
+
+Error createStringError(std::error_code EC, char const *Msg);
+
/// Helper for check-and-exit error handling.
///
/// For tool use only. NOT FOR USE IN LIBRARY CODE.
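A brief sketch combining the two additions above: createStringError builds a formatted StringError, and errorToBool collapses the result into a checked bool.

  #include "llvm/Support/Error.h"

  llvm::Error checkCount(int N) {
    if (N < 0)
      return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                     "invalid count: %d", N);
    return llvm::Error::success();
  }

  bool countIsBad(int N) {
    return llvm::errorToBool(checkCount(N));   // true on error; the Error is consumed
  }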
diff --git a/contrib/llvm/include/llvm/Support/ErrorHandling.h b/contrib/llvm/include/llvm/Support/ErrorHandling.h
index b45f6348390e..39cbfed2436a 100644
--- a/contrib/llvm/include/llvm/Support/ErrorHandling.h
+++ b/contrib/llvm/include/llvm/Support/ErrorHandling.h
@@ -100,6 +100,8 @@ void install_bad_alloc_error_handler(fatal_error_handler_t handler,
/// Restores default bad alloc error handling behavior.
void remove_bad_alloc_error_handler();
+void install_out_of_memory_new_handler();
+
/// Reports a bad alloc error, calling any user defined bad alloc
/// error handler. In contrast to the generic 'report_fatal_error'
/// functions, this function is expected to return, e.g. the user
@@ -110,7 +112,7 @@ void remove_bad_alloc_error_handler();
/// in the unwind chain.
///
/// If no error handler is installed (default), then a bad_alloc exception
-/// is thrown if LLVM is compiled with exception support, otherwise an assertion
+/// is thrown if LLVM is compiled with exception support; otherwise an assertion
/// is called.
void report_bad_alloc_error(const char *Reason, bool GenCrashDiag = true);
diff --git a/contrib/llvm/include/llvm/Support/ErrorOr.h b/contrib/llvm/include/llvm/Support/ErrorOr.h
index 061fb65db465..e6ce764ad822 100644
--- a/contrib/llvm/include/llvm/Support/ErrorOr.h
+++ b/contrib/llvm/include/llvm/Support/ErrorOr.h
@@ -24,19 +24,7 @@
namespace llvm {
-/// \brief Stores a reference that can be changed.
-template <typename T>
-class ReferenceStorage {
- T *Storage;
-
-public:
- ReferenceStorage(T &Ref) : Storage(&Ref) {}
-
- operator T &() const { return *Storage; }
- T &get() const { return *Storage; }
-};
-
-/// \brief Represents either an error or a value T.
+/// Represents either an error or a value T.
///
/// ErrorOr<T> is a pointer-like class that represents the result of an
/// operation. The result is either an error, or a value of type T. This is
@@ -71,7 +59,7 @@ class ErrorOr {
static const bool isRef = std::is_reference<T>::value;
- using wrap = ReferenceStorage<typename std::remove_reference<T>::type>;
+ using wrap = std::reference_wrapper<typename std::remove_reference<T>::type>;
public:
using storage_type = typename std::conditional<isRef, wrap, T>::type;
@@ -161,7 +149,7 @@ public:
getStorage()->~storage_type();
}
- /// \brief Return false if there is an error.
+ /// Return false if there is an error.
explicit operator bool() const {
return !HasError;
}
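For reference, a small sketch of the ErrorOr pattern described above; operator bool is false when an error code is stored.

  #include "llvm/Support/ErrorOr.h"
  #include <system_error>

  llvm::ErrorOr<int> parseDigit(char C) {
    if (C < '0' || C > '9')
      return std::make_error_code(std::errc::invalid_argument);
    return C - '0';
  }

  int digitOrZero(char C) {
    llvm::ErrorOr<int> D = parseDigit(C);
    return D ? *D : 0;   // falls back to 0 on error
  }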
diff --git a/contrib/llvm/include/llvm/Support/FileOutputBuffer.h b/contrib/llvm/include/llvm/Support/FileOutputBuffer.h
index 6aed423a01e3..ee8cbb730878 100644
--- a/contrib/llvm/include/llvm/Support/FileOutputBuffer.h
+++ b/contrib/llvm/include/llvm/Support/FileOutputBuffer.h
@@ -30,13 +30,25 @@ namespace llvm {
/// not committed, the file will be deleted in the FileOutputBuffer destructor.
class FileOutputBuffer {
public:
- enum {
- F_executable = 1 /// set the 'x' bit on the resulting file
+ enum {
+ /// set the 'x' bit on the resulting file
+ F_executable = 1,
+
+ /// the contents of the new file are initialized from the file that exists
+ /// at the location (if present). This allows in-place modification of an
+ /// existing file.
+ F_modify = 2
};
/// Factory method to create an OutputBuffer object which manages a read/write
/// buffer of the specified size. When committed, the buffer will be written
/// to the file at the specified path.
+ ///
+ /// When F_modify is specified and \p FilePath refers to an existing on-disk
+ /// file, \p Size may be set to -1, in which case the entire file is used.
+ /// Otherwise, the file shrinks or grows as necessary based on the value of
+ /// \p Size. It is an error to specify F_modify and Size=-1 if \p FilePath
+ /// does not exist.
static Expected<std::unique_ptr<FileOutputBuffer>>
create(StringRef FilePath, size_t Size, unsigned Flags = 0);
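A hedged sketch of the new F_modify mode: open an existing file in place, patch a byte, and commit. The path and the patched offset are illustrative.

  #include "llvm/Support/FileOutputBuffer.h"
  #include <memory>

  llvm::Error patchFirstByte(llvm::StringRef Path) {
    auto BufOrErr = llvm::FileOutputBuffer::create(
        Path, /*Size=*/-1, llvm::FileOutputBuffer::F_modify);  // -1: keep existing size
    if (!BufOrErr)
      return BufOrErr.takeError();
    std::unique_ptr<llvm::FileOutputBuffer> Buf = std::move(*BufOrErr);
    Buf->getBufferStart()[0] = 0;   // in-place modification of the mapped contents
    return Buf->commit();
  }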
diff --git a/contrib/llvm/include/llvm/Support/FileSystem.h b/contrib/llvm/include/llvm/Support/FileSystem.h
index b1683ba5ddb3..02db4596bf1c 100644
--- a/contrib/llvm/include/llvm/Support/FileSystem.h
+++ b/contrib/llvm/include/llvm/Support/FileSystem.h
@@ -30,6 +30,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Chrono.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
@@ -53,6 +54,15 @@ namespace llvm {
namespace sys {
namespace fs {
+#if defined(_WIN32)
+// A Win32 HANDLE is a typedef of void*
+using file_t = void *;
+#else
+using file_t = int;
+#endif
+
+extern const file_t kInvalidFile;
+
/// An enumeration for the file system's view of the type.
enum class file_type {
status_error,
@@ -153,7 +163,7 @@ protected:
uid_t fs_st_uid = 0;
gid_t fs_st_gid = 0;
off_t fs_st_size = 0;
- #elif defined (LLVM_ON_WIN32)
+ #elif defined (_WIN32)
uint32_t LastAccessedTimeHigh = 0;
uint32_t LastAccessedTimeLow = 0;
uint32_t LastWriteTimeHigh = 0;
@@ -174,7 +184,7 @@ public:
uid_t UID, gid_t GID, off_t Size)
: fs_st_atime(ATime), fs_st_mtime(MTime), fs_st_uid(UID), fs_st_gid(GID),
fs_st_size(Size), Type(Type), Perms(Perms) {}
-#elif defined(LLVM_ON_WIN32)
+#elif defined(_WIN32)
basic_file_status(file_type Type, perms Perms, uint32_t LastAccessTimeHigh,
uint32_t LastAccessTimeLow, uint32_t LastWriteTimeHigh,
uint32_t LastWriteTimeLow, uint32_t FileSizeHigh,
@@ -196,7 +206,7 @@ public:
uint32_t getUser() const { return fs_st_uid; }
uint32_t getGroup() const { return fs_st_gid; }
uint64_t getSize() const { return fs_st_size; }
- #elif defined (LLVM_ON_WIN32)
+ #elif defined (_WIN32)
uint32_t getUser() const {
return 9999; // Not applicable to Windows, so...
}
@@ -223,7 +233,7 @@ class file_status : public basic_file_status {
dev_t fs_st_dev = 0;
nlink_t fs_st_nlinks = 0;
ino_t fs_st_ino = 0;
- #elif defined (LLVM_ON_WIN32)
+ #elif defined (_WIN32)
uint32_t NumLinks = 0;
uint32_t VolumeSerialNumber = 0;
uint32_t FileIndexHigh = 0;
@@ -240,7 +250,7 @@ public:
time_t ATime, time_t MTime, uid_t UID, gid_t GID, off_t Size)
: basic_file_status(Type, Perms, ATime, MTime, UID, GID, Size),
fs_st_dev(Dev), fs_st_nlinks(Links), fs_st_ino(Ino) {}
- #elif defined(LLVM_ON_WIN32)
+ #elif defined(_WIN32)
file_status(file_type Type, perms Perms, uint32_t LinkCount,
uint32_t LastAccessTimeHigh, uint32_t LastAccessTimeLow,
uint32_t LastWriteTimeHigh, uint32_t LastWriteTimeLow,
@@ -262,7 +272,7 @@ public:
/// @name Physical Operators
/// @{
-/// @brief Make \a path an absolute path.
+/// Make \a path an absolute path.
///
/// Makes \a path absolute using the \a current_directory if it is not already.
/// An empty \a path will result in the \a current_directory.
@@ -276,7 +286,7 @@ public:
std::error_code make_absolute(const Twine &current_directory,
SmallVectorImpl<char> &path);
-/// @brief Make \a path an absolute path.
+/// Make \a path an absolute path.
///
/// Makes \a path absolute using the current directory if it is not already. An
/// empty \a path will result in the current directory.
@@ -289,7 +299,7 @@ std::error_code make_absolute(const Twine &current_directory,
/// platform-specific error_code.
std::error_code make_absolute(SmallVectorImpl<char> &path);
-/// @brief Create all the non-existent directories in path.
+/// Create all the non-existent directories in path.
///
/// @param path Directories to create.
/// @returns errc::success if is_directory(path), otherwise a platform
@@ -299,7 +309,7 @@ std::error_code create_directories(const Twine &path,
bool IgnoreExisting = true,
perms Perms = owner_all | group_all);
-/// @brief Create the directory in path.
+/// Create the directory in path.
///
/// @param path Directory to create.
/// @returns errc::success if is_directory(path), otherwise a platform
@@ -308,7 +318,7 @@ std::error_code create_directories(const Twine &path,
std::error_code create_directory(const Twine &path, bool IgnoreExisting = true,
perms Perms = owner_all | group_all);
-/// @brief Create a link from \a from to \a to.
+/// Create a link from \a from to \a to.
///
/// The link may be a soft or a hard link, depending on the platform. The caller
/// may not assume which one. Currently on windows it creates a hard link since
@@ -329,7 +339,7 @@ std::error_code create_link(const Twine &to, const Twine &from);
/// specific error_code.
std::error_code create_hard_link(const Twine &to, const Twine &from);
-/// @brief Collapse all . and .. patterns, resolve all symlinks, and optionally
+/// Collapse all . and .. patterns, resolve all symlinks, and optionally
/// expand ~ expressions to the user's home directory.
///
/// @param path The path to resolve.
@@ -339,21 +349,21 @@ std::error_code create_hard_link(const Twine &to, const Twine &from);
std::error_code real_path(const Twine &path, SmallVectorImpl<char> &output,
bool expand_tilde = false);
-/// @brief Get the current path.
+/// Get the current path.
///
/// @param result Holds the current path on return.
/// @returns errc::success if the current path has been stored in result,
/// otherwise a platform-specific error_code.
std::error_code current_path(SmallVectorImpl<char> &result);
-/// @brief Set the current path.
+/// Set the current path.
///
/// @param path The path to set.
/// @returns errc::success if the current path was successfully set,
/// otherwise a platform-specific error_code.
std::error_code set_current_path(const Twine &path);
-/// @brief Remove path. Equivalent to POSIX remove().
+/// Remove path. Equivalent to POSIX remove().
///
/// @param path Input path.
/// @returns errc::success if path has been removed or didn't exist, otherwise a
@@ -361,14 +371,14 @@ std::error_code set_current_path(const Twine &path);
/// returns error if the file didn't exist.
std::error_code remove(const Twine &path, bool IgnoreNonExisting = true);
-/// @brief Recursively delete a directory.
+/// Recursively delete a directory.
///
/// @param path Input path.
/// @returns errc::success if path has been removed or didn't exist, otherwise a
/// platform-specific error code.
std::error_code remove_directories(const Twine &path, bool IgnoreErrors = true);
-/// @brief Rename \a from to \a to.
+/// Rename \a from to \a to.
///
/// Files are renamed as if by POSIX rename(), except that on Windows there may
/// be a short interval of time during which the destination file does not
@@ -378,13 +388,19 @@ std::error_code remove_directories(const Twine &path, bool IgnoreErrors = true);
/// @param to The path to rename to. This is created.
std::error_code rename(const Twine &from, const Twine &to);
-/// @brief Copy the contents of \a From to \a To.
+/// Copy the contents of \a From to \a To.
///
/// @param From The path to copy from.
/// @param To The path to copy to. This is created.
std::error_code copy_file(const Twine &From, const Twine &To);
-/// @brief Resize path to size. File is resized as if by POSIX truncate().
+/// Copy the contents of \a From to \a To.
+///
+/// @param From The path to copy from.
+/// @param ToFD The open file descriptor of the destination file.
+std::error_code copy_file(const Twine &From, int ToFD);
+
+/// Resize path to size. File is resized as if by POSIX truncate().
///
/// @param FD Input file descriptor.
/// @param Size Size to resize to.
@@ -392,21 +408,21 @@ std::error_code copy_file(const Twine &From, const Twine &To);
/// platform-specific error_code.
std::error_code resize_file(int FD, uint64_t Size);
-/// @brief Compute an MD5 hash of a file's contents.
+/// Compute an MD5 hash of a file's contents.
///
/// @param FD Input file descriptor.
/// @returns An MD5Result with the hash computed, if successful, otherwise a
/// std::error_code.
ErrorOr<MD5::MD5Result> md5_contents(int FD);
-/// @brief Version of compute_md5 that doesn't require an open file descriptor.
+/// Version of compute_md5 that doesn't require an open file descriptor.
ErrorOr<MD5::MD5Result> md5_contents(const Twine &Path);
/// @}
/// @name Physical Observers
/// @{
-/// @brief Does file exist?
+/// Does file exist?
///
/// @param status A basic_file_status previously returned from stat.
/// @returns True if the file represented by status exists, false if it does
@@ -415,14 +431,14 @@ bool exists(const basic_file_status &status);
enum class AccessMode { Exist, Write, Execute };
-/// @brief Can the file be accessed?
+/// Can the file be accessed?
///
/// @param Path Input path.
/// @returns errc::success if the path can be accessed, otherwise a
/// platform-specific error_code.
std::error_code access(const Twine &Path, AccessMode Mode);
-/// @brief Does file exist?
+/// Does file exist?
///
/// @param Path Input path.
/// @returns True if it exists, false otherwise.
@@ -430,13 +446,13 @@ inline bool exists(const Twine &Path) {
return !access(Path, AccessMode::Exist);
}
-/// @brief Can we execute this file?
+/// Can we execute this file?
///
/// @param Path Input path.
/// @returns True if we can execute it, false otherwise.
bool can_execute(const Twine &Path);
-/// @brief Can we write this file?
+/// Can we write this file?
///
/// @param Path Input path.
/// @returns True if we can write to it, false otherwise.
@@ -444,7 +460,7 @@ inline bool can_write(const Twine &Path) {
return !access(Path, AccessMode::Write);
}
-/// @brief Do file_status's represent the same thing?
+/// Do file_status's represent the same thing?
///
/// @param A Input file_status.
/// @param B Input file_status.
@@ -455,7 +471,7 @@ inline bool can_write(const Twine &Path) {
/// otherwise.
bool equivalent(file_status A, file_status B);
-/// @brief Do paths represent the same thing?
+/// Do paths represent the same thing?
///
/// assert(status_known(A) || status_known(B));
///
@@ -467,14 +483,14 @@ bool equivalent(file_status A, file_status B);
/// platform-specific error_code.
std::error_code equivalent(const Twine &A, const Twine &B, bool &result);
-/// @brief Simpler version of equivalent for clients that don't need to
+/// Simpler version of equivalent for clients that don't need to
/// differentiate between an error and false.
inline bool equivalent(const Twine &A, const Twine &B) {
bool result;
return !equivalent(A, B, result) && result;
}
-/// @brief Is the file mounted on a local filesystem?
+/// Is the file mounted on a local filesystem?
///
/// @param path Input path.
/// @param result Set to true if \a path is on fixed media such as a hard disk,
@@ -483,24 +499,24 @@ inline bool equivalent(const Twine &A, const Twine &B) {
/// platform specific error_code.
std::error_code is_local(const Twine &path, bool &result);
-/// @brief Version of is_local accepting an open file descriptor.
+/// Version of is_local accepting an open file descriptor.
std::error_code is_local(int FD, bool &result);
-/// @brief Simpler version of is_local for clients that don't need to
+/// Simpler version of is_local for clients that don't need to
/// differentiate between an error and false.
inline bool is_local(const Twine &Path) {
bool Result;
return !is_local(Path, Result) && Result;
}
-/// @brief Simpler version of is_local accepting an open file descriptor for
+/// Simpler version of is_local accepting an open file descriptor for
/// clients that don't need to differentiate between an error and false.
inline bool is_local(int FD) {
bool Result;
return !is_local(FD, Result) && Result;
}
-/// @brief Does status represent a directory?
+/// Does status represent a directory?
///
/// @param Path The path to get the type of.
/// @param Follow For symbolic links, indicates whether to return the file type
@@ -508,13 +524,13 @@ inline bool is_local(int FD) {
/// @returns A value from the file_type enumeration indicating the type of file.
file_type get_file_type(const Twine &Path, bool Follow = true);
-/// @brief Does status represent a directory?
+/// Does status represent a directory?
///
/// @param status A basic_file_status previously returned from status.
/// @returns status.type() == file_type::directory_file.
bool is_directory(const basic_file_status &status);
-/// @brief Is path a directory?
+/// Is path a directory?
///
/// @param path Input path.
/// @param result Set to true if \a path is a directory (after following
@@ -523,20 +539,20 @@ bool is_directory(const basic_file_status &status);
/// platform-specific error_code.
std::error_code is_directory(const Twine &path, bool &result);
-/// @brief Simpler version of is_directory for clients that don't need to
+/// Simpler version of is_directory for clients that don't need to
/// differentiate between an error and false.
inline bool is_directory(const Twine &Path) {
bool Result;
return !is_directory(Path, Result) && Result;
}
-/// @brief Does status represent a regular file?
+/// Does status represent a regular file?
///
/// @param status A basic_file_status previously returned from status.
/// @returns status_known(status) && status.type() == file_type::regular_file.
bool is_regular_file(const basic_file_status &status);
-/// @brief Is path a regular file?
+/// Is path a regular file?
///
/// @param path Input path.
/// @param result Set to true if \a path is a regular file (after following
@@ -545,7 +561,7 @@ bool is_regular_file(const basic_file_status &status);
/// platform-specific error_code.
std::error_code is_regular_file(const Twine &path, bool &result);
-/// @brief Simpler version of is_regular_file for clients that don't need to
+/// Simpler version of is_regular_file for clients that don't need to
/// differentiate between an error and false.
inline bool is_regular_file(const Twine &Path) {
bool Result;
@@ -554,13 +570,13 @@ inline bool is_regular_file(const Twine &Path) {
return Result;
}
-/// @brief Does status represent a symlink file?
+/// Does status represent a symlink file?
///
/// @param status A basic_file_status previously returned from status.
/// @returns status_known(status) && status.type() == file_type::symlink_file.
bool is_symlink_file(const basic_file_status &status);
-/// @brief Is path a symlink file?
+/// Is path a symlink file?
///
/// @param path Input path.
/// @param result Set to true if \a path is a symlink file, false if it is not.
@@ -569,7 +585,7 @@ bool is_symlink_file(const basic_file_status &status);
/// platform-specific error_code.
std::error_code is_symlink_file(const Twine &path, bool &result);
-/// @brief Simpler version of is_symlink_file for clients that don't need to
+/// Simpler version of is_symlink_file for clients that don't need to
/// differentiate between an error and false.
inline bool is_symlink_file(const Twine &Path) {
bool Result;
@@ -578,14 +594,14 @@ inline bool is_symlink_file(const Twine &Path) {
return Result;
}
-/// @brief Does this status represent something that exists but is not a
+/// Does this status represent something that exists but is not a
/// directory or regular file?
///
/// @param status A basic_file_status previously returned from status.
/// @returns exists(s) && !is_regular_file(s) && !is_directory(s)
bool is_other(const basic_file_status &status);
-/// @brief Is path something that exists but is not a directory,
+/// Is path something that exists but is not a directory,
/// regular file, or symlink?
///
/// @param path Input path.
@@ -595,7 +611,7 @@ bool is_other(const basic_file_status &status);
/// platform-specific error_code.
std::error_code is_other(const Twine &path, bool &result);
-/// @brief Get file status as if by POSIX stat().
+/// Get file status as if by POSIX stat().
///
/// @param path Input path.
/// @param result Set to the file status.
@@ -606,10 +622,10 @@ std::error_code is_other(const Twine &path, bool &result);
std::error_code status(const Twine &path, file_status &result,
bool follow = true);
-/// @brief A version for when a file descriptor is already available.
+/// A version for when a file descriptor is already available.
std::error_code status(int FD, file_status &Result);
-/// @brief Set file permissions.
+/// Set file permissions.
///
/// @param Path File to set permissions on.
/// @param Permissions New file permissions.
@@ -620,7 +636,7 @@ std::error_code status(int FD, file_status &Result);
/// Otherwise, the file will be marked as read-only.
std::error_code setPermissions(const Twine &Path, perms Permissions);
-/// @brief Get file permissions.
+/// Get file permissions.
///
/// @param Path File to get permissions from.
/// @returns the permissions if they were successfully retrieved, otherwise a
@@ -630,7 +646,7 @@ std::error_code setPermissions(const Twine &Path, perms Permissions);
/// will be returned.
ErrorOr<perms> getPermissions(const Twine &Path);
-/// @brief Get file size.
+/// Get file size.
///
/// @param Path Input path.
/// @param Result Set to the size of the file in \a Path.
@@ -645,20 +661,20 @@ inline std::error_code file_size(const Twine &Path, uint64_t &Result) {
return std::error_code();
}
-/// @brief Set the file modification and access time.
+/// Set the file modification and access time.
///
/// @returns errc::success if the file times were successfully set, otherwise a
/// platform-specific error_code or errc::function_not_supported on
/// platforms where the functionality isn't available.
std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time);
-/// @brief Is status available?
+/// Is status available?
///
/// @param s Input file status.
/// @returns True if status() != status_error.
bool status_known(const basic_file_status &s);
-/// @brief Is status available?
+/// Is status available?
///
/// @param path Input path.
/// @param result Set to true if status() != status_error.
@@ -666,30 +682,58 @@ bool status_known(const basic_file_status &s);
/// platform-specific error_code.
std::error_code status_known(const Twine &path, bool &result);
-enum OpenFlags : unsigned {
- F_None = 0,
+enum CreationDisposition : unsigned {
+ /// CD_CreateAlways - When opening a file:
+ /// * If it already exists, truncate it.
+ /// * If it does not already exist, create a new file.
+ CD_CreateAlways = 0,
+
+ /// CD_CreateNew - When opening a file:
+ /// * If it already exists, fail.
+ /// * If it does not already exist, create a new file.
+ CD_CreateNew = 1,
+
+ /// CD_OpenExisting - When opening a file:
+ /// * If it already exists, open the file with the offset set to 0.
+ /// * If it does not already exist, fail.
+ CD_OpenExisting = 2,
+
+ /// CD_OpenAlways - When opening a file:
+ /// * If it already exists, open the file with the offset set to 0.
+ /// * If it does not already exist, create a new file.
+ CD_OpenAlways = 3,
+};
- /// F_Excl - When opening a file, this flag makes raw_fd_ostream
- /// report an error if the file already exists.
- F_Excl = 1,
+enum FileAccess : unsigned {
+ FA_Read = 1,
+ FA_Write = 2,
+};
- /// F_Append - When opening a file, if it already exists append to the
- /// existing file instead of returning an error. This may not be specified
- /// with F_Excl.
- F_Append = 2,
+enum OpenFlags : unsigned {
+ OF_None = 0,
+ F_None = 0, // For compatibility
/// The file should be opened in text mode on platforms that make this
/// distinction.
- F_Text = 4,
+ OF_Text = 1,
+ F_Text = 1, // For compatibility
- /// Open the file for read and write.
- F_RW = 8,
+ /// The file should be opened in append mode.
+ OF_Append = 2,
+ F_Append = 2, // For compatibility
/// Delete the file on close. Only makes a difference on windows.
- F_Delete = 16
+ OF_Delete = 4,
+
+ /// When a child process is launched, this file should remain open in the
+ /// child process.
+ OF_ChildInherit = 8,
+
+ /// Force the file's atime to be updated on access. Only makes a difference on Windows.
+ OF_UpdateAtime = 16,
};
-/// @brief Create a uniquely named file.
+/// Create a uniquely named file.
///
/// Generates a unique path suitable for a temporary file and then opens it as a
/// file. The name is based on \a model with '%' replaced by a random char in
@@ -712,12 +756,13 @@ enum OpenFlags : unsigned {
/// otherwise a platform-specific error_code.
std::error_code createUniqueFile(const Twine &Model, int &ResultFD,
SmallVectorImpl<char> &ResultPath,
- unsigned Mode = all_read | all_write,
- sys::fs::OpenFlags Flags = sys::fs::F_RW);
+ unsigned Mode = all_read | all_write);
-/// @brief Simpler version for clients that don't want an open file.
+/// Simpler version for clients that don't want an open file. An empty
+/// file will still be created.
std::error_code createUniqueFile(const Twine &Model,
- SmallVectorImpl<char> &ResultPath);
+ SmallVectorImpl<char> &ResultPath,
+ unsigned Mode = all_read | all_write);
/// Represents a temporary file.
///
@@ -757,7 +802,7 @@ public:
~TempFile();
};
-/// @brief Create a file in the system temporary directory.
+/// Create a file in the system temporary directory.
///
/// The filename is of the form prefix-random_chars.suffix. Since the directory
/// is not know to the caller, Prefix and Suffix cannot have path separators.
@@ -767,16 +812,38 @@ public:
/// running the assembler.
std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
int &ResultFD,
- SmallVectorImpl<char> &ResultPath,
- sys::fs::OpenFlags Flags = sys::fs::F_RW);
+ SmallVectorImpl<char> &ResultPath);
-/// @brief Simpler version for clients that don't want an open file.
+/// Simpler version for clients that don't want an open file. An empty
+/// file will still be created.
std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
SmallVectorImpl<char> &ResultPath);
std::error_code createUniqueDirectory(const Twine &Prefix,
SmallVectorImpl<char> &ResultPath);
+/// Get a unique name, not currently existing in the filesystem. Subject
+/// to race conditions; prefer to use createUniqueFile instead.
+///
+/// Similar to createUniqueFile, but instead of creating a file it only
+/// checks whether the name exists. This function is subject to race
+/// conditions; if you want to use the returned name to actually create a
+/// file, use createUniqueFile instead.
+std::error_code getPotentiallyUniqueFileName(const Twine &Model,
+ SmallVectorImpl<char> &ResultPath);
+
+/// Get a unique temporary file name, not currently existing in the
+/// filesystem. Subject to race conditions; prefer to use createTemporaryFile
+/// instead.
+///
+/// Similar to createTemporaryFile, but instead of creating a file it only
+/// checks whether the name exists. This function is subject to race
+/// conditions; if you want to use the returned name to actually create a
+/// file, use createTemporaryFile instead.
+std::error_code
+getPotentiallyUniqueTempFileName(const Twine &Prefix, StringRef Suffix,
+ SmallVectorImpl<char> &ResultPath);
+
inline OpenFlags operator|(OpenFlags A, OpenFlags B) {
return OpenFlags(unsigned(A) | unsigned(B));
}
@@ -786,15 +853,181 @@ inline OpenFlags &operator|=(OpenFlags &A, OpenFlags B) {
return A;
}
-std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
- OpenFlags Flags, unsigned Mode = 0666);
+inline FileAccess operator|(FileAccess A, FileAccess B) {
+ return FileAccess(unsigned(A) | unsigned(B));
+}
+
+inline FileAccess &operator|=(FileAccess &A, FileAccess B) {
+ A = A | B;
+ return A;
+}
+
+/// @brief Opens a file with the specified creation disposition, access mode,
+/// and flags and returns a file descriptor.
+///
+/// The caller is responsible for closing the file descriptor once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param ResultFD If the file could be opened successfully, its descriptor
+/// is stored in this location. Otherwise, this is set to -1.
+/// @param Disp Value specifying the existing-file behavior.
+/// @param Access Value specifying whether to open the file in read, write, or
+/// read-write mode.
+/// @param Flags Additional flags.
+/// @param Mode The access permissions of the file, represented in octal.
+/// @returns errc::success if \a Name has been opened, otherwise a
+/// platform-specific error_code.
+std::error_code openFile(const Twine &Name, int &ResultFD,
+ CreationDisposition Disp, FileAccess Access,
+ OpenFlags Flags, unsigned Mode = 0666);
+
+/// @brief Opens a file with the specified creation disposition, access mode,
+/// and flags and returns a platform-specific file object.
+///
+/// The caller is responsible for closing the file object once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param Disp Value specifying the existing-file behavior.
+/// @param Access Value specifying whether to open the file in read, write, or
+/// read-write mode.
+/// @param Flags Additional flags.
+/// @param Mode The access permissions of the file, represented in octal.
+/// @returns errc::success if \a Name has been opened, otherwise a
+/// platform-specific error_code.
+Expected<file_t> openNativeFile(const Twine &Name, CreationDisposition Disp,
+ FileAccess Access, OpenFlags Flags,
+ unsigned Mode = 0666);
+
+/// @brief Opens the file with the given name in a write-only or read-write
+/// mode, returning its open file descriptor. If the file does not exist, it
+/// is created.
+///
+/// The caller is responsible for closing the file descriptor once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param ResultFD If the file could be opened successfully, its descriptor
+/// is stored in this location. Otherwise, this is set to -1.
+/// @param Flags Additional flags used to determine whether the file should be
+/// opened in, for example, read-write or in write-only mode.
+/// @param Mode The access permissions of the file, represented in octal.
+/// @returns errc::success if \a Name has been opened, otherwise a
+/// platform-specific error_code.
+inline std::error_code
+openFileForWrite(const Twine &Name, int &ResultFD,
+ CreationDisposition Disp = CD_CreateAlways,
+ OpenFlags Flags = OF_None, unsigned Mode = 0666) {
+ return openFile(Name, ResultFD, Disp, FA_Write, Flags, Mode);
+}
+
+/// @brief Opens the file with the given name in a write-only or read-write
+/// mode, returning a platform-specific file object. If the file does not
+/// exist, it is created.
+///
+/// The caller is responsible for closing the file object once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param Flags Additional flags used to determine whether the file should be
+/// opened in, for example, read-write or in write-only mode.
+/// @param Mode The access permissions of the file, represented in octal.
+/// @returns a platform-specific file descriptor if \a Name has been opened,
+/// otherwise an error object.
+inline Expected<file_t> openNativeFileForWrite(const Twine &Name,
+ CreationDisposition Disp,
+ OpenFlags Flags,
+ unsigned Mode = 0666) {
+ return openNativeFile(Name, Disp, FA_Write, Flags, Mode);
+}
+
+/// @brief Opens the file with the given name in a read-write mode, returning
+/// its open file descriptor. If the file does not exist, it is created.
+///
+/// The caller is responsible for closing the file descriptor once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param ResultFD If the file could be opened successfully, its descriptor
+/// is stored in this location. Otherwise, this is set to -1.
+/// @param Flags Additional flags used to determine whether the file should be
+/// opened in, for example, read-write or in write-only mode.
+/// @param Mode The access permissions of the file, represented in octal.
+/// @returns errc::success if \a Name has been opened, otherwise a
+/// platform-specific error_code.
+inline std::error_code openFileForReadWrite(const Twine &Name, int &ResultFD,
+ CreationDisposition Disp,
+ OpenFlags Flags,
+ unsigned Mode = 0666) {
+ return openFile(Name, ResultFD, Disp, FA_Write | FA_Read, Flags, Mode);
+}
+/// @brief Opens the file with the given name in a read-write mode, returning
+/// a platform-specific file object. If the file does not exist, it is created.
+///
+/// The caller is responsible for closing the file object once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param Flags Additional flags used to determine whether the file should be
+/// opened in, for example, read-write or in write-only mode.
+/// @param Mode The access permissions of the file, represented in octal.
+/// @returns a platform-specific file descriptor if \a Name has been opened,
+/// otherwise an error object.
+inline Expected<file_t> openNativeFileForReadWrite(const Twine &Name,
+ CreationDisposition Disp,
+ OpenFlags Flags,
+ unsigned Mode = 0666) {
+ return openNativeFile(Name, Disp, FA_Write | FA_Read, Flags, Mode);
+}
+
+/// @brief Opens the file with the given name in a read-only mode, returning
+/// its open file descriptor.
+///
+/// The caller is responsible for closing the file descriptor once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param ResultFD If the file could be opened successfully, its descriptor
+/// is stored in this location. Otherwise, this is set to -1.
+/// @param RealPath If nonnull, extra work is done to determine the real path
+/// of the opened file, and that path is stored in this
+/// location.
+/// @returns errc::success if \a Name has been opened, otherwise a
+/// platform-specific error_code.
std::error_code openFileForRead(const Twine &Name, int &ResultFD,
+ OpenFlags Flags = OF_None,
SmallVectorImpl<char> *RealPath = nullptr);
+/// @brief Opens the file with the given name in a read-only mode, returning
+/// a platform-specific file object.
+///
+/// The caller is responsible for closing the file object once they are
+/// finished with it.
+///
+/// @param Name The path of the file to open, relative or absolute.
+/// @param RealPath If nonnull, extra work is done to determine the real path
+/// of the opened file, and that path is stored in this
+/// location.
+/// @returns a platform-specific file descriptor if \a Name has been opened,
+/// otherwise an error object.
+Expected<file_t>
+openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None,
+ SmallVectorImpl<char> *RealPath = nullptr);
+
+/// @brief Close the file object. This should be used instead of ::close for
+/// portability.
+///
+/// @param F On input, this is the file to close. On output, the file is
+/// set to kInvalidFile.
+void closeFile(file_t &F);
+
std::error_code getUniqueID(const Twine Path, UniqueID &Result);
-/// @brief Get disk space usage information.
+/// Get disk space usage information.
///
/// Note: Users must be careful about "Time Of Check, Time Of Use" kind of bug.
/// Note: Windows reports results according to the quota allocated to the user.
@@ -819,6 +1052,10 @@ private:
/// Platform-specific mapping state.
size_t Size;
void *Mapping;
+#ifdef _WIN32
+ void *FileHandle;
+#endif
+ mapmode Mode;
std::error_code init(int FD, uint64_t Offset, mapmode Mode);
@@ -924,14 +1161,16 @@ public:
SmallString<128> path_storage;
ec = detail::directory_iterator_construct(
*State, path.toStringRef(path_storage), FollowSymlinks);
+ update_error_code_for_current_entry(ec);
}
explicit directory_iterator(const directory_entry &de, std::error_code &ec,
bool follow_symlinks = true)
: FollowSymlinks(follow_symlinks) {
State = std::make_shared<detail::DirIterState>();
- ec =
- detail::directory_iterator_construct(*State, de.path(), FollowSymlinks);
+ ec = detail::directory_iterator_construct(
+ *State, de.path(), FollowSymlinks);
+ update_error_code_for_current_entry(ec);
}
/// Construct end iterator.
@@ -940,6 +1179,7 @@ public:
// No operator++ because we need error_code.
directory_iterator &increment(std::error_code &ec) {
ec = directory_iterator_increment(*State);
+ update_error_code_for_current_entry(ec);
return *this;
}
@@ -961,6 +1201,24 @@ public:
}
// Other members as required by
// C++ Std, 24.1.1 Input iterators [input.iterators]
+
+private:
+ // Checks if the current entry is valid and populates the error code. For
+ // example, the current entry may not exist due to broken symbolic links.
+ void update_error_code_for_current_entry(std::error_code &ec) {
+ // Bail out if an error has already occurred earlier to avoid overwriting it.
+ if (ec)
+ return;
+
+ // An empty directory entry is used to mark the end of an iteration; it's
+ // not an error.
+ if (State->CurrentEntry == directory_entry())
+ return;
+
+ ErrorOr<basic_file_status> status = State->CurrentEntry.status();
+ if (!status)
+ ec = status.getError();
+ }
};
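A short sketch of the iteration pattern this class supports (listDirectory is a hypothetical example); with the change above, entries whose status cannot be determined, such as broken symbolic links, surface through ec:

#include "llvm/ADT/Twine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper: walk one directory level, stopping at the first error.
static void listDirectory(const Twine &Dir) {
  std::error_code EC;
  for (sys::fs::directory_iterator It(Dir, EC), End; It != End && !EC;
       It.increment(EC))
    outs() << It->path() << "\n";
  if (EC)
    errs() << "walk failed: " << EC.message() << "\n";
}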
namespace detail {
@@ -998,11 +1256,9 @@ public:
if (State->HasNoPushRequest)
State->HasNoPushRequest = false;
else {
- ErrorOr<basic_file_status> st = State->Stack.top()->status();
- if (!st) return *this;
- if (is_directory(*st)) {
+ ErrorOr<basic_file_status> status = State->Stack.top()->status();
+ if (status && is_directory(*status)) {
State->Stack.push(directory_iterator(*State->Stack.top(), ec, Follow));
- if (ec) return *this;
if (State->Stack.top() != end_itr) {
++State->Level;
return *this;
diff --git a/contrib/llvm/include/llvm/Support/FormatAdapters.h b/contrib/llvm/include/llvm/Support/FormatAdapters.h
index 197beb7363df..8320eaad39a9 100644
--- a/contrib/llvm/include/llvm/Support/FormatAdapters.h
+++ b/contrib/llvm/include/llvm/Support/FormatAdapters.h
@@ -12,6 +12,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/FormatCommon.h"
#include "llvm/Support/FormatVariadicDetails.h"
#include "llvm/Support/raw_ostream.h"
@@ -19,7 +20,7 @@
namespace llvm {
template <typename T> class FormatAdapter : public detail::format_adapter {
protected:
- explicit FormatAdapter(T &&Item) : Item(Item) {}
+ explicit FormatAdapter(T &&Item) : Item(std::forward<T>(Item)) {}
T Item;
};
@@ -71,6 +72,14 @@ public:
}
}
};
+
+class ErrorAdapter : public FormatAdapter<Error> {
+public:
+ ErrorAdapter(Error &&Item) : FormatAdapter(std::move(Item)) {}
+ ErrorAdapter(ErrorAdapter &&) = default;
+ ~ErrorAdapter() { consumeError(std::move(Item)); }
+ void format(llvm::raw_ostream &Stream, StringRef Style) { Stream << Item; }
+};
}
template <typename T>
@@ -88,6 +97,13 @@ template <typename T>
detail::RepeatAdapter<T> fmt_repeat(T &&Item, size_t Count) {
return detail::RepeatAdapter<T>(std::forward<T>(Item), Count);
}
+
+// llvm::Error values must be consumed before being destroyed.
+// Wrapping an error in fmt_consume explicitly indicates that the formatv_object
+// should take ownership and consume it.
+inline detail::ErrorAdapter fmt_consume(Error &&Item) {
+ return detail::ErrorAdapter(std::move(Item));
+}
}
#endif
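A minimal sketch of the new fmt_consume() helper (reportOpenFailure and the message text are made up): ownership of the Error moves into the formatv_object, whose ErrorAdapter prints it and then consumes it on destruction.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatAdapters.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void reportOpenFailure(StringRef Path) {
  Error E = make_error<StringError>("cannot open " + Path,
                                    inconvertibleErrorCode());
  // The Error is formatted and then consumed when the temporary
  // formatv_object is destroyed, so no unchecked-Error assertion fires.
  errs() << formatv("error: {0}\n", fmt_consume(std::move(E)));
}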
diff --git a/contrib/llvm/include/llvm/Support/FormatVariadic.h b/contrib/llvm/include/llvm/Support/FormatVariadic.h
index 8c08a7d9488f..b0f582513e07 100644
--- a/contrib/llvm/include/llvm/Support/FormatVariadic.h
+++ b/contrib/llvm/include/llvm/Support/FormatVariadic.h
@@ -118,7 +118,7 @@ public:
auto W = Adapters[R.Index];
- FmtAlign Align(*W, R.Where, R.Align);
+ FmtAlign Align(*W, R.Where, R.Align, R.Pad);
Align.format(S, R.Options);
}
}
@@ -168,7 +168,7 @@ public:
}
};
-// \brief Format text given a format string and replacement parameters.
+// Format text given a format string and replacement parameters.
//
// ===General Description===
//
@@ -237,6 +237,8 @@ public:
// for type T containing a method whose signature is:
// void format(const T &Obj, raw_ostream &Stream, StringRef Options)
// Then this method is invoked as described in Step 1.
+// 3. If an appropriate operator<< for raw_ostream exists, it will be used.
+// For this to work, (raw_ostream& << const T&) must return raw_ostream&.
//
// If a match cannot be found through either of the above methods, a compiler
// error is generated.
@@ -258,13 +260,6 @@ inline auto formatv(const char *Fmt, Ts &&... Vals) -> formatv_object<decltype(
std::make_tuple(detail::build_format_adapter(std::forward<Ts>(Vals))...));
}
-// Allow a formatv_object to be formatted (no options supported).
-template <typename T> struct format_provider<formatv_object<T>> {
- static void format(const formatv_object<T> &V, raw_ostream &OS, StringRef) {
- OS << V;
- }
-};
-
} // end namespace llvm
#endif // LLVM_SUPPORT_FORMATVARIADIC_H
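A small sketch of the operator<< fallback described in step 3 above; Point and printPoint are made-up names. A type with only a raw_ostream stream operator (no format_provider and no format_adapter) can now be passed to formatv() directly:

#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"

namespace example {
struct Point {
  int X, Y;
};
// Found via ADL by the new has_StreamOperator trait.
inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Point &P) {
  return OS << '(' << P.X << ", " << P.Y << ')';
}
} // namespace example

void printPoint() {
  example::Point P{3, 4};
  llvm::outs() << llvm::formatv("P = {0}\n", P); // prints "P = (3, 4)"
}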
diff --git a/contrib/llvm/include/llvm/Support/FormatVariadicDetails.h b/contrib/llvm/include/llvm/Support/FormatVariadicDetails.h
index 9b60462209dc..56dda430efda 100644
--- a/contrib/llvm/include/llvm/Support/FormatVariadicDetails.h
+++ b/contrib/llvm/include/llvm/Support/FormatVariadicDetails.h
@@ -17,6 +17,7 @@
namespace llvm {
template <typename T, typename Enable = void> struct format_provider {};
+class Error;
namespace detail {
class format_adapter {
@@ -38,6 +39,17 @@ public:
}
};
+template <typename T>
+class stream_operator_format_adapter : public format_adapter {
+ T Item;
+
+public:
+ explicit stream_operator_format_adapter(T &&Item)
+ : Item(std::forward<T>(Item)) {}
+
+ void format(llvm::raw_ostream &S, StringRef Options) override { S << Item; }
+};
+
template <typename T> class missing_format_adapter;
// Test if format_provider<T> is defined on T and contains a member function
@@ -59,6 +71,23 @@ public:
(sizeof(test<llvm::format_provider<Decayed>>(nullptr)) == 1);
};
+// Test if raw_ostream& << T -> raw_ostream& is findable via ADL.
+template <class T> class has_StreamOperator {
+public:
+ using ConstRefT = const typename std::decay<T>::type &;
+
+ template <typename U>
+ static char test(typename std::enable_if<
+ std::is_same<decltype(std::declval<llvm::raw_ostream &>()
+ << std::declval<U>()),
+ llvm::raw_ostream &>::value,
+ int *>::type);
+
+ template <typename U> static double test(...);
+
+ static bool const value = (sizeof(test<ConstRefT>(nullptr)) == 1);
+};
+
// Simple template that decides whether a type T should use the member-function
// based format() invocation.
template <typename T>
@@ -77,15 +106,24 @@ struct uses_format_provider
bool, !uses_format_member<T>::value && has_FormatProvider<T>::value> {
};
+// Simple template that decides whether a type T should use the operator<<
+// based format() invocation. This takes last priority.
+template <typename T>
+struct uses_stream_operator
+ : public std::integral_constant<bool, !uses_format_member<T>::value &&
+ !uses_format_provider<T>::value &&
+ has_StreamOperator<T>::value> {};
+
// Simple template that decides whether a type T has neither a member-function
// nor format_provider based implementation that it can use. Mostly used so
// that the compiler spits out a nice diagnostic when a type with no format
// implementation can be located.
template <typename T>
struct uses_missing_provider
- : public std::integral_constant<bool,
- !uses_format_member<T>::value &&
- !uses_format_provider<T>::value> {};
+ : public std::integral_constant<bool, !uses_format_member<T>::value &&
+ !uses_format_provider<T>::value &&
+ !uses_stream_operator<T>::value> {
+};
template <typename T>
typename std::enable_if<uses_format_member<T>::value, T>::type
@@ -101,6 +139,19 @@ build_format_adapter(T &&Item) {
}
template <typename T>
+typename std::enable_if<uses_stream_operator<T>::value,
+ stream_operator_format_adapter<T>>::type
+build_format_adapter(T &&Item) {
+ // If the caller passed an Error by value, then stream_operator_format_adapter
+ // would be responsible for consuming it.
+ // Make the caller opt into this by calling fmt_consume().
+ static_assert(
+ !std::is_same<llvm::Error, typename std::remove_cv<T>::type>::value,
+ "llvm::Error-by-value must be wrapped in fmt_consume() for formatv");
+ return stream_operator_format_adapter<T>(std::forward<T>(Item));
+}
+
+template <typename T>
typename std::enable_if<uses_missing_provider<T>::value,
missing_format_adapter<T>>::type
build_format_adapter(T &&Item) {
diff --git a/contrib/llvm/include/llvm/Support/GenericDomTree.h b/contrib/llvm/include/llvm/Support/GenericDomTree.h
index 635c87a106f0..115abc23e2c6 100644
--- a/contrib/llvm/include/llvm/Support/GenericDomTree.h
+++ b/contrib/llvm/include/llvm/Support/GenericDomTree.h
@@ -50,9 +50,9 @@ template <typename DomTreeT>
struct SemiNCAInfo;
} // namespace DomTreeBuilder
-/// \brief Base class for the actual dominator tree node.
+/// Base class for the actual dominator tree node.
template <class NodeT> class DomTreeNodeBase {
- friend struct PostDominatorTree;
+ friend class PostDominatorTree;
friend class DominatorTreeBase<NodeT, false>;
friend class DominatorTreeBase<NodeT, true>;
friend struct DomTreeBuilder::SemiNCAInfo<DominatorTreeBase<NodeT, false>>;
@@ -234,10 +234,10 @@ void ApplyUpdates(DomTreeT &DT,
ArrayRef<typename DomTreeT::UpdateType> Updates);
template <typename DomTreeT>
-bool Verify(const DomTreeT &DT);
+bool Verify(const DomTreeT &DT, typename DomTreeT::VerificationLevel VL);
} // namespace DomTreeBuilder
-/// \brief Core dominator tree base class.
+/// Core dominator tree base class.
///
/// This class is a generic template over graph nodes. It is instantiated for
/// various graphs in the LLVM IR or in the code generator.
@@ -259,7 +259,9 @@ class DominatorTreeBase {
static constexpr UpdateKind Insert = UpdateKind::Insert;
static constexpr UpdateKind Delete = UpdateKind::Delete;
- protected:
+ enum class VerificationLevel { Fast, Basic, Full };
+
+protected:
// Dominators always have a single root, postdominators can have more.
SmallVector<NodeT *, IsPostDom ? 4 : 1> Roots;
@@ -316,6 +318,12 @@ class DominatorTreeBase {
bool compare(const DominatorTreeBase &Other) const {
if (Parent != Other.Parent) return true;
+ if (Roots.size() != Other.Roots.size())
+ return true;
+
+ if (!std::is_permutation(Roots.begin(), Roots.end(), Other.Roots.begin()))
+ return true;
+
const DomTreeNodeMapType &OtherDomTreeNodes = Other.DomTreeNodes;
if (DomTreeNodes.size() != OtherDomTreeNodes.size())
return true;
@@ -343,7 +351,7 @@ class DominatorTreeBase {
/// block. This is the same as using operator[] on this class. The result
/// may (but is not required to) be null for a forward (backwards)
/// statically unreachable block.
- DomTreeNodeBase<NodeT> *getNode(NodeT *BB) const {
+ DomTreeNodeBase<NodeT> *getNode(const NodeT *BB) const {
auto I = DomTreeNodes.find(BB);
if (I != DomTreeNodes.end())
return I->second.get();
@@ -351,7 +359,9 @@ class DominatorTreeBase {
}
/// See getNode.
- DomTreeNodeBase<NodeT> *operator[](NodeT *BB) const { return getNode(BB); }
+ DomTreeNodeBase<NodeT> *operator[](const NodeT *BB) const {
+ return getNode(BB);
+ }
/// getRootNode - This returns the entry node for the CFG of the function. If
/// this tree represents the post-dominance relations for a function, however,
@@ -750,10 +760,25 @@ public:
DomTreeBuilder::Calculate(*this);
}
- /// verify - check parent and sibling property
- bool verify() const { return DomTreeBuilder::Verify(*this); }
+ /// verify - checks if the tree is correct. There are 3 levels of verification:
+ /// - Full -- verifies if the tree is correct by making sure all the
+ /// properties (including the parent and the sibling property)
+ /// hold.
+ /// Takes O(N^3) time.
+ ///
+ /// - Basic -- checks if the tree is correct, but compares it to a freshly
+ /// constructed tree instead of checking the sibling property.
+ /// Takes O(N^2) time.
+ ///
+ /// - Fast -- checks basic tree structure and compares it with a freshly
+ /// constructed tree.
+ /// Takes O(N^2) time worst case, but is faster in practice (same
+ /// as tree construction).
+ bool verify(VerificationLevel VL = VerificationLevel::Full) const {
+ return DomTreeBuilder::Verify(*this, VL);
+ }
- protected:
+protected:
void addRoot(NodeT *BB) { this->Roots.push_back(BB); }
void reset() {
@@ -835,7 +860,7 @@ public:
return IDom != nullptr;
}
- /// \brief Wipe this tree's state without releasing any resources.
+ /// Wipe this tree's state without releasing any resources.
///
/// This is essentially a post-move helper only. It leaves the object in an
/// assignable and destroyable state, but otherwise invalid.
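A minimal sketch of the new verification levels on the IR-level DominatorTree (checkDomTree is a hypothetical helper): Fast and Basic give cheaper checks, while the default Full level still runs the exhaustive sibling-property check.

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"

using namespace llvm;

static bool checkDomTree(Function &F) {
  DominatorTree DT(F);
  // Cheap structural check, suitable for frequent use.
  if (!DT.verify(DominatorTree::VerificationLevel::Fast))
    return false;
  // Exhaustive check; verify() defaults to VerificationLevel::Full.
  return DT.verify();
}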
diff --git a/contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h
index 9438c9e08850..103ff8ca476a 100644
--- a/contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h
@@ -82,8 +82,8 @@ struct SemiNCAInfo {
// Note that these children are from the future relative to what the
// DominatorTree knows about -- using them gets us some snapshot of the
// CFG from the past (relative to the state of the CFG).
- DenseMap<NodePtr, SmallDenseSet<NodePtrAndKind, 4>> FutureSuccessors;
- DenseMap<NodePtr, SmallDenseSet<NodePtrAndKind, 4>> FuturePredecessors;
+ DenseMap<NodePtr, SmallVector<NodePtrAndKind, 4>> FutureSuccessors;
+ DenseMap<NodePtr, SmallVector<NodePtrAndKind, 4>> FuturePredecessors;
// Remembers if the whole tree was recalculated at some point during the
// current batch update.
bool IsRecalculated = false;
@@ -146,15 +146,15 @@ struct SemiNCAInfo {
assert(llvm::find(Res, Child) != Res.end()
&& "Expected child not found in the CFG");
Res.erase(std::remove(Res.begin(), Res.end(), Child), Res.end());
- DEBUG(dbgs() << "\tHiding edge " << BlockNamePrinter(N) << " -> "
- << BlockNamePrinter(Child) << "\n");
+ LLVM_DEBUG(dbgs() << "\tHiding edge " << BlockNamePrinter(N) << " -> "
+ << BlockNamePrinter(Child) << "\n");
} else {
// If there's a deletion in the future, it means that the edge cannot
// exist in the current CFG, but existed in it before.
assert(llvm::find(Res, Child) == Res.end() &&
"Unexpected child found in the CFG");
- DEBUG(dbgs() << "\tShowing virtual edge " << BlockNamePrinter(N)
- << " -> " << BlockNamePrinter(Child) << "\n");
+ LLVM_DEBUG(dbgs() << "\tShowing virtual edge " << BlockNamePrinter(N)
+ << " -> " << BlockNamePrinter(Child) << "\n");
Res.push_back(Child);
}
}
@@ -387,7 +387,7 @@ struct SemiNCAInfo {
SNCA.addVirtualRoot();
unsigned Num = 1;
- DEBUG(dbgs() << "\t\tLooking for trivial roots\n");
+ LLVM_DEBUG(dbgs() << "\t\tLooking for trivial roots\n");
// Step #1: Find all the trivial roots that will definitely
// remain tree roots.
@@ -404,14 +404,14 @@ struct SemiNCAInfo {
Roots.push_back(N);
// Run DFS not to walk this part of CFG later.
Num = SNCA.runDFS(N, Num, AlwaysDescend, 1);
- DEBUG(dbgs() << "Found a new trivial root: " << BlockNamePrinter(N)
- << "\n");
- DEBUG(dbgs() << "Last visited node: "
- << BlockNamePrinter(SNCA.NumToNode[Num]) << "\n");
+ LLVM_DEBUG(dbgs() << "Found a new trivial root: " << BlockNamePrinter(N)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Last visited node: "
+ << BlockNamePrinter(SNCA.NumToNode[Num]) << "\n");
}
}
- DEBUG(dbgs() << "\t\tLooking for non-trivial roots\n");
+ LLVM_DEBUG(dbgs() << "\t\tLooking for non-trivial roots\n");
// Step #2: Find all non-trivial root candidates. Those are CFG nodes that
// are reverse-unreachable and were not visited by previous DFS walks (i.e. CFG
@@ -431,8 +431,8 @@ struct SemiNCAInfo {
SmallPtrSet<NodePtr, 4> ConnectToExitBlock;
for (const NodePtr I : nodes(DT.Parent)) {
if (SNCA.NodeToInfo.count(I) == 0) {
- DEBUG(dbgs() << "\t\t\tVisiting node " << BlockNamePrinter(I)
- << "\n");
+ LLVM_DEBUG(dbgs()
+ << "\t\t\tVisiting node " << BlockNamePrinter(I) << "\n");
// Find the furthest away we can get by following successors, then
// follow them in reverse. This gives us some reasonable answer about
// the post-dom tree inside any infinite loop. In particular, it
@@ -443,47 +443,49 @@ struct SemiNCAInfo {
// the lowest and highest points in the infinite loop. In theory, it
// would be nice to give the canonical backedge for the loop, but it's
// expensive and does not always lead to a minimal set of roots.
- DEBUG(dbgs() << "\t\t\tRunning forward DFS\n");
+ LLVM_DEBUG(dbgs() << "\t\t\tRunning forward DFS\n");
const unsigned NewNum = SNCA.runDFS<true>(I, Num, AlwaysDescend, Num);
const NodePtr FurthestAway = SNCA.NumToNode[NewNum];
- DEBUG(dbgs() << "\t\t\tFound a new furthest away node "
- << "(non-trivial root): "
- << BlockNamePrinter(FurthestAway) << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t\tFound a new furthest away node "
+ << "(non-trivial root): "
+ << BlockNamePrinter(FurthestAway) << "\n");
ConnectToExitBlock.insert(FurthestAway);
Roots.push_back(FurthestAway);
- DEBUG(dbgs() << "\t\t\tPrev DFSNum: " << Num << ", new DFSNum: "
- << NewNum << "\n\t\t\tRemoving DFS info\n");
+ LLVM_DEBUG(dbgs() << "\t\t\tPrev DFSNum: " << Num << ", new DFSNum: "
+ << NewNum << "\n\t\t\tRemoving DFS info\n");
for (unsigned i = NewNum; i > Num; --i) {
const NodePtr N = SNCA.NumToNode[i];
- DEBUG(dbgs() << "\t\t\t\tRemoving DFS info for "
- << BlockNamePrinter(N) << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tRemoving DFS info for "
+ << BlockNamePrinter(N) << "\n");
SNCA.NodeToInfo.erase(N);
SNCA.NumToNode.pop_back();
}
const unsigned PrevNum = Num;
- DEBUG(dbgs() << "\t\t\tRunning reverse DFS\n");
+ LLVM_DEBUG(dbgs() << "\t\t\tRunning reverse DFS\n");
Num = SNCA.runDFS(FurthestAway, Num, AlwaysDescend, 1);
for (unsigned i = PrevNum + 1; i <= Num; ++i)
- DEBUG(dbgs() << "\t\t\t\tfound node "
- << BlockNamePrinter(SNCA.NumToNode[i]) << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tfound node "
+ << BlockNamePrinter(SNCA.NumToNode[i]) << "\n");
}
}
}
- DEBUG(dbgs() << "Total: " << Total << ", Num: " << Num << "\n");
- DEBUG(dbgs() << "Discovered CFG nodes:\n");
- DEBUG(for (size_t i = 0; i <= Num; ++i) dbgs()
- << i << ": " << BlockNamePrinter(SNCA.NumToNode[i]) << "\n");
+ LLVM_DEBUG(dbgs() << "Total: " << Total << ", Num: " << Num << "\n");
+ LLVM_DEBUG(dbgs() << "Discovered CFG nodes:\n");
+ LLVM_DEBUG(for (size_t i = 0; i <= Num; ++i) dbgs()
+ << i << ": " << BlockNamePrinter(SNCA.NumToNode[i]) << "\n");
assert((Total + 1 == Num) && "Everything should have been visited");
// Step #3: If we found some non-trivial roots, make them non-redundant.
if (HasNonTrivialRoots) RemoveRedundantRoots(DT, BUI, Roots);
- DEBUG(dbgs() << "Found roots: ");
- DEBUG(for (auto *Root : Roots) dbgs() << BlockNamePrinter(Root) << " ");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Found roots: ");
+ LLVM_DEBUG(for (auto *Root
+ : Roots) dbgs()
+ << BlockNamePrinter(Root) << " ");
+ LLVM_DEBUG(dbgs() << "\n");
return Roots;
}
@@ -499,7 +501,7 @@ struct SemiNCAInfo {
static void RemoveRedundantRoots(const DomTreeT &DT, BatchUpdatePtr BUI,
RootsT &Roots) {
assert(IsPostDom && "This function is for postdominators only");
- DEBUG(dbgs() << "Removing redundant roots\n");
+ LLVM_DEBUG(dbgs() << "Removing redundant roots\n");
SemiNCAInfo SNCA(BUI);
@@ -507,8 +509,8 @@ struct SemiNCAInfo {
auto &Root = Roots[i];
// Trivial roots are always non-redundant.
if (!HasForwardSuccessors(Root, BUI)) continue;
- DEBUG(dbgs() << "\tChecking if " << BlockNamePrinter(Root)
- << " remains a root\n");
+ LLVM_DEBUG(dbgs() << "\tChecking if " << BlockNamePrinter(Root)
+ << " remains a root\n");
SNCA.clear();
// Do a forward walk looking for the other roots.
const unsigned Num = SNCA.runDFS<true>(Root, 0, AlwaysDescend, 0);
@@ -520,9 +522,9 @@ struct SemiNCAInfo {
// root from the set of roots, as it is reverse-reachable from the other
// one.
if (llvm::find(Roots, N) != Roots.end()) {
- DEBUG(dbgs() << "\tForward DFS walk found another root "
- << BlockNamePrinter(N) << "\n\tRemoving root "
- << BlockNamePrinter(Root) << "\n");
+ LLVM_DEBUG(dbgs() << "\tForward DFS walk found another root "
+ << BlockNamePrinter(N) << "\n\tRemoving root "
+ << BlockNamePrinter(Root) << "\n");
std::swap(Root, Roots.back());
Roots.pop_back();
@@ -563,7 +565,8 @@ struct SemiNCAInfo {
SNCA.runSemiNCA(DT);
if (BUI) {
BUI->IsRecalculated = true;
- DEBUG(dbgs() << "DomTree recalculated, skipping future batch updates\n");
+ LLVM_DEBUG(
+ dbgs() << "DomTree recalculated, skipping future batch updates\n");
}
if (DT.Roots.empty()) return;
@@ -585,8 +588,8 @@ struct SemiNCAInfo {
// Loop over all of the discovered blocks in the function...
for (size_t i = 1, e = NumToNode.size(); i != e; ++i) {
NodePtr W = NumToNode[i];
- DEBUG(dbgs() << "\tdiscovered a new reachable node "
- << BlockNamePrinter(W) << "\n");
+ LLVM_DEBUG(dbgs() << "\tdiscovered a new reachable node "
+ << BlockNamePrinter(W) << "\n");
// Don't replace this with 'count', the insertion side effect is important
if (DT.DomTreeNodes[W]) continue; // Haven't calculated this node yet?
@@ -638,8 +641,8 @@ struct SemiNCAInfo {
assert((From || IsPostDom) &&
"From has to be a valid CFG node or a virtual root");
assert(To && "Cannot be a nullptr");
- DEBUG(dbgs() << "Inserting edge " << BlockNamePrinter(From) << " -> "
- << BlockNamePrinter(To) << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting edge " << BlockNamePrinter(From) << " -> "
+ << BlockNamePrinter(To) << "\n");
TreeNodePtr FromTN = DT.getNode(From);
if (!FromTN) {
@@ -678,8 +681,8 @@ struct SemiNCAInfo {
if (RIt == DT.Roots.end())
return false; // To is not a root, nothing to update.
- DEBUG(dbgs() << "\t\tAfter the insertion, " << BlockNamePrinter(To)
- << " is no longer a root\n\t\tRebuilding the tree!!!\n");
+ LLVM_DEBUG(dbgs() << "\t\tAfter the insertion, " << BlockNamePrinter(To)
+ << " is no longer a root\n\t\tRebuilding the tree!!!\n");
CalculateFromScratch(DT, BUI);
return true;
@@ -706,8 +709,8 @@ struct SemiNCAInfo {
// can make a different (implicit) decision about which node within an
// infinite loop becomes a root.
- DEBUG(dbgs() << "Roots are different in updated trees\n"
- << "The entire tree needs to be rebuilt\n");
+ LLVM_DEBUG(dbgs() << "Roots are different in updated trees\n"
+ << "The entire tree needs to be rebuilt\n");
// It may be possible to update the tree without recalculating it, but
// we do not yet know how to do it, and it happens rarely in practice.
CalculateFromScratch(DT, BUI);
@@ -718,8 +721,8 @@ struct SemiNCAInfo {
// Handles insertion to a node already in the dominator tree.
static void InsertReachable(DomTreeT &DT, const BatchUpdatePtr BUI,
const TreeNodePtr From, const TreeNodePtr To) {
- DEBUG(dbgs() << "\tReachable " << BlockNamePrinter(From->getBlock())
- << " -> " << BlockNamePrinter(To->getBlock()) << "\n");
+ LLVM_DEBUG(dbgs() << "\tReachable " << BlockNamePrinter(From->getBlock())
+ << " -> " << BlockNamePrinter(To->getBlock()) << "\n");
if (IsPostDom && UpdateRootsBeforeInsertion(DT, BUI, From, To)) return;
// DT.findNCD expects both pointers to be valid. When From is a virtual
// root, then its CFG block pointer is a nullptr, so we have to 'compute'
@@ -732,7 +735,7 @@ struct SemiNCAInfo {
const TreeNodePtr NCD = DT.getNode(NCDBlock);
assert(NCD);
- DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n");
const TreeNodePtr ToIDom = To->getIDom();
// Nothing affected -- NCA property holds.
@@ -741,18 +744,20 @@ struct SemiNCAInfo {
// Identify and collect affected nodes.
InsertionInfo II;
- DEBUG(dbgs() << "Marking " << BlockNamePrinter(To) << " as affected\n");
+ LLVM_DEBUG(dbgs() << "Marking " << BlockNamePrinter(To)
+ << " as affected\n");
II.Affected.insert(To);
const unsigned ToLevel = To->getLevel();
- DEBUG(dbgs() << "Putting " << BlockNamePrinter(To) << " into a Bucket\n");
+ LLVM_DEBUG(dbgs() << "Putting " << BlockNamePrinter(To)
+ << " into a Bucket\n");
II.Bucket.push({ToLevel, To});
while (!II.Bucket.empty()) {
const TreeNodePtr CurrentNode = II.Bucket.top().second;
const unsigned CurrentLevel = CurrentNode->getLevel();
II.Bucket.pop();
- DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: "
- << BlockNamePrinter(CurrentNode) << "\n");
+ LLVM_DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: "
+ << BlockNamePrinter(CurrentNode) << "\n");
II.Visited.insert({CurrentNode, CurrentLevel});
II.AffectedQueue.push_back(CurrentNode);
@@ -770,8 +775,8 @@ struct SemiNCAInfo {
const TreeNodePtr TN, const unsigned RootLevel,
const TreeNodePtr NCD, InsertionInfo &II) {
const unsigned NCDLevel = NCD->getLevel();
- DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << ", RootLevel "
- << RootLevel << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << ", RootLevel "
+ << RootLevel << "\n");
SmallVector<TreeNodePtr, 8> Stack = {TN};
assert(TN->getBlock() && II.Visited.count(TN) && "Preconditions!");
@@ -780,7 +785,7 @@ struct SemiNCAInfo {
do {
TreeNodePtr Next = Stack.pop_back_val();
- DEBUG(dbgs() << " Next: " << BlockNamePrinter(Next) << "\n");
+ LLVM_DEBUG(dbgs() << " Next: " << BlockNamePrinter(Next) << "\n");
for (const NodePtr Succ :
ChildrenGetter<IsPostDom>::Get(Next->getBlock(), BUI)) {
@@ -788,8 +793,8 @@ struct SemiNCAInfo {
assert(SuccTN && "Unreachable successor found at reachable insertion");
const unsigned SuccLevel = SuccTN->getLevel();
- DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) << ", level = "
- << SuccLevel << "\n");
+ LLVM_DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ)
+ << ", level = " << SuccLevel << "\n");
// Do not process the same node multiple times.
if (Processed.count(Next) > 0)
@@ -798,11 +803,11 @@ struct SemiNCAInfo {
// Succ dominated by subtree From -- not affected.
// (Based on the lemma 2.5 from the second paper.)
if (SuccLevel > RootLevel) {
- DEBUG(dbgs() << "\t\tDominated by subtree From\n");
+ LLVM_DEBUG(dbgs() << "\t\tDominated by subtree From\n");
if (II.Visited.count(SuccTN) != 0) {
- DEBUG(dbgs() << "\t\t\talready visited at level "
- << II.Visited[SuccTN] << "\n\t\t\tcurrent level "
- << RootLevel << ")\n");
+ LLVM_DEBUG(dbgs() << "\t\t\talready visited at level "
+ << II.Visited[SuccTN] << "\n\t\t\tcurrent level "
+ << RootLevel << ")\n");
// A node may need to be visited again if we see it again at
// a lower level than before.
@@ -810,15 +815,15 @@ struct SemiNCAInfo {
continue;
}
- DEBUG(dbgs() << "\t\tMarking visited not affected "
- << BlockNamePrinter(Succ) << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tMarking visited not affected "
+ << BlockNamePrinter(Succ) << "\n");
II.Visited.insert({SuccTN, RootLevel});
II.VisitedNotAffectedQueue.push_back(SuccTN);
Stack.push_back(SuccTN);
} else if ((SuccLevel > NCDLevel + 1) &&
II.Affected.count(SuccTN) == 0) {
- DEBUG(dbgs() << "\t\tMarking affected and adding "
- << BlockNamePrinter(Succ) << " to a Bucket\n");
+ LLVM_DEBUG(dbgs() << "\t\tMarking affected and adding "
+ << BlockNamePrinter(Succ) << " to a Bucket\n");
II.Affected.insert(SuccTN);
II.Bucket.push({SuccLevel, SuccTN});
}
@@ -831,11 +836,11 @@ struct SemiNCAInfo {
// Updates immediate dominators and levels after insertion.
static void UpdateInsertion(DomTreeT &DT, const BatchUpdatePtr BUI,
const TreeNodePtr NCD, InsertionInfo &II) {
- DEBUG(dbgs() << "Updating NCD = " << BlockNamePrinter(NCD) << "\n");
+ LLVM_DEBUG(dbgs() << "Updating NCD = " << BlockNamePrinter(NCD) << "\n");
for (const TreeNodePtr TN : II.AffectedQueue) {
- DEBUG(dbgs() << "\tIDom(" << BlockNamePrinter(TN)
- << ") = " << BlockNamePrinter(NCD) << "\n");
+ LLVM_DEBUG(dbgs() << "\tIDom(" << BlockNamePrinter(TN)
+ << ") = " << BlockNamePrinter(NCD) << "\n");
TN->setIDom(NCD);
}
@@ -844,12 +849,13 @@ struct SemiNCAInfo {
}
static void UpdateLevelsAfterInsertion(InsertionInfo &II) {
- DEBUG(dbgs() << "Updating levels for visited but not affected nodes\n");
+ LLVM_DEBUG(
+ dbgs() << "Updating levels for visited but not affected nodes\n");
for (const TreeNodePtr TN : II.VisitedNotAffectedQueue) {
- DEBUG(dbgs() << "\tlevel(" << BlockNamePrinter(TN) << ") = ("
- << BlockNamePrinter(TN->getIDom()) << ") "
- << TN->getIDom()->getLevel() << " + 1\n");
+ LLVM_DEBUG(dbgs() << "\tlevel(" << BlockNamePrinter(TN) << ") = ("
+ << BlockNamePrinter(TN->getIDom()) << ") "
+ << TN->getIDom()->getLevel() << " + 1\n");
TN->UpdateLevel();
}
}
@@ -857,23 +863,24 @@ struct SemiNCAInfo {
// Handles insertion to previously unreachable nodes.
static void InsertUnreachable(DomTreeT &DT, const BatchUpdatePtr BUI,
const TreeNodePtr From, const NodePtr To) {
- DEBUG(dbgs() << "Inserting " << BlockNamePrinter(From)
- << " -> (unreachable) " << BlockNamePrinter(To) << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting " << BlockNamePrinter(From)
+ << " -> (unreachable) " << BlockNamePrinter(To) << "\n");
// Collect discovered edges to already reachable nodes.
SmallVector<std::pair<NodePtr, TreeNodePtr>, 8> DiscoveredEdgesToReachable;
// Discover and connect nodes that became reachable with the insertion.
ComputeUnreachableDominators(DT, BUI, To, From, DiscoveredEdgesToReachable);
- DEBUG(dbgs() << "Inserted " << BlockNamePrinter(From)
- << " -> (prev unreachable) " << BlockNamePrinter(To) << "\n");
+ LLVM_DEBUG(dbgs() << "Inserted " << BlockNamePrinter(From)
+ << " -> (prev unreachable) " << BlockNamePrinter(To)
+ << "\n");
// Use the discovered edges and insert discovered connecting (incoming)
// edges.
for (const auto &Edge : DiscoveredEdgesToReachable) {
- DEBUG(dbgs() << "\tInserting discovered connecting edge "
- << BlockNamePrinter(Edge.first) << " -> "
- << BlockNamePrinter(Edge.second) << "\n");
+ LLVM_DEBUG(dbgs() << "\tInserting discovered connecting edge "
+ << BlockNamePrinter(Edge.first) << " -> "
+ << BlockNamePrinter(Edge.second) << "\n");
InsertReachable(DT, BUI, DT.getNode(Edge.first), Edge.second);
}
}
@@ -901,14 +908,14 @@ struct SemiNCAInfo {
SNCA.runSemiNCA(DT);
SNCA.attachNewSubtree(DT, Incoming);
- DEBUG(dbgs() << "After adding unreachable nodes\n");
+ LLVM_DEBUG(dbgs() << "After adding unreachable nodes\n");
}
static void DeleteEdge(DomTreeT &DT, const BatchUpdatePtr BUI,
const NodePtr From, const NodePtr To) {
assert(From && To && "Cannot disconnect nullptrs");
- DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> "
- << BlockNamePrinter(To) << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> "
+ << BlockNamePrinter(To) << "\n");
#ifndef NDEBUG
// Ensure that the edge was in fact deleted from the CFG before informing
@@ -928,8 +935,9 @@ struct SemiNCAInfo {
const TreeNodePtr ToTN = DT.getNode(To);
if (!ToTN) {
- DEBUG(dbgs() << "\tTo (" << BlockNamePrinter(To)
- << ") already unreachable -- there is no edge to delete\n");
+ LLVM_DEBUG(
+ dbgs() << "\tTo (" << BlockNamePrinter(To)
+ << ") already unreachable -- there is no edge to delete\n");
return;
}
@@ -941,8 +949,8 @@ struct SemiNCAInfo {
DT.DFSInfoValid = false;
const TreeNodePtr ToIDom = ToTN->getIDom();
- DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
- << BlockNamePrinter(ToIDom) << "\n");
+ LLVM_DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
+ << BlockNamePrinter(ToIDom) << "\n");
// To remains reachable after deletion.
// (Based on the caption under Figure 4. from the second paper.)
@@ -959,9 +967,9 @@ struct SemiNCAInfo {
static void DeleteReachable(DomTreeT &DT, const BatchUpdatePtr BUI,
const TreeNodePtr FromTN,
const TreeNodePtr ToTN) {
- DEBUG(dbgs() << "Deleting reachable " << BlockNamePrinter(FromTN) << " -> "
- << BlockNamePrinter(ToTN) << "\n");
- DEBUG(dbgs() << "\tRebuilding subtree\n");
+ LLVM_DEBUG(dbgs() << "Deleting reachable " << BlockNamePrinter(FromTN)
+ << " -> " << BlockNamePrinter(ToTN) << "\n");
+ LLVM_DEBUG(dbgs() << "\tRebuilding subtree\n");
// Find the top of the subtree that needs to be rebuilt.
// (Based on the lemma 2.6 from the second paper.)
@@ -974,7 +982,7 @@ struct SemiNCAInfo {
// Top of the subtree to rebuild is the root node. Rebuild the tree from
// scratch.
if (!PrevIDomSubTree) {
- DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
+ LLVM_DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
CalculateFromScratch(DT, BUI);
return;
}
@@ -985,11 +993,12 @@ struct SemiNCAInfo {
return DT.getNode(To)->getLevel() > Level;
};
- DEBUG(dbgs() << "\tTop of subtree: " << BlockNamePrinter(ToIDomTN) << "\n");
+ LLVM_DEBUG(dbgs() << "\tTop of subtree: " << BlockNamePrinter(ToIDomTN)
+ << "\n");
SemiNCAInfo SNCA(BUI);
SNCA.runDFS(ToIDom, 0, DescendBelow, 0);
- DEBUG(dbgs() << "\tRunning Semi-NCA\n");
+ LLVM_DEBUG(dbgs() << "\tRunning Semi-NCA\n");
SNCA.runSemiNCA(DT, Level);
SNCA.reattachExistingSubtree(DT, PrevIDomSubTree);
}
@@ -998,19 +1007,20 @@ struct SemiNCAInfo {
// explained on the page 7 of the second paper.
static bool HasProperSupport(DomTreeT &DT, const BatchUpdatePtr BUI,
const TreeNodePtr TN) {
- DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n");
+ LLVM_DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN)
+ << "\n");
for (const NodePtr Pred :
ChildrenGetter<!IsPostDom>::Get(TN->getBlock(), BUI)) {
- DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n");
+ LLVM_DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n");
if (!DT.getNode(Pred)) continue;
const NodePtr Support =
DT.findNearestCommonDominator(TN->getBlock(), Pred);
- DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n");
+ LLVM_DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n");
if (Support != TN->getBlock()) {
- DEBUG(dbgs() << "\t" << BlockNamePrinter(TN)
- << " is reachable from support "
- << BlockNamePrinter(Support) << "\n");
+ LLVM_DEBUG(dbgs() << "\t" << BlockNamePrinter(TN)
+ << " is reachable from support "
+ << BlockNamePrinter(Support) << "\n");
return true;
}
}
@@ -1022,8 +1032,8 @@ struct SemiNCAInfo {
// (Based on the lemma 2.7 from the second paper.)
static void DeleteUnreachable(DomTreeT &DT, const BatchUpdatePtr BUI,
const TreeNodePtr ToTN) {
- DEBUG(dbgs() << "Deleting unreachable subtree " << BlockNamePrinter(ToTN)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting unreachable subtree "
+ << BlockNamePrinter(ToTN) << "\n");
assert(ToTN);
assert(ToTN->getBlock());
@@ -1031,8 +1041,9 @@ struct SemiNCAInfo {
// Deletion makes a region reverse-unreachable and creates a new root.
// Simulate that by inserting an edge from the virtual root to ToTN and
// adding it as a new root.
- DEBUG(dbgs() << "\tDeletion made a region reverse-unreachable\n");
- DEBUG(dbgs() << "\tAdding new root " << BlockNamePrinter(ToTN) << "\n");
+ LLVM_DEBUG(dbgs() << "\tDeletion made a region reverse-unreachable\n");
+ LLVM_DEBUG(dbgs() << "\tAdding new root " << BlockNamePrinter(ToTN)
+ << "\n");
DT.Roots.push_back(ToTN->getBlock());
InsertReachable(DT, BUI, DT.getNode(nullptr), ToTN);
return;
@@ -1069,15 +1080,15 @@ struct SemiNCAInfo {
const TreeNodePtr NCD = DT.getNode(NCDBlock);
assert(NCD);
- DEBUG(dbgs() << "Processing affected node " << BlockNamePrinter(TN)
- << " with NCD = " << BlockNamePrinter(NCD)
- << ", MinNode =" << BlockNamePrinter(MinNode) << "\n");
+ LLVM_DEBUG(dbgs() << "Processing affected node " << BlockNamePrinter(TN)
+ << " with NCD = " << BlockNamePrinter(NCD)
+ << ", MinNode =" << BlockNamePrinter(MinNode) << "\n");
if (NCD != TN && NCD->getLevel() < MinNode->getLevel()) MinNode = NCD;
}
// Root reached, rebuild the whole tree from scratch.
if (!MinNode->getIDom()) {
- DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
+ LLVM_DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
CalculateFromScratch(DT, BUI);
return;
}
@@ -1087,7 +1098,7 @@ struct SemiNCAInfo {
for (unsigned i = LastDFSNum; i > 0; --i) {
const NodePtr N = SNCA.NumToNode[i];
const TreeNodePtr TN = DT.getNode(N);
- DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n");
EraseNode(DT, TN);
}
@@ -1095,8 +1106,8 @@ struct SemiNCAInfo {
// The affected subtree starts at the To node -- there's no extra work to do.
if (MinNode == ToTN) return;
- DEBUG(dbgs() << "DeleteUnreachable: running DFS with MinNode = "
- << BlockNamePrinter(MinNode) << "\n");
+ LLVM_DEBUG(dbgs() << "DeleteUnreachable: running DFS with MinNode = "
+ << BlockNamePrinter(MinNode) << "\n");
const unsigned MinLevel = MinNode->getLevel();
const TreeNodePtr PrevIDom = MinNode->getIDom();
assert(PrevIDom);
@@ -1109,8 +1120,8 @@ struct SemiNCAInfo {
};
SNCA.runDFS(MinNode->getBlock(), 0, DescendBelow, 0);
- DEBUG(dbgs() << "Previous IDom(MinNode) = " << BlockNamePrinter(PrevIDom)
- << "\nRunning Semi-NCA\n");
+ LLVM_DEBUG(dbgs() << "Previous IDom(MinNode) = "
+ << BlockNamePrinter(PrevIDom) << "\nRunning Semi-NCA\n");
// Rebuild the remaining part of affected subtree.
SNCA.runSemiNCA(DT, MinLevel);
@@ -1165,15 +1176,15 @@ struct SemiNCAInfo {
// predecessors. Note that these sets will only decrease size over time, as
// the next CFG snapshots slowly approach the actual (current) CFG.
for (UpdateT &U : BUI.Updates) {
- BUI.FutureSuccessors[U.getFrom()].insert({U.getTo(), U.getKind()});
- BUI.FuturePredecessors[U.getTo()].insert({U.getFrom(), U.getKind()});
+ BUI.FutureSuccessors[U.getFrom()].push_back({U.getTo(), U.getKind()});
+ BUI.FuturePredecessors[U.getTo()].push_back({U.getFrom(), U.getKind()});
}
- DEBUG(dbgs() << "About to apply " << NumLegalized << " updates\n");
- DEBUG(if (NumLegalized < 32) for (const auto &U
- : reverse(BUI.Updates)) dbgs()
- << '\t' << U << "\n");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "About to apply " << NumLegalized << " updates\n");
+ LLVM_DEBUG(if (NumLegalized < 32) for (const auto &U
+ : reverse(BUI.Updates)) dbgs()
+ << '\t' << U << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
// If the DominatorTree was recalculated at some point, stop the batch
// updates. Full recalculations ignore batch updates and look at the actual
@@ -1201,7 +1212,7 @@ struct SemiNCAInfo {
// minimizes the amount of work needed during incremental updates.
static void LegalizeUpdates(ArrayRef<UpdateT> AllUpdates,
SmallVectorImpl<UpdateT> &Result) {
- DEBUG(dbgs() << "Legalizing " << AllUpdates.size() << " updates\n");
+ LLVM_DEBUG(dbgs() << "Legalizing " << AllUpdates.size() << " updates\n");
// Count the total number of insertions of each edge.
// Each insertion adds 1 and deletion subtracts 1. The end number should be
// one of {-1 (deletion), 0 (NOP), +1 (insertion)}. Otherwise, the sequence
@@ -1241,26 +1252,31 @@ struct SemiNCAInfo {
Operations[{U.getTo(), U.getFrom()}] = int(i);
}
- std::sort(Result.begin(), Result.end(),
- [&Operations](const UpdateT &A, const UpdateT &B) {
- return Operations[{A.getFrom(), A.getTo()}] >
- Operations[{B.getFrom(), B.getTo()}];
- });
+ llvm::sort(Result.begin(), Result.end(),
+ [&Operations](const UpdateT &A, const UpdateT &B) {
+ return Operations[{A.getFrom(), A.getTo()}] >
+ Operations[{B.getFrom(), B.getTo()}];
+ });
}
static void ApplyNextUpdate(DomTreeT &DT, BatchUpdateInfo &BUI) {
assert(!BUI.Updates.empty() && "No updates to apply!");
UpdateT CurrentUpdate = BUI.Updates.pop_back_val();
- DEBUG(dbgs() << "Applying update: " << CurrentUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "Applying update: " << CurrentUpdate << "\n");
// Move to the next snapshot of the CFG by removing the reverse-applied
- // current update.
+ // current update. Since updates are performed in the same order they are
+ // legalized, it's sufficient to pop the last item here.
auto &FS = BUI.FutureSuccessors[CurrentUpdate.getFrom()];
- FS.erase({CurrentUpdate.getTo(), CurrentUpdate.getKind()});
+ assert(FS.back().getPointer() == CurrentUpdate.getTo() &&
+ FS.back().getInt() == CurrentUpdate.getKind());
+ FS.pop_back();
if (FS.empty()) BUI.FutureSuccessors.erase(CurrentUpdate.getFrom());
auto &FP = BUI.FuturePredecessors[CurrentUpdate.getTo()];
- FP.erase({CurrentUpdate.getFrom(), CurrentUpdate.getKind()});
+ assert(FP.back().getPointer() == CurrentUpdate.getFrom() &&
+ FP.back().getInt() == CurrentUpdate.getKind());
+ FP.pop_back();
if (FP.empty()) BUI.FuturePredecessors.erase(CurrentUpdate.getTo());
if (CurrentUpdate.getKind() == UpdateKind::Insert)
@@ -1277,6 +1293,7 @@ struct SemiNCAInfo {
// root which is the function's entry node. A PostDominatorTree can have
// multiple roots - one for each node with no successors and for infinite
// loops.
+ // Running time: O(N).
bool verifyRoots(const DomTreeT &DT) {
if (!DT.Parent && !DT.Roots.empty()) {
errs() << "Tree has no parent but has roots!\n";
@@ -1317,6 +1334,7 @@ struct SemiNCAInfo {
}
// Checks if the tree contains all reachable nodes in the input graph.
+ // Running time: O(N).
bool verifyReachability(const DomTreeT &DT) {
clear();
doFullDFSWalk(DT, AlwaysDescend);
@@ -1352,6 +1370,7 @@ struct SemiNCAInfo {
// Check if for every parent with a level L in the tree all of its children
// have level L + 1.
+ // Running time: O(N).
static bool VerifyLevels(const DomTreeT &DT) {
for (auto &NodeToTN : DT.DomTreeNodes) {
const TreeNodePtr TN = NodeToTN.second.get();
@@ -1383,6 +1402,7 @@ struct SemiNCAInfo {
// Check if the computed DFS numbers are correct. Note that DFS info may not
// be valid, and when that is the case, we don't verify the numbers.
+ // Running time: O(N log(N)).
static bool VerifyDFSNumbers(const DomTreeT &DT) {
if (!DT.DFSInfoValid || !DT.Parent)
return true;
@@ -1426,10 +1446,10 @@ struct SemiNCAInfo {
// Make a copy and sort it such that it is possible to check if there are
// no gaps between DFS numbers of adjacent children.
SmallVector<TreeNodePtr, 8> Children(Node->begin(), Node->end());
- std::sort(Children.begin(), Children.end(),
- [](const TreeNodePtr Ch1, const TreeNodePtr Ch2) {
- return Ch1->getDFSNumIn() < Ch2->getDFSNumIn();
- });
+ llvm::sort(Children.begin(), Children.end(),
+ [](const TreeNodePtr Ch1, const TreeNodePtr Ch2) {
+ return Ch1->getDFSNumIn() < Ch2->getDFSNumIn();
+ });
auto PrintChildrenError = [Node, &Children, PrintNodeAndDFSNums](
const TreeNodePtr FirstCh, const TreeNodePtr SecondCh) {
@@ -1513,10 +1533,10 @@ struct SemiNCAInfo {
// linear time, but the algorithms are complex. Instead, we do it in a
// straightforward N^2 and N^3 way below, using direct path reachability.
-
// Checks if the tree has the parent property: if for all edges from V to W in
// the input graph, such that V is reachable, the parent of W in the tree is
// an ancestor of V in the tree.
+ // Running time: O(N^2).
//
// This means that if a node gets disconnected from the graph, then all of
// the nodes it dominated previously will now become unreachable.
@@ -1526,8 +1546,8 @@ struct SemiNCAInfo {
const NodePtr BB = TN->getBlock();
if (!BB || TN->getChildren().empty()) continue;
- DEBUG(dbgs() << "Verifying parent property of node "
- << BlockNamePrinter(TN) << "\n");
+ LLVM_DEBUG(dbgs() << "Verifying parent property of node "
+ << BlockNamePrinter(TN) << "\n");
clear();
doFullDFSWalk(DT, [BB](NodePtr From, NodePtr To) {
return From != BB && To != BB;
@@ -1549,6 +1569,7 @@ struct SemiNCAInfo {
// Check if the tree has the sibling property: if a node V does not dominate a
// node W for all siblings V and W in the tree.
+ // Running time: O(N^3).
//
// This means that if a node gets disconnected from the graph, then all of its
// siblings will now still be reachable.
@@ -1583,6 +1604,31 @@ struct SemiNCAInfo {
return true;
}
+
+ // Check if the given tree is the same as a freshly computed one for the same
+ // Parent.
+ // Running time: O(N^2), but faster in practice (same as tree construction).
+ //
+ // Note that this does not check that the tree construction algorithm is
+ // correct and should only be used for fast (but possibly unsound)
+ // verification.
+ static bool IsSameAsFreshTree(const DomTreeT &DT) {
+ DomTreeT FreshTree;
+ FreshTree.recalculate(*DT.Parent);
+ const bool Different = DT.compare(FreshTree);
+
+ if (Different) {
+ errs() << (DT.isPostDominator() ? "Post" : "")
+ << "DominatorTree is different than a freshly computed one!\n"
+ << "\tCurrent:\n";
+ DT.print(errs());
+ errs() << "\n\tFreshly computed tree:\n";
+ FreshTree.print(errs());
+ errs().flush();
+ }
+
+ return !Different;
+ }
};
template <class DomTreeT>
@@ -1611,11 +1657,29 @@ void ApplyUpdates(DomTreeT &DT,
}
template <class DomTreeT>
-bool Verify(const DomTreeT &DT) {
+bool Verify(const DomTreeT &DT, typename DomTreeT::VerificationLevel VL) {
SemiNCAInfo<DomTreeT> SNCA(nullptr);
- return SNCA.verifyRoots(DT) && SNCA.verifyReachability(DT) &&
- SNCA.VerifyLevels(DT) && SNCA.verifyParentProperty(DT) &&
- SNCA.verifySiblingProperty(DT) && SNCA.VerifyDFSNumbers(DT);
+
+ // The simplest check is to compare against a new tree. This will also
+ // usefully print the old and new trees, if they are different.
+ if (!SNCA.IsSameAsFreshTree(DT))
+ return false;
+
+ // Common checks to verify the properties of the tree. O(N log N) at worst
+ if (!SNCA.verifyRoots(DT) || !SNCA.verifyReachability(DT) ||
+ !SNCA.VerifyLevels(DT) || !SNCA.VerifyDFSNumbers(DT))
+ return false;
+
+ // Extra checks depending on VerificationLevel. Up to O(N^3)
+ if (VL == DomTreeT::VerificationLevel::Basic ||
+ VL == DomTreeT::VerificationLevel::Full)
+ if (!SNCA.verifyParentProperty(DT))
+ return false;
+ if (VL == DomTreeT::VerificationLevel::Full)
+ if (!SNCA.verifySiblingProperty(DT))
+ return false;
+
+ return true;
}
} // namespace DomTreeBuilder
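A short sketch of the batch-update entry point whose internals are shown above (updateAfterCFGChange is a hypothetical helper): updates are handed over in bulk, legalized so that insertions and deletions of the same edge cancel out, and then applied incrementally.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"

using namespace llvm;

static void updateAfterCFGChange(DominatorTree &DT, BasicBlock *A,
                                 BasicBlock *B, BasicBlock *C) {
  // The CFG edge A->B was added and A->C removed; inform the tree in one batch.
  DT.applyUpdates({{DominatorTree::Insert, A, B},
                   {DominatorTree::Delete, A, C}});
}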
diff --git a/contrib/llvm/include/llvm/Support/GraphWriter.h b/contrib/llvm/include/llvm/Support/GraphWriter.h
index 3df5c867f7d3..c9a9f409c522 100644
--- a/contrib/llvm/include/llvm/Support/GraphWriter.h
+++ b/contrib/llvm/include/llvm/Support/GraphWriter.h
@@ -41,7 +41,7 @@ namespace DOT { // Private functions...
std::string EscapeString(const std::string &Label);
-/// \brief Get a color string for this node number. Simply round-robin selects
+/// Get a color string for this node number. Simply round-robin selects
/// from a reasonable number of colors.
StringRef getColorString(unsigned NodeNumber);
diff --git a/contrib/llvm/include/llvm/Support/Host.h b/contrib/llvm/include/llvm/Support/Host.h
index a4b0a340c568..57c79c0b9fdf 100644
--- a/contrib/llvm/include/llvm/Support/Host.h
+++ b/contrib/llvm/include/llvm/Support/Host.h
@@ -31,7 +31,7 @@
#define BYTE_ORDER LITTLE_ENDIAN
#endif
#else
-#if !defined(BYTE_ORDER) && !defined(LLVM_ON_WIN32)
+#if !defined(BYTE_ORDER) && !defined(_WIN32)
#include <machine/endian.h>
#endif
#endif
@@ -88,9 +88,9 @@ constexpr bool IsBigEndianHost = false;
namespace detail {
/// Helper functions to extract HostCPUName from /proc/cpuinfo on Linux.
- StringRef getHostCPUNameForPowerPC(const StringRef &ProcCpuinfoContent);
- StringRef getHostCPUNameForARM(const StringRef &ProcCpuinfoContent);
- StringRef getHostCPUNameForS390x(const StringRef &ProcCpuinfoContent);
+ StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent);
+ StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
+ StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
StringRef getHostCPUNameForBPF();
}
}
diff --git a/contrib/llvm/include/llvm/Support/InitLLVM.h b/contrib/llvm/include/llvm/Support/InitLLVM.h
new file mode 100644
index 000000000000..0f629c9ac92d
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/InitLLVM.h
@@ -0,0 +1,46 @@
+//===- InitLLVM.h -----------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_LLVM_H
+#define LLVM_SUPPORT_LLVM_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/PrettyStackTrace.h"
+
+// The main() functions in typical LLVM tools start with InitLLVM which does
+// the following one-time initializations:
+//
+// 1. Setting up a signal handler so that a pretty stack trace is printed out
+// if the process crashes.
+//
+// 2. If running on Windows, obtain command line arguments using a
+// multibyte character-aware API and convert arguments into UTF-8
+// encoding, so that you can assume that command line arguments are
+// always encoded in UTF-8 on any platform.
+//
+// InitLLVM calls llvm_shutdown() on destruction, which cleans up
+// ManagedStatic objects.
+namespace llvm {
+class InitLLVM {
+public:
+ InitLLVM(int &Argc, const char **&Argv);
+ InitLLVM(int &Argc, char **&Argv)
+ : InitLLVM(Argc, const_cast<const char **&>(Argv)) {}
+
+ ~InitLLVM();
+
+private:
+ BumpPtrAllocator Alloc;
+ SmallVector<const char *, 0> Args;
+ PrettyStackTraceProgram StackPrinter;
+};
+} // namespace llvm
+
+#endif
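A minimal sketch of the intended use described above: construct InitLLVM first thing in main() so the crash stack-trace handler is installed and, on Windows, argv is rewritten to UTF-8 before any argument parsing happens.

#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/raw_ostream.h"

int main(int argc, char **argv) {
  llvm::InitLLVM X(argc, argv); // also calls llvm_shutdown() on destruction
  for (int i = 0; i < argc; ++i)
    llvm::outs() << "arg[" << i << "] = " << argv[i] << "\n";
  return 0;
}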
diff --git a/contrib/llvm/include/llvm/Support/JSON.h b/contrib/llvm/include/llvm/Support/JSON.h
new file mode 100644
index 000000000000..da3c5ea0b25d
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/JSON.h
@@ -0,0 +1,704 @@
+//===--- JSON.h - JSON values, parsing and serialization -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file supports working with JSON data.
+///
+/// It comprises:
+///
+/// - classes which hold dynamically-typed parsed JSON structures
+/// These are value types that can be composed, inspected, and modified.
+/// See json::Value, and the related types json::Object and json::Array.
+///
+/// - functions to parse JSON text into Values, and to serialize Values to text.
+/// See parse(), operator<<, and format_provider.
+///
+/// - a convention and helpers for mapping between json::Value and user-defined
+/// types. See fromJSON(), ObjectMapper, and the class comment on Value.
+///
+/// Typically, JSON data would be read from an external source, parsed into
+/// a Value, and then converted into some native data structure before doing
+/// real work on it. (And vice versa when writing).
+///
+/// Other serialization mechanisms you may consider:
+///
+/// - YAML is also text-based, and more human-readable than JSON. It's a more
+/// complex format and data model, and YAML parsers aren't ubiquitous.
+/// YAMLParser.h is a streaming parser suitable for parsing large documents
+/// (including JSON, as YAML is a superset). It can be awkward to use
+/// directly. YAML I/O (YAMLTraits.h) provides data mapping that is more
+/// declarative than the toJSON/fromJSON conventions here.
+///
+/// - LLVM bitstream is a space- and CPU-efficient binary format. Typically it
+/// encodes LLVM IR ("bitcode"), but it can be a container for other data.
+/// Low-level reader/writer libraries are in Bitcode/Bitstream*.h
+///
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_JSON_H
+#define LLVM_SUPPORT_JSON_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace llvm {
+namespace json {
+
+// === String encodings ===
+//
+// JSON strings are character sequences (not byte sequences like std::string).
+// We need to know the encoding, and for simplicity only support UTF-8.
+//
+// - When parsing, invalid UTF-8 is a syntax error like any other
+//
+// - When creating Values from strings, callers must ensure they are UTF-8.
+// with asserts on, invalid UTF-8 will crash the program
+// with asserts off, we'll substitute the replacement character (U+FFFD)
+// Callers can use json::isUTF8() and json::fixUTF8() for validation.
+//
+// - When retrieving strings from Values (e.g. asString()), the result will
+// always be valid UTF-8.
+
+/// Returns true if \p S is valid UTF-8, which is required for use as JSON.
+/// If it returns false, \p ErrOffset is set to a byte offset near the first error.
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
+/// Replaces invalid UTF-8 sequences in \p S with the replacement character
+/// (U+FFFD). The returned string is valid UTF-8.
+/// This is much slower than isUTF8, so test that first.
+std::string fixUTF8(llvm::StringRef S);
+
+class Array;
+class ObjectKey;
+class Value;
+template <typename T> Value toJSON(const llvm::Optional<T> &Opt);
+
+/// An Object is a JSON object, which maps strings to heterogeneous JSON values.
+/// It simulates DenseMap<ObjectKey, Value>. ObjectKey is a maybe-owned string.
+class Object {
+ using Storage = DenseMap<ObjectKey, Value, llvm::DenseMapInfo<StringRef>>;
+ Storage M;
+
+public:
+ using key_type = ObjectKey;
+ using mapped_type = Value;
+ using value_type = Storage::value_type;
+ using iterator = Storage::iterator;
+ using const_iterator = Storage::const_iterator;
+
+ explicit Object() = default;
+ // KV is a trivial key-value struct for list-initialization.
+ // (using std::pair forces extra copies).
+ struct KV;
+ explicit Object(std::initializer_list<KV> Properties);
+
+ iterator begin() { return M.begin(); }
+ const_iterator begin() const { return M.begin(); }
+ iterator end() { return M.end(); }
+ const_iterator end() const { return M.end(); }
+
+ bool empty() const { return M.empty(); }
+ size_t size() const { return M.size(); }
+
+ void clear() { M.clear(); }
+ std::pair<iterator, bool> insert(KV E);
+ template <typename... Ts>
+ std::pair<iterator, bool> try_emplace(const ObjectKey &K, Ts &&... Args) {
+ return M.try_emplace(K, std::forward<Ts>(Args)...);
+ }
+ template <typename... Ts>
+ std::pair<iterator, bool> try_emplace(ObjectKey &&K, Ts &&... Args) {
+ return M.try_emplace(std::move(K), std::forward<Ts>(Args)...);
+ }
+
+ iterator find(StringRef K) { return M.find_as(K); }
+ const_iterator find(StringRef K) const { return M.find_as(K); }
+ // operator[] acts as if Value was default-constructible as null.
+ Value &operator[](const ObjectKey &K);
+ Value &operator[](ObjectKey &&K);
+ // Look up a property, returning nullptr if it doesn't exist.
+ Value *get(StringRef K);
+ const Value *get(StringRef K) const;
+ // Typed accessors return None/nullptr if
+ // - the property doesn't exist
+ // - or it has the wrong type
+ llvm::Optional<std::nullptr_t> getNull(StringRef K) const;
+ llvm::Optional<bool> getBoolean(StringRef K) const;
+ llvm::Optional<double> getNumber(StringRef K) const;
+ llvm::Optional<int64_t> getInteger(StringRef K) const;
+ llvm::Optional<llvm::StringRef> getString(StringRef K) const;
+ const json::Object *getObject(StringRef K) const;
+ json::Object *getObject(StringRef K);
+ const json::Array *getArray(StringRef K) const;
+ json::Array *getArray(StringRef K);
+};
+bool operator==(const Object &LHS, const Object &RHS);
+inline bool operator!=(const Object &LHS, const Object &RHS) {
+ return !(LHS == RHS);
+}
+
+/// An Array is a JSON array, which contains heterogeneous JSON values.
+/// It simulates std::vector<Value>.
+class Array {
+ std::vector<Value> V;
+
+public:
+ using value_type = Value;
+ using iterator = std::vector<Value>::iterator;
+ using const_iterator = std::vector<Value>::const_iterator;
+
+ explicit Array() = default;
+ explicit Array(std::initializer_list<Value> Elements);
+ template <typename Collection> explicit Array(const Collection &C) {
+ for (const auto &V : C)
+ emplace_back(V);
+ }
+
+ Value &operator[](size_t I) { return V[I]; }
+ const Value &operator[](size_t I) const { return V[I]; }
+ Value &front() { return V.front(); }
+ const Value &front() const { return V.front(); }
+ Value &back() { return V.back(); }
+ const Value &back() const { return V.back(); }
+ Value *data() { return V.data(); }
+ const Value *data() const { return V.data(); }
+
+ iterator begin() { return V.begin(); }
+ const_iterator begin() const { return V.begin(); }
+ iterator end() { return V.end(); }
+ const_iterator end() const { return V.end(); }
+
+ bool empty() const { return V.empty(); }
+ size_t size() const { return V.size(); }
+
+ void clear() { V.clear(); }
+ void push_back(const Value &E) { V.push_back(E); }
+ void push_back(Value &&E) { V.push_back(std::move(E)); }
+ template <typename... Args> void emplace_back(Args &&... A) {
+ V.emplace_back(std::forward<Args>(A)...);
+ }
+ void pop_back() { V.pop_back(); }
+ // FIXME: insert() takes const_iterator since C++11, old libstdc++ disagrees.
+ iterator insert(iterator P, const Value &E) { return V.insert(P, E); }
+ iterator insert(iterator P, Value &&E) {
+ return V.insert(P, std::move(E));
+ }
+ template <typename It> iterator insert(iterator P, It A, It Z) {
+ return V.insert(P, A, Z);
+ }
+ template <typename... Args> iterator emplace(const_iterator P, Args &&... A) {
+ return V.emplace(P, std::forward<Args>(A)...);
+ }
+
+ friend bool operator==(const Array &L, const Array &R) { return L.V == R.V; }
+};
+inline bool operator!=(const Array &L, const Array &R) { return !(L == R); }
+
+/// A Value is a JSON value of unknown type.
+/// Values can be copied, but should generally be moved.
+///
+/// === Composing values ===
+///
+/// You can implicitly construct Values from:
+/// - strings: std::string, SmallString, formatv, StringRef, char*
+/// (char*, and StringRef are references, not copies!)
+/// - numbers
+/// - booleans
+/// - null: nullptr
+/// - arrays: {"foo", 42.0, false}
+/// - serializable things: types with toJSON(const T&)->Value, found by ADL
+///
+/// They can also be constructed from object/array helpers:
+/// - json::Object is a type like map<ObjectKey, Value>
+/// - json::Array is a type like vector<Value>
+/// These can be list-initialized, or used to build up collections in a loop.
+/// The Array(Collection) constructor converts all items in a collection to Values.
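+///
+/// For example (a sketch; the keys and values are arbitrary):
+///   Value V = Object{{"name", "LLVM"}, {"versions", Array{6, 7}}};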
+///
+/// === Inspecting values ===
+///
+/// Each Value is one of the JSON kinds:
+/// null (nullptr_t)
+/// boolean (bool)
+/// number (double or int64)
+/// string (StringRef)
+/// array (json::Array)
+/// object (json::Object)
+///
+/// The kind can be queried directly, or implicitly via the typed accessors:
+///   if (Optional<StringRef> S = E.getAsString())
+/// assert(E.kind() == Value::String);
+///
+/// Array and Object also have typed indexing accessors for easy traversal:
+/// Expected<Value> E = parse(R"( {"options": {"font": "sans-serif"}} )");
+/// if (Object* O = E->getAsObject())
+/// if (Object* Opts = O->getObject("options"))
+/// if (Optional<StringRef> Font = Opts->getString("font"))
+/// assert(Opts->at("font").kind() == Value::String);
+///
+/// === Converting JSON values to C++ types ===
+///
+/// The convention is to have a deserializer function findable via ADL:
+/// fromJSON(const json::Value&, T&)->bool
+/// Deserializers are provided for:
+/// - bool
+/// - int and int64_t
+/// - double
+/// - std::string
+/// - vector<T>, where T is deserializable
+/// - map<string, T>, where T is deserializable
+/// - Optional<T>, where T is deserializable
+/// ObjectMapper can help writing fromJSON() functions for object types.
+///
+/// For conversion in the other direction, the serializer function is:
+/// toJSON(const T&) -> json::Value
+/// If this exists, then it also allows constructing Value from T, and can
+/// be used to serialize vector<T>, map<string, T>, and Optional<T>.
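+/// For example, a hypothetical Point struct could be made serializable by
+/// declaring (in Point's namespace, so ADL finds it):
+///   struct Point { int X, Y; };
+///   json::Value toJSON(const Point &P) {
+///     return json::Object{{"x", P.X}, {"y", P.Y}};
+///   }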
+///
+/// === Serialization ===
+///
+/// Values can be serialized to JSON:
+/// 1) raw_ostream << Value // Basic formatting.
+/// 2) raw_ostream << formatv("{0}", Value) // Basic formatting.
+/// 3) raw_ostream << formatv("{0:2}", Value) // Pretty-print with indent 2.
+///
+/// And parsed:
+/// Expected<Value> E = json::parse("[1, 2, null]");
+/// assert(E && E->kind() == Value::Array);
+class Value {
+public:
+ enum Kind {
+ Null,
+ Boolean,
+ /// Number values can store both int64s and doubles at full precision,
+ /// depending on what they were constructed/parsed from.
+ Number,
+ String,
+ Array,
+ Object,
+ };
+
+ // It would be nice to have Value() be null. But that would make {} null too.
+ Value(const Value &M) { copyFrom(M); }
+ Value(Value &&M) { moveFrom(std::move(M)); }
+ Value(std::initializer_list<Value> Elements);
+ Value(json::Array &&Elements) : Type(T_Array) {
+ create<json::Array>(std::move(Elements));
+ }
+ Value(json::Object &&Properties) : Type(T_Object) {
+ create<json::Object>(std::move(Properties));
+ }
+ // Strings: types with value semantics. Must be valid UTF-8.
+ Value(std::string V) : Type(T_String) {
+ if (LLVM_UNLIKELY(!isUTF8(V))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ V = fixUTF8(std::move(V));
+ }
+ create<std::string>(std::move(V));
+ }
+ Value(const llvm::SmallVectorImpl<char> &V)
+ : Value(std::string(V.begin(), V.end())){};
+ Value(const llvm::formatv_object_base &V) : Value(V.str()){};
+ // Strings: types with reference semantics. Must be valid UTF-8.
+ Value(StringRef V) : Type(T_StringRef) {
+ create<llvm::StringRef>(V);
+ if (LLVM_UNLIKELY(!isUTF8(V))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ *this = Value(fixUTF8(V));
+ }
+ }
+ Value(const char *V) : Value(StringRef(V)) {}
+ Value(std::nullptr_t) : Type(T_Null) {}
+ // Boolean (disallow implicit conversions).
+ // (The last template parameter is a dummy to keep templates distinct.)
+ template <
+ typename T,
+ typename = typename std::enable_if<std::is_same<T, bool>::value>::type,
+ bool = false>
+ Value(T B) : Type(T_Boolean) {
+ create<bool>(B);
+ }
+ // Integers (except boolean). Must be non-narrowing convertible to int64_t.
+ template <
+ typename T,
+ typename = typename std::enable_if<std::is_integral<T>::value>::type,
+ typename = typename std::enable_if<!std::is_same<T, bool>::value>::type>
+ Value(T I) : Type(T_Integer) {
+ create<int64_t>(int64_t{I});
+ }
+ // Floating point. Must be non-narrowing convertible to double.
+ template <typename T,
+ typename =
+ typename std::enable_if<std::is_floating_point<T>::value>::type,
+ double * = nullptr>
+ Value(T D) : Type(T_Double) {
+ create<double>(double{D});
+ }
+ // Serializable types: with a toJSON(const T&)->Value function, found by ADL.
+ template <typename T,
+ typename = typename std::enable_if<std::is_same<
+ Value, decltype(toJSON(*(const T *)nullptr))>::value>,
+ Value * = nullptr>
+ Value(const T &V) : Value(toJSON(V)) {}
+
+ Value &operator=(const Value &M) {
+ destroy();
+ copyFrom(M);
+ return *this;
+ }
+ Value &operator=(Value &&M) {
+ destroy();
+ moveFrom(std::move(M));
+ return *this;
+ }
+ ~Value() { destroy(); }
+
+ Kind kind() const {
+ switch (Type) {
+ case T_Null:
+ return Null;
+ case T_Boolean:
+ return Boolean;
+ case T_Double:
+ case T_Integer:
+ return Number;
+ case T_String:
+ case T_StringRef:
+ return String;
+ case T_Object:
+ return Object;
+ case T_Array:
+ return Array;
+ }
+ llvm_unreachable("Unknown kind");
+ }
+
+ // Typed accessors return None/nullptr if the Value is not of this type.
+ llvm::Optional<std::nullptr_t> getAsNull() const {
+ if (LLVM_LIKELY(Type == T_Null))
+ return nullptr;
+ return llvm::None;
+ }
+ llvm::Optional<bool> getAsBoolean() const {
+ if (LLVM_LIKELY(Type == T_Boolean))
+ return as<bool>();
+ return llvm::None;
+ }
+ llvm::Optional<double> getAsNumber() const {
+ if (LLVM_LIKELY(Type == T_Double))
+ return as<double>();
+ if (LLVM_LIKELY(Type == T_Integer))
+ return as<int64_t>();
+ return llvm::None;
+ }
+ // Succeeds if the Value is a Number, and exactly representable as int64_t.
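+ // For example, a Value holding 3.0 converts to 3, while 3.5 or 1e300 do not
+ // (they yield None).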
+ llvm::Optional<int64_t> getAsInteger() const {
+ if (LLVM_LIKELY(Type == T_Integer))
+ return as<int64_t>();
+ if (LLVM_LIKELY(Type == T_Double)) {
+ double D = as<double>();
+ if (LLVM_LIKELY(std::modf(D, &D) == 0.0 &&
+ D >= double(std::numeric_limits<int64_t>::min()) &&
+ D <= double(std::numeric_limits<int64_t>::max())))
+ return D;
+ }
+ return llvm::None;
+ }
+ llvm::Optional<llvm::StringRef> getAsString() const {
+ if (Type == T_String)
+ return llvm::StringRef(as<std::string>());
+ if (LLVM_LIKELY(Type == T_StringRef))
+ return as<llvm::StringRef>();
+ return llvm::None;
+ }
+ const json::Object *getAsObject() const {
+ return LLVM_LIKELY(Type == T_Object) ? &as<json::Object>() : nullptr;
+ }
+ json::Object *getAsObject() {
+ return LLVM_LIKELY(Type == T_Object) ? &as<json::Object>() : nullptr;
+ }
+ const json::Array *getAsArray() const {
+ return LLVM_LIKELY(Type == T_Array) ? &as<json::Array>() : nullptr;
+ }
+ json::Array *getAsArray() {
+ return LLVM_LIKELY(Type == T_Array) ? &as<json::Array>() : nullptr;
+ }
+
+ /// Serializes this Value to JSON, writing it to the provided stream.
+ /// The formatting is compact (no extra whitespace) and deterministic.
+ /// For pretty-printing, use the formatv() format_provider below.
+ friend llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Value &);
+
+private:
+ void destroy();
+ void copyFrom(const Value &M);
+ // We allow moving from *const* Values, by marking all members as mutable!
+ // This hack is needed to support initializer-list syntax efficiently.
+ // (std::initializer_list<T> is a container of const T).
+ void moveFrom(const Value &&M);
+ friend class Array;
+ friend class Object;
+
+ template <typename T, typename... U> void create(U &&... V) {
+ new (reinterpret_cast<T *>(Union.buffer)) T(std::forward<U>(V)...);
+ }
+ template <typename T> T &as() const {
+ return *reinterpret_cast<T *>(Union.buffer);
+ }
+
+ template <typename Indenter>
+ void print(llvm::raw_ostream &, const Indenter &) const;
+ friend struct llvm::format_provider<llvm::json::Value>;
+
+ enum ValueType : char {
+ T_Null,
+ T_Boolean,
+ T_Double,
+ T_Integer,
+ T_StringRef,
+ T_String,
+ T_Object,
+ T_Array,
+ };
+ // All members mutable, see moveFrom().
+ mutable ValueType Type;
+ mutable llvm::AlignedCharArrayUnion<bool, double, int64_t, llvm::StringRef,
+ std::string, json::Array, json::Object>
+ Union;
+};
+
+bool operator==(const Value &, const Value &);
+inline bool operator!=(const Value &L, const Value &R) { return !(L == R); }
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Value &);
+
+/// ObjectKey is used to capture keys in Object. Like Value, but:
+/// - only strings are allowed
+/// - it's optimized for the string literal case (Owned == nullptr)
+/// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
+class ObjectKey {
+public:
+ ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
+ ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
+ if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ *Owned = fixUTF8(std::move(*Owned));
+ }
+ Data = *Owned;
+ }
+ ObjectKey(llvm::StringRef S) : Data(S) {
+ if (LLVM_UNLIKELY(!isUTF8(Data))) {
+ assert(false && "Invalid UTF-8 in value used as JSON");
+ *this = ObjectKey(fixUTF8(S));
+ }
+ }
+ ObjectKey(const llvm::SmallVectorImpl<char> &V)
+ : ObjectKey(std::string(V.begin(), V.end())) {}
+ ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}
+
+ ObjectKey(const ObjectKey &C) { *this = C; }
+ ObjectKey(ObjectKey &&C) : ObjectKey(static_cast<const ObjectKey &&>(C)) {}
+ ObjectKey &operator=(const ObjectKey &C) {
+ if (C.Owned) {
+ Owned.reset(new std::string(*C.Owned));
+ Data = *Owned;
+ } else {
+ Data = C.Data;
+ }
+ return *this;
+ }
+ ObjectKey &operator=(ObjectKey &&) = default;
+
+ operator llvm::StringRef() const { return Data; }
+ std::string str() const { return Data.str(); }
+
+private:
+ // FIXME: this is unnecessarily large (3 pointers). Pointer + length + owned
+ // could be 2 pointers at most.
+ std::unique_ptr<std::string> Owned;
+ llvm::StringRef Data;
+};
+
+inline bool operator==(const ObjectKey &L, const ObjectKey &R) {
+ return llvm::StringRef(L) == llvm::StringRef(R);
+}
+inline bool operator!=(const ObjectKey &L, const ObjectKey &R) {
+ return !(L == R);
+}
+inline bool operator<(const ObjectKey &L, const ObjectKey &R) {
+ return StringRef(L) < StringRef(R);
+}
+
+struct Object::KV {
+ ObjectKey K;
+ Value V;
+};
+
+inline Object::Object(std::initializer_list<KV> Properties) {
+ for (const auto &P : Properties) {
+ auto R = try_emplace(P.K, nullptr);
+ if (R.second)
+ R.first->getSecond().moveFrom(std::move(P.V));
+ }
+}
+inline std::pair<Object::iterator, bool> Object::insert(KV E) {
+ return try_emplace(std::move(E.K), std::move(E.V));
+}
+
+// Standard deserializers are provided for primitive types.
+// See comments on Value.
+inline bool fromJSON(const Value &E, std::string &Out) {
+ if (auto S = E.getAsString()) {
+ Out = *S;
+ return true;
+ }
+ return false;
+}
+inline bool fromJSON(const Value &E, int &Out) {
+ if (auto S = E.getAsInteger()) {
+ Out = *S;
+ return true;
+ }
+ return false;
+}
+inline bool fromJSON(const Value &E, int64_t &Out) {
+ if (auto S = E.getAsInteger()) {
+ Out = *S;
+ return true;
+ }
+ return false;
+}
+inline bool fromJSON(const Value &E, double &Out) {
+ if (auto S = E.getAsNumber()) {
+ Out = *S;
+ return true;
+ }
+ return false;
+}
+inline bool fromJSON(const Value &E, bool &Out) {
+ if (auto S = E.getAsBoolean()) {
+ Out = *S;
+ return true;
+ }
+ return false;
+}
+template <typename T> bool fromJSON(const Value &E, llvm::Optional<T> &Out) {
+ if (E.getAsNull()) {
+ Out = llvm::None;
+ return true;
+ }
+ T Result;
+ if (!fromJSON(E, Result))
+ return false;
+ Out = std::move(Result);
+ return true;
+}
+template <typename T> bool fromJSON(const Value &E, std::vector<T> &Out) {
+ if (auto *A = E.getAsArray()) {
+ Out.clear();
+ Out.resize(A->size());
+ for (size_t I = 0; I < A->size(); ++I)
+ if (!fromJSON((*A)[I], Out[I]))
+ return false;
+ return true;
+ }
+ return false;
+}
+template <typename T>
+bool fromJSON(const Value &E, std::map<std::string, T> &Out) {
+ if (auto *O = E.getAsObject()) {
+ Out.clear();
+ for (const auto &KV : *O)
+ if (!fromJSON(KV.second, Out[llvm::StringRef(KV.first)]))
+ return false;
+ return true;
+ }
+ return false;
+}
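+
+// For illustration, deserializing an array of integers from a Value V that
+// holds e.g. [1, 2, 3]:
+//   std::vector<int64_t> Ints;
+//   bool Ok = fromJSON(V, Ints); // true, Ints == {1, 2, 3}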
+
+// Allow serialization of Optional<T> for supported T.
+template <typename T> Value toJSON(const llvm::Optional<T> &Opt) {
+ return Opt ? Value(*Opt) : Value(nullptr);
+}
+
+/// Helper for mapping JSON objects onto protocol structs.
+///
+/// Example:
+/// \code
+/// bool fromJSON(const Value &E, MyStruct &R) {
+/// ObjectMapper O(E);
+/// if (!O || !O.map("mandatory_field", R.MandatoryField))
+/// return false;
+/// O.map("optional_field", R.OptionalField);
+/// return true;
+/// }
+/// \endcode
+class ObjectMapper {
+public:
+ ObjectMapper(const Value &E) : O(E.getAsObject()) {}
+
+ /// True if the expression is an object.
+ /// Must be checked before calling map().
+ operator bool() { return O; }
+
+ /// Maps a property to a field, if it exists.
+ template <typename T> bool map(StringRef Prop, T &Out) {
+ assert(*this && "Must check this is an object before calling map()");
+ if (const Value *E = O->get(Prop))
+ return fromJSON(*E, Out);
+ return false;
+ }
+
+ /// Maps a property to a field, if it exists.
+ /// (Optional requires special handling, because missing keys are OK).
+ template <typename T> bool map(StringRef Prop, llvm::Optional<T> &Out) {
+ assert(*this && "Must check this is an object before calling map()");
+ if (const Value *E = O->get(Prop))
+ return fromJSON(*E, Out);
+ Out = llvm::None;
+ return true;
+ }
+
+private:
+ const Object *O;
+};
+
+/// Parses the provided JSON source, or returns a ParseError.
+/// The returned Value is self-contained and owns its strings (they do not refer
+/// to the original source).
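+///
+/// A typical call looks like (sketch; the input string is illustrative):
+/// \code
+///   llvm::Expected<json::Value> V = json::parse(R"({"x": 1})");
+///   if (!V)
+///     llvm::consumeError(V.takeError());
+/// \endcode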
+llvm::Expected<Value> parse(llvm::StringRef JSON);
+
+class ParseError : public llvm::ErrorInfo<ParseError> {
+ const char *Msg;
+ unsigned Line, Column, Offset;
+
+public:
+ static char ID;
+ ParseError(const char *Msg, unsigned Line, unsigned Column, unsigned Offset)
+ : Msg(Msg), Line(Line), Column(Column), Offset(Offset) {}
+ void log(llvm::raw_ostream &OS) const override {
+ OS << llvm::formatv("[{0}:{1}, byte={2}]: {3}", Line, Column, Offset, Msg);
+ }
+ std::error_code convertToErrorCode() const override {
+ return llvm::inconvertibleErrorCode();
+ }
+};
+} // namespace json
+
+/// Allow printing json::Value with formatv().
+/// The default style is basic/compact formatting, like operator<<.
+/// A format string like formatv("{0:2}", Value) pretty-prints with indent 2.
+template <> struct format_provider<llvm::json::Value> {
+ static void format(const llvm::json::Value &, raw_ostream &, StringRef);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/include/llvm/Support/JamCRC.h b/contrib/llvm/include/llvm/Support/JamCRC.h
index 5268bbd9ba1e..846d6cea9828 100644
--- a/contrib/llvm/include/llvm/Support/JamCRC.h
+++ b/contrib/llvm/include/llvm/Support/JamCRC.h
@@ -36,7 +36,7 @@ class JamCRC {
public:
JamCRC(uint32_t Init = 0xFFFFFFFFU) : CRC(Init) {}
- // \brief Update the CRC calculation with Data.
+ // Update the CRC calculation with Data.
void update(ArrayRef<char> Data);
uint32_t getCRC() const { return CRC; }
diff --git a/contrib/llvm/include/llvm/Support/KnownBits.h b/contrib/llvm/include/llvm/Support/KnownBits.h
index 97e73b13fca3..259df9546c57 100644
--- a/contrib/llvm/include/llvm/Support/KnownBits.h
+++ b/contrib/llvm/include/llvm/Support/KnownBits.h
@@ -103,7 +103,7 @@ public:
One.setSignBit();
}
- /// Make this value negative.
+ /// Make this value non-negative.
void makeNonNegative() {
Zero.setSignBit();
}
diff --git a/contrib/llvm/include/llvm/Support/LEB128.h b/contrib/llvm/include/llvm/Support/LEB128.h
index 6af6e9f34474..9feb07229225 100644
--- a/contrib/llvm/include/llvm/Support/LEB128.h
+++ b/contrib/llvm/include/llvm/Support/LEB128.h
@@ -19,9 +19,10 @@
namespace llvm {
-/// Utility function to encode a SLEB128 value to an output stream.
-inline void encodeSLEB128(int64_t Value, raw_ostream &OS,
- unsigned PadTo = 0) {
+/// Utility function to encode a SLEB128 value to an output stream. Returns
+/// the length in bytes of the encoded value.
+inline unsigned encodeSLEB128(int64_t Value, raw_ostream &OS,
+ unsigned PadTo = 0) {
bool More;
unsigned Count = 0;
do {
@@ -42,7 +43,9 @@ inline void encodeSLEB128(int64_t Value, raw_ostream &OS,
for (; Count < PadTo - 1; ++Count)
OS << char(PadValue | 0x80);
OS << char(PadValue);
+ Count++;
}
+ return Count;
}
/// Utility function to encode a SLEB128 value to a buffer. Returns
@@ -73,9 +76,10 @@ inline unsigned encodeSLEB128(int64_t Value, uint8_t *p, unsigned PadTo = 0) {
return (unsigned)(p - orig_p);
}
-/// Utility function to encode a ULEB128 value to an output stream.
-inline void encodeULEB128(uint64_t Value, raw_ostream &OS,
- unsigned PadTo = 0) {
+/// Utility function to encode a ULEB128 value to an output stream. Returns
+/// the length in bytes of the encoded value.
+inline unsigned encodeULEB128(uint64_t Value, raw_ostream &OS,
+ unsigned PadTo = 0) {
unsigned Count = 0;
do {
uint8_t Byte = Value & 0x7f;
@@ -93,6 +97,7 @@ inline void encodeULEB128(uint64_t Value, raw_ostream &OS,
OS << '\x00';
Count++;
}
+ return Count;
}
/// Utility function to encode a ULEB128 value to a buffer. Returns
diff --git a/contrib/llvm/include/llvm/Support/LineIterator.h b/contrib/llvm/include/llvm/Support/LineIterator.h
index 9d4cd3bd4c6d..892d289976cb 100644
--- a/contrib/llvm/include/llvm/Support/LineIterator.h
+++ b/contrib/llvm/include/llvm/Support/LineIterator.h
@@ -18,7 +18,7 @@ namespace llvm {
class MemoryBuffer;
-/// \brief A forward iterator which reads text lines from a buffer.
+/// A forward iterator which reads text lines from a buffer.
///
/// This class provides a forward iterator interface for reading one line at
/// a time from a buffer. When default constructed the iterator will be the
@@ -39,23 +39,23 @@ class line_iterator
StringRef CurrentLine;
public:
- /// \brief Default construct an "end" iterator.
+ /// Default construct an "end" iterator.
line_iterator() : Buffer(nullptr) {}
- /// \brief Construct a new iterator around some memory buffer.
+ /// Construct a new iterator around some memory buffer.
explicit line_iterator(const MemoryBuffer &Buffer, bool SkipBlanks = true,
char CommentMarker = '\0');
- /// \brief Return true if we've reached EOF or are an "end" iterator.
+ /// Return true if we've reached EOF or are an "end" iterator.
bool is_at_eof() const { return !Buffer; }
- /// \brief Return true if we're an "end" iterator or have reached EOF.
+ /// Return true if we're an "end" iterator or have reached EOF.
bool is_at_end() const { return is_at_eof(); }
- /// \brief Return the current line number. May return any number at EOF.
+ /// Return the current line number. May return any number at EOF.
int64_t line_number() const { return LineNumber; }
- /// \brief Advance to the next (non-empty, non-comment) line.
+ /// Advance to the next (non-empty, non-comment) line.
line_iterator &operator++() {
advance();
return *this;
@@ -66,7 +66,7 @@ public:
return tmp;
}
- /// \brief Get the current line as a \c StringRef.
+ /// Get the current line as a \c StringRef.
StringRef operator*() const { return CurrentLine; }
const StringRef *operator->() const { return &CurrentLine; }
@@ -80,7 +80,7 @@ public:
}
private:
- /// \brief Advance the iterator to the next line.
+ /// Advance the iterator to the next line.
void advance();
};
}
diff --git a/contrib/llvm/include/llvm/Support/LockFileManager.h b/contrib/llvm/include/llvm/Support/LockFileManager.h
index 1e417bdd5b25..86db0b2b1020 100644
--- a/contrib/llvm/include/llvm/Support/LockFileManager.h
+++ b/contrib/llvm/include/llvm/Support/LockFileManager.h
@@ -11,14 +11,13 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Support/FileSystem.h"
#include <system_error>
#include <utility> // for std::pair
namespace llvm {
class StringRef;
-/// \brief Class that manages the creation of a lock file to aid
+/// Class that manages the creation of a lock file to aid
/// implicit coordination between different processes.
///
/// The implicit coordination works by creating a ".lock" file alongside
@@ -28,33 +27,33 @@ class StringRef;
/// operation.
class LockFileManager {
public:
- /// \brief Describes the state of a lock file.
+ /// Describes the state of a lock file.
enum LockFileState {
- /// \brief The lock file has been created and is owned by this instance
+ /// The lock file has been created and is owned by this instance
/// of the object.
LFS_Owned,
- /// \brief The lock file already exists and is owned by some other
+ /// The lock file already exists and is owned by some other
/// instance.
LFS_Shared,
- /// \brief An error occurred while trying to create or find the lock
+ /// An error occurred while trying to create or find the lock
/// file.
LFS_Error
};
- /// \brief Describes the result of waiting for the owner to release the lock.
+ /// Describes the result of waiting for the owner to release the lock.
enum WaitForUnlockResult {
- /// \brief The lock was released successfully.
+ /// The lock was released successfully.
Res_Success,
- /// \brief Owner died while holding the lock.
+ /// Owner died while holding the lock.
Res_OwnerDied,
- /// \brief Reached timeout while waiting for the owner to release the lock.
+ /// Reached timeout while waiting for the owner to release the lock.
Res_Timeout
};
private:
SmallString<128> FileName;
SmallString<128> LockFileName;
- Optional<sys::fs::TempFile> UniqueLockFile;
+ SmallString<128> UniqueLockFileName;
Optional<std::pair<std::string, int> > Owner;
std::error_code ErrorCode;
@@ -73,22 +72,22 @@ public:
LockFileManager(StringRef FileName);
~LockFileManager();
- /// \brief Determine the state of the lock file.
+ /// Determine the state of the lock file.
LockFileState getState() const;
operator LockFileState() const { return getState(); }
- /// \brief For a shared lock, wait until the owner releases the lock.
+ /// For a shared lock, wait until the owner releases the lock.
WaitForUnlockResult waitForUnlock();
- /// \brief Remove the lock file. This may delete a different lock file than
+ /// Remove the lock file. This may delete a different lock file than
/// the one previously read if there is a race.
std::error_code unsafeRemoveLockFile();
- /// \brief Get error message, or "" if there is no error.
+ /// Get error message, or "" if there is no error.
std::string getErrorMessage() const;
- /// \brief Set error and error message
+ /// Set error and error message
void setError(const std::error_code &EC, StringRef ErrorMsg = "") {
ErrorCode = EC;
ErrorDiagMsg = ErrorMsg.str();
diff --git a/contrib/llvm/include/llvm/Support/LowLevelTypeImpl.h b/contrib/llvm/include/llvm/Support/LowLevelTypeImpl.h
index 099fa4618997..a0a5a52d206e 100644
--- a/contrib/llvm/include/llvm/Support/LowLevelTypeImpl.h
+++ b/contrib/llvm/include/llvm/Support/LowLevelTypeImpl.h
@@ -28,7 +28,7 @@
#define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/MachineValueType.h"
#include <cassert>
namespace llvm {
diff --git a/contrib/llvm/include/llvm/Support/MD5.h b/contrib/llvm/include/llvm/Support/MD5.h
index 2c0dc76485f8..bb2bdbf1bed2 100644
--- a/contrib/llvm/include/llvm/Support/MD5.h
+++ b/contrib/llvm/include/llvm/Support/MD5.h
@@ -81,20 +81,20 @@ public:
MD5();
- /// \brief Updates the hash for the byte stream provided.
+ /// Updates the hash for the byte stream provided.
void update(ArrayRef<uint8_t> Data);
- /// \brief Updates the hash for the StringRef provided.
+ /// Updates the hash for the StringRef provided.
void update(StringRef Str);
- /// \brief Finishes off the hash and puts the result in result.
+ /// Finishes off the hash and puts the result in result.
void final(MD5Result &Result);
- /// \brief Translates the bytes in \p Res to a hex string that is
+ /// Translates the bytes in \p Res to a hex string that is
/// deposited into \p Str. The result will be of length 32.
static void stringifyResult(MD5Result &Result, SmallString<32> &Str);
- /// \brief Computes the hash for a given bytes.
+ /// Computes the hash for the given bytes.
static std::array<uint8_t, 16> hash(ArrayRef<uint8_t> Data);
private:
diff --git a/contrib/llvm/include/llvm/CodeGen/MachineValueType.h b/contrib/llvm/include/llvm/Support/MachineValueType.h
index b452684757f6..552dea05029c 100644
--- a/contrib/llvm/include/llvm/CodeGen/MachineValueType.h
+++ b/contrib/llvm/include/llvm/Support/MachineValueType.h
@@ -1,4 +1,4 @@
-//===- CodeGen/MachineValueType.h - Machine-Level types ---------*- C++ -*-===//
+//===- Support/MachineValueType.h - Machine-Level types ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CODEGEN_MACHINEVALUETYPE_H
-#define LLVM_CODEGEN_MACHINEVALUETYPE_H
+#ifndef LLVM_SUPPORT_MACHINEVALUETYPE_H
+#define LLVM_SUPPORT_MACHINEVALUETYPE_H
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/ErrorHandling.h"
@@ -191,8 +191,10 @@ namespace llvm {
// unspecified type. The register class
// will be determined by the opcode.
+ ExceptRef = 113, // WebAssembly's except_ref type
+
FIRST_VALUETYPE = 1, // This is always the beginning of the list.
- LAST_VALUETYPE = 113, // This always remains at the end of the list.
+ LAST_VALUETYPE = 114, // This always remains at the end of the list.
// This is the current maximum for LAST_VALUETYPE.
// MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -376,7 +378,7 @@ namespace llvm {
SimpleTy == MVT::v16i64);
}
- /// Return true if this is a 1024-bit vector type.
+ /// Return true if this is a 2048-bit vector type.
bool is2048BitVector() const {
return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 ||
SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64);
@@ -746,6 +748,7 @@ namespace llvm {
case v64i32:
case v32i64:
case nxv32i64: return 2048;
+ case ExceptRef: return 0; // opaque type
}
}
diff --git a/contrib/llvm/include/llvm/Support/MathExtras.h b/contrib/llvm/include/llvm/Support/MathExtras.h
index a37a16784e2a..b59f21b4998e 100644
--- a/contrib/llvm/include/llvm/Support/MathExtras.h
+++ b/contrib/llvm/include/llvm/Support/MathExtras.h
@@ -23,22 +23,30 @@
#include <limits>
#include <type_traits>
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-
#ifdef __ANDROID_NDK__
#include <android/api-level.h>
#endif
+#ifdef _MSC_VER
+// Declare these intrinsics manually rather than including intrin.h. It's very
+// expensive, and MathExtras.h is popular.
+// #include <intrin.h>
+extern "C" {
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+}
+#endif
+
namespace llvm {
-/// \brief The behavior an operation has on an input of 0.
+/// The behavior an operation has on an input of 0.
enum ZeroBehavior {
- /// \brief The returned value is undefined.
+ /// The returned value is undefined.
ZB_Undefined,
- /// \brief The returned value is numeric_limits<T>::max()
+ /// The returned value is numeric_limits<T>::max()
ZB_Max,
- /// \brief The returned value is numeric_limits<T>::digits
+ /// The returned value is numeric_limits<T>::digits
ZB_Width
};
@@ -101,7 +109,7 @@ template <typename T> struct TrailingZerosCounter<T, 8> {
#endif
} // namespace detail
-/// \brief Count number of 0's from the least significant bit to the most
+/// Count number of 0's from the least significant bit to the most
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
@@ -170,7 +178,7 @@ template <typename T> struct LeadingZerosCounter<T, 8> {
#endif
} // namespace detail
-/// \brief Count number of 0's from the most significant bit to the least
+/// Count number of 0's from the most significant bit to the least
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
@@ -185,7 +193,7 @@ std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
}
-/// \brief Get the index of the first set bit starting from the least
+/// Get the index of the first set bit starting from the least
/// significant bit.
///
/// Only unsigned integral types are allowed.
@@ -199,7 +207,7 @@ template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
return countTrailingZeros(Val, ZB_Undefined);
}
-/// \brief Create a bitmask with the N right-most bits set to 1, and all other
+/// Create a bitmask with the N right-most bits set to 1, and all other
/// bits set to 0. Only unsigned types are allowed.
template <typename T> T maskTrailingOnes(unsigned N) {
static_assert(std::is_unsigned<T>::value, "Invalid type!");
@@ -208,25 +216,25 @@ template <typename T> T maskTrailingOnes(unsigned N) {
return N == 0 ? 0 : (T(-1) >> (Bits - N));
}
-/// \brief Create a bitmask with the N left-most bits set to 1, and all other
+/// Create a bitmask with the N left-most bits set to 1, and all other
/// bits set to 0. Only unsigned types are allowed.
template <typename T> T maskLeadingOnes(unsigned N) {
return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}
-/// \brief Create a bitmask with the N right-most bits set to 0, and all other
+/// Create a bitmask with the N right-most bits set to 0, and all other
/// bits set to 1. Only unsigned types are allowed.
template <typename T> T maskTrailingZeros(unsigned N) {
return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
}
-/// \brief Create a bitmask with the N left-most bits set to 0, and all other
+/// Create a bitmask with the N left-most bits set to 0, and all other
/// bits set to 1. Only unsigned types are allowed.
template <typename T> T maskLeadingZeros(unsigned N) {
return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}
-/// \brief Get the index of the last set bit starting from the least
+/// Get the index of the last set bit starting from the least
/// significant bit.
///
/// Only unsigned integral types are allowed.
@@ -243,7 +251,7 @@ template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
(std::numeric_limits<T>::digits - 1);
}
-/// \brief Macro compressed bit reversal table for 256 bits.
+/// Macro compressed bit reversal table for 256 bits.
///
/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
static const unsigned char BitReverseTable256[256] = {
@@ -256,7 +264,7 @@ static const unsigned char BitReverseTable256[256] = {
#undef R6
};
-/// \brief Reverse the bits in \p Val.
+/// Reverse the bits in \p Val.
template <typename T>
T reverseBits(T Val) {
unsigned char in[sizeof(Val)];
@@ -442,7 +450,7 @@ inline uint64_t ByteSwap_64(uint64_t Value) {
return sys::SwapByteOrder_64(Value);
}
-/// \brief Count the number of ones from the most significant bit to the first
+/// Count the number of ones from the most significant bit to the first
/// zero bit.
///
/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
@@ -455,10 +463,10 @@ std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
static_assert(std::numeric_limits<T>::is_integer &&
!std::numeric_limits<T>::is_signed,
"Only unsigned integral types are allowed.");
- return countLeadingZeros(~Value, ZB);
+ return countLeadingZeros<T>(~Value, ZB);
}
-/// \brief Count the number of ones from the least significant bit to the first
+/// Count the number of ones from the least significant bit to the first
/// zero bit.
///
/// Ex. countTrailingOnes(0x00FF00FF) == 8.
@@ -471,7 +479,7 @@ std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
static_assert(std::numeric_limits<T>::is_integer &&
!std::numeric_limits<T>::is_signed,
"Only unsigned integral types are allowed.");
- return countTrailingZeros(~Value, ZB);
+ return countTrailingZeros<T>(~Value, ZB);
}
namespace detail {
@@ -505,7 +513,7 @@ template <typename T> struct PopulationCounter<T, 8> {
};
} // namespace detail
-/// \brief Count the number of set bits in a value.
+/// Count the number of set bits in a value.
/// Ex. countPopulation(0xF000F000) = 8
/// Returns 0 if the word is zero.
template <typename T>
@@ -608,7 +616,7 @@ constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
return (A | B) & (1 + ~(A | B));
}
-/// \brief Aligns \c Addr to \c Alignment bytes, rounding up.
+/// Aligns \c Addr to \c Alignment bytes, rounding up.
///
/// Alignment should be a power of two. This method rounds up, so
/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8.
@@ -621,7 +629,7 @@ inline uintptr_t alignAddr(const void *Addr, size_t Alignment) {
return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1));
}
-/// \brief Returns the necessary adjustment for aligning \c Ptr to \c Alignment
+/// Returns the necessary adjustment for aligning \c Ptr to \c Alignment
/// bytes, rounding up.
inline size_t alignmentAdjustment(const void *Ptr, size_t Alignment) {
return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr;
diff --git a/contrib/llvm/include/llvm/Support/MemAlloc.h b/contrib/llvm/include/llvm/Support/MemAlloc.h
new file mode 100644
index 000000000000..d06c659cfba6
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/MemAlloc.h
@@ -0,0 +1,49 @@
+//===- MemAlloc.h - Memory allocation functions -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines counterparts of C library allocation functions defined in
+/// the namespace 'std'. The new allocation functions crash on allocation
+/// failure instead of returning a null pointer.
+///
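+/// A usage sketch (the size is arbitrary):
+/// \code
+///   void *P = llvm::safe_malloc(128);
+///   std::free(P);
+/// \endcode
+///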
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MEMALLOC_H
+#define LLVM_SUPPORT_MEMALLOC_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdlib>
+
+namespace llvm {
+
+LLVM_ATTRIBUTE_RETURNS_NONNULL inline void *safe_malloc(size_t Sz) {
+ void *Result = std::malloc(Sz);
+ if (Result == nullptr)
+ report_bad_alloc_error("Allocation failed");
+ return Result;
+}
+
+LLVM_ATTRIBUTE_RETURNS_NONNULL inline void *safe_calloc(size_t Count,
+ size_t Sz) {
+ void *Result = std::calloc(Count, Sz);
+ if (Result == nullptr)
+ report_bad_alloc_error("Allocation failed");
+ return Result;
+}
+
+LLVM_ATTRIBUTE_RETURNS_NONNULL inline void *safe_realloc(void *Ptr, size_t Sz) {
+ void *Result = std::realloc(Ptr, Sz);
+ if (Result == nullptr)
+ report_bad_alloc_error("Allocation failed");
+ return Result;
+}
+
+}
+#endif
diff --git a/contrib/llvm/include/llvm/Support/Memory.h b/contrib/llvm/include/llvm/Support/Memory.h
index 3140dc6eef42..fa026d49a61b 100644
--- a/contrib/llvm/include/llvm/Support/Memory.h
+++ b/contrib/llvm/include/llvm/Support/Memory.h
@@ -25,7 +25,7 @@ namespace sys {
/// and a size. It is used by the Memory class (a friend) as the result of
/// various memory allocation operations.
/// @see Memory
- /// @brief Memory block abstraction.
+ /// Memory block abstraction.
class MemoryBlock {
public:
MemoryBlock() : Address(nullptr), Size(0) { }
@@ -42,7 +42,7 @@ namespace sys {
/// This class provides various memory handling functions that manipulate
/// MemoryBlock instances.
/// @since 1.4
- /// @brief An abstraction for memory operations.
+ /// An abstraction for memory operations.
class Memory {
public:
enum ProtectionFlags {
@@ -74,7 +74,7 @@ namespace sys {
/// \r a non-null MemoryBlock if the function was successful,
/// otherwise a null MemoryBlock is with \p EC describing the error.
///
- /// @brief Allocate mapped memory.
+ /// Allocate mapped memory.
static MemoryBlock allocateMappedMemory(size_t NumBytes,
const MemoryBlock *const NearBlock,
unsigned Flags,
@@ -88,7 +88,7 @@ namespace sys {
/// \r error_success if the function was successful, or an error_code
/// describing the failure if an error occurred.
///
- /// @brief Release mapped memory.
+ /// Release mapped memory.
static std::error_code releaseMappedMemory(MemoryBlock &Block);
/// This method sets the protection flags for a block of memory to the
@@ -105,7 +105,7 @@ namespace sys {
/// \r error_success if the function was successful, or an error_code
/// describing the failure if an error occurred.
///
- /// @brief Set memory protection state.
+ /// Set memory protection state.
static std::error_code protectMappedMemory(const MemoryBlock &Block,
unsigned Flags);
diff --git a/contrib/llvm/include/llvm/Support/MemoryBuffer.h b/contrib/llvm/include/llvm/Support/MemoryBuffer.h
index 7b849fdb8670..535579ecff53 100644
--- a/contrib/llvm/include/llvm/Support/MemoryBuffer.h
+++ b/contrib/llvm/include/llvm/Support/MemoryBuffer.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
#include <cstddef>
#include <cstdint>
#include <memory>
@@ -49,7 +50,8 @@ protected:
void init(const char *BufStart, const char *BufEnd,
bool RequiresNullTerminator);
- static constexpr bool Writable = false;
+ static constexpr sys::fs::mapped_file_region::mapmode Mapmode =
+ sys::fs::mapped_file_region::readonly;
public:
MemoryBuffer(const MemoryBuffer &) = delete;
@@ -117,12 +119,6 @@ public:
static std::unique_ptr<MemoryBuffer>
getMemBufferCopy(StringRef InputData, const Twine &BufferName = "");
- /// Allocate a new zero-initialized MemoryBuffer of the specified size. Note
- /// that the caller need not initialize the memory allocated by this method.
- /// The memory is owned by the MemoryBuffer object.
- static std::unique_ptr<MemoryBuffer>
- getNewMemBuffer(size_t Size, StringRef BufferName = "");
-
/// Read all of stdin into a file buffer, and return it.
static ErrorOr<std::unique_ptr<MemoryBuffer>> getSTDIN();
@@ -152,17 +148,21 @@ public:
virtual BufferKind getBufferKind() const = 0;
MemoryBufferRef getMemBufferRef() const;
+
+private:
+ virtual void anchor();
};
-/// This class is an extension of MemoryBuffer, which allows writing to the
-/// underlying contents. It only supports creation methods that are guaranteed
-/// to produce a writable buffer. For example, mapping a file read-only is not
-/// supported.
+/// This class is an extension of MemoryBuffer, which allows copy-on-write
+/// access to the underlying contents. It only supports creation methods that
+/// are guaranteed to produce a writable buffer. For example, mapping a file
+/// read-only is not supported.
class WritableMemoryBuffer : public MemoryBuffer {
protected:
WritableMemoryBuffer() = default;
- static constexpr bool Writable = true;
+ static constexpr sys::fs::mapped_file_region::mapmode Mapmode =
+ sys::fs::mapped_file_region::priv;
public:
using MemoryBuffer::getBuffer;
@@ -196,6 +196,60 @@ public:
static std::unique_ptr<WritableMemoryBuffer>
getNewUninitMemBuffer(size_t Size, const Twine &BufferName = "");
+ /// Allocate a new zero-initialized MemoryBuffer of the specified size. Note
+ /// that the caller need not initialize the memory allocated by this method.
+ /// The memory is owned by the MemoryBuffer object.
+ static std::unique_ptr<WritableMemoryBuffer>
+ getNewMemBuffer(size_t Size, const Twine &BufferName = "");
+
+private:
+ // Hide these base class factory functions so one can't write
+ // WritableMemoryBuffer::getXXX()
+ // and be surprised to get a read-only Buffer.
+ using MemoryBuffer::getFileAsStream;
+ using MemoryBuffer::getFileOrSTDIN;
+ using MemoryBuffer::getMemBuffer;
+ using MemoryBuffer::getMemBufferCopy;
+ using MemoryBuffer::getOpenFile;
+ using MemoryBuffer::getOpenFileSlice;
+ using MemoryBuffer::getSTDIN;
+};
+
+/// This class is an extension of MemoryBuffer, which allows write access to
+/// the underlying contents and committing those changes to the original source.
+/// It only supports creation methods that are guaranteed to produce a writable
+/// buffer. For example, mapping a file read-only is not supported.
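+///
+/// A sketch of intended use (the filename is illustrative):
+/// \code
+///   auto BufOrErr = WriteThroughMemoryBuffer::getFile("data.bin");
+///   if (BufOrErr)
+///     (*BufOrErr)->getBufferStart()[0] = 0; // change reaches the file
+/// \endcode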
+class WriteThroughMemoryBuffer : public MemoryBuffer {
+protected:
+ WriteThroughMemoryBuffer() = default;
+
+ static constexpr sys::fs::mapped_file_region::mapmode Mapmode =
+ sys::fs::mapped_file_region::readwrite;
+
+public:
+ using MemoryBuffer::getBuffer;
+ using MemoryBuffer::getBufferEnd;
+ using MemoryBuffer::getBufferStart;
+
+ // const_cast is well-defined here, because the underlying buffer is
+ // guaranteed to have been initialized with a mutable buffer.
+ char *getBufferStart() {
+ return const_cast<char *>(MemoryBuffer::getBufferStart());
+ }
+ char *getBufferEnd() {
+ return const_cast<char *>(MemoryBuffer::getBufferEnd());
+ }
+ MutableArrayRef<char> getBuffer() {
+ return {getBufferStart(), getBufferEnd()};
+ }
+
+ static ErrorOr<std::unique_ptr<WriteThroughMemoryBuffer>>
+ getFile(const Twine &Filename, int64_t FileSize = -1);
+
+ /// Map a subrange of the specified file as a WriteThroughMemoryBuffer.
+ static ErrorOr<std::unique_ptr<WriteThroughMemoryBuffer>>
+ getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset);
+
private:
// Hide these base class factory function so one can't write
// WritableMemoryBuffer::getXXX()
@@ -204,7 +258,6 @@ private:
using MemoryBuffer::getFileOrSTDIN;
using MemoryBuffer::getMemBuffer;
using MemoryBuffer::getMemBufferCopy;
- using MemoryBuffer::getNewMemBuffer;
using MemoryBuffer::getOpenFile;
using MemoryBuffer::getOpenFileSlice;
using MemoryBuffer::getSTDIN;
diff --git a/contrib/llvm/include/llvm/Support/MipsABIFlags.h b/contrib/llvm/include/llvm/Support/MipsABIFlags.h
index 93f6b416ba88..12c350015b21 100644
--- a/contrib/llvm/include/llvm/Support/MipsABIFlags.h
+++ b/contrib/llvm/include/llvm/Support/MipsABIFlags.h
@@ -42,7 +42,9 @@ enum AFL_ASE {
AFL_ASE_MSA = 0x00000200, // MSA ASE
AFL_ASE_MIPS16 = 0x00000400, // MIPS16 ASE
AFL_ASE_MICROMIPS = 0x00000800, // MICROMIPS ASE
- AFL_ASE_XPA = 0x00001000 // XPA ASE
+ AFL_ASE_XPA = 0x00001000, // XPA ASE
+ AFL_ASE_CRC = 0x00008000, // CRC ASE
+ AFL_ASE_GINV = 0x00020000 // GINV ASE
};
// Values for the isa_ext word of an ABI flags structure.
diff --git a/contrib/llvm/include/llvm/Support/Mutex.h b/contrib/llvm/include/llvm/Support/Mutex.h
index 0f4e61af4439..680d94b24ef5 100644
--- a/contrib/llvm/include/llvm/Support/Mutex.h
+++ b/contrib/llvm/include/llvm/Support/Mutex.h
@@ -14,6 +14,7 @@
#ifndef LLVM_SUPPORT_MUTEX_H
#define LLVM_SUPPORT_MUTEX_H
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Threading.h"
#include <cassert>
@@ -22,7 +23,7 @@ namespace llvm
{
namespace sys
{
- /// @brief Platform agnostic Mutex class.
+ /// Platform agnostic Mutex class.
class MutexImpl
{
/// @name Constructors
@@ -33,11 +34,11 @@ namespace llvm
/// to false, the lock will not be recursive which makes it cheaper but
/// also more likely to deadlock (same thread can't acquire more than
/// once).
- /// @brief Default Constructor.
+ /// Default Constructor.
explicit MutexImpl(bool recursive = true);
/// Releases and removes the lock
- /// @brief Destructor
+ /// Destructor
~MutexImpl();
/// @}
@@ -48,14 +49,14 @@ namespace llvm
/// Attempts to unconditionally acquire the lock. If the lock is held by
/// another thread, this method will wait until it can acquire the lock.
/// @returns false if any kind of error occurs, true otherwise.
- /// @brief Unconditionally acquire the lock.
+ /// Unconditionally acquire the lock.
bool acquire();
/// Attempts to release the lock. If the lock is held by the current
/// thread, the lock is released allowing other threads to acquire the
/// lock.
/// @returns false if any kind of error occurs, true otherwise.
- /// @brief Unconditionally release the lock.
+ /// Unconditionally release the lock.
bool release();
/// Attempts to acquire the lock without blocking. If the lock is not
@@ -63,7 +64,7 @@ namespace llvm
/// the lock is available, it is acquired.
/// @returns false if any kind of error occurs or the lock is not
/// available, true otherwise.
- /// @brief Try to acquire the lock.
+ /// Try to acquire the lock.
bool tryacquire();
//@}
diff --git a/contrib/llvm/include/llvm/Support/MutexGuard.h b/contrib/llvm/include/llvm/Support/MutexGuard.h
index 07b64b611960..641d64d94988 100644
--- a/contrib/llvm/include/llvm/Support/MutexGuard.h
+++ b/contrib/llvm/include/llvm/Support/MutexGuard.h
@@ -23,7 +23,7 @@ namespace llvm {
/// these on the stack at the top of some scope to be assured that C++
/// destruction of the object will always release the Mutex and thus avoid
/// a host of nasty multi-threading problems in the face of exceptions, etc.
- /// @brief Guard a section of code with a Mutex.
+ /// Guard a section of code with a Mutex.
class MutexGuard {
sys::Mutex &M;
MutexGuard(const MutexGuard &) = delete;
diff --git a/contrib/llvm/include/llvm/Support/OnDiskHashTable.h b/contrib/llvm/include/llvm/Support/OnDiskHashTable.h
index e9c28daf03b9..912e2700d1a0 100644
--- a/contrib/llvm/include/llvm/Support/OnDiskHashTable.h
+++ b/contrib/llvm/include/llvm/Support/OnDiskHashTable.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Defines facilities for reading and writing on-disk hash tables.
+/// Defines facilities for reading and writing on-disk hash tables.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H
@@ -25,7 +25,7 @@
namespace llvm {
-/// \brief Generates an on disk hash table.
+/// Generates an on disk hash table.
///
/// This needs an \c Info that handles storing values into the hash table's
/// payload and computes the hash for a given key. This should provide the
@@ -57,7 +57,7 @@ namespace llvm {
/// };
/// \endcode
template <typename Info> class OnDiskChainedHashTableGenerator {
- /// \brief A single item in the hash table.
+ /// A single item in the hash table.
class Item {
public:
typename Info::key_type Key;
@@ -75,7 +75,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator {
offset_type NumEntries;
llvm::SpecificBumpPtrAllocator<Item> BA;
- /// \brief A linked list of values in a particular hash bucket.
+ /// A linked list of values in a particular hash bucket.
struct Bucket {
offset_type Off;
unsigned Length;
@@ -85,7 +85,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator {
Bucket *Buckets;
private:
- /// \brief Insert an item into the appropriate hash bucket.
+ /// Insert an item into the appropriate hash bucket.
void insert(Bucket *Buckets, size_t Size, Item *E) {
Bucket &B = Buckets[E->Hash & (Size - 1)];
E->Next = B.Head;
@@ -93,9 +93,10 @@ private:
B.Head = E;
}
- /// \brief Resize the hash table, moving the old entries into the new buckets.
+ /// Resize the hash table, moving the old entries into the new buckets.
void resize(size_t NewSize) {
- Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket));
+ Bucket *NewBuckets = static_cast<Bucket *>(
+ safe_calloc(NewSize, sizeof(Bucket)));
// Populate NewBuckets with the old entries.
for (size_t I = 0; I < NumBuckets; ++I)
for (Item *E = Buckets[I].Head; E;) {
@@ -111,14 +112,14 @@ private:
}
public:
- /// \brief Insert an entry into the table.
+ /// Insert an entry into the table.
void insert(typename Info::key_type_ref Key,
typename Info::data_type_ref Data) {
Info InfoObj;
insert(Key, Data, InfoObj);
}
- /// \brief Insert an entry into the table.
+ /// Insert an entry into the table.
///
/// Uses the provided Info instead of a stack allocated one.
void insert(typename Info::key_type_ref Key,
@@ -129,7 +130,7 @@ public:
insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj));
}
- /// \brief Determine whether an entry has been inserted.
+ /// Determine whether an entry has been inserted.
bool contains(typename Info::key_type_ref Key, Info &InfoObj) {
unsigned Hash = InfoObj.ComputeHash(Key);
for (Item *I = Buckets[Hash & (NumBuckets - 1)].Head; I; I = I->Next)
@@ -138,18 +139,18 @@ public:
return false;
}
- /// \brief Emit the table to Out, which must not be at offset 0.
+ /// Emit the table to Out, which must not be at offset 0.
offset_type Emit(raw_ostream &Out) {
Info InfoObj;
return Emit(Out, InfoObj);
}
- /// \brief Emit the table to Out, which must not be at offset 0.
+ /// Emit the table to Out, which must not be at offset 0.
///
/// Uses the provided Info instead of a stack allocated one.
offset_type Emit(raw_ostream &Out, Info &InfoObj) {
using namespace llvm::support;
- endian::Writer<little> LE(Out);
+ endian::Writer LE(Out, little);
// Now we're done adding entries, resize the bucket list if it's
// significantly too large. (This only happens if the number of
@@ -226,13 +227,13 @@ public:
NumBuckets = 64;
// Note that we do not need to run the constructors of the individual
// Bucket objects since 'calloc' returns bytes that are all 0.
- Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket));
+ Buckets = static_cast<Bucket *>(safe_calloc(NumBuckets, sizeof(Bucket)));
}
~OnDiskChainedHashTableGenerator() { std::free(Buckets); }
};
-/// \brief Provides lookup on an on disk hash table.
+/// Provides lookup on an on disk hash table.
///
/// This needs an \c Info that handles reading values from the hash table's
/// payload and computes the hash for a given key. This should provide the
@@ -338,14 +339,14 @@ public:
bool operator!=(const iterator &X) const { return X.Data != Data; }
};
- /// \brief Look up the stored data for a particular key.
+ /// Look up the stored data for a particular key.
iterator find(const external_key_type &EKey, Info *InfoPtr = nullptr) {
const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
hash_value_type KeyHash = InfoObj.ComputeHash(IKey);
return find_hashed(IKey, KeyHash, InfoPtr);
}
- /// \brief Look up the stored data for a particular key with a known hash.
+ /// Look up the stored data for a particular key with a known hash.
iterator find_hashed(const internal_key_type &IKey, hash_value_type KeyHash,
Info *InfoPtr = nullptr) {
using namespace llvm::support;
@@ -403,7 +404,7 @@ public:
Info &getInfoObj() { return InfoObj; }
- /// \brief Create the hash table.
+ /// Create the hash table.
///
/// \param Buckets is the beginning of the hash table itself, which follows
/// the payload of entire structure. This is the value returned by
@@ -423,7 +424,7 @@ public:
}
};
-/// \brief Provides lookup and iteration over an on disk hash table.
+/// Provides lookup and iteration over an on disk hash table.
///
/// \copydetails llvm::OnDiskChainedHashTable
template <typename Info>
@@ -439,7 +440,7 @@ public:
typedef typename base_type::offset_type offset_type;
private:
- /// \brief Iterates over all of the keys in the table.
+ /// Iterates over all of the keys in the table.
class iterator_base {
const unsigned char *Ptr;
offset_type NumItemsInBucketLeft;
@@ -496,7 +497,7 @@ public:
: base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
Payload(Payload) {}
- /// \brief Iterates over all of the keys in the table.
+ /// Iterates over all of the keys in the table.
class key_iterator : public iterator_base {
Info *InfoObj;
@@ -542,7 +543,7 @@ public:
return make_range(key_begin(), key_end());
}
- /// \brief Iterates over all the entries in the table, returning the data.
+ /// Iterates over all the entries in the table, returning the data.
class data_iterator : public iterator_base {
Info *InfoObj;
@@ -585,7 +586,7 @@ public:
return make_range(data_begin(), data_end());
}
- /// \brief Create the hash table.
+ /// Create the hash table.
///
/// \param Buckets is the beginning of the hash table itself, which follows
/// the payload of entire structure. This is the value returned by
diff --git a/contrib/llvm/include/llvm/Support/Options.h b/contrib/llvm/include/llvm/Support/Options.h
index 9019804d24e0..dd321c6a1984 100644
--- a/contrib/llvm/include/llvm/Support/Options.h
+++ b/contrib/llvm/include/llvm/Support/Options.h
@@ -56,7 +56,7 @@ char OptionKey<ValT, Base, Mem>::ID = 0;
} // namespace detail
-/// \brief Singleton class used to register debug options.
+/// Singleton class used to register debug options.
///
/// The OptionRegistry is responsible for managing lifetimes of the options and
/// provides interfaces for option registration and reading values from options.
@@ -66,7 +66,7 @@ class OptionRegistry {
private:
DenseMap<void *, cl::Option *> Options;
- /// \brief Adds a cl::Option to the registry.
+ /// Adds a cl::Option to the registry.
///
/// \param Key unique key for option
/// \param O option to map to \p Key
@@ -79,10 +79,10 @@ public:
~OptionRegistry();
OptionRegistry() {}
- /// \brief Returns a reference to the singleton instance.
+ /// Returns a reference to the singleton instance.
static OptionRegistry &instance();
- /// \brief Registers an option with the OptionRegistry singleton.
+ /// Registers an option with the OptionRegistry singleton.
///
/// \tparam ValT type of the option's data
/// \tparam Base class used to key the option
@@ -100,7 +100,7 @@ public:
instance().addOption(&detail::OptionKey<ValT, Base, Mem>::ID, Option);
}
- /// \brief Returns the value of the option.
+ /// Returns the value of the option.
///
/// \tparam ValT type of the option's data
/// \tparam Base class used to key the option
diff --git a/contrib/llvm/include/llvm/Support/Parallel.h b/contrib/llvm/include/llvm/Support/Parallel.h
index 6bc0a6bbaf2b..1462265343be 100644
--- a/contrib/llvm/include/llvm/Support/Parallel.h
+++ b/contrib/llvm/include/llvm/Support/Parallel.h
@@ -56,12 +56,12 @@ public:
~Latch() { sync(); }
void inc() {
- std::unique_lock<std::mutex> lock(Mutex);
+ std::lock_guard<std::mutex> lock(Mutex);
++Count;
}
void dec() {
- std::unique_lock<std::mutex> lock(Mutex);
+ std::lock_guard<std::mutex> lock(Mutex);
if (--Count == 0)
Cond.notify_all();
}
@@ -100,7 +100,7 @@ void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) {
#else
const ptrdiff_t MinParallelSize = 1024;
-/// \brief Inclusive median.
+/// Inclusive median.
template <class RandomAccessIterator, class Comparator>
RandomAccessIterator medianOf3(RandomAccessIterator Start,
RandomAccessIterator End,
@@ -118,7 +118,7 @@ void parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End,
const Comparator &Comp, TaskGroup &TG, size_t Depth) {
// Do a sequential sort for small inputs.
if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) {
- std::sort(Start, End, Comp);
+ llvm::sort(Start, End, Comp);
return;
}
@@ -200,7 +200,7 @@ void sort(Policy policy, RandomAccessIterator Start, RandomAccessIterator End,
const Comparator &Comp = Comparator()) {
static_assert(is_execution_policy<Policy>::value,
"Invalid execution policy!");
- std::sort(Start, End, Comp);
+ llvm::sort(Start, End, Comp);
}
template <class Policy, class IterTy, class FuncTy>
diff --git a/contrib/llvm/include/llvm/Support/Path.h b/contrib/llvm/include/llvm/Support/Path.h
index e5979674cf1c..c4cc93721d7e 100644
--- a/contrib/llvm/include/llvm/Support/Path.h
+++ b/contrib/llvm/include/llvm/Support/Path.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/iterator.h"
#include "llvm/Support/DataTypes.h"
#include <iterator>
+#include <system_error>
namespace llvm {
namespace sys {
@@ -30,7 +31,7 @@ enum class Style { windows, posix, native };
/// @name Lexical Component Iterator
/// @{
-/// @brief Path iterator.
+/// Path iterator.
///
/// This is an input iterator that iterates over the individual components in
/// \a path. The traversal order is as follows:
@@ -66,11 +67,11 @@ public:
const_iterator &operator++(); // preincrement
bool operator==(const const_iterator &RHS) const;
- /// @brief Difference in bytes between this and RHS.
+ /// Difference in bytes between this and RHS.
ptrdiff_t operator-(const const_iterator &RHS) const;
};
-/// @brief Reverse path iterator.
+/// Reverse path iterator.
///
/// This is an input iterator that iterates over the individual components in
/// \a path in reverse order. The traversal order is exactly reversed from that
@@ -91,26 +92,26 @@ public:
reverse_iterator &operator++(); // preincrement
bool operator==(const reverse_iterator &RHS) const;
- /// @brief Difference in bytes between this and RHS.
+ /// Difference in bytes between this and RHS.
ptrdiff_t operator-(const reverse_iterator &RHS) const;
};
-/// @brief Get begin iterator over \a path.
+/// Get begin iterator over \a path.
/// @param path Input path.
/// @returns Iterator initialized with the first component of \a path.
const_iterator begin(StringRef path, Style style = Style::native);
-/// @brief Get end iterator over \a path.
+/// Get end iterator over \a path.
/// @param path Input path.
/// @returns Iterator initialized to the end of \a path.
const_iterator end(StringRef path);
-/// @brief Get reverse begin iterator over \a path.
+/// Get reverse begin iterator over \a path.
/// @param path Input path.
/// @returns Iterator initialized with the first reverse component of \a path.
reverse_iterator rbegin(StringRef path, Style style = Style::native);
-/// @brief Get reverse end iterator over \a path.
+/// Get reverse end iterator over \a path.
/// @param path Input path.
/// @returns Iterator initialized to the reverse end of \a path.
reverse_iterator rend(StringRef path);
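The begin/end and rbegin/rend pairs above give a purely lexical, allocation-free walk over a path's components. A brief sketch using the iterators declared here (output names are illustrative):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Path.h"
    #include "llvm/Support/raw_ostream.h"

    // Walks the root name, the root directory, then each remaining component.
    void printComponents(llvm::StringRef P, llvm::sys::path::Style S) {
      namespace path = llvm::sys::path;
      for (auto I = path::begin(P, S), E = path::end(P); I != E; ++I)
        llvm::outs() << *I << "\n";
    }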
@@ -119,7 +120,7 @@ reverse_iterator rend(StringRef path);
/// @name Lexical Modifiers
/// @{
-/// @brief Remove the last component from \a path unless it is the root dir.
+/// Remove the last component from \a path unless it is the root dir.
///
/// @code
/// directory/filename.cpp => directory/
@@ -131,7 +132,7 @@ reverse_iterator rend(StringRef path);
/// @param path A path that is modified to not have a file component.
void remove_filename(SmallVectorImpl<char> &path, Style style = Style::native);
-/// @brief Replace the file extension of \a path with \a extension.
+/// Replace the file extension of \a path with \a extension.
///
/// @code
/// ./filename.cpp => ./filename.extension
@@ -146,7 +147,7 @@ void remove_filename(SmallVectorImpl<char> &path, Style style = Style::native);
void replace_extension(SmallVectorImpl<char> &path, const Twine &extension,
Style style = Style::native);
-/// @brief Replace matching path prefix with another path.
+/// Replace matching path prefix with another path.
///
/// @code
/// /foo, /old, /new => /foo
@@ -163,7 +164,7 @@ void replace_path_prefix(SmallVectorImpl<char> &Path,
const StringRef &OldPrefix, const StringRef &NewPrefix,
Style style = Style::native);
-/// @brief Append to path.
+/// Append to path.
///
/// @code
/// /foo + bar/f => /foo/bar/f
@@ -181,7 +182,7 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
void append(SmallVectorImpl<char> &path, Style style, const Twine &a,
const Twine &b = "", const Twine &c = "", const Twine &d = "");
-/// @brief Append to path.
+/// Append to path.
///
/// @code
/// /foo + [bar,f] => /foo/bar/f
@@ -215,7 +216,7 @@ void native(const Twine &path, SmallVectorImpl<char> &result,
/// @param path A path that is transformed to native format.
void native(SmallVectorImpl<char> &path, Style style = Style::native);
-/// @brief Replaces backslashes with slashes if Windows.
+/// Replaces backslashes with slashes if Windows.
///
/// @param path processed path
/// @result The result of replacing backslashes with forward slashes if Windows.
@@ -227,7 +228,7 @@ std::string convert_to_slash(StringRef path, Style style = Style::native);
/// @name Lexical Observers
/// @{
-/// @brief Get root name.
+/// Get root name.
///
/// @code
/// //net/hello => //net
@@ -239,7 +240,7 @@ std::string convert_to_slash(StringRef path, Style style = Style::native);
/// @result The root name of \a path if it has one, otherwise "".
StringRef root_name(StringRef path, Style style = Style::native);
-/// @brief Get root directory.
+/// Get root directory.
///
/// @code
/// /goo/hello => /
@@ -252,7 +253,7 @@ StringRef root_name(StringRef path, Style style = Style::native);
/// "".
StringRef root_directory(StringRef path, Style style = Style::native);
-/// @brief Get root path.
+/// Get root path.
///
/// Equivalent to root_name + root_directory.
///
@@ -260,7 +261,7 @@ StringRef root_directory(StringRef path, Style style = Style::native);
/// @result The root path of \a path if it has one, otherwise "".
StringRef root_path(StringRef path, Style style = Style::native);
-/// @brief Get relative path.
+/// Get relative path.
///
/// @code
/// C:\hello\world => hello\world
@@ -272,7 +273,7 @@ StringRef root_path(StringRef path, Style style = Style::native);
/// @result The path starting after root_path if one exists, otherwise "".
StringRef relative_path(StringRef path, Style style = Style::native);
-/// @brief Get parent path.
+/// Get parent path.
///
/// @code
/// / => <empty>
@@ -284,7 +285,7 @@ StringRef relative_path(StringRef path, Style style = Style::native);
/// @result The parent path of \a path if one exists, otherwise "".
StringRef parent_path(StringRef path, Style style = Style::native);
-/// @brief Get filename.
+/// Get filename.
///
/// @code
/// /foo.txt => foo.txt
@@ -298,7 +299,7 @@ StringRef parent_path(StringRef path, Style style = Style::native);
/// of \a path.
StringRef filename(StringRef path, Style style = Style::native);
-/// @brief Get stem.
+/// Get stem.
///
/// If filename contains a dot but not solely one or two dots, result is the
/// substring of filename ending at (but not including) the last dot. Otherwise
@@ -316,7 +317,7 @@ StringRef filename(StringRef path, Style style = Style::native);
/// @result The stem of \a path.
StringRef stem(StringRef path, Style style = Style::native);
-/// @brief Get extension.
+/// Get extension.
///
/// If filename contains a dot but not solely one or two dots, result is the
/// substring of filename starting at (and including) the last dot, and ending
@@ -332,18 +333,18 @@ StringRef stem(StringRef path, Style style = Style::native);
/// @result The extension of \a path.
StringRef extension(StringRef path, Style style = Style::native);
-/// @brief Check whether the given char is a path separator on the host OS.
+/// Check whether the given char is a path separator on the host OS.
///
/// @param value a character
/// @result true if \a value is a path separator character on the host OS
bool is_separator(char value, Style style = Style::native);
-/// @brief Return the preferred separator for this platform.
+/// Return the preferred separator for this platform.
///
/// @result StringRef of the preferred separator, null-terminated.
StringRef get_separator(Style style = Style::native);
-/// @brief Get the typical temporary directory for the system, e.g.,
+/// Get the typical temporary directory for the system, e.g.,
/// "/var/tmp" or "C:/TEMP"
///
/// @param erasedOnReboot Whether to favor a path that is erased on reboot
@@ -354,13 +355,13 @@ StringRef get_separator(Style style = Style::native);
/// @param result Holds the resulting path name.
void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result);
-/// @brief Get the user's home directory.
+/// Get the user's home directory.
///
/// @param result Holds the resulting path name.
/// @result True if a home directory is set, false otherwise.
bool home_directory(SmallVectorImpl<char> &result);
-/// @brief Get the user's cache directory.
+/// Get the user's cache directory.
///
/// Expect the resulting path to be a directory shared with other
/// applications/services used by the user. Params \p Path1 to \p Path3 can be
@@ -376,7 +377,7 @@ bool home_directory(SmallVectorImpl<char> &result);
bool user_cache_directory(SmallVectorImpl<char> &Result, const Twine &Path1,
const Twine &Path2 = "", const Twine &Path3 = "");
-/// @brief Has root name?
+/// Has root name?
///
/// root_name != ""
///
@@ -384,7 +385,7 @@ bool user_cache_directory(SmallVectorImpl<char> &Result, const Twine &Path1,
/// @result True if the path has a root name, false otherwise.
bool has_root_name(const Twine &path, Style style = Style::native);
-/// @brief Has root directory?
+/// Has root directory?
///
/// root_directory != ""
///
@@ -392,7 +393,7 @@ bool has_root_name(const Twine &path, Style style = Style::native);
/// @result True if the path has a root directory, false otherwise.
bool has_root_directory(const Twine &path, Style style = Style::native);
-/// @brief Has root path?
+/// Has root path?
///
/// root_path != ""
///
@@ -400,7 +401,7 @@ bool has_root_directory(const Twine &path, Style style = Style::native);
/// @result True if the path has a root path, false otherwise.
bool has_root_path(const Twine &path, Style style = Style::native);
-/// @brief Has relative path?
+/// Has relative path?
///
/// relative_path != ""
///
@@ -408,7 +409,7 @@ bool has_root_path(const Twine &path, Style style = Style::native);
/// @result True if the path has a relative path, false otherwise.
bool has_relative_path(const Twine &path, Style style = Style::native);
-/// @brief Has parent path?
+/// Has parent path?
///
/// parent_path != ""
///
@@ -416,7 +417,7 @@ bool has_relative_path(const Twine &path, Style style = Style::native);
/// @result True if the path has a parent path, false otherwise.
bool has_parent_path(const Twine &path, Style style = Style::native);
-/// @brief Has filename?
+/// Has filename?
///
/// filename != ""
///
@@ -424,7 +425,7 @@ bool has_parent_path(const Twine &path, Style style = Style::native);
/// @result True if the path has a filename, false otherwise.
bool has_filename(const Twine &path, Style style = Style::native);
-/// @brief Has stem?
+/// Has stem?
///
/// stem != ""
///
@@ -432,7 +433,7 @@ bool has_filename(const Twine &path, Style style = Style::native);
/// @result True if the path has a stem, false otherwise.
bool has_stem(const Twine &path, Style style = Style::native);
-/// @brief Has extension?
+/// Has extension?
///
/// extension != ""
///
@@ -440,25 +441,25 @@ bool has_stem(const Twine &path, Style style = Style::native);
/// @result True if the path has an extension, false otherwise.
bool has_extension(const Twine &path, Style style = Style::native);
-/// @brief Is path absolute?
+/// Is path absolute?
///
/// @param path Input path.
/// @result True if the path is absolute, false if it is not.
bool is_absolute(const Twine &path, Style style = Style::native);
-/// @brief Is path relative?
+/// Is path relative?
///
/// @param path Input path.
/// @result True if the path is relative, false if it is not.
bool is_relative(const Twine &path, Style style = Style::native);
-/// @brief Remove redundant leading "./" pieces and consecutive separators.
+/// Remove redundant leading "./" pieces and consecutive separators.
///
/// @param path Input path.
/// @result The cleaned-up \a path.
StringRef remove_leading_dotslash(StringRef path, Style style = Style::native);
-/// @brief In-place remove any './' and optionally '../' components from a path.
+/// In-place remove any './' and optionally '../' components from a path.
///
/// @param path processed path
/// @param remove_dot_dot specify if '../' (except for leading "../") should be
@@ -467,6 +468,10 @@ StringRef remove_leading_dotslash(StringRef path, Style style = Style::native);
bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot = false,
Style style = Style::native);
+#if defined(_WIN32)
+std::error_code widenPath(const Twine &Path8, SmallVectorImpl<wchar_t> &Path16);
+#endif
+
} // end namespace path
} // end namespace sys
} // end namespace llvm
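Taken together, the lexical modifiers and observers above operate only on the string and never touch the filesystem. A short sketch combining a few of them (the paths are illustrative):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Path.h"

    void demo() {
      namespace path = llvm::sys::path;
      llvm::SmallString<128> P("/tmp/report.txt");

      path::replace_extension(P, ".csv");            // /tmp/report.csv
      path::append(P, "..", "archive");              // /tmp/report.csv/../archive
      path::remove_dots(P, /*remove_dot_dot=*/true); // /tmp/archive (lexical only)

      bool Abs = path::is_absolute(P.str());          // true for this input
      llvm::StringRef Name = path::filename(P.str()); // "archive"
      (void)Abs;
      (void)Name;
    }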
diff --git a/contrib/llvm/include/llvm/Support/PointerLikeTypeTraits.h b/contrib/llvm/include/llvm/Support/PointerLikeTypeTraits.h
index 794230d606a4..1710b57131d1 100644
--- a/contrib/llvm/include/llvm/Support/PointerLikeTypeTraits.h
+++ b/contrib/llvm/include/llvm/Support/PointerLikeTypeTraits.h
@@ -16,6 +16,7 @@
#define LLVM_SUPPORT_POINTERLIKETYPETRAITS_H
#include "llvm/Support/DataTypes.h"
+#include <assert.h>
#include <type_traits>
namespace llvm {
@@ -111,6 +112,39 @@ template <> struct PointerLikeTypeTraits<uintptr_t> {
enum { NumLowBitsAvailable = 0 };
};
+/// Provide suitable custom traits struct for function pointers.
+///
+/// Function pointers can't be directly given these traits as functions can't
+/// have their alignment computed with `alignof` and we need different casting.
+///
+/// To rely on higher alignment for a specialized use, you can provide a
+/// customized form of this template explicitly with higher alignment, and
+/// potentially use alignment attributes on functions to satisfy that.
+template <int Alignment, typename FunctionPointerT>
+struct FunctionPointerLikeTypeTraits {
+ enum { NumLowBitsAvailable = detail::ConstantLog2<Alignment>::value };
+ static inline void *getAsVoidPointer(FunctionPointerT P) {
+ assert((reinterpret_cast<uintptr_t>(P) &
+ ~((uintptr_t)-1 << NumLowBitsAvailable)) == 0 &&
+ "Alignment not satisfied for an actual function pointer!");
+ return reinterpret_cast<void *>(P);
+ }
+ static inline FunctionPointerT getFromVoidPointer(void *P) {
+ return reinterpret_cast<FunctionPointerT>(P);
+ }
+};
+
+/// Provide a default specialization for function pointers that assumes 4-byte
+/// alignment.
+///
+/// We assume here that functions used with this are always at least 4-byte
+/// aligned. This means that, for example, thumb functions won't work or systems
+/// with weird unaligned function pointers won't work. But all practical systems
+/// we support satisfy this requirement.
+template <typename ReturnT, typename... ParamTs>
+struct PointerLikeTypeTraits<ReturnT (*)(ParamTs...)>
+ : FunctionPointerLikeTypeTraits<4, ReturnT (*)(ParamTs...)> {};
+
} // end namespace llvm
#endif
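With the new FunctionPointerLikeTypeTraits specialization, a plain function pointer can participate in the pointer-packing utilities that rely on PointerLikeTypeTraits. A sketch of the intended effect, assuming the default 4-byte function alignment holds on the target:

    #include "llvm/ADT/PointerIntPair.h"

    using Callback = void (*)(int);

    // Two low bits are assumed free (log2(4)), so one flag bit fits alongside
    // the function pointer in a single pointer-sized word.
    using CallbackAndFlag = llvm::PointerIntPair<Callback, 1, bool>;

    void demo(Callback CB) {
      CallbackAndFlag P(CB, /*IntVal=*/true);
      if (P.getInt())
        P.getPointer()(42); // invoke the stored callback
    }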
diff --git a/contrib/llvm/include/llvm/Support/Process.h b/contrib/llvm/include/llvm/Support/Process.h
index 82b0d9f6ba28..f9f1cac86278 100644
--- a/contrib/llvm/include/llvm/Support/Process.h
+++ b/contrib/llvm/include/llvm/Support/Process.h
@@ -26,7 +26,6 @@
#define LLVM_SUPPORT_PROCESS_H
#include "llvm/ADT/Optional.h"
-#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Chrono.h"
#include "llvm/Support/DataTypes.h"
@@ -39,13 +38,13 @@ class StringRef;
namespace sys {
-/// \brief A collection of legacy interfaces for querying information about the
+/// A collection of legacy interfaces for querying information about the
/// current executing process.
class Process {
public:
static unsigned getPageSize();
- /// \brief Return process memory usage.
+ /// Return process memory usage.
/// This static function will return the total amount of memory allocated
/// by the process. This only counts the memory allocated via the malloc,
/// calloc and realloc functions and includes any "free" holes in the
@@ -67,10 +66,10 @@ public:
/// This function makes the necessary calls to the operating system to
/// prevent core files or any other kind of large memory dumps that can
/// occur when a program fails.
- /// @brief Prevent core file generation.
+ /// Prevent core file generation.
static void PreventCoreFiles();
- /// \brief true if PreventCoreFiles has been called, false otherwise.
+ /// true if PreventCoreFiles has been called, false otherwise.
static bool AreCoreFilesPrevented();
// This function returns the environment variable \arg name's value as a UTF-8
@@ -90,14 +89,6 @@ public:
static Optional<std::string> FindInEnvPath(StringRef EnvName,
StringRef FileName);
- /// This function returns a SmallVector containing the arguments passed from
- /// the operating system to the program. This function expects to be handed
- /// the vector passed in from main.
- static std::error_code
- GetArgumentVector(SmallVectorImpl<const char *> &Args,
- ArrayRef<const char *> ArgsFromMain,
- SpecificBumpPtrAllocator<char> &ArgAllocator);
-
// This functions ensures that the standard file descriptors (input, output,
// and error) are properly mapped to a file descriptor before we use any of
// them. This should only be called by standalone programs, library
diff --git a/contrib/llvm/include/llvm/Support/Program.h b/contrib/llvm/include/llvm/Support/Program.h
index 06fd35078145..1f4dbdce3323 100644
--- a/contrib/llvm/include/llvm/Support/Program.h
+++ b/contrib/llvm/include/llvm/Support/Program.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/ErrorOr.h"
#include <system_error>
@@ -27,35 +28,32 @@ namespace sys {
// a colon on Unix or a semicolon on Windows.
#if defined(LLVM_ON_UNIX)
const char EnvPathSeparator = ':';
-#elif defined (LLVM_ON_WIN32)
+#elif defined (_WIN32)
const char EnvPathSeparator = ';';
#endif
-/// @brief This struct encapsulates information about a process.
-struct ProcessInfo {
-#if defined(LLVM_ON_UNIX)
- typedef pid_t ProcessId;
-#elif defined(LLVM_ON_WIN32)
- typedef unsigned long ProcessId; // Must match the type of DWORD on Windows.
- typedef void * HANDLE; // Must match the type of HANDLE on Windows.
- /// The handle to the process (available on Windows only).
- HANDLE ProcessHandle;
+#if defined(_WIN32)
+ typedef unsigned long procid_t; // Must match the type of DWORD on Windows.
+ typedef void *process_t; // Must match the type of HANDLE on Windows.
#else
-#error "ProcessInfo is not defined for this platform!"
+ typedef pid_t procid_t;
+ typedef procid_t process_t;
#endif
- enum : ProcessId { InvalidPid = 0 };
+ /// This struct encapsulates information about a process.
+ struct ProcessInfo {
+ enum : procid_t { InvalidPid = 0 };
- /// The process identifier.
- ProcessId Pid;
+ procid_t Pid; /// The process identifier.
+ process_t Process; /// Platform-dependent process object.
- /// The return code, set after execution.
- int ReturnCode;
+ /// The return code, set after execution.
+ int ReturnCode;
- ProcessInfo();
-};
+ ProcessInfo();
+ };
- /// \brief Find the first executable file \p Name in \p Paths.
+ /// Find the first executable file \p Name in \p Paths.
///
/// This does not perform hashing as a shell would but instead stats each PATH
/// entry individually so should generally be avoided. Core LLVM library
@@ -91,12 +89,13 @@ struct ProcessInfo {
int ExecuteAndWait(
StringRef Program, ///< Path of the program to be executed. It is
///< presumed this is the result of the findProgramByName method.
- const char **Args, ///< A vector of strings that are passed to the
+ ArrayRef<StringRef> Args, ///< An array of strings that are passed to the
///< program. The first element should be the name of the program.
- ///< The list *must* be terminated by a null char* entry.
- const char **Env = nullptr, ///< An optional vector of strings to use for
- ///< the program's environment. If not provided, the current program's
- ///< environment will be used.
+ ///< The array should **not** be terminated by an empty StringRef.
+ Optional<ArrayRef<StringRef>> Env = None, ///< An optional vector of
+ ///< strings to use for the program's environment. If not provided, the
+ ///< current program's environment will be used. If specified, the
+ ///< vector should **not** be terminated by an empty StringRef.
ArrayRef<Optional<StringRef>> Redirects = {}, ///<
///< An array of optional paths. Should have a size of zero or three.
///< If the array is empty, no redirections are performed.
@@ -125,8 +124,8 @@ struct ProcessInfo {
/// \note On Microsoft Windows systems, users will need to either call
/// \see Wait until the process finished execution or win32 CloseHandle() API
/// on ProcessInfo.ProcessHandle to avoid memory leaks.
- ProcessInfo ExecuteNoWait(StringRef Program, const char **Args,
- const char **Env = nullptr,
+ ProcessInfo ExecuteNoWait(StringRef Program, ArrayRef<StringRef> Args,
+ Optional<ArrayRef<StringRef>> Env,
ArrayRef<Optional<StringRef>> Redirects = {},
unsigned MemoryLimit = 0,
std::string *ErrMsg = nullptr,
@@ -135,6 +134,11 @@ struct ProcessInfo {
/// Return true if the given arguments fit within system-specific
/// argument length limits.
bool commandLineFitsWithinSystemLimits(StringRef Program,
+ ArrayRef<StringRef> Args);
+
+ /// Return true if the given arguments fit within system-specific
+ /// argument length limits.
+ bool commandLineFitsWithinSystemLimits(StringRef Program,
ArrayRef<const char *> Args);
/// File encoding options when writing contents that a non-UTF8 tool will
@@ -191,6 +195,14 @@ struct ProcessInfo {
///< string is non-empty upon return an error occurred while invoking the
///< program.
);
+
+#if defined(_WIN32)
+ /// Given a list of command line arguments, quote and escape them as necessary
+ /// to build a single flat command line appropriate for calling CreateProcess
+ /// on Windows.
+ std::string flattenWindowsCommandLine(ArrayRef<StringRef> Args);
+#endif
}
}
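The migration from null-terminated const char** arrays to ArrayRef<StringRef> removes the manual terminator convention for both the argument and environment lists. A hedged sketch of the new calling style (program path and arguments are illustrative; trailing parameters keep their defaults):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/None.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Program.h"

    int runEcho() {
      llvm::StringRef Program = "/bin/echo";
      llvm::StringRef Args[] = {"echo", "hello"}; // argv[0] first, no terminator
      // Inherit the parent environment (None) and perform no redirections.
      return llvm::sys::ExecuteAndWait(Program, Args, /*Env=*/llvm::None,
                                       /*Redirects=*/{});
    }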
diff --git a/contrib/llvm/include/llvm/Support/RWMutex.h b/contrib/llvm/include/llvm/Support/RWMutex.h
index 85f4fc09fb87..5ac3e558999b 100644
--- a/contrib/llvm/include/llvm/Support/RWMutex.h
+++ b/contrib/llvm/include/llvm/Support/RWMutex.h
@@ -21,7 +21,7 @@
namespace llvm {
namespace sys {
- /// @brief Platform agnostic RWMutex class.
+ /// Platform agnostic RWMutex class.
class RWMutexImpl
{
/// @name Constructors
@@ -29,7 +29,7 @@ namespace sys {
public:
/// Initializes the lock but doesn't acquire it.
- /// @brief Default Constructor.
+ /// Default Constructor.
explicit RWMutexImpl();
/// @}
@@ -40,7 +40,7 @@ namespace sys {
/// @}
/// Releases and removes the lock
- /// @brief Destructor
+ /// Destructor
~RWMutexImpl();
/// @}
@@ -52,24 +52,24 @@ namespace sys {
/// lock is held by a writer, this method will wait until it can acquire
/// the lock.
/// @returns false if any kind of error occurs, true otherwise.
- /// @brief Unconditionally acquire the lock in reader mode.
+ /// Unconditionally acquire the lock in reader mode.
bool reader_acquire();
/// Attempts to release the lock in reader mode.
/// @returns false if any kind of error occurs, true otherwise.
- /// @brief Unconditionally release the lock in reader mode.
+ /// Unconditionally release the lock in reader mode.
bool reader_release();
/// Attempts to unconditionally acquire the lock in writer mode. If the
/// lock is held by any readers, this method will wait until it can
/// acquire the lock.
/// @returns false if any kind of error occurs, true otherwise.
- /// @brief Unconditionally acquire the lock in writer mode.
+ /// Unconditionally acquire the lock in writer mode.
bool writer_acquire();
/// Attempts to release the lock in writer mode.
/// @returns false if any kind of error occurs, true otherwise.
- /// @brief Unconditionally release the lock in write mode.
+ /// Unconditionally release the lock in write mode.
bool writer_release();
//@}
diff --git a/contrib/llvm/include/llvm/Support/Regex.h b/contrib/llvm/include/llvm/Support/Regex.h
index f498835bcb58..d901eb1e3ffb 100644
--- a/contrib/llvm/include/llvm/Support/Regex.h
+++ b/contrib/llvm/include/llvm/Support/Regex.h
@@ -86,11 +86,11 @@ namespace llvm {
std::string sub(StringRef Repl, StringRef String,
std::string *Error = nullptr);
- /// \brief If this function returns true, ^Str$ is an extended regular
+ /// If this function returns true, ^Str$ is an extended regular
/// expression that matches Str and only Str.
static bool isLiteralERE(StringRef Str);
- /// \brief Turn String into a regex by escaping its special characters.
+ /// Turn String into a regex by escaping its special characters.
static std::string escape(StringRef String);
private:
diff --git a/contrib/llvm/include/llvm/Support/SMLoc.h b/contrib/llvm/include/llvm/Support/SMLoc.h
index 5b8be5505540..c74feff378d6 100644
--- a/contrib/llvm/include/llvm/Support/SMLoc.h
+++ b/contrib/llvm/include/llvm/Support/SMLoc.h
@@ -44,8 +44,8 @@ public:
/// Represents a range in source code.
///
/// SMRange is implemented using a half-open range, as is the convention in C++.
-/// In the string "abc", the range (1,3] represents the substring "bc", and the
-/// range (2,2] represents an empty range between the characters "b" and "c".
+/// In the string "abc", the range [1,3) represents the substring "bc", and the
+/// range [2,2) represents an empty range between the characters "b" and "c".
class SMRange {
public:
SMLoc Start, End;
@@ -54,7 +54,7 @@ public:
SMRange(NoneType) {}
SMRange(SMLoc St, SMLoc En) : Start(St), End(En) {
assert(Start.isValid() == End.isValid() &&
- "Start and end should either both be valid or both be invalid!");
+ "Start and End should either both be valid or both be invalid!");
}
bool isValid() const { return Start.isValid(); }
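The corrected comment reflects the usual C++ half-open convention: Start is included, End is not. A tiny sketch constructing such a range over a substring of a buffer (buffer contents illustrative):

    #include "llvm/Support/SMLoc.h"

    llvm::SMRange rangeOfBC(const char *Buffer) {
      // For Buffer == "abc", this covers exactly "bc": [Buffer+1, Buffer+3).
      llvm::SMLoc Start = llvm::SMLoc::getFromPointer(Buffer + 1);
      llvm::SMLoc End = llvm::SMLoc::getFromPointer(Buffer + 3);
      return llvm::SMRange(Start, End);
    }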
diff --git a/contrib/llvm/include/llvm/Support/SaveAndRestore.h b/contrib/llvm/include/llvm/Support/SaveAndRestore.h
index ef154ac9c913..8e11789907ad 100644
--- a/contrib/llvm/include/llvm/Support/SaveAndRestore.h
+++ b/contrib/llvm/include/llvm/Support/SaveAndRestore.h
@@ -32,18 +32,6 @@ private:
T OldValue;
};
-/// Similar to \c SaveAndRestore. Operates only on bools; the old value of a
-/// variable is saved, and during the dstor the old value is or'ed with the new
-/// value.
-struct SaveOr {
- SaveOr(bool &X) : X(X), OldValue(X) { X = false; }
- ~SaveOr() { X |= OldValue; }
-
-private:
- bool &X;
- const bool OldValue;
-};
-
} // namespace llvm
#endif
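With SaveOr gone, the remaining RAII helper is SaveAndRestore, which simply restores the previous value on scope exit; callers that want the old or-accumulating behavior can reapply it by hand. A minimal sketch:

    #include "llvm/Support/SaveAndRestore.h"

    bool GlobalFlag = false;

    void parseNested() {
      // Temporarily force the flag; the destructor restores the old value.
      llvm::SaveAndRestore<bool> Guard(GlobalFlag, true);
      // ... recursive work that observes GlobalFlag == true ...
    } // GlobalFlag is back to its previous value here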
diff --git a/contrib/llvm/include/llvm/Support/ScaledNumber.h b/contrib/llvm/include/llvm/Support/ScaledNumber.h
index cfbdbc751617..3bd3ccedc42c 100644
--- a/contrib/llvm/include/llvm/Support/ScaledNumber.h
+++ b/contrib/llvm/include/llvm/Support/ScaledNumber.h
@@ -33,16 +33,16 @@
namespace llvm {
namespace ScaledNumbers {
-/// \brief Maximum scale; same as APFloat for easy debug printing.
+/// Maximum scale; same as APFloat for easy debug printing.
const int32_t MaxScale = 16383;
-/// \brief Maximum scale; same as APFloat for easy debug printing.
+/// Minimum scale; same as APFloat for easy debug printing.
const int32_t MinScale = -16382;
-/// \brief Get the width of a number.
+/// Get the width of a number.
template <class DigitsT> inline int getWidth() { return sizeof(DigitsT) * 8; }
-/// \brief Conditionally round up a scaled number.
+/// Conditionally round up a scaled number.
///
/// Given \c Digits and \c Scale, round up iff \c ShouldRound is \c true.
/// Always returns \c Scale unless there's an overflow, in which case it
@@ -61,19 +61,19 @@ inline std::pair<DigitsT, int16_t> getRounded(DigitsT Digits, int16_t Scale,
return std::make_pair(Digits, Scale);
}
-/// \brief Convenience helper for 32-bit rounding.
+/// Convenience helper for 32-bit rounding.
inline std::pair<uint32_t, int16_t> getRounded32(uint32_t Digits, int16_t Scale,
bool ShouldRound) {
return getRounded(Digits, Scale, ShouldRound);
}
-/// \brief Convenience helper for 64-bit rounding.
+/// Convenience helper for 64-bit rounding.
inline std::pair<uint64_t, int16_t> getRounded64(uint64_t Digits, int16_t Scale,
bool ShouldRound) {
return getRounded(Digits, Scale, ShouldRound);
}
-/// \brief Adjust a 64-bit scaled number down to the appropriate width.
+/// Adjust a 64-bit scaled number down to the appropriate width.
///
/// \pre Adding 64 to \c Scale will not overflow INT16_MAX.
template <class DigitsT>
@@ -91,24 +91,24 @@ inline std::pair<DigitsT, int16_t> getAdjusted(uint64_t Digits,
Digits & (UINT64_C(1) << (Shift - 1)));
}
-/// \brief Convenience helper for adjusting to 32 bits.
+/// Convenience helper for adjusting to 32 bits.
inline std::pair<uint32_t, int16_t> getAdjusted32(uint64_t Digits,
int16_t Scale = 0) {
return getAdjusted<uint32_t>(Digits, Scale);
}
-/// \brief Convenience helper for adjusting to 64 bits.
+/// Convenience helper for adjusting to 64 bits.
inline std::pair<uint64_t, int16_t> getAdjusted64(uint64_t Digits,
int16_t Scale = 0) {
return getAdjusted<uint64_t>(Digits, Scale);
}
-/// \brief Multiply two 64-bit integers to create a 64-bit scaled number.
+/// Multiply two 64-bit integers to create a 64-bit scaled number.
///
/// Implemented with four 64-bit integer multiplies.
std::pair<uint64_t, int16_t> multiply64(uint64_t LHS, uint64_t RHS);
-/// \brief Multiply two 32-bit integers to create a 32-bit scaled number.
+/// Multiply two 32-bit integers to create a 32-bit scaled number.
///
/// Implemented with one 64-bit integer multiply.
template <class DigitsT>
@@ -121,31 +121,31 @@ inline std::pair<DigitsT, int16_t> getProduct(DigitsT LHS, DigitsT RHS) {
return multiply64(LHS, RHS);
}
-/// \brief Convenience helper for 32-bit product.
+/// Convenience helper for 32-bit product.
inline std::pair<uint32_t, int16_t> getProduct32(uint32_t LHS, uint32_t RHS) {
return getProduct(LHS, RHS);
}
-/// \brief Convenience helper for 64-bit product.
+/// Convenience helper for 64-bit product.
inline std::pair<uint64_t, int16_t> getProduct64(uint64_t LHS, uint64_t RHS) {
return getProduct(LHS, RHS);
}
-/// \brief Divide two 64-bit integers to create a 64-bit scaled number.
+/// Divide two 64-bit integers to create a 64-bit scaled number.
///
/// Implemented with long division.
///
/// \pre \c Dividend and \c Divisor are non-zero.
std::pair<uint64_t, int16_t> divide64(uint64_t Dividend, uint64_t Divisor);
-/// \brief Divide two 32-bit integers to create a 32-bit scaled number.
+/// Divide two 32-bit integers to create a 32-bit scaled number.
///
/// Implemented with one 64-bit integer divide/remainder pair.
///
/// \pre \c Dividend and \c Divisor are non-zero.
std::pair<uint32_t, int16_t> divide32(uint32_t Dividend, uint32_t Divisor);
-/// \brief Divide two 32-bit numbers to create a 32-bit scaled number.
+/// Divide two 32-bit numbers to create a 32-bit scaled number.
///
/// Implemented with one 64-bit integer divide/remainder pair.
///
@@ -167,19 +167,19 @@ std::pair<DigitsT, int16_t> getQuotient(DigitsT Dividend, DigitsT Divisor) {
return divide32(Dividend, Divisor);
}
-/// \brief Convenience helper for 32-bit quotient.
+/// Convenience helper for 32-bit quotient.
inline std::pair<uint32_t, int16_t> getQuotient32(uint32_t Dividend,
uint32_t Divisor) {
return getQuotient(Dividend, Divisor);
}
-/// \brief Convenience helper for 64-bit quotient.
+/// Convenience helper for 64-bit quotient.
inline std::pair<uint64_t, int16_t> getQuotient64(uint64_t Dividend,
uint64_t Divisor) {
return getQuotient(Dividend, Divisor);
}
-/// \brief Implementation of getLg() and friends.
+/// Implementation of getLg() and friends.
///
/// Returns the rounded lg of \c Digits*2^Scale and an int specifying whether
/// this was rounded up (1), down (-1), or exact (0).
@@ -206,7 +206,7 @@ inline std::pair<int32_t, int> getLgImpl(DigitsT Digits, int16_t Scale) {
return std::make_pair(Floor + Round, Round ? 1 : -1);
}
-/// \brief Get the lg (rounded) of a scaled number.
+/// Get the lg (rounded) of a scaled number.
///
/// Get the lg of \c Digits*2^Scale.
///
@@ -215,7 +215,7 @@ template <class DigitsT> int32_t getLg(DigitsT Digits, int16_t Scale) {
return getLgImpl(Digits, Scale).first;
}
-/// \brief Get the lg floor of a scaled number.
+/// Get the lg floor of a scaled number.
///
/// Get the floor of the lg of \c Digits*2^Scale.
///
@@ -225,7 +225,7 @@ template <class DigitsT> int32_t getLgFloor(DigitsT Digits, int16_t Scale) {
return Lg.first - (Lg.second > 0);
}
-/// \brief Get the lg ceiling of a scaled number.
+/// Get the lg ceiling of a scaled number.
///
/// Get the ceiling of the lg of \c Digits*2^Scale.
///
@@ -235,7 +235,7 @@ template <class DigitsT> int32_t getLgCeiling(DigitsT Digits, int16_t Scale) {
return Lg.first + (Lg.second < 0);
}
-/// \brief Implementation for comparing scaled numbers.
+/// Implementation for comparing scaled numbers.
///
/// Compare two 64-bit numbers with different scales. Given that the scale of
/// \c L is higher than that of \c R by \c ScaleDiff, compare them. Return -1,
@@ -244,7 +244,7 @@ template <class DigitsT> int32_t getLgCeiling(DigitsT Digits, int16_t Scale) {
/// \pre 0 <= ScaleDiff < 64.
int compareImpl(uint64_t L, uint64_t R, int ScaleDiff);
-/// \brief Compare two scaled numbers.
+/// Compare two scaled numbers.
///
/// Compare two scaled numbers. Returns 0 for equal, -1 for less than, and 1
/// for greater than.
@@ -271,7 +271,7 @@ int compare(DigitsT LDigits, int16_t LScale, DigitsT RDigits, int16_t RScale) {
return -compareImpl(RDigits, LDigits, LScale - RScale);
}
-/// \brief Match scales of two numbers.
+/// Match scales of two numbers.
///
/// Given two scaled numbers, match up their scales. Change the digits and
/// scales in place. Shift the digits as necessary to form equivalent numbers,
@@ -324,7 +324,7 @@ int16_t matchScales(DigitsT &LDigits, int16_t &LScale, DigitsT &RDigits,
return LScale;
}
-/// \brief Get the sum of two scaled numbers.
+/// Get the sum of two scaled numbers.
///
/// Get the sum of two scaled numbers with as much precision as possible.
///
@@ -352,19 +352,19 @@ std::pair<DigitsT, int16_t> getSum(DigitsT LDigits, int16_t LScale,
return std::make_pair(HighBit | Sum >> 1, Scale + 1);
}
-/// \brief Convenience helper for 32-bit sum.
+/// Convenience helper for 32-bit sum.
inline std::pair<uint32_t, int16_t> getSum32(uint32_t LDigits, int16_t LScale,
uint32_t RDigits, int16_t RScale) {
return getSum(LDigits, LScale, RDigits, RScale);
}
-/// \brief Convenience helper for 64-bit sum.
+/// Convenience helper for 64-bit sum.
inline std::pair<uint64_t, int16_t> getSum64(uint64_t LDigits, int16_t LScale,
uint64_t RDigits, int16_t RScale) {
return getSum(LDigits, LScale, RDigits, RScale);
}
-/// \brief Get the difference of two scaled numbers.
+/// Get the difference of two scaled numbers.
///
/// Get LHS minus RHS with as much precision as possible.
///
@@ -395,7 +395,7 @@ std::pair<DigitsT, int16_t> getDifference(DigitsT LDigits, int16_t LScale,
return std::make_pair(LDigits, LScale);
}
-/// \brief Convenience helper for 32-bit difference.
+/// Convenience helper for 32-bit difference.
inline std::pair<uint32_t, int16_t> getDifference32(uint32_t LDigits,
int16_t LScale,
uint32_t RDigits,
@@ -403,7 +403,7 @@ inline std::pair<uint32_t, int16_t> getDifference32(uint32_t LDigits,
return getDifference(LDigits, LScale, RDigits, RScale);
}
-/// \brief Convenience helper for 64-bit difference.
+/// Convenience helper for 64-bit difference.
inline std::pair<uint64_t, int16_t> getDifference64(uint64_t LDigits,
int16_t LScale,
uint64_t RDigits,
@@ -443,7 +443,7 @@ public:
}
};
-/// \brief Simple representation of a scaled number.
+/// Simple representation of a scaled number.
///
/// ScaledNumber is a number represented by digits and a scale. It uses simple
/// saturation arithmetic and every operation is well-defined for every value.
@@ -534,7 +534,7 @@ public:
int16_t getScale() const { return Scale; }
DigitsType getDigits() const { return Digits; }
- /// \brief Convert to the given integer type.
+ /// Convert to the given integer type.
///
/// Convert to \c IntT using simple saturating arithmetic, truncating if
/// necessary.
@@ -548,17 +548,17 @@ public:
return Digits == DigitsType(1) << -Scale;
}
- /// \brief The log base 2, rounded.
+ /// The log base 2, rounded.
///
/// Get the lg of the scalar. lg 0 is defined to be INT32_MIN.
int32_t lg() const { return ScaledNumbers::getLg(Digits, Scale); }
- /// \brief The log base 2, rounded towards INT32_MIN.
+ /// The log base 2, rounded towards INT32_MIN.
///
/// Get the lg floor. lg 0 is defined to be INT32_MIN.
int32_t lgFloor() const { return ScaledNumbers::getLgFloor(Digits, Scale); }
- /// \brief The log base 2, rounded towards INT32_MAX.
+ /// The log base 2, rounded towards INT32_MAX.
///
/// Get the lg ceiling. lg 0 is defined to be INT32_MIN.
int32_t lgCeiling() const {
@@ -574,7 +574,7 @@ public:
bool operator!() const { return isZero(); }
- /// \brief Convert to a decimal representation in a string.
+ /// Convert to a decimal representation in a string.
///
/// Convert to a string. Uses scientific notation for very large/small
/// numbers. Scientific notation is used roughly for numbers outside of the
@@ -597,7 +597,7 @@ public:
return ScaledNumberBase::toString(Digits, Scale, Width, Precision);
}
- /// \brief Print a decimal representation.
+ /// Print a decimal representation.
///
/// Print a string. See toString for documentation.
raw_ostream &print(raw_ostream &OS,
@@ -634,7 +634,7 @@ private:
void shiftLeft(int32_t Shift);
void shiftRight(int32_t Shift);
- /// \brief Adjust two floats to have matching exponents.
+ /// Adjust two floats to have matching exponents.
///
/// Adjust \c this and \c X to have matching exponents. Returns the new \c X
/// by value. Does nothing if \a isZero() for either.
@@ -647,7 +647,7 @@ private:
}
public:
- /// \brief Scale a large number accurately.
+ /// Scale a large number accurately.
///
/// Scale N (multiply it by this). Uses full precision multiplication, even
/// if Width is smaller than 64, so information is not lost.
@@ -693,7 +693,7 @@ private:
return countLeadingZeros32(Digits) + Width - 32;
}
- /// \brief Adjust a number to width, rounding up if necessary.
+ /// Adjust a number to width, rounding up if necessary.
///
/// Should only be called for \c Shift close to zero.
///
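A scaled number here is simply Digits * 2^Scale, with the helpers keeping the digits inside a 32- or 64-bit budget. A small worked sketch using the free functions declared above (values chosen only to show the renormalization):

    #include "llvm/Support/ScaledNumber.h"
    #include <cstdint>
    #include <utility>

    void demo() {
      namespace SN = llvm::ScaledNumbers;

      // getProduct32 multiplies two digit values and, when the result no longer
      // fits in 32 bits, shifts it down and reports the shift as the scale:
      // 0x10000 * 0x10000 == 2^32 comes back as (0x80000000, 1), i.e. 2^31 * 2^1.
      std::pair<uint32_t, int16_t> P = SN::getProduct32(0x10000u, 0x10000u);

      // compare() works on whole scaled values: 3 * 2^4 == 48 is greater than 5 * 2^0.
      int Cmp = SN::compare<uint32_t>(3, 4, 5, 0); // returns 1
      (void)P;
      (void)Cmp;
    }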
diff --git a/contrib/llvm/include/llvm/Support/ScopedPrinter.h b/contrib/llvm/include/llvm/Support/ScopedPrinter.h
index 1b6651932212..062439b4f7db 100644
--- a/contrib/llvm/include/llvm/Support/ScopedPrinter.h
+++ b/contrib/llvm/include/llvm/Support/ScopedPrinter.h
@@ -80,6 +80,8 @@ public:
void resetIndent() { IndentLevel = 0; }
+ int getIndentLevel() { return IndentLevel; }
+
void setPrefix(StringRef P) { Prefix = P; }
void printIndent() {
@@ -136,7 +138,7 @@ public:
}
}
- std::sort(SetFlags.begin(), SetFlags.end(), &flagName<TFlag>);
+ llvm::sort(SetFlags.begin(), SetFlags.end(), &flagName<TFlag>);
startLine() << Label << " [ (" << hex(Value) << ")\n";
for (const auto &Flag : SetFlags) {
@@ -261,7 +263,11 @@ public:
}
void printString(StringRef Label, const std::string &Value) {
- startLine() << Label << ": " << Value << "\n";
+ printString(Label, StringRef(Value));
+ }
+
+ void printString(StringRef Label, const char* Value) {
+ printString(Label, StringRef(Value));
}
template <typename T>
diff --git a/contrib/llvm/include/llvm/Support/Signals.h b/contrib/llvm/include/llvm/Support/Signals.h
index cbd6f686a778..f25a04969904 100644
--- a/contrib/llvm/include/llvm/Support/Signals.h
+++ b/contrib/llvm/include/llvm/Support/Signals.h
@@ -29,16 +29,16 @@ namespace sys {
/// This function registers signal handlers to ensure that if a signal gets
/// delivered that the named file is removed.
- /// @brief Remove a file if a fatal signal occurs.
+ /// Remove a file if a fatal signal occurs.
bool RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg = nullptr);
/// This function removes a file from the list of files to be removed on
/// signal delivery.
void DontRemoveFileOnSignal(StringRef Filename);
- /// When an error signal (such as SIBABRT or SIGSEGV) is delivered to the
+ /// When an error signal (such as SIGABRT or SIGSEGV) is delivered to the
/// process, print a stack trace and then exit.
- /// \brief Print a stack trace if a fatal signal occurs.
+ /// Print a stack trace if a fatal signal occurs.
/// \param Argv0 the current binary name, used to find the symbolizer
/// relative to the current binary before searching $PATH; can be
/// StringRef(), in which case we will only search $PATH.
@@ -50,16 +50,18 @@ namespace sys {
/// Disable all system dialog boxes that appear when the process crashes.
void DisableSystemDialogsOnCrash();
- /// \brief Print the stack trace using the given \c raw_ostream object.
+ /// Print the stack trace using the given \c raw_ostream object.
void PrintStackTrace(raw_ostream &OS);
// Run all registered signal handlers.
void RunSignalHandlers();
- /// AddSignalHandler - Add a function to be called when an abort/kill signal
- /// is delivered to the process. The handler can have a cookie passed to it
- /// to identify what instance of the handler it is.
- void AddSignalHandler(void (*FnPtr)(void *), void *Cookie);
+ using SignalHandlerCallback = void (*)(void *);
+
+ /// Add a function to be called when an abort/kill signal is delivered to the
+ /// process. The handler can have a cookie passed to it to identify what
+ /// instance of the handler it is.
+ void AddSignalHandler(SignalHandlerCallback FnPtr, void *Cookie);
/// This function registers a function to be called when the user "interrupts"
/// the program (typically by pressing ctrl-c). When the user interrupts the
@@ -69,7 +71,7 @@ namespace sys {
/// functions. An null interrupt function pointer disables the current
/// installed function. Note also that the handler may be executed on a
/// different thread on some platforms.
- /// @brief Register a function to be called when ctrl-c is pressed.
+ /// Register a function to be called when ctrl-c is pressed.
void SetInterruptFunction(void (*IF)());
} // End sys namespace
} // End llvm namespace
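The new SignalHandlerCallback typedef makes the handler signature explicit; registration itself is unchanged. A sketch of installing a handler with a cookie (the handler body is illustrative):

    #include "llvm/Support/Signals.h"
    #include "llvm/Support/raw_ostream.h"

    static void reportCrash(void *Cookie) {
      // Called after an abort/kill signal; Cookie is whatever was registered.
      llvm::errs() << "crashed while processing "
                   << static_cast<const char *>(Cookie) << "\n";
    }

    void installHandler(const char *InputName) {
      llvm::sys::AddSignalHandler(reportCrash, const_cast<char *>(InputName));
    }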
diff --git a/contrib/llvm/include/llvm/ExecutionEngine/ObjectMemoryBuffer.h b/contrib/llvm/include/llvm/Support/SmallVectorMemoryBuffer.h
index 0f00ad006a7d..f43c2fb8f826 100644
--- a/contrib/llvm/include/llvm/ExecutionEngine/ObjectMemoryBuffer.h
+++ b/contrib/llvm/include/llvm/Support/SmallVectorMemoryBuffer.h
@@ -1,4 +1,4 @@
-//===- ObjectMemoryBuffer.h - SmallVector-backed MemoryBuffrer -*- C++ -*-===//
+//===- SmallVectorMemoryBuffer.h --------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,31 +21,31 @@
namespace llvm {
-/// \brief SmallVector-backed MemoryBuffer instance.
+/// SmallVector-backed MemoryBuffer instance.
///
/// This class enables efficient construction of MemoryBuffers from SmallVector
/// instances. This is useful for MCJIT and Orc, where object files are streamed
/// into SmallVectors, then inspected using ObjectFile (which takes a
/// MemoryBuffer).
-class ObjectMemoryBuffer : public MemoryBuffer {
+class SmallVectorMemoryBuffer : public MemoryBuffer {
public:
-
- /// \brief Construct an ObjectMemoryBuffer from the given SmallVector r-value.
+ /// Construct a SmallVectorMemoryBuffer from the given SmallVector
+ /// r-value.
///
/// FIXME: It'd be nice for this to be a non-templated constructor taking a
/// SmallVectorImpl here instead of a templated one taking a SmallVector<N>,
/// but SmallVector's move-construction/assignment currently only take
/// SmallVectors. If/when that is fixed we can simplify this constructor and
/// the following one.
- ObjectMemoryBuffer(SmallVectorImpl<char> &&SV)
- : SV(std::move(SV)), BufferName("<in-memory object>") {
+ SmallVectorMemoryBuffer(SmallVectorImpl<char> &&SV)
+ : SV(std::move(SV)), BufferName("<in-memory object>") {
init(this->SV.begin(), this->SV.end(), false);
}
- /// \brief Construct a named ObjectMemoryBuffer from the given SmallVector
- /// r-value and StringRef.
- ObjectMemoryBuffer(SmallVectorImpl<char> &&SV, StringRef Name)
- : SV(std::move(SV)), BufferName(Name) {
+ /// Construct a named SmallVectorMemoryBuffer from the given
+ /// SmallVector r-value and StringRef.
+ SmallVectorMemoryBuffer(SmallVectorImpl<char> &&SV, StringRef Name)
+ : SV(std::move(SV)), BufferName(Name) {
init(this->SV.begin(), this->SV.end(), false);
}
@@ -56,6 +56,7 @@ public:
private:
SmallVector<char, 0> SV;
std::string BufferName;
+ void anchor() override;
};
} // namespace llvm
diff --git a/contrib/llvm/include/llvm/Support/SourceMgr.h b/contrib/llvm/include/llvm/Support/SourceMgr.h
index c08bf858760a..63ac893239d1 100644
--- a/contrib/llvm/include/llvm/Support/SourceMgr.h
+++ b/contrib/llvm/include/llvm/Support/SourceMgr.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -57,8 +58,38 @@ private:
/// The memory buffer for the file.
std::unique_ptr<MemoryBuffer> Buffer;
+ /// Helper type for OffsetCache below: since we're storing many offsets
+ /// into relatively small files (often smaller than 2^8 or 2^16 bytes),
+ /// we select the offset vector element type dynamically based on the
+ /// size of Buffer.
+ using VariableSizeOffsets = PointerUnion4<std::vector<uint8_t> *,
+ std::vector<uint16_t> *,
+ std::vector<uint32_t> *,
+ std::vector<uint64_t> *>;
+
+ /// Vector of offsets into Buffer at which there are line-endings
+ /// (lazily populated). Once populated, the '\n' that marks the end of
+ /// line number N from [1..] is at Buffer[OffsetCache[N-1]]. Since
+ /// these offsets are in sorted (ascending) order, they can be
+ /// binary-searched for the first one after any given offset (e.g. an
+ /// offset corresponding to a particular SMLoc).
+ mutable VariableSizeOffsets OffsetCache;
+
+ /// Populate \c OffsetCache and look up a given \p Ptr in it, assuming
+ /// it points somewhere into \c Buffer. The static type parameter \p T
+ /// must be an unsigned integer type from uint{8,16,32,64}_t large
+ /// enough to store offsets inside \c Buffer.
+ template<typename T>
+ unsigned getLineNumber(const char *Ptr) const;
+
/// This is the location of the parent include, or null if at the top level.
SMLoc IncludeLoc;
+
+ SrcBuffer() = default;
+ SrcBuffer(SrcBuffer &&);
+ SrcBuffer(const SrcBuffer &) = delete;
+ SrcBuffer &operator=(const SrcBuffer &) = delete;
+ ~SrcBuffer();
};
/// This is all of the buffers that we are reading from.
@@ -67,10 +98,6 @@ private:
// This is the list of directories we should search for include files in.
std::vector<std::string> IncludeDirectories;
- /// This is a cache for line number queries, its implementation is really
- /// private to SourceMgr.cpp.
- mutable void *LineNoCache = nullptr;
-
DiagHandlerTy DiagHandler = nullptr;
void *DiagContext = nullptr;
@@ -80,7 +107,7 @@ public:
SourceMgr() = default;
SourceMgr(const SourceMgr &) = delete;
SourceMgr &operator=(const SourceMgr &) = delete;
- ~SourceMgr();
+ ~SourceMgr() = default;
void setIncludeDirs(const std::vector<std::string> &Dirs) {
IncludeDirectories = Dirs;
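The replacement for the old opaque LineNoCache stores one offset per line ending and selects the narrowest element type that can index the buffer, so small files pay roughly one byte per line. A sketch of the selection idea only, not the SourceMgr implementation itself:

    #include <cstdint>
    #include <limits>

    // Choose the narrowest unsigned type that can hold offsets into a buffer of
    // the given size, mirroring the PointerUnion-of-vectors trick above.
    enum class OffsetWidth { U8, U16, U32, U64 };

    OffsetWidth pickOffsetWidth(uint64_t BufferSize) {
      if (BufferSize <= std::numeric_limits<uint8_t>::max())
        return OffsetWidth::U8;
      if (BufferSize <= std::numeric_limits<uint16_t>::max())
        return OffsetWidth::U16;
      if (BufferSize <= std::numeric_limits<uint32_t>::max())
        return OffsetWidth::U32;
      return OffsetWidth::U64;
    }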
diff --git a/contrib/llvm/include/llvm/Support/StringSaver.h b/contrib/llvm/include/llvm/Support/StringSaver.h
index e85b2895ce51..6b77d487333b 100644
--- a/contrib/llvm/include/llvm/Support/StringSaver.h
+++ b/contrib/llvm/include/llvm/Support/StringSaver.h
@@ -10,23 +10,49 @@
#ifndef LLVM_SUPPORT_STRINGSAVER_H
#define LLVM_SUPPORT_STRINGSAVER_H
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Allocator.h"
namespace llvm {
-/// \brief Saves strings in the inheritor's stable storage and returns a
+/// Saves strings in the provided stable storage and returns a
/// StringRef with a stable character pointer.
class StringSaver final {
BumpPtrAllocator &Alloc;
public:
StringSaver(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
+
+ // All returned strings are null-terminated: *save(S).end() == 0.
StringRef save(const char *S) { return save(StringRef(S)); }
StringRef save(StringRef S);
StringRef save(const Twine &S) { return save(StringRef(S.str())); }
StringRef save(const std::string &S) { return save(StringRef(S)); }
};
+
+/// Saves strings in the provided stable storage and returns a StringRef with a
+/// stable character pointer. Saving the same string yields the same StringRef.
+///
+/// Compared to StringSaver, it does more work but avoids saving the same string
+/// multiple times.
+///
+/// Compared to StringPool, it performs fewer allocations but doesn't support
+/// refcounting/deletion.
+class UniqueStringSaver final {
+ StringSaver Strings;
+ llvm::DenseSet<llvm::StringRef> Unique;
+
+public:
+ UniqueStringSaver(BumpPtrAllocator &Alloc) : Strings(Alloc) {}
+
+ // All returned strings are null-terminated: *save(S).end() == 0.
+ StringRef save(const char *S) { return save(StringRef(S)); }
+ StringRef save(StringRef S);
+ StringRef save(const Twine &S) { return save(StringRef(S.str())); }
+ StringRef save(const std::string &S) { return save(StringRef(S)); }
+};
+
}
#endif
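Both savers copy into a caller-owned BumpPtrAllocator; UniqueStringSaver additionally dedupes, so saving equal text twice yields pointer-identical storage. A brief usage sketch:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Allocator.h"
    #include "llvm/Support/StringSaver.h"
    #include <string>

    void demo() {
      llvm::BumpPtrAllocator Alloc;
      llvm::UniqueStringSaver Saver(Alloc);

      llvm::StringRef A = Saver.save("hello");
      llvm::StringRef B = Saver.save(std::string("hello"));

      // Same storage for equal strings, and the copies are null-terminated.
      bool SameStorage = A.data() == B.data(); // true
      (void)SameStorage;
    }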
diff --git a/contrib/llvm/include/llvm/Support/SystemUtils.h b/contrib/llvm/include/llvm/Support/SystemUtils.h
index 2997b1b0c9cf..bd60793d1554 100644
--- a/contrib/llvm/include/llvm/Support/SystemUtils.h
+++ b/contrib/llvm/include/llvm/Support/SystemUtils.h
@@ -21,7 +21,7 @@ namespace llvm {
/// Determine if the raw_ostream provided is connected to a terminal. If so,
/// generate a warning message to errs() advising against display of bitcode
/// and return true. Otherwise just return false.
-/// @brief Check for output written to a console
+/// Check for output written to a console
bool CheckBitcodeOutputToConsole(
raw_ostream &stream_to_check, ///< The stream to be checked
bool print_warning = true ///< Control whether warnings are printed
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetOpcodes.def b/contrib/llvm/include/llvm/Support/TargetOpcodes.def
index d3e8483798a7..21f5c7e709b8 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetOpcodes.def
+++ b/contrib/llvm/include/llvm/Support/TargetOpcodes.def
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/TargetOpcodes.def - Target Indep Opcodes ---*- C++ -*-===//
+//===-- llvm/Support/TargetOpcodes.def - Target Indep Opcodes ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -77,6 +77,9 @@ HANDLE_TARGET_OPCODE(SUBREG_TO_REG)
/// DBG_VALUE - a mapping of the llvm.dbg.value intrinsic
HANDLE_TARGET_OPCODE(DBG_VALUE)
+/// DBG_LABEL - a mapping of the llvm.dbg.label intrinsic
+HANDLE_TARGET_OPCODE(DBG_LABEL)
+
/// REG_SEQUENCE - This variadic instruction is used to form a register that
/// represents a consecutive sequence of sub-registers. It's used as a
/// register coalescing / allocation aid and must be eliminated before code
@@ -183,10 +186,16 @@ HANDLE_TARGET_OPCODE(PATCHABLE_FUNCTION_EXIT)
/// PATCHABLE_RET which specifically only works for return instructions.
HANDLE_TARGET_OPCODE(PATCHABLE_TAIL_CALL)
-/// Wraps a logging call and its arguments with nop sleds. At runtime, this can be
-/// patched to insert instrumentation instructions.
+/// Wraps a logging call and its arguments with nop sleds. At runtime, this can
+/// be patched to insert instrumentation instructions.
HANDLE_TARGET_OPCODE(PATCHABLE_EVENT_CALL)
+/// Wraps a typed logging call and its argument with nop sleds. At runtime, this
+/// can be patched to insert instrumentation instructions.
+HANDLE_TARGET_OPCODE(PATCHABLE_TYPED_EVENT_CALL)
+
+HANDLE_TARGET_OPCODE(ICALL_BRANCH_FUNNEL)
+
/// The following generic opcodes are not supposed to appear after ISel.
/// This is something we might want to relax, but for now, this is convenient
/// to produce diagnostics.
@@ -259,9 +268,15 @@ HANDLE_TARGET_OPCODE(G_INTTOPTR)
/// COPY is the relevant instruction.
HANDLE_TARGET_OPCODE(G_BITCAST)
-/// Generic load.
+/// Generic load (including anyext load)
HANDLE_TARGET_OPCODE(G_LOAD)
+/// Generic signext load
+HANDLE_TARGET_OPCODE(G_SEXTLOAD)
+
+/// Generic zeroext load
+HANDLE_TARGET_OPCODE(G_ZEXTLOAD)
+
/// Generic store.
HANDLE_TARGET_OPCODE(G_STORE)
@@ -427,6 +442,9 @@ HANDLE_TARGET_OPCODE(G_SITOFP)
/// Generic unsigned-int to float conversion
HANDLE_TARGET_OPCODE(G_UITOFP)
+/// Generic FP absolute value.
+HANDLE_TARGET_OPCODE(G_FABS)
+
/// Generic pointer offset
HANDLE_TARGET_OPCODE(G_GEP)
@@ -449,12 +467,15 @@ HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR)
/// Generic byte swap.
HANDLE_TARGET_OPCODE(G_BSWAP)
+/// Generic AddressSpaceCast.
+HANDLE_TARGET_OPCODE(G_ADDRSPACE_CAST)
+
// TODO: Add more generic opcodes as we move along.
/// Marker for the end of the generic opcode.
/// This is used to check if an opcode is in the range of the
/// generic opcodes.
-HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_BSWAP)
+HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_ADDRSPACE_CAST)
/// BUILTIN_OP_END - This must be the last enum value in this list.
/// The target-specific post-isel opcode values start here.
diff --git a/contrib/llvm/include/llvm/Support/TargetParser.h b/contrib/llvm/include/llvm/Support/TargetParser.h
index 13b7befb8ce4..08ad42dda3eb 100644
--- a/contrib/llvm/include/llvm/Support/TargetParser.h
+++ b/contrib/llvm/include/llvm/Support/TargetParser.h
@@ -86,6 +86,8 @@ enum ArchExtKind : unsigned {
AEK_RAS = 1 << 12,
AEK_SVE = 1 << 13,
AEK_DOTPROD = 1 << 14,
+ AEK_SHA2 = 1 << 15,
+ AEK_AES = 1 << 16,
// Unsupported extensions.
AEK_OS = 0x8000000,
AEK_IWMMXT = 0x10000000,
@@ -137,6 +139,7 @@ unsigned parseFPU(StringRef FPU);
ArchKind parseArch(StringRef Arch);
unsigned parseArchExt(StringRef ArchExt);
ArchKind parseCPUArch(StringRef CPU);
+void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
ISAKind parseArchISA(StringRef Arch);
EndianKind parseArchEndian(StringRef Arch);
ProfileKind parseArchProfile(StringRef Arch);
@@ -170,7 +173,11 @@ enum ArchExtKind : unsigned {
AEK_SVE = 1 << 9,
AEK_DOTPROD = 1 << 10,
AEK_RCPC = 1 << 11,
- AEK_RDM = 1 << 12
+ AEK_RDM = 1 << 12,
+ AEK_SM4 = 1 << 13,
+ AEK_SHA3 = 1 << 14,
+ AEK_SHA2 = 1 << 15,
+ AEK_AES = 1 << 16,
};
StringRef getCanonicalArchName(StringRef Arch);
@@ -199,17 +206,21 @@ unsigned checkArchVersion(StringRef Arch);
unsigned getDefaultFPU(StringRef CPU, ArchKind AK);
unsigned getDefaultExtensions(StringRef CPU, ArchKind AK);
StringRef getDefaultCPU(StringRef Arch);
+AArch64::ArchKind getCPUArchKind(StringRef CPU);
// Parser
unsigned parseFPU(StringRef FPU);
AArch64::ArchKind parseArch(StringRef Arch);
-unsigned parseArchExt(StringRef ArchExt);
+ArchExtKind parseArchExt(StringRef ArchExt);
ArchKind parseCPUArch(StringRef CPU);
+void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
ARM::ISAKind parseArchISA(StringRef Arch);
ARM::EndianKind parseArchEndian(StringRef Arch);
ARM::ProfileKind parseArchProfile(StringRef Arch);
unsigned parseArchVersion(StringRef Arch);
+bool isX18ReservedByDefault(const Triple &TT);
+
} // namespace AArch64
namespace X86 {
diff --git a/contrib/llvm/include/llvm/Support/TargetRegistry.h b/contrib/llvm/include/llvm/Support/TargetRegistry.h
index 8a429ab728ed..1bafc4e687da 100644
--- a/contrib/llvm/include/llvm/Support/TargetRegistry.h
+++ b/contrib/llvm/include/llvm/Support/TargetRegistry.h
@@ -19,7 +19,7 @@
#ifndef LLVM_SUPPORT_TARGETREGISTRY_H
#define LLVM_SUPPORT_TARGETREGISTRY_H
-#include "llvm-c/Disassembler.h"
+#include "llvm-c/DisassemblerTypes.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
@@ -46,6 +46,7 @@ class MCDisassembler;
class MCInstPrinter;
class MCInstrAnalysis;
class MCInstrInfo;
+class MCObjectWriter;
class MCRegisterInfo;
class MCRelocationInfo;
class MCStreamer;
@@ -60,27 +61,44 @@ class TargetMachine;
class TargetOptions;
MCStreamer *createNullStreamer(MCContext &Ctx);
-MCStreamer *createAsmStreamer(MCContext &Ctx,
- std::unique_ptr<formatted_raw_ostream> OS,
- bool isVerboseAsm, bool useDwarfDirectory,
- MCInstPrinter *InstPrint, MCCodeEmitter *CE,
- MCAsmBackend *TAB, bool ShowInst);
+// Takes ownership of \p TAB and \p CE.
+
+/// Create a machine code streamer which will print out assembly for the native
+/// target, suitable for compiling with a native assembler.
+///
+/// \param InstPrint - If given, the instruction printer to use. If not given
+/// the MCInst representation will be printed. This method takes ownership of
+/// InstPrint.
+///
+/// \param CE - If given, a code emitter to use to show the instruction
+/// encoding inline with the assembly. This method takes ownership of \p CE.
+///
+/// \param TAB - If given, a target asm backend to use to show the fixup
+/// information in conjunction with encoding information. This method takes
+/// ownership of \p TAB.
+///
+/// \param ShowInst - Whether to show the MCInst representation inline with
+/// the assembly.
+MCStreamer *
+createAsmStreamer(MCContext &Ctx, std::unique_ptr<formatted_raw_ostream> OS,
+ bool isVerboseAsm, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, std::unique_ptr<MCCodeEmitter> &&CE,
+ std::unique_ptr<MCAsmBackend> &&TAB, bool ShowInst);
-/// Takes ownership of \p TAB and \p CE.
MCStreamer *createELFStreamer(MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll);
MCStreamer *createMachOStreamer(MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll, bool DWARFMustBeAtTheEnd,
bool LabelSections = false);
MCStreamer *createWasmStreamer(MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll);
@@ -143,22 +161,22 @@ public:
using ELFStreamerCtorTy =
MCStreamer *(*)(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
using MachOStreamerCtorTy =
MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
bool DWARFMustBeAtTheEnd);
using COFFStreamerCtorTy =
MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
bool IncrementalLinkerCompatible);
using WasmStreamerCtorTy =
MCStreamer *(*)(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
using NullTargetStreamerCtorTy = MCTargetStreamer *(*)(MCStreamer &S);
using AsmTargetStreamerCtorTy = MCTargetStreamer *(*)(
@@ -441,12 +459,12 @@ public:
/// \param T The target triple.
/// \param Ctx The target context.
/// \param TAB The target assembler backend object. Takes ownership.
- /// \param OS The stream object.
+ /// \param OW The object writer. Takes ownership.
/// \param Emitter The target independent assembler object. Takes ownership.
/// \param RelaxAll Relax all fixups?
MCStreamer *createMCObjectStreamer(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
const MCSubtargetInfo &STI, bool RelaxAll,
bool IncrementalLinkerCompatible,
@@ -457,32 +475,35 @@ public:
llvm_unreachable("Unknown object format");
case Triple::COFF:
assert(T.isOSWindows() && "only Windows COFF is supported");
- S = COFFStreamerCtorFn(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll, IncrementalLinkerCompatible);
+ S = COFFStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll,
+ IncrementalLinkerCompatible);
break;
case Triple::MachO:
if (MachOStreamerCtorFn)
- S = MachOStreamerCtorFn(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll, DWARFMustBeAtTheEnd);
+ S = MachOStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll,
+ DWARFMustBeAtTheEnd);
else
- S = createMachOStreamer(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll, DWARFMustBeAtTheEnd);
+ S = createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll,
+ DWARFMustBeAtTheEnd);
break;
case Triple::ELF:
if (ELFStreamerCtorFn)
- S = ELFStreamerCtorFn(T, Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll);
+ S = ELFStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
else
- S = createELFStreamer(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll);
+ S = createELFStreamer(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
break;
case Triple::Wasm:
if (WasmStreamerCtorFn)
- S = WasmStreamerCtorFn(T, Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll);
+ S = WasmStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
else
- S = createWasmStreamer(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll);
+ S = createWasmStreamer(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
break;
}
if (ObjectTargetStreamerCtorFn)
@@ -493,12 +514,14 @@ public:
MCStreamer *createAsmStreamer(MCContext &Ctx,
std::unique_ptr<formatted_raw_ostream> OS,
bool IsVerboseAsm, bool UseDwarfDirectory,
- MCInstPrinter *InstPrint, MCCodeEmitter *CE,
- MCAsmBackend *TAB, bool ShowInst) const {
+ MCInstPrinter *InstPrint,
+ std::unique_ptr<MCCodeEmitter> &&CE,
+ std::unique_ptr<MCAsmBackend> &&TAB,
+ bool ShowInst) const {
formatted_raw_ostream &OSRef = *OS;
- MCStreamer *S = llvm::createAsmStreamer(Ctx, std::move(OS), IsVerboseAsm,
- UseDwarfDirectory, InstPrint, CE,
- TAB, ShowInst);
+ MCStreamer *S = llvm::createAsmStreamer(
+ Ctx, std::move(OS), IsVerboseAsm, UseDwarfDirectory, InstPrint,
+ std::move(CE), std::move(TAB), ShowInst);
createAsmTargetStreamer(*S, OSRef, InstPrint, IsVerboseAsm);
return S;
}
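For illustration, a caller-side sketch of the new createAsmStreamer ownership model: the code emitter and asm backend now move in as std::unique_ptr rather than raw pointers. CE, TAB, InstPrinter, Ctx, Out and TheTarget are assumed to already exist in the surrounding tool:

  // Previously CE and TAB were passed as raw pointers; now the streamer owns them.
  MCStreamer *S = TheTarget->createAsmStreamer(
      Ctx, std::move(Out), /*IsVerboseAsm=*/true, /*UseDwarfDirectory=*/true,
      InstPrinter, std::unique_ptr<MCCodeEmitter>(CE),
      std::unique_ptr<MCAsmBackend>(TAB), /*ShowInst=*/false);

Object streamers follow the same pattern, taking a std::unique_ptr<MCObjectWriter> in place of the former raw_pwrite_stream argument.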
diff --git a/contrib/llvm/include/llvm/Support/TaskQueue.h b/contrib/llvm/include/llvm/Support/TaskQueue.h
new file mode 100644
index 000000000000..49981adb763d
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/TaskQueue.h
@@ -0,0 +1,139 @@
+//===-- llvm/Support/TaskQueue.h - A TaskQueue implementation ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a crude C++11 based task queue.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_TASK_QUEUE_H
+#define LLVM_SUPPORT_TASK_QUEUE_H
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/thread.h"
+
+#include <atomic>
+#include <cassert>
+#include <condition_variable>
+#include <deque>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <utility>
+
+namespace llvm {
+/// TaskQueue executes serialized work on a user-defined Thread Pool. It
+/// guarantees that if task B is enqueued after task A, task B begins after
+/// task A completes and there is no overlap between the two.
+class TaskQueue {
+ // Because we don't have init capture to use move-only local variables that
+ // are captured into a lambda, we create the promise inside an explicit
+ // callable struct. We want to do as much of the wrapping in the
+ // type-specialized domain (before type erasure) and then erase this into a
+ // std::function.
+ template <typename Callable> struct Task {
+ using ResultTy = typename std::result_of<Callable()>::type;
+ explicit Task(Callable C, TaskQueue &Parent)
+ : C(std::move(C)), P(std::make_shared<std::promise<ResultTy>>()),
+ Parent(&Parent) {}
+
+ template<typename T>
+ void invokeCallbackAndSetPromise(T*) {
+ P->set_value(C());
+ }
+
+ void invokeCallbackAndSetPromise(void*) {
+ C();
+ P->set_value();
+ }
+
+ void operator()() noexcept {
+ ResultTy *Dummy = nullptr;
+ invokeCallbackAndSetPromise(Dummy);
+ Parent->completeTask();
+ }
+
+ Callable C;
+ std::shared_ptr<std::promise<ResultTy>> P;
+ TaskQueue *Parent;
+ };
+
+public:
+ /// Construct a task queue with no work.
+ TaskQueue(ThreadPool &Scheduler) : Scheduler(Scheduler) { (void)Scheduler; }
+
+ /// Blocking destructor: the queue will wait for all work to complete.
+ ~TaskQueue() {
+ Scheduler.wait();
+ assert(Tasks.empty());
+ }
+
+ /// Asynchronous submission of a task to the queue. The returned future can be
+ /// used to wait for the task (and all previous tasks that have not yet
+ /// completed) to finish.
+ template <typename Callable>
+ std::future<typename std::result_of<Callable()>::type> async(Callable &&C) {
+#if !LLVM_ENABLE_THREADS
+ static_assert(false,
+ "TaskQueue requires building with LLVM_ENABLE_THREADS!");
+#endif
+ Task<Callable> T{std::move(C), *this};
+ using ResultTy = typename std::result_of<Callable()>::type;
+ std::future<ResultTy> F = T.P->get_future();
+ {
+ std::lock_guard<std::mutex> Lock(QueueLock);
+ // If there's already a task in flight, just queue this one up. If
+ // there is not a task in flight, bypass the queue and schedule this
+ // task immediately.
+ if (IsTaskInFlight)
+ Tasks.push_back(std::move(T));
+ else {
+ Scheduler.async(std::move(T));
+ IsTaskInFlight = true;
+ }
+ }
+ return std::move(F);
+ }
+
+private:
+ void completeTask() {
+ // We just completed a task. If there are no more tasks in the queue,
+ // update IsTaskInFlight to false and stop doing work. Otherwise
+ // schedule the next task (while not holding the lock).
+ std::function<void()> Continuation;
+ {
+ std::lock_guard<std::mutex> Lock(QueueLock);
+ if (Tasks.empty()) {
+ IsTaskInFlight = false;
+ return;
+ }
+
+ Continuation = std::move(Tasks.front());
+ Tasks.pop_front();
+ }
+ Scheduler.async(std::move(Continuation));
+ }
+
+ /// The thread pool on which to run the work.
+ ThreadPool &Scheduler;
+
+ /// State which indicates whether the queue is currently processing any
+ /// work.
+ bool IsTaskInFlight = false;
+
+ /// Mutex for synchronizing access to the Tasks array.
+ std::mutex QueueLock;
+
+ /// Tasks waiting for execution in the queue.
+ std::deque<std::function<void()>> Tasks;
+};
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_TASK_QUEUE_H
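For illustration, a minimal usage sketch of TaskQueue, assuming an LLVM_ENABLE_THREADS build and the ThreadPool(unsigned) constructor of this revision:

  #include "llvm/Support/TaskQueue.h"
  #include "llvm/Support/ThreadPool.h"
  using namespace llvm;

  void runSerialized() {
    ThreadPool Pool(4);          // shared, possibly concurrent, pool
    TaskQueue Queue(Pool);       // serializes the tasks submitted below
    std::future<int> A = Queue.async([] { return 1; });
    std::future<void> B = Queue.async([] { /* guaranteed to start after A */ });
    B.wait();                    // B done implies A done
  }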
diff --git a/contrib/llvm/include/llvm/Support/ThreadLocal.h b/contrib/llvm/include/llvm/Support/ThreadLocal.h
index 427a67e2a96d..885bd18e8356 100644
--- a/contrib/llvm/include/llvm/Support/ThreadLocal.h
+++ b/contrib/llvm/include/llvm/Support/ThreadLocal.h
@@ -24,7 +24,7 @@ namespace llvm {
// YOU SHOULD NEVER USE THIS DIRECTLY.
class ThreadLocalImpl {
typedef uint64_t ThreadLocalDataTy;
- /// \brief Platform-specific thread local data.
+ /// Platform-specific thread local data.
///
/// This is embedded in the class and we avoid malloc'ing/free'ing it,
/// to make this class more safe for use along with CrashRecoveryContext.
diff --git a/contrib/llvm/include/llvm/Support/ThreadPool.h b/contrib/llvm/include/llvm/Support/ThreadPool.h
index fb8255900510..4fdbd528b212 100644
--- a/contrib/llvm/include/llvm/Support/ThreadPool.h
+++ b/contrib/llvm/include/llvm/Support/ThreadPool.h
@@ -14,6 +14,7 @@
#ifndef LLVM_SUPPORT_THREAD_POOL_H
#define LLVM_SUPPORT_THREAD_POOL_H
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/thread.h"
#include <future>
diff --git a/contrib/llvm/include/llvm/Support/Threading.h b/contrib/llvm/include/llvm/Support/Threading.h
index 6d813bccb93f..e8021f648b0d 100644
--- a/contrib/llvm/include/llvm/Support/Threading.h
+++ b/contrib/llvm/include/llvm/Support/Threading.h
@@ -72,7 +72,7 @@ void llvm_execute_on_thread(void (*UserFn)(void *), void *UserData,
enum InitStatus { Uninitialized = 0, Wait = 1, Done = 2 };
- /// \brief The llvm::once_flag structure
+ /// The llvm::once_flag structure
///
/// This type is modeled after std::once_flag to use with llvm::call_once.
/// This structure must be used as an opaque object. It is a struct to force
@@ -83,7 +83,7 @@ void llvm_execute_on_thread(void (*UserFn)(void *), void *UserData,
#endif
- /// \brief Execute the function specified as a parameter once.
+ /// Execute the function specified as a parameter once.
///
/// Typical usage:
/// \code
@@ -139,17 +139,17 @@ void llvm_execute_on_thread(void (*UserFn)(void *), void *UserData,
/// not available.
unsigned hardware_concurrency();
- /// \brief Return the current thread id, as used in various OS system calls.
+ /// Return the current thread id, as used in various OS system calls.
/// Note that not all platforms guarantee that the value returned will be
/// unique across the entire system, so portable code should not assume
/// this.
uint64_t get_threadid();
- /// \brief Get the maximum length of a thread name on this platform.
+ /// Get the maximum length of a thread name on this platform.
/// A value of 0 means there is no limit.
uint32_t get_max_thread_name_length();
- /// \brief Set the name of the current thread. Setting a thread's name can
+ /// Set the name of the current thread. Setting a thread's name can
/// be helpful for enabling useful diagnostics under a debugger or when
/// logging. The level of support for setting a thread's name varies
/// wildly across operating systems, and we only make a best effort to
@@ -157,7 +157,7 @@ void llvm_execute_on_thread(void (*UserFn)(void *), void *UserData,
/// or failure is returned.
void set_thread_name(const Twine &Name);
- /// \brief Get the name of the current thread. The level of support for
+ /// Get the name of the current thread. The level of support for
/// getting a thread's name varies wildly across operating systems, and it
/// is not even guaranteed that if you can successfully set a thread's name
/// that you can later get it back. This function is intended for diagnostic
diff --git a/contrib/llvm/include/llvm/Support/Timer.h b/contrib/llvm/include/llvm/Support/Timer.h
index 198855ae0377..bfffbc3157b1 100644
--- a/contrib/llvm/include/llvm/Support/Timer.h
+++ b/contrib/llvm/include/llvm/Support/Timer.h
@@ -10,6 +10,7 @@
#ifndef LLVM_SUPPORT_TIMER_H
#define LLVM_SUPPORT_TIMER_H
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
@@ -194,6 +195,10 @@ class TimerGroup {
public:
explicit TimerGroup(StringRef Name, StringRef Description);
+
+ explicit TimerGroup(StringRef Name, StringRef Description,
+ const StringMap<TimeRecord> &Records);
+
~TimerGroup();
void setName(StringRef NewName, StringRef NewDescription) {
@@ -207,6 +212,8 @@ public:
/// This static method prints all timers and clears them all out.
static void printAll(raw_ostream &OS);
+ const char *printJSONValues(raw_ostream &OS, const char *delim);
+
/// Prints all timers as JSON key/value pairs, and clears them all out.
static const char *printAllJSONValues(raw_ostream &OS, const char *delim);
@@ -223,7 +230,6 @@ private:
void PrintQueuedTimers(raw_ostream &OS);
void printJSONValue(raw_ostream &OS, const PrintRecord &R,
const char *suffix, double Value);
- const char *printJSONValues(raw_ostream &OS, const char *delim);
};
} // end namespace llvm
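For illustration, a hedged sketch of the new TimerGroup constructor and the now-public printJSONValues, assuming the caller has already gathered TimeRecord values (for example from another process's report):

  StringMap<TimeRecord> Records;        // filled in elsewhere by the caller
  TimerGroup TG("imported", "Timers imported from a report", Records);
  TG.printJSONValues(outs(), /*delim=*/"");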
diff --git a/contrib/llvm/include/llvm/Support/ToolOutputFile.h b/contrib/llvm/include/llvm/Support/ToolOutputFile.h
index b41ca5a6edaa..cf3bc2fb0171 100644
--- a/contrib/llvm/include/llvm/Support/ToolOutputFile.h
+++ b/contrib/llvm/include/llvm/Support/ToolOutputFile.h
@@ -35,7 +35,7 @@ class ToolOutputFile {
/// The flag which indicates whether we should not delete the file.
bool Keep;
- explicit CleanupInstaller(StringRef ilename);
+ explicit CleanupInstaller(StringRef Filename);
~CleanupInstaller();
} Installer;
@@ -43,7 +43,7 @@ class ToolOutputFile {
raw_fd_ostream OS;
public:
- /// This constructor's arguments are passed to to raw_fd_ostream's
+ /// This constructor's arguments are passed to raw_fd_ostream's
/// constructor.
ToolOutputFile(StringRef Filename, std::error_code &EC,
sys::fs::OpenFlags Flags);
diff --git a/contrib/llvm/include/llvm/Support/TrailingObjects.h b/contrib/llvm/include/llvm/Support/TrailingObjects.h
index cb5a52b0d861..490bd94f4cd5 100644
--- a/contrib/llvm/include/llvm/Support/TrailingObjects.h
+++ b/contrib/llvm/include/llvm/Support/TrailingObjects.h
@@ -89,25 +89,25 @@ protected:
};
/// This helper template works-around MSVC 2013's lack of useful
-/// alignas() support. The argument to LLVM_ALIGNAS(), in MSVC, is
+/// alignas() support. The argument to alignas(), in MSVC, is
/// required to be a literal integer. But, you *can* use template
-/// specialization to select between a bunch of different LLVM_ALIGNAS
+/// specialization to select between a bunch of different alignas()
/// expressions...
template <int Align>
class TrailingObjectsAligner : public TrailingObjectsBase {};
template <>
-class LLVM_ALIGNAS(1) TrailingObjectsAligner<1> : public TrailingObjectsBase {};
+class alignas(1) TrailingObjectsAligner<1> : public TrailingObjectsBase {};
template <>
-class LLVM_ALIGNAS(2) TrailingObjectsAligner<2> : public TrailingObjectsBase {};
+class alignas(2) TrailingObjectsAligner<2> : public TrailingObjectsBase {};
template <>
-class LLVM_ALIGNAS(4) TrailingObjectsAligner<4> : public TrailingObjectsBase {};
+class alignas(4) TrailingObjectsAligner<4> : public TrailingObjectsBase {};
template <>
-class LLVM_ALIGNAS(8) TrailingObjectsAligner<8> : public TrailingObjectsBase {};
+class alignas(8) TrailingObjectsAligner<8> : public TrailingObjectsBase {};
template <>
-class LLVM_ALIGNAS(16) TrailingObjectsAligner<16> : public TrailingObjectsBase {
+class alignas(16) TrailingObjectsAligner<16> : public TrailingObjectsBase {
};
template <>
-class LLVM_ALIGNAS(32) TrailingObjectsAligner<32> : public TrailingObjectsBase {
+class alignas(32) TrailingObjectsAligner<32> : public TrailingObjectsBase {
};
// Just a little helper for transforming a type pack into the same
diff --git a/contrib/llvm/include/llvm/Support/Unicode.h b/contrib/llvm/include/llvm/Support/Unicode.h
index adedb1ed83a6..983acaf03635 100644
--- a/contrib/llvm/include/llvm/Support/Unicode.h
+++ b/contrib/llvm/include/llvm/Support/Unicode.h
@@ -60,6 +60,10 @@ bool isPrintable(int UCS);
/// * 1 for each of the remaining characters.
int columnWidthUTF8(StringRef Text);
+/// Fold the input Unicode character according to the simple Unicode case folding
+/// rules.
+int foldCharSimple(int C);
+
} // namespace unicode
} // namespace sys
} // namespace llvm
diff --git a/contrib/llvm/include/llvm/Support/UnicodeCharRanges.h b/contrib/llvm/include/llvm/Support/UnicodeCharRanges.h
index 4c655833b396..3cf4a6d96602 100644
--- a/contrib/llvm/include/llvm/Support/UnicodeCharRanges.h
+++ b/contrib/llvm/include/llvm/Support/UnicodeCharRanges.h
@@ -23,7 +23,7 @@
namespace llvm {
namespace sys {
-/// \brief Represents a closed range of Unicode code points [Lower, Upper].
+/// Represents a closed range of Unicode code points [Lower, Upper].
struct UnicodeCharRange {
uint32_t Lower;
uint32_t Upper;
@@ -36,14 +36,14 @@ inline bool operator<(UnicodeCharRange Range, uint32_t Value) {
return Range.Upper < Value;
}
-/// \brief Holds a reference to an ordered array of UnicodeCharRange and allows
+/// Holds a reference to an ordered array of UnicodeCharRange and allows
/// to quickly check if a code point is contained in the set represented by this
/// array.
class UnicodeCharSet {
public:
typedef ArrayRef<UnicodeCharRange> CharRanges;
- /// \brief Constructs a UnicodeCharSet instance from an array of
+ /// Constructs a UnicodeCharSet instance from an array of
/// UnicodeCharRanges.
///
/// Array pointed by \p Ranges should have the lifetime at least as long as
@@ -63,31 +63,31 @@ public:
}
#endif
- /// \brief Returns true if the character set contains the Unicode code point
+ /// Returns true if the character set contains the Unicode code point
/// \p C.
bool contains(uint32_t C) const {
return std::binary_search(Ranges.begin(), Ranges.end(), C);
}
private:
- /// \brief Returns true if each of the ranges is a proper closed range
+ /// Returns true if each of the ranges is a proper closed range
/// [min, max], and if the ranges themselves are ordered and non-overlapping.
bool rangesAreValid() const {
uint32_t Prev = 0;
for (CharRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
I != E; ++I) {
if (I != Ranges.begin() && Prev >= I->Lower) {
- DEBUG(dbgs() << "Upper bound 0x");
- DEBUG(dbgs().write_hex(Prev));
- DEBUG(dbgs() << " should be less than succeeding lower bound 0x");
- DEBUG(dbgs().write_hex(I->Lower) << "\n");
+ LLVM_DEBUG(dbgs() << "Upper bound 0x");
+ LLVM_DEBUG(dbgs().write_hex(Prev));
+ LLVM_DEBUG(dbgs() << " should be less than succeeding lower bound 0x");
+ LLVM_DEBUG(dbgs().write_hex(I->Lower) << "\n");
return false;
}
if (I->Upper < I->Lower) {
- DEBUG(dbgs() << "Upper bound 0x");
- DEBUG(dbgs().write_hex(I->Lower));
- DEBUG(dbgs() << " should not be less than lower bound 0x");
- DEBUG(dbgs().write_hex(I->Upper) << "\n");
+ LLVM_DEBUG(dbgs() << "Upper bound 0x");
+ LLVM_DEBUG(dbgs().write_hex(I->Lower));
+ LLVM_DEBUG(dbgs() << " should not be less than lower bound 0x");
+ LLVM_DEBUG(dbgs().write_hex(I->Upper) << "\n");
return false;
}
Prev = I->Upper;
diff --git a/contrib/llvm/include/llvm/Support/UniqueLock.h b/contrib/llvm/include/llvm/Support/UniqueLock.h
index b4675f4b43ae..91dc911036d5 100644
--- a/contrib/llvm/include/llvm/Support/UniqueLock.h
+++ b/contrib/llvm/include/llvm/Support/UniqueLock.h
@@ -24,7 +24,7 @@ namespace llvm {
/// an associated mutex, which is guaranteed to be locked upon creation
/// and unlocked after destruction. unique_lock can also unlock the mutex
/// and re-lock it freely during its lifetime.
- /// @brief Guard a section of code with a mutex.
+ /// Guard a section of code with a mutex.
template<typename MutexT>
class unique_lock {
MutexT *M = nullptr;
diff --git a/contrib/llvm/include/llvm/Support/VersionTuple.h b/contrib/llvm/include/llvm/Support/VersionTuple.h
new file mode 100644
index 000000000000..e85a188e54b4
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/VersionTuple.h
@@ -0,0 +1,154 @@
+//===- VersionTuple.h - Version Number Handling -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Defines the llvm::VersionTuple class, which represents a version in
+/// the form major[.minor[.subminor]].
+///
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_SUPPORT_VERSIONTUPLE_H
+#define LLVM_SUPPORT_VERSIONTUPLE_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <tuple>
+
+namespace llvm {
+
+/// Represents a version number in the form major[.minor[.subminor[.build]]].
+class VersionTuple {
+ unsigned Major : 32;
+
+ unsigned Minor : 31;
+ unsigned HasMinor : 1;
+
+ unsigned Subminor : 31;
+ unsigned HasSubminor : 1;
+
+ unsigned Build : 31;
+ unsigned HasBuild : 1;
+
+public:
+ VersionTuple()
+ : Major(0), Minor(0), HasMinor(false), Subminor(0), HasSubminor(false),
+ Build(0), HasBuild(false) {}
+
+ explicit VersionTuple(unsigned Major)
+ : Major(Major), Minor(0), HasMinor(false), Subminor(0),
+ HasSubminor(false), Build(0), HasBuild(false) {}
+
+ explicit VersionTuple(unsigned Major, unsigned Minor)
+ : Major(Major), Minor(Minor), HasMinor(true), Subminor(0),
+ HasSubminor(false), Build(0), HasBuild(false) {}
+
+ explicit VersionTuple(unsigned Major, unsigned Minor, unsigned Subminor)
+ : Major(Major), Minor(Minor), HasMinor(true), Subminor(Subminor),
+ HasSubminor(true), Build(0), HasBuild(false) {}
+
+ explicit VersionTuple(unsigned Major, unsigned Minor, unsigned Subminor,
+ unsigned Build)
+ : Major(Major), Minor(Minor), HasMinor(true), Subminor(Subminor),
+ HasSubminor(true), Build(Build), HasBuild(true) {}
+
+ /// Determine whether this version information is empty
+ /// (e.g., all version components are zero).
+ bool empty() const {
+ return Major == 0 && Minor == 0 && Subminor == 0 && Build == 0;
+ }
+
+ /// Retrieve the major version number.
+ unsigned getMajor() const { return Major; }
+
+ /// Retrieve the minor version number, if provided.
+ Optional<unsigned> getMinor() const {
+ if (!HasMinor)
+ return None;
+ return Minor;
+ }
+
+ /// Retrieve the subminor version number, if provided.
+ Optional<unsigned> getSubminor() const {
+ if (!HasSubminor)
+ return None;
+ return Subminor;
+ }
+
+ /// Retrieve the build version number, if provided.
+ Optional<unsigned> getBuild() const {
+ if (!HasBuild)
+ return None;
+ return Build;
+ }
+
+ /// Determine if two version numbers are equivalent. If not
+ /// provided, minor and subminor version numbers are considered to be zero.
+ friend bool operator==(const VersionTuple &X, const VersionTuple &Y) {
+ return X.Major == Y.Major && X.Minor == Y.Minor &&
+ X.Subminor == Y.Subminor && X.Build == Y.Build;
+ }
+
+ /// Determine if two version numbers are not equivalent.
+ ///
+ /// If not provided, minor and subminor version numbers are considered to be
+ /// zero.
+ friend bool operator!=(const VersionTuple &X, const VersionTuple &Y) {
+ return !(X == Y);
+ }
+
+ /// Determine whether one version number precedes another.
+ ///
+ /// If not provided, minor and subminor version numbers are considered to be
+ /// zero.
+ friend bool operator<(const VersionTuple &X, const VersionTuple &Y) {
+ return std::tie(X.Major, X.Minor, X.Subminor, X.Build) <
+ std::tie(Y.Major, Y.Minor, Y.Subminor, Y.Build);
+ }
+
+ /// Determine whether one version number follows another.
+ ///
+ /// If not provided, minor and subminor version numbers are considered to be
+ /// zero.
+ friend bool operator>(const VersionTuple &X, const VersionTuple &Y) {
+ return Y < X;
+ }
+
+ /// Determine whether one version number precedes or is
+ /// equivalent to another.
+ ///
+ /// If not provided, minor and subminor version numbers are considered to be
+ /// zero.
+ friend bool operator<=(const VersionTuple &X, const VersionTuple &Y) {
+ return !(Y < X);
+ }
+
+ /// Determine whether one version number follows or is
+ /// equivalent to another.
+ ///
+ /// If not provided, minor and subminor version numbers are considered to be
+ /// zero.
+ friend bool operator>=(const VersionTuple &X, const VersionTuple &Y) {
+ return !(X < Y);
+ }
+
+ /// Retrieve a string representation of the version number.
+ std::string getAsString() const;
+
+ /// Try to parse the given string as a version number.
+ /// \returns \c true if the string does not match the regular expression
+ /// [0-9]+(\.[0-9]+){0,3}
+ bool tryParse(StringRef string);
+};
+
+/// Print a version number.
+raw_ostream &operator<<(raw_ostream &Out, const VersionTuple &V);
+
+} // end namespace llvm
+#endif // LLVM_SUPPORT_VERSIONTUPLE_H
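For illustration, a small sketch of parsing and comparing versions with the new class:

  #include "llvm/Support/VersionTuple.h"
  using namespace llvm;

  bool isAtLeast(StringRef Text, const VersionTuple &Min) {
    VersionTuple V;
    if (V.tryParse(Text))        // true means the string was malformed
      return false;
    return V >= Min;             // e.g. isAtLeast("4.0.1", VersionTuple(4, 0))
  }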
diff --git a/contrib/llvm/include/llvm/Support/Win64EH.h b/contrib/llvm/include/llvm/Support/Win64EH.h
index f6c492794875..928eb906de0c 100644
--- a/contrib/llvm/include/llvm/Support/Win64EH.h
+++ b/contrib/llvm/include/llvm/Support/Win64EH.h
@@ -101,40 +101,40 @@ struct UnwindInfo {
// For more information please see MSDN at:
// http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx
- /// \brief Return pointer to language specific data part of UnwindInfo.
+ /// Return pointer to language specific data part of UnwindInfo.
void *getLanguageSpecificData() {
return reinterpret_cast<void *>(&UnwindCodes[(NumCodes+1) & ~1]);
}
- /// \brief Return pointer to language specific data part of UnwindInfo.
+ /// Return pointer to language specific data part of UnwindInfo.
const void *getLanguageSpecificData() const {
return reinterpret_cast<const void *>(&UnwindCodes[(NumCodes + 1) & ~1]);
}
- /// \brief Return image-relative offset of language-specific exception handler.
+ /// Return image-relative offset of language-specific exception handler.
uint32_t getLanguageSpecificHandlerOffset() const {
return *reinterpret_cast<const support::ulittle32_t *>(
getLanguageSpecificData());
}
- /// \brief Set image-relative offset of language-specific exception handler.
+ /// Set image-relative offset of language-specific exception handler.
void setLanguageSpecificHandlerOffset(uint32_t offset) {
*reinterpret_cast<support::ulittle32_t *>(getLanguageSpecificData()) =
offset;
}
- /// \brief Return pointer to exception-specific data.
+ /// Return pointer to exception-specific data.
void *getExceptionData() {
return reinterpret_cast<void *>(reinterpret_cast<uint32_t *>(
getLanguageSpecificData())+1);
}
- /// \brief Return pointer to chained unwind info.
+ /// Return pointer to chained unwind info.
RuntimeFunction *getChainedFunctionEntry() {
return reinterpret_cast<RuntimeFunction *>(getLanguageSpecificData());
}
- /// \brief Return pointer to chained unwind info.
+ /// Return pointer to chained unwind info.
const RuntimeFunction *getChainedFunctionEntry() const {
return reinterpret_cast<const RuntimeFunction *>(getLanguageSpecificData());
}
diff --git a/contrib/llvm/include/llvm/Support/WithColor.h b/contrib/llvm/include/llvm/Support/WithColor.h
new file mode 100644
index 000000000000..85fc5fa0cf14
--- /dev/null
+++ b/contrib/llvm/include/llvm/Support/WithColor.h
@@ -0,0 +1,67 @@
+//===- WithColor.h ----------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_WITHCOLOR_H
+#define LLVM_SUPPORT_WITHCOLOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+
+extern cl::OptionCategory ColorCategory;
+
+class raw_ostream;
+
+// Symbolic names for various syntax elements.
+enum class HighlightColor {
+ Address,
+ String,
+ Tag,
+ Attribute,
+ Enumerator,
+ Macro,
+ Error,
+ Warning,
+ Note
+};
+
+/// An RAII object that temporarily switches an output stream to a specific
+/// color.
+class WithColor {
+ raw_ostream &OS;
+ /// Determine whether colors should be displayed.
+ bool colorsEnabled(raw_ostream &OS);
+
+public:
+ /// To be used like this: WithColor(OS, HighlightColor::String) << "text";
+ WithColor(raw_ostream &OS, HighlightColor S);
+ ~WithColor();
+
+ raw_ostream &get() { return OS; }
+ operator raw_ostream &() { return OS; }
+
+ /// Convenience method for printing "error: " to stderr.
+ static raw_ostream &error();
+ /// Convenience method for printing "warning: " to stderr.
+ static raw_ostream &warning();
+ /// Convenience method for printing "note: " to stderr.
+ static raw_ostream &note();
+
+ /// Convenience method for printing "error: " to the given stream.
+ static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "");
+ /// Convenience method for printing "warning: " to the given stream.
+ static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "");
+ /// Convenience method for printing "note: " to the given stream.
+ static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "");
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_WITHCOLOR_H
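For illustration, a small sketch of the new RAII color helper:

  #include "llvm/Support/WithColor.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  void reportBadInput(StringRef File, StringRef Detail) {
    WithColor::error() << File << ": " << Detail << "\n";   // "error: " prefix on stderr
    WithColor(outs(), HighlightColor::String) << "\"" << Detail << "\"";
    outs() << "\n";   // the color is reset when the temporary is destroyed
  }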
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index ad1404860fb6..185b357efef5 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/contrib/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -31,6 +31,7 @@ namespace X86Disassembler {
#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes
#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes
#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes
+#define THREEDNOW_MAP_SYM x86Disassembler3DNowOpcodes
#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
#define CONTEXTS_STR "x86DisassemblerContexts"
@@ -41,6 +42,7 @@ namespace X86Disassembler {
#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes"
#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
+#define THREEDNOW_MAP_STR "x86Disassembler3DNowOpcodes"
// Attributes of an instruction that must be known before the opcode can be
// processed correctly. Most of these indicate the presence of particular
@@ -91,6 +93,10 @@ enum attributeBits {
"operands change width") \
ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \
"operands change width") \
+ ENUM_ENTRY(IC_XD_ADSIZE, 3, "requires an ADSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_XS_ADSIZE, 3, "requires an ADSIZE prefix, so " \
+ "operands change width") \
ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\
"change width; overrides IC_OPSIZE") \
ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \
@@ -104,6 +110,8 @@ enum attributeBits {
ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \
ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \
ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_XD_ADSIZE, 3, "Just as meaningful as IC_XD_ADSIZE") \
+ ENUM_ENTRY(IC_64BIT_XS_ADSIZE, 3, "Just as meaningful as IC_XS_ADSIZE") \
ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \
"opcode") \
ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \
@@ -288,7 +296,8 @@ enum OpcodeType {
THREEBYTE_3A = 3,
XOP8_MAP = 4,
XOP9_MAP = 5,
- XOPA_MAP = 6
+ XOPA_MAP = 6,
+ THREEDNOW_MAP = 7
};
// The following structs are used for the hierarchical decode table. After
@@ -443,7 +452,7 @@ enum OperandType {
};
#undef ENUM_ENTRY
-/// \brief The specification for how to extract and interpret one operand.
+/// The specification for how to extract and interpret one operand.
struct OperandSpecifier {
uint8_t encoding;
uint8_t type;
diff --git a/contrib/llvm/include/llvm/Support/X86TargetParser.def b/contrib/llvm/include/llvm/Support/X86TargetParser.def
index 5c8c576b1027..e4af0657a350 100644
--- a/contrib/llvm/include/llvm/Support/X86TargetParser.def
+++ b/contrib/llvm/include/llvm/Support/X86TargetParser.def
@@ -65,6 +65,8 @@ X86_CPU_TYPE ("athlon-xp", AMD_ATHLON_XP)
X86_CPU_TYPE ("k8", AMD_K8)
X86_CPU_TYPE ("k8-sse3", AMD_K8SSE3)
X86_CPU_TYPE ("goldmont", INTEL_GOLDMONT)
+X86_CPU_TYPE ("goldmont-plus", INTEL_GOLDMONT_PLUS)
+X86_CPU_TYPE ("tremont", INTEL_TREMONT)
#undef X86_CPU_TYPE_COMPAT_WITH_ALIAS
#undef X86_CPU_TYPE_COMPAT
#undef X86_CPU_TYPE
diff --git a/contrib/llvm/include/llvm/Support/YAMLParser.h b/contrib/llvm/include/llvm/Support/YAMLParser.h
index c907a99ddb59..5b031a9a4270 100644
--- a/contrib/llvm/include/llvm/Support/YAMLParser.h
+++ b/contrib/llvm/include/llvm/Support/YAMLParser.h
@@ -64,23 +64,26 @@ class Node;
class Scanner;
struct Token;
-/// \brief Dump all the tokens in this stream to OS.
+/// Dump all the tokens in this stream to OS.
/// \returns true if there was an error, false otherwise.
bool dumpTokens(StringRef Input, raw_ostream &);
-/// \brief Scans all tokens in input without outputting anything. This is used
+/// Scans all tokens in input without outputting anything. This is used
/// for benchmarking the tokenizer.
/// \returns true if there was an error, false otherwise.
bool scanTokens(StringRef Input);
-/// \brief Escape \a Input for a double quoted scalar.
-std::string escape(StringRef Input);
+/// Escape \a Input for a double quoted scalar; if \p EscapePrintable
+/// is true, all UTF8 sequences will be escaped; if \p EscapePrintable is
+/// false, UTF8 sequences encoding printable unicode scalars are emitted
+/// verbatim rather than escaped.
+std::string escape(StringRef Input, bool EscapePrintable = true);
-/// \brief This class represents a YAML stream potentially containing multiple
+/// This class represents a YAML stream potentially containing multiple
/// documents.
class Stream {
public:
- /// \brief This keeps a reference to the string referenced by \p Input.
+ /// This keeps a reference to the string referenced by \p Input.
Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
std::error_code *EC = nullptr);
@@ -107,7 +110,7 @@ private:
std::unique_ptr<Document> CurrentDoc;
};
-/// \brief Abstract base class for all Nodes.
+/// Abstract base class for all Nodes.
class Node {
virtual void anchor();
@@ -125,6 +128,11 @@ public:
Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
StringRef Tag);
+ // It's not safe to copy YAML nodes; the document is streamed and the position
+ // is part of the state.
+ Node(const Node &) = delete;
+ void operator=(const Node &) = delete;
+
void *operator new(size_t Size, BumpPtrAllocator &Alloc,
size_t Alignment = 16) noexcept {
return Alloc.Allocate(Size, Alignment);
@@ -137,15 +145,15 @@ public:
void operator delete(void *) noexcept = delete;
- /// \brief Get the value of the anchor attached to this node. If it does not
+ /// Get the value of the anchor attached to this node. If it does not
/// have one, getAnchor().size() will be 0.
StringRef getAnchor() const { return Anchor; }
- /// \brief Get the tag as it was written in the document. This does not
+ /// Get the tag as it was written in the document. This does not
/// perform tag resolution.
StringRef getRawTag() const { return Tag; }
- /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
+ /// Get the verbatim tag for a given Node. This performs tag resolution
/// and substitution.
std::string getVerbatimTag() const;
@@ -173,11 +181,11 @@ protected:
private:
unsigned int TypeID;
StringRef Anchor;
- /// \brief The tag as typed in the document.
+ /// The tag as typed in the document.
StringRef Tag;
};
-/// \brief A null value.
+/// A null value.
///
/// Example:
/// !!null null
@@ -191,7 +199,7 @@ public:
static bool classof(const Node *N) { return N->getType() == NK_Null; }
};
-/// \brief A scalar node is an opaque datum that can be presented as a
+/// A scalar node is an opaque datum that can be presented as a
/// series of zero or more Unicode scalar values.
///
/// Example:
@@ -213,7 +221,7 @@ public:
// utf8).
StringRef getRawValue() const { return Value; }
- /// \brief Gets the value of this node as a StringRef.
+ /// Gets the value of this node as a StringRef.
///
/// \param Storage is used to store the content of the returned StringRef iff
/// it requires any modification from how it appeared in the source.
@@ -232,7 +240,7 @@ private:
SmallVectorImpl<char> &Storage) const;
};
-/// \brief A block scalar node is an opaque datum that can be presented as a
+/// A block scalar node is an opaque datum that can be presented as a
/// series of zero or more Unicode scalar values.
///
/// Example:
@@ -251,7 +259,7 @@ public:
SourceRange = SMRange(Start, End);
}
- /// \brief Gets the value of this node as a StringRef.
+ /// Gets the value of this node as a StringRef.
StringRef getValue() const { return Value; }
static bool classof(const Node *N) {
@@ -262,7 +270,7 @@ private:
StringRef Value;
};
-/// \brief A key and value pair. While not technically a Node under the YAML
+/// A key and value pair. While not technically a Node under the YAML
/// representation graph, it is easier to treat them this way.
///
/// TODO: Consider making this not a child of Node.
@@ -276,14 +284,14 @@ public:
KeyValueNode(std::unique_ptr<Document> &D)
: Node(NK_KeyValue, D, StringRef(), StringRef()) {}
- /// \brief Parse and return the key.
+ /// Parse and return the key.
///
/// This may be called multiple times.
///
/// \returns The key, or nullptr if failed() == true.
Node *getKey();
- /// \brief Parse and return the value.
+ /// Parse and return the value.
///
/// This may be called multiple times.
///
@@ -307,7 +315,7 @@ private:
Node *Value = nullptr;
};
-/// \brief This is an iterator abstraction over YAML collections shared by both
+/// This is an iterator abstraction over YAML collections shared by both
/// sequences and maps.
///
/// BaseT must have a ValueT* member named CurrentEntry and a member function
@@ -387,7 +395,7 @@ template <class CollectionType> void skip(CollectionType &C) {
i->skip();
}
-/// \brief Represents a YAML map created from either a block map for a flow map.
+/// Represents a YAML map created from either a block map or a flow map.
///
/// This parses the YAML stream as increment() is called.
///
@@ -434,7 +442,7 @@ private:
void increment();
};
-/// \brief Represents a YAML sequence created from either a block sequence for a
+/// Represents a YAML sequence created from either a block sequence or a
/// flow sequence.
///
/// This parses the YAML stream as increment() is called.
@@ -490,7 +498,7 @@ private:
Node *CurrentEntry = nullptr;
};
-/// \brief Represents an alias to a Node with an anchor.
+/// Represents an alias to a Node with an anchor.
///
/// Example:
/// *AnchorName
@@ -510,20 +518,20 @@ private:
StringRef Name;
};
-/// \brief A YAML Stream is a sequence of Documents. A document contains a root
+/// A YAML Stream is a sequence of Documents. A document contains a root
/// node.
class Document {
public:
Document(Stream &ParentStream);
- /// \brief Root for parsing a node. Returns a single node.
+ /// Root for parsing a node. Returns a single node.
Node *parseBlockNode();
- /// \brief Finish parsing the current document and return true if there are
+ /// Finish parsing the current document and return true if there are
/// more. Return false otherwise.
bool skip();
- /// \brief Parse and return the root level node.
+ /// Parse and return the root level node.
Node *getRoot() {
if (Root)
return Root;
@@ -536,18 +544,18 @@ private:
friend class Node;
friend class document_iterator;
- /// \brief Stream to read tokens from.
+ /// Stream to read tokens from.
Stream &stream;
- /// \brief Used to allocate nodes to. All are destroyed without calling their
+ /// Used to allocate nodes. All are destroyed without calling their
/// destructor when the document is destroyed.
BumpPtrAllocator NodeAllocator;
- /// \brief The root node. Used to support skipping a partially parsed
+ /// The root node. Used to support skipping a partially parsed
/// document.
Node *Root;
- /// \brief Maps tag prefixes to their expansion.
+ /// Maps tag prefixes to their expansion.
std::map<StringRef, StringRef> TagMap;
Token &peekNext();
@@ -555,20 +563,20 @@ private:
void setError(const Twine &Message, Token &Location) const;
bool failed() const;
- /// \brief Parse %BLAH directives and return true if any were encountered.
+ /// Parse %BLAH directives and return true if any were encountered.
bool parseDirectives();
- /// \brief Parse %YAML
+ /// Parse %YAML
void parseYAMLDirective();
- /// \brief Parse %TAG
+ /// Parse %TAG
void parseTAGDirective();
- /// \brief Consume the next token and error if it is not \a TK.
+ /// Consume the next token and error if it is not \a TK.
bool expectToken(int TK);
};
-/// \brief Iterator abstraction for Documents over a Stream.
+/// Iterator abstraction for Documents over a Stream.
class document_iterator {
public:
document_iterator() = default;
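For illustration, the effect of the new EscapePrintable parameter on yaml::escape (a hedged sketch; the exact treatment of printable UTF-8 follows the comment on escape() above):

  #include "llvm/Support/YAMLParser.h"

  std::string Escaped  = llvm::yaml::escape("naïve");                              // UTF-8 escaped
  std::string Verbatim = llvm::yaml::escape("naïve", /*EscapePrintable=*/false);   // emitted as-is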
diff --git a/contrib/llvm/include/llvm/Support/YAMLTraits.h b/contrib/llvm/include/llvm/Support/YAMLTraits.h
index 674c78a11695..4b8c4e958288 100644
--- a/contrib/llvm/include/llvm/Support/YAMLTraits.h
+++ b/contrib/llvm/include/llvm/Support/YAMLTraits.h
@@ -511,8 +511,6 @@ inline QuotingType needsQuotes(StringRef S) {
return QuotingType::Single;
if (isspace(S.front()) || isspace(S.back()))
return QuotingType::Single;
- if (S.front() == ',')
- return QuotingType::Single;
if (isNull(S))
return QuotingType::Single;
if (isBool(S))
@@ -520,6 +518,13 @@ inline QuotingType needsQuotes(StringRef S) {
if (isNumeric(S))
return QuotingType::Single;
+ // 7.3.3 Plain Style
+ // Plain scalars must not begin with most indicators, as this would cause
+ // ambiguity with other YAML constructs.
+ static constexpr char Indicators[] = R"(-?:\,[]{}#&*!|>'"%@`)";
+ if (S.find_first_of(Indicators) == 0)
+ return QuotingType::Single;
+
QuotingType MaxQuotingNeeded = QuotingType::None;
for (unsigned char C : S) {
// Alphanum is safe.
@@ -535,11 +540,14 @@ inline QuotingType needsQuotes(StringRef S) {
case '.':
case ',':
case ' ':
- // TAB (0x9), LF (0xA), CR (0xD) and NEL (0x85) are allowed.
+ // TAB (0x9) is allowed in unquoted strings.
case 0x9:
+ continue;
+ // LF(0xA) and CR(0xD) may delimit values and so require at least single
+ // quotes.
case 0xA:
case 0xD:
- case 0x85:
+ MaxQuotingNeeded = QuotingType::Single;
continue;
// DEL (0x7F) are excluded from the allowed character range.
case 0x7F:
@@ -1306,7 +1314,7 @@ public:
Output(raw_ostream &, void *Ctxt = nullptr, int WrapColumn = 70);
~Output() override;
- /// \brief Set whether or not to output optional values which are equal
+ /// Set whether or not to output optional values which are equal
/// to the default value. By default, when outputting if you attempt
/// to write a value that is equal to the default, the value gets ignored.
/// Sometimes, it is useful to be able to see these in the resulting YAML
diff --git a/contrib/llvm/include/llvm/Support/raw_ostream.h b/contrib/llvm/include/llvm/Support/raw_ostream.h
index d11f5a837796..b9ea9b5817f2 100644
--- a/contrib/llvm/include/llvm/Support/raw_ostream.h
+++ b/contrib/llvm/include/llvm/Support/raw_ostream.h
@@ -33,7 +33,9 @@ class FormattedBytes;
namespace sys {
namespace fs {
+enum FileAccess : unsigned;
enum OpenFlags : unsigned;
+enum CreationDisposition : unsigned;
} // end namespace fs
} // end namespace sys
@@ -218,7 +220,7 @@ public:
raw_ostream &write_uuid(const uuid_t UUID);
/// Output \p Str, turning '\\', '\t', '\n', '"', and anything that doesn't
- /// satisfy std::isprint into an escape sequence.
+ /// satisfy llvm::isPrint into an escape sequence.
raw_ostream &write_escaped(StringRef Str, bool UseHexEscapes = false);
raw_ostream &write(unsigned char C);
@@ -242,6 +244,9 @@ public:
/// indent - Insert 'NumSpaces' spaces.
raw_ostream &indent(unsigned NumSpaces);
+ /// write_zeros - Insert 'NumZeros' nulls.
+ raw_ostream &write_zeros(unsigned NumZeros);
+
/// Changes the foreground color of text that will be output from this point
/// forward.
/// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
@@ -293,9 +298,6 @@ private:
/// \invariant { Size > 0 }
virtual void write_impl(const char *Ptr, size_t Size) = 0;
- // An out of line virtual method to provide a home for the class vtable.
- virtual void handle();
-
/// Return the current position within the stream, not counting the bytes
/// currently in the buffer.
virtual uint64_t current_pos() const = 0;
@@ -329,6 +331,8 @@ private:
/// Copy data into the buffer. Size must not be greater than the number of
/// unused bytes in the buffer.
void copy_to_buffer(const char *Ptr, size_t Size);
+
+ virtual void anchor();
};
/// An abstract base class for streams implementations that also support a
@@ -336,6 +340,7 @@ private:
/// but needs to patch in a header that needs to know the output size.
class raw_pwrite_stream : public raw_ostream {
virtual void pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) = 0;
+ void anchor() override;
public:
explicit raw_pwrite_stream(bool Unbuffered = false)
@@ -383,6 +388,8 @@ class raw_fd_ostream : public raw_pwrite_stream {
/// Set the flag indicating that an output error has been encountered.
void error_detected(std::error_code EC) { this->EC = EC; }
+ void anchor() override;
+
public:
/// Open the specified file for writing. If an error occurs, information
/// about the error is put into EC, and the stream should be immediately
@@ -392,7 +399,15 @@ public:
/// As a special case, if Filename is "-", then the stream will use
/// STDOUT_FILENO instead of opening a file. This will not close the stdout
/// descriptor.
+ raw_fd_ostream(StringRef Filename, std::error_code &EC);
+ raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::CreationDisposition Disp);
+ raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::FileAccess Access);
+ raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::OpenFlags Flags);
raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::CreationDisposition Disp, sys::fs::FileAccess Access,
sys::fs::OpenFlags Flags);
/// FD is the file descriptor that this writes to. If ShouldClose is true,
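For illustration, a hedged sketch of the new raw_fd_ostream overloads; the CD_*, FA_* and OF_* enumerators are assumed to be the sys::fs spellings introduced alongside this change:

  std::error_code EC;
  raw_fd_ostream Simple("out.txt", EC);                        // default flags
  raw_fd_ostream Fresh("log.txt", EC, sys::fs::CD_CreateNew);  // fail if the file exists
  raw_fd_ostream Full("data.bin", EC, sys::fs::CD_CreateAlways,
                      sys::fs::FA_Write, sys::fs::OF_None);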
diff --git a/contrib/llvm/include/llvm/Support/type_traits.h b/contrib/llvm/include/llvm/Support/type_traits.h
index cc0878358800..55d84f138f07 100644
--- a/contrib/llvm/include/llvm/Support/type_traits.h
+++ b/contrib/llvm/include/llvm/Support/type_traits.h
@@ -54,7 +54,7 @@ struct isPodLike<std::pair<T, U>> {
static const bool value = isPodLike<T>::value && isPodLike<U>::value;
};
-/// \brief Metafunction that determines whether the given type is either an
+/// Metafunction that determines whether the given type is either an
/// integral type or an enumeration type, including enum classes.
///
/// Note that this accepts potentially more integral types than is_integral
@@ -73,7 +73,7 @@ public:
std::is_convertible<UnderlyingT, unsigned long long>::value);
};
-/// \brief If T is a pointer, just return it. If it is not, return T&.
+/// If T is a pointer, just return it. If it is not, return T&.
template<typename T, typename Enable = void>
struct add_lvalue_reference_if_not_pointer { using type = T &; };
@@ -83,7 +83,7 @@ struct add_lvalue_reference_if_not_pointer<
using type = T;
};
-/// \brief If T is a pointer to X, return a pointer to const X. If it is not,
+/// If T is a pointer to X, return a pointer to const X. If it is not,
/// return const T.
template<typename T, typename Enable = void>
struct add_const_past_pointer { using type = const T; };
@@ -104,12 +104,51 @@ struct const_pointer_or_const_ref<
using type = typename add_const_past_pointer<T>::type;
};
+namespace detail {
+/// Internal utility to detect trivial copy construction.
+template<typename T> union copy_construction_triviality_helper {
+ T t;
+ copy_construction_triviality_helper() = default;
+ copy_construction_triviality_helper(const copy_construction_triviality_helper&) = default;
+ ~copy_construction_triviality_helper() = default;
+};
+/// Internal utility to detect trivial move construction.
+template<typename T> union move_construction_triviality_helper {
+ T t;
+ move_construction_triviality_helper() = default;
+ move_construction_triviality_helper(move_construction_triviality_helper&&) = default;
+ ~move_construction_triviality_helper() = default;
+};
+} // end namespace detail
+
+/// An implementation of `std::is_trivially_copy_constructible` since we have
+/// users with STLs that don't yet include it.
+template <typename T>
+struct is_trivially_copy_constructible
+ : std::is_copy_constructible<
+ ::llvm::detail::copy_construction_triviality_helper<T>> {};
+template <typename T>
+struct is_trivially_copy_constructible<T &> : std::true_type {};
+template <typename T>
+struct is_trivially_copy_constructible<T &&> : std::false_type {};
+
+/// An implementation of `std::is_trivially_move_constructible` since we have
+/// users with STLs that don't yet include it.
+template <typename T>
+struct is_trivially_move_constructible
+ : std::is_move_constructible<
+ ::llvm::detail::move_construction_triviality_helper<T>> {};
+template <typename T>
+struct is_trivially_move_constructible<T &> : std::true_type {};
+template <typename T>
+struct is_trivially_move_constructible<T &&> : std::true_type {};
+
} // end namespace llvm
// If the compiler supports detecting whether a class is final, define
// an LLVM_IS_FINAL macro. If it cannot be defined properly, this
// macro will be left undefined.
-#if __cplusplus >= 201402L
+#if __cplusplus >= 201402L || defined(_MSC_VER)
#define LLVM_IS_FINAL(Ty) std::is_final<Ty>()
#elif __has_feature(is_final) || LLVM_GNUC_PREREQ(4, 7, 0)
#define LLVM_IS_FINAL(Ty) __is_final(Ty)
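For illustration, the new traits can back the usual bytewise-copy optimizations on toolchains whose standard library lacks the std versions:

  #include "llvm/Support/type_traits.h"

  struct Packet { int Id; char Data[8]; };
  static_assert(llvm::is_trivially_copy_constructible<Packet>::value,
                "Packet may be copied bytewise when a container grows");
  static_assert(llvm::is_trivially_move_constructible<int &>::value,
                "references are trivially 'moved' by copying the pointer");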
diff --git a/contrib/llvm/include/llvm/TableGen/Record.h b/contrib/llvm/include/llvm/TableGen/Record.h
index 55b4dfe2fa2f..e022bc82b4e4 100644
--- a/contrib/llvm/include/llvm/TableGen/Record.h
+++ b/contrib/llvm/include/llvm/TableGen/Record.h
@@ -16,6 +16,8 @@
#define LLVM_TABLEGEN_RECORD_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallVector.h"
@@ -42,7 +44,9 @@ struct MultiClass;
class Record;
class RecordKeeper;
class RecordVal;
+class Resolver;
class StringInit;
+class TypedInit;
//===----------------------------------------------------------------------===//
// Type Classes
@@ -50,7 +54,7 @@ class StringInit;
class RecTy {
public:
- /// \brief Subclass discriminator (for dyn_cast<> et al.)
+ /// Subclass discriminator (for dyn_cast<> et al.)
enum RecTyKind {
BitRecTyKind,
BitsRecTyKind,
@@ -80,6 +84,10 @@ public:
/// type.
virtual bool typeIsConvertibleTo(const RecTy *RHS) const;
+ /// Return true if 'this' type is equal to or a subtype of RHS. For example,
+ /// a bit set is not an int, but they are convertible.
+ virtual bool typeIsA(const RecTy *RHS) const;
+
/// Returns the type representing list<this>.
ListRecTy *getListTy();
};
@@ -125,6 +133,8 @@ public:
std::string getAsString() const override;
bool typeIsConvertibleTo(const RecTy *RHS) const override;
+
+ bool typeIsA(const RecTy *RHS) const override;
};
/// 'code' - Represent a code fragment
@@ -141,6 +151,8 @@ public:
static CodeRecTy *get() { return &Shared; }
std::string getAsString() const override { return "code"; }
+
+ bool typeIsConvertibleTo(const RecTy *RHS) const override;
};
/// 'int' - Represent an integer value of no particular size
@@ -169,13 +181,14 @@ class StringRecTy : public RecTy {
public:
static bool classof(const RecTy *RT) {
- return RT->getRecTyKind() == StringRecTyKind ||
- RT->getRecTyKind() == CodeRecTyKind;
+ return RT->getRecTyKind() == StringRecTyKind;
}
static StringRecTy *get() { return &Shared; }
std::string getAsString() const override;
+
+ bool typeIsConvertibleTo(const RecTy *RHS) const override;
};
/// 'list<Ty>' - Represent a list of values, all of which must be of
@@ -198,6 +211,8 @@ public:
std::string getAsString() const override;
bool typeIsConvertibleTo(const RecTy *RHS) const override;
+
+ bool typeIsA(const RecTy *RHS) const override;
};
/// 'dag' - Represent a dag fragment
@@ -216,27 +231,50 @@ public:
std::string getAsString() const override;
};
-/// '[classname]' - Represent an instance of a class, such as:
-/// (R32 X = EAX).
-class RecordRecTy : public RecTy {
+/// '[classname]' - Type of record values that have zero or more superclasses.
+///
+/// The list of superclasses is non-redundant, i.e. only contains classes that
+/// are not the superclass of some other listed class.
+class RecordRecTy final : public RecTy, public FoldingSetNode,
+ public TrailingObjects<RecordRecTy, Record *> {
friend class Record;
- Record *Rec;
+ unsigned NumClasses;
- explicit RecordRecTy(Record *R) : RecTy(RecordRecTyKind), Rec(R) {}
+ explicit RecordRecTy(unsigned Num)
+ : RecTy(RecordRecTyKind), NumClasses(Num) {}
public:
+ RecordRecTy(const RecordRecTy &) = delete;
+ RecordRecTy &operator=(const RecordRecTy &) = delete;
+
+ // Do not use sized deallocation due to trailing objects.
+ void operator delete(void *p) { ::operator delete(p); }
+
static bool classof(const RecTy *RT) {
return RT->getRecTyKind() == RecordRecTyKind;
}
- static RecordRecTy *get(Record *R);
+ /// Get the record type with the given non-redundant list of superclasses.
+ static RecordRecTy *get(ArrayRef<Record *> Classes);
+
+ void Profile(FoldingSetNodeID &ID) const;
+
+ ArrayRef<Record *> getClasses() const {
+ return makeArrayRef(getTrailingObjects<Record *>(), NumClasses);
+ }
+
+ using const_record_iterator = Record * const *;
- Record *getRecord() const { return Rec; }
+ const_record_iterator classes_begin() const { return getClasses().begin(); }
+ const_record_iterator classes_end() const { return getClasses().end(); }
std::string getAsString() const override;
+ bool isSubClassOf(Record *Class) const;
bool typeIsConvertibleTo(const RecTy *RHS) const override;
+
+ bool typeIsA(const RecTy *RHS) const override;
};
/// Find a common type that T1 and T2 convert to.
@@ -249,7 +287,7 @@ RecTy *resolveTypes(RecTy *T1, RecTy *T2);
class Init {
protected:
- /// \brief Discriminator enum (for isa<>, dyn_cast<>, et al.)
+ /// Discriminator enum (for isa<>, dyn_cast<>, et al.)
///
/// This enum is laid out by a preorder traversal of the inheritance
/// hierarchy, and does not contain an entry for abstract classes, as per
@@ -263,8 +301,9 @@ protected:
/// and IK_LastXXXInit be their own values, but that would degrade
/// readability for really no benefit.
enum InitKind : uint8_t {
- IK_BitInit,
+ IK_First, // unused; silence a spurious warning
IK_FirstTypedInit,
+ IK_BitInit,
IK_BitsInit,
IK_CodeInit,
IK_DagInit,
@@ -277,12 +316,15 @@ protected:
IK_TernOpInit,
IK_UnOpInit,
IK_LastOpInit,
+ IK_FoldOpInit,
+ IK_IsAOpInit,
IK_StringInit,
IK_VarInit,
IK_VarListElementInit,
+ IK_VarBitInit,
+ IK_VarDefInit,
IK_LastTypedInit,
- IK_UnsetInit,
- IK_VarBitInit
+ IK_UnsetInit
};
private:
@@ -309,6 +351,10 @@ public:
/// not be completely specified yet.
virtual bool isComplete() const { return true; }
+ /// Is this a concrete and fully resolved value without any references or
+ /// stuck operations? Unset values are concrete.
+ virtual bool isConcrete() const { return false; }
+
/// Print out this value.
void print(raw_ostream &OS) const { OS << getAsString(); }
@@ -324,8 +370,14 @@ public:
/// invokes print on stderr.
void dump() const;
- /// This virtual function converts to the appropriate
- /// Init based on the passed in type.
+ /// If this initializer is convertible to Ty, return an initializer whose
+ /// type is-a Ty, generating a !cast operation if required. Otherwise, return
+ /// nullptr.
+ virtual Init *getCastTo(RecTy *Ty) const = 0;
+
+ /// Convert to an initializer whose type is-a Ty, or return nullptr if this
+ /// is not possible (this can happen if the initializer's type is convertible
+ /// to Ty, but there are unresolved references).
virtual Init *convertInitializerTo(RecTy *Ty) const = 0;
/// This method is used to implement the bitrange
@@ -351,33 +403,17 @@ public:
return nullptr;
}
- /// This method complements getFieldType to return the
- /// initializer for the specified field. If getFieldType returns non-null
- /// this method should return non-null, otherwise it returns null.
- virtual Init *getFieldInit(Record &R, const RecordVal *RV,
- StringInit *FieldName) const {
- return nullptr;
- }
-
/// This method is used by classes that refer to other
/// variables which may not be defined at the time the expression is formed.
/// If a value is set for the variable later, this method will be called on
/// users of the value to allow the value to propagate out.
- virtual Init *resolveReferences(Record &R, const RecordVal *RV) const {
+ virtual Init *resolveReferences(Resolver &R) const {
return const_cast<Init *>(this);
}
/// This method is used to return the initializer for the specified
/// bit.
virtual Init *getBit(unsigned Bit) const = 0;
-
- /// This method is used to retrieve the initializer for bit
- /// reference. For non-VarBitInit, it simply returns itself.
- virtual Init *getBitVar() const { return const_cast<Init*>(this); }
-
- /// This method is used to retrieve the bit number of a bit
- /// reference. For non-VarBitInit, it simply returns 0.
- virtual unsigned getBitNum() const { return 0; }
};
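// Minimal usage sketch (the helper is illustrative, not part of this header):
// getCastTo may wrap the value in a !cast so that the result's type is-a
// 'int', whereas convertInitializerTo returns nullptr when unresolved
// references prevent a direct conversion.
inline Init *asIntInit(Init *V) {
  if (Init *Direct = V->convertInitializerTo(IntRecTy::get()))
    return Direct;                        // converted without needing a cast
  return V->getCastTo(IntRecTy::get());   // may produce a !cast, or nullptr
}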
inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) {
@@ -404,6 +440,7 @@ public:
RecTy *getType() const { return Ty; }
+ Init *getCastTo(RecTy *Ty) const override;
Init *convertInitializerTo(RecTy *Ty) const override;
Init *convertInitializerBitRange(ArrayRef<unsigned> Bits) const override;
@@ -414,12 +451,6 @@ public:
/// they are of record type.
///
RecTy *getFieldType(StringInit *FieldName) const override;
-
- /// This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- virtual Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const = 0;
};
/// '?' - Represents an uninitialized value
@@ -436,6 +467,7 @@ public:
static UnsetInit *get();
+ Init *getCastTo(RecTy *Ty) const override;
Init *convertInitializerTo(RecTy *Ty) const override;
Init *getBit(unsigned Bit) const override {
@@ -443,14 +475,15 @@ public:
}
bool isComplete() const override { return false; }
+ bool isConcrete() const override { return true; }
std::string getAsString() const override { return "?"; }
};
/// 'true'/'false' - Represent a concrete initializer for a bit.
-class BitInit : public Init {
+class BitInit final : public TypedInit {
bool Value;
- explicit BitInit(bool V) : Init(IK_BitInit), Value(V) {}
+ explicit BitInit(bool V) : TypedInit(IK_BitInit, BitRecTy::get()), Value(V) {}
public:
BitInit(const BitInit &) = delete;
@@ -471,6 +504,7 @@ public:
return const_cast<BitInit*>(this);
}
+ bool isConcrete() const override { return true; }
std::string getAsString() const override { return Value ? "1" : "0"; }
};
@@ -515,17 +549,10 @@ public:
return true;
}
+ bool isConcrete() const override;
std::string getAsString() const override;
- /// This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override {
- llvm_unreachable("Illegal element reference off bits<n>");
- }
-
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
Init *getBit(unsigned Bit) const override {
assert(Bit < NumBits && "Bit index out of range!");
@@ -555,16 +582,9 @@ public:
Init *convertInitializerTo(RecTy *Ty) const override;
Init *convertInitializerBitRange(ArrayRef<unsigned> Bits) const override;
+ bool isConcrete() const override { return true; }
std::string getAsString() const override;
- /// This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override {
- llvm_unreachable("Illegal element reference off int");
- }
-
Init *getBit(unsigned Bit) const override {
return BitInit::get((Value & (1ULL << Bit)) != 0);
}
@@ -591,18 +611,11 @@ public:
Init *convertInitializerTo(RecTy *Ty) const override;
+ bool isConcrete() const override { return true; }
std::string getAsString() const override { return "\"" + Value.str() + "\""; }
std::string getAsUnquotedString() const override { return Value; }
- /// resolveListElementReference - This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override {
- llvm_unreachable("Illegal element reference off string");
- }
-
Init *getBit(unsigned Bit) const override {
llvm_unreachable("Illegal bit reference off string");
}
@@ -629,20 +642,13 @@ public:
Init *convertInitializerTo(RecTy *Ty) const override;
+ bool isConcrete() const override { return true; }
std::string getAsString() const override {
return "[{" + Value.str() + "}]";
}
std::string getAsUnquotedString() const override { return Value; }
- /// This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override {
- llvm_unreachable("Illegal element reference off string");
- }
-
Init *getBit(unsigned Bit) const override {
llvm_unreachable("Illegal bit reference off string");
}
@@ -679,6 +685,9 @@ public:
assert(i < NumValues && "List element index out of range!");
return getTrailingObjects<Init *>()[i];
}
+ RecTy *getElementType() const {
+ return cast<ListRecTy>(getType())->getElementType();
+ }
Record *getElementAsRecord(unsigned i) const;
@@ -691,8 +700,9 @@ public:
/// If a value is set for the variable later, this method will be called on
/// users of the value to allow the value to propagate out.
///
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
+ bool isConcrete() const override;
std::string getAsString() const override;
ArrayRef<Init*> getValues() const {
@@ -705,12 +715,6 @@ public:
size_t size () const { return NumValues; }
bool empty() const { return NumValues == 0; }
- /// This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override;
-
Init *getBit(unsigned Bit) const override {
llvm_unreachable("Illegal bit reference off list");
}
@@ -738,13 +742,6 @@ public:
virtual unsigned getNumOperands() const = 0;
virtual Init *getOperand(unsigned i) const = 0;
- // Fold - If possible, fold this to a simpler init. Return this if not
- // possible to fold.
- virtual Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const = 0;
-
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override;
-
Init *getBit(unsigned Bit) const override;
};
@@ -752,7 +749,7 @@ public:
///
class UnOpInit : public OpInit, public FoldingSetNode {
public:
- enum UnaryOp : uint8_t { CAST, HEAD, TAIL, EMPTY };
+ enum UnaryOp : uint8_t { CAST, HEAD, TAIL, SIZE, EMPTY };
private:
Init *LHS;
@@ -791,9 +788,9 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const override;
+ Init *Fold(Record *CurRec, bool IsFinal = false) const;
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
std::string getAsString() const override;
};
@@ -802,7 +799,7 @@ public:
class BinOpInit : public OpInit, public FoldingSetNode {
public:
enum BinaryOp : uint8_t { ADD, AND, OR, SHL, SRA, SRL, LISTCONCAT,
- STRCONCAT, CONCAT, EQ };
+ STRCONCAT, CONCAT, EQ, NE, LE, LT, GE, GT };
private:
Init *LHS, *RHS;
@@ -820,6 +817,7 @@ public:
static BinOpInit *get(BinaryOp opc, Init *lhs, Init *rhs,
RecTy *Type);
+ static Init *getStrConcat(Init *lhs, Init *rhs);
void Profile(FoldingSetNodeID &ID) const;
@@ -845,9 +843,9 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const override;
+ Init *Fold(Record *CurRec) const;
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
std::string getAsString() const override;
};
@@ -855,7 +853,7 @@ public:
/// !op (X, Y, Z) - Combine two inits.
class TernOpInit : public OpInit, public FoldingSetNode {
public:
- enum TernaryOp : uint8_t { SUBST, FOREACH, IF };
+ enum TernaryOp : uint8_t { SUBST, FOREACH, IF, DAG };
private:
Init *LHS, *MHS, *RHS;
@@ -903,11 +901,83 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const override;
+ Init *Fold(Record *CurRec) const;
+
+ bool isComplete() const override {
+ return LHS->isComplete() && MHS->isComplete() && RHS->isComplete();
+ }
+
+ Init *resolveReferences(Resolver &R) const override;
+
+ std::string getAsString() const override;
+};
+
+/// !foldl (start, lst, a, b, expr) - Fold over a list.
+class FoldOpInit : public TypedInit, public FoldingSetNode {
+private:
+ Init *Start;
+ Init *List;
+ Init *A;
+ Init *B;
+ Init *Expr;
+
+ FoldOpInit(Init *Start, Init *List, Init *A, Init *B, Init *Expr, RecTy *Type)
+ : TypedInit(IK_FoldOpInit, Type), Start(Start), List(List), A(A), B(B),
+ Expr(Expr) {}
+
+public:
+ FoldOpInit(const FoldOpInit &) = delete;
+ FoldOpInit &operator=(const FoldOpInit &) = delete;
+
+ static bool classof(const Init *I) { return I->getKind() == IK_FoldOpInit; }
+
+ static FoldOpInit *get(Init *Start, Init *List, Init *A, Init *B, Init *Expr,
+ RecTy *Type);
+
+ void Profile(FoldingSetNodeID &ID) const;
+
+ // Fold - If possible, fold this to a simpler init. Return this if not
+ // possible to fold.
+ Init *Fold(Record *CurRec) const;
+
+ bool isComplete() const override { return false; }
+
+ Init *resolveReferences(Resolver &R) const override;
+
+ Init *getBit(unsigned Bit) const override;
+
+ std::string getAsString() const override;
+};
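// For reference, the corresponding TableGen surface syntax is (illustrative):
//   int Sum = !foldl(0, Ints, total, x, !add(total, x));
// where 'Ints' is a list<int> value and 'total'/'x' name the accumulator and
// the current list element inside the folded expression.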
+
+/// !isa<type>(expr) - Dynamically test whether the value of an expression is
+/// of the given type.
+class IsAOpInit : public TypedInit, public FoldingSetNode {
+private:
+ RecTy *CheckType;
+ Init *Expr;
+
+ IsAOpInit(RecTy *CheckType, Init *Expr)
+ : TypedInit(IK_IsAOpInit, IntRecTy::get()), CheckType(CheckType),
+ Expr(Expr) {}
+
+public:
+ IsAOpInit(const IsAOpInit &) = delete;
+ IsAOpInit &operator=(const IsAOpInit &) = delete;
+
+ static bool classof(const Init *I) { return I->getKind() == IK_IsAOpInit; }
+
+ static IsAOpInit *get(RecTy *CheckType, Init *Expr);
+
+ void Profile(FoldingSetNodeID &ID) const;
+
+ // Fold - If possible, fold this to a simpler init. Return this if not
+ // possible to fold.
+ Init *Fold() const;
bool isComplete() const override { return false; }
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
+
+ Init *getBit(unsigned Bit) const override;
std::string getAsString() const override;
};
@@ -937,19 +1007,12 @@ public:
return getNameInit()->getAsUnquotedString();
}
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override;
-
- RecTy *getFieldType(StringInit *FieldName) const override;
- Init *getFieldInit(Record &R, const RecordVal *RV,
- StringInit *FieldName) const override;
-
/// This method is used by classes that refer to other
/// variables which may not be defined at the time the expression is formed.
/// If a value is set for the variable later, this method will be called on
/// users of the value to allow the value to propagate out.
///
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
Init *getBit(unsigned Bit) const override;
@@ -957,11 +1020,12 @@ public:
};
/// Opcode{0} - Represent access to one bit of a variable or field.
-class VarBitInit : public Init {
+class VarBitInit final : public TypedInit {
TypedInit *TI;
unsigned Bit;
- VarBitInit(TypedInit *T, unsigned B) : Init(IK_VarBitInit), TI(T), Bit(B) {
+ VarBitInit(TypedInit *T, unsigned B)
+ : TypedInit(IK_VarBitInit, BitRecTy::get()), TI(T), Bit(B) {
assert(T->getType() &&
(isa<IntRecTy>(T->getType()) ||
(isa<BitsRecTy>(T->getType()) &&
@@ -979,13 +1043,11 @@ public:
static VarBitInit *get(TypedInit *T, unsigned B);
- Init *convertInitializerTo(RecTy *Ty) const override;
-
- Init *getBitVar() const override { return TI; }
- unsigned getBitNum() const override { return Bit; }
+ Init *getBitVar() const { return TI; }
+ unsigned getBitNum() const { return Bit; }
std::string getAsString() const override;
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
Init *getBit(unsigned B) const override {
assert(B < 1 && "Bit index out of range!");
@@ -1020,14 +1082,8 @@ public:
TypedInit *getVariable() const { return TI; }
unsigned getElementNum() const { return Element; }
- /// This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override;
-
std::string getAsString() const override;
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
Init *getBit(unsigned Bit) const override;
};
@@ -1038,7 +1094,7 @@ class DefInit : public TypedInit {
Record *Def;
- DefInit(Record *D, RecordRecTy *T) : TypedInit(IK_DefInit, T), Def(D) {}
+ explicit DefInit(Record *D);
public:
DefInit(const DefInit &) = delete;
@@ -1057,21 +1113,64 @@ public:
//virtual Init *convertInitializerBitRange(ArrayRef<unsigned> Bits);
RecTy *getFieldType(StringInit *FieldName) const override;
- Init *getFieldInit(Record &R, const RecordVal *RV,
- StringInit *FieldName) const override;
+ bool isConcrete() const override { return true; }
std::string getAsString() const override;
Init *getBit(unsigned Bit) const override {
llvm_unreachable("Illegal bit reference off def");
}
+};
- /// This method is used to implement
- /// VarListElementInit::resolveReferences. If the list element is resolvable
- /// now, we return the resolved value, otherwise we return null.
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override {
- llvm_unreachable("Illegal element reference off def");
+/// classname<targs...> - Represent an uninstantiated anonymous class
+/// instantiation.
+class VarDefInit final : public TypedInit, public FoldingSetNode,
+ public TrailingObjects<VarDefInit, Init *> {
+ Record *Class;
+ DefInit *Def = nullptr; // after instantiation
+ unsigned NumArgs;
+
+ explicit VarDefInit(Record *Class, unsigned N)
+ : TypedInit(IK_VarDefInit, RecordRecTy::get(Class)), Class(Class), NumArgs(N) {}
+
+ DefInit *instantiate();
+
+public:
+ VarDefInit(const VarDefInit &) = delete;
+ VarDefInit &operator=(const VarDefInit &) = delete;
+
+ // Do not use sized deallocation due to trailing objects.
+ void operator delete(void *p) { ::operator delete(p); }
+
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_VarDefInit;
+ }
+ static VarDefInit *get(Record *Class, ArrayRef<Init *> Args);
+
+ void Profile(FoldingSetNodeID &ID) const;
+
+ Init *resolveReferences(Resolver &R) const override;
+ Init *Fold() const;
+
+ std::string getAsString() const override;
+
+ Init *getArg(unsigned i) const {
+ assert(i < NumArgs && "Argument index out of range!");
+ return getTrailingObjects<Init *>()[i];
+ }
+
+ using const_iterator = Init *const *;
+
+ const_iterator args_begin() const { return getTrailingObjects<Init *>(); }
+ const_iterator args_end () const { return args_begin() + NumArgs; }
+
+ size_t args_size () const { return NumArgs; }
+ bool args_empty() const { return NumArgs == 0; }
+
+ ArrayRef<Init *> args() const { return makeArrayRef(args_begin(), NumArgs); }
+
+ Init *getBit(unsigned Bit) const override {
+ llvm_unreachable("Illegal bit reference off anonymous def");
}
};
@@ -1095,12 +1194,13 @@ public:
static FieldInit *get(Init *R, StringInit *FN);
- Init *getBit(unsigned Bit) const override;
+ Init *getRecord() const { return Rec; }
+ StringInit *getFieldName() const { return FieldName; }
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override;
+ Init *getBit(unsigned Bit) const override;
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
+ Init *Fold(Record *CurRec) const;
std::string getAsString() const override {
return Rec->getAsString() + "." + FieldName->getValue().str();
@@ -1140,8 +1240,6 @@ public:
void Profile(FoldingSetNodeID &ID) const;
- Init *convertInitializerTo(RecTy *Ty) const override;
-
Init *getOperator() const { return Val; }
StringInit *getName() const { return ValName; }
@@ -1175,8 +1273,9 @@ public:
return makeArrayRef(getTrailingObjects<StringInit *>(), NumArgNames);
}
- Init *resolveReferences(Record &R, const RecordVal *RV) const override;
+ Init *resolveReferences(Resolver &R) const override;
+ bool isConcrete() const override;
std::string getAsString() const override;
using const_arg_iterator = SmallVectorImpl<Init*>::const_iterator;
@@ -1197,11 +1296,6 @@ public:
Init *getBit(unsigned Bit) const override {
llvm_unreachable("Illegal bit reference off dag");
}
-
- Init *resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const override {
- llvm_unreachable("Illegal element reference off dag");
- }
};
//===----------------------------------------------------------------------===//
@@ -1229,14 +1323,7 @@ public:
RecTy *getType() const { return TyAndPrefix.getPointer(); }
Init *getValue() const { return Value; }
- bool setValue(Init *V) {
- if (V) {
- Value = V->convertInitializerTo(getType());
- return Value == nullptr;
- }
- Value = nullptr;
- return false;
- }
+ bool setValue(Init *V);
void dump() const;
void print(raw_ostream &OS, bool PrintSem = true) const;
@@ -1256,6 +1343,9 @@ class Record {
SmallVector<SMLoc, 4> Locs;
SmallVector<Init *, 0> TemplateArgs;
SmallVector<RecordVal, 0> Values;
+
+ // All superclasses in the inheritance forest in reverse preorder (yes, it
+ // must be a forest; diamond-shaped inheritance is not allowed).
SmallVector<std::pair<Record *, SMRange>, 0> SuperClasses;
// Tracks Record instances. Not owned by Record.
@@ -1267,49 +1357,37 @@ class Record {
unsigned ID;
bool IsAnonymous;
+ bool IsClass;
- // Class-instance values can be used by other defs. For example, Struct<i>
- // is used here as a template argument to another class:
- //
- // multiclass MultiClass<int i> {
- // def Def : Class<Struct<i>>;
- //
- // These need to get fully resolved before instantiating any other
- // definitions that use them (e.g. Def). However, inside a multiclass they
- // can't be immediately resolved so we mark them ResolveFirst to fully
- // resolve them later as soon as the multiclass is instantiated.
- bool ResolveFirst = false;
-
- void init();
void checkName();
public:
// Constructs a record.
explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records,
- bool Anonymous = false) :
- Name(N), Locs(locs.begin(), locs.end()), TrackedRecords(records),
- ID(LastID++), IsAnonymous(Anonymous) {
- init();
+ bool Anonymous = false, bool Class = false)
+ : Name(N), Locs(locs.begin(), locs.end()), TrackedRecords(records),
+ ID(LastID++), IsAnonymous(Anonymous), IsClass(Class) {
+ checkName();
}
explicit Record(StringRef N, ArrayRef<SMLoc> locs, RecordKeeper &records,
- bool Anonymous = false)
- : Record(StringInit::get(N), locs, records, Anonymous) {}
+ bool Class = false)
+ : Record(StringInit::get(N), locs, records, false, Class) {}
// When copy-constructing a Record, we must still guarantee a globally unique
// ID number. Don't copy TheInit either since it's owned by the original
// record. All other fields can be copied normally.
- Record(const Record &O) :
- Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs),
- Values(O.Values), SuperClasses(O.SuperClasses),
- TrackedRecords(O.TrackedRecords), ID(LastID++),
- IsAnonymous(O.IsAnonymous), ResolveFirst(O.ResolveFirst) { }
+ Record(const Record &O)
+ : Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs),
+ Values(O.Values), SuperClasses(O.SuperClasses),
+ TrackedRecords(O.TrackedRecords), ID(LastID++),
+ IsAnonymous(O.IsAnonymous), IsClass(O.IsClass) { }
static unsigned getNewUID() { return LastID++; }
unsigned getID() const { return ID; }
- StringRef getName() const;
+ StringRef getName() const { return cast<StringInit>(Name)->getValue(); }
Init *getNameInit() const {
return Name;
@@ -1322,10 +1400,16 @@ public:
void setName(Init *Name); // Also updates RecordKeeper.
ArrayRef<SMLoc> getLoc() const { return Locs; }
+ void appendLoc(SMLoc Loc) { Locs.push_back(Loc); }
+
+ // Make the type that this record should have based on its superclasses.
+ RecordRecTy *getType();
/// get the corresponding DefInit.
DefInit *getDefInit();
+ bool isClass() const { return IsClass; }
+
ArrayRef<Init *> getTemplateArgs() const {
return TemplateArgs;
}
@@ -1336,6 +1420,9 @@ public:
return SuperClasses;
}
+ /// Append the direct super classes of this record to Classes.
+ void getDirectSuperClasses(SmallVectorImpl<Record *> &Classes) const;
+
bool isTemplateArg(Init *Name) const {
for (Init *TA : TemplateArgs)
if (TA == Name) return true;
@@ -1368,13 +1455,6 @@ public:
void addValue(const RecordVal &RV) {
assert(getValue(RV.getNameInit()) == nullptr && "Value already added!");
Values.push_back(RV);
- if (Values.size() > 1)
- // Keep NAME at the end of the list. It makes record dumps a
- // bit prettier and allows TableGen tests to be written more
- // naturally. Tests can use CHECK-NEXT to look for Record
- // fields they expect to see after a def. They can't do that if
- // NAME is the first Record field.
- std::swap(Values[Values.size() - 2], Values[Values.size() - 1]);
}
void removeValue(Init *Name) {
@@ -1410,13 +1490,24 @@ public:
}
void addSuperClass(Record *R, SMRange Range) {
+ assert(!TheInit && "changing type of record after it has been referenced");
assert(!isSubClassOf(R) && "Already subclassing record!");
SuperClasses.push_back(std::make_pair(R, Range));
}
/// If there are any field references that refer to fields
/// that have been filled in, we can propagate the values now.
- void resolveReferences() { resolveReferencesTo(nullptr); }
+ ///
+ /// This is a final resolve: any error messages, e.g. due to undefined
+ /// !cast references, are generated now.
+ void resolveReferences();
+
+ /// Apply the resolver to the name of the record as well as to the
+ /// initializers of all fields of the record except SkipVal.
+ ///
+ /// The resolver should not resolve any of the fields itself, to avoid
+ /// recursion / infinite loops.
+ void resolveReferences(Resolver &R, const RecordVal *SkipVal = nullptr);
/// If anything in this record refers to RV, replace the
/// reference to RV with the RHS of RV. If RV is null, we resolve all
@@ -1431,14 +1522,6 @@ public:
return IsAnonymous;
}
- bool isResolveFirst() const {
- return ResolveFirst;
- }
-
- void setResolveFirst(bool b) {
- ResolveFirst = b;
- }
-
void print(raw_ostream &OS) const;
void dump() const;
@@ -1513,20 +1596,13 @@ public:
raw_ostream &operator<<(raw_ostream &OS, const Record &R);
-struct MultiClass {
- Record Rec; // Placeholder for template args and Name.
- using RecordVector = std::vector<std::unique_ptr<Record>>;
- RecordVector DefPrototypes;
-
- void dump() const;
-
- MultiClass(StringRef Name, SMLoc Loc, RecordKeeper &Records) :
- Rec(Name, Loc, Records) {}
-};
-
class RecordKeeper {
+ friend class RecordRecTy;
using RecordMap = std::map<std::string, std::unique_ptr<Record>>;
RecordMap Classes, Defs;
+ FoldingSet<RecordRecTy> RecordTypePool;
+ std::map<std::string, Init *> ExtraGlobals;
+ unsigned AnonCounter = 0;
public:
const RecordMap &getClasses() const { return Classes; }
@@ -1542,6 +1618,13 @@ public:
return I == Defs.end() ? nullptr : I->second.get();
}
+ Init *getGlobal(StringRef Name) const {
+ if (Record *R = getDef(Name))
+ return R->getDefInit();
+ auto It = ExtraGlobals.find(Name);
+ return It == ExtraGlobals.end() ? nullptr : It->second;
+ }
+
void addClass(std::unique_ptr<Record> R) {
bool Ins = Classes.insert(std::make_pair(R->getName(),
std::move(R))).second;
@@ -1556,6 +1639,15 @@ public:
assert(Ins && "Record already exists");
}
+ void addExtraGlobal(StringRef Name, Init *I) {
+ bool Ins = ExtraGlobals.insert(std::make_pair(Name, I)).second;
+ (void)Ins;
+ assert(!getDef(Name));
+ assert(Ins && "Global already exists");
+ }
+
+ Init *getNewAnonymousName();
+
//===--------------------------------------------------------------------===//
// High-level helper methods, useful for tablegen backends...
@@ -1673,10 +1765,142 @@ struct LessRecordRegister {
raw_ostream &operator<<(raw_ostream &OS, const RecordKeeper &RK);
-/// Return an Init with a qualifier prefix referring
-/// to CurRec's name.
-Init *QualifyName(Record &CurRec, MultiClass *CurMultiClass,
- Init *Name, StringRef Scoper);
+//===----------------------------------------------------------------------===//
+// Resolvers
+//===----------------------------------------------------------------------===//
+
+/// Interface for looking up the initializer for a variable name, used by
+/// Init::resolveReferences.
+class Resolver {
+ Record *CurRec;
+ bool IsFinal = false;
+
+public:
+ explicit Resolver(Record *CurRec) : CurRec(CurRec) {}
+ virtual ~Resolver() {}
+
+ Record *getCurrentRecord() const { return CurRec; }
+
+ /// Return the initializer for the given variable name (should normally be a
+ /// StringInit), or nullptr if the name could not be resolved.
+ virtual Init *resolve(Init *VarName) = 0;
+
+ // Whether bits in a BitsInit should stay unresolved if resolving them would
+ // result in a ? (UnsetInit). This behavior is used to represent instruction
+ // encodings by keeping references to unset variables within a record.
+ virtual bool keepUnsetBits() const { return false; }
+
+ // Whether this is the final resolve step before adding a record to the
+ // RecordKeeper. Error reporting during resolve and related constant folding
+ // should only happen when this is true.
+ bool isFinal() const { return IsFinal; }
+
+ void setFinal(bool Final) { IsFinal = Final; }
+};
+
+/// Resolve arbitrary mappings.
+class MapResolver final : public Resolver {
+ struct MappedValue {
+ Init *V;
+ bool Resolved;
+
+ MappedValue() : V(nullptr), Resolved(false) {}
+ MappedValue(Init *V, bool Resolved) : V(V), Resolved(Resolved) {}
+ };
+
+ DenseMap<Init *, MappedValue> Map;
+
+public:
+ explicit MapResolver(Record *CurRec = nullptr) : Resolver(CurRec) {}
+
+ void set(Init *Key, Init *Value) { Map[Key] = {Value, false}; }
+
+ Init *resolve(Init *VarName) override;
+};
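// Illustrative sketch (helper name is hypothetical): substitute a single
// variable inside an initializer and let any stuck operations re-fold.
static Init *substitute(Record *CurRec, Init *Body, Init *VarName,
                        Init *Replacement) {
  MapResolver R(CurRec);           // maps variable names to replacement values
  R.set(VarName, Replacement);
  return Body->resolveReferences(R);
}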
+
+/// Resolve all variables from a record except for unset variables.
+class RecordResolver final : public Resolver {
+ DenseMap<Init *, Init *> Cache;
+ SmallVector<Init *, 4> Stack;
+
+public:
+ explicit RecordResolver(Record &R) : Resolver(&R) {}
+
+ Init *resolve(Init *VarName) override;
+
+ bool keepUnsetBits() const override { return true; }
+};
+
+/// Resolve all references to a specific RecordVal.
+//
+// TODO: This is used for resolving references to template arguments, in a
+// rather inefficient way. Change those uses to resolve all template
+// arguments simultaneously and get rid of this class.
+class RecordValResolver final : public Resolver {
+ const RecordVal *RV;
+
+public:
+ explicit RecordValResolver(Record &R, const RecordVal *RV)
+ : Resolver(&R), RV(RV) {}
+
+ Init *resolve(Init *VarName) override {
+ if (VarName == RV->getNameInit())
+ return RV->getValue();
+ return nullptr;
+ }
+};
+
+/// Delegate resolving to a sub-resolver, but shadow some variable names.
+class ShadowResolver final : public Resolver {
+ Resolver &R;
+ DenseSet<Init *> Shadowed;
+
+public:
+ explicit ShadowResolver(Resolver &R)
+ : Resolver(R.getCurrentRecord()), R(R) {
+ setFinal(R.isFinal());
+ }
+
+ void addShadow(Init *Key) { Shadowed.insert(Key); }
+
+ Init *resolve(Init *VarName) override {
+ if (Shadowed.count(VarName))
+ return nullptr;
+ return R.resolve(VarName);
+ }
+};
+
+/// (Optionally) delegate resolving to a sub-resolver, and keep track of whether
+/// there were unresolved references.
+class TrackUnresolvedResolver final : public Resolver {
+ Resolver *R;
+ bool FoundUnresolved = false;
+
+public:
+ explicit TrackUnresolvedResolver(Resolver *R = nullptr)
+ : Resolver(R ? R->getCurrentRecord() : nullptr), R(R) {}
+
+ bool foundUnresolved() const { return FoundUnresolved; }
+
+ Init *resolve(Init *VarName) override;
+};
+
+/// Do not resolve anything, but keep track of whether a given variable was
+/// referenced.
+class HasReferenceResolver final : public Resolver {
+ Init *VarNameToTrack;
+ bool Found = false;
+
+public:
+ explicit HasReferenceResolver(Init *VarNameToTrack)
+ : Resolver(nullptr), VarNameToTrack(VarNameToTrack) {}
+
+ bool found() const { return Found; }
+
+ Init *resolve(Init *VarName) override;
+};
+
+void EmitJSON(RecordKeeper &RK, raw_ostream &OS);
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/TableGen/SearchableTable.td b/contrib/llvm/include/llvm/TableGen/SearchableTable.td
index 12aaf6000c31..1089d363eb6f 100644
--- a/contrib/llvm/include/llvm/TableGen/SearchableTable.td
+++ b/contrib/llvm/include/llvm/TableGen/SearchableTable.td
@@ -8,32 +8,127 @@
//===----------------------------------------------------------------------===//
//
// This file defines the key top-level classes needed to produce a reasonably
-// generic table that can be binary-searched via int and string entries.
+// generic table that can be binary-searched. Three types of objects can be
+// defined using the classes in this file:
//
-// Each table must instantiate "Mappingkind", listing the fields that should be
-// included and fields that shoould be searchable. Only two kinds of fields are
-// searchable at the moment: "strings" (which are compared case-insensitively),
-// and "bits".
+// 1. (Generic) Enums. By instantiating the GenericEnum class once, an enum with
+// the name of the def is generated. It is guarded by the preprocessor define
+// GET_name_DECL, where name is the name of the def.
//
-// For each "MappingKind" the generated header will create GET_MAPPINGKIND_DECL
-// and GET_MAPPINGKIND_IMPL guards.
+// 2. (Generic) Tables and search indices. By instantiating the GenericTable
+// class once, a table with the name of the instantiating def is generated and
+// guarded by the GET_name_IMPL preprocessor guard.
//
-// Inside the DECL guard will be a set of function declarations:
-// "lookup{InstanceClass}By{SearchableField}", returning "const {InstanceClass}
-// *" and accepting either a StringRef or a uintN_t. Additionally, if
-// EnumNameField is still defined, there will be an "enum {InstanceClass}Values"
-// allowing C++ code to reference either the primary data table's entries (if
-// EnumValueField is not defined) or some other field (e.g. encoding) if it is.
-//
-// Inside the IMPL guard will be a primary data table "{InstanceClass}sList" and
-// as many searchable indexes as requested
-// ("{InstanceClass}sBy{SearchableField}"). Additionally implementations of the
-// lookup function will be provided.
+// Both a primary key and additional secondary keys / search indices can also
+// be defined, which result in the generation of lookup functions. Their
+// declarations and definitions are all guarded by GET_name_DECL and
+// GET_name_IMPL, respectively, where name is the name of the underlying table.
//
// See AArch64SystemOperands.td and its generated header for example uses.
//
//===----------------------------------------------------------------------===//
+// Define a record derived from this class to generate a generic enum.
+//
+// The name of the record is used as the type name of the C++ enum.
+class GenericEnum {
+ // Name of a TableGen class. The enum will have one entry for each record
+ // that derives from that class.
+ string FilterClass;
+
+ // (Optional) Name of a field that is present in all collected records and
+ // contains the name of enum entries.
+ //
+ // If NameField is not set, the record names will be used instead.
+ string NameField;
+
+ // (Optional) Name of a field that is present in all collected records and
+ // contains the numerical value of enum entries.
+ //
+ // If ValueField is not set, enum values will be assigned automatically,
+ // starting at 0, according to a lexicographical sort of the entry names.
+ string ValueField;
+}
+
+// Define a record derived from this class to generate a generic table. This
+// table can have a searchable primary key, and it can also be referenced by
+// external search indices.
+//
+// The name of the record is used as the name of the global primary array of
+// entries of the table in C++.
+class GenericTable {
+ // Name of a class. The table will have one entry for each record that
+ // derives from that class.
+ string FilterClass;
+
+ // Name of the C++ struct/class type that holds table entries. The
+ // declaration of this type is not generated automatically.
+ string CppTypeName = FilterClass;
+
+ // List of the names of fields of collected records that contain the data for
+ // table entries, in the order that is used for initialization in C++.
+ //
+ // For each field of the table named XXX, TableGen will look for a value
+ // called TypeOf_XXX and use that as a more detailed description of the
+ // type of the field if present. This is required for fields whose type
+ // cannot be deduced automatically, such as enum fields. For example:
+ //
+ // def MyEnum : GenericEnum {
+ // let FilterClass = "MyEnum";
+ // ...
+ // }
+ //
+ // class MyTableEntry {
+ // MyEnum V;
+ // ...
+ // }
+ //
+ // def MyTable : GenericTable {
+ // let FilterClass = "MyTableEntry";
+ // let Fields = ["V", ...];
+ // GenericEnum TypeOf_V = MyEnum;
+ // }
+ //
+ // Fields of type bit, bits<N>, string, Intrinsic, and Instruction (or
+ // derived classes of those) are supported natively.
+ //
+ // Additionally, fields of type `code` can appear, where the value is used
+ // verbatim as an initializer. However, these fields cannot be used as
+ // search keys.
+ list<string> Fields;
+
+ // (Optional) List of fields that make up the primary key.
+ list<string> PrimaryKey;
+
+ // (Optional) Name of the primary key search function.
+ string PrimaryKeyName;
+
+ // See SearchIndex.EarlyOut
+ bit PrimaryKeyEarlyOut = 0;
+}
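// A minimal sketch along the lines of the example above (MyTableEntry, its
// Name and V fields, and the lookup function name are all hypothetical):
def MyPrimaryKeyedTable : GenericTable {
  let FilterClass = "MyTableEntry";
  let Fields = ["Name", "V"];
  let PrimaryKey = ["Name"];
  let PrimaryKeyName = "lookupMyTableEntryByName";
}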
+
+// Define a record derived from this class to generate an additional search
+// index for a generic table that has been defined earlier.
+//
+// The name of the record will be used as the name of the C++ lookup function.
+class SearchIndex {
+ // Table that this search index refers to.
+ GenericTable Table;
+
+ // List of fields that make up the key.
+ list<string> Key;
+
+ // If true, the lookup function will check the first field of the key against
+ // the minimum and maximum values in the index before entering the binary
+ // search. This is convenient for tables that add extended data for a subset
+ // of a larger enum-based space, e.g. extended data about a subset of
+ // instructions.
+ //
+ // Can only be used when the first field is an integral (non-string) type.
+ bit EarlyOut = 0;
+}
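// A hypothetical sketch continuing the example above: a secondary lookup
// function for MyPrimaryKeyedTable, keyed on its V field.
def lookupMyTableEntryByValue : SearchIndex {
  let Table = MyPrimaryKeyedTable;
  let Key = ["V"];
}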
+
+// Legacy table type with integrated enum.
class SearchableTable {
list<string> SearchableFields;
string EnumNameField = "Name";
diff --git a/contrib/llvm/include/llvm/Support/CodeGenCWrappers.h b/contrib/llvm/include/llvm/Target/CodeGenCWrappers.h
index 47971e80cefb..e9a990569d36 100644
--- a/contrib/llvm/include/llvm/Support/CodeGenCWrappers.h
+++ b/contrib/llvm/include/llvm/Target/CodeGenCWrappers.h
@@ -1,4 +1,4 @@
-//===- llvm/Support/CodeGenCWrappers.h - CodeGen C Wrappers -----*- C++ -*-===//
+//===- llvm/Target/CodeGenCWrappers.h - CodeGen C Wrappers ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_CODEGENCWRAPPERS_H
-#define LLVM_SUPPORT_CODEGENCWRAPPERS_H
+#ifndef LLVM_TARGET_CODEGENCWRAPPERS_H
+#define LLVM_TARGET_CODEGENCWRAPPERS_H
#include "llvm-c/TargetMachine.h"
#include "llvm/ADT/Optional.h"
@@ -56,7 +56,6 @@ inline LLVMCodeModel wrap(CodeModel::Model Model) {
}
llvm_unreachable("Bad CodeModel!");
}
-
-} // end llvm namespace
+} // namespace llvm
#endif
diff --git a/contrib/llvm/include/llvm/Target/GenericOpcodes.td b/contrib/llvm/include/llvm/Target/GenericOpcodes.td
index 28c90bf22767..d72746a0838a 100644
--- a/contrib/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/contrib/llvm/include/llvm/Target/GenericOpcodes.td
@@ -126,6 +126,11 @@ def G_BSWAP : GenericInstruction {
let hasSideEffects = 0;
}
+def G_ADDRSPACE_CAST : GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = 0;
+}
//------------------------------------------------------------------------------
// Binary ops.
//------------------------------------------------------------------------------
@@ -378,6 +383,12 @@ def G_UITOFP : GenericInstruction {
let hasSideEffects = 0;
}
+def G_FABS : GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
//------------------------------------------------------------------------------
// Floating Point Binary ops.
//------------------------------------------------------------------------------
@@ -476,6 +487,22 @@ def G_LOAD : GenericInstruction {
let mayLoad = 1;
}
+// Generic sign-extended load. Expects a MachineMemOperand in addition to explicit operands.
+def G_SEXTLOAD : GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins ptype1:$addr);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+// Generic zero-extended load. Expects a MachineMemOperand in addition to explicit operands.
+def G_ZEXTLOAD : GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins ptype1:$addr);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
// Generic store. Expects a MachineMemOperand in addition to explicit operands.
def G_STORE : GenericInstruction {
let OutOperandList = (outs);
diff --git a/contrib/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/contrib/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 575f228cd773..d487759a4852 100644
--- a/contrib/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/contrib/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -28,6 +28,13 @@ class GINodeEquiv<Instruction i, SDNode node> {
// (ISD::LOAD, ISD::ATOMIC_LOAD, ISD::STORE, ISD::ATOMIC_STORE) but GlobalISel
// stores this information in the MachineMemoryOperand.
bit CheckMMOIsNonAtomic = 0;
+
+ // SelectionDAG has one node for all loads and uses predicates to
+ // differentiate them. GlobalISel on the other hand uses separate opcodes.
+ // When this is true, the resulting opcode is G_LOAD/G_SEXTLOAD/G_ZEXTLOAD
+ // depending on the predicates on the node.
+ Instruction IfSignExtend = ?;
+ Instruction IfZeroExtend = ?;
}
// These are defined in the same order as the G_* instructions.
@@ -80,11 +87,15 @@ def : GINodeEquiv<G_BSWAP, bswap>;
// Broadly speaking G_LOAD is equivalent to ISD::LOAD but there are some
// complications that tablegen must take care of. For example, Predicates such
// as isSignExtLoad require that this is not a perfect 1:1 mapping since a
-// sign-extending load is (G_SEXT (G_LOAD x)) in GlobalISel. Additionally,
+// sign-extending load is (G_SEXTLOAD x) in GlobalISel. Additionally,
// G_LOAD handles both atomic and non-atomic loads where as SelectionDAG had
// separate nodes for them. This GINodeEquiv maps the non-atomic loads to
// G_LOAD with a non-atomic MachineMemOperand.
-def : GINodeEquiv<G_LOAD, ld> { let CheckMMOIsNonAtomic = 1; }
+def : GINodeEquiv<G_LOAD, ld> {
+ let CheckMMOIsNonAtomic = 1;
+ let IfSignExtend = G_SEXTLOAD;
+ let IfZeroExtend = G_ZEXTLOAD;
+}
// Broadly speaking G_STORE is equivalent to ISD::STORE but there are some
// complications that tablegen must take care of. For example, predicates such
// as isTruncStore require that this is not a perfect 1:1 mapping since a
@@ -112,3 +123,9 @@ def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax>;
class GIComplexPatternEquiv<ComplexPattern seldag> {
ComplexPattern SelDAGEquivalent = seldag;
}
+
+// Specifies the GlobalISel equivalents for SelectionDAG's SDNodeXForm.
+// Should be used on defs that subclass GICustomOperandRenderer<>.
+class GISDNodeXFormEquiv<SDNodeXForm seldag> {
+ SDNodeXForm SelDAGEquivalent = seldag;
+}
diff --git a/contrib/llvm/include/llvm/Target/GlobalISel/Target.td b/contrib/llvm/include/llvm/Target/GlobalISel/Target.td
index fd2ebca86d60..6740f404a9d3 100644
--- a/contrib/llvm/include/llvm/Target/GlobalISel/Target.td
+++ b/contrib/llvm/include/llvm/Target/GlobalISel/Target.td
@@ -46,3 +46,16 @@ class GIComplexOperandMatcher<LLT type, string matcherfn> {
// overwritten.
string MatcherFn = matcherfn;
}
+
+// Defines a custom renderer. This is analogous to SDNodeXForm from
+// SelectionDAG. Unlike SDNodeXForm, this matches a MachineInstr and
+// renders directly to the result instruction without an intermediate node.
+//
+// Definitions that inherit from this may also inherit from GISDNodeXFormEquiv
+// to enable the import of SelectionDAG patterns involving those SDNodeXForms.
+class GICustomOperandRenderer<string rendererfn> {
+ // The function renders the operand(s) of the matched instruction to
+ // the specified instruction. It should be of the form:
+ // void render(MachineInstrBuilder &MIB, const MachineInstr &MI)
+ string RendererFn = rendererfn;
+}
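// A hypothetical sketch tying the renderer to an existing SDNodeXForm
// ('trunc_imm' and 'renderTruncImm' are assumed, for illustration only):
def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
                   GISDNodeXFormEquiv<trunc_imm>;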
diff --git a/contrib/llvm/include/llvm/Target/Target.td b/contrib/llvm/include/llvm/Target/Target.td
index 82a3be5e63d4..b746505d2a45 100644
--- a/contrib/llvm/include/llvm/Target/Target.td
+++ b/contrib/llvm/include/llvm/Target/Target.td
@@ -175,6 +175,8 @@ class Register<string n, list<string> altNames = []> {
// HWEncoding - The target specific hardware encoding for this register.
bits<16> HWEncoding = 0;
+
+ bit isArtificial = 0;
}
// RegisterWithSubRegs - This can be used to define instances of Register which
@@ -382,6 +384,11 @@ class DwarfRegAlias<Register reg> {
}
//===----------------------------------------------------------------------===//
+// Pull in the common support for MCPredicate (portable scheduling predicates).
+//
+include "llvm/Target/TargetInstrPredicate.td"
+
+//===----------------------------------------------------------------------===//
// Pull in the common support for scheduling
//
include "llvm/Target/TargetSchedule.td"
@@ -435,11 +442,13 @@ class Instruction {
bit isIndirectBranch = 0; // Is this instruction an indirect branch?
bit isCompare = 0; // Is this instruction a comparison instruction?
bit isMoveImm = 0; // Is this instruction a move immediate instruction?
+ bit isMoveReg = 0; // Is this instruction a move register instruction?
bit isBitcast = 0; // Is this instruction a bitcast instruction?
bit isSelect = 0; // Is this instruction a select instruction?
bit isBarrier = 0; // Can control flow fall through this instruction?
bit isCall = 0; // Is this instruction a call instruction?
bit isAdd = 0; // Is this instruction an add instruction?
+ bit isTrap = 0; // Is this instruction a trap instruction?
bit canFoldAsLoad = 0; // Can this be folded as a simple memory operand?
bit mayLoad = ?; // Is it possible for this inst to read memory?
bit mayStore = ?; // Is it possible for this inst to write memory?
@@ -566,6 +575,12 @@ class Instruction {
/// can be queried via the getNamedOperandIdx() function which is generated
/// by TableGen.
bit UseNamedOperandTable = 0;
+
+ /// Should FastISel ignore this instruction. Certain ISAs have instructions
+ /// which map to the same ISD Opcode, value type operands and
+ /// instruction selection predicates. FastISel cannot handle such cases, but
+ /// SelectionDAG can.
+ bit FastISelShouldIgnore = 0;
}
/// PseudoInstExpansion - Expansion information for a pseudo-instruction.
@@ -995,6 +1010,12 @@ def DBG_VALUE : StandardPseudoInstruction {
let AsmString = "DBG_VALUE";
let hasSideEffects = 0;
}
+def DBG_LABEL : StandardPseudoInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins unknown:$label);
+ let AsmString = "DBG_LABEL";
+ let hasSideEffects = 0;
+}
def REG_SEQUENCE : StandardPseudoInstruction {
let OutOperandList = (outs unknown:$dst);
let InOperandList = (ins unknown:$supersrc, variable_ops);
@@ -1097,7 +1118,7 @@ def PATCHABLE_FUNCTION_ENTER : StandardPseudoInstruction {
let hasSideEffects = 0;
}
def PATCHABLE_RET : StandardPseudoInstruction {
- let OutOperandList = (outs unknown:$dst);
+ let OutOperandList = (outs);
let InOperandList = (ins variable_ops);
let AsmString = "# XRay Function Patchable RET.";
let usesCustomInserter = 1;
@@ -1114,7 +1135,7 @@ def PATCHABLE_FUNCTION_EXIT : StandardPseudoInstruction {
let isReturn = 0; // Original return instruction will follow
}
def PATCHABLE_TAIL_CALL : StandardPseudoInstruction {
- let OutOperandList = (outs unknown:$dst);
+ let OutOperandList = (outs);
let InOperandList = (ins variable_ops);
let AsmString = "# XRay Tail Call Exit.";
let usesCustomInserter = 1;
@@ -1131,6 +1152,16 @@ def PATCHABLE_EVENT_CALL : StandardPseudoInstruction {
let mayStore = 1;
let hasSideEffects = 1;
}
+def PATCHABLE_TYPED_EVENT_CALL : StandardPseudoInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i16imm:$type, ptr_rc:$event, i32imm:$size);
+ let AsmString = "# XRay Typed Event Log.";
+ let usesCustomInserter = 1;
+ let isCall = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasSideEffects = 1;
+}
def FENTRY_CALL : StandardPseudoInstruction {
let OutOperandList = (outs unknown:$dst);
let InOperandList = (ins variable_ops);
@@ -1140,6 +1171,12 @@ def FENTRY_CALL : StandardPseudoInstruction {
let mayStore = 1;
let hasSideEffects = 1;
}
+def ICALL_BRANCH_FUNNEL : StandardPseudoInstruction {
+ let OutOperandList = (outs unknown:$dst);
+ let InOperandList = (ins variable_ops);
+ let AsmString = "";
+ let hasSideEffects = 1;
+}
// Generic opcodes used in GlobalISel.
include "llvm/Target/GenericOpcodes.td"
@@ -1290,7 +1327,7 @@ class MnemonicAlias<string From, string To, string VariantName = ""> {
/// InstAlias - This defines an alternate assembly syntax that is allowed to
/// match an instruction that has a different (more canonical) assembly
/// representation.
-class InstAlias<string Asm, dag Result, int Emit = 1> {
+class InstAlias<string Asm, dag Result, int Emit = 1, string VariantName = ""> {
string AsmString = Asm; // The .s format to match the instruction with.
dag ResultInst = Result; // The MCInst to generate.
@@ -1314,7 +1351,7 @@ class InstAlias<string Asm, dag Result, int Emit = 1> {
// Assembler variant name to use for this alias. If not specified then
// assembler variants will be determined based on AsmString
- string AsmVariantName = "";
+ string AsmVariantName = VariantName;
}
//===----------------------------------------------------------------------===//
@@ -1362,6 +1399,12 @@ class Target {
// AssemblyWriters - The AsmWriter instances available for this target.
list<AsmWriter> AssemblyWriters = [DefaultAsmWriter];
+
+ // AllowRegisterRenaming - Controls whether this target allows
+ // post-register-allocation renaming of registers. This is done by
+ // setting hasExtraDefRegAllocReq and hasExtraSrcRegAllocReq to 1
+ // for all opcodes if this flag is set to 0.
+ int AllowRegisterRenaming = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/include/llvm/Target/TargetInstrPredicate.td b/contrib/llvm/include/llvm/Target/TargetInstrPredicate.td
new file mode 100644
index 000000000000..d38279b0d65e
--- /dev/null
+++ b/contrib/llvm/include/llvm/Target/TargetInstrPredicate.td
@@ -0,0 +1,197 @@
+//===- TargetInstrPredicate.td - ---------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MCInstPredicate class and its subclasses.
+//
+// MCInstPredicate is used to describe constraints on the opcode/operand(s) of
+// an instruction. Each MCInstPredicate class has a well-known semantic, and it
+// is used by a PredicateExpander to generate code for MachineInstr and/or
+// MCInst.
+//
+// MCInstPredicate definitions can be used to construct MCSchedPredicate
+// definitions. An MCSchedPredicate can be used in place of a SchedPredicate
+// when defining SchedReadVariant and SchedWriteVariant used by a processor
+// scheduling model.
+//
+// Here is an example of MCInstPredicate definition:
+//
+// def MCInstPredicateExample : CheckAll<[
+// CheckOpcode<[BLR]>,
+// CheckIsRegOperand<0>,
+// CheckNot<CheckRegOperand<0, LR>>]>;
+//
+// Predicate `MCInstPredicateExample` checks that the input machine instruction
+// is a BLR whose operand at index 0 is not register `LR`.
+//
+// That predicate could be used to rewrite the following definition (from
+// AArch64SchedExynosM3.td):
+//
+// def M3BranchLinkFastPred : SchedPredicate<[{
+// MI->getOpcode() == AArch64::BLR &&
+// MI->getOperand(0).isReg() &&
+// MI->getOperand(0).getReg() != AArch64::LR}]>;
+//
+// MCInstPredicate definitions are used to construct MCSchedPredicate (see the
+// definition of class MCSchedPredicate in llvm/Target/TargetSchedule.td). An
+// MCSchedPredicate can be used by a `SchedVar` to associate a predicate with a
+// list of SchedReadWrites. Note that `SchedVar` are used to create SchedVariant
+// definitions.
+//
+// Each MCInstPredicate class has a well known semantic. For example,
+// `CheckOpcode` is only used to check the instruction opcode value.
+//
+// MCInstPredicate classes allow the definition of predicates in a declarative
+// way. These predicates don't require a custom block of C++, and can be used
+// to define conditions on instructions without being bound to a particular
+// representation (i.e. MachineInstr vs MCInst).
+//
+// It also means that tablegen backends must know how to parse and expand them
+// into code that works on MCInst (or MachineInst).
+//
+// Instances of class PredicateExpander (see utils/Tablegen/PredicateExpander.h)
+// know how to expand a predicate. For each MCInstPredicate class, there must be
+// an "expand" method available in the PredicateExpander interface.
+//
+// For example, a `CheckOpcode` predicate is expanded using method
+// `PredicateExpander::expandCheckOpcode()`.
+//
+// New MCInstPredicate classes must be added to this file. For each new class
+// XYZ, an "expandXYZ" method must be added to the PredicateExpander.
+//
+//===----------------------------------------------------------------------===//
+
+// Forward declarations.
+class Instruction;
+
+// A generic machine instruction predicate.
+class MCInstPredicate;
+
+class MCTrue : MCInstPredicate; // A predicate that always evaluates to True.
+class MCFalse : MCInstPredicate; // A predicate that always evaluates to False.
+def TruePred : MCTrue;
+def FalsePred : MCFalse;
+
+// A predicate used to negate the outcome of another predicate.
+// It makes it easy to express "set difference" operations; for example, a
+// check that tests whether an opcode is not part of a given set of opcodes.
+class CheckNot<MCInstPredicate P> : MCInstPredicate {
+ MCInstPredicate Pred = P;
+}
+
+// This class is used as a building block to define predicates on instruction
+// operands. It is used to reference a specific machine operand.
+class MCOperandPredicate<int Index> : MCInstPredicate {
+ int OpIndex = Index;
+}
+
+// Return true if machine operand at position `Index` is a register operand.
+class CheckIsRegOperand<int Index> : MCOperandPredicate<Index>;
+
+// Return true if machine operand at position `Index` is an immediate operand.
+class CheckIsImmOperand<int Index> : MCOperandPredicate<Index>;
+
+// Check if machine operands at index `First` and index `Second` both reference
+// the same register.
+class CheckSameRegOperand<int First, int Second> : MCInstPredicate {
+ int FirstIndex = First;
+ int SecondIndex = Second;
+}
+
+// Check that the machine register operand at position `Index` references
+// register R. This predicate assumes that we already checked that the machine
+// operand at position `Index` is a register operand.
+class CheckRegOperand<int Index, Register R> : MCOperandPredicate<Index> {
+ Register Reg = R;
+}
+
+// Check if register operand at index `Index` is the invalid register.
+class CheckInvalidRegOperand<int Index> : MCOperandPredicate<Index>;
+
+// Check that the operand at position `Index` is immediate `Imm`.
+class CheckImmOperand<int Index, int Imm> : MCOperandPredicate<Index> {
+ int ImmVal = Imm;
+}
+
+// Similar to CheckImmOperand, however the immediate is not a literal number.
+// This is useful when we want to compare the value of an operand against an
+// enum value, and we know the actual integer value of that enum.
+class CheckImmOperand_s<int Index, string Value> : MCOperandPredicate<Index> {
+ string ImmVal = Value;
+}
+
+// Check that the operand at position `Index` is immediate value zero.
+class CheckZeroOperand<int Index> : CheckImmOperand<Index, 0>;
+
+// Check that the instruction has exactly `Num` operands.
+class CheckNumOperands<int Num> : MCInstPredicate {
+ int NumOps = Num;
+}
+
+// Check that the instruction opcode is one of the opcodes in set `Opcodes`.
+// This is a simple set membership query. The easier way to check if an opcode
+// is not a member of the set is by using a `CheckNot<CheckOpcode<[...]>>`
+// sequence.
+class CheckOpcode<list<Instruction> Opcodes> : MCInstPredicate {
+ list<Instruction> ValidOpcodes = Opcodes;
+}
+
+// Check that the instruction opcode is a pseudo opcode member of the set
+// `Opcodes`. This check is always expanded to "false" if we are generating
+// code for MCInst.
+class CheckPseudo<list<Instruction> Opcodes> : CheckOpcode<Opcodes>;
+
+// A non-portable predicate. Only to be used as a last resort when a block of
+// code
+// cannot possibly be converted in a declarative way using other MCInstPredicate
+// classes. This check is always expanded to "false" when generating code for
+// MCInst.
+class CheckNonPortable<string Code> : MCInstPredicate {
+ string CodeBlock = Code;
+}
+
+// A sequence of predicates, used as the base class for CheckAll and CheckAny.
+// It allows compositions of predicates to be described.
+class CheckPredicateSequence<list<MCInstPredicate> Preds> : MCInstPredicate {
+ list<MCInstPredicate> Predicates = Preds;
+}
+
+// Check that all of the predicates in `Preds` evaluate to true.
+class CheckAll<list<MCInstPredicate> Sequence>
+ : CheckPredicateSequence<Sequence>;
+
+// Check that at least one of the predicates in `Preds` evaluates to true.
+class CheckAny<list<MCInstPredicate> Sequence>
+ : CheckPredicateSequence<Sequence>;
+
+// Check that a call to method `Name` in class "XXXGenInstrInfo" (where XXX is
+// the `Target` name) returns true.
+//
+// TIIPredicate definitions are used to model calls to the target-specific
+// InstrInfo. A TIIPredicate is treated specially by the InstrInfoEmitter
+// tablegen backend, which will use it to automatically generate a definition in
+// the target specific `GenInstrInfo` class.
+class TIIPredicate<string Target, string Name, MCInstPredicate P> : MCInstPredicate {
+ string TargetName = Target;
+ string FunctionName = Name;
+ MCInstPredicate Pred = P;
+}
+
+// A function predicate that takes as input a machine instruction, and returns
+// a boolean value.
+//
+// This predicate is expanded into a function call by the PredicateExpander.
+// In particular, the PredicateExpander would either expand this predicate into
+// a call to `MCInstFn` or into a call to `MachineInstrFn`, depending on whether
+// it is lowering predicates for MCInst or MachineInstr.
+//
+// In this context, `MCInstFn` and `MachineInstrFn` are both function names.
+class CheckFunctionPredicate<string MCInstFn, string MachineInstrFn> : MCInstPredicate {
+ string MCInstFnName = MCInstFn;
+ string MachineInstrFnName = MachineInstrFn;
+}
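
To see how these pieces compose, here is a rough, hypothetical sketch of target .td definitions; the target name "MyTarget", the instruction ADD_rr, the register X0, and the helper function names are invented placeholders, not part of this patch:

    // Matches a register-register ADD whose first source operand is X0.
    // CheckIsRegOperand guards CheckRegOperand, as its comment above requires.
    def IsAddFromX0 : TIIPredicate<"MyTarget", "isAddFromX0",
                          CheckAll<[CheckOpcode<[ADD_rr]>,
                                    CheckIsRegOperand<1>,
                                    CheckRegOperand<1, X0>]>>;

    // Falls back to hand-written helpers; the two names are assumed to be
    // functions taking an MCInst or a MachineInstr, respectively.
    def IsMemBarrier : CheckFunctionPredicate<"MyTarget_MC::isMemBarrier",
                                              "MyTargetInstrInfo::isMemBarrier">;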
diff --git a/contrib/llvm/include/llvm/Target/TargetItinerary.td b/contrib/llvm/include/llvm/Target/TargetItinerary.td
index 3b1998dfb1ff..182054d8444e 100644
--- a/contrib/llvm/include/llvm/Target/TargetItinerary.td
+++ b/contrib/llvm/include/llvm/Target/TargetItinerary.td
@@ -44,9 +44,9 @@ def Reserved : ReservationKind<1>;
// the execution of an instruction. Cycles represents the number of
// discrete time slots needed to complete the stage. Units represent
// the choice of functional units that can be used to complete the
-// stage. Eg. IntUnit1, IntUnit2. NextCycles indicates how many
-// cycles should elapse from the start of this stage to the start of
-// the next stage in the itinerary. For example:
+// stage. Eg. IntUnit1, IntUnit2. TimeInc indicates how many cycles
+// should elapse from the start of this stage to the start of the next
+// stage in the itinerary. For example:
//
// A stage is specified in one of two ways:
//
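
For reference, a rough sketch of an itinerary built from these fields; the functional units ALU0/ALU1 and the itinerary class IIC_MyALU are invented placeholders:

    def ALU0 : FuncUnit;
    def ALU1 : FuncUnit;
    def IIC_MyALU : InstrItinClass;

    def MyItineraries : ProcessorItineraries<[ALU0, ALU1], [], [
      // A two-cycle stage that may execute on either ALU; TimeInc = 1 means the
      // next stage may begin one cycle after this stage starts.
      InstrItinData<IIC_MyALU, [InstrStage<2, [ALU0, ALU1], 1>]>
    ]>;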
diff --git a/contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFile.h b/contrib/llvm/include/llvm/Target/TargetLoweringObjectFile.h
index fe77c2954129..dbdfd4139a0f 100644
--- a/contrib/llvm/include/llvm/CodeGen/TargetLoweringObjectFile.h
+++ b/contrib/llvm/include/llvm/Target/TargetLoweringObjectFile.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/TargetLoweringObjectFile.h - Object Info ---*- C++ -*-===//
+//===-- llvm/Target/TargetLoweringObjectFile.h - Object Info ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -71,8 +71,7 @@ public:
const MCSymbol *Sym) const;
/// Emit the module-level metadata that the platform cares about.
- virtual void emitModuleMetadata(MCStreamer &Streamer, Module &M,
- const TargetMachine &TM) const {}
+ virtual void emitModuleMetadata(MCStreamer &Streamer, Module &M) const {}
/// Given a constant with the SectionKind, return a section that it should be
/// placed in.
@@ -149,7 +148,7 @@ public:
return StaticDtorSection;
}
- /// \brief Create a symbol reference to describe the given TLS variable when
+ /// Create a symbol reference to describe the given TLS variable when
/// emitting the address in debug info.
virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const;
@@ -159,19 +158,19 @@ public:
return nullptr;
}
- /// \brief Target supports replacing a data "PC"-relative access to a symbol
+ /// Target supports replacing a data "PC"-relative access to a symbol
/// through another symbol, by accessing the latter via a GOT entry instead?
bool supportIndirectSymViaGOTPCRel() const {
return SupportIndirectSymViaGOTPCRel;
}
- /// \brief Target GOT "PC"-relative relocation supports encoding an additional
+ /// Target GOT "PC"-relative relocation supports encoding an additional
/// binary expression with an offset?
bool supportGOTPCRelWithOffset() const {
return SupportGOTPCRelWithOffset;
}
- /// \brief Get the target specific PC relative GOT entry relocation
+ /// Get the target specific PC relative GOT entry relocation
virtual const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
const MCValue &MV,
int64_t Offset,
@@ -183,6 +182,9 @@ public:
virtual void emitLinkerFlagsForGlobal(raw_ostream &OS,
const GlobalValue *GV) const {}
+ virtual void emitLinkerFlagsForUsed(raw_ostream &OS,
+ const GlobalValue *GV) const {}
+
protected:
virtual MCSection *SelectSectionForGlobal(const GlobalObject *GO,
SectionKind Kind,
diff --git a/contrib/llvm/include/llvm/Target/TargetMachine.h b/contrib/llvm/include/llvm/Target/TargetMachine.h
index 97442f9a7849..1ca68c8df63a 100644
--- a/contrib/llvm/include/llvm/Target/TargetMachine.h
+++ b/contrib/llvm/include/llvm/Target/TargetMachine.h
@@ -138,9 +138,23 @@ public:
/// Get the pointer size for this target.
///
/// This is the only time the DataLayout in the TargetMachine is used.
- unsigned getPointerSize() const { return DL.getPointerSize(); }
+ unsigned getPointerSize(unsigned AS) const {
+ return DL.getPointerSize(AS);
+ }
+
+ unsigned getPointerSizeInBits(unsigned AS) const {
+ return DL.getPointerSizeInBits(AS);
+ }
+
+ unsigned getProgramPointerSize() const {
+ return DL.getPointerSize(DL.getProgramAddressSpace());
+ }
- /// \brief Reset the target options based on the function's attributes.
+ unsigned getAllocaPointerSize() const {
+ return DL.getPointerSize(DL.getAllocaAddrSpace());
+ }
+
+ /// Reset the target options based on the function's attributes.
// FIXME: Remove TargetOptions that affect per-function code generation
// from TargetMachine.
void resetTargetOptions(const Function &F) const;
@@ -172,18 +186,28 @@ public:
bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const;
+ /// Returns true if this target uses emulated TLS.
+ bool useEmulatedTLS() const;
+
/// Returns the TLS model which should be used for the given global variable.
TLSModel::Model getTLSModel(const GlobalValue *GV) const;
/// Returns the optimization level: None, Less, Default, or Aggressive.
CodeGenOpt::Level getOptLevel() const;
- /// \brief Overrides the optimization level.
+ /// Overrides the optimization level.
void setOptLevel(CodeGenOpt::Level Level);
void setFastISel(bool Enable) { Options.EnableFastISel = Enable; }
bool getO0WantsFastISel() { return O0WantsFastISel; }
void setO0WantsFastISel(bool Enable) { O0WantsFastISel = Enable; }
+ void setGlobalISel(bool Enable) { Options.EnableGlobalISel = Enable; }
+ void setMachineOutliner(bool Enable) {
+ Options.EnableMachineOutliner = Enable;
+ }
+ void setSupportsDefaultOutlining(bool Enable) {
+ Options.SupportsDefaultOutlining = Enable;
+ }
bool shouldPrintMachineCode() const { return Options.PrintMachineCode; }
@@ -201,14 +225,14 @@ public:
return Options.FunctionSections;
}
- /// \brief Get a \c TargetIRAnalysis appropriate for the target.
+ /// Get a \c TargetIRAnalysis appropriate for the target.
///
/// This is used to construct the new pass manager's target IR analysis pass,
/// set up appropriately for this target machine. Even the old pass manager
/// uses this to answer queries about the IR.
TargetIRAnalysis getTargetIRAnalysis();
- /// \brief Return a TargetTransformInfo for a given function.
+ /// Return a TargetTransformInfo for a given function.
///
/// The returned TargetTransformInfo is specialized to the subtarget
/// corresponding to \p F.
@@ -234,7 +258,7 @@ public:
/// \p MMI is an optional parameter that, if set to non-nullptr,
/// will be used to set the MachineModuleInfo for this PM.
virtual bool addPassesToEmitFile(PassManagerBase &, raw_pwrite_stream &,
- CodeGenFileType,
+ raw_pwrite_stream *, CodeGenFileType,
bool /*DisableVerify*/ = true,
MachineModuleInfo *MMI = nullptr) {
return true;
@@ -281,14 +305,14 @@ public:
class LLVMTargetMachine : public TargetMachine {
protected: // Can only create subclasses.
LLVMTargetMachine(const Target &T, StringRef DataLayoutString,
- const Triple &TargetTriple, StringRef CPU, StringRef FS,
+ const Triple &TT, StringRef CPU, StringRef FS,
const TargetOptions &Options, Reloc::Model RM,
CodeModel::Model CM, CodeGenOpt::Level OL);
void initAsmInfo();
public:
- /// \brief Get a TargetTransformInfo implementation for the target.
+ /// Get a TargetTransformInfo implementation for the target.
///
/// The TTI returned uses the common code generator to answer queries about
/// the IR.
@@ -303,7 +327,8 @@ public:
/// \p MMI is an optional parameter that, if set to non-nullptr,
/// will be used to set the MachineModuleInfo for this PM.
bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out,
- CodeGenFileType FileType, bool DisableVerify = true,
+ raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
+ bool DisableVerify = true,
MachineModuleInfo *MMI = nullptr) override;
/// Add passes to the specified pass manager to get machine code emitted with
@@ -311,7 +336,7 @@ public:
/// fills the MCContext Ctx pointer which can be used to build custom
/// MCStreamer.
bool addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
- raw_pwrite_stream &OS,
+ raw_pwrite_stream &Out,
bool DisableVerify = true) override;
/// Returns true if the target is expected to pass all machine verifier
@@ -320,10 +345,11 @@ public:
/// EXPENSIVE_CHECKS is enabled.
virtual bool isMachineVerifierClean() const { return true; }
- /// \brief Adds an AsmPrinter pass to the pipeline that prints assembly or
+ /// Adds an AsmPrinter pass to the pipeline that prints assembly or
/// machine code from the MI representation.
bool addAsmPrinter(PassManagerBase &PM, raw_pwrite_stream &Out,
- CodeGenFileType FileTYpe, MCContext &Context);
+ raw_pwrite_stream *DwoOut, CodeGenFileType FileTYpe,
+ MCContext &Context);
};
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/Target/TargetOptions.h b/contrib/llvm/include/llvm/Target/TargetOptions.h
index 70fac7833f32..07ed773de55e 100644
--- a/contrib/llvm/include/llvm/Target/TargetOptions.h
+++ b/contrib/llvm/include/llvm/Target/TargetOptions.h
@@ -104,11 +104,14 @@ namespace llvm {
NoSignedZerosFPMath(false),
HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false),
GuaranteedTailCallOpt(false), StackSymbolOrdering(true),
- EnableFastISel(false), UseInitArray(false),
+ EnableFastISel(false), EnableGlobalISel(false), UseInitArray(false),
DisableIntegratedAS(false), RelaxELFRelocations(false),
FunctionSections(false), DataSections(false),
- UniqueSectionNames(true), TrapUnreachable(false), EmulatedTLS(false),
- EnableIPRA(false), EmitStackSizeSection(false) {}
+ UniqueSectionNames(true), TrapUnreachable(false),
+ NoTrapAfterNoreturn(false), EmulatedTLS(false),
+ ExplicitEmulatedTLS(false), EnableIPRA(false),
+ EmitStackSizeSection(false), EnableMachineOutliner(false),
+ SupportsDefaultOutlining(false), EmitAddrsig(false) {}
/// PrintMachineCode - This flag is enabled when the -print-machineinstrs
/// option is specified on the command line, and should enable debugging
@@ -186,6 +189,9 @@ namespace llvm {
/// compile time.
unsigned EnableFastISel : 1;
+ /// EnableGlobalISel - This flag enables global instruction selection.
+ unsigned EnableGlobalISel : 1;
+
/// UseInitArray - Use .init_array instead of .ctors for static
/// constructors.
unsigned UseInitArray : 1;
@@ -209,16 +215,32 @@ namespace llvm {
/// Emit target-specific trap instruction for 'unreachable' IR instructions.
unsigned TrapUnreachable : 1;
+ /// Do not emit a trap instruction for 'unreachable' IR instructions behind
+ /// noreturn calls, even if TrapUnreachable is true.
+ unsigned NoTrapAfterNoreturn : 1;
+
/// EmulatedTLS - This flag enables emulated TLS model, using emutls
/// function in the runtime library.
unsigned EmulatedTLS : 1;
+ /// Whether -emulated-tls or -no-emulated-tls is set.
+ unsigned ExplicitEmulatedTLS : 1;
+
/// This flag enables InterProcedural Register Allocation (IPRA).
unsigned EnableIPRA : 1;
/// Emit section containing metadata on function stack sizes.
unsigned EmitStackSizeSection : 1;
+ /// Enables the MachineOutliner pass.
+ unsigned EnableMachineOutliner : 1;
+
+ /// Set if the target supports default outlining behaviour.
+ unsigned SupportsDefaultOutlining : 1;
+
+ /// Emit address-significance table.
+ unsigned EmitAddrsig : 1;
+
/// FloatABIType - This setting is set when the -float-abi=xxx option is specified
/// on the command line. This setting may either be Default, Soft, or Hard.
/// Default selects the target's default behavior. Soft selects the ABI for
diff --git a/contrib/llvm/include/llvm/Target/TargetSchedule.td b/contrib/llvm/include/llvm/Target/TargetSchedule.td
index 7b00c9420e35..6fd2d5b78e54 100644
--- a/contrib/llvm/include/llvm/Target/TargetSchedule.td
+++ b/contrib/llvm/include/llvm/Target/TargetSchedule.td
@@ -99,6 +99,12 @@ class SchedMachineModel {
// resulting from changes to the instruction definitions.
bit CompleteModel = 1;
+ // Indicates that we should do full overlap checking for multiple InstrRWs
+ // defining the same instructions within the same SchedMachineModel.
+ // FIXME: Remove when all in tree targets are clean with the full check
+ // enabled.
+ bit FullInstRWOverlapCheck = 1;
+
// A processor may only implement part of published ISA, due to either new ISA
// extensions, (e.g. Pentium 4 doesn't have AVX) or implementation
// (ARM/MIPS/PowerPC/SPARC soft float cores).
@@ -175,9 +181,9 @@ class ProcResourceKind;
// BufferSize=1.
//
// SchedModel ties these units to a processor for any stand-alone defs
-// of this class. Instances of subclass ProcResource will be automatically
-// attached to a processor, so SchedModel is not needed.
-class ProcResourceUnits<ProcResourceKind kind, int num> {
+// of this class.
+class ProcResourceUnits<ProcResourceKind kind, int num,
+ list<string> pfmCounters> {
ProcResourceKind Kind = kind;
int NumUnits = num;
ProcResourceKind Super = ?;
@@ -192,8 +198,8 @@ def EponymousProcResourceKind : ProcResourceKind;
// Subtargets typically define processor resource kind and number of
// units in one place.
-class ProcResource<int num> : ProcResourceKind,
- ProcResourceUnits<EponymousProcResourceKind, num>;
+class ProcResource<int num, list<string> pfmCounters = []> : ProcResourceKind,
+ ProcResourceUnits<EponymousProcResourceKind, num, pfmCounters>;
class ProcResGroup<list<ProcResource> resources> : ProcResourceKind {
list<ProcResource> Resources = resources;
@@ -275,10 +281,9 @@ class ProcWriteResources<list<ProcResourceKind> resources> {
// ProcResources indicates the set of resources consumed by the write.
// Optionally, ResourceCycles indicates the number of cycles the
// resource is consumed. Each ResourceCycles item is paired with the
-// ProcResource item at the same position in its list. Since
-// ResourceCycles are rarely specialized, the list may be
-// incomplete. By default, resources are consumed for a single cycle,
-// regardless of latency, which models a fully pipelined processing
+// ProcResource item at the same position in its list. ResourceCycles
+// can be `[]`: in that case, all resources are consumed for a single
+// cycle, regardless of latency, which models a fully pipelined processing
// unit. A value of 0 for ResourceCycles means that the resource must
// be available but is not consumed, which is only relevant for
// unbuffered resources.
@@ -349,13 +354,23 @@ class PredicateProlog<code c> {
code Code = c;
}
+// Base class for scheduling predicates.
+class SchedPredicateBase;
+
+// A scheduling predicate whose logic is defined by a MCInstPredicate.
+// This can directly be used by SchedWriteVariant definitions.
+class MCSchedPredicate<MCInstPredicate P> : SchedPredicateBase {
+ MCInstPredicate Pred = P;
+ SchedMachineModel SchedModel = ?;
+}
+
// Define a predicate to determine which SchedVariant applies to a
// particular MachineInstr. The code snippet is used as an
// if-statement's expression. Available variables are MI, SchedModel,
// and anything defined in a PredicateProlog.
//
// SchedModel silences warnings but is ignored.
-class SchedPredicate<code pred> {
+class SchedPredicate<code pred> : SchedPredicateBase {
SchedMachineModel SchedModel = ?;
code Predicate = pred;
}
@@ -370,8 +385,8 @@ def NoSchedPred : SchedPredicate<[{true}]>;
// operands. In this case, latency is not additive. If the current Variant
// is already part of a Sequence, then that entire chain leading up to
// the Variant is distributed over the variadic operands.
-class SchedVar<SchedPredicate pred, list<SchedReadWrite> selected> {
- SchedPredicate Predicate = pred;
+class SchedVar<SchedPredicateBase pred, list<SchedReadWrite> selected> {
+ SchedPredicateBase Predicate = pred;
list<SchedReadWrite> Selected = selected;
}
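
As a hedged sketch of how an MCSchedPredicate feeds a scheduling variant (the register XZR and the SchedWrite names are hypothetical; a real model would define the writes with SchedWriteRes and processor resources):

    def MyWriteALU       : SchedWrite;
    def MyWriteZeroIdiom : SchedWrite;

    // Select a cheaper write class when the second source register is XZR.
    def MyZeroIdiomPred : MCSchedPredicate<CheckRegOperand<2, XZR>>;

    def MyWriteALUVar : SchedWriteVariant<[
      SchedVar<MyZeroIdiomPred, [MyWriteZeroIdiom]>,
      SchedVar<NoSchedPred,     [MyWriteALU]>
    ]>;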
@@ -437,3 +452,102 @@ class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
SchedReadWrite AliasRW = alias;
SchedMachineModel SchedModel = ?;
}
+
+// Allow the definition of processor register files for register renaming
+// purposes.
+//
+// Each processor register file declares:
+// - The set of registers that can be renamed.
+// - The number of physical registers which can be used for register renaming
+// purposes.
+// - The cost of a register rename.
+//
+// The cost of a rename is the number of physical registers allocated by the
+// register alias table to map the new definition. By default, a register can
+// be renamed at the cost of a single physical register. Note that register costs
+// are defined at register class granularity (see field `Costs`).
+//
+// The set of registers that are subject to register renaming is declared using
+// a list of register classes (see field `RegClasses`). An empty list of
+// register classes means: all the logical registers defined by the target can
+// be fully renamed.
+//
+// A register R can be renamed if its register class appears in the `RegClasses`
+// set. When R is written, a new alias is allocated at the cost of one or more
+// physical registers; as a result, false dependencies on R are removed.
+//
+// A sub-register V of register R is implicitly part of the same register file.
+// However, V is only renamed if its register class is part of `RegClasses`.
+// Otherwise, the processor keeps it (as well as any other part
+// of R) together with R, and a write of V always causes a compulsory read of R.
+//
+// This is what happens for example on AMD processors (at least from Bulldozer
+// onwards), where AL and AH are not treated as independent from AX, and AX is
+// not treated as independent from EAX. A write to AL has an implicit false
+// dependency on the last write to EAX (or a portion of EAX). As a consequence,
+// a write to AL cannot go in parallel with a write to AH.
+//
+// There is no false dependency if the partial register write belongs to a
+// register class that is in `RegClasses`.
+// There is also no penalty for writes that "clear the content of a super-register"
+// (see MC/MCInstrAnalysis.h - method MCInstrAnalysis::clearsSuperRegisters()).
+// On x86-64, 32-bit GPR writes implicitly zero the upper half of the underlying
+// physical register, effectively removing any false dependencies with the
+// previous register definition.
+//
+// TODO: This implementation assumes that there is no limit in the number of
+// renames per cycle, which might not be true for all hardware or register
+// classes. Also, there is no limit to how many times the same logical register
+// can be renamed during the same cycle.
+//
+// TODO: we don't currently model merge penalties for the case where a write to
+// a part of a register is followed by a read from a larger part of the same
+// register. On some Intel chips, different parts of a GPR can be stored in
+// different physical registers. However, there is a cost to pay when the
+// partial write is combined with the previous super-register definition. We
+// should add support for these cases, and correctly model merge problems with
+// partial register accesses.
+class RegisterFile<int numPhysRegs, list<RegisterClass> Classes = [],
+ list<int> Costs = []> {
+ list<RegisterClass> RegClasses = Classes;
+ list<int> RegCosts = Costs;
+ int NumPhysRegs = numPhysRegs;
+ SchedMachineModel SchedModel = ?;
+}
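
A minimal sketch, assuming two register classes (GPR64 and VR128) that would be defined elsewhere in the target description:

    // 72 physical integer registers; renaming a GPR64 costs one physical register.
    def MyIntRegFile : RegisterFile<72, [GPR64]>;
    // 60 physical vector registers; renaming a VR128 costs two physical registers.
    def MyVecRegFile : RegisterFile<60, [VR128], [2]>;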
+
+// Describe the retire control unit.
+// A retire control unit specifies the size of the reorder buffer, as well as
+// the maximum number of opcodes that can be retired every cycle.
+// A value less-than-or-equal-to zero for field 'ReorderBufferSize' means: "the
+// size is unknown". The idea is that external tools can fall-back to using
+// field MicroOpBufferSize in SchedModel if the reorder buffer size is unknown.
+// A zero or negative value for field 'MaxRetirePerCycle' means "no
+// restrictions on the number of instructions retired per cycle".
+// Models can optionally specify up to one instance of RetireControlUnit per
+// scheduling model.
+class RetireControlUnit<int bufferSize, int retirePerCycle> {
+ int ReorderBufferSize = bufferSize;
+ int MaxRetirePerCycle = retirePerCycle;
+ SchedMachineModel SchedModel = ?;
+}
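
For example, a hypothetical model with a 192-entry reorder buffer that retires at most four opcodes per cycle could declare:

    def MyRCU : RetireControlUnit<192, 4>;
    // A value <= 0 leaves the corresponding quantity unspecified, as described
    // above, e.g. RetireControlUnit<-1, 0> for "unknown reorder buffer size,
    // unrestricted retire rate".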
+
+// Allow the definition of hardware counters.
+class PfmCounter {
+ SchedMachineModel SchedModel = ?;
+}
+
+// Each processor can define how to measure cycles by defining a
+// PfmCycleCounter.
+class PfmCycleCounter<string counter> : PfmCounter {
+ string Counter = counter;
+}
+
+// Each ProcResourceUnits can define how to measure issued uops by defining
+// a PfmIssueCounter.
+class PfmIssueCounter<ProcResourceUnits resource, list<string> counters>
+ : PfmCounter {
+ // The resource units on which uops are issued.
+ ProcResourceUnits Resource = resource;
+ // The list of counters that measure issue events.
+ list<string> Counters = counters;
+}
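
A rough sketch of how a model might bind these counters; the counter strings depend on the external performance-monitoring library and are invented here:

    def MyCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">;

    def MyALU       : ProcResource<2>;
    def MyALUIssued : PfmIssueCounter<MyALU, ["uops_dispatched:port_0",
                                              "uops_dispatched:port_1"]>;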
diff --git a/contrib/llvm/include/llvm/Target/TargetSelectionDAG.td b/contrib/llvm/include/llvm/Target/TargetSelectionDAG.td
index f6162377b8b7..4ba4d821225d 100644
--- a/contrib/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/contrib/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -485,6 +485,8 @@ def atomic_load_sub : SDNode<"ISD::ATOMIC_LOAD_SUB" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_and : SDNode<"ISD::ATOMIC_LOAD_AND" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def atomic_load_clr : SDNode<"ISD::ATOMIC_LOAD_CLR" , SDTAtomic2,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_or : SDNode<"ISD::ATOMIC_LOAD_OR" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_xor : SDNode<"ISD::ATOMIC_LOAD_XOR" , SDTAtomic2,
@@ -613,14 +615,18 @@ class CodePatPred<code predicate> : PatPred {
// compact and readable.
//
-/// PatFrag - Represents a pattern fragment. This can match something on the
-/// DAG, from a single node to multiple nested other fragments.
+/// PatFrags - Represents a set of pattern fragments. Each single fragment
+/// can match something on the DAG, from a single node to multiple nested other
+/// fragments. The whole set of fragments matches if any of the single
+/// fragments match. This allows e.g. matching an "add with overflow" and
+/// a regular "add" with the same fragment set.
///
-class PatFrag<dag ops, dag frag, code pred = [{}],
- SDNodeXForm xform = NOOP_SDNodeXForm> : SDPatternOperator {
+class PatFrags<dag ops, list<dag> frags, code pred = [{}],
+ SDNodeXForm xform = NOOP_SDNodeXForm> : SDPatternOperator {
dag Operands = ops;
- dag Fragment = frag;
+ list<dag> Fragments = frags;
code PredicateCode = pred;
+ code GISelPredicateCode = [{}];
code ImmediateCode = [{}];
SDNodeXForm OperandTransform = xform;
@@ -679,6 +685,11 @@ class PatFrag<dag ops, dag frag, code pred = [{}],
ValueType ScalarMemoryVT = ?;
}
+// PatFrag - A version of PatFrags matching only a single fragment.
+class PatFrag<dag ops, dag frag, code pred = [{}],
+ SDNodeXForm xform = NOOP_SDNodeXForm>
+ : PatFrags<ops, [frag], pred, xform>;
+
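
As the comment above suggests, a single fragment set can cover both a plain add and the value produced by an overflow-checking add; a hypothetical sketch:

    // Matches either `add` or the (value) result of `saddo` with one fragment set.
    def any_add : PatFrags<(ops node:$lhs, node:$rhs),
                           [(add node:$lhs, node:$rhs),
                            (saddo node:$lhs, node:$rhs)]>;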
// OutPatFrag is a pattern fragment that is used as part of an output pattern
// (not an input pattern). These do not have predicates or transforms, but are
// used to avoid repeated subexpressions in output patterns.
@@ -1130,27 +1141,27 @@ def setne : PatFrag<(ops node:$lhs, node:$rhs),
multiclass binary_atomic_op_ord<SDNode atomic_op> {
def #NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingMonotonic = 1;
}
def #NAME#_acquire : PatFrag<(ops node:$ptr, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquire = 1;
}
def #NAME#_release : PatFrag<(ops node:$ptr, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingRelease = 1;
}
def #NAME#_acq_rel : PatFrag<(ops node:$ptr, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquireRelease = 1;
}
def #NAME#_seq_cst : PatFrag<(ops node:$ptr, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingSequentiallyConsistent = 1;
}
@@ -1158,27 +1169,27 @@ multiclass binary_atomic_op_ord<SDNode atomic_op> {
multiclass ternary_atomic_op_ord<SDNode atomic_op> {
def #NAME#_monotonic : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$cmp, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingMonotonic = 1;
}
def #NAME#_acquire : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$cmp, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquire = 1;
}
def #NAME#_release : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$cmp, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingRelease = 1;
}
def #NAME#_acq_rel : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$cmp, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquireRelease = 1;
}
def #NAME#_seq_cst : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
- (!cast<SDNode>(#NAME) node:$ptr, node:$cmp, node:$val)> {
+ (!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingSequentiallyConsistent = 1;
}
@@ -1244,6 +1255,7 @@ defm atomic_load_add : binary_atomic_op<atomic_load_add>;
defm atomic_swap : binary_atomic_op<atomic_swap>;
defm atomic_load_sub : binary_atomic_op<atomic_load_sub>;
defm atomic_load_and : binary_atomic_op<atomic_load_and>;
+defm atomic_load_clr : binary_atomic_op<atomic_load_clr>;
defm atomic_load_or : binary_atomic_op<atomic_load_or>;
defm atomic_load_xor : binary_atomic_op<atomic_load_xor>;
defm atomic_load_nand : binary_atomic_op<atomic_load_nand>;
diff --git a/contrib/llvm/include/llvm/Testing/Support/Error.h b/contrib/llvm/include/llvm/Testing/Support/Error.h
index 50889b9c66f5..0e5b5403ce87 100644
--- a/contrib/llvm/include/llvm/Testing/Support/Error.h
+++ b/contrib/llvm/include/llvm/Testing/Support/Error.h
@@ -38,7 +38,7 @@ public:
bool MatchAndExplain(const ExpectedHolder<T> &Holder,
testing::MatchResultListener *listener) const override {
- if (!Holder.Success)
+ if (!Holder.Success())
return false;
bool result = Matcher.MatchAndExplain(*Holder.Exp, listener);
@@ -82,6 +82,53 @@ private:
M Matcher;
};
+template <typename InfoT>
+class ErrorMatchesMono : public testing::MatcherInterface<const ErrorHolder &> {
+public:
+ explicit ErrorMatchesMono(Optional<testing::Matcher<InfoT &>> Matcher)
+ : Matcher(std::move(Matcher)) {}
+
+ bool MatchAndExplain(const ErrorHolder &Holder,
+ testing::MatchResultListener *listener) const override {
+ if (Holder.Success())
+ return false;
+
+ if (Holder.Infos.size() > 1) {
+ *listener << "multiple errors";
+ return false;
+ }
+
+ auto &Info = *Holder.Infos[0];
+ if (!Info.isA<InfoT>()) {
+ *listener << "Error was not of given type";
+ return false;
+ }
+
+ if (!Matcher)
+ return true;
+
+ return Matcher->MatchAndExplain(static_cast<InfoT &>(Info), listener);
+ }
+
+ void DescribeTo(std::ostream *OS) const override {
+ *OS << "failed with Error of given type";
+ if (Matcher) {
+ *OS << " and the error ";
+ Matcher->DescribeTo(OS);
+ }
+ }
+
+ void DescribeNegationTo(std::ostream *OS) const override {
+ *OS << "succeeded or did not fail with the error of given type";
+ if (Matcher) {
+ *OS << " or the error ";
+ Matcher->DescribeNegationTo(OS);
+ }
+ }
+
+private:
+ Optional<testing::Matcher<InfoT &>> Matcher;
+};
} // namespace detail
#define EXPECT_THAT_ERROR(Err, Matcher) \
@@ -94,8 +141,19 @@ private:
#define ASSERT_THAT_EXPECTED(Err, Matcher) \
ASSERT_THAT(llvm::detail::TakeExpected(Err), Matcher)
-MATCHER(Succeeded, "") { return arg.Success; }
-MATCHER(Failed, "") { return !arg.Success; }
+MATCHER(Succeeded, "") { return arg.Success(); }
+MATCHER(Failed, "") { return !arg.Success(); }
+
+template <typename InfoT>
+testing::Matcher<const detail::ErrorHolder &> Failed() {
+ return MakeMatcher(new detail::ErrorMatchesMono<InfoT>(None));
+}
+
+template <typename InfoT, typename M>
+testing::Matcher<const detail::ErrorHolder &> Failed(M Matcher) {
+ return MakeMatcher(new detail::ErrorMatchesMono<InfoT>(
+ testing::SafeMatcherCast<InfoT &>(Matcher)));
+}
template <typename M>
detail::ValueMatchesPoly<M> HasValue(M Matcher) {
diff --git a/contrib/llvm/include/llvm/Testing/Support/SupportHelpers.h b/contrib/llvm/include/llvm/Testing/Support/SupportHelpers.h
index d7f0c7142b2c..96264ac81dc4 100644
--- a/contrib/llvm/include/llvm/Testing/Support/SupportHelpers.h
+++ b/contrib/llvm/include/llvm/Testing/Support/SupportHelpers.h
@@ -17,8 +17,9 @@
namespace llvm {
namespace detail {
struct ErrorHolder {
- bool Success;
- std::string Message;
+ std::vector<std::shared_ptr<ErrorInfoBase>> Infos;
+
+ bool Success() const { return Infos.empty(); }
};
template <typename T> struct ExpectedHolder : public ErrorHolder {
@@ -29,15 +30,22 @@ template <typename T> struct ExpectedHolder : public ErrorHolder {
};
inline void PrintTo(const ErrorHolder &Err, std::ostream *Out) {
- *Out << (Err.Success ? "succeeded" : "failed");
- if (!Err.Success) {
- *Out << " (" << StringRef(Err.Message).trim().str() << ")";
+ raw_os_ostream OS(*Out);
+ OS << (Err.Success() ? "succeeded" : "failed");
+ if (!Err.Success()) {
+ const char *Delim = " (";
+ for (const auto &Info : Err.Infos) {
+ OS << Delim;
+ Delim = "; ";
+ Info->log(OS);
+ }
+ OS << ")";
}
}
template <typename T>
void PrintTo(const ExpectedHolder<T> &Item, std::ostream *Out) {
- if (Item.Success) {
+ if (Item.Success()) {
*Out << "succeeded with value " << ::testing::PrintToString(*Item.Exp);
} else {
PrintTo(static_cast<const ErrorHolder &>(Item), Out);
diff --git a/contrib/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h b/contrib/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
new file mode 100644
index 000000000000..f970acdc741f
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
@@ -0,0 +1,41 @@
+//===- AggressiveInstCombine.h - AggressiveInstCombine pass -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides the primary interface to the aggressive instcombine pass.
+/// This pass is suitable for use in the new pass manager. For a pass that works
+/// with the legacy pass manager, please use
+/// \c createAggressiveInstCombinerPass().
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_AGGRESSIVE_INSTCOMBINE_INSTCOMBINE_H
+#define LLVM_TRANSFORMS_AGGRESSIVE_INSTCOMBINE_INSTCOMBINE_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class AggressiveInstCombinePass
+ : public PassInfoMixin<AggressiveInstCombinePass> {
+
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+//===----------------------------------------------------------------------===//
+//
+// AggressiveInstCombiner - Combine expression patterns to form expressions with
+// fewer, simple instructions. This pass does not modify the CFG.
+//
+FunctionPass *createAggressiveInstCombinerPass();
+}
+
+#endif
diff --git a/contrib/llvm/include/llvm/Transforms/IPO.h b/contrib/llvm/include/llvm/Transforms/IPO.h
index ce20a726b783..ebc76bf82118 100644
--- a/contrib/llvm/include/llvm/Transforms/IPO.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO.h
@@ -15,6 +15,7 @@
#ifndef LLVM_TRANSFORMS_IPO_H
#define LLVM_TRANSFORMS_IPO_H
+#include "llvm/ADT/SmallVector.h"
#include <functional>
#include <vector>
@@ -44,10 +45,6 @@ ModulePass *createStripSymbolsPass(bool OnlyDebugInfo = false);
//
ModulePass *createStripNonDebugSymbolsPass();
-/// This function returns a new pass that downgrades the debug info in the
-/// module to line tables only.
-ModulePass *createStripNonLineTableDebugInfoPass();
-
//===----------------------------------------------------------------------===//
//
// This pass removes llvm.dbg.declare intrinsics.
@@ -179,10 +176,13 @@ Pass *createLoopExtractorPass();
///
Pass *createSingleLoopExtractorPass();
-/// createBlockExtractorPass - This pass extracts all blocks (except those
-/// specified in the argument list) from the functions in the module.
+/// createBlockExtractorPass - This pass extracts all the specified blocks
+/// from the functions in the module.
///
ModulePass *createBlockExtractorPass();
+ModulePass *
+createBlockExtractorPass(const SmallVectorImpl<BasicBlock *> &BlocksToExtract,
+ bool EraseFunctions);
/// createStripDeadPrototypesPass - This pass removes any function declarations
/// (prototypes) that are not used.
@@ -207,11 +207,6 @@ ModulePass *createMergeFunctionsPass();
ModulePass *createPartialInliningPass();
//===----------------------------------------------------------------------===//
-// createMetaRenamerPass - Rename everything with metasyntatic names.
-//
-ModulePass *createMetaRenamerPass();
-
-//===----------------------------------------------------------------------===//
/// createBarrierNoopPass - This pass is purely a module pass barrier in a pass
/// manager.
ModulePass *createBarrierNoopPass();
@@ -227,7 +222,7 @@ enum class PassSummaryAction {
Export, ///< Export information to summary.
};
-/// \brief This pass lowers type metadata and the llvm.type.test intrinsic to
+/// This pass lowers type metadata and the llvm.type.test intrinsic to
/// bitsets.
///
/// The behavior depends on the summary arguments:
@@ -240,10 +235,10 @@ enum class PassSummaryAction {
ModulePass *createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
const ModuleSummaryIndex *ImportSummary);
-/// \brief This pass export CFI checks for use by external modules.
+/// This pass export CFI checks for use by external modules.
ModulePass *createCrossDSOCFIPass();
-/// \brief This pass implements whole-program devirtualization using type
+/// This pass implements whole-program devirtualization using type
/// metadata.
///
/// The behavior depends on the summary arguments:
diff --git a/contrib/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h b/contrib/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h
index 15c80357e4a8..b52c0fdbd2c9 100644
--- a/contrib/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h
@@ -27,7 +27,13 @@ namespace llvm {
/// be the simplest possible pass to remove always_inline function definitions'
/// uses by inlining them. The \c GlobalDCE pass can be used to remove these
/// functions once all users are gone.
-struct AlwaysInlinerPass : PassInfoMixin<AlwaysInlinerPass> {
+class AlwaysInlinerPass : public PassInfoMixin<AlwaysInlinerPass> {
+ bool InsertLifetime;
+
+public:
+ AlwaysInlinerPass(bool InsertLifetime = true)
+ : InsertLifetime(InsertLifetime) {}
+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
};
diff --git a/contrib/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h b/contrib/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
index 82ffc69a166e..49ca6cc73393 100644
--- a/contrib/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
@@ -22,7 +22,11 @@ namespace llvm {
/// transform it and all of its callers to replace indirect arguments with
/// direct (by-value) arguments.
class ArgumentPromotionPass : public PassInfoMixin<ArgumentPromotionPass> {
+ unsigned MaxElements;
+
public:
+ ArgumentPromotionPass(unsigned MaxElements = 3u) : MaxElements(MaxElements) {}
+
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
LazyCallGraph &CG, CGSCCUpdateResult &UR);
};
diff --git a/contrib/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/contrib/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 39e5b5c8ae6f..120a34e15933 100644
--- a/contrib/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -33,11 +33,17 @@ class Module;
/// based on the provided summary information.
class FunctionImporter {
public:
- /// Set of functions to import from a source module. Each entry is a map
- /// containing all the functions to import for a source module.
- /// The keys is the GUID identifying a function to import, and the value
- /// is the threshold applied when deciding to import it.
- using FunctionsToImportTy = std::map<GlobalValue::GUID, unsigned>;
+ /// Set of functions to import from a source module. Each entry is a set
+ /// containing all the GUIDs of all functions to import for a source module.
+ using FunctionsToImportTy = std::unordered_set<GlobalValue::GUID>;
+
+ /// Map of callee GUID considered for import into a given module to a pair
+ /// consisting of the largest threshold applied when deciding whether to
+ /// import it and, if we decided to import, a pointer to the summary instance
+ /// imported. If we decided not to import, the summary will be nullptr.
+ using ImportThresholdsTy =
+ DenseMap<GlobalValue::GUID,
+ std::pair<unsigned, const GlobalValueSummary *>>;
/// The map contains an entry for every module to import from, the key being
/// the module identifier to pass to the ModuleLoader. The value is the set of
@@ -107,12 +113,24 @@ void ComputeCrossModuleImportForModuleFromIndex(
StringRef ModulePath, const ModuleSummaryIndex &Index,
FunctionImporter::ImportMapTy &ImportList);
+/// PrevailingType is the return type of the callback passed to
+/// computeDeadSymbols. The Yes and No values are used when the status has been
+/// explicitly set during symbol resolution; otherwise the status is Unknown.
+enum class PrevailingType { Yes, No, Unknown };
+
/// Compute all the symbols that are "dead": i.e. those that can't be reached
/// in the graph from any of the given symbols listed in
-/// \p GUIDPreservedSymbols.
+/// \p GUIDPreservedSymbols. Non-prevailing symbols are symbols without a
+/// prevailing copy anywhere in the IR and are normally dead; the \p isPrevailing
+/// predicate returns the resolution status of a given symbol.
void computeDeadSymbols(
ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols);
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing);
+
+/// Converts value \p GV to a declaration, or replaces it with a declaration if
+/// it is an alias. Returns true if converted, false if replaced.
+bool convertToDeclaration(GlobalValue &GV);
/// Compute the set of summaries needed for a ThinLTO backend compilation of
/// \p ModulePath.
@@ -131,9 +149,9 @@ void gatherImportedSummariesForModule(
std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
/// Emit into \p OutputFilename the files module \p ModulePath will import from.
-std::error_code
-EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
- const FunctionImporter::ImportMapTy &ModuleImports);
+std::error_code EmitImportsFiles(
+ StringRef ModulePath, StringRef OutputFilename,
+ const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
/// Resolve WeakForLinker values in \p TheModule based on the information
/// recorded in the summaries during global summary-based analysis.
diff --git a/contrib/llvm/include/llvm/Transforms/IPO/Inliner.h b/contrib/llvm/include/llvm/Transforms/IPO/Inliner.h
index eda8cf462b50..610e4500e4b1 100644
--- a/contrib/llvm/include/llvm/Transforms/IPO/Inliner.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO/Inliner.h
@@ -96,12 +96,17 @@ class InlinerPass : public PassInfoMixin<InlinerPass> {
public:
InlinerPass(InlineParams Params = getInlineParams())
: Params(std::move(Params)) {}
+ ~InlinerPass();
+ InlinerPass(InlinerPass &&Arg)
+ : Params(std::move(Arg.Params)),
+ ImportedFunctionsStats(std::move(Arg.ImportedFunctionsStats)) {}
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
LazyCallGraph &CG, CGSCCUpdateResult &UR);
private:
InlineParams Params;
+ std::unique_ptr<ImportedFunctionsInliningStatistics> ImportedFunctionsStats;
};
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h b/contrib/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h
index 3bcfe65df550..bc448386b63d 100644
--- a/contrib/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h
@@ -26,6 +26,7 @@
namespace llvm {
class Module;
+class ModuleSummaryIndex;
class raw_ostream;
namespace lowertypetests {
@@ -197,6 +198,11 @@ struct ByteArrayBuilder {
class LowerTypeTestsPass : public PassInfoMixin<LowerTypeTestsPass> {
public:
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+ LowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ExportSummary(ExportSummary), ImportSummary(ImportSummary) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
diff --git a/contrib/llvm/include/llvm/Transforms/SampleProfile.h b/contrib/llvm/include/llvm/Transforms/IPO/SampleProfile.h
index f5a8590e14a1..cd5a0563898e 100644
--- a/contrib/llvm/include/llvm/Transforms/SampleProfile.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO/SampleProfile.h
@@ -1,4 +1,4 @@
-//===- Transforms/SampleProfile.h - SamplePGO pass --------------*- C++ -*-===//
+//===- SampleProfile.h - SamplePGO pass -------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TRANSFORMS_SAMPLEPROFILE_H
-#define LLVM_TRANSFORMS_SAMPLEPROFILE_H
+#ifndef LLVM_TRANSFORMS_IPO_SAMPLEPROFILE_H
+#define LLVM_TRANSFORMS_IPO_SAMPLEPROFILE_H
#include "llvm/IR/PassManager.h"
#include <string>
diff --git a/contrib/llvm/include/llvm/Transforms/IPO/SyntheticCountsPropagation.h b/contrib/llvm/include/llvm/Transforms/IPO/SyntheticCountsPropagation.h
new file mode 100644
index 000000000000..0b3ba86bc9e4
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/IPO/SyntheticCountsPropagation.h
@@ -0,0 +1,19 @@
+#ifndef LLVM_TRANSFORMS_IPO_SYNTHETIC_COUNTS_PROPAGATION_H
+#define LLVM_TRANSFORMS_IPO_SYNTHETIC_COUNTS_PROPAGATION_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/ScaledNumber.h"
+
+namespace llvm {
+class Function;
+class Module;
+
+class SyntheticCountsPropagation
+ : public PassInfoMixin<SyntheticCountsPropagation> {
+public:
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/contrib/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
index 1aa4c6f4f559..bf2c79b0751e 100644
--- a/contrib/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
+++ b/contrib/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
@@ -28,6 +28,7 @@ template <typename T> class ArrayRef;
template <typename T> class MutableArrayRef;
class Function;
class GlobalVariable;
+class ModuleSummaryIndex;
namespace wholeprogramdevirt {
@@ -218,6 +219,13 @@ void setAfterReturnValues(MutableArrayRef<VirtualCallTarget> Targets,
} // end namespace wholeprogramdevirt
struct WholeProgramDevirtPass : public PassInfoMixin<WholeProgramDevirtPass> {
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+ WholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ExportSummary(ExportSummary), ImportSummary(ImportSummary) {
+ assert(!(ExportSummary && ImportSummary));
+ }
PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
};
diff --git a/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombine.h b/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
index 6bd22dc46255..ab25fe08553a 100644
--- a/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
+++ b/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
@@ -10,8 +10,7 @@
///
/// This file provides the primary interface to the instcombine pass. This pass
/// is suitable for use in the new pass manager. For a pass that works with the
-/// legacy pass manager, please look for \c createInstructionCombiningPass() in
-/// Scalar.h.
+/// legacy pass manager, use \c createInstructionCombiningPass().
///
//===----------------------------------------------------------------------===//
@@ -37,7 +36,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief The legacy pass manager's instcombine pass.
+/// The legacy pass manager's instcombine pass.
///
/// This is a basic whole-function wrapper around the instcombine utility. It
/// will try to combine all instructions in the function.
@@ -56,6 +55,20 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnFunction(Function &F) override;
};
+
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions. This pass does not modify the CFG, and has a tendency to make
+// instructions dead, so a subsequent DCE pass is useful.
+//
+// This pass combines things like:
+// %Y = add int 1, %X
+// %Z = add int 1, %Y
+// into:
+// %Z = add int 2, %X
+//
+FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines = true);
}
#endif
diff --git a/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h b/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h
index 271e891bb45e..f860b4b86555 100644
--- a/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h
+++ b/contrib/llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h
@@ -40,7 +40,7 @@ public:
/// in it.
void Add(Instruction *I) {
if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
- DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
Worklist.push_back(I);
}
}
@@ -57,7 +57,8 @@ public:
assert(Worklist.empty() && "Worklist must be empty to add initial group");
Worklist.reserve(List.size()+16);
WorklistMap.reserve(List.size());
- DEBUG(dbgs() << "IC: ADDING: " << List.size() << " instrs to worklist\n");
+ LLVM_DEBUG(dbgs() << "IC: ADDING: " << List.size()
+ << " instrs to worklist\n");
unsigned Idx = 0;
for (Instruction *I : reverse(List)) {
WorklistMap.insert(std::make_pair(I, Idx++));
diff --git a/contrib/llvm/include/llvm/Transforms/Instrumentation.h b/contrib/llvm/include/llvm/Transforms/Instrumentation.h
index b1e13f17aef1..4a346c8d7450 100644
--- a/contrib/llvm/include/llvm/Transforms/Instrumentation.h
+++ b/contrib/llvm/include/llvm/Transforms/Instrumentation.h
@@ -133,7 +133,8 @@ ModulePass *createAddressSanitizerModulePass(bool CompileKernel = false,
FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0,
bool Recover = false);
-FunctionPass *createHWAddressSanitizerPass(bool Recover = false);
+FunctionPass *createHWAddressSanitizerPass(bool CompileKernel = false,
+ bool Recover = false);
// Insert ThreadSanitizer (race detection) instrumentation
FunctionPass *createThreadSanitizerPass();
@@ -186,7 +187,7 @@ struct SanitizerCoverageOptions {
ModulePass *createSanitizerCoverageModulePass(
const SanitizerCoverageOptions &Options = SanitizerCoverageOptions());
-/// \brief Calculate what to divide by to scale counts.
+/// Calculate what to divide by to scale counts.
///
/// Given the maximum count, calculate a divisor that will scale all the
/// weights to strictly less than std::numeric_limits<uint32_t>::max().
@@ -196,7 +197,7 @@ static inline uint64_t calculateCountScale(uint64_t MaxCount) {
: MaxCount / std::numeric_limits<uint32_t>::max() + 1;
}
-/// \brief Scale an individual branch count.
+/// Scale an individual branch count.
///
/// Scale a 64-bit weight down to 32-bits using \c Scale.
///
diff --git a/contrib/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h b/contrib/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h
new file mode 100644
index 000000000000..c06c1a28715e
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h
@@ -0,0 +1,31 @@
+//===- Transforms/Instrumentation/CGProfile.h -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides the interface for LLVM's Call Graph Profile pass.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_CGPROFILE_H
+#define LLVM_TRANSFORMS_CGPROFILE_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class CGProfilePass : public PassInfoMixin<CGProfilePass> {
+public:
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+ void addModuleFlags(
+ Module &M,
+ MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) const;
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_CGPROFILE_H
diff --git a/contrib/llvm/include/llvm/Transforms/GCOVProfiler.h b/contrib/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
index 66bd75c88e24..dd55fbe29eed 100644
--- a/contrib/llvm/include/llvm/Transforms/GCOVProfiler.h
+++ b/contrib/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
@@ -1,4 +1,4 @@
-//===- Transforms/GCOVProfiler.h - GCOVProfiler pass ----------*- C++ -*-===//
+//===- Transforms/Instrumentation/GCOVProfiler.h ----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/include/llvm/Transforms/InstrProfiling.h b/contrib/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h
index 0fe6ad5eeac7..13fb3db4ae6f 100644
--- a/contrib/llvm/include/llvm/Transforms/InstrProfiling.h
+++ b/contrib/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h
@@ -1,4 +1,4 @@
-//===- Transforms/InstrProfiling.h - Instrumentation passes -----*- C++ -*-===//
+//===- Transforms/Instrumentation/InstrProfiling.h --------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -109,7 +109,8 @@ private:
void emitRegistration();
/// Emit the necessary plumbing to pull in the runtime initialization.
- void emitRuntimeHook();
+ /// Returns true if a change was made.
+ bool emitRuntimeHook();
/// Add uses of our data variables and runtime hook.
void emitUses();
diff --git a/contrib/llvm/include/llvm/Transforms/PGOInstrumentation.h b/contrib/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
index c2cc76c422da..c0b37c470b74 100644
--- a/contrib/llvm/include/llvm/Transforms/PGOInstrumentation.h
+++ b/contrib/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
@@ -1,4 +1,4 @@
-//===- Transforms/PGOInstrumentation.h - PGO gen/use passes -----*- C++ -*-===//
+//===- Transforms/Instrumentation/PGOInstrumentation.h ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar.h b/contrib/llvm/include/llvm/Transforms/Scalar.h
index 49186bc5cd66..9491e1bbac93 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar.h
@@ -80,7 +80,6 @@ FunctionPass *createDeadStoreEliminationPass();
// values.
FunctionPass *createCallSiteSplittingPass();
-
//===----------------------------------------------------------------------===//
//
// AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. This
@@ -89,7 +88,6 @@ FunctionPass *createCallSiteSplittingPass();
//
FunctionPass *createAggressiveDCEPass();
-
//===----------------------------------------------------------------------===//
//
// GuardWidening - An optimization over the @llvm.experimental.guard intrinsic
@@ -101,6 +99,16 @@ FunctionPass *createGuardWideningPass();
//===----------------------------------------------------------------------===//
//
+// LoopGuardWidening - Analogous to the GuardWidening pass, but restricted to a
+// single loop at a time for use within a LoopPassManager. The desired effect is
+// to widen guards into the loop preheader or, if that is not possible, into a
+// single guard within the loop.
+//
+Pass *createLoopGuardWideningPass();
+
+
+//===----------------------------------------------------------------------===//
+//
// BitTrackingDCE - This pass uses a bit-tracking DCE algorithm in order to
// remove computations of dead bits.
//
@@ -128,20 +136,6 @@ Pass *createIndVarSimplifyPass();
//===----------------------------------------------------------------------===//
//
-// InstructionCombining - Combine instructions to form fewer, simple
-// instructions. This pass does not modify the CFG, and has a tendency to make
-// instructions dead, so a subsequent DCE pass is useful.
-//
-// This pass combines things like:
-// %Y = add int 1, %X
-// %Z = add int 1, %Y
-// into:
-// %Z = add int 2, %X
-//
-FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines = true);
-
-//===----------------------------------------------------------------------===//
-//
// LICM - This pass is a loop invariant code motion and memory promotion pass.
//
Pass *createLICMPass();
@@ -198,6 +192,12 @@ Pass *createSimpleLoopUnrollPass(int OptLevel = 2);
//===----------------------------------------------------------------------===//
//
+// LoopUnrollAndJam - This pass is a simple loop unroll and jam pass.
+//
+Pass *createLoopUnrollAndJamPass(int OptLevel = 2);
+
+//===----------------------------------------------------------------------===//
+//
// LoopReroll - This pass is a simple loop rerolling pass.
//
Pass *createLoopRerollPass();
@@ -222,20 +222,6 @@ Pass *createLoopVersioningLICMPass();
//===----------------------------------------------------------------------===//
//
-// PromoteMemoryToRegister - This pass is used to promote memory references to
-// be register references. A simple example of the transformation performed by
-// this pass is:
-//
-// FROM CODE TO CODE
-// %X = alloca i32, i32 1 ret i32 42
-// store i32 42, i32 *%X
-// %Y = load i32* %X
-// ret i32 %Y
-//
-FunctionPass *createPromoteMemoryToRegisterPass();
-
-//===----------------------------------------------------------------------===//
-//
// DemoteRegisterToMemoryPass - This pass is used to demote registers to memory
// references. It basically undoes the PromoteMemoryToRegister pass to make CFG
// hacking easier.
@@ -288,31 +274,6 @@ Pass *createStructurizeCFGPass(bool SkipUniformRegions = false);
//===----------------------------------------------------------------------===//
//
-// BreakCriticalEdges - Break all of the critical edges in the CFG by inserting
-// a dummy basic block. This pass may be "required" by passes that cannot deal
-// with critical edges. For this usage, a pass must call:
-//
-// AU.addRequiredID(BreakCriticalEdgesID);
-//
-// This pass obviously invalidates the CFG, but can update forward dominator
-// (set, immediate dominators, tree, and frontier) information.
-//
-FunctionPass *createBreakCriticalEdgesPass();
-extern char &BreakCriticalEdgesID;
-
-//===----------------------------------------------------------------------===//
-//
-// LoopSimplify - Insert Pre-header blocks into the CFG for every function in
-// the module. This pass updates dominator information, loop information, and
-// does not add critical edges to the CFG.
-//
-// AU.addRequiredID(LoopSimplifyID);
-//
-Pass *createLoopSimplifyPass();
-extern char &LoopSimplifyID;
-
-//===----------------------------------------------------------------------===//
-//
// TailCallElimination - This pass eliminates call instructions to the current
// function which occur immediately before return instructions.
//
@@ -320,30 +281,6 @@ FunctionPass *createTailCallEliminationPass();
//===----------------------------------------------------------------------===//
//
-// LowerSwitch - This pass converts SwitchInst instructions into a sequence of
-// chained binary branch instructions.
-//
-FunctionPass *createLowerSwitchPass();
-extern char &LowerSwitchID;
-
-//===----------------------------------------------------------------------===//
-//
-// LowerInvoke - This pass removes invoke instructions, converting them to call
-// instructions.
-//
-FunctionPass *createLowerInvokePass();
-extern char &LowerInvokePassID;
-
-//===----------------------------------------------------------------------===//
-//
-// LCSSA - This pass inserts phi nodes at loop boundaries to simplify other loop
-// optimizations.
-//
-Pass *createLCSSAPass();
-extern char &LCSSAID;
-
-//===----------------------------------------------------------------------===//
-//
// EarlyCSE - This pass performs a simple and fast CSE pass over the dominator
// tree.
//
@@ -405,13 +342,6 @@ FunctionPass *createConstantHoistingPass();
//===----------------------------------------------------------------------===//
//
-// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
-//
-FunctionPass *createInstructionNamerPass();
-extern char &InstructionNamerID;
-
-//===----------------------------------------------------------------------===//
-//
// Sink - Code Sinking
//
FunctionPass *createSinkingPass();
@@ -451,13 +381,6 @@ extern char &InferAddressSpacesID;
//===----------------------------------------------------------------------===//
//
-// InstructionSimplifier - Remove redundant instructions.
-//
-FunctionPass *createInstructionSimplifierPass();
-extern char &InstructionSimplifierID;
-
-//===----------------------------------------------------------------------===//
-//
// LowerExpectIntrinsics - Removes llvm.expect intrinsics and creates
// "block_weights" metadata.
FunctionPass *createLowerExpectIntrinsicPass();
@@ -477,16 +400,9 @@ FunctionPass *createScalarizerPass();
//===----------------------------------------------------------------------===//
//
-// AddDiscriminators - Add DWARF path discriminators to the IR.
-FunctionPass *createAddDiscriminatorsPass();
-
-//===----------------------------------------------------------------------===//
-//
// SeparateConstOffsetFromGEP - Split GEPs for better CSE
//
-FunctionPass *
-createSeparateConstOffsetFromGEPPass(const TargetMachine *TM = nullptr,
- bool LowerGEP = false);
+FunctionPass *createSeparateConstOffsetFromGEPPass(bool LowerGEP = false);
//===----------------------------------------------------------------------===//
//
@@ -525,13 +441,6 @@ ModulePass *createRewriteStatepointsForGCLegacyPass();
//===----------------------------------------------------------------------===//
//
-// StripGCRelocates - Remove GC relocates that have been inserted by
-// RewriteStatepointsForGC. The resulting IR is incorrect, but this is useful
-// for manual inspection.
-FunctionPass *createStripGCRelocatesPass();
-
-//===----------------------------------------------------------------------===//
-//
// Float2Int - Demote floats to ints where possible.
//
FunctionPass *createFloat2IntPass();
@@ -556,13 +465,6 @@ FunctionPass *createLoopLoadEliminationPass();
//===----------------------------------------------------------------------===//
//
-// LoopSimplifyCFG - This pass performs basic CFG simplification on loops,
-// primarily to help other loop passes.
-//
-Pass *createLoopSimplifyCFGPass();
-
-//===----------------------------------------------------------------------===//
-//
// LoopVersioning - Perform loop multi-versioning.
//
FunctionPass *createLoopVersioningPass();
@@ -585,13 +487,10 @@ FunctionPass *createLibCallsShrinkWrapPass();
//===----------------------------------------------------------------------===//
//
-// EntryExitInstrumenter pass - Instrument function entry/exit with calls to
-// mcount(), @__cyg_profile_func_{enter,exit} and the like. There are two
-// variants, intended to run pre- and post-inlining, respectively.
+// LoopSimplifyCFG - This pass performs basic CFG simplification on loops,
+// primarily to help other loop passes.
//
-FunctionPass *createEntryExitInstrumenterPass();
-FunctionPass *createPostInlineEntryExitInstrumenterPass();
-
+Pass *createLoopSimplifyCFGPass();
} // End llvm namespace
#endif
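For orientation, a minimal sketch of how the legacy pass-creation functions declared in this header are typically driven; the pipeline choice and the helper name runScalarPipeline are illustrative assumptions, not part of the change.

// Illustrative sketch only: drive a few of the create* entry points above
// through the legacy pass manager. Assumes an existing llvm::Module M.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

void runScalarPipeline(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createLICMPass());
  PM.add(llvm::createLoopUnrollAndJamPass(/*OptLevel=*/2)); // new in this change
  PM.add(llvm::createLoopGuardWideningPass());              // new in this change
  PM.run(M);
}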
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h b/contrib/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h
index f75dc4dc331d..61975036e9ff 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h
@@ -33,12 +33,6 @@ struct AlignmentFromAssumptionsPass
bool runImpl(Function &F, AssumptionCache &AC, ScalarEvolution *SE_,
DominatorTree *DT_);
- // For memory transfers, we need a common alignment for both the source and
- // destination. If we have a new alignment for only one operand of a transfer
- // instruction, save it in these maps. If we reach the other operand through
- // another assumption later, then we may change the alignment at that point.
- DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments;
-
ScalarEvolution *SE = nullptr;
DominatorTree *DT = nullptr;
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/contrib/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h
index 5ab951a49f2c..b2ca2a1c09ae 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/CallSiteSplitting.h
@@ -21,7 +21,7 @@
namespace llvm {
struct CallSiteSplittingPass : PassInfoMixin<CallSiteSplittingPass> {
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
} // end namespace llvm
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h b/contrib/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
index d3322dc1c414..84589bf4db99 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
@@ -60,7 +60,7 @@ class TargetTransformInfo;
/// clients.
namespace consthoist {
-/// \brief Keeps track of the user of a constant and the operand index where the
+/// Keeps track of the user of a constant and the operand index where the
/// constant is used.
struct ConstantUser {
Instruction *Inst;
@@ -71,7 +71,7 @@ struct ConstantUser {
using ConstantUseListType = SmallVector<ConstantUser, 8>;
-/// \brief Keeps track of a constant candidate and its uses.
+/// Keeps track of a constant candidate and its uses.
struct ConstantCandidate {
ConstantUseListType Uses;
ConstantInt *ConstInt;
@@ -79,14 +79,14 @@ struct ConstantCandidate {
ConstantCandidate(ConstantInt *ConstInt) : ConstInt(ConstInt) {}
- /// \brief Add the user to the use list and update the cost.
+ /// Add the user to the use list and update the cost.
void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) {
CumulativeCost += Cost;
Uses.push_back(ConstantUser(Inst, Idx));
}
};
-/// \brief This represents a constant that has been rebased with respect to a
+/// This represents a constant that has been rebased with respect to a
/// base constant. The difference to the base constant is recorded in Offset.
struct RebasedConstantInfo {
ConstantUseListType Uses;
@@ -98,7 +98,7 @@ struct RebasedConstantInfo {
using RebasedConstantListType = SmallVector<RebasedConstantInfo, 4>;
-/// \brief A base constant and all its rebased constants.
+/// A base constant and all its rebased constants.
struct ConstantInfo {
ConstantInt *BaseConstant;
RebasedConstantListType RebasedConstants;
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h b/contrib/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h
index dca3b2dbf04f..faf03a4ec489 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h
@@ -21,7 +21,7 @@ namespace llvm {
class Function;
-/// \brief A simple and fast domtree-based CSE pass.
+/// A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
@@ -31,7 +31,7 @@ class Function;
struct EarlyCSEPass : PassInfoMixin<EarlyCSEPass> {
EarlyCSEPass(bool UseMemorySSA = false) : UseMemorySSA(UseMemorySSA) {}
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
bool UseMemorySSA;
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/GVN.h b/contrib/llvm/include/llvm/Transforms/Scalar/GVN.h
index 440d3f67c35a..b9de07ec9279 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -69,7 +69,7 @@ class GVN : public PassInfoMixin<GVN> {
public:
struct Expression;
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
/// This removes the specified instruction from
@@ -291,17 +291,17 @@ private:
/// loads are eliminated by the pass.
FunctionPass *createGVNPass(bool NoLoads = false);
-/// \brief A simple and fast domtree-based GVN pass to hoist common expressions
+/// A simple and fast domtree-based GVN pass to hoist common expressions
/// from sibling branches.
struct GVNHoistPass : PassInfoMixin<GVNHoistPass> {
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Uses an "inverted" value numbering to decide the similarity of
+/// Uses an "inverted" value numbering to decide the similarity of
/// expressions and sinks similar expressions into successors.
struct GVNSinkPass : PassInfoMixin<GVNSinkPass> {
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/GVNExpression.h b/contrib/llvm/include/llvm/Transforms/Scalar/GVNExpression.h
index 99dae15a3ac0..8b346969b1e9 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/GVNExpression.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/GVNExpression.h
@@ -159,7 +159,7 @@ public:
return ET > ET_BasicStart && ET < ET_BasicEnd;
}
- /// \brief Swap two operands. Used during GVN to put commutative operands in
+ /// Swap two operands. Used during GVN to put commutative operands in
/// order.
void swapOperands(unsigned First, unsigned Second) {
std::swap(Operands[First], Operands[Second]);
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/InductiveRangeCheckElimination.h b/contrib/llvm/include/llvm/Transforms/Scalar/InductiveRangeCheckElimination.h
new file mode 100644
index 000000000000..311c549b8326
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/InductiveRangeCheckElimination.h
@@ -0,0 +1,31 @@
+//===- InductiveRangeCheckElimination.h - IRCE ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the Inductive Range Check Elimination
+// loop pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_INDUCTIVERANGECHECKELIMINATION_H
+#define LLVM_TRANSFORMS_SCALAR_INDUCTIVERANGECHECKELIMINATION_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+namespace llvm {
+
+class IRCEPass : public PassInfoMixin<IRCEPass> {
+public:
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_INDUCTIVERANGECHECKELIMINATION_H
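As a hedged sketch of intended usage (the helper name addIRCE is an assumption chosen for illustration), the loop pass is scheduled through the function-to-loop adaptor declared in LoopPassManager.h.

// Illustrative sketch only.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"

void addIRCE(llvm::FunctionPassManager &FPM) {
  // Wrap the loop pass so it can run as part of a function pipeline.
  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::IRCEPass()));
}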
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/InstSimplifyPass.h b/contrib/llvm/include/llvm/Transforms/Scalar/InstSimplifyPass.h
new file mode 100644
index 000000000000..da79a13eb7cf
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/InstSimplifyPass.h
@@ -0,0 +1,46 @@
+//===- InstSimplifyPass.h ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Defines passes for running instruction simplification across chunks of IR.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_INSTSIMPLIFYPASS_H
+#define LLVM_TRANSFORMS_UTILS_INSTSIMPLIFYPASS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class FunctionPass;
+
+/// Run instruction simplification across each instruction in the function.
+///
+/// Instruction simplification has useful constraints in some contexts:
+/// - It will never introduce *new* instructions.
+/// - There is no need to iterate to a fixed point.
+///
+/// Many passes use instruction simplification as a library facility, but it may
+/// also be useful (in tests and other contexts) to have access to this very
+/// restricted transform at a pass granularity. However, for a much more
+/// powerful and comprehensive peephole optimization engine, see the
+/// `instcombine` pass instead.
+class InstSimplifyPass : public PassInfoMixin<InstSimplifyPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Create a legacy pass that does instruction simplification on each
+/// instruction in a function.
+FunctionPass *createInstSimplifyLegacyPass();
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_INSTSIMPLIFYPASS_H
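A brief, hedged sketch of how the pass might be scheduled (the helper name addInstSimplify is an illustrative assumption); the legacy accessor declared above serves the old pass manager.

// Illustrative sketch only.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"

void addInstSimplify(llvm::FunctionPassManager &FPM) {
  FPM.addPass(llvm::InstSimplifyPass()); // new pass manager
}
// Legacy pass manager equivalent:
//   PM.add(llvm::createInstSimplifyLegacyPass());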
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/contrib/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index a9466713b8e6..b3493a292498 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -34,6 +34,7 @@ class BinaryOperator;
class BranchInst;
class CmpInst;
class Constant;
+class DeferredDominance;
class Function;
class Instruction;
class IntrinsicInst;
@@ -77,6 +78,7 @@ class JumpThreadingPass : public PassInfoMixin<JumpThreadingPass> {
TargetLibraryInfo *TLI;
LazyValueInfo *LVI;
AliasAnalysis *AA;
+ DeferredDominance *DDT;
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = false;
@@ -107,8 +109,8 @@ public:
// Glue for old PM.
bool runImpl(Function &F, TargetLibraryInfo *TLI_, LazyValueInfo *LVI_,
- AliasAnalysis *AA_, bool HasProfileData_,
- std::unique_ptr<BlockFrequencyInfo> BFI_,
+ AliasAnalysis *AA_, DeferredDominance *DDT_,
+ bool HasProfileData_, std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_);
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h b/contrib/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h
index 5eddd5fdc7e7..e1b33799578b 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h
@@ -15,7 +15,7 @@
namespace llvm {
-/// \brief Printer pass for the \c LoopAccessInfo results.
+/// Printer pass for the \c LoopAccessInfo results.
class LoopAccessInfoPrinterPass
: public PassInfoMixin<LoopAccessInfoPrinterPass> {
raw_ostream &OS;
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h b/contrib/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h
index 12c7a030ff8b..e1ad67ac6fff 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/LoopDataPrefetch.h
@@ -24,7 +24,7 @@ class LoopDataPrefetchPass : public PassInfoMixin<LoopDataPrefetchPass> {
public:
LoopDataPrefetchPass() = default;
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h b/contrib/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
index 56a45ed34178..5f61c39b5530 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
@@ -71,7 +71,7 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
extern template class PassManager<Loop, LoopAnalysisManager,
LoopStandardAnalysisResults &, LPMUpdater &>;
-/// \brief The Loop pass manager.
+/// The Loop pass manager.
///
/// See the documentation for the PassManager template for details. It runs
/// a sequence of Loop passes over each Loop that the manager is run over. This
@@ -253,7 +253,7 @@ private:
: Worklist(Worklist), LAM(LAM) {}
};
-/// \brief Adaptor that maps from a function to its loops.
+/// Adaptor that maps from a function to its loops.
///
/// Designed to allow composition of a LoopPass(Manager) and a
/// FunctionPassManager. Note that if this pass is constructed with a \c
@@ -270,7 +270,7 @@ public:
LoopCanonicalizationFPM.addPass(LCSSAPass());
}
- /// \brief Runs the loop passes across every loop in the function.
+ /// Runs the loop passes across every loop in the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
// Before we even compute any loop analyses, first run a miniature function
// pass pipeline to put loops into their canonical form. Note that we can
@@ -381,7 +381,7 @@ private:
FunctionPassManager LoopCanonicalizationFPM;
};
-/// \brief A function to deduce a loop pass type and wrap it in the templated
+/// A function to deduce a loop pass type and wrap it in the templated
/// adaptor.
template <typename LoopPassT>
FunctionToLoopPassAdaptor<LoopPassT>
@@ -389,7 +389,7 @@ createFunctionToLoopPassAdaptor(LoopPassT Pass, bool DebugLogging = false) {
return FunctionToLoopPassAdaptor<LoopPassT>(std::move(Pass), DebugLogging);
}
-/// \brief Pass for printing a loop's contents as textual IR.
+/// Pass for printing a loop's contents as textual IR.
class PrintLoopPass : public PassInfoMixin<PrintLoopPass> {
raw_ostream &OS;
std::string Banner;
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h b/contrib/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h
new file mode 100644
index 000000000000..fc69aa361059
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h
@@ -0,0 +1,35 @@
+//===- LoopUnrollAndJamPass.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
+#define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Loop;
+struct LoopStandardAnalysisResults;
+class LPMUpdater;
+
+/// A simple loop unroll and jam transformation.
+class LoopUnrollAndJamPass : public PassInfoMixin<LoopUnrollAndJamPass> {
+ const int OptLevel;
+
+public:
+ explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {}
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
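A short sketch, analogous to the loop-pass adaptor usage shown for IRCE above, showing that the optimization level flows through the constructor; the helper name is an assumption for illustration.

// Illustrative sketch only.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"

void addUnrollAndJam(llvm::FunctionPassManager &FPM, int OptLevel) {
  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(
      llvm::LoopUnrollAndJamPass(OptLevel)));
}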
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h b/contrib/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
index ab9dec0311b2..b6ee6523697c 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
@@ -22,7 +22,7 @@
namespace llvm {
struct LowerExpectIntrinsicPass : PassInfoMixin<LowerExpectIntrinsicPass> {
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
///
/// This will lower all of the expect intrinsic calls in this function into
/// branch weight metadata. That metadata will subsequently feed the analysis
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h b/contrib/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h
index 3cad7bb070d0..48df09cdec9e 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
//! \file
-//! \brief This pass performs merges of loads and stores on both sides of a
+//! This pass performs merges of loads and stores on both sides of a
// diamond (hammock). It hoists the loads and sinks the stores.
//
// The algorithm iteratively hoists two loads to the same address out of a
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/NewGVN.h b/contrib/llvm/include/llvm/Transforms/Scalar/NewGVN.h
index 05db25502dc3..3f7541863a19 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/NewGVN.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/NewGVN.h
@@ -23,7 +23,7 @@ class Function;
class NewGVNPass : public PassInfoMixin<NewGVNPass> {
public:
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, AnalysisManager<Function> &AM);
};
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h b/contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h
index 9997dfa5b6f3..ba7586dffd9d 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h
@@ -29,6 +29,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/ValueHandle.h"
+#include <deque>
namespace llvm {
@@ -54,7 +55,7 @@ inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
return LHS.Rank > RHS.Rank; // Sort so that highest rank goes to start.
}
-/// \brief Utility class representing a base and exponent pair which form one
+/// Utility class representing a base and exponent pair which form one
/// factor of some product.
struct Factor {
Value *Base;
@@ -69,9 +70,14 @@ class XorOpnd;
/// Reassociate commutative expressions.
class ReassociatePass : public PassInfoMixin<ReassociatePass> {
+public:
+ using OrderedSet =
+ SetVector<AssertingVH<Instruction>, std::deque<AssertingVH<Instruction>>>;
+
+protected:
DenseMap<BasicBlock *, unsigned> RankMap;
DenseMap<AssertingVH<Value>, unsigned> ValueRankMap;
- SetVector<AssertingVH<Instruction>> RedoInsts;
+ OrderedSet RedoInsts;
// Arbitrary, but prevents quadratic behavior.
static const unsigned GlobalReassociateLimit = 10;
@@ -108,8 +114,7 @@ private:
SmallVectorImpl<reassociate::ValueEntry> &Ops);
Value *RemoveFactorFromExpression(Value *V, Value *Factor);
void EraseInst(Instruction *I);
- void RecursivelyEraseDeadInsts(Instruction *I,
- SetVector<AssertingVH<Instruction>> &Insts);
+ void RecursivelyEraseDeadInsts(Instruction *I, OrderedSet &Insts);
void OptimizeInst(Instruction *I);
Instruction *canonicalizeNegConstExpr(Instruction *I);
void BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT);
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/SCCP.h b/contrib/llvm/include/llvm/Transforms/Scalar/SCCP.h
index b93287fff907..2a294c95a17b 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/SCCP.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/SCCP.h
@@ -21,6 +21,10 @@
#ifndef LLVM_TRANSFORMS_SCALAR_SCCP_H
#define LLVM_TRANSFORMS_SCALAR_SCCP_H
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
@@ -33,6 +37,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+bool runIPSCCP(Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_SCALAR_SCCP_H
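A hedged sketch of calling the newly exported runIPSCCP entry point; constructing the TargetLibraryInfo from the module's triple and the wrapper name runModuleSCCP are assumptions made for illustration.

// Illustrative sketch only: run interprocedural SCCP over a whole module.
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar/SCCP.h"

bool runModuleSCCP(llvm::Module &M) {
  llvm::TargetLibraryInfoImpl TLII(llvm::Triple(M.getTargetTriple()));
  llvm::TargetLibraryInfo TLI(TLII);
  return llvm::runIPSCCP(M, M.getDataLayout(), &TLI);
}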
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/SROA.h b/contrib/llvm/include/llvm/Transforms/Scalar/SROA.h
index 4a321e75c68b..b36c6f492be1 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/SROA.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/SROA.h
@@ -45,7 +45,7 @@ class SROALegacyPass;
} // end namespace sroa
-/// \brief An optimization pass providing Scalar Replacement of Aggregates.
+/// An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
@@ -68,7 +68,7 @@ class SROA : public PassInfoMixin<SROA> {
DominatorTree *DT = nullptr;
AssumptionCache *AC = nullptr;
- /// \brief Worklist of alloca instructions to simplify.
+ /// Worklist of alloca instructions to simplify.
///
/// Each alloca in the function is added to this. Each new alloca formed gets
/// added to it as well to recursively simplify unless that alloca can be
@@ -77,12 +77,12 @@ class SROA : public PassInfoMixin<SROA> {
/// already present to ensure it is re-visited.
SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist;
- /// \brief A collection of instructions to delete.
+ /// A collection of instructions to delete.
/// We try to batch deletions to simplify code and make things a bit more
/// efficient.
SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts;
- /// \brief Post-promotion worklist.
+ /// Post-promotion worklist.
///
/// Sometimes we discover an alloca which has a high probability of becoming
/// viable for SROA after a round of promotion takes place. In those cases,
@@ -92,17 +92,17 @@ class SROA : public PassInfoMixin<SROA> {
/// the event they are deleted.
SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist;
- /// \brief A collection of alloca instructions we can directly promote.
+ /// A collection of alloca instructions we can directly promote.
std::vector<AllocaInst *> PromotableAllocas;
- /// \brief A worklist of PHIs to speculate prior to promoting allocas.
+ /// A worklist of PHIs to speculate prior to promoting allocas.
///
/// All of these PHIs have been checked for the safety of speculation and by
/// being speculated will allow promoting allocas currently in the promotable
/// queue.
SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs;
- /// \brief A worklist of select instructions to speculate prior to promoting
+ /// A worklist of select instructions to speculate prior to promoting
/// allocas.
///
/// All of these select instructions have been checked for the safety of
@@ -113,7 +113,7 @@ class SROA : public PassInfoMixin<SROA> {
public:
SROA() = default;
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
private:
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h b/contrib/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h
index 63bfe6373d04..eed50ec96161 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h
@@ -17,9 +17,9 @@
namespace llvm {
-/// This pass transforms loops that contain branches on loop-invariant
-/// conditions to have multiple loops. For example, it turns the left into the
-/// right code:
+/// This pass transforms loops that contain branches or switches on loop-
+/// invariant conditions to have multiple loops. For example, it turns the left
+/// into the right code:
///
/// for (...) if (lic)
/// A for (...)
@@ -35,6 +35,31 @@ namespace llvm {
/// This pass expects LICM to be run before it to hoist invariant conditions out
/// of the loop, to make the unswitching opportunity obvious.
///
+/// There is a taxonomy of unswitching that we use to classify different forms
+/// of this transformation:
+///
+/// - Trivial unswitching: this is when the condition can be unswitched without
+/// cloning any code from inside the loop. A non-trivial unswitch requires
+/// code duplication.
+///
+/// - Full unswitching: this is when the branch or switch is completely moved
+/// from inside the loop to outside the loop. Partial unswitching removes the
+/// branch from the clone of the loop but must leave a (somewhat simplified)
+/// branch in the original loop. While theoretically partial unswitching can
+/// be done for switches, the requirements are extreme - we need the loop
+/// invariant input to the switch to be sufficient to collapse to a single
+/// successor in each clone.
+///
+/// This pass always does trivial, full unswitching for both branches and
+/// switches. For branches, it also always does trivial, partial unswitching.
+///
+/// If enabled (via the constructor's `NonTrivial` parameter), this pass will
+/// additionally do non-trivial, full unswitching for branches and switches, and
+/// will do non-trivial, partial unswitching for branches.
+///
+/// Because partial unswitching of switches is extremely unlikely to be possible
+/// in practice and significantly complicates the implementation, this pass does
+/// not currently implement that in any mode.
class SimpleLoopUnswitchPass : public PassInfoMixin<SimpleLoopUnswitchPass> {
bool NonTrivial;
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h b/contrib/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h
index 1afb9c7f954f..ce0a35fc06bd 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h
@@ -15,9 +15,9 @@
#ifndef LLVM_TRANSFORMS_SCALAR_SIMPLIFYCFG_H
#define LLVM_TRANSFORMS_SCALAR_SIMPLIFYCFG_H
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/Transforms/Utils/Local.h"
namespace llvm {
@@ -46,7 +46,7 @@ public:
/// Construct a pass with optional optimizations.
SimplifyCFGPass(const SimplifyCFGOptions &PassOptions);
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/contrib/llvm/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h b/contrib/llvm/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h
index f39e03d22d65..4a0bfd754723 100644
--- a/contrib/llvm/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h
+++ b/contrib/llvm/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h
@@ -102,7 +102,7 @@ namespace llvm {
/// here are relatively simple ones around execution and codesize cost, without
/// any need to consider simplifications or other transformations.
struct SpeculateAroundPHIsPass : PassInfoMixin<SpeculateAroundPHIsPass> {
- /// \brief Run the pass over the function.
+ /// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/contrib/llvm/include/llvm/Transforms/Utils.h b/contrib/llvm/include/llvm/Transforms/Utils.h
new file mode 100644
index 000000000000..0d997ce17b83
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Utils.h
@@ -0,0 +1,118 @@
+//===- llvm/Transforms/Utils.h - Utility Transformations --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes for accessor functions that expose passes
+// in the Utils transformations library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_H
+#define LLVM_TRANSFORMS_UTILS_H
+
+namespace llvm {
+
+class ModulePass;
+class FunctionPass;
+class Pass;
+
+//===----------------------------------------------------------------------===//
+// createMetaRenamerPass - Rename everything with metasyntactic names.
+//
+ModulePass *createMetaRenamerPass();
+
+//===----------------------------------------------------------------------===//
+//
+// LowerInvoke - This pass removes invoke instructions, converting them to call
+// instructions.
+//
+FunctionPass *createLowerInvokePass();
+extern char &LowerInvokePassID;
+
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
+//
+FunctionPass *createInstructionNamerPass();
+extern char &InstructionNamerID;
+
+//===----------------------------------------------------------------------===//
+//
+// LowerSwitch - This pass converts SwitchInst instructions into a sequence of
+// chained binary branch instructions.
+//
+FunctionPass *createLowerSwitchPass();
+extern char &LowerSwitchID;
+
+//===----------------------------------------------------------------------===//
+//
+// EntryExitInstrumenter pass - Instrument function entry/exit with calls to
+// mcount(), @__cyg_profile_func_{enter,exit} and the like. There are two
+// variants, intended to run pre- and post-inlining, respectively.
+//
+FunctionPass *createEntryExitInstrumenterPass();
+FunctionPass *createPostInlineEntryExitInstrumenterPass();
+
+//===----------------------------------------------------------------------===//
+//
+// BreakCriticalEdges - Break all of the critical edges in the CFG by inserting
+// a dummy basic block. This pass may be "required" by passes that cannot deal
+// with critical edges. For this usage, a pass must call:
+//
+// AU.addRequiredID(BreakCriticalEdgesID);
+//
+// This pass obviously invalidates the CFG, but can update forward dominator
+// (set, immediate dominators, tree, and frontier) information.
+//
+FunctionPass *createBreakCriticalEdgesPass();
+extern char &BreakCriticalEdgesID;
+
+//===----------------------------------------------------------------------===//
+//
+// LCSSA - This pass inserts phi nodes at loop boundaries to simplify other loop
+// optimizations.
+//
+Pass *createLCSSAPass();
+extern char &LCSSAID;
+
+//===----------------------------------------------------------------------===//
+//
+// AddDiscriminators - Add DWARF path discriminators to the IR.
+FunctionPass *createAddDiscriminatorsPass();
+
+//===----------------------------------------------------------------------===//
+//
+// PromoteMemoryToRegister - This pass is used to promote memory references to
+// be register references. A simple example of the transformation performed by
+// this pass is:
+//
+// FROM CODE TO CODE
+// %X = alloca i32, i32 1 ret i32 42
+// store i32 42, i32 *%X
+// %Y = load i32* %X
+// ret i32 %Y
+//
+FunctionPass *createPromoteMemoryToRegisterPass();
+
+//===----------------------------------------------------------------------===//
+//
+// LoopSimplify - Insert Pre-header blocks into the CFG for every function in
+// the module. This pass updates dominator information, loop information, and
+// does not add critical edges to the CFG.
+//
+// AU.addRequiredID(LoopSimplifyID);
+//
+Pass *createLoopSimplifyPass();
+extern char &LoopSimplifyID;
+
+/// This function returns a new pass that downgrades the debug info in the
+/// module to line tables only.
+ModulePass *createStripNonLineTableDebugInfoPass();
+}
+
+#endif
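For orientation, a minimal legacy-pipeline sketch built from the accessors collected in this new header; the pass selection and the helper name runUtilityPasses are illustrative assumptions.

// Illustrative sketch only.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils.h"

void runUtilityPasses(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createPromoteMemoryToRegisterPass()); // mem2reg
  PM.add(llvm::createLowerSwitchPass());
  PM.add(llvm::createInstructionNamerPass());
  PM.run(M);
}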
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/contrib/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 74f75509f550..3dfc73b64842 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -27,6 +27,7 @@ namespace llvm {
class BlockFrequencyInfo;
class BranchProbabilityInfo;
+class DeferredDominance;
class DominatorTree;
class Function;
class Instruction;
@@ -38,7 +39,7 @@ class TargetLibraryInfo;
class Value;
/// Delete the specified block, which must have no predecessors.
-void DeleteDeadBlock(BasicBlock *BB);
+void DeleteDeadBlock(BasicBlock *BB, DeferredDominance *DDT = nullptr);
/// We know that BB has one predecessor. If there are any single-entry PHI nodes
/// in it, fold them away. This handles the case when all entries to the PHI
@@ -57,7 +58,8 @@ bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI = nullptr);
/// value indicates success or failure.
bool MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT = nullptr,
LoopInfo *LI = nullptr,
- MemoryDependenceResults *MemDep = nullptr);
+ MemoryDependenceResults *MemDep = nullptr,
+ DeferredDominance *DDT = nullptr);
/// Replace all uses of an instruction (specified by BI) with a value, then
/// remove and delete the original instruction.
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/contrib/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
index a067a685b837..bdcdf6f361f2 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -15,6 +15,7 @@
#ifndef LLVM_TRANSFORMS_UTILS_BUILDLIBCALLS_H
#define LLVM_TRANSFORMS_UTILS_BUILDLIBCALLS_H
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IRBuilder.h"
namespace llvm {
@@ -29,6 +30,12 @@ namespace llvm {
/// Returns true if any attributes were set and false otherwise.
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI);
+ /// Check whether the overloaded unary floating point function
+ /// corresponding to \a Ty is available.
+ bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn);
+
/// Return V if it is an i8*, otherwise cast it to i8*.
Value *castToCStr(Value *V, IRBuilder<> &B);
@@ -104,15 +111,54 @@ namespace llvm {
Value *emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI);
- /// Emit a call to the puts function. Str is required to be a pointer and
+ /// Emit a call to the fputc_unlocked function. This assumes that Char is an
+ /// i32, and File is a pointer to FILE.
+ Value *emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI);
+
+ /// Emit a call to the fputs function. Str is required to be a pointer and
/// File is a pointer to FILE.
Value *emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI);
+ /// Emit a call to the fputs_unlocked function. Str is required to be a
+ /// pointer and File is a pointer to FILE.
+ Value *emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI);
+
/// Emit a call to the fwrite function. This assumes that Ptr is a pointer,
/// Size is an 'intptr_t', and File is a pointer to FILE.
Value *emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI);
+
+ /// Emit a call to the malloc function.
+ Value *emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI);
+
+ /// Emit a call to the calloc function.
+ Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
+ IRBuilder<> &B, const TargetLibraryInfo &TLI);
+
+ /// Emit a call to the fwrite_unlocked function. This assumes that Ptr is a
+ /// pointer, Size is an 'intptr_t', N is nmemb and File is a pointer to FILE.
+ Value *emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
+ IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI);
+
+ /// Emit a call to the fgetc_unlocked function. File is a pointer to FILE.
+ Value *emitFGetCUnlocked(Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI);
+
+ /// Emit a call to the fgets_unlocked function. Str is required to be a
+ /// pointer, Size is an i32 and File is a pointer to FILE.
+ Value *emitFGetSUnlocked(Value *Str, Value *Size, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI);
+
+ /// Emit a call to the fread_unlocked function. This assumes that Ptr is a
+ /// pointer, Size is an 'intptr_t', N is nmemb and File is a pointer to FILE.
+ Value *emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
+ IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI);
}
#endif
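A hedged sketch of the intended use of the new hasUnaryFloatFn helper; the sin/sinf/sinl LibFunc triple and the wrapper name canCallSinFor are assumptions chosen for illustration.

// Illustrative sketch only: is an overload of sin() available for Ty?
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"

bool canCallSinFor(llvm::Type *Ty, const llvm::TargetLibraryInfo *TLI) {
  return llvm::hasUnaryFloatFn(TLI, Ty, llvm::LibFunc_sin, llvm::LibFunc_sinf,
                               llvm::LibFunc_sinl);
}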
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/Cloning.h b/contrib/llvm/include/llvm/Transforms/Utils/Cloning.h
index 178bae76cef6..7531fb2d69b3 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -49,15 +49,15 @@ class ReturnInst;
/// Return an exact copy of the specified module
///
-std::unique_ptr<Module> CloneModule(const Module *M);
-std::unique_ptr<Module> CloneModule(const Module *M, ValueToValueMapTy &VMap);
+std::unique_ptr<Module> CloneModule(const Module &M);
+std::unique_ptr<Module> CloneModule(const Module &M, ValueToValueMapTy &VMap);
/// Return a copy of the specified module. The ShouldCloneDefinition function
/// controls whether a specific GlobalValue's definition is cloned. If the
/// function returns false, the module copy will contain an external reference
/// in place of the global definition.
std::unique_ptr<Module>
-CloneModule(const Module *M, ValueToValueMapTy &VMap,
+CloneModule(const Module &M, ValueToValueMapTy &VMap,
function_ref<bool(const GlobalValue *)> ShouldCloneDefinition);
/// ClonedCodeInfo - This struct can be used to capture information about code
@@ -240,7 +240,7 @@ bool InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
AAResults *CalleeAAR = nullptr, bool InsertLifetime = true,
Function *ForwardVarArgsTo = nullptr);
-/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
/// Blocks.
///
/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
@@ -252,7 +252,7 @@ Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
DominatorTree *DT,
SmallVectorImpl<BasicBlock *> &Blocks);
-/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
+/// Remaps instructions in \p Blocks using the mapping in \p VMap.
void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
ValueToValueMapTy &VMap);
@@ -265,7 +265,8 @@ void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
BasicBlock *
DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
Instruction *StopAt,
- ValueToValueMapTy &ValueMapping);
+ ValueToValueMapTy &ValueMapping,
+ DominatorTree *DT = nullptr);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CLONING_H
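A short sketch of the updated CloneModule interface, which now takes the module by reference; the wrapper name cloneIt is an illustrative assumption.

// Illustrative sketch only: clone a module and keep the old-to-new mapping.
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <memory>

std::unique_ptr<llvm::Module> cloneIt(const llvm::Module &M) {
  llvm::ValueToValueMapTy VMap;
  return llvm::CloneModule(M, VMap); // previously took a const Module *
}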
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/contrib/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
index 63d34511102d..fab8334d4c66 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -34,7 +34,7 @@ class Module;
class Type;
class Value;
- /// \brief Utility class for extracting code into a new function.
+ /// Utility class for extracting code into a new function.
///
/// This utility provides a simple interface for extracting some sequence of
/// code into its own function, replacing it with a call to that function. It
@@ -65,20 +65,22 @@ class Value;
Type *RetTy;
public:
- /// \brief Create a code extractor for a sequence of blocks.
+ /// Create a code extractor for a sequence of blocks.
///
/// Given a sequence of basic blocks where the first block in the sequence
/// dominates the rest, prepare a code extractor object for pulling this
/// sequence out into its new function. When a DominatorTree is also given,
/// extra checking and transformations are enabled. If AllowVarArgs is true,
/// vararg functions can be extracted. This is safe, if all vararg handling
- /// code is extracted, including vastart.
+ /// code is extracted, including vastart. If AllowAlloca is true, then
+ /// blocks containing alloca instructions can be extracted; however, the
+ /// code extractor will not validate whether the extraction is legal.
CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT = nullptr,
bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr,
BranchProbabilityInfo *BPI = nullptr,
- bool AllowVarArgs = false);
+ bool AllowVarArgs = false, bool AllowAlloca = false);
- /// \brief Create a code extractor for a loop body.
+ /// Create a code extractor for a loop body.
///
/// Behaves just like the generic code sequence constructor, but uses the
/// block sequence of the loop.
@@ -86,27 +88,19 @@ class Value;
BlockFrequencyInfo *BFI = nullptr,
BranchProbabilityInfo *BPI = nullptr);
- /// \brief Check to see if a block is valid for extraction.
- ///
- /// Blocks containing EHPads, allocas and invokes are not valid. If
- /// AllowVarArgs is true, blocks with vastart can be extracted. This is
- /// safe, if all vararg handling code is extracted, including vastart.
- static bool isBlockValidForExtraction(const BasicBlock &BB,
- bool AllowVarArgs);
-
- /// \brief Perform the extraction, returning the new function.
+ /// Perform the extraction, returning the new function.
///
/// Returns zero when called on a CodeExtractor instance where isEligible
/// returns false.
Function *extractCodeRegion();
- /// \brief Test whether this code extractor is eligible.
+ /// Test whether this code extractor is eligible.
///
/// Based on the blocks used when constructing the code extractor,
/// determine whether it is eligible for extraction.
bool isEligible() const { return !Blocks.empty(); }
- /// \brief Compute the set of input values and output values for the code.
+ /// Compute the set of input values and output values for the code.
///
/// These can be used either when performing the extraction or to evaluate
/// the expected size of a call to the extracted function. Note that this
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/Evaluator.h b/contrib/llvm/include/llvm/Transforms/Utils/Evaluator.h
index 0e987b93177a..9908ae6fd393 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/Evaluator.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/Evaluator.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
@@ -73,6 +74,18 @@ public:
ValueStack.back()[V] = C;
}
+ /// Given a call site, return the callee and the list of its formal arguments.
+ Function *getCalleeWithFormalArgs(CallSite &CS,
+ SmallVector<Constant *, 8> &Formals);
+
+ /// Given a call site and a callee, return the list of the callee's formal
+ /// argument values, converting them when necessary.
+ bool getFormalParams(CallSite &CS, Function *F,
+ SmallVector<Constant *, 8> &Formals);
+
+ /// Cast the call result to the type of the bitcast call expression.
+ Constant *castCallResultIfNeeded(Value *CallExpr, Constant *RV);
+
const DenseMap<Constant*, Constant*> &getMutatedMemory() const {
return MutatedMemory;
}
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h b/contrib/llvm/include/llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h
index b7a3d130aa11..b55a9893bcf7 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h
@@ -22,7 +22,7 @@
namespace llvm {
class Module;
class Function;
-/// \brief Calculate and dump ThinLTO specific inliner stats.
+/// Calculate and dump ThinLTO specific inliner stats.
/// The main statistics are:
/// (1) Number of inlined imported functions,
/// (2) Number of imported functions inlined into importing module (indirect),
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/IntegerDivision.h b/contrib/llvm/include/llvm/Transforms/Utils/IntegerDivision.h
index 0ec3321b9cf8..5d9927eb51b2 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/IntegerDivision.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/IntegerDivision.h
@@ -29,7 +29,7 @@ namespace llvm {
/// e.g. when more information about the operands are known. Implements both
/// 32bit and 64bit scalar division.
///
- /// @brief Replace Rem with generated code.
+ /// Replace Rem with generated code.
bool expandRemainder(BinaryOperator *Rem);
/// Generate code to divide two integers, replacing Div with the generated
@@ -38,7 +38,7 @@ namespace llvm {
/// when more information about the operands are known. Implements both
/// 32bit and 64bit scalar division.
///
- /// @brief Replace Div with generated code.
+ /// Replace Div with generated code.
bool expandDivision(BinaryOperator* Div);
/// Generate code to calculate the remainder of two integers, replacing Rem
@@ -46,26 +46,26 @@ namespace llvm {
/// makes it useful for targets with little or no support for less than
/// 32 bit arithmetic.
///
- /// @brief Replace Rem with generated code.
+ /// Replace Rem with generated code.
bool expandRemainderUpTo32Bits(BinaryOperator *Rem);
/// Generate code to calculate the remainder of two integers, replacing Rem
/// with the generated code. Uses ExpandReminder with a 64bit Rem.
///
- /// @brief Replace Rem with generated code.
+ /// Replace Rem with generated code.
bool expandRemainderUpTo64Bits(BinaryOperator *Rem);
/// Generate code to divide two integers, replacing Div with the generated
/// code. Uses ExpandDivision with a 32bit Div which makes it useful for
/// targets with little or no support for less than 32 bit arithmetic.
///
- /// @brief Replace Rem with generated code.
+ /// Replace Rem with generated code.
bool expandDivisionUpTo32Bits(BinaryOperator *Div);
/// Generate code to divide two integers, replacing Div with the generated
/// code. Uses ExpandDivision with a 64bit Div.
///
- /// @brief Replace Rem with generated code.
+ /// Replace Rem with generated code.
bool expandDivisionUpTo64Bits(BinaryOperator *Div);
} // End llvm namespace
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/Local.h b/contrib/llvm/include/llvm/Transforms/Utils/Local.h
index 01db88bc15c2..b8df32565723 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/Local.h
@@ -16,10 +16,12 @@
#define LLVM_TRANSFORMS_UTILS_LOCAL_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -117,7 +119,8 @@ struct SimplifyCFGOptions {
/// conditions and indirectbr addresses this might make dead if
/// DeleteDeadConditions is true.
bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false,
- const TargetLibraryInfo *TLI = nullptr);
+ const TargetLibraryInfo *TLI = nullptr,
+ DeferredDominance *DDT = nullptr);
//===----------------------------------------------------------------------===//
// Local dead code elimination.
@@ -140,6 +143,18 @@ bool wouldInstructionBeTriviallyDead(Instruction *I,
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V,
const TargetLibraryInfo *TLI = nullptr);
+/// Delete all of the instructions in `DeadInsts`, and all other instructions
+/// that deleting these in turn causes to be trivially dead.
+///
+/// The initial instructions in the provided vector must all have empty use
+/// lists and satisfy `isInstructionTriviallyDead`.
+///
+/// `DeadInsts` will be used as scratch storage for this routine and will be
+/// empty afterward.
+void RecursivelyDeleteTriviallyDeadInstructions(
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ const TargetLibraryInfo *TLI = nullptr);
+
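As a hedged sketch of how the new batched overload is meant to be used (the wrapper name deleteDeadInsts is an assumption): collect instructions that are already trivially dead, then hand the vector to the routine, which also uses it as scratch storage.

// Illustrative sketch only.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Transforms/Utils/Local.h"

void deleteDeadInsts(llvm::Function &F, const llvm::TargetLibraryInfo *TLI) {
  llvm::SmallVector<llvm::Instruction *, 8> DeadInsts;
  for (llvm::Instruction &I : llvm::instructions(F))
    if (llvm::isInstructionTriviallyDead(&I, TLI))
      DeadInsts.push_back(&I);
  // DeadInsts is consumed as scratch storage and is empty afterward.
  llvm::RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
}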
/// If the specified value is an effectively dead PHI node, due to being a
/// def-use chain of single-use nodes that either forms a cycle or is terminated
/// by a trivially dead instruction, delete it. If that makes any of its
@@ -171,18 +186,21 @@ bool SimplifyInstructionsInBlock(BasicBlock *BB,
///
/// .. and delete the predecessor corresponding to the '1', this will attempt to
/// recursively fold the 'and' to 0.
-void RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred);
+void RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
+ DeferredDominance *DDT = nullptr);
/// BB is a block with one predecessor and its predecessor is known to have one
/// successor (BB!). Eliminate the edge between them, moving the instructions in
/// the predecessor into BB. This deletes the predecessor block.
-void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, DominatorTree *DT = nullptr);
+void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, DominatorTree *DT = nullptr,
+ DeferredDominance *DDT = nullptr);
/// BB is known to contain an unconditional branch, and contains no instructions
/// other than PHI nodes, potential debug intrinsics and the branch. If
/// possible, eliminate BB by rewriting all the predecessors to branch to the
/// successor block and return true. If we can't transform, return false.
-bool TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB);
+bool TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
+ DeferredDominance *DDT = nullptr);
/// Check for and eliminate duplicate PHI nodes in this block. This doesn't try
/// to be clever about PHI nodes which differ only in the order of the incoming
@@ -246,72 +264,6 @@ inline unsigned getKnownAlignment(Value *V, const DataLayout &DL,
return getOrEnforceKnownAlignment(V, 0, DL, CxtI, AC, DT);
}
-/// Given a getelementptr instruction/constantexpr, emit the code necessary to
-/// compute the offset from the base pointer (without adding in the base
-/// pointer). Return the result as a signed integer of intptr size.
-/// When NoAssumptions is true, no assumptions about index computation not
-/// overflowing is made.
-template <typename IRBuilderTy>
-Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP,
- bool NoAssumptions = false) {
- GEPOperator *GEPOp = cast<GEPOperator>(GEP);
- Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
- Value *Result = Constant::getNullValue(IntPtrTy);
-
- // If the GEP is inbounds, we know that none of the addressing operations will
- // overflow in an unsigned sense.
- bool isInBounds = GEPOp->isInBounds() && !NoAssumptions;
-
- // Build a mask for high order bits.
- unsigned IntPtrWidth = IntPtrTy->getScalarType()->getIntegerBitWidth();
- uint64_t PtrSizeMask =
- std::numeric_limits<uint64_t>::max() >> (64 - IntPtrWidth);
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
- ++i, ++GTI) {
- Value *Op = *i;
- uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
- if (Constant *OpC = dyn_cast<Constant>(Op)) {
- if (OpC->isZeroValue())
- continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- if (OpC->getType()->isVectorTy())
- OpC = OpC->getSplatValue();
-
- uint64_t OpValue = cast<ConstantInt>(OpC)->getZExtValue();
- Size = DL.getStructLayout(STy)->getElementOffset(OpValue);
-
- if (Size)
- Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size),
- GEP->getName()+".offs");
- continue;
- }
-
- Constant *Scale = ConstantInt::get(IntPtrTy, Size);
- Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
- Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/);
- // Emit an add instruction.
- Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs");
- continue;
- }
- // Convert to correct type.
- if (Op->getType() != IntPtrTy)
- Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c");
- if (Size != 1) {
- // We'll let instcombine(mul) convert this to a shl if possible.
- Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size),
- GEP->getName()+".idx", isInBounds /*NUW*/);
- }
-
- // Emit an add instruction.
- Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs");
- }
- return Result;
-}
-
///===---------------------------------------------------------------------===//
/// Dbg Intrinsic utilities
///
@@ -335,6 +287,10 @@ void ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
/// llvm.dbg.value intrinsics.
bool LowerDbgDeclare(Function &F);
+/// Propagate dbg.value intrinsics through the newly inserted PHIs.
+void insertDebugValuesForPHIs(BasicBlock *BB,
+ SmallVectorImpl<PHINode *> &InsertedPHIs);
+
/// Finds all intrinsics declaring local variables as living in the memory that
/// 'V' points to. This may include a mix of dbg.declare and
/// dbg.addr intrinsics.
@@ -343,6 +299,9 @@ TinyPtrVector<DbgInfoIntrinsic *> FindDbgAddrUses(Value *V);
/// Finds the llvm.dbg.value intrinsics describing a value.
void findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V);
+/// Finds the debug info intrinsics describing a value.
+void findDbgUsers(SmallVectorImpl<DbgInfoIntrinsic *> &DbgInsts, Value *V);
+
/// Replaces llvm.dbg.declare instruction when the address it
/// describes is replaced with a new value. If Deref is true, an
/// additional DW_OP_deref is prepended to the expression. If Offset
@@ -357,7 +316,7 @@ bool replaceDbgDeclare(Value *Address, Value *NewAddress,
/// DW_OP_deref is prepended to the expression. If Offset is non-zero,
/// a constant displacement is added to the expression (between the
/// optional Deref operations). Offset can be negative. The new
-/// llvm.dbg.declare is inserted immediately before AI.
+/// llvm.dbg.declare is inserted immediately after AI.
bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
DIBuilder &Builder, bool DerefBefore,
int Offset, bool DerefAfter);
@@ -370,10 +329,27 @@ bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
void replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
DIBuilder &Builder, int Offset = 0);
-/// Assuming the instruction \p I is going to be deleted, attempt to salvage any
-/// dbg.value intrinsics referring to \p I by rewriting its effect into a
-/// DIExpression.
-void salvageDebugInfo(Instruction &I);
+/// Assuming the instruction \p I is going to be deleted, attempt to salvage
+/// debug users of \p I by writing the effect of \p I in a DIExpression.
+/// Returns true if any debug users were updated.
+bool salvageDebugInfo(Instruction &I);
+
+/// Point debug users of \p From to \p To or salvage them. Use this function
+/// only when replacing all uses of \p From with \p To, with a guarantee that
+/// \p From is going to be deleted.
+///
+/// Follow these rules to prevent use-before-def of \p To:
+/// . If \p To is a linked Instruction, set \p DomPoint to \p To.
+/// . If \p To is an unlinked Instruction, set \p DomPoint to the Instruction
+/// \p To will be inserted after.
+/// . If \p To is not an Instruction (e.g. a Constant), the choice of
+/// \p DomPoint is arbitrary. Pick \p From for simplicity.
+///
+/// If a debug user cannot be preserved without reordering variable updates or
+/// introducing a use-before-def, it is either salvaged (\ref salvageDebugInfo)
+/// or deleted. Returns true if any debug users were updated.
+bool replaceAllDbgUsesWith(Instruction &From, Value &To, Instruction &DomPoint,
+ DominatorTree &DT);
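
A minimal sketch of the intended call pattern for the declaration above, assuming these utilities live in llvm/Transforms/Utils/Local.h as in upstream LLVM and that the replacement instruction is already linked into the function (so, per the first rule above, it can serve as its own DomPoint). The helper name is illustrative only:

    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    // OldInst is about to be deleted in favour of NewInst, which is already
    // inserted into the function, so NewInst itself is a valid DomPoint.
    static void replaceAndMigrateDebugUses(Instruction *OldInst,
                                           Instruction *NewInst,
                                           DominatorTree &DT) {
      // Repoint (or salvage, or drop) every debug intrinsic that used OldInst.
      replaceAllDbgUsesWith(*OldInst, *NewInst, *NewInst, DT);
      // Then perform the ordinary SSA replacement and delete the instruction.
      OldInst->replaceAllUsesWith(NewInst);
      OldInst->eraseFromParent();
    }
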
/// Remove all instructions from a basic block other than it's terminator
/// and any present EH pad instructions.
@@ -382,7 +358,8 @@ unsigned removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB);
/// Insert an unreachable instruction before the specified
/// instruction, making it and the rest of the code in the block dead.
unsigned changeToUnreachable(Instruction *I, bool UseLLVMTrap,
- bool PreserveLCSSA = false);
+ bool PreserveLCSSA = false,
+ DeferredDominance *DDT = nullptr);
/// Convert the CallInst to InvokeInst with the specified unwind edge basic
/// block. This also splits the basic block where CI is located, because
@@ -397,12 +374,13 @@ BasicBlock *changeToInvokeAndSplitBasicBlock(CallInst *CI,
///
/// \param BB Block whose terminator will be replaced. Its terminator must
/// have an unwind successor.
-void removeUnwindEdge(BasicBlock *BB);
+void removeUnwindEdge(BasicBlock *BB, DeferredDominance *DDT = nullptr);
/// Remove all blocks that can not be reached from the function's entry.
///
/// Returns true if any basic block was removed.
-bool removeUnreachableBlocks(Function &F, LazyValueInfo *LVI = nullptr);
+bool removeUnreachableBlocks(Function &F, LazyValueInfo *LVI = nullptr,
+ DeferredDominance *DDT = nullptr);
/// Combine the metadata of two instructions so that K can replace J
///
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h b/contrib/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h
new file mode 100644
index 000000000000..231e5bbb6dee
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h
@@ -0,0 +1,40 @@
+//===- LoopRotationUtils.h - Utilities to perform loop rotation -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utilities to convert a loop into a loop with bottom test.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_LOOPROTATIONUTILS_H
+#define LLVM_TRANSFORMS_UTILS_LOOPROTATIONUTILS_H
+
+namespace llvm {
+
+class AssumptionCache;
+class DominatorTree;
+class Loop;
+class LoopInfo;
+class ScalarEvolution;
+struct SimplifyQuery;
+class TargetTransformInfo;
+
+/// Convert a loop into a loop with bottom test. It may
+/// perform loop latch simplification as well if the flag RotationOnly
+/// is false. The flag Threshold represents the size threshold of the loop
+/// header. If the loop header's size exceeds the threshold, the loop rotation
+/// will give up. The flag IsUtilMode controls the heuristic used in the
+/// LoopRotation. If it is true, the profitability heuristic will be ignored.
+bool LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
+ AssumptionCache *AC, DominatorTree *DT, ScalarEvolution *SE,
+ const SimplifyQuery &SQ, bool RotationOnly,
+ unsigned Threshold, bool IsUtilMode);
+
+} // namespace llvm
+
+#endif
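
A minimal usage sketch of the utility declared above, assuming a caller that already has the standard loop analyses on hand; the wrapper name and the header-size threshold of 16 are arbitrary illustration values, not defaults taken from the patch:

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/InstructionSimplify.h" // SimplifyQuery
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/LoopRotationUtils.h"

    using namespace llvm;

    static bool rotateForUtility(Loop *L, LoopInfo *LI, DominatorTree *DT,
                                 ScalarEvolution *SE, AssumptionCache *AC,
                                 const TargetTransformInfo *TTI,
                                 const DataLayout &DL) {
      SimplifyQuery SQ(DL);
      // Utility mode: ignore the profitability heuristic. Rotation only, so
      // the latch is left alone; give up if the header exceeds 16 instructions.
      return LoopRotation(L, LI, TTI, AC, DT, SE, SQ,
                          /*RotationOnly=*/true, /*Threshold=*/16,
                          /*IsUtilMode=*/true);
    }
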
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/LoopSimplify.h b/contrib/llvm/include/llvm/Transforms/Utils/LoopSimplify.h
index f3828bc16e2f..166da2738ffd 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/LoopSimplify.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/LoopSimplify.h
@@ -52,7 +52,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Simplify each loop in a loop nest recursively.
+/// Simplify each loop in a loop nest recursively.
///
/// This takes a potentially un-simplified loop L (and its children) and turns
/// it into a simplified loop nest with preheaders and single backedges. It will
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/contrib/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index fb53647112f9..eb4c99102a63 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -23,6 +23,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -47,17 +48,6 @@ class SCEV;
class TargetLibraryInfo;
class TargetTransformInfo;
-/// \brief Captures loop safety information.
-/// It keep information for loop & its header may throw exception.
-struct LoopSafetyInfo {
- bool MayThrow = false; // The current loop contains an instruction which
- // may throw.
- bool HeaderMayThrow = false; // Same as previous, but specific to loop header
- // Used to update funclet bundle operands.
- DenseMap<BasicBlock *, ColorVector> BlockColors;
-
- LoopSafetyInfo() = default;
-};
/// The RecurrenceDescriptor is used to identify recurrences variables in a
/// loop. Reduction is a special case of recurrence that has uses of the
@@ -299,16 +289,16 @@ public:
/// induction, the induction descriptor \p D will contain the data describing
/// this induction. If by some other means the caller has a better SCEV
/// expression for \p Phi than the one returned by the ScalarEvolution
- /// analysis, it can be passed through \p Expr. If the def-use chain
+ /// analysis, it can be passed through \p Expr. If the def-use chain
/// associated with the phi includes casts (that we know we can ignore
/// under proper runtime checks), they are passed through \p CastsToIgnore.
- static bool
+ static bool
isInductionPHI(PHINode *Phi, const Loop* L, ScalarEvolution *SE,
InductionDescriptor &D, const SCEV *Expr = nullptr,
SmallVectorImpl<Instruction *> *CastsToIgnore = nullptr);
/// Returns true if \p Phi is a floating point induction in the loop \p L.
- /// If \p Phi is an induction, the induction descriptor \p D will contain
+ /// If \p Phi is an induction, the induction descriptor \p D will contain
/// the data describing this induction.
static bool isFPInductionPHI(PHINode *Phi, const Loop* L,
ScalarEvolution *SE, InductionDescriptor &D);
@@ -344,11 +334,11 @@ public:
Instruction::BinaryOpsEnd;
}
- /// Returns a reference to the type cast instructions in the induction
+ /// Returns a reference to the type cast instructions in the induction
/// update chain, that are redundant when guarded with a runtime
/// SCEV overflow check.
- const SmallVectorImpl<Instruction *> &getCastInsts() const {
- return RedundantCasts;
+ const SmallVectorImpl<Instruction *> &getCastInsts() const {
+ return RedundantCasts;
}
private:
@@ -395,7 +385,7 @@ bool formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
bool formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
DominatorTree &DT, LoopInfo &LI);
-/// \brief Put loop into LCSSA form.
+/// Put loop into LCSSA form.
///
/// Looks at all instructions in the loop which have uses outside of the
/// current loop. For each, an LCSSA PHI node is inserted and the uses outside
@@ -408,7 +398,7 @@ bool formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
/// Returns true if any modifications are made to the loop.
bool formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE);
-/// \brief Put a loop nest into LCSSA form.
+/// Put a loop nest into LCSSA form.
///
/// This recursively forms LCSSA for a loop nest.
///
@@ -420,7 +410,7 @@ bool formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE);
bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution *SE);
-/// \brief Walk the specified region of the CFG (defined by all blocks
+/// Walk the specified region of the CFG (defined by all blocks
/// dominated by the specified block, and that are in the current loop) in
/// reverse depth first order w.r.t the DominatorTree. This allows us to visit
/// uses before definitions, allowing us to sink a loop body in one pass without
@@ -433,7 +423,7 @@ bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
AliasSetTracker *, LoopSafetyInfo *,
OptimizationRemarkEmitter *ORE);
-/// \brief Walk the specified region of the CFG (defined by all blocks
+/// Walk the specified region of the CFG (defined by all blocks
/// dominated by the specified block, and that are in the current loop) in depth
/// first order w.r.t the DominatorTree. This allows us to visit definitions
/// before uses, allowing us to hoist a loop body in one pass without iteration.
@@ -459,7 +449,7 @@ bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
void deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
LoopInfo *LI);
-/// \brief Try to promote memory values to scalars by sinking stores out of
+/// Try to promote memory values to scalars by sinking stores out of
/// the loop and moving loads to before the loop. We do this by looping over
/// the stores in the loop, looking for stores to Must pointers which are
/// loop invariant. It takes a set of must-alias values, Loop exit blocks
@@ -480,22 +470,10 @@ bool promoteLoopAccessesToScalars(const SmallSetVector<Value *, 8> &,
SmallVector<DomTreeNode *, 16> collectChildrenInLoop(DomTreeNode *N,
const Loop *CurLoop);
-/// \brief Computes safety information for a loop
-/// checks loop body & header for the possibility of may throw
-/// exception, it takes LoopSafetyInfo and loop as argument.
-/// Updates safety information in LoopSafetyInfo argument.
-void computeLoopSafetyInfo(LoopSafetyInfo *, Loop *);
-
-/// Returns true if the instruction in a loop is guaranteed to execute at least
-/// once.
-bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT,
- const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo);
-
-/// \brief Returns the instructions that use values defined in the loop.
+/// Returns the instructions that use values defined in the loop.
SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L);
-/// \brief Find string metadata for loop
+/// Find string metadata for loop
///
/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
/// operand or null otherwise. If the string metadata is not found return
@@ -503,11 +481,11 @@ SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L);
Optional<const MDOperand *> findStringMetadataForLoop(Loop *TheLoop,
StringRef Name);
-/// \brief Set input string into loop metadata by keeping other values intact.
+/// Set input string into loop metadata by keeping other values intact.
void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
unsigned V = 0);
-/// \brief Get a loop's estimated trip count based on branch weight metadata.
+/// Get a loop's estimated trip count based on branch weight metadata.
/// Returns 0 when the count is estimated to be 0, or None when a meaningful
/// estimate can not be made.
Optional<unsigned> getLoopEstimatedTripCount(Loop *L);
@@ -531,11 +509,18 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE = nullptr);
+/// Generates an ordered vector reduction using extracts to reduce the value.
+Value *
+getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RecurrenceDescriptor::MRK_Invalid,
+ ArrayRef<Value *> RedOps = None);
+
/// Generates a vector reduction using shufflevectors to reduce the value.
Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind
MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
- ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+ ArrayRef<Value *> RedOps = None);
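
The difference between the two helpers above can be illustrated with a purely conceptual scalar sketch (plain C++, not the IR the helpers actually emit): an ordered reduction adds the source elements into the accumulator one at a time, preserving evaluation order, while a shuffle reduction folds the upper half of the vector onto the lower half log2(N) times. The sketch assumes a power-of-two element count:

    #include <cstddef>
    #include <vector>

    static float orderedReduce(const std::vector<float> &V, float Acc) {
      for (float X : V) // one in-order add per element, as getOrderedReduction does
        Acc += X;
      return Acc;
    }

    static float shuffleReduce(std::vector<float> V) {
      // Fold the upper half onto the lower half until one lane remains,
      // mirroring the shufflevector tree built by getShuffleReduction.
      for (size_t Half = V.size() / 2; Half >= 1; Half /= 2)
        for (size_t I = 0; I < Half; ++I)
          V[I] += V[I + Half];
      return V[0]; // lane 0 holds the reduced value
    }
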
/// Create a target reduction of the given vector. The reduction operation
/// is described by the \p Opcode parameter. min/max reductions require
@@ -547,7 +532,7 @@ createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
unsigned Opcode, Value *Src,
TargetTransformInfo::ReductionFlags Flags =
TargetTransformInfo::ReductionFlags(),
- ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+ ArrayRef<Value *> RedOps = None);
/// Create a generic target reduction using a recurrence descriptor \p Desc
/// The target is queried to determine if intrinsics or shuffle sequences are
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/LoopVersioning.h b/contrib/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
index fa5d7845d080..fcd734b37a1f 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
@@ -28,14 +28,14 @@ class LoopAccessInfo;
class LoopInfo;
class ScalarEvolution;
-/// \brief This class emits a version of the loop where run-time checks ensure
+/// This class emits a version of the loop where run-time checks ensure
/// that may-alias pointers can't overlap.
///
/// It currently only supports single-exit loops and assumes that the loop
/// already has a preheader.
class LoopVersioning {
public:
- /// \brief Expects LoopAccessInfo, Loop, LoopInfo, DominatorTree as input.
+ /// Expects LoopAccessInfo, Loop, LoopInfo, DominatorTree as input.
/// It uses runtime check provided by the user. If \p UseLAIChecks is true,
/// we will retain the default checks made by LAI. Otherwise, construct an
/// object having no checks and we expect the user to add them.
@@ -43,7 +43,7 @@ public:
DominatorTree *DT, ScalarEvolution *SE,
bool UseLAIChecks = true);
- /// \brief Performs the CFG manipulation part of versioning the loop including
+ /// Performs the CFG manipulation part of versioning the loop including
/// the DominatorTree and LoopInfo updates.
///
/// The loop that was used to construct the class will be the "versioned" loop
@@ -58,38 +58,38 @@ public:
/// transform L
void versionLoop() { versionLoop(findDefsUsedOutsideOfLoop(VersionedLoop)); }
- /// \brief Same but if the client has already precomputed the set of values
+ /// Same but if the client has already precomputed the set of values
/// used outside the loop, this API will allows passing that.
void versionLoop(const SmallVectorImpl<Instruction *> &DefsUsedOutside);
- /// \brief Returns the versioned loop. Control flows here if pointers in the
+ /// Returns the versioned loop. Control flows here if pointers in the
/// loop don't alias (i.e. all memchecks passed). (This loop is actually the
/// same as the original loop that we got constructed with.)
Loop *getVersionedLoop() { return VersionedLoop; }
- /// \brief Returns the fall-back loop. Control flows here if pointers in the
+ /// Returns the fall-back loop. Control flows here if pointers in the
/// loop may alias (i.e. one of the memchecks failed).
Loop *getNonVersionedLoop() { return NonVersionedLoop; }
- /// \brief Sets the runtime alias checks for versioning the loop.
+ /// Sets the runtime alias checks for versioning the loop.
void setAliasChecks(
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks);
- /// \brief Sets the runtime SCEV checks for versioning the loop.
+ /// Sets the runtime SCEV checks for versioning the loop.
void setSCEVChecks(SCEVUnionPredicate Check);
- /// \brief Annotate memory instructions in the versioned loop with no-alias
+ /// Annotate memory instructions in the versioned loop with no-alias
/// metadata based on the memchecks issued.
///
/// This is just wrapper that calls prepareNoAliasMetadata and
/// annotateInstWithNoAlias on the instructions of the versioned loop.
void annotateLoopWithNoAlias();
- /// \brief Set up the aliasing scopes based on the memchecks. This needs to
+ /// Set up the aliasing scopes based on the memchecks. This needs to
/// be called before the first call to annotateInstWithNoAlias.
void prepareNoAliasMetadata();
- /// \brief Add the noalias annotations to \p VersionedInst.
+ /// Add the noalias annotations to \p VersionedInst.
///
/// \p OrigInst is the instruction corresponding to \p VersionedInst in the
/// original loop. Initialize the aliasing scopes with
@@ -98,50 +98,50 @@ public:
const Instruction *OrigInst);
private:
- /// \brief Adds the necessary PHI nodes for the versioned loops based on the
+ /// Adds the necessary PHI nodes for the versioned loops based on the
/// loop-defined values used outside of the loop.
///
/// This needs to be called after versionLoop if there are defs in the loop
/// that are used outside the loop.
void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside);
- /// \brief Add the noalias annotations to \p I. Initialize the aliasing
+ /// Add the noalias annotations to \p I. Initialize the aliasing
/// scopes with prepareNoAliasMetadata once before this can be called.
void annotateInstWithNoAlias(Instruction *I) {
annotateInstWithNoAlias(I, I);
}
- /// \brief The original loop. This becomes the "versioned" one. I.e.,
+ /// The original loop. This becomes the "versioned" one. I.e.,
/// control flows here if pointers in the loop don't alias.
Loop *VersionedLoop;
- /// \brief The fall-back loop. I.e. control flows here if pointers in the
+ /// The fall-back loop. I.e. control flows here if pointers in the
/// loop may alias (memchecks failed).
Loop *NonVersionedLoop;
- /// \brief This maps the instructions from VersionedLoop to their counterpart
+ /// This maps the instructions from VersionedLoop to their counterpart
/// in NonVersionedLoop.
ValueToValueMapTy VMap;
- /// \brief The set of alias checks that we are versioning for.
+ /// The set of alias checks that we are versioning for.
SmallVector<RuntimePointerChecking::PointerCheck, 4> AliasChecks;
- /// \brief The set of SCEV checks that we are versioning for.
+ /// The set of SCEV checks that we are versioning for.
SCEVUnionPredicate Preds;
- /// \brief Maps a pointer to the pointer checking group that the pointer
+ /// Maps a pointer to the pointer checking group that the pointer
/// belongs to.
DenseMap<const Value *, const RuntimePointerChecking::CheckingPtrGroup *>
PtrToGroup;
- /// \brief The alias scope corresponding to a pointer checking group.
+ /// The alias scope corresponding to a pointer checking group.
DenseMap<const RuntimePointerChecking::CheckingPtrGroup *, MDNode *>
GroupToScope;
- /// \brief The list of alias scopes that a pointer checking group can't alias.
+ /// The list of alias scopes that a pointer checking group can't alias.
DenseMap<const RuntimePointerChecking::CheckingPtrGroup *, MDNode *>
GroupToNonAliasingScopeList;
- /// \brief Analyses used.
+ /// Analyses used.
const LoopAccessInfo &LAI;
LoopInfo *LI;
DominatorTree *DT;
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/ModuleUtils.h b/contrib/llvm/include/llvm/Transforms/Utils/ModuleUtils.h
index 4b9bc8293810..14615c25d093 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/ModuleUtils.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/ModuleUtils.h
@@ -49,7 +49,7 @@ Function *checkSanitizerInterfaceFunction(Constant *FuncOrBitcast);
Function *declareSanitizerInitFunction(Module &M, StringRef InitName,
ArrayRef<Type *> InitArgTypes);
-/// \brief Creates sanitizer constructor function, and calls sanitizer's init
+/// Creates sanitizer constructor function, and calls sanitizer's init
/// function from it.
/// \return Returns pair of pointers to constructor, and init functions
/// respectively.
@@ -62,10 +62,10 @@ std::pair<Function *, Function *> createSanitizerCtorAndInitFunctions(
/// the list of public globals in the module.
bool nameUnamedGlobals(Module &M);
-/// \brief Adds global values to the llvm.used list.
+/// Adds global values to the llvm.used list.
void appendToUsed(Module &M, ArrayRef<GlobalValue *> Values);
-/// \brief Adds global values to the llvm.compiler.used list.
+/// Adds global values to the llvm.compiler.used list.
void appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values);
/// Filter out potentially dead comdat functions where other entries keep the
@@ -84,7 +84,7 @@ void appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values);
void filterDeadComdatFunctions(
Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions);
-/// \brief Produce a unique identifier for this module by taking the MD5 sum of
+/// Produce a unique identifier for this module by taking the MD5 sum of
/// the names of the module's strong external symbols that are not comdat
/// members.
///
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/OrderedInstructions.h b/contrib/llvm/include/llvm/Transforms/Utils/OrderedInstructions.h
index 165d4bdaa6d4..7f57fde638b8 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/OrderedInstructions.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/OrderedInstructions.h
@@ -35,6 +35,11 @@ class OrderedInstructions {
/// The dominator tree of the parent function.
DominatorTree *DT;
+ /// Return true if the first instruction comes before the second in the
+ /// same basic block. It will create an ordered basic block, if it does
+ /// not yet exist in OBBMap.
+ bool localDominates(const Instruction *, const Instruction *) const;
+
public:
/// Constructor.
OrderedInstructions(DominatorTree *DT) : DT(DT) {}
@@ -42,6 +47,12 @@ public:
/// Return true if first instruction dominates the second.
bool dominates(const Instruction *, const Instruction *) const;
+ /// Return true if the first instruction comes before the second in the
+ /// dominator tree DFS traversal if they are in different basic blocks,
+ /// or if the first instruction comes before the second in the same basic
+ /// block.
+ bool dfsBefore(const Instruction *, const Instruction *) const;
+
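
A small sketch of how the two ordering queries differ, assuming a caller with a valid DominatorTree; the function and variable names are illustrative only:

    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/OrderedInstructions.h"

    using namespace llvm;

    static void orderQueries(const Instruction *A, const Instruction *B,
                             DominatorTree *DT) {
      OrderedInstructions OI(DT);
      bool ADominatesB = OI.dominates(A, B); // classic dominance query
      bool AComesFirst = OI.dfsBefore(A, B); // DFS order across blocks, or local
                                             // order within the same block
      (void)ADominatesB;
      (void)AComesFirst;
    }
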
/// Invalidate the OrderedBasicBlock cache when its basic block changes.
/// i.e. If an instruction is deleted or added to the basic block, the user
/// should call this function to invalidate the OrderedBasicBlock cache for
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/contrib/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
index 8150f1528397..b53eda7e5a42 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the PredicateInfo analysis, which creates an Extended
+/// This file implements the PredicateInfo analysis, which creates an Extended
/// SSA form for operations used in branch comparisons and llvm.assume
/// comparisons.
///
@@ -31,7 +31,7 @@
/// %cmp = icmp eq i32, %x, 50
/// br i1 %cmp, label %true, label %false
/// true:
-/// %x.0 = call @llvm.ssa_copy.i32(i32 %x)
+/// %x.0 = call \@llvm.ssa_copy.i32(i32 %x)
/// ret i32 %x.0
/// false:
/// ret i32 1
@@ -54,6 +54,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
@@ -69,6 +70,7 @@
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/PassAnalysisSupport.h"
#include "llvm/Support/Casting.h"
@@ -193,7 +195,7 @@ namespace PredicateInfoClasses {
struct ValueDFS;
}
-/// \brief Encapsulates PredicateInfo, including all data associated with memory
+/// Encapsulates PredicateInfo, including all data associated with memory
/// accesses.
class PredicateInfo {
private:
@@ -261,6 +263,8 @@ private:
// The set of edges along which we can only handle phi uses, due to critical
// edges.
DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
+ // The set of ssa_copy declarations we created with our custom mangling.
+ SmallSet<AssertingVH<Function>, 20> CreatedDeclarations;
};
// This pass does eager building and then printing of PredicateInfo. It is used
@@ -275,7 +279,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
};
-/// \brief Printer pass for \c PredicateInfo.
+/// Printer pass for \c PredicateInfo.
class PredicateInfoPrinterPass
: public PassInfoMixin<PredicateInfoPrinterPass> {
raw_ostream &OS;
@@ -285,7 +289,7 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-/// \brief Verifier pass for \c PredicateInfo.
+/// Verifier pass for \c PredicateInfo.
struct PredicateInfoVerifierPass : PassInfoMixin<PredicateInfoVerifierPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/contrib/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
index bb8a61a474f2..5ddfbe2bf058 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -23,7 +23,7 @@ class DominatorTree;
class AliasSetTracker;
class AssumptionCache;
-/// \brief Return true if this alloca is legal for promotion.
+/// Return true if this alloca is legal for promotion.
///
/// This is true if there are only loads, stores, and lifetime markers
/// (transitively) using this alloca. This also enforces that there is only
@@ -31,7 +31,7 @@ class AssumptionCache;
/// markers.
bool isAllocaPromotable(const AllocaInst *AI);
-/// \brief Promote the specified list of alloca instructions into scalar
+/// Promote the specified list of alloca instructions into scalar
/// registers, inserting PHI nodes as appropriate.
///
/// This function makes use of DominanceFrontier information. This function
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
index 6cd9f1539b0b..4a7911662990 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -30,7 +30,7 @@ class Type;
class Use;
class Value;
-/// \brief Helper class for SSA formation on a set of values defined in
+/// Helper class for SSA formation on a set of values defined in
/// multiple blocks.
///
/// This is used when code duplication or another unstructured
@@ -62,25 +62,25 @@ public:
SSAUpdater &operator=(const SSAUpdater &) = delete;
~SSAUpdater();
- /// \brief Reset this object to get ready for a new set of SSA updates with
+ /// Reset this object to get ready for a new set of SSA updates with
/// type 'Ty'.
///
/// PHI nodes get a name based on 'Name'.
void Initialize(Type *Ty, StringRef Name);
- /// \brief Indicate that a rewritten value is available in the specified block
+ /// Indicate that a rewritten value is available in the specified block
/// with the specified value.
void AddAvailableValue(BasicBlock *BB, Value *V);
- /// \brief Return true if the SSAUpdater already has a value for the specified
+ /// Return true if the SSAUpdater already has a value for the specified
/// block.
bool HasValueForBlock(BasicBlock *BB) const;
- /// \brief Construct SSA form, materializing a value that is live at the end
+ /// Construct SSA form, materializing a value that is live at the end
/// of the specified block.
Value *GetValueAtEndOfBlock(BasicBlock *BB);
- /// \brief Construct SSA form, materializing a value that is live in the
+ /// Construct SSA form, materializing a value that is live in the
/// middle of the specified block.
///
/// \c GetValueInMiddleOfBlock is the same as \c GetValueAtEndOfBlock except
@@ -102,7 +102,7 @@ public:
/// merge the appropriate values, and this value isn't live out of the block.
Value *GetValueInMiddleOfBlock(BasicBlock *BB);
- /// \brief Rewrite a use of the symbolic value.
+ /// Rewrite a use of the symbolic value.
///
/// This handles PHI nodes, which use their value in the corresponding
/// predecessor. Note that this will not work if the use is supposed to be
@@ -111,7 +111,7 @@ public:
/// be below it.
void RewriteUse(Use &U);
- /// \brief Rewrite a use like \c RewriteUse but handling in-block definitions.
+ /// Rewrite a use like \c RewriteUse but handling in-block definitions.
///
/// This version of the method can rewrite uses in the same block as
/// a definition, because it assumes that all uses of a value are below any
@@ -122,7 +122,7 @@ private:
Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
};
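
A minimal sketch of the usual SSAUpdater workflow built from the members above, assuming a transform has duplicated a value into a second block and now needs to fix up one of its uses; all names are illustrative:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Transforms/Utils/SSAUpdater.h"

    using namespace llvm;

    static void rewriteDuplicatedValue(Value *OrigV, BasicBlock *OrigBB,
                                       Value *CloneV, BasicBlock *CloneBB,
                                       Use &U) {
      SSAUpdater Updater;
      Updater.Initialize(OrigV->getType(), OrigV->getName());
      // Record which definition is live out of each block.
      Updater.AddAvailableValue(OrigBB, OrigV);
      Updater.AddAvailableValue(CloneBB, CloneV);
      // U is rewritten to OrigV, CloneV, or a freshly inserted PHI, depending
      // on which definitions reach the using block.
      Updater.RewriteUse(U);
    }
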
-/// \brief Helper class for promoting a collection of loads and stores into SSA
+/// Helper class for promoting a collection of loads and stores into SSA
/// Form using the SSAUpdater.
///
/// This handles complexities that SSAUpdater doesn't, such as multiple loads
@@ -139,32 +139,32 @@ public:
SSAUpdater &S, StringRef Name = StringRef());
virtual ~LoadAndStorePromoter() = default;
- /// \brief This does the promotion.
+ /// This does the promotion.
///
/// Insts is a list of loads and stores to promote, and Name is the basename
/// for the PHIs to insert. After this is complete, the loads and stores are
/// removed from the code.
void run(const SmallVectorImpl<Instruction *> &Insts) const;
- /// \brief Return true if the specified instruction is in the Inst list.
+ /// Return true if the specified instruction is in the Inst list.
///
/// The Insts list is the one passed into the constructor. Clients should
/// implement this with a more efficient version if possible.
virtual bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction *> &Insts) const;
- /// \brief This hook is invoked after all the stores are found and inserted as
+ /// This hook is invoked after all the stores are found and inserted as
/// available values.
virtual void doExtraRewritesBeforeFinalDeletion() const {}
- /// \brief Clients can choose to implement this to get notified right before
+ /// Clients can choose to implement this to get notified right before
/// a load is RAUW'd another value.
virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const {}
- /// \brief Called before each instruction is deleted.
+ /// Called before each instruction is deleted.
virtual void instructionDeleted(Instruction *I) const {}
- /// \brief Called to update debug info associated with the instruction.
+ /// Called to update debug info associated with the instruction.
virtual void updateDebugInfo(Instruction *I) const {}
};
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h b/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
new file mode 100644
index 000000000000..53a608f01804
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
@@ -0,0 +1,92 @@
+//===- SSAUpdaterBulk.h - Unstructured SSA Update Tool ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SSAUpdaterBulk class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
+#define LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/PredIteratorCache.h"
+
+namespace llvm {
+
+class BasicBlock;
+class PHINode;
+template <typename T> class SmallVectorImpl;
+class Type;
+class Use;
+class Value;
+class DominatorTree;
+
+/// Helper class for SSA formation on a set of values defined in multiple
+/// blocks.
+///
+/// This is used when code duplication or another unstructured transformation
+/// wants to rewrite a set of uses of one value with uses of a set of values.
+/// The update is done only when RewriteAllUses is called; all other methods are
+/// used for book-keeping. That helps to share some common computations between
+/// updates of different uses (which is not the case when traditional SSAUpdater
+/// is used).
+class SSAUpdaterBulk {
+ struct RewriteInfo {
+ DenseMap<BasicBlock *, Value *> Defines;
+ SmallVector<Use *, 4> Uses;
+ StringRef Name;
+ Type *Ty;
+ RewriteInfo(){};
+ RewriteInfo(StringRef &N, Type *T) : Name(N), Ty(T){};
+ };
+ SmallVector<RewriteInfo, 4> Rewrites;
+
+ PredIteratorCache PredCache;
+
+ Value *computeValueAt(BasicBlock *BB, RewriteInfo &R, DominatorTree *DT);
+
+public:
+ explicit SSAUpdaterBulk(){};
+ SSAUpdaterBulk(const SSAUpdaterBulk &) = delete;
+ SSAUpdaterBulk &operator=(const SSAUpdaterBulk &) = delete;
+ ~SSAUpdaterBulk(){};
+
+ /// Add a new variable to the SSA rewriter. This needs to be called before
+ /// AddAvailableValue or AddUse calls. The return value is the variable ID,
+ /// which needs to be passed to AddAvailableValue and AddUse.
+ unsigned AddVariable(StringRef Name, Type *Ty);
+
+ /// Indicate that a rewritten value is available in the specified block with
+ /// the specified value.
+ void AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V);
+
+ /// Record a use of the symbolic value. This use will be updated with a
+ /// rewritten value when RewriteAllUses is called.
+ void AddUse(unsigned Var, Use *U);
+
+ /// Return true if the SSAUpdater already has a value for the specified
+ /// variable in the specified block.
+ bool HasValueForBlock(unsigned Var, BasicBlock *BB);
+
+ /// Perform all the necessary updates, including new PHI-nodes insertion and
+ /// the requested uses update.
+ ///
+ /// The function requires dominator tree DT, which is used for computing
+ /// locations for new phi-nodes insertions. If a nonnull pointer to a vector
+ /// InsertedPHIs is passed, all the new phi-nodes will be added to this
+ /// vector.
+ void RewriteAllUses(DominatorTree *DT,
+ SmallVectorImpl<PHINode *> *InsertedPHIs = nullptr);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
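
A minimal sketch of the batched workflow this class describes, assuming two blocks each define a version of the same variable and a set of uses needs patching; the names are illustrative:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/SSAUpdaterBulk.h"

    using namespace llvm;

    static void bulkRewrite(Value *V0, BasicBlock *BB0, Value *V1,
                            BasicBlock *BB1, ArrayRef<Use *> UsesToRewrite,
                            DominatorTree *DT) {
      SSAUpdaterBulk Updater;
      // Book-keeping phase: one variable, its reaching definitions, its uses.
      unsigned Var = Updater.AddVariable("val", V0->getType());
      Updater.AddAvailableValue(Var, BB0, V0);
      Updater.AddAvailableValue(Var, BB1, V1);
      for (Use *U : UsesToRewrite)
        Updater.AddUse(Var, U);
      // Single update phase: PHIs are inserted and every recorded use patched.
      SmallVector<PHINode *, 8> NewPHIs;
      Updater.RewriteAllUses(DT, &NewPHIs);
    }
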
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index b1611d49a456..b7649ba88334 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -379,7 +379,7 @@ public:
Traits::AddPHIOperand(PHI, PredInfo->AvailableVal, Pred);
}
- DEBUG(dbgs() << " Inserted PHI: " << *PHI << "\n");
+ LLVM_DEBUG(dbgs() << " Inserted PHI: " << *PHI << "\n");
// If the client wants to know about all new instructions, tell it.
if (InsertedPHIs) InsertedPHIs->push_back(PHI);
@@ -389,12 +389,8 @@ public:
/// FindExistingPHI - Look through the PHI nodes in a block to see if any of
/// them match what is needed.
void FindExistingPHI(BlkT *BB, BlockListTy *BlockList) {
- for (typename BlkT::iterator BBI = BB->begin(), BBE = BB->end();
- BBI != BBE; ++BBI) {
- PhiT *SomePHI = Traits::InstrIsPHI(&*BBI);
- if (!SomePHI)
- break;
- if (CheckIfPHIMatches(SomePHI)) {
+ for (auto &SomePHI : BB->phis()) {
+ if (CheckIfPHIMatches(&SomePHI)) {
RecordMatchingPHIs(BlockList);
break;
}
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/SimplifyInstructions.h b/contrib/llvm/include/llvm/Transforms/Utils/SimplifyInstructions.h
deleted file mode 100644
index 3f838611626f..000000000000
--- a/contrib/llvm/include/llvm/Transforms/Utils/SimplifyInstructions.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===- SimplifyInstructions.h - Remove redundant instructions ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a utility pass used for testing the InstructionSimplify analysis.
-// The analysis is applied to every instruction, and if it simplifies then the
-// instruction is replaced by the simplification. If you are looking for a pass
-// that performs serious instruction folding, use the instcombine pass instead.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_UTILS_SIMPLIFYINSTRUCTIONS_H
-#define LLVM_TRANSFORMS_UTILS_SIMPLIFYINSTRUCTIONS_H
-
-#include "llvm/IR/PassManager.h"
-
-namespace llvm {
-
-/// This pass removes redundant instructions.
-class InstSimplifierPass : public PassInfoMixin<InstSimplifierPass> {
-public:
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-};
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_UTILS_SIMPLIFYINSTRUCTIONS_H
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 73a62f59203b..d007f909c6a4 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -30,7 +30,7 @@ class BasicBlock;
class Function;
class OptimizationRemarkEmitter;
-/// \brief This class implements simplifications for calls to fortified library
+/// This class implements simplifications for calls to fortified library
/// functions (__st*cpy_chk, __memcpy_chk, __memmove_chk, __memset_chk), to,
/// when possible, replace them with their non-checking counterparts.
/// Other optimizations can also be done, but it's possible to disable them and
@@ -45,7 +45,7 @@ public:
FortifiedLibCallSimplifier(const TargetLibraryInfo *TLI,
bool OnlyLowerUnknownSize = false);
- /// \brief Take the given call instruction and return a more
+ /// Take the given call instruction and return a more
/// optimal value to replace the instruction with or 0 if a more
/// optimal form can't be found.
/// The call must not be an indirect call.
@@ -60,7 +60,7 @@ private:
Value *optimizeStrpCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc Func);
Value *optimizeStrpNCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc Func);
- /// \brief Checks whether the call \p CI to a fortified libcall is foldable
+ /// Checks whether the call \p CI to a fortified libcall is foldable
/// to the non-fortified version.
bool isFortifiedCallFoldable(CallInst *CI, unsigned ObjSizeOp,
unsigned SizeOp, bool isString);
@@ -78,13 +78,13 @@ private:
bool UnsafeFPShrink;
function_ref<void(Instruction *, Value *)> Replacer;
- /// \brief Internal wrapper for RAUW that is the default implementation.
+ /// Internal wrapper for RAUW that is the default implementation.
///
/// Other users may provide an alternate function with this signature instead
/// of this one.
static void replaceAllUsesWithDefault(Instruction *I, Value *With);
- /// \brief Replace an instruction's uses with a value using our replacer.
+ /// Replace an instruction's uses with a value using our replacer.
void replaceAllUsesWith(Instruction *I, Value *With);
public:
@@ -124,6 +124,7 @@ private:
Value *optimizeMemCpy(CallInst *CI, IRBuilder<> &B);
Value *optimizeMemMove(CallInst *CI, IRBuilder<> &B);
Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeRealloc(CallInst *CI, IRBuilder<> &B);
Value *optimizeWcslen(CallInst *CI, IRBuilder<> &B);
// Wrapper for all String/Memory Library Call Optimizations
Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B);
@@ -150,15 +151,22 @@ private:
Value *optimizeIsDigit(CallInst *CI, IRBuilder<> &B);
Value *optimizeIsAscii(CallInst *CI, IRBuilder<> &B);
Value *optimizeToAscii(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeAtoi(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeStrtol(CallInst *CI, IRBuilder<> &B);
// Formatting and IO Library Call Optimizations
Value *optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
int StreamArg = -1);
Value *optimizePrintF(CallInst *CI, IRBuilder<> &B);
Value *optimizeSPrintF(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeSnPrintF(CallInst *CI, IRBuilder<> &B);
Value *optimizeFPrintF(CallInst *CI, IRBuilder<> &B);
Value *optimizeFWrite(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeFRead(CallInst *CI, IRBuilder<> &B);
Value *optimizeFPuts(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeFGets(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeFPutc(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeFGetc(CallInst *CI, IRBuilder<> &B);
Value *optimizePuts(CallInst *CI, IRBuilder<> &B);
// Helper methods
@@ -169,6 +177,7 @@ private:
SmallVectorImpl<CallInst *> &SinCosCalls);
Value *optimizePrintFString(CallInst *CI, IRBuilder<> &B);
Value *optimizeSPrintFString(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B);
Value *optimizeFPrintFString(CallInst *CI, IRBuilder<> &B);
/// hasFloatVersion - Checks if there is a float version of the specified
diff --git a/contrib/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/contrib/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index 12aa3bc6e770..a6b84af068a5 100644
--- a/contrib/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/contrib/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -19,11 +19,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
namespace llvm {
class AssumptionCache;
class BasicBlock;
+class DependenceInfo;
class DominatorTree;
class Loop;
class LoopInfo;
@@ -71,13 +73,54 @@ bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
void computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP,
- unsigned &TripCount);
+ unsigned &TripCount, ScalarEvolution &SE);
+
+bool canPeel(Loop *L);
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
+LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, bool UnrollRemainder,
+ LoopInfo *LI, ScalarEvolution *SE,
+ DominatorTree *DT, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE);
+
+bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+ DependenceInfo &DI);
+
+bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
+ DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+ unsigned MaxTripCount, unsigned &TripMultiple,
+ unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ bool &UseUpperBound);
+
+BasicBlock *foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT);
+
+void remapInstruction(Instruction *I, ValueToValueMapTy &VMap);
+
+void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC);
+
MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
+TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);
+
+unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+ bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ unsigned BEInsns);
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
diff --git a/contrib/llvm/include/llvm/Transforms/Vectorize.h b/contrib/llvm/include/llvm/Transforms/Vectorize.h
index 19845e471e48..950af7ffe05f 100644
--- a/contrib/llvm/include/llvm/Transforms/Vectorize.h
+++ b/contrib/llvm/include/llvm/Transforms/Vectorize.h
@@ -21,88 +21,88 @@ class BasicBlockPass;
class Pass;
//===----------------------------------------------------------------------===//
-/// @brief Vectorize configuration.
+/// Vectorize configuration.
struct VectorizeConfig {
//===--------------------------------------------------------------------===//
// Target architecture related parameters
- /// @brief The size of the native vector registers.
+ /// The size of the native vector registers.
unsigned VectorBits;
- /// @brief Vectorize boolean values.
+ /// Vectorize boolean values.
bool VectorizeBools;
- /// @brief Vectorize integer values.
+ /// Vectorize integer values.
bool VectorizeInts;
- /// @brief Vectorize floating-point values.
+ /// Vectorize floating-point values.
bool VectorizeFloats;
- /// @brief Vectorize pointer values.
+ /// Vectorize pointer values.
bool VectorizePointers;
- /// @brief Vectorize casting (conversion) operations.
+ /// Vectorize casting (conversion) operations.
bool VectorizeCasts;
- /// @brief Vectorize floating-point math intrinsics.
+ /// Vectorize floating-point math intrinsics.
bool VectorizeMath;
- /// @brief Vectorize bit intrinsics.
+ /// Vectorize bit intrinsics.
bool VectorizeBitManipulations;
- /// @brief Vectorize the fused-multiply-add intrinsic.
+ /// Vectorize the fused-multiply-add intrinsic.
bool VectorizeFMA;
- /// @brief Vectorize select instructions.
+ /// Vectorize select instructions.
bool VectorizeSelect;
- /// @brief Vectorize comparison instructions.
+ /// Vectorize comparison instructions.
bool VectorizeCmp;
- /// @brief Vectorize getelementptr instructions.
+ /// Vectorize getelementptr instructions.
bool VectorizeGEP;
- /// @brief Vectorize loads and stores.
+ /// Vectorize loads and stores.
bool VectorizeMemOps;
- /// @brief Only generate aligned loads and stores.
+ /// Only generate aligned loads and stores.
bool AlignedOnly;
//===--------------------------------------------------------------------===//
// Misc parameters
- /// @brief The required chain depth for vectorization.
+ /// The required chain depth for vectorization.
unsigned ReqChainDepth;
- /// @brief The maximum search distance for instruction pairs.
+ /// The maximum search distance for instruction pairs.
unsigned SearchLimit;
- /// @brief The maximum number of candidate pairs with which to use a full
+ /// The maximum number of candidate pairs with which to use a full
/// cycle check.
unsigned MaxCandPairsForCycleCheck;
- /// @brief Replicating one element to a pair breaks the chain.
+ /// Replicating one element to a pair breaks the chain.
bool SplatBreaksChain;
- /// @brief The maximum number of pairable instructions per group.
+ /// The maximum number of pairable instructions per group.
unsigned MaxInsts;
- /// @brief The maximum number of candidate instruction pairs per group.
+ /// The maximum number of candidate instruction pairs per group.
unsigned MaxPairs;
- /// @brief The maximum number of pairing iterations.
+ /// The maximum number of pairing iterations.
unsigned MaxIter;
- /// @brief Don't try to form odd-length vectors.
+ /// Don't try to form odd-length vectors.
bool Pow2LenOnly;
- /// @brief Don't boost the chain-depth contribution of loads and stores.
+ /// Don't boost the chain-depth contribution of loads and stores.
bool NoMemOpBoost;
- /// @brief Use a fast instruction dependency analysis.
+ /// Use a fast instruction dependency analysis.
bool FastDep;
- /// @brief Initialize the VectorizeConfig from command line options.
+ /// Initialize the VectorizeConfig from command line options.
VectorizeConfig();
};
@@ -120,7 +120,7 @@ Pass *createLoopVectorizePass(bool NoUnrolling = false,
Pass *createSLPVectorizerPass();
//===----------------------------------------------------------------------===//
-/// @brief Vectorize the BasicBlock.
+/// Vectorize the BasicBlock.
///
/// @param BB The BasicBlock to be vectorized
/// @param P The current running pass, should require AliasAnalysis and
diff --git a/contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
new file mode 100644
index 000000000000..224879cdba52
--- /dev/null
+++ b/contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -0,0 +1,482 @@
+//===- llvm/Transforms/Vectorize/LoopVectorizationLegality.h ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file defines the LoopVectorizationLegality class. Original code
+/// in Loop Vectorizer has been moved out to its own file for modularity
+/// and reusability.
+///
+/// Currently, it works for innermost loop vectorization. Extending this to
+/// outer loop vectorization is a TODO item.
+///
+/// Also provides:
+/// 1) LoopVectorizeHints class which keeps a number of loop annotations
+/// locally for easy look up. It has the ability to write them back as
+/// loop metadata, upon request.
+/// 2) LoopVectorizationRequirements class for lazy bail out for the purpose
+/// of reporting useful failure to vectorize message.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+namespace llvm {
+
+/// Create an analysis remark that explains why vectorization failed
+///
+/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
+/// RemarkName is the identifier for the remark. If \p I is passed it is an
+/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
+/// the location of the remark. \return the remark object that can be
+/// streamed to.
+OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,
+ StringRef RemarkName,
+ Loop *TheLoop,
+ Instruction *I = nullptr);
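
A sketch of the intended use, assuming the caller holds an OptimizationRemarkEmitter; the pass name, remark identifier, and message here are placeholders, not values taken from the patch:

    #include "llvm/Analysis/OptimizationRemarkEmitter.h"
    #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"

    using namespace llvm;

    static void reportMissedVectorization(Loop *TheLoop, Instruction *Culprit,
                                          OptimizationRemarkEmitter &ORE) {
      // The returned remark is streamed into and then handed to the emitter.
      ORE.emit(createLVMissedAnalysis("loop-vectorize", "CantVectorize",
                                      TheLoop, Culprit)
               << "loop not vectorized: unsupported construct");
    }
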
+
+/// Utility class for getting and setting loop vectorizer hints in the form
+/// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables)
+/// and can, upon request, write them back as metadata on the loop. It will
+/// initially scan the loop for existing metadata, and will update the local
+/// values based on information in the loop.
+/// We cannot write all values to metadata, as the mere presence of some info,
+/// for example 'force', means a decision has been made. So, we need to be
+/// careful NOT to add them if the user hasn't specifically asked for them.
+class LoopVectorizeHints {
+ enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
+
+ /// Hint - associates name and validation with the hint value.
+ struct Hint {
+ const char *Name;
+ unsigned Value; // This may have to change for non-numeric values.
+ HintKind Kind;
+
+ Hint(const char *Name, unsigned Value, HintKind Kind)
+ : Name(Name), Value(Value), Kind(Kind) {}
+
+ bool validate(unsigned Val);
+ };
+
+ /// Vectorization width.
+ Hint Width;
+
+ /// Vectorization interleave factor.
+ Hint Interleave;
+
+ /// Vectorization forced
+ Hint Force;
+
+ /// Already Vectorized
+ Hint IsVectorized;
+
+ /// Return the loop metadata prefix.
+ static StringRef Prefix() { return "llvm.loop."; }
+
+ /// True if there is any unsafe math in the loop.
+ bool PotentiallyUnsafe = false;
+
+public:
+ enum ForceKind {
+ FK_Undefined = -1, ///< Not selected.
+ FK_Disabled = 0, ///< Forcing disabled.
+ FK_Enabled = 1, ///< Forcing enabled.
+ };
+
+ LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+ OptimizationRemarkEmitter &ORE);
+
+ /// Mark the loop L as already vectorized by setting the width to 1.
+ void setAlreadyVectorized() {
+ IsVectorized.Value = 1;
+ Hint Hints[] = {IsVectorized};
+ writeHintsToMetadata(Hints);
+ }
+
+ bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const;
+
+ /// Dumps all the hint information.
+ void emitRemarkWithHints() const;
+
+ unsigned getWidth() const { return Width.Value; }
+ unsigned getInterleave() const { return Interleave.Value; }
+ unsigned getIsVectorized() const { return IsVectorized.Value; }
+ enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+
+ /// If hints are provided that force vectorization, use the AlwaysPrint
+ /// pass name to force the frontend to print the diagnostic.
+ const char *vectorizeAnalysisPassName() const;
+
+ bool allowReordering() const {
+ // When loop hints that enable vectorization are provided, we allow the
+ // vectorizer to change the order of operations given by the scalar loop. This
+ // is not enabled by default because it can be unsafe or inefficient. For example,
+ // reordering floating-point operations will change the way round-off
+ // error accumulates in the loop.
+ return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+ }
+
+ bool isPotentiallyUnsafe() const {
+ // Avoid FP vectorization if the target is unsure about proper support.
+ // This may be related to the SIMD unit in the target not handling
+ // IEEE 754 FP ops properly, or bad single-to-double promotions.
+ // Otherwise, a sequence of vectorized loops, even without reduction,
+ // could lead to different end results on the destination vectors.
+ return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
+ }
+
+ void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
+
+private:
+ /// Find hints specified in the loop metadata and update local values.
+ void getHintsFromMetadata();
+
+ /// Checks string hint with one operand and set value if valid.
+ void setHint(StringRef Name, Metadata *Arg);
+
+ /// Create a new hint from name / value pair.
+ MDNode *createHintMetadata(StringRef Name, unsigned V) const;
+
+ /// Matches metadata with hint name.
+ bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes);
+
+ /// Sets current hints into loop metadata, keeping other values intact.
+ void writeHintsToMetadata(ArrayRef<Hint> HintTypes);
+
+ /// The loop these hints belong to.
+ const Loop *TheLoop;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter &ORE;
+};
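
As an illustrative sketch only (not from this patch), a pass holding one of these objects might consume the hints as below; the loop L and the remark emitter ORE are assumed to be supplied by the enclosing pass:

// Hypothetical usage sketch of LoopVectorizeHints; L and ORE are assumptions.
LoopVectorizeHints Hints(L, /*DisableInterleaving=*/false, *ORE);
if (Hints.getForce() == LoopVectorizeHints::FK_Disabled)
  return false;                        // the user explicitly forbade vectorization
unsigned VF = Hints.getWidth();        // 0 leaves the width to the cost model
unsigned IC = Hints.getInterleave();   // 0 leaves the interleave count to the cost model
if (Hints.getIsVectorized())
  return false;                        // loop was already transformed earlier
// ...after a successful transform, record that fact in the loop metadata:
Hints.setAlreadyVectorized();          // writes the IsVectorized hint back to the loop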
+
+/// This holds vectorization requirements that must be verified late in
+/// the process. The requirements are set by legalize and costmodel. Once
+/// vectorization has been determined to be possible and profitable the
+/// requirements can be verified by looking for metadata or compiler options.
+/// For example, some loops require FP commutativity which is only allowed if
+/// vectorization is explicitly specified or if the fast-math compiler option
+/// has been provided.
+/// Late evaluation of these requirements allows helpful diagnostics to be
+/// composed that tell the user what needs to be done to vectorize the loop, for
+/// example by specifying #pragma clang loop vectorize or -ffast-math. Late
+/// evaluation should be used only when diagnostics can be generated that can be
+/// followed by a non-expert user.
+class LoopVectorizationRequirements {
+public:
+ LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
+
+ void addUnsafeAlgebraInst(Instruction *I) {
+ // First unsafe algebra instruction.
+ if (!UnsafeAlgebraInst)
+ UnsafeAlgebraInst = I;
+ }
+
+ void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
+
+ bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints);
+
+private:
+ unsigned NumRuntimePointerChecks = 0;
+ Instruction *UnsafeAlgebraInst = nullptr;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter &ORE;
+};
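
A rough sketch of the late-evaluation flow described above (illustrative only; FAddInst, NumChecks, F, L and Hints are assumed to exist in the caller):

// Legality/cost-model code records facts as it scans the loop...
LoopVectorizationRequirements Requirements(*ORE);
Requirements.addUnsafeAlgebraInst(FAddInst);     // e.g. an FP reduction without fast-math
Requirements.addRuntimePointerChecks(NumChecks); // runtime checks the plan would need
// ...and only once vectorization looks otherwise possible and profitable is the
// decision finalized, so the remark can tell a non-expert user what to change:
if (Requirements.doesNotMeet(F, L, Hints))
  return false;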
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+/// will change the order of memory accesses in a way that will change the
+/// correctness of the program.
+/// * Scalar checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
+/// induction variables and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+ LoopVectorizationLegality(
+ Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
+ std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
+ LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
+ : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
+ ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+
+ /// ReductionList contains the reduction descriptors for all
+ /// of the reductions that were found in the loop.
+ using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
+
+ /// InductionList saves induction variables and maps them to the
+ /// induction descriptor.
+ using InductionList = MapVector<PHINode *, InductionDescriptor>;
+
+ /// RecurrenceSet contains the phi nodes that are recurrences other than
+ /// inductions and reductions.
+ using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
+
+ /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this
+ /// loop, only that it is legal to do so.
+ /// Temporarily taking UseVPlanNativePath parameter. If true, take
+ /// the new code path being implemented for outer loop vectorization
+ /// (should be functional for inner loop vectorization) based on VPlan.
+ /// If false, good old LV code.
+ bool canVectorize(bool UseVPlanNativePath);
+
+ /// Returns the primary induction variable.
+ PHINode *getPrimaryInduction() { return PrimaryInduction; }
+
+ /// Returns the reduction variables found in the loop.
+ ReductionList *getReductionVars() { return &Reductions; }
+
+ /// Returns the induction variables found in the loop.
+ InductionList *getInductionVars() { return &Inductions; }
+
+ /// Return the first-order recurrences found in the loop.
+ RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
+
+ /// Return the set of instructions to sink to handle first-order recurrences.
+ DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
+
+ /// Returns the widest induction type.
+ Type *getWidestInductionType() { return WidestIndTy; }
+
+ /// Returns True if V is a Phi node of an induction variable in this loop.
+ bool isInductionPhi(const Value *V);
+
+ /// Returns True if V is a cast that is part of an induction def-use chain,
+ /// and had been proven to be redundant under a runtime guard (in other
+ /// words, the cast has the same SCEV expression as the induction phi).
+ bool isCastedInductionVariable(const Value *V);
+
+ /// Returns True if V can be considered as an induction variable in this
+ /// loop. V can be the induction phi, or some redundant cast in the def-use
+ /// chain of the induction phi.
+ bool isInductionVariable(const Value *V);
+
+ /// Returns True if PN is a reduction variable in this loop.
+ bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
+
+ /// Returns True if Phi is a first-order recurrence in this loop.
+ bool isFirstOrderRecurrence(const PHINode *Phi);
+
+ /// Return true if the block BB needs to be predicated in order for the loop
+ /// to be vectorized.
+ bool blockNeedsPredication(BasicBlock *BB);
+
+ /// Check if this pointer is consecutive when vectorizing. This happens
+ /// when the last index of the GEP is the induction variable, or that the
+ /// pointer itself is an induction variable.
+ /// This check allows us to vectorize A[idx] into a wide load/store.
+ /// Returns:
+ /// 0 - Stride is unknown or non-consecutive.
+ /// 1 - Address is consecutive.
+ /// -1 - Address is consecutive, and decreasing.
+ /// NOTE: This method must only be used before modifying the original scalar
+ /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
+ int isConsecutivePtr(Value *Ptr);
+
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
+
+ /// Returns the information that we collected about runtime memory check.
+ const RuntimePointerChecking *getRuntimePointerChecking() const {
+ return LAI->getRuntimePointerChecking();
+ }
+
+ const LoopAccessInfo *getLAI() const { return LAI; }
+
+ unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
+
+ uint64_t getMaxSafeRegisterWidth() const {
+ return LAI->getDepChecker().getMaxSafeRegisterWidth();
+ }
+
+ bool hasStride(Value *V) { return LAI->hasStride(V); }
+
+ /// Returns true if vector representation of the instruction \p I
+ /// requires a mask.
+ bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
+
+ unsigned getNumStores() const { return LAI->getNumStores(); }
+ unsigned getNumLoads() const { return LAI->getNumLoads(); }
+
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
+private:
+ /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
+ /// its nested loops are considered legal for vectorization. These legal
+ /// checks are common for inner and outer loop vectorization.
+ /// Temporarily taking UseVPlanNativePath parameter. If true, take
+ /// the new code path being implemented for outer loop vectorization
+ /// (should be functional for inner loop vectorization) based on VPlan.
+ /// If false, good old LV code.
+ bool canVectorizeLoopNestCFG(Loop *Lp, bool UseVPlanNativePath);
+
+ /// Return true if the pre-header, exiting and latch blocks of \p Lp
+ /// (non-recursive) are considered legal for vectorization.
+ /// Temporarily taking UseVPlanNativePath parameter. If true, take
+ /// the new code path being implemented for outer loop vectorization
+ /// (should be functional for inner loop vectorization) based on VPlan.
+ /// If false, good old LV code.
+ bool canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath);
+
+ /// Check if a single basic block loop is vectorizable.
+ /// At this point we know that this is a loop with a constant trip count
+ /// and we only need to check individual instructions.
+ bool canVectorizeInstrs();
+
+ /// When we vectorize loops we may change the order in which
+ /// we read and write from memory. This method checks if it is
+ /// legal to vectorize the code, considering only memory constraints.
+ /// Returns true if the loop is vectorizable.
+ bool canVectorizeMemory();
+
+ /// Return true if we can vectorize this loop using the IF-conversion
+ /// transformation.
+ bool canVectorizeWithIfConvert();
+
+ /// Return true if we can vectorize this outer loop. The method performs
+ /// specific checks for outer loop vectorization.
+ bool canVectorizeOuterLoop();
+
+ /// Return true if all of the instructions in the block can be speculatively
+ /// executed. \p SafePtrs is a list of addresses that are known to be legal
+ /// and we know that we can read from them without segfault.
+ bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
+
+ /// Updates the vectorization state by adding \p Phi to the inductions list.
+ /// This can set \p Phi as the main induction of the loop if \p Phi is a
+ /// better choice for the main induction than the existing one.
+ void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit);
+
+ /// Create an analysis remark that explains why vectorization failed
+ ///
+ /// \p RemarkName is the identifier for the remark. If \p I is passed it is
+ /// an instruction that prevents vectorization. Otherwise the loop is used
+ /// for the location of the remark. \return the remark object that can be
+ /// streamed to.
+ OptimizationRemarkAnalysis
+ createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
+ return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ RemarkName, TheLoop, I);
+ }
+
+ /// If an access has symbolic strides, this maps the pointer value to
+ /// the stride symbol.
+ const ValueToValueMap *getSymbolicStrides() {
+ // FIXME: Currently, the set of symbolic strides is sometimes queried before
+ // it's collected. This happens from canVectorizeWithIfConvert, when the
+ // pointer is checked to reference consecutive elements suitable for a
+ // masked access.
+ return LAI ? &LAI->getSymbolicStrides() : nullptr;
+ }
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
+ /// Applies dynamic knowledge to simplify SCEV expressions in the context
+ /// of existing SCEV assumptions. The analysis will also add a minimal set
+ /// of new predicates if this is required to enable vectorization and
+ /// unrolling.
+ PredicatedScalarEvolution &PSE;
+
+ /// Target Library Info.
+ TargetLibraryInfo *TLI;
+
+ /// Dominator Tree.
+ DominatorTree *DT;
+
+ // LoopAccess analysis.
+ std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
+
+ // And the loop-accesses info corresponding to this loop. This pointer is
+ // null until canVectorizeMemory sets it up.
+ const LoopAccessInfo *LAI = nullptr;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ // --- vectorization state --- //
+
+ /// Holds the primary induction variable. This is the counter of the
+ /// loop.
+ PHINode *PrimaryInduction = nullptr;
+
+ /// Holds the reduction variables.
+ ReductionList Reductions;
+
+ /// Holds all of the induction variables that we found in the loop.
+ /// Notice that inductions don't need to start at zero and that induction
+ /// variables can be pointers.
+ InductionList Inductions;
+
+ /// Holds all the casts that participate in the update chain of the induction
+ /// variables, and that have been proven to be redundant (possibly under a
+ /// runtime guard). These casts can be ignored when creating the vectorized
+ /// loop body.
+ SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
+
+ /// Holds the phi nodes that are first-order recurrences.
+ RecurrenceSet FirstOrderRecurrences;
+
+ /// Holds instructions that need to sink past other instructions to handle
+ /// first-order recurrences.
+ DenseMap<Instruction *, Instruction *> SinkAfter;
+
+ /// Holds the widest induction type encountered.
+ Type *WidestIndTy = nullptr;
+
+ /// Allowed outside users. This holds the induction and reduction
+ /// vars which can be accessed from outside the loop.
+ SmallPtrSet<Value *, 4> AllowedExit;
+
+ /// Can we assume the absence of NaNs.
+ bool HasFunNoNaNAttr = false;
+
+ /// Vectorization requirements that will go through late-evaluation.
+ LoopVectorizationRequirements *Requirements;
+
+ /// Used to emit an analysis of any legality issues.
+ LoopVectorizeHints *Hints;
+
+ /// The demanded bits analysis is used to compute the minimum type size in
+ /// which a reduction can be computed.
+ DemandedBits *DB;
+
+ /// The assumption cache analysis is used to compute the minimum type size in
+ /// which a reduction can be computed.
+ AssumptionCache *AC;
+
+ /// While vectorizing these instructions we have to generate a
+ /// call to the appropriate masked intrinsic
+ SmallPtrSet<const Instruction *, 8> MaskedOp;
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H
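
Tying the header together, a hypothetical client would construct the legality analysis from the constructor arguments shown above and then query it roughly as follows; every analysis handle (PSE, DT, TLI, AA, GetLAA, LI, ORE, DB, AC) is assumed to be supplied by the enclosing pass, and Ptr stands for some pointer operand in the loop:

// Minimal sketch, not a definitive driver.
LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, &GetLAA, LI, ORE,
                              &Requirements, &Hints, DB, AC);
if (!LVL.canVectorize(/*UseVPlanNativePath=*/false))
  return false;                              // remarks explain what blocked legality
PHINode *IV = LVL.getPrimaryInduction();     // the loop counter
auto &Inductions = *LVL.getInductionVars();  // PHI -> InductionDescriptor
auto &Reductions = *LVL.getReductionVars();  // PHI -> RecurrenceDescriptor
int Consecutive = LVL.isConsecutivePtr(Ptr); // 1 forward, -1 reverse, 0 unknown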
diff --git a/contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index 32b56d372ea1..d79d84691803 100644
--- a/contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/contrib/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
diff --git a/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 781a628a0974..3152e8192fc5 100644
--- a/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -82,7 +82,7 @@ public:
OptimizationRemarkEmitter *ORE_);
private:
- /// \brief Collect store and getelementptr instructions and organize them
+ /// Collect store and getelementptr instructions and organize them
/// according to the underlying object of their pointer operands. We sort the
/// instructions by their underlying objects to reduce the cost of
/// consecutive access queries.
@@ -91,21 +91,23 @@ private:
/// every time we run into a memory barrier.
void collectSeedInstructions(BasicBlock *BB);
- /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
+ /// Try to vectorize a chain that starts at two arithmetic instrs.
bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
- /// \brief Try to vectorize a list of operands.
+ /// Try to vectorize a list of operands.
+ /// \param UserCost Cost of the user operations of \p VL if they may affect
+ /// the cost of the vectorization.
/// \returns true if a value was vectorized.
bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
- bool AllowReorder = false);
+ int UserCost = 0, bool AllowReorder = false);
- /// \brief Try to vectorize a chain that may start at the operands of \p I.
+ /// Try to vectorize a chain that may start at the operands of \p I.
bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
- /// \brief Vectorize the store instructions collected in Stores.
+ /// Vectorize the store instructions collected in Stores.
bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);
- /// \brief Vectorize the index computations of the getelementptr instructions
+ /// Vectorize the index computations of the getelementptr instructions
/// collected in GEPs.
bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
@@ -131,7 +133,7 @@ private:
bool vectorizeSimpleInstructions(SmallVectorImpl<WeakVH> &Instructions,
BasicBlock *BB, slpvectorizer::BoUpSLP &R);
- /// \brief Scan the basic block and look for patterns that are likely to start
+ /// Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
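
The new UserCost parameter on tryToVectorizeList lets a caller fold the cost of the operations that will use the vectorized values into the profitability decision. A hypothetical call site (the names VL, R and ExtraUserCost are illustrative, not from this patch):

// Sketch: account for the extracts the users of VL would require.
int ExtraUserCost = 0; // cost contribution of the user operations of VL
bool Changed = tryToVectorizeList(VL, R, ExtraUserCost, /*AllowReorder=*/true);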
diff --git a/contrib/llvm/include/llvm/XRay/XRayRecord.h b/contrib/llvm/include/llvm/XRay/XRayRecord.h
index 5c5e9f436f4a..76873447f170 100644
--- a/contrib/llvm/include/llvm/XRay/XRayRecord.h
+++ b/contrib/llvm/include/llvm/XRay/XRayRecord.h
@@ -75,6 +75,9 @@ struct XRayRecord {
/// The thread ID for the currently running thread.
uint32_t TId;
+ /// The process ID for the currently running process.
+ uint32_t PId;
+
/// The function call arguments.
std::vector<uint64_t> CallArgs;
};
diff --git a/contrib/llvm/include/llvm/XRay/YAMLXRayRecord.h b/contrib/llvm/include/llvm/XRay/YAMLXRayRecord.h
index b436aefb1e8f..0de9ea0968e6 100644
--- a/contrib/llvm/include/llvm/XRay/YAMLXRayRecord.h
+++ b/contrib/llvm/include/llvm/XRay/YAMLXRayRecord.h
@@ -37,6 +37,7 @@ struct YAMLXRayRecord {
std::string Function;
uint64_t TSC;
uint32_t TId;
+ uint32_t PId;
std::vector<uint64_t> CallArgs;
};
@@ -79,6 +80,7 @@ template <> struct MappingTraits<xray::YAMLXRayRecord> {
IO.mapOptional("args", Record.CallArgs);
IO.mapRequired("cpu", Record.CPU);
IO.mapRequired("thread", Record.TId);
+ IO.mapOptional("process", Record.PId, 0U);
IO.mapRequired("kind", Record.Type);
IO.mapRequired("tsc", Record.TSC);
}
diff --git a/contrib/llvm/include/llvm/module.modulemap b/contrib/llvm/include/llvm/module.modulemap
index d8b07c4f54da..649cdf3b0a89 100644
--- a/contrib/llvm/include/llvm/module.modulemap
+++ b/contrib/llvm/include/llvm/module.modulemap
@@ -20,15 +20,12 @@ module LLVM_Backend {
// Exclude these; they're intended to be included into only a single
// translation unit (or none) and aren't part of this module.
- exclude header "CodeGen/CommandFlags.h"
exclude header "CodeGen/LinkAllAsmWriterComponents.h"
exclude header "CodeGen/LinkAllCodegenComponents.h"
// These are intended for (repeated) textual inclusion.
- textual header "CodeGen/CommandFlags.def"
+ textual header "CodeGen/CommandFlags.inc"
textual header "CodeGen/DIEValue.def"
- textual header "CodeGen/RuntimeLibcalls.def"
- textual header "CodeGen/TargetOpcodes.def"
}
module Target {
@@ -43,6 +40,7 @@ module LLVM_BinaryFormat {
requires cplusplus
umbrella "BinaryFormat" module * { export * }
textual header "BinaryFormat/Dwarf.def"
+ textual header "BinaryFormat/DynamicTags.def"
textual header "BinaryFormat/MachO.def"
textual header "BinaryFormat/ELFRelocs/AArch64.def"
textual header "BinaryFormat/ELFRelocs/AMDGPU.def"
@@ -60,7 +58,6 @@ module LLVM_BinaryFormat {
textual header "BinaryFormat/ELFRelocs/Sparc.def"
textual header "BinaryFormat/ELFRelocs/SystemZ.def"
textual header "BinaryFormat/ELFRelocs/x86_64.def"
- textual header "BinaryFormat/ELFRelocs/WebAssembly.def"
textual header "BinaryFormat/WasmRelocs.def"
}
@@ -90,16 +87,21 @@ module LLVM_DebugInfo_PDB {
// FIXME: There should be a better way to specify this.
exclude header "DebugInfo/PDB/DIA/DIADataStream.h"
exclude header "DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+ exclude header "DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
exclude header "DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
+ exclude header "DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
exclude header "DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
exclude header "DebugInfo/PDB/DIA/DIAEnumSymbols.h"
exclude header "DebugInfo/PDB/DIA/DIAEnumTables.h"
+ exclude header "DebugInfo/PDB/DIA/DIAInjectedSource.h"
exclude header "DebugInfo/PDB/DIA/DIALineNumber.h"
exclude header "DebugInfo/PDB/DIA/DIARawSymbol.h"
+ exclude header "DebugInfo/PDB/DIA/DIASectionContrib.h"
exclude header "DebugInfo/PDB/DIA/DIASession.h"
exclude header "DebugInfo/PDB/DIA/DIASourceFile.h"
exclude header "DebugInfo/PDB/DIA/DIASupport.h"
exclude header "DebugInfo/PDB/DIA/DIATable.h"
+ exclude header "DebugInfo/PDB/DIA/DIAUtils.h"
}
module LLVM_DebugInfo_PDB_DIA {
@@ -192,6 +194,8 @@ module LLVM_intrinsic_gen {
module IR_CFG { header "IR/CFG.h" export * }
module IR_ConstantRange { header "IR/ConstantRange.h" export * }
module IR_Dominators { header "IR/Dominators.h" export * }
+ module Analysis_PostDominators { header "Analysis/PostDominators.h" export * }
+ module IR_DomTreeUpdater { header "IR/DomTreeUpdater.h" export * }
module IR_IRBuilder { header "IR/IRBuilder.h" export * }
module IR_PassManager { header "IR/PassManager.h" export * }
module IR_PredIteratorCache { header "IR/PredIteratorCache.h" export * }
@@ -217,6 +221,7 @@ module LLVM_IR {
textual header "IR/Instruction.def"
textual header "IR/Metadata.def"
textual header "IR/Value.def"
+ textual header "IR/RuntimeLibcalls.def"
}
module LLVM_IRReader { requires cplusplus umbrella "IRReader" module * { export * } }
@@ -229,7 +234,7 @@ module LLVM_MC {
umbrella "MC"
module * { export * }
- textual header "MC/MCTargetOptionsCommandFlags.def"
+ textual header "MC/MCTargetOptionsCommandFlags.inc"
}
// Used by llvm-tblgen
@@ -297,6 +302,7 @@ module LLVM_Utils {
// These are intended for textual inclusion.
textual header "Support/ARMTargetParser.def"
textual header "Support/AArch64TargetParser.def"
+ textual header "Support/TargetOpcodes.def"
textual header "Support/X86TargetParser.def"
}
@@ -305,12 +311,6 @@ module LLVM_Utils {
header "Support/ConvertUTF.h"
export *
}
-
- module LLVM_CodeGen_MachineValueType {
- requires cplusplus
- header "CodeGen/MachineValueType.h"
- export *
- }
}
// This is used for a $src == $build compilation. Otherwise we use
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
index 55df66714178..a6585df949f8 100644
--- a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -126,7 +126,7 @@ ModRefInfo AAResults::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) {
// Early-exit the moment we reach the bottom of the lattice.
if (isNoModRef(Result))
- return Result;
+ return ModRefInfo::NoModRef;
}
return Result;
@@ -162,7 +162,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
// Early-exit the moment we reach the bottom of the lattice.
if (isNoModRef(Result))
- return Result;
+ return ModRefInfo::NoModRef;
}
// Try to refine the mod-ref info further using other API entry points to the
@@ -224,7 +224,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,
// Early-exit the moment we reach the bottom of the lattice.
if (isNoModRef(Result))
- return Result;
+ return ModRefInfo::NoModRef;
}
// Try to refine the mod-ref info further using other API entry points to the
@@ -254,85 +254,91 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,
// information from CS1's references to the memory referenced by
// CS2's arguments.
if (onlyAccessesArgPointees(CS2B)) {
+ if (!doesAccessArgPointees(CS2B))
+ return ModRefInfo::NoModRef;
ModRefInfo R = ModRefInfo::NoModRef;
- if (doesAccessArgPointees(CS2B)) {
- bool IsMustAlias = true;
- for (auto I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {
- const Value *Arg = *I;
- if (!Arg->getType()->isPointerTy())
- continue;
- unsigned CS2ArgIdx = std::distance(CS2.arg_begin(), I);
- auto CS2ArgLoc = MemoryLocation::getForArgument(CS2, CS2ArgIdx, TLI);
-
- // ArgModRefCS2 indicates what CS2 might do to CS2ArgLoc, and the
- // dependence of CS1 on that location is the inverse:
- // - If CS2 modifies location, dependence exists if CS1 reads or writes.
- // - If CS2 only reads location, dependence exists if CS1 writes.
- ModRefInfo ArgModRefCS2 = getArgModRefInfo(CS2, CS2ArgIdx);
- ModRefInfo ArgMask = ModRefInfo::NoModRef;
- if (isModSet(ArgModRefCS2))
- ArgMask = ModRefInfo::ModRef;
- else if (isRefSet(ArgModRefCS2))
- ArgMask = ModRefInfo::Mod;
-
- // ModRefCS1 indicates what CS1 might do to CS2ArgLoc, and we use
- // above ArgMask to update dependence info.
- ModRefInfo ModRefCS1 = getModRefInfo(CS1, CS2ArgLoc);
- ArgMask = intersectModRef(ArgMask, ModRefCS1);
-
- // Conservatively clear IsMustAlias unless only MustAlias is found.
- IsMustAlias &= isMustSet(ModRefCS1);
-
- R = intersectModRef(unionModRef(R, ArgMask), Result);
- if (R == Result) {
- // On early exit, not all args were checked, cannot set Must.
- if (I + 1 != E)
- IsMustAlias = false;
- break;
- }
+ bool IsMustAlias = true;
+ for (auto I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {
+ const Value *Arg = *I;
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned CS2ArgIdx = std::distance(CS2.arg_begin(), I);
+ auto CS2ArgLoc = MemoryLocation::getForArgument(CS2, CS2ArgIdx, TLI);
+
+ // ArgModRefCS2 indicates what CS2 might do to CS2ArgLoc, and the
+ // dependence of CS1 on that location is the inverse:
+ // - If CS2 modifies location, dependence exists if CS1 reads or writes.
+ // - If CS2 only reads location, dependence exists if CS1 writes.
+ ModRefInfo ArgModRefCS2 = getArgModRefInfo(CS2, CS2ArgIdx);
+ ModRefInfo ArgMask = ModRefInfo::NoModRef;
+ if (isModSet(ArgModRefCS2))
+ ArgMask = ModRefInfo::ModRef;
+ else if (isRefSet(ArgModRefCS2))
+ ArgMask = ModRefInfo::Mod;
+
+ // ModRefCS1 indicates what CS1 might do to CS2ArgLoc, and we use
+ // above ArgMask to update dependence info.
+ ModRefInfo ModRefCS1 = getModRefInfo(CS1, CS2ArgLoc);
+ ArgMask = intersectModRef(ArgMask, ModRefCS1);
+
+ // Conservatively clear IsMustAlias unless only MustAlias is found.
+ IsMustAlias &= isMustSet(ModRefCS1);
+
+ R = intersectModRef(unionModRef(R, ArgMask), Result);
+ if (R == Result) {
+ // On early exit, not all args were checked, cannot set Must.
+ if (I + 1 != E)
+ IsMustAlias = false;
+ break;
}
- // If Alias found and only MustAlias found above, set Must bit.
- R = IsMustAlias ? setMust(R) : clearMust(R);
}
- return R;
+
+ if (isNoModRef(R))
+ return ModRefInfo::NoModRef;
+
+ // If MustAlias found above, set Must bit.
+ return IsMustAlias ? setMust(R) : clearMust(R);
}
// If CS1 only accesses memory through arguments, check if CS2 references
// any of the memory referenced by CS1's arguments. If not, return NoModRef.
if (onlyAccessesArgPointees(CS1B)) {
+ if (!doesAccessArgPointees(CS1B))
+ return ModRefInfo::NoModRef;
ModRefInfo R = ModRefInfo::NoModRef;
- if (doesAccessArgPointees(CS1B)) {
- bool IsMustAlias = true;
- for (auto I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {
- const Value *Arg = *I;
- if (!Arg->getType()->isPointerTy())
- continue;
- unsigned CS1ArgIdx = std::distance(CS1.arg_begin(), I);
- auto CS1ArgLoc = MemoryLocation::getForArgument(CS1, CS1ArgIdx, TLI);
-
- // ArgModRefCS1 indicates what CS1 might do to CS1ArgLoc; if CS1 might
- // Mod CS1ArgLoc, then we care about either a Mod or a Ref by CS2. If
- // CS1 might Ref, then we care only about a Mod by CS2.
- ModRefInfo ArgModRefCS1 = getArgModRefInfo(CS1, CS1ArgIdx);
- ModRefInfo ModRefCS2 = getModRefInfo(CS2, CS1ArgLoc);
- if ((isModSet(ArgModRefCS1) && isModOrRefSet(ModRefCS2)) ||
- (isRefSet(ArgModRefCS1) && isModSet(ModRefCS2)))
- R = intersectModRef(unionModRef(R, ArgModRefCS1), Result);
-
- // Conservatively clear IsMustAlias unless only MustAlias is found.
- IsMustAlias &= isMustSet(ModRefCS2);
-
- if (R == Result) {
- // On early exit, not all args were checked, cannot set Must.
- if (I + 1 != E)
- IsMustAlias = false;
- break;
- }
+ bool IsMustAlias = true;
+ for (auto I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {
+ const Value *Arg = *I;
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned CS1ArgIdx = std::distance(CS1.arg_begin(), I);
+ auto CS1ArgLoc = MemoryLocation::getForArgument(CS1, CS1ArgIdx, TLI);
+
+ // ArgModRefCS1 indicates what CS1 might do to CS1ArgLoc; if CS1 might
+ // Mod CS1ArgLoc, then we care about either a Mod or a Ref by CS2. If
+ // CS1 might Ref, then we care only about a Mod by CS2.
+ ModRefInfo ArgModRefCS1 = getArgModRefInfo(CS1, CS1ArgIdx);
+ ModRefInfo ModRefCS2 = getModRefInfo(CS2, CS1ArgLoc);
+ if ((isModSet(ArgModRefCS1) && isModOrRefSet(ModRefCS2)) ||
+ (isRefSet(ArgModRefCS1) && isModSet(ModRefCS2)))
+ R = intersectModRef(unionModRef(R, ArgModRefCS1), Result);
+
+ // Conservatively clear IsMustAlias unless only MustAlias is found.
+ IsMustAlias &= isMustSet(ModRefCS2);
+
+ if (R == Result) {
+ // On early exit, not all args were checked, cannot set Must.
+ if (I + 1 != E)
+ IsMustAlias = false;
+ break;
}
- // If Alias found and only MustAlias found above, set Must bit.
- R = IsMustAlias ? setMust(R) : clearMust(R);
}
- return R;
+
+ if (isNoModRef(R))
+ return ModRefInfo::NoModRef;
+
+ // If MustAlias found above, set Must bit.
+ return IsMustAlias ? setMust(R) : clearMust(R);
}
return Result;
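
The rewritten loops above rely on the ModRefInfo lattice helpers; as a standalone illustration of how the early exits compose (a sketch assuming the helper semantics declared in llvm/Analysis/AliasAnalysis.h for this release):

// unionModRef moves up toward ModRef, intersectModRef moves down toward NoModRef.
ModRefInfo R = ModRefInfo::NoModRef;
R = unionModRef(R, ModRefInfo::Ref);      // accumulate what one argument may do -> Ref
R = intersectModRef(R, ModRefInfo::Mod);  // clamp by the other call site -> NoModRef
if (isNoModRef(R))
  /* bottom of the lattice reached: the loops above return NoModRef early */;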
@@ -366,6 +372,24 @@ FunctionModRefBehavior AAResults::getModRefBehavior(const Function *F) {
return Result;
}
+raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) {
+ switch (AR) {
+ case NoAlias:
+ OS << "NoAlias";
+ break;
+ case MustAlias:
+ OS << "MustAlias";
+ break;
+ case MayAlias:
+ OS << "MayAlias";
+ break;
+ case PartialAlias:
+ OS << "PartialAlias";
+ break;
+ }
+ return OS;
+}
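
A trivial usage sketch for the new printer (AA, LocA and LocB are assumed to exist in the caller):

AliasResult AR = AA.alias(LocA, LocB);
errs() << "  alias result: " << AR << "\n"; // prints NoAlias/MayAlias/PartialAlias/MustAlias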
+
//===----------------------------------------------------------------------===//
// Helper method implementation
//===----------------------------------------------------------------------===//
@@ -515,7 +539,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW,
return ModRefInfo::ModRef;
}
-/// \brief Return information about whether a particular call site modifies
+/// Return information about whether a particular call site modifies
/// or reads the specified memory location \p MemLoc before instruction \p I
/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up
/// instruction-ordering queries inside the BasicBlock containing \p I.
@@ -548,7 +572,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
unsigned ArgNo = 0;
ModRefInfo R = ModRefInfo::NoModRef;
- bool MustAlias = true;
+ bool IsMustAlias = true;
// Set flag only if no May found and all operands processed.
for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
CI != CE; ++CI, ++ArgNo) {
@@ -566,7 +590,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
// assume that the call could touch the pointer, even though it doesn't
// escape.
if (AR != MustAlias)
- MustAlias = false;
+ IsMustAlias = false;
if (AR == NoAlias)
continue;
if (CS.doesNotAccessMemory(ArgNo))
@@ -578,7 +602,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
// Not returning MustModRef since we have not seen all the arguments.
return ModRefInfo::ModRef;
}
- return MustAlias ? setMust(R) : clearMust(R);
+ return IsMustAlias ? setMust(R) : clearMust(R);
}
/// canBasicBlockModify - Return true if it is possible for execution of the
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
index f737cecc43d1..764ae9160350 100644
--- a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -41,7 +41,7 @@ static cl::opt<bool> PrintMustModRef("print-mustmodref", cl::ReallyHidden);
static cl::opt<bool> EvalAAMD("evaluate-aa-metadata", cl::ReallyHidden);
-static void PrintResults(const char *Msg, bool P, const Value *V1,
+static void PrintResults(AliasResult AR, bool P, const Value *V1,
const Value *V2, const Module *M) {
if (PrintAll || P) {
std::string o1, o2;
@@ -50,18 +50,15 @@ static void PrintResults(const char *Msg, bool P, const Value *V1,
V1->printAsOperand(os1, true, M);
V2->printAsOperand(os2, true, M);
}
-
+
if (o2 < o1)
std::swap(o1, o2);
- errs() << " " << Msg << ":\t"
- << o1 << ", "
- << o2 << "\n";
+ errs() << " " << AR << ":\t" << o1 << ", " << o2 << "\n";
}
}
-static inline void
-PrintModRefResults(const char *Msg, bool P, Instruction *I, Value *Ptr,
- Module *M) {
+static inline void PrintModRefResults(const char *Msg, bool P, Instruction *I,
+ Value *Ptr, Module *M) {
if (PrintAll || P) {
errs() << " " << Msg << ": Ptr: ";
Ptr->printAsOperand(errs(), true, M);
@@ -69,21 +66,19 @@ PrintModRefResults(const char *Msg, bool P, Instruction *I, Value *Ptr,
}
}
-static inline void
-PrintModRefResults(const char *Msg, bool P, CallSite CSA, CallSite CSB,
- Module *M) {
+static inline void PrintModRefResults(const char *Msg, bool P, CallSite CSA,
+ CallSite CSB, Module *M) {
if (PrintAll || P) {
- errs() << " " << Msg << ": " << *CSA.getInstruction()
- << " <-> " << *CSB.getInstruction() << '\n';
+ errs() << " " << Msg << ": " << *CSA.getInstruction() << " <-> "
+ << *CSB.getInstruction() << '\n';
}
}
-static inline void
-PrintLoadStoreResults(const char *Msg, bool P, const Value *V1,
- const Value *V2, const Module *M) {
+static inline void PrintLoadStoreResults(AliasResult AR, bool P,
+ const Value *V1, const Value *V2,
+ const Module *M) {
if (PrintAll || P) {
- errs() << " " << Msg << ": " << *V1
- << " <-> " << *V2 << '\n';
+ errs() << " " << AR << ": " << *V1 << " <-> " << *V2 << '\n';
}
}
@@ -155,22 +150,22 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
Type *I2ElTy =cast<PointerType>((*I2)->getType())->getElementType();
if (I2ElTy->isSized()) I2Size = DL.getTypeStoreSize(I2ElTy);
- switch (AA.alias(*I1, I1Size, *I2, I2Size)) {
+ AliasResult AR = AA.alias(*I1, I1Size, *I2, I2Size);
+ switch (AR) {
case NoAlias:
- PrintResults("NoAlias", PrintNoAlias, *I1, *I2, F.getParent());
+ PrintResults(AR, PrintNoAlias, *I1, *I2, F.getParent());
++NoAliasCount;
break;
case MayAlias:
- PrintResults("MayAlias", PrintMayAlias, *I1, *I2, F.getParent());
+ PrintResults(AR, PrintMayAlias, *I1, *I2, F.getParent());
++MayAliasCount;
break;
case PartialAlias:
- PrintResults("PartialAlias", PrintPartialAlias, *I1, *I2,
- F.getParent());
+ PrintResults(AR, PrintPartialAlias, *I1, *I2, F.getParent());
++PartialAliasCount;
break;
case MustAlias:
- PrintResults("MustAlias", PrintMustAlias, *I1, *I2, F.getParent());
+ PrintResults(AR, PrintMustAlias, *I1, *I2, F.getParent());
++MustAliasCount;
break;
}
@@ -181,26 +176,23 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
// iterate over all pairs of load, store
for (Value *Load : Loads) {
for (Value *Store : Stores) {
- switch (AA.alias(MemoryLocation::get(cast<LoadInst>(Load)),
- MemoryLocation::get(cast<StoreInst>(Store)))) {
+ AliasResult AR = AA.alias(MemoryLocation::get(cast<LoadInst>(Load)),
+ MemoryLocation::get(cast<StoreInst>(Store)));
+ switch (AR) {
case NoAlias:
- PrintLoadStoreResults("NoAlias", PrintNoAlias, Load, Store,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintNoAlias, Load, Store, F.getParent());
++NoAliasCount;
break;
case MayAlias:
- PrintLoadStoreResults("MayAlias", PrintMayAlias, Load, Store,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintMayAlias, Load, Store, F.getParent());
++MayAliasCount;
break;
case PartialAlias:
- PrintLoadStoreResults("PartialAlias", PrintPartialAlias, Load, Store,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintPartialAlias, Load, Store, F.getParent());
++PartialAliasCount;
break;
case MustAlias:
- PrintLoadStoreResults("MustAlias", PrintMustAlias, Load, Store,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintMustAlias, Load, Store, F.getParent());
++MustAliasCount;
break;
}
@@ -211,26 +203,23 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
for (SetVector<Value *>::iterator I1 = Stores.begin(), E = Stores.end();
I1 != E; ++I1) {
for (SetVector<Value *>::iterator I2 = Stores.begin(); I2 != I1; ++I2) {
- switch (AA.alias(MemoryLocation::get(cast<StoreInst>(*I1)),
- MemoryLocation::get(cast<StoreInst>(*I2)))) {
+ AliasResult AR = AA.alias(MemoryLocation::get(cast<StoreInst>(*I1)),
+ MemoryLocation::get(cast<StoreInst>(*I2)));
+ switch (AR) {
case NoAlias:
- PrintLoadStoreResults("NoAlias", PrintNoAlias, *I1, *I2,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintNoAlias, *I1, *I2, F.getParent());
++NoAliasCount;
break;
case MayAlias:
- PrintLoadStoreResults("MayAlias", PrintMayAlias, *I1, *I2,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintMayAlias, *I1, *I2, F.getParent());
++MayAliasCount;
break;
case PartialAlias:
- PrintLoadStoreResults("PartialAlias", PrintPartialAlias, *I1, *I2,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintPartialAlias, *I1, *I2, F.getParent());
++PartialAliasCount;
break;
case MustAlias:
- PrintLoadStoreResults("MustAlias", PrintMustAlias, *I1, *I2,
- F.getParent());
+ PrintLoadStoreResults(AR, PrintMustAlias, *I1, *I2, F.getParent());
++MustAliasCount;
break;
}
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisSummary.h b/contrib/llvm/lib/Analysis/AliasAnalysisSummary.h
index 51a85f4e7061..fb93a12420f8 100644
--- a/contrib/llvm/lib/Analysis/AliasAnalysisSummary.h
+++ b/contrib/llvm/lib/Analysis/AliasAnalysisSummary.h
@@ -13,7 +13,7 @@
/// Summary-based analysis, also known as bottom-up analysis, is a style of
 /// interprocedural static analysis that tries to analyze the callees before the
/// callers get analyzed. The key idea of summary-based analysis is to first
-/// process each function indepedently, outline its behavior in a condensed
+/// process each function independently, outline its behavior in a condensed
/// summary, and then instantiate the summary at the callsite when the said
/// function is called elsewhere. This is often in contrast to another style
/// called top-down analysis, in which callers are always analyzed first before
diff --git a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
index c88e0dd7dc44..8aee81b1f1d8 100644
--- a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
+++ b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
@@ -14,6 +14,7 @@
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -126,7 +127,7 @@ void AliasSet::removeFromTracker(AliasSetTracker &AST) {
}
void AliasSet::addPointer(AliasSetTracker &AST, PointerRec &Entry,
- uint64_t Size, const AAMDNodes &AAInfo,
+ LocationSize Size, const AAMDNodes &AAInfo,
bool KnownMustAlias) {
assert(!Entry.hasAliasSet() && "Entry already in set!");
@@ -182,7 +183,7 @@ void AliasSet::addUnknownInst(Instruction *I, AliasAnalysis &AA) {
/// aliasesPointer - Return true if the specified pointer "may" (or must)
/// alias one of the members in the set.
///
-bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
+bool AliasSet::aliasesPointer(const Value *Ptr, LocationSize Size,
const AAMDNodes &AAInfo,
AliasAnalysis &AA) const {
if (AliasAny)
@@ -262,7 +263,7 @@ void AliasSetTracker::clear() {
/// alias the pointer. Return the unified set, or nullptr if no set that aliases
/// the pointer was found.
AliasSet *AliasSetTracker::mergeAliasSetsForPointer(const Value *Ptr,
- uint64_t Size,
+ LocationSize Size,
const AAMDNodes &AAInfo) {
AliasSet *FoundSet = nullptr;
for (iterator I = begin(), E = end(); I != E;) {
@@ -302,7 +303,8 @@ AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
/// getAliasSetForPointer - Return the alias set that the specified pointer
/// lives in.
-AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer, uint64_t Size,
+AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer,
+ LocationSize Size,
const AAMDNodes &AAInfo) {
AliasSet::PointerRec &Entry = getEntryFor(Pointer);
@@ -347,7 +349,8 @@ AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer, uint64_t Size,
return AliasSets.back();
}
-void AliasSetTracker::add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo) {
+void AliasSetTracker::add(Value *Ptr, LocationSize Size,
+ const AAMDNodes &AAInfo) {
addPointer(Ptr, Size, AAInfo, AliasSet::NoAccess);
}
@@ -386,7 +389,7 @@ void AliasSetTracker::add(VAArgInst *VAAI) {
AliasSet::ModRefAccess);
}
-void AliasSetTracker::add(MemSetInst *MSI) {
+void AliasSetTracker::add(AnyMemSetInst *MSI) {
AAMDNodes AAInfo;
MSI->getAAMetadata(AAInfo);
@@ -399,11 +402,12 @@ void AliasSetTracker::add(MemSetInst *MSI) {
AliasSet &AS =
addPointer(MSI->getRawDest(), Len, AAInfo, AliasSet::ModAccess);
- if (MSI->isVolatile())
+ auto *MS = dyn_cast<MemSetInst>(MSI);
+ if (MS && MS->isVolatile())
AS.setVolatile();
}
-void AliasSetTracker::add(MemTransferInst *MTI) {
+void AliasSetTracker::add(AnyMemTransferInst *MTI) {
AAMDNodes AAInfo;
MTI->getAAMetadata(AAInfo);
@@ -415,13 +419,15 @@ void AliasSetTracker::add(MemTransferInst *MTI) {
AliasSet &ASSrc =
addPointer(MTI->getRawSource(), Len, AAInfo, AliasSet::RefAccess);
- if (MTI->isVolatile())
- ASSrc.setVolatile();
AliasSet &ASDst =
addPointer(MTI->getRawDest(), Len, AAInfo, AliasSet::ModAccess);
- if (MTI->isVolatile())
+
+ auto* MT = dyn_cast<MemTransferInst>(MTI);
+ if (MT && MT->isVolatile()) {
+ ASSrc.setVolatile();
ASDst.setVolatile();
+ }
}
void AliasSetTracker::addUnknown(Instruction *Inst) {
@@ -461,9 +467,9 @@ void AliasSetTracker::add(Instruction *I) {
return add(SI);
if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
return add(VAAI);
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(I))
+ if (AnyMemSetInst *MSI = dyn_cast<AnyMemSetInst>(I))
return add(MSI);
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I))
+ if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(I))
return add(MTI);
return addUnknown(I);
}
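
The switch from MemSetInst/MemTransferInst to the AnyMem* classes above extends the tracker to the element-atomic memory intrinsics; isVolatile() only exists on the plain variants, which is why the code uses dyn_cast guards. A small illustrative sketch (the instruction I is assumed):

// Illustrative only: AnyMemTransferInst covers memcpy/memmove and their
// element-unordered-atomic forms; only the plain forms can be volatile.
if (auto *MTI = dyn_cast<AnyMemTransferInst>(I)) {
  bool IsVolatile = false;
  if (auto *MT = dyn_cast<MemTransferInst>(MTI)) // plain memcpy/memmove only
    IsVolatile = MT->isVolatile();
  (void)IsVolatile; // the atomic variants always reach here as non-volatile
}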
@@ -588,7 +594,7 @@ AliasSet &AliasSetTracker::mergeAllAliasSets() {
return *AliasAnyAS;
}
-AliasSet &AliasSetTracker::addPointer(Value *P, uint64_t Size,
+AliasSet &AliasSetTracker::addPointer(Value *P, LocationSize Size,
const AAMDNodes &AAInfo,
AliasSet::AccessLattice E) {
AliasSet &AS = getAliasSetForPointer(P, Size, AAInfo);
@@ -633,8 +639,12 @@ void AliasSet::print(raw_ostream &OS) const {
OS << "\n " << UnknownInsts.size() << " Unknown instructions: ";
for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
if (i) OS << ", ";
- if (auto *I = getUnknownInst(i))
- I->printAsOperand(OS);
+ if (auto *I = getUnknownInst(i)) {
+ if (I->hasName())
+ I->printAsOperand(OS);
+ else
+ I->print(OS);
+ }
}
}
OS << "\n";
diff --git a/contrib/llvm/lib/Analysis/Analysis.cpp b/contrib/llvm/lib/Analysis/Analysis.cpp
index 0e0b5c92a918..30576cf1ae10 100644
--- a/contrib/llvm/lib/Analysis/Analysis.cpp
+++ b/contrib/llvm/lib/Analysis/Analysis.cpp
@@ -65,8 +65,10 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeMemoryDependenceWrapperPassPass(Registry);
initializeModuleDebugInfoPrinterPass(Registry);
initializeModuleSummaryIndexWrapperPassPass(Registry);
+ initializeMustExecutePrinterPass(Registry);
initializeObjCARCAAWrapperPassPass(Registry);
initializeOptimizationRemarkEmitterWrapperPassPass(Registry);
+ initializePhiValuesWrapperPassPass(Registry);
initializePostDominatorTreeWrapperPassPass(Registry);
initializeRegionInfoPassPass(Registry);
initializeRegionViewerPass(Registry);
diff --git a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 537813b6b752..96326347b712 100644
--- a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -85,15 +85,15 @@ const unsigned MaxNumPhiBBsValueReachabilityCheck = 20;
// depth otherwise the algorithm in aliasGEP will assert.
static const unsigned MaxLookupSearchDepth = 6;
-bool BasicAAResult::invalidate(Function &F, const PreservedAnalyses &PA,
+bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv) {
// We don't care if this analysis itself is preserved, it has no state. But
// we need to check that the analyses it depends on have been. Note that we
// may be created without handles to some analyses and in that case don't
// depend on them.
- if (Inv.invalidate<AssumptionAnalysis>(F, PA) ||
- (DT && Inv.invalidate<DominatorTreeAnalysis>(F, PA)) ||
- (LI && Inv.invalidate<LoopAnalysis>(F, PA)))
+ if (Inv.invalidate<AssumptionAnalysis>(Fn, PA) ||
+ (DT && Inv.invalidate<DominatorTreeAnalysis>(Fn, PA)) ||
+ (LI && Inv.invalidate<LoopAnalysis>(Fn, PA)))
return true;
// Otherwise this analysis result remains valid.
@@ -132,7 +132,10 @@ static bool isNonEscapingLocalObject(const Value *V) {
/// Returns true if the pointer is one which would have been considered an
/// escape by isNonEscapingLocalObject.
static bool isEscapeSource(const Value *V) {
- if (isa<CallInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V))
+ if (ImmutableCallSite(V))
+ return true;
+
+ if (isa<Argument>(V))
return true;
// The load case works because isNonEscapingLocalObject considers all
@@ -147,10 +150,12 @@ static bool isEscapeSource(const Value *V) {
/// Returns the size of the object specified by V or UnknownSize if unknown.
static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
const TargetLibraryInfo &TLI,
+ bool NullIsValidLoc,
bool RoundToAlign = false) {
uint64_t Size;
ObjectSizeOpts Opts;
Opts.RoundToAlign = RoundToAlign;
+ Opts.NullIsUnknownSize = NullIsValidLoc;
if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
@@ -160,7 +165,8 @@ static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
/// Size.
static bool isObjectSmallerThan(const Value *V, uint64_t Size,
const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ bool NullIsValidLoc) {
// Note that the meanings of the "object" are slightly different in the
// following contexts:
// c1: llvm::getObjectSize()
@@ -192,15 +198,16 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
// This function needs to use the aligned object size because we allow
// reads a bit past the end given sufficient alignment.
- uint64_t ObjectSize = getObjectSize(V, DL, TLI, /*RoundToAlign*/ true);
+ uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc,
+ /*RoundToAlign*/ true);
return ObjectSize != MemoryLocation::UnknownSize && ObjectSize < Size;
}
/// Returns true if we can prove that the object specified by V has size Size.
static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
- uint64_t ObjectSize = getObjectSize(V, DL, TLI);
+ const TargetLibraryInfo &TLI, bool NullIsValidLoc) {
+ uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc);
return ObjectSize != MemoryLocation::UnknownSize && ObjectSize == Size;
}
@@ -285,6 +292,19 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
case Instruction::Shl:
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
+
+ // We're trying to linearize an expression of the kind:
+ // shl i8 -128, 36
+ // where the shift count exceeds the bitwidth of the type.
+ // We can't decompose this further (the expression would return
+ // a poison value).
+ if (Offset.getBitWidth() < RHS.getLimitedValue() ||
+ Scale.getBitWidth() < RHS.getLimitedValue()) {
+ Scale = 1;
+ Offset = 0;
+ return V;
+ }
+
Offset <<= RHS.getLimitedValue();
Scale <<= RHS.getLimitedValue();
// the semantics of nsw and nuw for left shifts don't match those of
@@ -414,11 +434,21 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
if (!GEPOp) {
- if (auto CS = ImmutableCallSite(V))
- if (const Value *RV = CS.getReturnedArgOperand()) {
- V = RV;
+ if (auto CS = ImmutableCallSite(V)) {
+ // CaptureTracking can know about special capturing properties of some
+ // intrinsics like launder.invariant.group that can't be expressed with
+ // the attributes but have properties like returning an aliasing pointer.
+ // Because some analyses may assume that a nocapture pointer is not
+ // returned from certain special intrinsics (since the function would have
+ // to be marked with the 'returned' attribute), it is crucial to use this
+ // function so that we stay in sync with CaptureTracking. Not using it may
+ // cause weird miscompilations where two aliasing pointers are assumed
+ // not to alias.
+ if (auto *RP = getArgumentAliasingToReturnedPointer(CS)) {
+ V = RP;
continue;
}
+ }
// If it's not a GEP, hand it off to SimplifyInstruction to see if it
// can come up with something. This matches what GetUnderlyingObject does.
@@ -490,6 +520,13 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
Index = GetLinearExpression(Index, IndexScale, IndexOffset, ZExtBits,
SExtBits, DL, 0, AC, DT, NSW, NUW);
+ // All GEP math happens in the width of the pointer type,
+ // so we can truncate the value to 64 bits, as we don't currently
+ // handle pointers larger than 64 bits and we would crash
+ // later. TODO: Make `Scale` an APInt to avoid this problem.
+ if (IndexScale.getBitWidth() > 64)
+ IndexScale = IndexScale.sextOrTrunc(64);
+
// The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
// This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
Decomposed.OtherOffset += IndexOffset.getSExtValue() * Scale;
@@ -832,8 +869,11 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
IsMustAlias = false;
// Early return if we improved mod ref information
- if (!isModAndRefSet(Result))
+ if (!isModAndRefSet(Result)) {
+ if (isNoModRef(Result))
+ return ModRefInfo::NoModRef;
return IsMustAlias ? setMust(Result) : clearMust(Result);
+ }
}
// If the CallSite is to malloc or calloc, we can assume that it doesn't
@@ -854,7 +894,7 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
// operands, i.e., source and destination of any given memcpy must no-alias.
// If Loc must-aliases either one of these two locations, then it necessarily
// no-aliases the other.
- if (auto *Inst = dyn_cast<MemCpyInst>(CS.getInstruction())) {
+ if (auto *Inst = dyn_cast<AnyMemCpyInst>(CS.getInstruction())) {
AliasResult SrcAA, DestAA;
if ((SrcAA = getBestAAResults().alias(MemoryLocation::getForSource(Inst),
@@ -958,12 +998,12 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS1,
/// Provide ad-hoc rules to disambiguate accesses through two GEP operators,
/// both having the exact same pointer operand.
static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
- uint64_t V1Size,
+ LocationSize V1Size,
const GEPOperator *GEP2,
- uint64_t V2Size,
+ LocationSize V2Size,
const DataLayout &DL) {
- assert(GEP1->getPointerOperand()->stripPointerCastsAndBarriers() ==
- GEP2->getPointerOperand()->stripPointerCastsAndBarriers() &&
+ assert(GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() ==
+ GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() &&
GEP1->getPointerOperandType() == GEP2->getPointerOperandType() &&
"Expected GEPs with the same pointer operand");
@@ -1135,8 +1175,8 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
// the highest %f1 can be is (%alloca + 3). This means %random can not be higher
// than (%alloca - 1), and so is not inbounds, a contradiction.
bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
- const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
- uint64_t ObjectAccessSize) {
+ const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
+ LocationSize ObjectAccessSize) {
// If the object access size is unknown, or the GEP isn't inbounds, bail.
if (ObjectAccessSize == MemoryLocation::UnknownSize || !GEPOp->isInBounds())
return false;
@@ -1153,13 +1193,13 @@ bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
DecompObject.OtherOffset;
// If the GEP has no variable indices, we know the precise offset
- // from the base, then use it. If the GEP has variable indices, we're in
- // a bit more trouble: we can't count on the constant offsets that come
- // from non-struct sources, since these can be "rewound" by a negative
- // variable offset. So use only offsets that came from structs.
+ // from the base, then use it. If the GEP has variable indices,
+ // we can't compute the exact GEP offset to determine pointer aliasing, so
+ // return false in that case.
+ if (!DecompGEP.VarIndices.empty())
+ return false;
int64_t GEPBaseOffset = DecompGEP.StructOffset;
- if (DecompGEP.VarIndices.empty())
- GEPBaseOffset += DecompGEP.OtherOffset;
+ GEPBaseOffset += DecompGEP.OtherOffset;
return (GEPBaseOffset >= ObjectBaseOffset + (int64_t)ObjectAccessSize);
}
@@ -1170,11 +1210,11 @@ bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
/// We know that V1 is a GEP, but we don't know anything about V2.
/// UnderlyingV1 is GetUnderlyingObject(GEP1, DL), UnderlyingV2 is the same for
/// V2.
-AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
- const AAMDNodes &V1AAInfo, const Value *V2,
- uint64_t V2Size, const AAMDNodes &V2AAInfo,
- const Value *UnderlyingV1,
- const Value *UnderlyingV2) {
+AliasResult
+BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
+ const AAMDNodes &V1AAInfo, const Value *V2,
+ LocationSize V2Size, const AAMDNodes &V2AAInfo,
+ const Value *UnderlyingV1, const Value *UnderlyingV2) {
DecomposedGEP DecompGEP1, DecompGEP2;
bool GEP1MaxLookupReached =
DecomposeGEPExpression(GEP1, DecompGEP1, DL, &AC, DT);
@@ -1241,8 +1281,8 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// If we know the two GEPs are based off of the exact same pointer (and not
// just the same underlying object), see if that tells us anything about
// the resulting pointers.
- if (GEP1->getPointerOperand()->stripPointerCastsAndBarriers() ==
- GEP2->getPointerOperand()->stripPointerCastsAndBarriers() &&
+ if (GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() ==
+ GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() &&
GEP1->getPointerOperandType() == GEP2->getPointerOperandType()) {
AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL);
// If we couldn't find anything interesting, don't abandon just yet.
@@ -1403,9 +1443,10 @@ static AliasResult MergeAliasResults(AliasResult A, AliasResult B) {
/// Provides a bunch of ad-hoc rules to disambiguate a Select instruction
/// against another.
-AliasResult BasicAAResult::aliasSelect(const SelectInst *SI, uint64_t SISize,
+AliasResult BasicAAResult::aliasSelect(const SelectInst *SI,
+ LocationSize SISize,
const AAMDNodes &SIAAInfo,
- const Value *V2, uint64_t V2Size,
+ const Value *V2, LocationSize V2Size,
const AAMDNodes &V2AAInfo,
const Value *UnderV2) {
// If the values are Selects with the same condition, we can do a more precise
@@ -1438,9 +1479,10 @@ AliasResult BasicAAResult::aliasSelect(const SelectInst *SI, uint64_t SISize,
/// Provide a bunch of ad-hoc rules to disambiguate a PHI instruction against
/// another.
-AliasResult BasicAAResult::aliasPHI(const PHINode *PN, uint64_t PNSize,
+AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,
const AAMDNodes &PNAAInfo, const Value *V2,
- uint64_t V2Size, const AAMDNodes &V2AAInfo,
+ LocationSize V2Size,
+ const AAMDNodes &V2AAInfo,
const Value *UnderV2) {
// Track phi nodes we have visited. We use this information when we determine
// value equivalence.
@@ -1545,9 +1587,9 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, uint64_t PNSize,
/// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as
/// array references.
-AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
+AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
AAMDNodes V1AAInfo, const Value *V2,
- uint64_t V2Size, AAMDNodes V2AAInfo,
+ LocationSize V2Size, AAMDNodes V2AAInfo,
const Value *O1, const Value *O2) {
// If either of the memory references is empty, it doesn't matter what the
// pointer values are.
@@ -1555,8 +1597,8 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
return NoAlias;
// Strip off any casts if they exist.
- V1 = V1->stripPointerCastsAndBarriers();
- V2 = V2->stripPointerCastsAndBarriers();
+ V1 = V1->stripPointerCastsAndInvariantGroups();
+ V2 = V2->stripPointerCastsAndInvariantGroups();
// If V1 or V2 is undef, the result is NoAlias because we can always pick a
// value for undef that aliases nothing in the program.
@@ -1585,10 +1627,10 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
// Null values in the default address space don't point to any object, so they
// don't alias any other pointer.
if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O1))
- if (CPN->getType()->getAddressSpace() == 0)
+ if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace()))
return NoAlias;
if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O2))
- if (CPN->getType()->getAddressSpace() == 0)
+ if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace()))
return NoAlias;
if (O1 != O2) {
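Illustration for the hunk above (the attribute spelling is an assumption, not shown in this diff): with the change, a null pointer is only treated as NoAlias when null is not a defined address for the function. For example, in a function such as

    define i8 @f(i8* %p) "null-pointer-is-valid"="true" { ... }

address 0 is a legitimate location, so a comparison against a ConstantPointerNull can no longer be dismissed as NoAlias.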
@@ -1624,10 +1666,11 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
// If the size of one access is larger than the entire object on the other
// side, then we know such behavior is undefined and can assume no alias.
+ bool NullIsValidLocation = NullPointerIsDefined(&F);
if ((V1Size != MemoryLocation::UnknownSize &&
- isObjectSmallerThan(O2, V1Size, DL, TLI)) ||
+ isObjectSmallerThan(O2, V1Size, DL, TLI, NullIsValidLocation)) ||
(V2Size != MemoryLocation::UnknownSize &&
- isObjectSmallerThan(O1, V2Size, DL, TLI)))
+ isObjectSmallerThan(O1, V2Size, DL, TLI, NullIsValidLocation)))
return NoAlias;
// Check the cache before climbing up use-def chains. This also terminates
@@ -1687,8 +1730,8 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
if (O1 == O2)
if (V1Size != MemoryLocation::UnknownSize &&
V2Size != MemoryLocation::UnknownSize &&
- (isObjectSize(O1, V1Size, DL, TLI) ||
- isObjectSize(O2, V2Size, DL, TLI)))
+ (isObjectSize(O1, V1Size, DL, TLI, NullIsValidLocation) ||
+ isObjectSize(O2, V2Size, DL, TLI, NullIsValidLocation)))
return AliasCache[Locs] = PartialAlias;
// Recurse back into the best AA results we have, potentially with refined
@@ -1771,8 +1814,8 @@ void BasicAAResult::GetIndexDifference(
}
bool BasicAAResult::constantOffsetHeuristic(
- const SmallVectorImpl<VariableGEPIndex> &VarIndices, uint64_t V1Size,
- uint64_t V2Size, int64_t BaseOffset, AssumptionCache *AC,
+ const SmallVectorImpl<VariableGEPIndex> &VarIndices, LocationSize V1Size,
+ LocationSize V2Size, int64_t BaseOffset, AssumptionCache *AC,
DominatorTree *DT) {
if (VarIndices.size() != 2 || V1Size == MemoryLocation::UnknownSize ||
V2Size == MemoryLocation::UnknownSize)
@@ -1832,6 +1875,7 @@ AnalysisKey BasicAA::Key;
BasicAAResult BasicAA::run(Function &F, FunctionAnalysisManager &AM) {
return BasicAAResult(F.getParent()->getDataLayout(),
+ F,
AM.getResult<TargetLibraryAnalysis>(F),
AM.getResult<AssumptionAnalysis>(F),
&AM.getResult<DominatorTreeAnalysis>(F),
@@ -1864,7 +1908,7 @@ bool BasicAAWrapperPass::runOnFunction(Function &F) {
auto &DTWP = getAnalysis<DominatorTreeWrapperPass>();
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
- Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), TLIWP.getTLI(),
+ Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), F, TLIWP.getTLI(),
ACT.getAssumptionCache(F), &DTWP.getDomTree(),
LIWP ? &LIWP->getLoopInfo() : nullptr));
@@ -1881,6 +1925,7 @@ void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
BasicAAResult llvm::createLegacyPMBasicAAResult(Pass &P, Function &F) {
return BasicAAResult(
F.getParent()->getDataLayout(),
+ F,
P.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
P.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
}
diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 7e323022d9ce..3d095068e7ff 100644
--- a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
@@ -73,7 +74,7 @@ using LoopData = BlockFrequencyInfoImplBase::LoopData;
using Weight = BlockFrequencyInfoImplBase::Weight;
using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData;
-/// \brief Dithering mass distributer.
+/// Dithering mass distributer.
///
/// This class splits up a single mass into portions by weight, dithering to
/// spread out error. No mass is lost. The dithering precision depends on the
@@ -155,9 +156,9 @@ static void combineWeight(Weight &W, const Weight &OtherW) {
static void combineWeightsBySorting(WeightList &Weights) {
// Sort so edges to the same node are adjacent.
- std::sort(Weights.begin(), Weights.end(),
- [](const Weight &L,
- const Weight &R) { return L.TargetNode < R.TargetNode; });
+ llvm::sort(Weights.begin(), Weights.end(),
+ [](const Weight &L,
+ const Weight &R) { return L.TargetNode < R.TargetNode; });
// Combine adjacent edges.
WeightList::iterator O = Weights.begin();
@@ -276,7 +277,7 @@ void BlockFrequencyInfoImplBase::clear() {
Loops.clear();
}
-/// \brief Clear all memory not needed downstream.
+/// Clear all memory not needed downstream.
///
/// Releases all memory not used downstream. In particular, saves Freqs.
static void cleanup(BlockFrequencyInfoImplBase &BFI) {
@@ -315,13 +316,13 @@ bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
#endif
if (isLoopHeader(Resolved)) {
- DEBUG(debugSuccessor("backedge"));
+ LLVM_DEBUG(debugSuccessor("backedge"));
Dist.addBackedge(Resolved, Weight);
return true;
}
if (Working[Resolved.Index].getContainingLoop() != OuterLoop) {
- DEBUG(debugSuccessor(" exit "));
+ LLVM_DEBUG(debugSuccessor(" exit "));
Dist.addExit(Resolved, Weight);
return true;
}
@@ -333,7 +334,7 @@ bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
"unhandled irreducible control flow");
// Irreducible backedge. Abort.
- DEBUG(debugSuccessor("abort!!!"));
+ LLVM_DEBUG(debugSuccessor("abort!!!"));
return false;
}
@@ -344,7 +345,7 @@ bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
"unhandled irreducible control flow");
}
- DEBUG(debugSuccessor(" local "));
+ LLVM_DEBUG(debugSuccessor(" local "));
Dist.addLocal(Resolved, Weight);
return true;
}
@@ -361,10 +362,10 @@ bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
return true;
}
-/// \brief Compute the loop scale for a loop.
+/// Compute the loop scale for a loop.
void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
// Compute loop scale.
- DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");
+ LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");
// Infinite loops need special handling. If we give the back edge an infinite
// mass, they may saturate all the other scales in the function down to 1,
@@ -390,20 +391,21 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
Loop.Scale =
ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse();
- DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
- << " - " << TotalBackedgeMass << ")\n"
- << " - scale = " << Loop.Scale << "\n");
+ LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " ("
+ << BlockMass::getFull() << " - " << TotalBackedgeMass
+ << ")\n"
+ << " - scale = " << Loop.Scale << "\n");
}
-/// \brief Package up a loop.
+/// Package up a loop.
void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) {
- DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n");
+ LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n");
// Clear the subloop exits to prevent quadratic memory usage.
for (const BlockNode &M : Loop.Nodes) {
if (auto *Loop = Working[M.Index].getPackagedLoop())
Loop->Exits.clear();
- DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
+ LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
}
Loop.IsPackaged = true;
}
@@ -425,7 +427,7 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
LoopData *OuterLoop,
Distribution &Dist) {
BlockMass Mass = Working[Source.Index].getMass();
- DEBUG(dbgs() << " => mass: " << Mass << "\n");
+ LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n");
// Distribute mass to successors as laid out in Dist.
DitheringDistributer D(Dist, Mass);
@@ -435,7 +437,7 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
BlockMass Taken = D.takeMass(W.Amount);
if (W.Type == Weight::Local) {
Working[W.TargetNode.Index].getMass() += Taken;
- DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+ LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
continue;
}
@@ -445,14 +447,14 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
// Check for a backedge.
if (W.Type == Weight::Backedge) {
OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken;
- DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
+ LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
continue;
}
// This must be an exit.
assert(W.Type == Weight::Exit);
OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken));
- DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
+ LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
}
}
@@ -480,28 +482,28 @@ static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
}
// Translate the floats to integers.
- DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
- << ", factor = " << ScalingFactor << "\n");
+ LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
+ << ", factor = " << ScalingFactor << "\n");
for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
- DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
- << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled
- << ", int = " << BFI.Freqs[Index].Integer << "\n");
+ LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
+ << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled
+ << ", int = " << BFI.Freqs[Index].Integer << "\n");
}
}
-/// \brief Unwrap a loop package.
+/// Unwrap a loop package.
///
/// Visits all the members of a loop, adjusting their BlockData according to
/// the loop's pseudo-node.
static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
- DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop)
- << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale
- << "\n");
+ LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop)
+ << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale
+ << "\n");
Loop.Scale *= Loop.Mass.toScaled();
Loop.IsPackaged = false;
- DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n");
+ LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n");
// Propagate the head scale through the loop. Since members are visited in
// RPO, the head scale will be updated by the loop scale first, and then the
@@ -511,8 +513,8 @@ static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
: BFI.Freqs[N.Index].Scaled;
Scaled64 New = Loop.Scale * F;
- DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New
- << "\n");
+ LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => "
+ << New << "\n");
F = New;
}
}
@@ -544,7 +546,7 @@ void BlockFrequencyInfoImplBase::finalizeMetrics() {
cleanup(*this);
// Print out the final stats.
- DEBUG(dump());
+ LLVM_DEBUG(dump());
}
BlockFrequency
@@ -567,7 +569,7 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F,
if (!EntryCount)
return None;
// Use 128 bit APInt to do the arithmetic to avoid overflow.
- APInt BlockCount(128, EntryCount.getValue());
+ APInt BlockCount(128, EntryCount.getCount());
APInt BlockFreq(128, Freq);
APInt EntryFreq(128, getEntryFreq());
BlockCount *= BlockFreq;
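Sketch of the arithmetic this hunk feeds into (the final division happens just below this hunk and is assumed here, not shown):

    BlockCount = EntryCount * Freq / EntryFreq

carried out in 128-bit APInts so the intermediate product EntryCount * Freq cannot overflow 64 bits.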
@@ -669,7 +671,7 @@ template <> struct GraphTraits<IrreducibleGraph> {
} // end namespace llvm
-/// \brief Find extra irreducible headers.
+/// Find extra irreducible headers.
///
/// Find entry blocks and other blocks with backedges, which exist when \c G
/// contains irreducible sub-SCCs.
@@ -694,7 +696,8 @@ static void findIrreducibleHeaders(
// This is an entry block.
I->second = true;
Headers.push_back(Irr.Node);
- DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) << "\n");
+ LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node)
+ << "\n");
break;
}
}
@@ -702,7 +705,7 @@ static void findIrreducibleHeaders(
"Expected irreducible CFG; -loop-info is likely invalid");
if (Headers.size() == InSCC.size()) {
// Every block is a header.
- std::sort(Headers.begin(), Headers.end());
+ llvm::sort(Headers.begin(), Headers.end());
return;
}
@@ -725,7 +728,8 @@ static void findIrreducibleHeaders(
// Store the extra header.
Headers.push_back(Irr.Node);
- DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) << "\n");
+ LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node)
+ << "\n");
break;
}
if (Headers.back() == Irr.Node)
@@ -734,10 +738,10 @@ static void findIrreducibleHeaders(
// This is not a header.
Others.push_back(Irr.Node);
- DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n");
+ LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n");
}
- std::sort(Headers.begin(), Headers.end());
- std::sort(Others.begin(), Others.end());
+ llvm::sort(Headers.begin(), Headers.end());
+ llvm::sort(Others.begin(), Others.end());
}
static void createIrreducibleLoop(
@@ -745,7 +749,7 @@ static void createIrreducibleLoop(
LoopData *OuterLoop, std::list<LoopData>::iterator Insert,
const std::vector<const IrreducibleGraph::IrrNode *> &SCC) {
// Translate the SCC into RPO.
- DEBUG(dbgs() << " - found-scc\n");
+ LLVM_DEBUG(dbgs() << " - found-scc\n");
LoopData::NodeList Headers;
LoopData::NodeList Others;
@@ -806,27 +810,28 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
BlockMass LoopMass = BlockMass::getFull();
Distribution Dist;
- DEBUG(dbgs() << "adjust-loop-header-mass:\n");
+ LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n");
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
auto &HeaderNode = Loop.Nodes[H];
auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)];
- DEBUG(dbgs() << " - Add back edge mass for node "
- << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n");
+ LLVM_DEBUG(dbgs() << " - Add back edge mass for node "
+ << getBlockName(HeaderNode) << ": " << BackedgeMass
+ << "\n");
if (BackedgeMass.getMass() > 0)
Dist.addLocal(HeaderNode, BackedgeMass.getMass());
else
- DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n");
+ LLVM_DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n");
}
DitheringDistributer D(Dist, LoopMass);
- DEBUG(dbgs() << " Distribute loop mass " << LoopMass
- << " to headers using above weights\n");
+ LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass
+ << " to headers using above weights\n");
for (const Weight &W : Dist.Weights) {
BlockMass Taken = D.takeMass(W.Amount);
assert(W.Type == Weight::Local && "all weights should be local");
Working[W.TargetNode.Index].getMass() = Taken;
- DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+ LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
}
}
@@ -837,6 +842,6 @@ void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist)
BlockMass Taken = D.takeMass(W.Amount);
assert(W.Type == Weight::Local && "all weights should be local");
Working[W.TargetNode.Index].getMass() = Taken;
- DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+ LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
}
}
diff --git a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index 58ccad89d508..54a657073f0f 100644
--- a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
@@ -85,15 +86,17 @@ char BranchProbabilityInfoWrapperPass::ID = 0;
// Probability of the edge BB2->BB3 = 4 / (124 + 4) = 0.03125
static const uint32_t LBH_TAKEN_WEIGHT = 124;
static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
+// Unlikely edges within a loop are half as likely as other edges
+static const uint32_t LBH_UNLIKELY_WEIGHT = 62;
-/// \brief Unreachable-terminating branch taken probability.
+/// Unreachable-terminating branch taken probability.
///
/// This is the probability for a branch being taken to a block that terminates
/// (eventually) in unreachable. These are predicted as unlikely as possible.
/// All reachable probability will equally share the remaining part.
static const BranchProbability UR_TAKEN_PROB = BranchProbability::getRaw(1);
-/// \brief Weight for a branch taken going into a cold block.
+/// Weight for a branch taken going into a cold block.
///
/// This is the weight for a branch taken toward a block marked
/// cold. A block is marked cold if it's postdominated by a
@@ -101,7 +104,7 @@ static const BranchProbability UR_TAKEN_PROB = BranchProbability::getRaw(1);
/// are those marked with attribute 'cold'.
static const uint32_t CC_TAKEN_WEIGHT = 4;
-/// \brief Weight for a branch not-taken into a cold block.
+/// Weight for a branch not-taken into a cold block.
///
/// This is the weight for a branch not taken toward a block marked
/// cold.
@@ -116,20 +119,20 @@ static const uint32_t ZH_NONTAKEN_WEIGHT = 12;
static const uint32_t FPH_TAKEN_WEIGHT = 20;
static const uint32_t FPH_NONTAKEN_WEIGHT = 12;
-/// \brief Invoke-terminating normal branch taken weight
+/// Invoke-terminating normal branch taken weight
///
/// This is the weight for branching to the normal destination of an invoke
/// instruction. We expect this to happen most of the time. Set the weight to an
/// absurdly high value so that nested loops subsume it.
static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1;
-/// \brief Invoke-terminating normal branch not-taken weight.
+/// Invoke-terminating normal branch not-taken weight.
///
/// This is the weight for branching to the unwind destination of an invoke
/// instruction. This is essentially never taken.
static const uint32_t IH_NONTAKEN_WEIGHT = 1;
-/// \brief Add \p BB to PostDominatedByUnreachable set if applicable.
+/// Add \p BB to PostDominatedByUnreachable set if applicable.
void
BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
const TerminatorInst *TI = BB->getTerminator();
@@ -160,7 +163,7 @@ BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
PostDominatedByUnreachable.insert(BB);
}
-/// \brief Add \p BB to PostDominatedByColdCall set if applicable.
+/// Add \p BB to PostDominatedByColdCall set if applicable.
void
BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
assert(!PostDominatedByColdCall.count(BB));
@@ -194,18 +197,16 @@ BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
}
}
-/// \brief Calculate edge weights for successors lead to unreachable.
+/// Calculate edge weights for successors lead to unreachable.
///
/// Predict that a successor which leads necessarily to an
/// unreachable-terminated block as extremely unlikely.
bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
const TerminatorInst *TI = BB->getTerminator();
+ (void) TI;
assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
-
- // Return false here so that edge weights for InvokeInst could be decided
- // in calcInvokeHeuristics().
- if (isa<InvokeInst>(TI))
- return false;
+ assert(!isa<InvokeInst>(TI) &&
+ "Invokes should have already been handled by calcInvokeHeuristics");
SmallVector<unsigned, 4> UnreachableEdges;
SmallVector<unsigned, 4> ReachableEdges;
@@ -338,7 +339,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
return true;
}
-/// \brief Calculate edge weights for edges leading to cold blocks.
+/// Calculate edge weights for edges leading to cold blocks.
///
/// A cold block is one post-dominated by a block with a call to a
/// cold function. Those edges are unlikely to be taken, so we give
@@ -348,12 +349,10 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
/// Return false, otherwise.
bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
const TerminatorInst *TI = BB->getTerminator();
+ (void) TI;
assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
-
- // Return false here so that edge weights for InvokeInst could be decided
- // in calcInvokeHeuristics().
- if (isa<InvokeInst>(TI))
- return false;
+ assert(!isa<InvokeInst>(TI) &&
+ "Invokes should have already been handled by calcInvokeHeuristics");
// Determine which successors are post-dominated by a cold block.
SmallVector<unsigned, 4> ColdEdges;
@@ -390,7 +389,7 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
return true;
}
-// Calculate Edge Weights using "Pointer Heuristics". Predict a comparsion
+// Calculate Edge Weights using "Pointer Heuristics". Predict a comparison
// between two pointer or pointer and NULL will fail.
bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) {
const BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
@@ -457,6 +456,113 @@ static bool isSCCHeader(const BasicBlock *BB, int SccNum,
return HeaderMapIt->second;
}
+// Compute the unlikely successors to the block BB in the loop L, specifically
+// those that are unlikely because this is a loop, and add them to the
+// UnlikelyBlocks set.
+static void
+computeUnlikelySuccessors(const BasicBlock *BB, Loop *L,
+ SmallPtrSetImpl<const BasicBlock*> &UnlikelyBlocks) {
+ // Sometimes in a loop we have a branch whose condition is made false by
+ // taking it. This is typically something like
+ // int n = 0;
+ // while (...) {
+ // if (++n >= MAX) {
+ // n = 0;
+ // }
+ // }
+ // In this sort of situation taking the branch means that at the very least it
+ // won't be taken again in the next iteration of the loop, so we should
+ // consider it less likely than a typical branch.
+ //
+ // We detect this by looking back through the graph of PHI nodes that sets the
+ // value that the condition depends on, and seeing if we can reach a successor
+ // block which can be determined to make the condition false.
+ //
+ // FIXME: We currently consider unlikely blocks to be half as likely as other
+ // blocks, but if we consider the example above the likelihood is actually
+ // 1/MAX. We could therefore be more precise in how unlikely we consider
+ // blocks to be, but it would require more careful examination of the form
+ // of the comparison expression.
+ const BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return;
+
+ // Check if the branch is based on an instruction compared with a constant
+ CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+ if (!CI || !isa<Instruction>(CI->getOperand(0)) ||
+ !isa<Constant>(CI->getOperand(1)))
+ return;
+
+ // Either the instruction must be a PHI, or a chain of operations involving
+ // constants that ends in a PHI which we can then collapse into a single value
+ // if the PHI value is known.
+ Instruction *CmpLHS = dyn_cast<Instruction>(CI->getOperand(0));
+ PHINode *CmpPHI = dyn_cast<PHINode>(CmpLHS);
+ Constant *CmpConst = dyn_cast<Constant>(CI->getOperand(1));
+ // Collect the instructions until we hit a PHI
+ SmallVector<BinaryOperator *, 1> InstChain;
+ while (!CmpPHI && CmpLHS && isa<BinaryOperator>(CmpLHS) &&
+ isa<Constant>(CmpLHS->getOperand(1))) {
+ // Stop if the chain extends outside of the loop
+ if (!L->contains(CmpLHS))
+ return;
+ InstChain.push_back(cast<BinaryOperator>(CmpLHS));
+ CmpLHS = dyn_cast<Instruction>(CmpLHS->getOperand(0));
+ if (CmpLHS)
+ CmpPHI = dyn_cast<PHINode>(CmpLHS);
+ }
+ if (!CmpPHI || !L->contains(CmpPHI))
+ return;
+
+ // Trace the phi node to find all values that come from successors of BB
+ SmallPtrSet<PHINode*, 8> VisitedInsts;
+ SmallVector<PHINode*, 8> WorkList;
+ WorkList.push_back(CmpPHI);
+ VisitedInsts.insert(CmpPHI);
+ while (!WorkList.empty()) {
+ PHINode *P = WorkList.back();
+ WorkList.pop_back();
+ for (BasicBlock *B : P->blocks()) {
+ // Skip blocks that aren't part of the loop
+ if (!L->contains(B))
+ continue;
+ Value *V = P->getIncomingValueForBlock(B);
+ // If the source is a PHI add it to the work list if we haven't
+ // already visited it.
+ if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (VisitedInsts.insert(PN).second)
+ WorkList.push_back(PN);
+ continue;
+ }
+ // If this incoming value is a constant and B is a successor of BB, then
+ // we can constant-evaluate the compare to see if it makes the branch be
+ // taken or not.
+ Constant *CmpLHSConst = dyn_cast<Constant>(V);
+ if (!CmpLHSConst ||
+ std::find(succ_begin(BB), succ_end(BB), B) == succ_end(BB))
+ continue;
+ // First collapse InstChain
+ for (Instruction *I : llvm::reverse(InstChain)) {
+ CmpLHSConst = ConstantExpr::get(I->getOpcode(), CmpLHSConst,
+ cast<Constant>(I->getOperand(1)), true);
+ if (!CmpLHSConst)
+ break;
+ }
+ if (!CmpLHSConst)
+ continue;
+ // Now constant-evaluate the compare
+ Constant *Result = ConstantExpr::getCompare(CI->getPredicate(),
+ CmpLHSConst, CmpConst, true);
+ // If the result means we don't branch to the block then that block is
+ // unlikely.
+ if (Result &&
+ ((Result->isZeroValue() && B == BI->getSuccessor(0)) ||
+ (Result->isOneValue() && B == BI->getSuccessor(1))))
+ UnlikelyBlocks.insert(B);
+ }
+ }
+}
+
// Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges
// as taken, exiting edges as not-taken.
bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
@@ -470,15 +576,22 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
return false;
}
+ SmallPtrSet<const BasicBlock*, 8> UnlikelyBlocks;
+ if (L)
+ computeUnlikelySuccessors(BB, L, UnlikelyBlocks);
+
SmallVector<unsigned, 8> BackEdges;
SmallVector<unsigned, 8> ExitingEdges;
SmallVector<unsigned, 8> InEdges; // Edges from header to the loop.
+ SmallVector<unsigned, 8> UnlikelyEdges;
for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
// Use LoopInfo if we have it, otherwise fall-back to SCC info to catch
// irreducible loops.
if (L) {
- if (!L->contains(*I))
+ if (UnlikelyBlocks.count(*I) != 0)
+ UnlikelyEdges.push_back(I.getSuccessorIndex());
+ else if (!L->contains(*I))
ExitingEdges.push_back(I.getSuccessorIndex());
else if (L->getHeader() == *I)
BackEdges.push_back(I.getSuccessorIndex());
@@ -494,42 +607,46 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
}
}
- if (BackEdges.empty() && ExitingEdges.empty())
+ if (BackEdges.empty() && ExitingEdges.empty() && UnlikelyEdges.empty())
return false;
// Collect the sum of probabilities of back-edges/in-edges/exiting-edges, and
// normalize them so that they sum up to one.
- BranchProbability Probs[] = {BranchProbability::getZero(),
- BranchProbability::getZero(),
- BranchProbability::getZero()};
unsigned Denom = (BackEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) +
(InEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) +
+ (UnlikelyEdges.empty() ? 0 : LBH_UNLIKELY_WEIGHT) +
(ExitingEdges.empty() ? 0 : LBH_NONTAKEN_WEIGHT);
- if (!BackEdges.empty())
- Probs[0] = BranchProbability(LBH_TAKEN_WEIGHT, Denom);
- if (!InEdges.empty())
- Probs[1] = BranchProbability(LBH_TAKEN_WEIGHT, Denom);
- if (!ExitingEdges.empty())
- Probs[2] = BranchProbability(LBH_NONTAKEN_WEIGHT, Denom);
if (uint32_t numBackEdges = BackEdges.size()) {
- auto Prob = Probs[0] / numBackEdges;
+ BranchProbability TakenProb = BranchProbability(LBH_TAKEN_WEIGHT, Denom);
+ auto Prob = TakenProb / numBackEdges;
for (unsigned SuccIdx : BackEdges)
setEdgeProbability(BB, SuccIdx, Prob);
}
if (uint32_t numInEdges = InEdges.size()) {
- auto Prob = Probs[1] / numInEdges;
+ BranchProbability TakenProb = BranchProbability(LBH_TAKEN_WEIGHT, Denom);
+ auto Prob = TakenProb / numInEdges;
for (unsigned SuccIdx : InEdges)
setEdgeProbability(BB, SuccIdx, Prob);
}
if (uint32_t numExitingEdges = ExitingEdges.size()) {
- auto Prob = Probs[2] / numExitingEdges;
+ BranchProbability NotTakenProb = BranchProbability(LBH_NONTAKEN_WEIGHT,
+ Denom);
+ auto Prob = NotTakenProb / numExitingEdges;
for (unsigned SuccIdx : ExitingEdges)
setEdgeProbability(BB, SuccIdx, Prob);
}
+ if (uint32_t numUnlikelyEdges = UnlikelyEdges.size()) {
+ BranchProbability UnlikelyProb = BranchProbability(LBH_UNLIKELY_WEIGHT,
+ Denom);
+ auto Prob = UnlikelyProb / numUnlikelyEdges;
+ for (unsigned SuccIdx : UnlikelyEdges)
+ setEdgeProbability(BB, SuccIdx, Prob);
+ }
+
return true;
}
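Worked example for the weights above (illustrative only; assumes a conditional branch with exactly one back edge, one unlikely edge, and one exiting edge, and no in-edges):

    Denom    = LBH_TAKEN_WEIGHT + LBH_UNLIKELY_WEIGHT + LBH_NONTAKEN_WEIGHT
             = 124 + 62 + 4 = 190
    back     = 124/190 ~ 0.65
    unlikely =  62/190 ~ 0.33
    exiting  =   4/190 ~ 0.02

so an unlikely successor is predicted roughly half as often as the back edge but still far more often than an exit edge, which is the intent of LBH_UNLIKELY_WEIGHT.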
@@ -752,8 +869,7 @@ BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
if (I != Probs.end())
return I->second;
- return {1,
- static_cast<uint32_t>(std::distance(succ_begin(Src), succ_end(Src)))};
+ return {1, static_cast<uint32_t>(succ_size(Src))};
}
BranchProbability
@@ -788,8 +904,9 @@ void BranchProbabilityInfo::setEdgeProbability(const BasicBlock *Src,
BranchProbability Prob) {
Probs[std::make_pair(Src, IndexInSuccessors)] = Prob;
Handles.insert(BasicBlockCallbackVH(Src, this));
- DEBUG(dbgs() << "set edge " << Src->getName() << " -> " << IndexInSuccessors
- << " successor probability to " << Prob << "\n");
+ LLVM_DEBUG(dbgs() << "set edge " << Src->getName() << " -> "
+ << IndexInSuccessors << " successor probability to " << Prob
+ << "\n");
}
raw_ostream &
@@ -814,8 +931,8 @@ void BranchProbabilityInfo::eraseBlock(const BasicBlock *BB) {
void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
const TargetLibraryInfo *TLI) {
- DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
- << " ----\n\n");
+ LLVM_DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
+ << " ----\n\n");
LastF = &F; // Store the last function we ran on for printing.
assert(PostDominatedByUnreachable.empty());
assert(PostDominatedByColdCall.empty());
@@ -833,18 +950,19 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
if (Scc.size() == 1)
continue;
- DEBUG(dbgs() << "BPI: SCC " << SccNum << ":");
+ LLVM_DEBUG(dbgs() << "BPI: SCC " << SccNum << ":");
for (auto *BB : Scc) {
- DEBUG(dbgs() << " " << BB->getName());
+ LLVM_DEBUG(dbgs() << " " << BB->getName());
SccI.SccNums[BB] = SccNum;
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
// Walk the basic blocks in post-order so that we can build up state about
// the successors of a block iteratively.
for (auto BB : post_order(&F.getEntryBlock())) {
- DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Computing probabilities for " << BB->getName()
+ << "\n");
updatePostDominatedByUnreachable(BB);
updatePostDominatedByColdCall(BB);
// If there is no at least two successors, no sense to set probability.
@@ -852,6 +970,8 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
continue;
if (calcMetadataWeights(BB))
continue;
+ if (calcInvokeHeuristics(BB))
+ continue;
if (calcUnreachableHeuristics(BB))
continue;
if (calcColdCallHeuristics(BB))
@@ -864,7 +984,6 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
continue;
if (calcFloatingPointHeuristics(BB))
continue;
- calcInvokeHeuristics(BB);
}
PostDominatedByUnreachable.clear();
@@ -879,6 +998,10 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
void BranchProbabilityInfoWrapperPass::getAnalysisUsage(
AnalysisUsage &AU) const {
+ // We require DT so it's available when LI is available. The LI updating code
+ // asserts that DT is also present so if we don't make sure that we have DT
+ // here, that assert will trigger.
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
diff --git a/contrib/llvm/lib/Analysis/CFGPrinter.cpp b/contrib/llvm/lib/Analysis/CFGPrinter.cpp
index fb261755e5d1..fc25cef8ddca 100644
--- a/contrib/llvm/lib/Analysis/CFGPrinter.cpp
+++ b/contrib/llvm/lib/Analysis/CFGPrinter.cpp
@@ -22,6 +22,11 @@
#include "llvm/Support/FileSystem.h"
using namespace llvm;
+static cl::opt<std::string> CFGFuncName(
+ "cfg-func-name", cl::Hidden,
+ cl::desc("The name of a function (or its substring)"
+ " whose CFG is viewed/printed."));
+
namespace {
struct CFGViewerLegacyPass : public FunctionPass {
static char ID; // Pass identifcation, replacement for typeid
@@ -83,6 +88,8 @@ PreservedAnalyses CFGOnlyViewerPass::run(Function &F,
}
static void writeCFGToDotFile(Function &F, bool CFGOnly = false) {
+ if (!CFGFuncName.empty() && !F.getName().contains(CFGFuncName))
+ return;
std::string Filename = ("cfg." + F.getName() + ".dot").str();
errs() << "Writing '" << Filename << "'...";
@@ -162,6 +169,8 @@ PreservedAnalyses CFGOnlyPrinterPass::run(Function &F,
/// being a 'dot' and 'gv' program in your path.
///
void Function::viewCFG() const {
+ if (!CFGFuncName.empty() && !getName().contains(CFGFuncName))
+ return;
ViewGraph(this, "cfg" + getName());
}
@@ -171,6 +180,8 @@ void Function::viewCFG() const {
/// this can make the graph smaller.
///
void Function::viewCFGOnly() const {
+ if (!CFGFuncName.empty() && !getName().contains(CFGFuncName))
+ return;
ViewGraph(this, "cfg" + getName(), true);
}
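Usage sketch for the new option (the pass name is an assumption, not part of this diff): running something like 'opt -dot-cfg -cfg-func-name=hot_ input.ll -disable-output' now emits cfg.<name>.dot only for functions whose name contains the substring "hot_", and Function::viewCFG()/viewCFGOnly() are filtered the same way.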
diff --git a/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
index 076a2b205d00..194983418b08 100644
--- a/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -18,7 +18,7 @@
//
// The algorithm used here is based on recursive state machine matching scheme
// proposed in "Demand-driven alias analysis for C" by Xin Zheng and Radu
-// Rugina. The general idea is to extend the tranditional transitive closure
+// Rugina. The general idea is to extend the traditional transitive closure
// algorithm to perform CFL matching along the way: instead of recording
// "whether X is reachable from Y", we keep track of "whether X is reachable
// from Y at state Z", where the "state" field indicates where we are in the CFL
@@ -337,7 +337,7 @@ public:
FunctionInfo(const Function &, const SmallVectorImpl<Value *> &,
const ReachabilitySet &, const AliasAttrMap &);
- bool mayAlias(const Value *, uint64_t, const Value *, uint64_t) const;
+ bool mayAlias(const Value *, LocationSize, const Value *, LocationSize) const;
const AliasSummary &getAliasSummary() const { return Summary; }
};
@@ -395,7 +395,7 @@ populateAliasMap(DenseMap<const Value *, std::vector<OffsetValue>> &AliasMap,
}
// Sort AliasList for faster lookup
- std::sort(AliasList.begin(), AliasList.end());
+ llvm::sort(AliasList.begin(), AliasList.end());
}
}
@@ -479,7 +479,7 @@ static void populateExternalRelations(
}
// Remove duplicates in ExtRelations
- std::sort(ExtRelations.begin(), ExtRelations.end());
+ llvm::sort(ExtRelations.begin(), ExtRelations.end());
ExtRelations.erase(std::unique(ExtRelations.begin(), ExtRelations.end()),
ExtRelations.end());
}
@@ -516,9 +516,9 @@ CFLAndersAAResult::FunctionInfo::getAttrs(const Value *V) const {
}
bool CFLAndersAAResult::FunctionInfo::mayAlias(const Value *LHS,
- uint64_t LHSSize,
+ LocationSize LHSSize,
const Value *RHS,
- uint64_t RHSSize) const {
+ LocationSize RHSSize) const {
assert(LHS && RHS);
// Check if we've seen LHS and RHS before. Sometimes LHS or RHS can be created
@@ -645,7 +645,7 @@ static void processWorkListItem(const WorkListItem &Item, const CFLGraph &Graph,
// relations that are symmetric, we could actually cut the storage by half by
// sorting FromNode and ToNode before insertion happens.
- // The newly added value alias pair may pontentially generate more memory
+ // The newly added value alias pair may potentially generate more memory
// alias pairs. Check for them here.
auto FromNodeBelow = getNodeBelow(Graph, FromNode);
auto ToNodeBelow = getNodeBelow(Graph, ToNode);
@@ -855,8 +855,9 @@ AliasResult CFLAndersAAResult::query(const MemoryLocation &LocA,
if (!Fn) {
// The only times this is known to happen are when globals + InlineAsm are
// involved
- DEBUG(dbgs()
- << "CFLAndersAA: could not extract parent function information.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "CFLAndersAA: could not extract parent function information.\n");
return MayAlias;
}
} else {
diff --git a/contrib/llvm/lib/Analysis/CFLGraph.h b/contrib/llvm/lib/Analysis/CFLGraph.h
index e4e92864061f..86812009da7c 100644
--- a/contrib/llvm/lib/Analysis/CFLGraph.h
+++ b/contrib/llvm/lib/Analysis/CFLGraph.h
@@ -46,7 +46,7 @@
namespace llvm {
namespace cflaa {
-/// \brief The Program Expression Graph (PEG) of CFL analysis
+/// The Program Expression Graph (PEG) of CFL analysis
/// CFLGraph is auxiliary data structure used by CFL-based alias analysis to
/// describe flow-insensitive pointer-related behaviors. Given an LLVM function,
/// the main purpose of this graph is to abstract away unrelated facts and
@@ -154,7 +154,7 @@ public:
}
};
-///\brief A builder class used to create CFLGraph instance from a given function
+///A builder class used to create CFLGraph instance from a given function
/// The CFL-AA that uses this builder must provide its own type as a template
/// argument. This is necessary for interprocedural processing: CFLGraphBuilder
/// needs a way of obtaining the summary of other functions when callinsts are
@@ -423,17 +423,15 @@ template <typename CFLAA> class CFLGraphBuilder {
addNode(Inst);
// Check if Inst is a call to a library function that
- // allocates/deallocates
- // on the heap. Those kinds of functions do not introduce any aliases.
+ // allocates/deallocates on the heap. Those kinds of functions do not
+ // introduce any aliases.
// TODO: address other common library functions such as realloc(),
- // strdup(),
- // etc.
+ // strdup(), etc.
if (isMallocOrCallocLikeFn(Inst, &TLI) || isFreeCall(Inst, &TLI))
return;
// TODO: Add support for noalias args/all the other fun function
- // attributes
- // that we can tack on.
+ // attributes that we can tack on.
SmallVector<Function *, 4> Targets;
if (getPossibleTargets(CS, Targets))
if (tryInterproceduralAnalysis(CS, Targets))
@@ -515,14 +513,16 @@ template <typename CFLAA> class CFLGraphBuilder {
visitGEP(*GEPOp);
break;
}
+
case Instruction::PtrToInt: {
- auto *Ptr = CE->getOperand(0);
- addNode(Ptr, getAttrEscaped());
+ addNode(CE->getOperand(0), getAttrEscaped());
break;
}
- case Instruction::IntToPtr:
+
+ case Instruction::IntToPtr: {
addNode(CE, getAttrUnknown());
break;
+ }
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
@@ -535,48 +535,29 @@ template <typename CFLAA> class CFLGraphBuilder {
case Instruction::SIToFP:
case Instruction::FPToUI:
case Instruction::FPToSI: {
- auto *Src = CE->getOperand(0);
- addAssignEdge(Src, CE);
+ addAssignEdge(CE->getOperand(0), CE);
break;
}
+
case Instruction::Select: {
- auto *TrueVal = CE->getOperand(0);
- auto *FalseVal = CE->getOperand(1);
- addAssignEdge(TrueVal, CE);
- addAssignEdge(FalseVal, CE);
- break;
- }
- case Instruction::InsertElement: {
- auto *Vec = CE->getOperand(0);
- auto *Val = CE->getOperand(1);
- addAssignEdge(Vec, CE);
- addStoreEdge(Val, CE);
- break;
- }
- case Instruction::ExtractElement: {
- auto *Ptr = CE->getOperand(0);
- addLoadEdge(Ptr, CE);
+ addAssignEdge(CE->getOperand(1), CE);
+ addAssignEdge(CE->getOperand(2), CE);
break;
}
+
+ case Instruction::InsertElement:
case Instruction::InsertValue: {
- auto *Agg = CE->getOperand(0);
- auto *Val = CE->getOperand(1);
- addAssignEdge(Agg, CE);
- addStoreEdge(Val, CE);
+ addAssignEdge(CE->getOperand(0), CE);
+ addStoreEdge(CE->getOperand(1), CE);
break;
}
+
+ case Instruction::ExtractElement:
case Instruction::ExtractValue: {
- auto *Ptr = CE->getOperand(0);
- addLoadEdge(Ptr, CE);
- break;
- }
- case Instruction::ShuffleVector: {
- auto *From1 = CE->getOperand(0);
- auto *From2 = CE->getOperand(1);
- addAssignEdge(From1, CE);
- addAssignEdge(From2, CE);
+ addLoadEdge(CE->getOperand(0), CE);
break;
}
+
case Instruction::Add:
case Instruction::Sub:
case Instruction::FSub:
@@ -596,9 +577,11 @@ template <typename CFLAA> class CFLGraphBuilder {
case Instruction::AShr:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::ShuffleVector: {
addAssignEdge(CE->getOperand(0), CE);
addAssignEdge(CE->getOperand(1), CE);
break;
+ }
default:
llvm_unreachable("Unknown instruction type encountered!");
diff --git a/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
index eee6d26ba787..30ce13578e54 100644
--- a/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
@@ -276,8 +276,9 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA,
if (!MaybeFnA && !MaybeFnB) {
// The only times this is known to happen are when globals + InlineAsm are
// involved
- DEBUG(dbgs()
- << "CFLSteensAA: could not extract parent function information.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "CFLSteensAA: could not extract parent function information.\n");
return MayAlias;
}
diff --git a/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp b/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp
index ceff94756fe3..b325afb8e7c5 100644
--- a/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp
+++ b/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp
@@ -32,7 +32,7 @@
using namespace llvm;
-// Explicit template instantiations and specialization defininitions for core
+// Explicit template instantiations and specialization definitions for core
// template typedefs.
namespace llvm {
@@ -75,7 +75,7 @@ PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,
// If the CGSCC pass wasn't able to provide a valid updated SCC, the
// current SCC may simply need to be skipped if invalid.
if (UR.InvalidatedSCCs.count(C)) {
- DEBUG(dbgs() << "Skipping invalidated root or island SCC!\n");
+ LLVM_DEBUG(dbgs() << "Skipping invalidated root or island SCC!\n");
break;
}
// Check that we didn't miss any update scenario.
@@ -96,7 +96,7 @@ PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,
// ...getContext().yield();
}
- // Invaliadtion was handled after each pass in the above loop for the current
+ // Invalidation was handled after each pass in the above loop for the current
// SCC. Therefore, the remaining analysis results in the AnalysisManager are
// preserved. We mark this with a set so that we don't need to inspect each
// one individually.
@@ -353,7 +353,8 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
// Add the current SCC to the worklist as its shape has changed.
UR.CWorklist.insert(C);
- DEBUG(dbgs() << "Enqueuing the existing SCC in the worklist:" << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Enqueuing the existing SCC in the worklist:" << *C
+ << "\n");
SCC *OldC = C;
@@ -372,7 +373,7 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
// We need to propagate an invalidation call to all but the newly current SCC
// because the outer pass manager won't do that for us after splitting them.
// FIXME: We should accept a PreservedAnalysis from the CG updater so that if
- // there are preserved ananalyses we can avoid invalidating them here for
+ // there are preserved analyses we can avoid invalidating them here for
// split-off SCCs.
// We know however that this will preserve any FAM proxy so go ahead and mark
// that.
@@ -389,7 +390,7 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
assert(C != &NewC && "No need to re-visit the current SCC!");
assert(OldC != &NewC && "Already handled the original SCC!");
UR.CWorklist.insert(&NewC);
- DEBUG(dbgs() << "Enqueuing a newly formed SCC:" << NewC << "\n");
+ LLVM_DEBUG(dbgs() << "Enqueuing a newly formed SCC:" << NewC << "\n");
// Ensure new SCCs' function analyses are updated.
if (NeedFAMProxy)
@@ -514,8 +515,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
return false;
RC->removeOutgoingEdge(N, *TargetN);
- DEBUG(dbgs() << "Deleting outgoing edge from '" << N
- << "' to '" << TargetN << "'\n");
+ LLVM_DEBUG(dbgs() << "Deleting outgoing edge from '"
+ << N << "' to '" << TargetN << "'\n");
return true;
}),
DeadTargets.end());
@@ -546,8 +547,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
assert(NewRC != RC && "Should not encounter the current RefSCC further "
"in the postorder list of new RefSCCs.");
UR.RCWorklist.insert(NewRC);
- DEBUG(dbgs() << "Enqueuing a new RefSCC in the update worklist: "
- << *NewRC << "\n");
+ LLVM_DEBUG(dbgs() << "Enqueuing a new RefSCC in the update worklist: "
+ << *NewRC << "\n");
}
}
@@ -564,8 +565,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
assert(RC->isAncestorOf(TargetRC) &&
"Cannot potentially form RefSCC cycles here!");
RC->switchOutgoingEdgeToRef(N, *RefTarget);
- DEBUG(dbgs() << "Switch outgoing call edge to a ref edge from '" << N
- << "' to '" << *RefTarget << "'\n");
+ LLVM_DEBUG(dbgs() << "Switch outgoing call edge to a ref edge from '" << N
+ << "' to '" << *RefTarget << "'\n");
continue;
}
@@ -593,12 +594,12 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
assert(RC->isAncestorOf(TargetRC) &&
"Cannot potentially form RefSCC cycles here!");
RC->switchOutgoingEdgeToCall(N, *CallTarget);
- DEBUG(dbgs() << "Switch outgoing ref edge to a call edge from '" << N
- << "' to '" << *CallTarget << "'\n");
+ LLVM_DEBUG(dbgs() << "Switch outgoing ref edge to a call edge from '" << N
+ << "' to '" << *CallTarget << "'\n");
continue;
}
- DEBUG(dbgs() << "Switch an internal ref edge to a call edge from '" << N
- << "' to '" << *CallTarget << "'\n");
+ LLVM_DEBUG(dbgs() << "Switch an internal ref edge to a call edge from '"
+ << N << "' to '" << *CallTarget << "'\n");
// Otherwise we are switching an internal ref edge to a call edge. This
// may merge away some SCCs, and we add those to the UpdateResult. We also
@@ -635,7 +636,7 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// If one of the invalidated SCCs had a cached proxy to a function
// analysis manager, we need to create a proxy in the new current SCC as
- // the invaliadted SCCs had their functions moved.
+ // the invalidated SCCs had their functions moved.
if (HasFunctionAnalysisProxy)
AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, G);
@@ -661,14 +662,14 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// post-order sequence, and may end up observing more precise context to
// optimize the current SCC.
UR.CWorklist.insert(C);
- DEBUG(dbgs() << "Enqueuing the existing SCC in the worklist: " << *C
- << "\n");
+ LLVM_DEBUG(dbgs() << "Enqueuing the existing SCC in the worklist: " << *C
+ << "\n");
// Enqueue in reverse order as we pop off the back of the worklist.
for (SCC &MovedC : llvm::reverse(make_range(RC->begin() + InitialSCCIndex,
RC->begin() + NewSCCIndex))) {
UR.CWorklist.insert(&MovedC);
- DEBUG(dbgs() << "Enqueuing a newly earlier in post-order SCC: "
- << MovedC << "\n");
+ LLVM_DEBUG(dbgs() << "Enqueuing a newly earlier in post-order SCC: "
+ << MovedC << "\n");
}
}
}
diff --git a/contrib/llvm/lib/Analysis/CallGraph.cpp b/contrib/llvm/lib/Analysis/CallGraph.cpp
index ac3ea2b73fed..7d5d2d2e4496 100644
--- a/contrib/llvm/lib/Analysis/CallGraph.cpp
+++ b/contrib/llvm/lib/Analysis/CallGraph.cpp
@@ -10,6 +10,7 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Function.h"
@@ -96,8 +97,8 @@ void CallGraph::print(raw_ostream &OS) const {
for (const auto &I : *this)
Nodes.push_back(I.second.get());
- std::sort(Nodes.begin(), Nodes.end(),
- [](CallGraphNode *LHS, CallGraphNode *RHS) {
+ llvm::sort(Nodes.begin(), Nodes.end(),
+ [](CallGraphNode *LHS, CallGraphNode *RHS) {
if (Function *LF = LHS->getFunction())
if (Function *RF = RHS->getFunction())
return LF->getName() < RF->getName();
diff --git a/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp
index a2dda58a6a2f..f2211edba216 100644
--- a/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp
+++ b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp
@@ -120,6 +120,7 @@ bool CGPassManager::RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC,
bool &DevirtualizedCall) {
bool Changed = false;
PMDataManager *PM = P->getAsPMDataManager();
+ Module &M = CG.getModule();
if (!PM) {
CallGraphSCCPass *CGSP = (CallGraphSCCPass*)P;
@@ -129,8 +130,17 @@ bool CGPassManager::RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC,
}
{
+ unsigned InstrCount = 0;
+ bool EmitICRemark = M.shouldEmitInstrCountChangedRemark();
TimeRegion PassTimer(getPassTimer(CGSP));
+ if (EmitICRemark)
+ InstrCount = initSizeRemarkInfo(M);
Changed = CGSP->runOnSCC(CurSCC);
+
+ // If the pass modified the module, it may have modified the instruction
+ // count of the module. Try emitting a remark.
+ if (EmitICRemark)
+ emitInstrCountChangedRemark(P, M, InstrCount);
}
// After the CGSCCPass is done, when assertions are enabled, use
@@ -162,8 +172,8 @@ bool CGPassManager::RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC,
// The function pass(es) modified the IR, they may have clobbered the
// callgraph.
if (Changed && CallGraphUpToDate) {
- DEBUG(dbgs() << "CGSCCPASSMGR: Pass Dirtied SCC: "
- << P->getPassName() << '\n');
+ LLVM_DEBUG(dbgs() << "CGSCCPASSMGR: Pass Dirtied SCC: " << P->getPassName()
+ << '\n');
CallGraphUpToDate = false;
}
return Changed;
@@ -181,12 +191,11 @@ bool CGPassManager::RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC,
bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
bool CheckingMode) {
DenseMap<Value*, CallGraphNode*> CallSites;
-
- DEBUG(dbgs() << "CGSCCPASSMGR: Refreshing SCC with " << CurSCC.size()
- << " nodes:\n";
- for (CallGraphNode *CGN : CurSCC)
- CGN->dump();
- );
+
+ LLVM_DEBUG(dbgs() << "CGSCCPASSMGR: Refreshing SCC with " << CurSCC.size()
+ << " nodes:\n";
+ for (CallGraphNode *CGN
+ : CurSCC) CGN->dump(););
bool MadeChange = false;
bool DevirtualizedCall = false;
@@ -307,8 +316,8 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
// one.
if (!ExistingNode->getFunction()) {
DevirtualizedCall = true;
- DEBUG(dbgs() << " CGSCCPASSMGR: Devirtualized call to '"
- << Callee->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << " CGSCCPASSMGR: Devirtualized call to '"
+ << Callee->getName() << "'\n");
}
} else {
CalleeNode = CG.getCallsExternalNode();
@@ -363,17 +372,15 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
CallSites.clear();
}
- DEBUG(if (MadeChange) {
- dbgs() << "CGSCCPASSMGR: Refreshed SCC is now:\n";
- for (CallGraphNode *CGN : CurSCC)
- CGN->dump();
- if (DevirtualizedCall)
- dbgs() << "CGSCCPASSMGR: Refresh devirtualized a call!\n";
-
- } else {
- dbgs() << "CGSCCPASSMGR: SCC Refresh didn't change call graph.\n";
- }
- );
+ LLVM_DEBUG(if (MadeChange) {
+ dbgs() << "CGSCCPASSMGR: Refreshed SCC is now:\n";
+ for (CallGraphNode *CGN : CurSCC)
+ CGN->dump();
+ if (DevirtualizedCall)
+ dbgs() << "CGSCCPASSMGR: Refresh devirtualized a call!\n";
+ } else {
+ dbgs() << "CGSCCPASSMGR: SCC Refresh didn't change call graph.\n";
+ });
(void)MadeChange;
return DevirtualizedCall;
@@ -472,16 +479,17 @@ bool CGPassManager::runOnModule(Module &M) {
unsigned Iteration = 0;
bool DevirtualizedCall = false;
do {
- DEBUG(if (Iteration)
- dbgs() << " SCCPASSMGR: Re-visiting SCC, iteration #"
- << Iteration << '\n');
+ LLVM_DEBUG(if (Iteration) dbgs()
+ << " SCCPASSMGR: Re-visiting SCC, iteration #" << Iteration
+ << '\n');
DevirtualizedCall = false;
Changed |= RunAllPassesOnSCC(CurSCC, CG, DevirtualizedCall);
} while (Iteration++ < MaxIterations && DevirtualizedCall);
if (DevirtualizedCall)
- DEBUG(dbgs() << " CGSCCPASSMGR: Stopped iteration after " << Iteration
- << " times, due to -max-cg-scc-iterations\n");
+ LLVM_DEBUG(dbgs() << " CGSCCPASSMGR: Stopped iteration after "
+ << Iteration
+ << " times, due to -max-cg-scc-iterations\n");
MaxSCCIterations.updateMax(Iteration);
}
@@ -648,7 +656,7 @@ Pass *CallGraphSCCPass::createPrinterPass(raw_ostream &OS,
bool CallGraphSCCPass::skipSCC(CallGraphSCC &SCC) const {
return !SCC.getCallGraph().getModule()
.getContext()
- .getOptBisect()
+ .getOptPassGate()
.shouldRunPass(this, SCC);
}
diff --git a/contrib/llvm/lib/Analysis/CaptureTracking.cpp b/contrib/llvm/lib/Analysis/CaptureTracking.cpp
index 3b0026ba10e9..d4f73bdb4361 100644
--- a/contrib/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/contrib/llvm/lib/Analysis/CaptureTracking.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
@@ -60,7 +61,7 @@ namespace {
/// as the given instruction and the use.
struct CapturesBefore : public CaptureTracker {
- CapturesBefore(bool ReturnCaptures, const Instruction *I, DominatorTree *DT,
+ CapturesBefore(bool ReturnCaptures, const Instruction *I, const DominatorTree *DT,
bool IncludeI, OrderedBasicBlock *IC)
: OrderedBB(IC), BeforeHere(I), DT(DT),
ReturnCaptures(ReturnCaptures), IncludeI(IncludeI), Captured(false) {}
@@ -140,7 +141,7 @@ namespace {
OrderedBasicBlock *OrderedBB;
const Instruction *BeforeHere;
- DominatorTree *DT;
+ const DominatorTree *DT;
bool ReturnCaptures;
bool IncludeI;
@@ -184,7 +185,7 @@ bool llvm::PointerMayBeCaptured(const Value *V,
/// queries about relative order among instructions in the same basic block.
bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
bool StoreCaptures, const Instruction *I,
- DominatorTree *DT, bool IncludeI,
+ const DominatorTree *DT, bool IncludeI,
OrderedBasicBlock *OBB) {
assert(!isa<GlobalValue>(V) &&
"It doesn't make sense to ask whether a global is captured.");
@@ -215,18 +216,22 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
SmallVector<const Use *, Threshold> Worklist;
SmallSet<const Use *, Threshold> Visited;
- int Count = 0;
- for (const Use &U : V->uses()) {
- // If there are lots of uses, conservatively say that the value
- // is captured to avoid taking too much compile time.
- if (Count++ >= Threshold)
- return Tracker->tooManyUses();
-
- if (!Tracker->shouldExplore(&U)) continue;
- Visited.insert(&U);
- Worklist.push_back(&U);
- }
+ auto AddUses = [&](const Value *V) {
+ int Count = 0;
+ for (const Use &U : V->uses()) {
+ // If there are lots of uses, conservatively say that the value
+ // is captured to avoid taking too much compile time.
+ if (Count++ >= Threshold)
+ return Tracker->tooManyUses();
+ if (!Visited.insert(&U).second)
+ continue;
+ if (!Tracker->shouldExplore(&U))
+ continue;
+ Worklist.push_back(&U);
+ }
+ };
+ AddUses(V);
while (!Worklist.empty()) {
const Use *U = Worklist.pop_back_val();
@@ -243,6 +248,16 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
if (CS.onlyReadsMemory() && CS.doesNotThrow() && I->getType()->isVoidTy())
break;
+ // The pointer is not captured if returned pointer is not captured.
+ // NOTE: CaptureTracking users should not assume that only functions
+ // marked with nocapture do not capture. This means that places like
+ // GetUnderlyingObject in ValueTracking or DecomposeGEPExpression
+ // in BasicAA also need to know about this property.
+ if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CS)) {
+ AddUses(I);
+ break;
+ }
+
// Volatile operations effectively capture the memory location that they
// load and store to.
if (auto *MI = dyn_cast<MemIntrinsic>(I))
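Illustration of the new intrinsic case above (assuming launder.invariant.group is among the intrinsics matched by isIntrinsicReturningPointerAliasingArgumentWithoutCapturing; the diff does not list them): for a call such as %q = call i8* @llvm.launder.invariant.group.p0i8(i8* %p), the call itself no longer marks %p as captured; instead the uses of the returned %q are pushed onto the worklist via AddUses(I) and analyzed like any other uses of the pointer.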
@@ -313,17 +328,7 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
case Instruction::Select:
case Instruction::AddrSpaceCast:
// The original value is not captured via this if the new value isn't.
- Count = 0;
- for (Use &UU : I->uses()) {
- // If there are lots of uses, conservatively say that the value
- // is captured to avoid taking too much compile time.
- if (Count++ >= Threshold)
- return Tracker->tooManyUses();
-
- if (Visited.insert(&UU).second)
- if (Tracker->shouldExplore(&UU))
- Worklist.push_back(&UU);
- }
+ AddUses(I);
break;
case Instruction::ICmp: {
// Don't count comparisons of a no-alias return value against null as
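
The CaptureTracking change above folds two copies of the use-scanning loop into a single AddUses lambda that bounds how many uses are explored and de-duplicates them through the Visited set before pushing onto the worklist, so the same seeding can also be applied to intrinsics that return their pointer argument without capturing it. Below is a minimal standalone sketch of that bounded, deduplicating worklist pattern, assuming a toy integer use graph and an arbitrary Threshold value; it deliberately does not use LLVM's CaptureTracker API.

#include <cstdio>
#include <unordered_set>
#include <vector>

int main() {
  // Toy "use graph": node -> nodes reached by following its uses.
  std::vector<std::vector<int>> Uses = {{1, 2}, {3}, {3, 4}, {}, {0}};
  const int Threshold = 16;            // give up (conservatively) past this many uses
  bool TooManyUses = false;

  std::unordered_set<int> Visited;
  std::vector<int> Worklist;

  // Mirrors the AddUses lambda: bound the fan-out, skip already-visited uses.
  auto AddUses = [&](int N) {
    int Count = 0;
    for (int U : Uses[N]) {
      if (Count++ >= Threshold) { TooManyUses = true; return; }
      if (!Visited.insert(U).second)
        continue;                      // de-duplicate before pushing
      Worklist.push_back(U);
    }
  };

  AddUses(0);                          // seed from the root value
  while (!Worklist.empty() && !TooManyUses) {
    int N = Worklist.back();
    Worklist.pop_back();
    std::printf("visiting %d\n", N);
    AddUses(N);                        // e.g. look through casts/selects/PHIs
  }
}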
diff --git a/contrib/llvm/lib/Analysis/CodeMetrics.cpp b/contrib/llvm/lib/Analysis/CodeMetrics.cpp
index ac7d14ebdaea..46cc87d2b178 100644
--- a/contrib/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/contrib/llvm/lib/Analysis/CodeMetrics.cpp
@@ -61,7 +61,7 @@ static void completeEphemeralValues(SmallPtrSetImpl<const Value *> &Visited,
continue;
EphValues.insert(V);
- DEBUG(dbgs() << "Ephemeral Value: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Ephemeral Value: " << *V << "\n");
// Append any more operands to consider.
appendSpeculatableOperands(V, Visited, Worklist);
diff --git a/contrib/llvm/lib/Analysis/ConstantFolding.cpp b/contrib/llvm/lib/Analysis/ConstantFolding.cpp
index e88b8f14d54e..c5281c57bc19 100644
--- a/contrib/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/contrib/llvm/lib/Analysis/ConstantFolding.cpp
@@ -286,7 +286,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
APInt &Offset, const DataLayout &DL) {
// Trivial case, constant is the global.
if ((GV = dyn_cast<GlobalValue>(C))) {
- unsigned BitWidth = DL.getPointerTypeSizeInBits(GV->getType());
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(GV->getType());
Offset = APInt(BitWidth, 0);
return true;
}
@@ -305,7 +305,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
if (!GEP)
return false;
- unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType());
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt TmpOffset(BitWidth, 0);
// If the base isn't a global+constant, we aren't either.
@@ -320,6 +320,41 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
return true;
}
+Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
+ const DataLayout &DL) {
+ do {
+ Type *SrcTy = C->getType();
+
+ // If the type sizes are the same and a cast is legal, just directly
+ // cast the constant.
+ if (DL.getTypeSizeInBits(DestTy) == DL.getTypeSizeInBits(SrcTy)) {
+ Instruction::CastOps Cast = Instruction::BitCast;
+ // If we are going from a pointer to int or vice versa, we spell the cast
+ // differently.
+ if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+ Cast = Instruction::IntToPtr;
+ else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+ Cast = Instruction::PtrToInt;
+
+ if (CastInst::castIsValid(Cast, C, DestTy))
+ return ConstantExpr::getCast(Cast, C, DestTy);
+ }
+
+ // If this isn't an aggregate type, there is nothing we can do to drill down
+ // and find a bitcastable constant.
+ if (!SrcTy->isAggregateType())
+ return nullptr;
+
+ // We're simulating a load through a pointer that was bitcast to point to
+ // a different type, so we can try to walk down through the initial
+ // elements of an aggregate to see if some part of the aggregate is
+ // castable to implement the "load" semantic model.
+ C = C->getAggregateElement(0u);
+ } while (C);
+
+ return nullptr;
+}
+
namespace {
/// Recursive helper to read bits out of global. C is the constant being copied
@@ -537,8 +572,8 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
return ConstantInt::get(IntType->getContext(), ResultVal);
}
-Constant *ConstantFoldLoadThroughBitcast(ConstantExpr *CE, Type *DestTy,
- const DataLayout &DL) {
+Constant *ConstantFoldLoadThroughBitcastExpr(ConstantExpr *CE, Type *DestTy,
+ const DataLayout &DL) {
auto *SrcPtr = CE->getOperand(0);
auto *SrcPtrTy = dyn_cast<PointerType>(SrcPtr->getType());
if (!SrcPtrTy)
@@ -549,37 +584,7 @@ Constant *ConstantFoldLoadThroughBitcast(ConstantExpr *CE, Type *DestTy,
if (!C)
return nullptr;
- do {
- Type *SrcTy = C->getType();
-
- // If the type sizes are the same and a cast is legal, just directly
- // cast the constant.
- if (DL.getTypeSizeInBits(DestTy) == DL.getTypeSizeInBits(SrcTy)) {
- Instruction::CastOps Cast = Instruction::BitCast;
- // If we are going from a pointer to int or vice versa, we spell the cast
- // differently.
- if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
- Cast = Instruction::IntToPtr;
- else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
- Cast = Instruction::PtrToInt;
-
- if (CastInst::castIsValid(Cast, C, DestTy))
- return ConstantExpr::getCast(Cast, C, DestTy);
- }
-
- // If this isn't an aggregate type, there is nothing we can do to drill down
- // and find a bitcastable constant.
- if (!SrcTy->isAggregateType())
- return nullptr;
-
- // We're simulating a load through a pointer that was bitcast to point to
- // a different type, so we can try to walk down through the initial
- // elements of an aggregate to see if some part of the aggregate is
- // castable to implement the "load" semantic model.
- C = C->getAggregateElement(0u);
- } while (C);
-
- return nullptr;
+ return llvm::ConstantFoldLoadThroughBitcast(C, DestTy, DL);
}
} // end anonymous namespace
@@ -611,7 +616,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
}
if (CE->getOpcode() == Instruction::BitCast)
- if (Constant *LoadedC = ConstantFoldLoadThroughBitcast(CE, Ty, DL))
+ if (Constant *LoadedC = ConstantFoldLoadThroughBitcastExpr(CE, Ty, DL))
return LoadedC;
// Instead of loading constant c string, use corresponding integer value
@@ -808,26 +813,26 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
// If this is a constant expr gep that is effectively computing an
// "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
- if (!isa<ConstantInt>(Ops[i])) {
-
- // If this is "gep i8* Ptr, (sub 0, V)", fold this as:
- // "inttoptr (sub (ptrtoint Ptr), V)"
- if (Ops.size() == 2 && ResElemTy->isIntegerTy(8)) {
- auto *CE = dyn_cast<ConstantExpr>(Ops[1]);
- assert((!CE || CE->getType() == IntPtrTy) &&
- "CastGEPIndices didn't canonicalize index types!");
- if (CE && CE->getOpcode() == Instruction::Sub &&
- CE->getOperand(0)->isNullValue()) {
- Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType());
- Res = ConstantExpr::getSub(Res, CE->getOperand(1));
- Res = ConstantExpr::getIntToPtr(Res, ResTy);
- if (auto *FoldedRes = ConstantFoldConstant(Res, DL, TLI))
- Res = FoldedRes;
- return Res;
+ if (!isa<ConstantInt>(Ops[i])) {
+
+ // If this is "gep i8* Ptr, (sub 0, V)", fold this as:
+ // "inttoptr (sub (ptrtoint Ptr), V)"
+ if (Ops.size() == 2 && ResElemTy->isIntegerTy(8)) {
+ auto *CE = dyn_cast<ConstantExpr>(Ops[1]);
+ assert((!CE || CE->getType() == IntPtrTy) &&
+ "CastGEPIndices didn't canonicalize index types!");
+ if (CE && CE->getOpcode() == Instruction::Sub &&
+ CE->getOperand(0)->isNullValue()) {
+ Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType());
+ Res = ConstantExpr::getSub(Res, CE->getOperand(1));
+ Res = ConstantExpr::getIntToPtr(Res, ResTy);
+ if (auto *FoldedRes = ConstantFoldConstant(Res, DL, TLI))
+ Res = FoldedRes;
+ return Res;
+ }
}
+ return nullptr;
}
- return nullptr;
- }
unsigned BitWidth = DL.getTypeSizeInBits(IntPtrTy);
APInt Offset =
@@ -1387,6 +1392,8 @@ bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) {
case Intrinsic::fma:
case Intrinsic::fmuladd:
case Intrinsic::copysign:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
case Intrinsic::round:
case Intrinsic::masked_load:
case Intrinsic::sadd_with_overflow:
@@ -1582,16 +1589,37 @@ double getValueAsDouble(ConstantFP *Op) {
Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
ArrayRef<Constant *> Operands,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ ImmutableCallSite CS) {
if (Operands.size() == 1) {
if (isa<UndefValue>(Operands[0])) {
// cosine(arg) is between -1 and 1. cosine(invalid arg) is NaN
if (IntrinsicID == Intrinsic::cos)
return Constant::getNullValue(Ty);
if (IntrinsicID == Intrinsic::bswap ||
- IntrinsicID == Intrinsic::bitreverse)
+ IntrinsicID == Intrinsic::bitreverse ||
+ IntrinsicID == Intrinsic::launder_invariant_group ||
+ IntrinsicID == Intrinsic::strip_invariant_group)
return Operands[0];
}
+
+ if (isa<ConstantPointerNull>(Operands[0])) {
+ // launder(null) == null == strip(null) iff in addrspace 0
+ if (IntrinsicID == Intrinsic::launder_invariant_group ||
+ IntrinsicID == Intrinsic::strip_invariant_group) {
+ // If the instruction is not yet placed in a basic block (e.g. when cloning
+ // a function during inlining), the CS caller may not be available, so
+ // check CS's BB first before querying CS.getCaller.
+ const Function *Caller = CS.getParent() ? CS.getCaller() : nullptr;
+ if (Caller &&
+ !NullPointerIsDefined(
+ Caller, Operands[0]->getType()->getPointerAddressSpace())) {
+ return Operands[0];
+ }
+ return nullptr;
+ }
+ }
+
if (auto *Op = dyn_cast<ConstantFP>(Operands[0])) {
if (IntrinsicID == Intrinsic::convert_to_fp16) {
APFloat Val(Op->getValueAPF());
@@ -1988,7 +2016,8 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
VectorType *VTy, ArrayRef<Constant *> Operands,
const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ ImmutableCallSite CS) {
SmallVector<Constant *, 4> Result(VTy->getNumElements());
SmallVector<Constant *, 4> Lane(Operands.size());
Type *Ty = VTy->getElementType();
@@ -2051,7 +2080,7 @@ Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
}
// Use the regular scalar folding to simplify this column.
- Constant *Folded = ConstantFoldScalarCall(Name, IntrinsicID, Ty, Lane, TLI);
+ Constant *Folded = ConstantFoldScalarCall(Name, IntrinsicID, Ty, Lane, TLI, CS);
if (!Folded)
return nullptr;
Result[I] = Folded;
@@ -2076,9 +2105,9 @@ llvm::ConstantFoldCall(ImmutableCallSite CS, Function *F,
if (auto *VTy = dyn_cast<VectorType>(Ty))
return ConstantFoldVectorCall(Name, F->getIntrinsicID(), VTy, Operands,
- F->getParent()->getDataLayout(), TLI);
+ F->getParent()->getDataLayout(), TLI, CS);
- return ConstantFoldScalarCall(Name, F->getIntrinsicID(), Ty, Operands, TLI);
+ return ConstantFoldScalarCall(Name, F->getIntrinsicID(), Ty, Operands, TLI, CS);
}
bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
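
The ConstantFolding.cpp change above splits the old ConstantFoldLoadThroughBitcast(ConstantExpr*, ...) helper in two: the new overload takes the loaded Constant directly, folds a bitcast/inttoptr/ptrtoint when the source and destination types have the same size, and otherwise drills into element 0 of the aggregate and retries. A minimal standalone sketch of that descent follows, using a size-only toy type model rather than LLVM's Constant and Type classes; the ToyConst struct and the 32/64/128-bit layout are made up for illustration.

#include <cstdio>
#include <memory>
#include <vector>

struct ToyConst {
  unsigned Bits;                                // size of this constant, in bits
  std::vector<std::shared_ptr<ToyConst>> Elems; // non-empty => aggregate
};

// Walk down through leading elements until something of DestBits is found.
const ToyConst *foldLoadThroughBitcast(const ToyConst *C, unsigned DestBits) {
  while (C) {
    if (C->Bits == DestBits)
      return C;                                 // same size: the "cast" succeeds
    if (C->Elems.empty())
      return nullptr;                           // not an aggregate: cannot drill down
    C = C->Elems.front().get();                 // simulate loading the first element
  }
  return nullptr;
}

int main() {
  // { { i32, i32 }, i64 }: loading an i32 through a bitcast pointer should
  // reach the innermost first element.
  auto I32 = std::make_shared<ToyConst>(ToyConst{32, {}});
  auto I64 = std::make_shared<ToyConst>(ToyConst{64, {}});
  auto Inner = std::make_shared<ToyConst>(ToyConst{64, {I32, I32}});
  ToyConst Outer{128, {Inner, I64}};
  std::printf("found %u-bit piece\n",
              foldLoadThroughBitcast(&Outer, 32) ? 32u : 0u);
}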
diff --git a/contrib/llvm/lib/Analysis/Delinearization.cpp b/contrib/llvm/lib/Analysis/Delinearization.cpp
index dd5af9d43ef8..4cafb7da16d3 100644
--- a/contrib/llvm/lib/Analysis/Delinearization.cpp
+++ b/contrib/llvm/lib/Analysis/Delinearization.cpp
@@ -69,16 +69,6 @@ bool Delinearization::runOnFunction(Function &F) {
return false;
}
-static Value *getPointerOperand(Instruction &Inst) {
- if (LoadInst *Load = dyn_cast<LoadInst>(&Inst))
- return Load->getPointerOperand();
- else if (StoreInst *Store = dyn_cast<StoreInst>(&Inst))
- return Store->getPointerOperand();
- else if (GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(&Inst))
- return Gep->getPointerOperand();
- return nullptr;
-}
-
void Delinearization::print(raw_ostream &O, const Module *) const {
O << "Delinearization on function " << F->getName() << ":\n";
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
@@ -93,7 +83,7 @@ void Delinearization::print(raw_ostream &O, const Module *) const {
// Delinearize the memory access as analyzed in all the surrounding loops.
// Do not analyze memory accesses outside loops.
for (Loop *L = LI->getLoopFor(BB); L != nullptr; L = L->getParentLoop()) {
- const SCEV *AccessFn = SE->getSCEVAtScope(getPointerOperand(*Inst), L);
+ const SCEV *AccessFn = SE->getSCEVAtScope(getPointerOperand(Inst), L);
const SCEVUnknown *BasePointer =
dyn_cast<SCEVUnknown>(SE->getPointerBase(AccessFn));
diff --git a/contrib/llvm/lib/Analysis/DemandedBits.cpp b/contrib/llvm/lib/Analysis/DemandedBits.cpp
index de7d21f9f133..58c5bccff65d 100644
--- a/contrib/llvm/lib/Analysis/DemandedBits.cpp
+++ b/contrib/llvm/lib/Analysis/DemandedBits.cpp
@@ -283,7 +283,7 @@ void DemandedBits::performAnalysis() {
if (!isAlwaysLive(&I))
continue;
- DEBUG(dbgs() << "DemandedBits: Root: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "DemandedBits: Root: " << I << "\n");
// For integer-valued instructions, set up an initial empty set of alive
// bits and add the instruction to the work list. For other instructions
// add their operands to the work list (for integer values operands, mark
@@ -313,13 +313,13 @@ void DemandedBits::performAnalysis() {
while (!Worklist.empty()) {
Instruction *UserI = Worklist.pop_back_val();
- DEBUG(dbgs() << "DemandedBits: Visiting: " << *UserI);
+ LLVM_DEBUG(dbgs() << "DemandedBits: Visiting: " << *UserI);
APInt AOut;
if (UserI->getType()->isIntegerTy()) {
AOut = AliveBits[UserI];
- DEBUG(dbgs() << " Alive Out: " << AOut);
+ LLVM_DEBUG(dbgs() << " Alive Out: " << AOut);
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
if (!UserI->getType()->isIntegerTy())
Visited.insert(UserI);
diff --git a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
index 34eccc07f265..79c2728d5620 100644
--- a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -24,8 +24,7 @@
// Both of these are conservative weaknesses;
// that is, not a source of correctness problems.
//
-// The implementation depends on the GEP instruction to differentiate
-// subscripts. Since Clang linearizes some array subscripts, the dependence
+// Since Clang linearizes some array subscripts, the dependence
// analysis is using SCEV->delinearize to recover the representation of multiple
// subscripts, and thus avoid the more expensive and less precise MIV tests. The
// delinearization is controlled by the flag -da-delinearize.
@@ -59,6 +58,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
@@ -108,8 +108,8 @@ STATISTIC(BanerjeeIndependence, "Banerjee independence");
STATISTIC(BanerjeeSuccesses, "Banerjee successes");
static cl::opt<bool>
-Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Try to delinearize array references."));
+ Delinearize("da-delinearize", cl::init(true), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Try to delinearize array references."));
//===----------------------------------------------------------------------===//
// basics
@@ -415,9 +415,9 @@ LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
// PLDI 1991
bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
++DeltaApplications;
- DEBUG(dbgs() << "\tintersect constraints\n");
- DEBUG(dbgs() << "\t X ="; X->dump(dbgs()));
- DEBUG(dbgs() << "\t Y ="; Y->dump(dbgs()));
+ LLVM_DEBUG(dbgs() << "\tintersect constraints\n");
+ LLVM_DEBUG(dbgs() << "\t X ="; X->dump(dbgs()));
+ LLVM_DEBUG(dbgs() << "\t Y ="; Y->dump(dbgs()));
assert(!Y->isPoint() && "Y must not be a Point");
if (X->isAny()) {
if (Y->isAny())
@@ -433,7 +433,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
}
if (X->isDistance() && Y->isDistance()) {
- DEBUG(dbgs() << "\t intersect 2 distances\n");
+ LLVM_DEBUG(dbgs() << "\t intersect 2 distances\n");
if (isKnownPredicate(CmpInst::ICMP_EQ, X->getD(), Y->getD()))
return false;
if (isKnownPredicate(CmpInst::ICMP_NE, X->getD(), Y->getD())) {
@@ -460,12 +460,12 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
"We shouldn't ever see X->isPoint() && Y->isPoint()");
if (X->isLine() && Y->isLine()) {
- DEBUG(dbgs() << "\t intersect 2 lines\n");
+ LLVM_DEBUG(dbgs() << "\t intersect 2 lines\n");
const SCEV *Prod1 = SE->getMulExpr(X->getA(), Y->getB());
const SCEV *Prod2 = SE->getMulExpr(X->getB(), Y->getA());
if (isKnownPredicate(CmpInst::ICMP_EQ, Prod1, Prod2)) {
// slopes are equal, so lines are parallel
- DEBUG(dbgs() << "\t\tsame slope\n");
+ LLVM_DEBUG(dbgs() << "\t\tsame slope\n");
Prod1 = SE->getMulExpr(X->getC(), Y->getB());
Prod2 = SE->getMulExpr(X->getB(), Y->getC());
if (isKnownPredicate(CmpInst::ICMP_EQ, Prod1, Prod2))
@@ -479,7 +479,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
}
if (isKnownPredicate(CmpInst::ICMP_NE, Prod1, Prod2)) {
// slopes differ, so lines intersect
- DEBUG(dbgs() << "\t\tdifferent slopes\n");
+ LLVM_DEBUG(dbgs() << "\t\tdifferent slopes\n");
const SCEV *C1B2 = SE->getMulExpr(X->getC(), Y->getB());
const SCEV *C1A2 = SE->getMulExpr(X->getC(), Y->getA());
const SCEV *C2B1 = SE->getMulExpr(Y->getC(), X->getB());
@@ -501,10 +501,10 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
APInt Xbot = A1B2_A2B1->getAPInt();
APInt Ytop = C1A2_C2A1->getAPInt();
APInt Ybot = A2B1_A1B2->getAPInt();
- DEBUG(dbgs() << "\t\tXtop = " << Xtop << "\n");
- DEBUG(dbgs() << "\t\tXbot = " << Xbot << "\n");
- DEBUG(dbgs() << "\t\tYtop = " << Ytop << "\n");
- DEBUG(dbgs() << "\t\tYbot = " << Ybot << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tXtop = " << Xtop << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tXbot = " << Xbot << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tYtop = " << Ytop << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tYbot = " << Ybot << "\n");
APInt Xq = Xtop; // these need to be initialized, even
APInt Xr = Xtop; // though they're just going to be overwritten
APInt::sdivrem(Xtop, Xbot, Xq, Xr);
@@ -516,7 +516,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
++DeltaSuccesses;
return true;
}
- DEBUG(dbgs() << "\t\tX = " << Xq << ", Y = " << Yq << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tX = " << Xq << ", Y = " << Yq << "\n");
if (Xq.slt(0) || Yq.slt(0)) {
X->setEmpty();
++DeltaSuccesses;
@@ -525,7 +525,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
if (const SCEVConstant *CUB =
collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) {
const APInt &UpperBound = CUB->getAPInt();
- DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) {
X->setEmpty();
++DeltaSuccesses;
@@ -545,7 +545,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
assert(!(X->isLine() && Y->isPoint()) && "This case should never occur");
if (X->isPoint() && Y->isLine()) {
- DEBUG(dbgs() << "\t intersect Point and Line\n");
+ LLVM_DEBUG(dbgs() << "\t intersect Point and Line\n");
const SCEV *A1X1 = SE->getMulExpr(Y->getA(), X->getX());
const SCEV *B1Y1 = SE->getMulExpr(Y->getB(), X->getY());
const SCEV *Sum = SE->getAddExpr(A1X1, B1Y1);
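
The intersectConstraints hunks above handle the "different slopes" case by intersecting the two constraint lines a1*x + b1*y = c1 and a2*x + b2*y = c2 with Cramer's rule, then requiring the intersection to be integral, non-negative, and no larger than the loop's constant upper bound before the constraint can stay satisfiable. The following is a minimal standalone sketch of that arithmetic with plain signed integers; the real code works on SCEVs, uses APInt::sdivrem, and handles the equal-slope (parallel) case separately.

#include <cstdio>
#include <optional>
#include <utility>

std::optional<std::pair<long, long>>
intersect(long A1, long B1, long C1, long A2, long B2, long C2, long UpperBound) {
  long Det = A1 * B2 - A2 * B1;        // equal slopes: handled separately in the pass
  if (Det == 0)
    return std::nullopt;
  long Xtop = C1 * B2 - C2 * B1;       // numerators from Cramer's rule
  long Ytop = A1 * C2 - A2 * C1;
  if (Xtop % Det != 0 || Ytop % Det != 0)
    return std::nullopt;               // no integer intersection: constraints unsatisfiable
  long X = Xtop / Det, Y = Ytop / Det;
  if (X < 0 || Y < 0 || X > UpperBound || Y > UpperBound)
    return std::nullopt;               // intersection lies outside the iteration space
  return std::make_pair(X, Y);
}

int main() {
  // x + y = 10 and x - y = 2 intersect at (6, 4), inside a loop of 0..15.
  if (auto P = intersect(1, 1, 10, 1, -1, 2, 15))
    std::printf("dependence possible at (%ld, %ld)\n", P->first, P->second);
  else
    std::printf("no in-bounds integer intersection\n");
}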
@@ -622,13 +622,38 @@ void Dependence::dump(raw_ostream &OS) const {
OS << "!\n";
}
+// Returns NoAlias/MayAlias/MustAlias for two memory locations based upon their
+// underlying objects. If LocA and LocB are known not to alias (for any reason:
+// tbaa, non-overlapping regions etc), then it is known there is no dependency.
+// Otherwise the underlying objects are checked to see if they point to
+// different identifiable objects.
static AliasResult underlyingObjectsAlias(AliasAnalysis *AA,
- const DataLayout &DL, const Value *A,
- const Value *B) {
- const Value *AObj = GetUnderlyingObject(A, DL);
- const Value *BObj = GetUnderlyingObject(B, DL);
- return AA->alias(AObj, DL.getTypeStoreSize(AObj->getType()),
- BObj, DL.getTypeStoreSize(BObj->getType()));
+ const DataLayout &DL,
+ const MemoryLocation &LocA,
+ const MemoryLocation &LocB) {
+ // Check the original locations (minus size) for noalias, which can happen for
+ // tbaa, incompatible underlying object locations, etc.
+ MemoryLocation LocAS(LocA.Ptr, MemoryLocation::UnknownSize, LocA.AATags);
+ MemoryLocation LocBS(LocB.Ptr, MemoryLocation::UnknownSize, LocB.AATags);
+ if (AA->alias(LocAS, LocBS) == NoAlias)
+ return NoAlias;
+
+ // Check the underlying objects are the same
+ const Value *AObj = GetUnderlyingObject(LocA.Ptr, DL);
+ const Value *BObj = GetUnderlyingObject(LocB.Ptr, DL);
+
+ // If the underlying objects are the same, they must alias
+ if (AObj == BObj)
+ return MustAlias;
+
+ // We may have hit the recursion limit for underlying objects, or have
+ // underlying objects where we don't know they will alias.
+ if (!isIdentifiedObject(AObj) || !isIdentifiedObject(BObj))
+ return MayAlias;
+
+ // Otherwise we know the objects are distinct identified objects, so they
+ // cannot alias.
+ return NoAlias;
}
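
The new underlyingObjectsAlias above answers in three steps: let alias analysis try to prove no-alias on the size-less locations first (this catches TBAA and obviously disjoint regions), then report MustAlias when both pointers strip down to the same underlying object, and report NoAlias only when both strip down to distinct identified objects; anything else stays MayAlias. A minimal standalone sketch of that decision follows, with a toy ToyLoc type and an AAProvedNoAlias flag standing in for the AA->alias(LocAS, LocBS) query.

#include <cstdio>

enum AliasResult { NoAlias, MayAlias, MustAlias };

struct ToyLoc {
  const void *UnderlyingObj; // result of stripping casts/GEPs in the real pass
  bool Identified;           // e.g. an alloca, global, or noalias call result
};

AliasResult underlyingObjectsAlias(bool AAProvedNoAlias, const ToyLoc &A,
                                   const ToyLoc &B) {
  if (AAProvedNoAlias)                      // stand-in for AA->alias(LocAS, LocBS)
    return NoAlias;
  if (A.UnderlyingObj == B.UnderlyingObj)   // same base object => must alias
    return MustAlias;
  if (!A.Identified || !B.Identified)       // recursion limit / unknown base
    return MayAlias;
  return NoAlias;                           // two distinct identified objects
}

int main() {
  int X, Y;
  ToyLoc A{&X, true}, B{&Y, true}, C{&X, true};
  std::printf("A vs B: %d (NoAlias=0)\n", underlyingObjectsAlias(false, A, B));
  std::printf("A vs C: %d (MustAlias=2)\n", underlyingObjectsAlias(false, A, C));
}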
@@ -644,17 +669,6 @@ bool isLoadOrStore(const Instruction *I) {
}
-static
-Value *getPointerOperand(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- llvm_unreachable("Value is not load or store instruction");
- return nullptr;
-}
-
-
// Examines the loop nesting of the Src and Dst
// instructions and establishes their shared loops. Sets the variables
// CommonLevels, SrcLevels, and MaxLevels.
@@ -980,6 +994,57 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
}
}
+/// Compare to see if S is less than Size, using isKnownNegative(S - max(Size, 1))
+/// with some extra checking if S is an AddRec and we can prove less-than using
+/// the loop bounds.
+bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const {
+ // First unify to the same type
+ auto *SType = dyn_cast<IntegerType>(S->getType());
+ auto *SizeType = dyn_cast<IntegerType>(Size->getType());
+ if (!SType || !SizeType)
+ return false;
+ Type *MaxType =
+ (SType->getBitWidth() >= SizeType->getBitWidth()) ? SType : SizeType;
+ S = SE->getTruncateOrZeroExtend(S, MaxType);
+ Size = SE->getTruncateOrZeroExtend(Size, MaxType);
+
+ // Special check for addrecs using BE taken count
+ const SCEV *Bound = SE->getMinusSCEV(S, Size);
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Bound)) {
+ if (AddRec->isAffine()) {
+ const SCEV *BECount = SE->getBackedgeTakenCount(AddRec->getLoop());
+ if (!isa<SCEVCouldNotCompute>(BECount)) {
+ const SCEV *Limit = AddRec->evaluateAtIteration(BECount, *SE);
+ if (SE->isKnownNegative(Limit))
+ return true;
+ }
+ }
+ }
+
+ // Check using normal isKnownNegative
+ const SCEV *LimitedBound =
+ SE->getMinusSCEV(S, SE->getSMaxExpr(Size, SE->getOne(Size->getType())));
+ return SE->isKnownNegative(LimitedBound);
+}
+
+bool DependenceInfo::isKnownNonNegative(const SCEV *S, const Value *Ptr) const {
+ bool Inbounds = false;
+ if (auto *SrcGEP = dyn_cast<GetElementPtrInst>(Ptr))
+ Inbounds = SrcGEP->isInBounds();
+ if (Inbounds) {
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (AddRec->isAffine()) {
+ // We know S is for Ptr, the operand of a load/store, so it doesn't wrap.
+ // If both parts are NonNegative, the end result will be NonNegative
+ if (SE->isKnownNonNegative(AddRec->getStart()) &&
+ SE->isKnownNonNegative(AddRec->getOperand(1)))
+ return true;
+ }
+ }
+ }
+
+ return SE->isKnownNonNegative(S);
+}
// All subscripts are the same type.
// Loop bound may be smaller (e.g., a char).
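
The new DependenceInfo::isKnownLessThan above tries to prove S < Size in two ways: if S - Size is an affine add-recurrence and the backedge-taken count is computable, it evaluates the recurrence at that count and asks SCEV whether the result is known negative; otherwise it falls back to isKnownNegative(S - smax(Size, 1)). A minimal standalone sketch of the add-recurrence idea follows, with plain 64-bit integers standing in for SCEVs and with both endpoints checked explicitly, which is a simplification of what the pass actually queries.

#include <cstdint>
#include <cstdio>

// Value of the affine add-rec {Start,+,Step} at iteration It.
int64_t evaluateAtIteration(int64_t Start, int64_t Step, int64_t It) {
  return Start + Step * It;
}

// Does {Start,+,Step} stay strictly below Size for iterations 0..BECount?
bool isKnownLessThan(int64_t Start, int64_t Step, int64_t BECount, int64_t Size) {
  // Bound = S - Size is itself affine: {Start - Size,+,Step}.  An affine
  // function over 0..BECount attains its maximum at an endpoint, so checking
  // both endpoints is enough for this sketch; the real code evaluates the
  // recurrence at the backedge-taken count with SCEV and also falls back to
  // isKnownNegative(S - smax(Size, 1)).
  int64_t First = Start - Size;
  int64_t Limit = evaluateAtIteration(Start - Size, Step, BECount);
  return First < 0 && Limit < 0;
}

int main() {
  // i = {0,+,4} over 7 backedges (i.e. i in {0,4,...,28}) against Size = 32.
  std::printf("%s\n", isKnownLessThan(0, 4, 7, 32) ? "always < Size" : "unknown");
}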
@@ -1019,19 +1084,19 @@ const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L,
// Return true if dependence disproved.
bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst,
FullDependence &Result) const {
- DEBUG(dbgs() << " src = " << *Src << "\n");
- DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << " src = " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n");
++ZIVapplications;
if (isKnownPredicate(CmpInst::ICMP_EQ, Src, Dst)) {
- DEBUG(dbgs() << " provably dependent\n");
+ LLVM_DEBUG(dbgs() << " provably dependent\n");
return false; // provably dependent
}
if (isKnownPredicate(CmpInst::ICMP_NE, Src, Dst)) {
- DEBUG(dbgs() << " provably independent\n");
+ LLVM_DEBUG(dbgs() << " provably independent\n");
++ZIVindependence;
return true; // provably independent
}
- DEBUG(dbgs() << " possibly dependent\n");
+ LLVM_DEBUG(dbgs() << " possibly dependent\n");
Result.Consistent = false;
return false; // possibly dependent
}
@@ -1068,25 +1133,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
const SCEV *DstConst, const Loop *CurLoop,
unsigned Level, FullDependence &Result,
Constraint &NewConstraint) const {
- DEBUG(dbgs() << "\tStrong SIV test\n");
- DEBUG(dbgs() << "\t Coeff = " << *Coeff);
- DEBUG(dbgs() << ", " << *Coeff->getType() << "\n");
- DEBUG(dbgs() << "\t SrcConst = " << *SrcConst);
- DEBUG(dbgs() << ", " << *SrcConst->getType() << "\n");
- DEBUG(dbgs() << "\t DstConst = " << *DstConst);
- DEBUG(dbgs() << ", " << *DstConst->getType() << "\n");
+ LLVM_DEBUG(dbgs() << "\tStrong SIV test\n");
+ LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff);
+ LLVM_DEBUG(dbgs() << ", " << *Coeff->getType() << "\n");
+ LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst);
+ LLVM_DEBUG(dbgs() << ", " << *SrcConst->getType() << "\n");
+ LLVM_DEBUG(dbgs() << "\t DstConst = " << *DstConst);
+ LLVM_DEBUG(dbgs() << ", " << *DstConst->getType() << "\n");
++StrongSIVapplications;
assert(0 < Level && Level <= CommonLevels && "level out of range");
Level--;
const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
- DEBUG(dbgs() << "\t Delta = " << *Delta);
- DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
+ LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta);
+ LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
// check that |Delta| < iteration count
if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
- DEBUG(dbgs() << "\t UpperBound = " << *UpperBound);
- DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
+ LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound);
+ LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
const SCEV *AbsDelta =
SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
const SCEV *AbsCoeff =
@@ -1107,8 +1172,8 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
APInt Distance = ConstDelta; // these need to be initialized
APInt Remainder = ConstDelta;
APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder);
- DEBUG(dbgs() << "\t Distance = " << Distance << "\n");
- DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
+ LLVM_DEBUG(dbgs() << "\t Distance = " << Distance << "\n");
+ LLVM_DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
// Make sure Coeff divides Delta exactly
if (Remainder != 0) {
// Coeff doesn't divide Distance, no dependence
@@ -1135,7 +1200,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
}
else {
if (Coeff->isOne()) {
- DEBUG(dbgs() << "\t Distance = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << "\t Distance = " << *Delta << "\n");
Result.DV[Level].Distance = Delta; // since X/1 == X
NewConstraint.setDistance(Delta, CurLoop);
}
@@ -1204,16 +1269,16 @@ bool DependenceInfo::weakCrossingSIVtest(
const SCEV *Coeff, const SCEV *SrcConst, const SCEV *DstConst,
const Loop *CurLoop, unsigned Level, FullDependence &Result,
Constraint &NewConstraint, const SCEV *&SplitIter) const {
- DEBUG(dbgs() << "\tWeak-Crossing SIV test\n");
- DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n");
- DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
- DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ LLVM_DEBUG(dbgs() << "\tWeak-Crossing SIV test\n");
+ LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n");
+ LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ LLVM_DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
++WeakCrossingSIVapplications;
assert(0 < Level && Level <= CommonLevels && "Level out of range");
Level--;
Result.Consistent = false;
const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
- DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
NewConstraint.setLine(Coeff, Coeff, Delta, CurLoop);
if (Delta->isZero()) {
Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::LT);
@@ -1243,7 +1308,7 @@ bool DependenceInfo::weakCrossingSIVtest(
SplitIter = SE->getUDivExpr(
SE->getSMaxExpr(SE->getZero(Delta->getType()), Delta),
SE->getMulExpr(SE->getConstant(Delta->getType(), 2), ConstCoeff));
- DEBUG(dbgs() << "\t Split iter = " << *SplitIter << "\n");
+ LLVM_DEBUG(dbgs() << "\t Split iter = " << *SplitIter << "\n");
const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
if (!ConstDelta)
@@ -1251,8 +1316,8 @@ bool DependenceInfo::weakCrossingSIVtest(
// We're certain that ConstCoeff > 0; therefore,
// if Delta < 0, then no dependence.
- DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
- DEBUG(dbgs() << "\t ConstCoeff = " << *ConstCoeff << "\n");
+ LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << "\t ConstCoeff = " << *ConstCoeff << "\n");
if (SE->isKnownNegative(Delta)) {
// No dependence, Delta < 0
++WeakCrossingSIVindependence;
@@ -1263,11 +1328,11 @@ bool DependenceInfo::weakCrossingSIVtest(
// We're certain that Delta > 0 and ConstCoeff > 0.
// Check Delta/(2*ConstCoeff) against upper loop bound
if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
- DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
+ LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2);
const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound),
ConstantTwo);
- DEBUG(dbgs() << "\t ML = " << *ML << "\n");
+ LLVM_DEBUG(dbgs() << "\t ML = " << *ML << "\n");
if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) {
// Delta too big, no dependence
++WeakCrossingSIVindependence;
@@ -1295,19 +1360,19 @@ bool DependenceInfo::weakCrossingSIVtest(
APInt Distance = APDelta; // these need to be initialized
APInt Remainder = APDelta;
APInt::sdivrem(APDelta, APCoeff, Distance, Remainder);
- DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
+ LLVM_DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
if (Remainder != 0) {
// Coeff doesn't divide Delta, no dependence
++WeakCrossingSIVindependence;
++WeakCrossingSIVsuccesses;
return true;
}
- DEBUG(dbgs() << "\t Distance = " << Distance << "\n");
+ LLVM_DEBUG(dbgs() << "\t Distance = " << Distance << "\n");
// if 2*Coeff doesn't divide Delta, then the equal direction isn't possible
APInt Two = APInt(Distance.getBitWidth(), 2, true);
Remainder = Distance.srem(Two);
- DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
+ LLVM_DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
if (Remainder != 0) {
// Equal direction isn't possible
Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::EQ);
@@ -1343,7 +1408,7 @@ static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM,
APInt::sdivrem(G0, G1, Q, R);
}
G = G1;
- DEBUG(dbgs() << "\t GCD = " << G << "\n");
+ LLVM_DEBUG(dbgs() << "\t GCD = " << G << "\n");
X = AM.slt(0) ? -A1 : A1;
Y = BM.slt(0) ? B1 : -B1;
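
The findGCD helper whose debug output is touched above runs the extended Euclidean algorithm on the two coefficients so the exact SIV/RDIV tests can decide, roughly, a diophantine equation of the form a*x + b*y = Delta: if gcd(a, b) does not divide Delta there is no integer solution and hence no dependence, otherwise the Bezout pair is scaled to one particular solution. A minimal standalone sketch with plain integers follows; findGCD itself works on APInts and additionally flips signs to match the dependence equation's coefficient conventions.

#include <cstdio>
#include <tuple>

// Returns {g, x, y} with a*x + b*y = g = gcd(a, b).
std::tuple<long, long, long> extendedGCD(long a, long b) {
  if (b == 0)
    return {a, 1, 0};
  auto [g, x1, y1] = extendedGCD(b, a % b);
  return {g, y1, x1 - (a / b) * y1};
}

int main() {
  long A = 6, B = 9, Delta = 21;               // solve 6*x + 9*y = 21
  auto [G, X, Y] = extendedGCD(A, B);
  if (Delta % G != 0) {                        // gcd test: no solution => no dependence
    std::printf("no dependence\n");
    return 0;
  }
  long Q = Delta / G;                          // scale the Bezout pair to one solution
  std::printf("gcd = %ld, one solution: x = %ld, y = %ld\n", G, X * Q, Y * Q);
}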
@@ -1416,17 +1481,17 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
const Loop *CurLoop, unsigned Level,
FullDependence &Result,
Constraint &NewConstraint) const {
- DEBUG(dbgs() << "\tExact SIV test\n");
- DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
- DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
- DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
- DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ LLVM_DEBUG(dbgs() << "\tExact SIV test\n");
+ LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
+ LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
+ LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ LLVM_DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
++ExactSIVapplications;
assert(0 < Level && Level <= CommonLevels && "Level out of range");
Level--;
Result.Consistent = false;
const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
- DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
NewConstraint.setLine(SrcCoeff, SE->getNegativeSCEV(DstCoeff),
Delta, CurLoop);
const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
@@ -1447,7 +1512,7 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
return true;
}
- DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
+ LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
// since SCEV construction normalizes, LM = 0
APInt UM(Bits, 1, true);
@@ -1456,7 +1521,7 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
if (const SCEVConstant *CUB =
collectConstantUpperBound(CurLoop, Delta->getType())) {
UM = CUB->getAPInt();
- DEBUG(dbgs() << "\t UM = " << UM << "\n");
+ LLVM_DEBUG(dbgs() << "\t UM = " << UM << "\n");
UMvalid = true;
}
@@ -1467,18 +1532,18 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
APInt TMUL = BM.sdiv(G);
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(-X, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
if (UMvalid) {
TU = minAPInt(TU, floorOfQuotient(UM - X, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
}
}
else {
TU = minAPInt(TU, floorOfQuotient(-X, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
if (UMvalid) {
TL = maxAPInt(TL, ceilingOfQuotient(UM - X, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
}
}
@@ -1486,18 +1551,18 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
TMUL = AM.sdiv(G);
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(-Y, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
if (UMvalid) {
TU = minAPInt(TU, floorOfQuotient(UM - Y, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
}
}
else {
TU = minAPInt(TU, floorOfQuotient(-Y, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
if (UMvalid) {
TL = maxAPInt(TL, ceilingOfQuotient(UM - Y, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
}
}
if (TL.sgt(TU)) {
@@ -1512,15 +1577,15 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
// less than
APInt SaveTU(TU); // save these
APInt SaveTL(TL);
- DEBUG(dbgs() << "\t exploring LT direction\n");
+ LLVM_DEBUG(dbgs() << "\t exploring LT direction\n");
TMUL = AM - BM;
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(X - Y + 1, TMUL));
- DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
}
else {
TU = minAPInt(TU, floorOfQuotient(X - Y + 1, TMUL));
- DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
}
if (TL.sle(TU)) {
NewDirection |= Dependence::DVEntry::LT;
@@ -1530,23 +1595,23 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
// equal
TU = SaveTU; // restore
TL = SaveTL;
- DEBUG(dbgs() << "\t exploring EQ direction\n");
+ LLVM_DEBUG(dbgs() << "\t exploring EQ direction\n");
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(X - Y, TMUL));
- DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
}
else {
TU = minAPInt(TU, floorOfQuotient(X - Y, TMUL));
- DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
}
TMUL = BM - AM;
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(Y - X, TMUL));
- DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
}
else {
TU = minAPInt(TU, floorOfQuotient(Y - X, TMUL));
- DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
}
if (TL.sle(TU)) {
NewDirection |= Dependence::DVEntry::EQ;
@@ -1556,14 +1621,14 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
// greater than
TU = SaveTU; // restore
TL = SaveTL;
- DEBUG(dbgs() << "\t exploring GT direction\n");
+ LLVM_DEBUG(dbgs() << "\t exploring GT direction\n");
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(Y - X + 1, TMUL));
- DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
}
else {
TU = minAPInt(TU, floorOfQuotient(Y - X + 1, TMUL));
- DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
}
if (TL.sle(TU)) {
NewDirection |= Dependence::DVEntry::GT;
@@ -1607,9 +1672,9 @@ bool isRemainderZero(const SCEVConstant *Dividend,
//
// If i is not an integer, there's no dependence.
// If i < 0 or > UB, there's no dependence.
-// If i = 0, the direction is <= and peeling the
+// If i = 0, the direction is >= and peeling the
// 1st iteration will break the dependence.
-// If i = UB, the direction is >= and peeling the
+// If i = UB, the direction is <= and peeling the
// last iteration will break the dependence.
// Otherwise, the direction is *.
//
@@ -1629,10 +1694,10 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
// For the WeakSIV test, it's possible the loop isn't common to
// the Src and Dst loops. If it isn't, then there's no need to
// record a direction.
- DEBUG(dbgs() << "\tWeak-Zero (src) SIV test\n");
- DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << "\n");
- DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
- DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ LLVM_DEBUG(dbgs() << "\tWeak-Zero (src) SIV test\n");
+ LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << "\n");
+ LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ LLVM_DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
++WeakZeroSIVapplications;
assert(0 < Level && Level <= MaxLevels && "Level out of range");
Level--;
@@ -1640,10 +1705,10 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
NewConstraint.setLine(SE->getZero(Delta->getType()), DstCoeff, Delta,
CurLoop);
- DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
if (isKnownPredicate(CmpInst::ICMP_EQ, SrcConst, DstConst)) {
if (Level < CommonLevels) {
- Result.DV[Level].Direction &= Dependence::DVEntry::LE;
+ Result.DV[Level].Direction &= Dependence::DVEntry::GE;
Result.DV[Level].PeelFirst = true;
++WeakZeroSIVsuccesses;
}
@@ -1661,7 +1726,7 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
// check that Delta/SrcCoeff < iteration count
// really check NewDelta < count*AbsCoeff
if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
- DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
+ LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
++WeakZeroSIVindependence;
@@ -1671,7 +1736,7 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
if (isKnownPredicate(CmpInst::ICMP_EQ, NewDelta, Product)) {
// dependences caused by last iteration
if (Level < CommonLevels) {
- Result.DV[Level].Direction &= Dependence::DVEntry::GE;
+ Result.DV[Level].Direction &= Dependence::DVEntry::LE;
Result.DV[Level].PeelLast = true;
++WeakZeroSIVsuccesses;
}
@@ -1738,10 +1803,10 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
Constraint &NewConstraint) const {
// For the WeakSIV test, it's possible the loop isn't common to the
// Src and Dst loops. If it isn't, then there's no need to record a direction.
- DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n");
- DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << "\n");
- DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
- DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ LLVM_DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n");
+ LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << "\n");
+ LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ LLVM_DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
++WeakZeroSIVapplications;
assert(0 < Level && Level <= SrcLevels && "Level out of range");
Level--;
@@ -1749,7 +1814,7 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
NewConstraint.setLine(SrcCoeff, SE->getZero(Delta->getType()), Delta,
CurLoop);
- DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
if (isKnownPredicate(CmpInst::ICMP_EQ, DstConst, SrcConst)) {
if (Level < CommonLevels) {
Result.DV[Level].Direction &= Dependence::DVEntry::LE;
@@ -1770,7 +1835,7 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
// check that Delta/SrcCoeff < iteration count
// really check NewDelta < count*AbsCoeff
if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
- DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
+ LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
++WeakZeroSIVindependence;
@@ -1819,15 +1884,15 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
const SCEV *SrcConst, const SCEV *DstConst,
const Loop *SrcLoop, const Loop *DstLoop,
FullDependence &Result) const {
- DEBUG(dbgs() << "\tExact RDIV test\n");
- DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
- DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
- DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
- DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ LLVM_DEBUG(dbgs() << "\tExact RDIV test\n");
+ LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
+ LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
+ LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ LLVM_DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
++ExactRDIVapplications;
Result.Consistent = false;
const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
- DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
const SCEVConstant *ConstSrcCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
const SCEVConstant *ConstDstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
@@ -1845,7 +1910,7 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
return true;
}
- DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
+ LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
// since SCEV construction seems to normalize, LM = 0
APInt SrcUM(Bits, 1, true);
@@ -1854,7 +1919,7 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
if (const SCEVConstant *UpperBound =
collectConstantUpperBound(SrcLoop, Delta->getType())) {
SrcUM = UpperBound->getAPInt();
- DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n");
+ LLVM_DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n");
SrcUMvalid = true;
}
@@ -1864,7 +1929,7 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
if (const SCEVConstant *UpperBound =
collectConstantUpperBound(DstLoop, Delta->getType())) {
DstUM = UpperBound->getAPInt();
- DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n");
+ LLVM_DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n");
DstUMvalid = true;
}
@@ -1875,18 +1940,18 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
APInt TMUL = BM.sdiv(G);
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(-X, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
if (SrcUMvalid) {
TU = minAPInt(TU, floorOfQuotient(SrcUM - X, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
}
}
else {
TU = minAPInt(TU, floorOfQuotient(-X, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
if (SrcUMvalid) {
TL = maxAPInt(TL, ceilingOfQuotient(SrcUM - X, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
}
}
@@ -1894,18 +1959,18 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
TMUL = AM.sdiv(G);
if (TMUL.sgt(0)) {
TL = maxAPInt(TL, ceilingOfQuotient(-Y, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
if (DstUMvalid) {
TU = minAPInt(TU, floorOfQuotient(DstUM - Y, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
}
}
else {
TU = minAPInt(TU, floorOfQuotient(-Y, TMUL));
- DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ LLVM_DEBUG(dbgs() << "\t TU = " << TU << "\n");
if (DstUMvalid) {
TL = maxAPInt(TL, ceilingOfQuotient(DstUM - Y, TMUL));
- DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
}
}
if (TL.sgt(TU))
@@ -1961,27 +2026,27 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
const Loop *Loop1,
const Loop *Loop2) const {
++SymbolicRDIVapplications;
- DEBUG(dbgs() << "\ttry symbolic RDIV test\n");
- DEBUG(dbgs() << "\t A1 = " << *A1);
- DEBUG(dbgs() << ", type = " << *A1->getType() << "\n");
- DEBUG(dbgs() << "\t A2 = " << *A2 << "\n");
- DEBUG(dbgs() << "\t C1 = " << *C1 << "\n");
- DEBUG(dbgs() << "\t C2 = " << *C2 << "\n");
+ LLVM_DEBUG(dbgs() << "\ttry symbolic RDIV test\n");
+ LLVM_DEBUG(dbgs() << "\t A1 = " << *A1);
+ LLVM_DEBUG(dbgs() << ", type = " << *A1->getType() << "\n");
+ LLVM_DEBUG(dbgs() << "\t A2 = " << *A2 << "\n");
+ LLVM_DEBUG(dbgs() << "\t C1 = " << *C1 << "\n");
+ LLVM_DEBUG(dbgs() << "\t C2 = " << *C2 << "\n");
const SCEV *N1 = collectUpperBound(Loop1, A1->getType());
const SCEV *N2 = collectUpperBound(Loop2, A1->getType());
- DEBUG(if (N1) dbgs() << "\t N1 = " << *N1 << "\n");
- DEBUG(if (N2) dbgs() << "\t N2 = " << *N2 << "\n");
+ LLVM_DEBUG(if (N1) dbgs() << "\t N1 = " << *N1 << "\n");
+ LLVM_DEBUG(if (N2) dbgs() << "\t N2 = " << *N2 << "\n");
const SCEV *C2_C1 = SE->getMinusSCEV(C2, C1);
const SCEV *C1_C2 = SE->getMinusSCEV(C1, C2);
- DEBUG(dbgs() << "\t C2 - C1 = " << *C2_C1 << "\n");
- DEBUG(dbgs() << "\t C1 - C2 = " << *C1_C2 << "\n");
+ LLVM_DEBUG(dbgs() << "\t C2 - C1 = " << *C2_C1 << "\n");
+ LLVM_DEBUG(dbgs() << "\t C1 - C2 = " << *C1_C2 << "\n");
if (SE->isKnownNonNegative(A1)) {
if (SE->isKnownNonNegative(A2)) {
// A1 >= 0 && A2 >= 0
if (N1) {
// make sure that c2 - c1 <= a1*N1
const SCEV *A1N1 = SE->getMulExpr(A1, N1);
- DEBUG(dbgs() << "\t A1*N1 = " << *A1N1 << "\n");
+ LLVM_DEBUG(dbgs() << "\t A1*N1 = " << *A1N1 << "\n");
if (isKnownPredicate(CmpInst::ICMP_SGT, C2_C1, A1N1)) {
++SymbolicRDIVindependence;
return true;
@@ -1990,7 +2055,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
if (N2) {
// make sure that -a2*N2 <= c2 - c1, or a2*N2 >= c1 - c2
const SCEV *A2N2 = SE->getMulExpr(A2, N2);
- DEBUG(dbgs() << "\t A2*N2 = " << *A2N2 << "\n");
+ LLVM_DEBUG(dbgs() << "\t A2*N2 = " << *A2N2 << "\n");
if (isKnownPredicate(CmpInst::ICMP_SLT, A2N2, C1_C2)) {
++SymbolicRDIVindependence;
return true;
@@ -2004,7 +2069,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
const SCEV *A1N1 = SE->getMulExpr(A1, N1);
const SCEV *A2N2 = SE->getMulExpr(A2, N2);
const SCEV *A1N1_A2N2 = SE->getMinusSCEV(A1N1, A2N2);
- DEBUG(dbgs() << "\t A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
+ LLVM_DEBUG(dbgs() << "\t A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
if (isKnownPredicate(CmpInst::ICMP_SGT, C2_C1, A1N1_A2N2)) {
++SymbolicRDIVindependence;
return true;
@@ -2025,7 +2090,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
const SCEV *A1N1 = SE->getMulExpr(A1, N1);
const SCEV *A2N2 = SE->getMulExpr(A2, N2);
const SCEV *A1N1_A2N2 = SE->getMinusSCEV(A1N1, A2N2);
- DEBUG(dbgs() << "\t A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
+ LLVM_DEBUG(dbgs() << "\t A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
if (isKnownPredicate(CmpInst::ICMP_SGT, A1N1_A2N2, C2_C1)) {
++SymbolicRDIVindependence;
return true;
@@ -2042,7 +2107,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
if (N1) {
// make sure that a1*N1 <= c2 - c1
const SCEV *A1N1 = SE->getMulExpr(A1, N1);
- DEBUG(dbgs() << "\t A1*N1 = " << *A1N1 << "\n");
+ LLVM_DEBUG(dbgs() << "\t A1*N1 = " << *A1N1 << "\n");
if (isKnownPredicate(CmpInst::ICMP_SGT, A1N1, C2_C1)) {
++SymbolicRDIVindependence;
return true;
@@ -2051,7 +2116,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
if (N2) {
// make sure that c2 - c1 <= -a2*N2, or c1 - c2 >= a2*N2
const SCEV *A2N2 = SE->getMulExpr(A2, N2);
- DEBUG(dbgs() << "\t A2*N2 = " << *A2N2 << "\n");
+ LLVM_DEBUG(dbgs() << "\t A2*N2 = " << *A2N2 << "\n");
if (isKnownPredicate(CmpInst::ICMP_SLT, C1_C2, A2N2)) {
++SymbolicRDIVindependence;
return true;
@@ -2074,8 +2139,8 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
FullDependence &Result, Constraint &NewConstraint,
const SCEV *&SplitIter) const {
- DEBUG(dbgs() << " src = " << *Src << "\n");
- DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << " src = " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n");
const SCEVAddRecExpr *SrcAddRec = dyn_cast<SCEVAddRecExpr>(Src);
const SCEVAddRecExpr *DstAddRec = dyn_cast<SCEVAddRecExpr>(Dst);
if (SrcAddRec && DstAddRec) {
@@ -2151,8 +2216,8 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst,
const SCEV *SrcCoeff, *DstCoeff;
const Loop *SrcLoop, *DstLoop;
- DEBUG(dbgs() << " src = " << *Src << "\n");
- DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << " src = " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n");
const SCEVAddRecExpr *SrcAddRec = dyn_cast<SCEVAddRecExpr>(Src);
const SCEVAddRecExpr *DstAddRec = dyn_cast<SCEVAddRecExpr>(Dst);
if (SrcAddRec && DstAddRec) {
@@ -2208,8 +2273,8 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst,
bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst,
const SmallBitVector &Loops,
FullDependence &Result) const {
- DEBUG(dbgs() << " src = " << *Src << "\n");
- DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << " src = " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n");
Result.Consistent = false;
return gcdMIVtest(Src, Dst, Result) ||
banerjeeMIVtest(Src, Dst, Loops, Result);
@@ -2249,7 +2314,7 @@ const SCEVConstant *getConstantPart(const SCEV *Expr) {
// to "a common divisor".
bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
FullDependence &Result) const {
- DEBUG(dbgs() << "starting gcd\n");
+ LLVM_DEBUG(dbgs() << "starting gcd\n");
++GCDapplications;
unsigned BitWidth = SE->getTypeSizeInBits(Src->getType());
APInt RunningGCD = APInt::getNullValue(BitWidth);
@@ -2294,7 +2359,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
APInt ExtraGCD = APInt::getNullValue(BitWidth);
const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
- DEBUG(dbgs() << " Delta = " << *Delta << "\n");
+ LLVM_DEBUG(dbgs() << " Delta = " << *Delta << "\n");
const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Delta);
if (const SCEVAddExpr *Sum = dyn_cast<SCEVAddExpr>(Delta)) {
// If Delta is a sum of products, we may be able to make further progress.
@@ -2321,11 +2386,11 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
if (!Constant)
return false;
APInt ConstDelta = cast<SCEVConstant>(Constant)->getAPInt();
- DEBUG(dbgs() << " ConstDelta = " << ConstDelta << "\n");
+ LLVM_DEBUG(dbgs() << " ConstDelta = " << ConstDelta << "\n");
if (ConstDelta == 0)
return false;
RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ExtraGCD);
- DEBUG(dbgs() << " RunningGCD = " << RunningGCD << "\n");
+ LLVM_DEBUG(dbgs() << " RunningGCD = " << RunningGCD << "\n");
APInt Remainder = ConstDelta.srem(RunningGCD);
if (Remainder != 0) {
++GCDindependence;
@@ -2344,7 +2409,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
// Given A[5*i + 10*j*M + 9*M*N] and A[15*i + 20*j*M - 21*N*M + 5],
// we need to remember that the constant part is 5 and the RunningGCD should
// be initialized to ExtraGCD = 30.
- DEBUG(dbgs() << " ExtraGCD = " << ExtraGCD << '\n');
+ LLVM_DEBUG(dbgs() << " ExtraGCD = " << ExtraGCD << '\n');
bool Improved = false;
Coefficients = Src;
@@ -2399,10 +2464,10 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
continue;
APInt ConstCoeff = Constant->getAPInt();
RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
- DEBUG(dbgs() << "\tRunningGCD = " << RunningGCD << "\n");
+ LLVM_DEBUG(dbgs() << "\tRunningGCD = " << RunningGCD << "\n");
if (RunningGCD != 0) {
Remainder = ConstDelta.srem(RunningGCD);
- DEBUG(dbgs() << "\tRemainder = " << Remainder << "\n");
+ LLVM_DEBUG(dbgs() << "\tRemainder = " << Remainder << "\n");
if (Remainder != 0) {
unsigned Level = mapSrcLoop(CurLoop);
Result.DV[Level - 1].Direction &= unsigned(~Dependence::DVEntry::EQ);
@@ -2412,7 +2477,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
}
if (Improved)
++GCDsuccesses;
- DEBUG(dbgs() << "all done\n");
+ LLVM_DEBUG(dbgs() << "all done\n");
return false;
}
@@ -2453,35 +2518,35 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
const SmallBitVector &Loops,
FullDependence &Result) const {
- DEBUG(dbgs() << "starting Banerjee\n");
+ LLVM_DEBUG(dbgs() << "starting Banerjee\n");
++BanerjeeApplications;
- DEBUG(dbgs() << " Src = " << *Src << '\n');
+ LLVM_DEBUG(dbgs() << " Src = " << *Src << '\n');
const SCEV *A0;
CoefficientInfo *A = collectCoeffInfo(Src, true, A0);
- DEBUG(dbgs() << " Dst = " << *Dst << '\n');
+ LLVM_DEBUG(dbgs() << " Dst = " << *Dst << '\n');
const SCEV *B0;
CoefficientInfo *B = collectCoeffInfo(Dst, false, B0);
BoundInfo *Bound = new BoundInfo[MaxLevels + 1];
const SCEV *Delta = SE->getMinusSCEV(B0, A0);
- DEBUG(dbgs() << "\tDelta = " << *Delta << '\n');
+ LLVM_DEBUG(dbgs() << "\tDelta = " << *Delta << '\n');
// Compute bounds for all the * directions.
- DEBUG(dbgs() << "\tBounds[*]\n");
+ LLVM_DEBUG(dbgs() << "\tBounds[*]\n");
for (unsigned K = 1; K <= MaxLevels; ++K) {
Bound[K].Iterations = A[K].Iterations ? A[K].Iterations : B[K].Iterations;
Bound[K].Direction = Dependence::DVEntry::ALL;
Bound[K].DirSet = Dependence::DVEntry::NONE;
findBoundsALL(A, B, Bound, K);
#ifndef NDEBUG
- DEBUG(dbgs() << "\t " << K << '\t');
+ LLVM_DEBUG(dbgs() << "\t " << K << '\t');
if (Bound[K].Lower[Dependence::DVEntry::ALL])
- DEBUG(dbgs() << *Bound[K].Lower[Dependence::DVEntry::ALL] << '\t');
+ LLVM_DEBUG(dbgs() << *Bound[K].Lower[Dependence::DVEntry::ALL] << '\t');
else
- DEBUG(dbgs() << "-inf\t");
+ LLVM_DEBUG(dbgs() << "-inf\t");
if (Bound[K].Upper[Dependence::DVEntry::ALL])
- DEBUG(dbgs() << *Bound[K].Upper[Dependence::DVEntry::ALL] << '\n');
+ LLVM_DEBUG(dbgs() << *Bound[K].Upper[Dependence::DVEntry::ALL] << '\n');
else
- DEBUG(dbgs() << "+inf\n");
+ LLVM_DEBUG(dbgs() << "+inf\n");
#endif
}
@@ -2537,23 +2602,23 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A,
const SCEV *Delta) const {
if (Level > CommonLevels) {
// record result
- DEBUG(dbgs() << "\t[");
+ LLVM_DEBUG(dbgs() << "\t[");
for (unsigned K = 1; K <= CommonLevels; ++K) {
if (Loops[K]) {
Bound[K].DirSet |= Bound[K].Direction;
#ifndef NDEBUG
switch (Bound[K].Direction) {
case Dependence::DVEntry::LT:
- DEBUG(dbgs() << " <");
+ LLVM_DEBUG(dbgs() << " <");
break;
case Dependence::DVEntry::EQ:
- DEBUG(dbgs() << " =");
+ LLVM_DEBUG(dbgs() << " =");
break;
case Dependence::DVEntry::GT:
- DEBUG(dbgs() << " >");
+ LLVM_DEBUG(dbgs() << " >");
break;
case Dependence::DVEntry::ALL:
- DEBUG(dbgs() << " *");
+ LLVM_DEBUG(dbgs() << " *");
break;
default:
llvm_unreachable("unexpected Bound[K].Direction");
@@ -2561,7 +2626,7 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A,
#endif
}
}
- DEBUG(dbgs() << " ]\n");
+ LLVM_DEBUG(dbgs() << " ]\n");
return 1;
}
if (Loops[Level]) {
@@ -2572,34 +2637,40 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A,
findBoundsGT(A, B, Bound, Level);
findBoundsEQ(A, B, Bound, Level);
#ifndef NDEBUG
- DEBUG(dbgs() << "\tBound for level = " << Level << '\n');
- DEBUG(dbgs() << "\t <\t");
+ LLVM_DEBUG(dbgs() << "\tBound for level = " << Level << '\n');
+ LLVM_DEBUG(dbgs() << "\t <\t");
if (Bound[Level].Lower[Dependence::DVEntry::LT])
- DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::LT] << '\t');
+ LLVM_DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::LT]
+ << '\t');
else
- DEBUG(dbgs() << "-inf\t");
+ LLVM_DEBUG(dbgs() << "-inf\t");
if (Bound[Level].Upper[Dependence::DVEntry::LT])
- DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::LT] << '\n');
+ LLVM_DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::LT]
+ << '\n');
else
- DEBUG(dbgs() << "+inf\n");
- DEBUG(dbgs() << "\t =\t");
+ LLVM_DEBUG(dbgs() << "+inf\n");
+ LLVM_DEBUG(dbgs() << "\t =\t");
if (Bound[Level].Lower[Dependence::DVEntry::EQ])
- DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::EQ] << '\t');
+ LLVM_DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::EQ]
+ << '\t');
else
- DEBUG(dbgs() << "-inf\t");
+ LLVM_DEBUG(dbgs() << "-inf\t");
if (Bound[Level].Upper[Dependence::DVEntry::EQ])
- DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::EQ] << '\n');
+ LLVM_DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::EQ]
+ << '\n');
else
- DEBUG(dbgs() << "+inf\n");
- DEBUG(dbgs() << "\t >\t");
+ LLVM_DEBUG(dbgs() << "+inf\n");
+ LLVM_DEBUG(dbgs() << "\t >\t");
if (Bound[Level].Lower[Dependence::DVEntry::GT])
- DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::GT] << '\t');
+ LLVM_DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::GT]
+ << '\t');
else
- DEBUG(dbgs() << "-inf\t");
+ LLVM_DEBUG(dbgs() << "-inf\t");
if (Bound[Level].Upper[Dependence::DVEntry::GT])
- DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::GT] << '\n');
+ LLVM_DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::GT]
+ << '\n');
else
- DEBUG(dbgs() << "+inf\n");
+ LLVM_DEBUG(dbgs() << "+inf\n");
#endif
}
@@ -2846,21 +2917,21 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag,
}
Constant = Subscript;
#ifndef NDEBUG
- DEBUG(dbgs() << "\tCoefficient Info\n");
+ LLVM_DEBUG(dbgs() << "\tCoefficient Info\n");
for (unsigned K = 1; K <= MaxLevels; ++K) {
- DEBUG(dbgs() << "\t " << K << "\t" << *CI[K].Coeff);
- DEBUG(dbgs() << "\tPos Part = ");
- DEBUG(dbgs() << *CI[K].PosPart);
- DEBUG(dbgs() << "\tNeg Part = ");
- DEBUG(dbgs() << *CI[K].NegPart);
- DEBUG(dbgs() << "\tUpper Bound = ");
+ LLVM_DEBUG(dbgs() << "\t " << K << "\t" << *CI[K].Coeff);
+ LLVM_DEBUG(dbgs() << "\tPos Part = ");
+ LLVM_DEBUG(dbgs() << *CI[K].PosPart);
+ LLVM_DEBUG(dbgs() << "\tNeg Part = ");
+ LLVM_DEBUG(dbgs() << *CI[K].NegPart);
+ LLVM_DEBUG(dbgs() << "\tUpper Bound = ");
if (CI[K].Iterations)
- DEBUG(dbgs() << *CI[K].Iterations);
+ LLVM_DEBUG(dbgs() << *CI[K].Iterations);
else
- DEBUG(dbgs() << "+inf");
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "+inf");
+ LLVM_DEBUG(dbgs() << '\n');
}
- DEBUG(dbgs() << "\t Constant = " << *Subscript << '\n');
+ LLVM_DEBUG(dbgs() << "\t Constant = " << *Subscript << '\n');
#endif
return CI;
}
@@ -2985,8 +3056,8 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst,
bool &Consistent) {
bool Result = false;
for (unsigned LI : Loops.set_bits()) {
- DEBUG(dbgs() << "\t Constraint[" << LI << "] is");
- DEBUG(Constraints[LI].dump(dbgs()));
+ LLVM_DEBUG(dbgs() << "\t Constraint[" << LI << "] is");
+ LLVM_DEBUG(Constraints[LI].dump(dbgs()));
if (Constraints[LI].isDistance())
Result |= propagateDistance(Src, Dst, Constraints[LI], Consistent);
else if (Constraints[LI].isLine())
@@ -3007,17 +3078,17 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst,
Constraint &CurConstraint,
bool &Consistent) {
const Loop *CurLoop = CurConstraint.getAssociatedLoop();
- DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
const SCEV *A_K = findCoefficient(Src, CurLoop);
if (A_K->isZero())
return false;
const SCEV *DA_K = SE->getMulExpr(A_K, CurConstraint.getD());
Src = SE->getMinusSCEV(Src, DA_K);
Src = zeroCoefficient(Src, CurLoop);
- DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
- DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
Dst = addToCoefficient(Dst, CurLoop, SE->getNegativeSCEV(A_K));
- DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
if (!findCoefficient(Dst, CurLoop)->isZero())
Consistent = false;
return true;
@@ -3036,9 +3107,10 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
const SCEV *A = CurConstraint.getA();
const SCEV *B = CurConstraint.getB();
const SCEV *C = CurConstraint.getC();
- DEBUG(dbgs() << "\t\tA = " << *A << ", B = " << *B << ", C = " << *C << "\n");
- DEBUG(dbgs() << "\t\tSrc = " << *Src << "\n");
- DEBUG(dbgs() << "\t\tDst = " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tA = " << *A << ", B = " << *B << ", C = " << *C
+ << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tSrc = " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tDst = " << *Dst << "\n");
if (A->isZero()) {
const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B);
const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
@@ -3094,8 +3166,8 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
if (!findCoefficient(Dst, CurLoop)->isZero())
Consistent = false;
}
- DEBUG(dbgs() << "\t\tnew Src = " << *Src << "\n");
- DEBUG(dbgs() << "\t\tnew Dst = " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tnew Src = " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tnew Dst = " << *Dst << "\n");
return true;
}
@@ -3110,13 +3182,13 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst,
const SCEV *AP_K = findCoefficient(Dst, CurLoop);
const SCEV *XA_K = SE->getMulExpr(A_K, CurConstraint.getX());
const SCEV *YAP_K = SE->getMulExpr(AP_K, CurConstraint.getY());
- DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
Src = SE->getAddExpr(Src, SE->getMinusSCEV(XA_K, YAP_K));
Src = zeroCoefficient(Src, CurLoop);
- DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
- DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
Dst = zeroCoefficient(Dst, CurLoop);
- DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
return true;
}
@@ -3124,8 +3196,8 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst,
// Update direction vector entry based on the current constraint.
void DependenceInfo::updateDirection(Dependence::DVEntry &Level,
const Constraint &CurConstraint) const {
- DEBUG(dbgs() << "\tUpdate direction, constraint =");
- DEBUG(CurConstraint.dump(dbgs()));
+ LLVM_DEBUG(dbgs() << "\tUpdate direction, constraint =");
+ LLVM_DEBUG(CurConstraint.dump(dbgs()));
if (CurConstraint.isAny())
; // use defaults
else if (CurConstraint.isDistance()) {
@@ -3177,8 +3249,10 @@ void DependenceInfo::updateDirection(Dependence::DVEntry &Level,
/// for each loop level.
bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
SmallVectorImpl<Subscript> &Pair) {
- Value *SrcPtr = getPointerOperand(Src);
- Value *DstPtr = getPointerOperand(Dst);
+ assert(isLoadOrStore(Src) && "instruction is not load or store");
+ assert(isLoadOrStore(Dst) && "instruction is not load or store");
+ Value *SrcPtr = getLoadStorePointerOperand(Src);
+ Value *DstPtr = getLoadStorePointerOperand(Dst);
Loop *SrcLoop = LI->getLoopFor(Src->getParent());
Loop *DstLoop = LI->getLoopFor(Dst->getParent());
@@ -3230,14 +3304,34 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
int size = SrcSubscripts.size();
- DEBUG({
- dbgs() << "\nSrcSubscripts: ";
+ // Statically check that the array bounds are in-range. There is no size for
+ // the first subscript, and it cannot overflow into another subscript, so it
+ // is always safe. Each remaining subscript must satisfy
+ // 0 <= subscript[i] < bound, for both src and dst.
+ // FIXME: It may be better to record these sizes and add them as constraints
+ // to the dependency checks.
+ for (int i = 1; i < size; ++i) {
+ if (!isKnownNonNegative(SrcSubscripts[i], SrcPtr))
+ return false;
+
+ if (!isKnownLessThan(SrcSubscripts[i], Sizes[i - 1]))
+ return false;
+
+ if (!isKnownNonNegative(DstSubscripts[i], DstPtr))
+ return false;
+
+ if (!isKnownLessThan(DstSubscripts[i], Sizes[i - 1]))
+ return false;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "\nSrcSubscripts: ";
for (int i = 0; i < size; i++)
dbgs() << *SrcSubscripts[i];
dbgs() << "\nDstSubscripts: ";
for (int i = 0; i < size; i++)
dbgs() << *DstSubscripts[i];
- });
+ });
// The delinearization transforms a single-subscript MIV dependence test into
// a multi-subscript SIV dependence test that is easier to compute. So we
@@ -3248,13 +3342,6 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
Pair[i].Src = SrcSubscripts[i];
Pair[i].Dst = DstSubscripts[i];
unifySubscriptType(&Pair[i]);
-
- // FIXME: we should record the bounds SrcSizes[i] and DstSizes[i] that the
- // delinearization has found, and add these constraints to the dependence
- // check to avoid memory accesses overflow from one dimension into another.
- // This is related to the problem of determining the existence of data
- // dependences in array accesses using a different number of subscripts: in
- // C one can access an array A[100][100]; as A[0][9999], *A[9999], etc.
}
return true;
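
The range checks added above take over the job described by the FIXME deleted just before this point: a per-dimension dependence test is only sound if every delinearized subscript is known to stay inside its dimension, since a flat access can otherwise reach into a neighbouring row. A hypothetical illustration (not part of the patch):

    // Hypothetical example: both accesses below touch the same element of A,
    // so comparing the subscript pairs (0,150) and (1,50) dimension by
    // dimension would wrongly report independence unless the inner subscript
    // is first proven to lie in [0, 100).
    int A[100][100];
    void f() {
      A[0][150] = 1;      // inner index overflows row 0 into row 1
      int x = A[1][50];   // reads the element written above
      (void)x;
    }
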
@@ -3299,23 +3386,26 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
if (!isLoadOrStore(Src) || !isLoadOrStore(Dst)) {
// can only analyze simple loads and stores, i.e., no calls, invokes, etc.
- DEBUG(dbgs() << "can only handle simple loads and stores\n");
+ LLVM_DEBUG(dbgs() << "can only handle simple loads and stores\n");
return make_unique<Dependence>(Src, Dst);
}
- Value *SrcPtr = getPointerOperand(Src);
- Value *DstPtr = getPointerOperand(Dst);
+ assert(isLoadOrStore(Src) && "instruction is not load or store");
+ assert(isLoadOrStore(Dst) && "instruction is not load or store");
+ Value *SrcPtr = getLoadStorePointerOperand(Src);
+ Value *DstPtr = getLoadStorePointerOperand(Dst);
- switch (underlyingObjectsAlias(AA, F->getParent()->getDataLayout(), DstPtr,
- SrcPtr)) {
+ switch (underlyingObjectsAlias(AA, F->getParent()->getDataLayout(),
+ MemoryLocation::get(Dst),
+ MemoryLocation::get(Src))) {
case MayAlias:
case PartialAlias:
// cannot analyse objects if we don't understand their aliasing.
- DEBUG(dbgs() << "can't analyze may or partial alias\n");
+ LLVM_DEBUG(dbgs() << "can't analyze may or partial alias\n");
return make_unique<Dependence>(Src, Dst);
case NoAlias:
// If the objects noalias, they are distinct, accesses are independent.
- DEBUG(dbgs() << "no alias\n");
+ LLVM_DEBUG(dbgs() << "no alias\n");
return nullptr;
case MustAlias:
break; // The underlying objects alias; test accesses for dependence.
@@ -3323,56 +3413,24 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
// establish loop nesting levels
establishNestingLevels(Src, Dst);
- DEBUG(dbgs() << " common nesting levels = " << CommonLevels << "\n");
- DEBUG(dbgs() << " maximum nesting levels = " << MaxLevels << "\n");
+ LLVM_DEBUG(dbgs() << " common nesting levels = " << CommonLevels << "\n");
+ LLVM_DEBUG(dbgs() << " maximum nesting levels = " << MaxLevels << "\n");
FullDependence Result(Src, Dst, PossiblyLoopIndependent, CommonLevels);
++TotalArrayPairs;
- // See if there are GEPs we can use.
- bool UsefulGEP = false;
- GEPOperator *SrcGEP = dyn_cast<GEPOperator>(SrcPtr);
- GEPOperator *DstGEP = dyn_cast<GEPOperator>(DstPtr);
- if (SrcGEP && DstGEP &&
- SrcGEP->getPointerOperandType() == DstGEP->getPointerOperandType()) {
- const SCEV *SrcPtrSCEV = SE->getSCEV(SrcGEP->getPointerOperand());
- const SCEV *DstPtrSCEV = SE->getSCEV(DstGEP->getPointerOperand());
- DEBUG(dbgs() << " SrcPtrSCEV = " << *SrcPtrSCEV << "\n");
- DEBUG(dbgs() << " DstPtrSCEV = " << *DstPtrSCEV << "\n");
-
- UsefulGEP = isLoopInvariant(SrcPtrSCEV, LI->getLoopFor(Src->getParent())) &&
- isLoopInvariant(DstPtrSCEV, LI->getLoopFor(Dst->getParent())) &&
- (SrcGEP->getNumOperands() == DstGEP->getNumOperands()) &&
- isKnownPredicate(CmpInst::ICMP_EQ, SrcPtrSCEV, DstPtrSCEV);
- }
- unsigned Pairs = UsefulGEP ? SrcGEP->idx_end() - SrcGEP->idx_begin() : 1;
- SmallVector<Subscript, 4> Pair(Pairs);
- if (UsefulGEP) {
- DEBUG(dbgs() << " using GEPs\n");
- unsigned P = 0;
- for (GEPOperator::const_op_iterator SrcIdx = SrcGEP->idx_begin(),
- SrcEnd = SrcGEP->idx_end(),
- DstIdx = DstGEP->idx_begin();
- SrcIdx != SrcEnd;
- ++SrcIdx, ++DstIdx, ++P) {
- Pair[P].Src = SE->getSCEV(*SrcIdx);
- Pair[P].Dst = SE->getSCEV(*DstIdx);
- unifySubscriptType(&Pair[P]);
- }
- }
- else {
- DEBUG(dbgs() << " ignoring GEPs\n");
- const SCEV *SrcSCEV = SE->getSCEV(SrcPtr);
- const SCEV *DstSCEV = SE->getSCEV(DstPtr);
- DEBUG(dbgs() << " SrcSCEV = " << *SrcSCEV << "\n");
- DEBUG(dbgs() << " DstSCEV = " << *DstSCEV << "\n");
- Pair[0].Src = SrcSCEV;
- Pair[0].Dst = DstSCEV;
- }
+ unsigned Pairs = 1;
+ SmallVector<Subscript, 2> Pair(Pairs);
+ const SCEV *SrcSCEV = SE->getSCEV(SrcPtr);
+ const SCEV *DstSCEV = SE->getSCEV(DstPtr);
+ LLVM_DEBUG(dbgs() << " SrcSCEV = " << *SrcSCEV << "\n");
+ LLVM_DEBUG(dbgs() << " DstSCEV = " << *DstSCEV << "\n");
+ Pair[0].Src = SrcSCEV;
+ Pair[0].Dst = DstSCEV;
- if (Delinearize && CommonLevels > 1) {
+ if (Delinearize) {
if (tryDelinearize(Src, Dst, Pair)) {
- DEBUG(dbgs() << " delinearized GEP\n");
+ LLVM_DEBUG(dbgs() << " delinearized\n");
Pairs = Pair.size();
}
}
@@ -3388,12 +3446,12 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
Pair[P].Loops);
Pair[P].GroupLoops = Pair[P].Loops;
Pair[P].Group.set(P);
- DEBUG(dbgs() << " subscript " << P << "\n");
- DEBUG(dbgs() << "\tsrc = " << *Pair[P].Src << "\n");
- DEBUG(dbgs() << "\tdst = " << *Pair[P].Dst << "\n");
- DEBUG(dbgs() << "\tclass = " << Pair[P].Classification << "\n");
- DEBUG(dbgs() << "\tloops = ");
- DEBUG(dumpSmallBitVector(Pair[P].Loops));
+ LLVM_DEBUG(dbgs() << " subscript " << P << "\n");
+ LLVM_DEBUG(dbgs() << "\tsrc = " << *Pair[P].Src << "\n");
+ LLVM_DEBUG(dbgs() << "\tdst = " << *Pair[P].Dst << "\n");
+ LLVM_DEBUG(dbgs() << "\tclass = " << Pair[P].Classification << "\n");
+ LLVM_DEBUG(dbgs() << "\tloops = ");
+ LLVM_DEBUG(dumpSmallBitVector(Pair[P].Loops));
}
SmallBitVector Separable(Pairs);
@@ -3498,25 +3556,25 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
}
}
- DEBUG(dbgs() << " Separable = ");
- DEBUG(dumpSmallBitVector(Separable));
- DEBUG(dbgs() << " Coupled = ");
- DEBUG(dumpSmallBitVector(Coupled));
+ LLVM_DEBUG(dbgs() << " Separable = ");
+ LLVM_DEBUG(dumpSmallBitVector(Separable));
+ LLVM_DEBUG(dbgs() << " Coupled = ");
+ LLVM_DEBUG(dumpSmallBitVector(Coupled));
Constraint NewConstraint;
NewConstraint.setAny(SE);
// test separable subscripts
for (unsigned SI : Separable.set_bits()) {
- DEBUG(dbgs() << "testing subscript " << SI);
+ LLVM_DEBUG(dbgs() << "testing subscript " << SI);
switch (Pair[SI].Classification) {
case Subscript::ZIV:
- DEBUG(dbgs() << ", ZIV\n");
+ LLVM_DEBUG(dbgs() << ", ZIV\n");
if (testZIV(Pair[SI].Src, Pair[SI].Dst, Result))
return nullptr;
break;
case Subscript::SIV: {
- DEBUG(dbgs() << ", SIV\n");
+ LLVM_DEBUG(dbgs() << ", SIV\n");
unsigned Level;
const SCEV *SplitIter = nullptr;
if (testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint,
@@ -3525,12 +3583,12 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
break;
}
case Subscript::RDIV:
- DEBUG(dbgs() << ", RDIV\n");
+ LLVM_DEBUG(dbgs() << ", RDIV\n");
if (testRDIV(Pair[SI].Src, Pair[SI].Dst, Result))
return nullptr;
break;
case Subscript::MIV:
- DEBUG(dbgs() << ", MIV\n");
+ LLVM_DEBUG(dbgs() << ", MIV\n");
if (testMIV(Pair[SI].Src, Pair[SI].Dst, Pair[SI].Loops, Result))
return nullptr;
break;
@@ -3541,20 +3599,20 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
if (Coupled.count()) {
// test coupled subscript groups
- DEBUG(dbgs() << "starting on coupled subscripts\n");
- DEBUG(dbgs() << "MaxLevels + 1 = " << MaxLevels + 1 << "\n");
+ LLVM_DEBUG(dbgs() << "starting on coupled subscripts\n");
+ LLVM_DEBUG(dbgs() << "MaxLevels + 1 = " << MaxLevels + 1 << "\n");
SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
for (unsigned II = 0; II <= MaxLevels; ++II)
Constraints[II].setAny(SE);
for (unsigned SI : Coupled.set_bits()) {
- DEBUG(dbgs() << "testing subscript group " << SI << " { ");
+ LLVM_DEBUG(dbgs() << "testing subscript group " << SI << " { ");
SmallBitVector Group(Pair[SI].Group);
SmallBitVector Sivs(Pairs);
SmallBitVector Mivs(Pairs);
SmallBitVector ConstrainedLevels(MaxLevels + 1);
SmallVector<Subscript *, 4> PairsInGroup;
for (unsigned SJ : Group.set_bits()) {
- DEBUG(dbgs() << SJ << " ");
+ LLVM_DEBUG(dbgs() << SJ << " ");
if (Pair[SJ].Classification == Subscript::SIV)
Sivs.set(SJ);
else
@@ -3562,15 +3620,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
PairsInGroup.push_back(&Pair[SJ]);
}
unifySubscriptType(PairsInGroup);
- DEBUG(dbgs() << "}\n");
+ LLVM_DEBUG(dbgs() << "}\n");
while (Sivs.any()) {
bool Changed = false;
for (unsigned SJ : Sivs.set_bits()) {
- DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n");
+ LLVM_DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n");
// SJ is an SIV subscript that's part of the current coupled group
unsigned Level;
const SCEV *SplitIter = nullptr;
- DEBUG(dbgs() << "SIV\n");
+ LLVM_DEBUG(dbgs() << "SIV\n");
if (testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level, Result, NewConstraint,
SplitIter))
return nullptr;
@@ -3586,15 +3644,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
}
if (Changed) {
// propagate, possibly creating new SIVs and ZIVs
- DEBUG(dbgs() << " propagating\n");
- DEBUG(dbgs() << "\tMivs = ");
- DEBUG(dumpSmallBitVector(Mivs));
+ LLVM_DEBUG(dbgs() << " propagating\n");
+ LLVM_DEBUG(dbgs() << "\tMivs = ");
+ LLVM_DEBUG(dumpSmallBitVector(Mivs));
for (unsigned SJ : Mivs.set_bits()) {
// SJ is an MIV subscript that's part of the current coupled group
- DEBUG(dbgs() << "\tSJ = " << SJ << "\n");
+ LLVM_DEBUG(dbgs() << "\tSJ = " << SJ << "\n");
if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops,
Constraints, Result.Consistent)) {
- DEBUG(dbgs() << "\t Changed\n");
+ LLVM_DEBUG(dbgs() << "\t Changed\n");
++DeltaPropagations;
Pair[SJ].Classification =
classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()),
@@ -3602,7 +3660,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
Pair[SJ].Loops);
switch (Pair[SJ].Classification) {
case Subscript::ZIV:
- DEBUG(dbgs() << "ZIV\n");
+ LLVM_DEBUG(dbgs() << "ZIV\n");
if (testZIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
return nullptr;
Mivs.reset(SJ);
@@ -3625,7 +3683,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
// test & propagate remaining RDIVs
for (unsigned SJ : Mivs.set_bits()) {
if (Pair[SJ].Classification == Subscript::RDIV) {
- DEBUG(dbgs() << "RDIV test\n");
+ LLVM_DEBUG(dbgs() << "RDIV test\n");
if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
return nullptr;
// I don't yet understand how to propagate RDIV results
@@ -3638,7 +3696,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
// Better to somehow test all remaining subscripts simultaneously.
for (unsigned SJ : Mivs.set_bits()) {
if (Pair[SJ].Classification == Subscript::MIV) {
- DEBUG(dbgs() << "MIV test\n");
+ LLVM_DEBUG(dbgs() << "MIV test\n");
if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result))
return nullptr;
}
@@ -3647,7 +3705,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
}
// update Result.DV from constraint vector
- DEBUG(dbgs() << " updating\n");
+ LLVM_DEBUG(dbgs() << " updating\n");
for (unsigned SJ : ConstrainedLevels.set_bits()) {
if (SJ > CommonLevels)
break;
@@ -3753,51 +3811,27 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
assert(Dst->mayReadFromMemory() || Dst->mayWriteToMemory());
assert(isLoadOrStore(Src));
assert(isLoadOrStore(Dst));
- Value *SrcPtr = getPointerOperand(Src);
- Value *DstPtr = getPointerOperand(Dst);
- assert(underlyingObjectsAlias(AA, F->getParent()->getDataLayout(), DstPtr,
- SrcPtr) == MustAlias);
+ Value *SrcPtr = getLoadStorePointerOperand(Src);
+ Value *DstPtr = getLoadStorePointerOperand(Dst);
+ assert(underlyingObjectsAlias(AA, F->getParent()->getDataLayout(),
+ MemoryLocation::get(Dst),
+ MemoryLocation::get(Src)) == MustAlias);
// establish loop nesting levels
establishNestingLevels(Src, Dst);
FullDependence Result(Src, Dst, false, CommonLevels);
- // See if there are GEPs we can use.
- bool UsefulGEP = false;
- GEPOperator *SrcGEP = dyn_cast<GEPOperator>(SrcPtr);
- GEPOperator *DstGEP = dyn_cast<GEPOperator>(DstPtr);
- if (SrcGEP && DstGEP &&
- SrcGEP->getPointerOperandType() == DstGEP->getPointerOperandType()) {
- const SCEV *SrcPtrSCEV = SE->getSCEV(SrcGEP->getPointerOperand());
- const SCEV *DstPtrSCEV = SE->getSCEV(DstGEP->getPointerOperand());
- UsefulGEP = isLoopInvariant(SrcPtrSCEV, LI->getLoopFor(Src->getParent())) &&
- isLoopInvariant(DstPtrSCEV, LI->getLoopFor(Dst->getParent())) &&
- (SrcGEP->getNumOperands() == DstGEP->getNumOperands());
- }
- unsigned Pairs = UsefulGEP ? SrcGEP->idx_end() - SrcGEP->idx_begin() : 1;
- SmallVector<Subscript, 4> Pair(Pairs);
- if (UsefulGEP) {
- unsigned P = 0;
- for (GEPOperator::const_op_iterator SrcIdx = SrcGEP->idx_begin(),
- SrcEnd = SrcGEP->idx_end(),
- DstIdx = DstGEP->idx_begin();
- SrcIdx != SrcEnd;
- ++SrcIdx, ++DstIdx, ++P) {
- Pair[P].Src = SE->getSCEV(*SrcIdx);
- Pair[P].Dst = SE->getSCEV(*DstIdx);
- }
- }
- else {
- const SCEV *SrcSCEV = SE->getSCEV(SrcPtr);
- const SCEV *DstSCEV = SE->getSCEV(DstPtr);
- Pair[0].Src = SrcSCEV;
- Pair[0].Dst = DstSCEV;
- }
+ unsigned Pairs = 1;
+ SmallVector<Subscript, 2> Pair(Pairs);
+ const SCEV *SrcSCEV = SE->getSCEV(SrcPtr);
+ const SCEV *DstSCEV = SE->getSCEV(DstPtr);
+ Pair[0].Src = SrcSCEV;
+ Pair[0].Dst = DstSCEV;
- if (Delinearize && CommonLevels > 1) {
+ if (Delinearize) {
if (tryDelinearize(Src, Dst, Pair)) {
- DEBUG(dbgs() << " delinearized GEP\n");
+ LLVM_DEBUG(dbgs() << " delinearized\n");
Pairs = Pair.size();
}
}
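
With the GEP-walking path removed, both depends() and getSplitIteration() start from a single subscript pair built from the pointer SCEVs and rely on delinearization (no longer gated on CommonLevels > 1) to split it back into per-dimension subscripts. A hypothetical source-level shape this is aimed at:

    // Hypothetical loop nest: the analysis sees one flattened address
    // expression, roughly A + (i*n + j)*sizeof(float); delinearization tries
    // to recover the subscripts {i, j} and the inner dimension size n from
    // that single SCEV before the per-subscript tests run.
    void scale(float *A, long n, long m) {
      for (long i = 0; i < m; ++i)
        for (long j = 0; j < n; ++j)
          A[i * n + j] *= 2.0f;
    }
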
diff --git a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp
index ac684ec18466..f5f1874c9303 100644
--- a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp
@@ -77,6 +77,8 @@
#include <vector>
using namespace llvm;
+#define DEBUG_TYPE "divergence"
+
namespace {
class DivergencePropagator {
@@ -299,6 +301,10 @@ bool DivergenceAnalysis::runOnFunction(Function &F) {
PDT, DivergentValues);
DP.populateWithSourcesOfDivergence();
DP.propagate();
+ LLVM_DEBUG(
+ dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
+ print(dbgs(), F.getParent())
+ );
return false;
}
@@ -318,12 +324,17 @@ void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
// Dumps all divergent values in F, arguments and then instructions.
for (auto &Arg : F->args()) {
- if (DivergentValues.count(&Arg))
- OS << "DIVERGENT: " << Arg << "\n";
+ OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " ");
+ OS << Arg << "\n";
}
// Iterate instructions using instructions() to ensure a deterministic order.
- for (auto &I : instructions(F)) {
- if (DivergentValues.count(&I))
- OS << "DIVERGENT:" << I << "\n";
+ for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) {
+ auto &BB = *BI;
+ OS << "\n " << BB.getName() << ":\n";
+ for (auto &I : BB.instructionsWithoutDebug()) {
+ OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " ");
+ OS << I << "\n";
+ }
}
+ OS << "\n";
}
diff --git a/contrib/llvm/lib/Analysis/DominanceFrontier.cpp b/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
index bb8caf4a5174..de7f62cf4ecd 100644
--- a/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
+++ b/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
@@ -9,6 +9,7 @@
#include "llvm/Analysis/DominanceFrontier.h"
#include "llvm/Analysis/DominanceFrontierImpl.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
diff --git a/contrib/llvm/lib/Analysis/EHPersonalities.cpp b/contrib/llvm/lib/Analysis/EHPersonalities.cpp
index b12ae9884e3d..2d35a3fa9118 100644
--- a/contrib/llvm/lib/Analysis/EHPersonalities.cpp
+++ b/contrib/llvm/lib/Analysis/EHPersonalities.cpp
@@ -25,20 +25,21 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) {
if (!F)
return EHPersonality::Unknown;
return StringSwitch<EHPersonality>(F->getName())
- .Case("__gnat_eh_personality", EHPersonality::GNU_Ada)
- .Case("__gxx_personality_v0", EHPersonality::GNU_CXX)
- .Case("__gxx_personality_seh0",EHPersonality::GNU_CXX)
- .Case("__gxx_personality_sj0", EHPersonality::GNU_CXX_SjLj)
- .Case("__gcc_personality_v0", EHPersonality::GNU_C)
- .Case("__gcc_personality_seh0",EHPersonality::GNU_C)
- .Case("__gcc_personality_sj0", EHPersonality::GNU_C_SjLj)
- .Case("__objc_personality_v0", EHPersonality::GNU_ObjC)
- .Case("_except_handler3", EHPersonality::MSVC_X86SEH)
- .Case("_except_handler4", EHPersonality::MSVC_X86SEH)
- .Case("__C_specific_handler", EHPersonality::MSVC_Win64SEH)
- .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX)
- .Case("ProcessCLRException", EHPersonality::CoreCLR)
- .Case("rust_eh_personality", EHPersonality::Rust)
+ .Case("__gnat_eh_personality", EHPersonality::GNU_Ada)
+ .Case("__gxx_personality_v0", EHPersonality::GNU_CXX)
+ .Case("__gxx_personality_seh0", EHPersonality::GNU_CXX)
+ .Case("__gxx_personality_sj0", EHPersonality::GNU_CXX_SjLj)
+ .Case("__gcc_personality_v0", EHPersonality::GNU_C)
+ .Case("__gcc_personality_seh0", EHPersonality::GNU_C)
+ .Case("__gcc_personality_sj0", EHPersonality::GNU_C_SjLj)
+ .Case("__objc_personality_v0", EHPersonality::GNU_ObjC)
+ .Case("_except_handler3", EHPersonality::MSVC_X86SEH)
+ .Case("_except_handler4", EHPersonality::MSVC_X86SEH)
+ .Case("__C_specific_handler", EHPersonality::MSVC_Win64SEH)
+ .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX)
+ .Case("ProcessCLRException", EHPersonality::CoreCLR)
+ .Case("rust_eh_personality", EHPersonality::Rust)
+ .Case("__gxx_wasm_personality_v0", EHPersonality::Wasm_CXX)
.Default(EHPersonality::Unknown);
}
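
Besides re-aligning the table, this hunk adds the new __gxx_wasm_personality_v0 entry. The lookup is the usual StringSwitch chain; a minimal sketch of that idiom, with illustrative names only:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"

    enum class Pers { CXX, C, Unknown };

    // Maps a personality-function name to an enum; the first matching .Case
    // wins, .Default covers everything else.
    static Pers classify(llvm::StringRef Name) {
      return llvm::StringSwitch<Pers>(Name)
          .Case("__gxx_personality_v0", Pers::CXX)
          .Case("__gcc_personality_v0", Pers::C)
          .Default(Pers::Unknown);
    }
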
@@ -55,6 +56,7 @@ StringRef llvm::getEHPersonalityName(EHPersonality Pers) {
case EHPersonality::MSVC_CXX: return "__CxxFrameHandler3";
case EHPersonality::CoreCLR: return "ProcessCLRException";
case EHPersonality::Rust: return "rust_eh_personality";
+ case EHPersonality::Wasm_CXX: return "__gxx_wasm_personality_v0";
case EHPersonality::Unknown: llvm_unreachable("Unknown EHPersonality!");
}
diff --git a/contrib/llvm/lib/Analysis/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp
index 94306d0f54ad..197aee9dacb7 100644
--- a/contrib/llvm/lib/Analysis/GlobalsModRef.cpp
+++ b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp
@@ -65,7 +65,7 @@ class GlobalsAAResult::FunctionInfo {
/// Build a wrapper struct that has 8-byte alignment. All heap allocations
/// should provide this much alignment at least, but this makes it clear we
/// specifically rely on this amount of alignment.
- struct LLVM_ALIGNAS(8) AlignedMap {
+ struct alignas(8) AlignedMap {
AlignedMap() {}
AlignedMap(const AlignedMap &Arg) : Map(Arg.Map) {}
GlobalInfoMapType Map;
@@ -584,6 +584,10 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
} else if (Function *Callee = CS.getCalledFunction()) {
// The callgraph doesn't include intrinsic calls.
if (Callee->isIntrinsic()) {
+ if (isa<DbgInfoIntrinsic>(I))
+ // Don't let dbg intrinsics affect alias info.
+ continue;
+
FunctionModRefBehavior Behaviour =
AAResultBase::getModRefBehavior(Callee);
FI.addModRefInfo(createModRefInfo(Behaviour));
diff --git a/contrib/llvm/lib/Analysis/IVUsers.cpp b/contrib/llvm/lib/Analysis/IVUsers.cpp
index c30feb973e60..609e5e3a1448 100644
--- a/contrib/llvm/lib/Analysis/IVUsers.cpp
+++ b/contrib/llvm/lib/Analysis/IVUsers.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -234,13 +235,13 @@ bool IVUsers::AddUsersImpl(Instruction *I,
if (LI->getLoopFor(User->getParent()) != L) {
if (isa<PHINode>(User) || Processed.count(User) ||
!AddUsersImpl(User, SimpleLoopNests)) {
- DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n'
- << " OF SCEV: " << *ISE << '\n');
+ LLVM_DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n'
+ << " OF SCEV: " << *ISE << '\n');
AddUserToIVUsers = true;
}
} else if (Processed.count(User) || !AddUsersImpl(User, SimpleLoopNests)) {
- DEBUG(dbgs() << "FOUND USER: " << *User << '\n'
- << " OF SCEV: " << *ISE << '\n');
+ LLVM_DEBUG(dbgs() << "FOUND USER: " << *User << '\n'
+ << " OF SCEV: " << *ISE << '\n');
AddUserToIVUsers = true;
}
@@ -273,14 +274,15 @@ bool IVUsers::AddUsersImpl(Instruction *I,
// If we normalized the expression, but denormalization doesn't give the
// original one, discard this user.
if (OriginalISE != DenormalizedISE) {
- DEBUG(dbgs() << " DISCARDING (NORMALIZATION ISN'T INVERTIBLE): "
- << *ISE << '\n');
+ LLVM_DEBUG(dbgs()
+ << " DISCARDING (NORMALIZATION ISN'T INVERTIBLE): "
+ << *ISE << '\n');
IVUses.pop_back();
return false;
}
}
- DEBUG(if (SE->getSCEV(I) != ISE)
- dbgs() << " NORMALIZED TO: " << *ISE << '\n');
+ LLVM_DEBUG(if (SE->getSCEV(I) != ISE) dbgs()
+ << " NORMALIZED TO: " << *ISE << '\n');
}
}
return true;
diff --git a/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index c11176bbb9c8..4659c0a00629 100644
--- a/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -71,19 +71,19 @@ uint32_t ICallPromotionAnalysis::getProfitablePromotionCandidates(
const Instruction *Inst, uint32_t NumVals, uint64_t TotalCount) {
ArrayRef<InstrProfValueData> ValueDataRef(ValueDataArray.get(), NumVals);
- DEBUG(dbgs() << " \nWork on callsite " << *Inst << " Num_targets: " << NumVals
- << "\n");
+ LLVM_DEBUG(dbgs() << " \nWork on callsite " << *Inst
+ << " Num_targets: " << NumVals << "\n");
uint32_t I = 0;
uint64_t RemainingCount = TotalCount;
for (; I < MaxNumPromotions && I < NumVals; I++) {
uint64_t Count = ValueDataRef[I].Count;
assert(Count <= RemainingCount);
- DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
- << " Target_func: " << ValueDataRef[I].Value << "\n");
+ LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << ValueDataRef[I].Value << "\n");
if (!isPromotionProfitable(Count, TotalCount, RemainingCount)) {
- DEBUG(dbgs() << " Not promote: Cold target.\n");
+ LLVM_DEBUG(dbgs() << " Not promote: Cold target.\n");
return I;
}
RemainingCount -= Count;
diff --git a/contrib/llvm/lib/Analysis/InlineCost.cpp b/contrib/llvm/lib/Analysis/InlineCost.cpp
index b0cb29203a5a..a6cccc3b5910 100644
--- a/contrib/llvm/lib/Analysis/InlineCost.cpp
+++ b/contrib/llvm/lib/Analysis/InlineCost.cpp
@@ -26,6 +26,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
@@ -135,7 +136,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
bool ContainsNoDuplicateCall;
bool HasReturn;
bool HasIndirectBr;
- bool HasFrameEscape;
+ bool HasUninlineableIntrinsic;
+ bool UsesVarArgs;
/// Number of bytes allocated statically by the callee.
uint64_t AllocatedSize;
@@ -280,12 +282,13 @@ public:
IsCallerRecursive(false), IsRecursiveCall(false),
ExposesReturnsTwice(false), HasDynamicAlloca(false),
ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
- HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
- NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
- EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
- NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
- NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
- SROACostSavings(0), SROACostSavingsLost(0) {}
+ HasUninlineableIntrinsic(false), UsesVarArgs(false), AllocatedSize(0),
+ NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
+ SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
+ NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+ NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+ NumInstructionsSimplified(0), SROACostSavings(0),
+ SROACostSavingsLost(0) {}
bool analyzeCall(CallSite CS);
@@ -308,12 +311,12 @@ public:
} // namespace
-/// \brief Test whether the given value is an Alloca-derived function argument.
+/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
return SROAArgValues.count(V);
}
-/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
+/// Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
@@ -329,7 +332,7 @@ bool CallAnalyzer::lookupSROAArgAndCost(
return CostIt != SROAArgCosts.end();
}
-/// \brief Disable SROA for the candidate marked by this cost iterator.
+/// Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
@@ -343,7 +346,7 @@ void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
disableLoadElimination();
}
-/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
+/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
Value *SROAArg;
DenseMap<Value *, int>::iterator CostIt;
@@ -351,7 +354,7 @@ void CallAnalyzer::disableSROA(Value *V) {
disableSROA(CostIt);
}
-/// \brief Accumulate the given cost for a particular SROA candidate.
+/// Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
int InstructionCost) {
CostIt->second += InstructionCost;
@@ -366,12 +369,12 @@ void CallAnalyzer::disableLoadElimination() {
}
}
-/// \brief Accumulate a constant GEP offset into an APInt if possible.
+/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
- unsigned IntPtrWidth = DL.getPointerSizeInBits();
+ unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
assert(IntPtrWidth == Offset.getBitWidth());
for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
@@ -399,7 +402,7 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
return true;
}
-/// \brief Use TTI to check whether a GEP is free.
+/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
@@ -450,8 +453,12 @@ bool CallAnalyzer::visitPHI(PHINode &I) {
// SROA if it *might* be used in an inappropriate manner.
// Phi nodes are always zero-cost.
-
- APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits());
+ // FIXME: Pointer sizes may differ between different address spaces, so do we
+ // need to use the correct address space in the call to getPointerSizeInBits here?
+ // Or could we skip the getPointerSizeInBits call completely? As far as I can
+ // see the ZeroOffset is used as a dummy value, so we can probably use any
+ // bit width for the ZeroOffset?
+ APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
bool CheckSROA = I.getType()->isPointerTy();
// Track the constant or pointer with constant offset we've seen so far.
@@ -536,7 +543,7 @@ bool CallAnalyzer::visitPHI(PHINode &I) {
return true;
}
-/// \brief Check we can fold GEPs of constant-offset call site argument pointers.
+/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
@@ -641,7 +648,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
// Track base/offset pairs when converted to a plain integer provided the
// integer is large enough to represent the pointer.
unsigned IntegerSize = I.getType()->getScalarSizeInBits();
- if (IntegerSize >= DL.getPointerSizeInBits()) {
+ unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
+ if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
std::pair<Value *, APInt> BaseAndOffset =
ConstantOffsetPtrs.lookup(I.getOperand(0));
if (BaseAndOffset.first)
@@ -674,7 +682,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
// modifications provided the integer is not too large.
Value *Op = I.getOperand(0);
unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
- if (IntegerSize <= DL.getPointerSizeInBits()) {
+ if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
if (BaseAndOffset.first)
ConstantOffsetPtrs[&I] = BaseAndOffset;
@@ -913,14 +921,14 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
if (!Caller->optForSize() && HotCallSiteThreshold) {
- DEBUG(dbgs() << "Hot callsite.\n");
+ LLVM_DEBUG(dbgs() << "Hot callsite.\n");
// FIXME: This should update the threshold only if it exceeds the
// current threshold, but AutoFDO + ThinLTO currently relies on this
// behavior to prevent inlining of hot callsites during ThinLTO
// compile phase.
Threshold = HotCallSiteThreshold.getValue();
} else if (isColdCallSite(CS, CallerBFI)) {
- DEBUG(dbgs() << "Cold callsite.\n");
+ LLVM_DEBUG(dbgs() << "Cold callsite.\n");
// Do not apply bonuses for a cold callsite including the
// LastCallToStatic bonus. While this bonus might result in code size
// reduction, it can cause the size of a non-cold caller to increase
@@ -931,13 +939,13 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
// Use callee's global profile information only if we have no way of
// determining this via callsite information.
if (PSI->isFunctionEntryHot(&Callee)) {
- DEBUG(dbgs() << "Hot callee.\n");
+ LLVM_DEBUG(dbgs() << "Hot callee.\n");
// If callsite hotness can not be determined, we may still know
// that the callee is hot and treat it as a weaker hint for threshold
// increase.
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
} else if (PSI->isFunctionEntryCold(&Callee)) {
- DEBUG(dbgs() << "Cold callee.\n");
+ LLVM_DEBUG(dbgs() << "Cold callee.\n");
// Do not apply bonuses for a cold callee including the
// LastCallToStatic bonus. While this bonus might result in code size
// reduction, it can cause the size of a non-cold caller to increase
@@ -1155,7 +1163,7 @@ bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
return false;
}
-/// \brief Try to simplify a call site.
+/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
@@ -1225,8 +1233,13 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
disableLoadElimination();
// SROA can usually chew through these intrinsics, but they aren't free.
return false;
+ case Intrinsic::icall_branch_funnel:
case Intrinsic::localescape:
- HasFrameEscape = true;
+ HasUninlineableIntrinsic = true;
+ return false;
+ case Intrinsic::vastart:
+ case Intrinsic::vaend:
+ UsesVarArgs = true;
return false;
}
}
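
The new intrinsic cases mark a callee as uninlinable when it contains llvm.icall.branch.funnel or the va_start/va_end pair (Intrinsic::vastart / Intrinsic::vaend); the matching rejection is added to isInlineViable() further down. A hypothetical callee of the varargs kind whose body would trip this check:

    #include <cstdarg>

    // va_start/va_end here are lowered to the llvm.va_start / llvm.va_end
    // intrinsics, so the cost analysis now refuses to inline this function
    // (presumably because the inlined body would lose its own variadic frame).
    int sum(int n, ...) {
      va_list ap;
      va_start(ap, n);
      int s = 0;
      for (int i = 0; i < n; ++i)
        s += va_arg(ap, int);
      va_end(ap);
      return s;
    }
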
@@ -1521,7 +1534,7 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
return false;
}
-/// \brief Analyze a basic block for its contribution to the inline cost.
+/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
@@ -1562,7 +1575,7 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
using namespace ore;
// If the visit this instruction detected an uninlinable pattern, abort.
if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
- HasIndirectBr || HasFrameEscape) {
+ HasIndirectBr || HasUninlineableIntrinsic || UsesVarArgs) {
if (ORE)
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
@@ -1598,7 +1611,7 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
return true;
}
-/// \brief Compute the base pointer and cumulative constant offsets for V.
+/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
@@ -1608,7 +1621,8 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
if (!V->getType()->isPointerTy())
return nullptr;
- unsigned IntPtrWidth = DL.getPointerSizeInBits();
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
APInt Offset = APInt::getNullValue(IntPtrWidth);
// Even though we don't look through PHI nodes, we could be called on an
@@ -1632,11 +1646,11 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
} while (Visited.insert(V).second);
- Type *IntPtrTy = DL.getIntPtrType(V->getContext());
+ Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}
-/// \brief Find dead blocks due to deleted CFG edges during inlining.
+/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
@@ -1674,7 +1688,7 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
}
}
-/// \brief Analyze a call site for potential inlining.
+/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
@@ -1867,7 +1881,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-/// \brief Dump stats about this call's analysis.
+/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
DEBUG_PRINT_STAT(NumConstantArgs);
@@ -1887,7 +1901,7 @@ LLVM_DUMP_METHOD void CallAnalyzer::dump() {
}
#endif
-/// \brief Test that there are no attribute conflicts between Caller and Callee
+/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
Function *Callee,
@@ -1904,7 +1918,8 @@ int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
// size of the byval type by the target's pointer size.
PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
- unsigned PointerSize = DL.getPointerSizeInBits();
+ unsigned AS = PTy->getAddressSpace();
+ unsigned PointerSize = DL.getPointerSizeInBits(AS);
// Ceiling division.
unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
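
getCallsiteCost() now sizes byval copies using the pointer width of the argument's own address space. The store count itself is unchanged, a plain ceiling division; a worked instance with illustrative numbers:

    // A 20-byte (160-bit) byval struct copied with 64-bit pointer-sized stores
    // costs ceil(160 / 64) = 3 stores.
    constexpr unsigned TypeSize = 160, PointerSize = 64;
    static_assert((TypeSize + PointerSize - 1) / PointerSize == 3,
                  "ceiling division");
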
@@ -1948,6 +1963,19 @@ InlineCost llvm::getInlineCost(
if (!Callee)
return llvm::InlineCost::getNever();
+ // Never inline calls with byval arguments that do not have the alloca
+ // address space. Since byval arguments can be replaced with a copy to an
+ // alloca, the inlined code would need to be adjusted to handle that the
+ // argument is in the alloca address space (so it is a little bit complicated
+ // to solve).
+ unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
+ for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
+ if (CS.isByValArgument(I)) {
+ PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+ if (PTy->getAddressSpace() != AllocaAS)
+ return llvm::InlineCost::getNever();
+ }
+
// Calls to functions with always-inline attributes should be inlined
// whenever possible.
if (CS.hasFnAttr(Attribute::AlwaysInline)) {
@@ -1966,6 +1994,11 @@ InlineCost llvm::getInlineCost(
if (Caller->hasFnAttribute(Attribute::OptimizeNone))
return llvm::InlineCost::getNever();
+ // Don't inline a function that treats null pointer as valid into a caller
+ // that does not have this attribute.
+ if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
+ return llvm::InlineCost::getNever();
+
// Don't inline functions which can be interposed at link-time. Don't inline
// functions marked noinline or call sites marked noinline.
// Note: inlining non-exact non-interposable functions is fine, since we know
@@ -1974,14 +2007,14 @@ InlineCost llvm::getInlineCost(
CS.isNoInline())
return llvm::InlineCost::getNever();
- DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
- << "... (caller:" << Caller->getName() << ")\n");
+ LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
+ << "... (caller:" << Caller->getName() << ")\n");
CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
Params);
bool ShouldInline = CA.analyzeCall(CS);
- DEBUG(CA.dump());
+ LLVM_DEBUG(CA.dump());
// Check if there was a reason to force inlining or no inlining.
if (!ShouldInline && CA.getCost() < CA.getThreshold())
@@ -2015,12 +2048,21 @@ bool llvm::isInlineViable(Function &F) {
cast<CallInst>(CS.getInstruction())->canReturnTwice())
return false;
- // Disallow inlining functions that call @llvm.localescape. Doing this
- // correctly would require major changes to the inliner.
- if (CS.getCalledFunction() &&
- CS.getCalledFunction()->getIntrinsicID() ==
- llvm::Intrinsic::localescape)
- return false;
+ if (CS.getCalledFunction())
+ switch (CS.getCalledFunction()->getIntrinsicID()) {
+ default:
+ break;
+ // Disallow inlining of @llvm.icall.branch.funnel because current
+ // backend can't separate call targets from call arguments.
+ case llvm::Intrinsic::icall_branch_funnel:
+ // Disallow inlining functions that call @llvm.localescape. Doing this
+ // correctly would require major changes to the inliner.
+ case llvm::Intrinsic::localescape:
+ // Disallow inlining of functions that access VarArgs.
+ case llvm::Intrinsic::vastart:
+ case llvm::Intrinsic::vaend:
+ return false;
+ }
}
}
diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
index c814ff122e44..519d6d67be51 100644
--- a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -62,6 +62,8 @@ static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyCastInst(unsigned, Value *, Type *,
const SimplifyQuery &, unsigned);
+static Value *SimplifyGEPInst(Type *, ArrayRef<Value *>, const SimplifyQuery &,
+ unsigned);
/// For a boolean type or a vector of boolean type, return false or a vector
/// with every element false.
@@ -90,7 +92,7 @@ static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS,
}
/// Does the given value dominate the specified phi node?
-static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
+static bool valueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I)
// Arguments and constants dominate all instructions.
@@ -99,7 +101,7 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
// If we are processing instructions (and/or basic blocks) that have not been
// fully added to a function, the parent nodes may still be null. Simply
// return the conservative answer in these cases.
- if (!I->getParent() || !P->getParent() || !I->getParent()->getParent())
+ if (!I->getParent() || !P->getParent() || !I->getFunction())
return false;
// If we have a DominatorTree then do a precise test.
@@ -108,7 +110,7 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
// Otherwise, if the instruction is in the entry block and is not an invoke,
// then it obviously dominates all phi nodes.
- if (I->getParent() == &I->getParent()->getParent()->getEntryBlock() &&
+ if (I->getParent() == &I->getFunction()->getEntryBlock() &&
!isa<InvokeInst>(I))
return true;
@@ -443,13 +445,13 @@ static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS,
if (isa<PHINode>(LHS)) {
PI = cast<PHINode>(LHS);
// Bail out if RHS and the phi may be mutually interdependent due to a loop.
- if (!ValueDominatesPHI(RHS, PI, Q.DT))
+ if (!valueDominatesPHI(RHS, PI, Q.DT))
return nullptr;
} else {
assert(isa<PHINode>(RHS) && "No PHI instruction operand!");
PI = cast<PHINode>(RHS);
// Bail out if LHS and the phi may be mutually interdependent due to a loop.
- if (!ValueDominatesPHI(LHS, PI, Q.DT))
+ if (!valueDominatesPHI(LHS, PI, Q.DT))
return nullptr;
}
@@ -490,7 +492,7 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
PHINode *PI = cast<PHINode>(LHS);
// Bail out if RHS and the phi may be mutually interdependent due to a loop.
- if (!ValueDominatesPHI(RHS, PI, Q.DT))
+ if (!valueDominatesPHI(RHS, PI, Q.DT))
return nullptr;
// Evaluate the BinOp on the incoming phi values.
@@ -525,7 +527,7 @@ static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode,
/// Given operands for an Add, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
return C;
@@ -538,6 +540,10 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
if (match(Op1, m_Zero()))
return Op0;
+ // If the two operands are negations of each other, return 0.
+ if (isKnownNegation(Op0, Op1))
+ return Constant::getNullValue(Op0->getType());
+
// X + (Y - X) -> Y
// (Y - X) + X -> Y
// Eg: X + -X -> 0
@@ -555,10 +561,14 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
// add nsw/nuw (xor Y, signmask), signmask --> Y
// The no-wrapping add guarantees that the top bit will be set by the add.
// Therefore, the xor must be clearing the already set sign bit of Y.
- if ((isNSW || isNUW) && match(Op1, m_SignMask()) &&
+ if ((IsNSW || IsNUW) && match(Op1, m_SignMask()) &&
match(Op0, m_Xor(m_Value(Y), m_SignMask())))
return Y;
+ // add nuw %x, -1 -> -1, because %x can only be 0.
+ if (IsNUW && match(Op1, m_AllOnes()))
+ return Op1; // Which is -1.
+
/// i1 add -> xor.
if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
@@ -581,12 +591,12 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
return nullptr;
}
-Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
const SimplifyQuery &Query) {
- return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query, RecursionLimit);
+ return ::SimplifyAddInst(Op0, Op1, IsNSW, IsNUW, Query, RecursionLimit);
}
-/// \brief Compute the base pointer and cumulative constant offsets for V.
+/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
@@ -637,7 +647,7 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
return OffsetIntPtr;
}
-/// \brief Compute the constant difference between two pointer values.
+/// Compute the constant difference between two pointer values.
/// If the difference is not a constant, returns zero.
static Constant *computePointerDifference(const DataLayout &DL, Value *LHS,
Value *RHS) {
@@ -680,14 +690,14 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
if (match(Op0, m_Zero())) {
// 0 - X -> 0 if the sub is NUW.
if (isNUW)
- return Op0;
+ return Constant::getNullValue(Op0->getType());
KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (Known.Zero.isMaxSignedValue()) {
// Op1 is either 0 or the minimum signed value. If the sub is NSW, then
// Op1 must be 0 because negating the minimum signed value is undefined.
if (isNSW)
- return Op0;
+ return Constant::getNullValue(Op0->getType());
// 0 - X -> X if X is 0 or the minimum signed value.
return Op1;
@@ -799,12 +809,9 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
return C;
// X * undef -> 0
- if (match(Op1, m_Undef()))
- return Constant::getNullValue(Op0->getType());
-
// X * 0 -> 0
- if (match(Op1, m_Zero()))
- return Op1;
+ if (match(Op1, m_CombineOr(m_Undef(), m_Zero())))
+ return Constant::getNullValue(Op0->getType());
// X * 1 -> X
if (match(Op1, m_One()))
@@ -868,13 +875,14 @@ static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
if (match(Op1, m_Zero()))
return UndefValue::get(Ty);
- // If any element of a constant divisor vector is zero, the whole op is undef.
+ // If any element of a constant divisor vector is zero or undef, the whole op
+ // is undef.
auto *Op1C = dyn_cast<Constant>(Op1);
if (Op1C && Ty->isVectorTy()) {
unsigned NumElts = Ty->getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = Op1C->getAggregateElement(i);
- if (Elt && Elt->isNullValue())
+ if (Elt && (Elt->isNullValue() || isa<UndefValue>(Elt)))
return UndefValue::get(Ty);
}
}
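
Several of these rewrites lean on the PatternMatch combinators: m_CombineOr above lets the X * undef and X * 0 folds share one match, and the divisor-vector loop now treats undef elements like zero. A minimal sketch of the m_CombineOr idiom (the helper name is illustrative, not part of the patch):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // X * undef -> 0 and X * 0 -> 0, folded with a single matcher.
    static Value *foldMulByZeroOrUndef(Value *Op0, Value *Op1) {
      if (match(Op1, m_CombineOr(m_Undef(), m_Zero())))
        return Constant::getNullValue(Op0->getType());
      return nullptr;
    }
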
@@ -887,7 +895,7 @@ static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
// 0 / X -> 0
// 0 % X -> 0
if (match(Op0, m_Zero()))
- return Op0;
+ return Constant::getNullValue(Op0->getType());
// X / X -> 1
// X % X -> 0
@@ -898,7 +906,10 @@ static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
// X % 1 -> 0
// If this is a boolean op (single-bit element type), we can't have
// division-by-zero or remainder-by-zero, so assume the divisor is 1.
- if (match(Op1, m_One()) || Ty->isIntOrIntVectorTy(1))
+ // Similarly, if we're zero-extending a boolean divisor, then assume it's a 1.
+ Value *X;
+ if (match(Op1, m_One()) || Ty->isIntOrIntVectorTy(1) ||
+ (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
return IsDiv ? Op0 : Constant::getNullValue(Ty);
return nullptr;
@@ -978,18 +989,17 @@ static Value *simplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
bool IsSigned = Opcode == Instruction::SDiv;
// (X * Y) / Y -> X if the multiplication does not overflow.
- Value *X = nullptr, *Y = nullptr;
- if (match(Op0, m_Mul(m_Value(X), m_Value(Y))) && (X == Op1 || Y == Op1)) {
- if (Y != Op1) std::swap(X, Y); // Ensure expression is (X * Y) / Y, Y = Op1
- OverflowingBinaryOperator *Mul = cast<OverflowingBinaryOperator>(Op0);
- // If the Mul knows it does not overflow, then we are good to go.
+ Value *X;
+ if (match(Op0, m_c_Mul(m_Value(X), m_Specific(Op1)))) {
+ auto *Mul = cast<OverflowingBinaryOperator>(Op0);
+ // If the Mul does not overflow, then we are good to go.
if ((IsSigned && Mul->hasNoSignedWrap()) ||
(!IsSigned && Mul->hasNoUnsignedWrap()))
return X;
- // If X has the form X = A / Y then X * Y cannot overflow.
- if (BinaryOperator *Div = dyn_cast<BinaryOperator>(X))
- if (Div->getOpcode() == Opcode && Div->getOperand(1) == Y)
- return X;
+ // If X has the form X = A / Y, then X * Y cannot overflow.
+ if ((IsSigned && match(X, m_SDiv(m_Value(), m_Specific(Op1)))) ||
+ (!IsSigned && match(X, m_UDiv(m_Value(), m_Specific(Op1)))))
+ return X;
}
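[Editor's note: a hypothetical 8-bit example (not part of this patch) showing why the (X * Y) / Y -> X fold needs the no-wrap flags:

#include <cstdint>
#include <cstdio>

int main() {
  // When the multiply wraps, (X * Y) / Y is no longer X, which is why the
  // fold requires nuw/nsw on the multiplication.
  uint8_t X = 32, Y = 16;
  uint8_t Mul = static_cast<uint8_t>(X * Y); // 512 wraps to 0
  std::printf("(X*Y)/Y = %d, X = %d\n", Mul / Y, X); // 0 vs. 32
  // With a non-wrapping multiply the identity holds.
  uint8_t X2 = 3;
  uint8_t Mul2 = static_cast<uint8_t>(X2 * Y); // 48, no wrap
  std::printf("(X*Y)/Y = %d, X = %d\n", Mul2 / Y, X2); // 3 vs. 3
  return 0;
}
]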
// (X rem Y) / Y -> 0
@@ -1041,6 +1051,13 @@ static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
match(Op0, m_URem(m_Value(), m_Specific(Op1)))))
return Op0;
+ // (X << Y) % X -> 0
+ if ((Opcode == Instruction::SRem &&
+ match(Op0, m_NSWShl(m_Specific(Op1), m_Value()))) ||
+ (Opcode == Instruction::URem &&
+ match(Op0, m_NUWShl(m_Specific(Op1), m_Value()))))
+ return Constant::getNullValue(Op0->getType());
+
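[Editor's note: quick C++ sketch of why (X << Y) % X is 0 when the shift does not overflow (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // A non-overflowing X << Y equals X * 2^Y, an exact multiple of X,
  // so the remainder by X is always 0.
  uint32_t X = 13, Y = 5;
  assert(((X << Y) % X) == 0u);
  return 0;
}
]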
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
@@ -1064,6 +1081,10 @@ static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
/// If not, this returns null.
static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
+ // If two operands are negated and no signed overflow, return -1.
+ if (isKnownNegation(Op0, Op1, /*NeedNSW=*/true))
+ return Constant::getAllOnesValue(Op0->getType());
+
return simplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse);
}
@@ -1086,6 +1107,16 @@ Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
/// If not, this returns null.
static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
+ // If the divisor is 0, the result is undefined, so assume the divisor is -1.
+ // srem Op0, (sext i1 X) --> srem Op0, -1 --> 0
+ Value *X;
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return ConstantInt::getNullValue(Op0->getType());
+
+ // If the two operands are negated, return 0.
+ if (isKnownNegation(Op0, Op1))
+ return ConstantInt::getNullValue(Op0->getType());
+
return simplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse);
}
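[Editor's note: a standalone C++ sketch of the two new srem folds (not part of the patch; plain integers stand in for IR values):

#include <cassert>
#include <cstdint>

int main() {
  // Sign-extending an i1 yields 0 or -1. A zero divisor would be UB, so the
  // divisor can be assumed to be -1, and any remainder by -1 is 0.
  // Likewise, X % (0 - X) is always 0 when both sides are defined.
  int32_t X = 1234;
  assert(X % -1 == 0);
  assert(X % (0 - X) == 0);
  return 0;
}
]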
@@ -1140,10 +1171,14 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
// 0 shift by X -> 0
if (match(Op0, m_Zero()))
- return Op0;
+ return Constant::getNullValue(Op0->getType());
// X shift by 0 -> X
- if (match(Op1, m_Zero()))
+ // Shift-by-sign-extended bool must be shift-by-0 because shift-by-all-ones
+ // would be poison.
+ Value *X;
+ if (match(Op1, m_Zero()) ||
+ (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
return Op0;
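[Editor's note: small C++ sketch of the shift-by-sign-extended-bool argument (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  // Sign-extending an i1 shift amount gives 0 or all-ones (0xFF for i8).
  // Shifting by >= the bit width is poison, so the only well-defined case is
  // a shift by 0, which leaves the value unchanged.
  uint8_t X = 0x5A;
  assert((X << 0) == X && (X >> 0) == X);
  return 0;
}
]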
// Fold undefined shifts.
@@ -1177,7 +1212,7 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
return nullptr;
}
-/// \brief Given operands for an Shl, LShr or AShr, see if we can
+/// Given operands for an Shl, LShr or AShr, see if we can
/// fold the result. If not, this returns null.
static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0,
Value *Op1, bool isExact, const SimplifyQuery &Q,
@@ -1220,6 +1255,13 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
Value *X;
if (match(Op0, m_Exact(m_Shr(m_Value(X), m_Specific(Op1)))))
return X;
+
+ // shl nuw i8 C, %x -> C iff C has sign bit set.
+ if (isNUW && match(Op0, m_Negative()))
+ return Op0;
+ // NOTE: could use computeKnownBits() / LazyValueInfo,
+ // but the cost-benefit analysis suggests it isn't worth it.
+
return nullptr;
}
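[Editor's note: a hypothetical 8-bit demonstration (not LLVM code) of why an nuw shl of a constant with the sign bit set must be a shift by 0:

#include <cstdint>
#include <cstdio>

int main() {
  // If C has its top bit set, any nonzero left shift discards that bit, i.e.
  // the unsigned result wraps. An nuw shl therefore implies a shift amount of
  // 0, so the result is simply C.
  uint8_t C = 0x90;
  std::printf("C << 1 = 0x%02x (top bit lost)\n",
              static_cast<unsigned>(static_cast<uint8_t>(C << 1)));
  return 0;
}
]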
@@ -1257,9 +1299,10 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
MaxRecurse))
return V;
- // all ones >>a X -> all ones
+ // all ones >>a X -> -1
+ // Do not return Op0 because it may contain undef elements if it's a vector.
if (match(Op0, m_AllOnes()))
- return Op0;
+ return Constant::getAllOnesValue(Op0->getType());
// (X << A) >> A -> X
Value *X;
@@ -1295,7 +1338,7 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
ICmpInst::isUnsigned(UnsignedPred))
;
else if (match(UnsignedICmp,
- m_ICmp(UnsignedPred, m_Value(Y), m_Specific(X))) &&
+ m_ICmp(UnsignedPred, m_Specific(Y), m_Value(X))) &&
ICmpInst::isUnsigned(UnsignedPred))
UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred);
else
@@ -1413,6 +1456,43 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1,
return nullptr;
}
+static Value *simplifyAndOrOfICmpsWithZero(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ bool IsAnd) {
+ ICmpInst::Predicate P0 = Cmp0->getPredicate(), P1 = Cmp1->getPredicate();
+ if (!match(Cmp0->getOperand(1), m_Zero()) ||
+ !match(Cmp1->getOperand(1), m_Zero()) || P0 != P1)
+ return nullptr;
+
+ if ((IsAnd && P0 != ICmpInst::ICMP_NE) || (!IsAnd && P1 != ICmpInst::ICMP_EQ))
+ return nullptr;
+
+ // We have either "(X == 0 || Y == 0)" or "(X != 0 && Y != 0)".
+ Value *X = Cmp0->getOperand(0);
+ Value *Y = Cmp1->getOperand(0);
+
+ // If one of the compares is a masked version of a (not) null check, then
+ // that compare implies the other, so we eliminate the other. Optionally, look
+ // through a pointer-to-int cast to match a null check of a pointer type.
+
+ // (X == 0) || (([ptrtoint] X & ?) == 0) --> ([ptrtoint] X & ?) == 0
+ // (X == 0) || ((? & [ptrtoint] X) == 0) --> (? & [ptrtoint] X) == 0
+ // (X != 0) && (([ptrtoint] X & ?) != 0) --> ([ptrtoint] X & ?) != 0
+ // (X != 0) && ((? & [ptrtoint] X) != 0) --> (? & [ptrtoint] X) != 0
+ if (match(Y, m_c_And(m_Specific(X), m_Value())) ||
+ match(Y, m_c_And(m_PtrToInt(m_Specific(X)), m_Value())))
+ return Cmp1;
+
+ // (([ptrtoint] Y & ?) == 0) || (Y == 0) --> ([ptrtoint] Y & ?) == 0
+ // ((? & [ptrtoint] Y) == 0) || (Y == 0) --> (? & [ptrtoint] Y) == 0
+ // (([ptrtoint] Y & ?) != 0) && (Y != 0) --> ([ptrtoint] Y & ?) != 0
+ // ((? & [ptrtoint] Y) != 0) && (Y != 0) --> (? & [ptrtoint] Y) != 0
+ if (match(X, m_c_And(m_Specific(Y), m_Value())) ||
+ match(X, m_c_And(m_PtrToInt(m_Specific(Y)), m_Value())))
+ return Cmp0;
+
+ return nullptr;
+}
+
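[Editor's note: the masked null-check implication used above can be checked exhaustively in a few lines of plain C++ (illustrative only, not part of the patch):

#include <cassert>

int main() {
  // If (X & M) != 0 then X != 0, so the masked compare alone decides
  // "(X != 0) && ((X & M) != 0)"; dually, (X == 0) implies ((X & M) == 0)
  // for the "||" form. Exhaustive over 8-bit values:
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned m = 0; m < 256; ++m) {
      assert(((x != 0) && ((x & m) != 0)) == ((x & m) != 0));
      assert(((x == 0) || ((x & m) == 0)) == ((x & m) == 0));
    }
  return 0;
}
]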
static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) {
// (icmp (add V, C0), C1) & (icmp V, C0)
ICmpInst::Predicate Pred0, Pred1;
@@ -1473,6 +1553,9 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true))
return X;
+ if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true))
+ return X;
+
if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1))
return X;
if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0))
@@ -1541,6 +1624,9 @@ static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false))
return X;
+ if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false))
+ return X;
+
if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1))
return X;
if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0))
@@ -1638,7 +1724,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
// X & 0 = 0
if (match(Op1, m_Zero()))
- return Op1;
+ return Constant::getNullValue(Op0->getType());
// X & -1 = X
if (match(Op1, m_AllOnes()))
@@ -1733,21 +1819,16 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
return C;
// X | undef -> -1
- if (match(Op1, m_Undef()))
+ // X | -1 = -1
+ // Do not return Op1 because it may contain undef elements if it's a vector.
+ if (match(Op1, m_Undef()) || match(Op1, m_AllOnes()))
return Constant::getAllOnesValue(Op0->getType());
// X | X = X
- if (Op0 == Op1)
- return Op0;
-
// X | 0 = X
- if (match(Op1, m_Zero()))
+ if (Op0 == Op1 || match(Op1, m_Zero()))
return Op0;
- // X | -1 = -1
- if (match(Op1, m_AllOnes()))
- return Op1;
-
// A | ~A = ~A | A = -1
if (match(Op0, m_Not(m_Specific(Op1))) ||
match(Op1, m_Not(m_Specific(Op0))))
@@ -2051,9 +2132,12 @@ computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI,
ConstantInt *LHSOffsetCI = dyn_cast<ConstantInt>(LHSOffset);
ConstantInt *RHSOffsetCI = dyn_cast<ConstantInt>(RHSOffset);
uint64_t LHSSize, RHSSize;
+ ObjectSizeOpts Opts;
+ Opts.NullIsUnknownSize =
+ NullPointerIsDefined(cast<AllocaInst>(LHS)->getFunction());
if (LHSOffsetCI && RHSOffsetCI &&
- getObjectSize(LHS, LHSSize, DL, TLI) &&
- getObjectSize(RHS, RHSSize, DL, TLI)) {
+ getObjectSize(LHS, LHSSize, DL, TLI, Opts) &&
+ getObjectSize(RHS, RHSSize, DL, TLI, Opts)) {
const APInt &LHSOffsetValue = LHSOffsetCI->getValue();
const APInt &RHSOffsetValue = RHSOffsetCI->getValue();
if (!LHSOffsetValue.isNegative() &&
@@ -2442,6 +2526,20 @@ static void setLimitsForBinOp(BinaryOperator &BO, APInt &Lower, APInt &Upper) {
static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
Value *RHS) {
+ Type *ITy = GetCompareTy(RHS); // The return type.
+
+ Value *X;
+ // Sign-bit checks can be optimized to true/false after unsigned
+ // floating-point casts:
+ // icmp slt (bitcast (uitofp X)), 0 --> false
+ // icmp sgt (bitcast (uitofp X)), -1 --> true
+ if (match(LHS, m_BitCast(m_UIToFP(m_Value(X))))) {
+ if (Pred == ICmpInst::ICMP_SLT && match(RHS, m_Zero()))
+ return ConstantInt::getFalse(ITy);
+ if (Pred == ICmpInst::ICMP_SGT && match(RHS, m_AllOnes()))
+ return ConstantInt::getTrue(ITy);
+ }
+
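[Editor's note: a standalone C++ check of the uitofp/bitcast sign-bit fact (illustrative only; assumes 32-bit IEEE float, which the sampled loop below relies on):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Converting an unsigned integer to float always yields a value with the
  // IEEE sign bit clear, so the raw bits, reinterpreted as a signed integer,
  // are never negative.
  for (uint32_t i = 0; i < 1000000; i += 997) {
    float f = static_cast<float>(i);       // "uitofp"
    int32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));  // "bitcast"
    assert(bits >= 0); // so "icmp slt bits, 0" is false, "icmp sgt bits, -1" is true
  }
  return 0;
}
]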
const APInt *C;
if (!match(RHS, m_APInt(C)))
return nullptr;
@@ -2449,9 +2547,9 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
// Rule out tautological comparisons (eg., ult 0 or uge 0).
ConstantRange RHS_CR = ConstantRange::makeExactICmpRegion(Pred, *C);
if (RHS_CR.isEmptySet())
- return ConstantInt::getFalse(GetCompareTy(RHS));
+ return ConstantInt::getFalse(ITy);
if (RHS_CR.isFullSet())
- return ConstantInt::getTrue(GetCompareTy(RHS));
+ return ConstantInt::getTrue(ITy);
// Find the range of possible values for binary operators.
unsigned Width = C->getBitWidth();
@@ -2469,9 +2567,9 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
if (!LHS_CR.isFullSet()) {
if (RHS_CR.contains(LHS_CR))
- return ConstantInt::getTrue(GetCompareTy(RHS));
+ return ConstantInt::getTrue(ITy);
if (RHS_CR.inverse().contains(LHS_CR))
- return ConstantInt::getFalse(GetCompareTy(RHS));
+ return ConstantInt::getFalse(ITy);
}
return nullptr;
@@ -3008,8 +3106,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
Type *ITy = GetCompareTy(LHS); // The return type.
// icmp X, X -> true/false
- // X icmp undef -> true/false. For example, icmp ugt %X, undef -> false
- // because X could be 0.
+ // icmp X, undef -> true/false because undef could be X.
if (LHS == RHS || isa<UndefValue>(RHS))
return ConstantInt::get(ITy, CmpInst::isTrueWhenEqual(Pred));
@@ -3309,6 +3406,12 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
return getTrue(RetTy);
}
+ // NaN is unordered; NaN is not ordered.
+ assert((FCmpInst::isOrdered(Pred) || FCmpInst::isUnordered(Pred)) &&
+ "Comparison must be either ordered or unordered");
+ if (match(RHS, m_NaN()))
+ return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred));
+
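[Editor's note: the ordered/unordered NaN rule used here, demonstrated with plain C++ doubles (not part of the patch):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  // Any ordered comparison against NaN is false, and its unordered
  // counterpart is the logical negation, hence true.
  double NaN = std::numeric_limits<double>::quiet_NaN();
  double X = 1.0;
  assert(!(X < NaN) && !(X == NaN) && !(X > NaN)); // ordered -> false
  assert(std::isunordered(X, NaN));                // unordered -> true
  return 0;
}
]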
// fcmp pred x, undef and fcmp pred undef, x
// fold to true if unordered, false if ordered
if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) {
@@ -3328,15 +3431,6 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
// Handle fcmp with constant RHS.
const APFloat *C;
if (match(RHS, m_APFloat(C))) {
- // If the constant is a nan, see if we can fold the comparison based on it.
- if (C->isNaN()) {
- if (FCmpInst::isOrdered(Pred)) // True "if ordered and foo"
- return getFalse(RetTy);
- assert(FCmpInst::isUnordered(Pred) &&
- "Comparison must be either ordered or unordered!");
- // True if unordered.
- return getTrue(RetTy);
- }
// Check whether the constant is an infinity.
if (C->isInfinity()) {
if (C->isNegative()) {
@@ -3475,6 +3569,17 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
}
}
+ // Same for GEPs.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (MaxRecurse) {
+ SmallVector<Value *, 8> NewOps(GEP->getNumOperands());
+ transform(GEP->operands(), NewOps.begin(),
+ [&](Value *V) { return V == Op ? RepOp : V; });
+ return SimplifyGEPInst(GEP->getSourceElementType(), NewOps, Q,
+ MaxRecurse - 1);
+ }
+ }
+
// TODO: We could hand off more cases to instsimplify here.
// If all operands are constant after substituting Op for RepOp then we can
@@ -3581,24 +3686,6 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
TrueVal, FalseVal))
return V;
- if (CondVal->hasOneUse()) {
- const APInt *C;
- if (match(CmpRHS, m_APInt(C))) {
- // X < MIN ? T : F --> F
- if (Pred == ICmpInst::ICMP_SLT && C->isMinSignedValue())
- return FalseVal;
- // X < MIN ? T : F --> F
- if (Pred == ICmpInst::ICMP_ULT && C->isMinValue())
- return FalseVal;
- // X > MAX ? T : F --> F
- if (Pred == ICmpInst::ICMP_SGT && C->isMaxSignedValue())
- return FalseVal;
- // X > MAX ? T : F --> F
- if (Pred == ICmpInst::ICMP_UGT && C->isMaxValue())
- return FalseVal;
- }
- }
-
// If we have an equality comparison, then we know the value in one of the
// arms of the select. See if substituting this value into the arm and
// simplifying the result yields the same value as the other arm.
@@ -3631,37 +3718,38 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
/// Given operands for a SelectInst, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
- Value *FalseVal, const SimplifyQuery &Q,
- unsigned MaxRecurse) {
- // select true, X, Y -> X
- // select false, X, Y -> Y
- if (Constant *CB = dyn_cast<Constant>(CondVal)) {
- if (Constant *CT = dyn_cast<Constant>(TrueVal))
- if (Constant *CF = dyn_cast<Constant>(FalseVal))
- return ConstantFoldSelectInstruction(CB, CT, CF);
- if (CB->isAllOnesValue())
+static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (auto *CondC = dyn_cast<Constant>(Cond)) {
+ if (auto *TrueC = dyn_cast<Constant>(TrueVal))
+ if (auto *FalseC = dyn_cast<Constant>(FalseVal))
+ return ConstantFoldSelectInstruction(CondC, TrueC, FalseC);
+
+ // select undef, X, Y -> X or Y
+ if (isa<UndefValue>(CondC))
+ return isa<Constant>(FalseVal) ? FalseVal : TrueVal;
+
+ // TODO: Vector constants with undef elements don't simplify.
+
+ // select true, X, Y -> X
+ if (CondC->isAllOnesValue())
return TrueVal;
- if (CB->isNullValue())
+ // select false, X, Y -> Y
+ if (CondC->isNullValue())
return FalseVal;
}
- // select C, X, X -> X
+ // select ?, X, X -> X
if (TrueVal == FalseVal)
return TrueVal;
- if (isa<UndefValue>(CondVal)) { // select undef, X, Y -> X or Y
- if (isa<Constant>(FalseVal))
- return FalseVal;
- return TrueVal;
- }
- if (isa<UndefValue>(TrueVal)) // select C, undef, X -> X
+ if (isa<UndefValue>(TrueVal)) // select ?, undef, X -> X
return FalseVal;
- if (isa<UndefValue>(FalseVal)) // select C, X, undef -> X
+ if (isa<UndefValue>(FalseVal)) // select ?, X, undef -> X
return TrueVal;
if (Value *V =
- simplifySelectWithICmpCond(CondVal, TrueVal, FalseVal, Q, MaxRecurse))
+ simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse))
return V;
return nullptr;
@@ -3712,7 +3800,7 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
// The following transforms are only safe if the ptrtoint cast
// doesn't truncate the pointers.
if (Ops[1]->getType()->getScalarSizeInBits() ==
- Q.DL.getPointerSizeInBits(AS)) {
+ Q.DL.getIndexSizeInBits(AS)) {
auto PtrToIntOrZero = [GEPTy](Value *P) -> Value * {
if (match(P, m_Zero()))
return Constant::getNullValue(GEPTy);
@@ -3752,10 +3840,10 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
if (Q.DL.getTypeAllocSize(LastType) == 1 &&
all_of(Ops.slice(1).drop_back(1),
[](Value *Idx) { return match(Idx, m_Zero()); })) {
- unsigned PtrWidth =
- Q.DL.getPointerSizeInBits(Ops[0]->getType()->getPointerAddressSpace());
- if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == PtrWidth) {
- APInt BasePtrOffset(PtrWidth, 0);
+ unsigned IdxWidth =
+ Q.DL.getIndexSizeInBits(Ops[0]->getType()->getPointerAddressSpace());
+ if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == IdxWidth) {
+ APInt BasePtrOffset(IdxWidth, 0);
Value *StrippedBasePtr =
Ops[0]->stripAndAccumulateInBoundsConstantOffsets(Q.DL,
BasePtrOffset);
@@ -3946,7 +4034,7 @@ static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) {
// instruction, we cannot return X as the result of the PHI node unless it
// dominates the PHI block.
if (HasUndefInput)
- return ValueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : nullptr;
+ return valueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : nullptr;
return CommonValue;
}
@@ -4123,6 +4211,28 @@ Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit);
}
+static Constant *propagateNaN(Constant *In) {
+ // If the input is a vector with undef elements, just return a default NaN.
+ if (!In->isNaN())
+ return ConstantFP::getNaN(In->getType());
+
+ // Propagate the existing NaN constant when possible.
+ // TODO: Should we quiet a signaling NaN?
+ return In;
+}
+
+static Constant *simplifyFPBinop(Value *Op0, Value *Op1) {
+ if (isa<UndefValue>(Op0) || isa<UndefValue>(Op1))
+ return ConstantFP::getNaN(Op0->getType());
+
+ if (match(Op0, m_NaN()))
+ return propagateNaN(cast<Constant>(Op0));
+ if (match(Op1, m_NaN()))
+ return propagateNaN(cast<Constant>(Op1));
+
+ return nullptr;
+}
+
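[Editor's note: a short C++ sketch of the NaN-propagation rule this helper relies on (illustrative only, not LLVM code):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  // A NaN operand makes the result of any IEEE FP binary operation NaN,
  // so the whole binop can be folded to a NaN constant up front.
  double NaN = std::numeric_limits<double>::quiet_NaN();
  assert(std::isnan(NaN + 2.0));
  assert(std::isnan(3.0 * NaN));
  assert(std::isnan(NaN / 7.0));
  return 0;
}
]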
/// Given operands for an FAdd, see if we can fold the result. If not, this
/// returns null.
static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
@@ -4130,29 +4240,28 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q))
return C;
+ if (Constant *C = simplifyFPBinop(Op0, Op1))
+ return C;
+
// fadd X, -0 ==> X
- if (match(Op1, m_NegZero()))
+ if (match(Op1, m_NegZeroFP()))
return Op0;
// fadd X, 0 ==> X, when we know X is not -0
- if (match(Op1, m_Zero()) &&
+ if (match(Op1, m_PosZeroFP()) &&
(FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
return Op0;
- // fadd [nnan ninf] X, (fsub [nnan ninf] 0, X) ==> 0
- // where nnan and ninf have to occur at least once somewhere in this
- // expression
- Value *SubOp = nullptr;
- if (match(Op1, m_FSub(m_AnyZero(), m_Specific(Op0))))
- SubOp = Op1;
- else if (match(Op0, m_FSub(m_AnyZero(), m_Specific(Op1))))
- SubOp = Op0;
- if (SubOp) {
- Instruction *FSub = cast<Instruction>(SubOp);
- if ((FMF.noNaNs() || FSub->hasNoNaNs()) &&
- (FMF.noInfs() || FSub->hasNoInfs()))
- return Constant::getNullValue(Op0->getType());
- }
+ // With nnan: (+/-0.0 - X) + X --> 0.0 (and commuted variant)
+ // We don't have to explicitly exclude infinities (ninf): INF + -INF == NaN.
+ // Negative zeros are allowed because we always end up with positive zero:
+ // X = -0.0: (-0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0
+ // X = -0.0: ( 0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0
+ // X = 0.0: (-0.0 - ( 0.0)) + ( 0.0) == (-0.0) + ( 0.0) == 0.0
+ // X = 0.0: ( 0.0 - ( 0.0)) + ( 0.0) == ( 0.0) + ( 0.0) == 0.0
+ if (FMF.noNaNs() && (match(Op0, m_FSub(m_AnyZeroFP(), m_Specific(Op1))) ||
+ match(Op1, m_FSub(m_AnyZeroFP(), m_Specific(Op0)))))
+ return ConstantFP::getNullValue(Op0->getType());
return nullptr;
}
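[Editor's note: the signed-zero case table in the comment above can be verified with a tiny C++ program (illustrative only, assumes default round-to-nearest):

#include <cassert>
#include <cmath>

int main() {
  // For the nnan-only fold "(+/-0.0 - X) + X --> 0.0", every signed-zero
  // combination lands on +0.0, so the fold never changes the sign of zero.
  double zeros[] = {0.0, -0.0};
  for (double z : zeros)
    for (double x : zeros) {
      double r = (z - x) + x;
      assert(r == 0.0 && !std::signbit(r)); // always +0.0
    }
  // Infinities are excluded automatically: INF + -INF is NaN, and the fold
  // only fires under nnan.
  return 0;
}
]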
@@ -4164,23 +4273,27 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q))
return C;
- // fsub X, 0 ==> X
- if (match(Op1, m_Zero()))
+ if (Constant *C = simplifyFPBinop(Op0, Op1))
+ return C;
+
+ // fsub X, +0 ==> X
+ if (match(Op1, m_PosZeroFP()))
return Op0;
// fsub X, -0 ==> X, when we know X is not -0
- if (match(Op1, m_NegZero()) &&
+ if (match(Op1, m_NegZeroFP()) &&
(FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
return Op0;
// fsub -0.0, (fsub -0.0, X) ==> X
Value *X;
- if (match(Op0, m_NegZero()) && match(Op1, m_FSub(m_NegZero(), m_Value(X))))
+ if (match(Op0, m_NegZeroFP()) &&
+ match(Op1, m_FSub(m_NegZeroFP(), m_Value(X))))
return X;
// fsub 0.0, (fsub 0.0, X) ==> X if signed zeros are ignored.
- if (FMF.noSignedZeros() && match(Op0, m_AnyZero()) &&
- match(Op1, m_FSub(m_AnyZero(), m_Value(X))))
+ if (FMF.noSignedZeros() && match(Op0, m_AnyZeroFP()) &&
+ match(Op1, m_FSub(m_AnyZeroFP(), m_Value(X))))
return X;
// fsub nnan x, x ==> 0.0
@@ -4196,13 +4309,25 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q))
return C;
+ if (Constant *C = simplifyFPBinop(Op0, Op1))
+ return C;
+
// fmul X, 1.0 ==> X
if (match(Op1, m_FPOne()))
return Op0;
// fmul nnan nsz X, 0 ==> 0
- if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
- return Op1;
+ if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZeroFP()))
+ return ConstantFP::getNullValue(Op0->getType());
+
+ // sqrt(X) * sqrt(X) --> X, if we can:
+ // 1. Remove the intermediate rounding (reassociate).
+ // 2. Ignore non-zero negative numbers because sqrt would produce NAN.
+ // 3. Ignore -0.0 because sqrt(-0.0) == -0.0, but -0.0 * -0.0 == 0.0.
+ Value *X;
+ if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::sqrt>(m_Value(X))) &&
+ FMF.allowReassoc() && FMF.noNaNs() && FMF.noSignedZeros())
+ return X;
return nullptr;
}
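[Editor's note: a C++ sketch of why each fast-math flag is needed for the sqrt(X) * sqrt(X) fold (illustrative only, not LLVM code):

#include <cmath>
#include <cstdio>

int main() {
  // sqrt(x)*sqrt(x) only equals x up to rounding and only for non-negative x;
  // the reassoc/nnan/nsz flags rule out the cases shown below.
  std::printf("%f\n", std::sqrt(-4.0) * std::sqrt(-4.0)); // nan (needs nnan)
  std::printf("%d\n", static_cast<int>(std::signbit(std::sqrt(-0.0) * std::sqrt(-0.0))));
                                                          // 0: result is +0.0 (needs nsz)
  std::printf("%.17g\n", std::sqrt(2.0) * std::sqrt(2.0)); // 2.0000000000000004 (needs reassoc)
  return 0;
}
]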
@@ -4228,13 +4353,8 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q))
return C;
- // undef / X -> undef (the undef could be a snan).
- if (match(Op0, m_Undef()))
- return Op0;
-
- // X / undef -> undef
- if (match(Op1, m_Undef()))
- return Op1;
+ if (Constant *C = simplifyFPBinop(Op0, Op1))
+ return C;
// X / 1.0 -> X
if (match(Op1, m_FPOne()))
@@ -4243,14 +4363,20 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
// 0 / X -> 0
// Requires that NaNs are off (X could be zero) and signed zeroes are
// ignored (X could be positive or negative, so the output sign is unknown).
- if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZero()))
- return Op0;
+ if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZeroFP()))
+ return ConstantFP::getNullValue(Op0->getType());
if (FMF.noNaNs()) {
// X / X -> 1.0 is legal when NaNs are ignored.
+ // We can ignore infinities because INF/INF is NaN.
if (Op0 == Op1)
return ConstantFP::get(Op0->getType(), 1.0);
+ // (X * Y) / Y --> X if we can reassociate to the above form.
+ Value *X;
+ if (FMF.allowReassoc() && match(Op0, m_c_FMul(m_Value(X), m_Specific(Op1))))
+ return X;
+
// -X / X -> -1.0 and
// X / -X -> -1.0 are legal when NaNs are ignored.
// We can ignore signed zeros because +-0.0/+-0.0 is NaN and ignored.
@@ -4274,19 +4400,20 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q))
return C;
- // undef % X -> undef (the undef could be a snan).
- if (match(Op0, m_Undef()))
- return Op0;
-
- // X % undef -> undef
- if (match(Op1, m_Undef()))
- return Op1;
+ if (Constant *C = simplifyFPBinop(Op0, Op1))
+ return C;
- // 0 % X -> 0
- // Requires that NaNs are off (X could be zero) and signed zeroes are
- // ignored (X could be positive or negative, so the output sign is unknown).
- if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZero()))
- return Op0;
+ // Unlike fdiv, the result of frem always matches the sign of the dividend.
+ // The constant match may include undef elements in a vector, so return a full
+ // zero constant as the result.
+ if (FMF.noNaNs()) {
+ // +0 % X -> 0
+ if (match(Op0, m_PosZeroFP()))
+ return ConstantFP::getNullValue(Op0->getType());
+ // -0 % X -> -0
+ if (match(Op0, m_NegZeroFP()))
+ return ConstantFP::getNegativeZero(Op0->getType());
+ }
return nullptr;
}
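[Editor's note: the "frem keeps the dividend's sign" fact, shown with C++ fmod (illustrative only, not part of the patch):

#include <cassert>
#include <cmath>

int main() {
  // fmod keeps the sign of the dividend, so under nnan a +0.0 or -0.0
  // dividend already determines the result.
  assert(!std::signbit(std::fmod(0.0, 3.5)));  // +0 % X -> +0
  assert(std::signbit(std::fmod(-0.0, 3.5)));  // -0 % X -> -0
  return 0;
}
]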
@@ -4515,28 +4642,28 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
}
case Intrinsic::exp: {
// exp(log(x)) -> x
- if (Q.CxtI->isFast() &&
+ if (Q.CxtI->hasAllowReassoc() &&
match(IIOperand, m_Intrinsic<Intrinsic::log>(m_Value(X))))
return X;
return nullptr;
}
case Intrinsic::exp2: {
// exp2(log2(x)) -> x
- if (Q.CxtI->isFast() &&
+ if (Q.CxtI->hasAllowReassoc() &&
match(IIOperand, m_Intrinsic<Intrinsic::log2>(m_Value(X))))
return X;
return nullptr;
}
case Intrinsic::log: {
// log(exp(x)) -> x
- if (Q.CxtI->isFast() &&
+ if (Q.CxtI->hasAllowReassoc() &&
match(IIOperand, m_Intrinsic<Intrinsic::exp>(m_Value(X))))
return X;
return nullptr;
}
case Intrinsic::log2: {
// log2(exp2(x)) -> x
- if (Q.CxtI->isFast() &&
+ if (Q.CxtI->hasAllowReassoc() &&
match(IIOperand, m_Intrinsic<Intrinsic::exp2>(m_Value(X)))) {
return X;
}
@@ -4606,6 +4733,14 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
return LHS;
}
return nullptr;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ // If one argument is NaN, return the other argument.
+ if (match(LHS, m_NaN()))
+ return RHS;
+ if (match(RHS, m_NaN()))
+ return LHS;
+ return nullptr;
default:
return nullptr;
}
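[Editor's note: C's fmax/fmin follow the same "drop the NaN operand" rule as llvm.maxnum/llvm.minnum, so the fold can be illustrated in plain C++ (not part of the patch):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  // When exactly one argument is NaN, fmax/fmin return the other argument,
  // matching the new maxnum/minnum simplification.
  double NaN = std::numeric_limits<double>::quiet_NaN();
  assert(std::fmax(NaN, 2.0) == 2.0);
  assert(std::fmin(7.0, NaN) == 7.0);
  return 0;
}
]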
@@ -4843,7 +4978,7 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
return Result == I ? UndefValue::get(I->getType()) : Result;
}
-/// \brief Implementation of recursive simplification through an instruction's
+/// Implementation of recursive simplification through an instruction's
/// uses.
///
/// This is the common implementation of the recursive simplification routines.
diff --git a/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp b/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp
index 3992657417c5..e7751d32aab3 100644
--- a/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp
+++ b/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp
@@ -21,15 +21,20 @@ template <class NodeTy, bool IsPostDom>
void IDFCalculator<NodeTy, IsPostDom>::calculate(
SmallVectorImpl<BasicBlock *> &PHIBlocks) {
// Use a priority queue keyed on dominator tree level so that inserted nodes
- // are handled from the bottom of the dominator tree upwards.
- typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair;
+ // are handled from the bottom of the dominator tree upwards. We also augment
+ // the level with a DFS number to ensure that the blocks are ordered in a
+ // deterministic way.
+ typedef std::pair<DomTreeNode *, std::pair<unsigned, unsigned>>
+ DomTreeNodePair;
typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>,
less_second> IDFPriorityQueue;
IDFPriorityQueue PQ;
+ DT.updateDFSNumbers();
+
for (BasicBlock *BB : *DefBlocks) {
if (DomTreeNode *Node = DT.getNode(BB))
- PQ.push({Node, Node->getLevel()});
+ PQ.push({Node, std::make_pair(Node->getLevel(), Node->getDFSNumIn())});
}
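[Editor's note: a minimal C++ sketch of how augmenting the priority-queue key with a DFS number makes tie-breaking deterministic (illustrative only; it uses the default pair comparator rather than LLVM's less_second):

#include <cstdio>
#include <queue>
#include <utility>

int main() {
  // Keying on (level, DFS-in number) instead of level alone gives a fixed
  // pop order when several blocks share a dominator-tree level.
  using Key = std::pair<unsigned, unsigned>; // (level, DFS-in number)
  std::priority_queue<Key> PQ;
  PQ.push({2, 7});
  PQ.push({2, 3});
  PQ.push({1, 5});
  while (!PQ.empty()) {
    std::printf("level %u, dfs %u\n", PQ.top().first, PQ.top().second);
    PQ.pop(); // pops (2,7), (2,3), (1,5): ties on level broken by DFS number
  }
  return 0;
}
]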
SmallVector<DomTreeNode *, 32> Worklist;
@@ -40,7 +45,7 @@ void IDFCalculator<NodeTy, IsPostDom>::calculate(
DomTreeNodePair RootPair = PQ.top();
PQ.pop();
DomTreeNode *Root = RootPair.first;
- unsigned RootLevel = RootPair.second;
+ unsigned RootLevel = RootPair.second.first;
// Walk all dominator tree children of Root, inspecting their CFG edges with
// targets elsewhere on the dominator tree. Only targets whose level is at
@@ -77,7 +82,8 @@ void IDFCalculator<NodeTy, IsPostDom>::calculate(
PHIBlocks.emplace_back(SuccBB);
if (!DefBlocks->count(SuccBB))
- PQ.push(std::make_pair(SuccNode, SuccLevel));
+ PQ.push(std::make_pair(
+ SuccNode, std::make_pair(SuccLevel, SuccNode->getDFSNumIn())));
}
for (auto DomChild : *Node) {
diff --git a/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp b/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp
index a8178ecc0a24..93c23bca96af 100644
--- a/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LazyBranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
using namespace llvm;
@@ -41,6 +42,10 @@ void LazyBlockFrequencyInfoPass::print(raw_ostream &OS, const Module *) const {
void LazyBlockFrequencyInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
LazyBranchProbabilityInfoPass::getLazyBPIAnalysisUsage(AU);
+ // We require DT so it's available when LI is available. The LI updating code
+ // asserts that DT is also present so if we don't make sure that we have DT
+ // here, that assert will trigger.
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.setPreservesAll();
}
diff --git a/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp b/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp
index e2884d0a4564..429b78c3a47e 100644
--- a/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/LazyBranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Dominators.h"
using namespace llvm;
@@ -42,6 +43,10 @@ void LazyBranchProbabilityInfoPass::print(raw_ostream &OS,
}
void LazyBranchProbabilityInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We require DT so it's available when LI is available. The LI updating code
+ // asserts that DT is also present so if we don't make sure that we have DT
+ // here, that assert will trigger.
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
diff --git a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp
index 54299d078be5..b1d585bfc683 100644
--- a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp
+++ b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
@@ -65,15 +66,15 @@ static void addEdge(SmallVectorImpl<LazyCallGraph::Edge> &Edges,
if (!EdgeIndexMap.insert({&N, Edges.size()}).second)
return;
- DEBUG(dbgs() << " Added callable function: " << N.getName() << "\n");
+ LLVM_DEBUG(dbgs() << " Added callable function: " << N.getName() << "\n");
Edges.emplace_back(LazyCallGraph::Edge(N, EK));
}
LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() {
assert(!Edges && "Must not have already populated the edges for this node!");
- DEBUG(dbgs() << " Adding functions called by '" << getName()
- << "' to the graph.\n");
+ LLVM_DEBUG(dbgs() << " Adding functions called by '" << getName()
+ << "' to the graph.\n");
Edges = EdgeSequence();
@@ -151,8 +152,8 @@ static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) {
}
LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) {
- DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier()
+ << "\n");
for (Function &F : M) {
if (F.isDeclaration())
continue;
@@ -167,8 +168,8 @@ LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) {
// External linkage defined functions have edges to them from other
// modules.
- DEBUG(dbgs() << " Adding '" << F.getName()
- << "' to entry set of the graph.\n");
+ LLVM_DEBUG(dbgs() << " Adding '" << F.getName()
+ << "' to entry set of the graph.\n");
addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref);
}
@@ -180,8 +181,9 @@ LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) {
if (Visited.insert(GV.getInitializer()).second)
Worklist.push_back(GV.getInitializer());
- DEBUG(dbgs() << " Adding functions referenced by global initializers to the "
- "entry set.\n");
+ LLVM_DEBUG(
+ dbgs() << " Adding functions referenced by global initializers to the "
+ "entry set.\n");
visitReferences(Worklist, Visited, [&](Function &F) {
addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F),
LazyCallGraph::Edge::Ref);
@@ -427,7 +429,7 @@ bool LazyCallGraph::RefSCC::isAncestorOf(const RefSCC &RC) const {
/// source to target.
///
/// This helper routine, in addition to updating the postorder sequence itself
-/// will also update a map from SCCs to indices within that sequecne.
+/// will also update a map from SCCs to indices within that sequence.
///
/// The sequence and the map must operate on pointers to the SCC type.
///
@@ -713,7 +715,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
//
// However, we specially handle the target node. The target node is known to
// reach all other nodes in the original SCC by definition. This means that
- // we want the old SCC to be replaced with an SCC contaning that node as it
+ // we want the old SCC to be replaced with an SCC containing that node as it
// will be the root of whatever SCC DAG results from the DFS. Assumptions
// about an SCC such as the set of functions called will continue to hold,
// etc.
@@ -822,7 +824,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
// Cleared the DFS early, start another round.
break;
- // We've finished processing N and its descendents, put it on our pending
+ // We've finished processing N and its descendants, put it on our pending
// SCC stack to eventually get merged into an SCC of nodes.
PendingSCCStack.push_back(N);
@@ -1234,7 +1236,7 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN,
++I;
}
- // We've finished processing N and its descendents, put it on our pending
+ // We've finished processing N and its descendants, put it on our pending
// stack to eventually get merged into a RefSCC.
PendingRefSCCStack.push_back(N);
@@ -1271,8 +1273,7 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN,
// the removal hasn't changed the structure at all. This is an important
// special case and we can directly exit the entire routine more
// efficiently as soon as we discover it.
- if (std::distance(RefSCCNodes.begin(), RefSCCNodes.end()) ==
- NumRefSCCNodes) {
+ if (llvm::size(RefSCCNodes) == NumRefSCCNodes) {
// Clear out the low link field as we won't need it.
for (Node *N : RefSCCNodes)
N->LowLink = -1;
@@ -1294,7 +1295,7 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN,
// Otherwise we create a collection of new RefSCC nodes and build
// a radix-sort style map from postorder number to these new RefSCCs. We then
- // append SCCs to each of these RefSCCs in the order they occured in the
+ // append SCCs to each of these RefSCCs in the order they occurred in the
// original SCCs container.
for (int i = 0; i < PostOrderNumber; ++i)
Result.push_back(G->createRefSCC(*G));
@@ -1617,7 +1618,7 @@ void LazyCallGraph::buildGenericSCCs(RootsT &&Roots, GetBeginT &&GetBegin,
++I;
}
- // We've finished processing N and its descendents, put it on our pending
+ // We've finished processing N and its descendants, put it on our pending
// SCC stack to eventually get merged into an SCC of nodes.
PendingSCCStack.push_back(N);
@@ -1738,7 +1739,7 @@ static void printNode(raw_ostream &OS, LazyCallGraph::Node &N) {
}
static void printSCC(raw_ostream &OS, LazyCallGraph::SCC &C) {
- ptrdiff_t Size = std::distance(C.begin(), C.end());
+ ptrdiff_t Size = size(C);
OS << " SCC with " << Size << " functions:\n";
for (LazyCallGraph::Node &N : C)
@@ -1746,7 +1747,7 @@ static void printSCC(raw_ostream &OS, LazyCallGraph::SCC &C) {
}
static void printRefSCC(raw_ostream &OS, LazyCallGraph::RefSCC &C) {
- ptrdiff_t Size = std::distance(C.begin(), C.end());
+ ptrdiff_t Size = size(C);
OS << " RefSCC with " << Size << " call SCCs:\n";
for (LazyCallGraph::SCC &InnerC : C)
diff --git a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
index d7da669f6e79..435b6f205199 100644
--- a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -392,8 +392,8 @@ namespace {
if (!BlockValueSet.insert(BV).second)
return false; // It's already in the stack.
- DEBUG(dbgs() << "PUSH: " << *BV.second << " in " << BV.first->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "PUSH: " << *BV.second << " in "
+ << BV.first->getName() << "\n");
BlockValueStack.push_back(BV);
return true;
}
@@ -401,6 +401,7 @@ namespace {
AssumptionCache *AC; ///< A pointer to the cache of @llvm.assume calls.
const DataLayout &DL; ///< A mandatory DataLayout
DominatorTree *DT; ///< An optional DT pointer.
+ DominatorTree *DisabledDT; ///< Stores DT if it's disabled.
ValueLatticeElement getBlockValue(Value *Val, BasicBlock *BB);
bool getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T,
@@ -463,13 +464,30 @@ namespace {
TheCache.eraseBlock(BB);
}
+ /// Disables use of the DominatorTree within LVI.
+ void disableDT() {
+ if (DT) {
+ assert(!DisabledDT && "Both DT and DisabledDT are not nullptr!");
+ std::swap(DT, DisabledDT);
+ }
+ }
+
+ /// Enables use of the DominatorTree within LVI. Does nothing if the class
+ /// instance was initialized without a DT pointer.
+ void enableDT() {
+ if (DisabledDT) {
+ assert(!DT && "Both DT and DisabledDT are not nullptr!");
+ std::swap(DT, DisabledDT);
+ }
+ }
+
/// This is the update interface to inform the cache that an edge from
/// PredBB to OldSucc has been threaded to be from PredBB to NewSucc.
void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc);
LazyValueInfoImpl(AssumptionCache *AC, const DataLayout &DL,
DominatorTree *DT = nullptr)
- : AC(AC), DL(DL), DT(DT) {}
+ : AC(AC), DL(DL), DT(DT), DisabledDT(nullptr) {}
};
} // end anonymous namespace
@@ -490,7 +508,8 @@ void LazyValueInfoImpl::solve() {
// PredicateInfo is used in LVI or CVP, we should be able to make the
// overdefined cache global, and remove this throttle.
if (processedCount > MaxProcessedPerValue) {
- DEBUG(dbgs() << "Giving up on stack because we are getting too deep\n");
+ LLVM_DEBUG(
+ dbgs() << "Giving up on stack because we are getting too deep\n");
// Fill in the original values
while (!StartingStack.empty()) {
std::pair<BasicBlock *, Value *> &e = StartingStack.back();
@@ -511,8 +530,9 @@ void LazyValueInfoImpl::solve() {
assert(TheCache.hasCachedValueInfo(e.second, e.first) &&
"Result should be in cache!");
- DEBUG(dbgs() << "POP " << *e.second << " in " << e.first->getName()
- << " = " << TheCache.getCachedValueInfo(e.second, e.first) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "POP " << *e.second << " in " << e.first->getName() << " = "
+ << TheCache.getCachedValueInfo(e.second, e.first) << "\n");
BlockValueStack.pop_back();
BlockValueSet.erase(e);
@@ -563,8 +583,8 @@ bool LazyValueInfoImpl::solveBlockValue(Value *Val, BasicBlock *BB) {
if (TheCache.hasCachedValueInfo(Val, BB)) {
// If we have a cached value, use that.
- DEBUG(dbgs() << " reuse BB '" << BB->getName()
- << "' val=" << TheCache.getCachedValueInfo(Val, BB) << '\n');
+ LLVM_DEBUG(dbgs() << " reuse BB '" << BB->getName() << "' val="
+ << TheCache.getCachedValueInfo(Val, BB) << '\n');
// Since we're reusing a cached value, we don't need to update the
// OverDefinedCache. The cache will have been properly updated whenever the
@@ -619,8 +639,8 @@ bool LazyValueInfoImpl::solveBlockValueImpl(ValueLatticeElement &Res,
return solveBlockValueBinaryOp(Res, BO, BB);
}
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - unknown inst def found.\n");
+ LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - unknown inst def found.\n");
Res = getFromRangeMetadata(BBI);
return true;
}
@@ -684,9 +704,11 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(ValueLatticeElement &BBLV,
assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
// Before giving up, see if we can prove the pointer non-null local to
// this particular block.
- if (Val->getType()->isPointerTy() &&
- (isKnownNonZero(Val, DL) || isObjectDereferencedInBlock(Val, BB))) {
- PointerType *PTy = cast<PointerType>(Val->getType());
+ PointerType *PTy = dyn_cast<PointerType>(Val->getType());
+ if (PTy &&
+ (isKnownNonZero(Val, DL) ||
+ (isObjectDereferencedInBlock(Val, BB) &&
+ !NullPointerIsDefined(BB->getParent(), PTy->getAddressSpace())))) {
Result = ValueLatticeElement::getNot(ConstantPointerNull::get(PTy));
} else {
Result = ValueLatticeElement::getOverdefined();
@@ -715,13 +737,13 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(ValueLatticeElement &BBLV,
// If we hit overdefined, exit early. The BlockVals entry is already set
// to overdefined.
if (Result.isOverdefined()) {
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined because of pred (non local).\n");
+ LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - overdefined because of pred (non local).\n");
// Before giving up, see if we can prove the pointer non-null local to
// this particular block.
- if (Val->getType()->isPointerTy() &&
- isObjectDereferencedInBlock(Val, BB)) {
- PointerType *PTy = cast<PointerType>(Val->getType());
+ PointerType *PTy = dyn_cast<PointerType>(Val->getType());
+ if (PTy && isObjectDereferencedInBlock(Val, BB) &&
+ !NullPointerIsDefined(BB->getParent(), PTy->getAddressSpace())) {
Result = ValueLatticeElement::getNot(ConstantPointerNull::get(PTy));
}
@@ -759,8 +781,8 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(ValueLatticeElement &BBLV,
// If we hit overdefined, exit early. The BlockVals entry is already set
// to overdefined.
if (Result.isOverdefined()) {
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined because of pred (local).\n");
+ LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - overdefined because of pred (local).\n");
BBLV = Result;
return true;
@@ -950,8 +972,8 @@ bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV,
break;
default:
// Unhandled instructions are overdefined.
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined (unknown cast).\n");
+ LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - overdefined (unknown cast).\n");
BBLV = ValueLatticeElement::getOverdefined();
return true;
}
@@ -1009,8 +1031,8 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOp(ValueLatticeElement &BBLV,
break;
default:
// Unhandled instructions are overdefined.
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined (unknown binary operator).\n");
+ LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - overdefined (unknown binary operator).\n");
BBLV = ValueLatticeElement::getOverdefined();
return true;
};
@@ -1127,9 +1149,17 @@ getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest,
(!isTrueDest && BO->getOpcode() != BinaryOperator::Or))
return ValueLatticeElement::getOverdefined();
- auto RHS = getValueFromCondition(Val, BO->getOperand(0), isTrueDest, Visited);
- auto LHS = getValueFromCondition(Val, BO->getOperand(1), isTrueDest, Visited);
- return intersect(RHS, LHS);
+ // Prevent infinite recursion if Cond references itself as in this example:
+ // Cond: "%tmp4 = and i1 %tmp4, undef"
+ // BL: "%tmp4 = and i1 %tmp4, undef"
+ // BR: "i1 undef"
+ Value *BL = BO->getOperand(0);
+ Value *BR = BO->getOperand(1);
+ if (BL == Cond || BR == Cond)
+ return ValueLatticeElement::getOverdefined();
+
+ return intersect(getValueFromCondition(Val, BL, isTrueDest, Visited),
+ getValueFromCondition(Val, BR, isTrueDest, Visited));
}
static ValueLatticeElement
@@ -1196,7 +1226,7 @@ static ValueLatticeElement constantFoldUser(User *Usr, Value *Op,
return ValueLatticeElement::getOverdefined();
}
-/// \brief Compute the value of Val on the edge BBFrom -> BBTo. Returns false if
+/// Compute the value of Val on the edge BBFrom -> BBTo. Returns false if
/// Val is not constrained on the edge. Result is unspecified if return value
/// is false.
static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
@@ -1321,7 +1351,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
return false;
}
-/// \brief Compute the value of Val on the edge BBFrom -> BBTo or the value at
+/// Compute the value of Val on the edge BBFrom -> BBTo or the value at
/// the basic block if the edge does not constrain Val.
bool LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
BasicBlock *BBTo,
@@ -1373,8 +1403,8 @@ bool LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB,
Instruction *CxtI) {
- DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '"
- << BB->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '"
+ << BB->getName() << "'\n");
assert(BlockValueStack.empty() && BlockValueSet.empty());
if (!hasBlockValue(V, BB)) {
@@ -1384,13 +1414,13 @@ ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB,
ValueLatticeElement Result = getBlockValue(V, BB);
intersectAssumeOrGuardBlockValueConstantRange(V, Result, CxtI);
- DEBUG(dbgs() << " Result = " << Result << "\n");
+ LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
}
ValueLatticeElement LazyValueInfoImpl::getValueAt(Value *V, Instruction *CxtI) {
- DEBUG(dbgs() << "LVI Getting value " << *V << " at '"
- << CxtI->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "LVI Getting value " << *V << " at '" << CxtI->getName()
+ << "'\n");
if (auto *C = dyn_cast<Constant>(V))
return ValueLatticeElement::get(C);
@@ -1400,15 +1430,16 @@ ValueLatticeElement LazyValueInfoImpl::getValueAt(Value *V, Instruction *CxtI) {
Result = getFromRangeMetadata(I);
intersectAssumeOrGuardBlockValueConstantRange(V, Result, CxtI);
- DEBUG(dbgs() << " Result = " << Result << "\n");
+ LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
}
ValueLatticeElement LazyValueInfoImpl::
getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB,
Instruction *CxtI) {
- DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '"
- << FromBB->getName() << "' to '" << ToBB->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '"
+ << FromBB->getName() << "' to '" << ToBB->getName()
+ << "'\n");
ValueLatticeElement Result;
if (!getEdgeValue(V, FromBB, ToBB, Result, CxtI)) {
@@ -1418,7 +1449,7 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB,
assert(WasFastQuery && "More work to do after problem solved?");
}
- DEBUG(dbgs() << " Result = " << Result << "\n");
+ LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
}
@@ -1791,6 +1822,16 @@ void LazyValueInfo::printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS)
}
}
+void LazyValueInfo::disableDT() {
+ if (PImpl)
+ getImpl(PImpl, AC, DL, DT).disableDT();
+}
+
+void LazyValueInfo::enableDT() {
+ if (PImpl)
+ getImpl(PImpl, AC, DL, DT).enableDT();
+}
+
// Print the LVI for the function arguments at the start of each basic block.
void LazyValueInfoAnnotatedWriter::emitBasicBlockStartAnnot(
const BasicBlock *BB, formatted_raw_ostream &OS) {
@@ -1807,7 +1848,7 @@ void LazyValueInfoAnnotatedWriter::emitBasicBlockStartAnnot(
// This function prints the LVI analysis for the instruction I at the beginning
// of various basic blocks. It relies on calculated values that are stored in
-// the LazyValueInfoCache, and in the absence of cached values, recalculte the
+// the LazyValueInfoCache, and in the absence of cached values, recalculate the
// LazyValueInfo for `I`, and print that info.
void LazyValueInfoAnnotatedWriter::emitInstructionAnnot(
const Instruction *I, formatted_raw_ostream &OS) {
@@ -1830,7 +1871,7 @@ void LazyValueInfoAnnotatedWriter::emitInstructionAnnot(
};
printResult(ParentBB);
- // Print the LVI analysis results for the the immediate successor blocks, that
+ // Print the LVI analysis results for the immediate successor blocks, that
// are dominated by `ParentBB`.
for (auto *BBSucc : successors(ParentBB))
if (DT.dominates(ParentBB, BBSucc))
diff --git a/contrib/llvm/lib/Analysis/Lint.cpp b/contrib/llvm/lib/Analysis/Lint.cpp
index 0e3f498cb14c..db919bd233bf 100644
--- a/contrib/llvm/lib/Analysis/Lint.cpp
+++ b/contrib/llvm/lib/Analysis/Lint.cpp
@@ -165,13 +165,13 @@ namespace {
}
}
- /// \brief A check failed, so printout out the condition and the message.
+ /// A check failed, so printout out the condition and the message.
///
/// This provides a nice place to put a breakpoint if you want to see why
/// something is not correct.
void CheckFailed(const Twine &Message) { MessagesStr << Message << '\n'; }
- /// \brief A check failed (with values to print).
+ /// A check failed (with values to print).
///
/// This calls the Message-only version so that the above is easier to set
/// a breakpoint on.
@@ -323,9 +323,9 @@ void Lint::visitCallSite(CallSite CS) {
MemCpyInst *MCI = cast<MemCpyInst>(&I);
// TODO: If the size is known, use it.
visitMemoryReference(I, MCI->getDest(), MemoryLocation::UnknownSize,
- MCI->getAlignment(), nullptr, MemRef::Write);
+ MCI->getDestAlignment(), nullptr, MemRef::Write);
visitMemoryReference(I, MCI->getSource(), MemoryLocation::UnknownSize,
- MCI->getAlignment(), nullptr, MemRef::Read);
+ MCI->getSourceAlignment(), nullptr, MemRef::Read);
// Check that the memcpy arguments don't overlap. The AliasAnalysis API
// isn't expressive enough for what we really want to do. Known partial
@@ -345,16 +345,16 @@ void Lint::visitCallSite(CallSite CS) {
MemMoveInst *MMI = cast<MemMoveInst>(&I);
// TODO: If the size is known, use it.
visitMemoryReference(I, MMI->getDest(), MemoryLocation::UnknownSize,
- MMI->getAlignment(), nullptr, MemRef::Write);
+ MMI->getDestAlignment(), nullptr, MemRef::Write);
visitMemoryReference(I, MMI->getSource(), MemoryLocation::UnknownSize,
- MMI->getAlignment(), nullptr, MemRef::Read);
+ MMI->getSourceAlignment(), nullptr, MemRef::Read);
break;
}
case Intrinsic::memset: {
MemSetInst *MSI = cast<MemSetInst>(&I);
// TODO: If the size is known, use it.
visitMemoryReference(I, MSI->getDest(), MemoryLocation::UnknownSize,
- MSI->getAlignment(), nullptr, MemRef::Write);
+ MSI->getDestAlignment(), nullptr, MemRef::Write);
break;
}
diff --git a/contrib/llvm/lib/Analysis/Loads.cpp b/contrib/llvm/lib/Analysis/Loads.cpp
index 834727c9224d..d319d4c249d3 100644
--- a/contrib/llvm/lib/Analysis/Loads.cpp
+++ b/contrib/llvm/lib/Analysis/Loads.cpp
@@ -80,7 +80,7 @@ static bool isDereferenceableAndAlignedPointer(
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
const Value *Base = GEP->getPointerOperand();
- APInt Offset(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+ APInt Offset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
if (!GEP->accumulateConstantOffset(DL, Offset) || Offset.isNegative() ||
!Offset.urem(APInt(Offset.getBitWidth(), Align)).isMinValue())
return false;
@@ -108,8 +108,8 @@ static bool isDereferenceableAndAlignedPointer(
DL, CtxI, DT, Visited);
if (auto CS = ImmutableCallSite(V))
- if (const Value *RV = CS.getReturnedArgOperand())
- return isDereferenceableAndAlignedPointer(RV, Align, Size, DL, CtxI, DT,
+ if (auto *RP = getArgumentAliasingToReturnedPointer(CS))
+ return isDereferenceableAndAlignedPointer(RP, Align, Size, DL, CtxI, DT,
Visited);
// If we don't know, assume the worst.
@@ -146,7 +146,7 @@ bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align,
SmallPtrSet<const Value *, 32> Visited;
return ::isDereferenceableAndAlignedPointer(
- V, Align, APInt(DL.getTypeSizeInBits(VTy), DL.getTypeStoreSize(Ty)), DL,
+ V, Align, APInt(DL.getIndexTypeSizeInBits(VTy), DL.getTypeStoreSize(Ty)), DL,
CtxI, DT, Visited);
}
@@ -156,7 +156,7 @@ bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL,
return isDereferenceableAndAlignedPointer(V, 1, DL, CtxI, DT);
}
-/// \brief Test if A and B will obviously have the same value.
+/// Test if A and B will obviously have the same value.
///
/// This includes recognizing that %t0 and %t1 will have the same
/// value in code like this:
@@ -187,7 +187,7 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
return false;
}
-/// \brief Check if executing a load of this pointer value cannot trap.
+/// Check if executing a load of this pointer value cannot trap.
///
/// If DT and ScanFrom are specified this method performs context-sensitive
/// analysis and returns true if it is safe to load immediately before ScanFrom.
diff --git a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index e141d6c58b65..c6175bf9bee9 100644
--- a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -92,7 +92,7 @@ static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
-/// \brief The maximum iterations used to merge memory checks
+/// The maximum iterations used to merge memory checks
static cl::opt<unsigned> MemoryCheckMergeThreshold(
"memory-check-merge-threshold", cl::Hidden,
cl::desc("Maximum number of comparisons done when trying to merge "
@@ -102,7 +102,7 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold(
/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;
-/// \brief We collect dependences up to this threshold.
+/// We collect dependences up to this threshold.
static cl::opt<unsigned>
MaxDependences("max-dependences", cl::Hidden,
cl::desc("Maximum number of dependences collected by "
@@ -124,7 +124,7 @@ static cl::opt<bool> EnableMemAccessVersioning(
"enable-mem-access-versioning", cl::init(true), cl::Hidden,
cl::desc("Enable symbolic stride memory access versioning"));
-/// \brief Enable store-to-load forwarding conflict detection. This option can
+/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
"store-to-load-forwarding-conflict-detection", cl::Hidden,
@@ -165,8 +165,8 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
PSE.addPredicate(*SE->getEqualPredicate(U, CT));
auto *Expr = PSE.getSCEV(Ptr);
- DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *Expr
- << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
+ << " by: " << *Expr << "\n");
return Expr;
}
@@ -490,23 +490,23 @@ void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
namespace {
-/// \brief Analyses memory accesses in a loop.
+/// Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
- /// \brief Read or write access location.
+ /// Read or write access location.
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
- AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI,
- MemoryDepChecker::DepCandidates &DA,
+ AccessAnalysis(const DataLayout &Dl, Loop *TheLoop, AliasAnalysis *AA,
+ LoopInfo *LI, MemoryDepChecker::DepCandidates &DA,
PredicatedScalarEvolution &PSE)
- : DL(Dl), AST(*AA), LI(LI), DepCands(DA), IsRTCheckAnalysisNeeded(false),
- PSE(PSE) {}
+ : DL(Dl), TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA),
+ IsRTCheckAnalysisNeeded(false), PSE(PSE) {}
- /// \brief Register a load and whether it is only read from.
+ /// Register a load and whether it is only read from.
void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
@@ -515,14 +515,14 @@ public:
ReadOnlyPtr.insert(Ptr);
}
- /// \brief Register a store.
+ /// Register a store.
void addStore(MemoryLocation &Loc) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
Accesses.insert(MemAccessInfo(Ptr, true));
}
- /// \brief Check if we can emit a run-time no-alias check for \p Access.
+ /// Check if we can emit a run-time no-alias check for \p Access.
///
/// Returns true if we can emit a run-time no alias check for \p Access.
/// If we can check this access, this also adds it to a dependence set and
@@ -537,7 +537,7 @@ public:
unsigned ASId, bool ShouldCheckStride,
bool Assume);
- /// \brief Check whether we can check the pointers at runtime for
+ /// Check whether we can check the pointers at runtime for
/// non-intersection.
///
/// Returns true if we need no check or if we do and we can generate them
@@ -546,13 +546,13 @@ public:
Loop *TheLoop, const ValueToValueMap &Strides,
bool ShouldCheckWrap = false);
- /// \brief Goes over all memory accesses, checks whether a RT check is needed
+ /// Goes over all memory accesses, checks whether a RT check is needed
/// and builds sets of dependent accesses.
void buildDependenceSets() {
processMemAccesses();
}
- /// \brief Initial processing of memory accesses determined that we need to
+ /// Initial processing of memory accesses determined that we need to
/// perform dependency checking.
///
/// Note that this can later be cleared if we retry memcheck analysis without
@@ -570,7 +570,7 @@ public:
private:
typedef SetVector<MemAccessInfo> PtrAccessSet;
- /// \brief Go over all memory access and check whether runtime pointer checks
+ /// Go over all memory access and check whether runtime pointer checks
/// are needed and build sets of dependency check candidates.
void processMemAccesses();
@@ -579,6 +579,9 @@ private:
const DataLayout &DL;
+ /// The loop being checked.
+ const Loop *TheLoop;
+
/// List of accesses that need a further dependence check.
MemAccessInfoList CheckDeps;
@@ -596,7 +599,7 @@ private:
/// dependence check.
MemoryDepChecker::DepCandidates &DepCands;
- /// \brief Initial processing of memory accesses determined that we may need
+ /// Initial processing of memory accesses determined that we may need
/// to add memchecks. Perform the analysis to determine the necessary checks.
///
/// Note that, this is different from isDependencyCheckNeeded. When we retry
@@ -611,7 +614,7 @@ private:
} // end anonymous namespace
-/// \brief Check whether a pointer can participate in a runtime bounds check.
+/// Check whether a pointer can participate in a runtime bounds check.
/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
/// by adding run-time checks (overflow checks) if necessary.
static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
@@ -634,7 +637,7 @@ static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
return AR->isAffine();
}
-/// \brief Check whether a pointer address cannot wrap.
+/// Check whether a pointer address cannot wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE,
const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
const SCEV *PtrScev = PSE.getSCEV(Ptr);
@@ -684,7 +687,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
bool IsWrite = Access.getInt();
RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
- DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
+ LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
return true;
}
@@ -729,7 +732,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop,
RunningDepId, ASId, ShouldCheckWrap, false)) {
- DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n');
+ LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n');
Retries.push_back(Access);
CanDoAliasSetRT = false;
}
@@ -791,8 +794,9 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
unsigned ASi = PtrI->getType()->getPointerAddressSpace();
unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
if (ASi != ASj) {
- DEBUG(dbgs() << "LAA: Runtime check would require comparison between"
- " different address spaces\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: Runtime check would require comparison between"
+ " different address spaces\n");
return false;
}
}
@@ -801,8 +805,8 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
if (NeedRTCheck && CanDoRT)
RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
- DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
- << " pointer comparisons.\n");
+ LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
+ << " pointer comparisons.\n");
RtCheck.Need = NeedRTCheck;
@@ -817,10 +821,10 @@ void AccessAnalysis::processMemAccesses() {
// process read-only pointers. This allows us to skip dependence tests for
// read-only pointers.
- DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
- DEBUG(dbgs() << " AST: "; AST.dump());
- DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
- DEBUG({
+ LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
+ LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
+ LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
+ LLVM_DEBUG({
for (auto A : Accesses)
dbgs() << "\t" << *A.getPointer() << " (" <<
(A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
@@ -904,11 +908,15 @@ void AccessAnalysis::processMemAccesses() {
ValueVector TempObjects;
GetUnderlyingObjects(Ptr, TempObjects, DL, LI);
- DEBUG(dbgs() << "Underlying objects for pointer " << *Ptr << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Underlying objects for pointer " << *Ptr << "\n");
for (Value *UnderlyingObj : TempObjects) {
// Null pointers never alias anything; don't join sets for pointers that
// have "null" in their UnderlyingObjects list.
- if (isa<ConstantPointerNull>(UnderlyingObj))
+ if (isa<ConstantPointerNull>(UnderlyingObj) &&
+ !NullPointerIsDefined(
+ TheLoop->getHeader()->getParent(),
+ UnderlyingObj->getType()->getPointerAddressSpace()))
continue;
UnderlyingObjToAccessMap::iterator Prev =
@@ -917,7 +925,7 @@ void AccessAnalysis::processMemAccesses() {
DepCands.unionSets(Access, Prev->second);
ObjToLastAccess[UnderlyingObj] = Access;
- DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
+ LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
}
}
}
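This hunk, like the two stride-related hunks further down in this file, replaces a hard-coded address-space-0 assumption with a query of NullPointerIsDefined on the loop's parent function. A rough standalone sketch of the rule being applied (the real predicate also consults function attributes; everything below is a stand-in, not LLVM code):

    // A "null" underlying object may only be ignored when null is not a
    // defined address in that address space, i.e. it cannot name a real object.
    bool nullIsDefinedIn(unsigned AddrSpace) {
      // Assumption for this sketch: only address space 0 treats a null
      // dereference as undefined behavior; targets may differ.
      return AddrSpace != 0;
    }

    bool canSkipNullUnderlyingObject(unsigned AddrSpace) {
      return !nullIsDefinedIn(AddrSpace);
    }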
@@ -931,7 +939,7 @@ static bool isInBoundsGep(Value *Ptr) {
return false;
}
-/// \brief Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
+/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
/// i.e. monotonically increasing/decreasing.
static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
PredicatedScalarEvolution &PSE, const Loop *L) {
@@ -979,7 +987,7 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
return false;
}
-/// \brief Check whether the access through \p Ptr has a constant stride.
+/// Check whether the access through \p Ptr has a constant stride.
int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
const Loop *Lp, const ValueToValueMap &StridesMap,
bool Assume, bool ShouldCheckWrap) {
@@ -989,8 +997,8 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
// Make sure that the pointer does not point to aggregate types.
auto *PtrTy = cast<PointerType>(Ty);
if (PtrTy->getElementType()->isAggregateType()) {
- DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type" << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
+ << *Ptr << "\n");
return 0;
}
@@ -1001,15 +1009,15 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
AR = PSE.getAsAddRec(Ptr);
if (!AR) {
- DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
- << " SCEV: " << *PtrScev << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
+ << " SCEV: " << *PtrScev << "\n");
return 0;
}
// The access function must stride over the innermost loop.
if (Lp != AR->getLoop()) {
- DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " <<
- *Ptr << " SCEV: " << *AR << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
+ << *Ptr << " SCEV: " << *AR << "\n");
return 0;
}
@@ -1024,18 +1032,20 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
bool IsNoWrapAddRec = !ShouldCheckWrap ||
PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
isNoWrapAddRec(Ptr, AR, PSE, Lp);
- bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
- if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
+ if (!IsNoWrapAddRec && !IsInBoundsGEP &&
+ NullPointerIsDefined(Lp->getHeader()->getParent(),
+ PtrTy->getAddressSpace())) {
if (Assume) {
PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
IsNoWrapAddRec = true;
- DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
- << "LAA: Pointer: " << *Ptr << "\n"
- << "LAA: SCEV: " << *AR << "\n"
- << "LAA: Added an overflow assumption\n");
+ LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
+ << "LAA: Pointer: " << *Ptr << "\n"
+ << "LAA: SCEV: " << *AR << "\n"
+ << "LAA: Added an overflow assumption\n");
} else {
- DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
- << *Ptr << " SCEV: " << *AR << "\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
+ << *Ptr << " SCEV: " << *AR << "\n");
return 0;
}
}
@@ -1046,8 +1056,8 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
// Calculate the pointer stride and check if it is constant.
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C) {
- DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr <<
- " SCEV: " << *AR << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
+ << " SCEV: " << *AR << "\n");
return 0;
}
@@ -1070,15 +1080,16 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
// If the SCEV could wrap but we have an inbounds gep with a unit stride we
// know we can't "wrap around the address space". In case of address space
// zero we know that this won't happen without triggering undefined behavior.
- if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
- Stride != 1 && Stride != -1) {
+ if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
+ (IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
+ PtrTy->getAddressSpace()))) {
if (Assume) {
// We can avoid this case by adding a run-time check.
- DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
- << "inbouds or in address space 0 may wrap:\n"
- << "LAA: Pointer: " << *Ptr << "\n"
- << "LAA: SCEV: " << *AR << "\n"
- << "LAA: Added an overflow assumption\n");
+ LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
+                        << "inbounds or in address space 0 may wrap:\n"
+ << "LAA: Pointer: " << *Ptr << "\n"
+ << "LAA: SCEV: " << *AR << "\n"
+ << "LAA: Added an overflow assumption\n");
PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
} else
return 0;
@@ -1087,14 +1098,65 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
return Stride;
}
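getPtrStride ends by dividing the constant SCEV step (in bytes) by the element store size; a non-zero return is the stride in elements. A worked sketch of that final step, under the assumptions already established above (constant step, scalar element type; illustrative only):

    #include <cstdint>

    // E.g. a step of 8 bytes over 4-byte elements yields a stride of 2;
    // a step that does not divide evenly yields 0 (reported as a bad stride).
    int64_t elementStride(int64_t StepBytes, int64_t ElemSizeBytes) {
      if (ElemSizeBytes == 0 || StepBytes % ElemSizeBytes != 0)
        return 0;
      return StepBytes / ElemSizeBytes;
    }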
-/// Take the pointer operand from the Load/Store instruction.
-/// Returns NULL if this is not a valid Load/Store instruction.
-static Value *getPointerOperand(Value *I) {
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (auto *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
+bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+ ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
+ assert(llvm::all_of(
+ VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+ "Expected list of pointer operands.");
+ SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
+ OffValPairs.reserve(VL.size());
+
+ // Walk over the pointers, and map each of them to an offset relative to
+  // the first pointer in the array.
+ Value *Ptr0 = VL[0];
+ const SCEV *Scev0 = SE.getSCEV(Ptr0);
+ Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
+
+ llvm::SmallSet<int64_t, 4> Offsets;
+ for (auto *Ptr : VL) {
+    // TODO: Outline this code as a special, more time-consuming version of
+ // computeConstantDifference() function.
+ if (Ptr->getType()->getPointerAddressSpace() !=
+ Ptr0->getType()->getPointerAddressSpace())
+ return false;
+ // If a pointer refers to a different underlying object, bail - the
+ // pointers are by definition incomparable.
+ Value *CurrObj = GetUnderlyingObject(Ptr, DL);
+ if (CurrObj != Obj0)
+ return false;
+
+ const SCEV *Scev = SE.getSCEV(Ptr);
+ const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
+ // The pointers may not have a constant offset from each other, or SCEV
+ // may just not be smart enough to figure out they do. Regardless,
+ // there's nothing we can do.
+ if (!Diff)
+ return false;
+
+    // Bail out if another pointer has already been seen at the same offset.
+ int64_t Offset = Diff->getAPInt().getSExtValue();
+ if (!Offsets.insert(Offset).second)
+ return false;
+ OffValPairs.emplace_back(Offset, Ptr);
+ }
+ SortedIndices.clear();
+ SortedIndices.resize(VL.size());
+ std::iota(SortedIndices.begin(), SortedIndices.end(), 0);
+
+  // Stable-sort the access indices by offset into SortedIndices.
+ std::stable_sort(SortedIndices.begin(), SortedIndices.end(),
+ [&OffValPairs](unsigned Left, unsigned Right) {
+ return OffValPairs[Left].first < OffValPairs[Right].first;
+ });
+
+ // Check if the order is consecutive already.
+ if (llvm::all_of(SortedIndices, [&SortedIndices](const unsigned I) {
+ return I == SortedIndices[I];
+ }))
+ SortedIndices.clear();
+
+ return true;
}
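The new llvm::sortPtrAccesses above reduces each pointer to a constant byte offset from the first one and then orders the accesses by that offset. A minimal standalone sketch of the index-sorting step, with plain integers standing in for the SCEV-derived offsets (illustrative only, not the LLVM implementation):

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Sort access indices by byte offset; return an empty vector when the
    // order is already consecutive, mirroring how SortedIndices is cleared.
    std::vector<unsigned> sortByOffset(const std::vector<int64_t> &Offsets) {
      std::vector<unsigned> Idx(Offsets.size());
      std::iota(Idx.begin(), Idx.end(), 0u);
      std::stable_sort(Idx.begin(), Idx.end(), [&](unsigned L, unsigned R) {
        return Offsets[L] < Offsets[R];
      });
      bool AlreadySorted = true;
      for (unsigned I = 0; I < Idx.size(); ++I)
        AlreadySorted &= (Idx[I] == I);
      if (AlreadySorted)
        Idx.clear();
      return Idx;
    }

For example, offsets {8, 0, 4} yield the order {1, 2, 0}, while {0, 4, 8} yield an empty result.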
/// Take the address space operand from the Load/Store instruction.
@@ -1110,8 +1172,8 @@ static unsigned getAddressSpaceOperand(Value *I) {
/// Returns true if the memory operations \p A and \p B are consecutive.
bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
ScalarEvolution &SE, bool CheckType) {
- Value *PtrA = getPointerOperand(A);
- Value *PtrB = getPointerOperand(B);
+ Value *PtrA = getLoadStorePointerOperand(A);
+ Value *PtrB = getLoadStorePointerOperand(B);
unsigned ASA = getAddressSpaceOperand(A);
unsigned ASB = getAddressSpaceOperand(B);
@@ -1127,11 +1189,11 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
if (CheckType && PtrA->getType() != PtrB->getType())
return false;
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
+ APInt Size(IdxWidth, DL.getTypeStoreSize(Ty));
- APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
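The hunk above switches the offset arithmetic from the pointer's bit width to the DataLayout index width; the consecutiveness test it feeds is unchanged in spirit: two accesses are consecutive when the second pointer starts exactly one element past the first. A small hedged sketch of that test (not the LLVM routine; types are illustrative):

    #include <cstdint>

    // Two accesses at byte offsets OffsetA/OffsetB into the same object are
    // consecutive when the gap equals the element's store size, e.g. two
    // 4-byte loads at offsets 0 and 4.
    bool areConsecutive(int64_t OffsetA, int64_t OffsetB, uint64_t ElemSize) {
      return OffsetB - OffsetA == static_cast<int64_t>(ElemSize);
    }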
@@ -1242,8 +1304,9 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
}
if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
- DEBUG(dbgs() << "LAA: Distance " << Distance
- << " that could cause a store-load forwarding conflict\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: Distance " << Distance
+ << " that could cause a store-load forwarding conflict\n");
return true;
}
@@ -1321,7 +1384,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
return false;
}
-/// \brief Check the dependence for two accesses with the same stride \p Stride.
+/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance and \p TypeByteSize is type size in
/// bytes.
///
@@ -1395,16 +1458,16 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src);
- DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
- << "(Induction step: " << StrideAPtr << ")\n");
- DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
- << *InstMap[BIdx] << ": " << *Dist << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
+ << "(Induction step: " << StrideAPtr << ")\n");
+ LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
+ << *InstMap[BIdx] << ": " << *Dist << "\n");
// Need accesses with constant stride. We don't want to vectorize
// "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
// the address space.
if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
- DEBUG(dbgs() << "Pointer access with non-constant stride\n");
+ LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
return Dependence::Unknown;
}
@@ -1421,7 +1484,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
TypeByteSize))
return Dependence::NoDep;
- DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
+ LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
ShouldRetryWithRuntimeCheck = true;
return Dependence::Unknown;
}
@@ -1432,7 +1495,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// Attempt to prove strided accesses independent.
if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
- DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
+ LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
return Dependence::NoDep;
}
@@ -1442,11 +1505,11 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
if (IsTrueDataDependence && EnableForwardingConflictDetection &&
(couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
ATy != BTy)) {
- DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
+ LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
return Dependence::ForwardButPreventsForwarding;
}
- DEBUG(dbgs() << "LAA: Dependence is negative\n");
+ LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
return Dependence::Forward;
}
@@ -1455,15 +1518,17 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
if (Val == 0) {
if (ATy == BTy)
return Dependence::Forward;
- DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: Zero dependence difference but different types\n");
return Dependence::Unknown;
}
assert(Val.isStrictlyPositive() && "Expect a positive value");
if (ATy != BTy) {
- DEBUG(dbgs() <<
- "LAA: ReadWrite-Write positive dependency with different types\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LAA: ReadWrite-Write positive dependency with different types\n");
return Dependence::Unknown;
}
@@ -1504,15 +1569,15 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
uint64_t MinDistanceNeeded =
TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
- DEBUG(dbgs() << "LAA: Failure because of positive distance " << Distance
- << '\n');
+ LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
+ << Distance << '\n');
return Dependence::Backward;
}
// Unsafe if the minimum distance needed is greater than max safe distance.
if (MinDistanceNeeded > MaxSafeDepDistBytes) {
- DEBUG(dbgs() << "LAA: Failure because it needs at least "
- << MinDistanceNeeded << " size in bytes");
+ LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
+ << MinDistanceNeeded << " size in bytes");
return Dependence::Backward;
}
@@ -1541,8 +1606,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
return Dependence::BackwardVectorizableButPreventsForwarding;
uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
- DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
- << " with max VF = " << MaxVF << '\n');
+ LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
+ << " with max VF = " << MaxVF << '\n');
uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
MaxSafeRegisterWidth = std::min(MaxSafeRegisterWidth, MaxVFInBits);
return Dependence::BackwardVectorizable;
@@ -1600,7 +1665,8 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
if (Dependences.size() >= MaxDependences) {
RecordDependences = false;
Dependences.clear();
- DEBUG(dbgs() << "Too many dependences, stopped recording\n");
+ LLVM_DEBUG(dbgs()
+ << "Too many dependences, stopped recording\n");
}
}
if (!RecordDependences && !SafeForVectorization)
@@ -1612,7 +1678,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
}
}
- DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
+ LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
return SafeForVectorization;
}
@@ -1642,20 +1708,21 @@ void MemoryDepChecker::Dependence::print(
bool LoopAccessInfo::canAnalyzeLoop() {
// We need to have a loop header.
- DEBUG(dbgs() << "LAA: Found a loop in "
- << TheLoop->getHeader()->getParent()->getName() << ": "
- << TheLoop->getHeader()->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
+ << TheLoop->getHeader()->getParent()->getName() << ": "
+ << TheLoop->getHeader()->getName() << '\n');
// We can only analyze innermost loops.
if (!TheLoop->empty()) {
- DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
+ LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
return false;
}
// We must have a single backedge.
if (TheLoop->getNumBackEdges() != 1) {
- DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: loop control flow is not understood by analyzer\n");
recordAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by analyzer";
return false;
@@ -1663,7 +1730,8 @@ bool LoopAccessInfo::canAnalyzeLoop() {
// We must have a single exiting block.
if (!TheLoop->getExitingBlock()) {
- DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: loop control flow is not understood by analyzer\n");
recordAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by analyzer";
return false;
@@ -1673,7 +1741,8 @@ bool LoopAccessInfo::canAnalyzeLoop() {
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: loop control flow is not understood by analyzer\n");
recordAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by analyzer";
return false;
@@ -1684,7 +1753,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
if (ExitCount == PSE->getSE()->getCouldNotCompute()) {
recordAnalysis("CantComputeNumberOfIterations")
<< "could not determine number of loop iterations";
- DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
+ LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
return false;
}
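Taken together, canAnalyzeLoop requires an innermost loop with a single backedge, a single exiting block that is also the latch, and a trip count SCEV can compute. For reference, a loop shape that satisfies all of these checks (a plain C++ example, not taken from the source):

    // Innermost loop, single latch which is also the only exit,
    // trip count computable as N.
    void saxpy(float *X, float *Y, float A, int N) {
      for (int I = 0; I < N; ++I)
        Y[I] = A * X[I] + Y[I];
    }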
@@ -1734,7 +1803,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
recordAnalysis("NonSimpleLoad", Ld)
<< "read with atomic ordering or volatile read";
- DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
+ LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
CanVecMem = false;
return;
}
@@ -1758,7 +1827,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
if (!St->isSimple() && !IsAnnotatedParallel) {
recordAnalysis("NonSimpleStore", St)
<< "write with atomic ordering or volatile write";
- DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
+ LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
CanVecMem = false;
return;
}
@@ -1777,14 +1846,14 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
// Check if we see any stores. If there are no stores, then we don't
// care if the pointers are *restrict*.
if (!Stores.size()) {
- DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
+ LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
CanVecMem = true;
return;
}
MemoryDepChecker::DepCandidates DependentAccesses;
AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(),
- AA, LI, DependentAccesses, *PSE);
+ TheLoop, AA, LI, DependentAccesses, *PSE);
// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
@@ -1814,9 +1883,9 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
}
if (IsAnnotatedParallel) {
- DEBUG(dbgs()
- << "LAA: A loop annotated parallel, ignore memory dependency "
- << "checks.\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
+ << "checks.\n");
CanVecMem = true;
return;
}
@@ -1851,7 +1920,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then is it safe to vectorize.
if (NumReadWrites == 1 && NumReads == 0) {
- DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
+ LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
CanVecMem = true;
return;
}
@@ -1866,23 +1935,24 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
TheLoop, SymbolicStrides);
if (!CanDoRTIfNeeded) {
recordAnalysis("CantIdentifyArrayBounds") << "cannot identify array bounds";
- DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
- << "the array bounds.\n");
+ LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
+ << "the array bounds.\n");
CanVecMem = false;
return;
}
- DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
CanVecMem = true;
if (Accesses.isDependencyCheckNeeded()) {
- DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
+ LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
CanVecMem = DepChecker->areDepsSafe(
DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();
if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
- DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
+ LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
// Clear the dependency checks. We assume they are not needed.
Accesses.resetDepChecks(*DepChecker);
@@ -1898,7 +1968,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
if (!CanDoRTIfNeeded) {
recordAnalysis("CantCheckMemDepsAtRunTime")
<< "cannot check memory dependencies at runtime";
- DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
+ LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
CanVecMem = false;
return;
}
@@ -1908,16 +1978,17 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
}
if (CanVecMem)
- DEBUG(dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
- << (PtrRtChecking->Need ? "" : " don't")
- << " need runtime memory checks.\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
+ << (PtrRtChecking->Need ? "" : " don't")
+ << " need runtime memory checks.\n");
else {
recordAnalysis("UnsafeMemDep")
<< "unsafe dependent memory operations in loop. Use "
"#pragma loop distribute(enable) to allow loop distribution "
"to attempt to isolate the offending operations into a separate "
"loop";
- DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
+ LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
}
}
@@ -1974,7 +2045,7 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
namespace {
-/// \brief IR Values for the lower and upper bounds of a pointer evolution. We
+/// IR Values for the lower and upper bounds of a pointer evolution. We
/// need to use value-handles because SCEV expansion can invalidate previously
/// expanded values. Thus expansion of a pointer can invalidate the bounds for
/// a previous one.
@@ -1985,7 +2056,7 @@ struct PointerBounds {
} // end anonymous namespace
-/// \brief Expand code for the lower and upper bound of the pointer group \p CG
+/// Expand code for the lower and upper bound of the pointer group \p CG
/// in \p TheLoop. \return the values for the bounds.
static PointerBounds
expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
@@ -2001,8 +2072,8 @@ expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
if (SE->isLoopInvariant(Sc, TheLoop)) {
- DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
+ << *Ptr << "\n");
// Ptr could be in the loop body. If so, expand a new one at the correct
// location.
Instruction *Inst = dyn_cast<Instruction>(Ptr);
@@ -2015,15 +2086,16 @@ expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
return {NewPtr, NewPtrPlusOne};
} else {
Value *Start = nullptr, *End = nullptr;
- DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
- DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High << "\n");
+ LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
+ << "\n");
return {Start, End};
}
}
-/// \brief Turns a collection of checks into a collection of expanded upper and
+/// Turns a collection of checks into a collection of expanded upper and
/// lower bounds for both pointers in the check.
static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(
const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks,
@@ -2136,9 +2208,9 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
if (!Stride)
return;
- DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
- "versioning:");
- DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
+ LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
+ "versioning:");
+ LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
// Avoid adding the "Stride == 1" predicate when we know that
// Stride >= Trip-Count. Such a predicate will effectively optimize a single
@@ -2174,12 +2246,13 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
// "Stride >= TripCount" is equivalent to checking:
// Stride - BETakenCount > 0
if (SE->isKnownPositive(StrideMinusBETaken)) {
- DEBUG(dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
- "Stride==1 predicate will imply that the loop executes "
- "at most once.\n");
+ LLVM_DEBUG(
+ dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
+ "Stride==1 predicate will imply that the loop executes "
+ "at most once.\n");
return;
- }
- DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
+ }
+ LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
SymbolicStrides[Ptr] = Stride;
StrideSet.insert(Stride);
diff --git a/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp b/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp
index ea7a62d179c4..074023a7e1e2 100644
--- a/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp
+++ b/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp
@@ -24,7 +24,7 @@ cl::opt<bool> EnableMSSALoopDependency(
"enable-mssa-loop-dependency", cl::Hidden, cl::init(false),
cl::desc("Enable MemorySSA dependency for loop pass manager"));
-// Explicit template instantiations and specialization defininitions for core
+// Explicit template instantiations and specialization definitions for core
// template typedefs.
template class AllAnalysesOn<Loop>;
template class AnalysisManager<Loop, LoopStandardAnalysisResults &>;
diff --git a/contrib/llvm/lib/Analysis/LoopInfo.cpp b/contrib/llvm/lib/Analysis/LoopInfo.cpp
index 9e54d60779a0..3f78456b3586 100644
--- a/contrib/llvm/lib/Analysis/LoopInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LoopInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/LoopInfoImpl.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugLoc.h"
@@ -377,69 +378,6 @@ Loop::LocRange Loop::getLocRange() const {
return LocRange();
}
-bool Loop::hasDedicatedExits() const {
- // Each predecessor of each exit block of a normal loop is contained
- // within the loop.
- SmallVector<BasicBlock *, 4> ExitBlocks;
- getExitBlocks(ExitBlocks);
- for (BasicBlock *BB : ExitBlocks)
- for (BasicBlock *Predecessor : predecessors(BB))
- if (!contains(Predecessor))
- return false;
- // All the requirements are met.
- return true;
-}
-
-void Loop::getUniqueExitBlocks(
- SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
- assert(hasDedicatedExits() &&
- "getUniqueExitBlocks assumes the loop has canonical form exits!");
-
- SmallVector<BasicBlock *, 32> SwitchExitBlocks;
- for (BasicBlock *BB : this->blocks()) {
- SwitchExitBlocks.clear();
- for (BasicBlock *Successor : successors(BB)) {
- // If block is inside the loop then it is not an exit block.
- if (contains(Successor))
- continue;
-
- pred_iterator PI = pred_begin(Successor);
- BasicBlock *FirstPred = *PI;
-
- // If current basic block is this exit block's first predecessor
- // then only insert exit block in to the output ExitBlocks vector.
- // This ensures that same exit block is not inserted twice into
- // ExitBlocks vector.
- if (BB != FirstPred)
- continue;
-
- // If a terminator has more then two successors, for example SwitchInst,
- // then it is possible that there are multiple edges from current block
- // to one exit block.
- if (std::distance(succ_begin(BB), succ_end(BB)) <= 2) {
- ExitBlocks.push_back(Successor);
- continue;
- }
-
- // In case of multiple edges from current block to exit block, collect
- // only one edge in ExitBlocks. Use switchExitBlocks to keep track of
- // duplicate edges.
- if (!is_contained(SwitchExitBlocks, Successor)) {
- SwitchExitBlocks.push_back(Successor);
- ExitBlocks.push_back(Successor);
- }
- }
- }
-}
-
-BasicBlock *Loop::getUniqueExitBlock() const {
- SmallVector<BasicBlock *, 8> UniqueExitBlocks;
- getUniqueExitBlocks(UniqueExitBlocks);
- if (UniqueExitBlocks.size() == 1)
- return UniqueExitBlocks[0];
- return nullptr;
-}
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Loop::dump() const { print(dbgs()); }
diff --git a/contrib/llvm/lib/Analysis/LoopPass.cpp b/contrib/llvm/lib/Analysis/LoopPass.cpp
index 9af717bafdca..07a151ce0fce 100644
--- a/contrib/llvm/lib/Analysis/LoopPass.cpp
+++ b/contrib/llvm/lib/Analysis/LoopPass.cpp
@@ -142,8 +142,17 @@ void LPPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
void LPPassManager::markLoopAsDeleted(Loop &L) {
assert((&L == CurrentLoop || CurrentLoop->contains(&L)) &&
"Must not delete loop outside the current loop tree!");
- if (&L == CurrentLoop)
+ // If this loop appears elsewhere within the queue, we also need to remove it
+ // there. However, we have to be careful to not remove the back of the queue
+ // as that is assumed to match the current loop.
+ assert(LQ.back() == CurrentLoop && "Loop queue back isn't the current loop!");
+ LQ.erase(std::remove(LQ.begin(), LQ.end(), &L), LQ.end());
+
+ if (&L == CurrentLoop) {
CurrentLoopDeleted = true;
+ // Add this loop back onto the back of the queue to preserve our invariants.
+ LQ.push_back(&L);
+ }
}
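The new queue maintenance removes every stale occurrence of the deleted loop while keeping the back of the queue equal to the loop currently being run, which runOnFunction pops afterwards. A standalone sketch of that invariant (std::deque stands in for the pass manager's loop queue; names are illustrative):

    #include <algorithm>
    #include <deque>

    template <typename LoopT>
    void markDeleted(std::deque<LoopT *> &Queue, LoopT *Current, LoopT *Deleted) {
      // Drop every queued occurrence of the deleted loop...
      Queue.erase(std::remove(Queue.begin(), Queue.end(), Deleted), Queue.end());
      // ...but keep the "back of queue == current loop" invariant intact.
      if (Deleted == Current)
        Queue.push_back(Deleted);
    }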
/// run - Execute all of the passes scheduled for execution. Keep track of
@@ -151,7 +160,10 @@ void LPPassManager::markLoopAsDeleted(Loop &L) {
bool LPPassManager::runOnFunction(Function &F) {
auto &LIWP = getAnalysis<LoopInfoWrapperPass>();
LI = &LIWP.getLoopInfo();
+ Module &M = *F.getParent();
+#if 0
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+#endif
bool Changed = false;
// Collect inherited analysis from Module level pass manager.
@@ -181,6 +193,8 @@ bool LPPassManager::runOnFunction(Function &F) {
}
// Walk Loops
+ unsigned InstrCount = 0;
+ bool EmitICRemark = M.shouldEmitInstrCountChangedRemark();
while (!LQ.empty()) {
CurrentLoopDeleted = false;
CurrentLoop = LQ.back();
@@ -198,8 +212,11 @@ bool LPPassManager::runOnFunction(Function &F) {
{
PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader());
TimeRegion PassTimer(getPassTimer(P));
-
+ if (EmitICRemark)
+ InstrCount = initSizeRemarkInfo(M);
Changed |= P->runOnLoop(CurrentLoop, *this);
+ if (EmitICRemark)
+ emitInstrCountChangedRemark(P, M, InstrCount);
}
if (Changed)
@@ -225,8 +242,12 @@ bool LPPassManager::runOnFunction(Function &F) {
// is that LPPassManager might run passes which do not require LCSSA
// form (LoopPassPrinter for example). We should skip verification for
// such passes.
+  // FIXME: Loop-sink currently breaks LCSSA. Fix it and reenable the
+ // verification!
+#if 0
if (mustPreserveAnalysisID(LCSSAVerificationPass::ID))
- CurrentLoop->isRecursivelyLCSSAForm(*DT, *LI);
+ assert(CurrentLoop->isRecursivelyLCSSAForm(*DT, *LI));
+#endif
// Then call the regular verifyAnalysis functions.
verifyPreservedAnalysis(P);
@@ -351,13 +372,13 @@ bool LoopPass::skipLoop(const Loop *L) const {
return false;
// Check the opt bisect limit.
LLVMContext &Context = F->getContext();
- if (!Context.getOptBisect().shouldRunPass(this, *L))
+ if (!Context.getOptPassGate().shouldRunPass(this, *L))
return true;
// Check for the OptimizeNone attribute.
if (F->hasFnAttribute(Attribute::OptimizeNone)) {
// FIXME: Report this to dbgs() only once per function.
- DEBUG(dbgs() << "Skipping pass '" << getPassName()
- << "' in function " << F->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping pass '" << getPassName() << "' in function "
+ << F->getName() << "\n");
// FIXME: Delete loop from pass manager's queue?
return true;
}
diff --git a/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
index 0da90dae3d9a..c8b91a7a1a51 100644
--- a/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ b/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -17,7 +17,7 @@
using namespace llvm;
-/// \brief Try to simplify instruction \param I using its SCEV expression.
+/// Try to simplify instruction \param I using its SCEV expression.
///
/// The idea is that some AddRec expressions become constants, which then
/// could trigger folding of other instructions. However, that only happens
diff --git a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
index 24fedfed772c..686ad294378c 100644
--- a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -40,7 +41,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -75,12 +75,24 @@ static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
{LibFunc_valloc, {MallocLike, 1, 0, -1}},
{LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
{LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
+ {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned int, align_val_t)
+ {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, // new(unsigned int, align_val_t, nothrow)
+ {MallocLike, 3, 0, -1}},
{LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long)
{LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow)
+ {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned long, align_val_t)
+ {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, // new(unsigned long, align_val_t, nothrow)
+ {MallocLike, 3, 0, -1}},
{LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
{LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned int, align_val_t)
+ {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, // new[](unsigned int, align_val_t, nothrow)
+ {MallocLike, 3, 0, -1}},
{LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long)
{LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow)
+ {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned long, align_val_t)
+ {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, // new[](unsigned long, align_val_t, nothrow)
+ {MallocLike, 3, 0, -1}},
{LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
{LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
{LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long)
@@ -112,10 +124,9 @@ static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast
IsNoBuiltin = CS.isNoBuiltin();
- const Function *Callee = CS.getCalledFunction();
- if (!Callee || !Callee->isDeclaration())
- return nullptr;
- return Callee;
+ if (const Function *Callee = CS.getCalledFunction())
+ return Callee;
+ return nullptr;
}
/// Returns the allocation data for the given value if it's either a call to a
@@ -206,7 +217,7 @@ static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
return CS && CS.hasRetAttr(Attribute::NoAlias);
}
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
@@ -214,7 +225,7 @@ bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast).hasValue();
}
-/// \brief Tests if a value is a call or invoke to a function that returns a
+/// Tests if a value is a call or invoke to a function that returns a
/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
@@ -224,29 +235,29 @@ bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
hasNoAliasAttr(V, LookThroughBitCast);
}
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory (such as malloc).
bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
return getAllocationData(V, MallocLike, TLI, LookThroughBitCast).hasValue();
}
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
return getAllocationData(V, CallocLike, TLI, LookThroughBitCast).hasValue();
}
-/// \brief Tests if a value is a call or invoke to a library function that
-/// allocates memory similiar to malloc or calloc.
+/// Tests if a value is a call or invoke to a library function that
+/// allocates memory similar to malloc or calloc.
bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
return getAllocationData(V, MallocOrCallocLike, TLI,
LookThroughBitCast).hasValue();
}
-/// \brief Tests if a value is a call or invoke to a library function that
+/// Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
@@ -350,11 +361,10 @@ const CallInst *llvm::extractCallocCall(const Value *I,
/// isFreeCall - Returns non-null if the value is a call to the builtin free()
const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
- const CallInst *CI = dyn_cast<CallInst>(I);
- if (!CI || isa<IntrinsicInst>(CI))
- return nullptr;
- Function *Callee = CI->getCalledFunction();
- if (Callee == nullptr)
+ bool IsNoBuiltinCall;
+ const Function *Callee =
+ getCalledFunction(I, /*LookThroughBitCast=*/false, IsNoBuiltinCall);
+ if (Callee == nullptr || IsNoBuiltinCall)
return nullptr;
StringRef FnName = Callee->getName();
@@ -374,9 +384,11 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
else if (TLIFn == LibFunc_ZdlPvj || // delete(void*, uint)
TLIFn == LibFunc_ZdlPvm || // delete(void*, ulong)
TLIFn == LibFunc_ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
+ TLIFn == LibFunc_ZdlPvSt11align_val_t || // delete(void*, align_val_t)
TLIFn == LibFunc_ZdaPvj || // delete[](void*, uint)
TLIFn == LibFunc_ZdaPvm || // delete[](void*, ulong)
TLIFn == LibFunc_ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow)
+ TLIFn == LibFunc_ZdaPvSt11align_val_t || // delete[](void*, align_val_t)
TLIFn == LibFunc_msvc_delete_ptr32_int || // delete(void*, uint)
TLIFn == LibFunc_msvc_delete_ptr64_longlong || // delete(void*, ulonglong)
TLIFn == LibFunc_msvc_delete_ptr32_nothrow || // delete(void*, nothrow)
@@ -386,6 +398,9 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
TLIFn == LibFunc_msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow)
TLIFn == LibFunc_msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow)
ExpectedNumParams = 2;
+ else if (TLIFn == LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t || // delete(void*, align_val_t, nothrow)
+ TLIFn == LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t) // delete[](void*, align_val_t, nothrow)
+ ExpectedNumParams = 3;
else
return nullptr;
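The new table entries and the extended isFreeCall teach the analysis about the C++17 aligned allocation functions. For context, these are the source-level forms whose Itanium-mangled symbols (e.g. _ZnwmSt11align_val_t, _ZdlPvSt11align_val_t) appear above; the snippet is illustrative only:

    #include <cstddef>
    #include <new>

    void *alignedAlloc(std::size_t N, std::size_t Align) {
      return ::operator new(N, std::align_val_t(Align));  // new(size_t, align_val_t)
    }

    void alignedFree(void *P, std::size_t Align) noexcept {
      ::operator delete(P, std::align_val_t(Align));       // delete(void*, align_val_t)
    }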
@@ -400,7 +415,7 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
if (FTy->getParamType(0) != Type::getInt8PtrTy(Callee->getContext()))
return nullptr;
- return CI;
+ return dyn_cast<CallInst>(I);
}
//===----------------------------------------------------------------------===//
@@ -412,7 +427,7 @@ static APInt getSizeWithOverflow(const SizeOffsetType &Data) {
return Data.first - Data.second;
}
-/// \brief Compute the size of the object pointed by Ptr. Returns true and the
+/// Compute the size of the object pointed by Ptr. Returns true and the
/// object size in Size if successful, and false otherwise.
/// If RoundToAlign is true, then Size is rounded up to the alignment of
/// allocas, byval arguments, and global variables.
@@ -513,8 +528,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
return visitGEPOperator(cast<GEPOperator>(*CE));
}
- DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V
- << '\n');
+ LLVM_DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: "
+ << *V << '\n');
return unknown();
}
@@ -627,7 +642,14 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
SizeOffsetType
ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull& CPN) {
- if (Options.NullIsUnknownSize && CPN.getType()->getAddressSpace() == 0)
+ // If null is unknown, there's nothing we can do. Additionally, non-zero
+ // address spaces can make use of null, so we don't presume to know anything
+ // about that.
+ //
+ // TODO: How should this work with address space casts? We currently just drop
+ // them on the floor, but it's unclear what we should do when a NULL from
+  // addrspace(1) gets cast to addrspace(0) (or vice versa).
+ if (Options.NullIsUnknownSize || CPN.getType()->getAddressSpace())
return unknown();
return std::make_pair(Zero, Zero);
}
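After the rewrite, a known zero-sized result for null is only reported when null cannot be a valid object (address space 0) and the caller did not ask for null to be treated as unknown. A one-line decision sketch of that condition (illustrative):

    bool nullHasKnownZeroSize(bool NullIsUnknownSize, unsigned AddrSpace) {
      return !NullIsUnknownSize && AddrSpace == 0;
    }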
@@ -714,7 +736,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitUndefValue(UndefValue&) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) {
- DEBUG(dbgs() << "ObjectSizeOffsetVisitor unknown instruction:" << I << '\n');
+ LLVM_DEBUG(dbgs() << "ObjectSizeOffsetVisitor unknown instruction:" << I
+ << '\n');
return unknown();
}
@@ -793,8 +816,9 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
// Ignore values where we cannot do more than ObjectSizeVisitor.
Result = unknown();
} else {
- DEBUG(dbgs() << "ObjectSizeOffsetEvaluator::compute() unhandled value: "
- << *V << '\n');
+ LLVM_DEBUG(
+ dbgs() << "ObjectSizeOffsetEvaluator::compute() unhandled value: " << *V
+ << '\n');
Result = unknown();
}
@@ -931,6 +955,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitSelectInst(SelectInst &I) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitInstruction(Instruction &I) {
- DEBUG(dbgs() << "ObjectSizeOffsetEvaluator unknown instruction:" << I <<'\n');
+ LLVM_DEBUG(dbgs() << "ObjectSizeOffsetEvaluator unknown instruction:" << I
+ << '\n');
return unknown();
}
diff --git a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index bf83f52ccf2e..7eeefd54f007 100644
--- a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -154,24 +154,16 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
}
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- AAMDNodes AAInfo;
-
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start:
- II->getAAMetadata(AAInfo);
- Loc = MemoryLocation(
- II->getArgOperand(1),
- cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(), AAInfo);
+ Loc = MemoryLocation::getForArgument(II, 1, TLI);
// These intrinsics don't really modify the memory, but returning Mod
// will allow them to be handled conservatively.
return ModRefInfo::Mod;
case Intrinsic::invariant_end:
- II->getAAMetadata(AAInfo);
- Loc = MemoryLocation(
- II->getArgOperand(2),
- cast<ConstantInt>(II->getArgOperand(1))->getZExtValue(), AAInfo);
+ Loc = MemoryLocation::getForArgument(II, 2, TLI);
// These intrinsics don't really modify the memory, but returning Mod
// will allow them to be handled conservatively.
return ModRefInfo::Mod;
@@ -363,8 +355,8 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
MemDepResult
MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
BasicBlock *BB) {
- auto *InvariantGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group);
- if (!InvariantGroupMD)
+
+ if (!LI->getMetadata(LLVMContext::MD_invariant_group))
return MemDepResult::getUnknown();
// Take the ptr operand after all casts and geps 0. This way we can search
@@ -425,7 +417,7 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
// same pointer operand) we can assume that value pointed by pointer
// operand didn't change.
if ((isa<LoadInst>(U) || isa<StoreInst>(U)) &&
- U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD)
+ U->getMetadata(LLVMContext::MD_invariant_group) != nullptr)
ClosestDependency = GetClosestDependency(ClosestDependency, U);
}
}
@@ -441,6 +433,7 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
NonLocalDefsCache.try_emplace(
LI, NonLocalDepResult(ClosestDependency->getParent(),
MemDepResult::getDef(ClosestDependency), nullptr));
+ ReverseNonLocalDefsCache[ClosestDependency].insert(LI);
return MemDepResult::getNonLocal();
}
@@ -813,7 +806,7 @@ MemoryDependenceResults::getNonLocalCallDependency(CallSite QueryCS) {
DirtyBlocks.push_back(Entry.getBB());
// Sort the cache so that we can do fast binary search lookups below.
- std::sort(Cache.begin(), Cache.end());
+ llvm::sort(Cache.begin(), Cache.end());
++NumCacheDirtyNonLocal;
// cerr << "CACHED CASE: " << DirtyBlocks.size() << " dirty: "
@@ -832,7 +825,7 @@ MemoryDependenceResults::getNonLocalCallDependency(CallSite QueryCS) {
SmallPtrSet<BasicBlock *, 32> Visited;
unsigned NumSortedEntries = Cache.size();
- DEBUG(AssertSorted(Cache));
+ LLVM_DEBUG(AssertSorted(Cache));
// Iterate while we still have blocks to update.
while (!DirtyBlocks.empty()) {
@@ -845,7 +838,7 @@ MemoryDependenceResults::getNonLocalCallDependency(CallSite QueryCS) {
// Do a binary search to see if we already have an entry for this block in
// the cache set. If so, find it.
- DEBUG(AssertSorted(Cache, NumSortedEntries));
+ LLVM_DEBUG(AssertSorted(Cache, NumSortedEntries));
NonLocalDepInfo::iterator Entry =
std::upper_bound(Cache.begin(), Cache.begin() + NumSortedEntries,
NonLocalDepEntry(DirtyBB));
@@ -927,12 +920,12 @@ void MemoryDependenceResults::getNonLocalPointerDependency(
"Can't get pointer deps of a non-pointer!");
Result.clear();
{
- // Check if there is cached Def with invariant.group. FIXME: cache might be
- // invalid if cached instruction would be removed between call to
- // getPointerDependencyFrom and this function.
+    // Check if there is a cached Def with invariant.group.
auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst);
if (NonLocalDefIt != NonLocalDefsCache.end()) {
- Result.push_back(std::move(NonLocalDefIt->second));
+ Result.push_back(NonLocalDefIt->second);
+ ReverseNonLocalDefsCache[NonLocalDefIt->second.getResult().getInst()]
+ .erase(QueryInst);
NonLocalDefsCache.erase(NonLocalDefIt);
return;
}
@@ -1076,7 +1069,7 @@ SortNonLocalDepInfoCache(MemoryDependenceResults::NonLocalDepInfo &Cache,
break;
default:
// Added many values, do a full scale sort.
- std::sort(Cache.begin(), Cache.end());
+ llvm::sort(Cache.begin(), Cache.end());
break;
}
}
@@ -1218,7 +1211,7 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
unsigned NumSortedEntries = Cache->size();
unsigned WorklistEntries = BlockNumberLimit;
bool GotWorklistLimit = false;
- DEBUG(AssertSorted(*Cache));
+ LLVM_DEBUG(AssertSorted(*Cache));
while (!Worklist.empty()) {
BasicBlock *BB = Worklist.pop_back_val();
@@ -1249,7 +1242,7 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// Get the dependency info for Pointer in BB. If we have cached
// information, we will use it, otherwise we compute it.
- DEBUG(AssertSorted(*Cache, NumSortedEntries));
+ LLVM_DEBUG(AssertSorted(*Cache, NumSortedEntries));
MemDepResult Dep = GetNonLocalInfoForBlock(QueryInst, Loc, isLoad, BB,
Cache, NumSortedEntries);
@@ -1463,13 +1456,33 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// Okay, we're done now. If we added new values to the cache, re-sort it.
SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
- DEBUG(AssertSorted(*Cache));
+ LLVM_DEBUG(AssertSorted(*Cache));
return true;
}
-/// If P exists in CachedNonLocalPointerInfo, remove it.
+/// If P exists in CachedNonLocalPointerInfo or NonLocalDefsCache, remove it.
void MemoryDependenceResults::RemoveCachedNonLocalPointerDependencies(
ValueIsLoadPair P) {
+
+ // Most of the time this cache is empty.
+ if (!NonLocalDefsCache.empty()) {
+ auto it = NonLocalDefsCache.find(P.getPointer());
+ if (it != NonLocalDefsCache.end()) {
+ RemoveFromReverseMap(ReverseNonLocalDefsCache,
+ it->second.getResult().getInst(), P.getPointer());
+ NonLocalDefsCache.erase(it);
+ }
+
+ if (auto *I = dyn_cast<Instruction>(P.getPointer())) {
+ auto toRemoveIt = ReverseNonLocalDefsCache.find(I);
+ if (toRemoveIt != ReverseNonLocalDefsCache.end()) {
+ for (const auto &entry : toRemoveIt->second)
+ NonLocalDefsCache.erase(entry);
+ ReverseNonLocalDefsCache.erase(toRemoveIt);
+ }
+ }
+ }
+
CachedNonLocalPointerInfo::iterator It = NonLocalPointerDeps.find(P);
if (It == NonLocalPointerDeps.end())
return;
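The additions keep NonLocalDefsCache and its new reverse map consistent: removing a cached pointer drops its forward entry and any forward entries that pointed back at it. A standalone sketch of that forward/reverse bookkeeping with plain std::map containers (keys are opaque; these are not the LLVM types):

    #include <map>
    #include <set>

    using Key = const void *;
    std::map<Key, Key> ForwardCache;            // query -> cached dependency
    std::map<Key, std::set<Key>> ReverseCache;  // dependency -> dependent queries

    void invalidate(Key K) {
      // K as a query: drop its forward entry and unregister it from the
      // reverse map of whatever it depended on.
      auto It = ForwardCache.find(K);
      if (It != ForwardCache.end()) {
        ReverseCache[It->second].erase(K);
        ForwardCache.erase(It);
      }
      // K as a dependency: drop every query whose cached answer was K.
      auto RIt = ReverseCache.find(K);
      if (RIt != ReverseCache.end()) {
        for (Key Q : RIt->second)
          ForwardCache.erase(Q);
        ReverseCache.erase(RIt);
      }
    }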
@@ -1646,7 +1659,7 @@ void MemoryDependenceResults::removeInstruction(Instruction *RemInst) {
// Re-sort the NonLocalDepInfo. Changing the dirty entry to its
// subsequent value may invalidate the sortedness.
- std::sort(NLPDI.begin(), NLPDI.end());
+ llvm::sort(NLPDI.begin(), NLPDI.end());
}
ReverseNonLocalPtrDeps.erase(ReversePtrDepIt);
@@ -1659,7 +1672,7 @@ void MemoryDependenceResults::removeInstruction(Instruction *RemInst) {
}
assert(!NonLocalDeps.count(RemInst) && "RemInst got reinserted?");
- DEBUG(verifyRemoved(RemInst));
+ LLVM_DEBUG(verifyRemoved(RemInst));
}
/// Verify that the specified instruction does not occur in our internal data
diff --git a/contrib/llvm/lib/Analysis/MemoryLocation.cpp b/contrib/llvm/lib/Analysis/MemoryLocation.cpp
index 9db6c499129a..55924db284ec 100644
--- a/contrib/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/contrib/llvm/lib/Analysis/MemoryLocation.cpp
@@ -65,6 +65,14 @@ MemoryLocation MemoryLocation::get(const AtomicRMWInst *RMWI) {
}
MemoryLocation MemoryLocation::getForSource(const MemTransferInst *MTI) {
+ return getForSource(cast<AnyMemTransferInst>(MTI));
+}
+
+MemoryLocation MemoryLocation::getForSource(const AtomicMemTransferInst *MTI) {
+ return getForSource(cast<AnyMemTransferInst>(MTI));
+}
+
+MemoryLocation MemoryLocation::getForSource(const AnyMemTransferInst *MTI) {
uint64_t Size = UnknownSize;
if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
Size = C->getValue().getZExtValue();
@@ -77,17 +85,25 @@ MemoryLocation MemoryLocation::getForSource(const MemTransferInst *MTI) {
return MemoryLocation(MTI->getRawSource(), Size, AATags);
}
-MemoryLocation MemoryLocation::getForDest(const MemIntrinsic *MTI) {
+MemoryLocation MemoryLocation::getForDest(const MemIntrinsic *MI) {
+ return getForDest(cast<AnyMemIntrinsic>(MI));
+}
+
+MemoryLocation MemoryLocation::getForDest(const AtomicMemIntrinsic *MI) {
+ return getForDest(cast<AnyMemIntrinsic>(MI));
+}
+
+MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) {
uint64_t Size = UnknownSize;
- if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
+ if (ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength()))
Size = C->getValue().getZExtValue();
// memcpy/memmove can have AA tags. For memcpy, they apply
// to both the source and the destination.
AAMDNodes AATags;
- MTI->getAAMetadata(AATags);
+ MI->getAAMetadata(AATags);
- return MemoryLocation(MTI->getRawDest(), Size, AATags);
+ return MemoryLocation(MI->getRawDest(), Size, AATags);
}
MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS,
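The new AnyMemTransferInst/AnyMemIntrinsic overloads above let callers treat plain and element-atomic memcpy/memmove uniformly. A minimal usage sketch under that assumption; the helper name is invented:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/MemoryLocation.h"
    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    // Collect the source and destination locations of any memory-transfer
    // intrinsic, whether it is the plain or the element-atomic variant.
    static void collectTransferLocations(Instruction &I,
                                         SmallVectorImpl<MemoryLocation> &Locs) {
      if (auto *MTI = dyn_cast<AnyMemTransferInst>(&I)) {
        Locs.push_back(MemoryLocation::getForSource(MTI));
        // AnyMemTransferInst derives from AnyMemIntrinsic, so the widened
        // getForDest overload applies as well.
        Locs.push_back(MemoryLocation::getForDest(MTI));
      }
    }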
diff --git a/contrib/llvm/lib/Analysis/MemorySSA.cpp b/contrib/llvm/lib/Analysis/MemorySSA.cpp
index 09605f61fa93..f57d490ce96e 100644
--- a/contrib/llvm/lib/Analysis/MemorySSA.cpp
+++ b/contrib/llvm/lib/Analysis/MemorySSA.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -82,7 +83,7 @@ static cl::opt<bool>
namespace llvm {
-/// \brief An assembly annotator class to print Memory SSA information in
+/// An assembly annotator class to print Memory SSA information in
/// comments.
class MemorySSAAnnotatedWriter : public AssemblyAnnotationWriter {
friend class MemorySSA;
@@ -235,13 +236,25 @@ static bool areLoadsReorderable(const LoadInst *Use,
return !(SeqCstUse || MayClobberIsAcquire);
}
-static bool instructionClobbersQuery(MemoryDef *MD,
- const MemoryLocation &UseLoc,
- const Instruction *UseInst,
- AliasAnalysis &AA) {
+namespace {
+
+struct ClobberAlias {
+ bool IsClobber;
+ Optional<AliasResult> AR;
+};
+
+} // end anonymous namespace
+
+// Return a pair of {IsClobber (bool), AR (AliasResult)}. It relies on AR being
+// ignored if IsClobber = false.
+static ClobberAlias instructionClobbersQuery(MemoryDef *MD,
+ const MemoryLocation &UseLoc,
+ const Instruction *UseInst,
+ AliasAnalysis &AA) {
Instruction *DefInst = MD->getMemoryInst();
assert(DefInst && "Defining instruction not actually an instruction");
ImmutableCallSite UseCS(UseInst);
+ Optional<AliasResult> AR;
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
// These intrinsics will show up as affecting memory, but they are just
@@ -249,13 +262,14 @@ static bool instructionClobbersQuery(MemoryDef *MD,
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
if (UseCS)
- return false;
- return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), UseLoc);
+ return {false, NoAlias};
+ AR = AA.alias(MemoryLocation(II->getArgOperand(1)), UseLoc);
+ return {AR == MustAlias, AR};
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::assume:
- return false;
+ return {false, NoAlias};
default:
break;
}
@@ -263,19 +277,23 @@ static bool instructionClobbersQuery(MemoryDef *MD,
if (UseCS) {
ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
- return isModOrRefSet(I);
+ AR = isMustSet(I) ? MustAlias : MayAlias;
+ return {isModOrRefSet(I), AR};
}
if (auto *DefLoad = dyn_cast<LoadInst>(DefInst))
if (auto *UseLoad = dyn_cast<LoadInst>(UseInst))
- return !areLoadsReorderable(UseLoad, DefLoad);
+ return {!areLoadsReorderable(UseLoad, DefLoad), MayAlias};
- return isModSet(AA.getModRefInfo(DefInst, UseLoc));
+ ModRefInfo I = AA.getModRefInfo(DefInst, UseLoc);
+ AR = isMustSet(I) ? MustAlias : MayAlias;
+ return {isModSet(I), AR};
}
-static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU,
- const MemoryLocOrCall &UseMLOC,
- AliasAnalysis &AA) {
+static ClobberAlias instructionClobbersQuery(MemoryDef *MD,
+ const MemoryUseOrDef *MU,
+ const MemoryLocOrCall &UseMLOC,
+ AliasAnalysis &AA) {
// FIXME: This is a temporary hack to allow a single instructionClobbersQuery
// to exist while MemoryLocOrCall is pushed through places.
if (UseMLOC.IsCall)
@@ -288,7 +306,7 @@ static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU,
// Return true when MD may alias MU, return false otherwise.
bool MemorySSAUtil::defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
AliasAnalysis &AA) {
- return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA);
+ return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA).IsClobber;
}
namespace {
@@ -303,6 +321,7 @@ struct UpwardsMemoryQuery {
const Instruction *Inst = nullptr;
// The MemoryAccess we actually got called with, used to test local domination
const MemoryAccess *OriginalAccess = nullptr;
+ Optional<AliasResult> AR = MayAlias;
UpwardsMemoryQuery() = default;
@@ -333,9 +352,6 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysis &AA,
const Instruction *I) {
// If the memory can't be changed, then loads of the memory can't be
// clobbered.
- //
- // FIXME: We should handle invariant groups, as well. It's a bit harder,
- // because we need to pay close attention to invariant group barriers.
return isa<LoadInst>(I) && (I->getMetadata(LLVMContext::MD_invariant_load) ||
AA.pointsToConstantMemory(cast<LoadInst>(I)->
getPointerOperand()));
@@ -386,9 +402,15 @@ checkClobberSanity(MemoryAccess *Start, MemoryAccess *ClobberAt,
//
// Also, note that this can't be hoisted out of the `Worklist` loop,
// since MD may only act as a clobber for 1 of N MemoryLocations.
- FoundClobber =
- FoundClobber || MSSA.isLiveOnEntryDef(MD) ||
- instructionClobbersQuery(MD, MAP.second, Query.Inst, AA);
+ FoundClobber = FoundClobber || MSSA.isLiveOnEntryDef(MD);
+ if (!FoundClobber) {
+ ClobberAlias CA =
+ instructionClobbersQuery(MD, MAP.second, Query.Inst, AA);
+ if (CA.IsClobber) {
+ FoundClobber = true;
+ // Not used: CA.AR;
+ }
+ }
}
break;
}
@@ -398,7 +420,8 @@ checkClobberSanity(MemoryAccess *Start, MemoryAccess *ClobberAt,
if (auto *MD = dyn_cast<MemoryDef>(MA)) {
(void)MD;
- assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA) &&
+ assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA)
+ .IsClobber &&
"Found clobber before reaching ClobberAt!");
continue;
}
@@ -468,9 +491,10 @@ class ClobberWalker {
/// Result of calling walkToPhiOrClobber.
struct UpwardsWalkResult {
/// The "Result" of the walk. Either a clobber, the last thing we walked, or
- /// both.
+ /// both. Include alias info when clobber found.
MemoryAccess *Result;
bool IsKnownClobber;
+ Optional<AliasResult> AR;
};
/// Walk to the next Phi or Clobber in the def chain starting at Desc.Last.
@@ -486,17 +510,21 @@ class ClobberWalker {
for (MemoryAccess *Current : def_chain(Desc.Last)) {
Desc.Last = Current;
if (Current == StopAt)
- return {Current, false};
-
- if (auto *MD = dyn_cast<MemoryDef>(Current))
- if (MSSA.isLiveOnEntryDef(MD) ||
- instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA))
- return {MD, true};
+ return {Current, false, MayAlias};
+
+ if (auto *MD = dyn_cast<MemoryDef>(Current)) {
+ if (MSSA.isLiveOnEntryDef(MD))
+ return {MD, true, MustAlias};
+ ClobberAlias CA =
+ instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA);
+ if (CA.IsClobber)
+ return {MD, true, CA.AR};
+ }
}
assert(isa<MemoryPhi>(Desc.Last) &&
"Ended at a non-clobber that's not a phi?");
- return {Desc.Last, false};
+ return {Desc.Last, false, MayAlias};
}
void addSearches(MemoryPhi *Phi, SmallVectorImpl<ListIndex> &PausedSearches,
@@ -819,8 +847,6 @@ public:
ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT)
: MSSA(MSSA), AA(AA), DT(DT) {}
- void reset() {}
-
/// Finds the nearest clobber for the given query, optimizing phis if
/// possible.
MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q) {
@@ -839,6 +865,7 @@ public:
MemoryAccess *Result;
if (WalkResult.IsKnownClobber) {
Result = WalkResult.Result;
+ Q.AR = WalkResult.AR;
} else {
OptznResult OptRes = tryOptimizePhi(cast<MemoryPhi>(FirstDesc.Last),
Current, Q.StartingLoc);
@@ -876,12 +903,11 @@ struct RenamePassData {
namespace llvm {
-/// \brief A MemorySSAWalker that does AA walks to disambiguate accesses. It no
-/// longer does caching on its own,
-/// but the name has been retained for the moment.
+/// A MemorySSAWalker that does AA walks to disambiguate accesses. It no
+/// longer does caching on its own, but the name has been retained for the
+/// moment.
class MemorySSA::CachingWalker final : public MemorySSAWalker {
ClobberWalker Walker;
- bool AutoResetWalker = true;
MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, UpwardsMemoryQuery &);
@@ -896,13 +922,6 @@ public:
const MemoryLocation &) override;
void invalidateInfo(MemoryAccess *) override;
- /// Whether we call resetClobberWalker() after each time we *actually* walk to
- /// answer a clobber query.
- void setAutoResetWalker(bool AutoReset) { AutoResetWalker = AutoReset; }
-
- /// Drop the walker's persistent data structures.
- void resetClobberWalker() { Walker.reset(); }
-
void verify(const MemorySSA *MSSA) override {
MemorySSAWalker::verify(MSSA);
Walker.verify(MSSA);
@@ -930,7 +949,7 @@ void MemorySSA::renameSuccessorPhis(BasicBlock *BB, MemoryAccess *IncomingVal,
}
}
-/// \brief Rename a single basic block into MemorySSA form.
+/// Rename a single basic block into MemorySSA form.
/// Uses the standard SSA renaming algorithm.
/// \returns The new incoming value.
MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB, MemoryAccess *IncomingVal,
@@ -953,7 +972,7 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB, MemoryAccess *IncomingVal,
return IncomingVal;
}
-/// \brief This is the standard SSA renaming algorithm.
+/// This is the standard SSA renaming algorithm.
///
/// We walk the dominator tree in preorder, renaming accesses, and then filling
/// in phi nodes in our successors.
@@ -1002,7 +1021,7 @@ void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
}
}
-/// \brief This handles unreachable block accesses by deleting phi nodes in
+/// This handles unreachable block accesses by deleting phi nodes in
/// unreachable blocks, and marking all other unreachable MemoryAccess's as
/// being uses of the live on entry definition.
void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
@@ -1044,7 +1063,7 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
: AA(AA), DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
- NextID(INVALID_MEMORYACCESS_ID) {
+ NextID(0) {
buildMemorySSA();
}
@@ -1106,6 +1125,7 @@ private:
// This is where the last walk for this memory location ended.
unsigned long LastKill;
bool LastKillValid;
+ Optional<AliasResult> AR;
};
void optimizeUsesInBlock(const BasicBlock *, unsigned long &, unsigned long &,
@@ -1165,7 +1185,7 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
}
if (isUseTriviallyOptimizableToLiveOnEntry(*AA, MU->getMemoryInst())) {
- MU->setDefiningAccess(MSSA->getLiveOnEntryDef(), true);
+ MU->setDefiningAccess(MSSA->getLiveOnEntryDef(), true, None);
continue;
}
@@ -1207,6 +1227,7 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
if (!LocInfo.LastKillValid) {
LocInfo.LastKill = VersionStack.size() - 1;
LocInfo.LastKillValid = true;
+ LocInfo.AR = MayAlias;
}
// At this point, we should have corrected last kill and LowerBound to be
@@ -1219,10 +1240,11 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
unsigned long UpperBound = VersionStack.size() - 1;
if (UpperBound - LocInfo.LowerBound > MaxCheckLimit) {
- DEBUG(dbgs() << "MemorySSA skipping optimization of " << *MU << " ("
- << *(MU->getMemoryInst()) << ")"
- << " because there are " << UpperBound - LocInfo.LowerBound
- << " stores to disambiguate\n");
+ LLVM_DEBUG(dbgs() << "MemorySSA skipping optimization of " << *MU << " ("
+ << *(MU->getMemoryInst()) << ")"
+ << " because there are "
+ << UpperBound - LocInfo.LowerBound
+ << " stores to disambiguate\n");
// Because we did not walk, LastKill is no longer valid, as this may
// have been a kill.
LocInfo.LastKillValid = false;
@@ -1250,24 +1272,32 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
// Reset UpperBound to liveOnEntryDef's place in the stack
UpperBound = 0;
FoundClobberResult = true;
+ LocInfo.AR = MustAlias;
break;
}
- if (instructionClobbersQuery(MD, MU, UseMLOC, *AA)) {
+ ClobberAlias CA = instructionClobbersQuery(MD, MU, UseMLOC, *AA);
+ if (CA.IsClobber) {
FoundClobberResult = true;
+ LocInfo.AR = CA.AR;
break;
}
--UpperBound;
}
+
+ // Note: Phis always have AliasResult AR set to MayAlias ATM.
+
// At the end of this loop, UpperBound is either a clobber, or lower bound
// PHI walking may cause it to be < LowerBound, and in fact, < LastKill.
if (FoundClobberResult || UpperBound < LocInfo.LastKill) {
- MU->setDefiningAccess(VersionStack[UpperBound], true);
// We were last killed now by where we got to
+ if (MSSA->isLiveOnEntryDef(VersionStack[UpperBound]))
+ LocInfo.AR = None;
+ MU->setDefiningAccess(VersionStack[UpperBound], true, LocInfo.AR);
LocInfo.LastKill = UpperBound;
} else {
// Otherwise, we checked all the new ones, and now we know we can get to
// LastKill.
- MU->setDefiningAccess(VersionStack[LocInfo.LastKill], true);
+ MU->setDefiningAccess(VersionStack[LocInfo.LastKill], true, LocInfo.AR);
}
LocInfo.LowerBound = VersionStack.size() - 1;
LocInfo.LowerBoundBlock = BB;
@@ -1289,19 +1319,13 @@ void MemorySSA::OptimizeUses::optimizeUses() {
}
void MemorySSA::placePHINodes(
- const SmallPtrSetImpl<BasicBlock *> &DefiningBlocks,
- const DenseMap<const BasicBlock *, unsigned int> &BBNumbers) {
+ const SmallPtrSetImpl<BasicBlock *> &DefiningBlocks) {
// Determine where our MemoryPhi's should go
ForwardIDFCalculator IDFs(*DT);
IDFs.setDefiningBlocks(DefiningBlocks);
SmallVector<BasicBlock *, 32> IDFBlocks;
IDFs.calculate(IDFBlocks);
- std::sort(IDFBlocks.begin(), IDFBlocks.end(),
- [&BBNumbers](const BasicBlock *A, const BasicBlock *B) {
- return BBNumbers.lookup(A) < BBNumbers.lookup(B);
- });
-
// Now place MemoryPhi nodes.
for (auto &BB : IDFBlocks)
createMemoryPhi(BB);
@@ -1315,11 +1339,8 @@ void MemorySSA::buildMemorySSA() {
// semantics do *not* imply that something with no immediate uses can simply
// be removed.
BasicBlock &StartingPoint = F.getEntryBlock();
- LiveOnEntryDef =
- llvm::make_unique<MemoryDef>(F.getContext(), nullptr, nullptr,
- &StartingPoint, NextID++);
- DenseMap<const BasicBlock *, unsigned int> BBNumbers;
- unsigned NextBBNum = 0;
+ LiveOnEntryDef.reset(new MemoryDef(F.getContext(), nullptr, nullptr,
+ &StartingPoint, NextID++));
// We maintain lists of memory accesses per-block, trading memory for time. We
// could just look up the memory access for every possible instruction in the
@@ -1328,7 +1349,6 @@ void MemorySSA::buildMemorySSA() {
// Go through each block, figure out where defs occur, and chain together all
// the accesses.
for (BasicBlock &B : F) {
- BBNumbers[&B] = NextBBNum++;
bool InsertIntoDef = false;
AccessList *Accesses = nullptr;
DefsList *Defs = nullptr;
@@ -1350,7 +1370,7 @@ void MemorySSA::buildMemorySSA() {
if (InsertIntoDef)
DefiningBlocks.insert(&B);
}
- placePHINodes(DefiningBlocks, BBNumbers);
+ placePHINodes(DefiningBlocks);
// Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
// filled in with all blocks.
@@ -1359,11 +1379,7 @@ void MemorySSA::buildMemorySSA() {
CachingWalker *Walker = getWalkerImpl();
- // We're doing a batch of updates; don't drop useful caches between them.
- Walker->setAutoResetWalker(false);
OptimizeUses(this, Walker, AA, DT).optimizeUses();
- Walker->setAutoResetWalker(true);
- Walker->resetClobberWalker();
// Mark the uses in unreachable blocks as live on entry, so that they go
// somewhere.
@@ -1426,7 +1442,7 @@ void MemorySSA::insertIntoListsBefore(MemoryAccess *What, const BasicBlock *BB,
auto *Defs = getOrCreateDefsList(BB);
// If we got asked to insert at the end, we have an easy job, just shove it
// at the end. If we got asked to insert before an existing def, we also get
- // an terator. If we got asked to insert before a use, we have to hunt for
+ // an iterator. If we got asked to insert before a use, we have to hunt for
// the next def.
if (WasEnd) {
Defs->push_back(*What);
@@ -1445,7 +1461,7 @@ void MemorySSA::insertIntoListsBefore(MemoryAccess *What, const BasicBlock *BB,
BlockNumberingValid.erase(BB);
}
-// Move What before Where in the IR. The end result is taht What will belong to
+// Move What before Where in the IR. The end result is that What will belong to
// the right lists and have the right Block set, but will not otherwise be
// correct. It will not have the right defining access, and if it is a def,
// things below it will not properly be updated.
@@ -1457,8 +1473,18 @@ void MemorySSA::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
insertIntoListsBefore(What, BB, Where);
}
-void MemorySSA::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
+void MemorySSA::moveTo(MemoryAccess *What, BasicBlock *BB,
InsertionPlace Point) {
+ if (isa<MemoryPhi>(What)) {
+ assert(Point == Beginning &&
+ "Can only move a Phi at the beginning of the block");
+ // Update lookup table entry
+ ValueToMemoryAccess.erase(What->getBlock());
+ bool Inserted = ValueToMemoryAccess.insert({BB, What}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot move a Phi to a block that already has one");
+ }
+
removeFromLists(What, false);
What->setBlock(BB);
insertIntoListsForBlock(What, BB, Point);
@@ -1498,7 +1524,7 @@ static inline bool isOrdered(const Instruction *I) {
return false;
}
-/// \brief Helper function to create new memory accesses
+/// Helper function to create new memory accesses
MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
// The assume intrinsic has a control dependency which we model by claiming
// that it writes arbitrarily. Ignore that fake memory dependency here.
@@ -1526,9 +1552,6 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
if (!Def && !Use)
return nullptr;
- assert((Def || Use) &&
- "Trying to create a memory access with a non-memory instruction");
-
MemoryUseOrDef *MUD;
if (Def)
MUD = new MemoryDef(I->getContext(), nullptr, I, I->getParent(), NextID++);
@@ -1538,7 +1561,7 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
return MUD;
}
-/// \brief Returns true if \p Replacer dominates \p Replacee .
+/// Returns true if \p Replacer dominates \p Replacee .
bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
const MemoryAccess *Replacee) const {
if (isa<MemoryUseOrDef>(Replacee))
@@ -1555,40 +1578,40 @@ bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
return true;
}
-/// \brief Properly remove \p MA from all of MemorySSA's lookup tables.
+/// Properly remove \p MA from all of MemorySSA's lookup tables.
void MemorySSA::removeFromLookups(MemoryAccess *MA) {
assert(MA->use_empty() &&
"Trying to remove memory access that still has uses");
BlockNumbering.erase(MA);
- if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(MA))
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
MUD->setDefiningAccess(nullptr);
// Invalidate our walker's cache if necessary
if (!isa<MemoryUse>(MA))
Walker->invalidateInfo(MA);
- // The call below to erase will destroy MA, so we can't change the order we
- // are doing things here
+
Value *MemoryInst;
- if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(MA)) {
+ if (const auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
MemoryInst = MUD->getMemoryInst();
- } else {
+ else
MemoryInst = MA->getBlock();
- }
+
auto VMA = ValueToMemoryAccess.find(MemoryInst);
if (VMA->second == MA)
ValueToMemoryAccess.erase(VMA);
}
-/// \brief Properly remove \p MA from all of MemorySSA's lists.
+/// Properly remove \p MA from all of MemorySSA's lists.
///
/// Because of the way the intrusive list and use lists work, it is important to
/// do removal in the right order.
/// ShouldDelete defaults to true, and will cause the memory access to also be
/// deleted, not just removed.
void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
+ BasicBlock *BB = MA->getBlock();
// The access list owns the reference, so we erase it from the non-owning list
// first.
if (!isa<MemoryUse>(MA)) {
- auto DefsIt = PerBlockDefs.find(MA->getBlock());
+ auto DefsIt = PerBlockDefs.find(BB);
std::unique_ptr<DefsList> &Defs = DefsIt->second;
Defs->remove(*MA);
if (Defs->empty())
@@ -1597,15 +1620,17 @@ void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
// The erase call here will delete it. If we don't want it deleted, we call
// remove instead.
- auto AccessIt = PerBlockAccesses.find(MA->getBlock());
+ auto AccessIt = PerBlockAccesses.find(BB);
std::unique_ptr<AccessList> &Accesses = AccessIt->second;
if (ShouldDelete)
Accesses->erase(MA);
else
Accesses->remove(MA);
- if (Accesses->empty())
+ if (Accesses->empty()) {
PerBlockAccesses.erase(AccessIt);
+ BlockNumberingValid.erase(BB);
+ }
}
void MemorySSA::print(raw_ostream &OS) const {
@@ -1621,10 +1646,49 @@ void MemorySSA::verifyMemorySSA() const {
verifyDefUses(F);
verifyDomination(F);
verifyOrdering(F);
+ verifyDominationNumbers(F);
Walker->verify(this);
}
-/// \brief Verify that the order and existence of MemoryAccesses matches the
+/// Verify that all of the blocks we believe to have valid domination numbers
+/// actually have valid domination numbers.
+void MemorySSA::verifyDominationNumbers(const Function &F) const {
+#ifndef NDEBUG
+ if (BlockNumberingValid.empty())
+ return;
+
+ SmallPtrSet<const BasicBlock *, 16> ValidBlocks = BlockNumberingValid;
+ for (const BasicBlock &BB : F) {
+ if (!ValidBlocks.count(&BB))
+ continue;
+
+ ValidBlocks.erase(&BB);
+
+ const AccessList *Accesses = getBlockAccesses(&BB);
+ // It's correct to say an empty block has valid numbering.
+ if (!Accesses)
+ continue;
+
+ // Block numbering starts at 1.
+ unsigned long LastNumber = 0;
+ for (const MemoryAccess &MA : *Accesses) {
+ auto ThisNumberIter = BlockNumbering.find(&MA);
+ assert(ThisNumberIter != BlockNumbering.end() &&
+ "MemoryAccess has no domination number in a valid block!");
+
+ unsigned long ThisNumber = ThisNumberIter->second;
+ assert(ThisNumber > LastNumber &&
+ "Domination numbers should be strictly increasing!");
+ LastNumber = ThisNumber;
+ }
+ }
+
+ assert(ValidBlocks.empty() &&
+ "All valid BasicBlocks should exist in F -- dangling pointers?");
+#endif
+}
+
+/// Verify that the order and existence of MemoryAccesses matches the
/// order and existence of memory affecting instructions.
void MemorySSA::verifyOrdering(Function &F) const {
// Walk all the blocks, comparing what the lookups think and what the access
@@ -1687,7 +1751,7 @@ void MemorySSA::verifyOrdering(Function &F) const {
}
}
-/// \brief Verify the domination properties of MemorySSA by checking that each
+/// Verify the domination properties of MemorySSA by checking that each
/// definition dominates all of its uses.
void MemorySSA::verifyDomination(Function &F) const {
#ifndef NDEBUG
@@ -1709,7 +1773,7 @@ void MemorySSA::verifyDomination(Function &F) const {
#endif
}
-/// \brief Verify the def-use lists in MemorySSA, by verifying that \p Use
+/// Verify the def-use lists in MemorySSA, by verifying that \p Use
/// appears in the use list of \p Def.
void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const {
#ifndef NDEBUG
@@ -1723,7 +1787,7 @@ void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const {
#endif
}
-/// \brief Verify the immediate use information, by walking all the memory
+/// Verify the immediate use information, by walking all the memory
/// accesses and verifying that, for each use, it appears in the
/// appropriate def's use list
void MemorySSA::verifyDefUses(Function &F) const {
@@ -1733,8 +1797,12 @@ void MemorySSA::verifyDefUses(Function &F) const {
assert(Phi->getNumOperands() == static_cast<unsigned>(std::distance(
pred_begin(&B), pred_end(&B))) &&
"Incomplete MemoryPhi Node");
- for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
verifyUseInDefs(Phi->getIncomingValue(I), Phi);
+ assert(find(predecessors(&B), Phi->getIncomingBlock(I)) !=
+ pred_end(&B) &&
+ "Incoming phi block not a block predecessor");
+ }
}
for (Instruction &I : B) {
@@ -1769,7 +1837,7 @@ void MemorySSA::renumberBlock(const BasicBlock *B) const {
BlockNumberingValid.insert(B);
}
-/// \brief Determine, for two memory accesses in the same block,
+/// Determine, for two memory accesses in the same block,
/// whether \p Dominator dominates \p Dominatee.
/// \returns True if \p Dominator dominates \p Dominatee.
bool MemorySSA::locallyDominates(const MemoryAccess *Dominator,
@@ -1844,12 +1912,24 @@ void MemoryAccess::print(raw_ostream &OS) const {
void MemoryDef::print(raw_ostream &OS) const {
MemoryAccess *UO = getDefiningAccess();
+ auto printID = [&OS](MemoryAccess *A) {
+ if (A && A->getID())
+ OS << A->getID();
+ else
+ OS << LiveOnEntryStr;
+ };
+
OS << getID() << " = MemoryDef(";
- if (UO && UO->getID())
- OS << UO->getID();
- else
- OS << LiveOnEntryStr;
- OS << ')';
+ printID(UO);
+ OS << ")";
+
+ if (isOptimized()) {
+ OS << "->";
+ printID(getOptimized());
+
+ if (Optional<AliasResult> AR = getOptimizedAccessType())
+ OS << " " << *AR;
+ }
}
void MemoryPhi::print(raw_ostream &OS) const {
@@ -1886,6 +1966,9 @@ void MemoryUse::print(raw_ostream &OS) const {
else
OS << LiveOnEntryStr;
OS << ')';
+
+ if (Optional<AliasResult> AR = getOptimizedAccessType())
+ OS << " " << *AR;
}
void MemoryAccess::dump() const {
@@ -1977,21 +2060,13 @@ void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
MUD->resetOptimized();
}
-/// \brief Walk the use-def chains starting at \p MA and find
+/// Walk the use-def chains starting at \p MA and find
/// the MemoryAccess that actually clobbers Loc.
///
/// \returns our clobbering memory access
MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) {
- MemoryAccess *New = Walker.findClobber(StartingAccess, Q);
-#ifdef EXPENSIVE_CHECKS
- MemoryAccess *NewNoCache = Walker.findClobber(StartingAccess, Q);
- assert(NewNoCache == New && "Cache made us hand back a different result?");
- (void)NewNoCache;
-#endif
- if (AutoResetWalker)
- resetClobberWalker();
- return New;
+ return Walker.findClobber(StartingAccess, Q);
}
MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
@@ -2023,10 +2098,10 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
: StartingUseOrDef;
MemoryAccess *Clobber = getClobberingMemoryAccess(DefiningAccess, Q);
- DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
- DEBUG(dbgs() << *StartingUseOrDef << "\n");
- DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
- DEBUG(dbgs() << *Clobber << "\n");
+ LLVM_DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
+ LLVM_DEBUG(dbgs() << *StartingUseOrDef << "\n");
+ LLVM_DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
+ LLVM_DEBUG(dbgs() << *Clobber << "\n");
return Clobber;
}
@@ -2038,24 +2113,23 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
return MA;
// If this is an already optimized use or def, return the optimized result.
- // Note: Currently, we do not store the optimized def result because we'd need
- // a separate field, since we can't use it as the defining access.
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
- if (MUD->isOptimized())
- return MUD->getOptimized();
+ // Note: Currently, we store the optimized def result in a separate field,
+ // since we can't use the defining access.
+ if (StartingAccess->isOptimized())
+ return StartingAccess->getOptimized();
const Instruction *I = StartingAccess->getMemoryInst();
UpwardsMemoryQuery Q(I, StartingAccess);
- // We can't sanely do anything with a fences, they conservatively
- // clobber all memory, and have no locations to get pointers from to
- // try to disambiguate.
+ // We can't sanely do anything with fences, since they conservatively clobber
+ // all memory, and have no locations to get pointers from to try to
+ // disambiguate.
if (!Q.IsCall && I->isFenceLike())
return StartingAccess;
if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) {
MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef();
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
- MUD->setOptimized(LiveOnEntry);
+ StartingAccess->setOptimized(LiveOnEntry);
+ StartingAccess->setOptimizedAccessType(None);
return LiveOnEntry;
}
@@ -2064,16 +2138,23 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
// At this point, DefiningAccess may be the live on entry def.
// If it is, we will not get a better result.
- if (MSSA->isLiveOnEntryDef(DefiningAccess))
+ if (MSSA->isLiveOnEntryDef(DefiningAccess)) {
+ StartingAccess->setOptimized(DefiningAccess);
+ StartingAccess->setOptimizedAccessType(None);
return DefiningAccess;
+ }
MemoryAccess *Result = getClobberingMemoryAccess(DefiningAccess, Q);
- DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
- DEBUG(dbgs() << *DefiningAccess << "\n");
- DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
- DEBUG(dbgs() << *Result << "\n");
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
- MUD->setOptimized(Result);
+ LLVM_DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
+ LLVM_DEBUG(dbgs() << *DefiningAccess << "\n");
+ LLVM_DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
+ LLVM_DEBUG(dbgs() << *Result << "\n");
+
+ StartingAccess->setOptimized(Result);
+ if (MSSA->isLiveOnEntryDef(Result))
+ StartingAccess->setOptimizedAccessType(None);
+ else if (Q.AR == MustAlias)
+ StartingAccess->setOptimizedAccessType(MustAlias);
return Result;
}
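With this change the walker records the optimized clobber on the access itself, together with an Optional<AliasResult> describing how the clobber aliases the queried location. A sketch of reading that back, assuming the accessors added here are publicly visible; the function and its uses are illustrative:

    #include "llvm/Analysis/MemorySSA.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    static bool clobberIsDefiniteOverwrite(MemorySSA &MSSA, Instruction *I) {
      auto *MUD = dyn_cast_or_null<MemoryUseOrDef>(MSSA.getMemoryAccess(I));
      if (!MUD)
        return false;
      // The walk populates the per-access cache, so repeated queries are cheap.
      (void)MSSA.getWalker()->getClobberingMemoryAccess(MUD);
      // MustAlias means the cached clobber definitely writes the queried location.
      if (Optional<AliasResult> AR = MUD->getOptimizedAccessType())
        return *AR == MustAlias;
      return false;
    }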
diff --git a/contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp b/contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp
index f5d89f699a5a..abe2b3c25a58 100644
--- a/contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp
+++ b/contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -37,36 +37,45 @@ using namespace llvm;
// that there are two or more definitions needing to be merged.
// This still will leave non-minimal form in the case of irreducible control
// flow, where phi nodes may be in cycles with themselves, but unnecessary.
-MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(BasicBlock *BB) {
- // Single predecessor case, just recurse, we can only have one definition.
+MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(
+ BasicBlock *BB,
+ DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> &CachedPreviousDef) {
+ // First, do a cache lookup. Without this cache, certain CFG structures
+ // (like a series of if statements) take exponential time to visit.
+ auto Cached = CachedPreviousDef.find(BB);
+ if (Cached != CachedPreviousDef.end()) {
+ return Cached->second;
+ }
+
if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- return getPreviousDefFromEnd(Pred);
- } else if (VisitedBlocks.count(BB)) {
+ // Single predecessor case, just recurse, we can only have one definition.
+ MemoryAccess *Result = getPreviousDefFromEnd(Pred, CachedPreviousDef);
+ CachedPreviousDef.insert({BB, Result});
+ return Result;
+ }
+
+ if (VisitedBlocks.count(BB)) {
// We hit our node again, meaning we had a cycle, we must insert a phi
// node to break it so we have an operand. The only case this will
// insert useless phis is if we have irreducible control flow.
- return MSSA->createMemoryPhi(BB);
- } else if (VisitedBlocks.insert(BB).second) {
+ MemoryAccess *Result = MSSA->createMemoryPhi(BB);
+ CachedPreviousDef.insert({BB, Result});
+ return Result;
+ }
+
+ if (VisitedBlocks.insert(BB).second) {
// Mark us visited so we can detect a cycle
- SmallVector<MemoryAccess *, 8> PhiOps;
+ SmallVector<TrackingVH<MemoryAccess>, 8> PhiOps;
// Recurse to get the values in our predecessors for placement of a
// potential phi node. This will insert phi nodes if we cycle in order to
// break the cycle and have an operand.
for (auto *Pred : predecessors(BB))
- PhiOps.push_back(getPreviousDefFromEnd(Pred));
+ PhiOps.push_back(getPreviousDefFromEnd(Pred, CachedPreviousDef));
// Now try to simplify the ops to avoid placing a phi.
// This may return null if we never created a phi yet, that's okay
MemoryPhi *Phi = dyn_cast_or_null<MemoryPhi>(MSSA->getMemoryAccess(BB));
- bool PHIExistsButNeedsUpdate = false;
- // See if the existing phi operands match what we need.
- // Unlike normal SSA, we only allow one phi node per block, so we can't just
- // create a new one.
- if (Phi && Phi->getNumOperands() != 0)
- if (!std::equal(Phi->op_begin(), Phi->op_end(), PhiOps.begin())) {
- PHIExistsButNeedsUpdate = true;
- }
// See if we can avoid the phi by simplifying it.
auto *Result = tryRemoveTrivialPhi(Phi, PhiOps);
@@ -75,14 +84,20 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(BasicBlock *BB) {
if (!Phi)
Phi = MSSA->createMemoryPhi(BB);
- // These will have been filled in by the recursive read we did above.
- if (PHIExistsButNeedsUpdate) {
- std::copy(PhiOps.begin(), PhiOps.end(), Phi->op_begin());
- std::copy(pred_begin(BB), pred_end(BB), Phi->block_begin());
+ // See if the existing phi operands match what we need.
+ // Unlike normal SSA, we only allow one phi node per block, so we can't just
+ // create a new one.
+ if (Phi->getNumOperands() != 0) {
+ // FIXME: Figure out whether this is dead code and if so remove it.
+ if (!std::equal(Phi->op_begin(), Phi->op_end(), PhiOps.begin())) {
+ // These will have been filled in by the recursive read we did above.
+ std::copy(PhiOps.begin(), PhiOps.end(), Phi->op_begin());
+ std::copy(pred_begin(BB), pred_end(BB), Phi->block_begin());
+ }
} else {
unsigned i = 0;
for (auto *Pred : predecessors(BB))
- Phi->addIncoming(PhiOps[i++], Pred);
+ Phi->addIncoming(&*PhiOps[i++], Pred);
InsertedPHIs.push_back(Phi);
}
Result = Phi;
@@ -90,6 +105,7 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(BasicBlock *BB) {
// Set ourselves up for the next variable by resetting visited state.
VisitedBlocks.erase(BB);
+ CachedPreviousDef.insert({BB, Result});
return Result;
}
llvm_unreachable("Should have hit one of the three cases above");
@@ -100,9 +116,10 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(BasicBlock *BB) {
// it continues globally, creating phi nodes to ensure we have a single
// definition.
MemoryAccess *MemorySSAUpdater::getPreviousDef(MemoryAccess *MA) {
- auto *LocalResult = getPreviousDefInBlock(MA);
-
- return LocalResult ? LocalResult : getPreviousDefRecursive(MA->getBlock());
+ if (auto *LocalResult = getPreviousDefInBlock(MA))
+ return LocalResult;
+ DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> CachedPreviousDef;
+ return getPreviousDefRecursive(MA->getBlock(), CachedPreviousDef);
}
// This starts at the memory access, and goes backwards in the block to the find
@@ -133,13 +150,15 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefInBlock(MemoryAccess *MA) {
}
// This starts at the end of block
-MemoryAccess *MemorySSAUpdater::getPreviousDefFromEnd(BasicBlock *BB) {
+MemoryAccess *MemorySSAUpdater::getPreviousDefFromEnd(
+ BasicBlock *BB,
+ DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> &CachedPreviousDef) {
auto *Defs = MSSA->getWritableBlockDefs(BB);
if (Defs)
return &*Defs->rbegin();
- return getPreviousDefRecursive(BB);
+ return getPreviousDefRecursive(BB, CachedPreviousDef);
}
// Recurse over a set of phi uses to eliminate the trivial ones
MemoryAccess *MemorySSAUpdater::recursePhi(MemoryAccess *Phi) {
@@ -165,6 +184,10 @@ MemoryAccess *MemorySSAUpdater::recursePhi(MemoryAccess *Phi) {
template <class RangeType>
MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi,
RangeType &Operands) {
+ // Bail out on non-opt Phis.
+ if (NonOptPhis.count(Phi))
+ return Phi;
+
// Detect equal or self arguments
MemoryAccess *Same = nullptr;
for (auto &Op : Operands) {
@@ -174,7 +197,7 @@ MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi,
// not the same, return the phi since it's not eliminatable by us
if (Same)
return Phi;
- Same = cast<MemoryAccess>(Op);
+ Same = cast<MemoryAccess>(&*Op);
}
// Never found a non-self reference, the phi is undef
if (Same == nullptr)
@@ -230,10 +253,8 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
InsertedPHIs.clear();
// See if we had a local def, and if not, go hunting.
- MemoryAccess *DefBefore = getPreviousDefInBlock(MD);
- bool DefBeforeSameBlock = DefBefore != nullptr;
- if (!DefBefore)
- DefBefore = getPreviousDefRecursive(MD->getBlock());
+ MemoryAccess *DefBefore = getPreviousDef(MD);
+ bool DefBeforeSameBlock = DefBefore->getBlock() == MD->getBlock();
// There is a def before us, which means we can replace any store/phi uses
// of that thing with us, since we are in the way of whatever was there
@@ -255,8 +276,7 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
// above and reset ourselves.
MD->setDefiningAccess(DefBefore);
- SmallVector<MemoryAccess *, 8> FixupList(InsertedPHIs.begin(),
- InsertedPHIs.end());
+ SmallVector<WeakVH, 8> FixupList(InsertedPHIs.begin(), InsertedPHIs.end());
if (!DefBeforeSameBlock) {
// If there was a local def before us, we must have the same effect it
// did. Because every may-def is the same, any phis/etc we would create, it
@@ -277,7 +297,7 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
fixupDefs(FixupList);
FixupList.clear();
// Put any new phis on the fixup list, and process them
- FixupList.append(InsertedPHIs.end() - StartingPHISize, InsertedPHIs.end());
+ FixupList.append(InsertedPHIs.begin() + StartingPHISize, InsertedPHIs.end());
}
// Now that all fixups are done, rename all uses if we are asked.
if (RenameUses) {
@@ -294,19 +314,29 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
MSSA->renamePass(MD->getBlock(), FirstDef, Visited);
// We just inserted a phi into this block, so the incoming value will become
// the phi anyway, so it does not matter what we pass.
- for (auto *MP : InsertedPHIs)
- MSSA->renamePass(MP->getBlock(), nullptr, Visited);
+ for (auto &MP : InsertedPHIs) {
+ MemoryPhi *Phi = dyn_cast_or_null<MemoryPhi>(MP);
+ if (Phi)
+ MSSA->renamePass(Phi->getBlock(), nullptr, Visited);
+ }
}
}
-void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<MemoryAccess *> &Vars) {
+void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<WeakVH> &Vars) {
SmallPtrSet<const BasicBlock *, 8> Seen;
SmallVector<const BasicBlock *, 16> Worklist;
- for (auto *NewDef : Vars) {
+ for (auto &Var : Vars) {
+ MemoryAccess *NewDef = dyn_cast_or_null<MemoryAccess>(Var);
+ if (!NewDef)
+ continue;
// First, see if there is a local def after the operand.
auto *Defs = MSSA->getWritableBlockDefs(NewDef->getBlock());
auto DefIter = NewDef->getDefsIterator();
+ // The temporary Phi is being fixed; remove it from the do-not-optimize set.
+ if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(NewDef))
+ NonOptPhis.erase(Phi);
+
// If there is a local def after us, we only have to rename that.
if (++DefIter != Defs->end()) {
cast<MemoryDef>(DefIter)->setDefiningAccess(NewDef);
@@ -366,6 +396,11 @@ void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<MemoryAccess *> &Vars) {
template <class WhereType>
void MemorySSAUpdater::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
WhereType Where) {
+ // Mark MemoryPhi users of What not to be optimized.
+ for (auto *U : What->users())
+ if (MemoryPhi *PhiUser = dyn_cast<MemoryPhi>(U))
+ NonOptPhis.insert(PhiUser);
+
// Replace all our users with our defining access.
What->replaceAllUsesWith(What->getDefiningAccess());
@@ -377,6 +412,10 @@ void MemorySSAUpdater::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
insertDef(MD);
else
insertUse(cast<MemoryUse>(What));
+
+ // Clear dangling pointers. We added all MemoryPhi users, but not all
+ // of them are removed by fixupDefs().
+ NonOptPhis.clear();
}
// Move What before Where in the MemorySSA IR.
@@ -394,7 +433,57 @@ void MemorySSAUpdater::moveToPlace(MemoryUseOrDef *What, BasicBlock *BB,
return moveTo(What, BB, Where);
}
-/// \brief If all arguments of a MemoryPHI are defined by the same incoming
+// All accesses in To used to be in From. Move to end and update access lists.
+void MemorySSAUpdater::moveAllAccesses(BasicBlock *From, BasicBlock *To,
+ Instruction *Start) {
+
+ MemorySSA::AccessList *Accs = MSSA->getWritableBlockAccesses(From);
+ if (!Accs)
+ return;
+
+ MemoryAccess *FirstInNew = nullptr;
+ for (Instruction &I : make_range(Start->getIterator(), To->end()))
+ if ((FirstInNew = MSSA->getMemoryAccess(&I)))
+ break;
+ if (!FirstInNew)
+ return;
+
+ auto *MUD = cast<MemoryUseOrDef>(FirstInNew);
+ do {
+ auto NextIt = ++MUD->getIterator();
+ MemoryUseOrDef *NextMUD = (!Accs || NextIt == Accs->end())
+ ? nullptr
+ : cast<MemoryUseOrDef>(&*NextIt);
+ MSSA->moveTo(MUD, To, MemorySSA::End);
+ // Moving MUD out of Accs in the moveTo above may delete Accs, so we need
+ // to retrieve it again.
+ Accs = MSSA->getWritableBlockAccesses(From);
+ MUD = NextMUD;
+ } while (MUD);
+}
+
+void MemorySSAUpdater::moveAllAfterSpliceBlocks(BasicBlock *From,
+ BasicBlock *To,
+ Instruction *Start) {
+ assert(MSSA->getBlockAccesses(To) == nullptr &&
+ "To block is expected to be free of MemoryAccesses.");
+ moveAllAccesses(From, To, Start);
+ for (BasicBlock *Succ : successors(To))
+ if (MemoryPhi *MPhi = MSSA->getMemoryAccess(Succ))
+ MPhi->setIncomingBlock(MPhi->getBasicBlockIndex(From), To);
+}
+
+void MemorySSAUpdater::moveAllAfterMergeBlocks(BasicBlock *From, BasicBlock *To,
+ Instruction *Start) {
+ assert(From->getSinglePredecessor() == To &&
+ "From block is expected to have a single predecessor (To).");
+ moveAllAccesses(From, To, Start);
+ for (BasicBlock *Succ : successors(From))
+ if (MemoryPhi *MPhi = MSSA->getMemoryAccess(Succ))
+ MPhi->setIncomingBlock(MPhi->getBasicBlockIndex(From), To);
+}
+
+/// If all arguments of a MemoryPHI are defined by the same incoming
/// argument, return that argument.
static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
MemoryAccess *MA = nullptr;
@@ -408,6 +497,35 @@ static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
return MA;
}
+void MemorySSAUpdater::wireOldPredecessorsToNewImmediatePredecessor(
+ BasicBlock *Old, BasicBlock *New, ArrayRef<BasicBlock *> Preds) {
+ assert(!MSSA->getWritableBlockAccesses(New) &&
+ "Access list should be null for a new block.");
+ MemoryPhi *Phi = MSSA->getMemoryAccess(Old);
+ if (!Phi)
+ return;
+ if (pred_size(Old) == 1) {
+ assert(pred_size(New) == Preds.size() &&
+ "Should have moved all predecessors.");
+ MSSA->moveTo(Phi, New, MemorySSA::Beginning);
+ } else {
+ assert(!Preds.empty() && "Must be moving at least one predecessor to the "
+ "new immediate predecessor.");
+ MemoryPhi *NewPhi = MSSA->createMemoryPhi(New);
+ SmallPtrSet<BasicBlock *, 16> PredsSet(Preds.begin(), Preds.end());
+ Phi->unorderedDeleteIncomingIf([&](MemoryAccess *MA, BasicBlock *B) {
+ if (PredsSet.count(B)) {
+ NewPhi->addIncoming(MA, B);
+ return true;
+ }
+ return false;
+ });
+ Phi->addIncoming(NewPhi, New);
+ if (onlySingleValue(NewPhi))
+ removeMemoryAccess(NewPhi);
+ }
+}
+
void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA) {
assert(!MSSA->isLiveOnEntryDef(MA) &&
"Trying to remove the live on entry def");
@@ -456,6 +574,39 @@ void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA) {
MSSA->removeFromLists(MA);
}
+void MemorySSAUpdater::removeBlocks(
+ const SmallPtrSetImpl<BasicBlock *> &DeadBlocks) {
+ // First delete all uses of BB in MemoryPhis.
+ for (BasicBlock *BB : DeadBlocks) {
+ TerminatorInst *TI = BB->getTerminator();
+ assert(TI && "Basic block expected to have a terminator instruction");
+ for (BasicBlock *Succ : TI->successors())
+ if (!DeadBlocks.count(Succ))
+ if (MemoryPhi *MP = MSSA->getMemoryAccess(Succ)) {
+ MP->unorderedDeleteIncomingBlock(BB);
+ if (MP->getNumIncomingValues() == 1)
+ removeMemoryAccess(MP);
+ }
+ // Drop all references of all accesses in BB
+ if (MemorySSA::AccessList *Acc = MSSA->getWritableBlockAccesses(BB))
+ for (MemoryAccess &MA : *Acc)
+ MA.dropAllReferences();
+ }
+
+ // Next, delete all memory accesses in each block
+ for (BasicBlock *BB : DeadBlocks) {
+ MemorySSA::AccessList *Acc = MSSA->getWritableBlockAccesses(BB);
+ if (!Acc)
+ continue;
+ for (auto AB = Acc->begin(), AE = Acc->end(); AB != AE;) {
+ MemoryAccess *MA = &*AB;
+ ++AB;
+ MSSA->removeFromLookups(MA);
+ MSSA->removeFromLists(MA);
+ }
+ }
+}
+
MemoryAccess *MemorySSAUpdater::createMemoryAccessInBB(
Instruction *I, MemoryAccess *Definition, const BasicBlock *BB,
MemorySSA::InsertionPlace Point) {
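The updater grows block-level helpers (moveAllAccesses, moveAllAfterSpliceBlocks, moveAllAfterMergeBlocks, wireOldPredecessorsToNewImmediatePredecessor, removeBlocks) so passes that restructure the CFG can keep MemorySSA in sync. A much-simplified sketch of two of them; the IR splicing and branch cleanup are assumed to happen elsewhere, and the helper names are invented:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Analysis/MemorySSAUpdater.h"
    #include "llvm/IR/BasicBlock.h"
    using namespace llvm;

    // From has To as its single predecessor and its instructions were just
    // spliced onto the end of To; Start is the first instruction that moved.
    static void updateAfterMerge(MemorySSAUpdater &MSSAU, BasicBlock *From,
                                 BasicBlock *To, Instruction *Start) {
      // Moves the MemoryUses/MemoryDefs of the spliced range into To's access
      // list and repoints MemoryPhi incoming blocks in From's successors to To.
      MSSAU.moveAllAfterMergeBlocks(From, To, Start);
    }

    // Once a region of blocks has become unreachable, drop its accesses and
    // the corresponding MemoryPhi incoming edges in one call.
    static void dropDeadRegion(MemorySSAUpdater &MSSAU,
                               const SmallPtrSetImpl<BasicBlock *> &DeadBlocks) {
      MSSAU.removeBlocks(DeadBlocks);
    }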
diff --git a/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index efa5bd564ad0..17dae20ce3a1 100644
--- a/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -49,6 +49,7 @@
#include "llvm/Object/SymbolicFile.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -58,6 +59,18 @@ using namespace llvm;
#define DEBUG_TYPE "module-summary-analysis"
+// Option to force edges cold which will block importing when the
+// -import-cold-multiplier is set to 0. Useful for debugging.
+FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold =
+ FunctionSummary::FSHT_None;
+cl::opt<FunctionSummary::ForceSummaryHotnessType, true> FSEC(
+ "force-summary-edges-cold", cl::Hidden, cl::location(ForceSummaryEdgesCold),
+ cl::desc("Force all edges in the function summary to cold"),
+ cl::values(clEnumValN(FunctionSummary::FSHT_None, "none", "None."),
+ clEnumValN(FunctionSummary::FSHT_AllNonCritical,
+ "all-non-critical", "All non-critical edges."),
+ clEnumValN(FunctionSummary::FSHT_All, "all", "All edges.")));
+
// Walk through the operands of a given User via worklist iteration and populate
// the set of GlobalValue references encountered. Invoked either on an
// Instruction or a GlobalVariable (which walks its initializer).
@@ -268,14 +281,23 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
auto ScaledCount = PSI->getProfileCount(&I, BFI);
auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
: CalleeInfo::HotnessType::Unknown;
+ if (ForceSummaryEdgesCold != FunctionSummary::FSHT_None)
+ Hotness = CalleeInfo::HotnessType::Cold;
// Use the original CalledValue, in case it was an alias. We want
// to record the call edge to the alias in that case. Eventually
// an alias summary will be created to associate the alias and
// aliasee.
- CallGraphEdges[Index.getOrInsertValueInfo(
- cast<GlobalValue>(CalledValue))]
- .updateHotness(Hotness);
+ auto &ValueInfo = CallGraphEdges[Index.getOrInsertValueInfo(
+ cast<GlobalValue>(CalledValue))];
+ ValueInfo.updateHotness(Hotness);
+ // Add the relative block frequency to CalleeInfo if there is no profile
+ // information.
+ if (BFI != nullptr && Hotness == CalleeInfo::HotnessType::Unknown) {
+ uint64_t BBFreq = BFI->getBlockFreq(&BB).getFrequency();
+ uint64_t EntryFreq = BFI->getEntryFreq();
+ ValueInfo.updateRelBlockFreq(BBFreq, EntryFreq);
+ }
} else {
// Skip inline assembly calls.
if (CI && CI->isInlineAsm())
@@ -284,6 +306,18 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
if (!CalledValue || isa<Constant>(CalledValue))
continue;
+ // Check if the instruction has a callees metadata. If so, add callees
+ // to CallGraphEdges to reflect the references from the metadata, and
+ // to enable importing for subsequent indirect call promotion and
+ // inlining.
+ if (auto *MD = I.getMetadata(LLVMContext::MD_callees)) {
+ for (auto &Op : MD->operands()) {
+ Function *Callee = mdconst::extract_or_null<Function>(Op);
+ if (Callee)
+ CallGraphEdges[Index.getOrInsertValueInfo(Callee)];
+ }
+ }
+
uint32_t NumVals, NumCandidates;
uint64_t TotalCount;
auto CandidateProfileData =
@@ -299,7 +333,9 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// sample PGO, to enable the same inlines as the profiled optimized binary.
for (auto &I : F.getImportGUIDs())
CallGraphEdges[Index.getOrInsertValueInfo(I)].updateHotness(
- CalleeInfo::HotnessType::Critical);
+ ForceSummaryEdgesCold == FunctionSummary::FSHT_All
+ ? CalleeInfo::HotnessType::Cold
+ : CalleeInfo::HotnessType::Critical);
bool NonRenamableLocal = isNonRenamableLocal(F);
bool NotEligibleForImport =
@@ -325,7 +361,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
TypeCheckedLoadConstVCalls.takeVector());
if (NonRenamableLocal)
CantBePromoted.insert(F.getGUID());
- Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary));
+ Index.addGlobalValueSummary(F, std::move(FuncSummary));
}
static void
@@ -341,7 +377,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
if (NonRenamableLocal)
CantBePromoted.insert(V.getGUID());
- Index.addGlobalValueSummary(V.getName(), std::move(GVarSummary));
+ Index.addGlobalValueSummary(V, std::move(GVarSummary));
}
static void
@@ -357,7 +393,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
AS->setAliasee(AliaseeSummary);
if (NonRenamableLocal)
CantBePromoted.insert(A.getGUID());
- Index.addGlobalValueSummary(A.getName(), std::move(AS));
+ Index.addGlobalValueSummary(A, std::move(AS));
}
// Set LiveRoot flag on entries matching the given value name.
@@ -372,7 +408,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback,
ProfileSummaryInfo *PSI) {
assert(PSI);
- ModuleSummaryIndex Index;
+ ModuleSummaryIndex Index(/*HaveGVs=*/true);
// Identify the local values in the llvm.used and llvm.compiler.used sets,
// which should not be exported as they would then require renaming and
@@ -419,7 +455,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
/* NotEligibleToImport = */ true,
/* Live = */ true,
/* Local */ GV->isDSOLocal());
- CantBePromoted.insert(GlobalValue::getGUID(Name));
+ CantBePromoted.insert(GV->getGUID());
// Create the appropriate summary type.
if (Function *F = dyn_cast<Function>(GV)) {
std::unique_ptr<FunctionSummary> Summary =
@@ -436,12 +472,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
ArrayRef<FunctionSummary::VFuncId>{},
ArrayRef<FunctionSummary::ConstVCall>{},
ArrayRef<FunctionSummary::ConstVCall>{});
- Index.addGlobalValueSummary(Name, std::move(Summary));
+ Index.addGlobalValueSummary(*GV, std::move(Summary));
} else {
std::unique_ptr<GlobalVarSummary> Summary =
llvm::make_unique<GlobalVarSummary>(GVFlags,
ArrayRef<ValueInfo>{});
- Index.addGlobalValueSummary(Name, std::move(Summary));
+ Index.addGlobalValueSummary(*GV, std::move(Summary));
}
});
}
@@ -571,14 +607,14 @@ ModuleSummaryIndexWrapperPass::ModuleSummaryIndexWrapperPass()
bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) {
auto &PSI = *getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- Index = buildModuleSummaryIndex(
+ Index.emplace(buildModuleSummaryIndex(
M,
[this](const Function &F) {
return &(this->getAnalysis<BlockFrequencyInfoWrapperPass>(
*const_cast<Function *>(&F))
.getBFI());
},
- &PSI);
+ &PSI));
return false;
}
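One of the hunks above teaches the summary builder to record call-graph edges from !callees metadata on indirect calls, so the listed targets become importable. A minimal sketch of producing that metadata, assuming MDBuilder's createCallees helper; the surrounding function is illustrative:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // Annotate an indirect call with the set of functions it may reach; the
    // module summary analysis will then add an edge for each listed callee.
    static void annotateIndirectCall(CallInst *CI, ArrayRef<Function *> Targets) {
      MDBuilder MDB(CI->getContext());
      CI->setMetadata(LLVMContext::MD_callees, MDB.createCallees(Targets));
    }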
diff --git a/contrib/llvm/lib/Analysis/MustExecute.cpp b/contrib/llvm/lib/Analysis/MustExecute.cpp
new file mode 100644
index 000000000000..fc4049874622
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/MustExecute.cpp
@@ -0,0 +1,269 @@
+//===- MustExecute.cpp - Printer for isGuaranteedToExecute ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// Computes loop safety information, checks loop body & header
+/// for the possibility of may throw exception.
+///
+void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
+ assert(CurLoop != nullptr && "CurLoop can't be null");
+ BasicBlock *Header = CurLoop->getHeader();
+ // Setting default safety values.
+ SafetyInfo->MayThrow = false;
+ SafetyInfo->HeaderMayThrow = false;
+ // Iterate over header and compute safety info.
+ SafetyInfo->HeaderMayThrow =
+ !isGuaranteedToTransferExecutionToSuccessor(Header);
+
+ SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+ // Iterate over loop instructions and compute safety info.
+ // Skip header as it has been computed and stored in HeaderMayThrow.
+ // The first block in loopinfo.Blocks is guaranteed to be the header.
+ assert(Header == *CurLoop->getBlocks().begin() &&
+ "First block must be header");
+ for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
+ BBE = CurLoop->block_end();
+ (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
+ SafetyInfo->MayThrow |=
+ !isGuaranteedToTransferExecutionToSuccessor(*BB);
+
+ // Compute funclet colors if we might sink/hoist in a function with a funclet
+ // personality routine.
+ Function *Fn = CurLoop->getHeader()->getParent();
+ if (Fn->hasPersonalityFn())
+ if (Constant *PersonalityFn = Fn->getPersonalityFn())
+ if (isScopedEHPersonality(classifyEHPersonality(PersonalityFn)))
+ SafetyInfo->BlockColors = colorEHFunclets(*Fn);
+}
+
+/// Return true if we can prove that the given ExitBlock is not reached on the
+/// first iteration of the given loop. That is, the backedge of the loop must
+/// be executed before the ExitBlock is executed in any dynamic execution trace.
+static bool CanProveNotTakenFirstIteration(BasicBlock *ExitBlock,
+ const DominatorTree *DT,
+ const Loop *CurLoop) {
+ auto *CondExitBlock = ExitBlock->getSinglePredecessor();
+ if (!CondExitBlock)
+ // expect unique exits
+ return false;
+ assert(CurLoop->contains(CondExitBlock) && "meaning of exit block");
+ auto *BI = dyn_cast<BranchInst>(CondExitBlock->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+ // If condition is constant and false leads to ExitBlock then we always
+ // execute the true branch.
+ if (auto *Cond = dyn_cast<ConstantInt>(BI->getCondition()))
+ return BI->getSuccessor(Cond->getZExtValue() ? 1 : 0) == ExitBlock;
+ auto *Cond = dyn_cast<CmpInst>(BI->getCondition());
+ if (!Cond)
+ return false;
+ // TODO: this would be a lot more powerful if we used SCEV, but all the
+ // plumbing is currently missing to pass a pointer in from the pass.
+ // Check for cmp (phi [x, preheader] ...), y where (pred x, y) is known
+ auto *LHS = dyn_cast<PHINode>(Cond->getOperand(0));
+ auto *RHS = Cond->getOperand(1);
+ if (!LHS || LHS->getParent() != CurLoop->getHeader())
+ return false;
+ auto DL = ExitBlock->getModule()->getDataLayout();
+ auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+ auto *SimpleValOrNull = SimplifyCmpInst(Cond->getPredicate(),
+ IVStart, RHS,
+ {DL, /*TLI*/ nullptr,
+ DT, /*AC*/ nullptr, BI});
+ auto *SimpleCst = dyn_cast_or_null<Constant>(SimpleValOrNull);
+ if (!SimpleCst)
+ return false;
+ if (ExitBlock == BI->getSuccessor(0))
+ return SimpleCst->isZeroValue();
+ assert(ExitBlock == BI->getSuccessor(1) && "implied by above");
+ return SimpleCst->isAllOnesValue();
+}
+
+/// Returns true if the instruction in a loop is guaranteed to execute at least
+/// once.
+bool llvm::isGuaranteedToExecute(const Instruction &Inst,
+ const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo) {
+ // We have to check to make sure that the instruction dominates all
+ // of the exit blocks. If it doesn't, then there is a path out of the loop
+ // which does not execute this instruction, so we can't hoist it.
+
+ // If the instruction is in the header block for the loop (which is very
+ // common), it is always guaranteed to dominate the exit blocks. Since this
+ // is a common case, and can save some work, check it now.
+ if (Inst.getParent() == CurLoop->getHeader())
+ // If there's a throw in the header block, we can't guarantee we'll reach
+ // Inst unless we can prove that Inst comes before the potential implicit
+ // exit. At the moment, we use a (cheap) hack for the common case where
+ // the instruction of interest is the first one in the block.
+ return !SafetyInfo->HeaderMayThrow ||
+ Inst.getParent()->getFirstNonPHIOrDbg() == &Inst;
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (SafetyInfo->MayThrow)
+ return false;
+
+ // Note: There are two styles of reasoning intermixed below for
+ // implementation efficiency reasons. They are:
+ // 1) If we can prove that the instruction dominates all exit blocks, then we
+ // know the instruction must have executed on *some* iteration before we
+ // exit. We do not prove *which* iteration the instruction must execute on.
+ // 2) If we can prove that the instruction dominates the latch and all exits
+ // which might be taken on the first iteration, we know the instruction must
+ // execute on the first iteration. This second style allows a conditional
+ // exit before the instruction of interest which is provably not taken on the
+ // first iteration. This is quite a common case for range-check-like
+ // patterns. TODO: support loops with multiple latches.
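+ // For example (illustrative): if the header performs a range check
+ // "if (i >= len) goto exit;" and the interesting instruction sits between
+ // that check and the latch, style 2 still proves first-iteration execution
+ // whenever the check provably passes for the initial value of i, even
+ // though the instruction does not dominate that exit.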
+
+ const bool InstDominatesLatch =
+ CurLoop->getLoopLatch() != nullptr &&
+ DT->dominates(Inst.getParent(), CurLoop->getLoopLatch());
+
+ // Get the exit blocks for the current loop.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ // Verify that the block dominates each of the exit blocks of the loop.
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ if (!DT->dominates(Inst.getParent(), ExitBlock))
+ if (!InstDominatesLatch ||
+ !CanProveNotTakenFirstIteration(ExitBlock, DT, CurLoop))
+ return false;
+
+ // As a degenerate case, if the loop is statically infinite then we haven't
+ // proven anything since there are no exit blocks.
+ if (ExitBlocks.empty())
+ return false;
+
+ // FIXME: In general, we have to prove that the loop isn't an infinite loop.
+ // See http://llvm.org/PR24078. (The "ExitBlocks.empty()" check above is
+ // just a special case of this.)
+ return true;
+}
+
+
+namespace {
+ struct MustExecutePrinter : public FunctionPass {
+
+ static char ID; // Pass identification, replacement for typeid
+ MustExecutePrinter() : FunctionPass(ID) {
+ initializeMustExecutePrinterPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char MustExecutePrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(MustExecutePrinter, "print-mustexecute",
+ "Instructions which execute on loop entry", false, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(MustExecutePrinter, "print-mustexecute",
+ "Instructions which execute on loop entry", false, true)
+
+FunctionPass *llvm::createMustExecutePrinter() {
+ return new MustExecutePrinter();
+}
+
+static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) {
+ // TODO: merge these two routines. For the moment, we display the best
+ // result obtained by *either* implementation. This is a bit unfair since no
+ // caller actually gets the full power at the moment.
+ LoopSafetyInfo LSI;
+ computeLoopSafetyInfo(&LSI, L);
+ return isGuaranteedToExecute(I, DT, L, &LSI) ||
+ isGuaranteedToExecuteForEveryIteration(&I, L);
+}
+
+namespace {
+/// An assembly annotator class to print must execute information in
+/// comments.
+class MustExecuteAnnotatedWriter : public AssemblyAnnotationWriter {
+ DenseMap<const Value*, SmallVector<Loop*, 4> > MustExec;
+
+public:
+ MustExecuteAnnotatedWriter(const Function &F,
+ DominatorTree &DT, LoopInfo &LI) {
+ for (auto &I: instructions(F)) {
+ Loop *L = LI.getLoopFor(I.getParent());
+ while (L) {
+ if (isMustExecuteIn(I, L, &DT)) {
+ MustExec[&I].push_back(L);
+ }
+ L = L->getParentLoop();
+ }
+ }
+ }
+ MustExecuteAnnotatedWriter(const Module &M,
+ DominatorTree &DT, LoopInfo &LI) {
+ for (auto &F : M)
+ for (auto &I: instructions(F)) {
+ Loop *L = LI.getLoopFor(I.getParent());
+ while (L) {
+ if (isMustExecuteIn(I, L, &DT)) {
+ MustExec[&I].push_back(L);
+ }
+ L = L->getParentLoop();
+ }
+ }
+ }
+
+
+ void printInfoComment(const Value &V, formatted_raw_ostream &OS) override {
+ if (!MustExec.count(&V))
+ return;
+
+ const auto &Loops = MustExec.lookup(&V);
+ const auto NumLoops = Loops.size();
+ if (NumLoops > 1)
+ OS << " ; (mustexec in " << NumLoops << " loops: ";
+ else
+ OS << " ; (mustexec in: ";
+
+ bool first = true;
+ for (const Loop *L : Loops) {
+ if (!first)
+ OS << ", ";
+ first = false;
+ OS << L->getHeader()->getName();
+ }
+ OS << ")";
+ }
+};
+} // namespace
+
+bool MustExecutePrinter::runOnFunction(Function &F) {
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ MustExecuteAnnotatedWriter Writer(F, DT, LI);
+ F.print(dbgs(), &Writer);
+
+ return false;
+}
diff --git a/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp b/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp
index 55335f3a7cb0..d6db6386c38b 100644
--- a/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp
+++ b/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp
@@ -19,7 +19,7 @@
using namespace llvm;
using namespace llvm::objcarc;
-/// \brief A handy option to enable/disable all ARC Optimizations.
+/// A handy option to enable/disable all ARC Optimizations.
bool llvm::objcarc::EnableARCOpts;
static cl::opt<bool, true> EnableARCOptimizations(
"enable-objc-arc-opts", cl::desc("enable/disable all ARC Optimizations"),
diff --git a/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp
index f374dd33f86f..f268e2a9abdd 100644
--- a/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp
+++ b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp
@@ -209,6 +209,7 @@ static bool isInertIntrinsic(unsigned ID) {
// Don't let dbg info affect our results.
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
+ case Intrinsic::dbg_label:
// Short cut: Some intrinsics obviously don't use ObjC pointers.
return true;
default:
@@ -233,7 +234,7 @@ static bool isUseOnlyIntrinsic(unsigned ID) {
}
}
-/// \brief Determine what kind of construct V is.
+/// Determine what kind of construct V is.
ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) {
if (const Instruction *I = dyn_cast<Instruction>(V)) {
// Any instruction other than bitcast and gep with a pointer operand have a
@@ -331,7 +332,7 @@ ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) {
return ARCInstKind::None;
}
-/// \brief Test if the given class is a kind of user.
+/// Test if the given class is a kind of user.
bool llvm::objcarc::IsUser(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::User:
@@ -365,7 +366,7 @@ bool llvm::objcarc::IsUser(ARCInstKind Class) {
llvm_unreachable("covered switch isn't covered?");
}
-/// \brief Test if the given class is objc_retain or equivalent.
+/// Test if the given class is objc_retain or equivalent.
bool llvm::objcarc::IsRetain(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
@@ -401,7 +402,7 @@ bool llvm::objcarc::IsRetain(ARCInstKind Class) {
llvm_unreachable("covered switch isn't covered?");
}
-/// \brief Test if the given class is objc_autorelease or equivalent.
+/// Test if the given class is objc_autorelease or equivalent.
bool llvm::objcarc::IsAutorelease(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Autorelease:
@@ -435,7 +436,7 @@ bool llvm::objcarc::IsAutorelease(ARCInstKind Class) {
llvm_unreachable("covered switch isn't covered?");
}
-/// \brief Test if the given class represents instructions which return their
+/// Test if the given class represents instructions which return their
/// argument verbatim.
bool llvm::objcarc::IsForwarding(ARCInstKind Class) {
switch (Class) {
@@ -470,7 +471,7 @@ bool llvm::objcarc::IsForwarding(ARCInstKind Class) {
llvm_unreachable("covered switch isn't covered?");
}
-/// \brief Test if the given class represents instructions which do nothing if
+/// Test if the given class represents instructions which do nothing if
/// passed a null pointer.
bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) {
switch (Class) {
@@ -505,7 +506,7 @@ bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) {
llvm_unreachable("covered switch isn't covered?");
}
-/// \brief Test if the given class represents instructions which are always safe
+/// Test if the given class represents instructions which are always safe
/// to mark with the "tail" keyword.
bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) {
// ARCInstKind::RetainBlock may be given a stack argument.
@@ -541,7 +542,7 @@ bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) {
llvm_unreachable("covered switch isn't covered?");
}
-/// \brief Test if the given class represents instructions which are never safe
+/// Test if the given class represents instructions which are never safe
/// to mark with the "tail" keyword.
bool llvm::objcarc::IsNeverTail(ARCInstKind Class) {
/// It is never safe to tail call objc_autorelease since by tail calling
@@ -580,7 +581,7 @@ bool llvm::objcarc::IsNeverTail(ARCInstKind Class) {
llvm_unreachable("covered switch isn't covered?");
}
-/// \brief Test if the given class represents instructions which are always safe
+/// Test if the given class represents instructions which are always safe
/// to mark with the nounwind attribute.
bool llvm::objcarc::IsNoThrow(ARCInstKind Class) {
// objc_retainBlock is not nounwind because it calls user copy constructors
diff --git a/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp
index a04c0aef04be..6c47651eae9e 100644
--- a/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp
+++ b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp
@@ -30,7 +30,7 @@ OrderedBasicBlock::OrderedBasicBlock(const BasicBlock *BasicB)
LastInstFound = BB->end();
}
-/// \brief Given no cached results, find if \p A comes before \p B in \p BB.
+/// Given no cached results, find if \p A comes before \p B in \p BB.
/// Cache and number instructions while walking \p BB.
bool OrderedBasicBlock::comesBefore(const Instruction *A,
const Instruction *B) {
@@ -58,7 +58,7 @@ bool OrderedBasicBlock::comesBefore(const Instruction *A,
return Inst != B;
}
-/// \brief Find out whether \p A dominates \p B, meaning whether \p A
+/// Find out whether \p A dominates \p B, meaning whether \p A
/// comes before \p B in \p BB. This is a simplification that considers
/// cached instruction positions and ignores other basic blocks, being
/// only relevant to compare relative instructions positions inside \p BB.
diff --git a/contrib/llvm/lib/Analysis/PHITransAddr.cpp b/contrib/llvm/lib/Analysis/PHITransAddr.cpp
index 682af4dc708e..858f08f6537a 100644
--- a/contrib/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/contrib/llvm/lib/Analysis/PHITransAddr.cpp
@@ -14,6 +14,7 @@
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
diff --git a/contrib/llvm/lib/Analysis/PhiValues.cpp b/contrib/llvm/lib/Analysis/PhiValues.cpp
new file mode 100644
index 000000000000..ef121815d2cf
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/PhiValues.cpp
@@ -0,0 +1,196 @@
+//===- PhiValues.cpp - Phi Value Analysis ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/PhiValues.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+bool PhiValues::invalidate(Function &, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // PhiValues is invalidated if it isn't preserved.
+ auto PAC = PA.getChecker<PhiValuesAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>());
+}
+
+// The goal here is to find all of the non-phi values reachable from this phi,
+// and to do the same for all of the phis reachable from this phi, as doing so
+// is necessary anyway in order to get the values for this phi. We do this using
+// Tarjan's algorithm with Nuutila's improvements to find the strongly connected
+// components of the phi graph rooted in this phi:
+// * All phis in a strongly connected component will have the same reachable
+// non-phi values. The SCC may not be the maximal subgraph for that set of
+// reachable values, but finding that out isn't really necessary (it would
+// only reduce the amount of memory needed to store the values).
+// * Tarjan's algorithm completes components in a bottom-up manner, i.e. it
+// never completes a component before the components reachable from it have
+// been completed. This means that when we complete a component we have
+// everything we need to collect the values reachable from that component.
+// * We collect both the non-phi values reachable from each SCC, as that's what
+// we're ultimately interested in, and all of the reachable values, i.e.
+// including phis, as that makes invalidateValue easier.
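+//
+// Illustrative example: for the two-phi cycle
+//   a = phi(x, b)   b = phi(y, a)
+// a and b form a single SCC; its reachable set is {a, b, x, y} and its
+// non-phi reachable set is {x, y}, which is the result returned for either
+// phi by getValuesForPhi.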
+void PhiValues::processPhi(const PHINode *Phi,
+ SmallVector<const PHINode *, 8> &Stack) {
+ // Initialize the phi with the next depth number.
+ assert(DepthMap.lookup(Phi) == 0);
+ assert(NextDepthNumber != UINT_MAX);
+ unsigned int DepthNumber = ++NextDepthNumber;
+ DepthMap[Phi] = DepthNumber;
+
+ // Recursively process the incoming phis of this phi.
+ for (Value *PhiOp : Phi->incoming_values()) {
+ if (PHINode *PhiPhiOp = dyn_cast<PHINode>(PhiOp)) {
+ // Recurse if the phi has not yet been visited.
+ if (DepthMap.lookup(PhiPhiOp) == 0)
+ processPhi(PhiPhiOp, Stack);
+ assert(DepthMap.lookup(PhiPhiOp) != 0);
+ // If the phi did not become part of a component then this phi and that
+ // phi are part of the same component, so adjust the depth number.
+ if (!ReachableMap.count(DepthMap[PhiPhiOp]))
+ DepthMap[Phi] = std::min(DepthMap[Phi], DepthMap[PhiPhiOp]);
+ }
+ }
+
+ // Now that incoming phis have been handled, push this phi to the stack.
+ Stack.push_back(Phi);
+
+ // If the depth number has not changed then we've finished collecting the phis
+ // of a strongly connected component.
+ if (DepthMap[Phi] == DepthNumber) {
+ // Collect the reachable values for this component. The phis of this
+ // component will be those on top of the depth stack with the same or
+ // greater depth number.
+ ConstValueSet Reachable;
+ while (!Stack.empty() && DepthMap[Stack.back()] >= DepthNumber) {
+ const PHINode *ComponentPhi = Stack.pop_back_val();
+ Reachable.insert(ComponentPhi);
+ DepthMap[ComponentPhi] = DepthNumber;
+ for (Value *Op : ComponentPhi->incoming_values()) {
+ if (PHINode *PhiOp = dyn_cast<PHINode>(Op)) {
+ // If this phi is not part of the same component then that component
+ // is guaranteed to have been completed before this one. Therefore we
+ // can just add its reachable values to the reachable values of this
+ // component.
+ auto It = ReachableMap.find(DepthMap[PhiOp]);
+ if (It != ReachableMap.end())
+ Reachable.insert(It->second.begin(), It->second.end());
+ } else {
+ Reachable.insert(Op);
+ }
+ }
+ }
+ ReachableMap.insert({DepthNumber,Reachable});
+
+ // Filter out phis to get the non-phi reachable values.
+ ValueSet NonPhi;
+ for (const Value *V : Reachable)
+ if (!isa<PHINode>(V))
+ NonPhi.insert(const_cast<Value*>(V));
+ NonPhiReachableMap.insert({DepthNumber,NonPhi});
+ }
+}
+
+const PhiValues::ValueSet &PhiValues::getValuesForPhi(const PHINode *PN) {
+ if (DepthMap.count(PN) == 0) {
+ SmallVector<const PHINode *, 8> Stack;
+ processPhi(PN, Stack);
+ assert(Stack.empty());
+ }
+ assert(DepthMap.lookup(PN) != 0);
+ return NonPhiReachableMap[DepthMap[PN]];
+}
+
+void PhiValues::invalidateValue(const Value *V) {
+ // Components that can reach V are invalid.
+ SmallVector<unsigned int, 8> InvalidComponents;
+ for (auto &Pair : ReachableMap)
+ if (Pair.second.count(V))
+ InvalidComponents.push_back(Pair.first);
+
+ for (unsigned int N : InvalidComponents) {
+ for (const Value *V : ReachableMap[N])
+ if (const PHINode *PN = dyn_cast<PHINode>(V))
+ DepthMap.erase(PN);
+ NonPhiReachableMap.erase(N);
+ ReachableMap.erase(N);
+ }
+}
+
+void PhiValues::releaseMemory() {
+ DepthMap.clear();
+ NonPhiReachableMap.clear();
+ ReachableMap.clear();
+}
+
+void PhiValues::print(raw_ostream &OS) const {
+ // Iterate through the phi nodes of the function rather than iterating through
+ // DepthMap in order to get predictable ordering.
+ for (const BasicBlock &BB : F) {
+ for (const PHINode &PN : BB.phis()) {
+ OS << "PHI ";
+ PN.printAsOperand(OS, false);
+ OS << " has values:\n";
+ unsigned int N = DepthMap.lookup(&PN);
+ auto It = NonPhiReachableMap.find(N);
+ if (It == NonPhiReachableMap.end())
+ OS << " UNKNOWN\n";
+ else if (It->second.empty())
+ OS << " NONE\n";
+ else
+ for (Value *V : It->second)
+ // Printing of an instruction prints two spaces at the start, so
+ // handle instructions and everything else slightly differently in
+ // order to get consistent indenting.
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ OS << *I << "\n";
+ else
+ OS << " " << *V << "\n";
+ }
+ }
+}
+
+AnalysisKey PhiValuesAnalysis::Key;
+PhiValues PhiValuesAnalysis::run(Function &F, FunctionAnalysisManager &) {
+ return PhiValues(F);
+}
+
+PreservedAnalyses PhiValuesPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ OS << "PHI Values for function: " << F.getName() << "\n";
+ PhiValues &PI = AM.getResult<PhiValuesAnalysis>(F);
+ for (const BasicBlock &BB : F)
+ for (const PHINode &PN : BB.phis())
+ PI.getValuesForPhi(&PN);
+ PI.print(OS);
+ return PreservedAnalyses::all();
+}
+
+PhiValuesWrapperPass::PhiValuesWrapperPass() : FunctionPass(ID) {
+ initializePhiValuesWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+bool PhiValuesWrapperPass::runOnFunction(Function &F) {
+ Result.reset(new PhiValues(F));
+ return false;
+}
+
+void PhiValuesWrapperPass::releaseMemory() {
+ Result->releaseMemory();
+}
+
+void PhiValuesWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+}
+
+char PhiValuesWrapperPass::ID = 0;
+
+INITIALIZE_PASS(PhiValuesWrapperPass, "phi-values", "Phi Values Analysis", false,
+ true)
diff --git a/contrib/llvm/lib/Analysis/PostDominators.cpp b/contrib/llvm/lib/Analysis/PostDominators.cpp
index 2282401085d4..e6b660fe26d7 100644
--- a/contrib/llvm/lib/Analysis/PostDominators.cpp
+++ b/contrib/llvm/lib/Analysis/PostDominators.cpp
@@ -21,6 +21,12 @@ using namespace llvm;
#define DEBUG_TYPE "postdomtree"
+#ifdef EXPENSIVE_CHECKS
+static constexpr bool ExpensiveChecksEnabled = true;
+#else
+static constexpr bool ExpensiveChecksEnabled = false;
+#endif
+
//===----------------------------------------------------------------------===//
// PostDominatorTree Implementation
//===----------------------------------------------------------------------===//
@@ -44,6 +50,13 @@ bool PostDominatorTreeWrapperPass::runOnFunction(Function &F) {
return false;
}
+void PostDominatorTreeWrapperPass::verifyAnalysis() const {
+ if (VerifyDomInfo)
+ assert(DT.verify(PostDominatorTree::VerificationLevel::Full));
+ else if (ExpensiveChecksEnabled)
+ assert(DT.verify(PostDominatorTree::VerificationLevel::Basic));
+}
+
void PostDominatorTreeWrapperPass::print(raw_ostream &OS, const Module *) const {
DT.print(OS);
}
@@ -56,8 +69,7 @@ AnalysisKey PostDominatorTreeAnalysis::Key;
PostDominatorTree PostDominatorTreeAnalysis::run(Function &F,
FunctionAnalysisManager &) {
- PostDominatorTree PDT;
- PDT.recalculate(F);
+ PostDominatorTree PDT(F);
return PDT;
}
diff --git a/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index 347d093b0f61..fb591f5d6a69 100644
--- a/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -112,7 +112,7 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
// FIXME: The heuristic used below for determining hotness is based on
// preliminary SPEC tuning for inliner. This will eventually be a
// convenience method that calls isHotCount.
- return FunctionCount && isHotCount(FunctionCount.getValue());
+ return FunctionCount && isHotCount(FunctionCount.getCount());
}
/// Returns true if the function contains hot code. This can include a hot
@@ -125,7 +125,7 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
if (!F || !computeSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
- if (isHotCount(FunctionCount.getValue()))
+ if (isHotCount(FunctionCount.getCount()))
return true;
if (hasSampleProfile()) {
@@ -154,7 +154,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
if (!F || !computeSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
- if (!isColdCount(FunctionCount.getValue()))
+ if (!isColdCount(FunctionCount.getCount()))
return false;
if (hasSampleProfile()) {
@@ -187,7 +187,7 @@ bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
// FIXME: The heuristic used below for determining coldness is based on
// preliminary SPEC tuning for inliner. This will eventually be a
// convenience method that calls isColdCount.
- return FunctionCount && isColdCount(FunctionCount.getValue());
+ return FunctionCount && isColdCount(FunctionCount.getCount());
}
/// Compute the hot and cold thresholds.
@@ -223,6 +223,18 @@ bool ProfileSummaryInfo::isColdCount(uint64_t C) {
return ColdCountThreshold && C <= ColdCountThreshold.getValue();
}
+uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() {
+ if (!HotCountThreshold)
+ computeThresholds();
+ return HotCountThreshold ? HotCountThreshold.getValue() : 0;
+}
+
+uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() {
+ if (!ColdCountThreshold)
+ computeThresholds();
+ return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
+}
+
bool ProfileSummaryInfo::isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI) {
auto Count = BFI->getBlockProfileCount(B);
return Count && isHotCount(*Count);
@@ -247,7 +259,7 @@ bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,
return isColdCount(*C);
// In SamplePGO, if the caller has been sampled, and there is no profile
- // annotatedon the callsite, we consider the callsite as cold.
+ // annotated on the callsite, we consider the callsite as cold.
// If there is no profile for the caller, and we know the profile is
// accurate, we consider the callsite as cold.
return (hasSampleProfile() &&
diff --git a/contrib/llvm/lib/Analysis/RegionInfo.cpp b/contrib/llvm/lib/Analysis/RegionInfo.cpp
index 900487323005..2bd611350f46 100644
--- a/contrib/llvm/lib/Analysis/RegionInfo.cpp
+++ b/contrib/llvm/lib/Analysis/RegionInfo.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/RegionPrinter.h"
#endif
#include "llvm/Analysis/RegionInfoImpl.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
@@ -80,7 +81,7 @@ RegionInfo::~RegionInfo() = default;
bool RegionInfo::invalidate(Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &) {
// Check whether the analysis, all analyses on functions, or the function's
- // CFG have been preserved.
+ // CFG has been preserved.
auto PAC = PA.getChecker<RegionInfoAnalysis>();
return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
PAC.preservedSet<CFGAnalyses>());
diff --git a/contrib/llvm/lib/Analysis/RegionPass.cpp b/contrib/llvm/lib/Analysis/RegionPass.cpp
index c5d71b25e022..ed17df2e7e93 100644
--- a/contrib/llvm/lib/Analysis/RegionPass.cpp
+++ b/contrib/llvm/lib/Analysis/RegionPass.cpp
@@ -158,12 +158,9 @@ bool RGPassManager::runOnFunction(Function &F) {
}
// Print the region tree after all passes.
- DEBUG(
- dbgs() << "\nRegion tree of function " << F.getName()
- << " after all region Pass:\n";
- RI->dump();
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "\nRegion tree of function " << F.getName()
+ << " after all region Pass:\n";
+ RI->dump(); dbgs() << "\n";);
return Changed;
}
@@ -283,14 +280,14 @@ Pass *RegionPass::createPrinterPass(raw_ostream &O,
bool RegionPass::skipRegion(Region &R) const {
Function &F = *R.getEntry()->getParent();
- if (!F.getContext().getOptBisect().shouldRunPass(this, R))
+ if (!F.getContext().getOptPassGate().shouldRunPass(this, R))
return true;
if (F.hasFnAttribute(Attribute::OptimizeNone)) {
// Report this only once per function.
if (R.getEntry() == &F.getEntryBlock())
- DEBUG(dbgs() << "Skipping pass '" << getPassName()
- << "' on function " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping pass '" << getPassName()
+ << "' on function " << F.getName() << "\n");
return true;
}
return false;
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
index bfff7afb5b4e..aa95ace93014 100644
--- a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -83,6 +83,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -205,11 +206,6 @@ static cl::opt<unsigned>
cl::desc("Max coefficients in AddRec during evolving"),
cl::init(16));
-static cl::opt<bool> VersionUnknown(
- "scev-version-unknown", cl::Hidden,
- cl::desc("Use predicated scalar evolution to version SCEVUnknowns"),
- cl::init(false));
-
//===----------------------------------------------------------------------===//
// SCEV class definitions
//===----------------------------------------------------------------------===//
@@ -425,24 +421,21 @@ SCEVCastExpr::SCEVCastExpr(const FoldingSetNodeIDRef ID,
SCEVTruncateExpr::SCEVTruncateExpr(const FoldingSetNodeIDRef ID,
const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scTruncate, op, ty) {
- assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(Op->getType()->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot truncate non-integer value!");
}
SCEVZeroExtendExpr::SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID,
const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scZeroExtend, op, ty) {
- assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(Op->getType()->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot zero extend non-integer value!");
}
SCEVSignExtendExpr::SCEVSignExtendExpr(const FoldingSetNodeIDRef ID,
const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scSignExtend, op, ty) {
- assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(Op->getType()->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot sign extend non-integer value!");
}
@@ -1260,42 +1253,32 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
return getTruncateOrZeroExtend(SZ->getOperand(), Ty);
- // trunc(x1+x2+...+xN) --> trunc(x1)+trunc(x2)+...+trunc(xN) if we can
- // eliminate all the truncates, or we replace other casts with truncates.
- if (const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Op)) {
+ // trunc(x1 + ... + xN) --> trunc(x1) + ... + trunc(xN) and
+ // trunc(x1 * ... * xN) --> trunc(x1) * ... * trunc(xN),
+ // if after transforming we have at most one truncate, not counting truncates
+ // that replace other casts.
+ if (isa<SCEVAddExpr>(Op) || isa<SCEVMulExpr>(Op)) {
+ auto *CommOp = cast<SCEVCommutativeExpr>(Op);
SmallVector<const SCEV *, 4> Operands;
- bool hasTrunc = false;
- for (unsigned i = 0, e = SA->getNumOperands(); i != e && !hasTrunc; ++i) {
- const SCEV *S = getTruncateExpr(SA->getOperand(i), Ty);
- if (!isa<SCEVCastExpr>(SA->getOperand(i)))
- hasTrunc = isa<SCEVTruncateExpr>(S);
+ unsigned numTruncs = 0;
+ for (unsigned i = 0, e = CommOp->getNumOperands(); i != e && numTruncs < 2;
+ ++i) {
+ const SCEV *S = getTruncateExpr(CommOp->getOperand(i), Ty);
+ if (!isa<SCEVCastExpr>(CommOp->getOperand(i)) && isa<SCEVTruncateExpr>(S))
+ numTruncs++;
Operands.push_back(S);
}
- if (!hasTrunc)
- return getAddExpr(Operands);
- // In spite we checked in the beginning that ID is not in the cache,
- // it is possible that during recursion and different modification
- // ID came to cache, so if we found it, just return it.
- if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP))
- return S;
- }
-
- // trunc(x1*x2*...*xN) --> trunc(x1)*trunc(x2)*...*trunc(xN) if we can
- // eliminate all the truncates, or we replace other casts with truncates.
- if (const SCEVMulExpr *SM = dyn_cast<SCEVMulExpr>(Op)) {
- SmallVector<const SCEV *, 4> Operands;
- bool hasTrunc = false;
- for (unsigned i = 0, e = SM->getNumOperands(); i != e && !hasTrunc; ++i) {
- const SCEV *S = getTruncateExpr(SM->getOperand(i), Ty);
- if (!isa<SCEVCastExpr>(SM->getOperand(i)))
- hasTrunc = isa<SCEVTruncateExpr>(S);
- Operands.push_back(S);
+ if (numTruncs < 2) {
+ if (isa<SCEVAddExpr>(Op))
+ return getAddExpr(Operands);
+ else if (isa<SCEVMulExpr>(Op))
+ return getMulExpr(Operands);
+ else
+ llvm_unreachable("Unexpected SCEV type for Op.");
}
- if (!hasTrunc)
- return getMulExpr(Operands);
- // In spite we checked in the beginning that ID is not in the cache,
- // it is possible that during recursion and different modification
- // ID came to cache, so if we found it, just return it.
+ // Although we checked at the beginning that ID is not in the cache, it is
+ // possible that ID was inserted into the cache during recursion and other
+ // modifications. So if we find it, just return it.
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP))
return S;
}
@@ -1576,6 +1559,43 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start,
return false;
}
+// Finds an integer D for an expression (C + x + y + ...) such that the top
+// level addition in (D + (C - D + x + y + ...)) would not wrap (signed or
+// unsigned) and the number of trailing zeros of (C - D + x + y + ...) is
+// maximized, where C is the \p ConstantTerm, x, y, ... are arbitrary SCEVs, and
+// the (C + x + y + ...) expression is \p WholeAddExpr.
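+//
+// Worked example (illustrative, 8-bit values): if C = 0b10110110 and every
+// other operand of WholeAddExpr has at least 3 trailing zeros, then TZ = 3
+// and D = 0b110. The residual (C - D + x + y + ...) is a multiple of 8, and
+// since D < 8 the top-level addition D + (...) cannot carry out of the low
+// three bits, so it cannot wrap (signed or unsigned).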
+static APInt extractConstantWithoutWrapping(ScalarEvolution &SE,
+ const SCEVConstant *ConstantTerm,
+ const SCEVAddExpr *WholeAddExpr) {
+ const APInt C = ConstantTerm->getAPInt();
+ const unsigned BitWidth = C.getBitWidth();
+ // Find number of trailing zeros of (x + y + ...) w/o the C first:
+ uint32_t TZ = BitWidth;
+ for (unsigned I = 1, E = WholeAddExpr->getNumOperands(); I < E && TZ; ++I)
+ TZ = std::min(TZ, SE.GetMinTrailingZeros(WholeAddExpr->getOperand(I)));
+ if (TZ) {
+ // Set D to be as many least significant bits of C as possible while still
+ // guaranteeing that adding D to (C - D + x + y + ...) won't cause a wrap:
+ return TZ < BitWidth ? C.trunc(TZ).zext(BitWidth) : C;
+ }
+ return APInt(BitWidth, 0);
+}
+
+// Finds an integer D for an affine AddRec expression {C,+,x} such that the top
+// level addition in (D + {C-D,+,x}) would not wrap (signed or unsigned) and the
+// number of trailing zeros of (C - D + x * n) is maximized, where C is the \p
+// ConstantStart, x is an arbitrary \p Step, and n is the loop trip count.
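+//
+// For instance (illustrative): with ConstantStart = 22 and Step = 8, TZ = 3
+// and D = 6, so {22,+,8} can be split as 6 + {16,+,8} with the outer add
+// never wrapping.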
+static APInt extractConstantWithoutWrapping(ScalarEvolution &SE,
+ const APInt &ConstantStart,
+ const SCEV *Step) {
+ const unsigned BitWidth = ConstantStart.getBitWidth();
+ const uint32_t TZ = SE.GetMinTrailingZeros(Step);
+ if (TZ)
+ return TZ < BitWidth ? ConstantStart.trunc(TZ).zext(BitWidth)
+ : ConstantStart;
+ return APInt(BitWidth, 0);
+}
+
const SCEV *
ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
@@ -1732,9 +1752,7 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
const SCEV *N = getConstant(APInt::getMinValue(BitWidth) -
getUnsignedRangeMax(Step));
if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR, N) ||
- (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_ULT, Start, N) &&
- isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT,
- AR->getPostIncExpr(*this), N))) {
+ isKnownOnEveryIteration(ICmpInst::ICMP_ULT, AR, N)) {
// Cache knowledge of AR NUW, which is propagated to this
// AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
@@ -1749,9 +1767,7 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) -
getSignedRangeMin(Step));
if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR, N) ||
- (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_UGT, Start, N) &&
- isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT,
- AR->getPostIncExpr(*this), N))) {
+ isKnownOnEveryIteration(ICmpInst::ICMP_UGT, AR, N)) {
// Cache knowledge of AR NW, which is propagated to this
// AddRec. Negative step causes unsigned wrap, but it
// still can't self-wrap.
@@ -1766,6 +1782,23 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
}
}
+ // zext({C,+,Step}) --> (zext(D) + zext({C-D,+,Step}))<nuw><nsw>
+ // if D + (C - D + Step * n) could be proven to not unsigned wrap
+ // where D maximizes the number of trailing zeros of (C - D + Step * n)
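+ // For instance (illustrative): zext({6,+,8}) becomes
+ // (zext(6) + zext({0,+,8}))<nuw><nsw>, since D = 6 leaves a residual start
+ // of 0 and adding 6 to a multiple of 8 never carries into the higher bits.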
+ if (const auto *SC = dyn_cast<SCEVConstant>(Start)) {
+ const APInt &C = SC->getAPInt();
+ const APInt &D = extractConstantWithoutWrapping(*this, C, Step);
+ if (D != 0) {
+ const SCEV *SZExtD = getZeroExtendExpr(getConstant(D), Ty, Depth);
+ const SCEV *SResidual =
+ getAddRecExpr(getConstant(C - D), Step, L, AR->getNoWrapFlags());
+ const SCEV *SZExtR = getZeroExtendExpr(SResidual, Ty, Depth + 1);
+ return getAddExpr(SZExtD, SZExtR,
+ (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW),
+ Depth + 1);
+ }
+ }
+
if (proveNoWrapByVaryingStart<SCEVZeroExtendExpr>(Start, Step, L)) {
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
return getAddRecExpr(
@@ -1774,6 +1807,20 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
}
}
+ // zext(A % B) --> zext(A) % zext(B)
+ {
+ const SCEV *LHS;
+ const SCEV *RHS;
+ if (matchURem(Op, LHS, RHS))
+ return getURemExpr(getZeroExtendExpr(LHS, Ty, Depth + 1),
+ getZeroExtendExpr(RHS, Ty, Depth + 1));
+ }
+
+ // zext(A / B) --> zext(A) / zext(B).
+ if (auto *Div = dyn_cast<SCEVUDivExpr>(Op))
+ return getUDivExpr(getZeroExtendExpr(Div->getLHS(), Ty, Depth + 1),
+ getZeroExtendExpr(Div->getRHS(), Ty, Depth + 1));
+
if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) {
// zext((A + B + ...)<nuw>) --> (zext(A) + zext(B) + ...)<nuw>
if (SA->hasNoUnsignedWrap()) {
@@ -1784,6 +1831,65 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1));
return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1);
}
+
+ // zext(C + x + y + ...) --> (zext(D) + zext((C - D) + x + y + ...))
+ // if D + (C - D + x + y + ...) could be proven to not unsigned wrap
+ // where D maximizes the number of trailing zeros of (C - D + x + y + ...)
+ //
+ // Address arithmetic often contains expressions like
+ // (zext (add (shl X, C1), C2)), for instance, (zext (5 + (4 * X))).
+ // This transformation is useful while proving that such expressions are
+ // equal or differ by a small constant amount, see LoadStoreVectorizer pass.
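+ // For instance (illustrative): in zext(5 + (4 * X)) the operand (4 * X) has
+ // two trailing zero bits, so D = 1 and the expression becomes
+ // zext(1) + zext(4 + (4 * X)), which is directly comparable with a separate
+ // zext(4 + (4 * X)).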
+ if (const auto *SC = dyn_cast<SCEVConstant>(SA->getOperand(0))) {
+ const APInt &D = extractConstantWithoutWrapping(*this, SC, SA);
+ if (D != 0) {
+ const SCEV *SZExtD = getZeroExtendExpr(getConstant(D), Ty, Depth);
+ const SCEV *SResidual =
+ getAddExpr(getConstant(-D), SA, SCEV::FlagAnyWrap, Depth);
+ const SCEV *SZExtR = getZeroExtendExpr(SResidual, Ty, Depth + 1);
+ return getAddExpr(SZExtD, SZExtR,
+ (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW),
+ Depth + 1);
+ }
+ }
+ }
+
+ if (auto *SM = dyn_cast<SCEVMulExpr>(Op)) {
+ // zext((A * B * ...)<nuw>) --> (zext(A) * zext(B) * ...)<nuw>
+ if (SM->hasNoUnsignedWrap()) {
+ // If the multiply does not unsign overflow then we can, by definition,
+ // commute the zero extension with the multiply operation.
+ SmallVector<const SCEV *, 4> Ops;
+ for (const auto *Op : SM->operands())
+ Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1));
+ return getMulExpr(Ops, SCEV::FlagNUW, Depth + 1);
+ }
+
+ // zext(2^K * (trunc X to iN)) to iM ->
+ // 2^K * (zext(trunc X to i{N-K}) to iM)<nuw>
+ //
+ // Proof:
+ //
+ // zext(2^K * (trunc X to iN)) to iM
+ // = zext((trunc X to iN) << K) to iM
+ // = zext((trunc X to i{N-K}) << K)<nuw> to iM
+ // (because shl removes the top K bits)
+ // = zext((2^K * (trunc X to i{N-K}))<nuw>) to iM
+ // = (2^K * (zext(trunc X to i{N-K}) to iM))<nuw>.
+ //
+ if (SM->getNumOperands() == 2)
+ if (auto *MulLHS = dyn_cast<SCEVConstant>(SM->getOperand(0)))
+ if (MulLHS->getAPInt().isPowerOf2())
+ if (auto *TruncRHS = dyn_cast<SCEVTruncateExpr>(SM->getOperand(1))) {
+ int NewTruncBits = getTypeSizeInBits(TruncRHS->getType()) -
+ MulLHS->getAPInt().logBase2();
+ Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits);
+ return getMulExpr(
+ getZeroExtendExpr(MulLHS, Ty),
+ getZeroExtendExpr(
+ getTruncateExpr(TruncRHS->getOperand(), NewTruncTy), Ty),
+ SCEV::FlagNUW, Depth + 1);
+ }
}
// The cast wasn't folded; create an explicit cast node.
@@ -1847,24 +1953,7 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
return getTruncateOrSignExtend(X, Ty);
}
- // sext(C1 + (C2 * x)) --> C1 + sext(C2 * x) if C1 < C2
if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) {
- if (SA->getNumOperands() == 2) {
- auto *SC1 = dyn_cast<SCEVConstant>(SA->getOperand(0));
- auto *SMul = dyn_cast<SCEVMulExpr>(SA->getOperand(1));
- if (SMul && SC1) {
- if (auto *SC2 = dyn_cast<SCEVConstant>(SMul->getOperand(0))) {
- const APInt &C1 = SC1->getAPInt();
- const APInt &C2 = SC2->getAPInt();
- if (C1.isStrictlyPositive() && C2.isStrictlyPositive() &&
- C2.ugt(C1) && C2.isPowerOf2())
- return getAddExpr(getSignExtendExpr(SC1, Ty, Depth + 1),
- getSignExtendExpr(SMul, Ty, Depth + 1),
- SCEV::FlagAnyWrap, Depth + 1);
- }
- }
- }
-
// sext((A + B + ...)<nsw>) --> (sext(A) + sext(B) + ...)<nsw>
if (SA->hasNoSignedWrap()) {
// If the addition does not sign overflow then we can, by definition,
@@ -1874,6 +1963,28 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
Ops.push_back(getSignExtendExpr(Op, Ty, Depth + 1));
return getAddExpr(Ops, SCEV::FlagNSW, Depth + 1);
}
+
+ // sext(C + x + y + ...) --> (sext(D) + sext((C - D) + x + y + ...))
+ // if D + (C - D + x + y + ...) could be proven to not signed wrap
+ // where D maximizes the number of trailing zeros of (C - D + x + y + ...)
+ //
+ // For instance, this will bring two seemingly different expressions:
+ // 1 + sext(5 + 20 * %x + 24 * %y) and
+ // sext(6 + 20 * %x + 24 * %y)
+ // to the same form:
+ // 2 + sext(4 + 20 * %x + 24 * %y)
+ if (const auto *SC = dyn_cast<SCEVConstant>(SA->getOperand(0))) {
+ const APInt &D = extractConstantWithoutWrapping(*this, SC, SA);
+ if (D != 0) {
+ const SCEV *SSExtD = getSignExtendExpr(getConstant(D), Ty, Depth);
+ const SCEV *SResidual =
+ getAddExpr(getConstant(-D), SA, SCEV::FlagAnyWrap, Depth);
+ const SCEV *SSExtR = getSignExtendExpr(SResidual, Ty, Depth + 1);
+ return getAddExpr(SSExtD, SSExtR,
+ (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW),
+ Depth + 1);
+ }
+ }
}
// If the input value is a chrec scev, and we can prove that the value
// did not overflow the old, smaller, value, we can sign extend all of the
@@ -1994,9 +2105,7 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
getSignedOverflowLimitForStep(Step, &Pred, this);
if (OverflowLimit &&
(isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) ||
- (isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) &&
- isLoopBackedgeGuardedByCond(L, Pred, AR->getPostIncExpr(*this),
- OverflowLimit)))) {
+ isKnownOnEveryIteration(Pred, AR, OverflowLimit))) {
// Cache knowledge of AR NSW, then propagate NSW to the wide AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
return getAddRecExpr(
@@ -2005,21 +2114,20 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
}
}
- // If Start and Step are constants, check if we can apply this
- // transformation:
- // sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2
- auto *SC1 = dyn_cast<SCEVConstant>(Start);
- auto *SC2 = dyn_cast<SCEVConstant>(Step);
- if (SC1 && SC2) {
- const APInt &C1 = SC1->getAPInt();
- const APInt &C2 = SC2->getAPInt();
- if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) &&
- C2.isPowerOf2()) {
- Start = getSignExtendExpr(Start, Ty, Depth + 1);
- const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L,
- AR->getNoWrapFlags());
- return getAddExpr(Start, getSignExtendExpr(NewAR, Ty, Depth + 1),
- SCEV::FlagAnyWrap, Depth + 1);
+ // sext({C,+,Step}) --> (sext(D) + sext({C-D,+,Step}))<nuw><nsw>
+ // if D + (C - D + Step * n) could be proven to not signed wrap
+ // where D maximizes the number of trailing zeros of (C - D + Step * n)
+ if (const auto *SC = dyn_cast<SCEVConstant>(Start)) {
+ const APInt &C = SC->getAPInt();
+ const APInt &D = extractConstantWithoutWrapping(*this, C, Step);
+ if (D != 0) {
+ const SCEV *SSExtD = getSignExtendExpr(getConstant(D), Ty, Depth);
+ const SCEV *SResidual =
+ getAddRecExpr(getConstant(C - D), Step, L, AR->getNoWrapFlags());
+ const SCEV *SSExtR = getSignExtendExpr(SResidual, Ty, Depth + 1);
+ return getAddExpr(SSExtD, SSExtR,
+ (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW),
+ Depth + 1);
}
}
@@ -2215,22 +2323,35 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask);
- if (SignOrUnsignWrap != SignOrUnsignMask && Type == scAddExpr &&
- Ops.size() == 2 && isa<SCEVConstant>(Ops[0])) {
+ if (SignOrUnsignWrap != SignOrUnsignMask &&
+ (Type == scAddExpr || Type == scMulExpr) && Ops.size() == 2 &&
+ isa<SCEVConstant>(Ops[0])) {
- // (A + C) --> (A + C)<nsw> if the addition does not sign overflow
- // (A + C) --> (A + C)<nuw> if the addition does not unsign overflow
+ auto Opcode = [&] {
+ switch (Type) {
+ case scAddExpr:
+ return Instruction::Add;
+ case scMulExpr:
+ return Instruction::Mul;
+ default:
+ llvm_unreachable("Unexpected SCEV op.");
+ }
+ }();
const APInt &C = cast<SCEVConstant>(Ops[0])->getAPInt();
+
+ // (A <opcode> C) --> (A <opcode> C)<nsw> if the op doesn't sign overflow.
if (!(SignOrUnsignWrap & SCEV::FlagNSW)) {
auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
- Instruction::Add, C, OBO::NoSignedWrap);
+ Opcode, C, OBO::NoSignedWrap);
if (NSWRegion.contains(SE->getSignedRange(Ops[1])))
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW);
}
+
+ // (A <opcode> C) --> (A <opcode> C)<nuw> if the op doesn't unsign overflow.
if (!(SignOrUnsignWrap & SCEV::FlagNUW)) {
auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
- Instruction::Add, C, OBO::NoUnsignedWrap);
+ Opcode, C, OBO::NoUnsignedWrap);
if (NUWRegion.contains(SE->getUnsignedRange(Ops[1])))
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
}
@@ -2240,59 +2361,7 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
}
bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L) {
- if (!isLoopInvariant(S, L))
- return false;
- // If a value depends on a SCEVUnknown which is defined after the loop, we
- // conservatively assume that we cannot calculate it at the loop's entry.
- struct FindDominatedSCEVUnknown {
- bool Found = false;
- const Loop *L;
- DominatorTree &DT;
- LoopInfo &LI;
-
- FindDominatedSCEVUnknown(const Loop *L, DominatorTree &DT, LoopInfo &LI)
- : L(L), DT(DT), LI(LI) {}
-
- bool checkSCEVUnknown(const SCEVUnknown *SU) {
- if (auto *I = dyn_cast<Instruction>(SU->getValue())) {
- if (DT.dominates(L->getHeader(), I->getParent()))
- Found = true;
- else
- assert(DT.dominates(I->getParent(), L->getHeader()) &&
- "No dominance relationship between SCEV and loop?");
- }
- return false;
- }
-
- bool follow(const SCEV *S) {
- switch (static_cast<SCEVTypes>(S->getSCEVType())) {
- case scConstant:
- return false;
- case scAddRecExpr:
- case scTruncate:
- case scZeroExtend:
- case scSignExtend:
- case scAddExpr:
- case scMulExpr:
- case scUMaxExpr:
- case scSMaxExpr:
- case scUDivExpr:
- return true;
- case scUnknown:
- return checkSCEVUnknown(cast<SCEVUnknown>(S));
- case scCouldNotCompute:
- llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
- }
- return false;
- }
-
- bool isDone() { return Found; }
- };
-
- FindDominatedSCEVUnknown FSU(L, DT, LI);
- SCEVTraversal<FindDominatedSCEVUnknown> ST(FSU);
- ST.visitAll(S);
- return !FSU.Found;
+ return isLoopInvariant(S, L) && properlyDominates(S, L->getHeader());
}
/// Get a canonical add expression, or something simpler if possible.
@@ -2423,7 +2492,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
}
if (Ok) {
// Evaluate the expression in the larger type.
- const SCEV *Fold = getAddExpr(LargeOps, Flags, Depth + 1);
+ const SCEV *Fold = getAddExpr(LargeOps, SCEV::FlagAnyWrap, Depth + 1);
// If it folds to something simple, use it. Otherwise, don't.
if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
return getTruncateExpr(Fold, Ty);
@@ -2801,22 +2870,21 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
unsigned Idx = 0;
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
- // C1*(C2+V) -> C1*C2 + C1*V
if (Ops.size() == 2)
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
- // If any of Add's ops are Adds or Muls with a constant,
- // apply this transformation as well.
- if (Add->getNumOperands() == 2)
- // TODO: There are some cases where this transformation is not
- // profitable, for example:
- // Add = (C0 + X) * Y + Z.
- // Maybe the scope of this transformation should be narrowed down.
- if (containsConstantInAddMulChain(Add))
- return getAddExpr(getMulExpr(LHSC, Add->getOperand(0),
- SCEV::FlagAnyWrap, Depth + 1),
- getMulExpr(LHSC, Add->getOperand(1),
- SCEV::FlagAnyWrap, Depth + 1),
- SCEV::FlagAnyWrap, Depth + 1);
+ // C1*(C2+V) -> C1*C2 + C1*V
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
+ // If any of Add's ops are Adds or Muls with a constant, apply this
+ // transformation as well.
+ //
+ // TODO: There are some cases where this transformation is not
+ // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of
+ // this transformation should be narrowed down.
+ if (Add->getNumOperands() == 2 && containsConstantInAddMulChain(Add))
+ return getAddExpr(getMulExpr(LHSC, Add->getOperand(0),
+ SCEV::FlagAnyWrap, Depth + 1),
+ getMulExpr(LHSC, Add->getOperand(1),
+ SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
++Idx;
while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
@@ -3128,6 +3196,21 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
}
}
}
+
+ // (A/B)/C --> A/(B*C) if safe and B*C can be folded.
+ if (const SCEVUDivExpr *OtherDiv = dyn_cast<SCEVUDivExpr>(LHS)) {
+ if (auto *DivisorConstant =
+ dyn_cast<SCEVConstant>(OtherDiv->getRHS())) {
+ bool Overflow = false;
+ APInt NewRHS =
+ DivisorConstant->getAPInt().umul_ov(RHSC->getAPInt(), Overflow);
+ if (Overflow) {
+ return getConstant(RHSC->getType(), 0, false);
+ }
+ return getUDivExpr(OtherDiv->getLHS(), getConstant(NewRHS));
+ }
+ }
+
// (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded.
if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) {
SmallVector<const SCEV *, 4> Operands;
@@ -3579,12 +3662,13 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
// X umax Y umax Y --> X umax Y
// X umax Y --> X, if X is always greater than Y
- if (Ops[i] == Ops[i+1] ||
- isKnownPredicate(ICmpInst::ICMP_UGE, Ops[i], Ops[i+1])) {
- Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
+ if (Ops[i] == Ops[i + 1] || isKnownViaNonRecursiveReasoning(
+ ICmpInst::ICMP_UGE, Ops[i], Ops[i + 1])) {
+ Ops.erase(Ops.begin() + i + 1, Ops.begin() + i + 2);
--i; --e;
- } else if (isKnownPredicate(ICmpInst::ICMP_ULE, Ops[i], Ops[i+1])) {
- Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
+ } else if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, Ops[i],
+ Ops[i + 1])) {
+ Ops.erase(Ops.begin() + i, Ops.begin() + i + 1);
--i; --e;
}
@@ -3611,14 +3695,35 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
const SCEV *ScalarEvolution::getSMinExpr(const SCEV *LHS,
const SCEV *RHS) {
- // ~smax(~x, ~y) == smin(x, y).
- return getNotSCEV(getSMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
+ SmallVector<const SCEV *, 2> Ops = { LHS, RHS };
+ return getSMinExpr(Ops);
+}
+
+const SCEV *ScalarEvolution::getSMinExpr(SmallVectorImpl<const SCEV *> &Ops) {
+ // ~smax(~x, ~y, ~z) == smin(x, y, z).
+ SmallVector<const SCEV *, 2> NotOps;
+ for (auto *S : Ops)
+ NotOps.push_back(getNotSCEV(S));
+ return getNotSCEV(getSMaxExpr(NotOps));
}
const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
const SCEV *RHS) {
- // ~umax(~x, ~y) == umin(x, y)
- return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
+ SmallVector<const SCEV *, 2> Ops = { LHS, RHS };
+ return getUMinExpr(Ops);
+}
+
+const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl<const SCEV *> &Ops) {
+ assert(!Ops.empty() && "At least one operand must be!");
+ // Trivial case.
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ // ~umax(~x, ~y, ~z) == umin(x, y, z).
+ SmallVector<const SCEV *, 2> NotOps;
+ for (auto *S : Ops)
+ NotOps.push_back(getNotSCEV(S));
+ return getNotSCEV(getUMaxExpr(NotOps));
}
const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
@@ -3670,13 +3775,15 @@ const SCEV *ScalarEvolution::getUnknown(Value *V) {
/// target-specific information.
bool ScalarEvolution::isSCEVable(Type *Ty) const {
// Integers and pointers are always SCEVable.
- return Ty->isIntegerTy() || Ty->isPointerTy();
+ return Ty->isIntOrPtrTy();
}
/// Return the size in bits of the specified type, for which isSCEVable must
/// return true.
uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
+ if (Ty->isPointerTy())
+ return getDataLayout().getIndexTypeSizeInBits(Ty);
return getDataLayout().getTypeSizeInBits(Ty);
}
@@ -3779,6 +3886,24 @@ void ScalarEvolution::eraseValueFromMap(Value *V) {
}
}
+/// Check whether a value has nuw/nsw/exact set but its SCEV does not.
+/// TODO: Ideally poison would be checked recursively,
+/// but this is better than nothing.
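+///
+/// For example, "%a = add nsw i32 %x, %y" may map to a SCEV (%x + %y) built
+/// without <nsw>; caching %a for that SCEV in ExprValueMap could let SCEV
+/// expansion reuse %a where its stronger poison-producing flags are not
+/// justified.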
+static bool SCEVLostPoisonFlags(const SCEV *S, const Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (isa<OverflowingBinaryOperator>(I)) {
+ if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) {
+ if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap())
+ return true;
+ if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap())
+ return true;
+ }
+ } else if (isa<PossiblyExactOperator>(I) && I->isExact())
+ return true;
+ }
+ return false;
+}
+
/// Return an existing SCEV if it exists, otherwise analyze the expression and
/// create a new one.
const SCEV *ScalarEvolution::getSCEV(Value *V) {
@@ -3792,7 +3917,7 @@ const SCEV *ScalarEvolution::getSCEV(Value *V) {
// ValueExprMap before insert S->{V, 0} into ExprValueMap.
std::pair<ValueExprMapType::iterator, bool> Pair =
ValueExprMap.insert({SCEVCallbackVH(V, this), S});
- if (Pair.second) {
+ if (Pair.second && !SCEVLostPoisonFlags(S, V)) {
ExprValueMap[S].insert({V, nullptr});
// If S == Stripped + Offset, add Stripped -> {V, Offset} into
@@ -3895,8 +4020,7 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
const SCEV *
ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
- assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot truncate or zero extend with non-integer arguments!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
@@ -3909,8 +4033,7 @@ const SCEV *
ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
Type *Ty) {
Type *SrcTy = V->getType();
- assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot truncate or zero extend with non-integer arguments!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
@@ -3922,8 +4045,7 @@ ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
const SCEV *
ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
- assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot noop or zero extend with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
"getNoopOrZeroExtend cannot truncate!");
@@ -3935,8 +4057,7 @@ ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) {
const SCEV *
ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
- assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot noop or sign extend with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
"getNoopOrSignExtend cannot truncate!");
@@ -3948,8 +4069,7 @@ ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) {
const SCEV *
ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
- assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot noop or any extend with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
"getNoopOrAnyExtend cannot truncate!");
@@ -3961,8 +4081,7 @@ ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) {
const SCEV *
ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
- assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
- (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+ assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
"Cannot truncate or noop with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) >= getTypeSizeInBits(Ty) &&
"getTruncateOrNoop cannot extend!");
@@ -3986,15 +4105,32 @@ const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS,
const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS,
const SCEV *RHS) {
- const SCEV *PromotedLHS = LHS;
- const SCEV *PromotedRHS = RHS;
+ SmallVector<const SCEV *, 2> Ops = { LHS, RHS };
+ return getUMinFromMismatchedTypes(Ops);
+}
+
+const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(
+ SmallVectorImpl<const SCEV *> &Ops) {
+ assert(!Ops.empty() && "At least one operand must be!");
+ // Trivial case.
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ // Find the max type first.
+ Type *MaxType = nullptr;
+ for (auto *S : Ops)
+ if (MaxType)
+ MaxType = getWiderType(MaxType, S->getType());
+ else
+ MaxType = S->getType();
- if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType()))
- PromotedRHS = getZeroExtendExpr(RHS, LHS->getType());
- else
- PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType());
+ // Extend all ops to max type.
+ SmallVector<const SCEV *, 2> PromotedOps;
+ for (auto *S : Ops)
+ PromotedOps.push_back(getNoopOrZeroExtend(S, MaxType));
- return getUMinExpr(PromotedLHS, PromotedRHS);
+ // Generate umin.
+ return getUMinExpr(PromotedOps);
}
const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
@@ -4071,37 +4207,90 @@ void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) {
namespace {
+/// Takes SCEV S and Loop L. For each AddRec sub-expression whose loop is L,
+/// use its start expression. For an AddRec of a different loop, keep the
+/// AddRec itself if IgnoreOtherLoops is true; otherwise the rewrite cannot be
+/// done. If the SCEV contains a loop-variant SCEVUnknown, the rewrite cannot
+/// be done either.
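+///
+/// For example (illustrative), for an AddRec of L and a loop-invariant %inv:
+///   rewrite({a,+,s}<L> + %inv)  ==>  a + %inv
+/// while an AddRec of a different loop is either kept as-is (when
+/// IgnoreOtherLoops is true) or causes the rewrite to fail.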
class SCEVInitRewriter : public SCEVRewriteVisitor<SCEVInitRewriter> {
public:
- static const SCEV *rewrite(const SCEV *S, const Loop *L,
- ScalarEvolution &SE) {
+ static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE,
+ bool IgnoreOtherLoops = true) {
SCEVInitRewriter Rewriter(L, SE);
const SCEV *Result = Rewriter.visit(S);
- return Rewriter.isValid() ? Result : SE.getCouldNotCompute();
+ if (Rewriter.hasSeenLoopVariantSCEVUnknown())
+ return SE.getCouldNotCompute();
+ return Rewriter.hasSeenOtherLoops() && !IgnoreOtherLoops
+ ? SE.getCouldNotCompute()
+ : Result;
}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
if (!SE.isLoopInvariant(Expr, L))
- Valid = false;
+ SeenLoopVariantSCEVUnknown = true;
return Expr;
}
const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
- // Only allow AddRecExprs for this loop.
+ // Only re-write AddRecExprs for this loop.
if (Expr->getLoop() == L)
return Expr->getStart();
- Valid = false;
+ SeenOtherLoops = true;
return Expr;
}
- bool isValid() { return Valid; }
+ bool hasSeenLoopVariantSCEVUnknown() { return SeenLoopVariantSCEVUnknown; }
+
+ bool hasSeenOtherLoops() { return SeenOtherLoops; }
private:
explicit SCEVInitRewriter(const Loop *L, ScalarEvolution &SE)
: SCEVRewriteVisitor(SE), L(L) {}
const Loop *L;
- bool Valid = true;
+ bool SeenLoopVariantSCEVUnknown = false;
+ bool SeenOtherLoops = false;
+};
+
+/// Takes SCEV S and Loop L. For each AddRec sub-expression, use its post
+/// increment expression when its loop is L; if the loop is not L, use the
+/// AddRec itself.
+/// If the SCEV contains a non-invariant unknown SCEV, the rewrite cannot be
+/// done.
+class SCEVPostIncRewriter : public SCEVRewriteVisitor<SCEVPostIncRewriter> {
+public:
+ static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE) {
+ SCEVPostIncRewriter Rewriter(L, SE);
+ const SCEV *Result = Rewriter.visit(S);
+ return Rewriter.hasSeenLoopVariantSCEVUnknown()
+ ? SE.getCouldNotCompute()
+ : Result;
+ }
+
+ const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+ if (!SE.isLoopInvariant(Expr, L))
+ SeenLoopVariantSCEVUnknown = true;
+ return Expr;
+ }
+
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ // Only re-write AddRecExprs for this loop.
+ if (Expr->getLoop() == L)
+ return Expr->getPostIncExpr(SE);
+ SeenOtherLoops = true;
+ return Expr;
+ }
+
+ bool hasSeenLoopVariantSCEVUnknown() { return SeenLoopVariantSCEVUnknown; }
+
+ bool hasSeenOtherLoops() { return SeenOtherLoops; }
+
+private:
+ explicit SCEVPostIncRewriter(const Loop *L, ScalarEvolution &SE)
+ : SCEVRewriteVisitor(SE), L(L) {}
+
+ const Loop *L;
+ bool SeenLoopVariantSCEVUnknown = false;
+ bool SeenOtherLoops = false;
};
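
For the affine case, the two rewriters above amount to mapping {Start,+,Step}<L> to Start (its value at loop entry) and to {Start+Step,+,Step}<L> (its value after the current iteration). A minimal sketch using a hypothetical AffineAddRec struct rather than the LLVM SCEV classes:

#include <cassert>

// Hypothetical stand-in for an affine recurrence {Start,+,Step}<L>.
struct AffineAddRec { long Start, Step; };

// What SCEVInitRewriter produces: the value at loop entry.
long initValue(const AffineAddRec &AR) { return AR.Start; }

// What SCEVPostIncRewriter produces: the recurrence advanced by one step.
AffineAddRec postIncValue(const AffineAddRec &AR) {
  return {AR.Start + AR.Step, AR.Step};
}

int main() {
  AffineAddRec IV = {0, 4};            // {0,+,4}
  assert(initValue(IV) == 0);
  assert(postIncValue(IV).Start == 4); // {4,+,4}
  return 0;
}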
/// This class evaluates the compare condition by matching it against the
@@ -4673,7 +4862,7 @@ ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI
const SCEV *StartExtended = getExtendedExpr(StartVal, Signed);
if (PredIsKnownFalse(StartVal, StartExtended)) {
- DEBUG(dbgs() << "P2 is compile-time false\n";);
+ LLVM_DEBUG(dbgs() << "P2 is compile-time false\n";);
return None;
}
@@ -4681,7 +4870,7 @@ ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI
// NSSW or NUSW)
const SCEV *AccumExtended = getExtendedExpr(Accum, /*CreateSignExtend=*/true);
if (PredIsKnownFalse(Accum, AccumExtended)) {
- DEBUG(dbgs() << "P3 is compile-time false\n";);
+ LLVM_DEBUG(dbgs() << "P3 is compile-time false\n";);
return None;
}
@@ -4690,7 +4879,7 @@ ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI
if (Expr != ExtendedExpr &&
!isKnownPredicate(ICmpInst::ICMP_EQ, Expr, ExtendedExpr)) {
const SCEVPredicate *Pred = getEqualPredicate(Expr, ExtendedExpr);
- DEBUG (dbgs() << "Added Predicate: " << *Pred);
+ LLVM_DEBUG(dbgs() << "Added Predicate: " << *Pred);
Predicates.push_back(Pred);
}
};
@@ -4953,7 +5142,7 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
// by one iteration:
// PHI(f(0), f({1,+,1})) --> f({0,+,1})
const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this);
- const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this);
+ const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this, false);
if (Shifted != getCouldNotCompute() &&
Start != getCouldNotCompute()) {
const SCEV *StartVal = getSCEV(StartValueV);
@@ -5515,6 +5704,25 @@ ScalarEvolution::getRangeRef(const SCEV *S,
APInt::getSignedMaxValue(BitWidth).ashr(NS - 1) + 1));
}
+ // A range of Phi is a subset of union of all ranges of its input.
+ if (const PHINode *Phi = dyn_cast<PHINode>(U->getValue())) {
+ // Make sure that we do not run into cycles of Phis.
+ if (PendingPhiRanges.insert(Phi).second) {
+ ConstantRange RangeFromOps(BitWidth, /*isFullSet=*/false);
+ for (auto &Op : Phi->operands()) {
+ auto OpRange = getRangeRef(getSCEV(Op), SignHint);
+ RangeFromOps = RangeFromOps.unionWith(OpRange);
+ // No point in continuing if we already have a full set.
+ if (RangeFromOps.isFullSet())
+ break;
+ }
+ ConservativeResult = ConservativeResult.intersectWith(RangeFromOps);
+ bool Erased = PendingPhiRanges.erase(Phi);
+ assert(Erased && "Failed to erase Phi properly?");
+ (void) Erased;
+ }
+ }
+
return setRange(U, SignHint, std::move(ConservativeResult));
}
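
The PHI handling added above bounds the range of a PHI by the union of the ranges of its incoming values, since the PHI only ever takes one of them. A minimal sketch with a hypothetical closed-interval Range type; the empty-set and full-set handling of llvm::ConstantRange is omitted.

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical closed interval [Lo, Hi]; not llvm::ConstantRange.
struct Range { long Lo, Hi; };

Range unionWith(Range A, Range B) {
  return {std::min(A.Lo, B.Lo), std::max(A.Hi, B.Hi)};
}

// A PHI only ever takes one of its incoming values, so the union of the
// incoming ranges is a sound bound for the PHI itself.
Range phiRange(const std::vector<Range> &Incoming) {
  Range R = Incoming.front();
  for (const Range &In : Incoming)
    R = unionWith(R, In);
  return R;
}

int main() {
  Range R = phiRange({{0, 10}, {40, 50}});
  assert(R.Lo == 0 && R.Hi == 50);
  return 0;
}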
@@ -6134,33 +6342,33 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
}
break;
- case Instruction::Shl:
- // Turn shift left of a constant amount into a multiply.
- if (ConstantInt *SA = dyn_cast<ConstantInt>(BO->RHS)) {
- uint32_t BitWidth = cast<IntegerType>(SA->getType())->getBitWidth();
-
- // If the shift count is not less than the bitwidth, the result of
- // the shift is undefined. Don't try to analyze it, because the
- // resolution chosen here may differ from the resolution chosen in
- // other parts of the compiler.
- if (SA->getValue().uge(BitWidth))
- break;
+ case Instruction::Shl:
+ // Turn shift left of a constant amount into a multiply.
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(BO->RHS)) {
+ uint32_t BitWidth = cast<IntegerType>(SA->getType())->getBitWidth();
- // It is currently not resolved how to interpret NSW for left
- // shift by BitWidth - 1, so we avoid applying flags in that
- // case. Remove this check (or this comment) once the situation
- // is resolved. See
- // http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html
- // and http://reviews.llvm.org/D8890 .
- auto Flags = SCEV::FlagAnyWrap;
- if (BO->Op && SA->getValue().ult(BitWidth - 1))
- Flags = getNoWrapFlagsFromUB(BO->Op);
+ // If the shift count is not less than the bitwidth, the result of
+ // the shift is undefined. Don't try to analyze it, because the
+ // resolution chosen here may differ from the resolution chosen in
+ // other parts of the compiler.
+ if (SA->getValue().uge(BitWidth))
+ break;
- Constant *X = ConstantInt::get(getContext(),
- APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
- return getMulExpr(getSCEV(BO->LHS), getSCEV(X), Flags);
- }
- break;
+ // It is currently not resolved how to interpret NSW for left
+ // shift by BitWidth - 1, so we avoid applying flags in that
+ // case. Remove this check (or this comment) once the situation
+ // is resolved. See
+ // http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html
+ // and http://reviews.llvm.org/D8890 .
+ auto Flags = SCEV::FlagAnyWrap;
+ if (BO->Op && SA->getValue().ult(BitWidth - 1))
+ Flags = getNoWrapFlagsFromUB(BO->Op);
+
+ Constant *X = ConstantInt::get(
+ getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
+ return getMulExpr(getSCEV(BO->LHS), getSCEV(X), Flags);
+ }
+ break;
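
The re-indented case above still encodes the identity that a left shift by a constant C below the bit width equals a multiply by 2^C; a quick standalone check with arbitrary values:

#include <cassert>
#include <cstdint>

int main() {
  // shl by a constant C is a multiply by 2^C, provided C is smaller than
  // the bit width (32 here); otherwise the shift is undefined and the code
  // above declines to analyze it.
  uint32_t X = 37;
  assert((X << 3) == X * 8u);
  return 0;
}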
case Instruction::AShr: {
// AShr X, C, where C is a constant.
@@ -6384,11 +6592,11 @@ const SCEV *ScalarEvolution::getExitCount(const Loop *L,
const SCEV *
ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L,
SCEVUnionPredicate &Preds) {
- return getPredicatedBackedgeTakenInfo(L).getExact(this, &Preds);
+ return getPredicatedBackedgeTakenInfo(L).getExact(L, this, &Preds);
}
const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
- return getBackedgeTakenInfo(L).getExact(this);
+ return getBackedgeTakenInfo(L).getExact(L, this);
}
/// Similar to getBackedgeTakenCount, except return the least SCEV value that is
@@ -6445,8 +6653,13 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// must be cleared in this scope.
BackedgeTakenInfo Result = computeBackedgeTakenCount(L);
- if (Result.getExact(this) != getCouldNotCompute()) {
- assert(isLoopInvariant(Result.getExact(this), L) &&
+ // In a product build, there is no use of the statistics.
+ (void)NumTripCountsComputed;
+ (void)NumTripCountsNotComputed;
+#if LLVM_ENABLE_STATS || !defined(NDEBUG)
+ const SCEV *BEExact = Result.getExact(L, this);
+ if (BEExact != getCouldNotCompute()) {
+ assert(isLoopInvariant(BEExact, L) &&
isLoopInvariant(Result.getMax(this), L) &&
"Computed backedge-taken count isn't loop invariant for loop!");
++NumTripCountsComputed;
@@ -6456,6 +6669,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// Only count loops that have phi nodes as not being computable.
++NumTripCountsNotComputed;
}
+#endif // LLVM_ENABLE_STATS || !defined(NDEBUG)
// Now that we know more about the trip count for this loop, forget any
// existing SCEV values for PHI nodes in this loop since they are only
@@ -6591,6 +6805,12 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
}
}
+void ScalarEvolution::forgetTopmostLoop(const Loop *L) {
+ while (Loop *Parent = L->getParentLoop())
+ L = Parent;
+ forgetLoop(L);
+}
+
void ScalarEvolution::forgetValue(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return;
@@ -6619,28 +6839,35 @@ void ScalarEvolution::forgetValue(Value *V) {
}
/// Get the exact loop backedge taken count considering all loop exits. A
-/// computable result can only be returned for loops with a single exit.
-/// Returning the minimum taken count among all exits is incorrect because one
-/// of the loop's exit limit's may have been skipped. howFarToZero assumes that
-/// the limit of each loop test is never skipped. This is a valid assumption as
-/// long as the loop exits via that test. For precise results, it is the
-/// caller's responsibility to specify the relevant loop exit using
-/// getExact(ExitingBlock, SE).
+/// computable result can only be returned for loops with all exiting blocks
+/// dominating the latch. howFarToZero assumes that the limit of each loop test
+/// is never skipped. This is a valid assumption as long as the loop exits via
+/// that test. For precise results, it is the caller's responsibility to specify
+/// the relevant loop exiting block using getExact(ExitingBlock, SE).
const SCEV *
-ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE,
+ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE,
SCEVUnionPredicate *Preds) const {
// If any exits were not computable, the loop is not computable.
if (!isComplete() || ExitNotTaken.empty())
return SE->getCouldNotCompute();
- const SCEV *BECount = nullptr;
+ const BasicBlock *Latch = L->getLoopLatch();
+ // All exiting blocks we have collected must dominate the only backedge.
+ if (!Latch)
+ return SE->getCouldNotCompute();
+
+ // All exiting blocks we have gathered dominate the loop's latch, so the
+ // exact trip count is simply the minimum of all these calculated exit counts.
+ SmallVector<const SCEV *, 2> Ops;
for (auto &ENT : ExitNotTaken) {
- assert(ENT.ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV");
+ const SCEV *BECount = ENT.ExactNotTaken;
+ assert(BECount != SE->getCouldNotCompute() && "Bad exit SCEV!");
+ assert(SE->DT.dominates(ENT.ExitingBlock, Latch) &&
+ "We should only have known counts for exiting blocks that dominate "
+ "latch!");
+
+ Ops.push_back(BECount);
- if (!BECount)
- BECount = ENT.ExactNotTaken;
- else if (BECount != ENT.ExactNotTaken)
- return SE->getCouldNotCompute();
if (Preds && !ENT.hasAlwaysTruePredicate())
Preds->add(ENT.Predicate.get());
@@ -6648,8 +6875,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE,
"Predicate should be always true!");
}
- assert(BECount && "Invalid not taken count for loop exit");
- return BECount;
+ return SE->getUMinFromMismatchedTypes(Ops);
}
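
Because every gathered exiting block dominates the latch, all exit tests are reached on each iteration, and the exact backedge-taken count is the minimum of the per-exit counts computed above. A small standalone illustration with hypothetical exit bounds of 100 and 40:

#include <cassert>

int main() {
  // Two exit tests that are both executed on every iteration: the loop
  // leaves through whichever fires first, so the backedge-taken count is
  // the minimum of the two individual exit counts (40, not 100).
  int BackedgesTaken = 0;
  for (int i = 0; ; ++i, ++BackedgesTaken) {
    if (i == 100) break;
    if (i == 40) break;
  }
  assert(BackedgesTaken == 40);
  return 0;
}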
/// Get the exact not taken count for this loop exit.
@@ -6846,99 +7072,60 @@ ScalarEvolution::computeBackedgeTakenCount(const Loop *L,
ScalarEvolution::ExitLimit
ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
bool AllowPredicates) {
- // Okay, we've chosen an exiting block. See what condition causes us to exit
- // at this block and remember the exit block and whether all other targets
- // lead to the loop header.
- bool MustExecuteLoopHeader = true;
- BasicBlock *Exit = nullptr;
- for (auto *SBB : successors(ExitingBlock))
- if (!L->contains(SBB)) {
- if (Exit) // Multiple exit successors.
- return getCouldNotCompute();
- Exit = SBB;
- } else if (SBB != L->getHeader()) {
- MustExecuteLoopHeader = false;
- }
-
- // At this point, we know we have a conditional branch that determines whether
- // the loop is exited. However, we don't know if the branch is executed each
- // time through the loop. If not, then the execution count of the branch will
- // not be equal to the trip count of the loop.
- //
- // Currently we check for this by checking to see if the Exit branch goes to
- // the loop header. If so, we know it will always execute the same number of
- // times as the loop. We also handle the case where the exit block *is* the
- // loop header. This is common for un-rotated loops.
- //
- // If both of those tests fail, walk up the unique predecessor chain to the
- // header, stopping if there is an edge that doesn't exit the loop. If the
- // header is reached, the execution count of the branch will be equal to the
- // trip count of the loop.
- //
- // More extensive analysis could be done to handle more cases here.
- //
- if (!MustExecuteLoopHeader && ExitingBlock != L->getHeader()) {
- // The simple checks failed, try climbing the unique predecessor chain
- // up to the header.
- bool Ok = false;
- for (BasicBlock *BB = ExitingBlock; BB; ) {
- BasicBlock *Pred = BB->getUniquePredecessor();
- if (!Pred)
- return getCouldNotCompute();
- TerminatorInst *PredTerm = Pred->getTerminator();
- for (const BasicBlock *PredSucc : PredTerm->successors()) {
- if (PredSucc == BB)
- continue;
- // If the predecessor has a successor that isn't BB and isn't
- // outside the loop, assume the worst.
- if (L->contains(PredSucc))
- return getCouldNotCompute();
- }
- if (Pred == L->getHeader()) {
- Ok = true;
- break;
- }
- BB = Pred;
- }
- if (!Ok)
- return getCouldNotCompute();
- }
+ assert(L->contains(ExitingBlock) && "Exit count for non-loop block?");
+ // If our exiting block does not dominate the latch, then its connection with
+ // the loop's exit limit may be far from trivial.
+ const BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || !DT.dominates(ExitingBlock, Latch))
+ return getCouldNotCompute();
bool IsOnlyExit = (L->getExitingBlock() != nullptr);
TerminatorInst *Term = ExitingBlock->getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
assert(BI->isConditional() && "If unconditional, it can't be in loop!");
+ bool ExitIfTrue = !L->contains(BI->getSuccessor(0));
+ assert(ExitIfTrue == L->contains(BI->getSuccessor(1)) &&
+ "It should have one successor in loop and one exit block!");
// Proceed to the next level to examine the exit condition expression.
return computeExitLimitFromCond(
- L, BI->getCondition(), BI->getSuccessor(0), BI->getSuccessor(1),
+ L, BI->getCondition(), ExitIfTrue,
/*ControlsExit=*/IsOnlyExit, AllowPredicates);
}
- if (SwitchInst *SI = dyn_cast<SwitchInst>(Term))
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(Term)) {
+ // For switch, make sure that there is a single exit from the loop.
+ BasicBlock *Exit = nullptr;
+ for (auto *SBB : successors(ExitingBlock))
+ if (!L->contains(SBB)) {
+ if (Exit) // Multiple exit successors.
+ return getCouldNotCompute();
+ Exit = SBB;
+ }
+ assert(Exit && "Exiting block must have at least one exit");
return computeExitLimitFromSingleExitSwitch(L, SI, Exit,
/*ControlsExit=*/IsOnlyExit);
+ }
return getCouldNotCompute();
}
ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCond(
- const Loop *L, Value *ExitCond, BasicBlock *TBB, BasicBlock *FBB,
+ const Loop *L, Value *ExitCond, bool ExitIfTrue,
bool ControlsExit, bool AllowPredicates) {
- ScalarEvolution::ExitLimitCacheTy Cache(L, TBB, FBB, AllowPredicates);
- return computeExitLimitFromCondCached(Cache, L, ExitCond, TBB, FBB,
+ ScalarEvolution::ExitLimitCacheTy Cache(L, ExitIfTrue, AllowPredicates);
+ return computeExitLimitFromCondCached(Cache, L, ExitCond, ExitIfTrue,
ControlsExit, AllowPredicates);
}
Optional<ScalarEvolution::ExitLimit>
ScalarEvolution::ExitLimitCache::find(const Loop *L, Value *ExitCond,
- BasicBlock *TBB, BasicBlock *FBB,
- bool ControlsExit, bool AllowPredicates) {
+ bool ExitIfTrue, bool ControlsExit,
+ bool AllowPredicates) {
(void)this->L;
- (void)this->TBB;
- (void)this->FBB;
+ (void)this->ExitIfTrue;
(void)this->AllowPredicates;
- assert(this->L == L && this->TBB == TBB && this->FBB == FBB &&
+ assert(this->L == L && this->ExitIfTrue == ExitIfTrue &&
this->AllowPredicates == AllowPredicates &&
"Variance in assumed invariant key components!");
auto Itr = TripCountMap.find({ExitCond, ControlsExit});
@@ -6948,47 +7135,48 @@ ScalarEvolution::ExitLimitCache::find(const Loop *L, Value *ExitCond,
}
void ScalarEvolution::ExitLimitCache::insert(const Loop *L, Value *ExitCond,
- BasicBlock *TBB, BasicBlock *FBB,
+ bool ExitIfTrue,
bool ControlsExit,
bool AllowPredicates,
const ExitLimit &EL) {
- assert(this->L == L && this->TBB == TBB && this->FBB == FBB &&
+ assert(this->L == L && this->ExitIfTrue == ExitIfTrue &&
this->AllowPredicates == AllowPredicates &&
"Variance in assumed invariant key components!");
auto InsertResult = TripCountMap.insert({{ExitCond, ControlsExit}, EL});
assert(InsertResult.second && "Expected successful insertion!");
(void)InsertResult;
+ (void)ExitIfTrue;
}
ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondCached(
- ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB,
- BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) {
+ ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, bool ExitIfTrue,
+ bool ControlsExit, bool AllowPredicates) {
if (auto MaybeEL =
- Cache.find(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates))
+ Cache.find(L, ExitCond, ExitIfTrue, ControlsExit, AllowPredicates))
return *MaybeEL;
- ExitLimit EL = computeExitLimitFromCondImpl(Cache, L, ExitCond, TBB, FBB,
+ ExitLimit EL = computeExitLimitFromCondImpl(Cache, L, ExitCond, ExitIfTrue,
ControlsExit, AllowPredicates);
- Cache.insert(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates, EL);
+ Cache.insert(L, ExitCond, ExitIfTrue, ControlsExit, AllowPredicates, EL);
return EL;
}
ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
- ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB,
- BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) {
+ ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, bool ExitIfTrue,
+ bool ControlsExit, bool AllowPredicates) {
// Check if the controlling expression for this loop is an And or Or.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
if (BO->getOpcode() == Instruction::And) {
// Recurse on the operands of the and.
- bool EitherMayExit = L->contains(TBB);
+ bool EitherMayExit = !ExitIfTrue;
ExitLimit EL0 = computeExitLimitFromCondCached(
- Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit,
- AllowPredicates);
+ Cache, L, BO->getOperand(0), ExitIfTrue,
+ ControlsExit && !EitherMayExit, AllowPredicates);
ExitLimit EL1 = computeExitLimitFromCondCached(
- Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit,
- AllowPredicates);
+ Cache, L, BO->getOperand(1), ExitIfTrue,
+ ControlsExit && !EitherMayExit, AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
@@ -7010,7 +7198,6 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
} else {
// Both conditions must be true at the same time for the loop to exit.
// For now, be conservative.
- assert(L->contains(FBB) && "Loop block has no successor in loop!");
if (EL0.MaxNotTaken == EL1.MaxNotTaken)
MaxBECount = EL0.MaxNotTaken;
if (EL0.ExactNotTaken == EL1.ExactNotTaken)
@@ -7031,13 +7218,13 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
}
if (BO->getOpcode() == Instruction::Or) {
// Recurse on the operands of the or.
- bool EitherMayExit = L->contains(FBB);
+ bool EitherMayExit = ExitIfTrue;
ExitLimit EL0 = computeExitLimitFromCondCached(
- Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit,
- AllowPredicates);
+ Cache, L, BO->getOperand(0), ExitIfTrue,
+ ControlsExit && !EitherMayExit, AllowPredicates);
ExitLimit EL1 = computeExitLimitFromCondCached(
- Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit,
- AllowPredicates);
+ Cache, L, BO->getOperand(1), ExitIfTrue,
+ ControlsExit && !EitherMayExit, AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
@@ -7059,7 +7246,6 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
} else {
// Both conditions must be false at the same time for the loop to exit.
// For now, be conservative.
- assert(L->contains(TBB) && "Loop block has no successor in loop!");
if (EL0.MaxNotTaken == EL1.MaxNotTaken)
MaxBECount = EL0.MaxNotTaken;
if (EL0.ExactNotTaken == EL1.ExactNotTaken)
@@ -7075,12 +7261,12 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
// Proceed to the next level to examine the icmp.
if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond)) {
ExitLimit EL =
- computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit);
+ computeExitLimitFromICmp(L, ExitCondICmp, ExitIfTrue, ControlsExit);
if (EL.hasFullInfo() || !AllowPredicates)
return EL;
// Try again, but use SCEV predicates this time.
- return computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit,
+ return computeExitLimitFromICmp(L, ExitCondICmp, ExitIfTrue, ControlsExit,
/*AllowPredicates=*/true);
}
@@ -7089,7 +7275,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
// preserve the CFG and is temporarily leaving constant conditions
// in place.
if (ConstantInt *CI = dyn_cast<ConstantInt>(ExitCond)) {
- if (L->contains(FBB) == !CI->getZExtValue())
+ if (ExitIfTrue == !CI->getZExtValue())
// The backedge is always taken.
return getCouldNotCompute();
else
@@ -7098,19 +7284,18 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
}
// If it's not an integer or pointer comparison then compute it the hard way.
- return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
+ return computeExitCountExhaustively(L, ExitCond, ExitIfTrue);
}
ScalarEvolution::ExitLimit
ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
ICmpInst *ExitCond,
- BasicBlock *TBB,
- BasicBlock *FBB,
+ bool ExitIfTrue,
bool ControlsExit,
bool AllowPredicates) {
// If the condition was exit on true, convert the condition to exit on false
ICmpInst::Predicate Pred;
- if (!L->contains(FBB))
+ if (!ExitIfTrue)
Pred = ExitCond->getPredicate();
else
Pred = ExitCond->getInversePredicate();
@@ -7192,7 +7377,7 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
}
auto *ExhaustiveCount =
- computeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
+ computeExitCountExhaustively(L, ExitCond, ExitIfTrue);
if (!isa<SCEVCouldNotCompute>(ExhaustiveCount))
return ExhaustiveCount;
@@ -8104,6 +8289,14 @@ const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
return getSCEVAtScope(getSCEV(V), L);
}
+const SCEV *ScalarEvolution::stripInjectiveFunctions(const SCEV *S) const {
+ if (const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(S))
+ return stripInjectiveFunctions(ZExt->getOperand());
+ if (const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(S))
+ return stripInjectiveFunctions(SExt->getOperand());
+ return S;
+}
+
/// Finds the minimum unsigned root of the following equation:
///
/// A * X = B (mod N)
@@ -8233,7 +8426,9 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
return getCouldNotCompute(); // Otherwise it will loop infinitely.
}
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V);
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(stripInjectiveFunctions(V));
+
if (!AddRec && AllowPredicates)
// Try to make this an AddRec using runtime tests, in the first X
// iterations of this loop, where X is the SCEV expression found by the
@@ -8641,43 +8836,88 @@ bool ScalarEvolution::isKnownNonZero(const SCEV *S) {
return isKnownNegative(S) || isKnownPositive(S);
}
+std::pair<const SCEV *, const SCEV *>
+ScalarEvolution::SplitIntoInitAndPostInc(const Loop *L, const SCEV *S) {
+ // Compute SCEV on entry of loop L.
+ const SCEV *Start = SCEVInitRewriter::rewrite(S, L, *this);
+ if (Start == getCouldNotCompute())
+ return { Start, Start };
+ // Compute post increment SCEV for loop L.
+ const SCEV *PostInc = SCEVPostIncRewriter::rewrite(S, L, *this);
+ assert(PostInc != getCouldNotCompute() && "Unexpected could not compute");
+ return { Start, PostInc };
+}
+
+bool ScalarEvolution::isKnownViaInduction(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS) {
+ // First collect all loops.
+ SmallPtrSet<const Loop *, 8> LoopsUsed;
+ getUsedLoops(LHS, LoopsUsed);
+ getUsedLoops(RHS, LoopsUsed);
+
+ if (LoopsUsed.empty())
+ return false;
+
+ // Domination relationship must be a linear order on collected loops.
+#ifndef NDEBUG
+ for (auto *L1 : LoopsUsed)
+ for (auto *L2 : LoopsUsed)
+ assert((DT.dominates(L1->getHeader(), L2->getHeader()) ||
+ DT.dominates(L2->getHeader(), L1->getHeader())) &&
+ "Domination relationship is not a linear order");
+#endif
+
+ const Loop *MDL =
+ *std::max_element(LoopsUsed.begin(), LoopsUsed.end(),
+ [&](const Loop *L1, const Loop *L2) {
+ return DT.properlyDominates(L1->getHeader(), L2->getHeader());
+ });
+
+ // Get init and post increment value for LHS.
+ auto SplitLHS = SplitIntoInitAndPostInc(MDL, LHS);
+ // If LHS contains an unknown non-invariant SCEV, then bail out.
+ if (SplitLHS.first == getCouldNotCompute())
+ return false;
+ assert (SplitLHS.second != getCouldNotCompute() && "Unexpected CNC");
+ // Get init and post increment value for RHS.
+ auto SplitRHS = SplitIntoInitAndPostInc(MDL, RHS);
+ // If RHS contains an unknown non-invariant SCEV, then bail out.
+ if (SplitRHS.first == getCouldNotCompute())
+ return false;
+ assert (SplitRHS.second != getCouldNotCompute() && "Unexpected CNC");
+ // It is possible that init SCEV contains an invariant load but it does
+ // not dominate MDL and is not available at MDL loop entry, so we should
+ // check it here.
+ if (!isAvailableAtLoopEntry(SplitLHS.first, MDL) ||
+ !isAvailableAtLoopEntry(SplitRHS.first, MDL))
+ return false;
+
+ return isLoopEntryGuardedByCond(MDL, Pred, SplitLHS.first, SplitRHS.first) &&
+ isLoopBackedgeGuardedByCond(MDL, Pred, SplitLHS.second,
+ SplitRHS.second);
+}
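
The proof above is induction over a single dominating loop: the predicate is shown for the init values at loop entry, and the backedge is shown to be taken only while the post-increment values still satisfy it. A hypothetical loop shape where this reasoning applies (plain C++; the function name fill is made up):

// LHS is the AddRec {0,+,1}<L> and RHS is the loop-invariant n. The entry
// guard proves the init values satisfy 0 < n, and the backedge is only taken
// while the incremented (post-inc) value still satisfies i < n, so i < n is
// known on every iteration of the body.
void fill(long n, long *a) {
  if (n <= 0)                  // entry guard: 0 < n holds past this point
    return;
  for (long i = 0; i < n; ++i) // backedge guarded by the post-inc comparison
    a[i] = 0;                  // here i < n is known by induction
}

int main() {
  long buf[8] = {1, 1, 1, 1, 1, 1, 1, 1};
  fill(8, buf);
  return 0;
}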
+
bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
// Canonicalize the inputs first.
(void)SimplifyICmpOperands(Pred, LHS, RHS);
- // If LHS or RHS is an addrec, check to see if the condition is true in
- // every iteration of the loop.
- // If LHS and RHS are both addrec, both conditions must be true in
- // every iteration of the loop.
- const SCEVAddRecExpr *LAR = dyn_cast<SCEVAddRecExpr>(LHS);
- const SCEVAddRecExpr *RAR = dyn_cast<SCEVAddRecExpr>(RHS);
- bool LeftGuarded = false;
- bool RightGuarded = false;
- if (LAR) {
- const Loop *L = LAR->getLoop();
- if (isLoopEntryGuardedByCond(L, Pred, LAR->getStart(), RHS) &&
- isLoopBackedgeGuardedByCond(L, Pred, LAR->getPostIncExpr(*this), RHS)) {
- if (!RAR) return true;
- LeftGuarded = true;
- }
- }
- if (RAR) {
- const Loop *L = RAR->getLoop();
- if (isLoopEntryGuardedByCond(L, Pred, LHS, RAR->getStart()) &&
- isLoopBackedgeGuardedByCond(L, Pred, LHS, RAR->getPostIncExpr(*this))) {
- if (!LAR) return true;
- RightGuarded = true;
- }
- }
- if (LeftGuarded && RightGuarded)
+ if (isKnownViaInduction(Pred, LHS, RHS))
return true;
if (isKnownPredicateViaSplitting(Pred, LHS, RHS))
return true;
- // Otherwise see what can be done with known constant ranges.
- return isKnownPredicateViaConstantRanges(Pred, LHS, RHS);
+ // Otherwise see what can be done with some simple reasoning.
+ return isKnownViaNonRecursiveReasoning(Pred, LHS, RHS);
+}
+
+bool ScalarEvolution::isKnownOnEveryIteration(ICmpInst::Predicate Pred,
+ const SCEVAddRecExpr *LHS,
+ const SCEV *RHS) {
+ const Loop *L = LHS->getLoop();
+ return isLoopEntryGuardedByCond(L, Pred, LHS->getStart(), RHS) &&
+ isLoopBackedgeGuardedByCond(L, Pred, LHS->getPostIncExpr(*this), RHS);
}
bool ScalarEvolution::isMonotonicPredicate(const SCEVAddRecExpr *LHS,
@@ -8944,7 +9184,7 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
// (interprocedural conditions notwithstanding).
if (!L) return true;
- if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS))
+ if (isKnownViaNonRecursiveReasoning(Pred, LHS, RHS))
return true;
BasicBlock *Latch = L->getLoopLatch();
@@ -9049,9 +9289,68 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
// (interprocedural conditions notwithstanding).
if (!L) return false;
- if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS))
+ // Both LHS and RHS must be available at loop entry.
+ assert(isAvailableAtLoopEntry(LHS, L) &&
+ "LHS is not available at Loop Entry");
+ assert(isAvailableAtLoopEntry(RHS, L) &&
+ "RHS is not available at Loop Entry");
+
+ if (isKnownViaNonRecursiveReasoning(Pred, LHS, RHS))
return true;
+ // If we cannot prove strict comparison (e.g. a > b), maybe we can prove
+ // the facts (a >= b && a != b) separately. A typical situation is when the
+ // non-strict comparison is known from ranges and non-equality is known from
+ // dominating predicates. If we are proving strict comparison, we always try
+ // to prove non-equality and non-strict comparison separately.
+ auto NonStrictPredicate = ICmpInst::getNonStrictPredicate(Pred);
+ const bool ProvingStrictComparison = (Pred != NonStrictPredicate);
+ bool ProvedNonStrictComparison = false;
+ bool ProvedNonEquality = false;
+
+ if (ProvingStrictComparison) {
+ ProvedNonStrictComparison =
+ isKnownViaNonRecursiveReasoning(NonStrictPredicate, LHS, RHS);
+ ProvedNonEquality =
+ isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_NE, LHS, RHS);
+ if (ProvedNonStrictComparison && ProvedNonEquality)
+ return true;
+ }
+
+ // Try to prove (Pred, LHS, RHS) using isImpliedViaGuard.
+ auto ProveViaGuard = [&](BasicBlock *Block) {
+ if (isImpliedViaGuard(Block, Pred, LHS, RHS))
+ return true;
+ if (ProvingStrictComparison) {
+ if (!ProvedNonStrictComparison)
+ ProvedNonStrictComparison =
+ isImpliedViaGuard(Block, NonStrictPredicate, LHS, RHS);
+ if (!ProvedNonEquality)
+ ProvedNonEquality =
+ isImpliedViaGuard(Block, ICmpInst::ICMP_NE, LHS, RHS);
+ if (ProvedNonStrictComparison && ProvedNonEquality)
+ return true;
+ }
+ return false;
+ };
+
+ // Try to prove (Pred, LHS, RHS) using isImpliedCond.
+ auto ProveViaCond = [&](Value *Condition, bool Inverse) {
+ if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse))
+ return true;
+ if (ProvingStrictComparison) {
+ if (!ProvedNonStrictComparison)
+ ProvedNonStrictComparison =
+ isImpliedCond(NonStrictPredicate, LHS, RHS, Condition, Inverse);
+ if (!ProvedNonEquality)
+ ProvedNonEquality =
+ isImpliedCond(ICmpInst::ICMP_NE, LHS, RHS, Condition, Inverse);
+ if (ProvedNonStrictComparison && ProvedNonEquality)
+ return true;
+ }
+ return false;
+ };
+
// Starting at the loop predecessor, climb up the predecessor chain, as long
// as there are predecessors that can be found that have unique successors
// leading to the original header.
@@ -9060,7 +9359,7 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
Pair.first;
Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
- if (isImpliedViaGuard(Pair.first, Pred, LHS, RHS))
+ if (ProveViaGuard(Pair.first))
return true;
BranchInst *LoopEntryPredicate =
@@ -9069,9 +9368,8 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
LoopEntryPredicate->isUnconditional())
continue;
- if (isImpliedCond(Pred, LHS, RHS,
- LoopEntryPredicate->getCondition(),
- LoopEntryPredicate->getSuccessor(0) != Pair.second))
+ if (ProveViaCond(LoopEntryPredicate->getCondition(),
+ LoopEntryPredicate->getSuccessor(0) != Pair.second))
return true;
}
@@ -9083,7 +9381,7 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
if (!DT.dominates(CI, L->getHeader()))
continue;
- if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false))
+ if (ProveViaCond(CI->getArgOperand(0), false))
return true;
}
@@ -9318,17 +9616,25 @@ Optional<APInt> ScalarEvolution::computeConstantDifference(const SCEV *More,
return M - L;
}
- const SCEV *L, *R;
SCEV::NoWrapFlags Flags;
- if (splitBinaryAdd(Less, L, R, Flags))
- if (const auto *LC = dyn_cast<SCEVConstant>(L))
- if (R == More)
- return -(LC->getAPInt());
-
- if (splitBinaryAdd(More, L, R, Flags))
- if (const auto *LC = dyn_cast<SCEVConstant>(L))
- if (R == Less)
- return LC->getAPInt();
+ const SCEV *LLess = nullptr, *RLess = nullptr;
+ const SCEV *LMore = nullptr, *RMore = nullptr;
+ const SCEVConstant *C1 = nullptr, *C2 = nullptr;
+ // Compare (X + C1) vs X.
+ if (splitBinaryAdd(Less, LLess, RLess, Flags))
+ if ((C1 = dyn_cast<SCEVConstant>(LLess)))
+ if (RLess == More)
+ return -(C1->getAPInt());
+
+ // Compare X vs (X + C2).
+ if (splitBinaryAdd(More, LMore, RMore, Flags))
+ if ((C2 = dyn_cast<SCEVConstant>(LMore)))
+ if (RMore == Less)
+ return C2->getAPInt();
+
+ // Compare (X + C1) vs (X + C2).
+ if (C1 && C2 && RLess == RMore)
+ return C2->getAPInt() - C1->getAPInt();
return None;
}
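
The added case handles two expressions that differ only in their constant addend, so the difference is the difference of the constants. A standalone check of the arithmetic, with a hypothetical value standing in for the shared symbolic part X:

#include <cassert>

int main() {
  // More = X + 8 and Less = X + 3 share the symbolic part X, so
  // More - Less is the constant 8 - 3 = 5 regardless of the value of X.
  long X = 12345;
  long Less = X + 3, More = X + 8;
  assert(More - Less == 5);
  return 0;
}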
@@ -9405,10 +9711,121 @@ bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow(
}
// Try to prove (1) or (2), as needed.
- return isLoopEntryGuardedByCond(L, Pred, FoundRHS,
+ return isAvailableAtLoopEntry(FoundRHS, L) &&
+ isLoopEntryGuardedByCond(L, Pred, FoundRHS,
getConstant(FoundRHSLimit));
}
+bool ScalarEvolution::isImpliedViaMerge(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS,
+ const SCEV *FoundLHS,
+ const SCEV *FoundRHS, unsigned Depth) {
+ const PHINode *LPhi = nullptr, *RPhi = nullptr;
+
+ auto ClearOnExit = make_scope_exit([&]() {
+ if (LPhi) {
+ bool Erased = PendingMerges.erase(LPhi);
+ assert(Erased && "Failed to erase LPhi!");
+ (void)Erased;
+ }
+ if (RPhi) {
+ bool Erased = PendingMerges.erase(RPhi);
+ assert(Erased && "Failed to erase RPhi!");
+ (void)Erased;
+ }
+ });
+
+ // Find the respective Phis and check that they are not already pending.
+ if (const SCEVUnknown *LU = dyn_cast<SCEVUnknown>(LHS))
+ if (auto *Phi = dyn_cast<PHINode>(LU->getValue())) {
+ if (!PendingMerges.insert(Phi).second)
+ return false;
+ LPhi = Phi;
+ }
+ if (const SCEVUnknown *RU = dyn_cast<SCEVUnknown>(RHS))
+ if (auto *Phi = dyn_cast<PHINode>(RU->getValue())) {
+ // If we detect a loop of Phi nodes being processed by this method, for
+ // example:
+ //
+ // %a = phi i32 [ %some1, %preheader ], [ %b, %latch ]
+ // %b = phi i32 [ %some2, %preheader ], [ %a, %latch ]
+ //
+ // we don't want to deal with a case that complex, so we return the
+ // conservative answer false.
+ if (!PendingMerges.insert(Phi).second)
+ return false;
+ RPhi = Phi;
+ }
+
+ // If neither LHS nor RHS is a Phi, there is nothing to do here.
+ if (!LPhi && !RPhi)
+ return false;
+
+ // If there is a SCEVUnknown Phi we are interested in, make it left.
+ if (!LPhi) {
+ std::swap(LHS, RHS);
+ std::swap(FoundLHS, FoundRHS);
+ std::swap(LPhi, RPhi);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ assert(LPhi && "LPhi should definitely be a SCEVUnknown Phi!");
+ const BasicBlock *LBB = LPhi->getParent();
+ const SCEVAddRecExpr *RAR = dyn_cast<SCEVAddRecExpr>(RHS);
+
+ auto ProvedEasily = [&](const SCEV *S1, const SCEV *S2) {
+ return isKnownViaNonRecursiveReasoning(Pred, S1, S2) ||
+ isImpliedCondOperandsViaRanges(Pred, S1, S2, FoundLHS, FoundRHS) ||
+ isImpliedViaOperations(Pred, S1, S2, FoundLHS, FoundRHS, Depth);
+ };
+
+ if (RPhi && RPhi->getParent() == LBB) {
+ // Case one: RHS is also a SCEVUnknown Phi from the same basic block.
+ // If we compare two Phis from the same block, and for each entry block
+ // the predicate is true for incoming values from this block, then the
+ // predicate is also true for the Phis.
+ for (const BasicBlock *IncBB : predecessors(LBB)) {
+ const SCEV *L = getSCEV(LPhi->getIncomingValueForBlock(IncBB));
+ const SCEV *R = getSCEV(RPhi->getIncomingValueForBlock(IncBB));
+ if (!ProvedEasily(L, R))
+ return false;
+ }
+ } else if (RAR && RAR->getLoop()->getHeader() == LBB) {
+ // Case two: RHS is also a Phi from the same basic block, and it is an
+ // AddRec. This means there is a loop which has both an AddRec and an
+ // Unknown PHI; for it we can compare the incoming values of the AddRec
+ // from above the loop and from the latch with the respective incoming
+ // values of LPhi.
+ // TODO: Generalize to handle loops with many inputs in a header.
+ if (LPhi->getNumIncomingValues() != 2) return false;
+
+ auto *RLoop = RAR->getLoop();
+ auto *Predecessor = RLoop->getLoopPredecessor();
+ assert(Predecessor && "Loop with AddRec with no predecessor?");
+ const SCEV *L1 = getSCEV(LPhi->getIncomingValueForBlock(Predecessor));
+ if (!ProvedEasily(L1, RAR->getStart()))
+ return false;
+ auto *Latch = RLoop->getLoopLatch();
+ assert(Latch && "Loop with AddRec with no latch?");
+ const SCEV *L2 = getSCEV(LPhi->getIncomingValueForBlock(Latch));
+ if (!ProvedEasily(L2, RAR->getPostIncExpr(*this)))
+ return false;
+ } else {
+ // In all other cases, go over the inputs of LHS and compare each of them to
+ // RHS; the predicate is true for (LHS, RHS) if it is true for all such pairs.
+ // At this point RHS is either a non-Phi, or it is a Phi from some block
+ // different from LBB.
+ for (const BasicBlock *IncBB : predecessors(LBB)) {
+ // Check that RHS is available in this block.
+ if (!dominates(RHS, IncBB))
+ return false;
+ const SCEV *L = getSCEV(LPhi->getIncomingValueForBlock(IncBB));
+ if (!ProvedEasily(L, RHS))
+ return false;
+ }
+ }
+ return true;
+}
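
The fallback branch above proves a predicate for a SCEVUnknown PHI by proving it separately for every incoming value. A minimal sketch with plain ints standing in for SCEVs; the helper name provedForAllIncoming is made up:

#include <cassert>
#include <vector>

// Stand-in for the "ProvedEasily" check above: the predicate holds for the
// PHI if it holds for every incoming value.
bool provedForAllIncoming(const std::vector<int> &Incoming, int RHS) {
  for (int V : Incoming)
    if (!(V < RHS))
      return false;
  return true;
}

int main() {
  // %p = phi i32 [ 0, %bb1 ], [ 1, %bb2 ]; proving p < 2 reduces to proving
  // 0 < 2 and 1 < 2, one check per incoming value.
  assert(provedForAllIncoming({0, 1}, 2));
  return 0;
}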
+
bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
@@ -9562,13 +9979,14 @@ bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred,
};
// Acquire values from extensions.
+ auto *OrigLHS = LHS;
auto *OrigFoundLHS = FoundLHS;
LHS = GetOpFromSExt(LHS);
FoundLHS = GetOpFromSExt(FoundLHS);
// Whether the SGT predicate can be proved trivially or using the found context.
auto IsSGTViaContext = [&](const SCEV *S1, const SCEV *S2) {
- return isKnownViaSimpleReasoning(ICmpInst::ICMP_SGT, S1, S2) ||
+ return isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SGT, S1, S2) ||
isImpliedViaOperations(ICmpInst::ICMP_SGT, S1, S2, OrigFoundLHS,
FoundRHS, Depth + 1);
};
@@ -9669,11 +10087,17 @@ bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred,
}
}
+ // If our expression contained SCEVUnknown Phis, and we split it down and now
+ // need to prove something for them, try to prove the predicate for all
+ // possible incoming values of those Phis.
+ if (isImpliedViaMerge(Pred, OrigLHS, RHS, OrigFoundLHS, FoundRHS, Depth + 1))
+ return true;
+
return false;
}
bool
-ScalarEvolution::isKnownViaSimpleReasoning(ICmpInst::Predicate Pred,
+ScalarEvolution::isKnownViaNonRecursiveReasoning(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
@@ -9695,26 +10119,26 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
break;
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
- if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
- isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, RHS, FoundRHS))
+ if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
+ isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
- if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
- isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, RHS, FoundRHS))
+ if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
+ isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SLE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
- if (isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
- isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, RHS, FoundRHS))
+ if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
+ isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_UGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
- if (isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
- isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, RHS, FoundRHS))
+ if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
+ isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, RHS, FoundRHS))
return true;
break;
}
@@ -10192,6 +10616,31 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range,
return SE.getCouldNotCompute();
}
+const SCEVAddRecExpr *
+SCEVAddRecExpr::getPostIncExpr(ScalarEvolution &SE) const {
+ assert(getNumOperands() > 1 && "AddRec with zero step?");
+ // There is a temptation to just call getAddExpr(this, getStepRecurrence(SE)),
+ // but in this case we cannot guarantee that the value returned will be an
+ // AddRec because SCEV does not have a fixed point where it stops
+ // simplification: it is legal to return ({rec1} + {rec2}). For example, it
+ // may happen if we reach arithmetic depth limit while simplifying. So we
+ // construct the returned value explicitly.
+ SmallVector<const SCEV *, 3> Ops;
+ // If this is {A,+,B,+,C,...,+,N}, then its step is {B,+,C,+,...,+,N}, and
+ // (this + Step) is {A+B,+,B+C,+...,+,N}.
+ for (unsigned i = 0, e = getNumOperands() - 1; i < e; ++i)
+ Ops.push_back(SE.getAddExpr(getOperand(i), getOperand(i + 1)));
+ // We know that the last operand is not a constant zero (otherwise it would
+ // have been popped out earlier). This guarantees us that if the result has
+ // the same last operand, then it will also not be popped out, meaning that
+ // the returned value will be an AddRec.
+ const SCEV *Last = getOperand(getNumOperands() - 1);
+ assert(!Last->isZero() && "Recurrence with zero step?");
+ Ops.push_back(Last);
+ return cast<SCEVAddRecExpr>(SE.getAddRecExpr(Ops, getLoop(),
+ SCEV::FlagAnyWrap));
+}
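
The pairwise-sum construction can be checked numerically: for the quadratic recurrence {0,+,1,+,2}, the post-increment value is {0+1,+,1+2,+,2} = {1,+,3,+,2}, which evaluates to the original recurrence shifted by one iteration. A standalone sketch with a hypothetical evalAddRec helper (chains of recurrences evaluate via binomial coefficients):

#include <cassert>

// Evaluate the chain of recurrences {c0,+,c1,+,c2} at iteration i:
// c0 + c1*C(i,1) + c2*C(i,2). Hypothetical long coefficients, not SCEVs.
long evalAddRec(long c0, long c1, long c2, long i) {
  return c0 + c1 * i + c2 * (i * (i - 1) / 2);
}

int main() {
  // {1,+,3,+,2} is the post-increment of {0,+,1,+,2}: it matches the
  // original recurrence evaluated one iteration later.
  for (long i = 0; i < 16; ++i)
    assert(evalAddRec(1, 3, 2, i) == evalAddRec(0, 1, 2, i + 1));
  return 0;
}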
+
// Return true when S contains at least an undef value.
static inline bool containsUndefs(const SCEV *S) {
return SCEVExprContains(S, [](const SCEV *S) {
@@ -10334,22 +10783,22 @@ void ScalarEvolution::collectParametricTerms(const SCEV *Expr,
SCEVCollectStrides StrideCollector(*this, Strides);
visitAll(Expr, StrideCollector);
- DEBUG({
- dbgs() << "Strides:\n";
- for (const SCEV *S : Strides)
- dbgs() << *S << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Strides:\n";
+ for (const SCEV *S : Strides)
+ dbgs() << *S << "\n";
+ });
for (const SCEV *S : Strides) {
SCEVCollectTerms TermCollector(Terms);
visitAll(S, TermCollector);
}
- DEBUG({
- dbgs() << "Terms:\n";
- for (const SCEV *T : Terms)
- dbgs() << *T << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Terms:\n";
+ for (const SCEV *T : Terms)
+ dbgs() << *T << "\n";
+ });
SCEVCollectAddRecMultiplies MulCollector(Terms, *this);
visitAll(Expr, MulCollector);
@@ -10460,18 +10909,18 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
if (!containsParameters(Terms))
return;
- DEBUG({
- dbgs() << "Terms:\n";
- for (const SCEV *T : Terms)
- dbgs() << *T << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Terms:\n";
+ for (const SCEV *T : Terms)
+ dbgs() << *T << "\n";
+ });
// Remove duplicates.
array_pod_sort(Terms.begin(), Terms.end());
Terms.erase(std::unique(Terms.begin(), Terms.end()), Terms.end());
// Put larger terms first.
- std::sort(Terms.begin(), Terms.end(), [](const SCEV *LHS, const SCEV *RHS) {
+ llvm::sort(Terms.begin(), Terms.end(), [](const SCEV *LHS, const SCEV *RHS) {
return numberOfTerms(LHS) > numberOfTerms(RHS);
});
@@ -10491,11 +10940,11 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
if (const SCEV *NewT = removeConstantFactors(*this, T))
NewTerms.push_back(NewT);
- DEBUG({
- dbgs() << "Terms after sorting:\n";
- for (const SCEV *T : NewTerms)
- dbgs() << *T << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Terms after sorting:\n";
+ for (const SCEV *T : NewTerms)
+ dbgs() << *T << "\n";
+ });
if (NewTerms.empty() || !findArrayDimensionsRec(*this, NewTerms, Sizes)) {
Sizes.clear();
@@ -10505,11 +10954,11 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
// The last element to be pushed into Sizes is the size of an element.
Sizes.push_back(ElementSize);
- DEBUG({
- dbgs() << "Sizes:\n";
- for (const SCEV *S : Sizes)
- dbgs() << *S << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Sizes:\n";
+ for (const SCEV *S : Sizes)
+ dbgs() << *S << "\n";
+ });
}
void ScalarEvolution::computeAccessFunctions(
@@ -10529,13 +10978,13 @@ void ScalarEvolution::computeAccessFunctions(
const SCEV *Q, *R;
SCEVDivision::divide(*this, Res, Sizes[i], &Q, &R);
- DEBUG({
- dbgs() << "Res: " << *Res << "\n";
- dbgs() << "Sizes[i]: " << *Sizes[i] << "\n";
- dbgs() << "Res divided by Sizes[i]:\n";
- dbgs() << "Quotient: " << *Q << "\n";
- dbgs() << "Remainder: " << *R << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Res: " << *Res << "\n";
+ dbgs() << "Sizes[i]: " << *Sizes[i] << "\n";
+ dbgs() << "Res divided by Sizes[i]:\n";
+ dbgs() << "Quotient: " << *Q << "\n";
+ dbgs() << "Remainder: " << *R << "\n";
+ });
Res = Q;
@@ -10563,11 +11012,11 @@ void ScalarEvolution::computeAccessFunctions(
std::reverse(Subscripts.begin(), Subscripts.end());
- DEBUG({
- dbgs() << "Subscripts:\n";
- for (const SCEV *S : Subscripts)
- dbgs() << *S << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Subscripts:\n";
+ for (const SCEV *S : Subscripts)
+ dbgs() << *S << "\n";
+ });
}
/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
@@ -10641,17 +11090,17 @@ void ScalarEvolution::delinearize(const SCEV *Expr,
if (Subscripts.empty())
return;
- DEBUG({
- dbgs() << "succeeded to delinearize " << *Expr << "\n";
- dbgs() << "ArrayDecl[UnknownSize]";
- for (const SCEV *S : Sizes)
- dbgs() << "[" << *S << "]";
+ LLVM_DEBUG({
+ dbgs() << "succeeded to delinearize " << *Expr << "\n";
+ dbgs() << "ArrayDecl[UnknownSize]";
+ for (const SCEV *S : Sizes)
+ dbgs() << "[" << *S << "]";
- dbgs() << "\nArrayRef";
- for (const SCEV *S : Subscripts)
- dbgs() << "[" << *S << "]";
- dbgs() << "\n";
- });
+ dbgs() << "\nArrayRef";
+ for (const SCEV *S : Subscripts)
+ dbgs() << "[" << *S << "]";
+ dbgs() << "\n";
+ });
}
//===----------------------------------------------------------------------===//
@@ -10728,6 +11177,8 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
LI(Arg.LI), CouldNotCompute(std::move(Arg.CouldNotCompute)),
ValueExprMap(std::move(Arg.ValueExprMap)),
PendingLoopPredicates(std::move(Arg.PendingLoopPredicates)),
+ PendingPhiRanges(std::move(Arg.PendingPhiRanges)),
+ PendingMerges(std::move(Arg.PendingMerges)),
MinTrailingZerosCache(std::move(Arg.MinTrailingZerosCache)),
BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)),
PredicatedBackedgeTakenCounts(
@@ -10771,6 +11222,8 @@ ScalarEvolution::~ScalarEvolution() {
BTCI.second.clear();
assert(PendingLoopPredicates.empty() && "isImpliedCond garbage");
+ assert(PendingPhiRanges.empty() && "getRangeRef garbage");
+ assert(PendingMerges.empty() && "isImpliedViaMerge garbage");
assert(!WalkingBEDominatingConds && "isLoopBackedgeGuardedByCond garbage!");
assert(!ProvingSplitPredicate && "ProvingSplitPredicate garbage!");
}
@@ -11181,9 +11634,13 @@ ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts);
}
-void ScalarEvolution::addToLoopUseLists(const SCEV *S) {
+void
+ScalarEvolution::getUsedLoops(const SCEV *S,
+ SmallPtrSetImpl<const Loop *> &LoopsUsed) {
struct FindUsedLoops {
- SmallPtrSet<const Loop *, 8> LoopsUsed;
+ FindUsedLoops(SmallPtrSetImpl<const Loop *> &LoopsUsed)
+ : LoopsUsed(LoopsUsed) {}
+ SmallPtrSetImpl<const Loop *> &LoopsUsed;
bool follow(const SCEV *S) {
if (auto *AR = dyn_cast<SCEVAddRecExpr>(S))
LoopsUsed.insert(AR->getLoop());
@@ -11193,10 +11650,14 @@ void ScalarEvolution::addToLoopUseLists(const SCEV *S) {
bool isDone() const { return false; }
};
- FindUsedLoops F;
+ FindUsedLoops F(LoopsUsed);
SCEVTraversal<FindUsedLoops>(F).visitAll(S);
+}
- for (auto *L : F.LoopsUsed)
+void ScalarEvolution::addToLoopUseLists(const SCEV *S) {
+ SmallPtrSet<const Loop *, 8> LoopsUsed;
+ getUsedLoops(S, LoopsUsed);
+ for (auto *L : LoopsUsed)
LoopUsers[L].push_back(S);
}
@@ -11472,8 +11933,6 @@ private:
// couldn't create an AddRec for it, or couldn't add the predicate), we just
// return \p Expr.
const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) {
- if (!VersionUnknown)
- return Expr;
if (!isa<PHINode>(Expr->getValue()))
return Expr;
Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
@@ -11481,6 +11940,12 @@ private:
if (!PredicatedRewrite)
return Expr;
for (auto *P : PredicatedRewrite->second){
+ // Wrap predicates from outer loops are not supported.
+ if (auto *WP = dyn_cast<const SCEVWrapPredicate>(P)) {
+ auto *AR = cast<const SCEVAddRecExpr>(WP->getExpr());
+ if (L != AR->getLoop())
+ return Expr;
+ }
if (!addOverflowAssumption(P))
return Expr;
}
@@ -11786,3 +12251,43 @@ void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const {
OS.indent(Depth + 2) << "--> " << *II->second.second << "\n";
}
}
+
+// Match the mathematical pattern A - (A / B) * B, where A and B can be
+// arbitrary expressions.
+// It's not always easy, as A and B can be folded (imagine A is X / 2, and B is
+// 4, A / B becomes X / 8).
+bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
+ const SCEV *&RHS) {
+ const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
+ if (Add == nullptr || Add->getNumOperands() != 2)
+ return false;
+
+ const SCEV *A = Add->getOperand(1);
+ const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
+
+ if (Mul == nullptr)
+ return false;
+
+ const auto MatchURemWithDivisor = [&](const SCEV *B) {
+ // (SomeExpr + (-(SomeExpr / B) * B)).
+ if (Expr == getURemExpr(A, B)) {
+ LHS = A;
+ RHS = B;
+ return true;
+ }
+ return false;
+ };
+
+ // (SomeExpr + (-1 * (SomeExpr / B) * B)).
+ if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
+ return MatchURemWithDivisor(Mul->getOperand(1)) ||
+ MatchURemWithDivisor(Mul->getOperand(2));
+
+ // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
+ if (Mul->getNumOperands() == 2)
+ return MatchURemWithDivisor(Mul->getOperand(1)) ||
+ MatchURemWithDivisor(Mul->getOperand(0)) ||
+ MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(1))) ||
+ MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0)));
+ return false;
+}
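
A standalone check of the identity behind matchURem, using plain unsigned arithmetic with arbitrary values: for unsigned A and nonzero B, A - (A / B) * B equals A urem B.

#include <cassert>
#include <cstdint>

int main() {
  // The pattern matched above, before it is canonicalized into an add of a
  // negated multiply as in the SCEV form.
  uint32_t A = 29, B = 7;
  assert(A - (A / B) * B == A % B); // 29 - 4 * 7 == 1
  return 0;
}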
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
index 53ce33bacbe9..8f89389c4b5d 100644
--- a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -589,6 +589,12 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
return expand(SE.getAddExpr(Ops));
}
+Value *SCEVExpander::expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty,
+ Value *V) {
+ const SCEV *const Ops[1] = {Op};
+ return expandAddToGEP(Ops, Ops + 1, PTy, Ty, V);
+}
+
/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for
/// SCEV expansion. If they are nested, this is the most nested. If they are
/// neighboring, pick the later.
@@ -1036,8 +1042,7 @@ Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
if (!isa<ConstantInt>(StepV))
GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()),
GEPPtrTy->getAddressSpace());
- const SCEV *const StepArray[1] = { SE.getSCEV(StepV) };
- IncV = expandAddToGEP(StepArray, StepArray+1, GEPPtrTy, IntTy, PN);
+ IncV = expandAddToGEP(SE.getSCEV(StepV), GEPPtrTy, IntTy, PN);
if (IncV->getType() != PN->getType()) {
IncV = Builder.CreateBitCast(IncV, PN->getType());
rememberInstruction(IncV);
@@ -1051,7 +1056,7 @@ Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
return IncV;
}
-/// \brief Hoist the addrec instruction chain rooted in the loop phi above the
+/// Hoist the addrec instruction chain rooted in the loop phi above the
/// position. This routine assumes that this is possible (has been checked).
void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
Instruction *Pos, PHINode *LoopPhi) {
@@ -1067,7 +1072,7 @@ void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
} while (InstToHoist != LoopPhi);
}
-/// \brief Check whether we can cheaply express the requested SCEV in terms of
+/// Check whether we can cheaply express the requested SCEV in terms of
/// the available PHI SCEV by truncation and/or inversion of the step.
static bool canBeCheaplyTransformed(ScalarEvolution &SE,
const SCEVAddRecExpr *Phi,
@@ -1169,8 +1174,11 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
if (!IsMatchingSCEV && !TryNonMatchingSCEV)
continue;
+ // TODO: this can possibly be reworked to avoid the cast altogether.
Instruction *TempIncV =
- cast<Instruction>(PN.getIncomingValueForBlock(LatchBlock));
+ dyn_cast<Instruction>(PN.getIncomingValueForBlock(LatchBlock));
+ if (!TempIncV)
+ continue;
// Check whether we can reuse this PHI node.
if (LSRMode) {
@@ -1387,7 +1395,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
// IVUsers tries to prevent this case, so it is rare. However, it can
// happen when an IVUser outside the loop is not dominated by the latch
// block. Adjusting IVIncInsertPos before expansion begins cannot handle
- // all cases. Consider a phi outide whose operand is replaced during
+ // all cases. Consider a phi outside whose operand is replaced during
// expansion with the value of the postinc user. Without fundamentally
// changing the way postinc users are tracked, the only remedy is
// inserting an extra IV increment. StepV might fold into PostLoopOffset,
@@ -1407,7 +1415,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
}
// We have decided to reuse an induction variable of a dominating loop. Apply
- // truncation and/or invertion of the step.
+ // truncation and/or inversion of the step.
if (TruncTy) {
Type *ResTy = Result->getType();
// Normalize the result type.
@@ -1440,12 +1448,9 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
if (Result->getType()->isIntegerTy()) {
Value *Base = expandCodeFor(PostLoopOffset, ExpandTy);
- const SCEV *const OffsetArray[1] = {SE.getUnknown(Result)};
- Result = expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Base);
+ Result = expandAddToGEP(SE.getUnknown(Result), PTy, IntTy, Base);
} else {
- const SCEV *const OffsetArray[1] = {PostLoopOffset};
- Result =
- expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Result);
+ Result = expandAddToGEP(PostLoopOffset, PTy, IntTy, Result);
}
} else {
Result = InsertNoopCastOfTo(Result, IntTy);
@@ -1497,9 +1502,9 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
// Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
// comments on expandAddToGEP for details.
const SCEV *Base = S->getStart();
- const SCEV *RestArray[1] = { Rest };
// Dig into the expression to find the pointer base for a GEP.
- ExposePointerBase(Base, RestArray[0], SE);
+ const SCEV *ExposedRest = Rest;
+ ExposePointerBase(Base, ExposedRest, SE);
// If we found a pointer, expand the AddRec with a GEP.
if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
// Make sure the Base isn't something exotic, such as a multiplied
@@ -1508,7 +1513,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
Value *StartV = expand(Base);
assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
- return expandAddToGEP(RestArray, RestArray+1, PTy, Ty, StartV);
+ return expandAddToGEP(ExposedRest, PTy, Ty, StartV);
}
}
@@ -1862,7 +1867,7 @@ SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
Phis.push_back(&PN);
if (TTI)
- std::sort(Phis.begin(), Phis.end(), [](Value *LHS, Value *RHS) {
+ llvm::sort(Phis.begin(), Phis.end(), [](Value *LHS, Value *RHS) {
// Put pointers at the back and make sure pointer < pointer = false.
if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
return RHS->getType()->isIntegerTy() && !LHS->getType()->isIntegerTy();
@@ -2154,8 +2159,9 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
const SCEV *Step = AR->getStepRecurrence(SE);
const SCEV *Start = AR->getStart();
+ Type *ARTy = AR->getType();
unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType());
- unsigned DstBits = SE.getTypeSizeInBits(AR->getType());
+ unsigned DstBits = SE.getTypeSizeInBits(ARTy);
// The expression {Start,+,Step} has nusw/nssw if
// Step < 0, Start - |Step| * Backedge <= Start
@@ -2167,11 +2173,12 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
Value *TripCountVal = expandCodeFor(ExitCount, CountTy, Loc);
IntegerType *Ty =
- IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(AR->getType()));
+ IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
+ Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
Value *StepValue = expandCodeFor(Step, Ty, Loc);
Value *NegStepValue = expandCodeFor(SE.getNegativeSCEV(Step), Ty, Loc);
- Value *StartValue = expandCodeFor(Start, Ty, Loc);
+ Value *StartValue = expandCodeFor(Start, ARExpandTy, Loc);
ConstantInt *Zero =
ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
@@ -2194,8 +2201,18 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
// Compute:
// Start + |Step| * Backedge < Start
// Start - |Step| * Backedge > Start
- Value *Add = Builder.CreateAdd(StartValue, MulV);
- Value *Sub = Builder.CreateSub(StartValue, MulV);
+ Value *Add = nullptr, *Sub = nullptr;
+ if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) {
+ const SCEV *MulS = SE.getSCEV(MulV);
+ const SCEV *NegMulS = SE.getNegativeSCEV(MulS);
+ Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue),
+ ARPtrTy);
+ Sub = Builder.CreateBitCast(
+ expandAddToGEP(NegMulS, ARPtrTy, Ty, StartValue), ARPtrTy);
+ } else {
+ Add = Builder.CreateAdd(StartValue, MulV);
+ Sub = Builder.CreateSub(StartValue, MulV);
+ }
Value *EndCompareGT = Builder.CreateICmp(
Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
@@ -2209,7 +2226,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
// If the backedge taken count type is larger than the AR type,
// check that we don't drop any bits by truncating it. If we are
- // droping bits, then we have overflow (unless the step is zero).
+ // dropping bits, then we have overflow (unless the step is zero).
if (SE.getTypeSizeInBits(CountTy) > SE.getTypeSizeInBits(Ty)) {
auto MaxVal = APInt::getMaxValue(DstBits).zext(SrcBits);
auto *BackedgeCheck =
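
For illustration, here is a minimal scalar sketch of the wrap predicate that generateOverflowCheck builds in the hunks above, using plain 64-bit integers instead of SCEV expressions and IR values. The function name is made up, and the multiply-overflow and trip-count-truncation guards of the real code are omitted; the non-integral-pointer branch added above computes the same Start +/- |Step|*Backedge end values, only through expandAddToGEP rather than integer add/sub.

#include <cstdint>

// Sketch only: the real code additionally guards the |Step| * Backedge
// multiply with an overflow intrinsic and checks the trip-count truncation.
bool addRecMayWrap(uint64_t Start, int64_t Step, uint64_t Backedge,
                   bool Signed) {
  uint64_t AbsStep = Step < 0 ? 0 - (uint64_t)Step : (uint64_t)Step;
  uint64_t Mul = AbsStep * Backedge;   // |Step| * Backedge, modulo 2^64
  uint64_t Add = Start + Mul;          // candidate end value for Step >= 0
  uint64_t Sub = Start - Mul;          // candidate end value for Step < 0
  bool EndGT = Signed ? (int64_t)Sub > (int64_t)Start : Sub > Start;
  bool EndLT = Signed ? (int64_t)Add < (int64_t)Start : Add < Start;
  // No wrap requires: Step < 0  =>  Start - |Step|*Backedge <= Start
  //                   Step >= 0 =>  Start + |Step|*Backedge >= Start
  return Step < 0 ? EndGT : EndLT;
}
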
diff --git a/contrib/llvm/lib/Analysis/StratifiedSets.h b/contrib/llvm/lib/Analysis/StratifiedSets.h
index 772df175b384..2f20cd12506c 100644
--- a/contrib/llvm/lib/Analysis/StratifiedSets.h
+++ b/contrib/llvm/lib/Analysis/StratifiedSets.h
@@ -29,7 +29,7 @@ typedef unsigned StratifiedIndex;
/// NOTE: ^ This can't be a short -- bootstrapping clang has a case where
/// ~1M sets exist.
-// \brief Container of information related to a value in a StratifiedSet.
+// Container of information related to a value in a StratifiedSet.
struct StratifiedInfo {
StratifiedIndex Index;
/// For field sensitivity, etc. we can tack fields on here.
@@ -37,7 +37,7 @@ struct StratifiedInfo {
/// A "link" between two StratifiedSets.
struct StratifiedLink {
- /// \brief This is a value used to signify "does not exist" where the
+ /// This is a value used to signify "does not exist" where the
/// StratifiedIndex type is used.
///
/// This is used instead of Optional<StratifiedIndex> because
@@ -63,7 +63,7 @@ struct StratifiedLink {
void clearAbove() { Above = SetSentinel; }
};
-/// \brief These are stratified sets, as described in "Fast algorithms for
+/// These are stratified sets, as described in "Fast algorithms for
/// Dyck-CFL-reachability with applications to Alias Analysis" by Zhang Q, Lyu M
/// R, Yuan H, and Su Z. -- in short, this is meant to represent different sets
/// of Value*s. If two Value*s are in the same set, or if both sets have
@@ -172,7 +172,7 @@ private:
/// remap has occurred, and use this information so we can defer renumbering set
/// elements until build time.
template <typename T> class StratifiedSetsBuilder {
- /// \brief Represents a Stratified Set, with information about the Stratified
+ /// Represents a Stratified Set, with information about the Stratified
/// Set above it, the set below it, and whether the current set has been
/// remapped to another.
struct BuilderLink {
@@ -263,7 +263,7 @@ template <typename T> class StratifiedSetsBuilder {
StratifiedIndex Remap;
};
- /// \brief This function performs all of the set unioning/value renumbering
+ /// This function performs all of the set unioning/value renumbering
/// that we've been putting off, and generates a vector<StratifiedLink> that
/// may be placed in a StratifiedSets instance.
void finalizeSets(std::vector<StratifiedLink> &StratLinks) {
@@ -302,7 +302,7 @@ template <typename T> class StratifiedSetsBuilder {
}
}
- /// \brief There's a guarantee in StratifiedLink where all bits set in a
+ /// There's a guarantee in StratifiedLink where all bits set in a
/// Link.externals will be set in all Link.externals "below" it.
static void propagateAttrs(std::vector<StratifiedLink> &Links) {
const auto getHighestParentAbove = [&Links](StratifiedIndex Idx) {
@@ -351,7 +351,7 @@ public:
return addAtMerging(Main, NewIndex);
}
- /// \brief Restructures the stratified sets as necessary to make "ToAdd" in a
+ /// Restructures the stratified sets as necessary to make "ToAdd" in a
/// set above "Main". There are some cases where this is not possible (see
/// above), so we merge them such that ToAdd and Main are in the same set.
bool addAbove(const T &Main, const T &ToAdd) {
@@ -364,7 +364,7 @@ public:
return addAtMerging(ToAdd, Above);
}
- /// \brief Restructures the stratified sets as necessary to make "ToAdd" in a
+ /// Restructures the stratified sets as necessary to make "ToAdd" in a
/// set below "Main". There are some cases where this is not possible (see
/// above), so we merge them such that ToAdd and Main are in the same set.
bool addBelow(const T &Main, const T &ToAdd) {
@@ -437,7 +437,7 @@ private:
return *Current;
}
- /// \brief Merges two sets into one another. Assumes that these sets are not
+ /// Merges two sets into one another. Assumes that these sets are not
/// already one in the same.
void merge(StratifiedIndex Idx1, StratifiedIndex Idx2) {
assert(inbounds(Idx1) && inbounds(Idx2));
@@ -458,7 +458,7 @@ private:
mergeDirect(Idx1, Idx2);
}
- /// \brief Merges two sets assuming that the set at `Idx1` is unreachable from
+ /// Merges two sets assuming that the set at `Idx1` is unreachable from
/// traversing above or below the set at `Idx2`.
void mergeDirect(StratifiedIndex Idx1, StratifiedIndex Idx2) {
assert(inbounds(Idx1) && inbounds(Idx2));
diff --git a/contrib/llvm/lib/Analysis/SyntheticCountsUtils.cpp b/contrib/llvm/lib/Analysis/SyntheticCountsUtils.cpp
new file mode 100644
index 000000000000..b085fa274d7f
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/SyntheticCountsUtils.cpp
@@ -0,0 +1,113 @@
+//===--- SyntheticCountsUtils.cpp - synthetic counts propagation utils ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utilities for propagating synthetic counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/SyntheticCountsUtils.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+// Given an SCC, propagate entry counts along the edges of the SCC nodes.

+template <typename CallGraphType>
+void SyntheticCountsUtils<CallGraphType>::propagateFromSCC(
+ const SccTy &SCC, GetRelBBFreqTy GetRelBBFreq, GetCountTy GetCount,
+ AddCountTy AddCount) {
+
+ SmallPtrSet<NodeRef, 8> SCCNodes;
+ SmallVector<std::pair<NodeRef, EdgeRef>, 8> SCCEdges, NonSCCEdges;
+
+ for (auto &Node : SCC)
+ SCCNodes.insert(Node);
+
+ // Partition the edges coming out of the SCC into those whose destination is
+ // in the SCC and the rest.
+ for (const auto &Node : SCCNodes) {
+ for (auto &E : children_edges<CallGraphType>(Node)) {
+ if (SCCNodes.count(CGT::edge_dest(E)))
+ SCCEdges.emplace_back(Node, E);
+ else
+ NonSCCEdges.emplace_back(Node, E);
+ }
+ }
+
+ // For nodes in the same SCC, update the counts in two steps:
+ // 1. Compute the additional count for each node by propagating the counts
+ // along all incoming edges to the node that originate from within the same
+ // SCC and summing them up.
+ // 2. Add the additional counts to the nodes in the SCC.
+ // This ensures that the order of
+ // traversal of nodes within the SCC doesn't affect the final result.
+
+ DenseMap<NodeRef, uint64_t> AdditionalCounts;
+ for (auto &E : SCCEdges) {
+ auto OptRelFreq = GetRelBBFreq(E.second);
+ if (!OptRelFreq)
+ continue;
+ Scaled64 RelFreq = OptRelFreq.getValue();
+ auto Caller = E.first;
+ auto Callee = CGT::edge_dest(E.second);
+ RelFreq *= Scaled64(GetCount(Caller), 0);
+ uint64_t AdditionalCount = RelFreq.toInt<uint64_t>();
+ AdditionalCounts[Callee] += AdditionalCount;
+ }
+
+ // Update the counts for the nodes in the SCC.
+ for (auto &Entry : AdditionalCounts)
+ AddCount(Entry.first, Entry.second);
+
+ // Now update the counts for nodes outside the SCC.
+ for (auto &E : NonSCCEdges) {
+ auto OptRelFreq = GetRelBBFreq(E.second);
+ if (!OptRelFreq)
+ continue;
+ Scaled64 RelFreq = OptRelFreq.getValue();
+ auto Caller = E.first;
+ auto Callee = CGT::edge_dest(E.second);
+ RelFreq *= Scaled64(GetCount(Caller), 0);
+ AddCount(Callee, RelFreq.toInt<uint64_t>());
+ }
+}
+
+/// Propagate synthetic entry counts on a callgraph \p CG.
+///
+/// This performs a reverse post-order traversal of the callgraph SCC. For each
+/// SCC, it first propagates the entry counts to the nodes within the SCC
+/// through call edges and updates them in one shot. Then the entry counts are
+/// propagated to nodes outside the SCC. This requires \p GraphTraits
+/// to have a specialization for \p CallGraphType.
+
+template <typename CallGraphType>
+void SyntheticCountsUtils<CallGraphType>::propagate(const CallGraphType &CG,
+ GetRelBBFreqTy GetRelBBFreq,
+ GetCountTy GetCount,
+ AddCountTy AddCount) {
+ std::vector<SccTy> SCCs;
+
+ // Collect all the SCCs.
+ for (auto I = scc_begin(CG); !I.isAtEnd(); ++I)
+ SCCs.push_back(*I);
+
+ // The callgraph-scc needs to be visited in top-down order for propagation.
+ // The scc iterator returns the scc in bottom-up order, so reverse the SCCs
+ // and call propagateFromSCC.
+ for (auto &SCC : reverse(SCCs))
+ propagateFromSCC(SCC, GetRelBBFreq, GetCount, AddCount);
+}
+
+template class llvm::SyntheticCountsUtils<const CallGraph *>;
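
The propagation above can be pictured with a toy model: along each call edge, the callee receives (relative block frequency of the call site) times (the caller's current count), with in-SCC contributions buffered and applied in one shot so the visit order inside the SCC cannot change the result. The sketch below is an illustration only; it uses plain doubles and an ad-hoc Edge struct in place of ScaledNumber and the CallGraph traits.

#include <map>
#include <string>
#include <vector>

struct Edge { std::string Caller, Callee; double RelFreq; bool InSCC; };

void propagateOnce(std::map<std::string, double> &Count,
                   const std::vector<Edge> &Edges) {
  // Step 1: buffer in-SCC contributions without touching Count, so the visit
  // order of nodes inside the SCC does not affect the result.
  std::map<std::string, double> Additional;
  for (const Edge &E : Edges)
    if (E.InSCC)
      Additional[E.Callee] += E.RelFreq * Count[E.Caller];

  // Step 2: apply the buffered in-SCC counts in one shot.
  for (auto &KV : Additional)
    Count[KV.first] += KV.second;

  // Edges leaving the SCC are then propagated directly, using the freshly
  // updated caller counts.
  for (const Edge &E : Edges)
    if (!E.InSCC)
      Count[E.Callee] += E.RelFreq * Count[E.Caller];
}
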
diff --git a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
index d18246ac5941..102135fbf313 100644
--- a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -62,6 +62,18 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
}) &&
"TargetLibraryInfoImpl function names must be sorted");
+ // Set IO unlocked variants as unavailable
+ // Set them as available per system below
+ TLI.setUnavailable(LibFunc_getchar_unlocked);
+ TLI.setUnavailable(LibFunc_putc_unlocked);
+ TLI.setUnavailable(LibFunc_putchar_unlocked);
+ TLI.setUnavailable(LibFunc_fputc_unlocked);
+ TLI.setUnavailable(LibFunc_fgetc_unlocked);
+ TLI.setUnavailable(LibFunc_fread_unlocked);
+ TLI.setUnavailable(LibFunc_fwrite_unlocked);
+ TLI.setUnavailable(LibFunc_fputs_unlocked);
+ TLI.setUnavailable(LibFunc_fgets_unlocked);
+
bool ShouldExtI32Param = false, ShouldExtI32Return = false,
ShouldSignExtI32Param = false;
// PowerPC64, Sparc64, SystemZ need signext/zeroext on i32 parameters and
@@ -73,8 +85,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
}
// Mips, on the other hand, needs signext on i32 parameters corresponding
// to both signed and unsigned ints.
- if (T.getArch() == Triple::mips || T.getArch() == Triple::mipsel ||
- T.getArch() == Triple::mips64 || T.getArch() == Triple::mips64el) {
+ if (T.isMIPS()) {
ShouldSignExtI32Param = true;
}
TLI.setShouldExtI32Param(ShouldExtI32Param);
@@ -107,6 +118,12 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later.
// All versions of watchOS support it.
if (T.isMacOSX()) {
+ // available IO unlocked variants on Mac OS X
+ TLI.setAvailable(LibFunc_getc_unlocked);
+ TLI.setAvailable(LibFunc_getchar_unlocked);
+ TLI.setAvailable(LibFunc_putc_unlocked);
+ TLI.setAvailable(LibFunc_putchar_unlocked);
+
if (T.isMacOSXVersionLT(10, 5))
TLI.setUnavailable(LibFunc_memset_pattern16);
} else if (T.isiOS()) {
@@ -245,51 +262,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_tanhf);
}
- // These definitions are due to math-finite.h header on Linux
- TLI.setUnavailable(LibFunc_acos_finite);
- TLI.setUnavailable(LibFunc_acosf_finite);
- TLI.setUnavailable(LibFunc_acosl_finite);
- TLI.setUnavailable(LibFunc_acosh_finite);
- TLI.setUnavailable(LibFunc_acoshf_finite);
- TLI.setUnavailable(LibFunc_acoshl_finite);
- TLI.setUnavailable(LibFunc_asin_finite);
- TLI.setUnavailable(LibFunc_asinf_finite);
- TLI.setUnavailable(LibFunc_asinl_finite);
- TLI.setUnavailable(LibFunc_atan2_finite);
- TLI.setUnavailable(LibFunc_atan2f_finite);
- TLI.setUnavailable(LibFunc_atan2l_finite);
- TLI.setUnavailable(LibFunc_atanh_finite);
- TLI.setUnavailable(LibFunc_atanhf_finite);
- TLI.setUnavailable(LibFunc_atanhl_finite);
- TLI.setUnavailable(LibFunc_cosh_finite);
- TLI.setUnavailable(LibFunc_coshf_finite);
- TLI.setUnavailable(LibFunc_coshl_finite);
- TLI.setUnavailable(LibFunc_exp10_finite);
- TLI.setUnavailable(LibFunc_exp10f_finite);
- TLI.setUnavailable(LibFunc_exp10l_finite);
- TLI.setUnavailable(LibFunc_exp2_finite);
- TLI.setUnavailable(LibFunc_exp2f_finite);
- TLI.setUnavailable(LibFunc_exp2l_finite);
- TLI.setUnavailable(LibFunc_exp_finite);
- TLI.setUnavailable(LibFunc_expf_finite);
- TLI.setUnavailable(LibFunc_expl_finite);
- TLI.setUnavailable(LibFunc_log10_finite);
- TLI.setUnavailable(LibFunc_log10f_finite);
- TLI.setUnavailable(LibFunc_log10l_finite);
- TLI.setUnavailable(LibFunc_log2_finite);
- TLI.setUnavailable(LibFunc_log2f_finite);
- TLI.setUnavailable(LibFunc_log2l_finite);
- TLI.setUnavailable(LibFunc_log_finite);
- TLI.setUnavailable(LibFunc_logf_finite);
- TLI.setUnavailable(LibFunc_logl_finite);
- TLI.setUnavailable(LibFunc_pow_finite);
- TLI.setUnavailable(LibFunc_powf_finite);
- TLI.setUnavailable(LibFunc_powl_finite);
- TLI.setUnavailable(LibFunc_sinh_finite);
- TLI.setUnavailable(LibFunc_sinhf_finite);
- TLI.setUnavailable(LibFunc_sinhl_finite);
-
- // Win32 does *not* provide provide these functions, but they are
+ // Win32 does *not* provide these functions, but they are
// generally available on POSIX-compliant systems:
TLI.setUnavailable(LibFunc_access);
TLI.setUnavailable(LibFunc_bcmp);
@@ -309,7 +282,6 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_ftello);
TLI.setUnavailable(LibFunc_ftrylockfile);
TLI.setUnavailable(LibFunc_funlockfile);
- TLI.setUnavailable(LibFunc_getc_unlocked);
TLI.setUnavailable(LibFunc_getitimer);
TLI.setUnavailable(LibFunc_getlogin_r);
TLI.setUnavailable(LibFunc_getpwnam);
@@ -441,15 +413,18 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_flsll);
}
- // The following functions are available on at least Linux:
- if (!T.isOSLinux()) {
+ // The following functions are available on Linux,
+ // but Android uses bionic instead of glibc.
+ if (!T.isOSLinux() || T.isAndroid()) {
TLI.setUnavailable(LibFunc_dunder_strdup);
TLI.setUnavailable(LibFunc_dunder_strtok_r);
TLI.setUnavailable(LibFunc_dunder_isoc99_scanf);
TLI.setUnavailable(LibFunc_dunder_isoc99_sscanf);
TLI.setUnavailable(LibFunc_under_IO_getc);
TLI.setUnavailable(LibFunc_under_IO_putc);
- TLI.setUnavailable(LibFunc_memalign);
+ // But, Android has memalign.
+ if (!T.isAndroid())
+ TLI.setUnavailable(LibFunc_memalign);
TLI.setUnavailable(LibFunc_fopen64);
TLI.setUnavailable(LibFunc_fseeko64);
TLI.setUnavailable(LibFunc_fstat64);
@@ -460,6 +435,65 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_stat64);
TLI.setUnavailable(LibFunc_statvfs64);
TLI.setUnavailable(LibFunc_tmpfile64);
+
+ // Relaxed math functions are included in math-finite.h on Linux (GLIBC).
+ TLI.setUnavailable(LibFunc_acos_finite);
+ TLI.setUnavailable(LibFunc_acosf_finite);
+ TLI.setUnavailable(LibFunc_acosl_finite);
+ TLI.setUnavailable(LibFunc_acosh_finite);
+ TLI.setUnavailable(LibFunc_acoshf_finite);
+ TLI.setUnavailable(LibFunc_acoshl_finite);
+ TLI.setUnavailable(LibFunc_asin_finite);
+ TLI.setUnavailable(LibFunc_asinf_finite);
+ TLI.setUnavailable(LibFunc_asinl_finite);
+ TLI.setUnavailable(LibFunc_atan2_finite);
+ TLI.setUnavailable(LibFunc_atan2f_finite);
+ TLI.setUnavailable(LibFunc_atan2l_finite);
+ TLI.setUnavailable(LibFunc_atanh_finite);
+ TLI.setUnavailable(LibFunc_atanhf_finite);
+ TLI.setUnavailable(LibFunc_atanhl_finite);
+ TLI.setUnavailable(LibFunc_cosh_finite);
+ TLI.setUnavailable(LibFunc_coshf_finite);
+ TLI.setUnavailable(LibFunc_coshl_finite);
+ TLI.setUnavailable(LibFunc_exp10_finite);
+ TLI.setUnavailable(LibFunc_exp10f_finite);
+ TLI.setUnavailable(LibFunc_exp10l_finite);
+ TLI.setUnavailable(LibFunc_exp2_finite);
+ TLI.setUnavailable(LibFunc_exp2f_finite);
+ TLI.setUnavailable(LibFunc_exp2l_finite);
+ TLI.setUnavailable(LibFunc_exp_finite);
+ TLI.setUnavailable(LibFunc_expf_finite);
+ TLI.setUnavailable(LibFunc_expl_finite);
+ TLI.setUnavailable(LibFunc_log10_finite);
+ TLI.setUnavailable(LibFunc_log10f_finite);
+ TLI.setUnavailable(LibFunc_log10l_finite);
+ TLI.setUnavailable(LibFunc_log2_finite);
+ TLI.setUnavailable(LibFunc_log2f_finite);
+ TLI.setUnavailable(LibFunc_log2l_finite);
+ TLI.setUnavailable(LibFunc_log_finite);
+ TLI.setUnavailable(LibFunc_logf_finite);
+ TLI.setUnavailable(LibFunc_logl_finite);
+ TLI.setUnavailable(LibFunc_pow_finite);
+ TLI.setUnavailable(LibFunc_powf_finite);
+ TLI.setUnavailable(LibFunc_powl_finite);
+ TLI.setUnavailable(LibFunc_sinh_finite);
+ TLI.setUnavailable(LibFunc_sinhf_finite);
+ TLI.setUnavailable(LibFunc_sinhl_finite);
+ }
+
+ if ((T.isOSLinux() && T.isGNUEnvironment()) ||
+ (T.isAndroid() && !T.isAndroidVersionLT(28))) {
+ // available IO unlocked variants on GNU/Linux and Android P or later
+ TLI.setAvailable(LibFunc_getc_unlocked);
+ TLI.setAvailable(LibFunc_getchar_unlocked);
+ TLI.setAvailable(LibFunc_putc_unlocked);
+ TLI.setAvailable(LibFunc_putchar_unlocked);
+ TLI.setAvailable(LibFunc_fputc_unlocked);
+ TLI.setAvailable(LibFunc_fgetc_unlocked);
+ TLI.setAvailable(LibFunc_fread_unlocked);
+ TLI.setAvailable(LibFunc_fwrite_unlocked);
+ TLI.setAvailable(LibFunc_fputs_unlocked);
+ TLI.setAvailable(LibFunc_fgets_unlocked);
}
// As currently implemented in clang, NVPTX code has no standard library to
@@ -689,10 +723,12 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_siprintf:
case LibFunc_sprintf:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getParamType(1)->isPointerTy());
+ FTy.getParamType(1)->isPointerTy() &&
+ FTy.getReturnType()->isIntegerTy(32));
case LibFunc_snprintf:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getParamType(2)->isPointerTy());
+ FTy.getParamType(2)->isPointerTy() &&
+ FTy.getReturnType()->isIntegerTy(32));
case LibFunc_setitimer:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
@@ -802,6 +838,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_feof:
case LibFunc_fflush:
case LibFunc_fgetc:
+ case LibFunc_fgetc_unlocked:
case LibFunc_fileno:
case LibFunc_flockfile:
case LibFunc_free:
@@ -830,6 +867,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
case LibFunc_fputc:
+ case LibFunc_fputc_unlocked:
case LibFunc_fstat:
case LibFunc_frexp:
case LibFunc_frexpf:
@@ -837,18 +875,22 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_fstatvfs:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
case LibFunc_fgets:
+ case LibFunc_fgets_unlocked:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
case LibFunc_fread:
+ case LibFunc_fread_unlocked:
return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(3)->isPointerTy());
case LibFunc_fwrite:
+ case LibFunc_fwrite_unlocked:
return (NumParams == 4 && FTy.getReturnType()->isIntegerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy() &&
FTy.getParamType(2)->isIntegerTy() &&
FTy.getParamType(3)->isPointerTy());
case LibFunc_fputs:
+ case LibFunc_fputs_unlocked:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
case LibFunc_fscanf:
@@ -861,6 +903,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
case LibFunc_getchar:
+ case LibFunc_getchar_unlocked:
return (NumParams == 0 && FTy.getReturnType()->isIntegerTy());
case LibFunc_gets:
return (NumParams == 1 && FTy.getParamType(0) == PCharTy);
@@ -873,6 +916,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
case LibFunc_putc:
+ case LibFunc_putc_unlocked:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
case LibFunc_pread:
case LibFunc_pwrite:
@@ -989,8 +1033,26 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_msvc_new_array_int_nothrow:
// new[](unsigned long long, nothrow);
case LibFunc_msvc_new_array_longlong_nothrow:
+ // new(unsigned int, align_val_t)
+ case LibFunc_ZnwjSt11align_val_t:
+ // new(unsigned long, align_val_t)
+ case LibFunc_ZnwmSt11align_val_t:
+ // new[](unsigned int, align_val_t)
+ case LibFunc_ZnajSt11align_val_t:
+ // new[](unsigned long, align_val_t)
+ case LibFunc_ZnamSt11align_val_t:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
+ // new(unsigned int, align_val_t, nothrow)
+ case LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t:
+ // new(unsigned long, align_val_t, nothrow)
+ case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
+ // new[](unsigned int, align_val_t, nothrow)
+ case LibFunc_ZnajSt11align_val_tRKSt9nothrow_t:
+ // new[](unsigned long, align_val_t, nothrow)
+ case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
+ return (NumParams == 3 && FTy.getReturnType()->isPointerTy());
+
// void operator delete[](void*);
case LibFunc_ZdaPv:
// void operator delete(void*);
@@ -1017,6 +1079,10 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_ZdlPvj:
// void operator delete(void*, unsigned long);
case LibFunc_ZdlPvm:
+ // void operator delete(void*, align_val_t)
+ case LibFunc_ZdlPvSt11align_val_t:
+ // void operator delete[](void*, align_val_t)
+ case LibFunc_ZdaPvSt11align_val_t:
// void operator delete[](void*, unsigned int);
case LibFunc_msvc_delete_array_ptr32_int:
// void operator delete[](void*, nothrow);
@@ -1035,6 +1101,12 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_msvc_delete_ptr64_nothrow:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
+ // void operator delete(void*, align_val_t, nothrow)
+ case LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t:
+ // void operator delete[](void*, align_val_t, nothrow)
+ case LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy());
+
case LibFunc_memset_pattern16:
return (!FTy.isVarArg() && NumParams == 3 &&
FTy.getParamType(0)->isPointerTy() &&
@@ -1231,6 +1303,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_isascii:
case LibFunc_toascii:
case LibFunc_putchar:
+ case LibFunc_putchar_unlocked:
return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getReturnType() == FTy.getParamType(0));
@@ -1326,10 +1399,10 @@ static bool compareWithVectorFnName(const VecDesc &LHS, StringRef S) {
void TargetLibraryInfoImpl::addVectorizableFunctions(ArrayRef<VecDesc> Fns) {
VectorDescs.insert(VectorDescs.end(), Fns.begin(), Fns.end());
- std::sort(VectorDescs.begin(), VectorDescs.end(), compareByScalarFnName);
+ llvm::sort(VectorDescs.begin(), VectorDescs.end(), compareByScalarFnName);
ScalarDescs.insert(ScalarDescs.end(), Fns.begin(), Fns.end());
- std::sort(ScalarDescs.begin(), ScalarDescs.end(), compareByVectorFnName);
+ llvm::sort(ScalarDescs.begin(), ScalarDescs.end(), compareByVectorFnName);
}
void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
@@ -1387,6 +1460,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"sinf", "__svml_sinf8", 8},
{"sinf", "__svml_sinf16", 16},
+ {"llvm.sin.f64", "__svml_sin2", 2},
+ {"llvm.sin.f64", "__svml_sin4", 4},
+ {"llvm.sin.f64", "__svml_sin8", 8},
+
+ {"llvm.sin.f32", "__svml_sinf4", 4},
+ {"llvm.sin.f32", "__svml_sinf8", 8},
+ {"llvm.sin.f32", "__svml_sinf16", 16},
+
{"cos", "__svml_cos2", 2},
{"cos", "__svml_cos4", 4},
{"cos", "__svml_cos8", 8},
@@ -1395,6 +1476,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"cosf", "__svml_cosf8", 8},
{"cosf", "__svml_cosf16", 16},
+ {"llvm.cos.f64", "__svml_cos2", 2},
+ {"llvm.cos.f64", "__svml_cos4", 4},
+ {"llvm.cos.f64", "__svml_cos8", 8},
+
+ {"llvm.cos.f32", "__svml_cosf4", 4},
+ {"llvm.cos.f32", "__svml_cosf8", 8},
+ {"llvm.cos.f32", "__svml_cosf16", 16},
+
{"pow", "__svml_pow2", 2},
{"pow", "__svml_pow4", 4},
{"pow", "__svml_pow8", 8},
diff --git a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
index b744cae51ed7..9de2f789c89c 100644
--- a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -31,7 +31,7 @@ static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
cl::desc("Recognize reduction patterns."));
namespace {
-/// \brief No-op implementation of the TTI interface using the utility base
+/// No-op implementation of the TTI interface using the utility base
/// classes.
///
/// This is used when no target specific information is available.
@@ -155,6 +155,14 @@ bool TargetTransformInfo::isLSRCostLess(LSRCost &C1, LSRCost &C2) const {
return TTIImpl->isLSRCostLess(C1, C2);
}
+bool TargetTransformInfo::canMacroFuseCmp() const {
+ return TTIImpl->canMacroFuseCmp();
+}
+
+bool TargetTransformInfo::shouldFavorPostInc() const {
+ return TTIImpl->shouldFavorPostInc();
+}
+
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
return TTIImpl->isLegalMaskedStore(DataType);
}
@@ -207,6 +215,8 @@ bool TargetTransformInfo::isProfitableToHoist(Instruction *I) const {
return TTIImpl->isProfitableToHoist(I);
}
+bool TargetTransformInfo::useAA() const { return TTIImpl->useAA(); }
+
bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
return TTIImpl->isTypeLegal(Ty);
}
@@ -226,6 +236,10 @@ bool TargetTransformInfo::shouldBuildLookupTablesForConstant(Constant *C) const
return TTIImpl->shouldBuildLookupTablesForConstant(C);
}
+bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
+ return TTIImpl->useColdCCForColdCall(F);
+}
+
unsigned TargetTransformInfo::
getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
@@ -326,6 +340,14 @@ unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
return TTIImpl->getMinVectorRegisterBitWidth();
}
+bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
+ return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
+}
+
+unsigned TargetTransformInfo::getMinimumVF(unsigned ElemWidth) const {
+ return TTIImpl->getMinimumVF(ElemWidth);
+}
+
bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
return TTIImpl->shouldConsiderAddressTypePromotion(
@@ -547,6 +569,16 @@ bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
return TTIImpl->areInlineCompatible(Caller, Callee);
}
+bool TargetTransformInfo::isIndexedLoadLegal(MemIndexedMode Mode,
+ Type *Ty) const {
+ return TTIImpl->isIndexedLoadLegal(Mode, Ty);
+}
+
+bool TargetTransformInfo::isIndexedStoreLegal(MemIndexedMode Mode,
+ Type *Ty) const {
+ return TTIImpl->isIndexedStoreLegal(Mode, Ty);
+}
+
unsigned TargetTransformInfo::getLoadStoreVecRegBitWidth(unsigned AS) const {
return TTIImpl->getLoadStoreVecRegBitWidth(AS);
}
@@ -598,73 +630,43 @@ int TargetTransformInfo::getInstructionLatency(const Instruction *I) const {
return TTIImpl->getInstructionLatency(I);
}
-static bool isReverseVectorMask(ArrayRef<int> Mask) {
- for (unsigned i = 0, MaskSize = Mask.size(); i < MaskSize; ++i)
- if (Mask[i] >= 0 && Mask[i] != (int)(MaskSize - 1 - i))
- return false;
- return true;
-}
-
-static bool isSingleSourceVectorMask(ArrayRef<int> Mask) {
- bool Vec0 = false;
- bool Vec1 = false;
- for (unsigned i = 0, NumVecElts = Mask.size(); i < NumVecElts; ++i) {
- if (Mask[i] >= 0) {
- if ((unsigned)Mask[i] >= NumVecElts)
- Vec1 = true;
- else
- Vec0 = true;
- }
- }
- return !(Vec0 && Vec1);
-}
-
-static bool isZeroEltBroadcastVectorMask(ArrayRef<int> Mask) {
- for (unsigned i = 0; i < Mask.size(); ++i)
- if (Mask[i] > 0)
- return false;
- return true;
-}
-
-static bool isAlternateVectorMask(ArrayRef<int> Mask) {
- bool isAlternate = true;
- unsigned MaskSize = Mask.size();
-
- // Example: shufflevector A, B, <0,5,2,7>
- for (unsigned i = 0; i < MaskSize && isAlternate; ++i) {
- if (Mask[i] < 0)
- continue;
- isAlternate = Mask[i] == (int)((i & 1) ? MaskSize + i : i);
- }
-
- if (isAlternate)
- return true;
+static TargetTransformInfo::OperandValueKind
+getOperandInfo(Value *V, TargetTransformInfo::OperandValueProperties &OpProps) {
+ TargetTransformInfo::OperandValueKind OpInfo =
+ TargetTransformInfo::OK_AnyValue;
+ OpProps = TargetTransformInfo::OP_None;
- isAlternate = true;
- // Example: shufflevector A, B, <4,1,6,3>
- for (unsigned i = 0; i < MaskSize && isAlternate; ++i) {
- if (Mask[i] < 0)
- continue;
- isAlternate = Mask[i] == (int)((i & 1) ? i : MaskSize + i);
+ if (auto *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getValue().isPowerOf2())
+ OpProps = TargetTransformInfo::OP_PowerOf2;
+ return TargetTransformInfo::OK_UniformConstantValue;
}
- return isAlternate;
-}
-
-static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
- TargetTransformInfo::OperandValueKind OpInfo =
- TargetTransformInfo::OK_AnyValue;
+ const Value *Splat = getSplatValue(V);
- // Check for a splat of a constant or for a non uniform vector of constants.
+ // Check for a splat of a constant or for a non uniform vector of constants
+ // and check if the constant(s) are all powers of two.
if (isa<ConstantVector>(V) || isa<ConstantDataVector>(V)) {
OpInfo = TargetTransformInfo::OK_NonUniformConstantValue;
- if (cast<Constant>(V)->getSplatValue() != nullptr)
+ if (Splat) {
OpInfo = TargetTransformInfo::OK_UniformConstantValue;
+ if (auto *CI = dyn_cast<ConstantInt>(Splat))
+ if (CI->getValue().isPowerOf2())
+ OpProps = TargetTransformInfo::OP_PowerOf2;
+ } else if (auto *CDS = dyn_cast<ConstantDataSequential>(V)) {
+ OpProps = TargetTransformInfo::OP_PowerOf2;
+ for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) {
+ if (auto *CI = dyn_cast<ConstantInt>(CDS->getElementAsConstant(I)))
+ if (CI->getValue().isPowerOf2())
+ continue;
+ OpProps = TargetTransformInfo::OP_None;
+ break;
+ }
+ }
}
// Check for a splat of a uniform value. This is not loop aware, so return
// true only for the obviously uniform cases (argument, globalvalue)
- const Value *Splat = getSplatValue(V);
if (Splat && (isa<Argument>(Splat) || isa<GlobalValue>(Splat)))
OpInfo = TargetTransformInfo::OK_UniformValue;
@@ -994,15 +996,13 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- TargetTransformInfo::OperandValueKind Op1VK =
- getOperandInfo(I->getOperand(0));
- TargetTransformInfo::OperandValueKind Op2VK =
- getOperandInfo(I->getOperand(1));
- SmallVector<const Value*, 2> Operands(I->operand_values());
- return getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK,
- Op2VK, TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None,
- Operands);
+ TargetTransformInfo::OperandValueKind Op1VK, Op2VK;
+ TargetTransformInfo::OperandValueProperties Op1VP, Op2VP;
+ Op1VK = getOperandInfo(I->getOperand(0), Op1VP);
+ Op2VK = getOperandInfo(I->getOperand(1), Op2VP);
+ SmallVector<const Value *, 2> Operands(I->operand_values());
+ return getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, Op2VK,
+ Op1VP, Op2VP, Operands);
}
case Instruction::Select: {
const SelectInst *SI = cast<SelectInst>(I);
@@ -1101,31 +1101,30 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
}
case Instruction::ShuffleVector: {
const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
- Type *VecTypOp0 = Shuffle->getOperand(0)->getType();
- unsigned NumVecElems = VecTypOp0->getVectorNumElements();
- SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+ // TODO: Identify and add costs for insert/extract subvector, etc.
+ if (Shuffle->changesLength())
+ return -1;
+
+ if (Shuffle->isIdentity())
+ return 0;
- if (NumVecElems == Mask.size()) {
- if (isReverseVectorMask(Mask))
- return getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0,
- 0, nullptr);
- if (isAlternateVectorMask(Mask))
- return getShuffleCost(TargetTransformInfo::SK_Alternate,
- VecTypOp0, 0, nullptr);
+ Type *Ty = Shuffle->getType();
+ if (Shuffle->isReverse())
+ return TTIImpl->getShuffleCost(SK_Reverse, Ty, 0, nullptr);
- if (isZeroEltBroadcastVectorMask(Mask))
- return getShuffleCost(TargetTransformInfo::SK_Broadcast,
- VecTypOp0, 0, nullptr);
+ if (Shuffle->isSelect())
+ return TTIImpl->getShuffleCost(SK_Select, Ty, 0, nullptr);
- if (isSingleSourceVectorMask(Mask))
- return getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- VecTypOp0, 0, nullptr);
+ if (Shuffle->isTranspose())
+ return TTIImpl->getShuffleCost(SK_Transpose, Ty, 0, nullptr);
- return getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
- VecTypOp0, 0, nullptr);
- }
+ if (Shuffle->isZeroEltSplat())
+ return TTIImpl->getShuffleCost(SK_Broadcast, Ty, 0, nullptr);
- return -1;
+ if (Shuffle->isSingleSource())
+ return TTIImpl->getShuffleCost(SK_PermuteSingleSrc, Ty, 0, nullptr);
+
+ return TTIImpl->getShuffleCost(SK_PermuteTwoSrc, Ty, 0, nullptr);
}
case Instruction::Call:
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
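
The shufflevector costing above now keys off ShuffleVectorInst's mask predicates instead of the removed ad-hoc mask scans. As a rough standalone approximation (transpose and length-changing masks are omitted, and identity is simplified to the first-operand case), the classification for a mask selecting from two NumElts-wide inputs looks like the sketch below; -1 denotes an undef lane, and the example masks in the comments assume NumElts == 4.

#include <string>
#include <vector>

std::string classifyShuffleMask(const std::vector<int> &Mask,
                                unsigned NumElts) {
  auto all = [&](auto Pred) {
    for (unsigned I = 0; I != Mask.size(); ++I)
      if (Mask[I] >= 0 && !Pred(I, (unsigned)Mask[I]))
        return false;
    return true;
  };
  if (all([&](unsigned I, unsigned M) { return M == I; }))
    return "identity (cost 0)";                          // <0,1,2,3>
  if (all([&](unsigned I, unsigned M) { return M == NumElts - 1 - I; }))
    return "SK_Reverse";                                 // <3,2,1,0>
  if (all([&](unsigned I, unsigned M) { return M == I || M == I + NumElts; }))
    return "SK_Select";                                  // <0,5,2,7>
  if (all([&](unsigned I, unsigned M) { return M == 0; }))
    return "SK_Broadcast";                               // <0,0,0,0>
  if (all([&](unsigned I, unsigned M) { return M < NumElts; }) ||
      all([&](unsigned I, unsigned M) { return M >= NumElts; }))
    return "SK_PermuteSingleSrc";                        // <2,0,3,1>
  return "SK_PermuteTwoSrc";                             // everything else
}
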
diff --git a/contrib/llvm/lib/Analysis/Trace.cpp b/contrib/llvm/lib/Analysis/Trace.cpp
index 34c998501a6c..4dec53151ed6 100644
--- a/contrib/llvm/lib/Analysis/Trace.cpp
+++ b/contrib/llvm/lib/Analysis/Trace.cpp
@@ -16,6 +16,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/Trace.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Compiler.h"
diff --git a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 173db399b9d6..25a154edf4ac 100644
--- a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -104,21 +104,6 @@
// If neither node is an ancestor of the other and they have the same root,
// then we say NoAlias.
//
-// TODO: The current metadata format doesn't support struct
-// fields. For example:
-// struct X {
-// double d;
-// int i;
-// };
-// void foo(struct X *x, struct X *y, double *p) {
-// *x = *y;
-// *p = 0.0;
-// }
-// Struct X has a double member, so the store to *x can alias the store to *p.
-// Currently it's not possible to precisely describe all the things struct X
-// aliases, so struct assignments must use conservative TBAA nodes. There's
-// no scheme for attaching metadata to @llvm.memcpy yet either.
-//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
@@ -146,6 +131,17 @@ static cl::opt<bool> EnableTBAA("enable-tbaa", cl::init(true), cl::Hidden);
namespace {
+/// isNewFormatTypeNode - Return true iff the given type node is in the new
+/// size-aware format.
+static bool isNewFormatTypeNode(const MDNode *N) {
+ if (N->getNumOperands() < 3)
+ return false;
+ // In the old format the first operand is a string.
+ if (!isa<MDNode>(N->getOperand(0)))
+ return false;
+ return true;
+}
+
/// This is a simple wrapper around an MDNode which provides a higher-level
/// interface by hiding the details of how alias analysis information is encoded
/// in its operands.
@@ -160,8 +156,15 @@ public:
/// getNode - Get the MDNode for this TBAANode.
MDNodeTy *getNode() const { return Node; }
+ /// isNewFormat - Return true iff the wrapped type node is in the new
+ /// size-aware format.
+ bool isNewFormat() const { return isNewFormatTypeNode(Node); }
+
/// getParent - Get this TBAANode's Alias tree parent.
TBAANodeImpl<MDNodeTy> getParent() const {
+ if (isNewFormat())
+ return TBAANodeImpl(cast<MDNodeTy>(Node->getOperand(0)));
+
if (Node->getNumOperands() < 2)
return TBAANodeImpl<MDNodeTy>();
MDNodeTy *P = dyn_cast_or_null<MDNodeTy>(Node->getOperand(1));
@@ -196,7 +199,7 @@ using MutableTBAANode = TBAANodeImpl<MDNode>;
/// information is encoded in its operands.
template<typename MDNodeTy>
class TBAAStructTagNodeImpl {
- /// This node should be created with createTBAAStructTagNode.
+ /// This node should be created with createTBAAAccessTag().
MDNodeTy *Node;
public:
@@ -205,6 +208,17 @@ public:
/// Get the MDNode for this TBAAStructTagNode.
MDNodeTy *getNode() const { return Node; }
+ /// isNewFormat - Return true iff the wrapped access tag is in the new
+ /// size-aware format.
+ bool isNewFormat() const {
+ if (Node->getNumOperands() < 4)
+ return false;
+ if (MDNodeTy *AccessType = getAccessType())
+ if (!TBAANodeImpl<MDNodeTy>(AccessType).isNewFormat())
+ return false;
+ return true;
+ }
+
MDNodeTy *getBaseType() const {
return dyn_cast_or_null<MDNode>(Node->getOperand(0));
}
@@ -217,13 +231,20 @@ public:
return mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
}
+ uint64_t getSize() const {
+ if (!isNewFormat())
+ return UINT64_MAX;
+ return mdconst::extract<ConstantInt>(Node->getOperand(3))->getZExtValue();
+ }
+
/// Test if this TBAAStructTagNode represents a type for objects
/// which are not modified (by any means) in the context where this
/// AliasAnalysis is relevant.
bool isTypeImmutable() const {
- if (Node->getNumOperands() < 4)
+ unsigned OpNo = isNewFormat() ? 4 : 3;
+ if (Node->getNumOperands() < OpNo + 1)
return false;
- ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(3));
+ ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(OpNo));
if (!CI)
return false;
return CI->getValue()[0];
@@ -241,7 +262,7 @@ using MutableTBAAStructTagNode = TBAAStructTagNodeImpl<MDNode>;
/// higher-level interface by hiding the details of how alias analysis
/// information is encoded in its operands.
class TBAAStructTypeNode {
- /// This node should be created with createTBAAStructTypeNode.
+ /// This node should be created with createTBAATypeNode().
const MDNode *Node = nullptr;
public:
@@ -251,43 +272,80 @@ public:
/// Get the MDNode for this TBAAStructTypeNode.
const MDNode *getNode() const { return Node; }
+ /// isNewFormat - Return true iff the wrapped type node is in the new
+ /// size-aware format.
+ bool isNewFormat() const { return isNewFormatTypeNode(Node); }
+
+ bool operator==(const TBAAStructTypeNode &Other) const {
+ return getNode() == Other.getNode();
+ }
+
+ /// getId - Return type identifier.
+ Metadata *getId() const {
+ return Node->getOperand(isNewFormat() ? 2 : 0);
+ }
+
+ unsigned getNumFields() const {
+ unsigned FirstFieldOpNo = isNewFormat() ? 3 : 1;
+ unsigned NumOpsPerField = isNewFormat() ? 3 : 2;
+ return (getNode()->getNumOperands() - FirstFieldOpNo) / NumOpsPerField;
+ }
+
+ TBAAStructTypeNode getFieldType(unsigned FieldIndex) const {
+ unsigned FirstFieldOpNo = isNewFormat() ? 3 : 1;
+ unsigned NumOpsPerField = isNewFormat() ? 3 : 2;
+ unsigned OpIndex = FirstFieldOpNo + FieldIndex * NumOpsPerField;
+ auto *TypeNode = cast<MDNode>(getNode()->getOperand(OpIndex));
+ return TBAAStructTypeNode(TypeNode);
+ }
+
/// Get this TBAAStructTypeNode's field in the type DAG with
/// given offset. Update the offset to be relative to the field type.
- TBAAStructTypeNode getParent(uint64_t &Offset) const {
- // Parent can be omitted for the root node.
- if (Node->getNumOperands() < 2)
- return TBAAStructTypeNode();
-
- // Fast path for a scalar type node and a struct type node with a single
- // field.
- if (Node->getNumOperands() <= 3) {
- uint64_t Cur = Node->getNumOperands() == 2
- ? 0
- : mdconst::extract<ConstantInt>(Node->getOperand(2))
- ->getZExtValue();
- Offset -= Cur;
- MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1));
- if (!P)
+ TBAAStructTypeNode getField(uint64_t &Offset) const {
+ bool NewFormat = isNewFormat();
+ if (NewFormat) {
+ // New-format root and scalar type nodes have no fields.
+ if (Node->getNumOperands() < 6)
+ return TBAAStructTypeNode();
+ } else {
+ // Parent can be omitted for the root node.
+ if (Node->getNumOperands() < 2)
return TBAAStructTypeNode();
- return TBAAStructTypeNode(P);
+
+ // Fast path for a scalar type node and a struct type node with a single
+ // field.
+ if (Node->getNumOperands() <= 3) {
+ uint64_t Cur = Node->getNumOperands() == 2
+ ? 0
+ : mdconst::extract<ConstantInt>(Node->getOperand(2))
+ ->getZExtValue();
+ Offset -= Cur;
+ MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1));
+ if (!P)
+ return TBAAStructTypeNode();
+ return TBAAStructTypeNode(P);
+ }
}
// Assume the offsets are in order. We return the previous field if
// the current offset is bigger than the given offset.
+ unsigned FirstFieldOpNo = NewFormat ? 3 : 1;
+ unsigned NumOpsPerField = NewFormat ? 3 : 2;
unsigned TheIdx = 0;
- for (unsigned Idx = 1; Idx < Node->getNumOperands(); Idx += 2) {
+ for (unsigned Idx = FirstFieldOpNo; Idx < Node->getNumOperands();
+ Idx += NumOpsPerField) {
uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(Idx + 1))
->getZExtValue();
if (Cur > Offset) {
- assert(Idx >= 3 &&
- "TBAAStructTypeNode::getParent should have an offset match!");
- TheIdx = Idx - 2;
+ assert(Idx >= FirstFieldOpNo + NumOpsPerField &&
+ "TBAAStructTypeNode::getField should have an offset match!");
+ TheIdx = Idx - NumOpsPerField;
break;
}
}
// Move along the last field.
if (TheIdx == 0)
- TheIdx = Node->getNumOperands() - 2;
+ TheIdx = Node->getNumOperands() - NumOpsPerField;
uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(TheIdx + 1))
->getZExtValue();
Offset -= Cur;
@@ -403,15 +461,11 @@ bool MDNode::isTBAAVtableAccess() const {
}
// For struct-path aware TBAA, we use the access type of the tag.
- if (getNumOperands() < 2)
- return false;
- MDNode *Tag = cast_or_null<MDNode>(getOperand(1));
- if (!Tag)
- return false;
- if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
- if (Tag1->getString() == "vtable pointer")
+ TBAAStructTagNode Tag(this);
+ TBAAStructTypeNode AccessType(Tag.getAccessType());
+ if(auto *Id = dyn_cast<MDString>(AccessType.getId()))
+ if (Id->getString() == "vtable pointer")
return true;
- }
return false;
}
@@ -485,26 +539,6 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
N.NoAlias = getMetadata(LLVMContext::MD_noalias);
}
-static bool findAccessType(TBAAStructTagNode BaseTag,
- const MDNode *AccessTypeNode,
- uint64_t &OffsetInBase) {
- // Start from the base type, follow the edge with the correct offset in
- // the type DAG and adjust the offset until we reach the access type or
- // until we reach a root node.
- TBAAStructTypeNode BaseType(BaseTag.getBaseType());
- OffsetInBase = BaseTag.getOffset();
-
- while (const MDNode *BaseTypeNode = BaseType.getNode()) {
- if (BaseTypeNode == AccessTypeNode)
- return true;
-
- // Follow the edge with the correct offset, Offset will be adjusted to
- // be relative to the field type.
- BaseType = BaseType.getParent(OffsetInBase);
- }
- return false;
-}
-
static const MDNode *createAccessTag(const MDNode *AccessType) {
// If there is no access type or the access type is the root node, then
// we don't have any useful access tag to return.
@@ -512,12 +546,111 @@ static const MDNode *createAccessTag(const MDNode *AccessType) {
return nullptr;
Type *Int64 = IntegerType::get(AccessType->getContext(), 64);
- auto *ImmutabilityFlag = ConstantAsMetadata::get(ConstantInt::get(Int64, 0));
+ auto *OffsetNode = ConstantAsMetadata::get(ConstantInt::get(Int64, 0));
+
+ if (TBAAStructTypeNode(AccessType).isNewFormat()) {
+ // TODO: Take access ranges into account when matching access tags and
+ // fix this code to generate actual access sizes for generic tags.
+ uint64_t AccessSize = UINT64_MAX;
+ auto *SizeNode =
+ ConstantAsMetadata::get(ConstantInt::get(Int64, AccessSize));
+ Metadata *Ops[] = {const_cast<MDNode*>(AccessType),
+ const_cast<MDNode*>(AccessType),
+ OffsetNode, SizeNode};
+ return MDNode::get(AccessType->getContext(), Ops);
+ }
+
Metadata *Ops[] = {const_cast<MDNode*>(AccessType),
- const_cast<MDNode*>(AccessType), ImmutabilityFlag};
+ const_cast<MDNode*>(AccessType),
+ OffsetNode};
return MDNode::get(AccessType->getContext(), Ops);
}
+static bool hasField(TBAAStructTypeNode BaseType,
+ TBAAStructTypeNode FieldType) {
+ for (unsigned I = 0, E = BaseType.getNumFields(); I != E; ++I) {
+ TBAAStructTypeNode T = BaseType.getFieldType(I);
+ if (T == FieldType || hasField(T, FieldType))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if for two given accesses, one of the accessed objects may be a
+/// subobject of the other. The \p BaseTag and \p SubobjectTag parameters
+/// describe the accesses to the base object and the subobject respectively.
+/// \p CommonType must be the metadata node describing the common type of the
+/// accessed objects. On return, \p MayAlias is set to true iff these accesses
+/// may alias and \p Generic, if not null, points to the most generic access
+/// tag for the given two.
+static bool mayBeAccessToSubobjectOf(TBAAStructTagNode BaseTag,
+ TBAAStructTagNode SubobjectTag,
+ const MDNode *CommonType,
+ const MDNode **GenericTag,
+ bool &MayAlias) {
+ // If the base object is of the least common type, then this may be an access
+ // to its subobject.
+ if (BaseTag.getAccessType() == BaseTag.getBaseType() &&
+ BaseTag.getAccessType() == CommonType) {
+ if (GenericTag)
+ *GenericTag = createAccessTag(CommonType);
+ MayAlias = true;
+ return true;
+ }
+
+ // If the access to the base object is through a field of the subobject's
+ // type, then this may be an access to that field. To check for that we start
+ // from the base type, follow the edge with the correct offset in the type DAG
+ // and adjust the offset until we reach the field type or until we reach the
+ // access type.
+ bool NewFormat = BaseTag.isNewFormat();
+ TBAAStructTypeNode BaseType(BaseTag.getBaseType());
+ uint64_t OffsetInBase = BaseTag.getOffset();
+
+ for (;;) {
+ // In the old format there is no distinction between fields and parent
+ // types, so in this case we consider all nodes up to the root.
+ if (!BaseType.getNode()) {
+ assert(!NewFormat && "Did not see access type in access path!");
+ break;
+ }
+
+ if (BaseType.getNode() == SubobjectTag.getBaseType()) {
+ bool SameMemberAccess = OffsetInBase == SubobjectTag.getOffset();
+ if (GenericTag) {
+ *GenericTag = SameMemberAccess ? SubobjectTag.getNode() :
+ createAccessTag(CommonType);
+ }
+ MayAlias = SameMemberAccess;
+ return true;
+ }
+
+ // With new-format nodes we stop at the access type.
+ if (NewFormat && BaseType.getNode() == BaseTag.getAccessType())
+ break;
+
+ // Follow the edge with the correct offset. Offset will be adjusted to
+ // be relative to the field type.
+ BaseType = BaseType.getField(OffsetInBase);
+ }
+
+ // If the base object has a direct or indirect field of the subobject's type,
+ // then this may be an access to that field. We need this to check now that
+ // we support aggregates as access types.
+ if (NewFormat) {
+ // TBAAStructTypeNode BaseAccessType(BaseTag.getAccessType());
+ TBAAStructTypeNode FieldType(SubobjectTag.getBaseType());
+ if (hasField(BaseType, FieldType)) {
+ if (GenericTag)
+ *GenericTag = createAccessTag(CommonType);
+ MayAlias = true;
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// matchTags - Return true if the given couple of accesses are allowed to
/// overlap. If \arg GenericTag is not null, then on return it points to the
/// most generic access descriptor for the given two.
@@ -545,38 +678,26 @@ static bool matchAccessTags(const MDNode *A, const MDNode *B,
const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(),
TagB.getAccessType());
- // TODO: We need to check if AccessType of TagA encloses AccessType of
- // TagB to support aggregate AccessType. If yes, return true.
-
- // Climb the type DAG from base type of A to see if we reach base type of B.
- uint64_t OffsetA;
- if (findAccessType(TagA, TagB.getBaseType(), OffsetA)) {
- bool SameMemberAccess = OffsetA == TagB.getOffset();
+ // If the final access types have different roots, they're part of different
+ // potentially unrelated type systems, so we must be conservative.
+ if (!CommonType) {
if (GenericTag)
- *GenericTag = SameMemberAccess ? TagB.getNode() :
- createAccessTag(CommonType);
- return SameMemberAccess;
+ *GenericTag = nullptr;
+ return true;
}
- // Climb the type DAG from base type of B to see if we reach base type of A.
- uint64_t OffsetB;
- if (findAccessType(TagB, TagA.getBaseType(), OffsetB)) {
- bool SameMemberAccess = OffsetB == TagA.getOffset();
- if (GenericTag)
- *GenericTag = SameMemberAccess ? TagA.getNode() :
- createAccessTag(CommonType);
- return SameMemberAccess;
- }
+ // If one of the accessed objects may be a subobject of the other, then such
+ // accesses may alias.
+ bool MayAlias;
+ if (mayBeAccessToSubobjectOf(/* BaseTag= */ TagA, /* SubobjectTag= */ TagB,
+ CommonType, GenericTag, MayAlias) ||
+ mayBeAccessToSubobjectOf(/* BaseTag= */ TagB, /* SubobjectTag= */ TagA,
+ CommonType, GenericTag, MayAlias))
+ return MayAlias;
+ // Otherwise, we've proved there's no alias.
if (GenericTag)
*GenericTag = createAccessTag(CommonType);
-
- // If the final access types have different roots, they're part of different
- // potentially unrelated type systems, so we must be conservative.
- if (!CommonType)
- return true;
-
- // If they have the same root, then we've proved there's no alias.
return false;
}
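
The new-format ("size-aware") nodes that the wrappers above learn to read have a fixed operand layout: an access tag is {base type, access type, offset, size[, immutable]}, and a type node is {parent, size, id, then (field type, field offset, field size) triples}. The snippet below is a reading sketch derived from the operand indices used in this patch; NewFormatAccessTag and readNewFormatTag are illustrative names, not part of the patch, and callers are assumed to have checked isNewFormat() first, exactly as the wrappers do.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include <cstdint>
using namespace llvm;

struct NewFormatAccessTag {
  const MDNode *BaseType;
  const MDNode *AccessType;
  uint64_t Offset;
  uint64_t Size;
};

// Reads !{!base, !access, i64 offset, i64 size, ...} from a new-format tag.
static NewFormatAccessTag readNewFormatTag(const MDNode *Tag) {
  auto getInt = [Tag](unsigned Op) {
    return mdconst::extract<ConstantInt>(Tag->getOperand(Op))->getZExtValue();
  };
  return {cast<MDNode>(Tag->getOperand(0)), cast<MDNode>(Tag->getOperand(1)),
          getInt(2), getInt(3)};
}
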
diff --git a/contrib/llvm/lib/Analysis/ValueTracking.cpp b/contrib/llvm/lib/Analysis/ValueTracking.cpp
index 46ac3f451f81..04a7b73c22bf 100644
--- a/contrib/llvm/lib/Analysis/ValueTracking.cpp
+++ b/contrib/llvm/lib/Analysis/ValueTracking.cpp
@@ -89,7 +89,7 @@ static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
if (unsigned BitWidth = Ty->getScalarSizeInBits())
return BitWidth;
- return DL.getPointerTypeSizeInBits(Ty);
+ return DL.getIndexTypeSizeInBits(Ty);
}
namespace {
@@ -190,6 +190,14 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
"LHS and RHS should have the same type");
assert(LHS->getType()->isIntOrIntVectorTy() &&
"LHS and RHS should be integers");
+ // Look for an inverted mask: (X & ~M) op (Y & M).
+ Value *M;
+ if (match(LHS, m_c_And(m_Not(m_Value(M)), m_Value())) &&
+ match(RHS, m_c_And(m_Specific(M), m_Value())))
+ return true;
+ if (match(RHS, m_c_And(m_Not(m_Value(M)), m_Value())) &&
+ match(LHS, m_c_And(m_Specific(M), m_Value())))
+ return true;
IntegerType *IT = cast<IntegerType>(LHS->getType()->getScalarType());
KnownBits LHSKnown(IT->getBitWidth());
KnownBits RHSKnown(IT->getBitWidth());
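
The inverted-mask case added in the hunk above holds because every bit position lies in exactly one of M and ~M, so (X & ~M) and (Y & M) can never both have the same bit set; that is what lets callers treat such an add as an or. A brute-force check over 8-bit values, for illustration only:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned M = 0x0F;              // arbitrary mask: the low nibble
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y) {
      unsigned A = X & ~M;              // bits only outside M
      unsigned B = Y & M;               // bits only inside M
      assert((A & B) == 0);             // no common bits set
      assert((A + B) == (A | B));       // hence add behaves like or
    }
  return 0;
}
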
@@ -493,6 +501,7 @@ bool llvm::isAssumeLikeIntrinsic(const Instruction *I) {
case Intrinsic::sideeffect:
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
+ case Intrinsic::dbg_label:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::lifetime_start:
@@ -530,7 +539,7 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv,
if (Inv->getParent() != CxtI->getParent())
return false;
- // If we have a dom tree, then we now know that the assume doens't dominate
+ // If we have a dom tree, then we now know that the assume doesn't dominate
// the other instruction. If we don't have a dom tree then we can check if
// the assume is first in the BB.
if (!DT) {
@@ -574,7 +583,7 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
if (Q.isExcluded(I))
continue;
- // Warning: This loop can end up being somewhat performance sensetive.
+ // Warning: This loop can end up being somewhat performance sensitive.
// We're running this loop for once for each value queried resulting in a
// runtime of ~O(#assumes * #values).
@@ -816,6 +825,14 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ // If the RHS is known zero, then this assumption must be wrong (nothing
+ // is unsigned less than zero). Signal a conflict and get out of here.
+ if (RHSKnown.isZero()) {
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+ break;
+ }
+
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
@@ -848,7 +865,7 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
/// Compute known bits from a shift operator, including those with a
/// non-constant shift amount. Known is the output of this function. Known2 is a
/// pre-allocated temporary with the same bit width as Known. KZF and KOF are
-/// operator-specific functors that, given the known-zero or known-one bits
+/// operator-specific functions that, given the known-zero or known-one bits
/// respectively, and a shift amount, compute the implied known-zero or
/// known-one bits of the shift operator's result respectively for that shift
/// amount. The results from calling KZF and KOF are conservatively combined for
@@ -966,12 +983,9 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// matching the form add(x, add(x, y)) where y is odd.
// TODO: This could be generalized to clearing any bit set in y where the
// following bit is known to be unset in y.
- Value *Y = nullptr;
+ Value *X = nullptr, *Y = nullptr;
if (!Known.Zero[0] && !Known.One[0] &&
- (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)),
- m_Value(Y))) ||
- match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
- m_Value(Y))))) {
+ match(I, m_c_BinOp(m_Value(X), m_Add(m_Deferred(X), m_Value(Y))))) {
Known2.resetAll();
computeKnownBits(Y, Known2, Depth + 1, Q);
if (Known2.countMinTrailingOnes() > 0)
@@ -1064,6 +1078,12 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// leading zero bits.
MaxHighZeros =
std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
+ } else if (SPF == SPF_ABS) {
+ // RHS from matchSelectPattern returns the negation part of abs pattern.
+ // If the negate has an NSW flag we can assume the sign bit of the result
+ // will be 0 because that makes abs(INT_MIN) undefined.
+ if (cast<Instruction>(RHS)->hasNoSignedWrap())
+ MaxHighZeros = 1;
}
// Only known if known in both the LHS and RHS.
@@ -1093,7 +1113,10 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
unsigned SrcBitWidth;
// Note that we handle pointer operands here because of inttoptr/ptrtoint
// which fall through here.
- SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType());
+ Type *ScalarTy = SrcTy->getScalarType();
+ SrcBitWidth = ScalarTy->isPointerTy() ?
+ Q.DL.getIndexTypeSizeInBits(ScalarTy) :
+ Q.DL.getTypeSizeInBits(ScalarTy);
assert(SrcBitWidth && "SrcBitWidth can't be zero");
Known = Known.zextOrTrunc(SrcBitWidth);
@@ -1106,7 +1129,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
}
case Instruction::BitCast: {
Type *SrcTy = I->getOperand(0)->getType();
- if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+ if (SrcTy->isIntOrPtrTy() &&
// TODO: For now, not handling conversions like:
// (bitcast i64 %x to <2 x i32>)
!I->getType()->isVectorTy()) {
@@ -1547,9 +1570,13 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
assert((V->getType()->isIntOrIntVectorTy(BitWidth) ||
V->getType()->isPtrOrPtrVectorTy()) &&
"Not integer or pointer type!");
- assert(Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth &&
- "V and Known should have same BitWidth");
+
+ Type *ScalarTy = V->getType()->getScalarType();
+ unsigned ExpectedWidth = ScalarTy->isPointerTy() ?
+ Q.DL.getIndexTypeSizeInBits(ScalarTy) : Q.DL.getTypeSizeInBits(ScalarTy);
+ assert(ExpectedWidth == BitWidth && "V and Known should have same BitWidth");
(void)BitWidth;
+ (void)ExpectedWidth;
const APInt *C;
if (match(V, m_APInt(C))) {
@@ -1646,14 +1673,11 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
const Query &Q) {
assert(Depth <= MaxDepth && "Limit Search Depth");
- if (const Constant *C = dyn_cast<Constant>(V)) {
- if (C->isNullValue())
- return OrZero;
-
- const APInt *ConstIntOrConstSplatInt;
- if (match(C, m_APInt(ConstIntOrConstSplatInt)))
- return ConstIntOrConstSplatInt->isPowerOf2();
- }
+ // Attempt to match against constants.
+ if (OrZero && match(V, m_Power2OrZero()))
+ return true;
+ if (match(V, m_Power2()))
+ return true;
// 1 << X is clearly a power of two if the one is not shifted off the end. If
// it is shifted off the end then the result is undefined.
@@ -1737,7 +1761,7 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
return false;
}
-/// \brief Test whether a GEP's result is known to be non-null.
+/// Test whether a GEP's result is known to be non-null.
///
/// Uses properties inherent in a GEP to try to determine whether it is known
/// to be non-null.
@@ -1745,7 +1769,12 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
/// Currently this routine does not support vector GEPs.
static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth,
const Query &Q) {
- if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0)
+ const Function *F = nullptr;
+ if (const Instruction *I = dyn_cast<Instruction>(GEP))
+ F = I->getFunction();
+
+ if (!GEP->isInBounds() ||
+ NullPointerIsDefined(F, GEP->getPointerAddressSpace()))
return false;
// FIXME: Support vector-GEPs.
@@ -1919,6 +1948,10 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
}
}
+ // Some of the tests below are recursive, so bail out if we hit the limit.
+ if (Depth++ >= MaxDepth)
+ return false;
+
// Check for pointer simplifications.
if (V->getType()->isPointerTy()) {
// Alloca never returns null, malloc might.
@@ -1935,14 +1968,14 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (LI->getMetadata(LLVMContext::MD_nonnull))
return true;
- if (auto CS = ImmutableCallSite(V))
+ if (auto CS = ImmutableCallSite(V)) {
if (CS.isReturnNonNull())
return true;
+ if (const auto *RP = getArgumentAliasingToReturnedPointer(CS))
+ return isKnownNonZero(RP, Depth, Q);
+ }
}
- // The remaining tests are all recursive, so bail out if we hit the limit.
- if (Depth++ >= MaxDepth)
- return false;
// Check for recursive pointer simplifications.
if (V->getType()->isPointerTy()) {
@@ -2180,7 +2213,7 @@ static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
/// other, so we return 3. For vectors, return the number of sign bits for the
-/// vector element with the mininum number of known sign bits.
+/// vector element with the minimum number of known sign bits.
static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
const Query &Q) {
assert(Depth <= MaxDepth && "Limit Search Depth");
@@ -2189,7 +2222,11 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// in V, so for undef we have to conservatively return 1. We don't have the
// same behavior for poison though -- that's a FIXME today.
- unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
+ Type *ScalarTy = V->getType()->getScalarType();
+ unsigned TyBits = ScalarTy->isPointerTy() ?
+ Q.DL.getIndexTypeSizeInBits(ScalarTy) :
+ Q.DL.getTypeSizeInBits(ScalarTy);
+
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
@@ -2300,7 +2337,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
case Instruction::Select:
Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
- if (Tmp == 1) return 1; // Early out.
+ if (Tmp == 1) break;
Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q);
return std::min(Tmp, Tmp2);
@@ -2308,7 +2345,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
- if (Tmp == 1) return 1; // Early out.
+ if (Tmp == 1) break;
// Special case decrementing a value (ADD X, -1):
if (const auto *CRHS = dyn_cast<Constant>(U->getOperand(1)))
@@ -2328,12 +2365,12 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
}
Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
- if (Tmp2 == 1) return 1;
+ if (Tmp2 == 1) break;
return std::min(Tmp, Tmp2)-1;
case Instruction::Sub:
Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
- if (Tmp2 == 1) return 1;
+ if (Tmp2 == 1) break;
// Handle NEG.
if (const auto *CLHS = dyn_cast<Constant>(U->getOperand(0)))
@@ -2356,15 +2393,15 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// Sub can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
- if (Tmp == 1) return 1; // Early out.
+ if (Tmp == 1) break;
return std::min(Tmp, Tmp2)-1;
case Instruction::Mul: {
// The output of the Mul can be at most twice the valid bits in the inputs.
unsigned SignBitsOp0 = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
- if (SignBitsOp0 == 1) return 1; // Early out.
+ if (SignBitsOp0 == 1) break;
unsigned SignBitsOp1 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
- if (SignBitsOp1 == 1) return 1;
+ if (SignBitsOp1 == 1) break;
unsigned OutValidBits =
(TyBits - SignBitsOp0 + 1) + (TyBits - SignBitsOp1 + 1);
return OutValidBits > TyBits ? 1 : TyBits - OutValidBits + 1;
@@ -2671,7 +2708,7 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
return true;
// (fadd x, 0.0) is guaranteed to return +0.0, not -0.0.
- if (match(Op, m_FAdd(m_Value(), m_Zero())))
+ if (match(Op, m_FAdd(m_Value(), m_PosZeroFP())))
return true;
// sitofp and uitofp turn into +0.0 for zero.
@@ -2712,6 +2749,24 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
(!SignBitOnly && CFP->getValueAPF().isZero());
}
+ // Handle vector of constants.
+ if (auto *CV = dyn_cast<Constant>(V)) {
+ if (CV->getType()->isVectorTy()) {
+ unsigned NumElts = CV->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
+ if (!CFP)
+ return false;
+ if (CFP->getValueAPF().isNegative() &&
+ (SignBitOnly || !CFP->getValueAPF().isZero()))
+ return false;
+ }
+
+ // All non-negative ConstantFPs.
+ return true;
+ }
+ }
+
if (Depth == MaxDepth)
return false; // Limit search depth.
@@ -2749,6 +2804,12 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
// Widening/narrowing never change sign.
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1);
+ case Instruction::ExtractElement:
+ // Look through extract element. At the moment we keep this simple and skip
+ // tracking the specific element. But at least we might find information
+ // valid for all elements of the vector.
+ return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
+ Depth + 1);
case Instruction::Call:
const auto *CI = cast<CallInst>(I);
Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
@@ -2963,7 +3024,7 @@ static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType,
if (!V)
return nullptr;
- // Insert the value in the new (sub) aggregrate
+ // Insert the value in the new (sub) aggregate
return InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip),
"tmp", InsertBefore);
}
@@ -2992,9 +3053,9 @@ static Value *BuildSubAggregate(Value *From, ArrayRef<unsigned> idx_range,
return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore);
}
-/// Given an aggregrate and an sequence of indices, see if
-/// the scalar value indexed is already around as a register, for example if it
-/// were inserted directly into the aggregrate.
+/// Given an aggregate and a sequence of indices, see if the scalar value
+/// indexed is already around as a register, for example if it was inserted
+/// directly into the aggregate.
///
/// If InsertBefore is not null, this function will duplicate (modified)
/// insertvalues when a part of a nested struct is extracted.
@@ -3086,7 +3147,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
/// pointer plus a constant offset. Return the base and offset to the caller.
Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
const DataLayout &DL) {
- unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType());
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(Ptr->getType());
APInt ByteOffset(BitWidth, 0);
// We walk up the defs but use a visited set to handle unreachable code. In
@@ -3104,7 +3165,7 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
// means when we construct GEPOffset, we need to use the size
// of GEP's pointer type rather than the size of the original
// pointer type.
- APInt GEPOffset(DL.getPointerTypeSizeInBits(Ptr->getType()), 0);
+ APInt GEPOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
if (!GEP->accumulateConstantOffset(DL, GEPOffset))
break;
@@ -3326,7 +3387,8 @@ static uint64_t GetStringLengthH(const Value *V,
/// If we can compute the length of the string pointed to by
/// the specified pointer, return 'len+1'. If we can't, return 0.
uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) {
- if (!V->getType()->isPointerTy()) return 0;
+ if (!V->getType()->isPointerTy())
+ return 0;
SmallPtrSet<const PHINode*, 32> PHIs;
uint64_t Len = GetStringLengthH(V, PHIs, CharSize);
@@ -3335,7 +3397,24 @@ uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) {
return Len == ~0ULL ? 1 : Len;
}
-/// \brief \p PN defines a loop-variant pointer to an object. Check if the
+const Value *llvm::getArgumentAliasingToReturnedPointer(ImmutableCallSite CS) {
+ assert(CS &&
+ "getArgumentAliasingToReturnedPointer only works on nonnull CallSite");
+ if (const Value *RV = CS.getReturnedArgOperand())
+ return RV;
+ // This can be used only as an aliasing property.
+ if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CS))
+ return CS.getArgOperand(0);
+ return nullptr;
+}
+
+bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+ ImmutableCallSite CS) {
+ return CS.getIntrinsicID() == Intrinsic::launder_invariant_group ||
+ CS.getIntrinsicID() == Intrinsic::strip_invariant_group;
+}
+
+/// \p PN defines a loop-variant pointer to an object. Check if the
/// previous iteration of the loop was referring to the same object as \p PN.
static bool isSameUnderlyingObjectInLoop(const PHINode *PN,
const LoopInfo *LI) {
@@ -3380,11 +3459,21 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
// An alloca can't be further simplified.
return V;
} else {
- if (auto CS = CallSite(V))
- if (Value *RV = CS.getReturnedArgOperand()) {
- V = RV;
+ if (auto CS = CallSite(V)) {
+ // CaptureTracking knows about special capturing properties of some
+ // intrinsics like launder.invariant.group that can't be expressed with
+ // attributes, such as returning a pointer that aliases an argument.
+ // Because an analysis may assume that a nocapture pointer is never
+ // returned from such an intrinsic (the function would have to be marked
+ // with the 'returned' attribute), it is crucial to use this helper so we
+ // stay in sync with CaptureTracking. Not using it may cause
+ // miscompilations where two aliasing pointers are assumed to be
+ // noalias.
+ if (auto *RP = getArgumentAliasingToReturnedPointer(CS)) {
+ V = RP;
continue;
}
+ }
// See if InstructionSimplify knows any relevant tricks.
if (Instruction *I = dyn_cast<Instruction>(V))
@@ -3658,6 +3747,48 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
return OverflowResult::MayOverflow;
}
+OverflowResult llvm::computeOverflowForSignedMul(const Value *LHS,
+ const Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ // Multiplying n * m significant bits yields a result of n + m significant
+ // bits. If the total number of significant bits does not exceed the
+ // result bit width (minus 1), there is no overflow.
+ // This means if we have enough leading sign bits in the operands
+ // we can guarantee that the result does not overflow.
+ // Ref: "Hacker's Delight" by Henry Warren
+ unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+
+ // Note that underestimating the number of sign bits gives a more
+ // conservative answer.
+ unsigned SignBits = ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) +
+ ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT);
+
+ // First handle the easy case: if we have enough sign bits there's
+ // definitely no overflow.
+ if (SignBits > BitWidth + 1)
+ return OverflowResult::NeverOverflows;
+
+ // There are two ambiguous cases where there can be no overflow:
+ // SignBits == BitWidth + 1 and
+ // SignBits == BitWidth
+ // The second case is difficult to check; therefore we only handle the
+ // first case.
+ if (SignBits == BitWidth + 1) {
+ // It overflows only when both arguments are negative and the true
+ // product is exactly the minimum negative number.
+ // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
+ // For simplicity we just check if at least one side is not negative.
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
+ return OverflowResult::NeverOverflows;
+ }
+ return OverflowResult::MayOverflow;
+}
+
OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
const Value *RHS,
const DataLayout &DL,
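The sign-bit rule used in computeOverflowForSignedMul can be sanity-checked in isolation. The sketch below (plain C++, not LLVM code; numSignBits is a hypothetical stand-in for ComputeNumSignBits) specializes it to 16-bit multiplication and exercises the 0xff00 * 0xff80 example from the comment.

#include <cstdint>
#include <cstdio>

// Count how many of the top bits of a 16-bit value are copies of the sign bit
// (including the sign bit itself), mirroring the ComputeNumSignBits convention.
static unsigned numSignBits(int16_t V) {
  uint16_t U = static_cast<uint16_t>(V);
  unsigned Sign = (U >> 15) & 1u;
  unsigned N = 0;
  for (int Bit = 15; Bit >= 0 && ((U >> Bit) & 1u) == Sign; --Bit)
    ++N;
  return N;
}

// The sign-bit overflow rule, specialized to i16 multiplication.
static bool mulI16NeverOverflows(int16_t L, int16_t R) {
  const unsigned BitWidth = 16;
  unsigned SignBits = numSignBits(L) + numSignBits(R);
  if (SignBits > BitWidth + 1)
    return true;                // enough sign bits: the product always fits
  if (SignBits == BitWidth + 1)
    return L >= 0 || R >= 0;    // ambiguous case: safe unless both are negative
  return false;                 // conservatively: may overflow
}

int main() {
  // 17 sign bits, both negative: (-256) * (-128) = 32768, which overflows i16.
  std::printf("%d\n", mulI16NeverOverflows(-256, -128)); // 0
  // 17 sign bits, one operand non-negative: 255 * (-128) = -32640, which fits.
  std::printf("%d\n", mulI16NeverOverflows(255, -128));  // 1
  return 0;
}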
@@ -3684,7 +3815,7 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
return OverflowResult::MayOverflow;
}
-/// \brief Return true if we can prove that adding the two values of the
+/// Return true if we can prove that adding the two values of the
/// knownbits will not overflow.
/// Otherwise return false.
static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
@@ -3787,6 +3918,47 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
return OverflowResult::MayOverflow;
}
+OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
+ return OverflowResult::NeverOverflows;
+
+ return OverflowResult::MayOverflow;
+}
+
+OverflowResult llvm::computeOverflowForSignedSub(const Value *LHS,
+ const Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ // If LHS and RHS each have at least two sign bits, the subtraction
+ // cannot overflow.
+ if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
+ ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
+ return OverflowResult::NeverOverflows;
+
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, 0, AC, CxtI, DT);
+
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, 0, AC, CxtI, DT);
+
+ // Subtraction of two 2's complement numbers having identical signs will
+ // never overflow.
+ if ((LHSKnown.isNegative() && RHSKnown.isNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()))
+ return OverflowResult::NeverOverflows;
+
+ // TODO: implement logic similar to checkRippleForAdd
+ return OverflowResult::MayOverflow;
+}
+
bool llvm::isOverflowIntrinsicNoWrap(const IntrinsicInst *II,
const DominatorTree &DT) {
#ifndef NDEBUG
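The identical-signs claim in computeOverflowForSignedSub can be verified exhaustively at a small bit width; a throwaway check (plain C++, not part of the patch) over all i8 pairs:

#include <cstdio>

int main() {
  // If L and R have the same sign, L - R always stays within [-128, 127].
  for (int L = -128; L <= 127; ++L)
    for (int R = -128; R <= 127; ++R) {
      bool SameSign = (L < 0) == (R < 0);
      int Diff = L - R;                     // exact difference in a wider type
      if (SameSign && (Diff < -128 || Diff > 127)) {
        std::printf("counterexample: %d - %d = %d\n", L, R, Diff);
        return 1;
      }
    }
  std::printf("same-sign i8 subtraction never overflows\n");
  return 0;
}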
@@ -3928,6 +4100,15 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
return true;
}
+bool llvm::isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB) {
+ // TODO: This is slightly conservative for invoke instructions since exiting
+ // via an exception *is* normal control for them.
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
+ return false;
+ return true;
+}
+
bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
const Loop *L) {
// The loop header is guaranteed to be executed for every iteration.
@@ -4180,7 +4361,9 @@ static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred,
if (L.Flavor != R.Flavor)
return {SPF_UNKNOWN, SPNB_NA, false};
- // Match the compare to the min/max operations of the select operands.
+ // We have something like: x Pred y ? min(a, b) : min(c, d).
+ // Try to match the compare to the min/max operations of the select operands.
+ // First, make sure we have the right compare predicate.
switch (L.Flavor) {
case SPF_SMIN:
if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) {
@@ -4218,21 +4401,38 @@ static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred,
return {SPF_UNKNOWN, SPNB_NA, false};
}
- // a pred c ? m(a, b) : m(c, b) --> m(m(a, b), m(c, b))
- if (CmpLHS == A && CmpRHS == C && D == B)
- return {L.Flavor, SPNB_NA, false};
+ // If there is a common operand in the already matched min/max and the other
+ // min/max operands match the compare operands (either directly or inverted),
+ // then this is min/max of the same flavor.
+ // a pred c ? m(a, b) : m(c, b) --> m(m(a, b), m(c, b))
+ // ~c pred ~a ? m(a, b) : m(c, b) --> m(m(a, b), m(c, b))
+ if (D == B) {
+ if ((CmpLHS == A && CmpRHS == C) || (match(C, m_Not(m_Specific(CmpLHS))) &&
+ match(A, m_Not(m_Specific(CmpRHS)))))
+ return {L.Flavor, SPNB_NA, false};
+ }
// a pred d ? m(a, b) : m(b, d) --> m(m(a, b), m(b, d))
- if (CmpLHS == A && CmpRHS == D && C == B)
- return {L.Flavor, SPNB_NA, false};
-
+ // ~d pred ~a ? m(a, b) : m(b, d) --> m(m(a, b), m(b, d))
+ if (C == B) {
+ if ((CmpLHS == A && CmpRHS == D) || (match(D, m_Not(m_Specific(CmpLHS))) &&
+ match(A, m_Not(m_Specific(CmpRHS)))))
+ return {L.Flavor, SPNB_NA, false};
+ }
// b pred c ? m(a, b) : m(c, a) --> m(m(a, b), m(c, a))
- if (CmpLHS == B && CmpRHS == C && D == A)
- return {L.Flavor, SPNB_NA, false};
-
+ // ~c pred ~b ? m(a, b) : m(c, a) --> m(m(a, b), m(c, a))
+ if (D == A) {
+ if ((CmpLHS == B && CmpRHS == C) || (match(C, m_Not(m_Specific(CmpLHS))) &&
+ match(B, m_Not(m_Specific(CmpRHS)))))
+ return {L.Flavor, SPNB_NA, false};
+ }
// b pred d ? m(a, b) : m(a, d) --> m(m(a, b), m(a, d))
- if (CmpLHS == B && CmpRHS == D && C == A)
- return {L.Flavor, SPNB_NA, false};
+ // ~d pred ~b ? m(a, b) : m(a, d) --> m(m(a, b), m(a, d))
+ if (C == A) {
+ if ((CmpLHS == B && CmpRHS == D) || (match(D, m_Not(m_Specific(CmpLHS))) &&
+ match(B, m_Not(m_Specific(CmpRHS)))))
+ return {L.Flavor, SPNB_NA, false};
+ }
return {SPF_UNKNOWN, SPNB_NA, false};
}
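The folds listed above are ordinary min/max identities. As a sanity check (standalone C++, using std::min as a stand-in for the matched flavor), the b == d case can be verified over a small range:

#include <algorithm>
#include <cstdio>

int main() {
  // (a < c ? min(a, b) : min(c, b)) == min(min(a, b), min(c, b))
  for (int a = -4; a <= 4; ++a)
    for (int b = -4; b <= 4; ++b)
      for (int c = -4; c <= 4; ++c) {
        int Select = a < c ? std::min(a, b) : std::min(c, b);
        int Folded = std::min(std::min(a, b), std::min(c, b));
        if (Select != Folded) {
          std::printf("mismatch at a=%d b=%d c=%d\n", a, b, c);
          return 1;
        }
      }
  std::printf("select of mins matches min of mins on all samples\n");
  return 0;
}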
@@ -4311,6 +4511,27 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
return {SPF_UNKNOWN, SPNB_NA, false};
}
+bool llvm::isKnownNegation(const Value *X, const Value *Y, bool NeedNSW) {
+ assert(X && Y && "Invalid operand");
+
+ // X = sub (0, Y) || X = sub nsw (0, Y)
+ if ((!NeedNSW && match(X, m_Sub(m_ZeroInt(), m_Specific(Y)))) ||
+ (NeedNSW && match(X, m_NSWSub(m_ZeroInt(), m_Specific(Y)))))
+ return true;
+
+ // Y = sub (0, X) || Y = sub nsw (0, X)
+ if ((!NeedNSW && match(Y, m_Sub(m_ZeroInt(), m_Specific(X)))) ||
+ (NeedNSW && match(Y, m_NSWSub(m_ZeroInt(), m_Specific(X)))))
+ return true;
+
+ // X = sub (A, B), Y = sub (B, A) || X = sub nsw (A, B), Y = sub nsw (B, A)
+ Value *A, *B;
+ return (!NeedNSW && (match(X, m_Sub(m_Value(A), m_Value(B))) &&
+ match(Y, m_Sub(m_Specific(B), m_Specific(A))))) ||
+ (NeedNSW && (match(X, m_NSWSub(m_Value(A), m_Value(B))) &&
+ match(Y, m_NSWSub(m_Specific(B), m_Specific(A)))));
+}
+
static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
FastMathFlags FMF,
Value *CmpLHS, Value *CmpRHS,
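A minimal illustration (plain C++, not from the patch) of why the sub(A, B) / sub(B, A) case of isKnownNegation needs no nsw flags: in modular arithmetic (A - B) + (B - A) is always zero, so the two differences are exact negations of each other even when the subtractions wrap.

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t A = 0x05, B = 0x90;               // both 8-bit subtractions wrap
  uint8_t X = static_cast<uint8_t>(A - B);  // X = A - B (mod 256)
  uint8_t Y = static_cast<uint8_t>(B - A);  // Y = B - A (mod 256)
  // X + Y is 0 mod 256, i.e. X == -Y as 8-bit two's complement values.
  std::printf("X=0x%02x Y=0x%02x X+Y=0x%02x\n", X, Y,
              static_cast<uint8_t>(X + Y));
  return 0;
}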
@@ -4409,25 +4630,49 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered};
}
}
-
- const APInt *C1;
- if (match(CmpRHS, m_APInt(C1))) {
- if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) ||
- (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) {
-
- // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X
- // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X
- if (Pred == ICmpInst::ICMP_SGT &&
- (C1->isNullValue() || C1->isAllOnesValue())) {
- return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
- }
-
- // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X
- // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X
- if (Pred == ICmpInst::ICMP_SLT &&
- (C1->isNullValue() || C1->isOneValue())) {
- return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
- }
+
+ if (isKnownNegation(TrueVal, FalseVal)) {
+ // Sign-extending LHS does not change its sign, so TrueVal/FalseVal can
+ // match against either LHS or sext(LHS).
+ auto MaybeSExtCmpLHS =
+ m_CombineOr(m_Specific(CmpLHS), m_SExt(m_Specific(CmpLHS)));
+ auto ZeroOrAllOnes = m_CombineOr(m_ZeroInt(), m_AllOnes());
+ auto ZeroOrOne = m_CombineOr(m_ZeroInt(), m_One());
+ if (match(TrueVal, MaybeSExtCmpLHS)) {
+ // Set the return values. If the compare uses the negated value (-X >s 0),
+ // swap the return values because the negated value is always 'RHS'.
+ LHS = TrueVal;
+ RHS = FalseVal;
+ if (match(CmpLHS, m_Neg(m_Specific(FalseVal))))
+ std::swap(LHS, RHS);
+
+ // (X >s 0) ? X : -X or (X >s -1) ? X : -X --> ABS(X)
+ // (-X >s 0) ? -X : X or (-X >s -1) ? -X : X --> ABS(X)
+ if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, ZeroOrAllOnes))
+ return {SPF_ABS, SPNB_NA, false};
+
+ // (X <s 0) ? X : -X or (X <s 1) ? X : -X --> NABS(X)
+ // (-X <s 0) ? -X : X or (-X <s 1) ? -X : X --> NABS(X)
+ if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, ZeroOrOne))
+ return {SPF_NABS, SPNB_NA, false};
+ }
+ else if (match(FalseVal, MaybeSExtCmpLHS)) {
+ // Set the return values. If the compare uses the negated value (-X >s 0),
+ // swap the return values because the negated value is always 'RHS'.
+ LHS = FalseVal;
+ RHS = TrueVal;
+ if (match(CmpLHS, m_Neg(m_Specific(TrueVal))))
+ std::swap(LHS, RHS);
+
+ // (X >s 0) ? -X : X or (X >s -1) ? -X : X --> NABS(X)
+ // (-X >s 0) ? X : -X or (-X >s -1) ? X : -X --> NABS(X)
+ if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, ZeroOrAllOnes))
+ return {SPF_NABS, SPNB_NA, false};
+
+ // (X <s 0) ? -X : X or (X <s 1) ? -X : X --> ABS(X)
+ // (-X <s 0) ? X : -X or (-X <s 1) ? X : -X --> ABS(X)
+ if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, ZeroOrOne))
+ return {SPF_ABS, SPNB_NA, false};
}
}
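The equivalences spelled out in the comments above can be spot-checked directly (standalone C++, staying away from INT_MIN, where the negation itself would overflow):

#include <cstdio>
#include <cstdlib>

int main() {
  // The compare-against-0/-1 (for >s) and 0/1 (for <s) forms all compute abs().
  for (int X = -100; X <= 100; ++X) {
    int A1 = X > 0 ? X : -X;   // (X >s 0)  ? X : -X
    int A2 = X > -1 ? X : -X;  // (X >s -1) ? X : -X
    int A3 = X < 1 ? -X : X;   // (X <s 1)  ? -X : X
    if (A1 != std::abs(X) || A2 != std::abs(X) || A3 != std::abs(X)) {
      std::printf("mismatch at X=%d\n", X);
      return 1;
    }
  }
  std::printf("all listed select forms agree with abs()\n");
  return 0;
}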
@@ -4449,7 +4694,7 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
///
/// The function processes the case when type of true and false values of a
/// select instruction differs from type of the cmp instruction operands because
-/// of a cast instructon. The function checks if it is legal to move the cast
+/// of a cast instruction. The function checks if it is legal to move the cast
/// operation after "select". If yes, it returns the new second value of
/// "select" (with the assumption that cast is moved):
/// 1. As operand of cast instruction when both values of "select" are same cast
@@ -4602,6 +4847,30 @@ SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
LHS, RHS, Depth);
}
+CmpInst::Predicate llvm::getMinMaxPred(SelectPatternFlavor SPF, bool Ordered) {
+ if (SPF == SPF_SMIN) return ICmpInst::ICMP_SLT;
+ if (SPF == SPF_UMIN) return ICmpInst::ICMP_ULT;
+ if (SPF == SPF_SMAX) return ICmpInst::ICMP_SGT;
+ if (SPF == SPF_UMAX) return ICmpInst::ICMP_UGT;
+ if (SPF == SPF_FMINNUM)
+ return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT;
+ if (SPF == SPF_FMAXNUM)
+ return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT;
+ llvm_unreachable("unhandled!");
+}
+
+SelectPatternFlavor llvm::getInverseMinMaxFlavor(SelectPatternFlavor SPF) {
+ if (SPF == SPF_SMIN) return SPF_SMAX;
+ if (SPF == SPF_UMIN) return SPF_UMAX;
+ if (SPF == SPF_SMAX) return SPF_SMIN;
+ if (SPF == SPF_UMAX) return SPF_UMIN;
+ llvm_unreachable("unhandled!");
+}
+
+CmpInst::Predicate llvm::getInverseMinMaxPred(SelectPatternFlavor SPF) {
+ return getMinMaxPred(getInverseMinMaxFlavor(SPF));
+}
+
/// Return true if "icmp Pred LHS RHS" is always true.
static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
const Value *RHS, const DataLayout &DL,
diff --git a/contrib/llvm/lib/Analysis/VectorUtils.cpp b/contrib/llvm/lib/Analysis/VectorUtils.cpp
index 2becfbfe8a8d..d73d24736439 100644
--- a/contrib/llvm/lib/Analysis/VectorUtils.cpp
+++ b/contrib/llvm/lib/Analysis/VectorUtils.cpp
@@ -28,7 +28,7 @@
using namespace llvm;
using namespace llvm::PatternMatch;
-/// \brief Identify if the intrinsic is trivially vectorizable.
+/// Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all
/// scalars for the scalar form of the intrinsic and all vectors for
/// the vector form of the intrinsic.
@@ -67,7 +67,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
}
}
-/// \brief Identifies if the intrinsic has a scalar operand. It check for
+/// Identifies if the intrinsic has a scalar operand. It checks for
/// ctlz, cttz and powi special intrinsics whose argument is scalar.
bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
unsigned ScalarOpdIdx) {
@@ -81,7 +81,7 @@ bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
}
}
-/// \brief Returns intrinsic ID for call.
+/// Returns intrinsic ID for call.
/// For the input call instruction it finds the matching intrinsic and returns
/// its ID; if none is found, it returns not_intrinsic.
Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
@@ -97,7 +97,7 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}
-/// \brief Find the operand of the GEP that should be checked for consecutive
+/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) {
@@ -121,7 +121,7 @@ unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) {
return LastOperand;
}
-/// \brief If the argument is a GEP, then returns the operand identified by
+/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
Value *llvm::stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
@@ -140,7 +140,7 @@ Value *llvm::stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
return GEP->getOperand(InductionOperand);
}
-/// \brief If a value has only one user that is a CastInst, return it.
+/// If a value has only one user that is a CastInst, return it.
Value *llvm::getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
Value *UniqueCast = nullptr;
for (User *U : Ptr->users()) {
@@ -155,7 +155,7 @@ Value *llvm::getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
return UniqueCast;
}
-/// \brief Get the stride of a pointer access in a loop. Looks for symbolic
+/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
@@ -163,7 +163,7 @@ Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
return nullptr;
// Try to remove a gep instruction to make the pointer (actually index at this
- // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the
+ // point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing the
// pointer, otherwise, we are analyzing the index.
Value *OrigPtr = Ptr;
@@ -230,7 +230,7 @@ Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
return Stride;
}
-/// \brief Given a vector and an element number, see if the scalar value is
+/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
@@ -280,7 +280,7 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
return nullptr;
}
-/// \brief Get splat value if the input is a splat vector or return nullptr.
+/// Get splat value if the input is a splat vector or return nullptr.
/// This function is not fully general. It checks only 2 cases:
/// the input value is (1) a splat constant vector or (2) a sequence
/// of instructions that broadcast a single value into a vector.
diff --git a/contrib/llvm/lib/AsmParser/LLLexer.cpp b/contrib/llvm/lib/AsmParser/LLLexer.cpp
index d8be4ad42ad5..da9855ff630b 100644
--- a/contrib/llvm/lib/AsmParser/LLLexer.cpp
+++ b/contrib/llvm/lib/AsmParser/LLLexer.cpp
@@ -157,9 +157,10 @@ static const char *isLabelTail(const char *CurPtr) {
// Lexer definition.
//===----------------------------------------------------------------------===//
-LLLexer::LLLexer(StringRef StartBuf, SourceMgr &sm, SMDiagnostic &Err,
+LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
LLVMContext &C)
- : CurBuf(StartBuf), ErrorInfo(Err), SM(sm), Context(C), APFloatVal(0.0) {
+ : CurBuf(StartBuf), ErrorInfo(Err), SM(SM), Context(C), APFloatVal(0.0),
+ IgnoreColonInIdentifiers(false) {
CurPtr = CurBuf.begin();
}
@@ -219,6 +220,10 @@ lltok::Kind LLLexer::LexToken() {
SkipLineComment();
continue;
case '!': return LexExclaim();
+ case '^':
+ return LexCaret();
+ case ':':
+ return lltok::colon;
case '#': return LexHash();
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
@@ -328,6 +333,22 @@ bool LLLexer::ReadVarName() {
return false;
}
+// Lex an ID: [0-9]+. On success, the ID is stored in UIntVal and Token is
+// returned; otherwise the Error token is returned.
+lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
+ if (!isdigit(static_cast<unsigned char>(CurPtr[0])))
+ return lltok::Error;
+
+ for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+ /*empty*/;
+
+ uint64_t Val = atoull(TokStart + 1, CurPtr);
+ if ((unsigned)Val != Val)
+ Error("invalid value number (too large)!");
+ UIntVal = unsigned(Val);
+ return Token;
+}
+
lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
// Handle StringConstant: \"[^\"]*\"
if (CurPtr[0] == '"') {
@@ -357,17 +378,7 @@ lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
return Var;
// Handle VarID: [0-9]+
- if (isdigit(static_cast<unsigned char>(CurPtr[0]))) {
- for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
- /*empty*/;
-
- uint64_t Val = atoull(TokStart+1, CurPtr);
- if ((unsigned)Val != Val)
- Error("invalid value number (too large)!");
- UIntVal = unsigned(Val);
- return VarID;
- }
- return lltok::Error;
+ return LexUIntID(VarID);
}
/// Lex all tokens that start with a % character.
@@ -420,22 +431,18 @@ lltok::Kind LLLexer::LexExclaim() {
return lltok::exclaim;
}
+/// Lex all tokens that start with a ^ character.
+/// SummaryID ::= ^[0-9]+
+lltok::Kind LLLexer::LexCaret() {
+ // Handle SummaryID: ^[0-9]+
+ return LexUIntID(lltok::SummaryID);
+}
+
/// Lex all tokens that start with a # character.
/// AttrGrpID ::= #[0-9]+
lltok::Kind LLLexer::LexHash() {
// Handle AttrGrpID: #[0-9]+
- if (isdigit(static_cast<unsigned char>(CurPtr[0]))) {
- for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
- /*empty*/;
-
- uint64_t Val = atoull(TokStart+1, CurPtr);
- if ((unsigned)Val != Val)
- Error("invalid value number (too large)!");
- UIntVal = unsigned(Val);
- return lltok::AttrGrpID;
- }
-
- return lltok::Error;
+ return LexUIntID(lltok::AttrGrpID);
}
/// Lex a label, integer type, keyword, or hexadecimal integer constant.
@@ -457,8 +464,9 @@ lltok::Kind LLLexer::LexIdentifier() {
KeywordEnd = CurPtr;
}
- // If we stopped due to a colon, this really is a label.
- if (*CurPtr == ':') {
+ // If we stopped due to a colon, unless we were directed to ignore it,
+ // this really is a label.
+ if (!IgnoreColonInIdentifiers && *CurPtr == ':') {
StrVal.assign(StartChar-1, CurPtr++);
return lltok::LabelStr;
}
@@ -648,7 +656,9 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(nonnull);
KEYWORD(noredzone);
KEYWORD(noreturn);
+ KEYWORD(nocf_check);
KEYWORD(nounwind);
+ KEYWORD(optforfuzzing);
KEYWORD(optnone);
KEYWORD(optsize);
KEYWORD(readnone);
@@ -663,6 +673,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(sspstrong);
KEYWORD(strictfp);
KEYWORD(safestack);
+ KEYWORD(shadowcallstack);
KEYWORD(sanitize_address);
KEYWORD(sanitize_hwaddress);
KEYWORD(sanitize_thread);
@@ -708,6 +719,73 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(catch);
KEYWORD(filter);
+ // Summary index keywords.
+ KEYWORD(path);
+ KEYWORD(hash);
+ KEYWORD(gv);
+ KEYWORD(guid);
+ KEYWORD(name);
+ KEYWORD(summaries);
+ KEYWORD(flags);
+ KEYWORD(linkage);
+ KEYWORD(notEligibleToImport);
+ KEYWORD(live);
+ KEYWORD(dsoLocal);
+ KEYWORD(function);
+ KEYWORD(insts);
+ KEYWORD(funcFlags);
+ KEYWORD(readNone);
+ KEYWORD(readOnly);
+ KEYWORD(noRecurse);
+ KEYWORD(returnDoesNotAlias);
+ KEYWORD(calls);
+ KEYWORD(callee);
+ KEYWORD(hotness);
+ KEYWORD(unknown);
+ KEYWORD(hot);
+ KEYWORD(critical);
+ KEYWORD(relbf);
+ KEYWORD(variable);
+ KEYWORD(aliasee);
+ KEYWORD(refs);
+ KEYWORD(typeIdInfo);
+ KEYWORD(typeTests);
+ KEYWORD(typeTestAssumeVCalls);
+ KEYWORD(typeCheckedLoadVCalls);
+ KEYWORD(typeTestAssumeConstVCalls);
+ KEYWORD(typeCheckedLoadConstVCalls);
+ KEYWORD(vFuncId);
+ KEYWORD(offset);
+ KEYWORD(args);
+ KEYWORD(typeid);
+ KEYWORD(summary);
+ KEYWORD(typeTestRes);
+ KEYWORD(kind);
+ KEYWORD(unsat);
+ KEYWORD(byteArray);
+ KEYWORD(inline);
+ KEYWORD(single);
+ KEYWORD(allOnes);
+ KEYWORD(sizeM1BitWidth);
+ KEYWORD(alignLog2);
+ KEYWORD(sizeM1);
+ KEYWORD(bitMask);
+ KEYWORD(inlineBits);
+ KEYWORD(wpdResolutions);
+ KEYWORD(wpdRes);
+ KEYWORD(indir);
+ KEYWORD(singleImpl);
+ KEYWORD(branchFunnel);
+ KEYWORD(singleImplName);
+ KEYWORD(resByArg);
+ KEYWORD(byArg);
+ KEYWORD(uniformRetVal);
+ KEYWORD(uniqueRetVal);
+ KEYWORD(virtualConstProp);
+ KEYWORD(info);
+ KEYWORD(byte);
+ KEYWORD(bit);
+
#undef KEYWORD
// Keywords for types.
diff --git a/contrib/llvm/lib/AsmParser/LLLexer.h b/contrib/llvm/lib/AsmParser/LLLexer.h
index 90bf17d7a747..21deb6e08910 100644
--- a/contrib/llvm/lib/AsmParser/LLLexer.h
+++ b/contrib/llvm/lib/AsmParser/LLLexer.h
@@ -42,6 +42,10 @@ namespace llvm {
APFloat APFloatVal;
APSInt APSIntVal;
+ // When false (default), an identifier ending in ':' is a label token.
+ // When true, the ':' is treated as a separate token.
+ bool IgnoreColonInIdentifiers;
+
public:
explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &,
LLVMContext &C);
@@ -59,8 +63,11 @@ namespace llvm {
const APSInt &getAPSIntVal() const { return APSIntVal; }
const APFloat &getAPFloatVal() const { return APFloatVal; }
+ void setIgnoreColonInIdentifiers(bool val) {
+ IgnoreColonInIdentifiers = val;
+ }
- bool Error(LocTy L, const Twine &Msg) const;
+ bool Error(LocTy ErrorLoc, const Twine &Msg) const;
bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); }
void Warning(LocTy WarningLoc, const Twine &Msg) const;
@@ -81,15 +88,17 @@ namespace llvm {
lltok::Kind LexDollar();
lltok::Kind LexExclaim();
lltok::Kind LexPercent();
+ lltok::Kind LexUIntID(lltok::Kind Token);
lltok::Kind LexVar(lltok::Kind Var, lltok::Kind VarID);
lltok::Kind LexQuote();
lltok::Kind Lex0x();
lltok::Kind LexHash();
+ lltok::Kind LexCaret();
uint64_t atoull(const char *Buffer, const char *End);
uint64_t HexIntToVal(const char *Buffer, const char *End);
void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
- void FP80HexToIntPair(const char *Buff, const char *End, uint64_t Pair[2]);
+ void FP80HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/AsmParser/LLParser.cpp b/contrib/llvm/lib/AsmParser/LLParser.cpp
index c3ab95550e03..599b59bf61e8 100644
--- a/contrib/llvm/lib/AsmParser/LLParser.cpp
+++ b/contrib/llvm/lib/AsmParser/LLParser.cpp
@@ -71,8 +71,8 @@ bool LLParser::Run() {
Lex.getLoc(),
"Can't read textual IR with a Context that discards named Values");
- return ParseTopLevelEntities() ||
- ValidateEndOfModule();
+ return ParseTopLevelEntities() || ValidateEndOfModule() ||
+ ValidateEndOfIndex();
}
bool LLParser::parseStandaloneConstantValue(Constant *&C,
@@ -120,6 +120,8 @@ void LLParser::restoreParsingState(const SlotMapping *Slots) {
/// ValidateEndOfModule - Do final validity and sanity checks at the end of the
/// module.
bool LLParser::ValidateEndOfModule() {
+ if (!M)
+ return false;
// Handle any function attribute group forward references.
for (const auto &RAG : ForwardRefAttrGroups) {
Value *V = RAG.first;
@@ -258,11 +260,54 @@ bool LLParser::ValidateEndOfModule() {
return false;
}
+/// Do final validity and sanity checks at the end of the index.
+bool LLParser::ValidateEndOfIndex() {
+ if (!Index)
+ return false;
+
+ if (!ForwardRefValueInfos.empty())
+ return Error(ForwardRefValueInfos.begin()->second.front().second,
+ "use of undefined summary '^" +
+ Twine(ForwardRefValueInfos.begin()->first) + "'");
+
+ if (!ForwardRefAliasees.empty())
+ return Error(ForwardRefAliasees.begin()->second.front().second,
+ "use of undefined summary '^" +
+ Twine(ForwardRefAliasees.begin()->first) + "'");
+
+ if (!ForwardRefTypeIds.empty())
+ return Error(ForwardRefTypeIds.begin()->second.front().second,
+ "use of undefined type id summary '^" +
+ Twine(ForwardRefTypeIds.begin()->first) + "'");
+
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// Top-Level Entities
//===----------------------------------------------------------------------===//
bool LLParser::ParseTopLevelEntities() {
+ // If there is no Module, then parse just the summary index entries.
+ if (!M) {
+ while (true) {
+ switch (Lex.getKind()) {
+ case lltok::Eof:
+ return false;
+ case lltok::SummaryID:
+ if (ParseSummaryEntry())
+ return true;
+ break;
+ case lltok::kw_source_filename:
+ if (ParseSourceFileName())
+ return true;
+ break;
+ default:
+ // Skip everything else
+ Lex.Lex();
+ }
+ }
+ }
while (true) {
switch (Lex.getKind()) {
default: return TokError("expected top-level entity");
@@ -282,6 +327,10 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::GlobalVar: if (ParseNamedGlobal()) return true; break;
case lltok::ComdatVar: if (parseComdat()) return true; break;
case lltok::exclaim: if (ParseStandaloneMetadata()) return true; break;
+ case lltok::SummaryID:
+ if (ParseSummaryEntry())
+ return true;
+ break;
case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break;
case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break;
case lltok::kw_uselistorder: if (ParseUseListOrder()) return true; break;
@@ -327,7 +376,8 @@ bool LLParser::ParseTargetDefinition() {
if (ParseToken(lltok::equal, "expected '=' after target datalayout") ||
ParseStringConstant(Str))
return true;
- M->setDataLayout(Str);
+ if (DataLayoutStr.empty())
+ M->setDataLayout(Str);
return false;
}
}
@@ -336,12 +386,12 @@ bool LLParser::ParseTargetDefinition() {
/// ::= 'source_filename' '=' STRINGCONSTANT
bool LLParser::ParseSourceFileName() {
assert(Lex.getKind() == lltok::kw_source_filename);
- std::string Str;
Lex.Lex();
if (ParseToken(lltok::equal, "expected '=' after source_filename") ||
- ParseStringConstant(Str))
+ ParseStringConstant(SourceFileName))
return true;
- M->setSourceFileName(Str);
+ if (M)
+ M->setSourceFileName(SourceFileName);
return false;
}
@@ -710,11 +760,87 @@ bool LLParser::ParseStandaloneMetadata() {
return false;
}
+// Skips a single module summary entry.
+bool LLParser::SkipModuleSummaryEntry() {
+ // Each module summary entry consists of a tag for the entry
+ // type, followed by a colon, then the fields surrounded by nested sets of
+ // parentheses. The "tag:" looks like a Label. Once parsing support is
+ // in place we will look for the tokens corresponding to the expected tags.
+ if (Lex.getKind() != lltok::kw_gv && Lex.getKind() != lltok::kw_module &&
+ Lex.getKind() != lltok::kw_typeid)
+ return TokError(
+ "Expected 'gv', 'module', or 'typeid' at the start of summary entry");
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':' at start of summary entry") ||
+ ParseToken(lltok::lparen, "expected '(' at start of summary entry"))
+ return true;
+ // Now walk through the parenthesized entry, until the number of open
+ // parentheses goes back down to 0 (the first '(' was parsed above).
+ unsigned NumOpenParen = 1;
+ do {
+ switch (Lex.getKind()) {
+ case lltok::lparen:
+ NumOpenParen++;
+ break;
+ case lltok::rparen:
+ NumOpenParen--;
+ break;
+ case lltok::Eof:
+ return TokError("found end of file while parsing summary entry");
+ default:
+ // Skip everything in between parentheses.
+ break;
+ }
+ Lex.Lex();
+ } while (NumOpenParen > 0);
+ return false;
+}
+
+/// SummaryEntry
+/// ::= SummaryID '=' GVEntry | ModuleEntry | TypeIdEntry
+bool LLParser::ParseSummaryEntry() {
+ assert(Lex.getKind() == lltok::SummaryID);
+ unsigned SummaryID = Lex.getUIntVal();
+
+ // For summary entries, colons should be treated as distinct tokens,
+ // not an indication of the end of a label token.
+ Lex.setIgnoreColonInIdentifiers(true);
+
+ Lex.Lex();
+ if (ParseToken(lltok::equal, "expected '=' here"))
+ return true;
+
+ // If we don't have an index object, skip the summary entry.
+ if (!Index)
+ return SkipModuleSummaryEntry();
+
+ switch (Lex.getKind()) {
+ case lltok::kw_gv:
+ return ParseGVEntry(SummaryID);
+ case lltok::kw_module:
+ return ParseModuleEntry(SummaryID);
+ case lltok::kw_typeid:
+ return ParseTypeIdEntry(SummaryID);
+ break;
+ default:
+ return Error(Lex.getLoc(), "unexpected summary kind");
+ }
+ Lex.setIgnoreColonInIdentifiers(false);
+ return false;
+}
+
static bool isValidVisibilityForLinkage(unsigned V, unsigned L) {
return !GlobalValue::isLocalLinkage((GlobalValue::LinkageTypes)L) ||
(GlobalValue::VisibilityTypes)V == GlobalValue::DefaultVisibility;
}
+// If there was an explicit dso_local, update GV. In the absence of an explicit
+// dso_local we keep the default value.
+static void maybeSetDSOLocal(bool DSOLocal, GlobalValue &GV) {
+ if (DSOLocal)
+ GV.setDSOLocal(true);
+}
+
/// parseIndirectSymbol:
/// ::= GlobalVar '=' OptionalLinkage OptionalPreemptionSpecifier
/// OptionalVisibility OptionalDLLStorageClass
@@ -749,11 +875,6 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
return Error(NameLoc,
"symbol with local linkage must have default visibility");
- if (DSOLocal && !IsAlias) {
- return Error(NameLoc,
- "dso_local is invalid on ifunc");
- }
-
Type *Ty;
LocTy ExplicitTypeLoc = Lex.getLoc();
if (ParseType(Ty) ||
@@ -826,7 +947,7 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
GA->setVisibility((GlobalValue::VisibilityTypes)Visibility);
GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass);
GA->setUnnamedAddr(UnnamedAddr);
- GA->setDSOLocal(DSOLocal);
+ maybeSetDSOLocal(DSOLocal, *GA);
if (Name.empty())
NumberedVals.push_back(GA.get());
@@ -947,7 +1068,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
GV->setInitializer(Init);
GV->setConstant(IsConstant);
GV->setLinkage((GlobalValue::LinkageTypes)Linkage);
- GV->setDSOLocal(DSOLocal);
+ maybeSetDSOLocal(DSOLocal, *GV);
GV->setVisibility((GlobalValue::VisibilityTypes)Visibility);
GV->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass);
GV->setExternallyInitialized(IsExternallyInitialized);
@@ -1128,8 +1249,11 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break;
case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break;
case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break;
+ case lltok::kw_nocf_check: B.addAttribute(Attribute::NoCfCheck); break;
case lltok::kw_norecurse: B.addAttribute(Attribute::NoRecurse); break;
case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break;
+ case lltok::kw_optforfuzzing:
+ B.addAttribute(Attribute::OptForFuzzing); break;
case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break;
case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break;
case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break;
@@ -1142,6 +1266,8 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_sspstrong:
B.addAttribute(Attribute::StackProtectStrong); break;
case lltok::kw_safestack: B.addAttribute(Attribute::SafeStack); break;
+ case lltok::kw_shadowcallstack:
+ B.addAttribute(Attribute::ShadowCallStack); break;
case lltok::kw_sanitize_address:
B.addAttribute(Attribute::SanitizeAddress); break;
case lltok::kw_sanitize_hwaddress:
@@ -1465,7 +1591,9 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_nonlazybind:
case lltok::kw_noredzone:
case lltok::kw_noreturn:
+ case lltok::kw_nocf_check:
case lltok::kw_nounwind:
+ case lltok::kw_optforfuzzing:
case lltok::kw_optnone:
case lltok::kw_optsize:
case lltok::kw_returns_twice:
@@ -1477,6 +1605,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_sspreq:
case lltok::kw_sspstrong:
case lltok::kw_safestack:
+ case lltok::kw_shadowcallstack:
case lltok::kw_strictfp:
case lltok::kw_uwtable:
HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
@@ -1558,7 +1687,9 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_nonlazybind:
case lltok::kw_noredzone:
case lltok::kw_noreturn:
+ case lltok::kw_nocf_check:
case lltok::kw_nounwind:
+ case lltok::kw_optforfuzzing:
case lltok::kw_optnone:
case lltok::kw_optsize:
case lltok::kw_returns_twice:
@@ -1570,6 +1701,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_sspreq:
case lltok::kw_sspstrong:
case lltok::kw_safestack:
+ case lltok::kw_shadowcallstack:
case lltok::kw_strictfp:
case lltok::kw_uwtable:
HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
@@ -2609,11 +2741,24 @@ bool LLParser::PerFunctionState::FinishFunction() {
return false;
}
+static bool isValidVariableType(Module *M, Type *Ty, Value *Val, bool IsCall) {
+ if (Val->getType() == Ty)
+ return true;
+ // For calls we also accept variables in the program address space
+ if (IsCall && isa<PointerType>(Ty)) {
+ Type *TyInProgAS = cast<PointerType>(Ty)->getElementType()->getPointerTo(
+ M->getDataLayout().getProgramAddressSpace());
+ if (Val->getType() == TyInProgAS)
+ return true;
+ }
+ return false;
+}
+
/// GetVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
Value *LLParser::PerFunctionState::GetVal(const std::string &Name, Type *Ty,
- LocTy Loc) {
+ LocTy Loc, bool IsCall) {
// Look this name up in the normal function symbol table.
Value *Val = F.getValueSymbolTable()->lookup(Name);
@@ -2627,7 +2772,8 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name, Type *Ty,
// If we have the value in the symbol table or fwd-ref table, return it.
if (Val) {
- if (Val->getType() == Ty) return Val;
+ if (isValidVariableType(P.M, Ty, Val, IsCall))
+ return Val;
if (Ty->isLabelTy())
P.Error(Loc, "'%" + Name + "' is not a basic block");
else
@@ -2654,7 +2800,8 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name, Type *Ty,
return FwdVal;
}
-Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, LocTy Loc) {
+Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, LocTy Loc,
+ bool IsCall) {
// Look this name up in the normal function symbol table.
Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : nullptr;
@@ -2668,7 +2815,8 @@ Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, LocTy Loc) {
// If we have the value in the symbol table or fwd-ref table, return it.
if (Val) {
- if (Val->getType() == Ty) return Val;
+ if (isValidVariableType(P.M, Ty, Val, IsCall))
+ return Val;
if (Ty->isLabelTy())
P.Error(Loc, "'%" + Twine(ID) + "' is not a basic block");
else
@@ -2759,13 +2907,13 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
/// forward reference record if needed.
BasicBlock *LLParser::PerFunctionState::GetBB(const std::string &Name,
LocTy Loc) {
- return dyn_cast_or_null<BasicBlock>(GetVal(Name,
- Type::getLabelTy(F.getContext()), Loc));
+ return dyn_cast_or_null<BasicBlock>(
+ GetVal(Name, Type::getLabelTy(F.getContext()), Loc, /*IsCall=*/false));
}
BasicBlock *LLParser::PerFunctionState::GetBB(unsigned ID, LocTy Loc) {
- return dyn_cast_or_null<BasicBlock>(GetVal(ID,
- Type::getLabelTy(F.getContext()), Loc));
+ return dyn_cast_or_null<BasicBlock>(
+ GetVal(ID, Type::getLabelTy(F.getContext()), Loc, /*IsCall=*/false));
}
/// DefineBB - Define the specified basic block, which is either named or
@@ -3384,7 +3532,7 @@ bool LLParser::ParseGlobalValue(Type *Ty, Constant *&C) {
ValID ID;
Value *V = nullptr;
bool Parsed = ParseValID(ID) ||
- ConvertValIDToValue(Ty, ID, V, nullptr);
+ ConvertValIDToValue(Ty, ID, V, nullptr, /*IsCall=*/false);
if (V && !(C = dyn_cast<Constant>(V)))
return Error(ID.Loc, "global values must be constants");
return Parsed;
@@ -3490,6 +3638,39 @@ template <class FieldTy> struct MDFieldImpl {
: Val(std::move(Default)), Seen(false) {}
};
+/// Structure to represent an optional metadata field that
+/// can be of either type (A or B) and encapsulates the
+/// MD<typeofA>Field and MD<typeofB>Field structs, so as not
+/// to reimplement the specifics for representing each Field.
+template <class FieldTypeA, class FieldTypeB> struct MDEitherFieldImpl {
+ typedef MDEitherFieldImpl<FieldTypeA, FieldTypeB> ImplTy;
+ FieldTypeA A;
+ FieldTypeB B;
+ bool Seen;
+
+ enum {
+ IsInvalid = 0,
+ IsTypeA = 1,
+ IsTypeB = 2
+ } WhatIs;
+
+ void assign(FieldTypeA A) {
+ Seen = true;
+ this->A = std::move(A);
+ WhatIs = IsTypeA;
+ }
+
+ void assign(FieldTypeB B) {
+ Seen = true;
+ this->B = std::move(B);
+ WhatIs = IsTypeB;
+ }
+
+ explicit MDEitherFieldImpl(FieldTypeA DefaultA, FieldTypeB DefaultB)
+ : A(std::move(DefaultA)), B(std::move(DefaultB)), Seen(false),
+ WhatIs(IsInvalid) {}
+};
+
struct MDUnsignedField : public MDFieldImpl<uint64_t> {
uint64_t Max;
@@ -3576,10 +3757,45 @@ struct MDFieldList : public MDFieldImpl<SmallVector<Metadata *, 4>> {
};
struct ChecksumKindField : public MDFieldImpl<DIFile::ChecksumKind> {
- ChecksumKindField() : ImplTy(DIFile::CSK_None) {}
ChecksumKindField(DIFile::ChecksumKind CSKind) : ImplTy(CSKind) {}
};
+struct MDSignedOrMDField : MDEitherFieldImpl<MDSignedField, MDField> {
+ MDSignedOrMDField(int64_t Default = 0, bool AllowNull = true)
+ : ImplTy(MDSignedField(Default), MDField(AllowNull)) {}
+
+ MDSignedOrMDField(int64_t Default, int64_t Min, int64_t Max,
+ bool AllowNull = true)
+ : ImplTy(MDSignedField(Default, Min, Max), MDField(AllowNull)) {}
+
+ bool isMDSignedField() const { return WhatIs == IsTypeA; }
+ bool isMDField() const { return WhatIs == IsTypeB; }
+ int64_t getMDSignedValue() const {
+ assert(isMDSignedField() && "Wrong field type");
+ return A.Val;
+ }
+ Metadata *getMDFieldValue() const {
+ assert(isMDField() && "Wrong field type");
+ return B.Val;
+ }
+};
+
+struct MDSignedOrUnsignedField
+ : MDEitherFieldImpl<MDSignedField, MDUnsignedField> {
+ MDSignedOrUnsignedField() : ImplTy(MDSignedField(0), MDUnsignedField(0)) {}
+
+ bool isMDSignedField() const { return WhatIs == IsTypeA; }
+ bool isMDUnsignedField() const { return WhatIs == IsTypeB; }
+ int64_t getMDSignedValue() const {
+ assert(isMDSignedField() && "Wrong field type");
+ return A.Val;
+ }
+ uint64_t getMDUnsignedValue() const {
+ assert(isMDUnsignedField() && "Wrong field type");
+ return B.Val;
+ }
+};
+
} // end anonymous namespace
namespace llvm {
@@ -3834,6 +4050,50 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDField &Result) {
}
template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
+ MDSignedOrMDField &Result) {
+ // Try to parse a signed int.
+ if (Lex.getKind() == lltok::APSInt) {
+ MDSignedField Res = Result.A;
+ if (!ParseMDField(Loc, Name, Res)) {
+ Result.assign(Res);
+ return false;
+ }
+ return true;
+ }
+
+ // Otherwise, try to parse as an MDField.
+ MDField Res = Result.B;
+ if (!ParseMDField(Loc, Name, Res)) {
+ Result.assign(Res);
+ return false;
+ }
+
+ return true;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
+ MDSignedOrUnsignedField &Result) {
+ if (Lex.getKind() != lltok::APSInt)
+ return false;
+
+ if (Lex.getAPSIntVal().isSigned()) {
+ MDSignedField Res = Result.A;
+ if (ParseMDField(Loc, Name, Res))
+ return true;
+ Result.assign(Res);
+ return false;
+ }
+
+ MDUnsignedField Res = Result.B;
+ if (ParseMDField(Loc, Name, Res))
+ return true;
+ Result.assign(Res);
+ return false;
+}
+
+template <>
bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDStringField &Result) {
LocTy ValueLoc = Lex.getLoc();
std::string S;
@@ -3860,13 +4120,14 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDFieldList &Result) {
template <>
bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
ChecksumKindField &Result) {
- if (Lex.getKind() != lltok::ChecksumKind)
+ Optional<DIFile::ChecksumKind> CSKind =
+ DIFile::getChecksumKind(Lex.getStrVal());
+
+ if (Lex.getKind() != lltok::ChecksumKind || !CSKind)
return TokError(
"invalid checksum kind" + Twine(" '") + Lex.getStrVal() + "'");
- DIFile::ChecksumKind CSKind = DIFile::getChecksumKind(Lex.getStrVal());
-
- Result.assign(CSKind);
+ Result.assign(*CSKind);
Lex.Lex();
return false;
}
@@ -3977,27 +4238,45 @@ bool LLParser::ParseGenericDINode(MDNode *&Result, bool IsDistinct) {
/// ParseDISubrange:
/// ::= !DISubrange(count: 30, lowerBound: 2)
+/// ::= !DISubrange(count: !node, lowerBound: 2)
bool LLParser::ParseDISubrange(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
- REQUIRED(count, MDSignedField, (-1, -1, INT64_MAX)); \
+ REQUIRED(count, MDSignedOrMDField, (-1, -1, INT64_MAX, false)); \
OPTIONAL(lowerBound, MDSignedField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(DISubrange, (Context, count.Val, lowerBound.Val));
+ if (count.isMDSignedField())
+ Result = GET_OR_DISTINCT(
+ DISubrange, (Context, count.getMDSignedValue(), lowerBound.Val));
+ else if (count.isMDField())
+ Result = GET_OR_DISTINCT(
+ DISubrange, (Context, count.getMDFieldValue(), lowerBound.Val));
+ else
+ return true;
+
return false;
}
/// ParseDIEnumerator:
-/// ::= !DIEnumerator(value: 30, name: "SomeKind")
+/// ::= !DIEnumerator(value: 30, isUnsigned: true, name: "SomeKind")
bool LLParser::ParseDIEnumerator(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(name, MDStringField, ); \
- REQUIRED(value, MDSignedField, );
+ REQUIRED(value, MDSignedOrUnsignedField, ); \
+ OPTIONAL(isUnsigned, MDBoolField, (false));
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(DIEnumerator, (Context, value.Val, name.Val));
+ if (isUnsigned.Val && value.isMDSignedField())
+ return TokError("unsigned enumerator with negative value");
+
+ int64_t Value = value.isMDSignedField()
+ ? value.getMDSignedValue()
+ : static_cast<int64_t>(value.getMDUnsignedValue());
+ Result =
+ GET_OR_DISTINCT(DIEnumerator, (Context, Value, isUnsigned.Val, name.Val));
+
return false;
}
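Illustrative syntax for the new isUnsigned flag (names and values invented); a negative value combined with isUnsigned: true is rejected by the check above.

    !5 = !DIEnumerator(name: "Huge", value: 18446744073709551615, isUnsigned: true)
    !6 = !DIEnumerator(name: "Neg", value: -1)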
@@ -4068,7 +4347,8 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
OPTIONAL(runtimeLang, DwarfLangField, ); \
OPTIONAL(vtableHolder, MDField, ); \
OPTIONAL(templateParams, MDField, ); \
- OPTIONAL(identifier, MDStringField, );
+ OPTIONAL(identifier, MDStringField, ); \
+ OPTIONAL(discriminator, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4078,7 +4358,7 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
Context, *identifier.Val, tag.Val, name.Val, file.Val, line.Val,
scope.Val, baseType.Val, size.Val, align.Val, offset.Val, flags.Val,
elements.Val, runtimeLang.Val, vtableHolder.Val,
- templateParams.Val)) {
+ templateParams.Val, discriminator.Val)) {
Result = CT;
return false;
}
@@ -4089,7 +4369,8 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
DICompositeType,
(Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, baseType.Val,
size.Val, align.Val, offset.Val, flags.Val, elements.Val,
- runtimeLang.Val, vtableHolder.Val, templateParams.Val, identifier.Val));
+ runtimeLang.Val, vtableHolder.Val, templateParams.Val, identifier.Val,
+ discriminator.Val));
return false;
}
@@ -4107,20 +4388,34 @@ bool LLParser::ParseDISubroutineType(MDNode *&Result, bool IsDistinct) {
}
/// ParseDIFile:
-/// ::= !DIFileType(filename: "path/to/file", directory: "/path/to/dir"
+/// ::= !DIFile(filename: "path/to/file", directory: "/path/to/dir",
/// checksumkind: CSK_MD5,
-/// checksum: "000102030405060708090a0b0c0d0e0f")
+/// checksum: "000102030405060708090a0b0c0d0e0f",
+/// source: "source file contents")
bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) {
+ // The default constructed value for checksumkind is required, but will never
+ // be used, as the parser checks if the field was actually Seen before using
+ // the Val.
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(filename, MDStringField, ); \
REQUIRED(directory, MDStringField, ); \
- OPTIONAL(checksumkind, ChecksumKindField, ); \
- OPTIONAL(checksum, MDStringField, );
+ OPTIONAL(checksumkind, ChecksumKindField, (DIFile::CSK_MD5)); \
+ OPTIONAL(checksum, MDStringField, ); \
+ OPTIONAL(source, MDStringField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
+ Optional<DIFile::ChecksumInfo<MDString *>> OptChecksum;
+ if (checksumkind.Seen && checksum.Seen)
+ OptChecksum.emplace(checksumkind.Val, checksum.Val);
+ else if (checksumkind.Seen || checksum.Seen)
+ return Lex.Error("'checksumkind' and 'checksum' must be provided together");
+
+ Optional<MDString *> OptSource;
+ if (source.Seen)
+ OptSource = source.Val;
Result = GET_OR_DISTINCT(DIFile, (Context, filename.Val, directory.Val,
- checksumkind.Val, checksum.Val));
+ OptChecksum, OptSource));
return false;
}
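Sketch of the extended !DIFile forms accepted here; checksumkind and checksum must be given together, and source is optional. File names and contents are made up.

    !1 = !DIFile(filename: "a.c", directory: "/tmp")
    !2 = !DIFile(filename: "b.c", directory: "/tmp", checksumkind: CSK_MD5,
                 checksum: "000102030405060708090a0b0c0d0e0f", source: "int b;\0A")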
@@ -4170,7 +4465,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
/// virtuality: DW_VIRTUALITY_pure_virtual,
/// virtualIndex: 10, thisAdjustment: 4, flags: 11,
/// isOptimized: false, templateParams: !4, declaration: !5,
-/// variables: !6, thrownTypes: !7)
+/// retainedNodes: !6, thrownTypes: !7)
bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
auto Loc = Lex.getLoc();
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
@@ -4192,7 +4487,7 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
OPTIONAL(unit, MDField, ); \
OPTIONAL(templateParams, MDField, ); \
OPTIONAL(declaration, MDField, ); \
- OPTIONAL(variables, MDField, ); \
+ OPTIONAL(retainedNodes, MDField, ); \
OPTIONAL(thrownTypes, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4208,7 +4503,7 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
type.Val, isLocal.Val, isDefinition.Val, scopeLine.Val,
containingType.Val, virtuality.Val, virtualIndex.Val, thisAdjustment.Val,
flags.Val, isOptimized.Val, unit.Val, templateParams.Val,
- declaration.Val, variables.Val, thrownTypes.Val));
+ declaration.Val, retainedNodes.Val, thrownTypes.Val));
return false;
}
@@ -4391,6 +4686,22 @@ bool LLParser::ParseDILocalVariable(MDNode *&Result, bool IsDistinct) {
return false;
}
+/// ParseDILabel:
+/// ::= !DILabel(scope: !0, name: "foo", file: !1, line: 7)
+bool LLParser::ParseDILabel(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
+ REQUIRED(scope, MDField, (/* AllowNull */ false)); \
+ REQUIRED(name, MDStringField, ); \
+ REQUIRED(file, MDField, ); \
+ REQUIRED(line, LineField, );
+ PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+ Result = GET_OR_DISTINCT(DILabel,
+ (Context, scope.Val, name.Val, file.Val, line.Val));
+ return false;
+}
+
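Illustrative use of the new !DILabel node (operand node numbers invented):

    !10 = !DILabel(scope: !4, name: "retry", file: !1, line: 7)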
/// ParseDIExpression:
/// ::= !DIExpression(0, 7, -1)
bool LLParser::ParseDIExpression(MDNode *&Result, bool IsDistinct) {
@@ -4579,18 +4890,18 @@ bool LLParser::ParseMetadata(Metadata *&MD, PerFunctionState *PFS) {
//===----------------------------------------------------------------------===//
bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
- PerFunctionState *PFS) {
+ PerFunctionState *PFS, bool IsCall) {
if (Ty->isFunctionTy())
return Error(ID.Loc, "functions are not values, refer to them as pointers");
switch (ID.Kind) {
case ValID::t_LocalID:
if (!PFS) return Error(ID.Loc, "invalid use of function-local name");
- V = PFS->GetVal(ID.UIntVal, Ty, ID.Loc);
+ V = PFS->GetVal(ID.UIntVal, Ty, ID.Loc, IsCall);
return V == nullptr;
case ValID::t_LocalName:
if (!PFS) return Error(ID.Loc, "invalid use of function-local name");
- V = PFS->GetVal(ID.StrVal, Ty, ID.Loc);
+ V = PFS->GetVal(ID.StrVal, Ty, ID.Loc, IsCall);
return V == nullptr;
case ValID::t_InlineAsm: {
if (!ID.FTy || !InlineAsm::Verify(ID.FTy, ID.StrVal2))
@@ -4706,7 +5017,7 @@ bool LLParser::parseConstantValue(Type *Ty, Constant *&C) {
case ValID::t_ConstantStruct:
case ValID::t_PackedConstantStruct: {
Value *V;
- if (ConvertValIDToValue(Ty, ID, V, /*PFS=*/nullptr))
+ if (ConvertValIDToValue(Ty, ID, V, /*PFS=*/nullptr, /*IsCall=*/false))
return true;
assert(isa<Constant>(V) && "Expected a constant value");
C = cast<Constant>(V);
@@ -4723,7 +5034,8 @@ bool LLParser::parseConstantValue(Type *Ty, Constant *&C) {
bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) {
V = nullptr;
ValID ID;
- return ParseValID(ID, PFS) || ConvertValIDToValue(Ty, ID, V, PFS);
+ return ParseValID(ID, PFS) ||
+ ConvertValIDToValue(Ty, ID, V, PFS, /*IsCall=*/false);
}
bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState *PFS) {
@@ -4923,7 +5235,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
NumberedVals.push_back(Fn);
Fn->setLinkage((GlobalValue::LinkageTypes)Linkage);
- Fn->setDSOLocal(DSOLocal);
+ maybeSetDSOLocal(DSOLocal, *Fn);
Fn->setVisibility((GlobalValue::VisibilityTypes)Visibility);
Fn->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass);
Fn->setCallingConv(CC);
@@ -5476,7 +5788,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
// Look up the callee.
Value *Callee;
- if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS))
+ if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS,
+ /*IsCall=*/true))
return true;
// Set up the Attribute for the function.
@@ -6067,7 +6380,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
// Look up the callee.
Value *Callee;
- if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS))
+ if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS,
+ /*IsCall=*/true))
return true;
// Set up the Attribute for the function.
@@ -6174,14 +6488,7 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
if (Size && !Size->getType()->isIntegerTy())
return Error(SizeLoc, "element count must have integer type");
- const DataLayout &DL = M->getDataLayout();
- unsigned AS = DL.getAllocaAddrSpace();
- if (AS != AddrSpace) {
- // TODO: In the future it should be possible to specify addrspace per-alloca.
- return Error(ASLoc, "address space must match datalayout");
- }
-
- AllocaInst *AI = new AllocaInst(Ty, AS, Size, Alignment);
+ AllocaInst *AI = new AllocaInst(Ty, AddrSpace, Size, Alignment);
AI->setUsedWithInAlloca(IsInAlloca);
AI->setSwiftError(IsSwiftError);
Inst = AI;
@@ -6567,8 +6874,8 @@ bool LLParser::sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes,
if (NumUses < 2)
return Error(Loc, "value only has one use");
if (Order.size() != Indexes.size() || NumUses > Indexes.size())
- return Error(Loc, "wrong number of indexes, expected " +
- Twine(std::distance(V->use_begin(), V->use_end())));
+ return Error(Loc,
+ "wrong number of indexes, expected " + Twine(V->getNumUses()));
V->sortUseList([&](const Use &L, const Use &R) {
return Order.lookup(&L) < Order.lookup(&R);
@@ -6680,3 +6987,1170 @@ bool LLParser::ParseUseListOrderBB() {
return sortUseListOrder(V, Indexes, Loc);
}
+
+/// ModuleEntry
+/// ::= 'module' ':' '(' 'path' ':' STRINGCONSTANT ',' 'hash' ':' Hash ')'
+/// Hash ::= '(' UInt32 ',' UInt32 ',' UInt32 ',' UInt32 ',' UInt32 ')'
+bool LLParser::ParseModuleEntry(unsigned ID) {
+ assert(Lex.getKind() == lltok::kw_module);
+ Lex.Lex();
+
+ std::string Path;
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_path, "expected 'path' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseStringConstant(Path) ||
+ ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_hash, "expected 'hash' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ ModuleHash Hash;
+ if (ParseUInt32(Hash[0]) || ParseToken(lltok::comma, "expected ',' here") ||
+ ParseUInt32(Hash[1]) || ParseToken(lltok::comma, "expected ',' here") ||
+ ParseUInt32(Hash[2]) || ParseToken(lltok::comma, "expected ',' here") ||
+ ParseUInt32(Hash[3]) || ParseToken(lltok::comma, "expected ',' here") ||
+ ParseUInt32(Hash[4]))
+ return true;
+
+ if (ParseToken(lltok::rparen, "expected ')' here") ||
+ ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ auto ModuleEntry = Index->addModule(Path, ID, Hash);
+ ModuleIdMap[ID] = ModuleEntry->first();
+
+ return false;
+}
+
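For illustration, assuming the usual ^N summary-ID form on the left-hand side (ParseSummaryEntry itself is outside this hunk); the path and hash words are invented:

    ^0 = module: (path: "main.o", hash: (2468601609, 1329373163, 1565878005, 638838075, 3148790418))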
+/// TypeIdEntry
+/// ::= 'typeid' ':' '(' 'name' ':' STRINGCONSTANT ',' TypeIdSummary ')'
+bool LLParser::ParseTypeIdEntry(unsigned ID) {
+ assert(Lex.getKind() == lltok::kw_typeid);
+ Lex.Lex();
+
+ std::string Name;
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_name, "expected 'name' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseStringConstant(Name))
+ return true;
+
+ TypeIdSummary &TIS = Index->getOrInsertTypeIdSummary(Name);
+ if (ParseToken(lltok::comma, "expected ',' here") ||
+ ParseTypeIdSummary(TIS) || ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ // Check if this ID was forward referenced, and if so, update the
+ // corresponding GUIDs.
+ auto FwdRefTIDs = ForwardRefTypeIds.find(ID);
+ if (FwdRefTIDs != ForwardRefTypeIds.end()) {
+ for (auto TIDRef : FwdRefTIDs->second) {
+ assert(!*TIDRef.first &&
+ "Forward referenced type id GUID expected to be 0");
+ *TIDRef.first = GlobalValue::getGUID(Name);
+ }
+ ForwardRefTypeIds.erase(FwdRefTIDs);
+ }
+
+ return false;
+}
+
+/// TypeIdSummary
+/// ::= 'summary' ':' '(' TypeTestResolution [',' OptionalWpdResolutions]? ')'
+bool LLParser::ParseTypeIdSummary(TypeIdSummary &TIS) {
+ if (ParseToken(lltok::kw_summary, "expected 'summary' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseTypeTestResolution(TIS.TTRes))
+ return true;
+
+ if (EatIfPresent(lltok::comma)) {
+ // Expect optional wpdResolutions field
+ if (ParseOptionalWpdResolutions(TIS.WPDRes))
+ return true;
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
+/// TypeTestResolution
+/// ::= 'typeTestRes' ':' '(' 'kind' ':'
+/// ( 'unsat' | 'byteArray' | 'inline' | 'single' | 'allOnes' ) ','
+/// 'sizeM1BitWidth' ':' SizeM1BitWidth [',' 'alignLog2' ':' UInt64]?
+/// [',' 'sizeM1' ':' UInt64]? [',' 'bitMask' ':' UInt8]?
+/// [',' 'inlineBits' ':' UInt64]? ')'
+bool LLParser::ParseTypeTestResolution(TypeTestResolution &TTRes) {
+ if (ParseToken(lltok::kw_typeTestRes, "expected 'typeTestRes' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_kind, "expected 'kind' here") ||
+ ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+
+ switch (Lex.getKind()) {
+ case lltok::kw_unsat:
+ TTRes.TheKind = TypeTestResolution::Unsat;
+ break;
+ case lltok::kw_byteArray:
+ TTRes.TheKind = TypeTestResolution::ByteArray;
+ break;
+ case lltok::kw_inline:
+ TTRes.TheKind = TypeTestResolution::Inline;
+ break;
+ case lltok::kw_single:
+ TTRes.TheKind = TypeTestResolution::Single;
+ break;
+ case lltok::kw_allOnes:
+ TTRes.TheKind = TypeTestResolution::AllOnes;
+ break;
+ default:
+ return Error(Lex.getLoc(), "unexpected TypeTestResolution kind");
+ }
+ Lex.Lex();
+
+ if (ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_sizeM1BitWidth, "expected 'sizeM1BitWidth' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseUInt32(TTRes.SizeM1BitWidth))
+ return true;
+
+ // Parse optional fields
+ while (EatIfPresent(lltok::comma)) {
+ switch (Lex.getKind()) {
+ case lltok::kw_alignLog2:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") ||
+ ParseUInt64(TTRes.AlignLog2))
+ return true;
+ break;
+ case lltok::kw_sizeM1:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") || ParseUInt64(TTRes.SizeM1))
+ return true;
+ break;
+ case lltok::kw_bitMask: {
+ unsigned Val;
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") || ParseUInt32(Val))
+ return true;
+ assert(Val <= 0xff);
+ TTRes.BitMask = (uint8_t)Val;
+ break;
+ }
+ case lltok::kw_inlineBits:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") ||
+ ParseUInt64(TTRes.InlineBits))
+ return true;
+ break;
+ default:
+ return Error(Lex.getLoc(), "expected optional TypeTestResolution field");
+ }
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
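A hypothetical type-id entry exercising the grammar above (the name, sizes, and the ^4 numbering are invented):

    ^4 = typeid: (name: "_ZTS1A",
                  summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7, alignLog2: 3, sizeM1: 16)))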
+/// OptionalWpdResolutions
+/// ::= 'wpdResolutions' ':' '(' WpdResolution [',' WpdResolution]* ')'
+/// WpdResolution ::= '(' 'offset' ':' UInt64 ',' WpdRes ')'
+bool LLParser::ParseOptionalWpdResolutions(
+ std::map<uint64_t, WholeProgramDevirtResolution> &WPDResMap) {
+ if (ParseToken(lltok::kw_wpdResolutions, "expected 'wpdResolutions' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ do {
+ uint64_t Offset;
+ WholeProgramDevirtResolution WPDRes;
+ if (ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_offset, "expected 'offset' here") ||
+ ParseToken(lltok::colon, "expected ':' here") || ParseUInt64(Offset) ||
+ ParseToken(lltok::comma, "expected ',' here") || ParseWpdRes(WPDRes) ||
+ ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+ WPDResMap[Offset] = WPDRes;
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
+/// WpdRes
+/// ::= 'wpdRes' ':' '(' 'kind' ':' 'indir'
+/// [',' OptionalResByArg]? ')'
+/// ::= 'wpdRes' ':' '(' 'kind' ':' 'singleImpl'
+/// ',' 'singleImplName' ':' STRINGCONSTANT ','
+/// [',' OptionalResByArg]? ')'
+/// ::= 'wpdRes' ':' '(' 'kind' ':' 'branchFunnel'
+/// [',' OptionalResByArg]? ')'
+bool LLParser::ParseWpdRes(WholeProgramDevirtResolution &WPDRes) {
+ if (ParseToken(lltok::kw_wpdRes, "expected 'wpdRes' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_kind, "expected 'kind' here") ||
+ ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+
+ switch (Lex.getKind()) {
+ case lltok::kw_indir:
+ WPDRes.TheKind = WholeProgramDevirtResolution::Indir;
+ break;
+ case lltok::kw_singleImpl:
+ WPDRes.TheKind = WholeProgramDevirtResolution::SingleImpl;
+ break;
+ case lltok::kw_branchFunnel:
+ WPDRes.TheKind = WholeProgramDevirtResolution::BranchFunnel;
+ break;
+ default:
+ return Error(Lex.getLoc(), "unexpected WholeProgramDevirtResolution kind");
+ }
+ Lex.Lex();
+
+ // Parse optional fields
+ while (EatIfPresent(lltok::comma)) {
+ switch (Lex.getKind()) {
+ case lltok::kw_singleImplName:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseStringConstant(WPDRes.SingleImplName))
+ return true;
+ break;
+ case lltok::kw_resByArg:
+ if (ParseOptionalResByArg(WPDRes.ResByArg))
+ return true;
+ break;
+ default:
+ return Error(Lex.getLoc(),
+ "expected optional WholeProgramDevirtResolution field");
+ }
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
+/// OptionalResByArg
+/// ::= 'resByArg' ':' '(' ResByArg[, ResByArg]* ')'
+/// ResByArg ::= Args ',' 'byArg' ':' '(' 'kind' ':'
+/// ( 'indir' | 'uniformRetVal' | 'uniqueRetVal' |
+/// 'virtualConstProp' )
+/// [',' 'info' ':' UInt64]? [',' 'byte' ':' UInt32]?
+/// [',' 'bit' ':' UInt32]? ')'
+bool LLParser::ParseOptionalResByArg(
+ std::map<std::vector<uint64_t>, WholeProgramDevirtResolution::ByArg>
+ &ResByArg) {
+ if (ParseToken(lltok::kw_resByArg, "expected 'resByArg' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ do {
+ std::vector<uint64_t> Args;
+ if (ParseArgs(Args) || ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_byArg, "expected 'byArg here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_kind, "expected 'kind' here") ||
+ ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+
+ WholeProgramDevirtResolution::ByArg ByArg;
+ switch (Lex.getKind()) {
+ case lltok::kw_indir:
+ ByArg.TheKind = WholeProgramDevirtResolution::ByArg::Indir;
+ break;
+ case lltok::kw_uniformRetVal:
+ ByArg.TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
+ break;
+ case lltok::kw_uniqueRetVal:
+ ByArg.TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
+ break;
+ case lltok::kw_virtualConstProp:
+ ByArg.TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
+ break;
+ default:
+ return Error(Lex.getLoc(),
+ "unexpected WholeProgramDevirtResolution::ByArg kind");
+ }
+ Lex.Lex();
+
+ // Parse optional fields
+ while (EatIfPresent(lltok::comma)) {
+ switch (Lex.getKind()) {
+ case lltok::kw_info:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseUInt64(ByArg.Info))
+ return true;
+ break;
+ case lltok::kw_byte:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseUInt32(ByArg.Byte))
+ return true;
+ break;
+ case lltok::kw_bit:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseUInt32(ByArg.Bit))
+ return true;
+ break;
+ default:
+ return Error(Lex.getLoc(),
+ "expected optional whole program devirt field");
+ }
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ ResByArg[Args] = ByArg;
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
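Sketch of whole-program devirtualization resolutions inside a typeid summary; the symbol names, offsets, and argument values are invented:

    ^5 = typeid: (name: "_ZTS1B",
                  summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0),
                            wpdResolutions: ((offset: 0, wpdRes: (kind: singleImpl, singleImplName: "_ZN1B1fEi")),
                                             (offset: 8, wpdRes: (kind: indir,
                                                                  resByArg: (args: (1), byArg: (kind: uniformRetVal, info: 42)))))))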
+/// Args
+/// ::= 'args' ':' '(' UInt64[, UInt64]* ')'
+bool LLParser::ParseArgs(std::vector<uint64_t> &Args) {
+ if (ParseToken(lltok::kw_args, "expected 'args' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ do {
+ uint64_t Val;
+ if (ParseUInt64(Val))
+ return true;
+ Args.push_back(Val);
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
+static ValueInfo EmptyVI =
+ ValueInfo(false, (GlobalValueSummaryMapTy::value_type *)-8);
+
+/// Stores the given Name/GUID and associated summary into the Index.
+/// Also updates any forward references to the associated entry ID.
+void LLParser::AddGlobalValueToIndex(
+ std::string Name, GlobalValue::GUID GUID, GlobalValue::LinkageTypes Linkage,
+ unsigned ID, std::unique_ptr<GlobalValueSummary> Summary) {
+ // First create the ValueInfo utilizing the Name or GUID.
+ ValueInfo VI;
+ if (GUID != 0) {
+ assert(Name.empty());
+ VI = Index->getOrInsertValueInfo(GUID);
+ } else {
+ assert(!Name.empty());
+ if (M) {
+ auto *GV = M->getNamedValue(Name);
+ assert(GV);
+ VI = Index->getOrInsertValueInfo(GV);
+ } else {
+ assert(
+ (!GlobalValue::isLocalLinkage(Linkage) || !SourceFileName.empty()) &&
+ "Need a source_filename to compute GUID for local");
+ GUID = GlobalValue::getGUID(
+ GlobalValue::getGlobalIdentifier(Name, Linkage, SourceFileName));
+ VI = Index->getOrInsertValueInfo(GUID, Index->saveString(Name));
+ }
+ }
+
+ // Add the summary if one was provided.
+ if (Summary)
+ Index->addGlobalValueSummary(VI, std::move(Summary));
+
+ // Resolve forward references from calls/refs
+ auto FwdRefVIs = ForwardRefValueInfos.find(ID);
+ if (FwdRefVIs != ForwardRefValueInfos.end()) {
+ for (auto VIRef : FwdRefVIs->second) {
+ assert(*VIRef.first == EmptyVI &&
+ "Forward referenced ValueInfo expected to be empty");
+ *VIRef.first = VI;
+ }
+ ForwardRefValueInfos.erase(FwdRefVIs);
+ }
+
+ // Resolve forward references from aliases
+ auto FwdRefAliasees = ForwardRefAliasees.find(ID);
+ if (FwdRefAliasees != ForwardRefAliasees.end()) {
+ for (auto AliaseeRef : FwdRefAliasees->second) {
+ assert(!AliaseeRef.first->hasAliasee() &&
+ "Forward referencing alias already has aliasee");
+ AliaseeRef.first->setAliasee(VI.getSummaryList().front().get());
+ }
+ ForwardRefAliasees.erase(FwdRefAliasees);
+ }
+
+ // Save the associated ValueInfo for use in later references by ID.
+ if (ID == NumberedValueInfos.size())
+ NumberedValueInfos.push_back(VI);
+ else {
+ // Handle non-contiguous numbers (to make test simplification easier).
+ if (ID > NumberedValueInfos.size())
+ NumberedValueInfos.resize(ID + 1);
+ NumberedValueInfos[ID] = VI;
+ }
+}
+
+/// ParseGVEntry
+/// ::= 'gv' ':' '(' ('name' ':' STRINGCONSTANT | 'guid' ':' UInt64)
+/// [',' 'summaries' ':' Summary[',' Summary]* ]? ')'
+/// Summary ::= '(' (FunctionSummary | VariableSummary | AliasSummary) ')'
+bool LLParser::ParseGVEntry(unsigned ID) {
+ assert(Lex.getKind() == lltok::kw_gv);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ std::string Name;
+ GlobalValue::GUID GUID = 0;
+ switch (Lex.getKind()) {
+ case lltok::kw_name:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseStringConstant(Name))
+ return true;
+ // Can't create GUID/ValueInfo until we have the linkage.
+ break;
+ case lltok::kw_guid:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':' here") || ParseUInt64(GUID))
+ return true;
+ break;
+ default:
+ return Error(Lex.getLoc(), "expected name or guid tag");
+ }
+
+ if (!EatIfPresent(lltok::comma)) {
+ // No summaries. Wrap up.
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+ // This was created for a call to an external or indirect target.
+ // A GUID with no summary came from a VALUE_GUID record, or is a dummy GUID
+ // created for indirect calls with value profiling. A Name with no GUID came from
+ // an external definition. We pass ExternalLinkage since that is only
+ // used when the GUID must be computed from Name, and in that case
+ // the symbol must have external linkage.
+ AddGlobalValueToIndex(Name, GUID, GlobalValue::ExternalLinkage, ID,
+ nullptr);
+ return false;
+ }
+
+ // Have a list of summaries
+ if (ParseToken(lltok::kw_summaries, "expected 'summaries' here") ||
+ ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+
+ do {
+ if (ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+ switch (Lex.getKind()) {
+ case lltok::kw_function:
+ if (ParseFunctionSummary(Name, GUID, ID))
+ return true;
+ break;
+ case lltok::kw_variable:
+ if (ParseVariableSummary(Name, GUID, ID))
+ return true;
+ break;
+ case lltok::kw_alias:
+ if (ParseAliasSummary(Name, GUID, ID))
+ return true;
+ break;
+ default:
+ return Error(Lex.getLoc(), "expected summary type");
+ }
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
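Illustrative gv entries that carry no summary list, as emitted for external or indirect-call targets (the name and GUID are invented):

    ^1 = gv: (name: "printf")
    ^2 = gv: (guid: 6583049656999157017)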
+/// FunctionSummary
+/// ::= 'function' ':' '(' 'module' ':' ModuleReference ',' GVFlags
+/// ',' 'insts' ':' UInt32 [',' OptionalFFlags]? [',' OptionalCalls]?
+/// [',' OptionalTypeIdInfo]? [',' OptionalRefs]? ')'
+bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
+ unsigned ID) {
+ assert(Lex.getKind() == lltok::kw_function);
+ Lex.Lex();
+
+ StringRef ModulePath;
+ GlobalValueSummary::GVFlags GVFlags = GlobalValueSummary::GVFlags(
+ /*Linkage=*/GlobalValue::ExternalLinkage, /*NotEligibleToImport=*/false,
+ /*Live=*/false, /*IsLocal=*/false);
+ unsigned InstCount;
+ std::vector<FunctionSummary::EdgeTy> Calls;
+ FunctionSummary::TypeIdInfo TypeIdInfo;
+ std::vector<ValueInfo> Refs;
+ // Default is all-zeros (conservative values).
+ FunctionSummary::FFlags FFlags = {};
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseModuleReference(ModulePath) ||
+ ParseToken(lltok::comma, "expected ',' here") || ParseGVFlags(GVFlags) ||
+ ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_insts, "expected 'insts' here") ||
+ ParseToken(lltok::colon, "expected ':' here") || ParseUInt32(InstCount))
+ return true;
+
+ // Parse optional fields
+ while (EatIfPresent(lltok::comma)) {
+ switch (Lex.getKind()) {
+ case lltok::kw_funcFlags:
+ if (ParseOptionalFFlags(FFlags))
+ return true;
+ break;
+ case lltok::kw_calls:
+ if (ParseOptionalCalls(Calls))
+ return true;
+ break;
+ case lltok::kw_typeIdInfo:
+ if (ParseOptionalTypeIdInfo(TypeIdInfo))
+ return true;
+ break;
+ case lltok::kw_refs:
+ if (ParseOptionalRefs(Refs))
+ return true;
+ break;
+ default:
+ return Error(Lex.getLoc(), "expected optional function summary field");
+ }
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ auto FS = llvm::make_unique<FunctionSummary>(
+ GVFlags, InstCount, FFlags, std::move(Refs), std::move(Calls),
+ std::move(TypeIdInfo.TypeTests),
+ std::move(TypeIdInfo.TypeTestAssumeVCalls),
+ std::move(TypeIdInfo.TypeCheckedLoadVCalls),
+ std::move(TypeIdInfo.TypeTestAssumeConstVCalls),
+ std::move(TypeIdInfo.TypeCheckedLoadConstVCalls));
+
+ FS->setModulePath(ModulePath);
+
+ AddGlobalValueToIndex(Name, GUID, (GlobalValue::LinkageTypes)GVFlags.Linkage,
+ ID, std::move(FS));
+
+ return false;
+}
+
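A hypothetical function summary combining the pieces parsed above; ^0 and ^1 refer to the module and gv entries sketched earlier, and all counts and flags are invented:

    ^3 = gv: (name: "main", summaries: (function: (module: ^0,
              flags: (linkage: external, notEligibleToImport: 0, live: 1, dsoLocal: 0),
              insts: 5, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0),
              calls: ((callee: ^1, hotness: hot)))))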
+/// VariableSummary
+/// ::= 'variable' ':' '(' 'module' ':' ModuleReference ',' GVFlags
+/// [',' OptionalRefs]? ')'
+bool LLParser::ParseVariableSummary(std::string Name, GlobalValue::GUID GUID,
+ unsigned ID) {
+ assert(Lex.getKind() == lltok::kw_variable);
+ Lex.Lex();
+
+ StringRef ModulePath;
+ GlobalValueSummary::GVFlags GVFlags = GlobalValueSummary::GVFlags(
+ /*Linkage=*/GlobalValue::ExternalLinkage, /*NotEligibleToImport=*/false,
+ /*Live=*/false, /*IsLocal=*/false);
+ std::vector<ValueInfo> Refs;
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseModuleReference(ModulePath) ||
+ ParseToken(lltok::comma, "expected ',' here") || ParseGVFlags(GVFlags))
+ return true;
+
+ // Parse optional refs field
+ if (EatIfPresent(lltok::comma)) {
+ if (ParseOptionalRefs(Refs))
+ return true;
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ auto GS = llvm::make_unique<GlobalVarSummary>(GVFlags, std::move(Refs));
+
+ GS->setModulePath(ModulePath);
+
+ AddGlobalValueToIndex(Name, GUID, (GlobalValue::LinkageTypes)GVFlags.Linkage,
+ ID, std::move(GS));
+
+ return false;
+}
+
+/// AliasSummary
+/// ::= 'alias' ':' '(' 'module' ':' ModuleReference ',' GVFlags ','
+/// 'aliasee' ':' GVReference ')'
+bool LLParser::ParseAliasSummary(std::string Name, GlobalValue::GUID GUID,
+ unsigned ID) {
+ assert(Lex.getKind() == lltok::kw_alias);
+ LocTy Loc = Lex.getLoc();
+ Lex.Lex();
+
+ StringRef ModulePath;
+ GlobalValueSummary::GVFlags GVFlags = GlobalValueSummary::GVFlags(
+ /*Linkage=*/GlobalValue::ExternalLinkage, /*NotEligibleToImport=*/false,
+ /*Live=*/false, /*IsLocal=*/false);
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseModuleReference(ModulePath) ||
+ ParseToken(lltok::comma, "expected ',' here") || ParseGVFlags(GVFlags) ||
+ ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_aliasee, "expected 'aliasee' here") ||
+ ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+
+ ValueInfo AliaseeVI;
+ unsigned GVId;
+ if (ParseGVReference(AliaseeVI, GVId))
+ return true;
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ auto AS = llvm::make_unique<AliasSummary>(GVFlags);
+
+ AS->setModulePath(ModulePath);
+
+ // Record forward reference if the aliasee is not parsed yet.
+ if (AliaseeVI == EmptyVI) {
+ auto FwdRef = ForwardRefAliasees.insert(
+ std::make_pair(GVId, std::vector<std::pair<AliasSummary *, LocTy>>()));
+ FwdRef.first->second.push_back(std::make_pair(AS.get(), Loc));
+ } else
+ AS->setAliasee(AliaseeVI.getSummaryList().front().get());
+
+ AddGlobalValueToIndex(Name, GUID, (GlobalValue::LinkageTypes)GVFlags.Linkage,
+ ID, std::move(AS));
+
+ return false;
+}
+
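Sketches of the corresponding variable and alias entries (names invented; ^7 aliases ^6):

    ^6 = gv: (name: "g", summaries: (variable: (module: ^0,
              flags: (linkage: internal, notEligibleToImport: 0, live: 1, dsoLocal: 1))))
    ^7 = gv: (name: "g_alias", summaries: (alias: (module: ^0,
              flags: (linkage: external, notEligibleToImport: 0, live: 1, dsoLocal: 0), aliasee: ^6)))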
+/// Flag
+/// ::= [0|1]
+bool LLParser::ParseFlag(unsigned &Val) {
+ if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
+ return TokError("expected integer");
+ Val = (unsigned)Lex.getAPSIntVal().getBoolValue();
+ Lex.Lex();
+ return false;
+}
+
+/// OptionalFFlags
+/// := 'funcFlags' ':' '(' ['readNone' ':' Flag]?
+/// [',' 'readOnly' ':' Flag]? [',' 'noRecurse' ':' Flag]?
+/// [',' 'returnDoesNotAlias' ':' Flag]? ')'
+bool LLParser::ParseOptionalFFlags(FunctionSummary::FFlags &FFlags) {
+ assert(Lex.getKind() == lltok::kw_funcFlags);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' in funcFlags") |
+ ParseToken(lltok::lparen, "expected '(' in funcFlags"))
+ return true;
+
+ do {
+ unsigned Val;
+ switch (Lex.getKind()) {
+ case lltok::kw_readNone:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") || ParseFlag(Val))
+ return true;
+ FFlags.ReadNone = Val;
+ break;
+ case lltok::kw_readOnly:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") || ParseFlag(Val))
+ return true;
+ FFlags.ReadOnly = Val;
+ break;
+ case lltok::kw_noRecurse:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") || ParseFlag(Val))
+ return true;
+ FFlags.NoRecurse = Val;
+ break;
+ case lltok::kw_returnDoesNotAlias:
+ Lex.Lex();
+ if (ParseToken(lltok::colon, "expected ':'") || ParseFlag(Val))
+ return true;
+ FFlags.ReturnDoesNotAlias = Val;
+ break;
+ default:
+ return Error(Lex.getLoc(), "expected function flag type");
+ }
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' in funcFlags"))
+ return true;
+
+ return false;
+}
+
+/// OptionalCalls
+/// := 'calls' ':' '(' Call [',' Call]* ')'
+/// Call ::= '(' 'callee' ':' GVReference
+/// [( ',' 'hotness' ':' Hotness | ',' 'relbf' ':' UInt32 )]? ')'
+bool LLParser::ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls) {
+ assert(Lex.getKind() == lltok::kw_calls);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' in calls") |
+ ParseToken(lltok::lparen, "expected '(' in calls"))
+ return true;
+
+ IdToIndexMapType IdToIndexMap;
+ // Parse each call edge
+ do {
+ ValueInfo VI;
+ if (ParseToken(lltok::lparen, "expected '(' in call") ||
+ ParseToken(lltok::kw_callee, "expected 'callee' in call") ||
+ ParseToken(lltok::colon, "expected ':'"))
+ return true;
+
+ LocTy Loc = Lex.getLoc();
+ unsigned GVId;
+ if (ParseGVReference(VI, GVId))
+ return true;
+
+ CalleeInfo::HotnessType Hotness = CalleeInfo::HotnessType::Unknown;
+ unsigned RelBF = 0;
+ if (EatIfPresent(lltok::comma)) {
+ // Expect either hotness or relbf
+ if (EatIfPresent(lltok::kw_hotness)) {
+ if (ParseToken(lltok::colon, "expected ':'") || ParseHotness(Hotness))
+ return true;
+ } else {
+ if (ParseToken(lltok::kw_relbf, "expected relbf") ||
+ ParseToken(lltok::colon, "expected ':'") || ParseUInt32(RelBF))
+ return true;
+ }
+ }
+ // Keep track of the Call array index needing a forward reference.
+ // We will save the location of the ValueInfo needing an update, but
+ // can only do so once the std::vector is finalized.
+ if (VI == EmptyVI)
+ IdToIndexMap[GVId].push_back(std::make_pair(Calls.size(), Loc));
+ Calls.push_back(FunctionSummary::EdgeTy{VI, CalleeInfo(Hotness, RelBF)});
+
+ if (ParseToken(lltok::rparen, "expected ')' in call"))
+ return true;
+ } while (EatIfPresent(lltok::comma));
+
+ // Now that the Calls vector is finalized, it is safe to save the locations
+ // of any forward GV references that need updating later.
+ for (auto I : IdToIndexMap) {
+ for (auto P : I.second) {
+ assert(Calls[P.first].first == EmptyVI &&
+ "Forward referenced ValueInfo expected to be empty");
+ auto FwdRef = ForwardRefValueInfos.insert(std::make_pair(
+ I.first, std::vector<std::pair<ValueInfo *, LocTy>>()));
+ FwdRef.first->second.push_back(
+ std::make_pair(&Calls[P.first].first, P.second));
+ }
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' in calls"))
+ return true;
+
+ return false;
+}
+
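Illustrative calls list showing both profile encodings, as it would appear inside a function summary (targets reuse the invented entries above):

    calls: ((callee: ^1, hotness: critical), (callee: ^2, relbf: 256))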
+/// Hotness
+/// := ('unknown'|'cold'|'none'|'hot'|'critical')
+bool LLParser::ParseHotness(CalleeInfo::HotnessType &Hotness) {
+ switch (Lex.getKind()) {
+ case lltok::kw_unknown:
+ Hotness = CalleeInfo::HotnessType::Unknown;
+ break;
+ case lltok::kw_cold:
+ Hotness = CalleeInfo::HotnessType::Cold;
+ break;
+ case lltok::kw_none:
+ Hotness = CalleeInfo::HotnessType::None;
+ break;
+ case lltok::kw_hot:
+ Hotness = CalleeInfo::HotnessType::Hot;
+ break;
+ case lltok::kw_critical:
+ Hotness = CalleeInfo::HotnessType::Critical;
+ break;
+ default:
+ return Error(Lex.getLoc(), "invalid call edge hotness");
+ }
+ Lex.Lex();
+ return false;
+}
+
+/// OptionalRefs
+/// := 'refs' ':' '(' GVReference [',' GVReference]* ')'
+bool LLParser::ParseOptionalRefs(std::vector<ValueInfo> &Refs) {
+ assert(Lex.getKind() == lltok::kw_refs);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' in refs") |
+ ParseToken(lltok::lparen, "expected '(' in refs"))
+ return true;
+
+ IdToIndexMapType IdToIndexMap;
+ // Parse each ref edge
+ do {
+ ValueInfo VI;
+ LocTy Loc = Lex.getLoc();
+ unsigned GVId;
+ if (ParseGVReference(VI, GVId))
+ return true;
+
+ // Keep track of the Refs array index needing a forward reference.
+ // We will save the location of the ValueInfo needing an update, but
+ // can only do so once the std::vector is finalized.
+ if (VI == EmptyVI)
+ IdToIndexMap[GVId].push_back(std::make_pair(Refs.size(), Loc));
+ Refs.push_back(VI);
+ } while (EatIfPresent(lltok::comma));
+
+ // Now that the Refs vector is finalized, it is safe to save the locations
+ // of any forward GV references that need updating later.
+ for (auto I : IdToIndexMap) {
+ for (auto P : I.second) {
+ assert(Refs[P.first] == EmptyVI &&
+ "Forward referenced ValueInfo expected to be empty");
+ auto FwdRef = ForwardRefValueInfos.insert(std::make_pair(
+ I.first, std::vector<std::pair<ValueInfo *, LocTy>>()));
+ FwdRef.first->second.push_back(std::make_pair(&Refs[P.first], P.second));
+ }
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' in refs"))
+ return true;
+
+ return false;
+}
+
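Illustrative refs list as it would appear inside a function or variable summary (referents reuse the invented entries above):

    refs: (^6, ^1)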
+/// OptionalTypeIdInfo
+/// := 'typeIdInfo' ':' '(' [TypeTests]? [',' TypeTestAssumeVCalls]?
+/// [',' TypeCheckedLoadVCalls]? [',' TypeTestAssumeConstVCalls]?
+/// [',' TypeCheckedLoadConstVCalls]? ')'
+bool LLParser::ParseOptionalTypeIdInfo(
+ FunctionSummary::TypeIdInfo &TypeIdInfo) {
+ assert(Lex.getKind() == lltok::kw_typeIdInfo);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' in typeIdInfo"))
+ return true;
+
+ do {
+ switch (Lex.getKind()) {
+ case lltok::kw_typeTests:
+ if (ParseTypeTests(TypeIdInfo.TypeTests))
+ return true;
+ break;
+ case lltok::kw_typeTestAssumeVCalls:
+ if (ParseVFuncIdList(lltok::kw_typeTestAssumeVCalls,
+ TypeIdInfo.TypeTestAssumeVCalls))
+ return true;
+ break;
+ case lltok::kw_typeCheckedLoadVCalls:
+ if (ParseVFuncIdList(lltok::kw_typeCheckedLoadVCalls,
+ TypeIdInfo.TypeCheckedLoadVCalls))
+ return true;
+ break;
+ case lltok::kw_typeTestAssumeConstVCalls:
+ if (ParseConstVCallList(lltok::kw_typeTestAssumeConstVCalls,
+ TypeIdInfo.TypeTestAssumeConstVCalls))
+ return true;
+ break;
+ case lltok::kw_typeCheckedLoadConstVCalls:
+ if (ParseConstVCallList(lltok::kw_typeCheckedLoadConstVCalls,
+ TypeIdInfo.TypeCheckedLoadConstVCalls))
+ return true;
+ break;
+ default:
+ return Error(Lex.getLoc(), "invalid typeIdInfo list type");
+ }
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' in typeIdInfo"))
+ return true;
+
+ return false;
+}
+
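A hypothetical typeIdInfo block; type-test operands may be raw GUIDs or forward references to typeid entries such as ^4, and the GUID below is invented:

    typeIdInfo: (typeTests: (^4, 6699318081062747564),
                 typeTestAssumeVCalls: (vFuncId: (^4, offset: 16)))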
+/// TypeTests
+/// ::= 'typeTests' ':' '(' (SummaryID | UInt64)
+/// [',' (SummaryID | UInt64)]* ')'
+bool LLParser::ParseTypeTests(std::vector<GlobalValue::GUID> &TypeTests) {
+ assert(Lex.getKind() == lltok::kw_typeTests);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' in typeIdInfo"))
+ return true;
+
+ IdToIndexMapType IdToIndexMap;
+ do {
+ GlobalValue::GUID GUID = 0;
+ if (Lex.getKind() == lltok::SummaryID) {
+ unsigned ID = Lex.getUIntVal();
+ LocTy Loc = Lex.getLoc();
+ // Keep track of the TypeTests array index needing a forward reference.
+ // We will save the location of the GUID needing an update, but
+ // can only do so once the std::vector is finalized.
+ IdToIndexMap[ID].push_back(std::make_pair(TypeTests.size(), Loc));
+ Lex.Lex();
+ } else if (ParseUInt64(GUID))
+ return true;
+ TypeTests.push_back(GUID);
+ } while (EatIfPresent(lltok::comma));
+
+ // Now that the TypeTests vector is finalized, it is safe to save the
+ // locations of any forward GV references that need updating later.
+ for (auto I : IdToIndexMap) {
+ for (auto P : I.second) {
+ assert(TypeTests[P.first] == 0 &&
+ "Forward referenced type id GUID expected to be 0");
+ auto FwdRef = ForwardRefTypeIds.insert(std::make_pair(
+ I.first, std::vector<std::pair<GlobalValue::GUID *, LocTy>>()));
+ FwdRef.first->second.push_back(
+ std::make_pair(&TypeTests[P.first], P.second));
+ }
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' in typeIdInfo"))
+ return true;
+
+ return false;
+}
+
+/// VFuncIdList
+/// ::= Kind ':' '(' VFuncId [',' VFuncId]* ')'
+bool LLParser::ParseVFuncIdList(
+ lltok::Kind Kind, std::vector<FunctionSummary::VFuncId> &VFuncIdList) {
+ assert(Lex.getKind() == Kind);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ IdToIndexMapType IdToIndexMap;
+ do {
+ FunctionSummary::VFuncId VFuncId;
+ if (ParseVFuncId(VFuncId, IdToIndexMap, VFuncIdList.size()))
+ return true;
+ VFuncIdList.push_back(VFuncId);
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ // Now that the VFuncIdList vector is finalized, it is safe to save the
+ // locations of any forward GV references that need updating later.
+ for (auto I : IdToIndexMap) {
+ for (auto P : I.second) {
+ assert(VFuncIdList[P.first].GUID == 0 &&
+ "Forward referenced type id GUID expected to be 0");
+ auto FwdRef = ForwardRefTypeIds.insert(std::make_pair(
+ I.first, std::vector<std::pair<GlobalValue::GUID *, LocTy>>()));
+ FwdRef.first->second.push_back(
+ std::make_pair(&VFuncIdList[P.first].GUID, P.second));
+ }
+ }
+
+ return false;
+}
+
+/// ConstVCallList
+/// ::= Kind ':' '(' ConstVCall [',' ConstVCall]* ')'
+bool LLParser::ParseConstVCallList(
+ lltok::Kind Kind,
+ std::vector<FunctionSummary::ConstVCall> &ConstVCallList) {
+ assert(Lex.getKind() == Kind);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ IdToIndexMapType IdToIndexMap;
+ do {
+ FunctionSummary::ConstVCall ConstVCall;
+ if (ParseConstVCall(ConstVCall, IdToIndexMap, ConstVCallList.size()))
+ return true;
+ ConstVCallList.push_back(ConstVCall);
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ // Now that the ConstVCallList vector is finalized, it is safe to save the
+ // locations of any forward GV references that need updating later.
+ for (auto I : IdToIndexMap) {
+ for (auto P : I.second) {
+ assert(ConstVCallList[P.first].VFunc.GUID == 0 &&
+ "Forward referenced type id GUID expected to be 0");
+ auto FwdRef = ForwardRefTypeIds.insert(std::make_pair(
+ I.first, std::vector<std::pair<GlobalValue::GUID *, LocTy>>()));
+ FwdRef.first->second.push_back(
+ std::make_pair(&ConstVCallList[P.first].VFunc.GUID, P.second));
+ }
+ }
+
+ return false;
+}
+
+/// ConstVCall
+/// ::= VFuncId, Args
+bool LLParser::ParseConstVCall(FunctionSummary::ConstVCall &ConstVCall,
+ IdToIndexMapType &IdToIndexMap, unsigned Index) {
+ if (ParseVFuncId(ConstVCall.VFunc, IdToIndexMap, Index) ||
+ ParseToken(lltok::comma, "expected ',' here") ||
+ ParseArgs(ConstVCall.Args))
+ return true;
+
+ return false;
+}
+
+/// VFuncId
+/// ::= 'vFuncId' ':' '(' (SummaryID | 'guid' ':' UInt64) ','
+/// 'offset' ':' UInt64 ')'
+bool LLParser::ParseVFuncId(FunctionSummary::VFuncId &VFuncId,
+ IdToIndexMapType &IdToIndexMap, unsigned Index) {
+ assert(Lex.getKind() == lltok::kw_vFuncId);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ if (Lex.getKind() == lltok::SummaryID) {
+ VFuncId.GUID = 0;
+ unsigned ID = Lex.getUIntVal();
+ LocTy Loc = Lex.getLoc();
+ // Keep track of the array index needing a forward reference.
+ // We will save the location of the GUID needing an update, but
+ // can only do so once the caller's std::vector is finalized.
+ IdToIndexMap[ID].push_back(std::make_pair(Index, Loc));
+ Lex.Lex();
+ } else if (ParseToken(lltok::kw_guid, "expected 'guid' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseUInt64(VFuncId.GUID))
+ return true;
+
+ if (ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_offset, "expected 'offset' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseUInt64(VFuncId.Offset) ||
+ ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
+/// GVFlags
+/// ::= 'flags' ':' '(' 'linkage' ':' OptionalLinkageAux ','
+/// 'notEligibleToImport' ':' Flag ',' 'live' ':' Flag ','
+/// 'dsoLocal' ':' Flag ')'
+bool LLParser::ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags) {
+ assert(Lex.getKind() == lltok::kw_flags);
+ Lex.Lex();
+
+ bool HasLinkage;
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_linkage, "expected 'linkage' here") ||
+ ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+
+ GVFlags.Linkage = parseOptionalLinkageAux(Lex.getKind(), HasLinkage);
+ assert(HasLinkage && "Linkage not optional in summary entry");
+ Lex.Lex();
+
+ unsigned Flag;
+ if (ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_notEligibleToImport,
+ "expected 'notEligibleToImport' here") ||
+ ParseToken(lltok::colon, "expected ':' here") || ParseFlag(Flag))
+ return true;
+ GVFlags.NotEligibleToImport = Flag;
+
+ if (ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_live, "expected 'live' here") ||
+ ParseToken(lltok::colon, "expected ':' here") || ParseFlag(Flag))
+ return true;
+ GVFlags.Live = Flag;
+
+ if (ParseToken(lltok::comma, "expected ',' here") ||
+ ParseToken(lltok::kw_dsoLocal, "expected 'dsoLocal' here") ||
+ ParseToken(lltok::colon, "expected ':' here") || ParseFlag(Flag))
+ return true;
+ GVFlags.DSOLocal = Flag;
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
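The flags tuple referenced from the summaries above, shown on its own with invented values:

    flags: (linkage: linkonce_odr, notEligibleToImport: 0, live: 1, dsoLocal: 1)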
+/// ModuleReference
+/// ::= 'module' ':' UInt
+bool LLParser::ParseModuleReference(StringRef &ModulePath) {
+ // Parse module id.
+ if (ParseToken(lltok::kw_module, "expected 'module' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::SummaryID, "expected module ID"))
+ return true;
+
+ unsigned ModuleID = Lex.getUIntVal();
+ auto I = ModuleIdMap.find(ModuleID);
+ // We should have already parsed all module IDs
+ assert(I != ModuleIdMap.end());
+ ModulePath = I->second;
+ return false;
+}
+
+/// GVReference
+/// ::= SummaryID
+bool LLParser::ParseGVReference(ValueInfo &VI, unsigned &GVId) {
+ if (ParseToken(lltok::SummaryID, "expected GV ID"))
+ return true;
+
+ GVId = Lex.getUIntVal();
+
+ // Check if we already have a VI for this GV
+ if (GVId < NumberedValueInfos.size()) {
+ assert(NumberedValueInfos[GVId] != EmptyVI);
+ VI = NumberedValueInfos[GVId];
+ } else
+ // We will create a forward reference to the stored location.
+ VI = EmptyVI;
+
+ return false;
+}
diff --git a/contrib/llvm/lib/AsmParser/LLParser.h b/contrib/llvm/lib/AsmParser/LLParser.h
index 94e4c1ae96d5..811f96418fa5 100644
--- a/contrib/llvm/lib/AsmParser/LLParser.h
+++ b/contrib/llvm/lib/AsmParser/LLParser.h
@@ -20,6 +20,7 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueHandle.h"
@@ -90,7 +91,10 @@ namespace llvm {
private:
LLVMContext &Context;
LLLexer Lex;
+ // Module being parsed, null if we are only parsing summary index.
Module *M;
+ // Summary index being parsed, null if we are only parsing Module.
+ ModuleSummaryIndex *Index;
SlotMapping *Slots;
// Instruction metadata resolution. Each instruction can have a list of
@@ -139,16 +143,40 @@ namespace llvm {
std::map<Value*, std::vector<unsigned> > ForwardRefAttrGroups;
std::map<unsigned, AttrBuilder> NumberedAttrBuilders;
+ // Summary global value reference information.
+ std::map<unsigned, std::vector<std::pair<ValueInfo *, LocTy>>>
+ ForwardRefValueInfos;
+ std::map<unsigned, std::vector<std::pair<AliasSummary *, LocTy>>>
+ ForwardRefAliasees;
+ std::vector<ValueInfo> NumberedValueInfos;
+
+ // Summary type id reference information.
+ std::map<unsigned, std::vector<std::pair<GlobalValue::GUID *, LocTy>>>
+ ForwardRefTypeIds;
+
+ // Map of module ID to path.
+ std::map<unsigned, StringRef> ModuleIdMap;
+
/// Only the llvm-as tool may set this to false to bypass
/// UpgradeDebuginfo so it can generate broken bitcode.
bool UpgradeDebugInfo;
+ /// DataLayout string to override that in LLVM assembly.
+ StringRef DataLayoutStr;
+
+ std::string SourceFileName;
+
public:
LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M,
- SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true)
- : Context(M->getContext()), Lex(F, SM, Err, M->getContext()), M(M),
+ ModuleSummaryIndex *Index, LLVMContext &Context,
+ SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true,
+ StringRef DataLayoutString = "")
+ : Context(Context), Lex(F, SM, Err, Context), M(M), Index(Index),
Slots(Slots), BlockAddressPFS(nullptr),
- UpgradeDebugInfo(UpgradeDebugInfo) {}
+ UpgradeDebugInfo(UpgradeDebugInfo), DataLayoutStr(DataLayoutString) {
+ if (!DataLayoutStr.empty())
+ M->setDataLayout(DataLayoutStr);
+ }
bool Run();
bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots);
@@ -174,12 +202,12 @@ namespace llvm {
/// GetGlobalVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
- GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc);
+ GlobalValue *GetGlobalVal(const std::string &Name, Type *Ty, LocTy Loc);
GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc);
/// Get a Comdat with the specified name, creating a forward reference
/// record if needed.
- Comdat *getComdat(const std::string &N, LocTy Loc);
+ Comdat *getComdat(const std::string &Name, LocTy Loc);
// Helper Routines.
bool ParseToken(lltok::Kind T, const char *ErrMsg);
@@ -232,6 +260,7 @@ namespace llvm {
Loc = Lex.getLoc();
return ParseUInt64(Val);
}
+ bool ParseFlag(unsigned &Val);
bool ParseStringAttribute(AttrBuilder &B);
@@ -241,12 +270,12 @@ namespace llvm {
bool ParseOptionalAddrSpace(unsigned &AddrSpace);
bool ParseOptionalParamAttrs(AttrBuilder &B);
bool ParseOptionalReturnAttrs(AttrBuilder &B);
- bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage,
+ bool ParseOptionalLinkage(unsigned &Res, bool &HasLinkage,
unsigned &Visibility, unsigned &DLLStorageClass,
bool &DSOLocal);
void ParseOptionalDSOLocal(bool &DSOLocal);
- void ParseOptionalVisibility(unsigned &Visibility);
- void ParseOptionalDLLStorageClass(unsigned &DLLStorageClass);
+ void ParseOptionalVisibility(unsigned &Res);
+ void ParseOptionalDLLStorageClass(unsigned &Res);
bool ParseOptionalCallingConv(unsigned &CC);
bool ParseOptionalAlignment(unsigned &Alignment);
bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes);
@@ -259,7 +288,7 @@ namespace llvm {
bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
bool &AteExtraComma);
bool ParseOptionalCommaInAlloca(bool &IsInAlloca);
- bool parseAllocSizeArguments(unsigned &ElemSizeArg,
+ bool parseAllocSizeArguments(unsigned &BaseSizeArg,
Optional<unsigned> &HowManyArg);
bool ParseIndexList(SmallVectorImpl<unsigned> &Indices,
bool &AteExtraComma);
@@ -274,6 +303,7 @@ namespace llvm {
// Top-Level Entities
bool ParseTopLevelEntities();
bool ValidateEndOfModule();
+ bool ValidateEndOfIndex();
bool ParseTargetDefinition();
bool ParseModuleAsm();
bool ParseSourceFileName();
@@ -286,13 +316,13 @@ namespace llvm {
bool ParseGlobalType(bool &IsConstant);
bool ParseUnnamedGlobal();
bool ParseNamedGlobal();
- bool ParseGlobal(const std::string &Name, LocTy Loc, unsigned Linkage,
+ bool ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage,
bool HasLinkage, unsigned Visibility,
unsigned DLLStorageClass, bool DSOLocal,
GlobalVariable::ThreadLocalMode TLM,
GlobalVariable::UnnamedAddr UnnamedAddr);
- bool parseIndirectSymbol(const std::string &Name, LocTy Loc,
- unsigned Linkage, unsigned Visibility,
+ bool parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
+ unsigned L, unsigned Visibility,
unsigned DLLStorageClass, bool DSOLocal,
GlobalVariable::ThreadLocalMode TLM,
GlobalVariable::UnnamedAddr UnnamedAddr);
@@ -306,6 +336,48 @@ namespace llvm {
std::vector<unsigned> &FwdRefAttrGrps,
bool inAttrGrp, LocTy &BuiltinLoc);
+ // Module Summary Index Parsing.
+ bool SkipModuleSummaryEntry();
+ bool ParseSummaryEntry();
+ bool ParseModuleEntry(unsigned ID);
+ bool ParseModuleReference(StringRef &ModulePath);
+ bool ParseGVReference(ValueInfo &VI, unsigned &GVId);
+ bool ParseGVEntry(unsigned ID);
+ bool ParseFunctionSummary(std::string Name, GlobalValue::GUID, unsigned ID);
+ bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID);
+ bool ParseAliasSummary(std::string Name, GlobalValue::GUID, unsigned ID);
+ bool ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags);
+ bool ParseOptionalFFlags(FunctionSummary::FFlags &FFlags);
+ bool ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls);
+ bool ParseHotness(CalleeInfo::HotnessType &Hotness);
+ bool ParseOptionalTypeIdInfo(FunctionSummary::TypeIdInfo &TypeIdInfo);
+ bool ParseTypeTests(std::vector<GlobalValue::GUID> &TypeTests);
+ bool ParseVFuncIdList(lltok::Kind Kind,
+ std::vector<FunctionSummary::VFuncId> &VFuncIdList);
+ bool ParseConstVCallList(
+ lltok::Kind Kind,
+ std::vector<FunctionSummary::ConstVCall> &ConstVCallList);
+ using IdToIndexMapType =
+ std::map<unsigned, std::vector<std::pair<unsigned, LocTy>>>;
+ bool ParseConstVCall(FunctionSummary::ConstVCall &ConstVCall,
+ IdToIndexMapType &IdToIndexMap, unsigned Index);
+ bool ParseVFuncId(FunctionSummary::VFuncId &VFuncId,
+ IdToIndexMapType &IdToIndexMap, unsigned Index);
+ bool ParseOptionalRefs(std::vector<ValueInfo> &Refs);
+ bool ParseTypeIdEntry(unsigned ID);
+ bool ParseTypeIdSummary(TypeIdSummary &TIS);
+ bool ParseTypeTestResolution(TypeTestResolution &TTRes);
+ bool ParseOptionalWpdResolutions(
+ std::map<uint64_t, WholeProgramDevirtResolution> &WPDResMap);
+ bool ParseWpdRes(WholeProgramDevirtResolution &WPDRes);
+ bool ParseOptionalResByArg(
+ std::map<std::vector<uint64_t>, WholeProgramDevirtResolution::ByArg>
+ &ResByArg);
+ bool ParseArgs(std::vector<uint64_t> &Args);
+ void AddGlobalValueToIndex(std::string Name, GlobalValue::GUID,
+ GlobalValue::LinkageTypes Linkage, unsigned ID,
+ std::unique_ptr<GlobalValueSummary> Summary);
+
// Type Parsing.
bool ParseType(Type *&Result, const Twine &Msg, bool AllowVoid = false);
bool ParseType(Type *&Result, bool AllowVoid = false) {
@@ -341,7 +413,7 @@ namespace llvm {
/// number of it, otherwise it is -1.
int FunctionNumber;
public:
- PerFunctionState(LLParser &p, Function &f, int FunctionNumber);
+ PerFunctionState(LLParser &p, Function &f, int functionNumber);
~PerFunctionState();
Function &getFunction() const { return F; }
@@ -351,8 +423,8 @@ namespace llvm {
/// GetVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
- Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc);
- Value *GetVal(unsigned ID, Type *Ty, LocTy Loc);
+ Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc, bool IsCall);
+ Value *GetVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall);
/// SetInstName - After an instruction is parsed and inserted into its
/// basic block, this installs its name.
@@ -374,7 +446,7 @@ namespace llvm {
};
bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
- PerFunctionState *PFS);
+ PerFunctionState *PFS, bool IsCall);
bool parseConstantValue(Type *Ty, Constant *&C);
bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS);
@@ -425,7 +497,7 @@ namespace llvm {
// Constant Parsing.
bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr);
- bool ParseGlobalValue(Type *Ty, Constant *&V);
+ bool ParseGlobalValue(Type *Ty, Constant *&C);
bool ParseGlobalTypeAndValue(Constant *&V);
bool ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
Optional<unsigned> *InRangeOp = nullptr);
@@ -435,9 +507,9 @@ namespace llvm {
PerFunctionState *PFS);
bool ParseMetadata(Metadata *&MD, PerFunctionState *PFS);
bool ParseMDTuple(MDNode *&MD, bool IsDistinct = false);
- bool ParseMDNode(MDNode *&MD);
- bool ParseMDNodeTail(MDNode *&MD);
- bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &MDs);
+ bool ParseMDNode(MDNode *&N);
+ bool ParseMDNodeTail(MDNode *&N);
+ bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &Elts);
bool ParseMetadataAttachment(unsigned &Kind, MDNode *&MD);
bool ParseInstructionMetadata(Instruction &Inst);
bool ParseGlobalObjectMetadataAttachment(GlobalObject &GO);
@@ -477,7 +549,7 @@ namespace llvm {
enum InstResult { InstNormal = 0, InstError = 1, InstExtraComma = 2 };
int ParseInstruction(Instruction *&Inst, BasicBlock *BB,
PerFunctionState &PFS);
- bool ParseCmpPredicate(unsigned &Pred, unsigned Opc);
+ bool ParseCmpPredicate(unsigned &P, unsigned Opc);
bool ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS);
bool ParseBr(Instruction *&Inst, PerFunctionState &PFS);
@@ -491,29 +563,29 @@ namespace llvm {
bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS);
bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS);
- bool ParseArithmetic(Instruction *&I, PerFunctionState &PFS, unsigned Opc,
+ bool ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
unsigned OperandType);
- bool ParseLogical(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
- bool ParseCompare(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
- bool ParseCast(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
- bool ParseSelect(Instruction *&I, PerFunctionState &PFS);
- bool ParseVA_Arg(Instruction *&I, PerFunctionState &PFS);
- bool ParseExtractElement(Instruction *&I, PerFunctionState &PFS);
- bool ParseInsertElement(Instruction *&I, PerFunctionState &PFS);
- bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS);
- int ParsePHI(Instruction *&I, PerFunctionState &PFS);
- bool ParseLandingPad(Instruction *&I, PerFunctionState &PFS);
- bool ParseCall(Instruction *&I, PerFunctionState &PFS,
- CallInst::TailCallKind IsTail);
- int ParseAlloc(Instruction *&I, PerFunctionState &PFS);
- int ParseLoad(Instruction *&I, PerFunctionState &PFS);
- int ParseStore(Instruction *&I, PerFunctionState &PFS);
- int ParseCmpXchg(Instruction *&I, PerFunctionState &PFS);
- int ParseAtomicRMW(Instruction *&I, PerFunctionState &PFS);
- int ParseFence(Instruction *&I, PerFunctionState &PFS);
- int ParseGetElementPtr(Instruction *&I, PerFunctionState &PFS);
- int ParseExtractValue(Instruction *&I, PerFunctionState &PFS);
- int ParseInsertValue(Instruction *&I, PerFunctionState &PFS);
+ bool ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
+ bool ParseCompare(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
+ bool ParseCast(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
+ bool ParseSelect(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseExtractElement(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseInsertElement(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS);
+ int ParsePHI(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseCall(Instruction *&Inst, PerFunctionState &PFS,
+ CallInst::TailCallKind TCK);
+ int ParseAlloc(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseLoad(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseStore(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseFence(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS);
+ int ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS);
// Use-list order directives.
bool ParseUseListOrder(PerFunctionState *PFS = nullptr);
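Aside (not part of the patch): the LLParser declarations above are internal; external callers reach them through the free functions in llvm/AsmParser/Parser.h. A minimal sketch of the standalone constant parser backed by parseStandaloneConstantValue, where the module name and the literal "i32 42" are placeholders:

    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/SourceMgr.h"

    llvm::LLVMContext Ctx;
    llvm::Module M("demo", Ctx);          // placeholder module for type context
    llvm::SMDiagnostic Err;
    // Routed through LLParser::parseStandaloneConstantValue internally.
    llvm::Constant *C = llvm::parseConstantValue("i32 42", Err, M);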
diff --git a/contrib/llvm/lib/AsmParser/LLToken.h b/contrib/llvm/lib/AsmParser/LLToken.h
index ad826cc4fd21..8d8c7e99656e 100644
--- a/contrib/llvm/lib/AsmParser/LLToken.h
+++ b/contrib/llvm/lib/AsmParser/LLToken.h
@@ -36,6 +36,7 @@ enum Kind {
rparen, // ( )
exclaim, // !
bar, // |
+ colon, // :
kw_x,
kw_true,
@@ -199,7 +200,9 @@ enum Kind {
kw_nonnull,
kw_noredzone,
kw_noreturn,
+ kw_nocf_check,
kw_nounwind,
+ kw_optforfuzzing,
kw_optnone,
kw_optsize,
kw_readnone,
@@ -212,6 +215,7 @@ enum Kind {
kw_sspreq,
kw_sspstrong,
kw_safestack,
+ kw_shadowcallstack,
kw_sret,
kw_sanitize_thread,
kw_sanitize_memory,
@@ -344,10 +348,78 @@ enum Kind {
kw_uselistorder,
kw_uselistorder_bb,
+ // Summary index keywords
+ kw_path,
+ kw_hash,
+ kw_gv,
+ kw_guid,
+ kw_name,
+ kw_summaries,
+ kw_flags,
+ kw_linkage,
+ kw_notEligibleToImport,
+ kw_live,
+ kw_dsoLocal,
+ kw_function,
+ kw_insts,
+ kw_funcFlags,
+ kw_readNone,
+ kw_readOnly,
+ kw_noRecurse,
+ kw_returnDoesNotAlias,
+ kw_calls,
+ kw_callee,
+ kw_hotness,
+ kw_unknown,
+ kw_hot,
+ kw_critical,
+ kw_relbf,
+ kw_variable,
+ kw_aliasee,
+ kw_refs,
+ kw_typeIdInfo,
+ kw_typeTests,
+ kw_typeTestAssumeVCalls,
+ kw_typeCheckedLoadVCalls,
+ kw_typeTestAssumeConstVCalls,
+ kw_typeCheckedLoadConstVCalls,
+ kw_vFuncId,
+ kw_offset,
+ kw_args,
+ kw_typeid,
+ kw_summary,
+ kw_typeTestRes,
+ kw_kind,
+ kw_unsat,
+ kw_byteArray,
+ kw_inline,
+ kw_single,
+ kw_allOnes,
+ kw_sizeM1BitWidth,
+ kw_alignLog2,
+ kw_sizeM1,
+ kw_bitMask,
+ kw_inlineBits,
+ kw_wpdResolutions,
+ kw_wpdRes,
+ kw_indir,
+ kw_singleImpl,
+ kw_branchFunnel,
+ kw_singleImplName,
+ kw_resByArg,
+ kw_byArg,
+ kw_uniformRetVal,
+ kw_uniqueRetVal,
+ kw_virtualConstProp,
+ kw_info,
+ kw_byte,
+ kw_bit,
+
// Unsigned Valued tokens (UIntVal).
GlobalID, // @42
LocalVarID, // %42
AttrGrpID, // #42
+ SummaryID, // ^42
// String valued tokens (StrVal).
LabelStr, // foo:
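Aside (not part of the patch): the new SummaryID token (^42) and the keyword block above lex the textual form of the module summary index. A hedged sketch of feeding such text to the parser added later in this change; the one-line summary string is an assumed example of the syntax, not taken from the patch:

    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/SourceMgr.h"
    using namespace llvm;

    // Hypothetical one-line summary using the ^N (SummaryID) syntax.
    StringRef Text = "^0 = module: (path: \"m.o\", hash: (0, 0, 0, 0, 0))";
    SMDiagnostic Err;
    std::unique_ptr<ModuleSummaryIndex> Index =
        parseSummaryIndexAssembly(MemoryBufferRef(Text, "<string>"), Err);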
diff --git a/contrib/llvm/lib/AsmParser/Parser.cpp b/contrib/llvm/lib/AsmParser/Parser.cpp
index a43ae2b5577a..1205dff24e8a 100644
--- a/contrib/llvm/lib/AsmParser/Parser.cpp
+++ b/contrib/llvm/lib/AsmParser/Parser.cpp
@@ -15,6 +15,7 @@
#include "LLParser.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
@@ -22,32 +23,39 @@
#include <system_error>
using namespace llvm;
-bool llvm::parseAssemblyInto(MemoryBufferRef F, Module &M, SMDiagnostic &Err,
- SlotMapping *Slots, bool UpgradeDebugInfo) {
+bool llvm::parseAssemblyInto(MemoryBufferRef F, Module *M,
+ ModuleSummaryIndex *Index, SMDiagnostic &Err,
+ SlotMapping *Slots, bool UpgradeDebugInfo,
+ StringRef DataLayoutString) {
SourceMgr SM;
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(F);
SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
- return LLParser(F.getBuffer(), SM, Err, &M, Slots, UpgradeDebugInfo).Run();
+ LLVMContext Context;
+ return LLParser(F.getBuffer(), SM, Err, M, Index,
+ M ? M->getContext() : Context, Slots, UpgradeDebugInfo,
+ DataLayoutString)
+ .Run();
}
std::unique_ptr<Module>
llvm::parseAssembly(MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
- SlotMapping *Slots, bool UpgradeDebugInfo) {
+ SlotMapping *Slots, bool UpgradeDebugInfo,
+ StringRef DataLayoutString) {
std::unique_ptr<Module> M =
make_unique<Module>(F.getBufferIdentifier(), Context);
- if (parseAssemblyInto(F, *M, Err, Slots, UpgradeDebugInfo))
+ if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, UpgradeDebugInfo,
+ DataLayoutString))
return nullptr;
return M;
}
-std::unique_ptr<Module> llvm::parseAssemblyFile(StringRef Filename,
- SMDiagnostic &Err,
- LLVMContext &Context,
- SlotMapping *Slots,
- bool UpgradeDebugInfo) {
+std::unique_ptr<Module>
+llvm::parseAssemblyFile(StringRef Filename, SMDiagnostic &Err,
+ LLVMContext &Context, SlotMapping *Slots,
+ bool UpgradeDebugInfo, StringRef DataLayoutString) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = FileOrErr.getError()) {
@@ -57,16 +65,84 @@ std::unique_ptr<Module> llvm::parseAssemblyFile(StringRef Filename,
}
return parseAssembly(FileOrErr.get()->getMemBufferRef(), Err, Context, Slots,
- UpgradeDebugInfo);
+ UpgradeDebugInfo, DataLayoutString);
+}
+
+ParsedModuleAndIndex llvm::parseAssemblyWithIndex(
+ MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
+ SlotMapping *Slots, bool UpgradeDebugInfo, StringRef DataLayoutString) {
+ std::unique_ptr<Module> M =
+ make_unique<Module>(F.getBufferIdentifier(), Context);
+ std::unique_ptr<ModuleSummaryIndex> Index =
+ make_unique<ModuleSummaryIndex>(/*HaveGVs=*/true);
+
+ if (parseAssemblyInto(F, M.get(), Index.get(), Err, Slots, UpgradeDebugInfo,
+ DataLayoutString))
+ return {nullptr, nullptr};
+
+ return {std::move(M), std::move(Index)};
}
-std::unique_ptr<Module> llvm::parseAssemblyString(StringRef AsmString,
- SMDiagnostic &Err,
- LLVMContext &Context,
- SlotMapping *Slots,
- bool UpgradeDebugInfo) {
+ParsedModuleAndIndex llvm::parseAssemblyFileWithIndex(
+ StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
+ SlotMapping *Slots, bool UpgradeDebugInfo, StringRef DataLayoutString) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = FileOrErr.getError()) {
+ Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
+ "Could not open input file: " + EC.message());
+ return {nullptr, nullptr};
+ }
+
+ return parseAssemblyWithIndex(FileOrErr.get()->getMemBufferRef(), Err,
+ Context, Slots, UpgradeDebugInfo,
+ DataLayoutString);
+}
+
+std::unique_ptr<Module>
+llvm::parseAssemblyString(StringRef AsmString, SMDiagnostic &Err,
+ LLVMContext &Context, SlotMapping *Slots,
+ bool UpgradeDebugInfo, StringRef DataLayoutString) {
MemoryBufferRef F(AsmString, "<string>");
- return parseAssembly(F, Err, Context, Slots, UpgradeDebugInfo);
+ return parseAssembly(F, Err, Context, Slots, UpgradeDebugInfo,
+ DataLayoutString);
+}
+
+static bool parseSummaryIndexAssemblyInto(MemoryBufferRef F,
+ ModuleSummaryIndex &Index,
+ SMDiagnostic &Err) {
+ SourceMgr SM;
+ std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(F);
+ SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
+
+ // The parser holds a reference to a context that is unused when parsing the
+ // index, but we need to initialize it.
+ LLVMContext unusedContext;
+ return LLParser(F.getBuffer(), SM, Err, nullptr, &Index, unusedContext).Run();
+}
+
+std::unique_ptr<ModuleSummaryIndex>
+llvm::parseSummaryIndexAssembly(MemoryBufferRef F, SMDiagnostic &Err) {
+ std::unique_ptr<ModuleSummaryIndex> Index =
+ make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
+
+ if (parseSummaryIndexAssemblyInto(F, *Index, Err))
+ return nullptr;
+
+ return Index;
+}
+
+std::unique_ptr<ModuleSummaryIndex>
+llvm::parseSummaryIndexAssemblyFile(StringRef Filename, SMDiagnostic &Err) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = FileOrErr.getError()) {
+ Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
+ "Could not open input file: " + EC.message());
+ return nullptr;
+ }
+
+ return parseSummaryIndexAssembly(FileOrErr.get()->getMemBufferRef(), Err);
}
Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err,
@@ -75,7 +151,7 @@ Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err,
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Asm);
SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
Constant *C;
- if (LLParser(Asm, SM, Err, const_cast<Module *>(&M))
+ if (LLParser(Asm, SM, Err, const_cast<Module *>(&M), nullptr, M.getContext())
.parseStandaloneConstantValue(C, Slots))
return nullptr;
return C;
@@ -104,7 +180,7 @@ Type *llvm::parseTypeAtBeginning(StringRef Asm, unsigned &Read,
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Asm);
SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
Type *Ty;
- if (LLParser(Asm, SM, Err, const_cast<Module *>(&M))
+ if (LLParser(Asm, SM, Err, const_cast<Module *>(&M), nullptr, M.getContext())
.parseTypeAtBeginning(Ty, Read, Slots))
return nullptr;
return Ty;
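Aside (not part of the patch): a minimal usage sketch of the new entry points defined above; "input.ll" is a placeholder path and error handling is abbreviated:

    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Support/SourceMgr.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      llvm::LLVMContext Ctx;
      llvm::SMDiagnostic Err;
      // Parses both the IR and any ^N summary entries from the same .ll file.
      llvm::ParsedModuleAndIndex MI =
          llvm::parseAssemblyFileWithIndex("input.ll", Err, Ctx);
      if (!MI.Mod)
        Err.print("example", llvm::errs());
      return 0;
    }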
diff --git a/contrib/llvm/lib/BinaryFormat/Dwarf.cpp b/contrib/llvm/lib/BinaryFormat/Dwarf.cpp
index 86e3b02577fd..5984de73ae63 100644
--- a/contrib/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/contrib/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -393,16 +393,6 @@ StringRef llvm::dwarf::ArrayOrderString(unsigned Order) {
return StringRef();
}
-StringRef llvm::dwarf::DiscriminantString(unsigned Discriminant) {
- switch (Discriminant) {
- case DW_DSC_label:
- return "DW_DSC_label";
- case DW_DSC_range:
- return "DW_DSC_range";
- }
- return StringRef();
-}
-
StringRef llvm::dwarf::LNStandardString(unsigned Standard) {
switch (Standard) {
default:
@@ -454,6 +444,17 @@ unsigned llvm::dwarf::getMacinfo(StringRef MacinfoString) {
.Default(DW_MACINFO_invalid);
}
+StringRef llvm::dwarf::RangeListEncodingString(unsigned Encoding) {
+ switch (Encoding) {
+ default:
+ return StringRef();
+#define HANDLE_DW_RLE(ID, NAME) \
+ case DW_RLE_##NAME: \
+ return "DW_RLE_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
StringRef llvm::dwarf::CallFrameString(unsigned Encoding) {
switch (Encoding) {
default:
@@ -498,7 +499,10 @@ StringRef llvm::dwarf::AtomTypeString(unsigned AT) {
case DW_ATOM_die_tag:
return "DW_ATOM_die_tag";
case DW_ATOM_type_flags:
+ case DW_ATOM_type_type_flags:
return "DW_ATOM_type_flags";
+ case DW_ATOM_qual_name_hash:
+ return "DW_ATOM_qual_name_hash";
}
return StringRef();
}
@@ -560,13 +564,122 @@ StringRef llvm::dwarf::AttributeValueString(uint16_t Attr, unsigned Val) {
return InlineCodeString(Val);
case DW_AT_ordering:
return ArrayOrderString(Val);
- case DW_AT_discr_value:
- return DiscriminantString(Val);
+ case DW_AT_APPLE_runtime_class:
+ return LanguageString(Val);
+ }
+
+ return StringRef();
+}
+
+StringRef llvm::dwarf::AtomValueString(uint16_t Atom, unsigned Val) {
+ switch (Atom) {
+ case DW_ATOM_null:
+ return "NULL";
+ case DW_ATOM_die_tag:
+ return TagString(Val);
}
return StringRef();
}
+StringRef llvm::dwarf::IndexString(unsigned Idx) {
+ switch (Idx) {
+ default:
+ return StringRef();
+#define HANDLE_DW_IDX(ID, NAME) \
+ case DW_IDX_##NAME: \
+ return "DW_IDX_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+Optional<uint8_t> llvm::dwarf::getFixedFormByteSize(dwarf::Form Form,
+ FormParams Params) {
+ switch (Form) {
+ case DW_FORM_addr:
+ if (Params)
+ return Params.AddrSize;
+ return None;
+
+ case DW_FORM_block: // ULEB128 length L followed by L bytes.
+ case DW_FORM_block1: // 1 byte length L followed by L bytes.
+ case DW_FORM_block2: // 2 byte length L followed by L bytes.
+ case DW_FORM_block4: // 4 byte length L followed by L bytes.
+ case DW_FORM_string: // C-string with null terminator.
+ case DW_FORM_sdata: // SLEB128.
+ case DW_FORM_udata: // ULEB128.
+ case DW_FORM_ref_udata: // ULEB128.
+ case DW_FORM_indirect: // ULEB128.
+ case DW_FORM_exprloc: // ULEB128 length L followed by L bytes.
+ case DW_FORM_strx: // ULEB128.
+ case DW_FORM_addrx: // ULEB128.
+ case DW_FORM_loclistx: // ULEB128.
+ case DW_FORM_rnglistx: // ULEB128.
+ case DW_FORM_GNU_addr_index: // ULEB128.
+ case DW_FORM_GNU_str_index: // ULEB128.
+ return None;
+
+ case DW_FORM_ref_addr:
+ if (Params)
+ return Params.getRefAddrByteSize();
+ return None;
+
+ case DW_FORM_flag:
+ case DW_FORM_data1:
+ case DW_FORM_ref1:
+ case DW_FORM_strx1:
+ case DW_FORM_addrx1:
+ return 1;
+
+ case DW_FORM_data2:
+ case DW_FORM_ref2:
+ case DW_FORM_strx2:
+ case DW_FORM_addrx2:
+ return 2;
+
+ case DW_FORM_strx3:
+ return 3;
+
+ case DW_FORM_data4:
+ case DW_FORM_ref4:
+ case DW_FORM_ref_sup4:
+ case DW_FORM_strx4:
+ case DW_FORM_addrx4:
+ return 4;
+
+ case DW_FORM_strp:
+ case DW_FORM_GNU_ref_alt:
+ case DW_FORM_GNU_strp_alt:
+ case DW_FORM_line_strp:
+ case DW_FORM_sec_offset:
+ case DW_FORM_strp_sup:
+ if (Params)
+ return Params.getDwarfOffsetByteSize();
+ return None;
+
+ case DW_FORM_data8:
+ case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
+ case DW_FORM_ref_sup8:
+ return 8;
+
+ case DW_FORM_flag_present:
+ return 0;
+
+ case DW_FORM_data16:
+ return 16;
+
+ case DW_FORM_implicit_const:
+ // The implicit value is stored in the abbreviation as a SLEB128, and
+    // there is no data in debug info.
+ return 0;
+
+ default:
+ break;
+ }
+ return None;
+}
+
bool llvm::dwarf::isValidFormForVersion(Form F, unsigned Version,
bool ExtensionsOk) {
if (FormVendor(F) == DWARF_VENDOR_DWARF) {
@@ -576,9 +689,7 @@ bool llvm::dwarf::isValidFormForVersion(Form F, unsigned Version,
return ExtensionsOk;
}
-uint32_t llvm::dwarf::djbHash(StringRef Buffer) {
- uint32_t H = 5381;
- for (char C : Buffer.bytes())
- H = ((H << 5) + H) + C;
- return H;
-}
+constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[];
+constexpr char llvm::dwarf::EnumTraits<Form>::Type[];
+constexpr char llvm::dwarf::EnumTraits<Index>::Type[];
+constexpr char llvm::dwarf::EnumTraits<Tag>::Type[];
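Aside (not part of the patch): a small sketch of the new getFixedFormByteSize helper; the version and address-size values are illustrative:

    #include "llvm/ADT/Optional.h"
    #include "llvm/BinaryFormat/Dwarf.h"
    using namespace llvm;

    // DWARF v4, 8-byte addresses, 32-bit DWARF (4-byte section offsets).
    dwarf::FormParams Params{/*Version=*/4, /*AddrSize=*/8, dwarf::DWARF32};
    // DW_FORM_ref_addr is offset-sized, so this yields 4 for DWARF32.
    Optional<uint8_t> Size =
        dwarf::getFixedFormByteSize(dwarf::DW_FORM_ref_addr, Params);
    // Variable-length forms such as DW_FORM_udata report None instead.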
diff --git a/contrib/llvm/lib/BinaryFormat/Magic.cpp b/contrib/llvm/lib/BinaryFormat/Magic.cpp
index 42546eaa732b..5a339583fca1 100644
--- a/contrib/llvm/lib/BinaryFormat/Magic.cpp
+++ b/contrib/llvm/lib/BinaryFormat/Magic.cpp
@@ -14,6 +14,7 @@
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
#if !defined(_MSC_VER) && !defined(__MINGW32__)
#include <unistd.h>
@@ -30,7 +31,7 @@ static bool startswith(StringRef Magic, const char (&S)[N]) {
return Magic.startswith(StringRef(S, N - 1));
}
-/// @brief Identify the magic in magic.
+/// Identify the magic in magic.
file_magic llvm::identify_magic(StringRef Magic) {
if (Magic.size() < 4)
return file_magic::unknown;
@@ -181,7 +182,7 @@ file_magic llvm::identify_magic(StringRef Magic) {
return file_magic::coff_object;
break;
- case 'M': // Possible MS-DOS stub on Windows PE file
+ case 'M': // Possible MS-DOS stub on Windows PE file or MSF/PDB file.
if (startswith(Magic, "MZ") && Magic.size() >= 0x3c + 4) {
uint32_t off = read32le(Magic.data() + 0x3c);
// PE/COFF file, either EXE or DLL.
@@ -189,6 +190,8 @@ file_magic llvm::identify_magic(StringRef Magic) {
StringRef(COFF::PEMagic, sizeof(COFF::PEMagic))))
return file_magic::pecoff_executable;
}
+ if (Magic.startswith("Microsoft C/C++ MSF 7.00\r\n"))
+ return file_magic::pdb;
break;
case 0x64: // x86-64 or ARM64 Windows.
@@ -203,15 +206,12 @@ file_magic llvm::identify_magic(StringRef Magic) {
}
std::error_code llvm::identify_magic(const Twine &Path, file_magic &Result) {
- int FD;
- if (std::error_code EC = openFileForRead(Path, FD))
- return EC;
+ auto FileOrError = MemoryBuffer::getFile(Path);
+ if (!FileOrError)
+ return FileOrError.getError();
- char Buffer[32];
- int Length = read(FD, Buffer, sizeof(Buffer));
- if (close(FD) != 0 || Length < 0)
- return std::error_code(errno, std::generic_category());
+ std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+ Result = identify_magic(FileBuffer->getBuffer());
- Result = identify_magic(StringRef(Buffer, Length));
return std::error_code();
}
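Aside (not part of the patch): a usage sketch of the path-based overload rewritten above to go through MemoryBuffer; the file name is a placeholder:

    #include "llvm/BinaryFormat/Magic.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void classify() {
      file_magic Kind;
      if (std::error_code EC = identify_magic("a.out", Kind)) {
        errs() << "error: " << EC.message() << "\n";
        return;
      }
      if (Kind == file_magic::elf_executable)
        errs() << "ELF executable\n";
    }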
diff --git a/contrib/llvm/lib/BinaryFormat/Wasm.cpp b/contrib/llvm/lib/BinaryFormat/Wasm.cpp
new file mode 100644
index 000000000000..35360d0ae4f0
--- /dev/null
+++ b/contrib/llvm/lib/BinaryFormat/Wasm.cpp
@@ -0,0 +1,34 @@
+//===-- llvm/BinaryFormat/Wasm.cpp -------------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/Wasm.h"
+
+std::string llvm::wasm::toString(wasm::WasmSymbolType type) {
+ switch (type) {
+ case wasm::WASM_SYMBOL_TYPE_FUNCTION:
+ return "WASM_SYMBOL_TYPE_FUNCTION";
+ case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+ return "WASM_SYMBOL_TYPE_GLOBAL";
+ case wasm::WASM_SYMBOL_TYPE_DATA:
+ return "WASM_SYMBOL_TYPE_DATA";
+ case wasm::WASM_SYMBOL_TYPE_SECTION:
+ return "WASM_SYMBOL_TYPE_SECTION";
+ }
+ llvm_unreachable("unknown symbol type");
+}
+
+std::string llvm::wasm::relocTypetoString(uint32_t type) {
+ switch (type) {
+#define WASM_RELOC(NAME, VALUE) case VALUE: return #NAME;
+#include "llvm/BinaryFormat/WasmRelocs.def"
+#undef WASM_RELOC
+ default:
+ llvm_unreachable("unknown reloc type");
+ }
+}
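Aside (not part of the patch): the new helpers are plain enum-to-string utilities; a sketch of calling one of them, with the output stream chosen arbitrarily:

    #include "llvm/BinaryFormat/Wasm.h"
    #include "llvm/Support/raw_ostream.h"

    void dumpSymbolKind() {
      // Prints "WASM_SYMBOL_TYPE_FUNCTION".
      llvm::errs() << llvm::wasm::toString(llvm::wasm::WASM_SYMBOL_TYPE_FUNCTION)
                   << "\n";
    }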
diff --git a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 945ac4515368..c45b441238bc 100644
--- a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Bitcode/BitstreamReader.h"
#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/AutoUpgrade.h"
@@ -532,7 +533,7 @@ public:
Error materializeModule() override;
std::vector<StructType *> getIdentifiedStructTypes() const override;
- /// \brief Main interface to parsing a bitcode buffer.
+ /// Main interface to parsing a bitcode buffer.
/// \returns true if an error occurred.
Error parseBitcodeInto(Module *M, bool ShouldLazyLoadMetadata = false,
bool IsImporting = false);
@@ -743,14 +744,16 @@ private:
std::vector<ValueInfo> makeRefList(ArrayRef<uint64_t> Record);
std::vector<FunctionSummary::EdgeTy> makeCallList(ArrayRef<uint64_t> Record,
bool IsOldProfileFormat,
- bool HasProfile);
+ bool HasProfile,
+ bool HasRelBF);
Error parseEntireSummary(unsigned ID);
Error parseModuleStringTable();
std::pair<ValueInfo, GlobalValue::GUID>
getValueInfoFromValueId(unsigned ValueId);
- ModuleSummaryIndex::ModuleInfo *addThisModule();
+ void addThisModule();
+ ModuleSummaryIndex::ModuleInfo *getThisModule();
};
} // end anonymous namespace
@@ -1159,6 +1162,9 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
case Attribute::Speculatable: return 1ULL << 54;
case Attribute::StrictFP: return 1ULL << 55;
case Attribute::SanitizeHWAddress: return 1ULL << 56;
+ case Attribute::NoCfCheck: return 1ULL << 57;
+ case Attribute::OptForFuzzing: return 1ULL << 58;
+ case Attribute::ShadowCallStack: return 1ULL << 59;
case Attribute::Dereferenceable:
llvm_unreachable("dereferenceable attribute not supported in raw format");
break;
@@ -1197,7 +1203,7 @@ static void addRawAttributeValue(AttrBuilder &B, uint64_t Val) {
}
}
-/// \brief This fills an AttrBuilder object with the LLVM attributes that have
+/// This fills an AttrBuilder object with the LLVM attributes that have
/// been decoded from the given integer. This function must stay in sync with
/// 'encodeLLVMAttributesForBitcode'.
static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
@@ -1337,8 +1343,12 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::NoRedZone;
case bitc::ATTR_KIND_NO_RETURN:
return Attribute::NoReturn;
+ case bitc::ATTR_KIND_NOCF_CHECK:
+ return Attribute::NoCfCheck;
case bitc::ATTR_KIND_NO_UNWIND:
return Attribute::NoUnwind;
+ case bitc::ATTR_KIND_OPT_FOR_FUZZING:
+ return Attribute::OptForFuzzing;
case bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE:
return Attribute::OptimizeForSize;
case bitc::ATTR_KIND_OPTIMIZE_NONE:
@@ -1365,6 +1375,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::StackProtectStrong;
case bitc::ATTR_KIND_SAFESTACK:
return Attribute::SafeStack;
+ case bitc::ATTR_KIND_SHADOWCALLSTACK:
+ return Attribute::ShadowCallStack;
case bitc::ATTR_KIND_STRICT_FP:
return Attribute::StrictFP;
case bitc::ATTR_KIND_STRUCT_RET:
@@ -2512,6 +2524,7 @@ Error BitcodeReader::parseConstants() {
for (unsigned i = 0; i != ConstStrSize; ++i)
ConstrStr += (char)Record[3+AsmStrSize+i];
PointerType *PTy = cast<PointerType>(CurTy);
+ UpgradeInlineAsmString(&AsmStr);
V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
break;
@@ -2537,6 +2550,7 @@ Error BitcodeReader::parseConstants() {
for (unsigned i = 0; i != ConstStrSize; ++i)
ConstrStr += (char)Record[3+AsmStrSize+i];
PointerType *PTy = cast<PointerType>(CurTy);
+ UpgradeInlineAsmString(&AsmStr);
V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
InlineAsm::AsmDialect(AsmDialect));
@@ -2817,6 +2831,13 @@ Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) {
return Error::success();
}
+static void inferDSOLocal(GlobalValue *GV) {
+ // infer dso_local from linkage and visibility if it is not encoded.
+ if (GV->hasLocalLinkage() ||
+ (!GV->hasDefaultVisibility() && !GV->hasExternalWeakLinkage()))
+ GV->setDSOLocal(true);
+}
+
Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
// v1: [pointer type, isconst, initid, linkage, alignment, section,
// visibility, threadlocal, unnamed_addr, externally_initialized,
@@ -2909,6 +2930,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
if (Record.size() > 13) {
NewGV->setDSOLocal(getDecodedDSOLocal(Record[13]));
}
+ inferDSOLocal(NewGV);
return Error::success();
}
@@ -2993,6 +3015,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
if (Record.size() > 15) {
Func->setDSOLocal(getDecodedDSOLocal(Record[15]));
}
+ inferDSOLocal(Func);
ValueList.push_back(Func);
@@ -3056,16 +3079,21 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
// FIXME: Change to an error if non-default in 4.0.
NewGA->setVisibility(getDecodedVisibility(Record[VisInd]));
}
- if (OpNum != Record.size())
- NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++]));
- else
- upgradeDLLImportExportLinkage(NewGA, Linkage);
- if (OpNum != Record.size())
- NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++]));
- if (OpNum != Record.size())
- NewGA->setUnnamedAddr(getDecodedUnnamedAddrType(Record[OpNum++]));
+ if (BitCode == bitc::MODULE_CODE_ALIAS ||
+ BitCode == bitc::MODULE_CODE_ALIAS_OLD) {
+ if (OpNum != Record.size())
+ NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++]));
+ else
+ upgradeDLLImportExportLinkage(NewGA, Linkage);
+ if (OpNum != Record.size())
+ NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++]));
+ if (OpNum != Record.size())
+ NewGA->setUnnamedAddr(getDecodedUnnamedAddrType(Record[OpNum++]));
+ }
if (OpNum != Record.size())
NewGA->setDSOLocal(getDecodedDSOLocal(Record[OpNum++]));
+ inferDSOLocal(NewGA);
+
ValueList.push_back(NewGA);
IndirectSymbolInits.push_back(std::make_pair(NewGA, Val));
return Error::success();
@@ -4775,6 +4803,9 @@ Error BitcodeReader::materializeModule() {
UpgradeDebugInfo(*TheModule);
UpgradeModuleFlags(*TheModule);
+
+ UpgradeRetainReleaseMarker(*TheModule);
+
return Error::success();
}
@@ -4788,9 +4819,13 @@ ModuleSummaryIndexBitcodeReader::ModuleSummaryIndexBitcodeReader(
: BitcodeReaderBase(std::move(Cursor), Strtab), TheIndex(TheIndex),
ModulePath(ModulePath), ModuleId(ModuleId) {}
+void ModuleSummaryIndexBitcodeReader::addThisModule() {
+ TheIndex.addModule(ModulePath, ModuleId);
+}
+
ModuleSummaryIndex::ModuleInfo *
-ModuleSummaryIndexBitcodeReader::addThisModule() {
- return TheIndex.addModule(ModulePath, ModuleId);
+ModuleSummaryIndexBitcodeReader::getThisModule() {
+ return TheIndex.getModule(ModulePath);
}
std::pair<ValueInfo, GlobalValue::GUID>
@@ -4812,8 +4847,15 @@ void ModuleSummaryIndexBitcodeReader::setValueGUID(
if (PrintSummaryGUIDs)
dbgs() << "GUID " << ValueGUID << "(" << OriginalNameID << ") is "
<< ValueName << "\n";
- ValueIdToValueInfoMap[ValueID] =
- std::make_pair(TheIndex.getOrInsertValueInfo(ValueGUID), OriginalNameID);
+
+ // UseStrtab is false for legacy summary formats and value names are
+  // created on the stack. In that case we save the name in a string saver in
+ // the index so that the value name can be recorded.
+ ValueIdToValueInfoMap[ValueID] = std::make_pair(
+ TheIndex.getOrInsertValueInfo(
+ ValueGUID,
+ UseStrtab ? ValueName : TheIndex.saveString(ValueName.str())),
+ OriginalNameID);
}
// Specialized value symbol table parser used when reading module index
@@ -4942,6 +4984,9 @@ Error ModuleSummaryIndexBitcodeReader::parseModule() {
break;
case bitc::GLOBALVAL_SUMMARY_BLOCK_ID:
case bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID:
+ // Add the module if it is a per-module index (has a source file name).
+ if (!SourceFileName.empty())
+ addThisModule();
assert(!SeenValueSymbolTable &&
"Already read VST when parsing summary block?");
// We might not have a VST if there were no values in the
@@ -4987,7 +5032,7 @@ Error ModuleSummaryIndexBitcodeReader::parseModule() {
case bitc::MODULE_CODE_HASH: {
if (Record.size() != 5)
return error("Invalid hash length " + Twine(Record.size()).str());
- auto &Hash = addThisModule()->second.second;
+ auto &Hash = getThisModule()->second.second;
int Pos = 0;
for (auto &Val : Record) {
assert(!(Val >> 32) && "Unexpected high bits set");
@@ -5042,12 +5087,15 @@ ModuleSummaryIndexBitcodeReader::makeRefList(ArrayRef<uint64_t> Record) {
return Ret;
}
-std::vector<FunctionSummary::EdgeTy> ModuleSummaryIndexBitcodeReader::makeCallList(
- ArrayRef<uint64_t> Record, bool IsOldProfileFormat, bool HasProfile) {
+std::vector<FunctionSummary::EdgeTy>
+ModuleSummaryIndexBitcodeReader::makeCallList(ArrayRef<uint64_t> Record,
+ bool IsOldProfileFormat,
+ bool HasProfile, bool HasRelBF) {
std::vector<FunctionSummary::EdgeTy> Ret;
Ret.reserve(Record.size());
for (unsigned I = 0, E = Record.size(); I != E; ++I) {
CalleeInfo::HotnessType Hotness = CalleeInfo::HotnessType::Unknown;
+ uint64_t RelBF = 0;
ValueInfo Callee = getValueInfoFromValueId(Record[I]).first;
if (IsOldProfileFormat) {
I += 1; // Skip old callsitecount field
@@ -5055,11 +5103,63 @@ std::vector<FunctionSummary::EdgeTy> ModuleSummaryIndexBitcodeReader::makeCallLi
I += 1; // Skip old profilecount field
} else if (HasProfile)
Hotness = static_cast<CalleeInfo::HotnessType>(Record[++I]);
- Ret.push_back(FunctionSummary::EdgeTy{Callee, CalleeInfo{Hotness}});
+ else if (HasRelBF)
+ RelBF = Record[++I];
+ Ret.push_back(FunctionSummary::EdgeTy{Callee, CalleeInfo(Hotness, RelBF)});
}
return Ret;
}
+static void
+parseWholeProgramDevirtResolutionByArg(ArrayRef<uint64_t> Record, size_t &Slot,
+ WholeProgramDevirtResolution &Wpd) {
+ uint64_t ArgNum = Record[Slot++];
+ WholeProgramDevirtResolution::ByArg &B =
+ Wpd.ResByArg[{Record.begin() + Slot, Record.begin() + Slot + ArgNum}];
+ Slot += ArgNum;
+
+ B.TheKind =
+ static_cast<WholeProgramDevirtResolution::ByArg::Kind>(Record[Slot++]);
+ B.Info = Record[Slot++];
+ B.Byte = Record[Slot++];
+ B.Bit = Record[Slot++];
+}
+
+static void parseWholeProgramDevirtResolution(ArrayRef<uint64_t> Record,
+ StringRef Strtab, size_t &Slot,
+ TypeIdSummary &TypeId) {
+ uint64_t Id = Record[Slot++];
+ WholeProgramDevirtResolution &Wpd = TypeId.WPDRes[Id];
+
+ Wpd.TheKind = static_cast<WholeProgramDevirtResolution::Kind>(Record[Slot++]);
+ Wpd.SingleImplName = {Strtab.data() + Record[Slot],
+ static_cast<size_t>(Record[Slot + 1])};
+ Slot += 2;
+
+ uint64_t ResByArgNum = Record[Slot++];
+ for (uint64_t I = 0; I != ResByArgNum; ++I)
+ parseWholeProgramDevirtResolutionByArg(Record, Slot, Wpd);
+}
+
+static void parseTypeIdSummaryRecord(ArrayRef<uint64_t> Record,
+ StringRef Strtab,
+ ModuleSummaryIndex &TheIndex) {
+ size_t Slot = 0;
+ TypeIdSummary &TypeId = TheIndex.getOrInsertTypeIdSummary(
+ {Strtab.data() + Record[Slot], static_cast<size_t>(Record[Slot + 1])});
+ Slot += 2;
+
+ TypeId.TTRes.TheKind = static_cast<TypeTestResolution::Kind>(Record[Slot++]);
+ TypeId.TTRes.SizeM1BitWidth = Record[Slot++];
+ TypeId.TTRes.AlignLog2 = Record[Slot++];
+ TypeId.TTRes.SizeM1 = Record[Slot++];
+ TypeId.TTRes.BitMask = Record[Slot++];
+ TypeId.TTRes.InlineBits = Record[Slot++];
+
+ while (Slot < Record.size())
+ parseWholeProgramDevirtResolution(Record, Strtab, Slot, TypeId);
+}
+
// Eagerly parse the entire summary block. This populates the GlobalValueSummary
// objects in the index.
Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
@@ -5122,6 +5222,19 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
switch (BitCode) {
default: // Default behavior: ignore.
break;
+ case bitc::FS_FLAGS: { // [flags]
+ uint64_t Flags = Record[0];
+ // Scan flags (set only on the combined index).
+ assert(Flags <= 0x3 && "Unexpected bits in flag");
+
+ // 1 bit: WithGlobalValueDeadStripping flag.
+ if (Flags & 0x1)
+ TheIndex.setWithGlobalValueDeadStripping();
+ // 1 bit: SkipModuleByDistributedBackend flag.
+ if (Flags & 0x2)
+ TheIndex.setSkipModuleByDistributedBackend();
+ break;
+ }
case bitc::FS_VALUE_GUID: { // [valueid, refguid]
uint64_t ValueID = Record[0];
GlobalValue::GUID RefGUID = Record[1];
@@ -5134,7 +5247,11 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
// FS_PERMODULE_PROFILE: [valueid, flags, instcount, fflags, numrefs,
// numrefs x valueid,
// n x (valueid, hotness)]
+ // FS_PERMODULE_RELBF: [valueid, flags, instcount, fflags, numrefs,
+ // numrefs x valueid,
+ // n x (valueid, relblockfreq)]
case bitc::FS_PERMODULE:
+ case bitc::FS_PERMODULE_RELBF:
case bitc::FS_PERMODULE_PROFILE: {
unsigned ValueID = Record[0];
uint64_t RawFlags = Record[1];
@@ -5160,9 +5277,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
std::vector<ValueInfo> Refs = makeRefList(
ArrayRef<uint64_t>(Record).slice(RefListStartIndex, NumRefs));
bool HasProfile = (BitCode == bitc::FS_PERMODULE_PROFILE);
+ bool HasRelBF = (BitCode == bitc::FS_PERMODULE_RELBF);
std::vector<FunctionSummary::EdgeTy> Calls = makeCallList(
ArrayRef<uint64_t>(Record).slice(CallGraphEdgeStartIndex),
- IsOldProfileFormat, HasProfile);
+ IsOldProfileFormat, HasProfile, HasRelBF);
auto FS = llvm::make_unique<FunctionSummary>(
Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs),
std::move(Calls), std::move(PendingTypeTests),
@@ -5176,7 +5294,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
PendingTypeTestAssumeConstVCalls.clear();
PendingTypeCheckedLoadConstVCalls.clear();
auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID);
- FS->setModulePath(addThisModule()->first());
+ FS->setModulePath(getThisModule()->first());
FS->setOriginalName(VIAndOriginalGUID.second);
TheIndex.addGlobalValueSummary(VIAndOriginalGUID.first, std::move(FS));
break;
@@ -5195,7 +5313,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
// string table section in the per-module index, we create a single
// module path string table entry with an empty (0) ID to take
// ownership.
- AS->setModulePath(addThisModule()->first());
+ AS->setModulePath(getThisModule()->first());
GlobalValue::GUID AliaseeGUID =
getValueInfoFromValueId(AliaseeID).first.getGUID();
@@ -5219,7 +5337,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
std::vector<ValueInfo> Refs =
makeRefList(ArrayRef<uint64_t>(Record).slice(2));
auto FS = llvm::make_unique<GlobalVarSummary>(Flags, std::move(Refs));
- FS->setModulePath(addThisModule()->first());
+ FS->setModulePath(getThisModule()->first());
auto GUID = getValueInfoFromValueId(ValueID);
FS->setOriginalName(GUID.second);
TheIndex.addGlobalValueSummary(GUID.first, std::move(FS));
@@ -5254,7 +5372,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
bool HasProfile = (BitCode == bitc::FS_COMBINED_PROFILE);
std::vector<FunctionSummary::EdgeTy> Edges = makeCallList(
ArrayRef<uint64_t>(Record).slice(CallGraphEdgeStartIndex),
- IsOldProfileFormat, HasProfile);
+ IsOldProfileFormat, HasProfile, false);
ValueInfo VI = getValueInfoFromValueId(ValueID).first;
auto FS = llvm::make_unique<FunctionSummary>(
Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs),
@@ -5362,6 +5480,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
{Strtab.data() + Record[I], static_cast<size_t>(Record[I + 1])});
break;
}
+
case bitc::FS_CFI_FUNCTION_DECLS: {
std::set<std::string> &CfiFunctionDecls = TheIndex.cfiFunctionDecls();
for (unsigned I = 0; I != Record.size(); I += 2)
@@ -5369,6 +5488,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
{Strtab.data() + Record[I], static_cast<size_t>(Record[I + 1])});
break;
}
+
+ case bitc::FS_TYPE_ID:
+ parseTypeIdSummaryRecord(Record, Strtab, TheIndex);
+ break;
}
}
llvm_unreachable("Exit infinite loop");
@@ -5604,7 +5727,7 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) {
}
}
-/// \brief Get a lazy one-at-time loading module from bitcode.
+/// Get a lazy one-at-time loading module from bitcode.
///
/// This isn't always used in a lazy context. In particular, it's also used by
/// \a parseModule(). If this is truly lazy, then we need to eagerly pull
@@ -5678,7 +5801,7 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() {
BitstreamCursor Stream(Buffer);
Stream.JumpToBit(ModuleBit);
- auto Index = llvm::make_unique<ModuleSummaryIndex>();
+ auto Index = llvm::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
ModuleSummaryIndexBitcodeReader R(std::move(Stream), Strtab, *Index,
ModuleIdentifier, 0);
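Aside (not part of the patch): the summary-reading changes above surface through llvm::getModuleSummaryIndex; a hedged sketch, with "m.bc" as a placeholder path:

    #include "llvm/Bitcode/BitcodeReader.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Support/MemoryBuffer.h"
    using namespace llvm;

    void readSummary() {
      auto BufOr = MemoryBuffer::getFile("m.bc");
      if (!BufOr)
        return;
      Expected<std::unique_ptr<ModuleSummaryIndex>> IndexOr =
          getModuleSummaryIndex((*BufOr)->getMemBufferRef());
      if (!IndexOr) {
        consumeError(IndexOr.takeError());
        return;
      }
      // The FS_FLAGS bits decoded above come back through accessors like this.
      bool Dead = (*IndexOr)->withGlobalValueDeadStripping();
      (void)Dead;
    }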
diff --git a/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 1b0d80d26cf5..011c41e2cecd 100644
--- a/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -165,7 +165,7 @@ public:
/// necessary.
Metadata *getMetadataFwdRef(unsigned Idx);
- /// Return the the given metadata only if it is fully resolved.
+ /// Return the given metadata only if it is fully resolved.
///
/// Gives the same result as \a lookup(), unless \a MDNode::isResolved()
/// would give \c false.
@@ -822,6 +822,7 @@ MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
case bitc::METADATA_TEMPLATE_VALUE:
case bitc::METADATA_GLOBAL_VAR:
case bitc::METADATA_LOCAL_VAR:
+ case bitc::METADATA_LABEL:
case bitc::METADATA_EXPRESSION:
case bitc::METADATA_OBJC_PROPERTY:
case bitc::METADATA_IMPORTED_ENTITY:
@@ -1174,14 +1175,25 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_SUBRANGE: {
- if (Record.size() != 3)
- return error("Invalid record");
+ Metadata *Val = nullptr;
+ // Operand 'count' is interpreted as:
+ // - Signed integer (version 0)
+ // - Metadata node (version 1)
+ switch (Record[0] >> 1) {
+ case 0:
+ Val = GET_OR_DISTINCT(DISubrange,
+ (Context, Record[1], unrotateSign(Record.back())));
+ break;
+ case 1:
+ Val = GET_OR_DISTINCT(DISubrange, (Context, getMDOrNull(Record[1]),
+ unrotateSign(Record.back())));
+ break;
+ default:
+ return error("Invalid record: Unsupported version of DISubrange");
+ }
- IsDistinct = Record[0];
- MetadataList.assignValue(
- GET_OR_DISTINCT(DISubrange,
- (Context, Record[1], unrotateSign(Record[2]))),
- NextMetadataNo);
+ MetadataList.assignValue(Val, NextMetadataNo);
+ IsDistinct = Record[0] & 1;
NextMetadataNo++;
break;
}
@@ -1189,10 +1201,11 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
if (Record.size() != 3)
return error("Invalid record");
- IsDistinct = Record[0];
+ IsDistinct = Record[0] & 1;
+ bool IsUnsigned = Record[0] & 2;
MetadataList.assignValue(
GET_OR_DISTINCT(DIEnumerator, (Context, unrotateSign(Record[1]),
- getMDString(Record[2]))),
+ IsUnsigned, getMDString(Record[2]))),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1235,7 +1248,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_COMPOSITE_TYPE: {
- if (Record.size() != 16)
+ if (Record.size() < 16 || Record.size() > 17)
return error("Invalid record");
// If we have a UUID and this is not a forward declaration, lookup the
@@ -1258,6 +1271,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
unsigned RuntimeLang = Record[12];
Metadata *VTableHolder = nullptr;
Metadata *TemplateParams = nullptr;
+ Metadata *Discriminator = nullptr;
auto *Identifier = getMDString(Record[15]);
// If this module is being parsed so that it can be ThinLTO imported
// into another module, composite types only need to be imported
@@ -1278,13 +1292,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Elements = getMDOrNull(Record[11]);
VTableHolder = getDITypeRefOrNull(Record[13]);
TemplateParams = getMDOrNull(Record[14]);
+ if (Record.size() > 16)
+ Discriminator = getMDOrNull(Record[16]);
}
DICompositeType *CT = nullptr;
if (Identifier)
CT = DICompositeType::buildODRType(
Context, *Identifier, Tag, Name, File, Line, Scope, BaseType,
SizeInBits, AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams);
+ VTableHolder, TemplateParams, Discriminator);
// Create a node if we didn't get a lazy ODR type.
if (!CT)
@@ -1335,17 +1351,25 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
}
case bitc::METADATA_FILE: {
- if (Record.size() != 3 && Record.size() != 5)
+ if (Record.size() != 3 && Record.size() != 5 && Record.size() != 6)
return error("Invalid record");
IsDistinct = Record[0];
+ Optional<DIFile::ChecksumInfo<MDString *>> Checksum;
+ // The BitcodeWriter writes null bytes into Record[3:4] when the Checksum
+ // is not present. This matches up with the old internal representation,
+ // and the old encoding for CSK_None in the ChecksumKind. The new
+ // representation reserves the value 0 in the ChecksumKind to continue to
+ // encode None in a backwards-compatible way.
+ if (Record.size() > 4 && Record[3] && Record[4])
+ Checksum.emplace(static_cast<DIFile::ChecksumKind>(Record[3]),
+ getMDString(Record[4]));
MetadataList.assignValue(
GET_OR_DISTINCT(
DIFile,
- (Context, getMDString(Record[1]), getMDString(Record[2]),
- Record.size() == 3 ? DIFile::CSK_None
- : static_cast<DIFile::ChecksumKind>(Record[3]),
- Record.size() == 3 ? nullptr : getMDString(Record[4]))),
+ (Context, getMDString(Record[1]), getMDString(Record[2]), Checksum,
+ Record.size() > 5 ? Optional<MDString *>(getMDString(Record[5]))
+ : None)),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1415,7 +1439,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
HasUnit ? CUorFn : nullptr, // unit
getMDOrNull(Record[15 + Offset]), // templateParams
getMDOrNull(Record[16 + Offset]), // declaration
- getMDOrNull(Record[17 + Offset]), // variables
+ getMDOrNull(Record[17 + Offset]), // retainedNodes
HasThrownTypes ? getMDOrNull(Record[20]) : nullptr // thrownTypes
));
MetadataList.assignValue(SP, NextMetadataNo);
@@ -1624,6 +1648,20 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
NextMetadataNo++;
break;
}
+ case bitc::METADATA_LABEL: {
+ if (Record.size() != 5)
+ return error("Invalid record");
+
+ IsDistinct = Record[0] & 1;
+ MetadataList.assignValue(
+ GET_OR_DISTINCT(DILabel,
+ (Context, getMDOrNull(Record[1]),
+ getMDString(Record[2]),
+ getMDOrNull(Record[3]), Record[4])),
+ NextMetadataNo);
+ NextMetadataNo++;
+ break;
+ }
case bitc::METADATA_EXPRESSION: {
if (Record.size() < 1)
return error("Invalid record");
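Aside (not part of the patch): several of the metadata records touched above now pack extra state into the low bits of Record[0]; a minimal sketch of that decoding, using a hypothetical value:

    #include <cstdint>

    // For METADATA_SUBRANGE: bit 0 = distinct, bits 1+ = record version.
    uint64_t Field0 = 0x3;            // hypothetical Record[0]
    bool IsDistinct = Field0 & 1;     // -> true
    uint64_t Version = Field0 >> 1;   // -> 1: the count is a metadata node
    // For METADATA_ENUMERATOR, bit 1 instead carries the IsUnsigned flag.
    bool IsUnsigned = Field0 & 2;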
diff --git a/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp b/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp
index 08bfa291098c..1ab22b5cc3d1 100644
--- a/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp
+++ b/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp
@@ -32,7 +32,7 @@ namespace llvm {
namespace {
-/// \brief A class for maintaining the slot number definition
+/// A class for maintaining the slot number definition
/// as a placeholder for the actual definition for forward constants defs.
class ConstantPlaceHolder : public ConstantExpr {
public:
@@ -46,7 +46,7 @@ public:
// allocate space for exactly one operand
void *operator new(size_t s) { return User::operator new(s, 1); }
- /// \brief Methods to support type inquiry through isa, cast, and dyn_cast.
+ /// Methods to support type inquiry through isa, cast, and dyn_cast.
static bool classof(const Value *V) {
return isa<ConstantExpr>(V) &&
cast<ConstantExpr>(V)->getOpcode() == Instruction::UserOp1;
@@ -144,7 +144,7 @@ Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) {
void BitcodeReaderValueList::resolveConstantForwardRefs() {
// Sort the values by-pointer so that they are efficient to look up with a
// binary search.
- std::sort(ResolveConstants.begin(), ResolveConstants.end());
+ llvm::sort(ResolveConstants.begin(), ResolveConstants.end());
SmallVector<Constant *, 64> NewOps;
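Aside (not part of the patch): the std::sort to llvm::sort change above is mechanical; llvm::sort forwards to std::sort and, in EXPENSIVE_CHECKS builds, shuffles the range first to expose comparisons that depend on pointer order. A tiny sketch:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"

    void sortValues() {
      llvm::SmallVector<int, 8> V{3, 1, 2};
      llvm::sort(V.begin(), V.end());  // same result as std::sort in NDEBUG
    }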
diff --git a/contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp b/contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp
index e0388418a3d9..763cd12aa2d7 100644
--- a/contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp
+++ b/contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp
@@ -25,7 +25,7 @@ int LLVMWriteBitcodeToFile(LLVMModuleRef M, const char *Path) {
if (EC)
return -1;
- WriteBitcodeToFile(unwrap(M), OS);
+ WriteBitcodeToFile(*unwrap(M), OS);
return 0;
}
@@ -33,7 +33,7 @@ int LLVMWriteBitcodeToFD(LLVMModuleRef M, int FD, int ShouldClose,
int Unbuffered) {
raw_fd_ostream OS(FD, ShouldClose, Unbuffered);
- WriteBitcodeToFile(unwrap(M), OS);
+ WriteBitcodeToFile(*unwrap(M), OS);
return 0;
}
@@ -45,6 +45,6 @@ LLVMMemoryBufferRef LLVMWriteBitcodeToMemoryBuffer(LLVMModuleRef M) {
std::string Data;
raw_string_ostream OS(Data);
- WriteBitcodeToFile(unwrap(M), OS);
+ WriteBitcodeToFile(*unwrap(M), OS);
return wrap(MemoryBuffer::getMemBufferCopy(OS.str()).release());
}
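Aside (not part of the patch): the C API wrappers above now pass a Module reference to WriteBitcodeToFile, but the C-level calling convention is unchanged. A sketch, with the module and file names as placeholders:

    #include "llvm-c/BitWriter.h"
    #include "llvm-c/Core.h"

    void emit() {
      LLVMModuleRef M = LLVMModuleCreateWithName("demo");
      if (LLVMWriteBitcodeToFile(M, "demo.bc") != 0) {
        // handle the error
      }
      LLVMDisposeModule(M);
    }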
diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7bf37857eb97..be75df0820d9 100644
--- a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -28,6 +28,7 @@
#include "llvm/Bitcode/BitCodes.h"
#include "llvm/Bitcode/BitstreamWriter.h"
#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -86,6 +87,12 @@ static cl::opt<unsigned>
cl::desc("Number of metadatas above which we emit an index "
"to enable lazy-loading"));
+cl::opt<bool> WriteRelBFToSummary(
+ "write-relbf-to-summary", cl::Hidden, cl::init(false),
+ cl::desc("Write relative block frequency to function summary "));
+
+extern FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold;
+
namespace {
/// These are manifest constants used by the bitcode writer. They do not need to
@@ -167,12 +174,12 @@ protected:
public:
/// Constructs a ModuleBitcodeWriterBase object for the given Module,
/// writing to the provided \p Buffer.
- ModuleBitcodeWriterBase(const Module *M, StringTableBuilder &StrtabBuilder,
+ ModuleBitcodeWriterBase(const Module &M, StringTableBuilder &StrtabBuilder,
BitstreamWriter &Stream,
bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index)
- : BitcodeWriterBase(Stream, StrtabBuilder), M(*M),
- VE(*M, ShouldPreserveUseListOrder), Index(Index) {
+ : BitcodeWriterBase(Stream, StrtabBuilder), M(M),
+ VE(M, ShouldPreserveUseListOrder), Index(Index) {
// Assign ValueIds to any callee values in the index that came from
// indirect call profiles and were recorded as a GUID not a Value*
// (which would have been assigned an ID by the ValueEnumerator).
@@ -190,7 +197,7 @@ public:
// otherwise we would have a Value for it). If so, synthesize
// a value id.
for (auto &CallEdge : FS->calls())
- if (!CallEdge.first.getValue())
+ if (!CallEdge.first.haveGVs() || !CallEdge.first.getValue())
assignValueId(CallEdge.first.getGUID());
}
@@ -223,7 +230,7 @@ private:
// Helper to get the valueId for the type of value recorded in VI.
unsigned getValueId(ValueInfo VI) {
- if (!VI.getValue())
+ if (!VI.haveGVs() || !VI.getValue())
return getValueId(VI.getGUID());
return VE.getValueID(VI.getValue());
}
@@ -251,7 +258,7 @@ class ModuleBitcodeWriter : public ModuleBitcodeWriterBase {
public:
/// Constructs a ModuleBitcodeWriter object for the given Module,
/// writing to the provided \p Buffer.
- ModuleBitcodeWriter(const Module *M, SmallVectorImpl<char> &Buffer,
+ ModuleBitcodeWriter(const Module &M, SmallVectorImpl<char> &Buffer,
StringTableBuilder &StrtabBuilder,
BitstreamWriter &Stream, bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index, bool GenerateHash,
@@ -328,6 +335,8 @@ private:
unsigned Abbrev);
void writeDILocalVariable(const DILocalVariable *N,
SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDILabel(const DILabel *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
void writeDIExpression(const DIExpression *N,
SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
void writeDIGlobalVariableExpression(const DIGlobalVariableExpression *N,
@@ -635,8 +644,12 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_NO_RED_ZONE;
case Attribute::NoReturn:
return bitc::ATTR_KIND_NO_RETURN;
+ case Attribute::NoCfCheck:
+ return bitc::ATTR_KIND_NOCF_CHECK;
case Attribute::NoUnwind:
return bitc::ATTR_KIND_NO_UNWIND;
+ case Attribute::OptForFuzzing:
+ return bitc::ATTR_KIND_OPT_FOR_FUZZING;
case Attribute::OptimizeForSize:
return bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE;
case Attribute::OptimizeNone:
@@ -663,6 +676,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_STACK_PROTECT_STRONG;
case Attribute::SafeStack:
return bitc::ATTR_KIND_SAFESTACK;
+ case Attribute::ShadowCallStack:
+ return bitc::ATTR_KIND_SHADOWCALLSTACK;
case Attribute::StrictFP:
return bitc::ATTR_KIND_STRICT_FP;
case Attribute::StructRet:
@@ -1302,7 +1317,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// Emit the ifunc information.
for (const GlobalIFunc &I : M.ifuncs()) {
// IFUNC: [strtab offset, strtab size, ifunc type, address space, resolver
- // val#, linkage, visibility]
+ // val#, linkage, visibility, DSO_Local]
Vals.push_back(addToStrtab(I.getName()));
Vals.push_back(I.getName().size());
Vals.push_back(VE.getTypeID(I.getValueType()));
@@ -1310,6 +1325,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
Vals.push_back(VE.getValueID(I.getResolver()));
Vals.push_back(getEncodedLinkage(I));
Vals.push_back(getEncodedVisibility(I));
+ Vals.push_back(I.isDSOLocal());
Stream.EmitRecord(bitc::MODULE_CODE_IFUNC, Vals);
Vals.clear();
}
@@ -1441,8 +1457,9 @@ static uint64_t rotateSign(int64_t I) {
void ModuleBitcodeWriter::writeDISubrange(const DISubrange *N,
SmallVectorImpl<uint64_t> &Record,
unsigned Abbrev) {
- Record.push_back(N->isDistinct());
- Record.push_back(N->getCount());
+ const uint64_t Version = 1 << 1;
+ Record.push_back((uint64_t)N->isDistinct() | Version);
+ Record.push_back(VE.getMetadataOrNullID(N->getRawCountNode()));
Record.push_back(rotateSign(N->getLowerBound()));
Stream.EmitRecord(bitc::METADATA_SUBRANGE, Record, Abbrev);
@@ -1452,7 +1469,7 @@ void ModuleBitcodeWriter::writeDISubrange(const DISubrange *N,
void ModuleBitcodeWriter::writeDIEnumerator(const DIEnumerator *N,
SmallVectorImpl<uint64_t> &Record,
unsigned Abbrev) {
- Record.push_back(N->isDistinct());
+ Record.push_back((N->isUnsigned() << 1) | N->isDistinct());
Record.push_back(rotateSign(N->getValue()));
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -1521,6 +1538,7 @@ void ModuleBitcodeWriter::writeDICompositeType(
Record.push_back(VE.getMetadataOrNullID(N->getVTableHolder()));
Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get()));
Record.push_back(VE.getMetadataOrNullID(N->getRawIdentifier()));
+ Record.push_back(VE.getMetadataOrNullID(N->getDiscriminator()));
Stream.EmitRecord(bitc::METADATA_COMPOSITE_TYPE, Record, Abbrev);
Record.clear();
@@ -1545,8 +1563,18 @@ void ModuleBitcodeWriter::writeDIFile(const DIFile *N,
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getRawFilename()));
Record.push_back(VE.getMetadataOrNullID(N->getRawDirectory()));
- Record.push_back(N->getChecksumKind());
- Record.push_back(VE.getMetadataOrNullID(N->getRawChecksum()));
+ if (N->getRawChecksum()) {
+ Record.push_back(N->getRawChecksum()->Kind);
+ Record.push_back(VE.getMetadataOrNullID(N->getRawChecksum()->Value));
+ } else {
+ // Maintain backwards compatibility with the old internal representation of
+ // CSK_None in ChecksumKind by writing nulls here when Checksum is None.
+ Record.push_back(0);
+ Record.push_back(VE.getMetadataOrNullID(nullptr));
+ }
+ auto Source = N->getRawSource();
+ if (Source)
+ Record.push_back(VE.getMetadataOrNullID(*Source));
Stream.EmitRecord(bitc::METADATA_FILE, Record, Abbrev);
Record.clear();
@@ -1602,7 +1630,7 @@ void ModuleBitcodeWriter::writeDISubprogram(const DISubprogram *N,
Record.push_back(VE.getMetadataOrNullID(N->getRawUnit()));
Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get()));
Record.push_back(VE.getMetadataOrNullID(N->getDeclaration()));
- Record.push_back(VE.getMetadataOrNullID(N->getVariables().get()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRetainedNodes().get()));
Record.push_back(N->getThisAdjustment());
Record.push_back(VE.getMetadataOrNullID(N->getThrownTypes().get()));
@@ -1759,6 +1787,19 @@ void ModuleBitcodeWriter::writeDILocalVariable(
Record.clear();
}
+void ModuleBitcodeWriter::writeDILabel(
+ const DILabel *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
+ Record.push_back((uint64_t)N->isDistinct());
+ Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+ Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+ Record.push_back(N->getLine());
+
+ Stream.EmitRecord(bitc::METADATA_LABEL, Record, Abbrev);
+ Record.clear();
+}
+
void ModuleBitcodeWriter::writeDIExpression(const DIExpression *N,
SmallVectorImpl<uint64_t> &Record,
unsigned Abbrev) {
@@ -3312,10 +3353,14 @@ void IndexBitcodeWriter::writeModStrings() {
/// Write the function type metadata related records that need to appear before
/// a function summary entry (whether per-module or combined).
-static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
- FunctionSummary *FS) {
- if (!FS->type_tests().empty())
+static void writeFunctionTypeMetadataRecords(
+ BitstreamWriter &Stream, FunctionSummary *FS,
+ std::set<GlobalValue::GUID> &ReferencedTypeIds) {
+ if (!FS->type_tests().empty()) {
Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+ for (auto &TT : FS->type_tests())
+ ReferencedTypeIds.insert(TT);
+ }
SmallVector<uint64_t, 64> Record;
@@ -3327,6 +3372,7 @@ static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
for (auto &VF : VFs) {
Record.push_back(VF.GUID);
Record.push_back(VF.Offset);
+ ReferencedTypeIds.insert(VF.GUID);
}
Stream.EmitRecord(Ty, Record);
};
@@ -3341,6 +3387,7 @@ static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
for (auto &VC : VCs) {
Record.clear();
Record.push_back(VC.VFunc.GUID);
+ ReferencedTypeIds.insert(VC.VFunc.GUID);
Record.push_back(VC.VFunc.Offset);
Record.insert(Record.end(), VC.Args.begin(), VC.Args.end());
Stream.EmitRecord(Ty, Record);
@@ -3353,6 +3400,51 @@ static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
FS->type_checked_load_const_vcalls());
}
+static void writeWholeProgramDevirtResolutionByArg(
+ SmallVector<uint64_t, 64> &NameVals, const std::vector<uint64_t> &args,
+ const WholeProgramDevirtResolution::ByArg &ByArg) {
+ NameVals.push_back(args.size());
+ NameVals.insert(NameVals.end(), args.begin(), args.end());
+
+ NameVals.push_back(ByArg.TheKind);
+ NameVals.push_back(ByArg.Info);
+ NameVals.push_back(ByArg.Byte);
+ NameVals.push_back(ByArg.Bit);
+}
+
+static void writeWholeProgramDevirtResolution(
+ SmallVector<uint64_t, 64> &NameVals, StringTableBuilder &StrtabBuilder,
+ uint64_t Id, const WholeProgramDevirtResolution &Wpd) {
+ NameVals.push_back(Id);
+
+ NameVals.push_back(Wpd.TheKind);
+ NameVals.push_back(StrtabBuilder.add(Wpd.SingleImplName));
+ NameVals.push_back(Wpd.SingleImplName.size());
+
+ NameVals.push_back(Wpd.ResByArg.size());
+ for (auto &A : Wpd.ResByArg)
+ writeWholeProgramDevirtResolutionByArg(NameVals, A.first, A.second);
+}
+
+static void writeTypeIdSummaryRecord(SmallVector<uint64_t, 64> &NameVals,
+ StringTableBuilder &StrtabBuilder,
+ const std::string &Id,
+ const TypeIdSummary &Summary) {
+ NameVals.push_back(StrtabBuilder.add(Id));
+ NameVals.push_back(Id.size());
+
+ NameVals.push_back(Summary.TTRes.TheKind);
+ NameVals.push_back(Summary.TTRes.SizeM1BitWidth);
+ NameVals.push_back(Summary.TTRes.AlignLog2);
+ NameVals.push_back(Summary.TTRes.SizeM1);
+ NameVals.push_back(Summary.TTRes.BitMask);
+ NameVals.push_back(Summary.TTRes.InlineBits);
+
+ for (auto &W : Summary.WPDRes)
+ writeWholeProgramDevirtResolution(NameVals, StrtabBuilder, W.first,
+ W.second);
+}
+
// Helper to emit a single function summary record.
void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
@@ -3361,7 +3453,8 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
NameVals.push_back(ValueID);
FunctionSummary *FS = cast<FunctionSummary>(Summary);
- writeFunctionTypeMetadataRecords(Stream, FS);
+ std::set<GlobalValue::GUID> ReferencedTypeIds;
+ writeFunctionTypeMetadataRecords(Stream, FS, ReferencedTypeIds);
NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
NameVals.push_back(FS->instCount());
@@ -3371,16 +3464,21 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
for (auto &RI : FS->refs())
NameVals.push_back(VE.getValueID(RI.getValue()));
- bool HasProfileData = F.hasProfileData();
+ bool HasProfileData =
+ F.hasProfileData() || ForceSummaryEdgesCold != FunctionSummary::FSHT_None;
for (auto &ECI : FS->calls()) {
NameVals.push_back(getValueId(ECI.first));
if (HasProfileData)
NameVals.push_back(static_cast<uint8_t>(ECI.second.Hotness));
+ else if (WriteRelBFToSummary)
+ NameVals.push_back(ECI.second.RelBlockFreq);
}
unsigned FSAbbrev = (HasProfileData ? FSCallsProfileAbbrev : FSCallsAbbrev);
unsigned Code =
- (HasProfileData ? bitc::FS_PERMODULE_PROFILE : bitc::FS_PERMODULE);
+ (HasProfileData ? bitc::FS_PERMODULE_PROFILE
+ : (WriteRelBFToSummary ? bitc::FS_PERMODULE_RELBF
+ : bitc::FS_PERMODULE));
// Emit the finished record.
Stream.EmitRecord(Code, NameVals, FSAbbrev);
@@ -3392,7 +3490,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
void ModuleBitcodeWriterBase::writeModuleLevelReferences(
const GlobalVariable &V, SmallVector<uint64_t, 64> &NameVals,
unsigned FSModRefsAbbrev) {
- auto VI = Index->getValueInfo(GlobalValue::getGUID(V.getName()));
+ auto VI = Index->getValueInfo(V.getGUID());
if (!VI || VI.getSummaryList().empty()) {
// Only declarations should not have a summary (a declaration might however
// have a summary if the def was in module level asm).
@@ -3409,7 +3507,7 @@ void ModuleBitcodeWriterBase::writeModuleLevelReferences(
NameVals.push_back(VE.getValueID(RI.getValue()));
// Sort the refs for deterministic output; the vector returned by FS->refs() has
// been initialized from a DenseSet.
- std::sort(NameVals.begin() + SizeBeforeRefs, NameVals.end());
+ llvm::sort(NameVals.begin() + SizeBeforeRefs, NameVals.end());
Stream.EmitRecord(bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS, NameVals,
FSModRefsAbbrev);
@@ -3446,31 +3544,34 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
ArrayRef<uint64_t>{GVI.second, GVI.first});
}
- // Abbrev for FS_PERMODULE.
+ // Abbrev for FS_PERMODULE_PROFILE.
auto Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE));
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
- // numrefs x valueid, n x (valueid)
+ // numrefs x valueid, n x (valueid, hotness)
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
- unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+ unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv));
- // Abbrev for FS_PERMODULE_PROFILE.
+ // Abbrev for FS_PERMODULE or FS_PERMODULE_RELBF.
Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
+ if (WriteRelBFToSummary)
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_RELBF));
+ else
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
- // numrefs x valueid, n x (valueid, hotness)
+ // numrefs x valueid, n x (valueid [, rel_block_freq])
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
- unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+ unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv));
// Abbrev for FS_PERMODULE_GLOBALVAR_INIT_REFS.
Abbv = std::make_shared<BitCodeAbbrev>();
@@ -3498,7 +3599,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
if (!F.hasName())
report_fatal_error("Unexpected anonymous function when writing summary");
- ValueInfo VI = Index->getValueInfo(GlobalValue::getGUID(F.getName()));
+ ValueInfo VI = Index->getValueInfo(F.getGUID());
if (!VI || VI.getSummaryList().empty()) {
// Only declarations should not have a summary (a declaration might
// however have a summary if the def was in module level asm).
@@ -3539,6 +3640,14 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3);
Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION});
+ // Write the index flags.
+ uint64_t Flags = 0;
+ if (Index.withGlobalValueDeadStripping())
+ Flags |= 0x1;
+ if (Index.skipModuleByDistributedBackend())
+ Flags |= 0x2;
+ Stream.EmitRecord(bitc::FS_FLAGS, ArrayRef<uint64_t>{Flags});
+
for (const auto &GVI : valueIds()) {
Stream.EmitRecord(bitc::FS_VALUE_GUID,
ArrayRef<uint64_t>{GVI.second, GVI.first});
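The new FS_FLAGS record packs two booleans into a single word. A minimal decoding sketch, assuming only the two bits emitted above (the struct and helper names are illustrative):

#include <cstdint>

struct CombinedIndexFlags {
  bool WithGlobalValueDeadStripping;   // bit 0x1 above
  bool SkipModuleByDistributedBackend; // bit 0x2 above
};

static CombinedIndexFlags decodeIndexFlags(uint64_t Flags) {
  return {(Flags & 0x1) != 0, (Flags & 0x2) != 0};
}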
@@ -3600,6 +3709,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
SmallVector<uint64_t, 64> NameVals;
+ // Set that will be populated during the call to writeFunctionTypeMetadataRecords
+ // with the type ids referenced by this index file.
+ std::set<GlobalValue::GUID> ReferencedTypeIds;
+
// For local linkage, we also emit the original name separately
// immediately after the record.
auto MaybeEmitOriginalName = [&](GlobalValueSummary &S) {
@@ -3651,7 +3764,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
}
auto *FS = cast<FunctionSummary>(S);
- writeFunctionTypeMetadataRecords(Stream, FS);
+ writeFunctionTypeMetadataRecords(Stream, FS, ReferencedTypeIds);
NameVals.push_back(*ValueId);
NameVals.push_back(Index.getModuleId(FS->modulePath()));
@@ -3673,7 +3786,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
bool HasProfileData = false;
for (auto &EI : FS->calls()) {
- HasProfileData |= EI.second.Hotness != CalleeInfo::HotnessType::Unknown;
+ HasProfileData |=
+ EI.second.getHotness() != CalleeInfo::HotnessType::Unknown;
if (HasProfileData)
break;
}
@@ -3757,6 +3871,17 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.clear();
}
+ if (!Index.typeIds().empty()) {
+ for (auto &S : Index.typeIds()) {
+ // Skip if not referenced in any GV summary within this index file.
+ if (!ReferencedTypeIds.count(GlobalValue::getGUID(S.first)))
+ continue;
+ writeTypeIdSummaryRecord(NameVals, StrtabBuilder, S.first, S.second);
+ Stream.EmitRecord(bitc::FS_TYPE_ID, NameVals);
+ NameVals.clear();
+ }
+ }
+
Stream.ExitBlock();
}
@@ -4012,7 +4137,7 @@ void BitcodeWriter::copyStrtab(StringRef Strtab) {
WroteStrtab = true;
}
-void BitcodeWriter::writeModule(const Module *M,
+void BitcodeWriter::writeModule(const Module &M,
bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index,
bool GenerateHash, ModuleHash *ModHash) {
@@ -4022,8 +4147,8 @@ void BitcodeWriter::writeModule(const Module *M,
// Modules in case it needs to materialize metadata. But the bitcode writer
// requires that the module is materialized, so we can cast to non-const here,
// after checking that it is in fact materialized.
- assert(M->isMaterialized());
- Mods.push_back(const_cast<Module *>(M));
+ assert(M.isMaterialized());
+ Mods.push_back(const_cast<Module *>(&M));
ModuleBitcodeWriter ModuleWriter(M, Buffer, StrtabBuilder, *Stream,
ShouldPreserveUseListOrder, Index,
@@ -4039,9 +4164,8 @@ void BitcodeWriter::writeIndex(
IndexWriter.write();
}
-/// WriteBitcodeToFile - Write the specified module to the specified output
-/// stream.
-void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
+/// Write the specified module to the specified output stream.
+void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out,
bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index,
bool GenerateHash, ModuleHash *ModHash) {
@@ -4050,7 +4174,7 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
// If this is darwin or another generic macho target, reserve space for the
// header.
- Triple TT(M->getTargetTriple());
+ Triple TT(M.getTargetTriple());
if (TT.isOSDarwin() || TT.isOSBinFormatMachO())
Buffer.insert(Buffer.begin(), BWH_HeaderSize, 0);
@@ -4107,7 +4231,7 @@ class ThinLinkBitcodeWriter : public ModuleBitcodeWriterBase {
const ModuleHash *ModHash;
public:
- ThinLinkBitcodeWriter(const Module *M, StringTableBuilder &StrtabBuilder,
+ ThinLinkBitcodeWriter(const Module &M, StringTableBuilder &StrtabBuilder,
BitstreamWriter &Stream,
const ModuleSummaryIndex &Index,
const ModuleHash &ModHash)
@@ -4225,7 +4349,7 @@ void ThinLinkBitcodeWriter::write() {
Stream.ExitBlock();
}
-void BitcodeWriter::writeThinLinkBitcode(const Module *M,
+void BitcodeWriter::writeThinLinkBitcode(const Module &M,
const ModuleSummaryIndex &Index,
const ModuleHash &ModHash) {
assert(!WroteStrtab);
@@ -4234,8 +4358,8 @@ void BitcodeWriter::writeThinLinkBitcode(const Module *M,
// Modules in case it needs to materialize metadata. But the bitcode writer
// requires that the module is materialized, so we can cast to non-const here,
// after checking that it is in fact materialized.
- assert(M->isMaterialized());
- Mods.push_back(const_cast<Module *>(M));
+ assert(M.isMaterialized());
+ Mods.push_back(const_cast<Module *>(&M));
ThinLinkBitcodeWriter ThinLinkWriter(M, StrtabBuilder, *Stream, Index,
ModHash);
@@ -4245,7 +4369,7 @@ void BitcodeWriter::writeThinLinkBitcode(const Module *M,
// Write the specified thin link bitcode file to the given raw output stream,
// where it will be written in a new bitcode block. This is used when
// writing the per-module index file for ThinLTO.
-void llvm::WriteThinLinkBitcodeToFile(const Module *M, raw_ostream &Out,
+void llvm::WriteThinLinkBitcodeToFile(const Module &M, raw_ostream &Out,
const ModuleSummaryIndex &Index,
const ModuleHash &ModHash) {
SmallVector<char, 0> Buffer;
diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 80cab762a68c..41212e575f8e 100644
--- a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -23,7 +23,7 @@ PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
const ModuleSummaryIndex *Index =
EmitSummaryIndex ? &(AM.getResult<ModuleSummaryIndexAnalysis>(M))
: nullptr;
- WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, Index, EmitModuleHash);
+ WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index, EmitModuleHash);
return PreservedAnalyses::all();
}
@@ -55,7 +55,7 @@ namespace {
EmitSummaryIndex
? &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex())
: nullptr;
- WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, Index,
+ WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index,
EmitModuleHash);
return false;
}
@@ -80,3 +80,7 @@ ModulePass *llvm::createBitcodeWriterPass(raw_ostream &Str,
return new WriteBitcodePass(Str, ShouldPreserveUseListOrder,
EmitSummaryIndex, EmitModuleHash);
}
+
+bool llvm::isBitcodeWriterPass(Pass *P) {
+ return P->getPassID() == (llvm::AnalysisID)&WriteBitcodePass::ID;
+}
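A minimal usage sketch for the new predicate; the call site is hypothetical, only isBitcodeWriterPass itself comes from the change above:

// Hypothetical call site: skip IR dumping after a pass that only emits bitcode.
static void maybePrintAfterPass(Pass *P, Module &M) {
  if (llvm::isBitcodeWriterPass(P))
    return; // nothing textual to print for a bitcode writer
  M.print(llvm::errs(), nullptr);
}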
diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index d99befcdaeae..d473741e8ceb 100644
--- a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -14,6 +14,7 @@
#include "ValueEnumerator.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -183,7 +184,7 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
return;
bool IsGlobalValue = OM.isGlobalValue(ID);
- std::sort(List.begin(), List.end(), [&](const Entry &L, const Entry &R) {
+ llvm::sort(List.begin(), List.end(), [&](const Entry &L, const Entry &R) {
const Use *LU = L.first;
const Use *RU = R.first;
if (LU == RU)
@@ -488,7 +489,7 @@ void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
V->print(errs());
errs() << '\n';
- OS << " Uses(" << std::distance(V->use_begin(),V->use_end()) << "):";
+ OS << " Uses(" << V->getNumUses() << "):";
for (const Use &U : V->uses()) {
if (&U != &*V->use_begin())
OS << ",";
@@ -744,7 +745,7 @@ void ValueEnumerator::organizeMetadata() {
// and then sort by the original/current ID. Since the IDs are guaranteed to
// be unique, the result of std::sort will be deterministic. There's no need
// for std::stable_sort.
- std::sort(Order.begin(), Order.end(), [this](MDIndex LHS, MDIndex RHS) {
+ llvm::sort(Order.begin(), Order.end(), [this](MDIndex LHS, MDIndex RHS) {
return std::make_tuple(LHS.F, getMetadataTypeOrder(LHS.get(MDs)), LHS.ID) <
std::make_tuple(RHS.F, getMetadataTypeOrder(RHS.get(MDs)), RHS.ID);
});
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index ffcb9a09ad73..632ea8e9cdc4 100644
--- a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -35,6 +34,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <map>
@@ -139,10 +139,11 @@ AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
CriticalPathSet |= CPSet;
}
- DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
- DEBUG(for (unsigned r : CriticalPathSet.set_bits())
- dbgs() << " " << printReg(r, TRI));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
+ LLVM_DEBUG(for (unsigned r
+ : CriticalPathSet.set_bits()) dbgs()
+ << " " << printReg(r, TRI));
+ LLVM_DEBUG(dbgs() << '\n');
}
AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() {
@@ -202,9 +203,9 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
PrescanInstruction(MI, Count, PassthruRegs);
ScanInstruction(MI, Count);
- DEBUG(dbgs() << "Observe: ");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "\tRegs:");
+ LLVM_DEBUG(dbgs() << "Observe: ");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "\tRegs:");
std::vector<unsigned> &DefIndices = State->GetDefIndices();
for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
@@ -215,16 +216,16 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
// conservative location (i.e. the beginning of the previous
// schedule region).
if (State->IsLive(Reg)) {
- DEBUG(if (State->GetGroup(Reg) != 0)
- dbgs() << " " << printReg(Reg, TRI) << "=g" <<
- State->GetGroup(Reg) << "->g0(region live-out)");
+ LLVM_DEBUG(if (State->GetGroup(Reg) != 0) dbgs()
+ << " " << printReg(Reg, TRI) << "=g" << State->GetGroup(Reg)
+ << "->g0(region live-out)");
State->UnionGroups(Reg, 0);
} else if ((DefIndices[Reg] < InsertPosIndex)
&& (DefIndices[Reg] >= Count)) {
DefIndices[Reg] = Count;
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr &MI,
@@ -313,7 +314,7 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
// subregister definitions).
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI)) {
- DEBUG(if (!header && footer) dbgs() << footer);
+ LLVM_DEBUG(if (!header && footer) dbgs() << footer);
return;
}
@@ -322,9 +323,11 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
DefIndices[Reg] = ~0u;
RegRefs.erase(Reg);
State->LeaveGroup(Reg);
- DEBUG(if (header) {
- dbgs() << header << printReg(Reg, TRI); header = nullptr; });
- DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
+ LLVM_DEBUG(if (header) {
+ dbgs() << header << printReg(Reg, TRI);
+ header = nullptr;
+ });
+ LLVM_DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
// Repeat for subregisters. Note that we only do this if the superregister
// was not live because otherwise, regardless of whether we have an explicit
// use of the subregister, the subregister's contents are needed for the
@@ -336,15 +339,17 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
DefIndices[SubregReg] = ~0u;
RegRefs.erase(SubregReg);
State->LeaveGroup(SubregReg);
- DEBUG(if (header) {
- dbgs() << header << printReg(Reg, TRI); header = nullptr; });
- DEBUG(dbgs() << " " << printReg(SubregReg, TRI) << "->g" <<
- State->GetGroup(SubregReg) << tag);
+ LLVM_DEBUG(if (header) {
+ dbgs() << header << printReg(Reg, TRI);
+ header = nullptr;
+ });
+ LLVM_DEBUG(dbgs() << " " << printReg(SubregReg, TRI) << "->g"
+ << State->GetGroup(SubregReg) << tag);
}
}
}
- DEBUG(if (!header && footer) dbgs() << footer);
+ LLVM_DEBUG(if (!header && footer) dbgs() << footer);
}
void AggressiveAntiDepBreaker::PrescanInstruction(
@@ -367,14 +372,15 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
HandleLastUse(Reg, Count + 1, "", "\tDead Def: ", "\n");
}
- DEBUG(dbgs() << "\tDef Groups:");
+ LLVM_DEBUG(dbgs() << "\tDef Groups:");
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
- DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g" << State->GetGroup(Reg));
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g"
+ << State->GetGroup(Reg));
// If MI's defs have a special allocation requirement, don't allow
// any def registers to be changed. Also assume all registers
@@ -383,7 +389,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
// can tell user specified registers from compiler-specified.
if (MI.isCall() || MI.hasExtraDefRegAllocReq() || TII->isPredicated(MI) ||
MI.isInlineAsm()) {
- DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+ LLVM_DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
State->UnionGroups(Reg, 0);
}
@@ -393,8 +399,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
unsigned AliasReg = *AI;
if (State->IsLive(AliasReg)) {
State->UnionGroups(Reg, AliasReg);
- DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via "
- << printReg(AliasReg, TRI) << ")");
+ LLVM_DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via "
+ << printReg(AliasReg, TRI) << ")");
}
}
@@ -406,7 +412,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
RegRefs.insert(std::make_pair(Reg, RR));
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// Scan the register defs for this instruction and update
// live-ranges.
@@ -437,7 +443,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
unsigned Count) {
- DEBUG(dbgs() << "\tUse Groups:");
+ LLVM_DEBUG(dbgs() << "\tUse Groups:");
std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
RegRefs = State->GetRegRefs();
@@ -448,11 +454,11 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
// FIXME: The issue with predicated instructions is more complex. We are being
// conservative here because the kill markers cannot be trusted after
// if-conversion:
- // %r6 = LDR %sp, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+ // %r6 = LDR %sp, %reg0, 92, 14, %reg0; mem:LD4[FixedStack14]
// ...
- // STR %r0, killed %r6, %reg0, 0, pred:0, pred:%cpsr; mem:ST4[%395]
- // %r6 = LDR %sp, %reg0, 100, pred:0, pred:%cpsr; mem:LD4[FixedStack12]
- // STR %r0, killed %r6, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+ // STR %r0, killed %r6, %reg0, 0, 0, %cpsr; mem:ST4[%395]
+ // %r6 = LDR %sp, %reg0, 100, 0, %cpsr; mem:LD4[FixedStack12]
+ // STR %r0, killed %r6, %reg0, 0, 14, %reg0; mem:ST4[%396](align=8)
//
// The first R6 kill is not really a kill since it's killed by a predicated
// instruction which may not be executed. The second R6 def may or may not
@@ -469,7 +475,8 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
- DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g" << State->GetGroup(Reg));
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g"
+ << State->GetGroup(Reg));
// It wasn't previously live but now it is, this is a kill. Forget
// the previous live-range information and start a new live-range
@@ -477,7 +484,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
HandleLastUse(Reg, Count, "(last-use)");
if (Special) {
- DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+ LLVM_DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
State->UnionGroups(Reg, 0);
}
@@ -489,12 +496,12 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
RegRefs.insert(std::make_pair(Reg, RR));
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// Form a group of all defs and uses of a KILL instruction to ensure
// that all registers are renamed as a group.
if (MI.isKill()) {
- DEBUG(dbgs() << "\tKill Group:");
+ LLVM_DEBUG(dbgs() << "\tKill Group:");
unsigned FirstReg = 0;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
@@ -504,15 +511,15 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
if (Reg == 0) continue;
if (FirstReg != 0) {
- DEBUG(dbgs() << "=" << printReg(Reg, TRI));
+ LLVM_DEBUG(dbgs() << "=" << printReg(Reg, TRI));
State->UnionGroups(FirstReg, Reg);
} else {
- DEBUG(dbgs() << " " << printReg(Reg, TRI));
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI));
FirstReg = Reg;
}
}
- DEBUG(dbgs() << "->g" << State->GetGroup(FirstReg) << '\n');
+ LLVM_DEBUG(dbgs() << "->g" << State->GetGroup(FirstReg) << '\n');
}
}
@@ -535,7 +542,7 @@ BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) {
BV &= RCBV;
}
- DEBUG(dbgs() << " " << TRI->getRegClassName(RC));
+ LLVM_DEBUG(dbgs() << " " << TRI->getRegClassName(RC));
}
return BV;
@@ -562,8 +569,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
// Find the "superest" register in the group. At the same time,
// collect the BitVector of registers that can be used to rename
// each register.
- DEBUG(dbgs() << "\tRename Candidates for Group g" << AntiDepGroupIndex
- << ":\n");
+ LLVM_DEBUG(dbgs() << "\tRename Candidates for Group g" << AntiDepGroupIndex
+ << ":\n");
std::map<unsigned, BitVector> RenameRegisterMap;
unsigned SuperReg = 0;
for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
@@ -573,13 +580,13 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
// If Reg has any references, then collect possible rename regs
if (RegRefs.count(Reg) > 0) {
- DEBUG(dbgs() << "\t\t" << printReg(Reg, TRI) << ":");
+ LLVM_DEBUG(dbgs() << "\t\t" << printReg(Reg, TRI) << ":");
BitVector &BV = RenameRegisterMap[Reg];
assert(BV.empty());
BV = GetRenameRegisters(Reg);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << " ::";
for (unsigned r : BV.set_bits())
dbgs() << " " << printReg(r, TRI);
@@ -625,11 +632,11 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(SuperRC);
if (Order.empty()) {
- DEBUG(dbgs() << "\tEmpty Super Regclass!!\n");
+ LLVM_DEBUG(dbgs() << "\tEmpty Super Regclass!!\n");
return false;
}
- DEBUG(dbgs() << "\tFind Registers:");
+ LLVM_DEBUG(dbgs() << "\tFind Registers:");
RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
@@ -645,7 +652,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
// Don't replace a register with itself.
if (NewSuperReg == SuperReg) continue;
- DEBUG(dbgs() << " [" << printReg(NewSuperReg, TRI) << ':');
+ LLVM_DEBUG(dbgs() << " [" << printReg(NewSuperReg, TRI) << ':');
RenameMap.clear();
// For each referenced group register (which must be a SuperReg or
@@ -662,11 +669,11 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
NewReg = TRI->getSubReg(NewSuperReg, NewSubRegIdx);
}
- DEBUG(dbgs() << " " << printReg(NewReg, TRI));
+ LLVM_DEBUG(dbgs() << " " << printReg(NewReg, TRI));
// Check if Reg can be renamed to NewReg.
if (!RenameRegisterMap[Reg].test(NewReg)) {
- DEBUG(dbgs() << "(no rename)");
+ LLVM_DEBUG(dbgs() << "(no rename)");
goto next_super_reg;
}
@@ -675,7 +682,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
// must also check all aliases of NewReg, because we can't define a
// register when any sub or super is already live.
if (State->IsLive(NewReg) || (KillIndices[Reg] > DefIndices[NewReg])) {
- DEBUG(dbgs() << "(live)");
+ LLVM_DEBUG(dbgs() << "(live)");
goto next_super_reg;
} else {
bool found = false;
@@ -683,7 +690,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
unsigned AliasReg = *AI;
if (State->IsLive(AliasReg) ||
(KillIndices[Reg] > DefIndices[AliasReg])) {
- DEBUG(dbgs() << "(alias " << printReg(AliasReg, TRI) << " live)");
+ LLVM_DEBUG(dbgs()
+ << "(alias " << printReg(AliasReg, TRI) << " live)");
found = true;
break;
}
@@ -701,7 +709,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
continue;
if (UseMI->getOperand(Idx).isEarlyClobber()) {
- DEBUG(dbgs() << "(ec)");
+ LLVM_DEBUG(dbgs() << "(ec)");
goto next_super_reg;
}
}
@@ -715,7 +723,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
MachineInstr *DefMI = Q.second.Operand->getParent();
if (DefMI->readsRegister(NewReg, TRI)) {
- DEBUG(dbgs() << "(ec)");
+ LLVM_DEBUG(dbgs() << "(ec)");
goto next_super_reg;
}
}
@@ -728,14 +736,14 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
// renamed, as recorded in RenameMap.
RenameOrder.erase(SuperRC);
RenameOrder.insert(RenameOrderType::value_type(SuperRC, R));
- DEBUG(dbgs() << "]\n");
+ LLVM_DEBUG(dbgs() << "]\n");
return true;
next_super_reg:
- DEBUG(dbgs() << ']');
+ LLVM_DEBUG(dbgs() << ']');
} while (R != EndR);
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// No registers are free and available!
return false;
@@ -788,13 +796,13 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
}
#ifndef NDEBUG
- DEBUG(dbgs() << "\n===== Aggressive anti-dependency breaking\n");
- DEBUG(dbgs() << "Available regs:");
+ LLVM_DEBUG(dbgs() << "\n===== Aggressive anti-dependency breaking\n");
+ LLVM_DEBUG(dbgs() << "Available regs:");
for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
if (!State->IsLive(Reg))
- DEBUG(dbgs() << " " << printReg(Reg, TRI));
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI));
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
#endif
BitVector RegAliases(TRI->getNumRegs());
@@ -808,11 +816,11 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
I != E; --Count) {
MachineInstr &MI = *--I;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
- DEBUG(dbgs() << "Anti: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Anti: ");
+ LLVM_DEBUG(MI.dump());
std::set<unsigned> PassthruRegs;
GetPassthruRegs(MI, PassthruRegs);
@@ -848,30 +856,30 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
(Edge->getKind() != SDep::Output)) continue;
unsigned AntiDepReg = Edge->getReg();
- DEBUG(dbgs() << "\tAntidep reg: " << printReg(AntiDepReg, TRI));
+ LLVM_DEBUG(dbgs() << "\tAntidep reg: " << printReg(AntiDepReg, TRI));
assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
if (!MRI.isAllocatable(AntiDepReg)) {
// Don't break anti-dependencies on non-allocatable registers.
- DEBUG(dbgs() << " (non-allocatable)\n");
+ LLVM_DEBUG(dbgs() << " (non-allocatable)\n");
continue;
} else if (ExcludeRegs && ExcludeRegs->test(AntiDepReg)) {
// Don't break anti-dependencies for critical path registers
// if not on the critical path
- DEBUG(dbgs() << " (not critical-path)\n");
+ LLVM_DEBUG(dbgs() << " (not critical-path)\n");
continue;
} else if (PassthruRegs.count(AntiDepReg) != 0) {
// If the anti-dep register liveness "passes-thru", then
// don't try to change it. It will be changed along with
// the use if required to break an earlier antidep.
- DEBUG(dbgs() << " (passthru)\n");
+ LLVM_DEBUG(dbgs() << " (passthru)\n");
continue;
} else {
// No anti-dep breaking for implicit deps
MachineOperand *AntiDepOp = MI.findRegisterDefOperand(AntiDepReg);
assert(AntiDepOp && "Can't find index for defined register operand");
if (!AntiDepOp || AntiDepOp->isImplicit()) {
- DEBUG(dbgs() << " (implicit)\n");
+ LLVM_DEBUG(dbgs() << " (implicit)\n");
continue;
}
@@ -897,13 +905,13 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
PE = PathSU->Preds.end(); P != PE; ++P) {
if ((P->getSUnit() == NextSU) && (P->getKind() != SDep::Anti) &&
(P->getKind() != SDep::Output)) {
- DEBUG(dbgs() << " (real dependency)\n");
+ LLVM_DEBUG(dbgs() << " (real dependency)\n");
AntiDepReg = 0;
break;
} else if ((P->getSUnit() != NextSU) &&
(P->getKind() == SDep::Data) &&
(P->getReg() == AntiDepReg)) {
- DEBUG(dbgs() << " (other dependency)\n");
+ LLVM_DEBUG(dbgs() << " (other dependency)\n");
AntiDepReg = 0;
break;
}
@@ -941,17 +949,17 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
// Determine AntiDepReg's register group.
const unsigned GroupIndex = State->GetGroup(AntiDepReg);
if (GroupIndex == 0) {
- DEBUG(dbgs() << " (zero group)\n");
+ LLVM_DEBUG(dbgs() << " (zero group)\n");
continue;
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// Look for a suitable register to use to break the anti-dependence.
std::map<unsigned, unsigned> RenameMap;
if (FindSuitableFreeRegisters(GroupIndex, RenameOrder, RenameMap)) {
- DEBUG(dbgs() << "\tBreaking anti-dependence edge on "
- << printReg(AntiDepReg, TRI) << ":");
+ LLVM_DEBUG(dbgs() << "\tBreaking anti-dependence edge on "
+ << printReg(AntiDepReg, TRI) << ":");
// Handle each group register...
for (std::map<unsigned, unsigned>::iterator
@@ -959,9 +967,9 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
unsigned CurrReg = S->first;
unsigned NewReg = S->second;
- DEBUG(dbgs() << " " << printReg(CurrReg, TRI) << "->"
- << printReg(NewReg, TRI) << "("
- << RegRefs.count(CurrReg) << " refs)");
+ LLVM_DEBUG(dbgs() << " " << printReg(CurrReg, TRI) << "->"
+ << printReg(NewReg, TRI) << "("
+ << RegRefs.count(CurrReg) << " refs)");
// Update the references to the old register CurrReg to
// refer to the new register NewReg.
@@ -994,7 +1002,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
}
++Broken;
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
}
}
diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp
index 8e8c1d8e08d1..37dcb0be824e 100644
--- a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -39,7 +39,7 @@ AllocationOrder::AllocationOrder(unsigned VirtReg,
HardHints = true;
rewind();
- DEBUG({
+ LLVM_DEBUG({
if (!Hints.empty()) {
dbgs() << "hints:";
for (unsigned I = 0, E = Hints.size(); I != E; ++I)
diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp
index 0731ae575437..79f11def38f7 100644
--- a/contrib/llvm/lib/CodeGen/Analysis.cpp
+++ b/contrib/llvm/lib/CodeGen/Analysis.cpp
@@ -629,26 +629,26 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
return true;
}
-static void collectFuncletMembers(
- DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,
+static void collectEHScopeMembers(
+ DenseMap<const MachineBasicBlock *, int> &EHScopeMembership, int EHScope,
const MachineBasicBlock *MBB) {
SmallVector<const MachineBasicBlock *, 16> Worklist = {MBB};
while (!Worklist.empty()) {
const MachineBasicBlock *Visiting = Worklist.pop_back_val();
- // Don't follow blocks which start new funclets.
+ // Don't follow blocks which start new scopes.
if (Visiting->isEHPad() && Visiting != MBB)
continue;
- // Add this MBB to our funclet.
- auto P = FuncletMembership.insert(std::make_pair(Visiting, Funclet));
+ // Add this MBB to our scope.
+ auto P = EHScopeMembership.insert(std::make_pair(Visiting, EHScope));
// Don't revisit blocks.
if (!P.second) {
- assert(P.first->second == Funclet && "MBB is part of two funclets!");
+ assert(P.first->second == EHScope && "MBB is part of two scopes!");
continue;
}
- // Returns are boundaries where funclet transfer can occur, don't follow
+ // Returns are boundaries where scope transfer can occur, don't follow
// successors.
if (Visiting->isReturnBlock())
continue;
@@ -659,25 +659,25 @@ static void collectFuncletMembers(
}
DenseMap<const MachineBasicBlock *, int>
-llvm::getFuncletMembership(const MachineFunction &MF) {
- DenseMap<const MachineBasicBlock *, int> FuncletMembership;
+llvm::getEHScopeMembership(const MachineFunction &MF) {
+ DenseMap<const MachineBasicBlock *, int> EHScopeMembership;
// We don't have anything to do if there aren't any EH pads.
- if (!MF.hasEHFunclets())
- return FuncletMembership;
+ if (!MF.hasEHScopes())
+ return EHScopeMembership;
int EntryBBNumber = MF.front().getNumber();
bool IsSEH = isAsynchronousEHPersonality(
classifyEHPersonality(MF.getFunction().getPersonalityFn()));
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- SmallVector<const MachineBasicBlock *, 16> FuncletBlocks;
+ SmallVector<const MachineBasicBlock *, 16> EHScopeBlocks;
SmallVector<const MachineBasicBlock *, 16> UnreachableBlocks;
SmallVector<const MachineBasicBlock *, 16> SEHCatchPads;
SmallVector<std::pair<const MachineBasicBlock *, int>, 16> CatchRetSuccessors;
for (const MachineBasicBlock &MBB : MF) {
- if (MBB.isEHFuncletEntry()) {
- FuncletBlocks.push_back(&MBB);
+ if (MBB.isEHScopeEntry()) {
+ EHScopeBlocks.push_back(&MBB);
} else if (IsSEH && MBB.isEHPad()) {
SEHCatchPads.push_back(&MBB);
} else if (MBB.pred_empty()) {
@@ -686,8 +686,8 @@ llvm::getFuncletMembership(const MachineFunction &MF) {
MachineBasicBlock::const_iterator MBBI = MBB.getFirstTerminator();
- // CatchPads are not funclets for SEH so do not consider CatchRet to
- // transfer control to another funclet.
+ // CatchPads are not scopes for SEH so do not consider CatchRet to
+ // transfer control to another scope.
if (MBBI == MBB.end() || MBBI->getOpcode() != TII->getCatchReturnOpcode())
continue;
@@ -700,24 +700,24 @@ llvm::getFuncletMembership(const MachineFunction &MF) {
}
// We don't have anything to do if there aren't any EH pads.
- if (FuncletBlocks.empty())
- return FuncletMembership;
+ if (EHScopeBlocks.empty())
+ return EHScopeMembership;
// Identify all the basic blocks reachable from the function entry.
- collectFuncletMembers(FuncletMembership, EntryBBNumber, &MF.front());
- // All blocks not part of a funclet are in the parent function.
+ collectEHScopeMembers(EHScopeMembership, EntryBBNumber, &MF.front());
+ // All blocks not part of a scope are in the parent function.
for (const MachineBasicBlock *MBB : UnreachableBlocks)
- collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB);
- // Next, identify all the blocks inside the funclets.
- for (const MachineBasicBlock *MBB : FuncletBlocks)
- collectFuncletMembers(FuncletMembership, MBB->getNumber(), MBB);
- // SEH CatchPads aren't really funclets, handle them separately.
+ collectEHScopeMembers(EHScopeMembership, EntryBBNumber, MBB);
+ // Next, identify all the blocks inside the scopes.
+ for (const MachineBasicBlock *MBB : EHScopeBlocks)
+ collectEHScopeMembers(EHScopeMembership, MBB->getNumber(), MBB);
+ // SEH CatchPads aren't really scopes, handle them separately.
for (const MachineBasicBlock *MBB : SEHCatchPads)
- collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB);
+ collectEHScopeMembers(EHScopeMembership, EntryBBNumber, MBB);
// Finally, identify all the targets of a catchret.
for (std::pair<const MachineBasicBlock *, int> CatchRetPair :
CatchRetSuccessors)
- collectFuncletMembers(FuncletMembership, CatchRetPair.second,
+ collectEHScopeMembers(EHScopeMembership, CatchRetPair.second,
CatchRetPair.first);
- return FuncletMembership;
+ return EHScopeMembership;
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
index 15cfbd5c40ff..9011f025f595 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -91,7 +91,8 @@ void ARMException::endFunction(const MachineFunction *MF) {
ATS.emitFnEnd();
}
-void ARMException::emitTypeInfos(unsigned TTypeEncoding) {
+void ARMException::emitTypeInfos(unsigned TTypeEncoding,
+ MCSymbol *TTBaseLabel) {
const MachineFunction *MF = Asm->MF;
const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
const std::vector<unsigned> &FilterIds = MF->getFilterIds();
@@ -112,6 +113,8 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding) {
Asm->EmitTTypeReference(GV, TTypeEncoding);
}
+ Asm->OutStreamer->EmitLabel(TTBaseLabel);
+
// Emit the Exception Specifications.
if (VerboseAsm && !FilterIds.empty()) {
Asm->OutStreamer->AddComment(">> Filter TypeInfos <<");
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
new file mode 100644
index 000000000000..20b0b8d3feab
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -0,0 +1,721 @@
+//===- llvm/CodeGen/AsmPrinter/AccelTable.cpp - Accelerator Tables --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing accelerator tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AccelTable.h"
+#include "DwarfCompileUnit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <vector>
+
+using namespace llvm;
+
+void AccelTableBase::computeBucketCount() {
+ // First get the number of unique hashes.
+ std::vector<uint32_t> Uniques;
+ Uniques.reserve(Entries.size());
+ for (const auto &E : Entries)
+ Uniques.push_back(E.second.HashValue);
+ array_pod_sort(Uniques.begin(), Uniques.end());
+ std::vector<uint32_t>::iterator P =
+ std::unique(Uniques.begin(), Uniques.end());
+
+ UniqueHashCount = std::distance(Uniques.begin(), P);
+
+ if (UniqueHashCount > 1024)
+ BucketCount = UniqueHashCount / 4;
+ else if (UniqueHashCount > 16)
+ BucketCount = UniqueHashCount / 2;
+ else
+ BucketCount = std::max<uint32_t>(UniqueHashCount, 1);
+}
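Worked through, the heuristic above gives UniqueHashCount / 4 buckets when there are more than 1024 unique hashes, UniqueHashCount / 2 when there are more than 16, and max(UniqueHashCount, 1) otherwise: 2000 unique hashes yield 500 buckets, 100 yield 50, 10 yield 10, and an empty table still gets one bucket.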
+
+void AccelTableBase::finalize(AsmPrinter *Asm, StringRef Prefix) {
+ // Create the individual hash data outputs.
+ for (auto &E : Entries) {
+ // Unique the entries.
+ std::stable_sort(E.second.Values.begin(), E.second.Values.end(),
+ [](const AccelTableData *A, const AccelTableData *B) {
+ return *A < *B;
+ });
+ E.second.Values.erase(
+ std::unique(E.second.Values.begin(), E.second.Values.end()),
+ E.second.Values.end());
+ }
+
+ // Figure out how many buckets we need, then compute the bucket contents and
+ // the final ordering. The hashes and offsets can be emitted by walking these
+ // data structures. We add temporary symbols to the data so they can be
+ // referenced when emitting the offsets.
+ computeBucketCount();
+
+ // Compute bucket contents and final ordering.
+ Buckets.resize(BucketCount);
+ for (auto &E : Entries) {
+ uint32_t Bucket = E.second.HashValue % BucketCount;
+ Buckets[Bucket].push_back(&E.second);
+ E.second.Sym = Asm->createTempSymbol(Prefix);
+ }
+
+ // Sort the contents of the buckets by hash value so that hash collisions end
+ // up together. Stable sort makes testing easier and doesn't cost much more.
+ for (auto &Bucket : Buckets)
+ std::stable_sort(Bucket.begin(), Bucket.end(),
+ [](HashData *LHS, HashData *RHS) {
+ return LHS->HashValue < RHS->HashValue;
+ });
+}
+
+namespace {
+/// Base class for writing out Accelerator tables. It holds the common
+/// functionality for the two Accelerator table types.
+class AccelTableWriter {
+protected:
+ AsmPrinter *const Asm; ///< Destination.
+ const AccelTableBase &Contents; ///< Data to emit.
+
+ /// Controls whether to emit duplicate hash and offset table entries for names
+ /// with identical hashes. Apple tables don't emit duplicate entries; DWARF v5
+ /// tables do.
+ const bool SkipIdenticalHashes;
+
+ void emitHashes() const;
+
+ /// Emit offsets to lists of entries with identical names. The offsets are
+ /// relative to the Base argument.
+ void emitOffsets(const MCSymbol *Base) const;
+
+public:
+ AccelTableWriter(AsmPrinter *Asm, const AccelTableBase &Contents,
+ bool SkipIdenticalHashes)
+ : Asm(Asm), Contents(Contents), SkipIdenticalHashes(SkipIdenticalHashes) {
+ }
+};
+
+class AppleAccelTableWriter : public AccelTableWriter {
+ using Atom = AppleAccelTableData::Atom;
+
+ /// The fixed header of an Apple Accelerator Table.
+ struct Header {
+ uint32_t Magic = MagicHash;
+ uint16_t Version = 1;
+ uint16_t HashFunction = dwarf::DW_hash_function_djb;
+ uint32_t BucketCount;
+ uint32_t HashCount;
+ uint32_t HeaderDataLength;
+
+ /// 'HASH' magic value to detect endianness.
+ static const uint32_t MagicHash = 0x48415348;
+
+ Header(uint32_t BucketCount, uint32_t UniqueHashCount, uint32_t DataLength)
+ : BucketCount(BucketCount), HashCount(UniqueHashCount),
+ HeaderDataLength(DataLength) {}
+
+ void emit(AsmPrinter *Asm) const;
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+ };
+
+ /// The HeaderData describes the structure of an Apple accelerator table
+ /// through a list of Atoms.
+ struct HeaderData {
+ /// In the case of data that is referenced via DW_FORM_ref_* the offset
+ /// base is used to describe the offset for all forms in the list of atoms.
+ uint32_t DieOffsetBase;
+
+ const SmallVector<Atom, 4> Atoms;
+
+ HeaderData(ArrayRef<Atom> AtomList, uint32_t Offset = 0)
+ : DieOffsetBase(Offset), Atoms(AtomList.begin(), AtomList.end()) {}
+
+ void emit(AsmPrinter *Asm) const;
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+ };
+
+ Header Header;
+ HeaderData HeaderData;
+ const MCSymbol *SecBegin;
+
+ void emitBuckets() const;
+ void emitData() const;
+
+public:
+ AppleAccelTableWriter(AsmPrinter *Asm, const AccelTableBase &Contents,
+ ArrayRef<Atom> Atoms, const MCSymbol *SecBegin)
+ : AccelTableWriter(Asm, Contents, true),
+ Header(Contents.getBucketCount(), Contents.getUniqueHashCount(),
+ 8 + (Atoms.size() * 4)),
+ HeaderData(Atoms), SecBegin(SecBegin) {}
+
+ void emit() const;
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+};
+
+/// Class responsible for emitting a DWARF v5 Accelerator Table. The only
+/// public function is emit(), which performs the actual emission.
+///
+/// The class is templated in its data type. This allows us to emit both dynamic
+/// and static data entries. A callback abstracts the logic to provide a CU
+/// index for a given entry, which is different per data type, but identical
+/// for every entry in the same table.
+template <typename DataT>
+class Dwarf5AccelTableWriter : public AccelTableWriter {
+ struct Header {
+ uint32_t UnitLength = 0;
+ uint16_t Version = 5;
+ uint16_t Padding = 0;
+ uint32_t CompUnitCount;
+ uint32_t LocalTypeUnitCount = 0;
+ uint32_t ForeignTypeUnitCount = 0;
+ uint32_t BucketCount;
+ uint32_t NameCount;
+ uint32_t AbbrevTableSize = 0;
+ uint32_t AugmentationStringSize = sizeof(AugmentationString);
+ char AugmentationString[8] = {'L', 'L', 'V', 'M', '0', '7', '0', '0'};
+
+ Header(uint32_t CompUnitCount, uint32_t BucketCount, uint32_t NameCount)
+ : CompUnitCount(CompUnitCount), BucketCount(BucketCount),
+ NameCount(NameCount) {}
+
+ void emit(const Dwarf5AccelTableWriter &Ctx) const;
+ };
+ struct AttributeEncoding {
+ dwarf::Index Index;
+ dwarf::Form Form;
+ };
+
+ Header Header;
+ DenseMap<uint32_t, SmallVector<AttributeEncoding, 2>> Abbreviations;
+ ArrayRef<MCSymbol *> CompUnits;
+ llvm::function_ref<unsigned(const DataT &)> getCUIndexForEntry;
+ MCSymbol *ContributionStart = Asm->createTempSymbol("names_start");
+ MCSymbol *ContributionEnd = Asm->createTempSymbol("names_end");
+ MCSymbol *AbbrevStart = Asm->createTempSymbol("names_abbrev_start");
+ MCSymbol *AbbrevEnd = Asm->createTempSymbol("names_abbrev_end");
+ MCSymbol *EntryPool = Asm->createTempSymbol("names_entries");
+
+ DenseSet<uint32_t> getUniqueTags() const;
+
+ // Right now, we emit uniform attributes for all tags.
+ SmallVector<AttributeEncoding, 2> getUniformAttributes() const;
+
+ void emitCUList() const;
+ void emitBuckets() const;
+ void emitStringOffsets() const;
+ void emitAbbrevs() const;
+ void emitEntry(const DataT &Entry) const;
+ void emitData() const;
+
+public:
+ Dwarf5AccelTableWriter(
+ AsmPrinter *Asm, const AccelTableBase &Contents,
+ ArrayRef<MCSymbol *> CompUnits,
+ llvm::function_ref<unsigned(const DataT &)> GetCUIndexForEntry);
+
+ void emit() const;
+};
+} // namespace
+
+void AccelTableWriter::emitHashes() const {
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ unsigned BucketIdx = 0;
+ for (auto &Bucket : Contents.getBuckets()) {
+ for (auto &Hash : Bucket) {
+ uint32_t HashValue = Hash->HashValue;
+ if (SkipIdenticalHashes && PrevHash == HashValue)
+ continue;
+ Asm->OutStreamer->AddComment("Hash in Bucket " + Twine(BucketIdx));
+ Asm->emitInt32(HashValue);
+ PrevHash = HashValue;
+ }
+ BucketIdx++;
+ }
+}
+
+void AccelTableWriter::emitOffsets(const MCSymbol *Base) const {
+ const auto &Buckets = Contents.getBuckets();
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ for (auto *Hash : Buckets[i]) {
+ uint32_t HashValue = Hash->HashValue;
+ if (SkipIdenticalHashes && PrevHash == HashValue)
+ continue;
+ PrevHash = HashValue;
+ Asm->OutStreamer->AddComment("Offset in Bucket " + Twine(i));
+ Asm->EmitLabelDifference(Hash->Sym, Base, sizeof(uint32_t));
+ }
+ }
+}
+
+void AppleAccelTableWriter::Header::emit(AsmPrinter *Asm) const {
+ Asm->OutStreamer->AddComment("Header Magic");
+ Asm->emitInt32(Magic);
+ Asm->OutStreamer->AddComment("Header Version");
+ Asm->emitInt16(Version);
+ Asm->OutStreamer->AddComment("Header Hash Function");
+ Asm->emitInt16(HashFunction);
+ Asm->OutStreamer->AddComment("Header Bucket Count");
+ Asm->emitInt32(BucketCount);
+ Asm->OutStreamer->AddComment("Header Hash Count");
+ Asm->emitInt32(HashCount);
+ Asm->OutStreamer->AddComment("Header Data Length");
+ Asm->emitInt32(HeaderDataLength);
+}
+
+void AppleAccelTableWriter::HeaderData::emit(AsmPrinter *Asm) const {
+ Asm->OutStreamer->AddComment("HeaderData Die Offset Base");
+ Asm->emitInt32(DieOffsetBase);
+ Asm->OutStreamer->AddComment("HeaderData Atom Count");
+ Asm->emitInt32(Atoms.size());
+
+ for (const Atom &A : Atoms) {
+ Asm->OutStreamer->AddComment(dwarf::AtomTypeString(A.Type));
+ Asm->emitInt16(A.Type);
+ Asm->OutStreamer->AddComment(dwarf::FormEncodingString(A.Form));
+ Asm->emitInt16(A.Form);
+ }
+}
+
+void AppleAccelTableWriter::emitBuckets() const {
+ const auto &Buckets = Contents.getBuckets();
+ unsigned index = 0;
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ Asm->OutStreamer->AddComment("Bucket " + Twine(i));
+ if (!Buckets[i].empty())
+ Asm->emitInt32(index);
+ else
+ Asm->emitInt32(std::numeric_limits<uint32_t>::max());
+ // Buckets point into the list of hashes, not to the data. Do not increment
+ // the index multiple times in case of hash collisions.
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ for (auto *HD : Buckets[i]) {
+ uint32_t HashValue = HD->HashValue;
+ if (PrevHash != HashValue)
+ ++index;
+ PrevHash = HashValue;
+ }
+ }
+}
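Because the index only advances when the hash value changes, colliding names share a single slot in the hash and offset tables: a bucket holding the hashes {H, H, K} records the index of the first H as its start, advances the running index by two rather than three, and the next non-empty bucket therefore starts two entries later.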
+
+void AppleAccelTableWriter::emitData() const {
+ const auto &Buckets = Contents.getBuckets();
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ for (auto &Hash : Buckets[i]) {
+ // Terminate the previous entry if there is no hash collision with the
+ // current one.
+ if (PrevHash != std::numeric_limits<uint64_t>::max() &&
+ PrevHash != Hash->HashValue)
+ Asm->emitInt32(0);
+ // Remember to emit the label for our offset.
+ Asm->OutStreamer->EmitLabel(Hash->Sym);
+ Asm->OutStreamer->AddComment(Hash->Name.getString());
+ Asm->emitDwarfStringOffset(Hash->Name);
+ Asm->OutStreamer->AddComment("Num DIEs");
+ Asm->emitInt32(Hash->Values.size());
+ for (const auto *V : Hash->Values)
+ static_cast<const AppleAccelTableData *>(V)->emit(Asm);
+ PrevHash = Hash->HashValue;
+ }
+ // Emit the final end marker for the bucket.
+ if (!Buckets[i].empty())
+ Asm->emitInt32(0);
+ }
+}
+
+void AppleAccelTableWriter::emit() const {
+ Header.emit(Asm);
+ HeaderData.emit(Asm);
+ emitBuckets();
+ emitHashes();
+ emitOffsets(SecBegin);
+ emitData();
+}
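Read top to bottom, emit() gives the section layout of an Apple accelerator table: the fixed header (the 'HASH' magic, version, hash function, bucket and hash counts, header data length), the atom descriptions, one 32-bit slot per bucket indexing into the hash list (0xffffffff for an empty bucket), the hash values, the per-name data offsets relative to SecBegin, and finally the data blocks, with a 32-bit zero terminating each run of entries that does not collide with the next.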
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::Header::emit(
+ const Dwarf5AccelTableWriter &Ctx) const {
+ assert(CompUnitCount > 0 && "Index must have at least one CU.");
+
+ AsmPrinter *Asm = Ctx.Asm;
+ Asm->OutStreamer->AddComment("Header: unit length");
+ Asm->EmitLabelDifference(Ctx.ContributionEnd, Ctx.ContributionStart,
+ sizeof(uint32_t));
+ Asm->OutStreamer->EmitLabel(Ctx.ContributionStart);
+ Asm->OutStreamer->AddComment("Header: version");
+ Asm->emitInt16(Version);
+ Asm->OutStreamer->AddComment("Header: padding");
+ Asm->emitInt16(Padding);
+ Asm->OutStreamer->AddComment("Header: compilation unit count");
+ Asm->emitInt32(CompUnitCount);
+ Asm->OutStreamer->AddComment("Header: local type unit count");
+ Asm->emitInt32(LocalTypeUnitCount);
+ Asm->OutStreamer->AddComment("Header: foreign type unit count");
+ Asm->emitInt32(ForeignTypeUnitCount);
+ Asm->OutStreamer->AddComment("Header: bucket count");
+ Asm->emitInt32(BucketCount);
+ Asm->OutStreamer->AddComment("Header: name count");
+ Asm->emitInt32(NameCount);
+ Asm->OutStreamer->AddComment("Header: abbreviation table size");
+ Asm->EmitLabelDifference(Ctx.AbbrevEnd, Ctx.AbbrevStart, sizeof(uint32_t));
+ Asm->OutStreamer->AddComment("Header: augmentation string size");
+ assert(AugmentationStringSize % 4 == 0);
+ Asm->emitInt32(AugmentationStringSize);
+ Asm->OutStreamer->AddComment("Header: augmentation string");
+ Asm->OutStreamer->EmitBytes({AugmentationString, AugmentationStringSize});
+}
+
+template <typename DataT>
+DenseSet<uint32_t> Dwarf5AccelTableWriter<DataT>::getUniqueTags() const {
+ DenseSet<uint32_t> UniqueTags;
+ for (auto &Bucket : Contents.getBuckets()) {
+ for (auto *Hash : Bucket) {
+ for (auto *Value : Hash->Values) {
+ unsigned Tag = static_cast<const DataT *>(Value)->getDieTag();
+ UniqueTags.insert(Tag);
+ }
+ }
+ }
+ return UniqueTags;
+}
+
+template <typename DataT>
+SmallVector<typename Dwarf5AccelTableWriter<DataT>::AttributeEncoding, 2>
+Dwarf5AccelTableWriter<DataT>::getUniformAttributes() const {
+ SmallVector<AttributeEncoding, 2> UA;
+ if (CompUnits.size() > 1) {
+ size_t LargestCUIndex = CompUnits.size() - 1;
+ dwarf::Form Form = DIEInteger::BestForm(/*IsSigned*/ false, LargestCUIndex);
+ UA.push_back({dwarf::DW_IDX_compile_unit, Form});
+ }
+ UA.push_back({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4});
+ return UA;
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitCUList() const {
+ for (const auto &CU : enumerate(CompUnits)) {
+ Asm->OutStreamer->AddComment("Compilation unit " + Twine(CU.index()));
+ Asm->emitDwarfSymbolReference(CU.value());
+ }
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitBuckets() const {
+ uint32_t Index = 1;
+ for (const auto &Bucket : enumerate(Contents.getBuckets())) {
+ Asm->OutStreamer->AddComment("Bucket " + Twine(Bucket.index()));
+ Asm->emitInt32(Bucket.value().empty() ? 0 : Index);
+ Index += Bucket.value().size();
+ }
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitStringOffsets() const {
+ for (const auto &Bucket : enumerate(Contents.getBuckets())) {
+ for (auto *Hash : Bucket.value()) {
+ DwarfStringPoolEntryRef String = Hash->Name;
+ Asm->OutStreamer->AddComment("String in Bucket " + Twine(Bucket.index()) +
+ ": " + String.getString());
+ Asm->emitDwarfStringOffset(String);
+ }
+ }
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitAbbrevs() const {
+ Asm->OutStreamer->EmitLabel(AbbrevStart);
+ for (const auto &Abbrev : Abbreviations) {
+ Asm->OutStreamer->AddComment("Abbrev code");
+ assert(Abbrev.first != 0);
+ Asm->EmitULEB128(Abbrev.first);
+ Asm->OutStreamer->AddComment(dwarf::TagString(Abbrev.first));
+ Asm->EmitULEB128(Abbrev.first);
+ for (const auto &AttrEnc : Abbrev.second) {
+ Asm->EmitULEB128(AttrEnc.Index, dwarf::IndexString(AttrEnc.Index).data());
+ Asm->EmitULEB128(AttrEnc.Form,
+ dwarf::FormEncodingString(AttrEnc.Form).data());
+ }
+ Asm->EmitULEB128(0, "End of abbrev");
+ Asm->EmitULEB128(0, "End of abbrev");
+ }
+ Asm->EmitULEB128(0, "End of abbrev list");
+ Asm->OutStreamer->EmitLabel(AbbrevEnd);
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitEntry(const DataT &Entry) const {
+ auto AbbrevIt = Abbreviations.find(Entry.getDieTag());
+ assert(AbbrevIt != Abbreviations.end() &&
+ "Why wasn't this abbrev generated?");
+
+ Asm->EmitULEB128(AbbrevIt->first, "Abbreviation code");
+ for (const auto &AttrEnc : AbbrevIt->second) {
+ Asm->OutStreamer->AddComment(dwarf::IndexString(AttrEnc.Index));
+ switch (AttrEnc.Index) {
+ case dwarf::DW_IDX_compile_unit: {
+ DIEInteger ID(getCUIndexForEntry(Entry));
+ ID.EmitValue(Asm, AttrEnc.Form);
+ break;
+ }
+ case dwarf::DW_IDX_die_offset:
+ assert(AttrEnc.Form == dwarf::DW_FORM_ref4);
+ Asm->emitInt32(Entry.getDieOffset());
+ break;
+ default:
+ llvm_unreachable("Unexpected index attribute!");
+ }
+ }
+}
+
+template <typename DataT> void Dwarf5AccelTableWriter<DataT>::emitData() const {
+ Asm->OutStreamer->EmitLabel(EntryPool);
+ for (auto &Bucket : Contents.getBuckets()) {
+ for (auto *Hash : Bucket) {
+ // Remember to emit the label for our offset.
+ Asm->OutStreamer->EmitLabel(Hash->Sym);
+ for (const auto *Value : Hash->Values)
+ emitEntry(*static_cast<const DataT *>(Value));
+ Asm->OutStreamer->AddComment("End of list: " + Hash->Name.getString());
+ Asm->emitInt32(0);
+ }
+ }
+}
+
+template <typename DataT>
+Dwarf5AccelTableWriter<DataT>::Dwarf5AccelTableWriter(
+ AsmPrinter *Asm, const AccelTableBase &Contents,
+ ArrayRef<MCSymbol *> CompUnits,
+ llvm::function_ref<unsigned(const DataT &)> getCUIndexForEntry)
+ : AccelTableWriter(Asm, Contents, false),
+ Header(CompUnits.size(), Contents.getBucketCount(),
+ Contents.getUniqueNameCount()),
+ CompUnits(CompUnits), getCUIndexForEntry(std::move(getCUIndexForEntry)) {
+ DenseSet<uint32_t> UniqueTags = getUniqueTags();
+ SmallVector<AttributeEncoding, 2> UniformAttributes = getUniformAttributes();
+
+ Abbreviations.reserve(UniqueTags.size());
+ for (uint32_t Tag : UniqueTags)
+ Abbreviations.try_emplace(Tag, UniformAttributes);
+}
+
+template <typename DataT> void Dwarf5AccelTableWriter<DataT>::emit() const {
+ Header.emit(*this);
+ emitCUList();
+ emitBuckets();
+ emitHashes();
+ emitStringOffsets();
+ emitOffsets(EntryPool);
+ emitAbbrevs();
+ emitData();
+ Asm->OutStreamer->EmitValueToAlignment(4, 0);
+ Asm->OutStreamer->EmitLabel(ContributionEnd);
+}
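The DWARF v5 contribution is emitted in a similar order: the header (its unit length measured between ContributionStart and ContributionEnd), the CU label list, the bucket table, the hash table, the string offsets, the entry offsets relative to the entry pool label, the abbreviation table bracketed by AbbrevStart and AbbrevEnd, and the entry pool itself, padded to a 4-byte boundary before ContributionEnd is placed.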
+
+void llvm::emitAppleAccelTableImpl(AsmPrinter *Asm, AccelTableBase &Contents,
+ StringRef Prefix, const MCSymbol *SecBegin,
+ ArrayRef<AppleAccelTableData::Atom> Atoms) {
+ Contents.finalize(Asm, Prefix);
+ AppleAccelTableWriter(Asm, Contents, Atoms, SecBegin).emit();
+}
+
+void llvm::emitDWARF5AccelTable(
+ AsmPrinter *Asm, AccelTable<DWARF5AccelTableData> &Contents,
+ const DwarfDebug &DD, ArrayRef<std::unique_ptr<DwarfCompileUnit>> CUs) {
+ std::vector<MCSymbol *> CompUnits;
+ for (const auto &CU : enumerate(CUs)) {
+ assert(CU.index() == CU.value()->getUniqueID());
+ const DwarfCompileUnit *MainCU =
+ DD.useSplitDwarf() ? CU.value()->getSkeleton() : CU.value().get();
+ CompUnits.push_back(MainCU->getLabelBegin());
+ }
+
+ Contents.finalize(Asm, "names");
+ Dwarf5AccelTableWriter<DWARF5AccelTableData>(
+ Asm, Contents, CompUnits,
+ [&DD](const DWARF5AccelTableData &Entry) {
+ const DIE *CUDie = Entry.getDie().getUnitDie();
+ return DD.lookupCU(CUDie)->getUniqueID();
+ })
+ .emit();
+}
+
+void llvm::emitDWARF5AccelTable(
+ AsmPrinter *Asm, AccelTable<DWARF5AccelTableStaticData> &Contents,
+ ArrayRef<MCSymbol *> CUs,
+ llvm::function_ref<unsigned(const DWARF5AccelTableStaticData &)>
+ getCUIndexForEntry) {
+ Contents.finalize(Asm, "names");
+ Dwarf5AccelTableWriter<DWARF5AccelTableStaticData>(Asm, Contents, CUs,
+ getCUIndexForEntry)
+ .emit();
+}
+
+void AppleAccelTableOffsetData::emit(AsmPrinter *Asm) const {
+ Asm->emitInt32(Die.getDebugSectionOffset());
+}
+
+void AppleAccelTableTypeData::emit(AsmPrinter *Asm) const {
+ Asm->emitInt32(Die.getDebugSectionOffset());
+ Asm->emitInt16(Die.getTag());
+ Asm->emitInt8(0);
+}
+
+void AppleAccelTableStaticOffsetData::emit(AsmPrinter *Asm) const {
+ Asm->emitInt32(Offset);
+}
+
+void AppleAccelTableStaticTypeData::emit(AsmPrinter *Asm) const {
+ Asm->emitInt32(Offset);
+ Asm->emitInt16(Tag);
+ Asm->emitInt8(ObjCClassIsImplementation ? dwarf::DW_FLAG_type_implementation
+ : 0);
+ Asm->emitInt32(QualifiedNameHash);
+}
+
+#ifndef _MSC_VER
+// The lines below are rejected by older versions (TBD) of MSVC.
+constexpr AppleAccelTableData::Atom AppleAccelTableTypeData::Atoms[];
+constexpr AppleAccelTableData::Atom AppleAccelTableOffsetData::Atoms[];
+constexpr AppleAccelTableData::Atom AppleAccelTableStaticOffsetData::Atoms[];
+constexpr AppleAccelTableData::Atom AppleAccelTableStaticTypeData::Atoms[];
+#else
+// FIXME: Erase this path once the minimum MSVC version has been bumped.
+const SmallVector<AppleAccelTableData::Atom, 4>
+ AppleAccelTableOffsetData::Atoms = {
+ Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)};
+const SmallVector<AppleAccelTableData::Atom, 4> AppleAccelTableTypeData::Atoms =
+ {Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4),
+ Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2),
+ Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)};
+const SmallVector<AppleAccelTableData::Atom, 4>
+ AppleAccelTableStaticOffsetData::Atoms = {
+ Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)};
+const SmallVector<AppleAccelTableData::Atom, 4>
+ AppleAccelTableStaticTypeData::Atoms = {
+ Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4),
+ Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2),
+ Atom(5, dwarf::DW_FORM_data1), Atom(6, dwarf::DW_FORM_data4)};
+#endif
+
+#ifndef NDEBUG
+void AppleAccelTableWriter::Header::print(raw_ostream &OS) const {
+ OS << "Magic: " << format("0x%x", Magic) << "\n"
+ << "Version: " << Version << "\n"
+ << "Hash Function: " << HashFunction << "\n"
+ << "Bucket Count: " << BucketCount << "\n"
+ << "Header Data Length: " << HeaderDataLength << "\n";
+}
+
+void AppleAccelTableData::Atom::print(raw_ostream &OS) const {
+ OS << "Type: " << dwarf::AtomTypeString(Type) << "\n"
+ << "Form: " << dwarf::FormEncodingString(Form) << "\n";
+}
+
+void AppleAccelTableWriter::HeaderData::print(raw_ostream &OS) const {
+ OS << "DIE Offset Base: " << DieOffsetBase << "\n";
+ for (auto Atom : Atoms)
+ Atom.print(OS);
+}
+
+void AppleAccelTableWriter::print(raw_ostream &OS) const {
+ Header.print(OS);
+ HeaderData.print(OS);
+ Contents.print(OS);
+ SecBegin->print(OS, nullptr);
+}
+
+void AccelTableBase::HashData::print(raw_ostream &OS) const {
+ OS << "Name: " << Name.getString() << "\n";
+ OS << " Hash Value: " << format("0x%x", HashValue) << "\n";
+ OS << " Symbol: ";
+ if (Sym)
+ OS << *Sym;
+ else
+ OS << "<none>";
+ OS << "\n";
+ for (auto *Value : Values)
+ Value->print(OS);
+}
+
+void AccelTableBase::print(raw_ostream &OS) const {
+ // Print Content.
+ OS << "Entries: \n";
+ for (const auto &Entry : Entries) {
+ OS << "Name: " << Entry.first() << "\n";
+ for (auto *V : Entry.second.Values)
+ V->print(OS);
+ }
+
+ OS << "Buckets and Hashes: \n";
+ for (auto &Bucket : Buckets)
+ for (auto &Hash : Bucket)
+ Hash->print(OS);
+
+ OS << "Data: \n";
+ for (auto &E : Entries)
+ E.second.print(OS);
+}
+
+void DWARF5AccelTableData::print(raw_ostream &OS) const {
+ OS << " Offset: " << getDieOffset() << "\n";
+ OS << " Tag: " << dwarf::TagString(getDieTag()) << "\n";
+}
+
+void DWARF5AccelTableStaticData::print(raw_ostream &OS) const {
+ OS << " Offset: " << getDieOffset() << "\n";
+ OS << " Tag: " << dwarf::TagString(getDieTag()) << "\n";
+}
+
+void AppleAccelTableOffsetData::print(raw_ostream &OS) const {
+ OS << " Offset: " << Die.getOffset() << "\n";
+}
+
+void AppleAccelTableTypeData::print(raw_ostream &OS) const {
+ OS << " Offset: " << Die.getOffset() << "\n";
+ OS << " Tag: " << dwarf::TagString(Die.getTag()) << "\n";
+}
+
+void AppleAccelTableStaticOffsetData::print(raw_ostream &OS) const {
+ OS << " Static Offset: " << Offset << "\n";
+}
+
+void AppleAccelTableStaticTypeData::print(raw_ostream &OS) const {
+ OS << " Static Offset: " << Offset << "\n";
+ OS << " QualifiedNameHash: " << format("%x\n", QualifiedNameHash) << "\n";
+ OS << " Tag: " << dwarf::TagString(Tag) << "\n";
+ OS << " ObjCClassIsImplementation: "
+ << (ObjCClassIsImplementation ? "true" : "false");
+ OS << "\n";
+}
+#endif
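
The "Buckets and Hashes" dump above reflects how accelerator tables group names: each name is hashed and the hash modulo the bucket count selects its bucket. Below is a minimal standalone C++ sketch of that assignment, assuming the DJB-style hash that DWARF accelerator tables use; it is illustrative only, not the LLVM implementation.

#include <cstdint>
#include <string>
#include <vector>

// DJB-style hash (h = h * 33 + c), the hash used by DWARF accelerator tables.
static uint32_t djbHash(const std::string &Name) {
  uint32_t H = 5381;
  for (unsigned char C : Name)
    H = H * 33 + C;
  return H;
}

int main() {
  std::vector<std::string> Names = {"main", "foo", "bar"};
  const unsigned BucketCount = 2; // Real writers derive this from the name count.
  std::vector<std::vector<uint32_t>> Buckets(BucketCount);
  for (const std::string &N : Names) {
    uint32_t H = djbHash(N);
    Buckets[H % BucketCount].push_back(H); // Hash modulo bucket count selects the bucket.
  }
  return Buckets.size() == BucketCount ? 0 : 1;
}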
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
index 59ed0324bdb0..4a226527cb5b 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -10,9 +10,9 @@
#include "AddressPool.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <utility>
using namespace llvm;
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h
index 990a158d87cd..5350006bf744 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h
@@ -39,7 +39,7 @@ class AddressPool {
public:
AddressPool() = default;
- /// \brief Returns the index into the address pool with the given
+ /// Returns the index into the address pool with the given
/// label/symbol.
unsigned getIndex(const MCSymbol *Sym, bool TLS = false);
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index d7995447592c..9bbc77b3056b 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -16,6 +16,7 @@
#include "CodeViewDebug.h"
#include "DwarfDebug.h"
#include "DwarfException.h"
+#include "WinCFGuard.h"
#include "WinException.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -30,7 +31,6 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/ObjectUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -39,6 +39,7 @@
#include "llvm/CodeGen/GCStrategy.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -54,7 +55,6 @@
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -87,6 +87,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
@@ -107,6 +108,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
@@ -130,6 +132,8 @@ static const char *const DbgTimerName = "emit";
static const char *const DbgTimerDescription = "Debug Info Emission";
static const char *const EHTimerName = "write_exception";
static const char *const EHTimerDescription = "DWARF Exception Writer";
+static const char *const CFGuardName = "Control Flow Guard";
+static const char *const CFGuardDescription = "Control Flow Guard Tables";
static const char *const CodeViewLineTablesGroupName = "linetables";
static const char *const CodeViewLineTablesGroupDescription =
"CodeView Line Tables";
@@ -211,8 +215,10 @@ const DataLayout &AsmPrinter::getDataLayout() const {
}
// Do not use the cached DataLayout because some client use it without a Module
-// (llvm-dsymutil, llvm-dwarfdump).
-unsigned AsmPrinter::getPointerSize() const { return TM.getPointerSize(); }
+// (dsymutil, llvm-dwarfdump).
+unsigned AsmPrinter::getPointerSize() const {
+ return TM.getPointerSize(0); // FIXME: Default address space
+}
const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
assert(MF && "getSubtargetInfo requires a valid MachineFunction!");
@@ -234,7 +240,6 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineModuleInfo>();
AU.addRequired<MachineOptimizationRemarkEmitterPass>();
AU.addRequired<GCModuleInfo>();
- AU.addRequired<MachineLoopInfo>();
}
bool AsmPrinter::doInitialization(Module &M) {
@@ -246,7 +251,7 @@ bool AsmPrinter::doInitialization(Module &M) {
OutStreamer->InitSections(false);
- // Emit the version-min deplyment target directive if needed.
+ // Emit the version-min deployment target directive if needed.
//
// FIXME: If we end up with a collection of these sorts of Darwin-specific
// or ELF-specific things, it may make sense to have a platform helper class
@@ -291,8 +296,7 @@ bool AsmPrinter::doInitialization(Module &M) {
if (MAI->doesSupportDebugInformation()) {
bool EmitCodeView = MMI->getModule()->getCodeViewFlag();
- if (EmitCodeView && (TM.getTargetTriple().isKnownWindowsMSVCEnvironment() ||
- TM.getTargetTriple().isWindowsItaniumEnvironment())) {
+ if (EmitCodeView && TM.getTargetTriple().isOSWindows()) {
Handlers.push_back(HandlerInfo(new CodeViewDebug(this),
DbgTimerName, DbgTimerDescription,
CodeViewLineTablesGroupName,
@@ -350,10 +354,20 @@ bool AsmPrinter::doInitialization(Module &M) {
break;
}
break;
+ case ExceptionHandling::Wasm:
+ // TODO; this case exists only to prevent an unhandled-enum warning.
+ break;
}
if (ES)
Handlers.push_back(HandlerInfo(ES, EHTimerName, EHTimerDescription,
DWARFGroupName, DWARFGroupDescription));
+
+ if (mdconst::extract_or_null<ConstantInt>(
+ MMI->getModule()->getModuleFlag("cfguard")))
+ Handlers.push_back(HandlerInfo(new WinCFGuard(this), CFGuardName,
+ CFGuardDescription, DWARFGroupName,
+ DWARFGroupDescription));
+
return false;
}
@@ -361,7 +375,7 @@ static bool canBeHidden(const GlobalValue *GV, const MCAsmInfo &MAI) {
if (!MAI.hasWeakDefCanBeHiddenDirective())
return false;
- return canBeOmittedFromSymbolTable(GV);
+ return GV->canBeOmittedFromSymbolTable();
}
void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
@@ -416,7 +430,7 @@ MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
/// EmitGlobalVariable - Emit the specified global variable to the .s file.
void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
- bool IsEmuTLSVar = TM.Options.EmulatedTLS && GV->isThreadLocal();
+ bool IsEmuTLSVar = TM.useEmulatedTLS() && GV->isThreadLocal();
assert(!(IsEmuTLSVar && GV->hasCommonLinkage()) &&
"No emulated TLS variables in the common section");
@@ -898,6 +912,30 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
return true;
}
+/// This method handles the target-independent form of DBG_LABEL, returning
+/// true if it was able to do so. A false return means the target will need
+/// to handle MI in EmitInstruction.
+static bool emitDebugLabelComment(const MachineInstr *MI, AsmPrinter &AP) {
+ if (MI->getNumOperands() != 1)
+ return false;
+
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "DEBUG_LABEL: ";
+
+ const DILabel *V = MI->getDebugLabel();
+ if (auto *SP = dyn_cast<DISubprogram>(V->getScope())) {
+ StringRef Name = SP->getName();
+ if (!Name.empty())
+ OS << Name << ":";
+ }
+ OS << V->getName();
+
+ // NOTE: Want this comment at start of line, don't emit with AddComment.
+ AP.OutStreamer->emitRawComment(OS.str());
+ return true;
+}
+
AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() const {
if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI &&
MF->getFunction().needsUnwindTableEntry())
@@ -952,7 +990,8 @@ void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) {
if (!MF.getTarget().Options.EmitStackSizeSection)
return;
- MCSection *StackSizeSection = getObjFileLowering().getStackSizesSection();
+ MCSection *StackSizeSection =
+ getObjFileLowering().getStackSizesSection(*getCurrentSection());
if (!StackSizeSection)
return;
@@ -964,10 +1003,9 @@ void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) {
OutStreamer->PushSection();
OutStreamer->SwitchSection(StackSizeSection);
- const MCSymbol *FunctionSymbol = getSymbol(&MF.getFunction());
+ const MCSymbol *FunctionSymbol = getFunctionBegin();
uint64_t StackSize = FrameInfo.getStackSize();
- OutStreamer->EmitValue(MCSymbolRefExpr::create(FunctionSymbol, OutContext),
- /* size = */ 8);
+ OutStreamer->EmitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
OutStreamer->EmitULEB128IntValue(StackSize);
OutStreamer->PopSection();
@@ -996,6 +1034,24 @@ void AsmPrinter::EmitFunctionBody() {
bool ShouldPrintDebugScopes = MMI->hasDebugInfo();
+ if (isVerbose()) {
+ // Get MachineDominatorTree or compute it on the fly if it's unavailable
+ MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ if (!MDT) {
+ OwnedMDT = make_unique<MachineDominatorTree>();
+ OwnedMDT->getBase().recalculate(*MF);
+ MDT = OwnedMDT.get();
+ }
+
+ // Get MachineLoopInfo or compute it on the fly if it's unavailable
+ MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ if (!MLI) {
+ OwnedMLI = make_unique<MachineLoopInfo>();
+ OwnedMLI->getBase().analyze(MDT->getBase());
+ MLI = OwnedMLI.get();
+ }
+ }
+
// Print out code for the function.
bool HasAnyRealCode = false;
int NumInstsInFunction = 0;
@@ -1005,7 +1061,7 @@ void AsmPrinter::EmitFunctionBody() {
for (auto &MI : MBB) {
// Print the assembly for the instruction.
if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
- !MI.isDebugValue()) {
+ !MI.isDebugInstr()) {
HasAnyRealCode = true;
++NumInstsInFunction;
}
@@ -1044,6 +1100,12 @@ void AsmPrinter::EmitFunctionBody() {
EmitInstruction(&MI);
}
break;
+ case TargetOpcode::DBG_LABEL:
+ if (isVerbose()) {
+ if (!emitDebugLabelComment(&MI, *this))
+ EmitInstruction(&MI);
+ }
+ break;
case TargetOpcode::IMPLICIT_DEF:
if (isVerbose()) emitImplicitDef(&MI);
break;
@@ -1155,7 +1217,7 @@ void AsmPrinter::EmitFunctionBody() {
OutStreamer->AddBlankLine();
}
-/// \brief Compute the number of Global Variables that uses a Constant.
+/// Compute the number of Global Variables that use a Constant.
static unsigned getNumGlobalVariableUses(const Constant *C) {
if (!C)
return 0;
@@ -1170,7 +1232,7 @@ static unsigned getNumGlobalVariableUses(const Constant *C) {
return NumUses;
}
-/// \brief Only consider global GOT equivalents if at least one user is a
+/// Only consider global GOT equivalents if at least one user is a
/// cstexpr inside an initializer of another global variables. Also, don't
/// handle cstexpr inside instructions. During global variable emission,
/// candidates are skipped and are emitted later in case at least one cstexpr
@@ -1193,7 +1255,7 @@ static bool isGOTEquivalentCandidate(const GlobalVariable *GV,
return NumGOTEquivUsers > 0;
}
-/// \brief Unnamed constant global variables solely contaning a pointer to
+/// Unnamed constant global variables solely containing a pointer to
/// another globals variable is equivalent to a GOT table entry; it contains the
/// the address of another symbol. Optimize it and replace accesses to these
/// "GOT equivalents" by using the GOT entry for the final global instead.
@@ -1214,7 +1276,7 @@ void AsmPrinter::computeGlobalGOTEquivs(Module &M) {
}
}
-/// \brief Constant expressions using GOT equivalent globals may not be eligible
+/// Constant expressions using GOT equivalent globals may not be eligible
/// for PC relative GOT entry conversion, in such cases we need to emit such
/// globals we previously omitted in EmitGlobalVariable.
void AsmPrinter::emitGlobalGOTEquivs() {
@@ -1312,7 +1374,7 @@ bool AsmPrinter::doFinalization(Module &M) {
const TargetLoweringObjectFile &TLOF = getObjFileLowering();
- TLOF.emitModuleMetadata(*OutStreamer, M, TM);
+ TLOF.emitModuleMetadata(*OutStreamer, M);
if (TM.getTargetTriple().isOSBinFormatELF()) {
MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
@@ -1323,6 +1385,7 @@ bool AsmPrinter::doFinalization(Module &M) {
OutStreamer->SwitchSection(TLOF.getDataSection());
const DataLayout &DL = M.getDataLayout();
+ EmitAlignment(Log2_32(DL.getPointerSize()));
for (const auto &Stub : Stubs) {
OutStreamer->EmitLabel(Stub.first);
OutStreamer->EmitSymbolValue(Stub.second.getPointer(),
@@ -1421,6 +1484,61 @@ bool AsmPrinter::doFinalization(Module &M) {
if (MCSection *S = MAI->getNonexecutableStackSection(OutContext))
OutStreamer->SwitchSection(S);
+ if (TM.getTargetTriple().isOSBinFormatCOFF()) {
+ // Emit /EXPORT: flags for each exported global as necessary.
+ const auto &TLOF = getObjFileLowering();
+ std::string Flags;
+
+ for (const GlobalValue &GV : M.global_values()) {
+ raw_string_ostream OS(Flags);
+ TLOF.emitLinkerFlagsForGlobal(OS, &GV);
+ OS.flush();
+ if (!Flags.empty()) {
+ OutStreamer->SwitchSection(TLOF.getDrectveSection());
+ OutStreamer->EmitBytes(Flags);
+ }
+ Flags.clear();
+ }
+
+ // Emit /INCLUDE: flags for each used global as necessary.
+ if (const auto *LU = M.getNamedGlobal("llvm.used")) {
+ assert(LU->hasInitializer() &&
+ "expected llvm.used to have an initializer");
+ assert(isa<ArrayType>(LU->getValueType()) &&
+ "expected llvm.used to be an array type");
+ if (const auto *A = cast<ConstantArray>(LU->getInitializer())) {
+ for (const Value *Op : A->operands()) {
+ const auto *GV =
+ cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases());
+ // Global symbols with internal or private linkage are not visible to
+ // the linker, and thus would cause an error when the linker tried to
+ // preserve the symbol due to the `/include:` directive.
+ if (GV->hasLocalLinkage())
+ continue;
+
+ raw_string_ostream OS(Flags);
+ TLOF.emitLinkerFlagsForUsed(OS, GV);
+ OS.flush();
+
+ if (!Flags.empty()) {
+ OutStreamer->SwitchSection(TLOF.getDrectveSection());
+ OutStreamer->EmitBytes(Flags);
+ }
+ Flags.clear();
+ }
+ }
+ }
+ }
+
+ if (TM.Options.EmitAddrsig) {
+ // Emit address-significance attributes for all globals.
+ OutStreamer->EmitAddrsig();
+ for (const GlobalValue &GV : M.global_values())
+ if (!GV.isThreadLocal() && !GV.getName().startswith("llvm.") &&
+ !GV.hasAtLeastLocalUnnamedAddr())
+ OutStreamer->EmitAddrsigSym(getSymbol(&GV));
+ }
+
// Allow the target to emit any magic that it wants at the end of the file,
// after everything else has gone out.
EmitEndOfAsmFile(M);
@@ -1429,6 +1547,8 @@ bool AsmPrinter::doFinalization(Module &M) {
OutStreamer->Finish();
OutStreamer->reset();
+ OwnedMLI.reset();
+ OwnedMDT.reset();
return false;
}
@@ -1447,14 +1567,14 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
CurrentFnBegin = nullptr;
CurExceptionSym = nullptr;
bool NeedsLocalForSize = MAI->needsLocalForSize();
- if (needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize) {
+ if (needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize ||
+ MF.getTarget().Options.EmitStackSizeSection) {
CurrentFnBegin = createTempSymbol("func_begin");
if (NeedsLocalForSize)
CurrentFnSymForSize = CurrentFnBegin;
}
ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
- LI = &getAnalysis<MachineLoopInfo>();
const TargetSubtargetInfo &STI = MF.getSubtarget();
EnablePrintSchedInfo = PrintSchedule.getNumOccurrences()
@@ -1842,22 +1962,27 @@ void AsmPrinter::EmitModuleIdents(Module &M) {
// Emission and print routines
//
-/// EmitInt8 - Emit a byte directive and value.
+/// Emit a byte directive and value.
///
-void AsmPrinter::EmitInt8(int Value) const {
+void AsmPrinter::emitInt8(int Value) const {
OutStreamer->EmitIntValue(Value, 1);
}
-/// EmitInt16 - Emit a short directive and value.
-void AsmPrinter::EmitInt16(int Value) const {
+/// Emit a short directive and value.
+void AsmPrinter::emitInt16(int Value) const {
OutStreamer->EmitIntValue(Value, 2);
}
-/// EmitInt32 - Emit a long directive and value.
-void AsmPrinter::EmitInt32(int Value) const {
+/// Emit a long directive and value.
+void AsmPrinter::emitInt32(int Value) const {
OutStreamer->EmitIntValue(Value, 4);
}
+/// Emit a long long directive and value.
+void AsmPrinter::emitInt64(uint64_t Value) const {
+ OutStreamer->EmitIntValue(Value, 8);
+}
+
/// Emit something like ".long Hi-Lo" where the size in bytes of the directive
/// is specified by Size and Hi/Lo specify the labels. This implicitly uses
/// .set if it avoids relocations.
@@ -2069,6 +2194,7 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C,
uint64_t Offset = 0);
static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP);
+static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP);
/// isRepeatedByteSequence - Determine whether the given value is
/// composed of a repeated sequence of identical bytes and return the
@@ -2146,13 +2272,15 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
ElementByteSize);
}
} else {
+ Type *ET = CDS->getElementType();
for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
- emitGlobalConstantFP(cast<ConstantFP>(CDS->getElementAsConstant(I)), AP);
+ emitGlobalConstantFP(CDS->getElementAsAPFloat(I), ET, AP);
}
unsigned Size = DL.getTypeAllocSize(CDS->getType());
unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
CDS->getNumElements();
+ assert(EmittedSize <= Size && "Size cannot be less than EmittedSize!");
if (unsigned Padding = Size - EmittedSize)
AP.OutStreamer->EmitZeros(Padding);
}
@@ -2216,17 +2344,17 @@ static void emitGlobalConstantStruct(const DataLayout &DL,
"Layout of constant struct may be incorrect!");
}
-static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
- APInt API = CFP->getValueAPF().bitcastToAPInt();
+static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) {
+ APInt API = APF.bitcastToAPInt();
// First print a comment with what we think the original floating-point value
// should have been.
if (AP.isVerbose()) {
SmallString<8> StrVal;
- CFP->getValueAPF().toString(StrVal);
+ APF.toString(StrVal);
- if (CFP->getType())
- CFP->getType()->print(AP.OutStreamer->GetCommentOS());
+ if (ET)
+ ET->print(AP.OutStreamer->GetCommentOS());
else
AP.OutStreamer->GetCommentOS() << "Printing <null> Type";
AP.OutStreamer->GetCommentOS() << ' ' << StrVal << '\n';
@@ -2241,7 +2369,7 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
// PPC's long double has odd notions of endianness compared to how LLVM
// handles it: p[0] goes first for *big* endian on PPC.
- if (AP.getDataLayout().isBigEndian() && !CFP->getType()->isPPC_FP128Ty()) {
+ if (AP.getDataLayout().isBigEndian() && !ET->isPPC_FP128Ty()) {
int Chunk = API.getNumWords() - 1;
if (TrailingBytes)
@@ -2260,8 +2388,11 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
// Emit the tail padding for the long double.
const DataLayout &DL = AP.getDataLayout();
- AP.OutStreamer->EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
- DL.getTypeStoreSize(CFP->getType()));
+ AP.OutStreamer->EmitZeros(DL.getTypeAllocSize(ET) - DL.getTypeStoreSize(ET));
+}
+
+static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
+ emitGlobalConstantFP(CFP->getValueAPF(), CFP->getType(), AP);
}
static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
@@ -2320,7 +2451,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
}
}
-/// \brief Transform a not absolute MCExpr containing a reference to a GOT
+/// Transform a non-absolute MCExpr containing a reference to a GOT
/// equivalent global, by a target specific GOT pc relative access to the
/// final symbol.
static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME,
@@ -2533,6 +2664,25 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
/// GetCPISymbol - Return the symbol for the specified constant pool entry.
MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
+ if (getSubtargetInfo().getTargetTriple().isKnownWindowsMSVCEnvironment()) {
+ const MachineConstantPoolEntry &CPE =
+ MF->getConstantPool()->getConstants()[CPID];
+ if (!CPE.isMachineConstantPoolEntry()) {
+ const DataLayout &DL = MF->getDataLayout();
+ SectionKind Kind = CPE.getSectionKind(&DL);
+ const Constant *C = CPE.Val.ConstVal;
+ unsigned Align = CPE.Alignment;
+ if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
+ getObjFileLowering().getSectionForConstant(DL, Kind, C, Align))) {
+ if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+ if (Sym->isUndefined())
+ OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
+ return Sym;
+ }
+ }
+ }
+ }
+
const DataLayout &DL = getDataLayout();
return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
"CPI" + Twine(getFunctionNumber()) + "_" +
@@ -2631,13 +2781,9 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB,
MCCodePaddingContext &Context) const {
assert(MF != nullptr && "Machine function must be valid");
- assert(LI != nullptr && "Loop info must be valid");
Context.IsPaddingActive = !MF->hasInlineAsm() &&
!MF->getFunction().optForSize() &&
TM.getOptLevel() != CodeGenOpt::None;
- const MachineLoop *CurrentLoop = LI->getLoopFor(&MBB);
- Context.IsBasicBlockInsideInnermostLoop =
- CurrentLoop != nullptr && CurrentLoop->getSubLoops().empty();
Context.IsBasicBlockReachableViaFallthrough =
std::find(MBB.pred_begin(), MBB.pred_end(), MBB.getPrevNode()) !=
MBB.pred_end();
@@ -2689,7 +2835,9 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
OutStreamer->GetCommentOS() << '\n';
}
}
- emitBasicBlockLoopComments(MBB, LI, *this);
+
+ assert(MLI != nullptr && "MachineLoopInfo should have been computed");
+ emitBasicBlockLoopComments(MBB, MLI, *this);
}
// Print the main label for the block.
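
The tail-padding logic in emitGlobalConstantFP above zero-fills a constant from its store size up to its alloc size. A small self-contained sketch of that idea follows, with hypothetical sizes (DataLayout supplies the real ones); the x87 80-bit long double is the typical case, roughly 10 bytes stored and 16 allocated.

#include <cstdint>
#include <vector>

int main() {
  const uint64_t StoreSize = 10; // Bytes the value actually occupies (hypothetical).
  const uint64_t AllocSize = 16; // Bytes reserved for it in the object (hypothetical).
  std::vector<uint8_t> Bytes(StoreSize, 0xAA);          // The value's payload bytes.
  Bytes.insert(Bytes.end(), AllocSize - StoreSize, 0);  // Tail padding with zeros.
  return Bytes.size() == AllocSize ? 0 : 1;
}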
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 08eb14e242c5..605588470670 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -26,6 +25,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -43,15 +43,6 @@ void AsmPrinter::EmitSLEB128(int64_t Value, const char *Desc) const {
OutStreamer->EmitSLEB128IntValue(Value);
}
-/// EmitULEB128 - emit the specified unsigned leb128 value.
-void AsmPrinter::EmitPaddedULEB128(uint64_t Value, unsigned PadTo,
- const char *Desc) const {
- if (isVerbose() && Desc)
- OutStreamer->AddComment(Desc);
-
- OutStreamer->EmitPaddedULEB128IntValue(Value, PadTo);
-}
-
void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc) const {
if (isVerbose() && Desc)
OutStreamer->AddComment(Desc);
@@ -59,6 +50,12 @@ void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc) const {
OutStreamer->EmitULEB128IntValue(Value);
}
+/// Emit something like ".uleb128 Hi-Lo".
+void AsmPrinter::EmitLabelDifferenceAsULEB128(const MCSymbol *Hi,
+ const MCSymbol *Lo) const {
+ OutStreamer->emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
+}
+
static const char *DecodeDWARFEncoding(unsigned Encoding) {
switch (Encoding) {
case dwarf::DW_EH_PE_absptr:
@@ -67,6 +64,10 @@ static const char *DecodeDWARFEncoding(unsigned Encoding) {
return "omit";
case dwarf::DW_EH_PE_pcrel:
return "pcrel";
+ case dwarf::DW_EH_PE_uleb128:
+ return "uleb128";
+ case dwarf::DW_EH_PE_sleb128:
+ return "sleb128";
case dwarf::DW_EH_PE_udata4:
return "udata4";
case dwarf::DW_EH_PE_udata8:
@@ -167,14 +168,19 @@ void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label,
EmitLabelDifference(Label, Label->getSection().getBeginSymbol(), 4);
}
-void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntryRef S) const {
+void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntry S) const {
if (MAI->doesDwarfUseRelocationsAcrossSections()) {
- emitDwarfSymbolReference(S.getSymbol());
+ assert(S.Symbol && "No symbol available");
+ emitDwarfSymbolReference(S.Symbol);
return;
}
// Just emit the offset directly; no need for symbol math.
- EmitInt32(S.getOffset());
+ emitInt32(S.Offset);
+}
+
+void AsmPrinter::EmitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const {
+ EmitLabelPlusOffset(Label, Offset, MAI->getCodePointerSize());
}
//===----------------------------------------------------------------------===//
@@ -252,7 +258,7 @@ void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
emitDwarfDIE(Child);
OutStreamer->AddComment("End Of Children Mark");
- EmitInt8(0);
+ emitInt8(0);
}
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
index 638226e90a7a..f5ac95a20b10 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
@@ -27,29 +27,29 @@ class MCSymbol;
typedef MCSymbol *ExceptionSymbolProvider(AsmPrinter *Asm);
-/// \brief Collects and handles AsmPrinter objects required to build debug
+/// Collects and handles AsmPrinter objects required to build debug
/// or EH information.
class AsmPrinterHandler {
public:
virtual ~AsmPrinterHandler();
- /// \brief For symbols that have a size designated (e.g. common symbols),
+ /// For symbols that have a size designated (e.g. common symbols),
/// this tracks that size.
virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) = 0;
- /// \brief Emit all sections that should come after the content.
+ /// Emit all sections that should come after the content.
virtual void endModule() = 0;
- /// \brief Gather pre-function debug information.
+ /// Gather pre-function debug information.
/// Every beginFunction(MF) call should be followed by an endFunction(MF)
/// call.
virtual void beginFunction(const MachineFunction *MF) = 0;
- // \brief Emit any of function marker (like .cfi_endproc). This is called
+ // Emit any end-of-function marker (like .cfi_endproc). This is called
// before endFunction and cannot switch sections.
virtual void markFunctionEnd();
- /// \brief Gather post-function debug information.
+ /// Gather post-function debug information.
/// Please note that some AsmPrinter implementations may not call
/// beginFunction at all.
virtual void endFunction(const MachineFunction *MF) = 0;
@@ -58,15 +58,15 @@ public:
ExceptionSymbolProvider ESP) {}
virtual void endFragment() {}
- /// \brief Emit target-specific EH funclet machinery.
+ /// Emit target-specific EH funclet machinery.
virtual void beginFunclet(const MachineBasicBlock &MBB,
MCSymbol *Sym = nullptr) {}
virtual void endFunclet() {}
- /// \brief Process beginning of an instruction.
+ /// Process beginning of an instruction.
virtual void beginInstruction(const MachineInstr *MI) = 0;
- /// \brief Process end of an instruction.
+ /// Process end of an instruction.
virtual void endInstruction() = 0;
};
} // End of namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 04a72ba3d738..4159eb19423a 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -132,6 +132,9 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
std::unique_ptr<MCAsmParser> Parser(
createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
+ // Do not use assembler-level information for parsing inline assembly.
+ OutStreamer->setUseAssemblerInfoForParsing(false);
+
// We create a new MCInstrInfo here since we might be at the module level
// and not have a MachineFunction to initialize the TargetInstrInfo from and
// we only need MCInstrInfo for asm parsing. We create one unconditionally
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
index aaf6180c9404..2163cc7e3e11 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -43,7 +43,7 @@ public:
APByteStreamer(AsmPrinter &Asm) : AP(Asm) {}
void EmitInt8(uint8_t Byte, const Twine &Comment) override {
AP.OutStreamer->AddComment(Comment);
- AP.EmitInt8(Byte);
+ AP.emitInt8(Byte);
}
void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
AP.OutStreamer->AddComment(Comment);
@@ -76,7 +76,7 @@ private:
SmallVectorImpl<char> &Buffer;
SmallVectorImpl<std::string> &Comments;
- /// \brief Only verbose textual output needs comments. This will be set to
+ /// Only verbose textual output needs comments. This will be set to
/// true for that case, and false otherwise. If false, comments passed in to
/// the emit methods will be ignored.
bool GenerateComments;
@@ -93,15 +93,27 @@ public:
}
void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
raw_svector_ostream OSE(Buffer);
- encodeSLEB128(DWord, OSE);
- if (GenerateComments)
+ unsigned Length = encodeSLEB128(DWord, OSE);
+ if (GenerateComments) {
Comments.push_back(Comment.str());
+ // Add some empty comments to keep the Buffer and Comments vectors aligned
+ // with each other.
+ for (size_t i = 1; i < Length; ++i)
+ Comments.push_back("");
+
+ }
}
void EmitULEB128(uint64_t DWord, const Twine &Comment) override {
raw_svector_ostream OSE(Buffer);
- encodeULEB128(DWord, OSE);
- if (GenerateComments)
+ unsigned Length = encodeULEB128(DWord, OSE);
+ if (GenerateComments) {
Comments.push_back(Comment.str());
+ // Add some empty comments to keep the Buffer and Comments vectors aligned
+ // with each other.
+ for (size_t i = 1; i < Length; ++i)
+ Comments.push_back("");
+
+ }
}
};
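
The ByteStreamer change above depends on encodeULEB128 returning how many bytes it wrote, so the Comments vector can be padded to exactly one entry per emitted byte. The following standalone sketch shows the encoding and the padding, assuming a plain std::vector buffer in place of LLVM's streamer; it is an illustration, not the library code.

#include <cstdint>
#include <string>
#include <vector>

// Standalone ULEB128 encoder: 7 payload bits per byte, high bit set while more follow.
static unsigned encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;
    Out.push_back(Byte);
    ++Count;
  } while (Value != 0);
  return Count;
}

int main() {
  std::vector<uint8_t> Buffer;
  std::vector<std::string> Comments;
  unsigned Length = encodeULEB128(624485, Buffer); // Encodes as 0xE5 0x8E 0x26 (3 bytes).
  Comments.push_back("stack size");
  for (unsigned I = 1; I < Length; ++I)
    Comments.push_back(""); // Keep one comment slot per emitted byte.
  return Buffer.size() == Comments.size() ? 0 : 1;
}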
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 1d0a003dc50a..8c5c5478d01a 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -36,7 +36,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
@@ -75,6 +74,7 @@
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -114,6 +114,16 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
StringRef Dir = File->getDirectory(), Filename = File->getFilename();
+ // If this is a Unix-style path, just use it as is. Don't try to canonicalize
+ // it textually because one of the path components could be a symlink.
+ if (!Dir.empty() && Dir[0] == '/') {
+ Filepath = Dir;
+ if (Dir.back() != '/')
+ Filepath += '/';
+ Filepath += Filename;
+ return Filepath;
+ }
+
// Clang emits directory and relative filename info into the IR, but CodeView
// operates on full paths. We could change Clang to emit full paths too, but
// that would increase the IR size and probably not needed for other users.
@@ -165,14 +175,21 @@ unsigned CodeViewDebug::maybeRecordFile(const DIFile *F) {
auto Insertion = FileIdMap.insert(std::make_pair(FullPath, NextId));
if (Insertion.second) {
// We have to compute the full filepath and emit a .cv_file directive.
- std::string Checksum = fromHex(F->getChecksum());
- void *CKMem = OS.getContext().allocate(Checksum.size(), 1);
- memcpy(CKMem, Checksum.data(), Checksum.size());
- ArrayRef<uint8_t> ChecksumAsBytes(reinterpret_cast<const uint8_t *>(CKMem),
- Checksum.size());
- DIFile::ChecksumKind ChecksumKind = F->getChecksumKind();
+ ArrayRef<uint8_t> ChecksumAsBytes;
+ FileChecksumKind CSKind = FileChecksumKind::None;
+ if (F->getChecksum()) {
+ std::string Checksum = fromHex(F->getChecksum()->Value);
+ void *CKMem = OS.getContext().allocate(Checksum.size(), 1);
+ memcpy(CKMem, Checksum.data(), Checksum.size());
+ ChecksumAsBytes = ArrayRef<uint8_t>(
+ reinterpret_cast<const uint8_t *>(CKMem), Checksum.size());
+ switch (F->getChecksum()->Kind) {
+ case DIFile::CSK_MD5: CSKind = FileChecksumKind::MD5; break;
+ case DIFile::CSK_SHA1: CSKind = FileChecksumKind::SHA1; break;
+ }
+ }
bool Success = OS.EmitCVFileDirective(NextId, FullPath, ChecksumAsBytes,
- static_cast<unsigned>(ChecksumKind));
+ static_cast<unsigned>(CSKind));
(void)Success;
assert(Success && ".cv_file directive failed");
}
@@ -358,15 +375,15 @@ unsigned CodeViewDebug::getPointerSizeInBytes() {
}
void CodeViewDebug::recordLocalVariable(LocalVariable &&Var,
- const DILocation *InlinedAt) {
- if (InlinedAt) {
+ const LexicalScope *LS) {
+ if (const DILocation *InlinedAt = LS->getInlinedAt()) {
// This variable was inlined. Associate it with the InlineSite.
const DISubprogram *Inlinee = Var.DIVar->getScope()->getSubprogram();
InlineSite &Site = getInlineSite(InlinedAt, Inlinee);
Site.InlinedLocals.emplace_back(Var);
} else {
- // This variable goes in the main ProcSym.
- CurFn->Locals.emplace_back(Var);
+ // This variable goes into the corresponding lexical scope.
+ ScopeVariables[LS].emplace_back(Var);
}
}
@@ -463,7 +480,7 @@ void CodeViewDebug::endModule() {
// Emit per-function debug information.
for (auto &P : FnDebugInfo)
if (!P.first->isDeclarationForLinker())
- emitDebugInfoForFunction(P.first, P.second);
+ emitDebugInfoForFunction(P.first, *P.second);
// Emit global variable debug information.
setCurrentSubprogram(nullptr);
@@ -501,12 +518,12 @@ void CodeViewDebug::endModule() {
clear();
}
-static void emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S) {
+static void emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S,
+ unsigned MaxFixedRecordLength = 0xF00) {
// The maximum CV record length is 0xFF00. Most of the strings we emit appear
// after a fixed length portion of the record. The fixed length portion should
// always be less than 0xF00 (3840) bytes, so truncate the string so that the
// overall record size is less than the maximum allowed.
- unsigned MaxFixedRecordLength = 0xF00;
SmallString<32> NullTerminatedString(
S.take_front(MaxRecordLength - MaxFixedRecordLength - 1));
NullTerminatedString.push_back('\0');
@@ -517,7 +534,7 @@ void CodeViewDebug::emitTypeInformation() {
if (TypeTable.empty())
return;
- // Start the .debug$T section with 0x4.
+ // Start the .debug$T or .debug$P section with 0x4.
OS.SwitchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection());
emitCodeViewMagicVersion();
@@ -572,7 +589,7 @@ void CodeViewDebug::emitTypeGlobalHashes() {
OS.AddComment("Section Version");
OS.EmitIntValue(0, 2);
OS.AddComment("Hash Algorithm");
- OS.EmitIntValue(uint16_t(GlobalTypeHashAlg::SHA1), 2);
+ OS.EmitIntValue(uint16_t(GlobalTypeHashAlg::SHA1_8), 2);
TypeIndex TI(TypeIndex::FirstNonSimpleIndex);
for (const auto &GHR : TypeTable.hashes()) {
@@ -585,7 +602,7 @@ void CodeViewDebug::emitTypeGlobalHashes() {
OS.AddComment(Comment);
++TI;
}
- assert(GHR.Hash.size() % 20 == 0);
+ assert(GHR.Hash.size() == 8);
StringRef S(reinterpret_cast<const char *>(GHR.Hash.data()),
GHR.Hash.size());
OS.EmitBinaryData(S);
@@ -821,10 +838,61 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
emitCodeViewMagicVersion();
}
+// Emit an S_THUNK32/S_END symbol pair for a thunk routine.
+// The only supported thunk ordinal is currently the standard type.
+void CodeViewDebug::emitDebugInfoForThunk(const Function *GV,
+ FunctionInfo &FI,
+ const MCSymbol *Fn) {
+ std::string FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
+ const ThunkOrdinal ordinal = ThunkOrdinal::Standard; // Only supported kind.
+
+ OS.AddComment("Symbol subsection for " + Twine(FuncName));
+ MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+
+ // Emit S_THUNK32
+ MCSymbol *ThunkRecordBegin = MMI->getContext().createTempSymbol(),
+ *ThunkRecordEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(ThunkRecordEnd, ThunkRecordBegin, 2);
+ OS.EmitLabel(ThunkRecordBegin);
+ OS.AddComment("Record kind: S_THUNK32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_THUNK32), 2);
+ OS.AddComment("PtrParent");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrEnd");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrNext");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("Thunk section relative address");
+ OS.EmitCOFFSecRel32(Fn, /*Offset=*/0);
+ OS.AddComment("Thunk section index");
+ OS.EmitCOFFSectionIndex(Fn);
+ OS.AddComment("Code size");
+ OS.emitAbsoluteSymbolDiff(FI.End, Fn, 2);
+ OS.AddComment("Ordinal");
+ OS.EmitIntValue(unsigned(ordinal), 1);
+ OS.AddComment("Function name");
+ emitNullTerminatedSymbolName(OS, FuncName);
+ // Additional fields specific to the thunk ordinal would go here.
+ OS.EmitLabel(ThunkRecordEnd);
+
+ // Local variables/inlined routines are purposely omitted here. The point of
+ // marking this as a thunk is so Visual Studio will NOT stop in this routine.
+
+ // Emit S_PROC_ID_END
+ const unsigned RecordLengthForSymbolEnd = 2;
+ OS.AddComment("Record length");
+ OS.EmitIntValue(RecordLengthForSymbolEnd, 2);
+ OS.AddComment("Record kind: S_PROC_ID_END");
+ OS.EmitIntValue(unsigned(SymbolKind::S_PROC_ID_END), 2);
+
+ endCVSubsection(SymbolsEnd);
+}
+
void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
FunctionInfo &FI) {
- // For each function there is a separate subsection
- // which holds the PC to file:line table.
+ // For each function there is a separate subsection which holds the PC to
+ // file:line table.
const MCSymbol *Fn = Asm->getSymbol(GV);
assert(Fn);
@@ -836,6 +904,11 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
assert(SP);
setCurrentSubprogram(SP);
+ if (SP->isThunk()) {
+ emitDebugInfoForThunk(GV, FI, Fn);
+ return;
+ }
+
// If we have a display name, build the fully qualified name by walking the
// chain of scopes.
if (!SP->getName().empty())
@@ -898,6 +971,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
OS.EmitLabel(ProcRecordEnd);
emitLocalVariableList(FI.Locals);
+ emitLexicalBlockList(FI.ChildBlocks, FI);
// Emit inlined call site information. Only emit functions inlined directly
// into the parent function. We'll emit the other sites recursively as part
@@ -1018,7 +1092,7 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
LocalVariable Var;
Var.DIVar = VI.Var;
Var.DefRanges.emplace_back(std::move(DefRange));
- recordLocalVariable(std::move(Var), VI.Loc->getInlinedAt());
+ recordLocalVariable(std::move(Var), Scope);
}
}
@@ -1100,7 +1174,7 @@ void CodeViewDebug::calculateRanges(
auto J = std::next(I);
const DIExpression *DIExpr = DVInst->getDebugExpression();
while (J != E &&
- !fragmentsOverlap(DIExpr, J->first->getDebugExpression()))
+ !DIExpr->fragmentsOverlap(J->first->getDebugExpression()))
++J;
if (J != E)
End = getLabelBeforeInsn(J->first);
@@ -1149,14 +1223,15 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
Var.DIVar = DIVar;
calculateRanges(Var, Ranges);
- recordLocalVariable(std::move(Var), InlinedAt);
+ recordLocalVariable(std::move(Var), Scope);
}
}
void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
const Function &GV = MF->getFunction();
- assert(FnDebugInfo.count(&GV) == false);
- CurFn = &FnDebugInfo[&GV];
+ auto Insertion = FnDebugInfo.insert({&GV, llvm::make_unique<FunctionInfo>()});
+ assert(Insertion.second && "function already has info");
+ CurFn = Insertion.first->second.get();
CurFn->FuncId = NextFuncId++;
CurFn->Begin = Asm->getFunctionBegin();
@@ -1261,6 +1336,7 @@ TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
return lowerTypePointer(cast<DIDerivedType>(Ty));
case dwarf::DW_TAG_ptr_to_member_type:
return lowerTypeMemberPointer(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_restrict_type:
case dwarf::DW_TAG_const_type:
case dwarf::DW_TAG_volatile_type:
// TODO: add support for DW_TAG_atomic_type here
@@ -1281,6 +1357,8 @@ TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
return lowerTypeClass(cast<DICompositeType>(Ty));
case dwarf::DW_TAG_union_type:
return lowerTypeUnion(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_unspecified_type:
+ return TypeIndex::None();
default:
// Use the null type index.
return TypeIndex();
@@ -1308,7 +1386,7 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
DITypeRef ElementTypeRef = Ty->getBaseType();
TypeIndex ElementTypeIndex = getTypeIndex(ElementTypeRef);
// IndexType is size_t, which depends on the bitness of the target.
- TypeIndex IndexType = Asm->TM.getPointerSize() == 8
+ TypeIndex IndexType = getPointerSizeInBytes() == 8
? TypeIndex(SimpleTypeKind::UInt64Quad)
: TypeIndex(SimpleTypeKind::UInt32Long);
@@ -1323,7 +1401,9 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
const DISubrange *Subrange = cast<DISubrange>(Element);
assert(Subrange->getLowerBound() == 0 &&
"codeview doesn't support subranges with lower bounds");
- int64_t Count = Subrange->getCount();
+ int64_t Count = -1;
+ if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt*>())
+ Count = CI->getSExtValue();
// Forward declarations of arrays without a size and VLAs use a count of -1.
// Emit a count of zero in these cases to match what MSVC does for arrays
@@ -1441,12 +1521,13 @@ TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) {
return TypeIndex(STK);
}
-TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty) {
+TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty,
+ PointerOptions PO) {
TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType());
- // Pointers to simple types can use SimpleTypeMode, rather than having a
- // dedicated pointer type record.
- if (PointeeTI.isSimple() &&
+ // Pointers to simple types without any options can use SimpleTypeMode, rather
+ // than having a dedicated pointer type record.
+ if (PointeeTI.isSimple() && PO == PointerOptions::None &&
PointeeTI.getSimpleMode() == SimpleTypeMode::Direct &&
Ty->getTag() == dwarf::DW_TAG_pointer_type) {
SimpleTypeMode Mode = Ty->getSizeInBits() == 64
@@ -1470,10 +1551,7 @@ TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty) {
PM = PointerMode::RValueReference;
break;
}
- // FIXME: MSVC folds qualifiers into PointerOptions in the context of a method
- // 'this' pointer, but not normal contexts. Figure out what we're supposed to
- // do.
- PointerOptions PO = PointerOptions::None;
+
PointerRecord PR(PointeeTI, PK, PM, PO, Ty->getSizeInBits() / 8);
return TypeTable.writeLeafType(PR);
}
@@ -1511,16 +1589,17 @@ translatePtrToMemberRep(unsigned SizeInBytes, bool IsPMF, unsigned Flags) {
llvm_unreachable("invalid ptr to member representation");
}
-TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty) {
+TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty,
+ PointerOptions PO) {
assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type);
TypeIndex ClassTI = getTypeIndex(Ty->getClassType());
TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType(), Ty->getClassType());
- PointerKind PK = Asm->TM.getPointerSize() == 8 ? PointerKind::Near64
- : PointerKind::Near32;
+ PointerKind PK = getPointerSizeInBytes() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
bool IsPMF = isa<DISubroutineType>(Ty->getBaseType());
PointerMode PM = IsPMF ? PointerMode::PointerToMemberFunction
: PointerMode::PointerToDataMember;
- PointerOptions PO = PointerOptions::None; // FIXME
+
assert(Ty->getSizeInBits() / 8 <= 0xff && "pointer size too big");
uint8_t SizeInBytes = Ty->getSizeInBits() / 8;
MemberPointerInfo MPI(
@@ -1545,6 +1624,7 @@ static CallingConvention dwarfCCToCodeView(unsigned DwarfCC) {
TypeIndex CodeViewDebug::lowerTypeModifier(const DIDerivedType *Ty) {
ModifierOptions Mods = ModifierOptions::None;
+ PointerOptions PO = PointerOptions::None;
bool IsModifier = true;
const DIType *BaseTy = Ty;
while (IsModifier && BaseTy) {
@@ -1552,9 +1632,16 @@ TypeIndex CodeViewDebug::lowerTypeModifier(const DIDerivedType *Ty) {
switch (BaseTy->getTag()) {
case dwarf::DW_TAG_const_type:
Mods |= ModifierOptions::Const;
+ PO |= PointerOptions::Const;
break;
case dwarf::DW_TAG_volatile_type:
Mods |= ModifierOptions::Volatile;
+ PO |= PointerOptions::Volatile;
+ break;
+ case dwarf::DW_TAG_restrict_type:
+ // Only pointer types can be marked with __restrict. There is no known flag
+ // for __restrict in LF_MODIFIER records.
+ PO |= PointerOptions::Restrict;
break;
default:
IsModifier = false;
@@ -1563,7 +1650,31 @@ TypeIndex CodeViewDebug::lowerTypeModifier(const DIDerivedType *Ty) {
if (IsModifier)
BaseTy = cast<DIDerivedType>(BaseTy)->getBaseType().resolve();
}
+
+ // Check if the inner type will use an LF_POINTER record. If so, the
+ // qualifiers will go in the LF_POINTER record. This comes up for types like
+ // 'int *const' and 'int *__restrict', not the more common cases like 'const
+ // char *'.
+ if (BaseTy) {
+ switch (BaseTy->getTag()) {
+ case dwarf::DW_TAG_pointer_type:
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_rvalue_reference_type:
+ return lowerTypePointer(cast<DIDerivedType>(BaseTy), PO);
+ case dwarf::DW_TAG_ptr_to_member_type:
+ return lowerTypeMemberPointer(cast<DIDerivedType>(BaseTy), PO);
+ default:
+ break;
+ }
+ }
+
TypeIndex ModifiedTI = getTypeIndex(BaseTy);
+
+ // Return the base type index if there aren't any modifiers. For example, the
+ // metadata could contain restrict wrappers around non-pointer types.
+ if (Mods == ModifierOptions::None)
+ return ModifiedTI;
+
ModifierRecord MR(ModifiedTI, Mods);
return TypeTable.writeLeafType(MR);
}
@@ -1573,6 +1684,11 @@ TypeIndex CodeViewDebug::lowerTypeFunction(const DISubroutineType *Ty) {
for (DITypeRef ArgTypeRef : Ty->getTypeArray())
ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef));
+ // MSVC uses the 'none' type for the variadic argument.
+ if (ReturnAndArgTypeIndices.size() > 1 &&
+ ReturnAndArgTypeIndices.back() == TypeIndex::Void()) {
+ ReturnAndArgTypeIndices.back() = TypeIndex::None();
+ }
TypeIndex ReturnTypeIndex = TypeIndex::Void();
ArrayRef<TypeIndex> ArgTypeIndices = None;
if (!ReturnAndArgTypeIndices.empty()) {
@@ -1602,6 +1718,11 @@ TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty,
for (DITypeRef ArgTypeRef : Ty->getTypeArray())
ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef));
+ // MSVC uses the 'none' type for the variadic argument.
+ if (ReturnAndArgTypeIndices.size() > 1 &&
+ ReturnAndArgTypeIndices.back() == TypeIndex::Void()) {
+ ReturnAndArgTypeIndices.back() = TypeIndex::None();
+ }
TypeIndex ReturnTypeIndex = TypeIndex::Void();
ArrayRef<TypeIndex> ArgTypeIndices = None;
if (!ReturnAndArgTypeIndices.empty()) {
@@ -1716,6 +1837,26 @@ static ClassOptions getCommonClassOptions(const DICompositeType *Ty) {
return CO;
}
+void CodeViewDebug::addUDTSrcLine(const DIType *Ty, TypeIndex TI) {
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_enumeration_type:
+ break;
+ default:
+ return;
+ }
+
+ if (const auto *File = Ty->getFile()) {
+ StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(File));
+ TypeIndex SIDI = TypeTable.writeLeafType(SIDR);
+
+ UdtSourceLineRecord USLR(TI, SIDI, Ty->getLine());
+ TypeTable.writeLeafType(USLR);
+ }
+}
+
TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) {
ClassOptions CO = getCommonClassOptions(Ty);
TypeIndex FTI;
@@ -1744,7 +1885,11 @@ TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) {
EnumRecord ER(EnumeratorCount, CO, FTI, FullName, Ty->getIdentifier(),
getTypeIndex(Ty->getBaseType()));
- return TypeTable.writeLeafType(ER);
+ TypeIndex EnumTI = TypeTable.writeLeafType(ER);
+
+ addUDTSrcLine(Ty, EnumTI);
+
+ return EnumTI;
}
//===----------------------------------------------------------------------===//
@@ -1793,12 +1938,33 @@ void CodeViewDebug::collectMemberInfo(ClassInfo &Info,
Info.Members.push_back({DDTy, 0});
return;
}
- // An unnamed member must represent a nested struct or union. Add all the
- // indirect fields to the current record.
+
+ // An unnamed member may represent a nested struct or union. Attempt to
+ // interpret the unnamed member as a DICompositeType possibly wrapped in
+ // qualifier types. Add all the indirect fields to the current record if that
+ // succeeds, and drop the member if that fails.
assert((DDTy->getOffsetInBits() % 8) == 0 && "Unnamed bitfield member!");
uint64_t Offset = DDTy->getOffsetInBits();
const DIType *Ty = DDTy->getBaseType().resolve();
- const DICompositeType *DCTy = cast<DICompositeType>(Ty);
+ bool FullyResolved = false;
+ while (!FullyResolved) {
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ // FIXME: we should apply the qualifier types to the indirect fields
+ // rather than dropping them.
+ Ty = cast<DIDerivedType>(Ty)->getBaseType().resolve();
+ break;
+ default:
+ FullyResolved = true;
+ break;
+ }
+ }
+
+ const DICompositeType *DCTy = dyn_cast<DICompositeType>(Ty);
+ if (!DCTy)
+ return;
+
ClassInfo NestedInfo = collectClassInfo(DCTy);
for (const ClassInfo::MemberInfo &IndirectField : NestedInfo.Members)
Info.Members.push_back(
@@ -1838,7 +2004,28 @@ ClassInfo CodeViewDebug::collectClassInfo(const DICompositeType *Ty) {
return Info;
}
+static bool shouldAlwaysEmitCompleteClassType(const DICompositeType *Ty) {
+ // This routine is used by lowerTypeClass and lowerTypeUnion to determine
+ // if a complete type should be emitted instead of a forward reference.
+ return Ty->getName().empty() && Ty->getIdentifier().empty() &&
+ !Ty->isForwardDecl();
+}
+
TypeIndex CodeViewDebug::lowerTypeClass(const DICompositeType *Ty) {
+ // Emit the complete type for unnamed structs. C++ classes with methods
+ // which have a circular reference back to the class type are expected to
+ // be named by the front-end and should not be "unnamed". C unnamed
+ // structs should not have circular references.
+ if (shouldAlwaysEmitCompleteClassType(Ty)) {
+ // If this unnamed complete type is already in the process of being defined
+ // then the description of the type is malformed and cannot be emitted
+ // into CodeView correctly so report a fatal error.
+ auto I = CompleteTypeIndices.find(Ty);
+ if (I != CompleteTypeIndices.end() && I->second == TypeIndex())
+ report_fatal_error("cannot debug circular reference to unnamed type");
+ return getCompleteTypeIndex(Ty);
+ }
+
// First, construct the forward decl. Don't look into Ty to compute the
// forward decl options, since it might not be available in all TUs.
TypeRecordKind Kind = getRecordKind(Ty);
@@ -1875,13 +2062,7 @@ TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) {
SizeInBytes, FullName, Ty->getIdentifier());
TypeIndex ClassTI = TypeTable.writeLeafType(CR);
- if (const auto *File = Ty->getFile()) {
- StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(File));
- TypeIndex SIDI = TypeTable.writeLeafType(SIDR);
-
- UdtSourceLineRecord USLR(ClassTI, SIDI, Ty->getLine());
- TypeTable.writeLeafType(USLR);
- }
+ addUDTSrcLine(Ty, ClassTI);
addToUDTs(Ty);
@@ -1889,6 +2070,10 @@ TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) {
}
TypeIndex CodeViewDebug::lowerTypeUnion(const DICompositeType *Ty) {
+ // Emit the complete type for unnamed unions.
+ if (shouldAlwaysEmitCompleteClassType(Ty))
+ return getCompleteTypeIndex(Ty);
+
ClassOptions CO =
ClassOptions::ForwardReference | getCommonClassOptions(Ty);
std::string FullName = getFullyQualifiedName(Ty);
@@ -1917,11 +2102,7 @@ TypeIndex CodeViewDebug::lowerCompleteTypeUnion(const DICompositeType *Ty) {
Ty->getIdentifier());
TypeIndex UnionTI = TypeTable.writeLeafType(UR);
- StringIdRecord SIR(TypeIndex(0x0), getFullFilepath(Ty->getFile()));
- TypeIndex SIRI = TypeTable.writeLeafType(SIR);
-
- UdtSourceLineRecord USLR(UnionTI, SIRI, Ty->getLine());
- TypeTable.writeLeafType(USLR);
+ addUDTSrcLine(Ty, UnionTI);
addToUDTs(Ty);
@@ -1943,8 +2124,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
for (const DIDerivedType *I : Info.Inheritance) {
if (I->getFlags() & DINode::FlagVirtual) {
// Virtual base.
- // FIXME: Emit VBPtrOffset when the frontend provides it.
- unsigned VBPtrOffset = 0;
+ unsigned VBPtrOffset = I->getVBPtrOffset();
// FIXME: Despite the accessor name, the offset is really in bytes.
unsigned VBTableIndex = I->getOffsetInBits() / 4;
auto RecordKind = (I->getFlags() & DINode::FlagIndirectVirtualBase) == DINode::FlagIndirectVirtualBase
@@ -1956,6 +2136,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
VBTableIndex);
ContinuationBuilder.writeMemberType(VBCR);
+ MemberCount++;
} else {
assert(I->getOffsetInBits() % 8 == 0 &&
"bases must be on byte boundaries");
@@ -1963,6 +2144,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
getTypeIndex(I->getBaseType()),
I->getOffsetInBits() / 8);
ContinuationBuilder.writeMemberType(BCR);
+ MemberCount++;
}
}
@@ -2121,9 +2303,7 @@ TypeIndex CodeViewDebug::getCompleteTypeIndex(DITypeRef TypeRef) {
return getTypeIndex(Ty);
}
- // Check if we've already translated the complete record type. Lowering a
- // complete type should never trigger lowering another complete type, so we
- // can reuse the hash table lookup result.
+ // Check if we've already translated the complete record type.
const auto *CTy = cast<DICompositeType>(Ty);
auto InsertResult = CompleteTypeIndices.insert({CTy, TypeIndex()});
if (!InsertResult.second)
@@ -2134,13 +2314,16 @@ TypeIndex CodeViewDebug::getCompleteTypeIndex(DITypeRef TypeRef) {
// Make sure the forward declaration is emitted first. It's unclear if this
// is necessary, but MSVC does it, and we should follow suit until we can show
// otherwise.
- TypeIndex FwdDeclTI = getTypeIndex(CTy);
+ // We only emit a forward declaration for named types.
+ if (!CTy->getName().empty() || !CTy->getIdentifier().empty()) {
+ TypeIndex FwdDeclTI = getTypeIndex(CTy);
- // Just use the forward decl if we don't have complete type info. This might
- // happen if the frontend is using modules and expects the complete definition
- // to be emitted elsewhere.
- if (CTy->isForwardDecl())
- return FwdDeclTI;
+ // Just use the forward decl if we don't have complete type info. This
+ // might happen if the frontend is using modules and expects the complete
+ // definition to be emitted elsewhere.
+ if (CTy->isForwardDecl())
+ return FwdDeclTI;
+ }
TypeIndex TI;
switch (CTy->getTag()) {
@@ -2155,7 +2338,11 @@ TypeIndex CodeViewDebug::getCompleteTypeIndex(DITypeRef TypeRef) {
llvm_unreachable("not a record");
}
- InsertResult.first->second = TI;
+ // Update the type index associated with this CompositeType. This cannot
+ // use the 'InsertResult' iterator above because it is potentially
+ // invalidated by map insertions which can occur while lowering the class
+ // type above.
+ CompleteTypeIndices[CTy] = TI;
return TI;
}
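
For context, the invalidation hazard the comment above guards against is ordinary llvm::DenseMap behavior: any insertion may grow and rehash the table, invalidating outstanding iterators, so the code re-looks up by key instead of writing through the saved iterator. A minimal, self-contained sketch with hypothetical keys and values:

#include "llvm/ADT/DenseMap.h"

void denseMapInvalidationSketch() {
  llvm::DenseMap<int, int> Indices;
  auto It = Indices.insert({1, 0}).first; // iterator into the table
  for (int I = 2; I < 100; ++I)
    Indices.insert({I, 0});               // growth may rehash and invalidate It
  // It->second = 42;                      // unsafe after a possible rehash
  Indices[1] = 42;                         // safe: re-look up by key instead
}
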
@@ -2179,10 +2366,10 @@ void CodeViewDebug::emitLocalVariableList(ArrayRef<LocalVariable> Locals) {
for (const LocalVariable &L : Locals)
if (L.DIVar->isParameter())
Params.push_back(&L);
- std::sort(Params.begin(), Params.end(),
- [](const LocalVariable *L, const LocalVariable *R) {
- return L->DIVar->getArg() < R->DIVar->getArg();
- });
+ llvm::sort(Params.begin(), Params.end(),
+ [](const LocalVariable *L, const LocalVariable *R) {
+ return L->DIVar->getArg() < R->DIVar->getArg();
+ });
for (const LocalVariable *L : Params)
emitLocalVariable(*L);
@@ -2272,15 +2459,150 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) {
}
}
+void CodeViewDebug::emitLexicalBlockList(ArrayRef<LexicalBlock *> Blocks,
+ const FunctionInfo& FI) {
+ for (LexicalBlock *Block : Blocks)
+ emitLexicalBlock(*Block, FI);
+}
+
+/// Emit an S_BLOCK32 and S_END record pair delimiting the contents of a
+/// lexical block scope.
+void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block,
+ const FunctionInfo& FI) {
+ MCSymbol *RecordBegin = MMI->getContext().createTempSymbol(),
+ *RecordEnd = MMI->getContext().createTempSymbol();
+
+ // Lexical block symbol record.
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(RecordEnd, RecordBegin, 2); // Record Length
+ OS.EmitLabel(RecordBegin);
+ OS.AddComment("Record kind: S_BLOCK32");
+ OS.EmitIntValue(SymbolKind::S_BLOCK32, 2); // Record Kind
+ OS.AddComment("PtrParent");
+ OS.EmitIntValue(0, 4); // PtrParent
+ OS.AddComment("PtrEnd");
+ OS.EmitIntValue(0, 4); // PtrEnd
+ OS.AddComment("Code size");
+ OS.emitAbsoluteSymbolDiff(Block.End, Block.Begin, 4); // Code Size
+ OS.AddComment("Function section relative address");
+ OS.EmitCOFFSecRel32(Block.Begin, /*Offset=*/0); // Func Offset
+ OS.AddComment("Function section index");
+ OS.EmitCOFFSectionIndex(FI.Begin); // Func Symbol
+ OS.AddComment("Lexical block name");
+ emitNullTerminatedSymbolName(OS, Block.Name); // Name
+ OS.EmitLabel(RecordEnd);
+
+ // Emit variables local to this lexical block.
+ emitLocalVariableList(Block.Locals);
+
+ // Emit lexical blocks contained within this block.
+ emitLexicalBlockList(Block.Children, FI);
+
+ // Close the lexical block scope.
+ OS.AddComment("Record length");
+ OS.EmitIntValue(2, 2); // Record Length
+ OS.AddComment("Record kind: S_END");
+ OS.EmitIntValue(SymbolKind::S_END, 2); // Record Kind
+}
+
+/// Convenience routine for collecting lexical block information for a list
+/// of lexical scopes.
+void CodeViewDebug::collectLexicalBlockInfo(
+ SmallVectorImpl<LexicalScope *> &Scopes,
+ SmallVectorImpl<LexicalBlock *> &Blocks,
+ SmallVectorImpl<LocalVariable> &Locals) {
+ for (LexicalScope *Scope : Scopes)
+ collectLexicalBlockInfo(*Scope, Blocks, Locals);
+}
+
+/// Populate the lexical blocks and local variable lists of the parent with
+/// information about the specified lexical scope.
+void CodeViewDebug::collectLexicalBlockInfo(
+ LexicalScope &Scope,
+ SmallVectorImpl<LexicalBlock *> &ParentBlocks,
+ SmallVectorImpl<LocalVariable> &ParentLocals) {
+ if (Scope.isAbstractScope())
+ return;
+
+ auto LocalsIter = ScopeVariables.find(&Scope);
+ if (LocalsIter == ScopeVariables.end()) {
+ // This scope does not contain variables and can be eliminated.
+ collectLexicalBlockInfo(Scope.getChildren(), ParentBlocks, ParentLocals);
+ return;
+ }
+ SmallVectorImpl<LocalVariable> &Locals = LocalsIter->second;
+
+ const DILexicalBlock *DILB = dyn_cast<DILexicalBlock>(Scope.getScopeNode());
+ if (!DILB) {
+ // This scope is not a lexical block and can be eliminated, but keep any
+ // local variables it contains.
+ ParentLocals.append(Locals.begin(), Locals.end());
+ collectLexicalBlockInfo(Scope.getChildren(), ParentBlocks, ParentLocals);
+ return;
+ }
+
+ const SmallVectorImpl<InsnRange> &Ranges = Scope.getRanges();
+ if (Ranges.size() != 1 || !getLabelAfterInsn(Ranges.front().second)) {
+ // This lexical block scope has too many address ranges to represent in the
+ // current CodeView format or does not have a valid address range.
+ // Eliminate this lexical scope and promote any locals it contains to the
+ // parent scope.
+ //
+ // For lexical scopes with multiple address ranges you may be tempted to
+ // construct a single range covering every instruction where the block is
+ // live and everything in between. Unfortunately, Visual Studio only
+ // displays variables from the first matching lexical block scope. If the
+ // first lexical block contains exception handling code or cold code which
+ // is moved to the bottom of the routine creating a single range covering
+ // nearly the entire routine, then it will hide all other lexical blocks
+ // and the variables they contain.
+ //
+ ParentLocals.append(Locals.begin(), Locals.end());
+ collectLexicalBlockInfo(Scope.getChildren(), ParentBlocks, ParentLocals);
+ return;
+ }
+
+ // Create a new CodeView lexical block for this lexical scope. If we've
+ // seen this DILexicalBlock before, then the scope tree is malformed and
+ // we can handle this gracefully by not processing it a second time.
+ auto BlockInsertion = CurFn->LexicalBlocks.insert({DILB, LexicalBlock()});
+ if (!BlockInsertion.second)
+ return;
+
+ // Create a lexical block containing the local variables and collect
+ // the lexical block information for the children.
+ const InsnRange &Range = Ranges.front();
+ assert(Range.first && Range.second);
+ LexicalBlock &Block = BlockInsertion.first->second;
+ Block.Begin = getLabelBeforeInsn(Range.first);
+ Block.End = getLabelAfterInsn(Range.second);
+ assert(Block.Begin && "missing label for scope begin");
+ assert(Block.End && "missing label for scope end");
+ Block.Name = DILB->getName();
+ Block.Locals = std::move(Locals);
+ ParentBlocks.push_back(&Block);
+ collectLexicalBlockInfo(Scope.getChildren(), Block.Children, Block.Locals);
+}
+
void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {
const Function &GV = MF->getFunction();
assert(FnDebugInfo.count(&GV));
- assert(CurFn == &FnDebugInfo[&GV]);
+ assert(CurFn == FnDebugInfo[&GV].get());
collectVariableInfo(GV.getSubprogram());
+ // Build the lexical block structure to emit for this routine.
+ if (LexicalScope *CFS = LScopes.getCurrentFunctionScope())
+ collectLexicalBlockInfo(*CFS, CurFn->ChildBlocks, CurFn->Locals);
+
+ // Clear the scope and variable information from the map, which will not be
+ // valid after we have finished processing this routine. This also prepares
+ // the map for the subsequent routine.
+ ScopeVariables.clear();
+
// Don't emit anything if we don't have any line tables.
- if (!CurFn->HaveLineInfo) {
+ // Thunks are compiler-generated and probably won't have source correlation.
+ if (!CurFn->HaveLineInfo && !GV.getSubprogram()->isThunk()) {
FnDebugInfo.erase(&GV);
CurFn = nullptr;
return;
@@ -2296,8 +2618,8 @@ void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {
void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
DebugHandlerBase::beginInstruction(MI);
- // Ignore DBG_VALUE locations and function prologue.
- if (!Asm || !CurFn || MI->isDebugValue() ||
+ // Ignore DBG_VALUE and DBG_LABEL locations and function prologue.
+ if (!Asm || !CurFn || MI->isDebugInstr() ||
MI->getFlag(MachineInstr::FrameSetup))
return;
@@ -2306,7 +2628,7 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
DebugLoc DL = MI->getDebugLoc();
if (!DL && MI->getParent() != PrevInstBB) {
for (const auto &NextMI : *MI->getParent()) {
- if (NextMI.isDebugValue())
+ if (NextMI.isDebugInstr())
continue;
DL = NextMI.getDebugLoc();
if (DL)
@@ -2432,6 +2754,7 @@ void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV,
// FIXME: Thread local data, etc
MCSymbol *DataBegin = MMI->getContext().createTempSymbol(),
*DataEnd = MMI->getContext().createTempSymbol();
+ const unsigned FixedLengthOfThisRecord = 12;
OS.AddComment("Record length");
OS.emitAbsoluteSymbolDiff(DataEnd, DataBegin, 2);
OS.EmitLabel(DataBegin);
@@ -2459,6 +2782,6 @@ void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV,
OS.AddComment("Segment");
OS.EmitCOFFSectionIndex(GVSym);
OS.AddComment("Name");
- emitNullTerminatedSymbolName(OS, DIGV->getName());
+ emitNullTerminatedSymbolName(OS, DIGV->getName(), FixedLengthOfThisRecord);
OS.EmitLabel(DataEnd);
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index 69e93640d7ef..6a0da5f993d0 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -48,7 +48,7 @@ class MCStreamer;
class MCSymbol;
class MachineFunction;
-/// \brief Collects and handles line tables information in a CodeView format.
+/// Collects and handles line tables information in a CodeView format.
class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
MCStreamer &OS;
BumpPtrAllocator Allocator;
@@ -107,9 +107,23 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
unsigned SiteFuncId = 0;
};
+ // Combines information from DILexicalBlock and LexicalScope.
+ struct LexicalBlock {
+ SmallVector<LocalVariable, 1> Locals;
+ SmallVector<LexicalBlock *, 1> Children;
+ const MCSymbol *Begin;
+ const MCSymbol *End;
+ StringRef Name;
+ };
+
// For each function, store a vector of labels to its instructions, as well as
// to the end of the function.
struct FunctionInfo {
+ FunctionInfo() = default;
+
+ // Uncopyable.
+ FunctionInfo(const FunctionInfo &FI) = delete;
+
/// Map from inlined call site to inlined instructions and child inlined
/// call sites. Listed in program order.
std::unordered_map<const DILocation *, InlineSite> InlineSites;
@@ -119,6 +133,11 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
SmallVector<LocalVariable, 1> Locals;
+ std::unordered_map<const DILexicalBlockBase*, LexicalBlock> LexicalBlocks;
+
+ // Lexical blocks containing local variables.
+ SmallVector<LexicalBlock *, 1> ChildBlocks;
+
std::vector<std::pair<MCSymbol *, MDNode *>> Annotations;
const MCSymbol *Begin = nullptr;
@@ -129,6 +148,12 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
};
FunctionInfo *CurFn = nullptr;
+ // Map used to separate variables according to the lexical scope they belong
+ // in. This is populated by recordLocalVariable() before
+ // collectLexicalBlockInfo() separates the variables between the FunctionInfo
+ // and LexicalBlocks.
+ DenseMap<const LexicalScope *, SmallVector<LocalVariable, 1>> ScopeVariables;
+
/// The set of comdat .debug$S sections that we've seen so far. Each section
/// must start with a magic version number that must only be emitted once.
/// This set tracks which sections we've already opened.
@@ -159,7 +184,7 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
/// Remember some debug info about each function. Keep it in a stable order to
/// emit at the end of the TU.
- MapVector<const Function *, FunctionInfo> FnDebugInfo;
+ MapVector<const Function *, std::unique_ptr<FunctionInfo>> FnDebugInfo;
/// Map from full file path to .cv_file id. Full paths are built from DIFiles
/// and are stored in FileToFilepathMap;
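
One plausible reading of the switch to unique_ptr above: FunctionInfo now carries raw pointers into its own LexicalBlocks map (via ChildBlocks and LexicalBlock::Children), and MapVector keeps its values in a contiguous vector that may relocate them as entries are added, so each FunctionInfo is heap-allocated to keep those pointers, and CurFn, stable. A simplified sketch of that ownership pattern, using hypothetical types:

#include "llvm/ADT/MapVector.h"
#include <memory>

struct Info { int Data = 0; };

void stableAddressSketch() {
  llvm::MapVector<int, std::unique_ptr<Info>> Map;
  Info *Cur = (Map[0] = std::make_unique<Info>()).get();
  for (int I = 1; I < 100; ++I)
    Map[I] = std::make_unique<Info>(); // the underlying vector may reallocate...
  Cur->Data = 7;                       // ...but Cur still points at a live Info.
}
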
@@ -200,7 +225,7 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
using FileToFilepathMapTy = std::map<const DIFile *, std::string>;
FileToFilepathMapTy FileToFilepathMap;
- StringRef getFullFilepath(const DIFile *S);
+ StringRef getFullFilepath(const DIFile *File);
unsigned maybeRecordFile(const DIFile *F);
@@ -214,7 +239,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
}
/// Emit the magic version number at the start of a CodeView type or symbol
- /// section. Appears at the front of every .debug$S or .debug$T section.
+ /// section. Appears at the front of every .debug$S or .debug$T or .debug$P
+ /// section.
void emitCodeViewMagicVersion();
void emitTypeInformation();
@@ -225,6 +251,10 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
void emitInlineeLinesSubsection();
+ void emitDebugInfoForThunk(const Function *GV,
+ FunctionInfo &FI,
+ const MCSymbol *Fn);
+
void emitDebugInfoForFunction(const Function *GV, FunctionInfo &FI);
void emitDebugInfoForGlobals();
@@ -253,9 +283,18 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &Processed);
+ // Construct the lexical block tree for a routine, pruning empty lexical
+ // scopes, and populate it with local variables.
+ void collectLexicalBlockInfo(SmallVectorImpl<LexicalScope *> &Scopes,
+ SmallVectorImpl<LexicalBlock *> &Blocks,
+ SmallVectorImpl<LocalVariable> &Locals);
+ void collectLexicalBlockInfo(LexicalScope &Scope,
+ SmallVectorImpl<LexicalBlock *> &ParentBlocks,
+ SmallVectorImpl<LocalVariable> &ParentLocals);
+
/// Records information about a local variable in the appropriate scope. In
/// particular, locals from inlined code live inside the inlining site.
- void recordLocalVariable(LocalVariable &&Var, const DILocation *Loc);
+ void recordLocalVariable(LocalVariable &&Var, const LexicalScope *LS);
/// Emits local variables in the appropriate order.
void emitLocalVariableList(ArrayRef<LocalVariable> Locals);
@@ -263,6 +302,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
/// Emits an S_LOCAL record and its associated defined ranges.
void emitLocalVariable(const LocalVariable &Var);
+ /// Emits a sequence of lexical block scopes and their children.
+ void emitLexicalBlockList(ArrayRef<LexicalBlock *> Blocks,
+ const FunctionInfo& FI);
+
+ /// Emit a lexical block scope and its children.
+ void emitLexicalBlock(const LexicalBlock &Block, const FunctionInfo& FI);
+
/// Translates the DIType to codeview if necessary and returns a type index
/// for it.
codeview::TypeIndex getTypeIndex(DITypeRef TypeRef,
@@ -279,12 +325,18 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
void addToUDTs(const DIType *Ty);
+ void addUDTSrcLine(const DIType *Ty, codeview::TypeIndex TI);
+
codeview::TypeIndex lowerType(const DIType *Ty, const DIType *ClassTy);
codeview::TypeIndex lowerTypeAlias(const DIDerivedType *Ty);
codeview::TypeIndex lowerTypeArray(const DICompositeType *Ty);
codeview::TypeIndex lowerTypeBasic(const DIBasicType *Ty);
- codeview::TypeIndex lowerTypePointer(const DIDerivedType *Ty);
- codeview::TypeIndex lowerTypeMemberPointer(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypePointer(
+ const DIDerivedType *Ty,
+ codeview::PointerOptions PO = codeview::PointerOptions::None);
+ codeview::TypeIndex lowerTypeMemberPointer(
+ const DIDerivedType *Ty,
+ codeview::PointerOptions PO = codeview::PointerOptions::None);
codeview::TypeIndex lowerTypeModifier(const DIDerivedType *Ty);
codeview::TypeIndex lowerTypeFunction(const DISubroutineType *Ty);
codeview::TypeIndex lowerTypeVFTableShape(const DIDerivedType *Ty);
@@ -327,21 +379,21 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
unsigned getPointerSizeInBytes();
protected:
- /// \brief Gather pre-function debug information.
+ /// Gather pre-function debug information.
void beginFunctionImpl(const MachineFunction *MF) override;
- /// \brief Gather post-function debug information.
+ /// Gather post-function debug information.
void endFunctionImpl(const MachineFunction *) override;
public:
- CodeViewDebug(AsmPrinter *Asm);
+ CodeViewDebug(AsmPrinter *AP);
void setSymbolSize(const MCSymbol *, uint64_t) override {}
- /// \brief Emit the COFF section that holds the line table information.
+ /// Emit the COFF section that holds the line table information.
void endModule() override;
- /// \brief Process beginning of an instruction.
+ /// Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
};
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
index b3148db30cd6..570424a79c81 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -17,6 +17,7 @@
#include "DwarfUnit.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -86,8 +87,9 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const {
// easily, which helps track down where it came from.
if (!dwarf::isValidFormForVersion(AttrData.getForm(),
AP->getDwarfVersion())) {
- DEBUG(dbgs() << "Invalid form " << format("0x%x", AttrData.getForm())
- << " for DWARF version " << AP->getDwarfVersion() << "\n");
+ LLVM_DEBUG(dbgs() << "Invalid form " << format("0x%x", AttrData.getForm())
+ << " for DWARF version " << AP->getDwarfVersion()
+ << "\n");
llvm_unreachable("Invalid form for specified DWARF version");
}
#endif
@@ -388,6 +390,7 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
case dwarf::DW_FORM_data2:
case dwarf::DW_FORM_strx2:
case dwarf::DW_FORM_addrx2:
+ case dwarf::DW_FORM_strx3:
case dwarf::DW_FORM_strp:
case dwarf::DW_FORM_ref4:
case dwarf::DW_FORM_data4:
@@ -410,6 +413,7 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
case dwarf::DW_FORM_GNU_str_index:
case dwarf::DW_FORM_GNU_addr_index:
case dwarf::DW_FORM_ref_udata:
+ case dwarf::DW_FORM_strx:
case dwarf::DW_FORM_udata:
Asm->EmitULEB128(Integer);
return;
@@ -423,58 +427,23 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
/// SizeOf - Determine size of integer value in bytes.
///
unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ dwarf::FormParams Params = {0, 0, dwarf::DWARF32};
+ if (AP)
+ Params = {AP->getDwarfVersion(), uint8_t(AP->getPointerSize()),
+ AP->OutStreamer->getContext().getDwarfFormat()};
+
+ if (Optional<uint8_t> FixedSize = dwarf::getFixedFormByteSize(Form, Params))
+ return *FixedSize;
+
switch (Form) {
- case dwarf::DW_FORM_implicit_const:
- case dwarf::DW_FORM_flag_present:
- return 0;
- case dwarf::DW_FORM_flag:
- case dwarf::DW_FORM_ref1:
- case dwarf::DW_FORM_data1:
- case dwarf::DW_FORM_strx1:
- case dwarf::DW_FORM_addrx1:
- return sizeof(int8_t);
- case dwarf::DW_FORM_ref2:
- case dwarf::DW_FORM_data2:
- case dwarf::DW_FORM_strx2:
- case dwarf::DW_FORM_addrx2:
- return sizeof(int16_t);
- case dwarf::DW_FORM_ref4:
- case dwarf::DW_FORM_data4:
- case dwarf::DW_FORM_ref_sup4:
- case dwarf::DW_FORM_strx4:
- case dwarf::DW_FORM_addrx4:
- return sizeof(int32_t);
- case dwarf::DW_FORM_ref8:
- case dwarf::DW_FORM_ref_sig8:
- case dwarf::DW_FORM_data8:
- case dwarf::DW_FORM_ref_sup8:
- return sizeof(int64_t);
- case dwarf::DW_FORM_ref_addr:
- if (AP->getDwarfVersion() == 2)
- return AP->getPointerSize();
- LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_strp:
- case dwarf::DW_FORM_GNU_ref_alt:
- case dwarf::DW_FORM_GNU_strp_alt:
- case dwarf::DW_FORM_line_strp:
- case dwarf::DW_FORM_sec_offset:
- case dwarf::DW_FORM_strp_sup:
- switch (AP->OutStreamer->getContext().getDwarfFormat()) {
- case dwarf::DWARF32:
- return 4;
- case dwarf::DWARF64:
- return 8;
- }
- llvm_unreachable("Invalid DWARF format");
case dwarf::DW_FORM_GNU_str_index:
case dwarf::DW_FORM_GNU_addr_index:
case dwarf::DW_FORM_ref_udata:
+ case dwarf::DW_FORM_strx:
case dwarf::DW_FORM_udata:
return getULEB128Size(Integer);
case dwarf::DW_FORM_sdata:
return getSLEB128Size(Integer);
- case dwarf::DW_FORM_addr:
- return AP->getPointerSize();
default: llvm_unreachable("DIE Value form not supported yet");
}
}
@@ -564,44 +533,46 @@ void DIEDelta::print(raw_ostream &O) const {
/// EmitValue - Emit string value.
///
void DIEString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- assert(
- (Form == dwarf::DW_FORM_strp || Form == dwarf::DW_FORM_GNU_str_index) &&
- "Expected valid string form");
-
// Index of string in symbol table.
- if (Form == dwarf::DW_FORM_GNU_str_index) {
+ switch (Form) {
+ case dwarf::DW_FORM_GNU_str_index:
+ case dwarf::DW_FORM_strx:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_strx3:
+ case dwarf::DW_FORM_strx4:
DIEInteger(S.getIndex()).EmitValue(AP, Form);
return;
- }
-
- // Relocatable symbol.
- assert(Form == dwarf::DW_FORM_strp);
- if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) {
- DIELabel(S.getSymbol()).EmitValue(AP, Form);
+ case dwarf::DW_FORM_strp:
+ if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
+ DIELabel(S.getSymbol()).EmitValue(AP, Form);
+ else
+ DIEInteger(S.getOffset()).EmitValue(AP, Form);
return;
+ default:
+ llvm_unreachable("Expected valid string form");
}
-
- // Offset into symbol table.
- DIEInteger(S.getOffset()).EmitValue(AP, Form);
}
/// SizeOf - Determine size of delta value in bytes.
///
unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
- assert(
- (Form == dwarf::DW_FORM_strp || Form == dwarf::DW_FORM_GNU_str_index) &&
- "Expected valid string form");
-
// Index of string in symbol table.
- if (Form == dwarf::DW_FORM_GNU_str_index)
+ switch (Form) {
+ case dwarf::DW_FORM_GNU_str_index:
+ case dwarf::DW_FORM_strx:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_strx3:
+ case dwarf::DW_FORM_strx4:
return DIEInteger(S.getIndex()).SizeOf(AP, Form);
-
- // Relocatable symbol.
- if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
- return DIELabel(S.getSymbol()).SizeOf(AP, Form);
-
- // Offset into symbol table.
- return DIEInteger(S.getOffset()).SizeOf(AP, Form);
+ case dwarf::DW_FORM_strp:
+ if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
+ return DIELabel(S.getSymbol()).SizeOf(AP, Form);
+ return DIEInteger(S.getOffset()).SizeOf(AP, Form);
+ default:
+ llvm_unreachable("Expected valid string form");
+ }
}
LLVM_DUMP_METHOD
@@ -615,8 +586,8 @@ void DIEString::print(raw_ostream &O) const {
void DIEInlineString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_string) {
for (char ch : S)
- AP->EmitInt8(ch);
- AP->EmitInt8(0);
+ AP->emitInt8(ch);
+ AP->emitInt8(0);
return;
}
llvm_unreachable("Expected valid string form");
@@ -722,9 +693,9 @@ unsigned DIELoc::ComputeSize(const AsmPrinter *AP) const {
void DIELoc::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
default: llvm_unreachable("Improper form for block");
- case dwarf::DW_FORM_block1: Asm->EmitInt8(Size); break;
- case dwarf::DW_FORM_block2: Asm->EmitInt16(Size); break;
- case dwarf::DW_FORM_block4: Asm->EmitInt32(Size); break;
+ case dwarf::DW_FORM_block1: Asm->emitInt8(Size); break;
+ case dwarf::DW_FORM_block2: Asm->emitInt16(Size); break;
+ case dwarf::DW_FORM_block4: Asm->emitInt32(Size); break;
case dwarf::DW_FORM_block:
case dwarf::DW_FORM_exprloc:
Asm->EmitULEB128(Size); break;
@@ -773,10 +744,11 @@ unsigned DIEBlock::ComputeSize(const AsmPrinter *AP) const {
void DIEBlock::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
default: llvm_unreachable("Improper form for block");
- case dwarf::DW_FORM_block1: Asm->EmitInt8(Size); break;
- case dwarf::DW_FORM_block2: Asm->EmitInt16(Size); break;
- case dwarf::DW_FORM_block4: Asm->EmitInt32(Size); break;
+ case dwarf::DW_FORM_block1: Asm->emitInt8(Size); break;
+ case dwarf::DW_FORM_block2: Asm->emitInt16(Size); break;
+ case dwarf::DW_FORM_block4: Asm->emitInt32(Size); break;
case dwarf::DW_FORM_block: Asm->EmitULEB128(Size); break;
+ case dwarf::DW_FORM_string: break;
case dwarf::DW_FORM_data16: break;
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
index 15ade3c96dfe..b8f1202494d7 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
#define DEBUG_TYPE "dwarfdebug"
-/// \brief Grabs the string in whichever attribute is passed in and returns
+/// Grabs the string in whichever attribute is passed in and returns
/// a reference to it.
static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) {
// Iterate through all the attributes until we find the one we're
@@ -40,10 +40,10 @@ static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) {
return StringRef("");
}
-/// \brief Adds the string in \p Str to the hash. This also hashes
+/// Adds the string in \p Str to the hash. This also hashes
/// a trailing NULL with the string.
void DIEHash::addString(StringRef Str) {
- DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
+ LLVM_DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
Hash.update(Str);
Hash.update(makeArrayRef((uint8_t)'\0'));
}
@@ -51,9 +51,9 @@ void DIEHash::addString(StringRef Str) {
// FIXME: The LEB128 routines are copied and only slightly modified out of
// LEB128.h.
-/// \brief Adds the unsigned in \p Value to the hash encoded as a ULEB128.
+/// Adds the unsigned in \p Value to the hash encoded as a ULEB128.
void DIEHash::addULEB128(uint64_t Value) {
- DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+ LLVM_DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
do {
uint8_t Byte = Value & 0x7f;
Value >>= 7;
@@ -64,7 +64,7 @@ void DIEHash::addULEB128(uint64_t Value) {
}
void DIEHash::addSLEB128(int64_t Value) {
- DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+ LLVM_DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
bool More;
do {
uint8_t Byte = Value & 0x7f;
@@ -77,10 +77,10 @@ void DIEHash::addSLEB128(int64_t Value) {
} while (More);
}
-/// \brief Including \p Parent adds the context of Parent to the hash..
+/// Including \p Parent adds the context of Parent to the hash.
void DIEHash::addParentContext(const DIE &Parent) {
- DEBUG(dbgs() << "Adding parent context to hash...\n");
+ LLVM_DEBUG(dbgs() << "Adding parent context to hash...\n");
// [7.27.2] For each surrounding type or namespace beginning with the
// outermost such construct...
@@ -108,7 +108,7 @@ void DIEHash::addParentContext(const DIE &Parent) {
// ... Then the name, taken from the DW_AT_name attribute.
StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
- DEBUG(dbgs() << "... adding context: " << Name << "\n");
+ LLVM_DEBUG(dbgs() << "... adding context: " << Name << "\n");
if (!Name.empty())
addString(Name);
}
@@ -118,9 +118,9 @@ void DIEHash::addParentContext(const DIE &Parent) {
void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) {
for (const auto &V : Die.values()) {
- DEBUG(dbgs() << "Attribute: "
- << dwarf::AttributeString(V.getAttribute())
- << " added.\n");
+ LLVM_DEBUG(dbgs() << "Attribute: "
+ << dwarf::AttributeString(V.getAttribute())
+ << " added.\n");
switch (V.getAttribute()) {
#define HANDLE_DIE_HASH_ATTR(NAME) \
case dwarf::NAME: \
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
index 29337ae38a99..dae517ab2c29 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -23,7 +23,7 @@ namespace llvm {
class AsmPrinter;
class CompileUnit;
-/// \brief An object containing the capability of hashing and adding hash
+/// An object containing the capability of hashing and adding hash
/// attributes onto a DIE.
class DIEHash {
// Collection of all attributes used in hashing a particular DIE.
@@ -35,66 +35,66 @@ class DIEHash {
public:
DIEHash(AsmPrinter *A = nullptr) : AP(A) {}
- /// \brief Computes the CU signature.
+ /// Computes the CU signature.
uint64_t computeCUSignature(StringRef DWOName, const DIE &Die);
- /// \brief Computes the type signature.
+ /// Computes the type signature.
uint64_t computeTypeSignature(const DIE &Die);
// Helper routines to process parts of a DIE.
private:
- /// \brief Adds the parent context of \param Die to the hash.
- void addParentContext(const DIE &Die);
+ /// Adds the parent context of \param Parent to the hash.
+ void addParentContext(const DIE &Parent);
- /// \brief Adds the attributes of \param Die to the hash.
+ /// Adds the attributes of \param Die to the hash.
void addAttributes(const DIE &Die);
- /// \brief Computes the full DWARF4 7.27 hash of the DIE.
+ /// Computes the full DWARF4 7.27 hash of the DIE.
void computeHash(const DIE &Die);
// Routines that add DIEValues to the hash.
public:
- /// \brief Adds \param Value to the hash.
+ /// Adds \param Value to the hash.
void update(uint8_t Value) { Hash.update(Value); }
- /// \brief Encodes and adds \param Value to the hash as a ULEB128.
+ /// Encodes and adds \param Value to the hash as a ULEB128.
void addULEB128(uint64_t Value);
- /// \brief Encodes and adds \param Value to the hash as a SLEB128.
+ /// Encodes and adds \param Value to the hash as a SLEB128.
void addSLEB128(int64_t Value);
private:
- /// \brief Adds \param Str to the hash and includes a NULL byte.
+ /// Adds \param Str to the hash and includes a NULL byte.
void addString(StringRef Str);
- /// \brief Collects the attributes of DIE \param Die into the \param Attrs
+ /// Collects the attributes of DIE \param Die into the \param Attrs
/// structure.
void collectAttributes(const DIE &Die, DIEAttrs &Attrs);
- /// \brief Hashes the attributes in \param Attrs in order.
+ /// Hashes the attributes in \param Attrs in order.
void hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag);
- /// \brief Hashes the data in a block like DIEValue, e.g. DW_FORM_block or
+ /// Hashes the data in a block-like DIEValue, e.g. DW_FORM_block or
/// DW_FORM_exprloc.
void hashBlockData(const DIE::const_value_range &Values);
- /// \brief Hashes the contents pointed to in the .debug_loc section.
+ /// Hashes the contents pointed to in the .debug_loc section.
void hashLocList(const DIELocList &LocList);
- /// \brief Hashes an individual attribute.
+ /// Hashes an individual attribute.
void hashAttribute(const DIEValue &Value, dwarf::Tag Tag);
- /// \brief Hashes an attribute that refers to another DIE.
+ /// Hashes an attribute that refers to another DIE.
void hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
const DIE &Entry);
- /// \brief Hashes a reference to a named type in such a way that is
+ /// Hashes a reference to a named type in such a way that is
/// independent of whether that type is described by a declaration or a
/// definition.
void hashShallowTypeReference(dwarf::Attribute Attribute, const DIE &Entry,
StringRef Name);
- /// \brief Hashes a reference to a previously referenced type DIE.
+ /// Hashes a reference to a previously referenced type DIE.
void hashRepeatedTypeReference(dwarf::Attribute Attribute,
unsigned DieNumber);
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index 856758c8e4f6..25518a339c61 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
#define DEBUG_TYPE "dwarfdebug"
-// \brief If @MI is a DBG_VALUE with debug value described by a
+// If @MI is a DBG_VALUE with debug value described by a
// defined register, returns the number of this register.
// In the other case, returns 0.
static unsigned isDescribedByReg(const MachineInstr &MI) {
@@ -50,8 +50,8 @@ void DbgValueHistoryMap::startInstrRange(InlinedVariable Var,
auto &Ranges = VarInstrRanges[Var];
if (!Ranges.empty() && Ranges.back().second == nullptr &&
Ranges.back().first->isIdenticalTo(MI)) {
- DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
- << "\t" << Ranges.back().first << "\t" << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
+ << "\t" << Ranges.back().first << "\t" << MI << "\n");
return;
}
Ranges.push_back(std::make_pair(&MI, nullptr));
@@ -86,7 +86,7 @@ using RegDescribedVarsMap = std::map<unsigned, SmallVector<InlinedVariable, 1>>;
} // end anonymous namespace
-// \brief Claim that @Var is not described by @RegNo anymore.
+// Claim that @Var is not described by @RegNo anymore.
static void dropRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
InlinedVariable Var) {
const auto &I = RegVars.find(RegNo);
@@ -100,7 +100,7 @@ static void dropRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
RegVars.erase(I);
}
-// \brief Claim that @Var is now described by @RegNo.
+// Claim that @Var is now described by @RegNo.
static void addRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
InlinedVariable Var) {
assert(RegNo != 0U);
@@ -109,7 +109,7 @@ static void addRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
VarSet.push_back(Var);
}
-// \brief Terminate the location range for variables described by register at
+// Terminate the location range for variables described by register at
// @I by inserting @ClobberingInstr to their history.
static void clobberRegisterUses(RegDescribedVarsMap &RegVars,
RegDescribedVarsMap::iterator I,
@@ -122,7 +122,7 @@ static void clobberRegisterUses(RegDescribedVarsMap &RegVars,
RegVars.erase(I);
}
-// \brief Terminate the location range for variables described by register
+// Terminate the location range for variables described by register
// @RegNo by inserting @ClobberingInstr to their history.
static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo,
DbgValueHistoryMap &HistMap,
@@ -133,7 +133,7 @@ static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo,
clobberRegisterUses(RegVars, I, HistMap, ClobberingInstr);
}
-// \brief Returns the first instruction in @MBB which corresponds to
+// Returns the first instruction in @MBB which corresponds to
// the function epilogue, or nullptr if @MBB doesn't contain an epilogue.
static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) {
auto LastMI = MBB.getLastNonDebugInstr();
@@ -155,7 +155,7 @@ static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) {
return &*MBB.begin();
}
-// \brief Collect registers that are modified in the function body (their
+// Collect registers that are modified in the function body (their
// contents is changed outside of the prologue and epilogue).
static void collectChangingRegs(const MachineFunction *MF,
const TargetRegisterInfo *TRI,
@@ -198,7 +198,7 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
RegDescribedVarsMap RegVars;
for (const auto &MBB : *MF) {
for (const auto &MI : MBB) {
- if (!MI.isDebugValue()) {
+ if (!MI.isDebugInstr()) {
// Not a DBG_VALUE instruction. It may clobber registers which describe
// some variables.
for (const MachineOperand &MO : MI.operands()) {
@@ -234,6 +234,10 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
continue;
}
+ // Skip DBG_LABEL instructions.
+ if (MI.isDebugLabel())
+ continue;
+
assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!");
// Use the base variable (without any DW_OP_piece expressions)
// as index into History. The full variables including the
@@ -265,3 +269,33 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
}
}
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DbgValueHistoryMap::dump() const {
+ dbgs() << "DbgValueHistoryMap:\n";
+ for (const auto &VarRangePair : *this) {
+ const InlinedVariable &Var = VarRangePair.first;
+ const InstrRanges &Ranges = VarRangePair.second;
+
+ const DILocalVariable *LocalVar = Var.first;
+ const DILocation *Location = Var.second;
+
+ dbgs() << " - " << LocalVar->getName() << " at ";
+
+ if (Location)
+ dbgs() << Location->getFilename() << ":" << Location->getLine() << ":"
+ << Location->getColumn();
+ else
+ dbgs() << "<unknown location>";
+
+ dbgs() << " --\n";
+
+ for (const InstrRange &Range : Ranges) {
+ dbgs() << " Begin: " << *Range.first;
+ if (Range.second)
+ dbgs() << " End : " << *Range.second;
+ dbgs() << "\n";
+ }
+ }
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
index a7b0562e8102..a262cb38b175 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
@@ -52,6 +52,10 @@ public:
void clear() { VarInstrRanges.clear(); }
InstrRangesMap::const_iterator begin() const { return VarInstrRanges.begin(); }
InstrRangesMap::const_iterator end() const { return VarInstrRanges.end(); }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const;
+#endif
};
void calculateDbgValueHistory(const MachineFunction *MF,
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 2e5c22447936..82e14dc13cb1 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -25,6 +25,8 @@
using namespace llvm;
+#define DEBUG_TYPE "dwarfdebug"
+
Optional<DbgVariableLocation>
DbgVariableLocation::extractFromMachineInstruction(
const MachineInstr &Instruction) {
@@ -123,29 +125,6 @@ MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
return LabelsAfterInsn.lookup(MI);
}
-int DebugHandlerBase::fragmentCmp(const DIExpression *P1,
- const DIExpression *P2) {
- auto Fragment1 = *P1->getFragmentInfo();
- auto Fragment2 = *P2->getFragmentInfo();
- unsigned l1 = Fragment1.OffsetInBits;
- unsigned l2 = Fragment2.OffsetInBits;
- unsigned r1 = l1 + Fragment1.SizeInBits;
- unsigned r2 = l2 + Fragment2.SizeInBits;
- if (r1 <= l2)
- return -1;
- else if (r2 <= l1)
- return 1;
- else
- return 0;
-}
-
-bool DebugHandlerBase::fragmentsOverlap(const DIExpression *P1,
- const DIExpression *P2) {
- if (!P1->isFragment() || !P2->isFragment())
- return true;
- return fragmentCmp(P1, P2) == 0;
-}
-
/// If this type is derived from a base type then return base type size.
uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
DIType *Ty = TyRef.resolve();
@@ -213,6 +192,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
assert(DbgValues.empty() && "DbgValues map wasn't cleaned!");
calculateDbgValueHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
DbgValues);
+ LLVM_DEBUG(DbgValues.dump());
// Request labels for the full history.
for (const auto &I : DbgValues) {
@@ -232,8 +212,8 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
const DIExpression *Fragment = I->first->getDebugExpression();
if (std::all_of(Ranges.begin(), I,
[&](DbgValueHistoryMap::InstrRange Pred) {
- return !fragmentsOverlap(
- Fragment, Pred.first->getDebugExpression());
+ return !Fragment->fragmentsOverlap(
+ Pred.first->getDebugExpression());
}))
LabelsBeforeInsn[I->first] = Asm->getFunctionBegin();
else
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
index 245d70038de9..1ccefe32be75 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -122,14 +122,6 @@ public:
/// Return Label immediately following the instruction.
MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
- /// Determine the relative position of the fragments described by P1 and P2.
- /// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap, 1 if P1 is
- /// entirely after P2.
- static int fragmentCmp(const DIExpression *P1, const DIExpression *P2);
-
- /// Determine whether two variable fragments overlap.
- static bool fragmentsOverlap(const DIExpression *P1, const DIExpression *P2);
-
/// If this type is derived from a base type then return base type size.
static uint64_t getBaseTypeSize(const DITypeRef TyRef);
};
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 3d6d8a76529c..ac49657b68fa 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
#include "DebugLocStream.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -20,7 +21,7 @@
namespace llvm {
class AsmPrinter;
-/// \brief This struct describes location entries emitted in the .debug_loc
+/// This struct describes location entries emitted in the .debug_loc
/// section.
class DebugLocEntry {
/// Begin and end symbols for the address range that this location is valid.
@@ -28,7 +29,7 @@ class DebugLocEntry {
const MCSymbol *End;
public:
- /// \brief A single location or constant.
+ /// A single location or constant.
struct Value {
Value(const DIExpression *Expr, int64_t i)
: Expression(Expr), EntryKind(E_Integer) {
@@ -105,13 +106,13 @@ public:
Values.push_back(std::move(Val));
}
- /// \brief If this and Next are describing different pieces of the same
+ /// If this and Next are describing different pieces of the same
/// variable, merge them by appending Next's values to the current
/// list of values.
/// Return true if the merge was successful.
bool MergeValues(const DebugLocEntry &Next);
- /// \brief Attempt to merge this DebugLocEntry with Next and return
+ /// Attempt to merge this DebugLocEntry with Next and return
/// true if the merge was successful. Entries can be merged if they
/// share the same Loc/Constant and if Next immediately follows this
/// Entry.
@@ -135,10 +136,10 @@ public:
}) && "value must be a piece");
}
- // \brief Sort the pieces by offset.
+ // Sort the pieces by offset.
// Remove any duplicate entries by dropping all but the first.
void sortUniqueValues() {
- std::sort(Values.begin(), Values.end());
+ llvm::sort(Values.begin(), Values.end());
Values.erase(
std::unique(
Values.begin(), Values.end(), [](const Value &A, const Value &B) {
@@ -147,12 +148,12 @@ public:
Values.end());
}
- /// \brief Lower this entry into a DWARF expression.
+ /// Lower this entry into a DWARF expression.
void finalize(const AsmPrinter &AP, DebugLocStream::ListBuilder &List,
const DIBasicType *BT);
};
-/// \brief Compare two Values for equality.
+/// Compare two Values for equality.
inline bool operator==(const DebugLocEntry::Value &A,
const DebugLocEntry::Value &B) {
if (A.EntryKind != B.EntryKind)
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
index 0c551dfff9cc..8dcf5cbc1889 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
@@ -22,7 +22,7 @@ class DwarfCompileUnit;
class MachineInstr;
class MCSymbol;
-/// \brief Byte stream of .debug_loc entries.
+/// Byte stream of .debug_loc entries.
///
/// Stores a unified stream of .debug_loc entries. There's \a List for each
/// variable/inlined-at pair, and an \a Entry for each \a DebugLocEntry.
@@ -55,7 +55,7 @@ private:
SmallString<256> DWARFBytes;
SmallVector<std::string, 32> Comments;
- /// \brief Only verbose textual output needs comments. This will be set to
+ /// Only verbose textual output needs comments. This will be set to
/// true for that case, and false otherwise.
bool GenerateComments;
@@ -69,7 +69,7 @@ public:
class EntryBuilder;
private:
- /// \brief Start a new .debug_loc entry list.
+ /// Start a new .debug_loc entry list.
///
/// Start a new .debug_loc entry list. Return the new list's index so it can
/// be retrieved later via \a getList().
@@ -89,7 +89,7 @@ private:
/// \return false iff the list is deleted.
bool finalizeList(AsmPrinter &Asm);
- /// \brief Start a new .debug_loc entry.
+ /// Start a new .debug_loc entry.
///
/// Until the next call, bytes added to the stream will be added to this
/// entry.
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
deleted file mode 100644
index c21b3d3451ad..000000000000
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-//===- llvm/CodeGen/DwarfAccelTable.cpp - Dwarf Accelerator Tables --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing dwarf accelerator tables.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DwarfAccelTable.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/DIE.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <limits>
-#include <vector>
-
-using namespace llvm;
-
-// The length of the header data is always going to be 4 + 4 + 4*NumAtoms.
-DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList)
- : Header(8 + (atomList.size() * 4)), HeaderData(atomList),
- Entries(Allocator) {}
-
-void DwarfAccelTable::AddName(DwarfStringPoolEntryRef Name, const DIE *die,
- char Flags) {
- assert(Data.empty() && "Already finalized!");
- // If the string is in the list already then add this die to the list
- // otherwise add a new one.
- DataArray &DIEs = Entries[Name.getString()];
- assert(!DIEs.Name || DIEs.Name == Name);
- DIEs.Name = Name;
- DIEs.Values.push_back(new (Allocator) HashDataContents(die, Flags));
-}
-
-void DwarfAccelTable::ComputeBucketCount() {
- // First get the number of unique hashes.
- std::vector<uint32_t> uniques(Data.size());
- for (size_t i = 0, e = Data.size(); i < e; ++i)
- uniques[i] = Data[i]->HashValue;
- array_pod_sort(uniques.begin(), uniques.end());
- std::vector<uint32_t>::iterator p =
- std::unique(uniques.begin(), uniques.end());
- uint32_t num = std::distance(uniques.begin(), p);
-
- // Then compute the bucket size, minimum of 1 bucket.
- if (num > 1024)
- Header.bucket_count = num / 4;
- else if (num > 16)
- Header.bucket_count = num / 2;
- else
- Header.bucket_count = num > 0 ? num : 1;
-
- Header.hashes_count = num;
-}
-
-// compareDIEs - comparison predicate that sorts DIEs by their offset.
-static bool compareDIEs(const DwarfAccelTable::HashDataContents *A,
- const DwarfAccelTable::HashDataContents *B) {
- return A->Die->getOffset() < B->Die->getOffset();
-}
-
-void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, StringRef Prefix) {
- // Create the individual hash data outputs.
- Data.reserve(Entries.size());
- for (StringMap<DataArray>::iterator EI = Entries.begin(), EE = Entries.end();
- EI != EE; ++EI) {
-
- // Unique the entries.
- std::stable_sort(EI->second.Values.begin(), EI->second.Values.end(), compareDIEs);
- EI->second.Values.erase(
- std::unique(EI->second.Values.begin(), EI->second.Values.end()),
- EI->second.Values.end());
-
- HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second);
- Data.push_back(Entry);
- }
-
- // Figure out how many buckets we need, then compute the bucket
- // contents and the final ordering. We'll emit the hashes and offsets
- // by doing a walk during the emission phase. We add temporary
- // symbols to the data so that we can reference them during the offset
- // later, we'll emit them when we emit the data.
- ComputeBucketCount();
-
- // Compute bucket contents and final ordering.
- Buckets.resize(Header.bucket_count);
- for (size_t i = 0, e = Data.size(); i < e; ++i) {
- uint32_t bucket = Data[i]->HashValue % Header.bucket_count;
- Buckets[bucket].push_back(Data[i]);
- Data[i]->Sym = Asm->createTempSymbol(Prefix);
- }
-
- // Sort the contents of the buckets by hash value so that hash
- // collisions end up together. Stable sort makes testing easier and
- // doesn't cost much more.
- for (size_t i = 0; i < Buckets.size(); ++i)
- std::stable_sort(Buckets[i].begin(), Buckets[i].end(),
- [] (HashData *LHS, HashData *RHS) {
- return LHS->HashValue < RHS->HashValue;
- });
-}
-
-// Emits the header for the table via the AsmPrinter.
-void DwarfAccelTable::EmitHeader(AsmPrinter *Asm) {
- Asm->OutStreamer->AddComment("Header Magic");
- Asm->EmitInt32(Header.magic);
- Asm->OutStreamer->AddComment("Header Version");
- Asm->EmitInt16(Header.version);
- Asm->OutStreamer->AddComment("Header Hash Function");
- Asm->EmitInt16(Header.hash_function);
- Asm->OutStreamer->AddComment("Header Bucket Count");
- Asm->EmitInt32(Header.bucket_count);
- Asm->OutStreamer->AddComment("Header Hash Count");
- Asm->EmitInt32(Header.hashes_count);
- Asm->OutStreamer->AddComment("Header Data Length");
- Asm->EmitInt32(Header.header_data_len);
- Asm->OutStreamer->AddComment("HeaderData Die Offset Base");
- Asm->EmitInt32(HeaderData.die_offset_base);
- Asm->OutStreamer->AddComment("HeaderData Atom Count");
- Asm->EmitInt32(HeaderData.Atoms.size());
- for (size_t i = 0; i < HeaderData.Atoms.size(); i++) {
- Atom A = HeaderData.Atoms[i];
- Asm->OutStreamer->AddComment(dwarf::AtomTypeString(A.type));
- Asm->EmitInt16(A.type);
- Asm->OutStreamer->AddComment(dwarf::FormEncodingString(A.form));
- Asm->EmitInt16(A.form);
- }
-}
-
-// Walk through and emit the buckets for the table. Each index is
-// an offset into the list of hashes.
-void DwarfAccelTable::EmitBuckets(AsmPrinter *Asm) {
- unsigned index = 0;
- for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
- Asm->OutStreamer->AddComment("Bucket " + Twine(i));
- if (!Buckets[i].empty())
- Asm->EmitInt32(index);
- else
- Asm->EmitInt32(std::numeric_limits<uint32_t>::max());
- // Buckets point in the list of hashes, not to the data. Do not
- // increment the index multiple times in case of hash collisions.
- uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
- for (auto *HD : Buckets[i]) {
- uint32_t HashValue = HD->HashValue;
- if (PrevHash != HashValue)
- ++index;
- PrevHash = HashValue;
- }
- }
-}
-
-// Walk through the buckets and emit the individual hashes for each
-// bucket.
-void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
- uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
- for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
- for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end();
- HI != HE; ++HI) {
- uint32_t HashValue = (*HI)->HashValue;
- if (PrevHash == HashValue)
- continue;
- Asm->OutStreamer->AddComment("Hash in Bucket " + Twine(i));
- Asm->EmitInt32(HashValue);
- PrevHash = HashValue;
- }
- }
-}
-
-// Walk through the buckets and emit the individual offsets for each
-// element in each bucket. This is done via a symbol subtraction from the
-// beginning of the section. The non-section symbol will be output later
-// when we emit the actual data.
-void DwarfAccelTable::emitOffsets(AsmPrinter *Asm, const MCSymbol *SecBegin) {
- uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
- for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
- for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end();
- HI != HE; ++HI) {
- uint32_t HashValue = (*HI)->HashValue;
- if (PrevHash == HashValue)
- continue;
- PrevHash = HashValue;
- Asm->OutStreamer->AddComment("Offset in Bucket " + Twine(i));
- MCContext &Context = Asm->OutStreamer->getContext();
- const MCExpr *Sub = MCBinaryExpr::createSub(
- MCSymbolRefExpr::create((*HI)->Sym, Context),
- MCSymbolRefExpr::create(SecBegin, Context), Context);
- Asm->OutStreamer->EmitValue(Sub, sizeof(uint32_t));
- }
- }
-}
-
-// Walk through the buckets and emit the full data for each element in
-// the bucket. For the string case emit the dies and the various offsets.
-// Terminate each HashData bucket with 0.
-void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfDebug *D) {
- for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
- uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
- for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end();
- HI != HE; ++HI) {
- // Terminate the previous entry if there is no hash collision
- // with the current one.
- if (PrevHash != std::numeric_limits<uint64_t>::max() &&
- PrevHash != (*HI)->HashValue)
- Asm->EmitInt32(0);
- // Remember to emit the label for our offset.
- Asm->OutStreamer->EmitLabel((*HI)->Sym);
- Asm->OutStreamer->AddComment((*HI)->Str);
- Asm->emitDwarfStringOffset((*HI)->Data.Name);
- Asm->OutStreamer->AddComment("Num DIEs");
- Asm->EmitInt32((*HI)->Data.Values.size());
- for (HashDataContents *HD : (*HI)->Data.Values) {
- // Emit the DIE offset
- Asm->EmitInt32(HD->Die->getDebugSectionOffset());
- // If we have multiple Atoms emit that info too.
- // FIXME: A bit of a hack, we either emit only one atom or all info.
- if (HeaderData.Atoms.size() > 1) {
- Asm->EmitInt16(HD->Die->getTag());
- Asm->EmitInt8(HD->Flags);
- }
- }
- PrevHash = (*HI)->HashValue;
- }
- // Emit the final end marker for the bucket.
- if (!Buckets[i].empty())
- Asm->EmitInt32(0);
- }
-}
-
-// Emit the entire data structure to the output file.
-void DwarfAccelTable::emit(AsmPrinter *Asm, const MCSymbol *SecBegin,
- DwarfDebug *D) {
- // Emit the header.
- EmitHeader(Asm);
-
- // Emit the buckets.
- EmitBuckets(Asm);
-
- // Emit the hashes.
- EmitHashes(Asm);
-
- // Emit the offsets.
- emitOffsets(Asm, SecBegin);
-
- // Emit the hash data.
- EmitData(Asm, D);
-}
-
-#ifndef NDEBUG
-void DwarfAccelTable::print(raw_ostream &OS) {
- Header.print(OS);
- HeaderData.print(OS);
-
- OS << "Entries: \n";
- for (StringMap<DataArray>::const_iterator EI = Entries.begin(),
- EE = Entries.end();
- EI != EE; ++EI) {
- OS << "Name: " << EI->getKeyData() << "\n";
- for (HashDataContents *HD : EI->second.Values)
- HD->print(OS);
- }
-
- OS << "Buckets and Hashes: \n";
- for (size_t i = 0, e = Buckets.size(); i < e; ++i)
- for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end();
- HI != HE; ++HI)
- (*HI)->print(OS);
-
- OS << "Data: \n";
- for (std::vector<HashData *>::const_iterator DI = Data.begin(),
- DE = Data.end();
- DI != DE; ++DI)
- (*DI)->print(OS);
-}
-#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
deleted file mode 100644
index f56199dc8e72..000000000000
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ /dev/null
@@ -1,261 +0,0 @@
-//==- llvm/CodeGen/DwarfAccelTable.h - Dwarf Accelerator Tables --*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing dwarf accelerator tables.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/CodeGen/DIE.h"
-#include "llvm/CodeGen/DwarfStringPoolEntry.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-
-// The DWARF accelerator tables are indirect hash tables optimized
-// for null lookup rather than access to known data. They are output into
-// an on-disk format that looks like this:
-//
-// .-------------.
-// | HEADER |
-// |-------------|
-// | BUCKETS |
-// |-------------|
-// | HASHES |
-// |-------------|
-// | OFFSETS |
-// |-------------|
-// | DATA |
-// `-------------'
-//
-// where the header contains a magic number, version, type of hash function,
-// the number of buckets, total number of hashes, and room for a special
-// struct of data and the length of that struct.
-//
-// The buckets contain an index (e.g. 6) into the hashes array. The hashes
-// section contains all of the 32-bit hash values in contiguous memory, and
-// the offsets contain the offset into the data area for the particular
-// hash.
-//
-// For a lookup example, we could hash a function name and take it modulo the
-// number of buckets giving us our bucket. From there we take the bucket value
-// as an index into the hashes table and look at each successive hash as long
-// as the hash value is still the same modulo result (bucket value) as earlier.
-// If we have a match we look at that same entry in the offsets table and
-// grab the offset in the data for our final match.
-
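For illustration only (not part of this change), the lookup described above can be sketched on the reader side roughly as follows. The flat Buckets/Hashes/Offsets arrays, the function name, and the omission of the empty-bucket sentinel are assumptions of this sketch; djbHash is the same hash the deleted implementation uses.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"

// Hypothetical consumer-side lookup over already-parsed table sections.
uint32_t lookupNameOffset(llvm::StringRef Name,
                          llvm::ArrayRef<uint32_t> Buckets,
                          llvm::ArrayRef<uint32_t> Hashes,
                          llvm::ArrayRef<uint32_t> Offsets) {
  uint32_t Hash = llvm::dwarf::djbHash(Name);
  uint32_t Bucket = Hash % Buckets.size();
  // The bucket holds the index of the first hash that landed in this bucket;
  // scan forward while successive hashes still map to the same bucket.
  for (uint32_t I = Buckets[Bucket];
       I < Hashes.size() && Hashes[I] % Buckets.size() == Bucket; ++I)
    if (Hashes[I] == Hash)
      return Offsets[I]; // Offset of this name's HashData in the DATA area.
  return 0; // Not found (the special empty-bucket sentinel is ignored here).
}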
-namespace llvm {
-
-class AsmPrinter;
-class DwarfDebug;
-
-class DwarfAccelTable {
- // Helper function to compute the number of buckets needed based on
- // the number of unique hashes.
- void ComputeBucketCount();
-
- struct TableHeader {
- uint32_t magic = MagicHash; // 'HASH' magic value to allow endian detection
- uint16_t version = 1; // Version number.
- uint16_t hash_function = dwarf::DW_hash_function_djb;
- // The hash function enumeration that was used.
- uint32_t bucket_count = 0; // The number of buckets in this hash table.
- uint32_t hashes_count = 0; // The total number of unique hash values
- // and hash data offsets in this table.
- uint32_t header_data_len; // The bytes to skip to get to the hash
- // indexes (buckets) for correct alignment.
- // Also written to disk is the implementation specific header data.
-
- static const uint32_t MagicHash = 0x48415348;
-
- TableHeader(uint32_t data_len) : header_data_len(data_len) {}
-
-#ifndef NDEBUG
- void print(raw_ostream &OS) {
- OS << "Magic: " << format("0x%x", magic) << "\n"
- << "Version: " << version << "\n"
- << "Hash Function: " << hash_function << "\n"
- << "Bucket Count: " << bucket_count << "\n"
- << "Header Data Length: " << header_data_len << "\n";
- }
-
- void dump() { print(dbgs()); }
-#endif
- };
-
-public:
- // The HeaderData describes the form of each set of data. In general this
-// is a list of atoms (atom_count) where each atom contains a type
- // (AtomType type) of data, and an encoding form (form). In the case of
- // data that is referenced via DW_FORM_ref_* the die_offset_base is
- // used to describe the offset for all forms in the list of atoms.
- // This also serves as a public interface of sorts.
- // When written to disk this will have the form:
- //
- // uint32_t die_offset_base
- // uint32_t atom_count
- // atom_count Atoms
-
- // Make these public so that they can be used as a general interface to
- // the class.
- struct Atom {
- uint16_t type; // enum AtomType
- uint16_t form; // DWARF DW_FORM_ defines
-
- constexpr Atom(uint16_t type, uint16_t form) : type(type), form(form) {}
-
-#ifndef NDEBUG
- void print(raw_ostream &OS) {
- OS << "Type: " << dwarf::AtomTypeString(type) << "\n"
- << "Form: " << dwarf::FormEncodingString(form) << "\n";
- }
-
- void dump() { print(dbgs()); }
-#endif
- };
-
-private:
- struct TableHeaderData {
- uint32_t die_offset_base;
- SmallVector<Atom, 3> Atoms;
-
- TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0)
- : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) {}
-
-#ifndef NDEBUG
- void print(raw_ostream &OS) {
- OS << "die_offset_base: " << die_offset_base << "\n";
- for (size_t i = 0; i < Atoms.size(); i++)
- Atoms[i].print(OS);
- }
-
- void dump() { print(dbgs()); }
-#endif
- };
-
- // The data itself consists of a str_offset, a count of the DIEs in the
- // hash and the offsets to the DIEs themselves.
-// On disk, each data section ends with a 0 KeyType marking the end of the
- // hash chain.
- // On output this looks like:
- // uint32_t str_offset
- // uint32_t hash_data_count
- // HashData[hash_data_count]
-public:
- struct HashDataContents {
- const DIE *Die; // Offsets
- char Flags; // Specific flags to output
-
- HashDataContents(const DIE *D, char Flags) : Die(D), Flags(Flags) {}
-
-#ifndef NDEBUG
- void print(raw_ostream &OS) const {
- OS << " Offset: " << Die->getOffset() << "\n"
- << " Tag: " << dwarf::TagString(Die->getTag()) << "\n"
- << " Flags: " << Flags << "\n";
- }
-#endif
- };
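As a rough complement to the on-disk layout described in the comment above this struct, a hypothetical reader for one hash-data chain in the default single-atom (DW_ATOM_die_offset) configuration could look like the sketch below; read32 is an assumed helper, passed in here, that consumes a little-endian uint32_t and advances the pointer.

#include <cstdint>

// Walk one chain of HashData records; a zero str_offset terminates the chain.
void readHashDataChain(const uint8_t *&P,
                       uint32_t (*read32)(const uint8_t *&)) {
  for (;;) {
    uint32_t StrOffset = read32(P); // str_offset into .debug_str
    if (StrOffset == 0)
      break;
    uint32_t Count = read32(P);     // hash_data_count
    for (uint32_t I = 0; I < Count; ++I)
      (void)read32(P);              // one atom per entry: the DIE offset
  }
}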
-
-private:
- // String Data
- struct DataArray {
- DwarfStringPoolEntryRef Name;
- std::vector<HashDataContents *> Values;
- };
-
- friend struct HashData;
-
- struct HashData {
- StringRef Str;
- uint32_t HashValue;
- MCSymbol *Sym;
- DwarfAccelTable::DataArray &Data; // offsets
-
- HashData(StringRef S, DwarfAccelTable::DataArray &Data)
- : Str(S), Data(Data) {
- HashValue = dwarf::djbHash(S);
- }
-
-#ifndef NDEBUG
- void print(raw_ostream &OS) {
- OS << "Name: " << Str << "\n";
- OS << " Hash Value: " << format("0x%x", HashValue) << "\n";
- OS << " Symbol: ";
- if (Sym)
- OS << *Sym;
- else
- OS << "<none>";
- OS << "\n";
- for (HashDataContents *C : Data.Values) {
- OS << " Offset: " << C->Die->getOffset() << "\n";
- OS << " Tag: " << dwarf::TagString(C->Die->getTag()) << "\n";
- OS << " Flags: " << C->Flags << "\n";
- }
- }
-
- void dump() { print(dbgs()); }
-#endif
- };
-
- // Internal Functions
- void EmitHeader(AsmPrinter *);
- void EmitBuckets(AsmPrinter *);
- void EmitHashes(AsmPrinter *);
- void emitOffsets(AsmPrinter *, const MCSymbol *);
- void EmitData(AsmPrinter *, DwarfDebug *D);
-
- // Allocator for HashData and HashDataContents.
- BumpPtrAllocator Allocator;
-
- // Output Variables
- TableHeader Header;
- TableHeaderData HeaderData;
- std::vector<HashData *> Data;
-
- using StringEntries = StringMap<DataArray, BumpPtrAllocator &>;
-
- StringEntries Entries;
-
- // Buckets/Hashes/Offsets
- using HashList = std::vector<HashData *>;
- using BucketList = std::vector<HashList>;
- BucketList Buckets;
- HashList Hashes;
-
- // Public Implementation
-public:
- DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom>);
- DwarfAccelTable(const DwarfAccelTable &) = delete;
- DwarfAccelTable &operator=(const DwarfAccelTable &) = delete;
-
- void AddName(DwarfStringPoolEntryRef Name, const DIE *Die, char Flags = 0);
- void FinalizeTable(AsmPrinter *, StringRef);
- void emit(AsmPrinter *, const MCSymbol *, DwarfDebug *);
-#ifndef NDEBUG
- void print(raw_ostream &OS);
- void dump() { print(dbgs()); }
-#endif
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index cbb4c48b4d88..1990456cc555 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
@@ -30,6 +29,7 @@
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index c8cd8eb8ffd3..32271a0ef24a 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -40,6 +39,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
@@ -94,16 +94,18 @@ void DwarfCompileUnit::addLocalLabelAddress(DIE &Die,
DIEInteger(0));
}
-unsigned DwarfCompileUnit::getOrCreateSourceID(StringRef FileName,
- StringRef DirName) {
+unsigned DwarfCompileUnit::getOrCreateSourceID(const DIFile *File) {
// If we print assembly, we can't separate .file entries according to
// compile units. Thus all files will belong to the default compile unit.
// FIXME: add a better feature test than hasRawTextSupport. Even better,
// extend .file to support this.
+ unsigned CUID = Asm->OutStreamer->hasRawTextSupport() ? 0 : getUniqueID();
+ if (!File)
+ return Asm->OutStreamer->EmitDwarfFileDirective(0, "", "", nullptr, None, CUID);
return Asm->OutStreamer->EmitDwarfFileDirective(
- 0, DirName, FileName,
- Asm->OutStreamer->hasRawTextSupport() ? 0 : getUniqueID());
+ 0, File->getDirectory(), File->getFilename(), getMD5AsBytes(File),
+ File->getSource(), CUID);
}
DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
@@ -190,10 +192,13 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
}
+ if (Expr)
+ DwarfExpr->addFragmentOffset(Expr);
+
if (Global) {
const MCSymbol *Sym = Asm->getSymbol(Global);
if (Global->isThreadLocal()) {
- if (Asm->TM.Options.EmulatedTLS) {
+ if (Asm->TM.useEmulatedTLS()) {
// TODO: add debug info for emulated thread local mode.
} else {
// FIXME: Make this work with -gsplit-dwarf.
@@ -225,10 +230,13 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
addOpAddress(*Loc, Sym);
}
}
- if (Expr) {
- DwarfExpr->addFragmentOffset(Expr);
- DwarfExpr->addExpression(Expr);
- }
+ // Global variables attached to symbols are memory locations.
+ // It would be better if this were unconditional, but malformed input that
+ // mixes non-fragments and fragments for the same variable is too expensive
+ // to detect in the verifier.
+ if (DwarfExpr->isUnknownLocation())
+ DwarfExpr->setMemoryLocationKind();
+ DwarfExpr->addExpression(Expr);
}
if (Loc)
addBlock(*VariableDIE, dwarf::DW_AT_location, DwarfExpr->finalize());
@@ -241,7 +249,8 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
// If the linkage name is different than the name, go ahead and output
// that as well into the name table.
- if (GV->getLinkageName() != "" && GV->getName() != GV->getLinkageName())
+ if (GV->getLinkageName() != "" && GV->getName() != GV->getLinkageName() &&
+ DD->useAllLinkageNames())
DD->addAccelName(GV->getLinkageName(), *VariableDIE);
}
@@ -267,15 +276,20 @@ void DwarfCompileUnit::addRange(RangeSpan Range) {
void DwarfCompileUnit::initStmtList() {
// Define start line table label for each Compile Unit.
- MCSymbol *LineTableStartSym =
- Asm->OutStreamer->getDwarfLineTableSymbol(getUniqueID());
+ MCSymbol *LineTableStartSym;
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ if (DD->useSectionsAsReferences()) {
+ LineTableStartSym = TLOF.getDwarfLineSection()->getBeginSymbol();
+ } else {
+ LineTableStartSym =
+ Asm->OutStreamer->getDwarfLineTableSymbol(getUniqueID());
+ }
  // DW_AT_stmt_list is an offset of line number information for this
// compile unit in debug_line section. For split dwarf this is
// left in the skeleton CU and so not included.
// The line table entries are not always emitted in assembly, so it
// is not okay to use line_table_start here.
- const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
StmtListValue =
addSectionLabel(getUnitDie(), dwarf::DW_AT_stmt_list, LineTableStartSym,
TLOF.getDwarfLineSection()->getBeginSymbol());
@@ -313,10 +327,16 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
// Only include DW_AT_frame_base in full debug info
if (!includeMinimalInlineScopes()) {
- const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo();
- MachineLocation Location(RI->getFrameRegister(*Asm->MF));
- if (RI->isPhysicalRegister(Location.getReg()))
- addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+ if (Asm->MF->getTarget().getTargetTriple().isNVPTX()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_call_frame_cfa);
+ addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc);
+ } else {
+ const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo();
+ MachineLocation Location(RI->getFrameRegister(*Asm->MF));
+ if (RI->isPhysicalRegister(Location.getReg()))
+ addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+ }
}
// Add name to the name table, we do this here because we're guaranteed
@@ -385,21 +405,28 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
SmallVector<RangeSpan, 2> Range) {
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
- // Emit offset in .debug_range as a relocatable label. emitDIE will handle
- // emitting it appropriately.
+ // Emit the offset into .debug_ranges or .debug_rnglists as a relocatable
+ // label. emitDIE() will handle emitting it appropriately.
const MCSymbol *RangeSectionSym =
- TLOF.getDwarfRangesSection()->getBeginSymbol();
+ DD->getDwarfVersion() >= 5
+ ? TLOF.getDwarfRnglistsSection()->getBeginSymbol()
+ : TLOF.getDwarfRangesSection()->getBeginSymbol();
RangeSpanList List(Asm->createTempSymbol("debug_ranges"), std::move(Range));
// Under fission, ranges are specified by constant offsets relative to the
// CU's DW_AT_GNU_ranges_base.
- if (isDwoUnit())
- addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
- RangeSectionSym);
- else
+ // FIXME: For DWARF v5, do not generate the DW_AT_ranges attribute under
+ // fission until we support the forms using the .debug_addr section
+ // (DW_RLE_startx_endx etc.).
+ if (isDwoUnit()) {
+ if (DD->getDwarfVersion() < 5)
+ addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+ RangeSectionSym);
+ } else {
addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
RangeSectionSym);
+ }
// Add the range list to the set of ranges to be emitted.
(Skeleton ? Skeleton : this)->CURangeLists.push_back(std::move(List));
@@ -407,9 +434,10 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
void DwarfCompileUnit::attachRangesOrLowHighPC(
DIE &Die, SmallVector<RangeSpan, 2> Ranges) {
- if (Ranges.size() == 1) {
- const auto &single = Ranges.front();
- attachLowHighPC(Die, single.getStart(), single.getEnd());
+ if (Ranges.size() == 1 || !DD->useRangesSection()) {
+ const RangeSpan &Front = Ranges.front();
+ const RangeSpan &Back = Ranges.back();
+ attachLowHighPC(Die, Front.getStart(), Back.getEnd());
} else
addScopeRangeList(Die, std::move(Ranges));
}
@@ -443,7 +471,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
// Add the call site information to the DIE.
const DILocation *IA = Scope->getInlinedAt();
addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None,
- getOrCreateSourceID(IA->getFilename(), IA->getDirectory()));
+ getOrCreateSourceID(IA->getFile()));
addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, IA->getLine());
if (IA->getDiscriminator() && DD->getDwarfVersion() >= 4)
addUInt(*ScopeDIE, dwarf::DW_AT_GNU_discriminator, None,
@@ -482,6 +510,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
bool Abstract) {
// Define variable debug information entry.
auto VariableDie = DIE::get(DIEValueAllocator, DV.getTag());
+ insertDIE(DV.getVariable(), VariableDie);
if (Abstract) {
applyVariableAttributes(DV, *VariableDie);
@@ -547,8 +576,11 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
Ops.append(Expr->elements_begin(), Expr->elements_end());
DIExpressionCursor Cursor(Ops);
DwarfExpr.setMemoryLocationKind();
- DwarfExpr.addMachineRegExpression(
- *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, FrameReg);
+ if (const MCSymbol *FrameSymbol = Asm->getFunctionFrameSymbol())
+ addOpAddress(*Loc, FrameSymbol);
+ else
+ DwarfExpr.addMachineRegExpression(
+ *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, FrameReg);
DwarfExpr.addExpression(std::move(Cursor));
}
addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
@@ -565,13 +597,95 @@ DIE *DwarfCompileUnit::constructVariableDIE(DbgVariable &DV,
return Var;
}
+/// Return all DIVariables that appear in count: expressions.
+static SmallVector<const DIVariable *, 2> dependencies(DbgVariable *Var) {
+ SmallVector<const DIVariable *, 2> Result;
+ auto *Array = dyn_cast<DICompositeType>(Var->getType());
+ if (!Array || Array->getTag() != dwarf::DW_TAG_array_type)
+ return Result;
+ for (auto *El : Array->getElements()) {
+ if (auto *Subrange = dyn_cast<DISubrange>(El)) {
+ auto Count = Subrange->getCount();
+ if (auto *Dependency = Count.dyn_cast<DIVariable *>())
+ Result.push_back(Dependency);
+ }
+ }
+ return Result;
+}
+
+/// Sort local variables so that variables appearing inside of helper
+/// expressions come first.
+static SmallVector<DbgVariable *, 8>
+sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
+ SmallVector<DbgVariable *, 8> Result;
+ SmallVector<PointerIntPair<DbgVariable *, 1>, 8> WorkList;
+ // Map back from a DIVariable to its containing DbgVariable.
+ SmallDenseMap<const DILocalVariable *, DbgVariable *> DbgVar;
+ // Set of DbgVariables in Result.
+ SmallDenseSet<DbgVariable *, 8> Visited;
+ // For cycle detection.
+ SmallDenseSet<DbgVariable *, 8> Visiting;
+
+ // Initialize the worklist and the DIVariable lookup table.
+ for (auto Var : reverse(Input)) {
+ DbgVar.insert({Var->getVariable(), Var});
+ WorkList.push_back({Var, 0});
+ }
+
+ // Perform a stable topological sort by doing a DFS.
+ while (!WorkList.empty()) {
+ auto Item = WorkList.back();
+ DbgVariable *Var = Item.getPointer();
+ bool visitedAllDependencies = Item.getInt();
+ WorkList.pop_back();
+
+ // Dependency is in a different lexical scope or a global.
+ if (!Var)
+ continue;
+
+ // Already handled.
+ if (Visited.count(Var))
+ continue;
+
+ // Add to Result if all dependencies are visited.
+ if (visitedAllDependencies) {
+ Visited.insert(Var);
+ Result.push_back(Var);
+ continue;
+ }
+
+ // Detect cycles.
+ auto Res = Visiting.insert(Var);
+ if (!Res.second) {
+ assert(false && "dependency cycle in local variables");
+ return Result;
+ }
+
+ // Push dependencies and this node onto the worklist, so that this node is
+ // visited again after all of its dependencies are handled.
+ WorkList.push_back({Var, 1});
+ for (auto *Dependency : dependencies(Var)) {
+ auto Dep = dyn_cast_or_null<const DILocalVariable>(Dependency);
+ WorkList.push_back({DbgVar[Dep], 0});
+ }
+ }
+ return Result;
+}
+
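As an aside (not part of the patch), this ordering matters for constructs such as C99-style variable-length arrays, where Clang typically describes the array bound with a separate, often artificial, local variable referenced from the DISubrange count. A hypothetical source example:

// Compiled with -g, the DW_TAG_array_type for 'vla' carries a DISubrange whose
// count refers to a local variable derived from 'n'; sortLocalVars() ensures
// that variable's DIE is created before the DIE for 'vla'.
// (VLAs are a C99 feature; Clang and GCC accept them in C++ as an extension.)
void f(int n) {
  int vla[n];
  vla[0] = n;
}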
DIE *DwarfCompileUnit::createScopeChildrenDIE(LexicalScope *Scope,
SmallVectorImpl<DIE *> &Children,
bool *HasNonScopeChildren) {
assert(Children.empty());
DIE *ObjectPointer = nullptr;
- for (DbgVariable *DV : DU->getScopeVariables().lookup(Scope))
+ // Emit function arguments (order is significant).
+ auto Vars = DU->getScopeVariables().lookup(Scope);
+ for (auto &DV : Vars.Args)
+ Children.push_back(constructVariableDIE(*DV.second, *Scope, ObjectPointer));
+
+ // Emit local variables.
+ auto Locals = sortLocalVars(Vars.Locals);
+ for (DbgVariable *DV : Locals)
Children.push_back(constructVariableDIE(*DV, *Scope, ObjectPointer));
// Skip imported directives in gmlt-like data.
@@ -687,9 +801,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
else
EntityDie = getDIE(Entity);
assert(EntityDie);
- auto *File = Module->getFile();
- addSourceLine(*IMDie, Module->getLine(), File ? File->getFilename() : "",
- File ? File->getDirectory() : "");
+ addSourceLine(*IMDie, Module->getLine(), Module->getFile());
addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie);
StringRef Name = Module->getName();
if (!Name.empty())
@@ -750,7 +862,7 @@ void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var,
void DwarfCompileUnit::emitHeader(bool UseOffsets) {
// Don't bother labeling the .dwo unit, as its offset isn't used.
- if (!Skeleton) {
+ if (!Skeleton && !DD->useSectionsAsReferences()) {
LabelBegin = Asm->createTempSymbol("cu_begin");
Asm->OutStreamer->EmitLabel(LabelBegin);
}
@@ -759,6 +871,8 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) {
: DD->useSplitDwarf() ? dwarf::DW_UT_skeleton
: dwarf::DW_UT_compile;
DwarfUnit::emitCommonHeader(UseOffsets, UT);
+ if (DD->getDwarfVersion() >= 5 && UT != dwarf::DW_UT_compile)
+ Asm->emitInt64(getDWOId());
}
bool DwarfCompileUnit::hasDwarfPubSections() const {
@@ -767,7 +881,8 @@ bool DwarfCompileUnit::hasDwarfPubSections() const {
if (CUNode->getGnuPubnames())
return true;
- return DD->tuneForGDB() && !includeMinimalInlineScopes();
+ return DD->tuneForGDB() && DD->usePubSections() &&
+ !includeMinimalInlineScopes();
}
/// addGlobalName - Add a new global name to the compile unit.
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 68482eb7e358..51e1558fe4a3 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -83,7 +83,10 @@ class DwarfCompileUnit final : public DwarfUnit {
DenseMap<const MDNode *, DIE *> AbstractSPDies;
DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
- /// \brief Construct a DIE for the given DbgVariable without initializing the
+ /// DWO ID for correlating skeleton and split units.
+ uint64_t DWOId = 0;
+
+ /// Construct a DIE for the given DbgVariable without initializing the
/// DbgVariable's DIE reference.
DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
@@ -141,7 +144,7 @@ public:
DwarfCompileUnit &getCU() override { return *this; }
- unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override;
+ unsigned getOrCreateSourceID(const DIFile *File) override;
void addImportedEntity(const DIImportedEntity* IE) {
DIScope *Scope = IE->getScope();
@@ -159,7 +162,7 @@ public:
void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End);
- /// \brief Find DIE for the given subprogram and attach appropriate
+ /// Find DIE for the given subprogram and attach appropriate
/// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
/// variables in this scope then create and insert DIEs for these
/// variables.
@@ -168,7 +171,7 @@ public:
void constructScopeDIE(LexicalScope *Scope,
SmallVectorImpl<DIE *> &FinalChildren);
- /// \brief A helper function to construct a RangeSpanList for a given
+ /// A helper function to construct a RangeSpanList for a given
/// lexical scope.
void addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range);
@@ -177,11 +180,11 @@ public:
void attachRangesOrLowHighPC(DIE &D,
const SmallVectorImpl<InsnRange> &Ranges);
- /// \brief This scope represents inlined body of a function. Construct
+ /// This scope represents the inlined body of a function. Construct
/// DIE to represent this concrete inlined copy of the function.
DIE *constructInlinedScopeDIE(LexicalScope *Scope);
- /// \brief Construct new DW_TAG_lexical_block for this scope and
+ /// Construct new DW_TAG_lexical_block for this scope and
/// attach DW_AT_low_pc/DW_AT_high_pc labels.
DIE *constructLexicalScopeDIE(LexicalScope *Scope);
@@ -196,14 +199,14 @@ public:
SmallVectorImpl<DIE *> &Children,
bool *HasNonScopeChildren = nullptr);
- /// \brief Construct a DIE for this subprogram scope.
+ /// Construct a DIE for this subprogram scope.
void constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope);
DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE);
void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
- /// \brief Construct import_module DIE.
+ /// Construct import_module DIE.
DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
void finishSubprogramDefinition(const DISubprogram *SP);
@@ -214,11 +217,18 @@ public:
DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
const DILocalVariable *&Cleansed);
DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
- void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
+ void createAbstractVariable(const DILocalVariable *Var, LexicalScope *Scope);
/// Set the skeleton unit associated with this unit.
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
+ unsigned getHeaderSize() const override {
+ // DWARF v5 added the DWO ID to the header for split/skeleton units.
+ unsigned DWOIdSize =
+ DD->getDwarfVersion() >= 5 && DD->useSplitDwarf() ? sizeof(uint64_t)
+ : 0;
+ return DwarfUnit::getHeaderSize() + DWOIdSize;
+ }
unsigned getLength() {
return sizeof(uint32_t) + // Length field
getHeaderSize() + getUnitDie().getSize();
@@ -290,6 +300,9 @@ public:
void setBaseAddress(const MCSymbol *Base) { BaseAddress = Base; }
const MCSymbol *getBaseAddress() const { return BaseAddress; }
+ uint64_t getDWOId() const { return DWOId; }
+ void setDWOId(uint64_t DwoId) { DWOId = DwoId; }
+
bool hasDwarfPubSections() const;
};
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 2c9c7d4f3146..8761fae9dd22 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -16,7 +16,6 @@
#include "DIEHash.h"
#include "DebugLocEntry.h"
#include "DebugLocStream.h"
-#include "DwarfAccelTable.h"
#include "DwarfCompileUnit.h"
#include "DwarfExpression.h"
#include "DwarfFile.h"
@@ -31,6 +30,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AccelTable.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/CodeGen/LexicalScopes.h"
@@ -39,7 +39,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
@@ -66,6 +65,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
@@ -94,6 +94,11 @@ static cl::opt<bool> GenerateARangeSection("generate-arange-section",
cl::desc("Generate dwarf aranges"),
cl::init(false));
+static cl::opt<bool>
+ GenerateDwarfTypeUnits("generate-type-units", cl::Hidden,
+ cl::desc("Generate DWARF4 type units."),
+ cl::init(false));
+
static cl::opt<bool> SplitDwarfCrossCuReferences(
"split-dwarf-cross-cu-references", cl::Hidden,
cl::desc("Enable cross-cu references in DWO files"), cl::init(false));
@@ -107,14 +112,40 @@ static cl::opt<DefaultOnOff> UnknownLocations(
clEnumVal(Enable, "In all cases"), clEnumVal(Disable, "Never")),
cl::init(Default));
+static cl::opt<AccelTableKind> AccelTables(
+ "accel-tables", cl::Hidden, cl::desc("Output dwarf accelerator tables."),
+ cl::values(clEnumValN(AccelTableKind::Default, "Default",
+ "Default for platform"),
+ clEnumValN(AccelTableKind::None, "Disable", "Disabled."),
+ clEnumValN(AccelTableKind::Apple, "Apple", "Apple"),
+ clEnumValN(AccelTableKind::Dwarf, "Dwarf", "DWARF")),
+ cl::init(AccelTableKind::Default));
+
static cl::opt<DefaultOnOff>
-DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
- cl::desc("Output prototype dwarf accelerator tables."),
+DwarfInlinedStrings("dwarf-inlined-strings", cl::Hidden,
+ cl::desc("Use inlined strings rather than string section."),
cl::values(clEnumVal(Default, "Default for platform"),
clEnumVal(Enable, "Enabled"),
clEnumVal(Disable, "Disabled")),
cl::init(Default));
+static cl::opt<bool>
+ NoDwarfPubSections("no-dwarf-pub-sections", cl::Hidden,
+ cl::desc("Disable emission of DWARF pub sections."),
+ cl::init(false));
+
+static cl::opt<bool>
+ NoDwarfRangesSection("no-dwarf-ranges-section", cl::Hidden,
+ cl::desc("Disable emission .debug_ranges section."),
+ cl::init(false));
+
+static cl::opt<DefaultOnOff> DwarfSectionsAsReferences(
+ "dwarf-sections-as-references", cl::Hidden,
+ cl::desc("Use sections+offset as references rather than labels."),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"), clEnumVal(Disable, "Disabled")),
+ cl::init(Default));
+
enum LinkageNameOption {
DefaultLinkageNames,
AllLinkageNames,
@@ -215,11 +246,11 @@ ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const {
return A.Expr->isFragment();
}) &&
"multiple FI expressions without DW_OP_LLVM_fragment");
- std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(),
- [](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool {
- return A.Expr->getFragmentInfo()->OffsetInBits <
- B.Expr->getFragmentInfo()->OffsetInBits;
- });
+ llvm::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(),
+ [](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool {
+ return A.Expr->getFragmentInfo()->OffsetInBits <
+ B.Expr->getFragmentInfo()->OffsetInBits;
+ });
return FrameIndexExprs;
}
@@ -258,23 +289,34 @@ void DbgVariable::addMMIEntry(const DbgVariable &V) {
"conflicting locations for variable");
}
-static const DwarfAccelTable::Atom TypeAtoms[] = {
- DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4),
- DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2),
- DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)};
+static AccelTableKind computeAccelTableKind(unsigned DwarfVersion,
+ bool GenerateTypeUnits,
+ DebuggerKind Tuning,
+ const Triple &TT) {
+ // Honor an explicit request.
+ if (AccelTables != AccelTableKind::Default)
+ return AccelTables;
+
+ // Accelerator tables with type units are currently not supported.
+ if (GenerateTypeUnits)
+ return AccelTableKind::None;
+
+ // Accelerator tables get emitted if targeting DWARF v5 or LLDB. DWARF v5
+ // always implies debug_names. For lower standard versions we use Apple
+ // accelerator tables on Apple platforms and debug_names elsewhere.
+ if (DwarfVersion >= 5)
+ return AccelTableKind::Dwarf;
+ if (Tuning == DebuggerKind::LLDB)
+ return TT.isOSBinFormatMachO() ? AccelTableKind::Apple
+ : AccelTableKind::Dwarf;
+ return AccelTableKind::None;
+}
DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
: DebugHandlerBase(A), DebugLocs(A->OutStreamer->isVerboseAsm()),
InfoHolder(A, "info_string", DIEValueAllocator),
SkeletonHolder(A, "skel_string", DIEValueAllocator),
- IsDarwin(A->TM.getTargetTriple().isOSDarwin()),
- AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
- dwarf::DW_FORM_data4)),
- AccelObjC(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
- dwarf::DW_FORM_data4)),
- AccelNamespace(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
- dwarf::DW_FORM_data4)),
- AccelTypes(TypeAtoms) {
+ IsDarwin(A->TM.getTargetTriple().isOSDarwin()) {
const Triple &TT = Asm->TM.getTargetTriple();
// Make sure we know our "debugger tuning." The target option takes
@@ -288,11 +330,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
else
DebuggerTuning = DebuggerKind::GDB;
- // Turn on accelerator tables for LLDB by default.
- if (DwarfAccelTables == Default)
- HasDwarfAccelTables = tuneForLLDB();
+ if (DwarfInlinedStrings == Default)
+ UseInlineStrings = TT.isNVPTX();
else
- HasDwarfAccelTables = DwarfAccelTables == Enable;
+ UseInlineStrings = DwarfInlinedStrings == Enable;
+
+ UseLocSection = !TT.isNVPTX();
HasAppleExtensionAttributes = tuneForLLDB();
@@ -308,8 +351,23 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion;
unsigned DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber
: MMI->getModule()->getDwarfVersion();
- // Use dwarf 4 by default if nothing is requested.
- DwarfVersion = DwarfVersion ? DwarfVersion : dwarf::DWARF_VERSION;
+ // Use dwarf 4 by default if nothing is requested. For NVPTX, use dwarf 2.
+ DwarfVersion =
+ TT.isNVPTX() ? 2 : (DwarfVersion ? DwarfVersion : dwarf::DWARF_VERSION);
+
+ UsePubSections = !NoDwarfPubSections && !TT.isNVPTX();
+ UseRangesSection = !NoDwarfRangesSection && !TT.isNVPTX();
+
+ // Use sections as references. Force for NVPTX.
+ if (DwarfSectionsAsReferences == Default)
+ UseSectionsAsReferences = TT.isNVPTX();
+ else
+ UseSectionsAsReferences = DwarfSectionsAsReferences == Enable;
+
+ GenerateTypeUnits = GenerateDwarfTypeUnits;
+
+ TheAccelTableKind = computeAccelTableKind(
+ DwarfVersion, GenerateTypeUnits, DebuggerTuning, A->TM.getTargetTriple());
// Work around a GDB bug. GDB doesn't support the standard opcode;
// SCE doesn't support GNU's; LLDB prefers the standard opcode, which
@@ -321,6 +379,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
// GDB does not fully support the DWARF 4 representation for bitfields.
UseDWARF2Bitfields = (DwarfVersion < 4) || tuneForGDB();
+ // The DWARF v5 string offsets table has - possibly shared - contributions
+ // from each compile and type unit, each preceded by a header. The string
+ // offsets table used by the pre-DWARF v5 split-DWARF implementation uses
+ // a monolithic string offsets table without any header.
+ UseSegmentedStringOffsetsTable = DwarfVersion >= 5;
+
Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion);
}
@@ -355,17 +419,18 @@ static StringRef getObjCMethodName(StringRef In) {
}
// Add the various names to the Dwarf accelerator table names.
-// TODO: Determine whether or not we should add names for programs
-// that do not have a DW_AT_name or DW_AT_linkage_name field - this
-// is only slightly different than the lookup of non-standard ObjC names.
void DwarfDebug::addSubprogramNames(const DISubprogram *SP, DIE &Die) {
if (!SP->isDefinition())
return;
- addAccelName(SP->getName(), Die);
- // If the linkage name is different than the name, go ahead and output
- // that as well into the name table.
- if (SP->getLinkageName() != "" && SP->getName() != SP->getLinkageName())
+ if (SP->getName() != "")
+ addAccelName(SP->getName(), Die);
+
+ // If the linkage name is different than the name, go ahead and output that as
+ // well into the name table. Only do that if we are going to actually emit
+ // that name.
+ if (SP->getLinkageName() != "" && SP->getName() != SP->getLinkageName() &&
+ (useAllLinkageNames() || InfoHolder.getAbstractSPDies().lookup(SP)))
addAccelName(SP->getLinkageName(), Die);
// If this is an Objective-C selector name add it to the ObjC accelerator
@@ -471,8 +536,9 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
// explicitly describe the directory of all files, never relying on the
// compilation directory.
if (!Asm->OutStreamer->hasRawTextSupport() || SingleCU)
- Asm->OutStreamer->getContext().setMCLineTableCompilationDir(
- NewCU.getUniqueID(), CompilationDir);
+ Asm->OutStreamer->emitDwarfFile0Directive(
+ CompilationDir, FN, NewCU.getMD5AsBytes(DIUnit->getFile()),
+ DIUnit->getSource(), NewCU.getUniqueID());
StringRef Producer = DIUnit->getProducer();
StringRef Flags = DIUnit->getFlags();
@@ -486,6 +552,10 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
DIUnit->getSourceLanguage());
NewCU.addString(Die, dwarf::DW_AT_name, FN);
+ // Add DW_str_offsets_base to the unit DIE, except for split units.
+ if (useSegmentedStringOffsetsTable() && !useSplitDwarf())
+ NewCU.addStringOffsetsStart();
+
if (!useSplitDwarf()) {
NewCU.initStmtList();
@@ -541,21 +611,22 @@ void DwarfDebug::constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
/// Sort and unique GVEs by comparing their fragment offset.
static SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &
sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
- std::sort(GVEs.begin(), GVEs.end(),
- [](DwarfCompileUnit::GlobalExpr A, DwarfCompileUnit::GlobalExpr B) {
- // Sort order: first null exprs, then exprs without fragment
- // info, then sort by fragment offset in bits.
- // FIXME: Come up with a more comprehensive comparator so
- // the sorting isn't non-deterministic, and so the following
- // std::unique call works correctly.
- if (!A.Expr || !B.Expr)
- return !!B.Expr;
- auto FragmentA = A.Expr->getFragmentInfo();
- auto FragmentB = B.Expr->getFragmentInfo();
- if (!FragmentA || !FragmentB)
- return !!FragmentB;
- return FragmentA->OffsetInBits < FragmentB->OffsetInBits;
- });
+ llvm::sort(GVEs.begin(), GVEs.end(),
+ [](DwarfCompileUnit::GlobalExpr A,
+ DwarfCompileUnit::GlobalExpr B) {
+ // Sort order: first null exprs, then exprs without fragment
+ // info, then sort by fragment offset in bits.
+ // FIXME: Come up with a more comprehensive comparator so
+ // the sorting isn't non-deterministic, and so the following
+ // std::unique call works correctly.
+ if (!A.Expr || !B.Expr)
+ return !!B.Expr;
+ auto FragmentA = A.Expr->getFragmentInfo();
+ auto FragmentB = B.Expr->getFragmentInfo();
+ if (!FragmentA || !FragmentB)
+ return !!FragmentB;
+ return FragmentA->OffsetInBits < FragmentB->OffsetInBits;
+ });
GVEs.erase(std::unique(GVEs.begin(), GVEs.end(),
[](DwarfCompileUnit::GlobalExpr A,
DwarfCompileUnit::GlobalExpr B) {
@@ -590,6 +661,19 @@ void DwarfDebug::beginModule() {
GVMap[GVE->getVariable()].push_back({&Global, GVE->getExpression()});
}
+ // Create the symbol that designates the start of the unit's contribution
+ // to the string offsets table. In a split DWARF scenario, only the skeleton
+ // unit has the DW_AT_str_offsets_base attribute (and hence needs the symbol).
+ if (useSegmentedStringOffsetsTable())
+ (useSplitDwarf() ? SkeletonHolder : InfoHolder)
+ .setStringOffsetsStartSym(Asm->createTempSymbol("str_offsets_base"));
+
+ // Create the symbol that designates the start of the DWARF v5 range list
+ // table. It is located past the header and before the offsets table.
+ if (getDwarfVersion() >= 5)
+ (useSplitDwarf() ? SkeletonHolder : InfoHolder)
+ .setRnglistsTableBaseSym(Asm->createTempSymbol("rnglists_table_base"));
+
for (DICompileUnit *CUNode : M->debug_compile_units()) {
// FIXME: Move local imported entities into a list attached to the
// subprogram, then this search won't be needed and a
@@ -694,11 +778,15 @@ void DwarfDebug::finalizeModuleInfo() {
// Emit a unique identifier for this CU.
uint64_t ID =
DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie());
- TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
- dwarf::DW_FORM_data8, ID);
- SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
- dwarf::DW_FORM_data8, ID);
-
+ if (getDwarfVersion() >= 5) {
+ TheCU.setDWOId(ID);
+ SkCU->setDWOId(ID);
+ } else {
+ TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+ SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+ }
// We don't keep track of which addresses are used in which CU so this
// is a bit pessimistic under LTO.
if (!AddrPool.isEmpty()) {
@@ -706,7 +794,7 @@ void DwarfDebug::finalizeModuleInfo() {
SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_addr_base,
Sym, Sym);
}
- if (!SkCU->getRangeLists().empty()) {
+ if (getDwarfVersion() < 5 && !SkCU->getRangeLists().empty()) {
const MCSymbol *Sym = TLOF.getDwarfRangesSection()->getBeginSymbol();
SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
Sym, Sym);
@@ -721,7 +809,7 @@ void DwarfDebug::finalizeModuleInfo() {
// ranges for all subprogram DIEs for mach-o.
DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
if (unsigned NumRanges = TheCU.getRanges().size()) {
- if (NumRanges > 1)
+ if (NumRanges > 1 && useRangesSection())
// A DW_AT_low_pc attribute may also be specified in combination with
// DW_AT_ranges to specify the default base address for use in
// location lists (see Section 2.6.2) and range lists (see Section
@@ -732,6 +820,10 @@ void DwarfDebug::finalizeModuleInfo() {
U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
}
+ if (getDwarfVersion() >= 5 && !useSplitDwarf() &&
+ !U.getRangeLists().empty())
+ U.addRnglistsBase();
+
auto *CUNode = cast<DICompileUnit>(P.first);
// If compile Unit has macros, emit "DW_AT_macro_info" attribute.
if (CUNode->getMacros())
@@ -799,11 +891,20 @@ void DwarfDebug::endModule() {
}
// Emit info into the dwarf accelerator table sections.
- if (useDwarfAccelTables()) {
+ switch (getAccelTableKind()) {
+ case AccelTableKind::Apple:
emitAccelNames();
emitAccelObjC();
emitAccelNamespaces();
emitAccelTypes();
+ break;
+ case AccelTableKind::Dwarf:
+ emitAccelDebugNames();
+ break;
+ case AccelTableKind::None:
+ break;
+ case AccelTableKind::Default:
+ llvm_unreachable("Default should have already been resolved.");
}
// Emit the pubnames and pubtypes sections if requested.
@@ -887,7 +988,7 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
}
-/// \brief If this and Next are describing different fragments of the same
+/// If this and Next are describing different fragments of the same
/// variable, merge them by appending Next's values to the current
/// list of values.
/// Return true if the merge was successful.
@@ -903,8 +1004,7 @@ bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
// sorted.
for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
for (; j < Next.Values.size(); ++j) {
- int res = DebugHandlerBase::fragmentCmp(
- cast<DIExpression>(Values[i].Expression),
+ int res = cast<DIExpression>(Values[i].Expression)->fragmentCmp(
cast<DIExpression>(Next.Values[j].Expression));
if (res == 0) // The two expressions overlap, we can't merge.
return false;
@@ -967,7 +1067,7 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
// If this fragment overlaps with any open ranges, truncate them.
const DIExpression *DIExpr = Begin->getDebugExpression();
auto Last = remove_if(OpenRanges, [&](DebugLocEntry::Value R) {
- return fragmentsOverlap(DIExpr, R.getExpression());
+ return DIExpr->fragmentsOverlap(R.getExpression());
});
OpenRanges.erase(Last, OpenRanges.end());
@@ -983,7 +1083,7 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
EndLabel = getLabelBeforeInsn(std::next(I)->first);
assert(EndLabel && "Forgot label after instruction ending a range!");
- DEBUG(dbgs() << "DotDebugLoc: " << *Begin << "\n");
+ LLVM_DEBUG(dbgs() << "DotDebugLoc: " << *Begin << "\n");
auto Value = getDebugLocValue(Begin);
DebugLocEntry Loc(StartLabel, EndLabel, Value);
@@ -1012,7 +1112,7 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
// Attempt to coalesce the ranges of two otherwise identical
// DebugLocEntries.
auto CurEntry = DebugLoc.rbegin();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << CurEntry->getValues().size() << " Values:\n";
for (auto &Value : CurEntry->getValues())
Value.dump();
@@ -1131,6 +1231,9 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
RegVar->initializeDbgValue(MInsn);
continue;
}
+ // Do not emit location lists if the .debug_loc section is disabled.
+ if (!useLocSection())
+ continue;
// Handle multiple DBG_VALUE instructions describing one variable.
DebugLocStream::ListBuilder List(DebugLocs, TheCU, *Asm, *RegVar, *MInsn);
@@ -1151,10 +1254,12 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
}
// Collect info for variables that were optimized out.
- for (const DILocalVariable *DV : SP->getVariables()) {
- if (Processed.insert(InlinedVariable(DV, nullptr)).second)
- if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope()))
- createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr));
+ for (const DINode *DN : SP->getRetainedNodes()) {
+ if (auto *DV = dyn_cast<DILocalVariable>(DN)) {
+ if (Processed.insert(InlinedVariable(DV, nullptr)).second)
+ if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope()))
+ createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr));
+ }
}
}
@@ -1168,7 +1273,9 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
return;
// Check if source location changes, but ignore DBG_VALUE and CFI locations.
- if (MI->isMetaInstruction())
+ // If the instruction is part of the function frame setup code, do not emit
+ // any line record, as there is no correspondence with any user code.
+ if (MI->isMetaInstruction() || MI->getFlag(MachineInstr::FrameSetup))
return;
const DebugLoc &DL = MI->getDebugLoc();
// When we emit a line-0 record, we don't update PrevInstLoc; so look at
@@ -1333,14 +1440,16 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
// Construct abstract scopes.
for (LexicalScope *AScope : LScopes.getAbstractScopesList()) {
auto *SP = cast<DISubprogram>(AScope->getScopeNode());
- // Collect info for variables that were optimized out.
- for (const DILocalVariable *DV : SP->getVariables()) {
- if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
- continue;
- ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr),
- DV->getScope());
- assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
- && "ensureAbstractVariableIsCreated inserted abstract scopes");
+ for (const DINode *DN : SP->getRetainedNodes()) {
+ if (auto *DV = dyn_cast<DILocalVariable>(DN)) {
+ // Collect info for variables that were optimized out.
+ if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
+ continue;
+ ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr),
+ DV->getScope());
+ assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
+ && "ensureAbstractVariableIsCreated inserted abstract scopes");
+ }
}
constructAbstractSubprogramScopeDIE(TheCU, AScope);
}
@@ -1366,21 +1475,19 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
unsigned Flags) {
StringRef Fn;
- StringRef Dir;
- unsigned Src = 1;
+ unsigned FileNo = 1;
unsigned Discriminator = 0;
if (auto *Scope = cast_or_null<DIScope>(S)) {
Fn = Scope->getFilename();
- Dir = Scope->getDirectory();
if (Line != 0 && getDwarfVersion() >= 4)
if (auto *LBF = dyn_cast<DILexicalBlockFile>(Scope))
Discriminator = LBF->getDiscriminator();
unsigned CUID = Asm->OutStreamer->getContext().getDwarfCompileUnitID();
- Src = static_cast<DwarfCompileUnit &>(*InfoHolder.getUnits()[CUID])
- .getOrCreateSourceID(Fn, Dir);
+ FileNo = static_cast<DwarfCompileUnit &>(*InfoHolder.getUnits()[CUID])
+ .getOrCreateSourceID(Scope->getFile());
}
- Asm->OutStreamer->EmitDwarfLocDirective(Src, Line, Col, Flags, 0,
+ Asm->OutStreamer->EmitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
Discriminator, Fn);
}
@@ -1401,13 +1508,30 @@ void DwarfDebug::emitAbbreviations() {
Holder.emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevSection());
}
-void DwarfDebug::emitAccel(DwarfAccelTable &Accel, MCSection *Section,
+void DwarfDebug::emitStringOffsetsTableHeader() {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ Holder.getStringPool().emitStringOffsetsTableHeader(
+ *Asm, Asm->getObjFileLowering().getDwarfStrOffSection(),
+ Holder.getStringOffsetsStartSym());
+}
+
+template <typename AccelTableT>
+void DwarfDebug::emitAccel(AccelTableT &Accel, MCSection *Section,
StringRef TableName) {
- Accel.FinalizeTable(Asm, TableName);
Asm->OutStreamer->SwitchSection(Section);
// Emit the full data.
- Accel.emit(Asm, Section->getBeginSymbol(), this);
+ emitAppleAccelTable(Asm, Accel, TableName, Section->getBeginSymbol());
+}
+
+void DwarfDebug::emitAccelDebugNames() {
+ // Don't emit anything if we have no compilation units to index.
+ if (getUnits().empty())
+ return;
+
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfDebugNamesSection());
+ emitDWARF5AccelTable(Asm, AccelDebugNames, *this, getUnits());
}
// Emit visible names into a hashed accelerator table section.
@@ -1525,6 +1649,14 @@ void DwarfDebug::emitDebugPubSections() {
}
}
+void DwarfDebug::emitSectionReference(const DwarfCompileUnit &CU) {
+ if (useSectionsAsReferences())
+ Asm->EmitDwarfOffset(CU.getSection()->getBeginSymbol(),
+ CU.getDebugSectionOffset());
+ else
+ Asm->emitDwarfSymbolReference(CU.getLabelBegin());
+}
+
void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
DwarfCompileUnit *TheU,
const StringMap<const DIE *> &Globals) {
@@ -1540,13 +1672,13 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
Asm->OutStreamer->EmitLabel(BeginLabel);
Asm->OutStreamer->AddComment("DWARF Version");
- Asm->EmitInt16(dwarf::DW_PUBNAMES_VERSION);
+ Asm->emitInt16(dwarf::DW_PUBNAMES_VERSION);
Asm->OutStreamer->AddComment("Offset of Compilation Unit Info");
- Asm->emitDwarfSymbolReference(TheU->getLabelBegin());
+ emitSectionReference(*TheU);
Asm->OutStreamer->AddComment("Compilation Unit Length");
- Asm->EmitInt32(TheU->getLength());
+ Asm->emitInt32(TheU->getLength());
// Emit the pubnames for this compilation unit.
for (const auto &GI : Globals) {
@@ -1554,14 +1686,14 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
const DIE *Entity = GI.second;
Asm->OutStreamer->AddComment("DIE offset");
- Asm->EmitInt32(Entity->getOffset());
+ Asm->emitInt32(Entity->getOffset());
if (GnuStyle) {
dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheU, Entity);
Asm->OutStreamer->AddComment(
Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
- Asm->EmitInt8(Desc.toBits());
+ Asm->emitInt8(Desc.toBits());
}
Asm->OutStreamer->AddComment("External Name");
@@ -1569,14 +1701,20 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
}
Asm->OutStreamer->AddComment("End Mark");
- Asm->EmitInt32(0);
+ Asm->emitInt32(0);
Asm->OutStreamer->EmitLabel(EndLabel);
}
/// Emit null-terminated strings into a debug str section.
void DwarfDebug::emitDebugStr() {
+ MCSection *StringOffsetsSection = nullptr;
+ if (useSegmentedStringOffsetsTable()) {
+ emitStringOffsetsTableHeader();
+ StringOffsetsSection = Asm->getObjFileLowering().getDwarfStrOffSection();
+ }
DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
- Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection());
+ Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection(),
+ StringOffsetsSection, /* UseRelativeOffsets = */ true);
}
void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
@@ -1589,7 +1727,6 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
}
static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
- ByteStreamer &Streamer,
const DebugLocEntry::Value &Value,
DwarfExpression &DwarfExpr) {
auto *DIExpr = Value.getExpression();
@@ -1634,11 +1771,11 @@ void DebugLocEntry::finalize(const AsmPrinter &AP,
"fragments are expected to be sorted");
for (auto Fragment : Values)
- emitDebugLocValue(AP, BT, Streamer, Fragment, DwarfExpr);
+ emitDebugLocValue(AP, BT, Fragment, DwarfExpr);
} else {
assert(Values.size() == 1 && "only fragments may have >1 value");
- emitDebugLocValue(AP, BT, Streamer, Value, DwarfExpr);
+ emitDebugLocValue(AP, BT, Value, DwarfExpr);
}
DwarfExpr.finalize();
}
@@ -1646,7 +1783,7 @@ void DebugLocEntry::finalize(const AsmPrinter &AP,
void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) {
// Emit the size.
Asm->OutStreamer->AddComment("Loc expr size");
- Asm->EmitInt16(DebugLocs.getBytes(Entry).size());
+ Asm->emitInt16(DebugLocs.getBytes(Entry).size());
// Emit the entry.
APByteStreamer Streamer(*Asm);
@@ -1694,14 +1831,14 @@ void DwarfDebug::emitDebugLocDWO() {
// rather than two. We could get fancier and try to, say, reuse an
// address we know we've emitted elsewhere (the start of the function?
// The start of the CU or CU subrange that encloses this range?)
- Asm->EmitInt8(dwarf::DW_LLE_startx_length);
+ Asm->emitInt8(dwarf::DW_LLE_startx_length);
unsigned idx = AddrPool.getIndex(Entry.BeginSym);
Asm->EmitULEB128(idx);
Asm->EmitLabelDifference(Entry.EndSym, Entry.BeginSym, 4);
emitDebugLocEntryLocation(Entry);
}
- Asm->EmitInt8(dwarf::DW_LLE_end_of_list);
+ Asm->emitInt8(dwarf::DW_LLE_end_of_list);
}
}
@@ -1752,7 +1889,7 @@ void DwarfDebug::emitDebugARanges() {
}
// Sort the symbols by offset within the section.
- std::sort(
+ std::stable_sort(
List.begin(), List.end(), [&](const SymbolCU &A, const SymbolCU &B) {
unsigned IA = A.Sym ? Asm->OutStreamer->GetSymbolOrder(A.Sym) : 0;
unsigned IB = B.Sym ? Asm->OutStreamer->GetSymbolOrder(B.Sym) : 0;
@@ -1801,10 +1938,10 @@ void DwarfDebug::emitDebugARanges() {
}
// Sort the CU list (again, to ensure consistent output order).
- std::sort(CUs.begin(), CUs.end(),
- [](const DwarfCompileUnit *A, const DwarfCompileUnit *B) {
- return A->getUniqueID() < B->getUniqueID();
- });
+ llvm::sort(CUs.begin(), CUs.end(),
+ [](const DwarfCompileUnit *A, const DwarfCompileUnit *B) {
+ return A->getUniqueID() < B->getUniqueID();
+ });
// Emit an arange table for each CU we used.
for (DwarfCompileUnit *CU : CUs) {
@@ -1832,15 +1969,15 @@ void DwarfDebug::emitDebugARanges() {
// For each compile unit, write the list of spans it covers.
Asm->OutStreamer->AddComment("Length of ARange Set");
- Asm->EmitInt32(ContentSize);
+ Asm->emitInt32(ContentSize);
Asm->OutStreamer->AddComment("DWARF Arange version number");
- Asm->EmitInt16(dwarf::DW_ARANGES_VERSION);
+ Asm->emitInt16(dwarf::DW_ARANGES_VERSION);
Asm->OutStreamer->AddComment("Offset Into Debug Info Section");
- Asm->emitDwarfSymbolReference(CU->getLabelBegin());
+ emitSectionReference(*CU);
Asm->OutStreamer->AddComment("Address Size (in bytes)");
- Asm->EmitInt8(PtrSize);
+ Asm->emitInt8(PtrSize);
Asm->OutStreamer->AddComment("Segment Size (in bytes)");
- Asm->EmitInt8(0);
+ Asm->emitInt8(0);
Asm->OutStreamer->emitFill(Padding, 0xff);
@@ -1867,17 +2004,151 @@ void DwarfDebug::emitDebugARanges() {
}
}
-/// Emit address ranges into a debug ranges section.
+/// Emit a single range list. We handle both DWARF v5 and earlier.
+static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
+ const RangeSpanList &List) {
+
+ auto DwarfVersion = CU->getDwarfVersion();
+ // Emit our symbol so we can find the beginning of the range.
+ Asm->OutStreamer->EmitLabel(List.getSym());
+ // Gather all the ranges that apply to the same section so they can share
+ // a base address entry.
+ MapVector<const MCSection *, std::vector<const RangeSpan *>> SectionRanges;
+ // Size for our labels.
+ auto Size = Asm->MAI->getCodePointerSize();
+
+ for (const RangeSpan &Range : List.getRanges())
+ SectionRanges[&Range.getStart()->getSection()].push_back(&Range);
+
+ auto *CUBase = CU->getBaseAddress();
+ bool BaseIsSet = false;
+ for (const auto &P : SectionRanges) {
+ // Don't bother with a base address entry if there's only one range in
+ // this section in this range list - for example ranges for a CU will
+ // usually consist of single regions from each of many sections
+ // (-ffunction-sections, or just C++ inline functions) except under LTO
+ // or optnone where there may be holes in a single CU's section
+ // contributions.
+ auto *Base = CUBase;
+ if (!Base && P.second.size() > 1 &&
+ (UseDwarfRangesBaseAddressSpecifier || DwarfVersion >= 5)) {
+ BaseIsSet = true;
+ // FIXME/use care: This may not be a useful base address if it's not
+ // the lowest address/range in this object.
+ Base = P.second.front()->getStart();
+ if (DwarfVersion >= 5) {
+ Asm->OutStreamer->AddComment("DW_RLE_base_address");
+ Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_address, 1);
+ } else
+ Asm->OutStreamer->EmitIntValue(-1, Size);
+ Asm->OutStreamer->AddComment(" base address");
+ Asm->OutStreamer->EmitSymbolValue(Base, Size);
+ } else if (BaseIsSet && DwarfVersion < 5) {
+ BaseIsSet = false;
+ assert(!Base);
+ Asm->OutStreamer->EmitIntValue(-1, Size);
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ }
+
+ for (const auto *RS : P.second) {
+ const MCSymbol *Begin = RS->getStart();
+ const MCSymbol *End = RS->getEnd();
+ assert(Begin && "Range without a begin symbol?");
+ assert(End && "Range without an end symbol?");
+ if (Base) {
+ if (DwarfVersion >= 5) {
+ // Emit DW_RLE_offset_pair when we have a base.
+ Asm->OutStreamer->AddComment("DW_RLE_offset_pair");
+ Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_offset_pair, 1);
+ Asm->OutStreamer->AddComment(" starting offset");
+ Asm->EmitLabelDifferenceAsULEB128(Begin, Base);
+ Asm->OutStreamer->AddComment(" ending offset");
+ Asm->EmitLabelDifferenceAsULEB128(End, Base);
+ } else {
+ Asm->EmitLabelDifference(Begin, Base, Size);
+ Asm->EmitLabelDifference(End, Base, Size);
+ }
+ } else if (DwarfVersion >= 5) {
+ Asm->OutStreamer->AddComment("DW_RLE_start_length");
+ Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_start_length, 1);
+ Asm->OutStreamer->AddComment(" start");
+ Asm->OutStreamer->EmitSymbolValue(Begin, Size);
+ Asm->OutStreamer->AddComment(" length");
+ Asm->EmitLabelDifferenceAsULEB128(End, Begin);
+ } else {
+ Asm->OutStreamer->EmitSymbolValue(Begin, Size);
+ Asm->OutStreamer->EmitSymbolValue(End, Size);
+ }
+ }
+ }
+ if (DwarfVersion >= 5) {
+ Asm->OutStreamer->AddComment("DW_RLE_end_of_list");
+ Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_end_of_list, 1);
+ } else {
+ // Terminate the list with two 0 values.
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ }
+}
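
To make the entry encoding above concrete, here is a minimal, self-contained sketch (not the LLVM implementation) of how the DWARF v5 .debug_rnglists entries that emitRangeList produces are laid out: one shared DW_RLE_base_address per section, compact ULEB128 offset pairs for each range, and a terminating DW_RLE_end_of_list. Addresses are plain uint64_t values here instead of MCSymbols, and an 8-byte little-endian address size is assumed.

```cpp
// Sketch of DWARF v5 range-list entry encoding (not the LLVM code).
#include <cstdint>
#include <utility>
#include <vector>

enum : uint8_t {
  DW_RLE_end_of_list  = 0x00,
  DW_RLE_offset_pair  = 0x04,
  DW_RLE_base_address = 0x05,
  DW_RLE_start_length = 0x07,
};

static void emitULEB128(std::vector<uint8_t> &Out, uint64_t V) {
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V) Byte |= 0x80;
    Out.push_back(Byte);
  } while (V);
}

static void emitAddress(std::vector<uint8_t> &Out, uint64_t Addr) {
  for (int I = 0; I < 8; ++I) // assume 8-byte little-endian addresses
    Out.push_back(uint8_t(Addr >> (8 * I)));
}

// Ranges that share a section can share one DW_RLE_base_address entry and
// then use compact ULEB128 offset pairs, as emitRangeList does above.
std::vector<uint8_t>
encodeRangeList(uint64_t Base,
                const std::vector<std::pair<uint64_t, uint64_t>> &Ranges) {
  std::vector<uint8_t> Out;
  Out.push_back(DW_RLE_base_address);
  emitAddress(Out, Base);
  for (const auto &R : Ranges) {
    Out.push_back(DW_RLE_offset_pair);
    emitULEB128(Out, R.first - Base);  // starting offset
    emitULEB128(Out, R.second - Base); // ending offset
  }
  Out.push_back(DW_RLE_end_of_list);
  return Out;
}
```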
+
+// Emit the header of a DWARF 5 range list table. Returns the symbol that
+// designates the end of the table for the caller to emit when the table is
+// complete.
+static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm, DwarfFile &Holder) {
+ // The length is described by a starting label right after the length field
+ // and an end label.
+ MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
+ MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
+ // Build the range table header, which starts with the length field.
+ Asm->EmitLabelDifference(TableEnd, TableStart, 4);
+ Asm->OutStreamer->EmitLabel(TableStart);
+ // Version number (DWARF v5 and later).
+ Asm->emitInt16(Asm->OutStreamer->getContext().getDwarfVersion());
+ // Address size.
+ Asm->emitInt8(Asm->MAI->getCodePointerSize());
+ // Segment selector size.
+ Asm->emitInt8(0);
+
+ MCSymbol *RnglistTableBaseSym = Holder.getRnglistsTableBaseSym();
+
+ // FIXME: Generate the offsets table and use DW_FORM_rnglistx with the
+ // DW_AT_ranges attribute. Until then set the number of offsets to 0.
+ Asm->emitInt32(0);
+ Asm->OutStreamer->EmitLabel(RnglistTableBaseSym);
+ return TableEnd;
+}
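
For reference, a hedged sketch of the .debug_rnglists table header that emitRnglistsTableHeader writes (DWARF32 form, with the offset table omitted, i.e. offset_entry_count == 0 as in the FIXME above). The real code computes the length as a label difference; this sketch just builds the same byte layout directly.

```cpp
// Sketch of the DWARF v5 .debug_rnglists table header layout (DWARF32).
#include <cstdint>
#include <vector>

static void append32(std::vector<uint8_t> &Out, uint32_t V) {
  for (int I = 0; I < 4; ++I) Out.push_back(uint8_t(V >> (8 * I)));
}
static void append16(std::vector<uint8_t> &Out, uint16_t V) {
  Out.push_back(uint8_t(V)); Out.push_back(uint8_t(V >> 8));
}

std::vector<uint8_t> rnglistsHeader(uint32_t PayloadSize, uint8_t AddrSize) {
  std::vector<uint8_t> Out;
  // unit_length: everything after this field, i.e. the 8 remaining header
  // bytes plus the range lists themselves.
  append32(Out, 8 + PayloadSize);
  append16(Out, 5);        // version: DWARF v5
  Out.push_back(AddrSize); // address_size
  Out.push_back(0);        // segment_selector_size
  append32(Out, 0);        // offset_entry_count (no DW_FORM_rnglistx yet)
  return Out;
}
```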
+
+/// Emit address ranges into the .debug_ranges section or into the DWARF v5
+/// .debug_rnglists section.
void DwarfDebug::emitDebugRanges() {
if (CUMap.empty())
return;
- // Start the dwarf ranges section.
- Asm->OutStreamer->SwitchSection(
- Asm->getObjFileLowering().getDwarfRangesSection());
+ auto NoRangesPresent = [this]() {
+ return llvm::all_of(
+ CUMap, [](const decltype(CUMap)::const_iterator::value_type &Pair) {
+ return Pair.second->getRangeLists().empty();
+ });
+ };
- // Size for our labels.
- unsigned char Size = Asm->MAI->getCodePointerSize();
+ if (!useRangesSection()) {
+ assert(NoRangesPresent() && "No debug ranges expected.");
+ return;
+ }
+
+ if (getDwarfVersion() >= 5 && NoRangesPresent())
+ return;
+
+ // Start the dwarf ranges section.
+ MCSymbol *TableEnd = nullptr;
+ if (getDwarfVersion() >= 5) {
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfRnglistsSection());
+ TableEnd = emitRnglistsTableHeader(Asm, useSplitDwarf() ? SkeletonHolder
+ : InfoHolder);
+ } else
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfRangesSection());
// Grab the specific ranges for the compile units in the module.
for (const auto &I : CUMap) {
@@ -1887,61 +2158,12 @@ void DwarfDebug::emitDebugRanges() {
TheCU = Skel;
// Iterate over the misc ranges for the compile units in the module.
- for (const RangeSpanList &List : TheCU->getRangeLists()) {
- // Emit our symbol so we can find the beginning of the range.
- Asm->OutStreamer->EmitLabel(List.getSym());
-
- // Gather all the ranges that apply to the same section so they can share
- // a base address entry.
- MapVector<const MCSection *, std::vector<const RangeSpan *>> MV;
- for (const RangeSpan &Range : List.getRanges()) {
- MV[&Range.getStart()->getSection()].push_back(&Range);
- }
-
- auto *CUBase = TheCU->getBaseAddress();
- bool BaseIsSet = false;
- for (const auto &P : MV) {
- // Don't bother with a base address entry if there's only one range in
- // this section in this range list - for example ranges for a CU will
- // usually consist of single regions from each of many sections
- // (-ffunction-sections, or just C++ inline functions) except under LTO
- // or optnone where there may be holes in a single CU's section
- // contrubutions.
- auto *Base = CUBase;
- if (!Base && P.second.size() > 1 &&
- UseDwarfRangesBaseAddressSpecifier) {
- BaseIsSet = true;
- // FIXME/use care: This may not be a useful base address if it's not
- // the lowest address/range in this object.
- Base = P.second.front()->getStart();
- Asm->OutStreamer->EmitIntValue(-1, Size);
- Asm->OutStreamer->EmitSymbolValue(Base, Size);
- } else if (BaseIsSet) {
- BaseIsSet = false;
- Asm->OutStreamer->EmitIntValue(-1, Size);
- Asm->OutStreamer->EmitIntValue(0, Size);
- }
-
- for (const auto *RS : P.second) {
- const MCSymbol *Begin = RS->getStart();
- const MCSymbol *End = RS->getEnd();
- assert(Begin && "Range without a begin symbol?");
- assert(End && "Range without an end symbol?");
- if (Base) {
- Asm->EmitLabelDifference(Begin, Base, Size);
- Asm->EmitLabelDifference(End, Base, Size);
- } else {
- Asm->OutStreamer->EmitSymbolValue(Begin, Size);
- Asm->OutStreamer->EmitSymbolValue(End, Size);
- }
- }
- }
-
- // And terminate the list with two 0 values.
- Asm->OutStreamer->EmitIntValue(0, Size);
- Asm->OutStreamer->EmitIntValue(0, Size);
- }
+ for (const RangeSpanList &List : TheCU->getRangeLists())
+ emitRangeList(Asm, TheCU, List);
}
+
+ if (TableEnd)
+ Asm->OutStreamer->EmitLabel(TableEnd);
}
void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
@@ -1963,20 +2185,17 @@ void DwarfDebug::emitMacro(DIMacro &M) {
Asm->OutStreamer->EmitBytes(Name);
if (!Value.empty()) {
// There should be one space between macro name and macro value.
- Asm->EmitInt8(' ');
+ Asm->emitInt8(' ');
Asm->OutStreamer->EmitBytes(Value);
}
- Asm->EmitInt8('\0');
+ Asm->emitInt8('\0');
}
void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) {
assert(F.getMacinfoType() == dwarf::DW_MACINFO_start_file);
Asm->EmitULEB128(dwarf::DW_MACINFO_start_file);
Asm->EmitULEB128(F.getLine());
- DIFile *File = F.getFile();
- unsigned FID =
- U.getOrCreateSourceID(File->getFilename(), File->getDirectory());
- Asm->EmitULEB128(FID);
+ Asm->EmitULEB128(U.getOrCreateSourceID(F.getFile()));
handleMacroNodes(F.getElements(), U);
Asm->EmitULEB128(dwarf::DW_MACINFO_end_file);
}
@@ -1995,11 +2214,14 @@ void DwarfDebug::emitDebugMacinfo() {
auto *SkCU = TheCU.getSkeleton();
DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
auto *CUNode = cast<DICompileUnit>(P.first);
- Asm->OutStreamer->EmitLabel(U.getMacroLabelBegin());
- handleMacroNodes(CUNode->getMacros(), U);
+ DIMacroNodeArray Macros = CUNode->getMacros();
+ if (!Macros.empty()) {
+ Asm->OutStreamer->EmitLabel(U.getMacroLabelBegin());
+ handleMacroNodes(Macros, U);
+ }
}
Asm->OutStreamer->AddComment("End Of Macro List Mark");
- Asm->EmitInt8(0);
+ Asm->emitInt8(0);
}
// DWARF5 Experimental Separate Dwarf emitters.
@@ -2017,9 +2239,6 @@ void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
SkeletonHolder.addUnit(std::move(NewU));
}
-// This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list,
-// DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id,
-// DW_AT_addr_base, DW_AT_ranges_base.
DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) {
auto OwnedUnit = llvm::make_unique<DwarfCompileUnit>(
@@ -2029,6 +2248,9 @@ DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) {
NewCU.initStmtList();
+ if (useSegmentedStringOffsetsTable())
+ NewCU.addStringOffsetsStart();
+
initSkeletonUnit(CU, NewCU.getUnitDie(), std::move(OwnedUnit));
return NewCU;
@@ -2051,26 +2273,37 @@ void DwarfDebug::emitDebugAbbrevDWO() {
void DwarfDebug::emitDebugLineDWO() {
assert(useSplitDwarf() && "No split dwarf?");
- Asm->OutStreamer->SwitchSection(
+ SplitTypeUnitFileTable.Emit(
+ *Asm->OutStreamer, MCDwarfLineTableParams(),
Asm->getObjFileLowering().getDwarfLineDWOSection());
- SplitTypeUnitFileTable.Emit(*Asm->OutStreamer, MCDwarfLineTableParams());
+}
+
+void DwarfDebug::emitStringOffsetsTableHeaderDWO() {
+ assert(useSplitDwarf() && "No split dwarf?");
+ InfoHolder.getStringPool().emitStringOffsetsTableHeader(
+ *Asm, Asm->getObjFileLowering().getDwarfStrOffDWOSection(),
+ InfoHolder.getStringOffsetsStartSym());
}
// Emit the .debug_str.dwo section for separated dwarf. This contains the
// string section and is identical in format to traditional .debug_str
// sections.
void DwarfDebug::emitDebugStrDWO() {
+ if (useSegmentedStringOffsetsTable())
+ emitStringOffsetsTableHeaderDWO();
assert(useSplitDwarf() && "No split dwarf?");
MCSection *OffSec = Asm->getObjFileLowering().getDwarfStrOffDWOSection();
InfoHolder.emitStrings(Asm->getObjFileLowering().getDwarfStrDWOSection(),
- OffSec);
+ OffSec, /* UseRelativeOffsets = */ false);
}
MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) {
if (!useSplitDwarf())
return nullptr;
- if (SingleCU)
- SplitTypeUnitFileTable.setCompilationDir(CU.getCUNode()->getDirectory());
+ const DICompileUnit *DIUnit = CU.getCUNode();
+ SplitTypeUnitFileTable.maybeSetRootFile(
+ DIUnit->getDirectory(), DIUnit->getFilename(),
+ CU.getMD5AsBytes(DIUnit->getFile()), DIUnit->getSource());
return &SplitTypeUnitFileTable;
}
@@ -2119,10 +2352,16 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
if (useSplitDwarf())
NewTU.setSection(Asm->getObjFileLowering().getDwarfTypesDWOSection());
else {
- CU.applyStmtList(UnitDie);
NewTU.setSection(Asm->getObjFileLowering().getDwarfTypesSection(Signature));
+ // Non-split type units reuse the compile unit's line table.
+ CU.applyStmtList(UnitDie);
}
+ // Add DW_AT_str_offsets_base to the type unit DIE, but not for split type
+ // units.
+ if (useSegmentedStringOffsetsTable() && !useSplitDwarf())
+ NewTU.addStringOffsetsStart();
+
NewTU.setType(NewTU.createTypeDIE(CTy));
if (TopLevelType) {
@@ -2157,32 +2396,50 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
CU.addDIETypeSignature(RefDie, Signature);
}
-// Accelerator table mutators - add each name along with its companion
-// DIE to the proper table while ensuring that the name that we're going
-// to reference is in the string table. We do this since the names we
-// add may not only be identical to the names in the DIE.
-void DwarfDebug::addAccelName(StringRef Name, const DIE &Die) {
- if (!useDwarfAccelTables())
+// Add the Name along with its companion DIE to the appropriate accelerator
+// table (for AccelTableKind::Dwarf it's always AccelDebugNames, for
+// AccelTableKind::Apple, we use the table we got as an argument). If
+// accelerator tables are disabled, this function does nothing.
+template <typename DataT>
+void DwarfDebug::addAccelNameImpl(AccelTable<DataT> &AppleAccel, StringRef Name,
+ const DIE &Die) {
+ if (getAccelTableKind() == AccelTableKind::None)
return;
- AccelNames.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ DwarfStringPoolEntryRef Ref =
+ Holder.getStringPool().getEntry(*Asm, Name);
+
+ switch (getAccelTableKind()) {
+ case AccelTableKind::Apple:
+ AppleAccel.addName(Ref, Die);
+ break;
+ case AccelTableKind::Dwarf:
+ AccelDebugNames.addName(Ref, Die);
+ break;
+ case AccelTableKind::Default:
+ llvm_unreachable("Default should have already been resolved.");
+ case AccelTableKind::None:
+ llvm_unreachable("None handled above");
+ }
+}
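
As a rough illustration of the dispatch in addAccelNameImpl, the sketch below uses plain string containers standing in for the Apple and DWARF v5 tables; the struct, field names, and Accel type are invented for this example only.

```cpp
// Illustrative dispatch on an accelerator-table kind (not LLVM's types).
#include <cassert>
#include <string>
#include <vector>

enum class AccelTableKind { Default, None, Apple, Dwarf };

struct Accel {
  AccelTableKind Kind = AccelTableKind::Apple;
  std::vector<std::string> AppleNames;  // stands in for .apple_names
  std::vector<std::string> DebugNames;  // stands in for .debug_names (v5)

  void addName(const std::string &Name) {
    switch (Kind) {
    case AccelTableKind::Apple:
      AppleNames.push_back(Name);
      break;
    case AccelTableKind::Dwarf:
      DebugNames.push_back(Name);
      break;
    case AccelTableKind::None:
      break; // accelerator tables disabled: nothing to record
    case AccelTableKind::Default:
      assert(false && "should have been resolved to a concrete kind");
    }
  }
};
```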
+
+void DwarfDebug::addAccelName(StringRef Name, const DIE &Die) {
+ addAccelNameImpl(AccelNames, Name, Die);
}
void DwarfDebug::addAccelObjC(StringRef Name, const DIE &Die) {
- if (!useDwarfAccelTables())
- return;
- AccelObjC.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+ // ObjC names go only into the Apple accelerator tables.
+ if (getAccelTableKind() == AccelTableKind::Apple)
+ addAccelNameImpl(AccelObjC, Name, Die);
}
void DwarfDebug::addAccelNamespace(StringRef Name, const DIE &Die) {
- if (!useDwarfAccelTables())
- return;
- AccelNamespace.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+ addAccelNameImpl(AccelNamespace, Name, Die);
}
void DwarfDebug::addAccelType(StringRef Name, const DIE &Die, char Flags) {
- if (!useDwarfAccelTables())
- return;
- AccelTypes.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+ addAccelNameImpl(AccelTypes, Name, Die);
}
uint16_t DwarfDebug::getDwarfVersion() const {
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 2ae0b418a91e..0c7be5d27dfe 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -18,7 +18,6 @@
#include "DbgValueHistoryCalculator.h"
#include "DebugHandlerBase.h"
#include "DebugLocStream.h"
-#include "DwarfAccelTable.h"
#include "DwarfFile.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -31,6 +30,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AccelTable.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
@@ -192,6 +192,14 @@ struct SymbolCU {
DwarfCompileUnit *CU;
};
+/// The kind of accelerator tables we should emit.
+enum class AccelTableKind {
+ Default, ///< Platform default.
+ None, ///< None.
+ Apple, ///< .apple_names, .apple_namespaces, .apple_types, .apple_objc.
+ Dwarf, ///< DWARF v5 .debug_names.
+};
+
/// Collects and handles dwarf debug information.
class DwarfDebug : public DebugHandlerBase {
/// All DIEValues are allocated through this allocator.
@@ -255,12 +263,37 @@ class DwarfDebug : public DebugHandlerBase {
/// Whether to emit all linkage names, or just abstract subprograms.
bool UseAllLinkageNames;
+ /// Use inlined strings.
+ bool UseInlineStrings = false;
+
+ /// Whether to emit DWARF pub sections or not.
+ bool UsePubSections = true;
+
+ /// Allow emission of .debug_ranges section.
+ bool UseRangesSection = true;
+
+ /// True if the sections themselves must be used as references, without
+ /// creating temporary symbols inside DWARF sections.

+ bool UseSectionsAsReferences = false;
+
+ /// Allow emission of the .debug_loc section.
+ bool UseLocSection = true;
+
+ /// Generate DWARF v4 type units.
+ bool GenerateTypeUnits;
+
/// DWARF5 Experimental Options
/// @{
- bool HasDwarfAccelTables;
+ AccelTableKind TheAccelTableKind;
bool HasAppleExtensionAttributes;
bool HasSplitDwarf;
+ /// Whether to generate the DWARF v5 string offsets table.
+ /// It consists of a series of contributions, each preceded by a header.
+ /// The pre-DWARF v5 string offsets table for split dwarf is, in contrast,
+ /// a monolithic sequence of string offsets.
+ bool UseSegmentedStringOffsetsTable;
+
/// Separated Dwarf Variables
/// In general these will all be for bits that are left in the
/// original object file, rather than things that are meant
@@ -283,10 +316,12 @@ class DwarfDebug : public DebugHandlerBase {
AddressPool AddrPool;
- DwarfAccelTable AccelNames;
- DwarfAccelTable AccelObjC;
- DwarfAccelTable AccelNamespace;
- DwarfAccelTable AccelTypes;
+ /// Accelerator tables.
+ AccelTable<DWARF5AccelTableData> AccelDebugNames;
+ AccelTable<AppleAccelTableOffsetData> AccelNames;
+ AccelTable<AppleAccelTableOffsetData> AccelObjC;
+ AccelTable<AppleAccelTableOffsetData> AccelNamespace;
+ AccelTable<AppleAccelTableTypeData> AccelTypes;
// Identify a debugger for "tuning" the debug info.
DebuggerKind DebuggerTuning = DebuggerKind::Default;
@@ -299,9 +334,9 @@ class DwarfDebug : public DebugHandlerBase {
using InlinedVariable = DbgValueHistoryMap::InlinedVariable;
- void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var,
+ void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV,
const MDNode *Scope);
- void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var,
+ void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable IV,
const MDNode *Scope);
DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU,
@@ -310,6 +345,10 @@ class DwarfDebug : public DebugHandlerBase {
/// Construct a DIE for this abstract scope.
void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope);
+ template <typename DataT>
+ void addAccelNameImpl(AccelTable<DataT> &AppleAccel, StringRef Name,
+ const DIE &Die);
+
void finishVariableDefinitions();
void finishSubprogramDefinitions();
@@ -324,9 +363,15 @@ class DwarfDebug : public DebugHandlerBase {
/// Emit the abbreviation section.
void emitAbbreviations();
+ /// Emit the string offsets table header.
+ void emitStringOffsetsTableHeader();
+
/// Emit a specified accelerator table.
- void emitAccel(DwarfAccelTable &Accel, MCSection *Section,
- StringRef TableName);
+ template <typename AccelTableT>
+ void emitAccel(AccelTableT &Accel, MCSection *Section, StringRef TableName);
+
+ /// Emit DWARF v5 accelerator table.
+ void emitAccelDebugNames();
/// Emit visible names into a hashed accelerator table section.
void emitAccelNames();
@@ -363,6 +408,9 @@ class DwarfDebug : public DebugHandlerBase {
/// Emit address ranges into a debug ranges section.
void emitDebugRanges();
+ /// Emit range lists into a DWARF v5 debug rnglists section.
+ void emitDebugRnglists();
+
/// Emit macros into a debug macinfo section.
void emitDebugMacinfo();
void emitMacro(DIMacro &M);
@@ -375,8 +423,13 @@ class DwarfDebug : public DebugHandlerBase {
void initSkeletonUnit(const DwarfUnit &U, DIE &Die,
std::unique_ptr<DwarfCompileUnit> NewU);
- /// Construct the split debug info compile unit for the debug info
- /// section.
+ /// Construct the split debug info compile unit for the debug info section.
+ /// In DWARF v5, the skeleton unit DIE may have the following attributes:
+ /// DW_AT_addr_base, DW_AT_comp_dir, DW_AT_dwo_name, DW_AT_high_pc,
+ /// DW_AT_low_pc, DW_AT_ranges, DW_AT_stmt_list, and DW_AT_str_offsets_base.
+ /// Prior to DWARF v5 it may also have DW_AT_GNU_dwo_id. DW_AT_GNU_dwo_name
+ /// is used instead of DW_AT_dwo_name, DW_AT_GNU_addr_base instead of
+ /// DW_AT_addr_base, and DW_AT_GNU_ranges_base instead of DW_AT_rnglists_base.
DwarfCompileUnit &constructSkeletonCU(const DwarfCompileUnit &CU);
/// Emit the debug info dwo section.
@@ -388,6 +441,9 @@ class DwarfDebug : public DebugHandlerBase {
/// Emit the debug line dwo section.
void emitDebugLineDWO();
+ /// Emit the dwo string offsets table header.
+ void emitStringOffsetsTableHeaderDWO();
+
/// Emit the debug str dwo section.
void emitDebugStrDWO();
@@ -422,6 +478,9 @@ class DwarfDebug : public DebugHandlerBase {
void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
DenseSet<InlinedVariable> &P);
+ /// Emit the reference to the section.
+ void emitSectionReference(const DwarfCompileUnit &CU);
+
protected:
/// Gather pre-function debug information.
void beginFunctionImpl(const MachineFunction *MF) override;
@@ -478,11 +537,30 @@ public:
/// DWARF4 format.
bool useDWARF2Bitfields() const { return UseDWARF2Bitfields; }
+ /// Returns whether to use inline strings.
+ bool useInlineStrings() const { return UseInlineStrings; }
+
+ /// Returns whether GNU pub sections should be emitted.
+ bool usePubSections() const { return UsePubSections; }
+
+ /// Returns whether ranges section should be emitted.
+ bool useRangesSection() const { return UseRangesSection; }
+
+ /// Returns whether to use sections as labels rather than temp symbols.
+ bool useSectionsAsReferences() const {
+ return UseSectionsAsReferences;
+ }
+
+ /// Returns whether .debug_loc section should be emitted.
+ bool useLocSection() const { return UseLocSection; }
+
+ /// Returns whether to generate DWARF v4 type units.
+ bool generateTypeUnits() const { return GenerateTypeUnits; }
+
// Experimental DWARF5 features.
- /// Returns whether or not to emit tables that dwarf consumers can
- /// use to accelerate lookup.
- bool useDwarfAccelTables() const { return HasDwarfAccelTables; }
+ /// Returns what kind (if any) of accelerator tables to emit.
+ AccelTableKind getAccelTableKind() const { return TheAccelTableKind; }
bool useAppleExtensionAttributes() const {
return HasAppleExtensionAttributes;
@@ -492,6 +570,16 @@ public:
/// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; }
+ /// Returns whether to generate a string offsets table with (possibly shared)
+ /// contributions from each CU and type unit. This implies the use of
+ /// DW_FORM_strx* indirect references with DWARF v5 and beyond. Note that
+ /// DW_FORM_GNU_str_index is also an indirect reference, but it is used with
+ /// a pre-DWARF v5 implementation of split DWARF sections, which uses a
+ /// monolithic string offsets table.
+ bool useSegmentedStringOffsetsTable() const {
+ return UseSegmentedStringOffsetsTable;
+ }
+
bool shareAcrossDWOCUs() const;
/// Returns the Dwarf Version.
@@ -537,6 +625,9 @@ public:
/// Find the matching DwarfCompileUnit for the given CU DIE.
DwarfCompileUnit *lookupCU(const DIE *Die) { return CUDieMap.lookup(Die); }
+ const DwarfCompileUnit *lookupCU(const DIE *Die) const {
+ return CUDieMap.lookup(Die);
+ }
/// \defgroup DebuggerTuning Predicates to tune DWARF for a given debugger.
///
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
index 80d5bd208ed8..b57ea8fc6322 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -70,7 +70,7 @@ public:
};
class LLVM_LIBRARY_VISIBILITY ARMException : public DwarfCFIExceptionBase {
- void emitTypeInfos(unsigned TTypeEncoding) override;
+ void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) override;
ARMTargetStreamer &getTargetStreamer();
public:
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 68d25fe37b43..d8d1a5e8f841 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -123,7 +123,10 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg);
unsigned RegSize = TRI.getRegSizeInBits(*RC);
// Keep track of the bits in the register we already emitted, so we
- // can avoid emitting redundant aliasing subregs.
+ // can avoid emitting redundant aliasing subregs. Because this is
+ // just doing a greedy scan of all subregisters, it is possible that
+ // this doesn't find a combination of subregisters that fully cover
+ // the register (even though one may exist).
SmallBitVector Coverage(RegSize, false);
for (MCSubRegIterator SR(MachineReg, &TRI); SR.isValid(); ++SR) {
unsigned Idx = TRI.getSubRegIndex(MachineReg, *SR);
@@ -143,7 +146,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
if (CurSubReg.test(Coverage)) {
// Emit a piece for any gap in the coverage.
if (Offset > CurPos)
- DwarfRegs.push_back({-1, Offset - CurPos, nullptr});
+ DwarfRegs.push_back({-1, Offset - CurPos, "no DWARF register encoding"});
DwarfRegs.push_back(
{Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"});
if (Offset >= MaxSize)
@@ -154,8 +157,13 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
CurPos = Offset + Size;
}
}
-
- return CurPos;
+ // Failed to find any DWARF encoding.
+ if (CurPos == 0)
+ return false;
+ // Found a partial or complete DWARF encoding.
+ if (CurPos < RegSize)
+ DwarfRegs.push_back({-1, RegSize - CurPos, "no DWARF register encoding"});
+ return true;
}
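
The return-value change above means addMachineReg now reports success for any partial or complete coverage and pads the uncovered tail with a "no DWARF register encoding" piece. Below is a simplified, self-contained sketch of that greedy scan; SubReg/Piece are invented stand-ins for the sub-register iterator and DwarfRegs entries, and the bit-vector coverage test is reduced to an offset comparison.

```cpp
// Greedy sub-register coverage sketch: sub-registers are taken only if they
// add new bits, gaps become "unknown" pieces, and the uncovered tail is
// padded at the end.
#include <algorithm>
#include <vector>

struct SubReg { unsigned Offset, Size; int DwarfReg; }; // bit offset/size
struct Piece  { int DwarfReg; unsigned SizeInBits; };   // -1 => no encoding

std::vector<Piece> coverRegister(unsigned RegSizeInBits,
                                 std::vector<SubReg> Subs) {
  // Scan sub-registers in ascending bit-offset order.
  std::sort(Subs.begin(), Subs.end(),
            [](const SubReg &A, const SubReg &B) { return A.Offset < B.Offset; });
  std::vector<Piece> Pieces;
  unsigned CurPos = 0;
  for (const SubReg &S : Subs) {
    if (S.Offset < CurPos)      // overlaps bits we already emitted
      continue;
    if (S.Offset > CurPos)      // gap with no DWARF encoding
      Pieces.push_back({-1, S.Offset - CurPos});
    Pieces.push_back({S.DwarfReg, S.Size});
    CurPos = S.Offset + S.Size;
  }
  if (CurPos == 0)              // failed to find any encoding at all
    return {};
  if (CurPos < RegSizeInBits)   // pad the uncovered tail
    Pieces.push_back({-1, RegSizeInBits - CurPos});
  return Pieces;
}
```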
void DwarfExpression::addStackValue() {
@@ -341,11 +349,22 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
case dwarf::DW_OP_plus:
case dwarf::DW_OP_minus:
case dwarf::DW_OP_mul:
+ case dwarf::DW_OP_div:
+ case dwarf::DW_OP_mod:
+ case dwarf::DW_OP_or:
+ case dwarf::DW_OP_and:
+ case dwarf::DW_OP_xor:
+ case dwarf::DW_OP_shl:
+ case dwarf::DW_OP_shr:
+ case dwarf::DW_OP_shra:
+ case dwarf::DW_OP_lit0:
+ case dwarf::DW_OP_not:
+ case dwarf::DW_OP_dup:
emitOp(Op->getOp());
break;
case dwarf::DW_OP_deref:
assert(LocationKind != Register);
- if (LocationKind != Memory && isMemoryLocation(ExprCursor))
+ if (LocationKind != Memory && ::isMemoryLocation(ExprCursor))
// Turning this into a memory location description makes the deref
// implicit.
LocationKind = Memory;
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
index ea5cbc40ba35..952b0d99a95a 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -211,6 +211,9 @@ public:
/// Emit an unsigned constant.
void addUnsignedConstant(const APInt &Value);
+ bool isMemoryLocation() const { return LocationKind == Memory; }
+ bool isUnknownLocation() const { return LocationKind == Unknown; }
+
/// Lock this down to become a memory location description.
void setMemoryLocationKind() {
assert(LocationKind == Unknown);
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 3c04c969192d..c90bd568162d 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -77,42 +77,24 @@ unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) {
void DwarfFile::emitAbbrevs(MCSection *Section) { Abbrevs.Emit(Asm, Section); }
// Emit strings into a string section.
-void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection) {
- StrPool.emit(*Asm, StrSection, OffsetSection);
+void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection,
+ bool UseRelativeOffsets) {
+ StrPool.emit(*Asm, StrSection, OffsetSection, UseRelativeOffsets);
}
bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
- SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
+ auto &ScopeVars = ScopeVariables[LS];
const DILocalVariable *DV = Var->getVariable();
- // Variables with positive arg numbers are parameters.
if (unsigned ArgNum = DV->getArg()) {
- // Keep all parameters in order at the start of the variable list to ensure
- // function types are correct (no out-of-order parameters)
- //
- // This could be improved by only doing it for optimized builds (unoptimized
- // builds have the right order to begin with), searching from the back (this
- // would catch the unoptimized case quickly), or doing a binary search
- // rather than linear search.
- auto I = Vars.begin();
- while (I != Vars.end()) {
- unsigned CurNum = (*I)->getVariable()->getArg();
- // A local (non-parameter) variable has been found, insert immediately
- // before it.
- if (CurNum == 0)
- break;
- // A later indexed parameter has been found, insert immediately before it.
- if (CurNum > ArgNum)
- break;
- if (CurNum == ArgNum) {
- (*I)->addMMIEntry(*Var);
- return false;
- }
- ++I;
+ auto Cached = ScopeVars.Args.find(ArgNum);
+ if (Cached == ScopeVars.Args.end())
+ ScopeVars.Args[ArgNum] = Var;
+ else {
+ Cached->second->addMMIEntry(*Var);
+ return false;
}
- Vars.insert(I, Var);
- return true;
- }
-
- Vars.push_back(Var);
+ } else {
+ ScopeVars.Locals.push_back(Var);
+ }
return true;
}
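
The rewrite of addScopeVariable replaces the linear insertion scan with a std::map keyed by argument number plus a plain vector of locals, so parameters stay ordered and duplicates merge automatically. A minimal sketch of that bookkeeping, with DbgVariable reduced to a toy struct:

```cpp
// Scope-variable bookkeeping sketch: parameters keyed by argument number
// (ordered, duplicates merged), locals kept in insertion order.
#include <map>
#include <string>
#include <vector>

struct Var { unsigned ArgNum; std::string Name; };

struct ScopeVars {
  std::map<unsigned, Var *> Args; // ordered by argument number
  std::vector<Var *> Locals;      // everything with ArgNum == 0
};

// Returns false if the variable duplicates a previously seen parameter.
bool addScopeVariable(ScopeVars &SV, Var *V) {
  if (unsigned ArgNum = V->ArgNum) {
    auto It = SV.Args.find(ArgNum);
    if (It == SV.Args.end()) {
      SV.Args[ArgNum] = V;
      return true;
    }
    // Same parameter seen twice: merge into the existing entry instead.
    return false;
  }
  SV.Locals.push_back(V);
  return true;
}
```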
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
index 167ca13c19c1..8dfbc4e1c434 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/DIE.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
+#include <map>
#include <memory>
#include <utility>
@@ -43,8 +44,23 @@ class DwarfFile {
DwarfStringPool StrPool;
- // Collection of dbg variables of a scope.
- DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> ScopeVariables;
+ /// DWARF v5: The symbol that designates the start of the contribution to
+ /// the string offsets table. The contribution is shared by all units.
+ MCSymbol *StringOffsetsStartSym = nullptr;
+
+ /// DWARF v5: The symbol that designates the base of the range list table.
+ /// The table is shared by all units.
+ MCSymbol *RnglistsTableBaseSym = nullptr;
+
+ /// The variables of a lexical scope.
+ struct ScopeVars {
+ /// We need to sort Args by ArgNo and check for duplicates. This could also
+ /// be implemented as a list or vector + std::lower_bound().
+ std::map<unsigned, DbgVariable *> Args;
+ SmallVector<DbgVariable *, 8> Locals;
+ };
+ /// Collection of DbgVariables of each lexical scope.
+ DenseMap<LexicalScope *, ScopeVars> ScopeVariables;
// Collection of abstract subprogram DIEs.
DenseMap<const MDNode *, DIE *> AbstractSPDies;
@@ -62,39 +78,51 @@ public:
return CUs;
}
- /// \brief Compute the size and offset of a DIE given an incoming Offset.
+ /// Compute the size and offset of a DIE given an incoming Offset.
unsigned computeSizeAndOffset(DIE &Die, unsigned Offset);
- /// \brief Compute the size and offset of all the DIEs.
+ /// Compute the size and offset of all the DIEs.
void computeSizeAndOffsets();
- /// \brief Compute the size and offset of all the DIEs in the given unit.
+ /// Compute the size and offset of all the DIEs in the given unit.
/// \returns The size of the root DIE.
unsigned computeSizeAndOffsetsForUnit(DwarfUnit *TheU);
- /// \brief Add a unit to the list of CUs.
+ /// Add a unit to the list of CUs.
void addUnit(std::unique_ptr<DwarfCompileUnit> U);
- /// \brief Emit all of the units to the section listed with the given
+ /// Emit all of the units to the section listed with the given
/// abbreviation section.
void emitUnits(bool UseOffsets);
- /// \brief Emit the given unit to its section.
+ /// Emit the given unit to its section.
void emitUnit(DwarfUnit *U, bool UseOffsets);
- /// \brief Emit a set of abbreviations to the specific section.
+ /// Emit a set of abbreviations to the specific section.
void emitAbbrevs(MCSection *);
- /// \brief Emit all of the strings to the section given.
- void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr);
+ /// Emit all of the strings to the section given. If OffsetSection is
+ /// non-null, emit a table of string offsets to it. If UseRelativeOffsets
+ /// is false, emit absolute offsets to the strings. Otherwise, emit
+ /// relocatable references to the strings if they are supported by the target.
+ void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr,
+ bool UseRelativeOffsets = false);
- /// \brief Returns the string pool.
+ /// Returns the string pool.
DwarfStringPool &getStringPool() { return StrPool; }
+ MCSymbol *getStringOffsetsStartSym() const { return StringOffsetsStartSym; }
+
+ void setStringOffsetsStartSym(MCSymbol *Sym) { StringOffsetsStartSym = Sym; }
+
+ MCSymbol *getRnglistsTableBaseSym() const { return RnglistsTableBaseSym; }
+
+ void setRnglistsTableBaseSym(MCSymbol *Sym) { RnglistsTableBaseSym = Sym; }
+
/// \returns false if the variable was merged with a previous one.
bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
- DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> &getScopeVariables() {
+ DenseMap<LexicalScope *, ScopeVars> &getScopeVariables() {
return ScopeVariables;
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
index aa5f01e88933..a61fa83cfb03 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -39,8 +39,30 @@ DwarfStringPool::EntryRef DwarfStringPool::getEntry(AsmPrinter &Asm,
return EntryRef(*I.first);
}
+void DwarfStringPool::emitStringOffsetsTableHeader(AsmPrinter &Asm,
+ MCSection *Section,
+ MCSymbol *StartSym) {
+ if (empty())
+ return;
+ Asm.OutStreamer->SwitchSection(Section);
+ unsigned EntrySize = 4;
+ // FIXME: DWARF64
+ // We are emitting the header for a contribution to the string offsets
+ // table. The header consists of an entry with the contribution's
+ // size (not including the size of the length field), the DWARF version and
+ // 2 bytes of padding.
+ Asm.emitInt32(size() * EntrySize + 4);
+ Asm.emitInt16(Asm.getDwarfVersion());
+ Asm.emitInt16(0);
+ // Define the symbol that marks the start of the contribution. It is
+ // referenced by most unit headers via DW_AT_str_offsets_base.
+ // Split units do not use the attribute.
+ if (StartSym)
+ Asm.OutStreamer->EmitLabel(StartSym);
+}
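
A quick sketch of the .debug_str_offsets contribution header emitted above (DWARF32 only, matching the FIXME): the length field covers the version, the padding, and the 4-byte offset entries, but not the length field itself. The helper and its names are assumptions for illustration.

```cpp
// Sketch of a DWARF v5 string offsets contribution header (DWARF32).
#include <cstdint>
#include <vector>

static void append32(std::vector<uint8_t> &Out, uint32_t V) {
  for (int I = 0; I < 4; ++I) Out.push_back(uint8_t(V >> (8 * I)));
}
static void append16(std::vector<uint8_t> &Out, uint16_t V) {
  Out.push_back(uint8_t(V)); Out.push_back(uint8_t(V >> 8));
}

std::vector<uint8_t> strOffsetsHeader(unsigned NumStrings, uint16_t DwarfVersion) {
  const unsigned EntrySize = 4;              // DWARF32; DWARF64 would use 8
  std::vector<uint8_t> Out;
  append32(Out, NumStrings * EntrySize + 4); // contribution length
  append16(Out, DwarfVersion);               // DWARF version
  append16(Out, 0);                          // padding
  // The DW_AT_str_offsets_base label would be defined here, right before
  // the offset entries that follow.
  return Out;
}
```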
+
void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
- MCSection *OffsetSection) {
+ MCSection *OffsetSection, bool UseRelativeOffsets) {
if (Pool.empty())
return;
@@ -74,6 +96,9 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
Asm.OutStreamer->SwitchSection(OffsetSection);
unsigned size = 4; // FIXME: DWARF64 is 8.
for (const auto &Entry : Entries)
- Asm.OutStreamer->EmitIntValue(Entry->getValue().Offset, size);
+ if (UseRelativeOffsets)
+ Asm.emitDwarfStringOffset(Entry->getValue());
+ else
+ Asm.OutStreamer->EmitIntValue(Entry->getValue().Offset, size);
}
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h
index 1cac3b7c8432..6e6988ea4ad4 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h
@@ -19,6 +19,7 @@ namespace llvm {
class AsmPrinter;
class MCSection;
+class MCSymbol;
// Collection of strings for this unit and assorted symbols.
// A String->Symbol mapping of strings used by indirect
@@ -36,11 +37,17 @@ public:
DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm, StringRef Prefix);
+ void emitStringOffsetsTableHeader(AsmPrinter &Asm, MCSection *OffsetSection,
+ MCSymbol *StartSym);
+
void emit(AsmPrinter &Asm, MCSection *StrSection,
- MCSection *OffsetSection = nullptr);
+ MCSection *OffsetSection = nullptr,
+ bool UseRelativeOffsets = false);
bool empty() const { return Pool.empty(); }
+ unsigned size() const { return Pool.size(); }
+
/// Get a reference to an entry in the string pool.
EntryRef getEntry(AsmPrinter &Asm, StringRef Str);
};
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 4ea59f504bd4..43b835b2c4aa 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -19,10 +19,10 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
@@ -30,12 +30,14 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <cassert>
#include <cstdint>
#include <string>
@@ -45,11 +47,6 @@ using namespace llvm;
#define DEBUG_TYPE "dwarfdebug"
-static cl::opt<bool>
-GenerateDwarfTypeUnits("generate-type-units", cl::Hidden,
- cl::desc("Generate DWARF4 type units."),
- cl::init(false));
-
DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,
DIELoc &DIE)
: DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU),
@@ -83,8 +80,6 @@ DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A,
MCDwarfDwoLineTable *SplitLineTable)
: DwarfUnit(dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU), CU(CU),
SplitLineTable(SplitLineTable) {
- if (SplitLineTable)
- addSectionOffset(getUnitDie(), dwarf::DW_AT_stmt_list, 0);
}
DwarfUnit::~DwarfUnit() {
@@ -185,7 +180,7 @@ bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
return false;
return (isa<DIType>(D) ||
(isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
- !GenerateDwarfTypeUnits;
+ !DD->generateTypeUnits();
}
DIE *DwarfUnit::getDIE(const DINode *D) const {
@@ -239,9 +234,28 @@ void DwarfUnit::addSInt(DIELoc &Die, Optional<dwarf::Form> Form,
void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute,
StringRef String) {
- Die.addValue(DIEValueAllocator, Attribute,
- isDwoUnit() ? dwarf::DW_FORM_GNU_str_index : dwarf::DW_FORM_strp,
- DIEString(DU->getStringPool().getEntry(*Asm, String)));
+ if (DD->useInlineStrings()) {
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_string,
+ new (DIEValueAllocator)
+ DIEInlineString(String, DIEValueAllocator));
+ return;
+ }
+ auto StringPoolEntry = DU->getStringPool().getEntry(*Asm, String);
+ dwarf::Form IxForm =
+ isDwoUnit() ? dwarf::DW_FORM_GNU_str_index : dwarf::DW_FORM_strp;
+ // For DWARF v5 and beyond, use the smallest strx? form possible.
+ if (useSegmentedStringOffsetsTable()) {
+ IxForm = dwarf::DW_FORM_strx1;
+ unsigned Index = StringPoolEntry.getIndex();
+ if (Index > 0xffffff)
+ IxForm = dwarf::DW_FORM_strx4;
+ else if (Index > 0xffff)
+ IxForm = dwarf::DW_FORM_strx3;
+ else if (Index > 0xff)
+ IxForm = dwarf::DW_FORM_strx2;
+ }
+ Die.addValue(DIEValueAllocator, Attribute, IxForm,
+ DIEString(StringPoolEntry));
}
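
The "smallest strx form" selection above depends only on the magnitude of the string pool index. A self-contained sketch of the same decision (enum names here are illustrative, not LLVM's dwarf::Form values):

```cpp
// Pick the smallest DW_FORM_strx* form that can hold a string pool index.
#include <cstdint>

enum class StrxForm { Strx1, Strx2, Strx3, Strx4 };

StrxForm smallestStrxForm(uint32_t Index) {
  if (Index > 0xffffff) return StrxForm::Strx4; // needs 4 bytes
  if (Index > 0xffff)   return StrxForm::Strx3; // needs 3 bytes
  if (Index > 0xff)     return StrxForm::Strx2; // needs 2 bytes
  return StrxForm::Strx1;                       // fits in 1 byte
}
```

For example, index 42 encodes as DW_FORM_strx1, while index 70000 needs DW_FORM_strx3.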
DIEValueList::value_iterator DwarfUnit::addLabel(DIEValueList &Die,
@@ -263,9 +277,33 @@ void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute,
addUInt(Die, Attribute, dwarf::DW_FORM_data4, Integer);
}
-unsigned DwarfTypeUnit::getOrCreateSourceID(StringRef FileName, StringRef DirName) {
- return SplitLineTable ? SplitLineTable->getFile(DirName, FileName)
- : getCU().getOrCreateSourceID(FileName, DirName);
+MD5::MD5Result *DwarfUnit::getMD5AsBytes(const DIFile *File) const {
+ assert(File);
+ if (DD->getDwarfVersion() < 5)
+ return nullptr;
+ Optional<DIFile::ChecksumInfo<StringRef>> Checksum = File->getChecksum();
+ if (!Checksum || Checksum->Kind != DIFile::CSK_MD5)
+ return nullptr;
+
+ // Convert the string checksum to an MD5Result for the streamer.
+ // The verifier validates the checksum so we assume it's okay.
+ // An MD5 checksum is 16 bytes.
+ std::string ChecksumString = fromHex(Checksum->Value);
+ void *CKMem = Asm->OutStreamer->getContext().allocate(16, 1);
+ memcpy(CKMem, ChecksumString.data(), 16);
+ return reinterpret_cast<MD5::MD5Result *>(CKMem);
+}
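
getMD5AsBytes converts the textual checksum carried on the DIFile into 16 raw bytes for the streamer; the real code uses llvm::fromHex and allocates the result in the MCContext. A standalone sketch of the same conversion:

```cpp
// Turn a 32-digit hex MD5 string into its 16 raw bytes.
#include <array>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <string>

static unsigned hexDigit(char C) {
  if (C >= '0' && C <= '9') return C - '0';
  return std::tolower(static_cast<unsigned char>(C)) - 'a' + 10;
}

std::array<uint8_t, 16> md5FromHex(const std::string &Hex) {
  assert(Hex.size() == 32 && "an MD5 checksum is 32 hex digits / 16 bytes");
  std::array<uint8_t, 16> Bytes{};
  for (size_t I = 0; I < 16; ++I)
    Bytes[I] = uint8_t(hexDigit(Hex[2 * I]) << 4 | hexDigit(Hex[2 * I + 1]));
  return Bytes;
}
```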
+
+unsigned DwarfTypeUnit::getOrCreateSourceID(const DIFile *File) {
+ if (!SplitLineTable)
+ return getCU().getOrCreateSourceID(File);
+ if (!UsedLineTable) {
+ UsedLineTable = true;
+ // This is a split type unit that needs a line table.
+ addSectionOffset(getUnitDie(), dwarf::DW_AT_stmt_list, 0);
+ }
+ return SplitLineTable->getFile(File->getDirectory(), File->getFilename(),
+ getMD5AsBytes(File), File->getSource());
}
void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) {
@@ -335,12 +373,11 @@ void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute,
Die.addValue(DIEValueAllocator, Attribute, Block->BestForm(), Block);
}
-void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, StringRef File,
- StringRef Directory) {
+void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, const DIFile *File) {
if (Line == 0)
return;
- unsigned FileID = getOrCreateSourceID(File, Directory);
+ unsigned FileID = getOrCreateSourceID(File);
assert(FileID && "Invalid file id");
addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
@@ -349,32 +386,31 @@ void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, StringRef File,
void DwarfUnit::addSourceLine(DIE &Die, const DILocalVariable *V) {
assert(V);
- addSourceLine(Die, V->getLine(), V->getScope()->getFilename(),
- V->getScope()->getDirectory());
+ addSourceLine(Die, V->getLine(), V->getFile());
}
void DwarfUnit::addSourceLine(DIE &Die, const DIGlobalVariable *G) {
assert(G);
- addSourceLine(Die, G->getLine(), G->getFilename(), G->getDirectory());
+ addSourceLine(Die, G->getLine(), G->getFile());
}
void DwarfUnit::addSourceLine(DIE &Die, const DISubprogram *SP) {
assert(SP);
- addSourceLine(Die, SP->getLine(), SP->getFilename(), SP->getDirectory());
+ addSourceLine(Die, SP->getLine(), SP->getFile());
}
void DwarfUnit::addSourceLine(DIE &Die, const DIType *Ty) {
assert(Ty);
- addSourceLine(Die, Ty->getLine(), Ty->getFilename(), Ty->getDirectory());
+ addSourceLine(Die, Ty->getLine(), Ty->getFile());
}
void DwarfUnit::addSourceLine(DIE &Die, const DIObjCProperty *Ty) {
assert(Ty);
- addSourceLine(Die, Ty->getLine(), Ty->getFilename(), Ty->getDirectory());
+ addSourceLine(Die, Ty->getLine(), Ty->getFile());
}
/* Byref variables, in Blocks, are declared by the programmer as "SomeType
@@ -727,7 +763,7 @@ DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
constructTypeDIE(TyDIE, STy);
else if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
- if (GenerateDwarfTypeUnits && !Ty->isForwardDecl())
+ if (DD->generateTypeUnits() && !Ty->isForwardDecl())
if (MDString *TypeId = CTy->getRawIdentifier()) {
DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
// Skip updating the accelerator tables since this is not the full type.
@@ -917,9 +953,24 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
case dwarf::DW_TAG_enumeration_type:
constructEnumTypeDIE(Buffer, CTy);
break;
+ case dwarf::DW_TAG_variant_part:
case dwarf::DW_TAG_structure_type:
case dwarf::DW_TAG_union_type:
case dwarf::DW_TAG_class_type: {
+ // Emit the discriminator for a variant part.
+ DIDerivedType *Discriminator = nullptr;
+ if (Tag == dwarf::DW_TAG_variant_part) {
+ Discriminator = CTy->getDiscriminator();
+ if (Discriminator) {
+ // DWARF says:
+ // If the variant part has a discriminant, the discriminant is
+ // represented by a separate debugging information entry which is
+ // a child of the variant part entry.
+ DIE &DiscMember = constructMemberDIE(Buffer, Discriminator);
+ addDIEEntry(Buffer, dwarf::DW_AT_discr, DiscMember);
+ }
+ }
+
// Add elements to structure type.
DINodeArray Elements = CTy->getElements();
for (const auto *Element : Elements) {
@@ -933,6 +984,18 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
addType(ElemDie, resolve(DDTy->getBaseType()), dwarf::DW_AT_friend);
} else if (DDTy->isStaticMember()) {
getOrCreateStaticMemberDIE(DDTy);
+ } else if (Tag == dwarf::DW_TAG_variant_part) {
+ // When emitting a variant part, wrap each member in
+ // DW_TAG_variant.
+ DIE &Variant = createAndAddDIE(dwarf::DW_TAG_variant, Buffer);
+ if (const ConstantInt *CI =
+ dyn_cast_or_null<ConstantInt>(DDTy->getDiscriminantValue())) {
+ if (isUnsignedDIType(DD, resolve(Discriminator->getBaseType())))
+ addUInt(Variant, dwarf::DW_AT_discr_value, None, CI->getZExtValue());
+ else
+ addSInt(Variant, dwarf::DW_AT_discr_value, None, CI->getSExtValue());
+ }
+ constructMemberDIE(Variant, DDTy);
} else {
constructMemberDIE(Buffer, DDTy);
}
@@ -952,6 +1015,11 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
if (unsigned PropertyAttributes = Property->getAttributes())
addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, None,
PropertyAttributes);
+ } else if (auto *Composite = dyn_cast<DICompositeType>(Element)) {
+ if (Composite->getTag() == dwarf::DW_TAG_variant_part) {
+ DIE &VariantPart = createAndAddDIE(Composite->getTag(), Buffer);
+ constructTypeDIE(VariantPart, Composite);
+ }
}
}
@@ -975,6 +1043,15 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
addTemplateParams(Buffer, CTy->getTemplateParams());
+ // Add the type's non-standard calling convention.
+ uint8_t CC = 0;
+ if (CTy->isTypePassByValue())
+ CC = dwarf::DW_CC_pass_by_value;
+ else if (CTy->isTypePassByReference())
+ CC = dwarf::DW_CC_pass_by_reference;
+ if (CC)
+ addUInt(Buffer, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1,
+ CC);
break;
}
default:
@@ -1152,9 +1229,8 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
// Look at the Decl's linkage name only if we emitted it.
if (DD->useAllLinkageNames())
DeclLinkageName = SPDecl->getLinkageName();
- unsigned DeclID =
- getOrCreateSourceID(SPDecl->getFilename(), SPDecl->getDirectory());
- unsigned DefID = getOrCreateSourceID(SP->getFilename(), SP->getDirectory());
+ unsigned DeclID = getOrCreateSourceID(SPDecl->getFile());
+ unsigned DefID = getOrCreateSourceID(SP->getFile());
if (DeclID != DefID)
addUInt(SPDie, dwarf::DW_AT_decl_file, None, DefID);
@@ -1304,14 +1380,17 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR,
// DW_AT_lower_bound and DW_AT_count attributes.
int64_t LowerBound = SR->getLowerBound();
int64_t DefaultLowerBound = getDefaultLowerBound();
- int64_t Count = SR->getCount();
+ int64_t Count = -1;
+ if (auto *CI = SR->getCount().dyn_cast<ConstantInt*>())
+ Count = CI->getSExtValue();
if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
- if (Count != -1)
- // FIXME: An unbounded array should reference the expression that defines
- // the array.
+ if (auto *CV = SR->getCount().dyn_cast<DIVariable*>()) {
+ if (auto *CountVarDIE = getDIE(CV))
+ addDIEEntry(DW_Subrange, dwarf::DW_AT_count, *CountVarDIE);
+ } else if (Count != -1)
addUInt(DW_Subrange, dwarf::DW_AT_count, None, Count);
}
@@ -1320,16 +1399,49 @@ DIE *DwarfUnit::getIndexTyDie() {
return IndexTyDie;
// Construct an integer type to use for indexes.
IndexTyDie = &createAndAddDIE(dwarf::DW_TAG_base_type, getUnitDie());
- addString(*IndexTyDie, dwarf::DW_AT_name, "sizetype");
+ StringRef Name = "__ARRAY_SIZE_TYPE__";
+ addString(*IndexTyDie, dwarf::DW_AT_name, Name);
addUInt(*IndexTyDie, dwarf::DW_AT_byte_size, None, sizeof(int64_t));
addUInt(*IndexTyDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
dwarf::DW_ATE_unsigned);
+ DD->addAccelType(Name, *IndexTyDie, /*Flags*/ 0);
return IndexTyDie;
}
+/// Returns true if the vector's size differs from the sum of sizes of elements
+/// the user specified. This can occur if the vector has been rounded up to
+/// fit memory alignment constraints.
+static bool hasVectorBeenPadded(const DICompositeType *CTy) {
+ assert(CTy && CTy->isVector() && "Composite type is not a vector");
+ const uint64_t ActualSize = CTy->getSizeInBits();
+
+ // Obtain the size of each element in the vector.
+ DIType *BaseTy = CTy->getBaseType().resolve();
+ assert(BaseTy && "Unknown vector element type.");
+ const uint64_t ElementSize = BaseTy->getSizeInBits();
+
+ // Locate the number of elements in the vector.
+ const DINodeArray Elements = CTy->getElements();
+ assert(Elements.size() == 1 &&
+ Elements[0]->getTag() == dwarf::DW_TAG_subrange_type &&
+ "Invalid vector element array, expected one element of type subrange");
+ const auto Subrange = cast<DISubrange>(Elements[0]);
+ const auto CI = Subrange->getCount().get<ConstantInt *>();
+ const int32_t NumVecElements = CI->getSExtValue();
+
+ // Ensure we found the element count and that the actual size is wide
+ // enough to contain the requested size.
+ assert(ActualSize >= (NumVecElements * ElementSize) && "Invalid vector size");
+ return ActualSize != (NumVecElements * ElementSize);
+}
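
Stripped of the metadata plumbing, the padding check above is just a size comparison. A worked sketch: a <3 x float> vector carries 3 * 32 = 96 bits of elements but is typically allocated 128 bits, so DW_AT_byte_size must be emitted explicitly.

```cpp
// Padding check for vector types, reduced to plain integers.
#include <cassert>
#include <cstdint>

bool hasVectorBeenPadded(uint64_t ActualSizeInBits, uint64_t ElementSizeInBits,
                         int64_t NumElements) {
  assert(ActualSizeInBits >= uint64_t(NumElements) * ElementSizeInBits &&
         "vector cannot be smaller than its elements");
  return ActualSizeInBits != uint64_t(NumElements) * ElementSizeInBits;
}

// hasVectorBeenPadded(128, 32, 3) -> true  (padded <3 x float>)
// hasVectorBeenPadded(128, 32, 4) -> false (exact <4 x float>)
```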
+
void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
- if (CTy->isVector())
+ if (CTy->isVector()) {
addFlag(Buffer, dwarf::DW_AT_GNU_vector);
+ if (hasVectorBeenPadded(CTy))
+ addUInt(Buffer, dwarf::DW_AT_byte_size, None,
+ CTy->getSizeInBits() / CHAR_BIT);
+ }
// Emit the element type.
addType(Buffer, resolve(CTy->getBaseType()));
@@ -1350,6 +1462,15 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
}
void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
+ const DIType *DTy = resolve(CTy->getBaseType());
+ bool IsUnsigned = DTy && isUnsignedDIType(DD, DTy);
+ if (DTy) {
+ if (DD->getDwarfVersion() >= 3)
+ addType(Buffer, DTy);
+ if (DD->getDwarfVersion() >= 4 && (CTy->getFlags() & DINode::FlagFixedEnum))
+ addFlag(Buffer, dwarf::DW_AT_enum_class);
+ }
+
DINodeArray Elements = CTy->getElements();
// Add enumerators to enumeration type.
@@ -1359,16 +1480,10 @@ void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
DIE &Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
StringRef Name = Enum->getName();
addString(Enumerator, dwarf::DW_AT_name, Name);
- int64_t Value = Enum->getValue();
- addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
- Value);
+ auto Value = static_cast<uint64_t>(Enum->getValue());
+ addConstantValue(Enumerator, IsUnsigned, Value);
}
}
- const DIType *DTy = resolve(CTy->getBaseType());
- if (DTy) {
- addType(Buffer, DTy);
- addFlag(Buffer, dwarf::DW_AT_enum_class);
- }
}
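
The enumerator change above stores the metadata value as int64_t and lets the signedness of the underlying type decide whether it is emitted as unsigned or signed constant data. A toy sketch of that choice (printf output stands in for the DIE form selection):

```cpp
// Emit an enumerator value as udata or sdata based on the underlying type.
#include <cstdint>
#include <cstdio>

void emitEnumerator(int64_t Value, bool UnderlyingIsUnsigned) {
  if (UnderlyingIsUnsigned)
    std::printf("DW_AT_const_value (udata) = %llu\n",
                static_cast<unsigned long long>(static_cast<uint64_t>(Value)));
  else
    std::printf("DW_AT_const_value (sdata) = %lld\n",
                static_cast<long long>(Value));
}
```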
void DwarfUnit::constructContainingTypeDIEs() {
@@ -1385,7 +1500,7 @@ void DwarfUnit::constructContainingTypeDIEs() {
}
}
-void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
+DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
DIE &MemberDie = createAndAddDIE(DT->getTag(), Buffer);
StringRef Name = DT->getName();
if (!Name.empty())
@@ -1490,6 +1605,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
if (DT->isArtificial())
addFlag(MemberDie, dwarf::DW_AT_artificial);
+
+ return MemberDie;
}
DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
@@ -1542,18 +1659,18 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
// Emit size of content not including length itself
Asm->OutStreamer->AddComment("Length of Unit");
- Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize());
+ Asm->emitInt32(getHeaderSize() + getUnitDie().getSize());
Asm->OutStreamer->AddComment("DWARF version number");
unsigned Version = DD->getDwarfVersion();
- Asm->EmitInt16(Version);
+ Asm->emitInt16(Version);
// DWARF v5 reorders the address size and adds a unit type.
if (Version >= 5) {
Asm->OutStreamer->AddComment("DWARF Unit Type");
- Asm->EmitInt8(UT);
+ Asm->emitInt8(UT);
Asm->OutStreamer->AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->MAI->getCodePointerSize());
+ Asm->emitInt8(Asm->MAI->getCodePointerSize());
}
// We share one abbreviations table across all units so it's always at the
@@ -1562,14 +1679,14 @@ void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
if (UseOffsets)
- Asm->EmitInt32(0);
+ Asm->emitInt32(0);
else
Asm->emitDwarfSymbolReference(
TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);
if (Version <= 4) {
Asm->OutStreamer->AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->MAI->getCodePointerSize());
+ Asm->emitInt8(Asm->MAI->getCodePointerSize());
}
}
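
The version checks in emitCommonHeader encode the field reordering DWARF v5 introduced: v5 adds a unit_type byte and moves address_size before the abbreviation offset, while v2-v4 keep address_size at the end. A hedged sketch of both layouts (values and helper names are illustrative only):

```cpp
// Unit header layout sketch: DWARF v2-v4 vs. v5 field ordering.
#include <cstdint>
#include <vector>

static void append32(std::vector<uint8_t> &Out, uint32_t V) {
  for (int I = 0; I < 4; ++I) Out.push_back(uint8_t(V >> (8 * I)));
}
static void append16(std::vector<uint8_t> &Out, uint16_t V) {
  Out.push_back(uint8_t(V)); Out.push_back(uint8_t(V >> 8));
}

std::vector<uint8_t> unitHeader(uint16_t Version, uint32_t UnitLength,
                                uint8_t UnitType, uint8_t AddrSize,
                                uint32_t AbbrevOffset) {
  std::vector<uint8_t> Out;
  append32(Out, UnitLength);   // length of the unit, excluding this field
  append16(Out, Version);      // DWARF version number
  if (Version >= 5) {
    Out.push_back(UnitType);   // e.g. DW_UT_compile
    Out.push_back(AddrSize);   // address size moved up in v5
  }
  append32(Out, AbbrevOffset); // offset into .debug_abbrev
  if (Version <= 4)
    Out.push_back(AddrSize);   // address size comes last before v5
  return Out;
}
```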
@@ -1628,3 +1745,19 @@ const MCSymbol *DwarfUnit::getCrossSectionRelativeBaseAddress() const {
return nullptr;
return getSection()->getBeginSymbol();
}
+
+void DwarfUnit::addStringOffsetsStart() {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ addSectionLabel(getUnitDie(), dwarf::DW_AT_str_offsets_base,
+ DU->getStringOffsetsStartSym(),
+ TLOF.getDwarfStrOffSection()->getBeginSymbol());
+}
+
+void DwarfUnit::addRnglistsBase() {
+ assert(DD->getDwarfVersion() >= 5 &&
+ "DW_AT_rnglists_base requires DWARF version 5 or later");
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ addSectionLabel(getUnitDie(), dwarf::DW_AT_rnglists_base,
+ DU->getRnglistsTableBaseSym(),
+ TLOF.getDwarfRnglistsSection()->getBeginSymbol());
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 4cc01b3298d4..69696f626536 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -98,7 +98,7 @@ protected:
/// corresponds to the MDNode mapped with the subprogram DIE.
DenseMap<DIE *, const DINode *> ContainingTypeMap;
- DwarfUnit(dwarf::Tag, const DICompileUnit *CU, AsmPrinter *A, DwarfDebug *DW,
+ DwarfUnit(dwarf::Tag, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW,
DwarfFile *DWU);
bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie);
@@ -112,6 +112,8 @@ public:
uint16_t getLanguage() const { return CUNode->getSourceLanguage(); }
const DICompileUnit *getCUNode() const { return CUNode; }
+ uint16_t getDwarfVersion() const { return DD->getDwarfVersion(); }
+
/// Return true if this compile unit has something to write out.
bool hasContent() const { return getUnitDie().hasChildren(); }
@@ -185,7 +187,7 @@ public:
/// Add a dwarf op address data and value using the form given and an
/// op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
- void addOpAddress(DIELoc &Die, const MCSymbol *Label);
+ void addOpAddress(DIELoc &Die, const MCSymbol *Sym);
/// Add a label delta attribute data and value.
void addLabelDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi,
@@ -201,14 +203,13 @@ public:
void addDIETypeSignature(DIE &Die, uint64_t Signature);
/// Add block data.
- void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block);
+ void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc);
/// Add block data.
void addBlock(DIE &Die, dwarf::Attribute Attribute, DIEBlock *Block);
/// Add location information to specified debug information entry.
- void addSourceLine(DIE &Die, unsigned Line, StringRef File,
- StringRef Directory);
+ void addSourceLine(DIE &Die, unsigned Line, const DIFile *File);
void addSourceLine(DIE &Die, const DILocalVariable *V);
void addSourceLine(DIE &Die, const DIGlobalVariable *G);
void addSourceLine(DIE &Die, const DISubprogram *SP);
@@ -259,7 +260,7 @@ public:
bool SkipSPAttributes = false);
/// Find existing DIE or create new DIE for the given type.
- DIE *getOrCreateTypeDIE(const MDNode *N);
+ DIE *getOrCreateTypeDIE(const MDNode *TyNode);
/// Get context owner's DIE.
DIE *getOrCreateContextDIE(const DIScope *Context);
@@ -274,6 +275,10 @@ public:
/// call insertDIE if MD is not null.
DIE &createAndAddDIE(unsigned Tag, DIE &Parent, const DINode *N = nullptr);
+ bool useSegmentedStringOffsetsTable() const {
+ return DD->useSegmentedStringOffsetsTable();
+ }
+
/// Compute the size of a header for this unit, not including the initial
/// length field.
virtual unsigned getHeaderSize() const {
@@ -287,6 +292,12 @@ public:
/// Emit the header for this unit, not including the initial length field.
virtual void emitHeader(bool UseOffsets) = 0;
+ /// Add the DW_AT_str_offsets_base attribute to the unit DIE.
+ void addStringOffsetsStart();
+
+ /// Add the DW_AT_rnglists_base attribute to the unit DIE.
+ void addRnglistsBase();
+
virtual DwarfCompileUnit &getCU() = 0;
void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy);
@@ -300,15 +311,19 @@ public:
const MCSymbol *Label,
const MCSymbol *Sec);
+ /// If the \p File has an MD5 checksum, return it as an MD5Result
+ /// allocated in the MCContext.
+ MD5::MD5Result *getMD5AsBytes(const DIFile *File) const;
+
protected:
~DwarfUnit();
/// Create new static data member DIE.
DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT);
- /// Look up the source ID with the given directory and source file names. If
- /// none currently exists, create a new ID and insert it in the line table.
- virtual unsigned getOrCreateSourceID(StringRef File, StringRef Directory) = 0;
+ /// Look up the source ID for the given file. If none currently exists,
+ /// create a new ID and insert it in the line table.
+ virtual unsigned getOrCreateSourceID(const DIFile *File) = 0;
/// Look in the DwarfDebug map for the MDNode that corresponds to the
/// reference.
@@ -327,11 +342,11 @@ protected:
private:
void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);
void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy);
- void constructTypeDIE(DIE &Buffer, const DISubroutineType *DTy);
+ void constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy);
void constructSubrangeDIE(DIE &Buffer, const DISubrange *SR, DIE *IndexTy);
void constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy);
void constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy);
- void constructMemberDIE(DIE &Buffer, const DIDerivedType *DT);
+ DIE &constructMemberDIE(DIE &Buffer, const DIDerivedType *DT);
void constructTemplateTypeParameterDIE(DIE &Buffer,
const DITemplateTypeParameter *TP);
void constructTemplateValueParameterDIE(DIE &Buffer,
@@ -357,8 +372,9 @@ class DwarfTypeUnit final : public DwarfUnit {
const DIE *Ty;
DwarfCompileUnit &CU;
MCDwarfDwoLineTable *SplitLineTable;
+ bool UsedLineTable = false;
- unsigned getOrCreateSourceID(StringRef File, StringRef Directory) override;
+ unsigned getOrCreateSourceID(const DIFile *File) override;
bool isDwoUnit() const override;
public:
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 3cdab57bca70..65de9d7e65a4 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -30,6 +29,7 @@
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -58,10 +58,10 @@ unsigned EHStreamer::sharedTypeIDs(const LandingPadInfo *L,
/// Compute the actions table and gather the first action index for each landing
/// pad site.
-unsigned EHStreamer::
-computeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
- SmallVectorImpl<ActionEntry> &Actions,
- SmallVectorImpl<unsigned> &FirstActions) {
+void EHStreamer::computeActionsTable(
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ SmallVectorImpl<ActionEntry> &Actions,
+ SmallVectorImpl<unsigned> &FirstActions) {
// The action table follows the call-site table in the LSDA. The individual
// records are of two types:
//
@@ -149,7 +149,7 @@ computeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
} // else identical - re-use previous FirstAction
- // Information used when created the call-site table. The action record
+ // Information used when creating the call-site table. The action record
// field of the call site record is the offset of the first associated
// action record, relative to the start of the actions table. This value is
// biased by 1 (1 indicating the start of the actions table), and 0
@@ -161,8 +161,6 @@ computeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
PrevLPI = LPI;
}
-
- return SizeActions;
}
/// Return `true' if this is a call to a function marked `nounwind'. Return
@@ -361,55 +359,33 @@ void EHStreamer::emitExceptionTable() {
LandingPads.push_back(&PadInfos[i]);
// Order landing pads lexicographically by type id.
- std::sort(LandingPads.begin(), LandingPads.end(),
- [](const LandingPadInfo *L,
- const LandingPadInfo *R) { return L->TypeIds < R->TypeIds; });
+ llvm::sort(LandingPads.begin(), LandingPads.end(),
+ [](const LandingPadInfo *L,
+ const LandingPadInfo *R) { return L->TypeIds < R->TypeIds; });
// Compute the actions table and gather the first action index for each
// landing pad site.
SmallVector<ActionEntry, 32> Actions;
SmallVector<unsigned, 64> FirstActions;
- unsigned SizeActions =
- computeActionsTable(LandingPads, Actions, FirstActions);
+ computeActionsTable(LandingPads, Actions, FirstActions);
// Compute the call-site table.
SmallVector<CallSiteEntry, 64> CallSites;
computeCallSiteTable(CallSites, LandingPads, FirstActions);
- // Final tallies.
-
- // Call sites.
bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
- bool HaveTTData = IsSJLJ ? (!TypeInfos.empty() || !FilterIds.empty()) : true;
-
- unsigned CallSiteTableLength;
- if (IsSJLJ)
- CallSiteTableLength = 0;
- else {
- unsigned SiteStartSize = 4; // dwarf::DW_EH_PE_udata4
- unsigned SiteLengthSize = 4; // dwarf::DW_EH_PE_udata4
- unsigned LandingPadSize = 4; // dwarf::DW_EH_PE_udata4
- CallSiteTableLength =
- CallSites.size() * (SiteStartSize + SiteLengthSize + LandingPadSize);
- }
-
- for (unsigned i = 0, e = CallSites.size(); i < e; ++i) {
- CallSiteTableLength += getULEB128Size(CallSites[i].Action);
- if (IsSJLJ)
- CallSiteTableLength += getULEB128Size(i);
- }
+ unsigned CallSiteEncoding =
+ IsSJLJ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_uleb128;
+ bool HaveTTData = !TypeInfos.empty() || !FilterIds.empty();
// Type infos.
MCSection *LSDASection = Asm->getObjFileLowering().getLSDASection();
unsigned TTypeEncoding;
- unsigned TypeFormatSize;
if (!HaveTTData) {
- // For SjLj exceptions, if there is no TypeInfo, then we just explicitly say
- // that we're omitting that bit.
+ // If there is no TypeInfo, then we just explicitly say that we're omitting
+ // that bit.
TTypeEncoding = dwarf::DW_EH_PE_omit;
- // dwarf::DW_EH_PE_absptr
- TypeFormatSize = Asm->getDataLayout().getPointerSize();
} else {
// Okay, we have actual filters or typeinfos to emit. As such, we need to
// pick a type encoding for them. We're about to emit a list of pointers to
@@ -439,7 +415,6 @@ void EHStreamer::emitExceptionTable() {
// in target-independent code.
//
TTypeEncoding = Asm->getObjFileLowering().getTTypeEncoding();
- TypeFormatSize = Asm->GetSizeOfEncodedValue(TTypeEncoding);
}
// Begin the exception table.
@@ -460,64 +435,35 @@ void EHStreamer::emitExceptionTable() {
Asm->EmitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart");
Asm->EmitEncodingByte(TTypeEncoding, "@TType");
- // The type infos need to be aligned. GCC does this by inserting padding just
- // before the type infos. However, this changes the size of the exception
- // table, so you need to take this into account when you output the exception
- // table size. However, the size is output using a variable length encoding.
- // So by increasing the size by inserting padding, you may increase the number
- // of bytes used for writing the size. If it increases, say by one byte, then
- // you now need to output one less byte of padding to get the type infos
- // aligned. However this decreases the size of the exception table. This
- // changes the value you have to output for the exception table size. Due to
- // the variable length encoding, the number of bytes used for writing the
- // length may decrease. If so, you then have to increase the amount of
- // padding. And so on. If you look carefully at the GCC code you will see that
- // it indeed does this in a loop, going on and on until the values stabilize.
- // We chose another solution: don't output padding inside the table like GCC
- // does, instead output it before the table.
- unsigned SizeTypes = TypeInfos.size() * TypeFormatSize;
- unsigned CallSiteTableLengthSize = getULEB128Size(CallSiteTableLength);
- unsigned TTypeBaseOffset =
- sizeof(int8_t) + // Call site format
- CallSiteTableLengthSize + // Call site table length size
- CallSiteTableLength + // Call site table length
- SizeActions + // Actions size
- SizeTypes;
- unsigned TTypeBaseOffsetSize = getULEB128Size(TTypeBaseOffset);
- unsigned TotalSize =
- sizeof(int8_t) + // LPStart format
- sizeof(int8_t) + // TType format
- (HaveTTData ? TTypeBaseOffsetSize : 0) + // TType base offset size
- TTypeBaseOffset; // TType base offset
- unsigned PadBytes = (4 - TotalSize) & 3;
-
+ MCSymbol *TTBaseLabel = nullptr;
if (HaveTTData) {
- // Account for any extra padding that will be added to the call site table
- // length.
- Asm->EmitPaddedULEB128(TTypeBaseOffset, TTypeBaseOffsetSize + PadBytes,
- "@TType base offset");
- PadBytes = 0;
+ // N.B.: There is a dependency loop between the size of the TTBase uleb128
+ // here and the amount of padding before the aligned type table. The
+ // assembler must sometimes pad this uleb128 or insert extra padding before
+ // the type table. See PR35809 or GNU as bug 4029.
+ MCSymbol *TTBaseRefLabel = Asm->createTempSymbol("ttbaseref");
+ TTBaseLabel = Asm->createTempSymbol("ttbase");
+ Asm->EmitLabelDifferenceAsULEB128(TTBaseLabel, TTBaseRefLabel);
+ Asm->OutStreamer->EmitLabel(TTBaseRefLabel);
}
bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
+ // Emit the landing pad call site table.
+ MCSymbol *CstBeginLabel = Asm->createTempSymbol("cst_begin");
+ MCSymbol *CstEndLabel = Asm->createTempSymbol("cst_end");
+ Asm->EmitEncodingByte(CallSiteEncoding, "Call site");
+ Asm->EmitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
+ Asm->OutStreamer->EmitLabel(CstBeginLabel);
+
// SjLj Exception handling
if (IsSJLJ) {
- Asm->EmitEncodingByte(dwarf::DW_EH_PE_udata4, "Call site");
-
- // Add extra padding if it wasn't added to the TType base offset.
- Asm->EmitPaddedULEB128(CallSiteTableLength,
- CallSiteTableLengthSize + PadBytes,
- "Call site table length");
-
- // Emit the landing pad site information.
unsigned idx = 0;
for (SmallVectorImpl<CallSiteEntry>::const_iterator
I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
const CallSiteEntry &S = *I;
- // Offset of the landing pad, counted in 16-byte bundles relative to the
- // @LPStart address.
+ // Index of the call site entry.
if (VerboseAsm) {
Asm->OutStreamer->AddComment(">> Call Site " + Twine(idx) + " <<");
Asm->OutStreamer->AddComment(" On exception at call site "+Twine(idx));
@@ -557,14 +503,6 @@ void EHStreamer::emitExceptionTable() {
// A missing entry in the call-site table indicates that a call is not
// supposed to throw.
- // Emit the landing pad call site table.
- Asm->EmitEncodingByte(dwarf::DW_EH_PE_udata4, "Call site");
-
- // Add extra padding if it wasn't added to the TType base offset.
- Asm->EmitPaddedULEB128(CallSiteTableLength,
- CallSiteTableLengthSize + PadBytes,
- "Call site table length");
-
unsigned Entry = 0;
for (SmallVectorImpl<CallSiteEntry>::const_iterator
I = CallSites.begin(), E = CallSites.end(); I != E; ++I) {
@@ -579,29 +517,27 @@ void EHStreamer::emitExceptionTable() {
if (!EndLabel)
EndLabel = Asm->getFunctionEnd();
- // Offset of the call site relative to the previous call site, counted in
- // number of 16-byte bundles. The first call site is counted relative to
- // the start of the procedure fragment.
+ // Offset of the call site relative to the start of the procedure.
if (VerboseAsm)
Asm->OutStreamer->AddComment(">> Call Site " + Twine(++Entry) + " <<");
- Asm->EmitLabelDifference(BeginLabel, EHFuncBeginSym, 4);
+ Asm->EmitLabelDifferenceAsULEB128(BeginLabel, EHFuncBeginSym);
if (VerboseAsm)
Asm->OutStreamer->AddComment(Twine(" Call between ") +
BeginLabel->getName() + " and " +
EndLabel->getName());
- Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
+ Asm->EmitLabelDifferenceAsULEB128(EndLabel, BeginLabel);
- // Offset of the landing pad, counted in 16-byte bundles relative to the
- // @LPStart address.
+ // Offset of the landing pad relative to the start of the procedure.
if (!S.LPad) {
if (VerboseAsm)
Asm->OutStreamer->AddComment(" has no landing pad");
- Asm->OutStreamer->EmitIntValue(0, 4/*size*/);
+ Asm->EmitULEB128(0);
} else {
if (VerboseAsm)
Asm->OutStreamer->AddComment(Twine(" jumps to ") +
S.LPad->LandingPadLabel->getName());
- Asm->EmitLabelDifference(S.LPad->LandingPadLabel, EHFuncBeginSym, 4);
+ Asm->EmitLabelDifferenceAsULEB128(S.LPad->LandingPadLabel,
+ EHFuncBeginSym);
}
// Offset of the first associated action record, relative to the start of
@@ -617,6 +553,7 @@ void EHStreamer::emitExceptionTable() {
Asm->EmitULEB128(S.Action);
}
}
+ Asm->OutStreamer->EmitLabel(CstEndLabel);
// Emit the Action Table.
int Entry = 0;
@@ -660,12 +597,15 @@ void EHStreamer::emitExceptionTable() {
Asm->EmitSLEB128(Action.NextAction);
}
- emitTypeInfos(TTypeEncoding);
+ if (HaveTTData) {
+ Asm->EmitAlignment(2);
+ emitTypeInfos(TTypeEncoding, TTBaseLabel);
+ }
Asm->EmitAlignment(2);
}
-void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) {
+void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
const MachineFunction *MF = Asm->MF;
const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
const std::vector<unsigned> &FilterIds = MF->getFilterIds();
@@ -687,6 +627,8 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) {
Asm->EmitTTypeReference(GV, TTypeEncoding);
}
+ Asm->OutStreamer->EmitLabel(TTBaseLabel);
+
// Emit the Exception Specifications.
if (VerboseAsm && !FilterIds.empty()) {
Asm->OutStreamer->AddComment(">> Filter TypeInfos <<");
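The long comment removed above describes the problem the old code solved by hand: padding inserted to align the type infos changes the table size, that size is written as a ULEB128, and the width of the ULEB128 in turn changes how much padding is needed, so GCC iterates until the values stabilize. A minimal standalone C++ sketch of that feedback loop follows (not LLVM code; ulebSize and stabilizePadding are invented names for illustration). The patched code sidesteps the issue entirely by emitting label differences as ULEB128s and letting the assembler resolve them.

    #include <cstdint>

    // Bytes needed to encode V as a ULEB128.
    static unsigned ulebSize(uint64_t V) {
      unsigned N = 0;
      do { ++N; V >>= 7; } while (V);
      return N;
    }

    // Padding needed so a table of 'Body' bytes, prefixed by a ULEB128 length
    // field, ends on a 4-byte boundary. The length field's own width depends
    // on the padded size, so iterate until the value stops changing.
    static unsigned stabilizePadding(unsigned Body) {
      unsigned Pad = 0;
      for (int I = 0; I < 8; ++I) {              // settles after a step or two
        unsigned Total = ulebSize(Body + Pad) + Body + Pad;
        unsigned Need = (4 - Total % 4) % 4;
        if (Need == Pad)
          break;                                 // fixed point reached
        Pad = Need;
      }
      return Pad;
    }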
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h
index 7962b761d8de..b89421a1e067 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -72,9 +72,9 @@ protected:
/// Compute the actions table and gather the first action index for each
/// landing pad site.
- unsigned computeActionsTable(const SmallVectorImpl<const LandingPadInfo*>&LPs,
- SmallVectorImpl<ActionEntry> &Actions,
- SmallVectorImpl<unsigned> &FirstActions);
+ void computeActionsTable(const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ SmallVectorImpl<ActionEntry> &Actions,
+ SmallVectorImpl<unsigned> &FirstActions);
void computePadMap(const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
RangeMapType &PadMap);
@@ -86,7 +86,7 @@ protected:
/// no entry and must not be contained in the try-range of any entry - they
/// form gaps in the table. Entries must be ordered by try-range address.
void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
- const SmallVectorImpl<const LandingPadInfo *> &LPs,
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
const SmallVectorImpl<unsigned> &FirstActions);
/// Emit landing pads and actions.
@@ -110,9 +110,9 @@ protected:
/// catches in the function. This tables is reversed indexed base 1.
void emitExceptionTable();
- virtual void emitTypeInfos(unsigned TTypeEncoding);
+ virtual void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel);
- // Helpers for for identifying what kind of clause an EH typeid or selector
+ // Helpers for identifying what kind of clause an EH typeid or selector
// corresponds to. Negative selectors are for filter clauses, the zero
// selector is for cleanups, and positive selectors are for catch clauses.
static bool isFilterEHSelector(int Selector) { return Selector < 0; }
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index e459c02c9a6e..49cc376fcc98 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -19,7 +19,6 @@
#include "llvm/CodeGen/GCMetadataPrinter.h"
#include "llvm/CodeGen/GCStrategy.h"
#include "llvm/CodeGen/GCs.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
@@ -27,6 +26,7 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
@@ -77,7 +77,7 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
// Emit PointCount.
OS.AddComment("safe point count");
- AP.EmitInt16(MD.size());
+ AP.emitInt16(MD.size());
// And each safe point...
for (GCFunctionInfo::iterator PI = MD.begin(), PE = MD.end(); PI != PE;
@@ -94,7 +94,7 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
// Emit the stack frame size.
OS.AddComment("stack frame size (in words)");
- AP.EmitInt16(MD.getFrameSize() / IntPtrSize);
+ AP.emitInt16(MD.getFrameSize() / IntPtrSize);
// Emit stack arity, i.e. the number of stacked arguments.
unsigned RegisteredArgs = IntPtrSize == 4 ? 5 : 6;
@@ -102,11 +102,11 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
? MD.getFunction().arg_size() - RegisteredArgs
: 0;
OS.AddComment("stack arity");
- AP.EmitInt16(StackArity);
+ AP.emitInt16(StackArity);
// Emit the number of live roots in the function.
OS.AddComment("live root count");
- AP.EmitInt16(MD.live_size(PI));
+ AP.emitInt16(MD.live_size(PI));
// And for each live root...
for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
@@ -114,7 +114,7 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
LI != LE; ++LI) {
// Emit live root's offset within the stack frame.
OS.AddComment("stack index (offset / wordsize)");
- AP.EmitInt16(LI->StackOffset / IntPtrSize);
+ AP.emitInt16(LI->StackOffset / IntPtrSize);
}
}
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index e0cc241dd23f..59a57ed30d10 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -18,7 +18,6 @@
#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
#include "llvm/CodeGen/GCs.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Mangler.h"
@@ -27,6 +26,7 @@
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <cctype>
#include <cstddef>
#include <cstdint>
@@ -129,7 +129,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
// Very rude!
report_fatal_error(" Too much descriptor for ocaml GC");
}
- AP.EmitInt16(NumDescriptors);
+ AP.emitInt16(NumDescriptors);
AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
@@ -166,8 +166,8 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
}
AP.OutStreamer->EmitSymbolValue(J->Label, IntPtrSize);
- AP.EmitInt16(FrameSize);
- AP.EmitInt16(LiveCount);
+ AP.emitInt16(FrameSize);
+ AP.emitInt16(LiveCount);
for (GCFunctionInfo::live_iterator K = FI.live_begin(J),
KE = FI.live_end(J);
@@ -178,7 +178,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
"GC root stack offset is outside of fixed stack frame and out "
"of range for ocaml GC!");
}
- AP.EmitInt16(K->StackOffset);
+ AP.emitInt16(K->StackOffset);
}
AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp
new file mode 100644
index 000000000000..18d37caf57ee
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp
@@ -0,0 +1,45 @@
+//===-- CodeGen/AsmPrinter/WinCFGuard.cpp - Control Flow Guard Impl ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for emitting the Control Flow Guard function ID table into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WinCFGuard.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCStreamer.h"
+
+#include <vector>
+
+using namespace llvm;
+
+WinCFGuard::WinCFGuard(AsmPrinter *A) : AsmPrinterHandler(), Asm(A) {}
+
+WinCFGuard::~WinCFGuard() {}
+
+void WinCFGuard::endModule() {
+ const Module *M = Asm->MMI->getModule();
+ std::vector<const Function *> Functions;
+ for (const Function &F : *M)
+ if (F.hasAddressTaken())
+ Functions.push_back(&F);
+ if (Functions.empty())
+ return;
+ auto &OS = *Asm->OutStreamer;
+ OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGFIDsSection());
+ for (const Function *F : Functions)
+ OS.EmitCOFFSymbolIndex(Asm->getSymbol(F));
+}
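endModule above records every function in the module whose address is taken and emits one COFF symbol index per such function into the Guard FIDs section. A rough illustration of the kind of source that produces an entry, assuming plain C++ compiled with Control Flow Guard enabled and nothing optimizing the address away (function names are invented):

    // Only 'callback' has its address taken, so only it would be listed in the
    // Control Flow Guard function ID table; 'helper' is only called directly.
    void callback() {}
    void helper() {}
    void (*FnPtr)() = &callback;   // address escapes -> Function::hasAddressTaken()

    int main() {
      helper();   // direct call, needs no table entry
      FnPtr();    // indirect call, checked against the FID table at run time
      return 0;
    }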
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h
new file mode 100644
index 000000000000..124e8f04bfad
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h
@@ -0,0 +1,54 @@
+//===-- WinCFGuard.h - Windows Control Flow Guard Handling ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for emitting the Windows Control Flow Guard function ID table into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H
+
+#include "AsmPrinterHandler.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class LLVM_LIBRARY_VISIBILITY WinCFGuard : public AsmPrinterHandler {
+ /// Target of directive emission.
+ AsmPrinter *Asm;
+
+public:
+ WinCFGuard(AsmPrinter *A);
+ ~WinCFGuard() override;
+
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+
+ /// Emit the Control Flow Guard function ID table
+ void endModule() override;
+
+ /// Gather pre-function debug information.
+ /// Every beginFunction(MF) call should be followed by an endFunction(MF)
+ /// call.
+ void beginFunction(const MachineFunction *MF) override {}
+
+ /// Gather post-function debug information.
+ /// Please note that some AsmPrinter implementations may not call
+ /// beginFunction at all.
+ void endFunction(const MachineFunction *MF) override {}
+
+ /// Process beginning of an instruction.
+ void beginInstruction(const MachineInstr *MI) override {}
+
+ /// Process end of an instruction.
+ void endInstruction() override {}
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index a6a8e84a949f..eff73a58d8d2 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -21,7 +21,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -35,6 +34,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h
index 371061c2c2ec..eed3c4453ffc 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h
@@ -100,7 +100,7 @@ public:
/// Gather and emit post-function exception information.
void endFunction(const MachineFunction *) override;
- /// \brief Emit target-specific EH funclet machinery.
+ /// Emit target-specific EH funclet machinery.
void beginFunclet(const MachineBasicBlock &MBB, MCSymbol *Sym) override;
void endFunclet() override;
};
diff --git a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 7042bc997223..f2615edaece2 100644
--- a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -98,8 +98,8 @@ namespace {
CreateCmpXchgInstFun CreateCmpXchg);
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
- bool isIdempotentRMW(AtomicRMWInst *AI);
- bool simplifyIdempotentRMW(AtomicRMWInst *AI);
+ bool isIdempotentRMW(AtomicRMWInst *RMWI);
+ bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);
bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
Value *PointerOperand, Value *ValueOperand,
@@ -379,8 +379,8 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
NewLI->setAlignment(LI->getAlignment());
NewLI->setVolatile(LI->isVolatile());
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
- DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
-
+ LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
+
Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
LI->replaceAllUsesWith(NewVal);
LI->eraseFromParent();
@@ -462,7 +462,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
NewSI->setAlignment(SI->getAlignment());
NewSI->setVolatile(SI->isVolatile());
NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
- DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
+ LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
SI->eraseFromParent();
return NewSI;
}
@@ -943,7 +943,7 @@ AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *
CI->getSyncScopeID());
NewCI->setVolatile(CI->isVolatile());
NewCI->setWeak(CI->isWeak());
- DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
+ LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
Value *Succ = Builder.CreateExtractValue(NewCI, 1);
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
index 7f358a679366..c7a0c6457164 100644
--- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
@@ -152,7 +152,7 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
assert(MBB->pred_empty() && "MBB must be dead!");
- DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+ LLVM_DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
MachineFunction *MF = MBB->getParent();
// drop all successors.
@@ -164,7 +164,7 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
// Remove the block.
MF->erase(MBB);
- FuncletMembership.erase(MBB);
+ EHScopeMembership.erase(MBB);
if (MLI)
MLI->removeBlock(MBB);
}
@@ -199,8 +199,8 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
}
- // Recalculate funclet membership.
- FuncletMembership = getFuncletMembership(MF);
+ // Recalculate EH scope membership.
+ EHScopeMembership = getEHScopeMembership(MF);
bool MadeChangeThisIteration = true;
while (MadeChangeThisIteration) {
@@ -296,6 +296,11 @@ static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
return HashMachineInstr(*I);
}
+/// Whether MI should be counted as an instruction when calculating common tail.
+static bool countsAsInstruction(const MachineInstr &MI) {
+ return !(MI.isDebugValue() || MI.isCFIInstruction());
+}
+
/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
/// of instructions they actually have in common together at their end. Return
/// iterators for the first shared instruction in each block.
@@ -310,26 +315,27 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
--I1; --I2;
// Skip debugging pseudos; necessary to avoid changing the code.
- while (I1->isDebugValue()) {
+ while (!countsAsInstruction(*I1)) {
if (I1==MBB1->begin()) {
- while (I2->isDebugValue()) {
- if (I2==MBB2->begin())
+ while (!countsAsInstruction(*I2)) {
+ if (I2==MBB2->begin()) {
// I1==DBG at begin; I2==DBG at begin
- return TailLen;
+ goto SkipTopCFIAndReturn;
+ }
--I2;
}
++I2;
// I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
- return TailLen;
+ goto SkipTopCFIAndReturn;
}
--I1;
}
// I1==first (untested) non-DBG preceding known match
- while (I2->isDebugValue()) {
+ while (!countsAsInstruction(*I2)) {
if (I2==MBB2->begin()) {
++I1;
// I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
- return TailLen;
+ goto SkipTopCFIAndReturn;
}
--I2;
}
@@ -352,7 +358,7 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
// I1==MBB1->begin() work as expected.)
if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
--I2;
- while (I2->isDebugValue()) {
+ while (I2->isDebugInstr()) {
if (I2 == MBB2->begin())
return TailLen;
--I2;
@@ -361,13 +367,44 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
}
if (I2 == MBB2->begin() && I1 != MBB1->begin()) {
--I1;
- while (I1->isDebugValue()) {
+ while (I1->isDebugInstr()) {
if (I1 == MBB1->begin())
return TailLen;
--I1;
}
++I1;
}
+
+SkipTopCFIAndReturn:
+ // Ensure that I1 and I2 do not point to a CFI_INSTRUCTION. This can happen if
+ // I1 and I2 are non-identical when compared and then one or both of them ends
+ // up pointing to a CFI instruction after being incremented. For example:
+ /*
+ BB1:
+ ...
+ INSTRUCTION_A
+ ADD32ri8 <- last common instruction
+ ...
+ BB2:
+ ...
+ INSTRUCTION_B
+ CFI_INSTRUCTION
+ ADD32ri8 <- last common instruction
+ ...
+ */
+ // When INSTRUCTION_A and INSTRUCTION_B are compared as not equal, after
+ // incrementing the iterators, I1 will point to ADD, however I2 will point to
+ // the CFI instruction. Later on, this leads to BB2 being 'hacked off' at the
+ // wrong place (in ReplaceTailWithBranchTo()) which results in losing this CFI
+ // instruction.
+ while (I1 != MBB1->end() && I1->isCFIInstruction()) {
+ ++I1;
+ }
+
+ while (I2 != MBB2->end() && I2->isCFIInstruction()) {
+ ++I2;
+ }
+
return TailLen;
}
@@ -438,11 +475,11 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
if (UpdateLiveIns)
computeAndAddLiveIns(LiveRegs, *NewMBB);
- // Add the new block to the funclet.
- const auto &FuncletI = FuncletMembership.find(&CurMBB);
- if (FuncletI != FuncletMembership.end()) {
- auto n = FuncletI->second;
- FuncletMembership[NewMBB] = n;
+ // Add the new block to the EH scope.
+ const auto &EHScopeI = EHScopeMembership.find(&CurMBB);
+ if (EHScopeI != EHScopeMembership.end()) {
+ auto n = EHScopeI->second;
+ EHScopeMembership[NewMBB] = n;
}
return NewMBB;
@@ -454,7 +491,7 @@ static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator E) {
unsigned Time = 0;
for (; I != E; ++I) {
- if (I->isDebugValue())
+ if (!countsAsInstruction(*I))
continue;
if (I->isCall())
Time += 10;
@@ -589,7 +626,7 @@ static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) {
/// SuccBB A common successor of MBB1, MBB2 which are in a canonical form
/// relative to SuccBB
/// PredBB The layout predecessor of SuccBB, if any.
-/// FuncletMembership map from block to funclet #.
+/// EHScopeMembership map from block to EH scope #.
/// AfterPlacement True if we are merging blocks after layout. Stricter
/// thresholds apply to prevent undoing tail-duplication.
static bool
@@ -598,24 +635,24 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
MachineBasicBlock::iterator &I1,
MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB,
- DenseMap<const MachineBasicBlock *, int> &FuncletMembership,
+ DenseMap<const MachineBasicBlock *, int> &EHScopeMembership,
bool AfterPlacement) {
- // It is never profitable to tail-merge blocks from two different funclets.
- if (!FuncletMembership.empty()) {
- auto Funclet1 = FuncletMembership.find(MBB1);
- assert(Funclet1 != FuncletMembership.end());
- auto Funclet2 = FuncletMembership.find(MBB2);
- assert(Funclet2 != FuncletMembership.end());
- if (Funclet1->second != Funclet2->second)
+ // It is never profitable to tail-merge blocks from two different EH scopes.
+ if (!EHScopeMembership.empty()) {
+ auto EHScope1 = EHScopeMembership.find(MBB1);
+ assert(EHScope1 != EHScopeMembership.end());
+ auto EHScope2 = EHScopeMembership.find(MBB2);
+ assert(EHScope2 != EHScopeMembership.end());
+ if (EHScope1->second != EHScope2->second)
return false;
}
CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
if (CommonTailLen == 0)
return false;
- DEBUG(dbgs() << "Common tail length of " << printMBBReference(*MBB1)
- << " and " << printMBBReference(*MBB2) << " is " << CommonTailLen
- << '\n');
+ LLVM_DEBUG(dbgs() << "Common tail length of " << printMBBReference(*MBB1)
+ << " and " << printMBBReference(*MBB2) << " is "
+ << CommonTailLen << '\n');
// It's almost always profitable to merge any number of non-terminator
// instructions with the block that falls through into the common successor.
@@ -706,7 +743,7 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
MinCommonTailLength,
CommonTailLen, TrialBBI1, TrialBBI2,
SuccBB, PredBB,
- FuncletMembership,
+ EHScopeMembership,
AfterBlockPlacement)) {
if (CommonTailLen > maxCommonTailLength) {
SameTails.clear();
@@ -770,8 +807,8 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
SameTails[commonTailIndex].getTailStartPos();
MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
- DEBUG(dbgs() << "\nSplitting " << printMBBReference(*MBB) << ", size "
- << maxCommonTailLength);
+ LLVM_DEBUG(dbgs() << "\nSplitting " << printMBBReference(*MBB) << ", size "
+ << maxCommonTailLength);
// If the split block unconditionally falls-thru to SuccBB, it will be
// merged. In control flow terms it should then take SuccBB's name. e.g. If
@@ -780,7 +817,7 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
SuccBB->getBasicBlock() : MBB->getBasicBlock();
MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI, BB);
if (!newMBB) {
- DEBUG(dbgs() << "... failed!");
+ LLVM_DEBUG(dbgs() << "... failed!");
return false;
}
@@ -814,12 +851,12 @@ mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
assert(MBBI != MBBIE && "Reached BB end within common tail length!");
(void)MBBIE;
- if (MBBI->isDebugValue()) {
+ if (!countsAsInstruction(*MBBI)) {
++MBBI;
continue;
}
- while ((MBBICommon != MBBIECommon) && MBBICommon->isDebugValue())
+ while ((MBBICommon != MBBIECommon) && !countsAsInstruction(*MBBICommon))
++MBBICommon;
assert(MBBICommon != MBBIECommon &&
@@ -859,7 +896,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
}
for (auto &MI : *MBB) {
- if (MI.isDebugValue())
+ if (!countsAsInstruction(MI))
continue;
DebugLoc DL = MI.getDebugLoc();
for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) {
@@ -869,7 +906,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
auto &Pos = NextCommonInsts[i];
assert(Pos != SameTails[i].getBlock()->end() &&
"Reached BB end within common tail");
- while (Pos->isDebugValue()) {
+ while (!countsAsInstruction(*Pos)) {
++Pos;
assert(Pos != SameTails[i].getBlock()->end() &&
"Reached BB end within common tail");
@@ -884,11 +921,12 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
if (UpdateLiveIns) {
LivePhysRegs NewLiveIns(*TRI);
computeLiveIns(NewLiveIns, *MBB);
+ LiveRegs.init(*TRI);
// The flag merging may lead to some register uses no longer using the
// <undef> flag, add IMPLICIT_DEFs in the predecessors as necessary.
for (MachineBasicBlock *Pred : MBB->predecessors()) {
- LiveRegs.init(*TRI);
+ LiveRegs.clear();
LiveRegs.addLiveOuts(*Pred);
MachineBasicBlock::iterator InsertBefore = Pred->getFirstTerminator();
for (unsigned Reg : NewLiveIns) {
@@ -919,18 +957,19 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
unsigned MinCommonTailLength) {
bool MadeChange = false;
- DEBUG(dbgs() << "\nTryTailMergeBlocks: ";
- for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) dbgs()
- << printMBBReference(*MergePotentials[i].getBlock())
- << (i == e - 1 ? "" : ", ");
- dbgs() << "\n"; if (SuccBB) {
- dbgs() << " with successor " << printMBBReference(*SuccBB) << '\n';
- if (PredBB)
- dbgs() << " which has fall-through from "
- << printMBBReference(*PredBB) << "\n";
- } dbgs() << "Looking for common tails of at least "
- << MinCommonTailLength << " instruction"
- << (MinCommonTailLength == 1 ? "" : "s") << '\n';);
+ LLVM_DEBUG(
+ dbgs() << "\nTryTailMergeBlocks: ";
+ for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) dbgs()
+ << printMBBReference(*MergePotentials[i].getBlock())
+ << (i == e - 1 ? "" : ", ");
+ dbgs() << "\n"; if (SuccBB) {
+ dbgs() << " with successor " << printMBBReference(*SuccBB) << '\n';
+ if (PredBB)
+ dbgs() << " which has fall-through from "
+ << printMBBReference(*PredBB) << "\n";
+ } dbgs() << "Looking for common tails of at least "
+ << MinCommonTailLength << " instruction"
+ << (MinCommonTailLength == 1 ? "" : "s") << '\n';);
// Sort by hash value so that blocks with identical end sequences sort
// together.
@@ -1010,19 +1049,19 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
// MBB is common tail. Adjust all other BB's to jump to this one.
// Traversal must be forwards so erases work.
- DEBUG(dbgs() << "\nUsing common tail in " << printMBBReference(*MBB)
- << " for ");
+ LLVM_DEBUG(dbgs() << "\nUsing common tail in " << printMBBReference(*MBB)
+ << " for ");
for (unsigned int i=0, e = SameTails.size(); i != e; ++i) {
if (commonTailIndex == i)
continue;
- DEBUG(dbgs() << printMBBReference(*SameTails[i].getBlock())
- << (i == e - 1 ? "" : ", "));
+ LLVM_DEBUG(dbgs() << printMBBReference(*SameTails[i].getBlock())
+ << (i == e - 1 ? "" : ", "));
// Hack the end off BB i, making it jump to BB commonTailIndex instead.
replaceTailWithBranchTo(SameTails[i].getTailStartPos(), *MBB);
// BB i is no longer a predecessor of SuccBB; remove it from the worklist.
MergePotentials.erase(SameTails[i].getMPIter());
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
// We leave commonTailIndex in the worklist in case there are other blocks
// that match it with a smaller number of instructions.
MadeChange = true;
@@ -1254,8 +1293,8 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
// Make sure blocks are numbered in order
MF.RenumberBlocks();
- // Renumbering blocks alters funclet membership, recalculate it.
- FuncletMembership = getFuncletMembership(MF);
+ // Renumbering blocks alters EH scope membership, recalculate it.
+ EHScopeMembership = getEHScopeMembership(MF);
for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
I != E; ) {
@@ -1319,6 +1358,53 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
return DebugLoc();
}
+static void copyDebugInfoToPredecessor(const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock &PredMBB) {
+ auto InsertBefore = PredMBB.getFirstTerminator();
+ for (MachineInstr &MI : MBB.instrs())
+ if (MI.isDebugValue()) {
+ TII->duplicate(PredMBB, InsertBefore, MI);
+ LLVM_DEBUG(dbgs() << "Copied debug value from empty block to pred: "
+ << MI);
+ }
+}
+
+static void copyDebugInfoToSuccessor(const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock &SuccMBB) {
+ auto InsertBefore = SuccMBB.SkipPHIsAndLabels(SuccMBB.begin());
+ for (MachineInstr &MI : MBB.instrs())
+ if (MI.isDebugValue()) {
+ TII->duplicate(SuccMBB, InsertBefore, MI);
+ LLVM_DEBUG(dbgs() << "Copied debug value from empty block to succ: "
+ << MI);
+ }
+}
+
+// Try to salvage DBG_VALUE instructions from an otherwise empty block. If such
+// a basic block is removed we would lose the debug information unless we have
+// copied the information to a predecessor/successor.
+//
+// TODO: This function only handles some simple cases. An alternative would be
+// to run a heavier analysis, such as the LiveDebugValues pass, before we do
+// branch folding.
+static void salvageDebugInfoFromEmptyBlock(const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB) {
+ assert(IsEmptyBlock(&MBB) && "Expected an empty block (except debug info).");
+ // If this MBB is the only predecessor of a successor it is legal to copy
+ // DBG_VALUE instructions to the beginning of the successor.
+ for (MachineBasicBlock *SuccBB : MBB.successors())
+ if (SuccBB->pred_size() == 1)
+ copyDebugInfoToSuccessor(TII, MBB, *SuccBB);
+ // If this MBB is the only successor of a predecessor it is legal to copy the
+ // DBG_VALUE instructions to the end of the predecessor (just before the
+ // terminators, assuming that the terminator isn't affecting the DBG_VALUE).
+ for (MachineBasicBlock *PredBB : MBB.predecessors())
+ if (PredBB->succ_size() == 1)
+ copyDebugInfoToPredecessor(TII, MBB, *PredBB);
+}
+
bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
bool MadeChange = false;
MachineFunction &MF = *MBB->getParent();
@@ -1327,14 +1413,14 @@ ReoptimizeBlock:
MachineFunction::iterator FallThrough = MBB->getIterator();
++FallThrough;
- // Make sure MBB and FallThrough belong to the same funclet.
- bool SameFunclet = true;
- if (!FuncletMembership.empty() && FallThrough != MF.end()) {
- auto MBBFunclet = FuncletMembership.find(MBB);
- assert(MBBFunclet != FuncletMembership.end());
- auto FallThroughFunclet = FuncletMembership.find(&*FallThrough);
- assert(FallThroughFunclet != FuncletMembership.end());
- SameFunclet = MBBFunclet->second == FallThroughFunclet->second;
+ // Make sure MBB and FallThrough belong to the same EH scope.
+ bool SameEHScope = true;
+ if (!EHScopeMembership.empty() && FallThrough != MF.end()) {
+ auto MBBEHScope = EHScopeMembership.find(MBB);
+ assert(MBBEHScope != EHScopeMembership.end());
+ auto FallThroughEHScope = EHScopeMembership.find(&*FallThrough);
+ assert(FallThroughEHScope != EHScopeMembership.end());
+ SameEHScope = MBBEHScope->second == FallThroughEHScope->second;
}
// If this block is empty, make everyone use its fall-through, not the block
@@ -1342,7 +1428,8 @@ ReoptimizeBlock:
// points to this block. Blocks with their addresses taken shouldn't be
// optimized away.
if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() &&
- SameFunclet) {
+ SameEHScope) {
+ salvageDebugInfoFromEmptyBlock(TII, *MBB);
// Dead block? Leave for cleanup later.
if (MBB->pred_empty()) return MadeChange;
@@ -1406,8 +1493,8 @@ ReoptimizeBlock:
if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
PrevBB.succ_size() == 1 &&
!MBB->hasAddressTaken() && !MBB->isEHPad()) {
- DEBUG(dbgs() << "\nMerging into block: " << PrevBB
- << "From MBB: " << *MBB);
+ LLVM_DEBUG(dbgs() << "\nMerging into block: " << PrevBB
+ << "From MBB: " << *MBB);
// Remove redundant DBG_VALUEs first.
if (PrevBB.begin() != PrevBB.end()) {
MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
@@ -1416,7 +1503,7 @@ ReoptimizeBlock:
// Check if DBG_VALUE at the end of PrevBB is identical to the
// DBG_VALUE at the beginning of MBB.
while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
- && PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
+ && PrevBBIter->isDebugInstr() && MBBIter->isDebugInstr()) {
if (!MBBIter->isIdenticalTo(*PrevBBIter))
break;
MachineInstr &DuplicateDbg = *MBBIter;
@@ -1493,8 +1580,8 @@ ReoptimizeBlock:
// Reverse the branch so we will fall through on the previous true cond.
SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
if (!TII->reverseBranchCondition(NewPriorCond)) {
- DEBUG(dbgs() << "\nMoving MBB: " << *MBB
- << "To make fallthrough to: " << *PriorTBB << "\n");
+ LLVM_DEBUG(dbgs() << "\nMoving MBB: " << *MBB
+ << "To make fallthrough to: " << *PriorTBB << "\n");
DebugLoc dl = getBranchDebugLoc(PrevBB);
TII->removeBranch(PrevBB);
@@ -1829,8 +1916,12 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
if (Uses.empty())
return Loc;
+ // If the terminator is the only instruction in the block and Uses is not
+ // empty (or we would have returned above), we can still safely hoist
+ // instructions just before the terminator as long as the Defs/Uses are not
+ // violated (which is checked in HoistCommonCodeInSuccs).
if (Loc == MBB->begin())
- return MBB->end();
+ return Loc;
// The terminator is probably a conditional branch, try not to separate the
// branch from condition setting instruction.
@@ -1917,7 +2008,6 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
return false;
bool HasDups = false;
- SmallVector<unsigned, 4> LocalDefs, LocalKills;
SmallSet<unsigned, 4> ActiveDefsSet, AllDefsSet;
MachineBasicBlock::iterator TIB = TBB->begin();
MachineBasicBlock::iterator FIB = FBB->begin();
@@ -2000,7 +2090,6 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!Reg)
continue;
if (!AllDefsSet.count(Reg)) {
- LocalKills.push_back(Reg);
continue;
}
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
@@ -2018,7 +2107,6 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
unsigned Reg = MO.getReg();
if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
continue;
- LocalDefs.push_back(Reg);
addRegAndItsAliases(Reg, TRI, ActiveDefsSet);
addRegAndItsAliases(Reg, TRI, AllDefsSet);
}
@@ -2034,25 +2122,9 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
MBB->splice(Loc, TBB, TBB->begin(), TIB);
FBB->erase(FBB->begin(), FIB);
- // Update livein's.
- bool ChangedLiveIns = false;
- for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
- unsigned Def = LocalDefs[i];
- if (ActiveDefsSet.count(Def)) {
- TBB->addLiveIn(Def);
- FBB->addLiveIn(Def);
- ChangedLiveIns = true;
- }
- }
- for (unsigned K : LocalKills) {
- TBB->removeLiveIn(K);
- FBB->removeLiveIn(K);
- ChangedLiveIns = true;
- }
-
- if (ChangedLiveIns) {
- TBB->sortUniqueLiveIns();
- FBB->sortUniqueLiveIns();
+ if (UpdateLiveIns) {
+ recomputeLiveIns(*TBB);
+ recomputeLiveIns(*FBB);
}
++NumHoist;
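Several of the BranchFolding.cpp changes above replace isDebugValue() checks with countsAsInstruction(), so that CFI_INSTRUCTIONs, like DBG_VALUEs, neither count toward a common-tail length nor break a match. A standalone sketch of that idea, with invented names and plain strings standing in for instructions:

    #include <string>
    #include <vector>

    static bool counts(const std::string &Op) {
      return Op != "DBG_VALUE" && Op != "CFI_INSTRUCTION";
    }

    // Number of "real" trailing instructions two blocks share, ignoring
    // debug and CFI pseudo-ops on either side.
    static unsigned commonTailLength(const std::vector<std::string> &A,
                                     const std::vector<std::string> &B) {
      auto I = A.rbegin(), J = B.rbegin();
      unsigned Len = 0;
      while (I != A.rend() && J != B.rend()) {
        while (I != A.rend() && !counts(*I)) ++I;   // skip pseudo-ops in A
        while (J != B.rend() && !counts(*J)) ++J;   // skip pseudo-ops in B
        if (I == A.rend() || J == B.rend() || *I != *J)
          break;
        ++Len; ++I; ++J;
      }
      return Len;
    }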
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h
index 0f0952550137..accd0ab7317b 100644
--- a/contrib/llvm/lib/CodeGen/BranchFolding.h
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.h
@@ -38,11 +38,11 @@ class TargetRegisterInfo;
explicit BranchFolder(bool defaultEnableTailMerge,
bool CommonHoist,
- MBFIWrapper &MBFI,
- const MachineBranchProbabilityInfo &MBPI,
+ MBFIWrapper &FreqInfo,
+ const MachineBranchProbabilityInfo &ProbInfo,
// Min tail length to merge. Defaults to commandline
// flag. Ignored for optsize.
- unsigned MinCommonTailLength = 0);
+ unsigned MinTailLength = 0);
/// Perhaps branch folding, tail merging and other CFG optimizations on the
/// given function. Block placement changes the layout and may create new
@@ -75,7 +75,7 @@ class TargetRegisterInfo;
std::vector<MergePotentialsElt> MergePotentials;
SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging;
- DenseMap<const MachineBasicBlock *, int> FuncletMembership;
+ DenseMap<const MachineBasicBlock *, int> EHScopeMembership;
class SameTailElt {
MPIterator MPIter;
@@ -132,7 +132,7 @@ class TargetRegisterInfo;
LivePhysRegs LiveRegs;
public:
- /// \brief This class keeps track of branch frequencies of newly created
+ /// This class keeps track of branch frequencies of newly created
/// blocks and tail-merged blocks.
class MBFIWrapper {
public:
diff --git a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp
index 0d87f142c7cc..c092da2b6602 100644
--- a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
@@ -95,7 +96,7 @@ class BranchRelaxation : public MachineFunctionPass {
MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI,
MachineBasicBlock *DestBB);
- void adjustBlockOffsets(MachineBasicBlock &MBB);
+ void adjustBlockOffsets(MachineBasicBlock &Start);
bool isBlockInRange(const MachineInstr &MI, const MachineBasicBlock &BB) const;
bool fixupConditionalBranch(MachineInstr &MI);
@@ -287,10 +288,11 @@ bool BranchRelaxation::isBlockInRange(
if (TII->isBranchOffsetInRange(MI.getOpcode(), DestOffset - BrOffset))
return true;
- DEBUG(dbgs() << "Out of range branch to destination "
- << printMBBReference(DestBB) << " from "
- << printMBBReference(*MI.getParent()) << " to " << DestOffset
- << " offset " << DestOffset - BrOffset << '\t' << MI);
+ LLVM_DEBUG(dbgs() << "Out of range branch to destination "
+ << printMBBReference(DestBB) << " from "
+ << printMBBReference(*MI.getParent()) << " to "
+ << DestOffset << " offset " << DestOffset - BrOffset << '\t'
+ << MI);
return false;
}
@@ -302,8 +304,41 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ MachineBasicBlock *NewBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
+ auto insertUncondBranch = [&](MachineBasicBlock *MBB,
+ MachineBasicBlock *DestBB) {
+ unsigned &BBSize = BlockInfo[MBB->getNumber()].Size;
+ int NewBrSize = 0;
+ TII->insertUnconditionalBranch(*MBB, DestBB, DL, &NewBrSize);
+ BBSize += NewBrSize;
+ };
+ auto insertBranch = [&](MachineBasicBlock *MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ SmallVectorImpl<MachineOperand>& Cond) {
+ unsigned &BBSize = BlockInfo[MBB->getNumber()].Size;
+ int NewBrSize = 0;
+ TII->insertBranch(*MBB, TBB, FBB, Cond, DL, &NewBrSize);
+ BBSize += NewBrSize;
+ };
+ auto removeBranch = [&](MachineBasicBlock *MBB) {
+ unsigned &BBSize = BlockInfo[MBB->getNumber()].Size;
+ int RemovedSize = 0;
+ TII->removeBranch(*MBB, &RemovedSize);
+ BBSize -= RemovedSize;
+ };
+
+ auto finalizeBlockChanges = [&](MachineBasicBlock *MBB,
+ MachineBasicBlock *NewBB) {
+ // Keep the block offsets up to date.
+ adjustBlockOffsets(*MBB);
+
+ // Need to fix live-in lists if we track liveness.
+ if (NewBB && TRI->trackLivenessAfterRegAlloc(*MF))
+ computeAndAddLiveIns(LiveRegs, *NewBB);
+ };
+
bool Fail = TII->analyzeBranch(*MBB, TBB, FBB, Cond);
assert(!Fail && "branches to be relaxed must be analyzable");
(void)Fail;
@@ -316,71 +351,90 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
// b L1
// L2:
- if (FBB && isBlockInRange(MI, *FBB)) {
- // Last MI in the BB is an unconditional branch. We can simply invert the
- // condition and swap destinations:
- // beq L1
- // b L2
- // =>
- // bne L2
- // b L1
- DEBUG(dbgs() << " Invert condition and swap "
- "its destination with " << MBB->back());
-
- TII->reverseBranchCondition(Cond);
- int OldSize = 0, NewSize = 0;
- TII->removeBranch(*MBB, &OldSize);
- TII->insertBranch(*MBB, FBB, TBB, Cond, DL, &NewSize);
-
- BlockInfo[MBB->getNumber()].Size += (NewSize - OldSize);
- return true;
- } else if (FBB) {
- // We need to split the basic block here to obtain two long-range
- // unconditional branches.
- auto &NewBB = *MF->CreateMachineBasicBlock(MBB->getBasicBlock());
- MF->insert(++MBB->getIterator(), &NewBB);
-
- // Insert an entry into BlockInfo to align it properly with the block
- // numbers.
- BlockInfo.insert(BlockInfo.begin() + NewBB.getNumber(), BasicBlockInfo());
-
- unsigned &NewBBSize = BlockInfo[NewBB.getNumber()].Size;
- int NewBrSize;
- TII->insertUnconditionalBranch(NewBB, FBB, DL, &NewBrSize);
- NewBBSize += NewBrSize;
-
- // Update the successor lists according to the transformation to follow.
- // Do it here since if there's no split, no update is needed.
- MBB->replaceSuccessor(FBB, &NewBB);
- NewBB.addSuccessor(FBB);
+ bool ReversedCond = !TII->reverseBranchCondition(Cond);
+ if (ReversedCond) {
+ if (FBB && isBlockInRange(MI, *FBB)) {
+ // Last MI in the BB is an unconditional branch. We can simply invert the
+ // condition and swap destinations:
+ // beq L1
+ // b L2
+ // =>
+ // bne L2
+ // b L1
+ LLVM_DEBUG(dbgs() << " Invert condition and swap "
+ "its destination with "
+ << MBB->back());
+
+ removeBranch(MBB);
+ insertBranch(MBB, FBB, TBB, Cond);
+ finalizeBlockChanges(MBB, nullptr);
+ return true;
+ }
+ if (FBB) {
+ // We need to split the basic block here to obtain two long-range
+ // unconditional branches.
+ NewBB = createNewBlockAfter(*MBB);
+
+ insertUncondBranch(NewBB, FBB);
+ // Update the successor lists according to the transformation to follow.
+ // Do it here since if there's no split, no update is needed.
+ MBB->replaceSuccessor(FBB, NewBB);
+ NewBB->addSuccessor(FBB);
+ }
- // Need to fix live-in lists if we track liveness.
- if (TRI->trackLivenessAfterRegAlloc(*MF))
- computeAndAddLiveIns(LiveRegs, NewBB);
+ // We now have an appropriate fall-through block in place (either naturally or
+ // just created), so we can use the inverted condition.
+ MachineBasicBlock &NextBB = *std::next(MachineFunction::iterator(MBB));
+
+ LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*TBB)
+ << ", invert condition and change dest. to "
+ << printMBBReference(NextBB) << '\n');
+
+ removeBranch(MBB);
+ // Insert a new conditional branch and a new unconditional branch.
+ insertBranch(MBB, &NextBB, TBB, Cond);
+
+ finalizeBlockChanges(MBB, NewBB);
+ return true;
}
+ // Branch cond can't be inverted.
+ // In this case we always add a block after the MBB.
+ LLVM_DEBUG(dbgs() << " The branch condition can't be inverted. "
+ << " Insert a new BB after " << MBB->back());
- // We now have an appropriate fall-through block in place (either naturally or
- // just created), so we can invert the condition.
- MachineBasicBlock &NextBB = *std::next(MachineFunction::iterator(MBB));
+ if (!FBB)
+ FBB = &(*std::next(MachineFunction::iterator(MBB)));
- DEBUG(dbgs() << " Insert B to " << printMBBReference(*TBB)
- << ", invert condition and change dest. to "
- << printMBBReference(NextBB) << '\n');
+ // This is the block with cond. branch and the distance to TBB is too long.
+ // beq L1
+ // L2:
- unsigned &MBBSize = BlockInfo[MBB->getNumber()].Size;
+ // We do the following transformation:
+ // beq NewBB
+ // b L2
+ // NewBB:
+ // b L1
+ // L2:
- // Insert a new conditional branch and a new unconditional branch.
- int RemovedSize = 0;
- TII->reverseBranchCondition(Cond);
- TII->removeBranch(*MBB, &RemovedSize);
- MBBSize -= RemovedSize;
+ NewBB = createNewBlockAfter(*MBB);
+ insertUncondBranch(NewBB, TBB);
- int AddedSize = 0;
- TII->insertBranch(*MBB, &NextBB, TBB, Cond, DL, &AddedSize);
- MBBSize += AddedSize;
+ LLVM_DEBUG(dbgs() << " Insert cond B to the new BB "
+ << printMBBReference(*NewBB)
+ << " Keep the existing condition.\n"
+ << " Insert B to " << printMBBReference(*FBB) << ".\n"
+ << " In the new BB: Insert B to "
+ << printMBBReference(*TBB) << ".\n");
- // Finally, keep the block offsets up to date.
- adjustBlockOffsets(*MBB);
+ // Update the successor lists according to the transformation to follow.
+ MBB->replaceSuccessor(TBB, NewBB);
+ NewBB->addSuccessor(TBB);
+
+ // Replace branch in the current (MBB) block.
+ removeBranch(MBB);
+ insertBranch(MBB, NewBB, FBB, Cond);
+
+ finalizeBlockChanges(MBB, NewBB);
return true;
}
@@ -490,7 +544,7 @@ bool BranchRelaxation::relaxBranchInstructions() {
bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
- DEBUG(dbgs() << "***** BranchRelaxation *****\n");
+ LLVM_DEBUG(dbgs() << "***** BranchRelaxation *****\n");
const TargetSubtargetInfo &ST = MF->getSubtarget();
TII = ST.getInstrInfo();
@@ -507,7 +561,7 @@ bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
// sizes of each block.
scanFunction();
- DEBUG(dbgs() << " Basic blocks before relaxation\n"; dumpBBs(););
+ LLVM_DEBUG(dbgs() << " Basic blocks before relaxation\n"; dumpBBs(););
bool MadeChange = false;
while (relaxBranchInstructions())
@@ -516,7 +570,7 @@ bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
// After a while, this might be made debug-only, but it is not expensive.
verify();
- DEBUG(dbgs() << " Basic blocks after relaxation\n\n"; dumpBBs());
+ LLVM_DEBUG(dbgs() << " Basic blocks after relaxation\n\n"; dumpBBs());
BlockInfo.clear();
diff --git a/contrib/llvm/lib/CodeGen/BreakFalseDeps.cpp b/contrib/llvm/lib/CodeGen/BreakFalseDeps.cpp
new file mode 100644
index 000000000000..7f098cb71657
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -0,0 +1,271 @@
+//==- llvm/CodeGen/BreakFalseDeps.cpp - Break False Dependency Fix -*- C++ -*==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Break False Dependency pass.
+///
+/// Some instructions have false dependencies which cause unnecessary stalls.
+/// For example, instructions that write only part of a register implicitly
+/// need to read the other parts of the register. This may cause unwanted
+/// stalls preventing otherwise unrelated instructions from executing in
+/// parallel in an out-of-order CPU.
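+/// A classic case is a partial register write such as x86's CVTSI2SS, which
+/// leaves the upper bits of its XMM destination untouched and therefore
+/// carries a false dependence on the register's previous value.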
+/// This pass is aimed at identifying and avoiding these dependencies when
+/// possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+
+using namespace llvm;
+
+namespace llvm {
+
+class BreakFalseDeps : public MachineFunctionPass {
+private:
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ RegisterClassInfo RegClassInfo;
+
+ /// List of undefined register reads in this block in forward order.
+ std::vector<std::pair<MachineInstr *, unsigned>> UndefReads;
+
+ /// Storage for register unit liveness.
+ LivePhysRegs LiveRegSet;
+
+ ReachingDefAnalysis *RDA;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ BreakFalseDeps() : MachineFunctionPass(ID) {
+ initializeBreakFalseDepsPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<ReachingDefAnalysis>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+  /// Process the given basic block.
+ void processBasicBlock(MachineBasicBlock *MBB);
+
+ /// Update def-ages for registers defined by MI.
+ /// Also break dependencies on partial defs and undef uses.
+ void processDefs(MachineInstr *MI);
+
+ /// Helps avoid false dependencies on undef registers by updating the
+  /// machine instruction's undef operand to use a register that the instruction
+  /// truly depends on, or a register with clearance higher than Pref.
+ /// Returns true if it was able to find a true dependency, thus not requiring
+ /// a dependency breaking instruction regardless of clearance.
+ bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref);
+
+  /// Return true if it makes sense to break dependence on a partial
+ /// def or undef use.
+ bool shouldBreakDependence(MachineInstr *, unsigned OpIdx, unsigned Pref);
+
+ /// Break false dependencies on undefined register reads.
+ /// Walk the block backward computing precise liveness. This is expensive, so
+ /// we only do it on demand. Note that the occurrence of undefined register
+ /// reads that should be broken is very rare, but when they occur we may have
+ /// many in a single block.
+ void processUndefReads(MachineBasicBlock *);
+};
+
+} // namespace llvm
+
+#define DEBUG_TYPE "break-false-deps"
+
+char BreakFalseDeps::ID = 0;
+INITIALIZE_PASS_BEGIN(BreakFalseDeps, DEBUG_TYPE, "BreakFalseDeps", false, false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
+INITIALIZE_PASS_END(BreakFalseDeps, DEBUG_TYPE, "BreakFalseDeps", false, false)
+
+FunctionPass *llvm::createBreakFalseDeps() { return new BreakFalseDeps(); }
+
+bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
+ MachineOperand &MO = MI->getOperand(OpIdx);
+ assert(MO.isUndef() && "Expected undef machine operand");
+
+ unsigned OriginalReg = MO.getReg();
+
+  // Update only undef operands whose register units each map to a single root.
+ for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) {
+ unsigned NumRoots = 0;
+ for (MCRegUnitRootIterator Root(*Unit, TRI); Root.isValid(); ++Root) {
+ NumRoots++;
+ if (NumRoots > 1)
+ return false;
+ }
+ }
+
+ // Get the undef operand's register class
+ const TargetRegisterClass *OpRC =
+ TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF);
+
+  // If the instruction has a true dependency, we can hide the false dependency
+ // behind it.
+ for (MachineOperand &CurrMO : MI->operands()) {
+ if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() ||
+ !OpRC->contains(CurrMO.getReg()))
+ continue;
+ // We found a true dependency - replace the undef register with the true
+ // dependency.
+ MO.setReg(CurrMO.getReg());
+ return true;
+ }
+
+ // Go over all registers in the register class and find the register with
+ // max clearance or clearance higher than Pref.
+ unsigned MaxClearance = 0;
+ unsigned MaxClearanceReg = OriginalReg;
+ ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(OpRC);
+ for (MCPhysReg Reg : Order) {
+ unsigned Clearance = RDA->getClearance(MI, Reg);
+ if (Clearance <= MaxClearance)
+ continue;
+ MaxClearance = Clearance;
+ MaxClearanceReg = Reg;
+
+ if (MaxClearance > Pref)
+ break;
+ }
+
+ // Update the operand if we found a register with better clearance.
+ if (MaxClearanceReg != OriginalReg)
+ MO.setReg(MaxClearanceReg);
+
+ return false;
+}
+
+bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
+ unsigned reg = MI->getOperand(OpIdx).getReg();
+ unsigned Clearance = RDA->getClearance(MI, reg);
+ LLVM_DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+
+ if (Pref > Clearance) {
+ LLVM_DEBUG(dbgs() << ": Break dependency.\n");
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << ": OK .\n");
+ return false;
+}
+
+void BreakFalseDeps::processDefs(MachineInstr *MI) {
+ assert(!MI->isDebugInstr() && "Won't process debug values");
+
+ // Break dependence on undef uses. Do this before updating LiveRegs below.
+ unsigned OpNum;
+ unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
+ if (Pref) {
+ bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref);
+ // We don't need to bother trying to break a dependency if this
+ // instruction has a true dependency on that register through another
+ // operand - we'll have to wait for it to be available regardless.
+ if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref))
+ UndefReads.push_back(std::make_pair(MI, OpNum));
+ }
+
+ const MCInstrDesc &MCID = MI->getDesc();
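+  // Only def operands can carry a partial register update; for variadic
+  // instructions every operand may be a def, so walk all of them.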
+ for (unsigned i = 0,
+ e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
+ i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isUse())
+ continue;
+ // Check clearance before partial register updates.
+ unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
+ if (Pref && shouldBreakDependence(MI, i, Pref))
+ TII->breakPartialRegDependency(*MI, i, TRI);
+ }
+}
+
+void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) {
+ if (UndefReads.empty())
+ return;
+
+ // Collect this block's live out register units.
+ LiveRegSet.init(*TRI);
+ // We do not need to care about pristine registers as they are just preserved
+ // but not actually used in the function.
+ LiveRegSet.addLiveOutsNoPristines(*MBB);
+
+ MachineInstr *UndefMI = UndefReads.back().first;
+ unsigned OpIdx = UndefReads.back().second;
+
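+  // UndefReads was collected in forward order, so its last entry is the last
+  // undef read in the block; walk the block backward and pop entries as the
+  // corresponding instructions are reached.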
+ for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) {
+ // Update liveness, including the current instruction's defs.
+ LiveRegSet.stepBackward(I);
+
+ if (UndefMI == &I) {
+ if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg()))
+ TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI);
+
+ UndefReads.pop_back();
+ if (UndefReads.empty())
+ return;
+
+ UndefMI = UndefReads.back().first;
+ OpIdx = UndefReads.back().second;
+ }
+ }
+}
+
+void BreakFalseDeps::processBasicBlock(MachineBasicBlock *MBB) {
+ UndefReads.clear();
+ // If this block is not done, it makes little sense to make any decisions
+ // based on clearance information. We need to make a second pass anyway,
+ // and by then we'll have better information, so we can avoid doing the work
+ // to try and break dependencies now.
+ for (MachineInstr &MI : *MBB) {
+ if (!MI.isDebugInstr())
+ processDefs(&MI);
+ }
+ processUndefReads(MBB);
+}
+
+bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+ MF = &mf;
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ RDA = &getAnalysis<ReachingDefAnalysis>();
+
+ RegClassInfo.runOnMachineFunction(mf);
+
+ LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n");
+
+ // Traverse the basic blocks.
+ for (MachineBasicBlock &MBB : mf) {
+ processBasicBlock(&MBB);
+ }
+
+ return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/CFIInstrInserter.cpp b/contrib/llvm/lib/CodeGen/CFIInstrInserter.cpp
new file mode 100644
index 000000000000..00ebf63fc174
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CFIInstrInserter.cpp
@@ -0,0 +1,326 @@
+//===------ CFIInstrInserter.cpp - Insert additional CFI instructions -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass verifies incoming and outgoing CFA information of basic
+/// blocks. CFA information describes the offset and register set by CFI
+/// directives, valid at the start and end of a basic block. This pass checks
+/// that outgoing information of predecessors matches incoming information of
+/// their successors. Then it checks if blocks have correct CFA calculation rule
+/// set and inserts additional CFI instructions at their beginnings if they
+/// don't. CFI instructions are inserted if basic blocks have incorrect offset
+/// or register set by previous blocks, as a result of a non-linear layout of
+/// blocks in a function.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+static cl::opt<bool> VerifyCFI("verify-cfiinstrs",
+ cl::desc("Verify Call Frame Information instructions"),
+ cl::init(false),
+ cl::Hidden);
+
+namespace {
+class CFIInstrInserter : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ CFIInstrInserter() : MachineFunctionPass(ID) {
+ initializeCFIInstrInserterPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
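+    // Nothing to do unless the function emits call frame information for
+    // debugging or unwinding.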
+ if (!MF.getMMI().hasDebugInfo() &&
+ !MF.getFunction().needsUnwindTableEntry())
+ return false;
+
+ MBBVector.resize(MF.getNumBlockIDs());
+ calculateCFAInfo(MF);
+
+ if (VerifyCFI) {
+ if (unsigned ErrorNum = verify(MF))
+ report_fatal_error("Found " + Twine(ErrorNum) +
+ " in/out CFI information errors.");
+ }
+ bool insertedCFI = insertCFIInstrs(MF);
+ MBBVector.clear();
+ return insertedCFI;
+ }
+
+ private:
+ struct MBBCFAInfo {
+ MachineBasicBlock *MBB;
+ /// Value of cfa offset valid at basic block entry.
+ int IncomingCFAOffset = -1;
+ /// Value of cfa offset valid at basic block exit.
+ int OutgoingCFAOffset = -1;
+ /// Value of cfa register valid at basic block entry.
+ unsigned IncomingCFARegister = 0;
+ /// Value of cfa register valid at basic block exit.
+ unsigned OutgoingCFARegister = 0;
+ /// If in/out cfa offset and register values for this block have already
+ /// been set or not.
+ bool Processed = false;
+ };
+
+ /// Contains cfa offset and register values valid at entry and exit of basic
+ /// blocks.
+ std::vector<MBBCFAInfo> MBBVector;
+
+ /// Calculate cfa offset and register values valid at entry and exit for all
+ /// basic blocks in a function.
+ void calculateCFAInfo(MachineFunction &MF);
+ /// Calculate cfa offset and register values valid at basic block exit by
+ /// checking the block for CFI instructions. Block's incoming CFA info remains
+ /// the same.
+ void calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo);
+ /// Update in/out cfa offset and register values for successors of the basic
+ /// block.
+ void updateSuccCFAInfo(MBBCFAInfo &MBBInfo);
+
+ /// Check if incoming CFA information of a basic block matches outgoing CFA
+ /// information of the previous block. If it doesn't, insert CFI instruction
+ /// at the beginning of the block that corrects the CFA calculation rule for
+ /// that block.
+ bool insertCFIInstrs(MachineFunction &MF);
+  /// Return the cfa offset value that should be set at the beginning of an MBB
+ /// if needed. The negated value is needed when creating CFI instructions that
+ /// set absolute offset.
+ int getCorrectCFAOffset(MachineBasicBlock *MBB) {
+ return -MBBVector[MBB->getNumber()].IncomingCFAOffset;
+ }
+
+ void report(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ);
+ /// Go through each MBB in a function and check that outgoing offset and
+ /// register of its predecessors match incoming offset and register of that
+ /// MBB, as well as that incoming offset and register of its successors match
+ /// outgoing offset and register of the MBB.
+ unsigned verify(MachineFunction &MF);
+};
+} // namespace
+
+char CFIInstrInserter::ID = 0;
+INITIALIZE_PASS(CFIInstrInserter, "cfi-instr-inserter",
+ "Check CFA info and insert CFI instructions if needed", false,
+ false)
+FunctionPass *llvm::createCFIInstrInserter() { return new CFIInstrInserter(); }
+
+void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
+ // Initial CFA offset value i.e. the one valid at the beginning of the
+ // function.
+ int InitialOffset =
+ MF.getSubtarget().getFrameLowering()->getInitialCFAOffset(MF);
+ // Initial CFA register value i.e. the one valid at the beginning of the
+ // function.
+ unsigned InitialRegister =
+ MF.getSubtarget().getFrameLowering()->getInitialCFARegister(MF);
+
+ // Initialize MBBMap.
+ for (MachineBasicBlock &MBB : MF) {
+ MBBCFAInfo MBBInfo;
+ MBBInfo.MBB = &MBB;
+ MBBInfo.IncomingCFAOffset = InitialOffset;
+ MBBInfo.OutgoingCFAOffset = InitialOffset;
+ MBBInfo.IncomingCFARegister = InitialRegister;
+ MBBInfo.OutgoingCFARegister = InitialRegister;
+ MBBVector[MBB.getNumber()] = MBBInfo;
+ }
+
+ // Set in/out cfa info for all blocks in the function. This traversal is based
+ // on the assumption that the first block in the function is the entry block
+ // i.e. that it has initial cfa offset and register values as incoming CFA
+ // information.
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBBVector[MBB.getNumber()].Processed) continue;
+ updateSuccCFAInfo(MBBVector[MBB.getNumber()]);
+ }
+}
+
+void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
+ // Outgoing cfa offset set by the block.
+ int SetOffset = MBBInfo.IncomingCFAOffset;
+ // Outgoing cfa register set by the block.
+ unsigned SetRegister = MBBInfo.IncomingCFARegister;
+ const std::vector<MCCFIInstruction> &Instrs =
+ MBBInfo.MBB->getParent()->getFrameInstructions();
+
+ // Determine cfa offset and register set by the block.
+ for (MachineInstr &MI : *MBBInfo.MBB) {
+ if (MI.isCFIInstruction()) {
+ unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
+ const MCCFIInstruction &CFI = Instrs[CFIIndex];
+ switch (CFI.getOperation()) {
+ case MCCFIInstruction::OpDefCfaRegister:
+ SetRegister = CFI.getRegister();
+ break;
+ case MCCFIInstruction::OpDefCfaOffset:
+ SetOffset = CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpAdjustCfaOffset:
+ SetOffset += CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfa:
+ SetRegister = CFI.getRegister();
+ SetOffset = CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpRememberState:
+ // TODO: Add support for handling cfi_remember_state.
+#ifndef NDEBUG
+ report_fatal_error(
+ "Support for cfi_remember_state not implemented! Value of CFA "
+ "may be incorrect!\n");
+#endif
+ break;
+ case MCCFIInstruction::OpRestoreState:
+ // TODO: Add support for handling cfi_restore_state.
+#ifndef NDEBUG
+ report_fatal_error(
+ "Support for cfi_restore_state not implemented! Value of CFA may "
+ "be incorrect!\n");
+#endif
+ break;
+ // Other CFI directives do not affect CFA value.
+ case MCCFIInstruction::OpSameValue:
+ case MCCFIInstruction::OpOffset:
+ case MCCFIInstruction::OpRelOffset:
+ case MCCFIInstruction::OpEscape:
+ case MCCFIInstruction::OpRestore:
+ case MCCFIInstruction::OpUndefined:
+ case MCCFIInstruction::OpRegister:
+ case MCCFIInstruction::OpWindowSave:
+ case MCCFIInstruction::OpGnuArgsSize:
+ break;
+ }
+ }
+ }
+
+ MBBInfo.Processed = true;
+
+ // Update outgoing CFA info.
+ MBBInfo.OutgoingCFAOffset = SetOffset;
+ MBBInfo.OutgoingCFARegister = SetRegister;
+}
+
+void CFIInstrInserter::updateSuccCFAInfo(MBBCFAInfo &MBBInfo) {
+ SmallVector<MachineBasicBlock *, 4> Stack;
+ Stack.push_back(MBBInfo.MBB);
+
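+  // Iterative depth-first walk: compute the outgoing CFA info for each reached
+  // block and seed the incoming info of successors that have not been
+  // processed yet.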
+ do {
+ MachineBasicBlock *Current = Stack.pop_back_val();
+ MBBCFAInfo &CurrentInfo = MBBVector[Current->getNumber()];
+ if (CurrentInfo.Processed)
+ continue;
+
+ calculateOutgoingCFAInfo(CurrentInfo);
+ for (auto *Succ : CurrentInfo.MBB->successors()) {
+ MBBCFAInfo &SuccInfo = MBBVector[Succ->getNumber()];
+ if (!SuccInfo.Processed) {
+ SuccInfo.IncomingCFAOffset = CurrentInfo.OutgoingCFAOffset;
+ SuccInfo.IncomingCFARegister = CurrentInfo.OutgoingCFARegister;
+ Stack.push_back(Succ);
+ }
+ }
+ } while (!Stack.empty());
+}
+
+bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) {
+ const MBBCFAInfo *PrevMBBInfo = &MBBVector[MF.front().getNumber()];
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ bool InsertedCFIInstr = false;
+
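+  // Walk the blocks in layout order, comparing each block's expected incoming
+  // CFA info with what the previous block in the layout leaves behind.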
+ for (MachineBasicBlock &MBB : MF) {
+ // Skip the first MBB in a function
+ if (MBB.getNumber() == MF.front().getNumber()) continue;
+
+ const MBBCFAInfo &MBBInfo = MBBVector[MBB.getNumber()];
+ auto MBBI = MBBInfo.MBB->begin();
+ DebugLoc DL = MBBInfo.MBB->findDebugLoc(MBBI);
+
+ if (PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset) {
+ // If both outgoing offset and register of a previous block don't match
+ // incoming offset and register of this block, add a def_cfa instruction
+ // with the correct offset and register for this block.
+ if (PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) {
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, MBBInfo.IncomingCFARegister, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ // If outgoing offset of a previous block doesn't match incoming offset
+ // of this block, add a def_cfa_offset instruction with the correct
+ // offset for this block.
+ } else {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+ nullptr, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ InsertedCFIInstr = true;
+ // If outgoing register of a previous block doesn't match incoming
+ // register of this block, add a def_cfa_register instruction with the
+ // correct register for this block.
+ } else if (PrevMBBInfo->OutgoingCFARegister !=
+ MBBInfo.IncomingCFARegister) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MBBInfo.IncomingCFARegister));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ }
+ PrevMBBInfo = &MBBInfo;
+ }
+ return InsertedCFIInstr;
+}
+
+void CFIInstrInserter::report(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ) {
+ errs() << "*** Inconsistent CFA register and/or offset between pred and succ "
+ "***\n";
+ errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber()
+ << " in " << Pred.MBB->getParent()->getName()
+ << " outgoing CFA Reg:" << Pred.OutgoingCFARegister << "\n";
+ errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber()
+ << " in " << Pred.MBB->getParent()->getName()
+ << " outgoing CFA Offset:" << Pred.OutgoingCFAOffset << "\n";
+ errs() << "Succ: " << Succ.MBB->getName() << " #" << Succ.MBB->getNumber()
+ << " incoming CFA Reg:" << Succ.IncomingCFARegister << "\n";
+ errs() << "Succ: " << Succ.MBB->getName() << " #" << Succ.MBB->getNumber()
+ << " incoming CFA Offset:" << Succ.IncomingCFAOffset << "\n";
+}
+
+unsigned CFIInstrInserter::verify(MachineFunction &MF) {
+ unsigned ErrorNum = 0;
+ for (auto *CurrMBB : depth_first(&MF)) {
+ const MBBCFAInfo &CurrMBBInfo = MBBVector[CurrMBB->getNumber()];
+ for (MachineBasicBlock *Succ : CurrMBB->successors()) {
+ const MBBCFAInfo &SuccMBBInfo = MBBVector[Succ->getNumber()];
+ // Check that incoming offset and register values of successors match the
+ // outgoing offset and register values of CurrMBB
+ if (SuccMBBInfo.IncomingCFAOffset != CurrMBBInfo.OutgoingCFAOffset ||
+ SuccMBBInfo.IncomingCFARegister != CurrMBBInfo.OutgoingCFARegister) {
+ report(CurrMBBInfo, SuccMBBInfo);
+ ErrorNum++;
+ }
+ }
+ }
+ return ErrorNum;
+}
diff --git a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
index b8920a601938..57541182cab2 100644
--- a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -35,8 +35,8 @@ void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
const MachineLoopInfo &MLI,
const MachineBlockFrequencyInfo &MBFI,
VirtRegAuxInfo::NormalizingFn norm) {
- DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
+ << "********** Function: " << MF.getName() << '\n');
MachineRegisterInfo &MRI = MF.getRegInfo();
VirtRegAuxInfo VRAI(MF, LIS, VRM, MLI, MBFI, norm);
@@ -236,7 +236,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
continue;
numInstr++;
- if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugValue())
+ if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugInstr())
continue;
if (!visited.insert(mi).second)
continue;
diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp
index 879cd2859ee9..2f845354c570 100644
--- a/contrib/llvm/lib/CodeGen/CodeGen.cpp
+++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp
@@ -23,11 +23,14 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeAtomicExpandPass(Registry);
initializeBranchFolderPassPass(Registry);
initializeBranchRelaxationPass(Registry);
+ initializeCFIInstrInserterPass(Registry);
initializeCodeGenPreparePass(Registry);
initializeDeadMachineInstructionElimPass(Registry);
initializeDetectDeadLanesPass(Registry);
initializeDwarfEHPreparePass(Registry);
initializeEarlyIfConverterPass(Registry);
+ initializeEarlyMachineLICMPass(Registry);
+ initializeEarlyTailDuplicatePass(Registry);
initializeExpandISelPseudosPass(Registry);
initializeExpandMemCmpPassPass(Registry);
initializeExpandPostRAPass(Registry);
@@ -48,6 +51,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeLiveVariablesPass(Registry);
initializeLocalStackSlotPassPass(Registry);
initializeLowerIntrinsicsPass(Registry);
+ initializeMIRCanonicalizerPass(Registry);
initializeMachineBlockFrequencyInfoPass(Registry);
initializeMachineBlockPlacementPass(Registry);
initializeMachineBlockPlacementStatsPass(Registry);
@@ -74,12 +78,15 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializePeepholeOptimizerPass(Registry);
initializePostMachineSchedulerPass(Registry);
initializePostRAHazardRecognizerPass(Registry);
+ initializePostRAMachineSinkingPass(Registry);
initializePostRASchedulerPass(Registry);
initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
initializeProcessImplicitDefsPass(Registry);
initializeRABasicPass(Registry);
- initializeRegAllocFastPass(Registry);
initializeRAGreedyPass(Registry);
+ initializeRegAllocFastPass(Registry);
+ initializeRegUsageInfoCollectorPass(Registry);
+ initializeRegUsageInfoPropagationPass(Registry);
initializeRegisterCoalescerPass(Registry);
initializeRenameIndependentSubregsPass(Registry);
initializeSafeStackLegacyPassPass(Registry);
@@ -90,7 +97,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeStackMapLivenessPass(Registry);
initializeStackProtectorPass(Registry);
initializeStackSlotColoringPass(Registry);
- initializeTailDuplicatePassPass(Registry);
+ initializeTailDuplicatePass(Registry);
initializeTargetPassConfigPass(Registry);
initializeTwoAddressInstructionPassPass(Registry);
initializeUnpackMachineBundlesPass(Registry);
@@ -98,9 +105,9 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeUnreachableMachineBlockElimPass(Registry);
initializeVirtRegMapPass(Registry);
initializeVirtRegRewriterPass(Registry);
+ initializeWasmEHPreparePass(Registry);
initializeWinEHPreparePass(Registry);
initializeXRayInstrumentationPass(Registry);
- initializeMIRCanonicalizerPass(Registry);
}
void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 26ca8d4ee88c..c41beb094604 100644
--- a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -30,15 +30,16 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -79,13 +80,13 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
@@ -196,7 +197,7 @@ AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
cl::desc("Allow creation of Phis in Address sinking."));
static cl::opt<bool>
-AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(false),
+AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true),
cl::desc("Allow creation of selects in Address sinking."));
static cl::opt<bool> AddrSinkCombineBaseReg(
@@ -215,6 +216,11 @@ static cl::opt<bool> AddrSinkCombineScaledReg(
"addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
cl::desc("Allow combining of ScaledReg field in Address sinking."));
+static cl::opt<bool>
+ EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
+ cl::init(true),
+ cl::desc("Enable splitting large offset of GEP."));
+
namespace {
using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
@@ -260,6 +266,20 @@ class TypePromotionTransaction;
/// Keep track of sext chains based on their initial value.
DenseMap<Value *, Instruction *> SeenChainsForSExt;
+ /// Keep track of GEPs accessing the same data structures such as structs or
+ /// arrays that are candidates to be split later because of their large
+ /// size.
+ DenseMap<
+ AssertingVH<Value>,
+ SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
+ LargeOffsetGEPMap;
+
+  /// Keep track of new GEP bases after splitting GEPs with large offsets.
+ SmallSet<AssertingVH<Value>, 2> NewGEPBases;
+
+  /// Map large offset GEPs to their serial numbers.
+ DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;
+
/// Keep track of SExt promoted.
ValueToSExts ValToSExtendedUses;
@@ -301,16 +321,16 @@ class TypePromotionTransaction;
bool isPreheader);
bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
bool optimizeInst(Instruction *I, bool &ModifiedDT);
- bool optimizeMemoryInst(Instruction *I, Value *Addr,
- Type *AccessTy, unsigned AS);
+ bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+ Type *AccessTy, unsigned AddrSpace);
bool optimizeInlineAsmInst(CallInst *CS);
bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
- bool optimizeLoadExt(LoadInst *I);
+ bool optimizeLoadExt(LoadInst *Load);
bool optimizeSelectInst(SelectInst *SI);
- bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
- bool optimizeSwitchInst(SwitchInst *CI);
+ bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
+ bool optimizeSwitchInst(SwitchInst *SI);
bool optimizeExtractElementInst(Instruction *Inst);
bool dupRetToEnableTailCallOpts(BasicBlock *BB);
bool placeDbgValues(Function &F);
@@ -321,6 +341,7 @@ class TypePromotionTransaction;
SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
unsigned CreatedInstsCost = 0);
bool mergeSExts(Function &F);
+ bool splitLargeGEPOffsets();
bool performAddressTypePromotion(
Instruction *&Inst,
bool AllowPromotionWithoutCommonHeader,
@@ -414,6 +435,8 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
SeenChainsForSExt.clear();
ValToSExtendedUses.clear();
RemovedInsts.clear();
+ LargeOffsetGEPMap.clear();
+ LargeOffsetGEPID.clear();
for (Function::iterator I = F.begin(); I != F.end(); ) {
BasicBlock *BB = &*I++;
bool ModifiedDTOnIteration = false;
@@ -425,6 +448,8 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
}
if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
MadeChange |= mergeSExts(F);
+ if (!LargeOffsetGEPMap.empty())
+ MadeChange |= splitLargeGEPOffsets();
// Really free removed instructions during promotion.
for (Instruction *I : RemovedInsts)
@@ -437,7 +462,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (!DisableBranchOpts) {
MadeChange = false;
- SmallPtrSet<BasicBlock*, 8> WorkList;
+ // Use a set vector to get deterministic iteration order. The order the
+ // blocks are removed may affect whether or not PHI nodes in successors
+ // are removed.
+ SmallSetVector<BasicBlock*, 8> WorkList;
for (BasicBlock &BB : F) {
SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
MadeChange |= ConstantFoldTerminator(&BB, true);
@@ -452,8 +480,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// Delete the dead blocks and any of their dead successors.
MadeChange |= !WorkList.empty();
while (!WorkList.empty()) {
- BasicBlock *BB = *WorkList.begin();
- WorkList.erase(BB);
+ BasicBlock *BB = WorkList.pop_back_val();
SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
DeleteDeadBlock(BB);
@@ -491,8 +518,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
bool CodeGenPrepare::eliminateFallThrough(Function &F) {
bool Changed = false;
// Scan all of the blocks in the function, except for the entry block.
- for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
- BasicBlock *BB = &*I++;
+ // Use a temporary array to avoid iterator being invalidated when
+ // deleting blocks.
+ SmallVector<WeakTrackingVH, 16> Blocks;
+ for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
+ Blocks.push_back(&Block);
+
+ for (auto &Block : Blocks) {
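+    // A WeakTrackingVH becomes null once its block has been erased, so skip
+    // entries whose blocks no longer exist.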
+ auto *BB = cast_or_null<BasicBlock>(Block);
+ if (!BB)
+ continue;
// If the destination block has a single pred, then this is a trivial
// edge, just collapse it.
BasicBlock *SinglePred = BB->getSinglePredecessor();
@@ -503,17 +538,10 @@ bool CodeGenPrepare::eliminateFallThrough(Function &F) {
BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
if (Term && !Term->isConditional()) {
Changed = true;
- DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
- // Remember if SinglePred was the entry block of the function.
- // If so, we will need to move BB back to the entry position.
- bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
- MergeBasicBlockIntoOnlyPred(BB, nullptr);
-
- if (isEntry && BB != &BB->getParent()->getEntryBlock())
- BB->moveBefore(&BB->getParent()->getEntryBlock());
+ LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");
- // We have erased a block. Update the iterator.
- I = BB->getIterator();
+ // Merge BB into SinglePred and delete it.
+ MergeBlockIntoPredecessor(BB);
}
}
return Changed;
@@ -566,9 +594,17 @@ bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
}
bool MadeChange = false;
+ // Copy blocks into a temporary array to avoid iterator invalidation issues
+ // as we remove them.
// Note that this intentionally skips the entry block.
- for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
- BasicBlock *BB = &*I++;
+ SmallVector<WeakTrackingVH, 16> Blocks;
+ for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
+ Blocks.push_back(&Block);
+
+ for (auto &Block : Blocks) {
+ BasicBlock *BB = cast_or_null<BasicBlock>(Block);
+ if (!BB)
+ continue;
BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
if (!DestBB ||
!isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
@@ -730,21 +766,20 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
BranchInst *BI = cast<BranchInst>(BB->getTerminator());
BasicBlock *DestBB = BI->getSuccessor(0);
- DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
+ LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
+ << *BB << *DestBB);
// If the destination block has a single pred, then this is a trivial edge,
// just collapse it.
if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
if (SinglePred != DestBB) {
- // Remember if SinglePred was the entry block of the function. If so, we
- // will need to move BB back to the entry position.
- bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
- MergeBasicBlockIntoOnlyPred(DestBB, nullptr);
-
- if (isEntry && BB != &BB->getParent()->getEntryBlock())
- BB->moveBefore(&BB->getParent()->getEntryBlock());
-
- DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+ assert(SinglePred == BB &&
+ "Single predecessor not the same as predecessor");
+ // Merge DestBB into SinglePred/BB and delete it.
+ MergeBlockIntoPredecessor(DestBB);
+ // Note: BB(=SinglePred) will not be deleted on this path.
+ // DestBB(=its single successor) is the one that was deleted.
+ LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
return;
}
}
@@ -782,7 +817,7 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
BB->eraseFromParent();
++NumBlocksElim;
- DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
}
// Computes a map of base pointer relocation instructions to corresponding
@@ -1024,6 +1059,7 @@ static bool SinkCast(CastInst *CI) {
assert(InsertPt != UserBB->end());
InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
CI->getType(), "", &*InsertPt);
+ InsertedCast->setDebugLoc(CI->getDebugLoc());
}
// Replace a use of the cast with a use of the new cast.
@@ -1247,8 +1283,8 @@ static bool sinkAndCmp0Expression(Instruction *AndI,
if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
return false;
- DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
- DEBUG(AndI->getParent()->dump());
+ LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+ LLVM_DEBUG(AndI->getParent()->dump());
// Push the 'and' into the same block as the icmp 0. There should only be
// one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
@@ -1261,7 +1297,7 @@ static bool sinkAndCmp0Expression(Instruction *AndI,
// Preincrement use iterator so we don't invalidate it.
++UI;
- DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+ LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
// Keep the 'and' in the same place if the use is already in the same block.
Instruction *InsertPt =
@@ -1275,7 +1311,7 @@ static bool sinkAndCmp0Expression(Instruction *AndI,
// Replace a use of the 'and' with a use of the new 'and'.
TheUse = InsertedAnd;
++NumAndUses;
- DEBUG(User->getParent()->dump());
+ LLVM_DEBUG(User->getParent()->dump());
}
// We removed all uses, nuke the and.
@@ -1388,7 +1424,7 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
/// %x.extract.shift.1 = lshr i64 %arg1, 32
/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
///
-/// CodeGen will recoginze the pattern in BB2 and generate BitExtract
+/// CodeGen will recognize the pattern in BB2 and generate BitExtract
/// instruction.
/// Return true if any changes are made.
static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
@@ -1434,7 +1470,7 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
// cmp i16 trunc.result, opnd2
//
if (isa<TruncInst>(User) && shiftIsLegal
- // If the type of the truncate is legal, no trucate will be
+ // If the type of the truncate is legal, no truncate will be
// introduced in other basic blocks.
&&
(!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
@@ -1581,7 +1617,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// if size - offset meets the size threshold.
if (!Arg->getType()->isPointerTy())
continue;
- APInt Offset(DL->getPointerSizeInBits(
+ APInt Offset(DL->getIndexSizeInBits(
cast<PointerType>(Arg->getType())->getAddressSpace()),
0);
Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
@@ -1606,11 +1642,14 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// If this is a memcpy (or similar) then we may be able to improve the
// alignment
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
- unsigned Align = getKnownAlignment(MI->getDest(), *DL);
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
- Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL));
- if (Align > MI->getAlignment())
- MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
+ unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL);
+ if (DestAlign > MI->getDestAlignment())
+ MI->setDestAlignment(DestAlign);
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
+ if (SrcAlign > MTI->getSourceAlignment())
+ MTI->setSourceAlignment(SrcAlign);
+ }
}
}
@@ -1664,7 +1703,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
InsertedInsts.insert(ExtVal);
return true;
}
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
II->replaceAllUsesWith(II->getArgOperand(0));
II->eraseFromParent();
return true;
@@ -2018,11 +2058,11 @@ LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
namespace {
-/// \brief This class provides transaction based operation on the IR.
+/// This class provides transaction based operation on the IR.
/// Every change made through this class is recorded in the internal state and
/// can be undone (rollback) until commit is called.
class TypePromotionTransaction {
- /// \brief This represents the common interface of the individual transaction.
+ /// This represents the common interface of the individual transaction.
/// Each class implements the logic for doing one specific modification on
/// the IR via the TypePromotionTransaction.
class TypePromotionAction {
@@ -2031,20 +2071,20 @@ class TypePromotionTransaction {
Instruction *Inst;
public:
- /// \brief Constructor of the action.
+ /// Constructor of the action.
/// The constructor performs the related action on the IR.
TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
virtual ~TypePromotionAction() = default;
- /// \brief Undo the modification done by this action.
+ /// Undo the modification done by this action.
/// When this method is called, the IR must be in the same state as it was
/// before this action was applied.
/// \pre Undoing the action works if and only if the IR is in the exact same
/// state as it was directly after this action was applied.
virtual void undo() = 0;
- /// \brief Advocate every change made by this action.
+ /// Advocate every change made by this action.
/// When the results on the IR of the action are to be kept, it is important
/// to call this function, otherwise hidden information may be kept forever.
virtual void commit() {
@@ -2052,12 +2092,12 @@ class TypePromotionTransaction {
}
};
- /// \brief Utility to remember the position of an instruction.
+ /// Utility to remember the position of an instruction.
class InsertionHandler {
/// Position of an instruction.
/// Either an instruction:
/// - Is the first in a basic block: BB is used.
- /// - Has a previous instructon: PrevInst is used.
+ /// - Has a previous instruction: PrevInst is used.
union {
Instruction *PrevInst;
BasicBlock *BB;
@@ -2067,7 +2107,7 @@ class TypePromotionTransaction {
bool HasPrevInstruction;
public:
- /// \brief Record the position of \p Inst.
+ /// Record the position of \p Inst.
InsertionHandler(Instruction *Inst) {
BasicBlock::iterator It = Inst->getIterator();
HasPrevInstruction = (It != (Inst->getParent()->begin()));
@@ -2077,7 +2117,7 @@ class TypePromotionTransaction {
Point.BB = Inst->getParent();
}
- /// \brief Insert \p Inst at the recorded position.
+ /// Insert \p Inst at the recorded position.
void insert(Instruction *Inst) {
if (HasPrevInstruction) {
if (Inst->getParent())
@@ -2093,27 +2133,28 @@ class TypePromotionTransaction {
}
};
- /// \brief Move an instruction before another.
+ /// Move an instruction before another.
class InstructionMoveBefore : public TypePromotionAction {
/// Original position of the instruction.
InsertionHandler Position;
public:
- /// \brief Move \p Inst before \p Before.
+ /// Move \p Inst before \p Before.
InstructionMoveBefore(Instruction *Inst, Instruction *Before)
: TypePromotionAction(Inst), Position(Inst) {
- DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before << "\n");
+ LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before
+ << "\n");
Inst->moveBefore(Before);
}
- /// \brief Move the instruction back to its original position.
+ /// Move the instruction back to its original position.
void undo() override {
- DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
Position.insert(Inst);
}
};
- /// \brief Set the operand of an instruction with a new value.
+ /// Set the operand of an instruction with a new value.
class OperandSetter : public TypePromotionAction {
/// Original operand of the instruction.
Value *Origin;
@@ -2122,35 +2163,35 @@ class TypePromotionTransaction {
unsigned Idx;
public:
- /// \brief Set \p Idx operand of \p Inst with \p NewVal.
+ /// Set \p Idx operand of \p Inst with \p NewVal.
OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
: TypePromotionAction(Inst), Idx(Idx) {
- DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
- << "for:" << *Inst << "\n"
- << "with:" << *NewVal << "\n");
+ LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
+ << "for:" << *Inst << "\n"
+ << "with:" << *NewVal << "\n");
Origin = Inst->getOperand(Idx);
Inst->setOperand(Idx, NewVal);
}
- /// \brief Restore the original value of the instruction.
+ /// Restore the original value of the instruction.
void undo() override {
- DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
- << "for: " << *Inst << "\n"
- << "with: " << *Origin << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
+ << "for: " << *Inst << "\n"
+ << "with: " << *Origin << "\n");
Inst->setOperand(Idx, Origin);
}
};
- /// \brief Hide the operands of an instruction.
+ /// Hide the operands of an instruction.
/// Do as if this instruction was not using any of its operands.
class OperandsHider : public TypePromotionAction {
/// The list of original operands.
SmallVector<Value *, 4> OriginalValues;
public:
- /// \brief Remove \p Inst from the uses of the operands of \p Inst.
+ /// Remove \p Inst from the uses of the operands of \p Inst.
OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
- DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
unsigned NumOpnds = Inst->getNumOperands();
OriginalValues.reserve(NumOpnds);
for (unsigned It = 0; It < NumOpnds; ++It) {
@@ -2164,114 +2205,114 @@ class TypePromotionTransaction {
}
}
- /// \brief Restore the original list of uses.
+ /// Restore the original list of uses.
void undo() override {
- DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
Inst->setOperand(It, OriginalValues[It]);
}
};
- /// \brief Build a truncate instruction.
+ /// Build a truncate instruction.
class TruncBuilder : public TypePromotionAction {
Value *Val;
public:
- /// \brief Build a truncate instruction of \p Opnd producing a \p Ty
+ /// Build a truncate instruction of \p Opnd producing a \p Ty
/// result.
/// trunc Opnd to Ty.
TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
IRBuilder<> Builder(Opnd);
Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
- DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
+ LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
}
- /// \brief Get the built value.
+ /// Get the built value.
Value *getBuiltValue() { return Val; }
- /// \brief Remove the built instruction.
+ /// Remove the built instruction.
void undo() override {
- DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
if (Instruction *IVal = dyn_cast<Instruction>(Val))
IVal->eraseFromParent();
}
};
- /// \brief Build a sign extension instruction.
+ /// Build a sign extension instruction.
class SExtBuilder : public TypePromotionAction {
Value *Val;
public:
- /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty
+ /// Build a sign extension instruction of \p Opnd producing a \p Ty
/// result.
/// sext Opnd to Ty.
SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
: TypePromotionAction(InsertPt) {
IRBuilder<> Builder(InsertPt);
Val = Builder.CreateSExt(Opnd, Ty, "promoted");
- DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
+ LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
}
- /// \brief Get the built value.
+ /// Get the built value.
Value *getBuiltValue() { return Val; }
- /// \brief Remove the built instruction.
+ /// Remove the built instruction.
void undo() override {
- DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
if (Instruction *IVal = dyn_cast<Instruction>(Val))
IVal->eraseFromParent();
}
};
- /// \brief Build a zero extension instruction.
+ /// Build a zero extension instruction.
class ZExtBuilder : public TypePromotionAction {
Value *Val;
public:
- /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty
+ /// Build a zero extension instruction of \p Opnd producing a \p Ty
/// result.
/// zext Opnd to Ty.
ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
: TypePromotionAction(InsertPt) {
IRBuilder<> Builder(InsertPt);
Val = Builder.CreateZExt(Opnd, Ty, "promoted");
- DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
+ LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
}
- /// \brief Get the built value.
+ /// Get the built value.
Value *getBuiltValue() { return Val; }
- /// \brief Remove the built instruction.
+ /// Remove the built instruction.
void undo() override {
- DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
if (Instruction *IVal = dyn_cast<Instruction>(Val))
IVal->eraseFromParent();
}
};
- /// \brief Mutate an instruction to another type.
+ /// Mutate an instruction to another type.
class TypeMutator : public TypePromotionAction {
/// Record the original type.
Type *OrigTy;
public:
- /// \brief Mutate the type of \p Inst into \p NewTy.
+ /// Mutate the type of \p Inst into \p NewTy.
TypeMutator(Instruction *Inst, Type *NewTy)
: TypePromotionAction(Inst), OrigTy(Inst->getType()) {
- DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
- << "\n");
+ LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
+ << "\n");
Inst->mutateType(NewTy);
}
- /// \brief Mutate the instruction back to its original type.
+ /// Mutate the instruction back to its original type.
void undo() override {
- DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
- << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
+ << "\n");
Inst->mutateType(OrigTy);
}
};
- /// \brief Replace the uses of an instruction by another instruction.
+ /// Replace the uses of an instruction by another instruction.
class UsesReplacer : public TypePromotionAction {
/// Helper structure to keep track of the replaced uses.
struct InstructionAndIdx {
@@ -2291,10 +2332,10 @@ class TypePromotionTransaction {
using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;
public:
- /// \brief Replace all the use of \p Inst by \p New.
+ /// Replace all the use of \p Inst by \p New.
UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
- DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
- << "\n");
+ LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
+ << "\n");
// Record the original uses.
for (Use &U : Inst->uses()) {
Instruction *UserI = cast<Instruction>(U.getUser());
@@ -2304,9 +2345,9 @@ class TypePromotionTransaction {
Inst->replaceAllUsesWith(New);
}
- /// \brief Reassign the original uses of Inst to Inst.
+ /// Reassign the original uses of Inst to Inst.
void undo() override {
- DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
for (use_iterator UseIt = OriginalUses.begin(),
EndIt = OriginalUses.end();
UseIt != EndIt; ++UseIt) {
@@ -2315,7 +2356,7 @@ class TypePromotionTransaction {
}
};
- /// \brief Remove an instruction from the IR.
+ /// Remove an instruction from the IR.
class InstructionRemover : public TypePromotionAction {
/// Original position of the instruction.
InsertionHandler Inserter;
@@ -2331,7 +2372,7 @@ class TypePromotionTransaction {
SetOfInstrs &RemovedInsts;
public:
- /// \brief Remove all reference of \p Inst and optinally replace all its
+ /// Remove all reference of \p Inst and optionally replace all its
/// uses with New.
/// \p RemovedInsts Keep track of the instructions removed by this Action.
/// \pre If !Inst->use_empty(), then New != nullptr
@@ -2341,7 +2382,7 @@ class TypePromotionTransaction {
RemovedInsts(RemovedInsts) {
if (New)
Replacer = new UsesReplacer(Inst, New);
- DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
RemovedInsts.insert(Inst);
/// The instructions removed here will be freed after completing
/// optimizeBlock() for all blocks as we need to keep track of the
@@ -2351,10 +2392,10 @@ class TypePromotionTransaction {
~InstructionRemover() override { delete Replacer; }
- /// \brief Resurrect the instruction and reassign it to the proper uses if
+ /// Resurrect the instruction and reassign it to the proper uses if
/// new value was provided when build this action.
void undo() override {
- DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
Inserter.insert(Inst);
if (Replacer)
Replacer->undo();
@@ -2496,7 +2537,7 @@ void TypePromotionTransaction::rollback(
namespace {
-/// \brief A helper class for matching addressing modes.
+/// A helper class for matching addressing modes.
///
/// This encapsulates the logic for matching the target-legal addressing modes.
class AddressingModeMatcher {
@@ -2524,22 +2565,23 @@ class AddressingModeMatcher {
/// The ongoing transaction where every action should be registered.
TypePromotionTransaction &TPT;
+ // A GEP which has too large offset to be folded into the addressing mode.
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;
+
/// This is set to true when we should not do profitability checks.
/// When true, IsProfitableToFoldIntoAddressingMode always returns true.
bool IgnoreProfitability;
- AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
- const TargetLowering &TLI,
- const TargetRegisterInfo &TRI,
- Type *AT, unsigned AS,
- Instruction *MI, ExtAddrMode &AM,
- const SetOfInstrs &InsertedInsts,
- InstrToOrigTy &PromotedInsts,
- TypePromotionTransaction &TPT)
+ AddressingModeMatcher(
+ SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
+ ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
+ InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP)
: AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
- PromotedInsts(PromotedInsts), TPT(TPT) {
+ PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) {
IgnoreProfitability = false;
}
@@ -2551,28 +2593,27 @@ public:
/// optimizations.
/// \p PromotedInsts maps the instructions to their type before promotion.
/// \p The ongoing transaction where every action should be registered.
- static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
- Instruction *MemoryInst,
- SmallVectorImpl<Instruction*> &AddrModeInsts,
- const TargetLowering &TLI,
- const TargetRegisterInfo &TRI,
- const SetOfInstrs &InsertedInsts,
- InstrToOrigTy &PromotedInsts,
- TypePromotionTransaction &TPT) {
+ static ExtAddrMode
+ Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
+ SmallVectorImpl<Instruction *> &AddrModeInsts,
+ const TargetLowering &TLI, const TargetRegisterInfo &TRI,
+ const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
+ TypePromotionTransaction &TPT,
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP) {
ExtAddrMode Result;
- bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI,
- AccessTy, AS,
+ bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
MemoryInst, Result, InsertedInsts,
- PromotedInsts, TPT).matchAddr(V, 0);
+ PromotedInsts, TPT, LargeOffsetGEP)
+ .matchAddr(V, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
return Result;
}
private:
bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
- bool matchAddr(Value *V, unsigned Depth);
- bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth,
+ bool matchAddr(Value *Addr, unsigned Depth);
+ bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
bool *MovedAway = nullptr);
bool isProfitableToFoldIntoAddressingMode(Instruction *I,
ExtAddrMode &AMBefore,
@@ -2582,20 +2623,21 @@ private:
Value *PromotedOperand) const;
};
-/// \brief Keep track of simplification of Phi nodes.
+/// Keep track of simplification of Phi nodes.
/// Accept the set of all phi nodes and erase phi node from this set
/// if it is simplified.
class SimplificationTracker {
DenseMap<Value *, Value *> Storage;
const SimplifyQuery &SQ;
- SmallPtrSetImpl<PHINode *> &AllPhiNodes;
- SmallPtrSetImpl<SelectInst *> &AllSelectNodes;
+ // Tracks newly created Phi nodes. We use a SetVector to get deterministic
+ // order when iterating over the set in MatchPhiSet.
+ SmallSetVector<PHINode *, 32> AllPhiNodes;
+ // Tracks newly created Select nodes.
+ SmallPtrSet<SelectInst *, 32> AllSelectNodes;
public:
- SimplificationTracker(const SimplifyQuery &sq,
- SmallPtrSetImpl<PHINode *> &APN,
- SmallPtrSetImpl<SelectInst *> &ASN)
- : SQ(sq), AllPhiNodes(APN), AllSelectNodes(ASN) {}
+ SimplificationTracker(const SimplifyQuery &sq)
+ : SQ(sq) {}
Value *Get(Value *V) {
do {
@@ -2621,7 +2663,7 @@ public:
Put(PI, V);
PI->replaceAllUsesWith(V);
if (auto *PHI = dyn_cast<PHINode>(PI))
- AllPhiNodes.erase(PHI);
+ AllPhiNodes.remove(PHI);
if (auto *Select = dyn_cast<SelectInst>(PI))
AllSelectNodes.erase(Select);
PI->eraseFromParent();
@@ -2633,9 +2675,48 @@ public:
void Put(Value *From, Value *To) {
Storage.insert({ From, To });
}
+
+ void ReplacePhi(PHINode *From, PHINode *To) {
+ Value* OldReplacement = Get(From);
+ while (OldReplacement != From) {
+ From = To;
+ To = dyn_cast<PHINode>(OldReplacement);
+ OldReplacement = Get(From);
+ }
+ assert(Get(To) == To && "Replacement PHI node is already replaced.");
+ Put(From, To);
+ From->replaceAllUsesWith(To);
+ AllPhiNodes.remove(From);
+ From->eraseFromParent();
+ }
+
+ SmallSetVector<PHINode *, 32>& newPhiNodes() { return AllPhiNodes; }
+
+ void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }
+
+ void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }
+
+ unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }
+
+ unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }
+
+ void destroyNewNodes(Type *CommonType) {
+ // For safe erasing, replace the uses with dummy value first.
+ auto Dummy = UndefValue::get(CommonType);
+ for (auto I : AllPhiNodes) {
+ I->replaceAllUsesWith(Dummy);
+ I->eraseFromParent();
+ }
+ AllPhiNodes.clear();
+ for (auto I : AllSelectNodes) {
+ I->replaceAllUsesWith(Dummy);
+ I->eraseFromParent();
+ }
+ AllSelectNodes.clear();
+ }
};
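
The Get/Put/ReplacePhi members above behave like a chain of replacements: Get follows Storage until it reaches a value that has no further replacement, and ReplacePhi walks the same chain before recording a new link and erasing the old node. A minimal standalone sketch of that chain-following idea; the ReplacementChain/put/get names are illustrative only and are not part of LLVM:

#include <cassert>
#include <string>
#include <unordered_map>

struct ReplacementChain {
  std::unordered_map<std::string, std::string> Storage;

  // Record that From is now represented by To.
  void put(const std::string &From, const std::string &To) {
    Storage.emplace(From, To);
  }

  // Follow replacements to a fixed point, mirroring Get() in the patch above.
  std::string get(std::string V) const {
    auto It = Storage.find(V);
    while (It != Storage.end()) {
      V = It->second;
      It = Storage.find(V);
    }
    return V;
  }
};

int main() {
  ReplacementChain RC;
  RC.put("phi1", "phi2"); // phi1 was simplified to phi2
  RC.put("phi2", "phi3"); // later, phi2 itself collapsed into phi3
  assert(RC.get("phi1") == "phi3"); // get() resolves the whole chain
}
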
-/// \brief A helper class for combining addressing modes.
+/// A helper class for combining addressing modes.
class AddressingModeCombiner {
typedef std::pair<Value *, BasicBlock *> ValueInBB;
typedef DenseMap<ValueInBB, Value *> FoldAddrToValueMapping;
@@ -2664,12 +2745,12 @@ public:
AddressingModeCombiner(const SimplifyQuery &_SQ, ValueInBB OriginalValue)
: CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}
- /// \brief Get the combined AddrMode
+ /// Get the combined AddrMode
const ExtAddrMode &getAddrMode() const {
return AddrModes[0];
}
- /// \brief Add a new AddrMode if it's compatible with the AddrModes we already
+ /// Add a new AddrMode if it's compatible with the AddrModes we already
/// have.
/// \return True iff we succeeded in doing so.
bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
@@ -2694,29 +2775,35 @@ public:
else if (DifferentField != ThisDifferentField)
DifferentField = ExtAddrMode::MultipleFields;
- // If NewAddrMode differs in only one dimension, and that dimension isn't
- // the amount that ScaledReg is scaled by, then we can handle it by
- // inserting a phi/select later on. Even if NewAddMode is the same
- // we still need to collect it due to original value is different.
- // And later we will need all original values as anchors during
- // finding the common Phi node.
+ // If NewAddrMode differs in more than one dimension we cannot handle it.
+ bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;
+
+ // If Scale Field is different then we reject.
+ CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;
+
     // We also must reject the case when the base offset is different and the
     // scale reg is not null; we cannot handle this case because a merge of
     // the different offsets would be used as the ScaleReg.
- if (DifferentField != ExtAddrMode::MultipleFields &&
- DifferentField != ExtAddrMode::ScaleField &&
- (DifferentField != ExtAddrMode::BaseOffsField ||
- !NewAddrMode.ScaledReg)) {
+ CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||
+ !NewAddrMode.ScaledReg);
+
+    // We also must reject the case when the GV is different and a BaseReg is
+    // set, because we want to use the base reg as a merge of the GV values.
+ CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||
+ !NewAddrMode.HasBaseReg);
+
+    // Even if NewAddrMode is the same we still need to collect it, because the
+    // original value is different. And later we will need all the original
+    // values as anchors when finding the common Phi node.
+ if (CanHandle)
AddrModes.emplace_back(NewAddrMode);
- return true;
- }
+ else
+ AddrModes.clear();
- // We couldn't combine NewAddrMode with the rest, so return failure.
- AddrModes.clear();
- return false;
+ return CanHandle;
}
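
The rewritten addNewAddrMode above folds its rejection rules into a single CanHandle flag. A condensed sketch of that decision, using an illustrative DiffField enum in place of the real ExtAddrMode::FieldName values:

enum class DiffField { NoField, BaseReg, BaseGV, BaseOffs, Scale, Multiple };

// Returns true when two addressing modes differing only in DiffField can be
// merged later with a phi/select; mirrors the CanHandle chain in the patch.
bool canCombine(DiffField Different, bool NewHasScaledReg, bool NewHasBaseReg) {
  if (Different == DiffField::Multiple) // more than one field differs
    return false;
  if (Different == DiffField::Scale)    // the scale amount must match exactly
    return false;
  if (Different == DiffField::BaseOffs && NewHasScaledReg)
    return false;                       // merged offsets would become the ScaledReg
  if (Different == DiffField::BaseGV && NewHasBaseReg)
    return false;                       // the base reg is reserved for merging GVs
  return true;                          // no difference, or one benign difference
}
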
- /// \brief Combine the addressing modes we've collected into a single
+ /// Combine the addressing modes we've collected into a single
/// addressing mode.
/// \return True iff we successfully combined them or we only had one so
/// didn't need to combine them anyway.
@@ -2751,7 +2838,7 @@ public:
}
private:
- /// \brief Initialize Map with anchor values. For address seen in some BB
+ /// Initialize Map with anchor values. For address seen in some BB
   /// we set the value of the different field seen in this address.
   /// If the address is not an instruction then the basic block is set to null.
/// At the same time we find a common type for different field we will
@@ -2784,9 +2871,9 @@ private:
return true;
}
- /// \brief We have mapping between value A and basic block where value A
+ /// We have mapping between value A and basic block where value A
/// seen to other value B where B was a field in addressing mode represented
- /// by A. Also we have an original value C representin an address in some
+ /// by A. Also we have an original value C representing an address in some
/// basic block. Traversing from C through phi and selects we ended up with
/// A's in a map. This utility function tries to find a value V which is a
/// field in addressing mode C and traversing through phi nodes and selects
@@ -2809,62 +2896,46 @@ private:
// <p, BB3> -> ?
// The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3
Value *findCommon(FoldAddrToValueMapping &Map) {
- // Tracks of new created Phi nodes.
- SmallPtrSet<PHINode *, 32> NewPhiNodes;
- // Tracks of new created Select nodes.
- SmallPtrSet<SelectInst *, 32> NewSelectNodes;
- // Tracks the simplification of new created phi nodes. The reason we use
+ // Tracks the simplification of newly created phi nodes. The reason we use
     // this mapping is because we will add newly created Phi nodes in AddrToBase.
// Simplification of Phi nodes is recursive, so some Phi node may
// be simplified after we added it to AddrToBase.
// Using this mapping we can find the current value in AddrToBase.
- SimplificationTracker ST(SQ, NewPhiNodes, NewSelectNodes);
+ SimplificationTracker ST(SQ);
// First step, DFS to create PHI nodes for all intermediate blocks.
// Also fill traverse order for the second step.
SmallVector<ValueInBB, 32> TraverseOrder;
- InsertPlaceholders(Map, TraverseOrder, NewPhiNodes, NewSelectNodes);
+ InsertPlaceholders(Map, TraverseOrder, ST);
// Second Step, fill new nodes by merged values and simplify if possible.
FillPlaceholders(Map, TraverseOrder, ST);
- if (!AddrSinkNewSelects && NewSelectNodes.size() > 0) {
- DestroyNodes(NewPhiNodes);
- DestroyNodes(NewSelectNodes);
+ if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) {
+ ST.destroyNewNodes(CommonType);
return nullptr;
}
// Now we'd like to match New Phi nodes to existed ones.
unsigned PhiNotMatchedCount = 0;
- if (!MatchPhiSet(NewPhiNodes, ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
- DestroyNodes(NewPhiNodes);
- DestroyNodes(NewSelectNodes);
+ if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
+ ST.destroyNewNodes(CommonType);
return nullptr;
}
auto *Result = ST.Get(Map.find(Original)->second);
if (Result) {
- NumMemoryInstsPhiCreated += NewPhiNodes.size() + PhiNotMatchedCount;
- NumMemoryInstsSelectCreated += NewSelectNodes.size();
+ NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
+ NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
}
return Result;
}
- /// \brief Destroy nodes from a set.
- template <typename T> void DestroyNodes(SmallPtrSetImpl<T *> &Instructions) {
- // For safe erasing, replace the Phi with dummy value first.
- auto Dummy = UndefValue::get(CommonType);
- for (auto I : Instructions) {
- I->replaceAllUsesWith(Dummy);
- I->eraseFromParent();
- }
- }
-
- /// \brief Try to match PHI node to Candidate.
+ /// Try to match PHI node to Candidate.
/// Matcher tracks the matched Phi nodes.
bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
- DenseSet<PHIPair> &Matcher,
- SmallPtrSetImpl<PHINode *> &PhiNodesToMatch) {
+ SmallSetVector<PHIPair, 8> &Matcher,
+ SmallSetVector<PHINode *, 32> &PhiNodesToMatch) {
SmallVector<PHIPair, 8> WorkList;
Matcher.insert({ PHI, Candidate });
WorkList.push_back({ PHI, Candidate });
@@ -2908,13 +2979,16 @@ private:
return true;
}
- /// \brief For the given set of PHI nodes try to find their equivalents.
+ /// For the given set of PHI nodes (in the SimplificationTracker) try
+ /// to find their equivalents.
/// Returns false if this matching fails and creation of new Phi is disabled.
- bool MatchPhiSet(SmallPtrSetImpl<PHINode *> &PhiNodesToMatch,
- SimplificationTracker &ST, bool AllowNewPhiNodes,
+ bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
unsigned &PhiNotMatchedCount) {
- DenseSet<PHIPair> Matched;
+ // Use a SetVector for Matched to make sure we do replacements (ReplacePhi)
+ // in a deterministic order below.
+ SmallSetVector<PHIPair, 8> Matched;
SmallPtrSet<PHINode *, 8> WillNotMatch;
+ SmallSetVector<PHINode *, 32> &PhiNodesToMatch = ST.newPhiNodes();
while (PhiNodesToMatch.size()) {
PHINode *PHI = *PhiNodesToMatch.begin();
@@ -2938,12 +3012,8 @@ private:
}
if (IsMatched) {
// Replace all matched values and erase them.
- for (auto MV : Matched) {
- MV.first->replaceAllUsesWith(MV.second);
- PhiNodesToMatch.erase(MV.first);
- ST.Put(MV.first, MV.second);
- MV.first->eraseFromParent();
- }
+ for (auto MV : Matched)
+ ST.ReplacePhi(MV.first, MV.second);
Matched.clear();
continue;
}
@@ -2953,11 +3023,11 @@ private:
// Just remove all seen values in matcher. They will not match anything.
PhiNotMatchedCount += WillNotMatch.size();
for (auto *P : WillNotMatch)
- PhiNodesToMatch.erase(P);
+ PhiNodesToMatch.remove(P);
}
return true;
}
- /// \brief Fill the placeholder with values from predecessors and simplify it.
+ /// Fill the placeholder with values from predecessors and simplify it.
void FillPlaceholders(FoldAddrToValueMapping &Map,
SmallVectorImpl<ValueInBB> &TraverseOrder,
SimplificationTracker &ST) {
@@ -3011,8 +3081,7 @@ private:
   /// Also reports the order in which the basic blocks have been traversed.
void InsertPlaceholders(FoldAddrToValueMapping &Map,
SmallVectorImpl<ValueInBB> &TraverseOrder,
- SmallPtrSetImpl<PHINode *> &NewPhiNodes,
- SmallPtrSetImpl<SelectInst *> &NewSelectNodes) {
+ SimplificationTracker &ST) {
SmallVector<ValueInBB, 32> Worklist;
assert((isa<PHINode>(Original.first) || isa<SelectInst>(Original.first)) &&
"Address must be a Phi or Select node");
@@ -3038,8 +3107,7 @@ private:
Instruction *CurrentI = cast<Instruction>(CurrentValue);
bool IsDefinedInThisBB = CurrentI->getParent() == CurrentBlock;
- unsigned PredCount =
- std::distance(pred_begin(CurrentBlock), pred_end(CurrentBlock));
+ unsigned PredCount = pred_size(CurrentBlock);
// if Current Value is not defined in this basic block we are interested
// in values in predecessors.
if (!IsDefinedInThisBB) {
@@ -3047,7 +3115,7 @@ private:
PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
&CurrentBlock->front());
Map[Current] = PHI;
- NewPhiNodes.insert(PHI);
+ ST.insertNewPhi(PHI);
// Add all predecessors in work list.
for (auto B : predecessors(CurrentBlock))
Worklist.push_back({ CurrentValue, B });
@@ -3061,7 +3129,7 @@ private:
SelectInst::Create(OrigSelect->getCondition(), Dummy, Dummy,
OrigSelect->getName(), OrigSelect, OrigSelect);
Map[Current] = Select;
- NewSelectNodes.insert(Select);
+ ST.insertNewSelect(Select);
// We are interested in True and False value in this basic block.
Worklist.push_back({ OrigSelect->getTrueValue(), CurrentBlock });
Worklist.push_back({ OrigSelect->getFalseValue(), CurrentBlock });
@@ -3073,7 +3141,7 @@ private:
PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
&CurrentBlock->front());
Map[Current] = PHI;
- NewPhiNodes.insert(PHI);
+ ST.insertNewPhi(PHI);
// Add all predecessors in work list.
for (auto B : predecessors(CurrentBlock))
@@ -3167,7 +3235,7 @@ static bool MightBeFoldableInst(Instruction *I) {
// Don't touch identity bitcasts.
if (I->getType() == I->getOperand(0)->getType())
return false;
- return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
+ return I->getType()->isIntOrPtrTy();
case Instruction::PtrToInt:
// PtrToInt is always a noop, as we know that the int type is pointer sized.
return true;
@@ -3187,7 +3255,7 @@ static bool MightBeFoldableInst(Instruction *I) {
}
}
-/// \brief Check whether or not \p Val is a legal instruction for \p TLI.
+/// Check whether or not \p Val is a legal instruction for \p TLI.
/// \note \p Val is assumed to be the product of some type promotion.
/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
/// to be legal, as the non-promoted value would have had the same state.
@@ -3207,9 +3275,9 @@ static bool isPromotedInstructionLegal(const TargetLowering &TLI,
namespace {
-/// \brief Hepler class to perform type promotion.
+/// Helper class to perform type promotion.
class TypePromotionHelper {
- /// \brief Utility function to check whether or not a sign or zero extension
+ /// Utility function to check whether or not a sign or zero extension
/// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
/// either using the operands of \p Inst or promoting \p Inst.
/// The type of the extension is defined by \p IsSExt.
@@ -3223,13 +3291,13 @@ class TypePromotionHelper {
static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
const InstrToOrigTy &PromotedInsts, bool IsSExt);
- /// \brief Utility function to determine if \p OpIdx should be promoted when
+ /// Utility function to determine if \p OpIdx should be promoted when
/// promoting \p Inst.
static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
return !(isa<SelectInst>(Inst) && OpIdx == 0);
}
- /// \brief Utility function to promote the operand of \p Ext when this
+ /// Utility function to promote the operand of \p Ext when this
/// operand is a promotable trunc or sext or zext.
/// \p PromotedInsts maps the instructions to their type before promotion.
/// \p CreatedInstsCost[out] contains the cost of all instructions
@@ -3244,7 +3312,7 @@ class TypePromotionHelper {
SmallVectorImpl<Instruction *> *Exts,
SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
- /// \brief Utility function to promote the operand of \p Ext when this
+ /// Utility function to promote the operand of \p Ext when this
/// operand is promotable and is not a supported trunc or sext.
/// \p PromotedInsts maps the instructions to their type before promotion.
/// \p CreatedInstsCost[out] contains the cost of all the instructions
@@ -3290,7 +3358,7 @@ public:
SmallVectorImpl<Instruction *> *Truncs,
const TargetLowering &TLI);
- /// \brief Given a sign/zero extend instruction \p Ext, return the approriate
+ /// Given a sign/zero extend instruction \p Ext, return the appropriate
/// action to promote the operand of \p Ext instead of using Ext.
/// \return NULL if no promotable action is possible with the current
/// sign extension.
@@ -3332,6 +3400,47 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
(IsSExt && BinOp->hasNoSignedWrap())))
return true;
+ // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
+ if ((Inst->getOpcode() == Instruction::And ||
+ Inst->getOpcode() == Instruction::Or))
+ return true;
+
+ // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
+ if (Inst->getOpcode() == Instruction::Xor) {
+ const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ // Make sure it is not a NOT.
+ if (Cst && !Cst->getValue().isAllOnesValue())
+ return true;
+ }
+
+ // zext(shrl(opnd, cst)) --> shrl(zext(opnd), zext(cst))
+ // It may change a poisoned value into a regular value, like
+ // zext i32 (shrl i8 %val, 12) --> shrl i32 (zext i8 %val), 12
+ // poisoned value regular value
+  // It should be OK since undef covers any valid value.
+ if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
+ return true;
+
+ // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
+ // It may change a poisoned value into a regular value, like
+ // zext i32 (shl i8 %val, 12) --> shl i32 (zext i8 %val), 12
+ // poisoned value regular value
+  // It should be OK since undef covers any valid value.
+ if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
+ const Instruction *ExtInst =
+ dyn_cast<const Instruction>(*Inst->user_begin());
+ if (ExtInst->hasOneUse()) {
+ const Instruction *AndInst =
+ dyn_cast<const Instruction>(*ExtInst->user_begin());
+ if (AndInst && AndInst->getOpcode() == Instruction::And) {
+ const ConstantInt *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
+ if (Cst &&
+ Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
+ return true;
+ }
+ }
+ }
+
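
The new zext-through-lshr case relies on zext(lshr(x, c)) being equal to lshr(zext(x), c) for in-range shift amounts; the out-of-range case is what the poison/undef remark above covers. A small exhaustive check of that identity for i8 widened to i32, written as plain C++ purely as a sanity sketch rather than as LLVM code:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C = 0; C < 8; ++C) {
      uint8_t Narrow = static_cast<uint8_t>(static_cast<uint8_t>(X) >> C); // lshr i8
      uint32_t A = Narrow;                                                 // then zext to i32
      uint32_t B = static_cast<uint32_t>(static_cast<uint8_t>(X)) >> C;    // zext first, then lshr i32
      assert(A == B);
    }
  return 0;
}
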
// Check if we can do the following simplification.
// ext(trunc(opnd)) --> ext(opnd)
if (!isa<TruncInst>(Inst))
@@ -3496,19 +3605,19 @@ Value *TypePromotionHelper::promoteOperandForOther(
// Step #3.
Instruction *ExtForOpnd = Ext;
- DEBUG(dbgs() << "Propagate Ext to operands\n");
+ LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
++OpIdx) {
- DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
+ LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
!shouldExtOperand(ExtOpnd, OpIdx)) {
- DEBUG(dbgs() << "No need to propagate\n");
+ LLVM_DEBUG(dbgs() << "No need to propagate\n");
continue;
}
// Check if we can statically extend the operand.
Value *Opnd = ExtOpnd->getOperand(OpIdx);
if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
- DEBUG(dbgs() << "Statically extend\n");
+ LLVM_DEBUG(dbgs() << "Statically extend\n");
unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
: Cst->getValue().zext(BitWidth);
@@ -3517,16 +3626,16 @@ Value *TypePromotionHelper::promoteOperandForOther(
}
// UndefValue are typed, so we have to statically sign extend them.
if (isa<UndefValue>(Opnd)) {
- DEBUG(dbgs() << "Statically extend\n");
+ LLVM_DEBUG(dbgs() << "Statically extend\n");
TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
continue;
}
- // Otherwise we have to explicity sign extend the operand.
+ // Otherwise we have to explicitly sign extend the operand.
// Check if Ext was reused to extend an operand.
if (!ExtForOpnd) {
// If yes, create a new one.
- DEBUG(dbgs() << "More operands to ext\n");
+ LLVM_DEBUG(dbgs() << "More operands to ext\n");
Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
: TPT.createZExt(Ext, Opnd, Ext->getType());
if (!isa<Instruction>(ValForExtOpnd)) {
@@ -3547,7 +3656,7 @@ Value *TypePromotionHelper::promoteOperandForOther(
ExtForOpnd = nullptr;
}
if (ExtForOpnd == Ext) {
- DEBUG(dbgs() << "Extension is useless now\n");
+ LLVM_DEBUG(dbgs() << "Extension is useless now\n");
TPT.eraseInstruction(Ext);
}
return ExtOpnd;
@@ -3563,7 +3672,8 @@ Value *TypePromotionHelper::promoteOperandForOther(
/// \return True if the promotion is profitable, false otherwise.
bool AddressingModeMatcher::isPromotionProfitable(
unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
- DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
+ LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
+ << '\n');
// The cost of the new extensions is greater than the cost of the
// old extension plus what we folded.
// This is not profitable.
@@ -3613,8 +3723,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
case Instruction::BitCast:
// BitCast is always a noop, and we can handle it as long as it is
// int->int or pointer->pointer (we don't want int<->fp or something).
- if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
- AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
+ if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&
// Don't touch identity bitcasts. These were probably put here by LSR,
// and we don't want to mess around with them. Assume it knows what it
// is doing.
@@ -3714,6 +3823,30 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
// Check to see if we can fold the base pointer in too.
if (matchAddr(AddrInst->getOperand(0), Depth+1))
return true;
+ } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
+ TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
+ ConstantOffset > 0) {
+ // Record GEPs with non-zero offsets as candidates for splitting in the
+ // event that the offset cannot fit into the r+i addressing mode.
+      // This covers the simple and common case in which only one GEP is used
+      // to calculate the address for the memory access.
+ Value *Base = AddrInst->getOperand(0);
+ auto *BaseI = dyn_cast<Instruction>(Base);
+ auto *GEP = cast<GetElementPtrInst>(AddrInst);
+ if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
+ (BaseI && !isa<CastInst>(BaseI) &&
+ !isa<GetElementPtrInst>(BaseI))) {
+ // If the base is an instruction, make sure the GEP is not in the same
+ // basic block as the base. If the base is an argument or global
+ // value, make sure the GEP is not in the entry block. Otherwise,
+ // instruction selection can undo the split. Also make sure the
+ // parent block allows inserting non-PHI instructions before the
+ // terminator.
+ BasicBlock *Parent =
+ BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
+ if (GEP->getParent() != Parent && !Parent->getTerminator()->isEHPad())
+ LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
+ }
}
AddrMode.BaseOffs -= ConstantOffset;
return false;
@@ -3810,7 +3943,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
PromotedOperand)) {
AddrMode = BackupAddrMode;
AddrModeInsts.resize(OldSize);
- DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
+ LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
TPT.rollback(LastKnownGood);
return false;
}
@@ -4124,12 +4257,13 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
// will tell us if the addressing mode for the memory operation will
// *actually* cover the shared instruction.
ExtAddrMode Result;
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
+ 0);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
- AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI,
- AddressAccessTy, AS,
- MemoryInst, Result, InsertedInsts,
- PromotedInsts, TPT);
+ AddressingModeMatcher Matcher(
+ MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
+ InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
Matcher.IgnoreProfitability = true;
bool Success = Matcher.matchAddr(Address, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
@@ -4231,11 +4365,24 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// the result may differ depending on what other uses our candidate
// addressing instructions might have.
AddrModeInsts.clear();
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
+ 0);
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
- InsertedInsts, PromotedInsts, TPT);
- NewAddrMode.OriginalValue = V;
+ InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
+
+ GetElementPtrInst *GEP = LargeOffsetGEP.first;
+ if (GEP && GEP->getParent() != MemoryInst->getParent() &&
+ !NewGEPBases.count(GEP)) {
+ // If splitting the underlying data structure can reduce the offset of a
+ // GEP, collect the GEP. Skip the GEPs that are the new bases of
+ // previously split data structures.
+ LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
+ if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end())
+ LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size();
+ }
+ NewAddrMode.OriginalValue = V;
if (!AddrModes.addNewAddrMode(NewAddrMode))
break;
}
@@ -4259,7 +4406,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
return IsNonLocalValue(V, MemoryInst->getParent());
})) {
- DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n");
+ LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode
+ << "\n");
return false;
}
@@ -4278,17 +4426,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Value * SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
if (SunkAddr) {
- DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
- << *MemoryInst << "\n");
+ LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
+ << " for " << *MemoryInst << "\n");
if (SunkAddr->getType() != Addr->getType())
SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
} else if (AddrSinkUsingGEPs ||
- (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
- SubtargetInfo->useAA())) {
+ (!AddrSinkUsingGEPs.getNumOccurrences() && TM && TTI->useAA())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
- DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
- << *MemoryInst << "\n");
+ LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
+ << " for " << *MemoryInst << "\n");
Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
Value *ResultPtr = nullptr, *ResultIndex = nullptr;
@@ -4427,8 +4574,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
return false;
- DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
- << *MemoryInst << "\n");
+ LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
+ << " for " << *MemoryInst << "\n");
Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
Value *Result = nullptr;
@@ -4554,7 +4701,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
return MadeChange;
}
-/// \brief Check if all the uses of \p Val are equivalent (or free) zero or
+/// Check if all the uses of \p Val are equivalent (or free) zero or
/// sign extensions.
static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
assert(!Val->use_empty() && "Input must have at least one use");
@@ -4602,7 +4749,7 @@ static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
return true;
}
-/// \brief Try to speculatively promote extensions in \p Exts and continue
+/// Try to speculatively promote extensions in \p Exts and continue
/// promoting through newly promoted operands recursively as far as doing so is
/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
/// When some promotion happened, \p TPT contains the proper state to revert
@@ -4728,7 +4875,7 @@ bool CodeGenPrepare::mergeSExts(Function &F) {
}
if (!DT.dominates(Pt, Inst))
// Give up if we need to merge in a common dominator as the
- // expermients show it is not profitable.
+ // experiments show it is not profitable.
continue;
Inst->replaceAllUsesWith(Pt);
RemovedInsts.insert(Inst);
@@ -4744,6 +4891,154 @@ bool CodeGenPrepare::mergeSExts(Function &F) {
return Changed;
}
+// Splitting large data structures so that the GEPs accessing them can have
+// smaller offsets and can therefore be sunk to the same blocks as their users.
+// For example, a large struct starting from %base is split into two parts
+// where the second part starts from %new_base.
+//
+// Before:
+// BB0:
+// %base =
+//
+// BB1:
+// %gep0 = gep %base, off0
+// %gep1 = gep %base, off1
+// %gep2 = gep %base, off2
+//
+// BB2:
+// %load1 = load %gep0
+// %load2 = load %gep1
+// %load3 = load %gep2
+//
+// After:
+// BB0:
+// %base =
+// %new_base = gep %base, off0
+//
+// BB1:
+// %new_gep0 = %new_base
+// %new_gep1 = gep %new_base, off1 - off0
+// %new_gep2 = gep %new_base, off2 - off0
+//
+// BB2:
+// %load1 = load i32, i32* %new_gep0
+// %load2 = load i32, i32* %new_gep1
+// %load3 = load i32, i32* %new_gep2
+//
+// %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because
+// their offsets are small enough to fit into the addressing mode.
+bool CodeGenPrepare::splitLargeGEPOffsets() {
+ bool Changed = false;
+ for (auto &Entry : LargeOffsetGEPMap) {
+ Value *OldBase = Entry.first;
+ SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+ &LargeOffsetGEPs = Entry.second;
+ auto compareGEPOffset =
+ [&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
+ const std::pair<GetElementPtrInst *, int64_t> &RHS) {
+ if (LHS.first == RHS.first)
+ return false;
+ if (LHS.second != RHS.second)
+ return LHS.second < RHS.second;
+ return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
+ };
+    // Sort all the GEPs of the same data structure by their offsets.
+ llvm::sort(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end(),
+ compareGEPOffset);
+ LargeOffsetGEPs.erase(
+ std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
+ LargeOffsetGEPs.end());
+ // Skip if all the GEPs have the same offsets.
+ if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
+ continue;
+ GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
+ int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
+ Value *NewBaseGEP = nullptr;
+
+ auto LargeOffsetGEP = LargeOffsetGEPs.begin();
+ while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
+ GetElementPtrInst *GEP = LargeOffsetGEP->first;
+ int64_t Offset = LargeOffsetGEP->second;
+ if (Offset != BaseOffset) {
+ TargetLowering::AddrMode AddrMode;
+ AddrMode.BaseOffs = Offset - BaseOffset;
+ // The result type of the GEP might not be the type of the memory
+ // access.
+ if (!TLI->isLegalAddressingMode(*DL, AddrMode,
+ GEP->getResultElementType(),
+ GEP->getAddressSpace())) {
+ // We need to create a new base if the offset to the current base is
+ // too large to fit into the addressing mode. So, a very large struct
+          // may be split into several parts.
+ BaseGEP = GEP;
+ BaseOffset = Offset;
+ NewBaseGEP = nullptr;
+ }
+ }
+
+ // Generate a new GEP to replace the current one.
+ IRBuilder<> Builder(GEP);
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(GEP->getType()->getPointerAddressSpace());
+ Type *I8Ty = Builder.getInt8Ty();
+
+ if (!NewBaseGEP) {
+ // Create a new base if we don't have one yet. Find the insertion
+        // point for the new base first.
+ BasicBlock::iterator NewBaseInsertPt;
+ BasicBlock *NewBaseInsertBB;
+ if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
+ // If the base of the struct is an instruction, the new base will be
+ // inserted close to it.
+ NewBaseInsertBB = BaseI->getParent();
+ if (isa<PHINode>(BaseI))
+ NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+ else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
+ NewBaseInsertBB =
+ SplitEdge(NewBaseInsertBB, Invoke->getNormalDest());
+ NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+ } else
+ NewBaseInsertPt = std::next(BaseI->getIterator());
+ } else {
+ // If the current base is an argument or global value, the new base
+ // will be inserted to the entry block.
+ NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
+ NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+ }
+ IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
+ // Create a new base.
+ Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset);
+ NewBaseGEP = OldBase;
+ if (NewBaseGEP->getType() != I8PtrTy)
+ NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
+ NewBaseGEP =
+ NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
+ NewGEPBases.insert(NewBaseGEP);
+ }
+
+ Value *NewGEP = NewBaseGEP;
+ if (Offset == BaseOffset) {
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+ } else {
+ // Calculate the new offset for the new GEP.
+ Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset);
+ NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
+
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+ }
+ GEP->replaceAllUsesWith(NewGEP);
+ LargeOffsetGEPID.erase(GEP);
+ LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
+ GEP->eraseFromParent();
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
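
The core of splitLargeGEPOffsets above is the rebasing arithmetic: after sorting the (GEP, offset) pairs, the smallest offset becomes the new base and every other GEP is rewritten relative to it, with a fresh base started whenever the delta would again be illegal for the addressing mode. A minimal sketch of just that arithmetic; MaxLegalOffset stands in for the target's isLegalAddressingMode query and is an assumption of the sketch, not an LLVM API:

#include <cstdint>
#include <utility>
#include <vector>

// Offsets must be sorted ascending and non-empty; returns (base, relative)
// pairs so that every relative offset fits the r+i addressing mode.
std::vector<std::pair<int64_t, int64_t>>
rebaseOffsets(const std::vector<int64_t> &SortedOffsets, int64_t MaxLegalOffset) {
  std::vector<std::pair<int64_t, int64_t>> Result;
  int64_t BaseOffset = SortedOffsets.front();
  for (int64_t Offset : SortedOffsets) {
    // Start a new base when the delta can no longer be folded into r+i.
    if (Offset - BaseOffset > MaxLegalOffset)
      BaseOffset = Offset;
    Result.push_back({BaseOffset, Offset - BaseOffset});
  }
  return Result;
}
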
/// Return true, if an ext(load) can be formed from an extension in
/// \p MovedExts.
bool CodeGenPrepare::canFormExtLd(
@@ -5053,8 +5348,7 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
// x = phi x1', x2'
// y = and x, 0xff
bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
- if (!Load->isSimple() ||
- !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
+ if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
return false;
// Skip loads we've already transformed.
@@ -5519,7 +5813,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
namespace {
-/// \brief Helper class to promote a scalar operation to a vector one.
+/// Helper class to promote a scalar operation to a vector one.
/// This class is used to move downward extractelement transition.
/// E.g.,
/// a = vector_op <2 x i32>
@@ -5556,7 +5850,7 @@ class VectorPromoteHelper {
/// Instruction that will be combined with the transition.
Instruction *CombineInst = nullptr;
- /// \brief The instruction that represents the current end of the transition.
+ /// The instruction that represents the current end of the transition.
/// Since we are faking the promotion until we reach the end of the chain
/// of computation, we need a way to get the current end of the transition.
Instruction *getEndOfTransition() const {
@@ -5565,7 +5859,7 @@ class VectorPromoteHelper {
return InstsToBePromoted.back();
}
- /// \brief Return the index of the original value in the transition.
+ /// Return the index of the original value in the transition.
/// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
/// c, is at index 0.
unsigned getTransitionOriginalValueIdx() const {
@@ -5574,7 +5868,7 @@ class VectorPromoteHelper {
return 0;
}
- /// \brief Return the index of the index in the transition.
+ /// Return the index of the index in the transition.
/// E.g., for "extractelement <2 x i32> c, i32 0" the index
/// is at index 1.
unsigned getTransitionIdx() const {
@@ -5583,7 +5877,7 @@ class VectorPromoteHelper {
return 1;
}
- /// \brief Get the type of the transition.
+ /// Get the type of the transition.
/// This is the type of the original value.
/// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
/// transition is <2 x i32>.
@@ -5591,7 +5885,7 @@ class VectorPromoteHelper {
return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
}
- /// \brief Promote \p ToBePromoted by moving \p Def downward through.
+ /// Promote \p ToBePromoted by moving \p Def downward through.
/// I.e., we have the following sequence:
/// Def = Transition <ty1> a to <ty2>
/// b = ToBePromoted <ty2> Def, ...
@@ -5600,7 +5894,7 @@ class VectorPromoteHelper {
/// Def = Transition <ty1> ToBePromoted to <ty2>
void promoteImpl(Instruction *ToBePromoted);
- /// \brief Check whether or not it is profitable to promote all the
+ /// Check whether or not it is profitable to promote all the
/// instructions enqueued to be promoted.
bool isProfitableToPromote() {
Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
@@ -5646,12 +5940,13 @@ class VectorPromoteHelper {
VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
Arg0OVK, Arg1OVK);
}
- DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
- << ScalarCost << "\nVector: " << VectorCost << '\n');
+ LLVM_DEBUG(
+ dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+ << ScalarCost << "\nVector: " << VectorCost << '\n');
return ScalarCost > VectorCost;
}
- /// \brief Generate a constant vector with \p Val with the same
+ /// Generate a constant vector with \p Val with the same
/// number of elements as the transition.
/// \p UseSplat defines whether or not \p Val should be replicated
/// across the whole vector.
@@ -5686,7 +5981,7 @@ class VectorPromoteHelper {
return ConstantVector::get(ConstVec);
}
- /// \brief Check if promoting to a vector type an operand at \p OperandIdx
+ /// Check if promoting to a vector type an operand at \p OperandIdx
/// in \p Use can trigger undefined behavior.
static bool canCauseUndefinedBehavior(const Instruction *Use,
unsigned OperandIdx) {
@@ -5718,13 +6013,13 @@ public:
assert(Transition && "Do not know how to promote null");
}
- /// \brief Check if we can promote \p ToBePromoted to \p Type.
+ /// Check if we can promote \p ToBePromoted to \p Type.
bool canPromote(const Instruction *ToBePromoted) const {
// We could support CastInst too.
return isa<BinaryOperator>(ToBePromoted);
}
- /// \brief Check if it is profitable to promote \p ToBePromoted
+ /// Check if it is profitable to promote \p ToBePromoted
/// by moving downward the transition through.
bool shouldPromote(const Instruction *ToBePromoted) const {
// Promote only if all the operands can be statically expanded.
@@ -5752,23 +6047,23 @@ public:
ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
}
- /// \brief Check whether or not \p Use can be combined
+ /// Check whether or not \p Use can be combined
/// with the transition.
/// I.e., is it possible to do Use(Transition) => AnotherUse?
bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
- /// \brief Record \p ToBePromoted as part of the chain to be promoted.
+ /// Record \p ToBePromoted as part of the chain to be promoted.
void enqueueForPromotion(Instruction *ToBePromoted) {
InstsToBePromoted.push_back(ToBePromoted);
}
- /// \brief Set the instruction that will be combined with the transition.
+ /// Set the instruction that will be combined with the transition.
void recordCombineInstruction(Instruction *ToBeCombined) {
assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
CombineInst = ToBeCombined;
}
- /// \brief Promote all the instructions enqueued for promotion if it is
+ /// Promote all the instructions enqueued for promotion if it is
/// is profitable.
/// \return True if the promotion happened, false otherwise.
bool promote() {
@@ -5852,35 +6147,36 @@ bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
// => we would need to check that we are moving it at a cheaper place and
// we do not do that for now.
BasicBlock *Parent = Inst->getParent();
- DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
// If the transition has more than one use, assume this is not going to be
// beneficial.
while (Inst->hasOneUse()) {
Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
- DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+ LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
if (ToBePromoted->getParent() != Parent) {
- DEBUG(dbgs() << "Instruction to promote is in a different block ("
- << ToBePromoted->getParent()->getName()
- << ") than the transition (" << Parent->getName() << ").\n");
+ LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
+ << ToBePromoted->getParent()->getName()
+ << ") than the transition (" << Parent->getName()
+ << ").\n");
return false;
}
if (VPH.canCombine(ToBePromoted)) {
- DEBUG(dbgs() << "Assume " << *Inst << '\n'
- << "will be combined with: " << *ToBePromoted << '\n');
+ LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
+ << "will be combined with: " << *ToBePromoted << '\n');
VPH.recordCombineInstruction(ToBePromoted);
bool Changed = VPH.promote();
NumStoreExtractExposed += Changed;
return Changed;
}
- DEBUG(dbgs() << "Try promoting.\n");
+ LLVM_DEBUG(dbgs() << "Try promoting.\n");
if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
return false;
- DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+ LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
VPH.enqueueForPromotion(ToBePromoted);
Inst = ToBePromoted;
@@ -5890,7 +6186,7 @@ bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
-/// Sometimes it is more efficent to generate separate stores for F and I,
+/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
/// (store (or (zext (bitcast F to i32) to i64),
@@ -5978,12 +6274,13 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
if (HBC && HBC->getParent() != SI.getParent())
HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
+ bool IsLE = SI.getModule()->getDataLayout().isLittleEndian();
auto CreateSplitStore = [&](Value *V, bool Upper) {
V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
Value *Addr = Builder.CreateBitCast(
SI.getOperand(1),
SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
- if (Upper)
+ if ((IsLE && Upper) || (!IsLE && !Upper))
Addr = Builder.CreateGEP(
SplitStoreType, Addr,
ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
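
The (IsLE && Upper) || (!IsLE && !Upper) test above encodes which half of the original wide value ends up at the higher address once the store is split: the upper half on little-endian targets and the lower half on big-endian targets. A tiny sketch of that rule with hypothetical helper names:

#include <cassert>

// Element index (0 or 1, in units of the split type) at which a half is stored.
unsigned splitStoreIndex(bool IsLittleEndian, bool IsUpperHalf) {
  return (IsLittleEndian == IsUpperHalf) ? 1u : 0u;
}

int main() {
  assert(splitStoreIndex(/*IsLittleEndian=*/true,  /*IsUpperHalf=*/true)  == 1);
  assert(splitStoreIndex(/*IsLittleEndian=*/true,  /*IsUpperHalf=*/false) == 0);
  assert(splitStoreIndex(/*IsLittleEndian=*/false, /*IsUpperHalf=*/true)  == 0);
  assert(splitStoreIndex(/*IsLittleEndian=*/false, /*IsUpperHalf=*/false) == 1);
}
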
@@ -6270,6 +6567,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
/// The GEP operand must be a pointer, so must its result -> BitCast
Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
GEPI->getName(), GEPI);
+ NC->setDebugLoc(GEPI->getDebugLoc());
GEPI->replaceAllUsesWith(NC);
GEPI->eraseFromParent();
++NumGEPsElim;
@@ -6374,7 +6672,8 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
// after it.
if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
continue;
- DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
+ LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
+ << *DVI << ' ' << *VI);
DVI->removeFromParent();
if (isa<PHINode>(VI))
DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
@@ -6388,7 +6687,7 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
return MadeChange;
}
-/// \brief Scale down both weights to fit into uint32_t.
+/// Scale down both weights to fit into uint32_t.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
@@ -6396,7 +6695,7 @@ static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
NewFalse = NewFalse / Scale;
}
-/// \brief Some targets prefer to split a conditional branch like:
+/// Some targets prefer to split a conditional branch like:
/// \code
/// %0 = icmp ne i32 %a, 0
/// %1 = icmp ne i32 %b, 0
@@ -6453,7 +6752,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
!match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())) )
continue;
- DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
+ LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
// Create a new BB.
auto TmpBB =
@@ -6465,8 +6764,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
Br1->setCondition(Cond1);
LogicOp->eraseFromParent();
- // Depending on the conditon we have to either replace the true or the false
- // successor of the original branch instruction.
+ // Depending on the condition we have to either replace the true or the
+ // false successor of the original branch instruction.
if (Opc == Instruction::And)
Br1->setSuccessor(0, TmpBB);
else
@@ -6519,8 +6818,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
// We have flexibility in setting Prob for BB1 and Prob for NewBB.
// The requirement is that
// TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
- // = TrueProb for orignal BB.
- // Assuming the orignal weights are A and B, one choice is to set BB1's
+ // = TrueProb for original BB.
+ // Assuming the original weights are A and B, one choice is to set BB1's
// weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
// assumes that
// TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
@@ -6554,8 +6853,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
// We have flexibility in setting Prob for BB1 and Prob for TmpBB.
// The requirement is that
// FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
- // = FalseProb for orignal BB.
- // Assuming the orignal weights are A and B, one choice is to set BB1's
+ // = FalseProb for original BB.
+ // Assuming the original weights are A and B, one choice is to set BB1's
// weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
// assumes that
// FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
@@ -6581,8 +6880,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
MadeChange = true;
- DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
- TmpBB->dump());
+ LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
+ TmpBB->dump());
}
return MadeChange;
}
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 98e22b24d37a..840e5ede6444 100644
--- a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -113,7 +113,7 @@ void CriticalAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
// FIXME: It may be possible to remove the isKill() restriction once PR18663
// has been properly fixed. There can be value in processing kills as seen in
// the AggressiveAntiDepBreaker class.
- if (MI.isDebugValue() || MI.isKill())
+ if (MI.isDebugInstr() || MI.isKill())
return;
assert(Count < InsertPosIndex && "Instruction index out of expected range!");
@@ -170,11 +170,11 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
// FIXME: The issue with predicated instruction is more complex. We are being
// conservative here because the kill markers cannot be trusted after
// if-conversion:
- // %r6 = LDR %sp, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+ // %r6 = LDR %sp, %reg0, 92, 14, %reg0; mem:LD4[FixedStack14]
// ...
- // STR %r0, killed %r6, %reg0, 0, pred:0, pred:%cpsr; mem:ST4[%395]
- // %r6 = LDR %sp, %reg0, 100, pred:0, pred:%cpsr; mem:LD4[FixedStack12]
- // STR %r0, killed %r6, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+ // STR %r0, killed %r6, %reg0, 0, 0, %cpsr; mem:ST4[%395]
+ // %r6 = LDR %sp, %reg0, 100, 0, %cpsr; mem:LD4[FixedStack12]
+ // STR %r0, killed %r6, %reg0, 0, 14, %reg0; mem:ST4[%396](align=8)
//
// The first R6 kill is not really a kill since it's killed by a predicated
// instruction which may not be executed. The second R6 def may or may not
@@ -461,14 +461,14 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits,
#ifndef NDEBUG
{
- DEBUG(dbgs() << "Critical path has total latency "
- << (Max->getDepth() + Max->Latency) << "\n");
- DEBUG(dbgs() << "Available regs:");
+ LLVM_DEBUG(dbgs() << "Critical path has total latency "
+ << (Max->getDepth() + Max->Latency) << "\n");
+ LLVM_DEBUG(dbgs() << "Available regs:");
for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
if (KillIndices[Reg] == ~0u)
- DEBUG(dbgs() << " " << printReg(Reg, TRI));
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI));
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
#endif
@@ -534,7 +534,7 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits,
// FIXME: It may be possible to remove the isKill() restriction once PR18663
// has been properly fixed. There can be value in processing kills as seen
// in the AggressiveAntiDepBreaker class.
- if (MI.isDebugValue() || MI.isKill())
+ if (MI.isDebugInstr() || MI.isKill())
continue;
// Check if this instruction has a dependence on the critical path that
@@ -645,10 +645,10 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits,
AntiDepReg,
LastNewReg[AntiDepReg],
RC, ForbidRegs)) {
- DEBUG(dbgs() << "Breaking anti-dependence edge on "
- << printReg(AntiDepReg, TRI) << " with "
- << RegRefs.count(AntiDepReg) << " references"
- << " using " << printReg(NewReg, TRI) << "!\n");
+ LLVM_DEBUG(dbgs() << "Breaking anti-dependence edge on "
+ << printReg(AntiDepReg, TRI) << " with "
+ << RegRefs.count(AntiDepReg) << " references"
+ << " using " << printReg(NewReg, TRI) << "!\n");
// Update the references to the old register to refer to the new
// register.
diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
index 848db444270d..cd302e78cc3e 100644
--- a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
+++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -222,7 +222,7 @@ VLIWPacketizerList::~VLIWPacketizerList() {
// End the current packet, bundle packet instructions and reset DFA state.
void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
MachineBasicBlock::iterator MI) {
- DEBUG({
+ LLVM_DEBUG({
if (!CurrentPacketMIs.empty()) {
dbgs() << "Finalizing packet:\n";
for (MachineInstr *MI : CurrentPacketMIs)
@@ -235,7 +235,7 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
}
CurrentPacketMIs.clear();
ResourceTracker->clearResources();
- DEBUG(dbgs() << "End packet\n");
+ LLVM_DEBUG(dbgs() << "End packet\n");
}
// Bundle machine instructions into packets.
@@ -248,7 +248,7 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
std::distance(BeginItr, EndItr));
VLIWScheduler->schedule();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Scheduling DAG of the packetize region\n";
for (SUnit &SU : VLIWScheduler->SUnits)
SU.dumpAll(VLIWScheduler);
@@ -287,10 +287,10 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
assert(SUI && "Missing SUnit Info!");
// Ask DFA if machine resource is available for MI.
- DEBUG(dbgs() << "Checking resources for adding MI to packet " << MI);
+ LLVM_DEBUG(dbgs() << "Checking resources for adding MI to packet " << MI);
bool ResourceAvail = ResourceTracker->canReserveResources(MI);
- DEBUG({
+ LLVM_DEBUG({
if (ResourceAvail)
dbgs() << " Resources are available for adding MI to packet\n";
else
@@ -302,31 +302,33 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
SUnit *SUJ = MIToSUnit[MJ];
assert(SUJ && "Missing SUnit Info!");
- DEBUG(dbgs() << " Checking against MJ " << *MJ);
+ LLVM_DEBUG(dbgs() << " Checking against MJ " << *MJ);
// Is it legal to packetize SUI and SUJ together.
if (!isLegalToPacketizeTogether(SUI, SUJ)) {
- DEBUG(dbgs() << " Not legal to add MI, try to prune\n");
+ LLVM_DEBUG(dbgs() << " Not legal to add MI, try to prune\n");
// Allow packetization if dependency can be pruned.
if (!isLegalToPruneDependencies(SUI, SUJ)) {
// End the packet if dependency cannot be pruned.
- DEBUG(dbgs() << " Could not prune dependencies for adding MI\n");
+ LLVM_DEBUG(dbgs()
+ << " Could not prune dependencies for adding MI\n");
endPacket(MBB, MI);
break;
}
- DEBUG(dbgs() << " Pruned dependence for adding MI\n");
+ LLVM_DEBUG(dbgs() << " Pruned dependence for adding MI\n");
}
}
} else {
- DEBUG(if (ResourceAvail)
- dbgs() << "Resources are available, but instruction should not be "
- "added to packet\n " << MI);
+ LLVM_DEBUG(if (ResourceAvail) dbgs()
+ << "Resources are available, but instruction should not be "
+ "added to packet\n "
+ << MI);
// End the packet if resource is not available, or if the instruction
       // should not be added to the current packet.
endPacket(MBB, MI);
}
// Add MI to the current packet.
- DEBUG(dbgs() << "* Adding MI to packet " << MI << '\n');
+ LLVM_DEBUG(dbgs() << "* Adding MI to packet " << MI << '\n');
BeginItr = addToPacket(MI);
} // For all instructions in the packetization range.
diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index e6a54bb300f2..ff44c5660bad 100644
--- a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -125,7 +125,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
// If the instruction is dead, delete it!
if (isDead(MI)) {
- DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI);
+ LLVM_DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI);
// It is possible that some DBG_VALUE instructions refer to this
// instruction. They get marked as undef and will be deleted
// in the live debug variable analysis.
diff --git a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
index 7d7eb57352a2..c83db476a4de 100644
--- a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
+++ b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
@@ -439,7 +439,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);
if (CrossCopy)
- DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI);
+ LLVM_DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI);
}
if (!CrossCopy)
@@ -520,17 +520,15 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
transferDefinedLanesStep(MO, Info.DefinedLanes);
}
- DEBUG(
- dbgs() << "Defined/Used lanes:\n";
- for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
- const VRegInfo &Info = VRegInfos[RegIdx];
- dbgs() << printReg(Reg, nullptr)
- << " Used: " << PrintLaneMask(Info.UsedLanes)
- << " Def: " << PrintLaneMask(Info.DefinedLanes) << '\n';
- }
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Defined/Used lanes:\n"; for (unsigned RegIdx = 0;
+ RegIdx < NumVirtRegs;
+ ++RegIdx) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+ const VRegInfo &Info = VRegInfos[RegIdx];
+ dbgs() << printReg(Reg, nullptr)
+ << " Used: " << PrintLaneMask(Info.UsedLanes)
+ << " Def: " << PrintLaneMask(Info.DefinedLanes) << '\n';
+ } dbgs() << "\n";);
bool Again = false;
// Mark operands as dead/unused.
@@ -545,18 +543,19 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg);
const VRegInfo &RegInfo = VRegInfos[RegIdx];
if (MO.isDef() && !MO.isDead() && RegInfo.UsedLanes.none()) {
- DEBUG(dbgs() << "Marking operand '" << MO << "' as dead in " << MI);
+ LLVM_DEBUG(dbgs()
+ << "Marking operand '" << MO << "' as dead in " << MI);
MO.setIsDead();
}
if (MO.readsReg()) {
bool CrossCopy = false;
if (isUndefRegAtInput(MO, RegInfo)) {
- DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in "
- << MI);
+ LLVM_DEBUG(dbgs()
+ << "Marking operand '" << MO << "' as undef in " << MI);
MO.setIsUndef();
} else if (isUndefInput(MO, &CrossCopy)) {
- DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in "
- << MI);
+ LLVM_DEBUG(dbgs()
+ << "Marking operand '" << MO << "' as undef in " << MI);
MO.setIsUndef();
if (CrossCopy)
Again = true;
@@ -577,7 +576,7 @@ bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) {
   // so we save compile time.
MRI = &MF.getRegInfo();
if (!MRI->subRegLivenessEnabled()) {
- DEBUG(dbgs() << "Skipping Detect dead lanes pass\n");
+ LLVM_DEBUG(dbgs() << "Skipping Detect dead lanes pass\n");
return false;
}
diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index 39d80c0bf9bd..4586649d17f0 100644
--- a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -33,7 +34,6 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cstddef>
using namespace llvm;
@@ -195,9 +195,9 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
if (Resumes.empty())
return false;
- // Check the personality, don't do anything if it's funclet-based.
+ // Check the personality, don't do anything if it's scope-based.
EHPersonality Pers = classifyEHPersonality(Fn.getPersonalityFn());
- if (isFuncletEHPersonality(Pers))
+ if (isScopedEHPersonality(Pers))
return false;
LLVMContext &Ctx = Fn.getContext();
diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 6294ff450113..098afd885f2f 100644
--- a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -185,7 +185,7 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
// Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
// get right.
if (!MBB->livein_empty()) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
return false;
}
@@ -195,18 +195,18 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
// terminators never have side effects or define any used register values.
for (MachineBasicBlock::iterator I = MBB->begin(),
E = MBB->getFirstTerminator(); I != E; ++I) {
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (++InstrCount > BlockInstrLimit && !Stress) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
- << BlockInstrLimit << " instructions.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
+ << BlockInstrLimit << " instructions.\n");
return false;
}
// There shouldn't normally be any phis in a single-predecessor block.
if (I->isPHI()) {
- DEBUG(dbgs() << "Can't hoist: " << *I);
+ LLVM_DEBUG(dbgs() << "Can't hoist: " << *I);
return false;
}
@@ -214,21 +214,21 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
// speculate GOT or constant pool loads that are guaranteed not to trap,
// but we don't support that for now.
if (I->mayLoad()) {
- DEBUG(dbgs() << "Won't speculate load: " << *I);
+ LLVM_DEBUG(dbgs() << "Won't speculate load: " << *I);
return false;
}
// We never speculate stores, so an AA pointer isn't necessary.
bool DontMoveAcrossStore = true;
if (!I->isSafeToMove(nullptr, DontMoveAcrossStore)) {
- DEBUG(dbgs() << "Can't speculate: " << *I);
+ LLVM_DEBUG(dbgs() << "Can't speculate: " << *I);
return false;
}
// Check for any dependencies on Head instructions.
for (const MachineOperand &MO : I->operands()) {
if (MO.isRegMask()) {
- DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+ LLVM_DEBUG(dbgs() << "Won't speculate regmask: " << *I);
return false;
}
if (!MO.isReg())
@@ -246,9 +246,10 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
if (!DefMI || DefMI->getParent() != Head)
continue;
if (InsertAfter.insert(DefMI).second)
- DEBUG(dbgs() << printMBBReference(*MBB) << " depends on " << *DefMI);
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " depends on "
+ << *DefMI);
if (DefMI->isTerminator()) {
- DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+ LLVM_DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
return false;
}
}
@@ -279,7 +280,7 @@ bool SSAIfConv::findInsertionPoint() {
--I;
  // Some of the conditional code depends on I.
if (InsertAfter.count(&*I)) {
- DEBUG(dbgs() << "Can't insert code after " << *I);
+ LLVM_DEBUG(dbgs() << "Can't insert code after " << *I);
return false;
}
@@ -313,7 +314,7 @@ bool SSAIfConv::findInsertionPoint() {
// Some of the clobbered registers are live before I, not a valid insertion
// point.
if (!LiveRegUnits.empty()) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Would clobber";
for (SparseSet<unsigned>::const_iterator
i = LiveRegUnits.begin(), e = LiveRegUnits.end(); i != e; ++i)
@@ -325,10 +326,10 @@ bool SSAIfConv::findInsertionPoint() {
// This is a valid insertion point.
InsertionPoint = I;
- DEBUG(dbgs() << "Can insert before " << *I);
+ LLVM_DEBUG(dbgs() << "Can insert before " << *I);
return true;
}
- DEBUG(dbgs() << "No legal insertion point found.\n");
+ LLVM_DEBUG(dbgs() << "No legal insertion point found.\n");
return false;
}
@@ -361,39 +362,39 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 ||
Succ1->succ_begin()[0] != Tail)
return false;
- DEBUG(dbgs() << "\nDiamond: " << printMBBReference(*Head) << " -> "
- << printMBBReference(*Succ0) << "/"
- << printMBBReference(*Succ1) << " -> "
- << printMBBReference(*Tail) << '\n');
+ LLVM_DEBUG(dbgs() << "\nDiamond: " << printMBBReference(*Head) << " -> "
+ << printMBBReference(*Succ0) << "/"
+ << printMBBReference(*Succ1) << " -> "
+ << printMBBReference(*Tail) << '\n');
// Live-in physregs are tricky to get right when speculating code.
if (!Tail->livein_empty()) {
- DEBUG(dbgs() << "Tail has live-ins.\n");
+ LLVM_DEBUG(dbgs() << "Tail has live-ins.\n");
return false;
}
} else {
- DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
- << printMBBReference(*Succ0) << " -> "
- << printMBBReference(*Tail) << '\n');
+ LLVM_DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
+ << printMBBReference(*Succ0) << " -> "
+ << printMBBReference(*Tail) << '\n');
}
// This is a triangle or a diamond.
// If Tail doesn't have any phis, there must be side effects.
if (Tail->empty() || !Tail->front().isPHI()) {
- DEBUG(dbgs() << "No phis in tail.\n");
+ LLVM_DEBUG(dbgs() << "No phis in tail.\n");
return false;
}
// The branch we're looking to eliminate must be analyzable.
Cond.clear();
if (TII->analyzeBranch(*Head, TBB, FBB, Cond)) {
- DEBUG(dbgs() << "Branch not analyzable.\n");
+ LLVM_DEBUG(dbgs() << "Branch not analyzable.\n");
return false;
}
// This is weird, probably some sort of degenerate CFG.
if (!TBB) {
- DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n");
+ LLVM_DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n");
return false;
}
@@ -422,7 +423,7 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
// Get target information.
if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg,
PI.CondCycles, PI.TCycles, PI.FCycles)) {
- DEBUG(dbgs() << "Can't convert: " << *PI.PHI);
+ LLVM_DEBUG(dbgs() << "Can't convert: " << *PI.PHI);
return false;
}
}
@@ -459,10 +460,10 @@ void SSAIfConv::replacePHIInstrs() {
// Convert all PHIs to select instructions inserted before FirstTerm.
for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
PHIInfo &PI = PHIs[i];
- DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ LLVM_DEBUG(dbgs() << "If-converting " << *PI.PHI);
unsigned DstReg = PI.PHI->getOperand(0).getReg();
TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg);
- DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
+ LLVM_DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
PI.PHI->eraseFromParent();
PI.PHI = nullptr;
}
@@ -481,7 +482,7 @@ void SSAIfConv::rewritePHIOperands() {
PHIInfo &PI = PHIs[i];
unsigned DstReg = 0;
- DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ LLVM_DEBUG(dbgs() << "If-converting " << *PI.PHI);
if (PI.TReg == PI.FReg) {
// We do not need the select instruction if both incoming values are
// equal.
@@ -491,7 +492,7 @@ void SSAIfConv::rewritePHIOperands() {
DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst));
TII->insertSelect(*Head, FirstTerm, HeadDL,
DstReg, Cond, PI.TReg, PI.FReg);
- DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
+ LLVM_DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
}
// Rewrite PHI operands TPred -> (DstReg, Head), remove FPred.
@@ -505,7 +506,7 @@ void SSAIfConv::rewritePHIOperands() {
PI.PHI->RemoveOperand(i-2);
}
}
- DEBUG(dbgs() << " --> " << *PI.PHI);
+ LLVM_DEBUG(dbgs() << " --> " << *PI.PHI);
}
}
@@ -563,8 +564,8 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
assert(Head->succ_empty() && "Additional head successors?");
if (!ExtraPreds && Head->isLayoutSuccessor(Tail)) {
// Splice Tail onto the end of Head.
- DEBUG(dbgs() << "Joining tail " << printMBBReference(*Tail) << " into head "
- << printMBBReference(*Head) << '\n');
+ LLVM_DEBUG(dbgs() << "Joining tail " << printMBBReference(*Tail)
+ << " into head " << printMBBReference(*Head) << '\n');
Head->splice(Head->end(), Tail,
Tail->begin(), Tail->end());
Head->transferSuccessorsAndUpdatePHIs(Tail);
@@ -572,12 +573,12 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
Tail->eraseFromParent();
} else {
// We need a branch to Tail, let code placement work it out later.
- DEBUG(dbgs() << "Converting to unconditional branch.\n");
+ LLVM_DEBUG(dbgs() << "Converting to unconditional branch.\n");
SmallVector<MachineOperand, 0> EmptyCond;
TII->insertBranch(*Head, Tail, nullptr, EmptyCond, HeadDL);
Head->addSuccessor(Tail);
}
- DEBUG(dbgs() << *Head);
+ LLVM_DEBUG(dbgs() << *Head);
}
@@ -692,7 +693,7 @@ bool EarlyIfConverter::shouldConvertIf() {
MachineTraceMetrics::Trace TBBTrace = MinInstr->getTrace(IfConv.getTPred());
MachineTraceMetrics::Trace FBBTrace = MinInstr->getTrace(IfConv.getFPred());
- DEBUG(dbgs() << "TBB: " << TBBTrace << "FBB: " << FBBTrace);
+ LLVM_DEBUG(dbgs() << "TBB: " << TBBTrace << "FBB: " << FBBTrace);
unsigned MinCrit = std::min(TBBTrace.getCriticalPath(),
FBBTrace.getCriticalPath());
@@ -706,10 +707,10 @@ bool EarlyIfConverter::shouldConvertIf() {
if (IfConv.TBB != IfConv.Tail)
ExtraBlocks.push_back(IfConv.TBB);
unsigned ResLength = FBBTrace.getResourceLength(ExtraBlocks);
- DEBUG(dbgs() << "Resource length " << ResLength
- << ", minimal critical path " << MinCrit << '\n');
+ LLVM_DEBUG(dbgs() << "Resource length " << ResLength
+ << ", minimal critical path " << MinCrit << '\n');
if (ResLength > MinCrit + CritLimit) {
- DEBUG(dbgs() << "Not enough available ILP.\n");
+ LLVM_DEBUG(dbgs() << "Not enough available ILP.\n");
return false;
}
@@ -719,7 +720,7 @@ bool EarlyIfConverter::shouldConvertIf() {
MachineTraceMetrics::Trace HeadTrace = MinInstr->getTrace(IfConv.Head);
unsigned BranchDepth =
HeadTrace.getInstrCycles(*IfConv.Head->getFirstTerminator()).Depth;
- DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n');
+ LLVM_DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n');
// Look at all the tail phis, and compute the critical path extension caused
// by inserting select instructions.
@@ -728,15 +729,15 @@ bool EarlyIfConverter::shouldConvertIf() {
SSAIfConv::PHIInfo &PI = IfConv.PHIs[i];
unsigned Slack = TailTrace.getInstrSlack(*PI.PHI);
unsigned MaxDepth = Slack + TailTrace.getInstrCycles(*PI.PHI).Depth;
- DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI);
+ LLVM_DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI);
// The condition is pulled into the critical path.
unsigned CondDepth = adjCycles(BranchDepth, PI.CondCycles);
if (CondDepth > MaxDepth) {
unsigned Extra = CondDepth - MaxDepth;
- DEBUG(dbgs() << "Condition adds " << Extra << " cycles.\n");
+ LLVM_DEBUG(dbgs() << "Condition adds " << Extra << " cycles.\n");
if (Extra > CritLimit) {
- DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ LLVM_DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
return false;
}
}
@@ -745,9 +746,9 @@ bool EarlyIfConverter::shouldConvertIf() {
unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(*PI.PHI), PI.TCycles);
if (TDepth > MaxDepth) {
unsigned Extra = TDepth - MaxDepth;
- DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n");
+ LLVM_DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n");
if (Extra > CritLimit) {
- DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ LLVM_DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
return false;
}
}
@@ -756,9 +757,9 @@ bool EarlyIfConverter::shouldConvertIf() {
unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(*PI.PHI), PI.FCycles);
if (FDepth > MaxDepth) {
unsigned Extra = FDepth - MaxDepth;
- DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n");
+ LLVM_DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n");
if (Extra > CritLimit) {
- DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ LLVM_DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
return false;
}
}
@@ -783,8 +784,8 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
}
bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
+ << "********** Function: " << MF.getName() << '\n');
if (skipFunction(MF.getFunction()))
return false;
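
The shouldConvertIf() hunks above only touch the debug output; the underlying heuristic is unchanged: if-conversion is rejected when the resource length of executing both sides exceeds the shorter critical path by more than the pass's slack budget (CritLimit, defined outside the visible hunks). A small self-contained sketch with hypothetical cycle counts, loosely mirroring the names used above:

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical trace numbers, not taken from the diff.
  unsigned TBBCrit = 12, FBBCrit = 10;           // critical path of each side
  unsigned MinCrit = std::min(TBBCrit, FBBCrit); // 10
  unsigned ResLength = 18; // resource length with both sides speculated
  unsigned CritLimit = 5;  // extra critical-path cycles the pass will accept
  if (ResLength > MinCrit + CritLimit)
    std::printf("Not enough available ILP.\n"); // 18 > 15: reject
  else
    std::printf("Proceed to the per-PHI depth checks.\n");
  return 0;
}
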
diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
deleted file mode 100644
index 61ec3f4be1dc..000000000000
--- a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
+++ /dev/null
@@ -1,755 +0,0 @@
-//===- ExecutionDepsFix.cpp - Fix execution dependecy issues ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/ExecutionDepsFix.h"
-
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "execution-deps-fix"
-
-/// Translate TRI register number to a list of indices into our smaller tables
-/// of interesting registers.
-iterator_range<SmallVectorImpl<int>::const_iterator>
-ExecutionDepsFix::regIndices(unsigned Reg) const {
- assert(Reg < AliasMap.size() && "Invalid register");
- const auto &Entry = AliasMap[Reg];
- return make_range(Entry.begin(), Entry.end());
-}
-
-DomainValue *ExecutionDepsFix::alloc(int domain) {
- DomainValue *dv = Avail.empty() ?
- new(Allocator.Allocate()) DomainValue :
- Avail.pop_back_val();
- if (domain >= 0)
- dv->addDomain(domain);
- assert(dv->Refs == 0 && "Reference count wasn't cleared");
- assert(!dv->Next && "Chained DomainValue shouldn't have been recycled");
- return dv;
-}
-
-/// Release a reference to DV. When the last reference is released,
-/// collapse if needed.
-void ExecutionDepsFix::release(DomainValue *DV) {
- while (DV) {
- assert(DV->Refs && "Bad DomainValue");
- if (--DV->Refs)
- return;
-
- // There are no more DV references. Collapse any contained instructions.
- if (DV->AvailableDomains && !DV->isCollapsed())
- collapse(DV, DV->getFirstDomain());
-
- DomainValue *Next = DV->Next;
- DV->clear();
- Avail.push_back(DV);
- // Also release the next DomainValue in the chain.
- DV = Next;
- }
-}
-
-/// Follow the chain of dead DomainValues until a live DomainValue is reached.
-/// Update the referenced pointer when necessary.
-DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) {
- DomainValue *DV = DVRef;
- if (!DV || !DV->Next)
- return DV;
-
- // DV has a chain. Find the end.
- do DV = DV->Next;
- while (DV->Next);
-
- // Update DVRef to point to DV.
- retain(DV);
- release(DVRef);
- DVRef = DV;
- return DV;
-}
-
-/// Set LiveRegs[rx] = dv, updating reference counts.
-void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) {
- assert(unsigned(rx) < NumRegs && "Invalid index");
- assert(LiveRegs && "Must enter basic block first.");
-
- if (LiveRegs[rx].Value == dv)
- return;
- if (LiveRegs[rx].Value)
- release(LiveRegs[rx].Value);
- LiveRegs[rx].Value = retain(dv);
-}
-
-// Kill register rx, recycle or collapse any DomainValue.
-void ExecutionDepsFix::kill(int rx) {
- assert(unsigned(rx) < NumRegs && "Invalid index");
- assert(LiveRegs && "Must enter basic block first.");
- if (!LiveRegs[rx].Value)
- return;
-
- release(LiveRegs[rx].Value);
- LiveRegs[rx].Value = nullptr;
-}
-
-/// Force register rx into domain.
-void ExecutionDepsFix::force(int rx, unsigned domain) {
- assert(unsigned(rx) < NumRegs && "Invalid index");
- assert(LiveRegs && "Must enter basic block first.");
- if (DomainValue *dv = LiveRegs[rx].Value) {
- if (dv->isCollapsed())
- dv->addDomain(domain);
- else if (dv->hasDomain(domain))
- collapse(dv, domain);
- else {
- // This is an incompatible open DomainValue. Collapse it to whatever and
- // force the new value into domain. This costs a domain crossing.
- collapse(dv, dv->getFirstDomain());
- assert(LiveRegs[rx].Value && "Not live after collapse?");
- LiveRegs[rx].Value->addDomain(domain);
- }
- } else {
- // Set up basic collapsed DomainValue.
- setLiveReg(rx, alloc(domain));
- }
-}
-
-/// Collapse open DomainValue into given domain. If there are multiple
-/// registers using dv, they each get a unique collapsed DomainValue.
-void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) {
- assert(dv->hasDomain(domain) && "Cannot collapse");
-
- // Collapse all the instructions.
- while (!dv->Instrs.empty())
- TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain);
- dv->setSingleDomain(domain);
-
- // If there are multiple users, give them new, unique DomainValues.
- if (LiveRegs && dv->Refs > 1)
- for (unsigned rx = 0; rx != NumRegs; ++rx)
- if (LiveRegs[rx].Value == dv)
- setLiveReg(rx, alloc(domain));
-}
-
-/// All instructions and registers in B are moved to A, and B is released.
-bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) {
- assert(!A->isCollapsed() && "Cannot merge into collapsed");
- assert(!B->isCollapsed() && "Cannot merge from collapsed");
- if (A == B)
- return true;
- // Restrict to the domains that A and B have in common.
- unsigned common = A->getCommonDomains(B->AvailableDomains);
- if (!common)
- return false;
- A->AvailableDomains = common;
- A->Instrs.append(B->Instrs.begin(), B->Instrs.end());
-
- // Clear the old DomainValue so we won't try to swizzle instructions twice.
- B->clear();
- // All uses of B are referred to A.
- B->Next = retain(A);
-
- for (unsigned rx = 0; rx != NumRegs; ++rx) {
- assert(LiveRegs && "no space allocated for live registers");
- if (LiveRegs[rx].Value == B)
- setLiveReg(rx, A);
- }
- return true;
-}
-
-/// Set up LiveRegs by merging predecessor live-out values.
-void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
- // Reset instruction counter in each basic block.
- CurInstr = 0;
-
- // Set up UndefReads to track undefined register reads.
- UndefReads.clear();
- LiveRegSet.clear();
-
- // Set up LiveRegs to represent registers entering MBB.
- if (!LiveRegs)
- LiveRegs = new LiveReg[NumRegs];
-
- // Default values are 'nothing happened a long time ago'.
- for (unsigned rx = 0; rx != NumRegs; ++rx) {
- LiveRegs[rx].Value = nullptr;
- LiveRegs[rx].Def = -(1 << 20);
- }
-
- // This is the entry block.
- if (MBB->pred_empty()) {
- for (const auto &LI : MBB->liveins()) {
- for (int rx : regIndices(LI.PhysReg)) {
- // Treat function live-ins as if they were defined just before the first
- // instruction. Usually, function arguments are set up immediately
- // before the call.
- LiveRegs[rx].Def = -1;
- }
- }
- DEBUG(dbgs() << printMBBReference(*MBB) << ": entry\n");
- return;
- }
-
- // Try to coalesce live-out registers from predecessors.
- for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
- pe = MBB->pred_end(); pi != pe; ++pi) {
- auto fi = MBBInfos.find(*pi);
- assert(fi != MBBInfos.end() &&
- "Should have pre-allocated MBBInfos for all MBBs");
- LiveReg *Incoming = fi->second.OutRegs;
- // Incoming is null if this is a backedge from a BB
- // we haven't processed yet
- if (Incoming == nullptr) {
- continue;
- }
-
- for (unsigned rx = 0; rx != NumRegs; ++rx) {
- // Use the most recent predecessor def for each register.
- LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def);
-
- DomainValue *pdv = resolve(Incoming[rx].Value);
- if (!pdv)
- continue;
- if (!LiveRegs[rx].Value) {
- setLiveReg(rx, pdv);
- continue;
- }
-
- // We have a live DomainValue from more than one predecessor.
- if (LiveRegs[rx].Value->isCollapsed()) {
- // We are already collapsed, but predecessor is not. Force it.
- unsigned Domain = LiveRegs[rx].Value->getFirstDomain();
- if (!pdv->isCollapsed() && pdv->hasDomain(Domain))
- collapse(pdv, Domain);
- continue;
- }
-
- // Currently open, merge in predecessor.
- if (!pdv->isCollapsed())
- merge(LiveRegs[rx].Value, pdv);
- else
- force(rx, pdv->getFirstDomain());
- }
- }
- DEBUG(
- dbgs() << printMBBReference(*MBB)
- << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));
-}
-
-void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
- assert(LiveRegs && "Must enter basic block first.");
- LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs;
- // Save register clearances at end of MBB - used by enterBasicBlock().
- MBBInfos[MBB].OutRegs = LiveRegs;
-
- // While processing the basic block, we kept `Def` relative to the start
- // of the basic block for convenience. However, future use of this information
- // only cares about the clearance from the end of the block, so adjust
- // everything to be relative to the end of the basic block.
- for (unsigned i = 0, e = NumRegs; i != e; ++i)
- LiveRegs[i].Def -= CurInstr;
- if (OldOutRegs) {
- // This must be the second pass.
- // Release all the DomainValues instead of keeping them.
- for (unsigned i = 0, e = NumRegs; i != e; ++i)
- release(OldOutRegs[i].Value);
- delete[] OldOutRegs;
- }
- LiveRegs = nullptr;
-}
-
-bool ExecutionDepsFix::visitInstr(MachineInstr *MI) {
- // Update instructions with explicit execution domains.
- std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
- if (DomP.first) {
- if (DomP.second)
- visitSoftInstr(MI, DomP.second);
- else
- visitHardInstr(MI, DomP.first);
- }
-
- return !DomP.first;
-}
-
-/// \brief Helps avoid false dependencies on undef registers by updating the
-/// machine instructions' undef operand to use a register that the instruction
-/// is truly dependent on, or use a register with clearance higher than Pref.
-/// Returns true if it was able to find a true dependency, thus not requiring
-/// a dependency breaking instruction regardless of clearance.
-bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI,
- unsigned OpIdx, unsigned Pref) {
- MachineOperand &MO = MI->getOperand(OpIdx);
- assert(MO.isUndef() && "Expected undef machine operand");
-
- unsigned OriginalReg = MO.getReg();
-
- // Update only undef operands that are mapped to one register.
- if (AliasMap[OriginalReg].size() != 1)
- return false;
-
- // Get the undef operand's register class
- const TargetRegisterClass *OpRC =
- TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF);
-
-  // If the instruction has a true dependency, we can hide the false dependency
- // behind it.
- for (MachineOperand &CurrMO : MI->operands()) {
- if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() ||
- !OpRC->contains(CurrMO.getReg()))
- continue;
- // We found a true dependency - replace the undef register with the true
- // dependency.
- MO.setReg(CurrMO.getReg());
- return true;
- }
-
- // Go over all registers in the register class and find the register with
- // max clearance or clearance higher than Pref.
- unsigned MaxClearance = 0;
- unsigned MaxClearanceReg = OriginalReg;
- ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(OpRC);
- for (auto Reg : Order) {
- assert(AliasMap[Reg].size() == 1 &&
- "Reg is expected to be mapped to a single index");
- int RCrx = *regIndices(Reg).begin();
- unsigned Clearance = CurInstr - LiveRegs[RCrx].Def;
- if (Clearance <= MaxClearance)
- continue;
- MaxClearance = Clearance;
- MaxClearanceReg = Reg;
-
- if (MaxClearance > Pref)
- break;
- }
-
- // Update the operand if we found a register with better clearance.
- if (MaxClearanceReg != OriginalReg)
- MO.setReg(MaxClearanceReg);
-
- return false;
-}
-
-/// \brief Return true if it makes sense to break dependence on a partial def
-/// or undef use.
-bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref) {
- unsigned reg = MI->getOperand(OpIdx).getReg();
- for (int rx : regIndices(reg)) {
- unsigned Clearance = CurInstr - LiveRegs[rx].Def;
- DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
-
- if (Pref > Clearance) {
- DEBUG(dbgs() << ": Break dependency.\n");
- continue;
- }
- DEBUG(dbgs() << ": OK .\n");
- return false;
- }
- return true;
-}
-
-// Update def-ages for registers defined by MI.
-// If Kill is set, also kill off DomainValues clobbered by the defs.
-//
-// Also break dependencies on partial defs and undef uses.
-void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
- bool Kill) {
- assert(!MI->isDebugValue() && "Won't process debug values");
-
- // Break dependence on undef uses. Do this before updating LiveRegs below.
- unsigned OpNum;
- if (breakDependency) {
- unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
- if (Pref) {
- bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref);
- // We don't need to bother trying to break a dependency if this
- // instruction has a true dependency on that register through another
- // operand - we'll have to wait for it to be available regardless.
- if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref))
- UndefReads.push_back(std::make_pair(MI, OpNum));
- }
- }
- const MCInstrDesc &MCID = MI->getDesc();
- for (unsigned i = 0,
- e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
- i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg())
- continue;
- if (MO.isUse())
- continue;
- for (int rx : regIndices(MO.getReg())) {
- // This instruction explicitly defines rx.
- DEBUG(dbgs() << printReg(RC->getRegister(rx), TRI) << ":\t" << CurInstr
- << '\t' << *MI);
-
- if (breakDependency) {
- // Check clearance before partial register updates.
- // Call breakDependence before setting LiveRegs[rx].Def.
- unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
- if (Pref && shouldBreakDependence(MI, i, Pref))
- TII->breakPartialRegDependency(*MI, i, TRI);
- }
-
- // How many instructions since rx was last written?
- LiveRegs[rx].Def = CurInstr;
-
- // Kill off domains redefined by generic instructions.
- if (Kill)
- kill(rx);
- }
- }
- ++CurInstr;
-}
-
-/// \brief Break false dependencies on undefined register reads.
-///
-/// Walk the block backward computing precise liveness. This is expensive, so we
-/// only do it on demand. Note that the occurrence of undefined register reads
-/// that should be broken is very rare, but when they occur we may have many in
-/// a single block.
-void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) {
- if (UndefReads.empty())
- return;
-
- // Collect this block's live out register units.
- LiveRegSet.init(*TRI);
- // We do not need to care about pristine registers as they are just preserved
- // but not actually used in the function.
- LiveRegSet.addLiveOutsNoPristines(*MBB);
-
- MachineInstr *UndefMI = UndefReads.back().first;
- unsigned OpIdx = UndefReads.back().second;
-
- for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) {
- // Update liveness, including the current instruction's defs.
- LiveRegSet.stepBackward(I);
-
- if (UndefMI == &I) {
- if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg()))
- TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI);
-
- UndefReads.pop_back();
- if (UndefReads.empty())
- return;
-
- UndefMI = UndefReads.back().first;
- OpIdx = UndefReads.back().second;
- }
- }
-}
-
-// A hard instruction only works in one domain. All input registers will be
-// forced into that domain.
-void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
- // Collapse all uses.
- for (unsigned i = mi->getDesc().getNumDefs(),
- e = mi->getDesc().getNumOperands(); i != e; ++i) {
- MachineOperand &mo = mi->getOperand(i);
- if (!mo.isReg()) continue;
- for (int rx : regIndices(mo.getReg())) {
- force(rx, domain);
- }
- }
-
- // Kill all defs and force them.
- for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
- MachineOperand &mo = mi->getOperand(i);
- if (!mo.isReg()) continue;
- for (int rx : regIndices(mo.getReg())) {
- kill(rx);
- force(rx, domain);
- }
- }
-}
-
-// A soft instruction can be changed to work in other domains given by mask.
-void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
- // Bitmask of available domains for this instruction after taking collapsed
- // operands into account.
- unsigned available = mask;
-
- // Scan the explicit use operands for incoming domains.
- SmallVector<int, 4> used;
- if (LiveRegs)
- for (unsigned i = mi->getDesc().getNumDefs(),
- e = mi->getDesc().getNumOperands(); i != e; ++i) {
- MachineOperand &mo = mi->getOperand(i);
- if (!mo.isReg()) continue;
- for (int rx : regIndices(mo.getReg())) {
- DomainValue *dv = LiveRegs[rx].Value;
- if (dv == nullptr)
- continue;
- // Bitmask of domains that dv and available have in common.
- unsigned common = dv->getCommonDomains(available);
- // Is it possible to use this collapsed register for free?
- if (dv->isCollapsed()) {
- // Restrict available domains to the ones in common with the operand.
- // If there are no common domains, we must pay the cross-domain
- // penalty for this operand.
- if (common) available = common;
- } else if (common)
- // Open DomainValue is compatible, save it for merging.
- used.push_back(rx);
- else
- // Open DomainValue is not compatible with instruction. It is useless
- // now.
- kill(rx);
- }
- }
-
- // If the collapsed operands force a single domain, propagate the collapse.
- if (isPowerOf2_32(available)) {
- unsigned domain = countTrailingZeros(available);
- TII->setExecutionDomain(*mi, domain);
- visitHardInstr(mi, domain);
- return;
- }
-
- // Kill off any remaining uses that don't match available, and build a list of
- // incoming DomainValues that we want to merge.
- SmallVector<const LiveReg *, 4> Regs;
- for (int rx : used) {
- assert(LiveRegs && "no space allocated for live registers");
- const LiveReg &LR = LiveRegs[rx];
- // This useless DomainValue could have been missed above.
- if (!LR.Value->getCommonDomains(available)) {
- kill(rx);
- continue;
- }
- // Sorted insertion.
- auto I = std::upper_bound(Regs.begin(), Regs.end(), &LR,
- [](const LiveReg *LHS, const LiveReg *RHS) {
- return LHS->Def < RHS->Def;
- });
- Regs.insert(I, &LR);
- }
-
- // doms are now sorted in order of appearance. Try to merge them all, giving
- // priority to the latest ones.
- DomainValue *dv = nullptr;
- while (!Regs.empty()) {
- if (!dv) {
- dv = Regs.pop_back_val()->Value;
- // Force the first dv to match the current instruction.
- dv->AvailableDomains = dv->getCommonDomains(available);
- assert(dv->AvailableDomains && "Domain should have been filtered");
- continue;
- }
-
- DomainValue *Latest = Regs.pop_back_val()->Value;
- // Skip already merged values.
- if (Latest == dv || Latest->Next)
- continue;
- if (merge(dv, Latest))
- continue;
-
- // If latest didn't merge, it is useless now. Kill all registers using it.
- for (int i : used) {
- assert(LiveRegs && "no space allocated for live registers");
- if (LiveRegs[i].Value == Latest)
- kill(i);
- }
- }
-
- // dv is the DomainValue we are going to use for this instruction.
- if (!dv) {
- dv = alloc();
- dv->AvailableDomains = available;
- }
- dv->Instrs.push_back(mi);
-
- // Finally set all defs and non-collapsed uses to dv. We must iterate through
- // all the operators, including imp-def ones.
- for (MachineInstr::mop_iterator ii = mi->operands_begin(),
- ee = mi->operands_end();
- ii != ee; ++ii) {
- MachineOperand &mo = *ii;
- if (!mo.isReg()) continue;
- for (int rx : regIndices(mo.getReg())) {
- if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
- kill(rx);
- setLiveReg(rx, dv);
- }
- }
- }
-}
-
-void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB,
- bool PrimaryPass) {
- enterBasicBlock(MBB);
- // If this block is not done, it makes little sense to make any decisions
- // based on clearance information. We need to make a second pass anyway,
- // and by then we'll have better information, so we can avoid doing the work
- // to try and break dependencies now.
- bool breakDependency = isBlockDone(MBB);
- for (MachineInstr &MI : *MBB) {
- if (!MI.isDebugValue()) {
- bool Kill = false;
- if (PrimaryPass)
- Kill = visitInstr(&MI);
- processDefs(&MI, breakDependency, Kill);
- }
- }
- if (breakDependency)
- processUndefReads(MBB);
- leaveBasicBlock(MBB);
-}
-
-bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) {
- return MBBInfos[MBB].PrimaryCompleted &&
- MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
- MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
-}
-
-bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) {
- if (skipFunction(mf.getFunction()))
- return false;
- MF = &mf;
- TII = MF->getSubtarget().getInstrInfo();
- TRI = MF->getSubtarget().getRegisterInfo();
- RegClassInfo.runOnMachineFunction(mf);
- LiveRegs = nullptr;
- assert(NumRegs == RC->getNumRegs() && "Bad regclass");
-
- DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: "
- << TRI->getRegClassName(RC) << " **********\n");
-
- // If no relevant registers are used in the function, we can skip it
- // completely.
- bool anyregs = false;
- const MachineRegisterInfo &MRI = mf.getRegInfo();
- for (unsigned Reg : *RC) {
- if (MRI.isPhysRegUsed(Reg)) {
- anyregs = true;
- break;
- }
- }
- if (!anyregs) return false;
-
- // Initialize the AliasMap on the first use.
- if (AliasMap.empty()) {
- // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
- // therefore the LiveRegs array.
- AliasMap.resize(TRI->getNumRegs());
- for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
- for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
- AI.isValid(); ++AI)
- AliasMap[*AI].push_back(i);
- }
-
-  // Initialize the MBBInfos
- for (auto &MBB : mf) {
- MBBInfo InitialInfo;
- MBBInfos.insert(std::make_pair(&MBB, InitialInfo));
- }
-
- /*
- * We want to visit every instruction in every basic block in order to update
- * its execution domain or break any false dependencies. However, for the
- * dependency breaking, we need to know clearances from all predecessors
- * (including any backedges). One way to do so would be to do two complete
- * passes over all basic blocks/instructions, the first for recording
- * clearances, the second to break the dependencies. However, for functions
- * without backedges, or functions with a lot of straight-line code, and
- * a small loop, that would be a lot of unnecessary work (since only the
- * BBs that are part of the loop require two passes). As an example,
- * consider the following loop.
- *
- *
- * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
- * ^ |
- * +----------------------------------+
- *
- * The iteration order is as follows:
- * Naive: PH A B C D A' B' C' D'
- * Optimized: PH A B C A' B' C' D
- *
- * Note that we avoid processing D twice, because we can entirely process
- * the predecessors before getting to D. We call a block that is ready
- * for its second round of processing `done` (isBlockDone). Once we finish
- * processing some block, we update the counters in MBBInfos and re-process
- * any successors that are now done.
- */
-
- MachineBasicBlock *Entry = &*MF->begin();
- ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
- SmallVector<MachineBasicBlock *, 4> Workqueue;
- for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
- MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
- MachineBasicBlock *MBB = *MBBI;
- // N.B: IncomingProcessed and IncomingCompleted were already updated while
- // processing this block's predecessors.
- MBBInfos[MBB].PrimaryCompleted = true;
- MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
- bool Primary = true;
- Workqueue.push_back(MBB);
- while (!Workqueue.empty()) {
- MachineBasicBlock *ActiveMBB = &*Workqueue.back();
- Workqueue.pop_back();
- processBasicBlock(ActiveMBB, Primary);
- bool Done = isBlockDone(ActiveMBB);
- for (auto *Succ : ActiveMBB->successors()) {
- if (!isBlockDone(Succ)) {
- if (Primary) {
- MBBInfos[Succ].IncomingProcessed++;
- }
- if (Done) {
- MBBInfos[Succ].IncomingCompleted++;
- }
- if (isBlockDone(Succ)) {
- Workqueue.push_back(Succ);
- }
- }
- }
- Primary = false;
- }
- }
-
- // We need to go through again and finalize any blocks that are not done yet.
- // This is possible if blocks have dead predecessors, so we didn't visit them
- // above.
- for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator
- MBBI = RPOT.begin(),
- MBBE = RPOT.end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock *MBB = *MBBI;
- if (!isBlockDone(MBB)) {
- processBasicBlock(MBB, false);
- // Don't update successors here. We'll get to them anyway through this
- // loop.
- }
- }
-
- // Clear the LiveOuts vectors and collapse any remaining DomainValues.
- for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
- MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
- auto FI = MBBInfos.find(*MBBI);
- if (FI == MBBInfos.end() || !FI->second.OutRegs)
- continue;
- for (unsigned i = 0, e = NumRegs; i != e; ++i)
- if (FI->second.OutRegs[i].Value)
- release(FI->second.OutRegs[i].Value);
- delete[] FI->second.OutRegs;
- }
- MBBInfos.clear();
- UndefReads.clear();
- Avail.clear();
- Allocator.DestroyAll();
-
- return false;
-}
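
The block-ordering scheme described in the long comment of the deleted runOnMachineFunction() above — straight-line blocks are processed once, loop blocks twice, and a block is revisited only after all of its predecessors have been seen — is exactly what the new ExecutionDomainFix.cpp below hands off to LoopTraversal. A standalone sketch of that counting scheme, transcribed from the deleted loop and run on the comment's example CFG (the back edge is read here as C -> A, which reproduces the "Optimized" order quoted there; the toy CFG encoding is made up for illustration, and the deleted code's second fix-up pass for blocks with dead predecessors is omitted):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Counters per block, named as in the deleted ExecutionDepsFix::MBBInfo.
struct Info {
  bool PrimaryCompleted = false;
  unsigned IncomingProcessed = 0;
  unsigned PrimaryIncoming = 0;
  unsigned IncomingCompleted = 0;
};

int main() {
  // PH -> A -> B -> C -> D -> EXIT, back edge C -> A.
  std::map<std::string, std::vector<std::string>> Succs = {
      {"PH", {"A"}}, {"A", {"B"}},    {"B", {"C"}},
      {"C", {"A", "D"}}, {"D", {"EXIT"}}, {"EXIT", {}}};
  std::map<std::string, unsigned> NumPreds = {
      {"PH", 0}, {"A", 2}, {"B", 1}, {"C", 1}, {"D", 1}, {"EXIT", 1}};
  // For this CFG, reverse post-order is simply the listed order.
  std::vector<std::string> RPOT = {"PH", "A", "B", "C", "D", "EXIT"};

  std::map<std::string, Info> MBBInfos;
  auto isDone = [&](const std::string &B) {
    const Info &I = MBBInfos[B];
    return I.PrimaryCompleted && I.IncomingCompleted == I.PrimaryIncoming &&
           I.IncomingProcessed == NumPreds[B];
  };

  for (const std::string &MBB : RPOT) {
    MBBInfos[MBB].PrimaryCompleted = true;
    MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
    bool Primary = true;
    std::vector<std::string> Workqueue = {MBB};
    while (!Workqueue.empty()) {
      std::string Active = Workqueue.back();
      Workqueue.pop_back();
      std::printf("%s%s ", Active.c_str(), Primary ? "" : "'");
      bool Done = isDone(Active);
      for (const std::string &Succ : Succs[Active]) {
        if (!isDone(Succ)) {
          if (Primary)
            MBBInfos[Succ].IncomingProcessed++;
          if (Done)
            MBBInfos[Succ].IncomingCompleted++;
          if (isDone(Succ)) // became ready for its second round
            Workqueue.push_back(Succ);
        }
      }
      Primary = false;
    }
  }
  // Prints: PH A B C A' B' C' D EXIT
  std::printf("\n");
  return 0;
}
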
diff --git a/contrib/llvm/lib/CodeGen/ExecutionDomainFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDomainFix.cpp
new file mode 100644
index 000000000000..458dcf2b0e26
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExecutionDomainFix.cpp
@@ -0,0 +1,473 @@
+//===- ExecutionDomainFix.cpp - Fix execution domain issues ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExecutionDomainFix.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "execution-deps-fix"
+
+iterator_range<SmallVectorImpl<int>::const_iterator>
+ExecutionDomainFix::regIndices(unsigned Reg) const {
+ assert(Reg < AliasMap.size() && "Invalid register");
+ const auto &Entry = AliasMap[Reg];
+ return make_range(Entry.begin(), Entry.end());
+}
+
+DomainValue *ExecutionDomainFix::alloc(int domain) {
+ DomainValue *dv = Avail.empty() ? new (Allocator.Allocate()) DomainValue
+ : Avail.pop_back_val();
+ if (domain >= 0)
+ dv->addDomain(domain);
+ assert(dv->Refs == 0 && "Reference count wasn't cleared");
+ assert(!dv->Next && "Chained DomainValue shouldn't have been recycled");
+ return dv;
+}
+
+void ExecutionDomainFix::release(DomainValue *DV) {
+ while (DV) {
+ assert(DV->Refs && "Bad DomainValue");
+ if (--DV->Refs)
+ return;
+
+ // There are no more DV references. Collapse any contained instructions.
+ if (DV->AvailableDomains && !DV->isCollapsed())
+ collapse(DV, DV->getFirstDomain());
+
+ DomainValue *Next = DV->Next;
+ DV->clear();
+ Avail.push_back(DV);
+ // Also release the next DomainValue in the chain.
+ DV = Next;
+ }
+}
+
+DomainValue *ExecutionDomainFix::resolve(DomainValue *&DVRef) {
+ DomainValue *DV = DVRef;
+ if (!DV || !DV->Next)
+ return DV;
+
+ // DV has a chain. Find the end.
+ do
+ DV = DV->Next;
+ while (DV->Next);
+
+ // Update DVRef to point to DV.
+ retain(DV);
+ release(DVRef);
+ DVRef = DV;
+ return DV;
+}
+
+void ExecutionDomainFix::setLiveReg(int rx, DomainValue *dv) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+
+ if (LiveRegs[rx] == dv)
+ return;
+ if (LiveRegs[rx])
+ release(LiveRegs[rx]);
+ LiveRegs[rx] = retain(dv);
+}
+
+void ExecutionDomainFix::kill(int rx) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ if (!LiveRegs[rx])
+ return;
+
+ release(LiveRegs[rx]);
+ LiveRegs[rx] = nullptr;
+}
+
+void ExecutionDomainFix::force(int rx, unsigned domain) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ if (DomainValue *dv = LiveRegs[rx]) {
+ if (dv->isCollapsed())
+ dv->addDomain(domain);
+ else if (dv->hasDomain(domain))
+ collapse(dv, domain);
+ else {
+ // This is an incompatible open DomainValue. Collapse it to whatever and
+ // force the new value into domain. This costs a domain crossing.
+ collapse(dv, dv->getFirstDomain());
+ assert(LiveRegs[rx] && "Not live after collapse?");
+ LiveRegs[rx]->addDomain(domain);
+ }
+ } else {
+ // Set up basic collapsed DomainValue.
+ setLiveReg(rx, alloc(domain));
+ }
+}
+
+void ExecutionDomainFix::collapse(DomainValue *dv, unsigned domain) {
+ assert(dv->hasDomain(domain) && "Cannot collapse");
+
+ // Collapse all the instructions.
+ while (!dv->Instrs.empty())
+ TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain);
+ dv->setSingleDomain(domain);
+
+ // If there are multiple users, give them new, unique DomainValues.
+ if (!LiveRegs.empty() && dv->Refs > 1)
+ for (unsigned rx = 0; rx != NumRegs; ++rx)
+ if (LiveRegs[rx] == dv)
+ setLiveReg(rx, alloc(domain));
+}
+
+bool ExecutionDomainFix::merge(DomainValue *A, DomainValue *B) {
+ assert(!A->isCollapsed() && "Cannot merge into collapsed");
+ assert(!B->isCollapsed() && "Cannot merge from collapsed");
+ if (A == B)
+ return true;
+ // Restrict to the domains that A and B have in common.
+ unsigned common = A->getCommonDomains(B->AvailableDomains);
+ if (!common)
+ return false;
+ A->AvailableDomains = common;
+ A->Instrs.append(B->Instrs.begin(), B->Instrs.end());
+
+ // Clear the old DomainValue so we won't try to swizzle instructions twice.
+ B->clear();
+ // All uses of B are referred to A.
+ B->Next = retain(A);
+
+ for (unsigned rx = 0; rx != NumRegs; ++rx) {
+ assert(!LiveRegs.empty() && "no space allocated for live registers");
+ if (LiveRegs[rx] == B)
+ setLiveReg(rx, A);
+ }
+ return true;
+}
+
+void ExecutionDomainFix::enterBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+
+ MachineBasicBlock *MBB = TraversedMBB.MBB;
+
+ // Set up LiveRegs to represent registers entering MBB.
+ // Set default domain values to 'no domain' (nullptr)
+ if (LiveRegs.empty())
+ LiveRegs.assign(NumRegs, nullptr);
+
+ // This is the entry block.
+ if (MBB->pred_empty()) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ": entry\n");
+ return;
+ }
+
+ // Try to coalesce live-out registers from predecessors.
+ for (MachineBasicBlock *pred : MBB->predecessors()) {
+ assert(unsigned(pred->getNumber()) < MBBOutRegsInfos.size() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ LiveRegsDVInfo &Incoming = MBBOutRegsInfos[pred->getNumber()];
+ // Incoming is null if this is a backedge from a BB
+ // we haven't processed yet
+ if (Incoming.empty())
+ continue;
+
+ for (unsigned rx = 0; rx != NumRegs; ++rx) {
+ DomainValue *pdv = resolve(Incoming[rx]);
+ if (!pdv)
+ continue;
+ if (!LiveRegs[rx]) {
+ setLiveReg(rx, pdv);
+ continue;
+ }
+
+ // We have a live DomainValue from more than one predecessor.
+ if (LiveRegs[rx]->isCollapsed()) {
+ // We are already collapsed, but predecessor is not. Force it.
+ unsigned Domain = LiveRegs[rx]->getFirstDomain();
+ if (!pdv->isCollapsed() && pdv->hasDomain(Domain))
+ collapse(pdv, Domain);
+ continue;
+ }
+
+ // Currently open, merge in predecessor.
+ if (!pdv->isCollapsed())
+ merge(LiveRegs[rx], pdv);
+ else
+ force(rx, pdv->getFirstDomain());
+ }
+ }
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB)
+ << (!TraversedMBB.IsDone ? ": incomplete\n"
+ : ": all preds known\n"));
+}
+
+void ExecutionDomainFix::leaveBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ unsigned MBBNumber = TraversedMBB.MBB->getNumber();
+ assert(MBBNumber < MBBOutRegsInfos.size() &&
+ "Unexpected basic block number.");
+ // Save register clearances at end of MBB - used by enterBasicBlock().
+ for (DomainValue *OldLiveReg : MBBOutRegsInfos[MBBNumber]) {
+ release(OldLiveReg);
+ }
+ MBBOutRegsInfos[MBBNumber] = LiveRegs;
+ LiveRegs.clear();
+}
+
+bool ExecutionDomainFix::visitInstr(MachineInstr *MI) {
+ // Update instructions with explicit execution domains.
+ std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
+ if (DomP.first) {
+ if (DomP.second)
+ visitSoftInstr(MI, DomP.second);
+ else
+ visitHardInstr(MI, DomP.first);
+ }
+
+ return !DomP.first;
+}
+
+void ExecutionDomainFix::processDefs(MachineInstr *MI, bool Kill) {
+ assert(!MI->isDebugInstr() && "Won't process debug values");
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned i = 0,
+ e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
+ i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse())
+ continue;
+ for (int rx : regIndices(MO.getReg())) {
+ // This instruction explicitly defines rx.
+ LLVM_DEBUG(dbgs() << printReg(RC->getRegister(rx), TRI) << ":\t" << *MI);
+
+ // Kill off domains redefined by generic instructions.
+ if (Kill)
+ kill(rx);
+ }
+ }
+}
+
+void ExecutionDomainFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
+ // Collapse all uses.
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands();
+ i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ force(rx, domain);
+ }
+ }
+
+ // Kill all defs and force them.
+ for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ kill(rx);
+ force(rx, domain);
+ }
+ }
+}
+
+void ExecutionDomainFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+ // Bitmask of available domains for this instruction after taking collapsed
+ // operands into account.
+ unsigned available = mask;
+
+ // Scan the explicit use operands for incoming domains.
+ SmallVector<int, 4> used;
+ if (!LiveRegs.empty())
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands();
+ i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ DomainValue *dv = LiveRegs[rx];
+ if (dv == nullptr)
+ continue;
+ // Bitmask of domains that dv and available have in common.
+ unsigned common = dv->getCommonDomains(available);
+ // Is it possible to use this collapsed register for free?
+ if (dv->isCollapsed()) {
+ // Restrict available domains to the ones in common with the operand.
+ // If there are no common domains, we must pay the cross-domain
+ // penalty for this operand.
+ if (common)
+ available = common;
+ } else if (common)
+ // Open DomainValue is compatible, save it for merging.
+ used.push_back(rx);
+ else
+ // Open DomainValue is not compatible with instruction. It is useless
+ // now.
+ kill(rx);
+ }
+ }
+
+ // If the collapsed operands force a single domain, propagate the collapse.
+ if (isPowerOf2_32(available)) {
+ unsigned domain = countTrailingZeros(available);
+ TII->setExecutionDomain(*mi, domain);
+ visitHardInstr(mi, domain);
+ return;
+ }
+
+ // Kill off any remaining uses that don't match available, and build a list of
+ // incoming DomainValues that we want to merge.
+ SmallVector<int, 4> Regs;
+ for (int rx : used) {
+ assert(!LiveRegs.empty() && "no space allocated for live registers");
+ DomainValue *&LR = LiveRegs[rx];
+ // This useless DomainValue could have been missed above.
+ if (!LR->getCommonDomains(available)) {
+ kill(rx);
+ continue;
+ }
+ // Sorted insertion.
+ // Enables giving priority to the latest domains during merging.
+ auto I = std::upper_bound(
+ Regs.begin(), Regs.end(), rx, [&](int LHS, const int RHS) {
+ return RDA->getReachingDef(mi, RC->getRegister(LHS)) <
+ RDA->getReachingDef(mi, RC->getRegister(RHS));
+ });
+ Regs.insert(I, rx);
+ }
+
+ // doms are now sorted in order of appearance. Try to merge them all, giving
+ // priority to the latest ones.
+ DomainValue *dv = nullptr;
+ while (!Regs.empty()) {
+ if (!dv) {
+ dv = LiveRegs[Regs.pop_back_val()];
+ // Force the first dv to match the current instruction.
+ dv->AvailableDomains = dv->getCommonDomains(available);
+ assert(dv->AvailableDomains && "Domain should have been filtered");
+ continue;
+ }
+
+ DomainValue *Latest = LiveRegs[Regs.pop_back_val()];
+ // Skip already merged values.
+ if (Latest == dv || Latest->Next)
+ continue;
+ if (merge(dv, Latest))
+ continue;
+
+ // If latest didn't merge, it is useless now. Kill all registers using it.
+ for (int i : used) {
+ assert(!LiveRegs.empty() && "no space allocated for live registers");
+ if (LiveRegs[i] == Latest)
+ kill(i);
+ }
+ }
+
+ // dv is the DomainValue we are going to use for this instruction.
+ if (!dv) {
+ dv = alloc();
+ dv->AvailableDomains = available;
+ }
+ dv->Instrs.push_back(mi);
+
+ // Finally set all defs and non-collapsed uses to dv. We must iterate through
+ // all the operators, including imp-def ones.
+ for (MachineOperand &mo : mi->operands()) {
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ if (!LiveRegs[rx] || (mo.isDef() && LiveRegs[rx] != dv)) {
+ kill(rx);
+ setLiveReg(rx, dv);
+ }
+ }
+ }
+}
+
+void ExecutionDomainFix::processBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+ enterBasicBlock(TraversedMBB);
+ // If this block is not done, it makes little sense to make any decisions
+ // based on clearance information. We need to make a second pass anyway,
+ // and by then we'll have better information, so we can avoid doing the work
+ // to try and break dependencies now.
+ for (MachineInstr &MI : *TraversedMBB.MBB) {
+ if (!MI.isDebugInstr()) {
+ bool Kill = false;
+ if (TraversedMBB.PrimaryPass)
+ Kill = visitInstr(&MI);
+ processDefs(&MI, Kill);
+ }
+ }
+ leaveBasicBlock(TraversedMBB);
+}
+
+bool ExecutionDomainFix::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+ MF = &mf;
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ LiveRegs.clear();
+ assert(NumRegs == RC->getNumRegs() && "Bad regclass");
+
+ LLVM_DEBUG(dbgs() << "********** FIX EXECUTION DOMAIN: "
+ << TRI->getRegClassName(RC) << " **********\n");
+
+ // If no relevant registers are used in the function, we can skip it
+ // completely.
+ bool anyregs = false;
+ const MachineRegisterInfo &MRI = mf.getRegInfo();
+ for (unsigned Reg : *RC) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ anyregs = true;
+ break;
+ }
+ }
+ if (!anyregs)
+ return false;
+
+ RDA = &getAnalysis<ReachingDefAnalysis>();
+
+ // Initialize the AliasMap on the first use.
+ if (AliasMap.empty()) {
+ // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
+ // therefore the LiveRegs array.
+ AliasMap.resize(TRI->getNumRegs());
+ for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
+ for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); AI.isValid();
+ ++AI)
+ AliasMap[*AI].push_back(i);
+ }
+
+ // Initialize the MBBOutRegsInfos
+ MBBOutRegsInfos.resize(mf.getNumBlockIDs());
+
+ // Traverse the basic blocks.
+ LoopTraversal Traversal;
+ LoopTraversal::TraversalOrder TraversedMBBOrder = Traversal.traverse(mf);
+ for (LoopTraversal::TraversedMBBInfo TraversedMBB : TraversedMBBOrder) {
+ processBasicBlock(TraversedMBB);
+ }
+
+ for (LiveRegsDVInfo OutLiveRegs : MBBOutRegsInfos) {
+ for (DomainValue *OutLiveReg : OutLiveRegs) {
+ if (OutLiveReg)
+ release(OutLiveReg);
+ }
+ }
+ MBBOutRegsInfos.clear();
+ Avail.clear();
+ Allocator.DestroyAll();
+
+ return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/ExpandMemCmp.cpp b/contrib/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 09c808463a41..d7562cbf1e90 100644
--- a/contrib/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/contrib/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -32,7 +32,7 @@ STATISTIC(NumMemCmpGreaterThanMax,
"Number of memcmp calls with size greater than max size");
STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
-static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(
"memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
cl::desc("The number of loads per basic block for inline expansion of "
"memcmp that is only being compared against zero."));
@@ -56,7 +56,7 @@ class MemCmpExpansion {
const uint64_t Size;
unsigned MaxLoadSize;
uint64_t NumLoadsNonOneByte;
- const uint64_t NumLoadsPerBlock;
+ const uint64_t NumLoadsPerBlockForZeroCmp;
std::vector<BasicBlock *> LoadCmpBlocks;
BasicBlock *EndBlock;
PHINode *PhiRes;
@@ -102,7 +102,7 @@ class MemCmpExpansion {
MemCmpExpansion(CallInst *CI, uint64_t Size,
const TargetTransformInfo::MemCmpExpansionOptions &Options,
unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
- unsigned NumLoadsPerBlock, const DataLayout &DL);
+ unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout);
unsigned getNumBlocks();
uint64_t getNumLoads() const { return LoadSequence.size(); }
@@ -122,12 +122,12 @@ MemCmpExpansion::MemCmpExpansion(
CallInst *const CI, uint64_t Size,
const TargetTransformInfo::MemCmpExpansionOptions &Options,
const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
- const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+ const unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout)
: CI(CI),
Size(Size),
MaxLoadSize(0),
NumLoadsNonOneByte(0),
- NumLoadsPerBlock(NumLoadsPerBlock),
+ NumLoadsPerBlockForZeroCmp(MaxLoadsPerBlockForZeroCmp),
IsUsedForZeroCmp(IsUsedForZeroCmp),
DL(TheDataLayout),
Builder(CI) {
@@ -171,8 +171,8 @@ MemCmpExpansion::MemCmpExpansion(
unsigned MemCmpExpansion::getNumBlocks() {
if (IsUsedForZeroCmp)
- return getNumLoads() / NumLoadsPerBlock +
- (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
+ return getNumLoads() / NumLoadsPerBlockForZeroCmp +
+ (getNumLoads() % NumLoadsPerBlockForZeroCmp != 0 ? 1 : 0);
return getNumLoads();
}
@@ -249,7 +249,7 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
Value *Diff;
const unsigned NumLoads =
- std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+ std::min(getNumLoads() - LoadIndex, NumLoadsPerBlockForZeroCmp);
// For a single-block expansion, start inserting before the memcmp call.
if (LoadCmpBlocks.empty())
@@ -519,8 +519,6 @@ Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
/// A memcmp expansion that only has one block of load and compare can bypass
/// the compare, branch, and phi IR that is required in the general case.
Value *MemCmpExpansion::getMemCmpOneBlock() {
- assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
-
Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
Value *Source1 = CI->getArgOperand(0);
Value *Source2 = CI->getArgOperand(1);
@@ -566,11 +564,8 @@ Value *MemCmpExpansion::getMemCmpOneBlock() {
// This function expands the memcmp call into an inline expansion and returns
// the memcmp result.
Value *MemCmpExpansion::getMemCmpExpansion() {
- // A memcmp with zero-comparison with only one block of load and compare does
- // not need to set up any extra blocks. This case could be handled in the DAG,
- // but since we have all of the machinery to flexibly expand any memcpy here,
- // we choose to handle this case too to avoid fragmented lowering.
- if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
+ // Create the basic block framework for a multi-block expansion.
+ if (getNumBlocks() != 1) {
BasicBlock *StartBlock = CI->getParent();
EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
setupEndBlockPHINodes();
@@ -596,8 +591,8 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
: getMemCmpExpansionZeroCase();
- // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
- if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
+ if (getNumBlocks() == 1)
+ return getMemCmpOneBlock();
for (unsigned I = 0; I < getNumBlocks(); ++I) {
emitLoadCompareBlock(I);
@@ -709,8 +704,12 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
const unsigned MaxNumLoads =
TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
+ unsigned NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()
+ ? MemCmpEqZeroNumLoadsPerBlock
+ : TLI->getMemcmpEqZeroLoadsPerBlock();
+
MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
- IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
+ IsUsedForZeroCmp, NumLoadsPerBlock, *DL);
// Don't expand if this will require more loads than desired by the target.
if (Expansion.getNumLoads() == 0) {
diff --git a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
index 6ef97d6dd5ec..bc747fc610f8 100644
--- a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -93,11 +93,11 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
assert(TargetRegisterInfo::isPhysicalRegister(InsReg) &&
"Inserted value must be in a physical register");
- DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
+ LLVM_DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
if (MI->allDefsAreDead()) {
MI->setDesc(TII->get(TargetOpcode::KILL));
- DEBUG(dbgs() << "subreg: replaced by: " << *MI);
+ LLVM_DEBUG(dbgs() << "subreg: replaced by: " << *MI);
return true;
}
@@ -110,10 +110,10 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
MI->setDesc(TII->get(TargetOpcode::KILL));
MI->RemoveOperand(3); // SubIdx
MI->RemoveOperand(1); // Imm
- DEBUG(dbgs() << "subreg: replace by: " << *MI);
+ LLVM_DEBUG(dbgs() << "subreg: replace by: " << *MI);
return true;
}
- DEBUG(dbgs() << "subreg: eliminated!");
+ LLVM_DEBUG(dbgs() << "subreg: eliminated!");
} else {
TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg,
MI->getOperand(2).isKill());
@@ -122,10 +122,10 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
MachineBasicBlock::iterator CopyMI = MI;
--CopyMI;
CopyMI->addRegisterDefined(DstReg);
- DEBUG(dbgs() << "subreg: " << *CopyMI);
+ LLVM_DEBUG(dbgs() << "subreg: " << *CopyMI);
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
MBB->erase(MI);
return true;
}
@@ -133,9 +133,9 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
if (MI->allDefsAreDead()) {
- DEBUG(dbgs() << "dead copy: " << *MI);
+ LLVM_DEBUG(dbgs() << "dead copy: " << *MI);
MI->setDesc(TII->get(TargetOpcode::KILL));
- DEBUG(dbgs() << "replaced by: " << *MI);
+ LLVM_DEBUG(dbgs() << "replaced by: " << *MI);
return true;
}
@@ -144,14 +144,15 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg());
if (IdentityCopy || SrcMO.isUndef()) {
- DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI);
+ LLVM_DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ")
+ << *MI);
// No need to insert an identity copy instruction, but replace with a KILL
// if liveness is changed.
if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
// We must make sure the super-register gets killed. Replace the
// instruction with KILL.
MI->setDesc(TII->get(TargetOpcode::KILL));
- DEBUG(dbgs() << "replaced by: " << *MI);
+ LLVM_DEBUG(dbgs() << "replaced by: " << *MI);
return true;
}
// Vanilla identity copy.
@@ -159,13 +160,13 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
return true;
}
- DEBUG(dbgs() << "real copy: " << *MI);
+ LLVM_DEBUG(dbgs() << "real copy: " << *MI);
TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(),
DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill());
if (MI->getNumOperands() > 2)
TransferImplicitOperands(MI);
- DEBUG({
+ LLVM_DEBUG({
MachineBasicBlock::iterator dMI = MI;
dbgs() << "replaced by: " << *(--dMI);
});
@@ -177,9 +178,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
/// copies.
///
bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "Machine Function\n"
- << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Machine Function\n"
+ << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
+ << "********** Function: " << MF.getName() << '\n');
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
diff --git a/contrib/llvm/lib/CodeGen/ExpandReductions.cpp b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp
index abf487a4f198..7552ba8cd85d 100644
--- a/contrib/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -78,13 +78,15 @@ RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
bool Changed = false;
- SmallVector<IntrinsicInst*, 4> Worklist;
+ SmallVector<IntrinsicInst *, 4> Worklist;
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
if (auto II = dyn_cast<IntrinsicInst>(&*I))
Worklist.push_back(II);
for (auto *II : Worklist) {
IRBuilder<> Builder(II);
+ bool IsOrdered = false;
+ Value *Acc = nullptr;
Value *Vec = nullptr;
auto ID = II->getIntrinsicID();
auto MRK = RecurrenceDescriptor::MRK_Invalid;
@@ -92,11 +94,10 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:
// FMFs must be attached to the call, otherwise it's an ordered reduction
- // and it can't be handled by generating this shuffle sequence.
- // TODO: Implement scalarization of ordered reductions here for targets
- // without native support.
+ // and it can't be handled by generating a shuffle sequence.
if (!II->getFastMathFlags().isFast())
- continue;
+ IsOrdered = true;
+ Acc = II->getArgOperand(0);
Vec = II->getArgOperand(1);
break;
case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +119,9 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
}
if (!TTI->shouldExpandReduction(II))
continue;
- auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+ Value *Rdx =
+ IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
+ : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
II->replaceAllUsesWith(Rdx);
II->eraseFromParent();
Changed = true;
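
The hunk above stops skipping non-fast FP reductions and instead routes them to an ordered expansion. The two strategies are not interchangeable for floating point: the shuffle expansion halves the vector and adds the halves pairwise, which reassociates the additions, while the ordered form accumulates strictly left to right. A minimal standalone sketch of the two orders in plain C++ (not the LLVM getOrderedReduction/getShuffleReduction helpers, just the arithmetic shape; power-of-two length assumed):

#include <cstdio>
#include <vector>

// Ordered reduction: fold the accumulator through the elements left to right,
// preserving the source rounding order (what a non-fast fadd reduction needs).
static float orderedFAdd(float Acc, const std::vector<float> &V) {
  for (float X : V)
    Acc += X;
  return Acc;
}

// Shuffle-style reduction: add the upper half onto the lower half and repeat.
// This reassociates, so it is only valid under fast-math flags.
static float shuffleFAdd(std::vector<float> V) {
  while (V.size() > 1) {
    size_t Half = V.size() / 2;
    for (size_t I = 0; I != Half; ++I)
      V[I] += V[I + Half];
    V.resize(Half);
  }
  return V.front();
}

int main() {
  // The reassociation is observable: 1.0f vanishes into 1e8f in one order only.
  std::vector<float> V{1e8f, 1.0f, -1e8f, 1.0f};
  std::printf("ordered=%g shuffle=%g\n", orderedFAdd(0.0f, V), shuffleFAdd(V));
  return 0;
}
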
diff --git a/contrib/llvm/lib/CodeGen/FaultMaps.cpp b/contrib/llvm/lib/CodeGen/FaultMaps.cpp
index 2924b011e0c1..361558a0e562 100644
--- a/contrib/llvm/lib/CodeGen/FaultMaps.cpp
+++ b/contrib/llvm/lib/CodeGen/FaultMaps.cpp
@@ -62,17 +62,17 @@ void FaultMaps::serializeToFaultMapSection() {
// Emit a dummy symbol to force section inclusion.
OS.EmitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_FaultMaps")));
- DEBUG(dbgs() << "********** Fault Map Output **********\n");
+ LLVM_DEBUG(dbgs() << "********** Fault Map Output **********\n");
// Header
OS.EmitIntValue(FaultMapVersion, 1); // Version.
OS.EmitIntValue(0, 1); // Reserved.
OS.EmitIntValue(0, 2); // Reserved.
- DEBUG(dbgs() << WFMP << "#functions = " << FunctionInfos.size() << "\n");
+ LLVM_DEBUG(dbgs() << WFMP << "#functions = " << FunctionInfos.size() << "\n");
OS.EmitIntValue(FunctionInfos.size(), 4);
- DEBUG(dbgs() << WFMP << "functions:\n");
+ LLVM_DEBUG(dbgs() << WFMP << "functions:\n");
for (const auto &FFI : FunctionInfos)
emitFunctionInfo(FFI.first, FFI.second);
@@ -82,25 +82,25 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
const FunctionFaultInfos &FFI) {
MCStreamer &OS = *AP.OutStreamer;
- DEBUG(dbgs() << WFMP << " function addr: " << *FnLabel << "\n");
+ LLVM_DEBUG(dbgs() << WFMP << " function addr: " << *FnLabel << "\n");
OS.EmitSymbolValue(FnLabel, 8);
- DEBUG(dbgs() << WFMP << " #faulting PCs: " << FFI.size() << "\n");
+ LLVM_DEBUG(dbgs() << WFMP << " #faulting PCs: " << FFI.size() << "\n");
OS.EmitIntValue(FFI.size(), 4);
OS.EmitIntValue(0, 4); // Reserved
for (auto &Fault : FFI) {
- DEBUG(dbgs() << WFMP << " fault type: "
- << faultTypeToString(Fault.Kind) << "\n");
+ LLVM_DEBUG(dbgs() << WFMP << " fault type: "
+ << faultTypeToString(Fault.Kind) << "\n");
OS.EmitIntValue(Fault.Kind, 4);
- DEBUG(dbgs() << WFMP << " faulting PC offset: "
- << *Fault.FaultingOffsetExpr << "\n");
+ LLVM_DEBUG(dbgs() << WFMP << " faulting PC offset: "
+ << *Fault.FaultingOffsetExpr << "\n");
OS.EmitValue(Fault.FaultingOffsetExpr, 4);
- DEBUG(dbgs() << WFMP << " fault handler PC offset: "
- << *Fault.HandlerOffsetExpr << "\n");
+ LLVM_DEBUG(dbgs() << WFMP << " fault handler PC offset: "
+ << *Fault.HandlerOffsetExpr << "\n");
OS.EmitValue(Fault.HandlerOffsetExpr, 4);
}
}
diff --git a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
index 9c71b18619a1..581cd423f2d4 100644
--- a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
+++ b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
@@ -41,8 +41,11 @@ INITIALIZE_PASS(FuncletLayout, DEBUG_TYPE,
"Contiguously Lay Out Funclets", false, false)
bool FuncletLayout::runOnMachineFunction(MachineFunction &F) {
+ // Even though this gets information from getEHScopeMembership(), this pass is
+ // only necessary for funclet-based EH personalities, in which these EH scopes
+ // are outlined at the end.
DenseMap<const MachineBasicBlock *, int> FuncletMembership =
- getFuncletMembership(F);
+ getEHScopeMembership(F);
if (FuncletMembership.empty())
return false;
diff --git a/contrib/llvm/lib/CodeGen/GCRootLowering.cpp b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp
index 4361d8b248c8..31ddeadbd97a 100644
--- a/contrib/llvm/lib/CodeGen/GCRootLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -38,7 +38,7 @@ namespace {
/// directed by the GCStrategy. It also performs automatic root initialization
/// and custom intrinsic lowering.
class LowerIntrinsics : public FunctionPass {
- bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
+ bool PerformDefaultLowering(Function &F, GCStrategy &S);
public:
static char ID;
@@ -61,7 +61,7 @@ class GCMachineCodeAnalysis : public MachineFunctionPass {
const TargetInstrInfo *TII;
void FindSafePoints(MachineFunction &MF);
- void VisitCallPoint(MachineBasicBlock::iterator MI);
+ void VisitCallPoint(MachineBasicBlock::iterator CI);
MCSymbol *InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL) const;
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 114c068749eb..07de31bec660 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -38,6 +38,9 @@ bool CallLowering::lowerCall(
ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{},
i < NumFixedArgs};
setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CS);
+ // We don't currently support swifterror or swiftself args.
+ if (OrigArg.Flags.isSwiftError() || OrigArg.Flags.isSwiftSelf())
+ return false;
OrigArgs.push_back(OrigArg);
++i;
}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
new file mode 100644
index 000000000000..0bc5b87de150
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -0,0 +1,81 @@
+//===-- lib/CodeGen/GlobalISel/Combiner.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains common code to combine machine functions at the generic
+// level.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/GlobalISel/GISelWorkList.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+Combiner::Combiner(CombinerInfo &Info, const TargetPassConfig *TPC)
+ : CInfo(Info), TPC(TPC) {
+ (void)this->TPC; // FIXME: Remove when used.
+}
+
+bool Combiner::combineMachineInstrs(MachineFunction &MF) {
+ // If the ISel pipeline failed, do not bother running this pass.
+  // FIXME: Should this be here or in individual combiner passes?
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ MRI = &MF.getRegInfo();
+ Builder.setMF(MF);
+
+ LLVM_DEBUG(dbgs() << "Generic MI Combiner for: " << MF.getName() << '\n');
+
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
+
+ bool MFChanged = false;
+ bool Changed;
+
+ do {
+      // Collect all instructions. Do a post-order traversal of the basic blocks
+      // and insert into the list bottom-up, so that popping from the back
+      // visits them in top-down RPO order.
+ Changed = false;
+ GISelWorkList<512> WorkList;
+ for (MachineBasicBlock *MBB : post_order(&MF)) {
+ if (MBB->empty())
+ continue;
+ for (auto MII = MBB->rbegin(), MIE = MBB->rend(); MII != MIE;) {
+ MachineInstr *CurMI = &*MII;
+ ++MII;
+ // Erase dead insts before even adding to the list.
+ if (isTriviallyDead(*CurMI, *MRI)) {
+ LLVM_DEBUG(dbgs() << *CurMI << "Is dead; erasing.\n");
+ CurMI->eraseFromParentAndMarkDBGValuesForRemoval();
+ continue;
+ }
+ WorkList.insert(CurMI);
+ }
+ }
+ // Main Loop. Process the instructions here.
+ while (!WorkList.empty()) {
+ MachineInstr *CurrInst = WorkList.pop_back_val();
+ LLVM_DEBUG(dbgs() << "Try combining " << *CurrInst << "\n";);
+ Changed |= CInfo.combine(*CurrInst, Builder);
+ }
+ MFChanged |= Changed;
+ } while (Changed);
+
+ return MFChanged;
+}
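
The driver above re-seeds a worklist from a post-order walk and iterates until no combine fires. The same fixed-point shape, stripped of the MachineInstr machinery, as a standalone plain-C++ sketch (Item and tryCombine are made-up stand-ins for MachineInstr and CombinerInfo::combine):

#include <cstdio>
#include <vector>

struct Item {
  int Value;
};

// Stand-in for a single combine attempt: halve even values, report progress.
static bool tryCombine(Item &I) {
  if (I.Value > 1 && I.Value % 2 == 0) {
    I.Value /= 2;
    return true;
  }
  return false;
}

// Same loop structure as the combiner driver: fill the worklist, drain it,
// and start another full round whenever the previous round changed anything.
static bool combineAll(std::vector<Item> &Items) {
  bool EverChanged = false;
  bool Changed;
  do {
    Changed = false;
    std::vector<Item *> WorkList;
    for (Item &I : Items)
      WorkList.push_back(&I);
    while (!WorkList.empty()) {
      Item *I = WorkList.back();
      WorkList.pop_back();
      Changed |= tryCombine(*I);
    }
    EverChanged |= Changed;
  } while (Changed);
  return EverChanged;
}

int main() {
  std::vector<Item> Items{{12}, {7}, {40}};
  combineAll(Items);
  for (const Item &I : Items)
    std::printf("%d ", I.Value); // prints: 3 7 5
  std::printf("\n");
  return 0;
}
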
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
new file mode 100644
index 000000000000..44e904a6391b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -0,0 +1,41 @@
+//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "gi-combine"
+
+using namespace llvm;
+
+CombinerHelper::CombinerHelper(MachineIRBuilder &B) :
+ Builder(B), MRI(Builder.getMF().getRegInfo()) {}
+
+bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
+ if (MI.getOpcode() != TargetOpcode::COPY)
+ return false;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+ // Simple Copy Propagation.
+ // a(sx) = COPY b(sx) -> Replace all uses of a with b.
+ if (DstTy.isValid() && SrcTy.isValid() && DstTy == SrcTy) {
+ MI.eraseFromParent();
+ MRI.replaceRegWith(DstReg, SrcReg);
+ return true;
+ }
+ return false;
+}
+
+bool CombinerHelper::tryCombine(MachineInstr &MI) {
+ return tryCombineCopy(MI);
+}
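
tryCombineCopy above erases a same-type COPY and redirects every use of the destination register to the source. The substitution itself, with a hypothetical Inst record standing in for MachineInstr and virtual registers modelled as plain ints, looks roughly like this:

#include <cstdio>
#include <map>
#include <vector>

// A toy instruction: defines one id, uses some ids; a COPY has exactly one use.
struct Inst {
  int Def;
  std::vector<int> Uses;
  bool IsCopy;
};

// For every "Dst = COPY Src", rewrite later uses of Dst to Src and drop the
// copy (the real combine additionally checks that both LLTs match).
static void propagateCopies(std::vector<Inst> &Prog) {
  std::map<int, int> Alias;
  std::vector<Inst> Out;
  for (Inst I : Prog) {
    for (int &U : I.Uses) {
      auto It = Alias.find(U);
      if (It != Alias.end())
        U = It->second;
    }
    if (I.IsCopy) {
      Alias[I.Def] = I.Uses[0]; // remember Dst -> Src, erase the COPY
      continue;
    }
    Out.push_back(I);
  }
  Prog = Out;
}

int main() {
  // %1 = ...; %2 = COPY %1; %3 = add %2, %2   ==>   %1 = ...; %3 = add %1, %1
  std::vector<Inst> Prog{{1, {}, false}, {2, {1}, true}, {3, {2, 2}, false}};
  propagateCopies(Prog);
  for (const Inst &I : Prog) {
    std::printf("%%%d =", I.Def);
    for (int U : I.Uses)
      std::printf(" %%%d", U);
    std::printf("\n");
  }
  return 0;
}
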
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index a329a71e2c95..bafb7a05536d 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -102,37 +103,103 @@ IRTranslator::IRTranslator() : MachineFunctionPass(ID) {
}
void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<StackProtector>();
AU.addRequired<TargetPassConfig>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
-unsigned IRTranslator::getOrCreateVReg(const Value &Val) {
- unsigned &ValReg = ValToVReg[&Val];
+static void computeValueLLTs(const DataLayout &DL, Type &Ty,
+ SmallVectorImpl<LLT> &ValueTys,
+ SmallVectorImpl<uint64_t> *Offsets = nullptr,
+ uint64_t StartingOffset = 0) {
+ // Given a struct type, recursively traverse the elements.
+ if (StructType *STy = dyn_cast<StructType>(&Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ for (unsigned I = 0, E = STy->getNumElements(); I != E; ++I)
+ computeValueLLTs(DL, *STy->getElementType(I), ValueTys, Offsets,
+ StartingOffset + SL->getElementOffset(I));
+ return;
+ }
+ // Given an array type, recursively traverse the elements.
+ if (ArrayType *ATy = dyn_cast<ArrayType>(&Ty)) {
+ Type *EltTy = ATy->getElementType();
+ uint64_t EltSize = DL.getTypeAllocSize(EltTy);
+ for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
+ computeValueLLTs(DL, *EltTy, ValueTys, Offsets,
+ StartingOffset + i * EltSize);
+ return;
+ }
+ // Interpret void as zero return values.
+ if (Ty.isVoidTy())
+ return;
+ // Base case: we can get an LLT for this LLVM IR type.
+ ValueTys.push_back(getLLTForType(Ty, DL));
+ if (Offsets != nullptr)
+ Offsets->push_back(StartingOffset * 8);
+}
+
+IRTranslator::ValueToVRegInfo::VRegListT &
+IRTranslator::allocateVRegs(const Value &Val) {
+ assert(!VMap.contains(Val) && "Value already allocated in VMap");
+ auto *Regs = VMap.getVRegs(Val);
+ auto *Offsets = VMap.getOffsets(Val);
+ SmallVector<LLT, 4> SplitTys;
+ computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ Offsets->empty() ? Offsets : nullptr);
+ for (unsigned i = 0; i < SplitTys.size(); ++i)
+ Regs->push_back(0);
+ return *Regs;
+}
+
+ArrayRef<unsigned> IRTranslator::getOrCreateVRegs(const Value &Val) {
+ auto VRegsIt = VMap.findVRegs(Val);
+ if (VRegsIt != VMap.vregs_end())
+ return *VRegsIt->second;
- if (ValReg)
- return ValReg;
+ if (Val.getType()->isVoidTy())
+ return *VMap.getVRegs(Val);
+
+ // Create entry for this type.
+ auto *VRegs = VMap.getVRegs(Val);
+ auto *Offsets = VMap.getOffsets(Val);
- // Fill ValRegsSequence with the sequence of registers
- // we need to concat together to produce the value.
assert(Val.getType()->isSized() &&
"Don't know how to create an empty vreg");
- unsigned VReg =
- MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL));
- ValReg = VReg;
- if (auto CV = dyn_cast<Constant>(&Val)) {
- bool Success = translate(*CV, VReg);
+ SmallVector<LLT, 4> SplitTys;
+ computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ Offsets->empty() ? Offsets : nullptr);
+
+ if (!isa<Constant>(Val)) {
+ for (auto Ty : SplitTys)
+ VRegs->push_back(MRI->createGenericVirtualRegister(Ty));
+ return *VRegs;
+ }
+
+ if (Val.getType()->isAggregateType()) {
+ // UndefValue, ConstantAggregateZero
+ auto &C = cast<Constant>(Val);
+ unsigned Idx = 0;
+ while (auto Elt = C.getAggregateElement(Idx++)) {
+ auto EltRegs = getOrCreateVRegs(*Elt);
+ std::copy(EltRegs.begin(), EltRegs.end(), std::back_inserter(*VRegs));
+ }
+ } else {
+ assert(SplitTys.size() == 1 && "unexpectedly split LLT");
+ VRegs->push_back(MRI->createGenericVirtualRegister(SplitTys[0]));
+ bool Success = translate(cast<Constant>(Val), VRegs->front());
if (!Success) {
OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
MF->getFunction().getSubprogram(),
&MF->getFunction().getEntryBlock());
R << "unable to translate constant: " << ore::NV("Type", Val.getType());
reportTranslationError(*MF, *TPC, *ORE, R);
- return VReg;
+ return *VRegs;
}
}
- return VReg;
+ return *VRegs;
}
int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) {
@@ -164,6 +231,20 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) {
} else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
Alignment = LI->getAlignment();
ValTy = LI->getType();
+ } else if (const AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+ // TODO(PR27168): This instruction has no alignment attribute, but unlike
+ // the default alignment for load/store, the default here is to assume
+ // it has NATURAL alignment, not DataLayout-specified alignment.
+ const DataLayout &DL = AI->getModule()->getDataLayout();
+ Alignment = DL.getTypeStoreSize(AI->getCompareOperand()->getType());
+ ValTy = AI->getCompareOperand()->getType();
+ } else if (const AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
+ // TODO(PR27168): This instruction has no alignment attribute, but unlike
+ // the default alignment for load/store, the default here is to assume
+ // it has NATURAL alignment, not DataLayout-specified alignment.
+ const DataLayout &DL = AI->getModule()->getDataLayout();
+ Alignment = DL.getTypeStoreSize(AI->getValOperand()->getType());
+ ValTy = AI->getType();
} else {
OptimizationRemarkMissed R("gisel-irtranslator", "", &I);
R << "unable to translate memop: " << ore::NV("Opcode", &I);
@@ -243,7 +324,11 @@ bool IRTranslator::translateRet(const User &U, MachineIRBuilder &MIRBuilder) {
// The target may mess up with the insertion point, but
// this is not important as a return is the last instruction
// of the block anyway.
- return CLI->lowerReturn(MIRBuilder, Ret, !Ret ? 0 : getOrCreateVReg(*Ret));
+
+ // FIXME: this interface should simplify when CallLowering gets adapted to
+ // multiple VRegs per Value.
+ unsigned VReg = Ret ? packRegs(*Ret, MIRBuilder) : 0;
+ return CLI->lowerReturn(MIRBuilder, Ret, VReg);
}
bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
@@ -342,15 +427,23 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
if (DL->getTypeStoreSize(LI.getType()) == 0)
return true;
- unsigned Res = getOrCreateVReg(LI);
- unsigned Addr = getOrCreateVReg(*LI.getPointerOperand());
+ ArrayRef<unsigned> Regs = getOrCreateVRegs(LI);
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(LI);
+ unsigned Base = getOrCreateVReg(*LI.getPointerOperand());
+
+ for (unsigned i = 0; i < Regs.size(); ++i) {
+ unsigned Addr = 0;
+ MIRBuilder.materializeGEP(Addr, Base, LLT::scalar(64), Offsets[i] / 8);
+
+ MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8);
+ unsigned BaseAlign = getMemOpAlignment(LI);
+ auto MMO = MF->getMachineMemOperand(
+ Ptr, Flags, (MRI->getType(Regs[i]).getSizeInBits() + 7) / 8,
+ MinAlign(BaseAlign, Offsets[i] / 8), AAMDNodes(), nullptr,
+ LI.getSyncScopeID(), LI.getOrdering());
+ MIRBuilder.buildLoad(Regs[i], Addr, *MMO);
+ }
- MIRBuilder.buildLoad(
- Res, Addr,
- *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()),
- Flags, DL->getTypeStoreSize(LI.getType()),
- getMemOpAlignment(LI), AAMDNodes(), nullptr,
- LI.getSyncScopeID(), LI.getOrdering()));
return true;
}
@@ -363,50 +456,61 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
if (DL->getTypeStoreSize(SI.getValueOperand()->getType()) == 0)
return true;
- unsigned Val = getOrCreateVReg(*SI.getValueOperand());
- unsigned Addr = getOrCreateVReg(*SI.getPointerOperand());
-
- MIRBuilder.buildStore(
- Val, Addr,
- *MF->getMachineMemOperand(
- MachinePointerInfo(SI.getPointerOperand()), Flags,
- DL->getTypeStoreSize(SI.getValueOperand()->getType()),
- getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSyncScopeID(),
- SI.getOrdering()));
+ ArrayRef<unsigned> Vals = getOrCreateVRegs(*SI.getValueOperand());
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(*SI.getValueOperand());
+ unsigned Base = getOrCreateVReg(*SI.getPointerOperand());
+
+ for (unsigned i = 0; i < Vals.size(); ++i) {
+ unsigned Addr = 0;
+ MIRBuilder.materializeGEP(Addr, Base, LLT::scalar(64), Offsets[i] / 8);
+
+ MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8);
+ unsigned BaseAlign = getMemOpAlignment(SI);
+ auto MMO = MF->getMachineMemOperand(
+ Ptr, Flags, (MRI->getType(Vals[i]).getSizeInBits() + 7) / 8,
+ MinAlign(BaseAlign, Offsets[i] / 8), AAMDNodes(), nullptr,
+ SI.getSyncScopeID(), SI.getOrdering());
+ MIRBuilder.buildStore(Vals[i], Addr, *MMO);
+ }
return true;
}
-bool IRTranslator::translateExtractValue(const User &U,
- MachineIRBuilder &MIRBuilder) {
+static uint64_t getOffsetFromIndices(const User &U, const DataLayout &DL) {
const Value *Src = U.getOperand(0);
Type *Int32Ty = Type::getInt32Ty(U.getContext());
- SmallVector<Value *, 1> Indices;
-
- // If Src is a single element ConstantStruct, translate extractvalue
- // to that element to avoid inserting a cast instruction.
- if (auto CS = dyn_cast<ConstantStruct>(Src))
- if (CS->getNumOperands() == 1) {
- unsigned Res = getOrCreateVReg(*CS->getOperand(0));
- ValToVReg[&U] = Res;
- return true;
- }
// getIndexedOffsetInType is designed for GEPs, so the first index is the
// usual array element rather than looking into the actual aggregate.
+ SmallVector<Value *, 1> Indices;
Indices.push_back(ConstantInt::get(Int32Ty, 0));
if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(&U)) {
for (auto Idx : EVI->indices())
Indices.push_back(ConstantInt::get(Int32Ty, Idx));
+ } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(&U)) {
+ for (auto Idx : IVI->indices())
+ Indices.push_back(ConstantInt::get(Int32Ty, Idx));
} else {
for (unsigned i = 1; i < U.getNumOperands(); ++i)
Indices.push_back(U.getOperand(i));
}
- uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices);
+ return 8 * static_cast<uint64_t>(
+ DL.getIndexedOffsetInType(Src->getType(), Indices));
+}
- unsigned Res = getOrCreateVReg(U);
- MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset);
+bool IRTranslator::translateExtractValue(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const Value *Src = U.getOperand(0);
+ uint64_t Offset = getOffsetFromIndices(U, *DL);
+ ArrayRef<unsigned> SrcRegs = getOrCreateVRegs(*Src);
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(*Src);
+ unsigned Idx = std::lower_bound(Offsets.begin(), Offsets.end(), Offset) -
+ Offsets.begin();
+ auto &DstRegs = allocateVRegs(U);
+
+ for (unsigned i = 0; i < DstRegs.size(); ++i)
+ DstRegs[i] = SrcRegs[Idx++];
return true;
}
@@ -414,37 +518,33 @@ bool IRTranslator::translateExtractValue(const User &U,
bool IRTranslator::translateInsertValue(const User &U,
MachineIRBuilder &MIRBuilder) {
const Value *Src = U.getOperand(0);
- Type *Int32Ty = Type::getInt32Ty(U.getContext());
- SmallVector<Value *, 1> Indices;
-
- // getIndexedOffsetInType is designed for GEPs, so the first index is the
- // usual array element rather than looking into the actual aggregate.
- Indices.push_back(ConstantInt::get(Int32Ty, 0));
-
- if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(&U)) {
- for (auto Idx : IVI->indices())
- Indices.push_back(ConstantInt::get(Int32Ty, Idx));
- } else {
- for (unsigned i = 2; i < U.getNumOperands(); ++i)
- Indices.push_back(U.getOperand(i));
+ uint64_t Offset = getOffsetFromIndices(U, *DL);
+ auto &DstRegs = allocateVRegs(U);
+ ArrayRef<uint64_t> DstOffsets = *VMap.getOffsets(U);
+ ArrayRef<unsigned> SrcRegs = getOrCreateVRegs(*Src);
+ ArrayRef<unsigned> InsertedRegs = getOrCreateVRegs(*U.getOperand(1));
+ auto InsertedIt = InsertedRegs.begin();
+
+ for (unsigned i = 0; i < DstRegs.size(); ++i) {
+ if (DstOffsets[i] >= Offset && InsertedIt != InsertedRegs.end())
+ DstRegs[i] = *InsertedIt++;
+ else
+ DstRegs[i] = SrcRegs[i];
}
- uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices);
-
- unsigned Res = getOrCreateVReg(U);
- unsigned Inserted = getOrCreateVReg(*U.getOperand(1));
- MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), Inserted, Offset);
-
return true;
}
bool IRTranslator::translateSelect(const User &U,
MachineIRBuilder &MIRBuilder) {
- unsigned Res = getOrCreateVReg(U);
unsigned Tst = getOrCreateVReg(*U.getOperand(0));
- unsigned Op0 = getOrCreateVReg(*U.getOperand(1));
- unsigned Op1 = getOrCreateVReg(*U.getOperand(2));
- MIRBuilder.buildSelect(Res, Tst, Op0, Op1);
+ ArrayRef<unsigned> ResRegs = getOrCreateVRegs(U);
+ ArrayRef<unsigned> Op0Regs = getOrCreateVRegs(*U.getOperand(1));
+ ArrayRef<unsigned> Op1Regs = getOrCreateVRegs(*U.getOperand(2));
+
+ for (unsigned i = 0; i < ResRegs.size(); ++i)
+ MIRBuilder.buildSelect(ResRegs[i], Tst, Op0Regs[i], Op1Regs[i]);
+
return true;
}
@@ -453,15 +553,16 @@ bool IRTranslator::translateBitCast(const User &U,
// If we're bitcasting to the source type, we can reuse the source vreg.
if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
getLLTForType(*U.getType(), *DL)) {
- // Get the source vreg now, to avoid invalidating ValToVReg.
unsigned SrcReg = getOrCreateVReg(*U.getOperand(0));
- unsigned &Reg = ValToVReg[&U];
+ auto &Regs = *VMap.getVRegs(U);
// If we already assigned a vreg for this bitcast, we can't change that.
// Emit a copy to satisfy the users we already emitted.
- if (Reg)
- MIRBuilder.buildCopy(Reg, SrcReg);
- else
- Reg = SrcReg;
+ if (!Regs.empty())
+ MIRBuilder.buildCopy(Regs[0], SrcReg);
+ else {
+ Regs.push_back(SrcReg);
+ VMap.getOffsets(U)->push_back(0);
+ }
return true;
}
return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
@@ -516,10 +617,6 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Offset = 0;
}
- // N = N + Idx * ElementSize;
- unsigned ElementSizeReg =
- getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize));
-
unsigned IdxReg = getOrCreateVReg(*Idx);
if (MRI->getType(IdxReg) != OffsetTy) {
unsigned NewIdxReg = MRI->createGenericVirtualRegister(OffsetTy);
@@ -527,11 +624,20 @@ bool IRTranslator::translateGetElementPtr(const User &U,
IdxReg = NewIdxReg;
}
- unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildMul(OffsetReg, ElementSizeReg, IdxReg);
+ // N = N + Idx * ElementSize;
+ // Avoid doing it for ElementSize of 1.
+ unsigned GepOffsetReg;
+ if (ElementSize != 1) {
+ unsigned ElementSizeReg =
+ getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize));
+
+ GepOffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
+ MIRBuilder.buildMul(GepOffsetReg, ElementSizeReg, IdxReg);
+ } else
+ GepOffsetReg = IdxReg;
unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg);
+ MIRBuilder.buildGEP(NewBaseReg, BaseReg, GepOffsetReg);
BaseReg = NewBaseReg;
}
}
@@ -607,14 +713,10 @@ void IRTranslator::getStackGuard(unsigned DstReg,
bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
MachineIRBuilder &MIRBuilder) {
- LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL);
- LLT s1 = LLT::scalar(1);
- unsigned Width = Ty.getSizeInBits();
- unsigned Res = MRI->createGenericVirtualRegister(Ty);
- unsigned Overflow = MRI->createGenericVirtualRegister(s1);
+ ArrayRef<unsigned> ResRegs = getOrCreateVRegs(CI);
auto MIB = MIRBuilder.buildInstr(Op)
- .addDef(Res)
- .addDef(Overflow)
+ .addDef(ResRegs[0])
+ .addDef(ResRegs[1])
.addUse(getOrCreateVReg(*CI.getOperand(0)))
.addUse(getOrCreateVReg(*CI.getOperand(1)));
@@ -624,7 +726,6 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
MIB.addUse(Zero);
}
- MIRBuilder.buildSequence(getOrCreateVReg(CI), {Res, Overflow}, {0, Width});
return true;
}
@@ -647,7 +748,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
const Value *Address = DI.getAddress();
if (!Address || isa<UndefValue>(Address)) {
- DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
return true;
}
@@ -741,6 +842,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
.addDef(getOrCreateVReg(CI))
.addUse(getOrCreateVReg(*CI.getArgOperand(0)));
return true;
+ case Intrinsic::fabs:
+ MIRBuilder.buildInstr(TargetOpcode::G_FABS)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
case Intrinsic::fma:
MIRBuilder.buildInstr(TargetOpcode::G_FMA)
.addDef(getOrCreateVReg(CI))
@@ -748,6 +854,25 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
.addUse(getOrCreateVReg(*CI.getArgOperand(1)))
.addUse(getOrCreateVReg(*CI.getArgOperand(2)));
return true;
+ case Intrinsic::fmuladd: {
+ const TargetMachine &TM = MF->getTarget();
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ unsigned Dst = getOrCreateVReg(CI);
+ unsigned Op0 = getOrCreateVReg(*CI.getArgOperand(0));
+ unsigned Op1 = getOrCreateVReg(*CI.getArgOperand(1));
+ unsigned Op2 = getOrCreateVReg(*CI.getArgOperand(2));
+ if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
+ TLI.isFMAFasterThanFMulAndFAdd(TLI.getValueType(*DL, CI.getType()))) {
+ // TODO: Revisit this to see if we should move this part of the
+ // lowering to the combiner.
+ MIRBuilder.buildInstr(TargetOpcode::G_FMA, Dst, Op0, Op1, Op2);
+ } else {
+ LLT Ty = getLLTForType(*CI.getType(), *DL);
+ auto FMul = MIRBuilder.buildInstr(TargetOpcode::G_FMUL, Ty, Op0, Op1);
+ MIRBuilder.buildInstr(TargetOpcode::G_FADD, Dst, FMul, Op2);
+ }
+ return true;
+ }
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
@@ -807,6 +932,34 @@ bool IRTranslator::translateInlineAsm(const CallInst &CI,
return true;
}
+unsigned IRTranslator::packRegs(const Value &V,
+ MachineIRBuilder &MIRBuilder) {
+ ArrayRef<unsigned> Regs = getOrCreateVRegs(V);
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(V);
+ LLT BigTy = getLLTForType(*V.getType(), *DL);
+
+ if (Regs.size() == 1)
+ return Regs[0];
+
+ unsigned Dst = MRI->createGenericVirtualRegister(BigTy);
+ MIRBuilder.buildUndef(Dst);
+ for (unsigned i = 0; i < Regs.size(); ++i) {
+ unsigned NewDst = MRI->createGenericVirtualRegister(BigTy);
+ MIRBuilder.buildInsert(NewDst, Dst, Regs[i], Offsets[i]);
+ Dst = NewDst;
+ }
+ return Dst;
+}
+
+void IRTranslator::unpackRegs(const Value &V, unsigned Src,
+ MachineIRBuilder &MIRBuilder) {
+ ArrayRef<unsigned> Regs = getOrCreateVRegs(V);
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(V);
+
+ for (unsigned i = 0; i < Regs.size(); ++i)
+ MIRBuilder.buildExtract(Regs[i], Src, Offsets[i]);
+}
+
bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
const CallInst &CI = cast<CallInst>(U);
auto TII = MF->getTarget().getIntrinsicInfo();
@@ -826,16 +979,24 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
ID = static_cast<Intrinsic::ID>(TII->getIntrinsicID(F));
}
+ bool IsSplitType = valueIsSplit(CI);
if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic) {
- unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI);
+ unsigned Res = IsSplitType ? MRI->createGenericVirtualRegister(
+ getLLTForType(*CI.getType(), *DL))
+ : getOrCreateVReg(CI);
+
SmallVector<unsigned, 8> Args;
for (auto &Arg: CI.arg_operands())
- Args.push_back(getOrCreateVReg(*Arg));
+ Args.push_back(packRegs(*Arg, MIRBuilder));
MF->getFrameInfo().setHasCalls(true);
- return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() {
+ bool Success = CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() {
return getOrCreateVReg(*CI.getCalledValue());
});
+
+ if (IsSplitType)
+ unpackRegs(CI, Res, MIRBuilder);
+ return Success;
}
assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");
@@ -843,7 +1004,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (translateKnownIntrinsic(CI, ID, MIRBuilder))
return true;
- unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI);
+ unsigned Res = 0;
+ if (!CI.getType()->isVoidTy()) {
+ if (IsSplitType)
+ Res =
+ MRI->createGenericVirtualRegister(getLLTForType(*CI.getType(), *DL));
+ else
+ Res = getOrCreateVReg(CI);
+ }
MachineInstrBuilder MIB =
MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory());
@@ -851,9 +1019,12 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
// Some intrinsics take metadata parameters. Reject them.
if (isa<MetadataAsValue>(Arg))
return false;
- MIB.addUse(getOrCreateVReg(*Arg));
+ MIB.addUse(packRegs(*Arg, MIRBuilder));
}
+ if (IsSplitType)
+ unpackRegs(CI, Res, MIRBuilder);
+
// Add a MachineMemOperand if it is a target mem intrinsic.
const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
TargetLowering::IntrinsicInfo Info;
@@ -897,15 +1068,18 @@ bool IRTranslator::translateInvoke(const User &U,
MCSymbol *BeginSymbol = Context.createTempSymbol();
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
- unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I);
+ unsigned Res =
+ MRI->createGenericVirtualRegister(getLLTForType(*I.getType(), *DL));
SmallVector<unsigned, 8> Args;
for (auto &Arg: I.arg_operands())
- Args.push_back(getOrCreateVReg(*Arg));
+ Args.push_back(packRegs(*Arg, MIRBuilder));
if (!CLI->lowerCall(MIRBuilder, &I, Res, Args,
[&]() { return getOrCreateVReg(*I.getCalledValue()); }))
return false;
+ unpackRegs(I, Res, MIRBuilder);
+
MCSymbol *EndSymbol = Context.createTempSymbol();
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);
@@ -964,27 +1138,18 @@ bool IRTranslator::translateLandingPad(const User &U,
return false;
MBB.addLiveIn(ExceptionReg);
- unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]),
- Tmp = MRI->createGenericVirtualRegister(Ty);
- MIRBuilder.buildCopy(VReg, ExceptionReg);
- MIRBuilder.buildInsert(Tmp, Undef, VReg, 0);
+ ArrayRef<unsigned> ResRegs = getOrCreateVRegs(LP);
+ MIRBuilder.buildCopy(ResRegs[0], ExceptionReg);
unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
if (!SelectorReg)
return false;
MBB.addLiveIn(SelectorReg);
-
- // N.b. the exception selector register always has pointer type and may not
- // match the actual IR-level type in the landingpad so an extra cast is
- // needed.
unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]);
MIRBuilder.buildCopy(PtrVReg, SelectorReg);
+ MIRBuilder.buildCast(ResRegs[1], PtrVReg);
- VReg = MRI->createGenericVirtualRegister(Tys[1]);
- MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg);
- MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg,
- Tys[0].getSizeInBits());
return true;
}
@@ -992,6 +1157,9 @@ bool IRTranslator::translateAlloca(const User &U,
MachineIRBuilder &MIRBuilder) {
auto &AI = cast<AllocaInst>(U);
+ if (AI.isSwiftError())
+ return false;
+
if (AI.isStaticAlloca()) {
unsigned Res = getOrCreateVReg(AI);
int FI = getOrCreateFrameIndex(AI);
@@ -999,6 +1167,10 @@ bool IRTranslator::translateAlloca(const User &U,
return true;
}
+ // FIXME: support stack probing for Windows.
+ if (MF->getTarget().getTargetTriple().isOSWindows())
+ return false;
+
// Now we're in the harder dynamic case.
Type *Ty = AI.getAllocatedType();
unsigned Align =
@@ -1070,9 +1242,16 @@ bool IRTranslator::translateInsertElement(const User &U,
// not a legal vector type in LLT.
if (U.getType()->getVectorNumElements() == 1) {
unsigned Elt = getOrCreateVReg(*U.getOperand(1));
- ValToVReg[&U] = Elt;
+ auto &Regs = *VMap.getVRegs(U);
+ if (Regs.empty()) {
+ Regs.push_back(Elt);
+ VMap.getOffsets(U)->push_back(0);
+ } else {
+ MIRBuilder.buildCopy(Regs[0], Elt);
+ }
return true;
}
+
unsigned Res = getOrCreateVReg(U);
unsigned Val = getOrCreateVReg(*U.getOperand(0));
unsigned Elt = getOrCreateVReg(*U.getOperand(1));
@@ -1087,7 +1266,13 @@ bool IRTranslator::translateExtractElement(const User &U,
// not a legal vector type in LLT.
if (U.getOperand(0)->getType()->getVectorNumElements() == 1) {
unsigned Elt = getOrCreateVReg(*U.getOperand(0));
- ValToVReg[&U] = Elt;
+ auto &Regs = *VMap.getVRegs(U);
+ if (Regs.empty()) {
+ Regs.push_back(Elt);
+ VMap.getOffsets(U)->push_back(0);
+ } else {
+ MIRBuilder.buildCopy(Regs[0], Elt);
+ }
return true;
}
unsigned Res = getOrCreateVReg(U);
@@ -1109,17 +1294,115 @@ bool IRTranslator::translateShuffleVector(const User &U,
bool IRTranslator::translatePHI(const User &U, MachineIRBuilder &MIRBuilder) {
const PHINode &PI = cast<PHINode>(U);
- auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
- MIB.addDef(getOrCreateVReg(PI));
- PendingPHIs.emplace_back(&PI, MIB.getInstr());
+ SmallVector<MachineInstr *, 4> Insts;
+ for (auto Reg : getOrCreateVRegs(PI)) {
+ auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, Reg);
+ Insts.push_back(MIB.getInstr());
+ }
+
+ PendingPHIs.emplace_back(&PI, std::move(Insts));
+ return true;
+}
+
+bool IRTranslator::translateAtomicCmpXchg(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const AtomicCmpXchgInst &I = cast<AtomicCmpXchgInst>(U);
+
+ if (I.isWeak())
+ return false;
+
+ auto Flags = I.isVolatile() ? MachineMemOperand::MOVolatile
+ : MachineMemOperand::MONone;
+ Flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ Type *ResType = I.getType();
+ Type *ValType = ResType->Type::getStructElementType(0);
+
+ auto Res = getOrCreateVRegs(I);
+ unsigned OldValRes = Res[0];
+ unsigned SuccessRes = Res[1];
+ unsigned Addr = getOrCreateVReg(*I.getPointerOperand());
+ unsigned Cmp = getOrCreateVReg(*I.getCompareOperand());
+ unsigned NewVal = getOrCreateVReg(*I.getNewValOperand());
+
+ MIRBuilder.buildAtomicCmpXchgWithSuccess(
+ OldValRes, SuccessRes, Addr, Cmp, NewVal,
+ *MF->getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
+ Flags, DL->getTypeStoreSize(ValType),
+ getMemOpAlignment(I), AAMDNodes(), nullptr,
+ I.getSyncScopeID(), I.getSuccessOrdering(),
+ I.getFailureOrdering()));
+ return true;
+}
+
+bool IRTranslator::translateAtomicRMW(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const AtomicRMWInst &I = cast<AtomicRMWInst>(U);
+
+ auto Flags = I.isVolatile() ? MachineMemOperand::MOVolatile
+ : MachineMemOperand::MONone;
+ Flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ Type *ResType = I.getType();
+
+ unsigned Res = getOrCreateVReg(I);
+ unsigned Addr = getOrCreateVReg(*I.getPointerOperand());
+ unsigned Val = getOrCreateVReg(*I.getValOperand());
+
+ unsigned Opcode = 0;
+ switch (I.getOperation()) {
+ default:
+ llvm_unreachable("Unknown atomicrmw op");
+ return false;
+ case AtomicRMWInst::Xchg:
+ Opcode = TargetOpcode::G_ATOMICRMW_XCHG;
+ break;
+ case AtomicRMWInst::Add:
+ Opcode = TargetOpcode::G_ATOMICRMW_ADD;
+ break;
+ case AtomicRMWInst::Sub:
+ Opcode = TargetOpcode::G_ATOMICRMW_SUB;
+ break;
+ case AtomicRMWInst::And:
+ Opcode = TargetOpcode::G_ATOMICRMW_AND;
+ break;
+ case AtomicRMWInst::Nand:
+ Opcode = TargetOpcode::G_ATOMICRMW_NAND;
+ break;
+ case AtomicRMWInst::Or:
+ Opcode = TargetOpcode::G_ATOMICRMW_OR;
+ break;
+ case AtomicRMWInst::Xor:
+ Opcode = TargetOpcode::G_ATOMICRMW_XOR;
+ break;
+ case AtomicRMWInst::Max:
+ Opcode = TargetOpcode::G_ATOMICRMW_MAX;
+ break;
+ case AtomicRMWInst::Min:
+ Opcode = TargetOpcode::G_ATOMICRMW_MIN;
+ break;
+ case AtomicRMWInst::UMax:
+ Opcode = TargetOpcode::G_ATOMICRMW_UMAX;
+ break;
+ case AtomicRMWInst::UMin:
+ Opcode = TargetOpcode::G_ATOMICRMW_UMIN;
+ break;
+ }
+
+ MIRBuilder.buildAtomicRMW(
+ Opcode, Res, Addr, Val,
+ *MF->getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
+ Flags, DL->getTypeStoreSize(ResType),
+ getMemOpAlignment(I), AAMDNodes(), nullptr,
+ I.getSyncScopeID(), I.getOrdering()));
return true;
}
void IRTranslator::finishPendingPhis() {
- for (std::pair<const PHINode *, MachineInstr *> &Phi : PendingPHIs) {
+ for (auto &Phi : PendingPHIs) {
const PHINode *PI = Phi.first;
- MachineInstrBuilder MIB(*MF, Phi.second);
+ ArrayRef<MachineInstr *> ComponentPHIs = Phi.second;
// All MachineBasicBlocks exist, add them to the PHI. We assume IRTranslator
// won't create extra control flow here, otherwise we need to find the
@@ -1133,17 +1416,27 @@ void IRTranslator::finishPendingPhis() {
continue;
HandledPreds.insert(IRPred);
- unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i));
+ ArrayRef<unsigned> ValRegs = getOrCreateVRegs(*PI->getIncomingValue(i));
for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) {
- assert(Pred->isSuccessor(MIB->getParent()) &&
+ assert(Pred->isSuccessor(ComponentPHIs[0]->getParent()) &&
"incorrect CFG at MachineBasicBlock level");
- MIB.addUse(ValReg);
- MIB.addMBB(Pred);
+ for (unsigned j = 0; j < ValRegs.size(); ++j) {
+ MachineInstrBuilder MIB(*MF, ComponentPHIs[j]);
+ MIB.addUse(ValRegs[j]);
+ MIB.addMBB(Pred);
+ }
}
}
}
}
+bool IRTranslator::valueIsSplit(const Value &V,
+ SmallVectorImpl<uint64_t> *Offsets) {
+ SmallVector<LLT, 4> SplitTys;
+ computeValueLLTs(*DL, *V.getType(), SplitTys, Offsets);
+ return SplitTys.size() > 1;
+}
+
bool IRTranslator::translate(const Instruction &Inst) {
CurBuilder.setDebugLoc(Inst.getDebugLoc());
switch(Inst.getOpcode()) {
@@ -1162,9 +1455,15 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
EntryBuilder.buildFConstant(Reg, *CF);
else if (isa<UndefValue>(C))
EntryBuilder.buildUndef(Reg);
- else if (isa<ConstantPointerNull>(C))
- EntryBuilder.buildConstant(Reg, 0);
- else if (auto GV = dyn_cast<GlobalValue>(&C))
+ else if (isa<ConstantPointerNull>(C)) {
+    // As we are trying to build a constant value of 0 into a pointer,
+    // insert a cast so the types line up.
+ unsigned NullSize = DL->getTypeSizeInBits(C.getType());
+ auto *ZeroTy = Type::getIntNTy(C.getContext(), NullSize);
+ auto *ZeroVal = ConstantInt::get(ZeroTy, 0);
+ unsigned ZeroReg = getOrCreateVReg(*ZeroVal);
+ EntryBuilder.buildCast(Reg, ZeroReg);
+ } else if (auto GV = dyn_cast<GlobalValue>(&C))
EntryBuilder.buildGlobalValue(Reg, GV);
else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) {
if (!CAZ->getType()->isVectorTy())
@@ -1196,23 +1495,6 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
default:
return false;
}
- } else if (auto CS = dyn_cast<ConstantStruct>(&C)) {
- // Return the element if it is a single element ConstantStruct.
- if (CS->getNumOperands() == 1) {
- unsigned EltReg = getOrCreateVReg(*CS->getOperand(0));
- EntryBuilder.buildCast(Reg, EltReg);
- return true;
- }
- SmallVector<unsigned, 4> Ops;
- SmallVector<uint64_t, 4> Indices;
- uint64_t Offset = 0;
- for (unsigned i = 0; i < CS->getNumOperands(); ++i) {
- unsigned OpReg = getOrCreateVReg(*CS->getOperand(i));
- Ops.push_back(OpReg);
- Indices.push_back(Offset);
- Offset += MRI->getType(OpReg).getSizeInBits();
- }
- EntryBuilder.buildSequence(Reg, Ops, Indices);
} else if (auto CV = dyn_cast<ConstantVector>(&C)) {
if (CV->getNumOperands() == 1)
return translate(*CV->getOperand(0), Reg);
@@ -1231,7 +1513,7 @@ void IRTranslator::finalizeFunction() {
// Release the memory used by the different maps we
// needed during the translation.
PendingPHIs.clear();
- ValToVReg.clear();
+ VMap.reset();
FrameIndices.clear();
MachinePreds.clear();
// MachineIRBuilder::DebugLoc can outlive the DILocation it holds. Clear it
@@ -1291,8 +1573,22 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
for (const Argument &Arg: F.args()) {
if (DL->getTypeStoreSize(Arg.getType()) == 0)
continue; // Don't handle zero sized types.
- VRegArgs.push_back(getOrCreateVReg(Arg));
+ VRegArgs.push_back(
+ MRI->createGenericVirtualRegister(getLLTForType(*Arg.getType(), *DL)));
}
+
+ // We don't currently support translating swifterror or swiftself functions.
+ for (auto &Arg : F.args()) {
+ if (Arg.hasSwiftErrorAttr() || Arg.hasSwiftSelfAttr()) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ F.getSubprogram(), &F.getEntryBlock());
+ R << "unable to lower arguments due to swifterror/swiftself: "
+ << ore::NV("Prototype", F.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
+ }
+
if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) {
OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
F.getSubprogram(), &F.getEntryBlock());
@@ -1301,14 +1597,28 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
return false;
}
+ auto ArgIt = F.arg_begin();
+ for (auto &VArg : VRegArgs) {
+ // If the argument is an unsplit scalar then don't use unpackRegs to avoid
+ // creating redundant copies.
+ if (!valueIsSplit(*ArgIt, VMap.getOffsets(*ArgIt))) {
+ auto &VRegs = *VMap.getVRegs(cast<Value>(*ArgIt));
+ assert(VRegs.empty() && "VRegs already populated?");
+ VRegs.push_back(VArg);
+ } else {
+ unpackRegs(*ArgIt, VArg, EntryBuilder);
+ }
+ ArgIt++;
+ }
+
// And translate the function!
- for (const BasicBlock &BB: F) {
+ for (const BasicBlock &BB : F) {
MachineBasicBlock &MBB = getMBB(BB);
// Set the insertion point of all the following translations to
// the end of this basic block.
CurBuilder.setMBB(MBB);
- for (const Instruction &Inst: BB) {
+ for (const Instruction &Inst : BB) {
if (translate(Inst))
continue;
@@ -1358,5 +1668,9 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
assert(&MF->front() == &NewEntryBB &&
"New entry wasn't next in the list of basic block!");
+ // Initialize stack protector information.
+ StackProtector &SP = getAnalysis<StackProtector>();
+ SP.copyToMachineFrameInfo(MF->getFrameInfo());
+
return false;
}
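
Several of the IRTranslator hunks above share one idea: computeValueLLTs breaks an aggregate Value into leaf pieces with byte offsets, and the load/store loops, packRegs and unpackRegs then iterate those pieces. A standalone sketch of that decomposition in plain C++, with a toy Ty in place of llvm::Type and a packed layout assumed (the real offsets come from DataLayout's StructLayout):

#include <cstdio>
#include <utility>
#include <vector>

// Toy type: a leaf scalar of ScalarBytes bytes, or an aggregate of Members.
struct Ty {
  unsigned ScalarBytes;    // nonzero => leaf scalar
  std::vector<Ty> Members; // used when ScalarBytes == 0
};

// Recursively flatten T into (size, offset) leaves, the same recursion shape
// as computeValueLLTs. Returns the flattened size of T.
static unsigned flatten(const Ty &T, unsigned Start,
                        std::vector<std::pair<unsigned, unsigned>> &Out) {
  if (T.ScalarBytes != 0) {
    Out.push_back({T.ScalarBytes, Start});
    return T.ScalarBytes;
  }
  unsigned Size = 0;
  for (const Ty &M : T.Members)
    Size += flatten(M, Start + Size, Out);
  return Size;
}

int main() {
  // { i32, { i8, i64 } } packed: leaves land at byte offsets 0, 4 and 5.
  Ty I8{1, {}}, I32{4, {}}, I64{8, {}};
  Ty Inner{0, {I8, I64}};
  Ty S{0, {I32, Inner}};
  std::vector<std::pair<unsigned, unsigned>> Pieces;
  flatten(S, 0, Pieces);
  for (const auto &P : Pieces)
    std::printf("size=%u offset=%u\n", P.first, P.second);
  return 0;
}
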
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 422cc2219aa8..c83c791327e4 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -12,7 +12,6 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
@@ -57,23 +56,17 @@ InstructionSelect::InstructionSelect() : MachineFunctionPass(ID) {
void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
-
- // No matter what happens, whether we successfully select the function or not,
- // nothing is going to use the vreg types after us. Make sure they disappear.
- auto ClearVRegTypesOnReturn =
- make_scope_exit([&]() { MRI.getVRegToType().clear(); });
-
// If the ISel pipeline failed, do not bother running that pass.
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel))
return false;
- DEBUG(dbgs() << "Selecting function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Selecting function: " << MF.getName() << '\n');
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();
@@ -85,23 +78,18 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
// FIXME: There are many other MF/MFI fields we need to initialize.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
#ifndef NDEBUG
// Check that our input is fully legal: we require the function to have the
// Legalized property, so it should be.
- // FIXME: This should be in the MachineVerifier, but it can't use the
- // LegalizerInfo as it's currently in the separate GlobalISel library.
- // The RegBankSelected property is already checked in the verifier. Note
- // that it has the same layering problem, but we only use inline methods so
- // end up not needing to link against the GlobalISel library.
- if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo())
- for (MachineBasicBlock &MBB : MF)
- for (MachineInstr &MI : MBB)
- if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
- reportGISelFailure(MF, TPC, MORE, "gisel-select",
- "instruction is not legal", MI);
- return false;
- }
-
+ // FIXME: This should be in the MachineVerifier, as the RegBankSelected
+ // property check already is.
+ if (!DisableGISelLegalityCheck)
+ if (const MachineInstr *MI = machineFunctionIsIllegal(MF)) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "instruction is not legal", *MI);
+ return false;
+ }
#endif
// FIXME: We could introduce new blocks and will need to fix the outer loop.
// Until then, keep track of the number of blocks to assert that we don't.
@@ -129,12 +117,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
else
--MII;
- DEBUG(dbgs() << "Selecting: \n " << MI);
+ LLVM_DEBUG(dbgs() << "Selecting: \n " << MI);
// We could have folded this instruction away already, making it dead.
// If so, erase it.
if (isTriviallyDead(MI, MRI)) {
- DEBUG(dbgs() << "Is dead; erasing.\n");
+ LLVM_DEBUG(dbgs() << "Is dead; erasing.\n");
MI.eraseFromParentAndMarkDBGValuesForRemoval();
continue;
}
@@ -147,7 +135,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
}
// Dump the range of instructions that MI expanded into.
- DEBUG({
+ LLVM_DEBUG({
auto InsertedBegin = ReachedBegin ? MBB->begin() : std::next(MII);
dbgs() << "Into:\n";
for (auto &InsertedMI : make_range(InsertedBegin, AfterIt))
@@ -159,30 +147,63 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+
+ // Try to find redundant copies b/w vregs of the same register class.
+ bool ReachedBegin = false;
+ for (auto MII = std::prev(MBB.end()), Begin = MBB.begin(); !ReachedBegin;) {
+      // Visit this instruction.
+ MachineInstr &MI = *MII;
+
+ // And have our iterator point to the next instruction, if there is one.
+ if (MII == Begin)
+ ReachedBegin = true;
+ else
+ --MII;
+ if (MI.getOpcode() != TargetOpcode::COPY)
+ continue;
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ auto SrcRC = MRI.getRegClass(SrcReg);
+ auto DstRC = MRI.getRegClass(DstReg);
+ if (SrcRC == DstRC) {
+ MRI.replaceRegWith(DstReg, SrcReg);
+ MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ }
+ }
+ }
+ }
+
// Now that selection is complete, there are no more generic vregs. Verify
// that the size of the now-constrained vreg is unchanged and that it has a
// register class.
- for (auto &VRegToType : MRI.getVRegToType()) {
- unsigned VReg = VRegToType.first;
- auto *RC = MRI.getRegClassOrNull(VReg);
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(I);
+
MachineInstr *MI = nullptr;
if (!MRI.def_empty(VReg))
MI = &*MRI.def_instr_begin(VReg);
else if (!MRI.use_empty(VReg))
MI = &*MRI.use_instr_begin(VReg);
+ if (!MI)
+ continue;
- if (MI && !RC) {
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(VReg);
+ if (!RC) {
reportGISelFailure(MF, TPC, MORE, "gisel-select",
"VReg has no regclass after selection", *MI);
return false;
- } else if (!RC)
- continue;
+ }
- if (VRegToType.second.isValid() &&
- VRegToType.second.getSizeInBits() > TRI.getRegSizeInBits(*RC)) {
- reportGISelFailure(MF, TPC, MORE, "gisel-select",
- "VReg has explicit size different from class size",
- *MI);
+ const LLT Ty = MRI.getType(VReg);
+ if (Ty.isValid() && Ty.getSizeInBits() > TRI.getRegSizeInBits(*RC)) {
+ reportGISelFailure(
+ MF, TPC, MORE, "gisel-select",
+ "VReg's low-level type and register class have different sizes", *MI);
return false;
}
}
@@ -199,6 +220,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
auto &TLI = *MF.getSubtarget().getTargetLowering();
TLI.finalizeLowering(MF);
+ LLVM_DEBUG({
+ dbgs() << "Rules covered by selecting function: " << MF.getName() << ":";
+ for (auto RuleID : CoverageInfo.covered())
+ dbgs() << " id" << RuleID;
+ dbgs() << "\n\n";
+ });
CoverageInfo.emit(CoveragePrefix,
MF.getSubtarget()
.getTargetLowering()
@@ -206,6 +233,11 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
.getTarget()
.getBackendName());
+ // If we successfully selected the function nothing is going to use the vreg
+ // types after us (otherwise MIRPrinter would need them). Make sure the types
+ // disappear.
+ MRI.clearVirtRegTypes();
+
// FIXME: Should we accurately track changes?
return true;
}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 88669bd68c00..5e77fcbb0ed9 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -46,50 +46,6 @@ bool InstructionSelector::constrainOperandRegToRegClass(
constrainRegToClass(MRI, TII, RBI, I, I.getOperand(OpIdx).getReg(), RC);
}
-bool InstructionSelector::constrainSelectedInstRegOperands(
- MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) const {
- MachineBasicBlock &MBB = *I.getParent();
- MachineFunction &MF = *MBB.getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- for (unsigned OpI = 0, OpE = I.getNumExplicitOperands(); OpI != OpE; ++OpI) {
- MachineOperand &MO = I.getOperand(OpI);
-
- // There's nothing to be done on non-register operands.
- if (!MO.isReg())
- continue;
-
- DEBUG(dbgs() << "Converting operand: " << MO << '\n');
- assert(MO.isReg() && "Unsupported non-reg operand");
-
- unsigned Reg = MO.getReg();
- // Physical registers don't need to be constrained.
- if (TRI.isPhysicalRegister(Reg))
- continue;
-
- // Register operands with a value of 0 (e.g. predicate operands) don't need
- // to be constrained.
- if (Reg == 0)
- continue;
-
- // If the operand is a vreg, we should constrain its regclass, and only
- // insert COPYs if that's impossible.
- // constrainOperandRegClass does that for us.
- MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(),
- Reg, OpI));
-
- // Tie uses to defs as indicated in MCInstrDesc if this hasn't already been
- // done.
- if (MO.isUse()) {
- int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO);
- if (DefIdx != -1 && !I.isRegTiedToUseOperand(DefIdx))
- I.tieOperands(DefIdx, OpI);
- }
- }
- return true;
-}
-
bool InstructionSelector::isOperandImmEqual(
const MachineOperand &MO, int64_t Value,
const MachineRegisterInfo &MRI) const {
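
The block deleted above is the generic operand-constraining loop; this change only removes the copy that lived in this file. A hedged sketch of how a target selector typically uses it after rewriting an instruction to a target opcode — the function name and opcode parameter are hypothetical, and the helper is assumed to remain reachable (here via GlobalISel/Utils.h) after this removal:

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Hypothetical final step of a target's InstructionSelector::select().
static bool finishSelection(MachineInstr &I, unsigned TargetOpc,
                            const TargetInstrInfo &TII,
                            const TargetRegisterInfo &TRI,
                            const RegisterBankInfo &RBI) {
  I.setDesc(TII.get(TargetOpc)); // Now a target instruction, not a generic one.
  // Constrain every explicit vreg operand to the class demanded by the new
  // MCInstrDesc, inserting COPYs only where a direct constraint is impossible.
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}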
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
new file mode 100644
index 000000000000..344f573a67f5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -0,0 +1,101 @@
+//===- lib/CodeGen/GlobalISel/LegalityPredicates.cpp - Predicates ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A library of predicate factories to use for LegalityPredicate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+using namespace llvm;
+
+LegalityPredicate LegalityPredicates::typeIs(unsigned TypeIdx, LLT Type) {
+ return
+ [=](const LegalityQuery &Query) { return Query.Types[TypeIdx] == Type; };
+}
+
+LegalityPredicate
+LegalityPredicates::typeInSet(unsigned TypeIdx,
+ std::initializer_list<LLT> TypesInit) {
+ SmallVector<LLT, 4> Types = TypesInit;
+ return [=](const LegalityQuery &Query) {
+ return std::find(Types.begin(), Types.end(), Query.Types[TypeIdx]) != Types.end();
+ };
+}
+
+LegalityPredicate LegalityPredicates::typePairInSet(
+ unsigned TypeIdx0, unsigned TypeIdx1,
+ std::initializer_list<std::pair<LLT, LLT>> TypesInit) {
+ SmallVector<std::pair<LLT, LLT>, 4> Types = TypesInit;
+ return [=](const LegalityQuery &Query) {
+ std::pair<LLT, LLT> Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1]};
+ return std::find(Types.begin(), Types.end(), Match) != Types.end();
+ };
+}
+
+LegalityPredicate LegalityPredicates::typePairAndMemSizeInSet(
+ unsigned TypeIdx0, unsigned TypeIdx1, unsigned MMOIdx,
+ std::initializer_list<TypePairAndMemSize> TypesAndMemSizeInit) {
+ SmallVector<TypePairAndMemSize, 4> TypesAndMemSize = TypesAndMemSizeInit;
+ return [=](const LegalityQuery &Query) {
+ TypePairAndMemSize Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1],
+ Query.MMODescrs[MMOIdx].Size};
+ return std::find(TypesAndMemSize.begin(), TypesAndMemSize.end(), Match) !=
+ TypesAndMemSize.end();
+ };
+}
+
+LegalityPredicate LegalityPredicates::isScalar(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].isScalar();
+ };
+}
+
+LegalityPredicate LegalityPredicates::narrowerThan(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT &QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() && QueryTy.getSizeInBits() < Size;
+ };
+}
+
+LegalityPredicate LegalityPredicates::widerThan(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT &QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() && QueryTy.getSizeInBits() > Size;
+ };
+}
+
+LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT &QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() && !isPowerOf2_32(QueryTy.getSizeInBits());
+ };
+}
+
+LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) {
+ return [=](const LegalityQuery &Query) {
+ return !isPowerOf2_32(Query.MMODescrs[MMOIdx].Size /* In Bytes */);
+ };
+}
+
+LegalityPredicate LegalityPredicates::numElementsNotPow2(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT &QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && !isPowerOf2_32(QueryTy.getNumElements());
+ };
+}
+
+LegalityPredicate LegalityPredicates::atomicOrderingAtLeastOrStrongerThan(
+ unsigned MMOIdx, AtomicOrdering Ordering) {
+ return [=](const LegalityQuery &Query) {
+ return isAtLeastOrStrongerThan(Query.MMODescrs[MMOIdx].Ordering, Ordering);
+ };
+}
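
These factories only capture their arguments and return std::function predicates, so they can be exercised directly against a LegalityQuery. A small sketch, assuming LegalityQuery can be built from an opcode plus a type array as the queries constructed elsewhere in this patch suggest; the function name is illustrative only:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

static bool demoPredicates() {
  const LLT S16[] = {LLT::scalar(16)};           // keep the type array alive
  LegalityQuery Query{TargetOpcode::G_MUL, S16}; // query: G_MUL on s16
  LegalityPredicate InSet =
      LegalityPredicates::typeInSet(0, {LLT::scalar(32), LLT::scalar(64)});
  LegalityPredicate Narrow = LegalityPredicates::narrowerThan(0, 32);
  // s16 is not in the {s32, s64} set and is narrower than 32 bits.
  return !InSet(Query) && Narrow(Query);
}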
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
new file mode 100644
index 000000000000..a29b32ecdc03
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -0,0 +1,51 @@
+//===- lib/CodeGen/GlobalISel/LegalizeMutations.cpp - Mutations -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A library of mutation factories to use for LegalizeMutation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+using namespace llvm;
+
+LegalizeMutation LegalizeMutations::changeTo(unsigned TypeIdx, LLT Ty) {
+ return
+ [=](const LegalityQuery &Query) { return std::make_pair(TypeIdx, Ty); };
+}
+
+LegalizeMutation LegalizeMutations::changeTo(unsigned TypeIdx,
+ unsigned FromTypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ return std::make_pair(TypeIdx, Query.Types[FromTypeIdx]);
+ };
+}
+
+LegalizeMutation LegalizeMutations::widenScalarToNextPow2(unsigned TypeIdx,
+ unsigned Min) {
+ return [=](const LegalityQuery &Query) {
+ unsigned NewSizeInBits =
+ 1 << Log2_32_Ceil(Query.Types[TypeIdx].getSizeInBits());
+ if (NewSizeInBits < Min)
+ NewSizeInBits = Min;
+ return std::make_pair(TypeIdx, LLT::scalar(NewSizeInBits));
+ };
+}
+
+LegalizeMutation LegalizeMutations::moreElementsToNextPow2(unsigned TypeIdx,
+ unsigned Min) {
+ return [=](const LegalityQuery &Query) {
+ const LLT &VecTy = Query.Types[TypeIdx];
+ unsigned NewNumElements = 1 << Log2_32_Ceil(VecTy.getNumElements());
+ if (NewNumElements < Min)
+ NewNumElements = Min;
+ return std::make_pair(
+ TypeIdx, LLT::vector(NewNumElements, VecTy.getScalarSizeInBits()));
+ };
+}
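
Like the predicates, a LegalizeMutation is just a std::function from a query to a (type index, new type) pair, so its arithmetic can be checked in isolation. A sketch under the same assumption that LegalityQuery can be built from an opcode and a type array; the function name is illustrative:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include <utility>
using namespace llvm;

static std::pair<unsigned, LLT> widenS24() {
  const LLT S24[] = {LLT::scalar(24)};
  LegalityQuery Query{TargetOpcode::G_AND, S24};
  // 24 bits rounds up to the next power of two, so Min=8 has no effect here
  // and the expected result is {0, LLT::scalar(32)}.
  return LegalizeMutations::widenScalarToNextPow2(0, /*Min=*/8)(Query);
}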
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
index f09b0d9f11e7..9a2aac998a84 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -47,6 +47,7 @@ Legalizer::Legalizer() : MachineFunctionPass(ID) {
void Legalizer::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -72,7 +73,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel))
return false;
- DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n');
init(MF);
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
@@ -112,7 +113,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
else
InstList.insert(MI);
}
- DEBUG(dbgs() << ".. .. New MI: " << *MI;);
+ LLVM_DEBUG(dbgs() << ".. .. New MI: " << *MI;);
});
const LegalizerInfo &LInfo(Helper.getLegalizerInfo());
LegalizationArtifactCombiner ArtCombiner(Helper.MIRBuilder, MF.getRegInfo(), LInfo);
@@ -127,7 +128,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *InstList.pop_back_val();
assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode");
if (isTriviallyDead(MI, MRI)) {
- DEBUG(dbgs() << MI << "Is dead; erasing.\n");
+ LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n");
MI.eraseFromParentAndMarkDBGValuesForRemoval();
continue;
}
@@ -148,7 +149,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *ArtifactList.pop_back_val();
assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode");
if (isTriviallyDead(MI, MRI)) {
- DEBUG(dbgs() << MI << "Is dead; erasing.\n");
+ LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n");
RemoveDeadInstFromLists(&MI);
MI.eraseFromParentAndMarkDBGValuesForRemoval();
continue;
@@ -156,7 +157,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineInstr *, 4> DeadInstructions;
if (ArtCombiner.tryCombineInstruction(MI, DeadInstructions)) {
for (auto *DeadMI : DeadInstructions) {
- DEBUG(dbgs() << ".. Erasing Dead Instruction " << *DeadMI);
+ LLVM_DEBUG(dbgs() << ".. Erasing Dead Instruction " << *DeadMI);
RemoveDeadInstFromLists(DeadMI);
DeadMI->eraseFromParentAndMarkDBGValuesForRemoval();
}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 6bebe180fefd..87086af121b7 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -26,6 +26,7 @@
#define DEBUG_TYPE "legalizer"
using namespace llvm;
+using namespace LegalizeActions;
LegalizerHelper::LegalizerHelper(MachineFunction &MF)
: MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) {
@@ -34,34 +35,34 @@ LegalizerHelper::LegalizerHelper(MachineFunction &MF)
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
- DEBUG(dbgs() << "Legalizing: "; MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Legalizing: "; MI.print(dbgs()));
- auto Action = LI.getAction(MI, MRI);
- switch (std::get<0>(Action)) {
- case LegalizerInfo::Legal:
- DEBUG(dbgs() << ".. Already legal\n");
+ auto Step = LI.getAction(MI, MRI);
+ switch (Step.Action) {
+ case Legal:
+ LLVM_DEBUG(dbgs() << ".. Already legal\n");
return AlreadyLegal;
- case LegalizerInfo::Libcall:
- DEBUG(dbgs() << ".. Convert to libcall\n");
+ case Libcall:
+ LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
return libcall(MI);
- case LegalizerInfo::NarrowScalar:
- DEBUG(dbgs() << ".. Narrow scalar\n");
- return narrowScalar(MI, std::get<1>(Action), std::get<2>(Action));
- case LegalizerInfo::WidenScalar:
- DEBUG(dbgs() << ".. Widen scalar\n");
- return widenScalar(MI, std::get<1>(Action), std::get<2>(Action));
- case LegalizerInfo::Lower:
- DEBUG(dbgs() << ".. Lower\n");
- return lower(MI, std::get<1>(Action), std::get<2>(Action));
- case LegalizerInfo::FewerElements:
- DEBUG(dbgs() << ".. Reduce number of elements\n");
- return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action));
- case LegalizerInfo::Custom:
- DEBUG(dbgs() << ".. Custom legalization\n");
+ case NarrowScalar:
+ LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
+ return narrowScalar(MI, Step.TypeIdx, Step.NewType);
+ case WidenScalar:
+ LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
+ return widenScalar(MI, Step.TypeIdx, Step.NewType);
+ case Lower:
+ LLVM_DEBUG(dbgs() << ".. Lower\n");
+ return lower(MI, Step.TypeIdx, Step.NewType);
+ case FewerElements:
+ LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
+ return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
+ case Custom:
+ LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized
: UnableToLegalize;
default:
- DEBUG(dbgs() << ".. Unable to legalize\n");
+ LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
return UnableToLegalize;
}
}
@@ -103,6 +104,9 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32;
case TargetOpcode::G_FPOW:
return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32;
+ case TargetOpcode::G_FMA:
+ assert((Size == 32 || Size == 64) && "Unsupported size");
+ return Size == 64 ? RTLIB::FMA_F64 : RTLIB::FMA_F32;
}
llvm_unreachable("Unknown libcall function");
}
@@ -123,13 +127,47 @@ llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
return LegalizerHelper::Legalized;
}
+// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
Type *OpType) {
auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
+
+ SmallVector<CallLowering::ArgInfo, 3> Args;
+ for (unsigned i = 1; i < MI.getNumOperands(); i++)
+ Args.push_back({MI.getOperand(i).getReg(), OpType});
return createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), OpType},
- {{MI.getOperand(1).getReg(), OpType},
- {MI.getOperand(2).getReg(), OpType}});
+ Args);
+}
+
+static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
+ Type *FromType) {
+ auto ToMVT = MVT::getVT(ToType);
+ auto FromMVT = MVT::getVT(FromType);
+
+ switch (Opcode) {
+ case TargetOpcode::G_FPEXT:
+ return RTLIB::getFPEXT(FromMVT, ToMVT);
+ case TargetOpcode::G_FPTRUNC:
+ return RTLIB::getFPROUND(FromMVT, ToMVT);
+ case TargetOpcode::G_FPTOSI:
+ return RTLIB::getFPTOSINT(FromMVT, ToMVT);
+ case TargetOpcode::G_FPTOUI:
+ return RTLIB::getFPTOUINT(FromMVT, ToMVT);
+ case TargetOpcode::G_SITOFP:
+ return RTLIB::getSINTTOFP(FromMVT, ToMVT);
+ case TargetOpcode::G_UITOFP:
+ return RTLIB::getUINTTOFP(FromMVT, ToMVT);
+ }
+ llvm_unreachable("Unsupported libcall function");
+}
+
+static LegalizerHelper::LegalizeResult
+conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
+ Type *FromType) {
+ RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
+ return createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ToType},
+ {{MI.getOperand(1).getReg(), FromType}});
}
LegalizerHelper::LegalizeResult
@@ -157,6 +195,7 @@ LegalizerHelper::libcall(MachineInstr &MI) {
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FMA:
case TargetOpcode::G_FPOW:
case TargetOpcode::G_FREM: {
Type *HLTy = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx);
@@ -165,6 +204,59 @@ LegalizerHelper::libcall(MachineInstr &MI) {
return Status;
break;
}
+ case TargetOpcode::G_FPEXT: {
+ // FIXME: Support other floating point types (half, fp128 etc)
+ unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (ToSize != 64 || FromSize != 32)
+ return UnableToLegalize;
+ LegalizeResult Status = conversionLibcall(
+ MI, MIRBuilder, Type::getDoubleTy(Ctx), Type::getFloatTy(Ctx));
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_FPTRUNC: {
+ // FIXME: Support other floating point types (half, fp128 etc)
+ unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (ToSize != 32 || FromSize != 64)
+ return UnableToLegalize;
+ LegalizeResult Status = conversionLibcall(
+ MI, MIRBuilder, Type::getFloatTy(Ctx), Type::getDoubleTy(Ctx));
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI: {
+ // FIXME: Support other types
+ unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (ToSize != 32 || (FromSize != 32 && FromSize != 64))
+ return UnableToLegalize;
+ LegalizeResult Status = conversionLibcall(
+ MI, MIRBuilder, Type::getInt32Ty(Ctx),
+ FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP: {
+ // FIXME: Support other types
+ unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (FromSize != 32 || (ToSize != 32 && ToSize != 64))
+ return UnableToLegalize;
+ LegalizeResult Status = conversionLibcall(
+ MI, MIRBuilder,
+ ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
+ Type::getInt32Ty(Ctx));
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
}
MI.eraseFromParent();
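
The new conversionLibcall path boils down to picking an RTLIB entry from the source and destination IR types. A hedged sketch of the only G_FPEXT case accepted above (s32 to s64), using the same RTLIB/MVT helpers the hunk calls; on most targets the resulting libcall is the f32-to-f64 extension routine, but the exact symbol is target-dependent and the function name here is invented:

#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static RTLIB::Libcall fpextF32ToF64(LLVMContext &Ctx) {
  MVT FromMVT = MVT::getVT(Type::getFloatTy(Ctx));  // f32
  MVT ToMVT = MVT::getVT(Type::getDoubleTy(Ctx));   // f64
  return RTLIB::getFPEXT(FromMVT, ToMVT);           // f32 -> f64 extension entry
}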
@@ -180,8 +272,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MIRBuilder.setInstr(MI);
- int64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- int64_t NarrowSize = NarrowTy.getSizeInBits();
+ uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ uint64_t NarrowSize = NarrowTy.getSizeInBits();
switch (MI.getOpcode()) {
default:
@@ -194,11 +286,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
int NumParts = SizeOp0 / NarrowSize;
SmallVector<unsigned, 2> DstRegs;
- for (int i = 0; i < NumParts; ++i) {
- unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy);
- MIRBuilder.buildUndef(Dst);
- DstRegs.push_back(Dst);
- }
+ for (int i = 0; i < NumParts; ++i)
+ DstRegs.push_back(
+ MIRBuilder.buildUndef(NarrowTy)->getOperand(0).getReg());
MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
MI.eraseFromParent();
return Legalized;
@@ -249,8 +339,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
unsigned OpReg = MI.getOperand(0).getReg();
- int64_t OpStart = MI.getOperand(2).getImm();
- int64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+ uint64_t OpStart = MI.getOperand(2).getImm();
+ uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
for (int i = 0; i < NumParts; ++i) {
unsigned SrcStart = i * NarrowSize;
@@ -265,7 +355,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
// OpSegStart is where this destination segment would start in OpReg if it
// extended infinitely in both directions.
- int64_t ExtractOffset, SegSize;
+ int64_t ExtractOffset;
+ uint64_t SegSize;
if (OpStart < SrcStart) {
ExtractOffset = 0;
SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
@@ -301,8 +392,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
unsigned OpReg = MI.getOperand(2).getReg();
- int64_t OpStart = MI.getOperand(3).getImm();
- int64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+ uint64_t OpStart = MI.getOperand(3).getImm();
+ uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
for (int i = 0; i < NumParts; ++i) {
unsigned DstStart = i * NarrowSize;
@@ -319,7 +410,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
// OpSegStart is where this destination segment would start in OpReg if it
// extended infinitely in both directions.
- int64_t ExtractOffset, InsertOffset, SegSize;
+ int64_t ExtractOffset, InsertOffset;
+ uint64_t SegSize;
if (OpStart < DstStart) {
InsertOffset = 0;
ExtractOffset = DstStart - OpStart;
@@ -353,6 +445,14 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
// NarrowSize.
if (SizeOp0 % NarrowSize != 0)
return UnableToLegalize;
+
+ const auto &MMO = **MI.memoperands_begin();
+ // This implementation doesn't work for atomics. Give up instead of doing
+ // something invalid.
+ if (MMO.getOrdering() != AtomicOrdering::NotAtomic ||
+ MMO.getFailureOrdering() != AtomicOrdering::NotAtomic)
+ return UnableToLegalize;
+
int NumParts = SizeOp0 / NarrowSize;
LLT OffsetTy = LLT::scalar(
MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
@@ -363,12 +463,16 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
unsigned SrcReg = 0;
unsigned Adjustment = i * NarrowSize / 8;
+ MachineMemOperand *SplitMMO = MIRBuilder.getMF().getMachineMemOperand(
+ MMO.getPointerInfo().getWithOffset(Adjustment), MMO.getFlags(),
+ NarrowSize / 8, i == 0 ? MMO.getAlignment() : NarrowSize / 8,
+ MMO.getAAInfo(), MMO.getRanges(), MMO.getSyncScopeID(),
+ MMO.getOrdering(), MMO.getFailureOrdering());
+
MIRBuilder.materializeGEP(SrcReg, MI.getOperand(1).getReg(), OffsetTy,
Adjustment);
- // TODO: This is conservatively correct, but we probably want to split the
- // memory operands in the future.
- MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin());
+ MIRBuilder.buildLoad(DstReg, SrcReg, *SplitMMO);
DstRegs.push_back(DstReg);
}
@@ -382,6 +486,14 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
// NarrowSize.
if (SizeOp0 % NarrowSize != 0)
return UnableToLegalize;
+
+ const auto &MMO = **MI.memoperands_begin();
+ // This implementation doesn't work for atomics. Give up instead of doing
+ // something invalid.
+ if (MMO.getOrdering() != AtomicOrdering::NotAtomic ||
+ MMO.getFailureOrdering() != AtomicOrdering::NotAtomic)
+ return UnableToLegalize;
+
int NumParts = SizeOp0 / NarrowSize;
LLT OffsetTy = LLT::scalar(
MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
@@ -393,12 +505,16 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
unsigned DstReg = 0;
unsigned Adjustment = i * NarrowSize / 8;
+ MachineMemOperand *SplitMMO = MIRBuilder.getMF().getMachineMemOperand(
+ MMO.getPointerInfo().getWithOffset(Adjustment), MMO.getFlags(),
+ NarrowSize / 8, i == 0 ? MMO.getAlignment() : NarrowSize / 8,
+ MMO.getAAInfo(), MMO.getRanges(), MMO.getSyncScopeID(),
+ MMO.getOrdering(), MMO.getFailureOrdering());
+
MIRBuilder.materializeGEP(DstReg, MI.getOperand(1).getReg(), OffsetTy,
Adjustment);
- // TODO: This is conservatively correct, but we probably want to split the
- // memory operands in the future.
- MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin());
+ MIRBuilder.buildStore(SrcRegs[i], DstReg, *SplitMMO);
}
MI.eraseFromParent();
return Legalized;
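
Both split loops above build a fresh MachineMemOperand per part, and the arithmetic is easy to lose in the noise. Isolated as a sketch (same getMachineMemOperand call as the hunks, helper name invented): part I covers NarrowSize/8 bytes at byte offset I*NarrowSize/8, and only part 0 keeps the original alignment.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

static MachineMemOperand *splitMMOForPart(MachineFunction &MF,
                                          const MachineMemOperand &MMO,
                                          unsigned NarrowSize, unsigned I) {
  unsigned Adjustment = I * NarrowSize / 8; // byte offset of this part
  return MF.getMachineMemOperand(
      MMO.getPointerInfo().getWithOffset(Adjustment), MMO.getFlags(),
      NarrowSize / 8, I == 0 ? MMO.getAlignment() : NarrowSize / 8,
      MMO.getAAInfo(), MMO.getRanges(), MMO.getSyncScopeID(),
      MMO.getOrdering(), MMO.getFailureOrdering());
}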
@@ -475,6 +591,22 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
}
}
+void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
+ unsigned OpIdx, unsigned ExtOpcode) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ auto ExtB = MIRBuilder.buildInstr(ExtOpcode, WideTy, MO.getReg());
+ MO.setReg(ExtB->getOperand(0).getReg());
+}
+
+void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
+ unsigned OpIdx, unsigned TruncOpcode) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ MIRBuilder.buildInstr(TruncOpcode, MO.getReg(), DstExt);
+ MO.setReg(DstExt);
+}
+
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
MIRBuilder.setInstr(MI);
@@ -482,303 +614,201 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
+
case TargetOpcode::G_ADD:
case TargetOpcode::G_AND:
case TargetOpcode::G_MUL:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
case TargetOpcode::G_SUB:
- case TargetOpcode::G_SHL: {
// Perform operation at larger width (any extension is fine here, high bits
// don't affect the result) and then truncate the result back to the
// original type.
- unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy);
- unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(1).getReg());
- MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(2).getReg());
-
- unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildInstr(MI.getOpcode())
- .addDef(DstExt)
- .addUse(Src1Ext)
- .addUse(Src2Ext);
-
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
- MI.eraseFromParent();
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
+
+ case TargetOpcode::G_SHL:
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ // The "number of bits to shift" operand must preserve its value as an
+ // unsigned integer:
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
+ return Legalized;
+
case TargetOpcode::G_SDIV:
- case TargetOpcode::G_UDIV:
case TargetOpcode::G_SREM:
- case TargetOpcode::G_UREM:
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
+ return Legalized;
+
case TargetOpcode::G_ASHR:
- case TargetOpcode::G_LSHR: {
- unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV ||
- MI.getOpcode() == TargetOpcode::G_SREM ||
- MI.getOpcode() == TargetOpcode::G_ASHR
- ? TargetOpcode::G_SEXT
- : TargetOpcode::G_ZEXT;
-
- unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse(
- MI.getOperand(1).getReg());
-
- unsigned RHSExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildInstr(ExtOp).addDef(RHSExt).addUse(
- MI.getOperand(2).getReg());
-
- unsigned ResExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildInstr(MI.getOpcode())
- .addDef(ResExt)
- .addUse(LHSExt)
- .addUse(RHSExt);
-
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), ResExt);
- MI.eraseFromParent();
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
+ // The "number of bits to shift" operand must preserve its value as an
+ // unsigned integer:
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
- case TargetOpcode::G_SELECT: {
+
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_LSHR:
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
+ return Legalized;
+
+ case TargetOpcode::G_SELECT:
if (TypeIdx != 0)
return UnableToLegalize;
-
// Perform operation at larger width (any extension is fine here, high bits
// don't affect the result) and then truncate the result back to the
// original type.
- unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy);
- unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg());
- MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg());
-
- unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildInstr(TargetOpcode::G_SELECT)
- .addDef(DstExt)
- .addReg(MI.getOperand(1).getReg())
- .addUse(Src1Ext)
- .addUse(Src2Ext);
-
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
- MI.eraseFromParent();
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
+
case TargetOpcode::G_FPTOSI:
- case TargetOpcode::G_FPTOUI: {
+ case TargetOpcode::G_FPTOUI:
if (TypeIdx != 0)
return UnableToLegalize;
-
- unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildInstr(MI.getOpcode())
- .addDef(DstExt)
- .addUse(MI.getOperand(1).getReg());
-
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
- MI.eraseFromParent();
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
+
case TargetOpcode::G_SITOFP:
- case TargetOpcode::G_UITOFP: {
if (TypeIdx != 1)
return UnableToLegalize;
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
+ MIRBuilder.recordInsertion(&MI);
+ return Legalized;
- unsigned Src = MI.getOperand(1).getReg();
- unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
-
- if (MI.getOpcode() == TargetOpcode::G_SITOFP) {
- MIRBuilder.buildSExt(SrcExt, Src);
- } else {
- assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op");
- MIRBuilder.buildZExt(SrcExt, Src);
- }
-
- MIRBuilder.buildInstr(MI.getOpcode())
- .addDef(MI.getOperand(0).getReg())
- .addUse(SrcExt);
-
- MI.eraseFromParent();
+ case TargetOpcode::G_UITOFP:
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
- case TargetOpcode::G_INSERT: {
+
+ case TargetOpcode::G_INSERT:
if (TypeIdx != 0)
return UnableToLegalize;
-
- unsigned Src = MI.getOperand(1).getReg();
- unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildAnyExt(SrcExt, Src);
-
- unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
- auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(),
- MI.getOperand(3).getImm());
- for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) {
- MIB.addReg(MI.getOperand(OpNum).getReg());
- MIB.addImm(MI.getOperand(OpNum + 1).getImm());
- }
-
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
- MI.eraseFromParent();
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
- case TargetOpcode::G_LOAD: {
- assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==
- WideTy.getSizeInBits() &&
- "illegal to increase number of bytes loaded");
-
- unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildLoad(DstExt, MI.getOperand(1).getReg(),
- **MI.memoperands_begin());
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
- MI.eraseFromParent();
+
+ case TargetOpcode::G_LOAD:
+ // For some types, like i24, we might try to widen to i32. To handle this
+ // properly we should be using a dedicated extending load; until then, avoid
+ // trying to legalize.
+ if (alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) !=
+ WideTy.getSizeInBits())
+ return UnableToLegalize;
+ LLVM_FALLTHROUGH;
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
+
case TargetOpcode::G_STORE: {
if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) ||
WideTy != LLT::scalar(8))
return UnableToLegalize;
- auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
- auto Content = TLI.getBooleanContents(false, false);
-
- unsigned ExtOp = TargetOpcode::G_ANYEXT;
- if (Content == TargetLoweringBase::ZeroOrOneBooleanContent)
- ExtOp = TargetOpcode::G_ZEXT;
- else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent)
- ExtOp = TargetOpcode::G_SEXT;
- else
- ExtOp = TargetOpcode::G_ANYEXT;
-
- unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse(
- MI.getOperand(0).getReg());
- MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(),
- **MI.memoperands_begin());
- MI.eraseFromParent();
+ widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ZEXT);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
}
case TargetOpcode::G_CONSTANT: {
- unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildConstant(DstExt, *MI.getOperand(1).getCImm());
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
- MI.eraseFromParent();
+ MachineOperand &SrcMO = MI.getOperand(1);
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
+ const APInt &Val = SrcMO.getCImm()->getValue().sext(WideTy.getSizeInBits());
+ SrcMO.setCImm(ConstantInt::get(Ctx, Val));
+
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
}
case TargetOpcode::G_FCONSTANT: {
- unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
- const ConstantFP *CFP = MI.getOperand(1).getFPImm();
- APFloat Val = CFP->getValueAPF();
+ MachineOperand &SrcMO = MI.getOperand(1);
LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
- auto LLT2Sem = [](LLT Ty) {
- switch (Ty.getSizeInBits()) {
- case 32:
- return &APFloat::IEEEsingle();
- break;
- case 64:
- return &APFloat::IEEEdouble();
- break;
- default:
- llvm_unreachable("Unhandled fp widen type");
- }
- };
+ APFloat Val = SrcMO.getFPImm()->getValueAPF();
bool LosesInfo;
- Val.convert(*LLT2Sem(WideTy), APFloat::rmTowardZero, &LosesInfo);
- MIRBuilder.buildFConstant(DstExt, *ConstantFP::get(Ctx, Val));
- MIRBuilder.buildFPTrunc(MI.getOperand(0).getReg(), DstExt);
- MI.eraseFromParent();
+ switch (WideTy.getSizeInBits()) {
+ case 32:
+ Val.convert(APFloat::IEEEsingle(), APFloat::rmTowardZero, &LosesInfo);
+ break;
+ case 64:
+ Val.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo);
+ break;
+ default:
+ llvm_unreachable("Unhandled fp widen type");
+ }
+ SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
+
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
}
- case TargetOpcode::G_BRCOND: {
- unsigned TstExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildAnyExt(TstExt, MI.getOperand(0).getReg());
- MIRBuilder.buildBrCond(TstExt, *MI.getOperand(1).getMBB());
- MI.eraseFromParent();
+ case TargetOpcode::G_BRCOND:
+ widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
- case TargetOpcode::G_FCMP: {
- unsigned Op0Ext, Op1Ext, DstReg;
- unsigned Cmp1 = MI.getOperand(2).getReg();
- unsigned Cmp2 = MI.getOperand(3).getReg();
- if (TypeIdx == 0) {
- Op0Ext = Cmp1;
- Op1Ext = Cmp2;
- DstReg = MRI.createGenericVirtualRegister(WideTy);
- } else {
- Op0Ext = MRI.createGenericVirtualRegister(WideTy);
- Op1Ext = MRI.createGenericVirtualRegister(WideTy);
- DstReg = MI.getOperand(0).getReg();
- MIRBuilder.buildInstr(TargetOpcode::G_FPEXT, Op0Ext, Cmp1);
- MIRBuilder.buildInstr(TargetOpcode::G_FPEXT, Op1Ext, Cmp2);
- }
- MIRBuilder.buildFCmp(
- static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()),
- DstReg, Op0Ext, Op1Ext);
+
+ case TargetOpcode::G_FCMP:
if (TypeIdx == 0)
- MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, MI.getOperand(0).getReg(),
- DstReg);
- MI.eraseFromParent();
- return Legalized;
- }
- case TargetOpcode::G_ICMP: {
- bool IsSigned = CmpInst::isSigned(
- static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()));
- unsigned Cmp1 = MI.getOperand(2).getReg();
- unsigned Cmp2 = MI.getOperand(3).getReg();
- unsigned Op0Ext, Op1Ext, DstReg;
- if (TypeIdx == 0) {
- Op0Ext = Cmp1;
- Op1Ext = Cmp2;
- DstReg = MRI.createGenericVirtualRegister(WideTy);
- } else {
- Op0Ext = MRI.createGenericVirtualRegister(WideTy);
- Op1Ext = MRI.createGenericVirtualRegister(WideTy);
- DstReg = MI.getOperand(0).getReg();
- if (IsSigned) {
- MIRBuilder.buildSExt(Op0Ext, Cmp1);
- MIRBuilder.buildSExt(Op1Ext, Cmp2);
- } else {
- MIRBuilder.buildZExt(Op0Ext, Cmp1);
- MIRBuilder.buildZExt(Op1Ext, Cmp2);
- }
+ widenScalarDst(MI, WideTy);
+ else {
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
}
- MIRBuilder.buildICmp(
- static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()),
- DstReg, Op0Ext, Op1Ext);
+ MIRBuilder.recordInsertion(&MI);
+ return Legalized;
+
+ case TargetOpcode::G_ICMP:
if (TypeIdx == 0)
- MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, MI.getOperand(0).getReg(),
- DstReg);
- MI.eraseFromParent();
+ widenScalarDst(MI, WideTy);
+ else {
+ unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
+ MI.getOperand(1).getPredicate()))
+ ? TargetOpcode::G_SEXT
+ : TargetOpcode::G_ZEXT;
+ widenScalarSrc(MI, WideTy, 2, ExtOpcode);
+ widenScalarSrc(MI, WideTy, 3, ExtOpcode);
+ }
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
- case TargetOpcode::G_GEP: {
+
+ case TargetOpcode::G_GEP:
assert(TypeIdx == 1 && "unable to legalize pointer of GEP");
- unsigned OffsetExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildSExt(OffsetExt, MI.getOperand(2).getReg());
- MI.getOperand(2).setReg(OffsetExt);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
- }
+
case TargetOpcode::G_PHI: {
assert(TypeIdx == 0 && "Expecting only Idx 0");
- auto getExtendedReg = [&](unsigned Reg, MachineBasicBlock &MBB) {
- auto FirstTermIt = MBB.getFirstTerminator();
- MIRBuilder.setInsertPt(MBB, FirstTermIt);
- MachineInstr *DefMI = MRI.getVRegDef(Reg);
- MachineInstrBuilder MIB;
- if (DefMI->getOpcode() == TargetOpcode::G_TRUNC)
- MIB = MIRBuilder.buildAnyExtOrTrunc(WideTy,
- DefMI->getOperand(1).getReg());
- else
- MIB = MIRBuilder.buildAnyExt(WideTy, Reg);
- return MIB->getOperand(0).getReg();
- };
- auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, WideTy);
- for (auto OpIt = MI.operands_begin() + 1, OpE = MI.operands_end();
- OpIt != OpE;) {
- unsigned Reg = OpIt++->getReg();
- MachineBasicBlock *OpMBB = OpIt++->getMBB();
- MIB.addReg(getExtendedReg(Reg, *OpMBB));
- MIB.addMBB(OpMBB);
+
+ for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
+ MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
+ MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
+ widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
}
- auto *MBB = MI.getParent();
- MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI());
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(),
- MIB->getOperand(0).getReg());
- MI.eraseFromParent();
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
+ widenScalarDst(MI, WideTy);
+ MIRBuilder.recordInsertion(&MI);
return Legalized;
}
}
@@ -874,11 +904,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
}
ConstantFP &ZeroForNegation =
*cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy));
- unsigned Zero = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildFConstant(Zero, ZeroForNegation);
+ auto Zero = MIRBuilder.buildFConstant(Ty, ZeroForNegation);
MIRBuilder.buildInstr(TargetOpcode::G_FSUB)
.addDef(Res)
- .addUse(Zero)
+ .addUse(Zero->getOperand(0).getReg())
.addUse(MI.getOperand(1).getReg());
MI.eraseFromParent();
return Legalized;
@@ -887,7 +916,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
// Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
// First, check if G_FNEG is marked as Lower. If so, we may
// end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
- if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower)
+ if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
return UnableToLegalize;
unsigned Res = MI.getOperand(0).getReg();
unsigned LHS = MI.getOperand(1).getReg();
@@ -913,6 +942,48 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD: {
+ // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned PtrReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ auto &MMO = **MI.memoperands_begin();
+
+ if (DstTy.getSizeInBits() == MMO.getSize() /* in bytes */ * 8) {
+ // In the case of G_LOAD, this was a non-extending load already and we're
+ // about to lower to the same instruction.
+ if (MI.getOpcode() == TargetOpcode::G_LOAD)
+ return UnableToLegalize;
+ MIRBuilder.buildLoad(DstReg, PtrReg, MMO);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (DstTy.isScalar()) {
+ unsigned TmpReg = MRI.createGenericVirtualRegister(
+ LLT::scalar(MMO.getSize() /* in bytes */ * 8));
+ MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case TargetOpcode::G_LOAD:
+ MIRBuilder.buildAnyExt(DstReg, TmpReg);
+ break;
+ case TargetOpcode::G_SEXTLOAD:
+ MIRBuilder.buildSExt(DstReg, TmpReg);
+ break;
+ case TargetOpcode::G_ZEXTLOAD:
+ MIRBuilder.buildZExt(DstReg, TmpReg);
+ break;
+ }
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+ }
}
}
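
The new G_SEXTLOAD/G_ZEXTLOAD lowering above reduces an extending load to a load of exactly the memory width followed by the matching extension. A compact sketch of the G_ZEXTLOAD shape, using only MachineIRBuilder/MachineRegisterInfo calls that appear in this file; the helper name is invented and the builder is assumed to already be positioned at the instruction:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

static void lowerZExtLoad(MachineIRBuilder &MIRBuilder,
                          MachineRegisterInfo &MRI, unsigned DstReg,
                          unsigned PtrReg, MachineMemOperand &MMO) {
  // Load exactly MMO.getSize() bytes into a scratch vreg of that width...
  unsigned TmpReg =
      MRI.createGenericVirtualRegister(LLT::scalar(MMO.getSize() * 8));
  MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
  // ...then zero-extend into the original (wider) destination.
  MIRBuilder.buildZExt(DstReg, TmpReg);
}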
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index 9c27c59a0654..ae061b64a38c 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -24,12 +24,87 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <map>
+
using namespace llvm;
+using namespace LegalizeActions;
+
+#define DEBUG_TYPE "legalizer-info"
+
+cl::opt<bool> llvm::DisableGISelLegalityCheck(
+ "disable-gisel-legality-check",
+ cl::desc("Don't verify that MIR is fully legal between GlobalISel passes"),
+ cl::Hidden);
+
+raw_ostream &LegalityQuery::print(raw_ostream &OS) const {
+ OS << Opcode << ", Tys={";
+ for (const auto &Type : Types) {
+ OS << Type << ", ";
+ }
+ OS << "}, Opcode=";
+
+ OS << Opcode << ", MMOs={";
+ for (const auto &MMODescr : MMODescrs) {
+ OS << MMODescr.Size << ", ";
+ }
+ OS << "}";
+
+ return OS;
+}
+
+LegalizeActionStep LegalizeRuleSet::apply(const LegalityQuery &Query) const {
+ LLVM_DEBUG(dbgs() << "Applying legalizer ruleset to: "; Query.print(dbgs());
+ dbgs() << "\n");
+ if (Rules.empty()) {
+ LLVM_DEBUG(dbgs() << ".. fallback to legacy rules (no rules defined)\n");
+ return {LegalizeAction::UseLegacyRules, 0, LLT{}};
+ }
+ for (const auto &Rule : Rules) {
+ if (Rule.match(Query)) {
+ LLVM_DEBUG(dbgs() << ".. match\n");
+ std::pair<unsigned, LLT> Mutation = Rule.determineMutation(Query);
+ LLVM_DEBUG(dbgs() << ".. .. " << (unsigned)Rule.getAction() << ", "
+ << Mutation.first << ", " << Mutation.second << "\n");
+ assert((Query.Types[Mutation.first] != Mutation.second ||
+ Rule.getAction() == Lower ||
+ Rule.getAction() == MoreElements ||
+ Rule.getAction() == FewerElements) &&
+ "Simple loop detected");
+ return {Rule.getAction(), Mutation.first, Mutation.second};
+ } else
+ LLVM_DEBUG(dbgs() << ".. no match\n");
+ }
+ LLVM_DEBUG(dbgs() << ".. unsupported\n");
+ return {LegalizeAction::Unsupported, 0, LLT{}};
+}
+
+bool LegalizeRuleSet::verifyTypeIdxsCoverage(unsigned NumTypeIdxs) const {
+#ifndef NDEBUG
+ if (Rules.empty()) {
+ LLVM_DEBUG(
+ dbgs() << ".. type index coverage check SKIPPED: no rules defined\n");
+ return true;
+ }
+ const int64_t FirstUncovered = TypeIdxsCovered.find_first_unset();
+ if (FirstUncovered < 0) {
+ LLVM_DEBUG(dbgs() << ".. type index coverage check SKIPPED:"
+ " user-defined predicate detected\n");
+ return true;
+ }
+ const bool AllCovered = (FirstUncovered >= NumTypeIdxs);
+ LLVM_DEBUG(dbgs() << ".. the first uncovered type index: " << FirstUncovered
+ << ", " << (AllCovered ? "OK" : "FAIL") << "\n");
+ return AllCovered;
+#else
+ return true;
+#endif
+}
LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {
// Set defaults.
@@ -104,15 +179,16 @@ void LegalizerInfo::computeTables() {
if (TypeIdx < ScalarSizeChangeStrategies[OpcodeIdx].size() &&
ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx] != nullptr)
S = ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx];
- std::sort(ScalarSpecifiedActions.begin(), ScalarSpecifiedActions.end());
+ llvm::sort(ScalarSpecifiedActions.begin(),
+ ScalarSpecifiedActions.end());
checkPartialSizeAndActionsVector(ScalarSpecifiedActions);
setScalarAction(Opcode, TypeIdx, S(ScalarSpecifiedActions));
}
// 2. Handle pointer types
for (auto PointerSpecifiedActions : AddressSpace2SpecifiedActions) {
- std::sort(PointerSpecifiedActions.second.begin(),
- PointerSpecifiedActions.second.end());
+ llvm::sort(PointerSpecifiedActions.second.begin(),
+ PointerSpecifiedActions.second.end());
checkPartialSizeAndActionsVector(PointerSpecifiedActions.second);
// For pointer types, we assume that there isn't a meaningful way
// to change the number of bits used in the pointer.
@@ -124,8 +200,8 @@ void LegalizerInfo::computeTables() {
// 3. Handle vector types
SizeAndActionsVec ElementSizesSeen;
for (auto VectorSpecifiedActions : ElemSize2SpecifiedActions) {
- std::sort(VectorSpecifiedActions.second.begin(),
- VectorSpecifiedActions.second.end());
+ llvm::sort(VectorSpecifiedActions.second.begin(),
+ VectorSpecifiedActions.second.end());
const uint16_t ElementSize = VectorSpecifiedActions.first;
ElementSizesSeen.push_back({ElementSize, Legal});
checkPartialSizeAndActionsVector(VectorSpecifiedActions.second);
@@ -143,7 +219,7 @@ void LegalizerInfo::computeTables() {
Opcode, TypeIdx, ElementSize,
moreToWiderTypesAndLessToWidest(NumElementsActions));
}
- std::sort(ElementSizesSeen.begin(), ElementSizesSeen.end());
+ llvm::sort(ElementSizesSeen.begin(), ElementSizesSeen.end());
SizeChangeStrategy VectorElementSizeChangeStrategy =
&unsupportedForDifferentSizes;
if (TypeIdx < VectorElementSizeChangeStrategies[OpcodeIdx].size() &&
@@ -162,8 +238,8 @@ void LegalizerInfo::computeTables() {
// probably going to need specialized lookup structures for various types before
// we have any hope of doing well with something like <13 x i3>. Even the common
// cases should do better than what we have now.
-std::pair<LegalizerInfo::LegalizeAction, LLT>
-LegalizerInfo::getAction(const InstrAspect &Aspect) const {
+std::pair<LegalizeAction, LLT>
+LegalizerInfo::getAspectAction(const InstrAspect &Aspect) const {
assert(TablesInitialized && "backend forgot to call computeTables");
// These *have* to be implemented for now, they're the fundamental basis of
// how everything else is transformed.
@@ -186,9 +262,87 @@ static LLT getTypeFromTypeIdx(const MachineInstr &MI,
return MRI.getType(MI.getOperand(OpIdx).getReg());
}
-std::tuple<LegalizerInfo::LegalizeAction, unsigned, LLT>
+unsigned LegalizerInfo::getOpcodeIdxForOpcode(unsigned Opcode) const {
+ assert(Opcode >= FirstOp && Opcode <= LastOp && "Unsupported opcode");
+ return Opcode - FirstOp;
+}
+
+unsigned LegalizerInfo::getActionDefinitionsIdx(unsigned Opcode) const {
+ unsigned OpcodeIdx = getOpcodeIdxForOpcode(Opcode);
+ if (unsigned Alias = RulesForOpcode[OpcodeIdx].getAlias()) {
+ LLVM_DEBUG(dbgs() << ".. opcode " << Opcode << " is aliased to " << Alias
+ << "\n");
+ OpcodeIdx = getOpcodeIdxForOpcode(Alias);
+ LLVM_DEBUG(dbgs() << ".. opcode " << Alias << " is aliased to "
+ << RulesForOpcode[OpcodeIdx].getAlias() << "\n");
+ assert(RulesForOpcode[OpcodeIdx].getAlias() == 0 && "Cannot chain aliases");
+ }
+
+ return OpcodeIdx;
+}
+
+const LegalizeRuleSet &
+LegalizerInfo::getActionDefinitions(unsigned Opcode) const {
+ unsigned OpcodeIdx = getActionDefinitionsIdx(Opcode);
+ return RulesForOpcode[OpcodeIdx];
+}
+
+LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder(unsigned Opcode) {
+ unsigned OpcodeIdx = getActionDefinitionsIdx(Opcode);
+ auto &Result = RulesForOpcode[OpcodeIdx];
+ assert(!Result.isAliasedByAnother() && "Modifying this opcode will modify aliases");
+ return Result;
+}
+
+LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder(
+ std::initializer_list<unsigned> Opcodes) {
+ unsigned Representative = *Opcodes.begin();
+
+ assert(Opcodes.begin() != Opcodes.end() &&
+ Opcodes.begin() + 1 != Opcodes.end() &&
+ "Initializer list must have at least two opcodes");
+
+ for (auto I = Opcodes.begin() + 1, E = Opcodes.end(); I != E; ++I)
+ aliasActionDefinitions(Representative, *I);
+
+ auto &Return = getActionDefinitionsBuilder(Representative);
+ Return.setIsAliasedByAnother();
+ return Return;
+}
+
+void LegalizerInfo::aliasActionDefinitions(unsigned OpcodeTo,
+ unsigned OpcodeFrom) {
+ assert(OpcodeTo != OpcodeFrom && "Cannot alias to self");
+ assert(OpcodeTo >= FirstOp && OpcodeTo <= LastOp && "Unsupported opcode");
+ const unsigned OpcodeFromIdx = getOpcodeIdxForOpcode(OpcodeFrom);
+ RulesForOpcode[OpcodeFromIdx].aliasTo(OpcodeTo);
+}
+
+LegalizeActionStep
+LegalizerInfo::getAction(const LegalityQuery &Query) const {
+ LegalizeActionStep Step = getActionDefinitions(Query.Opcode).apply(Query);
+ if (Step.Action != LegalizeAction::UseLegacyRules) {
+ return Step;
+ }
+
+ for (unsigned i = 0; i < Query.Types.size(); ++i) {
+ auto Action = getAspectAction({Query.Opcode, i, Query.Types[i]});
+ if (Action.first != Legal) {
+ LLVM_DEBUG(dbgs() << ".. (legacy) Type " << i
+ << " Action=" << (unsigned)Action.first << ", "
+ << Action.second << "\n");
+ return {Action.first, i, Action.second};
+ } else
+ LLVM_DEBUG(dbgs() << ".. (legacy) Type " << i << " Legal\n");
+ }
+ LLVM_DEBUG(dbgs() << ".. (legacy) Legal\n");
+ return {Legal, 0, LLT{}};
+}
+
+LegalizeActionStep
LegalizerInfo::getAction(const MachineInstr &MI,
const MachineRegisterInfo &MRI) const {
+ SmallVector<LLT, 2> Types;
SmallBitVector SeenTypes(8);
const MCOperandInfo *OpInfo = MI.getDesc().OpInfo;
// FIXME: probably we'll need to cache the results here somehow?
@@ -205,16 +359,20 @@ LegalizerInfo::getAction(const MachineInstr &MI,
SeenTypes.set(TypeIdx);
LLT Ty = getTypeFromTypeIdx(MI, MRI, i, TypeIdx);
- auto Action = getAction({MI.getOpcode(), TypeIdx, Ty});
- if (Action.first != Legal)
- return std::make_tuple(Action.first, TypeIdx, Action.second);
+ Types.push_back(Ty);
}
- return std::make_tuple(Legal, 0, LLT{});
+
+ SmallVector<LegalityQuery::MemDesc, 2> MemDescrs;
+ for (const auto &MMO : MI.memoperands())
+ MemDescrs.push_back(
+ {MMO->getSize() /* in bytes */ * 8, MMO->getOrdering()});
+
+ return getAction({MI.getOpcode(), Types, MemDescrs});
}
bool LegalizerInfo::isLegal(const MachineInstr &MI,
const MachineRegisterInfo &MRI) const {
- return std::get<0>(getAction(MI, MRI)) == Legal;
+ return getAction(MI, MRI).Action == Legal;
}
bool LegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -312,17 +470,18 @@ LegalizerInfo::findAction(const SizeAndActionsVec &Vec, const uint32_t Size) {
case Unsupported:
return {Size, Unsupported};
case NotFound:
+ case UseLegacyRules:
llvm_unreachable("NotFound");
}
llvm_unreachable("Action has an unknown enum value");
}
-std::pair<LegalizerInfo::LegalizeAction, LLT>
+std::pair<LegalizeAction, LLT>
LegalizerInfo::findScalarLegalAction(const InstrAspect &Aspect) const {
assert(Aspect.Type.isScalar() || Aspect.Type.isPointer());
if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
return {NotFound, LLT()};
- const unsigned OpcodeIdx = Aspect.Opcode - FirstOp;
+ const unsigned OpcodeIdx = getOpcodeIdxForOpcode(Aspect.Opcode);
if (Aspect.Type.isPointer() &&
AddrSpace2PointerActions[OpcodeIdx].find(Aspect.Type.getAddressSpace()) ==
AddrSpace2PointerActions[OpcodeIdx].end()) {
@@ -346,14 +505,14 @@ LegalizerInfo::findScalarLegalAction(const InstrAspect &Aspect) const {
SizeAndAction.first)};
}
-std::pair<LegalizerInfo::LegalizeAction, LLT>
+std::pair<LegalizeAction, LLT>
LegalizerInfo::findVectorLegalAction(const InstrAspect &Aspect) const {
assert(Aspect.Type.isVector());
// First legalize the vector element size, then legalize the number of
// lanes in the vector.
if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
return {NotFound, Aspect.Type};
- const unsigned OpcodeIdx = Aspect.Opcode - FirstOp;
+ const unsigned OpcodeIdx = getOpcodeIdxForOpcode(Aspect.Opcode);
const unsigned TypeIdx = Aspect.Idx;
if (TypeIdx >= ScalarInVectorActions[OpcodeIdx].size())
return {NotFound, Aspect.Type};
@@ -380,3 +539,53 @@ LegalizerInfo::findVectorLegalAction(const InstrAspect &Aspect) const {
LLT::vector(NumElementsAndAction.first,
IntermediateType.getScalarSizeInBits())};
}
+
+/// \pre Type indices of every opcode form a dense set starting from 0.
+void LegalizerInfo::verify(const MCInstrInfo &MII) const {
+#ifndef NDEBUG
+ std::vector<unsigned> FailedOpcodes;
+ for (unsigned Opcode = FirstOp; Opcode <= LastOp; ++Opcode) {
+ const MCInstrDesc &MCID = MII.get(Opcode);
+ const unsigned NumTypeIdxs = std::accumulate(
+ MCID.opInfo_begin(), MCID.opInfo_end(), 0U,
+ [](unsigned Acc, const MCOperandInfo &OpInfo) {
+ return OpInfo.isGenericType()
+ ? std::max(OpInfo.getGenericTypeIndex() + 1U, Acc)
+ : Acc;
+ });
+ LLVM_DEBUG(dbgs() << MII.getName(Opcode) << " (opcode " << Opcode
+ << "): " << NumTypeIdxs << " type ind"
+ << (NumTypeIdxs == 1 ? "ex" : "ices") << "\n");
+ const LegalizeRuleSet &RuleSet = getActionDefinitions(Opcode);
+ if (!RuleSet.verifyTypeIdxsCoverage(NumTypeIdxs))
+ FailedOpcodes.push_back(Opcode);
+ }
+ if (!FailedOpcodes.empty()) {
+ errs() << "The following opcodes have ill-defined legalization rules:";
+ for (unsigned Opcode : FailedOpcodes)
+ errs() << " " << MII.getName(Opcode);
+ errs() << "\n";
+
+ report_fatal_error("ill-defined LegalizerInfo"
+ ", try -debug-only=legalizer-info for details");
+ }
+#endif
+}
+
+#ifndef NDEBUG
+// FIXME: This should be in the MachineVerifier, but it can't use the
+// LegalizerInfo as it's currently in the separate GlobalISel library.
+// Note that RegBankSelected property already checked in the verifier
+// has the same layering problem, but we only use inline methods so
+// end up not needing to link against the GlobalISel library.
+const MachineInstr *llvm::machineFunctionIsIllegal(const MachineFunction &MF) {
+ if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MachineBasicBlock &MBB : MF)
+ for (const MachineInstr &MI : MBB)
+ if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI))
+ return &MI;
+ }
+ return nullptr;
+}
+#endif
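
With the rule-based interface in place, legality questions are answered by building a LegalityQuery and inspecting the returned LegalizeActionStep, exactly as getAction does for a MachineInstr above. A minimal sketch (function name invented; the query form mirrors the G_FNEG query used in LegalizerHelper):

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

static bool isScalarAddLegal(const LegalizerInfo &LI, unsigned SizeInBits) {
  LegalizeActionStep Step =
      LI.getAction({TargetOpcode::G_ADD, {LLT::scalar(SizeInBits)}});
  return Step.Action == LegalizeActions::Legal;
}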
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index 8e16470b6f90..52b340753a50 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -44,6 +44,11 @@ bool Localizer::shouldLocalize(const MachineInstr &MI) {
}
}
+void Localizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
MachineBasicBlock *&InsertMBB) {
MachineInstr &MIUse = *MOUse.getParent();
@@ -59,7 +64,7 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
MachineFunctionProperties::Property::FailedISel))
return false;
- DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n');
init(MF);
@@ -73,7 +78,7 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
for (MachineInstr &MI : MBB) {
if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
continue;
- DEBUG(dbgs() << "Should localize: " << MI);
+ LLVM_DEBUG(dbgs() << "Should localize: " << MI);
assert(MI.getDesc().getNumDefs() == 1 &&
"More than one definition not supported yet");
unsigned Reg = MI.getOperand(0).getReg();
@@ -85,12 +90,12 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &MOUse = *MOIt++;
// Check if the use is already local.
MachineBasicBlock *InsertMBB;
- DEBUG(MachineInstr &MIUse = *MOUse.getParent();
- dbgs() << "Checking use: " << MIUse
- << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
+ LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
+ dbgs() << "Checking use: " << MIUse
+ << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
if (isLocalUse(MOUse, MI, InsertMBB))
continue;
- DEBUG(dbgs() << "Fixing non-local use\n");
+ LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
Changed = true;
auto MBBAndReg = std::make_pair(InsertMBB, Reg);
auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
@@ -111,10 +116,10 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
LocalizedMI->getOperand(0).setReg(NewReg);
NewVRegIt =
MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
- DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
+ LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
}
- DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second)
+ << '\n');
// Update the user reg.
MOUse.setReg(NewVRegIt->second);
}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 475bb82e5b9c..9df931eb81b3 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -22,96 +22,103 @@
using namespace llvm;
-void MachineIRBuilder::setMF(MachineFunction &MF) {
- this->MF = &MF;
- this->MBB = nullptr;
- this->MRI = &MF.getRegInfo();
- this->TII = MF.getSubtarget().getInstrInfo();
- this->DL = DebugLoc();
- this->II = MachineBasicBlock::iterator();
- this->InsertedInstr = nullptr;
-}
-
-void MachineIRBuilder::setMBB(MachineBasicBlock &MBB) {
- this->MBB = &MBB;
- this->II = MBB.end();
+void MachineIRBuilderBase::setMF(MachineFunction &MF) {
+ State.MF = &MF;
+ State.MBB = nullptr;
+ State.MRI = &MF.getRegInfo();
+ State.TII = MF.getSubtarget().getInstrInfo();
+ State.DL = DebugLoc();
+ State.II = MachineBasicBlock::iterator();
+ State.InsertedInstr = nullptr;
+}
+
+void MachineIRBuilderBase::setMBB(MachineBasicBlock &MBB) {
+ State.MBB = &MBB;
+ State.II = MBB.end();
assert(&getMF() == MBB.getParent() &&
"Basic block is in a different function");
}
-void MachineIRBuilder::setInstr(MachineInstr &MI) {
+void MachineIRBuilderBase::setInstr(MachineInstr &MI) {
assert(MI.getParent() && "Instruction is not part of a basic block");
setMBB(*MI.getParent());
- this->II = MI.getIterator();
+ State.II = MI.getIterator();
}
-void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator II) {
+void MachineIRBuilderBase::setInsertPt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II) {
assert(MBB.getParent() == &getMF() &&
"Basic block is in a different function");
- this->MBB = &MBB;
- this->II = II;
+ State.MBB = &MBB;
+ State.II = II;
}
-void MachineIRBuilder::recordInsertions(
+void MachineIRBuilderBase::recordInsertion(MachineInstr *InsertedInstr) const {
+ if (State.InsertedInstr)
+ State.InsertedInstr(InsertedInstr);
+}
+
+void MachineIRBuilderBase::recordInsertions(
std::function<void(MachineInstr *)> Inserted) {
- InsertedInstr = std::move(Inserted);
+ State.InsertedInstr = std::move(Inserted);
}
-void MachineIRBuilder::stopRecordingInsertions() {
- InsertedInstr = nullptr;
+void MachineIRBuilderBase::stopRecordingInsertions() {
+ State.InsertedInstr = nullptr;
}
//------------------------------------------------------------------------------
// Build instruction variants.
//------------------------------------------------------------------------------
-MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opcode) {
+MachineInstrBuilder MachineIRBuilderBase::buildInstr(unsigned Opcode) {
return insertInstr(buildInstrNoInsert(Opcode));
}
-MachineInstrBuilder MachineIRBuilder::buildInstrNoInsert(unsigned Opcode) {
- MachineInstrBuilder MIB = BuildMI(getMF(), DL, getTII().get(Opcode));
+MachineInstrBuilder MachineIRBuilderBase::buildInstrNoInsert(unsigned Opcode) {
+ MachineInstrBuilder MIB = BuildMI(getMF(), getDL(), getTII().get(Opcode));
return MIB;
}
-
-MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) {
+MachineInstrBuilder MachineIRBuilderBase::insertInstr(MachineInstrBuilder MIB) {
getMBB().insert(getInsertPt(), MIB);
- if (InsertedInstr)
- InsertedInstr(MIB);
+ recordInsertion(MIB);
return MIB;
}
MachineInstrBuilder
-MachineIRBuilder::buildDirectDbgValue(unsigned Reg, const MDNode *Variable,
- const MDNode *Expr) {
+MachineIRBuilderBase::buildDirectDbgValue(unsigned Reg, const MDNode *Variable,
+ const MDNode *Expr) {
assert(isa<DILocalVariable>(Variable) && "not a variable");
assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
- assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
- "Expected inlined-at fields to agree");
- return insertInstr(BuildMI(getMF(), DL, getTII().get(TargetOpcode::DBG_VALUE),
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
+ return insertInstr(BuildMI(getMF(), getDL(),
+ getTII().get(TargetOpcode::DBG_VALUE),
/*IsIndirect*/ false, Reg, Variable, Expr));
}
-MachineInstrBuilder
-MachineIRBuilder::buildIndirectDbgValue(unsigned Reg, const MDNode *Variable,
- const MDNode *Expr) {
+MachineInstrBuilder MachineIRBuilderBase::buildIndirectDbgValue(
+ unsigned Reg, const MDNode *Variable, const MDNode *Expr) {
assert(isa<DILocalVariable>(Variable) && "not a variable");
assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
- assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
- "Expected inlined-at fields to agree");
- return insertInstr(BuildMI(getMF(), DL, getTII().get(TargetOpcode::DBG_VALUE),
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
+ return insertInstr(BuildMI(getMF(), getDL(),
+ getTII().get(TargetOpcode::DBG_VALUE),
/*IsIndirect*/ true, Reg, Variable, Expr));
}
-MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
- const MDNode *Variable,
- const MDNode *Expr) {
+MachineInstrBuilder
+MachineIRBuilderBase::buildFIDbgValue(int FI, const MDNode *Variable,
+ const MDNode *Expr) {
assert(isa<DILocalVariable>(Variable) && "not a variable");
assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
- assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
- "Expected inlined-at fields to agree");
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
return buildInstr(TargetOpcode::DBG_VALUE)
.addFrameIndex(FI)
.addImm(0)
@@ -119,13 +126,13 @@ MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
.addMetadata(Expr);
}
-MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
- const MDNode *Variable,
- const MDNode *Expr) {
+MachineInstrBuilder MachineIRBuilderBase::buildConstDbgValue(
+ const Constant &C, const MDNode *Variable, const MDNode *Expr) {
assert(isa<DILocalVariable>(Variable) && "not a variable");
assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
- assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
- "Expected inlined-at fields to agree");
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
auto MIB = buildInstr(TargetOpcode::DBG_VALUE);
if (auto *CI = dyn_cast<ConstantInt>(&C)) {
if (CI->getBitWidth() > 64)
@@ -142,17 +149,18 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
return MIB.addImm(0).addMetadata(Variable).addMetadata(Expr);
}
-MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) {
- assert(MRI->getType(Res).isPointer() && "invalid operand type");
+MachineInstrBuilder MachineIRBuilderBase::buildFrameIndex(unsigned Res,
+ int Idx) {
+ assert(getMRI()->getType(Res).isPointer() && "invalid operand type");
return buildInstr(TargetOpcode::G_FRAME_INDEX)
.addDef(Res)
.addFrameIndex(Idx);
}
-MachineInstrBuilder MachineIRBuilder::buildGlobalValue(unsigned Res,
- const GlobalValue *GV) {
- assert(MRI->getType(Res).isPointer() && "invalid operand type");
- assert(MRI->getType(Res).getAddressSpace() ==
+MachineInstrBuilder
+MachineIRBuilderBase::buildGlobalValue(unsigned Res, const GlobalValue *GV) {
+ assert(getMRI()->getType(Res).isPointer() && "invalid operand type");
+ assert(getMRI()->getType(Res).getAddressSpace() ==
GV->getType()->getAddressSpace() &&
"address space mismatch");
@@ -161,29 +169,20 @@ MachineInstrBuilder MachineIRBuilder::buildGlobalValue(unsigned Res,
.addGlobalAddress(GV);
}
-MachineInstrBuilder MachineIRBuilder::buildBinaryOp(unsigned Opcode, unsigned Res, unsigned Op0,
- unsigned Op1) {
- assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
+void MachineIRBuilderBase::validateBinaryOp(unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ assert((getMRI()->getType(Res).isScalar() ||
+ getMRI()->getType(Res).isVector()) &&
"invalid operand type");
- assert(MRI->getType(Res) == MRI->getType(Op0) &&
- MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
-
- return buildInstr(Opcode)
- .addDef(Res)
- .addUse(Op0)
- .addUse(Op1);
-}
-
-MachineInstrBuilder MachineIRBuilder::buildAdd(unsigned Res, unsigned Op0,
- unsigned Op1) {
- return buildBinaryOp(TargetOpcode::G_ADD, Res, Op0, Op1);
+ assert(getMRI()->getType(Res) == getMRI()->getType(Op0) &&
+ getMRI()->getType(Res) == getMRI()->getType(Op1) && "type mismatch");
}
-MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
- unsigned Op1) {
- assert(MRI->getType(Res).isPointer() &&
- MRI->getType(Res) == MRI->getType(Op0) && "type mismatch");
- assert(MRI->getType(Op1).isScalar() && "invalid offset type");
+MachineInstrBuilder MachineIRBuilderBase::buildGEP(unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ assert(getMRI()->getType(Res).isPointer() &&
+ getMRI()->getType(Res) == getMRI()->getType(Op0) && "type mismatch");
+ assert(getMRI()->getType(Op1).isScalar() && "invalid offset type");
return buildInstr(TargetOpcode::G_GEP)
.addDef(Res)
@@ -192,8 +191,8 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
}
Optional<MachineInstrBuilder>
-MachineIRBuilder::materializeGEP(unsigned &Res, unsigned Op0,
- const LLT &ValueTy, uint64_t Value) {
+MachineIRBuilderBase::materializeGEP(unsigned &Res, unsigned Op0,
+ const LLT &ValueTy, uint64_t Value) {
assert(Res == 0 && "Res is a result argument");
assert(ValueTy.isScalar() && "invalid offset type");
@@ -202,17 +201,18 @@ MachineIRBuilder::materializeGEP(unsigned &Res, unsigned Op0,
return None;
}
- Res = MRI->createGenericVirtualRegister(MRI->getType(Op0));
- unsigned TmpReg = MRI->createGenericVirtualRegister(ValueTy);
+ Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0));
+ unsigned TmpReg = getMRI()->createGenericVirtualRegister(ValueTy);
buildConstant(TmpReg, Value);
return buildGEP(Res, Op0, TmpReg);
}
-MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0,
- uint32_t NumBits) {
- assert(MRI->getType(Res).isPointer() &&
- MRI->getType(Res) == MRI->getType(Op0) && "type mismatch");
+MachineInstrBuilder MachineIRBuilderBase::buildPtrMask(unsigned Res,
+ unsigned Op0,
+ uint32_t NumBits) {
+ assert(getMRI()->getType(Res).isPointer() &&
+ getMRI()->getType(Res) == getMRI()->getType(Op0) && "type mismatch");
return buildInstr(TargetOpcode::G_PTR_MASK)
.addDef(Res)
@@ -220,92 +220,88 @@ MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0,
.addImm(NumBits);
}
-MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0,
- unsigned Op1) {
- return buildBinaryOp(TargetOpcode::G_SUB, Res, Op0, Op1);
-}
-
-MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0,
- unsigned Op1) {
- return buildBinaryOp(TargetOpcode::G_MUL, Res, Op0, Op1);
-}
-
-MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0,
- unsigned Op1) {
- return buildBinaryOp(TargetOpcode::G_AND, Res, Op0, Op1);
-}
-
-MachineInstrBuilder MachineIRBuilder::buildOr(unsigned Res, unsigned Op0,
- unsigned Op1) {
- return buildBinaryOp(TargetOpcode::G_OR, Res, Op0, Op1);
-}
-
-MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
+MachineInstrBuilder MachineIRBuilderBase::buildBr(MachineBasicBlock &Dest) {
return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);
}
-MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) {
- assert(MRI->getType(Tgt).isPointer() && "invalid branch destination");
+MachineInstrBuilder MachineIRBuilderBase::buildBrIndirect(unsigned Tgt) {
+ assert(getMRI()->getType(Tgt).isPointer() && "invalid branch destination");
return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt);
}
-MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) {
- assert(MRI->getType(Res) == LLT() || MRI->getType(Op) == LLT() ||
- MRI->getType(Res) == MRI->getType(Op));
+MachineInstrBuilder MachineIRBuilderBase::buildCopy(unsigned Res, unsigned Op) {
+ assert(getMRI()->getType(Res) == LLT() || getMRI()->getType(Op) == LLT() ||
+ getMRI()->getType(Res) == getMRI()->getType(Op));
return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildConstant(unsigned Res,
- const ConstantInt &Val) {
- LLT Ty = MRI->getType(Res);
+MachineInstrBuilder
+MachineIRBuilderBase::buildConstant(unsigned Res, const ConstantInt &Val) {
+ LLT Ty = getMRI()->getType(Res);
assert((Ty.isScalar() || Ty.isPointer()) && "invalid operand type");
const ConstantInt *NewVal = &Val;
if (Ty.getSizeInBits() != Val.getBitWidth())
- NewVal = ConstantInt::get(MF->getFunction().getContext(),
+ NewVal = ConstantInt::get(getMF().getFunction().getContext(),
Val.getValue().sextOrTrunc(Ty.getSizeInBits()));
return buildInstr(TargetOpcode::G_CONSTANT).addDef(Res).addCImm(NewVal);
}
-MachineInstrBuilder MachineIRBuilder::buildConstant(unsigned Res,
- int64_t Val) {
- auto IntN = IntegerType::get(MF->getFunction().getContext(),
- MRI->getType(Res).getSizeInBits());
+MachineInstrBuilder MachineIRBuilderBase::buildConstant(unsigned Res,
+ int64_t Val) {
+ auto IntN = IntegerType::get(getMF().getFunction().getContext(),
+ getMRI()->getType(Res).getSizeInBits());
ConstantInt *CI = ConstantInt::get(IntN, Val, true);
return buildConstant(Res, *CI);
}
-MachineInstrBuilder MachineIRBuilder::buildFConstant(unsigned Res,
- const ConstantFP &Val) {
- assert(MRI->getType(Res).isScalar() && "invalid operand type");
+MachineInstrBuilder
+MachineIRBuilderBase::buildFConstant(unsigned Res, const ConstantFP &Val) {
+ assert(getMRI()->getType(Res).isScalar() && "invalid operand type");
return buildInstr(TargetOpcode::G_FCONSTANT).addDef(Res).addFPImm(&Val);
}
-MachineInstrBuilder MachineIRBuilder::buildBrCond(unsigned Tst,
- MachineBasicBlock &Dest) {
- assert(MRI->getType(Tst).isScalar() && "invalid operand type");
+MachineInstrBuilder MachineIRBuilderBase::buildFConstant(unsigned Res,
+ double Val) {
+ LLT DstTy = getMRI()->getType(Res);
+ auto &Ctx = getMF().getFunction().getContext();
+ auto *CFP =
+ ConstantFP::get(Ctx, getAPFloatFromSize(Val, DstTy.getSizeInBits()));
+ return buildFConstant(Res, *CFP);
+}
+
+MachineInstrBuilder MachineIRBuilderBase::buildBrCond(unsigned Tst,
+ MachineBasicBlock &Dest) {
+ assert(getMRI()->getType(Tst).isScalar() && "invalid operand type");
return buildInstr(TargetOpcode::G_BRCOND).addUse(Tst).addMBB(&Dest);
}
-MachineInstrBuilder MachineIRBuilder::buildLoad(unsigned Res, unsigned Addr,
- MachineMemOperand &MMO) {
- assert(MRI->getType(Res).isValid() && "invalid operand type");
- assert(MRI->getType(Addr).isPointer() && "invalid operand type");
+MachineInstrBuilder MachineIRBuilderBase::buildLoad(unsigned Res, unsigned Addr,
+ MachineMemOperand &MMO) {
+ return buildLoadInstr(TargetOpcode::G_LOAD, Res, Addr, MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilderBase::buildLoadInstr(unsigned Opcode, unsigned Res,
+ unsigned Addr, MachineMemOperand &MMO) {
+ assert(getMRI()->getType(Res).isValid() && "invalid operand type");
+ assert(getMRI()->getType(Addr).isPointer() && "invalid operand type");
- return buildInstr(TargetOpcode::G_LOAD)
+ return buildInstr(Opcode)
.addDef(Res)
.addUse(Addr)
.addMemOperand(&MMO);
}
-MachineInstrBuilder MachineIRBuilder::buildStore(unsigned Val, unsigned Addr,
- MachineMemOperand &MMO) {
- assert(MRI->getType(Val).isValid() && "invalid operand type");
- assert(MRI->getType(Addr).isPointer() && "invalid operand type");
+MachineInstrBuilder MachineIRBuilderBase::buildStore(unsigned Val,
+ unsigned Addr,
+ MachineMemOperand &MMO) {
+ assert(getMRI()->getType(Val).isValid() && "invalid operand type");
+ assert(getMRI()->getType(Addr).isPointer() && "invalid operand type");
return buildInstr(TargetOpcode::G_STORE)
.addUse(Val)
@@ -313,15 +309,16 @@ MachineInstrBuilder MachineIRBuilder::buildStore(unsigned Val, unsigned Addr,
.addMemOperand(&MMO);
}
-MachineInstrBuilder MachineIRBuilder::buildUAdde(unsigned Res,
- unsigned CarryOut,
- unsigned Op0, unsigned Op1,
- unsigned CarryIn) {
- assert(MRI->getType(Res).isScalar() && "invalid operand type");
- assert(MRI->getType(Res) == MRI->getType(Op0) &&
- MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
- assert(MRI->getType(CarryOut).isScalar() && "invalid operand type");
- assert(MRI->getType(CarryOut) == MRI->getType(CarryIn) && "type mismatch");
+MachineInstrBuilder MachineIRBuilderBase::buildUAdde(unsigned Res,
+ unsigned CarryOut,
+ unsigned Op0, unsigned Op1,
+ unsigned CarryIn) {
+ assert(getMRI()->getType(Res).isScalar() && "invalid operand type");
+ assert(getMRI()->getType(Res) == getMRI()->getType(Op0) &&
+ getMRI()->getType(Res) == getMRI()->getType(Op1) && "type mismatch");
+ assert(getMRI()->getType(CarryOut).isScalar() && "invalid operand type");
+ assert(getMRI()->getType(CarryOut) == getMRI()->getType(CarryIn) &&
+ "type mismatch");
return buildInstr(TargetOpcode::G_UADDE)
.addDef(Res)
@@ -331,58 +328,64 @@ MachineInstrBuilder MachineIRBuilder::buildUAdde(unsigned Res,
.addUse(CarryIn);
}
-MachineInstrBuilder MachineIRBuilder::buildAnyExt(unsigned Res, unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildAnyExt(unsigned Res,
+ unsigned Op) {
validateTruncExt(Res, Op, true);
return buildInstr(TargetOpcode::G_ANYEXT).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildSExt(unsigned Res, unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildSExt(unsigned Res, unsigned Op) {
validateTruncExt(Res, Op, true);
return buildInstr(TargetOpcode::G_SEXT).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildZExt(unsigned Res, unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildZExt(unsigned Res, unsigned Op) {
validateTruncExt(Res, Op, true);
return buildInstr(TargetOpcode::G_ZEXT).addDef(Res).addUse(Op);
}
-MachineInstrBuilder
-MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc, unsigned Res, unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildExtOrTrunc(unsigned ExtOpc,
+ unsigned Res,
+ unsigned Op) {
assert((TargetOpcode::G_ANYEXT == ExtOpc || TargetOpcode::G_ZEXT == ExtOpc ||
TargetOpcode::G_SEXT == ExtOpc) &&
"Expecting Extending Opc");
- assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector());
- assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar());
+ assert(getMRI()->getType(Res).isScalar() ||
+ getMRI()->getType(Res).isVector());
+ assert(getMRI()->getType(Res).isScalar() == getMRI()->getType(Op).isScalar());
unsigned Opcode = TargetOpcode::COPY;
- if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits())
+ if (getMRI()->getType(Res).getSizeInBits() >
+ getMRI()->getType(Op).getSizeInBits())
Opcode = ExtOpc;
- else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits())
+ else if (getMRI()->getType(Res).getSizeInBits() <
+ getMRI()->getType(Op).getSizeInBits())
Opcode = TargetOpcode::G_TRUNC;
else
- assert(MRI->getType(Res) == MRI->getType(Op));
+ assert(getMRI()->getType(Res) == getMRI()->getType(Op));
return buildInstr(Opcode).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res,
- unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildSExtOrTrunc(unsigned Res,
+ unsigned Op) {
return buildExtOrTrunc(TargetOpcode::G_SEXT, Res, Op);
}
-MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res,
- unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildZExtOrTrunc(unsigned Res,
+ unsigned Op) {
return buildExtOrTrunc(TargetOpcode::G_ZEXT, Res, Op);
}
-MachineInstrBuilder MachineIRBuilder::buildAnyExtOrTrunc(unsigned Res,
- unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildAnyExtOrTrunc(unsigned Res,
+ unsigned Op) {
return buildExtOrTrunc(TargetOpcode::G_ANYEXT, Res, Op);
}
-MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) {
- LLT SrcTy = MRI->getType(Src);
- LLT DstTy = MRI->getType(Dst);
+MachineInstrBuilder MachineIRBuilderBase::buildCast(unsigned Dst,
+ unsigned Src) {
+ LLT SrcTy = getMRI()->getType(Src);
+ LLT DstTy = getMRI()->getType(Dst);
if (SrcTy == DstTy)
return buildCopy(Dst, Src);
@@ -399,17 +402,18 @@ MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) {
return buildInstr(Opcode).addDef(Dst).addUse(Src);
}
-MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src,
- uint64_t Index) {
+MachineInstrBuilder
+MachineIRBuilderBase::buildExtract(unsigned Res, unsigned Src, uint64_t Index) {
#ifndef NDEBUG
- assert(MRI->getType(Src).isValid() && "invalid operand type");
- assert(MRI->getType(Res).isValid() && "invalid operand type");
- assert(Index + MRI->getType(Res).getSizeInBits() <=
- MRI->getType(Src).getSizeInBits() &&
+ assert(getMRI()->getType(Src).isValid() && "invalid operand type");
+ assert(getMRI()->getType(Res).isValid() && "invalid operand type");
+ assert(Index + getMRI()->getType(Res).getSizeInBits() <=
+ getMRI()->getType(Src).getSizeInBits() &&
"extracting off end of register");
#endif
- if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) {
+ if (getMRI()->getType(Res).getSizeInBits() ==
+ getMRI()->getType(Src).getSizeInBits()) {
assert(Index == 0 && "insertion past the end of a register");
return buildCast(Res, Src);
}
@@ -420,25 +424,25 @@ MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src,
.addImm(Index);
}
-void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops,
- ArrayRef<uint64_t> Indices) {
+void MachineIRBuilderBase::buildSequence(unsigned Res, ArrayRef<unsigned> Ops,
+ ArrayRef<uint64_t> Indices) {
#ifndef NDEBUG
assert(Ops.size() == Indices.size() && "incompatible args");
assert(!Ops.empty() && "invalid trivial sequence");
assert(std::is_sorted(Indices.begin(), Indices.end()) &&
"sequence offsets must be in ascending order");
- assert(MRI->getType(Res).isValid() && "invalid operand type");
+ assert(getMRI()->getType(Res).isValid() && "invalid operand type");
for (auto Op : Ops)
- assert(MRI->getType(Op).isValid() && "invalid operand type");
+ assert(getMRI()->getType(Op).isValid() && "invalid operand type");
#endif
- LLT ResTy = MRI->getType(Res);
- LLT OpTy = MRI->getType(Ops[0]);
+ LLT ResTy = getMRI()->getType(Res);
+ LLT OpTy = getMRI()->getType(Ops[0]);
unsigned OpSize = OpTy.getSizeInBits();
bool MaybeMerge = true;
for (unsigned i = 0; i < Ops.size(); ++i) {
- if (MRI->getType(Ops[i]) != OpTy || Indices[i] != i * OpSize) {
+ if (getMRI()->getType(Ops[i]) != OpTy || Indices[i] != i * OpSize) {
MaybeMerge = false;
break;
}
@@ -449,31 +453,32 @@ void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops,
return;
}
- unsigned ResIn = MRI->createGenericVirtualRegister(ResTy);
+ unsigned ResIn = getMRI()->createGenericVirtualRegister(ResTy);
buildUndef(ResIn);
for (unsigned i = 0; i < Ops.size(); ++i) {
- unsigned ResOut =
- i + 1 == Ops.size() ? Res : MRI->createGenericVirtualRegister(ResTy);
+ unsigned ResOut = i + 1 == Ops.size()
+ ? Res
+ : getMRI()->createGenericVirtualRegister(ResTy);
buildInsert(ResOut, ResIn, Ops[i], Indices[i]);
ResIn = ResOut;
}
}
-MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) {
+MachineInstrBuilder MachineIRBuilderBase::buildUndef(unsigned Res) {
return buildInstr(TargetOpcode::G_IMPLICIT_DEF).addDef(Res);
}
-MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res,
- ArrayRef<unsigned> Ops) {
+MachineInstrBuilder MachineIRBuilderBase::buildMerge(unsigned Res,
+ ArrayRef<unsigned> Ops) {
#ifndef NDEBUG
assert(!Ops.empty() && "invalid trivial sequence");
- LLT Ty = MRI->getType(Ops[0]);
+ LLT Ty = getMRI()->getType(Ops[0]);
for (auto Reg : Ops)
- assert(MRI->getType(Reg) == Ty && "type mismatch in input list");
- assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() ==
- MRI->getType(Res).getSizeInBits() &&
+ assert(getMRI()->getType(Reg) == Ty && "type mismatch in input list");
+ assert(Ops.size() * getMRI()->getType(Ops[0]).getSizeInBits() ==
+ getMRI()->getType(Res).getSizeInBits() &&
"input operands do not cover output register");
#endif
@@ -487,16 +492,16 @@ MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res,
return MIB;
}
-MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res,
- unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildUnmerge(ArrayRef<unsigned> Res,
+ unsigned Op) {
#ifndef NDEBUG
assert(!Res.empty() && "invalid trivial sequence");
- LLT Ty = MRI->getType(Res[0]);
+ LLT Ty = getMRI()->getType(Res[0]);
for (auto Reg : Res)
- assert(MRI->getType(Reg) == Ty && "type mismatch in input list");
- assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() ==
- MRI->getType(Op).getSizeInBits() &&
+ assert(getMRI()->getType(Reg) == Ty && "type mismatch in input list");
+ assert(Res.size() * getMRI()->getType(Res[0]).getSizeInBits() ==
+ getMRI()->getType(Op).getSizeInBits() &&
"input operands do not cover output register");
#endif
@@ -507,13 +512,15 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res,
return MIB;
}
-MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src,
- unsigned Op, unsigned Index) {
- assert(Index + MRI->getType(Op).getSizeInBits() <=
- MRI->getType(Res).getSizeInBits() &&
+MachineInstrBuilder MachineIRBuilderBase::buildInsert(unsigned Res,
+ unsigned Src, unsigned Op,
+ unsigned Index) {
+ assert(Index + getMRI()->getType(Op).getSizeInBits() <=
+ getMRI()->getType(Res).getSizeInBits() &&
"insertion past the end of a register");
- if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) {
+ if (getMRI()->getType(Res).getSizeInBits() ==
+ getMRI()->getType(Op).getSizeInBits()) {
return buildCast(Res, Op);
}
@@ -524,9 +531,9 @@ MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src,
.addImm(Index);
}
-MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
- unsigned Res,
- bool HasSideEffects) {
+MachineInstrBuilder MachineIRBuilderBase::buildIntrinsic(Intrinsic::ID ID,
+ unsigned Res,
+ bool HasSideEffects) {
auto MIB =
buildInstr(HasSideEffects ? TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS
: TargetOpcode::G_INTRINSIC);
@@ -536,28 +543,30 @@ MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
return MIB;
}
-MachineInstrBuilder MachineIRBuilder::buildTrunc(unsigned Res, unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildTrunc(unsigned Res,
+ unsigned Op) {
validateTruncExt(Res, Op, false);
return buildInstr(TargetOpcode::G_TRUNC).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildFPTrunc(unsigned Res, unsigned Op) {
+MachineInstrBuilder MachineIRBuilderBase::buildFPTrunc(unsigned Res,
+ unsigned Op) {
validateTruncExt(Res, Op, false);
return buildInstr(TargetOpcode::G_FPTRUNC).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred,
- unsigned Res, unsigned Op0,
- unsigned Op1) {
+MachineInstrBuilder MachineIRBuilderBase::buildICmp(CmpInst::Predicate Pred,
+ unsigned Res, unsigned Op0,
+ unsigned Op1) {
#ifndef NDEBUG
- assert(MRI->getType(Op0) == MRI->getType(Op0) && "type mismatch");
+ assert(getMRI()->getType(Op0) == getMRI()->getType(Op0) && "type mismatch");
assert(CmpInst::isIntPredicate(Pred) && "invalid predicate");
- if (MRI->getType(Op0).isScalar() || MRI->getType(Op0).isPointer())
- assert(MRI->getType(Res).isScalar() && "type mismatch");
+ if (getMRI()->getType(Op0).isScalar() || getMRI()->getType(Op0).isPointer())
+ assert(getMRI()->getType(Res).isScalar() && "type mismatch");
else
- assert(MRI->getType(Res).isVector() &&
- MRI->getType(Res).getNumElements() ==
- MRI->getType(Op0).getNumElements() &&
+ assert(getMRI()->getType(Res).isVector() &&
+ getMRI()->getType(Res).getNumElements() ==
+ getMRI()->getType(Op0).getNumElements() &&
"type mismatch");
#endif
@@ -568,20 +577,21 @@ MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred,
.addUse(Op1);
}
-MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred,
- unsigned Res, unsigned Op0,
- unsigned Op1) {
+MachineInstrBuilder MachineIRBuilderBase::buildFCmp(CmpInst::Predicate Pred,
+ unsigned Res, unsigned Op0,
+ unsigned Op1) {
#ifndef NDEBUG
- assert((MRI->getType(Op0).isScalar() || MRI->getType(Op0).isVector()) &&
+ assert((getMRI()->getType(Op0).isScalar() ||
+ getMRI()->getType(Op0).isVector()) &&
"invalid operand type");
- assert(MRI->getType(Op0) == MRI->getType(Op1) && "type mismatch");
+ assert(getMRI()->getType(Op0) == getMRI()->getType(Op1) && "type mismatch");
assert(CmpInst::isFPPredicate(Pred) && "invalid predicate");
- if (MRI->getType(Op0).isScalar())
- assert(MRI->getType(Res).isScalar() && "type mismatch");
+ if (getMRI()->getType(Op0).isScalar())
+ assert(getMRI()->getType(Res).isScalar() && "type mismatch");
else
- assert(MRI->getType(Res).isVector() &&
- MRI->getType(Res).getNumElements() ==
- MRI->getType(Op0).getNumElements() &&
+ assert(getMRI()->getType(Res).isVector() &&
+ getMRI()->getType(Res).getNumElements() ==
+ getMRI()->getType(Op0).getNumElements() &&
"type mismatch");
#endif
@@ -592,21 +602,23 @@ MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred,
.addUse(Op1);
}
-MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
- unsigned Op0, unsigned Op1) {
+MachineInstrBuilder MachineIRBuilderBase::buildSelect(unsigned Res,
+ unsigned Tst,
+ unsigned Op0,
+ unsigned Op1) {
#ifndef NDEBUG
- LLT ResTy = MRI->getType(Res);
+ LLT ResTy = getMRI()->getType(Res);
assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) &&
"invalid operand type");
- assert(ResTy == MRI->getType(Op0) && ResTy == MRI->getType(Op1) &&
+ assert(ResTy == getMRI()->getType(Op0) && ResTy == getMRI()->getType(Op1) &&
"type mismatch");
if (ResTy.isScalar() || ResTy.isPointer())
- assert(MRI->getType(Tst).isScalar() && "type mismatch");
+ assert(getMRI()->getType(Tst).isScalar() && "type mismatch");
else
- assert((MRI->getType(Tst).isScalar() ||
- (MRI->getType(Tst).isVector() &&
- MRI->getType(Tst).getNumElements() ==
- MRI->getType(Op0).getNumElements())) &&
+ assert((getMRI()->getType(Tst).isScalar() ||
+ (getMRI()->getType(Tst).isVector() &&
+ getMRI()->getType(Tst).getNumElements() ==
+ getMRI()->getType(Op0).getNumElements())) &&
"type mismatch");
#endif
@@ -617,15 +629,14 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
.addUse(Op1);
}
-MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res,
- unsigned Val,
- unsigned Elt,
- unsigned Idx) {
+MachineInstrBuilder
+MachineIRBuilderBase::buildInsertVectorElement(unsigned Res, unsigned Val,
+ unsigned Elt, unsigned Idx) {
#ifndef NDEBUG
- LLT ResTy = MRI->getType(Res);
- LLT ValTy = MRI->getType(Val);
- LLT EltTy = MRI->getType(Elt);
- LLT IdxTy = MRI->getType(Idx);
+ LLT ResTy = getMRI()->getType(Res);
+ LLT ValTy = getMRI()->getType(Val);
+ LLT EltTy = getMRI()->getType(Elt);
+ LLT IdxTy = getMRI()->getType(Idx);
assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type");
assert(IdxTy.isScalar() && "invalid operand type");
assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch");
@@ -639,13 +650,13 @@ MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res,
.addUse(Idx);
}
-MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res,
- unsigned Val,
- unsigned Idx) {
+MachineInstrBuilder
+MachineIRBuilderBase::buildExtractVectorElement(unsigned Res, unsigned Val,
+ unsigned Idx) {
#ifndef NDEBUG
- LLT ResTy = MRI->getType(Res);
- LLT ValTy = MRI->getType(Val);
- LLT IdxTy = MRI->getType(Idx);
+ LLT ResTy = getMRI()->getType(Res);
+ LLT ValTy = getMRI()->getType(Val);
+ LLT IdxTy = getMRI()->getType(Idx);
assert(ValTy.isVector() && "invalid operand type");
assert((ResTy.isScalar() || ResTy.isPointer()) && "invalid operand type");
assert(IdxTy.isScalar() && "invalid operand type");
@@ -658,15 +669,42 @@ MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res,
.addUse(Idx);
}
+MachineInstrBuilder MachineIRBuilderBase::buildAtomicCmpXchgWithSuccess(
+ unsigned OldValRes, unsigned SuccessRes, unsigned Addr, unsigned CmpVal,
+ unsigned NewVal, MachineMemOperand &MMO) {
+#ifndef NDEBUG
+ LLT OldValResTy = getMRI()->getType(OldValRes);
+ LLT SuccessResTy = getMRI()->getType(SuccessRes);
+ LLT AddrTy = getMRI()->getType(Addr);
+ LLT CmpValTy = getMRI()->getType(CmpVal);
+ LLT NewValTy = getMRI()->getType(NewVal);
+ assert(OldValResTy.isScalar() && "invalid operand type");
+ assert(SuccessResTy.isScalar() && "invalid operand type");
+ assert(AddrTy.isPointer() && "invalid operand type");
+ assert(CmpValTy.isValid() && "invalid operand type");
+ assert(NewValTy.isValid() && "invalid operand type");
+ assert(OldValResTy == CmpValTy && "type mismatch");
+ assert(OldValResTy == NewValTy && "type mismatch");
+#endif
+
+ return buildInstr(TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS)
+ .addDef(OldValRes)
+ .addDef(SuccessRes)
+ .addUse(Addr)
+ .addUse(CmpVal)
+ .addUse(NewVal)
+ .addMemOperand(&MMO);
+}
+
MachineInstrBuilder
-MachineIRBuilder::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr,
- unsigned CmpVal, unsigned NewVal,
- MachineMemOperand &MMO) {
+MachineIRBuilderBase::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr,
+ unsigned CmpVal, unsigned NewVal,
+ MachineMemOperand &MMO) {
#ifndef NDEBUG
- LLT OldValResTy = MRI->getType(OldValRes);
- LLT AddrTy = MRI->getType(Addr);
- LLT CmpValTy = MRI->getType(CmpVal);
- LLT NewValTy = MRI->getType(NewVal);
+ LLT OldValResTy = getMRI()->getType(OldValRes);
+ LLT AddrTy = getMRI()->getType(Addr);
+ LLT CmpValTy = getMRI()->getType(CmpVal);
+ LLT NewValTy = getMRI()->getType(NewVal);
assert(OldValResTy.isScalar() && "invalid operand type");
assert(AddrTy.isPointer() && "invalid operand type");
assert(CmpValTy.isValid() && "invalid operand type");
@@ -683,14 +721,102 @@ MachineIRBuilder::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr,
.addMemOperand(&MMO);
}
-void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src,
- bool IsExtend) {
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMW(unsigned Opcode, unsigned OldValRes,
+ unsigned Addr, unsigned Val,
+ MachineMemOperand &MMO) {
+#ifndef NDEBUG
+ LLT OldValResTy = getMRI()->getType(OldValRes);
+ LLT AddrTy = getMRI()->getType(Addr);
+ LLT ValTy = getMRI()->getType(Val);
+ assert(OldValResTy.isScalar() && "invalid operand type");
+ assert(AddrTy.isPointer() && "invalid operand type");
+ assert(ValTy.isValid() && "invalid operand type");
+ assert(OldValResTy == ValTy && "type mismatch");
+#endif
+
+ return buildInstr(Opcode)
+ .addDef(OldValRes)
+ .addUse(Addr)
+ .addUse(Val)
+ .addMemOperand(&MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWXchg(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XCHG, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWAdd(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_ADD, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWSub(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_SUB, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWAnd(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_AND, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWNand(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_NAND, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWOr(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_OR, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWXor(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XOR, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWMax(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MAX, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWMin(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MIN, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWUmax(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMAX, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilderBase::buildAtomicRMWUmin(unsigned OldValRes, unsigned Addr,
+ unsigned Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMIN, OldValRes, Addr, Val,
+ MMO);
+}
+
+void MachineIRBuilderBase::validateTruncExt(unsigned Dst, unsigned Src,
+ bool IsExtend) {
#ifndef NDEBUG
- LLT SrcTy = MRI->getType(Src);
- LLT DstTy = MRI->getType(Dst);
+ LLT SrcTy = getMRI()->getType(Src);
+ LLT DstTy = getMRI()->getType(Dst);
if (DstTy.isVector()) {
- assert(SrcTy.isVector() && "mismatched cast between vecot and non-vector");
+ assert(SrcTy.isVector() && "mismatched cast between vector and non-vector");
assert(SrcTy.getNumElements() == DstTy.getNumElements() &&
"different number of elements in a trunc/ext");
} else
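
The MachineIRBuilder refactor above moves the mutable insertion state into MachineIRBuilderBase::State, routes member access through getMF()/getMRI()/getDL(), and adds a buildFConstant(unsigned, double) overload. A short sketch of driving the builder after this change; it assumes setMF has already been called for the enclosing function, and the register/type choices are illustrative.

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

static void emitExample(MachineIRBuilder &MIRBuilder, MachineBasicBlock &MBB) {
  MIRBuilder.setMBB(MBB); // fills State.MBB and sets State.II to MBB.end()

  LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  unsigned CstReg = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildConstant(CstReg, 42);  // G_CONSTANT; DebugLoc comes from getDL()

  unsigned FPReg = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildFConstant(FPReg, 1.0); // new overload: converts the double to a
                                         // 32-bit APFloat via getAPFloatFromSize
}
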
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 006c9ea23034..9e2d48d1dc42 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
@@ -75,7 +76,7 @@ RegBankSelect::RegBankSelect(Mode RunningMode)
if (RegBankSelectMode.getNumOccurrences() != 0) {
OptMode = RegBankSelectMode;
if (RegBankSelectMode != RunningMode)
- DEBUG(dbgs() << "RegBankSelect mode overrided by command line\n");
+ LLVM_DEBUG(dbgs() << "RegBankSelect mode overrided by command line\n");
}
}
@@ -104,6 +105,7 @@ void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineBranchProbabilityInfo>();
}
AU.addRequired<TargetPassConfig>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -122,11 +124,11 @@ bool RegBankSelect::assignmentMatch(
// Reg is free of assignment, a simple assignment will make the
// register bank to match.
OnlyAssign = CurRegBank == nullptr;
- DEBUG(dbgs() << "Does assignment already match: ";
- if (CurRegBank) dbgs() << *CurRegBank; else dbgs() << "none";
- dbgs() << " against ";
- assert(DesiredRegBrank && "The mapping must be valid");
- dbgs() << *DesiredRegBrank << '\n';);
+ LLVM_DEBUG(dbgs() << "Does assignment already match: ";
+ if (CurRegBank) dbgs() << *CurRegBank; else dbgs() << "none";
+ dbgs() << " against ";
+ assert(DesiredRegBrank && "The mapping must be valid");
+ dbgs() << *DesiredRegBrank << '\n';);
return CurRegBank == DesiredRegBrank;
}
@@ -159,8 +161,8 @@ bool RegBankSelect::repairReg(
// same types because the type is a placeholder when this function is called.
MachineInstr *MI =
MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY).addDef(Dst).addUse(Src);
- DEBUG(dbgs() << "Copy: " << printReg(Src) << " to: " << printReg(Dst)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Copy: " << printReg(Src) << " to: " << printReg(Dst)
+ << '\n');
// TODO:
// Check if MI is legal. if not, we need to legalize all the
// instructions we are going to insert.
@@ -245,7 +247,7 @@ const RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping(
MappingCost CurCost =
computeMapping(MI, *CurMapping, LocalRepairPts, &Cost);
if (CurCost < Cost) {
- DEBUG(dbgs() << "New best: " << CurCost << '\n');
+ LLVM_DEBUG(dbgs() << "New best: " << CurCost << '\n');
Cost = CurCost;
BestMapping = CurMapping;
RepairPts.clear();
@@ -397,11 +399,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent()) : 1);
bool Saturated = Cost.addLocalCost(InstrMapping.getCost());
assert(!Saturated && "Possible mapping saturated the cost");
- DEBUG(dbgs() << "Evaluating mapping cost for: " << MI);
- DEBUG(dbgs() << "With: " << InstrMapping << '\n');
+ LLVM_DEBUG(dbgs() << "Evaluating mapping cost for: " << MI);
+ LLVM_DEBUG(dbgs() << "With: " << InstrMapping << '\n');
RepairPts.clear();
if (BestCost && Cost > *BestCost) {
- DEBUG(dbgs() << "Mapping is too expensive from the start\n");
+ LLVM_DEBUG(dbgs() << "Mapping is too expensive from the start\n");
return Cost;
}
@@ -417,17 +419,17 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- DEBUG(dbgs() << "Opd" << OpIdx << '\n');
+ LLVM_DEBUG(dbgs() << "Opd" << OpIdx << '\n');
const RegisterBankInfo::ValueMapping &ValMapping =
InstrMapping.getOperandMapping(OpIdx);
// If Reg is already properly mapped, this is free.
bool Assign;
if (assignmentMatch(Reg, ValMapping, Assign)) {
- DEBUG(dbgs() << "=> is free (match).\n");
+ LLVM_DEBUG(dbgs() << "=> is free (match).\n");
continue;
}
if (Assign) {
- DEBUG(dbgs() << "=> is free (simple assignment).\n");
+ LLVM_DEBUG(dbgs() << "=> is free (simple assignment).\n");
RepairPts.emplace_back(RepairingPlacement(MI, OpIdx, *TRI, *this,
RepairingPlacement::Reassign));
continue;
@@ -446,7 +448,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
// Check that the materialization of the repairing is possible.
if (!RepairPt.canMaterialize()) {
- DEBUG(dbgs() << "Mapping involves impossible repairing\n");
+ LLVM_DEBUG(dbgs() << "Mapping involves impossible repairing\n");
return MappingCost::ImpossibleCost();
}
@@ -473,7 +475,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
// This is an impossible to repair cost.
if (RepairCost == std::numeric_limits<unsigned>::max())
- continue;
+ return MappingCost::ImpossibleCost();
// Bias used for splitting: 5%.
const uint64_t PercentageForBias = 5;
@@ -509,7 +511,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
// Stop looking into what it takes to repair, this is already
// too expensive.
if (BestCost && Cost > *BestCost) {
- DEBUG(dbgs() << "Mapping is too expensive, stop processing\n");
+ LLVM_DEBUG(dbgs() << "Mapping is too expensive, stop processing\n");
return Cost;
}
@@ -519,7 +521,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
break;
}
}
- DEBUG(dbgs() << "Total cost is: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Total cost is: " << Cost << "\n");
return Cost;
}
@@ -559,14 +561,14 @@ bool RegBankSelect::applyMapping(
}
// Second, rewrite the instruction.
- DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n');
+ LLVM_DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n');
RBI->applyMapping(OpdMapper);
return true;
}
bool RegBankSelect::assignInstr(MachineInstr &MI) {
- DEBUG(dbgs() << "Assign: " << MI);
+ LLVM_DEBUG(dbgs() << "Assign: " << MI);
// Remember the repairing placement for all the operands.
SmallVector<RepairingPlacement, 4> RepairPts;
@@ -587,7 +589,7 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) {
// Make sure the mapping is valid for MI.
assert(BestMapping->verify(MI) && "Invalid instruction mapping");
- DEBUG(dbgs() << "Best Mapping: " << *BestMapping << '\n');
+ LLVM_DEBUG(dbgs() << "Best Mapping: " << *BestMapping << '\n');
// After this call, MI may not be valid anymore.
// Do not use it.
@@ -600,7 +602,7 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
MachineFunctionProperties::Property::FailedISel))
return false;
- DEBUG(dbgs() << "Assign register banks for: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Assign register banks for: " << MF.getName() << '\n');
const Function &F = MF.getFunction();
Mode SaveOptMode = OptMode;
if (F.hasFnAttribute(Attribute::OptimizeNone))
@@ -610,20 +612,13 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
#ifndef NDEBUG
// Check that our input is fully legal: we require the function to have the
// Legalized property, so it should be.
- // FIXME: This should be in the MachineVerifier, but it can't use the
- // LegalizerInfo as it's currently in the separate GlobalISel library.
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) {
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
- reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
- "instruction is not legal", MI);
- return false;
- }
- }
+ // FIXME: This should be in the MachineVerifier.
+ if (!DisableGISelLegalityCheck)
+ if (const MachineInstr *MI = machineFunctionIsIllegal(MF)) {
+ reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+ "instruction is not legal", *MI);
+ return false;
}
- }
#endif
// Walk the function and assign register banks to all operands.
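
Most of the RegBankSelect hunk is the mechanical DEBUG to LLVM_DEBUG rename. For reference, a minimal sketch of how the renamed macro is used: the statement compiles away in NDEBUG builds and is gated at run time by -debug or -debug-only=<DEBUG_TYPE>; the DEBUG_TYPE string here is illustrative, not taken from the patch.

#define DEBUG_TYPE "example-regbank"

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void traceInstr(const MachineInstr &MI) {
  // Prints only when LLVM is built with assertions and the matching
  // -debug / -debug-only flag is passed to the tool.
  LLVM_DEBUG(dbgs() << "Assign: " << MI);
}
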
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
index 4d3ae69d3a9d..16f67a217ce1 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#define DEBUG_TYPE "registerbank"
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index b3d9209ae6eb..dd15567ef1c1 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -72,7 +73,7 @@ bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
const RegisterBank &RegBank = getRegBank(Idx);
assert(Idx == RegBank.getID() &&
"ID does not match the index in the array");
- DEBUG(dbgs() << "Verify " << RegBank << '\n');
+ LLVM_DEBUG(dbgs() << "Verify " << RegBank << '\n');
assert(RegBank.verify(TRI) && "RegBank is invalid");
}
#endif // NDEBUG
@@ -403,18 +404,18 @@ RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
MachineInstr &MI = OpdMapper.getMI();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
- DEBUG(dbgs() << "Applying default-like mapping\n");
+ LLVM_DEBUG(dbgs() << "Applying default-like mapping\n");
for (unsigned OpIdx = 0,
EndIdx = OpdMapper.getInstrMapping().getNumOperands();
OpIdx != EndIdx; ++OpIdx) {
- DEBUG(dbgs() << "OpIdx " << OpIdx);
+ LLVM_DEBUG(dbgs() << "OpIdx " << OpIdx);
MachineOperand &MO = MI.getOperand(OpIdx);
if (!MO.isReg()) {
- DEBUG(dbgs() << " is not a register, nothing to be done\n");
+ LLVM_DEBUG(dbgs() << " is not a register, nothing to be done\n");
continue;
}
if (!MO.getReg()) {
- DEBUG(dbgs() << " is %%noreg, nothing to be done\n");
+ LLVM_DEBUG(dbgs() << " is %%noreg, nothing to be done\n");
continue;
}
assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns !=
@@ -426,14 +427,14 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
iterator_range<SmallVectorImpl<unsigned>::const_iterator> NewRegs =
OpdMapper.getVRegs(OpIdx);
if (NewRegs.begin() == NewRegs.end()) {
- DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
+ LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
continue;
}
unsigned OrigReg = MO.getReg();
unsigned NewReg = *NewRegs.begin();
- DEBUG(dbgs() << " changed, replace " << printReg(OrigReg, nullptr));
+ LLVM_DEBUG(dbgs() << " changed, replace " << printReg(OrigReg, nullptr));
MO.setReg(NewReg);
- DEBUG(dbgs() << " with " << printReg(NewReg, nullptr));
+ LLVM_DEBUG(dbgs() << " with " << printReg(NewReg, nullptr));
// The OperandsMapper creates plain scalar, we may have to fix that.
// Check if the types match and if not, fix that.
@@ -447,35 +448,27 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
assert(OrigTy.getSizeInBits() <= NewTy.getSizeInBits() &&
"Types with difference size cannot be handled by the default "
"mapping");
- DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to "
- << OrigTy);
+ LLVM_DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to "
+ << OrigTy);
MRI.setType(NewReg, OrigTy);
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
}
unsigned RegisterBankInfo::getSizeInBits(unsigned Reg,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
- const TargetRegisterClass *RC = nullptr;
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
// The size is not directly available for physical registers.
// Instead, we need to access a register class that contains Reg and
// get the size of that register class.
- RC = &getMinimalPhysRegClass(Reg, TRI);
- } else {
- LLT Ty = MRI.getType(Reg);
- unsigned RegSize = Ty.isValid() ? Ty.getSizeInBits() : 0;
- // If Reg is not a generic register, query the register class to
- // get its size.
- if (RegSize)
- return RegSize;
- // Since Reg is not a generic register, it must have a register class.
- RC = MRI.getRegClass(Reg);
+ // Because this is expensive, we'll cache the register class by calling
+ auto *RC = &getMinimalPhysRegClass(Reg, TRI);
+ assert(RC && "Expecting Register class");
+ return TRI.getRegSizeInBits(*RC);
}
- assert(RC && "Unable to deduce the register class");
- return TRI.getRegSizeInBits(*RC);
+ return TRI.getRegSizeInBits(Reg, MRI);
}
//------------------------------------------------------------------------------
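
The last RegisterBankInfo hunk simplifies getSizeInBits so that virtual registers are sized directly by TargetRegisterInfo::getRegSizeInBits(Reg, MRI), which handles both generic (LLT-typed) and register-class-typed vregs. Below is a sketch of the equivalent query; it uses the public TargetRegisterInfo::getMinimalPhysRegClass rather than the file-local helper the patch calls, so treat it as an approximation rather than the patched code itself.

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

static unsigned regSizeInBits(unsigned Reg, const MachineRegisterInfo &MRI,
                              const TargetRegisterInfo &TRI) {
  if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
    // Physical registers carry no size of their own; take it from a
    // register class that contains Reg.
    const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
    return TRI.getRegSizeInBits(*RC);
  }
  // Virtual registers: covers both LLT-typed and class-typed vregs.
  return TRI.getRegSizeInBits(Reg, MRI);
}
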
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index ef990b49aceb..1a5f88743d5f 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -11,12 +11,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -42,20 +44,94 @@ unsigned llvm::constrainRegToClass(MachineRegisterInfo &MRI,
return Reg;
}
-
unsigned llvm::constrainOperandRegClass(
const MachineFunction &MF, const TargetRegisterInfo &TRI,
MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
const RegisterBankInfo &RBI, MachineInstr &InsertPt, const MCInstrDesc &II,
- unsigned Reg, unsigned OpIdx) {
+ const MachineOperand &RegMO, unsigned OpIdx) {
+ unsigned Reg = RegMO.getReg();
// Assume physical registers are properly constrained.
assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
"PhysReg not implemented");
const TargetRegisterClass *RegClass = TII.getRegClass(II, OpIdx, &TRI, MF);
+ // Some of the target independent instructions, like COPY, may not impose any
+ // register class constraints on some of their operands: If it's a use, we can
+ // skip constraining as the instruction defining the register would constrain
+ // it.
+
+ // We can't constrain unallocatable register classes, because we can't create
+ // virtual registers for these classes, so we need to let targets handled this
+ // case.
+ if (RegClass && !RegClass->isAllocatable())
+ RegClass = TRI.getConstrainedRegClassForOperand(RegMO, MRI);
+
+ if (!RegClass) {
+ assert((!isTargetSpecificOpcode(II.getOpcode()) || RegMO.isUse()) &&
+ "Register class constraint is required unless either the "
+ "instruction is target independent or the operand is a use");
+ // FIXME: Just bailing out like this here could be not enough, unless we
+ // expect the users of this function to do the right thing for PHIs and
+ // COPY:
+ // v1 = COPY v0
+ // v2 = COPY v1
+ // v1 here may end up not being constrained at all. Please notice that to
+ // reproduce the issue we likely need a destination pattern of a selection
+ // rule producing such extra copies, not just an input GMIR with them as
+ // every existing target using selectImpl handles copies before calling it
+ // and they never reach this function.
+ return Reg;
+ }
return constrainRegToClass(MRI, TII, RBI, InsertPt, Reg, *RegClass);
}
+bool llvm::constrainSelectedInstRegOperands(MachineInstr &I,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ assert(!isPreISelGenericOpcode(I.getOpcode()) &&
+ "A selected instruction is expected");
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (unsigned OpI = 0, OpE = I.getNumExplicitOperands(); OpI != OpE; ++OpI) {
+ MachineOperand &MO = I.getOperand(OpI);
+
+ // There's nothing to be done on non-register operands.
+ if (!MO.isReg())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Converting operand: " << MO << '\n');
+ assert(MO.isReg() && "Unsupported non-reg operand");
+
+ unsigned Reg = MO.getReg();
+ // Physical registers don't need to be constrained.
+ if (TRI.isPhysicalRegister(Reg))
+ continue;
+
+ // Register operands with a value of 0 (e.g. predicate operands) don't need
+ // to be constrained.
+ if (Reg == 0)
+ continue;
+
+ // If the operand is a vreg, we should constrain its regclass, and only
+ // insert COPYs if that's impossible.
+ // constrainOperandRegClass does that for us.
+ MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(),
+ MO, OpI));
+
+ // Tie uses to defs as indicated in MCInstrDesc if this hasn't already been
+ // done.
+ if (MO.isUse()) {
+ int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO);
+ if (DefIdx != -1 && !I.isRegTiedToUseOperand(DefIdx))
+ I.tieOperands(DefIdx, OpI);
+ }
+ }
+ return true;
+}
+
bool llvm::isTriviallyDead(const MachineInstr &MI,
const MachineRegisterInfo &MRI) {
// If we can move an instruction, we can remove it. Otherwise, it has
@@ -101,7 +177,7 @@ void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
MI.getDebugLoc(), MI.getParent());
R << Msg;
// Printing MI is expensive; only do it if expensive remarks are enabled.
- if (MORE.allowExtraAnalysis(PassName))
+ if (TPC.isGlobalISelAbortEnabled() || MORE.allowExtraAnalysis(PassName))
R << ": " << ore::MNV("Inst", MI);
reportGISelFailure(MF, TPC, MORE, R);
}
@@ -145,3 +221,20 @@ llvm::MachineInstr *llvm::getOpcodeDef(unsigned Opcode, unsigned Reg,
}
return DefMI->getOpcode() == Opcode ? DefMI : nullptr;
}
+
+APFloat llvm::getAPFloatFromSize(double Val, unsigned Size) {
+ if (Size == 32)
+ return APFloat(float(Val));
+ if (Size == 64)
+ return APFloat(Val);
+ if (Size != 16)
+ llvm_unreachable("Unsupported FPConstant size");
+ bool Ignored;
+ APFloat APF(Val);
+ APF.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored);
+ return APF;
+}
+
+void llvm::getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU) {
+ AU.addPreserved<StackProtector>();
+}
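
Utils.cpp gains two small helpers used elsewhere in this patch: getAPFloatFromSize, which backs the new buildFConstant(unsigned, double), and getSelectionDAGFallbackAnalysisUsage. A quick sketch of the first one in isolation, assuming (as upstream does) that it is declared in llvm/CodeGen/GlobalISel/Utils.h:

#include "llvm/ADT/APFloat.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
using namespace llvm;

static APFloat toHalfImm(double V) {
  // Sizes 32 and 64 map straight to float/double; 16 converts to IEEE half
  // with round-to-nearest-ties-to-even; any other size is unreachable.
  return getAPFloatFromSize(V, 16);
}
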
diff --git a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
index 3888226fa059..ca56f4e0c4f1 100644
--- a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -70,7 +70,6 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -89,6 +88,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -159,13 +159,13 @@ namespace {
bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Module &M, bool isConst, unsigned AddrSpace) const;
- /// \brief Merge everything in \p Globals for which the corresponding bit
+ /// Merge everything in \p Globals for which the corresponding bit
/// in \p GlobalSet is set.
bool doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
const BitVector &GlobalSet, Module &M, bool isConst,
unsigned AddrSpace) const;
- /// \brief Check if the given variable has been identified as must keep
+ /// Check if the given variable has been identified as must keep
/// \pre setMustKeepGlobalVariables must have been called on the Module that
/// contains GV
bool isMustKeepGlobalVariable(const GlobalVariable *GV) const {
@@ -177,7 +177,7 @@ namespace {
void setMustKeepGlobalVariables(Module &M);
/// Collect every variables marked as "used"
- void collectUsedGlobalVariables(Module &M);
+ void collectUsedGlobalVariables(Module &M, StringRef Name);
/// Keep track of the GlobalVariable that must not be merged away
SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables;
@@ -242,7 +242,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
// code (currently, a Function) to the set of globals seen so far that are
// used together in that unit (GlobalUsesByFunction).
//
- // When we look at the Nth global, we now that any new set is either:
+ // When we look at the Nth global, we know that any new set is either:
// - the singleton set {N}, containing this global only, or
// - the union of {N} and a previously-discovered set, containing some
// combination of the previous N-1 globals.
@@ -440,28 +440,44 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
assert(Globals.size() > 1);
Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ Type *Int8Ty = Type::getInt8Ty(M.getContext());
auto &DL = M.getDataLayout();
- DEBUG(dbgs() << " Trying to merge set, starts with #"
- << GlobalSet.find_first() << "\n");
+ LLVM_DEBUG(dbgs() << " Trying to merge set, starts with #"
+ << GlobalSet.find_first() << "\n");
+ bool Changed = false;
ssize_t i = GlobalSet.find_first();
while (i != -1) {
ssize_t j = 0;
uint64_t MergedSize = 0;
std::vector<Type*> Tys;
std::vector<Constant*> Inits;
+ std::vector<unsigned> StructIdxs;
bool HasExternal = false;
StringRef FirstExternalName;
+ unsigned MaxAlign = 1;
+ unsigned CurIdx = 0;
for (j = i; j != -1; j = GlobalSet.find_next(j)) {
Type *Ty = Globals[j]->getValueType();
+ unsigned Align = DL.getPreferredAlignment(Globals[j]);
+ unsigned Padding = alignTo(MergedSize, Align) - MergedSize;
+ MergedSize += Padding;
MergedSize += DL.getTypeAllocSize(Ty);
if (MergedSize > MaxOffset) {
break;
}
+ if (Padding) {
+ Tys.push_back(ArrayType::get(Int8Ty, Padding));
+ Inits.push_back(ConstantAggregateZero::get(Tys.back()));
+ ++CurIdx;
+ }
Tys.push_back(Ty);
Inits.push_back(Globals[j]->getInitializer());
+ StructIdxs.push_back(CurIdx++);
+
+ MaxAlign = std::max(MaxAlign, Align);
if (Globals[j]->hasExternalLinkage() && !HasExternal) {
HasExternal = true;
@@ -469,12 +485,19 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
}
}
+ // Exit early if there is only one global to merge.
+ if (Tys.size() < 2) {
+ i = j;
+ continue;
+ }
+
// If the merged variables don't have external linkage, we don't need to
// expose the symbol after merging.
GlobalValue::LinkageTypes Linkage = HasExternal
? GlobalValue::ExternalLinkage
: GlobalValue::InternalLinkage;
- StructType *MergedTy = StructType::get(M.getContext(), Tys);
+ // Use a packed struct so we can control alignment.
+ StructType *MergedTy = StructType::get(M.getContext(), Tys, true);
Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
// On Darwin external linkage needs to be preserved, otherwise
@@ -492,19 +515,23 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
M, MergedTy, isConst, MergedLinkage, MergedInit, MergedName, nullptr,
GlobalVariable::NotThreadLocal, AddrSpace);
- const StructLayout *MergedLayout = DL.getStructLayout(MergedTy);
+ MergedGV->setAlignment(MaxAlign);
+ const StructLayout *MergedLayout = DL.getStructLayout(MergedTy);
for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) {
GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage();
std::string Name = Globals[k]->getName();
+ GlobalValue::DLLStorageClassTypes DLLStorage =
+ Globals[k]->getDLLStorageClass();
// Copy metadata while adjusting any debug info metadata by the original
// global's offset within the merged global.
- MergedGV->copyMetadata(Globals[k], MergedLayout->getElementOffset(idx));
+ MergedGV->copyMetadata(Globals[k],
+ MergedLayout->getElementOffset(StructIdxs[idx]));
Constant *Idx[2] = {
- ConstantInt::get(Int32Ty, 0),
- ConstantInt::get(Int32Ty, idx),
+ ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, StructIdxs[idx]),
};
Constant *GEP =
ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, Idx);
@@ -517,20 +544,23 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
// It's not safe on Mach-O as the alias (and thus the portion of the
// MergedGlobals variable) may be dead stripped at link time.
if (Linkage != GlobalValue::InternalLinkage || !IsMachO) {
- GlobalAlias::create(Tys[idx], AddrSpace, Linkage, Name, GEP, &M);
+ GlobalAlias *GA = GlobalAlias::create(Tys[StructIdxs[idx]], AddrSpace,
+ Linkage, Name, GEP, &M);
+ GA->setDLLStorageClass(DLLStorage);
}
NumMerged++;
}
+ Changed = true;
i = j;
}
- return true;
+ return Changed;
}
-void GlobalMerge::collectUsedGlobalVariables(Module &M) {
+void GlobalMerge::collectUsedGlobalVariables(Module &M, StringRef Name) {
// Extract global variables from llvm.used array
- const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
+ const GlobalVariable *GV = M.getGlobalVariable(Name);
if (!GV || !GV->hasInitializer()) return;
// Should be an array of 'i8*'.
@@ -543,7 +573,8 @@ void GlobalMerge::collectUsedGlobalVariables(Module &M) {
}
void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
- collectUsedGlobalVariables(M);
+ collectUsedGlobalVariables(M, "llvm.used");
+ collectUsedGlobalVariables(M, "llvm.compiler.used");
for (Function &F : M) {
for (BasicBlock &BB : F) {
@@ -577,8 +608,7 @@ bool GlobalMerge::doInitialization(Module &M) {
for (auto &GV : M.globals()) {
// Merge is safe for "normal" internal or external globals only
if (GV.isDeclaration() || GV.isThreadLocal() ||
- GV.hasSection() || GV.hasImplicitSection() ||
- GV.hasDLLExportStorageClass())
+ GV.hasSection() || GV.hasImplicitSection())
continue;
// It's not safe to merge globals that may be preempted
@@ -594,12 +624,6 @@ bool GlobalMerge::doInitialization(Module &M) {
unsigned AddressSpace = PT->getAddressSpace();
- // Ignore fancy-aligned globals for now.
- unsigned Alignment = DL.getPreferredAlignment(&GV);
- Type *Ty = GV.getValueType();
- if (Alignment > DL.getABITypeAlignment(Ty))
- continue;
-
// Ignore all 'special' globals.
if (GV.getName().startswith("llvm.") ||
GV.getName().startswith(".llvm."))
@@ -609,6 +633,7 @@ bool GlobalMerge::doInitialization(Module &M) {
if (isMustKeepGlobalVariable(&GV))
continue;
+ Type *Ty = GV.getValueType();
if (DL.getTypeAllocSize(Ty) < MaxOffset) {
if (TM &&
TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal())
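
As a rough illustration of the padding arithmetic the GlobalMerge hunks above introduce, here is a stand-alone sketch: with a packed struct, an explicit i8-array pad is inserted so each merged global keeps its preferred alignment, and the merged global itself takes the maximum alignment seen. The sizes and alignments below are made up for the example, and alignTo is re-implemented locally rather than taken from llvm/Support.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  struct Global { uint64_t Size, Align; };
  std::vector<Global> Globals = {{1, 1}, {4, 4}, {2, 2}};

  uint64_t MergedSize = 0, MaxAlign = 1;
  for (const Global &G : Globals) {
    // Pad up to this global's preferred alignment before placing it.
    uint64_t Padding = alignTo(MergedSize, G.Align) - MergedSize;
    MergedSize += Padding;
    std::printf("offset %llu (padding %llu)\n",
                (unsigned long long)MergedSize, (unsigned long long)Padding);
    MergedSize += G.Size;
    MaxAlign = std::max(MaxAlign, G.Align);
  }
  std::printf("total size %llu, alignment %llu\n",
              (unsigned long long)MergedSize, (unsigned long long)MaxAlign);
  return 0;
}
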
diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp
index d8ce90e63a9d..f12d00071b24 100644
--- a/contrib/llvm/lib/CodeGen/IfConversion.cpp
+++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp
@@ -252,7 +252,7 @@ namespace {
BBInfo &TrueBBI, BBInfo &FalseBBI) const;
void AnalyzeBlock(MachineBasicBlock &MBB,
std::vector<std::unique_ptr<IfcvtToken>> &Tokens);
- bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Cond,
+ bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Pred,
bool isTriangle = false, bool RevBranch = false,
bool hasCommonTail = false);
void AnalyzeBlocks(MachineFunction &MF,
@@ -347,7 +347,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
BranchFolder::MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
MRI = &MF.getRegInfo();
- SchedModel.init(ST.getSchedModel(), &ST, TII);
+ SchedModel.init(&ST);
if (!TII) return false;
@@ -361,14 +361,14 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
getAnalysisIfAvailable<MachineModuleInfo>());
}
- DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
- << MF.getName() << "\'");
+ LLVM_DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
+ << MF.getName() << "\'");
if (FnNum < IfCvtFnStart || (IfCvtFnStop != -1 && FnNum > IfCvtFnStop)) {
- DEBUG(dbgs() << " skipped\n");
+ LLVM_DEBUG(dbgs() << " skipped\n");
return false;
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
MF.RenumberBlocks();
BBAnalysis.resize(MF.getNumBlockIDs());
@@ -406,14 +406,14 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
case ICSimpleFalse: {
bool isFalse = Kind == ICSimpleFalse;
if ((isFalse && DisableSimpleF) || (!isFalse && DisableSimple)) break;
- DEBUG(dbgs() << "Ifcvt (Simple"
- << (Kind == ICSimpleFalse ? " false" : "")
- << "): " << printMBBReference(*BBI.BB) << " ("
- << ((Kind == ICSimpleFalse) ? BBI.FalseBB->getNumber()
- : BBI.TrueBB->getNumber())
- << ") ");
+ LLVM_DEBUG(dbgs() << "Ifcvt (Simple"
+ << (Kind == ICSimpleFalse ? " false" : "")
+ << "): " << printMBBReference(*BBI.BB) << " ("
+ << ((Kind == ICSimpleFalse) ? BBI.FalseBB->getNumber()
+ : BBI.TrueBB->getNumber())
+ << ") ");
RetVal = IfConvertSimple(BBI, Kind);
- DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
if (RetVal) {
if (isFalse) ++NumSimpleFalse;
else ++NumSimple;
@@ -430,16 +430,16 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
if (DisableTriangleR && !isFalse && isRev) break;
if (DisableTriangleF && isFalse && !isRev) break;
if (DisableTriangleFR && isFalse && isRev) break;
- DEBUG(dbgs() << "Ifcvt (Triangle");
+ LLVM_DEBUG(dbgs() << "Ifcvt (Triangle");
if (isFalse)
- DEBUG(dbgs() << " false");
+ LLVM_DEBUG(dbgs() << " false");
if (isRev)
- DEBUG(dbgs() << " rev");
- DEBUG(dbgs() << "): " << printMBBReference(*BBI.BB)
- << " (T:" << BBI.TrueBB->getNumber()
- << ",F:" << BBI.FalseBB->getNumber() << ") ");
+ LLVM_DEBUG(dbgs() << " rev");
+ LLVM_DEBUG(dbgs() << "): " << printMBBReference(*BBI.BB)
+ << " (T:" << BBI.TrueBB->getNumber()
+ << ",F:" << BBI.FalseBB->getNumber() << ") ");
RetVal = IfConvertTriangle(BBI, Kind);
- DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
if (RetVal) {
if (isFalse) {
if (isRev) ++NumTriangleFRev;
@@ -453,24 +453,25 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
}
case ICDiamond:
if (DisableDiamond) break;
- DEBUG(dbgs() << "Ifcvt (Diamond): " << printMBBReference(*BBI.BB)
- << " (T:" << BBI.TrueBB->getNumber()
- << ",F:" << BBI.FalseBB->getNumber() << ") ");
+ LLVM_DEBUG(dbgs() << "Ifcvt (Diamond): " << printMBBReference(*BBI.BB)
+ << " (T:" << BBI.TrueBB->getNumber()
+ << ",F:" << BBI.FalseBB->getNumber() << ") ");
RetVal = IfConvertDiamond(BBI, Kind, NumDups, NumDups2,
Token->TClobbersPred,
Token->FClobbersPred);
- DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
if (RetVal) ++NumDiamonds;
break;
case ICForkedDiamond:
if (DisableForkedDiamond) break;
- DEBUG(dbgs() << "Ifcvt (Forked Diamond): " << printMBBReference(*BBI.BB)
- << " (T:" << BBI.TrueBB->getNumber()
- << ",F:" << BBI.FalseBB->getNumber() << ") ");
+ LLVM_DEBUG(dbgs() << "Ifcvt (Forked Diamond): "
+ << printMBBReference(*BBI.BB)
+ << " (T:" << BBI.TrueBB->getNumber()
+ << ",F:" << BBI.FalseBB->getNumber() << ") ");
RetVal = IfConvertForkedDiamond(BBI, Kind, NumDups, NumDups2,
Token->TClobbersPred,
Token->FClobbersPred);
- DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
if (RetVal) ++NumForkedDiamonds;
break;
}
@@ -948,7 +949,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI,
BBI.ExtraCost2 = 0;
BBI.ClobbersPred = false;
for (MachineInstr &MI : make_range(Begin, End)) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
// It's unsafe to duplicate convergent instructions in this context, so set
@@ -1726,14 +1727,14 @@ bool IfConverter::IfConvertDiamondCommon(
for (unsigned i = 0; i < NumDups1; ++DI1) {
if (DI1 == MBB1.end())
break;
- if (!DI1->isDebugValue())
+ if (!DI1->isDebugInstr())
++i;
}
while (NumDups1 != 0) {
++DI2;
if (DI2 == MBB2.end())
break;
- if (!DI2->isDebugValue())
+ if (!DI2->isDebugInstr())
--NumDups1;
}
@@ -1767,7 +1768,7 @@ bool IfConverter::IfConvertDiamondCommon(
assert(DI1 != MBB1.begin());
--DI1;
// skip dbg_value instructions
- if (!DI1->isDebugValue())
+ if (!DI1->isDebugInstr())
++i;
}
MBB1.erase(DI1, MBB1.end());
@@ -1782,7 +1783,7 @@ bool IfConverter::IfConvertDiamondCommon(
// instructions could be found.
while (DI2 != MBB2.begin()) {
MachineBasicBlock::iterator Prev = std::prev(DI2);
- if (!Prev->isBranch() && !Prev->isDebugValue())
+ if (!Prev->isBranch() && !Prev->isDebugInstr())
break;
DI2 = Prev;
}
@@ -1793,7 +1794,7 @@ bool IfConverter::IfConvertDiamondCommon(
assert(DI2 != MBB2.begin());
--DI2;
// skip dbg_value instructions
- if (!DI2->isDebugValue())
+ if (!DI2->isDebugInstr())
--NumDups2;
}
@@ -1809,7 +1810,7 @@ bool IfConverter::IfConvertDiamondCommon(
SmallSet<unsigned, 4> ExtUses;
if (TII->isProfitableToUnpredicate(MBB1, MBB2)) {
for (const MachineInstr &FI : make_range(MBB2.begin(), DI2)) {
- if (FI.isDebugValue())
+ if (FI.isDebugInstr())
continue;
SmallVector<unsigned, 4> Defs;
for (const MachineOperand &MO : FI.operands()) {
@@ -2002,7 +2003,7 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
bool AnyUnpred = false;
bool MaySpec = LaterRedefs != nullptr;
for (MachineInstr &I : make_range(BBI.BB->begin(), E)) {
- if (I.isDebugValue() || TII->isPredicated(I))
+ if (I.isDebugInstr() || TII->isPredicated(I))
continue;
// It may be possible not to predicate an instruction if it's the 'true'
// side of a diamond and the 'false' side may re-define the instruction's
@@ -2058,7 +2059,7 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
ToBBI.ExtraCost += NumCycles-1;
ToBBI.ExtraCost2 += ExtraPredCost;
- if (!TII->isPredicated(I) && !MI->isDebugValue()) {
+ if (!TII->isPredicated(I) && !MI->isDebugInstr()) {
if (!TII->PredicateInstruction(*MI, Cond)) {
#ifndef NDEBUG
dbgs() << "Unable to predicate " << I << "!\n";
diff --git a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
index 308b6d293d3d..0a447bc613b1 100644
--- a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -115,7 +115,7 @@ class ImplicitNullChecks : public MachineFunctionPass {
/// \c canHandle should return true for all instructions in \p
/// Insts.
DependenceResult computeDependence(const MachineInstr *MI,
- ArrayRef<MachineInstr *> Insts);
+ ArrayRef<MachineInstr *> Block);
/// Represents one null check that can be made implicit.
class NullCheck {
@@ -134,7 +134,7 @@ class ImplicitNullChecks : public MachineFunctionPass {
// The block branched to if the pointer is null.
MachineBasicBlock *NullSucc;
- // If this is non-null, then MemOperation has a dependency on on this
+ // If this is non-null, then MemOperation has a dependency on this
// instruction; and it needs to be hoisted to execute before MemOperation.
MachineInstr *OnlyDependency;
@@ -198,7 +198,7 @@ class ImplicitNullChecks : public MachineFunctionPass {
SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
ArrayRef<MachineInstr *> PrevInsts);
- /// Return true if \p FaultingMI can be hoisted from after the the
+ /// Return true if \p FaultingMI can be hoisted from after the
/// instructions in \p InstsSeenSoFar to before them. Set \p Dependence to a
/// non-null value if we also need to (and legally can) hoist a depedency.
bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg,
@@ -496,6 +496,32 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
if (NotNullSucc->pred_size() != 1)
return false;
+ // To prevent the invalid transformation of the following code:
+ //
+ // mov %rax, %rcx
+ // test %rax, %rax
+ // %rax = ...
+ // je throw_npe
+ // mov(%rcx), %r9
+ // mov(%rax), %r10
+ //
+ // into:
+ //
+ // mov %rax, %rcx
+ // %rax = ....
+ // faulting_load_op("movl (%rax), %r10", throw_npe)
+ // mov(%rcx), %r9
+ //
+ // we must ensure that there are no instructions between the 'test' and
+ // conditional jump that modify %rax.
+ const unsigned PointerReg = MBP.LHS.getReg();
+
+ assert(MBP.ConditionDef->getParent() == &MBB && "Should be in basic block");
+
+ for (auto I = MBB.rbegin(); MBP.ConditionDef != &*I; ++I)
+ if (I->modifiesRegister(PointerReg, TRI))
+ return false;
+
// Starting with a code fragment like:
//
// test %rax, %rax
@@ -550,8 +576,6 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
// ptr could be some non-null invalid reference that never gets loaded from
// because some_cond is always true.
- const unsigned PointerReg = MBP.LHS.getReg();
-
SmallVector<MachineInstr *, 8> InstsSeenSoFar;
for (auto &MI : *NotNullSucc) {
@@ -596,9 +620,8 @@ MachineInstr *ImplicitNullChecks::insertFaultingInstr(
unsigned DefReg = NoRegister;
if (NumDefs != 0) {
- DefReg = MI->defs().begin()->getReg();
- assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 &&
- "expected exactly one def!");
+ DefReg = MI->getOperand(0).getReg();
+ assert(NumDefs == 1 && "expected exactly one def!");
}
FaultMaps::FaultKind FK;
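
As a rough, stand-alone sketch of the new safety check in analyzeBlockForNullChecks above, the helper below walks a block in reverse from its last instruction back to the compare that defines the branch condition, and rejects the transformation if anything in between modifies the pointer register; Inst and the register names are simplified stand-ins for the MachineInstr API.

#include <cstdio>
#include <string>
#include <vector>

struct Inst {
  std::string Name;
  std::vector<std::string> ModifiedRegs;
  bool modifiesRegister(const std::string &Reg) const {
    for (const std::string &R : ModifiedRegs)
      if (R == Reg)
        return true;
    return false;
  }
};

static bool pointerUnmodifiedAfterCompare(const std::vector<Inst> &Block,
                                          const Inst *ConditionDef,
                                          const std::string &PointerReg) {
  // Scan backwards from the end of the block until the compare is reached.
  for (auto I = Block.rbegin(); &*I != ConditionDef; ++I)
    if (I->modifiesRegister(PointerReg))
      return false;
  return true;
}

int main() {
  std::vector<Inst> Block = {
      {"mov %rax, %rcx", {"rcx"}},
      {"test %rax, %rax", {"eflags"}}, // the condition-defining compare
      {"mov $1, %rax", {"rax"}},       // clobbers the pointer afterwards
      {"je throw_npe", {}},
  };
  bool OK = pointerUnmodifiedAfterCompare(Block, &Block[1], "rax");
  std::printf("null check can be made implicit: %s\n", OK ? "yes" : "no");
  return 0;
}
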
diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
index 86ce4b7a9464..007e9283d833 100644
--- a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -46,6 +46,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
@@ -335,7 +336,7 @@ void InlineSpiller::collectRegsToSpill() {
if (isRegToSpill(SnipReg))
continue;
RegsToSpill.push_back(SnipReg);
- DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n');
+ LLVM_DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n');
++NumSnippets;
}
}
@@ -387,8 +388,8 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
LiveInterval &OrigLI = LIS.getInterval(Original);
VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
StackInt->MergeValueInAsValue(OrigLI, OrigVNI, StackInt->getValNumInfo(0));
- DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": "
- << *StackInt << '\n');
+ LLVM_DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": "
+ << *StackInt << '\n');
// We are going to spill SrcVNI immediately after its def, so clear out
// any later spills of the same value.
@@ -409,7 +410,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
MRI.getRegClass(SrcReg), &TRI);
--MII; // Point to store instruction.
LIS.InsertMachineInstrInMaps(*MII);
- DEBUG(dbgs() << "\thoisted: " << SrcVNI->def << '\t' << *MII);
+ LLVM_DEBUG(dbgs() << "\thoisted: " << SrcVNI->def << '\t' << *MII);
HSpiller.addToMergeableSpills(*MII, StackSlot, Original);
++NumSpills;
@@ -428,8 +429,8 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
LiveInterval *LI;
std::tie(LI, VNI) = WorkList.pop_back_val();
unsigned Reg = LI->reg;
- DEBUG(dbgs() << "Checking redundant spills for "
- << VNI->id << '@' << VNI->def << " in " << *LI << '\n');
+ LLVM_DEBUG(dbgs() << "Checking redundant spills for " << VNI->id << '@'
+ << VNI->def << " in " << *LI << '\n');
// Regs to spill are taken care of.
if (isRegToSpill(Reg))
@@ -437,7 +438,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
// Add all of VNI's live range to StackInt.
StackInt->MergeValueInAsValue(*LI, VNI, StackInt->getValNumInfo(0));
- DEBUG(dbgs() << "Merged to stack int: " << *StackInt << '\n');
+ LLVM_DEBUG(dbgs() << "Merged to stack int: " << *StackInt << '\n');
// Find all spills and copies of VNI.
for (MachineRegisterInfo::use_instr_nodbg_iterator
@@ -465,7 +466,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
// Erase spills.
int FI;
if (Reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot) {
- DEBUG(dbgs() << "Redundant spill " << Idx << '\t' << MI);
+ LLVM_DEBUG(dbgs() << "Redundant spill " << Idx << '\t' << MI);
// eliminateDeadDefs won't normally remove stores, so switch opcode.
MI.setDesc(TII.get(TargetOpcode::KILL));
DeadDefs.push_back(&MI);
@@ -527,13 +528,13 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex());
if (!ParentVNI) {
- DEBUG(dbgs() << "\tadding <undef> flags: ");
+ LLVM_DEBUG(dbgs() << "\tadding <undef> flags: ");
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg)
MO.setIsUndef();
}
- DEBUG(dbgs() << UseIdx << '\t' << MI);
+ LLVM_DEBUG(dbgs() << UseIdx << '\t' << MI);
return true;
}
@@ -547,7 +548,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
if (!Edit->canRematerializeAt(RM, OrigVNI, UseIdx, false)) {
markValueUsed(&VirtReg, ParentVNI);
- DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
+ LLVM_DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
return false;
}
@@ -555,7 +556,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
// same register for uses and defs.
if (RI.Tied) {
markValueUsed(&VirtReg, ParentVNI);
- DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << MI);
+ LLVM_DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << MI);
return false;
}
@@ -581,8 +582,8 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
NewMI->setDebugLoc(MI.getDebugLoc());
(void)DefIdx;
- DEBUG(dbgs() << "\tremat: " << DefIdx << '\t'
- << *LIS.getInstructionFromIndex(DefIdx));
+ LLVM_DEBUG(dbgs() << "\tremat: " << DefIdx << '\t'
+ << *LIS.getInstructionFromIndex(DefIdx));
// Replace operands
for (const auto &OpPair : Ops) {
@@ -592,7 +593,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
MO.setIsKill();
}
}
- DEBUG(dbgs() << "\t " << UseIdx << '\t' << MI << '\n');
+ LLVM_DEBUG(dbgs() << "\t " << UseIdx << '\t' << MI << '\n');
++NumRemats;
return true;
@@ -619,6 +620,9 @@ void InlineSpiller::reMaterializeAll() {
if (MI.isDebugValue())
continue;
+ assert(!MI.isDebugInstr() && "Did not expect to find a use in debug "
+ "instruction that isn't a DBG_VALUE");
+
anyRemat |= reMaterializeFor(LI, MI);
}
}
@@ -637,7 +641,7 @@ void InlineSpiller::reMaterializeAll() {
MI->addRegisterDead(Reg, &TRI);
if (!MI->allDefsAreDead())
continue;
- DEBUG(dbgs() << "All defs dead: " << *MI);
+ LLVM_DEBUG(dbgs() << "All defs dead: " << *MI);
DeadDefs.push_back(MI);
}
}
@@ -646,7 +650,7 @@ void InlineSpiller::reMaterializeAll() {
// deleted here.
if (DeadDefs.empty())
return;
- DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
+ LLVM_DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
// LiveRangeEdit::eliminateDeadDef is used to remove dead define instructions
@@ -669,7 +673,8 @@ void InlineSpiller::reMaterializeAll() {
RegsToSpill[ResultPos++] = Reg;
}
RegsToSpill.erase(RegsToSpill.begin() + ResultPos, RegsToSpill.end());
- DEBUG(dbgs() << RegsToSpill.size() << " registers to spill after remat.\n");
+ LLVM_DEBUG(dbgs() << RegsToSpill.size()
+ << " registers to spill after remat.\n");
}
//===----------------------------------------------------------------------===//
@@ -691,7 +696,7 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
if (!IsLoad)
HSpiller.rmFromMergeableSpills(*MI, StackSlot);
- DEBUG(dbgs() << "Coalescing stack access: " << *MI);
+ LLVM_DEBUG(dbgs() << "Coalescing stack access: " << *MI);
LIS.RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
@@ -848,8 +853,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
FoldMI->RemoveOperand(i - 1);
}
- DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS,
- "folded"));
+ LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS,
+ "folded"));
if (!WasCopy)
++NumFolded;
@@ -872,8 +877,8 @@ void InlineSpiller::insertReload(unsigned NewVReg,
LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
- DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MI, LIS, "reload",
- NewVReg));
+ LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MI, LIS, "reload",
+ NewVReg));
++NumReloads;
}
@@ -912,8 +917,8 @@ void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
LIS.InsertMachineInstrRangeInMaps(std::next(MI), MIS.end());
- DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
- "spill"));
+ LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
+ "spill"));
++NumSpills;
if (IsRealSpill)
HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original);
@@ -921,7 +926,7 @@ void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
/// spillAroundUses - insert spill code around each use of Reg.
void InlineSpiller::spillAroundUses(unsigned Reg) {
- DEBUG(dbgs() << "spillAroundUses " << printReg(Reg) << '\n');
+ LLVM_DEBUG(dbgs() << "spillAroundUses " << printReg(Reg) << '\n');
LiveInterval &OldLI = LIS.getInterval(Reg);
// Iterate over instructions using Reg.
@@ -934,12 +939,15 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
if (MI->isDebugValue()) {
// Modify DBG_VALUE now that the value is in a spill slot.
MachineBasicBlock *MBB = MI->getParent();
- DEBUG(dbgs() << "Modifying debug info due to spill:\t" << *MI);
+ LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:\t" << *MI);
buildDbgValueForSpill(*MBB, MI, *MI, StackSlot);
MBB->erase(MI);
continue;
}
+ assert(!MI->isDebugInstr() && "Did not expect to find a use in debug "
+ "instruction that isn't a DBG_VALUE");
+
// Ignore copies to/from snippets. We'll delete them.
if (SnippetCopies.count(MI))
continue;
@@ -965,7 +973,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
if (SibReg && isSibling(SibReg)) {
// This may actually be a copy between snippets.
if (isRegToSpill(SibReg)) {
- DEBUG(dbgs() << "Found new snippet copy: " << *MI);
+ LLVM_DEBUG(dbgs() << "Found new snippet copy: " << *MI);
SnippetCopies.insert(MI);
continue;
}
@@ -1008,7 +1016,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
hasLiveDef = true;
}
}
- DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n');
+ LLVM_DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n');
// FIXME: Use a second vreg if instruction has no tied ops.
if (RI.Writes)
@@ -1034,7 +1042,7 @@ void InlineSpiller::spillAll() {
for (unsigned Reg : RegsToSpill)
StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg),
StackInt->getValNumInfo(0));
- DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
+ LLVM_DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
// Spill around uses of all RegsToSpill.
for (unsigned Reg : RegsToSpill)
@@ -1042,7 +1050,7 @@ void InlineSpiller::spillAll() {
// Hoisted spills may cause dead code.
if (!DeadDefs.empty()) {
- DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
+ LLVM_DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
}
@@ -1074,10 +1082,10 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
StackSlot = VRM.getStackSlot(Original);
StackInt = nullptr;
- DEBUG(dbgs() << "Inline spilling "
- << TRI.getRegClassName(MRI.getRegClass(edit.getReg()))
- << ':' << edit.getParent()
- << "\nFrom original " << printReg(Original) << '\n');
+ LLVM_DEBUG(dbgs() << "Inline spilling "
+ << TRI.getRegClassName(MRI.getRegClass(edit.getReg()))
+ << ':' << edit.getParent() << "\nFrom original "
+ << printReg(Original) << '\n');
assert(edit.getParent().isSpillable() &&
"Attempting to spill already spilled value.");
assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
@@ -1261,11 +1269,11 @@ void HoistSpillHelper::getVisitOrders(
"Orders have different size with WorkSet");
#ifndef NDEBUG
- DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
+ LLVM_DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
for (; RIt != Orders.rend(); RIt++)
- DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
+ LLVM_DEBUG(dbgs() << "\n");
#endif
}
@@ -1374,7 +1382,7 @@ void HoistSpillHelper::runHoistSpills(
// Current Block is the BB containing the new hoisted spill. Add it to
// SpillsToKeep. LiveReg is the source of the new spill.
SpillsToKeep[*RIt] = LiveReg;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "spills in BB: ";
for (const auto Rspill : SpillsInSubTree)
dbgs() << Rspill->getBlock()->getNumber() << " ";
@@ -1430,7 +1438,7 @@ void HoistSpillHelper::hoistAllSpills() {
if (Ent.second.empty())
continue;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
<< "Equal spills in BB: ";
for (const auto spill : EqValSpills)
@@ -1445,7 +1453,7 @@ void HoistSpillHelper::hoistAllSpills() {
runHoistSpills(OrigLI, *OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Finally inserted spills in BB: ";
for (const auto Ispill : SpillsToIns)
dbgs() << Ispill.first->getNumber() << " ";
diff --git a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp
index 72227cc7bba9..82f6e8d8e234 100644
--- a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp
+++ b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp
@@ -48,8 +48,8 @@ void InterferenceCache::reinitPhysRegEntries() {
if (PhysRegEntriesCount == TRI->getNumRegs()) return;
free(PhysRegEntries);
PhysRegEntriesCount = TRI->getNumRegs();
- PhysRegEntries = (unsigned char*)
- calloc(PhysRegEntriesCount, sizeof(unsigned char));
+ PhysRegEntries = static_cast<unsigned char*>(
+ safe_calloc(PhysRegEntriesCount, sizeof(unsigned char)));
}
void InterferenceCache::init(MachineFunction *mf,
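
As a rough sketch of what an overflow-checked allocation like the safe_calloc call above guards against, the wrapper below is a stand-in (not LLVM's implementation): it relies on calloc reporting failure and aborts instead of calling LLVM's fatal-error machinery.

#include <cstdio>
#include <cstdlib>

static void *checkedCalloc(size_t Count, size_t Size) {
  void *P = std::calloc(Count, Size);
  if (P == nullptr && Count != 0 && Size != 0) {
    // Most calloc implementations reject a Count * Size overflow, so a null
    // result here means either overflow or out-of-memory.
    std::fprintf(stderr, "allocation of %zu * %zu bytes failed\n", Count, Size);
    std::abort();
  }
  return P;
}

int main() {
  unsigned char *Entries = static_cast<unsigned char *>(
      checkedCalloc(128, sizeof(unsigned char)));
  std::printf("first entry is %u\n", Entries[0]); // zero-initialized
  std::free(Entries);
  return 0;
}
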
diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 9c906d309639..fd2ff162630a 100644
--- a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -104,15 +104,15 @@ private:
/// The maximum supported interleave factor.
unsigned MaxFactor;
- /// \brief Transform an interleaved load into target specific intrinsics.
+ /// Transform an interleaved load into target specific intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
SmallVector<Instruction *, 32> &DeadInsts);
- /// \brief Transform an interleaved store into target specific intrinsics.
+ /// Transform an interleaved store into target specific intrinsics.
bool lowerInterleavedStore(StoreInst *SI,
SmallVector<Instruction *, 32> &DeadInsts);
- /// \brief Returns true if the uses of an interleaved load by the
+ /// Returns true if the uses of an interleaved load by the
/// extractelement instructions in \p Extracts can be replaced by uses of the
/// shufflevector instructions in \p Shuffles instead. If so, the necessary
/// replacements are also performed.
@@ -136,7 +136,7 @@ FunctionPass *llvm::createInterleavedAccessPass() {
return new InterleavedAccess();
}
-/// \brief Check if the mask is a DE-interleave mask of the given factor
+/// Check if the mask is a DE-interleave mask of the given factor
/// \p Factor like:
/// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
@@ -158,7 +158,7 @@ static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
return false;
}
-/// \brief Check if the mask is a DE-interleave mask for an interleaved load.
+/// Check if the mask is a DE-interleave mask for an interleaved load.
///
/// E.g. DE-interleave masks (Factor = 2) could be:
/// <0, 2, 4, 6> (mask of index 0 to extract even elements)
@@ -176,7 +176,7 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
return false;
}
-/// \brief Check if the mask can be used in an interleaved store.
+/// Check if the mask can be used in an interleaved store.
//
/// It checks for a more general pattern than the RE-interleave mask.
/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
@@ -332,7 +332,7 @@ bool InterleavedAccess::lowerInterleavedLoad(
if (!tryReplaceExtracts(Extracts, Shuffles))
return false;
- DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
// Try to create target specific intrinsics to replace the load and shuffles.
if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
@@ -424,7 +424,7 @@ bool InterleavedAccess::lowerInterleavedStore(
if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
return false;
- DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
// Try to create target specific intrinsics to replace the store and shuffle.
if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
@@ -441,7 +441,7 @@ bool InterleavedAccess::runOnFunction(Function &F) {
if (!TPC || !LowerInterleavedAccesses)
return false;
- DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &TM = TPC->getTM<TargetMachine>();
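
As a rough, stand-alone sketch of the de-interleave mask shape documented above (<Index, Index+Factor, ..., Index+(NumElts-1)*Factor>), the helper below checks a mask against that pattern for every candidate starting index, treating negative (undef) entries as wildcards; it is illustrative only, not the LLVM implementation.

#include <cstdio>
#include <vector>

static bool isDeInterleaveMaskOfFactor(const std::vector<int> &Mask,
                                       unsigned Factor, unsigned &Index) {
  // Try each possible starting index 0 .. Factor-1.
  for (Index = 0; Index < Factor; ++Index) {
    unsigned I = 0;
    for (; I < Mask.size(); ++I)
      if (Mask[I] >= 0 && static_cast<unsigned>(Mask[I]) != Index + I * Factor)
        break;
    if (I == Mask.size())
      return true;
  }
  return false;
}

int main() {
  unsigned Index;
  std::vector<int> Even = {0, 2, 4, 6}; // factor 2, index 0
  std::vector<int> Odd  = {1, 3, 5, 7}; // factor 2, index 1
  std::printf("even: %d (index %u)\n",
              isDeInterleaveMaskOfFactor(Even, 2, Index), Index);
  std::printf("odd:  %d (index %u)\n",
              isDeInterleaveMaskOfFactor(Odd, 2, Index), Index);
  return 0;
}
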
diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 12777d5ed110..eb4099964242 100644
--- a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -456,6 +456,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
}
case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_label:
break; // Simply strip out debugging intrinsics
case Intrinsic::eh_typeid_for:
diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
index 4c6e21ab315a..2cd389ce2c11 100644
--- a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -16,7 +16,6 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/MCAsmBackend.h"
@@ -24,16 +23,22 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+static cl::opt<bool> EnableTrapUnreachable("trap-unreachable",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable generating trap for unreachable"));
+
void LLVMTargetMachine::initAsmInfo() {
MRI = TheTarget.createMCRegInfo(getTargetTriple().str());
MII = TheTarget.createMCInstrInfo();
@@ -79,6 +84,9 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T,
this->RM = RM;
this->CMModel = CM;
this->OptLevel = OL;
+
+ if (EnableTrapUnreachable)
+ this->Options.TrapUnreachable = true;
}
TargetTransformInfo
@@ -113,8 +121,10 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
}
bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
- raw_pwrite_stream &Out, CodeGenFileType FileType,
- MCContext &Context) {
+ raw_pwrite_stream &Out,
+ raw_pwrite_stream *DwoOut,
+ CodeGenFileType FileType,
+ MCContext &Context) {
if (Options.MCOptions.MCSaveTempLabels)
Context.setAllowTemporaryLabels(false);
@@ -131,17 +141,17 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
getTargetTriple(), MAI.getAssemblerDialect(), MAI, MII, MRI);
// Create a code emitter if asked to show the encoding.
- MCCodeEmitter *MCE = nullptr;
+ std::unique_ptr<MCCodeEmitter> MCE;
if (Options.MCOptions.ShowMCEncoding)
- MCE = getTarget().createMCCodeEmitter(MII, MRI, Context);
+ MCE.reset(getTarget().createMCCodeEmitter(MII, MRI, Context));
- MCAsmBackend *MAB =
- getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions);
+ std::unique_ptr<MCAsmBackend> MAB(
+ getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions));
auto FOut = llvm::make_unique<formatted_raw_ostream>(Out);
MCStreamer *S = getTarget().createAsmStreamer(
Context, std::move(FOut), Options.MCOptions.AsmVerbose,
- Options.MCOptions.MCUseDwarfDirectory, InstPrinter, MCE, MAB,
- Options.MCOptions.ShowMCInst);
+ Options.MCOptions.MCUseDwarfDirectory, InstPrinter, std::move(MCE),
+ std::move(MAB), Options.MCOptions.ShowMCInst);
AsmStreamer.reset(S);
break;
}
@@ -159,7 +169,9 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
Triple T(getTargetTriple().str());
AsmStreamer.reset(getTarget().createMCObjectStreamer(
- T, Context, std::unique_ptr<MCAsmBackend>(MAB), Out,
+ T, Context, std::unique_ptr<MCAsmBackend>(MAB),
+ DwoOut ? MAB->createDwoObjectWriter(Out, *DwoOut)
+ : MAB->createObjectWriter(Out),
std::unique_ptr<MCCodeEmitter>(MCE), STI, Options.MCOptions.MCRelaxAll,
Options.MCOptions.MCIncrementalLinkerCompatible,
/*DWARFMustBeAtTheEnd*/ true));
@@ -184,6 +196,7 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
raw_pwrite_stream &Out,
+ raw_pwrite_stream *DwoOut,
CodeGenFileType FileType,
bool DisableVerify,
MachineModuleInfo *MMI) {
@@ -194,7 +207,8 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
if (!Context)
return true;
- if (WillCompleteCodeGenPipeline && addAsmPrinter(PM, Out, FileType, *Context))
+ if (WillCompleteCodeGenPipeline &&
+ addAsmPrinter(PM, Out, DwoOut, FileType, *Context))
return true;
PM.add(createFreeMachineFunctionPass());
@@ -234,7 +248,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
const Triple &T = getTargetTriple();
std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer(
- T, *Ctx, std::unique_ptr<MCAsmBackend>(MAB), Out,
+ T, *Ctx, std::unique_ptr<MCAsmBackend>(MAB), MAB->createObjectWriter(Out),
std::unique_ptr<MCCodeEmitter>(MCE), STI, Options.MCOptions.MCRelaxAll,
Options.MCOptions.MCIncrementalLinkerCompatible,
/*DWARFMustBeAtTheEnd*/ true));
diff --git a/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp b/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
index 8ffd51a550fc..5dbce841cfd5 100644
--- a/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
+++ b/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -139,3 +140,14 @@ void LatencyPriorityQueue::remove(SUnit *SU) {
std::swap(*I, Queue.back());
Queue.pop_back();
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LatencyPriorityQueue::dump(ScheduleDAG *DAG) const {
+ dbgs() << "Latency Priority Queue\n";
+ dbgs() << " Number of Queue Entries: " << Queue.size() << "\n";
+ for (auto const &SU : Queue) {
+ dbgs() << " ";
+ SU->dump(DAG);
+ }
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
index 996d40ca6e1e..5b52cc66a297 100644
--- a/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
@@ -57,23 +57,23 @@ MachineBlockFrequencyInfo &
LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const {
auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
if (MBFI) {
- DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n");
+ LLVM_DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n");
return *MBFI;
}
auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>();
auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
- DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n");
- DEBUG(if (MLI) dbgs() << "LoopInfo is available\n");
+ LLVM_DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n");
+ LLVM_DEBUG(if (MLI) dbgs() << "LoopInfo is available\n");
if (!MLI) {
- DEBUG(dbgs() << "Building LoopInfo on the fly\n");
+ LLVM_DEBUG(dbgs() << "Building LoopInfo on the fly\n");
// First create a dominator tree.
- DEBUG(if (MDT) dbgs() << "DominatorTree is available\n");
+ LLVM_DEBUG(if (MDT) dbgs() << "DominatorTree is available\n");
if (!MDT) {
- DEBUG(dbgs() << "Building DominatorTree on the fly\n");
+ LLVM_DEBUG(dbgs() << "Building DominatorTree on the fly\n");
OwnedMDT = make_unique<MachineDominatorTree>();
OwnedMDT->getBase().recalculate(*MF);
MDT = OwnedMDT.get();
diff --git a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
index 8c54751ee833..d06821bdfcce 100644
--- a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
+++ b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
index 19ec281079cb..fea83e92de8f 100644
--- a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -40,6 +40,8 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -64,7 +66,7 @@ using namespace llvm;
STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
-// \brief If @MI is a DBG_VALUE with debug value described by a defined
+// If @MI is a DBG_VALUE with debug value described by a defined
// register, returns the number of this register. In the other case, returns 0.
static unsigned isDbgValueDescribedByReg(const MachineInstr &MI) {
assert(MI.isDebugValue() && "expected a DBG_VALUE");
@@ -81,6 +83,7 @@ private:
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
const TargetFrameLowering *TFI;
+ BitVector CalleeSavedRegs;
LexicalScopes LS;
/// Keeps track of lexical scopes associated with a user value's source
@@ -178,11 +181,11 @@ private:
using VarLocMap = UniqueVector<VarLoc>;
using VarLocSet = SparseBitVector<>;
using VarLocInMBB = SmallDenseMap<const MachineBasicBlock *, VarLocSet>;
- struct SpillDebugPair {
- MachineInstr *SpillInst;
+ struct TransferDebugPair {
+ MachineInstr *TransferInst;
MachineInstr *DebugInst;
};
- using SpillMap = SmallVector<SpillDebugPair, 4>;
+ using TransferMap = SmallVector<TransferDebugPair, 4>;
/// This holds the working set of currently open ranges. For fast
/// access, this is done both as a set of VarLocIDs, and a map of
@@ -235,18 +238,23 @@ private:
bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF,
unsigned &Reg);
int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg);
+ void insertTransferDebugPair(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ TransferMap &Transfers, VarLocMap &VarLocIDs,
+ unsigned OldVarID, unsigned NewReg = 0);
void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs);
void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs, SpillMap &Spills);
+ VarLocMap &VarLocIDs, TransferMap &Transfers);
+ void transferRegisterCopy(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, TransferMap &Transfers);
void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
const VarLocMap &VarLocIDs);
bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
- bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills,
- bool transferSpills);
+ bool process(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
+ TransferMap &Transfers, bool transferChanges);
bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
@@ -369,6 +377,54 @@ void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
}
}
+/// Create a new TransferDebugPair and insert it in \p Transfers. The VarLoc
+/// with \p OldVarID should be deleted from \p OpenRanges and replaced with a
+/// new VarLoc. If \p NewReg differs from the default zero value, the new
+/// location is the register location created by the copy-like instruction;
+/// otherwise it is the variable's location on the stack.
+void LiveDebugValues::insertTransferDebugPair(
+ MachineInstr &MI, OpenRangesSet &OpenRanges, TransferMap &Transfers,
+ VarLocMap &VarLocIDs, unsigned OldVarID, unsigned NewReg) {
+ const MachineInstr *DMI = &VarLocIDs[OldVarID].MI;
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineInstr *NewDMI;
+ if (NewReg) {
+ // Create a DBG_VALUE instruction to describe the Var in its new
+ // register location.
+ NewDMI = BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(),
+ DMI->isIndirectDebugValue(), NewReg,
+ DMI->getDebugVariable(), DMI->getDebugExpression());
+ if (DMI->isIndirectDebugValue())
+ NewDMI->getOperand(1).setImm(DMI->getOperand(1).getImm());
+ LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register copy: ";
+ NewDMI->print(dbgs(), false, false, false, TII));
+ } else {
+ // Create a DBG_VALUE instruction to describe the Var in its spilled
+ // location.
+ unsigned SpillBase;
+ int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase);
+ auto *SpillExpr = DIExpression::prepend(DMI->getDebugExpression(),
+ DIExpression::NoDeref, SpillOffset);
+ NewDMI = BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase,
+ DMI->getDebugVariable(), SpillExpr);
+ LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
+ NewDMI->print(dbgs(), false, false, false, TII));
+ }
+
+ // The newly created DBG_VALUE instruction NewDMI must be inserted after
+ // MI. Keep track of the pairing.
+ TransferDebugPair MIP = {&MI, NewDMI};
+ Transfers.push_back(MIP);
+
+ // End all previous ranges of Var.
+ OpenRanges.erase(VarLocIDs[OldVarID].Var);
+
+ // Add the VarLoc to OpenRanges.
+ VarLoc VL(*NewDMI, LS);
+ unsigned LocID = VarLocIDs.insert(VL);
+ OpenRanges.insert(LocID, VL.Var);
+}
+
/// A definition of a register may mark the end of a range.
void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
OpenRangesSet &OpenRanges,
@@ -426,28 +482,51 @@ bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
FrameInfo.isSpillSlotObjectIndex(FI)))
return false;
- // In a spill instruction generated by the InlineSpiller the spilled register
- // has its kill flag set. Return false if we don't find such a register.
- Reg = 0;
+ auto isKilledReg = [&](const MachineOperand MO, unsigned &Reg) {
+ if (!MO.isReg() || !MO.isUse()) {
+ Reg = 0;
+ return false;
+ }
+ Reg = MO.getReg();
+ return MO.isKill();
+ };
+
for (const MachineOperand &MO : MI.operands()) {
- if (MO.isReg() && MO.isUse() && MO.isKill()) {
- Reg = MO.getReg();
- break;
+ // In a spill instruction generated by the InlineSpiller the spilled
+ // register has its kill flag set.
+ if (isKilledReg(MO, Reg))
+ return true;
+ if (Reg != 0) {
+ // Check whether next instruction kills the spilled register.
+ // FIXME: Current solution does not cover search for killed register in
+ // bundles and instructions further down the chain.
+ auto NextI = std::next(MI.getIterator());
+ // Skip next instruction that points to basic block end iterator.
+ if (MI.getParent()->end() == NextI)
+ continue;
+ unsigned RegNext;
+ for (const MachineOperand &MONext : NextI->operands()) {
+ // Return true if we came across the register from the
+ // previous spill instruction that is killed in NextI.
+ if (isKilledReg(MONext, RegNext) && RegNext == Reg)
+ return true;
+ }
}
}
- return Reg != 0;
+ // Return false if we didn't find a spilled register.
+ return false;
}
/// A spilled register may indicate that we have to end the current range of
/// a variable and create a new one for the spill location.
-/// We don't want to insert any instructions in transfer(), so we just create
-/// the DBG_VALUE witout inserting it and keep track of it in @Spills.
+/// We don't want to insert any instructions in process(), so we just create
+/// the DBG_VALUE without inserting it and keep track of it in \p Transfers.
/// It will be inserted into the BB when we're done iterating over the
/// instructions.
void LiveDebugValues::transferSpillInst(MachineInstr &MI,
OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs,
- SpillMap &Spills) {
+ TransferMap &Transfers) {
unsigned Reg;
MachineFunction *MF = MI.getMF();
if (!isSpillInstruction(MI, MF, Reg))
@@ -456,35 +535,49 @@ void LiveDebugValues::transferSpillInst(MachineInstr &MI,
// Check if the register is the location of a debug value.
for (unsigned ID : OpenRanges.getVarLocs()) {
if (VarLocIDs[ID].isDescribedByReg() == Reg) {
- DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
- << VarLocIDs[ID].Var.getVar()->getName() << ")\n");
-
- // Create a DBG_VALUE instruction to describe the Var in its spilled
- // location, but don't insert it yet to avoid invalidating the
- // iterator in our caller.
- unsigned SpillBase;
- int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase);
- const MachineInstr *DMI = &VarLocIDs[ID].MI;
- auto *SpillExpr = DIExpression::prepend(
- DMI->getDebugExpression(), DIExpression::NoDeref, SpillOffset);
- MachineInstr *SpDMI =
- BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase,
- DMI->getDebugVariable(), SpillExpr);
- DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
- SpDMI->print(dbgs(), false, TII));
-
- // The newly created DBG_VALUE instruction SpDMI must be inserted after
- // MI. Keep track of the pairing.
- SpillDebugPair MIP = {&MI, SpDMI};
- Spills.push_back(MIP);
-
- // End all previous ranges of Var.
- OpenRanges.erase(VarLocIDs[ID].Var);
-
- // Add the VarLoc to OpenRanges.
- VarLoc VL(*SpDMI, LS);
- unsigned SpillLocID = VarLocIDs.insert(VL);
- OpenRanges.insert(SpillLocID, VL.Var);
+ LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
+ << VarLocIDs[ID].Var.getVar()->getName() << ")\n");
+ insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID);
+ return;
+ }
+ }
+}
+
+/// If \p MI is a register copy instruction that copies a previously tracked
+/// value from one register to another callee-saved register, create a new
+/// DBG_VALUE instruction that describes the value via the destination register.
+void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ TransferMap &Transfers) {
+ const MachineOperand *SrcRegOp, *DestRegOp;
+
+ if (!TII->isCopyInstr(MI, SrcRegOp, DestRegOp) || !SrcRegOp->isKill() ||
+ !DestRegOp->isDef())
+ return;
+
+ auto isCalleSavedReg = [&](unsigned Reg) {
+ for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
+ if (CalleeSavedRegs.test(*RAI))
+ return true;
+ return false;
+ };
+
+ unsigned SrcReg = SrcRegOp->getReg();
+ unsigned DestReg = DestRegOp->getReg();
+
+ // We only want to recognize copies whose destination is a callee-saved
+ // register. If we tracked a register that the call could clobber instead,
+ // there is a good chance it would be clobbered soon. The previous,
+ // callee-saved location is more likely to stay unclobbered for longer,
+ // even if it is killed here.
+ if (!isCalleSavedReg(DestReg))
+ return;
+
+ for (unsigned ID : OpenRanges.getVarLocs()) {
+ if (VarLocIDs[ID].isDescribedByReg() == SrcReg) {
+ insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID,
+ DestReg);
return;
}
}
@@ -497,16 +590,18 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
const VarLocMap &VarLocIDs) {
bool Changed = false;
const MachineBasicBlock *CurMBB = MI.getParent();
- if (!(MI.isTerminator() || (&MI == &CurMBB->instr_back())))
+ if (!(MI.isTerminator() || (&MI == &CurMBB->back())))
return false;
if (OpenRanges.empty())
return false;
- DEBUG(for (unsigned ID : OpenRanges.getVarLocs()) {
- // Copy OpenRanges to OutLocs, if not already present.
- dbgs() << "Add to OutLocs: "; VarLocIDs[ID].dump();
- });
+ LLVM_DEBUG(for (unsigned ID
+ : OpenRanges.getVarLocs()) {
+ // Copy OpenRanges to OutLocs, if not already present.
+ dbgs() << "Add to OutLocs: ";
+ VarLocIDs[ID].dump();
+ });
VarLocSet &VLS = OutLocs[CurMBB];
Changed = VLS |= OpenRanges.getVarLocs();
OpenRanges.clear();
@@ -514,14 +609,16 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
}
/// This routine creates OpenRanges and OutLocs.
-bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
- SpillMap &Spills, bool transferSpills) {
+bool LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
+ TransferMap &Transfers, bool transferChanges) {
bool Changed = false;
transferDebugValue(MI, OpenRanges, VarLocIDs);
transferRegisterDef(MI, OpenRanges, VarLocIDs);
- if (transferSpills)
- transferSpillInst(MI, OpenRanges, VarLocIDs, Spills);
+ if (transferChanges) {
+ transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers);
+ transferSpillInst(MI, OpenRanges, VarLocIDs, Transfers);
+ }
Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
return Changed;
}
@@ -532,7 +629,7 @@ bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs,
VarLocInMBB &InLocs, const VarLocMap &VarLocIDs,
SmallPtrSet<const MachineBasicBlock *, 16> &Visited) {
- DEBUG(dbgs() << "join MBB: " << MBB.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getName() << "\n");
bool Changed = false;
VarLocSet InLocsT; // Temporary incoming locations.
@@ -583,7 +680,7 @@ bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs,
for (auto ID : Diff) {
// This VarLoc is not found in InLocs i.e. it is not yet inserted. So, a
// new range is started for the var from the mbb's beginning by inserting
- // a new DBG_VALUE. transfer() will end this range however appropriate.
+ // a new DBG_VALUE. process() will end this range however appropriate.
const VarLoc &DiffIt = VarLocIDs[ID];
const MachineInstr *DMI = &DiffIt.MI;
MachineInstr *MI =
@@ -592,7 +689,7 @@ bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs,
DMI->getDebugVariable(), DMI->getDebugExpression());
if (DMI->isIndirectDebugValue())
MI->getOperand(1).setImm(DMI->getOperand(1).getImm());
- DEBUG(dbgs() << "Inserted: "; MI->dump(););
+ LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump(););
ILS.set(ID);
++NumInserted;
Changed = true;
@@ -603,7 +700,7 @@ bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs,
/// Calculate the liveness information for the given machine function and
/// extend ranges across basic blocks.
bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
- DEBUG(dbgs() << "\nDebug Range Extension\n");
+ LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n");
bool Changed = false;
bool OLChanged = false;
@@ -613,7 +710,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
OpenRangesSet OpenRanges; // Ranges that are open until end of bb.
VarLocInMBB OutLocs; // Ranges that exist beyond bb.
VarLocInMBB InLocs; // Ranges that are incoming after joining.
- SpillMap Spills; // DBG_VALUEs associated with spills.
+ TransferMap Transfers; // DBG_VALUEs associated with spills.
DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
@@ -624,6 +721,8 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
std::greater<unsigned int>>
Pending;
+ enum : bool { dontTransferChanges = false, transferChanges = true };
+
// Initialize every mbb with OutLocs.
// We are not looking at any spill instructions during the initial pass
// over the BBs. The LiveDebugVariables pass has already created DBG_VALUE
@@ -631,11 +730,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
// within the BB in which the spill occurs.
for (auto &MBB : MF)
for (auto &MI : MBB)
- transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
- /*transferSpills=*/false);
+ process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers,
+ dontTransferChanges);
- DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization",
- dbgs()));
+ LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
+ "OutLocs after initialization", dbgs()));
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
unsigned int RPONumber = 0;
@@ -646,7 +745,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
++RPONumber;
}
// This is a standard "union of predecessor outs" dataflow problem.
- // To solve it, we perform join() and transfer() using the two worklist method
+ // To solve it, we perform join() and process() using the two worklist method
// until the ranges converge.
// Ranges have converged when both worklists are empty.
SmallPtrSet<const MachineBasicBlock *, 16> Visited;
@@ -655,7 +754,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
// thing twice. We could avoid this with a custom priority queue, but this
// is probably not worth it.
SmallPtrSet<MachineBasicBlock *, 16> OnPending;
- DEBUG(dbgs() << "Processing Worklist\n");
+ LLVM_DEBUG(dbgs() << "Processing Worklist\n");
while (!Worklist.empty()) {
MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
Worklist.pop();
@@ -668,19 +767,19 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
// examine spill instructions to see whether they spill registers that
// correspond to user variables.
for (auto &MI : *MBB)
- OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
- /*transferSpills=*/true);
+ OLChanged |= process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers,
+ transferChanges);
// Add any DBG_VALUE instructions necessitated by spills.
- for (auto &SP : Spills)
- MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst),
- SP.DebugInst);
- Spills.clear();
+ for (auto &TR : Transfers)
+ MBB->insertAfter(MachineBasicBlock::iterator(*TR.TransferInst),
+ TR.DebugInst);
+ Transfers.clear();
- DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
- "OutLocs after propagating", dbgs()));
- DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs,
- "InLocs after propagating", dbgs()));
+ LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
+ "OutLocs after propagating", dbgs()));
+ LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs,
+ "InLocs after propagating", dbgs()));
if (OLChanged) {
OLChanged = false;
@@ -697,8 +796,8 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
assert(Pending.empty() && "Pending should be empty");
}
- DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
- DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
+ LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
+ LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
return Changed;
}
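
Editorial note: the "union of predecessor outs" solver above alternates between a Worklist and a Pending queue keyed by reverse-post-order numbers, swapping them each round until both drain. A minimal standalone sketch of that two-worklist convergence loop, with toy block ids and a joinAndProcess callback standing in for the real LiveDebugValues types (assumption: this is an illustration of the iteration shape only, not the actual pass data structures):

// Minimal model of the two-worklist iteration: visit blocks in RPO order,
// and when a block's outgoing state changes, queue its successors on Pending
// so the next round revisits them.
#include <functional>
#include <queue>
#include <set>
#include <vector>

struct Block { std::vector<int> Succs; };

void solve(std::vector<Block> &Blocks,
           std::function<bool(int)> joinAndProcess) {
  using MinQueue =
      std::priority_queue<int, std::vector<int>, std::greater<int>>;
  MinQueue Worklist, Pending;
  for (int I = 0, E = (int)Blocks.size(); I != E; ++I)
    Worklist.push(I); // RPO numbers double as block ids in this toy model.

  while (!Worklist.empty() || !Pending.empty()) {
    std::set<int> OnPending; // Avoid queueing a successor twice per round.
    while (!Worklist.empty()) {
      int BB = Worklist.top();
      Worklist.pop();
      if (joinAndProcess(BB)) // Outgoing locations changed: revisit succs.
        for (int Succ : Blocks[BB].Succs)
          if (OnPending.insert(Succ).second)
            Pending.push(Succ);
    }
    Worklist.swap(Pending); // Converged when both queues drain empty.
  }
}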
@@ -715,6 +814,8 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
TFI = MF.getSubtarget().getFrameLowering();
+ TFI->determineCalleeSaves(MF, CalleeSavedRegs,
+ make_unique<RegScavenger>().get());
LS.initialize(MF);
bool Changed = ExtendRanges(MF);
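
Editorial note: the new determineCalleeSaves call populates a callee-saved-register set before range extension runs. A hedged sketch of how such a set is typically consulted afterwards (CalleeSavedRegs is assumed to be a BitVector member introduced elsewhere in this patch, and the helper name below is illustrative only):

#include "llvm/ADT/BitVector.h"

// Illustrative only: once determineCalleeSaves() has filled the BitVector,
// checking whether a physical register is preserved across calls is a
// single bit test.
static bool isCalleeSavedReg(const llvm::BitVector &CalleeSavedRegs,
                             unsigned Reg) {
  return Reg < CalleeSavedRegs.size() && CalleeSavedRegs.test(Reg);
}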
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 4ffcffcea693..3ff03ec4a7ee 100644
--- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -44,6 +44,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -223,7 +224,12 @@ public:
return L1;
}
- /// getLocationNo - Return the location number that matches Loc.
+ /// Return the location number that matches Loc.
+ ///
+ /// For undef values we always return location number UndefLocNo without
+ /// inserting anything in locations. Since locations is a vector and the
+ /// location number is the position in the vector and UndefLocNo is ~0,
+ /// we would need a very big vector to put the value at the right position.
unsigned getLocationNo(const MachineOperand &LocMO) {
if (LocMO.isReg()) {
if (LocMO.getReg() == 0)
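
Editorial note: the comment above explains why getLocationNo never materialises an entry for undef operands; UndefLocNo is ~0, while every other location number is an index into the locations vector. A standalone sketch of that sentinel scheme with toy types (not the real UserValue class):

#include <cstdint>
#include <vector>

// Toy model: location numbers index into a vector, except for the ~0
// sentinel, which stands for "value is undef" and is never stored.
constexpr unsigned UndefLocNo = ~0u;

struct Loc { uint64_t Payload; bool IsUndef; };

unsigned getLocationNo(std::vector<Loc> &Locations, const Loc &L) {
  if (L.IsUndef)
    return UndefLocNo;               // Never inserted into Locations.
  for (unsigned I = 0, E = Locations.size(); I != E; ++I)
    if (Locations[I].Payload == L.Payload)
      return I;                      // Reuse an existing slot.
  Locations.push_back(L);
  return Locations.size() - 1;       // New slot at the end.
}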
@@ -301,7 +307,7 @@ public:
/// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
/// live. Returns true if any changes were made.
- bool splitRegister(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+ bool splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
LiveIntervals &LIS);
/// rewriteLocations - Rewrite virtual register locations according to the
@@ -510,7 +516,7 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
if (MI.getNumOperands() != 4 ||
!(MI.getOperand(1).isReg() || MI.getOperand(1).isImm()) ||
!MI.getOperand(2).isMetadata()) {
- DEBUG(dbgs() << "Can't handle " << MI);
+ LLVM_DEBUG(dbgs() << "Can't handle " << MI);
return false;
}
@@ -529,8 +535,8 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
// The DBG_VALUE is described by a virtual register that does not have a
// live interval. Discard the DBG_VALUE.
Discard = true;
- DEBUG(dbgs() << "Discarding debug info (no LIS interval): "
- << Idx << " " << MI);
+ LLVM_DEBUG(dbgs() << "Discarding debug info (no LIS interval): " << Idx
+ << " " << MI);
} else {
// The DBG_VALUE is only valid if either Reg is live out from Idx, or Reg
// is defined dead at Idx (where Idx is the slot index for the instruction
@@ -541,8 +547,8 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
// We have found a DBG_VALUE with the value in a virtual register that
// is not live. Discard the DBG_VALUE.
Discard = true;
- DEBUG(dbgs() << "Discarding debug info (reg not live): "
- << Idx << " " << MI);
+ LLVM_DEBUG(dbgs() << "Discarding debug info (reg not live): " << Idx
+ << " " << MI);
}
}
}
@@ -687,7 +693,8 @@ void UserValue::addDefsFromCopies(
if (CopyValues.empty())
return;
- DEBUG(dbgs() << "Got " << CopyValues.size() << " copies of " << *LI << '\n');
+ LLVM_DEBUG(dbgs() << "Got " << CopyValues.size() << " copies of " << *LI
+ << '\n');
// Try to add defs of the copied values for each kill point.
for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
@@ -701,8 +708,8 @@ void UserValue::addDefsFromCopies(
LocMap::iterator I = locInts.find(Idx);
if (I.valid() && I.start() <= Idx)
continue;
- DEBUG(dbgs() << "Kill at " << Idx << " covered by valno #"
- << DstVNI->id << " in " << *DstLI << '\n');
+ LLVM_DEBUG(dbgs() << "Kill at " << Idx << " covered by valno #"
+ << DstVNI->id << " in " << *DstLI << '\n');
MachineInstr *CopyMI = LIS.getInstructionFromIndex(DstVNI->def);
assert(CopyMI && CopyMI->isCopy() && "Bad copy value");
unsigned LocNo = getLocationNo(CopyMI->getOperand(0));
@@ -759,13 +766,6 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
// function).
}
- // Erase all the undefs.
- for (LocMap::iterator I = locInts.begin(); I.valid();)
- if (I.value().isUndef())
- I.erase();
- else
- ++I;
-
// The computed intervals may extend beyond the range of the debug
// location's lexical scope. In this case, splitting of an interval
// can result in an interval outside of the scope being created,
@@ -850,12 +850,12 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
LIS = &pass.getAnalysis<LiveIntervals>();
TRI = mf.getSubtarget().getRegisterInfo();
- DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
- << mf.getName() << " **********\n");
+ LLVM_DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
+ << mf.getName() << " **********\n");
bool Changed = collectDebugValues(mf);
computeIntervals();
- DEBUG(print(dbgs()));
+ LLVM_DEBUG(print(dbgs()));
ModifiedMF = Changed;
return Changed;
}
@@ -901,7 +901,7 @@ LiveDebugVariables::~LiveDebugVariables() {
bool
UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
LiveIntervals& LIS) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Splitting Loc" << OldLocNo << '\t';
print(dbgs(), nullptr);
});
@@ -984,17 +984,22 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
while (LocMapI.valid()) {
DbgValueLocation v = LocMapI.value();
if (v.locNo() == OldLocNo) {
- DEBUG(dbgs() << "Erasing [" << LocMapI.start() << ';'
- << LocMapI.stop() << ")\n");
+ LLVM_DEBUG(dbgs() << "Erasing [" << LocMapI.start() << ';'
+ << LocMapI.stop() << ")\n");
LocMapI.erase();
} else {
- if (v.locNo() > OldLocNo)
+ // Undef values always have location number UndefLocNo, so don't change
+ // locNo in that case. See getLocationNo().
+ if (!v.isUndef() && v.locNo() > OldLocNo)
LocMapI.setValueUnchecked(v.changeLocNo(v.locNo() - 1));
++LocMapI;
}
}
- DEBUG({dbgs() << "Split result: \t"; print(dbgs(), nullptr);});
+ LLVM_DEBUG({
+ dbgs() << "Split result: \t";
+ print(dbgs(), nullptr);
+ });
return DidChange;
}
@@ -1094,6 +1099,10 @@ void UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI,
// physical register.
for (LocMap::iterator I = locInts.begin(); I.valid(); ++I) {
DbgValueLocation Loc = I.value();
+ // Undef values don't exist in locations (and thus not in LocNoMap either)
+ // so skip over them. See getLocationNo().
+ if (Loc.isUndef())
+ continue;
unsigned NewLocNo = LocNoMap[Loc.locNo()];
I.setValueUnchecked(Loc.changeLocNo(NewLocNo));
I.setStart(I.start());
@@ -1136,7 +1145,7 @@ findNextInsertLocation(MachineBasicBlock *MBB,
unsigned Reg = LocMO.getReg();
// Find the next instruction in the MBB that defines the register Reg.
- while (I != MBB->end()) {
+ while (I != MBB->end() && !I->isTerminator()) {
if (!LIS.isNotInMIMap(*I) &&
SlotIndex::isEarlierEqualInstr(StopIdx, LIS.getInstructionIndex(*I)))
break;
@@ -1158,7 +1167,15 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
// Only search within the current MBB.
StopIdx = (MBBEndIdx < StopIdx) ? MBBEndIdx : StopIdx;
MachineBasicBlock::iterator I = findInsertLocation(MBB, StartIdx, LIS);
- MachineOperand &MO = locations[Loc.locNo()];
+ // Undef values don't exist in locations so create new "noreg" register MOs
+ // for them. See getLocationNo().
+ MachineOperand MO = !Loc.isUndef() ?
+ locations[Loc.locNo()] :
+ MachineOperand::CreateReg(/* Reg */ 0, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true);
+
++NumInsertedDebugValues;
assert(cast<DILocalVariable>(Variable)
@@ -1179,14 +1196,8 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
assert((!Spilled || MO.isFI()) && "a spilled location must be a frame index");
do {
- MachineInstrBuilder MIB =
- BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
- .add(MO);
- if (IsIndirect)
- MIB.addImm(0U);
- else
- MIB.addReg(0U, RegState::Debug);
- MIB.addMetadata(Variable).addMetadata(Expr);
+ BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
+ IsIndirect, MO, Variable, Expr);
// Continue and insert DBG_VALUES after every redefinition of register
// associated with the debug value within the range
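
Editorial note: a DBG_VALUE whose value is unavailable is emitted with a $noreg (register 0) debug-use operand, which debuggers read as "optimized out". A hedged sketch mirroring insertDebugValue() above; MBB, I, TII, Variable and Expr are assumed to be in scope as in that function:

// Sketch only: build an "optimized out" DBG_VALUE with the same
// DBG_VALUE-building overload used in the hunk above.
MachineOperand UndefMO = MachineOperand::CreateReg(
    /* Reg */ 0, /* isDef */ false, /* isImp */ false, /* isKill */ false,
    /* isDead */ false, /* isUndef */ false, /* isEarlyClobber */ false,
    /* SubReg */ 0, /* isDebug */ true);
BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
        /* IsIndirect */ false, UndefMO, Variable, Expr);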
@@ -1212,11 +1223,11 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
if (trimmedDefs.count(Start))
Start = Start.getPrevIndex();
- DEBUG(dbgs() << "\t[" << Start << ';' << Stop << "):" << Loc.locNo());
+ LLVM_DEBUG(dbgs() << "\t[" << Start << ';' << Stop << "):" << Loc.locNo());
MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator();
SlotIndex MBBEnd = LIS.getMBBEndIdx(&*MBB);
- DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
+ LLVM_DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
insertDebugValue(&*MBB, Start, Stop, Loc, Spilled, LIS, TII, TRI);
// This interval may span multiple basic blocks.
// Insert a DBG_VALUE into each one.
@@ -1226,10 +1237,10 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
if (++MBB == MFEnd)
break;
MBBEnd = LIS.getMBBEndIdx(&*MBB);
- DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
+ LLVM_DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
insertDebugValue(&*MBB, Start, Stop, Loc, Spilled, LIS, TII, TRI);
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
if (MBB == MFEnd)
break;
@@ -1238,13 +1249,13 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
}
void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
- DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
+ LLVM_DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
if (!MF)
return;
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
BitVector SpilledLocations;
for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
- DEBUG(userValues[i]->print(dbgs(), TRI));
+ LLVM_DEBUG(userValues[i]->print(dbgs(), TRI));
userValues[i]->rewriteLocations(*VRM, *TRI, SpilledLocations);
userValues[i]->emitDebugValues(VRM, *LIS, *TII, *TRI, SpilledLocations);
}
diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
index 302c75133e35..83dd982587c6 100644
--- a/contrib/llvm/lib/CodeGen/LiveInterval.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -991,6 +992,7 @@ void LiveInterval::print(raw_ostream &OS) const {
// Print subranges
for (const SubRange &SR : subranges())
OS << SR;
+ OS << " weight:" << weight;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
index 3e742a6c2f21..36428e0335f9 100644
--- a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -187,7 +187,7 @@ void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
clear();
Size = NSize;
LIUs = static_cast<LiveIntervalUnion*>(
- malloc(sizeof(LiveIntervalUnion)*NSize));
+ safe_malloc(sizeof(LiveIntervalUnion)*NSize));
for (unsigned i = 0; i != Size; ++i)
new(LIUs + i) LiveIntervalUnion(Alloc);
}
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervals.cpp b/contrib/llvm/lib/CodeGen/LiveIntervals.cpp
index 79fdba7e062a..471775f8706b 100644
--- a/contrib/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
@@ -147,7 +148,7 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
getRegUnit(i);
}
- DEBUG(dump());
+ LLVM_DEBUG(dump());
return true;
}
@@ -310,7 +311,7 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
/// entering the entry block or a landing pad.
void LiveIntervals::computeLiveInRegUnits() {
RegUnitRanges.resize(TRI->getNumRegUnits());
- DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
+ LLVM_DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
// Keep track of the live range sets allocated.
SmallVector<unsigned, 8> NewRanges;
@@ -323,7 +324,7 @@ void LiveIntervals::computeLiveInRegUnits() {
// Create phi-defs at Begin for all live-in registers.
SlotIndex Begin = Indexes->getMBBStartIdx(&MBB);
- DEBUG(dbgs() << Begin << "\t" << printMBBReference(MBB));
+ LLVM_DEBUG(dbgs() << Begin << "\t" << printMBBReference(MBB));
for (const auto &LI : MBB.liveins()) {
for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) {
unsigned Unit = *Units;
@@ -335,12 +336,12 @@ void LiveIntervals::computeLiveInRegUnits() {
}
VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
(void)VNI;
- DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << '#' << VNI->id);
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << '#' << VNI->id);
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
- DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
+ LLVM_DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
// Compute the 'normal' part of the ranges.
for (unsigned Unit : NewRanges)
@@ -357,26 +358,40 @@ static void createSegmentsForValues(LiveRange &LR,
}
}
-using ShrinkToUsesWorkList = SmallVector<std::pair<SlotIndex, VNInfo*>, 16>;
-
-static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
- ShrinkToUsesWorkList &WorkList,
- const LiveRange &OldRange) {
+void LiveIntervals::extendSegmentsToUses(LiveRange &Segments,
+ ShrinkToUsesWorkList &WorkList,
+ unsigned Reg, LaneBitmask LaneMask) {
// Keep track of the PHIs that are in use.
SmallPtrSet<VNInfo*, 8> UsedPHIs;
// Blocks that have already been added to WorkList as live-out.
SmallPtrSet<const MachineBasicBlock*, 16> LiveOut;
+ auto getSubRange = [](const LiveInterval &I, LaneBitmask M)
+ -> const LiveRange& {
+ if (M.none())
+ return I;
+ for (const LiveInterval::SubRange &SR : I.subranges()) {
+ if ((SR.LaneMask & M).any()) {
+ assert(SR.LaneMask == M && "Expecting lane masks to match exactly");
+ return SR;
+ }
+ }
+ llvm_unreachable("Subrange for mask not found");
+ };
+
+ const LiveInterval &LI = getInterval(Reg);
+ const LiveRange &OldRange = getSubRange(LI, LaneMask);
+
// Extend intervals to reach all uses in WorkList.
while (!WorkList.empty()) {
SlotIndex Idx = WorkList.back().first;
VNInfo *VNI = WorkList.back().second;
WorkList.pop_back();
- const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Idx.getPrevSlot());
- SlotIndex BlockStart = Indexes.getMBBStartIdx(MBB);
+ const MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Idx.getPrevSlot());
+ SlotIndex BlockStart = Indexes->getMBBStartIdx(MBB);
// Extend the live range for VNI to be live at Idx.
- if (VNInfo *ExtVNI = LR.extendInBlock(BlockStart, Idx)) {
+ if (VNInfo *ExtVNI = Segments.extendInBlock(BlockStart, Idx)) {
assert(ExtVNI == VNI && "Unexpected existing value number");
(void)ExtVNI;
// Is this a PHIDef we haven't seen before?
@@ -387,7 +402,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
for (const MachineBasicBlock *Pred : MBB->predecessors()) {
if (!LiveOut.insert(Pred).second)
continue;
- SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
+ SlotIndex Stop = Indexes->getMBBEndIdx(Pred);
// A predecessor is not required to have a live-out value for a PHI.
if (VNInfo *PVNI = OldRange.getVNInfoBefore(Stop))
WorkList.push_back(std::make_pair(Stop, PVNI));
@@ -396,24 +411,37 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
}
// VNI is live-in to MBB.
- DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
- LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
+ LLVM_DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
+ Segments.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
// Make sure VNI is live-out from the predecessors.
for (const MachineBasicBlock *Pred : MBB->predecessors()) {
if (!LiveOut.insert(Pred).second)
continue;
- SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
- assert(OldRange.getVNInfoBefore(Stop) == VNI &&
- "Wrong value out of predecessor");
- WorkList.push_back(std::make_pair(Stop, VNI));
+ SlotIndex Stop = Indexes->getMBBEndIdx(Pred);
+ if (VNInfo *OldVNI = OldRange.getVNInfoBefore(Stop)) {
+ assert(OldVNI == VNI && "Wrong value out of predecessor");
+ (void)OldVNI;
+ WorkList.push_back(std::make_pair(Stop, VNI));
+ } else {
+#ifndef NDEBUG
+ // There was no old VNI. Verify that Stop is jointly dominated
+ // by <undef>s for this live range.
+ assert(LaneMask.any() &&
+ "Missing value out of predecessor for main range");
+ SmallVector<SlotIndex,8> Undefs;
+ LI.computeSubRangeUndefs(Undefs, LaneMask, *MRI, *Indexes);
+ assert(LiveRangeCalc::isJointlyDominated(Pred, Undefs, *Indexes) &&
+ "Missing value out of predecessor for subrange");
+#endif
+ }
}
}
}
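
Editorial note: extendSegmentsToUses now resolves the range to extend from a register plus lane mask, picking the subrange whose lane mask intersects the query and asserting an exact match. A standalone model of that selection using plain bitmasks (toy types, not LiveInterval):

#include <cassert>
#include <cstdint>
#include <vector>

struct SubRange { uint64_t LaneMask; /* segments elided */ };
struct Interval { std::vector<SubRange> Subranges; };

// Mirror of the getSubRange lambda: an empty mask selects the main range
// (represented here by index ~0), otherwise the matching subrange is found.
unsigned findSubRange(const Interval &I, uint64_t Mask) {
  if (Mask == 0)
    return ~0u;                        // Main range requested.
  for (unsigned Idx = 0, E = I.Subranges.size(); Idx != E; ++Idx)
    if (I.Subranges[Idx].LaneMask & Mask) {
      assert(I.Subranges[Idx].LaneMask == Mask &&
             "Expecting lane masks to match exactly");
      return Idx;
    }
  assert(false && "Subrange for mask not found");
  return ~0u;
}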
bool LiveIntervals::shrinkToUses(LiveInterval *li,
SmallVectorImpl<MachineInstr*> *dead) {
- DEBUG(dbgs() << "Shrink: " << *li << '\n');
+ LLVM_DEBUG(dbgs() << "Shrink: " << *li << '\n');
assert(TargetRegisterInfo::isVirtualRegister(li->reg)
&& "Can only shrink virtual registers");
@@ -442,9 +470,10 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// This shouldn't happen: readsVirtualRegister returns true, but there is
// no live value. It is likely caused by a target getting <undef> flags
// wrong.
- DEBUG(dbgs() << Idx << '\t' << UseMI
- << "Warning: Instr claims to read non-existent value in "
- << *li << '\n');
+ LLVM_DEBUG(
+ dbgs() << Idx << '\t' << UseMI
+ << "Warning: Instr claims to read non-existent value in "
+ << *li << '\n');
continue;
}
// Special case: An early-clobber tied operand reads and writes the
@@ -458,14 +487,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// Create new live ranges with only minimal live segments per def.
LiveRange NewLR;
createSegmentsForValues(NewLR, make_range(li->vni_begin(), li->vni_end()));
- extendSegmentsToUses(NewLR, *Indexes, WorkList, *li);
+ extendSegmentsToUses(NewLR, WorkList, Reg, LaneBitmask::getNone());
// Move the trimmed segments back.
li->segments.swap(NewLR.segments);
// Handle dead values.
bool CanSeparate = computeDeadValues(*li, dead);
- DEBUG(dbgs() << "Shrunk: " << *li << '\n');
+ LLVM_DEBUG(dbgs() << "Shrunk: " << *li << '\n');
return CanSeparate;
}
@@ -495,7 +524,7 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
// This is a dead PHI. Remove it.
VNI->markUnused();
LI.removeSegment(I);
- DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n");
+ LLVM_DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n");
MayHaveSplitComponents = true;
} else {
// This is a dead def. Make sure the instruction knows.
@@ -503,7 +532,7 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
assert(MI && "No instruction defining live value");
MI->addRegisterDead(LI.reg, TRI);
if (dead && MI->allDefsAreDead()) {
- DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI);
dead->push_back(MI);
}
}
@@ -512,7 +541,7 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
}
void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
- DEBUG(dbgs() << "Shrink: " << SR << '\n');
+ LLVM_DEBUG(dbgs() << "Shrink: " << SR << '\n');
assert(TargetRegisterInfo::isVirtualRegister(Reg)
&& "Can only shrink virtual registers");
// Find all the values used, including PHI kills.
@@ -556,7 +585,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
// Create a new live ranges with only minimal live segments per def.
LiveRange NewLR;
createSegmentsForValues(NewLR, make_range(SR.vni_begin(), SR.vni_end()));
- extendSegmentsToUses(NewLR, *Indexes, WorkList, SR);
+ extendSegmentsToUses(NewLR, WorkList, Reg, SR.LaneMask);
// Move the trimmed ranges back.
SR.segments.swap(NewLR.segments);
@@ -571,13 +600,14 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
continue;
if (VNI->isPHIDef()) {
// This is a dead PHI. Remove it.
- DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
+ LLVM_DEBUG(dbgs() << "Dead PHI at " << VNI->def
+ << " may separate interval\n");
VNI->markUnused();
SR.removeSegment(*Segment);
}
}
- DEBUG(dbgs() << "Shrunk: " << SR << '\n');
+ LLVM_DEBUG(dbgs() << "Shrunk: " << SR << '\n');
}
void LiveIntervals::extendToIndices(LiveRange &LR,
@@ -785,7 +815,7 @@ MachineBasicBlock*
LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
// A local live range must be fully contained inside the block, meaning it is
// defined and killed at instructions, not at block boundaries. It is not
- // live in or or out of any block.
+ // live in or out of any block.
//
// It is technically possible to have a PHI-defined live range identical to a
// single block, but we are going to return false in that case.
@@ -942,7 +972,8 @@ public:
/// Update all live ranges touched by MI, assuming a move from OldIdx to
/// NewIdx.
void updateAllRanges(MachineInstr *MI) {
- DEBUG(dbgs() << "handleMove " << OldIdx << " -> " << NewIdx << ": " << *MI);
+ LLVM_DEBUG(dbgs() << "handleMove " << OldIdx << " -> " << NewIdx << ": "
+ << *MI);
bool hasRegMask = false;
for (MachineOperand &MO : MI->operands()) {
if (MO.isRegMask())
@@ -992,7 +1023,7 @@ private:
void updateRange(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) {
if (!Updated.insert(&LR).second)
return;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << " ";
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
dbgs() << printReg(Reg);
@@ -1007,7 +1038,7 @@ private:
handleMoveDown(LR);
else
handleMoveUp(LR, Reg, LaneMask);
- DEBUG(dbgs() << " -->\t" << LR << '\n');
+ LLVM_DEBUG(dbgs() << " -->\t" << LR << '\n');
LR.verify();
}
@@ -1291,6 +1322,36 @@ private:
if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end))
OldIdxIn->end = NewIdx.getRegSlot();
}
+ } else if (OldIdxIn != E
+ && SlotIndex::isEarlierInstr(NewIdxOut->start, NewIdx)
+ && SlotIndex::isEarlierInstr(NewIdx, NewIdxOut->end)) {
+ // OldIdxVNI is a dead def that has been moved into the middle of
+ // another value in LR. That can happen when LR is a whole register,
+ // but the dead def is a write to a subreg that is dead at NewIdx.
+ // The dead def may have been moved across other values
+ // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
+ // down one position.
+ // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - |
+ // => |- X0/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -|
+ std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut));
+ // Modify the segment at NewIdxOut and the following segment to meet at
+ // the point of the dead def, with the following segment getting
+ // OldIdxVNI as its value number.
+ *NewIdxOut = LiveRange::Segment(
+ NewIdxOut->start, NewIdxDef.getRegSlot(), NewIdxOut->valno);
+ *(NewIdxOut + 1) = LiveRange::Segment(
+ NewIdxDef.getRegSlot(), (NewIdxOut + 1)->end, OldIdxVNI);
+ OldIdxVNI->def = NewIdxDef;
+ // Modify subsequent segments to be defined by the moved def OldIdxVNI.
+ for (auto Idx = NewIdxOut + 2; Idx <= OldIdxOut; ++Idx)
+ Idx->valno = OldIdxVNI;
+ // Aggressively remove all dead flags from the former dead definition.
+ // Kill/dead flags shouldn't be used while live intervals exist; they
+ // will be reinserted by VirtRegRewriter.
+ if (MachineInstr *KillMI = LIS.getInstructionFromIndex(NewIdx))
+ for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO)
+ if (MO->isReg() && !MO->isUse())
+ MO->setIsDead(false);
} else {
// OldIdxVNI is a dead def. It may have been moved across other values
// in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
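
Editorial note: the new else-if branch above slides the segments in [NewIdxOut, OldIdxOut) down by one with std::copy_backward before rewriting the two segments around the moved dead def. A small self-contained example of that copy_backward step on a plain vector, matching the diagram in the comment:

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main() {
  // Segments X0..X4; the dead def currently owns slot 4 (Xn/OldIdxOut)
  // and is being moved into the middle of X0 (NewIdxOut).
  std::vector<int> Seg = {0, 1, 2, 3, 4};
  auto NewIdxOut = Seg.begin();       // X0
  auto OldIdxOut = Seg.begin() + 4;   // Xn
  // |- X0 -| |- X1 -| ... |- Xn -|  =>  |- X0 -| |- X0 -| ... |- Xn-1 -|
  std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut));
  assert((Seg == std::vector<int>{0, 0, 1, 2, 3}));
  // The real code then overwrites the first two slots with the segments
  // split at the dead def and retargets the remaining value numbers.
  return 0;
}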
@@ -1360,7 +1421,7 @@ private:
MachineBasicBlock::iterator Begin = MBB->begin();
while (MII != Begin) {
- if ((--MII)->isDebugValue())
+ if ((--MII)->isDebugInstr())
continue;
SlotIndex Idx = Indexes->getInstructionIndex(*MII);
@@ -1422,7 +1483,7 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
MachineInstr &MI = *I;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
SlotIndex instrIdx = getInstructionIndex(MI);
@@ -1519,7 +1580,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
MachineInstr &MI = *I;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
MOE = MI.operands_end();
@@ -1580,7 +1641,7 @@ void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
unsigned NumComp = ConEQ.Classify(LI);
if (NumComp <= 1)
return;
- DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n');
+ LLVM_DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n');
unsigned Reg = LI.reg;
const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
for (unsigned I = 1; I < NumComp; ++I) {
diff --git a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
index 277212cf7dac..86c6c8e29f9a 100644
--- a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
+++ b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
@@ -18,12 +18,13 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-/// \brief Remove all registers from the set that get clobbered by the register
+/// Remove all registers from the set that get clobbered by the register
/// mask.
/// The clobbers set will be the list of live registers clobbered
/// by the regmask.
@@ -44,7 +45,7 @@ void LivePhysRegs::removeRegsInMask(const MachineOperand &MO,
void LivePhysRegs::removeDefs(const MachineInstr &MI) {
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (O->isReg()) {
- if (!O->isDef())
+ if (!O->isDef() || O->isDebug())
continue;
unsigned Reg = O->getReg();
if (!TargetRegisterInfo::isPhysicalRegister(Reg))
@@ -58,7 +59,7 @@ void LivePhysRegs::removeDefs(const MachineInstr &MI) {
/// Add uses to the set.
void LivePhysRegs::addUses(const MachineInstr &MI) {
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
- if (!O->isReg() || !O->readsReg())
+ if (!O->isReg() || !O->readsReg() || O->isDebug())
continue;
unsigned Reg = O->getReg();
if (!TargetRegisterInfo::isPhysicalRegister(Reg))
@@ -85,7 +86,7 @@ void LivePhysRegs::stepForward(const MachineInstr &MI,
SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) {
// Remove killed registers from the set.
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
- if (O->isReg()) {
+ if (O->isReg() && !O->isDebug()) {
unsigned Reg = O->getReg();
if (!TargetRegisterInfo::isPhysicalRegister(Reg))
continue;
@@ -105,9 +106,13 @@ void LivePhysRegs::stepForward(const MachineInstr &MI,
// Add defs to the set.
for (auto Reg : Clobbers) {
- // Skip dead defs. They shouldn't be added to the set.
+ // Skip dead defs and registers clobbered by regmasks. They shouldn't
+ // be added to the set.
if (Reg.second->isReg() && Reg.second->isDead())
continue;
+ if (Reg.second->isRegMask() &&
+ MachineOperand::clobbersPhysReg(Reg.second->getRegMask(), Reg.first))
+ continue;
addReg(Reg.first);
}
}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
index 66c23b7b69ce..04324943dfad 100644
--- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
@@ -584,3 +584,24 @@ void LiveRangeCalc::updateSSA() {
}
} while (Changed);
}
+
+bool LiveRangeCalc::isJointlyDominated(const MachineBasicBlock *MBB,
+ ArrayRef<SlotIndex> Defs,
+ const SlotIndexes &Indexes) {
+ const MachineFunction &MF = *MBB->getParent();
+ BitVector DefBlocks(MF.getNumBlockIDs());
+ for (SlotIndex I : Defs)
+ DefBlocks.set(Indexes.getMBBFromIndex(I)->getNumber());
+
+ SetVector<unsigned> PredQueue;
+ PredQueue.insert(MBB->getNumber());
+ for (unsigned i = 0; i != PredQueue.size(); ++i) {
+ unsigned BN = PredQueue[i];
+ if (DefBlocks[BN])
+ return true;
+ const MachineBasicBlock *B = MF.getBlockNumbered(BN);
+ for (const MachineBasicBlock *P : B->predecessors())
+ PredQueue.insert(P->getNumber());
+ }
+ return false;
+}
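
Editorial note: isJointlyDominated relies on an idiom worth calling out: the SetVector is indexed by position while new predecessors are appended to it, so the same container acts as worklist and visited set. A standalone model of that backwards search with plain STL containers (illustration only; it reproduces the shape of the walk above, returning true as soon as a def block is reached):

#include <set>
#include <vector>

// Queue keeps insertion order for iteration-by-index, Seen deduplicates.
bool jointlyDominated(const std::vector<std::vector<int>> &Preds,
                      const std::vector<bool> &DefBlocks, int Start) {
  std::vector<int> Queue{Start};
  std::set<int> Seen{Start};
  for (size_t I = 0; I != Queue.size(); ++I) { // Queue grows while we scan.
    int BN = Queue[I];
    if (DefBlocks[BN])
      return true;                             // Hit a block with a def.
    for (int P : Preds[BN])
      if (Seen.insert(P).second)
        Queue.push_back(P);
  }
  return false; // Exhausted the backwards walk without meeting any def.
}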
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
index c4914f23f56d..9f226b154a67 100644
--- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
@@ -147,7 +147,7 @@ class LiveRangeCalc {
///
/// PhysReg, when set, is used to verify live-in lists on basic blocks.
bool findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
- SlotIndex Kill, unsigned PhysReg,
+ SlotIndex Use, unsigned PhysReg,
ArrayRef<SlotIndex> Undefs);
/// updateSSA - Compute the values that will be live in to all requested
@@ -282,6 +282,15 @@ public:
/// Every predecessor of a live-in block must have been given a value with
/// setLiveOutValue, the value may be null for live-trough blocks.
void calculateValues();
+
+ /// A diagnostic function to check if the end of the block @p MBB is
+ /// jointly dominated by the blocks corresponding to the slot indices
+ /// in @p Defs. This function is mainly for use in self-verification
+ /// checks.
+ LLVM_ATTRIBUTE_UNUSED
+ static bool isJointlyDominated(const MachineBasicBlock *MBB,
+ ArrayRef<SlotIndex> Defs,
+ const SlotIndexes &Indexes);
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 86cfbd87f5b1..8dfe8b68c3af 100644
--- a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -31,21 +31,24 @@ STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE");
void LiveRangeEdit::Delegate::anchor() { }
-LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
+LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg,
+ bool createSubRanges) {
unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
- if (VRM) {
+ if (VRM)
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
- }
+
LiveInterval &LI = LIS.createEmptyInterval(VReg);
if (Parent && !Parent->isSpillable())
LI.markNotSpillable();
- // Create empty subranges if the OldReg's interval has them. Do not create
- // the main range here---it will be constructed later after the subranges
- // have been finalized.
- LiveInterval &OldLI = LIS.getInterval(OldReg);
- VNInfo::Allocator &Alloc = LIS.getVNInfoAllocator();
- for (LiveInterval::SubRange &S : OldLI.subranges())
- LI.createSubRange(Alloc, S.LaneMask);
+ if (createSubRanges) {
+ // Create empty subranges if the OldReg's interval has them. Do not create
+ // the main range here---it will be constructed later after the subranges
+ // have been finalized.
+ LiveInterval &OldLI = LIS.getInterval(OldReg);
+ VNInfo::Allocator &Alloc = LIS.getVNInfoAllocator();
+ for (LiveInterval::SubRange &S : OldLI.subranges())
+ LI.createSubRange(Alloc, S.LaneMask);
+ }
return LI;
}
@@ -217,8 +220,8 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
if (!DefMI->isSafeToMove(nullptr, SawStore))
return false;
- DEBUG(dbgs() << "Try to fold single def: " << *DefMI
- << " into single use: " << *UseMI);
+ LLVM_DEBUG(dbgs() << "Try to fold single def: " << *DefMI
+ << " into single use: " << *UseMI);
SmallVector<unsigned, 8> Ops;
if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second)
@@ -227,7 +230,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS);
if (!FoldMI)
return false;
- DEBUG(dbgs() << " folded: " << *FoldMI);
+ LLVM_DEBUG(dbgs() << " folded: " << *FoldMI);
LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
UseMI->eraseFromParent();
DefMI->addRegisterDead(LI->reg, nullptr);
@@ -264,18 +267,18 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
}
// Never delete inline asm.
if (MI->isInlineAsm()) {
- DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
return;
}
// Use the same criteria as DeadMachineInstructionElim.
bool SawStore = false;
if (!MI->isSafeToMove(nullptr, SawStore)) {
- DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
return;
}
- DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
// Collect virtual registers to be erased after MI is gone.
SmallVector<unsigned, 8> RegsToErase;
@@ -349,7 +352,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
continue;
MI->RemoveOperand(i-1);
}
- DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
+ LLVM_DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
} else {
// If the dest of MI is an original reg and MI is reMaterializable,
// don't delete the inst. Replace the dest with a new reg, and keep
@@ -357,12 +360,11 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
// LiveRangeEdit::DeadRemats and will be deleted after all the
// allocations of the func are done.
if (isOrigDef && DeadRemats && TII.isTriviallyReMaterializable(*MI, AA)) {
- LiveInterval &NewLI = createEmptyIntervalFrom(Dest);
- NewLI.removeEmptySubRanges();
+ LiveInterval &NewLI = createEmptyIntervalFrom(Dest, false);
VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
NewLI.addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(), VNI));
pop_back();
- markDeadRemat(MI);
+ DeadRemats->insert(MI);
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
MI->substituteRegister(Dest, NewLI.reg, 0, TRI);
MI->getOperand(0).setIsDead(true);
@@ -463,7 +465,7 @@ LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
for (unsigned I = 0, Size = size(); I < Size; ++I) {
LiveInterval &LI = LIS.getInterval(get(I));
if (MRI.recomputeRegClass(LI.reg))
- DEBUG({
+ LLVM_DEBUG({
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
dbgs() << "Inflated " << printReg(LI.reg) << " to "
<< TRI->getRegClassName(MRI.getRegClass(LI.reg)) << '\n';
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp b/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp
index 02e1f3b01ade..f75d513c89f5 100644
--- a/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -111,7 +111,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
InstOrderMap IOM;
// Map from register to instruction order (value of IOM) where the
@@ -130,7 +130,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
MachineInstr &MI = *Next;
++Next;
- if (MI.isPHI() || MI.isDebugValue())
+ if (MI.isPHI() || MI.isDebugInstr())
continue;
if (MI.mayStore())
SawStore = true;
@@ -218,7 +218,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
MachineBasicBlock::iterator I = std::next(Insert->getIterator());
// Skip all the PHI and debug instructions.
- while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
+ while (I != MBB.end() && (I->isPHI() || I->isDebugInstr()))
I = std::next(I);
if (I == MI.getIterator())
continue;
diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
index bd435968296d..e72977b02675 100644
--- a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -102,37 +102,37 @@ static bool foreachUnit(const TargetRegisterInfo *TRI,
}
void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
- DEBUG(dbgs() << "assigning " << printReg(VirtReg.reg, TRI)
- << " to " << printReg(PhysReg, TRI) << ':');
+ LLVM_DEBUG(dbgs() << "assigning " << printReg(VirtReg.reg, TRI) << " to "
+ << printReg(PhysReg, TRI) << ':');
assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
- foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
- const LiveRange &Range) {
- DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << ' ' << Range);
- Matrix[Unit].unify(VirtReg, Range);
- return false;
- });
+ foreachUnit(
+ TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) {
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << ' ' << Range);
+ Matrix[Unit].unify(VirtReg, Range);
+ return false;
+ });
++NumAssigned;
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
unsigned PhysReg = VRM->getPhys(VirtReg.reg);
- DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg, TRI)
- << " from " << printReg(PhysReg, TRI) << ':');
+ LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg, TRI) << " from "
+ << printReg(PhysReg, TRI) << ':');
VRM->clearVirt(VirtReg.reg);
- foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
- const LiveRange &Range) {
- DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI));
- Matrix[Unit].extract(VirtReg, Range);
- return false;
- });
+ foreachUnit(TRI, VirtReg, PhysReg,
+ [&](unsigned Unit, const LiveRange &Range) {
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI));
+ Matrix[Unit].extract(VirtReg, Range);
+ return false;
+ });
++NumUnassigned;
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
bool LiveRegMatrix::isPhysRegUsed(unsigned PhysReg) const {
@@ -205,3 +205,19 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
return IK_Free;
}
+
+bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
+ unsigned PhysReg) {
+ // Construct artificial live range containing only one segment [Start, End).
+ VNInfo valno(0, Start);
+ LiveRange::Segment Seg(Start, End, &valno);
+ LiveRange LR;
+ LR.addSegment(Seg);
+
+ // Check for interference with that segment
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ if (query(LR, *Units).checkInterference())
+ return true;
+ }
+ return false;
+}
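
Editorial note: the new overload answers "is PhysReg free over [Start, End)?" by building a throw-away one-segment live range and querying every register unit. A hedged usage sketch; LIS, Matrix, FirstMI, LastMI and CandidatePhysReg are placeholders assumed to be set up by the caller, not names from this patch:

// Hypothetical caller: probe a candidate physical register over a gap
// before committing to place anything there.
SlotIndex GapStart = LIS->getInstructionIndex(FirstMI).getRegSlot();
SlotIndex GapEnd   = LIS->getInstructionIndex(LastMI).getRegSlot();
if (!Matrix->checkInterference(GapStart, GapEnd, CandidatePhysReg)) {
  // [GapStart, GapEnd) is free on every register unit of CandidatePhysReg.
}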
diff --git a/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp
index 9f28db6287ba..c22681385492 100644
--- a/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp
@@ -46,7 +46,7 @@ void LiveRegUnits::stepBackward(const MachineInstr &MI) {
// Remove defined registers and regmask kills from the set.
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (O->isReg()) {
- if (!O->isDef())
+ if (!O->isDef() || O->isDebug())
continue;
unsigned Reg = O->getReg();
if (!TargetRegisterInfo::isPhysicalRegister(Reg))
@@ -58,7 +58,7 @@ void LiveRegUnits::stepBackward(const MachineInstr &MI) {
// Add uses to the set.
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
- if (!O->isReg() || !O->readsReg())
+ if (!O->isReg() || !O->readsReg() || O->isDebug())
continue;
unsigned Reg = O->getReg();
if (!TargetRegisterInfo::isPhysicalRegister(Reg))
diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
index 032dd66ae1d2..0b92eab83806 100644
--- a/contrib/llvm/lib/CodeGen/LiveVariables.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -498,7 +499,7 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr &MI,
void LiveVariables::runOnInstr(MachineInstr &MI,
SmallVectorImpl<unsigned> &Defs) {
- assert(!MI.isDebugValue());
+ assert(!MI.isDebugInstr());
// Process all of the operands of the instruction...
unsigned NumOperandsToProcess = MI.getNumOperands();
@@ -575,7 +576,7 @@ void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) {
DistanceMap.clear();
unsigned Dist = 0;
for (MachineInstr &MI : *MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
DistanceMap.insert(std::make_pair(&MI, Dist++));
diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
index c0da37ede849..f90ce0c8cd2a 100644
--- a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -99,7 +98,6 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<StackProtector>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -109,12 +107,8 @@ namespace {
char LocalStackSlotPass::ID = 0;
char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID;
-
-INITIALIZE_PASS_BEGIN(LocalStackSlotPass, DEBUG_TYPE,
- "Local Stack Slot Allocation", false, false)
-INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_PASS_END(LocalStackSlotPass, DEBUG_TYPE,
- "Local Stack Slot Allocation", false, false)
+INITIALIZE_PASS(LocalStackSlotPass, DEBUG_TYPE,
+ "Local Stack Slot Allocation", false, false)
bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -164,8 +158,8 @@ void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI,
Offset = (Offset + Align - 1) / Align * Align;
int64_t LocalOffset = StackGrowsDown ? -Offset : Offset;
- DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
- << LocalOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
+ << LocalOffset << "\n");
// Keep the offset available for base register allocation
LocalOffsets[FrameIdx] = LocalOffset;
// And tell MFI about it for PEI to use later
@@ -202,7 +196,6 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
int64_t Offset = 0;
unsigned MaxAlign = 0;
- StackProtector *SP = &getAnalysis<StackProtector>();
// Make sure that the stack protector comes before the local variables on the
// stack.
@@ -222,16 +215,16 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
if (MFI.getStackProtectorIndex() == (int)i)
continue;
- switch (SP->getSSPLayout(MFI.getObjectAllocation(i))) {
- case StackProtector::SSPLK_None:
+ switch (MFI.getObjectSSPLayout(i)) {
+ case MachineFrameInfo::SSPLK_None:
continue;
- case StackProtector::SSPLK_SmallArray:
+ case MachineFrameInfo::SSPLK_SmallArray:
SmallArrayObjs.insert(i);
continue;
- case StackProtector::SSPLK_AddrOf:
+ case MachineFrameInfo::SSPLK_AddrOf:
AddrOfObjs.insert(i);
continue;
- case StackProtector::SSPLK_LargeArray:
+ case MachineFrameInfo::SSPLK_LargeArray:
LargeArrayObjs.insert(i);
continue;
}
@@ -304,7 +297,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
for (MachineInstr &MI : BB) {
// Debug value, stackmap and patchpoint instructions can't be out of
// range, so they don't need any updates.
- if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STATEPOINT ||
+ if (MI.isDebugInstr() || MI.getOpcode() == TargetOpcode::STATEPOINT ||
MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT)
continue;
@@ -335,7 +328,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// Sort the frame references by local offset.
// Use frame index as a tie-breaker in case MI's have the same offset.
- std::sort(FrameReferenceInsns.begin(), FrameReferenceInsns.end());
+ llvm::sort(FrameReferenceInsns.begin(), FrameReferenceInsns.end());
MachineBasicBlock *Entry = &Fn.front();
@@ -351,7 +344,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
assert(MFI.isObjectPreAllocated(FrameIdx) &&
"Only pre-allocated locals expected!");
- DEBUG(dbgs() << "Considering: " << MI);
+ LLVM_DEBUG(dbgs() << "Considering: " << MI);
unsigned idx = 0;
for (unsigned f = MI.getNumOperands(); idx != f; ++idx) {
@@ -367,7 +360,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
int64_t Offset = 0;
int64_t FrameSizeAdjust = StackGrowsDown ? MFI.getLocalFrameSize() : 0;
- DEBUG(dbgs() << " Replacing FI in: " << MI);
+ LLVM_DEBUG(dbgs() << " Replacing FI in: " << MI);
// If we have a suitable base register available, use it; otherwise
// create a new one. Note that any offset encoded in the
@@ -377,7 +370,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
if (UsedBaseReg &&
lookupCandidateBaseReg(BaseReg, BaseOffset, FrameSizeAdjust,
LocalOffset, MI, TRI)) {
- DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n");
+ LLVM_DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n");
// We found a register to reuse.
Offset = FrameSizeAdjust + LocalOffset - BaseOffset;
} else {
@@ -405,8 +398,9 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF);
BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
- DEBUG(dbgs() << " Materializing base register " << BaseReg <<
- " at frame local offset " << LocalOffset + InstrOffset << "\n");
+ LLVM_DEBUG(dbgs() << " Materializing base register " << BaseReg
+ << " at frame local offset "
+ << LocalOffset + InstrOffset << "\n");
// Tell the target to insert the instruction to initialize
// the base register.
@@ -427,7 +421,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// Modify the instruction to use the new base register rather
// than the frame index operand.
TRI->resolveFrameIndex(MI, BaseReg, Offset);
- DEBUG(dbgs() << "Resolved: " << MI);
+ LLVM_DEBUG(dbgs() << "Resolved: " << MI);
++NumReplacements;
}
diff --git a/contrib/llvm/lib/CodeGen/LoopTraversal.cpp b/contrib/llvm/lib/CodeGen/LoopTraversal.cpp
new file mode 100644
index 000000000000..a02d10e09d7d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LoopTraversal.cpp
@@ -0,0 +1,77 @@
+//===- LoopTraversal.cpp - Optimal basic block traversal order --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LoopTraversal.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+using namespace llvm;
+
+bool LoopTraversal::isBlockDone(MachineBasicBlock *MBB) {
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBInfos.size() && "Unexpected basic block number.");
+ return MBBInfos[MBBNumber].PrimaryCompleted &&
+ MBBInfos[MBBNumber].IncomingCompleted ==
+ MBBInfos[MBBNumber].PrimaryIncoming &&
+ MBBInfos[MBBNumber].IncomingProcessed == MBB->pred_size();
+}
+
+LoopTraversal::TraversalOrder LoopTraversal::traverse(MachineFunction &MF) {
+ // Initialize the MBBInfos
+ MBBInfos.assign(MF.getNumBlockIDs(), MBBInfo());
+
+ MachineBasicBlock *Entry = &*MF.begin();
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
+ SmallVector<MachineBasicBlock *, 4> Workqueue;
+ SmallVector<TraversedMBBInfo, 4> MBBTraversalOrder;
+ for (MachineBasicBlock *MBB : RPOT) {
+ // N.B: IncomingProcessed and IncomingCompleted were already updated while
+ // processing this block's predecessors.
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBInfos.size() && "Unexpected basic block number.");
+ MBBInfos[MBBNumber].PrimaryCompleted = true;
+ MBBInfos[MBBNumber].PrimaryIncoming = MBBInfos[MBBNumber].IncomingProcessed;
+ bool Primary = true;
+ Workqueue.push_back(MBB);
+ while (!Workqueue.empty()) {
+ MachineBasicBlock *ActiveMBB = &*Workqueue.back();
+ Workqueue.pop_back();
+ bool Done = isBlockDone(ActiveMBB);
+ MBBTraversalOrder.push_back(TraversedMBBInfo(ActiveMBB, Primary, Done));
+ for (MachineBasicBlock *Succ : ActiveMBB->successors()) {
+ unsigned SuccNumber = Succ->getNumber();
+ assert(SuccNumber < MBBInfos.size() &&
+ "Unexpected basic block number.");
+ if (!isBlockDone(Succ)) {
+ if (Primary)
+ MBBInfos[SuccNumber].IncomingProcessed++;
+ if (Done)
+ MBBInfos[SuccNumber].IncomingCompleted++;
+ if (isBlockDone(Succ))
+ Workqueue.push_back(Succ);
+ }
+ }
+ Primary = false;
+ }
+ }
+
+ // We need to go through again and finalize any blocks that are not done yet.
+ // This is possible if blocks have dead predecessors, so we didn't visit them
+ // above.
+ for (MachineBasicBlock *MBB : RPOT) {
+ if (!isBlockDone(MBB))
+ MBBTraversalOrder.push_back(TraversedMBBInfo(MBB, false, true));
+ // Don't update successors here. We'll get to them anyway through this
+ // loop.
+ }
+
+ MBBInfos.clear();
+
+ return MBBTraversalOrder;
+}
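
Editorial note: LoopTraversal::traverse returns a visit order in which loop blocks can appear more than once: one primary pass plus repeat visits until all predecessors are completed. A hedged sketch of how a client pass consumes that order; the TraversedMBBInfo field names MBB, PrimaryPass and IsDone are assumed from the accompanying header, which is not part of this hunk, and the enter/process/record hooks are illustrative client code:

// Illustrative consumer: run cheap incremental work on every visit and the
// final processing only once the block's incoming information is complete.
LoopTraversal Traversal;
LoopTraversal::TraversalOrder Order = Traversal.traverse(MF);
for (const LoopTraversal::TraversedMBBInfo &Traversed : Order) {
  enterBasicBlock(Traversed.MBB);          // hypothetical client hook
  if (Traversed.PrimaryPass)
    recordPrimaryVisit(Traversed.MBB);     // exactly one primary visit per MBB
  if (Traversed.IsDone)
    processBasicBlock(Traversed.MBB);      // all incoming info is now final
}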
diff --git a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
index 0cf578b50563..36c1d358a9bd 100644
--- a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
+++ b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
@@ -68,7 +68,7 @@ bool LowerEmuTLS::runOnModule(Module &M) {
return false;
auto &TM = TPC->getTM<TargetMachine>();
- if (!TM.Options.EmulatedTLS)
+ if (!TM.useEmulatedTLS())
return false;
bool Changed = false;
diff --git a/contrib/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp b/contrib/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
index 4b676a60a8cd..fa43d13b1b85 100644
--- a/contrib/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -43,14 +43,13 @@ extern char &MIRCanonicalizerID;
#define DEBUG_TYPE "mir-canonicalizer"
static cl::opt<unsigned>
-CanonicalizeFunctionNumber("canon-nth-function", cl::Hidden, cl::init(~0u),
- cl::value_desc("N"),
- cl::desc("Function number to canonicalize."));
+ CanonicalizeFunctionNumber("canon-nth-function", cl::Hidden, cl::init(~0u),
+ cl::value_desc("N"),
+ cl::desc("Function number to canonicalize."));
-static cl::opt<unsigned>
-CanonicalizeBasicBlockNumber("canon-nth-basicblock", cl::Hidden, cl::init(~0u),
- cl::value_desc("N"),
- cl::desc("BasicBlock number to canonicalize."));
+static cl::opt<unsigned> CanonicalizeBasicBlockNumber(
+ "canon-nth-basicblock", cl::Hidden, cl::init(~0u), cl::value_desc("N"),
+ cl::desc("BasicBlock number to canonicalize."));
namespace {
@@ -84,9 +83,9 @@ public:
assert(type != RSE_Reg && "Expected a non-register type.");
}
- bool isReg() const { return type == RSE_Reg; }
- bool isFrameIndex() const { return type == RSE_FrameIndex; }
- bool isCandidate() const { return type == RSE_NewCandidate; }
+ bool isReg() const { return type == RSE_Reg; }
+ bool isFrameIndex() const { return type == RSE_FrameIndex; }
+ bool isCandidate() const { return type == RSE_NewCandidate; }
VRType getType() const { return type; }
unsigned getReg() const {
@@ -115,23 +114,49 @@ static std::vector<MachineBasicBlock *> GetRPOList(MachineFunction &MF) {
return RPOList;
}
-// Set a dummy vreg. We use this vregs register class to generate throw-away
-// vregs that are used to skip vreg numbers so that vreg numbers line up.
-static unsigned GetDummyVReg(const MachineFunction &MF) {
- for (auto &MBB : MF) {
- for (auto &MI : MBB) {
- for (auto &MO : MI.operands()) {
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- continue;
- return MO.getReg();
- }
- }
+static bool
+rescheduleLexographically(std::vector<MachineInstr *> instructions,
+ MachineBasicBlock *MBB,
+ std::function<MachineBasicBlock::iterator()> getPos) {
+
+ bool Changed = false;
+ using StringInstrPair = std::pair<std::string, MachineInstr *>;
+ std::vector<StringInstrPair> StringInstrMap;
+
+ for (auto *II : instructions) {
+ std::string S;
+ raw_string_ostream OS(S);
+ II->print(OS);
+ OS.flush();
+
+ // Trim the assignment, or start from the beginning in the case of a store.
+ const size_t i = S.find("=");
+ StringInstrMap.push_back({(i == std::string::npos) ? S : S.substr(i), II});
+ }
+
+ llvm::sort(StringInstrMap.begin(), StringInstrMap.end(),
+ [](const StringInstrPair &a, const StringInstrPair &b) -> bool {
+ return (a.first < b.first);
+ });
+
+ for (auto &II : StringInstrMap) {
+
+ LLVM_DEBUG({
+ dbgs() << "Splicing ";
+ II.second->dump();
+ dbgs() << " right before: ";
+ getPos()->dump();
+ });
+
+ Changed = true;
+ MBB->splice(getPos(), MBB, II.second);
}
- return ~0U;
+ return Changed;
}
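
Editorial note: rescheduleLexographically keys each instruction on its printed form, dropping everything up to '=' so that the defined vreg's number does not affect the ordering, and then splices in sorted order. A standalone model of that string-keyed sort (stable_sort is used here for determinism in the toy; the pass itself uses llvm::sort):

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Each pair is (printed instruction, opaque id).  Returns ids in the order
// the pass would schedule them.
std::vector<int>
sortByPrintedForm(const std::vector<std::pair<std::string, int>> &Instrs) {
  std::vector<std::pair<std::string, int>> Keyed;
  for (const auto &I : Instrs) {
    // Trim the assignment so renumbered defs compare equal.
    size_t Pos = I.first.find('=');
    Keyed.push_back(
        {Pos == std::string::npos ? I.first : I.first.substr(Pos), I.second});
  }
  std::stable_sort(Keyed.begin(), Keyed.end(),
                   [](const auto &A, const auto &B) { return A.first < B.first; });
  std::vector<int> Order;
  for (const auto &K : Keyed)
    Order.push_back(K.second);
  return Order;
}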
-static bool rescheduleCanonically(MachineBasicBlock *MBB) {
+static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount,
+ MachineBasicBlock *MBB) {
bool Changed = false;
@@ -153,15 +178,62 @@ static bool rescheduleCanonically(MachineBasicBlock *MBB) {
Instructions.push_back(&MI);
}
+ std::map<MachineInstr *, std::vector<MachineInstr *>> MultiUsers;
+ std::vector<MachineInstr *> PseudoIdempotentInstructions;
+ std::vector<unsigned> PhysRegDefs;
+ for (auto *II : Instructions) {
+ for (unsigned i = 1; i < II->getNumOperands(); i++) {
+ MachineOperand &MO = II->getOperand(i);
+ if (!MO.isReg())
+ continue;
+
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+
+ if (!MO.isDef())
+ continue;
+
+ PhysRegDefs.push_back(MO.getReg());
+ }
+ }
+
for (auto *II : Instructions) {
if (II->getNumOperands() == 0)
continue;
+ if (II->mayLoadOrStore())
+ continue;
MachineOperand &MO = II->getOperand(0);
if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
continue;
+ if (!MO.isDef())
+ continue;
+
+ bool IsPseudoIdempotent = true;
+ for (unsigned i = 1; i < II->getNumOperands(); i++) {
+
+ if (II->getOperand(i).isImm()) {
+ continue;
+ }
+
+ if (II->getOperand(i).isReg()) {
+ if (!TargetRegisterInfo::isVirtualRegister(II->getOperand(i).getReg()))
+ if (llvm::find(PhysRegDefs, II->getOperand(i).getReg()) ==
+ PhysRegDefs.end()) {
+ continue;
+ }
+ }
- DEBUG(dbgs() << "Operand " << 0 << " of "; II->dump(); MO.dump(););
+ IsPseudoIdempotent = false;
+ break;
+ }
+
+ if (IsPseudoIdempotent) {
+ PseudoIdempotentInstructions.push_back(II);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Operand " << 0 << " of "; II->dump(); MO.dump(););
MachineInstr *Def = II;
unsigned Distance = ~0U;
@@ -194,9 +266,6 @@ static bool rescheduleCanonically(MachineBasicBlock *MBB) {
if (DefI != BBE && UseI != BBE)
break;
- if ((&*BBI != Def) && (&*BBI != UseToBringDefCloserTo))
- continue;
-
if (&*BBI == Def) {
DefI = BBI;
continue;
@@ -211,17 +280,80 @@ static bool rescheduleCanonically(MachineBasicBlock *MBB) {
if (DefI == BBE || UseI == BBE)
continue;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Splicing ";
DefI->dump();
dbgs() << " right before: ";
UseI->dump();
});
+ MultiUsers[UseToBringDefCloserTo].push_back(Def);
Changed = true;
MBB->splice(UseI, MBB, DefI);
}
+ // Sort the defs for users of multiple defs lexicographically.
+ for (const auto &E : MultiUsers) {
+
+ auto UseI =
+ std::find_if(MBB->instr_begin(), MBB->instr_end(),
+ [&](MachineInstr &MI) -> bool { return &MI == E.first; });
+
+ if (UseI == MBB->instr_end())
+ continue;
+
+ LLVM_DEBUG(
+ dbgs() << "Rescheduling Multi-Use Instructions Lexographically.";);
+ Changed |= rescheduleLexographically(
+ E.second, MBB, [&]() -> MachineBasicBlock::iterator { return UseI; });
+ }
+
+ PseudoIdempotentInstCount = PseudoIdempotentInstructions.size();
+ LLVM_DEBUG(
+ dbgs() << "Rescheduling Idempotent Instructions Lexographically.";);
+ Changed |= rescheduleLexographically(
+ PseudoIdempotentInstructions, MBB,
+ [&]() -> MachineBasicBlock::iterator { return MBB->begin(); });
+
+ return Changed;
+}
+
+static bool propagateLocalCopies(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ std::vector<MachineInstr *> Copies;
+ for (MachineInstr &MI : MBB->instrs()) {
+ if (MI.isCopy())
+ Copies.push_back(&MI);
+ }
+
+ for (MachineInstr *MI : Copies) {
+
+ if (!MI->getOperand(0).isReg())
+ continue;
+ if (!MI->getOperand(1).isReg())
+ continue;
+
+ const unsigned Dst = MI->getOperand(0).getReg();
+ const unsigned Src = MI->getOperand(1).getReg();
+
+ if (!TargetRegisterInfo::isVirtualRegister(Dst))
+ continue;
+ if (!TargetRegisterInfo::isVirtualRegister(Src))
+ continue;
+ if (MRI.getRegClass(Dst) != MRI.getRegClass(Src))
+ continue;
+
+ for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
+ MachineOperand *MO = &*UI;
+ MO->setReg(Src);
+ Changed = true;
+ }
+
+ MI->eraseFromParent();
+ }
+
return Changed;
}
@@ -245,7 +377,8 @@ static std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
DoesMISideEffect |= !TargetRegisterInfo::isVirtualRegister(Dst);
for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
- if (DoesMISideEffect) break;
+ if (DoesMISideEffect)
+ break;
DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent());
}
}
@@ -253,7 +386,7 @@ static std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect)
continue;
- DEBUG(dbgs() << "Found Candidate: "; MI->dump(););
+ LLVM_DEBUG(dbgs() << "Found Candidate: "; MI->dump(););
Candidates.push_back(MI);
}
@@ -274,7 +407,7 @@ static void doCandidateWalk(std::vector<TypedVReg> &VRegs,
RegQueue.pop();
if (TReg.isFrameIndex()) {
- DEBUG(dbgs() << "Popping frame index.\n";);
+ LLVM_DEBUG(dbgs() << "Popping frame index.\n";);
VRegs.push_back(TypedVReg(RSE_FrameIndex));
continue;
}
@@ -283,7 +416,7 @@ static void doCandidateWalk(std::vector<TypedVReg> &VRegs,
unsigned Reg = TReg.getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Popping vreg ";
MRI.def_begin(Reg)->dump();
dbgs() << "\n";
@@ -295,7 +428,7 @@ static void doCandidateWalk(std::vector<TypedVReg> &VRegs,
VRegs.push_back(TypedVReg(Reg));
}
} else {
- DEBUG(dbgs() << "Popping physreg.\n";);
+ LLVM_DEBUG(dbgs() << "Popping physreg.\n";);
VRegs.push_back(TypedVReg(Reg));
continue;
}
@@ -311,7 +444,7 @@ static void doCandidateWalk(std::vector<TypedVReg> &VRegs,
break;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\n========================\n";
dbgs() << "Visited MI: ";
Def->dump();
@@ -323,7 +456,7 @@ static void doCandidateWalk(std::vector<TypedVReg> &VRegs,
MachineOperand &MO = Def->getOperand(I);
if (MO.isFI()) {
- DEBUG(dbgs() << "Pushing frame index.\n";);
+ LLVM_DEBUG(dbgs() << "Pushing frame index.\n";);
RegQueue.push(TypedVReg(RSE_FrameIndex));
}
@@ -335,33 +468,56 @@ static void doCandidateWalk(std::vector<TypedVReg> &VRegs,
}
}
-// TODO: Work to remove this in the future. One day when we have named vregs
-// we should be able to form the canonical name based on some characteristic
-// we see in that point of the expression tree (like if we were to name based
-// on some sort of value numbering scheme).
-static void SkipVRegs(unsigned &VRegGapIndex, MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC) {
- const unsigned VR_GAP = (++VRegGapIndex * 1000);
-
- DEBUG({
- dbgs() << "Adjusting per-BB VR_GAP for BB" << VRegGapIndex << " to "
- << VR_GAP << "\n";
- });
+namespace {
+class NamedVRegCursor {
+ MachineRegisterInfo &MRI;
+ unsigned virtualVRegNumber;
+
+public:
+ NamedVRegCursor(MachineRegisterInfo &MRI) : MRI(MRI) {
+ unsigned VRegGapIndex = 0;
+ const unsigned VR_GAP = (++VRegGapIndex * 1000);
+
+ unsigned I = MRI.createIncompleteVirtualRegister();
+ const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
- unsigned I = MRI.createVirtualRegister(RC);
- const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
- while (I != E) {
- I = MRI.createVirtualRegister(RC);
+ virtualVRegNumber = E;
}
-}
+
+ void SkipVRegs() {
+ unsigned VRegGapIndex = 1;
+ const unsigned VR_GAP = (++VRegGapIndex * 1000);
+
+ unsigned I = virtualVRegNumber;
+ const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
+
+ virtualVRegNumber = E;
+ }
+
+ unsigned getVirtualVReg() const { return virtualVRegNumber; }
+
+ unsigned incrementVirtualVReg(unsigned incr = 1) {
+ virtualVRegNumber += incr;
+ return virtualVRegNumber;
+ }
+
+ unsigned createVirtualRegister(const TargetRegisterClass *RC) {
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "namedVReg" << (virtualVRegNumber & ~0x80000000);
+ OS.flush();
+ virtualVRegNumber++;
+
+ return MRI.createVirtualRegister(RC, OS.str());
+ }
+};
+} // namespace
static std::map<unsigned, unsigned>
GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
const std::vector<unsigned> &renamedInOtherBB,
- MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC) {
+ MachineRegisterInfo &MRI, NamedVRegCursor &NVC) {
std::map<unsigned, unsigned> VRegRenameMap;
- unsigned LastRenameReg = MRI.createVirtualRegister(RC);
bool FirstCandidate = true;
for (auto &vreg : VRegs) {
@@ -370,8 +526,9 @@ GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
// (especially when comparing SelectionDAG to GlobalISel generated MIR)
// that in the other file we are just getting an incoming vreg that comes
// from a copy from a frame index. So it's safe to skip by one.
- LastRenameReg = MRI.createVirtualRegister(RC);
- DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
+ unsigned LastRenameReg = NVC.incrementVirtualVReg();
+ (void)LastRenameReg;
+ LLVM_DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
continue;
} else if (vreg.isCandidate()) {
@@ -380,20 +537,15 @@ GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
// same vreg number making it more likely that the canonical walk from the
// candidate insruction. We don't need to skip from the first candidate of
// the BasicBlock because we already skip ahead several vregs for each BB.
- while (LastRenameReg % 10) {
- if (!FirstCandidate) break;
- LastRenameReg = MRI.createVirtualRegister(RC);
-
- DEBUG({
- dbgs() << "Skipping rename for new candidate " << LastRenameReg
- << "\n";
- });
- }
+ unsigned LastRenameReg = NVC.getVirtualVReg();
+ if (FirstCandidate)
+ NVC.incrementVirtualVReg(LastRenameReg % 10);
FirstCandidate = false;
continue;
} else if (!TargetRegisterInfo::isVirtualRegister(vreg.getReg())) {
- LastRenameReg = MRI.createVirtualRegister(RC);
- DEBUG({
+ unsigned LastRenameReg = NVC.incrementVirtualVReg();
+ (void)LastRenameReg;
+ LLVM_DEBUG({
dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n";
});
continue;
@@ -401,27 +553,27 @@ GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
auto Reg = vreg.getReg();
if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) {
- DEBUG(dbgs() << "Vreg " << Reg << " already renamed in other BB.\n";);
+ LLVM_DEBUG(dbgs() << "Vreg " << Reg
+ << " already renamed in other BB.\n";);
continue;
}
- auto Rename = MRI.createVirtualRegister(MRI.getRegClass(Reg));
- LastRenameReg = Rename;
+ auto Rename = NVC.createVirtualRegister(MRI.getRegClass(Reg));
if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) {
- DEBUG(dbgs() << "Mapping vreg ";);
+ LLVM_DEBUG(dbgs() << "Mapping vreg ";);
if (MRI.reg_begin(Reg) != MRI.reg_end()) {
- DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
+ LLVM_DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
} else {
- DEBUG(dbgs() << Reg;);
+ LLVM_DEBUG(dbgs() << Reg;);
}
- DEBUG(dbgs() << " to ";);
+ LLVM_DEBUG(dbgs() << " to ";);
if (MRI.reg_begin(Rename) != MRI.reg_end()) {
- DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
+ LLVM_DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
} else {
- DEBUG(dbgs() << Rename;);
+ LLVM_DEBUG(dbgs() << Rename;);
}
- DEBUG(dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "\n";);
VRegRenameMap.insert(std::pair<unsigned, unsigned>(Reg, Rename));
}
@@ -483,23 +635,25 @@ static bool doDefKillClear(MachineBasicBlock *MBB) {
static bool runOnBasicBlock(MachineBasicBlock *MBB,
std::vector<StringRef> &bbNames,
std::vector<unsigned> &renamedInOtherBB,
- unsigned &basicBlockNum, unsigned &VRegGapIndex) {
+ unsigned &basicBlockNum, unsigned &VRegGapIndex,
+ NamedVRegCursor &NVC) {
if (CanonicalizeBasicBlockNumber != ~0U) {
if (CanonicalizeBasicBlockNumber != basicBlockNum++)
return false;
- DEBUG(dbgs() << "\n Canonicalizing BasicBlock " << MBB->getName() << "\n";);
+ LLVM_DEBUG(dbgs() << "\n Canonicalizing BasicBlock " << MBB->getName()
+ << "\n";);
}
if (llvm::find(bbNames, MBB->getName()) != bbNames.end()) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Found potentially duplicate BasicBlocks: " << MBB->getName()
<< "\n";
});
return false;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << " \n\n";
dbgs() << "\n\n================================================\n\n";
});
@@ -508,17 +662,18 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB,
MachineFunction &MF = *MBB->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const unsigned DummyVReg = GetDummyVReg(MF);
- const TargetRegisterClass *DummyRC =
- (DummyVReg == ~0U) ? nullptr : MRI.getRegClass(DummyVReg);
- if (!DummyRC) return false;
-
bbNames.push_back(MBB->getName());
- DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";);
+ LLVM_DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";);
- DEBUG(dbgs() << "MBB Before Scheduling:\n"; MBB->dump(););
- Changed |= rescheduleCanonically(MBB);
- DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
+ LLVM_DEBUG(dbgs() << "MBB Before Canonical Copy Propagation:\n";
+ MBB->dump(););
+ Changed |= propagateLocalCopies(MBB);
+ LLVM_DEBUG(dbgs() << "MBB After Canonical Copy Propagation:\n"; MBB->dump(););
+
+ LLVM_DEBUG(dbgs() << "MBB Before Scheduling:\n"; MBB->dump(););
+ unsigned IdempotentInstCount = 0;
+ Changed |= rescheduleCanonically(IdempotentInstCount, MBB);
+ LLVM_DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
std::vector<MachineInstr *> VisitedMIs;
@@ -543,7 +698,7 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB,
if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
continue;
- DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
RegQueue.push(TypedVReg(MO.getReg()));
}
@@ -560,10 +715,10 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB,
if (!MO.isReg() && !MO.isFI())
continue;
- DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
- RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg()) :
- TypedVReg(RSE_FrameIndex));
+ RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg())
+ : TypedVReg(RSE_FrameIndex));
}
doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB);
@@ -574,15 +729,38 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB,
if (VRegs.size() == 0)
return Changed;
- // Skip some vregs, so we can recon where we'll land next.
- SkipVRegs(VRegGapIndex, MRI, DummyRC);
-
- auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, DummyRC);
+ auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, NVC);
Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI);
+
+ // Here we renumber the def vregs for the idempotent instructions from the top
+ // of the MachineBasicBlock so that they are named in the order that we sorted
+ // them alphabetically. Eventually we won't need SkipVRegs because we will use
+ // named vregs instead.
+ NVC.SkipVRegs();
+
+ auto MII = MBB->begin();
+ for (unsigned i = 0; i < IdempotentInstCount && MII != MBB->end(); ++i) {
+ MachineInstr &MI = *MII++;
+ Changed = true;
+ unsigned vRegToRename = MI.getOperand(0).getReg();
+ auto Rename = NVC.createVirtualRegister(MRI.getRegClass(vRegToRename));
+
+ std::vector<MachineOperand *> RenameMOs;
+ for (auto &MO : MRI.reg_operands(vRegToRename)) {
+ RenameMOs.push_back(&MO);
+ }
+
+ for (auto *MO : RenameMOs) {
+ MO->setReg(Rename);
+ }
+ }
+
Changed |= doDefKillClear(MBB);
- DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump(); dbgs() << "\n";);
- DEBUG(dbgs() << "\n\n================================================\n\n");
+ LLVM_DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump();
+ dbgs() << "\n";);
+ LLVM_DEBUG(
+ dbgs() << "\n\n================================================\n\n");
return Changed;
}
@@ -592,22 +770,21 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
if (CanonicalizeFunctionNumber != ~0U) {
if (CanonicalizeFunctionNumber != functionNum++)
return false;
- DEBUG(dbgs() << "\n Canonicalizing Function " << MF.getName() << "\n";);
+ LLVM_DEBUG(dbgs() << "\n Canonicalizing Function " << MF.getName()
+ << "\n";);
}
// We need a valid vreg to create a vreg type for skipping all those
// stray vreg numbers so we reach alignment/canonical vreg values.
- std::vector<MachineBasicBlock*> RPOList = GetRPOList(MF);
+ std::vector<MachineBasicBlock *> RPOList = GetRPOList(MF);
- DEBUG(
- dbgs() << "\n\n NEW MACHINE FUNCTION: " << MF.getName() << " \n\n";
- dbgs() << "\n\n================================================\n\n";
- dbgs() << "Total Basic Blocks: " << RPOList.size() << "\n";
- for (auto MBB : RPOList) {
- dbgs() << MBB->getName() << "\n";
- }
- dbgs() << "\n\n================================================\n\n";
- );
+ LLVM_DEBUG(
+ dbgs() << "\n\n NEW MACHINE FUNCTION: " << MF.getName() << " \n\n";
+ dbgs() << "\n\n================================================\n\n";
+ dbgs() << "Total Basic Blocks: " << RPOList.size() << "\n";
+ for (auto MBB
+ : RPOList) { dbgs() << MBB->getName() << "\n"; } dbgs()
+ << "\n\n================================================\n\n";);
std::vector<StringRef> BBNames;
std::vector<unsigned> RenamedInOtherBB;
@@ -617,9 +794,11 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ NamedVRegCursor NVC(MRI);
for (auto MBB : RPOList)
- Changed |= runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx);
+ Changed |=
+ runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx, NVC);
return Changed;
}
-
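The rescheduleLexographically change above orders instructions by their printed textual form, dropping everything before the '=' so the defined vreg number does not affect the ordering. Below is a minimal standalone sketch of that keying-and-sorting step, assuming plain std::string rendering rather than MachineInstr printing; the helper name sortByRenderedText is made up for illustration.

    #include <algorithm>
    #include <string>
    #include <utility>
    #include <vector>

    // Key each rendered instruction on the text to the right of '=' (or the
    // whole line for stores), sort by that key, and return the sorted lines.
    static std::vector<std::string>
    sortByRenderedText(std::vector<std::string> Rendered) {
      using Pair = std::pair<std::string, std::string>;
      std::vector<Pair> Keyed;
      for (const std::string &S : Rendered) {
        const size_t I = S.find('=');
        Keyed.push_back({I == std::string::npos ? S : S.substr(I), S});
      }
      std::sort(Keyed.begin(), Keyed.end(),
                [](const Pair &A, const Pair &B) { return A.first < B.first; });
      std::vector<std::string> Out;
      for (auto &P : Keyed)
        Out.push_back(std::move(P.second));
      return Out;
    }

Keying on the right-hand side is what lets two MIR files produced by different selectors converge on the same instruction order even when their original vreg numbering differed.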
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 6adb7f1288d7..da05c9a22785 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -179,23 +179,6 @@ static Cursor lexName(Cursor C, MIToken &Token, MIToken::TokenKind Type,
return C;
}
-static Cursor maybeLexIntegerOrScalarType(Cursor C, MIToken &Token) {
- if ((C.peek() != 'i' && C.peek() != 's' && C.peek() != 'p') ||
- !isdigit(C.peek(1)))
- return None;
- char Kind = C.peek();
- auto Range = C;
- C.advance(); // Skip 'i', 's', or 'p'
- while (isdigit(C.peek()))
- C.advance();
-
- Token.reset(Kind == 'i'
- ? MIToken::IntegerType
- : (Kind == 's' ? MIToken::ScalarType : MIToken::PointerType),
- Range.upto(C));
- return C;
-}
-
static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
return StringSwitch<MIToken::TokenKind>(Identifier)
.Case("_", MIToken::underscore)
@@ -211,6 +194,14 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("renamable", MIToken::kw_renamable)
.Case("tied-def", MIToken::kw_tied_def)
.Case("frame-setup", MIToken::kw_frame_setup)
+ .Case("frame-destroy", MIToken::kw_frame_destroy)
+ .Case("nnan", MIToken::kw_nnan)
+ .Case("ninf", MIToken::kw_ninf)
+ .Case("nsz", MIToken::kw_nsz)
+ .Case("arcp", MIToken::kw_arcp)
+ .Case("contract", MIToken::kw_contract)
+ .Case("afn", MIToken::kw_afn)
+ .Case("reassoc", MIToken::kw_reassoc)
.Case("debug-location", MIToken::kw_debug_location)
.Case("same_value", MIToken::kw_cfi_same_value)
.Case("offset", MIToken::kw_cfi_offset)
@@ -241,6 +232,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("dereferenceable", MIToken::kw_dereferenceable)
.Case("invariant", MIToken::kw_invariant)
.Case("align", MIToken::kw_align)
+ .Case("addrspace", MIToken::kw_addrspace)
.Case("stack", MIToken::kw_stack)
.Case("got", MIToken::kw_got)
.Case("jump-table", MIToken::kw_jump_table)
@@ -408,17 +400,38 @@ static bool isRegisterChar(char C) {
return isIdentifierChar(C) && C != '.';
}
-static Cursor maybeLexRegister(Cursor C, MIToken &Token) {
- if (C.peek() != '%')
+static Cursor lexNamedVirtualRegister(Cursor C, MIToken &Token) {
+ Cursor Range = C;
+ C.advance(); // Skip '%'
+ while (isRegisterChar(C.peek()))
+ C.advance();
+ Token.reset(MIToken::NamedVirtualRegister, Range.upto(C))
+ .setStringValue(Range.upto(C).drop_front(1)); // Drop the '%'
+ return C;
+}
+
+static Cursor maybeLexRegister(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '%' && C.peek() != '$')
+ return None;
+
+ if (C.peek() == '%') {
+ if (isdigit(C.peek(1)))
+ return lexVirtualRegister(C, Token);
+
+ if (isRegisterChar(C.peek(1)))
+ return lexNamedVirtualRegister(C, Token);
+
return None;
- if (isdigit(C.peek(1)))
- return lexVirtualRegister(C, Token);
+ }
+
+ assert(C.peek() == '$');
auto Range = C;
- C.advance(); // Skip '%'
+ C.advance(); // Skip '$'
while (isRegisterChar(C.peek()))
C.advance();
Token.reset(MIToken::NamedRegister, Range.upto(C))
- .setStringValue(Range.upto(C).drop_front(1)); // Drop the '%'
+ .setStringValue(Range.upto(C).drop_front(1)); // Drop the '$'
return C;
}
@@ -441,7 +454,7 @@ static Cursor maybeLexGlobalValue(Cursor C, MIToken &Token,
static Cursor maybeLexExternalSymbol(Cursor C, MIToken &Token,
ErrorCallbackType ErrorCallback) {
- if (C.peek() != '$')
+ if (C.peek() != '&')
return None;
return lexName(C, Token, MIToken::ExternalSymbol, /*PrefixLength=*/1,
ErrorCallback);
@@ -620,8 +633,6 @@ StringRef llvm::lexMIToken(StringRef Source, MIToken &Token,
return C.remaining();
}
- if (Cursor R = maybeLexIntegerOrScalarType(C, Token))
- return R.remaining();
if (Cursor R = maybeLexMachineBasicBlock(C, Token, ErrorCallback))
return R.remaining();
if (Cursor R = maybeLexIdentifier(C, Token))
@@ -640,7 +651,7 @@ StringRef llvm::lexMIToken(StringRef Source, MIToken &Token,
return R.remaining();
if (Cursor R = maybeLexIRValue(C, Token, ErrorCallback))
return R.remaining();
- if (Cursor R = maybeLexRegister(C, Token))
+ if (Cursor R = maybeLexRegister(C, Token, ErrorCallback))
return R.remaining();
if (Cursor R = maybeLexGlobalValue(C, Token, ErrorCallback))
return R.remaining();
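The lexer hunks above move to the newer register syntax: '%' now introduces virtual registers (numbered or named), '$' introduces physical registers, and external symbols move to '&'. A rough classification sketch, simplified relative to isRegisterChar and shown only to illustrate the prefix rules:

    #include <cctype>
    #include <string>

    enum class RegTokenKind {
      None, VirtualRegister, NamedVirtualRegister, NamedRegister, ExternalSymbol
    };

    // Classify a token by its first characters: %0 -> numbered vreg,
    // %foo -> named vreg, $eax -> physical register, &sym -> external symbol.
    static RegTokenKind classifyPrefix(const std::string &Tok) {
      if (Tok.size() < 2)
        return RegTokenKind::None;
      if (Tok[0] == '%')
        return std::isdigit(static_cast<unsigned char>(Tok[1]))
                   ? RegTokenKind::VirtualRegister
                   : RegTokenKind::NamedVirtualRegister;
      if (Tok[0] == '$')
        return RegTokenKind::NamedRegister;
      if (Tok[0] == '&')
        return RegTokenKind::ExternalSymbol;
      return RegTokenKind::None;
    }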
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
index 0204d549d5d4..e21c71532f79 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -63,6 +63,14 @@ struct MIToken {
kw_renamable,
kw_tied_def,
kw_frame_setup,
+ kw_frame_destroy,
+ kw_nnan,
+ kw_ninf,
+ kw_nsz,
+ kw_arcp,
+ kw_contract,
+ kw_afn,
+ kw_reassoc,
kw_debug_location,
kw_cfi_same_value,
kw_cfi_offset,
@@ -92,6 +100,7 @@ struct MIToken {
kw_non_temporal,
kw_invariant,
kw_align,
+ kw_addrspace,
kw_stack,
kw_got,
kw_jump_table,
@@ -114,12 +123,10 @@ struct MIToken {
// Identifier tokens
Identifier,
- IntegerType,
NamedRegister,
+ NamedVirtualRegister,
MachineBasicBlockLabel,
MachineBasicBlock,
- PointerType,
- ScalarType,
StackObject,
FixedStackObject,
NamedGlobalValue,
@@ -168,7 +175,7 @@ public:
bool isRegister() const {
return Kind == NamedRegister || Kind == underscore ||
- Kind == VirtualRegister;
+ Kind == NamedVirtualRegister || Kind == VirtualRegister;
}
bool isRegisterFlag() const {
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 1a78ae3aad07..a61e7872f1ae 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -98,6 +98,18 @@ VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) {
return *I.first->second;
}
+VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) {
+ assert(RegName != "" && "Expected named reg.");
+
+ auto I = VRegInfosNamed.insert(std::make_pair(RegName.str(), nullptr));
+ if (I.second) {
+ VRegInfo *Info = new (Allocator) VRegInfo;
+ Info->VReg = MF.getRegInfo().createIncompleteVirtualRegister(RegName);
+ I.first->second = Info;
+ }
+ return *I.first->second;
+}
+
namespace {
/// A wrapper struct around the 'MachineOperand' struct that includes a source
@@ -182,6 +194,7 @@ public:
bool parseNamedRegister(unsigned &Reg);
bool parseVirtualRegister(VRegInfo *&Info);
+ bool parseNamedVirtualRegister(VRegInfo *&Info);
bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo);
bool parseRegisterFlag(unsigned &Flags);
bool parseRegisterClassOrBank(VRegInfo &RegInfo);
@@ -190,7 +203,7 @@ public:
bool parseRegisterOperand(MachineOperand &Dest,
Optional<unsigned> &TiedDefIdx, bool IsDef = false);
bool parseImmediateOperand(MachineOperand &Dest);
- bool parseIRConstant(StringRef::iterator Loc, StringRef Source,
+ bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue,
const Constant *&C);
bool parseIRConstant(StringRef::iterator Loc, const Constant *&C);
bool parseLowLevelType(StringRef::iterator Loc, LLT &Ty);
@@ -209,7 +222,7 @@ public:
bool parseJumpTableIndexOperand(MachineOperand &Dest);
bool parseExternalSymbolOperand(MachineOperand &Dest);
bool parseMDNode(MDNode *&Node);
- bool parseDIExpression(MDNode *&Node);
+ bool parseDIExpression(MDNode *&Expr);
bool parseMetadataOperand(MachineOperand &Dest);
bool parseCFIOffset(int &Offset);
bool parseCFIRegister(unsigned &Reg);
@@ -228,6 +241,7 @@ public:
Optional<unsigned> &TiedDefIdx);
bool parseOffset(int64_t &Offset);
bool parseAlignment(unsigned &Alignment);
+ bool parseAddrspace(unsigned &Addrspace);
bool parseOperandsOffset(MachineOperand &Op);
bool parseIRValue(const Value *&V);
bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
@@ -915,15 +929,43 @@ bool MIParser::verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands,
continue;
return error(Operands.empty() ? Token.location() : Operands.back().End,
Twine("missing implicit register operand '") +
- printImplicitRegisterFlag(I) + " %" +
+ printImplicitRegisterFlag(I) + " $" +
getRegisterName(TRI, I.getReg()) + "'");
}
return false;
}
bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
- if (Token.is(MIToken::kw_frame_setup)) {
- Flags |= MachineInstr::FrameSetup;
+ // Allow frame and fast math flags for OPCODE
+ while (Token.is(MIToken::kw_frame_setup) ||
+ Token.is(MIToken::kw_frame_destroy) ||
+ Token.is(MIToken::kw_nnan) ||
+ Token.is(MIToken::kw_ninf) ||
+ Token.is(MIToken::kw_nsz) ||
+ Token.is(MIToken::kw_arcp) ||
+ Token.is(MIToken::kw_contract) ||
+ Token.is(MIToken::kw_afn) ||
+ Token.is(MIToken::kw_reassoc)) {
+ // Mine frame and fast math flags
+ if (Token.is(MIToken::kw_frame_setup))
+ Flags |= MachineInstr::FrameSetup;
+ if (Token.is(MIToken::kw_frame_destroy))
+ Flags |= MachineInstr::FrameDestroy;
+ if (Token.is(MIToken::kw_nnan))
+ Flags |= MachineInstr::FmNoNans;
+ if (Token.is(MIToken::kw_ninf))
+ Flags |= MachineInstr::FmNoInfs;
+ if (Token.is(MIToken::kw_nsz))
+ Flags |= MachineInstr::FmNsz;
+ if (Token.is(MIToken::kw_arcp))
+ Flags |= MachineInstr::FmArcp;
+ if (Token.is(MIToken::kw_contract))
+ Flags |= MachineInstr::FmContract;
+ if (Token.is(MIToken::kw_afn))
+ Flags |= MachineInstr::FmAfn;
+ if (Token.is(MIToken::kw_reassoc))
+ Flags |= MachineInstr::FmReassoc;
+
lex();
}
if (Token.isNot(MIToken::Identifier))
@@ -943,7 +985,18 @@ bool MIParser::parseNamedRegister(unsigned &Reg) {
return false;
}
+bool MIParser::parseNamedVirtualRegister(VRegInfo *&Info) {
+ assert(Token.is(MIToken::NamedVirtualRegister) && "Expected NamedVReg token");
+ StringRef Name = Token.stringValue();
+ // TODO: Check that the VReg name is not the same as a physical register name.
+ // If it is, then print a warning (when warnings are implemented).
+ Info = &PFS.getVRegInfoNamed(Name);
+ return false;
+}
+
bool MIParser::parseVirtualRegister(VRegInfo *&Info) {
+ if (Token.is(MIToken::NamedVirtualRegister))
+ return parseNamedVirtualRegister(Info);
assert(Token.is(MIToken::VirtualRegister) && "Needs VirtualRegister token");
unsigned ID;
if (getUnsigned(ID))
@@ -959,6 +1012,7 @@ bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) {
return false;
case MIToken::NamedRegister:
return parseNamedRegister(Reg);
+ case MIToken::NamedVirtualRegister:
case MIToken::VirtualRegister:
if (parseVirtualRegister(Info))
return true;
@@ -1249,11 +1303,17 @@ bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) {
}
bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
- if (Token.is(MIToken::ScalarType)) {
+ if (Token.range().front() == 's' || Token.range().front() == 'p') {
+ StringRef SizeStr = Token.range().drop_front();
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 's'/'p' type character");
+ }
+
+ if (Token.range().front() == 's') {
Ty = LLT::scalar(APSInt(Token.range().drop_front()).getZExtValue());
lex();
return false;
- } else if (Token.is(MIToken::PointerType)) {
+ } else if (Token.range().front() == 'p') {
const DataLayout &DL = MF.getDataLayout();
unsigned AS = APSInt(Token.range().drop_front()).getZExtValue();
Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
@@ -1264,38 +1324,60 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
// Now we're looking for a vector.
if (Token.isNot(MIToken::less))
return error(Loc,
- "expected unsized, pN, sN or <N x sM> for GlobalISel type");
-
+ "expected sN, pA, <M x sN>, or <M x pA> for GlobalISel type");
lex();
if (Token.isNot(MIToken::IntegerLiteral))
- return error(Loc, "expected <N x sM> for vctor type");
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
uint64_t NumElements = Token.integerValue().getZExtValue();
lex();
if (Token.isNot(MIToken::Identifier) || Token.stringValue() != "x")
- return error(Loc, "expected '<N x sM>' for vector type");
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
lex();
- if (Token.isNot(MIToken::ScalarType))
- return error(Loc, "expected '<N x sM>' for vector type");
- uint64_t ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (Token.range().front() != 's' && Token.range().front() != 'p')
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
+ StringRef SizeStr = Token.range().drop_front();
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 's'/'p' type character");
+
+ if (Token.range().front() == 's')
+ Ty = LLT::scalar(APSInt(Token.range().drop_front()).getZExtValue());
+ else if (Token.range().front() == 'p') {
+ const DataLayout &DL = MF.getDataLayout();
+ unsigned AS = APSInt(Token.range().drop_front()).getZExtValue();
+ Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
+ } else
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
lex();
if (Token.isNot(MIToken::greater))
- return error(Loc, "expected '<N x sM>' for vector type");
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
lex();
- Ty = LLT::vector(NumElements, ScalarSize);
+ Ty = LLT::vector(NumElements, Ty);
return false;
}
bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) {
- assert(Token.is(MIToken::IntegerType));
+ assert(Token.is(MIToken::Identifier));
+ StringRef TypeStr = Token.range();
+ if (TypeStr.front() != 'i' && TypeStr.front() != 's' &&
+ TypeStr.front() != 'p')
+ return error(
+ "a typed immediate operand should start with one of 'i', 's', or 'p'");
+ StringRef SizeStr = Token.range().drop_front();
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 'i'/'s'/'p' type character");
+
auto Loc = Token.location();
lex();
- if (Token.isNot(MIToken::IntegerLiteral))
- return error("expected an integer literal");
+ if (Token.isNot(MIToken::IntegerLiteral)) {
+ if (Token.isNot(MIToken::Identifier) ||
+ !(Token.range() == "true" || Token.range() == "false"))
+ return error("expected an integer literal");
+ }
const Constant *C = nullptr;
if (parseIRConstant(Loc, C))
return true;
@@ -1876,13 +1958,11 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {
bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) {
assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask");
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- assert(TRI && "Expected target register info");
lex();
if (expectAndConsume(MIToken::lparen))
return true;
- uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ uint32_t *Mask = MF.allocateRegMask();
while (true) {
if (Token.isNot(MIToken::NamedRegister))
return error("expected a named register");
@@ -1905,9 +1985,7 @@ bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) {
bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {
assert(Token.is(MIToken::kw_liveout));
- const auto *TRI = MF.getSubtarget().getRegisterInfo();
- assert(TRI && "Expected target register info");
- uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ uint32_t *Mask = MF.allocateRegMask();
lex();
if (expectAndConsume(MIToken::lparen))
return true;
@@ -1946,11 +2024,10 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
case MIToken::underscore:
case MIToken::NamedRegister:
case MIToken::VirtualRegister:
+ case MIToken::NamedVirtualRegister:
return parseRegisterOperand(Dest, TiedDefIdx);
case MIToken::IntegerLiteral:
return parseImmediateOperand(Dest);
- case MIToken::IntegerType:
- return parseTypedImmediateOperand(Dest);
case MIToken::kw_half:
case MIToken::kw_float:
case MIToken::kw_double:
@@ -2011,8 +2088,10 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
Dest = MachineOperand::CreateRegMask(RegMask);
lex();
break;
- } else
+ } else if (Token.stringValue() == "CustomRegMask") {
return parseCustomRegisterMaskOperand(Dest);
+ } else
+ return parseTypedImmediateOperand(Dest);
default:
// FIXME: Parse the MCSymbol machine operand.
return error("expected a machine operand");
@@ -2091,6 +2170,17 @@ bool MIParser::parseAlignment(unsigned &Alignment) {
return false;
}
+bool MIParser::parseAddrspace(unsigned &Addrspace) {
+ assert(Token.is(MIToken::kw_addrspace));
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected an integer literal after 'addrspace'");
+ if (getUnsigned(Addrspace))
+ return true;
+ lex();
+ return false;
+}
+
bool MIParser::parseOperandsOffset(MachineOperand &Op) {
int64_t Offset = 0;
if (parseOffset(Offset))
@@ -2402,6 +2492,10 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (parseAlignment(BaseAlignment))
return true;
break;
+ case MIToken::kw_addrspace:
+ if (parseAddrspace(Ptr.AddrSpace))
+ return true;
+ break;
case MIToken::md_tbaa:
lex();
if (parseMDNode(AAInfo.TBAA))
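With the IntegerType/ScalarType/PointerType tokens gone, parseLowLevelType above now inspects the identifier text itself and accepts sN, pA, <M x sN>, and <M x pA> spellings. The following toy validator, which assumes plain std::string input and is not the MIParser grammar, shows the shapes that the new error messages describe:

    #include <cctype>
    #include <string>

    // "sN" is a scalar of N bits, "pA" a pointer in address space A.
    static bool isSizedToken(const std::string &S) {
      if (S.size() < 2 || (S[0] != 's' && S[0] != 'p'))
        return false;
      for (size_t I = 1; I < S.size(); ++I)
        if (!std::isdigit(static_cast<unsigned char>(S[I])))
          return false;
      return true;
    }

    // Accepts sN, pA, <M x sN>, and <M x pA>; everything else is rejected,
    // mirroring the "expected <M x sN> or <M x pA>" diagnostics above.
    static bool looksLikeLowLevelType(const std::string &S) {
      if (isSizedToken(S))
        return true;
      if (S.size() < 7 || S.front() != '<' || S.back() != '>')
        return false;
      const size_t X = S.find(" x ");
      if (X == std::string::npos)
        return false;
      const std::string Count = S.substr(1, X - 1);
      const std::string Elt = S.substr(X + 3, S.size() - X - 4);
      for (char C : Count)
        if (!std::isdigit(static_cast<unsigned char>(C)))
          return false;
      return !Count.empty() && isSizedToken(Elt);
    }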
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
index 2307881068ef..b06ceb21b740 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
@@ -56,6 +56,7 @@ struct PerFunctionMIParsingState {
DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
DenseMap<unsigned, VRegInfo*> VRegInfos;
+ StringMap<VRegInfo*> VRegInfosNamed;
DenseMap<unsigned, int> FixedStackObjectSlots;
DenseMap<unsigned, int> StackObjectSlots;
DenseMap<unsigned, unsigned> ConstantPoolSlots;
@@ -66,7 +67,8 @@ struct PerFunctionMIParsingState {
const Name2RegClassMap &Names2RegClasses,
const Name2RegBankMap &Names2RegBanks);
- VRegInfo &getVRegInfo(unsigned VReg);
+ VRegInfo &getVRegInfo(unsigned Num);
+ VRegInfo &getVRegInfoNamed(StringRef RegName);
};
/// Parse the machine basic block definitions, and skip the machine
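The new VRegInfosNamed StringMap and getVRegInfoNamed give named virtual registers the same create-on-first-use behaviour that numbered vregs already had. A minimal sketch of that insert-if-absent lookup, using std::map and placeholder names (FakeVRegInfo, NextVReg) instead of the StringMap and createIncompleteVirtualRegister calls:

    #include <map>
    #include <string>

    struct FakeVRegInfo { unsigned VReg = 0; };

    // The first lookup of a name creates the record and reserves a vreg
    // number; later lookups hand back the same record.
    static FakeVRegInfo &getOrCreateNamed(
        std::map<std::string, FakeVRegInfo> &Table, const std::string &Name,
        unsigned &NextVReg) {
      auto R = Table.try_emplace(Name);
      if (R.second)                          // newly inserted
        R.first->second.VReg = NextVReg++;   // stand-in for the incomplete vreg
      return R.first->second;
    }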
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 7d8e62736a34..3d2db97acb48 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -122,8 +122,9 @@ public:
const yaml::StringValue &RegisterSource,
bool IsRestored, int FrameIdx);
+ template <typename T>
bool parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
- const yaml::MachineStackObject &Object,
+ const T &Object,
int FrameIdx);
bool initializeConstantPool(PerFunctionMIParsingState &PFS,
@@ -237,7 +238,7 @@ std::unique_ptr<Module> MIRParserImpl::parseIRModule() {
dyn_cast_or_null<yaml::BlockScalarNode>(In.getCurrentNode())) {
SMDiagnostic Error;
M = parseAssembly(MemoryBufferRef(BSN->getValue(), Filename), Error,
- Context, &IRSlots);
+ Context, &IRSlots, /*UpgradeDebugInfo=*/false);
if (!M) {
reportDiagnostic(diagFromBlockStringDiag(Error, BSN->getSourceRange()));
return nullptr;
@@ -362,6 +363,8 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
MachineFunctionProperties::Property::RegBankSelected);
if (YamlMF.Selected)
MF.getProperties().set(MachineFunctionProperties::Property::Selected);
+ if (YamlMF.FailedISel)
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
PerFunctionMIParsingState PFS(MF, SM, IRSlots, Names2RegClasses,
Names2RegBanks);
@@ -417,6 +420,8 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
computeFunctionProperties(MF);
+ MF.getSubtarget().mirFileLoaded(MF);
+
MF.verify();
return false;
}
@@ -508,13 +513,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
MachineRegisterInfo &MRI = MF.getRegInfo();
bool Error = false;
// Create VRegs
- for (auto P : PFS.VRegInfos) {
- const VRegInfo &Info = *P.second;
+ auto populateVRegInfo = [&] (const VRegInfo &Info, Twine Name) {
unsigned Reg = Info.VReg;
switch (Info.Kind) {
case VRegInfo::UNKNOWN:
error(Twine("Cannot determine class/bank of virtual register ") +
- Twine(P.first) + " in function '" + MF.getName() + "'");
+ Name + " in function '" + MF.getName() + "'");
Error = true;
break;
case VRegInfo::NORMAL:
@@ -528,6 +532,17 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
MRI.setRegBank(Reg, *Info.D.RegBank);
break;
}
+ };
+
+ for (auto I = PFS.VRegInfosNamed.begin(), E = PFS.VRegInfosNamed.end();
+ I != E; I++) {
+ const VRegInfo &Info = *I->second;
+ populateVRegInfo(Info, Twine(I->first()));
+ }
+
+ for (auto P : PFS.VRegInfos) {
+ const VRegInfo &Info = *P.second;
+ populateVRegInfo(Info, Twine(P.first));
}
// Compute MachineRegisterInfo::UsedPhysRegMask
@@ -568,6 +583,7 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment);
MFI.setHasVAStart(YamlMFI.HasVAStart);
MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc);
+ MFI.setLocalFrameSize(YamlMFI.LocalFrameSize);
if (!YamlMFI.SavePoint.Value.empty()) {
MachineBasicBlock *MBB = nullptr;
if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint))
@@ -601,6 +617,8 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
if (parseCalleeSavedRegister(PFS, CSIInfo, Object.CalleeSavedRegister,
Object.CalleeSavedRestored, ObjectIdx))
return true;
+ if (parseStackObjectsDebugInfo(PFS, Object, ObjectIdx))
+ return true;
}
// Initialize the ordinary frame objects.
@@ -685,11 +703,11 @@ static bool typecheckMDNode(T *&Result, MDNode *Node,
return false;
}
+template <typename T>
bool MIRParserImpl::parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
- const yaml::MachineStackObject &Object, int FrameIdx) {
+ const T &Object, int FrameIdx) {
// Debug information can only be attached to stack objects; Fixed stack
// objects aren't supported.
- assert(FrameIdx >= 0 && "Expected a stack object frame index");
MDNode *Var = nullptr, *Expr = nullptr, *Loc = nullptr;
if (parseMDNode(PFS, Var, Object.DebugVar) ||
parseMDNode(PFS, Expr, Object.DebugExpr) ||
@@ -704,7 +722,7 @@ bool MIRParserImpl::parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
typecheckMDNode(DIExpr, Expr, Object.DebugExpr, "DIExpression", *this) ||
typecheckMDNode(DILoc, Loc, Object.DebugLoc, "DILocation", *this))
return true;
- PFS.MF.setVariableDbgInfo(DIVar, DIExpr, unsigned(FrameIdx), DILoc);
+ PFS.MF.setVariableDbgInfo(DIVar, DIExpr, FrameIdx, DILoc);
return false;
}
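setupRegisterInfo above hoists the per-register class/bank handling into a populateVRegInfo lambda and runs it over both the named and the numbered tables, so the diagnostic only differs in how the register is spelled. A condensed sketch of that pattern with placeholder types, not the MIRParser data structures:

    #include <iostream>
    #include <map>
    #include <string>

    struct Entry { bool HasClass = false; };

    // One lambda does the per-register work; it is applied to the named table
    // first and then to the numbered one, as in the hunk above.
    static void populateAll(const std::map<std::string, Entry> &Named,
                            const std::map<unsigned, Entry> &Numbered) {
      auto Populate = [](const Entry &E, const std::string &Name) {
        if (!E.HasClass)
          std::cerr << "Cannot determine class/bank of virtual register "
                    << Name << "\n";
        // ...otherwise the real code sets the register class or bank here.
      };
      for (const auto &P : Named)
        Populate(P.second, P.first);                  // named vregs
      for (const auto &P : Numbered)
        Populate(P.second, std::to_string(P.first));  // numbered vregs
    }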
diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
index f91cca6e4e50..bf8cd1489ec5 100644
--- a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
@@ -157,14 +156,10 @@ public:
void print(const MachineBasicBlock &MBB);
void print(const MachineInstr &MI);
- void printIRValueReference(const Value &V);
void printStackObjectReference(int FrameIndex);
void print(const MachineInstr &MI, unsigned OpIdx,
const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies,
LLT TypeToPrint, bool PrintDef = true);
- void print(const LLVMContext &Context, const TargetInstrInfo &TII,
- const MachineMemOperand &Op);
- void printSyncScope(const LLVMContext &Context, SyncScope::ID SSID);
};
} // end namespace llvm
@@ -207,6 +202,8 @@ void MIRPrinter::print(const MachineFunction &MF) {
MachineFunctionProperties::Property::RegBankSelected);
YamlMF.Selected = MF.getProperties().hasProperty(
MachineFunctionProperties::Property::Selected);
+ YamlMF.FailedISel = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel);
convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
ModuleSlotTracker MST(MF.getFunction().getParent());
@@ -259,6 +256,21 @@ static void printRegClassOrBank(unsigned Reg, yaml::StringValue &Dest,
OS << printRegClassOrBank(Reg, RegInfo, TRI);
}
+template <typename T>
+static void
+printStackObjectDbgInfo(const MachineFunction::VariableDbgInfo &DebugVar,
+ T &Object, ModuleSlotTracker &MST) {
+ std::array<std::string *, 3> Outputs{{&Object.DebugVar.Value,
+ &Object.DebugExpr.Value,
+ &Object.DebugLoc.Value}};
+ std::array<const Metadata *, 3> Metas{{DebugVar.Var,
+ DebugVar.Expr,
+ DebugVar.Loc}};
+ for (unsigned i = 0; i < 3; ++i) {
+ raw_string_ostream StrOS(*Outputs[i]);
+ Metas[i]->printAsOperand(StrOS, MST);
+ }
+}
void MIRPrinter::convert(yaml::MachineFunction &MF,
const MachineRegisterInfo &RegInfo,
@@ -270,6 +282,8 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
yaml::VirtualRegisterDefinition VReg;
VReg.ID = I;
+ if (RegInfo.getVRegName(Reg) != "")
+ continue;
::printRegClassOrBank(Reg, VReg.Class, RegInfo, TRI);
unsigned PreferredReg = RegInfo.getSimpleHint(Reg);
if (PreferredReg)
@@ -316,6 +330,7 @@ void MIRPrinter::convert(ModuleSlotTracker &MST,
YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment();
YamlMFI.HasVAStart = MFI.hasVAStart();
YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc();
+ YamlMFI.LocalFrameSize = MFI.getLocalFrameSize();
if (MFI.getSavePoint()) {
raw_string_ostream StrOS(YamlMFI.SavePoint.Value);
StrOS << printMBBReference(*MFI.getSavePoint());
@@ -421,19 +436,12 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF,
assert(StackObjectInfo != StackObjectOperandMapping.end() &&
"Invalid stack object index");
const FrameIndexOperand &StackObject = StackObjectInfo->second;
- assert(!StackObject.IsFixed && "Expected a non-fixed stack object");
- auto &Object = YMF.StackObjects[StackObject.ID];
- {
- raw_string_ostream StrOS(Object.DebugVar.Value);
- DebugVar.Var->printAsOperand(StrOS, MST);
- }
- {
- raw_string_ostream StrOS(Object.DebugExpr.Value);
- DebugVar.Expr->printAsOperand(StrOS, MST);
- }
- {
- raw_string_ostream StrOS(Object.DebugLoc.Value);
- DebugVar.Loc->printAsOperand(StrOS, MST);
+ if (StackObject.IsFixed) {
+ auto &Object = YMF.FixedStackObjects[StackObject.ID];
+ printStackObjectDbgInfo(DebugVar, Object, MST);
+ } else {
+ auto &Object = YMF.StackObjects[StackObject.ID];
+ printStackObjectDbgInfo(DebugVar, Object, MST);
}
}
}
@@ -670,6 +678,23 @@ void MIPrinter::print(const MachineInstr &MI) {
OS << " = ";
if (MI.getFlag(MachineInstr::FrameSetup))
OS << "frame-setup ";
+ if (MI.getFlag(MachineInstr::FrameDestroy))
+ OS << "frame-destroy ";
+ if (MI.getFlag(MachineInstr::FmNoNans))
+ OS << "nnan ";
+ if (MI.getFlag(MachineInstr::FmNoInfs))
+ OS << "ninf ";
+ if (MI.getFlag(MachineInstr::FmNsz))
+ OS << "nsz ";
+ if (MI.getFlag(MachineInstr::FmArcp))
+ OS << "arcp ";
+ if (MI.getFlag(MachineInstr::FmContract))
+ OS << "contract ";
+ if (MI.getFlag(MachineInstr::FmAfn))
+ OS << "afn ";
+ if (MI.getFlag(MachineInstr::FmReassoc))
+ OS << "reassoc ";
+
OS << TII->getName(MI.getOpcode());
if (I < E)
OS << ' ';
@@ -683,46 +708,27 @@ void MIPrinter::print(const MachineInstr &MI) {
NeedComma = true;
}
- if (MI.getDebugLoc()) {
+ if (const DebugLoc &DL = MI.getDebugLoc()) {
if (NeedComma)
OS << ',';
OS << " debug-location ";
- MI.getDebugLoc()->printAsOperand(OS, MST);
+ DL->printAsOperand(OS, MST);
}
if (!MI.memoperands_empty()) {
OS << " :: ";
const LLVMContext &Context = MF->getFunction().getContext();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
bool NeedComma = false;
for (const auto *Op : MI.memoperands()) {
if (NeedComma)
OS << ", ";
- print(Context, *TII, *Op);
+ Op->print(OS, MST, SSNs, Context, &MFI, TII);
NeedComma = true;
}
}
}
-void MIPrinter::printIRValueReference(const Value &V) {
- if (isa<GlobalValue>(V)) {
- V.printAsOperand(OS, /*PrintType=*/false, MST);
- return;
- }
- if (isa<Constant>(V)) {
- // Machine memory operands can load/store to/from constant value pointers.
- OS << '`';
- V.printAsOperand(OS, /*PrintType=*/true, MST);
- OS << '`';
- return;
- }
- OS << "%ir.";
- if (V.hasName()) {
- printLLVMNameWithoutPrefix(OS, V.getName());
- return;
- }
- MachineOperand::printIRSlotNumber(OS, MST.getLocalSlot(&V));
-}
-
void MIPrinter::printStackObjectReference(int FrameIndex) {
auto ObjectInfo = StackObjectOperandMapping.find(FrameIndex);
assert(ObjectInfo != StackObjectOperandMapping.end() &&
@@ -741,7 +747,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
case MachineOperand::MO_Immediate:
if (MI.isOperandSubregIdx(OpIdx)) {
MachineOperand::printTargetFlags(OS, Op);
- MachineOperand::printSubregIdx(OS, Op.getImm(), TRI);
+ MachineOperand::printSubRegIdx(OS, Op.getImm(), TRI);
break;
}
LLVM_FALLTHROUGH;
@@ -765,8 +771,8 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef())
TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx);
const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo();
- Op.print(OS, MST, TypeToPrint, PrintDef, ShouldPrintRegisterTies,
- TiedOperandIdx, TRI, TII);
+ Op.print(OS, MST, TypeToPrint, PrintDef, /*IsStandalone=*/false,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII);
break;
}
case MachineOperand::MO_FrameIndex:
@@ -783,132 +789,6 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
}
}
-static const char *getTargetMMOFlagName(const TargetInstrInfo &TII,
- unsigned TMMOFlag) {
- auto Flags = TII.getSerializableMachineMemOperandTargetFlags();
- for (const auto &I : Flags) {
- if (I.first == TMMOFlag) {
- return I.second;
- }
- }
- return nullptr;
-}
-
-void MIPrinter::print(const LLVMContext &Context, const TargetInstrInfo &TII,
- const MachineMemOperand &Op) {
- OS << '(';
- if (Op.isVolatile())
- OS << "volatile ";
- if (Op.isNonTemporal())
- OS << "non-temporal ";
- if (Op.isDereferenceable())
- OS << "dereferenceable ";
- if (Op.isInvariant())
- OS << "invariant ";
- if (Op.getFlags() & MachineMemOperand::MOTargetFlag1)
- OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag1)
- << "\" ";
- if (Op.getFlags() & MachineMemOperand::MOTargetFlag2)
- OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag2)
- << "\" ";
- if (Op.getFlags() & MachineMemOperand::MOTargetFlag3)
- OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag3)
- << "\" ";
-
- assert((Op.isLoad() || Op.isStore()) && "machine memory operand must be a load or store (or both)");
- if (Op.isLoad())
- OS << "load ";
- if (Op.isStore())
- OS << "store ";
-
- printSyncScope(Context, Op.getSyncScopeID());
-
- if (Op.getOrdering() != AtomicOrdering::NotAtomic)
- OS << toIRString(Op.getOrdering()) << ' ';
- if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic)
- OS << toIRString(Op.getFailureOrdering()) << ' ';
-
- OS << Op.getSize();
- if (const Value *Val = Op.getValue()) {
- OS << ((Op.isLoad() && Op.isStore()) ? " on "
- : Op.isLoad() ? " from " : " into ");
- printIRValueReference(*Val);
- } else if (const PseudoSourceValue *PVal = Op.getPseudoValue()) {
- OS << ((Op.isLoad() && Op.isStore()) ? " on "
- : Op.isLoad() ? " from " : " into ");
- assert(PVal && "Expected a pseudo source value");
- switch (PVal->kind()) {
- case PseudoSourceValue::Stack:
- OS << "stack";
- break;
- case PseudoSourceValue::GOT:
- OS << "got";
- break;
- case PseudoSourceValue::JumpTable:
- OS << "jump-table";
- break;
- case PseudoSourceValue::ConstantPool:
- OS << "constant-pool";
- break;
- case PseudoSourceValue::FixedStack:
- printStackObjectReference(
- cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex());
- break;
- case PseudoSourceValue::GlobalValueCallEntry:
- OS << "call-entry ";
- cast<GlobalValuePseudoSourceValue>(PVal)->getValue()->printAsOperand(
- OS, /*PrintType=*/false, MST);
- break;
- case PseudoSourceValue::ExternalSymbolCallEntry:
- OS << "call-entry $";
- printLLVMNameWithoutPrefix(
- OS, cast<ExternalSymbolPseudoSourceValue>(PVal)->getSymbol());
- break;
- case PseudoSourceValue::TargetCustom:
- llvm_unreachable("TargetCustom pseudo source values are not supported");
- break;
- }
- }
- MachineOperand::printOperandOffset(OS, Op.getOffset());
- if (Op.getBaseAlignment() != Op.getSize())
- OS << ", align " << Op.getBaseAlignment();
- auto AAInfo = Op.getAAInfo();
- if (AAInfo.TBAA) {
- OS << ", !tbaa ";
- AAInfo.TBAA->printAsOperand(OS, MST);
- }
- if (AAInfo.Scope) {
- OS << ", !alias.scope ";
- AAInfo.Scope->printAsOperand(OS, MST);
- }
- if (AAInfo.NoAlias) {
- OS << ", !noalias ";
- AAInfo.NoAlias->printAsOperand(OS, MST);
- }
- if (Op.getRanges()) {
- OS << ", !range ";
- Op.getRanges()->printAsOperand(OS, MST);
- }
- OS << ')';
-}
-
-void MIPrinter::printSyncScope(const LLVMContext &Context, SyncScope::ID SSID) {
- switch (SSID) {
- case SyncScope::System: {
- break;
- }
- default: {
- if (SSNs.empty())
- Context.getSyncScopeNames(SSNs);
-
- OS << "syncscope(\"";
- PrintEscapedString(SSNs[SSID], OS);
- OS << "\") ";
- break;
- }
- }
-}
-
void llvm::printMIR(raw_ostream &OS, const Module &M) {
yaml::Output Out(OS);
Out << const_cast<Module &>(M);
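The printer and parser hunks above are intentionally symmetric: MIPrinter::print emits one keyword per set MachineInstr flag before the opcode, and MIParser::parseInstruction consumes the same keywords and ORs the corresponding bits back in. A self-contained sketch of that round trip, with an illustrative flag table rather than the real MachineInstr values:

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    enum Flag : uint32_t {
      FrameSetup = 1u << 0, FrameDestroy = 1u << 1, FmNoNans = 1u << 2,
      FmNoInfs = 1u << 3,   FmNsz = 1u << 4,        FmArcp = 1u << 5,
      FmContract = 1u << 6, FmAfn = 1u << 7,        FmReassoc = 1u << 8,
    };

    static const std::vector<std::pair<std::string, uint32_t>> &flagTable() {
      static const std::vector<std::pair<std::string, uint32_t>> Table = {
          {"frame-setup", FrameSetup}, {"frame-destroy", FrameDestroy},
          {"nnan", FmNoNans}, {"ninf", FmNoInfs}, {"nsz", FmNsz},
          {"arcp", FmArcp}, {"contract", FmContract}, {"afn", FmAfn},
          {"reassoc", FmReassoc}};
      return Table;
    }

    // Printer side: one keyword per set bit, each followed by a space.
    static std::string printFlags(uint32_t Flags) {
      std::string Out;
      for (const auto &KV : flagTable())
        if (Flags & KV.second)
          Out += KV.first + " ";
      return Out;
    }

    // Parser side: consume leading keywords and OR their bits back together.
    static uint32_t parseFlags(std::vector<std::string> &Tokens) {
      uint32_t Flags = 0;
      size_t I = 0;
      for (; I < Tokens.size(); ++I) {
        bool Matched = false;
        for (const auto &KV : flagTable())
          if (Tokens[I] == KV.first) {
            Flags |= KV.second;
            Matched = true;
          }
        if (!Matched)
          break;                        // first non-flag token is the opcode
      }
      Tokens.erase(Tokens.begin(), Tokens.begin() + I);
      return Flags;
    }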
diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
index cd67449e3acf..38e8369dc739 100644
--- a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -173,7 +174,7 @@ MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) {
const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
iterator E = end();
- while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() ||
+ while (I != E && (I->isPHI() || I->isPosition() || I->isDebugInstr() ||
TII->isBasicBlockPrologue(*I)))
++I;
// FIXME: This needs to change if we wish to bundle labels / dbg_values
@@ -186,7 +187,7 @@ MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() {
iterator B = begin(), E = end(), I = E;
- while (I != B && ((--I)->isTerminator() || I->isDebugValue()))
+ while (I != B && ((--I)->isTerminator() || I->isDebugInstr()))
; /*noop */
while (I != E && !I->isTerminator())
++I;
@@ -195,7 +196,7 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() {
MachineBasicBlock::instr_iterator MachineBasicBlock::getFirstInstrTerminator() {
instr_iterator B = instr_begin(), E = instr_end(), I = E;
- while (I != B && ((--I)->isTerminator() || I->isDebugValue()))
+ while (I != B && ((--I)->isTerminator() || I->isDebugInstr()))
; /*noop */
while (I != E && !I->isTerminator())
++I;
@@ -213,7 +214,7 @@ MachineBasicBlock::iterator MachineBasicBlock::getLastNonDebugInstr() {
while (I != B) {
--I;
// Return instruction that starts a bundle.
- if (I->isDebugValue() || I->isInsideBundle())
+ if (I->isDebugInstr() || I->isInsideBundle())
continue;
return I;
}
@@ -259,8 +260,8 @@ std::string MachineBasicBlock::getFullName() const {
return Name;
}
-void MachineBasicBlock::print(raw_ostream &OS, const SlotIndexes *Indexes)
- const {
+void MachineBasicBlock::print(raw_ostream &OS, const SlotIndexes *Indexes,
+ bool IsStandalone) const {
const MachineFunction *MF = getParent();
if (!MF) {
OS << "Can't print out MachineBasicBlock because parent MachineFunction"
@@ -270,11 +271,13 @@ void MachineBasicBlock::print(raw_ostream &OS, const SlotIndexes *Indexes)
const Function &F = MF->getFunction();
const Module *M = F.getParent();
ModuleSlotTracker MST(M);
- print(OS, MST, Indexes);
+ MST.incorporateFunction(F);
+ print(OS, MST, Indexes, IsStandalone);
}
void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
- const SlotIndexes *Indexes) const {
+ const SlotIndexes *Indexes,
+ bool IsStandalone) const {
const MachineFunction *MF = getParent();
if (!MF) {
OS << "Can't print out MachineBasicBlock because parent MachineFunction"
@@ -285,70 +288,143 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
if (Indexes)
OS << Indexes->getMBBStartIdx(this) << '\t';
- OS << printMBBReference(*this) << ": ";
-
- const char *Comma = "";
- if (const BasicBlock *LBB = getBasicBlock()) {
- OS << Comma << "derived from LLVM BB ";
- LBB->printAsOperand(OS, /*PrintType=*/false, MST);
- Comma = ", ";
+ OS << "bb." << getNumber();
+ bool HasAttributes = false;
+ if (const auto *BB = getBasicBlock()) {
+ if (BB->hasName()) {
+ OS << "." << BB->getName();
+ } else {
+ HasAttributes = true;
+ OS << " (";
+ int Slot = MST.getLocalSlot(BB);
+ if (Slot == -1)
+ OS << "<ir-block badref>";
+ else
+ OS << (Twine("%ir-block.") + Twine(Slot)).str();
+ }
}
- if (isEHPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; }
- if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; }
- if (Alignment)
- OS << Comma << "Align " << Alignment << " (" << (1u << Alignment)
- << " bytes)";
- OS << '\n';
+ if (hasAddressTaken()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "address-taken";
+ HasAttributes = true;
+ }
+ if (isEHPad()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "landing-pad";
+ HasAttributes = true;
+ }
+ if (getAlignment()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "align " << getAlignment();
+ HasAttributes = true;
+ }
+ if (HasAttributes)
+ OS << ")";
+ OS << ":\n";
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- if (!livein_empty()) {
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo &TII = *getParent()->getSubtarget().getInstrInfo();
+ bool HasLineAttributes = false;
+
+ // Print the preds of this block according to the CFG.
+ if (!pred_empty() && IsStandalone) {
if (Indexes) OS << '\t';
- OS << " Live Ins:";
- for (const auto &LI : LiveIns) {
- OS << ' ' << printReg(LI.PhysReg, TRI);
- if (!LI.LaneMask.all())
- OS << ':' << PrintLaneMask(LI.LaneMask);
+ // Don't indent(2), align with previous line attributes.
+ OS << "; predecessors: ";
+ for (auto I = pred_begin(), E = pred_end(); I != E; ++I) {
+ if (I != pred_begin())
+ OS << ", ";
+ OS << printMBBReference(**I);
}
OS << '\n';
+ HasLineAttributes = true;
}
- // Print the preds of this block according to the CFG.
- if (!pred_empty()) {
+
+ if (!succ_empty()) {
if (Indexes) OS << '\t';
- OS << " Predecessors according to CFG:";
- for (const_pred_iterator PI = pred_begin(), E = pred_end(); PI != E; ++PI)
- OS << " " << printMBBReference(*(*PI));
+ // Print the successors
+ OS.indent(2) << "successors: ";
+ for (auto I = succ_begin(), E = succ_end(); I != E; ++I) {
+ if (I != succ_begin())
+ OS << ", ";
+ OS << printMBBReference(**I);
+ if (!Probs.empty())
+ OS << '('
+ << format("0x%08" PRIx32, getSuccProbability(I).getNumerator())
+ << ')';
+ }
+ if (!Probs.empty() && IsStandalone) {
+ // Print human readable probabilities as comments.
+ OS << "; ";
+ for (auto I = succ_begin(), E = succ_end(); I != E; ++I) {
+ const BranchProbability &BP = *getProbabilityIterator(I);
+ if (I != succ_begin())
+ OS << ", ";
+ OS << printMBBReference(**I) << '('
+ << format("%.2f%%",
+ rint(((double)BP.getNumerator() / BP.getDenominator()) *
+ 100.0 * 100.0) /
+ 100.0)
+ << ')';
+ }
+ }
+
OS << '\n';
+ HasLineAttributes = true;
}
- for (auto &I : instrs()) {
+ if (!livein_empty() && MRI.tracksLiveness()) {
+ if (Indexes) OS << '\t';
+ OS.indent(2) << "liveins: ";
+
+ bool First = true;
+ for (const auto &LI : liveins()) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ OS << printReg(LI.PhysReg, TRI);
+ if (!LI.LaneMask.all())
+ OS << ":0x" << PrintLaneMask(LI.LaneMask);
+ }
+ HasLineAttributes = true;
+ }
+
+ if (HasLineAttributes)
+ OS << '\n';
+
+ bool IsInBundle = false;
+ for (const MachineInstr &MI : instrs()) {
if (Indexes) {
- if (Indexes->hasIndex(I))
- OS << Indexes->getInstructionIndex(I);
+ if (Indexes->hasIndex(MI))
+ OS << Indexes->getInstructionIndex(MI);
OS << '\t';
}
- OS << '\t';
- if (I.isInsideBundle())
- OS << " * ";
- I.print(OS, MST);
- }
- // Print the successors of this block according to the CFG.
- if (!succ_empty()) {
- if (Indexes) OS << '\t';
- OS << " Successors according to CFG:";
- for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) {
- OS << " " << printMBBReference(*(*SI));
- if (!Probs.empty())
- OS << '(' << *getProbabilityIterator(SI) << ')';
+ if (IsInBundle && !MI.isInsideBundle()) {
+ OS.indent(2) << "}\n";
+ IsInBundle = false;
+ }
+
+ OS.indent(IsInBundle ? 4 : 2);
+ MI.print(OS, MST, IsStandalone, /*SkipOpers=*/false, /*SkipDebugLoc=*/false,
+ /*AddNewLine=*/false, &TII);
+
+ if (!IsInBundle && MI.getFlag(MachineInstr::BundledSucc)) {
+ OS << " {";
+ IsInBundle = true;
}
OS << '\n';
}
- if (IrrLoopHeaderWeight) {
+
+ if (IsInBundle)
+ OS.indent(2) << "}\n";
+
+ if (IrrLoopHeaderWeight && IsStandalone) {
if (Indexes) OS << '\t';
- OS << " Irreducible loop header weight: "
- << IrrLoopHeaderWeight.getValue();
- OS << '\n';
+ OS.indent(2) << "; Irreducible loop header weight: "
+ << IrrLoopHeaderWeight.getValue() << '\n';
}
}
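The rewritten print() above assembles the block header with a lazily opened attribute list and also echoes successor probabilities as percentages rounded to two decimals. Below is a minimal, self-contained C++ sketch of those two pieces; printHeader and toPercent are illustrative names only, not LLVM API.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <string>

// Minimal sketch: open the parenthesized attribute list on the first
// attribute, separate later ones with ", ", and close it only if anything
// was emitted; this mirrors the block-header printing above.
static std::string printHeader(bool AddressTaken, bool EHPad, unsigned Align) {
  std::string Out = "bb.0";
  bool HasAttributes = false;
  auto Emit = [&](const std::string &Attr) {
    Out += HasAttributes ? ", " : " (";
    Out += Attr;
    HasAttributes = true;
  };
  if (AddressTaken)
    Emit("address-taken");
  if (EHPad)
    Emit("landing-pad");
  if (Align)
    Emit("align " + std::to_string(Align));
  if (HasAttributes)
    Out += ")";
  return Out + ":";
}

// The human-readable successor probability is rounded to two decimals the
// same way as above: rint((num / den) * 100 * 100) / 100, e.g. 1/3 -> 33.33.
static double toPercent(uint32_t Num, uint32_t Den) {
  return std::rint(((double)Num / Den) * 100.0 * 100.0) / 100.0;
}

int main() {
  std::printf("%s\n", printHeader(true, false, 16).c_str());
  // prints: bb.0 (address-taken, align 16):
  std::printf("%.2f%%\n", toPercent(1, 3)); // prints: 33.33%
}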
@@ -382,10 +458,10 @@ bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const {
}
void MachineBasicBlock::sortUniqueLiveIns() {
- std::sort(LiveIns.begin(), LiveIns.end(),
- [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
- return LI0.PhysReg < LI1.PhysReg;
- });
+ llvm::sort(LiveIns.begin(), LiveIns.end(),
+ [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
+ return LI0.PhysReg < LI1.PhysReg;
+ });
// Liveins are sorted by physreg; now we can merge their lanemasks.
LiveInVector::const_iterator I = LiveIns.begin();
LiveInVector::const_iterator J;
@@ -583,6 +659,25 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
Succ->addPredecessor(this);
}
+void MachineBasicBlock::splitSuccessor(MachineBasicBlock *Old,
+ MachineBasicBlock *New,
+ bool NormalizeSuccProbs) {
+ succ_iterator OldI = llvm::find(successors(), Old);
+ assert(OldI != succ_end() && "Old is not a successor of this block!");
+ assert(llvm::find(successors(), New) == succ_end() &&
+ "New is already a successor of this block!");
+
+ // Add a new successor with equal probability as the original one. Note
+ // that we directly copy the probability using the iterator rather than
+ // getting a potentially synthetic probability computed when unknown. This
+ // preserves the probabilities as-is and then we can renormalize them and
+ // query them effectively afterward.
+ addSuccessor(New, Probs.empty() ? BranchProbability::getUnknown()
+ : *getProbabilityIterator(OldI));
+ if (NormalizeSuccProbs)
+ normalizeSuccProbs();
+}
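The comment in splitSuccessor above is the key point: the stored probability of the edge being split is copied verbatim to the new edge, and the whole successor list is then renormalized. A rough standalone illustration of that renormalization step, using plain doubles in place of BranchProbability (assumed, illustrative values):

#include <cstdio>
#include <vector>

// Illustration only: copy the stored weight of the split edge to the new
// edge, then rescale all weights so they sum to 1 again (the
// normalizeSuccProbs() step).
int main() {
  std::vector<double> Probs = {0.5, 0.5}; // two successors
  Probs.push_back(Probs[0]);              // split successor 0: copy its weight
  double Sum = 0.0;
  for (double P : Probs)
    Sum += P;
  for (double &P : Probs)
    P /= Sum;
  for (double P : Probs)
    std::printf("%.3f ", P); // prints: 0.333 0.333 0.333
  std::printf("\n");
}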
+
void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ,
bool NormalizeSuccProbs) {
succ_iterator I = find(Successors, Succ);
@@ -779,9 +874,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
MF->insert(std::next(MachineFunction::iterator(this)), NMBB);
- DEBUG(dbgs() << "Splitting critical edge: " << printMBBReference(*this)
- << " -- " << printMBBReference(*NMBB) << " -- "
- << printMBBReference(*Succ) << '\n');
+ LLVM_DEBUG(dbgs() << "Splitting critical edge: " << printMBBReference(*this)
+ << " -- " << printMBBReference(*NMBB) << " -- "
+ << printMBBReference(*Succ) << '\n');
LiveIntervals *LIS = P.getAnalysisIfAvailable<LiveIntervals>();
SlotIndexes *Indexes = P.getAnalysisIfAvailable<SlotIndexes>();
@@ -810,7 +905,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
LV->getVarInfo(Reg).removeKill(*MI)) {
KilledRegs.push_back(Reg);
- DEBUG(dbgs() << "Removing terminator kill: " << *MI);
+ LLVM_DEBUG(dbgs() << "Removing terminator kill: " << *MI);
OI->setIsKill(false);
}
}
@@ -901,7 +996,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
continue;
if (TargetRegisterInfo::isVirtualRegister(Reg))
LV->getVarInfo(Reg).Kills.push_back(&*I);
- DEBUG(dbgs() << "Restored terminator kill: " << *I);
+ LLVM_DEBUG(dbgs() << "Restored terminator kill: " << *I);
break;
}
}
@@ -1034,8 +1129,8 @@ bool MachineBasicBlock::canSplitCriticalEdge(
// case that we can't handle. Since this never happens in properly optimized
// code, just skip those edges.
if (TBB && TBB == FBB) {
- DEBUG(dbgs() << "Won't split critical edge after degenerate "
- << printMBBReference(*this) << '\n');
+ LLVM_DEBUG(dbgs() << "Won't split critical edge after degenerate "
+ << printMBBReference(*this) << '\n');
return false;
}
return true;
@@ -1189,6 +1284,16 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
return {};
}
+/// Find the previous valid DebugLoc preceding MBBI, skipping any DBG_VALUE
+/// instructions. Return UnknownLoc if there is none.
+DebugLoc MachineBasicBlock::findPrevDebugLoc(instr_iterator MBBI) {
+ if (MBBI == instr_begin()) return {};
+ // Skip debug declarations, we don't want a DebugLoc from them.
+ MBBI = skipDebugInstructionsBackward(std::prev(MBBI), instr_begin());
+ if (!MBBI->isDebugInstr()) return MBBI->getDebugLoc();
+ return {};
+}
+
/// Find and return the merged DebugLoc of the branch instructions of the block.
/// Return UnknownLoc if there is none.
DebugLoc
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 167135b56ec0..21350df624e7 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -198,10 +198,10 @@ namespace {
class BlockChain;
-/// \brief Type for our function-wide basic block -> block chain mapping.
+/// Type for our function-wide basic block -> block chain mapping.
using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
-/// \brief A chain of blocks which will be laid out contiguously.
+/// A chain of blocks which will be laid out contiguously.
///
/// This is the datastructure representing a chain of consecutive blocks that
/// are profitable to layout together in order to maximize fallthrough
@@ -213,13 +213,13 @@ using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
/// them. They participate in a block-to-chain mapping, which is updated
/// automatically as chains are merged together.
class BlockChain {
- /// \brief The sequence of blocks belonging to this chain.
+ /// The sequence of blocks belonging to this chain.
///
/// This is the sequence of blocks for a particular chain. These will be laid
/// out in-order within the function.
SmallVector<MachineBasicBlock *, 4> Blocks;
- /// \brief A handle to the function-wide basic block to block chain mapping.
+ /// A handle to the function-wide basic block to block chain mapping.
///
/// This is retained in each block chain to simplify the computation of child
/// block chains for SCC-formation and iteration. We store the edges to child
@@ -228,7 +228,7 @@ class BlockChain {
BlockToChainMapType &BlockToChain;
public:
- /// \brief Construct a new BlockChain.
+ /// Construct a new BlockChain.
///
/// This builds a new block chain representing a single basic block in the
/// function. It also registers itself as the chain that block participates
@@ -239,15 +239,15 @@ public:
BlockToChain[BB] = this;
}
- /// \brief Iterator over blocks within the chain.
+ /// Iterator over blocks within the chain.
using iterator = SmallVectorImpl<MachineBasicBlock *>::iterator;
using const_iterator = SmallVectorImpl<MachineBasicBlock *>::const_iterator;
- /// \brief Beginning of blocks within the chain.
+ /// Beginning of blocks within the chain.
iterator begin() { return Blocks.begin(); }
const_iterator begin() const { return Blocks.begin(); }
- /// \brief End of blocks within the chain.
+ /// End of blocks within the chain.
iterator end() { return Blocks.end(); }
const_iterator end() const { return Blocks.end(); }
@@ -261,7 +261,7 @@ public:
return false;
}
- /// \brief Merge a block chain into this one.
+ /// Merge a block chain into this one.
///
/// This routine merges a block chain into this one. It takes care of forming
/// a contiguous sequence of basic blocks, updating the edge list, and
@@ -293,14 +293,14 @@ public:
}
#ifndef NDEBUG
- /// \brief Dump the blocks in this chain.
+ /// Dump the blocks in this chain.
LLVM_DUMP_METHOD void dump() {
for (MachineBasicBlock *MBB : *this)
MBB->dump();
}
#endif // NDEBUG
- /// \brief Count of predecessors of any block within the chain which have not
+ /// Count of predecessors of any block within the chain which have not
/// yet been scheduled. In general, we will delay scheduling this chain
/// until those predecessors are scheduled (or we find a sufficiently good
/// reason to override this heuristic). Note that when forming loop chains,
@@ -313,7 +313,7 @@ public:
};
class MachineBlockPlacement : public MachineFunctionPass {
- /// \brief A type for a block filter set.
+ /// A type for a block filter set.
using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
/// Pair struct containing basic block and tail duplication profitability
@@ -329,47 +329,47 @@ class MachineBlockPlacement : public MachineFunctionPass {
MachineBasicBlock *Dest;
};
- /// \brief work lists of blocks that are ready to be laid out
+ /// work lists of blocks that are ready to be laid out
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
/// Edges that have already been computed as optimal.
DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
- /// \brief Machine Function
+ /// Machine Function
MachineFunction *F;
- /// \brief A handle to the branch probability pass.
+ /// A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
- /// \brief A handle to the function-wide block frequency pass.
+ /// A handle to the function-wide block frequency pass.
std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
- /// \brief A handle to the loop info.
+ /// A handle to the loop info.
MachineLoopInfo *MLI;
- /// \brief Preferred loop exit.
+ /// Preferred loop exit.
/// Member variable for convenience. It may be removed by duplication deep
/// in the call stack.
MachineBasicBlock *PreferredLoopExit;
- /// \brief A handle to the target's instruction info.
+ /// A handle to the target's instruction info.
const TargetInstrInfo *TII;
- /// \brief A handle to the target's lowering info.
+ /// A handle to the target's lowering info.
const TargetLoweringBase *TLI;
- /// \brief A handle to the post dominator tree.
+ /// A handle to the post dominator tree.
MachinePostDominatorTree *MPDT;
- /// \brief Duplicator used to duplicate tails during placement.
+ /// Duplicator used to duplicate tails during placement.
///
/// Placement decisions can open up new tail duplication opportunities, but
/// since tail duplication affects placement decisions of later blocks, it
/// must be done inline.
TailDuplicator TailDup;
- /// \brief Allocator and owner of BlockChain structures.
+ /// Allocator and owner of BlockChain structures.
///
/// We build BlockChains lazily while processing the loop structure of
/// a function. To reduce malloc traffic, we allocate them using this
@@ -378,7 +378,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// the chains.
SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
- /// \brief Function wide BasicBlock to BlockChain mapping.
+ /// Function wide BasicBlock to BlockChain mapping.
///
/// This mapping allows efficiently moving from any given basic block to the
/// BlockChain it participates in, if any. We use it to, among other things,
@@ -425,7 +425,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
MachineBasicBlock *BB, MachineBasicBlock *LPred,
BlockChain &Chain, BlockFilterSet *BlockFilter,
MachineFunction::iterator &PrevUnplacedBlockIt,
- bool &DuplicatedToPred);
+ bool &DuplicatedToLPred);
bool hasBetterLayoutPredecessor(
const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
const BlockChain &SuccChain, BranchProbability SuccProb,
@@ -441,7 +441,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
MachineFunction::iterator &PrevUnplacedBlockIt,
const BlockFilterSet *BlockFilter);
- /// \brief Add a basic block to the work list if it is appropriate.
+ /// Add a basic block to the work list if it is appropriate.
///
/// If the optional parameter BlockFilter is provided, only MBB
/// present in the set will be added to the worklist. If nullptr
@@ -474,7 +474,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// fallthroughs.
bool isProfitableToTailDup(
const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
- BranchProbability AdjustedSumProb,
+ BranchProbability QProb,
const BlockChain &Chain, const BlockFilterSet *BlockFilter);
/// Check for a trellis layout.
@@ -545,7 +545,7 @@ INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
"Branch Probability Basic Block Placement", false, false)
#ifndef NDEBUG
-/// \brief Helper to print the name of a MBB.
+/// Helper to print the name of a MBB.
///
/// Only used by debug logging.
static std::string getBlockName(const MachineBasicBlock *BB) {
@@ -558,7 +558,7 @@ static std::string getBlockName(const MachineBasicBlock *BB) {
}
#endif
-/// \brief Mark a chain's successors as having one fewer preds.
+/// Mark a chain's successors as having one fewer preds.
///
/// When a chain is being merged into the "placed" chain, this routine will
/// quickly walk the successors of each block in the chain and mark them as
@@ -574,7 +574,7 @@ void MachineBlockPlacement::markChainSuccessors(
}
}
-/// \brief Mark a single block's successors as having one fewer preds.
+/// Mark a single block's successors as having one fewer preds.
///
/// Under normal circumstances, this is only called by markChainSuccessors,
/// but if a block that was to be placed is completely tail-duplicated away,
@@ -643,7 +643,8 @@ BranchProbability MachineBlockPlacement::collectViableSuccessors(
if (SuccChain == &Chain) {
SkipSucc = true;
} else if (Succ != *SuccChain->begin()) {
- DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n");
+ LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
+ << " -> Mid chain!\n");
continue;
}
}
@@ -1010,7 +1011,7 @@ MachineBlockPlacement::getBestTrellisSuccessor(
// If we have a trellis, and BB doesn't have the best fallthrough edges,
// we shouldn't choose any successor. We've already looked and there's a
// better fallthrough edge for all the successors.
- DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+ LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
return Result;
}
@@ -1027,10 +1028,11 @@ MachineBlockPlacement::getBestTrellisSuccessor(
canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
Chain, BlockFilter)) {
- DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
- MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
- dbgs() << " Selected: " << getBlockName(Succ2)
- << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
+ LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(Succ2)
+ << ", probability: " << Succ2Prob
+ << " (Tail Duplicate)\n");
Result.BB = Succ2;
Result.ShouldTailDup = true;
return Result;
@@ -1041,10 +1043,10 @@ MachineBlockPlacement::getBestTrellisSuccessor(
ComputedEdges[BestB.Src] = { BestB.Dest, false };
auto TrellisSucc = BestA.Dest;
- DEBUG(BranchProbability SuccProb = getAdjustedProbability(
- MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
- dbgs() << " Selected: " << getBlockName(TrellisSucc)
- << ", probability: " << SuccProb << " (Trellis)\n");
+ LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(TrellisSucc)
+ << ", probability: " << SuccProb << " (Trellis)\n");
Result.BB = TrellisSucc;
return Result;
}
@@ -1150,7 +1152,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
if (TriangleChainCount == 0)
return;
- DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+ LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
// Map from last block to the chain that contains it. This allows us to extend
// chains as we find new triangles.
DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
@@ -1224,8 +1226,9 @@ void MachineBlockPlacement::precomputeTriangleChains() {
MachineBasicBlock *dst = Chain.Edges.back();
Chain.Edges.pop_back();
for (MachineBasicBlock *src : reverse(Chain.Edges)) {
- DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" <<
- getBlockName(dst) << " as pre-computed based on triangles.\n");
+ LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
+ << getBlockName(dst)
+ << " as pre-computed based on triangles.\n");
auto InsertResult = ComputedEdges.insert({src, {dst, true}});
assert(InsertResult.second && "Block seen twice.");
@@ -1431,15 +1434,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
}
if (BadCFGConflict) {
- DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> " << SuccProb
- << " (prob) (non-cold CFG conflict)\n");
+ LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
+ << SuccProb << " (prob) (non-cold CFG conflict)\n");
return true;
}
return false;
}
-/// \brief Select the best successor for a block.
+/// Select the best successor for a block.
///
/// This looks across all successors of a particular block and attempts to
/// select the "best" one to be the layout successor. It only considers direct
@@ -1462,7 +1465,8 @@ MachineBlockPlacement::selectBestSuccessor(
auto AdjustedSumProb =
collectViableSuccessors(BB, Chain, BlockFilter, Successors);
- DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
+ LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
+ << "\n");
// if we already precomputed the best successor for BB, return that if still
// applicable.
@@ -1503,18 +1507,18 @@ MachineBlockPlacement::selectBestSuccessor(
continue;
}
- DEBUG(
- dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
- << SuccProb
+ LLVM_DEBUG(
+ dbgs() << " Candidate: " << getBlockName(Succ)
+ << ", probability: " << SuccProb
<< (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
<< "\n");
if (BestSucc.BB && BestProb >= SuccProb) {
- DEBUG(dbgs() << " Not the best candidate, continuing\n");
+ LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
continue;
}
- DEBUG(dbgs() << " Setting it as best candidate\n");
+ LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
BestSucc.BB = Succ;
BestProb = SuccProb;
}
@@ -1539,10 +1543,9 @@ MachineBlockPlacement::selectBestSuccessor(
break;
if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
&& (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
- DEBUG(
- dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
- << DupProb
- << " (Tail Duplicate)\n");
+ LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
+ << ", probability: " << DupProb
+ << " (Tail Duplicate)\n");
BestSucc.BB = Succ;
BestSucc.ShouldTailDup = true;
break;
@@ -1550,12 +1553,12 @@ MachineBlockPlacement::selectBestSuccessor(
}
if (BestSucc.BB)
- DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
+ LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
return BestSucc;
}
-/// \brief Select the best block from a worklist.
+/// Select the best block from a worklist.
///
/// This looks through the provided worklist as a list of candidate basic
/// blocks and select the most profitable one to place. The definition of
@@ -1596,8 +1599,8 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
"Found CFG-violating block");
BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
- DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
- MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
+ LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
+ MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
// For ehpad, we layout the least probable first as to avoid jumping back
// from least probable landingpads to more probable ones.
@@ -1627,7 +1630,7 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
return BestBlock;
}
-/// \brief Retrieve the first unplaced basic block.
+/// Retrieve the first unplaced basic block.
///
/// This routine is called when we are unable to use the CFG to walk through
/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
@@ -1723,8 +1726,8 @@ void MachineBlockPlacement::buildChain(
if (!BestSucc)
break;
- DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
- "layout successor until the CFG reduces\n");
+ LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
+ "layout successor until the CFG reduces\n");
}
// Placement may have changed tail duplication opportunities.
@@ -1743,18 +1746,18 @@ void MachineBlockPlacement::buildChain(
// Zero out UnscheduledPredecessors for the successor we're about to merge in case
// we selected a successor that didn't fit naturally into the CFG.
SuccChain.UnscheduledPredecessors = 0;
- DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
- << getBlockName(BestSucc) << "\n");
+ LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
+ << getBlockName(BestSucc) << "\n");
markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
Chain.merge(BestSucc, &SuccChain);
BB = *std::prev(Chain.end());
}
- DEBUG(dbgs() << "Finished forming chain for header block "
- << getBlockName(*Chain.begin()) << "\n");
+ LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
+ << getBlockName(*Chain.begin()) << "\n");
}
-/// \brief Find the best loop top block for layout.
+/// Find the best loop top block for layout.
///
/// Look for a block which is strictly better than the loop header for laying
/// out at the top of the loop. This looks for one and only one pattern:
@@ -1784,17 +1787,17 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
if (!LoopBlockSet.count(*HeaderChain.begin()))
return L.getHeader();
- DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Finding best loop top for: "
+ << getBlockName(L.getHeader()) << "\n");
BlockFrequency BestPredFreq;
MachineBasicBlock *BestPred = nullptr;
for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
if (!LoopBlockSet.count(Pred))
continue;
- DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has "
- << Pred->succ_size() << " successors, ";
- MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
+ LLVM_DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has "
+ << Pred->succ_size() << " successors, ";
+ MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
if (Pred->succ_size() > 1)
continue;
@@ -1809,7 +1812,7 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
// If no direct predecessor is fine, just use the loop header.
if (!BestPred) {
- DEBUG(dbgs() << " final top unchanged\n");
+ LLVM_DEBUG(dbgs() << " final top unchanged\n");
return L.getHeader();
}
@@ -1819,11 +1822,11 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
*BestPred->pred_begin() != L.getHeader())
BestPred = *BestPred->pred_begin();
- DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
+ LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
return BestPred;
}
-/// \brief Find the best loop exiting block for layout.
+/// Find the best loop exiting block for layout.
///
/// This routine implements the logic to analyze the loop looking for the best
/// block to layout at the top of the loop. Typically this is done to maximize
@@ -1851,8 +1854,8 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
// blocks where rotating to exit with that block will reach an outer loop.
SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
- DEBUG(dbgs() << "Finding best loop exit for: " << getBlockName(L.getHeader())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
+ << getBlockName(L.getHeader()) << "\n");
for (MachineBasicBlock *MBB : L.getBlocks()) {
BlockChain &Chain = *BlockToChain[MBB];
// Ensure that this block is at the end of a chain; otherwise it could be
@@ -1875,15 +1878,15 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
BlockChain &SuccChain = *BlockToChain[Succ];
// Don't split chains, either this chain or the successor's chain.
if (&Chain == &SuccChain) {
- DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
- << getBlockName(Succ) << " (chain conflict)\n");
+ LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " (chain conflict)\n");
continue;
}
auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
if (LoopBlockSet.count(Succ)) {
- DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
- << getBlockName(Succ) << " (" << SuccProb << ")\n");
+ LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " (" << SuccProb << ")\n");
HasLoopingSucc = true;
continue;
}
@@ -1896,9 +1899,10 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
}
BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
- DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
- << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (";
- MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
+ LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " [L:" << SuccLoopDepth
+ << "] (";
+ MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
// Note that we bias this toward an existing layout successor to retain
// incoming order in the absence of better information. The exit must have
// a frequency higher than the current exit before we consider breaking
@@ -1922,11 +1926,12 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
// Without a candidate exiting block or with only a single block in the
// loop, just use the loop header to layout the loop.
if (!ExitingBB) {
- DEBUG(dbgs() << " No other candidate exit blocks, using loop header\n");
+ LLVM_DEBUG(
+ dbgs() << " No other candidate exit blocks, using loop header\n");
return nullptr;
}
if (L.getNumBlocks() == 1) {
- DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
+ LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
return nullptr;
}
@@ -1937,11 +1942,12 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
!BlocksExitingToOuterLoop.count(ExitingBB))
return nullptr;
- DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
+ << "\n");
return ExitingBB;
}
-/// \brief Attempt to rotate an exiting block to the bottom of the loop.
+/// Attempt to rotate an exiting block to the bottom of the loop.
///
/// Once we have built a chain, try to rotate it to line up the hot exit block
/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
@@ -2014,12 +2020,12 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
return;
}
- DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
- << " at bottom\n");
+ LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
+ << " at bottom\n");
std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
}
-/// \brief Attempt to rotate a loop based on profile data to reduce branch cost.
+/// Attempt to rotate a loop based on profile data to reduce branch cost.
///
/// With profile data, we can determine the cost in terms of missed fall through
/// opportunities when rotating a loop chain and select the best rotation.
@@ -2150,8 +2156,9 @@ void MachineBlockPlacement::rotateLoopWithProfile(
}
}
- DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockName(*Iter)
- << " to the top: " << Cost.getFrequency() << "\n");
+ LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
+ << getBlockName(*Iter)
+ << " to the top: " << Cost.getFrequency() << "\n");
if (Cost < SmallestRotationCost) {
SmallestRotationCost = Cost;
@@ -2160,13 +2167,13 @@ void MachineBlockPlacement::rotateLoopWithProfile(
}
if (RotationPos != LoopChain.end()) {
- DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
- << " to the top\n");
+ LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
+ << " to the top\n");
std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
}
}
-/// \brief Collect blocks in the given loop that are to be placed.
+/// Collect blocks in the given loop that are to be placed.
///
/// When profile data is available, exclude cold blocks from the returned set;
/// otherwise, collect all blocks in the loop.
@@ -2202,7 +2209,7 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
return LoopBlockSet;
}
-/// \brief Forms basic block chains from the natural loop structures.
+/// Forms basic block chains from the natural loop structures.
///
/// These chains are designed to preserve the existing *structure* of the code
/// as much as possible. We can then stitch the chains together in a way which
@@ -2265,7 +2272,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
else
rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet);
- DEBUG({
+ LLVM_DEBUG({
// Crash at the end so we get all of the debugging output first.
bool BadLoop = false;
if (LoopChain.UnscheduledPredecessors) {
@@ -2324,9 +2331,9 @@ void MachineBlockPlacement::buildCFGChains() {
// Ensure that the layout successor is a viable block, as we know that
// fallthrough is a possibility.
assert(NextFI != FE && "Can't fallthrough past the last block.");
- DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
- << getBlockName(BB) << " -> " << getBlockName(NextBB)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
+ << getBlockName(BB) << " -> " << getBlockName(NextBB)
+ << "\n");
Chain->merge(NextBB, nullptr);
#ifndef NDEBUG
BlocksWithUnanalyzableExits.insert(&*BB);
@@ -2356,7 +2363,7 @@ void MachineBlockPlacement::buildCFGChains() {
#ifndef NDEBUG
using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
#endif
- DEBUG({
+ LLVM_DEBUG({
// Crash at the end so we get all of the debugging output first.
bool BadFunc = false;
FunctionBlockSetType FunctionBlockSet;
@@ -2381,11 +2388,11 @@ void MachineBlockPlacement::buildCFGChains() {
// Splice the blocks into place.
MachineFunction::iterator InsertPos = F->begin();
- DEBUG(dbgs() << "[MBP] Function: "<< F->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
for (MachineBasicBlock *ChainBB : FunctionChain) {
- DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
- : " ... ")
- << getBlockName(ChainBB) << "\n");
+ LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
+ : " ... ")
+ << getBlockName(ChainBB) << "\n");
if (InsertPos != MachineFunction::iterator(ChainBB))
F->splice(InsertPos, ChainBB);
else
@@ -2470,11 +2477,11 @@ void MachineBlockPlacement::optimizeBranches() {
MBPI->getEdgeProbability(ChainBB, FBB) >
MBPI->getEdgeProbability(ChainBB, TBB) &&
!TII->reverseBranchCondition(Cond)) {
- DEBUG(dbgs() << "Reverse order of the two branches: "
- << getBlockName(ChainBB) << "\n");
- DEBUG(dbgs() << " Edge probability: "
- << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
- << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
+ LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
+ << getBlockName(ChainBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Edge probability: "
+ << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
+ << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
DebugLoc dl; // FIXME: this is nowhere
TII->removeBranch(*ChainBB);
TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
@@ -2638,8 +2645,8 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
if (!shouldTailDuplicate(BB))
return false;
- DEBUG(dbgs() << "Redoing tail duplication for Succ#"
- << BB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
+ << "\n");
// This has to be a callback because none of it can be done after
// BB is deleted.
@@ -2687,8 +2694,8 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
if (RemBB == PreferredLoopExit)
PreferredLoopExit = nullptr;
- DEBUG(dbgs() << "TailDuplicator deleted block: "
- << getBlockName(RemBB) << "\n");
+ LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
+ << getBlockName(RemBB) << "\n");
};
auto RemovalCallbackRef =
function_ref<void(MachineBasicBlock*)>(RemovalCallback);
@@ -2752,7 +2759,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
TailDupSize = TailDupPlacementAggressiveThreshold;
TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
- // For agressive optimization, we can adjust some thresholds to be less
+ // For aggressive optimization, we can adjust some thresholds to be less
// conservative.
if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
// At O3 we should be more willing to copy blocks for tail duplication. This
@@ -2834,17 +2841,17 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
namespace {
-/// \brief A pass to compute block placement statistics.
+/// A pass to compute block placement statistics.
///
/// A separate pass to compute interesting statistics for evaluating block
/// placement. This is separate from the actual placement pass so that they can
/// be computed in the absence of any placement transformations or when using
/// alternative placement strategies.
class MachineBlockPlacementStats : public MachineFunctionPass {
- /// \brief A handle to the branch probability pass.
+ /// A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
- /// \brief A handle to the function-wide block frequency pass.
+ /// A handle to the function-wide block frequency pass.
const MachineBlockFrequencyInfo *MBFI;
public:
diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
index 53c0d840ac84..6c92b1d426d6 100644
--- a/contrib/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
@@ -176,11 +176,10 @@ bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
// class given a super-reg class and subreg index.
if (DefMI->getOperand(1).getSubReg())
continue;
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- if (!MRI->constrainRegClass(SrcReg, RC))
+ if (!MRI->constrainRegAttrs(SrcReg, Reg))
continue;
- DEBUG(dbgs() << "Coalescing: " << *DefMI);
- DEBUG(dbgs() << "*** to: " << *MI);
+ LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI);
+ LLVM_DEBUG(dbgs() << "*** to: " << *MI);
// Propagate SrcReg of copies to MI.
MO.setReg(SrcReg);
MRI->clearKillFlags(SrcReg);
@@ -315,7 +314,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
unsigned LookAheadLeft = LookAheadLimit;
while (LookAheadLeft) {
// Skip over dbg_value's.
- while (I != E && I != EE && I->isDebugValue())
+ while (I != E && I != EE && I->isDebugInstr())
++I;
if (I == EE) {
@@ -354,7 +353,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
bool MachineCSE::isCSECandidate(MachineInstr *MI) {
if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() ||
- MI->isInlineAsm() || MI->isDebugValue())
+ MI->isInlineAsm() || MI->isDebugInstr())
return false;
// Ignore copies.
@@ -446,25 +445,23 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
// Heuristics #3: If the common subexpression is used by PHIs, do not reuse
// it unless the defined value is already used in the BB of the new use.
bool HasPHI = false;
- SmallPtrSet<MachineBasicBlock*, 4> CSBBs;
- for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) {
- HasPHI |= MI.isPHI();
- CSBBs.insert(MI.getParent());
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(CSReg)) {
+ HasPHI |= UseMI.isPHI();
+ if (UseMI.getParent() == MI->getParent())
+ return true;
}
- if (!HasPHI)
- return true;
- return CSBBs.count(MI->getParent());
+ return !HasPHI;
}
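The rewritten heuristic #3 above short-circuits: any non-debug use of the common subexpression in the block of the new use makes CSE profitable immediately, and only when no such use exists does a PHI use veto it. A small standalone sketch of that decision, with a hypothetical Use struct standing in for LLVM's types:

#include <vector>

// Hypothetical stand-ins for MachineInstr uses and their parent blocks.
struct Use {
  bool IsPHI;
  int ParentBlock;
};

// Mirrors the rewritten heuristic: CSE is profitable as soon as some use of
// the common value already lives in the candidate's block; otherwise it is
// profitable only when no use is a PHI.
static bool isProfitable(const std::vector<Use> &Uses, int CandidateBlock) {
  bool HasPHI = false;
  for (const Use &U : Uses) {
    HasPHI |= U.IsPHI;
    if (U.ParentBlock == CandidateBlock)
      return true;
  }
  return !HasPHI;
}

int main() {
  std::vector<Use> Uses = {{true, 1}, {false, 2}};
  return isProfitable(Uses, 2) ? 0 : 1; // a use in block 2 => profitable
}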
void MachineCSE::EnterScope(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
ScopeType *Scope = new ScopeType(VNT);
ScopeMap[MBB] = Scope;
}
void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(MBB);
assert(SI != ScopeMap.end());
delete SI->second;
@@ -548,13 +545,12 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// Found a common subexpression, eliminate it.
unsigned CSVN = VNT.lookup(MI);
MachineInstr *CSMI = Exps[CSVN];
- DEBUG(dbgs() << "Examining: " << *MI);
- DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
+ LLVM_DEBUG(dbgs() << "Examining: " << *MI);
+ LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
// Check if it's profitable to perform this CSE.
bool DoCSE = true;
- unsigned NumDefs = MI->getDesc().getNumDefs() +
- MI->getDesc().getNumImplicitDefs();
+ unsigned NumDefs = MI->getNumDefs();
for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
@@ -583,16 +579,17 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
"Do not CSE physical register defs!");
if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
- DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
+ LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
DoCSE = false;
break;
}
- // Don't perform CSE if the result of the old instruction cannot exist
- // within the register class of the new instruction.
- const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg);
- if (!MRI->constrainRegClass(NewReg, OldRC)) {
- DEBUG(dbgs() << "*** Not the same register class, avoid CSE!\n");
+ // Don't perform CSE if the result of the new instruction cannot exist
+ // within the constraints (register class, bank, or low-level type) of
+ // the old instruction.
+ if (!MRI->constrainRegAttrs(NewReg, OldReg)) {
+ LLVM_DEBUG(
+ dbgs() << "*** Not the same register constraints, avoid CSE!\n");
DoCSE = false;
break;
}
diff --git a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
index 702d21228477..0c6efff7bb40 100644
--- a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -39,8 +39,27 @@ inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
cl::desc("Incremental depth computation will be used for basic "
"blocks with more instructions."), cl::init(500));
+static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
+ cl::desc("Dump all substituted intrs"),
+ cl::init(false));
+
+#ifdef EXPENSIVE_CHECKS
+static cl::opt<bool> VerifyPatternOrder(
+ "machine-combiner-verify-pattern-order", cl::Hidden,
+ cl::desc(
+ "Verify that the generated patterns are ordered by increasing latency"),
+ cl::init(true));
+#else
+static cl::opt<bool> VerifyPatternOrder(
+ "machine-combiner-verify-pattern-order", cl::Hidden,
+ cl::desc(
+ "Verify that the generated patterns are ordered by increasing latency"),
+ cl::init(false));
+#endif
+
namespace {
class MachineCombiner : public MachineFunctionPass {
+ const TargetSubtargetInfo *STI;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MCSchedModel SchedModel;
@@ -85,6 +104,14 @@ private:
SmallVectorImpl<MachineInstr *> &DelInstrs);
void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+ std::pair<unsigned, unsigned>
+ getLatenciesForInstrSequences(MachineInstr &MI,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ MachineTraceMetrics::Trace BlockTrace);
+
+ void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
+ SmallVector<MachineCombinerPattern, 16> &Patterns);
};
}
@@ -140,9 +167,6 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
// are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
for (auto *InstrPtr : InsInstrs) { // for each Use
unsigned IDepth = 0;
- DEBUG(dbgs() << "NEW INSTR ";
- InstrPtr->print(dbgs(), TII);
- dbgs() << "\n";);
for (const MachineOperand &MO : InstrPtr->operands()) {
// Check for virtual register operand.
if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
@@ -242,6 +266,29 @@ static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
}
}
+/// Estimate the latency of the new and original instruction sequences by summing
+/// up the latencies of the inserted and deleted instructions. This assumes
+/// that the inserted and deleted instructions are dependent instruction chains,
+/// which might not hold in all cases.
+std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
+ MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ MachineTraceMetrics::Trace BlockTrace) {
+ assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
+ unsigned NewRootLatency = 0;
+ // NewRoot is the last instruction in the \p InsInstrs vector.
+ MachineInstr *NewRoot = InsInstrs.back();
+ for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
+ NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
+ NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
+
+ unsigned RootLatency = 0;
+ for (auto I : DelInstrs)
+ RootLatency += TSchedModel.computeInstrLatency(I);
+
+ return {NewRootLatency, RootLatency};
+}
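The estimate above treats the inserted sequence as one dependent chain: every inserted instruction except the new root contributes its full latency, plus the latency from the root to the new root, while the old cost is simply the sum over the deleted instructions. A worked sketch with made-up cycle counts (illustrative numbers only, not real scheduling data):

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical per-instruction latencies, in cycles.
  std::vector<unsigned> Inserted = {3, 1, 4}; // last entry is the new root
  std::vector<unsigned> Deleted = {4, 4};     // e.g. a MUL + ADD being replaced
  unsigned RootToNewRoot = 4; // stand-in for getLatency(Root, NewRoot, Trace)

  // All but the last inserted instruction contribute their full latency,
  // plus the root-to-new-root latency; the old cost is the plain sum over
  // the deleted instructions.
  unsigned NewRootLatency =
      std::accumulate(Inserted.begin(), Inserted.end() - 1, 0u) + RootToNewRoot;
  unsigned RootLatency = std::accumulate(Deleted.begin(), Deleted.end(), 0u);

  std::printf("NewRootLatency=%u RootLatency=%u\n", NewRootLatency, RootLatency);
  // prints: NewRootLatency=8 RootLatency=8
}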
+
/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
@@ -257,56 +304,50 @@ bool MachineCombiner::improvesCriticalPathLen(
bool SlackIsAccurate) {
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
"Missing machine model\n");
- // NewRoot is the last instruction in the \p InsInstrs vector.
- unsigned NewRootIdx = InsInstrs.size() - 1;
- MachineInstr *NewRoot = InsInstrs[NewRootIdx];
-
// Get depth and latency of NewRoot and Root.
unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
- DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
- dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
- dbgs() << " RootDepth: " << RootDepth << "\n");
+ LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
+ << NewRootDepth << "\tRootDepth: " << RootDepth);
// For a transform such as reassociation, the cost equation is
// conservatively calculated so that we must improve the depth (data
// dependency cycles) in the critical path to proceed with the transform.
// Being conservative also protects against inaccuracies in the underlying
// machine trace metrics and CPU models.
- if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
+ if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
+ LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
+ LLVM_DEBUG(NewRootDepth < RootDepth
+ ? dbgs() << "\t and it does it\n"
+ : dbgs() << "\t but it does NOT do it\n");
return NewRootDepth < RootDepth;
+ }
// A more flexible cost calculation for the critical path includes the slack
// of the original code sequence. This may allow the transform to proceed
// even if the instruction depths (data dependency cycles) become worse.
// Account for the latency of the inserted and deleted instructions by
- // adding up their latencies. This assumes that the inserted and deleted
- // instructions are dependent instruction chains, which might not hold
- // in all cases.
- unsigned NewRootLatency = 0;
- for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
- NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
- NewRootLatency += getLatency(Root, NewRoot, BlockTrace);
-
- unsigned RootLatency = 0;
- for (auto I : DelInstrs)
- RootLatency += TSchedModel.computeInstrLatency(I);
+ unsigned NewRootLatency, RootLatency;
+ std::tie(NewRootLatency, RootLatency) =
+ getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
- unsigned OldCycleCount = RootDepth + RootLatency +
- (SlackIsAccurate ? RootSlack : 0);
- DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
- dbgs() << " RootLatency: " << RootLatency << "\n";
- dbgs() << " RootSlack: " << RootSlack << " SlackIsAccurate="
- << SlackIsAccurate << "\n";
- dbgs() << " NewRootDepth + NewRootLatency = "
- << NewCycleCount << "\n";
- dbgs() << " RootDepth + RootLatency + RootSlack = "
- << OldCycleCount << "\n";
- );
+ unsigned OldCycleCount =
+ RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
+ LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
+ << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
+ << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
+ << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
+ << "\n\tRootDepth + RootLatency + RootSlack = "
+ << OldCycleCount;);
+ LLVM_DEBUG(NewCycleCount <= OldCycleCount
+ ? dbgs() << "\n\t It IMPROVES PathLen because"
+ : dbgs() << "\n\t It DOES NOT improve PathLen because");
+ LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
+ << ", OldCycleCount = " << OldCycleCount << "\n");
return NewCycleCount <= OldCycleCount;
}
@@ -352,9 +393,14 @@ bool MachineCombiner::preservesResourceLen(
unsigned ResLenAfterCombine =
BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
- DEBUG(dbgs() << "RESOURCE DATA: \n";
- dbgs() << " resource len before: " << ResLenBeforeCombine
- << " after: " << ResLenAfterCombine << "\n";);
+ LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
+ << ResLenBeforeCombine
+ << " and after: " << ResLenAfterCombine << "\n";);
+ LLVM_DEBUG(
+ ResLenAfterCombine <= ResLenBeforeCombine
+ ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
+ : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
+ "Length\n");
return ResLenAfterCombine <= ResLenBeforeCombine;
}
@@ -409,6 +455,35 @@ static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
NumInstCombined++;
}
+// Check that the latency saving (original minus new latency) does not grow for
+// later patterns. This helps to discover sub-optimal pattern orderings.
+void MachineCombiner::verifyPatternOrder(
+ MachineBasicBlock *MBB, MachineInstr &Root,
+ SmallVector<MachineCombinerPattern, 16> &Patterns) {
+ long PrevLatencyDiff = std::numeric_limits<long>::max();
+ (void)PrevLatencyDiff; // Variable is used in assert only.
+ for (auto P : Patterns) {
+ SmallVector<MachineInstr *, 16> InsInstrs;
+ SmallVector<MachineInstr *, 16> DelInstrs;
+ DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+ TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg);
+ // Found pattern, but did not generate alternative sequence.
+ // This can happen e.g. when an immediate could not be materialized
+ // in a single instruction.
+ if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
+ continue;
+
+ unsigned NewRootLatency, RootLatency;
+ std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
+ Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
+ long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
+ assert(CurrentLatencyDiff <= PrevLatencyDiff &&
+ "Current pattern is better than previous pattern.");
+ PrevLatencyDiff = CurrentLatencyDiff;
+ }
+}
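verifyPatternOrder asserts that the latency saving, RootLatency minus NewRootLatency, never grows for a later pattern, i.e. the per-pattern savings form a non-increasing sequence. A tiny standalone check of that property on assumed, illustrative data:

#include <cassert>
#include <limits>
#include <vector>

int main() {
  // Hypothetical (RootLatency - NewRootLatency) per pattern, best one first.
  std::vector<long> Savings = {5, 3, 3, 0};
  long Prev = std::numeric_limits<long>::max();
  for (long S : Savings) {
    // Same invariant as the assert in verifyPatternOrder: later patterns
    // must not save more latency than earlier ones.
    assert(S <= Prev && "Later pattern is better than an earlier one.");
    Prev = S;
  }
  return 0;
}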
+
/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
@@ -418,7 +493,7 @@ static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
/// sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
bool Changed = false;
- DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
bool IncrementalUpdate = false;
auto BlockIter = MBB->begin();
@@ -433,8 +508,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
while (BlockIter != MBB->end()) {
auto &MI = *BlockIter++;
-
- DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
SmallVector<MachineCombinerPattern, 16> Patterns;
// The motivating example is:
//
@@ -459,11 +532,16 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
// The algorithm does not try to evaluate all patterns and pick the best.
// This is only an artificial restriction though. In practice there is
// mostly one pattern, and getMachineCombinerPatterns() can order patterns
- // based on an internal cost heuristic.
+ // based on an internal cost heuristic. If
+ // machine-combiner-verify-pattern-order is enabled, all patterns are
+ // checked to ensure later patterns do not provide better latency savings.
if (!TII->getMachineCombinerPatterns(MI, Patterns))
continue;
+ if (VerifyPatternOrder)
+ verifyPatternOrder(MBB, MI, Patterns);
+
for (auto P : Patterns) {
SmallVector<MachineInstr *, 16> InsInstrs;
SmallVector<MachineInstr *, 16> DelInstrs;
@@ -478,6 +556,19 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
if (!NewInstCount)
continue;
+ LLVM_DEBUG(if (dump_intrs) {
+ dbgs() << "\tFor the Pattern (" << (int)P << ") these instructions could be removed\n";
+ for (auto const *InstrPtr : DelInstrs) {
+ dbgs() << "\t\t" << STI->getSchedInfoStr(*InstrPtr) << ": ";
+ InstrPtr->print(dbgs(), false, false, false, TII);
+ }
+ dbgs() << "\tThese instructions could replace the removed ones\n";
+ for (auto const *InstrPtr : InsInstrs) {
+ dbgs() << "\t\t" << STI->getSchedInfoStr(*InstrPtr) << ": ";
+ InstrPtr->print(dbgs(), false, false, false, TII);
+ }
+ });
+
bool SubstituteAlways = false;
if (ML && TII->isThroughputPattern(P))
SubstituteAlways = true;
@@ -539,20 +630,22 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
}
bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
- const TargetSubtargetInfo &STI = MF.getSubtarget();
- TII = STI.getInstrInfo();
- TRI = STI.getRegisterInfo();
- SchedModel = STI.getSchedModel();
- TSchedModel.init(SchedModel, &STI, TII);
+ STI = &MF.getSubtarget();
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ SchedModel = STI->getSchedModel();
+ TSchedModel.init(STI);
MRI = &MF.getRegInfo();
MLI = &getAnalysis<MachineLoopInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
OptSize = MF.getFunction().optForSize();
- DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
if (!TII->useMachineCombiner()) {
- DEBUG(dbgs() << " Skipping pass: Target does not support machine combiner\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Skipping pass: Target does not support machine combiner\n");
return false;
}
diff --git a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index fcec05adc732..3bf8147a06c3 100644
--- a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -9,6 +9,35 @@
//
// This is an extremely simple MachineInstr-level copy propagation pass.
//
+// This pass forwards the source of COPYs to the users of their destinations
+// when doing so is legal. For example:
+//
+// %reg1 = COPY %reg0
+// ...
+// ... = OP %reg1
+//
+// If
+// - %reg0 has not been clobbered by the time of the use of %reg1
+// - the register class constraints are satisfied
+// - the COPY def is the only value that reaches OP
+// then this pass replaces the above with:
+//
+// %reg1 = COPY %reg0
+// ...
+// ... = OP %reg0
+//
+// This pass also removes some redundant COPYs. For example:
+//
+// %R1 = COPY %R0
+// ... // No clobber of %R1
+// %R0 = COPY %R1 <<< Removed
+//
+// or
+//
+// %R1 = COPY %R0
+// ... // No clobber of %R0
+// %R1 = COPY %R0 <<< Removed
+//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
@@ -23,11 +52,13 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
@@ -37,6 +68,9 @@ using namespace llvm;
#define DEBUG_TYPE "machine-cp"
STATISTIC(NumDeletes, "Number of dead copies deleted");
+STATISTIC(NumCopyForwards, "Number of copy uses forwarded");
+DEBUG_COUNTER(FwdCounter, "machine-cp-fwd",
+ "Controls which register COPYs are forwarded");
namespace {
@@ -73,6 +107,10 @@ using Reg2MIMap = DenseMap<unsigned, MachineInstr *>;
void ReadRegister(unsigned Reg);
void CopyPropagateBlock(MachineBasicBlock &MBB);
bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
+ void forwardUses(MachineInstr &MI);
+ bool isForwardableRegClassCopy(const MachineInstr &Copy,
+ const MachineInstr &UseI, unsigned UseIdx);
+ bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);
/// Candidates for deletion.
SmallSetVector<MachineInstr*, 8> MaybeDeadCopies;
@@ -143,7 +181,8 @@ void MachineCopyPropagation::ReadRegister(unsigned Reg) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
Reg2MIMap::iterator CI = CopyMap.find(*AI);
if (CI != CopyMap.end()) {
- DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
+ LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: ";
+ CI->second->dump());
MaybeDeadCopies.remove(CI->second);
}
}
@@ -191,7 +230,7 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
if (!isNopCopy(PrevCopy, Src, Def, TRI))
return false;
- DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump());
+ LLVM_DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump());
// Copy was redundantly redefining either Src or Def. Remove earlier kill
// flags between Copy and PrevCopy because the value will be reused now.
@@ -208,14 +247,163 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
return true;
}
+/// Decide whether we should forward the source of \p Copy to its use in
+/// \p UseI based on the physical register class constraints of the opcode
+/// and avoiding introducing more cross-class COPYs.
+bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy,
+ const MachineInstr &UseI,
+ unsigned UseIdx) {
+
+ unsigned CopySrcReg = Copy.getOperand(1).getReg();
+
+ // If the new register meets the opcode register constraints, then allow
+ // forwarding.
+ if (const TargetRegisterClass *URC =
+ UseI.getRegClassConstraint(UseIdx, TII, TRI))
+ return URC->contains(CopySrcReg);
+
+ if (!UseI.isCopy())
+ return false;
+
+ /// COPYs don't have register class constraints, so if the user instruction
+ /// is a COPY, we just try to avoid introducing additional cross-class
+ /// COPYs. For example:
+ ///
+ /// RegClassA = COPY RegClassB // Copy parameter
+ /// ...
+ /// RegClassB = COPY RegClassA // UseI parameter
+ ///
+ /// which after forwarding becomes
+ ///
+ /// RegClassA = COPY RegClassB
+ /// ...
+ /// RegClassB = COPY RegClassB
+ ///
+ /// so we have reduced the number of cross-class COPYs and potentially
+ /// introduced a nop COPY that can be removed.
+ const TargetRegisterClass *UseDstRC =
+ TRI->getMinimalPhysRegClass(UseI.getOperand(0).getReg());
+
+ const TargetRegisterClass *SuperRC = UseDstRC;
+ for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses();
+ SuperRC; SuperRC = *SuperRCI++)
+ if (SuperRC->contains(CopySrcReg))
+ return true;
+
+ return false;
+}
+
+/// Check that \p MI does not have implicit uses that overlap with its \p Use
+/// operand (the register being replaced), since these can sometimes be
+/// implicitly tied to other operands. For example, on AMDGPU:
+///
+/// V_MOVRELS_B32_e32 %VGPR2, %M0<imp-use>, %EXEC<imp-use>, %VGPR2_VGPR3_VGPR4_VGPR5<imp-use>
+///
+/// the %VGPR2 is implicitly tied to the larger reg operand, but we have no
+/// way of knowing we need to update the latter when updating the former.
+bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI,
+ const MachineOperand &Use) {
+ for (const MachineOperand &MIUse : MI.uses())
+ if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() &&
+ MIUse.isUse() && TRI->regsOverlap(Use.getReg(), MIUse.getReg()))
+ return true;
+
+ return false;
+}
+
+/// Look for available copies whose destination register is used by \p MI and
+/// replace the use in \p MI with the copy's source register.
+void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
+ if (AvailCopyMap.empty())
+ return;
+
+ // Look for non-tied explicit vreg uses that have an active COPY
+ // instruction that defines the physical register allocated to them.
+ // Replace the vreg with the source of the active COPY.
+ for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx < OpEnd;
+ ++OpIdx) {
+ MachineOperand &MOUse = MI.getOperand(OpIdx);
+ // Don't forward into undef use operands: the machine verifier doesn't treat
+ // undef reads as reads, so forwarding could create a live range that ends at
+ // an undef read, which the verifier then reports as a live range that does
+ // not end at a read of the live range register.
+ if (!MOUse.isReg() || MOUse.isTied() || MOUse.isUndef() || MOUse.isDef() ||
+ MOUse.isImplicit())
+ continue;
+
+ if (!MOUse.getReg())
+ continue;
+
+ // Check that the register is marked 'renamable' so we know it is safe to
+ // rename it without violating any constraints that aren't expressed in the
+ // IR (e.g. ABI or opcode requirements).
+ if (!MOUse.isRenamable())
+ continue;
+
+ auto CI = AvailCopyMap.find(MOUse.getReg());
+ if (CI == AvailCopyMap.end())
+ continue;
+
+ MachineInstr &Copy = *CI->second;
+ unsigned CopyDstReg = Copy.getOperand(0).getReg();
+ const MachineOperand &CopySrc = Copy.getOperand(1);
+ unsigned CopySrcReg = CopySrc.getReg();
+
+ // FIXME: Don't handle partial uses of wider COPYs yet.
+ if (MOUse.getReg() != CopyDstReg) {
+ LLVM_DEBUG(
+ dbgs() << "MCP: FIXME! Not forwarding COPY to sub-register use:\n "
+ << MI);
+ continue;
+ }
+
+ // Don't forward COPYs of reserved regs unless they are constant.
+ if (MRI->isReserved(CopySrcReg) && !MRI->isConstantPhysReg(CopySrcReg))
+ continue;
+
+ if (!isForwardableRegClassCopy(Copy, MI, OpIdx))
+ continue;
+
+ if (hasImplicitOverlap(MI, MOUse))
+ continue;
+
+ if (!DebugCounter::shouldExecute(FwdCounter)) {
+ LLVM_DEBUG(dbgs() << "MCP: Skipping forwarding due to debug counter:\n "
+ << MI);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MOUse.getReg(), TRI)
+ << "\n with " << printReg(CopySrcReg, TRI)
+ << "\n in " << MI << " from " << Copy);
+
+ MOUse.setReg(CopySrcReg);
+ if (!CopySrc.isRenamable())
+ MOUse.setIsRenamable(false);
+
+ LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+
+ // Clear kill markers that may have been invalidated.
+ for (MachineInstr &KMI :
+ make_range(Copy.getIterator(), std::next(MI.getIterator())))
+ KMI.clearRegisterKills(CopySrcReg, TRI);
+
+ ++NumCopyForwards;
+ Changed = true;
+ }
+}
+
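The rewrite performed by forwardUses() is easiest to see on a small schematic. This is illustrative pseudo machine code, not taken from the change; the register names are hypothetical:

    ;;   $r1 = COPY $r0                 ;; recorded as an available copy
    ;;   ...                            ;; neither $r0 nor $r1 is clobbered here
    ;;   $r2 = OP killed renamable $r1  ;; renamable, non-tied use of the dest
    ;;
    ;; after forwardUses() the use reads the copy source instead, the stale kill
    ;; flag is cleared, and the COPY itself may later be removed as dead:
    ;;   $r2 = OP renamable $r0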
void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
- DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) {
MachineInstr *MI = &*I;
++I;
- if (MI->isCopy()) {
+ // Analyze copies (which don't overlap themselves).
+ if (MI->isCopy() && !TRI->regsOverlap(MI->getOperand(0).getReg(),
+ MI->getOperand(1).getReg())) {
unsigned Def = MI->getOperand(0).getReg();
unsigned Src = MI->getOperand(1).getReg();
@@ -241,6 +429,11 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
continue;
+ forwardUses(*MI);
+
+ // Src may have been changed by forwardUses()
+ Src = MI->getOperand(1).getReg();
+
// If Src is defined by a previous copy, the previous copy cannot be
// eliminated.
ReadRegister(Src);
@@ -253,7 +446,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
ReadRegister(Reg);
}
- DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
+ LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
// Copy is now a candidate for deletion.
if (!MRI->isReserved(Def))
@@ -292,6 +485,20 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
continue;
}
+ // Clobber any earlyclobber regs first.
+ for (const MachineOperand &MO : MI->operands())
+ if (MO.isReg() && MO.isEarlyClobber()) {
+ unsigned Reg = MO.getReg();
+ // If we have a tied earlyclobber, that means it is also read by this
+ // instruction, so we need to make sure we don't remove it as dead
+ // later.
+ if (MO.isTied())
+ ReadRegister(Reg);
+ ClobberRegister(Reg);
+ }
+
+ forwardUses(*MI);
+
// Not a copy.
SmallVector<unsigned, 2> Defs;
const MachineOperand *RegMask = nullptr;
@@ -307,10 +514,10 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
"MachineCopyPropagation should be run after register allocation!");
- if (MO.isDef()) {
+ if (MO.isDef() && !MO.isEarlyClobber()) {
Defs.push_back(Reg);
continue;
- } else if (MO.readsReg())
+ } else if (!MO.isDebug() && MO.readsReg())
ReadRegister(Reg);
}
@@ -331,8 +538,8 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
continue;
}
- DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
- MaybeDead->dump());
+ LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
+ MaybeDead->dump());
// erase() will return the next valid iterator pointing to the next
// element after the erased one.
@@ -364,6 +571,8 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// since we don't want to trust live-in lists.
if (MBB.succ_empty()) {
for (MachineInstr *MaybeDead : MaybeDeadCopies) {
+ LLVM_DEBUG(dbgs() << "MCP: Removing copy due to no live-out succ: ";
+ MaybeDead->dump());
assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
MaybeDead->eraseFromParent();
Changed = true;
diff --git a/contrib/llvm/lib/CodeGen/MachineDominators.cpp b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
index 517ac29b6450..6b2802626456 100644
--- a/contrib/llvm/lib/CodeGen/MachineDominators.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
@@ -65,8 +65,21 @@ void MachineDominatorTree::releaseMemory() {
}
void MachineDominatorTree::verifyAnalysis() const {
- if (DT && VerifyMachineDomInfo)
- verifyDomTree();
+ if (DT && VerifyMachineDomInfo) {
+ MachineFunction &F = *getRoot()->getParent();
+
+ DomTreeBase<MachineBasicBlock> OtherDT;
+ OtherDT.recalculate(F);
+ if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() ||
+ DT->compare(OtherDT)) {
+ errs() << "MachineDominatorTree for function " << F.getName()
+ << " is not up to date!\nComputed:\n";
+ DT->print(errs());
+ errs() << "\nActual:\n";
+ OtherDT.print(errs());
+ abort();
+ }
+ }
}
void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
@@ -138,21 +151,3 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
NewBBs.clear();
CriticalEdgesToSplit.clear();
}
-
-void MachineDominatorTree::verifyDomTree() const {
- if (!DT)
- return;
- MachineFunction &F = *getRoot()->getParent();
-
- DomTreeBase<MachineBasicBlock> OtherDT;
- OtherDT.recalculate(F);
- if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() ||
- DT->compare(OtherDT)) {
- errs() << "MachineDominatorTree for function " << F.getName()
- << " is not up to date!\nComputed:\n";
- DT->print(errs());
- errs() << "\nActual:\n";
- OtherDT.print(errs());
- abort();
- }
-}
diff --git a/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp
index 2aa9d6b816c8..0b316871dbdf 100644
--- a/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -40,9 +41,9 @@ static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
unsigned StackAlign) {
if (!ShouldClamp || Align <= StackAlign)
return Align;
- DEBUG(dbgs() << "Warning: requested alignment " << Align
- << " exceeds the stack alignment " << StackAlign
- << " when stack realignment is off" << '\n');
+ LLVM_DEBUG(dbgs() << "Warning: requested alignment " << Align
+ << " exceeds the stack alignment " << StackAlign
+ << " when stack realignment is off" << '\n');
return StackAlign;
}
@@ -217,7 +218,7 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
OS << " fi#" << (int)(i-NumFixedObjects) << ": ";
if (SO.StackID != 0)
- OS << "id=" << SO.StackID << ' ';
+ OS << "id=" << static_cast<unsigned>(SO.StackID) << ' ';
if (SO.Size == ~0ULL) {
OS << "dead\n";
diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
index bc8eb1429d92..dd668bcf6193 100644
--- a/contrib/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
@@ -37,7 +37,9 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -174,6 +176,11 @@ void MachineFunction::init() {
WinEHInfo = new (Allocator) WinEHFuncInfo();
}
+ if (isScopedEHPersonality(classifyEHPersonality(
+ F.hasPersonalityFn() ? F.getPersonalityFn() : nullptr))) {
+ WasmEHInfo = new (Allocator) WasmEHFuncInfo();
+ }
+
assert(Target.isCompatibleDataLayout(getDataLayout()) &&
"Can't create a MachineFunction using a Module with a "
"Target-incompatible DataLayout attached\n");
@@ -195,6 +202,7 @@ void MachineFunction::clear() {
// Do call MachineBasicBlock destructors, it contains std::vectors.
for (iterator I = begin(), E = end(); I != E; I = BasicBlocks.erase(I))
I->Insts.clearAndLeakNodesUnsafely();
+ MBBNumbering.clear();
InstructionRecycler.clear(Allocator);
OperandRecycler.clear(Allocator);
@@ -478,6 +486,14 @@ const char *MachineFunction::createExternalSymbolName(StringRef Name) {
return Dest;
}
+uint32_t *MachineFunction::allocateRegMask() {
+ unsigned NumRegs = getSubtarget().getRegisterInfo()->getNumRegs();
+ unsigned Size = MachineOperand::getRegMaskSize(NumRegs);
+ uint32_t *Mask = Allocator.Allocate<uint32_t>(Size);
+ memset(Mask, 0, Size * sizeof(Mask[0]));
+ return Mask;
+}
+
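The new allocateRegMask() returns a zero-initialized mask sized for the current target's register count. A minimal sketch of how a pass might populate one; buildPreservedMask and the Preserved list are illustrative names, the usual MachineFunction/MachineOperand headers are assumed to be included, and recall that in a register mask a set bit means the register is preserved:

    static const uint32_t *buildPreservedMask(MachineFunction &MF,
                                              ArrayRef<MCPhysReg> Preserved) {
      uint32_t *Mask = MF.allocateRegMask(); // zeroed, getRegMaskSize() words
      for (MCPhysReg R : Preserved)
        Mask[R / 32] |= 1u << (R % 32);      // set bit => register is preserved
      return Mask;
    }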
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MachineFunction::dump() const {
print(dbgs());
@@ -522,7 +538,8 @@ void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
MST.incorporateFunction(getFunction());
for (const auto &BB : *this) {
OS << '\n';
- BB.print(OS, MST, Indexes);
+ // If we print the whole function, print it at its most verbose level.
+ BB.print(OS, MST, Indexes, /*IsStandalone=*/true);
}
OS << "\n# End machine code for function " << getName() << ".\n\n";
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
index 5ffe33006131..67ac95740e3e 100644
--- a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -24,7 +24,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
@@ -85,7 +84,6 @@ void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MemoryDependenceWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreserved<StackProtector>();
FunctionPass::getAnalysisUsage(AU);
}
diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
index 14655c6eb700..96fcfdb72ad7 100644
--- a/contrib/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
@@ -74,6 +75,29 @@
using namespace llvm;
+static const MachineFunction *getMFIfAvailable(const MachineInstr &MI) {
+ if (const MachineBasicBlock *MBB = MI.getParent())
+ if (const MachineFunction *MF = MBB->getParent())
+ return MF;
+ return nullptr;
+}
+
+// Try to crawl up to the machine function and get TRI, MRI, IntrinsicInfo and
+// TII from it.
+static void tryToGetTargetInfo(const MachineInstr &MI,
+ const TargetRegisterInfo *&TRI,
+ const MachineRegisterInfo *&MRI,
+ const TargetIntrinsicInfo *&IntrinsicInfo,
+ const TargetInstrInfo *&TII) {
+
+ if (const MachineFunction *MF = getMFIfAvailable(MI)) {
+ TRI = MF->getSubtarget().getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ IntrinsicInfo = MF->getTarget().getIntrinsicInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ }
+}
+
void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
if (MCID->ImplicitDefs)
for (const MCPhysReg *ImpDefs = MCID->getImplicitDefs(); *ImpDefs;
@@ -358,6 +382,12 @@ MachineInstr::mergeMemRefsWith(const MachineInstr& Other) {
return std::make_pair(MemBegin, CombinedNumMemRefs);
}
+uint16_t MachineInstr::mergeFlagsWith(const MachineInstr &Other) const {
+ // For now, just return the union of the flags. If the flags get more
+ // complicated over time, we might need more logic here.
+ return getFlags() | Other.getFlags();
+}
+
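A sketch of the intended use, assuming a transform that replaces two instructions A and B with a single NewMI (all three names are illustrative):

    // Keep the union of the MI flags (frame-setup, fast-math flags, ...) on the
    // instruction that replaces A and B.
    static void copyMergedFlags(const MachineInstr &A, const MachineInstr &B,
                                MachineInstr &NewMI) {
      NewMI.setFlags(A.mergeFlagsWith(B));
    }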
bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
assert(!isBundledWithPred() && "Must be called on bundle header");
for (MachineBasicBlock::const_instr_iterator MII = getIterator();; ++MII) {
@@ -437,8 +467,8 @@ bool MachineInstr::isIdenticalTo(const MachineInstr &Other,
return false;
}
}
- // If DebugLoc does not match then two dbg.values are not identical.
- if (isDebugValue())
+ // If DebugLoc does not match then two debug instructions are not identical.
+ if (isDebugInstr())
if (getDebugLoc() && Other.getDebugLoc() &&
getDebugLoc() != Other.getDebugLoc())
return false;
@@ -489,21 +519,39 @@ void MachineInstr::eraseFromBundle() {
getParent()->erase_instr(this);
}
-/// getNumExplicitOperands - Returns the number of non-implicit operands.
-///
unsigned MachineInstr::getNumExplicitOperands() const {
unsigned NumOperands = MCID->getNumOperands();
if (!MCID->isVariadic())
return NumOperands;
- for (unsigned i = NumOperands, e = getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = getOperand(i);
- if (!MO.isReg() || !MO.isImplicit())
- NumOperands++;
+ for (unsigned I = NumOperands, E = getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = getOperand(I);
+ // The operands must always be in the following order:
+ // - explicit reg defs,
+ // - other explicit operands (reg uses, immediates, etc.),
+ // - implicit reg defs
+ // - implicit reg uses
+ if (MO.isReg() && MO.isImplicit())
+ break;
+ ++NumOperands;
}
return NumOperands;
}
+unsigned MachineInstr::getNumExplicitDefs() const {
+ unsigned NumDefs = MCID->getNumDefs();
+ if (!MCID->isVariadic())
+ return NumDefs;
+
+ for (unsigned I = NumDefs, E = getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = getOperand(I);
+ if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
+ break;
+ ++NumDefs;
+ }
+ return NumDefs;
+}
+
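Because explicit defs occupy the leading operand slots (see the ordering comment above), the new accessor lets a pass walk just the explicit results of a possibly variadic instruction. A small sketch, with collectExplicitDefs as an illustrative helper name:

    static void collectExplicitDefs(const MachineInstr &MI,
                                    SmallVectorImpl<unsigned> &Defs) {
      // Explicit defs are always register operands and come first.
      for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I)
        Defs.push_back(MI.getOperand(I).getReg());
    }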
void MachineInstr::bundleWithPred() {
assert(!isBundledWithPred() && "MI is already bundled with its predecessor");
setFlag(BundledPred);
@@ -583,6 +631,11 @@ int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
return -1;
}
+const DILabel *MachineInstr::getDebugLabel() const {
+ assert(isDebugLabel() && "not a DBG_LABEL");
+ return cast<DILabel>(getOperand(0).getMetadata());
+}
+
const DILocalVariable *MachineInstr::getDebugVariable() const {
assert(isDebugValue() && "not a DBG_VALUE");
return cast<DILocalVariable>(getOperand(2).getMetadata());
@@ -905,8 +958,7 @@ void MachineInstr::clearKillInfo() {
}
}
-void MachineInstr::substituteRegister(unsigned FromReg,
- unsigned ToReg,
+void MachineInstr::substituteRegister(unsigned FromReg, unsigned ToReg,
unsigned SubIdx,
const TargetRegisterInfo &RegInfo) {
if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
@@ -941,7 +993,7 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
return false;
}
- if (isPosition() || isDebugValue() || isTerminator() ||
+ if (isPosition() || isDebugInstr() || isTerminator() ||
hasUnmodeledSideEffects())
return false;
@@ -1195,8 +1247,12 @@ LLT MachineInstr::getTypeToPrint(unsigned OpIdx, SmallBitVector &PrintedTypes,
if (PrintedTypes[OpInfo.getGenericTypeIndex()])
return LLT{};
- PrintedTypes.set(OpInfo.getGenericTypeIndex());
- return MRI.getType(Op.getReg());
+ LLT TypeToPrint = MRI.getType(Op.getReg());
+ // Don't mark the type index printed if it wasn't actually printed: maybe
+ // another operand with the same type index has an actual type attached:
+ if (TypeToPrint.isValid())
+ PrintedTypes.set(OpInfo.getGenericTypeIndex());
+ return TypeToPrint;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1206,39 +1262,36 @@ LLVM_DUMP_METHOD void MachineInstr::dump() const {
}
#endif
-void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc,
+void MachineInstr::print(raw_ostream &OS, bool IsStandalone, bool SkipOpers,
+ bool SkipDebugLoc, bool AddNewLine,
const TargetInstrInfo *TII) const {
const Module *M = nullptr;
- if (const MachineBasicBlock *MBB = getParent())
- if (const MachineFunction *MF = MBB->getParent())
- M = MF->getFunction().getParent();
+ const Function *F = nullptr;
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ F = &MF->getFunction();
+ M = F->getParent();
+ if (!TII)
+ TII = MF->getSubtarget().getInstrInfo();
+ }
ModuleSlotTracker MST(M);
- print(OS, MST, SkipOpers, SkipDebugLoc, TII);
+ if (F)
+ MST.incorporateFunction(*F);
+ print(OS, MST, IsStandalone, SkipOpers, SkipDebugLoc, TII);
}
void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
- bool SkipOpers, bool SkipDebugLoc,
- const TargetInstrInfo *TII) const {
+ bool IsStandalone, bool SkipOpers, bool SkipDebugLoc,
+ bool AddNewLine, const TargetInstrInfo *TII) const {
// We can be a bit tidier if we know the MachineFunction.
const MachineFunction *MF = nullptr;
const TargetRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
const TargetIntrinsicInfo *IntrinsicInfo = nullptr;
+ tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII);
- if (const MachineBasicBlock *MBB = getParent()) {
- MF = MBB->getParent();
- if (MF) {
- MRI = &MF->getRegInfo();
- TRI = MF->getSubtarget().getRegisterInfo();
- if (!TII)
- TII = MF->getSubtarget().getInstrInfo();
- IntrinsicInfo = MF->getTarget().getIntrinsicInfo();
- }
- }
-
- // Save a list of virtual registers.
- SmallVector<unsigned, 8> VirtRegs;
+ if (isCFIInstruction())
+ assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction");
SmallBitVector PrintedTypes(8);
bool ShouldPrintRegisterTies = hasComplexRegisterTies();
@@ -1250,26 +1303,47 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
return findTiedOperandIdx(OpIdx);
return 0U;
};
+ unsigned StartOp = 0;
+ unsigned e = getNumOperands();
+
// Print explicitly defined operands on the left of an assignment syntax.
- unsigned StartOp = 0, e = getNumOperands();
- for (; StartOp < e && getOperand(StartOp).isReg() &&
- getOperand(StartOp).isDef() && !getOperand(StartOp).isImplicit();
- ++StartOp) {
+ while (StartOp < e) {
+ const MachineOperand &MO = getOperand(StartOp);
+ if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
+ break;
+
if (StartOp != 0)
OS << ", ";
+
LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{};
unsigned TiedOperandIdx = getTiedOperandIdx(StartOp);
- getOperand(StartOp).print(OS, MST, TypeToPrint, /*PrintDef=*/false,
- ShouldPrintRegisterTies, TiedOperandIdx, TRI,
- IntrinsicInfo);
- unsigned Reg = getOperand(StartOp).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
- VirtRegs.push_back(Reg);
+ MO.print(OS, MST, TypeToPrint, /*PrintDef=*/false, IsStandalone,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
+ ++StartOp;
}
if (StartOp != 0)
OS << " = ";
+ if (getFlag(MachineInstr::FrameSetup))
+ OS << "frame-setup ";
+ if (getFlag(MachineInstr::FrameDestroy))
+ OS << "frame-destroy ";
+ if (getFlag(MachineInstr::FmNoNans))
+ OS << "nnan ";
+ if (getFlag(MachineInstr::FmNoInfs))
+ OS << "ninf ";
+ if (getFlag(MachineInstr::FmNsz))
+ OS << "nsz ";
+ if (getFlag(MachineInstr::FmArcp))
+ OS << "arcp ";
+ if (getFlag(MachineInstr::FmContract))
+ OS << "contract ";
+ if (getFlag(MachineInstr::FmAfn))
+ OS << "afn ";
+ if (getFlag(MachineInstr::FmReassoc))
+ OS << "reassoc ";
+
// Print the opcode name.
if (TII)
OS << TII->getName(getOpcode());
@@ -1290,7 +1364,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
const unsigned OpIdx = InlineAsm::MIOp_AsmString;
LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{};
unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx);
- getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true,
+ getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone,
ShouldPrintRegisterTies, TiedOperandIdx, TRI,
IntrinsicInfo);
@@ -1318,18 +1392,9 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) {
const MachineOperand &MO = getOperand(i);
- if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- VirtRegs.push_back(MO.getReg());
-
if (FirstOp) FirstOp = false; else OS << ",";
OS << " ";
- if (i < getDesc().NumOperands) {
- const MCOperandInfo &MCOI = getDesc().OpInfo[i];
- if (MCOI.isPredicate())
- OS << "pred:";
- if (MCOI.isOptionalDef())
- OS << "opt:";
- }
+
if (isDebugValue() && MO.isMetadata()) {
// Pretty print DBG_VALUE instructions.
auto *DIV = dyn_cast<DILocalVariable>(MO.getMetadata());
@@ -1338,12 +1403,20 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
else {
LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{};
unsigned TiedOperandIdx = getTiedOperandIdx(i);
- MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true,
+ MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
+ }
+ } else if (isDebugLabel() && MO.isMetadata()) {
+ // Pretty print DBG_LABEL instructions.
+ auto *DIL = dyn_cast<DILabel>(MO.getMetadata());
+ if (DIL && !DIL->getName().empty())
+ OS << "\"" << DIL->getName() << '\"';
+ else {
+ LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{};
+ unsigned TiedOperandIdx = getTiedOperandIdx(i);
+ MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone,
ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
}
- } else if (TRI && (isInsertSubreg() || isRegSequence() ||
- (isSubregToReg() && i == 3)) && MO.isImm()) {
- OS << TRI->getSubRegIndexName(MO.getImm());
} else if (i == AsmDescOp && MO.isImm()) {
// Pretty print the inline asm operand descriptor.
OS << '$' << AsmOpCount++;
@@ -1406,77 +1479,66 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{};
unsigned TiedOperandIdx = getTiedOperandIdx(i);
if (MO.isImm() && isOperandSubregIdx(i))
- MachineOperand::printSubregIdx(OS, MO.getImm(), TRI);
+ MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI);
else
- MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true,
+ MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone,
ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
}
}
- bool HaveSemi = false;
- const unsigned PrintableFlags = FrameSetup | FrameDestroy;
- if (Flags & PrintableFlags) {
- if (!HaveSemi) {
- OS << ";";
- HaveSemi = true;
+ if (!SkipDebugLoc) {
+ if (const DebugLoc &DL = getDebugLoc()) {
+ if (!FirstOp)
+ OS << ',';
+ OS << " debug-location ";
+ DL->printAsOperand(OS, MST);
}
- OS << " flags: ";
-
- if (Flags & FrameSetup)
- OS << "FrameSetup";
-
- if (Flags & FrameDestroy)
- OS << "FrameDestroy";
}
if (!memoperands_empty()) {
- if (!HaveSemi) {
- OS << ";";
- HaveSemi = true;
+ SmallVector<StringRef, 0> SSNs;
+ const LLVMContext *Context = nullptr;
+ std::unique_ptr<LLVMContext> CtxPtr;
+ const MachineFrameInfo *MFI = nullptr;
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ MFI = &MF->getFrameInfo();
+ Context = &MF->getFunction().getContext();
+ } else {
+ CtxPtr = llvm::make_unique<LLVMContext>();
+ Context = CtxPtr.get();
}
- OS << " mem:";
- for (mmo_iterator i = memoperands_begin(), e = memoperands_end();
- i != e; ++i) {
- (*i)->print(OS, MST);
- if (std::next(i) != e)
- OS << " ";
+ OS << " :: ";
+ bool NeedComma = false;
+ for (const MachineMemOperand *Op : memoperands()) {
+ if (NeedComma)
+ OS << ", ";
+ Op->print(OS, MST, SSNs, *Context, MFI, TII);
+ NeedComma = true;
}
}
- // Print the regclass of any virtual registers encountered.
- if (MRI && !VirtRegs.empty()) {
+ if (SkipDebugLoc)
+ return;
+
+ bool HaveSemi = false;
+
+ // Print debug location information.
+ if (const DebugLoc &DL = getDebugLoc()) {
if (!HaveSemi) {
- OS << ";";
+ OS << ';';
HaveSemi = true;
}
- for (unsigned i = 0; i != VirtRegs.size(); ++i) {
- const RegClassOrRegBank &RC = MRI->getRegClassOrRegBank(VirtRegs[i]);
- if (!RC)
- continue;
- // Generic virtual registers do not have register classes.
- if (RC.is<const RegisterBank *>())
- OS << " " << RC.get<const RegisterBank *>()->getName();
- else
- OS << " "
- << TRI->getRegClassName(RC.get<const TargetRegisterClass *>());
- OS << ':' << printReg(VirtRegs[i]);
- for (unsigned j = i+1; j != VirtRegs.size();) {
- if (MRI->getRegClassOrRegBank(VirtRegs[j]) != RC) {
- ++j;
- continue;
- }
- if (VirtRegs[i] != VirtRegs[j])
- OS << "," << printReg(VirtRegs[j]);
- VirtRegs.erase(VirtRegs.begin()+j);
- }
- }
+ OS << ' ';
+ DL.print(OS);
}
- // Print debug location information.
+ // Print extra comments for DEBUG_VALUE.
if (isDebugValue() && getOperand(e - 2).isMetadata()) {
- if (!HaveSemi)
+ if (!HaveSemi) {
OS << ";";
+ HaveSemi = true;
+ }
auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata());
OS << " line no:" << DV->getLine();
if (auto *InlinedAt = debugLoc->getInlinedAt()) {
@@ -1489,16 +1551,11 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
if (isIndirectDebugValue())
OS << " indirect";
- } else if (SkipDebugLoc) {
- return;
- } else if (debugLoc && MF) {
- if (!HaveSemi)
- OS << ";";
- OS << " dbg:";
- debugLoc.print(OS);
}
+ // TODO: DBG_LABEL
- OS << '\n';
+ if (AddNewLine)
+ OS << '\n';
}
bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
@@ -1737,33 +1794,55 @@ MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
+ auto MIB = BuildMI(MF, DL, MCID).addReg(Reg, RegState::Debug);
if (IsIndirect)
- return BuildMI(MF, DL, MCID)
- .addReg(Reg, RegState::Debug)
- .addImm(0U)
- .addMetadata(Variable)
- .addMetadata(Expr);
+ MIB.addImm(0U);
else
- return BuildMI(MF, DL, MCID)
- .addReg(Reg, RegState::Debug)
- .addReg(0U, RegState::Debug)
- .addMetadata(Variable)
- .addMetadata(Expr);
+ MIB.addReg(0U, RegState::Debug);
+ return MIB.addMetadata(Variable).addMetadata(Expr);
}
+MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
+ const MCInstrDesc &MCID, bool IsIndirect,
+ MachineOperand &MO, const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ if (MO.isReg())
+ return BuildMI(MF, DL, MCID, IsIndirect, MO.getReg(), Variable, Expr);
+
+ auto MIB = BuildMI(MF, DL, MCID).add(MO);
+ if (IsIndirect)
+ MIB.addImm(0U);
+ else
+ MIB.addReg(0U, RegState::Debug);
+ return MIB.addMetadata(Variable).addMetadata(Expr);
+ }
+
MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, const MCInstrDesc &MCID,
bool IsIndirect, unsigned Reg,
const MDNode *Variable, const MDNode *Expr) {
- assert(isa<DILocalVariable>(Variable) && "not a variable");
- assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
MachineFunction &MF = *BB.getParent();
MachineInstr *MI = BuildMI(MF, DL, MCID, IsIndirect, Reg, Variable, Expr);
BB.insert(I, MI);
return MachineInstrBuilder(MF, MI);
}
+MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, const MCInstrDesc &MCID,
+ bool IsIndirect, MachineOperand &MO,
+ const MDNode *Variable, const MDNode *Expr) {
+ MachineFunction &MF = *BB.getParent();
+ MachineInstr *MI = BuildMI(MF, DL, MCID, IsIndirect, MO, Variable, Expr);
+ BB.insert(I, MI);
+ return MachineInstrBuilder(MF, *MI);
+}
+
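A sketch of how the new MachineOperand-based overload might be used to describe a variable whose location is a stack slot rather than a register; MBB, InsertPt, DL, TII, FrameIdx, Var and Expr are assumed to be in scope:

    // The register-based overloads above cannot express a frame-index location.
    MachineOperand Loc = MachineOperand::CreateFI(FrameIdx);
    BuildMI(MBB, InsertPt, DL, TII->get(TargetOpcode::DBG_VALUE),
            /*IsIndirect=*/false, Loc, Var, Expr);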
/// Compute the new DIExpression to use with a DBG_VALUE for a spill slot.
/// This prepends DW_OP_deref when spilling an indirect DBG_VALUE.
static const DIExpression *computeExprForSpill(const MachineInstr &MI) {
diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
index 75d449c7ac6f..7332b7162030 100644
--- a/contrib/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
@@ -71,6 +71,10 @@ SinkInstsToAvoidSpills("sink-insts-to-avoid-spills",
cl::desc("MachineLICM should sink instructions into "
"loops to avoid register spills"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+HoistConstStores("hoist-const-stores",
+ cl::desc("Hoist invariant stores"),
+ cl::init(true), cl::Hidden);
STATISTIC(NumHoisted,
"Number of machine instructions hoisted out of loops");
@@ -82,17 +86,19 @@ STATISTIC(NumCSEed,
"Number of hoisted machine instructions CSEed");
STATISTIC(NumPostRAHoisted,
"Number of machine instructions hoisted out of loops post regalloc");
+STATISTIC(NumStoreConst,
+ "Number of stores of const phys reg hoisted out of loops");
namespace {
- class MachineLICM : public MachineFunctionPass {
+ class MachineLICMBase : public MachineFunctionPass {
const TargetInstrInfo *TII;
const TargetLoweringBase *TLI;
const TargetRegisterInfo *TRI;
const MachineFrameInfo *MFI;
MachineRegisterInfo *MRI;
TargetSchedModel SchedModel;
- bool PreRegAlloc = true;
+ bool PreRegAlloc;
// Various analyses that we use...
AliasAnalysis *AA; // Alias analysis info.
@@ -138,16 +144,8 @@ namespace {
unsigned SpeculationState;
public:
- static char ID; // Pass identification, replacement for typeid
-
- MachineLICM() : MachineFunctionPass(ID) {
- initializeMachineLICMPass(*PassRegistry::getPassRegistry());
- }
-
- explicit MachineLICM(bool PreRA)
- : MachineFunctionPass(ID), PreRegAlloc(PreRA) {
- initializeMachineLICMPass(*PassRegistry::getPassRegistry());
- }
+ MachineLICMBase(char &PassID, bool PreRegAlloc)
+ : MachineFunctionPass(PassID), PreRegAlloc(PreRegAlloc) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -218,7 +216,7 @@ namespace {
DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren,
DenseMap<MachineDomTreeNode *, MachineDomTreeNode *> &ParentMap);
- void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode);
+ void HoistOutOfLoop(MachineDomTreeNode *HeaderN);
void HoistRegion(MachineDomTreeNode *N, bool IsHeader);
@@ -252,11 +250,29 @@ namespace {
MachineBasicBlock *getCurPreheader();
};
+ class MachineLICM : public MachineLICMBase {
+ public:
+ static char ID;
+ MachineLICM() : MachineLICMBase(ID, false) {
+ initializeMachineLICMPass(*PassRegistry::getPassRegistry());
+ }
+ };
+
+ class EarlyMachineLICM : public MachineLICMBase {
+ public:
+ static char ID;
+ EarlyMachineLICM() : MachineLICMBase(ID, true) {
+ initializeEarlyMachineLICMPass(*PassRegistry::getPassRegistry());
+ }
+ };
+
} // end anonymous namespace
-char MachineLICM::ID = 0;
+char MachineLICM::ID;
+char EarlyMachineLICM::ID;
char &llvm::MachineLICMID = MachineLICM::ID;
+char &llvm::EarlyMachineLICMID = EarlyMachineLICM::ID;
INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE,
"Machine Loop Invariant Code Motion", false, false)
@@ -266,6 +282,14 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE,
"Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_BEGIN(EarlyMachineLICM, "early-machinelicm",
+ "Early Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(EarlyMachineLICM, "early-machinelicm",
+ "Early Machine Loop Invariant Code Motion", false, false)
+
/// Test if the given loop is the outer-most loop that has a unique predecessor.
static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
// Check whether this loop even has a unique predecessor.
@@ -279,7 +303,7 @@ static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
return true;
}
-bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
+bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -290,15 +314,15 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
TRI = ST.getRegisterInfo();
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
- SchedModel.init(ST.getSchedModel(), &ST, TII);
+ SchedModel.init(&ST);
PreRegAlloc = MRI->isSSA();
if (PreRegAlloc)
- DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: ");
+ LLVM_DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: ");
else
- DEBUG(dbgs() << "******** Post-regalloc Machine LICM: ");
- DEBUG(dbgs() << MF.getName() << " ********\n");
+ LLVM_DEBUG(dbgs() << "******** Post-regalloc Machine LICM: ");
+ LLVM_DEBUG(dbgs() << MF.getName() << " ********\n");
if (PreRegAlloc) {
// Estimate register pressure during pre-regalloc pass.
@@ -350,6 +374,10 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
/// Return true if instruction stores to the specified frame.
static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
+ // Check mayStore before memory operands so that e.g. DBG_VALUEs, which have
+ // no memory operands, return false here rather than being treated
+ // conservatively below.
+ if (!MI->mayStore())
+ return false;
// If we lost memory operands, conservatively assume that the instruction
// writes to all slots.
if (MI->memoperands_empty())
@@ -368,11 +396,11 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
/// Examine the instruction for a potential LICM candidate. Also
/// gather register def and frame object update information.
-void MachineLICM::ProcessMI(MachineInstr *MI,
- BitVector &PhysRegDefs,
- BitVector &PhysRegClobbers,
- SmallSet<int, 32> &StoredFIs,
- SmallVectorImpl<CandidateInfo> &Candidates) {
+void MachineLICMBase::ProcessMI(MachineInstr *MI,
+ BitVector &PhysRegDefs,
+ BitVector &PhysRegClobbers,
+ SmallSet<int, 32> &StoredFIs,
+ SmallVectorImpl<CandidateInfo> &Candidates) {
bool RuledOut = false;
bool HasNonInvariantUse = false;
unsigned Def = 0;
@@ -455,7 +483,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
/// Walk the specified region of the CFG and hoist loop invariants out to the
/// preheader.
-void MachineLICM::HoistRegionPostRA() {
+void MachineLICMBase::HoistRegionPostRA() {
MachineBasicBlock *Preheader = getCurPreheader();
if (!Preheader)
return;
@@ -541,7 +569,7 @@ void MachineLICM::HoistRegionPostRA() {
/// Add register 'Reg' to the livein sets of BBs in the current loop, and make
/// sure it is not killed by any instructions in the loop.
-void MachineLICM::AddToLiveIns(unsigned Reg) {
+void MachineLICMBase::AddToLiveIns(unsigned Reg) {
const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
for (MachineBasicBlock *BB : Blocks) {
if (!BB->isLiveIn(Reg))
@@ -558,13 +586,14 @@ void MachineLICM::AddToLiveIns(unsigned Reg) {
/// When an instruction is found to use only loop invariant operands that are
/// safe to hoist, this function is called to do the dirty work.
-void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) {
+void MachineLICMBase::HoistPostRA(MachineInstr *MI, unsigned Def) {
MachineBasicBlock *Preheader = getCurPreheader();
// Now move the instructions to the predecessor, inserting it before any
// terminator instructions.
- DEBUG(dbgs() << "Hoisting to " << printMBBReference(*Preheader) << " from "
- << printMBBReference(*MI->getParent()) << ": " << *MI);
+ LLVM_DEBUG(dbgs() << "Hoisting to " << printMBBReference(*Preheader)
+ << " from " << printMBBReference(*MI->getParent()) << ": "
+ << *MI);
// Splice the instruction to the preheader.
MachineBasicBlock *MBB = MI->getParent();
@@ -581,7 +610,7 @@ void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) {
/// Check if this mbb is guaranteed to execute. If not then a load from this mbb
/// may not be safe to hoist.
-bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) {
+bool MachineLICMBase::IsGuaranteedToExecute(MachineBasicBlock *BB) {
if (SpeculationState != SpeculateUnknown)
return SpeculationState == SpeculateFalse;
@@ -600,24 +629,24 @@ bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) {
return true;
}
-void MachineLICM::EnterScope(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Entering " << printMBBReference(*MBB) << '\n');
+void MachineLICMBase::EnterScope(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Entering " << printMBBReference(*MBB) << '\n');
// Remember livein register pressure.
BackTrace.push_back(RegPressure);
}
-void MachineLICM::ExitScope(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Exiting " << printMBBReference(*MBB) << '\n');
+void MachineLICMBase::ExitScope(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Exiting " << printMBBReference(*MBB) << '\n');
BackTrace.pop_back();
}
/// Destroy scope for the MBB that corresponds to the given dominator tree node
/// if it's a leaf or all of its children are done. Walk up the dominator tree to
/// destroy ancestors which are now done.
-void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node,
- DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
- DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) {
+void MachineLICMBase::ExitScopeIfDone(MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
+ DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) {
if (OpenChildren[Node])
return;
@@ -638,7 +667,7 @@ void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node,
/// specified header block, and that are in the current loop) in depth first
/// order w.r.t the DominatorTree. This allows us to visit definitions before
/// uses, allowing us to hoist a loop body in one pass without iteration.
-void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
+void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
MachineBasicBlock *Preheader = getCurPreheader();
if (!Preheader)
return;
@@ -708,6 +737,8 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
MachineInstr *MI = &*MII;
if (!Hoist(MI, Preheader))
UpdateRegPressure(MI);
+ // If we have hoisted an instruction that may store, it can only be a
+ // constant store.
MII = NextMII;
}
@@ -719,7 +750,7 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
/// Sink instructions into loops if profitable. This especially tries to prevent
/// register spills caused by register pressure if there is little to no
/// overhead moving instructions into loops.
-void MachineLICM::SinkIntoLoop() {
+void MachineLICMBase::SinkIntoLoop() {
MachineBasicBlock *Preheader = getCurPreheader();
if (!Preheader)
return;
@@ -773,7 +804,7 @@ static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) {
/// Find all virtual register references that are liveout of the preheader to
/// initialize the starting "register pressure". Note this does not count live
/// through (livein but not used) registers.
-void MachineLICM::InitRegPressure(MachineBasicBlock *BB) {
+void MachineLICMBase::InitRegPressure(MachineBasicBlock *BB) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
// If the preheader has only a single predecessor and it ends with a
@@ -792,8 +823,8 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) {
}
/// Update estimate of register pressure after the specified instruction.
-void MachineLICM::UpdateRegPressure(const MachineInstr *MI,
- bool ConsiderUnseenAsDef) {
+void MachineLICMBase::UpdateRegPressure(const MachineInstr *MI,
+ bool ConsiderUnseenAsDef) {
auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef);
for (const auto &RPIdAndCost : Cost) {
unsigned Class = RPIdAndCost.first;
@@ -811,8 +842,8 @@ void MachineLICM::UpdateRegPressure(const MachineInstr *MI,
/// figure out which usages are live-ins.
/// FIXME: Figure out a way to consider 'RegSeen' from all code paths.
DenseMap<unsigned, int>
-MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen,
- bool ConsiderUnseenAsDef) {
+MachineLICMBase::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen,
+ bool ConsiderUnseenAsDef) {
DenseMap<unsigned, int> Cost;
if (MI->isImplicitDef())
return Cost;
@@ -871,13 +902,86 @@ static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) {
return false;
}
+// This function iterates through all the operands of the input store MI and
+// checks that each register operand satisfies isCallerPreservedPhysReg.
+// This means the value being stored and the address where it is being stored
+// are constant throughout the body of the function (not including prologue and
+// epilogue). When called with an MI that isn't a store, it returns false.
+// A future improvement can be to check if the store registers are constant
+// throughout the loop rather than throughout the function.
+static bool isInvariantStore(const MachineInstr &MI,
+ const TargetRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI) {
+
+ bool FoundCallerPresReg = false;
+ if (!MI.mayStore() || MI.hasUnmodeledSideEffects() ||
+ (MI.getNumOperands() == 0))
+ return false;
+
+ // Check that all register operands are caller-preserved physical registers.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ // If operand is a virtual register, check if it comes from a copy of a
+ // physical register.
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ Reg = TRI->lookThruCopyLike(MO.getReg(), MRI);
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (!TRI->isCallerPreservedPhysReg(Reg, *MI.getMF()))
+ return false;
+ else
+ FoundCallerPresReg = true;
+ } else if (!MO.isImm()) {
+ return false;
+ }
+ }
+ return FoundCallerPresReg;
+}
+
+// Return true if the input MI is a copy instruction that feeds an invariant
+// store instruction. This means that the src of the copy has to satisfy
+// isCallerPreservedPhysReg and at least one of its users should satisfy
+// isInvariantStore.
+static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI) {
+
+ // FIXME: If targets would like to look through instructions that aren't
+ // pure copies, this can be updated to a query.
+ if (!MI.isCopy())
+ return false;
+
+ const MachineFunction *MF = MI.getMF();
+ // Check that we are copying a constant physical register.
+ unsigned CopySrcReg = MI.getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(CopySrcReg))
+ return false;
+
+ if (!TRI->isCallerPreservedPhysReg(CopySrcReg, *MF))
+ return false;
+
+ unsigned CopyDstReg = MI.getOperand(0).getReg();
+ // Check if any of the uses of the copy are invariant stores.
+ assert (TargetRegisterInfo::isVirtualRegister(CopyDstReg) &&
+ "copy dst is not a virtual reg");
+
+ for (MachineInstr &UseMI : MRI->use_instructions(CopyDstReg)) {
+ if (UseMI.mayStore() && isInvariantStore(UseMI, TRI, MRI))
+ return true;
+ }
+ return false;
+}
+
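A schematic of the loop pattern these two predicates enable MachineLICM to hoist when -hoist-const-stores is on (illustrative pseudo machine code, not real MIR):

    ;; loop body, before hoisting:
    ;;   %v = COPY $caller_preserved_physreg   ;; copy feeds only invariant stores
    ;;   STORE %v, <fixed frame slot>          ;; value and address never change
    ;;
    ;; after hoisting, both the COPY and the store execute once in the preheader.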
/// Returns true if the instruction may be a suitable candidate for LICM.
/// e.g. If the instruction is a call, then it's obviously not safe to hoist it.
-bool MachineLICM::IsLICMCandidate(MachineInstr &I) {
+bool MachineLICMBase::IsLICMCandidate(MachineInstr &I) {
// Check if it's safe to move the instruction.
bool DontMoveAcrossStore = true;
- if (!I.isSafeToMove(AA, DontMoveAcrossStore))
+ if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
+ !(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
return false;
+ }
// If it is a load then check if it is guaranteed to execute by making sure that
// it dominates all exiting blocks. If it doesn't, then there is a path out of
@@ -896,7 +1000,7 @@ bool MachineLICM::IsLICMCandidate(MachineInstr &I) {
/// I.e., all virtual register operands are defined outside of the loop,
/// physical registers aren't accessed explicitly, and there are no side
/// effects that aren't captured by the operands or other flags.
-bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
+bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I) {
if (!IsLICMCandidate(I))
return false;
@@ -949,7 +1053,7 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
/// Return true if the specified instruction is used by a phi node and hoisting
/// it could cause a copy to be inserted.
-bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const {
+bool MachineLICMBase::HasLoopPHIUse(const MachineInstr *MI) const {
SmallVector<const MachineInstr*, 8> Work(1, MI);
do {
MI = Work.pop_back_val();
@@ -984,8 +1088,9 @@ bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const {
/// Compute operand latency between a def of 'Reg' and a use in the current
/// loop, return true if the target considered it high.
-bool MachineLICM::HasHighOperandLatency(MachineInstr &MI,
- unsigned DefIdx, unsigned Reg) const {
+bool MachineLICMBase::HasHighOperandLatency(MachineInstr &MI,
+ unsigned DefIdx,
+ unsigned Reg) const {
if (MRI->use_nodbg_empty(Reg))
return false;
@@ -1015,7 +1120,7 @@ bool MachineLICM::HasHighOperandLatency(MachineInstr &MI,
/// Return true if the instruction is marked "cheap" or the operand latency
/// between its def and a use is one or less.
-bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const {
+bool MachineLICMBase::IsCheapInstruction(MachineInstr &MI) const {
if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike())
return true;
@@ -1040,8 +1145,9 @@ bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const {
/// Visit BBs from header to current BB, check if hoisting an instruction of the
/// given cost matrix can cause high register pressure.
-bool MachineLICM::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost,
- bool CheapInstr) {
+bool
+MachineLICMBase::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost,
+ bool CheapInstr) {
for (const auto &RPIdAndCost : Cost) {
if (RPIdAndCost.second <= 0)
continue;
@@ -1065,7 +1171,7 @@ bool MachineLICM::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost,
/// Traverse the back trace from header to the current block and update their
/// register pressures to reflect the effect of hoisting MI from the current
/// block to the preheader.
-void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) {
+void MachineLICMBase::UpdateBackTraceRegPressure(const MachineInstr *MI) {
// First compute the 'cost' of the instruction, i.e. its contribution
// to register pressure.
auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/false,
@@ -1079,7 +1185,7 @@ void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) {
/// Return true if it is potentially profitable to hoist the given loop
/// invariant.
-bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
+bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) {
if (MI.isImplicitDef())
return true;
@@ -1095,12 +1201,15 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
// - When hoisting the last use of a value in the loop, that value no longer
// needs to be live in the loop. This lowers register pressure in the loop.
+ if (HoistConstStores && isCopyFeedingInvariantStore(MI, MRI, TRI))
+ return true;
+
bool CheapInstr = IsCheapInstruction(MI);
bool CreatesCopy = HasLoopPHIUse(&MI);
// Don't hoist a cheap instruction if it would create a copy in the loop.
if (CheapInstr && CreatesCopy) {
- DEBUG(dbgs() << "Won't hoist cheap instr with loop PHI use: " << MI);
+ LLVM_DEBUG(dbgs() << "Won't hoist cheap instr with loop PHI use: " << MI);
return false;
}
@@ -1119,7 +1228,7 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
if (MO.isDef() && HasHighOperandLatency(MI, i, Reg)) {
- DEBUG(dbgs() << "Hoist High Latency: " << MI);
+ LLVM_DEBUG(dbgs() << "Hoist High Latency: " << MI);
++NumHighLatency;
return true;
}
@@ -1137,14 +1246,14 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
// Visit BBs from header to current BB, if hoisting this doesn't cause
// high register pressure, then it's safe to proceed.
if (!CanCauseHighRegPressure(Cost, CheapInstr)) {
- DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI);
+ LLVM_DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI);
++NumLowRP;
return true;
}
// Don't risk increasing register pressure if it would create copies.
if (CreatesCopy) {
- DEBUG(dbgs() << "Won't hoist instr with loop PHI use: " << MI);
+ LLVM_DEBUG(dbgs() << "Won't hoist instr with loop PHI use: " << MI);
return false;
}
@@ -1153,7 +1262,7 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
// conservative.
if (AvoidSpeculation &&
(!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) {
- DEBUG(dbgs() << "Won't speculate: " << MI);
+ LLVM_DEBUG(dbgs() << "Won't speculate: " << MI);
return false;
}
@@ -1161,7 +1270,7 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
// to be remat'ed.
if (!TII->isTriviallyReMaterializable(MI, AA) &&
!MI.isDereferenceableInvariantLoad(AA)) {
- DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
+ LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
return false;
}
@@ -1171,7 +1280,7 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
/// Unfold a load from the given machineinstr if the load itself could be
/// hoisted. Return the unfolded and hoistable load, or null if the load
/// couldn't be unfolded or if it wouldn't be hoistable.
-MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
+MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI) {
// Don't unfold simple loads.
if (MI->canFoldAsLoad())
return nullptr;
@@ -1229,7 +1338,7 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
/// Initialize the CSE map with instructions that are in the current loop
/// preheader that may become duplicates of instructions that are hoisted
/// out of the loop.
-void MachineLICM::InitCSEMap(MachineBasicBlock *BB) {
+void MachineLICMBase::InitCSEMap(MachineBasicBlock *BB) {
for (MachineInstr &MI : *BB)
CSEMap[MI.getOpcode()].push_back(&MI);
}
@@ -1237,8 +1346,8 @@ void MachineLICM::InitCSEMap(MachineBasicBlock *BB) {
/// Find an instruction among PrevMIs that is a duplicate of MI.
/// Return this instruction if it's found.
const MachineInstr*
-MachineLICM::LookForDuplicate(const MachineInstr *MI,
- std::vector<const MachineInstr*> &PrevMIs) {
+MachineLICMBase::LookForDuplicate(const MachineInstr *MI,
+ std::vector<const MachineInstr*> &PrevMIs) {
for (const MachineInstr *PrevMI : PrevMIs)
if (TII->produceSameValue(*MI, *PrevMI, (PreRegAlloc ? MRI : nullptr)))
return PrevMI;
@@ -1250,15 +1359,15 @@ MachineLICM::LookForDuplicate(const MachineInstr *MI,
/// computes the same value. If it's found, do a RAU with the definition of
/// the existing instruction rather than hoisting the instruction to the
/// preheader.
-bool MachineLICM::EliminateCSE(MachineInstr *MI,
- DenseMap<unsigned, std::vector<const MachineInstr *>>::iterator &CI) {
+bool MachineLICMBase::EliminateCSE(MachineInstr *MI,
+ DenseMap<unsigned, std::vector<const MachineInstr *>>::iterator &CI) {
// Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
// the undef property onto uses.
if (CI == CSEMap.end() || MI->isImplicitDef())
return false;
if (const MachineInstr *Dup = LookForDuplicate(MI, CI->second)) {
- DEBUG(dbgs() << "CSEing " << *MI << " with " << *Dup);
+ LLVM_DEBUG(dbgs() << "CSEing " << *MI << " with " << *Dup);
// Replace virtual registers defined by MI by their counterparts defined
// by Dup.
@@ -1308,7 +1417,7 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI,
/// Return true if the given instruction will be CSE'd if it's hoisted out of
/// the loop.
-bool MachineLICM::MayCSE(MachineInstr *MI) {
+bool MachineLICMBase::MayCSE(MachineInstr *MI) {
unsigned Opcode = MI->getOpcode();
DenseMap<unsigned, std::vector<const MachineInstr *>>::iterator
CI = CSEMap.find(Opcode);
@@ -1323,7 +1432,7 @@ bool MachineLICM::MayCSE(MachineInstr *MI) {
/// When an instruction is found to use only loop invariant operands
/// that are safe to hoist, this instruction is called to do the dirty work.
/// It returns true if the instruction is hoisted.
-bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
+bool MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
// First check whether we should hoist this instruction.
if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) {
// If not, try unfolding a hoistable load.
@@ -1331,16 +1440,21 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
if (!MI) return false;
}
+ // If we have hoisted an instruction that may store, it can only be a constant
+ // store.
+ if (MI->mayStore())
+ NumStoreConst++;
+
// Now move the instructions to the predecessor, inserting it before any
// terminator instructions.
- DEBUG({
- dbgs() << "Hoisting " << *MI;
- if (MI->getParent()->getBasicBlock())
- dbgs() << " from " << printMBBReference(*MI->getParent());
- if (Preheader->getBasicBlock())
- dbgs() << " to " << printMBBReference(*Preheader);
- dbgs() << "\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "Hoisting " << *MI;
+ if (MI->getParent()->getBasicBlock())
+ dbgs() << " from " << printMBBReference(*MI->getParent());
+ if (Preheader->getBasicBlock())
+ dbgs() << " to " << printMBBReference(*Preheader);
+ dbgs() << "\n";
+ });
// If this is the first instruction being hoisted to the preheader,
// initialize the CSE map with potential common expressions.
@@ -1386,7 +1500,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
}
/// Get the preheader for the current loop, splitting a critical edge if needed.
-MachineBasicBlock *MachineLICM::getCurPreheader() {
+MachineBasicBlock *MachineLICMBase::getCurPreheader() {
// Determine the block to which to hoist instructions. If we can't find a
// suitable loop predecessor, we can't do any hoisting.
diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
index a9aa1d954e70..2bce59235057 100644
--- a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/LoopInfoImpl.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 8f0b89657d02..054cc97f8374 100644
--- a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -15,7 +15,6 @@
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
@@ -27,6 +26,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
diff --git a/contrib/llvm/lib/CodeGen/MachineOperand.cpp b/contrib/llvm/lib/CodeGen/MachineOperand.cpp
index ec81c6391171..8098333832b4 100644
--- a/contrib/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineOperand.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/MIRPrinter.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -19,6 +20,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/ModuleSlotTracker.h"
@@ -50,6 +52,9 @@ void MachineOperand::setReg(unsigned Reg) {
if (getReg() == Reg)
return; // No change.
+ // Clear the IsRenamable bit to keep it conservatively correct.
+ IsRenamable = false;
+
// Otherwise, we have to change the register. If this operand is embedded
// into a machine function, we need to update the old and new register's
// use/def lists.
@@ -110,30 +115,27 @@ bool MachineOperand::isRenamable() const {
assert(isReg() && "Wrong MachineOperand accessor");
assert(TargetRegisterInfo::isPhysicalRegister(getReg()) &&
"isRenamable should only be checked on physical registers");
- return IsRenamable;
+ if (!IsRenamable)
+ return false;
+
+ const MachineInstr *MI = getParent();
+ if (!MI)
+ return true;
+
+ if (isDef())
+ return !MI->hasExtraDefRegAllocReq(MachineInstr::IgnoreBundle);
+
+ assert(isUse() && "Reg is not def or use");
+ return !MI->hasExtraSrcRegAllocReq(MachineInstr::IgnoreBundle);
}
void MachineOperand::setIsRenamable(bool Val) {
assert(isReg() && "Wrong MachineOperand accessor");
assert(TargetRegisterInfo::isPhysicalRegister(getReg()) &&
"setIsRenamable should only be called on physical registers");
- if (const MachineInstr *MI = getParent())
- if ((isDef() && MI->hasExtraDefRegAllocReq()) ||
- (isUse() && MI->hasExtraSrcRegAllocReq()))
- assert(!Val && "isRenamable should be false for "
- "hasExtraDefRegAllocReq/hasExtraSrcRegAllocReq opcodes");
IsRenamable = Val;
}
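
The isRenamable()/setIsRenamable() hunk above moves the hasExtraDefRegAllocReq/hasExtraSrcRegAllocReq check from the setter (where it was only an assertion) into the getter, so the answer is computed at query time. A minimal stand-alone sketch of that pattern, with hypothetical names rather than the real MachineOperand interface:

    #include <cassert>

    // A flag whose effective value also depends on the state of its owner.
    struct Operand {
      bool RenamableBit = false;
      bool OwnerHasExtraConstraints = false; // stands in for hasExtra*RegAllocReq()

      // Query-time check: even if the bit is set, report "not renamable" when
      // the owning context imposes extra constraints. Setting stays cheap.
      bool isRenamable() const {
        if (!RenamableBit)
          return false;
        return !OwnerHasExtraConstraints;
      }

      void setIsRenamable(bool V) { RenamableBit = V; }
    };

    int main() {
      Operand Op;
      Op.setIsRenamable(true);
      assert(Op.isRenamable());
      Op.OwnerHasExtraConstraints = true;  // e.g. the parent instruction changed
      assert(!Op.isRenamable());           // the query now reflects the constraint
    }
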
-void MachineOperand::setIsRenamableIfNoExtraRegAllocReq() {
- if (const MachineInstr *MI = getParent())
- if ((isDef() && MI->hasExtraDefRegAllocReq()) ||
- (isUse() && MI->hasExtraSrcRegAllocReq()))
- return;
-
- setIsRenamable(true);
-}
-
// If this operand is currently a register operand, and if this is in a
// function, deregister the operand from the register's use/def list.
void MachineOperand::removeRegFromUses() {
@@ -440,7 +442,70 @@ static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB,
OS << "<unknown>";
}
-void MachineOperand::printSubregIdx(raw_ostream &OS, uint64_t Index,
+static void printIRValueReference(raw_ostream &OS, const Value &V,
+ ModuleSlotTracker &MST) {
+ if (isa<GlobalValue>(V)) {
+ V.printAsOperand(OS, /*PrintType=*/false, MST);
+ return;
+ }
+ if (isa<Constant>(V)) {
+ // Machine memory operands can load/store to/from constant value pointers.
+ OS << '`';
+ V.printAsOperand(OS, /*PrintType=*/true, MST);
+ OS << '`';
+ return;
+ }
+ OS << "%ir.";
+ if (V.hasName()) {
+ printLLVMNameWithoutPrefix(OS, V.getName());
+ return;
+ }
+ MachineOperand::printIRSlotNumber(OS, MST.getLocalSlot(&V));
+}
+
+static void printSyncScope(raw_ostream &OS, const LLVMContext &Context,
+ SyncScope::ID SSID,
+ SmallVectorImpl<StringRef> &SSNs) {
+ switch (SSID) {
+ case SyncScope::System:
+ break;
+ default:
+ if (SSNs.empty())
+ Context.getSyncScopeNames(SSNs);
+
+ OS << "syncscope(\"";
+ printEscapedString(SSNs[SSID], OS);
+ OS << "\") ";
+ break;
+ }
+}
+
+static const char *getTargetMMOFlagName(const TargetInstrInfo &TII,
+ unsigned TMMOFlag) {
+ auto Flags = TII.getSerializableMachineMemOperandTargetFlags();
+ for (const auto &I : Flags) {
+ if (I.first == TMMOFlag) {
+ return I.second;
+ }
+ }
+ return nullptr;
+}
+
+static void printFrameIndex(raw_ostream& OS, int FrameIndex, bool IsFixed,
+ const MachineFrameInfo *MFI) {
+ StringRef Name;
+ if (MFI) {
+ IsFixed = MFI->isFixedObjectIndex(FrameIndex);
+ if (const AllocaInst *Alloca = MFI->getObjectAllocation(FrameIndex))
+ if (Alloca->hasName())
+ Name = Alloca->getName();
+ if (IsFixed)
+ FrameIndex -= MFI->getObjectIndexBegin();
+ }
+ MachineOperand::printStackObjectReference(OS, FrameIndex, IsFixed, Name);
+}
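
printFrameIndex() above routes both operand and memory-operand printing through MachineOperand::printStackObjectReference. The sketch below only illustrates the rough shape of such a reference string; the exact MIR spelling ("%stack.<id>.<name>" versus "%fixed-stack.<id>") is an assumption, not something this diff spells out:

    #include <iostream>
    #include <string>

    // Illustrative only: approximates the shape of a stack object reference.
    std::string formatStackObjectRef(int FrameIndex, bool IsFixed,
                                     const std::string &Name) {
      std::string Out = IsFixed ? "%fixed-stack." : "%stack.";
      Out += std::to_string(FrameIndex);
      if (!IsFixed && !Name.empty())
        Out += "." + Name;
      return Out;
    }

    int main() {
      std::cout << formatStackObjectRef(0, false, "x") << "\n"; // %stack.0.x
      std::cout << formatStackObjectRef(2, true, "") << "\n";   // %fixed-stack.2
    }
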
+
+void MachineOperand::printSubRegIdx(raw_ostream &OS, uint64_t Index,
const TargetRegisterInfo *TRI) {
OS << "%subreg.";
if (TRI)
@@ -639,15 +704,21 @@ static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI,
void MachineOperand::print(raw_ostream &OS, const TargetRegisterInfo *TRI,
const TargetIntrinsicInfo *IntrinsicInfo) const {
+ print(OS, LLT{}, TRI, IntrinsicInfo);
+}
+
+void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint,
+ const TargetRegisterInfo *TRI,
+ const TargetIntrinsicInfo *IntrinsicInfo) const {
tryToGetTargetInfo(*this, TRI, IntrinsicInfo);
ModuleSlotTracker DummyMST(nullptr);
- print(OS, DummyMST, LLT{}, /*PrintDef=*/false,
+ print(OS, DummyMST, TypeToPrint, /*PrintDef=*/false, /*IsStandalone=*/true,
/*ShouldPrintRegisterTies=*/true,
/*TiedOperandIdx=*/0, TRI, IntrinsicInfo);
}
void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
- LLT TypeToPrint, bool PrintDef,
+ LLT TypeToPrint, bool PrintDef, bool IsStandalone,
bool ShouldPrintRegisterTies,
unsigned TiedOperandIdx,
const TargetRegisterInfo *TRI,
@@ -675,7 +746,15 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "debug-use ";
if (TargetRegisterInfo::isPhysicalRegister(getReg()) && isRenamable())
OS << "renamable ";
- OS << printReg(Reg, TRI);
+
+ const MachineRegisterInfo *MRI = nullptr;
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ MRI = &MF->getRegInfo();
+ }
+ }
+
+ OS << printReg(Reg, TRI, 0, MRI);
// Print the sub register.
if (unsigned SubReg = getSubReg()) {
if (TRI)
@@ -687,7 +766,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
if (const MachineFunction *MF = getMFIfAvailable(*this)) {
const MachineRegisterInfo &MRI = MF->getRegInfo();
- if (!PrintDef || MRI.def_empty(Reg)) {
+ if (IsStandalone || !PrintDef || MRI.def_empty(Reg)) {
OS << ':';
OS << printRegClassOrBank(Reg, MRI, TRI);
}
@@ -716,17 +795,10 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
case MachineOperand::MO_FrameIndex: {
int FrameIndex = getIndex();
bool IsFixed = false;
- StringRef Name;
- if (const MachineFunction *MF = getMFIfAvailable(*this)) {
- const MachineFrameInfo &MFI = MF->getFrameInfo();
- IsFixed = MFI.isFixedObjectIndex(FrameIndex);
- if (const AllocaInst *Alloca = MFI.getObjectAllocation(FrameIndex))
- if (Alloca->hasName())
- Name = Alloca->getName();
- if (IsFixed)
- FrameIndex -= MFI.getObjectIndexBegin();
- }
- printStackObjectReference(OS, FrameIndex, IsFixed, Name);
+ const MachineFrameInfo *MFI = nullptr;
+ if (const MachineFunction *MF = getMFIfAvailable(*this))
+ MFI = &MF->getFrameInfo();
+ printFrameIndex(OS, FrameIndex, IsFixed, MFI);
break;
}
case MachineOperand::MO_ConstantPoolIndex:
@@ -752,7 +824,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
break;
case MachineOperand::MO_ExternalSymbol: {
StringRef Name = getSymbolName();
- OS << '$';
+ OS << '&';
if (Name.empty()) {
OS << "\"\"";
} else {
@@ -905,7 +977,7 @@ MachinePointerInfo MachinePointerInfo::getUnknownStack(MachineFunction &MF) {
}
MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
- uint64_t s, unsigned int a,
+ uint64_t s, uint64_t a,
const AAMDNodes &AAInfo,
const MDNode *Ranges, SyncScope::ID SSID,
AtomicOrdering Ordering,
@@ -961,108 +1033,121 @@ void MachineMemOperand::print(raw_ostream &OS) const {
ModuleSlotTracker DummyMST(nullptr);
print(OS, DummyMST);
}
+
void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const {
- assert((isLoad() || isStore()) && "SV has to be a load, store or both.");
+ SmallVector<StringRef, 0> SSNs;
+ LLVMContext Ctx;
+ print(OS, MST, SSNs, Ctx, nullptr, nullptr);
+}
+void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ SmallVectorImpl<StringRef> &SSNs,
+ const LLVMContext &Context,
+ const MachineFrameInfo *MFI,
+ const TargetInstrInfo *TII) const {
+ OS << '(';
if (isVolatile())
- OS << "Volatile ";
-
+ OS << "volatile ";
+ if (isNonTemporal())
+ OS << "non-temporal ";
+ if (isDereferenceable())
+ OS << "dereferenceable ";
+ if (isInvariant())
+ OS << "invariant ";
+ if (getFlags() & MachineMemOperand::MOTargetFlag1)
+ OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag1)
+ << "\" ";
+ if (getFlags() & MachineMemOperand::MOTargetFlag2)
+ OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag2)
+ << "\" ";
+ if (getFlags() & MachineMemOperand::MOTargetFlag3)
+ OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag3)
+ << "\" ";
+
+ assert((isLoad() || isStore()) &&
+ "machine memory operand must be a load or store (or both)");
if (isLoad())
- OS << "LD";
+ OS << "load ";
if (isStore())
- OS << "ST";
- OS << getSize();
+ OS << "store ";
- // Print the address information.
- OS << "[";
- if (const Value *V = getValue())
- V->printAsOperand(OS, /*PrintType=*/false, MST);
- else if (const PseudoSourceValue *PSV = getPseudoValue())
- PSV->printCustom(OS);
- else
- OS << "<unknown>";
+ printSyncScope(OS, Context, getSyncScopeID(), SSNs);
- unsigned AS = getAddrSpace();
- if (AS != 0)
- OS << "(addrspace=" << AS << ')';
-
- // If the alignment of the memory reference itself differs from the alignment
- // of the base pointer, print the base alignment explicitly, next to the base
- // pointer.
- if (getBaseAlignment() != getAlignment())
- OS << "(align=" << getBaseAlignment() << ")";
-
- if (getOffset() != 0)
- OS << "+" << getOffset();
- OS << "]";
-
- // Print the alignment of the reference.
- if (getBaseAlignment() != getAlignment() || getBaseAlignment() != getSize())
- OS << "(align=" << getAlignment() << ")";
-
- // Print TBAA info.
- if (const MDNode *TBAAInfo = getAAInfo().TBAA) {
- OS << "(tbaa=";
- if (TBAAInfo->getNumOperands() > 0)
- TBAAInfo->getOperand(0)->printAsOperand(OS, MST);
- else
- OS << "<unknown>";
- OS << ")";
- }
+ if (getOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(getOrdering()) << ' ';
+ if (getFailureOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(getFailureOrdering()) << ' ';
- // Print AA scope info.
- if (const MDNode *ScopeInfo = getAAInfo().Scope) {
- OS << "(alias.scope=";
- if (ScopeInfo->getNumOperands() > 0)
- for (unsigned i = 0, ie = ScopeInfo->getNumOperands(); i != ie; ++i) {
- ScopeInfo->getOperand(i)->printAsOperand(OS, MST);
- if (i != ie - 1)
- OS << ",";
- }
- else
- OS << "<unknown>";
- OS << ")";
+ OS << getSize();
+ if (const Value *Val = getValue()) {
+ OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into ");
+ printIRValueReference(OS, *Val, MST);
+ } else if (const PseudoSourceValue *PVal = getPseudoValue()) {
+ OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into ");
+ assert(PVal && "Expected a pseudo source value");
+ switch (PVal->kind()) {
+ case PseudoSourceValue::Stack:
+ OS << "stack";
+ break;
+ case PseudoSourceValue::GOT:
+ OS << "got";
+ break;
+ case PseudoSourceValue::JumpTable:
+ OS << "jump-table";
+ break;
+ case PseudoSourceValue::ConstantPool:
+ OS << "constant-pool";
+ break;
+ case PseudoSourceValue::FixedStack: {
+ int FrameIndex = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
+ bool IsFixed = true;
+ printFrameIndex(OS, FrameIndex, IsFixed, MFI);
+ break;
+ }
+ case PseudoSourceValue::GlobalValueCallEntry:
+ OS << "call-entry ";
+ cast<GlobalValuePseudoSourceValue>(PVal)->getValue()->printAsOperand(
+ OS, /*PrintType=*/false, MST);
+ break;
+ case PseudoSourceValue::ExternalSymbolCallEntry:
+ OS << "call-entry &";
+ printLLVMNameWithoutPrefix(
+ OS, cast<ExternalSymbolPseudoSourceValue>(PVal)->getSymbol());
+ break;
+ case PseudoSourceValue::TargetCustom:
+ // FIXME: This is not necessarily the correct MIR serialization format for
+ // a custom pseudo source value, but at least it allows
+ // -print-machineinstrs to work on a target with custom pseudo source
+ // values.
+ OS << "custom ";
+ PVal->printCustom(OS);
+ break;
+ }
}
-
- // Print AA noalias scope info.
- if (const MDNode *NoAliasInfo = getAAInfo().NoAlias) {
- OS << "(noalias=";
- if (NoAliasInfo->getNumOperands() > 0)
- for (unsigned i = 0, ie = NoAliasInfo->getNumOperands(); i != ie; ++i) {
- NoAliasInfo->getOperand(i)->printAsOperand(OS, MST);
- if (i != ie - 1)
- OS << ",";
- }
- else
- OS << "<unknown>";
- OS << ")";
+ MachineOperand::printOperandOffset(OS, getOffset());
+ if (getBaseAlignment() != getSize())
+ OS << ", align " << getBaseAlignment();
+ auto AAInfo = getAAInfo();
+ if (AAInfo.TBAA) {
+ OS << ", !tbaa ";
+ AAInfo.TBAA->printAsOperand(OS, MST);
}
-
- if (const MDNode *Ranges = getRanges()) {
- unsigned NumRanges = Ranges->getNumOperands();
- if (NumRanges != 0) {
- OS << "(ranges=";
-
- for (unsigned I = 0; I != NumRanges; ++I) {
- Ranges->getOperand(I)->printAsOperand(OS, MST);
- if (I != NumRanges - 1)
- OS << ',';
- }
-
- OS << ')';
- }
+ if (AAInfo.Scope) {
+ OS << ", !alias.scope ";
+ AAInfo.Scope->printAsOperand(OS, MST);
}
+ if (AAInfo.NoAlias) {
+ OS << ", !noalias ";
+ AAInfo.NoAlias->printAsOperand(OS, MST);
+ }
+ if (getRanges()) {
+ OS << ", !range ";
+ getRanges()->printAsOperand(OS, MST);
+ }
+ // FIXME: Implement addrspace printing/parsing in MIR.
+ // For now, print this even though parsing it is not available in MIR.
+ if (unsigned AS = getAddrSpace())
+ OS << ", addrspace " << AS;
- if (isNonTemporal())
- OS << "(nontemporal)";
- if (isDereferenceable())
- OS << "(dereferenceable)";
- if (isInvariant())
- OS << "(invariant)";
- if (getFlags() & MOTargetFlag1)
- OS << "(flag1)";
- if (getFlags() & MOTargetFlag2)
- OS << "(flag2)";
- if (getFlags() & MOTargetFlag3)
- OS << "(flag3)";
+ OS << ')';
}
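
The MachineMemOperand::print() rewrite above replaces the old "Volatile LD4[...]" style with a parenthesized, MIR-oriented "(load ... from ..., align ...)" form. A simplified, self-contained approximation of how the new string is assembled (illustrative only, not the real printer):

    #include <iostream>
    #include <string>

    // Approximate shape of the output, e.g.
    //   old style:  "Volatile LD4[%ir.p](align=8)"
    //   new style:  "(volatile load 4 from %ir.p, align 8)"
    std::string formatMemOperand(bool IsVolatile, bool IsLoad, bool IsStore,
                                 unsigned Size, const std::string &Ref,
                                 unsigned BaseAlign) {
      std::string Out = "(";
      if (IsVolatile)
        Out += "volatile ";
      if (IsLoad)
        Out += "load ";
      if (IsStore)
        Out += "store ";
      Out += std::to_string(Size);
      if (!Ref.empty())
        Out += ((IsLoad && IsStore) ? " on " : IsLoad ? " from " : " into ") + Ref;
      if (BaseAlign != Size)    // mirrors the "print align only if it differs" rule
        Out += ", align " + std::to_string(BaseAlign);
      Out += ")";
      return Out;
    }

    int main() {
      std::cout << formatMemOperand(true, true, false, 4, "%ir.p", 8) << "\n";
    }
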
diff --git a/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
index ca4452218da1..906d5560d568 100644
--- a/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -27,7 +27,8 @@ DiagnosticInfoMIROptimization::MachineArgument::MachineArgument(
Key = MKey;
raw_string_ostream OS(Val);
- MI.print(OS, /*SkipOpers=*/false, /*SkipDebugLoc=*/true);
+ MI.print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
+ /*SkipDebugLoc=*/true);
}
Optional<uint64_t>
diff --git a/contrib/llvm/lib/CodeGen/MachineOutliner.cpp b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp
index e4eb8802ac66..28e4e2c6c87a 100644
--- a/contrib/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -25,9 +25,8 @@
///
/// Targets must implement
/// * getOutliningCandidateInfo
-/// * insertOutlinerEpilogue
+/// * buildOutlinedFrame
/// * insertOutlinedCall
-/// * insertOutlinerPrologue
/// * isFunctionSafeToOutlineFrom
///
/// in order to make use of the MachineOutliner.
@@ -56,18 +55,22 @@
/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
///
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineOutliner.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <functional>
@@ -80,121 +83,23 @@
using namespace llvm;
using namespace ore;
+using namespace outliner;
STATISTIC(NumOutlined, "Number of candidates outlined");
STATISTIC(FunctionsCreated, "Number of functions created");
-namespace {
-
-/// \brief An individual sequence of instructions to be replaced with a call to
-/// an outlined function.
-struct Candidate {
-private:
- /// The start index of this \p Candidate in the instruction list.
- unsigned StartIdx;
-
- /// The number of instructions in this \p Candidate.
- unsigned Len;
-
-public:
- /// Set to false if the candidate overlapped with another candidate.
- bool InCandidateList = true;
-
- /// \brief The index of this \p Candidate's \p OutlinedFunction in the list of
- /// \p OutlinedFunctions.
- unsigned FunctionIdx;
-
- /// Contains all target-specific information for this \p Candidate.
- TargetInstrInfo::MachineOutlinerInfo MInfo;
-
- /// Return the number of instructions in this Candidate.
- unsigned getLength() const { return Len; }
-
- /// Return the start index of this candidate.
- unsigned getStartIdx() const { return StartIdx; }
-
- // Return the end index of this candidate.
- unsigned getEndIdx() const { return StartIdx + Len - 1; }
-
- /// \brief The number of instructions that would be saved by outlining every
- /// candidate of this type.
- ///
- /// This is a fixed value which is not updated during the candidate pruning
- /// process. It is only used for deciding which candidate to keep if two
- /// candidates overlap. The true benefit is stored in the OutlinedFunction
- /// for some given candidate.
- unsigned Benefit = 0;
-
- Candidate(unsigned StartIdx, unsigned Len, unsigned FunctionIdx)
- : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {}
-
- Candidate() {}
-
- /// \brief Used to ensure that \p Candidates are outlined in an order that
- /// preserves the start and end indices of other \p Candidates.
- bool operator<(const Candidate &RHS) const {
- return getStartIdx() > RHS.getStartIdx();
- }
-};
-
-/// \brief The information necessary to create an outlined function for some
-/// class of candidate.
-struct OutlinedFunction {
-
-private:
- /// The number of candidates for this \p OutlinedFunction.
- unsigned OccurrenceCount = 0;
-
-public:
- std::vector<std::shared_ptr<Candidate>> Candidates;
-
- /// The actual outlined function created.
- /// This is initialized after we go through and create the actual function.
- MachineFunction *MF = nullptr;
-
- /// A number assigned to this function which appears at the end of its name.
- unsigned Name;
+// Set to true if the user wants the outliner to run on linkonceodr linkage
+// functions. This is off by default because the linker can dedupe linkonceodr
+// functions and the outliner is confined to a single module (modulo LTO). It
+// should, however, be the default behaviour in LTO.
+static cl::opt<bool> EnableLinkOnceODROutlining(
+ "enable-linkonceodr-outlining",
+ cl::Hidden,
+ cl::desc("Enable the machine outliner on linkonceodr functions"),
+ cl::init(false));
- /// \brief The sequence of integers corresponding to the instructions in this
- /// function.
- std::vector<unsigned> Sequence;
-
- /// Contains all target-specific information for this \p OutlinedFunction.
- TargetInstrInfo::MachineOutlinerInfo MInfo;
-
- /// Return the number of candidates for this \p OutlinedFunction.
- unsigned getOccurrenceCount() { return OccurrenceCount; }
-
- /// Decrement the occurrence count of this OutlinedFunction and return the
- /// new count.
- unsigned decrement() {
- assert(OccurrenceCount > 0 && "Can't decrement an empty function!");
- OccurrenceCount--;
- return getOccurrenceCount();
- }
-
- /// \brief Return the number of instructions it would take to outline this
- /// function.
- unsigned getOutliningCost() {
- return (OccurrenceCount * MInfo.CallOverhead) + Sequence.size() +
- MInfo.FrameOverhead;
- }
-
- /// \brief Return the number of instructions that would be saved by outlining
- /// this function.
- unsigned getBenefit() {
- unsigned NotOutlinedCost = OccurrenceCount * Sequence.size();
- unsigned OutlinedCost = getOutliningCost();
- return (NotOutlinedCost < OutlinedCost) ? 0
- : NotOutlinedCost - OutlinedCost;
- }
-
- OutlinedFunction(unsigned Name, unsigned OccurrenceCount,
- const std::vector<unsigned> &Sequence,
- TargetInstrInfo::MachineOutlinerInfo &MInfo)
- : OccurrenceCount(OccurrenceCount), Name(Name), Sequence(Sequence),
- MInfo(MInfo) {}
-};
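
The OutlinedFunction members removed above (now supplied by the outliner namespace pulled in through MachineOutliner.h) keep the same cost model: outlining pays off only when the cost of leaving every occurrence inline outweighs the per-call overhead plus the cost of the outlined body and frame. A small arithmetic sketch with made-up numbers:

    #include <iostream>

    // Cost model from the code above, in abstract cost units:
    //   not-outlined cost = occurrences * sequence length
    //   outlined cost     = occurrences * call overhead + sequence length + frame overhead
    //   benefit           = max(0, not-outlined cost - outlined cost)
    int main() {
      unsigned Occurrences = 4, SequenceLen = 6;
      unsigned CallOverhead = 1, FrameOverhead = 1;

      unsigned NotOutlinedCost = Occurrences * SequenceLen;                    // 24
      unsigned OutlinedCost =
          Occurrences * CallOverhead + SequenceLen + FrameOverhead;            // 11
      unsigned Benefit =
          NotOutlinedCost < OutlinedCost ? 0 : NotOutlinedCost - OutlinedCost; // 13

      std::cout << "benefit = " << Benefit << " cost units\n";
    }
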
+namespace {
/// Represents an undefined index in the suffix tree.
const unsigned EmptyIdx = -1;
@@ -242,7 +147,7 @@ struct SuffixTreeNode {
/// For all other nodes, this is ignored.
unsigned SuffixIdx = EmptyIdx;
- /// \brief For internal nodes, a pointer to the internal node representing
+ /// For internal nodes, a pointer to the internal node representing
/// the same sequence with the first character chopped off.
///
/// This acts as a shortcut in Ukkonen's algorithm. One of the things that
@@ -356,7 +261,7 @@ private:
/// The end index of each leaf in the tree.
unsigned LeafEndIdx = -1;
- /// \brief Helper struct which keeps track of the next insertion point in
+ /// Helper struct which keeps track of the next insertion point in
/// Ukkonen's algorithm.
struct ActiveState {
/// The next node to insert at.
@@ -369,7 +274,7 @@ private:
unsigned Len = 0;
};
- /// \brief The point the next insertion will take place at in the
+ /// The point the next insertion will take place at in the
/// construction algorithm.
ActiveState Active;
@@ -416,7 +321,7 @@ private:
return N;
}
- /// \brief Set the suffix indices of the leaves to the start indices of their
+ /// Set the suffix indices of the leaves to the start indices of their
/// respective suffixes. Also stores each leaf in \p LeafVector at its
/// respective suffix index.
///
@@ -454,7 +359,7 @@ private:
}
}
- /// \brief Construct the suffix tree for the prefix of the input ending at
+ /// Construct the suffix tree for the prefix of the input ending at
/// \p EndIdx.
///
/// Used to construct the full suffix tree iteratively. At the end of each
@@ -615,16 +520,16 @@ public:
}
};
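
The suffix tree built here exists to answer one query efficiently: which substrings of the mapped instruction string repeat, and where. A deliberately naive, quadratic illustration of that same query on a plain std::string:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // Pretend each character is one mapped instruction.
      std::string Str = "abcxabcyabc";

      // Count every substring of length >= 2; anything seen at least twice is
      // a candidate for outlining. The suffix tree answers this without
      // enumerating all O(n^2) substrings.
      std::map<std::string, std::vector<size_t>> Occurrences;
      for (size_t Len = 2; Len <= Str.size(); ++Len)
        for (size_t I = 0; I + Len <= Str.size(); ++I)
          Occurrences[Str.substr(I, Len)].push_back(I);

      for (const auto &KV : Occurrences)
        if (KV.second.size() >= 2)
          std::cout << '"' << KV.first << "\" repeats " << KV.second.size()
                    << " times\n";
    }
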
-/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings.
+/// Maps \p MachineInstrs to unsigned integers and stores the mappings.
struct InstructionMapper {
- /// \brief The next available integer to assign to a \p MachineInstr that
+ /// The next available integer to assign to a \p MachineInstr that
/// cannot be outlined.
///
/// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
unsigned IllegalInstrNumber = -3;
- /// \brief The next available integer to assign to a \p MachineInstr that can
+ /// The next available integer to assign to a \p MachineInstr that can
/// be outlined.
unsigned LegalInstrNumber = 0;
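
InstructionMapper gives outlinable instructions small shared integers counting up from 0, and gives each non-outlinable instruction its own sentinel counting down from -3 (DenseMapInfo<unsigned> reserves ~0U and ~0U - 1 as its empty and tombstone keys, hence the start at -3), so no two "illegal" positions ever compare equal. A stand-alone sketch of that numbering, using strings in place of MachineInstrs:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      unsigned IllegalInstrNumber = static_cast<unsigned>(-3); // counts down
      unsigned LegalInstrNumber = 0;                           // counts up

      std::map<std::string, unsigned> InstructionToInt; // legal instrs only
      std::vector<unsigned> UnsignedVec;

      auto MapInstr = [&](const std::string &Instr, bool Legal) {
        if (!Legal) {
          UnsignedVec.push_back(IllegalInstrNumber--); // unique per occurrence
          return;
        }
        auto It = InstructionToInt.find(Instr);
        if (It == InstructionToInt.end())
          It = InstructionToInt.emplace(Instr, LegalInstrNumber++).first;
        UnsignedVec.push_back(It->second);             // shared for identical instrs
      };

      MapInstr("add", true);
      MapInstr("mul", true);
      MapInstr("call", false); // not outlinable: gets a one-off sentinel
      MapInstr("add", true);   // same integer as the first "add"

      for (unsigned U : UnsignedVec)
        std::cout << U << ' ';
      std::cout << '\n';        // e.g. 0 1 4294967293 0
    }
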
@@ -639,11 +544,11 @@ struct InstructionMapper {
/// The vector of unsigned integers that the module is mapped to.
std::vector<unsigned> UnsignedVec;
- /// \brief Stores the location of the instruction associated with the integer
+ /// Stores the location of the instruction associated with the integer
/// at index i in \p UnsignedVec for each index i.
std::vector<MachineBasicBlock::iterator> InstrList;
- /// \brief Maps \p *It to a legal integer.
+ /// Maps \p *It to a legal integer.
///
/// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
/// \p IntegerInstructionMap, and \p LegalInstrNumber.
@@ -706,7 +611,7 @@ struct InstructionMapper {
return MINumber;
}
- /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds
+ /// Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds
/// and appends it to \p UnsignedVec and \p InstrList.
///
/// Two instructions are assigned the same integer if they are identical.
@@ -720,20 +625,29 @@ struct InstructionMapper {
void convertToUnsignedVec(MachineBasicBlock &MBB,
const TargetRegisterInfo &TRI,
const TargetInstrInfo &TII) {
+ unsigned Flags = TII.getMachineOutlinerMBBFlags(MBB);
+
for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et;
It++) {
// Keep track of where this instruction is in the module.
- switch (TII.getOutliningType(*It)) {
- case TargetInstrInfo::MachineOutlinerInstrType::Illegal:
+ switch (TII.getOutliningType(It, Flags)) {
+ case InstrType::Illegal:
mapToIllegalUnsigned(It);
break;
- case TargetInstrInfo::MachineOutlinerInstrType::Legal:
+ case InstrType::Legal:
mapToLegalUnsigned(It);
break;
- case TargetInstrInfo::MachineOutlinerInstrType::Invisible:
+ case InstrType::LegalTerminator:
+ mapToLegalUnsigned(It);
+ InstrList.push_back(It);
+ UnsignedVec.push_back(IllegalInstrNumber);
+ IllegalInstrNumber--;
+ break;
+
+ case InstrType::Invisible:
break;
}
}
@@ -757,7 +671,7 @@ struct InstructionMapper {
}
};
-/// \brief An interprocedural pass which finds repeated sequences of
+/// An interprocedural pass which finds repeated sequences of
/// instructions and replaces them with calls to functions.
///
/// Each instruction is mapped to an unsigned integer and placed in a string.
@@ -770,10 +684,19 @@ struct MachineOutliner : public ModulePass {
static char ID;
- /// \brief Set to true if the outliner should consider functions with
+ /// Set to true if the outliner should consider functions with
/// linkonceodr linkage.
bool OutlineFromLinkOnceODRs = false;
+ /// Set to true if the outliner should run on all functions in the module
+ /// considered safe for outlining.
+ /// Set to true by default for compatibility with llc's -run-pass option.
+ /// Set when the pass is constructed in TargetPassConfig.
+ bool RunOnAllFunctions = true;
+
+ // Collection of IR functions created by the outliner.
+ std::vector<Function *> CreatedIRFunctions;
+
StringRef getPassName() const override { return "Machine Outliner"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -783,27 +706,35 @@ struct MachineOutliner : public ModulePass {
ModulePass::getAnalysisUsage(AU);
}
- MachineOutliner(bool OutlineFromLinkOnceODRs = false)
- : ModulePass(ID), OutlineFromLinkOnceODRs(OutlineFromLinkOnceODRs) {
+ MachineOutliner() : ModulePass(ID) {
initializeMachineOutlinerPass(*PassRegistry::getPassRegistry());
}
+ /// Remark output explaining that not outlining a set of candidates would be
+ /// better than outlining that set.
+ void emitNotOutliningCheaperRemark(
+ unsigned StringLen, std::vector<Candidate> &CandidatesForRepeatedSeq,
+ OutlinedFunction &OF);
+
+ /// Remark output explaining that a function was outlined.
+ void emitOutlinedFunctionRemark(OutlinedFunction &OF);
+
/// Find all repeated substrings that satisfy the outlining cost model.
///
/// If a substring appears at least twice, then it must be represented by
- /// an internal node which appears in at least two suffixes. Each suffix is
- /// represented by a leaf node. To do this, we visit each internal node in
- /// the tree, using the leaf children of each internal node. If an internal
- /// node represents a beneficial substring, then we use each of its leaf
- /// children to find the locations of its substring.
+ /// an internal node which appears in at least two suffixes. Each suffix
+ /// is represented by a leaf node. To do this, we visit each internal node
+ /// in the tree, using the leaf children of each internal node. If an
+ /// internal node represents a beneficial substring, then we use each of
+ /// its leaf children to find the locations of its substring.
///
/// \param ST A suffix tree to query.
/// \param TII TargetInstrInfo for the target.
/// \param Mapper Contains outlining mapping information.
/// \param[out] CandidateList Filled with candidates representing each
/// beneficial substring.
- /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions each
- /// type of candidate.
+ /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions
+ /// each type of candidate.
///
/// \returns The length of the longest candidate found.
unsigned
@@ -812,7 +743,7 @@ struct MachineOutliner : public ModulePass {
std::vector<std::shared_ptr<Candidate>> &CandidateList,
std::vector<OutlinedFunction> &FunctionList);
- /// \brief Replace the sequences of instructions represented by the
+ /// Replace the sequences of instructions represented by the
/// \p Candidates in \p CandidateList with calls to \p MachineFunctions
/// described in \p FunctionList.
///
@@ -852,7 +783,7 @@ struct MachineOutliner : public ModulePass {
/// Removes \p C from the candidate list, and updates its \p OutlinedFunction.
void prune(Candidate &C, std::vector<OutlinedFunction> &FunctionList);
- /// \brief Remove any overlapping candidates that weren't handled by the
+ /// Remove any overlapping candidates that weren't handled by the
/// suffix tree's pruning method.
///
/// Pruning from the suffix tree doesn't necessarily remove all overlaps.
@@ -873,6 +804,16 @@ struct MachineOutliner : public ModulePass {
/// Construct a suffix tree on the instructions in \p M and outline repeated
/// strings from that tree.
bool runOnModule(Module &M) override;
+
+ /// Return a DISubprogram for OF if one exists, and null otherwise. Helper
+ /// function for remark emission.
+ DISubprogram *getSubprogramOrNull(const OutlinedFunction &OF) {
+ DISubprogram *SP;
+ for (const std::shared_ptr<Candidate> &C : OF.Candidates)
+ if (C && C->getMF() && (SP = C->getMF()->getFunction().getSubprogram()))
+ return SP;
+ return nullptr;
+ }
};
} // Anonymous namespace.
@@ -880,8 +821,10 @@ struct MachineOutliner : public ModulePass {
char MachineOutliner::ID = 0;
namespace llvm {
-ModulePass *createMachineOutlinerPass(bool OutlineFromLinkOnceODRs) {
- return new MachineOutliner(OutlineFromLinkOnceODRs);
+ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions) {
+ MachineOutliner *OL = new MachineOutliner();
+ OL->RunOnAllFunctions = RunOnAllFunctions;
+ return OL;
}
} // namespace llvm
@@ -889,6 +832,65 @@ ModulePass *createMachineOutlinerPass(bool OutlineFromLinkOnceODRs) {
INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
false)
+void MachineOutliner::emitNotOutliningCheaperRemark(
+ unsigned StringLen, std::vector<Candidate> &CandidatesForRepeatedSeq,
+ OutlinedFunction &OF) {
+ Candidate &C = CandidatesForRepeatedSeq.front();
+ MachineOptimizationRemarkEmitter MORE(*(C.getMF()), nullptr);
+ MORE.emit([&]() {
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "NotOutliningCheaper",
+ C.front()->getDebugLoc(), C.getMBB());
+ R << "Did not outline " << NV("Length", StringLen) << " instructions"
+ << " from " << NV("NumOccurrences", CandidatesForRepeatedSeq.size())
+ << " locations."
+ << " Bytes from outlining all occurrences ("
+ << NV("OutliningCost", OF.getOutliningCost()) << ")"
+ << " >= Unoutlined instruction bytes ("
+ << NV("NotOutliningCost", OF.getNotOutlinedCost()) << ")"
+ << " (Also found at: ";
+
+ // Tell the user the other places the candidate was found.
+ for (unsigned i = 1, e = CandidatesForRepeatedSeq.size(); i < e; i++) {
+ R << NV((Twine("OtherStartLoc") + Twine(i)).str(),
+ CandidatesForRepeatedSeq[i].front()->getDebugLoc());
+ if (i != e - 1)
+ R << ", ";
+ }
+
+ R << ")";
+ return R;
+ });
+}
+
+void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
+ MachineBasicBlock *MBB = &*OF.MF->begin();
+ MachineOptimizationRemarkEmitter MORE(*OF.MF, nullptr);
+ MachineOptimizationRemark R(DEBUG_TYPE, "OutlinedFunction",
+ MBB->findDebugLoc(MBB->begin()), MBB);
+ R << "Saved " << NV("OutliningBenefit", OF.getBenefit()) << " bytes by "
+ << "outlining " << NV("Length", OF.Sequence.size()) << " instructions "
+ << "from " << NV("NumOccurrences", OF.getOccurrenceCount())
+ << " locations. "
+ << "(Found at: ";
+
+ // Tell the user the other places the candidate was found.
+ for (size_t i = 0, e = OF.Candidates.size(); i < e; i++) {
+
+ // Skip over things that were pruned.
+ if (!OF.Candidates[i]->InCandidateList)
+ continue;
+
+ R << NV((Twine("StartLoc") + Twine(i)).str(),
+ OF.Candidates[i]->front()->getDebugLoc());
+ if (i != e - 1)
+ R << ", ";
+ }
+
+ R << ")";
+
+ MORE.emit(R);
+}
+
unsigned MachineOutliner::findCandidates(
SuffixTree &ST, const TargetInstrInfo &TII, InstructionMapper &Mapper,
std::vector<std::shared_ptr<Candidate>> &CandidateList,
@@ -923,14 +925,6 @@ unsigned MachineOutliner::findCandidates(
// this vector.
std::vector<Candidate> CandidatesForRepeatedSeq;
- // Describes the start and end point of each candidate. This allows the
- // target to infer some information about each occurrence of each repeated
- // sequence.
- // FIXME: CandidatesForRepeatedSeq and this should be combined.
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- RepeatedSequenceLocs;
-
// Figure out the call overhead for each instance of the sequence.
for (auto &ChildPair : Parent.Children) {
SuffixTreeNode *M = ChildPair.second;
@@ -966,17 +960,18 @@ unsigned MachineOutliner::findCandidates(
CandidatesForRepeatedSeq.end(),
[&StartIdx, &EndIdx](const Candidate &C) {
return (EndIdx < C.getStartIdx() ||
- StartIdx > C.getEndIdx());
+ StartIdx > C.getEndIdx());
})) {
// It doesn't overlap with anything, so we can outline it.
// Each sequence is over [StartIt, EndIt].
+ // Save the candidate and its location.
+
MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
- // Save the candidate and its location.
- CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen,
+ CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
+ EndIt, StartIt->getParent(),
FunctionList.size());
- RepeatedSequenceLocs.emplace_back(std::make_pair(StartIt, EndIt));
}
}
}
@@ -984,69 +979,33 @@ unsigned MachineOutliner::findCandidates(
// We've found something we might want to outline.
// Create an OutlinedFunction to store it and check if it'd be beneficial
// to outline.
- TargetInstrInfo::MachineOutlinerInfo MInfo =
- TII.getOutlininingCandidateInfo(RepeatedSequenceLocs);
+ OutlinedFunction OF =
+ TII.getOutliningCandidateInfo(CandidatesForRepeatedSeq);
+
+ // If we deleted every candidate, then there's nothing to outline.
+ if (OF.Candidates.empty())
+ continue;
+
std::vector<unsigned> Seq;
for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
Seq.push_back(ST.Str[i]);
- OutlinedFunction OF(FunctionList.size(), CandidatesForRepeatedSeq.size(),
- Seq, MInfo);
- unsigned Benefit = OF.getBenefit();
+ OF.Sequence = Seq;
+ OF.Name = FunctionList.size();
// Is it better to outline this candidate than not?
- if (Benefit < 1) {
- // Outlining this candidate would take more instructions than not
- // outlining.
- // Emit a remark explaining why we didn't outline this candidate.
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator> C =
- RepeatedSequenceLocs[0];
- MachineOptimizationRemarkEmitter MORE(
- *(C.first->getParent()->getParent()), nullptr);
- MORE.emit([&]() {
- MachineOptimizationRemarkMissed R(DEBUG_TYPE, "NotOutliningCheaper",
- C.first->getDebugLoc(),
- C.first->getParent());
- R << "Did not outline " << NV("Length", StringLen) << " instructions"
- << " from " << NV("NumOccurrences", RepeatedSequenceLocs.size())
- << " locations."
- << " Instructions from outlining all occurrences ("
- << NV("OutliningCost", OF.getOutliningCost()) << ")"
- << " >= Unoutlined instruction count ("
- << NV("NotOutliningCost", StringLen * OF.getOccurrenceCount()) << ")"
- << " (Also found at: ";
-
- // Tell the user the other places the candidate was found.
- for (unsigned i = 1, e = RepeatedSequenceLocs.size(); i < e; i++) {
- R << NV((Twine("OtherStartLoc") + Twine(i)).str(),
- RepeatedSequenceLocs[i].first->getDebugLoc());
- if (i != e - 1)
- R << ", ";
- }
-
- R << ")";
- return R;
- });
-
- // Move to the next candidate.
+ if (OF.getBenefit() < 1) {
+ emitNotOutliningCheaperRemark(StringLen, CandidatesForRepeatedSeq, OF);
continue;
}
if (StringLen > MaxLen)
MaxLen = StringLen;
- // At this point, the candidate class is seen as beneficial. Set their
- // benefit values and save them in the candidate list.
- std::vector<std::shared_ptr<Candidate>> CandidatesForFn;
- for (Candidate &C : CandidatesForRepeatedSeq) {
- C.Benefit = Benefit;
- C.MInfo = MInfo;
- std::shared_ptr<Candidate> Cptr = std::make_shared<Candidate>(C);
- CandidateList.push_back(Cptr);
- CandidatesForFn.push_back(Cptr);
- }
-
+ // The function is beneficial. Save its candidates to the candidate list
+ // for pruning.
+ for (std::shared_ptr<Candidate> &C : OF.Candidates)
+ CandidateList.push_back(C);
FunctionList.push_back(OF);
- FunctionList.back().Candidates = CandidatesForFn;
// Move to the next function.
Parent.IsInTree = false;
@@ -1067,11 +1026,11 @@ void MachineOutliner::prune(Candidate &C,
// Remove C from the CandidateList.
C.InCandidateList = false;
- DEBUG(dbgs() << "- Removed a Candidate \n";
- dbgs() << "--- Num fns left for candidate: " << F.getOccurrenceCount()
- << "\n";
- dbgs() << "--- Candidate's functions's benefit: " << F.getBenefit()
- << "\n";);
+ LLVM_DEBUG(dbgs() << "- Removed a Candidate \n";
+ dbgs() << "--- Num fns left for candidate: "
+ << F.getOccurrenceCount() << "\n";
+ dbgs() << "--- Candidate's function's benefit: " << F.getBenefit()
+ << "\n";);
}
void MachineOutliner::pruneOverlaps(
@@ -1119,7 +1078,7 @@ void MachineOutliner::pruneOverlaps(
if (C1.getStartIdx() > MaxCandidateLen)
FarthestPossibleIdx = C1.getStartIdx() - MaxCandidateLen;
- // Compare against the candidates in the list that start at at most
+ // Compare against the candidates in the list that start at most
// FarthestPossibleIdx indices away from C1. There are at most
// MaxCandidateLen of these.
for (auto Sit = It + 1; Sit != Et; Sit++) {
@@ -1205,9 +1164,20 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
// NOTE: If this is linkonceodr, then we can take advantage of linker deduping
// which gives us better results when we outline from linkonceodr functions.
- F->setLinkage(GlobalValue::PrivateLinkage);
+ F->setLinkage(GlobalValue::InternalLinkage);
F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ // FIXME: Set nounwind, so we don't generate eh_frame? Haven't verified it's
+ // necessary.
+
+ // Set optsize/minsize, so we don't insert padding between outlined
+ // functions.
+ F->addFnAttr(Attribute::OptimizeForSize);
+ F->addFnAttr(Attribute::MinSize);
+
+ // Save F so that we can add debug info later if we need to.
+ CreatedIRFunctions.push_back(F);
+
BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
IRBuilder<> Builder(EntryBB);
Builder.CreateRetVoid();
@@ -1221,8 +1191,6 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
// Insert the new function into the module.
MF.insert(MF.begin(), &MBB);
- TII.insertOutlinerPrologue(MBB, MF, OF.MInfo);
-
// Copy over the instructions for the function using the integer mappings in
// its sequence.
for (unsigned Str : OF.Sequence) {
@@ -1231,13 +1199,53 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
NewMI->dropMemRefs();
// Don't keep debug information for outlined instructions.
- // FIXME: This means outlined functions are currently undebuggable.
NewMI->setDebugLoc(DebugLoc());
MBB.insert(MBB.end(), NewMI);
}
- TII.insertOutlinerEpilogue(MBB, MF, OF.MInfo);
+ TII.buildOutlinedFrame(MBB, MF, OF);
+
+ // If there's a DISubprogram associated with this outlined function, then
+ // emit debug info for the outlined function.
+ if (DISubprogram *SP = getSubprogramOrNull(OF)) {
+ // We have a DISubprogram. Get its DICompileUnit.
+ DICompileUnit *CU = SP->getUnit();
+ DIBuilder DB(M, true, CU);
+ DIFile *Unit = SP->getFile();
+ Mangler Mg;
+
+ // Walk over each IR function we created in the outliner and create
+ // DISubprograms for each function.
+ for (Function *F : CreatedIRFunctions) {
+ // Get the mangled name of the function for the linkage name.
+ std::string Dummy;
+ llvm::raw_string_ostream MangledNameStream(Dummy);
+ Mg.getNameWithPrefix(MangledNameStream, F, false);
+
+ DISubprogram *SP = DB.createFunction(
+ Unit /* Context */, F->getName(), StringRef(MangledNameStream.str()),
+ Unit /* File */,
+ 0 /* Line 0 is reserved for compiler-generated code. */,
+ DB.createSubroutineType(
+ DB.getOrCreateTypeArray(None)), /* void type */
+ false, true, 0, /* Line 0 is reserved for compiler-generated code. */
+ DINode::DIFlags::FlagArtificial /* Compiler-generated code. */,
+ true /* Outlined code is optimized code by definition. */);
+
+ // Don't add any new variables to the subprogram.
+ DB.finalizeSubprogram(SP);
+
+ // Attach subprogram to the function.
+ F->setSubprogram(SP);
+ }
+
+ // We're done with the DIBuilder.
+ DB.finalize();
+ }
+ // Outlined functions shouldn't preserve liveness.
+ MF.getProperties().reset(MachineFunctionProperties::Property::TracksLiveness);
+ MF.getRegInfo().freezeReservedRegs(MF);
return &MF;
}
@@ -1260,79 +1268,73 @@ bool MachineOutliner::outline(
if (OF.getBenefit() < 1)
continue;
- // If not, then outline it.
- assert(C.getStartIdx() < Mapper.InstrList.size() &&
- "Candidate out of bounds!");
- MachineBasicBlock *MBB = (*Mapper.InstrList[C.getStartIdx()]).getParent();
- MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.getStartIdx()];
- unsigned EndIdx = C.getEndIdx();
-
- assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!");
- MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
- assert(EndIt != MBB->end() && "EndIt out of bounds!");
-
- EndIt++; // Erase needs one past the end index.
-
// Does this candidate have a function yet?
if (!OF.MF) {
OF.MF = createOutlinedFunction(M, OF, Mapper);
- MachineBasicBlock *MBB = &*OF.MF->begin();
-
- // Output a remark telling the user that an outlined function was created,
- // and explaining where it came from.
- MachineOptimizationRemarkEmitter MORE(*OF.MF, nullptr);
- MachineOptimizationRemark R(DEBUG_TYPE, "OutlinedFunction",
- MBB->findDebugLoc(MBB->begin()), MBB);
- R << "Saved " << NV("OutliningBenefit", OF.getBenefit())
- << " instructions by "
- << "outlining " << NV("Length", OF.Sequence.size()) << " instructions "
- << "from " << NV("NumOccurrences", OF.getOccurrenceCount())
- << " locations. "
- << "(Found at: ";
-
- // Tell the user the other places the candidate was found.
- for (size_t i = 0, e = OF.Candidates.size(); i < e; i++) {
-
- // Skip over things that were pruned.
- if (!OF.Candidates[i]->InCandidateList)
- continue;
-
- R << NV(
- (Twine("StartLoc") + Twine(i)).str(),
- Mapper.InstrList[OF.Candidates[i]->getStartIdx()]->getDebugLoc());
- if (i != e - 1)
- R << ", ";
- }
-
- R << ")";
-
- MORE.emit(R);
+ emitOutlinedFunctionRemark(OF);
FunctionsCreated++;
}
MachineFunction *MF = OF.MF;
+ MachineBasicBlock &MBB = *C.getMBB();
+ MachineBasicBlock::iterator StartIt = C.front();
+ MachineBasicBlock::iterator EndIt = C.back();
+ assert(StartIt != C.getMBB()->end() && "StartIt out of bounds!");
+ assert(EndIt != C.getMBB()->end() && "EndIt out of bounds!");
+
const TargetSubtargetInfo &STI = MF->getSubtarget();
const TargetInstrInfo &TII = *STI.getInstrInfo();
// Insert a call to the new function and erase the old sequence.
- TII.insertOutlinedCall(M, *MBB, StartIt, *MF, C.MInfo);
- StartIt = Mapper.InstrList[C.getStartIdx()];
- MBB->erase(StartIt, EndIt);
+ auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *OF.MF, C);
+
+ // If the caller tracks liveness, then we need to make sure that anything
+ // we outline doesn't break liveness assumptions.
+ // The outlined functions themselves currently don't track liveness, but
+ // we should make sure that the ranges we yank things out of aren't
+ // wrong.
+ if (MBB.getParent()->getProperties().hasProperty(
+ MachineFunctionProperties::Property::TracksLiveness)) {
+ // Helper lambda for adding implicit def operands to the call instruction.
+ auto CopyDefs = [&CallInst](MachineInstr &MI) {
+ for (MachineOperand &MOP : MI.operands()) {
+ // Skip over anything that isn't a register.
+ if (!MOP.isReg())
+ continue;
+
+ // If it's a def, add it to the call instruction.
+ if (MOP.isDef())
+ CallInst->addOperand(
+ MachineOperand::CreateReg(MOP.getReg(), true, /* isDef = true */
+ true /* isImp = true */));
+ }
+ };
+
+ // Copy over the defs in the outlined range.
+ // First inst in outlined range <-- Anything that's defined in this
+ // ... .. range has to be added as an implicit
+ // Last inst in outlined range <-- def to the call instruction.
+ std::for_each(CallInst, std::next(EndIt), CopyDefs);
+ }
+ // Erase from the point after where the call was inserted up to, and
+ // including, the final instruction in the sequence.
+ // Erase needs one past the end, so we need std::next there too.
+ MBB.erase(std::next(StartIt), std::next(EndIt));
OutlinedSomething = true;
// Statistics.
NumOutlined++;
}
- DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";);
+ LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";);
return OutlinedSomething;
}
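
A small detail in the rewritten outline() loop above: the candidate range [StartIt, EndIt] is inclusive, while erase() wants a half-open range, hence the std::next calls around the erase. The same idiom on a plain std::list:

    #include <iostream>
    #include <iterator>
    #include <list>

    int main() {
      std::list<int> L = {10, 20, 30, 40, 50};

      // We want to erase the inclusive range [First, Last] = {20, 30, 40}.
      auto First = std::next(L.begin());    // -> 20
      auto Last = std::next(L.begin(), 3);  // -> 40

      // erase() takes a half-open range, so pass one past Last.
      L.erase(First, std::next(Last));

      for (int V : L)
        std::cout << V << ' ';              // prints: 10 50
      std::cout << '\n';
    }
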
bool MachineOutliner::runOnModule(Module &M) {
-
- // Is there anything in the module at all?
+ // Check if there's anything in the module. If it's empty, then there's
+ // nothing to outline.
if (M.empty())
return false;
@@ -1342,25 +1344,67 @@ bool MachineOutliner::runOnModule(Module &M) {
const TargetRegisterInfo *TRI = STI.getRegisterInfo();
const TargetInstrInfo *TII = STI.getInstrInfo();
+ // If the user passed -enable-machine-outliner=always or
+ // -enable-machine-outliner, the pass will run on all functions in the module.
+ // Otherwise, if the target supports default outlining, it will run on all
+ // functions deemed by the target to be worth outlining from by default. Tell
+ // the user how the outliner is running.
+ LLVM_DEBUG(
+ dbgs() << "Machine Outliner: Running on ";
+ if (RunOnAllFunctions)
+ dbgs() << "all functions";
+ else
+ dbgs() << "target-default functions";
+ dbgs() << "\n"
+ );
+
+ // If the user specifies that they want to outline from linkonceodrs, set
+ // it here.
+ OutlineFromLinkOnceODRs = EnableLinkOnceODROutlining;
+
InstructionMapper Mapper;
- // Build instruction mappings for each function in the module.
+ // Build instruction mappings for each function in the module. Start by
+ // iterating over each Function in M.
for (Function &F : M) {
- MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
- // Is the function empty? Safe to outline from?
- if (F.empty() ||
- !TII->isFunctionSafeToOutlineFrom(MF, OutlineFromLinkOnceODRs))
+ // If there's nothing in F, then there's no reason to try and outline from
+ // it.
+ if (F.empty())
+ continue;
+
+ // There's something in F. Check if it has a MachineFunction associated with
+ // it.
+ MachineFunction *MF = MMI.getMachineFunction(F);
+
+ // If it doesn't, then there's nothing to outline from. Move to the next
+ // Function.
+ if (!MF)
+ continue;
+
+ if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF))
continue;
- // If it is, look at each MachineBasicBlock in the function.
- for (MachineBasicBlock &MBB : MF) {
+ // We have a MachineFunction. Ask the target if it's suitable for outlining.
+ // If it isn't, then move on to the next Function in the module.
+ if (!TII->isFunctionSafeToOutlineFrom(*MF, OutlineFromLinkOnceODRs))
+ continue;
- // Is there anything in MBB?
+ // We have a function suitable for outlining. Iterate over every
+ // MachineBasicBlock in MF and try to map its instructions to a list of
+ // unsigned integers.
+ for (MachineBasicBlock &MBB : *MF) {
+ // If there isn't anything in MBB, then there's no point in outlining from
+ // it.
if (MBB.empty())
continue;
- // If yes, map it.
+ // Check if MBB could be the target of an indirect branch. If it is, then
+ // we don't want to outline from it.
+ if (MBB.hasAddressTaken())
+ continue;
+
+ // MBB is suitable for outlining. Map it to a list of unsigneds.
Mapper.convertToUnsignedVec(MBB, *TRI, *TII);
}
}
@@ -1378,5 +1422,7 @@ bool MachineOutliner::runOnModule(Module &M) {
pruneOverlaps(CandidateList, FunctionList, Mapper, MaxCandidateLen, *TII);
// Outline each of the candidates and return true if something was outlined.
- return outline(M, CandidateList, FunctionList, Mapper);
+ bool OutlinedSomething = outline(M, CandidateList, FunctionList, Mapper);
+
+ return OutlinedSomething;
}
diff --git a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp
index 18cb9af499a6..9bb00aaef86d 100644
--- a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -10,14 +10,14 @@
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
//
// Software pipelining (SWP) is an instruction scheduling technique for loops
-// that overlap loop iterations and explioits ILP via a compiler transformation.
+// that overlap loop iterations and exploits ILP via a compiler transformation.
//
// Swing Modulo Scheduling is an implementation of software pipelining
// that generates schedules that are near optimal in terms of initiation
// interval, register requirements, and stage count. See the papers:
//
// "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
-// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Processings of the 1996
+// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
// Conference on Parallel Architectures and Compilation Techniques.
//
// "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
@@ -93,6 +93,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -125,6 +126,7 @@ using namespace llvm;
STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
+STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
@@ -138,7 +140,7 @@ static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
/// A command line argument to limit minimum initial interval for pipelining.
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
- cl::desc("Size limit for the the MII."),
+ cl::desc("Size limit for the MII."),
cl::Hidden, cl::init(27));
/// A command line argument to limit the number of stages in the pipeline.
@@ -217,6 +219,7 @@ public:
}
private:
+ void preprocessPhiNodes(MachineBasicBlock &B);
bool canPipelineLoop(MachineLoop &L);
bool scheduleLoop(MachineLoop &L);
bool swingModuloScheduler(MachineLoop &L);
@@ -241,6 +244,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
struct NodeInfo {
int ASAP = 0;
int ALAP = 0;
+ int ZeroLatencyDepth = 0;
+ int ZeroLatencyHeight = 0;
NodeInfo() = default;
};
@@ -313,15 +318,27 @@ public:
/// Return the latest time an instruction may be scheduled.
int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
- /// The mobility function, which the the number of slots in which
+ /// The mobility function, which is the number of slots in which
/// an instruction may be scheduled.
int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
/// The depth, in the dependence graph, for a node.
- int getDepth(SUnit *Node) { return Node->getDepth(); }
+ unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
+
+ /// The maximum unweighted length of a path from an arbitrary node to the
+ /// given node in which each edge has latency 0
+ int getZeroLatencyDepth(SUnit *Node) {
+ return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
+ }
/// The height, in the dependence graph, for a node.
- int getHeight(SUnit *Node) { return Node->getHeight(); }
+ unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
+
+ /// The maximum unweighted length of a path from the given node to an
+ /// arbitrary node in which each edge has latency 0
+ int getZeroLatencyHeight(SUnit *Node) {
+ return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
+ }
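
getASAP/getALAP/getMOV above are the classic scheduling measures: the earliest and latest cycles a node may be placed in, and the size of the window between them (its mobility). A trivial numeric sketch:

    #include <iostream>

    int main() {
      // Suppose dependence analysis gives a node these bounds (in cycles):
      int ASAP = 2;  // earliest slot consistent with its predecessors
      int ALAP = 5;  // latest slot consistent with its successors

      // Mobility, as in getMOV(): how many slots the scheduler may choose from.
      int MOV = ALAP - ASAP;                      // 3
      std::cout << "mobility = " << MOV << "\n";  // a node with MOV == 0 is critical
    }
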
/// Return true if the dependence is a back-edge in the data dependence graph.
/// Since the DAG doesn't contain cycles, we represent a cycle in the graph
@@ -332,29 +349,7 @@ public:
return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
}
- /// Return true if the dependence is an order dependence between non-Phis.
- static bool isOrder(SUnit *Source, const SDep &Dep) {
- if (Dep.getKind() != SDep::Order)
- return false;
- return (!Source->getInstr()->isPHI() &&
- !Dep.getSUnit()->getInstr()->isPHI());
- }
-
- bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
-
- /// The latency of the dependence.
- unsigned getLatency(SUnit *Source, const SDep &Dep) {
- // Anti dependences represent recurrences, so use the latency of the
- // instruction on the back-edge.
- if (Dep.getKind() == SDep::Anti) {
- if (Source->getInstr()->isPHI())
- return Dep.getSUnit()->Latency;
- if (Dep.getSUnit()->getInstr()->isPHI())
- return Source->Latency;
- return Dep.getLatency();
- }
- return Dep.getLatency();
- }
+ bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
/// The distance function, which indicates that operation V of iteration I
/// depends on operations U of iteration I-distance.
@@ -404,6 +399,7 @@ private:
void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
SetVector<SUnit *> &NodesAdded);
void computeNodeOrder(NodeSetType &NodeSets);
+ void checkValidNodeOrder(const NodeSetType &Circuits) const;
bool schedulePipeline(SMSchedule &Schedule);
void generatePipelinedLoop(SMSchedule &Schedule);
void generateProlog(SMSchedule &Schedule, unsigned LastStage,
@@ -438,7 +434,7 @@ private:
unsigned InstStageNum,
SMSchedule &Schedule);
void updateInstruction(MachineInstr *NewMI, bool LastDef,
- unsigned CurStageNum, unsigned InstStageNum,
+ unsigned CurStageNum, unsigned InstrStageNum,
SMSchedule &Schedule, ValueMapTy *VRMap);
MachineInstr *findDefInLoop(unsigned Reg);
unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
@@ -465,15 +461,22 @@ class NodeSet {
bool HasRecurrence = false;
unsigned RecMII = 0;
int MaxMOV = 0;
- int MaxDepth = 0;
+ unsigned MaxDepth = 0;
unsigned Colocate = 0;
SUnit *ExceedPressure = nullptr;
+ unsigned Latency = 0;
public:
using iterator = SetVector<SUnit *>::const_iterator;
NodeSet() = default;
- NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
+ NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
+ Latency = 0;
+ for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
+ for (const SDep &Succ : Nodes[i]->Succs)
+ if (Nodes.count(Succ.getSUnit()))
+ Latency += Succ.getLatency();
+ }
bool insert(SUnit *SU) { return Nodes.insert(SU); }
@@ -513,6 +516,10 @@ public:
}
}
+ unsigned getLatency() { return Latency; }
+
+ unsigned getMaxDepth() { return MaxDepth; }
+
void clear() {
Nodes.clear();
RecMII = 0;
@@ -563,7 +570,7 @@ public:
#endif
};
-/// This class repesents the scheduled code. The main data structure is a
+/// This class represents the scheduled code. The main data structure is a
/// map from scheduled cycle to instructions. During scheduling, the
/// data structure explicitly represents all stages/iterations. When
/// the algorithm finishes, the schedule is collapsed into a single stage,
@@ -700,10 +707,10 @@ public:
bool isValidSchedule(SwingSchedulerDAG *SSD);
void finalizeSchedule(SwingSchedulerDAG *SSD);
- bool orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
+ void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
std::deque<SUnit *> &Insts);
bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
- bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Inst,
+ bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
MachineOperand &MO);
void print(raw_ostream &os) const;
void dump() const;
@@ -804,20 +811,41 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
if (!L.getLoopPreheader())
return false;
- // If any of the Phis contain subregs, then we can't pipeline
- // because we don't know how to maintain subreg information in the
- // VMap structure.
- MachineBasicBlock *MBB = L.getHeader();
- for (MachineBasicBlock::iterator BBI = MBB->instr_begin(),
- BBE = MBB->getFirstNonPHI();
- BBI != BBE; ++BBI)
- for (unsigned i = 1; i != BBI->getNumOperands(); i += 2)
- if (BBI->getOperand(i).getSubReg() != 0)
- return false;
-
+ // Remove any subregisters from inputs to phi nodes.
+ preprocessPhiNodes(*L.getHeader());
return true;
}
+void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
+
+ for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
+ MachineOperand &DefOp = PI.getOperand(0);
+ assert(DefOp.getSubReg() == 0);
+ auto *RC = MRI.getRegClass(DefOp.getReg());
+
+ for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
+ MachineOperand &RegOp = PI.getOperand(i);
+ if (RegOp.getSubReg() == 0)
+ continue;
+
+ // If the operand uses a subregister, replace it with a new register
+ // without subregisters, and generate a copy to the new register.
+ unsigned NewReg = MRI.createVirtualRegister(RC);
+ MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
+ MachineBasicBlock::iterator At = PredB.getFirstTerminator();
+ const DebugLoc &DL = PredB.findDebugLoc(At);
+ auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(RegOp.getReg(), getRegState(RegOp),
+ RegOp.getSubReg());
+ Slots.insertMachineInstrInMaps(*Copy);
+ RegOp.setReg(NewReg);
+ RegOp.setSubReg(0);
+ }
+ }
+}
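A sketch of the transformation performed by preprocessPhiNodes(), with hypothetical
register names: a header phi such as %v = PHI %a.sub_lo, %preheader, %b.sub_lo, %latch
has each subregister operand replaced by a fresh full-width register defined by a COPY
inserted before the predecessor's terminator, e.g. %t = COPY %a.sub_lo in %preheader,
so the phi itself no longer carries subregister indices.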
+
/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
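A minimal sketch of the driver loop that follows, assuming a hypothetical
tryScheduleWithII() wrapper around the body of schedulePipeline(): the pass starts at
the computed MII and widens the initiation interval until a schedule is found.

  bool Found = false;
  for (unsigned II = MII; II < MII + 10 && !Found; ++II)
    Found = tryScheduleWithII(II); // hypothetical helper; mirrors schedulePipeline()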
@@ -858,13 +886,14 @@ void SwingSchedulerDAG::schedule() {
Topo.InitDAGTopologicalSorting();
postprocessDAG();
changeDependences();
- DEBUG({
+ LLVM_DEBUG({
for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
SUnits[su].dumpAll(this);
});
NodeSetType NodeSets;
findCircuits(NodeSets);
+ NodeSetType Circuits = NodeSets;
// Calculate the MII.
unsigned ResMII = calculateResMII();
@@ -877,8 +906,8 @@ void SwingSchedulerDAG::schedule() {
RecMII = 0;
MII = std::max(ResMII, RecMII);
- DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII << ", res=" << ResMII
- << ")\n");
+ LLVM_DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII
+ << ", res=" << ResMII << ")\n");
// Can't schedule a loop without a valid MII.
if (MII == 0)
@@ -896,20 +925,20 @@ void SwingSchedulerDAG::schedule() {
checkNodeSets(NodeSets);
- DEBUG({
+ LLVM_DEBUG({
for (auto &I : NodeSets) {
dbgs() << " Rec NodeSet ";
I.dump();
}
});
- std::sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
+ std::stable_sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
groupRemainingNodes(NodeSets);
removeDuplicateNodes(NodeSets);
- DEBUG({
+ LLVM_DEBUG({
for (auto &I : NodeSets) {
dbgs() << " NodeSet ";
I.dump();
@@ -918,6 +947,9 @@ void SwingSchedulerDAG::schedule() {
computeNodeOrder(NodeSets);
+ // Check for node order issues.
+ checkValidNodeOrder(Circuits);
+
SMSchedule Schedule(Pass.MF);
Scheduled = schedulePipeline(Schedule);
@@ -972,7 +1004,7 @@ static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
return 0;
}
-/// Return the Phi register value that comes the the loop block.
+/// Return the Phi register value that comes from the loop block.
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
if (Phi.getOperand(i + 1).getMBB() == LoopBB)
@@ -1022,6 +1054,13 @@ static void getUnderlyingObjects(MachineInstr *MI,
if (!MM->getValue())
return;
GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
+ for (Value *V : Objs) {
+ if (!isIdentifiedObject(V)) {
+ Objs.clear();
+ return;
+ }
+ Objs.push_back(V);
+ }
}
/// Add a chain edge between a load and store if the store can be an
@@ -1030,6 +1069,8 @@ static void getUnderlyingObjects(MachineInstr *MI,
/// but that code doesn't create loop carried dependences.
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;
+ Value *UnknownValue =
+ UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
for (auto &SU : SUnits) {
MachineInstr &MI = *SU.getInstr();
if (isDependenceBarrier(MI, AA))
@@ -1037,6 +1078,8 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
else if (MI.mayLoad()) {
SmallVector<Value *, 4> Objs;
getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
+ if (Objs.empty())
+ Objs.push_back(UnknownValue);
for (auto V : Objs) {
SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
SUs.push_back(&SU);
@@ -1044,6 +1087,8 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
} else if (MI.mayStore()) {
SmallVector<Value *, 4> Objs;
getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
+ if (Objs.empty())
+ Objs.push_back(UnknownValue);
for (auto V : Objs) {
MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =
PendingLoads.find(V);
@@ -1058,33 +1103,39 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
// offset, then mark the dependence as loop carried potentially.
unsigned BaseReg1, BaseReg2;
int64_t Offset1, Offset2;
- if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
- !TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
- SU.addPred(SDep(Load, SDep::Barrier));
- continue;
- }
- if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
- assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
- "What happened to the chain edge?");
- SU.addPred(SDep(Load, SDep::Barrier));
- continue;
+ if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) &&
+ TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
+ if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
+ assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
+ "What happened to the chain edge?");
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
+ continue;
+ }
}
// Second, the more expensive check that uses alias analysis on the
// base registers. If they alias, and the load offset is less than
// the store offset, then mark the dependence as loop carried.
if (!AA) {
- SU.addPred(SDep(Load, SDep::Barrier));
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
continue;
}
MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
MachineMemOperand *MMO2 = *MI.memoperands_begin();
if (!MMO1->getValue() || !MMO2->getValue()) {
- SU.addPred(SDep(Load, SDep::Barrier));
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
continue;
}
if (MMO1->getValue() == MMO2->getValue() &&
MMO1->getOffset() <= MMO2->getOffset()) {
- SU.addPred(SDep(Load, SDep::Barrier));
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
continue;
}
AliasResult AAResult = AA->alias(
@@ -1093,8 +1144,11 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
MMO2->getAAInfo()));
- if (AAResult != NoAlias)
- SU.addPred(SDep(Load, SDep::Barrier));
+ if (AAResult != NoAlias) {
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
+ }
}
}
}
@@ -1136,6 +1190,7 @@ void SwingSchedulerDAG::updatePhiDependences() {
if (SU != nullptr && UseMI->isPHI()) {
if (!MI->isPHI()) {
SDep Dep(SU, SDep::Anti, Reg);
+ Dep.setLatency(1);
I.addPred(Dep);
} else {
HasPhiDef = Reg;
@@ -1382,7 +1437,7 @@ unsigned SwingSchedulerDAG::calculateResMII() {
/// Iterate over each circuit. Compute the delay(c) and distance(c)
/// for each circuit. The II needs to satisfy the inequality
/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
-/// II that satistifies the inequality, and the RecMII is the maximum
+/// II that satisfies the inequality, and the RecMII is the maximum
/// of those values.
unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
unsigned RecMII = 0;
@@ -1391,7 +1446,7 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
if (Nodes.empty())
continue;
- unsigned Delay = Nodes.size() - 1;
+ unsigned Delay = Nodes.getLatency();
unsigned Distance = 1;
// ii = ceil(delay / distance)
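A worked example with hypothetical numbers: a circuit whose in-set edge latencies sum
to delay(c) = 7 and whose back-edge spans distance(c) = 1 iteration needs
II >= ceil(7 / 1) = 7; RecMII is then the maximum of these per-circuit values.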
@@ -1437,10 +1492,23 @@ static void swapAntiDependences(std::vector<SUnit> &SUnits) {
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
SwingSchedulerDAG *DAG) {
BitVector Added(SUnits.size());
+ DenseMap<int, int> OutputDeps;
for (int i = 0, e = SUnits.size(); i != e; ++i) {
Added.reset();
// Add any successor to the adjacency matrix and exclude duplicates.
for (auto &SI : SUnits[i].Succs) {
+ // Only create a back-edge on the first and last nodes of a dependence
+ // chain. This records any chains and adds them later.
+ if (SI.getKind() == SDep::Output) {
+ int N = SI.getSUnit()->NodeNum;
+ int BackEdge = i;
+ auto Dep = OutputDeps.find(BackEdge);
+ if (Dep != OutputDeps.end()) {
+ BackEdge = Dep->second;
+ OutputDeps.erase(Dep);
+ }
+ OutputDeps[N] = BackEdge;
+ }
// Do not process a boundary node; a back-edge is processed only
// if it goes to a Phi.
if (SI.getSUnit()->isBoundaryNode() ||
@@ -1456,7 +1524,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
// adjacency matrix.
for (auto &PI : SUnits[i].Preds) {
if (!SUnits[i].getInstr()->mayStore() ||
- !DAG->isLoopCarriedOrder(&SUnits[i], PI, false))
+ !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
continue;
if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
int N = PI.getSUnit()->NodeNum;
@@ -1467,6 +1535,12 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
}
}
}
+ // Add back-edges in the adjacency matrix for the output dependences.
+ for (auto &OD : OutputDeps)
+ if (!Added.test(OD.second)) {
+ AdjK[OD.first].push_back(OD.second);
+ Added.set(OD.second);
+ }
}
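To illustrate the OutputDeps bookkeeping in createAdjacencyStructure() with
hypothetical node numbers: an output-dependence chain SU(0) -> SU(1) -> SU(2) on the
same register leaves the map holding {2: 0}, so a single back-edge from SU(2) to SU(0)
is added to the adjacency matrix rather than one edge per link of the chain.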
/// Identify an elementary circuit in the dependence graph starting at the
@@ -1543,7 +1617,7 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
}
/// Return true for DAG nodes that we ignore when computing the cost functions.
-/// We ignore the back-edge recurrence in order to avoid unbounded recurison
+/// We ignore the back-edge recurrence in order to avoid unbounded recursion
/// in the calculation of the ASAP, ALAP, etc functions.
static bool ignoreDependence(const SDep &D, bool isPred) {
if (D.isArtificial())
@@ -1560,7 +1634,7 @@ static bool ignoreDependence(const SDep &D, bool isPred) {
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
ScheduleInfo.resize(SUnits.size());
- DEBUG({
+ LLVM_DEBUG({
for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
E = Topo.end();
I != E; ++I) {
@@ -1570,49 +1644,59 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
});
int maxASAP = 0;
- // Compute ASAP.
+ // Compute ASAP and ZeroLatencyDepth.
for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
E = Topo.end();
I != E; ++I) {
int asap = 0;
+ int zeroLatencyDepth = 0;
SUnit *SU = &SUnits[*I];
for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
EP = SU->Preds.end();
IP != EP; ++IP) {
+ SUnit *pred = IP->getSUnit();
+ if (IP->getLatency() == 0)
+ zeroLatencyDepth =
+ std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
if (ignoreDependence(*IP, true))
continue;
- SUnit *pred = IP->getSUnit();
- asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
+ asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
getDistance(pred, SU, *IP) * MII));
}
maxASAP = std::max(maxASAP, asap);
ScheduleInfo[*I].ASAP = asap;
+ ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
}
- // Compute ALAP and MOV.
+ // Compute ALAP, ZeroLatencyHeight, and MOV.
for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
E = Topo.rend();
I != E; ++I) {
int alap = maxASAP;
+ int zeroLatencyHeight = 0;
SUnit *SU = &SUnits[*I];
for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
ES = SU->Succs.end();
IS != ES; ++IS) {
+ SUnit *succ = IS->getSUnit();
+ if (IS->getLatency() == 0)
+ zeroLatencyHeight =
+ std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
if (ignoreDependence(*IS, true))
continue;
- SUnit *succ = IS->getSUnit();
- alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
+ alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
getDistance(SU, succ, *IS) * MII));
}
ScheduleInfo[*I].ALAP = alap;
+ ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
}
// After computing the node functions, compute the summary for each node set.
for (NodeSet &I : NodeSets)
I.computeNodeSetInfo(this);
- DEBUG({
+ LLVM_DEBUG({
for (unsigned i = 0; i < SUnits.size(); i++) {
dbgs() << "\tNode " << i << ":\n";
dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
@@ -1620,6 +1704,8 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
+ dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
+ dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
}
});
}
@@ -1778,7 +1864,8 @@ void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
RecRPTracker.closeBottom();
std::vector<SUnit *> SUnits(NS.begin(), NS.end());
- std::sort(SUnits.begin(), SUnits.end(), [](const SUnit *A, const SUnit *B) {
+ llvm::sort(SUnits.begin(), SUnits.end(),
+ [](const SUnit *A, const SUnit *B) {
return A->NodeNum > B->NodeNum;
});
@@ -1796,9 +1883,10 @@ void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
CriticalPSets,
RecRegPressure.MaxSetPressure);
if (RPDelta.Excess.isValid()) {
- DEBUG(dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
- << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
- << ":" << RPDelta.Excess.getUnitInc());
+ LLVM_DEBUG(
+ dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
+ << ":" << RPDelta.Excess.getUnitInc());
NS.setExceedPressure(SU);
break;
}
@@ -1834,25 +1922,23 @@ void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
/// Check if the existing node-sets are profitable. If not, then ignore the
/// recurrent node-sets, and attempt to schedule all nodes together. This is
-/// a heuristic. If the MII is large and there is a non-recurrent node with
-/// a large depth compared to the MII, then it's best to try and schedule
-/// all instruction together instead of starting with the recurrent node-sets.
+/// a heuristic. If the MII is large and all the recurrent node-sets are small,
+/// then it's best to try to schedule all instructions together instead of
+/// starting with the recurrent node-sets.
void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
// Look for loops with a large MII.
- if (MII <= 20)
+ if (MII < 17)
return;
// Check if the node-set contains only a simple add recurrence.
- for (auto &NS : NodeSets)
- if (NS.size() > 2)
+ for (auto &NS : NodeSets) {
+ if (NS.getRecMII() > 2)
return;
- // If the depth of any instruction is significantly larger than the MII, then
- // ignore the recurrent node-sets and treat all instructions equally.
- for (auto &SU : SUnits)
- if (SU.getDepth() > MII * 1.5) {
- NodeSets.clear();
- DEBUG(dbgs() << "Clear recurrence node-sets\n");
+ if (NS.getMaxDepth() > MII)
return;
- }
+ }
+ NodeSets.clear();
+ LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
+ return;
}
/// Add the nodes that do not belong to a recurrence set into groups
@@ -1907,7 +1993,7 @@ void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
if (!NewSet.empty())
NodeSets.push_back(NewSet);
- // Create new nodes sets with the connected nodes any any remaining node that
+ // Create new node sets with the connected nodes and any remaining node that
// has no predecessor.
for (unsigned i = 0; i < SUnits.size(); ++i) {
SUnit *SU = &SUnits[i];
@@ -1988,14 +2074,6 @@ void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
}
}
-/// Return true if Inst1 defines a value that is used in Inst2.
-static bool hasDataDependence(SUnit *Inst1, SUnit *Inst2) {
- for (auto &SI : Inst1->Succs)
- if (SI.getSUnit() == Inst2 && SI.getKind() == SDep::Data)
- return true;
- return false;
-}
-
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
@@ -2005,59 +2083,62 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
NodeOrder.clear();
for (auto &Nodes : NodeSets) {
- DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
+ LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
OrderKind Order;
SmallSetVector<SUnit *, 8> N;
if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
R.insert(N.begin(), N.end());
Order = BottomUp;
- DEBUG(dbgs() << " Bottom up (preds) ");
+ LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
} else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
R.insert(N.begin(), N.end());
Order = TopDown;
- DEBUG(dbgs() << " Top down (succs) ");
+ LLVM_DEBUG(dbgs() << " Top down (succs) ");
} else if (isIntersect(N, Nodes, R)) {
// If some of the successors are in the existing node-set, then use the
// top-down ordering.
Order = TopDown;
- DEBUG(dbgs() << " Top down (intersect) ");
+ LLVM_DEBUG(dbgs() << " Top down (intersect) ");
} else if (NodeSets.size() == 1) {
for (auto &N : Nodes)
if (N->Succs.size() == 0)
R.insert(N);
Order = BottomUp;
- DEBUG(dbgs() << " Bottom up (all) ");
+ LLVM_DEBUG(dbgs() << " Bottom up (all) ");
} else {
// Find the node with the highest ASAP.
SUnit *maxASAP = nullptr;
for (SUnit *SU : Nodes) {
- if (maxASAP == nullptr || getASAP(SU) >= getASAP(maxASAP))
+ if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
+ (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
maxASAP = SU;
}
R.insert(maxASAP);
Order = BottomUp;
- DEBUG(dbgs() << " Bottom up (default) ");
+ LLVM_DEBUG(dbgs() << " Bottom up (default) ");
}
while (!R.empty()) {
if (Order == TopDown) {
// Choose the node with the maximum height. If more than one, choose
- // the node with the lowest MOV. If still more than one, check if there
- // is a dependence between the instructions.
+ // the node with the maximum ZeroLatencyHeight. If still more than one,
+ // choose the node with the lowest MOV.
while (!R.empty()) {
SUnit *maxHeight = nullptr;
for (SUnit *I : R) {
if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
maxHeight = I;
else if (getHeight(I) == getHeight(maxHeight) &&
- getMOV(I) < getMOV(maxHeight) &&
- !hasDataDependence(maxHeight, I))
+ getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
maxHeight = I;
- else if (hasDataDependence(I, maxHeight))
+ else if (getHeight(I) == getHeight(maxHeight) &&
+ getZeroLatencyHeight(I) ==
+ getZeroLatencyHeight(maxHeight) &&
+ getMOV(I) < getMOV(maxHeight))
maxHeight = I;
}
NodeOrder.insert(maxHeight);
- DEBUG(dbgs() << maxHeight->NodeNum << " ");
+ LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
R.remove(maxHeight);
for (const auto &I : maxHeight->Succs) {
if (Nodes.count(I.getSUnit()) == 0)
@@ -2080,28 +2161,29 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
}
}
Order = BottomUp;
- DEBUG(dbgs() << "\n Switching order to bottom up ");
+ LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
SmallSetVector<SUnit *, 8> N;
if (pred_L(NodeOrder, N, &Nodes))
R.insert(N.begin(), N.end());
} else {
// Choose the node with the maximum depth. If more than one, choose
- // the node with the lowest MOV. If there is still more than one, check
- // for a dependence between the instructions.
+ // the node with the maximum ZeroLatencyDepth. If still more than one,
+ // choose the node with the lowest MOV.
while (!R.empty()) {
SUnit *maxDepth = nullptr;
for (SUnit *I : R) {
if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
maxDepth = I;
else if (getDepth(I) == getDepth(maxDepth) &&
- getMOV(I) < getMOV(maxDepth) &&
- !hasDataDependence(I, maxDepth))
+ getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
maxDepth = I;
- else if (hasDataDependence(maxDepth, I))
+ else if (getDepth(I) == getDepth(maxDepth) &&
+ getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
+ getMOV(I) < getMOV(maxDepth))
maxDepth = I;
}
NodeOrder.insert(maxDepth);
- DEBUG(dbgs() << maxDepth->NodeNum << " ");
+ LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
R.remove(maxDepth);
if (Nodes.isExceedSU(maxDepth)) {
Order = TopDown;
@@ -2114,8 +2196,6 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
continue;
if (NodeOrder.count(I.getSUnit()) != 0)
continue;
- if (I.getKind() == SDep::Anti)
- continue;
R.insert(I.getSUnit());
}
// Back-edges are predecessors with an anti-dependence.
@@ -2130,16 +2210,16 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
}
}
Order = TopDown;
- DEBUG(dbgs() << "\n Switching order to top down ");
+ LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
SmallSetVector<SUnit *, 8> N;
if (succ_L(NodeOrder, N, &Nodes))
R.insert(N.begin(), N.end());
}
}
- DEBUG(dbgs() << "\nDone with Nodeset\n");
+ LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Node order: ";
for (SUnit *I : NodeOrder)
dbgs() << " " << I->NodeNum << " ";
@@ -2158,7 +2238,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
Schedule.reset();
Schedule.setInitiationInterval(II);
- DEBUG(dbgs() << "Try to schedule with " << II << "\n");
+ LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
SetVector<SUnit *>::iterator NI = NodeOrder.begin();
SetVector<SUnit *>::iterator NE = NodeOrder.end();
@@ -2175,12 +2255,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
int SchedStart = INT_MIN;
Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
II, this);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Inst (" << SU->NodeNum << ") ";
SU->getInstr()->dump();
dbgs() << "\n";
});
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
<< " me: " << SchedEnd << " ms: " << SchedStart << "\n";
});
@@ -2216,7 +2296,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
scheduleFound = false;
- DEBUG({
+ LLVM_DEBUG({
if (!scheduleFound)
dbgs() << "\tCan't schedule\n";
});
@@ -2227,7 +2307,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
scheduleFound = Schedule.isValidSchedule(this);
}
- DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
+ LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
if (scheduleFound)
Schedule.finalizeSchedule(this);
@@ -2250,7 +2330,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
// Remember the registers that are used in different stages. The index is
// the iteration, or stage, that the instruction is scheduled in. This is
- // a map between register names in the orignal block and the names created
+ // a map between register names in the original block and the names created
// in each stage of the pipelined loop.
ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
InstrMapTy InstrMap;
@@ -2297,7 +2377,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
InstrMap, MaxStageCount, MaxStageCount, false);
- DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
+ LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
SmallVector<MachineBasicBlock *, 4> EpilogBBs;
// Generate the epilog instructions to complete the pipeline.
@@ -2315,6 +2395,8 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
// Remove the original loop since it's no longer referenced.
+ for (auto &I : *BB)
+ LIS.RemoveMachineInstrFromMaps(I);
BB->clear();
BB->eraseFromParent();
@@ -2364,7 +2446,7 @@ void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
}
}
rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "prolog:\n";
NewBB->dump();
});
@@ -2431,7 +2513,9 @@ void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
continue;
MachineInstr *In = &BBI;
if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
- MachineInstr *NewMI = cloneInstr(In, EpilogStage - LastStage, 0);
+ // Instructions with memoperands in the epilog are updated with
+ // conservative values.
+ MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
NewBB->push_back(NewMI);
InstrMap[NewMI] = In;
@@ -2444,7 +2528,7 @@ void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
InstrMap, LastStage, EpilogStage, i == 1);
PredBB = NewBB;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "epilog:\n";
NewBB->dump();
});
@@ -2550,24 +2634,20 @@ void SwingSchedulerDAG::generateExistingPhis(
// of the Phi value.
unsigned NewReg = VRMap[PrevStage][LoopVal];
rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
- Def, NewReg);
+ Def, InitVal, NewReg);
if (VRMap[CurStageNum].count(LoopVal))
VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
}
// Adjust the number of Phis needed depending on the number of prologs left,
- // and the distance from where the Phi is first scheduled.
- unsigned NumPhis = NumStages;
- if (!InKernel && (int)PrologStage < LoopValStage)
- // The NumPhis is the maximum number of new Phis needed during the steady
- // state. If the Phi has not been scheduled in current prolog, then we
- // need to generate less Phis.
- NumPhis = std::max((int)NumPhis - (int)(LoopValStage - PrologStage), 1);
- // The number of Phis cannot exceed the number of prolog stages. Each
- // stage can potentially define two values.
- NumPhis = std::min(NumPhis, PrologStage + 2);
+ // and the distance from where the Phi is first scheduled. The number of
+ // Phis cannot exceed the number of prolog stages. Each stage can
+ // potentially define two values.
+ unsigned MaxPhis = PrologStage + 2;
+ if (!InKernel && (int)PrologStage <= LoopValStage)
+ MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
+ unsigned NumPhis = std::min(NumStages, MaxPhis);
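With hypothetical values: PrologStage = 2 gives MaxPhis = 4; if the Phi is not in the
kernel and its loop value is scheduled at LoopValStage = 3, MaxPhis becomes
max(4 - 3, 1) = 1, so NumPhis = min(NumStages, MaxPhis) = 1 when NumStages = 3.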
unsigned NewReg = 0;
-
unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
// In the epilog, we may need to look back one stage to get the correct
// Phi name because the epilog and prolog blocks execute the same stage.
@@ -2659,19 +2739,20 @@ void SwingSchedulerDAG::generateExistingPhis(
// references another Phi, and the other Phi is scheduled in an
// earlier stage. We can try to reuse an existing Phi up until the last
// stage of the current Phi.
- if (LoopDefIsPhi && (int)PrologStage >= StageScheduled) {
+ if (LoopDefIsPhi && (int)(PrologStage - np) >= StageScheduled) {
int LVNumStages = Schedule.getStagesForPhi(LoopVal);
int StageDiff = (StageScheduled - LoopValStage);
LVNumStages -= StageDiff;
- if (LVNumStages > (int)np) {
+ // Make sure the loop value Phi has been processed already.
+ if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
NewReg = PhiOp2;
unsigned ReuseStage = CurStageNum;
if (Schedule.isLoopCarried(this, *PhiInst))
ReuseStage -= LVNumStages;
// Check if the Phi to reuse has been generated yet. If not, then
// there is nothing to reuse.
- if (VRMap[ReuseStage].count(LoopVal)) {
- NewReg = VRMap[ReuseStage][LoopVal];
+ if (VRMap[ReuseStage - np].count(LoopVal)) {
+ NewReg = VRMap[ReuseStage - np][LoopVal];
rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
&*BBI, Def, NewReg);
@@ -2744,7 +2825,7 @@ void SwingSchedulerDAG::generateExistingPhis(
/// Generate Phis for the specified block in the generated pipelined code.
/// These are new Phis needed because the definition is scheduled after the
-/// use in the pipelened sequence.
+/// use in the pipelined sequence.
void SwingSchedulerDAG::generatePhis(
MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
@@ -2874,6 +2955,13 @@ void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
if (!MOI->isReg() || !MOI->isDef())
continue;
unsigned reg = MOI->getReg();
+ // Assume physical registers are used, unless they are marked dead.
+ if (TargetRegisterInfo::isPhysicalRegister(reg)) {
+ used = !MOI->isDead();
+ if (used)
+ break;
+ continue;
+ }
unsigned realUses = 0;
for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
EI = MRI.use_end();
@@ -2891,6 +2979,7 @@ void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
used = false;
}
if (!used) {
+ LIS.RemoveMachineInstrFromMaps(*MI);
MI++->eraseFromParent();
continue;
}
@@ -2905,6 +2994,7 @@ void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
++BBI;
unsigned reg = MI->getOperand(0).getReg();
if (MRI.use_begin(reg) == MRI.use_end()) {
+ LIS.RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
}
}
@@ -2924,10 +3014,8 @@ void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
MBBVectorTy &EpilogBBs,
SMSchedule &Schedule) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
- BBF = KernelBB->getFirstNonPHI();
- BBI != BBF; ++BBI) {
- unsigned Def = BBI->getOperand(0).getReg();
+ for (auto &PHI : KernelBB->phis()) {
+ unsigned Def = PHI.getOperand(0).getReg();
// Check for any Phi definition that is used as an operand of another Phi
// in the same block.
for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
@@ -2935,7 +3023,7 @@ void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
I != E; ++I) {
if (I->isPHI() && I->getParent() == KernelBB) {
// Get the loop carried definition.
- unsigned LCDef = getLoopPhiReg(*BBI, KernelBB);
+ unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
if (!LCDef)
continue;
MachineInstr *MI = MRI.getVRegDef(LCDef);
@@ -3099,12 +3187,14 @@ void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
continue;
}
unsigned Delta;
- if (computeDelta(OldMI, Delta)) {
+ if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
int64_t AdjOffset = Delta * Num;
NewMemRefs[Refs++] =
MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize());
- } else
- NewMemRefs[Refs++] = MF.getMachineMemOperand(MMO, 0, UINT64_MAX);
+ } else {
+ NewMI.dropMemRefs();
+ return;
+ }
}
NewMI.setMemRefs(NewMemRefs, NewMemRefs + NumRefs);
}
@@ -3249,13 +3339,11 @@ void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
SMSchedule &Schedule,
ValueMapTy *VRMap,
InstrMapTy &InstrMap) {
- for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
- BBE = BB->getFirstNonPHI();
- BBI != BBE; ++BBI) {
+ for (auto &PHI : BB->phis()) {
unsigned InitVal = 0;
unsigned LoopVal = 0;
- getPhiRegs(*BBI, BB, InitVal, LoopVal);
- unsigned PhiDef = BBI->getOperand(0).getReg();
+ getPhiRegs(PHI, BB, InitVal, LoopVal);
+ unsigned PhiDef = PHI.getOperand(0).getReg();
unsigned PhiStage =
(unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
@@ -3269,7 +3357,7 @@ void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
if (!NewVal)
NewVal = InitVal;
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &*BBI,
+ rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
PhiDef, NewVal);
}
}
@@ -3375,10 +3463,15 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
return false;
- // Make sure offset values are both positive or both negative.
+ // Make sure that the instructions do not access the same memory location in
+ // the next iteration.
int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
- if ((LoadOffset >= 0) != (StoreOffset >= 0))
+ MachineInstr *NewMI = MF.CloneMachineInstr(MI);
+ NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
+ bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
+ MF.DeleteMachineInstr(NewMI);
+ if (!Disjoint)
return false;
// Set the return value once we determine that we return true.
@@ -3425,17 +3518,21 @@ void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
}
}
-/// Return true for an order dependence that is loop carried potentially.
-/// An order dependence is loop carried if the destination defines a value
-/// that may be used by the source in a subsequent iteration.
-bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
- bool isSucc) {
- if (!isOrder(Source, Dep) || Dep.isArtificial())
+/// Return true for an order or output dependence that is loop carried
+/// potentially. A dependence is loop carried if the destination defines a value
+/// that may be used or defined by the source in a subsequent iteration.
+bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
+ bool isSucc) {
+ if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
+ Dep.isArtificial())
return false;
if (!SwpPruneLoopCarried)
return true;
+ if (Dep.getKind() == SDep::Output)
+ return true;
+
MachineInstr *SI = Source->getInstr();
MachineInstr *DI = Dep.getSUnit()->getInstr();
if (!isSucc)
@@ -3465,6 +3562,19 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
if (BaseRegS != BaseRegD)
return true;
+ // Check that the base register is incremented by a constant value for each
+ // iteration.
+ MachineInstr *Def = MRI.getVRegDef(BaseRegS);
+ if (!Def || !Def->isPHI())
+ return true;
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(*Def, BB, InitVal, LoopVal);
+ MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
+ int D = 0;
+ if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
+ return true;
+
uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
@@ -3516,7 +3626,7 @@ bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
}
if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
Resources->canReserveResources(*SU->getInstr())) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\tinsert at cycle " << curCycle << " ";
SU->getInstr()->dump();
});
@@ -3529,7 +3639,7 @@ bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
FirstCycle = curCycle;
return true;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
SU->getInstr()->dump();
});
@@ -3553,7 +3663,7 @@ int SMSchedule::earliestCycleInChain(const SDep &Dep) {
continue;
EarlyCycle = std::min(EarlyCycle, it->second);
for (const auto &PI : PrevSU->Preds)
- if (SwingSchedulerDAG::isOrder(PrevSU, PI))
+ if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
Worklist.push_back(PI);
Visited.insert(PrevSU);
}
@@ -3576,7 +3686,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) {
continue;
LateCycle = std::max(LateCycle, it->second);
for (const auto &SI : SuccSU->Succs)
- if (SwingSchedulerDAG::isOrder(SuccSU, SI))
+ if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
Worklist.push_back(SI);
Visited.insert(SuccSU);
}
@@ -3590,7 +3700,7 @@ static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
for (auto &P : SU->Preds)
if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
for (auto &S : P.getSUnit()->Succs)
- if (S.getKind() == SDep::Order && S.getSUnit()->getInstr()->isPHI())
+ if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
return P.getSUnit();
return nullptr;
}
@@ -3601,7 +3711,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
int *MinEnd, int *MaxStart, int II,
SwingSchedulerDAG *DAG) {
// Iterate over each instruction that has been scheduled already. The start
- // slot computuation depends on whether the previously scheduled instruction
+ // slot computation depends on whether the previously scheduled instruction
// is a predecessor or successor of the specified instruction.
for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
@@ -3613,15 +3723,15 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
const SDep &Dep = SU->Preds[i];
if (Dep.getSUnit() == I) {
if (!DAG->isBackedge(SU, Dep)) {
- int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
+ int EarlyStart = cycle + Dep.getLatency() -
DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
*MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
- if (DAG->isLoopCarriedOrder(SU, Dep, false)) {
+ if (DAG->isLoopCarriedDep(SU, Dep, false)) {
int End = earliestCycleInChain(Dep) + (II - 1);
*MinEnd = std::min(*MinEnd, End);
}
} else {
- int LateStart = cycle - DAG->getLatency(SU, Dep) +
+ int LateStart = cycle - Dep.getLatency() +
DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
*MinLateStart = std::min(*MinLateStart, LateStart);
}
@@ -3633,23 +3743,24 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
!SU->isPred(I))
*MinLateStart = std::min(*MinLateStart, cycle);
}
- for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i)
+ for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
if (SU->Succs[i].getSUnit() == I) {
const SDep &Dep = SU->Succs[i];
if (!DAG->isBackedge(SU, Dep)) {
- int LateStart = cycle - DAG->getLatency(SU, Dep) +
+ int LateStart = cycle - Dep.getLatency() +
DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
*MinLateStart = std::min(*MinLateStart, LateStart);
- if (DAG->isLoopCarriedOrder(SU, Dep)) {
+ if (DAG->isLoopCarriedDep(SU, Dep)) {
int Start = latestCycleInChain(Dep) + 1 - II;
*MaxStart = std::max(*MaxStart, Start);
}
} else {
- int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
+ int EarlyStart = cycle + Dep.getLatency() -
DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
*MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
}
}
+ }
}
}
}
@@ -3657,7 +3768,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is inserted at the front of the list if
/// it must precede a use already in the list, and at the end otherwise.
-bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
+void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
std::deque<SUnit *> &Insts) {
MachineInstr *MI = SU->getInstr();
bool OrderBeforeUse = false;
@@ -3670,13 +3781,11 @@ bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
unsigned Pos = 0;
for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
++I, ++Pos) {
- // Relative order of Phis does not matter.
- if (MI->isPHI() && (*I)->getInstr()->isPHI())
- continue;
for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
continue;
+
unsigned Reg = MO.getReg();
unsigned BasePos, OffsetPos;
if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
@@ -3688,7 +3797,8 @@ bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
(*I)->getInstr()->readsWritesVirtualRegister(Reg);
if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
OrderBeforeUse = true;
- MoveUse = Pos;
+ if (MoveUse == 0)
+ MoveUse = Pos;
} else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
// Add the instruction after the scheduled instruction.
OrderAfterDef = true;
@@ -3696,14 +3806,16 @@ bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
} else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
OrderBeforeUse = true;
- MoveUse = Pos;
+ if (MoveUse == 0)
+ MoveUse = Pos;
} else {
OrderAfterDef = true;
MoveDef = Pos;
}
} else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
OrderBeforeUse = true;
- MoveUse = Pos;
+ if (MoveUse == 0)
+ MoveUse = Pos;
if (MoveUse != 0) {
OrderAfterDef = true;
MoveDef = Pos - 1;
@@ -3711,49 +3823,35 @@ bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
} else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
// Add the instruction before the scheduled instruction.
OrderBeforeUse = true;
- MoveUse = Pos;
+ if (MoveUse == 0)
+ MoveUse = Pos;
} else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
- OrderBeforeDef = true;
- MoveUse = Pos;
+ if (MoveUse == 0) {
+ OrderBeforeDef = true;
+ MoveUse = Pos;
+ }
}
}
// Check for order dependences between instructions. Make sure the source
// is ordered before the destination.
- for (auto &S : SU->Succs)
- if (S.getKind() == SDep::Order) {
- if (S.getSUnit() == *I && stageScheduled(*I) == StageInst1) {
- OrderBeforeUse = true;
- MoveUse = Pos;
- }
- } else if (TargetRegisterInfo::isPhysicalRegister(S.getReg())) {
- if (cycleScheduled(SU) != cycleScheduled(S.getSUnit())) {
- if (S.isAssignedRegDep()) {
- OrderAfterDef = true;
- MoveDef = Pos;
- }
- } else {
- OrderBeforeUse = true;
+ for (auto &S : SU->Succs) {
+ if (S.getSUnit() != *I)
+ continue;
+ if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
+ OrderBeforeUse = true;
+ if (Pos < MoveUse)
MoveUse = Pos;
- }
}
- for (auto &P : SU->Preds)
- if (P.getKind() == SDep::Order) {
- if (P.getSUnit() == *I && stageScheduled(*I) == StageInst1) {
- OrderAfterDef = true;
- MoveDef = Pos;
- }
- } else if (TargetRegisterInfo::isPhysicalRegister(P.getReg())) {
- if (cycleScheduled(SU) != cycleScheduled(P.getSUnit())) {
- if (P.isAssignedRegDep()) {
- OrderBeforeUse = true;
- MoveUse = Pos;
- }
- } else {
- OrderAfterDef = true;
- MoveDef = Pos;
- }
+ }
+ for (auto &P : SU->Preds) {
+ if (P.getSUnit() != *I)
+ continue;
+ if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
+ OrderAfterDef = true;
+ MoveDef = Pos;
}
+ }
}
// A circular dependence.
@@ -3777,16 +3875,10 @@ bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
Insts.erase(Insts.begin() + MoveDef);
Insts.erase(Insts.begin() + MoveUse);
}
- if (orderDependence(SSD, UseSU, Insts)) {
- Insts.push_front(SU);
- orderDependence(SSD, DefSU, Insts);
- return true;
- }
- Insts.pop_back();
- Insts.push_back(SU);
- Insts.push_back(UseSU);
+ orderDependence(SSD, UseSU, Insts);
+ orderDependence(SSD, SU, Insts);
orderDependence(SSD, DefSU, Insts);
- return false;
+ return;
}
// Put the new instruction first if there is a use in the list. Otherwise,
// put it at the end of the list.
@@ -3794,14 +3886,13 @@ bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
Insts.push_front(SU);
else
Insts.push_back(SU);
- return OrderBeforeUse;
}
/// Return true if the scheduled Phi has a loop carried operand.
bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
if (!Phi.isPHI())
return false;
- assert(Phi.isPHI() && "Expecing a Phi.");
+ assert(Phi.isPHI() && "Expecting a Phi.");
SUnit *DefSU = SSD->getSUnit(&Phi);
unsigned DefCycle = cycleScheduled(DefSU);
int DefStage = stageScheduled(DefSU);
@@ -3868,6 +3959,100 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
return true;
}
+/// A property of the node order in swing-modulo-scheduling is that for nodes
+/// outside circuits the following holds: none of them is scheduled after
+/// both a successor and a predecessor.
+/// The method below checks whether the property is met.
+/// If not, debug information is printed and the statistics are updated.
+/// Note that we do not use an assert statement.
+/// The reason is that although an invalid node order may prevent
+/// the pipeliner from finding a pipelined schedule for arbitrary II,
+/// it does not lead to the generation of incorrect code.
+void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
+
+ // a sorted vector that maps each SUnit to its index in the NodeOrder
+ typedef std::pair<SUnit *, unsigned> UnitIndex;
+ std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
+
+ for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
+ Indices.push_back(std::make_pair(NodeOrder[i], i));
+
+ auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
+ return std::get<0>(i1) < std::get<0>(i2);
+ };
+
+ // sort, so that we can perform a binary search
+ llvm::sort(Indices.begin(), Indices.end(), CompareKey);
+
+ bool Valid = true;
+ (void)Valid;
+ // for each SUnit in the NodeOrder, check whether
+ // it appears after both a successor and a predecessor
+ // of the SUnit. If this is the case, and the SUnit
+ // is not part of a circuit, then the NodeOrder is not
+ // valid.
+ for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
+ SUnit *SU = NodeOrder[i];
+ unsigned Index = i;
+
+ bool PredBefore = false;
+ bool SuccBefore = false;
+
+ SUnit *Succ;
+ SUnit *Pred;
+ (void)Succ;
+ (void)Pred;
+
+ for (SDep &PredEdge : SU->Preds) {
+ SUnit *PredSU = PredEdge.getSUnit();
+ unsigned PredIndex =
+ std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
+ std::make_pair(PredSU, 0), CompareKey));
+ if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
+ PredBefore = true;
+ Pred = PredSU;
+ break;
+ }
+ }
+
+ for (SDep &SuccEdge : SU->Succs) {
+ SUnit *SuccSU = SuccEdge.getSUnit();
+ unsigned SuccIndex =
+ std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
+ std::make_pair(SuccSU, 0), CompareKey));
+ if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
+ SuccBefore = true;
+ Succ = SuccSU;
+ break;
+ }
+ }
+
+ if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
+ // instructions in circuits are allowed to be scheduled
+ // after both a successor and predecessor.
+ bool InCircuit = std::any_of(
+ Circuits.begin(), Circuits.end(),
+ [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
+ if (InCircuit)
+ LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
+ else {
+ Valid = false;
+ NumNodeOrderIssues++;
+ LLVM_DEBUG(dbgs() << "Predecessor ";);
+ }
+ LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
+ << " are scheduled before node " << SU->NodeNum
+ << "\n";);
+ }
+ }
+
+ LLVM_DEBUG({
+ if (!Valid)
+ dbgs() << "Invalid node order found!\n";
+ });
+}
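An example of the property checkValidNodeOrder() enforces, with hypothetical node
numbers: if the order is SU(3), SU(7), SU(5) and there are edges SU(3) -> SU(5) and
SU(5) -> SU(7), then SU(5) appears after both a predecessor and a successor; unless
SU(5) lies on one of the Circuits, the check reports it and increments
NumNodeOrderIssues.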
+
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
/// p' = store_pi(p, b)
@@ -3987,27 +4172,25 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
// generated code.
for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
- std::deque<SUnit *> newOrderZC;
- // Put the zero-cost, pseudo instructions at the start of the cycle.
+ std::deque<SUnit *> newOrderPhi;
for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
SUnit *SU = cycleInstrs[i];
- if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
- orderDependence(SSD, SU, newOrderZC);
+ if (SU->getInstr()->isPHI())
+ newOrderPhi.push_back(SU);
}
std::deque<SUnit *> newOrderI;
- // Then, add the regular instructions back.
for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
SUnit *SU = cycleInstrs[i];
- if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
+ if (!SU->getInstr()->isPHI())
orderDependence(SSD, SU, newOrderI);
}
// Replace the old order with the new order.
- cycleInstrs.swap(newOrderZC);
+ cycleInstrs.swap(newOrderPhi);
cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
SSD->fixupRegisterOverlaps(cycleInstrs);
}
- DEBUG(dump(););
+ LLVM_DEBUG(dump(););
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
index 1e74104e89ed..2619d8f78276 100644
--- a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/RegionInfoImpl.h"
#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -89,7 +90,7 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
RI.recalculate(F, DT, PDT, DF);
- DEBUG(RI.dump());
+ LLVM_DEBUG(RI.dump());
return false;
}
diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index b82ab02a6e6c..6095bdd06b69 100644
--- a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -65,23 +66,66 @@ void MachineRegisterInfo::setRegBank(unsigned Reg,
VRegInfo[Reg].first = &RegBank;
}
-const TargetRegisterClass *
-MachineRegisterInfo::constrainRegClass(unsigned Reg,
- const TargetRegisterClass *RC,
- unsigned MinNumRegs) {
- const TargetRegisterClass *OldRC = getRegClass(Reg);
+static const TargetRegisterClass *
+constrainRegClass(MachineRegisterInfo &MRI, unsigned Reg,
+ const TargetRegisterClass *OldRC,
+ const TargetRegisterClass *RC, unsigned MinNumRegs) {
if (OldRC == RC)
return RC;
const TargetRegisterClass *NewRC =
- getTargetRegisterInfo()->getCommonSubClass(OldRC, RC);
+ MRI.getTargetRegisterInfo()->getCommonSubClass(OldRC, RC);
if (!NewRC || NewRC == OldRC)
return NewRC;
if (NewRC->getNumRegs() < MinNumRegs)
return nullptr;
- setRegClass(Reg, NewRC);
+ MRI.setRegClass(Reg, NewRC);
return NewRC;
}
+const TargetRegisterClass *
+MachineRegisterInfo::constrainRegClass(unsigned Reg,
+ const TargetRegisterClass *RC,
+ unsigned MinNumRegs) {
+ return ::constrainRegClass(*this, Reg, getRegClass(Reg), RC, MinNumRegs);
+}
+
+bool
+MachineRegisterInfo::constrainRegAttrs(unsigned Reg,
+ unsigned ConstrainingReg,
+ unsigned MinNumRegs) {
+ auto const *OldRC = getRegClassOrNull(Reg);
+ auto const *RC = getRegClassOrNull(ConstrainingReg);
+ // A virtual register at any point must have either a low-level type
+ // or a class assigned, but not both. The only exception is the internals of
+ // GlobalISel's instruction selection pass, which is allowed to temporarily
+ // introduce registers with types and classes both.
+ assert((OldRC || getType(Reg).isValid()) && "Reg has neither class nor type");
+ assert((!OldRC || !getType(Reg).isValid()) && "Reg has class and type both");
+ assert((RC || getType(ConstrainingReg).isValid()) &&
+ "ConstrainingReg has neither class nor type");
+ assert((!RC || !getType(ConstrainingReg).isValid()) &&
+ "ConstrainingReg has class and type both");
+ if (OldRC && RC)
+ return ::constrainRegClass(*this, Reg, OldRC, RC, MinNumRegs);
+ // If one of the virtual registers is generic (used in generic machine
+ // instructions, has a low-level type, doesn't have a class), and the other is
+ // concrete (used in target specific instructions, doesn't have a low-level
+ // type, has a class), we can not unify them.
+ if (OldRC || RC)
+ return false;
+ // At this point, both registers are guaranteed to have a valid low-level
+ // type, and they must agree.
+ if (getType(Reg) != getType(ConstrainingReg))
+ return false;
+ auto const *OldRB = getRegBankOrNull(Reg);
+ auto const *RB = getRegBankOrNull(ConstrainingReg);
+ if (OldRB)
+ return !RB || RB == OldRB;
+ if (RB)
+ setRegBank(Reg, *RB);
+ return true;
+}
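A hedged sketch of how a caller might use the new constrainRegAttrs() API when merging
two virtual registers (DstReg and SrcReg are hypothetical):

  if (!MRI.constrainRegAttrs(DstReg, SrcReg))
    return false; // class/bank/LLT attributes are incompatible; keep the registers separate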
+
bool
MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
@@ -107,10 +151,11 @@ MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
return true;
}
-unsigned MachineRegisterInfo::createIncompleteVirtualRegister() {
+unsigned MachineRegisterInfo::createIncompleteVirtualRegister(StringRef Name) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
VRegInfo.grow(Reg);
RegAllocHints.grow(Reg);
+ insertVRegByName(Name, Reg);
return Reg;
}
@@ -118,47 +163,42 @@ unsigned MachineRegisterInfo::createIncompleteVirtualRegister() {
/// function with the specified register class.
///
unsigned
-MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
+MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
+ StringRef Name) {
assert(RegClass && "Cannot create register without RegClass!");
assert(RegClass->isAllocatable() &&
"Virtual register RegClass must be allocatable.");
// New virtual register number.
- unsigned Reg = createIncompleteVirtualRegister();
+ unsigned Reg = createIncompleteVirtualRegister(Name);
VRegInfo[Reg].first = RegClass;
if (TheDelegate)
TheDelegate->MRI_NoteNewVirtualRegister(Reg);
return Reg;
}
-LLT MachineRegisterInfo::getType(unsigned VReg) const {
- VRegToTypeMap::const_iterator TypeIt = getVRegToType().find(VReg);
- return TypeIt != getVRegToType().end() ? TypeIt->second : LLT{};
-}
-
void MachineRegisterInfo::setType(unsigned VReg, LLT Ty) {
// Check that VReg doesn't have a class.
assert((getRegClassOrRegBank(VReg).isNull() ||
!getRegClassOrRegBank(VReg).is<const TargetRegisterClass *>()) &&
"Can't set the size of a non-generic virtual register");
- getVRegToType()[VReg] = Ty;
+ VRegToType.grow(VReg);
+ VRegToType[VReg] = Ty;
}
unsigned
-MachineRegisterInfo::createGenericVirtualRegister(LLT Ty) {
+MachineRegisterInfo::createGenericVirtualRegister(LLT Ty, StringRef Name) {
// New virtual register number.
- unsigned Reg = createIncompleteVirtualRegister();
+ unsigned Reg = createIncompleteVirtualRegister(Name);
// FIXME: Should we use a dummy register class?
VRegInfo[Reg].first = static_cast<RegisterBank *>(nullptr);
- getVRegToType()[Reg] = Ty;
+ setType(Reg, Ty);
if (TheDelegate)
TheDelegate->MRI_NoteNewVirtualRegister(Reg);
return Reg;
}
-void MachineRegisterInfo::clearVirtRegTypes() {
- getVRegToType().clear();
-}
+void MachineRegisterInfo::clearVirtRegTypes() { VRegToType.clear(); }
/// clearVirtRegs - Remove all virtual registers (after physreg assignment).
void MachineRegisterInfo::clearVirtRegs() {
diff --git a/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp
index 36844e9fb30a..773661965f18 100644
--- a/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp
@@ -204,7 +204,7 @@ unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
// If the client wants to know about all new instructions, tell it.
if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
- DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
return InsertedPHI->getOperand(0).getReg();
}
diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
index e15eb658a05c..502d18f08f93 100644
--- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -32,7 +32,6 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
@@ -48,6 +47,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -55,6 +55,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -271,7 +272,7 @@ priorNonDebug(MachineBasicBlock::const_iterator I,
MachineBasicBlock::const_iterator Beg) {
assert(I != Beg && "reached the top of the region, cannot decrement");
while (--I != Beg) {
- if (!I->isDebugValue())
+ if (!I->isDebugInstr())
break;
}
return I;
@@ -291,7 +292,7 @@ static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
MachineBasicBlock::const_iterator End) {
for(; I != End; ++I) {
- if (!I->isDebugValue())
+ if (!I->isDebugInstr())
break;
}
return I;
@@ -344,7 +345,7 @@ ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
/// This design avoids exposing scheduling boundaries to the DAG builder,
/// simplifying the DAG builder's support for "special" target instructions.
/// At the same time the design allows target schedulers to operate across
-/// scheduling boundaries, for example to bundle the boudary instructions
+/// scheduling boundaries, for example to bundle the boundary instructions
/// without reordering them. This creates complexity, because the target
/// scheduler must update the RegionBegin and RegionEnd positions cached by
/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
@@ -360,7 +361,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
} else if (!mf.getSubtarget().enableMachineScheduler())
return false;
- DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
// Initialize the context of the pass.
MF = &mf;
@@ -372,7 +373,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
LIS = &getAnalysis<LiveIntervals>();
if (VerifyScheduling) {
- DEBUG(LIS->dump());
+ LLVM_DEBUG(LIS->dump());
MF->verify(this, "Before machine scheduling.");
}
RegClassInfo->runOnMachineFunction(*MF);
@@ -382,7 +383,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
scheduleRegions(*Scheduler, false);
- DEBUG(LIS->dump());
+ LLVM_DEBUG(LIS->dump());
if (VerifyScheduling)
MF->verify(this, "After machine scheduling.");
return true;
@@ -396,10 +397,10 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
if (!EnablePostRAMachineSched)
return false;
} else if (!mf.getSubtarget().enablePostRAScheduler()) {
- DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
+ LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
return false;
}
- DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
// Initialize the context of the pass.
MF = &mf;
@@ -481,7 +482,7 @@ getSchedRegions(MachineBasicBlock *MBB,
MachineInstr &MI = *std::prev(I);
if (isSchedBoundary(&MI, &*MBB, MF, TII))
break;
- if (!MI.isDebugValue())
+ if (!MI.isDebugInstr())
// MBB::size() uses instr_iterator to count. Here we need a bundle to
// count as a single instruction.
++NumRegionInstrs;
@@ -547,12 +548,13 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
Scheduler.exitRegion();
continue;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB) << " "
- << MBB->getName() << "\n From: " << *I << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB)
+ << " " << MBB->getName() << "\n From: " << *I
+ << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
if (DumpCriticalPathLength) {
errs() << MF->getName();
errs() << ":%bb. " << MBB->getNumber();
@@ -749,8 +751,8 @@ bool ScheduleDAGMI::checkSchedLimit() {
/// does not consider liveness or register pressure. It is useful for PostRA
/// scheduling and potentially other custom schedulers.
void ScheduleDAGMI::schedule() {
- DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
- DEBUG(SchedImpl->dumpPolicy());
+ LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
+ LLVM_DEBUG(SchedImpl->dumpPolicy());
// Build the DAG.
buildSchedGraph(AA);
@@ -762,26 +764,22 @@ void ScheduleDAGMI::schedule() {
SmallVector<SUnit*, 8> TopRoots, BotRoots;
findRootsAndBiasEdges(TopRoots, BotRoots);
+ LLVM_DEBUG(if (EntrySU.getInstr() != nullptr) EntrySU.dumpAll(this);
+ for (const SUnit &SU
+ : SUnits) SU.dumpAll(this);
+ if (ExitSU.getInstr() != nullptr) ExitSU.dumpAll(this););
+ if (ViewMISchedDAGs) viewGraph();
+
// Initialize the strategy before modifying the DAG.
// This may initialize a DFSResult to be used for queue priority.
SchedImpl->initialize(this);
- DEBUG(
- if (EntrySU.getInstr() != nullptr)
- EntrySU.dumpAll(this);
- for (const SUnit &SU : SUnits)
- SU.dumpAll(this);
- if (ExitSU.getInstr() != nullptr)
- ExitSU.dumpAll(this);
- );
- if (ViewMISchedDAGs) viewGraph();
-
// Initialize ready queues now that the DAG and priority data are finalized.
initQueues(TopRoots, BotRoots);
bool IsTopNode = false;
while (true) {
- DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
+ LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
SUnit *SU = SchedImpl->pickNode(IsTopNode);
if (!SU) break;
@@ -821,7 +819,7 @@ void ScheduleDAGMI::schedule() {
placeDebugValues();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) << " ***\n";
dumpSchedule();
@@ -1016,7 +1014,7 @@ void ScheduleDAGMILive::initRegPressure() {
// Close the RPTracker to finalize live ins.
RPTracker.closeRegion();
- DEBUG(RPTracker.dump());
+ LLVM_DEBUG(RPTracker.dump());
// Initialize the live ins and live outs.
TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
@@ -1031,8 +1029,8 @@ void ScheduleDAGMILive::initRegPressure() {
BotRPTracker.initLiveThru(RPTracker);
if (!BotRPTracker.getLiveThru().empty()) {
TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
- DEBUG(dbgs() << "Live Thru: ";
- dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
+ LLVM_DEBUG(dbgs() << "Live Thru: ";
+ dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
};
// For each live out vreg reduce the pressure change associated with other
@@ -1046,15 +1044,13 @@ void ScheduleDAGMILive::initRegPressure() {
updatePressureDiffs(LiveUses);
}
- DEBUG(
- dbgs() << "Top Pressure:\n";
- dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
- dbgs() << "Bottom Pressure:\n";
- dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);
- );
+ LLVM_DEBUG(dbgs() << "Top Pressure:\n";
+ dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
+ dbgs() << "Bottom Pressure:\n";
+ dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI););
assert((BotRPTracker.getPos() == RegionEnd ||
- (RegionEnd->isDebugValue() &&
+ (RegionEnd->isDebugInstr() &&
BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) &&
"Can't find the region bottom");
@@ -1066,17 +1062,16 @@ void ScheduleDAGMILive::initRegPressure() {
for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
if (RegionPressure[i] > Limit) {
- DEBUG(dbgs() << TRI->getRegPressureSetName(i)
- << " Limit " << Limit
- << " Actual " << RegionPressure[i] << "\n");
+ LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit
+ << " Actual " << RegionPressure[i] << "\n");
RegionCriticalPSets.push_back(PressureChange(i));
}
}
- DEBUG(dbgs() << "Excess PSets: ";
- for (const PressureChange &RCPS : RegionCriticalPSets)
- dbgs() << TRI->getRegPressureSetName(
- RCPS.getPSet()) << " ";
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Excess PSets: ";
+ for (const PressureChange &RCPS
+ : RegionCriticalPSets) dbgs()
+ << TRI->getRegPressureSetName(RCPS.getPSet()) << " ";
+ dbgs() << "\n");
}
void ScheduleDAGMILive::
@@ -1097,10 +1092,11 @@ updateScheduledPressure(const SUnit *SU,
}
unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
if (NewMaxPressure[ID] >= Limit - 2) {
- DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": "
- << NewMaxPressure[ID]
- << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ") << Limit
- << "(+ " << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
+ LLVM_DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": "
+ << NewMaxPressure[ID]
+ << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ")
+ << Limit << "(+ " << BotRPTracker.getLiveThru()[ID]
+ << " livethru)\n");
}
}
}
@@ -1130,17 +1126,14 @@ void ScheduleDAGMILive::updatePressureDiffs(
PressureDiff &PDiff = getPressureDiff(&SU);
PDiff.addPressureChange(Reg, Decrement, &MRI);
- DEBUG(
- dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") "
- << printReg(Reg, TRI) << ':' << PrintLaneMask(P.LaneMask)
- << ' ' << *SU.getInstr();
- dbgs() << " to ";
- PDiff.dump(*TRI);
- );
+ LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") "
+ << printReg(Reg, TRI) << ':'
+ << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr();
+ dbgs() << " to "; PDiff.dump(*TRI););
}
} else {
assert(P.LaneMask.any());
- DEBUG(dbgs() << " LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << " LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
// This may be called before CurrentBottom has been initialized. However,
// BotRPTracker must have a valid position. We want the value live into the
// instruction or live out of the block, so ask for the previous
@@ -1168,12 +1161,9 @@ void ScheduleDAGMILive::updatePressureDiffs(
if (LRQ.valueIn() == VNI) {
PressureDiff &PDiff = getPressureDiff(SU);
PDiff.addPressureChange(Reg, true, &MRI);
- DEBUG(
- dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
- << *SU->getInstr();
- dbgs() << " to ";
- PDiff.dump(*TRI);
- );
+ LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
+ << *SU->getInstr();
+ dbgs() << " to "; PDiff.dump(*TRI););
}
}
}
@@ -1192,8 +1182,8 @@ void ScheduleDAGMILive::updatePressureDiffs(
/// ScheduleDAGMILive then it will want to override this virtual method in order
/// to update any specialized state.
void ScheduleDAGMILive::schedule() {
- DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
- DEBUG(SchedImpl->dumpPolicy());
+ LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
+ LLVM_DEBUG(SchedImpl->dumpPolicy());
buildDAGWithRegPressure();
Topo.InitDAGTopologicalSorting();
@@ -1207,26 +1197,22 @@ void ScheduleDAGMILive::schedule() {
// This may initialize a DFSResult to be used for queue priority.
SchedImpl->initialize(this);
- DEBUG(
- if (EntrySU.getInstr() != nullptr)
- EntrySU.dumpAll(this);
- for (const SUnit &SU : SUnits) {
- SU.dumpAll(this);
- if (ShouldTrackPressure) {
- dbgs() << " Pressure Diff : ";
- getPressureDiff(&SU).dump(*TRI);
- }
- dbgs() << " Single Issue : ";
- if (SchedModel.mustBeginGroup(SU.getInstr()) &&
- SchedModel.mustEndGroup(SU.getInstr()))
- dbgs() << "true;";
- else
- dbgs() << "false;";
- dbgs() << '\n';
- }
- if (ExitSU.getInstr() != nullptr)
- ExitSU.dumpAll(this);
- );
+ LLVM_DEBUG(if (EntrySU.getInstr() != nullptr) EntrySU.dumpAll(this);
+ for (const SUnit &SU
+ : SUnits) {
+ SU.dumpAll(this);
+ if (ShouldTrackPressure) {
+ dbgs() << " Pressure Diff : ";
+ getPressureDiff(&SU).dump(*TRI);
+ }
+ dbgs() << " Single Issue : ";
+ if (SchedModel.mustBeginGroup(SU.getInstr()) &&
+ SchedModel.mustEndGroup(SU.getInstr()))
+ dbgs() << "true;";
+ else
+ dbgs() << "false;";
+ dbgs() << '\n';
+ } if (ExitSU.getInstr() != nullptr) ExitSU.dumpAll(this););
if (ViewMISchedDAGs) viewGraph();
// Initialize ready queues now that the DAG and priority data are finalized.
@@ -1234,7 +1220,7 @@ void ScheduleDAGMILive::schedule() {
bool IsTopNode = false;
while (true) {
- DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
+ LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
SUnit *SU = SchedImpl->pickNode(IsTopNode);
if (!SU) break;
@@ -1262,7 +1248,7 @@ void ScheduleDAGMILive::schedule() {
placeDebugValues();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) << " ***\n";
dumpSchedule();
@@ -1379,13 +1365,13 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
} else
CyclicLatency = 0;
- DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
- << SU->NodeNum << ") = " << CyclicLatency << "c\n");
+ LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+ << SU->NodeNum << ") = " << CyclicLatency << "c\n");
if (CyclicLatency > MaxCyclicLatency)
MaxCyclicLatency = CyclicLatency;
}
}
- DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+ LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
return MaxCyclicLatency;
}
@@ -1429,10 +1415,8 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
TopRPTracker.advance(RegOpers);
assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
- DEBUG(
- dbgs() << "Top Pressure:\n";
- dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
- );
+ LLVM_DEBUG(dbgs() << "Top Pressure:\n"; dumpRegSetPressure(
+ TopRPTracker.getRegSetPressureAtPos(), TRI););
updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
}
@@ -1449,6 +1433,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
}
moveInstruction(MI, CurrentBottom);
CurrentBottom = MI;
+ BotRPTracker.setPos(CurrentBottom);
}
if (ShouldTrackPressure) {
RegisterOperands RegOpers;
@@ -1467,10 +1452,8 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
SmallVector<RegisterMaskPair, 8> LiveUses;
BotRPTracker.recede(RegOpers, &LiveUses);
assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
- DEBUG(
- dbgs() << "Bottom Pressure:\n";
- dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);
- );
+ LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure(
+ BotRPTracker.getRegSetPressureAtPos(), TRI););
updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
updatePressureDiffs(LiveUses);
@@ -1484,7 +1467,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
namespace {
-/// \brief Post-process the DAG to create cluster edges between neighboring
+/// Post-process the DAG to create cluster edges between neighboring
/// loads or between neighboring stores.
class BaseMemOpClusterMutation : public ScheduleDAGMutation {
struct MemOpInfo {
@@ -1561,7 +1544,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
if (MemOpRecords.size() < 2)
return;
- std::sort(MemOpRecords.begin(), MemOpRecords.end());
+ llvm::sort(MemOpRecords.begin(), MemOpRecords.end());
unsigned ClusterLength = 1;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
SUnit *SUa = MemOpRecords[Idx].SU;
@@ -1570,8 +1553,8 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
*SUb->getInstr(), MemOpRecords[Idx+1].BaseReg,
ClusterLength) &&
DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
- DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
- << SUb->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
+ << SUb->NodeNum << ")\n");
// Copy successor edges from SUa to SUb. Interleaving computation
// dependent on SUa can prevent load combining due to register reuse.
// Predecessor edges do not need to be copied from SUb to SUa since nearby
@@ -1579,7 +1562,8 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
for (const SDep &Succ : SUa->Succs) {
if (Succ.getSUnit() == SUb)
continue;
- DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
+ << ")\n");
DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
}
++ClusterLength;
@@ -1588,7 +1572,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
}
}
-/// \brief Callback from DAG postProcessing to create cluster edges for loads.
+/// Callback from DAG postProcessing to create cluster edges for loads.
void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
@@ -1629,7 +1613,7 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
namespace {
-/// \brief Post-process the DAG to create weak edges from all uses of a copy to
+/// Post-process the DAG to create weak edges from all uses of a copy to
/// the one use that defines the copy's source vreg, most likely an induction
/// variable increment.
class CopyConstrain : public ScheduleDAGMutation {
@@ -1724,7 +1708,7 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
// If GlobalSegment is killed at the LocalLI->start, the call to find()
// returned the next global segment. But if GlobalSegment overlaps with
- // LocalLI->start, then advance to the next segement. If a hole in GlobalLI
+ // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
// exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
if (GlobalSegment->contains(LocalLI->beginIndex()))
++GlobalSegment;
@@ -1788,23 +1772,23 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
return;
GlobalUses.push_back(Pred.getSUnit());
}
- DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
// Add the weak edges.
for (SmallVectorImpl<SUnit*>::const_iterator
I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
- DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU("
- << GlobalSU->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU("
+ << GlobalSU->NodeNum << ")\n");
DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
}
for (SmallVectorImpl<SUnit*>::const_iterator
I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
- DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU("
- << FirstLocalSU->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU("
+ << FirstLocalSU->NodeNum << ")\n");
DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
}
}
-/// \brief Callback from DAG postProcessing to create weak edges to encourage
+/// Callback from DAG postProcessing to create weak edges to encourage
/// copy elimination.
void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
@@ -1941,7 +1925,7 @@ getNextResourceCycle(unsigned PIdx, unsigned Cycles) {
/// The scheduler supports two modes of hazard recognition. The first is the
/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
/// supports highly complicated in-order reservation tables
-/// (ScoreboardHazardRecognizer) and arbitraty target-specific logic.
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
///
/// The second is a streamlined mechanism that checks for hazards based on
/// simple counters that the scheduler itself maintains. It explicitly checks
@@ -1957,16 +1941,16 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
- DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
- << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
+ LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
+ << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
return true;
}
if (CurrMOps > 0 &&
((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
(!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
- DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must "
- << (isTop()? "begin" : "end") << " group\n");
+ LLVM_DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must "
+ << (isTop() ? "begin" : "end") << " group\n");
return true;
}
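The comment at the top of this hunk contrasts the fully general ScheduleHazardRecognizer with the scheduler's own streamlined, counter-based checks. A minimal standalone sketch of the issue-width counter check — with illustrative names and an assumed width of 4, not values from any real target model — looks like this:

// Illustrative issue-width bookkeeping; the struct and the width of 4 are
// assumptions for this sketch only.
#include <cassert>

struct IssueState {
  unsigned CurrMOps = 0;   // micro-ops already issued in the current cycle
  unsigned IssueWidth = 4; // assumed machine issue width
};

// Mirrors the CurrMOps check in the hunk above: issuing UOps more micro-ops
// this cycle would overflow the issue width, so the node must wait.
static bool wouldOverflowIssueWidth(const IssueState &S, unsigned UOps) {
  return S.CurrMOps > 0 && S.CurrMOps + UOps > S.IssueWidth;
}

int main() {
  IssueState S;
  S.CurrMOps = 3;
  assert(wouldOverflowIssueWidth(S, 2));  // 3 + 2 > 4: stall to the next cycle
  assert(!wouldOverflowIssueWidth(S, 1)); // 3 + 1 <= 4: still fits this cycle
  return 0;
}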
@@ -1982,9 +1966,9 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
#ifndef NDEBUG
MaxObservedStall = std::max(Cycles, MaxObservedStall);
#endif
- DEBUG(dbgs() << " SU(" << SU->NodeNum << ") "
- << SchedModel->getResourceName(ResIdx)
- << "=" << NRCycle << "c\n");
+ LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") "
+ << SchedModel->getResourceName(ResIdx) << "="
+ << NRCycle << "c\n");
return true;
}
}
@@ -2005,8 +1989,8 @@ findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
}
}
if (LateSU) {
- DEBUG(dbgs() << Available.getName() << " RemLatency SU("
- << LateSU->NodeNum << ") " << RemLatency << "c\n");
+ LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU("
+ << LateSU->NodeNum << ") " << RemLatency << "c\n");
}
return RemLatency;
}
@@ -2022,8 +2006,8 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
unsigned OtherCritCount = Rem->RemIssueCount
+ (RetiredMOps * SchedModel->getMicroOpFactor());
- DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
- << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
+ LLVM_DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
+ << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
PIdx != PEnd; ++PIdx) {
unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
@@ -2033,9 +2017,10 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
}
}
if (OtherCritIdx) {
- DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: "
- << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
- << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
+ LLVM_DEBUG(
+ dbgs() << " " << Available.getName() << " + Remain CritRes: "
+ << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
+ << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
}
return OtherCritCount;
}
@@ -2099,7 +2084,8 @@ void SchedBoundary::bumpCycle(unsigned NextCycle) {
checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
getScheduledLatency());
- DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName()
+ << '\n');
}
void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
@@ -2119,8 +2105,8 @@ unsigned SchedBoundary::
countResource(unsigned PIdx, unsigned Cycles, unsigned NextCycle) {
unsigned Factor = SchedModel->getResourceFactor(PIdx);
unsigned Count = Factor * Cycles;
- DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx)
- << " +" << Cycles << "x" << Factor << "u\n");
+ LLVM_DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx) << " +"
+ << Cycles << "x" << Factor << "u\n");
// Update Executed resources counts.
incExecutedResources(PIdx, Count);
@@ -2131,16 +2117,17 @@ countResource(unsigned PIdx, unsigned Cycles, unsigned NextCycle) {
// becomes the critical resource.
if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
ZoneCritResIdx = PIdx;
- DEBUG(dbgs() << " *** Critical resource "
- << SchedModel->getResourceName(PIdx) << ": "
- << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n");
+ LLVM_DEBUG(dbgs() << " *** Critical resource "
+ << SchedModel->getResourceName(PIdx) << ": "
+ << getResourceCount(PIdx) / SchedModel->getLatencyFactor()
+ << "c\n");
}
// For reserved resources, record the highest cycle using the resource.
unsigned NextAvailable = getNextResourceCycle(PIdx, Cycles);
if (NextAvailable > CurrCycle) {
- DEBUG(dbgs() << " Resource conflict: "
- << SchedModel->getProcResource(PIdx)->Name << " reserved until @"
- << NextAvailable << "\n");
+ LLVM_DEBUG(dbgs() << " Resource conflict: "
+ << SchedModel->getProcResource(PIdx)->Name
+ << " reserved until @" << NextAvailable << "\n");
}
return NextAvailable;
}
@@ -2165,7 +2152,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {
"Cannot schedule this instruction's MicroOps in the current cycle.");
unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
- DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
+ LLVM_DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
unsigned NextCycle = CurrCycle;
switch (SchedModel->getMicroOpBufferSize()) {
@@ -2175,7 +2162,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {
case 1:
if (ReadyCycle > NextCycle) {
NextCycle = ReadyCycle;
- DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
+ LLVM_DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
}
break;
default:
@@ -2204,8 +2191,9 @@ void SchedBoundary::bumpNode(SUnit *SU) {
if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
>= (int)SchedModel->getLatencyFactor()) {
ZoneCritResIdx = 0;
- DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
- << ScaledMOps / SchedModel->getLatencyFactor() << "c\n");
+ LLVM_DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
+ << ScaledMOps / SchedModel->getLatencyFactor()
+ << "c\n");
}
}
for (TargetSchedModel::ProcResIter
@@ -2241,13 +2229,13 @@ void SchedBoundary::bumpNode(SUnit *SU) {
unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
if (SU->getDepth() > TopLatency) {
TopLatency = SU->getDepth();
- DEBUG(dbgs() << " " << Available.getName()
- << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n");
+ LLVM_DEBUG(dbgs() << " " << Available.getName() << " TopLatency SU("
+ << SU->NodeNum << ") " << TopLatency << "c\n");
}
if (SU->getHeight() > BotLatency) {
BotLatency = SU->getHeight();
- DEBUG(dbgs() << " " << Available.getName()
- << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n");
+ LLVM_DEBUG(dbgs() << " " << Available.getName() << " BotLatency SU("
+ << SU->NodeNum << ") " << BotLatency << "c\n");
}
// If we stall for any reason, bump the cycle.
if (NextCycle > CurrCycle)
@@ -2271,17 +2259,17 @@ void SchedBoundary::bumpNode(SUnit *SU) {
// currCycle to X.
if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) ||
(!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
- DEBUG(dbgs() << " Bump cycle to "
- << (isTop() ? "end" : "begin") << " group\n");
+ LLVM_DEBUG(dbgs() << " Bump cycle to " << (isTop() ? "end" : "begin")
+ << " group\n");
bumpCycle(++NextCycle);
}
while (CurrMOps >= SchedModel->getIssueWidth()) {
- DEBUG(dbgs() << " *** Max MOps " << CurrMOps
- << " at cycle " << CurrCycle << '\n');
+ LLVM_DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle "
+ << CurrCycle << '\n');
bumpCycle(++NextCycle);
}
- DEBUG(dumpScheduledState());
+ LLVM_DEBUG(dumpScheduledState());
}
/// Release pending ready nodes in to the available queue. This makes them
@@ -2354,8 +2342,8 @@ SUnit *SchedBoundary::pickOnlyChoice() {
releasePending();
}
- DEBUG(Pending.dump());
- DEBUG(Available.dump());
+ LLVM_DEBUG(Pending.dump());
+ LLVM_DEBUG(Available.dump());
if (Available.size() == 1)
return *Available.begin();
@@ -2453,27 +2441,24 @@ void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
if (!OtherResLimited) {
if (IsPostRA || (RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath)) {
Policy.ReduceLatency |= true;
- DEBUG(dbgs() << " " << CurrZone.Available.getName()
- << " RemainingLatency " << RemLatency << " + "
- << CurrZone.getCurrCycle() << "c > CritPath "
- << Rem.CriticalPath << "\n");
+ LLVM_DEBUG(dbgs() << " " << CurrZone.Available.getName()
+ << " RemainingLatency " << RemLatency << " + "
+ << CurrZone.getCurrCycle() << "c > CritPath "
+ << Rem.CriticalPath << "\n");
}
}
// If the same resource is limiting inside and outside the zone, do nothing.
if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
return;
- DEBUG(
- if (CurrZone.isResourceLimited()) {
- dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: "
- << SchedModel->getResourceName(CurrZone.getZoneCritResIdx())
- << "\n";
- }
- if (OtherResLimited)
- dbgs() << " RemainingLimit: "
- << SchedModel->getResourceName(OtherCritIdx) << "\n";
- if (!CurrZone.isResourceLimited() && !OtherResLimited)
- dbgs() << " Latency limited both directions.\n");
+ LLVM_DEBUG(if (CurrZone.isResourceLimited()) {
+ dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: "
+ << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n";
+ } if (OtherResLimited) dbgs()
+ << " RemainingLimit: "
+ << SchedModel->getResourceName(OtherCritIdx) << "\n";
+ if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs()
+ << " Latency limited both directions.\n");
if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
@@ -2560,11 +2545,12 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
}
#endif
+namespace llvm {
/// Return true if this heuristic determines order.
-static bool tryLess(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal < CandVal) {
TryCand.Reason = Reason;
return true;
@@ -2577,10 +2563,10 @@ static bool tryLess(int TryVal, int CandVal,
return false;
}
-static bool tryGreater(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal > CandVal) {
TryCand.Reason = Reason;
return true;
@@ -2593,9 +2579,9 @@ static bool tryGreater(int TryVal, int CandVal,
return false;
}
-static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- SchedBoundary &Zone) {
+bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone) {
if (Zone.isTop()) {
if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
@@ -2617,10 +2603,11 @@ static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
}
return false;
}
+} // end namespace llvm
static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
- DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
- << GenericSchedulerBase::getReasonStr(Reason) << '\n');
+ LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
+ << GenericSchedulerBase::getReasonStr(Reason) << '\n');
}
static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
@@ -2742,14 +2729,14 @@ void GenericScheduler::checkAcyclicLatency() {
Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
- DEBUG(dbgs() << "IssueCycles="
- << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
- << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
- << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
- << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
- << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
- if (Rem.IsAcyclicLatencyLimited)
- dbgs() << " ACYCLIC LATENCY LIMIT\n");
+ LLVM_DEBUG(
+ dbgs() << "IssueCycles="
+ << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+ << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+ << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount
+ << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+ << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
+ if (Rem.IsAcyclicLatencyLimited) dbgs() << " ACYCLIC LATENCY LIMIT\n");
}
void GenericScheduler::registerRoots() {
@@ -2760,7 +2747,7 @@ void GenericScheduler::registerRoots() {
if (SU->getDepth() > Rem.CriticalPath)
Rem.CriticalPath = SU->getDepth();
}
- DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
+ LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
if (DumpCriticalPathLength) {
errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
}
@@ -2771,13 +2758,14 @@ void GenericScheduler::registerRoots() {
}
}
-static bool tryPressure(const PressureChange &TryP,
- const PressureChange &CandP,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason,
- const TargetRegisterInfo *TRI,
- const MachineFunction &MF) {
+namespace llvm {
+bool tryPressure(const PressureChange &TryP,
+ const PressureChange &CandP,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) {
// If one candidate decreases and the other increases, go with it.
// Invalid candidates have UnitInc==0.
if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
@@ -2810,7 +2798,7 @@ static bool tryPressure(const PressureChange &TryP,
return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
}
-static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+unsigned getWeakLeft(const SUnit *SU, bool isTop) {
return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
}
@@ -2821,7 +2809,7 @@ static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
/// with the operation that produces or consumes the physreg. We'll do this when
/// regalloc has support for parallel copies.
-static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
+int biasPhysRegCopy(const SUnit *SU, bool isTop) {
const MachineInstr *MI = SU->getInstr();
if (!MI->isCopy())
return 0;
@@ -2841,6 +2829,7 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
return AtBoundary ? -1 : 1;
return 0;
}
+} // end namespace llvm
void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
@@ -2873,13 +2862,13 @@ void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
}
}
}
- DEBUG(if (Cand.RPDelta.Excess.isValid())
- dbgs() << " Try SU(" << Cand.SU->NodeNum << ") "
- << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet())
- << ":" << Cand.RPDelta.Excess.getUnitInc() << "\n");
+ LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs()
+ << " Try SU(" << Cand.SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":"
+ << Cand.RPDelta.Excess.getUnitInc() << "\n");
}
-/// Apply a set of heursitics to a new candidate. Heuristics are currently
+/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
/// queue. But it's really done to make the heuristics easier to debug and
@@ -2891,7 +2880,7 @@ void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
// if Cand is from a different zone than TryCand.
void GenericScheduler::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
- SchedBoundary *Zone) {
+ SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
@@ -3017,7 +3006,7 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(DAG, SchedModel);
Cand.setBest(TryCand);
- DEBUG(traceCandidate(Cand));
+ LLVM_DEBUG(traceCandidate(Cand));
}
}
}
@@ -3046,14 +3035,14 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
// See if BotCand is still valid (because we previously scheduled from Top).
- DEBUG(dbgs() << "Picking from Bot:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(BotCand));
+ LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
if (VerifyScheduling) {
SchedCandidate TCand;
@@ -3066,14 +3055,14 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
}
// Check if the top Q has a better candidate.
- DEBUG(dbgs() << "Picking from Top:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(TopCand));
+ LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
if (VerifyScheduling) {
SchedCandidate TCand;
@@ -3093,7 +3082,7 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
tryCandidate(Cand, TopCand, nullptr);
if (TopCand.Reason != NoCand) {
Cand.setBest(TopCand);
- DEBUG(traceCandidate(Cand));
+ LLVM_DEBUG(traceCandidate(Cand));
}
IsTopNode = Cand.AtTop;
@@ -3142,7 +3131,8 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
if (SU->isBottomReady())
Bot.removeReady(SU);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
return SU;
}
@@ -3163,8 +3153,8 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
MachineInstr *Copy = DepSU->getInstr();
if (!Copy->isCopy())
continue;
- DEBUG(dbgs() << " Rescheduling physreg copy ";
- Dep.getSUnit()->dump(DAG));
+ LLVM_DEBUG(dbgs() << " Rescheduling physreg copy ";
+ Dep.getSUnit()->dump(DAG));
DAG->moveInstruction(Copy, InsertPos);
}
}
@@ -3243,13 +3233,13 @@ void PostGenericScheduler::registerRoots() {
if (SU->getDepth() > Rem.CriticalPath)
Rem.CriticalPath = SU->getDepth();
}
- DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
+ LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
if (DumpCriticalPathLength) {
errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
}
}
-/// Apply a set of heursitics to a new candidate for PostRA scheduling.
+/// Apply a set of heuristics to a new candidate for PostRA scheduling.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
@@ -3301,7 +3291,7 @@ void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
tryCandidate(Cand, TryCand);
if (TryCand.Reason != NoCand) {
Cand.setBest(TryCand);
- DEBUG(traceCandidate(Cand));
+ LLVM_DEBUG(traceCandidate(Cand));
}
}
}
@@ -3333,7 +3323,8 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
IsTopNode = true;
Top.removeReady(SU);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
return SU;
}
@@ -3355,7 +3346,7 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
namespace {
-/// \brief Order nodes by the ILP metric.
+/// Order nodes by the ILP metric.
struct ILPOrder {
const SchedDFSResult *DFSResult = nullptr;
const BitVector *ScheduledTrees = nullptr;
@@ -3363,7 +3354,7 @@ struct ILPOrder {
ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
- /// \brief Apply a less-than relation on node priority.
+ /// Apply a less-than relation on node priority.
///
/// (Return true if A comes after B in the Q.)
bool operator()(const SUnit *A, const SUnit *B) const {
@@ -3388,7 +3379,7 @@ struct ILPOrder {
}
};
-/// \brief Schedule based on the ILP metric.
+/// Schedule based on the ILP metric.
class ILPScheduler : public MachineSchedStrategy {
ScheduleDAGMILive *DAG = nullptr;
ILPOrder Cmp;
@@ -3422,16 +3413,19 @@ public:
SUnit *SU = ReadyQ.back();
ReadyQ.pop_back();
IsTopNode = false;
- DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") "
- << " ILP: " << DAG->getDFSResult()->getILP(SU)
- << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @"
- << DAG->getDFSResult()->getSubtreeLevel(
- DAG->getDFSResult()->getSubtreeID(SU)) << '\n'
- << "Scheduling " << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Pick node "
+ << "SU(" << SU->NodeNum << ") "
+ << " ILP: " << DAG->getDFSResult()->getILP(SU)
+ << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU)
+ << " @"
+ << DAG->getDFSResult()->getSubtreeLevel(
+ DAG->getDFSResult()->getSubtreeID(SU))
+ << '\n'
+ << "Scheduling " << *SU->getInstr());
return SU;
}
- /// \brief Scheduler callback to notify that a new subtree is scheduled.
+ /// Scheduler callback to notify that a new subtree is scheduled.
void scheduleTree(unsigned SubtreeID) override {
std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
}
diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp
index bedfdd84b1ca..354f46e9e625 100644
--- a/contrib/llvm/lib/CodeGen/MachineSink.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp
@@ -77,6 +77,7 @@ static cl::opt<unsigned> SplitEdgeProbabilityThreshold(
STATISTIC(NumSunk, "Number of machine instructions sunk");
STATISTIC(NumSplit, "Number of critical edges split");
STATISTIC(NumCoalesces, "Number of copies coalesced");
+STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");
namespace {
@@ -138,7 +139,7 @@ namespace {
MachineBasicBlock *From,
MachineBasicBlock *To);
- /// \brief Postpone the splitting of the given critical
+ /// Postpone the splitting of the given critical
/// edge (\p From, \p To).
///
/// We do not split the edges on the fly. Indeed, this invalidates
@@ -210,8 +211,8 @@ bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
if (DefMI->isCopyLike())
return false;
- DEBUG(dbgs() << "Coalescing: " << *DefMI);
- DEBUG(dbgs() << "*** to: " << MI);
+ LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI);
+ LLVM_DEBUG(dbgs() << "*** to: " << MI);
MRI->replaceRegWith(DstReg, SrcReg);
MI.eraseFromParent();
@@ -295,7 +296,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(dbgs() << "******** Machine Sinking ********\n");
+ LLVM_DEBUG(dbgs() << "******** Machine Sinking ********\n");
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
@@ -322,14 +323,14 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
for (auto &Pair : ToSplit) {
auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, *this);
if (NewSucc != nullptr) {
- DEBUG(dbgs() << " *** Splitting critical edge: "
- << printMBBReference(*Pair.first) << " -- "
- << printMBBReference(*NewSucc) << " -- "
- << printMBBReference(*Pair.second) << '\n');
+ LLVM_DEBUG(dbgs() << " *** Splitting critical edge: "
+ << printMBBReference(*Pair.first) << " -- "
+ << printMBBReference(*NewSucc) << " -- "
+ << printMBBReference(*Pair.second) << '\n');
MadeChange = true;
++NumSplit;
} else
- DEBUG(dbgs() << " *** Not legal to break critical edge\n");
+ LLVM_DEBUG(dbgs() << " *** Not legal to break critical edge\n");
}
// If this iteration over the code changed anything, keep iterating.
if (!MadeChange) break;
@@ -371,7 +372,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
if (!ProcessedBegin)
--I;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
bool Joined = PerformTrivialForwardCoalescing(MI, &MBB);
@@ -708,7 +709,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
return SuccToSinkTo;
}
-/// \brief Return true if MI is likely to be usable as a memory operation by the
+/// Return true if MI is likely to be usable as a memory operation by the
/// implicit null check optimization.
///
/// This is a "best effort" heuristic, and should not be relied upon for
@@ -752,6 +753,37 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
MBP.LHS.getReg() == BaseReg;
}
+/// Sink an instruction and its associated debug instructions.
+static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
+ MachineBasicBlock::iterator InsertPos) {
+ // Collect matching debug values.
+ SmallVector<MachineInstr *, 2> DbgValuesToSink;
+ collectDebugValues(MI, DbgValuesToSink);
+
+ // If we cannot find a location to use (merge with), then we erase the debug
+ // location to prevent debug-info driven tools from potentially reporting
+ // wrong location information.
+ if (!SuccToSinkTo.empty() && InsertPos != SuccToSinkTo.end())
+ MI.setDebugLoc(DILocation::getMergedLocation(MI.getDebugLoc(),
+ InsertPos->getDebugLoc()));
+ else
+ MI.setDebugLoc(DebugLoc());
+
+ // Move the instruction.
+ MachineBasicBlock *ParentBlock = MI.getParent();
+ SuccToSinkTo.splice(InsertPos, ParentBlock, MI,
+ ++MachineBasicBlock::iterator(MI));
+
+ // Move previously adjacent debug value instructions to the insert position.
+ for (SmallVectorImpl<MachineInstr *>::iterator DBI = DbgValuesToSink.begin(),
+ DBE = DbgValuesToSink.end();
+ DBI != DBE; ++DBI) {
+ MachineInstr *DbgMI = *DBI;
+ SuccToSinkTo.splice(InsertPos, ParentBlock, DbgMI,
+ ++MachineBasicBlock::iterator(DbgMI));
+ }
+}
+
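The new performSink helper above either merges the sunk instruction's debug location with the location at the insertion point or drops it. A deliberately simplified standalone sketch of that merge-or-drop rule — here a location is reduced to file:line and "merge" means "keep only if identical", which is stricter than what DILocation::getMergedLocation actually does — might read:

// A debug location reduced to file:line purely for illustration.
#include <optional>
#include <string>
#include <utility>

using Loc = std::pair<std::string, unsigned>;

// Keep the location only when both instructions agree on it; otherwise drop
// it so debuggers and profilers are not pointed at a line the sunk
// instruction no longer corresponds to.
static std::optional<Loc> mergedOrDropped(const std::optional<Loc> &Sunk,
                                          const std::optional<Loc> &AtInsertPos) {
  if (Sunk && AtInsertPos && *Sunk == *AtInsertPos)
    return Sunk;
  return std::nullopt;
}

int main() {
  Loc L1{"foo.c", 10}, L2{"foo.c", 42};
  bool Kept = static_cast<bool>(mergedOrDropped(L1, L1)); // identical: kept
  bool Dropped = !mergedOrDropped(L1, L2);                // differ: dropped
  return (Kept && Dropped) ? 0 : 1;
}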
/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
@@ -803,7 +835,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
return false;
}
- DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo);
+ LLVM_DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo);
// If the block has multiple predecessors, this is a critical edge.
// Decide if we can sink along it or need to break the edge.
@@ -813,26 +845,26 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
bool TryBreak = false;
bool store = true;
if (!MI.isSafeToMove(AA, store)) {
- DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
+ LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
TryBreak = true;
}
// We don't want to sink across a critical edge if we don't dominate the
// successor. We could be introducing calculations to new code paths.
if (!TryBreak && !DT->dominates(ParentBlock, SuccToSinkTo)) {
- DEBUG(dbgs() << " *** NOTE: Critical edge found\n");
+ LLVM_DEBUG(dbgs() << " *** NOTE: Critical edge found\n");
TryBreak = true;
}
// Don't sink instructions into a loop.
if (!TryBreak && LI->isLoopHeader(SuccToSinkTo)) {
- DEBUG(dbgs() << " *** NOTE: Loop header found\n");
+ LLVM_DEBUG(dbgs() << " *** NOTE: Loop header found\n");
TryBreak = true;
}
// Otherwise we are OK with sinking along a critical edge.
if (!TryBreak)
- DEBUG(dbgs() << "Sinking along critical edge.\n");
+ LLVM_DEBUG(dbgs() << "Sinking along critical edge.\n");
else {
// Mark this edge as to be split.
// If the edge can actually be split, the next iteration of the main loop
@@ -840,8 +872,8 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
bool Status =
PostponeSplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge);
if (!Status)
- DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
- "break critical edge\n");
+ LLVM_DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+ "break critical edge\n");
// The instruction will not be sunk this time.
return false;
}
@@ -854,8 +886,8 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
bool Status = PostponeSplitCriticalEdge(MI, ParentBlock,
SuccToSinkTo, BreakPHIEdge);
if (!Status)
- DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
- "break critical edge\n");
+ LLVM_DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+ "break critical edge\n");
// The instruction will not be sunk this time.
return false;
}
@@ -865,30 +897,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI())
++InsertPos;
- // collect matching debug values.
- SmallVector<MachineInstr *, 2> DbgValuesToSink;
- collectDebugValues(MI, DbgValuesToSink);
-
- // Merge or erase debug location to ensure consistent stepping in profilers
- // and debuggers.
- if (!SuccToSinkTo->empty() && InsertPos != SuccToSinkTo->end())
- MI.setDebugLoc(DILocation::getMergedLocation(MI.getDebugLoc(),
- InsertPos->getDebugLoc()));
- else
- MI.setDebugLoc(DebugLoc());
-
-
- // Move the instruction.
- SuccToSinkTo->splice(InsertPos, ParentBlock, MI,
- ++MachineBasicBlock::iterator(MI));
-
- // Move previously adjacent debug value instructions to the insert position.
- for (SmallVectorImpl<MachineInstr *>::iterator DBI = DbgValuesToSink.begin(),
- DBE = DbgValuesToSink.end(); DBI != DBE; ++DBI) {
- MachineInstr *DbgMI = *DBI;
- SuccToSinkTo->splice(InsertPos, ParentBlock, DbgMI,
- ++MachineBasicBlock::iterator(DbgMI));
- }
+ performSink(MI, *SuccToSinkTo, InsertPos);
// Conservatively, clear any kill flags, since it's possible that they are no
// longer correct.
@@ -902,3 +911,282 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
return true;
}
+
+//===----------------------------------------------------------------------===//
+// This pass is not intended to be a replacement or a complete alternative
+// for the pre-RA machine sink pass. It is only designed to sink COPY
+// instructions that should be handled after RA.
+//
+// This pass sinks COPY instructions into a successor block, if the COPY is not
+// used in the current block and the COPY is live-in to a single successor
+// (i.e., doesn't require the COPY to be duplicated). This avoids executing the
+// copy on paths where their results aren't needed. This also exposes
+// additional opportunities for dead copy elimination and shrink wrapping.
+//
+// These copies were either not handled by or are inserted after the MachineSink
+// pass. As an example of the former case, the MachineSink pass cannot sink
+// COPY instructions with allocatable source registers; for AArch64 this type
+// of copy instruction is frequently used to move function parameters (PhysReg)
+// into virtual registers in the entry block.
+//
+// For the machine IR below, this pass will sink %w19 in the entry into its
+// successor (%bb.1) because %w19 is only live-in in %bb.1.
+// %bb.0:
+// %wzr = SUBSWri %w1, 1
+// %w19 = COPY %w0
+// Bcc 11, %bb.2
+// %bb.1:
+// Live Ins: %w19
+// BL @fun
+// %w0 = ADDWrr %w0, %w19
+// RET %w0
+// %bb.2:
+// %w0 = COPY %wzr
+// RET %w0
+// As we sink %w19 (CSR in AArch64) into %bb.1, the shrink-wrapping pass will be
+// able to see %bb.0 as a candidate.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class PostRAMachineSinking : public MachineFunctionPass {
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+ PostRAMachineSinking() : MachineFunctionPass(ID) {}
+ StringRef getPassName() const override { return "PostRA Machine Sink"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ /// Track which register units have been modified and used.
+ LiveRegUnits ModifiedRegUnits, UsedRegUnits;
+
+ /// Sink COPY instructions that are unused in their defining block closer to
+ /// their uses in successor blocks.
+ bool tryToSinkCopy(MachineBasicBlock &BB, MachineFunction &MF,
+ const TargetRegisterInfo *TRI, const TargetInstrInfo *TII);
+};
+} // namespace
+
+char PostRAMachineSinking::ID = 0;
+char &llvm::PostRAMachineSinkingID = PostRAMachineSinking::ID;
+
+INITIALIZE_PASS(PostRAMachineSinking, "postra-machine-sink",
+ "PostRA Machine Sink", false, false)
+
+static bool aliasWithRegsInLiveIn(MachineBasicBlock &MBB, unsigned Reg,
+ const TargetRegisterInfo *TRI) {
+ LiveRegUnits LiveInRegUnits(*TRI);
+ LiveInRegUnits.addLiveIns(MBB);
+ return !LiveInRegUnits.available(Reg);
+}
+
+static MachineBasicBlock *
+getSingleLiveInSuccBB(MachineBasicBlock &CurBB,
+ const SmallPtrSetImpl<MachineBasicBlock *> &SinkableBBs,
+ unsigned Reg, const TargetRegisterInfo *TRI) {
+ // Try to find a single sinkable successor in which Reg is live-in.
+ MachineBasicBlock *BB = nullptr;
+ for (auto *SI : SinkableBBs) {
+ if (aliasWithRegsInLiveIn(*SI, Reg, TRI)) {
+ // If BB is set here, Reg is live-in to at least two sinkable successors,
+ // so quit.
+ if (BB)
+ return nullptr;
+ BB = SI;
+ }
+ }
+ // Reg is not live-in to any sinkable successors.
+ if (!BB)
+ return nullptr;
+
+ // Check if any register aliased with Reg is live-in in other successors.
+ for (auto *SI : CurBB.successors()) {
+ if (!SinkableBBs.count(SI) && aliasWithRegsInLiveIn(*SI, Reg, TRI))
+ return nullptr;
+ }
+ return BB;
+}
+
+static MachineBasicBlock *
+getSingleLiveInSuccBB(MachineBasicBlock &CurBB,
+ const SmallPtrSetImpl<MachineBasicBlock *> &SinkableBBs,
+ ArrayRef<unsigned> DefedRegsInCopy,
+ const TargetRegisterInfo *TRI) {
+ MachineBasicBlock *SingleBB = nullptr;
+ for (auto DefReg : DefedRegsInCopy) {
+ MachineBasicBlock *BB =
+ getSingleLiveInSuccBB(CurBB, SinkableBBs, DefReg, TRI);
+ if (!BB || (SingleBB && SingleBB != BB))
+ return nullptr;
+ SingleBB = BB;
+ }
+ return SingleBB;
+}
+
+static void clearKillFlags(MachineInstr *MI, MachineBasicBlock &CurBB,
+ SmallVectorImpl<unsigned> &UsedOpsInCopy,
+ LiveRegUnits &UsedRegUnits,
+ const TargetRegisterInfo *TRI) {
+ for (auto U : UsedOpsInCopy) {
+ MachineOperand &MO = MI->getOperand(U);
+ unsigned SrcReg = MO.getReg();
+ if (!UsedRegUnits.available(SrcReg)) {
+ MachineBasicBlock::iterator NI = std::next(MI->getIterator());
+ for (MachineInstr &UI : make_range(NI, CurBB.end())) {
+ if (UI.killsRegister(SrcReg, TRI)) {
+ UI.clearRegisterKills(SrcReg, TRI);
+ MO.setIsKill(true);
+ break;
+ }
+ }
+ }
+ }
+}
+
+static void updateLiveIn(MachineInstr *MI, MachineBasicBlock *SuccBB,
+ SmallVectorImpl<unsigned> &UsedOpsInCopy,
+ SmallVectorImpl<unsigned> &DefedRegsInCopy) {
+ for (auto DefReg : DefedRegsInCopy)
+ SuccBB->removeLiveIn(DefReg);
+ for (auto U : UsedOpsInCopy) {
+ unsigned Reg = MI->getOperand(U).getReg();
+ if (!SuccBB->isLiveIn(Reg))
+ SuccBB->addLiveIn(Reg);
+ }
+}
+
+static bool hasRegisterDependency(MachineInstr *MI,
+ SmallVectorImpl<unsigned> &UsedOpsInCopy,
+ SmallVectorImpl<unsigned> &DefedRegsInCopy,
+ LiveRegUnits &ModifiedRegUnits,
+ LiveRegUnits &UsedRegUnits) {
+ bool HasRegDependency = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef()) {
+ if (!ModifiedRegUnits.available(Reg) || !UsedRegUnits.available(Reg)) {
+ HasRegDependency = true;
+ break;
+ }
+ DefedRegsInCopy.push_back(Reg);
+
+ // FIXME: instead of isUse(), readsReg() would be a better fix here; for
+ // example, we could then ignore modifications in a reg with undef. However,
+ // it is not perfectly clear whether skipping the internal read is safe on
+ // all other targets.
+ } else if (MO.isUse()) {
+ if (!ModifiedRegUnits.available(Reg)) {
+ HasRegDependency = true;
+ break;
+ }
+ UsedOpsInCopy.push_back(i);
+ }
+ }
+ return HasRegDependency;
+}
+
+bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
+ MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII) {
+ SmallPtrSet<MachineBasicBlock *, 2> SinkableBBs;
+ // FIXME: For now, we sink only to a successor which has a single predecessor
+ // so that we can directly sink COPY instructions to the successor without
+ // adding any new block or branch instruction.
+ for (MachineBasicBlock *SI : CurBB.successors())
+ if (!SI->livein_empty() && SI->pred_size() == 1)
+ SinkableBBs.insert(SI);
+
+ if (SinkableBBs.empty())
+ return false;
+
+ bool Changed = false;
+
+ // Track which registers have been modified and used between the end of the
+ // block and the current instruction.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+
+ for (auto I = CurBB.rbegin(), E = CurBB.rend(); I != E;) {
+ MachineInstr *MI = &*I;
+ ++I;
+
+ if (MI->isDebugInstr())
+ continue;
+
+ // Do not move any instruction across a function call.
+ if (MI->isCall())
+ return false;
+
+ if (!MI->isCopy() || !MI->getOperand(0).isRenamable()) {
+ LiveRegUnits::accumulateUsedDefed(*MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+
+ // Track the operand index for use in Copy.
+ SmallVector<unsigned, 2> UsedOpsInCopy;
+ // Track the register number defed in Copy.
+ SmallVector<unsigned, 2> DefedRegsInCopy;
+
+ // Don't sink the COPY if it would violate a register dependency.
+ if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
+ ModifiedRegUnits, UsedRegUnits)) {
+ LiveRegUnits::accumulateUsedDefed(*MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+ assert((!UsedOpsInCopy.empty() && !DefedRegsInCopy.empty()) &&
+ "Unexpect SrcReg or DefReg");
+ MachineBasicBlock *SuccBB =
+ getSingleLiveInSuccBB(CurBB, SinkableBBs, DefedRegsInCopy, TRI);
+ // Don't sink if we cannot find a single sinkable successor in which Reg
+ // is live-in.
+ if (!SuccBB) {
+ LiveRegUnits::accumulateUsedDefed(*MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+ assert((SuccBB->pred_size() == 1 && *SuccBB->pred_begin() == &CurBB) &&
+ "Unexpected predecessor");
+
+ // Clear the kill flag if SrcReg is killed between MI and the end of the
+ // block.
+ clearKillFlags(MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI);
+ MachineBasicBlock::iterator InsertPos = SuccBB->getFirstNonPHI();
+ performSink(*MI, *SuccBB, InsertPos);
+ updateLiveIn(MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy);
+
+ Changed = true;
+ ++NumPostRACopySink;
+ }
+ return Changed;
+}
+
+bool PostRAMachineSinking::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ ModifiedRegUnits.init(*TRI);
+ UsedRegUnits.init(*TRI);
+ for (auto &BB : MF)
+ Changed |= tryToSinkCopy(BB, MF, TRI, TII);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index d81c6f8a31e1..b444cd31eba2 100644
--- a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -70,7 +70,7 @@ bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
TRI = ST.getRegisterInfo();
MRI = &MF->getRegInfo();
Loops = &getAnalysis<MachineLoopInfo>();
- SchedModel.init(ST.getSchedModel(), &ST, TII);
+ SchedModel.init(&ST);
BlockInfo.resize(MF->getNumBlockIDs());
ProcResourceCycles.resize(MF->getNumBlockIDs() *
SchedModel.getNumProcResourceKinds());
@@ -396,8 +396,8 @@ MachineTraceMetrics::getEnsemble(MachineTraceMetrics::Strategy strategy) {
}
void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB)
+ << '\n');
BlockInfo[MBB->getNumber()].invalidate();
for (unsigned i = 0; i != TS_NumStrategies; ++i)
if (Ensembles[i])
@@ -477,8 +477,8 @@ public:
/// Compute the trace through MBB.
void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Computing " << getName() << " trace through "
- << printMBBReference(*MBB) << '\n');
+ LLVM_DEBUG(dbgs() << "Computing " << getName() << " trace through "
+ << printMBBReference(*MBB) << '\n');
// Set up loop bounds for the backwards post-order traversal.
LoopBounds Bounds(BlockInfo, MTM.Loops);
@@ -486,11 +486,11 @@ void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
Bounds.Downward = false;
Bounds.Visited.clear();
for (auto I : inverse_post_order_ext(MBB, Bounds)) {
- DEBUG(dbgs() << " pred for " << printMBBReference(*I) << ": ");
+ LLVM_DEBUG(dbgs() << " pred for " << printMBBReference(*I) << ": ");
TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
// All the predecessors have been visited, pick the preferred one.
TBI.Pred = pickTracePred(I);
- DEBUG({
+ LLVM_DEBUG({
if (TBI.Pred)
dbgs() << printMBBReference(*TBI.Pred) << '\n';
else
@@ -504,11 +504,11 @@ void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
Bounds.Downward = true;
Bounds.Visited.clear();
for (auto I : post_order_ext(MBB, Bounds)) {
- DEBUG(dbgs() << " succ for " << printMBBReference(*I) << ": ");
+ LLVM_DEBUG(dbgs() << " succ for " << printMBBReference(*I) << ": ");
TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
// All the successors have been visited, pick the preferred one.
TBI.Succ = pickTraceSucc(I);
- DEBUG({
+ LLVM_DEBUG({
if (TBI.Succ)
dbgs() << printMBBReference(*TBI.Succ) << '\n';
else
@@ -531,8 +531,8 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
WorkList.push_back(BadMBB);
do {
const MachineBasicBlock *MBB = WorkList.pop_back_val();
- DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
- << getName() << " height.\n");
+ LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
+ << getName() << " height.\n");
// Find any MBB predecessors that have MBB as their preferred successor.
// They are the only ones that need to be invalidated.
for (const MachineBasicBlock *Pred : MBB->predecessors()) {
@@ -556,8 +556,8 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
WorkList.push_back(BadMBB);
do {
const MachineBasicBlock *MBB = WorkList.pop_back_val();
- DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
- << getName() << " depth.\n");
+ LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
+ << getName() << " depth.\n");
// Find any MBB successors that have MBB as their preferred predecessor.
// They are the only ones that need to be invalidated.
for (const MachineBasicBlock *Succ : MBB->successors()) {
@@ -653,7 +653,7 @@ static bool getDataDeps(const MachineInstr &UseMI,
SmallVectorImpl<DataDep> &Deps,
const MachineRegisterInfo *MRI) {
// Debug values should not be included in any calculations.
- if (UseMI.isDebugValue())
+ if (UseMI.isDebugInstr())
return false;
bool HasPhysRegs = false;
@@ -813,9 +813,9 @@ updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI,
if (TBI.HasValidInstrHeights) {
// Update critical path length.
TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height);
- DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << UseMI);
+ LLVM_DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << UseMI);
} else {
- DEBUG(dbgs() << Cycle << '\t' << UseMI);
+ LLVM_DEBUG(dbgs() << Cycle << '\t' << UseMI);
}
}
@@ -860,13 +860,13 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
// Go through trace blocks in top-down order, stopping after the center block.
while (!Stack.empty()) {
MBB = Stack.pop_back_val();
- DEBUG(dbgs() << "\nDepths for " << printMBBReference(*MBB) << ":\n");
+ LLVM_DEBUG(dbgs() << "\nDepths for " << printMBBReference(*MBB) << ":\n");
TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
TBI.HasValidInstrDepths = true;
TBI.CriticalPath = 0;
// Print out resource depths here as well.
- DEBUG({
+ LLVM_DEBUG({
dbgs() << format("%7u Instructions\n", TBI.InstrDepth);
ArrayRef<unsigned> PRDepths = getProcResourceDepths(MBB->getNumber());
for (unsigned K = 0; K != PRDepths.size(); ++K)
@@ -1045,12 +1045,12 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
SmallVector<DataDep, 8> Deps;
for (;!Stack.empty(); Stack.pop_back()) {
MBB = Stack.back();
- DEBUG(dbgs() << "Heights for " << printMBBReference(*MBB) << ":\n");
+ LLVM_DEBUG(dbgs() << "Heights for " << printMBBReference(*MBB) << ":\n");
TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
TBI.HasValidInstrHeights = true;
TBI.CriticalPath = 0;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << format("%7u Instructions\n", TBI.InstrHeight);
ArrayRef<unsigned> PRHeights = getProcResourceHeights(MBB->getNumber());
for (unsigned K = 0; K != PRHeights.size(); ++K)
@@ -1081,7 +1081,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
if (!Deps.empty()) {
// Loop header PHI heights are all 0.
unsigned Height = TBI.Succ ? Cycles.lookup(&PHI).Height : 0;
- DEBUG(dbgs() << "pred\t" << Height << '\t' << PHI);
+ LLVM_DEBUG(dbgs() << "pred\t" << Height << '\t' << PHI);
if (pushDepHeight(Deps.front(), PHI, Height, Heights, MTM.SchedModel,
MTM.TII))
addLiveIns(Deps.front().DefMI, Deps.front().DefOp, Stack);
@@ -1122,38 +1122,38 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
InstrCycles &MICycles = Cycles[&MI];
MICycles.Height = Cycle;
if (!TBI.HasValidInstrDepths) {
- DEBUG(dbgs() << Cycle << '\t' << MI);
+ LLVM_DEBUG(dbgs() << Cycle << '\t' << MI);
continue;
}
// Update critical path length.
TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Depth);
- DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << MI);
+ LLVM_DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << MI);
}
// Update virtual live-in heights. They were added by addLiveIns() with a 0
// height because the final height isn't known until now.
- DEBUG(dbgs() << printMBBReference(*MBB) << " Live-ins:");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " Live-ins:");
for (LiveInReg &LIR : TBI.LiveIns) {
const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
LIR.Height = Heights.lookup(DefMI);
- DEBUG(dbgs() << ' ' << printReg(LIR.Reg) << '@' << LIR.Height);
+ LLVM_DEBUG(dbgs() << ' ' << printReg(LIR.Reg) << '@' << LIR.Height);
}
// Transfer the live regunits to the live-in list.
for (SparseSet<LiveRegUnit>::const_iterator
RI = RegUnits.begin(), RE = RegUnits.end(); RI != RE; ++RI) {
TBI.LiveIns.push_back(LiveInReg(RI->RegUnit, RI->Cycle));
- DEBUG(dbgs() << ' ' << printRegUnit(RI->RegUnit, MTM.TRI)
- << '@' << RI->Cycle);
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(RI->RegUnit, MTM.TRI) << '@'
+ << RI->Cycle);
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
if (!TBI.HasValidInstrDepths)
continue;
// Add live-ins to the critical path length.
TBI.CriticalPath = std::max(TBI.CriticalPath,
computeCrossBlockCriticalPath(TBI));
- DEBUG(dbgs() << "Critical path: " << TBI.CriticalPath << '\n');
+ LLVM_DEBUG(dbgs() << "Critical path: " << TBI.CriticalPath << '\n');
}
}
diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
index e0cc2ca9a2a2..d644e41abc5b 100644
--- a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -239,7 +239,8 @@ namespace {
void report(const char *msg, const MachineFunction *MF);
void report(const char *msg, const MachineBasicBlock *MBB);
void report(const char *msg, const MachineInstr *MI);
- void report(const char *msg, const MachineOperand *MO, unsigned MONum);
+ void report(const char *msg, const MachineOperand *MO, unsigned MONum,
+ LLT MOVRegType = LLT{});
void report_context(const LiveInterval &LI) const;
void report_context(const LiveRange &LR, unsigned VRegUnit,
@@ -250,16 +251,16 @@ namespace {
void report_context_liverange(const LiveRange &LR) const;
void report_context_lanemask(LaneBitmask LaneMask) const;
void report_context_vreg(unsigned VReg) const;
- void report_context_vreg_regunit(unsigned VRegOrRegUnit) const;
+ void report_context_vreg_regunit(unsigned VRegOrUnit) const;
void verifyInlineAsm(const MachineInstr *MI);
void checkLiveness(const MachineOperand *MO, unsigned MONum);
void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum,
- SlotIndex UseIdx, const LiveRange &LR, unsigned Reg,
+ SlotIndex UseIdx, const LiveRange &LR, unsigned VRegOrUnit,
LaneBitmask LaneMask = LaneBitmask::getNone());
void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum,
- SlotIndex DefIdx, const LiveRange &LR, unsigned Reg,
+ SlotIndex DefIdx, const LiveRange &LR, unsigned VRegOrUnit,
LaneBitmask LaneMask = LaneBitmask::getNone());
void markReachable(const MachineBasicBlock *MBB);
@@ -359,11 +360,15 @@ unsigned MachineVerifier::verify(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
- isFunctionRegBankSelected = MF.getProperties().hasProperty(
- MachineFunctionProperties::Property::RegBankSelected);
- isFunctionSelected = MF.getProperties().hasProperty(
- MachineFunctionProperties::Property::Selected);
-
+ const bool isFunctionFailedISel = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel);
+ isFunctionRegBankSelected =
+ !isFunctionFailedISel &&
+ MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::RegBankSelected);
+ isFunctionSelected = !isFunctionFailedISel &&
+ MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected);
LiveVars = nullptr;
LiveInts = nullptr;
LiveStks = nullptr;
@@ -486,15 +491,14 @@ void MachineVerifier::report(const char *msg, const MachineInstr *MI) {
if (Indexes && Indexes->hasIndex(*MI))
errs() << Indexes->getInstructionIndex(*MI) << '\t';
MI->print(errs(), /*SkipOpers=*/true);
- errs() << '\n';
}
-void MachineVerifier::report(const char *msg,
- const MachineOperand *MO, unsigned MONum) {
+void MachineVerifier::report(const char *msg, const MachineOperand *MO,
+ unsigned MONum, LLT MOVRegType) {
assert(MO);
report(msg, MO->getParent());
errs() << "- operand " << MONum << ": ";
- MO->print(errs(), TRI);
+ MO->print(errs(), MOVRegType, TRI);
errs() << "\n";
}
@@ -642,7 +646,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
!(AsmInfo &&
AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj &&
BB && isa<SwitchInst>(BB->getTerminator())) &&
- !isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ !isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
report("MBB has more than one landing pad successor", MBB);
// Call AnalyzeBranch. If it succeeds, there several more conditions to check.
@@ -873,11 +877,11 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
if (MI->getNumOperands() < MCID.getNumOperands()) {
report("Too few operands", MI);
errs() << MCID.getNumOperands() << " operands expected, but "
- << MI->getNumOperands() << " given.\n";
+ << MI->getNumOperands() << " given.\n";
}
if (MI->isPHI() && MF->getProperties().hasProperty(
- MachineFunctionProperties::Property::NoPHIs))
+ MachineFunctionProperties::Property::NoPHIs))
report("Found PHI instruction with NoPHIs property set", MI);
// Check the tied operands.
@@ -886,7 +890,8 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
// Check the MachineMemOperands for basic consistency.
for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
- E = MI->memoperands_end(); I != E; ++I) {
+ E = MI->memoperands_end();
+ I != E; ++I) {
if ((*I)->isLoad() && !MI->mayLoad())
report("Missing mayLoad flag", MI);
if ((*I)->isStore() && !MI->mayStore())
@@ -897,7 +902,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
// Other instructions must have one, unless they are inside a bundle.
if (LiveInts) {
bool mapped = !LiveInts->isNotInMIMap(*MI);
- if (MI->isDebugValue()) {
+ if (MI->isDebugInstr()) {
if (mapped)
report("Debug instruction has a slot index", MI);
} else if (MI->isInsideBundle()) {
@@ -909,32 +914,42 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
}
- // Check types.
if (isPreISelGenericOpcode(MCID.getOpcode())) {
if (isFunctionSelected)
report("Unexpected generic instruction in a Selected function", MI);
- // Generic instructions specify equality constraints between some
- // of their operands. Make sure these are consistent.
+ // Check types.
SmallVector<LLT, 4> Types;
- for (unsigned i = 0; i < MCID.getNumOperands(); ++i) {
- if (!MCID.OpInfo[i].isGenericType())
+ for (unsigned I = 0; I < MCID.getNumOperands(); ++I) {
+ if (!MCID.OpInfo[I].isGenericType())
continue;
- size_t TypeIdx = MCID.OpInfo[i].getGenericTypeIndex();
+ // Generic instructions specify type equality constraints between some of
+ // their operands. Make sure these are consistent.
+ size_t TypeIdx = MCID.OpInfo[I].getGenericTypeIndex();
Types.resize(std::max(TypeIdx + 1, Types.size()));
- LLT OpTy = MRI->getType(MI->getOperand(i).getReg());
- if (Types[TypeIdx].isValid() && Types[TypeIdx] != OpTy)
- report("type mismatch in generic instruction", MI);
- Types[TypeIdx] = OpTy;
+ const MachineOperand *MO = &MI->getOperand(I);
+ LLT OpTy = MRI->getType(MO->getReg());
+ // Don't report a type mismatch if there is no actual mismatch, only a
+ // type missing, to reduce noise:
+ if (OpTy.isValid()) {
+ // Only the first valid type for a type index will be printed: don't
+ // overwrite it later so it's always clear which type was expected:
+ if (!Types[TypeIdx].isValid())
+ Types[TypeIdx] = OpTy;
+ else if (Types[TypeIdx] != OpTy)
+ report("Type mismatch in generic instruction", MO, I, OpTy);
+ } else {
+ // Generic instructions must have types attached to their operands.
+ report("Generic instruction is missing a virtual register type", MO, I);
+ }
}
- }
- // Generic opcodes must not have physical register operands.
- if (isPreISelGenericOpcode(MCID.getOpcode())) {
- for (auto &Op : MI->operands()) {
- if (Op.isReg() && TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
- report("Generic instruction cannot have physical register", MI);
+ // Generic opcodes must not have physical register operands.
+ for (unsigned I = 0; I < MI->getNumOperands(); ++I) {
+ const MachineOperand *MO = &MI->getOperand(I);
+ if (MO->isReg() && TargetRegisterInfo::isPhysicalRegister(MO->getReg()))
+ report("Generic instruction cannot have physical register", MO, I);
}
}
@@ -971,6 +986,88 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
MI);
break;
}
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPTRUNC: {
+ // Number of operands and presence of types are already checked (and
+ // reported in case of any issues), so no need to report them again. As
+ // we're trying to report as many issues as possible at once, however, the
+ // instructions aren't guaranteed to have the right number of operands or
+ // types attached to them at this point.
+ assert(MCID.getNumOperands() == 2 && "Expected 2 operands G_*{EXT,TRUNC}");
+ if (MI->getNumOperands() < MCID.getNumOperands())
+ break;
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isValid() || !SrcTy.isValid())
+ break;
+
+ LLT DstElTy = DstTy.isVector() ? DstTy.getElementType() : DstTy;
+ LLT SrcElTy = SrcTy.isVector() ? SrcTy.getElementType() : SrcTy;
+ if (DstElTy.isPointer() || SrcElTy.isPointer())
+ report("Generic extend/truncate can not operate on pointers", MI);
+
+ if (DstTy.isVector() != SrcTy.isVector()) {
+ report("Generic extend/truncate must be all-vector or all-scalar", MI);
+ // Generally we try to report as many issues as possible at once, but in
+ // this case it's not clear what we should be comparing the size of the
+ // scalar with: the size of the whole vector or its lane. Instead of
+ // making an arbitrary choice and emitting a not-so-helpful message, let's
+ // avoid the extra noise and stop here.
+ break;
+ }
+ if (DstTy.isVector() && DstTy.getNumElements() != SrcTy.getNumElements())
+ report("Generic vector extend/truncate must preserve number of lanes",
+ MI);
+ unsigned DstSize = DstElTy.getSizeInBits();
+ unsigned SrcSize = SrcElTy.getSizeInBits();
+ switch (MI->getOpcode()) {
+ default:
+ if (DstSize <= SrcSize)
+ report("Generic extend has destination type no larger than source", MI);
+ break;
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_FPTRUNC:
+ if (DstSize >= SrcSize)
+ report("Generic truncate has destination type no smaller than source",
+ MI);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::COPY: {
+ if (foundErrors)
+ break;
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ LLT DstTy = MRI->getType(DstOp.getReg());
+ LLT SrcTy = MRI->getType(SrcOp.getReg());
+ if (SrcTy.isValid() && DstTy.isValid()) {
+ // If both types are valid, check that the types are the same.
+ if (SrcTy != DstTy) {
+ report("Copy Instruction is illegal with mismatching types", MI);
+ errs() << "Def = " << DstTy << ", Src = " << SrcTy << "\n";
+ }
+ }
+ if (SrcTy.isValid() || DstTy.isValid()) {
+ // If one of them has a valid type, let's just check that they have the same
+ // size.
+ unsigned SrcSize = TRI->getRegSizeInBits(SrcOp.getReg(), *MRI);
+ unsigned DstSize = TRI->getRegSizeInBits(DstOp.getReg(), *MRI);
+ assert(SrcSize && "Expecting size here");
+ assert(DstSize && "Expecting size here");
+ if (SrcSize != DstSize)
+ if (!DstOp.getSubReg() && !SrcOp.getSubReg()) {
+ report("Copy Instruction is illegal with mismatching sizes", MI);
+ errs() << "Def Size = " << DstSize << ", Src Size = " << SrcSize
+ << "\n";
+ }
+ }
+ break;
+ }
case TargetOpcode::STATEPOINT:
if (!MI->getOperand(StatepointOpers::IDPos).isImm() ||
!MI->getOperand(StatepointOpers::NBytesPos).isImm() ||
@@ -1101,12 +1198,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
}
}
}
- if (MO->isRenamable() &&
- ((MO->isDef() && MI->hasExtraDefRegAllocReq()) ||
- (MO->isUse() && MI->hasExtraSrcRegAllocReq()))) {
- report("Illegal isRenamable setting for opcode with extra regalloc "
- "requirements",
- MO, MONum);
+ if (MO->isRenamable()) {
+ if (MRI->isReserved(Reg)) {
+ report("isRenamable set on reserved register", MO, MONum);
+ return;
+ }
+ }
+ if (MI->isDebugValue() && MO->isUse() && !MO->isDebug()) {
+ report("Use-reg is not IsDebug in a DBG_VALUE", MO, MONum);
return;
}
} else {
diff --git a/contrib/llvm/lib/CodeGen/MacroFusion.cpp b/contrib/llvm/lib/CodeGen/MacroFusion.cpp
index e7f426c469a0..62dadbba0c1a 100644
--- a/contrib/llvm/lib/CodeGen/MacroFusion.cpp
+++ b/contrib/llvm/lib/CodeGen/MacroFusion.cpp
@@ -66,11 +66,11 @@ static bool fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU,
if (SI.getSUnit() == &FirstSU)
SI.setLatency(0);
- DEBUG(dbgs() << "Macro fuse: ";
- FirstSU.print(dbgs(), &DAG); dbgs() << " - ";
- SecondSU.print(dbgs(), &DAG); dbgs() << " / ";
- dbgs() << DAG.TII->getName(FirstSU.getInstr()->getOpcode()) << " - " <<
- DAG.TII->getName(SecondSU.getInstr()->getOpcode()) << '\n'; );
+ LLVM_DEBUG(
+ dbgs() << "Macro fuse: "; FirstSU.print(dbgs(), &DAG); dbgs() << " - ";
+ SecondSU.print(dbgs(), &DAG); dbgs() << " / ";
+ dbgs() << DAG.TII->getName(FirstSU.getInstr()->getOpcode()) << " - "
+ << DAG.TII->getName(SecondSU.getInstr()->getOpcode()) << '\n';);
// Make data dependencies from the FirstSU also dependent on the SecondSU to
// prevent them from being scheduled between the FirstSU and the SecondSU.
@@ -80,24 +80,32 @@ static bool fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU,
if (SI.isWeak() || isHazard(SI) ||
SU == &DAG.ExitSU || SU == &SecondSU || SU->isPred(&SecondSU))
continue;
- DEBUG(dbgs() << " Bind ";
- SecondSU.print(dbgs(), &DAG); dbgs() << " - ";
- SU->print(dbgs(), &DAG); dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << " Bind "; SecondSU.print(dbgs(), &DAG);
+ dbgs() << " - "; SU->print(dbgs(), &DAG); dbgs() << '\n';);
DAG.addEdge(SU, SDep(&SecondSU, SDep::Artificial));
}
// Make the FirstSU also dependent on the dependencies of the SecondSU to
// prevent them from being scheduled between the FirstSU and the SecondSU.
- if (&FirstSU != &DAG.EntrySU)
+ if (&FirstSU != &DAG.EntrySU) {
for (const SDep &SI : SecondSU.Preds) {
SUnit *SU = SI.getSUnit();
if (SI.isWeak() || isHazard(SI) || &FirstSU == SU || FirstSU.isSucc(SU))
continue;
- DEBUG(dbgs() << " Bind ";
- SU->print(dbgs(), &DAG); dbgs() << " - ";
- FirstSU.print(dbgs(), &DAG); dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << " Bind "; SU->print(dbgs(), &DAG); dbgs() << " - ";
+ FirstSU.print(dbgs(), &DAG); dbgs() << '\n';);
DAG.addEdge(&FirstSU, SDep(SU, SDep::Artificial));
}
+ // ExitSU comes last by design, which acts like an implicit dependency
+ // between ExitSU and any bottom root in the graph. We should transfer
+ // this to FirstSU as well.
+ if (&SecondSU == &DAG.ExitSU) {
+ for (SUnit &SU : DAG.SUnits) {
+ if (SU.Succs.empty())
+ DAG.addEdge(&FirstSU, SDep(&SU, SDep::Artificial));
+ }
+ }
+ }
++NumFused;
return true;
@@ -105,7 +113,7 @@ static bool fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU,
namespace {
-/// \brief Post-process the DAG to create cluster edges between instrs that may
+/// Post-process the DAG to create cluster edges between instrs that may
/// be fused by the processor into a single operation.
class MacroFusion : public ScheduleDAGMutation {
ShouldSchedulePredTy shouldScheduleAdjacent;
@@ -135,7 +143,7 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
scheduleAdjacentImpl(*DAG, DAG->ExitSU);
}
-/// \brief Implement the fusion of instr pairs in the scheduling DAG,
+/// Implement the fusion of instr pairs in the scheduling DAG,
/// anchored at the instr in AnchorSU.
bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU) {
const MachineInstr &AnchorMI = *AnchorSU.getInstr();
diff --git a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
index 8972867ba083..befa8422d399 100644
--- a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
+++ b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
@@ -45,7 +45,7 @@ namespace {
initializeOptimizePHIsPass(*PassRegistry::getPassRegistry());
}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &Fn) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
index 54c5a940275d..7a5c20000066 100644
--- a/contrib/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
@@ -75,7 +75,7 @@ namespace {
initializePHIEliminationPass(*PassRegistry::getPassRegistry());
}
- bool runOnMachineFunction(MachineFunction &Fn) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
@@ -91,7 +91,7 @@ namespace {
/// register which is used in a PHI node. We map that to the BB the
/// vreg is coming from. This is used later to determine when the vreg
/// is killed in the BB.
- void analyzePHINodes(const MachineFunction& Fn);
+ void analyzePHINodes(const MachineFunction& MF);
/// Split critical edges where necessary for good coalescer performance.
bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -270,7 +270,8 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
IncomingReg = entry;
reusedIncoming = true;
++NumReused;
- DEBUG(dbgs() << "Reusing " << printReg(IncomingReg) << " for " << *MPhi);
+ LLVM_DEBUG(dbgs() << "Reusing " << printReg(IncomingReg) << " for "
+ << *MPhi);
} else {
const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
@@ -295,9 +296,9 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// AfterPHIsIt, so it appears before the current PHICopy.
if (reusedIncoming)
if (MachineInstr *OldKill = VI.findKill(&MBB)) {
- DEBUG(dbgs() << "Remove old kill from " << *OldKill);
+ LLVM_DEBUG(dbgs() << "Remove old kill from " << *OldKill);
LV->removeVirtualRegisterKilled(IncomingReg, *OldKill);
- DEBUG(MBB.dump());
+ LLVM_DEBUG(MBB.dump());
}
// Add information to LiveVariables to know that the incoming value is
@@ -452,7 +453,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
KillInst = FirstTerm;
while (KillInst != opBlock.begin()) {
--KillInst;
- if (KillInst->isDebugValue())
+ if (KillInst->isDebugInstr())
continue;
if (KillInst->readsRegister(SrcReg))
break;
@@ -512,7 +513,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
KillInst = FirstTerm;
while (KillInst != opBlock.begin()) {
--KillInst;
- if (KillInst->isDebugValue())
+ if (KillInst->isDebugInstr())
continue;
if (KillInst->readsRegister(SrcReg))
break;
@@ -593,9 +594,9 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
if (!ShouldSplit && !NoPhiElimLiveOutEarlyExit)
continue;
if (ShouldSplit) {
- DEBUG(dbgs() << printReg(Reg) << " live-out before critical edge "
- << printMBBReference(*PreMBB) << " -> "
- << printMBBReference(MBB) << ": " << *BBI);
+ LLVM_DEBUG(dbgs() << printReg(Reg) << " live-out before critical edge "
+ << printMBBReference(*PreMBB) << " -> "
+ << printMBBReference(MBB) << ": " << *BBI);
}
// If Reg is not live-in to MBB, it means it must be live-in to some
@@ -610,10 +611,12 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
// Check for a loop exiting edge.
if (!ShouldSplit && CurLoop != PreLoop) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Split wouldn't help, maybe avoid loop copies?\n";
- if (PreLoop) dbgs() << "PreLoop: " << *PreLoop;
- if (CurLoop) dbgs() << "CurLoop: " << *CurLoop;
+ if (PreLoop)
+ dbgs() << "PreLoop: " << *PreLoop;
+ if (CurLoop)
+ dbgs() << "CurLoop: " << *CurLoop;
});
// This edge could be entering a loop, exiting a loop, or it could be
// both: Jumping directly from one loop to the header of a sibling
@@ -624,7 +627,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
if (!ShouldSplit && !SplitAllCriticalEdges)
continue;
if (!PreMBB->SplitCriticalEdge(&MBB, *this)) {
- DEBUG(dbgs() << "Failed to split critical edge.\n");
+ LLVM_DEBUG(dbgs() << "Failed to split critical edge.\n");
continue;
}
Changed = true;
diff --git a/contrib/llvm/lib/CodeGen/ParallelCG.cpp b/contrib/llvm/lib/CodeGen/ParallelCG.cpp
index ff8680a0540d..bc3f2a6e9b5a 100644
--- a/contrib/llvm/lib/CodeGen/ParallelCG.cpp
+++ b/contrib/llvm/lib/CodeGen/ParallelCG.cpp
@@ -30,7 +30,7 @@ static void codegen(Module *M, llvm::raw_pwrite_stream &OS,
TargetMachine::CodeGenFileType FileType) {
std::unique_ptr<TargetMachine> TM = TMFactory();
legacy::PassManager CodeGenPasses;
- if (TM->addPassesToEmitFile(CodeGenPasses, OS, FileType))
+ if (TM->addPassesToEmitFile(CodeGenPasses, OS, nullptr, FileType))
report_fatal_error("Failed to setup codegen");
CodeGenPasses.run(*M);
}
@@ -44,7 +44,7 @@ std::unique_ptr<Module> llvm::splitCodeGen(
if (OSs.size() == 1) {
if (!BCOSs.empty())
- WriteBitcodeToFile(M.get(), *BCOSs[0]);
+ WriteBitcodeToFile(*M, *BCOSs[0]);
codegen(M.get(), *OSs[0], TMFactory, FileType);
return M;
}
@@ -66,7 +66,7 @@ std::unique_ptr<Module> llvm::splitCodeGen(
// FIXME: Provide a more direct way to do this in LLVM.
SmallString<0> BC;
raw_svector_ostream BCOS(BC);
- WriteBitcodeToFile(MPart.get(), BCOS);
+ WriteBitcodeToFile(*MPart, BCOS);
if (!BCOSs.empty()) {
BCOSs[ThreadCount]->write(BC.begin(), BC.size());
diff --git a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
index 0957705b19bb..afb4b0a7e174 100644
--- a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
+++ b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
@@ -49,6 +49,7 @@ static bool doesNotGeneratecode(const MachineInstr &MI) {
case TargetOpcode::EH_LABEL:
case TargetOpcode::GC_LABEL:
case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::DBG_LABEL:
return true;
}
}
diff --git a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 1320f9985553..1d058ccfb633 100644
--- a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -202,7 +202,7 @@ namespace {
bool foldImmediate(MachineInstr &MI, SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
- /// \brief Finds recurrence cycles, but only ones that formulated around
+ /// Finds recurrence cycles, but only ones that are formulated around
/// a def operand and a use operand that are tied. If there is a use
/// operand commutable with the tied use operand, find recurrence cycle
/// along that operand as well.
@@ -210,7 +210,7 @@ namespace {
const SmallSet<unsigned, 2> &TargetReg,
RecurrenceCycle &RC);
- /// \brief If copy instruction \p MI is a virtual register copy, track it in
+ /// If copy instruction \p MI is a virtual register copy, track it in
/// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
/// previously seen as a copy, replace the uses of this copy with the
/// previously seen copy's destination register.
@@ -221,7 +221,7 @@ namespace {
/// Is the register \p Reg a non-allocatable physical register?
bool isNAPhysCopy(unsigned Reg);
- /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical
+ /// If copy instruction \p MI is a non-allocatable virtual<->physical
/// register copy, track it in the \p NAPhysToVirtMIs map. If this
/// non-allocatable physical register was previously copied to a virtual
/// register and hasn't been clobbered, the virt->phys copy can be
@@ -232,7 +232,7 @@ namespace {
bool isLoadFoldable(MachineInstr &MI,
SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
- /// \brief Check whether \p MI is understood by the register coalescer
+ /// Check whether \p MI is understood by the register coalescer
/// but may require some rewriting.
bool isCoalescableCopy(const MachineInstr &MI) {
// SubregToRegs are not interesting, because they are already register
@@ -242,7 +242,7 @@ namespace {
MI.isExtractSubreg()));
}
- /// \brief Check whether \p MI is a copy like instruction that is
+ /// Check whether \p MI is a copy like instruction that is
/// not recognized by the register coalescer.
bool isUncoalescableCopy(const MachineInstr &MI) {
return MI.isBitcast() ||
@@ -345,7 +345,7 @@ namespace {
}
};
- /// \brief Helper class to track the possible sources of a value defined by
+ /// Helper class to track the possible sources of a value defined by
/// a (chain of) copy related instructions.
/// Given a definition (instruction and definition index), this class
/// follows the use-def chain to find successive suitable sources.
@@ -425,7 +425,7 @@ namespace {
}
}
- /// \brief Following the use-def chain, get the next available source
+ /// Following the use-def chain, get the next available source
/// for the tracked value.
/// \return A ValueTrackerResult containing a set of registers
/// and sub registers with tracked values. A ValueTrackerResult with
@@ -646,7 +646,7 @@ bool PeepholeOptimizer::optimizeCondBranch(MachineInstr &MI) {
return TII->optimizeCondBranch(MI);
}
-/// \brief Try to find the next source that share the same register file
+/// Try to find the next source that shares the same register file
/// for the value defined by \p Reg and \p SubReg.
/// When true is returned, the \p RewriteMap can be used by the client to
/// retrieve all Def -> Use along the way up to the next source. Any found
@@ -696,7 +696,8 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
// An existent entry with multiple sources is a PHI cycle we must avoid.
// Otherwise it's an entry with a valid next source we already found.
if (CurSrcRes.getNumSources() > 1) {
- DEBUG(dbgs() << "findNextSource: found PHI cycle, aborting...\n");
+ LLVM_DEBUG(dbgs()
+ << "findNextSource: found PHI cycle, aborting...\n");
return false;
}
break;
@@ -709,7 +710,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
if (NumSrcs > 1) {
PHICount++;
if (PHICount >= RewritePHILimit) {
- DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
+ LLVM_DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
return false;
}
@@ -746,7 +747,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
return CurSrcPair.Reg != Reg;
}
-/// \brief Insert a PHI instruction with incoming edges \p SrcRegs that are
+/// Insert a PHI instruction with incoming edges \p SrcRegs that are
/// guaranteed to have the same register class. This is necessary whenever we
/// successfully traverse a PHI instruction and find suitable sources coming
/// from its edges. By inserting a new PHI, we provide a rewritten PHI def
@@ -791,7 +792,7 @@ public:
Rewriter(MachineInstr &CopyLike) : CopyLike(CopyLike) {}
virtual ~Rewriter() {}
- /// \brief Get the next rewritable source (SrcReg, SrcSubReg) and
+ /// Get the next rewritable source (SrcReg, SrcSubReg) and
/// the related value that it affects (DstReg, DstSubReg).
/// A source is considered rewritable if its register class and the
/// register class of the related DstReg may not be register
@@ -859,7 +860,7 @@ public:
}
};
-/// \brief Helper class to rewrite uncoalescable copy like instructions
+/// Helper class to rewrite uncoalescable copy like instructions
/// into new COPY (coalescable friendly) instructions.
class UncoalescableRewriter : public Rewriter {
unsigned NumDefs; ///< Number of defs in the bitcast.
@@ -1101,7 +1102,7 @@ static Rewriter *getCopyRewriter(MachineInstr &MI, const TargetInstrInfo &TII) {
}
}
-/// \brief Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find
+/// Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find
/// the new source to use for rewrite. If \p HandleMultipleSources is true and
/// multiple sources for a given \p Def are found along the way, we found a
/// PHI instructions that needs to be rewritten.
@@ -1143,9 +1144,9 @@ getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
// Build the new PHI node and return its def register as the new source.
MachineInstr &OrigPHI = const_cast<MachineInstr &>(*Res.getInst());
MachineInstr &NewPHI = insertPHI(*MRI, *TII, NewPHISrcs, OrigPHI);
- DEBUG(dbgs() << "-- getNewSource\n");
- DEBUG(dbgs() << " Replacing: " << OrigPHI);
- DEBUG(dbgs() << " With: " << NewPHI);
+ LLVM_DEBUG(dbgs() << "-- getNewSource\n");
+ LLVM_DEBUG(dbgs() << " Replacing: " << OrigPHI);
+ LLVM_DEBUG(dbgs() << " With: " << NewPHI);
const MachineOperand &MODef = NewPHI.getOperand(0);
return RegSubRegPair(MODef.getReg(), MODef.getSubReg());
}
@@ -1213,7 +1214,7 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) {
return Changed;
}
-/// \brief Rewrite the source found through \p Def, by using the \p RewriteMap
+/// Rewrite the source found through \p Def, by using the \p RewriteMap
/// and create a new COPY instruction. More info about RewriteMap in
/// PeepholeOptimizer::findNextSource. Right now this is only used to handle
/// Uncoalescable copies, since they are copy like instructions that aren't
@@ -1241,9 +1242,9 @@ PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike,
NewCopy->getOperand(0).setIsUndef();
}
- DEBUG(dbgs() << "-- RewriteSource\n");
- DEBUG(dbgs() << " Replacing: " << CopyLike);
- DEBUG(dbgs() << " With: " << *NewCopy);
+ LLVM_DEBUG(dbgs() << "-- RewriteSource\n");
+ LLVM_DEBUG(dbgs() << " Replacing: " << CopyLike);
+ LLVM_DEBUG(dbgs() << " With: " << *NewCopy);
MRI->replaceRegWith(Def.Reg, NewVReg);
MRI->clearKillFlags(NewVReg);
@@ -1254,7 +1255,7 @@ PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike,
return *NewCopy;
}
-/// \brief Optimize copy-like instructions to create
+/// Optimize copy-like instructions to create
/// register coalescer friendly instruction.
/// The optimization tries to kill-off the \p MI by looking
/// through a chain of copies to find a source that has a compatible
@@ -1462,7 +1463,8 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
if (PrevCopy == NAPhysToVirtMIs.end()) {
// We can't remove the copy: there was an intervening clobber of the
// non-allocatable physical register after the copy to virtual.
- DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << MI);
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing "
+ << MI);
return false;
}
@@ -1470,7 +1472,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
if (PrevDstReg == SrcReg) {
// Remove the virt->phys copy: we saw the virtual register definition, and
// the non-allocatable physical register's state hasn't changed since then.
- DEBUG(dbgs() << "NAPhysCopy: erasing " << MI);
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: erasing " << MI);
++NumNAPhysCopies;
return true;
}
@@ -1479,7 +1481,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
// register get a copy of the non-allocatable physical register, and we only
// track one such copy. Avoid getting confused by this new non-allocatable
// physical register definition, and remove it from the tracked copies.
- DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << MI);
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << MI);
NAPhysToVirtMIs.erase(PrevCopy);
return false;
}
@@ -1575,15 +1577,15 @@ bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
// Commutes operands of instructions in RC if necessary so that the copy to
// be generated from PHI can be coalesced.
- DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
+ LLVM_DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
for (auto &RI : RC) {
- DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+ LLVM_DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
auto CP = RI.getCommutePair();
if (CP) {
Changed = true;
TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
(*CP).second);
- DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+ LLVM_DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
}
}
}
@@ -1595,8 +1597,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(dbgs() << "********** PEEPHOLE OPTIMIZER **********\n");
- DEBUG(dbgs() << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** PEEPHOLE OPTIMIZER **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << MF.getName() << '\n');
if (DisablePeephole)
return false;
@@ -1643,8 +1645,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
++MII;
LocalMIs.insert(MI);
- // Skip debug values. They should not affect this peephole optimization.
- if (MI->isDebugValue())
+ // Skip debug instructions. They should not affect this peephole optimization.
+ if (MI->isDebugInstr())
continue;
if (MI->isPosition())
@@ -1667,7 +1669,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (Def != NAPhysToVirtMIs.end()) {
// A new definition of the non-allocatable physical register
// invalidates previous copies.
- DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI);
+ LLVM_DEBUG(dbgs()
+ << "NAPhysCopy: invalidating because of " << *MI);
NAPhysToVirtMIs.erase(Def);
}
}
@@ -1676,7 +1679,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
for (auto &RegMI : NAPhysToVirtMIs) {
unsigned Def = RegMI.first;
if (MachineOperand::clobbersPhysReg(RegMask, Def)) {
- DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI);
+ LLVM_DEBUG(dbgs()
+ << "NAPhysCopy: invalidating because of " << *MI);
NAPhysToVirtMIs.erase(Def);
}
}
@@ -1692,7 +1696,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
// don't know what's correct anymore.
//
// FIXME: handle explicit asm clobbers.
- DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI);
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to "
+ << *MI);
NAPhysToVirtMIs.clear();
}
@@ -1768,8 +1773,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
// Update LocalMIs since we replaced MI with FoldMI and deleted
// DefMI.
- DEBUG(dbgs() << "Replacing: " << *MI);
- DEBUG(dbgs() << " With: " << *FoldMI);
+ LLVM_DEBUG(dbgs() << "Replacing: " << *MI);
+ LLVM_DEBUG(dbgs() << " With: " << *FoldMI);
LocalMIs.erase(MI);
LocalMIs.erase(DefMI);
LocalMIs.insert(FoldMI);
@@ -1791,7 +1796,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
// the load candidates. Note: We might be able to fold *into* this
// instruction, so this needs to be after the folding logic.
if (MI->isLoadFoldBarrier()) {
- DEBUG(dbgs() << "Encountered load fold barrier on " << *MI);
+ LLVM_DEBUG(dbgs() << "Encountered load fold barrier on " << *MI);
FoldAsLoadDefCandidates.clear();
}
}
diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
index 5d86faafdd85..215da630caf4 100644
--- a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
+++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -38,6 +38,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -242,11 +243,11 @@ void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb,
/// Print the schedule before exiting the region.
void SchedulePostRATDList::exitRegion() {
- DEBUG({
- dbgs() << "*** Final schedule ***\n";
- dumpSchedule();
- dbgs() << '\n';
- });
+ LLVM_DEBUG({
+ dbgs() << "*** Final schedule ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
ScheduleDAGInstrs::exitRegion();
}
@@ -308,7 +309,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
: TargetSubtargetInfo::ANTIDEP_NONE);
}
- DEBUG(dbgs() << "PostRAScheduler\n");
+ LLVM_DEBUG(dbgs() << "PostRAScheduler\n");
SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode,
CriticalPathRCs);
@@ -412,13 +413,12 @@ void SchedulePostRATDList::schedule() {
postprocessDAG();
- DEBUG(dbgs() << "********** List Scheduling **********\n");
- DEBUG(
- for (const SUnit &SU : SUnits) {
- SU.dumpAll(this);
- dbgs() << '\n';
- }
- );
+ LLVM_DEBUG(dbgs() << "********** List Scheduling **********\n");
+ LLVM_DEBUG(for (const SUnit &SU
+ : SUnits) {
+ SU.dumpAll(this);
+ dbgs() << '\n';
+ });
AvailableQueue.initNodes(SUnits);
ListScheduleTopDown();
@@ -501,8 +501,8 @@ void SchedulePostRATDList::ReleaseSuccessors(SUnit *SU) {
/// count of its successors. If a successor pending count is zero, add it to
/// the Available queue.
void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
- DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
- DEBUG(SU->dump(this));
+ LLVM_DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(SU->dump(this));
Sequence.push_back(SU);
assert(CurCycle >= SU->getDepth() &&
@@ -516,7 +516,7 @@ void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
/// emitNoop - Add a noop to the current instruction sequence.
void SchedulePostRATDList::emitNoop(unsigned CurCycle) {
- DEBUG(dbgs() << "*** Emitting noop in cycle " << CurCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Emitting noop in cycle " << CurCycle << '\n');
HazardRec->EmitNoop();
Sequence.push_back(nullptr); // NULL here means noop
++NumNoops;
@@ -568,7 +568,8 @@ void SchedulePostRATDList::ListScheduleTopDown() {
MinDepth = PendingQueue[i]->getDepth();
}
- DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this));
+ LLVM_DEBUG(dbgs() << "\n*** Examining Available\n";
+ AvailableQueue.dump(this));
SUnit *FoundSUnit = nullptr, *NotPreferredSUnit = nullptr;
bool HasNoopHazards = false;
@@ -604,7 +605,8 @@ void SchedulePostRATDList::ListScheduleTopDown() {
// non-preferred node.
if (NotPreferredSUnit) {
if (!FoundSUnit) {
- DEBUG(dbgs() << "*** Will schedule a non-preferred instruction...\n");
+ LLVM_DEBUG(
+ dbgs() << "*** Will schedule a non-preferred instruction...\n");
FoundSUnit = NotPreferredSUnit;
} else {
AvailableQueue.push(NotPreferredSUnit);
@@ -631,19 +633,20 @@ void SchedulePostRATDList::ListScheduleTopDown() {
HazardRec->EmitInstruction(FoundSUnit);
CycleHasInsts = true;
if (HazardRec->atIssueLimit()) {
- DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle
+ << '\n');
HazardRec->AdvanceCycle();
++CurCycle;
CycleHasInsts = false;
}
} else {
if (CycleHasInsts) {
- DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
HazardRec->AdvanceCycle();
} else if (!HasNoopHazards) {
// Otherwise, we have a pipeline stall, but no other problem,
// just advance the current cycle and try again.
- DEBUG(dbgs() << "*** Stall in cycle " << CurCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Stall in cycle " << CurCycle << '\n');
HazardRec->AdvanceCycle();
++NumStalls;
} else {
diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
index 48b48c5f6499..7e9b4af12ee9 100644
--- a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -44,7 +44,7 @@ public:
void getAnalysisUsage(AnalysisUsage &au) const override;
- bool runOnMachineFunction(MachineFunction &fn) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
};
} // end anonymous namespace
@@ -73,7 +73,7 @@ bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) {
}
void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
- DEBUG(dbgs() << "Processing " << *MI);
+ LLVM_DEBUG(dbgs() << "Processing " << *MI);
unsigned Reg = MI->getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
@@ -84,7 +84,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
MachineInstr *UserMI = MO.getParent();
if (!canTurnIntoImplicitDef(UserMI))
continue;
- DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI);
+ LLVM_DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI);
UserMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
WorkList.insert(UserMI);
}
@@ -116,7 +116,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
// If we found the using MI, we can erase the IMPLICIT_DEF.
if (Found) {
- DEBUG(dbgs() << "Physreg user: " << *UserMI);
+ LLVM_DEBUG(dbgs() << "Physreg user: " << *UserMI);
MI->eraseFromParent();
return;
}
@@ -125,15 +125,15 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
// Leave the physreg IMPLICIT_DEF, but trim any extra operands.
for (unsigned i = MI->getNumOperands() - 1; i; --i)
MI->RemoveOperand(i);
- DEBUG(dbgs() << "Keeping physreg: " << *MI);
+ LLVM_DEBUG(dbgs() << "Keeping physreg: " << *MI);
}
/// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into
/// <undef> operands.
bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
+ << "********** Function: " << MF.getName() << '\n');
bool Changed = false;
@@ -154,8 +154,8 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
if (WorkList.empty())
continue;
- DEBUG(dbgs() << printMBBReference(*MFI) << " has " << WorkList.size()
- << " implicit defs.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MFI) << " has " << WorkList.size()
+ << " implicit defs.\n");
Changed = true;
// Drain the WorkList to recursively process any new implicit defs.
diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index a8d8ad8ac7dc..fc62c8caf59e 100644
--- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -38,7 +38,6 @@
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -90,7 +89,7 @@ public:
/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
/// frame indexes with appropriate references.
- bool runOnMachineFunction(MachineFunction &Fn) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
private:
RegScavenger *RS;
@@ -117,15 +116,15 @@ private:
// Emit remarks.
MachineOptimizationRemarkEmitter *ORE = nullptr;
- void calculateCallFrameInfo(MachineFunction &Fn);
- void calculateSaveRestoreBlocks(MachineFunction &Fn);
+ void calculateCallFrameInfo(MachineFunction &MF);
+ void calculateSaveRestoreBlocks(MachineFunction &MF);
void spillCalleeSavedRegs(MachineFunction &MF);
- void calculateFrameObjectOffsets(MachineFunction &Fn);
- void replaceFrameIndices(MachineFunction &Fn);
- void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+ void calculateFrameObjectOffsets(MachineFunction &MF);
+ void replaceFrameIndices(MachineFunction &MF);
+ void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
int &SPAdj);
- void insertPrologEpilogCode(MachineFunction &Fn);
+ void insertPrologEpilogCode(MachineFunction &MF);
};
} // end anonymous namespace
@@ -143,7 +142,6 @@ INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false,
false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(StackProtector)
INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
INITIALIZE_PASS_END(PEI, DEBUG_TYPE,
"Prologue/Epilogue Insertion & Frame Finalization", false,
@@ -160,7 +158,6 @@ void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addPreserved<MachineLoopInfo>();
AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<StackProtector>();
AU.addRequired<MachineOptimizationRemarkEmitterPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -170,36 +167,36 @@ using StackObjSet = SmallSetVector<int, 8>;
/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
/// frame indexes with appropriate references.
-bool PEI::runOnMachineFunction(MachineFunction &Fn) {
- const Function &F = Fn.getFunction();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+bool PEI::runOnMachineFunction(MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr;
- FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
+ RS = TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr;
+ FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(MF);
FrameIndexEliminationScavenging = (RS && !FrameIndexVirtualScavenging) ||
- TRI->requiresFrameIndexReplacementScavenging(Fn);
+ TRI->requiresFrameIndexReplacementScavenging(MF);
ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
// Calculate the MaxCallFrameSize and AdjustsStack variables for the
// function's frame information. Also eliminates call frame pseudo
// instructions.
- calculateCallFrameInfo(Fn);
+ calculateCallFrameInfo(MF);
// Determine placement of CSR spill/restore code and prolog/epilog code:
// place all spills in the entry block, all restores in return blocks.
- calculateSaveRestoreBlocks(Fn);
+ calculateSaveRestoreBlocks(MF);
// Handle CSR spilling and restoring, for targets that need it.
- if (Fn.getTarget().usesPhysRegsForPEI())
- spillCalleeSavedRegs(Fn);
+ if (MF.getTarget().usesPhysRegsForPEI())
+ spillCalleeSavedRegs(MF);
// Allow the target machine to make final modifications to the function
// before the frame layout is finalized.
- TFI->processFunctionBeforeFrameFinalized(Fn, RS);
+ TFI->processFunctionBeforeFrameFinalized(MF, RS);
// Calculate actual frame offsets for all abstract stack objects...
- calculateFrameObjectOffsets(Fn);
+ calculateFrameObjectOffsets(MF);
// Add prolog and epilog code to the function. This function is required
// to align the stack frame as necessary for any stack variables or
@@ -207,26 +204,32 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// must be called before this function in order to set the AdjustsStack
// and MaxCallFrameSize variables.
if (!F.hasFnAttribute(Attribute::Naked))
- insertPrologEpilogCode(Fn);
+ insertPrologEpilogCode(MF);
// Replace all MO_FrameIndex operands with physical register references
// and actual offsets.
//
- replaceFrameIndices(Fn);
+ replaceFrameIndices(MF);
// If register scavenging is needed, as we've enabled doing it as a
// post-pass, scavenge the virtual registers that frame index elimination
// inserted.
- if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging)
- scavengeFrameVirtualRegs(Fn, *RS);
+ if (TRI->requiresRegisterScavenging(MF) && FrameIndexVirtualScavenging)
+ scavengeFrameVirtualRegs(MF, *RS);
// Warn on stack size when it exceeds the given limit.
- MachineFrameInfo &MFI = Fn.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
uint64_t StackSize = MFI.getStackSize();
if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
DiagnosticInfoStackSize DiagStackSize(F, StackSize);
F.getContext().diagnose(DiagStackSize);
}
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "StackSize",
+ MF.getFunction().getSubprogram(),
+ &MF.front())
+ << ore::NV("NumStackBytes", StackSize) << " stack bytes in function";
+ });
delete RS;
SaveBlocks.clear();
@@ -239,10 +242,10 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
/// Calculate the MaxCallFrameSize and AdjustsStack
/// variables for the function's frame information and eliminate call frame
/// pseudo instructions.
-void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- MachineFrameInfo &MFI = Fn.getFrameInfo();
+void PEI::calculateCallFrameInfo(MachineFunction &MF) {
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned MaxCallFrameSize = 0;
bool AdjustsStack = MFI.adjustsStack();
@@ -257,7 +260,7 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
return;
std::vector<MachineBasicBlock::iterator> FrameSDOps;
- for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
if (TII.isFrameInstr(*I)) {
unsigned Size = TII.getFrameSize(*I);
@@ -285,15 +288,15 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
// the target doesn't indicate otherwise, remove the call frame pseudos
// here. The sub/add sp instruction pairs are still inserted, but we don't
// need to track the SP adjustment for frame index elimination.
- if (TFI->canSimplifyCallFramePseudos(Fn))
- TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+ if (TFI->canSimplifyCallFramePseudos(MF))
+ TFI->eliminateCallFramePseudoInstr(MF, *I->getParent(), I);
}
}
/// Compute the sets of entry and return blocks for saving and restoring
/// callee-saved registers, and placing prolog and epilog code.
-void PEI::calculateSaveRestoreBlocks(MachineFunction &Fn) {
- const MachineFrameInfo &MFI = Fn.getFrameInfo();
+void PEI::calculateSaveRestoreBlocks(MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
// Even when we do not change any CSR, we still want to insert the
// prologue and epilogue of the function.
@@ -313,8 +316,8 @@ void PEI::calculateSaveRestoreBlocks(MachineFunction &Fn) {
}
// Save refs to entry and return blocks.
- SaveBlocks.push_back(&Fn.front());
- for (MachineBasicBlock &MBB : Fn) {
+ SaveBlocks.push_back(&MF.front());
+ for (MachineBasicBlock &MBB : MF) {
if (MBB.isEHFuncletEntry())
SaveBlocks.push_back(&MBB);
if (MBB.isReturnBlock())
@@ -457,10 +460,10 @@ static void updateLiveness(MachineFunction &MF) {
/// Insert spill code for the callee-saved registers used in the function.
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI) {
- MachineFunction &Fn = *SaveBlock.getParent();
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ MachineFunction &MF = *SaveBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
MachineBasicBlock::iterator I = SaveBlock.begin();
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
@@ -477,10 +480,10 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
/// Insert restore code for the callee-saved registers used in the function.
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
std::vector<CalleeSavedInfo> &CSI) {
- MachineFunction &Fn = *RestoreBlock.getParent();
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ MachineFunction &MF = *RestoreBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
// Restore all registers immediately before the return and any
// terminators that precede it.
@@ -499,27 +502,27 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
}
}
-void PEI::spillCalleeSavedRegs(MachineFunction &Fn) {
+void PEI::spillCalleeSavedRegs(MachineFunction &MF) {
// We can't list this requirement in getRequiredProperties because some
// targets (WebAssembly) use virtual registers past this point, and the pass
// pipeline is set up without giving the passes a chance to look at the
// TargetMachine.
// FIXME: Find a way to express this in getRequiredProperties.
- assert(Fn.getProperties().hasProperty(
+ assert(MF.getProperties().hasProperty(
MachineFunctionProperties::Property::NoVRegs));
- const Function &F = Fn.getFunction();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- MachineFrameInfo &MFI = Fn.getFrameInfo();
+ const Function &F = MF.getFunction();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
MinCSFrameIndex = std::numeric_limits<unsigned>::max();
MaxCSFrameIndex = 0;
// Determine which of the registers in the callee save list should be saved.
BitVector SavedRegs;
- TFI->determineCalleeSaves(Fn, SavedRegs, RS);
+ TFI->determineCalleeSaves(MF, SavedRegs, RS);
// Assign stack slots for any callee-saved registers that must be spilled.
- assignCalleeSavedSpillSlots(Fn, SavedRegs, MinCSFrameIndex, MaxCSFrameIndex);
+ assignCalleeSavedSpillSlots(MF, SavedRegs, MinCSFrameIndex, MaxCSFrameIndex);
// Add the code to save and restore the callee saved registers.
if (!F.hasFnAttribute(Attribute::Naked)) {
@@ -531,7 +534,7 @@ void PEI::spillCalleeSavedRegs(MachineFunction &Fn) {
insertCSRSaves(*SaveBlock, CSI);
// Update the live-in information of all the blocks up to the save
// point.
- updateLiveness(Fn);
+ updateLiveness(MF);
}
for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
insertCSRRestores(*RestoreBlock, CSI);
@@ -558,10 +561,12 @@ AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
Offset = alignTo(Offset, Align, Skew);
if (StackGrowsDown) {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset
+ << "]\n");
MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
} else {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset
+ << "]\n");
MFI.setObjectOffset(FrameIdx, Offset);
Offset += MFI.getObjectSize(FrameIdx);
}
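
Much of the mechanical churn in this import is the rename of the DEBUG macro to LLVM_DEBUG. A minimal, self-contained sketch of the macro's usage, assuming an illustrative DEBUG_TYPE value (the real one is defined near the top of each file); output appears only in asserts-enabled builds run with -debug or -debug-only=<DEBUG_TYPE>:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

#define DEBUG_TYPE "prologepilog" // illustrative value for this sketch

static void reportFrameIndexOffset(int FrameIdx, int64_t Offset) {
  // Compiles away entirely when NDEBUG is defined.
  LLVM_DEBUG(llvm::dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset
                          << "]\n");
}
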
@@ -654,12 +659,12 @@ static inline bool scavengeStackSlot(MachineFrameInfo &MFI, int FrameIdx,
if (StackGrowsDown) {
int ObjStart = -(FreeStart + ObjSize);
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP[" << ObjStart
- << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP["
+ << ObjStart << "]\n");
MFI.setObjectOffset(FrameIdx, ObjStart);
} else {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP[" << FreeStart
- << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP["
+ << FreeStart << "]\n");
MFI.setObjectOffset(FrameIdx, FreeStart);
}
@@ -685,15 +690,14 @@ AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
/// abstract stack objects.
-void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
- StackProtector *SP = &getAnalysis<StackProtector>();
+void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
bool StackGrowsDown =
TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
// Loop over all of the stack objects, assigning sequential addresses...
- MachineFrameInfo &MFI = Fn.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
// Start at the beginning of the local area.
// The Offset is the distance from the stack top in the direction
@@ -706,7 +710,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
int64_t Offset = LocalAreaOffset;
// Skew to be applied to alignment.
- unsigned Skew = TFI.getStackAlignmentSkew(Fn);
+ unsigned Skew = TFI.getStackAlignmentSkew(MF);
// If there are fixed sized objects that are preallocated in the local area,
// non-fixed objects can't be allocated right at the start of local area.
@@ -739,7 +743,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Adjust to alignment boundary
Offset = alignTo(Offset, Align, Skew);
- DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << -Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << -Offset << "]\n");
MFI.setObjectOffset(i, -Offset); // Set the computed offset
}
} else if (MaxCSFrameIndex >= MinCSFrameIndex) {
@@ -752,7 +756,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Adjust to alignment boundary
Offset = alignTo(Offset, Align, Skew);
- DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << Offset << "]\n");
MFI.setObjectOffset(i, Offset);
Offset += MFI.getObjectSize(i);
}
@@ -766,11 +770,11 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Make sure the special register scavenging spill slot is closest to the
// incoming stack pointer if a frame pointer is required and is closer
// to the incoming rather than the final stack pointer.
- const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
- bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ bool EarlyScavengingSlots = (TFI.hasFP(MF) &&
TFI.isFPCloseToIncomingSP() &&
- RegInfo->useFPForScavengingIndex(Fn) &&
- !RegInfo->needsStackRealignment(Fn));
+ RegInfo->useFPForScavengingIndex(MF) &&
+ !RegInfo->needsStackRealignment(MF));
if (RS && EarlyScavengingSlots) {
SmallVector<int, 2> SFIs;
RS->getScavengingFrameIndices(SFIs);
@@ -789,14 +793,14 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Adjust to alignment boundary.
Offset = alignTo(Offset, Align, Skew);
- DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+ LLVM_DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
// Resolve offsets for objects in the local block.
for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
- DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
- FIOffset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << FIOffset
+ << "]\n");
MFI.setObjectOffset(Entry.first, FIOffset);
}
// Allocate the local block
@@ -807,7 +811,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Retrieve the Exception Handler registration node.
int EHRegNodeFrameIndex = std::numeric_limits<int>::max();
- if (const WinEHFuncInfo *FuncInfo = Fn.getWinEHFuncInfo())
+ if (const WinEHFuncInfo *FuncInfo = MF.getWinEHFuncInfo())
EHRegNodeFrameIndex = FuncInfo->EHRegNodeFrameIndex;
// Make sure that the stack protector comes before the local variables on the
@@ -836,16 +840,16 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
EHRegNodeFrameIndex == (int)i)
continue;
- switch (SP->getSSPLayout(MFI.getObjectAllocation(i))) {
- case StackProtector::SSPLK_None:
+ switch (MFI.getObjectSSPLayout(i)) {
+ case MachineFrameInfo::SSPLK_None:
continue;
- case StackProtector::SSPLK_SmallArray:
+ case MachineFrameInfo::SSPLK_SmallArray:
SmallArrayObjs.insert(i);
continue;
- case StackProtector::SSPLK_AddrOf:
+ case MachineFrameInfo::SSPLK_AddrOf:
AddrOfObjs.insert(i);
continue;
- case StackProtector::SSPLK_LargeArray:
+ case MachineFrameInfo::SSPLK_LargeArray:
LargeArrayObjs.insert(i);
continue;
}
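
This hunk replaces the query of the StackProtector IR analysis with the layout kind that is now recorded on MachineFrameInfo. A minimal sketch of bucketing frame objects through that interface; the container type and function name are illustrative, not the pass's actual StackObjSet:

#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFrameInfo.h"

// Sketch only: group stack objects by their stack-protector layout kind.
static void classifyBySSPLayout(const llvm::MachineFrameInfo &MFI,
                                llvm::SmallSet<int, 16> &SmallArrayObjs,
                                llvm::SmallSet<int, 16> &AddrOfObjs,
                                llvm::SmallSet<int, 16> &LargeArrayObjs) {
  for (int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
    switch (MFI.getObjectSSPLayout(i)) {
    case llvm::MachineFrameInfo::SSPLK_None:
      break; // not protected
    case llvm::MachineFrameInfo::SSPLK_SmallArray:
      SmallArrayObjs.insert(i); // small array or character buffer
      break;
    case llvm::MachineFrameInfo::SSPLK_AddrOf:
      AddrOfObjs.insert(i); // object whose address is taken
      break;
    case llvm::MachineFrameInfo::SSPLK_LargeArray:
      LargeArrayObjs.insert(i); // large array
      break;
    }
  }
}
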
@@ -889,9 +893,9 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
MaxAlign, Skew);
// Give the targets a chance to order the objects the way they like it.
- if (Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
- Fn.getTarget().Options.StackSymbolOrdering)
- TFI.orderFrameObjects(Fn, ObjectsToAllocate);
+ if (MF.getTarget().getOptLevel() != CodeGenOpt::None &&
+ MF.getTarget().Options.StackSymbolOrdering)
+ TFI.orderFrameObjects(MF, ObjectsToAllocate);
// Keep track of which bytes in the fixed and callee-save range are used so we
// can use the holes when allocating later stack objects. Only do this if
@@ -899,8 +903,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// optimizing.
BitVector StackBytesFree;
if (!ObjectsToAllocate.empty() &&
- Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
- MFI.getStackProtectorIndex() < 0 && TFI.enableStackSlotScavenging(Fn))
+ MF.getTarget().getOptLevel() != CodeGenOpt::None &&
+ MFI.getStackProtectorIndex() < 0 && TFI.enableStackSlotScavenging(MF))
computeFreeStackSlots(MFI, StackGrowsDown, MinCSFrameIndex, MaxCSFrameIndex,
FixedCSEnd, StackBytesFree);
@@ -924,7 +928,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// If we have reserved argument space for call sites in the function
// immediately on entry to the current function, count it as part of the
// overall stack size.
- if (MFI.adjustsStack() && TFI.hasReservedCallFrame(Fn))
+ if (MFI.adjustsStack() && TFI.hasReservedCallFrame(MF))
Offset += MFI.getMaxCallFrameSize();
// Round up the size to a multiple of the alignment. If the function has
@@ -934,7 +938,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// value.
unsigned StackAlign;
if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
- (RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
+ (RegInfo->needsStackRealignment(MF) && MFI.getObjectIndexEnd() != 0))
StackAlign = TFI.getStackAlignment();
else
StackAlign = TFI.getTransientStackAlignment();
@@ -949,68 +953,61 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
int64_t StackSize = Offset - LocalAreaOffset;
MFI.setStackSize(StackSize);
NumBytesStackSpace += StackSize;
-
- ORE->emit([&]() {
- return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "StackSize",
- Fn.getFunction().getSubprogram(),
- &Fn.front())
- << ore::NV("NumStackBytes", StackSize) << " stack bytes in function";
- });
}
/// insertPrologEpilogCode - Scan the function for modified callee saved
/// registers, insert spill code for these callee saved registers, then add
/// prolog and epilog code to the function.
-void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+void PEI::insertPrologEpilogCode(MachineFunction &MF) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
// Add prologue to the function...
for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.emitPrologue(Fn, *SaveBlock);
+ TFI.emitPrologue(MF, *SaveBlock);
// Add epilogue to restore the callee-save registers in each exiting block.
for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
- TFI.emitEpilogue(Fn, *RestoreBlock);
+ TFI.emitEpilogue(MF, *RestoreBlock);
for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.inlineStackProbe(Fn, *SaveBlock);
+ TFI.inlineStackProbe(MF, *SaveBlock);
// Emit additional code that is required to support segmented stacks, if
// we've been asked for it. This, when linked with a runtime with support
// for segmented stacks (libgcc is one), will result in allocating stack
// space in small chunks instead of one large contiguous block.
- if (Fn.shouldSplitStack()) {
+ if (MF.shouldSplitStack()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.adjustForSegmentedStacks(Fn, *SaveBlock);
+ TFI.adjustForSegmentedStacks(MF, *SaveBlock);
// Record that there are split-stack functions, so we will emit a
// special section to tell the linker.
- Fn.getMMI().setHasSplitStack(true);
+ MF.getMMI().setHasSplitStack(true);
} else
- Fn.getMMI().setHasNosplitStack(true);
+ MF.getMMI().setHasNosplitStack(true);
// Emit additional code that is required to explicitly handle the stack in
// HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
// approach is rather similar to that of Segmented Stacks, but it uses a
// different conditional check and another BIF for allocating more stack
// space.
- if (Fn.getFunction().getCallingConv() == CallingConv::HiPE)
+ if (MF.getFunction().getCallingConv() == CallingConv::HiPE)
for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.adjustForHiPEPrologue(Fn, *SaveBlock);
+ TFI.adjustForHiPEPrologue(MF, *SaveBlock);
}
/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
/// register references and actual offsets.
-void PEI::replaceFrameIndices(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
- if (!TFI.needsFrameIndexResolution(Fn)) return;
+void PEI::replaceFrameIndices(MachineFunction &MF) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ if (!TFI.needsFrameIndexResolution(MF)) return;
// Store SPAdj at exit of a basic block.
SmallVector<int, 8> SPState;
- SPState.resize(Fn.getNumBlockIDs());
+ SPState.resize(MF.getNumBlockIDs());
df_iterator_default_set<MachineBasicBlock*> Reachable;
// Iterate over the reachable blocks in DFS order.
- for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
+ for (auto DFI = df_ext_begin(&MF, Reachable), DFE = df_ext_end(&MF, Reachable);
DFI != DFE; ++DFI) {
int SPAdj = 0;
// Check the exit state of the DFS stack predecessor.
@@ -1021,27 +1018,27 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) {
SPAdj = SPState[StackPred->getNumber()];
}
MachineBasicBlock *BB = *DFI;
- replaceFrameIndices(BB, Fn, SPAdj);
+ replaceFrameIndices(BB, MF, SPAdj);
SPState[BB->getNumber()] = SPAdj;
}
// Handle the unreachable blocks.
- for (auto &BB : Fn) {
+ for (auto &BB : MF) {
if (Reachable.count(&BB))
// Already handled in DFS traversal.
continue;
int SPAdj = 0;
- replaceFrameIndices(&BB, Fn, SPAdj);
+ replaceFrameIndices(&BB, MF, SPAdj);
}
}
-void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
int &SPAdj) {
- assert(Fn.getSubtarget().getRegisterInfo() &&
+ assert(MF.getSubtarget().getRegisterInfo() &&
"getRegisterInfo() must be implemented!");
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
if (RS && FrameIndexEliminationScavenging)
RS->enterBasicBlock(*BB);
@@ -1052,7 +1049,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
if (TII.isFrameInstr(*I)) {
InsideCallSequence = TII.isFrameSetup(*I);
SPAdj += TII.getSPAdjust(*I);
- I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
+ I = TFI->eliminateCallFramePseudoInstr(MF, *BB, I);
continue;
}
@@ -1071,8 +1068,9 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
"operand of a DBG_VALUE machine instruction");
unsigned Reg;
int64_t Offset =
- TFI->getFrameIndexReference(Fn, MI.getOperand(0).getIndex(), Reg);
+ TFI->getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg);
MI.getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
+ MI.getOperand(0).setIsDebug();
auto *DIExpr = DIExpression::prepend(MI.getDebugExpression(),
DIExpression::NoDeref, Offset);
MI.getOperand(3).setMetadata(DIExpr);
@@ -1091,7 +1089,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
unsigned Reg;
MachineOperand &Offset = MI.getOperand(i + 1);
int refOffset = TFI->getFrameIndexReferencePreferSP(
- Fn, MI.getOperand(i).getIndex(), Reg, /*IgnoreSPUpdates*/ false);
+ MF, MI.getOperand(i).getIndex(), Reg, /*IgnoreSPUpdates*/ false);
Offset.setImm(Offset.getImm() + refOffset);
MI.getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
continue;
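
The replaceFrameIndices hunks above keep a running SP adjustment (SPAdj) and hand each block the exit value of its DFS-stack predecessor, falling back to zero for unreachable blocks. A minimal, self-contained sketch of that propagation pattern in plain C++ (the types here are illustrative, not the pass's MachineFunction structures):

#include <vector>

struct Block {
  int Number;
  std::vector<Block *> Succs;
};

// Carry a running stack adjustment along a depth-first walk so each block
// starts from the exit state of the block that pushed it onto the DFS stack.
static void propagateSPAdj(Block *B, int SPAdjIn, std::vector<int> &ExitState,
                           std::vector<bool> &Seen) {
  if (Seen[B->Number])
    return;
  Seen[B->Number] = true;
  int SPAdj = SPAdjIn; // inherit the DFS predecessor's exit state
  // ... rewrite frame indices in B here, updating SPAdj across
  //     call-frame setup/destroy pseudos ...
  ExitState[B->Number] = SPAdj; // record for blocks reached from here
  for (Block *S : B->Succs)
    propagateSPAdj(S, SPAdj, ExitState, Seen);
}
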
diff --git a/contrib/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/contrib/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
new file mode 100644
index 000000000000..050fef5d25ed
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -0,0 +1,195 @@
+//===---- ReachingDefAnalysis.cpp - Reaching Def Analysis ---*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "reaching-deps-analysis"
+
+char ReachingDefAnalysis::ID = 0;
+INITIALIZE_PASS(ReachingDefAnalysis, DEBUG_TYPE, "ReachingDefAnalysis", false,
+ true)
+
+void ReachingDefAnalysis::enterBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+
+ MachineBasicBlock *MBB = TraversedMBB.MBB;
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+ MBBReachingDefs[MBBNumber].resize(NumRegUnits);
+
+ // Reset instruction counter in each basic block.
+ CurInstr = 0;
+
+ // Set up LiveRegs to represent registers entering MBB.
+ // Default values are 'nothing happened a long time ago'.
+ if (LiveRegs.empty())
+ LiveRegs.assign(NumRegUnits, ReachingDefDefaultVal);
+
+ // This is the entry block.
+ if (MBB->pred_empty()) {
+ for (const auto &LI : MBB->liveins()) {
+ for (MCRegUnitIterator Unit(LI.PhysReg, TRI); Unit.isValid(); ++Unit) {
+ // Treat function live-ins as if they were defined just before the first
+ // instruction. Usually, function arguments are set up immediately
+ // before the call.
+ LiveRegs[*Unit] = -1;
+ MBBReachingDefs[MBBNumber][*Unit].push_back(LiveRegs[*Unit]);
+ }
+ }
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ": entry\n");
+ return;
+ }
+
+ // Try to coalesce live-out registers from predecessors.
+ for (MachineBasicBlock *pred : MBB->predecessors()) {
+ assert(unsigned(pred->getNumber()) < MBBOutRegsInfos.size() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ const LiveRegsDefInfo &Incoming = MBBOutRegsInfos[pred->getNumber()];
+ // Incoming is null if this is a backedge from a BB
+ // we haven't processed yet
+ if (Incoming.empty())
+ continue;
+
+ for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit) {
+ // Use the most recent predecessor def for each register.
+ LiveRegs[Unit] = std::max(LiveRegs[Unit], Incoming[Unit]);
+ if ((LiveRegs[Unit] != ReachingDefDefaultVal))
+ MBBReachingDefs[MBBNumber][Unit].push_back(LiveRegs[Unit]);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB)
+ << (!TraversedMBB.IsDone ? ": incomplete\n"
+ : ": all preds known\n"));
+}
+
+void ReachingDefAnalysis::leaveBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ unsigned MBBNumber = TraversedMBB.MBB->getNumber();
+ assert(MBBNumber < MBBOutRegsInfos.size() &&
+ "Unexpected basic block number.");
+ // Save register clearances at end of MBB - used by enterBasicBlock().
+ MBBOutRegsInfos[MBBNumber] = LiveRegs;
+
+ // While processing the basic block, we kept `Def` relative to the start
+ // of the basic block for convenience. However, future use of this information
+ // only cares about the clearance from the end of the block, so adjust
+ // everything to be relative to the end of the basic block.
+ for (int &OutLiveReg : MBBOutRegsInfos[MBBNumber])
+ OutLiveReg -= CurInstr;
+ LiveRegs.clear();
+}
+
+void ReachingDefAnalysis::processDefs(MachineInstr *MI) {
+ assert(!MI->isDebugInstr() && "Won't process debug instructions");
+
+ unsigned MBBNumber = MI->getParent()->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned i = 0,
+ e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
+ i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isUse())
+ continue;
+ for (MCRegUnitIterator Unit(MO.getReg(), TRI); Unit.isValid(); ++Unit) {
+ // This instruction explicitly defines the current reg unit.
+ LLVM_DEBUG(dbgs() << printReg(MO.getReg(), TRI) << ":\t" << CurInstr
+ << '\t' << *MI);
+
+ // How many instructions since this reg unit was last written?
+ LiveRegs[*Unit] = CurInstr;
+ MBBReachingDefs[MBBNumber][*Unit].push_back(CurInstr);
+ }
+ }
+ InstIds[MI] = CurInstr;
+ ++CurInstr;
+}
+
+void ReachingDefAnalysis::processBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+ enterBasicBlock(TraversedMBB);
+ for (MachineInstr &MI : *TraversedMBB.MBB) {
+ if (!MI.isDebugInstr())
+ processDefs(&MI);
+ }
+ leaveBasicBlock(TraversedMBB);
+}
+
+bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+ MF = &mf;
+ TRI = MF->getSubtarget().getRegisterInfo();
+
+ LiveRegs.clear();
+ NumRegUnits = TRI->getNumRegUnits();
+
+ MBBReachingDefs.resize(mf.getNumBlockIDs());
+
+ LLVM_DEBUG(dbgs() << "********** REACHING DEFINITION ANALYSIS **********\n");
+
+ // Initialize the MBBOutRegsInfos
+ MBBOutRegsInfos.resize(mf.getNumBlockIDs());
+
+ // Traverse the basic blocks.
+ LoopTraversal Traversal;
+ LoopTraversal::TraversalOrder TraversedMBBOrder = Traversal.traverse(mf);
+ for (LoopTraversal::TraversedMBBInfo TraversedMBB : TraversedMBBOrder) {
+ processBasicBlock(TraversedMBB);
+ }
+
+ // Sort all reaching defs found for a certain reg unit in a given BB.
+ for (MBBDefsInfo &MBBDefs : MBBReachingDefs) {
+ for (MBBRegUnitDefs &RegUnitDefs : MBBDefs)
+ llvm::sort(RegUnitDefs.begin(), RegUnitDefs.end());
+ }
+
+ return false;
+}
+
+void ReachingDefAnalysis::releaseMemory() {
+ // Clear the internal vectors.
+ MBBOutRegsInfos.clear();
+ MBBReachingDefs.clear();
+ InstIds.clear();
+}
+
+int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) {
+ assert(InstIds.count(MI) && "Unexpected machine instruction.");
+ int InstId = InstIds[MI];
+ int DefRes = ReachingDefDefaultVal;
+ unsigned MBBNumber = MI->getParent()->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+ int LatestDef = ReachingDefDefaultVal;
+ for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) {
+ for (int Def : MBBReachingDefs[MBBNumber][*Unit]) {
+ if (Def >= InstId)
+ break;
+ DefRes = Def;
+ }
+ LatestDef = std::max(LatestDef, DefRes);
+ }
+ return LatestDef;
+}
+
+int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) {
+ assert(InstIds.count(MI) && "Unexpected machine instruction.");
+ return InstIds[MI] - getReachingDef(MI, PhysReg);
+}
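
ReachingDefAnalysis, added above, records for every register unit and basic block the instruction indices at which definitions occur; getClearance then answers how many instructions ago a physical register was last written before a given instruction. A minimal sketch of a hypothetical consumer pass (not part of this import; the pass is unregistered and the queried register is arbitrary):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"

using namespace llvm;

namespace {
struct ClearanceQueryExample : public MachineFunctionPass {
  static char ID;
  ClearanceQueryExample() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ReachingDefAnalysis>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    ReachingDefAnalysis &RDA = getAnalysis<ReachingDefAnalysis>();
    for (MachineBasicBlock &MBB : MF)
      for (MachineInstr &MI : MBB)
        if (!MI.isDebugInstr())
          // A larger clearance means the most recent def of the register is
          // further away from MI; 1 is an arbitrary physical register here.
          (void)RDA.getClearance(&MI, /*PhysReg=*/1);
    return false;
  }
};
char ClearanceQueryExample::ID = 0;
} // end anonymous namespace
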
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
index 74c1592634aa..bc28a054c680 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -91,7 +91,7 @@ void RegAllocBase::allocatePhysRegs() {
// Unused registers can appear when the spiller coalesces snippets.
if (MRI->reg_nodbg_empty(VirtReg->reg)) {
- DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
+ LLVM_DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
aboutToRemoveInterval(*VirtReg);
LIS->removeInterval(VirtReg->reg);
continue;
@@ -103,9 +103,9 @@ void RegAllocBase::allocatePhysRegs() {
// selectOrSplit requests the allocator to return an available physical
// register if possible and populate a list of new live intervals that
// result from splitting.
- DEBUG(dbgs() << "\nselectOrSplit "
- << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg))
- << ':' << *VirtReg << " w=" << VirtReg->weight << '\n');
+ LLVM_DEBUG(dbgs() << "\nselectOrSplit "
+ << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg))
+ << ':' << *VirtReg << " w=" << VirtReg->weight << '\n');
using VirtRegVec = SmallVector<unsigned, 4>;
@@ -145,12 +145,12 @@ void RegAllocBase::allocatePhysRegs() {
assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
assert(SplitVirtReg->empty() && "Non-empty but used interval");
- DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
+ LLVM_DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
aboutToRemoveInterval(*SplitVirtReg);
LIS->removeInterval(SplitVirtReg->reg);
continue;
}
- DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
+ LLVM_DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
assert(TargetRegisterInfo::isVirtualRegister(SplitVirtReg->reg) &&
"expect split value in virtual register");
enqueue(SplitVirtReg);
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
index 1125d2c62bef..daeff3fc3963 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -219,8 +219,8 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
Intfs.push_back(Intf);
}
}
- DEBUG(dbgs() << "spilling " << printReg(PhysReg, TRI)
- << " interferences with " << VirtReg << "\n");
+ LLVM_DEBUG(dbgs() << "spilling " << printReg(PhysReg, TRI)
+ << " interferences with " << VirtReg << "\n");
assert(!Intfs.empty() && "expected interference");
// Spill each interfering vreg allocated to PhysReg or an alias.
@@ -292,7 +292,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
}
// No other spill candidates were found, so spill the current VirtReg.
- DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
+ LLVM_DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
if (!VirtReg.isSpillable())
return ~0u;
LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats);
@@ -304,9 +304,8 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
}
bool RABasic::runOnMachineFunction(MachineFunction &mf) {
- DEBUG(dbgs() << "********** BASIC REGISTER ALLOCATION **********\n"
- << "********** Function: "
- << mf.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** BASIC REGISTER ALLOCATION **********\n"
+ << "********** Function: " << mf.getName() << '\n');
MF = &mf;
RegAllocBase::init(getAnalysis<VirtRegMap>(),
@@ -323,7 +322,7 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
postOptimization();
// Diagnostic output before rewriting
- DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
+ LLVM_DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
releaseMemory();
return true;
diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
index 17d9492d942e..7b57c6cbcdb8 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -179,7 +179,7 @@ namespace {
}
private:
- bool runOnMachineFunction(MachineFunction &Fn) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
void allocateBasicBlock(MachineBasicBlock &MBB);
void handleThroughOperands(MachineInstr &MI,
SmallVectorImpl<unsigned> &VirtDead);
@@ -206,7 +206,7 @@ namespace {
return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
}
- LiveRegMap::iterator assignVirtToPhysReg(unsigned VReg, MCPhysReg PhysReg);
+ LiveRegMap::iterator assignVirtToPhysReg(unsigned VirtReg, MCPhysReg PhysReg);
LiveRegMap::iterator allocVirtReg(MachineInstr &MI, LiveRegMap::iterator,
unsigned Hint);
LiveRegMap::iterator defineVirtReg(MachineInstr &MI, unsigned OpNum,
@@ -322,11 +322,11 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
// instruction, not on the spill.
bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
LR.Dirty = false;
- DEBUG(dbgs() << "Spilling " << printReg(LRI->VirtReg, TRI)
- << " in " << printReg(LR.PhysReg, TRI));
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(LRI->VirtReg, TRI) << " in "
+ << printReg(LR.PhysReg, TRI));
const TargetRegisterClass &RC = *MRI->getRegClass(LRI->VirtReg);
int FI = getStackSpaceFor(LRI->VirtReg, RC);
- DEBUG(dbgs() << " to stack slot #" << FI << "\n");
+ LLVM_DEBUG(dbgs() << " to stack slot #" << FI << "\n");
TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, &RC, TRI);
++NumStores; // Update statistics
@@ -339,7 +339,9 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
MachineInstr *NewDV = buildDbgValueForSpill(*MBB, MI, *DBG, FI);
assert(NewDV->getParent() == MBB && "dangling parent pointer");
(void)NewDV;
- DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
+ LLVM_DEBUG(dbgs() << "Inserting debug info due to spill:"
+ << "\n"
+ << *NewDV);
}
  // Now that this register is spilled, there should not be any DBG_VALUE
// pointing to this register because they are all pointing to spilled value
@@ -470,13 +472,14 @@ void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
}
}
-/// \brief Return the cost of spilling clearing out PhysReg and aliases so it is
+/// Return the cost of spilling clearing out PhysReg and aliases so it is
/// free for allocation. Returns 0 when PhysReg is free or disabled with all
/// aliases disabled - it can be allocated directly.
/// \returns spillImpossible when PhysReg or an alias can't be spilled.
unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
if (isRegUsedInInstr(PhysReg)) {
- DEBUG(dbgs() << printReg(PhysReg, TRI) << " is already used in instr.\n");
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI)
+ << " is already used in instr.\n");
return spillImpossible;
}
switch (unsigned VirtReg = PhysRegState[PhysReg]) {
@@ -485,8 +488,8 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
case regFree:
return 0;
case regReserved:
- DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding "
- << printReg(PhysReg, TRI) << " is reserved already.\n");
+ LLVM_DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding "
+ << printReg(PhysReg, TRI) << " is reserved already.\n");
return spillImpossible;
default: {
LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
@@ -496,7 +499,7 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
}
// This is a disabled register, add up cost of aliases.
- DEBUG(dbgs() << printReg(PhysReg, TRI) << " is disabled.\n");
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is disabled.\n");
unsigned Cost = 0;
for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
MCPhysReg Alias = *AI;
@@ -519,12 +522,12 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
return Cost;
}
-/// \brief This method updates local state so that we know that PhysReg is the
+/// This method updates local state so that we know that PhysReg is the
/// proper container for VirtReg now. The physical register must not be used
/// for anything else when this is called.
void RegAllocFast::assignVirtToPhysReg(LiveReg &LR, MCPhysReg PhysReg) {
- DEBUG(dbgs() << "Assigning " << printReg(LR.VirtReg, TRI) << " to "
- << printReg(PhysReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Assigning " << printReg(LR.VirtReg, TRI) << " to "
+ << printReg(PhysReg, TRI) << "\n");
PhysRegState[PhysReg] = LR.VirtReg;
assert(!LR.PhysReg && "Already assigned a physreg");
LR.PhysReg = PhysReg;
@@ -570,16 +573,16 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
}
}
- DEBUG(dbgs() << "Allocating " << printReg(VirtReg) << " from "
- << TRI->getRegClassName(&RC) << "\n");
+ LLVM_DEBUG(dbgs() << "Allocating " << printReg(VirtReg) << " from "
+ << TRI->getRegClassName(&RC) << "\n");
unsigned BestReg = 0;
unsigned BestCost = spillImpossible;
for (MCPhysReg PhysReg : AO) {
unsigned Cost = calcSpillCost(PhysReg);
- DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << "\n");
- DEBUG(dbgs() << "\tCost: " << Cost << "\n");
- DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
+ LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "\tCost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
// Cost is 0 when all aliases are already disabled.
if (Cost == 0) {
assignVirtToPhysReg(*LRI, PhysReg);
@@ -654,22 +657,22 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
LRI = allocVirtReg(MI, LRI, Hint);
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
int FrameIndex = getStackSpaceFor(VirtReg, RC);
- DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
- << printReg(LRI->PhysReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
+ << printReg(LRI->PhysReg, TRI) << "\n");
TII->loadRegFromStackSlot(*MBB, MI, LRI->PhysReg, FrameIndex, &RC, TRI);
++NumLoads;
} else if (LRI->Dirty) {
if (isLastUseOfLocalReg(MO)) {
- DEBUG(dbgs() << "Killing last use: " << MO << "\n");
+ LLVM_DEBUG(dbgs() << "Killing last use: " << MO << "\n");
if (MO.isUse())
MO.setIsKill();
else
MO.setIsDead();
} else if (MO.isKill()) {
- DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
+ LLVM_DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
MO.setIsKill(false);
} else if (MO.isDead()) {
- DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
+ LLVM_DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
MO.setIsDead(false);
}
} else if (MO.isKill()) {
@@ -677,10 +680,10 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
// register would be killed immediately, and there might be a second use:
// %foo = OR killed %x, %x
// This would cause a second reload of %x into a different register.
- DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
+ LLVM_DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
MO.setIsKill(false);
} else if (MO.isDead()) {
- DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
+ LLVM_DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
MO.setIsDead(false);
}
assert(LRI->PhysReg && "Register not assigned");
@@ -699,13 +702,13 @@ bool RegAllocFast::setPhysReg(MachineInstr &MI, unsigned OpNum,
bool Dead = MO.isDead();
if (!MO.getSubReg()) {
MO.setReg(PhysReg);
- MO.setIsRenamableIfNoExtraRegAllocReq();
+ MO.setIsRenamable(true);
return MO.isKill() || Dead;
}
// Handle subregister index.
MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : 0);
- MO.setIsRenamableIfNoExtraRegAllocReq();
+ MO.setIsRenamable(true);
MO.setSubReg(0);
// A kill flag implies killing the full register. Add corresponding super
@@ -727,7 +730,7 @@ bool RegAllocFast::setPhysReg(MachineInstr &MI, unsigned OpNum,
// there are additional physreg defines.
void RegAllocFast::handleThroughOperands(MachineInstr &MI,
SmallVectorImpl<unsigned> &VirtDead) {
- DEBUG(dbgs() << "Scanning for through registers:");
+ LLVM_DEBUG(dbgs() << "Scanning for through registers:");
SmallSet<unsigned, 8> ThroughRegs;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg()) continue;
@@ -737,13 +740,13 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
if (MO.isEarlyClobber() || (MO.isUse() && MO.isTied()) ||
(MO.getSubReg() && MI.readsVirtualRegister(Reg))) {
if (ThroughRegs.insert(Reg).second)
- DEBUG(dbgs() << ' ' << printReg(Reg));
+ LLVM_DEBUG(dbgs() << ' ' << printReg(Reg));
}
}
// If any physreg defines collide with preallocated through registers,
// we must spill and reallocate.
- DEBUG(dbgs() << "\nChecking for physdef collisions.\n");
+ LLVM_DEBUG(dbgs() << "\nChecking for physdef collisions.\n");
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef()) continue;
unsigned Reg = MO.getReg();
@@ -756,7 +759,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
}
SmallVector<unsigned, 8> PartialDefs;
- DEBUG(dbgs() << "Allocating tied uses.\n");
+ LLVM_DEBUG(dbgs() << "Allocating tied uses.\n");
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg()) continue;
@@ -764,15 +767,16 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
if (MO.isUse()) {
if (!MO.isTied()) continue;
- DEBUG(dbgs() << "Operand " << I << "("<< MO << ") is tied to operand "
- << MI.findTiedOperandIdx(I) << ".\n");
+ LLVM_DEBUG(dbgs() << "Operand " << I << "(" << MO
+ << ") is tied to operand " << MI.findTiedOperandIdx(I)
+ << ".\n");
LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
MCPhysReg PhysReg = LRI->PhysReg;
setPhysReg(MI, I, PhysReg);
// Note: we don't update the def operand yet. That would cause the normal
// def-scan to attempt spilling.
} else if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
+ LLVM_DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
// Reload the register, but don't assign to the operand just yet.
// That would confuse the later phys-def processing pass.
LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
@@ -780,7 +784,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
}
}
- DEBUG(dbgs() << "Allocating early clobbers.\n");
+ LLVM_DEBUG(dbgs() << "Allocating early clobbers.\n");
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg()) continue;
@@ -801,8 +805,8 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
unsigned Reg = MO.getReg();
if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
- DEBUG(dbgs() << "\tSetting " << printReg(Reg, TRI)
- << " as used in instr\n");
+ LLVM_DEBUG(dbgs() << "\tSetting " << printReg(Reg, TRI)
+ << " as used in instr\n");
markRegUsedInInstr(Reg);
}
@@ -848,7 +852,7 @@ void RegAllocFast::dumpState() {
void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
this->MBB = &MBB;
- DEBUG(dbgs() << "\nAllocating " << MBB);
+ LLVM_DEBUG(dbgs() << "\nAllocating " << MBB);
PhysRegState.assign(TRI->getNumRegs(), regDisabled);
assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?");
@@ -866,10 +870,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
// Otherwise, sequentially allocate each instruction in the MBB.
for (MachineInstr &MI : MBB) {
const MCInstrDesc &MCID = MI.getDesc();
- DEBUG(
- dbgs() << "\n>> " << MI << "Regs:";
- dumpState()
- );
+ LLVM_DEBUG(dbgs() << "\n>> " << MI << "Regs:"; dumpState());
// Debug values are not allowed to change codegen in any way.
if (MI.isDebugValue()) {
@@ -894,13 +895,13 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
if (SS != -1) {
// Modify DBG_VALUE now that the value is in a spill slot.
updateDbgValueForSpill(*DebugMI, SS);
- DEBUG(dbgs() << "Modifying debug info due to spill:"
- << "\t" << *DebugMI);
+ LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:"
+ << "\t" << *DebugMI);
continue;
}
// We can't allocate a physreg for a DebugValue, sorry!
- DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
+ LLVM_DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
MO.setReg(0);
}
@@ -910,6 +911,9 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
continue;
}
+ if (MI.isDebugLabel())
+ continue;
+
// If this is a copy, we may be able to coalesce.
unsigned CopySrcReg = 0;
unsigned CopyDstReg = 0;
@@ -1025,7 +1029,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
// as call-clobbered, this is not correct because some of those
// definitions may be used later on and we do not want to reuse
// those for virtual registers in between.
- DEBUG(dbgs() << " Spilling remaining registers before call.\n");
+ LLVM_DEBUG(dbgs() << " Spilling remaining registers before call.\n");
spillAll(MI);
}
@@ -1060,15 +1064,15 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
VirtDead.clear();
if (CopyDstReg && CopyDstReg == CopySrcReg && CopyDstSub == CopySrcSub) {
- DEBUG(dbgs() << "-- coalescing: " << MI);
+ LLVM_DEBUG(dbgs() << "-- coalescing: " << MI);
Coalesced.push_back(&MI);
} else {
- DEBUG(dbgs() << "<< " << MI);
+ LLVM_DEBUG(dbgs() << "<< " << MI);
}
}
// Spill all physical registers holding virtual registers now.
- DEBUG(dbgs() << "Spilling live registers at end of block.\n");
+ LLVM_DEBUG(dbgs() << "Spilling live registers at end of block.\n");
spillAll(MBB.getFirstTerminator());
// Erase all the coalesced copies. We are delaying it until now because
@@ -1077,13 +1081,13 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
MBB.erase(MI);
NumCopies += Coalesced.size();
- DEBUG(MBB.dump());
+ LLVM_DEBUG(MBB.dump());
}
/// Allocates registers for a function.
bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
+ << "********** Function: " << MF.getName() << '\n');
MRI = &MF.getRegInfo();
const TargetSubtargetInfo &STI = MF.getSubtarget();
TRI = STI.getRegisterInfo();
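
The calcSpillCost and allocVirtReg hunks above show the fast allocator's selection policy: walk the allocation order, take the first candidate whose spill cost is zero, otherwise remember the cheapest one, and give up only if every candidate is impossible to spill. A minimal, self-contained restatement of that policy in plain C++ (the names and the cost callback are illustrative, not the allocator's types):

#include <limits>
#include <vector>

// Returns the chosen register, or 0 if no candidate can be used or spilled.
static unsigned pickFirstFreeElseCheapest(const std::vector<unsigned> &Order,
                                          unsigned (*CostOf)(unsigned)) {
  const unsigned SpillImpossible = std::numeric_limits<unsigned>::max();
  unsigned BestReg = 0;
  unsigned BestCost = SpillImpossible;
  for (unsigned Reg : Order) {
    unsigned Cost = CostOf(Reg);
    if (Cost == 0)
      return Reg; // free (or all aliases disabled): take it immediately
    if (Cost < BestCost) {
      BestReg = Reg; // cheapest spill candidate seen so far
      BestCost = Cost;
    }
  }
  return BestCost == SpillImpossible ? 0 : BestReg;
}
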
diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
index e492c481a540..3333e1f2fb8b 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -125,6 +125,12 @@ static cl::opt<bool> EnableDeferredSpilling(
"variable because of other evicted variables."),
cl::init(false));
+static cl::opt<unsigned>
+ HugeSizeForSplit("huge-size-for-split", cl::Hidden,
+ cl::desc("A threshold of live range size which may cause "
+ "high compile time cost in global splitting."),
+ cl::init(5000));
+
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned>
CSRFirstTimeCost("regalloc-csr-first-time-cost",
@@ -292,7 +298,7 @@ class RAGreedy : public MachineFunctionPass,
public:
using EvictorInfo =
std::pair<unsigned /* evictor */, unsigned /* physreg */>;
- using EvicteeInfo = llvm::MapVector<unsigned /* evictee */, EvictorInfo>;
+ using EvicteeInfo = llvm::DenseMap<unsigned /* evictee */, EvictorInfo>;
private:
/// Each Vreg that has been evicted in the last stage of selectOrSplit will
@@ -300,28 +306,28 @@ class RAGreedy : public MachineFunctionPass,
EvicteeInfo Evictees;
public:
- /// \brief Clear all eviction information.
+ /// Clear all eviction information.
void clear() { Evictees.clear(); }
- /// \brief Clear eviction information for the given evictee Vreg.
+ /// Clear eviction information for the given evictee Vreg.
/// E.g. when Vreg get's a new allocation, the old eviction info is no
/// longer relevant.
/// \param Evictee The evictee Vreg for whom we want to clear collected
/// eviction info.
void clearEvicteeInfo(unsigned Evictee) { Evictees.erase(Evictee); }
- /// \brief Track new eviction.
+ /// Track new eviction.
/// The Evictor vreg has evicted the Evictee vreg from Physreg.
- /// \praram PhysReg The phisical register Evictee was evicted from.
- /// \praram Evictor The evictor Vreg that evicted Evictee.
- /// \praram Evictee The evictee Vreg.
+ /// \param PhysReg The physical register Evictee was evicted from.
+ /// \param Evictor The evictor Vreg that evicted Evictee.
+ /// \param Evictee The evictee Vreg.
void addEviction(unsigned PhysReg, unsigned Evictor, unsigned Evictee) {
Evictees[Evictee].first = Evictor;
Evictees[Evictee].second = PhysReg;
}
/// Return the Evictor Vreg which evicted Evictee Vreg from PhysReg.
- /// \praram Evictee The evictee vreg.
+ /// \param Evictee The evictee vreg.
/// \return The Evictor vreg which evicted Evictee vreg from PhysReg. 0 if
/// nobody has evicted Evictee from PhysReg.
EvictorInfo getEvictor(unsigned Evictee) {
@@ -399,7 +405,7 @@ class RAGreedy : public MachineFunctionPass,
/// obtained from the TargetSubtargetInfo.
bool EnableLocalReassign;
- /// Enable or not the the consideration of the cost of local intervals created
+ /// Enable or not the consideration of the cost of local intervals created
/// by a split candidate when choosing the best split candidate.
bool EnableAdvancedRASplitCost;
@@ -448,13 +454,16 @@ private:
bool splitCanCauseEvictionChain(unsigned Evictee, GlobalSplitCandidate &Cand,
unsigned BBNumber,
const AllocationOrder &Order);
+ bool splitCanCauseLocalSpill(unsigned VirtRegToSplit,
+ GlobalSplitCandidate &Cand, unsigned BBNumber,
+ const AllocationOrder &Order);
BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &,
const AllocationOrder &Order,
bool *CanCauseEvictionChain);
bool calcCompactRegion(GlobalSplitCandidate&);
void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>);
void calcGapWeights(unsigned, SmallVectorImpl<float>&);
- unsigned canReassign(LiveInterval &VirtReg, unsigned PhysReg);
+ unsigned canReassign(LiveInterval &VirtReg, unsigned PrevReg);
bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&);
bool canEvictInterferenceInRange(LiveInterval &VirtReg, unsigned PhysReg,
@@ -475,6 +484,7 @@ private:
SmallVectorImpl<unsigned>&, unsigned = ~0u);
unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
SmallVectorImpl<unsigned>&);
+ unsigned isSplitBenefitWorthCost(LiveInterval &VirtReg);
/// Calculate cost of region splitting.
unsigned calculateRegionSplitCost(LiveInterval &VirtReg,
AllocationOrder &Order,
@@ -763,7 +773,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
// preferred register.
if (unsigned Hint = MRI->getSimpleHint(VirtReg.reg))
if (Order.isHint(Hint)) {
- DEBUG(dbgs() << "missed hint " << printReg(Hint, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "missed hint " << printReg(Hint, TRI) << '\n');
EvictionCost MaxCost;
MaxCost.setBrokenHints(1);
if (canEvictInterference(VirtReg, Hint, true, MaxCost)) {
@@ -782,8 +792,8 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
if (!Cost)
return PhysReg;
- DEBUG(dbgs() << printReg(PhysReg, TRI) << " is available at cost " << Cost
- << '\n');
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is available at cost "
+ << Cost << '\n');
unsigned CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost);
return CheapReg ? CheapReg : PhysReg;
}
@@ -811,9 +821,9 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
break;
}
if (PhysReg)
- DEBUG(dbgs() << "can reassign: " << VirtReg << " from "
- << printReg(PrevReg, TRI) << " to " << printReg(PhysReg, TRI)
- << '\n');
+ LLVM_DEBUG(dbgs() << "can reassign: " << VirtReg << " from "
+ << printReg(PrevReg, TRI) << " to "
+ << printReg(PhysReg, TRI) << '\n');
return PhysReg;
}
@@ -840,7 +850,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
return true;
if (A.weight > B.weight) {
- DEBUG(dbgs() << "should evict: " << B << " w= " << B.weight << '\n');
+ LLVM_DEBUG(dbgs() << "should evict: " << B << " w= " << B.weight << '\n');
return true;
}
return false;
@@ -934,7 +944,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
return true;
}
-/// \brief Return true if all interferences between VirtReg and PhysReg between
+/// Return true if all interferences between VirtReg and PhysReg between
/// Start and End can be evicted.
///
/// \param VirtReg Live range that is about to be assigned.
@@ -986,7 +996,7 @@ bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg,
return true;
}
-/// \brief Return tthe physical register that will be best
+/// Return the physical register that will be best
/// candidate for eviction by a local split interval that will be created
/// between Start and End.
///
@@ -1032,8 +1042,8 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
if (!Cascade)
Cascade = ExtraRegInfo[VirtReg.reg].Cascade = NextCascade++;
- DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI)
- << " interference: Cascade " << Cascade << '\n');
+ LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI)
+ << " interference: Cascade " << Cascade << '\n');
// Collect all interfering virtregs first.
SmallVector<LiveInterval*, 8> Intfs;
@@ -1104,8 +1114,8 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg);
unsigned MinCost = RegClassInfo.getMinCost(RC);
if (MinCost >= CostPerUseLimit) {
- DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = " << MinCost
- << ", no cheaper registers to be found.\n");
+ LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = "
+ << MinCost << ", no cheaper registers to be found.\n");
return 0;
}
@@ -1113,7 +1123,8 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
// the same cost. We don't need to look at them if they're too expensive.
if (TRI->getCostPerUse(Order.getOrder().back()) >= CostPerUseLimit) {
OrderLimit = RegClassInfo.getLastCostChange(RC);
- DEBUG(dbgs() << "Only trying the first " << OrderLimit << " regs.\n");
+ LLVM_DEBUG(dbgs() << "Only trying the first " << OrderLimit
+ << " regs.\n");
}
}
@@ -1124,9 +1135,10 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
// The first use of a callee-saved register in a function has cost 1.
// Don't start using a CSR when the CostPerUseLimit is low.
if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) {
- DEBUG(dbgs() << printReg(PhysReg, TRI) << " would clobber CSR "
- << printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
- << '\n');
+ LLVM_DEBUG(
+ dbgs() << printReg(PhysReg, TRI) << " would clobber CSR "
+ << printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
+ << '\n');
continue;
}
@@ -1313,7 +1325,7 @@ void RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
// Perhaps iterating can enable more bundles?
SpillPlacer->iterate();
}
- DEBUG(dbgs() << ", v=" << Visited);
+ LLVM_DEBUG(dbgs() << ", v=" << Visited);
}
/// calcCompactRegion - Compute the set of edge bundles that should be live
@@ -1331,7 +1343,7 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
// Compact regions don't correspond to any physreg.
Cand.reset(IntfCache, 0);
- DEBUG(dbgs() << "Compact region bundles");
+ LLVM_DEBUG(dbgs() << "Compact region bundles");
// Use the spill placer to determine the live bundles. GrowRegion pretends
// that all the through blocks have interference when PhysReg is unset.
@@ -1340,7 +1352,7 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
// The static split cost will be zero since Cand.Intf reports no interference.
BlockFrequency Cost;
if (!addSplitConstraints(Cand.Intf, Cost)) {
- DEBUG(dbgs() << ", none.\n");
+ LLVM_DEBUG(dbgs() << ", none.\n");
return false;
}
@@ -1348,11 +1360,11 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
SpillPlacer->finish();
if (!Cand.LiveBundles.any()) {
- DEBUG(dbgs() << ", none.\n");
+ LLVM_DEBUG(dbgs() << ", none.\n");
return false;
}
- DEBUG({
+ LLVM_DEBUG({
for (int i : Cand.LiveBundles.set_bits())
dbgs() << " EB#" << i;
dbgs() << ".\n";
@@ -1378,7 +1390,7 @@ BlockFrequency RAGreedy::calcSpillCost() {
return Cost;
}
-/// \brief Check if splitting Evictee will create a local split interval in
+/// Check if splitting Evictee will create a local split interval in
/// basic block number BBNumber that may cause a bad eviction chain. This is
/// intended to prevent bad eviction sequences like:
/// movl %ebp, 8(%esp) # 4-byte Spill
@@ -1401,7 +1413,7 @@ BlockFrequency RAGreedy::calcSpillCost() {
/// Evictee %0 is intended for region splitting with split candidate
/// physreg0 (the reg %0 was evicted from).
/// Region splitting creates a local interval because of interference with the
-/// evictor %1 (normally region spliitting creates 2 interval, the "by reg"
+/// evictor %1 (normally region splitting creates 2 intervals, the "by reg"
/// and "by stack" intervals and local interval created when interference
/// occurs).
/// One of the split intervals ends up evicting %2 from physreg1.
@@ -1427,7 +1439,7 @@ BlockFrequency RAGreedy::calcSpillCost() {
/// we are splitting for and the interferences.
/// \param BBNumber The number of a BB for which the region split process will
/// create a local split interval.
-/// \param Order The phisical registers that may get evicted by a split
+/// \param Order The physical registers that may get evicted by a split
/// artifact of Evictee.
/// \return True if splitting Evictee may cause a bad eviction chain, false
/// otherwise.
@@ -1448,8 +1460,8 @@ bool RAGreedy::splitCanCauseEvictionChain(unsigned Evictee,
getCheapestEvicteeWeight(Order, LIS->getInterval(Evictee),
Cand.Intf.first(), Cand.Intf.last(), &MaxWeight);
- // The bad eviction chain occurs when either the split candidate the the
- // evited reg or one of the split artifact will evict the evicting reg.
+ // The bad eviction chain occurs when either the split candidate is the
+ // evicting reg or one of the split artifacts will evict the evicting reg.
if ((PhysReg != Cand.PhysReg) && (PhysReg != FutureEvictedPhysReg))
return false;
@@ -1479,6 +1491,54 @@ bool RAGreedy::splitCanCauseEvictionChain(unsigned Evictee,
return true;
}
+/// Check if splitting VirtRegToSplit will create a local split interval
+/// in basic block number BBNumber that may cause a spill.
+///
+/// \param VirtRegToSplit The register considered to be split.
+/// \param Cand The split candidate that determines the physical
+/// register we are splitting for and the interferences.
+/// \param BBNumber The number of a BB for which the region split process
+/// will create a local split interval.
+/// \param Order The physical registers that may get evicted by a
+/// split artifact of VirtRegToSplit.
+/// \return True if splitting VirtRegToSplit may cause a spill, false
+/// otherwise.
+bool RAGreedy::splitCanCauseLocalSpill(unsigned VirtRegToSplit,
+ GlobalSplitCandidate &Cand,
+ unsigned BBNumber,
+ const AllocationOrder &Order) {
+ Cand.Intf.moveToBlock(BBNumber);
+
+ // Check if the local interval will find a non-interfering assignment.
+ for (auto PhysReg : Order.getOrder()) {
+ if (!Matrix->checkInterference(Cand.Intf.first().getPrevIndex(),
+ Cand.Intf.last(), PhysReg))
+ return false;
+ }
+
+ // Check if the local interval will evict a cheaper interval.
+ float CheapestEvictWeight = 0;
+ unsigned FutureEvictedPhysReg = getCheapestEvicteeWeight(
+ Order, LIS->getInterval(VirtRegToSplit), Cand.Intf.first(),
+ Cand.Intf.last(), &CheapestEvictWeight);
+
+ // Have we found an interval that can be evicted?
+ if (FutureEvictedPhysReg) {
+ VirtRegAuxInfo VRAI(*MF, *LIS, VRM, getAnalysis<MachineLoopInfo>(), *MBFI);
+ float splitArtifactWeight =
+ VRAI.futureWeight(LIS->getInterval(VirtRegToSplit),
+ Cand.Intf.first().getPrevIndex(), Cand.Intf.last());
+ // Will the weight of the local interval be higher than the cheapest evictee
+ // weight? If so, it will evict it and will not cause a spill.
+ if (splitArtifactWeight >= 0 && splitArtifactWeight > CheapestEvictWeight)
+ return false;
+ }
+
+ // The local interval is not able to find a non-interfering assignment and is
+ // not able to evict a less worthy interval; therefore, it can cause a spill.
+ return true;
+}
+
/// calcGlobalSplitCost - Return the global split cost of following the split
/// pattern in LiveBundles. This cost should be added to the local cost of the
/// interference pattern in SplitConstraints.
@@ -1499,19 +1559,26 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
Cand.Intf.moveToBlock(BC.Number);
// Check whether a local interval is going to be created during the region
- // split.
- if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
- Cand.Intf.hasInterference() && BI.LiveIn && BI.LiveOut && RegIn &&
- RegOut) {
-
- if (splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
- // This interfernce cause our eviction from this assignment, we might
- // evict somebody else, add that cost.
+ // split. Calculate advanced split cost (cost of local intervals) if the option
+ // is enabled.
+ if (EnableAdvancedRASplitCost && Cand.Intf.hasInterference() && BI.LiveIn &&
+ BI.LiveOut && RegIn && RegOut) {
+
+ if (CanCauseEvictionChain &&
+ splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
+ // This interference causes our eviction from this assignment; we might
+ // evict somebody else and eventually someone will spill, so add that cost.
// See splitCanCauseEvictionChain for detailed description of scenarios.
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
*CanCauseEvictionChain = true;
+
+ } else if (splitCanCauseLocalSpill(VirtRegToSplit, Cand, BC.Number,
+ Order)) {
+ // This interference causes the local interval to spill; add that cost.
+ GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
+ GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
}
}
@@ -1540,7 +1607,7 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
// region split.
if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
splitCanCauseEvictionChain(VirtRegToSplit, Cand, Number, Order)) {
- // This interfernce cause our eviction from this assignment, we might
+ // This interference causes our eviction from this assignment; we might
// evict somebody else, add that cost.
// See splitCanCauseEvictionChain for detailed description of
// scenarios.
@@ -1575,7 +1642,8 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
// These are the intervals created for new global ranges. We may create more
// intervals for local ranges.
const unsigned NumGlobalIntvs = LREdit.size();
- DEBUG(dbgs() << "splitAroundRegion with " << NumGlobalIntvs << " globals.\n");
+ LLVM_DEBUG(dbgs() << "splitAroundRegion with " << NumGlobalIntvs
+ << " globals.\n");
assert(NumGlobalIntvs && "No global intervals configured");
// Isolate even single instructions when dealing with a proper sub-class.
@@ -1612,7 +1680,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
// Create separate intervals for isolated blocks with multiple uses.
if (!IntvIn && !IntvOut) {
- DEBUG(dbgs() << printMBBReference(*BI.MBB) << " isolated.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*BI.MBB) << " isolated.\n");
if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
SE->splitSingleBlock(BI);
continue;
@@ -1694,8 +1762,8 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
// blocks is strictly decreasing.
if (IntvMap[i] < NumGlobalIntvs) {
if (SA->countLiveBlocks(&Reg) >= OrigBlocks) {
- DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
- << " blocks as original.\n");
+ LLVM_DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
+ << " blocks as original.\n");
// Don't allow repeated splitting as a safeguard against looping.
setStage(Reg, RS_Split2);
}
@@ -1710,8 +1778,21 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
MF->verify(this, "After splitting live range around region");
}
+// Global split has a high compile time cost, especially for large live ranges.
+// Return false for cases where the potential benefit will never be worth the
+// cost.
+unsigned RAGreedy::isSplitBenefitWorthCost(LiveInterval &VirtReg) {
+ MachineInstr *MI = MRI->getUniqueVRegDef(VirtReg.reg);
+ if (MI && TII->isTriviallyReMaterializable(*MI, AA) &&
+ VirtReg.size() > HugeSizeForSplit)
+ return false;
+ return true;
+}
+
unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
SmallVectorImpl<unsigned> &NewVRegs) {
+ if (!isSplitBenefitWorthCost(VirtReg))
+ return 0;
unsigned NumCands = 0;
BlockFrequency SpillCost = calcSpillCost();
BlockFrequency BestCost;
@@ -1726,8 +1807,8 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// No benefit from the compact region, our fallback will be per-block
// splitting. Make sure we find a solution that is cheaper than spilling.
BestCost = SpillCost;
- DEBUG(dbgs() << "Cost of isolating all blocks = ";
- MBFI->printBlockFreq(dbgs(), BestCost) << '\n');
+ LLVM_DEBUG(dbgs() << "Cost of isolating all blocks = ";
+ MBFI->printBlockFreq(dbgs(), BestCost) << '\n');
}
bool CanCauseEvictionChain = false;
@@ -1790,13 +1871,13 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
SpillPlacer->prepare(Cand.LiveBundles);
BlockFrequency Cost;
if (!addSplitConstraints(Cand.Intf, Cost)) {
- DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tno positive bundles\n");
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tno positive bundles\n");
continue;
}
- DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tstatic = ";
- MBFI->printBlockFreq(dbgs(), Cost));
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tstatic = ";
+ MBFI->printBlockFreq(dbgs(), Cost));
if (Cost >= BestCost) {
- DEBUG({
+ LLVM_DEBUG({
if (BestCand == NoCand)
dbgs() << " worse than no bundles\n";
else
@@ -1811,15 +1892,15 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
// No live bundles, defer to splitSingleBlocks().
if (!Cand.LiveBundles.any()) {
- DEBUG(dbgs() << " no bundles.\n");
+ LLVM_DEBUG(dbgs() << " no bundles.\n");
continue;
}
bool HasEvictionChain = false;
Cost += calcGlobalSplitCost(Cand, Order, &HasEvictionChain);
- DEBUG({
- dbgs() << ", total = "; MBFI->printBlockFreq(dbgs(), Cost)
- << " with bundles";
+ LLVM_DEBUG({
+ dbgs() << ", total = ";
+ MBFI->printBlockFreq(dbgs(), Cost) << " with bundles";
for (int i : Cand.LiveBundles.set_bits())
dbgs() << " EB#" << i;
dbgs() << ".\n";
@@ -1838,11 +1919,11 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
if (CanCauseEvictionChain && BestCand != NoCand) {
// See splitCanCauseEvictionChain for detailed description of bad
// eviction chain scenarios.
- DEBUG(dbgs() << "Best split candidate of vreg "
- << printReg(VirtReg.reg, TRI) << " may ");
+ LLVM_DEBUG(dbgs() << "Best split candidate of vreg "
+ << printReg(VirtReg.reg, TRI) << " may ");
if (!(*CanCauseEvictionChain))
- DEBUG(dbgs() << "not ");
- DEBUG(dbgs() << "cause bad eviction chain\n");
+ LLVM_DEBUG(dbgs() << "not ");
+ LLVM_DEBUG(dbgs() << "cause bad eviction chain\n");
}
return BestCand;
@@ -1865,8 +1946,8 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
if (unsigned B = Cand.getBundles(BundleCand, BestCand)) {
UsedCands.push_back(BestCand);
Cand.IntvIdx = SE->openIntv();
- DEBUG(dbgs() << "Split for " << printReg(Cand.PhysReg, TRI) << " in "
- << B << " bundles, intv " << Cand.IntvIdx << ".\n");
+ LLVM_DEBUG(dbgs() << "Split for " << printReg(Cand.PhysReg, TRI) << " in "
+ << B << " bundles, intv " << Cand.IntvIdx << ".\n");
(void)B;
}
}
@@ -1878,8 +1959,8 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
if (unsigned B = Cand.getBundles(BundleCand, 0)) {
UsedCands.push_back(0);
Cand.IntvIdx = SE->openIntv();
- DEBUG(dbgs() << "Split for compact region in " << B << " bundles, intv "
- << Cand.IntvIdx << ".\n");
+ LLVM_DEBUG(dbgs() << "Split for compact region in " << B
+ << " bundles, intv " << Cand.IntvIdx << ".\n");
(void)B;
}
}
@@ -1978,7 +2059,8 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
if (Uses.size() <= 1)
return 0;
- DEBUG(dbgs() << "Split around " << Uses.size() << " individual instrs.\n");
+ LLVM_DEBUG(dbgs() << "Split around " << Uses.size()
+ << " individual instrs.\n");
const TargetRegisterClass *SuperRC =
TRI->getLargestLegalSuperClass(CurRC, *MF);
@@ -1993,7 +2075,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
SuperRCNumAllocatableRegs ==
getNumAllocatableRegsForConstraints(MI, VirtReg.reg, SuperRC, TII,
TRI, RCI)) {
- DEBUG(dbgs() << " skip:\t" << Uses[i] << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << " skip:\t" << Uses[i] << '\t' << *MI);
continue;
}
SE->openIntv();
@@ -2003,7 +2085,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
}
if (LREdit.empty()) {
- DEBUG(dbgs() << "All uses were copies.\n");
+ LLVM_DEBUG(dbgs() << "All uses were copies.\n");
return 0;
}
@@ -2121,7 +2203,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
return 0;
const unsigned NumGaps = Uses.size()-1;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "tryLocalSplit: ";
for (unsigned i = 0, e = Uses.size(); i != e; ++i)
dbgs() << ' ' << Uses[i];
@@ -2134,7 +2216,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
if (Matrix->checkRegMaskInterference(VirtReg)) {
// Get regmask slots for the whole block.
ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber());
- DEBUG(dbgs() << RMS.size() << " regmasks in block:");
+ LLVM_DEBUG(dbgs() << RMS.size() << " regmasks in block:");
// Constrain to VirtReg's live range.
unsigned ri = std::lower_bound(RMS.begin(), RMS.end(),
Uses.front().getRegSlot()) - RMS.begin();
@@ -2148,14 +2230,15 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// overlap the live range.
if (SlotIndex::isSameInstr(Uses[i+1], RMS[ri]) && i+1 == NumGaps)
break;
- DEBUG(dbgs() << ' ' << RMS[ri] << ':' << Uses[i] << '-' << Uses[i+1]);
+ LLVM_DEBUG(dbgs() << ' ' << RMS[ri] << ':' << Uses[i] << '-'
+ << Uses[i + 1]);
RegMaskGaps.push_back(i);
// Advance ri to the next gap. A regmask on one of the uses counts in
// both gaps.
while (ri != re && SlotIndex::isEarlierInstr(RMS[ri], Uses[i+1]))
++ri;
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
// Since we allow local split results to be split again, there is a risk of
@@ -2214,13 +2297,12 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
const bool LiveBefore = SplitBefore != 0 || BI.LiveIn;
const bool LiveAfter = SplitAfter != NumGaps || BI.LiveOut;
- DEBUG(dbgs() << printReg(PhysReg, TRI) << ' '
- << Uses[SplitBefore] << '-' << Uses[SplitAfter]
- << " i=" << MaxGap);
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << ' ' << Uses[SplitBefore]
+ << '-' << Uses[SplitAfter] << " i=" << MaxGap);
// Stop before the interval gets so big we wouldn't be making progress.
if (!LiveBefore && !LiveAfter) {
- DEBUG(dbgs() << " all\n");
+ LLVM_DEBUG(dbgs() << " all\n");
break;
}
// Should the interval be extended or shrunk?
@@ -2245,12 +2327,12 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
1);
// Would this split be possible to allocate?
// Never allocate all gaps, we wouldn't be making progress.
- DEBUG(dbgs() << " w=" << EstWeight);
+ LLVM_DEBUG(dbgs() << " w=" << EstWeight);
if (EstWeight * Hysteresis >= MaxGap) {
Shrink = false;
float Diff = EstWeight - MaxGap;
if (Diff > BestDiff) {
- DEBUG(dbgs() << " (best)");
+ LLVM_DEBUG(dbgs() << " (best)");
BestDiff = Hysteresis * Diff;
BestBefore = SplitBefore;
BestAfter = SplitAfter;
@@ -2261,7 +2343,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// Try to shrink.
if (Shrink) {
if (++SplitBefore < SplitAfter) {
- DEBUG(dbgs() << " shrink\n");
+ LLVM_DEBUG(dbgs() << " shrink\n");
// Recompute the max when necessary.
if (GapWeight[SplitBefore - 1] >= MaxGap) {
MaxGap = GapWeight[SplitBefore];
@@ -2275,11 +2357,11 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// Try to extend the interval.
if (SplitAfter >= NumGaps) {
- DEBUG(dbgs() << " end\n");
+ LLVM_DEBUG(dbgs() << " end\n");
break;
}
- DEBUG(dbgs() << " extend\n");
+ LLVM_DEBUG(dbgs() << " extend\n");
MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]);
}
}
@@ -2288,9 +2370,9 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
if (BestBefore == NumGaps)
return 0;
- DEBUG(dbgs() << "Best local split range: " << Uses[BestBefore]
- << '-' << Uses[BestAfter] << ", " << BestDiff
- << ", " << (BestAfter - BestBefore + 1) << " instrs\n");
+ LLVM_DEBUG(dbgs() << "Best local split range: " << Uses[BestBefore] << '-'
+ << Uses[BestAfter] << ", " << BestDiff << ", "
+ << (BestAfter - BestBefore + 1) << " instrs\n");
LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit);
@@ -2310,14 +2392,14 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
if (NewGaps >= NumGaps) {
- DEBUG(dbgs() << "Tagging non-progress ranges: ");
+ LLVM_DEBUG(dbgs() << "Tagging non-progress ranges: ");
assert(!ProgressRequired && "Didn't make progress when it was required.");
for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
if (IntvMap[i] == 1) {
setStage(LIS->getInterval(LREdit.get(i)), RS_Split2);
- DEBUG(dbgs() << printReg(LREdit.get(i)));
+ LLVM_DEBUG(dbgs() << printReg(LREdit.get(i)));
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
++NumLocalSplits;
@@ -2410,7 +2492,7 @@ RAGreedy::mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg,
// chances are one would not be recolorable.
if (Q.collectInterferingVRegs(LastChanceRecoloringMaxInterference) >=
LastChanceRecoloringMaxInterference && !ExhaustiveSearch) {
- DEBUG(dbgs() << "Early abort: too many interferences.\n");
+ LLVM_DEBUG(dbgs() << "Early abort: too many interferences.\n");
CutOffInfo |= CO_Interf;
return false;
}
@@ -2424,7 +2506,8 @@ RAGreedy::mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg,
MRI->getRegClass(Intf->reg) == CurRC) &&
!(hasTiedDef(MRI, VirtReg.reg) && !hasTiedDef(MRI, Intf->reg))) ||
FixedRegisters.count(Intf->reg)) {
- DEBUG(dbgs() << "Early abort: the interference is not recolorable.\n");
+ LLVM_DEBUG(
+ dbgs() << "Early abort: the interference is not recolorable.\n");
return false;
}
RecoloringCandidates.insert(Intf);
@@ -2477,7 +2560,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
SmallVectorImpl<unsigned> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
unsigned Depth) {
- DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n');
+ LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n');
// Ranges must be Done.
assert((getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) &&
"Last chance recoloring should really be last chance");
@@ -2486,7 +2569,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
// for target with hundreds of registers.
// Indeed, in that case we may want to cut the search space earlier.
if (Depth >= LastChanceRecoloringMaxDepth && !ExhaustiveSearch) {
- DEBUG(dbgs() << "Abort because max depth has been reached.\n");
+ LLVM_DEBUG(dbgs() << "Abort because max depth has been reached.\n");
CutOffInfo |= CO_Depth;
return ~0u;
}
@@ -2503,8 +2586,8 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
Order.rewind();
while (unsigned PhysReg = Order.next()) {
- DEBUG(dbgs() << "Try to assign: " << VirtReg << " to "
- << printReg(PhysReg, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Try to assign: " << VirtReg << " to "
+ << printReg(PhysReg, TRI) << '\n');
RecoloringCandidates.clear();
VirtRegToPhysReg.clear();
CurrentNewVRegs.clear();
@@ -2512,7 +2595,8 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
// It is only possible to recolor virtual register interference.
if (Matrix->checkInterference(VirtReg, PhysReg) >
LiveRegMatrix::IK_VirtReg) {
- DEBUG(dbgs() << "Some interferences are not with virtual registers.\n");
+ LLVM_DEBUG(
+ dbgs() << "Some interferences are not with virtual registers.\n");
continue;
}
@@ -2521,7 +2605,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
// the interferences.
if (!mayRecolorAllInterferences(PhysReg, VirtReg, RecoloringCandidates,
FixedRegisters)) {
- DEBUG(dbgs() << "Some interferences cannot be recolored.\n");
+ LLVM_DEBUG(dbgs() << "Some interferences cannot be recolored.\n");
continue;
}
@@ -2535,7 +2619,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
unsigned ItVirtReg = (*It)->reg;
enqueue(RecoloringQueue, *It);
assert(VRM->hasPhys(ItVirtReg) &&
- "Interferences are supposed to be with allocated vairables");
+ "Interferences are supposed to be with allocated variables");
// Record the current allocation.
VirtRegToPhysReg[ItVirtReg] = VRM->getPhys(ItVirtReg);
@@ -2563,8 +2647,8 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
return PhysReg;
}
- DEBUG(dbgs() << "Fail to assign: " << VirtReg << " to "
- << printReg(PhysReg, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Fail to assign: " << VirtReg << " to "
+ << printReg(PhysReg, TRI) << '\n');
// The recoloring attempt failed, undo the changes.
FixedRegisters = SaveFixedRegisters;
@@ -2611,7 +2695,7 @@ bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
unsigned Depth) {
while (!RecoloringQueue.empty()) {
LiveInterval *LI = dequeue(RecoloringQueue);
- DEBUG(dbgs() << "Try to recolor: " << *LI << '\n');
+ LLVM_DEBUG(dbgs() << "Try to recolor: " << *LI << '\n');
unsigned PhysReg;
PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters, Depth + 1);
// When splitting happens, the live-range may actually be empty.
@@ -2623,11 +2707,12 @@ bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
if (!PhysReg) {
assert(LI->empty() && "Only empty live-range do not require a register");
- DEBUG(dbgs() << "Recoloring of " << *LI << " succeeded. Empty LI.\n");
+ LLVM_DEBUG(dbgs() << "Recoloring of " << *LI
+ << " succeeded. Empty LI.\n");
continue;
}
- DEBUG(dbgs() << "Recoloring of " << *LI
- << " succeeded with: " << printReg(PhysReg, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Recoloring of " << *LI
+ << " succeeded with: " << printReg(PhysReg, TRI) << '\n');
Matrix->assign(*LI, PhysReg);
FixedRegisters.insert(LI->reg);
@@ -2735,7 +2820,7 @@ void RAGreedy::initializeCSRCost() {
CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
}
-/// \brief Collect the hint info for \p Reg.
+/// Collect the hint info for \p Reg.
/// The results are stored into \p Out.
/// \p Out is not cleared before being populated.
void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
@@ -2759,7 +2844,7 @@ void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
}
}
-/// \brief Using the given \p List, compute the cost of the broken hints if
+/// Using the given \p List, compute the cost of the broken hints if
/// \p PhysReg was used.
/// \return The cost of \p List for \p PhysReg.
BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
@@ -2772,7 +2857,7 @@ BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
return Cost;
}
-/// \brief Using the register assigned to \p VirtReg, try to recolor
+/// Using the register assigned to \p VirtReg, try to recolor
/// all the live ranges that are copy-related with \p VirtReg.
/// The recoloring is then propagated to all the live-ranges that have
/// been recolored and so on, until no more copies can be coalesced or
@@ -2794,8 +2879,8 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
Visited.insert(Reg);
RecoloringCandidates.push_back(Reg);
- DEBUG(dbgs() << "Trying to reconcile hints for: " << printReg(Reg, TRI) << '('
- << printReg(PhysReg, TRI) << ")\n");
+ LLVM_DEBUG(dbgs() << "Trying to reconcile hints for: " << printReg(Reg, TRI)
+ << '(' << printReg(PhysReg, TRI) << ")\n");
do {
Reg = RecoloringCandidates.pop_back_val();
@@ -2816,8 +2901,8 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
Matrix->checkInterference(LI, PhysReg)))
continue;
- DEBUG(dbgs() << printReg(Reg, TRI) << '(' << printReg(CurrPhys, TRI)
- << ") is recolorable.\n");
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << '(' << printReg(CurrPhys, TRI)
+ << ") is recolorable.\n");
// Gather the hint info.
Info.clear();
@@ -2825,19 +2910,20 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
// Check if recoloring the live-range will increase the cost of the
// non-identity copies.
if (CurrPhys != PhysReg) {
- DEBUG(dbgs() << "Checking profitability:\n");
+ LLVM_DEBUG(dbgs() << "Checking profitability:\n");
BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
- DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
- << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
+ LLVM_DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
+ << "\nNew Cost: " << NewCopiesCost.getFrequency()
+ << '\n');
if (OldCopiesCost < NewCopiesCost) {
- DEBUG(dbgs() << "=> Not profitable.\n");
+ LLVM_DEBUG(dbgs() << "=> Not profitable.\n");
continue;
}
// At this point, the cost is either cheaper or equal. If it is
// equal, we consider this is profitable because it may expose
// more recoloring opportunities.
- DEBUG(dbgs() << "=> Profitable.\n");
+ LLVM_DEBUG(dbgs() << "=> Profitable.\n");
// Recolor the live-range.
Matrix->unassign(LI);
Matrix->assign(LI, PhysReg);
@@ -2851,7 +2937,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
} while (!RecoloringCandidates.empty());
}
-/// \brief Try to recolor broken hints.
+/// Try to recolor broken hints.
/// Broken hints may be repaired by recoloring when an evicted variable
/// freed up a register for a larger live-range.
/// Consider the following example:
@@ -2925,8 +3011,8 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
}
LiveRangeStage Stage = getStage(VirtReg);
- DEBUG(dbgs() << StageName[Stage]
- << " Cascade " << ExtraRegInfo[VirtReg.reg].Cascade << '\n');
+ LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade "
+ << ExtraRegInfo[VirtReg.reg].Cascade << '\n');
// Try to evict a less worthy live range, but only for ranges from the primary
// queue. The RS_Split ranges already failed to do this, and they should not
@@ -2955,7 +3041,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
// This gives a better picture of the interference to split around.
if (Stage < RS_Split) {
setStage(VirtReg, RS_Split);
- DEBUG(dbgs() << "wait for second round\n");
+ LLVM_DEBUG(dbgs() << "wait for second round\n");
NewVRegs.push_back(VirtReg.reg);
return 0;
}
@@ -2984,7 +3070,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
// We would need a deep integration with the spiller to do the
// right thing here. Anyway, that is still good for early testing.
setStage(VirtReg, RS_Memory);
- DEBUG(dbgs() << "Do as if this register is in memory\n");
+ LLVM_DEBUG(dbgs() << "Do as if this register is in memory\n");
NewVRegs.push_back(VirtReg.reg);
} else {
NamedRegionTimer T("spill", "Spiller", TimerGroupName,
@@ -3070,8 +3156,8 @@ void RAGreedy::reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads,
}
bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
- DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
- << "********** Function: " << mf.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
+ << "********** Function: " << mf.getName() << '\n');
MF = &mf;
TRI = MF->getSubtarget().getRegisterInfo();
@@ -3106,7 +3192,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
calculateSpillWeightsAndHints(*LIS, mf, VRM, *Loops, *MBFI);
- DEBUG(LIS->dump());
+ LLVM_DEBUG(LIS->dump());
SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI));
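A minimal standalone sketch of the extra cost that calcGlobalSplitCost accumulates in the hunks above: for every block where the split candidate interferes and the register is live through, the split is charged twice the block frequency whenever a split artifact would either start an eviction chain or spill locally. The types and field names below are illustrative only, not the LLVM interfaces.

// Toy model of the added split-cost term (illustrative names, not LLVM API).
#include <cstdint>
#include <vector>

struct BlockInfo {
  uint64_t Freq;            // block execution frequency
  bool HasInterference;     // candidate physreg interferes in this block
  bool LiveThrough;         // register is live-in and live-out here
  bool CausesEvictionChain; // a split artifact would evict the evicting reg
  bool CausesLocalSpill;    // a split artifact can neither be assigned nor evict
};

// Charge 2x the block frequency for each block where splitting would leave a
// local interval that triggers an eviction chain or a local spill.
uint64_t extraSplitCost(const std::vector<BlockInfo> &Blocks) {
  uint64_t Cost = 0;
  for (const BlockInfo &B : Blocks)
    if (B.HasInterference && B.LiveThrough &&
        (B.CausesEvictionChain || B.CausesLocalSpill))
      Cost += 2 * B.Freq;
  return Cost;
}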
diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
index 69a879701fae..c19001c8403d 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -62,6 +62,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -159,25 +160,25 @@ private:
/// always available for the remat of all the siblings of the original reg.
SmallPtrSet<MachineInstr *, 32> DeadRemats;
- /// \brief Finds the initial set of vreg intervals to allocate.
+ /// Finds the initial set of vreg intervals to allocate.
void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
- /// \brief Constructs an initial graph.
+ /// Constructs an initial graph.
void initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, Spiller &VRegSpiller);
- /// \brief Spill the given VReg.
+ /// Spill the given VReg.
void spillVReg(unsigned VReg, SmallVectorImpl<unsigned> &NewIntervals,
MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM,
Spiller &VRegSpiller);
- /// \brief Given a solved PBQP problem maps this solution back to a register
+ /// Given a solved PBQP problem maps this solution back to a register
/// assignment.
bool mapPBQPToRegAlloc(const PBQPRAGraph &G,
const PBQP::Solution &Solution,
VirtRegMap &VRM,
Spiller &VRegSpiller);
- /// \brief Postprocessing before final spilling. Sets basic block "live in"
+ /// Postprocessing before final spilling. Sets basic block "live in"
/// variables.
void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
VirtRegMap &VRM) const;
@@ -187,7 +188,7 @@ private:
char RegAllocPBQP::ID = 0;
-/// @brief Set spill costs for each node in the PBQP reg-alloc graph.
+/// Set spill costs for each node in the PBQP reg-alloc graph.
class SpillCosts : public PBQPRAConstraint {
public:
void apply(PBQPRAGraph &G) override {
@@ -211,7 +212,7 @@ public:
}
};
-/// @brief Add interference edges between overlapping vregs.
+/// Add interference edges between overlapping vregs.
class Interference : public PBQPRAConstraint {
private:
using AllowedRegVecPtr = const PBQP::RegAlloc::AllowedRegVector *;
@@ -561,16 +562,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
if (MRI.reg_nodbg_empty(Reg))
continue;
- LiveInterval &LI = LIS.getInterval(Reg);
-
- // If this live interval is non-empty we will use pbqp to allocate it.
- // Empty intervals we allocate in a simple post-processing stage in
- // finalizeAlloc.
- if (!LI.empty()) {
- VRegsToAlloc.insert(LI.reg);
- } else {
- EmptyIntervalVRegs.insert(LI.reg);
- }
+ VRegsToAlloc.insert(Reg);
}
}
@@ -594,13 +586,24 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
std::vector<unsigned> Worklist(VRegsToAlloc.begin(), VRegsToAlloc.end());
+ std::map<unsigned, std::vector<unsigned>> VRegAllowedMap;
+
while (!Worklist.empty()) {
unsigned VReg = Worklist.back();
Worklist.pop_back();
- const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
LiveInterval &VRegLI = LIS.getInterval(VReg);
+ // If this is an empty interval, move it to the EmptyIntervalVRegs set, then
+ // continue.
+ if (VRegLI.empty()) {
+ EmptyIntervalVRegs.insert(VRegLI.reg);
+ VRegsToAlloc.erase(VRegLI.reg);
+ continue;
+ }
+
+ const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
+
// Record any overlaps with regmask operands.
BitVector RegMaskOverlaps;
LIS.checkRegMaskInterference(VRegLI, RegMaskOverlaps);
@@ -639,8 +642,22 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
Worklist.insert(Worklist.end(), NewVRegs.begin(), NewVRegs.end());
continue;
+ } else
+ VRegAllowedMap[VReg] = std::move(VRegAllowed);
+ }
+
+ for (auto &KV : VRegAllowedMap) {
+ auto VReg = KV.first;
+
+ // Move empty intervals to the EmptyIntervalVRegs set.
+ if (LIS.getInterval(VReg).empty()) {
+ EmptyIntervalVRegs.insert(VReg);
+ VRegsToAlloc.erase(VReg);
+ continue;
}
+ auto &VRegAllowed = KV.second;
+
PBQPRAGraph::RawVector NodeCosts(VRegAllowed.size() + 1, 0);
// Tweak cost of callee saved registers, as using them forces spilling and
@@ -668,8 +685,8 @@ void RegAllocPBQP::spillVReg(unsigned VReg,
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
(void)TRI;
- DEBUG(dbgs() << "VREG " << printReg(VReg, &TRI) << " -> SPILLED (Cost: "
- << LRE.getParent().weight << ", New vregs: ");
+ LLVM_DEBUG(dbgs() << "VREG " << printReg(VReg, &TRI) << " -> SPILLED (Cost: "
+ << LRE.getParent().weight << ", New vregs: ");
// Copy any newly inserted live intervals into the list of regs to
// allocate.
@@ -677,11 +694,11 @@ void RegAllocPBQP::spillVReg(unsigned VReg,
I != E; ++I) {
const LiveInterval &LI = LIS.getInterval(*I);
assert(!LI.empty() && "Empty spill range.");
- DEBUG(dbgs() << printReg(LI.reg, &TRI) << " ");
+ LLVM_DEBUG(dbgs() << printReg(LI.reg, &TRI) << " ");
VRegsToAlloc.insert(LI.reg);
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
}
bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
@@ -707,8 +724,8 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
if (AllocOption != PBQP::RegAlloc::getSpillOptionIdx()) {
unsigned PReg = G.getNodeMetadata(NId).getAllowedRegs()[AllocOption - 1];
- DEBUG(dbgs() << "VREG " << printReg(VReg, &TRI) << " -> "
- << TRI.getName(PReg) << "\n");
+ LLVM_DEBUG(dbgs() << "VREG " << printReg(VReg, &TRI) << " -> "
+ << TRI.getName(PReg) << "\n");
assert(PReg != 0 && "Invalid preg selected.");
VRM.assignVirt2Phys(VReg, PReg);
} else {
@@ -784,7 +801,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
MF.getRegInfo().freezeReservedRegs(MF);
- DEBUG(dbgs() << "PBQP Register Allocating for " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "PBQP Register Allocating for " << MF.getName() << "\n");
// Allocator main loop:
//
@@ -819,7 +836,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
unsigned Round = 0;
while (!PBQPAllocComplete) {
- DEBUG(dbgs() << " PBQP Regalloc round " << Round << ":\n");
+ LLVM_DEBUG(dbgs() << " PBQP Regalloc round " << Round << ":\n");
PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI));
initializeGraph(G, VRM, *VRegSpiller);
@@ -833,8 +850,8 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
".pbqpgraph";
std::error_code EC;
raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text);
- DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
- << GraphFileName << "\"\n");
+ LLVM_DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
+ << GraphFileName << "\"\n");
G.dump(OS);
}
#endif
@@ -851,7 +868,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
VRegsToAlloc.clear();
EmptyIntervalVRegs.clear();
- DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << VRM << "\n");
+ LLVM_DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << VRM << "\n");
return true;
}
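The RegAllocPBQP hunks above restructure initializeGraph into two phases: the worklist scan now only records each virtual register's allowed physical registers (spilling and setting aside empty intervals along the way), and the PBQP nodes are built afterwards from that snapshot. A minimal sketch of the collect-then-build pattern, using invented helper types rather than the real PBQP interfaces:

// Collect allowed-register sets first, then build graph nodes from the
// stable snapshot (illustrative code, not the PBQP allocator's API).
#include <functional>
#include <map>
#include <vector>

using VReg = unsigned;
using AllowedSet = std::vector<unsigned>;

void buildGraph(std::vector<VReg> Worklist,
                const std::function<AllowedSet(VReg, std::vector<VReg> &)>
                    &ComputeAllowedOrSpill,
                const std::function<void(VReg, const AllowedSet &)> &AddNode) {
  std::map<VReg, AllowedSet> AllowedMap;
  // Phase 1: drain the worklist. Computing the allowed set may spill a vreg
  // and push its replacements back onto the worklist, so no graph nodes are
  // created until the scan is finished.
  while (!Worklist.empty()) {
    VReg R = Worklist.back();
    Worklist.pop_back();
    AllowedSet Allowed = ComputeAllowedOrSpill(R, Worklist);
    if (Allowed.empty())
      continue; // spilled or empty interval: not represented in the graph
    AllowedMap[R] = std::move(Allowed);
  }
  // Phase 2: build one node per remaining vreg from the stable snapshot.
  for (const auto &KV : AllowedMap)
    AddNode(KV.first, KV.second);
}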
diff --git a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
index f49ea25bbf35..f1c442ac38ae 100644
--- a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -36,11 +36,8 @@ using namespace llvm;
STATISTIC(NumCSROpt,
"Number of functions optimized for callee saved registers");
-namespace llvm {
-void initializeRegUsageInfoCollectorPass(PassRegistry &);
-}
-
namespace {
+
class RegUsageInfoCollector : public MachineFunctionPass {
public:
RegUsageInfoCollector() : MachineFunctionPass(ID) {
@@ -52,12 +49,21 @@ public:
return "Register Usage Information Collector Pass";
}
- void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
+ // Call determineCalleeSaves and then also set the bits for subregs and
+ // fully saved superregs.
+ static void computeCalleeSavedRegs(BitVector &SavedRegs, MachineFunction &MF);
+
static char ID;
};
+
} // end of anonymous namespace
char RegUsageInfoCollector::ID = 0;
@@ -72,36 +78,32 @@ FunctionPass *llvm::createRegUsageInfoCollector() {
return new RegUsageInfoCollector();
}
-void RegUsageInfoCollector::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<PhysicalRegisterUsageInfo>();
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const TargetMachine &TM = MF.getTarget();
- DEBUG(dbgs() << " -------------------- " << getPassName()
- << " -------------------- \n");
- DEBUG(dbgs() << "Function Name : " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << " -------------------- " << getPassName()
+ << " -------------------- \n");
+ LLVM_DEBUG(dbgs() << "Function Name : " << MF.getName() << "\n");
std::vector<uint32_t> RegMask;
// Compute the size of the bit vector to represent all the registers.
// The bit vector is broken into 32-bit chunks, thus takes the ceil of
// the number of registers divided by 32 for the size.
- unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
- RegMask.resize(RegMaskSize, 0xFFFFFFFF);
+ unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ RegMask.resize(RegMaskSize, ~((uint32_t)0));
const Function &F = MF.getFunction();
- PhysicalRegisterUsageInfo *PRUI = &getAnalysis<PhysicalRegisterUsageInfo>();
+ PhysicalRegisterUsageInfo &PRUI = getAnalysis<PhysicalRegisterUsageInfo>();
+ PRUI.setTargetMachine(TM);
- PRUI->setTargetMachine(&TM);
+ LLVM_DEBUG(dbgs() << "Clobbered Registers: ");
- DEBUG(dbgs() << "Clobbered Registers: ");
+ BitVector SavedRegs;
+ computeCalleeSavedRegs(SavedRegs, MF);
const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask();
auto SetRegAsDefined = [&RegMask] (unsigned Reg) {
@@ -110,42 +112,82 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
// Scan all the physical registers. When a register is defined in the current
// function set it and all the aliasing registers as defined in the regmask.
for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
- // If a register is in the UsedPhysRegsMask set then mark it as defined.
- // All it's aliases will also be in the set, so we can skip setting
- // as defined all the aliases here.
- if (UsedPhysRegsMask.test(PReg)) {
- SetRegAsDefined(PReg);
+ // Don't count registers that are saved and restored.
+ if (SavedRegs.test(PReg))
continue;
- }
// If a register is defined by an instruction mark it as defined together
- // with all it's aliases.
+ // with all its unsaved aliases.
if (!MRI->def_empty(PReg)) {
for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI)
- SetRegAsDefined(*AI);
+ if (!SavedRegs.test(*AI))
+ SetRegAsDefined(*AI);
+ continue;
}
+ // If a register is in the UsedPhysRegsMask set then mark it as defined.
+ // All clobbered aliases will also be in the set, so we can skip setting
+ // as defined all the aliases here.
+ if (UsedPhysRegsMask.test(PReg))
+ SetRegAsDefined(PReg);
}
- if (!TargetFrameLowering::isSafeForNoCSROpt(F)) {
- const uint32_t *CallPreservedMask =
- TRI->getCallPreservedMask(MF, F.getCallingConv());
- if (CallPreservedMask) {
- // Set callee saved register as preserved.
- for (unsigned i = 0; i < RegMaskSize; ++i)
- RegMask[i] = RegMask[i] | CallPreservedMask[i];
- }
- } else {
+ if (TargetFrameLowering::isSafeForNoCSROpt(F)) {
++NumCSROpt;
- DEBUG(dbgs() << MF.getName()
- << " function optimized for not having CSR.\n");
+ LLVM_DEBUG(dbgs() << MF.getName()
+ << " function optimized for not having CSR.\n");
}
for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
if (MachineOperand::clobbersPhysReg(&(RegMask[0]), PReg))
- DEBUG(dbgs() << printReg(PReg, TRI) << " ");
+ LLVM_DEBUG(dbgs() << printReg(PReg, TRI) << " ");
- DEBUG(dbgs() << " \n----------------------------------------\n");
+ LLVM_DEBUG(dbgs() << " \n----------------------------------------\n");
- PRUI->storeUpdateRegUsageInfo(&F, std::move(RegMask));
+ PRUI.storeUpdateRegUsageInfo(F, RegMask);
return false;
}
+
+void RegUsageInfoCollector::
+computeCalleeSavedRegs(BitVector &SavedRegs, MachineFunction &MF) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // Target will return the set of registers that it saves/restores as needed.
+ SavedRegs.clear();
+ TFI.determineCalleeSaves(MF, SavedRegs);
+
+ // Insert subregs.
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (SavedRegs.test(Reg))
+ for (MCSubRegIterator SR(Reg, &TRI, false); SR.isValid(); ++SR)
+ SavedRegs.set(*SR);
+ }
+
+ // Insert any register fully saved via subregisters.
+ for (unsigned PReg = 1, PRegE = TRI.getNumRegs(); PReg < PRegE; ++PReg) {
+ if (SavedRegs.test(PReg))
+ continue;
+
+ // Check if PReg is fully covered by its subregs.
+ bool CoveredBySubRegs = false;
+ for (const TargetRegisterClass *RC : TRI.regclasses())
+ if (RC->CoveredBySubRegs && RC->contains(PReg)) {
+ CoveredBySubRegs = true;
+ break;
+ }
+ if (!CoveredBySubRegs)
+ continue;
+
+ // Add PReg to SavedRegs if all subregs are saved.
+ bool AllSubRegsSaved = true;
+ for (MCSubRegIterator SR(PReg, &TRI, false); SR.isValid(); ++SR)
+ if (!SavedRegs.test(*SR)) {
+ AllSubRegsSaved = false;
+ break;
+ }
+ if (AllSubRegsSaved)
+ SavedRegs.set(PReg);
+ }
+}
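The collector above records, for each function, a register mask sized with MachineOperand::getRegMaskSize (the removed line shows the same (NumRegs + 31) / 32 computation); every bit starts set, a cleared bit means the function clobbers that register, and callee-saved registers found by computeCalleeSavedRegs are skipped so their bits stay set. A small standalone sketch of that chunked layout, with helper names that are mine rather than LLVM's:

// Toy regmask helpers: one bit per physical register in 32-bit chunks,
// bit set = preserved, bit cleared = clobbered (illustrative, not LLVM API).
#include <cstdint>
#include <vector>

std::vector<uint32_t> makeAllPreservedMask(unsigned NumRegs) {
  unsigned Chunks = (NumRegs + 31) / 32; // ceil(NumRegs / 32)
  return std::vector<uint32_t>(Chunks, ~uint32_t(0));
}

void markClobbered(std::vector<uint32_t> &Mask, unsigned Reg) {
  Mask[Reg / 32] &= ~(uint32_t(1) << (Reg % 32));
}

bool clobbers(const std::vector<uint32_t> &Mask, unsigned Reg) {
  return !(Mask[Reg / 32] & (uint32_t(1) << (Reg % 32)));
}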
diff --git a/contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp b/contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
index 5b12d00e126f..256de295821d 100644
--- a/contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
+++ b/contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
@@ -34,10 +34,6 @@
#include <map>
#include <string>
-namespace llvm {
-void initializeRegUsageInfoPropagationPassPass(PassRegistry &);
-}
-
using namespace llvm;
#define DEBUG_TYPE "ip-regalloc"
@@ -45,54 +41,56 @@ using namespace llvm;
#define RUIP_NAME "Register Usage Information Propagation"
namespace {
-class RegUsageInfoPropagationPass : public MachineFunctionPass {
+class RegUsageInfoPropagation : public MachineFunctionPass {
public:
- RegUsageInfoPropagationPass() : MachineFunctionPass(ID) {
+ RegUsageInfoPropagation() : MachineFunctionPass(ID) {
PassRegistry &Registry = *PassRegistry::getPassRegistry();
- initializeRegUsageInfoPropagationPassPass(Registry);
+ initializeRegUsageInfoPropagationPass(Registry);
}
StringRef getPassName() const override { return RUIP_NAME; }
bool runOnMachineFunction(MachineFunction &MF) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
static char ID;
private:
- static void setRegMask(MachineInstr &MI, const uint32_t *RegMask) {
+ static void setRegMask(MachineInstr &MI, ArrayRef<uint32_t> RegMask) {
+ assert(RegMask.size() ==
+ MachineOperand::getRegMaskSize(MI.getParent()->getParent()
+ ->getRegInfo().getTargetRegisterInfo()
+ ->getNumRegs())
+ && "expected register mask size");
for (MachineOperand &MO : MI.operands()) {
if (MO.isRegMask())
- MO.setRegMask(RegMask);
+ MO.setRegMask(RegMask.data());
}
}
};
+
} // end of anonymous namespace
-char RegUsageInfoPropagationPass::ID = 0;
-INITIALIZE_PASS_BEGIN(RegUsageInfoPropagationPass, "reg-usage-propagation",
+INITIALIZE_PASS_BEGIN(RegUsageInfoPropagation, "reg-usage-propagation",
RUIP_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(PhysicalRegisterUsageInfo)
-INITIALIZE_PASS_END(RegUsageInfoPropagationPass, "reg-usage-propagation",
+INITIALIZE_PASS_END(RegUsageInfoPropagation, "reg-usage-propagation",
RUIP_NAME, false, false)
-FunctionPass *llvm::createRegUsageInfoPropPass() {
- return new RegUsageInfoPropagationPass();
-}
-
-void RegUsageInfoPropagationPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<PhysicalRegisterUsageInfo>();
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
+char RegUsageInfoPropagation::ID = 0;
// Assumes call instructions have a single reference to a function.
-static const Function *findCalledFunction(const Module &M, MachineInstr &MI) {
- for (MachineOperand &MO : MI.operands()) {
+static const Function *findCalledFunction(const Module &M,
+ const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
if (MO.isGlobal())
- return dyn_cast<Function>(MO.getGlobal());
+ return dyn_cast<const Function>(MO.getGlobal());
if (MO.isSymbol())
return M.getFunction(MO.getSymbolName());
@@ -101,13 +99,13 @@ static const Function *findCalledFunction(const Module &M, MachineInstr &MI) {
return nullptr;
}
-bool RegUsageInfoPropagationPass::runOnMachineFunction(MachineFunction &MF) {
- const Module *M = MF.getFunction().getParent();
+bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) {
+ const Module &M = *MF.getFunction().getParent();
PhysicalRegisterUsageInfo *PRUI = &getAnalysis<PhysicalRegisterUsageInfo>();
- DEBUG(dbgs() << " ++++++++++++++++++++ " << getPassName()
- << " ++++++++++++++++++++ \n");
- DEBUG(dbgs() << "MachineFunction : " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << " ++++++++++++++++++++ " << getPassName()
+ << " ++++++++++++++++++++ \n");
+ LLVM_DEBUG(dbgs() << "MachineFunction : " << MF.getName() << "\n");
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (!MFI.hasCalls() && !MFI.hasTailCall())
@@ -119,30 +117,37 @@ bool RegUsageInfoPropagationPass::runOnMachineFunction(MachineFunction &MF) {
for (MachineInstr &MI : MBB) {
if (!MI.isCall())
continue;
- DEBUG(dbgs()
- << "Call Instruction Before Register Usage Info Propagation : \n");
- DEBUG(dbgs() << MI << "\n");
-
- auto UpdateRegMask = [&](const Function *F) {
- const auto *RegMask = PRUI->getRegUsageInfo(F);
- if (!RegMask)
+ LLVM_DEBUG(
+ dbgs()
+ << "Call Instruction Before Register Usage Info Propagation : \n");
+ LLVM_DEBUG(dbgs() << MI << "\n");
+
+ auto UpdateRegMask = [&](const Function &F) {
+ const ArrayRef<uint32_t> RegMask = PRUI->getRegUsageInfo(F);
+ if (RegMask.empty())
return;
- setRegMask(MI, &(*RegMask)[0]);
+ setRegMask(MI, RegMask);
Changed = true;
};
- if (const Function *F = findCalledFunction(*M, MI)) {
- UpdateRegMask(F);
+ if (const Function *F = findCalledFunction(M, MI)) {
+ UpdateRegMask(*F);
} else {
- DEBUG(dbgs() << "Failed to find call target function\n");
+ LLVM_DEBUG(dbgs() << "Failed to find call target function\n");
}
- DEBUG(dbgs() << "Call Instruction After Register Usage Info Propagation : "
- << MI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "Call Instruction After Register Usage Info Propagation : "
+ << MI << '\n');
}
}
- DEBUG(dbgs() << " +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
- "++++++ \n");
+ LLVM_DEBUG(
+ dbgs() << " +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+ "++++++ \n");
return Changed;
}
+
+FunctionPass *llvm::createRegUsageInfoPropPass() {
+ return new RegUsageInfoPropagation();
+}
diff --git a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
index b0eeb81f583e..add8faec97d4 100644
--- a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -49,9 +49,6 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
if (MF->getSubtarget().getRegisterInfo() != TRI) {
TRI = MF->getSubtarget().getRegisterInfo();
RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
- unsigned NumPSets = TRI->getNumRegPressureSets();
- PSetLimits.reset(new unsigned[NumPSets]);
- std::fill(&PSetLimits[0], &PSetLimits[NumPSets], 0);
Update = true;
}
@@ -80,8 +77,12 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
}
// Invalidate cached information from previous function.
- if (Update)
+ if (Update) {
+ unsigned NumPSets = TRI->getNumRegPressureSets();
+ PSetLimits.reset(new unsigned[NumPSets]);
+ std::fill(&PSetLimits[0], &PSetLimits[NumPSets], 0);
++Tag;
+ }
}
/// compute - Compute the preferred allocation order for RC with reserved
@@ -150,7 +151,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
RCI.MinCost = uint8_t(MinCost);
RCI.LastCostChange = LastCostChange;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "AllocationOrder(" << TRI->getRegClassName(RC) << ") = [";
for (unsigned I = 0; I != RCI.NumRegs; ++I)
dbgs() << ' ' << printReg(RCI.Order[I], TRI);
diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 00a2e93c71ca..cad13a60efd2 100644
--- a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -115,11 +115,11 @@ namespace {
/// checked for smaller live intervals.
bool ShrinkMainRange;
- /// \brief True if the coalescer should aggressively coalesce global copies
+ /// True if the coalescer should aggressively coalesce global copies
/// in favor of keeping local copies.
bool JoinGlobalCopies;
- /// \brief True if the coalescer should aggressively coalesce fall-thru
+ /// True if the coalescer should aggressively coalesce fall-thru
/// blocks exclusively containing copies.
bool JoinSplitEdges;
@@ -162,7 +162,7 @@ namespace {
/// was successfully coalesced away. If it is not currently possible to
/// coalesce this interval, but it may be possible if other things get
/// coalesced, then it returns true by reference in 'Again'.
- bool joinCopy(MachineInstr *TheCopy, bool &Again);
+ bool joinCopy(MachineInstr *CopyMI, bool &Again);
/// Attempt to join these two intervals. On failure, this
/// returns false. The output "SrcInt" will not have been modified, so we
@@ -233,9 +233,11 @@ namespace {
void addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
MachineOperand &MO, unsigned SubRegIdx);
- /// Handle copies of undef values.
- /// Returns true if @p CopyMI was a copy of an undef value and eliminated.
- bool eliminateUndefCopy(MachineInstr *CopyMI);
+ /// Handle copies of undef values. If the undef value is an incoming
+ /// PHI value, it will convert @p CopyMI to an IMPLICIT_DEF.
+ /// Returns nullptr if @p CopyMI was not in any way eliminable. Otherwise,
+ /// it returns @p CopyMI (which could be an IMPLICIT_DEF at this point).
+ MachineInstr *eliminateUndefCopy(MachineInstr *CopyMI);
/// Check whether or not we should apply the terminal rule on the
/// destination (Dst) of \p Copy.
@@ -568,7 +570,7 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
// in IntB, we can merge them.
if (ValS+1 != BS) return false;
- DEBUG(dbgs() << "Extending: " << printReg(IntB.reg, TRI));
+ LLVM_DEBUG(dbgs() << "Extending: " << printReg(IntB.reg, TRI));
SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
// We are about to delete CopyMI, so need to remove it as the 'instruction
@@ -587,6 +589,13 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
// Do the same for the subregister segments.
for (LiveInterval::SubRange &S : IntB.subranges()) {
+ // Check for SubRange Segments of the form [1234r,1234d:0) which can be
+ // removed to prevent creating bogus SubRange Segments.
+ LiveInterval::iterator SS = S.FindSegmentContaining(CopyIdx);
+ if (SS != S.end() && SlotIndex::isSameInstr(SS->start, SS->end)) {
+ S.removeSegment(*SS, true);
+ continue;
+ }
VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
@@ -594,7 +603,7 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
S.MergeValueNumberInto(SubBValNo, SubValSNo);
}
- DEBUG(dbgs() << " result = " << IntB << '\n');
+ LLVM_DEBUG(dbgs() << " result = " << IntB << '\n');
// If the source instruction was killing the source register before the
// merge, unset the isKill marker given the live range has been extended.
@@ -603,11 +612,21 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
ValSEndInst->getOperand(UIdx).setIsKill(false);
}
- // Rewrite the copy. If the copy instruction was killing the destination
- // register before the merge, find the last use and trim the live range. That
- // will also add the isKill marker.
+ // Rewrite the copy.
CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
- if (AS->end == CopyIdx)
+ // If the copy instruction was killing the destination register or any
+ // subrange before the merge, trim the live range.
+ bool RecomputeLiveRange = AS->end == CopyIdx;
+ if (!RecomputeLiveRange) {
+ for (LiveInterval::SubRange &S : IntA.subranges()) {
+ LiveInterval::iterator SS = S.FindSegmentContaining(CopyUseIdx);
+ if (SS != S.end() && SS->end == CopyIdx) {
+ RecomputeLiveRange = true;
+ break;
+ }
+ }
+ }
+ if (RecomputeLiveRange)
shrinkToUses(&IntA);
++numExtends;
@@ -641,7 +660,7 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
return false;
}
-/// Copy segements with value number @p SrcValNo from liverange @p Src to live
+/// Copy segments with value number @p SrcValNo from liverange @p Src to live
/// range @Dst and use value number @p DstValNo there.
static void addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo,
const LiveRange &Src, const VNInfo *SrcValNo) {
@@ -742,8 +761,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
return false;
}
- DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
- << *DefMI);
+ LLVM_DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
+ << *DefMI);
// At this point we have decided that it is legal to do this
// transformation. Start by commuting the instruction.
@@ -812,7 +831,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
VNInfo *DVNI = IntB.getVNInfoAt(DefIdx);
if (!DVNI)
continue;
- DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
+ LLVM_DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
assert(DVNI->def == DefIdx);
BValNo = IntB.MergeValueNumberInto(DVNI, BValNo);
for (LiveInterval::SubRange &S : IntB.subranges()) {
@@ -853,11 +872,11 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
BValNo->def = AValNo->def;
addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
- DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
+ LLVM_DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
LIS->removeVRegDefAt(IntA, AValNo->def);
- DEBUG(dbgs() << "\t\ttrimmed: " << IntA << '\n');
+ LLVM_DEBUG(dbgs() << "\t\ttrimmed: " << IntA << '\n');
++numCommutes;
return true;
}
@@ -989,13 +1008,24 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
if (CopyLeftBB && CopyLeftBB->succ_size() > 1)
return false;
- // Now ok to move copy.
+ // Now it is (almost surely) OK to move the copy.
if (CopyLeftBB) {
- DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to "
- << printMBBReference(*CopyLeftBB) << '\t' << CopyMI);
+ // Position in CopyLeftBB where we should insert new copy.
+ auto InsPos = CopyLeftBB->getFirstTerminator();
+
+ // Make sure that B isn't referenced in the terminators (if any) at the end
+ // of the predecessor since we're about to insert a new definition of B
+ // before them.
+ if (InsPos != CopyLeftBB->end()) {
+ SlotIndex InsPosIdx = LIS->getInstructionIndex(*InsPos).getRegSlot(true);
+ if (IntB.overlaps(InsPosIdx, LIS->getMBBEndIdx(CopyLeftBB)))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to "
+ << printMBBReference(*CopyLeftBB) << '\t' << CopyMI);
// Insert new copy to CopyLeftBB.
- auto InsPos = CopyLeftBB->getFirstTerminator();
MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
TII->get(TargetOpcode::COPY), IntB.reg)
.addReg(IntA.reg);
@@ -1010,8 +1040,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
// the deleted list.
ErasedInstrs.erase(NewCopyMI);
} else {
- DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from "
- << printMBBReference(MBB) << '\t' << CopyMI);
+ LLVM_DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from "
+ << printMBBReference(MBB) << '\t' << CopyMI);
}
// Remove CopyMI.
@@ -1039,6 +1069,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
BValNo->markUnused();
LIS->extendToIndices(SR, EndPoints);
}
+ // If any dead defs were extended, truncate them.
+ shrinkToUses(&IntB);
// Finally, update the live-range of IntA.
shrinkToUses(&IntA);
@@ -1174,7 +1206,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
I != E; ++I) {
MachineOperand &MO = CopyMI->getOperand(I);
if (MO.isReg()) {
- assert(MO.isImplicit() && "No explicit operands after implict operands.");
+ assert(MO.isImplicit() && "No explicit operands after implicit operands.");
// Discard VReg implicit defs.
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
ImplicitOps.push_back(MO);
@@ -1220,6 +1252,11 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// Update machine operands and add flags.
updateRegDefsUses(DstReg, DstReg, DstIdx);
NewMI.getOperand(0).setSubReg(NewIdx);
+ // updateRegDefsUses can add an "undef" flag to the definition, since
+ // it will replace DstReg with DstReg.DstIdx. If NewIdx is 0, make
+ // sure that "undef" is not set.
+ if (NewIdx == 0)
+ NewMI.getOperand(0).setIsUndef(false);
// Add dead subregister definitions if we are defining the whole register
// but only part of it is live.
// This could happen if the rematerialization instruction is rematerializing
@@ -1266,8 +1303,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
bool UpdatedSubRanges = false;
for (LiveInterval::SubRange &SR : DstInt.subranges()) {
if ((SR.LaneMask & DstMask).none()) {
- DEBUG(dbgs() << "Removing undefined SubRange "
- << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Removing undefined SubRange "
+ << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
// VNI is in ValNo - remove any segments in this SubRange that have this ValNo
if (VNInfo *RmValNo = SR.getVNInfoAt(CurrIdx.getRegSlot())) {
SR.removeValNo(RmValNo);
@@ -1299,7 +1337,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// %1 = somedef ; %1 GR8
// dead ECX = remat ; implicit-def CL
// = somedef %1 ; %1 GR8
- // %1 will see the inteferences with CL but not with CH since
+ // %1 will see the interferences with CL but not with CH since
// no live-ranges would have been created for ECX.
// Fix that!
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
@@ -1324,7 +1362,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
}
- DEBUG(dbgs() << "Remat: " << NewMI);
+ LLVM_DEBUG(dbgs() << "Remat: " << NewMI);
++NumReMats;
// The source interval can become smaller because we removed a use.
@@ -1339,7 +1377,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// Move the debug value directly after the def of the rematerialized
// value in DstReg.
MBB->splice(std::next(NewMI.getIterator()), UseMI->getParent(), UseMI);
- DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
+ LLVM_DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
}
}
eliminateDeadDefs();
@@ -1348,9 +1386,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
return true;
}
-bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
- // ProcessImpicitDefs may leave some copies of <undef> values, it only removes
- // local variables. When we have a copy like:
+MachineInstr *RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
+ // ProcessImplicitDefs may leave some copies of <undef> values; it only
+ // removes local variables. When we have a copy like:
//
// %1 = COPY undef %2
//
@@ -1372,16 +1410,34 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
if ((SR.LaneMask & SrcMask).none())
continue;
if (SR.liveAt(Idx))
- return false;
+ return nullptr;
}
} else if (SrcLI.liveAt(Idx))
- return false;
+ return nullptr;
- DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
-
- // Remove any DstReg segments starting at the instruction.
+ // If the undef copy defines a live-out value (i.e. an input to a PHI def),
+ // then replace it with an IMPLICIT_DEF.
LiveInterval &DstLI = LIS->getInterval(DstReg);
SlotIndex RegIndex = Idx.getRegSlot();
+ LiveRange::Segment *Seg = DstLI.getSegmentContaining(RegIndex);
+ assert(Seg != nullptr && "No segment for defining instruction");
+ if (VNInfo *V = DstLI.getVNInfoAt(Seg->end)) {
+ if (V->isPHIDef()) {
+ CopyMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ for (unsigned i = CopyMI->getNumOperands(); i != 0; --i) {
+ MachineOperand &MO = CopyMI->getOperand(i-1);
+ if (MO.isReg() && MO.isUse())
+ CopyMI->RemoveOperand(i-1);
+ }
+ LLVM_DEBUG(dbgs() << "\tReplaced copy of <undef> value with an "
+ "implicit def\n");
+ return CopyMI;
+ }
+ }
+
+ // Remove any DstReg segments starting at the instruction.
+ LLVM_DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
+
// Remove value or merge with previous one in case of a subregister def.
if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
VNInfo *VNI = DstLI.getVNInfoAt(RegIndex);
@@ -1424,7 +1480,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
if (isLive)
continue;
MO.setIsUndef(true);
- DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
+ LLVM_DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
}
// A def of a subregister may be a use of the other subregisters, so
@@ -1437,7 +1493,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
MO.setIsUndef(true);
LIS->shrinkToUses(&DstLI);
- return true;
+ return CopyMI;
}
void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
@@ -1539,12 +1595,12 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
MO.substVirtReg(DstReg, SubIdx, *TRI);
}
- DEBUG({
- dbgs() << "\t\tupdated: ";
- if (!UseMI->isDebugValue())
- dbgs() << LIS->getInstructionIndex(*UseMI) << "\t";
- dbgs() << *UseMI;
- });
+ LLVM_DEBUG({
+ dbgs() << "\t\tupdated: ";
+ if (!UseMI->isDebugValue())
+ dbgs() << LIS->getInstructionIndex(*UseMI) << "\t";
+ dbgs() << *UseMI;
+ });
}
}
@@ -1553,7 +1609,7 @@ bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
// reserved register. This doesn't increase register pressure, so it is
// always beneficial.
if (!MRI->isReserved(CP.getDstReg())) {
- DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
+ LLVM_DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
return false;
}
@@ -1561,17 +1617,18 @@ bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
if (JoinVInt.containsOneValue())
return true;
- DEBUG(dbgs() << "\tCannot join complex intervals into reserved register.\n");
+ LLVM_DEBUG(
+ dbgs() << "\tCannot join complex intervals into reserved register.\n");
return false;
}
bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
Again = false;
- DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
+ LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
CoalescerPair CP(*TRI);
if (!CP.setRegisters(CopyMI)) {
- DEBUG(dbgs() << "\tNot coalescable.\n");
+ LLVM_DEBUG(dbgs() << "\tNot coalescable.\n");
return false;
}
@@ -1586,7 +1643,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
}
if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
CP.getNewRC(), *LIS)) {
- DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
+ LLVM_DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
return false;
}
}
@@ -1595,16 +1652,21 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
// sometimes dead copies slip through, and we can't generate invalid live
// ranges.
if (!CP.isPhys() && CopyMI->allDefsAreDead()) {
- DEBUG(dbgs() << "\tCopy is dead.\n");
+ LLVM_DEBUG(dbgs() << "\tCopy is dead.\n");
DeadDefs.push_back(CopyMI);
eliminateDeadDefs();
return true;
}
// Eliminate undefs.
- if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) {
- deleteInstr(CopyMI);
- return false; // Not coalescable.
+ if (!CP.isPhys()) {
+ // If this is an IMPLICIT_DEF, leave it alone, but don't try to coalesce.
+ if (MachineInstr *UndefMI = eliminateUndefCopy(CopyMI)) {
+ if (UndefMI->isImplicitDef())
+ return false;
+ deleteInstr(CopyMI);
+ return false; // Not coalescable.
+ }
}
// Coalesced copies are normally removed immediately, but transformations
@@ -1612,7 +1674,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
// When that happens, just join the values and remove the copy.
if (CP.getSrcReg() == CP.getDstReg()) {
LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
- DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
+ LLVM_DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
const SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
LiveQueryResult LRQ = LI.Query(CopyIdx);
if (VNInfo *DefVNI = LRQ.valueDefined()) {
@@ -1629,7 +1691,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
S.MergeValueNumberInto(SDefVNI, SReadVNI);
}
}
- DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
+ LLVM_DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
}
deleteInstr(CopyMI);
return true;
@@ -1637,9 +1699,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
// Enforce policies.
if (CP.isPhys()) {
- DEBUG(dbgs() << "\tConsidering merging " << printReg(CP.getSrcReg(), TRI)
- << " with " << printReg(CP.getDstReg(), TRI, CP.getSrcIdx())
- << '\n');
+ LLVM_DEBUG(dbgs() << "\tConsidering merging "
+ << printReg(CP.getSrcReg(), TRI) << " with "
+ << printReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n');
if (!canJoinPhys(CP)) {
// Before giving up coalescing, if definition of source is defined by
// trivial computation, try rematerializing it.
@@ -1656,7 +1718,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
LIS->getInterval(CP.getDstReg()).size())
CP.flip();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\tConsidering merging to "
<< TRI->getRegClassName(CP.getNewRC()) << " with ";
if (CP.getDstIdx() && CP.getSrcIdx())
@@ -1692,7 +1754,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
if (adjustCopiesBackFrom(CP, CopyMI) ||
removeCopyByCommutingDef(CP, CopyMI)) {
deleteInstr(CopyMI);
- DEBUG(dbgs() << "\tTrivial!\n");
+ LLVM_DEBUG(dbgs() << "\tTrivial!\n");
return true;
}
}
@@ -1704,7 +1766,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
return true;
// Otherwise, we are unable to join the intervals.
- DEBUG(dbgs() << "\tInterference!\n");
+ LLVM_DEBUG(dbgs() << "\tInterference!\n");
Again = true; // May be possible to coalesce later.
return false;
}
@@ -1738,8 +1800,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
for (LiveInterval::SubRange &S : LI.subranges()) {
if ((S.LaneMask & ShrinkMask).none())
continue;
- DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask)
- << ")\n");
+ LLVM_DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask)
+ << ")\n");
LIS->shrinkToUses(S, LI.reg);
}
LI.removeEmptySubRanges();
@@ -1756,7 +1818,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
// Update regalloc hint.
TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\tSuccess: " << printReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
<< " -> " << printReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n';
dbgs() << "\tResult = ";
@@ -1777,7 +1839,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
assert(CP.isPhys() && "Must be a physreg copy");
assert(MRI->isReserved(DstReg) && "Not a reserved register");
LiveInterval &RHS = LIS->getInterval(SrcReg);
- DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
assert(RHS.containsOneValue() && "Invalid join with reserved register");
@@ -1796,7 +1858,8 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
return false;
}
if (RHS.overlaps(LIS->getRegUnit(*UI))) {
- DEBUG(dbgs() << "\t\tInterference: " << printRegUnit(*UI, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tInterference: " << printRegUnit(*UI, TRI)
+ << '\n');
return false;
}
}
@@ -1805,7 +1868,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
BitVector RegMaskUsable;
if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) &&
!RegMaskUsable.test(DstReg)) {
- DEBUG(dbgs() << "\t\tRegMask interference\n");
+ LLVM_DEBUG(dbgs() << "\t\tRegMask interference\n");
return false;
}
}
@@ -1835,12 +1898,12 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
// %y = def
// ...
if (!MRI->hasOneNonDBGUse(SrcReg)) {
- DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
+ LLVM_DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
return false;
}
if (!LIS->intervalIsInOneMBB(RHS)) {
- DEBUG(dbgs() << "\t\tComplex control flow!\n");
+ LLVM_DEBUG(dbgs() << "\t\tComplex control flow!\n");
return false;
}
@@ -1858,7 +1921,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
SI != CopyRegIdx; SI = Indexes->getNextNonNullIndex(SI)) {
MachineInstr *MI = LIS->getInstructionFromIndex(SI);
if (MI->readsRegister(DstReg, TRI)) {
- DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
+ LLVM_DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
return false;
}
}
@@ -1866,8 +1929,8 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
// We're going to remove the copy which defines a physical reserved
// register, so remove its valno, etc.
- DEBUG(dbgs() << "\t\tRemoving phys reg def of " << printReg(DstReg, TRI)
- << " at " << CopyRegIdx << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tRemoving phys reg def of "
+ << printReg(DstReg, TRI) << " at " << CopyRegIdx << "\n");
LIS->removePhysRegDefAt(DstReg, CopyRegIdx);
// Create a new dead def at the new def location.
@@ -2057,6 +2120,13 @@ class JoinVals {
/// True once Pruned above has been computed.
bool PrunedComputed = false;
+ /// True if this value is determined to be identical to OtherVNI
+ /// (in valuesIdentical). This is used with CR_Erase where the erased
+ /// copy is redundant, i.e. the source value is already the same as
+ /// the destination. In such cases the subranges need to be updated
+ /// properly. See comment at pruneSubRegValues for more info.
+ bool Identical = false;
+
Val() = default;
bool isAnalyzed() const { return WriteLanes.any(); }
@@ -2073,7 +2143,7 @@ class JoinVals {
/// Find the ultimate value that VNI was copied from.
std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const;
- bool valuesIdentical(VNInfo *Val0, VNInfo *Val1, const JoinVals &Other) const;
+ bool valuesIdentical(VNInfo *Value0, VNInfo *Value1, const JoinVals &Other) const;
/// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
/// Return a conflict resolution when possible, but leave the hard cases as
@@ -2191,17 +2261,17 @@ LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef)
std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
const VNInfo *VNI) const {
- unsigned Reg = this->Reg;
+ unsigned TrackReg = Reg;
while (!VNI->isPHIDef()) {
SlotIndex Def = VNI->def;
MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
assert(MI && "No defining instruction");
if (!MI->isFullCopy())
- return std::make_pair(VNI, Reg);
+ return std::make_pair(VNI, TrackReg);
unsigned SrcReg = MI->getOperand(1).getReg();
if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
- return std::make_pair(VNI, Reg);
+ return std::make_pair(VNI, TrackReg);
const LiveInterval &LI = LIS->getInterval(SrcReg);
const VNInfo *ValueIn;
@@ -2210,7 +2280,8 @@ std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
LiveQueryResult LRQ = LI.Query(Def);
ValueIn = LRQ.valueIn();
} else {
- // Query subranges. Pick the first matching one.
+ // Query subranges. Ensure that all matching ones take us to the same def
+ // (allowing some of them to be undef).
ValueIn = nullptr;
for (const LiveInterval::SubRange &S : LI.subranges()) {
// Transform lanemask to a mask in the joined live interval.
@@ -2218,16 +2289,27 @@ std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
if ((SMask & LaneMask).none())
continue;
LiveQueryResult LRQ = S.Query(Def);
- ValueIn = LRQ.valueIn();
- break;
+ if (!ValueIn) {
+ ValueIn = LRQ.valueIn();
+ continue;
+ }
+ if (LRQ.valueIn() && ValueIn != LRQ.valueIn())
+ return std::make_pair(VNI, TrackReg);
}
}
- if (ValueIn == nullptr)
- break;
+ if (ValueIn == nullptr) {
+ // Reaching an undefined value is legitimate, for example:
+ //
+ // 1 undef %0.sub1 = ... ;; %0.sub0 == undef
+ // 2 %1 = COPY %0 ;; %1 is defined here.
+ // 3 %0 = COPY %1 ;; Now %0.sub0 has a definition,
+ // ;; but it's equivalent to "undef".
+ return std::make_pair(nullptr, SrcReg);
+ }
VNI = ValueIn;
- Reg = SrcReg;
+ TrackReg = SrcReg;
}
- return std::make_pair(VNI, Reg);
+ return std::make_pair(VNI, TrackReg);
}
bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
@@ -2235,12 +2317,17 @@ bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
const VNInfo *Orig0;
unsigned Reg0;
std::tie(Orig0, Reg0) = followCopyChain(Value0);
- if (Orig0 == Value1)
+ if (Orig0 == Value1 && Reg0 == Other.Reg)
return true;
const VNInfo *Orig1;
unsigned Reg1;
std::tie(Orig1, Reg1) = Other.followCopyChain(Value1);
+ // If both values are undefined, and the source registers are the same
+ // register, the values are identical. Filter out cases where only one
+ // value is defined.
+ if (Orig0 == nullptr || Orig1 == nullptr)
+ return Orig0 == Orig1 && Reg0 == Reg1;
// The values are equal if they are defined at the same place and use the
// same register. Note that we cannot compare VNInfos directly as some of
@@ -2375,9 +2462,10 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
// to erase the IMPLICIT_DEF instruction.
if (OtherV.ErasableImplicitDef && DefMI &&
DefMI->getParent() != Indexes->getMBBFromIndex(V.OtherVNI->def)) {
- DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
- << " extends into " << printMBBReference(*DefMI->getParent())
- << ", keeping it.\n");
+ LLVM_DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
+ << " extends into "
+ << printMBBReference(*DefMI->getParent())
+ << ", keeping it.\n");
OtherV.ErasableImplicitDef = false;
}
@@ -2415,9 +2503,11 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
// %other = COPY %ext
// %this = COPY %ext <-- Erase this copy
//
- if (DefMI->isFullCopy() && !CP.isPartial()
- && valuesIdentical(VNI, V.OtherVNI, Other))
+ if (DefMI->isFullCopy() && !CP.isPartial() &&
+ valuesIdentical(VNI, V.OtherVNI, Other)) {
+ V.Identical = true;
return CR_Erase;
+ }
// If the lanes written by this instruction were all undef in OtherVNI, it is
// still safe to join the live ranges. This can't be done with a simple value
@@ -2487,11 +2577,11 @@ void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
- DEBUG(dbgs() << "\t\tmerge " << printReg(Reg) << ':' << ValNo << '@'
- << LR.getValNumInfo(ValNo)->def << " into "
- << printReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
- << V.OtherVNI->def << " --> @"
- << NewVNInfo[Assignments[ValNo]]->def << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tmerge " << printReg(Reg) << ':' << ValNo << '@'
+ << LR.getValNumInfo(ValNo)->def << " into "
+ << printReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
+ << V.OtherVNI->def << " --> @"
+ << NewVNInfo[Assignments[ValNo]]->def << '\n');
break;
case CR_Replace:
case CR_Unresolved: {
@@ -2517,8 +2607,8 @@ bool JoinVals::mapValues(JoinVals &Other) {
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
computeAssignment(i, Other);
if (Vals[i].Resolution == CR_Impossible) {
- DEBUG(dbgs() << "\t\tinterference at " << printReg(Reg) << ':' << i
- << '@' << LR.getValNumInfo(i)->def << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tinterference at " << printReg(Reg) << ':' << i
+ << '@' << LR.getValNumInfo(i)->def << '\n');
return false;
}
}
@@ -2540,13 +2630,13 @@ taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
// lanes escape the block.
SlotIndex End = OtherI->end;
if (End >= MBBEnd) {
- DEBUG(dbgs() << "\t\ttaints global " << printReg(Other.Reg) << ':'
- << OtherI->valno->id << '@' << OtherI->start << '\n');
+ LLVM_DEBUG(dbgs() << "\t\ttaints global " << printReg(Other.Reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start << '\n');
return false;
}
- DEBUG(dbgs() << "\t\ttaints local " << printReg(Other.Reg) << ':'
- << OtherI->valno->id << '@' << OtherI->start
- << " to " << End << '\n');
+ LLVM_DEBUG(dbgs() << "\t\ttaints local " << printReg(Other.Reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start << " to "
+ << End << '\n');
// A dead def is not a problem.
if (End.isDead())
break;
@@ -2567,7 +2657,7 @@ taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
bool JoinVals::usesLanes(const MachineInstr &MI, unsigned Reg, unsigned SubIdx,
LaneBitmask Lanes) const {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return false;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isDef() || MO.getReg() != Reg)
@@ -2587,8 +2677,8 @@ bool JoinVals::resolveConflicts(JoinVals &Other) {
assert(V.Resolution != CR_Impossible && "Unresolvable conflict");
if (V.Resolution != CR_Unresolved)
continue;
- DEBUG(dbgs() << "\t\tconflict at " << printReg(Reg) << ':' << i
- << '@' << LR.getValNumInfo(i)->def << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tconflict at " << printReg(Reg) << ':' << i << '@'
+ << LR.getValNumInfo(i)->def << '\n');
if (SubRangeJoin)
return false;
@@ -2625,7 +2715,7 @@ bool JoinVals::resolveConflicts(JoinVals &Other) {
while (true) {
assert(MI != MBB->end() && "Bad LastMI");
if (usesLanes(*MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
- DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
+ LLVM_DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
return false;
}
// LastMI is the last instruction to use the current value.
@@ -2698,8 +2788,8 @@ void JoinVals::pruneValues(JoinVals &Other,
if (!EraseImpDef)
EndPoints.push_back(Def);
}
- DEBUG(dbgs() << "\t\tpruned " << printReg(Other.Reg) << " at " << Def
- << ": " << Other.LR << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tpruned " << printReg(Other.Reg) << " at " << Def
+ << ": " << Other.LR << '\n');
break;
}
case CR_Erase:
@@ -2710,8 +2800,8 @@ void JoinVals::pruneValues(JoinVals &Other,
// computeAssignment(), the value that was originally copied could have
// been replaced.
LIS->pruneValue(LR, Def, &EndPoints);
- DEBUG(dbgs() << "\t\tpruned all of " << printReg(Reg) << " at "
- << Def << ": " << LR << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tpruned all of " << printReg(Reg) << " at "
+ << Def << ": " << LR << '\n');
}
break;
case CR_Unresolved:
@@ -2721,21 +2811,65 @@ void JoinVals::pruneValues(JoinVals &Other,
}
}
+/// Consider the following situation when coalescing the copy between
+/// %31 and %45 at 800. (The vertical lines represent live range segments.)
+///
+///                            Main range           Subrange 0004 (sub2)
+///                            %31    %45            %31    %45
+///  544    %45 = COPY %28            +                     +
+///                                   | v1                  | v1
+/// 560B bb.1:                        +                     +
+///  624        = %45.sub2            | v2                  | v2
+///  800    %31 = COPY %45     +      +              +      +
+///                            | v0                  | v0
+///  816    %31.sub1 = ...     +                     |
+///  880    %30 = COPY %31     | v1                  +
+///  928    %45 = COPY %30     |      +                     +
+///                            |      | v0                  | v0  <--+
+/// 992B   ; backedge -> bb.1  |      +                     +        |
+/// 1040        = %31.sub0     +                                     |
+///                                             This value must remain
+///                                             live-out!
+///
+/// Assuming that %31 is coalesced into %45, the copy at 928 becomes
+/// redundant, since it copies the value from %45 back into it. The
+/// conflict resolution for the main range determines that %45.v0 is
+/// to be erased, which is ok since %31.v1 is identical to it.
+/// The problem happens with the subrange for sub2: it has to be live
+/// on exit from the block, but since 928 was actually a point of
+/// definition of %45.sub2, %45.sub2 was not live immediately prior
+/// to that definition. As a result, when 928 was erased, the value v0
+/// for %45.sub2 was pruned in pruneSubRegValues. Consequently, an
+/// IMPLICIT_DEF was inserted as a "backedge" definition for %45.sub2,
+/// providing an incorrect value to the use at 624.
+///
+/// Since the main-range values %31.v1 and %45.v0 were proved to be
+/// identical, the corresponding values in subranges must also be the
+/// same. A redundant copy is removed because it's not needed, and not
+/// because it copied an undefined value, so any liveness that originated
+/// from that copy cannot disappear. When pruning a value that started
+/// at the removed copy, the corresponding identical value must be
+/// extended to replace it.
void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
// Look for values being erased.
bool DidPrune = false;
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
// We should trigger in all cases in which eraseInstrs() does something.
// match what eraseInstrs() is doing, print a message so
- if (Vals[i].Resolution != CR_Erase &&
- (Vals[i].Resolution != CR_Keep || !Vals[i].ErasableImplicitDef ||
- !Vals[i].Pruned))
+ if (V.Resolution != CR_Erase &&
+ (V.Resolution != CR_Keep || !V.ErasableImplicitDef || !V.Pruned))
continue;
// Check subranges at the point where the copy will be removed.
SlotIndex Def = LR.getValNumInfo(i)->def;
+ SlotIndex OtherDef;
+ if (V.Identical)
+ OtherDef = V.OtherVNI->def;
+
// Print message so mismatches with eraseInstrs() can be diagnosed.
- DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def
+ << '\n');
for (LiveInterval::SubRange &S : LI.subranges()) {
LiveQueryResult Q = S.Query(Def);
@@ -2743,19 +2877,28 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
// copied and we must remove that subrange value as well.
VNInfo *ValueOut = Q.valueOutOrDead();
if (ValueOut != nullptr && Q.valueIn() == nullptr) {
- DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask)
- << " at " << Def << "\n");
- LIS->pruneValue(S, Def, nullptr);
+ LLVM_DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask)
+ << " at " << Def << "\n");
+ SmallVector<SlotIndex,8> EndPoints;
+ LIS->pruneValue(S, Def, &EndPoints);
DidPrune = true;
// Mark value number as unused.
ValueOut->markUnused();
+
+ if (V.Identical && S.Query(OtherDef).valueOut()) {
+ // If V is identical to V.OtherVNI (and S was live at OtherDef),
+ // then we can't simply prune V from S. V needs to be replaced
+ // with V.OtherVNI.
+ LIS->extendToIndices(S, EndPoints);
+ }
continue;
}
// If a subrange ends at the copy, then a value was copied but only
// partially used later. Shrink the subregister range appropriately.
if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) {
- DEBUG(dbgs() << "\t\tDead uses at sublane " << PrintLaneMask(S.LaneMask)
- << " at " << Def << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tDead uses at sublane "
+ << PrintLaneMask(S.LaneMask) << " at " << Def
+ << "\n");
ShrinkMask |= S.LaneMask;
}
}
@@ -2867,7 +3010,7 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
std::prev(S)->end = NewEnd;
}
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n';
if (LI != nullptr)
dbgs() << "\t\t LHS = " << *LI << '\n';
@@ -2885,7 +3028,7 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
ShrinkRegs.push_back(Reg);
}
ErasedInstrs.insert(MI);
- DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
break;
@@ -2940,13 +3083,14 @@ void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(),
NewVNInfo);
- DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tjoined lanes: " << PrintLaneMask(LaneMask)
+ << ' ' << LRange << "\n");
if (EndPoints.empty())
return;
// Recompute the parts of the live range we had to remove because of
// CR_Replace conflicts.
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
dbgs() << EndPoints[i];
@@ -2985,9 +3129,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), LaneBitmask::getNone(),
NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness);
- DEBUG(dbgs() << "\t\tRHS = " << RHS
- << "\n\t\tLHS = " << LHS
- << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tRHS = " << RHS << "\n\t\tLHS = " << LHS << '\n');
// First compute NewVNInfo and the simple value mappings.
// Detect impossible conflicts early.
@@ -3018,8 +3160,8 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
R.LaneMask = Mask;
}
}
- DEBUG(dbgs() << "\t\tLHST = " << printReg(CP.getDstReg())
- << ' ' << LHS << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tLHST = " << printReg(CP.getDstReg()) << ' ' << LHS
+ << '\n');
// Determine lanemasks of RHS in the coalesced register and merge subranges.
unsigned SrcIdx = CP.getSrcIdx();
@@ -3034,7 +3176,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
mergeSubRangeInto(LHS, R, Mask, CP);
}
}
- DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");
+ LLVM_DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");
// Pruning implicit defs from subranges may result in the main range
// having stale segments.
@@ -3072,7 +3214,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
if (!EndPoints.empty()) {
// Recompute the parts of the live range we had to remove because of
// CR_Replace conflicts.
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
dbgs() << EndPoints[i];
@@ -3220,7 +3362,8 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
continue;
// Check that OtherReg interfere with DstReg.
if (LIS->getInterval(OtherReg).overlaps(DstLI)) {
- DEBUG(dbgs() << "Apply terminal rule for: " << printReg(DstReg) << '\n');
+ LLVM_DEBUG(dbgs() << "Apply terminal rule for: " << printReg(DstReg)
+ << '\n');
return true;
}
}
@@ -3229,7 +3372,7 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
void
RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << MBB->getName() << ":\n");
+ LLVM_DEBUG(dbgs() << MBB->getName() << ":\n");
// Collect all copy-like instructions in MBB. Don't start coalescing anything
// yet, it might invalidate the iterator.
@@ -3294,7 +3437,7 @@ void RegisterCoalescer::coalesceLocals() {
}
void RegisterCoalescer::joinAllIntervals() {
- DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
+ LLVM_DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
assert(WorkList.empty() && LocalWorkList.empty() && "Old data still around.");
std::vector<MBBPriorityInfo> MBBs;
@@ -3350,8 +3493,8 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
// splitting optimization.
JoinSplitEdges = EnableJoinSplits;
- DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
- << "********** Function: " << MF->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
+ << "********** Function: " << MF->getName() << '\n');
if (VerifyCoalescing)
MF->verify(this, "Before register coalescing");
@@ -3368,14 +3511,15 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
array_pod_sort(InflateRegs.begin(), InflateRegs.end());
InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
InflateRegs.end());
- DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size() << " regs.\n");
+ LLVM_DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size()
+ << " regs.\n");
for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
unsigned Reg = InflateRegs[i];
if (MRI->reg_nodbg_empty(Reg))
continue;
if (MRI->recomputeRegClass(Reg)) {
- DEBUG(dbgs() << printReg(Reg) << " inflated to "
- << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
+ LLVM_DEBUG(dbgs() << printReg(Reg) << " inflated to "
+ << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
++NumInflated;
LiveInterval &LI = LIS->getInterval(Reg);
@@ -3398,7 +3542,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
}
}
- DEBUG(dump());
+ LLVM_DEBUG(dump());
if (VerifyCoalescing)
MF->verify(this, "After register coalescing");
return true;
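Most of the churn in RegisterCoalescer.cpp above is the mechanical rename of the DEBUG(...) logging macro to LLVM_DEBUG(...). A minimal, self-contained sketch of how the renamed macro is normally used; the DEBUG_TYPE string and the helper function here are illustrative and not part of this patch:

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    #define DEBUG_TYPE "regalloc"

    static void reportCoalesce(unsigned NumCopies) {
      // Compiled out in builds without assertions; at run time the output
      // appears only under -debug or -debug-only=regalloc.
      LLVM_DEBUG(llvm::dbgs() << "coalescing " << NumCopies << " copies\n");
    }

The prefixed name appears intended to avoid clashing with DEBUG macros defined by projects that embed LLVM.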
diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
index 9ac810c7c723..51414de518fd 100644
--- a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Compiler.h"
@@ -587,7 +588,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
for (auto I = Defs.begin(); I != Defs.end(); ) {
LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
Pos.getDeadSlot());
- // If the the def is all that is live after the instruction, then in case
+ // If the def is all that is live after the instruction, then in case
// of a subregister def we need a read-undef flag.
unsigned RegUnit = I->RegUnit;
if (TargetRegisterInfo::isVirtualRegister(RegUnit) &&
@@ -635,7 +636,7 @@ void PressureDiffs::init(unsigned N) {
}
Max = Size;
free(PDiffArray);
- PDiffArray = reinterpret_cast<PressureDiff*>(calloc(N, sizeof(PressureDiff)));
+ PDiffArray = static_cast<PressureDiff*>(safe_calloc(N, sizeof(PressureDiff)));
}
void PressureDiffs::addInstruction(unsigned Idx,
@@ -747,7 +748,7 @@ void RegPressureTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
/// instruction independent of liveness.
void RegPressureTracker::recede(const RegisterOperands &RegOpers,
SmallVectorImpl<RegisterMaskPair> *LiveUses) {
- assert(!CurrPos->isDebugValue());
+ assert(!CurrPos->isDebugInstr());
// Boost pressure for all dead defs together.
bumpDeadDefs(RegOpers.DeadDefs);
@@ -1018,7 +1019,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
/// This is intended for speculative queries. It leaves pressure inconsistent
/// with the current position, so must be restored by the caller.
void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
- assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
+ assert(!MI->isDebugInstr() && "Expect a nondebug instruction.");
SlotIndex SlotIdx;
if (RequireIntervals)
@@ -1259,7 +1260,7 @@ LaneBitmask RegPressureTracker::getLiveThroughAt(unsigned RegUnit,
/// This is intended for speculative queries. It leaves pressure inconsistent
/// with the current position, so must be restored by the caller.
void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
- assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
+ assert(!MI->isDebugInstr() && "Expect a nondebug instruction.");
SlotIndex SlotIdx;
if (RequireIntervals)
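Besides switching the isDebugValue() assertions to isDebugInstr() (presumably so debug labels are skipped along with debug values), this file's diff replaces a bare calloc with safe_calloc when allocating the pressure-diff array. A rough sketch of the difference, assuming llvm::safe_calloc lives in llvm/Support/MemAlloc.h and reports a fatal error instead of returning null:

    #include "llvm/Support/MemAlloc.h"  // assumed header for llvm::safe_calloc
    #include <cstddef>

    struct Diff { int Changes[4]; };

    static Diff *allocateDiffs(std::size_t N) {
      // A plain calloc could return nullptr and be dereferenced later;
      // safe_calloc aborts with a diagnostic at the point of failure.
      return static_cast<Diff *>(llvm::safe_calloc(N, sizeof(Diff)));
    }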
diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
index 97967124add6..a878c34f9aa4 100644
--- a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -111,7 +111,7 @@ void RegScavenger::determineKillsAndDefs() {
assert(Tracking && "Must be tracking to determine kills and defs");
MachineInstr &MI = *MBBI;
- assert(!MI.isDebugValue() && "Debug values have no kills or defs");
+ assert(!MI.isDebugInstr() && "Debug values have no kills or defs");
// Find out which registers are early clobbered, killed, defined, and marked
// def-dead in this instruction.
@@ -158,12 +158,12 @@ void RegScavenger::unprocess() {
assert(Tracking && "Cannot unprocess because we're not tracking");
MachineInstr &MI = *MBBI;
- if (!MI.isDebugValue()) {
+ if (!MI.isDebugInstr()) {
determineKillsAndDefs();
// Commit the changes.
- setUsed(KillRegUnits);
setUnused(DefRegUnits);
+ setUsed(KillRegUnits);
}
if (MBBI == MBB->begin()) {
@@ -195,7 +195,7 @@ void RegScavenger::forward() {
I->Restore = nullptr;
}
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return;
determineKillsAndDefs();
@@ -288,8 +288,8 @@ bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
for (unsigned Reg : *RC) {
if (!isRegUsed(Reg)) {
- DEBUG(dbgs() << "Scavenger found unused reg: " << printReg(Reg, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Scavenger found unused reg: " << printReg(Reg, TRI)
+ << "\n");
return Reg;
}
}
@@ -318,7 +318,7 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
bool inVirtLiveRange = false;
for (++MI; InstrLimit > 0 && MI != ME; ++MI, --InstrLimit) {
- if (MI->isDebugValue()) {
+ if (MI->isDebugInstr()) {
++InstrLimit; // Don't count debug instructions
continue;
}
@@ -561,15 +561,15 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
// If we found an unused register there is no reason to spill it.
if (!isRegUsed(SReg)) {
- DEBUG(dbgs() << "Scavenged register: " << printReg(SReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Scavenged register: " << printReg(SReg, TRI) << "\n");
return SReg;
}
ScavengedInfo &Scavenged = spill(SReg, *RC, SPAdj, I, UseMI);
Scavenged.Restore = &*std::prev(UseMI);
- DEBUG(dbgs() << "Scavenged register (with spill): " << printReg(SReg, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Scavenged register (with spill): "
+ << printReg(SReg, TRI) << "\n");
return SReg;
}
@@ -594,14 +594,15 @@ unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC,
MachineBasicBlock::iterator ReloadAfter =
RestoreAfter ? std::next(MBBI) : MBBI;
MachineBasicBlock::iterator ReloadBefore = std::next(ReloadAfter);
- DEBUG(dbgs() << "Reload before: " << *ReloadBefore << '\n');
+ LLVM_DEBUG(dbgs() << "Reload before: " << *ReloadBefore << '\n');
ScavengedInfo &Scavenged = spill(Reg, RC, SPAdj, SpillBefore, ReloadBefore);
Scavenged.Restore = &*std::prev(SpillBefore);
LiveUnits.removeReg(Reg);
- DEBUG(dbgs() << "Scavenged register with spill: " << printReg(Reg, TRI)
- << " until " << *SpillBefore);
+ LLVM_DEBUG(dbgs() << "Scavenged register with spill: " << printReg(Reg, TRI)
+ << " until " << *SpillBefore);
} else {
- DEBUG(dbgs() << "Scavenged free register: " << printReg(Reg, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Scavenged free register: " << printReg(Reg, TRI)
+ << '\n');
}
return Reg;
}
@@ -757,8 +758,8 @@ void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) {
bool Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
if (Again) {
- DEBUG(dbgs() << "Warning: Required two scavenging passes for block "
- << MBB.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Warning: Required two scavenging passes for block "
+ << MBB.getName() << '\n');
Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
// The target required a 2nd run (because it created new vregs while
// spilling). Refuse to do another pass to keep compiletime in check.
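One non-mechanical change in RegisterScavenging.cpp is the reordering in unprocess(): defs are now cleared before kills are re-marked. The order plausibly matters when a register unit is both killed and redefined by the instruction being undone; a small standalone illustration of why the two set operations do not commute:

    #include <bitset>
    #include <cassert>

    int main() {
      std::bitset<8> Live;   // register units live after the instruction
      std::bitset<8> Kills, Defs;
      Kills.set(3);          // unit 3 is read (killed) by the instruction...
      Defs.set(3);           // ...and redefined by it as well

      // Undo the instruction: clear its defs first, then restore the kills,
      // so unit 3 ends up live again (it was live before the instruction).
      Live &= ~Defs;
      Live |= Kills;
      assert(Live.test(3));  // the reversed order would leave unit 3 dead
      return 0;
    }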
diff --git a/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
index 4e42deb406e1..6a31118cc562 100644
--- a/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
@@ -31,8 +31,6 @@
using namespace llvm;
-#define DEBUG_TYPE "ip-regalloc"
-
static cl::opt<bool> DumpRegUsage(
"print-regusage", cl::init(false), cl::Hidden,
cl::desc("print register usage details collected for analysis."));
@@ -42,7 +40,9 @@ INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info",
char PhysicalRegisterUsageInfo::ID = 0;
-void PhysicalRegisterUsageInfo::anchor() {}
+void PhysicalRegisterUsageInfo::setTargetMachine(const TargetMachine &TM) {
+ this->TM = &TM;
+}
bool PhysicalRegisterUsageInfo::doInitialization(Module &M) {
RegMasks.grow(M.size());
@@ -58,22 +58,19 @@ bool PhysicalRegisterUsageInfo::doFinalization(Module &M) {
}
void PhysicalRegisterUsageInfo::storeUpdateRegUsageInfo(
- const Function *FP, std::vector<uint32_t> RegMask) {
- assert(FP != nullptr && "Function * can't be nullptr.");
- RegMasks[FP] = std::move(RegMask);
+ const Function &FP, ArrayRef<uint32_t> RegMask) {
+ RegMasks[&FP] = RegMask;
}
-const std::vector<uint32_t> *
-PhysicalRegisterUsageInfo::getRegUsageInfo(const Function *FP) {
- auto It = RegMasks.find(FP);
+ArrayRef<uint32_t>
+PhysicalRegisterUsageInfo::getRegUsageInfo(const Function &FP) {
+ auto It = RegMasks.find(&FP);
if (It != RegMasks.end())
- return &(It->second);
- return nullptr;
+ return makeArrayRef<uint32_t>(It->second);
+ return ArrayRef<uint32_t>();
}
void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const {
- const TargetRegisterInfo *TRI;
-
using FuncPtrRegMaskPair = std::pair<const Function *, std::vector<uint32_t>>;
SmallVector<const FuncPtrRegMaskPair *, 64> FPRMPairVector;
@@ -83,7 +80,7 @@ void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const {
FPRMPairVector.push_back(&RegMask);
// sort the vector to print analysis in alphabatic order of function name.
- std::sort(
+ llvm::sort(
FPRMPairVector.begin(), FPRMPairVector.end(),
[](const FuncPtrRegMaskPair *A, const FuncPtrRegMaskPair *B) -> bool {
return A->first->getName() < B->first->getName();
@@ -92,8 +89,9 @@ void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const {
for (const FuncPtrRegMaskPair *FPRMPair : FPRMPairVector) {
OS << FPRMPair->first->getName() << " "
<< "Clobbered Registers: ";
- TRI = TM->getSubtarget<TargetSubtargetInfo>(*(FPRMPair->first))
- .getRegisterInfo();
+ const TargetRegisterInfo *TRI
+ = TM->getSubtarget<TargetSubtargetInfo>(*(FPRMPair->first))
+ .getRegisterInfo();
for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
if (MachineOperand::clobbersPhysReg(&(FPRMPair->second[0]), PReg))
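The interface change in RegisterUsageInfo is worth noting: getRegUsageInfo() now returns an ArrayRef<uint32_t> that is empty on a miss, rather than a pointer to the internal vector that is null on a miss. A simplified, hypothetical holder showing the same pattern (RegMaskTable and lookup() are invented for illustration):

    #include "llvm/ADT/ArrayRef.h"
    #include <cstdint>
    #include <map>
    #include <vector>

    struct RegMaskTable {
      std::map<unsigned, std::vector<uint32_t>> Masks;

      llvm::ArrayRef<uint32_t> lookup(unsigned Key) const {
        auto It = Masks.find(Key);
        if (It != Masks.end())
          return llvm::makeArrayRef(It->second);  // non-owning view
        return {};                                // empty view instead of nullptr
      }
    };

    static bool hasInfo(const RegMaskTable &T, unsigned Key) {
      return !T.lookup(Key).empty();  // callers test for emptiness, not null
    }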
diff --git a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
index 1e1f36a35ecc..156d1c81c238 100644
--- a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
+++ b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -77,20 +77,20 @@ private:
/// Split unrelated subregister components and rename them to new vregs.
bool renameComponents(LiveInterval &LI) const;
- /// \brief Build a vector of SubRange infos and a union find set of
+ /// Build a vector of SubRange infos and a union find set of
/// equivalence classes.
/// Returns true if more than 1 equivalence class was found.
bool findComponents(IntEqClasses &Classes,
SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
LiveInterval &LI) const;
- /// \brief Distribute the LiveInterval segments into the new LiveIntervals
+ /// Distribute the LiveInterval segments into the new LiveIntervals
/// belonging to their class.
void distribute(const IntEqClasses &Classes,
const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
const SmallVectorImpl<LiveInterval*> &Intervals) const;
- /// \brief Constructs main liverange and add missing undef+dead flags.
+ /// Constructs main liverange and add missing undef+dead flags.
void computeMainRangesFixFlags(const IntEqClasses &Classes,
const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
const SmallVectorImpl<LiveInterval*> &Intervals) const;
@@ -134,17 +134,17 @@ bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const {
const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
SmallVector<LiveInterval*, 4> Intervals;
Intervals.push_back(&LI);
- DEBUG(dbgs() << printReg(Reg) << ": Found " << Classes.getNumClasses()
- << " equivalence classes.\n");
- DEBUG(dbgs() << printReg(Reg) << ": Splitting into newly created:");
+ LLVM_DEBUG(dbgs() << printReg(Reg) << ": Found " << Classes.getNumClasses()
+ << " equivalence classes.\n");
+ LLVM_DEBUG(dbgs() << printReg(Reg) << ": Splitting into newly created:");
for (unsigned I = 1, NumClasses = Classes.getNumClasses(); I < NumClasses;
++I) {
unsigned NewVReg = MRI->createVirtualRegister(RegClass);
LiveInterval &NewLI = LIS->createEmptyInterval(NewVReg);
Intervals.push_back(&NewLI);
- DEBUG(dbgs() << ' ' << printReg(NewVReg));
+ LLVM_DEBUG(dbgs() << ' ' << printReg(NewVReg));
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
rewriteOperands(Classes, SubRangeInfos, Intervals);
distribute(Classes, SubRangeInfos, Intervals);
@@ -219,7 +219,8 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes,
if (!MO.isDef() && !MO.readsReg())
continue;
- SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ auto *MI = MO.getParent();
+ SlotIndex Pos = LIS->getInstructionIndex(*MI);
Pos = MO.isDef() ? Pos.getRegSlot(MO.isEarlyClobber())
: Pos.getBaseIndex();
unsigned SubRegIdx = MO.getSubReg();
@@ -245,11 +246,14 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes,
MO.setReg(VReg);
if (MO.isTied() && Reg != VReg) {
- /// Undef use operands are not tracked in the equivalence class but need
- /// to be update if they are tied.
- MO.getParent()->substituteRegister(Reg, VReg, 0, TRI);
-
- // substituteRegister breaks the iterator, so restart.
+ /// Undef use operands are not tracked in the equivalence class,
+ /// but need to be updated if they are tied; take care to only
+ /// update the tied operand.
+ unsigned OperandNo = MI->getOperandNo(&MO);
+ unsigned TiedIdx = MI->findTiedOperandIdx(OperandNo);
+ MI->getOperand(TiedIdx).setReg(VReg);
+
+ // The above substitution breaks the iterator, so restart.
I = MRI->reg_nodbg_begin(Reg);
}
}
@@ -376,8 +380,8 @@ bool RenameIndependentSubregs::runOnMachineFunction(MachineFunction &MF) {
if (!MRI->subRegLivenessEnabled())
return false;
- DEBUG(dbgs() << "Renaming independent subregister live ranges in "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Renaming independent subregister live ranges in "
+ << MF.getName() << '\n');
LIS = &getAnalysis<LiveIntervals>();
TII = MF.getSubtarget().getInstrInfo();
diff --git a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
index f1885aa74285..a02302e6ff99 100644
--- a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
+++ b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
@@ -13,9 +13,12 @@
/// happen is that the MachineFunction has the FailedISel property.
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/Debug.h"
@@ -42,12 +45,23 @@ namespace {
StringRef getPassName() const override { return "ResetMachineFunction"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<StackProtector>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
bool runOnMachineFunction(MachineFunction &MF) override {
+ // No matter what happened, whether we successfully selected the function
+ // or not, nothing is going to use the vreg types after us. Make sure they
+ // disappear.
+ auto ClearVRegTypesOnReturn =
+ make_scope_exit([&MF]() { MF.getRegInfo().clearVirtRegTypes(); });
+
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel)) {
if (AbortOnFailedISel)
report_fatal_error("Instruction selection failed");
- DEBUG(dbgs() << "Reseting: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Resetting: " << MF.getName() << '\n');
++NumFunctionsReset;
MF.reset();
if (EmitFallbackDiag) {
@@ -65,7 +79,7 @@ namespace {
char ResetMachineFunction::ID = 0;
INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE,
- "reset machine function if ISel failed", false, false)
+ "Reset machine function if ISel failed", false, false)
MachineFunctionPass *
llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false,
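The functional addition in ResetMachineFunctionPass is the make_scope_exit guard that clears virtual-register types on every exit from runOnMachineFunction. A minimal sketch of the idiom; the function and the message are illustrative only:

    #include "llvm/ADT/ScopeExit.h"
    #include "llvm/Support/raw_ostream.h"

    static bool doWork(bool Fail) {
      // The lambda runs when Cleanup is destroyed, on every return path.
      auto Cleanup = llvm::make_scope_exit([] {
        llvm::errs() << "cleanup ran\n";
      });
      if (Fail)
        return false;  // the guard still fires here
      return true;     // ...and here
    }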
diff --git a/contrib/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm/lib/CodeGen/SafeStack.cpp
index 51233be521be..cbbbf7c385aa 100644
--- a/contrib/llvm/lib/CodeGen/SafeStack.cpp
+++ b/contrib/llvm/lib/CodeGen/SafeStack.cpp
@@ -24,10 +24,12 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -61,7 +63,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -88,6 +90,13 @@ STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
} // namespace llvm
+/// Use __safestack_pointer_address even if the platform has a faster way of
+/// accessing the safe stack pointer.
+static cl::opt<bool>
+ SafeStackUsePointerAddress("safestack-use-pointer-address",
+ cl::init(false), cl::Hidden);
+
+
namespace {
/// Rewrite an SCEV expression for a memory access address to an expression that
@@ -134,14 +143,14 @@ class SafeStack {
/// might expect to appear on the stack on most common targets.
enum { StackAlignment = 16 };
- /// \brief Return the value of the stack canary.
+ /// Return the value of the stack canary.
Value *getStackGuard(IRBuilder<> &IRB, Function &F);
- /// \brief Load stack guard from the frame and check if it has changed.
+ /// Load stack guard from the frame and check if it has changed.
void checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
AllocaInst *StackGuardSlot, Value *StackGuard);
- /// \brief Find all static allocas, dynamic allocas, return instructions and
+ /// Find all static allocas, dynamic allocas, return instructions and
/// stack restore points (exception unwind blocks and setjmp calls) in the
/// given function and append them to the respective vectors.
void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas,
@@ -150,11 +159,11 @@ class SafeStack {
SmallVectorImpl<ReturnInst *> &Returns,
SmallVectorImpl<Instruction *> &StackRestorePoints);
- /// \brief Calculate the allocation size of a given alloca. Returns 0 if the
+ /// Calculate the allocation size of a given alloca. Returns 0 if the
/// size can not be statically determined.
uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI);
- /// \brief Allocate space for all static allocas in \p StaticAllocas,
+ /// Allocate space for all static allocas in \p StaticAllocas,
/// replace allocas with pointers into the unsafe stack and generate code to
/// restore the stack pointer before all return instructions in \p Returns.
///
@@ -167,7 +176,7 @@ class SafeStack {
Instruction *BasePointer,
AllocaInst *StackGuardSlot);
- /// \brief Generate code to restore the stack after all stack restore points
+ /// Generate code to restore the stack after all stack restore points
/// in \p StackRestorePoints.
///
/// \returns A local variable in which to maintain the dynamic top of the
@@ -177,7 +186,7 @@ class SafeStack {
ArrayRef<Instruction *> StackRestorePoints,
Value *StaticTop, bool NeedDynamicTop);
- /// \brief Replace all allocas in \p DynamicAllocas with code to allocate
+ /// Replace all allocas in \p DynamicAllocas with code to allocate
/// space dynamically on the unsafe stack and store the dynamic unsafe stack
/// top to \p DynamicTop if non-null.
void moveDynamicAllocasToUnsafeStack(Function &F, Value *UnsafeStackPtr,
@@ -191,6 +200,9 @@ class SafeStack {
bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr,
uint64_t AllocaSize);
+ bool ShouldInlinePointerAddress(CallSite &CS);
+ void TryInlinePointerAddress();
+
public:
SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL,
ScalarEvolution &SE)
@@ -230,16 +242,17 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize));
bool Safe = AllocaRange.contains(AccessRange);
- DEBUG(dbgs() << "[SafeStack] "
- << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ")
- << *AllocaPtr << "\n"
- << " Access " << *Addr << "\n"
- << " SCEV " << *Expr
- << " U: " << SE.getUnsignedRange(Expr)
- << ", S: " << SE.getSignedRange(Expr) << "\n"
- << " Range " << AccessRange << "\n"
- << " AllocaRange " << AllocaRange << "\n"
- << " " << (Safe ? "safe" : "unsafe") << "\n");
+ LLVM_DEBUG(
+ dbgs() << "[SafeStack] "
+ << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ")
+ << *AllocaPtr << "\n"
+ << " Access " << *Addr << "\n"
+ << " SCEV " << *Expr
+ << " U: " << SE.getUnsignedRange(Expr)
+ << ", S: " << SE.getSignedRange(Expr) << "\n"
+ << " Range " << AccessRange << "\n"
+ << " AllocaRange " << AllocaRange << "\n"
+ << " " << (Safe ? "safe" : "unsafe") << "\n");
return Safe;
}
@@ -286,8 +299,9 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
case Instruction::Store:
if (V == I->getOperand(0)) {
// Stored the pointer - conservatively assume it may be unsafe.
- DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
- << "\n store of address: " << *I << "\n");
+ LLVM_DEBUG(dbgs()
+ << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n store of address: " << *I << "\n");
return false;
}
@@ -312,9 +326,9 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) {
- DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
- << "\n unsafe memintrinsic: " << *I
- << "\n");
+ LLVM_DEBUG(dbgs()
+ << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n unsafe memintrinsic: " << *I << "\n");
return false;
}
continue;
@@ -332,8 +346,8 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
if (A->get() == V)
if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) ||
CS.doesNotAccessMemory()))) {
- DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
- << "\n unsafe call: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n unsafe call: " << *I << "\n");
return false;
}
continue;
@@ -545,6 +559,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
for (Argument *Arg : ByValArguments) {
unsigned Offset = SSL.getObjectOffset(Arg);
+ unsigned Align = SSL.getObjectAlignment(Arg);
Type *Ty = Arg->getType()->getPointerElementType();
uint64_t Size = DL.getTypeStoreSize(Ty);
@@ -561,7 +576,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
DIExpression::NoDeref, -Offset, DIExpression::NoDeref);
Arg->replaceAllUsesWith(NewArg);
IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
- IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment());
+ IRB.CreateMemCpy(Off, Align, Arg, Arg->getParamAlignment(), Size);
}
// Allocate space for every unsafe static AllocaInst on the unsafe stack.
@@ -695,6 +710,35 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
}
}
+bool SafeStack::ShouldInlinePointerAddress(CallSite &CS) {
+ Function *Callee = CS.getCalledFunction();
+ if (CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee))
+ return true;
+ if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
+ CS.isNoInline())
+ return false;
+ return true;
+}
+
+void SafeStack::TryInlinePointerAddress() {
+ if (!isa<CallInst>(UnsafeStackPtr))
+ return;
+
+ if(F.hasFnAttribute(Attribute::OptimizeNone))
+ return;
+
+ CallSite CS(UnsafeStackPtr);
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ return;
+
+ if (!ShouldInlinePointerAddress(CS))
+ return;
+
+ InlineFunctionInfo IFI;
+ InlineFunction(CS, IFI);
+}
+
bool SafeStack::run() {
assert(F.hasFnAttribute(Attribute::SafeStack) &&
"Can't run SafeStack on a function without the attribute");
@@ -731,7 +775,13 @@ bool SafeStack::run() {
++NumUnsafeStackRestorePointsFunctions;
IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
- UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
+ if (SafeStackUsePointerAddress) {
+ Value *Fn = F.getParent()->getOrInsertFunction(
+ "__safestack_pointer_address", StackPtrTy->getPointerTo(0));
+ UnsafeStackPtr = IRB.CreateCall(Fn);
+ } else {
+ UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
+ }
// Load the current stack pointer (we'll also use it as a base pointer).
// FIXME: use a dedicated register for it ?
@@ -779,7 +829,9 @@ bool SafeStack::run() {
IRB.CreateStore(BasePointer, UnsafeStackPtr);
}
- DEBUG(dbgs() << "[SafeStack] safestack applied\n");
+ TryInlinePointerAddress();
+
+ LLVM_DEBUG(dbgs() << "[SafeStack] safestack applied\n");
return true;
}
@@ -800,17 +852,17 @@ public:
}
bool runOnFunction(Function &F) override {
- DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
if (!F.hasFnAttribute(Attribute::SafeStack)) {
- DEBUG(dbgs() << "[SafeStack] safestack is not requested"
- " for this function\n");
+ LLVM_DEBUG(dbgs() << "[SafeStack] safestack is not requested"
+ " for this function\n");
return false;
}
if (F.isDeclaration()) {
- DEBUG(dbgs() << "[SafeStack] function definition"
- " is not available\n");
+ LLVM_DEBUG(dbgs() << "[SafeStack] function definition"
+ " is not available\n");
return false;
}
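The new behaviour in SafeStack.cpp hangs off a hidden cl::opt flag (-safestack-use-pointer-address) that routes the unsafe-stack pointer lookup through a call to __safestack_pointer_address, which the pass may then try to inline. For reference, a minimal hidden boolean option in the same style; the flag name below is invented and not defined by this patch:

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<bool> UseSlowPath(
        "example-use-slow-path", llvm::cl::init(false), llvm::cl::Hidden,
        llvm::cl::desc("Illustrative flag only; not part of the patch"));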
diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
index 072e6e090e1e..329458778a98 100644
--- a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
+++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instruction.h"
@@ -101,10 +102,10 @@ void StackColoring::collectMarkers() {
// For each basic block, compute
// * the list of markers in the instruction order
// * the sets of allocas whose lifetime starts or ends in this BB
- DEBUG(dbgs() << "Instructions:\n");
+ LLVM_DEBUG(dbgs() << "Instructions:\n");
unsigned InstNo = 0;
for (BasicBlock *BB : depth_first(&F)) {
- DEBUG(dbgs() << " " << InstNo << ": BB " << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << " " << InstNo << ": BB " << BB->getName() << "\n");
unsigned BBStart = InstNo++;
BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
@@ -121,9 +122,9 @@ void StackColoring::collectMarkers() {
}
auto ProcessMarker = [&](Instruction *I, const Marker &M) {
- DEBUG(dbgs() << " " << InstNo << ": "
- << (M.IsStart ? "start " : "end ") << M.AllocaNo << ", "
- << *I << "\n");
+ LLVM_DEBUG(dbgs() << " " << InstNo << ": "
+ << (M.IsStart ? "start " : "end ") << M.AllocaNo
+ << ", " << *I << "\n");
BBMarkers[BB].push_back({InstNo, M});
@@ -280,7 +281,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {
#endif
void StackColoring::run() {
- DEBUG(dumpAllocas());
+ LLVM_DEBUG(dumpAllocas());
for (unsigned I = 0; I < NumAllocas; ++I)
AllocaNumbering[Allocas[I]] = I;
@@ -303,7 +304,7 @@ void StackColoring::run() {
LiveRanges[I] = getFullLiveRange();
calculateLocalLiveness();
- DEBUG(dumpBlockLiveness());
+ LLVM_DEBUG(dumpBlockLiveness());
calculateLiveIntervals();
- DEBUG(dumpLiveRanges());
+ LLVM_DEBUG(dumpLiveRanges());
}
diff --git a/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp
index b1759359e46f..07b6a5d1883b 100644
--- a/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp
+++ b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp
@@ -42,6 +42,7 @@ LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) {
void StackLayout::addObject(const Value *V, unsigned Size, unsigned Alignment,
const StackColoring::LiveRange &Range) {
StackObjects.push_back({V, Size, Alignment, Range});
+ ObjectAlignments[V] = Alignment;
MaxAlignment = std::max(MaxAlignment, Alignment);
}
@@ -62,30 +63,30 @@ void StackLayout::layoutObject(StackObject &Obj) {
return;
}
- DEBUG(dbgs() << "Layout: size " << Obj.Size << ", align " << Obj.Alignment
- << ", range " << Obj.Range << "\n");
+ LLVM_DEBUG(dbgs() << "Layout: size " << Obj.Size << ", align "
+ << Obj.Alignment << ", range " << Obj.Range << "\n");
assert(Obj.Alignment <= MaxAlignment);
unsigned Start = AdjustStackOffset(0, Obj.Size, Obj.Alignment);
unsigned End = Start + Obj.Size;
- DEBUG(dbgs() << " First candidate: " << Start << " .. " << End << "\n");
+ LLVM_DEBUG(dbgs() << " First candidate: " << Start << " .. " << End << "\n");
for (const StackRegion &R : Regions) {
- DEBUG(dbgs() << " Examining region: " << R.Start << " .. " << R.End
- << ", range " << R.Range << "\n");
+ LLVM_DEBUG(dbgs() << " Examining region: " << R.Start << " .. " << R.End
+ << ", range " << R.Range << "\n");
assert(End >= R.Start);
if (Start >= R.End) {
- DEBUG(dbgs() << " Does not intersect, skip.\n");
+ LLVM_DEBUG(dbgs() << " Does not intersect, skip.\n");
continue;
}
if (Obj.Range.Overlaps(R.Range)) {
// Find the next appropriate location.
Start = AdjustStackOffset(R.End, Obj.Size, Obj.Alignment);
End = Start + Obj.Size;
- DEBUG(dbgs() << " Overlaps. Next candidate: " << Start << " .. " << End
- << "\n");
+ LLVM_DEBUG(dbgs() << " Overlaps. Next candidate: " << Start << " .. "
+ << End << "\n");
continue;
}
if (End <= R.End) {
- DEBUG(dbgs() << " Reusing region(s).\n");
+ LLVM_DEBUG(dbgs() << " Reusing region(s).\n");
break;
}
}
@@ -94,13 +95,13 @@ void StackLayout::layoutObject(StackObject &Obj) {
if (End > LastRegionEnd) {
// Insert a new region at the end. Maybe two.
if (Start > LastRegionEnd) {
- DEBUG(dbgs() << " Creating gap region: " << LastRegionEnd << " .. "
- << Start << "\n");
+ LLVM_DEBUG(dbgs() << " Creating gap region: " << LastRegionEnd << " .. "
+ << Start << "\n");
Regions.emplace_back(LastRegionEnd, Start, StackColoring::LiveRange());
LastRegionEnd = Start;
}
- DEBUG(dbgs() << " Creating new region: " << LastRegionEnd << " .. " << End
- << ", range " << Obj.Range << "\n");
+ LLVM_DEBUG(dbgs() << " Creating new region: " << LastRegionEnd << " .. "
+ << End << ", range " << Obj.Range << "\n");
Regions.emplace_back(LastRegionEnd, End, Obj.Range);
LastRegionEnd = End;
}
@@ -149,5 +150,5 @@ void StackLayout::computeLayout() {
for (auto &Obj : StackObjects)
layoutObject(Obj);
- DEBUG(print(dbgs()));
+ LLVM_DEBUG(print(dbgs()));
}
diff --git a/contrib/llvm/lib/CodeGen/SafeStackLayout.h b/contrib/llvm/lib/CodeGen/SafeStackLayout.h
index 7c1292f251f7..ac531d800f6e 100644
--- a/contrib/llvm/lib/CodeGen/SafeStackLayout.h
+++ b/contrib/llvm/lib/CodeGen/SafeStackLayout.h
@@ -47,6 +47,7 @@ class StackLayout {
SmallVector<StackObject, 8> StackObjects;
DenseMap<const Value *, unsigned> ObjectOffsets;
+ DenseMap<const Value *, unsigned> ObjectAlignments;
void layoutObject(StackObject &Obj);
@@ -64,6 +65,9 @@ public:
/// Returns the offset to the object start in the stack frame.
unsigned getObjectOffset(const Value *V) { return ObjectOffsets[V]; }
+ /// Returns the alignment of the object.
+ unsigned getObjectAlignment(const Value *V) { return ObjectAlignments[V]; }
+
/// Returns the size of the entire frame.
unsigned getFrameSize() { return Regions.empty() ? 0 : Regions.back().End; }
diff --git a/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
index cef413f9d410..9387722bfebd 100644
--- a/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -586,9 +586,6 @@ static void scalarizeMaskedScatter(CallInst *CI) {
}
bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
bool EverMadeChange = false;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
index 0635e8f41ee7..46064012d9d8 100644
--- a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 9249fa84b38b..d1c5ddabb975 100644
--- a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -38,6 +38,7 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
@@ -118,7 +119,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
DbgValues.clear();
const TargetSubtargetInfo &ST = mf.getSubtarget();
- SchedModel.init(ST.getSchedModel(), &ST, TII);
+ SchedModel.init(&ST);
}
/// If this machine instr has memory reference information and it can be
@@ -266,7 +267,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
}
}
-/// \brief Adds register dependencies (data, anti, and output) from this SUnit
+/// Adds register dependencies (data, anti, and output) from this SUnit
/// to following instructions in the same scheduling region that depend on the
/// physical register referenced at OperIdx.
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
@@ -317,13 +318,14 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
} else {
addPhysRegDataDeps(SU, OperIdx);
- // clear this register's use list
- if (Uses.contains(Reg))
- Uses.eraseAll(Reg);
-
- if (!MO.isDead()) {
- Defs.eraseAll(Reg);
- } else if (SU->isCall) {
+ // Clear previous uses and defs of this register and its subregisters.
+ for (MCSubRegIterator SubReg(Reg, TRI, true); SubReg.isValid(); ++SubReg) {
+ if (Uses.contains(*SubReg))
+ Uses.eraseAll(*SubReg);
+ if (!MO.isDead())
+ Defs.eraseAll(*SubReg);
+ }
+ if (MO.isDead() && SU->isCall) {
// Calls will not be reordered because of chain dependencies (see
// below). Since call operands are dead, calls may continue to be added
// to the DefList making dependence checking quadratic in the size of
@@ -468,7 +470,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
}
-/// \brief Adds a register data dependency if the instruction that defines the
+/// Adds a register data dependency if the instruction that defines the
/// virtual register used at OperIdx is mapped to an SUnit. Add a register
/// antidependency from this SUnit to instructions that occur later in the same
/// scheduling region if they write the virtual register.
@@ -514,7 +516,7 @@ void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
}
}
-/// \brief Creates an SUnit for each real instruction, numbered in top-down
+/// Creates an SUnit for each real instruction, numbered in top-down
/// topological order. The instruction order A < B implies that no edge exists
/// from B to A.
///
@@ -532,7 +534,7 @@ void ScheduleDAGInstrs::initSUnits() {
SUnits.reserve(NumRegionInstrs);
for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
SUnit *SU = newSUnit(&MI);
@@ -763,6 +765,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
DbgMI = &MI;
continue;
}
+ if (MI.isDebugLabel())
+ continue;
+
SUnit *SU = MISUnitMap[&MI];
assert(SU && "No SUnit mapped to this MI");
@@ -845,8 +850,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
BarrierChain->addPredBarrier(SU);
BarrierChain = SU;
- DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
- << BarrierChain->NodeNum << ").\n";);
+ LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
// Add dependencies against everything below it and clear maps.
addBarrierChain(Stores);
@@ -934,11 +939,12 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// Reduce maps if they grow huge.
if (Stores.size() + Loads.size() >= HugeRegion) {
- DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
+ LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
}
if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
- DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
+ LLVM_DEBUG(
+ dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
}
}
@@ -978,10 +984,8 @@ void ScheduleDAGInstrs::Value2SUsMap::dump() {
void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
Value2SUsMap &loads, unsigned N) {
- DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
- stores.dump();
- dbgs() << "Loading SUnits:\n";
- loads.dump());
+ LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump();
+ dbgs() << "Loading SUnits:\n"; loads.dump());
// Insert all SU's NodeNums into a vector and sort it.
std::vector<unsigned> NodeNums;
@@ -992,7 +996,7 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
for (auto &I : loads)
for (auto *SU : I.second)
NodeNums.push_back(SU->NodeNum);
- std::sort(NodeNums.begin(), NodeNums.end());
+ llvm::sort(NodeNums.begin(), NodeNums.end());
// The N last elements in NodeNums will be removed, and the SU with
// the lowest NodeNum of them will become the new BarrierChain to
@@ -1007,12 +1011,12 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
BarrierChain->addPredBarrier(newBarrierChain);
BarrierChain = newBarrierChain;
- DEBUG(dbgs() << "Inserting new barrier chain: SU("
- << BarrierChain->NodeNum << ").\n";);
+ LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
}
else
- DEBUG(dbgs() << "Keeping old barrier chain: SU("
- << BarrierChain->NodeNum << ").\n";);
+ LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
}
else
BarrierChain = newBarrierChain;
@@ -1020,10 +1024,8 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
insertBarrierChain(stores);
insertBarrierChain(loads);
- DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n";
- stores.dump();
- dbgs() << "Loading SUnits:\n";
- loads.dump());
+ LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump();
+ dbgs() << "Loading SUnits:\n"; loads.dump());
}
static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
@@ -1044,14 +1046,14 @@ static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
}
void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
- DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');
+ LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');
LiveRegs.init(*TRI);
LiveRegs.addLiveOuts(MBB);
// Examine block from end to start...
for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
// Update liveness. Registers that are defed but not used in this
@@ -1087,7 +1089,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
while (I->isBundledWithSucc())
++I;
do {
- if (!I->isDebugValue())
+ if (!I->isDebugInstr())
toggleKills(MRI, LiveRegs, *I, true);
--I;
} while(I != First);
@@ -1212,7 +1214,7 @@ public:
RootSet[SU->NodeNum] = RData;
}
- /// \brief Called once for each tree edge after calling visitPostOrderNode on
+ /// Called once for each tree edge after calling visitPostOrderNode on
/// the predecessor. Increment the parent node's instruction count and
/// preemptively join this subtree to its parent's if it is small enough.
void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
@@ -1245,11 +1247,11 @@ public:
}
R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
- DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
+ LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
- DEBUG(dbgs() << " SU(" << Idx << ") in tree "
- << R.DFSNodeData[Idx].SubtreeID << '\n');
+ LLVM_DEBUG(dbgs() << " SU(" << Idx << ") in tree "
+ << R.DFSNodeData[Idx].SubtreeID << '\n');
}
for (const std::pair<const SUnit*, const SUnit*> &P : ConnectionPairs) {
unsigned PredTree = SubtreeClasses[P.first->NodeNum];
@@ -1404,8 +1406,8 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
for (const Connection &C : SubtreeConnections[SubtreeID]) {
SubtreeConnectLevels[C.TreeID] =
std::max(SubtreeConnectLevels[C.TreeID], C.Level);
- DEBUG(dbgs() << " Tree: " << C.TreeID
- << " @" << SubtreeConnectLevels[C.TreeID] << '\n');
+ LLVM_DEBUG(dbgs() << " Tree: " << C.TreeID << " @"
+ << SubtreeConnectLevels[C.TreeID] << '\n');
}
}
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index 37c4a470bd0a..ff2085aae865 100644
--- a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -61,7 +61,7 @@ namespace llvm {
}
- std::string getNodeLabel(const SUnit *Node, const ScheduleDAG *Graph);
+ std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
static std::string getNodeAttributes(const SUnit *N,
const ScheduleDAG *Graph) {
return "shape=Mrecord";
diff --git a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index b789e2d9c52c..b8bfe69a76e1 100644
--- a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/Compiler.h"
@@ -68,12 +69,12 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
// If MaxLookAhead is not set above, then we are not enabled.
if (!isEnabled())
- DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
+ LLVM_DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
else {
// A nonempty itinerary must have a SchedModel.
IssueWidth = ItinData->SchedModel.IssueWidth;
- DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
- << ScoreboardDepth << '\n');
+ LLVM_DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
+ << ScoreboardDepth << '\n');
}
}
@@ -155,9 +156,9 @@ ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
}
if (!freeUnits) {
- DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", ");
- DEBUG(dbgs() << "SU(" << SU->NodeNum << "): ");
- DEBUG(DAG->dumpNode(SU));
+ LLVM_DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", ");
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << "): ");
+ LLVM_DEBUG(DAG->dumpNode(SU));
return Hazard;
}
}
@@ -223,8 +224,8 @@ void ScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
cycle += IS->getNextCycles();
}
- DEBUG(ReservedScoreboard.dump());
- DEBUG(RequiredScoreboard.dump());
+ LLVM_DEBUG(ReservedScoreboard.dump());
+ LLVM_DEBUG(RequiredScoreboard.dump());
}
void ScoreboardHazardRecognizer::AdvanceCycle() {
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 03cb2e310c7e..7a99687757f8 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -36,7 +36,6 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
@@ -60,6 +59,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -122,7 +122,7 @@ namespace {
bool LegalTypes = false;
bool ForCodeSize;
- /// \brief Worklist of all of the nodes that need to be simplified.
+ /// Worklist of all of the nodes that need to be simplified.
///
/// This must behave as a stack -- new nodes to process are pushed onto the
/// back and when processing we pop off of the back.
@@ -131,14 +131,14 @@ namespace {
/// due to nodes being deleted from the underlying DAG.
SmallVector<SDNode *, 64> Worklist;
- /// \brief Mapping from an SDNode to its position on the worklist.
+ /// Mapping from an SDNode to its position on the worklist.
///
/// This is used to find and remove nodes from the worklist (by nulling
/// them) when they are deleted from the underlying DAG. It relies on
/// stable indices of nodes within the worklist.
DenseMap<SDNode *, unsigned> WorklistMap;
- /// \brief Set of nodes which have been combined (at least once).
+ /// Set of nodes which have been combined (at least once).
///
/// This is used to allow us to reliably add any operands of a DAG node
/// which have not yet been combined to the worklist.
@@ -232,14 +232,25 @@ namespace {
return SimplifyDemandedBits(Op, Demanded);
}
+ /// Check the specified vector node value to see if it can be simplified or
+ /// if things it uses can be simplified as it only uses some of the
+ /// elements. If so, return true.
+ bool SimplifyDemandedVectorElts(SDValue Op) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ APInt Demanded = APInt::getAllOnesValue(NumElts);
+ return SimplifyDemandedVectorElts(Op, Demanded);
+ }
+
bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded,
+ bool AssumeSingleUse = false);
bool CombineToPreIndexedLoadStore(SDNode *N);
bool CombineToPostIndexedLoadStore(SDNode *N);
SDValue SplitIndexingFromLoad(LoadSDNode *LD);
bool SliceUpLoad(SDNode *N);
- /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
+ /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
/// load.
///
/// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
@@ -258,10 +269,6 @@ namespace {
SDValue PromoteExtend(SDValue Op);
bool PromoteLoad(SDValue Op);
- void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, SDValue Trunc,
- SDValue ExtLoad, const SDLoc &DL,
- ISD::NodeType ExtType);
-
/// Call the node-specific routine that knows how to fold each
/// particular type of node. If that doesn't do anything, try the
/// target-specific DAG combines.
@@ -292,7 +299,9 @@ namespace {
SDValue visitMUL(SDNode *N);
SDValue useDivRem(SDNode *N);
SDValue visitSDIV(SDNode *N);
+ SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitUDIV(SDNode *N);
+ SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitREM(SDNode *N);
SDValue visitMULHU(SDNode *N);
SDValue visitMULHS(SDNode *N);
@@ -302,9 +311,9 @@ namespace {
SDValue visitUMULO(SDNode *N);
SDValue visitIMINMAX(SDNode *N);
SDValue visitAND(SDNode *N);
- SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference);
+ SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitOR(SDNode *N);
- SDValue visitORLike(SDValue N0, SDValue N1, SDNode *LocReference);
+ SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitXOR(SDNode *N);
SDValue SimplifyVBinOp(SDNode *N);
SDValue visitSHL(SDNode *N);
@@ -323,7 +332,6 @@ namespace {
SDValue visitVSELECT(SDNode *N);
SDValue visitSELECT_CC(SDNode *N);
SDValue visitSETCC(SDNode *N);
- SDValue visitSETCCE(SDNode *N);
SDValue visitSETCCCARRY(SDNode *N);
SDValue visitSIGN_EXTEND(SDNode *N);
SDValue visitZERO_EXTEND(SDNode *N);
@@ -385,8 +393,8 @@ namespace {
SDValue visitFMULForFMADistributiveCombine(SDNode *N);
SDValue XformToShuffleWithZero(SDNode *N);
- SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue LHS,
- SDValue RHS);
+ SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+ SDValue N1);
SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
@@ -403,8 +411,11 @@ namespace {
SDValue N2, SDValue N3, ISD::CondCode CC);
SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL);
+ SDValue unfoldMaskedMerge(SDNode *N);
+ SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
- const SDLoc &DL, bool foldBooleans = true);
+ const SDLoc &DL, bool foldBooleans);
+ SDValue rebuildSetCC(SDValue N);
bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
SDValue &CC) const;
@@ -414,20 +425,21 @@ namespace {
unsigned HiOp);
SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
SDValue CombineExtLoad(SDNode *N);
+ SDValue CombineZExtLogicopShiftLoad(SDNode *N);
SDValue combineRepeatedFPDivisors(SDNode *N);
SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
- SDValue BuildLogBase2(SDValue Op, const SDLoc &DL);
+ SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
- SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal);
- SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
@@ -442,13 +454,14 @@ namespace {
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
+ SDValue convertBuildVecZextToZext(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask, SDValue VecIn1,
SDValue VecIn2, unsigned LeftIdx);
- SDValue matchVSelectOpSizesWithSetCC(SDNode *N);
+ SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
@@ -500,15 +513,15 @@ namespace {
bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
EVT LoadResultTy, EVT &ExtVT);
- /// Helper function to calculate whether the given Load can have its
+ /// Helper function to calculate whether the given Load/Store can have its
/// width reduced to ExtVT.
- bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
- EVT &ExtVT, unsigned ShAmt = 0);
+ bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
+ EVT &MemVT, unsigned ShAmt = 0);
/// Used by BackwardsPropagateMask to find suitable loads.
bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
- SmallPtrSetImpl<SDNode*> &NodeWithConsts,
- ConstantSDNode *Mask, SDNode *&UncombinedNode);
+ SmallPtrSetImpl<SDNode*> &NodesWithConsts,
+ ConstantSDNode *Mask, SDNode *&NodeToMask);
/// Attempt to propagate a given AND node back to load leaves so that they
/// can be combined into narrow loads.
bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);
@@ -530,23 +543,28 @@ namespace {
/// This is a helper function for MergeConsecutiveStores. Stores
/// that potentially may be merged with St are placed in
- /// StoreNodes.
+ /// StoreNodes. RootNode is a chain predecessor to all store
+ /// candidates.
void getStoreMergeCandidates(StoreSDNode *St,
- SmallVectorImpl<MemOpLink> &StoreNodes);
+ SmallVectorImpl<MemOpLink> &StoreNodes,
+ SDNode *&Root);
/// Helper function for MergeConsecutiveStores. Checks if
/// candidate stores have indirect dependency through their
- /// operands. \return True if safe to merge.
+ /// operands. RootNode is the predecessor to all stores calculated
+ /// by getStoreMergeCandidates and is used to prune the dependency check.
+ /// \return True if safe to merge.
bool checkMergeStoreCandidatesForDependencies(
- SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
+ SDNode *RootNode);
/// Merge consecutive store operations into a wide store.
/// This optimization uses wide integers or vectors when possible.
/// \return number of stores that were merged into a merged store (the
/// affected nodes are stored as a prefix in \p StoreNodes).
- bool MergeConsecutiveStores(StoreSDNode *N);
+ bool MergeConsecutiveStores(StoreSDNode *St);
- /// \brief Try to transform a truncation where C is a constant:
+ /// Try to transform a truncation where C is a constant:
/// (trunc (and X, C)) -> (and (trunc X), (trunc C))
///
/// \p N needs to be a truncation and its first operand an AND. Other
@@ -554,6 +572,16 @@ namespace {
/// single-use) and if missed an empty SDValue is returned.
SDValue distributeTruncateThroughAnd(SDNode *N);
+ /// Helper function to determine whether the target supports an operation
+ /// given by \p Opcode for type \p VT, that is, whether the operation
+ /// is legal or custom before legalizing operations, and whether it is
+ /// legal (but not custom) after legalization.
+ bool hasOperation(unsigned Opcode, EVT VT) {
+ if (LegalOperations)
+ return TLI.isOperationLegal(Opcode, VT);
+ return TLI.isOperationLegalOrCustom(Opcode, VT);
+ }
+
public:
/// Runs the dag combiner on all nodes in the work list
void Run(CombineLevel AtLevel);
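A hedged usage sketch for the hasOperation() helper defined just above: only materialize a node when the target can still select it at the current legalization stage. The ISD::ABS example and the surrounding names X, VT and DL are assumptions for illustration.

// Inside a hypothetical combine; X, VT and DL come from the surrounding code.
if (hasOperation(ISD::ABS, VT))
  return DAG.getNode(ISD::ABS, DL, VT, X);
return SDValue();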
@@ -564,11 +592,7 @@ namespace {
/// legalization these can be huge.
EVT getShiftAmountTy(EVT LHSTy) {
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
- if (LHSTy.isVector())
- return LHSTy;
- auto &DL = DAG.getDataLayout();
- return LegalTypes ? TLI.getScalarShiftAmountTy(DL, LHSTy)
- : TLI.getPointerTy(DL);
+ return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes);
}
/// This method returns true if we are running before type legalization or
@@ -582,6 +606,10 @@ namespace {
EVT getSetCCResultType(EVT VT) const {
return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
+
+ void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
+ SDValue OrigLoad, SDValue ExtLoad,
+ ISD::NodeType ExtType);
};
/// This class is a DAGUpdateListener that removes any deleted
@@ -657,8 +685,13 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
// fneg is removable even if it has multiple uses.
if (Op.getOpcode() == ISD::FNEG) return 2;
- // Don't allow anything with multiple uses.
- if (!Op.hasOneUse()) return 0;
+ // Don't allow anything with multiple uses unless we know it is free.
+ EVT VT = Op.getValueType();
+ const SDNodeFlags Flags = Op->getFlags();
+ if (!Op.hasOneUse())
+ if (!(Op.getOpcode() == ISD::FP_EXTEND &&
+ TLI.isFPExtFree(VT, Op.getOperand(0).getValueType())))
+ return 0;
// Don't recurse exponentially.
if (Depth > 6) return 0;
@@ -671,17 +704,15 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
// Don't invert constant FP values after legalization unless the target says
// the negated constant is legal.
- EVT VT = Op.getValueType();
return TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT);
}
case ISD::FADD:
- // FIXME: determine better conditions for this xform.
- if (!Options->UnsafeFPMath) return 0;
+ if (!Options->UnsafeFPMath && !Flags.hasNoSignedZeros())
+ return 0;
// After operation legalization, it might not be legal to create new FSUBs.
- if (LegalOperations &&
- !TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType()))
+ if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
return 0;
// fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
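Why the relaxed FADD case above still insists on UnsafeFPMath or the node-level no-signed-zeros flag: the rewrite can change the sign of a zero result. A small standalone illustration in plain C++ (not LLVM code):

#include <cstdio>

int main() {
  double A = +0.0, B = -0.0;
  double Folded    = -(A + B);   // fneg (fadd A, B)  ->  -0.0
  double Rewritten = (-A) - B;   // fsub (fneg A), B  ->  +0.0
  // The two forms differ only in the sign of zero, which nsz allows ignoring.
  std::printf("%g vs %g\n", Folded, Rewritten);
  return 0;
}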
@@ -694,7 +725,7 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
case ISD::FSUB:
// We can't turn -(A-B) into B-A when we honor signed zeros.
if (!Options->NoSignedZerosFPMath &&
- !Op.getNode()->getFlags().hasNoSignedZeros())
+ !Flags.hasNoSignedZeros())
return 0;
// fold (fneg (fsub A, B)) -> (fsub B, A)
@@ -702,8 +733,6 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
case ISD::FMUL:
case ISD::FDIV:
- if (Options->HonorSignDependentRoundingFPMath()) return 0;
-
// fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
Options, Depth + 1))
@@ -727,9 +756,6 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
// fneg is removable even if it has multiple uses.
if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0);
- // Don't allow anything with multiple uses.
- assert(Op.hasOneUse() && "Unknown reuse!");
-
assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
const SDNodeFlags Flags = Op.getNode()->getFlags();
@@ -742,8 +768,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType());
}
case ISD::FADD:
- // FIXME: determine better conditions for this xform.
- assert(Options.UnsafeFPMath);
+ assert(Options.UnsafeFPMath || Flags.hasNoSignedZeros());
// fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
@@ -769,8 +794,6 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
case ISD::FMUL:
case ISD::FDIV:
- assert(!Options.HonorSignDependentRoundingFPMath());
-
// fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
DAG.getTargetLoweringInfo(), &Options, Depth+1))
@@ -846,7 +869,13 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const {
return false;
}
-// \brief Returns the SDNode if it is a constant float BuildVector
+static SDValue peekThroughBitcast(SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+ return V;
+}
+
+// Returns the SDNode if it is a constant float BuildVector
// or constant float.
static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
if (isa<ConstantFPSDNode>(N))
@@ -880,6 +909,7 @@ static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
// constant null integer (with no undefs).
// Build vector implicit truncation is not an issue for null values.
static bool isNullConstantOrNullSplatConstant(SDValue N) {
+ // TODO: may want to use peekThroughBitcast() here.
if (ConstantSDNode *Splat = isConstOrConstSplat(N))
return Splat->isNullValue();
return false;
@@ -889,6 +919,7 @@ static bool isNullConstantOrNullSplatConstant(SDValue N) {
// constant integer of one (with no undefs).
// Do not permit build vector implicit truncation.
static bool isOneConstantOrOneSplatConstant(SDValue N) {
+ // TODO: may want to use peekThroughBitcast() here.
unsigned BitWidth = N.getScalarValueSizeInBits();
if (ConstantSDNode *Splat = isConstOrConstSplat(N))
return Splat->isOne() && Splat->getAPIntValue().getBitWidth() == BitWidth;
@@ -899,6 +930,7 @@ static bool isOneConstantOrOneSplatConstant(SDValue N) {
// constant integer of all ones (with no undefs).
// Do not permit build vector implicit truncation.
static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) {
+ N = peekThroughBitcast(N);
unsigned BitWidth = N.getScalarValueSizeInBits();
if (ConstantSDNode *Splat = isConstOrConstSplat(N))
return Splat->isAllOnesValue() &&
@@ -913,56 +945,6 @@ static bool isAnyConstantBuildVector(const SDNode *N) {
ISD::isBuildVectorOfConstantFPSDNodes(N);
}
-// Attempt to match a unary predicate against a scalar/splat constant or
-// every element of a constant BUILD_VECTOR.
-static bool matchUnaryPredicate(SDValue Op,
- std::function<bool(ConstantSDNode *)> Match) {
- if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
- return Match(Cst);
-
- if (ISD::BUILD_VECTOR != Op.getOpcode())
- return false;
-
- EVT SVT = Op.getValueType().getScalarType();
- for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
- auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
- if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
- return false;
- }
- return true;
-}
-
-// Attempt to match a binary predicate against a pair of scalar/splat constants
-// or every element of a pair of constant BUILD_VECTORs.
-static bool matchBinaryPredicate(
- SDValue LHS, SDValue RHS,
- std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
- if (LHS.getValueType() != RHS.getValueType())
- return false;
-
- if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
- if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
- return Match(LHSCst, RHSCst);
-
- if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
- ISD::BUILD_VECTOR != RHS.getOpcode())
- return false;
-
- EVT SVT = LHS.getValueType().getScalarType();
- for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
- auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i));
- auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
- if (!LHSCst || !RHSCst)
- return false;
- if (LHSCst->getValueType(0) != SVT ||
- LHSCst->getValueType(0) != RHSCst->getValueType(0))
- return false;
- if (!Match(LHSCst, RHSCst))
- return false;
- }
- return true;
-}
-
SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1) {
EVT VT = N0.getValueType();
@@ -1013,11 +995,9 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
bool AddTo) {
assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
++NodesCombined;
- DEBUG(dbgs() << "\nReplacing.1 ";
- N->dump(&DAG);
- dbgs() << "\nWith: ";
- To[0].getNode()->dump(&DAG);
- dbgs() << " and " << NumTo-1 << " other values\n");
+ LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
+ To[0].getNode()->dump(&DAG);
+ dbgs() << " and " << NumTo - 1 << " other values\n");
for (unsigned i = 0, e = NumTo; i != e; ++i)
assert((!To[i].getNode() ||
N->getValueType(i) == To[i].getValueType()) &&
@@ -1074,11 +1054,33 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
// Replace the old value with the new one.
++NodesCombined;
- DEBUG(dbgs() << "\nReplacing.2 ";
- TLO.Old.getNode()->dump(&DAG);
- dbgs() << "\nWith: ";
- TLO.New.getNode()->dump(&DAG);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
+ dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
+ dbgs() << '\n');
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
+}
+
+/// Check the specified vector node value to see if it can be simplified or
+/// if things it uses can be simplified as it only uses some of the elements.
+/// If so, return true.
+bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded,
+ bool AssumeSingleUse) {
+ TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+ APInt KnownUndef, KnownZero;
+ if (!TLI.SimplifyDemandedVectorElts(Op, Demanded, KnownUndef, KnownZero, TLO,
+ 0, AssumeSingleUse))
+ return false;
+
+ // Revisit the node.
+ AddToWorklist(Op.getNode());
+
+ // Replace the old value with the new one.
+ ++NodesCombined;
+ LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
+ dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
+ dbgs() << '\n');
CommitTargetLoweringOpt(TLO);
return true;
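A hedged sketch of how a caller might use the new SimplifyDemandedVectorElts() entry point; the node N and the choice of demanding only the low half are assumptions for illustration.

// Demand only the lanes that are actually read; if target lowering manages to
// simplify anything, revisit this node.
unsigned NumElts = N->getValueType(0).getVectorNumElements();
APInt DemandedElts = APInt::getLowBitsSet(NumElts, NumElts / 2);
if (SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts))
  return SDValue(N, 0);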
@@ -1089,11 +1091,8 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
EVT VT = Load->getValueType(0);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));
- DEBUG(dbgs() << "\nReplacing.9 ";
- Load->dump(&DAG);
- dbgs() << "\nWith: ";
- Trunc.getNode()->dump(&DAG);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
+ Trunc.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
@@ -1107,10 +1106,8 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
if (ISD::isUNINDEXEDLoad(Op.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(Op);
EVT MemVT = LD->getMemoryVT();
- ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
- ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
- : ISD::EXTLOAD)
- : LD->getExtensionType();
+ ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
+ : LD->getExtensionType();
Replace = true;
return DAG.getExtLoad(ExtType, DL, PVT,
LD->getChain(), LD->getBasePtr(),
@@ -1194,7 +1191,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
- DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
bool Replace0 = false;
SDValue N0 = Op.getOperand(0);
@@ -1259,7 +1256,7 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
- DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
bool Replace = false;
SDValue N0 = Op.getOperand(0);
@@ -1311,8 +1308,7 @@ SDValue DAGCombiner::PromoteExtend(SDValue Op) {
// fold (aext (aext x)) -> (aext x)
// fold (aext (zext x)) -> (zext x)
// fold (aext (sext x)) -> (sext x)
- DEBUG(dbgs() << "\nPromoting ";
- Op.getNode()->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
}
return SDValue();
@@ -1345,20 +1341,15 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
SDNode *N = Op.getNode();
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT MemVT = LD->getMemoryVT();
- ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
- ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
- : ISD::EXTLOAD)
- : LD->getExtensionType();
+ ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
+ : LD->getExtensionType();
SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
LD->getChain(), LD->getBasePtr(),
MemVT, LD->getMemOperand());
SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
- DEBUG(dbgs() << "\nPromoting ";
- N->dump(&DAG);
- dbgs() << "\nTo: ";
- Result.getNode()->dump(&DAG);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
+ Result.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
@@ -1369,7 +1360,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
return false;
}
-/// \brief Recursively delete a node which has no uses and any operands for
+/// Recursively delete a node which has no uses and any operands for
/// which it is the only use.
///
/// Note that this both deletes the nodes and removes them from the worklist.
@@ -1453,7 +1444,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
continue;
}
- DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
// Add any operands of the new node which have not yet been combined to the
// worklist as well. Because the worklist uniques things already, this
@@ -1481,8 +1472,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
RV.getOpcode() != ISD::DELETED_NODE &&
"Node was deleted but visit returned new node!");
- DEBUG(dbgs() << " ... into: ";
- RV.getNode()->dump(&DAG));
+ LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG));
if (N->getNumValues() == RV.getNode()->getNumValues())
DAG.ReplaceAllUsesWith(N, RV.getNode());
@@ -1558,7 +1548,6 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::VSELECT: return visitVSELECT(N);
case ISD::SELECT_CC: return visitSELECT_CC(N);
case ISD::SETCC: return visitSETCC(N);
- case ISD::SETCCE: return visitSETCCE(N);
case ISD::SETCCCARRY: return visitSETCCCARRY(N);
case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
@@ -1708,6 +1697,10 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
return N->getOperand(1);
}
+ // Don't simplify token factors if optnone.
+ if (OptLevel == CodeGenOpt::None)
+ return SDValue();
+
SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
SmallPtrSet<SDNode*, 16> SeenOps;
@@ -1893,16 +1886,16 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) &&
"Unexpected binary operator");
- // Bail out if any constants are opaque because we can't constant fold those.
- SDValue C1 = BO->getOperand(1);
- if (!isConstantOrConstantVector(C1, true) &&
- !isConstantFPBuildVectorOrConstantFP(C1))
- return SDValue();
-
// Don't do this unless the old select is going away. We want to eliminate the
// binary operator, not replace a binop with a select.
// TODO: Handle ISD::SELECT_CC.
+ unsigned SelOpNo = 0;
SDValue Sel = BO->getOperand(0);
+ if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
+ SelOpNo = 1;
+ Sel = BO->getOperand(1);
+ }
+
if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
return SDValue();
@@ -1916,19 +1909,48 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
!isConstantFPBuildVectorOrConstantFP(CF))
return SDValue();
+ // Bail out if any constants are opaque because we can't constant fold those.
+ // The exception is "and" and "or" with either 0 or -1 in which case we can
+ // propagate non constant operands into select. I.e.:
+ // and (select Cond, 0, -1), X --> select Cond, 0, X
+ // or X, (select Cond, -1, 0) --> select Cond, -1, X
+ bool CanFoldNonConst = (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
+ (isNullConstantOrNullSplatConstant(CT) ||
+ isAllOnesConstantOrAllOnesSplatConstant(CT)) &&
+ (isNullConstantOrNullSplatConstant(CF) ||
+ isAllOnesConstantOrAllOnesSplatConstant(CF));
+
+ SDValue CBO = BO->getOperand(SelOpNo ^ 1);
+ if (!CanFoldNonConst &&
+ !isConstantOrConstantVector(CBO, true) &&
+ !isConstantFPBuildVectorOrConstantFP(CBO))
+ return SDValue();
+
+ EVT VT = Sel.getValueType();
+
+ // In case of a shift, the value and the shift amount may have different VTs.
+ // For instance, on x86 the shift amount is i8 regardless of the LHS type.
+ // Bail out if we have swapped operands and the value types do not match.
+ // NB: x86 is fine if the operands are not swapped and the shift amount VT is
+ // no bigger than the shifted value's VT.
+ // TODO: it is possible to check for a shift operation, correct the VTs and
+ // still perform the optimization on x86 if needed.
+ if (SelOpNo && VT != CBO.getValueType())
+ return SDValue();
+
// We have a select-of-constants followed by a binary operator with a
// constant. Eliminate the binop by pulling the constant math into the select.
- // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1
- EVT VT = Sel.getValueType();
+ // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
SDLoc DL(Sel);
- SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1);
- if (!NewCT.isUndef() &&
+ SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
+ : DAG.getNode(BinOpcode, DL, VT, CT, CBO);
+ if (!CanFoldNonConst && !NewCT.isUndef() &&
!isConstantOrConstantVector(NewCT, true) &&
!isConstantFPBuildVectorOrConstantFP(NewCT))
return SDValue();
- SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1);
- if (!NewCF.isUndef() &&
+ SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
+ : DAG.getNode(BinOpcode, DL, VT, CF, CBO);
+ if (!CanFoldNonConst && !NewCF.isUndef() &&
!isConstantOrConstantVector(NewCF, true) &&
!isConstantFPBuildVectorOrConstantFP(NewCF))
return SDValue();
@@ -1936,6 +1958,84 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
}
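The CanFoldNonConst path introduced above relies on the identity spelled out in its comment. A quick standalone check of it in plain C++ (not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // and (select Cond, 0, -1), X  ==  select Cond, 0, X
  for (int Cond = 0; Cond <= 1; ++Cond)
    for (uint32_t X : {0u, 1u, 0xDEADBEEFu, ~0u}) {
      uint32_t Before = (Cond ? 0u : ~0u) & X;  // binop applied to the select
      uint32_t After  = Cond ? 0u : X;          // select with X pulled inside
      assert(Before == After);
    }
  return 0;
}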
+static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Expecting add or sub");
+
+ // Match a constant operand and a zext operand for the math instruction:
+ // add Z, C
+ // sub C, Z
+ bool IsAdd = N->getOpcode() == ISD::ADD;
+ SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
+ SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
+ auto *CN = dyn_cast<ConstantSDNode>(C);
+ if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ // Match the zext operand as a setcc of a boolean.
+ if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
+ Z.getOperand(0).getValueType() != MVT::i1)
+ return SDValue();
+
+ // Match the compare as: setcc (X & 1), 0, eq.
+ SDValue SetCC = Z.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
+ SetCC.getOperand(0).getOpcode() != ISD::AND ||
+ !isOneConstant(SetCC.getOperand(0).getOperand(1)))
+ return SDValue();
+
+ // We are adding/subtracting a constant and an inverted low bit. Turn that
+ // into a subtract/add of the low bit with incremented/decremented constant:
+ // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
+ // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
+ EVT VT = C.getValueType();
+ SDLoc DL(N);
+ SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
+ SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
+ DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
+ return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
+}
+
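A standalone check in plain C++ (not LLVM code) of the identity the add case of foldAddSubBoolOfMaskedVal rewrites:

#include <cassert>
#include <cstdint>

int main() {
  // add (zext i1 (seteq (X & 1), 0)), C  ==  sub C+1, (zext (X & 1))
  const uint32_t C = 41;
  for (uint32_t X : {0u, 1u, 2u, 7u, 0xFFFFFFFFu}) {
    uint32_t ZextEq = ((X & 1) == 0) ? 1u : 0u;  // zext i1 (seteq (X & 1), 0)
    uint32_t LowBit = X & 1;                     // zext (X & 1)
    assert(ZextEq + C == (C + 1) - LowBit);
  }
  return 0;
}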
+/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
+/// a shift and add with a different constant.
+static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Expecting add or sub");
+
+ // We need a constant operand for the add/sub, and the other operand is a
+ // logical shift right: add (srl), C or sub C, (srl).
+ bool IsAdd = N->getOpcode() == ISD::ADD;
+ SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
+ SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
+ ConstantSDNode *C = isConstOrConstSplat(ConstantOp);
+ if (!C || ShiftOp.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ // The shift must be of a 'not' value.
+ // TODO: Use isBitwiseNot() if it works with vectors.
+ SDValue Not = ShiftOp.getOperand(0);
+ if (!Not.hasOneUse() || Not.getOpcode() != ISD::XOR ||
+ !isAllOnesConstantOrAllOnesSplatConstant(Not.getOperand(1)))
+ return SDValue();
+
+ // The shift must be moving the sign bit to the least-significant-bit.
+ EVT VT = ShiftOp.getValueType();
+ SDValue ShAmt = ShiftOp.getOperand(1);
+ ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
+ if (!ShAmtC || ShAmtC->getZExtValue() != VT.getScalarSizeInBits() - 1)
+ return SDValue();
+
+ // Eliminate the 'not' by adjusting the shift and add/sub constant:
+ // add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
+ // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
+ SDLoc DL(N);
+ auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;
+ SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt);
+ APInt NewC = IsAdd ? C->getAPIntValue() + 1 : C->getAPIntValue() - 1;
+ return DAG.getNode(ISD::ADD, DL, VT, NewShift, DAG.getConstant(NewC, DL, VT));
+}
+
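Likewise, a standalone check of the add form handled by foldAddSubOfSignBit, in plain C++ and assuming arithmetic right shift for signed values, which is what ISD::SRA models:

#include <cassert>
#include <cstdint>

int main() {
  // add (srl (not X), 31), C  ==  add (sra X, 31), C + 1   for 32-bit X
  const int32_t C = 100;
  for (int32_t X : {0, 1, -1, INT32_MIN, INT32_MAX}) {
    uint32_t NotShifted = ~static_cast<uint32_t>(X) >> 31;  // srl (not X), 31
    int32_t SignShifted = X >> 31;                          // sra X, 31
    assert(static_cast<int32_t>(NotShifted) + C == SignShifted + (C + 1));
  }
  return 0;
}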
SDValue DAGCombiner::visitADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -2067,6 +2167,12 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
}
+ if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
+ return V;
+
+ if (SDValue V = foldAddSubOfSignBit(N, DAG))
+ return V;
+
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
@@ -2075,6 +2181,11 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
DAG.haveNoCommonBitsSet(N0, N1))
return DAG.getNode(ISD::OR, DL, VT, N0, N1);
+ // fold (add (xor a, -1), 1) -> (sub 0, a)
+ if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1))
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ N0.getOperand(0));
+
if (SDValue Combined = visitADDLike(N0, N1, N))
return Combined;
@@ -2210,6 +2321,38 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
return SDValue();
}
+static SDValue flipBoolean(SDValue V, const SDLoc &DL, EVT VT,
+ SelectionDAG &DAG, const TargetLowering &TLI) {
+ SDValue Cst;
+ switch (TLI.getBooleanContents(VT)) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ case TargetLowering::UndefinedBooleanContent:
+ Cst = DAG.getConstant(1, DL, VT);
+ break;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ Cst = DAG.getConstant(-1, DL, VT);
+ break;
+ }
+
+ return DAG.getNode(ISD::XOR, DL, VT, V, Cst);
+}
+
+static bool isBooleanFlip(SDValue V, EVT VT, const TargetLowering &TLI) {
+ if (V.getOpcode() != ISD::XOR) return false;
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!Const) return false;
+
+ switch(TLI.getBooleanContents(VT)) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ return Const->isOne();
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return Const->isAllOnesValue();
+ case TargetLowering::UndefinedBooleanContent:
+ return (Const->getAPIntValue() & 0x01) == 1;
+ }
+ llvm_unreachable("Unsupported boolean content");
+}
+
SDValue DAGCombiner::visitUADDO(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -2240,6 +2383,15 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) {
return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
DAG.getConstant(0, DL, CarryVT));
+ // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
+ if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1)) {
+ SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
+ DAG.getConstant(0, DL, VT),
+ N0.getOperand(0));
+ return CombineTo(N, Sub,
+ flipBoolean(Sub.getValue(1), DL, CarryVT, DAG, TLI));
+ }
+
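The uaddo fold above combines the two's-complement identity ~a + 1 == 0 - a with the flipBoolean() helper, because the overflow bit of the add is exactly the inverse of the borrow of the sub. A standalone check in plain C++ (not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // uaddo (xor a, -1), 1  ==  usubo 0, a   with the carry bit inverted
  for (uint32_t A : {0u, 1u, 0x80000000u, 0xFFFFFFFFu}) {
    uint32_t Sum   = ~A + 1u;             // value of the uaddo
    bool AddCarry  = (~A == UINT32_MAX);  // add wraps only when A == 0
    uint32_t Diff  = 0u - A;              // value of the usubo
    bool SubBorrow = (A != 0u);           // sub borrows for any A != 0
    assert(Sum == Diff && AddCarry == !SubBorrow);
  }
  return 0;
}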
if (SDValue Combined = visitUADDOLike(N0, N1, N))
return Combined;
@@ -2303,13 +2455,17 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
// fold (addcarry x, y, false) -> (uaddo x, y)
- if (isNullConstant(CarryIn))
- return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
+ if (isNullConstant(CarryIn)) {
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
+ }
+
+ EVT CarryVT = CarryIn.getValueType();
// fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
if (isNullConstant(N0) && isNullConstant(N1)) {
EVT VT = N0.getValueType();
- EVT CarryVT = CarryIn.getValueType();
SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
AddToWorklist(CarryExt.getNode());
return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
@@ -2317,6 +2473,16 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
DAG.getConstant(0, DL, CarryVT));
}
+ // fold (addcarry (xor a, -1), 0, !b) -> (subcarry 0, a, b) and flip carry.
+ if (isBitwiseNot(N0) && isNullConstant(N1) &&
+ isBooleanFlip(CarryIn, CarryVT, TLI)) {
+ SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(),
+ DAG.getConstant(0, DL, N0.getValueType()),
+ N0.getOperand(0), CarryIn.getOperand(0));
+ return CombineTo(N, Sub,
+ flipBoolean(Sub.getValue(1), DL, CarryVT, DAG, TLI));
+ }
+
if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
return Combined;
@@ -2458,6 +2624,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (isAllOnesConstantOrAllOnesSplatConstant(N0))
return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
+ // fold (A - (0-B)) -> A+B
+ if (N1.getOpcode() == ISD::SUB &&
+ isNullConstantOrNullSplatConstant(N1.getOperand(0)))
+ return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));
+
// fold A-(A-B) -> B
if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
return N1.getOperand(1);
@@ -2500,12 +2671,50 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
N0.getOperand(1).getOperand(0));
+ // fold (X - (-Y * Z)) -> (X + (Y * Z))
+ if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
+ if (N1.getOperand(0).getOpcode() == ISD::SUB &&
+ isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) {
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
+ N1.getOperand(0).getOperand(1),
+ N1.getOperand(1));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
+ }
+ if (N1.getOperand(1).getOpcode() == ISD::SUB &&
+ isNullConstantOrNullSplatConstant(N1.getOperand(1).getOperand(0))) {
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
+ N1.getOperand(0),
+ N1.getOperand(1).getOperand(1));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
+ }
+ }
+
// If either operand of a sub is undef, the result is undef
if (N0.isUndef())
return N0;
if (N1.isUndef())
return N1;
+ if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
+ return V;
+
+ if (SDValue V = foldAddSubOfSignBit(N, DAG))
+ return V;
+
+ // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
+ if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+ if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
+ SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
+ SDValue S0 = N1.getOperand(0);
+ if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) {
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
+ if (C->getAPIntValue() == (OpSizeInBits - 1))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
+ }
+ }
+ }
+
// If the relocation model supports it, consider symbol offsets.
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
@@ -2612,8 +2821,11 @@ SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
SDValue CarryIn = N->getOperand(2);
// fold (subcarry x, y, false) -> (usubo x, y)
- if (isNullConstant(CarryIn))
- return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
+ if (isNullConstant(CarryIn)) {
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
+ return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
+ }
return SDValue();
}
@@ -2689,11 +2901,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
(!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
SDLoc DL(N);
SDValue LogBase2 = BuildLogBase2(N1, DL);
- AddToWorklist(LogBase2.getNode());
-
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
- AddToWorklist(Trunc.getNode());
return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
}
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
@@ -2816,9 +3025,10 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
SDValue Op1 = Node->getOperand(1);
SDValue combined;
for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
- UE = Op0.getNode()->use_end(); UI != UE;) {
- SDNode *User = *UI++;
- if (User == Node || User->use_empty())
+ UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
+ User->use_empty())
continue;
// Convert the other matching node(s), too;
// otherwise, the DIVREM may get target-legalized into something
@@ -2868,6 +3078,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
// fold vector ops
if (VT.isVector())
@@ -2887,6 +3098,11 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
// fold (sdiv X, -1) -> 0-X
if (N1C && N1C->isAllOnesValue())
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
+ // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
+ if (N1C && N1C->getAPIntValue().isMinSignedValue())
+ return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
+ DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT));
if (SDValue V = simplifyDivRem(N, DAG))
return V;
@@ -2899,45 +3115,90 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);
+ if (SDValue V = visitSDIVLike(N0, N1, N))
+ return V;
+
+ // sdiv, srem -> sdivrem
+ // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
+ // true. Otherwise, we break the simplification logic in visitREM().
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+
+  // Helper for determining whether a value is a power-of-2 constant scalar or a
+ // vector of such elements.
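+  // Negative powers of two (e.g. -8) are accepted as well; the divisor's sign
+  // is handled by the final negation below.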
+ auto IsPowerOfTwo = [](ConstantSDNode *C) {
+ if (C->isNullValue() || C->isOpaque())
+ return false;
+ if (C->getAPIntValue().isPowerOf2())
+ return true;
+ if ((-C->getAPIntValue()).isPowerOf2())
+ return true;
+ return false;
+ };
+
// fold (sdiv X, pow2) -> simple ops after legalize
// FIXME: We check for the exact bit here because the generic lowering gives
// better results in that case. The target-specific lowering should learn how
// to handle exact sdivs efficiently.
- if (N1C && !N1C->isNullValue() && !N1C->isOpaque() &&
- !N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() ||
- (-N1C->getAPIntValue()).isPowerOf2())) {
+ if (!N->getFlags().hasExact() &&
+ ISD::matchUnaryPredicate(N1C ? SDValue(N1C, 0) : N1, IsPowerOfTwo)) {
// Target-specific implementation of sdiv x, pow2.
if (SDValue Res = BuildSDIVPow2(N))
return Res;
- unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
+ // Create constants that are functions of the shift amount value.
+ EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
+ SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
+ SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
+ C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
+ SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
+ if (!isConstantOrConstantVector(Inexact))
+ return SDValue();
// Splat the sign bit into the register
- SDValue SGN =
- DAG.getNode(ISD::SRA, DL, VT, N0,
- DAG.getConstant(VT.getScalarSizeInBits() - 1, DL,
- getShiftAmountTy(N0.getValueType())));
- AddToWorklist(SGN.getNode());
+ SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
+ DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
+ AddToWorklist(Sign.getNode());
// Add (N0 < 0) ? abs2 - 1 : 0;
- SDValue SRL =
- DAG.getNode(ISD::SRL, DL, VT, SGN,
- DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL,
- getShiftAmountTy(SGN.getValueType())));
- SDValue ADD = DAG.getNode(ISD::ADD, DL, VT, N0, SRL);
- AddToWorklist(SRL.getNode());
- AddToWorklist(ADD.getNode()); // Divide by pow2
- SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, ADD,
- DAG.getConstant(lg2, DL,
- getShiftAmountTy(ADD.getValueType())));
-
- // If we're dividing by a positive value, we're done. Otherwise, we must
- // negate the result.
- if (N1C->getAPIntValue().isNonNegative())
- return SRA;
-
- AddToWorklist(SRA.getNode());
- return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+ SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
+ AddToWorklist(Srl.getNode());
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
+ AddToWorklist(Add.getNode());
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
+ AddToWorklist(Sra.getNode());
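+  // For example, sdiv i32 X, 8: C1 == 3 and Inexact == 29, so Srl computes
+  // (X >>s 31) >>u 29, i.e. 7 when X is negative and 0 otherwise. Adding that
+  // bias before the arithmetic shift makes the division round toward zero.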
+
+ // Special case: (sdiv X, 1) -> X
+  // Special case: (sdiv X, -1) -> 0-X
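+  // Lanes where the divisor is 1 or -1 cannot use the shift sequence above (the
+  // Srl amount would equal the bit width), so select the dividend directly; the
+  // negation below still produces 0-X for the -1 lanes.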
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
+ SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
+ SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
+ SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
+ Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);
+
+ // If dividing by a positive value, we're done. Otherwise, the result must
+ // be negated.
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);
+
+ // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
+ SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
+ SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
+ return Res;
}
// If integer divide is expensive and we satisfy the requirements, emit an
@@ -2948,13 +3209,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
if (SDValue Op = BuildSDIV(N))
return Op;
- // sdiv, srem -> sdivrem
- // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
- // true. Otherwise, we break the simplification logic in visitREM().
- if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
- if (SDValue DivRem = useDivRem(N))
- return DivRem;
-
return SDValue();
}
@@ -2962,6 +3216,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
// fold vector ops
if (VT.isVector())
@@ -2977,6 +3232,14 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT,
N0C, N1C))
return Folded;
+ // fold (udiv X, 1) -> X
+ if (N1C && N1C->isOne())
+ return N0;
+ // fold (udiv X, -1) -> select(X == -1, 1, 0)
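+  // Unsigned -1 is the maximum value, so the quotient is 1 only when X == -1
+  // and 0 for every other X.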
+ if (N1C && N1C->getAPIntValue().isAllOnesValue())
+ return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
+ DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT));
if (SDValue V = simplifyDivRem(N, DAG))
return V;
@@ -2984,6 +3247,26 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
+ if (SDValue V = visitUDIVLike(N0, N1, N))
+ return V;
+
+  // udiv, urem -> udivrem
+ // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
+ // true. Otherwise, we break the simplification logic in visitREM().
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+
// fold (udiv x, (1 << c)) -> x >>u c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1)) {
@@ -3019,13 +3302,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
if (SDValue Op = BuildUDIV(N))
return Op;
- // sdiv, srem -> sdivrem
- // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
- // true. Otherwise, we break the simplification logic in visitREM().
- if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
- if (SDValue DivRem = useDivRem(N))
- return DivRem;
-
return SDValue();
}
@@ -3035,6 +3311,8 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
+
bool isSigned = (Opcode == ISD::SREM);
SDLoc DL(N);
@@ -3044,6 +3322,10 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
if (N0C && N1C)
if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
return Folded;
+  // fold (urem X, -1) -> select(X == -1, 0, X)
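+  // X is strictly less than the unsigned -1 divisor unless they are equal, so
+  // the remainder is X itself except in that single case.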
+ if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
+ return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
+ DAG.getConstant(0, DL, VT), N0);
if (SDValue V = simplifyDivRem(N, DAG))
return V;
@@ -3077,22 +3359,19 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
// If X/C can be simplified by the division-by-constant logic, lower
// X%C to the equivalent of X-X/C*C.
- // To avoid mangling nodes, this simplification requires that the combine()
- // call for the speculative DIV must not cause a DIVREM conversion. We guard
- // against this by skipping the simplification if isIntDivCheap(). When
- // div is not cheap, combine will not return a DIVREM. Regardless,
- // checking cheapness here makes sense since the simplification results in
- // fatter code.
- if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) {
- unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
- SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1);
- AddToWorklist(Div.getNode());
- SDValue OptimizedDiv = combine(Div.getNode());
- if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
- assert((OptimizedDiv.getOpcode() != ISD::UDIVREM) &&
- (OptimizedDiv.getOpcode() != ISD::SDIVREM));
+ // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
+ // speculative DIV must not cause a DIVREM conversion. We guard against this
+ // by skipping the simplification if isIntDivCheap(). When div is not cheap,
+ // combine will not return a DIVREM. Regardless, checking cheapness here
+ // makes sense since the simplification results in fatter code.
+ if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
+ SDValue OptimizedDiv =
+ isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
+ if (OptimizedDiv.getNode() && OptimizedDiv.getOpcode() != ISD::UDIVREM &&
+ OptimizedDiv.getOpcode() != ISD::SDIVREM) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
+ AddToWorklist(OptimizedDiv.getNode());
AddToWorklist(Mul.getNode());
return Sub;
}
@@ -3350,6 +3629,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
+  // If the sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
+ // Only do this if the current op isn't legal and the flipped is.
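+  // When the sign bit is known zero both operands are non-negative, so the
+  // signed and unsigned orderings agree and either opcode gives the same result.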
+ unsigned Opcode = N->getOpcode();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegal(Opcode, VT) &&
+ (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
+ (N1.isUndef() || DAG.SignBitIsZero(N1))) {
+ unsigned AltOpcode;
+ switch (Opcode) {
+ case ISD::SMIN: AltOpcode = ISD::UMIN; break;
+ case ISD::SMAX: AltOpcode = ISD::UMAX; break;
+ case ISD::UMIN: AltOpcode = ISD::SMIN; break;
+ case ISD::UMAX: AltOpcode = ISD::SMAX; break;
+ default: llvm_unreachable("Unknown MINMAX opcode");
+ }
+ if (TLI.isOperationLegal(AltOpcode, VT))
+ return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1);
+ }
+
return SDValue();
}
@@ -3469,9 +3767,9 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
ShOp = SDValue();
}
- // (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
- // (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C)
- // (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
+ // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C)
+ // (OR (shuf (A, C), shuf (B, C))) -> shuf (OR (A, B), C)
+ // (XOR (shuf (A, C), shuf (B, C))) -> shuf (XOR (A, B), V_0)
if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
N0->getOperand(0), N1->getOperand(0));
@@ -3490,9 +3788,9 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
ShOp = SDValue();
}
- // (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
- // (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B))
- // (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
+ // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B))
+ // (OR (shuf (C, A), shuf (C, B))) -> shuf (C, OR (A, B))
+ // (XOR (shuf (C, A), shuf (C, B))) -> shuf (V_0, XOR (A, B))
if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
N0->getOperand(1), N1->getOperand(1));
@@ -3525,7 +3823,7 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
// operations on the left and right operands, so those types must match.
EVT VT = N0.getValueType();
EVT OpVT = LL.getValueType();
- if (LegalOperations || VT != MVT::i1)
+ if (LegalOperations || VT.getScalarType() != MVT::i1)
if (VT != getSetCCResultType(OpVT))
return SDValue();
if (OpVT != RL.getValueType())
@@ -3762,53 +4060,78 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
return true;
}
-bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
- EVT &ExtVT, unsigned ShAmt) {
- // Don't transform one with multiple uses, this would require adding a new
- // load.
- if (!SDValue(LoadN, 0).hasOneUse())
+bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
+ ISD::LoadExtType ExtType, EVT &MemVT,
+ unsigned ShAmt) {
+ if (!LDST)
return false;
-
- if (LegalOperations &&
- !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT))
+ // Only allow byte offsets.
+ if (ShAmt % 8)
return false;
// Do not generate loads of non-round integer types since these can
// be expensive (and would be wrong if the type is not byte sized).
- if (!ExtVT.isRound())
+ if (!MemVT.isRound())
return false;
// Don't change the width of a volatile load.
- if (LoadN->isVolatile())
+ if (LDST->isVolatile())
return false;
// Verify that we are actually reducing a load width here.
- if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits())
+ if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits())
return false;
- // For the transform to be legal, the load must produce only two values
- // (the value loaded and the chain). Don't transform a pre-increment
- // load, for example, which produces an extra value. Otherwise the
- // transformation is not equivalent, and the downstream logic to replace
- // uses gets things wrong.
- if (LoadN->getNumValues() > 2)
- return false;
-
- // If the load that we're shrinking is an extload and we're not just
- // discarding the extension we can't simply shrink the load. Bail.
- // TODO: It would be possible to merge the extensions in some cases.
- if (LoadN->getExtensionType() != ISD::NON_EXTLOAD &&
- LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
- return false;
-
- if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT))
+ // Ensure that this isn't going to produce an unsupported unaligned access.
+ if (ShAmt &&
+ !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ LDST->getAddressSpace(), ShAmt / 8))
return false;
// It's not possible to generate a constant of extended or untyped type.
- EVT PtrType = LoadN->getOperand(1).getValueType();
+ EVT PtrType = LDST->getBasePtr().getValueType();
if (PtrType == MVT::Untyped || PtrType.isExtended())
return false;
+ if (isa<LoadSDNode>(LDST)) {
+ LoadSDNode *Load = cast<LoadSDNode>(LDST);
+ // Don't transform one with multiple uses, this would require adding a new
+ // load.
+ if (!SDValue(Load, 0).hasOneUse())
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
+ return false;
+
+ // For the transform to be legal, the load must produce only two values
+ // (the value loaded and the chain). Don't transform a pre-increment
+ // load, for example, which produces an extra value. Otherwise the
+ // transformation is not equivalent, and the downstream logic to replace
+ // uses gets things wrong.
+ if (Load->getNumValues() > 2)
+ return false;
+
+ // If the load that we're shrinking is an extload and we're not just
+ // discarding the extension we can't simply shrink the load. Bail.
+ // TODO: It would be possible to merge the extensions in some cases.
+ if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
+ Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
+ return false;
+
+ if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
+ return false;
+ } else {
+    assert(isa<StoreSDNode>(LDST) && "It is neither a Load nor a Store SDNode");
+ StoreSDNode *Store = cast<StoreSDNode>(LDST);
+ // Can't write outside the original store
+ if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT))
+ return false;
+ }
return true;
}
@@ -3841,7 +4164,7 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
auto *Load = cast<LoadSDNode>(Op);
EVT ExtVT;
if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
- isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
+ isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {
// ZEXTLOAD is already small enough.
if (Load->getExtensionType() == ISD::ZEXTLOAD &&
@@ -3882,7 +4205,23 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
  // Allow one node which will be masked along with any loads found.
if (NodeToMask)
return false;
+
+ // Also ensure that the node to be masked only produces one data result.
NodeToMask = Op.getNode();
+ if (NodeToMask->getNumValues() > 1) {
+ bool HasValue = false;
+ for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
+ MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
+ if (VT != MVT::Glue && VT != MVT::Other) {
+ if (HasValue) {
+ NodeToMask = nullptr;
+ return false;
+ }
+ HasValue = true;
+ }
+ }
+ assert(HasValue && "Node to be masked has no data result?");
+ }
}
return true;
}
@@ -3906,19 +4245,19 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
if (Loads.size() == 0)
return false;
- DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
+ LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
SDValue MaskOp = N->getOperand(1);
// If it exists, fixup the single node we allow in the tree that needs
// masking.
if (FixupNode) {
- DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
+ LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
FixupNode->getValueType(0),
SDValue(FixupNode, 0), MaskOp);
DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
- DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0),
- MaskOp);
+    if (And.getOpcode() == ISD::AND)
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
}
// Narrow any constants that need it.
@@ -3937,11 +4276,13 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
// Create narrow loads.
for (auto *Load : Loads) {
- DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
+ LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
SDValue(Load, 0), MaskOp);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
- DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp);
+    if (And.getOpcode() == ISD::AND)
+ And = SDValue(
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
SDValue NewLoad = ReduceLoadWidth(And.getNode());
assert(NewLoad &&
"Shouldn't be masking the load if it can't be narrowed");
@@ -3953,6 +4294,60 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
return false;
}
+// Unfold
+// x & (-1 'logical shift' y)
+// To
+// (x 'opposite logical shift' y) 'logical shift' y
+// if it is better for performance.
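+// For example, x & (-1 << y) becomes (x >> y) << y: both clear the y lowest
+// bits, but the shift form avoids materializing the mask constant.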
+SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Do we actually prefer shifts over mask?
+ if (!TLI.preferShiftsToClearExtremeBits(N0))
+ return SDValue();
+
+ // Try to match (-1 '[outer] logical shift' y)
+ unsigned OuterShift;
+ unsigned InnerShift; // The opposite direction to the OuterShift.
+ SDValue Y; // Shift amount.
+ auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
+ if (!M.hasOneUse())
+ return false;
+ OuterShift = M->getOpcode();
+ if (OuterShift == ISD::SHL)
+ InnerShift = ISD::SRL;
+ else if (OuterShift == ISD::SRL)
+ InnerShift = ISD::SHL;
+ else
+ return false;
+ if (!isAllOnesConstant(M->getOperand(0)))
+ return false;
+ Y = M->getOperand(1);
+ return true;
+ };
+
+ SDValue X;
+ if (matchMask(N1))
+ X = N0;
+ else if (matchMask(N0))
+ X = N1;
+ else
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // tmp = x 'opposite logical shift' y
+ SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y);
+ // ret = tmp 'logical shift' y
+ SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y);
+
+ return T1;
+}
+
SDValue DAGCombiner::visitAND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4019,7 +4414,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
};
if (N0.getOpcode() == ISD::OR &&
- matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
+ ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
return N1;
// fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
@@ -4250,6 +4645,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
return BSwap;
}
+ if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
+ return Shifts;
+
return SDValue();
}
@@ -4276,7 +4674,10 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
if (!N0.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (!N01C || N01C->getZExtValue() != 0xFF00)
+      // Also handle 0xFFFF since the LHS is guaranteed to have zeros there.
+ // This is needed for X86.
+ if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
+ N01C->getZExtValue() != 0xFFFF))
return SDValue();
N0 = N0.getOperand(0);
LookPassAnd0 = true;
@@ -4323,7 +4724,10 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
if (!N10.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
- if (!N101C || N101C->getZExtValue() != 0xFF00)
+ // Also allow 0xFFFF since the bits will be shifted out. This is needed
+ // for X86.
+ if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
+ N101C->getZExtValue() != 0xFFFF))
return SDValue();
N10 = N10.getOperand(0);
LookPassAnd1 = true;
@@ -4394,6 +4798,14 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
return false;
case 0xFF: MaskByteOffset = 0; break;
case 0xFF00: MaskByteOffset = 1; break;
+ case 0xFFFF:
+ // In case demanded bits didn't clear the bits that will be shifted out.
+ // This is needed for X86.
+ if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
+ MaskByteOffset = 1;
+ break;
+ }
+ return false;
case 0xFF0000: MaskByteOffset = 2; break;
case 0xFF000000: MaskByteOffset = 3; break;
}
@@ -4708,7 +5120,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
return LHS->getAPIntValue().intersects(RHS->getAPIntValue());
};
if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
- matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) {
+ ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) {
if (SDValue COR = DAG.FoldConstantArithmetic(
ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) {
SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
@@ -4764,7 +5176,8 @@ bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
// reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
// in direction shift1 by Neg. The range [0, EltSize) means that we only need
// to consider shift amounts with defined behavior.
-static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
+static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
+ SelectionDAG &DAG) {
// If EltSize is a power of 2 then:
//
// (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
@@ -4799,9 +5212,13 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
unsigned MaskLoBits = 0;
if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
- if (NegC->getAPIntValue() == EltSize - 1) {
+ KnownBits Known;
+ DAG.computeKnownBits(Neg.getOperand(0), Known);
+ unsigned Bits = Log2_64(EltSize);
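+      // The mask, together with the operand's known-zero bits, only needs to
+      // cover the low log2(EltSize) bits; higher bits cannot affect the rotate
+      // amount.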
+ if (NegC->getAPIntValue().getActiveBits() <= Bits &&
+ ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) {
Neg = Neg.getOperand(0);
- MaskLoBits = Log2_64(EltSize);
+ MaskLoBits = Bits;
}
}
}
@@ -4816,10 +5233,16 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
// On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
// Pos'. The truncation is redundant for the purpose of the equality.
- if (MaskLoBits && Pos.getOpcode() == ISD::AND)
- if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
- if (PosC->getAPIntValue() == EltSize - 1)
+ if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
+ if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) {
+ KnownBits Known;
+ DAG.computeKnownBits(Pos.getOperand(0), Known);
+ if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits &&
+ ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >=
+ MaskLoBits))
Pos = Pos.getOperand(0);
+ }
+ }
// The condition we need is now:
//
@@ -4875,7 +5298,7 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
// (srl x, (*ext y))) ->
// (rotr x, y) or (rotl x, (sub 32, y))
EVT VT = Shifted.getValueType();
- if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) {
+ if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
HasPos ? Pos : Neg).getNode();
@@ -4893,8 +5316,8 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
if (!TLI.isTypeLegal(VT)) return nullptr;
// The target must have at least one rotate flavor.
- bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT);
- bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
+ bool HasROTL = hasOperation(ISD::ROTL, VT);
+ bool HasROTR = hasOperation(ISD::ROTR, VT);
if (!HasROTL && !HasROTR) return nullptr;
// Check for truncated rotate.
@@ -4943,7 +5366,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
ConstantSDNode *RHS) {
return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
};
- if (matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
+ if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
@@ -5200,7 +5623,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
Optional<BaseIndexOffset> Base;
SDValue Chain;
- SmallSet<LoadSDNode *, 8> Loads;
+ SmallPtrSet<LoadSDNode *, 8> Loads;
Optional<ByteProvider> FirstByteProvider;
int64_t FirstOffset = INT64_MAX;
@@ -5299,6 +5722,88 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad;
}
+// If the target has andn, bsl, or a similar bit-select instruction,
+// we want to unfold masked merge, with canonical pattern of:
+// | A | |B|
+// ((x ^ y) & m) ^ y
+// | D |
+// Into:
+// (x & m) | (y & ~m)
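+// (each result bit takes x where m is 1 and y where m is 0)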
+// If y is a constant, and the 'andn' does not work with immediates,
+// we unfold into a different pattern:
+// ~(~x & m) & (m | y)
+// NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
+// the very least that breaks andnpd / andnps patterns, and because those
+// patterns are simplified in IR and shouldn't be created in the DAG
+SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
+ assert(N->getOpcode() == ISD::XOR);
+
+ // Don't touch 'not' (i.e. where y = -1).
+ if (isAllOnesConstantOrAllOnesSplatConstant(N->getOperand(1)))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // There are 3 commutable operators in the pattern,
+ // so we have to deal with 8 possible variants of the basic pattern.
+ SDValue X, Y, M;
+ auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
+ if (And.getOpcode() != ISD::AND || !And.hasOneUse())
+ return false;
+ SDValue Xor = And.getOperand(XorIdx);
+ if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
+ return false;
+ SDValue Xor0 = Xor.getOperand(0);
+ SDValue Xor1 = Xor.getOperand(1);
+ // Don't touch 'not' (i.e. where y = -1).
+ if (isAllOnesConstantOrAllOnesSplatConstant(Xor1))
+ return false;
+ if (Other == Xor0)
+ std::swap(Xor0, Xor1);
+ if (Other != Xor1)
+ return false;
+ X = Xor0;
+ Y = Xor1;
+ M = And.getOperand(XorIdx ? 0 : 1);
+ return true;
+ };
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
+ !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
+ return SDValue();
+
+ // Don't do anything if the mask is constant. This should not be reachable.
+ // InstCombine should have already unfolded this pattern, and DAGCombiner
+  // probably shouldn't produce it either.
+ if (isa<ConstantSDNode>(M.getNode()))
+ return SDValue();
+
+ // We can transform if the target has AndNot
+ if (!TLI.hasAndNot(M))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // If Y is a constant, check that 'andn' works with immediates.
+ if (!TLI.hasAndNot(Y)) {
+ assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
+ // If not, we need to do a bit more work to make sure andn is still used.
+ SDValue NotX = DAG.getNOT(DL, X, VT);
+ SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
+ SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
+ SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
+ return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
+ }
+
+ SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
+ SDValue NotM = DAG.getNOT(DL, M, VT);
+ SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);
+
+ return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
+}
+
SDValue DAGCombiner::visitXOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -5378,7 +5883,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
}
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
- if (isOneConstant(N1) && VT == MVT::i1 &&
+ if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
(N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
@@ -5390,7 +5895,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
}
}
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
- if (isAllOnesConstant(N1) &&
+ if (isAllOnesConstant(N1) && N0.hasOneUse() &&
(N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
@@ -5411,13 +5916,19 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
}
// fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
- unsigned OpSizeInBits = VT.getScalarSizeInBits();
- if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
- N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) &&
- TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
- if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
- if (C->getAPIntValue() == (OpSizeInBits - 1))
- return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0));
+ if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+ SDValue A = N0.getOpcode() == ISD::ADD ? N0 : N1;
+ SDValue S = N0.getOpcode() == ISD::SRA ? N0 : N1;
+ if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
+ SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
+ SDValue S0 = S.getOperand(0);
+ if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) {
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
+ if (C->getAPIntValue() == (OpSizeInBits - 1))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
+ }
+ }
}
// fold (xor x, x) -> 0
@@ -5454,6 +5965,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
return Tmp;
+ // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable
+ if (SDValue MM = unfoldMaskedMerge(N))
+ return MM;
+
// Simplify the expression using non-local knowledge.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
@@ -5656,7 +6171,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
return Val->getAPIntValue().uge(OpSizeInBits);
};
- if (matchUnaryPredicate(N1, MatchShiftTooBig))
+ if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
return DAG.getUNDEF(VT);
// fold (shl x, 0) -> x
if (N1C && N1C->isNullValue())
@@ -5691,7 +6206,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
};
- if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
@@ -5701,7 +6216,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);
};
- if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
SDLoc DL(N);
EVT ShiftVT = N1.getValueType();
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
@@ -5877,7 +6392,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
return Val->getAPIntValue().uge(OpSizeInBits);
};
- if (matchUnaryPredicate(N1, MatchShiftTooBig))
+ if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
return DAG.getUNDEF(VT);
// fold (sra x, 0) -> x
if (N1C && N1C->isNullValue())
@@ -5912,7 +6427,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
};
- if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT));
@@ -5923,7 +6438,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);
};
- if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum);
}
@@ -6041,7 +6556,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
return Val->getAPIntValue().uge(OpSizeInBits);
};
- if (matchUnaryPredicate(N1, MatchShiftTooBig))
+ if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
return DAG.getUNDEF(VT);
// fold (srl x, 0) -> x
if (N1C && N1C->isNullValue())
@@ -6064,7 +6579,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
};
- if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
@@ -6074,7 +6589,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);
};
- if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
SDLoc DL(N);
EVT ShiftVT = N1.getValueType();
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
@@ -6285,6 +6800,13 @@ SDValue DAGCombiner::visitCTLZ(SDNode *N) {
// fold (ctlz c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
+
+ // If the value is known never to be zero, switch to the undef version.
+ if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) {
+ if (DAG.isKnownNeverZero(N0))
+ return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ }
+
return SDValue();
}
@@ -6305,6 +6827,13 @@ SDValue DAGCombiner::visitCTTZ(SDNode *N) {
// fold (cttz c1) -> c2
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
+
+ // If the value is known never to be zero, switch to the undef version.
+ if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) {
+ if (DAG.isKnownNeverZero(N0))
+ return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ }
+
return SDValue();
}
@@ -6328,7 +6857,7 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) {
return SDValue();
}
-/// \brief Generate Min/Max node
+/// Generate Min/Max node
static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
ISD::CondCode CC, const TargetLowering &TLI,
@@ -6443,9 +6972,9 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
// in another basic block or it could require searching a complicated
// expression.
if (CondVT.isInteger() &&
- TLI.getBooleanContents(false, true) ==
+ TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
TargetLowering::ZeroOrOneBooleanContent &&
- TLI.getBooleanContents(false, false) ==
+ TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
TargetLowering::ZeroOrOneBooleanContent &&
C1->isNullValue() && C2->isOne()) {
SDValue NotCond =
@@ -6574,15 +7103,10 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
}
}
- // select (xor Cond, 1), X, Y -> select Cond, Y, X
if (VT0 == MVT::i1) {
- if (N0->getOpcode() == ISD::XOR) {
- if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) {
- SDValue Cond0 = N0->getOperand(0);
- if (C->isOne())
- return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N2, N1);
- }
- }
+ // select (not Cond), N1, N2 -> select Cond, N2, N1
+ if (isBitwiseNot(N0))
+ return DAG.getNode(ISD::SELECT, DL, VT, N0->getOperand(0), N2, N1);
}
// fold selects based on a setcc into other things, such as min/max/abs
@@ -6726,6 +7250,7 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
SDValue DataLo, DataHi;
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+ SDValue Scale = MSC->getScale();
SDValue BasePtr = MSC->getBasePtr();
SDValue IndexLo, IndexHi;
std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);
@@ -6735,11 +7260,11 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, MSC->getAAInfo(), MSC->getRanges());
- SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo };
+ SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo, Scale };
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
DL, OpsLo, MMO);
- SDValue OpsHi[] = {Chain, DataHi, MaskHi, BasePtr, IndexHi};
+ SDValue OpsHi[] = { Chain, DataHi, MaskHi, BasePtr, IndexHi, Scale };
Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
DL, OpsHi, MMO);
@@ -6800,12 +7325,12 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
MST->isCompressingStore());
+ unsigned HiOffset = LoMemVT.getStoreSize();
- MMO = DAG.getMachineFunction().
- getMachineMemOperand(MST->getPointerInfo(),
- MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
- SecondHalfAlignment, MST->getAAInfo(),
- MST->getRanges());
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MST->getPointerInfo().getWithOffset(HiOffset),
+ MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment,
+ MST->getAAInfo(), MST->getRanges());
Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
MST->isTruncatingStore(),
@@ -6859,6 +7384,7 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) {
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+ SDValue Scale = MGT->getScale();
SDValue BasePtr = MGT->getBasePtr();
SDValue Index = MGT->getIndex();
SDValue IndexLo, IndexHi;
@@ -6869,13 +7395,13 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) {
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
- SDValue OpsLo[] = { Chain, Src0Lo, MaskLo, BasePtr, IndexLo };
+ SDValue OpsLo[] = { Chain, Src0Lo, MaskLo, BasePtr, IndexLo, Scale };
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo,
- MMO);
+ MMO);
- SDValue OpsHi[] = {Chain, Src0Hi, MaskHi, BasePtr, IndexHi};
+ SDValue OpsHi[] = { Chain, Src0Hi, MaskHi, BasePtr, IndexHi, Scale };
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi,
- MMO);
+ MMO);
AddToWorklist(Lo.getNode());
AddToWorklist(Hi.getNode());
@@ -6949,11 +7475,12 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
MLD->isExpandingLoad());
+ unsigned HiOffset = LoMemVT.getStoreSize();
- MMO = DAG.getMachineFunction().
- getMachineMemOperand(MLD->getPointerInfo(),
- MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
- SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MLD->getPointerInfo().getWithOffset(HiOffset),
+ MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment,
+ MLD->getAAInfo(), MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
ISD::NON_EXTLOAD, MLD->isExpandingLoad());
@@ -7071,6 +7598,36 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
}
+
+ // If this select has a condition (setcc) with narrower operands than the
+ // select, try to widen the compare to match the select width.
+ // TODO: This should be extended to handle any constant.
+ // TODO: This could be extended to handle non-loading patterns, but that
+ // requires thorough testing to avoid regressions.
+ if (isNullConstantOrNullSplatConstant(RHS)) {
+ EVT NarrowVT = LHS.getValueType();
+ EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
+ EVT SetCCVT = getSetCCResultType(LHS.getValueType());
+ unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
+ unsigned WideWidth = WideVT.getScalarSizeInBits();
+ bool IsSigned = isSignedIntSetCC(CC);
+ auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
+ SetCCWidth != 1 && SetCCWidth < WideWidth &&
+ TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
+ // Both compare operands can be widened for free. The LHS can use an
+ // extended load, and the RHS is a constant:
+ // vselect (ext (setcc load(X), C)), N1, N2 -->
+ // vselect (setcc extload(X), C'), N1, N2
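+        // A zero RHS constant extends to zero under either extension, so the
+        // widened compare tests the same condition as the narrow one.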
+ auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
+ SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
+ EVT WideSetCCVT = getSetCCResultType(WideVT);
+ SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
+ return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
+ }
+ }
}
if (SimplifySelectOps(N, N1, N2))
@@ -7142,22 +7699,33 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
}
SDValue DAGCombiner::visitSETCC(SDNode *N) {
- return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
- cast<CondCodeSDNode>(N->getOperand(2))->get(),
- SDLoc(N));
-}
+  // setcc is very commonly used as an argument to brcond. This pattern
+  // also lends itself to numerous combines and, as a result, it is desirable
+  // to keep the argument to a brcond as a setcc as much as possible.
+ bool PreferSetCC =
+ N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
-SDValue DAGCombiner::visitSETCCE(SDNode *N) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- SDValue Carry = N->getOperand(2);
- SDValue Cond = N->getOperand(3);
+ SDValue Combined = SimplifySetCC(
+ N->getValueType(0), N->getOperand(0), N->getOperand(1),
+ cast<CondCodeSDNode>(N->getOperand(2))->get(), SDLoc(N), !PreferSetCC);
- // If Carry is false, fold to a regular SETCC.
- if (Carry.getOpcode() == ISD::CARRY_FALSE)
- return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
+ if (!Combined)
+ return SDValue();
- return SDValue();
+ // If we prefer to have a setcc, and we don't, we'll try our best to
+ // recreate one using rebuildSetCC.
+ if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
+ SDValue NewSetCC = rebuildSetCC(Combined);
+
+ // We don't have anything interesting to combine to.
+ if (NewSetCC.getNode() == N)
+ return SDValue();
+
+ if (NewSetCC)
+ return NewSetCC;
+ }
+
+ return Combined;
}
SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
@@ -7237,12 +7805,12 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
// transformation. Returns true if extension are possible and the above
// mentioned transformation is profitable.
-static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
+static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
unsigned ExtOpc,
SmallVectorImpl<SDNode *> &ExtendNodes,
const TargetLowering &TLI) {
bool HasCopyToRegUses = false;
- bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
+ bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType());
for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
UE = N0.getNode()->use_end();
UI != UE; ++UI) {
@@ -7298,16 +7866,16 @@ static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
}
void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
- SDValue Trunc, SDValue ExtLoad,
- const SDLoc &DL, ISD::NodeType ExtType) {
+ SDValue OrigLoad, SDValue ExtLoad,
+ ISD::NodeType ExtType) {
// Extend SetCC uses if necessary.
- for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
- SDNode *SetCC = SetCCs[i];
+ SDLoc DL(ExtLoad);
+ for (SDNode *SetCC : SetCCs) {
SmallVector<SDValue, 4> Ops;
for (unsigned j = 0; j != 2; ++j) {
SDValue SOp = SetCC->getOperand(j);
- if (SOp == Trunc)
+ if (SOp == OrigLoad)
Ops.push_back(ExtLoad);
else
Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
@@ -7356,7 +7924,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
return SDValue();
SmallVector<SDNode *, 4> SetCCs;
- if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
+ if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI))
return SDValue();
ISD::LoadExtType ExtType =
@@ -7387,7 +7955,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
SDValue SplitLoad = DAG.getExtLoad(
- ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
+ ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
@@ -7410,12 +7978,82 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
// with a truncate of the concatenated sextloaded vectors.
SDValue Trunc =
DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+ ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode());
CombineTo(N0.getNode(), Trunc, NewChain);
- ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
- (ISD::NodeType)N->getOpcode());
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
+// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
+// (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
+SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
+ assert(N->getOpcode() == ISD::ZERO_EXTEND);
+ EVT VT = N->getValueType(0);
+
+ // and/or/xor
+ SDValue N0 = N->getOperand(0);
+ if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+ N0.getOpcode() == ISD::XOR) ||
+ N0.getOperand(1).getOpcode() != ISD::Constant ||
+ (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
+ return SDValue();
+
+ // shl/shr
+ SDValue N1 = N0->getOperand(0);
+ if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
+ N1.getOperand(1).getOpcode() != ISD::Constant ||
+ (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
+ return SDValue();
+
+ // load
+ if (!isa<LoadSDNode>(N1.getOperand(0)))
+ return SDValue();
+ LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
+ EVT MemVT = Load->getMemoryVT();
+ if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
+ Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
+ return SDValue();
+
+ // If the shift op is SHL, the logic op must be AND, otherwise the result
+ // will be wrong.
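+  // With SHL, bits shifted above the original narrow type are dropped before
+  // the zext, but they survive once the operations are widened; only an AND
+  // with the zero-extended mask is guaranteed to clear them again.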
+ if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
+ ISD::ZERO_EXTEND, SetCCs, TLI))
+ return SDValue();
+
+ // Actually do the transformation.
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
+ Load->getChain(), Load->getBasePtr(),
+ Load->getMemoryVT(), Load->getMemOperand());
+
+ SDLoc DL1(N1);
+ SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
+ N1.getOperand(1));
+
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ Mask = Mask.zext(VT.getSizeInBits());
+ SDLoc DL0(N0);
+ SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
+ DAG.getConstant(Mask, DL0, VT));
+
+ ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
+ CombineTo(N, And);
+ if (SDValue(Load, 0).hasOneUse()) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
+ } else {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
+ Load->getValueType(0), ExtLoad);
+ CombineTo(Load, Trunc, ExtLoad.getValue(1));
+ }
+ return SDValue(N,0); // Return N so it doesn't get rechecked!
+}
+
/// If we're narrowing or widening the result of a vector select and the final
/// size is the same size as a setcc (compare) feeding the select, then try to
/// apply the cast operation to the select's operands because matching vector
@@ -7461,6 +8099,106 @@ SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
}
+// fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
+// fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
+static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
+ const TargetLowering &TLI, EVT VT,
+ bool LegalOperations, SDNode *N,
+ SDValue N0, ISD::LoadExtType ExtLoadType) {
+ SDNode *N0Node = N0.getNode();
+ bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node)
+ : ISD::isZEXTLoad(N0Node);
+ if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) ||
+ !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse())
+ return {};
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ EVT MemVT = LN0->getMemoryVT();
+ if ((LegalOperations || LN0->isVolatile()) &&
+ !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
+ return {};
+
+ SDValue ExtLoad =
+ DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
+ LN0->getBasePtr(), MemVT, LN0->getMemOperand());
+ Combiner.CombineTo(N, ExtLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+// fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x)))
+// Only generate vector extloads when 1) they're legal, and 2) they are
+// deemed desirable by the target.
+static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
+ const TargetLowering &TLI, EVT VT,
+ bool LegalOperations, SDNode *N, SDValue N0,
+ ISD::LoadExtType ExtLoadType,
+ ISD::NodeType ExtOpc) {
+ if (!ISD::isNON_EXTLoad(N0.getNode()) ||
+ !ISD::isUNINDEXEDLoad(N0.getNode()) ||
+ ((LegalOperations || VT.isVector() ||
+ cast<LoadSDNode>(N0)->isVolatile()) &&
+ !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
+ return {};
+
+ bool DoXform = true;
+ SmallVector<SDNode *, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
+ if (!DoXform)
+ return {};
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
+ Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
+ Combiner.CombineTo(N, ExtLoad);
+ if (NoReplaceTrunc) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ } else {
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
+ Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
+ bool LegalOperations) {
+ assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");
+
+ SDValue SetCC = N->getOperand(0);
+ if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
+ !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
+ return SDValue();
+
+ SDValue X = SetCC.getOperand(0);
+ SDValue Ones = SetCC.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ EVT VT = N->getValueType(0);
+ EVT XVT = X.getValueType();
+ // setge X, C is canonicalized to setgt, so we do not need to match that
+ // pattern. The setlt sibling is folded in SimplifySelectCC() because it does
+ // not require the 'not' op.
+ if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
+ // Invert and smear/shift the sign bit:
+ // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
+ // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
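+    // For i32, (not X) has its sign bit set exactly when X is non-negative; the
+    // shift by 31 then smears it across the value (sra) or isolates it in bit 0
+    // (srl).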
+ SDLoc DL(N);
+ SDValue NotX = DAG.getNOT(DL, X, VT);
+ SDValue ShiftAmount = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
+ auto ShiftOpcode = N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
+ return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
+ }
+ return SDValue();
+}
+
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -7525,62 +8263,21 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
}
}
- // fold (sext (load x)) -> (sext (truncate (sextload x)))
- // Only generate vector extloads when 1) they're legal, and 2) they are
- // deemed desirable by the target.
- if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
- ((!LegalOperations && !VT.isVector() &&
- !cast<LoadSDNode>(N0)->isVolatile()) ||
- TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
- bool DoXform = true;
- SmallVector<SDNode*, 4> SetCCs;
- if (!N0.hasOneUse())
- DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
- if (VT.isVector())
- DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
- if (DoXform) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
- LN0->getBasePtr(), N0.getValueType(),
- LN0->getMemOperand());
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
- N0.getValueType(), ExtLoad);
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
- // If the load value is used only by N, replace it via CombineTo N.
- bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
- CombineTo(N, ExtLoad);
- if (NoReplaceTrunc)
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
- else
- CombineTo(LN0, Trunc, ExtLoad.getValue(1));
- return SDValue(N, 0);
- }
- }
+ // Try to simplify (sext (load x)).
+ if (SDValue foldedExt =
+ tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
+ ISD::SEXTLOAD, ISD::SIGN_EXTEND))
+ return foldedExt;
// fold (sext (load x)) to multiple smaller sextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
return ExtLoad;
- // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
- // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
- if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
- ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- EVT MemVT = LN0->getMemoryVT();
- if ((!LegalOperations && !LN0->isVolatile()) ||
- TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
- SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
- LN0->getBasePtr(), MemVT,
- LN0->getMemOperand());
- CombineTo(N, ExtLoad);
- CombineTo(N0.getNode(),
- DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
- N0.getValueType(), ExtLoad),
- ExtLoad.getValue(1));
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
- }
- }
+ // Try to simplify (sext (sextload x)).
+ if (SDValue foldedExt = tryToFoldExtOfExtload(
+ DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
+ return foldedExt;
// fold (sext (and/or/xor (load x), cst)) ->
// (and/or/xor (sextload x), (sext cst))
@@ -7588,30 +8285,26 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
N0.getOpcode() == ISD::XOR) &&
isa<LoadSDNode>(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
- TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) &&
(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
- if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) {
- bool DoXform = true;
+ LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
+ EVT MemVT = LN00->getMemoryVT();
+ if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
+ LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
SmallVector<SDNode*, 4> SetCCs;
- if (!N0.hasOneUse())
- DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND,
- SetCCs, TLI);
+ bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
+ ISD::SIGN_EXTEND, SetCCs, TLI);
if (DoXform) {
- SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT,
- LN0->getChain(), LN0->getBasePtr(),
- LN0->getMemoryVT(),
- LN0->getMemOperand());
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
+ LN00->getChain(), LN00->getBasePtr(),
+ LN00->getMemoryVT(),
+ LN00->getMemOperand());
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.sext(VT.getSizeInBits());
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
- SDLoc(N0.getOperand(0)),
- N0.getOperand(0).getValueType(), ExtLoad);
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
+ ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
bool NoReplaceTruncAnd = !N0.hasOneUse();
- bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
+ bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
CombineTo(N, And);
// If N0 has multiple uses, change other uses as well.
if (NoReplaceTruncAnd) {
@@ -7619,15 +8312,21 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
CombineTo(N0.getNode(), TruncAnd);
}
- if (NoReplaceTrunc)
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
- else
- CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ if (NoReplaceTrunc) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
+ } else {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
+ LN00->getValueType(0), ExtLoad);
+ CombineTo(LN00, Trunc, ExtLoad.getValue(1));
+ }
return SDValue(N,0); // Return N so it doesn't get rechecked!
}
}
}
+ if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
+ return V;
+
if (N0.getOpcode() == ISD::SETCC) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
@@ -7674,8 +8373,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// If the type of the setcc is larger (say, i8) then the value of the high
// bit depends on getBooleanContents(), so ask TLI for a real "true" value
// of the appropriate width.
- SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT)
- : TLI.getConstTrueVal(DAG, VT, DL);
+ SDValue ExtTrueVal = (SetCCWidth == 1)
+ ? DAG.getAllOnesConstant(DL, VT)
+ : DAG.getBoolConstant(true, DL, VT, N00VT);
SDValue Zero = DAG.getConstant(0, DL, VT);
if (SDValue SCC =
SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
@@ -7792,13 +8492,16 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
// Try to mask before the extension to avoid having to generate a larger mask,
// possibly over several sub-vectors.
- if (SrcVT.bitsLT(VT)) {
+ if (SrcVT.bitsLT(VT) && VT.isVector()) {
if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
SDValue Op = N0.getOperand(0);
Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
AddToWorklist(Op.getNode());
- return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
+ SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
+ // Transfer the debug info; the new node is equivalent to N0.
+ DAG.transferDbgValues(N0, ZExtOrTrunc);
+ return ZExtOrTrunc;
}
}
@@ -7830,39 +8533,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
X, DAG.getConstant(Mask, DL, VT));
}
- // fold (zext (load x)) -> (zext (truncate (zextload x)))
- // Only generate vector extloads when 1) they're legal, and 2) they are
- // deemed desirable by the target.
- if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
- ((!LegalOperations && !VT.isVector() &&
- !cast<LoadSDNode>(N0)->isVolatile()) ||
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
- bool DoXform = true;
- SmallVector<SDNode*, 4> SetCCs;
- if (!N0.hasOneUse())
- DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
- if (VT.isVector())
- DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
- if (DoXform) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
- LN0->getChain(),
- LN0->getBasePtr(), N0.getValueType(),
- LN0->getMemOperand());
-
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
- N0.getValueType(), ExtLoad);
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND);
- // If the load value is used only by N, replace it via CombineTo N.
- bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
- CombineTo(N, ExtLoad);
- if (NoReplaceTrunc)
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
- else
- CombineTo(LN0, Trunc, ExtLoad.getValue(1));
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
- }
- }
+ // Try to simplify (zext (load x)).
+ if (SDValue foldedExt =
+ tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
+ ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
+ return foldedExt;
// fold (zext (load x)) to multiple smaller zextloads.
// Only on illegal but splittable vectors.
@@ -7877,10 +8552,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getOpcode() == ISD::XOR) &&
isa<LoadSDNode>(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) &&
(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
- if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) {
+ LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
+ EVT MemVT = LN00->getMemoryVT();
+ if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
+ LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse()) {
@@ -7888,29 +8564,26 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
EVT LoadResultTy = AndC->getValueType(0);
EVT ExtVT;
- if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT))
+ if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
DoXform = false;
}
- if (DoXform)
- DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0),
- ISD::ZERO_EXTEND, SetCCs, TLI);
}
+ if (DoXform)
+ DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
+ ISD::ZERO_EXTEND, SetCCs, TLI);
if (DoXform) {
- SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
- LN0->getChain(), LN0->getBasePtr(),
- LN0->getMemoryVT(),
- LN0->getMemOperand());
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
+ LN00->getChain(), LN00->getBasePtr(),
+ LN00->getMemoryVT(),
+ LN00->getMemOperand());
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
SDLoc DL(N);
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
- SDLoc(N0.getOperand(0)),
- N0.getOperand(0).getValueType(), ExtLoad);
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND);
+ ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
bool NoReplaceTruncAnd = !N0.hasOneUse();
- bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
+ bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
CombineTo(N, And);
// If N0 has multiple uses, change other uses as well.
if (NoReplaceTruncAnd) {
@@ -7918,35 +8591,30 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
CombineTo(N0.getNode(), TruncAnd);
}
- if (NoReplaceTrunc)
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
- else
- CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ if (NoReplaceTrunc) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
+ } else {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
+ LN00->getValueType(0), ExtLoad);
+ CombineTo(LN00, Trunc, ExtLoad.getValue(1));
+ }
return SDValue(N,0); // Return N so it doesn't get rechecked!
}
}
}
- // fold (zext (zextload x)) -> (zext (truncate (zextload x)))
- // fold (zext ( extload x)) -> (zext (truncate (zextload x)))
- if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
- ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- EVT MemVT = LN0->getMemoryVT();
- if ((!LegalOperations && !LN0->isVolatile()) ||
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) {
- SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
- LN0->getChain(),
- LN0->getBasePtr(), MemVT,
- LN0->getMemOperand());
- CombineTo(N, ExtLoad);
- CombineTo(N0.getNode(),
- DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(),
- ExtLoad),
- ExtLoad.getValue(1));
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
- }
- }
+ // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
+ // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
+ if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
+ return ZExtLoad;
+
+ // Try to simplify (zext (zextload x)).
+ if (SDValue foldedExt = tryToFoldExtOfExtload(
+ DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
+ return foldedExt;
+
+ if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
+ return V;
if (N0.getOpcode() == ISD::SETCC) {
// Only do this before legalize for now.
@@ -8084,24 +8752,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse())
- DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
+ DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
+ TLI);
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
- N0.getValueType(), ExtLoad);
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
- ISD::ANY_EXTEND);
+ ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceTrunc = N0.hasOneUse();
CombineTo(N, ExtLoad);
- if (NoReplaceTrunc)
+ if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
- else
+ } else {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), ExtLoad);
CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ }
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -8109,9 +8778,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// fold (aext (zextload x)) -> (aext (truncate (zextload x)))
// fold (aext (sextload x)) -> (aext (truncate (sextload x)))
// fold (aext ( extload x)) -> (aext (truncate (extload x)))
- if (N0.getOpcode() == ISD::LOAD &&
- !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
- N0.hasOneUse()) {
+ if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
ISD::LoadExtType ExtType = LN0->getExtensionType();
EVT MemVT = LN0->getMemoryVT();
@@ -8120,10 +8788,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
VT, LN0->getChain(), LN0->getBasePtr(),
MemVT, LN0->getMemOperand());
CombineTo(N, ExtLoad);
- CombineTo(N0.getNode(),
- DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
- N0.getValueType(), ExtLoad),
- ExtLoad.getValue(1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -8263,8 +8928,9 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
unsigned ShAmt = 0;
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
- if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
- ShAmt = N01->getZExtValue();
+ SDValue SRL = N0;
+ if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
+ ShAmt = ConstShift->getZExtValue();
unsigned EVTBits = ExtVT.getSizeInBits();
// Is the shift amount a multiple of size of VT?
if ((ShAmt & (EVTBits-1)) == 0) {
@@ -8277,17 +8943,36 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
// At this point, we must have a load or else we can't do the transform.
if (!isa<LoadSDNode>(N0)) return SDValue();
+ auto *LN0 = cast<LoadSDNode>(N0);
+
// Because a SRL must be assumed to *need* to zero-extend the high bits
// (as opposed to anyext the high bits), we can't combine the zextload
// lowering of SRL and an sextload.
- if (cast<LoadSDNode>(N0)->getExtensionType() == ISD::SEXTLOAD)
+ if (LN0->getExtensionType() == ISD::SEXTLOAD)
return SDValue();
// If the shift amount is larger than the input type then we're not
// accessing any of the loaded bytes. If the load was a zextload/extload
// then the result of the shift+trunc is zero/undef (handled elsewhere).
- if (ShAmt >= cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits())
+ if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
return SDValue();
+
+ // If the SRL is only used by a masking AND, we may be able to adjust
+ // the ExtVT to make the AND redundant.
+ SDNode *Mask = *(SRL->use_begin());
+ if (Mask->getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(Mask->getOperand(1))) {
+ const APInt &ShiftMask =
+ cast<ConstantSDNode>(Mask->getOperand(1))->getAPIntValue();
+ if (ShiftMask.isMask()) {
+ EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
+ ShiftMask.countTrailingOnes());
+ // If the mask is smaller, recompute the type.
+ if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) &&
+ TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT))
+ ExtVT = MaskedVT;
+ }
+ }
}
}
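An illustrative sketch (not part of the patch; little-endian layout assumed) of why a masking AND over a shifted load lets ReduceLoadWidth narrow the load and make the AND redundant:

#include <cassert>
#include <cstdint>
#include <cstring>

uint32_t wideLoadShiftMask(const unsigned char *p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v)); // load i32
  return (v >> 8) & 0xFF;        // srl 8, then mask of the low 8 bits
}

uint32_t narrowLoad(const unsigned char *p) {
  return p[1]; // zextload i8 at byte offset ShAmt / 8 == 1; no AND needed
}

int main() {
  unsigned char buf[4] = {0x11, 0x22, 0x33, 0x44};
  assert(wideLoadShiftMask(buf) == narrowLoad(buf)); // both yield 0x22
}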
@@ -8307,7 +8992,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt))
+ if (!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
return SDValue();
// For big endian targets, we need to adjust the offset to the pointer to
@@ -8403,7 +9088,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
}
- // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x)
+ // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
@@ -8777,6 +9462,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
}
+ // fold (truncate (extract_subvector(ext x))) ->
+ // (extract_subvector x)
+ // TODO: This can be generalized to cover cases where the truncate and extract
+ // do not fully cancel each other out.
+ if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::SIGN_EXTEND ||
+ N00.getOpcode() == ISD::ZERO_EXTEND ||
+ N00.getOpcode() == ISD::ANY_EXTEND) {
+ if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
+ VT.getVectorElementType())
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
+ N00.getOperand(0), N0.getOperand(1));
+ }
+ }
+
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
@@ -8897,17 +9598,17 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
}
// If the input is a constant, let getNode fold it.
- if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
- // If we can't allow illegal operations, we need to check that this is just
- // a fp -> int or int -> conversion and that the resulting operation will
- // be legal.
- if (!LegalOperations ||
- (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
- TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
- (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
- TLI.isOperationLegal(ISD::Constant, VT)))
- return DAG.getBitcast(VT, N0);
- }
+  // We always need to check that this is just a fp -> int or int -> fp
+  // conversion; otherwise we will get back N, which will confuse the caller
+  // into thinking we used CombineTo. This can block target combines from
+  // running. If we can't allow illegal operations, we need to ensure the
+  // resulting operation will be legal.
+ // TODO: Maybe we should check that the return value isn't N explicitly?
+ if ((isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::ConstantFP, VT))) ||
+ (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::Constant, VT))))
+ return DAG.getBitcast(VT, N0);
// (conv (conv x, t1), t2) -> (conv x, t2)
if (N0.getOpcode() == ISD::BITCAST)
@@ -9253,7 +9954,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
static bool isContractable(SDNode *N) {
SDNodeFlags F = N->getFlags();
- return F.hasAllowContract() || F.hasUnsafeAlgebra();
+ return F.hasAllowContract() || F.hasAllowReassociation();
}
/// Try to perform FMA combining on a given FADD node.
@@ -9277,8 +9978,10 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
if (!HasFMAD && !HasFMA)
return SDValue();
+ SDNodeFlags Flags = N->getFlags();
+ bool CanFuse = Options.UnsafeFPMath || isContractable(N);
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD);
+ CanFuse || HasFMAD);
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
@@ -9308,14 +10011,14 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
- N0.getOperand(0), N0.getOperand(1), N1);
+ N0.getOperand(0), N0.getOperand(1), N1, Flags);
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
- N1.getOperand(0), N1.getOperand(1), N0);
+ N1.getOperand(0), N1.getOperand(1), N0, Flags);
}
// Look through FP_EXTEND nodes to do more combining.
@@ -9329,7 +10032,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N00.getOperand(1)), N1);
+ N00.getOperand(1)), N1, Flags);
}
}
@@ -9343,16 +10046,14 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N10.getOperand(1)), N0);
+ N10.getOperand(1)), N0, Flags);
}
}
// More folding opportunities when target permits.
if (Aggressive) {
// fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
- // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
- // are currently only supported on binary nodes.
- if (Options.UnsafeFPMath &&
+ if (CanFuse &&
N0.getOpcode() == PreferredFusedOpcode &&
N0.getOperand(2).getOpcode() == ISD::FMUL &&
N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
@@ -9361,13 +10062,11 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(2).getOperand(0),
N0.getOperand(2).getOperand(1),
- N1));
+ N1, Flags), Flags);
}
// fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
- // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
- // are currently only supported on binary nodes.
- if (Options.UnsafeFPMath &&
+ if (CanFuse &&
N1->getOpcode() == PreferredFusedOpcode &&
N1.getOperand(2).getOpcode() == ISD::FMUL &&
N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
@@ -9376,19 +10075,20 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
DAG.getNode(PreferredFusedOpcode, SL, VT,
N1.getOperand(2).getOperand(0),
N1.getOperand(2).getOperand(1),
- N0));
+ N0, Flags), Flags);
}
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
auto FoldFAddFMAFPExtFMul = [&] (
- SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
+ SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
+ SDNodeFlags Flags) {
return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
- Z));
+ Z, Flags), Flags);
};
if (N0.getOpcode() == PreferredFusedOpcode) {
SDValue N02 = N0.getOperand(2);
@@ -9398,7 +10098,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
N020.getOperand(0), N020.getOperand(1),
- N1);
+ N1, Flags);
}
}
}
@@ -9409,14 +10109,15 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
auto FoldFAddFPExtFMAFMul = [&] (
- SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
+ SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
+ SDNodeFlags Flags) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
- Z));
+ Z, Flags), Flags);
};
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
@@ -9426,7 +10127,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
N002.getOperand(0), N002.getOperand(1),
- N1);
+ N1, Flags);
}
}
}
@@ -9441,7 +10142,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
N120.getOperand(0), N120.getOperand(1),
- N0);
+ N0, Flags);
}
}
}
@@ -9459,7 +10160,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
N102.getOperand(0), N102.getOperand(1),
- N0);
+ N0, Flags);
}
}
}
@@ -9488,8 +10189,11 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (!HasFMAD && !HasFMA)
return SDValue();
+ const SDNodeFlags Flags = N->getFlags();
+ bool CanFuse = Options.UnsafeFPMath || isContractable(N);
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD);
+ CanFuse || HasFMAD);
+
// If the subtraction is not contractable, do not combine.
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
@@ -9514,16 +10218,17 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
- DAG.getNode(ISD::FNEG, SL, VT, N1));
+ DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
}
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
- if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse()))
+ if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N1.getOperand(0)),
- N1.getOperand(1), N0);
+ N1.getOperand(1), N0, Flags);
+ }
// fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
@@ -9532,7 +10237,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N01 = N0.getOperand(0).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
- DAG.getNode(ISD::FNEG, SL, VT, N1));
+ DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
}
// Look through FP_EXTEND nodes to do more combining.
@@ -9548,7 +10253,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N00.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(1)),
- DAG.getNode(ISD::FNEG, SL, VT, N1));
+ DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
}
}
@@ -9565,7 +10270,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N10.getOperand(0))),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(1)),
- N0);
+ N0, Flags);
}
}
@@ -9587,7 +10292,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N000.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N000.getOperand(1)),
- N1));
+ N1, Flags));
}
}
}
@@ -9610,7 +10315,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N000.getOperand(0)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N000.getOperand(1)),
- N1));
+ N1, Flags));
}
}
}
@@ -9619,9 +10324,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (Aggressive) {
// fold (fsub (fma x, y, (fmul u, v)), z)
// -> (fma x, y (fma u, v, (fneg z)))
- // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
- // are currently only supported on binary nodes.
- if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode &&
+ if (CanFuse && N0.getOpcode() == PreferredFusedOpcode &&
isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
N0.getOperand(2)->hasOneUse()) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -9630,14 +10333,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N0.getOperand(2).getOperand(0),
N0.getOperand(2).getOperand(1),
DAG.getNode(ISD::FNEG, SL, VT,
- N1)));
+ N1), Flags), Flags);
}
// fold (fsub x, (fma y, z, (fmul u, v)))
// -> (fma (fneg y), z, (fma (fneg u), v, x))
- // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
- // are currently only supported on binary nodes.
- if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode &&
+ if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
isContractableFMUL(N1.getOperand(2))) {
SDValue N20 = N1.getOperand(2).getOperand(0);
SDValue N21 = N1.getOperand(2).getOperand(1);
@@ -9647,8 +10348,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N1.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N20),
-
- N21, N0));
+ N21, N0, Flags), Flags);
}
@@ -9668,7 +10368,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N020.getOperand(1)),
DAG.getNode(ISD::FNEG, SL, VT,
- N1)));
+ N1), Flags), Flags);
}
}
}
@@ -9696,7 +10396,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N002.getOperand(1)),
DAG.getNode(ISD::FNEG, SL, VT,
- N1)));
+ N1), Flags), Flags);
}
}
}
@@ -9719,7 +10419,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
VT, N1200)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N1201),
- N0));
+ N0, Flags), Flags);
}
}
@@ -9750,7 +10450,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
VT, N1020)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N1021),
- N0));
+ N0, Flags), Flags);
}
}
}
@@ -9766,6 +10466,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc SL(N);
+ const SDNodeFlags Flags = N->getFlags();
assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
@@ -9797,52 +10498,54 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
// fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
// fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
- auto FuseFADD = [&](SDValue X, SDValue Y) {
+ auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
if (XC1 && XC1->isExactlyValue(+1.0))
- return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ Y, Flags);
if (XC1 && XC1->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
- DAG.getNode(ISD::FNEG, SL, VT, Y));
+ DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
}
return SDValue();
};
- if (SDValue FMA = FuseFADD(N0, N1))
+ if (SDValue FMA = FuseFADD(N0, N1, Flags))
return FMA;
- if (SDValue FMA = FuseFADD(N1, N0))
+ if (SDValue FMA = FuseFADD(N1, N0, Flags))
return FMA;
// fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
// fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
// fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
// fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
- auto FuseFSUB = [&](SDValue X, SDValue Y) {
+ auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
if (XC0 && XC0->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
- Y);
+ Y, Flags);
if (XC0 && XC0->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
- DAG.getNode(ISD::FNEG, SL, VT, Y));
+ DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
if (XC1 && XC1->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
- DAG.getNode(ISD::FNEG, SL, VT, Y));
+ DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
if (XC1 && XC1->isExactlyValue(-1.0))
- return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ Y, Flags);
}
return SDValue();
};
- if (SDValue FMA = FuseFSUB(N0, N1))
+ if (SDValue FMA = FuseFSUB(N0, N1, Flags))
return FMA;
- if (SDValue FMA = FuseFSUB(N1, N0))
+ if (SDValue FMA = FuseFSUB(N1, N0, Flags))
return FMA;
return SDValue();
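The FuseFADD/FuseFSUB folds above are plain distributivity; a short sketch (not part of the patch) shows the algebra with std::fma. Exact FP results can differ by a rounding error, which is why the combine is gated on contraction or unsafe-math flags:

#include <cmath>
#include <cstdio>

int main() {
  double x = 3.0, y = 7.0;
  std::printf("%g %g\n", (x + 1.0) * y, std::fma(x, y, y));  // (fadd x, +1.0) * y
  std::printf("%g %g\n", (x - 1.0) * y, std::fma(x, y, -y)); // (fsub x, +1.0) * y
  std::printf("%g %g\n", (1.0 - x) * y, std::fma(-x, y, y)); // (fsub +1.0, x) * y
}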
@@ -9904,35 +10607,42 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags);
}
- // FIXME: Auto-upgrade the target/function-level option.
- if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
- // fold (fadd A, 0) -> A
- if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
- if (N1C->isZero())
- return N0;
+ ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1);
+ if (N1C && N1C->isZero()) {
+ if (N1C->isNegative() || Options.UnsafeFPMath ||
+ Flags.hasNoSignedZeros()) {
+ // fold (fadd A, 0) -> A
+ return N0;
+ }
}
- // If 'unsafe math' is enabled, fold lots of things.
- if (Options.UnsafeFPMath) {
- // No FP constant should be created after legalization as Instruction
- // Selection pass has a hard time dealing with FP constants.
- bool AllowNewConst = (Level < AfterLegalizeDAG);
-
- // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
- if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
- isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)))
- return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0),
- DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1,
- Flags),
- Flags);
+ // No FP constant should be created after legalization as Instruction
+ // Selection pass has a hard time dealing with FP constants.
+ bool AllowNewConst = (Level < AfterLegalizeDAG);
+ // If 'unsafe math' or nnan is enabled, fold lots of things.
+ if ((Options.UnsafeFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
// If allowed, fold (fadd (fneg x), x) -> 0.0
- if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
+ if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
return DAG.getConstantFP(0.0, DL, VT);
// If allowed, fold (fadd x, (fneg x)) -> 0.0
- if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
+ if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
return DAG.getConstantFP(0.0, DL, VT);
+ }
+
+  // If 'unsafe math' is enabled, or both the reassoc and nsz flags are set,
+  // fold lots of things.
+  // TODO: Break out the portions of the transformations below for which
+  // Unsafe is considered and which do not require both nsz and reassoc.
+ if ((Options.UnsafeFPMath ||
+ (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
+ AllowNewConst) {
+ // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
+ if (N1CFP && N0.getOpcode() == ISD::FADD &&
+ isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
+ SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags);
+ return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags);
+ }
// We can fold chains of FADD's of the same value into multiplications.
// This transform is not safe in general because we are reducing the number
@@ -9980,7 +10690,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
}
- if (N0.getOpcode() == ISD::FADD && AllowNewConst) {
+ if (N0.getOpcode() == ISD::FADD) {
bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
// (fadd (fadd x, x), x) -> (fmul x, 3.0)
if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
@@ -9990,7 +10700,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
}
- if (N1.getOpcode() == ISD::FADD && AllowNewConst) {
+ if (N1.getOpcode() == ISD::FADD) {
bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
// (fadd x, (fadd x, x)) -> (fmul x, 3.0)
if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
@@ -10001,8 +10711,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
// (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
- if (AllowNewConst &&
- N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
+ if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
N0.getOperand(0) == N0.getOperand(1) &&
N1.getOperand(0) == N1.getOperand(1) &&
N0.getOperand(0) == N1.getOperand(0)) {
@@ -10042,15 +10751,23 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
- // fold (fsub A, (fneg B)) -> (fadd A, B)
- if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
- return DAG.getNode(ISD::FADD, DL, VT, N0,
- GetNegatedExpression(N1, DAG, LegalOperations), Flags);
+ // (fsub A, 0) -> A
+ if (N1CFP && N1CFP->isZero()) {
+ if (!N1CFP->isNegative() || Options.UnsafeFPMath ||
+ Flags.hasNoSignedZeros()) {
+ return N0;
+ }
+ }
- // FIXME: Auto-upgrade the target/function-level option.
- if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
- // (fsub 0, B) -> -B
- if (N0CFP && N0CFP->isZero()) {
+ if (N0 == N1) {
+ // (fsub x, x) -> 0.0
+ if (Options.UnsafeFPMath || Flags.hasNoNaNs())
+ return DAG.getConstantFP(0.0f, DL, VT);
+ }
+
+ // (fsub 0, B) -> -B
+ if (N0CFP && N0CFP->isZero()) {
+ if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) {
if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
return GetNegatedExpression(N1, DAG, LegalOperations);
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
@@ -10058,16 +10775,13 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
}
}
+ // fold (fsub A, (fneg B)) -> (fadd A, B)
+ if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
+ return DAG.getNode(ISD::FADD, DL, VT, N0,
+ GetNegatedExpression(N1, DAG, LegalOperations), Flags);
+
// If 'unsafe math' is enabled, fold lots of things.
if (Options.UnsafeFPMath) {
- // (fsub A, 0) -> A
- if (N1CFP && N1CFP->isZero())
- return N0;
-
- // (fsub x, x) -> 0.0
- if (N0 == N1)
- return DAG.getConstantFP(0.0f, DL, VT);
-
// (fsub x, (fadd x, y)) -> (fneg y)
// (fsub x, (fadd y, x)) -> (fneg y)
if (N1.getOpcode() == ISD::FADD) {
@@ -10124,12 +10838,15 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
- if (Options.UnsafeFPMath) {
+ if (Options.UnsafeFPMath ||
+ (Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) {
// fold (fmul A, 0) -> 0
if (N1CFP && N1CFP->isZero())
return N1;
+ }
- // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
+ if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
+ // fmul (fmul X, C1), C2 -> fmul X, C1 * C2
if (N0.getOpcode() == ISD::FMUL) {
// Fold scalars or any vector constants (not just splats).
// This fold is done in general by InstCombine, but extra fmul insts
@@ -10153,13 +10870,10 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
}
}
- // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c))
- // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs
- // during an early run of DAGCombiner can prevent folding with fmuls
- // inserted during lowering.
- if (N0.getOpcode() == ISD::FADD &&
- (N0.getOperand(0) == N0.getOperand(1)) &&
- N0.hasOneUse()) {
+ // Match a special-case: we convert X * 2.0 into fadd.
+ // fmul (fadd X, X), C -> fmul X, 2.0 * C
+ if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
+ N0.getOperand(0) == N0.getOperand(1)) {
const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
@@ -10253,6 +10967,10 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
+ // FMA nodes have flags that propagate to the created nodes.
+ const SDNodeFlags Flags = N->getFlags();
+ bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N);
+
// Constant fold FMA.
if (isa<ConstantFPSDNode>(N0) &&
isa<ConstantFPSDNode>(N1) &&
@@ -10260,7 +10978,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
}
- if (Options.UnsafeFPMath) {
+ if (UnsafeFPMath) {
if (N0CFP && N0CFP->isZero())
return N2;
if (N1CFP && N1CFP->isZero())
@@ -10277,12 +10995,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
!isConstantFPBuildVectorOrConstantFP(N1))
return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
- // TODO: FMA nodes should have flags that propagate to the created nodes.
- // For now, create a Flags object for use with all unsafe math transforms.
- SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
-
- if (Options.UnsafeFPMath) {
+ if (UnsafeFPMath) {
// (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
isConstantFPBuildVectorOrConstantFP(N1) &&
@@ -10328,7 +11041,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
}
}
- if (Options.UnsafeFPMath) {
+ if (UnsafeFPMath) {
// (fma x, c, x) -> (fmul x, (c+1))
if (N1CFP && N0 == N2) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
@@ -10435,7 +11148,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
- if (Options.UnsafeFPMath) {
+ if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
// Compute the reciprocal 1.0 / c2.
@@ -10544,17 +11257,16 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
- if (!DAG.getTarget().Options.UnsafeFPMath)
+ SDNodeFlags Flags = N->getFlags();
+ if (!DAG.getTarget().Options.UnsafeFPMath &&
+ !Flags.hasApproximateFuncs())
return SDValue();
SDValue N0 = N->getOperand(0);
if (TLI.isFsqrtCheap(N0, DAG))
return SDValue();
- // TODO: FSQRT nodes should have flags that propagate to the created nodes.
- // For now, create a Flags object for use with all unsafe math transforms.
- SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
+ // FSQRT nodes have flags that propagate to the created nodes.
return buildSqrtEstimate(N0, Flags);
}
@@ -10622,6 +11334,41 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
return SDValue();
}
+static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // This optimization is guarded by a function attribute because it may produce
+  // unexpected results. I.e., programs may be relying on the platform-specific
+ // undefined behavior when the float-to-int conversion overflows.
+ const Function &F = DAG.getMachineFunction().getFunction();
+ Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow");
+ if (StrictOverflow.getValueAsString().equals("false"))
+ return SDValue();
+
+ // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
+ // replacing casts with a libcall. We also must be allowed to ignore -0.0
+ // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
+ // conversions would return +0.0.
+ // FIXME: We should be able to use node-level FMF here.
+ // TODO: If strict math, should we use FABS (+ range check for signed cast)?
+ EVT VT = N->getValueType(0);
+ if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
+ !DAG.getTarget().Options.NoSignedZerosFPMath)
+ return SDValue();
+
+ // fptosi/fptoui round towards zero, so converting from FP to integer and
+ // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
+ SDValue N0 = N->getOperand(0);
+ if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
+ N0.getOperand(0).getValueType() == VT)
+ return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
+
+ if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
+ N0.getOperand(0).getValueType() == VT)
+ return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
+
+ return SDValue();
+}
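A small sketch (not part of the patch) of the ftrunc equivalence that foldFPToIntToFP uses: for values that fit in the integer type, an FP -> int -> FP round trip rounds toward zero, i.e. it behaves like trunc. The -0.0 caveat mentioned above is also visible here:

#include <cassert>
#include <cmath>

int main() {
  for (float f : {-2.75f, -1.0f, -0.25f, 0.0f, 0.25f, 1.0f, 2.75f}) {
    float roundTrip = static_cast<float>(static_cast<int>(f)); // fptosi + sitofp
    assert(roundTrip == std::truncf(f));                       // same as ftrunc
  }
  // std::truncf(-0.25f) is -0.0f while the cast path gives +0.0f; they compare
  // equal but differ in sign bit, hence the no-signed-zeros requirement.
}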
+
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -10673,6 +11420,9 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
}
}
+ if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
+ return FTrunc;
+
return SDValue();
}
@@ -10712,6 +11462,9 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
}
}
+ if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
+ return FTrunc;
+
return SDValue();
}
@@ -11118,16 +11871,22 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
N1.getOperand(0), N1.getOperand(1), N2);
}
- if ((N1.hasOneUse() && N1.getOpcode() == ISD::SRL) ||
- ((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) &&
- (N1.getOperand(0).hasOneUse() &&
- N1.getOperand(0).getOpcode() == ISD::SRL))) {
- SDNode *Trunc = nullptr;
- if (N1.getOpcode() == ISD::TRUNCATE) {
- // Look pass the truncate.
- Trunc = N1.getNode();
- N1 = N1.getOperand(0);
- }
+ if (N1.hasOneUse()) {
+ if (SDValue NewN1 = rebuildSetCC(N1))
+ return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, NewN1, N2);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::rebuildSetCC(SDValue N) {
+ if (N.getOpcode() == ISD::SRL ||
+ (N.getOpcode() == ISD::TRUNCATE &&
+ (N.getOperand(0).hasOneUse() &&
+ N.getOperand(0).getOpcode() == ISD::SRL))) {
+    // Look past the truncate.
+ if (N.getOpcode() == ISD::TRUNCATE)
+ N = N.getOperand(0);
// Match this pattern so that we can generate simpler code:
//
@@ -11146,74 +11905,55 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
// This applies only when the AND constant value has one bit set and the
// SRL constant is equal to the log2 of the AND constant. The back-end is
// smart enough to convert the result into a TEST/JMP sequence.
- SDValue Op0 = N1.getOperand(0);
- SDValue Op1 = N1.getOperand(1);
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
- if (Op0.getOpcode() == ISD::AND &&
- Op1.getOpcode() == ISD::Constant) {
+ if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
SDValue AndOp1 = Op0.getOperand(1);
if (AndOp1.getOpcode() == ISD::Constant) {
const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
if (AndConst.isPowerOf2() &&
- cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
+ cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
SDLoc DL(N);
- SDValue SetCC =
- DAG.getSetCC(DL,
- getSetCCResultType(Op0.getValueType()),
- Op0, DAG.getConstant(0, DL, Op0.getValueType()),
- ISD::SETNE);
-
- SDValue NewBRCond = DAG.getNode(ISD::BRCOND, DL,
- MVT::Other, Chain, SetCC, N2);
- // Don't add the new BRCond into the worklist or else SimplifySelectCC
- // will convert it back to (X & C1) >> C2.
- CombineTo(N, NewBRCond, false);
- // Truncate is dead.
- if (Trunc)
- deleteAndRecombine(Trunc);
- // Replace the uses of SRL with SETCC
- WorklistRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
- deleteAndRecombine(N1.getNode());
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
+ Op0, DAG.getConstant(0, DL, Op0.getValueType()),
+ ISD::SETNE);
}
}
}
-
- if (Trunc)
- // Restore N1 if the above transformation doesn't match.
- N1 = N->getOperand(1);
}
// Transform br(xor(x, y)) -> br(x != y)
// Transform br(xor(xor(x,y), 1)) -> br (x == y)
- if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) {
- SDNode *TheXor = N1.getNode();
+ if (N.getOpcode() == ISD::XOR) {
+    // Because we may call this on a speculatively constructed
+    // SimplifiedSetCC node, we need to simplify this node first.
+    // Ideally this should be folded into SimplifySetCC and not
+    // here. For now, grab a handle to N so we don't lose it from
+    // replacements internal to the visit.
+ HandleSDNode XORHandle(N);
+ while (N.getOpcode() == ISD::XOR) {
+ SDValue Tmp = visitXOR(N.getNode());
+ // No simplification done.
+ if (!Tmp.getNode())
+ break;
+      // Returning N is a form of in-visit replacement that may have
+      // invalidated N; grab the value from the handle.
+ if (Tmp.getNode() == N.getNode())
+ N = XORHandle.getValue();
+ else // Node simplified. Try simplifying again.
+ N = Tmp;
+ }
+
+ if (N.getOpcode() != ISD::XOR)
+ return N;
+
+ SDNode *TheXor = N.getNode();
+
SDValue Op0 = TheXor->getOperand(0);
SDValue Op1 = TheXor->getOperand(1);
- if (Op0.getOpcode() == Op1.getOpcode()) {
- // Avoid missing important xor optimizations.
- if (SDValue Tmp = visitXOR(TheXor)) {
- if (Tmp.getNode() != TheXor) {
- DEBUG(dbgs() << "\nReplacing.8 ";
- TheXor->dump(&DAG);
- dbgs() << "\nWith: ";
- Tmp.getNode()->dump(&DAG);
- dbgs() << '\n');
- WorklistRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
- deleteAndRecombine(TheXor);
- return DAG.getNode(ISD::BRCOND, SDLoc(N),
- MVT::Other, Chain, Tmp, N2);
- }
-
- // visitXOR has changed XOR's operands or replaced the XOR completely,
- // bail out.
- return SDValue(N, 0);
- }
- }
if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
bool Equal = false;
@@ -11223,19 +11963,12 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
Equal = true;
}
- EVT SetCCVT = N1.getValueType();
+ EVT SetCCVT = N.getValueType();
if (LegalTypes)
SetCCVT = getSetCCResultType(SetCCVT);
- SDValue SetCC = DAG.getSetCC(SDLoc(TheXor),
- SetCCVT,
- Op0, Op1,
- Equal ? ISD::SETEQ : ISD::SETNE);
// Replace the uses of XOR with SETCC
- WorklistRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
- deleteAndRecombine(N1.getNode());
- return DAG.getNode(ISD::BRCOND, SDLoc(N),
- MVT::Other, Chain, SetCC, N2);
+ return DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1,
+ Equal ? ISD::SETEQ : ISD::SETNE);
}
}
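The rewrites in rebuildSetCC rest on simple identities; a sketch (not part of the patch) checks them exhaustively for booleans and a power-of-two mask:

#include <cassert>
#include <cstdint>

int main() {
  // br (xor x, y)          <=> br (setne x, y)
  // br (xor (xor x, y), 1) <=> br (seteq x, y)
  for (bool x : {false, true})
    for (bool y : {false, true}) {
      assert((x ^ y) == (x != y));
      assert(((x ^ y) ^ true) == (x == y));
    }
  // ((x & C) >> log2(C)) != 0  <=>  (x & C) != 0 when C is a power of two.
  const uint32_t C = 8; // log2(C) == 3
  for (uint32_t x = 0; x < 32; ++x)
    assert((((x & C) >> 3) != 0) == ((x & C) != 0));
}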
@@ -11467,11 +12200,8 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
BasePtr, Offset, AM);
++PreIndexedNodes;
++NodesCombined;
- DEBUG(dbgs() << "\nReplacing.4 ";
- N->dump(&DAG);
- dbgs() << "\nWith: ";
- Result.getNode()->dump(&DAG);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
+ Result.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
if (isLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
@@ -11636,11 +12366,9 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
BasePtr, Offset, AM);
++PostIndexedNodes;
++NodesCombined;
- DEBUG(dbgs() << "\nReplacing.5 ";
- N->dump(&DAG);
- dbgs() << "\nWith: ";
- Result.getNode()->dump(&DAG);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
+ dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
+ dbgs() << '\n');
WorklistRemover DeadNodes(*this);
if (isLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
@@ -11664,7 +12392,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
return false;
}
-/// \brief Return the base-pointer arithmetic from an indexed \p LD.
+/// Return the base-pointer arithmetic from an indexed \p LD.
SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
ISD::MemIndexedMode AM = LD->getAddressingMode();
assert(AM != ISD::UNINDEXED);
@@ -11706,11 +12434,9 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
// v3 = add v2, c
// Now we replace use of chain2 with chain1. This makes the second load
// isomorphic to the one we are deleting, and thus makes this load live.
- DEBUG(dbgs() << "\nReplacing.6 ";
- N->dump(&DAG);
- dbgs() << "\nWith chain: ";
- Chain.getNode()->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
+ dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG);
+ dbgs() << "\n");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
AddUsersToWorklist(Chain.getNode());
@@ -11741,11 +12467,9 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
AddUsersToWorklist(N);
} else
Index = DAG.getUNDEF(N->getValueType(1));
- DEBUG(dbgs() << "\nReplacing.7 ";
- N->dump(&DAG);
- dbgs() << "\nWith: ";
- Undef.getNode()->dump(&DAG);
- dbgs() << " and 2 other values\n");
+ LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
+ dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG);
+ dbgs() << " and 2 other values\n");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
@@ -11773,13 +12497,14 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
// Try to infer better alignment information than the load already has.
if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
- if (Align > LD->getMemOperand()->getBaseAlignment()) {
+ if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) {
SDValue NewLoad = DAG.getExtLoad(
LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
LD->getPointerInfo(), LD->getMemoryVT(), Align,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
- if (NewLoad.getNode() != N)
- return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
+        // NewLoad will always be N, as we are only refining the alignment.
+ assert(NewLoad.getNode() == N);
+ (void)NewLoad;
}
}
}
@@ -11826,7 +12551,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
namespace {
-/// \brief Helper structure used to slice a load in smaller loads.
+/// Helper structure used to slice a load in smaller loads.
/// Basically a slice is obtained from the following sequence:
/// Origin = load Ty1, Base
/// Shift = srl Ty1 Origin, CstTy Amount
@@ -11839,7 +12564,7 @@ namespace {
/// SliceTy is deduced from the number of bits that are actually used to
/// build Inst.
struct LoadedSlice {
- /// \brief Helper structure used to compute the cost of a slice.
+ /// Helper structure used to compute the cost of a slice.
struct Cost {
/// Are we optimizing for code size.
bool ForCodeSize;
@@ -11853,7 +12578,7 @@ struct LoadedSlice {
Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {}
- /// \brief Get the cost of one isolated slice.
+ /// Get the cost of one isolated slice.
Cost(const LoadedSlice &LS, bool ForCodeSize = false)
: ForCodeSize(ForCodeSize), Loads(1) {
EVT TruncType = LS.Inst->getValueType(0);
@@ -11863,7 +12588,7 @@ struct LoadedSlice {
ZExts = 1;
}
- /// \brief Account for slicing gain in the current cost.
+ /// Account for slicing gain in the current cost.
    /// Slicing provides a few gains, like removing a shift or a
    /// truncate. This method grows the cost of the original
    /// load by the gain from this slice.
@@ -11936,7 +12661,7 @@ struct LoadedSlice {
unsigned Shift = 0, SelectionDAG *DAG = nullptr)
: Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
- /// \brief Get the bits used in a chunk of bits \p BitWidth large.
+ /// Get the bits used in a chunk of bits \p BitWidth large.
/// \return Result is \p BitWidth and has used bits set to 1 and
/// not used bits set to 0.
APInt getUsedBits() const {
@@ -11956,14 +12681,14 @@ struct LoadedSlice {
return UsedBits;
}
- /// \brief Get the size of the slice to be loaded in bytes.
+ /// Get the size of the slice to be loaded in bytes.
unsigned getLoadedSize() const {
unsigned SliceSize = getUsedBits().countPopulation();
assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
return SliceSize / 8;
}
- /// \brief Get the type that will be loaded for this slice.
+ /// Get the type that will be loaded for this slice.
/// Note: This may not be the final type for the slice.
EVT getLoadedType() const {
assert(DAG && "Missing context");
@@ -11971,7 +12696,7 @@ struct LoadedSlice {
return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
}
- /// \brief Get the alignment of the load used for this slice.
+ /// Get the alignment of the load used for this slice.
unsigned getAlignment() const {
unsigned Alignment = Origin->getAlignment();
unsigned Offset = getOffsetFromBase();
@@ -11980,7 +12705,7 @@ struct LoadedSlice {
return Alignment;
}
- /// \brief Check if this slice can be rewritten with legal operations.
+ /// Check if this slice can be rewritten with legal operations.
bool isLegal() const {
// An invalid slice is not legal.
if (!Origin || !Inst || !DAG)
@@ -12024,7 +12749,7 @@ struct LoadedSlice {
return true;
}
- /// \brief Get the offset in bytes of this slice in the original chunk of
+ /// Get the offset in bytes of this slice in the original chunk of
/// bits.
/// \pre DAG != nullptr.
uint64_t getOffsetFromBase() const {
@@ -12045,7 +12770,7 @@ struct LoadedSlice {
return Offset;
}
- /// \brief Generate the sequence of instructions to load the slice
+ /// Generate the sequence of instructions to load the slice
/// represented by this object and redirect the uses of this slice to
/// this new sequence of instructions.
/// \pre this->Inst && this->Origin are valid Instructions and this
@@ -12083,7 +12808,7 @@ struct LoadedSlice {
return LastInst;
}
- /// \brief Check if this slice can be merged with an expensive cross register
+ /// Check if this slice can be merged with an expensive cross register
/// bank copy. E.g.,
/// i = load i32
/// f = bitcast i32 i to float
@@ -12132,7 +12857,7 @@ struct LoadedSlice {
} // end anonymous namespace
-/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
+/// Check that all bits set in \p UsedBits form a dense region, i.e.,
/// \p UsedBits looks like 0..0 1..1 0..0.
static bool areUsedBitsDense(const APInt &UsedBits) {
// If all the bits are one, this is dense!
@@ -12148,7 +12873,7 @@ static bool areUsedBitsDense(const APInt &UsedBits) {
return NarrowedUsedBits.isAllOnesValue();
}
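A sketch (not part of the patch) of what "dense" used bits means here: one contiguous run of ones, so after dropping the trailing zeros the value must have the form 2^k - 1:

#include <cassert>
#include <cstdint>

static bool areUsedBitsDenseSketch(uint32_t UsedBits) {
  if (UsedBits == 0)
    return false;            // nothing used at all
  while ((UsedBits & 1) == 0)
    UsedBits >>= 1;          // strip trailing zeros
  return (UsedBits & (UsedBits + 1)) == 0; // 2^k - 1 has no holes
}

int main() {
  assert(areUsedBitsDenseSketch(0x00FF0000));  // one contiguous byte
  assert(areUsedBitsDenseSketch(0xFFFFFFFF));  // all bits
  assert(!areUsedBitsDenseSketch(0x00F00F00)); // hole in the middle
}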
-/// \brief Check whether or not \p First and \p Second are next to each other
+/// Check whether or not \p First and \p Second are next to each other
/// in memory. This means that there is no hole between the bits loaded
/// by \p First and the bits loaded by \p Second.
static bool areSlicesNextToEachOther(const LoadedSlice &First,
@@ -12162,7 +12887,7 @@ static bool areSlicesNextToEachOther(const LoadedSlice &First,
return areUsedBitsDense(UsedBits);
}
-/// \brief Adjust the \p GlobalLSCost according to the target
+/// Adjust the \p GlobalLSCost according to the target
/// pairing capabilities and the layout of the slices.
/// \pre \p GlobalLSCost should account for at least as many loads as
/// there are in the slices in \p LoadedSlices.
@@ -12175,8 +12900,8 @@ static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
// Sort the slices so that elements that are likely to be next to each
// other in memory are next to each other in the list.
- std::sort(LoadedSlices.begin(), LoadedSlices.end(),
- [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
+ llvm::sort(LoadedSlices.begin(), LoadedSlices.end(),
+ [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
});
@@ -12223,7 +12948,7 @@ static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
}
}
-/// \brief Check the profitability of all involved LoadedSlice.
+/// Check the profitability of all involved LoadedSlice.
/// Currently, it is considered profitable if there are exactly two
/// involved slices (1) which are (2) next to each other in memory, and
/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
@@ -12267,7 +12992,7 @@ static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
return OrigCost > GlobalSlicingCost;
}
-/// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
+/// If the given load, \p LI, is used only by trunc or trunc(lshr)
/// operations, split it in the various pieces being extracted.
///
/// This sort of thing is introduced by SROA.
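For a rough picture of the pattern being targeted (an illustrative sketch with
assumed types and a little-endian layout, not code from the patch): SROA can
leave one wide load whose halves are peeled off with trunc and trunc(lshr),
and slicing rewrites each half as its own narrow load.

#include <cstdint>

static uint32_t lowHalf(const uint64_t *P) {
  uint64_t Wide = *P;               // one i64 load
  return (uint32_t)Wide;            // trunc          -> slice at byte offset 0
}
static uint32_t highHalf(const uint64_t *P) {
  uint64_t Wide = *P;               // one i64 load
  return (uint32_t)(Wide >> 32);    // trunc(lshr)    -> slice at byte offset 4
}
// After slicing, each function amounts to a single 32-bit load of the
// corresponding half, avoiding the wide load and the shift.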
@@ -12386,22 +13111,6 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer.
- // The store should be chained directly to the load or be an operand of a
- // tokenfactor.
- if (LD == Chain.getNode())
- ; // ok.
- else if (Chain->getOpcode() != ISD::TokenFactor)
- return Result; // Fail.
- else {
- bool isOk = false;
- for (const SDValue &ChainOp : Chain->op_values())
- if (ChainOp.getNode() == LD) {
- isOk = true;
- break;
- }
- if (!isOk) return Result;
- }
-
// This only handles simple types.
if (V.getValueType() != MVT::i16 &&
V.getValueType() != MVT::i32 &&
@@ -12438,6 +13147,24 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
// is aligned the same as the access width.
if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
+ // For narrowing to be valid, it must be the case that the load is the
+ // immediately preceding memory operation before the store.
+ if (LD == Chain.getNode())
+ ; // ok.
+ else if (Chain->getOpcode() == ISD::TokenFactor &&
+ SDValue(LD, 1).hasOneUse()) {
+ // LD has only 1 chain use, so there are no indirect dependencies.
+ bool isOk = false;
+ for (const SDValue &ChainOp : Chain->op_values())
+ if (ChainOp.getNode() == LD) {
+ isOk = true;
+ break;
+ }
+ if (!isOk)
+ return Result;
+ } else
+ return Result; // Fail.
+
Result.first = MaskedBytes;
Result.second = NotMaskTZ/8;
return Result;
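The relocated check reads as the following small predicate (an illustrative
sketch that assumes the surrounding SelectionDAG types; the patch keeps the
logic inline): the load must be the store's direct chain predecessor, either
immediately or as one operand of a TokenFactor while the load's chain result
has a single use.

static bool loadImmediatelyPrecedesStore(LoadSDNode *LD, SDValue Chain) {
  if (LD == Chain.getNode())
    return true;                             // store chained directly to the load
  if (Chain->getOpcode() != ISD::TokenFactor || !SDValue(LD, 1).hasOneUse())
    return false;                            // anything else may hide a dependency
  for (const SDValue &ChainOp : Chain->op_values())
    if (ChainOp.getNode() == LD)
      return true;                           // load feeds the TokenFactor directly
  return false;
}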
@@ -12756,12 +13483,6 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
return false;
}
-static SDValue peekThroughBitcast(SDValue V) {
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
- return V;
-}
-
SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumStores) {
SmallVector<SDValue, 8> Chains;
@@ -12886,6 +13607,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
SDValue Val = St->getValue();
+ Val = peekThroughBitcast(Val);
StoreInt <<= ElementSizeBits;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
StoreInt |= C->getAPIntValue()
@@ -12918,13 +13640,13 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
FirstInChain->getPointerInfo(),
FirstInChain->getAlignment());
} else { // Must be realized as a trunc store
- EVT LegalizedStoredValueTy =
+ EVT LegalizedStoredValTy =
TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
- unsigned LegalizedStoreSize = LegalizedStoredValueTy.getSizeInBits();
+ unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
SDValue ExtendedStoreVal =
DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
- LegalizedStoredValueTy);
+ LegalizedStoredValTy);
NewStore = DAG.getTruncStore(
NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
@@ -12941,7 +13663,8 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
}
void DAGCombiner::getStoreMergeCandidates(
- StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
+ StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
+ SDNode *&RootNode) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
@@ -12970,6 +13693,12 @@ void DAGCombiner::getStoreMergeCandidates(
// Load and store should be the same type.
if (MemVT != LoadVT)
return;
+ // Loads must only have one use.
+ if (!Ld->hasNUsesOfValue(1, 0))
+ return;
+ // The memory operands must not be volatile.
+ if (Ld->isVolatile() || Ld->isIndexed())
+ return;
}
auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
int64_t &Offset) -> bool {
@@ -12987,6 +13716,12 @@ void DAGCombiner::getStoreMergeCandidates(
auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
if (LoadVT != OtherLd->getMemoryVT())
return false;
+ // Loads must only have one use.
+ if (!OtherLd->hasNUsesOfValue(1, 0))
+ return false;
+ // The memory operands must not be volatile.
+ if (OtherLd->isVolatile() || OtherLd->isIndexed())
+ return false;
if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
return false;
} else
@@ -13028,7 +13763,7 @@ void DAGCombiner::getStoreMergeCandidates(
// FIXME: We should be able to climb and
// descend TokenFactors to find candidates as well.
- SDNode *RootNode = (St->getChain()).getNode();
+ RootNode = St->getChain().getNode();
if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
RootNode = Ldn->getChain().getNode();
@@ -13059,31 +13794,54 @@ void DAGCombiner::getStoreMergeCandidates(
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
- SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
+ SDNode *RootNode) {
// FIXME: We should be able to truncate a full search of
// predecessors by doing a BFS and keeping tabs on the originating
// stores from which worklist nodes come, in a similar way to
// TokenFactor simplification.
- SmallPtrSet<const SDNode *, 16> Visited;
+ SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 8> Worklist;
- unsigned int Max = 8192;
+
+ // RootNode is a predecessor to all candidates so we need not search
+ // past it. Add RootNode (peeking through TokenFactors). Do not count
+ // these towards the size check.
+
+ Worklist.push_back(RootNode);
+ while (!Worklist.empty()) {
+ auto N = Worklist.pop_back_val();
+ if (N->getOpcode() == ISD::TokenFactor) {
+ for (SDValue Op : N->ops())
+ Worklist.push_back(Op.getNode());
+ }
+ Visited.insert(N);
+ }
+
+ // Don't count pruning nodes towards max.
+ unsigned int Max = 1024 + Visited.size();
// Search Ops of store candidates.
for (unsigned i = 0; i < NumStores; ++i) {
- SDNode *n = StoreNodes[i].MemNode;
- // Potential loops may happen only through non-chain operands
- for (unsigned j = 1; j < n->getNumOperands(); ++j)
- Worklist.push_back(n->getOperand(j).getNode());
+ SDNode *N = StoreNodes[i].MemNode;
+ // Of the 4 Store Operands:
+ // * Chain (Op 0) -> We have already considered these
+ // in candidate selection and can be
+ // safely ignored
+ // * Value (Op 1) -> Cycles may happen (e.g. through load chains)
+ // * Address (Op 2) -> Merged addresses may only vary by a fixed constant
+ // and so no cycles are possible.
+ // * (Op 3) -> appears to always be undef. Cannot be source of cycle.
+ //
+ // Thus we need only check predecessors of the value operands.
+ auto *Op = N->getOperand(1).getNode();
+ if (Visited.insert(Op).second)
+ Worklist.push_back(Op);
}
// Search through DAG. We can stop early if we find a store node.
- for (unsigned i = 0; i < NumStores; ++i) {
+ for (unsigned i = 0; i < NumStores; ++i)
if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
Max))
return false;
- // Check if we ended early, failing conservatively if so.
- if (Visited.size() >= Max)
- return false;
- }
return true;
}
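The same pruned reachability idea in a self-contained toy-graph form
(illustrative only; the containers and names here are not LLVM API):
pre-visit the shared root so the walk never continues past it, seed the
worklist with each candidate's value input, and fail if the walk reaches any
candidate store.

#include <queue>
#include <set>
#include <vector>

// Toy graph: Preds[N] lists the predecessor (operand) nodes of node N.
using Graph = std::vector<std::vector<int>>;

static bool safeToMerge(const Graph &Preds, const std::vector<int> &Candidates,
                        const std::vector<int> &ValueInputs, int Root) {
  std::set<int> Visited = {Root};            // pre-visit the root: prune the search
  std::set<int> CandSet(Candidates.begin(), Candidates.end());
  std::queue<int> Worklist;
  for (int V : ValueInputs)
    if (Visited.insert(V).second)
      Worklist.push(V);
  while (!Worklist.empty()) {
    int N = Worklist.front();
    Worklist.pop();
    if (CandSet.count(N))
      return false;                          // a candidate feeds another's value: cycle
    for (int P : Preds[N])
      if (Visited.insert(P).second)
        Worklist.push(P);
  }
  return true;
}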
@@ -13121,8 +13879,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
return false;
SmallVector<MemOpLink, 8> StoreNodes;
+ SDNode *RootNode;
// Find potential store merge candidates by searching through chain sub-DAG
- getStoreMergeCandidates(St, StoreNodes);
+ getStoreMergeCandidates(St, StoreNodes, RootNode);
// Check if there is anything to merge.
if (StoreNodes.size() < 2)
@@ -13130,10 +13889,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
// Sort the memory operands according to their distance from the
// base pointer.
- std::sort(StoreNodes.begin(), StoreNodes.end(),
- [](MemOpLink LHS, MemOpLink RHS) {
- return LHS.OffsetFromBase < RHS.OffsetFromBase;
- });
+ llvm::sort(StoreNodes.begin(), StoreNodes.end(),
+ [](MemOpLink LHS, MemOpLink RHS) {
+ return LHS.OffsetFromBase < RHS.OffsetFromBase;
+ });
// Store Merge attempts to merge the lowest stores. This generally
// works out as if successful, as the remaining stores are checked
@@ -13177,178 +13936,191 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
continue;
}
- // Check that we can merge these candidates without causing a cycle
- if (!checkMergeStoreCandidatesForDependencies(StoreNodes,
- NumConsecutiveStores)) {
- StoreNodes.erase(StoreNodes.begin(),
- StoreNodes.begin() + NumConsecutiveStores);
- continue;
- }
-
// The node with the lowest store address.
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
// Store the constants into memory as one consecutive store.
if (IsConstantSrc) {
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- unsigned LastLegalType = 1;
- unsigned LastLegalVectorType = 1;
- bool LastIntegerTrunc = false;
- bool NonZero = false;
- unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
- for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
- StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
- SDValue StoredVal = ST->getValue();
- bool IsElementZero = false;
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
- IsElementZero = C->isNullValue();
- else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
- IsElementZero = C->getConstantFPValue()->isNullValue();
- if (IsElementZero) {
- if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
- FirstZeroAfterNonZero = i;
- }
- NonZero |= !IsElementZero;
+ while (NumConsecutiveStores >= 2) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ unsigned LastLegalType = 1;
+ unsigned LastLegalVectorType = 1;
+ bool LastIntegerTrunc = false;
+ bool NonZero = false;
+ unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue StoredVal = ST->getValue();
+ bool IsElementZero = false;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
+ IsElementZero = C->isNullValue();
+ else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
+ IsElementZero = C->getConstantFPValue()->isNullValue();
+ if (IsElementZero) {
+ if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
+ FirstZeroAfterNonZero = i;
+ }
+ NonZero |= !IsElementZero;
- // Find a legal type for the constant store.
- unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
- EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- bool IsFast = false;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFast) &&
- IsFast) {
- LastIntegerTrunc = false;
- LastLegalType = i + 1;
- // Or check whether a truncstore is legal.
- } else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValueTy =
- TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
- if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
+ // Find a legal type for the constant store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ bool IsFast = false;
+
+ // Break early when size is too large to be legal.
+ if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
+
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
FirstStoreAlign, &IsFast) &&
IsFast) {
- LastIntegerTrunc = true;
+ LastIntegerTrunc = false;
LastLegalType = i + 1;
+ // Or check whether a truncstore is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValTy =
+ TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
+ if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
+ IsFast) {
+ LastIntegerTrunc = true;
+ LastLegalType = i + 1;
+ }
}
- }
- // We only use vectors if the constant is known to be zero or the target
- // allows it and the function is not marked with the noimplicitfloat
- // attribute.
- if ((!NonZero ||
- TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
- !NoVectors) {
- // Find a legal type for the vector store.
- unsigned Elts = (i + 1) * NumMemElts;
- EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
- if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
- TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
- FirstStoreAlign, &IsFast) &&
- IsFast)
- LastLegalVectorType = i + 1;
+ // We only use vectors if the constant is known to be zero or the
+ // target allows it and the function is not marked with the
+ // noimplicitfloat attribute.
+ if ((!NonZero ||
+ TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
+ !NoVectors) {
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
+ IsFast)
+ LastLegalVectorType = i + 1;
+ }
}
- }
- bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
- unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
+ bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
+ unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
+
+ // Check if we found a legal integer type that creates a meaningful
+ // merge.
+ if (NumElem < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have, is if the alignment has
+ // improved or we've dropped a non-zero value. Drop as many
+ // candidates as we can here.
+ unsigned NumSkip = 1;
+ while (
+ (NumSkip < NumConsecutiveStores) &&
+ (NumSkip < FirstZeroAfterNonZero) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
- // Check if we found a legal integer type that creates a meaningful merge.
- if (NumElem < 2) {
- // We know that candidate stores are in order and of correct
- // shape. While there is no mergeable sequence from the
- // beginning one may start later in the sequence. The only
- // reason a merge of size N could have failed where another of
- // the same size would not have, is if the alignment has
- // improved or we've dropped a non-zero value. Drop as many
- // candidates as we can here.
- unsigned NumSkip = 1;
- while (
- (NumSkip < NumConsecutiveStores) &&
- (NumSkip < FirstZeroAfterNonZero) &&
- (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) {
- NumSkip++;
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ continue;
}
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
- continue;
- }
- bool Merged = MergeStoresOfConstantsOrVecElts(
- StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
- RV |= Merged;
+ RV |= MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, true,
+ UseVector, LastIntegerTrunc);
- // Remove merged stores for next iteration.
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ // Remove merged stores for next iteration.
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ }
continue;
}
// When extracting multiple vector elements, try to store them
// in one vector store rather than a sequence of scalar stores.
if (IsExtractVecSrc) {
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- unsigned NumStoresToMerge = 1;
- for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- SDValue StVal = peekThroughBitcast(St->getValue());
- // This restriction could be loosened.
- // Bail out if any stored values are not elements extracted from a
- // vector. It should be possible to handle mixed sources, but load
- // sources need more careful handling (see the block of code below that
- // handles consecutive loads).
- if (StVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
- StVal.getOpcode() != ISD::EXTRACT_SUBVECTOR)
- return RV;
+ // Loop on Consecutive Stores on success.
+ while (NumConsecutiveStores >= 2) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ unsigned NumStoresToMerge = 1;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT Ty =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ bool IsFast;
- // Find a legal type for the vector store.
- unsigned Elts = (i + 1) * NumMemElts;
- EVT Ty =
- EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
- bool IsFast;
- if (TLI.isTypeLegal(Ty) &&
- TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
- FirstStoreAlign, &IsFast) &&
- IsFast)
- NumStoresToMerge = i + 1;
- }
+ // Break early when size is too large to be legal.
+ if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
- // Check if we found a legal integer type that creates a meaningful merge.
- if (NumStoresToMerge < 2) {
- // We know that candidate stores are in order and of correct
- // shape. While there is no mergeable sequence from the
- // beginning one may start later in the sequence. The only
- // reason a merge of size N could have failed where another of
- // the same size would not have, is if the alignment has
- // improved. Drop as many candidates as we can here.
- unsigned NumSkip = 1;
- while ((NumSkip < NumConsecutiveStores) &&
- (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
- NumSkip++;
+ if (TLI.isTypeLegal(Ty) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
+ IsFast)
+ NumStoresToMerge = i + 1;
+ }
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
- continue;
- }
+ // Check if we found a legal integer type creating a meaningful
+ // merge.
+ if (NumStoresToMerge < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have, is if the alignment has
+ // improved. Drop as many candidates as we can here.
+ unsigned NumSkip = 1;
+ while (
+ (NumSkip < NumConsecutiveStores) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
+
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(
+ StoreNodes, NumStoresToMerge, RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumStoresToMerge);
+ NumConsecutiveStores -= NumStoresToMerge;
+ continue;
+ }
+
+ RV |= MergeStoresOfConstantsOrVecElts(
+ StoreNodes, MemVT, NumStoresToMerge, false, true, false);
- bool Merged = MergeStoresOfConstantsOrVecElts(
- StoreNodes, MemVT, NumStoresToMerge, false, true, false);
- if (!Merged) {
StoreNodes.erase(StoreNodes.begin(),
StoreNodes.begin() + NumStoresToMerge);
- continue;
+ NumConsecutiveStores -= NumStoresToMerge;
}
- // Remove merged stores for next iteration.
- StoreNodes.erase(StoreNodes.begin(),
- StoreNodes.begin() + NumStoresToMerge);
- RV = true;
continue;
}
@@ -13362,24 +14134,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
// Find acceptable loads. Loads need to have the same chain (token factor),
// must not be zext, volatile, indexed, and they must be consecutive.
BaseIndexOffset LdBasePtr;
+
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
SDValue Val = peekThroughBitcast(St->getValue());
- LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val);
- if (!Ld)
- break;
-
- // Loads must only have one use.
- if (!Ld->hasNUsesOfValue(1, 0))
- break;
-
- // The memory operands must not be volatile.
- if (Ld->isVolatile() || Ld->isIndexed())
- break;
-
- // The stored memory type must be the same.
- if (Ld->getMemoryVT() != MemVT)
- break;
+ LoadSDNode *Ld = cast<LoadSDNode>(Val);
BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
// If this is not the first ptr that we check.
@@ -13397,90 +14156,75 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
LoadNodes.push_back(MemOpLink(Ld, LdOffset));
}
- if (LoadNodes.size() < 2) {
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
- continue;
- }
+ while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
+ // If we have load/store pair instructions and we only have two values,
+ // don't bother merging.
+ unsigned RequiredAlignment;
+ if (LoadNodes.size() == 2 &&
+ TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
+ StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
+ break;
+ }
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+ unsigned FirstLoadAS = FirstLoad->getAddressSpace();
+ unsigned FirstLoadAlign = FirstLoad->getAlignment();
- // If we have load/store pair instructions and we only have two values,
- // don't bother merging.
- unsigned RequiredAlignment;
- if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
- StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) {
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
- continue;
- }
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
- unsigned FirstLoadAS = FirstLoad->getAddressSpace();
- unsigned FirstLoadAlign = FirstLoad->getAlignment();
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive load memory address. These variables hold the index in
+ // the store node array.
- // Scan the memory operations on the chain and find the first
- // non-consecutive load memory address. These variables hold the index in
- // the store node array.
- unsigned LastConsecutiveLoad = 1;
- // This variable refers to the size and not index in the array.
- unsigned LastLegalVectorType = 1;
- unsigned LastLegalIntegerType = 1;
- bool isDereferenceable = true;
- bool DoIntegerTruncate = false;
- StartAddress = LoadNodes[0].OffsetFromBase;
- SDValue FirstChain = FirstLoad->getChain();
- for (unsigned i = 1; i < LoadNodes.size(); ++i) {
- // All loads must share the same chain.
- if (LoadNodes[i].MemNode->getChain() != FirstChain)
- break;
+ unsigned LastConsecutiveLoad = 1;
- int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
- if (CurrAddress - StartAddress != (ElementSizeBytes * i))
- break;
- LastConsecutiveLoad = i;
-
- if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
- isDereferenceable = false;
-
- // Find a legal type for the vector store.
- unsigned Elts = (i + 1) * NumMemElts;
- EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
-
- bool IsFastSt, IsFastLd;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFastSt) &&
- IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
- FirstLoadAlign, &IsFastLd) &&
- IsFastLd) {
- LastLegalVectorType = i + 1;
- }
+ // This variable refers to the size and not index in the array.
+ unsigned LastLegalVectorType = 1;
+ unsigned LastLegalIntegerType = 1;
+ bool isDereferenceable = true;
+ bool DoIntegerTruncate = false;
+ StartAddress = LoadNodes[0].OffsetFromBase;
+ SDValue FirstChain = FirstLoad->getChain();
+ for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+ // All loads must share the same chain.
+ if (LoadNodes[i].MemNode->getChain() != FirstChain)
+ break;
+
+ int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ LastConsecutiveLoad = i;
+
+ if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
+ isDereferenceable = false;
+
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+
+ // Break early when size is too large to be legal.
+ if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
+
+ bool IsFastSt, IsFastLd;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) &&
+ IsFastLd) {
+ LastLegalVectorType = i + 1;
+ }
- // Find a legal type for the integer store.
- unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
- StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFastSt) &&
- IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
- FirstLoadAlign, &IsFastLd) &&
- IsFastLd) {
- LastLegalIntegerType = i + 1;
- DoIntegerTruncate = false;
- // Or check whether a truncstore and extload is legal.
- } else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy);
- if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy,
- StoreTy) &&
- TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,
- StoreTy) &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+ // Find a legal type for the integer store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
FirstStoreAlign, &IsFastSt) &&
IsFastSt &&
@@ -13488,105 +14232,140 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
FirstLoadAlign, &IsFastLd) &&
IsFastLd) {
LastLegalIntegerType = i + 1;
- DoIntegerTruncate = true;
+ DoIntegerTruncate = false;
+ // Or check whether a truncstore and extload is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
+ if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy,
+ StoreTy) &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy,
+ StoreTy) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) &&
+ IsFastLd) {
+ LastLegalIntegerType = i + 1;
+ DoIntegerTruncate = true;
+ }
}
}
- }
- // Only use vector types if the vector type is larger than the integer type.
- // If they are the same, use integers.
- bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
- unsigned LastLegalType =
- std::max(LastLegalVectorType, LastLegalIntegerType);
-
- // We add +1 here because the LastXXX variables refer to location while
- // the NumElem refers to array/index size.
- unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
- NumElem = std::min(LastLegalType, NumElem);
-
- if (NumElem < 2) {
- // We know that candidate stores are in order and of correct
- // shape. While there is no mergeable sequence from the
- // beginning one may start later in the sequence. The only
- // reason a merge of size N could have failed where another of
- // the same size would not have is if the alignment or either
- // the load or store has improved. Drop as many candidates as we
- // can here.
- unsigned NumSkip = 1;
- while ((NumSkip < LoadNodes.size()) &&
- (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
- (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
- NumSkip++;
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
- continue;
- }
+ // Only use vector types if the vector type is larger than the integer
+ // type. If they are the same, use integers.
+ bool UseVectorTy =
+ LastLegalVectorType > LastLegalIntegerType && !NoVectors;
+ unsigned LastLegalType =
+ std::max(LastLegalVectorType, LastLegalIntegerType);
- // Find if it is better to use vectors or integers to load and store
- // to memory.
- EVT JointMemOpVT;
- if (UseVectorTy) {
- // Find a legal type for the vector store.
- unsigned Elts = NumElem * NumMemElts;
- JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
- } else {
- unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
- JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
- }
-
- SDLoc LoadDL(LoadNodes[0].MemNode);
- SDLoc StoreDL(StoreNodes[0].MemNode);
-
- // The merged loads are required to have the same incoming chain, so
- // using the first's chain is acceptable.
-
- SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
- AddToWorklist(NewStoreChain.getNode());
-
- MachineMemOperand::Flags MMOFlags = isDereferenceable ?
- MachineMemOperand::MODereferenceable:
- MachineMemOperand::MONone;
-
- SDValue NewLoad, NewStore;
- if (UseVectorTy || !DoIntegerTruncate) {
- NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
- FirstLoad->getBasePtr(),
- FirstLoad->getPointerInfo(), FirstLoadAlign,
- MMOFlags);
- NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad,
- FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), FirstStoreAlign);
- } else { // This must be the truncstore/extload case
- EVT ExtendedTy =
- TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
- NewLoad =
- DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(),
- FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
- JointMemOpVT, FirstLoadAlign, MMOFlags);
- NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
- FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), JointMemOpVT,
- FirstInChain->getAlignment(),
- FirstInChain->getMemOperand()->getFlags());
- }
-
- // Transfer chain users from old loads to the new load.
- for (unsigned i = 0; i < NumElem; ++i) {
- LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
- SDValue(NewLoad.getNode(), 1));
- }
-
- // Replace the all stores with the new store. Recursively remove
- // corresponding value if its no longer used.
- for (unsigned i = 0; i < NumElem; ++i) {
- SDValue Val = StoreNodes[i].MemNode->getOperand(1);
- CombineTo(StoreNodes[i].MemNode, NewStore);
- if (Val.getNode()->use_empty())
- recursivelyDeleteUnusedNodes(Val.getNode());
- }
-
- RV = true;
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ // We add +1 here because the LastXXX variables refer to location while
+ // the NumElem refers to array/index size.
+ unsigned NumElem =
+ std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
+ NumElem = std::min(LastLegalType, NumElem);
+
+ if (NumElem < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have is if the alignment of either
+ // the load or store has improved. Drop as many candidates as we
+ // can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < LoadNodes.size()) &&
+ (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
+
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ continue;
+ }
+
+ // Find if it is better to use vectors or integers to load and store
+ // to memory.
+ EVT JointMemOpVT;
+ if (UseVectorTy) {
+ // Find a legal type for the vector store.
+ unsigned Elts = NumElem * NumMemElts;
+ JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ } else {
+ unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
+ JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
+ }
+
+ SDLoc LoadDL(LoadNodes[0].MemNode);
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ // The merged loads are required to have the same incoming chain, so
+ // using the first's chain is acceptable.
+
+ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
+ AddToWorklist(NewStoreChain.getNode());
+
+ MachineMemOperand::Flags MMOFlags =
+ isDereferenceable ? MachineMemOperand::MODereferenceable
+ : MachineMemOperand::MONone;
+
+ SDValue NewLoad, NewStore;
+ if (UseVectorTy || !DoIntegerTruncate) {
+ NewLoad =
+ DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
+ FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
+ FirstLoadAlign, MMOFlags);
+ NewStore = DAG.getStore(
+ NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), FirstStoreAlign);
+ } else { // This must be the truncstore/extload case
+ EVT ExtendedTy =
+ TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
+ NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
+ FirstLoad->getChain(), FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), JointMemOpVT,
+ FirstLoadAlign, MMOFlags);
+ NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
+ FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(),
+ JointMemOpVT, FirstInChain->getAlignment(),
+ FirstInChain->getMemOperand()->getFlags());
+ }
+
+ // Transfer chain users from old loads to the new load.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+ SDValue(NewLoad.getNode(), 1));
+ }
+
+ // Replace all the stores with the new store. Recursively remove the
+ // corresponding value if it's no longer used.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ SDValue Val = StoreNodes[i].MemNode->getOperand(1);
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ if (Val.getNode()->use_empty())
+ recursivelyDeleteUnusedNodes(Val.getNode());
+ }
+
+ RV = true;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ }
}
return RV;
}
@@ -13728,13 +14507,14 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// Try to infer better alignment information than the store already has.
if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
- if (Align > ST->getAlignment()) {
+ if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) {
SDValue NewStore =
DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
ST->getMemoryVT(), Align,
ST->getMemOperand()->getFlags(), ST->getAAInfo());
- if (NewStore.getNode() != N)
- return CombineTo(ST, NewStore, true);
+ // NewStore will always be N as we are only refining the alignment
+ assert(NewStore.getNode() == N);
+ (void)NewStore;
}
}
}
@@ -14216,6 +14996,10 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
SDValue EltNo = N->getOperand(1);
ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ // extract_vector_elt of out-of-bounds element -> UNDEF
+ if (ConstEltNo && ConstEltNo->getAPIntValue().uge(VT.getVectorNumElements()))
+ return DAG.getUNDEF(NVT);
+
// extract_vector_elt (build_vector x, y), 1 -> y
if (ConstEltNo &&
InVec.getOpcode() == ISD::BUILD_VECTOR &&
@@ -14301,6 +15085,23 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
}
}
+ // If only EXTRACT_VECTOR_ELT nodes use the source vector we can
+ // simplify it based on the (valid) extraction indices.
+ if (llvm::all_of(InVec->uses(), [&](SDNode *Use) {
+ return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Use->getOperand(0) == InVec &&
+ isa<ConstantSDNode>(Use->getOperand(1));
+ })) {
+ APInt DemandedElts = APInt::getNullValue(VT.getVectorNumElements());
+ for (SDNode *Use : InVec->uses()) {
+ auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
+ if (CstElt->getAPIntValue().ult(VT.getVectorNumElements()))
+ DemandedElts.setBit(CstElt->getZExtValue());
+ }
+ if (SimplifyDemandedVectorElts(InVec, DemandedElts, true))
+ return SDValue(N, 0);
+ }
+
bool BCNumEltsChanged = false;
EVT ExtVT = VT.getVectorElementType();
EVT LVT = ExtVT;
@@ -14507,7 +15308,10 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
"Invalid vector size");
// Check if the new vector type is legal.
- if (!isTypeLegal(VecVT)) return SDValue();
+ if (!isTypeLegal(VecVT) ||
+ (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
+ TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
+ return SDValue();
// Make the new BUILD_VECTOR.
SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
@@ -14754,12 +15558,16 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
}
// Not an undef or zero. If the input is something other than an
- // EXTRACT_VECTOR_ELT with a constant index, bail out.
+ // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
SDValue ExtractedFromVec = Op.getOperand(0);
+ APInt ExtractIdx = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+ return SDValue();
+
// All inputs must have the same element type as the output.
if (VT.getVectorElementType() !=
ExtractedFromVec.getValueType().getVectorElementType())
@@ -14915,6 +15723,54 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
return Shuffles[0];
}
+// Try to turn a build vector of zero extends of extract vector elts into
+// a vector zero extend and possibly an extract subvector.
+// TODO: Support sign extend or any extend?
+// TODO: Allow undef elements?
+// TODO: Don't require the extracts to start at element 0.
+SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
+ if (LegalOperations)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ SDValue Op0 = N->getOperand(0);
+ auto checkElem = [&](SDValue Op) -> int64_t {
+ if (Op.getOpcode() == ISD::ZERO_EXTEND &&
+ Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
+ return C->getZExtValue();
+ return -1;
+ };
+
+ // Make sure the first element matches
+ // (zext (extract_vector_elt X, C))
+ int64_t Offset = checkElem(Op0);
+ if (Offset < 0)
+ return SDValue();
+
+ unsigned NumElems = N->getNumOperands();
+ SDValue In = Op0.getOperand(0).getOperand(0);
+ EVT InSVT = In.getValueType().getScalarType();
+ EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);
+
+ // Don't create an illegal input type after type legalization.
+ if (LegalTypes && !TLI.isTypeLegal(InVT))
+ return SDValue();
+
+ // Ensure all the elements come from the same vector and are adjacent.
+ for (unsigned i = 1; i != NumElems; ++i) {
+ if ((Offset + i) != checkElem(N->getOperand(i)))
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
+ Op0.getOperand(0).getOperand(1));
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, In);
+}
+
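A scalar analogue of the new combine under assumed shapes (eight i8 source
lanes, four i32 result lanes, extraction starting at element 0); this is
illustrative C++ rather than DAG nodes.

#include <cstdint>

// Before the combine: every output lane is an independent extract + zext,
// which is what a BUILD_VECTOR of (zext (extract_vector_elt X, i)) models.
static void widenLanes(const uint8_t In[8], uint32_t Out[4]) {
  for (int i = 0; i < 4; ++i)
    Out[i] = (uint32_t)In[i];
}
// After the combine, the same result is one zero-extension of the low
// 4-element subvector of In: (zext (extract_subvector X, 0)).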
SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
@@ -14922,6 +15778,32 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (ISD::allOperandsUndef(N))
return DAG.getUNDEF(VT);
+ // If this is a splat of a bitcast from another vector, change to a
+ // concat_vector.
+ // For example:
+ // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
+ // (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
+ //
+ // If X is a build_vector itself, the concat can become a larger build_vector.
+ // TODO: Maybe this is useful for non-splat too?
+ if (!LegalOperations) {
+ if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
+ Splat = peekThroughBitcast(Splat);
+ EVT SrcVT = Splat.getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
+ SrcVT.getVectorElementType(), NumElts);
+ if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
+ SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
+ NewVT, Ops);
+ return DAG.getBitcast(VT, Concat);
+ }
+ }
+ }
+ }
+
// Check if we can express BUILD VECTOR via subvector extract.
if (!LegalTypes && (N->getNumOperands() > 1)) {
SDValue Op0 = N->getOperand(0);
@@ -14951,6 +15833,9 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
Op0.getOperand(0), Op0.getOperand(1));
}
+ if (SDValue V = convertBuildVecZextToZext(N))
+ return V;
+
if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;
@@ -15140,6 +16025,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
return SDValue();
+ // Bail out if the vector size is not a multiple of the scalar size.
+ if (VT.getSizeInBits() % SclTy.getSizeInBits())
+ return SDValue();
+
unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
if (VNTNumElms < 2)
return SDValue();
@@ -15418,13 +16307,22 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
// Only do this if we won't split any elements.
if (ExtractSize % EltSize == 0) {
unsigned NumElems = ExtractSize / EltSize;
- EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
- InVT.getVectorElementType(), NumElems);
- if ((!LegalOperations ||
- TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT)) &&
+ EVT EltVT = InVT.getVectorElementType();
+ EVT ExtractVT = NumElems == 1 ? EltVT :
+ EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
+ if ((Level < AfterLegalizeDAG ||
+ (NumElems == 1 ||
+ TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
(!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) /
EltSize;
+ if (NumElems == 1) {
+ SDValue Src = V->getOperand(IdxVal);
+ if (EltVT != Src.getValueType())
+ Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
+
+ return DAG.getBitcast(NVT, Src);
+ }
// Extract the pieces from the original build_vector.
SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
@@ -15466,122 +16364,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
return NarrowBOp;
- return SDValue();
-}
-
-static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements,
- SDValue V, SelectionDAG &DAG) {
- SDLoc DL(V);
- EVT VT = V.getValueType();
-
- switch (V.getOpcode()) {
- default:
- return V;
-
- case ISD::CONCAT_VECTORS: {
- EVT OpVT = V->getOperand(0).getValueType();
- int OpSize = OpVT.getVectorNumElements();
- SmallBitVector OpUsedElements(OpSize, false);
- bool FoundSimplification = false;
- SmallVector<SDValue, 4> NewOps;
- NewOps.reserve(V->getNumOperands());
- for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) {
- SDValue Op = V->getOperand(i);
- bool OpUsed = false;
- for (int j = 0; j < OpSize; ++j)
- if (UsedElements[i * OpSize + j]) {
- OpUsedElements[j] = true;
- OpUsed = true;
- }
- NewOps.push_back(
- OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG)
- : DAG.getUNDEF(OpVT));
- FoundSimplification |= Op == NewOps.back();
- OpUsedElements.reset();
- }
- if (FoundSimplification)
- V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps);
- return V;
- }
-
- case ISD::INSERT_SUBVECTOR: {
- SDValue BaseV = V->getOperand(0);
- SDValue SubV = V->getOperand(1);
- auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2));
- if (!IdxN)
- return V;
-
- int SubSize = SubV.getValueType().getVectorNumElements();
- int Idx = IdxN->getZExtValue();
- bool SubVectorUsed = false;
- SmallBitVector SubUsedElements(SubSize, false);
- for (int i = 0; i < SubSize; ++i)
- if (UsedElements[i + Idx]) {
- SubVectorUsed = true;
- SubUsedElements[i] = true;
- UsedElements[i + Idx] = false;
- }
-
- // Now recurse on both the base and sub vectors.
- SDValue SimplifiedSubV =
- SubVectorUsed
- ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG)
- : DAG.getUNDEF(SubV.getValueType());
- SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG);
- if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV)
- V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- SimplifiedBaseV, SimplifiedSubV, V->getOperand(2));
- return V;
- }
- }
-}
-
-static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
- SDValue N1, SelectionDAG &DAG) {
- EVT VT = SVN->getValueType(0);
- int NumElts = VT.getVectorNumElements();
- SmallBitVector N0UsedElements(NumElts, false), N1UsedElements(NumElts, false);
- for (int M : SVN->getMask())
- if (M >= 0 && M < NumElts)
- N0UsedElements[M] = true;
- else if (M >= NumElts)
- N1UsedElements[M - NumElts] = true;
-
- SDValue S0 = simplifyShuffleOperandRecursively(N0UsedElements, N0, DAG);
- SDValue S1 = simplifyShuffleOperandRecursively(N1UsedElements, N1, DAG);
- if (S0 == N0 && S1 == N1)
- return SDValue();
-
- return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
-}
-
-static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0,
- SDValue N1, SelectionDAG &DAG) {
- auto isUndefElt = [](SDValue V, int Idx) {
- // TODO - handle more cases as required.
- if (V.getOpcode() == ISD::BUILD_VECTOR)
- return V.getOperand(Idx).isUndef();
- if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return (Idx != 0) || V.getOperand(0).isUndef();
- return false;
- };
-
- EVT VT = SVN->getValueType(0);
- unsigned NumElts = VT.getVectorNumElements();
-
- bool Changed = false;
- SmallVector<int, 8> NewMask;
- for (unsigned i = 0; i != NumElts; ++i) {
- int Idx = SVN->getMaskElt(i);
- if ((0 <= Idx && Idx < (int)NumElts && isUndefElt(N0, Idx)) ||
- ((int)NumElts < Idx && isUndefElt(N1, Idx - NumElts))) {
- Changed = true;
- Idx = -1;
- }
- NewMask.push_back(Idx);
- }
- if (Changed)
- return DAG.getVectorShuffle(VT, SDLoc(SVN), N0, N1, NewMask);
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
return SDValue();
}
@@ -16028,10 +16812,6 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
- // Simplify shuffle mask if a referenced element is UNDEF.
- if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG))
- return V;
-
if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
return InsElt;
@@ -16092,11 +16872,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
}
- // There are various patterns used to build up a vector from smaller vectors,
- // subvectors, or elements. Scan chains of these and replace unused insertions
- // or components with undef.
- if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
- return S;
+ // Simplify source operands based on shuffle mask.
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
// Match shuffles that can be converted to any_vector_extend_in_reg.
if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes))
@@ -16422,10 +17200,11 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
SDValue CN0 = N0.getOperand(0);
SDValue CN1 = N1.getOperand(0);
- if (CN0.getValueType().getVectorElementType() ==
- CN1.getValueType().getVectorElementType() &&
- CN0.getValueType().getVectorNumElements() ==
- VT.getVectorNumElements()) {
+ EVT CN0VT = CN0.getValueType();
+ EVT CN1VT = CN1.getValueType();
+ if (CN0VT.isVector() && CN1VT.isVector() &&
+ CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
+ CN0VT.getVectorNumElements() == VT.getVectorNumElements()) {
SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
CN0.getValueType(), CN0, CN1, N2);
return DAG.getBitcast(VT, NewINSERT);
@@ -16680,14 +17459,14 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
const ConstantFPSDNode *Zero = nullptr;
if (TheSelect->getOpcode() == ISD::SELECT_CC) {
- CC = dyn_cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
+ CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
CmpLHS = TheSelect->getOperand(0);
Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
} else {
// SELECT or VSELECT
SDValue Cmp = TheSelect->getOperand(0);
if (Cmp.getOpcode() == ISD::SETCC) {
- CC = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
+ CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
CmpLHS = Cmp.getOperand(0);
Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
}
@@ -16905,24 +17684,6 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
return !SCCC->isNullValue() ? N2 : N3;
}
- // Check to see if we can simplify the select into an fabs node
- if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) {
- // Allow either -0.0 or 0.0
- if (CFP->isZero()) {
- // select (setg[te] X, +/-0.0), X, fneg(X) -> fabs
- if ((CC == ISD::SETGE || CC == ISD::SETGT) &&
- N0 == N2 && N3.getOpcode() == ISD::FNEG &&
- N2 == N3.getOperand(0))
- return DAG.getNode(ISD::FABS, DL, VT, N0);
-
- // select (setl[te] X, +/-0.0), fneg(X), X -> fabs
- if ((CC == ISD::SETLT || CC == ISD::SETLE) &&
- N0 == N3 && N2.getOpcode() == ISD::FNEG &&
- N2.getOperand(0) == N3)
- return DAG.getNode(ISD::FABS, DL, VT, N3);
- }
- }
-
// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
// in it. This is a win when the constant is not otherwise available because
@@ -17400,19 +18161,34 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
: buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
if (!Reciprocal) {
- // Unfortunately, Est is now NaN if the input was exactly 0.0.
- // Select out this case and force the answer to 0.0.
+ // The estimate is now completely wrong if the input was exactly 0.0 or
+ // possibly a denormal. Force the answer to 0.0 for those cases.
EVT VT = Op.getValueType();
SDLoc DL(Op);
-
- SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
EVT CCVT = getSetCCResultType(VT);
- SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
- AddToWorklist(ZeroCmp.getNode());
-
- Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
- ZeroCmp, FPZero, Est);
- AddToWorklist(Est.getNode());
+ ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+ const Function &F = DAG.getMachineFunction().getFunction();
+ Attribute Denorms = F.getFnAttribute("denormal-fp-math");
+ if (Denorms.getValueAsString().equals("ieee")) {
+ // fabs(X) < SmallestNormal ? 0.0 : Est
+ const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+ APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
+ SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
+ SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+ SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
+ SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
+ Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
+ AddToWorklist(Fabs.getNode());
+ AddToWorklist(IsDenorm.getNode());
+ AddToWorklist(Est.getNode());
+ } else {
+ // X == 0.0 ? 0.0 : Est
+ SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+ SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
+ Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
+ AddToWorklist(IsZero.getNode());
+ AddToWorklist(Est.getNode());
+ }
}
}
return Est;
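A scalar model of the guard (illustrative; the real code builds the compare
and select as DAG nodes and reads the function attribute shown above): when
sqrt(x) is formed from a reciprocal-square-root estimate, an input of 0.0
(or a denormal, when denormals are honoured) would otherwise yield inf/NaN,
so the result is forced to 0.0.

#include <cfloat>
#include <cmath>

static float guardSqrtEstimate(float X, float Estimate, bool HonourDenormals) {
  bool ForceZero = HonourDenormals ? (std::fabs(X) < FLT_MIN) // below smallest normal
                                   : (X == 0.0f);             // exactly +/-0.0 only
  return ForceZero ? 0.0f : Estimate;  // Estimate is the refined NR value
}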
@@ -17715,7 +18491,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
Index = nullptr;
break;
}
- } // end while
+ }// end while
}
// At this point, ChainedStores lists all of the Store nodes
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 3c856914053b..e4a9d557d386 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -61,7 +61,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -99,6 +98,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -113,6 +113,11 @@ using namespace llvm;
#define DEBUG_TYPE "isel"
+// FIXME: Remove this after the feature has proven reliable.
+static cl::opt<bool> SinkLocalValues("fast-isel-sink-local-values",
+ cl::init(true), cl::Hidden,
+ cl::desc("Sink local values in FastISel"));
+
STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by "
"target-independent selector");
STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
@@ -120,9 +125,10 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
/// Set the current block to which generated machine instructions will be
-/// appended, and clear the local CSE map.
+/// appended.
void FastISel::startNewBlock() {
- LocalValueMap.clear();
+ assert(LocalValueMap.empty() &&
+ "local values should be cleared after finishing a BB");
// Instructions are appended to FuncInfo.MBB. If the basic block already
// contains labels or copies, use the last instruction as the last local
@@ -133,6 +139,9 @@ void FastISel::startNewBlock() {
LastLocalValue = EmitStartPt;
}
+/// Flush the local CSE map and sink anything we can.
+void FastISel::finishBasicBlock() { flushLocalValueMap(); }
+
bool FastISel::lowerArguments() {
if (!FuncInfo.CanLowerReturn)
// Fallback to SDISel argument lowering code to deal with sret pointer
@@ -153,11 +162,168 @@ bool FastISel::lowerArguments() {
return true;
}
+/// Return the defined register if this instruction defines exactly one
+/// virtual register and uses no other virtual registers. Otherwise return 0.
+static unsigned findSinkableLocalRegDef(MachineInstr &MI) {
+ unsigned RegDef = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef()) {
+ if (RegDef)
+ return 0;
+ RegDef = MO.getReg();
+ } else if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ // This is another use of a vreg. Don't try to sink it.
+ return 0;
+ }
+ }
+ return RegDef;
+}
+
void FastISel::flushLocalValueMap() {
+ // Try to sink local values down to their first use so that we can give them a
+ // better debug location. This has the side effect of shrinking local value
+ // live ranges, which helps out fast regalloc.
+ if (SinkLocalValues && LastLocalValue != EmitStartPt) {
+ // Sink local value materialization instructions between EmitStartPt and
+ // LastLocalValue. Visit them bottom-up, starting from LastLocalValue, to
+ // avoid inserting into the range that we're iterating over.
+ MachineBasicBlock::reverse_iterator RE =
+ EmitStartPt ? MachineBasicBlock::reverse_iterator(EmitStartPt)
+ : FuncInfo.MBB->rend();
+ MachineBasicBlock::reverse_iterator RI(LastLocalValue);
+
+ InstOrderMap OrderMap;
+ for (; RI != RE;) {
+ MachineInstr &LocalMI = *RI;
+ ++RI;
+ bool Store = true;
+ if (!LocalMI.isSafeToMove(nullptr, Store))
+ continue;
+ unsigned DefReg = findSinkableLocalRegDef(LocalMI);
+ if (DefReg == 0)
+ continue;
+
+ sinkLocalValueMaterialization(LocalMI, DefReg, OrderMap);
+ }
+ }
+
LocalValueMap.clear();
LastLocalValue = EmitStartPt;
recomputeInsertPt();
SavedInsertPt = FuncInfo.InsertPt;
+ LastFlushPoint = FuncInfo.InsertPt;
+}
+
+static bool isRegUsedByPhiNodes(unsigned DefReg,
+ FunctionLoweringInfo &FuncInfo) {
+ for (auto &P : FuncInfo.PHINodesToUpdate)
+ if (P.second == DefReg)
+ return true;
+ return false;
+}
+
+/// Build a map of instruction orders. Return the first terminator and its
+/// order. Consider EH_LABEL instructions to be terminators as well, since local
+/// values for phis after invokes must be materialized before the call.
+void FastISel::InstOrderMap::initialize(
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator LastFlushPoint) {
+ unsigned Order = 0;
+ for (MachineInstr &I : *MBB) {
+ if (!FirstTerminator &&
+ (I.isTerminator() || (I.isEHLabel() && &I != &MBB->front()))) {
+ FirstTerminator = &I;
+ FirstTerminatorOrder = Order;
+ }
+ Orders[&I] = Order++;
+
+ // We don't need to order instructions past the last flush point.
+ if (I.getIterator() == LastFlushPoint)
+ break;
+ }
+}
+
+void FastISel::sinkLocalValueMaterialization(MachineInstr &LocalMI,
+ unsigned DefReg,
+ InstOrderMap &OrderMap) {
+ // If this register is used by a register fixup, MRI will not contain all
+ // the uses until after register fixups, so don't attempt to sink or DCE
+ // this instruction. Register fixups typically come from no-op cast
+ // instructions, which replace the cast instruction vreg with the local
+ // value vreg.
+ if (FuncInfo.RegsWithFixups.count(DefReg))
+ return;
+
+  // We can DCE this instruction if there are no uses and it wasn't
+ // materialized for a successor PHI node.
+ bool UsedByPHI = isRegUsedByPhiNodes(DefReg, FuncInfo);
+ if (!UsedByPHI && MRI.use_nodbg_empty(DefReg)) {
+ if (EmitStartPt == &LocalMI)
+ EmitStartPt = EmitStartPt->getPrevNode();
+ LLVM_DEBUG(dbgs() << "removing dead local value materialization "
+ << LocalMI);
+ OrderMap.Orders.erase(&LocalMI);
+ LocalMI.eraseFromParent();
+ return;
+ }
+
+ // Number the instructions if we haven't yet so we can efficiently find the
+ // earliest use.
+ if (OrderMap.Orders.empty())
+ OrderMap.initialize(FuncInfo.MBB, LastFlushPoint);
+
+ // Find the first user in the BB.
+ MachineInstr *FirstUser = nullptr;
+ unsigned FirstOrder = std::numeric_limits<unsigned>::max();
+ for (MachineInstr &UseInst : MRI.use_nodbg_instructions(DefReg)) {
+ auto I = OrderMap.Orders.find(&UseInst);
+ assert(I != OrderMap.Orders.end() &&
+ "local value used by instruction outside local region");
+ unsigned UseOrder = I->second;
+ if (UseOrder < FirstOrder) {
+ FirstOrder = UseOrder;
+ FirstUser = &UseInst;
+ }
+ }
+
+ // The insertion point will be the first terminator or the first user,
+ // whichever came first. If there was no terminator, this must be a
+ // fallthrough block and the insertion point is the end of the block.
+ MachineBasicBlock::instr_iterator SinkPos;
+ if (UsedByPHI && OrderMap.FirstTerminatorOrder < FirstOrder) {
+ FirstOrder = OrderMap.FirstTerminatorOrder;
+ SinkPos = OrderMap.FirstTerminator->getIterator();
+ } else if (FirstUser) {
+ SinkPos = FirstUser->getIterator();
+ } else {
+ assert(UsedByPHI && "must be users if not used by a phi");
+ SinkPos = FuncInfo.MBB->instr_end();
+ }
+
+ // Collect all DBG_VALUEs before the new insertion position so that we can
+ // sink them.
+ SmallVector<MachineInstr *, 1> DbgValues;
+ for (MachineInstr &DbgVal : MRI.use_instructions(DefReg)) {
+ if (!DbgVal.isDebugValue())
+ continue;
+ unsigned UseOrder = OrderMap.Orders[&DbgVal];
+ if (UseOrder < FirstOrder)
+ DbgValues.push_back(&DbgVal);
+ }
+
+ // Sink LocalMI before SinkPos and assign it the same DebugLoc.
+ LLVM_DEBUG(dbgs() << "sinking local value to first use " << LocalMI);
+ FuncInfo.MBB->remove(&LocalMI);
+ FuncInfo.MBB->insert(SinkPos, &LocalMI);
+ if (SinkPos != FuncInfo.MBB->end())
+ LocalMI.setDebugLoc(SinkPos->getDebugLoc());
+
+ // Sink any debug values that we've collected.
+ for (MachineInstr *DI : DbgValues) {
+ FuncInfo.MBB->remove(DI);
+ FuncInfo.MBB->insert(SinkPos, DI);
+ }
}
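
At a high level, the new sinking path numbers the block once, finds the earliest use of each sinkable local-value definition, and re-inserts the definition just before that use (or before the first terminator when a successor phi needs it). A toy C++ model of the core move, ignoring terminators, phis and DBG_VALUEs (names are illustrative, not from the patch):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Instructions are modelled by the list of values they use; a local-value
    // definition at position 'defPos' defining 'defValue' is moved down to sit
    // immediately before its first user, mirroring the renumber-then-reinsert
    // scheme used by flushLocalValueMap/sinkLocalValueMaterialization.
    void sinkBeforeFirstUse(std::vector<std::vector<int>> &uses, int defValue,
                            std::size_t defPos) {
      std::size_t firstUse = uses.size();
      for (std::size_t i = defPos + 1; i < uses.size(); ++i) {
        const auto &ops = uses[i];
        if (std::find(ops.begin(), ops.end(), defValue) != ops.end()) {
          firstUse = i;
          break;
        }
      }
      if (firstUse == uses.size() || firstUse == defPos + 1)
        return; // no user, or already directly above it
      auto def = uses[defPos];
      uses.erase(uses.begin() + defPos);               // remove from old slot
      uses.insert(uses.begin() + (firstUse - 1), def); // insert before first user
    }
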
bool FastISel::hasTrivialKill(const Value *V) {
@@ -328,8 +494,10 @@ void FastISel::updateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
AssignedReg = Reg;
else if (Reg != AssignedReg) {
// Arrange for uses of AssignedReg to be replaced by uses of Reg.
- for (unsigned i = 0; i < NumRegs; i++)
+ for (unsigned i = 0; i < NumRegs; i++) {
FuncInfo.RegFixups[AssignedReg + i] = Reg + i;
+ FuncInfo.RegsWithFixups.insert(Reg + i);
+ }
AssignedReg = Reg;
}
@@ -681,7 +849,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
return true;
}
-/// \brief Lower an argument list according to the target calling convention.
+/// Lower an argument list according to the target calling convention.
///
/// This is a helper for lowering intrinsics that follow a target calling
/// convention or require stack pointer adjustment. Only a subset of the
@@ -702,7 +870,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
ArgListEntry Entry;
Entry.Val = V;
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, ArgIdx);
+ Entry.setAttributes(&CS, ArgI);
Args.push_back(Entry);
}
@@ -874,10 +1042,31 @@ bool FastISel::selectXRayCustomEvent(const CallInst *I) {
TII.get(TargetOpcode::PATCHABLE_EVENT_CALL));
for (auto &MO : Ops)
MIB.add(MO);
+
// Insert the Patchable Event Call instruction, that gets lowered properly.
return true;
}
+bool FastISel::selectXRayTypedEvent(const CallInst *I) {
+ const auto &Triple = TM.getTargetTriple();
+ if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
+ return true; // don't do anything to this instruction.
+ SmallVector<MachineOperand, 8> Ops;
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)),
+ /*IsDef=*/false));
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(1)),
+ /*IsDef=*/false));
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(2)),
+ /*IsDef=*/false));
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::PATCHABLE_TYPED_EVENT_CALL));
+ for (auto &MO : Ops)
+ MIB.add(MO);
+
+  // Insert the Patchable Typed Event Call instruction, which gets lowered properly.
+ return true;
+}
/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
@@ -1141,13 +1330,13 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
assert(DI->getVariable() && "Missing variable");
if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
return true;
}
const Value *Address = DI->getAddress();
if (!Address || isa<UndefValue>(Address)) {
- DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
return true;
}
@@ -1182,24 +1371,15 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
if (Op) {
assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
"Expected inlined-at fields to agree");
- if (Op->isReg()) {
- Op->setIsDebug(true);
- // A dbg.declare describes the address of a source variable, so lower it
- // into an indirect DBG_VALUE.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true,
- Op->getReg(), DI->getVariable(), DI->getExpression());
- } else
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::DBG_VALUE))
- .add(*Op)
- .addImm(0)
- .addMetadata(DI->getVariable())
- .addMetadata(DI->getExpression());
+ // A dbg.declare describes the address of a source variable, so lower it
+ // into an indirect DBG_VALUE.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true,
+ *Op, DI->getVariable(), DI->getExpression());
} else {
// We can't yet handle anything else here because it would require
// generating code, thus altering codegen because of debug info.
- DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
}
return true;
}
@@ -1242,7 +1422,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
} else {
// We can't yet handle anything else here because it would require
// generating code, thus altering codegen because of debug info.
- DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
}
return true;
}
@@ -1256,7 +1436,8 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
updateValueMap(II, ResultReg);
return true;
}
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
case Intrinsic::expect: {
unsigned ResultReg = getRegForValue(II->getArgOperand(0));
if (!ResultReg)
@@ -1272,6 +1453,8 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
case Intrinsic::xray_customevent:
return selectXRayCustomEvent(II);
+ case Intrinsic::xray_typedevent:
+ return selectXRayTypedEvent(II);
}
return fastLowerIntrinsicCall(II);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 81347fa4bd46..42c7181dac41 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -118,6 +119,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
}
}
}
+ if (Personality == EHPersonality::Wasm_CXX) {
+ WasmEHFuncInfo &EHInfo = *MF->getWasmEHFuncInfo();
+ calculateWasmEHInfo(&fn, EHInfo);
+ }
// Initialize the mapping of values to registers. This is only set up for
// instruction values that are used outside of the block that defines
@@ -226,9 +231,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
const Instruction *PadInst = BB.getFirstNonPHI();
// If this is a non-landingpad EH pad, mark this function as using
// funclets.
- // FIXME: SEH catchpads do not create funclets, so we could avoid setting
- // this in such cases in order to improve frame layout.
+ // FIXME: SEH catchpads do not create EH scope/funclets, so we could avoid
+ // setting this in such cases in order to improve frame layout.
if (!isa<LandingPadInst>(PadInst)) {
+ MF->setHasEHScopes(true);
MF->setHasEHFunclets(true);
MF->getFrameInfo().setHasOpaqueSPAdjustment(true);
}
@@ -281,28 +287,46 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
}
}
- if (!isFuncletEHPersonality(Personality))
- return;
-
- WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
+ if (isFuncletEHPersonality(Personality)) {
+ WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
- // Map all BB references in the WinEH data to MBBs.
- for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
- for (WinEHHandlerType &H : TBME.HandlerArray) {
- if (H.Handler)
- H.Handler = MBBMap[H.Handler.get<const BasicBlock *>()];
+ // Map all BB references in the WinEH data to MBBs.
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ if (H.Handler)
+ H.Handler = MBBMap[H.Handler.get<const BasicBlock *>()];
+ }
+ }
+ for (CxxUnwindMapEntry &UME : EHInfo.CxxUnwindMap)
+ if (UME.Cleanup)
+ UME.Cleanup = MBBMap[UME.Cleanup.get<const BasicBlock *>()];
+ for (SEHUnwindMapEntry &UME : EHInfo.SEHUnwindMap) {
+ const auto *BB = UME.Handler.get<const BasicBlock *>();
+ UME.Handler = MBBMap[BB];
+ }
+ for (ClrEHUnwindMapEntry &CME : EHInfo.ClrEHUnwindMap) {
+ const auto *BB = CME.Handler.get<const BasicBlock *>();
+ CME.Handler = MBBMap[BB];
}
}
- for (CxxUnwindMapEntry &UME : EHInfo.CxxUnwindMap)
- if (UME.Cleanup)
- UME.Cleanup = MBBMap[UME.Cleanup.get<const BasicBlock *>()];
- for (SEHUnwindMapEntry &UME : EHInfo.SEHUnwindMap) {
- const BasicBlock *BB = UME.Handler.get<const BasicBlock *>();
- UME.Handler = MBBMap[BB];
- }
- for (ClrEHUnwindMapEntry &CME : EHInfo.ClrEHUnwindMap) {
- const BasicBlock *BB = CME.Handler.get<const BasicBlock *>();
- CME.Handler = MBBMap[BB];
+
+ else if (Personality == EHPersonality::Wasm_CXX) {
+ WasmEHFuncInfo &EHInfo = *MF->getWasmEHFuncInfo();
+    // Map all BB references in the Wasm EH data to MBBs.
+ DenseMap<BBOrMBB, BBOrMBB> NewMap;
+ for (auto &KV : EHInfo.EHPadUnwindMap) {
+ const auto *Src = KV.first.get<const BasicBlock *>();
+ const auto *Dst = KV.second.get<const BasicBlock *>();
+ NewMap[MBBMap[Src]] = MBBMap[Dst];
+ }
+ EHInfo.EHPadUnwindMap = std::move(NewMap);
+ NewMap.clear();
+ for (auto &KV : EHInfo.ThrowUnwindMap) {
+ const auto *Src = KV.first.get<const BasicBlock *>();
+ const auto *Dst = KV.second.get<const BasicBlock *>();
+ NewMap[MBBMap[Src]] = MBBMap[Dst];
+ }
+ EHInfo.ThrowUnwindMap = std::move(NewMap);
}
}
@@ -312,12 +336,14 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
void FunctionLoweringInfo::clear() {
MBBMap.clear();
ValueMap.clear();
+ VirtReg2Value.clear();
StaticAllocaMap.clear();
LiveOutRegInfo.clear();
VisitedBBs.clear();
ArgDbgValues.clear();
ByValArgFrameIndexMap.clear();
RegFixups.clear();
+ RegsWithFixups.clear();
StatepointStackSlots.clear();
StatepointSpillMaps.clear();
PreferredExtendType.clear();
@@ -483,7 +509,7 @@ int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
auto I = ByValArgFrameIndexMap.find(A);
if (I != ByValArgFrameIndexMap.end())
return I->second;
- DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
+ LLVM_DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
return INT_MAX;
}
@@ -547,3 +573,13 @@ FunctionLoweringInfo::getOrCreateSwiftErrorVRegUseAt(const Instruction *I, const
}
return std::make_pair(It->second, false);
}
+
+const Value *
+FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) {
+ if (VirtReg2Value.empty()) {
+ for (auto &P : ValueMap) {
+ VirtReg2Value[P.second] = P.first;
+ }
+ }
+ return VirtReg2Value[Vreg];
+}
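
getValueFromVirtualReg builds the vreg-to-IR-value mapping lazily: the first query inverts the existing ValueMap, and later queries hit the cached reverse map. A small standalone C++ sketch of that pattern (types and names are placeholders, not the LLVM ones):

    #include <string>
    #include <unordered_map>

    // The forward map (value -> vreg) already exists; the reverse map is
    // populated only on the first lookup and then reused.
    struct LazyReverseMap {
      std::unordered_map<std::string, unsigned> Forward; // value name -> vreg
      std::unordered_map<unsigned, std::string> Reverse; // vreg -> value name

      const std::string &valueForVReg(unsigned VReg) {
        if (Reverse.empty())
          for (const auto &P : Forward)
            Reverse[P.second] = P.first;
        return Reverse[VReg]; // unknown vregs yield a default-constructed entry
      }
    };
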
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index cc9b41b4b487..d6171f3177d7 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -394,11 +394,26 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
} else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
MIB.addFPImm(F->getConstantFPValue());
} else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
+ unsigned VReg = R->getReg();
+ MVT OpVT = Op.getSimpleValueType();
+ const TargetRegisterClass *OpRC =
+ TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT) : nullptr;
+ const TargetRegisterClass *IIRC =
+ II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF))
+ : nullptr;
+
+ if (OpRC && IIRC && OpRC != IIRC &&
+ TargetRegisterInfo::isVirtualRegister(VReg)) {
+ unsigned NewVReg = MRI->createVirtualRegister(IIRC);
+ BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+ VReg = NewVReg;
+ }
// Turn additional physreg operands into implicit uses on non-variadic
// instructions. This is used by call and return instructions passing
// arguments in registers.
bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic());
- MIB.addReg(R->getReg(), getImplRegState(Imp));
+ MIB.addReg(VReg, getImplRegState(Imp));
} else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) {
MIB.addRegMask(RM->getRegMask());
} else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
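
The new block in AddOperand constrains a virtual-register operand to the register class the instruction expects: when the value's natural class and the required class differ, a COPY into a fresh register of the required class is emitted and the instruction uses the copy. A toy C++ sketch of that decision (simplified; real register classes are TargetRegisterClass pointers, not strings, and the COPY itself is not modelled):

    #include <optional>
    #include <string>

    struct Reg { unsigned Id; std::string Class; };

    // Return the register the instruction should actually use for 'Op'.
    Reg constrainToRequiredClass(const Reg &Op,
                                 const std::optional<std::string> &Required,
                                 unsigned &NextVReg) {
      if (!Required || Op.Class == *Required)
        return Op;                       // classes compatible: use it directly
      Reg Copy{NextVReg++, *Required};   // conceptually: COPY Copy <- Op
      return Copy;
    }
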
@@ -682,11 +697,15 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
if (SD->getKind() == SDDbgValue::FRAMEIX) {
// Stack address; this needs to be lowered in target-dependent fashion.
// EmitTargetCodeForFrameDebugValue is responsible for allocation.
- return BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE))
- .addFrameIndex(SD->getFrameIx())
- .addImm(0)
- .addMetadata(Var)
- .addMetadata(Expr);
+ auto FrameMI = BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE))
+ .addFrameIndex(SD->getFrameIx());
+ if (SD->isIndirect())
+ // Push [fi + 0] onto the DIExpression stack.
+ FrameMI.addImm(0);
+ else
+ // Push fi onto the DIExpression stack.
+ FrameMI.addReg(0);
+ return FrameMI.addMetadata(Var).addMetadata(Expr);
}
// Otherwise, we're going to create an instruction here.
const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
@@ -705,6 +724,8 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
else
AddOperand(MIB, Op, (*MIB).getNumOperands(), &II, VRBaseMap,
/*IsDebug=*/true, /*IsClone=*/false, /*IsCloned=*/false);
+ } else if (SD->getKind() == SDDbgValue::VREG) {
+ MIB.addReg(SD->getVReg(), RegState::Debug);
} else if (SD->getKind() == SDDbgValue::CONST) {
const Value *V = SD->getConst();
if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
@@ -736,6 +757,20 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
return &*MIB;
}
+MachineInstr *
+InstrEmitter::EmitDbgLabel(SDDbgLabel *SD) {
+ MDNode *Label = SD->getLabel();
+ DebugLoc DL = SD->getDebugLoc();
+ assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+
+ const MCInstrDesc &II = TII->get(TargetOpcode::DBG_LABEL);
+ MachineInstrBuilder MIB = BuildMI(*MF, DL, II);
+ MIB.addMetadata(Label);
+
+ return &*MIB;
+}
+
/// EmitMachineNode - Generate machine code for a target-specific node and
/// needed dependencies.
///
@@ -807,9 +842,34 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
// Add result register values for things that are defined by this
// instruction.
- if (NumResults)
+ if (NumResults) {
CreateVirtualRegisters(Node, MIB, II, IsClone, IsCloned, VRBaseMap);
+ // Transfer any IR flags from the SDNode to the MachineInstr
+ MachineInstr *MI = MIB.getInstr();
+ const SDNodeFlags Flags = Node->getFlags();
+ if (Flags.hasNoSignedZeros())
+ MI->setFlag(MachineInstr::MIFlag::FmNsz);
+
+ if (Flags.hasAllowReciprocal())
+ MI->setFlag(MachineInstr::MIFlag::FmArcp);
+
+ if (Flags.hasNoNaNs())
+ MI->setFlag(MachineInstr::MIFlag::FmNoNans);
+
+ if (Flags.hasNoInfs())
+ MI->setFlag(MachineInstr::MIFlag::FmNoInfs);
+
+ if (Flags.hasAllowContract())
+ MI->setFlag(MachineInstr::MIFlag::FmContract);
+
+ if (Flags.hasApproximateFuncs())
+ MI->setFlag(MachineInstr::MIFlag::FmAfn);
+
+ if (Flags.hasAllowReassociation())
+ MI->setFlag(MachineInstr::MIFlag::FmReassoc);
+ }
+
// Emit all of the actual operands of this instruction, adding them to the
// instruction as appropriate.
bool HasOptPRefs = NumDefs > NumResults;
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
index 8a8a1bbd18f7..701b6368690b 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -113,6 +113,9 @@ public:
MachineInstr *EmitDbgValue(SDDbgValue *SD,
DenseMap<SDValue, unsigned> &VRBaseMap);
+ /// Generate machine instruction for a dbg_label node.
+ MachineInstr *EmitDbgLabel(SDDbgLabel *SD);
+
/// EmitNode - Generate machine code for a node and needed dependencies.
///
void EmitNode(SDNode *Node, bool IsClone, bool IsCloned,
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 3a2fb0c0a836..2b7ba1ffb309 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -41,6 +40,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -87,11 +87,11 @@ class SelectionDAGLegalize {
const TargetLowering &TLI;
SelectionDAG &DAG;
- /// \brief The set of nodes which have already been legalized. We hold a
+ /// The set of nodes which have already been legalized. We hold a
/// reference to it in order to update as necessary on node deletion.
SmallPtrSetImpl<SDNode *> &LegalizedNodes;
- /// \brief A set of all the nodes updated during legalization.
+ /// A set of all the nodes updated during legalization.
SmallSetVector<SDNode *, 16> *UpdatedNodes;
EVT getSetCCResultType(EVT VT) const {
@@ -107,7 +107,7 @@ public:
: TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG),
LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {}
- /// \brief Legalizes the given operation.
+ /// Legalizes the given operation.
void LegalizeOp(SDNode *Node);
private:
@@ -167,7 +167,7 @@ private:
SDValue NewIntValue) const;
SDValue ExpandFCOPYSIGN(SDNode *Node) const;
SDValue ExpandFABS(SDNode *Node) const;
- SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
+ SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT,
const SDLoc &dl);
SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
const SDLoc &dl);
@@ -200,8 +200,8 @@ public:
}
void ReplaceNode(SDNode *Old, SDNode *New) {
- DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
- dbgs() << " with: "; New->dump(&DAG));
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
assert(Old->getNumValues() == New->getNumValues() &&
"Replacing one node with another that produces a different number "
@@ -213,8 +213,8 @@ public:
}
void ReplaceNode(SDValue Old, SDValue New) {
- DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
- dbgs() << " with: "; New->dump(&DAG));
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
DAG.ReplaceAllUsesWith(Old, New);
if (UpdatedNodes)
@@ -223,13 +223,12 @@ public:
}
void ReplaceNode(SDNode *Old, const SDValue *New) {
- DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG));
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG));
DAG.ReplaceAllUsesWith(Old, New);
for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
- DEBUG(dbgs() << (i == 0 ? " with: "
- : " and: ");
- New[i]->dump(&DAG));
+ LLVM_DEBUG(dbgs() << (i == 0 ? " with: " : " and: ");
+ New[i]->dump(&DAG));
if (UpdatedNodes)
UpdatedNodes->insert(New[i].getNode());
}
@@ -408,7 +407,7 @@ SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
}
SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
- DEBUG(dbgs() << "Optimizing float store operations\n");
+ LLVM_DEBUG(dbgs() << "Optimizing float store operations\n");
// Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
// FIXME: We shouldn't do this for TargetConstantFP's.
// FIXME: move this to the DAG Combiner! Note that we can't regress due
@@ -477,7 +476,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
AAMDNodes AAInfo = ST->getAAInfo();
if (!ST->isTruncatingStore()) {
- DEBUG(dbgs() << "Legalizing store operation\n");
+ LLVM_DEBUG(dbgs() << "Legalizing store operation\n");
if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
ReplaceNode(ST, OptStore);
return;
@@ -495,15 +494,15 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
unsigned Align = ST->getAlignment();
const DataLayout &DL = DAG.getDataLayout();
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
- DEBUG(dbgs() << "Expanding unsupported unaligned store\n");
+ LLVM_DEBUG(dbgs() << "Expanding unsupported unaligned store\n");
SDValue Result = TLI.expandUnalignedStore(ST, DAG);
ReplaceNode(SDValue(ST, 0), Result);
} else
- DEBUG(dbgs() << "Legal store\n");
+ LLVM_DEBUG(dbgs() << "Legal store\n");
break;
}
case TargetLowering::Custom: {
- DEBUG(dbgs() << "Trying custom lowering\n");
+ LLVM_DEBUG(dbgs() << "Trying custom lowering\n");
SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
if (Res && Res != SDValue(Node, 0))
ReplaceNode(SDValue(Node, 0), Res);
@@ -524,7 +523,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
return;
}
- DEBUG(dbgs() << "Legalizing truncating store operations\n");
+ LLVM_DEBUG(dbgs() << "Legalizing truncating store operations\n");
SDValue Value = ST->getValue();
EVT StVT = ST->getMemoryVT();
unsigned StWidth = StVT.getSizeInBits();
@@ -656,7 +655,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD) {
- DEBUG(dbgs() << "Legalizing non-extending load operation\n");
+ LLVM_DEBUG(dbgs() << "Legalizing non-extending load operation\n");
MVT VT = Node->getSimpleValueType(0);
SDValue RVal = SDValue(Node, 0);
SDValue RChain = SDValue(Node, 1);
@@ -706,7 +705,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
return;
}
- DEBUG(dbgs() << "Legalizing extending load operation\n");
+ LLVM_DEBUG(dbgs() << "Legalizing extending load operation\n");
EVT SrcVT = LD->getMemoryVT();
unsigned SrcWidth = SrcVT.getSizeInBits();
unsigned Alignment = LD->getAlignment();
@@ -947,39 +946,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
}
}
-static TargetLowering::LegalizeAction
-getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) {
- unsigned EqOpc;
- switch (Opcode) {
- default: llvm_unreachable("Unexpected FP pseudo-opcode");
- case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break;
- case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break;
- case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break;
- case ISD::STRICT_FMA: EqOpc = ISD::FMA; break;
- case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break;
- case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break;
- case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break;
- case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break;
- case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break;
- case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break;
- case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
- case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
- case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
- }
-
- auto Action = TLI.getOperationAction(EqOpc, VT);
-
- // We don't currently handle Custom or Promote for strict FP pseudo-ops.
- // For now, we just expand for those cases.
- if (Action != TargetLowering::Legal)
- Action = TargetLowering::Expand;
-
- return Action;
-}
-
/// Return a legal replacement for the given operation, with all legal operands.
void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
- DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
// Allow illegal target nodes and illegal registers.
if (Node->getOpcode() == ISD::TargetConstant ||
@@ -1043,8 +1012,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::SETCC:
case ISD::BR_CC: {
unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 :
- Node->getOpcode() == ISD::SETCC ? 2 :
- Node->getOpcode() == ISD::SETCCE ? 3 : 1;
+ Node->getOpcode() == ISD::SETCC ? 2 : 1;
unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0;
MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType();
ISD::CondCode CCCode =
@@ -1122,6 +1090,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
return;
}
break;
+ case ISD::STRICT_FADD:
+ case ISD::STRICT_FSUB:
+ case ISD::STRICT_FMUL:
+ case ISD::STRICT_FDIV:
case ISD::STRICT_FSQRT:
case ISD::STRICT_FMA:
case ISD::STRICT_FPOW:
@@ -1139,8 +1111,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
// equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
// is also legal, but if ISD::FSQRT requires expansion then so does
// ISD::STRICT_FSQRT.
- Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(),
- Node->getValueType(0));
+ Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
break;
default:
if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
@@ -1202,10 +1174,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
}
switch (Action) {
case TargetLowering::Legal:
- DEBUG(dbgs() << "Legal node: nothing to do\n");
+ LLVM_DEBUG(dbgs() << "Legal node: nothing to do\n");
return;
case TargetLowering::Custom:
- DEBUG(dbgs() << "Trying custom legalization\n");
+ LLVM_DEBUG(dbgs() << "Trying custom legalization\n");
// FIXME: The handling for custom lowering with multiple results is
// a complete mess.
if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
@@ -1213,7 +1185,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
return;
if (Node->getNumValues() == 1) {
- DEBUG(dbgs() << "Successfully custom legalized node\n");
+ LLVM_DEBUG(dbgs() << "Successfully custom legalized node\n");
// We can just directly replace this node with the lowered value.
ReplaceNode(SDValue(Node, 0), Res);
return;
@@ -1222,11 +1194,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
SmallVector<SDValue, 8> ResultVals;
for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
ResultVals.push_back(Res.getValue(i));
- DEBUG(dbgs() << "Successfully custom legalized node\n");
+ LLVM_DEBUG(dbgs() << "Successfully custom legalized node\n");
ReplaceNode(Node, ResultVals.data());
return;
}
- DEBUG(dbgs() << "Could not custom legalize node\n");
+ LLVM_DEBUG(dbgs() << "Could not custom legalize node\n");
LLVM_FALLTHROUGH;
case TargetLowering::Expand:
if (ExpandNode(Node))
@@ -1623,6 +1595,7 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
MVT OpVT = LHS.getSimpleValueType();
ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
NeedInvert = false;
+ bool NeedSwap = false;
switch (TLI.getCondCodeAction(CCCode, OpVT)) {
default: llvm_unreachable("Unknown condition code action!");
case TargetLowering::Legal:
@@ -1630,23 +1603,37 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
break;
case TargetLowering::Expand: {
ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
- if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+ if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
std::swap(LHS, RHS);
CC = DAG.getCondCode(InvCC);
return true;
}
+ // Swapping operands didn't work. Try inverting the condition.
+      InvCC = ISD::getSetCCInverse(CCCode, OpVT.isInteger());
+ if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+ // If inverting the condition is not enough, try swapping operands
+ // on top of it.
+ InvCC = ISD::getSetCCSwappedOperands(InvCC);
+ NeedSwap = true;
+ }
+ if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+ CC = DAG.getCondCode(InvCC);
+ NeedInvert = true;
+ if (NeedSwap)
+ std::swap(LHS, RHS);
+ return true;
+ }
+
ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
unsigned Opc = 0;
switch (CCCode) {
default: llvm_unreachable("Don't know how to expand this condition!");
case ISD::SETO:
- assert(TLI.getCondCodeAction(ISD::SETOEQ, OpVT)
- == TargetLowering::Legal
+ assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT)
&& "If SETO is expanded, SETOEQ must be legal!");
CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break;
case ISD::SETUO:
- assert(TLI.getCondCodeAction(ISD::SETUNE, OpVT)
- == TargetLowering::Legal
+ assert(TLI.isCondCodeLegal(ISD::SETUNE, OpVT)
&& "If SETUO is expanded, SETUNE must be legal!");
CC1 = ISD::SETUNE; CC2 = ISD::SETUNE; Opc = ISD::OR; break;
case ISD::SETOEQ:
@@ -1676,20 +1663,10 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
case ISD::SETGT:
case ISD::SETGE:
case ISD::SETLT:
- // We only support using the inverted operation, which is computed above
- // and not a different manner of supporting expanding these cases.
- llvm_unreachable("Don't know how to expand this condition!");
case ISD::SETNE:
case ISD::SETEQ:
- // Try inverting the result of the inverse condition.
- InvCC = CCCode == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
- if (TLI.isCondCodeLegal(InvCC, OpVT)) {
- CC = DAG.getCondCode(InvCC);
- NeedInvert = true;
- return true;
- }
- // If inverting the condition didn't work then we have no means to expand
- // the condition.
+ // If all combinations of inverting the condition and swapping operands
+ // didn't work then we have no means to expand the condition.
llvm_unreachable("Don't know how to expand this condition!");
}
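
The reworked LegalizeSetCCCondCode tries progressively cheaper rewrites before falling back to splitting the compare: swap the operands, invert the condition, invert and swap, and only then expand into two SETCCs joined by AND/OR. A compact C++ sketch of that cascade (the predicate and the swap/inverse helpers stand in for the TLI.isCondCodeLegalOrCustom and ISD:: queries; they are not the real APIs):

    #include <functional>
    #include <string>

    enum class Strategy { AsIs, SwapOperands, Invert, InvertAndSwap, Split };

    Strategy chooseSetCCStrategy(
        const std::string &CC,
        const std::function<bool(const std::string &)> &isLegal,
        const std::function<std::string(const std::string &)> &swapped,
        const std::function<std::string(const std::string &)> &inverse) {
      if (isLegal(CC))
        return Strategy::AsIs;
      if (isLegal(swapped(CC)))
        return Strategy::SwapOperands;     // swap LHS/RHS, keep the result
      if (isLegal(inverse(CC)))
        return Strategy::Invert;           // compute !CC, then invert the result
      if (isLegal(swapped(inverse(CC))))
        return Strategy::InvertAndSwap;    // both tricks together
      return Strategy::Split;              // e.g. SETO -> SETOEQ AND SETOEQ
    }
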
@@ -2036,12 +2013,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
if (!CallInfo.second.getNode()) {
- DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump());
+ LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump());
// It's a tailcall, return the chain (which is the DAG root).
return DAG.getRoot();
}
- DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump());
+ LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump());
return CallInfo.first;
}
@@ -2327,10 +2304,10 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
EVT DestVT,
const SDLoc &dl) {
// TODO: Should any fast-math-flags be set for the created nodes?
- DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
+ LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
- DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double "
- "expansion\n");
+ LLVM_DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double "
+ "expansion\n");
// Get the stack frame index of a 8 byte buffer.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
@@ -2395,7 +2372,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
// and in all alternate rounding modes.
// TODO: Generalize this for use with other types.
if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f64) {
- DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
+ LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
SDValue TwoP52 =
DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64);
SDValue TwoP84PlusTwoP52 =
@@ -2418,7 +2395,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
// TODO: Generalize this for use with other types.
if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) {
- DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
+ LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
// For unsigned conversions, convert them to signed conversions using the
// algorithm from the x86_64 __floatundidf in compiler_rt.
if (!isSigned) {
@@ -2853,7 +2830,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
}
bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
- DEBUG(dbgs() << "Trying to expand node\n");
+ LLVM_DEBUG(dbgs() << "Trying to expand node\n");
SmallVector<SDValue, 8> Results;
SDLoc dl(Node);
SDValue Tmp1, Tmp2, Tmp3, Tmp4;
@@ -3311,7 +3288,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
}
break;
case ISD::FP_TO_FP16:
- DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
+ LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
SDValue Op = Node->getOperand(0);
MVT SVT = Op.getSimpleValueType();
@@ -3525,15 +3502,25 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::USUBO: {
SDValue LHS = Node->getOperand(0);
SDValue RHS = Node->getOperand(1);
- SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
- ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
- LHS, RHS);
+ bool IsAdd = Node->getOpcode() == ISD::UADDO;
+ // If ADD/SUBCARRY is legal, use that instead.
+ unsigned OpcCarry = IsAdd ? ISD::ADDCARRY : ISD::SUBCARRY;
+ if (TLI.isOperationLegalOrCustom(OpcCarry, Node->getValueType(0))) {
+ SDValue CarryIn = DAG.getConstant(0, dl, Node->getValueType(1));
+ SDValue NodeCarry = DAG.getNode(OpcCarry, dl, Node->getVTList(),
+ { LHS, RHS, CarryIn });
+ Results.push_back(SDValue(NodeCarry.getNode(), 0));
+ Results.push_back(SDValue(NodeCarry.getNode(), 1));
+ break;
+ }
+
+ SDValue Sum = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl,
+ LHS.getValueType(), LHS, RHS);
Results.push_back(Sum);
EVT ResultType = Node->getValueType(1);
EVT SetCCType = getSetCCResultType(Node->getValueType(0));
- ISD::CondCode CC
- = Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;
+ ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);
Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType));
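
When no ADDCARRY/SUBCARRY node is available, the expansion computes the plain sum or difference and derives the overflow flag from an unsigned compare against the left operand. A scalar C++ model for 32-bit values (illustrative only):

    #include <cstdint>
    #include <utility>

    // UADDO: overflow iff the wrapped sum ends up below the left operand.
    std::pair<uint32_t, bool> expandUAddO(uint32_t a, uint32_t b) {
      uint32_t sum = a + b;      // ISD::ADD
      return {sum, sum < a};     // ISD::SETULT
    }

    // USUBO: borrow iff the wrapped difference ends up above the left operand.
    std::pair<uint32_t, bool> expandUSubO(uint32_t a, uint32_t b) {
      uint32_t diff = a - b;     // ISD::SUB
      return {diff, diff > a};   // ISD::SETUGT
    }
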
@@ -3684,8 +3671,17 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
unsigned EntrySize =
DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
- Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
- DAG.getConstant(EntrySize, dl, Index.getValueType()));
+ // For power-of-two jumptable entry sizes convert multiplication to a shift.
+ // This transformation needs to be done here since otherwise the MIPS
+ // backend will end up emitting a three instruction multiply sequence
+ // instead of a single shift and MSP430 will call a runtime function.
+ if (llvm::isPowerOf2_32(EntrySize))
+ Index = DAG.getNode(
+ ISD::SHL, dl, Index.getValueType(), Index,
+ DAG.getConstant(llvm::Log2_32(EntrySize), dl, Index.getValueType()));
+ else
+ Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
+ DAG.getConstant(EntrySize, dl, Index.getValueType()));
SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
Index, Table);
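
The index scaling above replaces a multiply by the jump-table entry size with a shift whenever the size is a power of two, so targets like MIPS or MSP430 do not pay for a real multiply (or a runtime call). A standalone C++ sketch of the choice (the helper name is made up):

    #include <cassert>
    #include <cstdint>

    uint64_t scaleJumpTableIndex(uint64_t index, uint32_t entrySize) {
      assert(entrySize != 0 && "jump table entries cannot be empty");
      if ((entrySize & (entrySize - 1)) == 0) {   // power of two?
        uint32_t log2 = 0;
        while ((1u << log2) != entrySize)
          ++log2;
        return index << log2;                     // ISD::SHL
      }
      return index * entrySize;                   // ISD::MUL
    }
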
@@ -3701,7 +3697,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
TLI.getPICJumpTableRelocBase(Table, DAG));
}
- Tmp1 = DAG.getNode(ISD::BRIND, dl, MVT::Other, LD.getValue(1), Addr);
+
+ Tmp1 = TLI.expandIndirectJTBranch(dl, LD.getValue(1), Addr, DAG);
Results.push_back(Tmp1);
break;
}
@@ -3720,7 +3717,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (Tmp2.isUndef() ||
(Tmp2.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(Tmp2.getOperand(1)) &&
- dyn_cast<ConstantSDNode>(Tmp2.getOperand(1))->getZExtValue() == 1))
+ cast<ConstantSDNode>(Tmp2.getOperand(1))->getZExtValue() == 1))
Tmp3 = Tmp2;
else
Tmp3 = DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
@@ -3759,7 +3756,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// illegal; expand it into a SELECT_CC.
EVT VT = Node->getValueType(0);
int TrueValue;
- switch (TLI.getBooleanContents(Tmp1->getValueType(0))) {
+ switch (TLI.getBooleanContents(Tmp1.getValueType())) {
case TargetLowering::ZeroOrOneBooleanContent:
case TargetLowering::UndefinedBooleanContent:
TrueValue = 1;
@@ -3784,7 +3781,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SDValue CC = Node->getOperand(4);
ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();
- if (TLI.isCondCodeLegal(CCOp, Tmp1.getSimpleValueType())) {
+ if (TLI.isCondCodeLegalOrCustom(CCOp, Tmp1.getSimpleValueType())) {
// If the condition code is legal, then we need to expand this
// node using SETCC and SELECT.
EVT CmpVT = Tmp1.getValueType();
@@ -3805,7 +3802,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// version (or vice versa).
ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp,
Tmp1.getValueType().isInteger());
- if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
+ if (TLI.isCondCodeLegalOrCustom(InvCC, Tmp1.getSimpleValueType())) {
// Use the new condition code and swap true and false
Legalized = true;
Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
@@ -3813,7 +3810,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// If The inverse is not legal, then try to swap the arguments using
// the inverse condition code.
ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
- if (TLI.isCondCodeLegal(SwapInvCC, Tmp1.getSimpleValueType())) {
+ if (TLI.isCondCodeLegalOrCustom(SwapInvCC, Tmp1.getSimpleValueType())) {
// The swapped inverse condition is legal, so swap true and false,
// lhs and rhs.
Legalized = true;
@@ -3906,6 +3903,46 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
ReplaceNode(SDValue(Node, 0), Result);
break;
}
+ case ISD::ROTL:
+ case ISD::ROTR: {
+ bool IsLeft = Node->getOpcode() == ISD::ROTL;
+ SDValue Op0 = Node->getOperand(0), Op1 = Node->getOperand(1);
+ EVT ResVT = Node->getValueType(0);
+ EVT OpVT = Op0.getValueType();
+ assert(OpVT == ResVT &&
+ "The result and the operand types of rotate should match");
+ EVT ShVT = Op1.getValueType();
+ SDValue Width = DAG.getConstant(OpVT.getScalarSizeInBits(), dl, ShVT);
+
+ // If a rotate in the other direction is legal, use it.
+ unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL;
+ if (TLI.isOperationLegal(RevRot, ResVT)) {
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1);
+ Results.push_back(DAG.getNode(RevRot, dl, ResVT, Op0, Sub));
+ break;
+ }
+
+ // Otherwise,
+ // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1)))
+ // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1)))
+ //
+ assert(isPowerOf2_32(OpVT.getScalarSizeInBits()) &&
+ "Expecting the type bitwidth to be a power of 2");
+ unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL;
+ unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL;
+ SDValue Width1 = DAG.getNode(ISD::SUB, dl, ShVT,
+ Width, DAG.getConstant(1, dl, ShVT));
+ SDValue NegOp1 = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1);
+ SDValue And0 = DAG.getNode(ISD::AND, dl, ShVT, Op1, Width1);
+ SDValue And1 = DAG.getNode(ISD::AND, dl, ShVT, NegOp1, Width1);
+
+ SDValue Or = DAG.getNode(ISD::OR, dl, ResVT,
+ DAG.getNode(ShOpc, dl, ResVT, Op0, And0),
+ DAG.getNode(HsOpc, dl, ResVT, Op0, And1));
+ Results.push_back(Or);
+ break;
+ }
+
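
The generic rotate expansion uses the masked-shift identity quoted in the comment. For a 32-bit value it boils down to the following standalone C++, which is well defined for every shift amount because both shifts are masked with w-1:

    #include <cstdint>

    // rotl(x, c) -> (x << (c & 31)) | (x >> ((32 - c) & 31))
    uint32_t expandRotl(uint32_t x, uint32_t c) {
      return (x << (c & 31)) | (x >> ((32u - c) & 31));
    }

    // rotr(x, c) -> (x >> (c & 31)) | (x << ((32 - c) & 31))
    uint32_t expandRotr(uint32_t x, uint32_t c) {
      return (x >> (c & 31)) | (x << ((32u - c) & 31));
    }
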
case ISD::GLOBAL_OFFSET_TABLE:
case ISD::GlobalAddress:
case ISD::GlobalTLSAddress:
@@ -3921,19 +3958,21 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// Replace the original node with the legalized result.
if (Results.empty()) {
- DEBUG(dbgs() << "Cannot expand node\n");
+ LLVM_DEBUG(dbgs() << "Cannot expand node\n");
return false;
}
- DEBUG(dbgs() << "Succesfully expanded node\n");
+ LLVM_DEBUG(dbgs() << "Succesfully expanded node\n");
ReplaceNode(Node, Results.data());
return true;
}
void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
- DEBUG(dbgs() << "Trying to convert node to libcall\n");
+ LLVM_DEBUG(dbgs() << "Trying to convert node to libcall\n");
SmallVector<SDValue, 8> Results;
SDLoc dl(Node);
+ // FIXME: Check flags on the node to see if we can use a finite call.
+ bool CanUseFiniteLibCall = TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath;
unsigned Opc = Node->getOpcode();
switch (Opc) {
case ISD::ATOMIC_FENCE: {
@@ -3962,6 +4001,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
@@ -4028,33 +4068,68 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
break;
case ISD::FLOG:
case ISD::STRICT_FLOG:
- Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
- RTLIB::LOG_F80, RTLIB::LOG_F128,
- RTLIB::LOG_PPCF128));
+ if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log_finite))
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_FINITE_F32,
+ RTLIB::LOG_FINITE_F64,
+ RTLIB::LOG_FINITE_F80,
+ RTLIB::LOG_FINITE_F128,
+ RTLIB::LOG_FINITE_PPCF128));
+ else
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
+ RTLIB::LOG_F80, RTLIB::LOG_F128,
+ RTLIB::LOG_PPCF128));
break;
case ISD::FLOG2:
case ISD::STRICT_FLOG2:
- Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
- RTLIB::LOG2_F80, RTLIB::LOG2_F128,
- RTLIB::LOG2_PPCF128));
+ if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log2_finite))
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_FINITE_F32,
+ RTLIB::LOG2_FINITE_F64,
+ RTLIB::LOG2_FINITE_F80,
+ RTLIB::LOG2_FINITE_F128,
+ RTLIB::LOG2_FINITE_PPCF128));
+ else
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80, RTLIB::LOG2_F128,
+ RTLIB::LOG2_PPCF128));
break;
case ISD::FLOG10:
case ISD::STRICT_FLOG10:
- Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
- RTLIB::LOG10_F80, RTLIB::LOG10_F128,
- RTLIB::LOG10_PPCF128));
+ if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log10_finite))
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_FINITE_F32,
+ RTLIB::LOG10_FINITE_F64,
+ RTLIB::LOG10_FINITE_F80,
+ RTLIB::LOG10_FINITE_F128,
+ RTLIB::LOG10_FINITE_PPCF128));
+ else
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80, RTLIB::LOG10_F128,
+ RTLIB::LOG10_PPCF128));
break;
case ISD::FEXP:
case ISD::STRICT_FEXP:
- Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
- RTLIB::EXP_F80, RTLIB::EXP_F128,
- RTLIB::EXP_PPCF128));
+ if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_exp_finite))
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_FINITE_F32,
+ RTLIB::EXP_FINITE_F64,
+ RTLIB::EXP_FINITE_F80,
+ RTLIB::EXP_FINITE_F128,
+ RTLIB::EXP_FINITE_PPCF128));
+ else
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
+ RTLIB::EXP_F80, RTLIB::EXP_F128,
+ RTLIB::EXP_PPCF128));
break;
case ISD::FEXP2:
case ISD::STRICT_FEXP2:
- Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
- RTLIB::EXP2_F80, RTLIB::EXP2_F128,
- RTLIB::EXP2_PPCF128));
+ if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_exp2_finite))
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_FINITE_F32,
+ RTLIB::EXP2_FINITE_F64,
+ RTLIB::EXP2_FINITE_F80,
+ RTLIB::EXP2_FINITE_F128,
+ RTLIB::EXP2_FINITE_PPCF128));
+ else
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80, RTLIB::EXP2_F128,
+ RTLIB::EXP2_PPCF128));
break;
case ISD::FTRUNC:
Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
@@ -4100,9 +4175,16 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
break;
case ISD::FPOW:
case ISD::STRICT_FPOW:
- Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
- RTLIB::POW_F80, RTLIB::POW_F128,
- RTLIB::POW_PPCF128));
+ if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_pow_finite))
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_FINITE_F32,
+ RTLIB::POW_FINITE_F64,
+ RTLIB::POW_FINITE_F80,
+ RTLIB::POW_FINITE_F128,
+ RTLIB::POW_FINITE_PPCF128));
+ else
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
+ RTLIB::POW_F80, RTLIB::POW_F128,
+ RTLIB::POW_PPCF128));
break;
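
Each of the FLOG/FEXP/FPOW cases above applies the same rule: when the target options allow assuming no NaNs and no infinities and the math library provides the GNU finite entry points, call __<name>_finite instead of the plain libm routine. A tiny C++ sketch of that selection (the boolean parameters are stand-ins for TM.Options.NoInfsFPMath/NoNaNsFPMath and TargetLibraryInfo::has):

    #include <string>

    std::string pickLibcall(const std::string &name, bool noNaNs, bool noInfs,
                            bool hasFiniteVariant) {
      if (noNaNs && noInfs && hasFiniteVariant)
        return "__" + name + "_finite";  // e.g. __exp_finite, __pow_finite
      return name;                       // plain libm call, e.g. exp, pow
    }
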
case ISD::FDIV:
Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
@@ -4186,10 +4268,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
// Replace the original node with the legalized result.
if (!Results.empty()) {
- DEBUG(dbgs() << "Successfully converted node to libcall\n");
+ LLVM_DEBUG(dbgs() << "Successfully converted node to libcall\n");
ReplaceNode(Node, Results.data());
} else
- DEBUG(dbgs() << "Could not convert node to libcall\n");
+ LLVM_DEBUG(dbgs() << "Could not convert node to libcall\n");
}
// Determine the vector type to use in place of an original scalar element when
@@ -4203,7 +4285,7 @@ static MVT getPromotedVectorElementType(const TargetLowering &TLI,
}
void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
- DEBUG(dbgs() << "Trying to promote node\n");
+ LLVM_DEBUG(dbgs() << "Trying to promote node\n");
SmallVector<SDValue, 8> Results;
MVT OVT = Node->getSimpleValueType(0);
if (Node->getOpcode() == ISD::UINT_TO_FP ||
@@ -4256,7 +4338,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
ISD::SRL, dl, NVT, Tmp1,
DAG.getConstant(DiffBits, dl,
TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
- Results.push_back(Tmp1);
+
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
break;
}
case ISD::FP_TO_UINT:
@@ -4640,10 +4723,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
// Replace the original node with the legalized result.
if (!Results.empty()) {
- DEBUG(dbgs() << "Successfully promoted node\n");
+ LLVM_DEBUG(dbgs() << "Successfully promoted node\n");
ReplaceNode(Node, Results.data());
} else
- DEBUG(dbgs() << "Could not promote node\n");
+ LLVM_DEBUG(dbgs() << "Could not promote node\n");
}
/// This is the entry point for the file.
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index e28a3aa47ca3..b0ae1e0399fb 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -47,8 +47,8 @@ static RTLIB::Libcall GetFPLibCall(EVT VT,
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue R = SDValue();
switch (N->getOpcode()) {
@@ -738,8 +738,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
- DEBUG(dbgs() << "Soften float operand " << OpNo << ": "; N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Soften float operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue Res = SDValue();
switch (N->getOpcode()) {
@@ -1039,7 +1039,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
/// have invalid operands or may have other results that need promotion, we just
/// know that (at least) one result needs expansion.
void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Expand float result: "; N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Expand float result: "; N->dump(&DAG); dbgs() << "\n");
SDValue Lo, Hi;
Lo = Hi = SDValue();
@@ -1538,7 +1538,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
/// types of the node are known to be legal, but other operands of the node may
/// need promotion or expansion as well as the specified one.
bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
- DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom expand this node.
@@ -1658,18 +1658,6 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
EVT RVT = N->getValueType(0);
SDLoc dl(N);
- // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
- // PPC (the libcall is not available). FIXME: Do this in a less hacky way.
- if (RVT == MVT::i32) {
- assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
- "Logic only correct for ppcf128!");
- SDValue Res = DAG.getNode(ISD::FP_ROUND_INREG, dl, MVT::ppcf128,
- N->getOperand(0), DAG.getValueType(MVT::f64));
- Res = DAG.getNode(ISD::FP_ROUND, dl, MVT::f64, Res,
- DAG.getIntPtrConstant(1, dl));
- return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
- }
-
RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), false, dl).first;
@@ -1679,31 +1667,6 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
EVT RVT = N->getValueType(0);
SDLoc dl(N);
- // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
- // PPC (the libcall is not available). FIXME: Do this in a less hacky way.
- if (RVT == MVT::i32) {
- assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
- "Logic only correct for ppcf128!");
- const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
- APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
- SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
- // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
- // FIXME: generated code sucks.
- // TODO: Are there fast-math-flags to propagate to this FSUB?
- return DAG.getSelectCC(dl, N->getOperand(0), Tmp,
- DAG.getNode(ISD::ADD, dl, MVT::i32,
- DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
- DAG.getNode(ISD::FSUB, dl,
- MVT::ppcf128,
- N->getOperand(0),
- Tmp)),
- DAG.getConstant(0x80000000, dl,
- MVT::i32)),
- DAG.getNode(ISD::FP_TO_SINT, dl,
- MVT::i32, N->getOperand(0)),
- ISD::SETGE);
- }
-
RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0),
@@ -2139,13 +2102,12 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
// Load the value as an integer value with the same number of bits.
EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
- auto MMOFlags =
- L->getMemOperand()->getFlags() &
- ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
SDValue newL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), IVT,
SDLoc(N), L->getChain(), L->getBasePtr(),
L->getOffset(), L->getPointerInfo(), IVT,
- L->getAlignment(), MMOFlags, L->getAAInfo());
+ L->getAlignment(),
+ L->getMemOperand()->getFlags(),
+ L->getAAInfo());
// Legalize the chain result by replacing uses of the old value chain with the
// new one
ReplaceValueWith(SDValue(N, 1), newL.getValue(1));
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 29f0bb475b08..63a1ea13a5f5 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -36,12 +36,13 @@ using namespace llvm;
/// may also have invalid operands or may have other results that need
/// expansion, we just know that (at least) one result needs promotion.
void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
- DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
return;
}
@@ -146,6 +147,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
@@ -501,7 +503,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
SDLoc dl(N);
SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(),
- N->getIndex()};
+ N->getIndex(), N->getScale() };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
N->getMemOperand());
@@ -586,43 +588,39 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
- EVT SVT = getSetCCResultType(N->getOperand(0).getValueType());
-
+ EVT InVT = N->getOperand(0).getValueType();
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- // Only use the result of getSetCCResultType if it is legal,
- // otherwise just use the promoted result type (NVT).
- if (!TLI.isTypeLegal(SVT))
- SVT = NVT;
+ EVT SVT = getSetCCResultType(InVT);
+
+  // If we got back a type that needs to be promoted, this likely means
+  // the input type also needs to be promoted. So get the promoted type for
+ // the input and try the query again.
+ if (getTypeAction(SVT) == TargetLowering::TypePromoteInteger) {
+ if (getTypeAction(InVT) == TargetLowering::TypePromoteInteger) {
+ InVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+ SVT = getSetCCResultType(InVT);
+ } else {
+ // Input type isn't promoted, just use the default promoted type.
+ SVT = NVT;
+ }
+ }
SDLoc dl(N);
assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() &&
"Vector compare must return a vector result!");
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- if (LHS.getValueType() != RHS.getValueType()) {
- if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger &&
- !LHS.getValueType().isVector())
- LHS = GetPromotedInteger(LHS);
- if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger &&
- !RHS.getValueType().isVector())
- RHS = GetPromotedInteger(RHS);
- }
-
// Get the SETCC result using the canonical SETCC type.
- SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS,
- N->getOperand(2));
+ SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, N->getOperand(0),
+ N->getOperand(1), N->getOperand(2));
// Convert to the expected type.
return DAG.getSExtOrTrunc(SetCC, dl, NVT);
}
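
Conceptually, the new PromoteIntRes_SETCC computes the compare in the canonical setcc result type and then converts to the promoted result type. A rough scalar model of that last step, assuming a zero-or-all-ones boolean representation in the promoted type (plain C++, not the actual DAG lowering):

#include <cassert>
#include <cstdint>

// Compare in the canonical type, then materialize the boolean in the
// promoted result type as 0 or -1 (all ones).
int32_t promoted_setcc_lt(int16_t lhs, int16_t rhs) {
  bool cmp = lhs < rhs; // compare in the canonical compare type
  return cmp ? -1 : 0;  // sign-extended boolean in the promoted type
}

int main() {
  assert(promoted_setcc_lt(-3, 5) == -1);
  assert(promoted_setcc_lt(7, 5) == 0);
  return 0;
}
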
SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
- SDValue LHS = N->getOperand(0);
+ SDValue LHS = GetPromotedInteger(N->getOperand(0));
SDValue RHS = N->getOperand(1);
- if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger)
- LHS = GetPromotedInteger(LHS);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
return DAG.getNode(ISD::SHL, SDLoc(N), LHS.getValueType(), LHS, RHS);
@@ -661,22 +659,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
// The input value must be properly sign extended.
- if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger)
- LHS = SExtPromotedInteger(LHS);
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
return DAG.getNode(ISD::SRA, SDLoc(N), LHS.getValueType(), LHS, RHS);
}
SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
// The input value must be properly zero extended.
- if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger)
- LHS = ZExtPromotedInteger(LHS);
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
return DAG.getNode(ISD::SRL, SDLoc(N), LHS.getValueType(), LHS, RHS);
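
The SRA/SRL hunks above assume the promoted left-hand side carries the right extension: an arithmetic shift must see a sign-extended value, a logical shift a zero-extended one, or the high bits of the promoted register would leak into the result. A standalone sketch of promoting i8 shifts to i32 (plain C++ stand-ins for SExtPromotedInteger/ZExtPromotedInteger):

#include <cassert>
#include <cstdint>

int8_t promoted_sra(int8_t v, unsigned amt) {
  int32_t wide = static_cast<int32_t>(v); // sign extend, like SExtPromotedInteger
  return static_cast<int8_t>(wide >> amt);
}

uint8_t promoted_srl(uint8_t v, unsigned amt) {
  uint32_t wide = static_cast<uint32_t>(v); // zero extend, like ZExtPromotedInteger
  return static_cast<uint8_t>(wide >> amt);
}

int main() {
  assert(promoted_sra(int8_t(-16), 2) == int8_t(-4)); // sign bits shift in
  assert(promoted_srl(uint8_t(0xF0), 4) == 0x0F);     // zeros shift in
  return 0;
}
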
@@ -904,11 +898,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
/// result types of the node are known to be legal, but other operands of the
/// node may need promotion or expansion as well as the specified one.
bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
- DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue Res = SDValue();
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
- DEBUG(dbgs() << "Node has been custom lowered, done\n");
+ LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
return false;
}
@@ -1001,11 +996,11 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
  // than the width of NewLHS/NewRHS, we can avoid inserting a real truncate
  // instruction, which would eventually be redundant.
unsigned OpLEffectiveBits =
- OpL.getValueSizeInBits() - DAG.ComputeNumSignBits(OpL) + 1;
+ OpL.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(OpL) + 1;
unsigned OpREffectiveBits =
- OpR.getValueSizeInBits() - DAG.ComputeNumSignBits(OpR) + 1;
- if (OpLEffectiveBits <= NewLHS.getValueSizeInBits() &&
- OpREffectiveBits <= NewRHS.getValueSizeInBits()) {
+ OpR.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(OpR) + 1;
+ if (OpLEffectiveBits <= NewLHS.getScalarValueSizeInBits() &&
+ OpREffectiveBits <= NewRHS.getScalarValueSizeInBits()) {
NewLHS = OpL;
NewRHS = OpR;
} else {
@@ -1356,7 +1351,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) {
/// have invalid operands or may have other results that need promotion, we just
/// know that (at least) one result needs expansion.
void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue Lo, Hi;
Lo = Hi = SDValue();
@@ -1413,6 +1409,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
@@ -2893,7 +2890,8 @@ void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N,
/// result types of the node are known to be legal, but other operands of the
/// node may need promotion or expansion as well as the specified one.
bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
- DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue Res = SDValue();
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
@@ -2915,7 +2913,6 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break;
case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break;
case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break;
- case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break;
case ISD::SETCCCARRY: Res = ExpandIntOp_SETCCCARRY(N); break;
case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break;
case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
@@ -3051,15 +3048,14 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
return;
}
- // Lower with SETCCE or SETCCCARRY if the target supports it.
+ // Lower with SETCCCARRY if the target supports it.
EVT HiVT = LHSHi.getValueType();
EVT ExpandVT = TLI.getTypeToExpandTo(*DAG.getContext(), HiVT);
bool HasSETCCCARRY = TLI.isOperationLegalOrCustom(ISD::SETCCCARRY, ExpandVT);
// FIXME: Make all targets support this, then remove the other lowering.
- if (HasSETCCCARRY ||
- TLI.getOperationAction(ISD::SETCCE, ExpandVT) == TargetLowering::Custom) {
- // SETCCE/SETCCCARRY can detect < and >= directly. For > and <=, flip
+ if (HasSETCCCARRY) {
+ // SETCCCARRY can detect < and >= directly. For > and <=, flip
// operands and condition code.
bool FlipOperands = false;
switch (CCCode) {
@@ -3074,17 +3070,15 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
std::swap(LHSHi, RHSHi);
}
// Perform a wide subtraction, feeding the carry from the low part into
- // SETCCE/SETCCCARRY. The SETCCE/SETCCCARRY operation is essentially
- // looking at the high part of the result of LHS - RHS. It is negative
- // iff LHS < RHS. It is zero or positive iff LHS >= RHS.
+ // SETCCCARRY. The SETCCCARRY operation is essentially looking at the high
+ // part of the result of LHS - RHS. It is negative iff LHS < RHS. It is
+ // zero or positive iff LHS >= RHS.
EVT LoVT = LHSLo.getValueType();
- SDVTList VTList = DAG.getVTList(
- LoVT, HasSETCCCARRY ? getSetCCResultType(LoVT) : MVT::Glue);
- SDValue LowCmp = DAG.getNode(HasSETCCCARRY ? ISD::USUBO : ISD::SUBC, dl,
- VTList, LHSLo, RHSLo);
- SDValue Res = DAG.getNode(HasSETCCCARRY ? ISD::SETCCCARRY : ISD::SETCCE, dl,
- getSetCCResultType(HiVT), LHSHi, RHSHi,
- LowCmp.getValue(1), DAG.getCondCode(CCCode));
+ SDVTList VTList = DAG.getVTList(LoVT, getSetCCResultType(LoVT));
+ SDValue LowCmp = DAG.getNode(ISD::USUBO, dl, VTList, LHSLo, RHSLo);
+ SDValue Res = DAG.getNode(ISD::SETCCCARRY, dl, getSetCCResultType(HiVT),
+ LHSHi, RHSHi, LowCmp.getValue(1),
+ DAG.getCondCode(CCCode));
NewLHS = Res;
NewRHS = SDValue();
return;
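
A standalone model of the SETCCCARRY lowering above: compare two 64-bit values on a notional 32-bit machine by subtracting the low halves, feeding the borrow into a subtract of the high halves, and checking whether the high part borrows out. This is plain C++ with explicit borrow handling (the unsigned-less-than case), not DAG code:

#include <cassert>
#include <cstdint>

// LHS < RHS (unsigned) iff the wide subtraction LHS - RHS borrows out the top.
bool ult64_from_halves(uint32_t lhslo, uint32_t lhshi,
                       uint32_t rhslo, uint32_t rhshi) {
  bool borrow = lhslo < rhslo; // USUBO: low difference plus a borrow flag
  // SETCCCARRY-style: subtract the high halves with the incoming borrow and
  // see whether that subtraction borrows again.
  uint64_t hi = uint64_t(lhshi) - uint64_t(rhshi) - uint64_t(borrow);
  return (hi >> 32) != 0;
}

int main() {
  uint64_t tests[] = {0, 1, 0xFFFFFFFFull, 0x100000000ull, 0x1FFFFFFFFull,
                      0xFFFFFFFFFFFFFFFFull};
  for (uint64_t a : tests)
    for (uint64_t b : tests)
      assert(ult64_from_halves(uint32_t(a), uint32_t(a >> 32),
                               uint32_t(b), uint32_t(b >> 32)) == (a < b));
  return 0;
}
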
@@ -3152,24 +3146,6 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0);
}
-SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- SDValue Carry = N->getOperand(2);
- SDValue Cond = N->getOperand(3);
- SDLoc dl = SDLoc(N);
-
- SDValue LHSLo, LHSHi, RHSLo, RHSHi;
- GetExpandedInteger(LHS, LHSLo, LHSHi);
- GetExpandedInteger(RHS, RHSLo, RHSHi);
-
- // Expand to a SUBE for the low part and a smaller SETCCE for the high.
- SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue);
- SDValue LowCmp = DAG.getNode(ISD::SUBE, dl, VTList, LHSLo, RHSLo, Carry);
- return DAG.getNode(ISD::SETCCE, dl, N->getValueType(0), LHSHi, RHSHi,
- LowCmp.getValue(1), Cond);
-}
-
SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -3497,21 +3473,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
assert(NumElem * NumOperands == NumOutElem &&
"Unexpected number of elements");
- // If the input type is legal and we can promote it to a legal type with the
- // same element size, go ahead do that to create a new concat.
- if (getTypeAction(N->getOperand(0).getValueType()) ==
- TargetLowering::TypeLegal) {
- EVT InPromotedTy = EVT::getVectorVT(*DAG.getContext(), OutElemTy, NumElem);
- if (TLI.isTypeLegal(InPromotedTy)) {
- SmallVector<SDValue, 8> Ops(NumOperands);
- for (unsigned i = 0; i < NumOperands; ++i) {
- Ops[i] = DAG.getNode(ISD::ANY_EXTEND, dl, InPromotedTy,
- N->getOperand(i));
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, NOutVT, Ops);
- }
- }
-
// Take the elements from the first vector.
SmallVector<SDValue, 8> Ops(NumOutElem);
for (unsigned i = 0; i < NumOperands; ++i) {
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 4438ee7878b8..a9f144c06e9a 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -84,9 +84,11 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
SDValue Res(&Node, i);
EVT VT = Res.getValueType();
bool Failed = false;
+      // Don't create a new map entry just for this lookup.
+ auto ResId = (ValueToIdMap.count(Res)) ? ValueToIdMap[Res] : 0;
unsigned Mapped = 0;
- if (ReplacedValues.find(Res) != ReplacedValues.end()) {
+ if (ResId && (ReplacedValues.find(ResId) != ReplacedValues.end())) {
Mapped |= 1;
// Check that remapped values are only used by nodes marked NewNode.
for (SDNode::use_iterator UI = Node.use_begin(), UE = Node.use_end();
@@ -97,30 +99,32 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
// Check that the final result of applying ReplacedValues is not
// marked NewNode.
- SDValue NewVal = ReplacedValues[Res];
- DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(NewVal);
+ auto NewValId = ReplacedValues[ResId];
+ auto I = ReplacedValues.find(NewValId);
while (I != ReplacedValues.end()) {
- NewVal = I->second;
- I = ReplacedValues.find(NewVal);
+ NewValId = I->second;
+ I = ReplacedValues.find(NewValId);
}
+ SDValue NewVal = getSDValue(NewValId);
+ (void)NewVal;
assert(NewVal.getNode()->getNodeId() != NewNode &&
"ReplacedValues maps to a new node!");
}
- if (PromotedIntegers.find(Res) != PromotedIntegers.end())
+ if (ResId && PromotedIntegers.find(ResId) != PromotedIntegers.end())
Mapped |= 2;
- if (SoftenedFloats.find(Res) != SoftenedFloats.end())
+ if (ResId && SoftenedFloats.find(ResId) != SoftenedFloats.end())
Mapped |= 4;
- if (ScalarizedVectors.find(Res) != ScalarizedVectors.end())
+ if (ResId && ScalarizedVectors.find(ResId) != ScalarizedVectors.end())
Mapped |= 8;
- if (ExpandedIntegers.find(Res) != ExpandedIntegers.end())
+ if (ResId && ExpandedIntegers.find(ResId) != ExpandedIntegers.end())
Mapped |= 16;
- if (ExpandedFloats.find(Res) != ExpandedFloats.end())
+ if (ResId && ExpandedFloats.find(ResId) != ExpandedFloats.end())
Mapped |= 32;
- if (SplitVectors.find(Res) != SplitVectors.end())
+ if (ResId && SplitVectors.find(ResId) != SplitVectors.end())
Mapped |= 64;
- if (WidenedVectors.find(Res) != WidenedVectors.end())
+ if (ResId && WidenedVectors.find(ResId) != WidenedVectors.end())
Mapped |= 128;
- if (PromotedFloats.find(Res) != PromotedFloats.end())
+ if (ResId && PromotedFloats.find(ResId) != PromotedFloats.end())
Mapped |= 256;
if (Node.getNodeId() != Processed) {
@@ -224,9 +228,9 @@ bool DAGTypeLegalizer::run() {
assert(N->getNodeId() == ReadyToProcess &&
"Node should be ready if on worklist!");
- DEBUG(dbgs() << "Legalizing node: "; N->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Legalizing node: "; N->dump(&DAG));
if (IgnoreNodeResults(N)) {
- DEBUG(dbgs() << "Ignoring node results\n");
+ LLVM_DEBUG(dbgs() << "Ignoring node results\n");
goto ScanOperands;
}
@@ -234,11 +238,11 @@ bool DAGTypeLegalizer::run() {
// types are illegal.
for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) {
EVT ResultVT = N->getValueType(i);
- DEBUG(dbgs() << "Analyzing result type: " <<
- ResultVT.getEVTString() << "\n");
+ LLVM_DEBUG(dbgs() << "Analyzing result type: " << ResultVT.getEVTString()
+ << "\n");
switch (getTypeAction(ResultVT)) {
case TargetLowering::TypeLegal:
- DEBUG(dbgs() << "Legal result type\n");
+ LLVM_DEBUG(dbgs() << "Legal result type\n");
break;
// The following calls must take care of *all* of the node's results,
// not just the illegal result they were passed (this includes results
@@ -296,11 +300,11 @@ ScanOperands:
continue;
const auto Op = N->getOperand(i);
- DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG));
EVT OpVT = Op.getValueType();
switch (getTypeAction(OpVT)) {
case TargetLowering::TypeLegal:
- DEBUG(dbgs() << "Legal operand\n");
+ LLVM_DEBUG(dbgs() << "Legal operand\n");
continue;
// The following calls must either replace all of the node's results
// using ReplaceValueWith, and return "false"; or update the node's
@@ -370,7 +374,8 @@ ScanOperands:
}
if (i == NumOperands) {
- DEBUG(dbgs() << "Legally typed node: "; N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Legally typed node: "; N->dump(&DAG);
+ dbgs() << "\n");
}
}
NodeDone:
@@ -490,9 +495,6 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
if (N->getNodeId() != NewNode && N->getNodeId() != Unanalyzed)
return N;
- // Remove any stale map entries.
- ExpungeNode(N);
-
// Okay, we know that this node is new. Recursively walk all of its operands
// to see if they are new also. The depth of this walk is bounded by the size
// of the new tree that was constructed (usually 2-3 nodes), so we don't worry
@@ -543,7 +545,6 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
// to remap the operands, since they are the same as the operands we
// remapped above.
N = M;
- ExpungeNode(N);
}
}
@@ -564,100 +565,25 @@ void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
RemapValue(Val);
}
-/// If N has a bogus mapping in ReplacedValues, eliminate it.
-/// This can occur when a node is deleted then reallocated as a new node -
-/// the mapping in ReplacedValues applies to the deleted node, not the new
-/// one.
-/// The only map that can have a deleted node as a source is ReplacedValues.
-/// Other maps can have deleted nodes as targets, but since their looked-up
-/// values are always immediately remapped using RemapValue, resulting in a
-/// not-deleted node, this is harmless as long as ReplacedValues/RemapValue
-/// always performs correct mappings. In order to keep the mapping correct,
-/// ExpungeNode should be called on any new nodes *before* adding them as
-/// either source or target to ReplacedValues (which typically means calling
-/// Expunge when a new node is first seen, since it may no longer be marked
-/// NewNode by the time it is added to ReplacedValues).
-void DAGTypeLegalizer::ExpungeNode(SDNode *N) {
- if (N->getNodeId() != NewNode)
- return;
-
- // If N is not remapped by ReplacedValues then there is nothing to do.
- unsigned i, e;
- for (i = 0, e = N->getNumValues(); i != e; ++i)
- if (ReplacedValues.find(SDValue(N, i)) != ReplacedValues.end())
- break;
-
- if (i == e)
- return;
-
- // Remove N from all maps - this is expensive but rare.
-
- for (DenseMap<SDValue, SDValue>::iterator I = PromotedIntegers.begin(),
- E = PromotedIntegers.end(); I != E; ++I) {
- assert(I->first.getNode() != N);
- RemapValue(I->second);
- }
-
- for (DenseMap<SDValue, SDValue>::iterator I = SoftenedFloats.begin(),
- E = SoftenedFloats.end(); I != E; ++I) {
- assert(I->first.getNode() != N);
- RemapValue(I->second);
- }
-
- for (DenseMap<SDValue, SDValue>::iterator I = ScalarizedVectors.begin(),
- E = ScalarizedVectors.end(); I != E; ++I) {
- assert(I->first.getNode() != N);
- RemapValue(I->second);
- }
-
- for (DenseMap<SDValue, SDValue>::iterator I = WidenedVectors.begin(),
- E = WidenedVectors.end(); I != E; ++I) {
- assert(I->first.getNode() != N);
- RemapValue(I->second);
- }
-
- for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
- I = ExpandedIntegers.begin(), E = ExpandedIntegers.end(); I != E; ++I){
- assert(I->first.getNode() != N);
- RemapValue(I->second.first);
- RemapValue(I->second.second);
- }
-
- for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
- I = ExpandedFloats.begin(), E = ExpandedFloats.end(); I != E; ++I) {
- assert(I->first.getNode() != N);
- RemapValue(I->second.first);
- RemapValue(I->second.second);
- }
-
- for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
- I = SplitVectors.begin(), E = SplitVectors.end(); I != E; ++I) {
- assert(I->first.getNode() != N);
- RemapValue(I->second.first);
- RemapValue(I->second.second);
- }
-
- for (DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.begin(),
- E = ReplacedValues.end(); I != E; ++I)
- RemapValue(I->second);
-
- for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
- ReplacedValues.erase(SDValue(N, i));
-}
-
/// If the specified value was already legalized to another value,
/// replace it by that value.
-void DAGTypeLegalizer::RemapValue(SDValue &N) {
- DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(N);
+void DAGTypeLegalizer::RemapValue(SDValue &V) {
+ auto Id = getTableId(V);
+ V = getSDValue(Id);
+}
+
+void DAGTypeLegalizer::RemapId(TableId &Id) {
+ auto I = ReplacedValues.find(Id);
if (I != ReplacedValues.end()) {
+ assert(Id != I->second && "Id is mapped to itself.");
// Use path compression to speed up future lookups if values get multiply
// replaced with other values.
- RemapValue(I->second);
- N = I->second;
+ RemapId(I->second);
+ Id = I->second;
- // Note that it is possible to have N.getNode()->getNodeId() == NewNode at
- // this point because it is possible for a node to be put in the map before
- // being processed.
+    // Note that with N = IdToValueMap[Id] it is possible to have
+ // N.getNode()->getNodeId() == NewNode at this point because it is possible
+ // for a node to be put in the map before being processed.
}
}
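
The rewritten RemapValue/RemapId pair replaces the old SDValue-to-SDValue maps with small integer ids plus a ReplacedValues table that is followed, and compressed, on lookup. A detached sketch of the same find-with-path-compression idea over a plain map; the names are illustrative, not the LegalizeTypes API:

#include <cassert>
#include <unordered_map>

using TableId = unsigned;

// ReplacedValues-style table: id -> the id that superseded it.
std::unordered_map<TableId, TableId> ReplacedIds;

// Follow replacement chains, compressing the path so later lookups are short.
void remapId(TableId &Id) {
  auto It = ReplacedIds.find(Id);
  if (It == ReplacedIds.end())
    return;
  remapId(It->second); // resolve to the final id
  Id = It->second;     // path compression: point directly at the final id
}

int main() {
  ReplacedIds[1] = 2; // value 1 was replaced by 2,
  ReplacedIds[2] = 3; // which was later replaced by 3.
  TableId Id = 1;
  remapId(Id);
  assert(Id == 3);
  assert(ReplacedIds[1] == 3); // the chain was compressed
  return 0;
}
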
@@ -714,19 +640,22 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
assert(From.getNode() != To.getNode() && "Potential legalization loop!");
// If expansion produced new nodes, make sure they are properly marked.
- ExpungeNode(From.getNode());
- AnalyzeNewValue(To); // Expunges To.
+ AnalyzeNewValue(To);
// Anything that used the old node should now use the new one. Note that this
// can potentially cause recursive merging.
SmallSetVector<SDNode*, 16> NodesToAnalyze;
NodeUpdateListener NUL(*this, NodesToAnalyze);
do {
- DAG.ReplaceAllUsesOfValueWith(From, To);
- // The old node may still be present in a map like ExpandedIntegers or
- // PromotedIntegers. Inform maps about the replacement.
- ReplacedValues[From] = To;
+ // The old node may be present in a map like ExpandedIntegers or
+ // PromotedIntegers. Inform maps about the replacement.
+ auto FromId = getTableId(From);
+ auto ToId = getTableId(To);
+
+ if (FromId != ToId)
+ ReplacedValues[FromId] = ToId;
+ DAG.ReplaceAllUsesOfValueWith(From, To);
// Process the list of nodes that need to be reanalyzed.
while (!NodesToAnalyze.empty()) {
@@ -751,12 +680,15 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
SDValue NewVal(M, i);
if (M->getNodeId() == Processed)
RemapValue(NewVal);
- DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal);
// OldVal may be a target of the ReplacedValues map which was marked
// NewNode to force reanalysis because it was updated. Ensure that
// anything that ReplacedValues mapped to OldVal will now be mapped
// all the way to NewVal.
- ReplacedValues[OldVal] = NewVal;
+ auto OldValId = getTableId(OldVal);
+ auto NewValId = getTableId(NewVal);
+ DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal);
+ if (OldValId != NewValId)
+ ReplacedValues[OldValId] = NewValId;
}
// The original node continues to exist in the DAG, marked NewNode.
}
@@ -773,9 +705,11 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
"Invalid type for promoted integer");
AnalyzeNewValue(Result);
- SDValue &OpEntry = PromotedIntegers[Op];
- assert(!OpEntry.getNode() && "Node is already promoted!");
- OpEntry = Result;
+ auto &OpIdEntry = PromotedIntegers[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already promoted!");
+ OpIdEntry = getTableId(Result);
+
+ DAG.transferDbgValues(Op, Result);
}
void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
@@ -788,15 +722,15 @@ void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
"Invalid type for softened float");
AnalyzeNewValue(Result);
- SDValue &OpEntry = SoftenedFloats[Op];
+ auto &OpIdEntry = SoftenedFloats[getTableId(Op)];
// Allow repeated calls to save f128 type nodes
// or any node with type that transforms to itself.
// Many operations on these types are not softened.
- assert((!OpEntry.getNode()||
+ assert(((OpIdEntry == 0) ||
Op.getValueType() ==
- TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
"Node is already converted to integer!");
- OpEntry = Result;
+ OpIdEntry = getTableId(Result);
}
void DAGTypeLegalizer::SetPromotedFloat(SDValue Op, SDValue Result) {
@@ -805,9 +739,9 @@ void DAGTypeLegalizer::SetPromotedFloat(SDValue Op, SDValue Result) {
"Invalid type for promoted float");
AnalyzeNewValue(Result);
- SDValue &OpEntry = PromotedFloats[Op];
- assert(!OpEntry.getNode() && "Node is already promoted!");
- OpEntry = Result;
+ auto &OpIdEntry = PromotedFloats[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already promoted!");
+ OpIdEntry = getTableId(Result);
}
void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
@@ -818,19 +752,17 @@ void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
"Invalid type for scalarized vector");
AnalyzeNewValue(Result);
- SDValue &OpEntry = ScalarizedVectors[Op];
- assert(!OpEntry.getNode() && "Node is already scalarized!");
- OpEntry = Result;
+ auto &OpIdEntry = ScalarizedVectors[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already scalarized!");
+ OpIdEntry = getTableId(Result);
}
void DAGTypeLegalizer::GetExpandedInteger(SDValue Op, SDValue &Lo,
SDValue &Hi) {
- std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
- RemapValue(Entry.first);
- RemapValue(Entry.second);
- assert(Entry.first.getNode() && "Operand isn't expanded");
- Lo = Entry.first;
- Hi = Entry.second;
+ std::pair<TableId, TableId> &Entry = ExpandedIntegers[getTableId(Op)];
+ assert((Entry.first != 0) && "Operand isn't expanded");
+ Lo = getSDValue(Entry.first);
+ Hi = getSDValue(Entry.second);
}
void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
@@ -856,20 +788,18 @@ void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
}
// Remember that this is the result of the node.
- std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
- assert(!Entry.first.getNode() && "Node already expanded");
- Entry.first = Lo;
- Entry.second = Hi;
+ std::pair<TableId, TableId> &Entry = ExpandedIntegers[getTableId(Op)];
+ assert((Entry.first == 0) && "Node already expanded");
+ Entry.first = getTableId(Lo);
+ Entry.second = getTableId(Hi);
}
void DAGTypeLegalizer::GetExpandedFloat(SDValue Op, SDValue &Lo,
SDValue &Hi) {
- std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
- RemapValue(Entry.first);
- RemapValue(Entry.second);
- assert(Entry.first.getNode() && "Operand isn't expanded");
- Lo = Entry.first;
- Hi = Entry.second;
+ std::pair<TableId, TableId> &Entry = ExpandedFloats[getTableId(Op)];
+ assert((Entry.first != 0) && "Operand isn't expanded");
+ Lo = getSDValue(Entry.first);
+ Hi = getSDValue(Entry.second);
}
void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
@@ -882,21 +812,19 @@ void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
AnalyzeNewValue(Lo);
AnalyzeNewValue(Hi);
- // Remember that this is the result of the node.
- std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
- assert(!Entry.first.getNode() && "Node already expanded");
- Entry.first = Lo;
- Entry.second = Hi;
+ std::pair<TableId, TableId> &Entry = ExpandedFloats[getTableId(Op)];
+ assert((Entry.first == 0) && "Node already expanded");
+ Entry.first = getTableId(Lo);
+ Entry.second = getTableId(Hi);
}
void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
SDValue &Hi) {
- std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
- RemapValue(Entry.first);
- RemapValue(Entry.second);
- assert(Entry.first.getNode() && "Operand isn't split");
- Lo = Entry.first;
- Hi = Entry.second;
+ std::pair<TableId, TableId> &Entry = SplitVectors[getTableId(Op)];
+ Lo = getSDValue(Entry.first);
+ Hi = getSDValue(Entry.second);
+ assert(Lo.getNode() && "Operand isn't split");
}
void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
@@ -912,10 +840,10 @@ void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
AnalyzeNewValue(Hi);
// Remember that this is the result of the node.
- std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
- assert(!Entry.first.getNode() && "Node already split");
- Entry.first = Lo;
- Entry.second = Hi;
+ std::pair<TableId, TableId> &Entry = SplitVectors[getTableId(Op)];
+ assert((Entry.first == 0) && "Node already split");
+ Entry.first = getTableId(Lo);
+ Entry.second = getTableId(Hi);
}
void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
@@ -924,9 +852,9 @@ void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
"Invalid type for widened vector");
AnalyzeNewValue(Result);
- SDValue &OpEntry = WidenedVectors[Op];
- assert(!OpEntry.getNode() && "Node already widened!");
- OpEntry = Result;
+ auto &OpIdEntry = WidenedVectors[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node already widened!");
+ OpIdEntry = getTableId(Result);
}
@@ -1064,11 +992,11 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
LVT.getSizeInBits() + HVT.getSizeInBits());
+ EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout(), false);
Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo);
Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
- DAG.getConstant(LVT.getSizeInBits(), dlHi,
- TLI.getPointerTy(DAG.getDataLayout())));
+ DAG.getConstant(LVT.getSizeInBits(), dlHi, ShiftAmtVT));
return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
}
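
JoinIntegers above glues two expanded halves back into one value; the only change in this hunk is that the shift amount now uses the legal shift-amount type rather than the pointer type. The operation itself is the usual zext/shl/or combination, e.g. for two 32-bit halves:

#include <cassert>
#include <cstdint>

// result = zext(Lo) | (ext(Hi) << bitwidth(Lo))
uint64_t join_halves(uint32_t lo, uint32_t hi) {
  return static_cast<uint64_t>(lo) | (static_cast<uint64_t>(hi) << 32);
}

int main() {
  assert(join_halves(0xDDCCBBAAu, 0x11223344u) == 0x11223344DDCCBBAAull);
  return 0;
}
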
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 64cb80e0d853..2c6b1ee7900f 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -93,46 +93,81 @@ private:
N->getOpcode() == ISD::Register;
}
+  // Bijection from SDValue to a unique id. Because every newly created node
+  // gets a fresh id, we do not need to worry about expunging entries for
+  // reused nodes. Should we run out of ids, we can do a one-time expensive
+  // compactification.
+ typedef unsigned TableId;
+
+ TableId NextValueId = 1;
+
+ SmallDenseMap<SDValue, TableId, 8> ValueToIdMap;
+ SmallDenseMap<TableId, SDValue, 8> IdToValueMap;
+
/// For integer nodes that are below legal width, this map indicates what
/// promoted value to use.
- SmallDenseMap<SDValue, SDValue, 8> PromotedIntegers;
+ SmallDenseMap<TableId, TableId, 8> PromotedIntegers;
/// For integer nodes that need to be expanded this map indicates which
/// operands are the expanded version of the input.
- SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedIntegers;
+ SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> ExpandedIntegers;
/// For floating-point nodes converted to integers of the same size, this map
/// indicates the converted value to use.
- SmallDenseMap<SDValue, SDValue, 8> SoftenedFloats;
+ SmallDenseMap<TableId, TableId, 8> SoftenedFloats;
/// For floating-point nodes that have a smaller precision than the smallest
/// supported precision, this map indicates what promoted value to use.
- SmallDenseMap<SDValue, SDValue, 8> PromotedFloats;
+ SmallDenseMap<TableId, TableId, 8> PromotedFloats;
/// For float nodes that need to be expanded this map indicates which operands
/// are the expanded version of the input.
- SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedFloats;
+ SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> ExpandedFloats;
/// For nodes that are <1 x ty>, this map indicates the scalar value of type
/// 'ty' to use.
- SmallDenseMap<SDValue, SDValue, 8> ScalarizedVectors;
+ SmallDenseMap<TableId, TableId, 8> ScalarizedVectors;
/// For nodes that need to be split this map indicates which operands are the
/// expanded version of the input.
- SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> SplitVectors;
+ SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> SplitVectors;
/// For vector nodes that need to be widened, indicates the widened value to
/// use.
- SmallDenseMap<SDValue, SDValue, 8> WidenedVectors;
+ SmallDenseMap<TableId, TableId, 8> WidenedVectors;
/// For values that have been replaced with another, indicates the replacement
/// value to use.
- SmallDenseMap<SDValue, SDValue, 8> ReplacedValues;
+ SmallDenseMap<TableId, TableId, 8> ReplacedValues;
/// This defines a worklist of nodes to process. In order to be pushed onto
/// this worklist, all operands of a node must have already been processed.
SmallVector<SDNode*, 128> Worklist;
+ TableId getTableId(SDValue V) {
+ assert(V.getNode() && "Getting TableId on SDValue()");
+
+ auto I = ValueToIdMap.find(V);
+ if (I != ValueToIdMap.end()) {
+    // Remap in case this id has since been replaced.
+ RemapId(I->second);
+ assert(I->second && "All Ids should be nonzero");
+ return I->second;
+ }
+ // Add if it's not there.
+ ValueToIdMap.insert(std::make_pair(V, NextValueId));
+ IdToValueMap.insert(std::make_pair(NextValueId, V));
+ ++NextValueId;
+ assert(NextValueId != 0 &&
+ "Ran out of Ids. Increase id type size or add compactification");
+ return NextValueId - 1;
+ }
+
+ const SDValue &getSDValue(TableId &Id) {
+ RemapId(Id);
+ assert(Id && "TableId should be non-zero");
+ return IdToValueMap[Id];
+ }
+
public:
explicit DAGTypeLegalizer(SelectionDAG &dag)
: TLI(dag.getTargetLoweringInfo()), DAG(dag),
@@ -147,10 +182,25 @@ public:
bool run();
void NoteDeletion(SDNode *Old, SDNode *New) {
- ExpungeNode(Old);
- ExpungeNode(New);
- for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i)
- ReplacedValues[SDValue(Old, i)] = SDValue(New, i);
+ for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
+ TableId NewId = getTableId(SDValue(New, i));
+ TableId OldId = getTableId(SDValue(Old, i));
+
+ if (OldId != NewId)
+ ReplacedValues[OldId] = NewId;
+
+ // Delete Node from tables.
+ ValueToIdMap.erase(SDValue(Old, i));
+ IdToValueMap.erase(OldId);
+ PromotedIntegers.erase(OldId);
+ ExpandedIntegers.erase(OldId);
+ SoftenedFloats.erase(OldId);
+ PromotedFloats.erase(OldId);
+ ExpandedFloats.erase(OldId);
+ ScalarizedVectors.erase(OldId);
+ SplitVectors.erase(OldId);
+ WidenedVectors.erase(OldId);
+ }
}
SelectionDAG &getDAG() const { return DAG; }
@@ -158,9 +208,9 @@ public:
private:
SDNode *AnalyzeNewNode(SDNode *N);
void AnalyzeNewValue(SDValue &Val);
- void ExpungeNode(SDNode *N);
void PerformExpensiveChecks();
- void RemapValue(SDValue &N);
+ void RemapId(TableId &Id);
+ void RemapValue(SDValue &V);
// Common routines.
SDValue BitConvertToInteger(SDValue Op);
@@ -207,8 +257,8 @@ private:
/// returns an i32, the lower 16 bits of which coincide with Op, and the upper
/// 16 bits of which contain rubbish.
SDValue GetPromotedInteger(SDValue Op) {
- SDValue &PromotedOp = PromotedIntegers[Op];
- RemapValue(PromotedOp);
+ TableId &PromotedId = PromotedIntegers[getTableId(Op)];
+ SDValue PromotedOp = getSDValue(PromotedId);
assert(PromotedOp.getNode() && "Operand wasn't promoted?");
return PromotedOp;
}
@@ -282,7 +332,7 @@ private:
SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
// Integer Operand Promotion.
- bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
+ bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N);
SDValue PromoteIntOp_BITCAST(SDNode *N);
@@ -373,11 +423,10 @@ private:
bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
// Integer Operand Expansion.
- bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo);
+ bool ExpandIntegerOperand(SDNode *N, unsigned OpNo);
SDValue ExpandIntOp_BR_CC(SDNode *N);
SDValue ExpandIntOp_SELECT_CC(SDNode *N);
SDValue ExpandIntOp_SETCC(SDNode *N);
- SDValue ExpandIntOp_SETCCE(SDNode *N);
SDValue ExpandIntOp_SETCCCARRY(SDNode *N);
SDValue ExpandIntOp_Shift(SDNode *N);
SDValue ExpandIntOp_SINT_TO_FP(SDNode *N);
@@ -403,16 +452,15 @@ private:
/// stay in a register, the Op is not converted to an integer.
/// In that case, the given op is returned.
SDValue GetSoftenedFloat(SDValue Op) {
- auto Iter = SoftenedFloats.find(Op);
+ TableId Id = getTableId(Op);
+ auto Iter = SoftenedFloats.find(Id);
if (Iter == SoftenedFloats.end()) {
assert(isSimpleLegalType(Op.getValueType()) &&
"Operand wasn't converted to integer?");
return Op;
}
-
- SDValue &SoftenedOp = Iter->second;
+ SDValue SoftenedOp = getSDValue(Iter->second);
assert(SoftenedOp.getNode() && "Unconverted op in SoftenedFloats?");
- RemapValue(SoftenedOp);
return SoftenedOp;
}
void SetSoftenedFloat(SDValue Op, SDValue Result);
@@ -531,7 +579,7 @@ private:
void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
// Float Operand Expansion.
- bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
+ bool ExpandFloatOperand(SDNode *N, unsigned OpNo);
SDValue ExpandFloatOp_BR_CC(SDNode *N);
SDValue ExpandFloatOp_FCOPYSIGN(SDNode *N);
SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
@@ -549,8 +597,8 @@ private:
//===--------------------------------------------------------------------===//
SDValue GetPromotedFloat(SDValue Op) {
- SDValue &PromotedOp = PromotedFloats[Op];
- RemapValue(PromotedOp);
+ TableId &PromotedId = PromotedFloats[getTableId(Op)];
+ SDValue PromotedOp = getSDValue(PromotedId);
assert(PromotedOp.getNode() && "Operand wasn't promoted?");
return PromotedOp;
}
@@ -572,7 +620,7 @@ private:
SDValue PromoteFloatRes_UNDEF(SDNode *N);
SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
- bool PromoteFloatOperand(SDNode *N, unsigned ResNo);
+ bool PromoteFloatOperand(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo);
@@ -589,15 +637,15 @@ private:
/// element type, this returns the element. For example, if Op is a v1i32,
/// Op = < i32 val >, this method returns val, an i32.
SDValue GetScalarizedVector(SDValue Op) {
- SDValue &ScalarizedOp = ScalarizedVectors[Op];
- RemapValue(ScalarizedOp);
+ TableId &ScalarizedId = ScalarizedVectors[getTableId(Op)];
+ SDValue ScalarizedOp = getSDValue(ScalarizedId);
assert(ScalarizedOp.getNode() && "Operand wasn't scalarized?");
return ScalarizedOp;
}
void SetScalarizedVector(SDValue Op, SDValue Result);
// Vector Result Scalarization: <1 x ty> -> ty.
- void ScalarizeVectorResult(SDNode *N, unsigned OpNo);
+ void ScalarizeVectorResult(SDNode *N, unsigned ResNo);
SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue ScalarizeVecRes_BinOp(SDNode *N);
SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
@@ -646,13 +694,14 @@ private:
void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
// Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
- void SplitVectorResult(SDNode *N, unsigned OpNo);
+ void SplitVectorResult(SDNode *N, unsigned ResNo);
void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -662,9 +711,9 @@ private:
void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
- void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
- void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi);
- void SplitVecRes_MGATHER(MaskedGatherSDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi);
void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
@@ -684,7 +733,7 @@ private:
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
- SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo);
SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
SDValue SplitVecOp_VSETCC(SDNode *N);
SDValue SplitVecOp_FP_ROUND(SDNode *N);
@@ -701,8 +750,8 @@ private:
/// method returns a v4i32 for which the first two elements are the same as
/// those of Op, while the last two elements contain rubbish.
SDValue GetWidenedVector(SDValue Op) {
- SDValue &WidenedOp = WidenedVectors[Op];
- RemapValue(WidenedOp);
+ TableId &WidenedId = WidenedVectors[getTableId(Op)];
+ SDValue WidenedOp = getSDValue(WidenedId);
assert(WidenedOp.getNode() && "Operand wasn't widened?");
return WidenedOp;
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 993465ae9dc2..df3134828af5 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -300,6 +300,7 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align);
Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
+ Chain = Hi.getValue(1);
// Handle endianness of the load.
if (TLI.hasBigEndianPartOrdering(OVT, DAG.getDataLayout()))
@@ -307,7 +308,7 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Modified the chain - switch anything that used the old chain to use
// the new one.
- ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+ ReplaceValueWith(SDValue(N, 1), Chain);
}
@@ -384,7 +385,7 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
// Build a vector of twice the length out of the expanded elements.
// For example <3 x i64> -> <6 x i32>.
- std::vector<SDValue> NewElts;
+ SmallVector<SDValue, 16> NewElts;
NewElts.reserve(NumElts*2);
for (unsigned i = 0; i < NumElts; ++i) {
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6a141818bb6d..67928d4bdbd5 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -32,7 +32,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -41,6 +40,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
@@ -63,7 +63,7 @@ class VectorLegalizer {
/// legalizing the same thing more than once.
SmallDenseMap<SDValue, SDValue, 64> LegalizedNodes;
- /// \brief Adds a node to the translation cache.
+ /// Adds a node to the translation cache.
void AddLegalizedOperand(SDValue From, SDValue To) {
LegalizedNodes.insert(std::make_pair(From, To));
// If someone requests legalization of the new node, return itself.
@@ -71,55 +71,55 @@ class VectorLegalizer {
LegalizedNodes.insert(std::make_pair(To, To));
}
- /// \brief Legalizes the given node.
+ /// Legalizes the given node.
SDValue LegalizeOp(SDValue Op);
- /// \brief Assuming the node is legal, "legalize" the results.
+ /// Assuming the node is legal, "legalize" the results.
SDValue TranslateLegalizeResults(SDValue Op, SDValue Result);
- /// \brief Implements unrolling a VSETCC.
+ /// Implements unrolling a VSETCC.
SDValue UnrollVSETCC(SDValue Op);
- /// \brief Implement expand-based legalization of vector operations.
+ /// Implement expand-based legalization of vector operations.
///
/// This is just a high-level routine to dispatch to specific code paths for
/// operations to legalize them.
SDValue Expand(SDValue Op);
- /// \brief Implements expansion for FNEG; falls back to UnrollVectorOp if
+ /// Implements expansion for FNEG; falls back to UnrollVectorOp if
/// FSUB isn't legal.
///
/// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
/// SINT_TO_FLOAT and SHR on vectors isn't legal.
SDValue ExpandUINT_TO_FLOAT(SDValue Op);
- /// \brief Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
+ /// Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
SDValue ExpandSEXTINREG(SDValue Op);
- /// \brief Implement expansion for ANY_EXTEND_VECTOR_INREG.
+ /// Implement expansion for ANY_EXTEND_VECTOR_INREG.
///
/// Shuffles the low lanes of the operand into place and bitcasts to the proper
/// type. The contents of the bits in the extended part of each element are
/// undef.
SDValue ExpandANY_EXTEND_VECTOR_INREG(SDValue Op);
- /// \brief Implement expansion for SIGN_EXTEND_VECTOR_INREG.
+ /// Implement expansion for SIGN_EXTEND_VECTOR_INREG.
///
/// Shuffles the low lanes of the operand into place, bitcasts to the proper
/// type, then shifts left and arithmetic shifts right to introduce a sign
/// extension.
SDValue ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op);
- /// \brief Implement expansion for ZERO_EXTEND_VECTOR_INREG.
+ /// Implement expansion for ZERO_EXTEND_VECTOR_INREG.
///
/// Shuffles the low lanes of the operand into place and blends zeros into
/// the remaining lanes, finally bitcasting to the proper type.
SDValue ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op);
- /// \brief Expand bswap of vectors into a shuffle if legal.
+ /// Expand bswap of vectors into a shuffle if legal.
SDValue ExpandBSWAP(SDValue Op);
- /// \brief Implement vselect in terms of XOR, AND, OR when blend is not
+ /// Implement vselect in terms of XOR, AND, OR when blend is not
/// supported by the target.
SDValue ExpandVSELECT(SDValue Op);
SDValue ExpandSELECT(SDValue Op);
@@ -130,19 +130,20 @@ class VectorLegalizer {
SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);
-
- /// \brief Implements vector promotion.
+ SDValue ExpandStrictFPOp(SDValue Op);
+
+ /// Implements vector promotion.
///
/// This is essentially just bitcasting the operands to a different type and
/// bitcasting the result back to the original type.
SDValue Promote(SDValue Op);
- /// \brief Implements [SU]INT_TO_FP vector promotion.
+ /// Implements [SU]INT_TO_FP vector promotion.
///
/// This is a [zs]ext of the input operand to a larger integer type.
SDValue PromoteINT_TO_FP(SDValue Op);
- /// \brief Implements FP_TO_[SU]INT vector promotion of the result type.
+ /// Implements FP_TO_[SU]INT vector promotion of the result type.
///
/// It is promoted to a larger integer type. The result is then
/// truncated back to the original type.
@@ -152,7 +153,7 @@ public:
VectorLegalizer(SelectionDAG& dag) :
DAG(dag), TLI(dag.getTargetLoweringInfo()) {}
- /// \brief Begin legalizer the vector operations in the DAG.
+  /// Begin legalizing the vector operations in the DAG.
bool Run();
};
@@ -222,14 +223,16 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
for (const SDValue &Op : Node->op_values())
Ops.push_back(LegalizeOp(Op));
- SDValue Result = SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops), 0);
+ SDValue Result = SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops),
+ Op.getResNo());
bool HasVectorValue = false;
if (Op.getOpcode() == ISD::LOAD) {
LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
ISD::LoadExtType ExtType = LD->getExtensionType();
if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) {
- DEBUG(dbgs() << "\nLegalizing extending vector load: "; Node->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nLegalizing extending vector load: ";
+ Node->dump(&DAG));
switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0),
LD->getMemoryVT())) {
default: llvm_unreachable("This action is not supported yet!");
@@ -261,8 +264,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
EVT StVT = ST->getMemoryVT();
MVT ValVT = ST->getValue().getSimpleValueType();
if (StVT.isVector() && ST->isTruncatingStore()) {
- DEBUG(dbgs() << "\nLegalizing truncating vector store: ";
- Node->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nLegalizing truncating vector store: ";
+ Node->dump(&DAG));
switch (TLI.getTruncStoreAction(ValVT, StVT)) {
default: llvm_unreachable("This action is not supported yet!");
case TargetLowering::Legal:
@@ -287,10 +290,34 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
if (!HasVectorValue)
return TranslateLegalizeResults(Op, Result);
- EVT QueryType;
+ TargetLowering::LegalizeAction Action = TargetLowering::Legal;
switch (Op.getOpcode()) {
default:
return TranslateLegalizeResults(Op, Result);
+ case ISD::STRICT_FADD:
+ case ISD::STRICT_FSUB:
+ case ISD::STRICT_FMUL:
+ case ISD::STRICT_FDIV:
+ case ISD::STRICT_FSQRT:
+ case ISD::STRICT_FMA:
+ case ISD::STRICT_FPOW:
+ case ISD::STRICT_FPOWI:
+ case ISD::STRICT_FSIN:
+ case ISD::STRICT_FCOS:
+ case ISD::STRICT_FEXP:
+ case ISD::STRICT_FEXP2:
+ case ISD::STRICT_FLOG:
+ case ISD::STRICT_FLOG10:
+ case ISD::STRICT_FLOG2:
+ case ISD::STRICT_FRINT:
+ case ISD::STRICT_FNEARBYINT:
+ // These pseudo-ops get legalized as if they were their non-strict
+ // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
+ // is also legal, but if ISD::FSQRT requires expansion then so does
+ // ISD::STRICT_FSQRT.
+ Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
@@ -366,42 +393,47 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::UMAX:
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
- QueryType = Node->getValueType(0);
+ case ISD::FCANONICALIZE:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::FP_ROUND_INREG:
- QueryType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ cast<VTSDNode>(Node->getOperand(1))->getVT());
break;
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
- QueryType = Node->getOperand(0).getValueType();
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(0).getValueType());
break;
case ISD::MSCATTER:
- QueryType = cast<MaskedScatterSDNode>(Node)->getValue().getValueType();
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ cast<MaskedScatterSDNode>(Node)->getValue().getValueType());
break;
case ISD::MSTORE:
- QueryType = cast<MaskedStoreSDNode>(Node)->getValue().getValueType();
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ cast<MaskedStoreSDNode>(Node)->getValue().getValueType());
break;
}
- DEBUG(dbgs() << "\nLegalizing vector op: "; Node->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "\nLegalizing vector op: "; Node->dump(&DAG));
- switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
+ switch (Action) {
default: llvm_unreachable("This action is not supported yet!");
case TargetLowering::Promote:
Result = Promote(Op);
Changed = true;
break;
case TargetLowering::Legal:
- DEBUG(dbgs() << "Legal node: nothing to do\n");
+ LLVM_DEBUG(dbgs() << "Legal node: nothing to do\n");
break;
case TargetLowering::Custom: {
- DEBUG(dbgs() << "Trying custom legalization\n");
+ LLVM_DEBUG(dbgs() << "Trying custom legalization\n");
if (SDValue Tmp1 = TLI.LowerOperation(Op, DAG)) {
- DEBUG(dbgs() << "Successfully custom legalized node\n");
+ LLVM_DEBUG(dbgs() << "Successfully custom legalized node\n");
Result = Tmp1;
break;
}
- DEBUG(dbgs() << "Could not custom legalize node\n");
+ LLVM_DEBUG(dbgs() << "Could not custom legalize node\n");
LLVM_FALLTHROUGH;
}
case TargetLowering::Expand:
@@ -649,9 +681,14 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
Value = DAG.getBuildVector(Op.getNode()->getValueType(0), dl, Vals);
} else {
SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG);
-
- NewChain = Scalarized.getValue(1);
- Value = Scalarized.getValue(0);
+    // Skip past a MERGE_VALUES node, if present.
+ if (Scalarized->getOpcode() == ISD::MERGE_VALUES) {
+ NewChain = Scalarized.getOperand(1);
+ Value = Scalarized.getOperand(0);
+ } else {
+ NewChain = Scalarized.getValue(1);
+ Value = Scalarized.getValue(0);
+ }
}
AddLegalizedOperand(Op.getValue(0), Value);
@@ -662,35 +699,6 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
SDValue VectorLegalizer::ExpandStore(SDValue Op) {
StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
-
- EVT StVT = ST->getMemoryVT();
- EVT MemSclVT = StVT.getScalarType();
- unsigned ScalarSize = MemSclVT.getSizeInBits();
-
- // Round odd types to the next pow of two.
- if (!isPowerOf2_32(ScalarSize)) {
- // FIXME: This is completely broken and inconsistent with ExpandLoad
- // handling.
-
- // For sub-byte element sizes, this ends up with 0 stride between elements,
- // so the same element just gets re-written to the same location. There seem
-    // to be tests explicitly testing for this broken behavior though.
-
- LLVMContext &Ctx = *DAG.getContext();
-
- EVT NewMemVT
- = EVT::getVectorVT(Ctx,
- MemSclVT.getIntegerVT(Ctx, NextPowerOf2(ScalarSize)),
- StVT.getVectorNumElements());
-
- SDValue NewVectorStore = DAG.getTruncStore(
- ST->getChain(), SDLoc(Op), ST->getValue(), ST->getBasePtr(),
- ST->getPointerInfo(), NewMemVT, ST->getAlignment(),
- ST->getMemOperand()->getFlags(), ST->getAAInfo());
- ST = cast<StoreSDNode>(NewVectorStore.getNode());
- }
-
SDValue TF = TLI.scalarizeVectorStore(ST, DAG);
AddLegalizedOperand(Op, TF);
return TF;
@@ -727,6 +735,24 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
return ExpandCTLZ(Op);
case ISD::CTTZ_ZERO_UNDEF:
return ExpandCTTZ_ZERO_UNDEF(Op);
+ case ISD::STRICT_FADD:
+ case ISD::STRICT_FSUB:
+ case ISD::STRICT_FMUL:
+ case ISD::STRICT_FDIV:
+ case ISD::STRICT_FSQRT:
+ case ISD::STRICT_FMA:
+ case ISD::STRICT_FPOW:
+ case ISD::STRICT_FPOWI:
+ case ISD::STRICT_FSIN:
+ case ISD::STRICT_FCOS:
+ case ISD::STRICT_FEXP:
+ case ISD::STRICT_FEXP2:
+ case ISD::STRICT_FLOG:
+ case ISD::STRICT_FLOG10:
+ case ISD::STRICT_FLOG2:
+ case ISD::STRICT_FRINT:
+ case ISD::STRICT_FNEARBYINT:
+ return ExpandStrictFPOp(Op);
default:
return DAG.UnrollVectorOp(Op.getNode());
}
@@ -1020,7 +1046,7 @@ SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
SDValue HalfWordMask = DAG.getConstant(HWMask, DL, VT);
// Two to the power of half-word-size.
- SDValue TWOHW = DAG.getConstantFP(1 << (BW / 2), DL, Op.getValueType());
+ SDValue TWOHW = DAG.getConstantFP(1ULL << (BW / 2), DL, Op.getValueType());
// Clear upper part of LO, lower HI
SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Op.getOperand(0), HalfWord);
@@ -1113,6 +1139,53 @@ SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) {
return DAG.UnrollVectorOp(Op.getNode());
}
+SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumOpers = Op.getNumOperands();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT ValueVTs[] = {EltVT, MVT::Other};
+ SDValue Chain = Op.getOperand(0);
+ SDLoc dl(Op);
+
+ SmallVector<SDValue, 32> OpValues;
+ SmallVector<SDValue, 32> OpChains;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SmallVector<SDValue, 4> Opers;
+ SDValue Idx = DAG.getConstant(i, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+
+ // The Chain is the first operand.
+ Opers.push_back(Chain);
+
+ // Now process the remaining operands.
+ for (unsigned j = 1; j < NumOpers; ++j) {
+ SDValue Oper = Op.getOperand(j);
+ EVT OperVT = Oper.getValueType();
+
+ if (OperVT.isVector())
+ Oper = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ EltVT, Oper, Idx);
+
+ Opers.push_back(Oper);
+ }
+
+ SDValue ScalarOp = DAG.getNode(Op->getOpcode(), dl, ValueVTs, Opers);
+
+ OpValues.push_back(ScalarOp.getValue(0));
+ OpChains.push_back(ScalarOp.getValue(1));
+ }
+
+ SDValue Result = DAG.getBuildVector(VT, dl, OpValues);
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OpChains);
+
+ AddLegalizedOperand(Op.getValue(0), Result);
+ AddLegalizedOperand(Op.getValue(1), NewChain);
+
+ return NewChain;
+}
+
SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
EVT VT = Op.getValueType();
unsigned NumElems = VT.getVectorNumElements();
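The ExpandStrictFPOp hunk above unrolls a strict FP vector operation one lane at a time: every scalar op produces both a value and a chain, the lane values are rebuilt into a vector, and the lane chains are merged with a single TokenFactor. Below is a minimal standalone sketch of that shape, not the LLVM API; Token, strict_fadd_scalar and expand_strict_fadd are illustrative names.

#include <cstddef>
#include <cstdio>
#include <vector>

struct Token { int id; };                         // stands in for a chain value
struct ScalarResult { float value; Token chain; };

// A scalar "strict" add: returns the result plus a token ordering the op.
static ScalarResult strict_fadd_scalar(float a, float b, Token in, int &next) {
  (void)in;                                       // the new token depends on 'in'
  return {a + b, Token{next++}};
}

// Merge the per-lane tokens, analogous to building an ISD::TokenFactor node.
static Token token_factor(const std::vector<Token> &chains, int &next) {
  (void)chains;
  return Token{next++};
}

// Expand a strict vector add lane by lane: extract, run the scalar op that
// also yields a chain, rebuild the result vector, and merge all lane chains.
static std::vector<float> expand_strict_fadd(const std::vector<float> &x,
                                             const std::vector<float> &y,
                                             Token chain, Token &outChain) {
  int next = chain.id + 1;
  std::vector<float> lanes;
  std::vector<Token> laneChains;
  for (std::size_t i = 0; i < x.size(); ++i) {
    ScalarResult r = strict_fadd_scalar(x[i], y[i], chain, next);
    lanes.push_back(r.value);                     // like the final BUILD_VECTOR
    laneChains.push_back(r.chain);
  }
  outChain = token_factor(laneChains, next);      // single merged chain result
  return lanes;
}

int main() {
  Token chain{0}, out{0};
  std::vector<float> r = expand_strict_fadd({1, 2}, {3, 4}, chain, out);
  std::printf("%g %g (chain %d)\n", r[0], r[1], out.id);
}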
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index df1cbeb92740..1cd43ace48f3 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -33,9 +33,8 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Scalarize node result " << ResNo << ": ";
- N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Scalarize node result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue R = SDValue();
switch (N->getOpcode()) {
@@ -169,9 +168,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().isVector()
+ && Op.getValueType().getVectorNumElements() == 1
+ && !isSimpleLegalType(Op.getValueType()))
+ Op = GetScalarizedVector(Op);
EVT NewVT = N->getValueType(0).getVectorElementType();
return DAG.getNode(ISD::BITCAST, SDLoc(N),
- NewVT, N->getOperand(0));
+ NewVT, Op);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
@@ -338,8 +342,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
ScalarBool = TargetLowering::UndefinedBooleanContent;
}
+ EVT CondVT = Cond.getValueType();
if (ScalarBool != VecBool) {
- EVT CondVT = Cond.getValueType();
switch (ScalarBool) {
case TargetLowering::UndefinedBooleanContent:
break;
@@ -360,6 +364,11 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
}
}
+ // Truncate the condition if needed
+ auto BoolVT = getSetCCResultType(CondVT);
+ if (BoolVT.bitsLT(CondVT))
+ Cond = DAG.getNode(ISD::TRUNCATE, SDLoc(N), BoolVT, Cond);
+
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), Cond, LHS,
GetScalarizedVector(N->getOperand(2)));
@@ -433,9 +442,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
- DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": ";
- N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue Res = SDValue();
if (!Res.getNode()) {
@@ -515,7 +523,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
N->getValueType(0).getScalarType(), Elt);
// Revectorize the result so the types line up with what the uses of this
// expression expect.
- return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Op);
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op);
}
/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
@@ -618,9 +626,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
/// invalid operands or may have other results that need legalization, we just
/// know that (at least) one result needs vector splitting.
void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Split node result: ";
- N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Split node result: "; N->dump(&DAG); dbgs() << "\n");
SDValue Lo, Hi;
// See if the target wants to custom expand this node.
@@ -749,6 +755,25 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FMA:
SplitVecRes_TernaryOp(N, Lo, Hi);
break;
+ case ISD::STRICT_FADD:
+ case ISD::STRICT_FSUB:
+ case ISD::STRICT_FMUL:
+ case ISD::STRICT_FDIV:
+ case ISD::STRICT_FSQRT:
+ case ISD::STRICT_FMA:
+ case ISD::STRICT_FPOW:
+ case ISD::STRICT_FPOWI:
+ case ISD::STRICT_FSIN:
+ case ISD::STRICT_FCOS:
+ case ISD::STRICT_FEXP:
+ case ISD::STRICT_FEXP2:
+ case ISD::STRICT_FLOG:
+ case ISD::STRICT_FLOG10:
+ case ISD::STRICT_FLOG2:
+ case ISD::STRICT_FRINT:
+ case ISD::STRICT_FNEARBYINT:
+ SplitVecRes_StrictFPOp(N, Lo, Hi);
+ break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -1028,6 +1053,56 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
}
+void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ unsigned NumOps = N->getNumOperands();
+ SDValue Chain = N->getOperand(0);
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ SmallVector<SDValue, 4> OpsLo;
+ SmallVector<SDValue, 4> OpsHi;
+
+ // The Chain is the first operand.
+ OpsLo.push_back(Chain);
+ OpsHi.push_back(Chain);
+
+ // Now process the remaining operands.
+ for (unsigned i = 1; i < NumOps; ++i) {
+ SDValue Op = N->getOperand(i);
+ SDValue OpLo = Op;
+ SDValue OpHi = Op;
+
+ EVT InVT = Op.getValueType();
+ if (InVT.isVector()) {
+ // If the input also splits, handle it directly for a
+ // compile time speedup. Otherwise split it by hand.
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Op, OpLo, OpHi);
+ else
+ std::tie(OpLo, OpHi) = DAG.SplitVectorOperand(N, i);
+ }
+
+ OpsLo.push_back(OpLo);
+ OpsHi.push_back(OpHi);
+ }
+
+ EVT LoValueVTs[] = {LoVT, MVT::Other};
+ EVT HiValueVTs[] = {HiVT, MVT::Other};
+ Lo = DAG.getNode(N->getOpcode(), dl, LoValueVTs, OpsLo);
+ Hi = DAG.getNode(N->getOpcode(), dl, HiValueVTs, OpsHi);
+
+ // Build a factor node to remember that this Op is independent of the
+ // other one.
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Lo.getValue(1), Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
@@ -1200,16 +1275,16 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
MLD->isExpandingLoad());
+ unsigned HiOffset = LoMemVT.getStoreSize();
- MMO = DAG.getMachineFunction().
- getMachineMemOperand(MLD->getPointerInfo(),
- MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
- SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MLD->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOLoad,
+ HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(),
+ MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
ExtType, MLD->isExpandingLoad());
-
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
@@ -1232,6 +1307,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
SDValue Mask = MGT->getMask();
SDValue Src0 = MGT->getValue();
SDValue Index = MGT->getIndex();
+ SDValue Scale = MGT->getScale();
unsigned Alignment = MGT->getOriginalAlignment();
// Split Mask operand
@@ -1263,11 +1339,11 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
- SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo};
+ SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
MMO);
- SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi};
+ SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi, Scale};
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
MMO);
@@ -1365,8 +1441,8 @@ void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
- DEBUG(dbgs() << "Split vector extend via incremental extend:";
- N->dump(&DAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Split vector extend via incremental extend:";
+ N->dump(&DAG); dbgs() << "\n");
// Extend the source vector by one step.
SDValue NewSrc =
DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
@@ -1501,9 +1577,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
/// the node are known to be legal, but other operands of the node may need
/// legalization as well as the specified one.
bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
- DEBUG(dbgs() << "Split node operand: ";
- N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Split node operand: "; N->dump(&DAG); dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom split this node.
@@ -1683,8 +1757,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
// Use the appropriate scalar instruction on the split subvectors before
// reducing the now partially reduced smaller vector.
- SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
- return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi, N->getFlags());
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, N->getFlags());
}
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
@@ -1810,6 +1884,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Index = MGT->getIndex();
+ SDValue Scale = MGT->getScale();
SDValue Mask = MGT->getMask();
SDValue Src0 = MGT->getValue();
unsigned Alignment = MGT->getOriginalAlignment();
@@ -1842,7 +1917,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
- SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo};
+ SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo, Scale};
SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
OpsLo, MMO);
@@ -1852,7 +1927,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
Alignment, MGT->getAAInfo(),
MGT->getRanges());
- SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi};
+ SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi, Scale};
SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
OpsHi, MMO);
@@ -1916,10 +1991,12 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
N->isCompressingStore());
- MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
- MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
- SecondHalfAlignment, N->getAAInfo(), N->getRanges());
+ unsigned HiOffset = LoMemVT.getStoreSize();
+
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOStore,
+ HiMemVT.getStoreSize(), SecondHalfAlignment, N->getAAInfo(),
+ N->getRanges());
Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
N->isTruncatingStore(), N->isCompressingStore());
@@ -1935,6 +2012,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
SDValue Ptr = N->getBasePtr();
SDValue Mask = N->getMask();
SDValue Index = N->getIndex();
+ SDValue Scale = N->getScale();
SDValue Data = N->getValue();
EVT MemoryVT = N->getMemoryVT();
unsigned Alignment = N->getOriginalAlignment();
@@ -1970,7 +2048,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
- SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo};
+ SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
DL, OpsLo, MMO);
@@ -1982,7 +2060,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
// The order of the Scatter operation after split is well defined. The "Hi"
// part comes after the "Lo". So these two operations should be chained one
// after another.
- SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi};
+ SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
DL, OpsHi, MMO);
}
@@ -2005,6 +2083,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+ // Scalarize if the split halves are not byte-sized.
+ if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized())
+ return TLI.scalarizeVectorStore(N, DAG);
+
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
if (isTruncating)
@@ -2089,9 +2171,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
return SplitVecOp_UnaryOp(N);
SDLoc DL(N);
- // Extract the halves of the input via extract_subvector.
+ // Get the split input vector.
SDValue InLoVec, InHiVec;
- std::tie(InLoVec, InHiVec) = DAG.SplitVector(InVec, DL);
+ GetSplitVector(InVec, InLoVec, InHiVec);
// Truncate them to 1/2 the element size.
EVT HalfElementVT = IsFloat ?
EVT::getFloatingPointVT(InElementSize/2) :
@@ -2164,9 +2246,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) {
//===----------------------------------------------------------------------===//
void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Widen node result " << ResNo << ": ";
- N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Widen node result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
// See if the target wants to custom widen this node.
if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
@@ -2948,6 +3029,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDValue Src0 = GetWidenedVector(N->getValue());
+ SDValue Scale = N->getScale();
unsigned NumElts = WideVT.getVectorNumElements();
SDLoc dl(N);
@@ -2963,7 +3045,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
Index.getValueType().getScalarType(),
NumElts);
Index = ModifyToType(Index, WideIndexVT);
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index, Scale };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
N->getMemOperand());
@@ -3309,9 +3391,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
// Widen Vector Operand
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
- DEBUG(dbgs() << "Widen node operand " << OpNo << ": ";
- N->dump(&DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Widen node operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom widen this node.
@@ -3420,7 +3501,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
// low lanes.
switch (N->getOpcode()) {
default:
- llvm_unreachable("Extend legalization on on extend operation!");
+ llvm_unreachable("Extend legalization on extend operation!");
case ISD::ANY_EXTEND:
return DAG.getAnyExtendVectorInReg(InOp, DL, VT);
case ISD::SIGN_EXTEND:
@@ -3544,6 +3625,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
// vector type.
StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (!ST->getMemoryVT().getScalarType().isByteSized())
+ return TLI.scalarizeVectorStore(ST, DAG);
+
SmallVector<SDValue, 16> StChain;
if (ST->isTruncatingStore())
GenWidenVectorTruncStores(StChain, ST);
@@ -3587,6 +3671,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
SDValue DataOp = MSC->getValue();
SDValue Mask = MSC->getMask();
EVT MaskVT = Mask.getValueType();
+ SDValue Scale = MSC->getScale();
// Widen the value.
SDValue WideVal = GetWidenedVector(DataOp);
@@ -3606,7 +3691,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
NumElts);
Index = ModifyToType(Index, WideIndexVT);
- SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index};
+ SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index,
+ Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
MSC->getMemoryVT(), dl, Ops,
MSC->getMemOperand());
@@ -3616,6 +3702,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue InOp0 = GetWidenedVector(N->getOperand(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDLoc dl(N);
+ EVT VT = N->getValueType(0);
// WARNING: In this code we widen the compare instruction with garbage.
// This garbage may contain denormal floats which may be slow. Is this a real
@@ -3625,18 +3712,23 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
// Only some of the compared elements are legal.
EVT SVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
InOp0.getValueType());
+ // The result type is legal; if it's vXi1, keep vXi1 for the new SETCC.
+ if (VT.getScalarType() == MVT::i1)
+ SVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ SVT.getVectorNumElements());
+
SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
- SVT, InOp0, InOp1, N->getOperand(2));
+ SVT, InOp0, InOp1, N->getOperand(2));
// Extract the needed results from the result vector.
EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
SVT.getVectorElementType(),
- N->getValueType(0).getVectorNumElements());
+ VT.getVectorNumElements());
SDValue CC = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- return PromoteTargetBoolean(CC, N->getValueType(0));
+ return PromoteTargetBoolean(CC, VT);
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
index cf92907a8b5f..7e6b57426338 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -32,7 +32,8 @@ public:
enum DbgValueKind {
SDNODE = 0, ///< Value is the result of an expression.
CONST = 1, ///< Value is a constant.
- FRAMEIX = 2 ///< Value is contents of a stack location.
+ FRAMEIX = 2, ///< Value is contents of a stack location.
+ VREG = 3 ///< Value is a virtual register.
};
private:
union {
@@ -42,6 +43,7 @@ private:
} s;
const Value *Const; ///< Valid for constants.
unsigned FrameIx; ///< Valid for stack objects.
+ unsigned VReg; ///< Valid for registers.
} u;
DIVariable *Var;
DIExpression *Expr;
@@ -69,12 +71,18 @@ public:
u.Const = C;
}
- /// Constructor for frame indices.
- SDDbgValue(DIVariable *Var, DIExpression *Expr, unsigned FI, DebugLoc dl,
- unsigned O)
- : Var(Var), Expr(Expr), DL(std::move(dl)), Order(O), IsIndirect(false) {
- kind = FRAMEIX;
- u.FrameIx = FI;
+ /// Constructor for virtual registers and frame indices.
+ SDDbgValue(DIVariable *Var, DIExpression *Expr, unsigned VRegOrFrameIdx,
+ bool IsIndirect, DebugLoc DL, unsigned Order,
+ enum DbgValueKind Kind)
+ : Var(Var), Expr(Expr), DL(DL), Order(Order), IsIndirect(IsIndirect) {
+ assert((Kind == VREG || Kind == FRAMEIX) &&
+ "Invalid SDDbgValue constructor");
+ kind = Kind;
+ if (kind == VREG)
+ u.VReg = VRegOrFrameIdx;
+ else
+ u.FrameIx = VRegOrFrameIdx;
}
/// Returns the kind.
@@ -98,6 +106,9 @@ public:
/// Returns the FrameIx for a stack object
unsigned getFrameIx() const { assert (kind==FRAMEIX); return u.FrameIx; }
+ /// Returns the Virtual Register for a VReg
+ unsigned getVReg() const { assert (kind==VREG); return u.VReg; }
+
/// Returns whether this is an indirect value.
bool isIndirect() const { return IsIndirect; }
@@ -115,6 +126,28 @@ public:
bool isInvalidated() const { return Invalid; }
};
+/// Holds the information from a dbg_label node through SDISel.
+/// We do not use SDValue here to avoid including its header.
+class SDDbgLabel {
+ MDNode *Label;
+ DebugLoc DL;
+ unsigned Order;
+
+public:
+ SDDbgLabel(MDNode *Label, DebugLoc dl, unsigned O)
+ : Label(Label), DL(std::move(dl)), Order(O) {}
+
+ /// Returns the MDNode pointer for the label.
+ MDNode *getLabel() const { return Label; }
+
+ /// Returns the DebugLoc.
+ DebugLoc getDebugLoc() const { return DL; }
+
+ /// Returns the SDNodeOrder. This is the order of the preceding node in the
+ /// input.
+ unsigned getOrder() const { return Order; }
+};
+
} // end llvm namespace
#endif
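The SDDbgValue change above folds frame indices and virtual registers into one constructor, with the kind tag deciding which union member is valid. A small standalone sketch of that tagged-union pattern follows; it is simplified, and the real class also carries the variable, expression and DebugLoc.

#include <cassert>
#include <cstdio>

class DbgValueSketch {
public:
  enum Kind { FRAMEIX, VREG };

  DbgValueSketch(unsigned VRegOrFrameIdx, bool Indirect, Kind K)
      : kind(K), IsIndirect(Indirect) {
    assert((K == VREG || K == FRAMEIX) && "only reg/frame-index forms here");
    if (K == VREG)
      u.VReg = VRegOrFrameIdx;                // valid only when kind == VREG
    else
      u.FrameIx = VRegOrFrameIdx;             // valid only when kind == FRAMEIX
  }

  unsigned getFrameIx() const { assert(kind == FRAMEIX); return u.FrameIx; }
  unsigned getVReg() const { assert(kind == VREG); return u.VReg; }
  bool isIndirect() const { return IsIndirect; }

private:
  union {
    unsigned FrameIx;
    unsigned VReg;
  } u;
  Kind kind;
  bool IsIndirect;
};

int main() {
  DbgValueSketch V(42, /*Indirect=*/false, DbgValueSketch::VREG);
  std::printf("vreg %u indirect %d\n", V.getVReg(), V.isIndirect());
}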
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 698e14453d1d..3944d7df286d 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -13,6 +13,7 @@
#include "InstrEmitter.h"
#include "ScheduleDAGSDNodes.h"
+#include "SDNodeDbgValue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
@@ -115,7 +116,7 @@ private:
/// Schedule - Schedule the DAG using list scheduling.
void ScheduleDAGFast::Schedule() {
- DEBUG(dbgs() << "********** List Scheduling **********\n");
+ LLVM_DEBUG(dbgs() << "********** List Scheduling **********\n");
NumLiveRegs = 0;
LiveRegDefs.resize(TRI->getNumRegs(), nullptr);
@@ -124,8 +125,8 @@ void ScheduleDAGFast::Schedule() {
// Build the scheduling graph.
BuildSchedGraph(nullptr);
- DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- SUnits[su].dumpAll(this));
+ LLVM_DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) SUnits[su]
+ .dumpAll(this));
// Execute the actual scheduling loop.
ListScheduleBottomUp();
@@ -180,8 +181,8 @@ void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
/// count of its predecessors. If a predecessor pending count is zero, add it to
/// the Available queue.
void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
- DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
- DEBUG(SU->dump(this));
+ LLVM_DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(SU->dump(this));
assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
SU->setHeightToAtLeast(CurCycle);
@@ -236,7 +237,7 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
return nullptr;
- DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
+ LLVM_DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
assert(NewNodes.size() == 2 && "Expected a load folding node!");
N = NewNodes[1];
@@ -346,7 +347,7 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
SU = NewSU;
}
- DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
+ LLVM_DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
NewSU = Clone(SU);
// New SUnit has the exact same predecessors.
@@ -592,14 +593,14 @@ void ScheduleDAGFast::ListScheduleBottomUp() {
// Issue copies, these can be expensive cross register class copies.
SmallVector<SUnit*, 2> Copies;
InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
- DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
- << " to SU #" << Copies.front()->NodeNum << "\n");
+ LLVM_DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
+ << " to SU #" << Copies.front()->NodeNum << "\n");
AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
NewDef = Copies.back();
}
- DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
- << " to SU #" << TrySU->NodeNum << "\n");
+ LLVM_DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
+ << " to SU #" << TrySU->NodeNum << "\n");
LiveRegDefs[Reg] = NewDef;
AddPred(NewDef, SDep(TrySU, SDep::Artificial));
TrySU->isAvailable = false;
@@ -666,8 +667,8 @@ void ScheduleDAGLinearize::ScheduleNode(SDNode *N) {
// These nodes do not need to be translated into MIs.
return;
- DEBUG(dbgs() << "\n*** Scheduling: ");
- DEBUG(N->dump(DAG));
+ LLVM_DEBUG(dbgs() << "\n*** Scheduling: ");
+ LLVM_DEBUG(N->dump(DAG));
Sequence.push_back(N);
unsigned NumOps = N->getNumOperands();
@@ -713,7 +714,7 @@ static SDNode *findGluedUser(SDNode *N) {
}
void ScheduleDAGLinearize::Schedule() {
- DEBUG(dbgs() << "********** DAG Linearization **********\n");
+ LLVM_DEBUG(dbgs() << "********** DAG Linearization **********\n");
SmallVector<SDNode*, 8> Glues;
unsigned DAGSize = 0;
@@ -763,19 +764,29 @@ ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
InstrEmitter Emitter(BB, InsertPos);
DenseMap<SDValue, unsigned> VRBaseMap;
- DEBUG({
- dbgs() << "\n*** Final schedule ***\n";
- });
+ LLVM_DEBUG({ dbgs() << "\n*** Final schedule ***\n"; });
- // FIXME: Handle dbg_values.
unsigned NumNodes = Sequence.size();
+ MachineBasicBlock *BB = Emitter.getBlock();
for (unsigned i = 0; i != NumNodes; ++i) {
SDNode *N = Sequence[NumNodes-i-1];
- DEBUG(N->dump(DAG));
+ LLVM_DEBUG(N->dump(DAG));
Emitter.EmitNode(N, false, false, VRBaseMap);
+
+ // Emit any debug values associated with the node.
+ if (N->getHasDebugValue()) {
+ MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
+ for (auto DV : DAG->GetDbgValues(N)) {
+ if (DV->isInvalidated())
+ continue;
+ if (auto *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap))
+ BB->insert(InsertPos, DbgMI);
+ DV->setIsInvalidated();
+ }
+ }
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
InsertPos = Emitter.getInsertPos();
return Emitter.getBlock();
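The EmitSchedule hunk above emits any debug values attached to a node immediately after the node itself, then marks them invalidated so each is emitted at most once. A standalone sketch of that loop follows, with illustrative types only; the real code walks Sequence in reverse and goes through InstrEmitter.

#include <cstdio>
#include <string>
#include <vector>

struct DbgValue { std::string text; bool invalidated; };
struct Node { std::string name; std::vector<DbgValue *> dbgValues; };

static void emitNode(const Node &N) { std::printf("emit %s\n", N.name.c_str()); }
static void emitDbgValue(const DbgValue &DV) {
  std::printf("  dbg_value %s\n", DV.text.c_str());
}

// Emit each node, then any not-yet-emitted debug values attached to it.
static void emitSchedule(std::vector<Node> &Sequence) {
  for (Node &N : Sequence) {
    emitNode(N);
    for (DbgValue *DV : N.dbgValues) {
      if (DV->invalidated)
        continue;                   // already emitted elsewhere
      emitDbgValue(*DV);
      DV->invalidated = true;       // emit each value at most once
    }
  }
}

int main() {
  DbgValue DV{"!x", false};
  std::vector<Node> Seq{{"load", {&DV}}, {"add", {}}};
  emitSchedule(Seq);
}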
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 82337d43c5c9..43e8ffd3839c 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
@@ -37,6 +36,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -46,6 +46,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -346,8 +347,8 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
/// Schedule - Schedule the DAG using list scheduling.
void ScheduleDAGRRList::Schedule() {
- DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)
- << " '" << BB->getName() << "' **********\n");
+ LLVM_DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)
+ << " '" << BB->getName() << "' **********\n");
CurCycle = 0;
IssueCount = 0;
@@ -364,8 +365,7 @@ void ScheduleDAGRRList::Schedule() {
// Build the scheduling graph.
BuildSchedGraph(nullptr);
- DEBUG(for (SUnit &SU : SUnits)
- SU.dumpAll(this));
+ LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this));
Topo.InitDAGTopologicalSorting();
AvailableQueue->initNodes(SUnits);
@@ -377,11 +377,11 @@ void ScheduleDAGRRList::Schedule() {
AvailableQueue->releaseState();
- DEBUG({
- dbgs() << "*** Final schedule ***\n";
- dumpSchedule();
- dbgs() << '\n';
- });
+ LLVM_DEBUG({
+ dbgs() << "*** Final schedule ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
}
//===----------------------------------------------------------------------===//
@@ -728,13 +728,13 @@ static void resetVRegCycle(SUnit *SU);
/// count of its predecessors. If a predecessor pending count is zero, add it to
/// the Available queue.
void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
- DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
- DEBUG(SU->dump(this));
+ LLVM_DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(SU->dump(this));
#ifndef NDEBUG
if (CurCycle < SU->getHeight())
- DEBUG(dbgs() << " Height [" << SU->getHeight()
- << "] pipeline stall!\n");
+ LLVM_DEBUG(dbgs() << " Height [" << SU->getHeight()
+ << "] pipeline stall!\n");
#endif
// FIXME: Do not modify node height. It may interfere with
@@ -827,8 +827,8 @@ void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
/// UnscheduleNodeBottomUp - Remove the node from the schedule, update its and
/// its predecessor states to reflect the change.
void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
- DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
- DEBUG(SU->dump(this));
+ LLVM_DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
+ LLVM_DEBUG(SU->dump(this));
for (SDep &Pred : SU->Preds) {
CapturePred(&Pred);
@@ -1010,7 +1010,35 @@ SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) {
computeLatency(LoadSU);
}
- DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
+ bool isNewN = true;
+ SUnit *NewSU;
+ // This can only happen when isNewLoad is false.
+ if (N->getNodeId() != -1) {
+ NewSU = &SUnits[N->getNodeId()];
+ // If NewSU has already been scheduled, we need to clone it, but this
+ // negates the benefit of unfolding, so just return SU.
+ if (NewSU->isScheduled)
+ return SU;
+ isNewN = false;
+ } else {
+ NewSU = CreateNewSUnit(N);
+ N->setNodeId(NewSU->NodeNum);
+
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ NewSU->isCommutable = true;
+
+ InitNumRegDefsLeft(NewSU);
+ computeLatency(NewSU);
+ }
+
+ LLVM_DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
// Now that we are committed to unfolding replace DAG Uses.
for (unsigned i = 0; i != NumVals; ++i)
@@ -1018,23 +1046,6 @@ SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) {
DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1),
SDValue(LoadNode, 1));
- SUnit *NewSU = CreateNewSUnit(N);
- assert(N->getNodeId() == -1 && "Node already inserted!");
- N->setNodeId(NewSU->NodeNum);
-
- const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
- for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
- if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
- NewSU->isTwoAddress = true;
- break;
- }
- }
- if (MCID.isCommutable())
- NewSU->isCommutable = true;
-
- InitNumRegDefsLeft(NewSU);
- computeLatency(NewSU);
-
// Record all the edges to and from the old SU, by category.
SmallVector<SDep, 4> ChainPreds;
SmallVector<SDep, 4> ChainSuccs;
@@ -1100,7 +1111,8 @@ SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) {
if (isNewLoad)
AvailableQueue->addNode(LoadSU);
- AvailableQueue->addNode(NewSU);
+ if (isNewN)
+ AvailableQueue->addNode(NewSU);
++NumUnfolds;
@@ -1117,12 +1129,13 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
if (!N)
return nullptr;
- DEBUG(dbgs() << "Considering duplicating the SU\n");
- DEBUG(SU->dump(this));
+ LLVM_DEBUG(dbgs() << "Considering duplicating the SU\n");
+ LLVM_DEBUG(SU->dump(this));
if (N->getGluedNode() &&
!TII->canCopyGluedNodeDuringSchedule(N)) {
- DEBUG(dbgs()
+ LLVM_DEBUG(
+ dbgs()
<< "Giving up because it has incoming glue and the target does not "
"want to copy it\n");
return nullptr;
@@ -1133,7 +1146,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
MVT VT = N->getSimpleValueType(i);
if (VT == MVT::Glue) {
- DEBUG(dbgs() << "Giving up because it has outgoing glue\n");
+ LLVM_DEBUG(dbgs() << "Giving up because it has outgoing glue\n");
return nullptr;
} else if (VT == MVT::Other)
TryUnfold = true;
@@ -1141,8 +1154,9 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
for (const SDValue &Op : N->op_values()) {
MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
if (VT == MVT::Glue && !TII->canCopyGluedNodeDuringSchedule(N)) {
- DEBUG(dbgs() << "Giving up because it one of the operands is glue and "
- "the target does not want to copy it\n");
+ LLVM_DEBUG(
+ dbgs() << "Giving up because it one of the operands is glue and "
+ "the target does not want to copy it\n");
return nullptr;
}
}
@@ -1159,7 +1173,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
return SU;
}
- DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");
+ LLVM_DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");
NewSU = CreateClone(SU);
// New SUnit has the exact same predecessors.
@@ -1420,7 +1434,7 @@ void ScheduleDAGRRList::releaseInterferences(unsigned Reg) {
// Furthermore, it may have been made available again, in which case it is
// now already in the AvailableQueue.
if (SU->isAvailable && !SU->NodeQueueId) {
- DEBUG(dbgs() << " Repushing SU #" << SU->NodeNum << '\n');
+ LLVM_DEBUG(dbgs() << " Repushing SU #" << SU->NodeNum << '\n');
AvailableQueue->push(SU);
}
if (i < Interferences.size())
@@ -1441,12 +1455,10 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
SmallVector<unsigned, 4> LRegs;
if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
break;
- DEBUG(dbgs() << " Interfering reg ";
- if (LRegs[0] == TRI->getNumRegs())
- dbgs() << "CallResource";
- else
- dbgs() << printReg(LRegs[0], TRI);
- dbgs() << " SU #" << CurSU->NodeNum << '\n');
+ LLVM_DEBUG(dbgs() << " Interfering reg ";
+ if (LRegs[0] == TRI->getNumRegs()) dbgs() << "CallResource";
+ else dbgs() << printReg(LRegs[0], TRI);
+ dbgs() << " SU #" << CurSU->NodeNum << '\n');
std::pair<LRegsMapT::iterator, bool> LRegsPair =
LRegsMap.insert(std::make_pair(CurSU, LRegs));
if (LRegsPair.second) {
@@ -1492,17 +1504,17 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
if (!BtSU->isPending)
AvailableQueue->remove(BtSU);
}
- DEBUG(dbgs() << "ARTIFICIAL edge from SU(" << BtSU->NodeNum << ") to SU("
- << TrySU->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << "ARTIFICIAL edge from SU(" << BtSU->NodeNum
+ << ") to SU(" << TrySU->NodeNum << ")\n");
AddPred(TrySU, SDep(BtSU, SDep::Artificial));
// If one or more successors has been unscheduled, then the current
// node is no longer available.
if (!TrySU->isAvailable || !TrySU->NodeQueueId) {
- DEBUG(dbgs() << "TrySU not available; choosing node from queue\n");
+ LLVM_DEBUG(dbgs() << "TrySU not available; choosing node from queue\n");
CurSU = AvailableQueue->pop();
} else {
- DEBUG(dbgs() << "TrySU available\n");
+ LLVM_DEBUG(dbgs() << "TrySU available\n");
// Available and in AvailableQueue
AvailableQueue->remove(TrySU);
CurSU = TrySU;
@@ -1546,14 +1558,14 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
// Issue copies, these can be expensive cross register class copies.
SmallVector<SUnit*, 2> Copies;
InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
- DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
- << " to SU #" << Copies.front()->NodeNum << "\n");
+ LLVM_DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
+ << " to SU #" << Copies.front()->NodeNum << "\n");
AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
NewDef = Copies.back();
}
- DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
- << " to SU #" << TrySU->NodeNum << "\n");
+ LLVM_DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
+ << " to SU #" << TrySU->NodeNum << "\n");
LiveRegDefs[Reg] = NewDef;
AddPred(NewDef, SDep(TrySU, SDep::Artificial));
TrySU->isAvailable = false;
@@ -1581,8 +1593,8 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
// priority. If it is not ready put it back. Schedule the node.
Sequence.reserve(SUnits.size());
while (!AvailableQueue->empty() || !Interferences.empty()) {
- DEBUG(dbgs() << "\nExamining Available:\n";
- AvailableQueue->dump(this));
+ LLVM_DEBUG(dbgs() << "\nExamining Available:\n";
+ AvailableQueue->dump(this));
// Pick the best node to schedule taking all constraints into
// consideration.
@@ -2045,8 +2057,8 @@ LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const {
unsigned Id = RC->getID();
unsigned RP = RegPressure[Id];
if (!RP) continue;
- DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
- << RegLimit[Id] << '\n');
+ LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
+ << RegLimit[Id] << '\n');
}
}
#endif
@@ -2198,14 +2210,15 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) {
if (RegPressure[RCId] < Cost) {
// Register pressure tracking is imprecise. This can happen. But we try
// hard not to let it happen because it likely results in poor scheduling.
- DEBUG(dbgs() << " SU(" << SU->NodeNum << ") has too many regdefs\n");
+ LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum
+ << ") has too many regdefs\n");
RegPressure[RCId] = 0;
}
else {
RegPressure[RCId] -= Cost;
}
}
- DEBUG(dumpRegPressure());
+ LLVM_DEBUG(dumpRegPressure());
}
void RegReductionPQBase::unscheduledNode(SUnit *SU) {
@@ -2285,7 +2298,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) {
}
}
- DEBUG(dumpRegPressure());
+ LLVM_DEBUG(dumpRegPressure());
}
//===----------------------------------------------------------------------===//
@@ -2380,7 +2393,7 @@ static void initVRegCycle(SUnit *SU) {
if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
return;
- DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
SU->isVRegCycle = true;
@@ -2418,7 +2431,7 @@ static bool hasVRegCycleUse(const SUnit *SU) {
if (Pred.isCtrl()) continue; // ignore chain preds
if (Pred.getSUnit()->isVRegCycle &&
Pred.getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
- DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n");
return true;
}
}
@@ -2478,9 +2491,9 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
int LDepth = left->getDepth() - LPenalty;
int RDepth = right->getDepth() - RPenalty;
if (LDepth != RDepth) {
- DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
- << ") depth " << LDepth << " vs SU (" << right->NodeNum
- << ") depth " << RDepth << "\n");
+ LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
return LDepth < RDepth ? 1 : -1;
}
if (left->Latency != right->Latency)
@@ -2502,9 +2515,9 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
static const char *const PhysRegMsg[] = { " has no physreg",
" defines a physreg" };
#endif
- DEBUG(dbgs() << " SU (" << left->NodeNum << ") "
- << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
- << PhysRegMsg[RHasPhysReg] << "\n");
+ LLVM_DEBUG(dbgs() << " SU (" << left->NodeNum << ") "
+ << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum
+ << ") " << PhysRegMsg[RHasPhysReg] << "\n");
return LHasPhysReg < RHasPhysReg;
}
}
@@ -2648,13 +2661,13 @@ bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
// Avoid causing spills. If register pressure is high, schedule for
// register pressure reduction.
if (LHigh && !RHigh) {
- DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU("
- << right->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU("
+ << right->NodeNum << ")\n");
return true;
}
else if (!LHigh && RHigh) {
- DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU("
- << left->NodeNum << ")\n");
+ LLVM_DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU("
+ << left->NodeNum << ")\n");
return false;
}
if (!LHigh && !RHigh) {
@@ -2716,8 +2729,9 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
}
if (!DisableSchedRegPressure && LPDiff != RPDiff) {
- DEBUG(dbgs() << "RegPressureDiff SU(" << left->NodeNum << "): " << LPDiff
- << " != SU(" << right->NodeNum << "): " << RPDiff << "\n");
+ LLVM_DEBUG(dbgs() << "RegPressureDiff SU(" << left->NodeNum
+ << "): " << LPDiff << " != SU(" << right->NodeNum
+ << "): " << RPDiff << "\n");
return LPDiff > RPDiff;
}
@@ -2729,8 +2743,9 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
}
if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) {
- DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
- << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n");
+ LLVM_DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
+ << " != SU(" << right->NodeNum << "): " << RLiveUses
+ << "\n");
return LLiveUses < RLiveUses;
}
@@ -2744,9 +2759,9 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
if (!DisableSchedCriticalPath) {
int spread = (int)left->getDepth() - (int)right->getDepth();
if (std::abs(spread) > MaxReorderWindow) {
- DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
- << left->getDepth() << " != SU(" << right->NodeNum << "): "
- << right->getDepth() << "\n");
+ LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum
+ << "): " << right->getDepth() << "\n");
return left->getDepth() < right->getDepth();
}
}
@@ -2967,9 +2982,10 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
// Ok, the transformation is safe and the heuristics suggest it is
// profitable. Update the graph.
- DEBUG(dbgs() << " Prescheduling SU #" << SU.NodeNum
- << " next to PredSU #" << PredSU->NodeNum
- << " to guide scheduling in the presence of multiple uses\n");
+ LLVM_DEBUG(
+ dbgs() << " Prescheduling SU #" << SU.NodeNum << " next to PredSU #"
+ << PredSU->NodeNum
+ << " to guide scheduling in the presence of multiple uses\n");
for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
SDep Edge = PredSU->Succs[i];
assert(!Edge.isAssignedRegDep());
@@ -3058,8 +3074,9 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() {
(isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
(!SU.isCommutable && SuccSU->isCommutable)) &&
!scheduleDAG->IsReachable(SuccSU, &SU)) {
- DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
- << SU.NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
+ LLVM_DEBUG(dbgs()
+ << " Adding a pseudo-two-addr edge from SU #"
+ << SU.NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
scheduleDAG->AddPred(&SU, SDep(SuccSU, SDep::Artificial));
}
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index c09b47af26a6..430d8fb34476 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -243,7 +244,7 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
return;
// Sort them in increasing order.
- std::sort(Offsets.begin(), Offsets.end());
+ llvm::sort(Offsets.begin(), Offsets.end());
// Check if the loads are close enough.
SmallVector<SDNode*, 4> Loads;
@@ -910,6 +911,39 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
MachineBasicBlock *InsertBB = Emitter.getBlock();
MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator();
InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end());
+
+ SDDbgInfo::DbgLabelIterator DLI = DAG->DbgLabelBegin();
+ SDDbgInfo::DbgLabelIterator DLE = DAG->DbgLabelEnd();
+ // Now emit the rest according to source order.
+ LastOrder = 0;
+ for (const auto &InstrOrder : Orders) {
+ unsigned Order = InstrOrder.first;
+ MachineInstr *MI = InstrOrder.second;
+ if (!MI)
+ continue;
+
+ // Insert all SDDbgLabel's whose order(s) are before "Order".
+ for (; DLI != DLE &&
+ (*DLI)->getOrder() >= LastOrder && (*DLI)->getOrder() < Order;
+ ++DLI) {
+ MachineInstr *DbgMI = Emitter.EmitDbgLabel(*DLI);
+ if (DbgMI) {
+ if (!LastOrder)
+ // Insert to start of the BB (after PHIs).
+ BB->insert(BBBegin, DbgMI);
+ else {
+ // Insert at the instruction, which may be in a different
+ // block, if the block was split by a custom inserter.
+ MachineBasicBlock::iterator Pos = MI;
+ MI->getParent()->insert(Pos, DbgMI);
+ }
+ }
+ }
+ if (DLI == DLE)
+ break;
+
+ LastOrder = Order;
+ }
}
InsertPos = Emitter.getInsertPos();
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
index a058942c5689..6417e16bd0fd 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -17,10 +17,10 @@
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
#include <cassert>
#include <string>
#include <vector>
@@ -88,7 +88,7 @@ class InstrItineraryData;
/// Clone - Creates a clone of the specified SUnit. It does not copy the
/// predecessors / successors info nor the temporary scheduling states.
///
- SUnit *Clone(SUnit *N);
+ SUnit *Clone(SUnit *Old);
/// BuildSchedGraph - Build the SUnit graph from the selection dag that we
/// are given as input. This SUnit graph is similar to the SelectionDAG, but
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index 07b46b9183ab..84055f8ecc1a 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -93,8 +93,8 @@ private:
/// Schedule - Schedule the DAG using list scheduling.
void ScheduleDAGVLIW::Schedule() {
- DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)
- << " '" << BB->getName() << "' **********\n");
+ LLVM_DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)
+ << " '" << BB->getName() << "' **********\n");
// Build the scheduling graph.
BuildSchedGraph(AA);
@@ -151,8 +151,8 @@ void ScheduleDAGVLIW::releaseSuccessors(SUnit *SU) {
/// count of its successors. If a successor pending count is zero, add it to
/// the Available queue.
void ScheduleDAGVLIW::scheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
- DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
- DEBUG(SU->dump(this));
+ LLVM_DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(SU->dump(this));
Sequence.push_back(SU);
assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
@@ -246,7 +246,7 @@ void ScheduleDAGVLIW::listScheduleTopDown() {
} else if (!HasNoopHazards) {
// Otherwise, we have a pipeline stall, but no other problem, just advance
// the current cycle and try again.
- DEBUG(dbgs() << "*** Advancing cycle, no work to do\n");
+ LLVM_DEBUG(dbgs() << "*** Advancing cycle, no work to do\n");
HazardRec->AdvanceCycle();
++NumStalls;
++CurCycle;
@@ -254,7 +254,7 @@ void ScheduleDAGVLIW::listScheduleTopDown() {
// Otherwise, we have no instructions to issue and we have instructions
// that will fault if we don't do this right. This is the case for
// processors without pipeline interlocks and other cases.
- DEBUG(dbgs() << "*** Emitting noop\n");
+ LLVM_DEBUG(dbgs() << "*** Emitting noop\n");
HazardRec->EmitNoop();
Sequence.push_back(nullptr); // NULL here means noop
++NumNoops;
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3ffc6fa9a059..48e03c6da68f 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -32,7 +32,6 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -58,6 +57,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Mutex.h"
@@ -89,11 +89,16 @@ void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
#define DEBUG_TYPE "selectiondag"
+static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
+ cl::Hidden, cl::init(true),
+ cl::desc("Gang up loads and stores generated by inlining of memcpy"));
+
+static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
+ cl::desc("Number limit for gluing ld/st of memcpy."),
+ cl::Hidden, cl::init(0));
+
static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
- DEBUG(
- dbgs() << Msg;
- V.getNode()->dump(G);
- );
+ LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
}
//===----------------------------------------------------------------------===//
@@ -263,6 +268,52 @@ bool ISD::allOperandsUndef(const SDNode *N) {
return true;
}
+bool ISD::matchUnaryPredicate(SDValue Op,
+ std::function<bool(ConstantSDNode *)> Match) {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
+ return Match(Cst);
+
+ if (ISD::BUILD_VECTOR != Op.getOpcode())
+ return false;
+
+ EVT SVT = Op.getValueType().getScalarType();
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
+ if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
+ return false;
+ }
+ return true;
+}
+
+bool ISD::matchBinaryPredicate(
+ SDValue LHS, SDValue RHS,
+ std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
+ if (LHS.getValueType() != RHS.getValueType())
+ return false;
+
+ if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
+ if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
+ return Match(LHSCst, RHSCst);
+
+ if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
+ ISD::BUILD_VECTOR != RHS.getOpcode())
+ return false;
+
+ EVT SVT = LHS.getValueType().getScalarType();
+ for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
+ auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i));
+ auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
+ if (!LHSCst || !RHSCst)
+ return false;
+ if (LHSCst->getValueType(0) != SVT ||
+ LHSCst->getValueType(0) != RHSCst->getValueType(0))
+ return false;
+ if (!Match(LHSCst, RHSCst))
+ return false;
+ }
+ return true;
+}
+
ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
switch (ExtType) {
case ISD::EXTLOAD:
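The matchBinaryPredicate helper added above checks two constant build vectors element-wise against a caller-supplied predicate after verifying the operand types agree. Below is a standalone sketch of the same shape, with plain ints standing in for ConstantSDNode; the real helper also handles the scalar ConstantSDNode pair directly and rejects mismatched element types.

#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

// Element-wise predicate over two constant "vectors" of equal length.
static bool matchBinaryPredicate(const std::vector<int> &LHS,
                                 const std::vector<int> &RHS,
                                 std::function<bool(int, int)> Match) {
  if (LHS.size() != RHS.size())
    return false;                       // operand "types" must agree
  for (std::size_t i = 0; i != LHS.size(); ++i)
    if (!Match(LHS[i], RHS[i]))
      return false;                     // every element pair must satisfy it
  return true;
}

int main() {
  std::vector<int> A{1, 2, 3}, B{2, 4, 6};
  bool ok = matchBinaryPredicate(A, B, [](int l, int r) { return r == 2 * l; });
  std::printf("%s\n", ok ? "matched" : "no match");
}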
@@ -487,12 +538,41 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(ST->getPointerInfo().getAddrSpace());
break;
}
+ case ISD::MLOAD: {
+ const MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
+ ID.AddInteger(MLD->getMemoryVT().getRawBits());
+ ID.AddInteger(MLD->getRawSubclassData());
+ ID.AddInteger(MLD->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::MSTORE: {
+ const MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ ID.AddInteger(MST->getMemoryVT().getRawBits());
+ ID.AddInteger(MST->getRawSubclassData());
+ ID.AddInteger(MST->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::MGATHER: {
+ const MaskedGatherSDNode *MG = cast<MaskedGatherSDNode>(N);
+ ID.AddInteger(MG->getMemoryVT().getRawBits());
+ ID.AddInteger(MG->getRawSubclassData());
+ ID.AddInteger(MG->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::MSCATTER: {
+ const MaskedScatterSDNode *MS = cast<MaskedScatterSDNode>(N);
+ ID.AddInteger(MS->getMemoryVT().getRawBits());
+ ID.AddInteger(MS->getRawSubclassData());
+ ID.AddInteger(MS->getPointerInfo().getAddrSpace());
+ break;
+ }
case ISD::ATOMIC_CMP_SWAP:
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
@@ -726,7 +806,7 @@ static void VerifySDNode(SDNode *N) {
}
#endif // NDEBUG
-/// \brief Insert a newly allocated node into the DAG.
+/// Insert a newly allocated node into the DAG.
///
/// Handles insertion into the all nodes list and CSE map, as well as
/// verification and other common operations when a new node is allocated.
@@ -903,13 +983,16 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
void SelectionDAG::init(MachineFunction &NewMF,
OptimizationRemarkEmitter &NewORE,
- Pass *PassPtr) {
+ Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+ DivergenceAnalysis * Divergence) {
MF = &NewMF;
SDAGISelPass = PassPtr;
ORE = &NewORE;
TLI = getSubtarget().getTargetLowering();
TSI = getSubtarget().getSelectionDAGInfo();
+ LibInfo = LibraryInfo;
Context = &MF->getFunction().getContext();
+ DA = Divergence;
}
SelectionDAG::~SelectionDAG() {
@@ -1077,21 +1160,25 @@ SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
}
SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
- EVT EltVT = VT.getScalarType();
- SDValue TrueValue;
- switch (TLI->getBooleanContents(VT)) {
- case TargetLowering::ZeroOrOneBooleanContent:
- case TargetLowering::UndefinedBooleanContent:
- TrueValue = getConstant(1, DL, VT);
- break;
- case TargetLowering::ZeroOrNegativeOneBooleanContent:
- TrueValue = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL,
- VT);
- break;
- }
+ SDValue TrueValue = getBoolConstant(true, DL, VT, VT);
return getNode(ISD::XOR, DL, VT, Val, TrueValue);
}
+SDValue SelectionDAG::getBoolConstant(bool V, const SDLoc &DL, EVT VT,
+ EVT OpVT) {
+ if (!V)
+ return getConstant(0, DL, VT);
+
+ switch (TLI->getBooleanContents(OpVT)) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ case TargetLowering::UndefinedBooleanContent:
+ return getConstant(1, DL, VT);
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return getAllOnesConstant(DL, VT);
+ }
+ llvm_unreachable("Unexpected boolean content enum!");
+}
+
SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
bool isT, bool isO) {
EVT EltVT = VT.getScalarType();
@@ -1184,7 +1271,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
return SDValue(N, 0);
if (!N) {
- N = newSDNode<ConstantSDNode>(isT, isO, Elt, DL.getDebugLoc(), EltVT);
+ N = newSDNode<ConstantSDNode>(isT, isO, Elt, EltVT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this);
@@ -1227,7 +1314,7 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
return SDValue(N, 0);
if (!N) {
- N = newSDNode<ConstantFPSDNode>(isTarget, &V, DL.getDebugLoc(), EltVT);
+ N = newSDNode<ConstantFPSDNode>(isTarget, &V, EltVT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
}
@@ -1503,33 +1590,35 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
if (N1.isUndef())
commuteShuffle(N1, N2, MaskVec);
- // If shuffling a splat, try to blend the splat instead. We do this here so
- // that even when this arises during lowering we don't have to re-handle it.
- auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
- BitVector UndefElements;
- SDValue Splat = BV->getSplatValue(&UndefElements);
- if (!Splat)
- return;
+ if (TLI->hasVectorBlend()) {
+ // If shuffling a splat, try to blend the splat instead. We do this here so
+ // that even when this arises during lowering we don't have to re-handle it.
+ auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+ BitVector UndefElements;
+ SDValue Splat = BV->getSplatValue(&UndefElements);
+ if (!Splat)
+ return;
- for (int i = 0; i < NElts; ++i) {
- if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
- continue;
+ for (int i = 0; i < NElts; ++i) {
+ if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
+ continue;
- // If this input comes from undef, mark it as such.
- if (UndefElements[MaskVec[i] - Offset]) {
- MaskVec[i] = -1;
- continue;
- }
+ // If this input comes from undef, mark it as such.
+ if (UndefElements[MaskVec[i] - Offset]) {
+ MaskVec[i] = -1;
+ continue;
+ }
- // If we can blend a non-undef lane, use that instead.
- if (!UndefElements[i])
- MaskVec[i] = i + Offset;
- }
- };
- if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
- BlendSplat(N1BV, 0);
- if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
- BlendSplat(N2BV, NElts);
+ // If we can blend a non-undef lane, use that instead.
+ if (!UndefElements[i])
+ MaskVec[i] = i + Offset;
+ }
+ };
+ if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+ BlendSplat(N1BV, 0);
+ if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+ BlendSplat(N2BV, NElts);
+ }
// Canonicalize all index into lhs, -> shuffle lhs, undef
// Canonicalize all index into rhs, -> shuffle rhs, undef
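Note: the splat-blend rewrite is now gated on TLI->hasVectorBlend(), but the mask transformation itself is unchanged: any lane that reads from a splat input can read the splat from its own lane instead, turning the shuffle into a blend. A small standalone sketch of the mask rewrite (plain vectors instead of BuildVectorSDNode; the function name is made up):

#include <vector>

// Mask values in [Offset, Offset + NElts) refer to the splat input.
// UndefElts marks which lanes of the splat build_vector are undef.
void blendSplatMask(std::vector<int> &Mask, const std::vector<bool> &UndefElts,
                    int Offset, int NElts) {
  for (int i = 0; i != NElts; ++i) {
    if (Mask[i] < Offset || Mask[i] >= Offset + NElts)
      continue;                     // lane comes from the other input
    if (UndefElts[Mask[i] - Offset])
      Mask[i] = -1;                 // the referenced splat element is undef
    else if (!UndefElts[i])
      Mask[i] = i + Offset;         // read the splat value from the same lane
  }
}

For example, with the splat as the second input (Offset = 4, NElts = 4), a mask of <7,1,5,3> becomes <4,1,6,3>, i.e. a lane-for-lane blend of the two inputs.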
@@ -1643,7 +1732,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
}
SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
- MVT VT = SV.getSimpleValueType(0);
+ EVT VT = SV.getValueType(0);
SmallVector<int, 8> MaskVec(SV.getMask().begin(), SV.getMask().end());
ShuffleVectorSDNode::commuteMask(MaskVec);
@@ -1661,6 +1750,7 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
return SDValue(E, 0);
auto *N = newSDNode<RegisterSDNode>(RegNo, VT);
+ N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1870,19 +1960,15 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
ISD::CondCode Cond, const SDLoc &dl) {
+ EVT OpVT = N1.getValueType();
+
// These setcc operations always fold.
switch (Cond) {
default: break;
case ISD::SETFALSE:
- case ISD::SETFALSE2: return getConstant(0, dl, VT);
+ case ISD::SETFALSE2: return getBoolConstant(false, dl, VT, OpVT);
case ISD::SETTRUE:
- case ISD::SETTRUE2: {
- TargetLowering::BooleanContent Cnt =
- TLI->getBooleanContents(N1->getValueType(0));
- return getConstant(
- Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, dl,
- VT);
- }
+ case ISD::SETTRUE2: return getBoolConstant(true, dl, VT, OpVT);
case ISD::SETOEQ:
case ISD::SETOGT:
@@ -1905,16 +1991,16 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
switch (Cond) {
default: llvm_unreachable("Unknown integer setcc!");
- case ISD::SETEQ: return getConstant(C1 == C2, dl, VT);
- case ISD::SETNE: return getConstant(C1 != C2, dl, VT);
- case ISD::SETULT: return getConstant(C1.ult(C2), dl, VT);
- case ISD::SETUGT: return getConstant(C1.ugt(C2), dl, VT);
- case ISD::SETULE: return getConstant(C1.ule(C2), dl, VT);
- case ISD::SETUGE: return getConstant(C1.uge(C2), dl, VT);
- case ISD::SETLT: return getConstant(C1.slt(C2), dl, VT);
- case ISD::SETGT: return getConstant(C1.sgt(C2), dl, VT);
- case ISD::SETLE: return getConstant(C1.sle(C2), dl, VT);
- case ISD::SETGE: return getConstant(C1.sge(C2), dl, VT);
+ case ISD::SETEQ: return getBoolConstant(C1 == C2, dl, VT, OpVT);
+ case ISD::SETNE: return getBoolConstant(C1 != C2, dl, VT, OpVT);
+ case ISD::SETULT: return getBoolConstant(C1.ult(C2), dl, VT, OpVT);
+ case ISD::SETUGT: return getBoolConstant(C1.ugt(C2), dl, VT, OpVT);
+ case ISD::SETULE: return getBoolConstant(C1.ule(C2), dl, VT, OpVT);
+ case ISD::SETUGE: return getBoolConstant(C1.uge(C2), dl, VT, OpVT);
+ case ISD::SETLT: return getBoolConstant(C1.slt(C2), dl, VT, OpVT);
+ case ISD::SETGT: return getBoolConstant(C1.sgt(C2), dl, VT, OpVT);
+ case ISD::SETLE: return getBoolConstant(C1.sle(C2), dl, VT, OpVT);
+ case ISD::SETGE: return getBoolConstant(C1.sge(C2), dl, VT, OpVT);
}
}
}
@@ -1926,41 +2012,54 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
case ISD::SETEQ: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
- case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, dl, VT);
+ case ISD::SETOEQ: return getBoolConstant(R==APFloat::cmpEqual, dl, VT,
+ OpVT);
case ISD::SETNE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
- case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan ||
- R==APFloat::cmpLessThan, dl, VT);
+ case ISD::SETONE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpLessThan, dl, VT,
+ OpVT);
case ISD::SETLT: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
- case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, dl, VT);
+ case ISD::SETOLT: return getBoolConstant(R==APFloat::cmpLessThan, dl, VT,
+ OpVT);
case ISD::SETGT: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
- case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, dl, VT);
+ case ISD::SETOGT: return getBoolConstant(R==APFloat::cmpGreaterThan, dl,
+ VT, OpVT);
case ISD::SETLE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
- case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan ||
- R==APFloat::cmpEqual, dl, VT);
+ case ISD::SETOLE: return getBoolConstant(R==APFloat::cmpLessThan ||
+ R==APFloat::cmpEqual, dl, VT,
+ OpVT);
case ISD::SETGE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
- case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan ||
- R==APFloat::cmpEqual, dl, VT);
- case ISD::SETO: return getConstant(R!=APFloat::cmpUnordered, dl, VT);
- case ISD::SETUO: return getConstant(R==APFloat::cmpUnordered, dl, VT);
- case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered ||
- R==APFloat::cmpEqual, dl, VT);
- case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, dl, VT);
- case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered ||
- R==APFloat::cmpLessThan, dl, VT);
- case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan ||
- R==APFloat::cmpUnordered, dl, VT);
- case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, dl, VT);
- case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, dl, VT);
+ case ISD::SETOGE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpEqual, dl, VT, OpVT);
+ case ISD::SETO: return getBoolConstant(R!=APFloat::cmpUnordered, dl, VT,
+ OpVT);
+ case ISD::SETUO: return getBoolConstant(R==APFloat::cmpUnordered, dl, VT,
+ OpVT);
+ case ISD::SETUEQ: return getBoolConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpEqual, dl, VT,
+ OpVT);
+ case ISD::SETUNE: return getBoolConstant(R!=APFloat::cmpEqual, dl, VT,
+ OpVT);
+ case ISD::SETULT: return getBoolConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpLessThan, dl, VT,
+ OpVT);
+ case ISD::SETUGT: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpUnordered, dl, VT,
+ OpVT);
+ case ISD::SETULE: return getBoolConstant(R!=APFloat::cmpGreaterThan, dl,
+ VT, OpVT);
+ case ISD::SETUGE: return getBoolConstant(R!=APFloat::cmpLessThan, dl, VT,
+ OpVT);
}
} else {
// Ensure that the constant occurs on the RHS.
@@ -2297,10 +2396,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
break;
}
- // Support big-endian targets when it becomes useful.
bool IsLE = getDataLayout().isLittleEndian();
- if (!IsLE)
- break;
// Bitcast 'small element' vector to 'large element' scalar/vector.
if ((BitWidth % SubBitWidth) == 0) {
@@ -2319,8 +2415,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
for (unsigned i = 0; i != SubScale; ++i) {
computeKnownBits(N0, Known2, SubDemandedElts.shl(i),
Depth + 1);
- Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * i);
- Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * i);
+ unsigned Shifts = IsLE ? i : SubScale - 1 - i;
+ Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * Shifts);
+ Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * Shifts);
}
}
@@ -2342,7 +2439,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
- unsigned Offset = (i % SubScale) * BitWidth;
+ unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+ unsigned Offset = (Shifts % SubScale) * BitWidth;
Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
// If we don't know any bits, early out.
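Note: both bitcast directions now handle big-endian layouts by flipping the sub-element index instead of bailing out. The only thing endianness changes is the bit offset a sub-element occupies inside the wider value; for illustration (hypothetical helper, not LLVM code):

// Bit offset of sub-element i when SubScale elements of SubBitWidth bits are
// viewed as one wide value.
unsigned subElementShift(bool IsLittleEndian, unsigned i, unsigned SubScale,
                         unsigned SubBitWidth) {
  unsigned Index = IsLittleEndian ? i : SubScale - 1 - i;
  return SubBitWidth * Index;
}

// e.g. v4i8 -> i32 (SubScale = 4, SubBitWidth = 8):
//   little-endian: element 0 lands at bit 0,  element 3 at bit 24
//   big-endian:    element 0 lands at bit 24, element 3 at bit 0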
@@ -2441,6 +2539,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
break;
case ISD::SMULO:
case ISD::UMULO:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
if (Op.getResNo() != 1)
break;
// The boolean result conforms to getBooleanContents.
@@ -2904,11 +3003,38 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
}
case ISD::SMIN:
case ISD::SMAX: {
- computeKnownBits(Op.getOperand(0), Known, DemandedElts,
- Depth + 1);
- // If we don't know any bits, early out.
- if (Known.isUnknown())
- break;
+ // If we have a clamp pattern, we know that the number of sign bits will be
+ // the minimum of the clamp min/max range.
+ bool IsMax = (Opcode == ISD::SMAX);
+ ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
+ if ((CstLow = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)))
+ if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
+ CstHigh = isConstOrDemandedConstSplat(Op.getOperand(0).getOperand(1),
+ DemandedElts);
+ if (CstLow && CstHigh) {
+ if (!IsMax)
+ std::swap(CstLow, CstHigh);
+
+ const APInt &ValueLow = CstLow->getAPIntValue();
+ const APInt &ValueHigh = CstHigh->getAPIntValue();
+ if (ValueLow.sle(ValueHigh)) {
+ unsigned LowSignBits = ValueLow.getNumSignBits();
+ unsigned HighSignBits = ValueHigh.getNumSignBits();
+ unsigned MinSignBits = std::min(LowSignBits, HighSignBits);
+ if (ValueLow.isNegative() && ValueHigh.isNegative()) {
+ Known.One.setHighBits(MinSignBits);
+ break;
+ }
+ if (ValueLow.isNonNegative() && ValueHigh.isNonNegative()) {
+ Known.Zero.setHighBits(MinSignBits);
+ break;
+ }
+ }
+ }
+
+ // Fallback - just get the shared known bits of the operands.
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ if (Known.isUnknown()) break; // Early-out
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
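Note: the clamp special case recognizes smax(smin(x, CstHigh), CstLow) (and the commuted smin-of-smax form). The result always lies in [CstLow, CstHigh], so when both bounds share a sign the common leading bits become known. A worked example:

// i32 clamp to [16, 100], i.e. smax(smin(x, 100), 16):
//   NumSignBits(16)  = 27     (16  = 0b0...0010000)
//   NumSignBits(100) = 25     (100 = 0b0...1100100)
//   MinSignBits      = 25, and both bounds are non-negative
//   => the top 25 bits of the result are known zero
//      (Known.Zero.setHighBits(25) in the code above).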
@@ -3038,7 +3164,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
if (!DemandedElts)
return 1; // No demanded elts, better to assume we don't know anything.
- switch (Op.getOpcode()) {
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
default: break;
case ISD::AssertSext:
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
@@ -3189,7 +3316,32 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return std::min(Tmp, Tmp2);
case ISD::SMIN:
- case ISD::SMAX:
+ case ISD::SMAX: {
+ // If we have a clamp pattern, we know that the number of sign bits will be
+ // the minimum of the clamp min/max range.
+ bool IsMax = (Opcode == ISD::SMAX);
+ ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
+ if ((CstLow = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)))
+ if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
+ CstHigh = isConstOrDemandedConstSplat(Op.getOperand(0).getOperand(1),
+ DemandedElts);
+ if (CstLow && CstHigh) {
+ if (!IsMax)
+ std::swap(CstLow, CstHigh);
+ if (CstLow->getAPIntValue().sle(CstHigh->getAPIntValue())) {
+ Tmp = CstLow->getAPIntValue().getNumSignBits();
+ Tmp2 = CstHigh->getAPIntValue().getNumSignBits();
+ return std::min(Tmp, Tmp2);
+ }
+ }
+
+ // Fallback - just get the minimum number of sign bits of the operands.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (Tmp == 1)
+ return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ return std::min(Tmp, Tmp2);
+ }
case ISD::UMIN:
case ISD::UMAX:
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
@@ -3225,7 +3377,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned RotAmt = C->getAPIntValue().urem(VTBits);
// Handle rotate right by N like a rotate left by 32-N.
- if (Op.getOpcode() == ISD::ROTR)
+ if (Opcode == ISD::ROTR)
RotAmt = (VTBits - RotAmt) % VTBits;
// If we aren't rotating out all of the known-in sign bits, return the
@@ -3423,10 +3575,10 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
// Allow the target to implement this method for its nodes.
- if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
- Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
- Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
- Op.getOpcode() == ISD::INTRINSIC_VOID) {
+ if (Opcode >= ISD::BUILTIN_OP_END ||
+ Opcode == ISD::INTRINSIC_WO_CHAIN ||
+ Opcode == ISD::INTRINSIC_W_CHAIN ||
+ Opcode == ISD::INTRINSIC_VOID) {
unsigned NumBits =
TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
if (NumBits > 1)
@@ -3487,17 +3639,33 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
return false;
}
-bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
+bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
+ assert(Op.getValueType().isFloatingPoint() &&
+ "Floating point type expected");
+
// If the value is a constant, we can obviously see if it is a zero or not.
+ // TODO: Add BuildVector support.
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
return !C->isZero();
+ return false;
+}
+
+bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
+ assert(!Op.getValueType().isFloatingPoint() &&
+ "Floating point types unsupported - use isKnownNeverZeroFloat");
+
+ // If the value is a constant, we can obviously see if it is a zero or not.
+ if (ISD::matchUnaryPredicate(
+ Op, [](ConstantSDNode *C) { return !C->isNullValue(); }))
+ return true;
// TODO: Recognize more cases here.
switch (Op.getOpcode()) {
default: break;
case ISD::OR:
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
- return !C->isNullValue();
+ if (isKnownNeverZero(Op.getOperand(1)) ||
+ isKnownNeverZero(Op.getOperand(0)))
+ return true;
break;
}
@@ -3517,6 +3685,8 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
return false;
}
+// FIXME: unify with llvm::haveNoCommonBitsSet.
+// FIXME: could also handle masked merge pattern (X & ~M) op (Y & M)
bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
assert(A.getValueType() == B.getValueType() &&
"Values must have the same type");
@@ -3841,11 +4011,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
else if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
- // (ext (trunx x)) -> x
+ // (ext (trunc x)) -> x
if (OpOpcode == ISD::TRUNCATE) {
SDValue OpOp = Operand.getOperand(0);
- if (OpOp.getValueType() == VT)
+ if (OpOp.getValueType() == VT) {
+ transferDbgValues(Operand, OpOp);
return OpOp;
+ }
}
break;
case ISD::TRUNCATE:
@@ -3921,10 +4093,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
break;
case ISD::FNEG:
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
- if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
- // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
+ if ((getTarget().Options.UnsafeFPMath || Flags.hasNoSignedZeros()) &&
+ OpOpcode == ISD::FSUB)
return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
- Operand.getOperand(0), Operand.getNode()->getFlags());
+ Operand.getOperand(0), Flags);
if (OpOpcode == ISD::FNEG) // --X -> X
return Operand.getOperand(0);
break;
@@ -4314,24 +4486,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
- if (getTarget().Options.UnsafeFPMath) {
- if (Opcode == ISD::FADD) {
- // x+0 --> x
- if (N2CFP && N2CFP->getValueAPF().isZero())
- return N1;
- } else if (Opcode == ISD::FSUB) {
- // x-0 --> x
- if (N2CFP && N2CFP->getValueAPF().isZero())
- return N1;
- } else if (Opcode == ISD::FMUL) {
- // x*0 --> 0
- if (N2CFP && N2CFP->isZero())
- return N2;
- // x*1 --> x
- if (N2CFP && N2CFP->isExactlyValue(1.0))
- return N1;
- }
- }
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
@@ -4448,12 +4602,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
break;
}
case ISD::EXTRACT_VECTOR_ELT:
+ assert(VT.getSizeInBits() >= N1.getValueType().getScalarSizeInBits() &&
+ "The result of EXTRACT_VECTOR_ELT must be at least as wide as the \
+ element type of the vector.");
+
// EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
if (N1.isUndef())
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
- if (N2C && N2C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+ if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
@@ -4635,6 +4793,18 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
}
+ // Any FP binop with an undef operand is folded to NaN. This matches the
+ // behavior of the IR optimizer.
+ switch (Opcode) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ if (N1.isUndef() || N2.isUndef())
+ return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT);
+ }
+
// Canonicalize an UNDEF to the RHS, even over a constant.
if (N1.isUndef()) {
if (TLI->isCommutativeBinOp(Opcode)) {
@@ -4644,22 +4814,15 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::FP_ROUND_INREG:
case ISD::SIGN_EXTEND_INREG:
case ISD::SUB:
- case ISD::FSUB:
- case ISD::FDIV:
- case ISD::FREM:
- case ISD::SRA:
- return N1; // fold op(undef, arg2) -> undef
+ return getUNDEF(VT); // fold op(undef, arg2) -> undef
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
+ case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
- if (!VT.isVector())
- return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
- // For vectors, we can't easily build an all zero vector, just return
- // the LHS.
- return N2;
+ return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
}
}
}
@@ -4681,32 +4844,15 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
- return N2; // fold op(arg1, undef) -> undef
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- case ISD::FDIV:
- case ISD::FREM:
- if (getTarget().Options.UnsafeFPMath)
- return N2;
- break;
- case ISD::MUL:
- case ISD::AND:
+ case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
- if (!VT.isVector())
- return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
- // For vectors, we can't easily build an all zero vector, just return
- // the LHS.
- return N1;
+ return getUNDEF(VT); // fold op(arg1, undef) -> undef
+ case ISD::MUL:
+ case ISD::AND:
+ return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
case ISD::OR:
- if (!VT.isVector())
- return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
- // For vectors, we can't easily build an all one vector, just return
- // the LHS.
- return N1;
- case ISD::SRA:
- return N1;
+ return getAllOnesConstant(DL, VT);
}
}
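Note: the undef canonicalizations above no longer special-case vectors, since getConstant and getAllOnesConstant can now build vector constants directly. Summarizing the folds these two blocks perform, each justified by picking a convenient concrete value for the undef operand:

//   LHS undef:  sub                      -> undef
//               div/rem, shifts          -> 0          (choose undef = 0)
//   RHS undef:  div/rem, shifts          -> undef      (divisor/amount may be invalid)
//               mul, and                 -> 0          (choose undef = 0)
//               or                       -> all-ones   (choose undef = -1)
//   FP binops with either operand undef  -> NaN        (matches the IR optimizer)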
@@ -4739,10 +4885,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
- SDValue N1, SDValue N2, SDValue N3) {
+ SDValue N1, SDValue N2, SDValue N3,
+ const SDNodeFlags Flags) {
// Perform various simplifications.
switch (Opcode) {
case ISD::FMA: {
+ assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+ assert(N1.getValueType() == VT && N2.getValueType() == VT &&
+ N3.getValueType() == VT && "FMA types must match!");
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
@@ -4833,10 +4983,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ E->intersectFlagsWith(Flags);
return SDValue(E, 0);
+ }
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ N->setFlags(Flags);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
@@ -5107,6 +5260,31 @@ static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
return MF.getFunction().optForSize();
}
+static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SmallVector<SDValue, 32> &OutChains, unsigned From,
+ unsigned To, SmallVector<SDValue, 16> &OutLoadChains,
+ SmallVector<SDValue, 16> &OutStoreChains) {
+ assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
+ assert(OutStoreChains.size() && "Missing stores in memcpy inlining");
+ SmallVector<SDValue, 16> GluedLoadChains;
+ for (unsigned i = From; i < To; ++i) {
+ OutChains.push_back(OutLoadChains[i]);
+ GluedLoadChains.push_back(OutLoadChains[i]);
+ }
+
+ // Chain for all loads.
+ SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ GluedLoadChains);
+
+ for (unsigned i = From; i < To; ++i) {
+ StoreSDNode *ST = dyn_cast<StoreSDNode>(OutStoreChains[i]);
+ SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
+ ST->getBasePtr(), ST->getMemoryVT(),
+ ST->getMemOperand());
+ OutChains.push_back(NewStore);
+ }
+}
+
static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, unsigned Align,
@@ -5171,7 +5349,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
MachineMemOperand::Flags MMOFlags =
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
- SmallVector<SDValue, 8> OutChains;
+ SmallVector<SDValue, 16> OutLoadChains;
+ SmallVector<SDValue, 16> OutStoreChains;
+ SmallVector<SDValue, 32> OutChains;
unsigned NumMemOps = MemOps.size();
uint64_t SrcOff = 0, DstOff = 0;
for (unsigned i = 0; i != NumMemOps; ++i) {
@@ -5205,11 +5385,13 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SubSlice.Length = VTSize;
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
- if (Value.getNode())
+ if (Value.getNode()) {
Store = DAG.getStore(Chain, dl, Value,
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), Align,
MMOFlags);
+ OutChains.push_back(Store);
+ }
}
if (!Store.getNode()) {
@@ -5231,17 +5413,61 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), VT,
MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
- OutChains.push_back(Value.getValue(1));
+ OutLoadChains.push_back(Value.getValue(1));
+
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
+ OutStoreChains.push_back(Store);
}
- OutChains.push_back(Store);
SrcOff += VTSize;
DstOff += VTSize;
Size -= VTSize;
}
+ unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
+ TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
+ unsigned NumLdStInMemcpy = OutStoreChains.size();
+
+ if (NumLdStInMemcpy) {
+ // The memcpy may have been converted to a memset if it copies constants.
+ // In that case there are only stores and no loads, so there is nothing to
+ // gang up.
+ if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
+ // If the target does not care, just leave it as is.
+ for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
+ OutChains.push_back(OutLoadChains[i]);
+ OutChains.push_back(OutStoreChains[i]);
+ }
+ } else {
+ // The number of loads/stores is within the limit set by the target.
+ if (NumLdStInMemcpy <= GluedLdStLimit) {
+ chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+ NumLdStInMemcpy, OutLoadChains,
+ OutStoreChains);
+ } else {
+ unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit;
+ unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
+ unsigned GlueIter = 0;
+
+ for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
+ unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
+ unsigned IndexTo = NumLdStInMemcpy - GlueIter;
+
+ chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
+ OutLoadChains, OutStoreChains);
+ GlueIter += GluedLdStLimit;
+ }
+
+ // Residual ld/st.
+ if (RemainingLdStInMemcpy) {
+ chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+ RemainingLdStInMemcpy, OutLoadChains,
+ OutStoreChains);
+ }
+ }
+ }
+ }
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
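Note: the new gluing logic batches the copied loads so their chains feed a single TokenFactor and rechains that batch's stores onto it; batches of at most getMaxGluedStoresPerMemcpy() pairs are formed from the back of the list, with any remainder handled at the front. A small standalone sketch of the batching arithmetic (printBatches is illustrative only):

#include <cstdio>

// Split N load/store pairs into batches of at most Limit, walking from the
// end of the list exactly like the code above.
void printBatches(unsigned N, unsigned Limit) {
  unsigned FullBatches = N / Limit;
  unsigned Remainder = N % Limit;
  unsigned GlueIter = 0;
  for (unsigned cnt = 0; cnt < FullBatches; ++cnt) {
    unsigned From = N - GlueIter - Limit;
    unsigned To = N - GlueIter;
    std::printf("batch [%u, %u)\n", From, To);
    GlueIter += Limit;
  }
  if (Remainder)
    std::printf("batch [0, %u)\n", Remainder);  // residual pairs at the front
}

// printBatches(10, 4) emits: batch [6, 10), batch [2, 6), batch [0, 2)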
@@ -5334,7 +5560,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
-/// \brief Lower the call to 'memset' intrinsic function into a series of store
+/// Lower the call to 'memset' intrinsic function into a series of store
/// operations.
///
/// \param DAG Selection DAG where lowered code is placed.
@@ -5518,6 +5744,47 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
return CallResult.second;
}
+SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl,
+ SDValue Dst, unsigned DstAlign,
+ SDValue Src, unsigned SrcAlign,
+ SDValue Size, Type *SizeTy,
+ unsigned ElemSz, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+
+ Entry.Node = Src;
+ Args.push_back(Entry);
+
+ Entry.Ty = SizeTy;
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(LibraryCall),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
SDValue Src, SDValue Size, unsigned Align,
bool isVol, bool isTailCall,
@@ -5579,6 +5846,47 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
return CallResult.second;
}
+SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
+ SDValue Dst, unsigned DstAlign,
+ SDValue Src, unsigned SrcAlign,
+ SDValue Size, Type *SizeTy,
+ unsigned ElemSz, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+
+ Entry.Node = Src;
+ Args.push_back(Entry);
+
+ Entry.Ty = SizeTy;
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(LibraryCall),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
SDValue Src, SDValue Size, unsigned Align,
bool isVol, bool isTailCall,
@@ -5641,6 +5949,46 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
return CallResult.second;
}
+SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl,
+ SDValue Dst, unsigned DstAlign,
+ SDValue Value, SDValue Size, Type *SizeTy,
+ unsigned ElemSz, bool isTailCall,
+ MachinePointerInfo DstPtrInfo) {
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+
+ Entry.Ty = Type::getInt8Ty(*getContext());
+ Entry.Node = Value;
+ Args.push_back(Entry);
+
+ Entry.Ty = SizeTy;
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(LibraryCall),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTList, ArrayRef<SDValue> Ops,
MachineMemOperand *MMO) {
@@ -5736,6 +6084,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
Opcode == ISD::ATOMIC_LOAD_SUB ||
Opcode == ISD::ATOMIC_LOAD_AND ||
+ Opcode == ISD::ATOMIC_LOAD_CLR ||
Opcode == ISD::ATOMIC_LOAD_OR ||
Opcode == ISD::ATOMIC_LOAD_XOR ||
Opcode == ISD::ATOMIC_LOAD_NAND ||
@@ -6207,7 +6556,7 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO) {
- assert(Ops.size() == 5 && "Incompatible number of operands");
+ assert(Ops.size() == 6 && "Incompatible number of operands");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops);
@@ -6233,6 +6582,9 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
assert(N->getIndex().getValueType().getVectorNumElements() ==
N->getValueType(0).getVectorNumElements() &&
"Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
CSEMap.InsertNode(N, IP);
InsertNode(N);
@@ -6244,7 +6596,7 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO) {
- assert(Ops.size() == 5 && "Incompatible number of operands");
+ assert(Ops.size() == 6 && "Incompatible number of operands");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops);
@@ -6267,6 +6619,9 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
assert(N->getIndex().getValueType().getVectorNumElements() ==
N->getValue().getValueType().getVectorNumElements() &&
"Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
CSEMap.InsertNode(N, IP);
InsertNode(N);
@@ -6558,6 +6913,7 @@ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
// Now we update the operands.
N->OperandList[0].set(Op);
+ updateDivergence(N);
// If this gets put into a CSE map, add it.
if (InsertPos) CSEMap.InsertNode(N, InsertPos);
return N;
@@ -6586,6 +6942,7 @@ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
if (N->OperandList[1] != Op2)
N->OperandList[1].set(Op2);
+ updateDivergence(N);
// If this gets put into a CSE map, add it.
if (InsertPos) CSEMap.InsertNode(N, InsertPos);
return N;
@@ -6636,6 +6993,7 @@ UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
if (N->OperandList[i] != Ops[i])
N->OperandList[i].set(Ops[i]);
+ updateDivergence(N);
// If this gets put into a CSE map, add it.
if (InsertPos) CSEMap.InsertNode(N, InsertPos);
return N;
@@ -7061,11 +7419,24 @@ SDDbgValue *SelectionDAG::getConstantDbgValue(DIVariable *Var,
/// FrameIndex
SDDbgValue *SelectionDAG::getFrameIndexDbgValue(DIVariable *Var,
DIExpression *Expr, unsigned FI,
+ bool IsIndirect,
const DebugLoc &DL,
unsigned O) {
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
- return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, FI, DL, O);
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(Var, Expr, FI, IsIndirect, DL, O, SDDbgValue::FRAMEIX);
+}
+
+/// VReg
+SDDbgValue *SelectionDAG::getVRegDbgValue(DIVariable *Var,
+ DIExpression *Expr,
+ unsigned VReg, bool IsIndirect,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(Var, Expr, VReg, IsIndirect, DL, O, SDDbgValue::VREG);
}
void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
@@ -7155,8 +7526,9 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) {
DV->isIndirect(), DV->getDebugLoc(), DV->getOrder());
ClonedDVs.push_back(Clone);
DV->setIsInvalidated();
- DEBUG(dbgs() << "SALVAGE: Rewriting"; N0.getNode()->dumprFull(this);
- dbgs() << " into " << *DIExpr << '\n');
+ LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting";
+ N0.getNode()->dumprFull(this);
+ dbgs() << " into " << *DIExpr << '\n');
}
}
}
@@ -7165,6 +7537,14 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) {
AddDbgValue(Dbg, Dbg->getSDNode(), false);
}
+/// Creates a SDDbgLabel node.
+SDDbgLabel *SelectionDAG::getDbgLabel(DILabel *Label,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc()) SDDbgLabel(Label, DL, O);
+}
+
namespace {
/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
@@ -7227,8 +7607,9 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
SDUse &Use = UI.getUse();
++UI;
Use.set(To);
+ if (To->isDivergent() != From->isDivergent())
+ updateDivergence(User);
} while (UI != UE && *UI == User);
-
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
@@ -7282,6 +7663,8 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
SDUse &Use = UI.getUse();
++UI;
Use.setNode(To);
+ if (To->isDivergent() != From->isDivergent())
+ updateDivergence(User);
} while (UI != UE && *UI == User);
// Now that we have modified User, add it back to the CSE maps. If it
@@ -7326,8 +7709,9 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
const SDValue &ToOp = To[Use.getResNo()];
++UI;
Use.set(ToOp);
+ if (To->getNode()->isDivergent() != From->isDivergent())
+ updateDivergence(User);
} while (UI != UE && *UI == User);
-
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
@@ -7385,8 +7769,9 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
++UI;
Use.set(To);
+ if (To->isDivergent() != From->isDivergent())
+ updateDivergence(User);
} while (UI != UE && *UI == User);
-
// We are iterating over all uses of the From node, so if a use
// doesn't use the specific value, no changes are made.
if (!UserRemovedFromCSEMaps)
@@ -7419,6 +7804,72 @@ namespace {
} // end anonymous namespace
+void SelectionDAG::updateDivergence(SDNode *N) {
+ if (TLI->isSDNodeAlwaysUniform(N))
+ return;
+ bool IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA);
+ for (auto &Op : N->ops()) {
+ if (Op.Val.getValueType() != MVT::Other)
+ IsDivergent |= Op.getNode()->isDivergent();
+ }
+ if (N->SDNodeBits.IsDivergent != IsDivergent) {
+ N->SDNodeBits.IsDivergent = IsDivergent;
+ for (auto U : N->uses()) {
+ updateDivergence(U);
+ }
+ }
+}
+
+
+void SelectionDAG::CreateTopologicalOrder(std::vector<SDNode*>& Order) {
+ DenseMap<SDNode *, unsigned> Degree;
+ Order.reserve(AllNodes.size());
+ for (auto & N : allnodes()) {
+ unsigned NOps = N.getNumOperands();
+ Degree[&N] = NOps;
+ if (0 == NOps)
+ Order.push_back(&N);
+ }
+ for (std::vector<SDNode *>::iterator I = Order.begin();
+ I != Order.end(); ++I) {
+ SDNode *N = *I;
+ for (auto U : N->uses()) {
+ unsigned &UnsortedOps = Degree[U];
+ if (0 == --UnsortedOps)
+ Order.push_back(U);
+ }
+ }
+}
+
+void SelectionDAG::VerifyDAGDiverence() {
+ std::vector<SDNode*> TopoOrder;
+ CreateTopologicalOrder(TopoOrder);
+ const TargetLowering &TLI = getTargetLoweringInfo();
+ DenseMap<const SDNode *, bool> DivergenceMap;
+ for (auto &N : allnodes()) {
+ DivergenceMap[&N] = false;
+ }
+ for (auto N : TopoOrder) {
+ bool IsDivergent = DivergenceMap[N];
+ bool IsSDNodeDivergent = TLI.isSDNodeSourceOfDivergence(N, FLI, DA);
+ for (auto &Op : N->ops()) {
+ if (Op.Val.getValueType() != MVT::Other)
+ IsSDNodeDivergent |= DivergenceMap[Op.getNode()];
+ }
+ if (!IsDivergent && IsSDNodeDivergent && !TLI.isSDNodeAlwaysUniform(N)) {
+ DivergenceMap[N] = true;
+ }
+ }
+ for (auto &N : allnodes()) {
+ (void)N;
+ assert(DivergenceMap[&N] == N.isDivergent() &&
+ "Divergence bit inconsistency detected\n");
+ }
+}
+
+
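Note: CreateTopologicalOrder is a Kahn-style ordering keyed on operand counts, and VerifyDAGDiverence recomputes divergence in one forward pass over that order, asserting the result matches the cached IsDivergent bits. A generic sketch of the same ordering over adjacency lists (not LLVM code; 'Users' plays the role of SDNode uses):

#include <vector>

// InDegree[i] = number of operands of node i; Users[i] = nodes that use i.
std::vector<int> topoOrder(const std::vector<std::vector<int>> &Users,
                           std::vector<int> InDegree) {
  std::vector<int> Order;
  Order.reserve(InDegree.size());
  for (int i = 0, e = (int)InDegree.size(); i != e; ++i)
    if (InDegree[i] == 0)
      Order.push_back(i);            // nodes with no operands come first
  for (size_t Idx = 0; Idx != Order.size(); ++Idx)
    for (int U : Users[Order[Idx]])
      if (--InDegree[U] == 0)
        Order.push_back(U);          // a user is ready once all operands are placed
  return Order;
}

Because every operand precedes its users in this order, a single pass is enough to propagate divergence from sources to the rest of the DAG, which is exactly what the verifier relies on.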
/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone. The same value
/// may appear in both the From and To list. The Deleted vector is
@@ -7450,7 +7901,7 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
}
// Sort the uses, so that all the uses from a given User are together.
- std::sort(Uses.begin(), Uses.end());
+ llvm::sort(Uses.begin(), Uses.end());
for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
UseIndex != UseIndexEnd; ) {
@@ -7579,6 +8030,10 @@ void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
DbgInfo->add(DB, SD, isParameter);
}
+void SelectionDAG::AddDbgLabel(SDDbgLabel *DB) {
+ DbgInfo->add(DB);
+}
+
SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
SDValue NewMemOp) {
assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
@@ -7963,8 +8418,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
const GlobalValue *GV;
int64_t GVOffset = 0;
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
- unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
- KnownBits Known(PtrWidth);
+ unsigned IdxWidth = getDataLayout().getIndexTypeSizeInBits(GV->getType());
+ KnownBits Known(IdxWidth);
llvm::computeKnownBits(GV, Known, getDataLayout());
unsigned AlignBits = Known.countMinTrailingZeros();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
@@ -8198,7 +8653,7 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
return true;
}
-// \brief Returns the SDNode if it is a constant integer BuildVector
+// Returns the SDNode if it is a constant integer BuildVector
// or constant integer.
SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
if (isa<ConstantSDNode>(N))
@@ -8224,6 +8679,26 @@ SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) {
return nullptr;
}
+void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
+ assert(!Node->OperandList && "Node already has operands");
+ SDUse *Ops = OperandRecycler.allocate(
+ ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
+
+ bool IsDivergent = false;
+ for (unsigned I = 0; I != Vals.size(); ++I) {
+ Ops[I].setUser(Node);
+ Ops[I].setInitial(Vals[I]);
+ if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence.
+ IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent();
+ }
+ Node->NumOperands = Vals.size();
+ Node->OperandList = Ops;
+ IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA);
+ if (!TLI->isSDNodeAlwaysUniform(Node))
+ Node->SDNodeBits.IsDivergent = IsDivergent;
+ checkForCycles(Node);
+}
+
#ifndef NDEBUG
static void checkForCyclesHelper(const SDNode *N,
SmallPtrSetImpl<const SDNode*> &Visited,
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index da1574f60524..c859f16e74fe 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -99,16 +99,43 @@ BaseIndexOffset BaseIndexOffset::match(LSBaseSDNode *N,
}
// Consume constant adds & ors with appropriate masking.
- while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
- if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
+ while (true) {
+ switch (Base->getOpcode()) {
+ case ISD::OR:
// Only consider ORs which act as adds.
- if (Base->getOpcode() == ISD::OR &&
- !DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue()))
- break;
- Offset += C->getSExtValue();
- Base = Base->getOperand(0);
- continue;
+ if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1)))
+ if (DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue())) {
+ Offset += C->getSExtValue();
+ Base = Base->getOperand(0);
+ continue;
+ }
+ break;
+ case ISD::ADD:
+ if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
+ Offset += C->getSExtValue();
+ Base = Base->getOperand(0);
+ continue;
+ }
+ break;
+ case ISD::LOAD:
+ case ISD::STORE: {
+ auto *LSBase = cast<LSBaseSDNode>(Base.getNode());
+ unsigned int IndexResNo = (Base->getOpcode() == ISD::LOAD) ? 1 : 0;
+ if (LSBase->isIndexed() && Base.getResNo() == IndexResNo)
+ if (auto *C = dyn_cast<ConstantSDNode>(LSBase->getOffset())) {
+ auto Off = C->getSExtValue();
+ if (LSBase->getAddressingMode() == ISD::PRE_DEC ||
+ LSBase->getAddressingMode() == ISD::POST_DEC)
+ Offset -= Off;
+ else
+ Offset += Off;
+ Base = LSBase->getBasePtr();
+ continue;
+ }
+ break;
+ }
}
+ // If we get here, break out of the loop.
break;
}
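Note: BaseIndexOffset::match now peels three kinds of layers while accumulating a constant offset: plain adds of a constant, ORs that provably act as adds (the masked bits of the other operand are zero), and the pointer result of pre/post-indexed loads and stores, where PRE_DEC/POST_DEC subtract the offset rather than add it. A much-simplified model of the peeling loop (toy types, illustrative only):

// Toy node: either "LHS + Cst" (IsAddOfConstant) or an opaque base pointer.
struct Node {
  bool IsAddOfConstant = false;
  Node *LHS = nullptr;
  long Cst = 0;
};

// Strip "(base + C)" layers, accumulating the constants; the real code applies
// the same idea to OR-as-add and to indexed memory ops, with extra checks.
long peelConstantAdds(Node *&Base) {
  long Offset = 0;
  while (Base->IsAddOfConstant) {
    Offset += Base->Cst;  // accumulate the constant term
    Base = Base->LHS;     // keep walking toward the real base
  }
  return Offset;
}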
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 68bbd62e1321..1aa8df29af3b 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "SelectionDAGBuilder.h"
+#include "SDNodeDbgValue.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -49,7 +50,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -102,6 +102,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -777,8 +778,8 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
EVT ValueVT = ValueVTs[Value];
unsigned NumRegs = RegCount[Value];
MVT RegisterVT = IsABIMangled
- ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
- : RegVTs[Value];
+ ? TLI.getRegisterTypeForCallingConv(*DAG.getContext(), RegVTs[Value])
+ : RegVTs[Value];
Parts.resize(NumRegs);
for (unsigned i = 0; i != NumRegs; ++i) {
@@ -818,32 +819,15 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
// FIXME: We capture more information than the dag can represent. For
// now, just use the tightest assertzext/assertsext possible.
- bool isSExt = true;
+ bool isSExt;
EVT FromVT(MVT::Other);
- if (NumSignBits == RegSize) {
- isSExt = true; // ASSERT SEXT 1
- FromVT = MVT::i1;
- } else if (NumZeroBits >= RegSize - 1) {
- isSExt = false; // ASSERT ZEXT 1
- FromVT = MVT::i1;
- } else if (NumSignBits > RegSize - 8) {
- isSExt = true; // ASSERT SEXT 8
- FromVT = MVT::i8;
- } else if (NumZeroBits >= RegSize - 8) {
- isSExt = false; // ASSERT ZEXT 8
- FromVT = MVT::i8;
- } else if (NumSignBits > RegSize - 16) {
- isSExt = true; // ASSERT SEXT 16
- FromVT = MVT::i16;
- } else if (NumZeroBits >= RegSize - 16) {
- isSExt = false; // ASSERT ZEXT 16
- FromVT = MVT::i16;
- } else if (NumSignBits > RegSize - 32) {
- isSExt = true; // ASSERT SEXT 32
- FromVT = MVT::i32;
- } else if (NumZeroBits >= RegSize - 32) {
- isSExt = false; // ASSERT ZEXT 32
- FromVT = MVT::i32;
+ if (NumZeroBits) {
+ FromVT = EVT::getIntegerVT(*DAG.getContext(), RegSize - NumZeroBits);
+ isSExt = false;
+ } else if (NumSignBits > 1) {
+ FromVT =
+ EVT::getIntegerVT(*DAG.getContext(), RegSize - NumSignBits + 1);
+ isSExt = true;
} else {
continue;
}
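Note: rather than probing a fixed menu of widths (i1/i8/i16/i32), the assert type is now computed directly from the known zero bits or sign bits, which also allows odd widths such as i12 since EVT::getIntegerVT accepts any bit width. Two worked cases with 32-bit registers:

//   RegSize = 32, NumZeroBits = 20:
//     the value fits in 32 - 20 = 12 zero-extended bits   -> AssertZext i12
//   RegSize = 32, NumSignBits = 25:
//     the value is a sign-extension of 32 - 25 + 1 = 8 bits -> AssertSext i8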
@@ -876,8 +860,8 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
unsigned NumParts = RegCount[Value];
MVT RegisterVT = IsABIMangled
- ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
- : RegVTs[Value];
+ ? TLI.getRegisterTypeForCallingConv(*DAG.getContext(), RegVTs[Value])
+ : RegVTs[Value];
if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
ExtendKind = ISD::ZERO_EXTEND;
@@ -970,6 +954,20 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
}
}
+SmallVector<std::pair<unsigned, unsigned>, 4>
+RegsForValue::getRegsAndSizes() const {
+ SmallVector<std::pair<unsigned, unsigned>, 4> OutVec;
+ unsigned I = 0;
+ for (auto CountAndVT : zip_first(RegCount, RegVTs)) {
+ unsigned RegCount = std::get<0>(CountAndVT);
+ MVT RegisterVT = std::get<1>(CountAndVT);
+ unsigned RegisterSize = RegisterVT.getSizeInBits();
+ for (unsigned E = I + RegCount; I != E; ++I)
+ OutVec.push_back(std::make_pair(Regs[I], RegisterSize));
+ }
+ return OutVec;
+}
+
void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa,
const TargetLibraryInfo *li) {
AA = aa;
@@ -1054,6 +1052,22 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
visit(I.getOpcode(), I);
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&I)) {
+ // Propagate the fast-math-flags of this IR instruction to the DAG node that
+ // maps to this instruction.
+ // TODO: We could handle all flags (nsw, etc) here.
+ // TODO: If an IR instruction maps to >1 node, only the final node will have
+ // flags set.
+ if (SDNode *Node = getNodeForIRValue(&I)) {
+ SDNodeFlags IncomingFlags;
+ IncomingFlags.copyFMF(*FPMO);
+ if (!Node->getFlags().isDefined())
+ Node->setFlags(IncomingFlags);
+ else
+ Node->intersectFlagsWith(IncomingFlags);
+ }
+ }
+
if (!isa<TerminatorInst>(&I) && !HasTailCall &&
!isStatepoint(&I)) // statepoints handle their exports internally
CopyToExportRegsIfNeeded(&I);
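Note: after each IR instruction is visited, its fast-math flags are copied onto the DAG node it produced; if the node already carries flags (typically because CSE returned an existing node shared with another instruction), the two sets are intersected so only flags valid for every user survive. A toy model of that intersection (plain struct, not SDNodeFlags):

struct ToyFMF {
  bool NoNaNs = false, NoInfs = false, NoSignedZeros = false;

  // Keep only the flags both users of a shared node agree on.
  void intersectWith(const ToyFMF &Other) {
    NoNaNs &= Other.NoNaNs;
    NoInfs &= Other.NoInfs;
    NoSignedZeros &= Other.NoSignedZeros;
  }
};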
@@ -1077,14 +1091,39 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
}
}
+void SelectionDAGBuilder::dropDanglingDebugInfo(const DILocalVariable *Variable,
+ const DIExpression *Expr) {
+ auto isMatchingDbgValue = [&](DanglingDebugInfo &DDI) {
+ const DbgValueInst *DI = DDI.getDI();
+ DIVariable *DanglingVariable = DI->getVariable();
+ DIExpression *DanglingExpr = DI->getExpression();
+ if (DanglingVariable == Variable && Expr->fragmentsOverlap(DanglingExpr)) {
+ LLVM_DEBUG(dbgs() << "Dropping dangling debug info for " << *DI << "\n");
+ return true;
+ }
+ return false;
+ };
+
+ for (auto &DDIMI : DanglingDebugInfoMap) {
+ DanglingDebugInfoVector &DDIV = DDIMI.second;
+ DDIV.erase(remove_if(DDIV, isMatchingDbgValue), DDIV.end());
+ }
+}
+
// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
// generate the debug data structures now that we've seen its definition.
void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
SDValue Val) {
- DanglingDebugInfo &DDI = DanglingDebugInfoMap[V];
- if (DDI.getDI()) {
+ auto DanglingDbgInfoIt = DanglingDebugInfoMap.find(V);
+ if (DanglingDbgInfoIt == DanglingDebugInfoMap.end())
+ return;
+
+ DanglingDebugInfoVector &DDIV = DanglingDbgInfoIt->second;
+ for (auto &DDI : DDIV) {
const DbgValueInst *DI = DDI.getDI();
+ assert(DI && "Ill-formed DanglingDebugInfo");
DebugLoc dl = DDI.getdl();
+ unsigned ValSDNodeOrder = Val.getNode()->getIROrder();
unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
DILocalVariable *Variable = DI->getVariable();
DIExpression *Expr = DI->getExpression();
@@ -1093,13 +1132,26 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
SDDbgValue *SDV;
if (Val.getNode()) {
if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, false, Val)) {
- SDV = getDbgValue(Val, Variable, Expr, dl, DbgSDNodeOrder);
+ LLVM_DEBUG(dbgs() << "Resolve dangling debug info [order="
+ << DbgSDNodeOrder << "] for:\n " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << " By mapping to:\n "; Val.dump());
+ // Increase the SDNodeOrder for the DbgValue here to make sure it is
+ // inserted after the definition of Val when emitting the instructions
+ // after ISel. An alternative could be to teach
+ // ScheduleDAGSDNodes::EmitSchedule to delay the insertion properly.
+ LLVM_DEBUG(if (ValSDNodeOrder > DbgSDNodeOrder) dbgs()
+ << "changing SDNodeOrder from " << DbgSDNodeOrder << " to "
+ << ValSDNodeOrder << "\n");
+ SDV = getDbgValue(Val, Variable, Expr, dl,
+ std::max(DbgSDNodeOrder, ValSDNodeOrder));
DAG.AddDbgValue(SDV, Val.getNode(), false);
- }
+ } else
+ LLVM_DEBUG(dbgs() << "Resolved dangling debug info for " << *DI
+ << " in EmitFuncArgumentDbgValue\n");
} else
- DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
- DanglingDebugInfoMap[V] = DanglingDebugInfo();
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
}
+ DDIV.clear();
}
/// getCopyFromRegs - If there was virtual register allocated for the value V
@@ -1315,12 +1367,18 @@ void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) {
auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX;
bool IsCoreCLR = Pers == EHPersonality::CoreCLR;
+ bool IsSEH = isAsynchronousEHPersonality(Pers);
+ bool IsWasmCXX = Pers == EHPersonality::Wasm_CXX;
MachineBasicBlock *CatchPadMBB = FuncInfo.MBB;
+ if (!IsSEH)
+ CatchPadMBB->setIsEHScopeEntry();
// In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues.
if (IsMSVCCXX || IsCoreCLR)
CatchPadMBB->setIsEHFuncletEntry();
-
- DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other, getControlRoot()));
+ // Wasm does not need catchpads anymore
+ if (!IsWasmCXX)
+ DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other,
+ getControlRoot()));
}
void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
@@ -1363,7 +1421,8 @@ void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) {
// Don't emit any special code for the cleanuppad instruction. It just marks
- // the start of a funclet.
+ // the start of an EH scope/funclet.
+ FuncInfo.MBB->setIsEHScopeEntry();
FuncInfo.MBB->setIsEHFuncletEntry();
FuncInfo.MBB->setIsCleanupFuncletEntry();
}
@@ -1385,6 +1444,7 @@ static void findUnwindDestinations(
classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX;
bool IsCoreCLR = Personality == EHPersonality::CoreCLR;
+ bool IsSEH = isAsynchronousEHPersonality(Personality);
while (EHPadBB) {
const Instruction *Pad = EHPadBB->getFirstNonPHI();
@@ -1397,6 +1457,7 @@ static void findUnwindDestinations(
// Stop on cleanup pads. Cleanups are always funclet entries for all known
// personalities.
UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
+ UnwindDests.back().first->setIsEHScopeEntry();
UnwindDests.back().first->setIsEHFuncletEntry();
break;
} else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
@@ -1406,6 +1467,8 @@ static void findUnwindDestinations(
// For MSVC++ and the CLR, catchblocks are funclets and need prologues.
if (IsMSVCCXX || IsCoreCLR)
UnwindDests.back().first->setIsEHFuncletEntry();
+ if (!IsSEH)
+ UnwindDests.back().first->setIsEHScopeEntry();
}
NewEHPadBB = CatchSwitch->getUnwindDest();
} else {
@@ -1653,8 +1716,7 @@ SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src,
if (!BPI) {
// If BPI is not available, set the default probability as 1 / N, where N is
// the number of successors.
- auto SuccSize = std::max<uint32_t>(
- std::distance(succ_begin(SrcBB), succ_end(SrcBB)), 1);
+ auto SuccSize = std::max<uint32_t>(succ_size(SrcBB), 1);
return BranchProbability(1, SuccSize);
}
return BPI->getEdgeProbability(SrcBB, DstBB);
@@ -2489,8 +2551,8 @@ void SelectionDAGBuilder::sortAndRangeify(CaseClusterVector &Clusters) {
assert(CC.Low == CC.High && "Input clusters must be single-case");
#endif
- std::sort(Clusters.begin(), Clusters.end(),
- [](const CaseCluster &a, const CaseCluster &b) {
+ llvm::sort(Clusters.begin(), Clusters.end(),
+ [](const CaseCluster &a, const CaseCluster &b) {
return a.Low->getValue().slt(b.Low->getValue());
});
@@ -2551,9 +2613,23 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
}
void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) {
- if (DAG.getTarget().Options.TrapUnreachable)
- DAG.setRoot(
- DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
+ if (!DAG.getTarget().Options.TrapUnreachable)
+ return;
+
+ // We may be able to ignore unreachable behind a noreturn call.
+ if (DAG.getTarget().Options.NoTrapAfterNoreturn) {
+ const BasicBlock &BB = *I.getParent();
+ if (&I != &BB.front()) {
+ BasicBlock::const_iterator PredI =
+ std::prev(BasicBlock::const_iterator(&I));
+ if (const CallInst *Call = dyn_cast<CallInst>(&*PredI)) {
+ if (Call->doesNotReturn())
+ return;
+ }
+ }
+ }
+
+ DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
}
void SelectionDAGBuilder::visitFSub(const User &I) {
@@ -2597,6 +2673,10 @@ static bool isVectorReductionOp(const User *I) {
}
unsigned ElemNum = Inst->getType()->getVectorNumElements();
+ // Ensure the reduction size is a power of 2.
+ if (!isPowerOf2_32(ElemNum))
+ return false;
+
unsigned ElemNumToReduce = ElemNum;
// Do DFS search on the def-use chain from the given instruction. We only
@@ -2682,7 +2762,7 @@ static bool isVectorReductionOp(const User *I) {
return false;
const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
- if (!Val || Val->getZExtValue() != 0)
+ if (!Val || !Val->isZero())
return false;
ReduxExtracted = true;
@@ -2693,45 +2773,23 @@ static bool isVectorReductionOp(const User *I) {
return ReduxExtracted;
}
-void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
- SDValue Op1 = getValue(I.getOperand(0));
- SDValue Op2 = getValue(I.getOperand(1));
-
- bool nuw = false;
- bool nsw = false;
- bool exact = false;
- bool vec_redux = false;
- FastMathFlags FMF;
-
- if (const OverflowingBinaryOperator *OFBinOp =
- dyn_cast<const OverflowingBinaryOperator>(&I)) {
- nuw = OFBinOp->hasNoUnsignedWrap();
- nsw = OFBinOp->hasNoSignedWrap();
+void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
+ SDNodeFlags Flags;
+ if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
+ Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
+ Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
+ }
+ if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
+ Flags.setExact(ExactOp->isExact());
}
- if (const PossiblyExactOperator *ExactOp =
- dyn_cast<const PossiblyExactOperator>(&I))
- exact = ExactOp->isExact();
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
- FMF = FPOp->getFastMathFlags();
-
if (isVectorReductionOp(&I)) {
- vec_redux = true;
- DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
+ Flags.setVectorReduction(true);
+ LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
}
- SDNodeFlags Flags;
- Flags.setExact(exact);
- Flags.setNoSignedWrap(nsw);
- Flags.setNoUnsignedWrap(nuw);
- Flags.setVectorReduction(vec_redux);
- Flags.setAllowReciprocal(FMF.allowReciprocal());
- Flags.setAllowContract(FMF.allowContract());
- Flags.setNoInfs(FMF.noInfs());
- Flags.setNoNaNs(FMF.noNaNs());
- Flags.setNoSignedZeros(FMF.noSignedZeros());
- Flags.setUnsafeAlgebra(FMF.isFast());
-
- SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ SDValue BinNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(),
Op1, Op2, Flags);
setValue(&I, BinNodeValue);
}
@@ -2823,13 +2881,12 @@ void SelectionDAGBuilder::visitFCmp(const User &I) {
predicate = FCmpInst::Predicate(FC->getPredicate());
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
- ISD::CondCode Condition = getFCmpCondCode(predicate);
- // FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them.
- // FIXME: We should propagate the fast-math-flags to the DAG node itself for
- // further optimization, but currently FMF is only applicable to binary nodes.
- if (TM.Options.NoNaNsFPMath)
+ ISD::CondCode Condition = getFCmpCondCode(predicate);
+ auto *FPMO = dyn_cast<FPMathOperator>(&I);
+ if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
Condition = getFCmpCodeWithoutNaN(Condition);
+
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
@@ -3424,10 +3481,9 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
DAG.getConstant(Offset, dl, N.getValueType()), Flags);
}
} else {
- MVT PtrTy =
- DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout(), AS);
- unsigned PtrSize = PtrTy.getSizeInBits();
- APInt ElementSize(PtrSize, DL->getTypeAllocSize(GTI.getIndexedType()));
+ unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS);
+ MVT IdxTy = MVT::getIntegerVT(IdxSize);
+ APInt ElementSize(IdxSize, DL->getTypeAllocSize(GTI.getIndexedType()));
// If this is a scalar constant or a splat vector of constants,
// handle it quickly.
@@ -3439,11 +3495,11 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
if (CI) {
if (CI->isZero())
continue;
- APInt Offs = ElementSize * CI->getValue().sextOrTrunc(PtrSize);
+ APInt Offs = ElementSize * CI->getValue().sextOrTrunc(IdxSize);
LLVMContext &Context = *DAG.getContext();
SDValue OffsVal = VectorWidth ?
- DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, PtrTy, VectorWidth)) :
- DAG.getConstant(Offs, dl, PtrTy);
+ DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorWidth)) :
+ DAG.getConstant(Offs, dl, IdxTy);
// In an inbounds GEP with an offset that is nonnegative even when
// interpreted as signed, assume there is no unsigned overflow.
@@ -3867,7 +3923,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
// extract the splat value and use it as a uniform base.
// In all other cases the function returns 'false'.
static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
- SelectionDAGBuilder* SDB) {
+ SDValue &Scale, SelectionDAGBuilder* SDB) {
SelectionDAG& DAG = SDB->DAG;
LLVMContext &Context = *DAG.getContext();
@@ -3897,6 +3953,10 @@ static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal))
return false;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ Scale = DAG.getTargetConstant(DL.getTypeAllocSize(GEP->getResultElementType()),
+ SDB->getCurSDLoc(), TLI.getPointerTy(DL));
Base = SDB->getValue(Ptr);
Index = SDB->getValue(IndexVal);
@@ -3926,8 +3986,9 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
SDValue Base;
SDValue Index;
+ SDValue Scale;
const Value *BasePtr = Ptr;
- bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
+ bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);
const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
MachineMemOperand *MMO = DAG.getMachineFunction().
@@ -3935,10 +3996,11 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
MachineMemOperand::MOStore, VT.getStoreSize(),
Alignment, AAInfo);
if (!UniformBase) {
- Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
+ Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
- SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index };
+ SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale };
SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
Ops, MMO);
DAG.setRoot(Scatter);
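
The Scale operand threaded through getUniformBase and the gather/scatter node encodes how per-lane addresses are formed: with a uniform base, the effective address of lane i is Base + Index[i] * Scale, where Scale is the allocation size of the GEP's result element type; in the non-uniform fallback above, Base is 0 and Scale is 1 so Index carries the full per-lane pointers. A minimal standalone sketch of that arithmetic (the function name and the example values are illustrative assumptions, not part of the patch):

#include <cstdint>

// EA(i) = Base + Index[i] * Scale, the addressing model implied by the
// {Base, Index, Scale} operands of the masked gather/scatter nodes.
static uint64_t laneAddress(uint64_t Base, int64_t Index, uint64_t Scale) {
  return Base + static_cast<uint64_t>(Index) * Scale;
}

int main() {
  // E.g. gathering i32 elements from a uniform base with indices {0, 3, 7}:
  const uint64_t Base = 0x1000, Scale = sizeof(uint32_t);
  const int64_t Idx[] = {0, 3, 7};
  uint64_t Sum = 0;
  for (int64_t I : Idx)
    Sum += laneAddress(Base, I, Scale); // 0x1000, 0x100C, 0x101C
  return Sum == 0x1000 + 0x100C + 0x101C ? 0 : 1;
}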
@@ -3997,10 +4059,8 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
ISD::NON_EXTLOAD, IsExpanding);
- if (AddToChain) {
- SDValue OutChain = Load.getValue(1);
- DAG.setRoot(OutChain);
- }
+ if (AddToChain)
+ PendingLoads.push_back(Load.getValue(1));
setValue(&I, Load);
}
@@ -4025,8 +4085,9 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
SDValue Root = DAG.getRoot();
SDValue Base;
SDValue Index;
+ SDValue Scale;
const Value *BasePtr = Ptr;
- bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
+ bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);
bool ConstantMemory = false;
if (UniformBase &&
AA && AA->pointsToConstantMemory(MemoryLocation(
@@ -4044,10 +4105,11 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
Alignment, AAInfo, Ranges);
if (!UniformBase) {
- Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
+ Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
- SDValue Ops[] = { Root, Src0, Mask, Base, Index };
+ SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
Ops, MMO);
@@ -4868,26 +4930,18 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
const auto &TLI = DAG.getTargetLoweringInfo();
RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second,
V->getType(), isABIRegCopy(V));
- unsigned NumRegs =
- std::accumulate(RFV.RegCount.begin(), RFV.RegCount.end(), 0);
- if (NumRegs > 1) {
- unsigned I = 0;
+ if (RFV.occupiesMultipleRegs()) {
unsigned Offset = 0;
- auto RegisterVT = RFV.RegVTs.begin();
- for (auto RegCount : RFV.RegCount) {
- unsigned RegisterSize = (RegisterVT++)->getSizeInBits();
- for (unsigned E = I + RegCount; I != E; ++I) {
- // The vregs are guaranteed to be allocated in sequence.
- Op = MachineOperand::CreateReg(VMI->second + I, false);
- auto FragmentExpr = DIExpression::createFragmentExpression(
- Expr, Offset, RegisterSize);
- if (!FragmentExpr)
- continue;
- FuncInfo.ArgDbgValues.push_back(
- BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
- Op->getReg(), Variable, *FragmentExpr));
- Offset += RegisterSize;
- }
+ for (auto RegAndSize : RFV.getRegsAndSizes()) {
+ Op = MachineOperand::CreateReg(RegAndSize.first, false);
+ auto FragmentExpr = DIExpression::createFragmentExpression(
+ Expr, Offset, RegAndSize.second);
+ if (!FragmentExpr)
+ continue;
+ FuncInfo.ArgDbgValues.push_back(
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
+ Op->getReg(), Variable, *FragmentExpr));
+ Offset += RegAndSize.second;
}
return true;
}
@@ -4901,17 +4955,10 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
assert(Variable->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
- if (Op->isReg())
- FuncInfo.ArgDbgValues.push_back(
- BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
- Op->getReg(), Variable, Expr));
- else
- FuncInfo.ArgDbgValues.push_back(
- BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE))
- .add(*Op)
- .addImm(0)
- .addMetadata(Variable)
- .addMetadata(Expr));
+ IsIndirect = (Op->isReg()) ? IsIndirect : true;
+ FuncInfo.ArgDbgValues.push_back(
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
+ *Op, Variable, Expr));
return true;
}
@@ -4924,13 +4971,20 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
unsigned DbgSDNodeOrder) {
if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) {
// Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe
- // stack slot locations as such instead of as indirectly addressed
- // locations.
- return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(), dl,
- DbgSDNodeOrder);
+ // stack slot locations.
+ //
+ // Consider "int x = 0; int *px = &x;". There are two kinds of interesting
+ // debug values here after optimization:
+ //
+ // dbg.value(i32* %px, !"int *px", !DIExpression()), and
+ // dbg.value(i32* %px, !"int x", !DIExpression(DW_OP_deref))
+ //
+ // Both describe the direct values of their associated variables.
+ return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(),
+ /*IsIndirect*/ false, dl, DbgSDNodeOrder);
}
- return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, dl,
- DbgSDNodeOrder);
+ return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(),
+ /*IsIndirect*/ false, dl, DbgSDNodeOrder);
}
// VisualStudio defines setjmp as _setjmp
@@ -5000,14 +5054,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::longjmp:
return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
case Intrinsic::memcpy: {
+ const auto &MCI = cast<MemCpyInst>(I);
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
- unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
- if (!Align)
- Align = 1; // @llvm.memcpy defines 0 and 1 to both mean no alignment.
- bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+ // @llvm.memcpy defines 0 and 1 to both mean no alignment.
+ unsigned DstAlign = std::max<unsigned>(MCI.getDestAlignment(), 1);
+ unsigned SrcAlign = std::max<unsigned>(MCI.getSourceAlignment(), 1);
+ unsigned Align = MinAlign(DstAlign, SrcAlign);
+ bool isVol = MCI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ // FIXME: Support passing different dest/src alignments to the memcpy DAG
+ // node.
SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
false, isTC,
MachinePointerInfo(I.getArgOperand(0)),
@@ -5016,13 +5074,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
case Intrinsic::memset: {
+ const auto &MSI = cast<MemSetInst>(I);
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
- unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
- if (!Align)
- Align = 1; // @llvm.memset defines 0 and 1 to both mean no alignment.
- bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+ // @llvm.memset defines 0 and 1 to both mean no alignment.
+ unsigned Align = std::max<unsigned>(MSI.getDestAlignment(), 1);
+ bool isVol = MSI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
isTC, MachinePointerInfo(I.getArgOperand(0)));
@@ -5030,14 +5088,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
case Intrinsic::memmove: {
+ const auto &MMI = cast<MemMoveInst>(I);
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
- unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
- if (!Align)
- Align = 1; // @llvm.memmove defines 0 and 1 to both mean no alignment.
- bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+ // @llvm.memmove defines 0 and 1 to both mean no alignment.
+ unsigned DstAlign = std::max<unsigned>(MMI.getDestAlignment(), 1);
+ unsigned SrcAlign = std::max<unsigned>(MMI.getSourceAlignment(), 1);
+ unsigned Align = MinAlign(DstAlign, SrcAlign);
+ bool isVol = MMI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ // FIXME: Support passing different dest/src alignments to the memmove DAG
+ // node.
SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
isTC, MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));
@@ -5050,36 +5112,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Src = getValue(MI.getRawSource());
SDValue Length = getValue(MI.getLength());
- // Emit a library call.
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
-
- Entry.Node = Src;
- Args.push_back(Entry);
-
- Entry.Ty = MI.getLength()->getType();
- Entry.Node = Length;
- Args.push_back(Entry);
-
- uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
- RTLIB::Libcall LibraryCall =
- RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
- if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
- report_fatal_error("Unsupported element size");
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
- TLI.getLibcallCallingConv(LibraryCall),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
-
- std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
- DAG.setRoot(CallResult.second);
+ unsigned DstAlign = MI.getDestAlignment();
+ unsigned SrcAlign = MI.getSourceAlignment();
+ Type *LengthTy = MI.getLength()->getType();
+ unsigned ElemSz = MI.getElementSizeInBytes();
+ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ SDValue MC = DAG.getAtomicMemcpy(getRoot(), sdl, Dst, DstAlign, Src,
+ SrcAlign, Length, LengthTy, ElemSz, isTC,
+ MachinePointerInfo(MI.getRawDest()),
+ MachinePointerInfo(MI.getRawSource()));
+ updateDAGForMaybeTailCall(MC);
return nullptr;
}
case Intrinsic::memmove_element_unordered_atomic: {
@@ -5088,36 +5130,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Src = getValue(MI.getRawSource());
SDValue Length = getValue(MI.getLength());
- // Emit a library call.
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
-
- Entry.Node = Src;
- Args.push_back(Entry);
-
- Entry.Ty = MI.getLength()->getType();
- Entry.Node = Length;
- Args.push_back(Entry);
-
- uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
- RTLIB::Libcall LibraryCall =
- RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
- if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
- report_fatal_error("Unsupported element size");
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
- TLI.getLibcallCallingConv(LibraryCall),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
-
- std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
- DAG.setRoot(CallResult.second);
+ unsigned DstAlign = MI.getDestAlignment();
+ unsigned SrcAlign = MI.getSourceAlignment();
+ Type *LengthTy = MI.getLength()->getType();
+ unsigned ElemSz = MI.getElementSizeInBytes();
+ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ SDValue MC = DAG.getAtomicMemmove(getRoot(), sdl, Dst, DstAlign, Src,
+ SrcAlign, Length, LengthTy, ElemSz, isTC,
+ MachinePointerInfo(MI.getRawDest()),
+ MachinePointerInfo(MI.getRawSource()));
+ updateDAGForMaybeTailCall(MC);
return nullptr;
}
case Intrinsic::memset_element_unordered_atomic: {
@@ -5126,37 +5148,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Val = getValue(MI.getValue());
SDValue Length = getValue(MI.getLength());
- // Emit a library call.
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
-
- Entry.Ty = Type::getInt8Ty(*DAG.getContext());
- Entry.Node = Val;
- Args.push_back(Entry);
-
- Entry.Ty = MI.getLength()->getType();
- Entry.Node = Length;
- Args.push_back(Entry);
-
- uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
- RTLIB::Libcall LibraryCall =
- RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
- if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
- report_fatal_error("Unsupported element size");
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
- TLI.getLibcallCallingConv(LibraryCall),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
-
- std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
- DAG.setRoot(CallResult.second);
+ unsigned DstAlign = MI.getDestAlignment();
+ Type *LengthTy = MI.getLength()->getType();
+ unsigned ElemSz = MI.getElementSizeInBytes();
+ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ SDValue MC = DAG.getAtomicMemset(getRoot(), sdl, Dst, DstAlign, Val, Length,
+ LengthTy, ElemSz, isTC,
+ MachinePointerInfo(MI.getRawDest()));
+ updateDAGForMaybeTailCall(MC);
return nullptr;
}
case Intrinsic::dbg_addr:
@@ -5164,13 +5163,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
const DbgInfoIntrinsic &DI = cast<DbgInfoIntrinsic>(I);
DILocalVariable *Variable = DI.getVariable();
DIExpression *Expression = DI.getExpression();
+ dropDanglingDebugInfo(Variable, Expression);
assert(Variable && "Missing variable");
// Check if address has undef value.
const Value *Address = DI.getVariableLocation();
if (!Address || isa<UndefValue>(Address) ||
(Address->use_empty() && !isa<Argument>(Address))) {
- DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
return nullptr;
}
@@ -5195,10 +5195,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// DBG_VALUE instructions. llvm.dbg.declare is handled as a frame index in
// the MachineFunction variable table.
if (FI != std::numeric_limits<int>::max()) {
- if (Intrinsic == Intrinsic::dbg_addr)
- DAG.AddDbgValue(DAG.getFrameIndexDbgValue(Variable, Expression, FI, dl,
- SDNodeOrder),
- getRoot().getNode(), isParameter);
+ if (Intrinsic == Intrinsic::dbg_addr) {
+ SDDbgValue *SDV = DAG.getFrameIndexDbgValue(
+ Variable, Expression, FI, /*IsIndirect*/ true, dl, SDNodeOrder);
+ DAG.AddDbgValue(SDV, getRoot().getNode(), isParameter);
+ }
return nullptr;
}
@@ -5214,8 +5215,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
if (isParameter && FINode) {
// Byval parameter. We have a frame index at this point.
- SDV = DAG.getFrameIndexDbgValue(Variable, Expression,
- FINode->getIndex(), dl, SDNodeOrder);
+ SDV =
+ DAG.getFrameIndexDbgValue(Variable, Expression, FINode->getIndex(),
+ /*IsIndirect*/ true, dl, SDNodeOrder);
} else if (isa<Argument>(Address)) {
// Address is an argument, so try to emit its dbg value using
// virtual register info from the FuncInfo.ValueMap.
@@ -5231,17 +5233,28 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// virtual register info from the FuncInfo.ValueMap.
if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true,
N)) {
- DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
}
}
return nullptr;
}
+ case Intrinsic::dbg_label: {
+ const DbgLabelInst &DI = cast<DbgLabelInst>(I);
+ DILabel *Label = DI.getLabel();
+ assert(Label && "Missing label");
+
+ SDDbgLabel *SDV;
+ SDV = DAG.getDbgLabel(Label, dl, SDNodeOrder);
+ DAG.AddDbgLabel(SDV);
+ return nullptr;
+ }
case Intrinsic::dbg_value: {
const DbgValueInst &DI = cast<DbgValueInst>(I);
assert(DI.getVariable() && "Missing variable");
DILocalVariable *Variable = DI.getVariable();
DIExpression *Expression = DI.getExpression();
+ dropDanglingDebugInfo(Variable, Expression);
const Value *V = DI.getValue();
if (!V)
return nullptr;
@@ -5266,16 +5279,64 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
+  // PHI nodes have already been selected, so we should know which VReg it
+  // is assigned to already.
+ if (isa<PHINode>(V)) {
+ auto VMI = FuncInfo.ValueMap.find(V);
+ if (VMI != FuncInfo.ValueMap.end()) {
+ unsigned Reg = VMI->second;
+ // The PHI node may be split up into several MI PHI nodes (in
+ // FunctionLoweringInfo::set).
+ RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
+ V->getType(), false);
+ if (RFV.occupiesMultipleRegs()) {
+ unsigned Offset = 0;
+ unsigned BitsToDescribe = 0;
+ if (auto VarSize = Variable->getSizeInBits())
+ BitsToDescribe = *VarSize;
+ if (auto Fragment = Expression->getFragmentInfo())
+ BitsToDescribe = Fragment->SizeInBits;
+ for (auto RegAndSize : RFV.getRegsAndSizes()) {
+ unsigned RegisterSize = RegAndSize.second;
+ // Bail out if all bits are described already.
+ if (Offset >= BitsToDescribe)
+ break;
+ unsigned FragmentSize = (Offset + RegisterSize > BitsToDescribe)
+ ? BitsToDescribe - Offset
+ : RegisterSize;
+ auto FragmentExpr = DIExpression::createFragmentExpression(
+ Expression, Offset, FragmentSize);
+ if (!FragmentExpr)
+ continue;
+ SDV = DAG.getVRegDbgValue(Variable, *FragmentExpr, RegAndSize.first,
+ false, dl, SDNodeOrder);
+ DAG.AddDbgValue(SDV, nullptr, false);
+ Offset += RegisterSize;
+ }
+ } else {
+ SDV = DAG.getVRegDbgValue(Variable, Expression, Reg, false, dl,
+ SDNodeOrder);
+ DAG.AddDbgValue(SDV, nullptr, false);
+ }
+ return nullptr;
+ }
+ }
+
+ // TODO: When we get here we will either drop the dbg.value completely, or
+  // we try to move it forward by letting it dangle for a while. So we should
+ // probably add an extra DbgValue to the DAG here, with a reference to
+ // "noreg", to indicate that we have lost the debug location for the
+ // variable.
+
if (!V->use_empty() ) {
// Do not call getValue(V) yet, as we don't want to generate code.
// Remember it for later.
- DanglingDebugInfo DDI(&DI, dl, SDNodeOrder);
- DanglingDebugInfoMap[V] = DDI;
+ DanglingDebugInfoMap[V].emplace_back(&DI, dl, SDNodeOrder);
return nullptr;
}
- DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n");
- DEBUG(dbgs() << " Last seen at:\n " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n");
+ LLVM_DEBUG(dbgs() << " Last seen at:\n " << *V << "\n");
return nullptr;
}
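
The fragment loop added above splits a variable that lives in several registers into one DIExpression fragment per register, clamping the last fragment to the number of bits still left to describe. A small standalone sketch of that offset/size arithmetic (the helper name and the example split are assumptions for illustration, not part of the patch):

#include <cstdio>
#include <utility>
#include <vector>

// Given the bit widths of the registers holding a value and the number of
// bits of the variable (or fragment) to describe, produce one
// (OffsetInBits, SizeInBits) pair per register until everything is covered.
static std::vector<std::pair<unsigned, unsigned>>
splitIntoFragments(const std::vector<unsigned> &RegSizes,
                   unsigned BitsToDescribe) {
  std::vector<std::pair<unsigned, unsigned>> Fragments;
  unsigned Offset = 0;
  for (unsigned RegisterSize : RegSizes) {
    if (Offset >= BitsToDescribe)
      break; // All bits are described already.
    unsigned FragmentSize = (Offset + RegisterSize > BitsToDescribe)
                                ? BitsToDescribe - Offset
                                : RegisterSize;
    Fragments.push_back({Offset, FragmentSize});
    Offset += RegisterSize;
  }
  return Fragments;
}

int main() {
  // E.g. a 64-bit variable split across two 32-bit vregs yields the
  // fragments (0, 32) and (32, 32).
  for (const auto &F : splitIntoFragments({32, 32}, 64))
    std::printf("fragment offset=%u size=%u\n", F.first, F.second);
  return 0;
}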
@@ -5609,6 +5670,52 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
return nullptr;
}
+ case Intrinsic::fshl:
+ case Intrinsic::fshr: {
+ bool IsFSHL = Intrinsic == Intrinsic::fshl;
+ SDValue X = getValue(I.getArgOperand(0));
+ SDValue Y = getValue(I.getArgOperand(1));
+ SDValue Z = getValue(I.getArgOperand(2));
+ EVT VT = X.getValueType();
+
+ // When X == Y, this is rotate. Create the node directly if legal.
+ // TODO: This should also be done if the operation is custom, but we have
+ // to make sure targets are handling the modulo shift amount as expected.
+ // TODO: If the rotate direction (left or right) corresponding to the shift
+ // is not available, adjust the shift value and invert the direction.
+ auto RotateOpcode = IsFSHL ? ISD::ROTL : ISD::ROTR;
+ if (X == Y && TLI.isOperationLegal(RotateOpcode, VT)) {
+ setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, Z));
+ return nullptr;
+ }
+
+ // Get the shift amount and inverse shift amount, modulo the bit-width.
+ SDValue BitWidthC = DAG.getConstant(VT.getScalarSizeInBits(), sdl, VT);
+ SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
+ SDValue NegZ = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, Z);
+ SDValue InvShAmt = DAG.getNode(ISD::UREM, sdl, VT, NegZ, BitWidthC);
+
+ // fshl: (X << (Z % BW)) | (Y >> ((BW - Z) % BW))
+ // fshr: (X << ((BW - Z) % BW)) | (Y >> (Z % BW))
+ SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : InvShAmt);
+ SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, Y, IsFSHL ? InvShAmt : ShAmt);
+ SDValue Res = DAG.getNode(ISD::OR, sdl, VT, ShX, ShY);
+
+ // If (Z % BW == 0), then (BW - Z) % BW is also zero, so the result would
+ // be X | Y. If X == Y (rotate), that's fine. If not, we have to select.
+ if (X != Y) {
+ SDValue Zero = DAG.getConstant(0, sdl, VT);
+ EVT CCVT = MVT::i1;
+ if (VT.isVector())
+ CCVT = EVT::getVectorVT(*Context, CCVT, VT.getVectorNumElements());
+ // For fshl, 0 shift returns the 1st arg (X).
+ // For fshr, 0 shift returns the 2nd arg (Y).
+ SDValue IsZeroShift = DAG.getSetCC(sdl, CCVT, ShAmt, Zero, ISD::SETEQ);
+ Res = DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Res);
+ }
+ setValue(&I, Res);
+ return nullptr;
+ }
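
A rough scalar model of the generic expansion above, matching the fshl/fshr formulas in the comments, assuming 32-bit operands; the function names and test values are illustrative assumptions, not part of the patch:

#include <cassert>
#include <cstdint>

// fshl: (X << (Z % BW)) | (Y >> ((BW - Z) % BW)); a zero shift returns X.
static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  const unsigned BW = 32;
  unsigned ShAmt = Z % BW;
  if (ShAmt == 0)
    return X; // (BW - Z) % BW is also 0, so avoid an out-of-range shift.
  return (X << ShAmt) | (Y >> (BW - ShAmt));
}

// fshr: (X << ((BW - Z) % BW)) | (Y >> (Z % BW)); a zero shift returns Y.
static uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
  const unsigned BW = 32;
  unsigned ShAmt = Z % BW;
  if (ShAmt == 0)
    return Y;
  return (X << (BW - ShAmt)) | (Y >> ShAmt);
}

int main() {
  assert(fshl32(0x12345678u, 0x9ABCDEF0u, 8) == 0x3456789Au);
  assert(fshr32(0x12345678u, 0x9ABCDEF0u, 8) == 0x789ABCDEu);
  return 0;
}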
case Intrinsic::stacksave: {
SDValue Op = getRoot();
Res = DAG.getNode(
@@ -5703,7 +5810,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
}
case Intrinsic::annotation:
case Intrinsic::ptr_annotation:
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
// Drop the intrinsic, but forward the value
setValue(&I, getValue(I.getOperand(0)));
return nullptr;
@@ -5822,17 +5930,23 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Ops[5];
unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
auto Flags = rw == 0 ? MachineMemOperand::MOLoad :MachineMemOperand::MOStore;
- Ops[0] = getRoot();
+ Ops[0] = DAG.getRoot();
Ops[1] = getValue(I.getArgOperand(0));
Ops[2] = getValue(I.getArgOperand(1));
Ops[3] = getValue(I.getArgOperand(2));
Ops[4] = getValue(I.getArgOperand(3));
- DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
- DAG.getVTList(MVT::Other), Ops,
- EVT::getIntegerVT(*Context, 8),
- MachinePointerInfo(I.getArgOperand(0)),
- 0, /* align */
- Flags));
+ SDValue Result = DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
+ DAG.getVTList(MVT::Other), Ops,
+ EVT::getIntegerVT(*Context, 8),
+ MachinePointerInfo(I.getArgOperand(0)),
+ 0, /* align */
+ Flags);
+
+  // Chain the prefetch in parallel with any pending loads, to stay out of
+ // the way of later optimizations.
+ PendingLoads.push_back(Result);
+ Result = getRoot();
+ DAG.setRoot(Result);
return nullptr;
}
case Intrinsic::lifetime_start:
@@ -6004,6 +6118,41 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
setValue(&I, patchableNode);
return nullptr;
}
+ case Intrinsic::xray_typedevent: {
+ // Here we want to make sure that the intrinsic behaves as if it has a
+ // specific calling convention, and only for x86_64.
+ // FIXME: Support other platforms later.
+ const auto &Triple = DAG.getTarget().getTargetTriple();
+ if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
+ return nullptr;
+
+ SDLoc DL = getCurSDLoc();
+ SmallVector<SDValue, 8> Ops;
+
+ // We want to say that we always want the arguments in registers.
+ // It's unclear to me how manipulating the selection DAG here forces callers
+ // to provide arguments in registers instead of on the stack.
+ SDValue LogTypeId = getValue(I.getArgOperand(0));
+ SDValue LogEntryVal = getValue(I.getArgOperand(1));
+ SDValue StrSizeVal = getValue(I.getArgOperand(2));
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Chain = getRoot();
+ Ops.push_back(LogTypeId);
+ Ops.push_back(LogEntryVal);
+ Ops.push_back(StrSizeVal);
+ Ops.push_back(Chain);
+
+ // We need to enforce the calling convention for the callsite, so that
+ // argument ordering is enforced correctly, and that register allocation can
+ // see that some registers may be assumed clobbered and have to preserve
+ // them across calls to the intrinsic.
+ MachineSDNode *MN = DAG.getMachineNode(
+ TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, DL, NodeTys, Ops);
+ SDValue patchableNode = SDValue(MN, 0);
+ DAG.setRoot(patchableNode);
+ setValue(&I, patchableNode);
+ return nullptr;
+ }
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return nullptr;
@@ -6023,6 +6172,66 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::experimental_vector_reduce_fmin:
visitVectorReduce(I, Intrinsic);
return nullptr;
+
+ case Intrinsic::icall_branch_funnel: {
+ SmallVector<SDValue, 16> Ops;
+ Ops.push_back(DAG.getRoot());
+ Ops.push_back(getValue(I.getArgOperand(0)));
+
+ int64_t Offset;
+ auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ I.getArgOperand(1), Offset, DAG.getDataLayout()));
+ if (!Base)
+ report_fatal_error(
+ "llvm.icall.branch.funnel operand must be a GlobalValue");
+ Ops.push_back(DAG.getTargetGlobalAddress(Base, getCurSDLoc(), MVT::i64, 0));
+
+ struct BranchFunnelTarget {
+ int64_t Offset;
+ SDValue Target;
+ };
+ SmallVector<BranchFunnelTarget, 8> Targets;
+
+ for (unsigned Op = 1, N = I.getNumArgOperands(); Op != N; Op += 2) {
+ auto *ElemBase = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ I.getArgOperand(Op), Offset, DAG.getDataLayout()));
+ if (ElemBase != Base)
+ report_fatal_error("all llvm.icall.branch.funnel operands must refer "
+ "to the same GlobalValue");
+
+ SDValue Val = getValue(I.getArgOperand(Op + 1));
+ auto *GA = dyn_cast<GlobalAddressSDNode>(Val);
+ if (!GA)
+ report_fatal_error(
+ "llvm.icall.branch.funnel operand must be a GlobalValue");
+ Targets.push_back({Offset, DAG.getTargetGlobalAddress(
+ GA->getGlobal(), getCurSDLoc(),
+ Val.getValueType(), GA->getOffset())});
+ }
+ llvm::sort(Targets.begin(), Targets.end(),
+ [](const BranchFunnelTarget &T1, const BranchFunnelTarget &T2) {
+ return T1.Offset < T2.Offset;
+ });
+
+ for (auto &T : Targets) {
+ Ops.push_back(DAG.getTargetConstant(T.Offset, getCurSDLoc(), MVT::i32));
+ Ops.push_back(T.Target);
+ }
+
+ SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL,
+ getCurSDLoc(), MVT::Other, Ops),
+ 0);
+ DAG.setRoot(N);
+ setValue(&I, N);
+ HasTailCall = true;
+ return nullptr;
+ }
+
+ case Intrinsic::wasm_landingpad_index: {
+ // TODO store landing pad index in a map, which will be used when generating
+ // LSDA information
+ return nullptr;
+ }
}
}
@@ -6172,7 +6381,10 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel));
// Inform MachineModuleInfo of range.
- if (MF.hasEHFunclets()) {
+ auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ // There is a platform (e.g. wasm) that uses funclet style IR but does not
+ // actually use outlined funclets and their LSDA info style.
+ if (MF.hasEHFunclets() && isFuncletEHPersonality(Pers)) {
assert(CLI.CS);
WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
@@ -6630,14 +6842,13 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
const char *RenameFn = nullptr;
if (Function *F = I.getCalledFunction()) {
if (F->isDeclaration()) {
- if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) {
- if (unsigned IID = II->getIntrinsicID(F)) {
- RenameFn = visitIntrinsicCall(I, IID);
- if (!RenameFn)
- return;
- }
- }
- if (Intrinsic::ID IID = F->getIntrinsicID()) {
+ // Is this an LLVM intrinsic or a target-specific intrinsic?
+ unsigned IID = F->getIntrinsicID();
+ if (!IID)
+ if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo())
+ IID = II->getIntrinsicID(F);
+
+ if (IID) {
RenameFn = visitIntrinsicCall(I, IID);
if (!RenameFn)
return;
@@ -6989,27 +7200,37 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
unsigned NumRegs = 1;
if (OpInfo.ConstraintVT != MVT::Other) {
- // If this is a FP input in an integer register (or visa versa) insert a bit
- // cast of the input value. More generally, handle any case where the input
- // value disagrees with the register class we plan to stick this in.
- if (OpInfo.Type == InlineAsm::isInput && PhysReg.second &&
+    // If this is a FP operand in an integer register (or vice versa), or more
+ // generally if the operand value disagrees with the register class we plan
+ // to stick it in, fix the operand type.
+ //
+ // If this is an input value, the bitcast to the new type is done now.
+ // Bitcast for output value is done at the end of visitInlineAsm().
+ if ((OpInfo.Type == InlineAsm::isOutput ||
+ OpInfo.Type == InlineAsm::isInput) &&
+ PhysReg.second &&
!TRI.isTypeLegalForClass(*PhysReg.second, OpInfo.ConstraintVT)) {
// Try to convert to the first EVT that the reg class contains. If the
// types are identical size, use a bitcast to convert (e.g. two differing
- // vector types).
+ // vector types). Note: output bitcast is done at the end of
+ // visitInlineAsm().
MVT RegVT = *TRI.legalclasstypes_begin(*PhysReg.second);
- if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) {
- OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
- RegVT, OpInfo.CallOperand);
+ if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) {
+ // Exclude indirect inputs while they are unsupported because the code
+ // to perform the load is missing and thus OpInfo.CallOperand still
+        // refers to the input address rather than the pointed-to value.
+ if (OpInfo.Type == InlineAsm::isInput && !OpInfo.isIndirect)
+ OpInfo.CallOperand =
+ DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand);
OpInfo.ConstraintVT = RegVT;
+ // If the operand is a FP value and we want it in integer registers,
+ // use the corresponding integer type. This turns an f64 value into
+ // i64, which can be passed with two i32 values on a 32-bit machine.
} else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
- // If the input is a FP value and we want it in FP registers, do a
- // bitcast to the corresponding integer type. This turns an f64 value
- // into i64, which can be passed with two i32 values on a 32-bit
- // machine.
RegVT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
- OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
- RegVT, OpInfo.CallOperand);
+ if (OpInfo.Type == InlineAsm::isInput)
+ OpInfo.CallOperand =
+ DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand);
OpInfo.ConstraintVT = RegVT;
}
}
@@ -7246,7 +7467,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
continue;
// If this is a memory input, and if the operand is not indirect, do what we
- // need to to provide an address for the memory input.
+ // need to provide an address for the memory input.
if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
!OpInfo.isIndirect) {
assert((OpInfo.isMultipleAlternative ||
@@ -7521,12 +7742,18 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
EVT ResultType = TLI.getValueType(DAG.getDataLayout(), CS.getType());
- // If any of the results of the inline asm is a vector, it may have the
- // wrong width/num elts. This can happen for register classes that can
- // contain multiple different value types. The preg or vreg allocated may
- // not have the same VT as was expected. Convert it to the right type
- // with bit_convert.
- if (ResultType != Val.getValueType() && Val.getValueType().isVector()) {
+ // If the type of the inline asm call site return value is different but
+    // has the same size as the type of the asm output, bitcast it. One example
+ // of this is for vectors with different width / number of elements.
+ // This can happen for register classes that can contain multiple
+ // different value types. The preg or vreg allocated may not have the
+ // same VT as was expected.
+ //
+ // This can also happen for a return value that disagrees with the
+    // register class it is put in, e.g. a double in a general-purpose
+ // register on a 32-bit machine.
+ if (ResultType != Val.getValueType() &&
+ ResultType.getSizeInBits() == Val.getValueSizeInBits()) {
Val = DAG.getNode(ISD::BITCAST, getCurSDLoc(),
ResultType, Val);
@@ -7581,8 +7808,17 @@ void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
// Make sure we leave the DAG in a valid state
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- auto VT = TLI.getValueType(DAG.getDataLayout(), CS.getType());
- setValue(CS.getInstruction(), DAG.getUNDEF(VT));
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
+
+ if (ValueVTs.empty())
+ return;
+
+ SmallVector<SDValue, 1> Ops;
+ for (unsigned i = 0, e = ValueVTs.size(); i != e; ++i)
+ Ops.push_back(DAG.getUNDEF(ValueVTs[i]));
+
+ setValue(CS.getInstruction(), DAG.getMergeValues(Ops, getCurSDLoc()));
}
void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
@@ -7656,7 +7892,7 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
return DAG.getMergeValues(Ops, SL);
}
-/// \brief Populate a CallLowerinInfo (into \p CLI) based on the properties of
+/// Populate a CallLoweringInfo (into \p CLI) based on the properties of
/// the call being lowered.
///
/// This is a helper for lowering intrinsics that follow a target calling
@@ -7680,7 +7916,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
TargetLowering::ArgListEntry Entry;
Entry.Node = getValue(V);
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, ArgIdx);
+ Entry.setAttributes(&CS, ArgI);
Args.push_back(Entry);
}
@@ -7691,7 +7927,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
.setIsPatchPoint(IsPatchPoint);
}
-/// \brief Add a stack map intrinsic call's live variable operands to a stackmap
+/// Add a stack map intrinsic call's live variable operands to a stackmap
/// or patchpoint target node's operand list.
///
/// Constants are converted to TargetConstants purely as an optimization to
@@ -7727,7 +7963,7 @@ static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
}
}
-/// \brief Lower llvm.experimental.stackmap directly to its target opcode.
+/// Lower llvm.experimental.stackmap directly to its target opcode.
void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
// void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
// [live variables...])
@@ -7790,7 +8026,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
FuncInfo.MF->getFrameInfo().setHasStackMap();
}
-/// \brief Lower llvm.experimental.patchpoint directly to its target opcode.
+/// Lower llvm.experimental.patchpoint directly to its target opcode.
void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
const BasicBlock *EHPadBB) {
// void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
@@ -7954,8 +8190,6 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
FastMathFlags FMF;
if (isa<FPMathOperator>(I))
FMF = I.getFastMathFlags();
- SDNodeFlags SDFlags;
- SDFlags.setNoNaNs(FMF.noNaNs());
switch (Intrinsic) {
case Intrinsic::experimental_vector_reduce_fadd:
@@ -7998,10 +8232,10 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_fmax:
- Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_fmin:
- Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1);
break;
default:
llvm_unreachable("Unhandled vector reduce intrinsic");
@@ -8220,8 +8454,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
else if (Args[i].IsZExt)
ExtendKind = ISD::ZERO_EXTEND;
- // Conservatively only handle 'returned' on non-vectors for now
- if (Args[i].IsReturned && !Op.getValueType().isVector()) {
+ // Conservatively only handle 'returned' on non-vectors that can be lowered,
+ // for now.
+ if (Args[i].IsReturned && !Op.getValueType().isVector() &&
+ CanLowerReturn) {
assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues &&
"unexpected use of 'returned'");
// Before passing 'returned' to the target lowering code, ensure that
@@ -8500,7 +8736,8 @@ findArgumentCopyElisionCandidates(const DataLayout &DL,
continue;
}
- DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n');
+ LLVM_DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI
+ << '\n');
// Mark this alloca and store for argument copy elision.
*Info = StaticAllocaInfo::Elidable;
@@ -8541,8 +8778,9 @@ static void tryToElideArgumentCopy(
int OldIndex = AllocaIndex;
MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
- DEBUG(dbgs() << " argument copy elision failed due to bad fixed stack "
- "object size\n");
+ LLVM_DEBUG(
+ dbgs() << " argument copy elision failed due to bad fixed stack "
+ "object size\n");
return;
}
unsigned RequiredAlignment = AI->getAlignment();
@@ -8551,16 +8789,16 @@ static void tryToElideArgumentCopy(
AI->getAllocatedType());
}
if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
- DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
- "greater than stack argument alignment ("
- << RequiredAlignment << " vs "
- << MFI.getObjectAlignment(FixedIndex) << ")\n");
+ LLVM_DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
+ "greater than stack argument alignment ("
+ << RequiredAlignment << " vs "
+ << MFI.getObjectAlignment(FixedIndex) << ")\n");
return;
}
// Perform the elision. Delete the old stack object and replace its only use
// in the variable info map. Mark the stack object as mutable.
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
<< " Replacing frame index " << OldIndex << " with " << FixedIndex
<< '\n';
@@ -8732,14 +8970,14 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
"LowerFormalArguments didn't return a valid chain!");
assert(InVals.size() == Ins.size() &&
"LowerFormalArguments didn't emit the correct number of values!");
- DEBUG({
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- assert(InVals[i].getNode() &&
- "LowerFormalArguments emitted a null value!");
- assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
- "LowerFormalArguments emitted a value with the wrong type!");
- }
- });
+ LLVM_DEBUG({
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ assert(InVals[i].getNode() &&
+ "LowerFormalArguments emitted a null value!");
+ assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
+ "LowerFormalArguments emitted a value with the wrong type!");
+ }
+ });
// Update the DAG with the new chain value resulting from argument lowering.
DAG.setRoot(NewRoot);
@@ -9351,7 +9589,7 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
}
BitTestInfo BTI;
- std::sort(CBV.begin(), CBV.end(), [](const CaseBits &a, const CaseBits &b) {
+ llvm::sort(CBV.begin(), CBV.end(), [](const CaseBits &a, const CaseBits &b) {
// Sort by probability first, number of bits second, bit mask third.
if (a.ExtraProb != b.ExtraProb)
return a.ExtraProb > b.ExtraProb;
@@ -9550,15 +9788,15 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
// checked first. However, two clusters can have the same probability in
// which case their relative ordering is non-deterministic. So we use Low
// as a tie-breaker as clusters are guaranteed to never overlap.
- std::sort(W.FirstCluster, W.LastCluster + 1,
- [](const CaseCluster &a, const CaseCluster &b) {
+ llvm::sort(W.FirstCluster, W.LastCluster + 1,
+ [](const CaseCluster &a, const CaseCluster &b) {
return a.Prob != b.Prob ?
a.Prob > b.Prob :
a.Low->getValue().slt(b.Low->getValue());
});
// Rearrange the case blocks so that the last one falls through if possible
- // without without changing the order of probabilities.
+ // without changing the order of probabilities.
for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) {
--I;
if (I->Prob > W.LastCluster->Prob)
@@ -9883,8 +10121,8 @@ MachineBasicBlock *SelectionDAGBuilder::peelDominantCaseCluster(
if (!SwitchPeeled)
return SwitchMBB;
- DEBUG(dbgs() << "Peeled one top case in switch stmt, prob: " << TopCaseProb
- << "\n");
+ LLVM_DEBUG(dbgs() << "Peeled one top case in switch stmt, prob: "
+ << TopCaseProb << "\n");
// Record the MBB for the peeled switch statement.
MachineFunction::iterator BBI(SwitchMBB);
@@ -9901,10 +10139,11 @@ MachineBasicBlock *SelectionDAGBuilder::peelDominantCaseCluster(
Clusters.erase(PeeledCaseIt);
for (CaseCluster &CC : Clusters) {
- DEBUG(dbgs() << "Scale the probablity for one cluster, before scaling: "
- << CC.Prob << "\n");
+ LLVM_DEBUG(
+        dbgs() << "Scale the probability for one cluster, before scaling: "
+ << CC.Prob << "\n");
CC.Prob = scaleCaseProbality(CC.Prob, TopCaseProb);
- DEBUG(dbgs() << "After scaling: " << CC.Prob << "\n");
+ LLVM_DEBUG(dbgs() << "After scaling: " << CC.Prob << "\n");
}
PeeledCaseProb = TopCaseProb;
return PeeledSwitchMBB;
@@ -9983,11 +10222,13 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
findJumpTables(Clusters, &SI, DefaultMBB);
findBitTestClusters(Clusters, &SI);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Case clusters: ";
for (const CaseCluster &C : Clusters) {
- if (C.Kind == CC_JumpTable) dbgs() << "JT:";
- if (C.Kind == CC_BitTests) dbgs() << "BT:";
+ if (C.Kind == CC_JumpTable)
+ dbgs() << "JT:";
+ if (C.Kind == CC_BitTests)
+ dbgs() << "BT:";
C.Low->getValue().print(dbgs(), true);
if (C.Low != C.High) {
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 9e7c2bc6821b..e421984b8af2 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -21,7 +21,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -33,6 +32,7 @@
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -116,9 +116,12 @@ class SelectionDAGBuilder {
unsigned getSDNodeOrder() { return SDNodeOrder; }
};
+ /// DanglingDebugInfoVector - Helper type for DanglingDebugInfoMap.
+ typedef std::vector<DanglingDebugInfo> DanglingDebugInfoVector;
+
/// DanglingDebugInfoMap - Keeps track of dbg_values for which we have not
/// yet seen the referent. We defer handling these until we do see it.
- DenseMap<const Value*, DanglingDebugInfo> DanglingDebugInfoMap;
+ DenseMap<const Value*, DanglingDebugInfoVector> DanglingDebugInfoMap;
public:
/// PendingLoads - Loads are not emitted to the program immediately. We bunch
@@ -671,6 +674,12 @@ public:
/// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise.
SDValue getCopyFromRegs(const Value *V, Type *Ty);
+ /// If we have dangling debug info that describes \p Variable, or an
+  /// overlapping part of the variable considering the \p Expr, then this method
+  /// will drop that debug info as it isn't valid any longer.
+ void dropDanglingDebugInfo(const DILocalVariable *Variable,
+ const DIExpression *Expr);
+
// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
// generate the debug data structures now that we've seen its definition.
void resolveDanglingDebugInfo(const Value *V, SDValue Val);
@@ -678,6 +687,13 @@ public:
SDValue getValue(const Value *V);
bool findValue(const Value *V) const;
+ /// Return the SDNode for the specified IR value if it exists.
+ SDNode *getNodeForIRValue(const Value *V) {
+ if (NodeMap.find(V) == NodeMap.end())
+ return nullptr;
+ return NodeMap[V].getNode();
+ }
+
SDValue getNonRegisterValue(const Value *V);
SDValue getValueImpl(const Value *V);
@@ -696,13 +712,13 @@ public:
void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
- Instruction::BinaryOps Opc, BranchProbability TW,
- BranchProbability FW, bool InvertCond);
+ Instruction::BinaryOps Opc, BranchProbability TProb,
+ BranchProbability FProb, bool InvertCond);
void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
- BranchProbability TW, BranchProbability FW,
+ BranchProbability TProb, BranchProbability FProb,
bool InvertCond);
bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);
bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
@@ -774,11 +790,11 @@ public:
};
/// Lower \p SLI into a STATEPOINT instruction.
- SDValue LowerAsSTATEPOINT(StatepointLoweringInfo &SLI);
+ SDValue LowerAsSTATEPOINT(StatepointLoweringInfo &SI);
// This function is responsible for the whole statepoint lowering process.
// It uniformly handles invoke and call statepoints.
- void LowerStatepoint(ImmutableStatepoint Statepoint,
+ void LowerStatepoint(ImmutableStatepoint ISP,
const BasicBlock *EHPadBB = nullptr);
void LowerCallSiteWithDeoptBundle(ImmutableCallSite CS, SDValue Callee,
@@ -838,7 +854,7 @@ private:
void visitInvoke(const InvokeInst &I);
void visitResume(const ResumeInst &I);
- void visitBinary(const User &I, unsigned OpCode);
+ void visitBinary(const User &I, unsigned Opcode);
void visitShift(const User &I, unsigned Opcode);
void visitAdd(const User &I) { visitBinary(I, ISD::ADD); }
void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); }
@@ -881,7 +897,7 @@ private:
void visitExtractValue(const User &I);
void visitInsertValue(const User &I);
- void visitLandingPad(const LandingPadInst &I);
+ void visitLandingPad(const LandingPadInst &LP);
void visitGetElementPtr(const User &I);
void visitSelect(const User &I);
@@ -926,7 +942,7 @@ private:
const BasicBlock *EHPadBB = nullptr);
// These two are implemented in StatepointLowering.cpp
- void visitGCRelocate(const GCRelocateInst &I);
+ void visitGCRelocate(const GCRelocateInst &Relocate);
void visitGCResult(const GCResultInst &I);
void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
@@ -1036,9 +1052,17 @@ struct RegsForValue {
/// Add this value to the specified inlineasm node operand list. This adds the
/// code marker, matching input operand index (if applicable), and includes
/// the number of values added into it.
- void AddInlineAsmOperands(unsigned Kind, bool HasMatching,
+ void AddInlineAsmOperands(unsigned Code, bool HasMatching,
unsigned MatchingIdx, const SDLoc &dl,
SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
+
+ /// Check if the total RegCount is greater than one.
+ bool occupiesMultipleRegs() const {
+ return std::accumulate(RegCount.begin(), RegCount.end(), 0) > 1;
+ }
+
+ /// Return a list of registers and their sizes.
+ SmallVector<std::pair<unsigned, unsigned>, 4> getRegsAndSizes() const;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index dd30dc16378c..fa341e8b5fa5 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -28,18 +27,21 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -85,6 +87,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd";
case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub";
case ISD::ATOMIC_LOAD_AND: return "AtomicLoadAnd";
+ case ISD::ATOMIC_LOAD_CLR: return "AtomicLoadClr";
case ISD::ATOMIC_LOAD_OR: return "AtomicLoadOr";
case ISD::ATOMIC_LOAD_XOR: return "AtomicLoadXor";
case ISD::ATOMIC_LOAD_NAND: return "AtomicLoadNand";
@@ -176,20 +179,30 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FMAXNAN: return "fmaxnan";
case ISD::FNEG: return "fneg";
case ISD::FSQRT: return "fsqrt";
+ case ISD::STRICT_FSQRT: return "strict_fsqrt";
case ISD::FSIN: return "fsin";
+ case ISD::STRICT_FSIN: return "strict_fsin";
case ISD::FCOS: return "fcos";
+ case ISD::STRICT_FCOS: return "strict_fcos";
case ISD::FSINCOS: return "fsincos";
case ISD::FTRUNC: return "ftrunc";
case ISD::FFLOOR: return "ffloor";
case ISD::FCEIL: return "fceil";
case ISD::FRINT: return "frint";
+ case ISD::STRICT_FRINT: return "strict_frint";
case ISD::FNEARBYINT: return "fnearbyint";
+ case ISD::STRICT_FNEARBYINT: return "strict_fnearbyint";
case ISD::FROUND: return "fround";
case ISD::FEXP: return "fexp";
+ case ISD::STRICT_FEXP: return "strict_fexp";
case ISD::FEXP2: return "fexp2";
+ case ISD::STRICT_FEXP2: return "strict_fexp2";
case ISD::FLOG: return "flog";
+ case ISD::STRICT_FLOG: return "strict_flog";
case ISD::FLOG2: return "flog2";
+ case ISD::STRICT_FLOG2: return "strict_flog2";
case ISD::FLOG10: return "flog10";
+ case ISD::STRICT_FLOG10: return "strict_flog10";
// Binary operators
case ISD::ADD: return "add";
@@ -214,24 +227,31 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::ROTL: return "rotl";
case ISD::ROTR: return "rotr";
case ISD::FADD: return "fadd";
+ case ISD::STRICT_FADD: return "strict_fadd";
case ISD::FSUB: return "fsub";
+ case ISD::STRICT_FSUB: return "strict_fsub";
case ISD::FMUL: return "fmul";
+ case ISD::STRICT_FMUL: return "strict_fmul";
case ISD::FDIV: return "fdiv";
+ case ISD::STRICT_FDIV: return "strict_fdiv";
case ISD::FMA: return "fma";
+ case ISD::STRICT_FMA: return "strict_fma";
case ISD::FMAD: return "fmad";
case ISD::FREM: return "frem";
+ case ISD::STRICT_FREM: return "strict_frem";
case ISD::FCOPYSIGN: return "fcopysign";
case ISD::FGETSIGN: return "fgetsign";
case ISD::FCANONICALIZE: return "fcanonicalize";
case ISD::FPOW: return "fpow";
+ case ISD::STRICT_FPOW: return "strict_fpow";
case ISD::SMIN: return "smin";
case ISD::SMAX: return "smax";
case ISD::UMIN: return "umin";
case ISD::UMAX: return "umax";
case ISD::FPOWI: return "fpowi";
+ case ISD::STRICT_FPOWI: return "strict_fpowi";
case ISD::SETCC: return "setcc";
- case ISD::SETCCE: return "setcce";
case ISD::SETCCCARRY: return "setcccarry";
case ISD::SELECT: return "select";
case ISD::VSELECT: return "vselect";
@@ -366,7 +386,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SETFALSE2: return "setfalse2";
}
case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
+ case ISD::VECREDUCE_STRICT_FADD: return "vecreduce_strict_fadd";
case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
+ case ISD::VECREDUCE_STRICT_FMUL: return "vecreduce_strict_fmul";
case ISD::VECREDUCE_ADD: return "vecreduce_add";
case ISD::VECREDUCE_MUL: return "vecreduce_mul";
case ISD::VECREDUCE_AND: return "vecreduce_and";
@@ -401,6 +423,32 @@ static Printable PrintNodeId(const SDNode &Node) {
});
}
+// Print the MMO with more information from the SelectionDAG.
+static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO,
+ const MachineFunction *MF, const Module *M,
+ const MachineFrameInfo *MFI,
+ const TargetInstrInfo *TII, LLVMContext &Ctx) {
+ ModuleSlotTracker MST(M);
+ if (MF)
+ MST.incorporateFunction(MF->getFunction());
+ SmallVector<StringRef, 0> SSNs;
+ MMO.print(OS, MST, SSNs, Ctx, MFI, TII);
+}
+
+static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO,
+ const SelectionDAG *G) {
+ if (G) {
+ const MachineFunction *MF = &G->getMachineFunction();
+ return printMemOperand(OS, MMO, MF, MF->getFunction().getParent(),
+ &MF->getFrameInfo(), G->getSubtarget().getInstrInfo(),
+ *G->getContext());
+ } else {
+ LLVMContext Ctx;
+ return printMemOperand(OS, MMO, /*MF=*/nullptr, /*M=*/nullptr,
+ /*MFI=*/nullptr, /*TII=*/nullptr, Ctx);
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); }
@@ -430,9 +478,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
if (getFlags().hasExact())
OS << " exact";
- if (getFlags().hasUnsafeAlgebra())
- OS << " unsafe";
-
if (getFlags().hasNoNaNs())
OS << " nnan";
@@ -448,6 +493,12 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
if (getFlags().hasAllowContract())
OS << " contract";
+ if (getFlags().hasApproximateFuncs())
+ OS << " afn";
+
+ if (getFlags().hasAllowReassociation())
+ OS << " reassoc";
+
if (getFlags().hasVectorReduction())
OS << " vector-reduction";
@@ -457,7 +508,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << "Mem:";
for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(),
e = MN->memoperands_end(); i != e; ++i) {
- OS << **i;
+ printMemOperand(OS, **i, G);
if (std::next(i) != e)
OS << " ";
}
@@ -549,7 +600,9 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << ":" << N->getVT().getEVTString();
}
else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) {
- OS << "<" << *LD->getMemOperand();
+ OS << "<";
+
+ printMemOperand(OS, *LD->getMemOperand(), G);
bool doExt = true;
switch (LD->getExtensionType()) {
@@ -567,7 +620,8 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << ">";
} else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) {
- OS << "<" << *ST->getMemOperand();
+ OS << "<";
+ printMemOperand(OS, *ST->getMemOperand(), G);
if (ST->isTruncatingStore())
OS << ", trunc to " << ST->getMemoryVT().getEVTString();
@@ -578,7 +632,9 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << ">";
} else if (const MemSDNode* M = dyn_cast<MemSDNode>(this)) {
- OS << "<" << *M->getMemOperand() << ">";
+ OS << "<";
+ printMemOperand(OS, *M->getMemOperand(), G);
+ OS << ">";
} else if (const BlockAddressSDNode *BA =
dyn_cast<BlockAddressSDNode>(this)) {
int64_t offset = BA->getOffset();
@@ -608,6 +664,8 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
if (getNodeId() != -1)
OS << " [ID=" << getNodeId() << ']';
+ if (!(isa<ConstantSDNode>(this) || (isa<ConstantFPSDNode>(this))))
+ OS << "# D:" << isDivergent();
if (!G)
return;
@@ -779,4 +837,8 @@ void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
if (i) OS << ", "; else OS << " ";
printOperand(OS, G, getOperand(i));
}
+ if (DebugLoc DL = getDebugLoc()) {
+ OS << ", ";
+ DL.print(OS);
+ }
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index bd9fcfb5c1e8..f7bd8847bee3 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
@@ -43,7 +44,6 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -82,6 +82,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -196,7 +197,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target",
namespace llvm {
//===--------------------------------------------------------------------===//
- /// \brief This class is used by SelectionDAGISel to temporarily override
+ /// This class is used by SelectionDAGISel to temporarily override
/// the optimization level on a per-function basis.
class OptLevelChanger {
SelectionDAGISel &IS;
@@ -211,26 +212,27 @@ namespace llvm {
return;
IS.OptLevel = NewOptLevel;
IS.TM.setOptLevel(NewOptLevel);
- DEBUG(dbgs() << "\nChanging optimization level for Function "
- << IS.MF->getFunction().getName() << "\n");
- DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel
- << " ; After: -O" << NewOptLevel << "\n");
+ LLVM_DEBUG(dbgs() << "\nChanging optimization level for Function "
+ << IS.MF->getFunction().getName() << "\n");
+ LLVM_DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O"
+ << NewOptLevel << "\n");
SavedFastISel = IS.TM.Options.EnableFastISel;
if (NewOptLevel == CodeGenOpt::None) {
IS.TM.setFastISel(IS.TM.getO0WantsFastISel());
- DEBUG(dbgs() << "\tFastISel is "
- << (IS.TM.Options.EnableFastISel ? "enabled" : "disabled")
- << "\n");
+ LLVM_DEBUG(
+ dbgs() << "\tFastISel is "
+ << (IS.TM.Options.EnableFastISel ? "enabled" : "disabled")
+ << "\n");
}
}
~OptLevelChanger() {
if (IS.OptLevel == SavedOptLevel)
return;
- DEBUG(dbgs() << "\nRestoring optimization level for Function "
- << IS.MF->getFunction().getName() << "\n");
- DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel
- << " ; After: -O" << SavedOptLevel << "\n");
+ LLVM_DEBUG(dbgs() << "\nRestoring optimization level for Function "
+ << IS.MF->getFunction().getName() << "\n");
+ LLVM_DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel << " ; After: -O"
+ << SavedOptLevel << "\n");
IS.OptLevel = SavedOptLevel;
IS.TM.setOptLevel(SavedOptLevel);
IS.TM.setFastISel(SavedFastISel);
@@ -326,9 +328,9 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
AU.addRequired<StackProtector>();
- AU.addPreserved<StackProtector>();
AU.addPreserved<GCModuleInfo>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
if (UseMBPI && OptLevel != CodeGenOpt::None)
AU.addRequired<BranchProbabilityInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -410,11 +412,12 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);
- CurDAG->init(*MF, *ORE, this);
+ CurDAG->init(*MF, *ORE, this, LibInfo,
+ getAnalysisIfAvailable<DivergenceAnalysis>());
FuncInfo->set(Fn, *MF, CurDAG);
// Now get the optional analyzes if we want to.
@@ -513,8 +516,8 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
// FIXME: VR def may not be in entry block.
Def->getParent()->insert(std::next(InsertPos), MI);
} else
- DEBUG(dbgs() << "Dropping debug info for dead vreg"
- << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for dead vreg"
+ << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
}
// If Reg is live-in then update debug info to track its copy in a vreg.
@@ -621,8 +624,8 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
// at this point.
FuncInfo->clear();
- DEBUG(dbgs() << "*** MachineFunction at end of ISel ***\n");
- DEBUG(MF->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "*** MachineFunction at end of ISel ***\n");
+ LLVM_DEBUG(MF->print(dbgs()));
return true;
}
@@ -711,6 +714,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
int BlockNumber = -1;
(void)BlockNumber;
bool MatchFilterBB = false; (void)MatchFilterBB;
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*FuncInfo->Fn);
// Pre-type legalization allow creation of any node types.
CurDAG->NewNodesMustHaveLegalTypes = false;
@@ -718,7 +723,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
FilterDAGBasicBlockName ==
- FuncInfo->MBB->getBasicBlock()->getName().str());
+ FuncInfo->MBB->getBasicBlock()->getName());
#endif
#ifdef NDEBUG
if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
@@ -730,9 +735,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
BlockName =
(MF->getName() + ":" + FuncInfo->MBB->getBasicBlock()->getName()).str();
}
- DEBUG(dbgs() << "Initial selection DAG: " << printMBBReference(*FuncInfo->MBB)
- << " '" << BlockName << "'\n";
- CurDAG->dump());
+ LLVM_DEBUG(dbgs() << "Initial selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
if (ViewDAGCombine1 && MatchFilterBB)
CurDAG->viewGraph("dag-combine1 input for " + BlockName);
@@ -744,10 +750,13 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel);
}
- DEBUG(dbgs() << "Optimized lowered selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDiverence();
+
+ LLVM_DEBUG(dbgs() << "Optimized lowered selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
// Second step, hack on the DAG until it only uses operations and types that
// the target supports.
@@ -761,10 +770,13 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
Changed = CurDAG->LegalizeTypes();
}
- DEBUG(dbgs() << "Type-legalized selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDiverence();
+
+ LLVM_DEBUG(dbgs() << "Type-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
// Only allow creation of legal node types.
CurDAG->NewNodesMustHaveLegalTypes = true;
@@ -780,10 +792,13 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel);
}
- DEBUG(dbgs() << "Optimized type-legalized selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDiverence();
+
+ LLVM_DEBUG(dbgs() << "Optimized type-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
}
{
@@ -793,10 +808,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
}
if (Changed) {
- DEBUG(dbgs() << "Vector-legalized selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ LLVM_DEBUG(dbgs() << "Vector-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
{
NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName,
@@ -804,10 +819,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
CurDAG->LegalizeTypes();
}
- DEBUG(dbgs() << "Vector/type-legalized selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ LLVM_DEBUG(dbgs() << "Vector/type-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
if (ViewDAGCombineLT && MatchFilterBB)
CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
@@ -819,10 +834,13 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel);
}
- DEBUG(dbgs() << "Optimized vector-legalized selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ LLVM_DEBUG(dbgs() << "Optimized vector-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDiverence();
}
if (ViewLegalizeDAGs && MatchFilterBB)
@@ -834,10 +852,13 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
CurDAG->Legalize();
}
- DEBUG(dbgs() << "Legalized selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDiverence();
+
+ LLVM_DEBUG(dbgs() << "Legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
if (ViewDAGCombine2 && MatchFilterBB)
CurDAG->viewGraph("dag-combine2 input for " + BlockName);
@@ -849,10 +870,13 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel);
}
- DEBUG(dbgs() << "Optimized legalized selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDiverence();
+
+ LLVM_DEBUG(dbgs() << "Optimized legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
if (OptLevel != CodeGenOpt::None)
ComputeLiveOutVRegInfo();
@@ -868,10 +892,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
DoInstructionSelection();
}
- DEBUG(dbgs() << "Selected selection DAG: "
- << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
- << "'\n";
- CurDAG->dump());
+ LLVM_DEBUG(dbgs() << "Selected selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
if (ViewSchedDAGs && MatchFilterBB)
CurDAG->viewGraph("scheduler input for " + BlockName);
@@ -937,10 +961,62 @@ public:
} // end anonymous namespace
+// This function is used to enforce the topological node id property
+// leveraged during instruction selection. Before selection, all nodes are
+// given a non-negative id such that every node has a larger id than its
+// operands. As this holds transitively, we can prune the check that a node N
+// is a predecessor of another node M by skipping the recursive walk through
+// M's operands whenever N's ID is larger than M's ID. This significantly
+// improves the performance of various legality checks (e.g. IsLegalToFold /
+// UpdateChains).
+
+// However, when we fuse multiple nodes into a single node during selection,
+// we may induce a predecessor relationship between inputs and outputs of the
+// distinct nodes being merged, violating the topological property. Should a
+// fused node have a successor which has yet to be selected, our legality
+// checks would be incorrect. To avoid this we mark all unselected successor
+// nodes (i.e. id != -1) as invalid for pruning by bit-negating their ids
+// (x => -(x+1)) and modify the pruning check to ignore negative ids of M.
+// Bit-negation makes it explicit that node id -1 can only be reached by
+// selected nodes. As the conversion is reversible, the original id can still
+// be recovered, so topological pruning remains usable when looking for
+// unselected nodes. This method is called internally by all ISel replacement
+// calls.
+void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
+ SmallVector<SDNode *, 4> Nodes;
+ Nodes.push_back(Node);
+
+ while (!Nodes.empty()) {
+ SDNode *N = Nodes.pop_back_val();
+ for (auto *U : N->uses()) {
+ auto UId = U->getNodeId();
+ if (UId > 0) {
+ InvalidateNodeId(U);
+ Nodes.push_back(U);
+ }
+ }
+ }
+}
+
+// InvalidateNodeId - As discussed in EnforceNodeIdInvariant, mark a node's
+// id with the equivalent id that is invalid for topological pruning.
+void SelectionDAGISel::InvalidateNodeId(SDNode *N) {
+ int InvalidId = -(N->getNodeId() + 1);
+ N->setNodeId(InvalidId);
+}
+
+// getUninvalidatedNodeId - get original uninvalidated node id.
+int SelectionDAGISel::getUninvalidatedNodeId(SDNode *N) {
+ int Id = N->getNodeId();
+ if (Id < -1)
+ return -(Id + 1);
+ return Id;
+}
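
A minimal standalone sketch of the id encoding introduced above, for illustration only (not part of the patch): a positive id x is invalidated as -(x+1), which is always <= -2 and so can never collide with -1 (the marker for already selected nodes), and the mapping is its own inverse.

#include <cassert>

// Mirrors the encoding in InvalidateNodeId / getUninvalidatedNodeId.
static int invalidateId(int Id) { return -(Id + 1); }             // 5 -> -6
static int recoverId(int Id) { return Id < -1 ? -(Id + 1) : Id; } // -6 -> 5

int main() {
  for (int Id = 1; Id <= 1000; ++Id) {
    int Inv = invalidateId(Id);
    assert(Inv <= -2 && "never collides with -1 (selected)");
    assert(recoverId(Inv) == Id && "conversion is reversible");
  }
  return 0;
}
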
+
void SelectionDAGISel::DoInstructionSelection() {
- DEBUG(dbgs() << "===== Instruction selection begins: "
- << printMBBReference(*FuncInfo->MBB) << " '"
- << FuncInfo->MBB->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "===== Instruction selection begins: "
+ << printMBBReference(*FuncInfo->MBB) << " '"
+ << FuncInfo->MBB->getName() << "'\n");
PreprocessISelDAG();
@@ -972,6 +1048,33 @@ void SelectionDAGISel::DoInstructionSelection() {
if (Node->use_empty())
continue;
+#ifndef NDEBUG
+ SmallVector<SDNode *, 4> Nodes;
+ Nodes.push_back(Node);
+
+ while (!Nodes.empty()) {
+ auto N = Nodes.pop_back_val();
+ if (N->getOpcode() == ISD::TokenFactor || N->getNodeId() < 0)
+ continue;
+ for (const SDValue &Op : N->op_values()) {
+ if (Op->getOpcode() == ISD::TokenFactor)
+ Nodes.push_back(Op.getNode());
+ else {
+            // We rely on the topological ordering of node ids when checking
+            // for cycles while fusing nodes during selection. All unselected
+            // nodes that are successors of an already selected node should
+            // have a negative id. This assertion will catch such cases. If
+            // this assertion triggers, it is likely that you are using
+            // DAG-level Value/Node replacement functions (versus the
+            // equivalent ISEL replacement) in backend-specific selections.
+            // See the comment in EnforceNodeIdInvariant for more details.
+ assert(Op->getNodeId() != -1 &&
+ "Node has already selected predecessor node");
+ }
+ }
+ }
+#endif
+
// When we are using non-default rounding modes or FP exception behavior
// FP operations are represented by StrictFP pseudo-operations. They
// need to be simplified here so that the target-specific instruction
@@ -985,13 +1088,16 @@ void SelectionDAGISel::DoInstructionSelection() {
if (Node->isStrictFPOpcode())
Node = CurDAG->mutateStrictFPToFP(Node);
+ LLVM_DEBUG(dbgs() << "\nISEL: Starting selection on root node: ";
+ Node->dump(CurDAG));
+
Select(Node);
}
CurDAG->setRoot(Dummy.getValue());
}
- DEBUG(dbgs() << "===== Instruction selection ends:\n");
+ LLVM_DEBUG(dbgs() << "\n===== Instruction selection ends:\n");
PostprocessISelDAG();
}
@@ -1264,7 +1370,7 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {
}
auto DLoc = isa<Instruction>(SwiftErrorVal)
- ? dyn_cast<Instruction>(SwiftErrorVal)->getDebugLoc()
+ ? cast<Instruction>(SwiftErrorVal)->getDebugLoc()
: DebugLoc();
const auto *TII = FuncInfo->MF->getSubtarget().getInstrInfo();
@@ -1381,7 +1487,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// Initialize the Fast-ISel state, if needed.
FastISel *FastIS = nullptr;
if (TM.Options.EnableFastISel) {
- DEBUG(dbgs() << "Enabling fast-isel\n");
+ LLVM_DEBUG(dbgs() << "Enabling fast-isel\n");
FastIS = TLI->createFastISel(*FuncInfo, LibInfo);
}
@@ -1398,6 +1504,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()];
FuncInfo->InsertPt = FuncInfo->MBB->begin();
+ CurDAG->setFunctionLoweringInfo(FuncInfo);
+
if (!FastIS) {
LowerArguments(Fn);
} else {
@@ -1435,6 +1543,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
processDbgDeclares(FuncInfo);
// Iterate over all basic blocks in the function.
+ StackProtector &SP = getAnalysis<StackProtector>();
for (const BasicBlock *LLVMBB : RPOT) {
if (OptLevel != CodeGenOpt::None) {
bool AllPredsVisited = true;
@@ -1604,7 +1713,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
FastIS->recomputeInsertPt();
}
- if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) {
+ if (SP.shouldEmitSDCheck(*LLVMBB)) {
bool FunctionBasedInstrumentation =
TLI->getSSPStackGuardCheck(*Fn.getParent());
SDB->SPDescriptor.initialize(LLVMBB, FuncInfo->MBBMap[LLVMBB],
@@ -1630,11 +1739,15 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end());
}
+ if (FastIS)
+ FastIS->finishBasicBlock();
FinishBasicBlock();
FuncInfo->PHINodesToUpdate.clear();
ElidedArgCopyInstrs.clear();
}
+ SP.copyToMachineFrameInfo(MF->getFrameInfo());
+
propagateSwiftErrorVRegs(FuncInfo);
delete FastIS;
@@ -1728,12 +1841,12 @@ FindSplitPointForStackProtector(MachineBasicBlock *BB) {
void
SelectionDAGISel::FinishBasicBlock() {
- DEBUG(dbgs() << "Total amount of phi nodes to update: "
- << FuncInfo->PHINodesToUpdate.size() << "\n";
- for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i)
- dbgs() << "Node " << i << " : ("
- << FuncInfo->PHINodesToUpdate[i].first
- << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
+ LLVM_DEBUG(dbgs() << "Total amount of phi nodes to update: "
+ << FuncInfo->PHINodesToUpdate.size() << "\n";
+ for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e;
+ ++i) dbgs()
+ << "Node " << i << " : (" << FuncInfo->PHINodesToUpdate[i].first
+ << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
// Next, now that we know what the last MBB the LLVM BB expanded is, update
// PHI nodes in successors.
@@ -2012,7 +2125,7 @@ bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
return true;
// If the actual AND mask is allowing unallowed bits, this doesn't match.
- if (ActualMask.intersects(~DesiredMask))
+ if (!ActualMask.isSubsetOf(DesiredMask))
return false;
// Otherwise, the DAG Combiner may have proven that the value coming in is
@@ -2041,7 +2154,7 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
return true;
// If the actual AND mask is allowing unallowed bits, this doesn't match.
- if (ActualMask.intersects(~DesiredMask))
+ if (!ActualMask.isSubsetOf(DesiredMask))
return false;
// Otherwise, the DAG Combiner may have proven that the value coming in is
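
The two hunks above swap `ActualMask.intersects(~DesiredMask)` for `!ActualMask.isSubsetOf(DesiredMask)`. For bit masks these are equivalent: both test whether ActualMask has any bit set outside DesiredMask. A small check with plain 32-bit masks (illustrative only, assuming nothing beyond standard C++):

#include <cassert>
#include <cstdint>

// intersects(~D)  <=>  (A & ~D) != 0
// isSubsetOf(D)   <=>  (A & ~D) == 0
static bool intersectsNotDesired(uint32_t A, uint32_t D) { return (A & ~D) != 0; }
static bool isSubsetOfDesired(uint32_t A, uint32_t D)    { return (A & ~D) == 0; }

int main() {
  const uint32_t Cases[][2] = {
      {0x000000FF, 0x0000000F},  // extra bits set -> not a subset
      {0x0000000F, 0x000000FF},  // subset
      {0x00000000, 0x00000000},  // empty mask is a subset of anything
      {0xA5A5A5A5, 0xFFFFFFFF}}; // subset of all-ones
  for (const auto &C : Cases)
    assert(intersectsNotDesired(C[0], C[1]) == !isSubsetOfDesired(C[0], C[1]));
  return 0;
}
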
@@ -2134,52 +2247,44 @@ static SDNode *findGlueUse(SDNode *N) {
return nullptr;
}
-/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
-/// This function iteratively traverses up the operand chain, ignoring
-/// certain nodes.
-static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
- SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
+/// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path
+/// beyond "ImmedUse". We may ignore chains as they are checked separately.
+static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
bool IgnoreChains) {
- // The NodeID's are given uniques ID's where a node ID is guaranteed to be
- // greater than all of its (recursive) operands. If we scan to a point where
- // 'use' is smaller than the node we're scanning for, then we know we will
- // never find it.
- //
- // The Use may be -1 (unassigned) if it is a newly allocated node. This can
- // happen because we scan down to newly selected nodes in the case of glue
- // uses.
- std::vector<SDNode *> WorkList;
- WorkList.push_back(Use);
-
- while (!WorkList.empty()) {
- Use = WorkList.back();
- WorkList.pop_back();
- if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)
- continue;
+ SmallPtrSet<const SDNode *, 16> Visited;
+ SmallVector<const SDNode *, 16> WorkList;
+ // Only check if we have non-immediate uses of Def.
+ if (ImmedUse->isOnlyUserOf(Def))
+ return false;
- // Don't revisit nodes if we already scanned it and didn't fail, we know we
- // won't fail if we scan it again.
- if (!Visited.insert(Use).second)
+ // We don't care about paths to Def that go through ImmedUse so mark it
+ // visited and mark non-def operands as used.
+ Visited.insert(ImmedUse);
+ for (const SDValue &Op : ImmedUse->op_values()) {
+ SDNode *N = Op.getNode();
+ // Ignore chain deps (they are validated by
+ // HandleMergeInputChains) and immediate uses
+ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
continue;
+ if (!Visited.insert(N).second)
+ continue;
+ WorkList.push_back(N);
+ }
- for (const SDValue &Op : Use->op_values()) {
- // Ignore chain uses, they are validated by HandleMergeInputChains.
- if (Op.getValueType() == MVT::Other && IgnoreChains)
- continue;
-
+ // Initialize worklist to operands of Root.
+ if (Root != ImmedUse) {
+ for (const SDValue &Op : Root->op_values()) {
SDNode *N = Op.getNode();
- if (N == Def) {
- if (Use == ImmedUse || Use == Root)
- continue; // We are not looking for immediate use.
- assert(N != Root);
- return true;
- }
-
- // Traverse up the operand chain.
+ // Ignore chains (they are validated by HandleMergeInputChains)
+ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
+ continue;
+ if (!Visited.insert(N).second)
+ continue;
WorkList.push_back(N);
}
}
- return false;
+
+ return SDNode::hasPredecessorHelper(Def, Visited, WorkList, 0, true);
}
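
The rewritten findNonImmUse above turns the query into a plain upward reachability search: mark ImmedUse visited so paths through it are ignored, seed a worklist with the remaining operands of ImmedUse and Root, then walk operands looking for Def (the real code delegates the walk to SDNode::hasPredecessorHelper). A self-contained sketch of the same idea over a hypothetical node type; the names below are illustrative, not LLVM API:

#include <unordered_set>
#include <vector>

struct Node {                    // hypothetical stand-in for SDNode
  std::vector<Node *> Operands;  // edges from a node to its operands
};

// Returns true if Def is reachable from Root/ImmedUse through operands by a
// path other than the direct ImmedUse -> Def edge.
static bool findNonImmediateUse(Node *Root, Node *Def, Node *ImmedUse) {
  std::unordered_set<Node *> Visited{ImmedUse};
  std::vector<Node *> Worklist;
  auto Seed = [&](Node *N) {
    for (Node *Op : N->Operands)
      if (Op != Def && Visited.insert(Op).second)
        Worklist.push_back(Op);  // skip the immediate Def operand itself
  };
  Seed(ImmedUse);
  if (Root != ImmedUse)
    Seed(Root);
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    for (Node *Op : N->Operands) {
      if (Op == Def)
        return true;             // found a non-immediate path to Def
      if (Visited.insert(Op).second)
        Worklist.push_back(Op);
    }
  }
  return false;
}
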
/// IsProfitableToFold - Returns true if it's profitable to fold the specific
@@ -2199,7 +2304,7 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
  // If Root use can somehow reach N through a path that doesn't contain
// U then folding N would create a cycle. e.g. In the following
- // diagram, Root can reach N through X. If N is folded into into Root, then
+ // diagram, Root can reach N through X. If N is folded into Root, then
// X is both a predecessor and a successor of U.
//
// [N*] //
@@ -2251,13 +2356,12 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
// If our query node has a glue result with a use, we've walked up it. If
// the user (which has already been selected) has a chain or indirectly uses
- // the chain, our WalkChainUsers predicate will not consider it. Because of
+ // the chain, HandleMergeInputChains will not consider it. Because of
// this, we cannot ignore chains in this predicate.
IgnoreChains = false;
}
- SmallPtrSet<SDNode*, 16> Visited;
- return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
+ return !findNonImmUse(Root, N.getNode(), U, IgnoreChains);
}
void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
@@ -2360,7 +2464,8 @@ void SelectionDAGISel::UpdateChains(
std::replace(ChainNodesMatched.begin(), ChainNodesMatched.end(), N,
static_cast<SDNode *>(nullptr));
});
- CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
+ if (ChainNode->getOpcode() != ISD::TokenFactor)
+ ReplaceUses(ChainVal, InputChain);
// If the node became dead and we haven't already seen it, delete it.
if (ChainNode != NodeToMatch && ChainNode->use_empty() &&
@@ -2372,144 +2477,7 @@ void SelectionDAGISel::UpdateChains(
if (!NowDeadNodes.empty())
CurDAG->RemoveDeadNodes(NowDeadNodes);
- DEBUG(dbgs() << "ISEL: Match complete!\n");
-}
-
-enum ChainResult {
- CR_Simple,
- CR_InducesCycle,
- CR_LeadsToInteriorNode
-};
-
-/// WalkChainUsers - Walk down the users of the specified chained node that is
-/// part of the pattern we're matching, looking at all of the users we find.
-/// This determines whether something is an interior node, whether we have a
-/// non-pattern node in between two pattern nodes (which prevent folding because
-/// it would induce a cycle) and whether we have a TokenFactor node sandwiched
-/// between pattern nodes (in which case the TF becomes part of the pattern).
-///
-/// The walk we do here is guaranteed to be small because we quickly get down to
-/// already selected nodes "below" us.
-static ChainResult
-WalkChainUsers(const SDNode *ChainedNode,
- SmallVectorImpl<SDNode *> &ChainedNodesInPattern,
- DenseMap<const SDNode *, ChainResult> &TokenFactorResult,
- SmallVectorImpl<SDNode *> &InteriorChainedNodes) {
- ChainResult Result = CR_Simple;
-
- for (SDNode::use_iterator UI = ChainedNode->use_begin(),
- E = ChainedNode->use_end(); UI != E; ++UI) {
- // Make sure the use is of the chain, not some other value we produce.
- if (UI.getUse().getValueType() != MVT::Other) continue;
-
- SDNode *User = *UI;
-
- if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
- continue;
-
- // If we see an already-selected machine node, then we've gone beyond the
- // pattern that we're selecting down into the already selected chunk of the
- // DAG.
- unsigned UserOpcode = User->getOpcode();
- if (User->isMachineOpcode() ||
- UserOpcode == ISD::CopyToReg ||
- UserOpcode == ISD::CopyFromReg ||
- UserOpcode == ISD::INLINEASM ||
- UserOpcode == ISD::EH_LABEL ||
- UserOpcode == ISD::LIFETIME_START ||
- UserOpcode == ISD::LIFETIME_END) {
- // If their node ID got reset to -1 then they've already been selected.
- // Treat them like a MachineOpcode.
- if (User->getNodeId() == -1)
- continue;
- }
-
- // If we have a TokenFactor, we handle it specially.
- if (User->getOpcode() != ISD::TokenFactor) {
- // If the node isn't a token factor and isn't part of our pattern, then it
- // must be a random chained node in between two nodes we're selecting.
- // This happens when we have something like:
- // x = load ptr
- // call
- // y = x+4
- // store y -> ptr
- // Because we structurally match the load/store as a read/modify/write,
- // but the call is chained between them. We cannot fold in this case
- // because it would induce a cycle in the graph.
- if (!std::count(ChainedNodesInPattern.begin(),
- ChainedNodesInPattern.end(), User))
- return CR_InducesCycle;
-
- // Otherwise we found a node that is part of our pattern. For example in:
- // x = load ptr
- // y = x+4
- // store y -> ptr
- // This would happen when we're scanning down from the load and see the
- // store as a user. Record that there is a use of ChainedNode that is
- // part of the pattern and keep scanning uses.
- Result = CR_LeadsToInteriorNode;
- InteriorChainedNodes.push_back(User);
- continue;
- }
-
- // If we found a TokenFactor, there are two cases to consider: first if the
- // TokenFactor is just hanging "below" the pattern we're matching (i.e. no
- // uses of the TF are in our pattern) we just want to ignore it. Second,
- // the TokenFactor can be sandwiched in between two chained nodes, like so:
- // [Load chain]
- // ^
- // |
- // [Load]
- // ^ ^
- // | \ DAG's like cheese
- // / \ do you?
- // / |
- // [TokenFactor] [Op]
- // ^ ^
- // | |
- // \ /
- // \ /
- // [Store]
- //
- // In this case, the TokenFactor becomes part of our match and we rewrite it
- // as a new TokenFactor.
- //
- // To distinguish these two cases, do a recursive walk down the uses.
- auto MemoizeResult = TokenFactorResult.find(User);
- bool Visited = MemoizeResult != TokenFactorResult.end();
- // Recursively walk chain users only if the result is not memoized.
- if (!Visited) {
- auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult,
- InteriorChainedNodes);
- MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first;
- }
- switch (MemoizeResult->second) {
- case CR_Simple:
- // If the uses of the TokenFactor are just already-selected nodes, ignore
- // it, it is "below" our pattern.
- continue;
- case CR_InducesCycle:
- // If the uses of the TokenFactor lead to nodes that are not part of our
- // pattern that are not selected, folding would turn this into a cycle,
- // bail out now.
- return CR_InducesCycle;
- case CR_LeadsToInteriorNode:
- break; // Otherwise, keep processing.
- }
-
- // Okay, we know we're in the interesting interior case. The TokenFactor
- // is now going to be considered part of the pattern so that we rewrite its
- // uses (it may have uses that are not part of the pattern) with the
- // ultimate chain result of the generated code. We will also add its chain
- // inputs as inputs to the ultimate TokenFactor we create.
- Result = CR_LeadsToInteriorNode;
- if (!Visited) {
- ChainedNodesInPattern.push_back(User);
- InteriorChainedNodes.push_back(User);
- }
- }
-
- return Result;
+ LLVM_DEBUG(dbgs() << "ISEL: Match complete!\n");
}
/// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
@@ -2521,47 +2489,56 @@ WalkChainUsers(const SDNode *ChainedNode,
static SDValue
HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
SelectionDAG *CurDAG) {
- // Used for memoization. Without it WalkChainUsers could take exponential
- // time to run.
- DenseMap<const SDNode *, ChainResult> TokenFactorResult;
- // Walk all of the chained nodes we've matched, recursively scanning down the
- // users of the chain result. This adds any TokenFactor nodes that are caught
- // in between chained nodes to the chained and interior nodes list.
- SmallVector<SDNode*, 3> InteriorChainedNodes;
- for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
- if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
- TokenFactorResult,
- InteriorChainedNodes) == CR_InducesCycle)
- return SDValue(); // Would induce a cycle.
- }
- // Okay, we have walked all the matched nodes and collected TokenFactor nodes
- // that we are interested in. Form our input TokenFactor node.
+ SmallPtrSet<const SDNode *, 16> Visited;
+ SmallVector<const SDNode *, 8> Worklist;
SmallVector<SDValue, 3> InputChains;
- for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
- // Add the input chain of this node to the InputChains list (which will be
- // the operands of the generated TokenFactor) if it's not an interior node.
- SDNode *N = ChainNodesMatched[i];
- if (N->getOpcode() != ISD::TokenFactor) {
- if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N))
- continue;
+ unsigned int Max = 8192;
- // Otherwise, add the input chain.
- SDValue InChain = ChainNodesMatched[i]->getOperand(0);
- assert(InChain.getValueType() == MVT::Other && "Not a chain");
- InputChains.push_back(InChain);
- continue;
- }
+ // Quick exit on trivial merge.
+ if (ChainNodesMatched.size() == 1)
+ return ChainNodesMatched[0]->getOperand(0);
- // If we have a token factor, we want to add all inputs of the token factor
- // that are not part of the pattern we're matching.
- for (const SDValue &Op : N->op_values()) {
- if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
- Op.getNode()))
- InputChains.push_back(Op);
- }
+ // Add chains that aren't already added (internal). Peek through
+ // token factors.
+ std::function<void(const SDValue)> AddChains = [&](const SDValue V) {
+ if (V.getValueType() != MVT::Other)
+ return;
+ if (V->getOpcode() == ISD::EntryToken)
+ return;
+ if (!Visited.insert(V.getNode()).second)
+ return;
+ if (V->getOpcode() == ISD::TokenFactor) {
+ for (const SDValue &Op : V->op_values())
+ AddChains(Op);
+ } else
+ InputChains.push_back(V);
+ };
+
+ for (auto *N : ChainNodesMatched) {
+ Worklist.push_back(N);
+ Visited.insert(N);
}
+ while (!Worklist.empty())
+ AddChains(Worklist.pop_back_val()->getOperand(0));
+
+ // Skip the search if there are no chain dependencies.
+ if (InputChains.size() == 0)
+ return CurDAG->getEntryNode();
+
+ // If one of these chains is a successor of input, we must have a
+ // node that is both the predecessor and successor of the
+ // to-be-merged nodes. Fail.
+ Visited.clear();
+ for (SDValue V : InputChains)
+ Worklist.push_back(V.getNode());
+
+ for (auto *N : ChainNodesMatched)
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true))
+ return SDValue();
+
+ // Return merged chain.
if (InputChains.size() == 1)
return InputChains[0];
return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
@@ -2606,8 +2583,8 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
// Move the glue if needed.
if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
(unsigned)OldGlueResultNo != ResNumResults-1)
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo),
- SDValue(Res, ResNumResults-1));
+ ReplaceUses(SDValue(Node, OldGlueResultNo),
+ SDValue(Res, ResNumResults - 1));
if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
--ResNumResults;
@@ -2615,14 +2592,15 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
// Move the chain reference if needed.
if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
(unsigned)OldChainResultNo != ResNumResults-1)
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo),
- SDValue(Res, ResNumResults-1));
+ ReplaceUses(SDValue(Node, OldChainResultNo),
+ SDValue(Res, ResNumResults - 1));
// Otherwise, no replacement happened because the node already exists. Replace
// Uses of the old node with the new one.
if (Res != Node) {
- CurDAG->ReplaceAllUsesWith(Node, Res);
- CurDAG->RemoveDeadNode(Node);
+ ReplaceNode(Node, Res);
+ } else {
+ EnforceNodeIdInvariant(Res);
}
return Res;
@@ -2861,7 +2839,7 @@ struct MatchScope {
bool HasChainNodesMatched;
};
-/// \brief A DAG update listener to keep the matching state
+/// A DAG update listener to keep the matching state
/// (i.e. RecordedNodes and MatchScope) uptodate if the target is allowed to
/// change the DAG while matching. X86 addressing mode matcher is an example
/// for this.
@@ -2939,8 +2917,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
return;
case ISD::AssertSext:
case ISD::AssertZext:
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
- NodeToMatch->getOperand(0));
+ ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
CurDAG->RemoveDeadNode(NodeToMatch);
return;
case ISD::INLINEASM:
@@ -2988,9 +2965,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// update the chain results when the pattern is complete.
SmallVector<SDNode*, 3> ChainNodesMatched;
- DEBUG(dbgs() << "ISEL: Starting pattern match on root node: ";
- NodeToMatch->dump(CurDAG);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "ISEL: Starting pattern match\n");
// Determine where to start the interpreter. Normally we start at opcode #0,
// but if the state machine starts with an OPC_SwitchOpcode, then we
@@ -3002,7 +2977,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// Already computed the OpcodeOffset table, just index into it.
if (N.getOpcode() < OpcodeOffset.size())
MatcherIndex = OpcodeOffset[N.getOpcode()];
- DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n");
+ LLVM_DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n");
} else if (MatcherTable[0] == OPC_SwitchOpcode) {
// Otherwise, the table isn't computed, but the state machine does start
@@ -3069,9 +3044,10 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
if (!Result)
break;
- DEBUG(dbgs() << " Skipped scope entry (due to false predicate) at "
- << "index " << MatcherIndexOfPredicate
- << ", continuing at " << FailIndex << "\n");
+ LLVM_DEBUG(
+ dbgs() << " Skipped scope entry (due to false predicate) at "
+ << "index " << MatcherIndexOfPredicate << ", continuing at "
+ << FailIndex << "\n");
++NumDAGIselRetries;
// Otherwise, we know that this case of the Scope is guaranteed to fail,
@@ -3120,11 +3096,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
if (auto *MN = dyn_cast<MemSDNode>(N))
MatchedMemRefs.push_back(MN->getMemOperand());
else {
- DEBUG(
- dbgs() << "Expected MemSDNode ";
- N->dump(CurDAG);
- dbgs() << '\n'
- );
+ LLVM_DEBUG(dbgs() << "Expected MemSDNode "; N->dump(CurDAG);
+ dbgs() << '\n');
}
continue;
@@ -3245,8 +3218,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
if (CaseSize == 0) break;
// Otherwise, execute the case we found.
- DEBUG(dbgs() << " OpcodeSwitch from " << SwitchStart
- << " to " << MatcherIndex << "\n");
+ LLVM_DEBUG(dbgs() << " OpcodeSwitch from " << SwitchStart << " to "
+ << MatcherIndex << "\n");
continue;
}
@@ -3277,8 +3250,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
if (CaseSize == 0) break;
// Otherwise, execute the case we found.
- DEBUG(dbgs() << " TypeSwitch[" << EVT(CurNodeVT).getEVTString()
- << "] from " << SwitchStart << " to " << MatcherIndex<<'\n');
+ LLVM_DEBUG(dbgs() << " TypeSwitch[" << EVT(CurNodeVT).getEVTString()
+ << "] from " << SwitchStart << " to " << MatcherIndex
+ << '\n');
continue;
}
case OPC_CheckChild0Type: case OPC_CheckChild1Type:
@@ -3658,16 +3632,11 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
Res->setMemRefs(MemRefs, MemRefs + NumMemRefs);
}
- DEBUG(
- if (!MatchedMemRefs.empty() && Res->memoperands_empty())
- dbgs() << " Dropping mem operands\n";
- dbgs() << " "
- << (IsMorphNodeTo ? "Morphed" : "Created")
- << " node: ";
- Res->dump(CurDAG);
-
- dbgs() << '\n';
- );
+ LLVM_DEBUG(if (!MatchedMemRefs.empty() && Res->memoperands_empty()) dbgs()
+ << " Dropping mem operands\n";
+ dbgs() << " " << (IsMorphNodeTo ? "Morphed" : "Created")
+ << " node: ";
+ Res->dump(CurDAG););
// If this was a MorphNodeTo then we're completely done!
if (IsMorphNodeTo) {
@@ -3702,7 +3671,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
NodeToMatch->getValueType(i).getSizeInBits() ==
Res.getValueSizeInBits()) &&
"invalid replacement");
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
+ ReplaceUses(SDValue(NodeToMatch, i), Res);
}
// Update chain uses.
@@ -3715,8 +3684,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) ==
MVT::Glue &&
InputGlue.getNode())
- CurDAG->ReplaceAllUsesOfValueWith(
- SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue);
+ ReplaceUses(SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1),
+ InputGlue);
assert(NodeToMatch->use_empty() &&
"Didn't replace all uses of the node?");
@@ -3729,7 +3698,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// If the code reached this point, then the match failed. See if there is
// another child to try in the current 'Scope', otherwise pop it until we
// find a case to check.
- DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n");
+ LLVM_DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex
+ << "\n");
++NumDAGIselRetries;
while (true) {
if (MatchScopes.empty()) {
@@ -3749,7 +3719,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
MatchedMemRefs.resize(LastScope.NumMatchedMemRefs);
MatcherIndex = LastScope.FailIndex;
- DEBUG(dbgs() << " Continuing at " << MatcherIndex << "\n");
+ LLVM_DEBUG(dbgs() << " Continuing at " << MatcherIndex << "\n");
InputChain = LastScope.InputChain;
InputGlue = LastScope.InputGlue;
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index be4ab094bf49..3b19bff4743d 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -229,7 +229,7 @@ bool SelectionDAG::setSubgraphColorHelper(SDNode *N, const char *Color, DenseSet
if (level >= 20) {
if (!printed) {
printed = true;
- DEBUG(dbgs() << "setSubgraphColor hit max level\n");
+ LLVM_DEBUG(dbgs() << "setSubgraphColor hit max level\n");
}
return true;
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 3f64b49e3555..5cf06e62b80c 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -43,6 +42,7 @@
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d76e52d78870..fa867fcec366 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -32,6 +31,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <cctype>
using namespace llvm;
@@ -96,7 +96,7 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
return true;
}
-/// \brief Set CallLoweringInfo attribute flags based on a call instruction
+/// Set CallLoweringInfo attribute flags based on a call instruction
/// and called function attributes.
void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS,
unsigned ArgIdx) {
@@ -524,6 +524,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
}
// Other users may use these bits.
+ EVT VT = Op.getValueType();
if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) {
if (Depth != 0) {
// If not at the root, Just compute the Known bits to
@@ -537,7 +538,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
} else if (DemandedMask == 0) {
// Not demanding any bits from Op.
if (!Op.isUndef())
- return TLO.CombineTo(Op, TLO.DAG.getUNDEF(Op.getValueType()));
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
return false;
} else if (Depth == 6) { // Limit search depth.
return false;
@@ -580,7 +581,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
KnownBits LHSKnown;
// Do not increment Depth here; that can cause an infinite loop.
TLO.DAG.computeKnownBits(Op0, LHSKnown, Depth);
- // If the LHS already has zeros where RHSC does, this and is dead.
+ // If the LHS already has zeros where RHSC does, this 'and' is dead.
if ((LHSKnown.Zero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
return TLO.CombineTo(Op, Op0);
@@ -596,8 +597,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1
if (isBitwiseNot(Op0) && Op0.hasOneUse() &&
LHSKnown.One == ~RHSC->getAPIntValue()) {
- SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, Op.getValueType(),
- Op0.getOperand(0), Op.getOperand(1));
+ SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, VT, Op0.getOperand(0),
+ Op.getOperand(1));
return TLO.CombineTo(Op, Xor);
}
}
@@ -618,7 +619,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
return TLO.CombineTo(Op, Op.getOperand(1));
// If all of the demanded bits in the inputs are known zeros, return zero.
if (NewMask.isSubsetOf(Known.Zero | Known2.Zero))
- return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType()));
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(Op, ~Known2.Zero & NewMask, TLO))
return true;
@@ -680,7 +681,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// (but not both) turn this into an *inclusive* or.
// e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
if ((NewMask & ~Known.Zero & ~Known2.Zero) == 0)
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(),
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT,
Op.getOperand(0),
Op.getOperand(1)));
@@ -696,7 +697,6 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// NB: it is okay if more bits are known than are requested
if (NewMask.isSubsetOf(Known.Zero|Known.One)) { // all known on one side
if (Known.One == Known2.One) { // set bits are the same on both sides
- EVT VT = Op.getValueType();
SDValue ANDC = TLO.DAG.getConstant(~Known.One & NewMask, dl, VT);
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT,
Op.getOperand(0), ANDC));
@@ -710,7 +710,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
if (C && !C->isAllOnesValue()) {
if (NewMask.isSubsetOf(C->getAPIntValue())) {
// We're flipping all demanded bits. Flip the undemanded bits too.
- SDValue New = TLO.DAG.getNOT(dl, Op.getOperand(0), Op.getValueType());
+ SDValue New = TLO.DAG.getNOT(dl, Op.getOperand(0), VT);
return TLO.CombineTo(Op, New);
}
// If we can't turn this into a 'not', try to shrink the constant.
@@ -761,7 +761,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// width as the setcc result, and (3) the result of a setcc conforms to 0 or
// -1, we may be able to bypass the setcc.
if (NewMask.isSignMask() && Op0.getScalarValueSizeInBits() == BitWidth &&
- getBooleanContents(Op.getValueType()) ==
+ getBooleanContents(VT) ==
BooleanContent::ZeroOrNegativeOneBooleanContent) {
// If we're testing X < 0, then this compare isn't needed - just use X!
// FIXME: We're limiting to integer types here, but this should also work
@@ -807,7 +807,6 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
SDValue NewSA =
TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
- EVT VT = Op.getValueType();
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
InOp.getOperand(0),
NewSA));
@@ -835,8 +834,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
TLO.DAG.getConstant(ShAmt, dl, ShTy));
return
TLO.CombineTo(Op,
- TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(),
- NarrowShl));
+ TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
}
// Repeat the SHL optimization above in cases where an extension
// intervenes: (shl (anyext (shr x, c1)), c2) to
@@ -854,7 +852,6 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
SDValue NewSA =
TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
Op.getOperand(1).getValueType());
- EVT VT = Op.getValueType();
SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
InnerOp.getOperand(0));
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
@@ -904,7 +901,6 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
SDValue NewSA =
TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
- EVT VT = Op.getValueType();
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
InOp.getOperand(0),
NewSA));
@@ -930,12 +926,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// the shift amount is >= the size of the datatype, which is undefined.
if (NewMask.isOneValue())
return TLO.CombineTo(Op,
- TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1)));
+ TLO.DAG.getNode(ISD::SRL, dl, VT, Op.getOperand(0),
+ Op.getOperand(1)));
if (ConstantSDNode *SA = isConstOrConstSplat(Op.getOperand(1))) {
- EVT VT = Op.getValueType();
-
// If the shift count is an invalid immediate, don't do anything.
if (SA->getAPIntValue().uge(BitWidth))
break;
@@ -1000,14 +994,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
if (!AlreadySignExtended) {
// Compute the correct shift amount type, which must be getShiftAmountTy
// for scalar types after legalization.
- EVT ShiftAmtTy = Op.getValueType();
+ EVT ShiftAmtTy = VT;
if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
ShiftAmtTy = getShiftAmountTy(ShiftAmtTy, DL);
SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ExVTBits, dl,
ShiftAmtTy);
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
- Op.getValueType(), InOp,
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, InOp,
ShiftAmt));
}
}
@@ -1072,8 +1065,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If none of the top bits are demanded, convert this into an any_extend.
if (NewMask.getActiveBits() <= OperandBitWidth)
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
- Op.getValueType(),
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
Op.getOperand(0)));
APInt InMask = NewMask.trunc(OperandBitWidth);
@@ -1089,8 +1081,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If none of the top bits are demanded, convert this into an any_extend.
if (NewMask.getActiveBits() <= InBits)
- return TLO.CombineTo(Op,TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
- Op.getValueType(),
+ return TLO.CombineTo(Op,TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
Op.getOperand(0)));
// Since some of the sign extended bits are demanded, we know that the sign
@@ -1107,8 +1098,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If the sign bit is known zero, convert this to a zero extend.
if (Known.isNonNegative())
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl,
- Op.getValueType(),
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT,
Op.getOperand(0)));
break;
}
@@ -1139,8 +1129,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
case ISD::SRL:
// Shrink SRL by a constant if none of the high bits shifted in are
// demanded.
- if (TLO.LegalTypes() &&
- !isTypeDesirableForOp(ISD::SRL, Op.getValueType()))
+ if (TLO.LegalTypes() && !isTypeDesirableForOp(ISD::SRL, VT))
// Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
// undesirable.
break;
@@ -1150,8 +1139,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
SDValue Shift = In.getOperand(1);
if (TLO.LegalTypes()) {
uint64_t ShVal = ShAmt->getZExtValue();
- Shift = TLO.DAG.getConstant(ShVal, dl,
- getShiftAmountTy(Op.getValueType(), DL));
+ Shift = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
}
if (ShAmt->getZExtValue() < BitWidth) {
@@ -1163,12 +1151,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
if (!(HighBits & NewMask)) {
// None of the shifted in bits are needed. Add a truncate of the
// shift input, then shift it.
- SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl,
- Op.getValueType(),
+ SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl, VT,
In.getOperand(0));
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
- Op.getValueType(),
- NewTrunc,
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, NewTrunc,
Shift));
}
}
@@ -1182,9 +1167,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
case ISD::AssertZext: {
// AssertZext demands all of the high bits, plus any of the low bits
// demanded by its users.
- EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- APInt InMask = APInt::getLowBitsSet(BitWidth,
- VT.getSizeInBits());
+ EVT ZVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth, ZVT.getSizeInBits());
if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask,
Known, TLO, Depth+1))
return true;
@@ -1196,40 +1180,45 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
case ISD::BITCAST:
// If this is an FP->Int bitcast and if the sign bit is the only
// thing demanded, turn this into a FGETSIGN.
- if (!TLO.LegalOperations() &&
- !Op.getValueType().isVector() &&
+ if (!TLO.LegalOperations() && !VT.isVector() &&
!Op.getOperand(0).getValueType().isVector() &&
NewMask == APInt::getSignMask(Op.getValueSizeInBits()) &&
Op.getOperand(0).getValueType().isFloatingPoint()) {
- bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType());
+ bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, VT);
bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
- if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple() &&
+ if ((OpVTLegal || i32Legal) && VT.isSimple() &&
+ Op.getOperand(0).getValueType() != MVT::f16 &&
Op.getOperand(0).getValueType() != MVT::f128) {
// Cannot eliminate/lower SHL for f128 yet.
- EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32;
+ EVT Ty = OpVTLegal ? VT : MVT::i32;
// Make a FGETSIGN + SHL to move the sign bit into the appropriate
// place. We expect the SHL to be eliminated by other optimizations.
SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0));
unsigned OpVTSizeInBits = Op.getValueSizeInBits();
if (!OpVTLegal && OpVTSizeInBits > 32)
- Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Sign);
+ Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Sign);
unsigned ShVal = Op.getValueSizeInBits() - 1;
- SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, Op.getValueType());
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
- Op.getValueType(),
- Sign, ShAmt));
+ SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
}
}
+ // If this is a bitcast, let computeKnownBits handle it. Only do this on a
+ // recursive call where Known may be useful to the caller.
+ if (Depth > 0) {
+ TLO.DAG.computeKnownBits(Op, Known, Depth);
+ return false;
+ }
break;
case ISD::ADD:
case ISD::MUL:
case ISD::SUB: {
// Add, Sub, and Mul don't demand any bits in positions beyond that
// of the highest bit demanded of them.
- APInt LoMask = APInt::getLowBitsSet(BitWidth,
- BitWidth - NewMask.countLeadingZeros());
- if (SimplifyDemandedBits(Op.getOperand(0), LoMask, Known2, TLO, Depth+1) ||
- SimplifyDemandedBits(Op.getOperand(1), LoMask, Known2, TLO, Depth+1) ||
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ unsigned NewMaskLZ = NewMask.countLeadingZeros();
+ APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - NewMaskLZ);
+ if (SimplifyDemandedBits(Op0, LoMask, Known2, TLO, Depth + 1) ||
+ SimplifyDemandedBits(Op1, LoMask, Known2, TLO, Depth + 1) ||
// See if the operation should be performed at a smaller bit width.
ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) {
SDNodeFlags Flags = Op.getNode()->getFlags();
@@ -1238,13 +1227,33 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// won't wrap after simplification.
Flags.setNoSignedWrap(false);
Flags.setNoUnsignedWrap(false);
- SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1),
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1,
Flags);
return TLO.CombineTo(Op, NewOp);
}
return true;
}
+
+ // If we have a constant operand, we may be able to turn it into -1 if we
+ // do not demand the high bits. This can make the constant smaller to
+ // encode, allow more general folding, or match specialized instruction
+ // patterns (e.g., 'blsr' on x86). Don't bother changing 1 to -1 because that
+ // is probably not useful (and could be detrimental).
+ ConstantSDNode *C = isConstOrConstSplat(Op1);
+ APInt HighMask = APInt::getHighBitsSet(NewMask.getBitWidth(), NewMaskLZ);
+ if (C && !C->isAllOnesValue() && !C->isOne() &&
+ (C->getAPIntValue() | HighMask).isAllOnesValue()) {
+ SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT);
+ // We can't guarantee that the new math op doesn't wrap, so explicitly
+ // clear those flags to prevent folding with a potential existing node
+ // that has those flags set.
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, Flags);
+ return TLO.CombineTo(Op, NewOp);
+ }
+
LLVM_FALLTHROUGH;
}
default:
@@ -1265,10 +1274,384 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
if (C->isOpaque())
return false;
}
- return TLO.CombineTo(Op,
- TLO.DAG.getConstant(Known.One, dl, Op.getValueType()));
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT));
+ }
+
+ return false;
+}
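
The ADD/SUB/MUL case above turns a constant operand into -1 whenever the caller ignores the high bits and every demanded bit of the constant is already set. A minimal standalone sketch of that test, assuming a 32-bit add where only the low 8 bits are demanded (illustrative values, using the GCC/Clang __builtin_clz for brevity):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned BitWidth = 32;
  uint32_t NewMask = 0x000000FFu;                     // bits the caller demands
  uint32_t C = 0x000000FFu;                           // the add's constant operand
  unsigned NewMaskLZ = __builtin_clz(NewMask);        // 24 undemanded high bits
  uint32_t HighMask = ~0u << (BitWidth - NewMaskLZ);  // 0xFFFFFF00
  // If every demanded bit of C is already one, C behaves like -1 here,
  // so 'add x, 0xFF' may become 'add x, -1' (useful for e.g. blsr matching).
  bool CanUseAllOnes = C != 1u && C != ~0u && (C | HighMask) == ~0u;
  assert(CanUseAllOnes);
  return 0;
}
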
+
+bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
+ const APInt &DemandedElts,
+ APInt &KnownUndef,
+ APInt &KnownZero,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+
+ bool Simplified =
+ SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO);
+ if (Simplified)
+ DCI.CommitTargetLoweringOpt(TLO);
+ return Simplified;
+}
+
+bool TargetLowering::SimplifyDemandedVectorElts(
+ SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef,
+ APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
+ bool AssumeSingleUse) const {
+ EVT VT = Op.getValueType();
+ APInt DemandedElts = DemandedEltMask;
+ unsigned NumElts = DemandedElts.getBitWidth();
+ assert(VT.isVector() && "Expected vector op");
+ assert(VT.getVectorNumElements() == NumElts &&
+ "Mask size mismatches value type element count!");
+
+ KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+
+ // Undef operand.
+ if (Op.isUndef()) {
+ KnownUndef.setAllBits();
+ return false;
+ }
+
+ // If Op has other users, assume that all elements are needed.
+ if (!Op.getNode()->hasOneUse() && !AssumeSingleUse)
+ DemandedElts.setAllBits();
+
+ // Not demanding any elements from Op.
+ if (DemandedElts == 0) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+
+ // Limit search depth.
+ if (Depth >= 6)
+ return false;
+
+ SDLoc DL(Op);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+ switch (Op.getOpcode()) {
+ case ISD::SCALAR_TO_VECTOR: {
+ if (!DemandedElts[0]) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+ KnownUndef.setHighBits(NumElts - 1);
+ break;
+ }
+ case ISD::BITCAST: {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // We only handle vectors here.
+ // TODO - investigate calling SimplifyDemandedBits/ComputeKnownBits?
+ if (!SrcVT.isVector())
+ break;
+
+ // Fast handling of 'identity' bitcasts.
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumSrcElts == NumElts)
+ return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth + 1);
+
+ APInt SrcZero, SrcUndef;
+ APInt SrcDemandedElts = APInt::getNullValue(NumSrcElts);
+
+ // Bitcast from 'large element' src vector to 'small element' vector, we
+ // must demand a source element if any DemandedElt maps to it.
+ if ((NumElts % NumSrcElts) == 0) {
+ unsigned Scale = NumElts / NumSrcElts;
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i])
+ SrcDemandedElts.setBit(i / Scale);
+
+ if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+ TLO, Depth + 1))
+ return true;
+
+ // If the src element is zero/undef, then all of the output elements it covers
+ // will be as well - but only the demanded elements are guaranteed to be correct.
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ if (SrcDemandedElts[i]) {
+ if (SrcZero[i])
+ KnownZero.setBits(i * Scale, (i + 1) * Scale);
+ if (SrcUndef[i])
+ KnownUndef.setBits(i * Scale, (i + 1) * Scale);
+ }
+ }
+ }
+
+ // Bitcast from 'small element' src vector to 'large element' vector, we
+ // demand all smaller source elements covered by the larger demanded element
+ // of this vector.
+ if ((NumSrcElts % NumElts) == 0) {
+ unsigned Scale = NumSrcElts / NumElts;
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i])
+ SrcDemandedElts.setBits(i * Scale, (i + 1) * Scale);
+
+ if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+ TLO, Depth + 1))
+ return true;
+
+ // If all the src elements covering an output element are zero/undef, then
+ // the output element will be as well, assuming it was demanded.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (DemandedElts[i]) {
+ if (SrcZero.extractBits(Scale, i * Scale).isAllOnesValue())
+ KnownZero.setBit(i);
+ if (SrcUndef.extractBits(Scale, i * Scale).isAllOnesValue())
+ KnownUndef.setBit(i);
+ }
+ }
+ }
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ // Check all elements and simplify any unused elements with UNDEF.
+ if (!DemandedElts.isAllOnesValue()) {
+ // Don't simplify BROADCASTS.
+ if (llvm::any_of(Op->op_values(),
+ [&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
+ SmallVector<SDValue, 32> Ops(Op->op_begin(), Op->op_end());
+ bool Updated = false;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i] && !Ops[i].isUndef()) {
+ Ops[i] = TLO.DAG.getUNDEF(Ops[0].getValueType());
+ KnownUndef.setBit(i);
+ Updated = true;
+ }
+ }
+ if (Updated)
+ return TLO.CombineTo(Op, TLO.DAG.getBuildVector(VT, DL, Ops));
+ }
+ }
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ if (SrcOp.isUndef()) {
+ KnownUndef.setBit(i);
+ } else if (EltSizeInBits == SrcOp.getScalarValueSizeInBits() &&
+ (isNullConstant(SrcOp) || isNullFPConstant(SrcOp))) {
+ KnownZero.setBit(i);
+ }
+ }
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVecs = Op.getNumOperands();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumSubVecs; ++i) {
+ SDValue SubOp = Op.getOperand(i);
+ APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+ APInt SubUndef, SubZero;
+ if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef.insertBits(SubUndef, i * NumSubElts);
+ KnownZero.insertBits(SubZero, i * NumSubElts);
+ }
+ break;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ if (!isa<ConstantSDNode>(Op.getOperand(2)))
+ break;
+ SDValue Base = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ EVT SubVT = Sub.getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ const APInt& Idx = cast<ConstantSDNode>(Op.getOperand(2))->getAPIntValue();
+ if (Idx.uge(NumElts - NumSubElts))
+ break;
+ unsigned SubIdx = Idx.getZExtValue();
+ APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
+ APInt SubUndef, SubZero;
+ if (SimplifyDemandedVectorElts(Sub, SubElts, SubUndef, SubZero, TLO,
+ Depth + 1))
+ return true;
+ APInt BaseElts = DemandedElts;
+ BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
+ if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef.insertBits(SubUndef, SubIdx);
+ KnownZero.insertBits(SubZero, SubIdx);
+ break;
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ break;
+ SDValue Src = Op.getOperand(0);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ const APInt& Idx = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ if (Idx.uge(NumSrcElts - NumElts))
+ break;
+ // Offset the demanded elts by the subvector index.
+ uint64_t SubIdx = Idx.getZExtValue();
+ APInt SrcElts = DemandedElts.zext(NumSrcElts).shl(SubIdx);
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef = SrcUndef.extractBits(NumElts, SubIdx);
+ KnownZero = SrcZero.extractBits(NumElts, SubIdx);
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+
+ // For a legal, constant insertion index, if we don't need this insertion
+ // then strip it, else remove it from the demanded elts.
+ if (CIdx && CIdx->getAPIntValue().ult(NumElts)) {
+ unsigned Idx = CIdx->getZExtValue();
+ if (!DemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+ DemandedElts.clearBit(Idx);
+
+ if (SimplifyDemandedVectorElts(Vec, DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ KnownUndef.clearBit(Idx);
+ if (Scl.isUndef())
+ KnownUndef.setBit(Idx);
+
+ KnownZero.clearBit(Idx);
+ if (isNullConstant(Scl) || isNullFPConstant(Scl))
+ KnownZero.setBit(Idx);
+ break;
+ }
+
+ APInt VecUndef, VecZero;
+ if (SimplifyDemandedVectorElts(Vec, DemandedElts, VecUndef, VecZero, TLO,
+ Depth + 1))
+ return true;
+ // Without knowing the insertion index we can't set KnownUndef/KnownZero.
+ break;
+ }
+ case ISD::VSELECT: {
+ APInt DemandedLHS(DemandedElts);
+ APInt DemandedRHS(DemandedElts);
+
+ // TODO - add support for constant vselect masks.
+
+ // See if we can simplify either vselect operand.
+ APInt UndefLHS, ZeroLHS;
+ APInt UndefRHS, ZeroRHS;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedLHS, UndefLHS,
+ ZeroLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedRHS, UndefRHS,
+ ZeroRHS, TLO, Depth + 1))
+ return true;
+
+ KnownUndef = UndefLHS & UndefRHS;
+ KnownZero = ZeroLHS & ZeroRHS;
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+ // Collect demanded elements from shuffle operands.
+ APInt DemandedLHS(NumElts, 0);
+ APInt DemandedRHS(NumElts, 0);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0 || !DemandedElts[i])
+ continue;
+ assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
+ if (M < (int)NumElts)
+ DemandedLHS.setBit(M);
+ else
+ DemandedRHS.setBit(M - NumElts);
+ }
+
+ // See if we can simplify either shuffle operand.
+ APInt UndefLHS, ZeroLHS;
+ APInt UndefRHS, ZeroRHS;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, UndefLHS,
+ ZeroLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, UndefRHS,
+ ZeroRHS, TLO, Depth + 1))
+ return true;
+
+ // Simplify mask using undef elements from LHS/RHS.
+ bool Updated = false;
+ bool IdentityLHS = true, IdentityRHS = true;
+ SmallVector<int, 32> NewMask(ShuffleMask.begin(), ShuffleMask.end());
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int &M = NewMask[i];
+ if (M < 0)
+ continue;
+ if (!DemandedElts[i] || (M < (int)NumElts && UndefLHS[M]) ||
+ (M >= (int)NumElts && UndefRHS[M - NumElts])) {
+ Updated = true;
+ M = -1;
+ }
+ IdentityLHS &= (M < 0) || (M == (int)i);
+ IdentityRHS &= (M < 0) || ((M - NumElts) == i);
+ }
+
+ // Update legal shuffle masks based on demanded elements, provided this won't
+ // reduce the mask to an identity, which could cause premature removal of the shuffle.
+ if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps &&
+ isShuffleMaskLegal(NewMask, VT))
+ return TLO.CombineTo(Op,
+ TLO.DAG.getVectorShuffle(VT, DL, Op.getOperand(0),
+ Op.getOperand(1), NewMask));
+
+ // Propagate undef/zero elements from LHS/RHS.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ } else if (M < (int)NumElts) {
+ if (UndefLHS[M])
+ KnownUndef.setBit(i);
+ if (ZeroLHS[M])
+ KnownZero.setBit(i);
+ } else {
+ if (UndefRHS[M - NumElts])
+ KnownUndef.setBit(i);
+ if (ZeroRHS[M - NumElts])
+ KnownZero.setBit(i);
+ }
+ }
+ break;
+ }
+ case ISD::ADD:
+ case ISD::SUB: {
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
+ SrcZero, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+ KnownZero &= SrcZero;
+ KnownUndef &= SrcUndef;
+ break;
+ }
+ case ISD::TRUNCATE:
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+ break;
+ default: {
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
+ if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth))
+ return true;
+ break;
+ }
}
+ assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero");
return false;
}
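
For the bitcast case in SimplifyDemandedVectorElts above, the demanded elements are rescaled between vectors with different element counts. A small standalone illustration of the large-to-small direction, assuming a bitcast from 2 x i64 to 4 x i32 (Scale = 2):

#include <bitset>
#include <cassert>

int main() {
  const unsigned NumElts = 4, NumSrcElts = 2;
  const unsigned Scale = NumElts / NumSrcElts;   // 2 result elements per source element
  std::bitset<4> DemandedElts("1010");           // demand result elements 1 and 3
  std::bitset<2> SrcDemandedElts;
  for (unsigned i = 0; i != NumElts; ++i)
    if (DemandedElts[i])
      SrcDemandedElts.set(i / Scale);
  assert(SrcDemandedElts.all());                 // both i64 source elements are needed
  return 0;
}
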
@@ -1316,6 +1699,18 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
return 1;
}
+bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use SimplifyDemandedVectorElts if you don't know whether Op"
+ " is a target node!");
+ return false;
+}
+
// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must
// work with truncating build vectors and vectors with elements of less than
// 8 bits.
@@ -1353,16 +1748,6 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const {
llvm_unreachable("Invalid boolean contents");
}
-SDValue TargetLowering::getConstTrueVal(SelectionDAG &DAG, EVT VT,
- const SDLoc &DL) const {
- unsigned ElementWidth = VT.getScalarSizeInBits();
- APInt TrueInt =
- getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent
- ? APInt(ElementWidth, 1)
- : APInt::getAllOnesValue(ElementWidth);
- return DAG.getConstant(TrueInt, DL, VT);
-}
-
bool TargetLowering::isConstFalseVal(const SDNode *N) const {
if (!N)
return false;
@@ -1466,6 +1851,89 @@ SDValue TargetLowering::simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
return SDValue();
}
+/// There are multiple IR patterns that could be checking whether certain
+/// truncation of a signed number would be lossy or not. The pattern that is
+/// best at the IR level may not lower optimally, so we want to unfold it.
+/// We are looking for the following pattern: (KeptBits is a constant)
+/// (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
+/// KeptBits won't be bitwidth(x), since that case is constant-folded to true/false.
+/// KeptBits also can't be 1, since that would have been folded to %x dstcond 0.
+/// We will unfold it into the natural trunc+sext pattern:
+/// ((%x << C) a>> C) dstcond %x
+/// Where C = bitwidth(x) - KeptBits and C u< bitwidth(x)
+SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck(
+ EVT SCCVT, SDValue N0, SDValue N1, ISD::CondCode Cond, DAGCombinerInfo &DCI,
+ const SDLoc &DL) const {
+ // We must be comparing with a constant.
+ ConstantSDNode *C1;
+ if (!(C1 = dyn_cast<ConstantSDNode>(N1)))
+ return SDValue();
+
+ // N0 should be: add %x, (1 << (KeptBits-1))
+ if (N0->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // And we must be 'add'ing a constant.
+ ConstantSDNode *C01;
+ if (!(C01 = dyn_cast<ConstantSDNode>(N0->getOperand(1))))
+ return SDValue();
+
+ SDValue X = N0->getOperand(0);
+ EVT XVT = X.getValueType();
+
+ // Validate constants ...
+
+ APInt I1 = C1->getAPIntValue();
+
+ ISD::CondCode NewCond;
+ if (Cond == ISD::CondCode::SETULT) {
+ NewCond = ISD::CondCode::SETEQ;
+ } else if (Cond == ISD::CondCode::SETULE) {
+ NewCond = ISD::CondCode::SETEQ;
+ // But need to 'canonicalize' the constant.
+ I1 += 1;
+ } else if (Cond == ISD::CondCode::SETUGT) {
+ NewCond = ISD::CondCode::SETNE;
+ // But need to 'canonicalize' the constant.
+ I1 += 1;
+ } else if (Cond == ISD::CondCode::SETUGE) {
+ NewCond = ISD::CondCode::SETNE;
+ } else
+ return SDValue();
+
+ const APInt &I01 = C01->getAPIntValue();
+ // Both of them must be power-of-two, and the constant from setcc is bigger.
+ if (!(I1.ugt(I01) && I1.isPowerOf2() && I01.isPowerOf2()))
+ return SDValue();
+
+ // They are power-of-two, so which bit is set?
+ const unsigned KeptBits = I1.logBase2();
+ const unsigned KeptBitsMinusOne = I01.logBase2();
+
+ // Magic!
+ if (KeptBits != (KeptBitsMinusOne + 1))
+ return SDValue();
+ assert(KeptBits > 0 && KeptBits < XVT.getSizeInBits() && "unreachable");
+
+ // We don't want to do this in every single case.
+ SelectionDAG &DAG = DCI.DAG;
+ if (!DAG.getTargetLoweringInfo().shouldTransformSignedTruncationCheck(
+ XVT, KeptBits))
+ return SDValue();
+
+ const unsigned MaskedBits = XVT.getSizeInBits() - KeptBits;
+ assert(MaskedBits > 0 && MaskedBits < XVT.getSizeInBits() && "unreachable");
+
+ // Unfold into: ((%x << C) a>> C) cond %x
+ // Where 'cond' will be either 'eq' or 'ne'.
+ SDValue ShiftAmt = DAG.getConstant(MaskedBits, DL, XVT);
+ SDValue T0 = DAG.getNode(ISD::SHL, DL, XVT, X, ShiftAmt);
+ SDValue T1 = DAG.getNode(ISD::SRA, DL, XVT, T0, ShiftAmt);
+ SDValue T2 = DAG.getSetCC(DL, SCCVT, T1, X, NewCond);
+
+ return T2;
+}
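
A concrete instance of the unfold performed by optimizeSetCCOfSignedTruncationCheck, assuming i32 values and KeptBits = 8 (so C = 24); this is only a hand-check of the equivalence, not DAG code, and it assumes the compiler's >> on a negative int is an arithmetic shift:

#include <cassert>
#include <cstdint>

// (x + (1 << 7)) u< (1 << 8)  should agree with  ((x << 24) a>> 24) == x
static bool SrcForm(int32_t x) { return (uint32_t)(x + 128) < 256u; }
static bool DstForm(int32_t x) {
  int32_t SExt = (int32_t)((uint32_t)x << 24) >> 24;  // low 8 bits, sign-extended
  return SExt == x;
}

int main() {
  for (int32_t x : {-300, -129, -128, -1, 0, 1, 127, 128, 300})
    assert(SrcForm(x) == DstForm(x));
  return 0;
}
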
+
/// Try to simplify a setcc built with the specified operands and cc. If it is
/// unable to simplify it, return a null SDValue.
SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
@@ -1473,25 +1941,21 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
DAGCombinerInfo &DCI,
const SDLoc &dl) const {
SelectionDAG &DAG = DCI.DAG;
+ EVT OpVT = N0.getValueType();
// These setcc operations always fold.
switch (Cond) {
default: break;
case ISD::SETFALSE:
- case ISD::SETFALSE2: return DAG.getConstant(0, dl, VT);
+ case ISD::SETFALSE2: return DAG.getBoolConstant(false, dl, VT, OpVT);
case ISD::SETTRUE:
- case ISD::SETTRUE2: {
- TargetLowering::BooleanContent Cnt =
- getBooleanContents(N0->getValueType(0));
- return DAG.getConstant(
- Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, dl,
- VT);
- }
+ case ISD::SETTRUE2: return DAG.getBoolConstant(true, dl, VT, OpVT);
}
// Ensure that the constant occurs on the RHS and fold constant comparisons.
+ // TODO: Handle non-splat vector constants. All undef causes trouble.
ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
- if (isa<ConstantSDNode>(N0.getNode()) &&
+ if (isConstOrConstSplat(N0) &&
(DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
@@ -1737,7 +2201,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
EVT newVT = N0.getOperand(0).getValueType();
if (DCI.isBeforeLegalizeOps() ||
(isOperationLegal(ISD::SETCC, newVT) &&
- getCondCodeAction(Cond, newVT.getSimpleVT()) == Legal)) {
+ isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
EVT NewSetCCVT =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), newVT);
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
@@ -1867,8 +2331,18 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
+ if (SDValue V =
+ optimizeSetCCOfSignedTruncationCheck(VT, N0, N1, Cond, DCI, dl))
+ return V;
+ }
+
+ // These simplifications apply to splat vectors as well.
+ // TODO: Handle more splat vector cases.
+ if (auto *N1C = isConstOrConstSplat(N1)) {
+ const APInt &C1 = N1C->getAPIntValue();
+
APInt MinVal, MaxVal;
- unsigned OperandBitSize = N1C->getValueType(0).getSizeInBits();
+ unsigned OperandBitSize = N1C->getValueType(0).getScalarSizeInBits();
if (ISD::isSignedIntSetCC(Cond)) {
MinVal = APInt::getSignedMinValue(OperandBitSize);
MaxVal = APInt::getSignedMaxValue(OperandBitSize);
@@ -1881,84 +2355,105 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
// X >= MIN --> true
if (C1 == MinVal)
- return DAG.getConstant(1, dl, VT);
-
- // X >= C0 --> X > (C0 - 1)
- APInt C = C1 - 1;
- ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
- if ((DCI.isBeforeLegalizeOps() ||
- isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
- (!N1C->isOpaque() || (N1C->isOpaque() && C.getBitWidth() <= 64 &&
- isLegalICmpImmediate(C.getSExtValue())))) {
- return DAG.getSetCC(dl, VT, N0,
- DAG.getConstant(C, dl, N1.getValueType()),
- NewCC);
+ return DAG.getBoolConstant(true, dl, VT, OpVT);
+
+ if (!VT.isVector()) { // TODO: Support this for vectors.
+ // X >= C0 --> X > (C0 - 1)
+ APInt C = C1 - 1;
+ ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
+ if ((DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
+ (!N1C->isOpaque() || (C.getBitWidth() <= 64 &&
+ isLegalICmpImmediate(C.getSExtValue())))) {
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C, dl, N1.getValueType()),
+ NewCC);
+ }
}
}
if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
// X <= MAX --> true
if (C1 == MaxVal)
- return DAG.getConstant(1, dl, VT);
+ return DAG.getBoolConstant(true, dl, VT, OpVT);
// X <= C0 --> X < (C0 + 1)
- APInt C = C1 + 1;
- ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
- if ((DCI.isBeforeLegalizeOps() ||
- isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
- (!N1C->isOpaque() || (N1C->isOpaque() && C.getBitWidth() <= 64 &&
- isLegalICmpImmediate(C.getSExtValue())))) {
- return DAG.getSetCC(dl, VT, N0,
- DAG.getConstant(C, dl, N1.getValueType()),
- NewCC);
- }
- }
-
- if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal)
- return DAG.getConstant(0, dl, VT); // X < MIN --> false
- if ((Cond == ISD::SETGE || Cond == ISD::SETUGE) && C1 == MinVal)
- return DAG.getConstant(1, dl, VT); // X >= MIN --> true
- if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal)
- return DAG.getConstant(0, dl, VT); // X > MAX --> false
- if ((Cond == ISD::SETLE || Cond == ISD::SETULE) && C1 == MaxVal)
- return DAG.getConstant(1, dl, VT); // X <= MAX --> true
-
- // Canonicalize setgt X, Min --> setne X, Min
- if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MinVal)
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
- // Canonicalize setlt X, Max --> setne X, Max
- if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MaxVal)
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
-
- // If we have setult X, 1, turn it into seteq X, 0
- if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal+1)
- return DAG.getSetCC(dl, VT, N0,
- DAG.getConstant(MinVal, dl, N0.getValueType()),
- ISD::SETEQ);
- // If we have setugt X, Max-1, turn it into seteq X, Max
- if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal-1)
- return DAG.getSetCC(dl, VT, N0,
- DAG.getConstant(MaxVal, dl, N0.getValueType()),
- ISD::SETEQ);
+ if (!VT.isVector()) { // TODO: Support this for vectors.
+ APInt C = C1 + 1;
+ ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
+ if ((DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
+ (!N1C->isOpaque() || (C.getBitWidth() <= 64 &&
+ isLegalICmpImmediate(C.getSExtValue())))) {
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C, dl, N1.getValueType()),
+ NewCC);
+ }
+ }
+ }
- // If we have "setcc X, C0", check to see if we can shrink the immediate
- // by changing cc.
+ if (Cond == ISD::SETLT || Cond == ISD::SETULT) {
+ if (C1 == MinVal)
+ return DAG.getBoolConstant(false, dl, VT, OpVT); // X < MIN --> false
+
+ // TODO: Support this for vectors after legalize ops.
+ if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
+ // Canonicalize setlt X, Max --> setne X, Max
+ if (C1 == MaxVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+ // If we have setult X, 1, turn it into seteq X, 0
+ if (C1 == MinVal+1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MinVal, dl, N0.getValueType()),
+ ISD::SETEQ);
+ }
+ }
- // SETUGT X, SINTMAX -> SETLT X, 0
- if (Cond == ISD::SETUGT &&
- C1 == APInt::getSignedMaxValue(OperandBitSize))
- return DAG.getSetCC(dl, VT, N0,
- DAG.getConstant(0, dl, N1.getValueType()),
- ISD::SETLT);
+ if (Cond == ISD::SETGT || Cond == ISD::SETUGT) {
+ if (C1 == MaxVal)
+ return DAG.getBoolConstant(false, dl, VT, OpVT); // X > MAX --> false
+
+ // TODO: Support this for vectors after legalize ops.
+ if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
+ // Canonicalize setgt X, Min --> setne X, Min
+ if (C1 == MinVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+ // If we have setugt X, Max-1, turn it into seteq X, Max
+ if (C1 == MaxVal-1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MaxVal, dl, N0.getValueType()),
+ ISD::SETEQ);
+ }
+ }
- // SETULT X, SINTMIN -> SETGT X, -1
- if (Cond == ISD::SETULT &&
- C1 == APInt::getSignedMinValue(OperandBitSize)) {
- SDValue ConstMinusOne =
- DAG.getConstant(APInt::getAllOnesValue(OperandBitSize), dl,
- N1.getValueType());
- return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
+ // If we have "setcc X, C0", check to see if we can shrink the immediate
+ // by changing cc.
+ // TODO: Support this for vectors after legalize ops.
+ if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
+ // SETUGT X, SINTMAX -> SETLT X, 0
+ if (Cond == ISD::SETUGT &&
+ C1 == APInt::getSignedMaxValue(OperandBitSize))
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(0, dl, N1.getValueType()),
+ ISD::SETLT);
+
+ // SETULT X, SINTMIN -> SETGT X, -1
+ if (Cond == ISD::SETULT &&
+ C1 == APInt::getSignedMinValue(OperandBitSize)) {
+ SDValue ConstMinusOne =
+ DAG.getConstant(APInt::getAllOnesValue(OperandBitSize), dl,
+ N1.getValueType());
+ return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
+ }
}
+ }
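
Two of the immediate rewrites above are easy to sanity-check by hand; a standalone spot check for i32, assuming the usual two's-complement interpretation (SETUGT X, SINTMAX becomes SETLT X, 0, and SETULT X, SINTMIN becomes SETGT X, -1):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t Samples[] = {INT64_C(-2147483648), -2, -1, 0, 1,
                             INT64_C(2147483647)};
  for (int64_t v : Samples) {
    uint32_t u = (uint32_t)v;   // the value as seen by the unsigned compare
    int32_t s = (int32_t)v;     // the same bits as a signed value
    assert((u > 0x7FFFFFFFu) == (s < 0));    // SETUGT X, SINTMAX  <=>  SETLT X, 0
    assert((u < 0x80000000u) == (s > -1));   // SETULT X, SINTMIN  <=>  SETGT X, -1
  }
  return 0;
}
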
+
+ // Back to non-vector simplifications.
+ // TODO: Can we do these for vector splats?
+ if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ const APInt &C1 = N1C->getAPIntValue();
// Fold bit comparisons when we can.
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
@@ -1967,9 +2462,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
N0.getOpcode() == ISD::AND) {
auto &DL = DAG.getDataLayout();
if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
- EVT ShiftTy = DCI.isBeforeLegalize()
- ? getPointerTy(DL)
- : getShiftAmountTy(N0.getValueType(), DL);
+ EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
+ !DCI.isBeforeLegalize());
if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3
// Perform the xform if the AND RHS is a single bit.
if (AndRHS->getAPIntValue().isPowerOf2()) {
@@ -2001,9 +2495,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
unsigned ShiftBits = AndRHSC.countTrailingZeros();
auto &DL = DAG.getDataLayout();
- EVT ShiftTy = DCI.isBeforeLegalize()
- ? getPointerTy(DL)
- : getShiftAmountTy(N0.getValueType(), DL);
+ EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
+ !DCI.isBeforeLegalize());
EVT CmpTy = N0.getValueType();
SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0),
DAG.getConstant(ShiftBits, dl,
@@ -2033,9 +2526,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (ShiftBits && NewC.getMinSignedBits() <= 64 &&
isLegalICmpImmediate(NewC.getSExtValue())) {
auto &DL = DAG.getDataLayout();
- EVT ShiftTy = DCI.isBeforeLegalize()
- ? getPointerTy(DL)
- : getShiftAmountTy(N0.getValueType(), DL);
+ EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
+ !DCI.isBeforeLegalize());
EVT CmpTy = N0.getValueType();
SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0,
DAG.getConstant(ShiftBits, dl, ShiftTy));
@@ -2058,9 +2550,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
switch (ISD::getUnorderedFlavor(Cond)) {
default: llvm_unreachable("Unknown flavor!");
case 0: // Known false.
- return DAG.getConstant(0, dl, VT);
+ return DAG.getBoolConstant(false, dl, VT, OpVT);
case 1: // Known true.
- return DAG.getConstant(1, dl, VT);
+ return DAG.getBoolConstant(true, dl, VT, OpVT);
case 2: // Undefined.
return DAG.getUNDEF(VT);
}
@@ -2124,31 +2616,24 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (N0 == N1) {
// The sext(setcc()) => setcc() optimization relies on the appropriate
// constant being emitted.
- uint64_t EqVal = 0;
- switch (getBooleanContents(N0.getValueType())) {
- case UndefinedBooleanContent:
- case ZeroOrOneBooleanContent:
- EqVal = ISD::isTrueWhenEqual(Cond);
- break;
- case ZeroOrNegativeOneBooleanContent:
- EqVal = ISD::isTrueWhenEqual(Cond) ? -1 : 0;
- break;
- }
+
+ bool EqTrue = ISD::isTrueWhenEqual(Cond);
// We can always fold X == X for integer setcc's.
- if (N0.getValueType().isInteger()) {
- return DAG.getConstant(EqVal, dl, VT);
- }
+ if (N0.getValueType().isInteger())
+ return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
+
unsigned UOF = ISD::getUnorderedFlavor(Cond);
if (UOF == 2) // FP operators that are undefined on NaNs.
- return DAG.getConstant(EqVal, dl, VT);
- if (UOF == unsigned(ISD::isTrueWhenEqual(Cond)))
- return DAG.getConstant(EqVal, dl, VT);
+ return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
+ if (UOF == unsigned(EqTrue))
+ return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
// Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO
// if it is not already.
ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
- if (NewCond != Cond && (DCI.isBeforeLegalizeOps() ||
- getCondCodeAction(NewCond, N0.getSimpleValueType()) == Legal))
+ if (NewCond != Cond &&
+ (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCond, N0.getSimpleValueType())))
return DAG.getSetCC(dl, VT, N0, N1, NewCond);
}
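
The X == X reasoning above hinges on NaNs: an ordered-equal compare of a value with itself is not always true, but it is exactly an 'ordered' check, which is why the code falls back to SETO/SETUO. A quick standalone illustration with assumed sample values:

#include <cassert>
#include <cmath>

int main() {
  const double Vals[] = {0.0, -3.5, std::nan("")};
  for (double x : Vals) {
    bool OEQ = (x == x);        // setoeq x, x: false only for NaN
    bool ORD = !std::isnan(x);  // seto x, x: neither operand is NaN
    assert(OEQ == ORD);         // so the compare can be simplified to SETO
  }
  return 0;
}
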
@@ -2237,7 +2722,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
SDValue SH = DAG.getNode(
ISD::SHL, dl, N1.getValueType(), N1,
DAG.getConstant(1, dl,
- getShiftAmountTy(N1.getValueType(), DL)));
+ getShiftAmountTy(N1.getValueType(), DL,
+ !DCI.isBeforeLegalize())));
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(SH.getNode());
return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond);
@@ -2262,7 +2748,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// X == (Z-X) --> X<<1 == Z
SDValue SH = DAG.getNode(
ISD::SHL, dl, N1.getValueType(), N0,
- DAG.getConstant(1, dl, getShiftAmountTy(N0.getValueType(), DL)));
+ DAG.getConstant(1, dl, getShiftAmountTy(N0.getValueType(), DL,
+ !DCI.isBeforeLegalize())));
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(SH.getNode());
return DAG.getSetCC(dl, VT, SH, N1.getOperand(0), Cond);
@@ -2276,50 +2763,52 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// Fold away ALL boolean setcc's.
SDValue Temp;
- if (N0.getValueType() == MVT::i1 && foldBooleans) {
+ if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
+ EVT OpVT = N0.getValueType();
switch (Cond) {
default: llvm_unreachable("Unknown integer setcc!");
case ISD::SETEQ: // X == Y -> ~(X^Y)
- Temp = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
- N0 = DAG.getNOT(dl, Temp, MVT::i1);
+ Temp = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
+ N0 = DAG.getNOT(dl, Temp, OpVT);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETNE: // X != Y --> (X^Y)
- N0 = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+ N0 = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
break;
case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
- Temp = DAG.getNOT(dl, N0, MVT::i1);
- N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N1, Temp);
+ Temp = DAG.getNOT(dl, N0, OpVT);
+ N0 = DAG.getNode(ISD::AND, dl, OpVT, N1, Temp);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
- Temp = DAG.getNOT(dl, N1, MVT::i1);
- N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N0, Temp);
+ Temp = DAG.getNOT(dl, N1, OpVT);
+ N0 = DAG.getNode(ISD::AND, dl, OpVT, N0, Temp);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETULE: // X <=u Y --> X == 0 | Y == 1 --> ~X | Y
case ISD::SETGE: // X >=s Y --> X == 0 | Y == 1 --> ~X | Y
- Temp = DAG.getNOT(dl, N0, MVT::i1);
- N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N1, Temp);
+ Temp = DAG.getNOT(dl, N0, OpVT);
+ N0 = DAG.getNode(ISD::OR, dl, OpVT, N1, Temp);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETUGE: // X >=u Y --> X == 1 | Y == 0 --> ~Y | X
case ISD::SETLE: // X <=s Y --> X == 1 | Y == 0 --> ~Y | X
- Temp = DAG.getNOT(dl, N1, MVT::i1);
- N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N0, Temp);
+ Temp = DAG.getNOT(dl, N1, OpVT);
+ N0 = DAG.getNode(ISD::OR, dl, OpVT, N0, Temp);
break;
}
- if (VT != MVT::i1) {
+ if (VT.getScalarType() != MVT::i1) {
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(N0.getNode());
// FIXME: If running after legalize, we probably can't do this.
- N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0);
+ ISD::NodeType ExtendCode = getExtendForContent(getBooleanContents(OpVT));
+ N0 = DAG.getNode(ExtendCode, dl, VT, N0);
}
return N0;
}
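
The boolean-setcc folds above replace i1 (and now i1-vector) compares with plain logic ops; each case is a two-variable truth table. A minimal check of one of them, X <u Y --> ~X & Y, over all i1 inputs:

#include <cassert>

int main() {
  for (unsigned X = 0; X <= 1; ++X)
    for (unsigned Y = 0; Y <= 1; ++Y)
      assert((X < Y) == (((~X & Y) & 1u) != 0));
  return 0;
}
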
@@ -2928,7 +3417,7 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
}
}
-/// \brief Given an exact SDIV by a constant, create a multiplication
+/// Given an exact SDIV by a constant, create a multiplication
/// with the multiplicative inverse of the constant.
static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
const SDLoc &dl, SelectionDAG &DAG,
@@ -2970,7 +3459,7 @@ SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
return SDValue();
}
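
BuildExactSDIV, described just above, replaces an exact signed divide by an odd constant with a multiply by that constant's multiplicative inverse modulo 2^BitWidth. A numeric spot check for division by 3 on i32, whose inverse is 0xAAAAAAAB (values assumed for illustration):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Inv3 = 0xAAAAAAABu;        // 3 * Inv3 == 1 (mod 2^32)
  assert((uint32_t)(3u * Inv3) == 1u);
  const int32_t Samples[] = {-30, -3, 0, 3, 21, 300};  // all exact multiples of 3
  for (int32_t x : Samples)
    assert((int32_t)((uint32_t)x * Inv3) == x / 3);
  return 0;
}
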
-/// \brief Given an ISD::SDIV node expressing a divide by constant,
+/// Given an ISD::SDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
@@ -3034,7 +3523,7 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor,
return DAG.getNode(ISD::ADD, dl, VT, Q, T);
}
-/// \brief Given an ISD::UDIV node expressing a divide by constant,
+/// Given an ISD::UDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
@@ -3413,9 +3902,6 @@ SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
return DAG.getMergeValues({ Value, NewChain }, SL);
}
-// FIXME: This relies on each element having a byte size, otherwise the stride
-// is 0 and just overwrites the same location. ExpandStore currently expects
-// this broken behavior.
SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
SelectionDAG &DAG) const {
SDLoc SL(ST);
@@ -3432,11 +3918,43 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
// The type of data as saved in memory.
EVT MemSclVT = StVT.getScalarType();
- // Store Stride in bytes
- unsigned Stride = MemSclVT.getSizeInBits() / 8;
EVT IdxVT = getVectorIdxTy(DAG.getDataLayout());
unsigned NumElem = StVT.getVectorNumElements();
+ // A vector must always be stored in memory as-is, i.e. without any padding
+ // between the elements, since various code depends on it, e.g. in the
+ // handling of a bitcast of a vector type to int, which may be done with a
+ // vector store followed by an integer load. A vector that does not have
+ // elements that are byte-sized must therefore be stored as an integer
+ // built out of the extracted vector elements.
+ if (!MemSclVT.isByteSized()) {
+ unsigned NumBits = StVT.getSizeInBits();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumBits);
+
+ SDValue CurrVal = DAG.getConstant(0, SL, IntVT);
+
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegSclVT, Value,
+ DAG.getConstant(Idx, SL, IdxVT));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MemSclVT, Elt);
+ SDValue ExtElt = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Trunc);
+ unsigned ShiftIntoIdx =
+ (DAG.getDataLayout().isBigEndian() ? (NumElem - 1) - Idx : Idx);
+ SDValue ShiftAmount =
+ DAG.getConstant(ShiftIntoIdx * MemSclVT.getSizeInBits(), SL, IntVT);
+ SDValue ShiftedElt =
+ DAG.getNode(ISD::SHL, SL, IntVT, ExtElt, ShiftAmount);
+ CurrVal = DAG.getNode(ISD::OR, SL, IntVT, CurrVal, ShiftedElt);
+ }
+
+ return DAG.getStore(Chain, SL, CurrVal, BasePtr, ST->getPointerInfo(),
+ ST->getAlignment(), ST->getMemOperand()->getFlags(),
+ ST->getAAInfo());
+ }
+
+ // Store Stride in bytes
+ unsigned Stride = MemSclVT.getSizeInBits() / 8;
+ assert(Stride && "Zero stride!");
// Extract each of the elements from the original vector and save them into
// memory individually.
SmallVector<SDValue, 8> Stores;
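
The new non-byte-sized path in scalarizeVectorStore above packs the elements into one integer instead of using a per-element byte stride. A plain C++ sketch of that packing loop for a hypothetical 4 x i1 store on a little-endian target:

#include <cassert>
#include <cstdint>

int main() {
  const bool Elems[4] = {true, false, true, true};  // vector elements 0..3
  const unsigned EltBits = 1, NumElem = 4;
  const bool BigEndian = false;
  uint32_t CurrVal = 0;                             // the iN value being assembled
  for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
    unsigned ShiftIntoIdx = BigEndian ? (NumElem - 1) - Idx : Idx;
    CurrVal |= (uint32_t)Elems[Idx] << (ShiftIntoIdx * EltBits);
  }
  assert(CurrVal == 0xDu);   // element 0 lands in bit 0, element 3 in bit 3
  return 0;
}
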
@@ -3475,6 +3993,8 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
if (!isOperationLegalOrCustom(ISD::LOAD, intVT)) {
// Scalarize the load and let the individual components be handled.
SDValue Scalarized = scalarizeVectorLoad(LD, DAG);
+ if (Scalarized->getOpcode() == ISD::MERGE_VALUES)
+ return std::make_pair(Scalarized.getOperand(0), Scalarized.getOperand(1));
return std::make_pair(Scalarized.getValue(0), Scalarized.getValue(1));
}
diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
index b35bf6ba3a7b..d3454ca6ba6a 100644
--- a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -53,6 +53,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -62,11 +63,13 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Attributes.h"
@@ -97,7 +100,7 @@ EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden,
namespace {
-/// \brief Class to determine where the safe point to insert the
+/// Class to determine where the safe point to insert the
/// prologue and epilogue are.
/// Unlike the paper from Fred C. Chow, PLDI'88, that introduces the
/// shrink-wrapping term for prologue/epilogue placement, this pass
@@ -128,6 +131,9 @@ class ShrinkWrap : public MachineFunctionPass {
/// are in the same loop.
MachineLoopInfo *MLI;
+ // Emit remarks.
+ MachineOptimizationRemarkEmitter *ORE = nullptr;
+
/// Frequency of the Entry block.
uint64_t EntryFreq;
@@ -137,6 +143,9 @@ class ShrinkWrap : public MachineFunctionPass {
/// Current opcode for frame destroy.
unsigned FrameDestroyOpcode;
+ /// Stack pointer register, used by llvm.{stacksave,stackrestore}
+ unsigned SP;
+
/// Entry block.
const MachineBasicBlock *Entry;
@@ -148,7 +157,7 @@ class ShrinkWrap : public MachineFunctionPass {
/// Current MachineFunction.
MachineFunction *MachineFunc;
- /// \brief Check if \p MI uses or defines a callee-saved register or
+ /// Check if \p MI uses or defines a callee-saved register or
/// a frame index. If this is the case, this means \p MI must happen
/// after Save and before Restore.
bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const;
@@ -168,14 +177,14 @@ class ShrinkWrap : public MachineFunctionPass {
return CurrentCSRs;
}
- /// \brief Update the Save and Restore points such that \p MBB is in
+ /// Update the Save and Restore points such that \p MBB is in
/// the region that is dominated by Save and post-dominated by Restore
/// and Save and Restore still match the safe point definition.
/// Such point may not exist and Save and/or Restore may be null after
/// this call.
void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS);
- /// \brief Initialize the pass for \p MF.
+ /// Initialize the pass for \p MF.
void init(MachineFunction &MF) {
RCI.runOnMachineFunction(MF);
MDT = &getAnalysis<MachineDominatorTree>();
@@ -184,10 +193,13 @@ class ShrinkWrap : public MachineFunctionPass {
Restore = nullptr;
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
MLI = &getAnalysis<MachineLoopInfo>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
EntryFreq = MBFI->getEntryFreq();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
FrameSetupOpcode = TII.getCallFrameSetupOpcode();
FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+ SP = Subtarget.getTargetLowering()->getStackPointerRegisterToSaveRestore();
Entry = &MF.front();
CurrentCSRs.clear();
MachineFunc = &MF;
@@ -199,7 +211,7 @@ class ShrinkWrap : public MachineFunctionPass {
/// shrink-wrapping.
bool ArePointsInteresting() const { return Save != Entry && Save && Restore; }
- /// \brief Check if shrink wrapping is enabled for this target and function.
+ /// Check if shrink wrapping is enabled for this target and function.
static bool isShrinkWrapEnabled(const MachineFunction &MF);
public:
@@ -215,12 +227,18 @@ public:
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachinePostDominatorTree>();
AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
StringRef getPassName() const override { return "Shrink Wrapping analysis"; }
- /// \brief Perform the shrink-wrapping analysis and update
+ /// Perform the shrink-wrapping analysis and update
/// the MachineFrameInfo attached to \p MF with the results.
bool runOnMachineFunction(MachineFunction &MF) override;
};
@@ -236,28 +254,34 @@ INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false)
bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
RegScavenger *RS) const {
- // Ignore DBG_VALUE and other meta instructions that must not affect codegen.
- if (MI.isMetaInstruction())
- return false;
-
if (MI.getOpcode() == FrameSetupOpcode ||
MI.getOpcode() == FrameDestroyOpcode) {
- DEBUG(dbgs() << "Frame instruction: " << MI << '\n');
+ LLVM_DEBUG(dbgs() << "Frame instruction: " << MI << '\n');
return true;
}
for (const MachineOperand &MO : MI.operands()) {
bool UseOrDefCSR = false;
if (MO.isReg()) {
+ // Ignore instructions like DBG_VALUE which don't read/def the register.
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
unsigned PhysReg = MO.getReg();
if (!PhysReg)
continue;
assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
"Unallocated register?!");
- UseOrDefCSR = RCI.getLastCalleeSavedAlias(PhysReg);
+ // The stack pointer is not normally described as a callee-saved register
+ // in calling convention definitions, so we need to watch for it
+ // separately. We can ignore an SP mentioned by a call instruction,
+ // though, as it's harmless and we do not want to effectively disable tail
+ // calls by forcing the restore point to post-dominate them.
+ UseOrDefCSR = (!MI.isCall() && PhysReg == SP) ||
+ RCI.getLastCalleeSavedAlias(PhysReg);
} else if (MO.isRegMask()) {
// Check if this regmask clobbers any of the CSRs.
for (unsigned Reg : getCurrentCSRs(RS)) {
@@ -267,16 +291,17 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
}
}
}
- if (UseOrDefCSR || MO.isFI()) {
- DEBUG(dbgs() << "Use or define CSR(" << UseOrDefCSR << ") or FI("
- << MO.isFI() << "): " << MI << '\n');
+ // Skip FrameIndex operands in DBG_VALUE instructions.
+ if (UseOrDefCSR || (MO.isFI() && !MI.isDebugValue())) {
+ LLVM_DEBUG(dbgs() << "Use or define CSR(" << UseOrDefCSR << ") or FI("
+ << MO.isFI() << "): " << MI << '\n');
return true;
}
}
return false;
}
-/// \brief Helper function to find the immediate (post) dominator.
+/// Helper function to find the immediate (post) dominator.
template <typename ListOfBBs, typename DominanceAnalysis>
static MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs,
DominanceAnalysis &Dom) {
@@ -300,7 +325,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
Save = MDT->findNearestCommonDominator(Save, &MBB);
if (!Save) {
- DEBUG(dbgs() << "Found a block that is not reachable from Entry\n");
+ LLVM_DEBUG(dbgs() << "Found a block that is not reachable from Entry\n");
return;
}
@@ -334,7 +359,8 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
}
if (!Restore) {
- DEBUG(dbgs() << "Restore point needs to be spanned on several blocks\n");
+ LLVM_DEBUG(
+ dbgs() << "Restore point needs to be spanned on several blocks\n");
return;
}
@@ -413,38 +439,16 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
}
}
-/// Check whether the edge (\p SrcBB, \p DestBB) is a backedge according to MLI.
-/// I.e., check if it exists a loop that contains SrcBB and where DestBB is the
-/// loop header.
-static bool isProperBackedge(const MachineLoopInfo &MLI,
- const MachineBasicBlock *SrcBB,
- const MachineBasicBlock *DestBB) {
- for (const MachineLoop *Loop = MLI.getLoopFor(SrcBB); Loop;
- Loop = Loop->getParentLoop()) {
- if (Loop->getHeader() == DestBB)
- return true;
- }
- return false;
-}
+static bool giveUpWithRemarks(MachineOptimizationRemarkEmitter *ORE,
+ StringRef RemarkName, StringRef RemarkMessage,
+ const DiagnosticLocation &Loc,
+ const MachineBasicBlock *MBB) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(DEBUG_TYPE, RemarkName, Loc, MBB)
+ << RemarkMessage;
+ });
-/// Check if the CFG of \p MF is irreducible.
-static bool isIrreducibleCFG(const MachineFunction &MF,
- const MachineLoopInfo &MLI) {
- const MachineBasicBlock *Entry = &*MF.begin();
- ReversePostOrderTraversal<const MachineBasicBlock *> RPOT(Entry);
- BitVector VisitedBB(MF.getNumBlockIDs());
- for (const MachineBasicBlock *MBB : RPOT) {
- VisitedBB.set(MBB->getNumber());
- for (const MachineBasicBlock *SuccBB : MBB->successors()) {
- if (!VisitedBB.test(SuccBB->getNumber()))
- continue;
- // We already visited SuccBB, thus MBB->SuccBB must be a backedge.
- // Check that the head matches what we have in the loop information.
- // Otherwise, we have an irreducible graph.
- if (!isProperBackedge(MLI, MBB, SuccBB))
- return true;
- }
- }
+ LLVM_DEBUG(dbgs() << RemarkMessage << '\n');
return false;
}
@@ -452,19 +456,21 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
return false;
- DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
init(MF);
- if (isIrreducibleCFG(MF, *MLI)) {
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+ if (containsIrreducibleCFG<MachineBasicBlock *>(RPOT, *MLI)) {
// If MF is irreducible, a block may be in a loop without
// MachineLoopInfo reporting it. I.e., we may use the
// post-dominance property in loops, which lead to incorrect
// results. Moreover, we may miss that the prologue and
// epilogue are not in the same loop, leading to unbalanced
// construction/deconstruction of the stack frame.
- DEBUG(dbgs() << "Irreducible CFGs are not supported yet\n");
- return false;
+ return giveUpWithRemarks(ORE, "UnsupportedIrreducibleCFG",
+ "Irreducible CFGs are not supported yet.",
+ MF.getFunction().getSubprogram(), &MF.front());
}
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
@@ -472,12 +478,28 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr);
for (MachineBasicBlock &MBB : MF) {
- DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' ' << MBB.getName()
- << '\n');
-
- if (MBB.isEHFuncletEntry()) {
- DEBUG(dbgs() << "EH Funclets are not supported yet.\n");
- return false;
+ LLVM_DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' '
+ << MBB.getName() << '\n');
+
+ if (MBB.isEHFuncletEntry())
+ return giveUpWithRemarks(ORE, "UnsupportedEHFunclets",
+ "EH Funclets are not supported yet.",
+ MBB.front().getDebugLoc(), &MBB);
+
+ if (MBB.isEHPad()) {
+ // Push the prologue and epilogue outside of
+ // the region that may throw by making sure
+ // that all the landing pads are at least at the
+ // boundary of the save and restore points.
+ // The problem with exceptions is that the throw
+ // is not properly modeled and in particular, a
+ // basic block can jump out from the middle.
+ updateSaveRestorePoints(MBB, RS.get());
+ if (!ArePointsInteresting()) {
+ LLVM_DEBUG(dbgs() << "EHPad prevents shrink-wrapping\n");
+ return false;
+ }
+ continue;
}
for (const MachineInstr &MI : MBB) {
@@ -489,7 +511,7 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
// If we are at a point where we cannot improve the placement of
// save/restore instructions, just give up.
if (!ArePointsInteresting()) {
- DEBUG(dbgs() << "No Shrink wrap candidate found\n");
+ LLVM_DEBUG(dbgs() << "No Shrink wrap candidate found\n");
return false;
}
// No need to look for other instructions, this basic block
@@ -502,20 +524,21 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
// because it means we did not encounter any frame/CSR related code.
// Otherwise, we would have returned from the previous loop.
assert(!Save && !Restore && "We miss a shrink-wrap opportunity?!");
- DEBUG(dbgs() << "Nothing to shrink-wrap\n");
+ LLVM_DEBUG(dbgs() << "Nothing to shrink-wrap\n");
return false;
}
- DEBUG(dbgs() << "\n ** Results **\nFrequency of the Entry: " << EntryFreq
- << '\n');
+ LLVM_DEBUG(dbgs() << "\n ** Results **\nFrequency of the Entry: " << EntryFreq
+ << '\n');
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
do {
- DEBUG(dbgs() << "Shrink wrap candidates (#, Name, Freq):\nSave: "
- << Save->getNumber() << ' ' << Save->getName() << ' '
- << MBFI->getBlockFreq(Save).getFrequency() << "\nRestore: "
- << Restore->getNumber() << ' ' << Restore->getName() << ' '
- << MBFI->getBlockFreq(Restore).getFrequency() << '\n');
+ LLVM_DEBUG(dbgs() << "Shrink wrap candidates (#, Name, Freq):\nSave: "
+ << Save->getNumber() << ' ' << Save->getName() << ' '
+ << MBFI->getBlockFreq(Save).getFrequency()
+ << "\nRestore: " << Restore->getNumber() << ' '
+ << Restore->getName() << ' '
+ << MBFI->getBlockFreq(Restore).getFrequency() << '\n');
bool IsSaveCheap, TargetCanUseSaveAsPrologue = false;
if (((IsSaveCheap = EntryFreq >= MBFI->getBlockFreq(Save).getFrequency()) &&
@@ -523,7 +546,8 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
((TargetCanUseSaveAsPrologue = TFI->canUseAsPrologue(*Save)) &&
TFI->canUseAsEpilogue(*Restore)))
break;
- DEBUG(dbgs() << "New points are too expensive or invalid for the target\n");
+ LLVM_DEBUG(
+ dbgs() << "New points are too expensive or invalid for the target\n");
MachineBasicBlock *NewBB;
if (!IsSaveCheap || !TargetCanUseSaveAsPrologue) {
Save = FindIDom<>(*Save, Save->predecessors(), *MDT);
@@ -545,9 +569,10 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
return false;
}
- DEBUG(dbgs() << "Final shrink wrap candidates:\nSave: " << Save->getNumber()
- << ' ' << Save->getName() << "\nRestore: "
- << Restore->getNumber() << ' ' << Restore->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Final shrink wrap candidates:\nSave: "
+ << Save->getNumber() << ' ' << Save->getName()
+ << "\nRestore: " << Restore->getNumber() << ' '
+ << Restore->getName() << '\n');
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setSavePoint(Save);
diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
index 17a3a84ecda5..5d2669f5ae92 100644
--- a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -27,7 +28,6 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "sjljehprepare"
@@ -64,7 +64,6 @@ public:
private:
bool setupEntryBlockAndCallSites(Function &F);
- bool undoSwiftErrorSelect(Function &F);
void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
void lowerIncomingArguments(Function &F);
@@ -233,6 +232,13 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
assert(AfterAllocaInsPt != F.front().end());
for (auto &AI : F.args()) {
+ // Swift error really is a register that we model as memory -- instruction
+ // selection will perform mem-to-reg for us and spill/reload appropriately
+ // around calls that clobber it. There is no need to spill this
+ // value to the stack and doing so would not be allowed.
+ if (AI.isSwiftError())
+ continue;
+
Type *Ty = AI.getType();
// Use 'select i8 true, %arg, undef' to simulate a 'no-op' instruction.
@@ -301,8 +307,8 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
for (InvokeInst *Invoke : Invokes) {
BasicBlock *UnwindBlock = Invoke->getUnwindDest();
if (UnwindBlock != &BB && LiveBBs.count(UnwindBlock)) {
- DEBUG(dbgs() << "SJLJ Spill: " << Inst << " around "
- << UnwindBlock->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SJLJ Spill: " << Inst << " around "
+ << UnwindBlock->getName() << "\n");
NeedsSpill = true;
break;
}
@@ -462,25 +468,6 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
return true;
}
-bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) {
- // We have inserted dummy copies 'select true, arg, undef' in the entry block
- // for arguments to simplify this pass.
- // swifterror arguments cannot be used in this way. Undo the select for the
- // swifterror argument.
- for (auto &AI : F.args()) {
- if (AI.isSwiftError()) {
- assert(AI.hasOneUse() && "Must have converted the argument to a select");
- auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser());
- assert(Select && "There must be single select user");
- auto *OrigSwiftError = cast<Argument>(Select->getTrueValue());
- Select->replaceAllUsesWith(OrigSwiftError);
- Select->eraseFromParent();
- return true;
- }
- }
- return false;
-}
-
bool SjLjEHPrepare::runOnFunction(Function &F) {
Module &M = *F.getParent();
RegisterFn = M.getOrInsertFunction(
@@ -499,7 +486,5 @@ bool SjLjEHPrepare::runOnFunction(Function &F) {
FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
bool Res = setupEntryBlockAndCallSites(F);
- if (Res)
- Res |= undoSwiftErrorSelect(F);
return Res;
}
diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
index ea74c777e1e2..ed74b3e4fa19 100644
--- a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -10,6 +10,7 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -73,7 +74,7 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
SlotIndex blockStartIndex(&indexList.back(), SlotIndex::Slot_Block);
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
// Insert a store index for the instr.
@@ -94,9 +95,9 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
}
// Sort the Idx2MBBMap
- std::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare());
+ llvm::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare());
- DEBUG(mf->print(dbgs(), this));
+ LLVM_DEBUG(mf->print(dbgs(), this));
// And we're done!
return false;
@@ -146,7 +147,7 @@ void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) {
void SlotIndexes::renumberIndexes() {
// Renumber updates the index of every element of the index list.
- DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n");
+ LLVM_DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n");
++NumGlobalRenum;
unsigned index = 0;
@@ -173,8 +174,8 @@ void SlotIndexes::renumberIndexes(IndexList::iterator curItr) {
// If the next index is bigger, we have caught up.
} while (curItr != indexList.end() && curItr->getIndex() <= index);
- DEBUG(dbgs() << "\n*** Renumbered SlotIndexes " << startItr->getIndex() << '-'
- << index << " ***\n");
+ LLVM_DEBUG(dbgs() << "\n*** Renumbered SlotIndexes " << startItr->getIndex()
+ << '-' << index << " ***\n");
++NumLocalRenum;
}
@@ -244,7 +245,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
MachineInstr &MI = *I;
- if (!MI.isDebugValue() && mi2iMap.find(&MI) == mi2iMap.end())
+ if (!MI.isDebugInstr() && mi2iMap.find(&MI) == mi2iMap.end())
insertMachineInstrInMaps(MI);
}
}
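
Besides the macro rename, SlotIndexes now skips every debug pseudo-instruction, not just DBG_VALUE, by testing isDebugInstr() instead of isDebugValue(), and its std::sort call moves to the llvm::sort wrapper. A toy illustration of the widened predicate, with a hypothetical Opcode enum standing in for the real machine opcodes (DBG_LABEL is one example of a debug pseudo other than DBG_VALUE):

#include <cassert>

// Hypothetical opcode set; the real list lives in the target-independent
// opcode definitions.
enum class Opcode { Add, Store, DbgValue, DbgLabel };

struct Instr {
  Opcode Op;
  bool isDebugValue() const { return Op == Opcode::DbgValue; }
  bool isDebugLabel() const { return Op == Opcode::DbgLabel; }
  // Widened predicate: any debug pseudo-instruction, not only DBG_VALUE.
  bool isDebugInstr() const { return isDebugValue() || isDebugLabel(); }
};

int main() {
  Instr Label{Opcode::DbgLabel};
  // Under the old isDebugValue() test this instruction would have received
  // a slot index; with isDebugInstr() it is skipped like other debug pseudos.
  assert(!Label.isDebugValue() && Label.isDebugInstr());
  return 0;
}
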
diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
index b989b54d4190..f6786b30b21c 100644
--- a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -246,7 +246,7 @@ void SpillPlacement::activate(unsigned n) {
}
}
-/// \brief Set the threshold for a given entry frequency.
+/// Set the threshold for a given entry frequency.
///
/// Set the threshold relative to \c Entry. Since the threshold is used as a
/// bound on the open interval (-Threshold;Threshold), 1 is the minimum
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp
index 1628ee28b8a3..d639f4475301 100644
--- a/contrib/llvm/lib/CodeGen/SplitKit.cpp
+++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp
@@ -39,6 +39,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/Allocator.h"
@@ -191,7 +192,7 @@ void SplitAnalysis::analyzeUses() {
// I am looking at you, RegisterCoalescer!
DidRepairRange = true;
++NumRepairs;
- DEBUG(dbgs() << "*** Fixing inconsistent live interval! ***\n");
+ LLVM_DEBUG(dbgs() << "*** Fixing inconsistent live interval! ***\n");
const_cast<LiveIntervals&>(LIS)
.shrinkToUses(const_cast<LiveInterval*>(CurLI));
UseBlocks.clear();
@@ -201,10 +202,9 @@ void SplitAnalysis::analyzeUses() {
assert(fixed && "Couldn't fix broken live interval");
}
- DEBUG(dbgs() << "Analyze counted "
- << UseSlots.size() << " instrs in "
- << UseBlocks.size() << " blocks, through "
- << NumThroughBlocks << " blocks.\n");
+ LLVM_DEBUG(dbgs() << "Analyze counted " << UseSlots.size() << " instrs in "
+ << UseBlocks.size() << " blocks, through "
+ << NumThroughBlocks << " blocks.\n");
}
/// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks
@@ -685,20 +685,20 @@ unsigned SplitEditor::openIntv() {
void SplitEditor::selectIntv(unsigned Idx) {
assert(Idx != 0 && "Cannot select the complement interval");
assert(Idx < Edit->size() && "Can only select previously opened interval");
- DEBUG(dbgs() << " selectIntv " << OpenIdx << " -> " << Idx << '\n');
+ LLVM_DEBUG(dbgs() << " selectIntv " << OpenIdx << " -> " << Idx << '\n');
OpenIdx = Idx;
}
SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) {
assert(OpenIdx && "openIntv not called before enterIntvBefore");
- DEBUG(dbgs() << " enterIntvBefore " << Idx);
+ LLVM_DEBUG(dbgs() << " enterIntvBefore " << Idx);
Idx = Idx.getBaseIndex();
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
if (!ParentVNI) {
- DEBUG(dbgs() << ": not live\n");
+ LLVM_DEBUG(dbgs() << ": not live\n");
return Idx;
}
- DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
assert(MI && "enterIntvBefore called with invalid index");
@@ -708,14 +708,14 @@ SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) {
SlotIndex SplitEditor::enterIntvAfter(SlotIndex Idx) {
assert(OpenIdx && "openIntv not called before enterIntvAfter");
- DEBUG(dbgs() << " enterIntvAfter " << Idx);
+ LLVM_DEBUG(dbgs() << " enterIntvAfter " << Idx);
Idx = Idx.getBoundaryIndex();
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
if (!ParentVNI) {
- DEBUG(dbgs() << ": not live\n");
+ LLVM_DEBUG(dbgs() << ": not live\n");
return Idx;
}
- DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
assert(MI && "enterIntvAfter called with invalid index");
@@ -728,18 +728,18 @@ SlotIndex SplitEditor::enterIntvAtEnd(MachineBasicBlock &MBB) {
assert(OpenIdx && "openIntv not called before enterIntvAtEnd");
SlotIndex End = LIS.getMBBEndIdx(&MBB);
SlotIndex Last = End.getPrevSlot();
- DEBUG(dbgs() << " enterIntvAtEnd " << printMBBReference(MBB) << ", "
- << Last);
+ LLVM_DEBUG(dbgs() << " enterIntvAtEnd " << printMBBReference(MBB) << ", "
+ << Last);
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Last);
if (!ParentVNI) {
- DEBUG(dbgs() << ": not live\n");
+ LLVM_DEBUG(dbgs() << ": not live\n");
return End;
}
- DEBUG(dbgs() << ": valno " << ParentVNI->id);
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id);
VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Last, MBB,
SA.getLastSplitPointIter(&MBB));
RegAssign.insert(VNI->def, End, OpenIdx);
- DEBUG(dump());
+ LLVM_DEBUG(dump());
return VNI->def;
}
@@ -750,23 +750,23 @@ void SplitEditor::useIntv(const MachineBasicBlock &MBB) {
void SplitEditor::useIntv(SlotIndex Start, SlotIndex End) {
assert(OpenIdx && "openIntv not called before useIntv");
- DEBUG(dbgs() << " useIntv [" << Start << ';' << End << "):");
+ LLVM_DEBUG(dbgs() << " useIntv [" << Start << ';' << End << "):");
RegAssign.insert(Start, End, OpenIdx);
- DEBUG(dump());
+ LLVM_DEBUG(dump());
}
SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
assert(OpenIdx && "openIntv not called before leaveIntvAfter");
- DEBUG(dbgs() << " leaveIntvAfter " << Idx);
+ LLVM_DEBUG(dbgs() << " leaveIntvAfter " << Idx);
// The interval must be live beyond the instruction at Idx.
SlotIndex Boundary = Idx.getBoundaryIndex();
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Boundary);
if (!ParentVNI) {
- DEBUG(dbgs() << ": not live\n");
+ LLVM_DEBUG(dbgs() << ": not live\n");
return Boundary.getNextSlot();
}
- DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
MachineInstr *MI = LIS.getInstructionFromIndex(Boundary);
assert(MI && "No instruction at index");
@@ -788,16 +788,16 @@ SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
SlotIndex SplitEditor::leaveIntvBefore(SlotIndex Idx) {
assert(OpenIdx && "openIntv not called before leaveIntvBefore");
- DEBUG(dbgs() << " leaveIntvBefore " << Idx);
+ LLVM_DEBUG(dbgs() << " leaveIntvBefore " << Idx);
// The interval must be live into the instruction at Idx.
Idx = Idx.getBaseIndex();
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
if (!ParentVNI) {
- DEBUG(dbgs() << ": not live\n");
+ LLVM_DEBUG(dbgs() << ": not live\n");
return Idx.getNextSlot();
}
- DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
assert(MI && "No instruction at index");
@@ -808,19 +808,19 @@ SlotIndex SplitEditor::leaveIntvBefore(SlotIndex Idx) {
SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
assert(OpenIdx && "openIntv not called before leaveIntvAtTop");
SlotIndex Start = LIS.getMBBStartIdx(&MBB);
- DEBUG(dbgs() << " leaveIntvAtTop " << printMBBReference(MBB) << ", "
- << Start);
+ LLVM_DEBUG(dbgs() << " leaveIntvAtTop " << printMBBReference(MBB) << ", "
+ << Start);
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
if (!ParentVNI) {
- DEBUG(dbgs() << ": not live\n");
+ LLVM_DEBUG(dbgs() << ": not live\n");
return Start;
}
VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
MBB.SkipPHIsLabelsAndDebug(MBB.begin()));
RegAssign.insert(Start, VNI->def, OpenIdx);
- DEBUG(dump());
+ LLVM_DEBUG(dump());
return VNI->def;
}
@@ -835,9 +835,9 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
// The complement interval will be extended as needed by LRCalc.extend().
if (ParentVNI)
forceRecompute(0, *ParentVNI);
- DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
+ LLVM_DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
RegAssign.insert(Start, End, OpenIdx);
- DEBUG(dump());
+ LLVM_DEBUG(dump());
}
//===----------------------------------------------------------------------===//
@@ -846,7 +846,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
LiveInterval *LI = &LIS.getInterval(Edit->get(0));
- DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
+ LLVM_DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
RegAssignMap::iterator AssignI;
AssignI.setMap(RegAssign);
@@ -859,9 +859,9 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
MachineBasicBlock::iterator MBBI(MI);
bool AtBegin;
do AtBegin = MBBI == MBB->begin();
- while (!AtBegin && (--MBBI)->isDebugValue());
+ while (!AtBegin && (--MBBI)->isDebugInstr());
- DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
LIS.removeVRegDefAt(*LI, Def);
LIS.RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
@@ -876,11 +876,12 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
continue;
unsigned RegIdx = AssignI.value();
if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
- DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
+ LLVM_DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx
+ << '\n');
forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
} else {
SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
- DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
+ LLVM_DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
AssignI.setStop(Kill);
}
}
@@ -907,15 +908,17 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
// MBB isn't in a loop, it doesn't get any better. All dominators have a
// higher frequency by definition.
if (!Loop) {
- DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB) << " dominates "
- << printMBBReference(*MBB) << " at depth 0\n");
+ LLVM_DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB)
+ << " dominates " << printMBBReference(*MBB)
+ << " at depth 0\n");
return MBB;
}
// We'll never be able to exit the DefLoop.
if (Loop == DefLoop) {
- DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB) << " dominates "
- << printMBBReference(*MBB) << " in the same loop\n");
+ LLVM_DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB)
+ << " dominates " << printMBBReference(*MBB)
+ << " in the same loop\n");
return MBB;
}
@@ -924,8 +927,9 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
if (Depth < BestDepth) {
BestMBB = MBB;
BestDepth = Depth;
- DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB) << " dominates "
- << printMBBReference(*MBB) << " at depth " << Depth << '\n');
+ LLVM_DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB)
+ << " dominates " << printMBBReference(*MBB)
+ << " at depth " << Depth << '\n');
}
// Leave loop by going to the immediate dominator of the loop header.
@@ -1031,14 +1035,14 @@ void SplitEditor::hoistCopies() {
// instruction in the complement range. All other copies of ParentVNI
// should be eliminated.
if (VNI->def == ParentVNI->def) {
- DEBUG(dbgs() << "Direct complement def at " << VNI->def << '\n');
+ LLVM_DEBUG(dbgs() << "Direct complement def at " << VNI->def << '\n');
Dom = DomPair(ValMBB, VNI->def);
continue;
}
// Skip the singly mapped values. There is nothing to gain from hoisting a
// single back-copy.
if (Values.lookup(std::make_pair(0, ParentVNI->id)).getPointer()) {
- DEBUG(dbgs() << "Single complement def at " << VNI->def << '\n');
+ LLVM_DEBUG(dbgs() << "Single complement def at " << VNI->def << '\n');
continue;
}
@@ -1062,10 +1066,11 @@ void SplitEditor::hoistCopies() {
Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB);
}
- DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
- << " for parent " << ParentVNI->id << '@' << ParentVNI->def
- << " hoist to " << printMBBReference(*Dom.first) << ' '
- << Dom.second << '\n');
+ LLVM_DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@'
+ << VNI->def << " for parent " << ParentVNI->id << '@'
+ << ParentVNI->def << " hoist to "
+ << printMBBReference(*Dom.first) << ' ' << Dom.second
+ << '\n');
}
// Insert the hoisted copies.
@@ -1118,7 +1123,7 @@ bool SplitEditor::transferValues() {
bool Skipped = false;
RegAssignMap::const_iterator AssignI = RegAssign.begin();
for (const LiveRange::Segment &S : Edit->getParent()) {
- DEBUG(dbgs() << " blit " << S << ':');
+ LLVM_DEBUG(dbgs() << " blit " << S << ':');
VNInfo *ParentVNI = S.valno;
// RegAssign has holes where RegIdx 0 should be used.
SlotIndex Start = S.start;
@@ -1140,14 +1145,14 @@ bool SplitEditor::transferValues() {
}
// The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
- DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx
- << '(' << printReg(Edit->get(RegIdx)) << ')');
+ LLVM_DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx << '('
+ << printReg(Edit->get(RegIdx)) << ')');
LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
// Check for a simply defined value that can be blitted directly.
ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
if (VNInfo *VNI = VFP.getPointer()) {
- DEBUG(dbgs() << ':' << VNI->id);
+ LLVM_DEBUG(dbgs() << ':' << VNI->id);
LI.addSegment(LiveInterval::Segment(Start, End, VNI));
Start = End;
continue;
@@ -1155,7 +1160,7 @@ bool SplitEditor::transferValues() {
// Skip values with forced recomputation.
if (VFP.getInt()) {
- DEBUG(dbgs() << "(recalc)");
+ LLVM_DEBUG(dbgs() << "(recalc)");
Skipped = true;
Start = End;
continue;
@@ -1174,7 +1179,7 @@ bool SplitEditor::transferValues() {
if (Start != BlockStart) {
VNInfo *VNI = LI.extendInBlock(BlockStart, std::min(BlockEnd, End));
assert(VNI && "Missing def for complex mapped value");
- DEBUG(dbgs() << ':' << VNI->id << "*" << printMBBReference(*MBB));
+ LLVM_DEBUG(dbgs() << ':' << VNI->id << "*" << printMBBReference(*MBB));
// MBB has its own def. Is it also live-out?
if (BlockEnd <= End)
LRC.setLiveOutValue(&*MBB, VNI);
@@ -1187,7 +1192,7 @@ bool SplitEditor::transferValues() {
// Handle the live-in blocks covered by [Start;End).
assert(Start <= BlockStart && "Expected live-in block");
while (BlockStart < End) {
- DEBUG(dbgs() << ">" << printMBBReference(*MBB));
+ LLVM_DEBUG(dbgs() << ">" << printMBBReference(*MBB));
BlockEnd = LIS.getMBBEndIdx(&*MBB);
if (BlockStart == ParentVNI->def) {
// This block has the def of a parent PHI, so it isn't live-in.
@@ -1212,7 +1217,7 @@ bool SplitEditor::transferValues() {
}
Start = End;
} while (Start != S.end);
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
LRCalc[0].calculateValues();
@@ -1314,7 +1319,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
++RI;
// LiveDebugVariables should have handled all DBG_VALUE instructions.
if (MI->isDebugValue()) {
- DEBUG(dbgs() << "Zapping " << *MI);
+ LLVM_DEBUG(dbgs() << "Zapping " << *MI);
MO.setReg(0);
continue;
}
@@ -1330,8 +1335,8 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
unsigned RegIdx = RegAssign.lookup(Idx);
LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
MO.setReg(LI.reg);
- DEBUG(dbgs() << " rewr " << printMBBReference(*MI->getParent()) << '\t'
- << Idx << ':' << RegIdx << '\t' << *MI);
+ LLVM_DEBUG(dbgs() << " rewr " << printMBBReference(*MI->getParent())
+ << '\t' << Idx << ':' << RegIdx << '\t' << *MI);
// Extend liveness to Idx if the instruction reads reg.
if (!ExtendRanges || MO.isUndef())
@@ -1416,7 +1421,7 @@ void SplitEditor::deleteRematVictims() {
if (!MI->allDefsAreDead())
continue;
- DEBUG(dbgs() << "All defs dead: " << *MI);
+ LLVM_DEBUG(dbgs() << "All defs dead: " << *MI);
Dead.push_back(MI);
}
}
@@ -1598,9 +1603,9 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
SlotIndex Start, Stop;
std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(MBBNum);
- DEBUG(dbgs() << "%bb." << MBBNum << " [" << Start << ';' << Stop << ") intf "
- << LeaveBefore << '-' << EnterAfter << ", live-through "
- << IntvIn << " -> " << IntvOut);
+ LLVM_DEBUG(dbgs() << "%bb." << MBBNum << " [" << Start << ';' << Stop
+ << ") intf " << LeaveBefore << '-' << EnterAfter
+ << ", live-through " << IntvIn << " -> " << IntvOut);
assert((IntvIn || IntvOut) && "Use splitSingleBlock for isolated blocks");
@@ -1611,7 +1616,7 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);
if (!IntvOut) {
- DEBUG(dbgs() << ", spill on entry.\n");
+ LLVM_DEBUG(dbgs() << ", spill on entry.\n");
//
// <<<<<<<<< Possible LeaveBefore interference.
// |-----------| Live through.
@@ -1625,7 +1630,7 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
}
if (!IntvIn) {
- DEBUG(dbgs() << ", reload on exit.\n");
+ LLVM_DEBUG(dbgs() << ", reload on exit.\n");
//
// >>>>>>> Possible EnterAfter interference.
// |-----------| Live through.
@@ -1639,7 +1644,7 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
}
if (IntvIn == IntvOut && !LeaveBefore && !EnterAfter) {
- DEBUG(dbgs() << ", straight through.\n");
+ LLVM_DEBUG(dbgs() << ", straight through.\n");
//
// |-----------| Live through.
// ------------- Straight through, same intv, no interference.
@@ -1655,7 +1660,7 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
if (IntvIn != IntvOut && (!LeaveBefore || !EnterAfter ||
LeaveBefore.getBaseIndex() > EnterAfter.getBoundaryIndex())) {
- DEBUG(dbgs() << ", switch avoiding interference.\n");
+ LLVM_DEBUG(dbgs() << ", switch avoiding interference.\n");
//
// >>>> <<<< Non-overlapping EnterAfter/LeaveBefore interference.
// |-----------| Live through.
@@ -1676,7 +1681,7 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
return;
}
- DEBUG(dbgs() << ", create local intv for interference.\n");
+ LLVM_DEBUG(dbgs() << ", create local intv for interference.\n");
//
// >>><><><><<<< Overlapping EnterAfter/LeaveBefore interference.
// |-----------| Live through.
@@ -1700,17 +1705,18 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
SlotIndex Start, Stop;
std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
- DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';' << Stop
- << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
- << ", reg-in " << IntvIn << ", leave before " << LeaveBefore
- << (BI.LiveOut ? ", stack-out" : ", killed in block"));
+ LLVM_DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';'
+ << Stop << "), uses " << BI.FirstInstr << '-'
+ << BI.LastInstr << ", reg-in " << IntvIn
+ << ", leave before " << LeaveBefore
+ << (BI.LiveOut ? ", stack-out" : ", killed in block"));
assert(IntvIn && "Must have register in");
assert(BI.LiveIn && "Must be live-in");
assert((!LeaveBefore || LeaveBefore > Start) && "Bad interference");
if (!BI.LiveOut && (!LeaveBefore || LeaveBefore >= BI.LastInstr)) {
- DEBUG(dbgs() << " before interference.\n");
+ LLVM_DEBUG(dbgs() << " before interference.\n");
//
// <<< Interference after kill.
// |---o---x | Killed in block.
@@ -1735,13 +1741,13 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
// \_____ Stack interval is live-out.
//
if (BI.LastInstr < LSP) {
- DEBUG(dbgs() << ", spill after last use before interference.\n");
+ LLVM_DEBUG(dbgs() << ", spill after last use before interference.\n");
selectIntv(IntvIn);
SlotIndex Idx = leaveIntvAfter(BI.LastInstr);
useIntv(Start, Idx);
assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
} else {
- DEBUG(dbgs() << ", spill before last split point.\n");
+ LLVM_DEBUG(dbgs() << ", spill before last split point.\n");
selectIntv(IntvIn);
SlotIndex Idx = leaveIntvBefore(LSP);
overlapIntv(Idx, BI.LastInstr);
@@ -1756,7 +1762,7 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
// different register.
unsigned LocalIntv = openIntv();
(void)LocalIntv;
- DEBUG(dbgs() << ", creating local interval " << LocalIntv << ".\n");
+ LLVM_DEBUG(dbgs() << ", creating local interval " << LocalIntv << ".\n");
if (!BI.LiveOut || BI.LastInstr < LSP) {
//
@@ -1792,10 +1798,11 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
SlotIndex Start, Stop;
std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
- DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';' << Stop
- << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
- << ", reg-out " << IntvOut << ", enter after " << EnterAfter
- << (BI.LiveIn ? ", stack-in" : ", defined in block"));
+ LLVM_DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';'
+ << Stop << "), uses " << BI.FirstInstr << '-'
+ << BI.LastInstr << ", reg-out " << IntvOut
+ << ", enter after " << EnterAfter
+ << (BI.LiveIn ? ", stack-in" : ", defined in block"));
SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());
@@ -1804,7 +1811,7 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
assert((!EnterAfter || EnterAfter < LSP) && "Bad interference");
if (!BI.LiveIn && (!EnterAfter || EnterAfter <= BI.FirstInstr)) {
- DEBUG(dbgs() << " after interference.\n");
+ LLVM_DEBUG(dbgs() << " after interference.\n");
//
// >>>> Interference before def.
// | o---o---| Defined in block.
@@ -1816,7 +1823,7 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
}
if (!EnterAfter || EnterAfter < BI.FirstInstr.getBaseIndex()) {
- DEBUG(dbgs() << ", reload after interference.\n");
+ LLVM_DEBUG(dbgs() << ", reload after interference.\n");
//
// >>>> Interference before def.
// |---o---o---| Live-through, stack-in.
@@ -1832,7 +1839,7 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
// The interference is overlapping somewhere we wanted to use IntvOut. That
// means we need to create a local interval that can be allocated a
// different register.
- DEBUG(dbgs() << ", interference overlaps uses.\n");
+ LLVM_DEBUG(dbgs() << ", interference overlaps uses.\n");
//
// >>>>>>> Interference overlapping uses.
// |---o---o---| Live-through, stack-in.
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.h b/contrib/llvm/lib/CodeGen/SplitKit.h
index 2dafaf587801..ed664e4f81a3 100644
--- a/contrib/llvm/lib/CodeGen/SplitKit.h
+++ b/contrib/llvm/lib/CodeGen/SplitKit.h
@@ -421,7 +421,7 @@ private:
SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
- unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy);
+ unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def);
public:
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp
index 608845498b48..81a41970f9e2 100644
--- a/contrib/llvm/lib/CodeGen/StackColoring.cpp
+++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp
@@ -39,9 +39,9 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
@@ -422,9 +422,6 @@ class StackColoring : public MachineFunctionPass {
/// SlotIndex analysis object.
SlotIndexes *Indexes;
- /// The stack protector object.
- StackProtector *SP;
-
/// The list of lifetime markers found. These markers are to be removed
/// once the coloring is done.
SmallVector<MachineInstr*, 8> Markers;
@@ -448,7 +445,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &Func) override;
private:
/// Used in collectMarkers
@@ -523,13 +520,11 @@ char &llvm::StackColoringID = StackColoring::ID;
INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE,
"Merge disjoint stack slots", false, false)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_DEPENDENCY(StackProtector)
INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE,
"Merge disjoint stack slots", false, false)
void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<SlotIndexes>();
- AU.addRequired<StackProtector>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -600,12 +595,12 @@ bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI,
isStart = false;
return true;
}
- if (! applyFirstUse(Slot)) {
+ if (!applyFirstUse(Slot)) {
isStart = true;
return true;
}
} else if (LifetimeStartOnFirstUse && !ProtectFromEscapedAllocas) {
- if (! MI.isDebugValue()) {
+ if (!MI.isDebugInstr()) {
bool found = false;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isFI())
@@ -672,13 +667,13 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
}
const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
if (Allocation) {
- DEBUG(dbgs() << "Found a lifetime ");
- DEBUG(dbgs() << (MI.getOpcode() == TargetOpcode::LIFETIME_START
- ? "start"
- : "end"));
- DEBUG(dbgs() << " marker for slot #" << Slot);
- DEBUG(dbgs() << " with allocation: " << Allocation->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a lifetime ");
+ LLVM_DEBUG(dbgs() << (MI.getOpcode() == TargetOpcode::LIFETIME_START
+ ? "start"
+ : "end"));
+ LLVM_DEBUG(dbgs() << " marker for slot #" << Slot);
+ LLVM_DEBUG(dbgs()
+ << " with allocation: " << Allocation->getName() << "\n");
}
Markers.push_back(&MI);
MarkersFound += 1;
@@ -707,7 +702,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
for (unsigned slot = 0; slot < NumSlot; ++slot)
if (NumStartLifetimes[slot] > 1 || NumEndLifetimes[slot] > 1)
ConservativeSlots.set(slot);
- DEBUG(dumpBV("Conservative slots", ConservativeSlots));
+ LLVM_DEBUG(dumpBV("Conservative slots", ConservativeSlots));
// Step 2: compute begin/end sets for each block
@@ -738,14 +733,16 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
BlockInfo.End.set(Slot);
} else {
for (auto Slot : slots) {
- DEBUG(dbgs() << "Found a use of slot #" << Slot);
- DEBUG(dbgs() << " at " << printMBBReference(*MBB) << " index ");
- DEBUG(Indexes->getInstructionIndex(MI).print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Found a use of slot #" << Slot);
+ LLVM_DEBUG(dbgs()
+ << " at " << printMBBReference(*MBB) << " index ");
+ LLVM_DEBUG(Indexes->getInstructionIndex(MI).print(dbgs()));
const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
if (Allocation) {
- DEBUG(dbgs() << " with allocation: "<< Allocation->getName());
+ LLVM_DEBUG(dbgs()
+ << " with allocation: " << Allocation->getName());
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
if (BlockInfo.End.test(Slot)) {
BlockInfo.End.reset(Slot);
}
@@ -779,8 +776,11 @@ void StackColoring::calculateLocalLiveness() {
for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(),
PE = BB->pred_end(); PI != PE; ++PI) {
LivenessMap::const_iterator I = BlockLiveness.find(*PI);
- assert(I != BlockLiveness.end() && "Predecessor not found");
- LocalLiveIn |= I->second.LiveOut;
+ // PR37130: transformations prior to stack coloring can
+ // sometimes leave behind statically unreachable blocks; these
+ // can be safely skipped here.
+ if (I != BlockLiveness.end())
+ LocalLiveIn |= I->second.LiveOut;
}
// Compute LiveOut by subtracting out lifetimes that end in this
@@ -880,7 +880,7 @@ bool StackColoring::removeAllMarkers() {
}
Markers.clear();
- DEBUG(dbgs()<<"Removed "<<Count<<" markers.\n");
+ LLVM_DEBUG(dbgs() << "Removed " << Count << " markers.\n");
return Count;
}
@@ -894,8 +894,8 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
if (!VI.Var)
continue;
if (SlotRemap.count(VI.Slot)) {
- DEBUG(dbgs() << "Remapping debug info for ["
- << cast<DILocalVariable>(VI.Var)->getName() << "].\n");
+ LLVM_DEBUG(dbgs() << "Remapping debug info for ["
+ << cast<DILocalVariable>(VI.Var)->getName() << "].\n");
VI.Slot = SlotRemap[VI.Slot];
FixedDbg++;
}
@@ -930,9 +930,17 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
MergedAllocas.insert(From);
MergedAllocas.insert(To);
- // Allow the stack protector to adjust its value map to account for the
- // upcoming replacement.
- SP->adjustForColoring(From, To);
+ // Transfer the stack protector layout tag, but make sure that SSPLK_AddrOf
+ // does not overwrite SSPLK_SmallArray or SSPLK_LargeArray, and make sure
+ // that SSPLK_SmallArray does not overwrite SSPLK_LargeArray.
+ MachineFrameInfo::SSPLayoutKind FromKind
+ = MFI->getObjectSSPLayout(SI.first);
+ MachineFrameInfo::SSPLayoutKind ToKind = MFI->getObjectSSPLayout(SI.second);
+ if (FromKind != MachineFrameInfo::SSPLK_None &&
+ (ToKind == MachineFrameInfo::SSPLK_None ||
+ (ToKind != MachineFrameInfo::SSPLK_LargeArray &&
+ FromKind != MachineFrameInfo::SSPLK_AddrOf)))
+ MFI->setObjectSSPLayout(SI.second, FromKind);
// The new alloca might not be valid in a llvm.dbg.declare for this
// variable, so undef out the use to make the verifier happy.
@@ -993,13 +1001,13 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
// the calculated range then it means that the alloca usage moved
// outside of the lifetime markers, or that the user has a bug.
// NOTE: Alloca address calculations which happen outside the lifetime
- // zone are are okay, despite the fact that we don't have a good way
+ // zone are okay, despite the fact that we don't have a good way
// for validating all of the usages of the calculation.
#ifndef NDEBUG
bool TouchesMemory = I.mayLoad() || I.mayStore();
// If we *don't* protect the user from escaped allocas, don't bother
// validating the instructions.
- if (!I.isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) {
+ if (!I.isDebugInstr() && TouchesMemory && ProtectFromEscapedAllocas) {
SlotIndex Index = Indexes->getInstructionIndex(I);
const LiveInterval *Interval = &*Intervals[FromSlot];
assert(Interval->find(Index) != Interval->end() &&
@@ -1064,16 +1072,16 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
SlotRemap.count(H.CatchObj.FrameIndex))
H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex];
- DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n");
- DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n");
- DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n");
+ LLVM_DEBUG(dbgs() << "Fixed " << FixedMemOp << " machine memory operands.\n");
+ LLVM_DEBUG(dbgs() << "Fixed " << FixedDbg << " debug locations.\n");
+ LLVM_DEBUG(dbgs() << "Fixed " << FixedInstr << " machine instructions.\n");
}
void StackColoring::removeInvalidSlotRanges() {
for (MachineBasicBlock &BB : *MF)
for (MachineInstr &I : BB) {
if (I.getOpcode() == TargetOpcode::LIFETIME_START ||
- I.getOpcode() == TargetOpcode::LIFETIME_END || I.isDebugValue())
+ I.getOpcode() == TargetOpcode::LIFETIME_END || I.isDebugInstr())
continue;
// Some intervals are suspicious! In some cases we find address
@@ -1104,7 +1112,7 @@ void StackColoring::removeInvalidSlotRanges() {
SlotIndex Index = Indexes->getInstructionIndex(I);
if (Interval->find(Index) == Interval->end()) {
Interval->clear();
- DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n");
+ LLVM_DEBUG(dbgs() << "Invalidating range #" << Slot << "\n");
EscapedAllocas++;
}
}
@@ -1128,12 +1136,11 @@ void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
}
bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
- DEBUG(dbgs() << "********** Stack Coloring **********\n"
- << "********** Function: " << Func.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Stack Coloring **********\n"
+ << "********** Function: " << Func.getName() << '\n');
MF = &Func;
MFI = &MF->getFrameInfo();
Indexes = &getAnalysis<SlotIndexes>();
- SP = &getAnalysis<StackProtector>();
BlockLiveness.clear();
BasicBlocks.clear();
BasicBlockNumbering.clear();
@@ -1156,21 +1163,23 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
unsigned NumMarkers = collectMarkers(NumSlots);
unsigned TotalSize = 0;
- DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n");
- DEBUG(dbgs()<<"Slot structure:\n");
+ LLVM_DEBUG(dbgs() << "Found " << NumMarkers << " markers and " << NumSlots
+ << " slots\n");
+ LLVM_DEBUG(dbgs() << "Slot structure:\n");
for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
- DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n");
+ LLVM_DEBUG(dbgs() << "Slot #" << i << " - " << MFI->getObjectSize(i)
+ << " bytes.\n");
TotalSize += MFI->getObjectSize(i);
}
- DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");
+ LLVM_DEBUG(dbgs() << "Total Stack size: " << TotalSize << " bytes\n\n");
// Don't continue because there are not enough lifetime markers, or the
// stack is too small, or we are told not to optimize the slots.
if (NumMarkers < 2 || TotalSize < 16 || DisableColoring ||
skipFunction(Func.getFunction())) {
- DEBUG(dbgs()<<"Will not try to merge slots.\n");
+ LLVM_DEBUG(dbgs() << "Will not try to merge slots.\n");
return removeAllMarkers();
}
@@ -1183,12 +1192,12 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
// Calculate the liveness of each block.
calculateLocalLiveness();
- DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n");
- DEBUG(dump());
+ LLVM_DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n");
+ LLVM_DEBUG(dump());
// Propagate the liveness information.
calculateLiveIntervals(NumSlots);
- DEBUG(dumpIntervals());
+ LLVM_DEBUG(dumpIntervals());
// Search for allocas which are used outside of the declared lifetime
// markers.
@@ -1224,7 +1233,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
});
for (auto &s : LiveStarts)
- std::sort(s.begin(), s.end());
+ llvm::sort(s.begin(), s.end());
bool Changed = true;
while (Changed) {
@@ -1259,8 +1268,8 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
SlotRemap[SecondSlot] = FirstSlot;
SortedSlots[J] = -1;
- DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
- SecondSlot<<" together.\n");
+ LLVM_DEBUG(dbgs() << "Merging #" << FirstSlot << " and slots #"
+ << SecondSlot << " together.\n");
unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
MFI->getObjectAlignment(SecondSlot));
@@ -1280,8 +1289,8 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
// Record statistics.
StackSpaceSaved += ReducedSize;
StackSlotMerged += RemovedSlots;
- DEBUG(dbgs()<<"Merge "<<RemovedSlots<<" slots. Saved "<<
- ReducedSize<<" bytes\n");
+ LLVM_DEBUG(dbgs() << "Merge " << RemovedSlots << " slots. Saved "
+ << ReducedSize << " bytes\n");
// Scan the entire function and update all machine operands that use frame
// indices to use the remapped frame index.
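
StackColoring no longer calls back into the StackProtector pass when it merges slots; the remapInstructions hunk instead transfers the frame object's SSP layout tag directly, taking care never to downgrade an array tag. A small self-contained sketch of that precedence rule, with a plain SSPKind enum standing in for MachineFrameInfo::SSPLayoutKind:

#include <cassert>

// Illustrative stand-in for MachineFrameInfo::SSPLayoutKind, listed in
// increasing "strength": None < AddrOf < SmallArray < LargeArray.
enum class SSPKind { None, AddrOf, SmallArray, LargeArray };

// Merge rule applied when slot From is folded into slot To: keep the stronger
// tag, so an address-taken tag never overwrites an array tag and a small
// array never overwrites a large array.
SSPKind mergeSSPLayout(SSPKind FromKind, SSPKind ToKind) {
  if (FromKind != SSPKind::None &&
      (ToKind == SSPKind::None ||
       (ToKind != SSPKind::LargeArray && FromKind != SSPKind::AddrOf)))
    return FromKind; // the transferred tag wins
  return ToKind;     // otherwise the destination keeps its tag
}

int main() {
  assert(mergeSSPLayout(SSPKind::AddrOf, SSPKind::SmallArray) ==
         SSPKind::SmallArray);
  assert(mergeSSPLayout(SSPKind::LargeArray, SSPKind::SmallArray) ==
         SSPKind::LargeArray);
  assert(mergeSSPLayout(SSPKind::SmallArray, SSPKind::None) ==
         SSPKind::SmallArray);
  return 0;
}
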
diff --git a/contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
index cc9af92c395f..00cf8070be5e 100644
--- a/contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -39,7 +39,7 @@ STATISTIC(NumBBsHaveNoStackmap, "Number of basic blocks with no stackmap");
STATISTIC(NumStackMaps, "Number of StackMaps visited");
namespace {
-/// \brief This pass calculates the liveness information for each basic block in
+/// This pass calculates the liveness information for each basic block in
/// a function and attaches the register live-out information to a patchpoint
/// intrinsic if present.
///
@@ -54,10 +54,10 @@ class StackMapLiveness : public MachineFunctionPass {
public:
static char ID;
- /// \brief Default construct and initialize the pass.
+ /// Default construct and initialize the pass.
StackMapLiveness();
- /// \brief Tell the pass manager which passes we depend on and what
+ /// Tell the pass manager which passes we depend on and what
/// information we preserve.
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -66,17 +66,17 @@ public:
MachineFunctionProperties::Property::NoVRegs);
}
- /// \brief Calculate the liveness information for the given machine function.
+ /// Calculate the liveness information for the given machine function.
bool runOnMachineFunction(MachineFunction &MF) override;
private:
- /// \brief Performs the actual liveness calculation for the function.
+ /// Performs the actual liveness calculation for the function.
bool calculateLiveness(MachineFunction &MF);
- /// \brief Add the current register live set to the instruction.
+ /// Add the current register live set to the instruction.
void addLiveOutSetToMI(MachineFunction &MF, MachineInstr &MI);
- /// \brief Create a register mask and initialize it with the registers from
+ /// Create a register mask and initialize it with the registers from
/// the register live set.
uint32_t *createRegisterMask(MachineFunction &MF) const;
};
@@ -106,8 +106,8 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
if (!EnablePatchPointLiveness)
return false;
- DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: " << MF.getName()
- << " **********\n");
+ LLVM_DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
+ << MF.getName() << " **********\n");
TRI = MF.getSubtarget().getRegisterInfo();
++NumStackMapFuncVisited;
@@ -124,7 +124,7 @@ bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
bool HasChanged = false;
// For all basic blocks in the function.
for (auto &MBB : MF) {
- DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
+ LLVM_DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
LiveRegs.init(*TRI);
// FIXME: This should probably be addLiveOuts().
LiveRegs.addLiveOutsNoPristines(MBB);
@@ -138,7 +138,7 @@ bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
HasStackMap = true;
++NumStackMaps;
}
- DEBUG(dbgs() << " " << LiveRegs << " " << *I);
+ LLVM_DEBUG(dbgs() << " " << LiveRegs << " " << *I);
LiveRegs.stepBackward(*I);
}
++NumBBsVisited;
@@ -160,7 +160,7 @@ void StackMapLiveness::addLiveOutSetToMI(MachineFunction &MF,
/// register live set.
uint32_t *StackMapLiveness::createRegisterMask(MachineFunction &MF) const {
// The mask is owned and cleaned up by the Machine Function.
- uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ uint32_t *Mask = MF.allocateRegMask();
for (auto Reg : LiveRegs)
Mask[Reg / 32] |= 1U << (Reg % 32);
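
The only functional change in StackMapLivenessAnalysis is switching to the allocateRegMask() helper; the mask layout itself is unchanged, one bit per physical register packed into 32-bit words. A stand-alone sketch of that packing and the matching lookup, using arbitrary register numbers purely for illustration:

#include <cassert>
#include <cstdint>
#include <vector>

// Register R lives in word R / 32 at bit position R % 32.
std::vector<uint32_t> makeRegisterMask(const std::vector<unsigned> &LiveRegs,
                                       unsigned NumRegs) {
  std::vector<uint32_t> Mask((NumRegs + 31) / 32, 0);
  for (unsigned Reg : LiveRegs)
    Mask[Reg / 32] |= 1U << (Reg % 32);
  return Mask;
}

bool isLive(const std::vector<uint32_t> &Mask, unsigned Reg) {
  return (Mask[Reg / 32] >> (Reg % 32)) & 1U;
}

int main() {
  auto Mask = makeRegisterMask({3, 37}, 64); // two live registers, 64 total
  assert(isLive(Mask, 3) && isLive(Mask, 37) && !isLive(Mask, 4));
  return 0;
}
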
diff --git a/contrib/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm/lib/CodeGen/StackMaps.cpp
index e66a25bec911..19a191c01db9 100644
--- a/contrib/llvm/lib/CodeGen/StackMaps.cpp
+++ b/contrib/llvm/lib/CodeGen/StackMaps.cpp
@@ -268,11 +268,11 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
// in the list. Merge entries that refer to the same dwarf register and use
// the maximum size that needs to be spilled.
- std::sort(LiveOuts.begin(), LiveOuts.end(),
- [](const LiveOutReg &LHS, const LiveOutReg &RHS) {
- // Only sort by the dwarf register number.
- return LHS.DwarfRegNum < RHS.DwarfRegNum;
- });
+ llvm::sort(LiveOuts.begin(), LiveOuts.end(),
+ [](const LiveOutReg &LHS, const LiveOutReg &RHS) {
+ // Only sort by the dwarf register number.
+ return LHS.DwarfRegNum < RHS.DwarfRegNum;
+ });
for (auto I = LiveOuts.begin(), E = LiveOuts.end(); I != E; ++I) {
for (auto II = std::next(I); II != E; ++II) {
@@ -420,13 +420,13 @@ void StackMaps::emitStackmapHeader(MCStreamer &OS) {
OS.EmitIntValue(0, 2); // Reserved.
// Num functions.
- DEBUG(dbgs() << WSMP << "#functions = " << FnInfos.size() << '\n');
+ LLVM_DEBUG(dbgs() << WSMP << "#functions = " << FnInfos.size() << '\n');
OS.EmitIntValue(FnInfos.size(), 4);
// Num constants.
- DEBUG(dbgs() << WSMP << "#constants = " << ConstPool.size() << '\n');
+ LLVM_DEBUG(dbgs() << WSMP << "#constants = " << ConstPool.size() << '\n');
OS.EmitIntValue(ConstPool.size(), 4);
// Num callsites.
- DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << '\n');
+ LLVM_DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << '\n');
OS.EmitIntValue(CSInfos.size(), 4);
}
@@ -439,11 +439,11 @@ void StackMaps::emitStackmapHeader(MCStreamer &OS) {
/// }
void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
// Function Frame records.
- DEBUG(dbgs() << WSMP << "functions:\n");
+ LLVM_DEBUG(dbgs() << WSMP << "functions:\n");
for (auto const &FR : FnInfos) {
- DEBUG(dbgs() << WSMP << "function addr: " << FR.first
- << " frame size: " << FR.second.StackSize
- << " callsite count: " << FR.second.RecordCount << '\n');
+ LLVM_DEBUG(dbgs() << WSMP << "function addr: " << FR.first
+ << " frame size: " << FR.second.StackSize
+ << " callsite count: " << FR.second.RecordCount << '\n');
OS.EmitSymbolValue(FR.first, 8);
OS.EmitIntValue(FR.second.StackSize, 8);
OS.EmitIntValue(FR.second.RecordCount, 8);
@@ -455,9 +455,9 @@ void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
/// int64 : Constants[NumConstants]
void StackMaps::emitConstantPoolEntries(MCStreamer &OS) {
// Constant pool entries.
- DEBUG(dbgs() << WSMP << "constants:\n");
+ LLVM_DEBUG(dbgs() << WSMP << "constants:\n");
for (const auto &ConstEntry : ConstPool) {
- DEBUG(dbgs() << WSMP << ConstEntry.second << '\n');
+ LLVM_DEBUG(dbgs() << WSMP << ConstEntry.second << '\n');
OS.EmitIntValue(ConstEntry.second, 8);
}
}
@@ -492,7 +492,7 @@ void StackMaps::emitConstantPoolEntries(MCStreamer &OS) {
/// 0x4, Constant, Offset (small constant)
/// 0x5, ConstIndex, Constants[Offset] (large constant)
void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
- DEBUG(print(dbgs()));
+ LLVM_DEBUG(print(dbgs()));
// Callsite entries.
for (const auto &CSI : CSInfos) {
const LocationVec &CSLocs = CSI.Locations;
@@ -569,7 +569,7 @@ void StackMaps::serializeToStackMapSection() {
OS.EmitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_StackMaps")));
// Serialize data.
- DEBUG(dbgs() << "********** Stack Map Output **********\n");
+ LLVM_DEBUG(dbgs() << "********** Stack Map Output **********\n");
emitStackmapHeader(OS);
emitFunctionFrameRecords(OS);
emitConstantPoolEntries(OS);
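
StackMaps switches its live-out sort to llvm::sort, a thin wrapper over std::sort whose benefit shows up in expensive-checks builds, where the range can be shuffled first to expose comparators that are not strict weak orderings. A simplified sketch of the sort-then-merge shape used on the live-out registers, with a toy LiveOutReg type and plain std::sort:

#include <algorithm>
#include <cassert>
#include <vector>

struct LiveOutReg {
  unsigned DwarfRegNum; // DWARF register number used as the sort key
  unsigned Size;        // bytes that would need to be spilled
};

// Sort by DWARF register number only, then fold duplicates while keeping the
// largest spill size, mirroring the post-processing that follows the hunk.
std::vector<LiveOutReg> mergeLiveOuts(std::vector<LiveOutReg> LiveOuts) {
  std::sort(LiveOuts.begin(), LiveOuts.end(),
            [](const LiveOutReg &LHS, const LiveOutReg &RHS) {
              return LHS.DwarfRegNum < RHS.DwarfRegNum;
            });
  std::vector<LiveOutReg> Merged;
  for (const LiveOutReg &LO : LiveOuts) {
    if (!Merged.empty() && Merged.back().DwarfRegNum == LO.DwarfRegNum)
      Merged.back().Size = std::max(Merged.back().Size, LO.Size);
    else
      Merged.push_back(LO);
  }
  return Merged;
}

int main() {
  auto Out = mergeLiveOuts({{7, 4}, {3, 8}, {7, 8}});
  assert(Out.size() == 2 && Out[1].DwarfRegNum == 7 && Out[1].Size == 8);
  return 0;
}
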
diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp
index 62cef95a4af2..cb12c7ce6e82 100644
--- a/contrib/llvm/lib/CodeGen/StackProtector.cpp
+++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp
@@ -36,6 +36,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
@@ -69,32 +70,6 @@ INITIALIZE_PASS_END(StackProtector, DEBUG_TYPE,
FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); }
-StackProtector::SSPLayoutKind
-StackProtector::getSSPLayout(const AllocaInst *AI) const {
- return AI ? Layout.lookup(AI) : SSPLK_None;
-}
-
-void StackProtector::adjustForColoring(const AllocaInst *From,
- const AllocaInst *To) {
- // When coloring replaces one alloca with another, transfer the SSPLayoutKind
- // tag from the remapped to the target alloca. The remapped alloca should
- // have a size smaller than or equal to the replacement alloca.
- SSPLayoutMap::iterator I = Layout.find(From);
- if (I != Layout.end()) {
- SSPLayoutKind Kind = I->second;
- Layout.erase(I);
-
- // Transfer the tag, but make sure that SSPLK_AddrOf does not overwrite
- // SSPLK_SmallArray or SSPLK_LargeArray, and make sure that
- // SSPLK_SmallArray does not overwrite SSPLK_LargeArray.
- I = Layout.find(To);
- if (I == Layout.end())
- Layout.insert(std::make_pair(To, Kind));
- else if (I->second != SSPLK_LargeArray && Kind != SSPLK_AddrOf)
- I->second = Kind;
- }
-}
-
void StackProtector::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
AU.addPreserved<DominatorTreeWrapperPass>();
@@ -182,6 +157,14 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
return NeedsProtector;
}
+static bool isLifetimeInst(const Instruction *I) {
+ if (const auto Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+ const auto Id = Intrinsic->getIntrinsicID();
+ return Id == Intrinsic::lifetime_start || Id == Intrinsic::lifetime_end;
+ }
+ return false;
+}
+
bool StackProtector::HasAddressTaken(const Instruction *AI) {
for (const User *U : AI->users()) {
if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
@@ -190,8 +173,10 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
} else if (const PtrToIntInst *SI = dyn_cast<PtrToIntInst>(U)) {
if (AI == SI->getOperand(0))
return true;
- } else if (isa<CallInst>(U)) {
- return true;
+ } else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
+ // Ignore intrinsics that are not calls. TODO: Use isLoweredToCall().
+ if (!isa<DbgInfoIntrinsic>(CI) && !isLifetimeInst(CI))
+ return true;
} else if (isa<InvokeInst>(U)) {
return true;
} else if (const SelectInst *SI = dyn_cast<SelectInst>(U)) {
@@ -214,7 +199,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
return false;
}
-/// \brief Check whether or not this function needs a stack protector based
+/// Check whether or not this function needs a stack protector based
/// upon the stack protector level.
///
/// We use two heuristics: a standard (ssp) and strong (sspstrong).
@@ -278,18 +263,21 @@ bool StackProtector::RequiresStackProtector() {
if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
// A call to alloca with size >= SSPBufferSize requires
// stack protectors.
- Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ Layout.insert(std::make_pair(AI,
+ MachineFrameInfo::SSPLK_LargeArray));
ORE.emit(RemarkBuilder);
NeedsProtector = true;
} else if (Strong) {
// Require protectors for all alloca calls in strong mode.
- Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+ Layout.insert(std::make_pair(AI,
+ MachineFrameInfo::SSPLK_SmallArray));
ORE.emit(RemarkBuilder);
NeedsProtector = true;
}
} else {
// A call to alloca with a variable size requires protectors.
- Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ Layout.insert(std::make_pair(AI,
+ MachineFrameInfo::SSPLK_LargeArray));
ORE.emit(RemarkBuilder);
NeedsProtector = true;
}
@@ -298,8 +286,9 @@ bool StackProtector::RequiresStackProtector() {
bool IsLarge = false;
if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
- Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
- : SSPLK_SmallArray));
+ Layout.insert(std::make_pair(AI, IsLarge
+ ? MachineFrameInfo::SSPLK_LargeArray
+ : MachineFrameInfo::SSPLK_SmallArray));
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I)
<< "Stack protection applied to function "
@@ -313,7 +302,7 @@ bool StackProtector::RequiresStackProtector() {
if (Strong && HasAddressTaken(AI)) {
++NumAddrTaken;
- Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+ Layout.insert(std::make_pair(AI, MachineFrameInfo::SSPLK_AddrOf));
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken",
&I)
@@ -523,3 +512,23 @@ BasicBlock *StackProtector::CreateFailBB() {
bool StackProtector::shouldEmitSDCheck(const BasicBlock &BB) const {
return HasPrologue && !HasIRCheck && dyn_cast<ReturnInst>(BB.getTerminator());
}
+
+void StackProtector::copyToMachineFrameInfo(MachineFrameInfo &MFI) const {
+ if (Layout.empty())
+ return;
+
+ for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ const AllocaInst *AI = MFI.getObjectAllocation(I);
+ if (!AI)
+ continue;
+
+ SSPLayoutMap::const_iterator LI = Layout.find(AI);
+ if (LI == Layout.end())
+ continue;
+
+ MFI.setObjectSSPLayout(I, LI->second);
+ }
+}
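
StackProtector's HasAddressTaken no longer treats every call that uses an alloca as taking its address: debug-info and lifetime intrinsics are filtered out before the SSPLK_AddrOf fallback. A toy model of that filter, keyed on intrinsic names instead of the real IntrinsicInst IDs; the llvm.lifetime.* and llvm.dbg.* spellings are used here only for illustration:

#include <cassert>
#include <string>

// Calls that are merely bookkeeping (debug info, lifetime markers) should not
// force the address-taken stack-protector classification.
bool callCountsAsAddressTaken(const std::string &Callee) {
  bool IsLifetime = Callee.rfind("llvm.lifetime.", 0) == 0;
  bool IsDebug = Callee.rfind("llvm.dbg.", 0) == 0;
  return !IsLifetime && !IsDebug; // any other call still counts
}

int main() {
  assert(!callCountsAsAddressTaken("llvm.lifetime.start"));
  assert(!callCountsAsAddressTaken("llvm.dbg.declare"));
  assert(callCountsAsAddressTaken("memcpy"));
  return 0;
}
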
diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
index 8fc7a4a32842..eb15b15a24a6 100644
--- a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -82,14 +82,14 @@ namespace {
// AllColors - If index is set, it's a spill slot, i.e. color.
// FIXME: This assumes PEI locate spill slot with smaller indices
// closest to stack pointer / frame pointer. Therefore, smaller
- // index == better color.
- BitVector AllColors;
+ // index == better color. This is per stack ID.
+ SmallVector<BitVector, 2> AllColors;
- // NextColor - Next "color" that's not yet used.
- int NextColor = -1;
+ // NextColor - Next "color" that's not yet used. This is per stack ID.
+ SmallVector<int, 2> NextColors = { -1 };
- // UsedColors - "Colors" that have been assigned.
- BitVector UsedColors;
+ // UsedColors - "Colors" that have been assigned. This is per stack ID
+ SmallVector<BitVector, 2> UsedColors;
// Assignments - Color to intervals mapping.
SmallVector<SmallVector<LiveInterval*,4>, 16> Assignments;
@@ -196,10 +196,15 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
/// to a sorted (by weight) list.
void StackSlotColoring::InitializeSlots() {
int LastFI = MFI->getObjectIndexEnd();
+
+ // There is always at least one stack ID.
+ AllColors.resize(1);
+ UsedColors.resize(1);
+
OrigAlignments.resize(LastFI);
OrigSizes.resize(LastFI);
- AllColors.resize(LastFI);
- UsedColors.resize(LastFI);
+ AllColors[0].resize(LastFI);
+ UsedColors[0].resize(LastFI);
Assignments.resize(LastFI);
using Pair = std::iterator_traits<LiveStacks::iterator>::value_type;
@@ -209,29 +214,42 @@ void StackSlotColoring::InitializeSlots() {
Intervals.reserve(LS->getNumIntervals());
for (auto &I : *LS)
Intervals.push_back(&I);
- std::sort(Intervals.begin(), Intervals.end(),
- [](Pair *LHS, Pair *RHS) { return LHS->first < RHS->first; });
+ llvm::sort(Intervals.begin(), Intervals.end(),
+ [](Pair *LHS, Pair *RHS) { return LHS->first < RHS->first; });
// Gather all spill slots into a list.
- DEBUG(dbgs() << "Spill slot intervals:\n");
+ LLVM_DEBUG(dbgs() << "Spill slot intervals:\n");
for (auto *I : Intervals) {
LiveInterval &li = I->second;
- DEBUG(li.dump());
+ LLVM_DEBUG(li.dump());
int FI = TargetRegisterInfo::stackSlot2Index(li.reg);
if (MFI->isDeadObjectIndex(FI))
continue;
+
SSIntervals.push_back(&li);
OrigAlignments[FI] = MFI->getObjectAlignment(FI);
OrigSizes[FI] = MFI->getObjectSize(FI);
- AllColors.set(FI);
+
+ auto StackID = MFI->getStackID(FI);
+ if (StackID != 0) {
+ AllColors.resize(StackID + 1);
+ UsedColors.resize(StackID + 1);
+ AllColors[StackID].resize(LastFI);
+ UsedColors[StackID].resize(LastFI);
+ }
+
+ AllColors[StackID].set(FI);
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// Sort them by weight.
std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+ NextColors.resize(AllColors.size());
+
// Get first "color".
- NextColor = AllColors.find_first();
+ for (unsigned I = 0, E = AllColors.size(); I != E; ++I)
+ NextColors[I] = AllColors[I].find_first();
}
/// OverlapWithAssignments - Return true if LiveInterval overlaps with any
@@ -252,37 +270,41 @@ int StackSlotColoring::ColorSlot(LiveInterval *li) {
int Color = -1;
bool Share = false;
int FI = TargetRegisterInfo::stackSlot2Index(li->reg);
+ uint8_t StackID = MFI->getStackID(FI);
if (!DisableSharing) {
+
// Check if it's possible to reuse any of the used colors.
- Color = UsedColors.find_first();
+ Color = UsedColors[StackID].find_first();
while (Color != -1) {
if (!OverlapWithAssignments(li, Color)) {
Share = true;
++NumEliminated;
break;
}
- Color = UsedColors.find_next(Color);
+ Color = UsedColors[StackID].find_next(Color);
}
}
if (Color != -1 && MFI->getStackID(Color) != MFI->getStackID(FI)) {
- DEBUG(dbgs() << "cannot share FIs with different stack IDs\n");
+ LLVM_DEBUG(dbgs() << "cannot share FIs with different stack IDs\n");
Share = false;
}
// Assign it to the first available color (assumed to be the best) if it's
// not possible to share a used color with other objects.
if (!Share) {
- assert(NextColor != -1 && "No more spill slots?");
- Color = NextColor;
- UsedColors.set(Color);
- NextColor = AllColors.find_next(NextColor);
+ assert(NextColors[StackID] != -1 && "No more spill slots?");
+ Color = NextColors[StackID];
+ UsedColors[StackID].set(Color);
+ NextColors[StackID] = AllColors[StackID].find_next(NextColors[StackID]);
}
+ assert(MFI->getStackID(Color) == MFI->getStackID(FI));
+
// Record the assignment.
Assignments[Color].push_back(li);
- DEBUG(dbgs() << "Assigning fi#" << FI << " to fi#" << Color << "\n");
+ LLVM_DEBUG(dbgs() << "Assigning fi#" << FI << " to fi#" << Color << "\n");
// Change size and alignment of the allocated slot. If there are multiple
// objects sharing the same slot, then make sure the size and alignment
@@ -305,7 +327,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
SmallVector<SmallVector<int, 4>, 16> RevMap(NumObjs);
BitVector UsedColors(NumObjs);
- DEBUG(dbgs() << "Color spill slot intervals:\n");
+ LLVM_DEBUG(dbgs() << "Color spill slot intervals:\n");
bool Changed = false;
for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
LiveInterval *li = SSIntervals[i];
@@ -319,7 +341,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
Changed |= (SS != NewSS);
}
- DEBUG(dbgs() << "\nSpill slots after coloring:\n");
+ LLVM_DEBUG(dbgs() << "\nSpill slots after coloring:\n");
for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
LiveInterval *li = SSIntervals[i];
int SS = TargetRegisterInfo::stackSlot2Index(li->reg);
@@ -330,8 +352,8 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
#ifndef NDEBUG
for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i)
- DEBUG(SSIntervals[i]->dump());
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(SSIntervals[i]->dump());
+ LLVM_DEBUG(dbgs() << '\n');
#endif
if (!Changed)
@@ -357,10 +379,13 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
}
// Delete unused stack slots.
- while (NextColor != -1) {
- DEBUG(dbgs() << "Removing unused stack object fi#" << NextColor << "\n");
- MFI->RemoveStackObject(NextColor);
- NextColor = AllColors.find_next(NextColor);
+ for (int StackID = 0, E = AllColors.size(); StackID != E; ++StackID) {
+ int NextColor = NextColors[StackID];
+ while (NextColor != -1) {
+ LLVM_DEBUG(dbgs() << "Removing unused stack object fi#" << NextColor << "\n");
+ MFI->RemoveStackObject(NextColor);
+ NextColor = AllColors[StackID].find_next(NextColor);
+ }
}
return true;
@@ -382,6 +407,8 @@ void StackSlotColoring::RewriteInstruction(MachineInstr &MI,
int NewFI = SlotMapping[OldFI];
if (NewFI == -1 || NewFI == OldFI)
continue;
+
+ assert(MFI->getStackID(OldFI) == MFI->getStackID(NewFI));
MO.setIndex(NewFI);
}
@@ -418,17 +445,21 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
unsigned LoadReg = 0;
unsigned StoreReg = 0;
- if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
+ unsigned LoadSize = 0;
+ unsigned StoreSize = 0;
+ if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS, LoadSize)))
continue;
// Skip the ...pseudo debugging... instructions between a load and store.
- while ((NextMI != E) && NextMI->isDebugValue()) {
+ while ((NextMI != E) && NextMI->isDebugInstr()) {
++NextMI;
++I;
}
if (NextMI == E) continue;
- if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
+ if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS, StoreSize)))
+ continue;
+ if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1 ||
+ LoadSize != StoreSize)
continue;
- if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
++NumDead;
changed = true;
@@ -450,10 +481,13 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
}
bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
- dbgs() << "********** Stack Slot Coloring **********\n"
- << "********** Function: " << MF.getName() << '\n';
- });
+ LLVM_DEBUG({
+ dbgs() << "********** Stack Slot Coloring **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ if (skipFunction(MF.getFunction()))
+ return false;
MFI = &MF.getFrameInfo();
TII = MF.getSubtarget().getInstrInfo();
@@ -479,7 +513,9 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
InitializeSlots();
Changed = ColorSlots(MF);
- NextColor = -1;
+ for (int &Next : NextColors)
+ Next = -1;
+
SSIntervals.clear();
for (unsigned i = 0, e = SSRefs.size(); i != e; ++i)
SSRefs[i].clear();
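
StackSlotColoring now keeps AllColors, UsedColors and NextColors per stack ID, so frame indexes with different stack IDs are colored from disjoint pools and can never be merged into one slot. A toy allocator showing just that separation; the real pass tracks colors in BitVectors over frame indexes rather than with a simple counter:

#include <cassert>
#include <vector>

// Each stack ID owns its own "next unused color" cursor.
struct PerIDColorAllocator {
  std::vector<int> NextColor; // indexed by stack ID

  explicit PerIDColorAllocator(unsigned NumIDs) : NextColor(NumIDs, 0) {}

  int assign(unsigned StackID) {
    assert(StackID < NextColor.size() && "unknown stack ID");
    return NextColor[StackID]++; // first free color within this ID's pool
  }
};

int main() {
  PerIDColorAllocator Alloc(2);
  assert(Alloc.assign(0) == 0); // first slot of ID 0
  assert(Alloc.assign(1) == 0); // first slot of ID 1, a separate pool
  assert(Alloc.assign(0) == 1); // second slot of ID 0
  return 0;
}
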
diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
index df1eebf43b2b..25cd7802264e 100644
--- a/contrib/llvm/lib/CodeGen/TailDuplication.cpp
+++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
@@ -7,8 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass duplicates basic blocks ending in unconditional branches into
-// the tails of their predecessors, using the TailDuplicator utility class.
+/// \file This pass duplicates basic blocks ending in unconditional branches
+/// into the tails of their predecessors, using the TailDuplicator utility
+/// class.
//
//===----------------------------------------------------------------------===//
@@ -26,38 +27,55 @@ using namespace llvm;
namespace {
-/// Perform tail duplication. Delegates to TailDuplicator
-class TailDuplicatePass : public MachineFunctionPass {
+class TailDuplicateBase : public MachineFunctionPass {
TailDuplicator Duplicator;
-
+ bool PreRegAlloc;
public:
- static char ID;
-
- explicit TailDuplicatePass() : MachineFunctionPass(ID) {}
+ TailDuplicateBase(char &PassID, bool PreRegAlloc)
+ : MachineFunctionPass(PassID), PreRegAlloc(PreRegAlloc) {}
bool runOnMachineFunction(MachineFunction &MF) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+class TailDuplicate : public TailDuplicateBase {
+public:
+ static char ID;
+ TailDuplicate() : TailDuplicateBase(ID, false) {
+ initializeTailDuplicatePass(*PassRegistry::getPassRegistry());
+ }
+};
+
+class EarlyTailDuplicate : public TailDuplicateBase {
+public:
+ static char ID;
+ EarlyTailDuplicate() : TailDuplicateBase(ID, true) {
+ initializeEarlyTailDuplicatePass(*PassRegistry::getPassRegistry());
+ }
};
} // end anonymous namespace
-char TailDuplicatePass::ID = 0;
+char TailDuplicate::ID;
+char EarlyTailDuplicate::ID;
-char &llvm::TailDuplicateID = TailDuplicatePass::ID;
+char &llvm::TailDuplicateID = TailDuplicate::ID;
+char &llvm::EarlyTailDuplicateID = EarlyTailDuplicate::ID;
-INITIALIZE_PASS(TailDuplicatePass, DEBUG_TYPE, "Tail Duplication", false, false)
+INITIALIZE_PASS(TailDuplicate, DEBUG_TYPE, "Tail Duplication", false, false)
+INITIALIZE_PASS(EarlyTailDuplicate, "early-tailduplication",
+ "Early Tail Duplication", false, false)
-bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
+bool TailDuplicateBase::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
-
- // TODO: Querying isSSA() to determine pre-/post-regalloc is fragile, better
- // split this into two passes instead.
- bool PreRegAlloc = MF.getRegInfo().isSSA();
- Duplicator.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ false);
+ Duplicator.initMF(MF, PreRegAlloc, MBPI, /*LayoutMode=*/false);
bool MadeChange = false;
while (Duplicator.tailDuplicateBlocks())
@@ -65,8 +83,3 @@ bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
return MadeChange;
}
-
-void TailDuplicatePass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineBranchProbabilityInfo>();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
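
TailDuplicatePass previously guessed pre- vs post-regalloc from MachineRegisterInfo::isSSA(); the change above instead registers two thin wrappers around a shared base that is told the phase at construction time. A rough standalone sketch of that wrapper pattern, with placeholder names rather than the LLVM pass machinery:

#include <cstdio>

// Shared implementation, parameterized once at construction instead of
// re-deriving the phase from the function being compiled.
class TailDupBase {
  bool PreRegAlloc;
public:
  explicit TailDupBase(bool PreRA) : PreRegAlloc(PreRA) {}
  void run() const {
    std::printf("tail duplication, %s-regalloc mode\n",
                PreRegAlloc ? "pre" : "post");
  }
};

// Two thin, separately registered entry points.
struct EarlyTailDup : TailDupBase { EarlyTailDup() : TailDupBase(true) {} };
struct LateTailDup  : TailDupBase { LateTailDup()  : TailDupBase(false) {} };

int main() {
  EarlyTailDup().run(); // scheduled before register allocation
  LateTailDup().run();  // scheduled after register allocation
  return 0;
}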
diff --git a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
index f51c884839b3..b118c176a897 100644
--- a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -37,6 +37,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <iterator>
@@ -261,7 +262,7 @@ bool TailDuplicator::tailDuplicateBlocks() {
bool MadeChange = false;
if (PreRegAlloc && TailDupVerify) {
- DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
+ LLVM_DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
VerifyPHIs(*MF, true);
}
@@ -371,6 +372,13 @@ void TailDuplicator::duplicateInstruction(
MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
const DenseSet<unsigned> &UsedByPhi) {
+ // Allow duplication of CFI instructions.
+ if (MI->isCFIInstruction()) {
+ BuildMI(*PredBB, PredBB->end(), PredBB->findDebugLoc(PredBB->begin()),
+ TII->get(TargetOpcode::CFI_INSTRUCTION)).addCFIIndex(
+ MI->getOperand(0).getCFIIndex());
+ return;
+ }
MachineInstr &NewMI = TII->duplicate(*PredBB, PredBB->end(), *MI);
if (PreRegAlloc) {
for (unsigned i = 0, e = NewMI.getNumOperands(); i != e; ++i) {
@@ -585,7 +593,13 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
unsigned InstrCount = 0;
for (MachineInstr &MI : TailBB) {
// Non-duplicable things shouldn't be tail-duplicated.
- if (MI.isNotDuplicable())
+ // CFI instructions are marked as non-duplicable, because Darwin compact
+ // unwind info emission can't handle multiple prologue setups. In case of
+ // DWARF, allow them to be duplicated, so that their existence doesn't prevent
+ // tail duplication of basic blocks that would otherwise be duplicated.
+ if (MI.isNotDuplicable() &&
+ (TailBB.getParent()->getTarget().getTargetTriple().isOSDarwin() ||
+ !MI.isCFIInstruction()))
return false;
// Convergent instructions can be duplicated only if doing so doesn't add
@@ -605,7 +619,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
if (PreRegAlloc && MI.isCall())
return false;
- if (!MI.isPHI() && !MI.isDebugValue())
+ if (!MI.isPHI() && !MI.isMetaInstruction())
InstrCount += 1;
if (InstrCount > MaxDuplicateCount)
@@ -704,8 +718,8 @@ bool TailDuplicator::duplicateSimpleBB(
continue;
Changed = true;
- DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
- << "From simple Succ: " << *TailBB);
+ LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From simple Succ: " << *TailBB);
MachineBasicBlock *NewTarget = *TailBB->succ_begin();
MachineBasicBlock *NextBB = PredBB->getNextNode();
@@ -785,8 +799,8 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
MachineBasicBlock *ForcedLayoutPred,
SmallVectorImpl<MachineBasicBlock *> &TDBBs,
SmallVectorImpl<MachineInstr *> &Copies) {
- DEBUG(dbgs() << "\n*** Tail-duplicating " << printMBBReference(*TailBB)
- << '\n');
+ LLVM_DEBUG(dbgs() << "\n*** Tail-duplicating " << printMBBReference(*TailBB)
+ << '\n');
DenseSet<unsigned> UsedByPhi;
getRegsUsedByPHIs(*TailBB, &UsedByPhi);
@@ -816,8 +830,8 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
if (IsLayoutSuccessor)
continue;
- DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
- << "From Succ: " << *TailBB);
+ LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From Succ: " << *TailBB);
TDBBs.push_back(PredBB);
@@ -879,8 +893,8 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
(!PriorTBB || PriorTBB == TailBB) &&
TailBB->pred_size() == 1 &&
!TailBB->hasAddressTaken()) {
- DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
- << "From MBB: " << *TailBB);
+ LLVM_DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
+ << "From MBB: " << *TailBB);
// There may be a branch to the layout successor. This is unlikely but it
// happens. The correct thing to do is to remove the branch before
// duplicating the instructions in all cases.
@@ -985,7 +999,7 @@ void TailDuplicator::removeDeadBlock(
MachineBasicBlock *MBB,
function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
assert(MBB->pred_empty() && "MBB must be dead!");
- DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+ LLVM_DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
if (RemovalCallback)
(*RemovalCallback)(MBB);
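
shouldTailDuplicate now tolerates CFI instructions unless the target is Darwin, whose compact unwind encoding cannot represent more than one prologue setup. A small sketch of the same decision, with the platform test reduced to a boolean and MIRInstr a simplified stand-in:

// Hypothetical, stripped-down stand-in for a machine instruction.
struct MIRInstr {
  bool NotDuplicable; // conservatively marked non-duplicable
  bool IsCFI;         // .cfi_* directive materialized as an instruction
};

// CFI instructions are flagged non-duplicable; on DWARF targets the
// duplicator may still copy them, on Darwin (compact unwind) it must not.
bool blocksTailDuplication(const MIRInstr &MI, bool TargetIsDarwin) {
  if (!MI.NotDuplicable)
    return false;
  // The only non-duplicable thing we are willing to copy is a CFI
  // instruction, and only when compact unwind is not in play.
  return TargetIsDarwin || !MI.IsCFI;
}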
diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index b2151eb49655..f0cfa2fbe4fd 100644
--- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -36,6 +36,13 @@ bool TargetFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
return Attr.getValueAsString() == "true";
}
+bool TargetFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const {
+ assert(MF.getFunction().hasFnAttribute(Attribute::NoReturn) &&
+ MF.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ !MF.getFunction().hasFnAttribute(Attribute::UWTable));
+ return false;
+}
+
/// Returns the displacement from the frame register to the stack
/// frame of the specified index, along with the frame register used
/// (in output arg FrameReg). This is the default implementation which
@@ -85,6 +92,19 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MF.getFunction().hasFnAttribute(Attribute::Naked))
return;
+ // Noreturn+nounwind functions never restore CSR, so no saves are needed.
+ // Purely noreturn functions may still return through throws, so those must
+ // save CSR for caller exception handlers.
+ //
+ // If the function uses longjmp to break out of its current path of
+ // execution we do not need the CSR spills either: setjmp stores all CSRs
+ // it was called with into the jmp_buf, which longjmp then restores.
+ if (MF.getFunction().hasFnAttribute(Attribute::NoReturn) &&
+ MF.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ !MF.getFunction().hasFnAttribute(Attribute::UWTable) &&
+ enableCalleeSaveSkip(MF))
+ return;
+
// Functions which call __builtin_unwind_init get all their registers saved.
bool CallsUnwindInit = MF.callsUnwindInit();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -100,7 +120,16 @@ unsigned TargetFrameLowering::getStackAlignmentSkew(
// When HHVM function is called, the stack is skewed as the return address
// is removed from the stack before we enter the function.
if (LLVM_UNLIKELY(MF.getFunction().getCallingConv() == CallingConv::HHVM))
- return MF.getTarget().getPointerSize();
+ return MF.getTarget().getAllocaPointerSize();
return 0;
}
+
+int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ llvm_unreachable("getInitialCFAOffset() not implemented!");
+}
+
+unsigned TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF)
+ const {
+ llvm_unreachable("getInitialCFARegister() not implemented!");
+}
\ No newline at end of file
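
determineCalleeSaves can now bail out early for functions that are both noreturn and nounwind (and not uwtable), provided the target opts in via enableCalleeSaveSkip. A compact sketch of that gate, with the function attributes reduced to plain flags:

// Hypothetical attribute summary for one function.
struct FnAttrs {
  bool NoReturn;
  bool NoUnwind;
  bool UWTable;
};

// Mirrors the early return added to determineCalleeSaves: callee-saved
// register spills are skipped only when the function can neither return
// normally nor unwind, and the target said the optimization is safe.
bool skipCalleeSavedSpills(const FnAttrs &F, bool TargetAllowsSkip) {
  return F.NoReturn && F.NoUnwind && !F.UWTable && TargetAllowsSkip;
}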
diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
index bd90ed5b55b8..963f8178b509 100644
--- a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -174,6 +174,14 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
bool Reg2IsUndef = MI.getOperand(Idx2).isUndef();
bool Reg1IsInternal = MI.getOperand(Idx1).isInternalRead();
bool Reg2IsInternal = MI.getOperand(Idx2).isInternalRead();
+ // Avoid calling isRenamable for virtual registers since we assert that
+ // renamable property is only queried/set for physical registers.
+ bool Reg1IsRenamable = TargetRegisterInfo::isPhysicalRegister(Reg1)
+ ? MI.getOperand(Idx1).isRenamable()
+ : false;
+ bool Reg2IsRenamable = TargetRegisterInfo::isPhysicalRegister(Reg2)
+ ? MI.getOperand(Idx2).isRenamable()
+ : false;
// If destination is tied to either of the commuted source register, then
// it must be updated.
if (HasDef && Reg0 == Reg1 &&
@@ -211,6 +219,12 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
CommutedMI->getOperand(Idx1).setIsUndef(Reg2IsUndef);
CommutedMI->getOperand(Idx2).setIsInternalRead(Reg1IsInternal);
CommutedMI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);
+ // Avoid calling setIsRenamable for virtual registers since we assert that
+ // renamable property is only queried/set for physical registers.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg1))
+ CommutedMI->getOperand(Idx2).setIsRenamable(Reg1IsRenamable);
+ if (TargetRegisterInfo::isPhysicalRegister(Reg2))
+ CommutedMI->getOperand(Idx1).setIsRenamable(Reg2IsRenamable);
return CommutedMI;
}
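
commuteInstructionImpl now snapshots the isRenamable bit of both source operands before swapping them, but only for physical registers, since the flag is asserted to be meaningless on virtual registers. A simplified sketch of carrying per-operand flags across a commute; Operand is a stand-in, not MachineOperand:

#include <utility>

// Hypothetical operand: a register plus the flags that travel with it.
struct Operand {
  unsigned Reg;
  bool IsPhysical;
  bool Kill;
  bool Undef;
  bool Renamable; // only meaningful for physical registers
};

void commuteOperands(Operand &A, Operand &B) {
  // Snapshot the renamable bits first; they are only defined for physical
  // registers, so virtual registers contribute 'false'.
  bool ARenamable = A.IsPhysical && A.Renamable;
  bool BRenamable = B.IsPhysical && B.Renamable;

  std::swap(A.Reg, B.Reg);
  std::swap(A.IsPhysical, B.IsPhysical);
  std::swap(A.Kill, B.Kill);
  std::swap(A.Undef, B.Undef);

  // Re-apply the renamable bits to the slots that now hold the physical
  // registers, mirroring the guarded setIsRenamable calls in the patch.
  if (A.IsPhysical) A.Renamable = BRenamable;
  if (B.IsPhysical) B.Renamable = ARenamable;
}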
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b29a33ac1c14..43f4bad595e3 100644
--- a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -50,6 +49,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
@@ -118,7 +118,7 @@ static cl::opt<int> MinPercentageForPredictableBranch(
void TargetLoweringBase::InitLibcalls(const Triple &TT) {
#define HANDLE_LIBCALL(code, name) \
setLibcallName(RTLIB::code, name);
-#include "llvm/CodeGen/RuntimeLibcalls.def"
+#include "llvm/IR/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
// Initialize calling conventions to their default.
for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
@@ -192,6 +192,9 @@ RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
return FPEXT_F64_F128;
else if (RetVT == MVT::ppcf128)
return FPEXT_F64_PPCF128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::f128)
+ return FPEXT_F80_F128;
}
return UNKNOWN_LIBCALL;
@@ -227,6 +230,9 @@ RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
return FPROUND_F128_F64;
if (OpVT == MVT::ppcf128)
return FPROUND_PPCF128_F64;
+ } else if (RetVT == MVT::f80) {
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F80;
}
return UNKNOWN_LIBCALL;
@@ -529,6 +535,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
// Perform these initializations only once.
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
MaxLoadsPerMemcmp = 8;
+ MaxGluedStoresPerMemcpy = 0;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
UseUnderscoreSetJmp = false;
@@ -614,6 +621,12 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SUBCARRY, VT, Expand);
setOperationAction(ISD::SETCCCARRY, VT, Expand);
+ // ADDC/ADDE/SUBC/SUBE default to expand.
+ setOperationAction(ISD::ADDC, VT, Expand);
+ setOperationAction(ISD::ADDE, VT, Expand);
+ setOperationAction(ISD::SUBC, VT, Expand);
+ setOperationAction(ISD::SUBE, VT, Expand);
+
// These default to Expand so they will be expanded to CTLZ/CTTZ by default.
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
@@ -679,12 +692,13 @@ MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL,
return MVT::getIntegerVT(8 * DL.getPointerSize(0));
}
-EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy,
- const DataLayout &DL) const {
+EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
+ bool LegalTypes) const {
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
if (LHSTy.isVector())
return LHSTy;
- return getScalarShiftAmountTy(DL, LHSTy);
+ return LegalTypes ? getScalarShiftAmountTy(DL, LHSTy)
+ : getPointerTy(DL);
}
bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const {
@@ -979,6 +993,36 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
return MBB;
}
+MachineBasicBlock *
+TargetLoweringBase::emitXRayCustomEvent(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ assert(MI.getOpcode() == TargetOpcode::PATCHABLE_EVENT_CALL &&
+ "Called emitXRayCustomEvent on the wrong MI!");
+ auto &MF = *MI.getMF();
+ auto MIB = BuildMI(MF, MI.getDebugLoc(), MI.getDesc());
+ for (unsigned OpIdx = 0; OpIdx != MI.getNumOperands(); ++OpIdx)
+ MIB.add(MI.getOperand(OpIdx));
+
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ MI.eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *
+TargetLoweringBase::emitXRayTypedEvent(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ assert(MI.getOpcode() == TargetOpcode::PATCHABLE_TYPED_EVENT_CALL &&
+ "Called emitXRayTypedEvent on the wrong MI!");
+ auto &MF = *MI.getMF();
+ auto MIB = BuildMI(MF, MI.getDebugLoc(), MI.getDesc());
+ for (unsigned OpIdx = 0; OpIdx != MI.getNumOperands(); ++OpIdx)
+ MIB.add(MI.getOperand(OpIdx));
+
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ MI.eraseFromParent();
+ return MBB;
+}
+
/// findRepresentativeClass - Return the largest legal super-reg register class
/// of the register class for the specified type and its associated "cost".
// This function is in TargetLowering because it uses RegClassForVT which would
@@ -1587,13 +1631,16 @@ Value *TargetLoweringBase::getIRStackGuard(IRBuilder<> &IRB) const {
// Currently only support "standard" __stack_chk_guard.
// TODO: add LOAD_STACK_GUARD support.
void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
- M.getOrInsertGlobal("__stack_chk_guard", Type::getInt8PtrTy(M.getContext()));
+ if (!M.getNamedValue("__stack_chk_guard"))
+ new GlobalVariable(M, Type::getInt8PtrTy(M.getContext()), false,
+ GlobalVariable::ExternalLinkage,
+ nullptr, "__stack_chk_guard");
}
// Currently only support "standard" __stack_chk_guard.
// TODO: add LOAD_STACK_GUARD support.
Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {
- return M.getGlobalVariable("__stack_chk_guard", true);
+ return M.getNamedValue("__stack_chk_guard");
}
Value *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const {
@@ -1683,7 +1730,7 @@ static int getOpEnabled(bool IsSqrt, EVT VT, StringRef Override) {
return TargetLoweringBase::ReciprocalEstimate::Unspecified;
SmallVector<StringRef, 4> OverrideVector;
- SplitString(Override, OverrideVector, ",");
+ Override.split(OverrideVector, ',');
unsigned NumArgs = OverrideVector.size();
// Check if "all", "none", or "default" was specified.
@@ -1743,7 +1790,7 @@ static int getOpRefinementSteps(bool IsSqrt, EVT VT, StringRef Override) {
return TargetLoweringBase::ReciprocalEstimate::Unspecified;
SmallVector<StringRef, 4> OverrideVector;
- SplitString(Override, OverrideVector, ",");
+ Override.split(OverrideVector, ',');
unsigned NumArgs = OverrideVector.size();
// Check if "all", "default", or "none" was specified.
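
Among the TargetLoweringBase changes above, getFPEXT gains an f80 to f128 pairing (and getFPROUND the inverse). A self-contained sketch of that table lookup, using a toy enum for the value types and returning the RTLIB entry name as a string; an empty string stands for UNKNOWN_LIBCALL:

#include <string>

// Simplified stand-ins for the floating-point value types involved.
enum class FpTy { F32, F64, F80, F128, PPCF128, Other };

// Pick the RTLIB extension libcall for OpTy -> RetTy, including the
// f80 -> f128 pairing the patch adds. Only a few pairings are shown.
std::string fpextLibcall(FpTy OpTy, FpTy RetTy) {
  if (OpTy == FpTy::F32 && RetTy == FpTy::F64)     return "FPEXT_F32_F64";
  if (OpTy == FpTy::F32 && RetTy == FpTy::F128)    return "FPEXT_F32_F128";
  if (OpTy == FpTy::F64 && RetTy == FpTy::F128)    return "FPEXT_F64_F128";
  if (OpTy == FpTy::F64 && RetTy == FpTy::PPCF128) return "FPEXT_F64_PPCF128";
  if (OpTy == FpTy::F80 && RetTy == FpTy::F128)    return "FPEXT_F80_F128";
  return "";
}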
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 24d4baa31e1f..b5dd2d4cca89 100644
--- a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -91,23 +91,86 @@ static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags,
// ELF
//===----------------------------------------------------------------------===//
-void TargetLoweringObjectFileELF::emitModuleMetadata(
- MCStreamer &Streamer, Module &M, const TargetMachine &TM) const {
+void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
+ const TargetMachine &TgtM) {
+ TargetLoweringObjectFile::Initialize(Ctx, TgtM);
+ TM = &TgtM;
+}
+
+void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer,
+ Module &M) const {
+ auto &C = getContext();
+
+ if (NamedMDNode *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
+ auto *S = C.getELFSection(".linker-options", ELF::SHT_LLVM_LINKER_OPTIONS,
+ ELF::SHF_EXCLUDE);
+
+ Streamer.SwitchSection(S);
+
+ for (const auto &Operand : LinkerOptions->operands()) {
+ if (cast<MDNode>(Operand)->getNumOperands() != 2)
+ report_fatal_error("invalid llvm.linker.options");
+ for (const auto &Option : cast<MDNode>(Operand)->operands()) {
+ Streamer.EmitBytes(cast<MDString>(Option)->getString());
+ Streamer.EmitIntValue(0, 1);
+ }
+ }
+ }
+
unsigned Version = 0;
unsigned Flags = 0;
StringRef Section;
GetObjCImageInfo(M, Version, Flags, Section);
- if (Section.empty())
+ if (!Section.empty()) {
+ auto *S = C.getELFSection(Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Streamer.SwitchSection(S);
+ Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
+ Streamer.EmitIntValue(Version, 4);
+ Streamer.EmitIntValue(Flags, 4);
+ Streamer.AddBlankLine();
+ }
+
+ SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+ M.getModuleFlagsMetadata(ModuleFlags);
+
+ MDNode *CFGProfile = nullptr;
+
+ for (const auto &MFE : ModuleFlags) {
+ StringRef Key = MFE.Key->getString();
+ if (Key == "CG Profile") {
+ CFGProfile = cast<MDNode>(MFE.Val);
+ break;
+ }
+ }
+
+ if (!CFGProfile)
return;
- auto &C = getContext();
- auto *S = C.getELFSection(Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
- Streamer.SwitchSection(S);
- Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
- Streamer.EmitIntValue(Version, 4);
- Streamer.EmitIntValue(Flags, 4);
- Streamer.AddBlankLine();
+ auto GetSym = [this](const MDOperand &MDO) -> MCSymbol * {
+ if (!MDO)
+ return nullptr;
+ auto V = cast<ValueAsMetadata>(MDO);
+ const Function *F = cast<Function>(V->getValue());
+ return TM->getSymbol(F);
+ };
+
+ for (const auto &Edge : CFGProfile->operands()) {
+ MDNode *E = cast<MDNode>(Edge);
+ const MCSymbol *From = GetSym(E->getOperand(0));
+ const MCSymbol *To = GetSym(E->getOperand(1));
+ // Skip null functions. This can happen if functions are dead stripped after
+ // the CGProfile pass has been run.
+ if (!From || !To)
+ continue;
+ uint64_t Count = cast<ConstantAsMetadata>(E->getOperand(2))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue();
+ Streamer.emitCGProfileEntry(
+ MCSymbolRefExpr::create(From, MCSymbolRefExpr::VK_None, C),
+ MCSymbolRefExpr::create(To, MCSymbolRefExpr::VK_None, C), Count);
+ }
}
MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(
@@ -170,7 +233,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
}
static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) {
- // N.B.: The defaults used in here are no the same ones used in MC.
+ // N.B.: The defaults used in here are not the same ones used in MC.
// We follow gcc, MC follows gas. For example, given ".section .eh_frame",
// both gas and MC will produce a section with no flags. Given
// section(".eh_frame") gcc will produce:
@@ -183,7 +246,7 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) {
if (Name.empty() || Name[0] != '.') return K;
- // Some lame default implementation based on some magic section names.
+ // Default implementation based on some magic section names.
if (Name == ".bss" ||
Name.startswith(".bss.") ||
Name.startswith(".gnu.linkonce.b.") ||
@@ -335,7 +398,8 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
/*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
// Make sure that we did not get some other section with incompatible sh_link.
// This should not be possible due to UniqueID code above.
- assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol &&
+ "Associated symbol mismatch between sections");
return Section;
}
@@ -617,8 +681,8 @@ void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
}
}
-void TargetLoweringObjectFileMachO::emitModuleMetadata(
- MCStreamer &Streamer, Module &M, const TargetMachine &TM) const {
+void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer,
+ Module &M) const {
// Emit the linker options if present.
if (auto *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
for (const auto &Option : LinkerOptions->operands()) {
@@ -727,6 +791,8 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal(
if (GO->isWeakForLinker()) {
if (Kind.isReadOnly())
return ConstTextCoalSection;
+ if (Kind.isReadOnlyWithRel())
+ return ConstDataCoalSection;
return DataCoalSection;
}
@@ -1040,7 +1106,7 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
Selection);
}
-static const char *getCOFFSectionNameForUniqueGlobal(SectionKind Kind) {
+static StringRef getCOFFSectionNameForUniqueGlobal(SectionKind Kind) {
if (Kind.isText())
return ".text";
if (Kind.isBSS())
@@ -1063,7 +1129,8 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
EmitUniquedSection = TM.getDataSections();
if ((EmitUniquedSection && !Kind.isCommon()) || GO->hasComdat()) {
- const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
+ SmallString<256> Name = getCOFFSectionNameForUniqueGlobal(Kind);
+
unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
@@ -1083,6 +1150,12 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
if (!ComdatGV->hasPrivateLinkage()) {
MCSymbol *Sym = TM.getSymbol(ComdatGV);
StringRef COMDATSymName = Sym->getName();
+
+ // Append "$symbol" to the section name when targeting mingw. The ld.bfd
+ // COFF linker will not properly handle comdats otherwise.
+ if (getTargetTriple().isWindowsGNUEnvironment())
+ raw_svector_ostream(Name) << '$' << COMDATSymName;
+
return getContext().getCOFFSection(Name, Characteristics, Kind,
COMDATSymName, Selection, UniqueID);
} else {
@@ -1140,17 +1213,18 @@ MCSection *TargetLoweringObjectFileCOFF::getSectionForJumpTable(
StringRef COMDATSymName = Sym->getName();
SectionKind Kind = SectionKind::getReadOnly();
- const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
+ StringRef SecName = getCOFFSectionNameForUniqueGlobal(Kind);
unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
unsigned UniqueID = NextUniqueID++;
- return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName,
- COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
+ return getContext().getCOFFSection(
+ SecName, Characteristics, Kind, COMDATSymName,
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
}
-void TargetLoweringObjectFileCOFF::emitModuleMetadata(
- MCStreamer &Streamer, Module &M, const TargetMachine &TM) const {
+void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer,
+ Module &M) const {
if (NamedMDNode *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
// Emit the linker options to the linker .drectve section. According to the
// spec, this section is a space-separated string containing flags for
@@ -1250,19 +1324,136 @@ void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal(
emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler());
}
+void TargetLoweringObjectFileCOFF::emitLinkerFlagsForUsed(
+ raw_ostream &OS, const GlobalValue *GV) const {
+ emitLinkerFlagsForUsedCOFF(OS, GV, getTargetTriple(), getMangler());
+}
+
+const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ const Triple &T = TM.getTargetTriple();
+ if (!T.isKnownWindowsMSVCEnvironment() &&
+ !T.isWindowsItaniumEnvironment() &&
+ !T.isWindowsCoreCLREnvironment())
+ return nullptr;
+
+ // Our symbols should exist in address space zero, cowardly no-op if
+ // otherwise.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0)
+ return nullptr;
+
+ // Both ptrtoint instructions must wrap global objects:
+ // - Only global variables are eligible for image relative relocations.
+ // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
+ // We expect __ImageBase to be a global variable without a section, externally
+ // defined.
+ //
+ // It should look something like this: @__ImageBase = external constant i8
+ if (!isa<GlobalObject>(LHS) || !isa<GlobalVariable>(RHS) ||
+ LHS->isThreadLocal() || RHS->isThreadLocal() ||
+ RHS->getName() != "__ImageBase" || !RHS->hasExternalLinkage() ||
+ cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection())
+ return nullptr;
+
+ return MCSymbolRefExpr::create(TM.getSymbol(LHS),
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ getContext());
+}
+
+static std::string APIntToHexString(const APInt &AI) {
+ unsigned Width = (AI.getBitWidth() / 8) * 2;
+ std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
+ unsigned Size = HexString.size();
+ assert(Width >= Size && "hex string is too large!");
+ HexString.insert(HexString.begin(), Width - Size, '0');
+
+ return HexString;
+}
+
+static std::string scalarConstantToHexString(const Constant *C) {
+ Type *Ty = C->getType();
+ if (isa<UndefValue>(C)) {
+ return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
+ } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
+ return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
+ } else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
+ return APIntToHexString(CI->getValue());
+ } else {
+ unsigned NumElements;
+ if (isa<VectorType>(Ty))
+ NumElements = Ty->getVectorNumElements();
+ else
+ NumElements = Ty->getArrayNumElements();
+ std::string HexString;
+ for (int I = NumElements - 1, E = -1; I != E; --I)
+ HexString += scalarConstantToHexString(C->getAggregateElement(I));
+ return HexString;
+ }
+}
+
+MCSection *TargetLoweringObjectFileCOFF::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
+ if (Kind.isMergeableConst() && C &&
+ getContext().getAsmInfo()->hasCOFFComdatConstants()) {
+ // This creates comdat sections with the given symbol name, but unless
+ // AsmPrinter::GetCPISymbol actually makes the symbol global, the symbol
+ // will be created with a null storage class, which makes GNU binutils
+ // error out.
+ const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_LNK_COMDAT;
+ std::string COMDATSymName;
+ if (Kind.isMergeableConst4()) {
+ if (Align <= 4) {
+ COMDATSymName = "__real@" + scalarConstantToHexString(C);
+ Align = 4;
+ }
+ } else if (Kind.isMergeableConst8()) {
+ if (Align <= 8) {
+ COMDATSymName = "__real@" + scalarConstantToHexString(C);
+ Align = 8;
+ }
+ } else if (Kind.isMergeableConst16()) {
+ // FIXME: These may not be appropriate for non-x86 architectures.
+ if (Align <= 16) {
+ COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
+ Align = 16;
+ }
+ } else if (Kind.isMergeableConst32()) {
+ if (Align <= 32) {
+ COMDATSymName = "__ymm@" + scalarConstantToHexString(C);
+ Align = 32;
+ }
+ }
+
+ if (!COMDATSymName.empty())
+ return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+ COMDATSymName,
+ COFF::IMAGE_COMDAT_SELECT_ANY);
+ }
+
+ return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
+}
+
+
//===----------------------------------------------------------------------===//
// Wasm
//===----------------------------------------------------------------------===//
-static void checkWasmComdat(const GlobalValue *GV) {
+static const Comdat *getWasmComdat(const GlobalValue *GV) {
const Comdat *C = GV->getComdat();
if (!C)
- return;
+ return nullptr;
- // TODO(sbc): At some point we may need COMDAT support but currently
- // they are not supported.
- report_fatal_error("WebAssembly doesn't support COMDATs, '" + C->getName() +
- "' cannot be lowered.");
+ if (C->getSelectionKind() != Comdat::Any)
+ report_fatal_error("WebAssembly COMDATs only support "
+ "SelectionKind::Any, '" + C->getName() + "' cannot be "
+ "lowered.");
+
+ return C;
}
static SectionKind getWasmKindForNamedSection(StringRef Name, SectionKind K) {
@@ -1277,17 +1468,32 @@ static SectionKind getWasmKindForNamedSection(StringRef Name, SectionKind K) {
MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ // We don't support explicit section names for functions in the wasm object
+ // format. Each function has to be in its own unique section.
+ if (isa<Function>(GO)) {
+ return SelectSectionForGlobal(GO, Kind, TM);
+ }
+
StringRef Name = GO->getSection();
- checkWasmComdat(GO);
+
Kind = getWasmKindForNamedSection(Name, Kind);
- return getContext().getWasmSection(Name, Kind);
+
+ StringRef Group = "";
+ if (const Comdat *C = getWasmComdat(GO)) {
+ Group = C->getName();
+ }
+
+ return getContext().getWasmSection(Name, Kind, Group,
+ MCContext::GenericSectionID);
}
static MCSectionWasm *selectWasmSectionForGlobal(
MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
const TargetMachine &TM, bool EmitUniqueSection, unsigned *NextUniqueID) {
StringRef Group = "";
- checkWasmComdat(GO);
+ if (const Comdat *C = getWasmComdat(GO)) {
+ Group = C->getName();
+ }
bool UniqueSectionNames = TM.getUniqueSectionNames();
SmallString<128> Name = getSectionPrefixForGlobal(Kind);
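
For COFF on MinGW, the uniqued-section path above now appends "$<symbol>" to the section name so ld.bfd groups the comdat sections correctly. A tiny sketch of that naming rule, using plain strings instead of MCContext sections:

#include <string>

// Build the section name used for a uniqued global. On MinGW targets the
// comdat symbol is appended after '$' so the BFD linker can match the
// comdat sections; MSVC-style linkers do not need the suffix.
std::string comdatSectionName(const std::string &BaseName,
                              const std::string &ComdatSym,
                              bool IsWindowsGNU) {
  std::string Name = BaseName;   // e.g. ".text", ".rdata", ".data"
  if (IsWindowsGNU && !ComdatSym.empty())
    Name += "$" + ComdatSym;     // e.g. ".text$_Z3foov"
  return Name;
}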
diff --git a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
index 3e6ad3eeef0f..3fca2f4ee4fe 100644
--- a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -41,6 +41,7 @@
#include "llvm/Support/Threading.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include <cassert>
#include <string>
@@ -80,6 +81,9 @@ static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
cl::desc("Disable Machine LICM"));
static cl::opt<bool> DisableMachineSink("disable-machine-sink", cl::Hidden,
cl::desc("Disable Machine Sinking"));
+static cl::opt<bool> DisablePostRAMachineSink("disable-postra-machine-sink",
+ cl::Hidden,
+ cl::desc("Disable PostRA Machine Sinking"));
static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
cl::desc("Disable Loop Strength Reduction Pass"));
static cl::opt<bool> DisableConstantHoisting("disable-constant-hoisting",
@@ -94,10 +98,9 @@ static cl::opt<bool> EnableImplicitNullChecks(
"enable-implicit-null-checks",
cl::desc("Fold null checks into faulting memory operations"),
cl::init(false), cl::Hidden);
-static cl::opt<bool>
- EnableMergeICmps("enable-mergeicmps",
- cl::desc("Merge ICmp chains into a single memcmp"),
- cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableMergeICmps("disable-mergeicmps",
+ cl::desc("Disable MergeICmps Pass"),
+ cl::init(false), cl::Hidden);
static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
cl::desc("Print LLVM IR produced by the loop-reduce pass"));
static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
@@ -108,14 +111,16 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
cl::desc("Verify generated machine code"),
cl::init(false),
cl::ZeroOrMore);
-static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner",
- cl::Hidden,
- cl::desc("Enable machine outliner"));
-static cl::opt<bool> EnableLinkOnceODROutlining(
- "enable-linkonceodr-outlining",
- cl::Hidden,
- cl::desc("Enable the machine outliner on linkonceodr functions"),
- cl::init(false));
+enum RunOutliner { AlwaysOutline, NeverOutline, TargetDefault };
+// Enable or disable the MachineOutliner.
+static cl::opt<RunOutliner> EnableMachineOutliner(
+ "enable-machine-outliner", cl::desc("Enable the machine outliner"),
+ cl::Hidden, cl::ValueOptional, cl::init(TargetDefault),
+ cl::values(clEnumValN(AlwaysOutline, "always",
+ "Run on all functions guaranteed to be beneficial"),
+ clEnumValN(NeverOutline, "never", "Disable all outlining"),
+ // Sentinel value for unspecified option.
+ clEnumValN(AlwaysOutline, "", "")));
// Enable or disable FastISel. Both options are needed, because
// FastISel is enabled by default with -fast, and we wish to be
// able to enable or disable fast-isel independently from -O0.
@@ -123,9 +128,9 @@ static cl::opt<cl::boolOrDefault>
EnableFastISelOption("fast-isel", cl::Hidden,
cl::desc("Enable the \"fast\" instruction selector"));
-static cl::opt<cl::boolOrDefault>
- EnableGlobalISel("global-isel", cl::Hidden,
- cl::desc("Enable the \"global\" instruction selector"));
+static cl::opt<cl::boolOrDefault> EnableGlobalISelOption(
+ "global-isel", cl::Hidden,
+ cl::desc("Enable the \"global\" instruction selector"));
static cl::opt<std::string> PrintMachineInstrs(
"print-machineinstrs", cl::ValueOptional, cl::desc("Print machine instrs"),
@@ -226,7 +231,7 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID,
if (StandardID == &TailDuplicateID)
return applyDisable(TargetID, DisableTailDuplicate);
- if (StandardID == &TargetPassConfig::EarlyTailDuplicateID)
+ if (StandardID == &EarlyTailDuplicateID)
return applyDisable(TargetID, DisableEarlyTailDup);
if (StandardID == &MachineBlockPlacementID)
@@ -241,18 +246,21 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID,
if (StandardID == &EarlyIfConverterID)
return applyDisable(TargetID, DisableEarlyIfConversion);
- if (StandardID == &MachineLICMID)
+ if (StandardID == &EarlyMachineLICMID)
return applyDisable(TargetID, DisableMachineLICM);
if (StandardID == &MachineCSEID)
return applyDisable(TargetID, DisableMachineCSE);
- if (StandardID == &TargetPassConfig::PostRAMachineLICMID)
+ if (StandardID == &MachineLICMID)
return applyDisable(TargetID, DisablePostRAMachineLICM);
if (StandardID == &MachineSinkingID)
return applyDisable(TargetID, DisableMachineSink);
+ if (StandardID == &PostRAMachineSinkingID)
+ return applyDisable(TargetID, DisablePostRAMachineSink);
+
if (StandardID == &MachineCopyPropagationID)
return applyDisable(TargetID, DisableCopyProp);
@@ -267,10 +275,6 @@ INITIALIZE_PASS(TargetPassConfig, "targetpassconfig",
"Target Pass Configuration", false, false)
char TargetPassConfig::ID = 0;
-// Pseudo Pass IDs.
-char TargetPassConfig::EarlyTailDuplicateID = 0;
-char TargetPassConfig::PostRAMachineLICMID = 0;
-
namespace {
struct InsertedPass {
@@ -366,10 +370,6 @@ TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm)
initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry());
initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry());
- // Substitute Pseudo Pass IDs for real ones.
- substitutePass(&EarlyTailDuplicateID, &TailDuplicateID);
- substitutePass(&PostRAMachineLICMID, &MachineLICMID);
-
if (StringRef(PrintMachineInstrs.getValue()).equals(""))
TM.Options.PrintMachineCode = true;
@@ -604,7 +604,7 @@ void TargetPassConfig::addIRPasses() {
// loads and compares. ExpandMemCmpPass then tries to expand those calls
// into optimally-sized loads and compares. The transforms are enabled by a
// target lowering hook.
- if (EnableMergeICmps)
+ if (!DisableMergeICmps)
addPass(createMergeICmpsPass());
addPass(createExpandMemCmpPass());
}
@@ -662,6 +662,14 @@ void TargetPassConfig::addPassesToHandleExceptions() {
addPass(createWinEHPass());
addPass(createDwarfEHPass());
break;
+ case ExceptionHandling::Wasm:
+ // Wasm EH uses Windows EH instructions, but it does not need to demote PHIs
+ // on catchpads and cleanuppads because it does not outline them into
+ // funclets. Catchswitch blocks are not lowered in SelectionDAG, so we
+ // should remove PHIs there.
+ addPass(createWinEHPass(/*DemoteCatchSwitchPHIOnly=*/false));
+ addPass(createWasmEHPass());
+ break;
case ExceptionHandling::None:
addPass(createLowerInvokePass());
@@ -704,19 +712,18 @@ void TargetPassConfig::addISelPrepare() {
}
bool TargetPassConfig::addCoreISelPasses() {
- // Enable FastISel with -fast, but allow that to be overridden.
+ // Enable FastISel with -fast-isel, but allow that to be overridden.
TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
if (EnableFastISelOption == cl::BOU_TRUE ||
(TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel()))
TM->setFastISel(true);
- // Ask the target for an isel.
- // Enable GlobalISel if the target wants to, but allow that to be overriden.
+ // Ask the target for an instruction selector.
// Explicitly enabling fast-isel should override implicitly enabled
// global-isel.
- if (EnableGlobalISel == cl::BOU_TRUE ||
- (EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled() &&
- EnableFastISelOption != cl::BOU_TRUE)) {
+ if (EnableGlobalISelOption == cl::BOU_TRUE ||
+ (EnableGlobalISelOption == cl::BOU_UNSET &&
+ TM->Options.EnableGlobalISel && EnableFastISelOption != cl::BOU_TRUE)) {
TM->setFastISel(false);
if (addIRTranslator())
@@ -755,7 +762,7 @@ bool TargetPassConfig::addCoreISelPasses() {
}
bool TargetPassConfig::addISelPasses() {
- if (TM->Options.EmulatedTLS)
+ if (TM->useEmulatedTLS())
addPass(createLowerEmuTLSPass());
addPass(createPreISelIntrinsicLoweringPass());
@@ -844,8 +851,10 @@ void TargetPassConfig::addMachinePasses() {
addPostRegAlloc();
// Insert prolog/epilog code. Eliminate abstract frame index references...
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(&PostRAMachineSinkingID);
addPass(&ShrinkWrapID);
+ }
// Prolog/Epilog inserter needs a TargetMachine to instantiate. But only
// do so if it hasn't been disabled, substituted, or overridden.
@@ -904,8 +913,14 @@ void TargetPassConfig::addMachinePasses() {
addPass(&XRayInstrumentationID, false);
addPass(&PatchableFunctionID, false);
- if (EnableMachineOutliner)
- PM->add(createMachineOutlinerPass(EnableLinkOnceODROutlining));
+ if (TM->Options.EnableMachineOutliner && getOptLevel() != CodeGenOpt::None &&
+ EnableMachineOutliner != NeverOutline) {
+ bool RunOnAllFunctions = (EnableMachineOutliner == AlwaysOutline);
+ bool AddOutliner = RunOnAllFunctions ||
+ TM->Options.SupportsDefaultOutlining;
+ if (AddOutliner)
+ addPass(createMachineOutlinerPass(RunOnAllFunctions));
+ }
// Add passes that directly emit MI after all other MI passes.
addPreEmitPass2();
@@ -941,7 +956,7 @@ void TargetPassConfig::addMachineSSAOptimization() {
// loop info, just like LICM and CSE below.
addILPOpts();
- addPass(&MachineLICMID, false);
+ addPass(&EarlyMachineLICMID, false);
addPass(&MachineCSEID, false);
addPass(&MachineSinkingID);
@@ -1090,10 +1105,14 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// kill markers.
addPass(&StackSlotColoringID);
+ // Copy propagate to forward register uses and try to eliminate COPYs that
+ // were not coalesced.
+ addPass(&MachineCopyPropagationID);
+
// Run post-ra machine LICM to hoist reloads / remats.
//
// FIXME: can this move into MachineLateOptimization?
- addPass(&PostRAMachineLICMID);
+ addPass(&MachineLICMID);
}
}
@@ -1135,18 +1154,13 @@ void TargetPassConfig::addBlockPlacement() {
//===---------------------------------------------------------------------===//
/// GlobalISel Configuration
//===---------------------------------------------------------------------===//
-
-bool TargetPassConfig::isGlobalISelEnabled() const {
- return false;
-}
-
bool TargetPassConfig::isGlobalISelAbortEnabled() const {
if (EnableGlobalISelAbort.getNumOccurrences() > 0)
return EnableGlobalISelAbort == 1;
// When no abort behaviour is specified, we don't abort if the target says
// that GISel is enabled.
- return !isGlobalISelEnabled();
+ return !TM->Options.EnableGlobalISel;
}
bool TargetPassConfig::reportDiagnosticWhenGlobalISelFallback() const {
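
The machine outliner is now driven by a three-way command-line option plus two TargetOptions bits: it is only scheduled when the target enables it and optimization is on, it is forced onto all functions with =always, and it is suppressed with =never. A condensed sketch of that decision logic, with the option and target flags passed in as plain values:

// Mirrors the RunOutliner enum introduced for -enable-machine-outliner.
enum class OutlinerMode { Always, Never, TargetDefault };

struct OutlinerDecision {
  bool AddPass;           // schedule the MachineOutliner at all?
  bool RunOnAllFunctions; // or only where the target says it is beneficial?
};

OutlinerDecision decideOutliner(OutlinerMode Opt, bool TargetEnables,
                                bool TargetSupportsDefault, bool Optimizing) {
  OutlinerDecision D{false, false};
  if (!TargetEnables || !Optimizing || Opt == OutlinerMode::Never)
    return D;                      // outliner stays off
  D.RunOnAllFunctions = (Opt == OutlinerMode::Always);
  D.AddPass = D.RunOnAllFunctions || TargetSupportsDefault;
  return D;
}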
diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index f03c3b8300f3..661dc18f7a85 100644
--- a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -19,15 +19,16 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
@@ -86,18 +87,24 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
namespace llvm {
Printable printReg(unsigned Reg, const TargetRegisterInfo *TRI,
- unsigned SubIdx) {
- return Printable([Reg, TRI, SubIdx](raw_ostream &OS) {
+ unsigned SubIdx, const MachineRegisterInfo *MRI) {
+ return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) {
if (!Reg)
- OS << "%noreg";
+ OS << "$noreg";
else if (TargetRegisterInfo::isStackSlot(Reg))
OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg);
- else if (TargetRegisterInfo::isVirtualRegister(Reg))
- OS << '%' << TargetRegisterInfo::virtReg2Index(Reg);
+ else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ StringRef Name = MRI ? MRI->getVRegName(Reg) : "";
+ if (Name != "") {
+ OS << '%' << Name;
+ } else {
+ OS << '%' << TargetRegisterInfo::virtReg2Index(Reg);
+ }
+ }
else if (!TRI)
- OS << '%' << "physreg" << Reg;
+ OS << '$' << "physreg" << Reg;
else if (Reg < TRI->getNumRegs()) {
- OS << '%';
+ OS << '$';
printLowerCase(TRI->getName(Reg), OS);
} else
llvm_unreachable("Register kind is unsupported.");
@@ -338,7 +345,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
return BestRC;
}
-/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
+/// Check if the registers defined by the pair (RegisterClass, SubReg)
/// share the same register file.
static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
const TargetRegisterClass *DefRC,
@@ -436,7 +443,8 @@ bool TargetRegisterInfo::needsStackRealignment(
if (F.hasFnAttribute("stackrealign") || requiresRealignment) {
if (canRealignStack(MF))
return true;
- DEBUG(dbgs() << "Can't realign function's stack: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Can't realign function's stack: " << F.getName()
+ << "\n");
}
return false;
}
@@ -450,6 +458,51 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
return true;
}
+unsigned TargetRegisterInfo::getRegSizeInBits(unsigned Reg,
+ const MachineRegisterInfo &MRI) const {
+ const TargetRegisterClass *RC{};
+ if (isPhysicalRegister(Reg)) {
+ // The size is not directly available for physical registers.
+ // Instead, we need to access a register class that contains Reg and
+ // get the size of that register class.
+ RC = getMinimalPhysRegClass(Reg);
+ } else {
+ LLT Ty = MRI.getType(Reg);
+ unsigned RegSize = Ty.isValid() ? Ty.getSizeInBits() : 0;
+ // If Reg is not a generic register, query the register class to
+ // get its size.
+ if (RegSize)
+ return RegSize;
+ // Since Reg is not a generic register, it must have a register class.
+ RC = MRI.getRegClass(Reg);
+ }
+ assert(RC && "Unable to deduce the register class");
+ return getRegSizeInBits(*RC);
+}
+
+unsigned
+TargetRegisterInfo::lookThruCopyLike(unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const {
+ while (true) {
+ const MachineInstr *MI = MRI->getVRegDef(SrcReg);
+ if (!MI->isCopyLike())
+ return SrcReg;
+
+ unsigned CopySrcReg;
+ if (MI->isCopy())
+ CopySrcReg = MI->getOperand(1).getReg();
+ else {
+ assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike");
+ CopySrcReg = MI->getOperand(2).getReg();
+ }
+
+ if (!isVirtualRegister(CopySrcReg))
+ return CopySrcReg;
+
+ SrcReg = CopySrcReg;
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
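
The new lookThruCopyLike helper above walks backwards through COPY and SUBREG_TO_REG definitions until it reaches a register that is not produced by a copy-like instruction, or leaves virtual registers. A standalone sketch of the same loop over a toy def map; the VirtRegFlag bit and DefInfo record are invented for the example:

#include <unordered_map>

// Hypothetical, simplified def record: which register (if any) a virtual
// register is a plain copy of.
struct DefInfo {
  bool IsCopyLike;  // COPY or SUBREG_TO_REG
  unsigned CopySrc; // source register when IsCopyLike is true
};

constexpr unsigned VirtRegFlag = 0x80000000u; // toy "virtual register" bit

// Follow copy-like definitions until hitting a non-copy def or a physical
// register, mirroring TargetRegisterInfo::lookThruCopyLike.
unsigned lookThruCopyLike(unsigned Reg,
                          const std::unordered_map<unsigned, DefInfo> &Defs) {
  while (true) {
    auto It = Defs.find(Reg);
    if (It == Defs.end() || !It->second.IsCopyLike)
      return Reg;             // defined by something real
    unsigned Src = It->second.CopySrc;
    if (!(Src & VirtRegFlag))
      return Src;             // physical registers end the chain
    Reg = Src;                // keep walking the copy chain
  }
}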
diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
index 86dbf1b2aeab..3cff31ad4933 100644
--- a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -61,12 +61,10 @@ static unsigned lcm(unsigned A, unsigned B) {
return LCM;
}
-void TargetSchedModel::init(const MCSchedModel &sm,
- const TargetSubtargetInfo *sti,
- const TargetInstrInfo *tii) {
- SchedModel = sm;
- STI = sti;
- TII = tii;
+void TargetSchedModel::init(const TargetSubtargetInfo *TSInfo) {
+ STI = TSInfo;
+ SchedModel = TSInfo->getSchedModel();
+ TII = TSInfo->getInstrInfo();
STI->initInstrItins(InstrItins);
unsigned NumRes = SchedModel.getNumProcResourceKinds();
@@ -257,31 +255,19 @@ unsigned TargetSchedModel::computeOperandLatency(
unsigned
TargetSchedModel::computeInstrLatency(const MCSchedClassDesc &SCDesc) const {
- unsigned Latency = 0;
- for (unsigned DefIdx = 0, DefEnd = SCDesc.NumWriteLatencyEntries;
- DefIdx != DefEnd; ++DefIdx) {
- // Lookup the definition's write latency in SubtargetInfo.
- const MCWriteLatencyEntry *WLEntry =
- STI->getWriteLatencyEntry(&SCDesc, DefIdx);
- Latency = std::max(Latency, capLatency(WLEntry->Cycles));
- }
- return Latency;
+ return capLatency(MCSchedModel::computeInstrLatency(*STI, SCDesc));
}
unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const {
assert(hasInstrSchedModel() && "Only call this function with a SchedModel");
-
unsigned SCIdx = TII->get(Opcode).getSchedClass();
- const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SCIdx);
-
- if (SCDesc->isValid() && !SCDesc->isVariant())
- return computeInstrLatency(*SCDesc);
+ return capLatency(SchedModel.computeInstrLatency(*STI, SCIdx));
+}
- if (SCDesc->isValid()) {
- assert (!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()");
- return computeInstrLatency(*SCDesc);
- }
- return 0;
+unsigned TargetSchedModel::computeInstrLatency(const MCInst &Inst) const {
+ if (hasInstrSchedModel())
+ return capLatency(SchedModel.computeInstrLatency(*STI, *TII, Inst));
+ return computeInstrLatency(Inst.getOpcode());
}
unsigned
@@ -336,71 +322,39 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
return 0;
}
-static Optional<double>
-getRThroughputFromItineraries(unsigned schedClass,
- const InstrItineraryData *IID){
- Optional<double> Throughput;
-
- for (const InstrStage *IS = IID->beginStage(schedClass),
- *E = IID->endStage(schedClass);
- IS != E; ++IS) {
- if (IS->getCycles()) {
- double Temp = countPopulation(IS->getUnits()) * 1.0 / IS->getCycles();
- Throughput = Throughput.hasValue()
- ? std::min(Throughput.getValue(), Temp)
- : Temp;
- }
- }
- if (Throughput.hasValue())
- // We need reciprocal throughput that's why we return such value.
- return 1 / Throughput.getValue();
- return Throughput;
-}
-
-static Optional<double>
-getRThroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc,
- const TargetSubtargetInfo *STI,
- const MCSchedModel &SchedModel) {
- Optional<double> Throughput;
-
- for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc),
- *WEnd = STI->getWriteProcResEnd(SCDesc);
- WPR != WEnd; ++WPR) {
- if (WPR->Cycles) {
- unsigned NumUnits =
- SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits;
- double Temp = NumUnits * 1.0 / WPR->Cycles;
- Throughput = Throughput.hasValue()
- ? std::min(Throughput.getValue(), Temp)
- : Temp;
- }
+double
+TargetSchedModel::computeReciprocalThroughput(const MachineInstr *MI) const {
+ if (hasInstrItineraries()) {
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ return MCSchedModel::getReciprocalThroughput(SchedClass,
+ *getInstrItineraries());
}
- if (Throughput.hasValue())
- // We need reciprocal throughput that's why we return such value.
- return 1 / Throughput.getValue();
- return Throughput;
-}
-Optional<double>
-TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const {
- if (hasInstrItineraries())
- return getRThroughputFromItineraries(MI->getDesc().getSchedClass(),
- getInstrItineraries());
if (hasInstrSchedModel())
- return getRThroughputFromInstrSchedModel(resolveSchedClass(MI), STI,
- SchedModel);
- return Optional<double>();
+ return MCSchedModel::getReciprocalThroughput(*STI, *resolveSchedClass(MI));
+
+ return 0.0;
}
-Optional<double>
-TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const {
+double
+TargetSchedModel::computeReciprocalThroughput(unsigned Opcode) const {
unsigned SchedClass = TII->get(Opcode).getSchedClass();
if (hasInstrItineraries())
- return getRThroughputFromItineraries(SchedClass, getInstrItineraries());
+ return MCSchedModel::getReciprocalThroughput(SchedClass,
+ *getInstrItineraries());
if (hasInstrSchedModel()) {
- const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
- if (SCDesc->isValid() && !SCDesc->isVariant())
- return getRThroughputFromInstrSchedModel(SCDesc, STI, SchedModel);
+ const MCSchedClassDesc &SCDesc = *SchedModel.getSchedClassDesc(SchedClass);
+ if (SCDesc.isValid() && !SCDesc.isVariant())
+ return MCSchedModel::getReciprocalThroughput(*STI, SCDesc);
}
- return Optional<double>();
+
+ return 0.0;
}
+
+double
+TargetSchedModel::computeReciprocalThroughput(const MCInst &MI) const {
+ if (hasInstrSchedModel())
+ return SchedModel.getReciprocalThroughput(*STI, *TII, MI);
+ return computeReciprocalThroughput(MI.getOpcode());
+}
+
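
The per-target throughput helpers deleted above are replaced by a shared computation. Conceptually, each processor resource an instruction uses contributes Cycles / NumUnits, and the bottleneck (maximum) of those ratios is the reciprocal throughput, which is the same as 1 over the minimum of NumUnits / Cycles that the old code returned. A small sketch of that computation over a list of resource uses:

#include <algorithm>
#include <vector>

// One processor-resource use of an instruction's scheduling class.
struct ResourceUse {
  unsigned Cycles;   // cycles the resource is kept busy
  unsigned NumUnits; // identical units the resource provides
};

// Reciprocal throughput: the bottleneck resource decides, so take the
// maximum of Cycles / NumUnits over all used resources. Returns 0.0 when
// no resource contributes, matching the patched API's fallback.
double reciprocalThroughput(const std::vector<ResourceUse> &Uses) {
  double RThroughput = 0.0;
  for (const ResourceUse &U : Uses) {
    if (U.Cycles == 0 || U.NumUnits == 0)
      continue;
    RThroughput = std::max(RThroughput,
                           double(U.Cycles) / double(U.NumUnits));
  }
  return RThroughput;
}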
diff --git a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
index 8693f344f9be..fa29c05fd6c2 100644
--- a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -67,18 +67,15 @@ bool TargetSubtargetInfo::useAA() const {
return false;
}
-static std::string createSchedInfoStr(unsigned Latency,
- Optional<double> RThroughput) {
+static std::string createSchedInfoStr(unsigned Latency, double RThroughput) {
static const char *SchedPrefix = " sched: [";
std::string Comment;
raw_string_ostream CS(Comment);
- if (Latency > 0 && RThroughput.hasValue())
- CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue())
+ if (RThroughput != 0.0)
+ CS << SchedPrefix << Latency << format(":%2.2f", RThroughput)
<< "]";
- else if (Latency > 0)
+ else
CS << SchedPrefix << Latency << ":?]";
- else if (RThroughput.hasValue())
- CS << SchedPrefix << "?:" << RThroughput.getValue() << "]";
CS.flush();
return Comment;
}
@@ -90,9 +87,9 @@ std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const {
// We don't cache TSchedModel because it depends on TargetInstrInfo
// that could be changed during the compilation
TargetSchedModel TSchedModel;
- TSchedModel.init(getSchedModel(), this, getInstrInfo());
+ TSchedModel.init(this);
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
- Optional<double> RThroughput = TSchedModel.computeInstrRThroughput(&MI);
+ double RThroughput = TSchedModel.computeReciprocalThroughput(&MI);
return createSchedInfoStr(Latency, RThroughput);
}
@@ -101,17 +98,19 @@ std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const {
// We don't cache TSchedModel because it depends on TargetInstrInfo
// that could be changed during the compilation
TargetSchedModel TSchedModel;
- TSchedModel.init(getSchedModel(), this, getInstrInfo());
+ TSchedModel.init(this);
unsigned Latency;
if (TSchedModel.hasInstrSchedModel())
- Latency = TSchedModel.computeInstrLatency(MCI.getOpcode());
+ Latency = TSchedModel.computeInstrLatency(MCI);
else if (TSchedModel.hasInstrItineraries()) {
auto *ItinData = TSchedModel.getInstrItineraries();
Latency = ItinData->getStageLatency(
getInstrInfo()->get(MCI.getOpcode()).getSchedClass());
} else
return std::string();
- Optional<double> RThroughput =
- TSchedModel.computeInstrRThroughput(MCI.getOpcode());
+ double RThroughput = TSchedModel.computeReciprocalThroughput(MCI);
return createSchedInfoStr(Latency, RThroughput);
}
+
+void TargetSubtargetInfo::mirFileLoaded(MachineFunction &MF) const {
+}
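
With Optional<double> gone, createSchedInfoStr takes a plain double: a non-zero reciprocal throughput prints "sched: [latency:rthroughput]", zero falls back to "latency:?". A self-contained version of the same formatting using std::snprintf instead of raw_string_ostream:

#include <cstdio>
#include <string>

// Format the " sched: [...]" comment appended to disassembly, mirroring the
// simplified signature that takes the reciprocal throughput as a double.
std::string schedInfoStr(unsigned Latency, double RThroughput) {
  char Buf[64];
  if (RThroughput != 0.0)
    std::snprintf(Buf, sizeof(Buf), " sched: [%u:%2.2f]", Latency, RThroughput);
  else
    std::snprintf(Buf, sizeof(Buf), " sched: [%u:?]", Latency);
  return Buf;
}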
diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 774b76f84b7f..0ca435016ead 100644
--- a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -290,8 +290,8 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
unsigned NumVisited = 0;
for (MachineInstr &OtherMI : make_range(std::next(OldPos), KillPos)) {
- // DBG_VALUE cannot be counted against the limit.
- if (OtherMI.isDebugValue())
+ // Debug instructions cannot be counted against the limit.
+ if (OtherMI.isDebugInstr())
continue;
if (NumVisited > 30) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
@@ -685,15 +685,15 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
unsigned RegCIdx,
unsigned Dist) {
unsigned RegC = MI->getOperand(RegCIdx).getReg();
- DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
+ LLVM_DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx);
if (NewMI == nullptr) {
- DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
+ LLVM_DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
return false;
}
- DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);
+ LLVM_DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);
assert(NewMI == MI &&
"TargetInstrInfo::commuteInstruction() should not return a new "
"instruction unless it was requested.");
@@ -740,8 +740,8 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
if (!NewMI)
return false;
- DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+ LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+ LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
bool Sunk = false;
if (LIS)
@@ -940,8 +940,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator KillPos = KillMI;
++KillPos;
for (MachineInstr &OtherMI : make_range(End, KillPos)) {
- // DBG_VALUE cannot be counted against the limit.
- if (OtherMI.isDebugValue())
+ // Debug instructions cannot be counted against the limit.
+ if (OtherMI.isDebugInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
@@ -985,7 +985,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
}
// Move debug info as well.
- while (Begin != MBB->begin() && std::prev(Begin)->isDebugValue())
+ while (Begin != MBB->begin() && std::prev(Begin)->isDebugInstr())
--Begin;
nmi = End;
@@ -1014,7 +1014,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
LV->addVirtualRegisterKilled(Reg, *MI);
}
- DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI);
+ LLVM_DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI);
return true;
}
@@ -1114,8 +1114,8 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
unsigned NumVisited = 0;
for (MachineInstr &OtherMI :
make_range(mi, MachineBasicBlock::iterator(KillMI))) {
- // DBG_VALUE cannot be counted against the limit.
- if (OtherMI.isDebugValue())
+ // Debug instructions cannot be counted against the limit.
+ if (OtherMI.isDebugInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
@@ -1162,11 +1162,11 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
// Move the old kill above MI, don't forget to move debug info as well.
MachineBasicBlock::iterator InsertPos = mi;
- while (InsertPos != MBB->begin() && std::prev(InsertPos)->isDebugValue())
+ while (InsertPos != MBB->begin() && std::prev(InsertPos)->isDebugInstr())
--InsertPos;
MachineBasicBlock::iterator From = KillMI;
MachineBasicBlock::iterator To = std::next(From);
- while (std::prev(From)->isDebugValue())
+ while (std::prev(From)->isDebugInstr())
--From;
MBB->splice(InsertPos, MBB, From, To);
@@ -1181,7 +1181,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
LV->addVirtualRegisterKilled(Reg, *MI);
}
- DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
+ LLVM_DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
return true;
}
@@ -1205,6 +1205,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
if (!MI->isCommutable())
return false;
+ bool MadeChange = false;
unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg();
unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg();
unsigned OpsNum = MI->getDesc().getNumOperands();
@@ -1223,8 +1224,8 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
// If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp
// operands. This makes the live ranges of DstOp and OtherOp joinable.
- bool DoCommute =
- !BaseOpKilled && isKilled(*MI, OtherOpReg, MRI, TII, LIS, false);
+ bool OtherOpKilled = isKilled(*MI, OtherOpReg, MRI, TII, LIS, false);
+ bool DoCommute = !BaseOpKilled && OtherOpKilled;
if (!DoCommute &&
isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) {
@@ -1235,13 +1236,21 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
// If it's profitable to commute, try to do so.
if (DoCommute && commuteInstruction(MI, DstOpIdx, BaseOpIdx, OtherOpIdx,
Dist)) {
+ MadeChange = true;
++NumCommuted;
- if (AggressiveCommute)
+ if (AggressiveCommute) {
++NumAggrCommuted;
- return true;
+ // There might be more than two commutable operands, update BaseOp and
+ // continue scanning.
+ BaseOpReg = OtherOpReg;
+ BaseOpKilled = OtherOpKilled;
+ continue;
+ }
+ // If this was a commute based on kill, we won't do better continuing.
+ return MadeChange;
}
}
- return false;
+ return MadeChange;
}
/// For the case where an instruction has a single pair of tied register
@@ -1343,7 +1352,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
const MCInstrDesc &UnfoldMCID = TII->get(NewOpc);
if (UnfoldMCID.getNumDefs() == 1) {
// Unfold the load.
- DEBUG(dbgs() << "2addr: UNFOLDING: " << MI);
+ LLVM_DEBUG(dbgs() << "2addr: UNFOLDING: " << MI);
const TargetRegisterClass *RC =
TRI->getAllocatableClass(
TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF));
@@ -1352,7 +1361,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
if (!TII->unfoldMemoryOperand(*MF, MI, Reg,
/*UnfoldLoad=*/true,
/*UnfoldStore=*/false, NewMIs)) {
- DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+ LLVM_DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
return false;
}
assert(NewMIs.size() == 2 &&
@@ -1365,8 +1374,8 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
MBB->insert(mi, NewMIs[0]);
MBB->insert(mi, NewMIs[1]);
- DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0]
- << "2addr: NEW INST: " << *NewMIs[1]);
+ LLVM_DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0]
+ << "2addr: NEW INST: " << *NewMIs[1]);
// Transform the instruction, now that it no longer has a load.
unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA);
@@ -1431,7 +1440,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
// Transforming didn't eliminate the tie and didn't lead to an
// improvement. Clean up the unfolded instructions and keep the
// original.
- DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+ LLVM_DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
NewMIs[0]->eraseFromParent();
NewMIs[1]->eraseFromParent();
}
@@ -1475,7 +1484,7 @@ collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
MRI->constrainRegClass(DstReg, RC);
SrcMO.setReg(DstReg);
SrcMO.setSubReg(0);
- DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI);
+ LLVM_DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI);
continue;
}
TiedOperands[SrcReg].push_back(std::make_pair(SrcIdx, DstIdx));
@@ -1574,7 +1583,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
}
}
- DEBUG(dbgs() << "\t\tprepend:\t" << *MIB);
+ LLVM_DEBUG(dbgs() << "\t\tprepend:\t" << *MIB);
MachineOperand &MO = MI->getOperand(SrcIdx);
assert(MO.isReg() && MO.getReg() == RegB && MO.isUse() &&
@@ -1668,9 +1677,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
bool MadeChange = false;
- DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
- DEBUG(dbgs() << "********** Function: "
- << MF->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << MF->getName() << '\n');
// This pass takes the function out of SSA form.
MRI->leaveSSA();
@@ -1690,7 +1698,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
MachineBasicBlock::iterator nmi = std::next(mi);
// Don't revisit an instruction previously converted by target. It may
// contain undef register operands (%noreg), which are not handled.
- if (mi->isDebugValue() || SunkInstrs.count(&*mi)) {
+ if (mi->isDebugInstr() || SunkInstrs.count(&*mi)) {
mi = nmi;
continue;
}
@@ -1713,7 +1721,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
++NumTwoAddressInstrs;
MadeChange = true;
- DEBUG(dbgs() << '\t' << *mi);
+ LLVM_DEBUG(dbgs() << '\t' << *mi);
// If the instruction has a single pair of tied operands, try some
// transformations that may either eliminate the tied operands or
@@ -1740,7 +1748,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
// Now iterate over the information collected above.
for (auto &TO : TiedOperands) {
processTiedPairs(&*mi, TO.second, Dist);
- DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
+ LLVM_DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
}
// Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
@@ -1754,7 +1762,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
mi->RemoveOperand(1);
mi->setDesc(TII->get(TargetOpcode::COPY));
- DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
+ LLVM_DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
}
// Clear TiedOperands here instead of at the top of the loop
@@ -1787,7 +1795,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
if (MI.getOperand(0).getSubReg() ||
TargetRegisterInfo::isPhysicalRegister(DstReg) ||
!(MI.getNumOperands() & 1)) {
- DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << MI);
+ LLVM_DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << MI);
llvm_unreachable(nullptr);
}
@@ -1838,19 +1846,19 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
if (LV && isKill && !TargetRegisterInfo::isPhysicalRegister(SrcReg))
LV->replaceKillInstruction(SrcReg, MI, *CopyMI);
- DEBUG(dbgs() << "Inserted: " << *CopyMI);
+ LLVM_DEBUG(dbgs() << "Inserted: " << *CopyMI);
}
MachineBasicBlock::iterator EndMBBI =
std::next(MachineBasicBlock::iterator(MI));
if (!DefEmitted) {
- DEBUG(dbgs() << "Turned: " << MI << " into an IMPLICIT_DEF");
+ LLVM_DEBUG(dbgs() << "Turned: " << MI << " into an IMPLICIT_DEF");
MI.setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
for (int j = MI.getNumOperands() - 1, ee = 0; j > ee; --j)
MI.RemoveOperand(j);
} else {
- DEBUG(dbgs() << "Eliminated: " << MI);
+ LLVM_DEBUG(dbgs() << "Eliminated: " << MI);
MI.eraseFromParent();
}
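The recurring change in this file swaps isDebugValue() for isDebugInstr() so that DBG_LABEL pseudo-instructions are skipped the same way DBG_VALUEs are; at the time of this import the new predicate roughly covers both opcodes. A minimal standalone sketch of the distinction, using stand-in opcodes rather than the real MachineInstr/TargetOpcode:

#include <cstdio>

enum Opcode { ADD, DBG_VALUE, DBG_LABEL };   // stand-ins, not TargetOpcode

struct InstrSketch {
  Opcode Op;
  bool isDebugValue() const { return Op == DBG_VALUE; }
  bool isDebugLabel() const { return Op == DBG_LABEL; }
  // The broader predicate: any debug pseudo-instruction.
  bool isDebugInstr() const { return isDebugValue() || isDebugLabel(); }
};

int main() {
  InstrSketch Label{DBG_LABEL};
  // A DBG_LABEL slips past isDebugValue() but is caught by isDebugInstr(),
  // so loops that skip debug instructions no longer count it against limits.
  std::printf("isDebugValue=%d isDebugInstr=%d\n",
              Label.isDebugValue(), Label.isDebugInstr());
  return 0;
}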
diff --git a/contrib/llvm/lib/IR/ValueTypes.cpp b/contrib/llvm/lib/CodeGen/ValueTypes.cpp
index 22f9fe7a66d7..adb7075de651 100644
--- a/contrib/llvm/lib/IR/ValueTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/ValueTypes.cpp
@@ -6,10 +6,6 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file implements methods in the CodeGen/ValueTypes.h header.
-//
-//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/ADT/StringExtras.h"
@@ -196,6 +192,7 @@ std::string EVT::getEVTString() const {
case MVT::v8f64: return "v8f64";
case MVT::Metadata:return "Metadata";
case MVT::Untyped: return "Untyped";
+ case MVT::ExceptRef: return "ExceptRef";
}
}
@@ -272,8 +269,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v16f32: return VectorType::get(Type::getFloatTy(Context), 16);
case MVT::v1f64: return VectorType::get(Type::getDoubleTy(Context), 1);
case MVT::v2f64: return VectorType::get(Type::getDoubleTy(Context), 2);
- case MVT::v4f64: return VectorType::get(Type::getDoubleTy(Context), 4);
- case MVT::v8f64: return VectorType::get(Type::getDoubleTy(Context), 8);
+ case MVT::v4f64: return VectorType::get(Type::getDoubleTy(Context), 4);
+ case MVT::v8f64: return VectorType::get(Type::getDoubleTy(Context), 8);
case MVT::Metadata: return Type::getMetadataTy(Context);
}
}
diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
index 13f7e83f3dd0..0ead2b8340ab 100644
--- a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
@@ -241,10 +242,9 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
Indexes = &getAnalysis<SlotIndexes>();
LIS = &getAnalysis<LiveIntervals>();
VRM = &getAnalysis<VirtRegMap>();
- DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
- << "********** Function: "
- << MF->getName() << '\n');
- DEBUG(VRM->dump());
+ LLVM_DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
+ << "********** Function: " << MF->getName() << '\n');
+ LLVM_DEBUG(VRM->dump());
// Add kill flags while we still have virtual registers.
LIS->addKillFlags(VRM);
@@ -376,7 +376,7 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
if (!MI.isIdentityCopy())
return;
- DEBUG(dbgs() << "Identity copy: " << MI);
+ LLVM_DEBUG(dbgs() << "Identity copy: " << MI);
++NumIdCopies;
// Copies like:
@@ -387,14 +387,14 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
// instruction to maintain this information.
if (MI.getOperand(0).isUndef() || MI.getNumOperands() > 2) {
MI.setDesc(TII->get(TargetOpcode::KILL));
- DEBUG(dbgs() << " replace by: " << MI);
+ LLVM_DEBUG(dbgs() << " replace by: " << MI);
return;
}
if (Indexes)
Indexes->removeSingleMachineInstrFromMaps(MI);
MI.eraseFromBundle();
- DEBUG(dbgs() << " deleted.\n");
+ LLVM_DEBUG(dbgs() << " deleted.\n");
}
/// The liverange splitting logic sometimes produces bundles of copies when
@@ -406,6 +406,8 @@ void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
return;
if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) {
+ SmallVector<MachineInstr *, 2> MIs({&MI});
+
// Only do this when the complete bundle is made out of COPYs.
MachineBasicBlock &MBB = *MI.getParent();
for (MachineBasicBlock::reverse_instr_iterator I =
@@ -413,16 +415,53 @@ void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
I != E && I->isBundledWithSucc(); ++I) {
if (!I->isCopy())
return;
+ MIs.push_back(&*I);
+ }
+ MachineInstr *FirstMI = MIs.back();
+
+ auto anyRegsAlias = [](const MachineInstr *Dst,
+ ArrayRef<MachineInstr *> Srcs,
+ const TargetRegisterInfo *TRI) {
+ for (const MachineInstr *Src : Srcs)
+ if (Src != Dst)
+ if (TRI->regsOverlap(Dst->getOperand(0).getReg(),
+ Src->getOperand(1).getReg()))
+ return true;
+ return false;
+ };
+
+ // If any of the destination registers in the bundle of copies alias any of
+ // the source registers, try to schedule the instructions to avoid any
+ // clobbering.
+ for (int E = MIs.size(), PrevE = E; E > 1; PrevE = E) {
+ for (int I = E; I--; )
+ if (!anyRegsAlias(MIs[I], makeArrayRef(MIs).take_front(E), TRI)) {
+ if (I + 1 != E)
+ std::swap(MIs[I], MIs[E - 1]);
+ --E;
+ }
+ if (PrevE == E) {
+ MF->getFunction().getContext().emitError(
+ "register rewriting failed: cycle in copy bundle");
+ break;
+ }
}
- for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator();
- I->isBundledWithPred(); ) {
- MachineInstr &MI = *I;
- ++I;
+ MachineInstr *BundleStart = FirstMI;
+ for (MachineInstr *BundledMI : llvm::reverse(MIs)) {
+ // If instruction is in the middle of the bundle, move it before the
+ // bundle starts, otherwise, just unbundle it. When we get to the last
+ // instruction, the bundle will have been completely undone.
+ if (BundledMI != BundleStart) {
+ BundledMI->removeFromBundle();
+ MBB.insert(FirstMI, BundledMI);
+ } else if (BundledMI->isBundledWithSucc()) {
+ BundledMI->unbundleFromSucc();
+ BundleStart = &*std::next(BundledMI->getIterator());
+ }
- MI.unbundleFromPred();
- if (Indexes)
- Indexes->insertMachineInstrInMaps(MI);
+ if (Indexes && BundledMI != FirstMI)
+ Indexes->insertMachineInstrInMaps(*BundledMI);
}
}
}
@@ -461,7 +500,7 @@ void VirtRegRewriter::rewrite() {
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
- DEBUG(MBBI->print(dbgs(), Indexes));
+ LLVM_DEBUG(MBBI->print(dbgs(), Indexes));
for (MachineBasicBlock::instr_iterator
MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
MachineInstr *MI = &*MII;
@@ -530,7 +569,7 @@ void VirtRegRewriter::rewrite() {
// Rewrite. Note we could have used MachineOperand::substPhysReg(), but
// we need the inlining here.
MO.setReg(PhysReg);
- MO.setIsRenamableIfNoExtraRegAllocReq();
+ MO.setIsRenamable(true);
}
// Add any missing super-register kills after rewriting the whole
@@ -544,7 +583,7 @@ void VirtRegRewriter::rewrite() {
while (!SuperDefs.empty())
MI->addRegisterDefined(SuperDefs.pop_back_val(), TRI);
- DEBUG(dbgs() << "> " << *MI);
+ LLVM_DEBUG(dbgs() << "> " << *MI);
expandCopyBundle(*MI);
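The new expandCopyBundle() logic above orders the copies of a bundle so that no copy's destination overwrites a source another copy still needs, and emits the "cycle in copy bundle" error when that is impossible. A rough standalone sketch of the same idea, with plain ints as registers and exact equality standing in for TargetRegisterInfo::regsOverlap():

#include <cstdio>
#include <utility>
#include <vector>

using Copy = std::pair<int, int>;   // {DstReg, SrcReg}

// Emit copies one at a time, always picking a pending copy whose destination
// is not read by any other pending copy; such a copy can run first without
// clobbering anyone's input. If no copy qualifies, the remainder is a cycle.
static bool sequenceCopies(std::vector<Copy> Pending, std::vector<Copy> &Out) {
  while (!Pending.empty()) {
    size_t Pick = Pending.size();
    for (size_t I = 0; I != Pending.size() && Pick == Pending.size(); ++I) {
      bool DstStillNeeded = false;
      for (size_t J = 0; J != Pending.size(); ++J)
        if (J != I && Pending[I].first == Pending[J].second)
          DstStillNeeded = true;
      if (!DstStillNeeded)
        Pick = I;
    }
    if (Pick == Pending.size())
      return false;                 // every destination is still a source
    Out.push_back(Pending[Pick]);
    Pending.erase(Pending.begin() + Pick);
  }
  return true;
}

int main() {
  // "r2 = r3; r1 = r2" as a bundle: r1 = r2 must be emitted first, otherwise
  // r2 is overwritten before its old value reaches r1.
  std::vector<Copy> Bundle = {{2, 3}, {1, 2}}, Order;
  if (!sequenceCopies(Bundle, Order))
    std::printf("cycle in copy bundle\n");
  for (const Copy &C : Order)
    std::printf("r%d = r%d\n", C.first, C.second);
  return 0;
}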
diff --git a/contrib/llvm/lib/CodeGen/WasmEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WasmEHPrepare.cpp
new file mode 100644
index 000000000000..83d04da5dd0c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/WasmEHPrepare.cpp
@@ -0,0 +1,374 @@
+//===-- WasmEHPrepare - Prepare exception handling for WebAssembly -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which use the
+// WebAssembly exception handling scheme.
+//
+// WebAssembly exception handling uses Windows exception IR for the middle level
+// representation. This pass does the following transformation for every
+// catchpad block:
+// (In C-style pseudocode)
+//
+// - Before:
+// catchpad ...
+// exn = wasm.get.exception();
+// selector = wasm.get.selector();
+// ...
+//
+// - After:
+// catchpad ...
+// exn = wasm.catch(0); // 0 is a tag for C++
+// wasm.landingpad.index(index);
+// // Only add below in case it's not a single catch (...)
+// __wasm_lpad_context.lpad_index = index;
+// __wasm_lpad_context.lsda = wasm.lsda();
+// _Unwind_CallPersonality(exn);
+//   int selector = __wasm_lpad_context.selector;
+// ...
+//
+// Also, does the following for a cleanuppad block with a call to
+// __clang_call_terminate():
+// - Before:
+// cleanuppad ...
+// exn = wasm.get.exception();
+// __clang_call_terminate(exn);
+//
+// - After:
+// cleanuppad ...
+// exn = wasm.catch(0); // 0 is a tag for C++
+// __clang_call_terminate(exn);
+//
+//
+// * Background: WebAssembly EH instructions
+// WebAssembly's try and catch instructions are structured as follows:
+// try
+// instruction*
+// catch (C++ tag)
+// instruction*
+// ...
+// catch_all
+// instruction*
+// try_end
+//
+// A catch instruction in WebAssembly does not correspond to a C++ catch clause.
+// In WebAssembly, there is a single catch instruction for all C++ exceptions.
+// There can be more catch instructions for exceptions in other languages, but
+// they are not generated for now. catch_all catches all exceptions including
+// foreign exceptions (e.g. JavaScript). We turn catchpads into catch (C++ tag)
+// and cleanuppads into catch_all, with one exception: cleanuppad with a call to
+// __clang_call_terminate should be both in catch (C++ tag) and catch_all.
+//
+//
+// * Background: Direct personality function call
+// In WebAssembly EH, the VM is responsible for unwinding the stack once an
+// exception is thrown. After the stack is unwound, the control flow is
+// transferred to the WebAssembly 'catch' instruction, which returns a caught
+// exception object.
+//
+// Unwinding the stack is done by the VM, not by libunwind, so the personality
+// function in libcxxabi cannot be called from libunwind during the unwinding
+// process. So after a catch instruction, we insert a call to a wrapper function
+// in libunwind that in turn calls the real personality function.
+//
+// In Itanium EH, if the personality function decides there is no matching catch
+// clause in a call frame and no cleanup action to perform, the unwinder doesn't
+// stop there and continues unwinding. But in Wasm EH, the unwinder stops at
+// every call frame with a catch instruction, after which the personality
+// function is called from the compiler-generated user code here.
+//
+// In libunwind, we have this struct that serves as a communication channel
+// between the compiler-generated user code and the personality function in
+// libcxxabi.
+//
+// struct _Unwind_LandingPadContext {
+// uintptr_t lpad_index;
+// uintptr_t lsda;
+// uintptr_t selector;
+// };
+// struct _Unwind_LandingPadContext __wasm_lpad_context = ...;
+//
+// And this wrapper in libunwind calls the personality function.
+//
+// _Unwind_Reason_Code _Unwind_CallPersonality(void *exception_ptr) {
+// struct _Unwind_Exception *exception_obj =
+// (struct _Unwind_Exception *)exception_ptr;
+// _Unwind_Reason_Code ret = __gxx_personality_v0(
+// 1, _UA_CLEANUP_PHASE, exception_obj->exception_class, exception_obj,
+// (struct _Unwind_Context *)__wasm_lpad_context);
+// return ret;
+// }
+//
+// We pass a landing pad index, and the address of LSDA for the current function
+// to the wrapper function _Unwind_CallPersonality in libunwind, and we retrieve
+// the selector after it returns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasmehprepare"
+
+namespace {
+class WasmEHPrepare : public FunctionPass {
+ Type *LPadContextTy = nullptr; // type of 'struct _Unwind_LandingPadContext'
+ GlobalVariable *LPadContextGV = nullptr; // __wasm_lpad_context
+
+ // Field addresses of struct _Unwind_LandingPadContext
+ Value *LPadIndexField = nullptr; // lpad_index field
+ Value *LSDAField = nullptr; // lsda field
+ Value *SelectorField = nullptr; // selector
+
+ Function *CatchF = nullptr; // wasm.catch.extract() intrinsic
+ Function *LPadIndexF = nullptr; // wasm.landingpad.index() intrinsic
+ Function *LSDAF = nullptr; // wasm.lsda() intrinsic
+ Function *GetExnF = nullptr; // wasm.get.exception() intrinsic
+ Function *GetSelectorF = nullptr; // wasm.get.ehselector() intrinsic
+ Function *CallPersonalityF = nullptr; // _Unwind_CallPersonality() wrapper
+ Function *ClangCallTermF = nullptr; // __clang_call_terminate() function
+
+ void prepareEHPad(BasicBlock *BB, unsigned Index);
+ void prepareTerminateCleanupPad(BasicBlock *BB);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ WasmEHPrepare() : FunctionPass(ID) {}
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "WebAssembly Exception handling preparation";
+ }
+};
+} // end anonymous namespace
+
+char WasmEHPrepare::ID = 0;
+INITIALIZE_PASS(WasmEHPrepare, DEBUG_TYPE, "Prepare WebAssembly exceptions",
+ false, false)
+
+FunctionPass *llvm::createWasmEHPass() { return new WasmEHPrepare(); }
+
+bool WasmEHPrepare::doInitialization(Module &M) {
+ IRBuilder<> IRB(M.getContext());
+ LPadContextTy = StructType::get(IRB.getInt32Ty(), // lpad_index
+ IRB.getInt8PtrTy(), // lsda
+ IRB.getInt32Ty() // selector
+ );
+ return false;
+}
+
+bool WasmEHPrepare::runOnFunction(Function &F) {
+ SmallVector<BasicBlock *, 16> CatchPads;
+ SmallVector<BasicBlock *, 16> CleanupPads;
+ for (BasicBlock &BB : F) {
+ if (!BB.isEHPad())
+ continue;
+ auto *Pad = BB.getFirstNonPHI();
+ if (isa<CatchPadInst>(Pad))
+ CatchPads.push_back(&BB);
+ else if (isa<CleanupPadInst>(Pad))
+ CleanupPads.push_back(&BB);
+ }
+
+ if (CatchPads.empty() && CleanupPads.empty())
+ return false;
+ assert(F.hasPersonalityFn() && "Personality function not found");
+
+ Module &M = *F.getParent();
+ IRBuilder<> IRB(F.getContext());
+
+ // __wasm_lpad_context global variable
+ LPadContextGV = cast<GlobalVariable>(
+ M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy));
+ LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0,
+ "lpad_index_gep");
+ LSDAField =
+ IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 1, "lsda_gep");
+ SelectorField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 2,
+ "selector_gep");
+
+  // wasm.catch() intrinsic, which will be lowered to wasm 'catch' instruction.
+ CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch);
+ // wasm.landingpad.index() intrinsic, which is to specify landingpad index
+ LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index);
+ // wasm.lsda() intrinsic. Returns the address of LSDA table for the current
+ // function.
+ LSDAF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_lsda);
+ // wasm.get.exception() and wasm.get.ehselector() intrinsics. Calls to these
+ // are generated in clang.
+ GetExnF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_exception);
+ GetSelectorF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_ehselector);
+
+ // _Unwind_CallPersonality() wrapper function, which calls the personality
+ CallPersonalityF = cast<Function>(M.getOrInsertFunction(
+ "_Unwind_CallPersonality", IRB.getInt32Ty(), IRB.getInt8PtrTy()));
+ CallPersonalityF->setDoesNotThrow();
+
+ // __clang_call_terminate() function, which is inserted by clang in case a
+ // cleanup throws
+ ClangCallTermF = M.getFunction("__clang_call_terminate");
+
+ unsigned Index = 0;
+ for (auto *BB : CatchPads) {
+ auto *CPI = cast<CatchPadInst>(BB->getFirstNonPHI());
+ // In case of a single catch (...), we don't need to emit LSDA
+ if (CPI->getNumArgOperands() == 1 &&
+ cast<Constant>(CPI->getArgOperand(0))->isNullValue())
+ prepareEHPad(BB, -1);
+ else
+ prepareEHPad(BB, Index++);
+ }
+
+ if (!ClangCallTermF)
+ return !CatchPads.empty();
+
+ // Cleanuppads will turn into catch_all later, but cleanuppads with a call to
+ // __clang_call_terminate() is a special case. __clang_call_terminate() takes
+ // an exception object, so we have to duplicate call in both 'catch <C++ tag>'
+ // and 'catch_all' clauses. Here we only insert a call to catch; the
+ // duplication will be done later. In catch_all, the exception object will be
+ // set to null.
+ for (auto *BB : CleanupPads)
+ for (auto &I : *BB)
+ if (auto *CI = dyn_cast<CallInst>(&I))
+ if (CI->getCalledValue() == ClangCallTermF)
+ prepareEHPad(BB, -1);
+
+ return true;
+}
+
+void WasmEHPrepare::prepareEHPad(BasicBlock *BB, unsigned Index) {
+ assert(BB->isEHPad() && "BB is not an EHPad!");
+ IRBuilder<> IRB(BB->getContext());
+
+ IRB.SetInsertPoint(&*BB->getFirstInsertionPt());
+ // The argument to wasm.catch() is the tag for C++ exceptions, which we set to
+ // 0 for this module.
+ // Pseudocode: void *exn = wasm.catch(0);
+ Instruction *Exn = IRB.CreateCall(CatchF, IRB.getInt32(0), "exn");
+ // Replace the return value of wasm.get.exception() with the return value from
+ // wasm.catch().
+ auto *FPI = cast<FuncletPadInst>(BB->getFirstNonPHI());
+ Instruction *GetExnCI = nullptr, *GetSelectorCI = nullptr;
+ for (auto &U : FPI->uses()) {
+ if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
+ if (CI->getCalledValue() == GetExnF)
+ GetExnCI = CI;
+ else if (CI->getCalledValue() == GetSelectorF)
+ GetSelectorCI = CI;
+ }
+ }
+
+ assert(GetExnCI && "wasm.get.exception() call does not exist");
+ GetExnCI->replaceAllUsesWith(Exn);
+ GetExnCI->eraseFromParent();
+
+ // In case it is a catchpad with single catch (...) or a cleanuppad, we don't
+ // need to call personality function because we don't need a selector.
+ if (FPI->getNumArgOperands() == 0 ||
+ (FPI->getNumArgOperands() == 1 &&
+ cast<Constant>(FPI->getArgOperand(0))->isNullValue())) {
+ if (GetSelectorCI) {
+ assert(GetSelectorCI->use_empty() &&
+ "wasm.get.ehselector() still has uses!");
+ GetSelectorCI->eraseFromParent();
+ }
+ return;
+ }
+ IRB.SetInsertPoint(Exn->getNextNode());
+
+ // This is to create a map of <landingpad EH label, landingpad index> in
+ // SelectionDAGISel, which is to be used in EHStreamer to emit LSDA tables.
+ // Pseudocode: wasm.landingpad.index(Index);
+ IRB.CreateCall(LPadIndexF, IRB.getInt32(Index));
+
+ // Pseudocode: __wasm_lpad_context.lpad_index = index;
+ IRB.CreateStore(IRB.getInt32(Index), LPadIndexField);
+
+ // Store LSDA address only if this catchpad belongs to a top-level
+ // catchswitch. If there is another catchpad that dominates this pad, we don't
+ // need to store LSDA address again, because they are the same throughout the
+ // function and have been already stored before.
+ // TODO Can we not store LSDA address in user function but make libcxxabi
+ // compute it?
+ auto *CPI = cast<CatchPadInst>(FPI);
+ if (isa<ConstantTokenNone>(CPI->getCatchSwitch()->getParentPad()))
+ // Pseudocode: __wasm_lpad_context.lsda = wasm.lsda();
+ IRB.CreateStore(IRB.CreateCall(LSDAF), LSDAField);
+
+ // Pseudocode: _Unwind_CallPersonality(exn);
+ CallInst *PersCI =
+ IRB.CreateCall(CallPersonalityF, Exn, OperandBundleDef("funclet", CPI));
+ PersCI->setDoesNotThrow();
+
+  // Pseudocode: int selector = __wasm_lpad_context.selector;
+ Instruction *Selector = IRB.CreateLoad(SelectorField, "selector");
+
+ // Replace the return value from wasm.get.ehselector() with the selector value
+ // loaded from __wasm_lpad_context.selector.
+ assert(GetSelectorCI && "wasm.get.ehselector() call does not exist");
+ GetSelectorCI->replaceAllUsesWith(Selector);
+ GetSelectorCI->eraseFromParent();
+}
+
+void llvm::calculateWasmEHInfo(const Function *F, WasmEHFuncInfo &EHInfo) {
+ for (const auto &BB : *F) {
+ if (!BB.isEHPad())
+ continue;
+ const Instruction *Pad = BB.getFirstNonPHI();
+
+ // If an exception is not caught by a catchpad (i.e., it is a foreign
+ // exception), it will unwind to its parent catchswitch's unwind
+ // destination. We don't record an unwind destination for cleanuppads
+ // because every exception should be caught by it.
+ if (const auto *CatchPad = dyn_cast<CatchPadInst>(Pad)) {
+ const auto *UnwindBB = CatchPad->getCatchSwitch()->getUnwindDest();
+ if (!UnwindBB)
+ continue;
+ const Instruction *UnwindPad = UnwindBB->getFirstNonPHI();
+ if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UnwindPad))
+ // Currently there should be only one handler per a catchswitch.
+ EHInfo.setEHPadUnwindDest(&BB, *CatchSwitch->handlers().begin());
+ else // cleanuppad
+ EHInfo.setEHPadUnwindDest(&BB, UnwindBB);
+ }
+ }
+
+ // Record the unwind destination for invoke and cleanupret instructions.
+ for (const auto &BB : *F) {
+ const Instruction *TI = BB.getTerminator();
+ BasicBlock *UnwindBB = nullptr;
+ if (const auto *Invoke = dyn_cast<InvokeInst>(TI))
+ UnwindBB = Invoke->getUnwindDest();
+ else if (const auto *CleanupRet = dyn_cast<CleanupReturnInst>(TI))
+ UnwindBB = CleanupRet->getUnwindDest();
+ if (!UnwindBB)
+ continue;
+ const Instruction *UnwindPad = UnwindBB->getFirstNonPHI();
+ if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UnwindPad))
+ // Currently there should be only one handler per a catchswitch.
+ EHInfo.setThrowUnwindDest(&BB, *CatchSwitch->handlers().begin());
+ else // cleanuppad
+ EHInfo.setThrowUnwindDest(&BB, UnwindBB);
+ }
+}
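To make the personality-call protocol from the header comment concrete, here is a hedged, self-contained C++ sketch of what a transformed (non-catch-all) landing pad amounts to at run time once the wasm.* intrinsics have been lowered. The wasm_catch_cxx()/wasm_lsda() helpers and the personality stub are stand-ins for illustration, not real APIs:

#include <cstdio>

// Mirror of the libunwind struct quoted in the header comment.
struct LandingPadContext {
  unsigned long lpad_index;
  unsigned long lsda;
  unsigned long selector;
};
LandingPadContext wasm_lpad_context;   // stand-in for __wasm_lpad_context

// Stub for the libunwind wrapper; the real one forwards to
// __gxx_personality_v0 with wasm_lpad_context as the unwind context.
int UnwindCallPersonality(void * /*exception_ptr*/) {
  wasm_lpad_context.selector = 1;      // pretend the first clause matched
  return 0;
}

// Stubs for the lowered wasm.catch(0) and wasm.lsda() intrinsics.
void *wasm_catch_cxx() { static int dummy_exn; return &dummy_exn; }
unsigned long wasm_lsda() { return 0x1000; }

// Roughly what a prepared landing pad does.
int landing_pad(unsigned Index) {
  void *exn = wasm_catch_cxx();             // exn = wasm.catch(0);
  wasm_lpad_context.lpad_index = Index;     // __wasm_lpad_context.lpad_index = index;
  wasm_lpad_context.lsda = wasm_lsda();     // __wasm_lpad_context.lsda = wasm.lsda();
  UnwindCallPersonality(exn);               // _Unwind_CallPersonality(exn);
  return (int)wasm_lpad_context.selector;   // int selector = __wasm_lpad_context.selector;
}

int main() {
  std::printf("selector = %d\n", landing_pad(0));
  return 0;
}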
diff --git a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
index 0b16a113640d..e629c13f133f 100644
--- a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
@@ -31,7 +32,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -41,7 +41,7 @@ using namespace llvm;
static cl::opt<bool> DisableDemotion(
"disable-demotion", cl::Hidden,
cl::desc(
- "Clone multicolor basic blocks but do not demote cross funclet values"),
+ "Clone multicolor basic blocks but do not demote cross scopes"),
cl::init(false));
static cl::opt<bool> DisableCleanups(
@@ -49,12 +49,17 @@ static cl::opt<bool> DisableCleanups(
cl::desc("Do not remove implausible terminators or other similar cleanups"),
cl::init(false));
+static cl::opt<bool> DemoteCatchSwitchPHIOnlyOpt(
+ "demote-catchswitch-only", cl::Hidden,
+ cl::desc("Demote catchswitch BBs only (for wasm EH)"), cl::init(false));
+
namespace {
class WinEHPrepare : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid.
- WinEHPrepare() : FunctionPass(ID) {}
+ WinEHPrepare(bool DemoteCatchSwitchPHIOnly = false)
+ : FunctionPass(ID), DemoteCatchSwitchPHIOnly(DemoteCatchSwitchPHIOnly) {}
bool runOnFunction(Function &Fn) override;
@@ -77,12 +82,14 @@ private:
bool prepareExplicitEH(Function &F);
void colorFunclets(Function &F);
- void demotePHIsOnFunclets(Function &F);
+ void demotePHIsOnFunclets(Function &F, bool DemoteCatchSwitchPHIOnly);
void cloneCommonBlocks(Function &F);
void removeImplausibleInstructions(Function &F);
void cleanupPreparedFunclets(Function &F);
void verifyPreparedFunclets(Function &F);
+ bool DemoteCatchSwitchPHIOnly;
+
// All fields are reset by runOnFunction.
EHPersonality Personality = EHPersonality::Unknown;
@@ -97,7 +104,9 @@ char WinEHPrepare::ID = 0;
INITIALIZE_PASS(WinEHPrepare, DEBUG_TYPE, "Prepare Windows exceptions",
false, false)
-FunctionPass *llvm::createWinEHPass() { return new WinEHPrepare(); }
+FunctionPass *llvm::createWinEHPass(bool DemoteCatchSwitchPHIOnly) {
+ return new WinEHPrepare(DemoteCatchSwitchPHIOnly);
+}
bool WinEHPrepare::runOnFunction(Function &Fn) {
if (!Fn.hasPersonalityFn())
@@ -106,8 +115,8 @@ bool WinEHPrepare::runOnFunction(Function &Fn) {
// Classify the personality to see what kind of preparation we need.
Personality = classifyEHPersonality(Fn.getPersonalityFn());
- // Do nothing if this is not a funclet-based personality.
- if (!isFuncletEHPersonality(Personality))
+ // Do nothing if this is not a scope-based personality.
+ if (!isScopedEHPersonality(Personality))
return false;
DL = &Fn.getParent()->getDataLayout();
@@ -271,10 +280,11 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
}
int CatchHigh = FuncInfo.getLastStateNumber();
addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchHigh, Handlers);
- DEBUG(dbgs() << "TryLow[" << BB->getName() << "]: " << TryLow << '\n');
- DEBUG(dbgs() << "TryHigh[" << BB->getName() << "]: " << TryHigh << '\n');
- DEBUG(dbgs() << "CatchHigh[" << BB->getName() << "]: " << CatchHigh
- << '\n');
+ LLVM_DEBUG(dbgs() << "TryLow[" << BB->getName() << "]: " << TryLow << '\n');
+ LLVM_DEBUG(dbgs() << "TryHigh[" << BB->getName() << "]: " << TryHigh
+ << '\n');
+ LLVM_DEBUG(dbgs() << "CatchHigh[" << BB->getName() << "]: " << CatchHigh
+ << '\n');
} else {
auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI);
@@ -285,8 +295,8 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
int CleanupState = addUnwindMapEntry(FuncInfo, ParentState, BB);
FuncInfo.EHPadStateMap[CleanupPad] = CleanupState;
- DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
- << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
+ << BB->getName() << '\n');
for (const BasicBlock *PredBlock : predecessors(BB)) {
if ((PredBlock = getEHPadFromPredecessor(PredBlock,
CleanupPad->getParentPad()))) {
@@ -351,8 +361,8 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo,
// Everything in the __try block uses TryState as its parent state.
FuncInfo.EHPadStateMap[CatchSwitch] = TryState;
- DEBUG(dbgs() << "Assigning state #" << TryState << " to BB "
- << CatchPadBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Assigning state #" << TryState << " to BB "
+ << CatchPadBB->getName() << '\n');
for (const BasicBlock *PredBlock : predecessors(BB))
if ((PredBlock = getEHPadFromPredecessor(PredBlock,
CatchSwitch->getParentPad())))
@@ -387,8 +397,8 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo,
int CleanupState = addSEHFinally(FuncInfo, ParentState, BB);
FuncInfo.EHPadStateMap[CleanupPad] = CleanupState;
- DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
- << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
+ << BB->getName() << '\n');
for (const BasicBlock *PredBlock : predecessors(BB))
if ((PredBlock =
getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad())))
@@ -677,13 +687,17 @@ void WinEHPrepare::colorFunclets(Function &F) {
}
}
-void WinEHPrepare::demotePHIsOnFunclets(Function &F) {
+void WinEHPrepare::demotePHIsOnFunclets(Function &F,
+ bool DemoteCatchSwitchPHIOnly) {
// Strip PHI nodes off of EH pads.
SmallVector<PHINode *, 16> PHINodes;
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
BasicBlock *BB = &*FI++;
if (!BB->isEHPad())
continue;
+ if (DemoteCatchSwitchPHIOnly && !isa<CatchSwitchInst>(BB->getFirstNonPHI()))
+ continue;
+
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
Instruction *I = &*BI++;
auto *PN = dyn_cast<PHINode>(I);
@@ -1031,20 +1045,21 @@ bool WinEHPrepare::prepareExplicitEH(Function &F) {
cloneCommonBlocks(F);
if (!DisableDemotion)
- demotePHIsOnFunclets(F);
+ demotePHIsOnFunclets(F, DemoteCatchSwitchPHIOnly ||
+ DemoteCatchSwitchPHIOnlyOpt);
if (!DisableCleanups) {
- DEBUG(verifyFunction(F));
+ LLVM_DEBUG(verifyFunction(F));
removeImplausibleInstructions(F);
- DEBUG(verifyFunction(F));
+ LLVM_DEBUG(verifyFunction(F));
cleanupPreparedFunclets(F);
}
- DEBUG(verifyPreparedFunclets(F));
+ LLVM_DEBUG(verifyPreparedFunclets(F));
// Recolor the CFG to verify that all is well.
- DEBUG(colorFunclets(F));
- DEBUG(verifyPreparedFunclets(F));
+ LLVM_DEBUG(colorFunclets(F));
+ LLVM_DEBUG(verifyPreparedFunclets(F));
BlockColors.clear();
FuncletBlocks.clear();
diff --git a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
index 3d83afcf1fc5..32a7457c2060 100644
--- a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
+++ b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -52,7 +52,6 @@ struct XRayInstrumentation : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -160,11 +159,26 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
for (const auto &MBB : MF)
MICount += MBB.size();
+ // Get MachineDominatorTree or compute it on the fly if it's unavailable
+ auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ MachineDominatorTree ComputedMDT;
+ if (!MDT) {
+ ComputedMDT.getBase().recalculate(MF);
+ MDT = &ComputedMDT;
+ }
+
+ // Get MachineLoopInfo or compute it on the fly if it's unavailable
+ auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ MachineLoopInfo ComputedMLI;
+ if (!MLI) {
+ ComputedMLI.getBase().analyze(MDT->getBase());
+ MLI = &ComputedMLI;
+ }
+
// Check if we have a loop.
// FIXME: Maybe make this smarter, and see whether the loops are dependent
// on inputs or side-effects?
- MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
- if (MLI.empty() && MICount < XRayThreshold)
+ if (MLI->empty() && MICount < XRayThreshold)
return false; // Function is too small and has no loops.
}
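The reworked check above keeps the same policy: only functions that are both below the instruction-count threshold and free of loops are skipped. A tiny hedged sketch of that predicate; the attribute-based always/never-instrument overrides handled earlier in the pass are omitted and the helper name is made up:

// Returns true when the function is worth patching with XRay sleds.
bool worthInstrumenting(unsigned MICount, bool HasLoops, unsigned Threshold) {
  return HasLoops || MICount >= Threshold;   // skip only small, loop-free code
}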
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
index ccc20eb74887..0f155a95d607 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
@@ -109,7 +109,7 @@ Error DebugChecksumsSubsection::commit(BinaryStreamWriter &Writer) const {
}
uint32_t DebugChecksumsSubsection::mapChecksumOffset(StringRef FileName) const {
- uint32_t Offset = Strings.getStringId(FileName);
+ uint32_t Offset = Strings.getIdForString(FileName);
auto Iter = OffsetMap.find(Offset);
assert(Iter != OffsetMap.end());
return Iter->second;
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp
index 88c0076915b5..bf9dd7c86862 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp
@@ -79,14 +79,14 @@ Error DebugCrossModuleImportsSubsection::commit(
for (const auto &M : Mappings)
Ids.push_back(&M);
- std::sort(Ids.begin(), Ids.end(), [this](const T &L1, const T &L2) {
- return Strings.getStringId(L1->getKey()) <
- Strings.getStringId(L2->getKey());
+ llvm::sort(Ids.begin(), Ids.end(), [this](const T &L1, const T &L2) {
+ return Strings.getIdForString(L1->getKey()) <
+ Strings.getIdForString(L2->getKey());
});
for (const auto &Item : Ids) {
CrossModuleImport Imp;
- Imp.ModuleNameOffset = Strings.getStringId(Item->getKey());
+ Imp.ModuleNameOffset = Strings.getIdForString(Item->getKey());
Imp.Count = Item->getValue().size();
if (auto EC = Writer.writeObject(Imp))
return EC;
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
index d723282eb715..d2acc9a21003 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
@@ -46,12 +46,15 @@ DebugStringTableSubsection::DebugStringTableSubsection()
: DebugSubsection(DebugSubsectionKind::StringTable) {}
uint32_t DebugStringTableSubsection::insert(StringRef S) {
- auto P = Strings.insert({S, StringSize});
+ auto P = StringToId.insert({S, StringSize});
// If a given string didn't exist in the string table, we want to increment
- // the string table size.
- if (P.second)
+ // the string table size and insert it into the reverse lookup.
+ if (P.second) {
+ IdToString.insert({P.first->getValue(), P.first->getKey()});
StringSize += S.size() + 1; // +1 for '\0'
+ }
+
return P.first->second;
}
@@ -67,7 +70,7 @@ Error DebugStringTableSubsection::commit(BinaryStreamWriter &Writer) const {
if (auto EC = Writer.writeCString(StringRef()))
return EC;
- for (auto &Pair : Strings) {
+ for (auto &Pair : StringToId) {
StringRef S = Pair.getKey();
uint32_t Offset = Begin + Pair.getValue();
Writer.setOffset(Offset);
@@ -81,10 +84,25 @@ Error DebugStringTableSubsection::commit(BinaryStreamWriter &Writer) const {
return Error::success();
}
-uint32_t DebugStringTableSubsection::size() const { return Strings.size(); }
+uint32_t DebugStringTableSubsection::size() const { return StringToId.size(); }
+
+std::vector<uint32_t> DebugStringTableSubsection::sortedIds() const {
+ std::vector<uint32_t> Result;
+ Result.reserve(IdToString.size());
+ for (const auto &Entry : IdToString)
+ Result.push_back(Entry.first);
+ llvm::sort(Result.begin(), Result.end());
+ return Result;
+}
+
+uint32_t DebugStringTableSubsection::getIdForString(StringRef S) const {
+ auto Iter = StringToId.find(S);
+ assert(Iter != StringToId.end());
+ return Iter->second;
+}
-uint32_t DebugStringTableSubsection::getStringId(StringRef S) const {
- auto Iter = Strings.find(S);
- assert(Iter != Strings.end());
+StringRef DebugStringTableSubsection::getStringForId(uint32_t Id) const {
+ auto Iter = IdToString.find(Id);
+ assert(Iter != IdToString.end());
return Iter->second;
}
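The subsection now keeps two maps: the existing string-to-offset map behind getIdForString() and a new offset-to-string map behind getStringForId(). A hedged sketch of that layout, with std::map standing in for StringMap/DenseMap and offsets starting at 1 because the serialized table begins with the empty string that commit() writes:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

class StringTableSketch {
  std::map<std::string, uint32_t> StringToId;   // string -> byte offset
  std::map<uint32_t, std::string> IdToString;   // byte offset -> string
  uint32_t StringSize = 1;                      // offset 0 is the empty string

public:
  uint32_t insert(const std::string &S) {
    auto P = StringToId.insert({S, StringSize});
    if (P.second) {                             // newly added string:
      IdToString.insert({P.first->second, S});  // record the reverse mapping
      StringSize += S.size() + 1;               // +1 for '\0'
    }
    return P.first->second;
  }
  uint32_t getIdForString(const std::string &S) const {
    auto It = StringToId.find(S);
    assert(It != StringToId.end());
    return It->second;
  }
  const std::string &getStringForId(uint32_t Id) const {
    auto It = IdToString.find(Id);
    assert(It != IdToString.end());
    return It->second;
  }
};

int main() {
  StringTableSketch T;
  uint32_t A = T.insert("a.cpp");               // offset 1
  uint32_t B = T.insert("b.cpp");               // offset 7
  std::printf("%u %u %s\n", A, B, T.getStringForId(B).c_str());
  return 0;
}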
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
index 3ecd684c1e39..e76f9e12f0af 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
@@ -55,9 +55,12 @@ Optional<TypeIndex> GlobalTypeTableBuilder::getNext(TypeIndex Prev) {
CVType GlobalTypeTableBuilder::getType(TypeIndex Index) {
CVType Type;
Type.RecordData = SeenRecords[Index.toArrayIndex()];
- const RecordPrefix *P =
- reinterpret_cast<const RecordPrefix *>(Type.RecordData.data());
- Type.Type = static_cast<TypeLeafKind>(uint16_t(P->RecordKind));
+ if (!Type.RecordData.empty()) {
+ assert(Type.RecordData.size() >= sizeof(RecordPrefix));
+ const RecordPrefix *P =
+ reinterpret_cast<const RecordPrefix *>(Type.RecordData.data());
+ Type.Type = static_cast<TypeLeafKind>(uint16_t(P->RecordKind));
+ }
return Type;
}
@@ -89,31 +92,15 @@ void GlobalTypeTableBuilder::reset() {
SeenRecords.clear();
}
-static inline ArrayRef<uint8_t> stabilize(BumpPtrAllocator &Alloc,
- ArrayRef<uint8_t> Data) {
- uint8_t *Stable = Alloc.Allocate<uint8_t>(Data.size());
- memcpy(Stable, Data.data(), Data.size());
- return makeArrayRef(Stable, Data.size());
-}
-
-TypeIndex GlobalTypeTableBuilder::insertRecordAs(GloballyHashedType Hash,
- CreateRecord Create) {
- auto Result = HashedRecords.try_emplace(Hash, nextTypeIndex());
-
- if (Result.second) {
- ArrayRef<uint8_t> RecordData = stabilize(RecordStorage, Create());
- SeenRecords.push_back(RecordData);
- SeenHashes.push_back(Hash);
- }
-
- // Update the caller's copy of Record to point a stable copy.
- return Result.first->second;
-}
-
TypeIndex GlobalTypeTableBuilder::insertRecordBytes(ArrayRef<uint8_t> Record) {
GloballyHashedType GHT =
GloballyHashedType::hashType(Record, SeenHashes, SeenHashes);
- return insertRecordAs(GHT, [Record]() { return Record; });
+ return insertRecordAs(GHT, Record.size(),
+ [Record](MutableArrayRef<uint8_t> Data) {
+ assert(Data.size() == Record.size());
+ ::memcpy(Data.data(), Record.data(), Record.size());
+ return Data;
+ });
}
TypeIndex
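insertRecordBytes() above now routes through an insertRecordAs() overload that takes the record size plus a callback which serializes directly into table-owned storage, instead of copying an already-built buffer. A simplified sketch of that shape; hashing and deduplication, which the real builder also performs, are left out, and std::vector stands in for the BumpPtrAllocator-backed storage:

#include <cstdint>
#include <cstring>
#include <functional>
#include <vector>

struct TypeTableSketch {
  std::vector<std::vector<uint8_t>> SeenRecords;

  // The table allocates Size bytes and lets the caller serialize in place.
  size_t insertRecordAs(size_t Size,
                        const std::function<void(uint8_t *)> &Serialize) {
    SeenRecords.emplace_back(Size);
    Serialize(SeenRecords.back().data());       // write into stable storage
    return SeenRecords.size() - 1;              // index of the new record
  }

  // Pre-serialized bytes become a trivial copy into the fresh buffer.
  size_t insertRecordBytes(const std::vector<uint8_t> &Record) {
    return insertRecordAs(Record.size(), [&](uint8_t *Data) {
      std::memcpy(Data, Record.data(), Record.size());
    });
  }
};

int main() {
  TypeTableSketch T;
  std::vector<uint8_t> Rec = {0x12, 0x00, 0x08, 0x10};
  return (int)T.insertRecordBytes(Rec);         // index 0
}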
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/RecordName.cpp b/contrib/llvm/lib/DebugInfo/CodeView/RecordName.cpp
index 15fb1724d23d..e50c43a1d481 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/RecordName.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/RecordName.cpp
@@ -167,13 +167,6 @@ Error TypeNameComputer::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
StringRef Class = Types.getTypeName(MI.getContainingType());
Name = formatv("{0} {1}::*", Pointee, Class);
} else {
- if (Ptr.isConst())
- Name.append("const ");
- if (Ptr.isVolatile())
- Name.append("volatile ");
- if (Ptr.isUnaligned())
- Name.append("__unaligned ");
-
Name.append(Types.getTypeName(Ptr.getReferentType()));
if (Ptr.getMode() == PointerMode::LValueReference)
@@ -182,6 +175,17 @@ Error TypeNameComputer::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
Name.append("&&");
else if (Ptr.getMode() == PointerMode::Pointer)
Name.append("*");
+
+ // Qualifiers in pointer records apply to the pointer, not the pointee, so
+ // they go on the right.
+ if (Ptr.isConst())
+ Name.append(" const");
+ if (Ptr.isVolatile())
+ Name.append(" volatile");
+ if (Ptr.isUnaligned())
+ Name.append(" __unaligned");
+ if (Ptr.isRestrict())
+ Name.append(" __restrict");
}
return Error::success();
}
@@ -189,7 +193,6 @@ Error TypeNameComputer::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
Error TypeNameComputer::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
- SmallString<256> TypeName;
if (Mods & uint16_t(ModifierOptions::Const))
Name.append("const ");
if (Mods & uint16_t(ModifierOptions::Volatile))
@@ -233,6 +236,16 @@ Error TypeNameComputer::visitKnownRecord(CVType &CVR, LabelRecord &R) {
return Error::success();
}
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ PrecompRecord &Precomp) {
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ EndPrecompRecord &EndPrecomp) {
+ return Error::success();
+}
+
std::string llvm::codeview::computeTypeName(TypeCollection &Types,
TypeIndex Index) {
TypeNameComputer Computer(Types);
@@ -273,6 +286,8 @@ static int getSymbolNameOffset(CVSymbol Sym) {
case SymbolKind::S_GMANDATA:
case SymbolKind::S_LTHREAD32:
case SymbolKind::S_GTHREAD32:
+ case SymbolKind::S_PROCREF:
+ case SymbolKind::S_LPROCREF:
return 10;
// See RegisterSym and LocalSym
case SymbolKind::S_REGISTER:
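The qualifier reordering above mirrors how C++ itself spells the difference between a qualified pointer and a qualified pointee; a two-line illustration:

int I = 0;
int *const ConstPtr = &I;   // qualifier on the pointer record   -> "int* const"
const int *PtrToConst = &I; // qualifier via a ModifierRecord    -> "const int*"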
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp b/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp
index df75f52661e1..af249adc9774 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -129,6 +129,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, BlockSym &Block) {
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Thunk32Sym &Thunk) {
+ W.printString("Name", Thunk.Name);
W.printNumber("Parent", Thunk.Parent);
W.printNumber("End", Thunk.End);
W.printNumber("Next", Thunk.Next);
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index e7998b8732fe..7c68c9167c98 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -370,6 +370,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
W->printNumber("IsConst", Ptr.isConst());
W->printNumber("IsVolatile", Ptr.isVolatile());
W->printNumber("IsUnaligned", Ptr.isUnaligned());
+ W->printNumber("IsRestrict", Ptr.isRestrict());
W->printNumber("SizeOf", Ptr.getSize());
if (Ptr.isPointerToMember()) {
@@ -552,3 +553,18 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, LabelRecord &LR) {
W->printEnum("Mode", uint16_t(LR.Mode), makeArrayRef(LabelTypeEnum));
return Error::success();
}
+
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
+ PrecompRecord &Precomp) {
+ W->printHex("StartIndex", Precomp.getStartTypeIndex());
+ W->printHex("Count", Precomp.getTypesCount());
+ W->printHex("Signature", Precomp.getSignature());
+ W->printString("PrecompFile", Precomp.getPrecompFilePath());
+ return Error::success();
+}
+
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
+ EndPrecompRecord &EndPrecomp) {
+ W->printHex("Signature", EndPrecomp.getSignature());
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp
index f5b28b2a2070..826faef35875 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp
@@ -18,10 +18,10 @@ using namespace llvm::codeview;
LocallyHashedType DenseMapInfo<LocallyHashedType>::Empty{0, {}};
LocallyHashedType DenseMapInfo<LocallyHashedType>::Tombstone{hash_code(-1), {}};
-static std::array<uint8_t, 20> EmptyHash;
-static std::array<uint8_t, 20> TombstoneHash = {
- {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
+static std::array<uint8_t, 8> EmptyHash = {
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
+static std::array<uint8_t, 8> TombstoneHash = {
+ {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
GloballyHashedType DenseMapInfo<GloballyHashedType>::Empty{EmptyHash};
GloballyHashedType DenseMapInfo<GloballyHashedType>::Tombstone{TombstoneHash};
@@ -39,6 +39,7 @@ GloballyHashedType::hashType(ArrayRef<uint8_t> RecordData,
SHA1 S;
S.init();
uint32_t Off = 0;
+ S.update(RecordData.take_front(sizeof(RecordPrefix)));
RecordData = RecordData.drop_front(sizeof(RecordPrefix));
for (const auto &Ref : Refs) {
// Hash any data that comes before this TiRef.
@@ -70,5 +71,5 @@ GloballyHashedType::hashType(ArrayRef<uint8_t> RecordData,
auto TrailingBytes = RecordData.drop_front(Off);
S.update(TrailingBytes);
- return {S.final()};
+ return {S.final().take_back(8)};
}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
index d283e9e6d2f1..95082d4a8e03 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
@@ -58,7 +58,7 @@ static inline uint32_t getEncodedIntegerLength(ArrayRef<uint8_t> Data) {
8, // LF_UQUADWORD
};
- return Sizes[N - LF_NUMERIC];
+ return 2 + Sizes[N - LF_NUMERIC];
}
static inline uint32_t getCStringLength(ArrayRef<uint8_t> Data) {
@@ -393,7 +393,7 @@ static bool discoverTypeIndices(ArrayRef<uint8_t> Content, SymbolKind Kind,
Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
break;
case SymbolKind::S_REGISTER:
- Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type;
+ Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
break;
case SymbolKind::S_CONSTANT:
Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
index 9b8a6053da84..3203ff64d3b1 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
@@ -480,3 +480,18 @@ Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR,
return Error::success();
}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
+ PrecompRecord &Precomp) {
+ error(IO.mapInteger(Precomp.StartTypeIndex));
+ error(IO.mapInteger(Precomp.TypesCount));
+ error(IO.mapInteger(Precomp.Signature));
+ error(IO.mapStringZ(Precomp.PrecompFilePath));
+ return Error::success();
+}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
+ EndPrecompRecord &EndPrecomp) {
+ error(IO.mapInteger(EndPrecomp.Signature));
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index 6a94952c175b..e4f39dd988e1 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -20,6 +20,11 @@
using namespace llvm;
using namespace llvm::codeview;
+static inline size_t slotForIndex(TypeIndex Idx) {
+ assert(!Idx.isSimple() && "simple type indices have no slots");
+ return Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
+}
+
namespace {
/// Implementation of CodeView type stream merging.
@@ -94,8 +99,22 @@ private:
void addMapping(TypeIndex Idx);
- bool remapTypeIndex(TypeIndex &Idx);
- bool remapItemIndex(TypeIndex &Idx);
+ inline bool remapTypeIndex(TypeIndex &Idx) {
+ // If we're mapping a pure index stream, then IndexMap only contains
+ // mappings from OldIdStream -> NewIdStream, in which case we will need to
+ // use the special mapping from OldTypeStream -> NewTypeStream which was
+ // computed externally. Regardless, we use this special map if and only if
+ // we are doing an id-only mapping.
+ if (!hasTypeStream())
+ return remapIndex(Idx, TypeLookup);
+
+ assert(TypeLookup.empty());
+ return remapIndex(Idx, IndexMap);
+ }
+ inline bool remapItemIndex(TypeIndex &Idx) {
+ assert(hasIdStream());
+ return remapIndex(Idx, IndexMap);
+ }
bool hasTypeStream() const {
return (UseGlobalHashes) ? (!!DestGlobalTypeStream) : (!!DestTypeStream);
@@ -105,17 +124,34 @@ private:
return (UseGlobalHashes) ? (!!DestGlobalIdStream) : (!!DestIdStream);
}
- ArrayRef<uint8_t> serializeRemapped(const RemappedType &Record);
+ ArrayRef<uint8_t> remapIndices(const CVType &OriginalType,
+ MutableArrayRef<uint8_t> Storage);
- bool remapIndices(RemappedType &Record, ArrayRef<TiReference> Refs);
+ inline bool remapIndex(TypeIndex &Idx, ArrayRef<TypeIndex> Map) {
+ if (LLVM_LIKELY(remapIndexSimple(Idx, Map)))
+ return true;
- bool remapIndex(TypeIndex &Idx, ArrayRef<TypeIndex> Map);
+ return remapIndexFallback(Idx, Map);
+ }
+
+ inline bool remapIndexSimple(TypeIndex &Idx, ArrayRef<TypeIndex> Map) const {
+ // Simple types are unchanged.
+ if (Idx.isSimple())
+ return true;
- size_t slotForIndex(TypeIndex Idx) const {
- assert(!Idx.isSimple() && "simple type indices have no slots");
- return Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
+ // Check if this type index refers to a record we've already translated
+ // successfully. If it refers to a type later in the stream or a record we
+ // had to defer, defer it until later pass.
+ unsigned MapPos = slotForIndex(Idx);
+ if (LLVM_UNLIKELY(MapPos >= Map.size() || Map[MapPos] == Untranslated))
+ return false;
+
+ Idx = Map[MapPos];
+ return true;
}
+ bool remapIndexFallback(TypeIndex &Idx, ArrayRef<TypeIndex> Map);
+
Error errorCorruptRecord() const {
return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
}
@@ -153,27 +189,6 @@ private:
} // end anonymous namespace
-ArrayRef<uint8_t>
-TypeStreamMerger::serializeRemapped(const RemappedType &Record) {
- TypeIndex TI;
- ArrayRef<uint8_t> OriginalData = Record.OriginalRecord.RecordData;
- if (Record.Mappings.empty())
- return OriginalData;
-
- // At least one type index was remapped. We copy the full record bytes,
- // re-write each type index, then return that.
- RemapStorage.resize(OriginalData.size());
- ::memcpy(&RemapStorage[0], OriginalData.data(), OriginalData.size());
- uint8_t *ContentBegin = RemapStorage.data() + sizeof(RecordPrefix);
- for (const auto &M : Record.Mappings) {
- // First 4 bytes of every record are the record prefix, but the mapping
- // offset is relative to the content which starts after.
- *(TypeIndex *)(ContentBegin + M.first) = M.second;
- }
- auto RemapRef = makeArrayRef(RemapStorage);
- return RemapRef;
-}
-
const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated);
static bool isIdRecord(TypeLeafKind K) {
@@ -202,19 +217,9 @@ void TypeStreamMerger::addMapping(TypeIndex Idx) {
}
}
-bool TypeStreamMerger::remapIndex(TypeIndex &Idx, ArrayRef<TypeIndex> Map) {
- // Simple types are unchanged.
- if (Idx.isSimple())
- return true;
-
- // Check if this type index refers to a record we've already translated
- // successfully. If it refers to a type later in the stream or a record we
- // had to defer, defer it until later pass.
- unsigned MapPos = slotForIndex(Idx);
- if (MapPos < Map.size() && Map[MapPos] != Untranslated) {
- Idx = Map[MapPos];
- return true;
- }
+bool TypeStreamMerger::remapIndexFallback(TypeIndex &Idx,
+ ArrayRef<TypeIndex> Map) {
+ size_t MapPos = slotForIndex(Idx);
// If this is the second pass and this index isn't in the map, then it points
// outside the current type stream, and this is a corrupt record.
@@ -232,24 +237,6 @@ bool TypeStreamMerger::remapIndex(TypeIndex &Idx, ArrayRef<TypeIndex> Map) {
return false;
}
-bool TypeStreamMerger::remapTypeIndex(TypeIndex &Idx) {
- // If we're mapping a pure index stream, then IndexMap only contains mappings
- // from OldIdStream -> NewIdStream, in which case we will need to use the
- // special mapping from OldTypeStream -> NewTypeStream which was computed
- // externally. Regardless, we use this special map if and only if we are
- // doing an id-only mapping.
- if (!hasTypeStream())
- return remapIndex(Idx, TypeLookup);
-
- assert(TypeLookup.empty());
- return remapIndex(Idx, IndexMap);
-}
-
-bool TypeStreamMerger::remapItemIndex(TypeIndex &Idx) {
- assert(hasIdStream());
- return remapIndex(Idx, IndexMap);
-}
-
// Local hashing entry points
Error TypeStreamMerger::mergeTypeRecords(MergingTypeTableBuilder &Dest,
const CVTypeArray &Types) {
@@ -346,35 +333,34 @@ Error TypeStreamMerger::doit(const CVTypeArray &Types) {
}
Error TypeStreamMerger::remapAllTypes(const CVTypeArray &Types) {
- for (const CVType &Type : Types)
- if (auto EC = remapType(Type))
- return EC;
- return Error::success();
+ BinaryStreamRef Stream = Types.getUnderlyingStream();
+ ArrayRef<uint8_t> Buffer;
+ cantFail(Stream.readBytes(0, Stream.getLength(), Buffer));
+
+ return forEachCodeViewRecord<CVType>(
+ Buffer, [this](const CVType &T) { return remapType(T); });
}
Error TypeStreamMerger::remapType(const CVType &Type) {
- auto DoSerialize = [this, Type]() -> ArrayRef<uint8_t> {
- RemappedType R(Type);
- SmallVector<TiReference, 32> Refs;
- discoverTypeIndices(Type.RecordData, Refs);
- if (!remapIndices(R, Refs))
- return {};
- return serializeRemapped(R);
+ auto DoSerialize =
+ [this, Type](MutableArrayRef<uint8_t> Storage) -> ArrayRef<uint8_t> {
+ return remapIndices(Type, Storage);
};
TypeIndex DestIdx = Untranslated;
- if (UseGlobalHashes) {
+ if (LLVM_LIKELY(UseGlobalHashes)) {
GlobalTypeTableBuilder &Dest =
isIdRecord(Type.kind()) ? *DestGlobalIdStream : *DestGlobalTypeStream;
GloballyHashedType H = GlobalHashes[CurIndex.toArrayIndex()];
- DestIdx = Dest.insertRecordAs(H, DoSerialize);
+ DestIdx = Dest.insertRecordAs(H, Type.RecordData.size(), DoSerialize);
} else {
MergingTypeTableBuilder &Dest =
isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream;
- auto Data = DoSerialize();
- if (!Data.empty())
- DestIdx = Dest.insertRecordBytes(Data);
+ RemapStorage.resize(Type.RecordData.size());
+ ArrayRef<uint8_t> Result = DoSerialize(RemapStorage);
+ if (!Result.empty())
+ DestIdx = Dest.insertRecordBytes(Result);
}
addMapping(DestIdx);
@@ -384,27 +370,32 @@ Error TypeStreamMerger::remapType(const CVType &Type) {
return Error::success();
}
-bool TypeStreamMerger::remapIndices(RemappedType &Record,
- ArrayRef<TiReference> Refs) {
- ArrayRef<uint8_t> OriginalData = Record.OriginalRecord.content();
- bool Success = true;
+ArrayRef<uint8_t>
+TypeStreamMerger::remapIndices(const CVType &OriginalType,
+ MutableArrayRef<uint8_t> Storage) {
+ SmallVector<TiReference, 4> Refs;
+ discoverTypeIndices(OriginalType.RecordData, Refs);
+ if (Refs.empty())
+ return OriginalType.RecordData;
+
+ ::memcpy(Storage.data(), OriginalType.RecordData.data(),
+ OriginalType.RecordData.size());
+
+ uint8_t *DestContent = Storage.data() + sizeof(RecordPrefix);
+
for (auto &Ref : Refs) {
- uint32_t Offset = Ref.Offset;
- ArrayRef<uint8_t> Bytes = OriginalData.slice(Ref.Offset, sizeof(TypeIndex));
- ArrayRef<TypeIndex> TIs(reinterpret_cast<const TypeIndex *>(Bytes.data()),
- Ref.Count);
- for (auto TI : TIs) {
- TypeIndex NewTI = TI;
- bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef)
- ? remapItemIndex(NewTI)
- : remapTypeIndex(NewTI);
- if (ThisSuccess && NewTI != TI)
- Record.Mappings.emplace_back(Offset, NewTI);
- Offset += sizeof(TypeIndex);
- Success &= ThisSuccess;
+ TypeIndex *DestTIs =
+ reinterpret_cast<TypeIndex *>(DestContent + Ref.Offset);
+
+ for (size_t I = 0; I < Ref.Count; ++I) {
+ TypeIndex &TI = DestTIs[I];
+ bool Success = (Ref.Kind == TiRefKind::IndexRef) ? remapItemIndex(TI)
+ : remapTypeIndex(TI);
+ if (LLVM_UNLIKELY(!Success))
+ return {};
}
}
- return Success;
+ return Storage;
}
Error llvm::codeview::mergeTypeRecords(MergingTypeTableBuilder &Dest,
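
The remapIndices rewrite above collapses the old discover/record-mappings/serialize pipeline into a single copy-and-patch pass: the record bytes are copied into caller-provided scratch storage and every referenced type index is rewritten in place, returning an empty result when an index cannot be resolved yet. A minimal standalone sketch of that pattern, assuming a flat old-index -> new-index map; the TiReference struct, the 4-byte index size, and the prefix size are simplified stand-ins rather than the real CodeView classes (in particular, "simple" indices below 0x1000 are not special-cased here):

#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for llvm::codeview::TiReference: a run of Count consecutive 4-byte
// type indices starting at Offset bytes into the record content.
struct TiReference { uint32_t Offset; uint32_t Count; };

// Copy the original record into Storage and rewrite each referenced index in
// place. Returns false if any index is not yet present in IndexMap, in which
// case the caller drops the record for this pass (mirroring the empty
// ArrayRef returned by remapIndices above).
static bool remapRecord(const std::vector<uint8_t> &Original,
                        const std::vector<TiReference> &Refs,
                        const std::vector<uint32_t> &IndexMap,
                        uint32_t PrefixSize, // sizeof(RecordPrefix) in CodeView
                        std::vector<uint8_t> &Storage) {
  Storage.resize(Original.size());
  std::memcpy(Storage.data(), Original.data(), Original.size());
  uint8_t *Content = Storage.data() + PrefixSize;
  for (const TiReference &Ref : Refs) {
    for (uint32_t I = 0; I < Ref.Count; ++I) {
      uint32_t Old;
      std::memcpy(&Old, Content + Ref.Offset + I * 4, sizeof(Old));
      if (Old >= IndexMap.size())
        return false; // unresolved forward reference: defer this record
      uint32_t New = IndexMap[Old];
      std::memcpy(Content + Ref.Offset + I * 4, &New, sizeof(New));
    }
  }
  return true;
}
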
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index f593953c62ff..adada672af00 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -16,6 +16,7 @@
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include <cstddef>
#include <cstdint>
@@ -96,8 +97,7 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
default:
// The form has a byte size that doesn't depend on Params.
// If it's a fixed size, keep track of it.
- if ((ByteSize =
- DWARFFormValue::getFixedByteSize(F, DWARFFormParams()))) {
+ if ((ByteSize = dwarf::getFixedFormByteSize(F, dwarf::FormParams()))) {
if (FixedAttributeSize)
FixedAttributeSize->NumBytes += *ByteSize;
break;
@@ -127,26 +127,11 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
}
void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
- auto tagString = TagString(getTag());
OS << '[' << getCode() << "] ";
- if (!tagString.empty())
- OS << tagString;
- else
- OS << format("DW_TAG_Unknown_%x", getTag());
+ OS << formatv("{0}", getTag());
OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
for (const AttributeSpec &Spec : AttributeSpecs) {
- OS << '\t';
- auto attrString = AttributeString(Spec.Attr);
- if (!attrString.empty())
- OS << attrString;
- else
- OS << format("DW_AT_Unknown_%x", Spec.Attr);
- OS << '\t';
- auto formString = FormEncodingString(Spec.Form);
- if (!formString.empty())
- OS << formString;
- else
- OS << format("DW_FORM_Unknown_%x", Spec.Form);
+ OS << formatv("\t{0}\t{1}", Spec.Attr, Spec.Form);
if (Spec.isImplicitConst())
OS << '\t' << Spec.getImplicitConstValue();
OS << '\n';
@@ -217,8 +202,7 @@ Optional<int64_t> DWARFAbbreviationDeclaration::AttributeSpec::getByteSize(
if (ByteSize.HasByteSize)
return ByteSize.ByteSize;
Optional<int64_t> S;
- auto FixedByteSize =
- DWARFFormValue::getFixedByteSize(Form, U.getFormParams());
+ auto FixedByteSize = dwarf::getFixedFormByteSize(Form, U.getFormParams());
if (FixedByteSize)
S = *FixedByteSize;
return S;
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 6a6b7fc6fc20..4582e036f9fc 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -13,7 +13,10 @@
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DJB.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include <cstddef>
#include <cstdint>
@@ -21,7 +24,24 @@
using namespace llvm;
-llvm::Error DWARFAcceleratorTable::extract() {
+namespace {
+struct Atom {
+ unsigned Value;
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const Atom &A) {
+ StringRef Str = dwarf::AtomTypeString(A.Value);
+ if (!Str.empty())
+ return OS << Str;
+ return OS << "DW_ATOM_unknown_" << format("%x", A.Value);
+}
+} // namespace
+
+static Atom formatAtom(unsigned Atom) { return {Atom}; }
+
+DWARFAcceleratorTable::~DWARFAcceleratorTable() = default;
+
+llvm::Error AppleAcceleratorTable::extract() {
uint32_t Offset = 0;
// Check that we can at least read the header.
@@ -32,8 +52,8 @@ llvm::Error DWARFAcceleratorTable::extract() {
Hdr.Magic = AccelSection.getU32(&Offset);
Hdr.Version = AccelSection.getU16(&Offset);
Hdr.HashFunction = AccelSection.getU16(&Offset);
- Hdr.NumBuckets = AccelSection.getU32(&Offset);
- Hdr.NumHashes = AccelSection.getU32(&Offset);
+ Hdr.BucketCount = AccelSection.getU32(&Offset);
+ Hdr.HashCount = AccelSection.getU32(&Offset);
Hdr.HeaderDataLength = AccelSection.getU32(&Offset);
// Check that we can read all the hashes and offsets from the
@@ -41,7 +61,7 @@ llvm::Error DWARFAcceleratorTable::extract() {
// We need to subtract one because we're checking for an *offset* which is
// equal to the size for an empty table and hence points past the section.
if (!AccelSection.isValidOffset(sizeof(Hdr) + Hdr.HeaderDataLength +
- Hdr.NumBuckets * 4 + Hdr.NumHashes * 8 - 1))
+ Hdr.BucketCount * 4 + Hdr.HashCount * 8 - 1))
return make_error<StringError>(
"Section too small: cannot read buckets and hashes.",
inconvertibleErrorCode());
@@ -59,20 +79,20 @@ llvm::Error DWARFAcceleratorTable::extract() {
return Error::success();
}
-uint32_t DWARFAcceleratorTable::getNumBuckets() { return Hdr.NumBuckets; }
-uint32_t DWARFAcceleratorTable::getNumHashes() { return Hdr.NumHashes; }
-uint32_t DWARFAcceleratorTable::getSizeHdr() { return sizeof(Hdr); }
-uint32_t DWARFAcceleratorTable::getHeaderDataLength() {
+uint32_t AppleAcceleratorTable::getNumBuckets() { return Hdr.BucketCount; }
+uint32_t AppleAcceleratorTable::getNumHashes() { return Hdr.HashCount; }
+uint32_t AppleAcceleratorTable::getSizeHdr() { return sizeof(Hdr); }
+uint32_t AppleAcceleratorTable::getHeaderDataLength() {
return Hdr.HeaderDataLength;
}
-ArrayRef<std::pair<DWARFAcceleratorTable::HeaderData::AtomType,
- DWARFAcceleratorTable::HeaderData::Form>>
-DWARFAcceleratorTable::getAtomsDesc() {
+ArrayRef<std::pair<AppleAcceleratorTable::HeaderData::AtomType,
+ AppleAcceleratorTable::HeaderData::Form>>
+AppleAcceleratorTable::getAtomsDesc() {
return HdrData.Atoms;
}
-bool DWARFAcceleratorTable::validateForms() {
+bool AppleAcceleratorTable::validateForms() {
for (auto Atom : getAtomsDesc()) {
DWARFFormValue FormValue(Atom.second);
switch (Atom.first) {
@@ -92,10 +112,10 @@ bool DWARFAcceleratorTable::validateForms() {
}
std::pair<uint32_t, dwarf::Tag>
-DWARFAcceleratorTable::readAtoms(uint32_t &HashDataOffset) {
+AppleAcceleratorTable::readAtoms(uint32_t &HashDataOffset) {
uint32_t DieOffset = dwarf::DW_INVALID_OFFSET;
dwarf::Tag DieTag = dwarf::DW_TAG_null;
- DWARFFormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
+ dwarf::FormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
for (auto Atom : getAtomsDesc()) {
DWARFFormValue FormValue(Atom.second);
@@ -114,144 +134,219 @@ DWARFAcceleratorTable::readAtoms(uint32_t &HashDataOffset) {
return {DieOffset, DieTag};
}
-LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
+void AppleAcceleratorTable::Header::dump(ScopedPrinter &W) const {
+ DictScope HeaderScope(W, "Header");
+ W.printHex("Magic", Magic);
+ W.printHex("Version", Version);
+ W.printHex("Hash function", HashFunction);
+ W.printNumber("Bucket count", BucketCount);
+ W.printNumber("Hashes count", HashCount);
+ W.printNumber("HeaderData length", HeaderDataLength);
+}
+
+Optional<uint64_t> AppleAcceleratorTable::HeaderData::extractOffset(
+ Optional<DWARFFormValue> Value) const {
+ if (!Value)
+ return None;
+
+ switch (Value->getForm()) {
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_ref8:
+ case dwarf::DW_FORM_ref_udata:
+ return Value->getRawUValue() + DIEOffsetBase;
+ default:
+ return Value->getAsSectionOffset();
+ }
+}
+
+bool AppleAcceleratorTable::dumpName(ScopedPrinter &W,
+ SmallVectorImpl<DWARFFormValue> &AtomForms,
+ uint32_t *DataOffset) const {
+ dwarf::FormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
+ uint32_t NameOffset = *DataOffset;
+ if (!AccelSection.isValidOffsetForDataOfSize(*DataOffset, 4)) {
+ W.printString("Incorrectly terminated list.");
+ return false;
+ }
+ unsigned StringOffset = AccelSection.getRelocatedValue(4, DataOffset);
+ if (!StringOffset)
+ return false; // End of list
+
+ DictScope NameScope(W, ("Name@0x" + Twine::utohexstr(NameOffset)).str());
+ W.startLine() << format("String: 0x%08x", StringOffset);
+ W.getOStream() << " \"" << StringSection.getCStr(&StringOffset) << "\"\n";
+
+ unsigned NumData = AccelSection.getU32(DataOffset);
+ for (unsigned Data = 0; Data < NumData; ++Data) {
+ ListScope DataScope(W, ("Data " + Twine(Data)).str());
+ unsigned i = 0;
+ for (auto &Atom : AtomForms) {
+ W.startLine() << format("Atom[%d]: ", i);
+ if (Atom.extractValue(AccelSection, DataOffset, FormParams)) {
+ Atom.dump(W.getOStream());
+ if (Optional<uint64_t> Val = Atom.getAsUnsignedConstant()) {
+ StringRef Str = dwarf::AtomValueString(HdrData.Atoms[i].first, *Val);
+ if (!Str.empty())
+ W.getOStream() << " (" << Str << ")";
+ }
+ } else
+ W.getOStream() << "Error extracting the value";
+ W.getOStream() << "\n";
+ i++;
+ }
+ }
+ return true; // more entries follow
+}
+
+LLVM_DUMP_METHOD void AppleAcceleratorTable::dump(raw_ostream &OS) const {
if (!IsValid)
return;
- // Dump the header.
- OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n'
- << "Version = " << format("0x%04x", Hdr.Version) << '\n'
- << "Hash function = " << format("0x%08x", Hdr.HashFunction) << '\n'
- << "Bucket count = " << Hdr.NumBuckets << '\n'
- << "Hashes count = " << Hdr.NumHashes << '\n'
- << "HeaderData length = " << Hdr.HeaderDataLength << '\n'
- << "DIE offset base = " << HdrData.DIEOffsetBase << '\n'
- << "Number of atoms = " << HdrData.Atoms.size() << '\n';
-
- unsigned i = 0;
+ ScopedPrinter W(OS);
+
+ Hdr.dump(W);
+
+ W.printNumber("DIE offset base", HdrData.DIEOffsetBase);
+ W.printNumber("Number of atoms", uint64_t(HdrData.Atoms.size()));
SmallVector<DWARFFormValue, 3> AtomForms;
- for (const auto &Atom: HdrData.Atoms) {
- OS << format("Atom[%d] Type: ", i++);
- auto TypeString = dwarf::AtomTypeString(Atom.first);
- if (!TypeString.empty())
- OS << TypeString;
- else
- OS << format("DW_ATOM_Unknown_0x%x", Atom.first);
- OS << " Form: ";
- auto FormString = dwarf::FormEncodingString(Atom.second);
- if (!FormString.empty())
- OS << FormString;
- else
- OS << format("DW_FORM_Unknown_0x%x", Atom.second);
- OS << '\n';
- AtomForms.push_back(DWARFFormValue(Atom.second));
+ {
+ ListScope AtomsScope(W, "Atoms");
+ unsigned i = 0;
+ for (const auto &Atom : HdrData.Atoms) {
+ DictScope AtomScope(W, ("Atom " + Twine(i++)).str());
+ W.startLine() << "Type: " << formatAtom(Atom.first) << '\n';
+ W.startLine() << "Form: " << formatv("{0}", Atom.second) << '\n';
+ AtomForms.push_back(DWARFFormValue(Atom.second));
+ }
}
// Now go through the actual tables and dump them.
uint32_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength;
- unsigned HashesBase = Offset + Hdr.NumBuckets * 4;
- unsigned OffsetsBase = HashesBase + Hdr.NumHashes * 4;
- DWARFFormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
+ unsigned HashesBase = Offset + Hdr.BucketCount * 4;
+ unsigned OffsetsBase = HashesBase + Hdr.HashCount * 4;
- for (unsigned Bucket = 0; Bucket < Hdr.NumBuckets; ++Bucket) {
+ for (unsigned Bucket = 0; Bucket < Hdr.BucketCount; ++Bucket) {
unsigned Index = AccelSection.getU32(&Offset);
- OS << format("Bucket[%d]\n", Bucket);
+ ListScope BucketScope(W, ("Bucket " + Twine(Bucket)).str());
if (Index == UINT32_MAX) {
- OS << " EMPTY\n";
+ W.printString("EMPTY");
continue;
}
- for (unsigned HashIdx = Index; HashIdx < Hdr.NumHashes; ++HashIdx) {
+ for (unsigned HashIdx = Index; HashIdx < Hdr.HashCount; ++HashIdx) {
unsigned HashOffset = HashesBase + HashIdx*4;
unsigned OffsetsOffset = OffsetsBase + HashIdx*4;
uint32_t Hash = AccelSection.getU32(&HashOffset);
- if (Hash % Hdr.NumBuckets != Bucket)
+ if (Hash % Hdr.BucketCount != Bucket)
break;
unsigned DataOffset = AccelSection.getU32(&OffsetsOffset);
- OS << format(" Hash = 0x%08x Offset = 0x%08x\n", Hash, DataOffset);
+ ListScope HashScope(W, ("Hash 0x" + Twine::utohexstr(Hash)).str());
if (!AccelSection.isValidOffset(DataOffset)) {
- OS << " Invalid section offset\n";
+ W.printString("Invalid section offset");
continue;
}
- while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) {
- unsigned StringOffset = AccelSection.getRelocatedValue(4, &DataOffset);
- if (!StringOffset)
- break;
- OS << format(" Name: %08x \"%s\"\n", StringOffset,
- StringSection.getCStr(&StringOffset));
- unsigned NumData = AccelSection.getU32(&DataOffset);
- for (unsigned Data = 0; Data < NumData; ++Data) {
- OS << format(" Data[%d] => ", Data);
- unsigned i = 0;
- for (auto &Atom : AtomForms) {
- OS << format("{Atom[%d]: ", i++);
- if (Atom.extractValue(AccelSection, &DataOffset, FormParams))
- Atom.dump(OS);
- else
- OS << "Error extracting the value";
- OS << "} ";
- }
- OS << '\n';
- }
- }
+ while (dumpName(W, AtomForms, &DataOffset))
+ /*empty*/;
}
}
}
-DWARFAcceleratorTable::ValueIterator::ValueIterator(
- const DWARFAcceleratorTable &AccelTable, unsigned Offset)
- : AccelTable(&AccelTable), DataOffset(Offset) {
+AppleAcceleratorTable::Entry::Entry(
+ const AppleAcceleratorTable::HeaderData &HdrData)
+ : HdrData(&HdrData) {
+ Values.reserve(HdrData.Atoms.size());
+ for (const auto &Atom : HdrData.Atoms)
+ Values.push_back(DWARFFormValue(Atom.second));
+}
+
+void AppleAcceleratorTable::Entry::extract(
+ const AppleAcceleratorTable &AccelTable, uint32_t *Offset) {
+
+ dwarf::FormParams FormParams = {AccelTable.Hdr.Version, 0,
+ dwarf::DwarfFormat::DWARF32};
+ for (auto &Atom : Values)
+ Atom.extractValue(AccelTable.AccelSection, Offset, FormParams);
+}
+
+Optional<DWARFFormValue>
+AppleAcceleratorTable::Entry::lookup(HeaderData::AtomType Atom) const {
+ assert(HdrData && "Dereferencing end iterator?");
+ assert(HdrData->Atoms.size() == Values.size());
+ for (const auto &Tuple : zip_first(HdrData->Atoms, Values)) {
+ if (std::get<0>(Tuple).first == Atom)
+ return std::get<1>(Tuple);
+ }
+ return None;
+}
+
+Optional<uint64_t> AppleAcceleratorTable::Entry::getDIESectionOffset() const {
+ return HdrData->extractOffset(lookup(dwarf::DW_ATOM_die_offset));
+}
+
+Optional<uint64_t> AppleAcceleratorTable::Entry::getCUOffset() const {
+ return HdrData->extractOffset(lookup(dwarf::DW_ATOM_cu_offset));
+}
+
+Optional<dwarf::Tag> AppleAcceleratorTable::Entry::getTag() const {
+ Optional<DWARFFormValue> Tag = lookup(dwarf::DW_ATOM_die_tag);
+ if (!Tag)
+ return None;
+ if (Optional<uint64_t> Value = Tag->getAsUnsignedConstant())
+ return dwarf::Tag(*Value);
+ return None;
+}
+
+AppleAcceleratorTable::ValueIterator::ValueIterator(
+ const AppleAcceleratorTable &AccelTable, unsigned Offset)
+ : AccelTable(&AccelTable), Current(AccelTable.HdrData), DataOffset(Offset) {
if (!AccelTable.AccelSection.isValidOffsetForDataOfSize(DataOffset, 4))
return;
- for (const auto &Atom : AccelTable.HdrData.Atoms)
- AtomForms.push_back(DWARFFormValue(Atom.second));
-
// Read the first entry.
NumData = AccelTable.AccelSection.getU32(&DataOffset);
Next();
}
-void DWARFAcceleratorTable::ValueIterator::Next() {
+void AppleAcceleratorTable::ValueIterator::Next() {
assert(NumData > 0 && "attempted to increment iterator past the end");
auto &AccelSection = AccelTable->AccelSection;
if (Data >= NumData ||
!AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) {
NumData = 0;
+ DataOffset = 0;
return;
}
- DWARFFormParams FormParams = {AccelTable->Hdr.Version, 0,
- dwarf::DwarfFormat::DWARF32};
- for (auto &Atom : AtomForms)
- Atom.extractValue(AccelSection, &DataOffset, FormParams);
+ Current.extract(*AccelTable, &DataOffset);
++Data;
}
-iterator_range<DWARFAcceleratorTable::ValueIterator>
-DWARFAcceleratorTable::equal_range(StringRef Key) const {
+iterator_range<AppleAcceleratorTable::ValueIterator>
+AppleAcceleratorTable::equal_range(StringRef Key) const {
if (!IsValid)
return make_range(ValueIterator(), ValueIterator());
// Find the bucket.
- unsigned HashValue = dwarf::djbHash(Key);
- unsigned Bucket = HashValue % Hdr.NumBuckets;
+ unsigned HashValue = djbHash(Key);
+ unsigned Bucket = HashValue % Hdr.BucketCount;
unsigned BucketBase = sizeof(Hdr) + Hdr.HeaderDataLength;
- unsigned HashesBase = BucketBase + Hdr.NumBuckets * 4;
- unsigned OffsetsBase = HashesBase + Hdr.NumHashes * 4;
+ unsigned HashesBase = BucketBase + Hdr.BucketCount * 4;
+ unsigned OffsetsBase = HashesBase + Hdr.HashCount * 4;
unsigned BucketOffset = BucketBase + Bucket * 4;
unsigned Index = AccelSection.getU32(&BucketOffset);
// Search through all hashes in the bucket.
- for (unsigned HashIdx = Index; HashIdx < Hdr.NumHashes; ++HashIdx) {
+ for (unsigned HashIdx = Index; HashIdx < Hdr.HashCount; ++HashIdx) {
unsigned HashOffset = HashesBase + HashIdx * 4;
unsigned OffsetsOffset = OffsetsBase + HashIdx * 4;
uint32_t Hash = AccelSection.getU32(&HashOffset);
- if (Hash % Hdr.NumBuckets != Bucket)
+ if (Hash % Hdr.BucketCount != Bucket)
// We are already in the next bucket.
break;
@@ -266,3 +361,529 @@ DWARFAcceleratorTable::equal_range(StringRef Key) const {
}
return make_range(ValueIterator(), ValueIterator());
}
+
+void DWARFDebugNames::Header::dump(ScopedPrinter &W) const {
+ DictScope HeaderScope(W, "Header");
+ W.printHex("Length", UnitLength);
+ W.printNumber("Version", Version);
+ W.printHex("Padding", Padding);
+ W.printNumber("CU count", CompUnitCount);
+ W.printNumber("Local TU count", LocalTypeUnitCount);
+ W.printNumber("Foreign TU count", ForeignTypeUnitCount);
+ W.printNumber("Bucket count", BucketCount);
+ W.printNumber("Name count", NameCount);
+ W.printHex("Abbreviations table size", AbbrevTableSize);
+ W.startLine() << "Augmentation: '" << AugmentationString << "'\n";
+}
+
+llvm::Error DWARFDebugNames::Header::extract(const DWARFDataExtractor &AS,
+ uint32_t *Offset) {
+ // Check that we can read the fixed-size part.
+ if (!AS.isValidOffset(*Offset + sizeof(HeaderPOD) - 1))
+ return make_error<StringError>("Section too small: cannot read header.",
+ inconvertibleErrorCode());
+
+ UnitLength = AS.getU32(Offset);
+ Version = AS.getU16(Offset);
+ Padding = AS.getU16(Offset);
+ CompUnitCount = AS.getU32(Offset);
+ LocalTypeUnitCount = AS.getU32(Offset);
+ ForeignTypeUnitCount = AS.getU32(Offset);
+ BucketCount = AS.getU32(Offset);
+ NameCount = AS.getU32(Offset);
+ AbbrevTableSize = AS.getU32(Offset);
+ AugmentationStringSize = alignTo(AS.getU32(Offset), 4);
+
+ if (!AS.isValidOffsetForDataOfSize(*Offset, AugmentationStringSize))
+ return make_error<StringError>(
+ "Section too small: cannot read header augmentation.",
+ inconvertibleErrorCode());
+ AugmentationString.resize(AugmentationStringSize);
+ AS.getU8(Offset, reinterpret_cast<uint8_t *>(AugmentationString.data()),
+ AugmentationStringSize);
+ return Error::success();
+}
+
+void DWARFDebugNames::Abbrev::dump(ScopedPrinter &W) const {
+ DictScope AbbrevScope(W, ("Abbreviation 0x" + Twine::utohexstr(Code)).str());
+ W.startLine() << formatv("Tag: {0}\n", Tag);
+
+ for (const auto &Attr : Attributes)
+ W.startLine() << formatv("{0}: {1}\n", Attr.Index, Attr.Form);
+}
+
+static constexpr DWARFDebugNames::AttributeEncoding sentinelAttrEnc() {
+ return {dwarf::Index(0), dwarf::Form(0)};
+}
+
+static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE) {
+ return AE == sentinelAttrEnc();
+}
+
+static DWARFDebugNames::Abbrev sentinelAbbrev() {
+ return DWARFDebugNames::Abbrev(0, dwarf::Tag(0), {});
+}
+
+static bool isSentinel(const DWARFDebugNames::Abbrev &Abbr) {
+ return Abbr.Code == 0;
+}
+
+DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getEmptyKey() {
+ return sentinelAbbrev();
+}
+
+DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getTombstoneKey() {
+ return DWARFDebugNames::Abbrev(~0, dwarf::Tag(0), {});
+}
+
+Expected<DWARFDebugNames::AttributeEncoding>
+DWARFDebugNames::NameIndex::extractAttributeEncoding(uint32_t *Offset) {
+ if (*Offset >= EntriesBase) {
+ return make_error<StringError>("Incorrectly terminated abbreviation table.",
+ inconvertibleErrorCode());
+ }
+
+ uint32_t Index = Section.AccelSection.getULEB128(Offset);
+ uint32_t Form = Section.AccelSection.getULEB128(Offset);
+ return AttributeEncoding(dwarf::Index(Index), dwarf::Form(Form));
+}
+
+Expected<std::vector<DWARFDebugNames::AttributeEncoding>>
+DWARFDebugNames::NameIndex::extractAttributeEncodings(uint32_t *Offset) {
+ std::vector<AttributeEncoding> Result;
+ for (;;) {
+ auto AttrEncOr = extractAttributeEncoding(Offset);
+ if (!AttrEncOr)
+ return AttrEncOr.takeError();
+ if (isSentinel(*AttrEncOr))
+ return std::move(Result);
+
+ Result.emplace_back(*AttrEncOr);
+ }
+}
+
+Expected<DWARFDebugNames::Abbrev>
+DWARFDebugNames::NameIndex::extractAbbrev(uint32_t *Offset) {
+ if (*Offset >= EntriesBase) {
+ return make_error<StringError>("Incorrectly terminated abbreviation table.",
+ inconvertibleErrorCode());
+ }
+
+ uint32_t Code = Section.AccelSection.getULEB128(Offset);
+ if (Code == 0)
+ return sentinelAbbrev();
+
+ uint32_t Tag = Section.AccelSection.getULEB128(Offset);
+ auto AttrEncOr = extractAttributeEncodings(Offset);
+ if (!AttrEncOr)
+ return AttrEncOr.takeError();
+ return Abbrev(Code, dwarf::Tag(Tag), std::move(*AttrEncOr));
+}
+
+Error DWARFDebugNames::NameIndex::extract() {
+ const DWARFDataExtractor &AS = Section.AccelSection;
+ uint32_t Offset = Base;
+ if (Error E = Hdr.extract(AS, &Offset))
+ return E;
+
+ CUsBase = Offset;
+ Offset += Hdr.CompUnitCount * 4;
+ Offset += Hdr.LocalTypeUnitCount * 4;
+ Offset += Hdr.ForeignTypeUnitCount * 8;
+ BucketsBase = Offset;
+ Offset += Hdr.BucketCount * 4;
+ HashesBase = Offset;
+ if (Hdr.BucketCount > 0)
+ Offset += Hdr.NameCount * 4;
+ StringOffsetsBase = Offset;
+ Offset += Hdr.NameCount * 4;
+ EntryOffsetsBase = Offset;
+ Offset += Hdr.NameCount * 4;
+
+ if (!AS.isValidOffsetForDataOfSize(Offset, Hdr.AbbrevTableSize))
+ return make_error<StringError>(
+ "Section too small: cannot read abbreviations.",
+ inconvertibleErrorCode());
+
+ EntriesBase = Offset + Hdr.AbbrevTableSize;
+
+ for (;;) {
+ auto AbbrevOr = extractAbbrev(&Offset);
+ if (!AbbrevOr)
+ return AbbrevOr.takeError();
+ if (isSentinel(*AbbrevOr))
+ return Error::success();
+
+ if (!Abbrevs.insert(std::move(*AbbrevOr)).second) {
+ return make_error<StringError>("Duplicate abbreviation code.",
+ inconvertibleErrorCode());
+ }
+ }
+}
+DWARFDebugNames::Entry::Entry(const NameIndex &NameIdx, const Abbrev &Abbr)
+ : NameIdx(&NameIdx), Abbr(&Abbr) {
+ // This merely creates form values. It is up to the caller
+ // (NameIndex::getEntry) to populate them.
+ Values.reserve(Abbr.Attributes.size());
+ for (const auto &Attr : Abbr.Attributes)
+ Values.emplace_back(Attr.Form);
+}
+
+Optional<DWARFFormValue>
+DWARFDebugNames::Entry::lookup(dwarf::Index Index) const {
+ assert(Abbr->Attributes.size() == Values.size());
+ for (const auto &Tuple : zip_first(Abbr->Attributes, Values)) {
+ if (std::get<0>(Tuple).Index == Index)
+ return std::get<1>(Tuple);
+ }
+ return None;
+}
+
+Optional<uint64_t> DWARFDebugNames::Entry::getDIEUnitOffset() const {
+ if (Optional<DWARFFormValue> Off = lookup(dwarf::DW_IDX_die_offset))
+ return Off->getAsReferenceUVal();
+ return None;
+}
+
+Optional<uint64_t> DWARFDebugNames::Entry::getCUIndex() const {
+ if (Optional<DWARFFormValue> Off = lookup(dwarf::DW_IDX_compile_unit))
+ return Off->getAsUnsignedConstant();
+ // In a per-CU index, the entries without a DW_IDX_compile_unit attribute
+ // implicitly refer to the single CU.
+ if (NameIdx->getCUCount() == 1)
+ return 0;
+ return None;
+}
+
+Optional<uint64_t> DWARFDebugNames::Entry::getCUOffset() const {
+ Optional<uint64_t> Index = getCUIndex();
+ if (!Index || *Index >= NameIdx->getCUCount())
+ return None;
+ return NameIdx->getCUOffset(*Index);
+}
+
+void DWARFDebugNames::Entry::dump(ScopedPrinter &W) const {
+ W.printHex("Abbrev", Abbr->Code);
+ W.startLine() << formatv("Tag: {0}\n", Abbr->Tag);
+ assert(Abbr->Attributes.size() == Values.size());
+ for (const auto &Tuple : zip_first(Abbr->Attributes, Values)) {
+ W.startLine() << formatv("{0}: ", std::get<0>(Tuple).Index);
+ std::get<1>(Tuple).dump(W.getOStream());
+ W.getOStream() << '\n';
+ }
+}
+
+char DWARFDebugNames::SentinelError::ID;
+std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const {
+ return inconvertibleErrorCode();
+}
+
+uint32_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const {
+ assert(CU < Hdr.CompUnitCount);
+ uint32_t Offset = CUsBase + 4 * CU;
+ return Section.AccelSection.getRelocatedValue(4, &Offset);
+}
+
+uint32_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const {
+ assert(TU < Hdr.LocalTypeUnitCount);
+ uint32_t Offset = CUsBase + Hdr.CompUnitCount * 4;
+ return Section.AccelSection.getRelocatedValue(4, &Offset);
+}
+
+uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const {
+ assert(TU < Hdr.ForeignTypeUnitCount);
+ uint32_t Offset = CUsBase + (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) * 4;
+ return Section.AccelSection.getU64(&Offset);
+}
+
+Expected<DWARFDebugNames::Entry>
+DWARFDebugNames::NameIndex::getEntry(uint32_t *Offset) const {
+ const DWARFDataExtractor &AS = Section.AccelSection;
+ if (!AS.isValidOffset(*Offset))
+ return make_error<StringError>("Incorrectly terminated entry list.",
+ inconvertibleErrorCode());
+
+ uint32_t AbbrevCode = AS.getULEB128(Offset);
+ if (AbbrevCode == 0)
+ return make_error<SentinelError>();
+
+ const auto AbbrevIt = Abbrevs.find_as(AbbrevCode);
+ if (AbbrevIt == Abbrevs.end())
+ return make_error<StringError>("Invalid abbreviation.",
+ inconvertibleErrorCode());
+
+ Entry E(*this, *AbbrevIt);
+
+ dwarf::FormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
+ for (auto &Value : E.Values) {
+ if (!Value.extractValue(AS, Offset, FormParams))
+ return make_error<StringError>("Error extracting index attribute values.",
+ inconvertibleErrorCode());
+ }
+ return std::move(E);
+}
+
+DWARFDebugNames::NameTableEntry
+DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const {
+ assert(0 < Index && Index <= Hdr.NameCount);
+ uint32_t StringOffsetOffset = StringOffsetsBase + 4 * (Index - 1);
+ uint32_t EntryOffsetOffset = EntryOffsetsBase + 4 * (Index - 1);
+ const DWARFDataExtractor &AS = Section.AccelSection;
+
+ uint32_t StringOffset = AS.getRelocatedValue(4, &StringOffsetOffset);
+ uint32_t EntryOffset = AS.getU32(&EntryOffsetOffset);
+ EntryOffset += EntriesBase;
+ return {Section.StringSection, Index, StringOffset, EntryOffset};
+}
+
+uint32_t
+DWARFDebugNames::NameIndex::getBucketArrayEntry(uint32_t Bucket) const {
+ assert(Bucket < Hdr.BucketCount);
+ uint32_t BucketOffset = BucketsBase + 4 * Bucket;
+ return Section.AccelSection.getU32(&BucketOffset);
+}
+
+uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const {
+ assert(0 < Index && Index <= Hdr.NameCount);
+ uint32_t HashOffset = HashesBase + 4 * (Index - 1);
+ return Section.AccelSection.getU32(&HashOffset);
+}
+
+// Returns true if we should continue scanning for entries, false if this is the
+// last (sentinel) entry. In case of a parsing error we also return false, as
+// it's not possible to recover this entry list (but the other lists may still
+// parse OK).
+bool DWARFDebugNames::NameIndex::dumpEntry(ScopedPrinter &W,
+ uint32_t *Offset) const {
+ uint32_t EntryId = *Offset;
+ auto EntryOr = getEntry(Offset);
+ if (!EntryOr) {
+ handleAllErrors(EntryOr.takeError(), [](const SentinelError &) {},
+ [&W](const ErrorInfoBase &EI) { EI.log(W.startLine()); });
+ return false;
+ }
+
+ DictScope EntryScope(W, ("Entry @ 0x" + Twine::utohexstr(EntryId)).str());
+ EntryOr->dump(W);
+ return true;
+}
+
+void DWARFDebugNames::NameIndex::dumpName(ScopedPrinter &W,
+ const NameTableEntry &NTE,
+ Optional<uint32_t> Hash) const {
+ DictScope NameScope(W, ("Name " + Twine(NTE.getIndex())).str());
+ if (Hash)
+ W.printHex("Hash", *Hash);
+
+ W.startLine() << format("String: 0x%08x", NTE.getStringOffset());
+ W.getOStream() << " \"" << NTE.getString() << "\"\n";
+
+ uint32_t EntryOffset = NTE.getEntryOffset();
+ while (dumpEntry(W, &EntryOffset))
+ /*empty*/;
+}
+
+void DWARFDebugNames::NameIndex::dumpCUs(ScopedPrinter &W) const {
+ ListScope CUScope(W, "Compilation Unit offsets");
+ for (uint32_t CU = 0; CU < Hdr.CompUnitCount; ++CU)
+ W.startLine() << format("CU[%u]: 0x%08x\n", CU, getCUOffset(CU));
+}
+
+void DWARFDebugNames::NameIndex::dumpLocalTUs(ScopedPrinter &W) const {
+ if (Hdr.LocalTypeUnitCount == 0)
+ return;
+
+ ListScope TUScope(W, "Local Type Unit offsets");
+ for (uint32_t TU = 0; TU < Hdr.LocalTypeUnitCount; ++TU)
+ W.startLine() << format("LocalTU[%u]: 0x%08x\n", TU, getLocalTUOffset(TU));
+}
+
+void DWARFDebugNames::NameIndex::dumpForeignTUs(ScopedPrinter &W) const {
+ if (Hdr.ForeignTypeUnitCount == 0)
+ return;
+
+ ListScope TUScope(W, "Foreign Type Unit signatures");
+ for (uint32_t TU = 0; TU < Hdr.ForeignTypeUnitCount; ++TU) {
+ W.startLine() << format("ForeignTU[%u]: 0x%016" PRIx64 "\n", TU,
+ getForeignTUSignature(TU));
+ }
+}
+
+void DWARFDebugNames::NameIndex::dumpAbbreviations(ScopedPrinter &W) const {
+ ListScope AbbrevsScope(W, "Abbreviations");
+ for (const auto &Abbr : Abbrevs)
+ Abbr.dump(W);
+}
+
+void DWARFDebugNames::NameIndex::dumpBucket(ScopedPrinter &W,
+ uint32_t Bucket) const {
+ ListScope BucketScope(W, ("Bucket " + Twine(Bucket)).str());
+ uint32_t Index = getBucketArrayEntry(Bucket);
+ if (Index == 0) {
+ W.printString("EMPTY");
+ return;
+ }
+ if (Index > Hdr.NameCount) {
+ W.printString("Name index is invalid");
+ return;
+ }
+
+ for (; Index <= Hdr.NameCount; ++Index) {
+ uint32_t Hash = getHashArrayEntry(Index);
+ if (Hash % Hdr.BucketCount != Bucket)
+ break;
+
+ dumpName(W, getNameTableEntry(Index), Hash);
+ }
+}
+
+LLVM_DUMP_METHOD void DWARFDebugNames::NameIndex::dump(ScopedPrinter &W) const {
+ DictScope UnitScope(W, ("Name Index @ 0x" + Twine::utohexstr(Base)).str());
+ Hdr.dump(W);
+ dumpCUs(W);
+ dumpLocalTUs(W);
+ dumpForeignTUs(W);
+ dumpAbbreviations(W);
+
+ if (Hdr.BucketCount > 0) {
+ for (uint32_t Bucket = 0; Bucket < Hdr.BucketCount; ++Bucket)
+ dumpBucket(W, Bucket);
+ return;
+ }
+
+ W.startLine() << "Hash table not present\n";
+ for (NameTableEntry NTE : *this)
+ dumpName(W, NTE, None);
+}
+
+llvm::Error DWARFDebugNames::extract() {
+ uint32_t Offset = 0;
+ while (AccelSection.isValidOffset(Offset)) {
+ NameIndex Next(*this, Offset);
+ if (llvm::Error E = Next.extract())
+ return E;
+ Offset = Next.getNextUnitOffset();
+ NameIndices.push_back(std::move(Next));
+ }
+ return Error::success();
+}
+
+iterator_range<DWARFDebugNames::ValueIterator>
+DWARFDebugNames::NameIndex::equal_range(StringRef Key) const {
+ return make_range(ValueIterator(*this, Key), ValueIterator());
+}
+
+LLVM_DUMP_METHOD void DWARFDebugNames::dump(raw_ostream &OS) const {
+ ScopedPrinter W(OS);
+ for (const NameIndex &NI : NameIndices)
+ NI.dump(W);
+}
+
+Optional<uint32_t>
+DWARFDebugNames::ValueIterator::findEntryOffsetInCurrentIndex() {
+ const Header &Hdr = CurrentIndex->Hdr;
+ if (Hdr.BucketCount == 0) {
+ // No hash table; we need to search through all names in the Name Index.
+ for (NameTableEntry NTE : *CurrentIndex) {
+ if (NTE.getString() == Key)
+ return NTE.getEntryOffset();
+ }
+ return None;
+ }
+
+ // The Name Index has a Hash Table, so use that to speed up the search.
+ // Compute the Key Hash, if it has not been done already.
+ if (!Hash)
+ Hash = caseFoldingDjbHash(Key);
+ uint32_t Bucket = *Hash % Hdr.BucketCount;
+ uint32_t Index = CurrentIndex->getBucketArrayEntry(Bucket);
+ if (Index == 0)
+ return None; // Empty bucket
+
+ for (; Index <= Hdr.NameCount; ++Index) {
+ uint32_t Hash = CurrentIndex->getHashArrayEntry(Index);
+ if (Hash % Hdr.BucketCount != Bucket)
+ return None; // End of bucket
+
+ NameTableEntry NTE = CurrentIndex->getNameTableEntry(Index);
+ if (NTE.getString() == Key)
+ return NTE.getEntryOffset();
+ }
+ return None;
+}
+
+bool DWARFDebugNames::ValueIterator::getEntryAtCurrentOffset() {
+ auto EntryOr = CurrentIndex->getEntry(&DataOffset);
+ if (!EntryOr) {
+ consumeError(EntryOr.takeError());
+ return false;
+ }
+ CurrentEntry = std::move(*EntryOr);
+ return true;
+}
+
+bool DWARFDebugNames::ValueIterator::findInCurrentIndex() {
+ Optional<uint32_t> Offset = findEntryOffsetInCurrentIndex();
+ if (!Offset)
+ return false;
+ DataOffset = *Offset;
+ return getEntryAtCurrentOffset();
+}
+
+void DWARFDebugNames::ValueIterator::searchFromStartOfCurrentIndex() {
+ for (const NameIndex *End = CurrentIndex->Section.NameIndices.end();
+ CurrentIndex != End; ++CurrentIndex) {
+ if (findInCurrentIndex())
+ return;
+ }
+ setEnd();
+}
+
+void DWARFDebugNames::ValueIterator::next() {
+ assert(CurrentIndex && "Incrementing an end() iterator?");
+
+ // First try the next entry in the current Index.
+ if (getEntryAtCurrentOffset())
+ return;
+
+ // If we're a local iterator or we have reached the last Index, we're done.
+ if (IsLocal || CurrentIndex == &CurrentIndex->Section.NameIndices.back()) {
+ setEnd();
+ return;
+ }
+
+ // Otherwise, try the next index.
+ ++CurrentIndex;
+ searchFromStartOfCurrentIndex();
+}
+
+DWARFDebugNames::ValueIterator::ValueIterator(const DWARFDebugNames &AccelTable,
+ StringRef Key)
+ : CurrentIndex(AccelTable.NameIndices.begin()), IsLocal(false), Key(Key) {
+ searchFromStartOfCurrentIndex();
+}
+
+DWARFDebugNames::ValueIterator::ValueIterator(
+ const DWARFDebugNames::NameIndex &NI, StringRef Key)
+ : CurrentIndex(&NI), IsLocal(true), Key(Key) {
+ if (!findInCurrentIndex())
+ setEnd();
+}
+
+iterator_range<DWARFDebugNames::ValueIterator>
+DWARFDebugNames::equal_range(StringRef Key) const {
+ if (NameIndices.empty())
+ return make_range(ValueIterator(), ValueIterator());
+ return make_range(ValueIterator(*this, Key), ValueIterator());
+}
+
+const DWARFDebugNames::NameIndex *
+DWARFDebugNames::getCUNameIndex(uint32_t CUOffset) {
+ if (CUToNameIndex.size() == 0 && NameIndices.size() > 0) {
+ for (const auto &NI : *this) {
+ for (uint32_t CU = 0; CU < NI.getCUCount(); ++CU)
+ CUToNameIndex.try_emplace(NI.getCUOffset(CU), &NI);
+ }
+ }
+ return CUToNameIndex.lookup(CUOffset);
+}
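
The name lookup added above follows the accelerator-table hashing scheme: hash the key, pick a bucket with hash % BucketCount, read that bucket's first (1-based) name index, then walk the hash array until a hash lands in a different bucket, comparing strings along the way. A self-contained sketch of that bucket walk over a flat in-memory stand-in for the table; the hash is the plain DJB hash, whereas the DWARF v5 code above uses caseFoldingDjbHash:

#include <cstdint>
#include <string>
#include <vector>

// DJB hash as used by the Apple tables; DWARF v5 .debug_names uses a
// case-folding variant, which is omitted here for brevity.
static uint32_t djbHash(const std::string &S) {
  uint32_t H = 5381;
  for (unsigned char C : S)
    H = H * 33 + C;
  return H;
}

struct NameIndex {
  uint32_t BucketCount;
  std::vector<uint32_t> Buckets; // 1-based index into Hashes/Names, 0 = empty
  std::vector<uint32_t> Hashes;  // parallel to Names
  std::vector<std::string> Names;
};

// Returns the 1-based name index for Key, or 0 if it is not present.
static uint32_t lookup(const NameIndex &NI, const std::string &Key) {
  uint32_t Hash = djbHash(Key);
  uint32_t Bucket = Hash % NI.BucketCount;
  uint32_t Index = NI.Buckets[Bucket];
  if (Index == 0)
    return 0; // empty bucket
  for (; Index <= NI.Hashes.size(); ++Index) {
    // Names belonging to one bucket are stored contiguously in the hash
    // array; stop as soon as a hash falls into a different bucket.
    if (NI.Hashes[Index - 1] % NI.BucketCount != Bucket)
      return 0;
    if (NI.Names[Index - 1] == Key)
      return Index;
  }
  return 0;
}
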
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp
new file mode 100644
index 000000000000..86c8d19c02f4
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp
@@ -0,0 +1,29 @@
+//===- DWARFAddressRange.cpp ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
+
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFAddressRange::dump(raw_ostream &OS, uint32_t AddressSize,
+ DIDumpOptions DumpOpts) const {
+
+ OS << (DumpOpts.DisplayRawContents ? " " : "[");
+ OS << format("0x%*.*" PRIx64 ", ", AddressSize * 2, AddressSize * 2, LowPC)
+ << format("0x%*.*" PRIx64, AddressSize * 2, AddressSize * 2, HighPC);
+ OS << (DumpOpts.DisplayRawContents ? "" : ")");
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const DWARFAddressRange &R) {
+ R.dump(OS, /* AddressSize */ 8);
+ return OS;
+}
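
DWARFAddressRange::dump above derives the hex field width from the unit's address size: width and precision are both AddressSize * 2, and the precision zero-pads the value. A tiny printf illustration of that "%*.*" usage with made-up addresses:

#include <cinttypes>
#include <cstdio>

int main() {
  uint64_t LowPC = 0x1000, HighPC = 0x1080;
  int AddressSize = 4; // bytes, as reported by the unit
  // Width and precision are both AddressSize * 2; the precision zero-pads,
  // producing the "[0x00001000, 0x00001080)" style shown above.
  std::printf("[0x%*.*" PRIx64 ", 0x%*.*" PRIx64 ")\n",
              AddressSize * 2, AddressSize * 2, LowPC,
              AddressSize * 2, AddressSize * 2, HighPC);
  return 0;
}
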
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index 43b235621d18..00a23b3898fa 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -22,9 +22,10 @@ void DWARFCompileUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
if (getVersion() >= 5)
OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
- << " addr_size = " << format("0x%02x", getAddressByteSize())
- << " (next unit at " << format("0x%08x", getNextUnitOffset())
- << ")\n";
+ << " addr_size = " << format("0x%02x", getAddressByteSize());
+ if (getVersion() >= 5 && getUnitType() != dwarf::DW_UT_compile)
+ OS << " DWO_id = " << format("0x%016" PRIx64, *getDWOId());
+ OS << " (next unit at " << format("0x%08x", getNextUnitOffset()) << ")\n";
if (DWARFDie CUDie = getUnitDIE(false))
CUDie.dump(OS, 0, DumpOpts);
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index eb23ca8229a3..da13c5047f77 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -25,6 +25,7 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
@@ -43,9 +44,11 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
+#include <deque>
#include <map>
#include <string>
#include <utility>
@@ -82,7 +85,8 @@ static void dumpUUID(raw_ostream &OS, const ObjectFile &Obj) {
OS << "UUID: ";
memcpy(&UUID, LC.Ptr+sizeof(LC.C), sizeof(UUID));
OS.write_uuid(UUID);
- OS << ' ' << MachO->getFileFormatName();
+ Triple T = MachO->getArchTriple();
+ OS << " (" << T.getArchName() << ')';
OS << ' ' << MachO->getFileName() << '\n';
}
}
@@ -106,12 +110,12 @@ collectContributionData(DWARFContext::cu_iterator_range CUs,
// Sort the contributions so that any invalid ones are placed at
// the start of the contributions vector. This way they are reported
// first.
- std::sort(Contributions.begin(), Contributions.end(),
- [](const Optional<StrOffsetsContributionDescriptor> &L,
- const Optional<StrOffsetsContributionDescriptor> &R) {
- if (L && R) return L->Base < R->Base;
- return R.hasValue();
- });
+ llvm::sort(Contributions.begin(), Contributions.end(),
+ [](const Optional<StrOffsetsContributionDescriptor> &L,
+ const Optional<StrOffsetsContributionDescriptor> &R) {
+ if (L && R) return L->Base < R->Base;
+ return R.hasValue();
+ });
// Uniquify contributions, as it is possible that units (specifically
// type units in dwo or dwp files) share contributions. We don't want
@@ -169,7 +173,11 @@ static void dumpDWARFv5StringOffsetsSection(
OS << (ContributionHeader - Offset) << "\n";
}
OS << format("0x%8.8x: ", (uint32_t)ContributionHeader);
- OS << "Contribution size = " << Contribution->Size
+ // In DWARF v5 the contribution size in the descriptor does not equal
+ // the originally encoded length (it does not contain the length of the
+ // version field and the padding, a total of 4 bytes). Add them back in
+ // for reporting.
+ OS << "Contribution size = " << (Contribution->Size + (Version < 5 ? 0 : 4))
<< ", Format = " << (Format == DWARF32 ? "DWARF32" : "DWARF64")
<< ", Version = " << Version << "\n";
@@ -241,26 +249,26 @@ static void dumpStringOffsetsSection(
}
}
-// We want to supply the Unit associated with a .debug_line[.dwo] table when
-// we dump it, if possible, but still dump the table even if there isn't a Unit.
-// Therefore, collect up handles on all the Units that point into the
-// line-table section.
-typedef std::map<uint64_t, DWARFUnit *> LineToUnitMap;
-
-static LineToUnitMap
-buildLineToUnitMap(DWARFContext::cu_iterator_range CUs,
- DWARFContext::tu_section_iterator_range TUSections) {
- LineToUnitMap LineToUnit;
- for (const auto &CU : CUs)
- if (auto CUDIE = CU->getUnitDIE())
- if (auto StmtOffset = toSectionOffset(CUDIE.find(DW_AT_stmt_list)))
- LineToUnit.insert(std::make_pair(*StmtOffset, &*CU));
- for (const auto &TUS : TUSections)
- for (const auto &TU : TUS)
- if (auto TUDIE = TU->getUnitDIE())
- if (auto StmtOffset = toSectionOffset(TUDIE.find(DW_AT_stmt_list)))
- LineToUnit.insert(std::make_pair(*StmtOffset, &*TU));
- return LineToUnit;
+// Dump the .debug_rnglists or .debug_rnglists.dwo section (DWARF v5).
+static void dumpRnglistsSection(raw_ostream &OS,
+ DWARFDataExtractor &rnglistData,
+ DIDumpOptions DumpOpts) {
+ uint32_t Offset = 0;
+ while (rnglistData.isValidOffset(Offset)) {
+ llvm::DWARFDebugRnglistTable Rnglists;
+ uint32_t TableOffset = Offset;
+ if (Error Err = Rnglists.extract(rnglistData, &Offset)) {
+ WithColor::error() << toString(std::move(Err)) << '\n';
+ uint64_t Length = Rnglists.length();
+ // Keep going after an error, if we can, assuming that the length field
+ // could be read. If it couldn't, stop reading the section.
+ if (Length == 0)
+ break;
+ Offset = TableOffset + Length;
+ } else {
+ Rnglists.dump(OS, DumpOpts);
+ }
+ }
}
void DWARFContext::dump(
@@ -347,11 +355,11 @@ void DWARFContext::dump(
if (shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame,
DObj->getDebugFrameSection()))
- getDebugFrame()->dump(OS, DumpOffset);
+ getDebugFrame()->dump(OS, getRegisterInfo(), DumpOffset);
if (shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame,
DObj->getEHFrameSection()))
- getEHFrame()->dump(OS, DumpOffset);
+ getEHFrame()->dump(OS, getRegisterInfo(), DumpOffset);
if (DumpType & DIDT_DebugMacro) {
if (Explicit || !getDebugMacro()->empty()) {
@@ -369,63 +377,39 @@ void DWARFContext::dump(
set.dump(OS);
}
- if (shouldDump(Explicit, ".debug_line", DIDT_ID_DebugLine,
- DObj->getLineSection().Data)) {
- LineToUnitMap LineToUnit =
- buildLineToUnitMap(compile_units(), type_unit_sections());
- unsigned Offset = 0;
- DWARFDataExtractor LineData(*DObj, DObj->getLineSection(), isLittleEndian(),
- 0);
- while (Offset < LineData.getData().size()) {
- DWARFUnit *U = nullptr;
- auto It = LineToUnit.find(Offset);
- if (It != LineToUnit.end())
- U = It->second;
- LineData.setAddressSize(U ? U->getAddressByteSize() : 0);
- DWARFDebugLine::LineTable LineTable;
- if (DumpOffset && Offset != *DumpOffset) {
- // Find the size of this part of the line table section and skip it.
- unsigned OldOffset = Offset;
- LineTable.Prologue.parse(LineData, &Offset, U);
- Offset = OldOffset + LineTable.Prologue.TotalLength +
- LineTable.Prologue.sizeofTotalLength();
+ auto DumpLineSection = [&](DWARFDebugLine::SectionParser Parser,
+ DIDumpOptions DumpOpts) {
+ while (!Parser.done()) {
+ if (DumpOffset && Parser.getOffset() != *DumpOffset) {
+ Parser.skip();
continue;
}
- // Verbose dumping is done during parsing and not on the intermediate
- // representation.
- OS << "debug_line[" << format("0x%8.8x", Offset) << "]\n";
- unsigned OldOffset = Offset;
+ OS << "debug_line[" << format("0x%8.8x", Parser.getOffset()) << "]\n";
if (DumpOpts.Verbose) {
- LineTable.parse(LineData, &Offset, U, &OS);
+ Parser.parseNext(DWARFDebugLine::warn, DWARFDebugLine::warn, &OS);
} else {
- LineTable.parse(LineData, &Offset, U);
- LineTable.dump(OS);
+ DWARFDebugLine::LineTable LineTable = Parser.parseNext();
+ LineTable.dump(OS, DumpOpts);
}
- // Check for unparseable prologue, to avoid infinite loops.
- if (OldOffset == Offset)
- break;
}
+ };
+
+ if (shouldDump(Explicit, ".debug_line", DIDT_ID_DebugLine,
+ DObj->getLineSection().Data)) {
+ DWARFDataExtractor LineData(*DObj, DObj->getLineSection(), isLittleEndian(),
+ 0);
+ DWARFDebugLine::SectionParser Parser(LineData, *this, compile_units(),
+ type_unit_sections());
+ DumpLineSection(Parser, DumpOpts);
}
if (shouldDump(ExplicitDWO, ".debug_line.dwo", DIDT_ID_DebugLine,
DObj->getLineDWOSection().Data)) {
- LineToUnitMap LineToUnit =
- buildLineToUnitMap(dwo_compile_units(), dwo_type_unit_sections());
- unsigned Offset = 0;
DWARFDataExtractor LineData(*DObj, DObj->getLineDWOSection(),
isLittleEndian(), 0);
- while (Offset < LineData.getData().size()) {
- DWARFUnit *U = nullptr;
- auto It = LineToUnit.find(Offset);
- if (It != LineToUnit.end())
- U = It->second;
- DWARFDebugLine::LineTable LineTable;
- unsigned OldOffset = Offset;
- if (!LineTable.Prologue.parse(LineData, &Offset, U))
- break;
- if (!DumpOffset || OldOffset == *DumpOffset)
- LineTable.dump(OS);
- }
+ DWARFDebugLine::SectionParser Parser(LineData, *this, dwo_compile_units(),
+ dwo_type_unit_sections());
+ DumpLineSection(Parser, DumpOpts);
}
if (shouldDump(Explicit, ".debug_cu_index", DIDT_ID_DebugCUIndex,
@@ -458,6 +442,18 @@ void DWARFContext::dump(
strDWOOffset = offset;
}
}
+ if (shouldDump(Explicit, ".debug_line_str", DIDT_ID_DebugLineStr,
+ DObj->getLineStringSection())) {
+ DataExtractor strData(DObj->getLineStringSection(), isLittleEndian(), 0);
+ uint32_t offset = 0;
+ uint32_t strOffset = 0;
+ while (const char *s = strData.getCStr(&offset)) {
+ OS << format("0x%8.8x: \"", strOffset);
+ OS.write_escaped(s);
+ OS << "\"\n";
+ strOffset = offset;
+ }
+ }
if (shouldDump(Explicit, ".debug_ranges", DIDT_ID_DebugRanges,
DObj->getRangeSection().Data)) {
@@ -475,8 +471,27 @@ void DWARFContext::dump(
isLittleEndian(), savedAddressByteSize);
uint32_t offset = 0;
DWARFDebugRangeList rangeList;
- while (rangeList.extract(rangesData, &offset))
+ while (rangesData.isValidOffset(offset)) {
+ if (Error E = rangeList.extract(rangesData, &offset)) {
+ WithColor::error() << toString(std::move(E)) << '\n';
+ break;
+ }
rangeList.dump(OS);
+ }
+ }
+
+ if (shouldDump(Explicit, ".debug_rnglists", DIDT_ID_DebugRnglists,
+ DObj->getRnglistsSection().Data)) {
+ DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsSection(),
+ isLittleEndian(), 0);
+ dumpRnglistsSection(OS, RnglistData, DumpOpts);
+ }
+
+ if (shouldDump(ExplicitDWO, ".debug_rnglists.dwo", DIDT_ID_DebugRnglists,
+ DObj->getRnglistsDWOSection().Data)) {
+ DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsDWOSection(),
+ isLittleEndian(), 0);
+ dumpRnglistsSection(OS, RnglistData, DumpOpts);
}
if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames,
@@ -534,6 +549,9 @@ void DWARFContext::dump(
if (shouldDump(Explicit, ".apple_objc", DIDT_ID_AppleObjC,
DObj->getAppleObjCSection().Data))
getAppleObjC().dump(OS);
+ if (shouldDump(Explicit, ".debug_names", DIDT_ID_DebugNames,
+ DObj->getDebugNamesSection().Data))
+ getDebugNames().dump(OS);
}
DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) {
@@ -549,9 +567,19 @@ DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) {
// probably only one unless this is something like LTO - though an in-process
// built/cached lookup table could be used in that case to improve repeated
// lookups of different CUs in the DWO.
- for (const auto &DWOCU : dwo_compile_units())
+ for (const auto &DWOCU : dwo_compile_units()) {
+ // Might not have parsed DWO ID yet.
+ if (!DWOCU->getDWOId()) {
+ if (Optional<uint64_t> DWOId =
+ toUnsigned(DWOCU->getUnitDIE().find(DW_AT_GNU_dwo_id)))
+ DWOCU->setDWOId(*DWOId);
+ else
+ // No DWO ID?
+ continue;
+ }
if (DWOCU->getDWOId() == Hash)
return DWOCU.get();
+ }
return nullptr;
}
@@ -633,7 +661,7 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() {
return Loc.get();
Loc.reset(new DWARFDebugLoc);
- // assume all compile units have the same address byte size
+ // Assume all compile units have the same address byte size.
if (getNumCompileUnits()) {
DWARFDataExtractor LocData(*DObj, DObj->getLocSection(), isLittleEndian(),
getCompileUnitAtIndex(0)->getAddressByteSize());
@@ -646,9 +674,13 @@ const DWARFDebugLocDWO *DWARFContext::getDebugLocDWO() {
if (LocDWO)
return LocDWO.get();
- DataExtractor LocData(DObj->getLocDWOSection().Data, isLittleEndian(), 0);
LocDWO.reset(new DWARFDebugLocDWO());
- LocDWO->parse(LocData);
+ // Assume all compile units have the same address byte size.
+ if (getNumCompileUnits()) {
+ DataExtractor LocData(DObj->getLocDWOSection().Data, isLittleEndian(),
+ getCompileUnitAtIndex(0)->getAddressByteSize());
+ LocDWO->parse(LocData);
+ }
return LocDWO.get();
}
@@ -674,8 +706,8 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() {
// provides this information). This problem is fixed in DWARFv4
// See this dwarf-discuss discussion for more details:
// http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
- DataExtractor debugFrameData(DObj->getDebugFrameSection(), isLittleEndian(),
- DObj->getAddressSize());
+ DWARFDataExtractor debugFrameData(DObj->getDebugFrameSection(),
+ isLittleEndian(), DObj->getAddressSize());
DebugFrame.reset(new DWARFDebugFrame(false /* IsEH */));
DebugFrame->parse(debugFrameData);
return DebugFrame.get();
@@ -685,8 +717,8 @@ const DWARFDebugFrame *DWARFContext::getEHFrame() {
if (EHFrame)
return EHFrame.get();
- DataExtractor debugFrameData(DObj->getEHFrameSection(), isLittleEndian(),
- DObj->getAddressSize());
+ DWARFDataExtractor debugFrameData(DObj->getEHFrameSection(), isLittleEndian(),
+ DObj->getAddressSize());
DebugFrame.reset(new DWARFDebugFrame(true /* IsEH */));
DebugFrame->parse(debugFrameData);
return DebugFrame.get();
@@ -702,43 +734,59 @@ const DWARFDebugMacro *DWARFContext::getDebugMacro() {
return Macro.get();
}
-static DWARFAcceleratorTable &
-getAccelTable(std::unique_ptr<DWARFAcceleratorTable> &Cache,
- const DWARFObject &Obj, const DWARFSection &Section,
- StringRef StringSection, bool IsLittleEndian) {
+template <typename T>
+static T &getAccelTable(std::unique_ptr<T> &Cache, const DWARFObject &Obj,
+ const DWARFSection &Section, StringRef StringSection,
+ bool IsLittleEndian) {
if (Cache)
return *Cache;
DWARFDataExtractor AccelSection(Obj, Section, IsLittleEndian, 0);
DataExtractor StrData(StringSection, IsLittleEndian, 0);
- Cache.reset(new DWARFAcceleratorTable(AccelSection, StrData));
+ Cache.reset(new T(AccelSection, StrData));
if (Error E = Cache->extract())
llvm::consumeError(std::move(E));
return *Cache;
}
-const DWARFAcceleratorTable &DWARFContext::getAppleNames() {
+const DWARFDebugNames &DWARFContext::getDebugNames() {
+ return getAccelTable(Names, *DObj, DObj->getDebugNamesSection(),
+ DObj->getStringSection(), isLittleEndian());
+}
+
+const AppleAcceleratorTable &DWARFContext::getAppleNames() {
return getAccelTable(AppleNames, *DObj, DObj->getAppleNamesSection(),
DObj->getStringSection(), isLittleEndian());
}
-const DWARFAcceleratorTable &DWARFContext::getAppleTypes() {
+const AppleAcceleratorTable &DWARFContext::getAppleTypes() {
return getAccelTable(AppleTypes, *DObj, DObj->getAppleTypesSection(),
DObj->getStringSection(), isLittleEndian());
}
-const DWARFAcceleratorTable &DWARFContext::getAppleNamespaces() {
+const AppleAcceleratorTable &DWARFContext::getAppleNamespaces() {
return getAccelTable(AppleNamespaces, *DObj,
DObj->getAppleNamespacesSection(),
DObj->getStringSection(), isLittleEndian());
}
-const DWARFAcceleratorTable &DWARFContext::getAppleObjC() {
+const AppleAcceleratorTable &DWARFContext::getAppleObjC() {
return getAccelTable(AppleObjC, *DObj, DObj->getAppleObjCSection(),
DObj->getStringSection(), isLittleEndian());
}
-const DWARFLineTable *
+const DWARFDebugLine::LineTable *
DWARFContext::getLineTableForUnit(DWARFUnit *U) {
+ Expected<const DWARFDebugLine::LineTable *> ExpectedLineTable =
+ getLineTableForUnit(U, DWARFDebugLine::warn);
+ if (!ExpectedLineTable) {
+ DWARFDebugLine::warn(ExpectedLineTable.takeError());
+ return nullptr;
+ }
+ return *ExpectedLineTable;
+}
+
+Expected<const DWARFDebugLine::LineTable *> DWARFContext::getLineTableForUnit(
+ DWARFUnit *U, std::function<void(Error)> RecoverableErrorCallback) {
if (!Line)
Line.reset(new DWARFDebugLine);
@@ -762,7 +810,8 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) {
// We have to parse it first.
DWARFDataExtractor lineData(*DObj, U->getLineSection(), isLittleEndian(),
U->getAddressByteSize());
- return Line->getOrParseLineTable(lineData, stmtOffset, U);
+ return Line->getOrParseLineTable(lineData, stmtOffset, *this, U,
+ RecoverableErrorCallback);
}
void DWARFContext::parseCompileUnits() {
@@ -1119,7 +1168,7 @@ static bool isRelocScattered(const object::ObjectFile &Obj,
}
ErrorPolicy DWARFContext::defaultErrorHandler(Error E) {
- errs() << "error: " + toString(std::move(E)) << '\n';
+ WithColor::error() << toString(std::move(E)) << '\n';
return ErrorPolicy::Continue;
}
@@ -1145,17 +1194,20 @@ class DWARFObjInMemory final : public DWARFObject {
DWARFSectionMap LocSection;
DWARFSectionMap LineSection;
DWARFSectionMap RangeSection;
+ DWARFSectionMap RnglistsSection;
DWARFSectionMap StringOffsetSection;
DWARFSectionMap InfoDWOSection;
DWARFSectionMap LineDWOSection;
DWARFSectionMap LocDWOSection;
DWARFSectionMap StringOffsetDWOSection;
DWARFSectionMap RangeDWOSection;
+ DWARFSectionMap RnglistsDWOSection;
DWARFSectionMap AddrSection;
DWARFSectionMap AppleNamesSection;
DWARFSectionMap AppleTypesSection;
DWARFSectionMap AppleNamespacesSection;
DWARFSectionMap AppleObjCSection;
+ DWARFSectionMap DebugNamesSection;
DWARFSectionMap *mapNameToDWARFSection(StringRef Name) {
return StringSwitch<DWARFSectionMap *>(Name)
@@ -1164,9 +1216,12 @@ class DWARFObjInMemory final : public DWARFObject {
.Case("debug_line", &LineSection)
.Case("debug_str_offsets", &StringOffsetSection)
.Case("debug_ranges", &RangeSection)
+ .Case("debug_rnglists", &RnglistsSection)
.Case("debug_info.dwo", &InfoDWOSection)
.Case("debug_loc.dwo", &LocDWOSection)
.Case("debug_line.dwo", &LineDWOSection)
+ .Case("debug_names", &DebugNamesSection)
+ .Case("debug_rnglists.dwo", &RnglistsDWOSection)
.Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
.Case("debug_addr", &AddrSection)
.Case("apple_names", &AppleNamesSection)
@@ -1192,8 +1247,11 @@ class DWARFObjInMemory final : public DWARFObject {
StringRef CUIndexSection;
StringRef GdbIndexSection;
StringRef TUIndexSection;
+ StringRef LineStringSection;
- SmallVector<SmallString<32>, 4> UncompressedSections;
+ // A deque holding section data whose iterators are not invalidated when
+ // new decompressed sections are inserted at the end.
+ std::deque<SmallString<0>> UncompressedSections;
StringRef *mapSectionToMember(StringRef Name) {
if (DWARFSection *Sec = mapNameToDWARFSection(Name))
@@ -1214,6 +1272,7 @@ class DWARFObjInMemory final : public DWARFObject {
.Case("debug_cu_index", &CUIndexSection)
.Case("debug_tu_index", &TUIndexSection)
.Case("gdb_index", &GdbIndexSection)
+ .Case("debug_line_str", &LineStringSection)
// Any more debug info sections go here.
.Default(nullptr);
}
@@ -1230,11 +1289,11 @@ class DWARFObjInMemory final : public DWARFObject {
if (!Decompressor)
return Decompressor.takeError();
- SmallString<32> Out;
+ SmallString<0> Out;
if (auto Err = Decompressor->resizeAndDecompress(Out))
return Err;
- UncompressedSections.emplace_back(std::move(Out));
+ UncompressedSections.push_back(std::move(Out));
Data = UncompressedSections.back();
return Error::success();
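
The SmallVector-to-std::deque switch above is about reference stability: each decompressed section hands out a StringRef into the stored buffer, and deque::push_back never relocates existing elements, so references taken earlier stay valid as more sections are decompressed. A minimal standalone illustration of that property, using std::string and std::string_view instead of LLVM's SmallString and StringRef:

#include <deque>
#include <string>
#include <string_view>
#include <vector>

// Views handed out for earlier buffers must stay valid while new buffers are
// appended. deque::push_back never moves existing elements, unlike
// vector::push_back, which may reallocate and relocate all of them.
std::deque<std::string> Storage;
std::vector<std::string_view> Views;

std::string_view addSection(std::string Data) {
  Storage.push_back(std::move(Data));
  Views.push_back(Storage.back()); // safe: earlier strings never move
  return Views.back();
}
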
@@ -1423,6 +1482,9 @@ public:
const DWARFSection &getRangeDWOSection() const override {
return RangeDWOSection;
}
+ const DWARFSection &getRnglistsDWOSection() const override {
+ return RnglistsDWOSection;
+ }
const DWARFSection &getAddrSection() const override { return AddrSection; }
StringRef getCUIndexSection() const override { return CUIndexSection; }
StringRef getGdbIndexSection() const override { return GdbIndexSection; }
@@ -1432,6 +1494,7 @@ public:
const DWARFSection &getStringOffsetSection() const override {
return StringOffsetSection;
}
+ StringRef getLineStringSection() const override { return LineStringSection; }
// Sections for DWARF5 split dwarf proposal.
const DWARFSection &getInfoDWOSection() const override {
@@ -1451,6 +1514,9 @@ public:
const DWARFSection &getLineSection() const override { return LineSection; }
StringRef getStringSection() const override { return StringSection; }
const DWARFSection &getRangeSection() const override { return RangeSection; }
+ const DWARFSection &getRnglistsSection() const override {
+ return RnglistsSection;
+ }
StringRef getMacinfoSection() const override { return MacinfoSection; }
StringRef getPubNamesSection() const override { return PubNamesSection; }
StringRef getPubTypesSection() const override { return PubTypesSection; }
@@ -1472,6 +1538,9 @@ public:
const DWARFSection &getAppleObjCSection() const override {
return AppleObjCSection;
}
+ const DWARFSection &getDebugNamesSection() const override {
+ return DebugNamesSection;
+ }
StringRef getFileName() const override { return FileName; }
uint8_t getAddressSize() const override { return AddressSize; }
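
The getAccelTable change near the top of this file turns a concrete helper into a template so the Apple tables and the new DWARF v5 .debug_names table share one construct-once-and-cache accessor. A freestanding sketch of that pattern; the types and section numbers below are illustrative stand-ins, not LLVM's:

#include <memory>
#include <utility>

// Build T from the given arguments on first use; return the cached instance
// on every later call.
template <typename T, typename... Args>
T &getCached(std::unique_ptr<T> &Cache, Args &&... A) {
  if (!Cache)
    Cache.reset(new T(std::forward<Args>(A)...));
  return *Cache;
}

struct AppleTable {
  explicit AppleTable(int Section) : Section(Section) {}
  int Section;
};

struct DebugNamesTable {
  explicit DebugNamesTable(int Section) : Section(Section) {}
  int Section;
};

struct Context {
  std::unique_ptr<AppleTable> AppleNames;
  std::unique_ptr<DebugNamesTable> Names;
  AppleTable &getAppleNames() { return getCached(AppleNames, 1); }
  DebugNamesTable &getDebugNames() { return getCached(Names, 2); }
};
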
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
index 861dd313fb09..03e317461396 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
using namespace llvm;
@@ -25,3 +26,71 @@ uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint32_t *Off,
*SecNdx = Rel->SectionIndex;
return getUnsigned(Off, Size) + Rel->Value;
}
+
+Optional<uint64_t>
+DWARFDataExtractor::getEncodedPointer(uint32_t *Offset, uint8_t Encoding,
+ uint64_t PCRelOffset) const {
+ if (Encoding == dwarf::DW_EH_PE_omit)
+ return None;
+
+ uint64_t Result = 0;
+ uint32_t OldOffset = *Offset;
+ // First get value
+ switch (Encoding & 0x0F) {
+ case dwarf::DW_EH_PE_absptr:
+ switch (getAddressSize()) {
+ case 2:
+ case 4:
+ case 8:
+ Result = getUnsigned(Offset, getAddressSize());
+ break;
+ default:
+ return None;
+ }
+ break;
+ case dwarf::DW_EH_PE_uleb128:
+ Result = getULEB128(Offset);
+ break;
+ case dwarf::DW_EH_PE_sleb128:
+ Result = getSLEB128(Offset);
+ break;
+ case dwarf::DW_EH_PE_udata2:
+ Result = getUnsigned(Offset, 2);
+ break;
+ case dwarf::DW_EH_PE_udata4:
+ Result = getUnsigned(Offset, 4);
+ break;
+ case dwarf::DW_EH_PE_udata8:
+ Result = getUnsigned(Offset, 8);
+ break;
+ case dwarf::DW_EH_PE_sdata2:
+ Result = getSigned(Offset, 2);
+ break;
+ case dwarf::DW_EH_PE_sdata4:
+ Result = getSigned(Offset, 4);
+ break;
+ case dwarf::DW_EH_PE_sdata8:
+ Result = getSigned(Offset, 8);
+ break;
+ default:
+ return None;
+ }
+ // Then add relative offset, if required
+ switch (Encoding & 0x70) {
+ case dwarf::DW_EH_PE_absptr:
+ // do nothing
+ break;
+ case dwarf::DW_EH_PE_pcrel:
+ Result += PCRelOffset;
+ break;
+ case dwarf::DW_EH_PE_datarel:
+ case dwarf::DW_EH_PE_textrel:
+ case dwarf::DW_EH_PE_funcrel:
+ case dwarf::DW_EH_PE_aligned:
+ default:
+ *Offset = OldOffset;
+ return None;
+ }
+
+ return Result;
+}
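
getEncodedPointer above splits an exception-handling pointer encoding in two: the low nibble selects how the raw value is read (absptr, uleb128, udataN, sdataN), and the 0x70 bits say how the value is applied, with only absolute and PC-relative accepted. The sketch below covers just the application half on a value that has already been read; the DW_EH_PE constants are the standard DWARF values, spelled out here as an assumption rather than pulled from an LLVM header:

#include <cstdint>
#include <optional>

// Standard DWARF EH pointer-encoding constants: the low nibble is the value
// format, the 0x70 bits describe how the decoded value is applied.
constexpr uint8_t DW_EH_PE_omit  = 0xff;
constexpr uint8_t DW_EH_PE_pcrel = 0x10;

std::optional<uint64_t> applyEncoding(uint8_t Encoding, uint64_t RawValue,
                                      uint64_t PCRelBase) {
  if (Encoding == DW_EH_PE_omit)
    return std::nullopt;      // no pointer is present at all
  switch (Encoding & 0x70) {
  case 0x00:                  // absolute: use the value as-is
    return RawValue;
  case DW_EH_PE_pcrel:        // relative to the address the field was read from
    return RawValue + PCRelBase;
  default:                    // datarel/textrel/funcrel/aligned
    return std::nullopt;      // unsupported, mirroring the parser above
  }
}
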
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
index ed5d726ae4e2..b9ef6905912a 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
@@ -17,6 +17,13 @@
using namespace llvm;
+void DWARFDebugArangeSet::Descriptor::dump(raw_ostream &OS,
+ uint32_t AddressSize) const {
+ OS << format("[0x%*.*" PRIx64 ", ", AddressSize * 2, AddressSize * 2, Address)
+ << format(" 0x%*.*" PRIx64 ")", AddressSize * 2, AddressSize * 2,
+ getEndAddress());
+}
+
void DWARFDebugArangeSet::clear() {
Offset = -1U;
std::memset(&HeaderData, 0, sizeof(Header));
@@ -98,10 +105,8 @@ void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
<< format("cu_offset = 0x%8.8x, addr_size = 0x%2.2x, seg_size = 0x%2.2x\n",
HeaderData.CuOffset, HeaderData.AddrSize, HeaderData.SegSize);
- const uint32_t hex_width = HeaderData.AddrSize * 2;
for (const auto &Desc : ArangeDescriptors) {
- OS << format("[0x%*.*" PRIx64 " -", hex_width, hex_width, Desc.Address)
- << format(" 0x%*.*" PRIx64 ")\n",
- hex_width, hex_width, Desc.getEndAddress());
+ Desc.dump(OS, HeaderData.AddrSize);
+ OS << '\n';
}
}
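
Descriptor::dump prints each arange entry as a half-open interval, padding both addresses to the hex width implied by the unit's address size. The same idea in plain printf form, as a quick illustration:

#include <cstdint>
#include <cstdio>

// Print [Begin, End) with each address padded to AddrSize * 2 hex digits
// (8 digits for 4-byte addresses, 16 for 8-byte addresses).
void printRange(uint64_t Begin, uint64_t End, unsigned AddrSize) {
  int Width = static_cast<int>(AddrSize) * 2;
  std::printf("[0x%0*llx, 0x%0*llx)\n",
              Width, static_cast<unsigned long long>(Begin),
              Width, static_cast<unsigned long long>(End));
}
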
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index a3ecb15e3661..19bfcaed2021 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -80,7 +80,7 @@ void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC,
void DWARFDebugAranges::construct() {
std::multiset<uint32_t> ValidCUs; // Maintain the set of CUs describing
// a current address range.
- std::sort(Endpoints.begin(), Endpoints.end());
+ llvm::sort(Endpoints.begin(), Endpoints.end());
uint64_t PrevAddress = -1ULL;
for (const auto &E : Endpoints) {
if (PrevAddress < E.Address && !ValidCUs.empty()) {
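
Only the sort call changes in this file, but the surrounding construct() shows the technique: range endpoints are sorted and swept in address order while a multiset tracks which compile units cover the current address. A compact standalone sketch of that sweep, with simplified types that are not LLVM's:

#include <algorithm>
#include <cstdint>
#include <set>
#include <vector>

struct Endpoint {
  uint64_t Address;
  uint32_t CUOffset;
  bool IsStart; // true: a range opens at Address, false: a range closes here
  bool operator<(const Endpoint &O) const { return Address < O.Address; }
};

struct Covered {
  uint64_t Lo, Hi;
  uint32_t CUOffset;
};

// Flatten per-CU [Lo, Hi) ranges into the intervals that are actually covered.
std::vector<Covered> sweep(std::vector<Endpoint> Endpoints) {
  std::sort(Endpoints.begin(), Endpoints.end());
  std::multiset<uint32_t> Active; // CUs covering the current sweep address
  std::vector<Covered> Out;
  uint64_t Prev = 0;
  bool HavePrev = false;
  for (const Endpoint &E : Endpoints) {
    if (HavePrev && Prev < E.Address && !Active.empty())
      Out.push_back({Prev, E.Address, *Active.begin()});
    if (E.IsStart) {
      Active.insert(E.CUOffset);
    } else {
      auto It = Active.find(E.CUOffset);
      if (It != Active.end())
        Active.erase(It); // erase a single matching entry, not all of them
    }
    Prev = E.Address;
    HavePrev = true;
  }
  return Out;
}
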
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 3312da67804b..73333395f4c1 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -8,10 +8,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"
@@ -31,87 +29,13 @@
using namespace llvm;
using namespace dwarf;
-/// \brief Abstract frame entry defining the common interface concrete
-/// entries implement.
-class llvm::FrameEntry {
-public:
- enum FrameKind {FK_CIE, FK_FDE};
-
- FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length)
- : Kind(K), Offset(Offset), Length(Length) {}
-
- virtual ~FrameEntry() = default;
-
- FrameKind getKind() const { return Kind; }
- virtual uint64_t getOffset() const { return Offset; }
-
- /// Parse and store a sequence of CFI instructions from Data,
- /// starting at *Offset and ending at EndOffset. If everything
- /// goes well, *Offset should be equal to EndOffset when this method
- /// returns. Otherwise, an error occurred.
- virtual void parseInstructions(DataExtractor Data, uint32_t *Offset,
- uint32_t EndOffset);
-
- /// Dump the entry header to the given output stream.
- virtual void dumpHeader(raw_ostream &OS) const = 0;
-
- /// Dump the entry's instructions to the given output stream.
- virtual void dumpInstructions(raw_ostream &OS) const;
-
- /// Dump the entire entry to the given output stream.
- void dump(raw_ostream &OS) const {
- dumpHeader(OS);
- dumpInstructions(OS);
- OS << "\n";
- }
-
-protected:
- const FrameKind Kind;
-
- /// \brief Offset of this entry in the section.
- uint64_t Offset;
-
- /// \brief Entry length as specified in DWARF.
- uint64_t Length;
-
- /// An entry may contain CFI instructions. An instruction consists of an
- /// opcode and an optional sequence of operands.
- using Operands = std::vector<uint64_t>;
- struct Instruction {
- Instruction(uint8_t Opcode)
- : Opcode(Opcode)
- {}
-
- uint8_t Opcode;
- Operands Ops;
- };
-
- std::vector<Instruction> Instructions;
-
- /// Convenience methods to add a new instruction with the given opcode and
- /// operands to the Instructions vector.
- void addInstruction(uint8_t Opcode) {
- Instructions.push_back(Instruction(Opcode));
- }
-
- void addInstruction(uint8_t Opcode, uint64_t Operand1) {
- Instructions.push_back(Instruction(Opcode));
- Instructions.back().Ops.push_back(Operand1);
- }
-
- void addInstruction(uint8_t Opcode, uint64_t Operand1, uint64_t Operand2) {
- Instructions.push_back(Instruction(Opcode));
- Instructions.back().Ops.push_back(Operand1);
- Instructions.back().Ops.push_back(Operand2);
- }
-};
// See DWARF standard v3, section 7.23
const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f;
-void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset,
- uint32_t EndOffset) {
+Error CFIProgram::parse(DataExtractor Data, uint32_t *Offset,
+ uint32_t EndOffset) {
while (*Offset < EndOffset) {
uint8_t Opcode = Data.getU8(Offset);
// Some instructions have a primary opcode encoded in the top bits.
@@ -122,67 +46,73 @@ void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset,
// bits of the opcode itself.
uint64_t Op1 = Opcode & DWARF_CFI_PRIMARY_OPERAND_MASK;
switch (Primary) {
- default: llvm_unreachable("Impossible primary CFI opcode");
- case DW_CFA_advance_loc:
- case DW_CFA_restore:
- addInstruction(Primary, Op1);
- break;
- case DW_CFA_offset:
- addInstruction(Primary, Op1, Data.getULEB128(Offset));
- break;
+ default:
+ return make_error<StringError>(
+ "Invalid primary CFI opcode",
+ std::make_error_code(std::errc::illegal_byte_sequence));
+ case DW_CFA_advance_loc:
+ case DW_CFA_restore:
+ addInstruction(Primary, Op1);
+ break;
+ case DW_CFA_offset:
+ addInstruction(Primary, Op1, Data.getULEB128(Offset));
+ break;
}
} else {
// Extended opcode - its value is Opcode itself.
switch (Opcode) {
- default: llvm_unreachable("Invalid extended CFI opcode");
- case DW_CFA_nop:
- case DW_CFA_remember_state:
- case DW_CFA_restore_state:
- case DW_CFA_GNU_window_save:
- // No operands
- addInstruction(Opcode);
- break;
- case DW_CFA_set_loc:
- // Operands: Address
- addInstruction(Opcode, Data.getAddress(Offset));
- break;
- case DW_CFA_advance_loc1:
- // Operands: 1-byte delta
- addInstruction(Opcode, Data.getU8(Offset));
- break;
- case DW_CFA_advance_loc2:
- // Operands: 2-byte delta
- addInstruction(Opcode, Data.getU16(Offset));
- break;
- case DW_CFA_advance_loc4:
- // Operands: 4-byte delta
- addInstruction(Opcode, Data.getU32(Offset));
- break;
- case DW_CFA_restore_extended:
- case DW_CFA_undefined:
- case DW_CFA_same_value:
- case DW_CFA_def_cfa_register:
- case DW_CFA_def_cfa_offset:
- case DW_CFA_GNU_args_size:
- // Operands: ULEB128
- addInstruction(Opcode, Data.getULEB128(Offset));
- break;
- case DW_CFA_def_cfa_offset_sf:
- // Operands: SLEB128
- addInstruction(Opcode, Data.getSLEB128(Offset));
- break;
- case DW_CFA_offset_extended:
- case DW_CFA_register:
- case DW_CFA_def_cfa:
- case DW_CFA_val_offset: {
- // Operands: ULEB128, ULEB128
- // Note: We can not embed getULEB128 directly into function
- // argument list. getULEB128 changes Offset and order of evaluation
- // for arguments is unspecified.
- auto op1 = Data.getULEB128(Offset);
- auto op2 = Data.getULEB128(Offset);
- addInstruction(Opcode, op1, op2);
- break;
+ default:
+ return make_error<StringError>(
+ "Invalid extended CFI opcode",
+ std::make_error_code(std::errc::illegal_byte_sequence));
+ case DW_CFA_nop:
+ case DW_CFA_remember_state:
+ case DW_CFA_restore_state:
+ case DW_CFA_GNU_window_save:
+ // No operands
+ addInstruction(Opcode);
+ break;
+ case DW_CFA_set_loc:
+ // Operands: Address
+ addInstruction(Opcode, Data.getAddress(Offset));
+ break;
+ case DW_CFA_advance_loc1:
+ // Operands: 1-byte delta
+ addInstruction(Opcode, Data.getU8(Offset));
+ break;
+ case DW_CFA_advance_loc2:
+ // Operands: 2-byte delta
+ addInstruction(Opcode, Data.getU16(Offset));
+ break;
+ case DW_CFA_advance_loc4:
+ // Operands: 4-byte delta
+ addInstruction(Opcode, Data.getU32(Offset));
+ break;
+ case DW_CFA_restore_extended:
+ case DW_CFA_undefined:
+ case DW_CFA_same_value:
+ case DW_CFA_def_cfa_register:
+ case DW_CFA_def_cfa_offset:
+ case DW_CFA_GNU_args_size:
+ // Operands: ULEB128
+ addInstruction(Opcode, Data.getULEB128(Offset));
+ break;
+ case DW_CFA_def_cfa_offset_sf:
+ // Operands: SLEB128
+ addInstruction(Opcode, Data.getSLEB128(Offset));
+ break;
+ case DW_CFA_offset_extended:
+ case DW_CFA_register:
+ case DW_CFA_def_cfa:
+ case DW_CFA_val_offset: {
+ // Operands: ULEB128, ULEB128
+ // Note: We can not embed getULEB128 directly into function
+ // argument list. getULEB128 changes Offset and order of evaluation
+ // for arguments is unspecified.
+ auto op1 = Data.getULEB128(Offset);
+ auto op2 = Data.getULEB128(Offset);
+ addInstruction(Opcode, op1, op2);
+ break;
}
case DW_CFA_offset_extended_sf:
case DW_CFA_def_cfa_sf:
@@ -194,162 +124,49 @@ void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset,
addInstruction(Opcode, op1, op2);
break;
}
- case DW_CFA_def_cfa_expression:
- // FIXME: Parse the actual instruction.
- *Offset += Data.getULEB128(Offset);
+ case DW_CFA_def_cfa_expression: {
+ uint32_t ExprLength = Data.getULEB128(Offset);
+ addInstruction(Opcode, 0);
+ DataExtractor Extractor(
+ Data.getData().slice(*Offset, *Offset + ExprLength),
+ Data.isLittleEndian(), Data.getAddressSize());
+ Instructions.back().Expression = DWARFExpression(
+ Extractor, Data.getAddressSize(), dwarf::DWARF_VERSION);
+ *Offset += ExprLength;
break;
+ }
case DW_CFA_expression:
case DW_CFA_val_expression: {
- // FIXME: Parse the actual instruction.
- Data.getULEB128(Offset);
- *Offset += Data.getULEB128(Offset);
+ auto RegNum = Data.getULEB128(Offset);
+ auto BlockLength = Data.getULEB128(Offset);
+ addInstruction(Opcode, RegNum, 0);
+ DataExtractor Extractor(
+ Data.getData().slice(*Offset, *Offset + BlockLength),
+ Data.isLittleEndian(), Data.getAddressSize());
+ Instructions.back().Expression = DWARFExpression(
+ Extractor, Data.getAddressSize(), dwarf::DWARF_VERSION);
+ *Offset += BlockLength;
break;
}
}
}
}
+
+ return Error::success();
}
namespace {
-/// \brief DWARF Common Information Entry (CIE)
-class CIE : public FrameEntry {
-public:
- // CIEs (and FDEs) are simply container classes, so the only sensible way to
- // create them is by providing the full parsed contents in the constructor.
- CIE(uint64_t Offset, uint64_t Length, uint8_t Version,
- SmallString<8> Augmentation, uint8_t AddressSize,
- uint8_t SegmentDescriptorSize, uint64_t CodeAlignmentFactor,
- int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister,
- SmallString<8> AugmentationData, uint32_t FDEPointerEncoding,
- uint32_t LSDAPointerEncoding)
- : FrameEntry(FK_CIE, Offset, Length), Version(Version),
- Augmentation(std::move(Augmentation)), AddressSize(AddressSize),
- SegmentDescriptorSize(SegmentDescriptorSize),
- CodeAlignmentFactor(CodeAlignmentFactor),
- DataAlignmentFactor(DataAlignmentFactor),
- ReturnAddressRegister(ReturnAddressRegister),
- AugmentationData(std::move(AugmentationData)),
- FDEPointerEncoding(FDEPointerEncoding),
- LSDAPointerEncoding(LSDAPointerEncoding) {}
-
- ~CIE() override = default;
-
- StringRef getAugmentationString() const { return Augmentation; }
- uint64_t getCodeAlignmentFactor() const { return CodeAlignmentFactor; }
- int64_t getDataAlignmentFactor() const { return DataAlignmentFactor; }
-
- uint32_t getFDEPointerEncoding() const {
- return FDEPointerEncoding;
- }
-
- uint32_t getLSDAPointerEncoding() const {
- return LSDAPointerEncoding;
- }
-
- void dumpHeader(raw_ostream &OS) const override {
- OS << format("%08x %08x %08x CIE",
- (uint32_t)Offset, (uint32_t)Length, DW_CIE_ID)
- << "\n";
- OS << format(" Version: %d\n", Version);
- OS << " Augmentation: \"" << Augmentation << "\"\n";
- if (Version >= 4) {
- OS << format(" Address size: %u\n",
- (uint32_t)AddressSize);
- OS << format(" Segment desc size: %u\n",
- (uint32_t)SegmentDescriptorSize);
- }
- OS << format(" Code alignment factor: %u\n",
- (uint32_t)CodeAlignmentFactor);
- OS << format(" Data alignment factor: %d\n",
- (int32_t)DataAlignmentFactor);
- OS << format(" Return address column: %d\n",
- (int32_t)ReturnAddressRegister);
- if (!AugmentationData.empty()) {
- OS << " Augmentation data: ";
- for (uint8_t Byte : AugmentationData)
- OS << ' ' << hexdigit(Byte >> 4) << hexdigit(Byte & 0xf);
- OS << "\n";
- }
- OS << "\n";
- }
-
- static bool classof(const FrameEntry *FE) {
- return FE->getKind() == FK_CIE;
- }
-
-private:
- /// The following fields are defined in section 6.4.1 of the DWARF standard v4
- uint8_t Version;
- SmallString<8> Augmentation;
- uint8_t AddressSize;
- uint8_t SegmentDescriptorSize;
- uint64_t CodeAlignmentFactor;
- int64_t DataAlignmentFactor;
- uint64_t ReturnAddressRegister;
-
- // The following are used when the CIE represents an EH frame entry.
- SmallString<8> AugmentationData;
- uint32_t FDEPointerEncoding;
- uint32_t LSDAPointerEncoding;
-};
-
-/// \brief DWARF Frame Description Entry (FDE)
-class FDE : public FrameEntry {
-public:
- // Each FDE has a CIE it's "linked to". Our FDE contains is constructed with
- // an offset to the CIE (provided by parsing the FDE header). The CIE itself
- // is obtained lazily once it's actually required.
- FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset,
- uint64_t InitialLocation, uint64_t AddressRange,
- CIE *Cie)
- : FrameEntry(FK_FDE, Offset, Length), LinkedCIEOffset(LinkedCIEOffset),
- InitialLocation(InitialLocation), AddressRange(AddressRange),
- LinkedCIE(Cie) {}
-
- ~FDE() override = default;
-
- CIE *getLinkedCIE() const { return LinkedCIE; }
-
- void dumpHeader(raw_ostream &OS) const override {
- OS << format("%08x %08x %08x FDE ",
- (uint32_t)Offset, (uint32_t)Length, (int32_t)LinkedCIEOffset);
- OS << format("cie=%08x pc=%08x...%08x\n",
- (int32_t)LinkedCIEOffset,
- (uint32_t)InitialLocation,
- (uint32_t)InitialLocation + (uint32_t)AddressRange);
- }
-
- static bool classof(const FrameEntry *FE) {
- return FE->getKind() == FK_FDE;
- }
-
-private:
- /// The following fields are defined in section 6.4.1 of the DWARF standard v3
- uint64_t LinkedCIEOffset;
- uint64_t InitialLocation;
- uint64_t AddressRange;
- CIE *LinkedCIE;
-};
-
-/// \brief Types of operands to CF instructions.
-enum OperandType {
- OT_Unset,
- OT_None,
- OT_Address,
- OT_Offset,
- OT_FactoredCodeOffset,
- OT_SignedFactDataOffset,
- OT_UnsignedFactDataOffset,
- OT_Register,
- OT_Expression
-};
} // end anonymous namespace
-/// \brief Initialize the array describing the types of operands.
-static ArrayRef<OperandType[2]> getOperandTypes() {
+ArrayRef<CFIProgram::OperandType[2]> CFIProgram::getOperandTypes() {
static OperandType OpTypes[DW_CFA_restore+1][2];
+ static bool Initialized = false;
+ if (Initialized) {
+ return ArrayRef<OperandType[2]>(&OpTypes[0], DW_CFA_restore+1);
+ }
+ Initialized = true;
#define DECLARE_OP2(OP, OPTYPE0, OPTYPE1) \
do { \
@@ -396,15 +213,13 @@ static ArrayRef<OperandType[2]> getOperandTypes() {
return ArrayRef<OperandType[2]>(&OpTypes[0], DW_CFA_restore+1);
}
-static ArrayRef<OperandType[2]> OpTypes = getOperandTypes();
-
-/// \brief Print \p Opcode's operand number \p OperandIdx which has
-/// value \p Operand.
-static void printOperand(raw_ostream &OS, uint8_t Opcode, unsigned OperandIdx,
- uint64_t Operand, uint64_t CodeAlignmentFactor,
- int64_t DataAlignmentFactor) {
+/// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
+void CFIProgram::printOperand(raw_ostream &OS, const MCRegisterInfo *MRI,
+ bool IsEH, const Instruction &Instr,
+ unsigned OperandIdx, uint64_t Operand) const {
assert(OperandIdx < 2);
- OperandType Type = OpTypes[Opcode][OperandIdx];
+ uint8_t Opcode = Instr.Opcode;
+ OperandType Type = getOperandTypes()[Opcode][OperandIdx];
switch (Type) {
case OT_Unset: {
@@ -449,36 +264,68 @@ static void printOperand(raw_ostream &OS, uint8_t Opcode, unsigned OperandIdx,
OS << format(" reg%" PRId64, Operand);
break;
case OT_Expression:
- OS << " expression";
+ assert(Instr.Expression && "missing DWARFExpression object");
+ OS << " ";
+ Instr.Expression->print(OS, MRI, IsEH);
break;
}
}
-void FrameEntry::dumpInstructions(raw_ostream &OS) const {
- uint64_t CodeAlignmentFactor = 0;
- int64_t DataAlignmentFactor = 0;
- const CIE *Cie = dyn_cast<CIE>(this);
-
- if (!Cie)
- Cie = cast<FDE>(this)->getLinkedCIE();
- if (Cie) {
- CodeAlignmentFactor = Cie->getCodeAlignmentFactor();
- DataAlignmentFactor = Cie->getDataAlignmentFactor();
- }
-
+void CFIProgram::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH,
+ unsigned IndentLevel) const {
for (const auto &Instr : Instructions) {
uint8_t Opcode = Instr.Opcode;
if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK;
- OS << " " << CallFrameString(Opcode) << ":";
+ OS.indent(2 * IndentLevel);
+ OS << CallFrameString(Opcode) << ":";
for (unsigned i = 0; i < Instr.Ops.size(); ++i)
- printOperand(OS, Opcode, i, Instr.Ops[i], CodeAlignmentFactor,
- DataAlignmentFactor);
+ printOperand(OS, MRI, IsEH, Instr, i, Instr.Ops[i]);
OS << '\n';
}
}
-DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {}
+void CIE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
+ OS << format("%08x %08x %08x CIE", (uint32_t)Offset, (uint32_t)Length,
+ DW_CIE_ID)
+ << "\n";
+ OS << format(" Version: %d\n", Version);
+ OS << " Augmentation: \"" << Augmentation << "\"\n";
+ if (Version >= 4) {
+ OS << format(" Address size: %u\n", (uint32_t)AddressSize);
+ OS << format(" Segment desc size: %u\n",
+ (uint32_t)SegmentDescriptorSize);
+ }
+ OS << format(" Code alignment factor: %u\n", (uint32_t)CodeAlignmentFactor);
+ OS << format(" Data alignment factor: %d\n", (int32_t)DataAlignmentFactor);
+ OS << format(" Return address column: %d\n", (int32_t)ReturnAddressRegister);
+ if (Personality)
+ OS << format(" Personality Address: %08x\n", *Personality);
+ if (!AugmentationData.empty()) {
+ OS << " Augmentation data: ";
+ for (uint8_t Byte : AugmentationData)
+ OS << ' ' << hexdigit(Byte >> 4) << hexdigit(Byte & 0xf);
+ OS << "\n";
+ }
+ OS << "\n";
+ CFIs.dump(OS, MRI, IsEH);
+ OS << "\n";
+}
+
+void FDE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
+ OS << format("%08x %08x %08x FDE ", (uint32_t)Offset, (uint32_t)Length,
+ (int32_t)LinkedCIEOffset);
+ OS << format("cie=%08x pc=%08x...%08x\n", (int32_t)LinkedCIEOffset,
+ (uint32_t)InitialLocation,
+ (uint32_t)InitialLocation + (uint32_t)AddressRange);
+ if (LSDAAddress)
+ OS << format(" LSDA Address: %08x\n", *LSDAAddress);
+ CFIs.dump(OS, MRI, IsEH);
+ OS << "\n";
+}
+
+DWARFDebugFrame::DWARFDebugFrame(bool IsEH, uint64_t EHFrameAddress)
+ : IsEH(IsEH), EHFrameAddress(EHFrameAddress) {}
DWARFDebugFrame::~DWARFDebugFrame() = default;
@@ -492,40 +339,6 @@ static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
errs() << "\n";
}
-static unsigned getSizeForEncoding(const DataExtractor &Data,
- unsigned symbolEncoding) {
- unsigned format = symbolEncoding & 0x0f;
- switch (format) {
- default: llvm_unreachable("Unknown Encoding");
- case DW_EH_PE_absptr:
- case DW_EH_PE_signed:
- return Data.getAddressSize();
- case DW_EH_PE_udata2:
- case DW_EH_PE_sdata2:
- return 2;
- case DW_EH_PE_udata4:
- case DW_EH_PE_sdata4:
- return 4;
- case DW_EH_PE_udata8:
- case DW_EH_PE_sdata8:
- return 8;
- }
-}
-
-static uint64_t readPointer(const DataExtractor &Data, uint32_t &Offset,
- unsigned Encoding) {
- switch (getSizeForEncoding(Data, Encoding)) {
- case 2:
- return Data.getU16(&Offset);
- case 4:
- return Data.getU32(&Offset);
- case 8:
- return Data.getU64(&Offset);
- default:
- llvm_unreachable("Illegal data size");
- }
-}
-
// This is a workaround for old compilers which do not allow
// noreturn attribute usage in lambdas. Once the support for those
// compilers are phased out, we can remove this and return back to
@@ -539,7 +352,7 @@ static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset,
report_fatal_error(Str);
}
-void DWARFDebugFrame::parse(DataExtractor Data) {
+void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
uint32_t Offset = 0;
DenseMap<uint32_t, CIE *> CIEs;
@@ -569,9 +382,8 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
// The Id field's size depends on the DWARF format
Id = Data.getUnsigned(&Offset, (IsDWARF64 && !IsEH) ? 8 : 4);
- bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) ||
- Id == DW_CIE_ID ||
- (IsEH && !Id));
+ bool IsCIE =
+ ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID || (IsEH && !Id));
if (IsCIE) {
uint8_t Version = Data.getU8(&Offset);
@@ -587,12 +399,11 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
// Parse the augmentation data for EH CIEs
StringRef AugmentationData("");
- uint32_t FDEPointerEncoding = DW_EH_PE_omit;
+ uint32_t FDEPointerEncoding = DW_EH_PE_absptr;
uint32_t LSDAPointerEncoding = DW_EH_PE_omit;
+ Optional<uint64_t> Personality;
+ Optional<uint32_t> PersonalityEncoding;
if (IsEH) {
- Optional<uint32_t> PersonalityEncoding;
- Optional<uint64_t> Personality;
-
Optional<uint64_t> AugmentationLength;
uint32_t StartAugmentationOffset;
uint32_t EndAugmentationOffset;
@@ -611,12 +422,17 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
ReportError(StartOffset,
"Duplicate personality in entry at %lx");
PersonalityEncoding = Data.getU8(&Offset);
- Personality = readPointer(Data, Offset, *PersonalityEncoding);
+ Personality = Data.getEncodedPointer(
+ &Offset, *PersonalityEncoding,
+ EHFrameAddress ? EHFrameAddress + Offset : 0);
break;
}
case 'R':
FDEPointerEncoding = Data.getU8(&Offset);
break;
+ case 'S':
+ // Current frame is a signal trampoline.
+ break;
case 'z':
if (i)
ReportError(StartOffset,
@@ -639,14 +455,11 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
}
}
- auto Cie = llvm::make_unique<CIE>(StartOffset, Length, Version,
- AugmentationString, AddressSize,
- SegmentDescriptorSize,
- CodeAlignmentFactor,
- DataAlignmentFactor,
- ReturnAddressRegister,
- AugmentationData, FDEPointerEncoding,
- LSDAPointerEncoding);
+ auto Cie = llvm::make_unique<CIE>(
+ StartOffset, Length, Version, AugmentationString, AddressSize,
+ SegmentDescriptorSize, CodeAlignmentFactor, DataAlignmentFactor,
+ ReturnAddressRegister, AugmentationData, FDEPointerEncoding,
+ LSDAPointerEncoding, Personality, PersonalityEncoding);
CIEs[StartOffset] = Cie.get();
Entries.emplace_back(std::move(Cie));
} else {
@@ -654,6 +467,7 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
uint64_t CIEPointer = Id;
uint64_t InitialLocation = 0;
uint64_t AddressRange = 0;
+ Optional<uint64_t> LSDAAddress;
CIE *Cie = CIEs[IsEH ? (StartStructureOffset - CIEPointer) : CIEPointer];
if (IsEH) {
@@ -662,10 +476,15 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
ReportError(StartOffset,
"Parsing FDE data at %lx failed due to missing CIE");
- InitialLocation = readPointer(Data, Offset,
- Cie->getFDEPointerEncoding());
- AddressRange = readPointer(Data, Offset,
- Cie->getFDEPointerEncoding());
+ if (auto Val = Data.getEncodedPointer(
+ &Offset, Cie->getFDEPointerEncoding(),
+ EHFrameAddress ? EHFrameAddress + Offset : 0)) {
+ InitialLocation = *Val;
+ }
+ if (auto Val = Data.getEncodedPointer(
+ &Offset, Cie->getFDEPointerEncoding(), 0)) {
+ AddressRange = *Val;
+ }
StringRef AugmentationString = Cie->getAugmentationString();
if (!AugmentationString.empty()) {
@@ -676,8 +495,11 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
Offset + static_cast<uint32_t>(AugmentationLength);
// Decode the LSDA if the CIE augmentation string said we should.
- if (Cie->getLSDAPointerEncoding() != DW_EH_PE_omit)
- readPointer(Data, Offset, Cie->getLSDAPointerEncoding());
+ if (Cie->getLSDAPointerEncoding() != DW_EH_PE_omit) {
+ LSDAAddress = Data.getEncodedPointer(
+ &Offset, Cie->getLSDAPointerEncoding(),
+ EHFrameAddress ? Offset + EHFrameAddress : 0);
+ }
if (Offset != EndAugmentationOffset)
ReportError(StartOffset, "Parsing augmentation data at %lx failed");
@@ -689,10 +511,13 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer,
InitialLocation, AddressRange,
- Cie));
+ Cie, LSDAAddress));
}
- Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset);
+ if (Error E =
+ Entries.back()->cfis().parse(Data, &Offset, EndStructureOffset)) {
+ report_fatal_error(toString(std::move(E)));
+ }
if (Offset != EndStructureOffset)
ReportError(StartOffset, "Parsing entry instructions at %lx failed");
@@ -709,14 +534,15 @@ FrameEntry *DWARFDebugFrame::getEntryAtOffset(uint64_t Offset) const {
return nullptr;
}
-void DWARFDebugFrame::dump(raw_ostream &OS, Optional<uint64_t> Offset) const {
+void DWARFDebugFrame::dump(raw_ostream &OS, const MCRegisterInfo *MRI,
+ Optional<uint64_t> Offset) const {
if (Offset) {
if (auto *Entry = getEntryAtOffset(*Offset))
- Entry->dump(OS);
+ Entry->dump(OS, MRI, IsEH);
return;
}
OS << "\n";
for (const auto &Entry : Entries)
- Entry->dump(OS);
+ Entry->dump(OS, MRI, IsEH);
}
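
CFIProgram::parse above keeps the long-standing CFI byte layout: a non-zero value in the top two bits marks a primary opcode whose operand lives in the low six bits, while a zero there means the whole byte is an extended opcode and its operands follow in the instruction stream. A minimal decoder for just that first step; the mask values are the standard DWARF constants, restated here as an assumption:

#include <cstdint>

// The three primary CFI opcodes (DW_CFA_advance_loc 0x40, DW_CFA_offset 0x80,
// DW_CFA_restore 0xc0) occupy the top two bits; their operand sits in the
// low six bits of the same byte.
constexpr uint8_t PrimaryOpcodeMask  = 0xc0;
constexpr uint8_t PrimaryOperandMask = 0x3f;

struct DecodedOp {
  uint8_t Opcode;  // 0x40/0x80/0xc0 for primaries, the full byte otherwise
  uint8_t Operand; // embedded operand for primaries, 0 otherwise
  bool IsPrimary;
};

DecodedOp decodeFirstByte(uint8_t Byte) {
  if (Byte & PrimaryOpcodeMask)
    return {static_cast<uint8_t>(Byte & PrimaryOpcodeMask),
            static_cast<uint8_t>(Byte & PrimaryOperandMask), true};
  return {Byte, 0, false}; // extended opcode; operands are read separately
}
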
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index e5ef4eaceebe..53a8e193ef56 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -16,6 +17,7 @@
#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -40,6 +42,28 @@ using ContentDescriptors = SmallVector<ContentDescriptor, 4>;
} // end anonymous namespace
+void DWARFDebugLine::ContentTypeTracker::trackContentType(
+ dwarf::LineNumberEntryFormat ContentType) {
+ switch (ContentType) {
+ case dwarf::DW_LNCT_timestamp:
+ HasModTime = true;
+ break;
+ case dwarf::DW_LNCT_size:
+ HasLength = true;
+ break;
+ case dwarf::DW_LNCT_MD5:
+ HasMD5 = true;
+ break;
+ case dwarf::DW_LNCT_LLVM_source:
+ HasSource = true;
+ break;
+ default:
+ // We only care about values we consider optional, and new values may be
+ // added in the vendor extension range, so we do not match exhaustively.
+ break;
+ }
+}
+
DWARFDebugLine::Prologue::Prologue() { clear(); }
void DWARFDebugLine::Prologue::clear() {
@@ -47,14 +71,15 @@ void DWARFDebugLine::Prologue::clear() {
SegSelectorSize = 0;
MinInstLength = MaxOpsPerInst = DefaultIsStmt = LineBase = LineRange = 0;
OpcodeBase = 0;
- FormParams = DWARFFormParams({0, 0, DWARF32});
- HasMD5 = false;
+ FormParams = dwarf::FormParams({0, 0, DWARF32});
+ ContentTypes = ContentTypeTracker();
StandardOpcodeLengths.clear();
IncludeDirectories.clear();
FileNames.clear();
}
-void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
+void DWARFDebugLine::Prologue::dump(raw_ostream &OS,
+ DIDumpOptions DumpOptions) const {
OS << "Line table prologue:\n"
<< format(" total_length: 0x%8.8" PRIx64 "\n", TotalLength)
<< format(" version: %u\n", getVersion());
@@ -73,29 +98,37 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
OS << format("standard_opcode_lengths[%s] = %u\n",
LNStandardString(I + 1).data(), StandardOpcodeLengths[I]);
- if (!IncludeDirectories.empty())
- for (uint32_t I = 0; I != IncludeDirectories.size(); ++I)
- OS << format("include_directories[%3u] = '", I + 1)
- << IncludeDirectories[I] << "'\n";
+ if (!IncludeDirectories.empty()) {
+ // DWARF v5 starts directory indexes at 0.
+ uint32_t DirBase = getVersion() >= 5 ? 0 : 1;
+ for (uint32_t I = 0; I != IncludeDirectories.size(); ++I) {
+ OS << format("include_directories[%3u] = ", I + DirBase);
+ IncludeDirectories[I].dump(OS, DumpOptions);
+ OS << '\n';
+ }
+ }
if (!FileNames.empty()) {
- if (HasMD5)
- OS << " Dir MD5 Checksum File Name\n"
- << " ---- -------------------------------- -----------"
- "---------------\n";
- else
- OS << " Dir Mod Time File Len File Name\n"
- << " ---- ---------- ---------- -----------"
- "----------------\n";
+ // DWARF v5 starts file indexes at 0.
+ uint32_t FileBase = getVersion() >= 5 ? 0 : 1;
for (uint32_t I = 0; I != FileNames.size(); ++I) {
const FileNameEntry &FileEntry = FileNames[I];
- OS << format("file_names[%3u] %4" PRIu64 " ", I + 1, FileEntry.DirIdx);
- if (HasMD5)
- OS << FileEntry.Checksum.digest();
- else
- OS << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64, FileEntry.ModTime,
- FileEntry.Length);
- OS << ' ' << FileEntry.Name << '\n';
+ OS << format("file_names[%3u]:\n", I + FileBase);
+ OS << " name: ";
+ FileEntry.Name.dump(OS, DumpOptions);
+ OS << '\n'
+ << format(" dir_index: %" PRIu64 "\n", FileEntry.DirIdx);
+ if (ContentTypes.HasMD5)
+ OS << " md5_checksum: " << FileEntry.Checksum.digest() << '\n';
+ if (ContentTypes.HasModTime)
+ OS << format(" mod_time: 0x%8.8" PRIx64 "\n", FileEntry.ModTime);
+ if (ContentTypes.HasLength)
+ OS << format(" length: 0x%8.8" PRIx64 "\n", FileEntry.Length);
+ if (ContentTypes.HasSource) {
+ OS << " source: ";
+ FileEntry.Source.dump(OS, DumpOptions);
+ OS << '\n';
+ }
}
}
}
@@ -104,13 +137,16 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
static void
parseV2DirFileTables(const DWARFDataExtractor &DebugLineData,
uint32_t *OffsetPtr, uint64_t EndPrologueOffset,
- std::vector<StringRef> &IncludeDirectories,
+ DWARFDebugLine::ContentTypeTracker &ContentTypes,
+ std::vector<DWARFFormValue> &IncludeDirectories,
std::vector<DWARFDebugLine::FileNameEntry> &FileNames) {
while (*OffsetPtr < EndPrologueOffset) {
StringRef S = DebugLineData.getCStrRef(OffsetPtr);
if (S.empty())
break;
- IncludeDirectories.push_back(S);
+ DWARFFormValue Dir(dwarf::DW_FORM_string);
+ Dir.setPValue(S.data());
+ IncludeDirectories.push_back(Dir);
}
while (*OffsetPtr < EndPrologueOffset) {
@@ -118,20 +154,25 @@ parseV2DirFileTables(const DWARFDataExtractor &DebugLineData,
if (Name.empty())
break;
DWARFDebugLine::FileNameEntry FileEntry;
- FileEntry.Name = Name;
+ FileEntry.Name.setForm(dwarf::DW_FORM_string);
+ FileEntry.Name.setPValue(Name.data());
FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr);
FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr);
FileEntry.Length = DebugLineData.getULEB128(OffsetPtr);
FileNames.push_back(FileEntry);
}
+
+ ContentTypes.HasModTime = true;
+ ContentTypes.HasLength = true;
}
// Parse v5 directory/file entry content descriptions.
// Returns the descriptors, or an empty vector if we did not find a path or
// ran off the end of the prologue.
static ContentDescriptors
-parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
- uint64_t EndPrologueOffset, bool *HasMD5) {
+parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t
+ *OffsetPtr, uint64_t EndPrologueOffset, DWARFDebugLine::ContentTypeTracker
+ *ContentTypes) {
ContentDescriptors Descriptors;
int FormatCount = DebugLineData.getU8(OffsetPtr);
bool HasPath = false;
@@ -144,8 +185,8 @@ parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
Descriptor.Form = dwarf::Form(DebugLineData.getULEB128(OffsetPtr));
if (Descriptor.Type == dwarf::DW_LNCT_path)
HasPath = true;
- else if (Descriptor.Type == dwarf::DW_LNCT_MD5 && HasMD5)
- *HasMD5 = true;
+ if (ContentTypes)
+ ContentTypes->trackContentType(Descriptor.Type);
Descriptors.push_back(Descriptor);
}
return HasPath ? Descriptors : ContentDescriptors();
@@ -154,8 +195,10 @@ parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
static bool
parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
uint32_t *OffsetPtr, uint64_t EndPrologueOffset,
- const DWARFFormParams &FormParams, const DWARFUnit *U,
- bool &HasMD5, std::vector<StringRef> &IncludeDirectories,
+ const dwarf::FormParams &FormParams,
+ const DWARFContext &Ctx, const DWARFUnit *U,
+ DWARFDebugLine::ContentTypeTracker &ContentTypes,
+ std::vector<DWARFFormValue> &IncludeDirectories,
std::vector<DWARFDebugLine::FileNameEntry> &FileNames) {
// Get the directory entry description.
ContentDescriptors DirDescriptors =
@@ -172,9 +215,9 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
DWARFFormValue Value(Descriptor.Form);
switch (Descriptor.Type) {
case DW_LNCT_path:
- if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, U))
+ if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, &Ctx, U))
return false;
- IncludeDirectories.push_back(Value.getAsCString().getValue());
+ IncludeDirectories.push_back(Value);
break;
default:
if (!Value.skipValue(DebugLineData, OffsetPtr, FormParams))
@@ -185,7 +228,8 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
// Get the file entry description.
ContentDescriptors FileDescriptors =
- parseV5EntryFormat(DebugLineData, OffsetPtr, EndPrologueOffset, &HasMD5);
+ parseV5EntryFormat(DebugLineData, OffsetPtr, EndPrologueOffset,
+ &ContentTypes);
if (FileDescriptors.empty())
return false;
@@ -197,11 +241,14 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
DWARFDebugLine::FileNameEntry FileEntry;
for (auto Descriptor : FileDescriptors) {
DWARFFormValue Value(Descriptor.Form);
- if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, U))
+ if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, &Ctx, U))
return false;
switch (Descriptor.Type) {
case DW_LNCT_path:
- FileEntry.Name = Value.getAsCString().getValue();
+ FileEntry.Name = Value;
+ break;
+ case DW_LNCT_LLVM_source:
+ FileEntry.Source = Value;
break;
case DW_LNCT_directory_index:
FileEntry.DirIdx = Value.getAsUnsignedConstant().getValue();
@@ -226,8 +273,28 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
return true;
}
-bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
- uint32_t *OffsetPtr, const DWARFUnit *U) {
+template <typename... Ts>
+static std::string formatErrorString(char const *Fmt, const Ts &... Vals) {
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ Stream << format(Fmt, Vals...);
+ return Stream.str();
+}
+
+template <typename... Ts>
+static Error createError(char const *Fmt, const Ts &... Vals) {
+ return make_error<StringError>(formatErrorString(Fmt, Vals...),
+ inconvertibleErrorCode());
+}
+
+static Error createError(char const *Msg) {
+ return make_error<StringError>(Msg, inconvertibleErrorCode());
+}
+
+Error DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
+ uint32_t *OffsetPtr,
+ const DWARFContext &Ctx,
+ const DWARFUnit *U) {
const uint64_t PrologueOffset = *OffsetPtr;
clear();
@@ -236,11 +303,16 @@ bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
FormParams.Format = dwarf::DWARF64;
TotalLength = DebugLineData.getU64(OffsetPtr);
} else if (TotalLength >= 0xffffff00) {
- return false;
+ return createError(
+ "parsing line table prologue at offset 0x%8.8" PRIx64
+ " unsupported reserved unit length found of value 0x%8.8" PRIx64,
+ PrologueOffset, TotalLength);
}
FormParams.Version = DebugLineData.getU16(OffsetPtr);
if (getVersion() < 2)
- return false;
+ return createError("parsing line table prologue at offset 0x%8.8" PRIx64
+ " found unsupported version 0x%2.2" PRIx16,
+ PrologueOffset, getVersion());
if (getVersion() >= 5) {
FormParams.AddrSize = DebugLineData.getU8(OffsetPtr);
@@ -268,27 +340,24 @@ bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
if (getVersion() >= 5) {
if (!parseV5DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset,
- getFormParams(), U, HasMD5, IncludeDirectories,
- FileNames)) {
- fprintf(stderr,
- "warning: parsing line table prologue at 0x%8.8" PRIx64
- " found an invalid directory or file table description at"
- " 0x%8.8" PRIx64 "\n", PrologueOffset, (uint64_t)*OffsetPtr);
- return false;
+ FormParams, Ctx, U, ContentTypes,
+ IncludeDirectories, FileNames)) {
+ return createError(
+ "parsing line table prologue at 0x%8.8" PRIx64
+ " found an invalid directory or file table description at"
+ " 0x%8.8" PRIx64,
+ PrologueOffset, (uint64_t)*OffsetPtr);
}
} else
parseV2DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset,
- IncludeDirectories, FileNames);
-
- if (*OffsetPtr != EndPrologueOffset) {
- fprintf(stderr,
- "warning: parsing line table prologue at 0x%8.8" PRIx64
- " should have ended at 0x%8.8" PRIx64
- " but it ended at 0x%8.8" PRIx64 "\n",
- PrologueOffset, EndPrologueOffset, (uint64_t)*OffsetPtr);
- return false;
- }
- return true;
+ ContentTypes, IncludeDirectories, FileNames);
+
+ if (*OffsetPtr != EndPrologueOffset)
+ return createError("parsing line table prologue at 0x%8.8" PRIx64
+ " should have ended at 0x%8.8" PRIx64
+ " but it ended at 0x%8.8" PRIx64,
+ PrologueOffset, EndPrologueOffset, (uint64_t)*OffsetPtr);
+ return Error::success();
}
DWARFDebugLine::Row::Row(bool DefaultIsStmt) { reset(DefaultIsStmt); }
@@ -340,8 +409,9 @@ void DWARFDebugLine::Sequence::reset() {
DWARFDebugLine::LineTable::LineTable() { clear(); }
-void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
- Prologue.dump(OS);
+void DWARFDebugLine::LineTable::dump(raw_ostream &OS,
+ DIDumpOptions DumpOptions) const {
+ Prologue.dump(OS, DumpOptions);
OS << '\n';
if (!Rows.empty()) {
@@ -396,34 +466,45 @@ DWARFDebugLine::getLineTable(uint32_t Offset) const {
return nullptr;
}
-const DWARFDebugLine::LineTable *
-DWARFDebugLine::getOrParseLineTable(DWARFDataExtractor &DebugLineData,
- uint32_t Offset, const DWARFUnit *U) {
+Expected<const DWARFDebugLine::LineTable *> DWARFDebugLine::getOrParseLineTable(
+ DWARFDataExtractor &DebugLineData, uint32_t Offset, const DWARFContext &Ctx,
+ const DWARFUnit *U, std::function<void(Error)> RecoverableErrorCallback) {
+ if (!DebugLineData.isValidOffset(Offset))
+ return createError("offset 0x%8.8" PRIx32
+ " is not a valid debug line section offset",
+ Offset);
+
std::pair<LineTableIter, bool> Pos =
LineTableMap.insert(LineTableMapTy::value_type(Offset, LineTable()));
LineTable *LT = &Pos.first->second;
if (Pos.second) {
- if (!LT->parse(DebugLineData, &Offset, U))
- return nullptr;
+ if (Error Err =
+ LT->parse(DebugLineData, &Offset, Ctx, U, RecoverableErrorCallback))
+ return std::move(Err);
+ return LT;
}
return LT;
}
-bool DWARFDebugLine::LineTable::parse(DWARFDataExtractor &DebugLineData,
- uint32_t *OffsetPtr, const DWARFUnit *U,
- raw_ostream *OS) {
+Error DWARFDebugLine::LineTable::parse(
+ DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
+ const DWARFContext &Ctx, const DWARFUnit *U,
+ std::function<void(Error)> RecoverableErrorCallback, raw_ostream *OS) {
const uint32_t DebugLineOffset = *OffsetPtr;
clear();
- if (!Prologue.parse(DebugLineData, OffsetPtr, U)) {
- // Restore our offset and return false to indicate failure!
- *OffsetPtr = DebugLineOffset;
- return false;
+ Error PrologueErr = Prologue.parse(DebugLineData, OffsetPtr, Ctx, U);
+
+ if (OS) {
+ // The presence of OS signals verbose dumping.
+ DIDumpOptions DumpOptions;
+ DumpOptions.Verbose = true;
+ Prologue.dump(*OS, DumpOptions);
}
- if (OS)
- Prologue.dump(*OS);
+ if (PrologueErr)
+ return PrologueErr;
const uint32_t EndOffset =
DebugLineOffset + Prologue.TotalLength + Prologue.sizeofTotalLength();
@@ -493,8 +574,12 @@ bool DWARFDebugLine::LineTable::parse(DWARFDataExtractor &DebugLineData,
// from the size of the operand.
if (DebugLineData.getAddressSize() == 0)
DebugLineData.setAddressSize(Len - 1);
- else
- assert(DebugLineData.getAddressSize() == Len - 1);
+ else if (DebugLineData.getAddressSize() != Len - 1) {
+ return createError("mismatching address size at offset 0x%8.8" PRIx32
+ " expected 0x%2.2" PRIx8 " found 0x%2.2" PRIx64,
+ ExtOffset, DebugLineData.getAddressSize(),
+ Len - 1);
+ }
State.Row.Address = DebugLineData.getRelocatedAddress(OffsetPtr);
if (OS)
*OS << format(" (0x%16.16" PRIx64 ")", State.Row.Address);
@@ -523,14 +608,15 @@ bool DWARFDebugLine::LineTable::parse(DWARFDataExtractor &DebugLineData,
// the file register of the state machine.
{
FileNameEntry FileEntry;
- FileEntry.Name = DebugLineData.getCStr(OffsetPtr);
+ const char *Name = DebugLineData.getCStr(OffsetPtr);
+ FileEntry.Name.setForm(dwarf::DW_FORM_string);
+ FileEntry.Name.setPValue(Name);
FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr);
FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr);
FileEntry.Length = DebugLineData.getULEB128(OffsetPtr);
Prologue.FileNames.push_back(FileEntry);
if (OS)
- *OS << " (" << FileEntry.Name.str()
- << ", dir=" << FileEntry.DirIdx << ", mod_time="
+ *OS << " (" << Name << ", dir=" << FileEntry.DirIdx << ", mod_time="
<< format("(0x%16.16" PRIx64 ")", FileEntry.ModTime)
<< ", length=" << FileEntry.Length << ")";
}
@@ -553,14 +639,10 @@ bool DWARFDebugLine::LineTable::parse(DWARFDataExtractor &DebugLineData,
}
// Make sure the stated and parsed lengths are the same.
// Otherwise we have an unparseable line-number program.
- if (*OffsetPtr - ExtOffset != Len) {
- fprintf(stderr, "Unexpected line op length at offset 0x%8.8" PRIx32
- " expected 0x%2.2" PRIx64 " found 0x%2.2" PRIx32 "\n",
- ExtOffset, Len, *OffsetPtr - ExtOffset);
- // Skip the rest of the line-number program.
- *OffsetPtr = EndOffset;
- return false;
- }
+ if (*OffsetPtr - ExtOffset != Len)
+ return createError("unexpected line op length at offset 0x%8.8" PRIx32
+ " expected 0x%2.2" PRIx64 " found 0x%2.2" PRIx32,
+ ExtOffset, Len, *OffsetPtr - ExtOffset);
} else if (Opcode < Prologue.OpcodeBase) {
if (OS)
*OS << LNStandardString(Opcode);
@@ -763,14 +845,13 @@ bool DWARFDebugLine::LineTable::parse(DWARFDataExtractor &DebugLineData,
*OS << "\n";
}
- if (!State.Sequence.Empty) {
- fprintf(stderr, "warning: last sequence in debug line table is not"
- "terminated!\n");
- }
+ if (!State.Sequence.Empty)
+ RecoverableErrorCallback(
+ createError("last sequence in debug line table is not terminated!"));
// Sort all sequences so that address lookup will work faster.
if (!Sequences.empty()) {
- std::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC);
+ llvm::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC);
// Note: actually, instruction address ranges of sequences should not
// overlap (in shared objects and executables). If they do, the address
// lookup would still work, though, but result would be ambiguous.
@@ -779,7 +860,7 @@ bool DWARFDebugLine::LineTable::parse(DWARFDataExtractor &DebugLineData,
// rudimentary sequences for address ranges [0x0, 0xsomething).
}
- return EndOffset;
+ return Error::success();
}
uint32_t
@@ -887,6 +968,24 @@ bool DWARFDebugLine::LineTable::hasFileAtIndex(uint64_t FileIndex) const {
return FileIndex != 0 && FileIndex <= Prologue.FileNames.size();
}
+Optional<StringRef> DWARFDebugLine::LineTable::getSourceByIndex(uint64_t FileIndex,
+ FileLineInfoKind Kind) const {
+ if (Kind == FileLineInfoKind::None || !hasFileAtIndex(FileIndex))
+ return None;
+ const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
+ if (Optional<const char *> source = Entry.Source.getAsCString())
+ return StringRef(*source);
+ return None;
+}
+
+static bool isPathAbsoluteOnWindowsOrPosix(const Twine &Path) {
+ // Debug info can contain paths from any OS, not necessarily
+ // an OS we're currently running on. Moreover different compilation units can
+ // be compiled on different operating systems and linked together later.
+ return sys::path::is_absolute(Path, sys::path::Style::posix) ||
+ sys::path::is_absolute(Path, sys::path::Style::windows);
+}
+
bool DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
const char *CompDir,
FileLineInfoKind Kind,
@@ -894,9 +993,9 @@ bool DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
if (Kind == FileLineInfoKind::None || !hasFileAtIndex(FileIndex))
return false;
const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
- StringRef FileName = Entry.Name;
+ StringRef FileName = Entry.Name.getAsCString().getValue();
if (Kind != FileLineInfoKind::AbsoluteFilePath ||
- sys::path::is_absolute(FileName)) {
+ isPathAbsoluteOnWindowsOrPosix(FileName)) {
Result = FileName;
return true;
}
@@ -907,13 +1006,15 @@ bool DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
// Be defensive about the contents of Entry.
if (IncludeDirIndex > 0 &&
IncludeDirIndex <= Prologue.IncludeDirectories.size())
- IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
+ IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1]
+ .getAsCString()
+ .getValue();
// We may still need to append compilation directory of compile unit.
// We know that FileName is not absolute, the only way to have an
// absolute path at this point would be if IncludeDir is absolute.
if (CompDir && Kind == FileLineInfoKind::AbsoluteFilePath &&
- sys::path::is_relative(IncludeDir))
+ !isPathAbsoluteOnWindowsOrPosix(IncludeDir))
sys::path::append(FilePath, CompDir);
// sys::path::append skips empty strings.
@@ -936,5 +1037,97 @@ bool DWARFDebugLine::LineTable::getFileLineInfoForAddress(
Result.Line = Row.Line;
Result.Column = Row.Column;
Result.Discriminator = Row.Discriminator;
+ Result.Source = getSourceByIndex(Row.File, Kind);
return true;
}
+
+// We want to supply the Unit associated with a .debug_line[.dwo] table when
+// we dump it, if possible, but still dump the table even if there isn't a Unit.
+// Therefore, collect up handles on all the Units that point into the
+// line-table section.
+static DWARFDebugLine::SectionParser::LineToUnitMap
+buildLineToUnitMap(DWARFDebugLine::SectionParser::cu_range CUs,
+ DWARFDebugLine::SectionParser::tu_range TUSections) {
+ DWARFDebugLine::SectionParser::LineToUnitMap LineToUnit;
+ for (const auto &CU : CUs)
+ if (auto CUDIE = CU->getUnitDIE())
+ if (auto StmtOffset = toSectionOffset(CUDIE.find(DW_AT_stmt_list)))
+ LineToUnit.insert(std::make_pair(*StmtOffset, &*CU));
+ for (const auto &TUS : TUSections)
+ for (const auto &TU : TUS)
+ if (auto TUDIE = TU->getUnitDIE())
+ if (auto StmtOffset = toSectionOffset(TUDIE.find(DW_AT_stmt_list)))
+ LineToUnit.insert(std::make_pair(*StmtOffset, &*TU));
+ return LineToUnit;
+}
+
+DWARFDebugLine::SectionParser::SectionParser(DWARFDataExtractor &Data,
+ const DWARFContext &C,
+ cu_range CUs, tu_range TUs)
+ : DebugLineData(Data), Context(C) {
+ LineToUnit = buildLineToUnitMap(CUs, TUs);
+ if (!DebugLineData.isValidOffset(Offset))
+ Done = true;
+}
+
+bool DWARFDebugLine::Prologue::totalLengthIsValid() const {
+ return TotalLength == 0xffffffff || TotalLength < 0xffffff00;
+}
+
+DWARFDebugLine::LineTable DWARFDebugLine::SectionParser::parseNext(
+ function_ref<void(Error)> RecoverableErrorCallback,
+ function_ref<void(Error)> UnrecoverableErrorCallback, raw_ostream *OS) {
+ assert(DebugLineData.isValidOffset(Offset) &&
+ "parsing should have terminated");
+ DWARFUnit *U = prepareToParse(Offset);
+ uint32_t OldOffset = Offset;
+ LineTable LT;
+ if (Error Err = LT.parse(DebugLineData, &Offset, Context, U,
+ RecoverableErrorCallback, OS))
+ UnrecoverableErrorCallback(std::move(Err));
+ moveToNextTable(OldOffset, LT.Prologue);
+ return LT;
+}
+
+void DWARFDebugLine::SectionParser::skip(
+ function_ref<void(Error)> ErrorCallback) {
+ assert(DebugLineData.isValidOffset(Offset) &&
+ "parsing should have terminated");
+ DWARFUnit *U = prepareToParse(Offset);
+ uint32_t OldOffset = Offset;
+ LineTable LT;
+ if (Error Err = LT.Prologue.parse(DebugLineData, &Offset, Context, U))
+ ErrorCallback(std::move(Err));
+ moveToNextTable(OldOffset, LT.Prologue);
+}
+
+DWARFUnit *DWARFDebugLine::SectionParser::prepareToParse(uint32_t Offset) {
+ DWARFUnit *U = nullptr;
+ auto It = LineToUnit.find(Offset);
+ if (It != LineToUnit.end())
+ U = It->second;
+ DebugLineData.setAddressSize(U ? U->getAddressByteSize() : 0);
+ return U;
+}
+
+void DWARFDebugLine::SectionParser::moveToNextTable(uint32_t OldOffset,
+ const Prologue &P) {
+ // If the length field is not valid, we don't know where the next table is, so
+ // cannot continue to parse. Mark the parser as done, and leave the Offset
+ // value as it currently is. This will be the end of the bad length field.
+ if (!P.totalLengthIsValid()) {
+ Done = true;
+ return;
+ }
+
+ Offset = OldOffset + P.TotalLength + P.sizeofTotalLength();
+ if (!DebugLineData.isValidOffset(Offset)) {
+ Done = true;
+ }
+}
+
+void DWARFDebugLine::warn(Error Err) {
+ handleAllErrors(std::move(Err), [](ErrorInfoBase &Info) {
+ WithColor::warning() << Info.message() << '\n';
+ });
+}
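
The line-table rework above threads two error paths through parsing: hard failures come back as a returned error, while problems the parser can step past (an unterminated final sequence, for example) are handed to a RecoverableErrorCallback so a dump can keep going. A small sketch of that shape, using std::function and plain string errors instead of llvm::Error:

#include <functional>
#include <optional>
#include <string>
#include <vector>

using WarnFn = std::function<void(const std::string &)>;

// A missing header is fatal; a malformed record is reported and skipped.
std::optional<std::vector<int>>
parseRecords(const std::vector<int> &Raw, const WarnFn &Recoverable,
             std::string &FatalError) {
  if (Raw.empty()) {
    FatalError = "no header present";
    return std::nullopt; // unrecoverable: the caller gets no table at all
  }
  std::vector<int> Out;
  for (size_t I = 1; I < Raw.size(); ++I) {
    if (Raw[I] < 0) { // recoverable: warn and keep parsing
      Recoverable("negative record at index " + std::to_string(I));
      continue;
    }
    Out.push_back(Raw[I]);
  }
  return Out;
}

Callers that only need a best-effort table pass a warning printer as the callback, which is what DWARFContext::getLineTableForUnit does above via DWARFDebugLine::warn.
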
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index 58f88536f317..617b914ecce9 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -16,6 +16,7 @@
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cinttypes>
@@ -33,18 +34,22 @@ static void dumpExpression(raw_ostream &OS, ArrayRef<char> Data,
const MCRegisterInfo *MRI) {
DWARFDataExtractor Extractor(StringRef(Data.data(), Data.size()),
IsLittleEndian, AddressSize);
- DWARFExpression(Extractor, AddressSize, dwarf::DWARF_VERSION).print(OS, MRI);
+ DWARFExpression(Extractor, dwarf::DWARF_VERSION, AddressSize).print(OS, MRI);
}
void DWARFDebugLoc::LocationList::dump(raw_ostream &OS, bool IsLittleEndian,
unsigned AddressSize,
const MCRegisterInfo *MRI,
+ uint64_t BaseAddress,
unsigned Indent) const {
for (const Entry &E : Entries) {
OS << '\n';
OS.indent(Indent);
- OS << format("0x%016" PRIx64, E.Begin) << " - "
- << format("0x%016" PRIx64, E.End) << ": ";
+ OS << format("[0x%*.*" PRIx64 ", ", AddressSize * 2, AddressSize * 2,
+ BaseAddress + E.Begin);
+ OS << format(" 0x%*.*" PRIx64 ")", AddressSize * 2, AddressSize * 2,
+ BaseAddress + E.End);
+ OS << ": ";
dumpExpression(OS, E.Loc, IsLittleEndian, AddressSize, MRI);
}
@@ -64,7 +69,7 @@ void DWARFDebugLoc::dump(raw_ostream &OS, const MCRegisterInfo *MRI,
Optional<uint64_t> Offset) const {
auto DumpLocationList = [&](const LocationList &L) {
OS << format("0x%8.8x: ", L.Offset);
- L.dump(OS, IsLittleEndian, AddressSize, MRI, 12);
+ L.dump(OS, IsLittleEndian, AddressSize, MRI, 0, 12);
OS << "\n\n";
};
@@ -89,7 +94,7 @@ DWARFDebugLoc::parseOneLocationList(DWARFDataExtractor Data, unsigned *Offset) {
while (true) {
Entry E;
if (!Data.isValidOffsetForDataOfSize(*Offset, 2 * Data.getAddressSize())) {
- llvm::errs() << "Location list overflows the debug_loc section.\n";
+ WithColor::error() << "location list overflows the debug_loc section.\n";
return None;
}
@@ -106,13 +111,13 @@ DWARFDebugLoc::parseOneLocationList(DWARFDataExtractor Data, unsigned *Offset) {
return LL;
if (!Data.isValidOffsetForDataOfSize(*Offset, 2)) {
- llvm::errs() << "Location list overflows the debug_loc section.\n";
+ WithColor::error() << "location list overflows the debug_loc section.\n";
return None;
}
unsigned Bytes = Data.getU16(Offset);
if (!Data.isValidOffsetForDataOfSize(*Offset, Bytes)) {
- llvm::errs() << "Location list overflows the debug_loc section.\n";
+ WithColor::error() << "location list overflows the debug_loc section.\n";
return None;
}
// A single location description describing the location of the object...
@@ -136,7 +141,7 @@ void DWARFDebugLoc::parse(const DWARFDataExtractor &data) {
break;
}
if (data.isValidOffset(Offset))
- errs() << "error: failed to consume entire .debug_loc section\n";
+ WithColor::error() << "failed to consume entire .debug_loc section\n";
}
Optional<DWARFDebugLocDWO::LocationList>
@@ -148,8 +153,8 @@ DWARFDebugLocDWO::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
while (auto Kind =
static_cast<dwarf::LocationListEntry>(Data.getU8(Offset))) {
if (Kind != dwarf::DW_LLE_startx_length) {
- llvm::errs() << "error: dumping support for LLE of kind " << (int)Kind
- << " not implemented\n";
+ WithColor::error() << "dumping support for LLE of kind " << (int)Kind
+ << " not implemented\n";
return None;
}
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
index 1b77be6192dd..6d789c3027a5 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
@@ -8,14 +8,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
-#include "SyntaxHighlighting.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
using namespace llvm;
using namespace dwarf;
-using namespace syntax;
void DWARFDebugMacro::dump(raw_ostream &OS) const {
unsigned IndLevel = 0;
@@ -29,7 +28,7 @@ void DWARFDebugMacro::dump(raw_ostream &OS) const {
OS << " ";
IndLevel += (E.Type == DW_MACINFO_start_file);
- WithColor(OS, syntax::Macro).get() << MacinfoString(E.Type);
+ WithColor(OS, HighlightColor::Macro).get() << MacinfoString(E.Type);
switch (E.Type) {
default:
// Got a corrupted ".debug_macinfo" section (invalid macinfo type).
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
index 956a91e9c4d6..de8b6e543fab 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
@@ -37,7 +37,7 @@ DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian,
if (DieRef == 0)
break;
uint8_t IndexEntryValue = GnuStyle ? PubNames.getU8(&Offset) : 0;
- const char *Name = PubNames.getCStr(&Offset);
+ StringRef Name = PubNames.getCStrRef(&Offset);
SetData.Entries.push_back(
{DieRef, PubIndexEntryDescriptor(IndexEntryValue), Name});
}
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index f0b7ec2751de..a565718debd0 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -13,13 +13,16 @@
#include "llvm/Support/raw_ostream.h"
#include <cinttypes>
#include <cstdint>
-#include <utility>
using namespace llvm;
-raw_ostream &llvm::operator<<(raw_ostream &OS, const DWARFAddressRange &R) {
- return OS << format("[0x%16.16" PRIx64 ", 0x%16.16" PRIx64 ")", R.LowPC,
- R.HighPC);
+// FIXME: There are several versions of this. Consolidate them.
+template <typename... Ts>
+static Error createError(char const *Fmt, const Ts &... Vals) {
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ Stream << format(Fmt, Vals...);
+ return make_error<StringError>(Stream.str(), inconvertibleErrorCode());
}
void DWARFDebugRangeList::clear() {
@@ -28,14 +31,15 @@ void DWARFDebugRangeList::clear() {
Entries.clear();
}
-bool DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
- uint32_t *offset_ptr) {
+Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
+ uint32_t *offset_ptr) {
clear();
if (!data.isValidOffset(*offset_ptr))
- return false;
+ return createError("invalid range list offset 0x%" PRIx32, *offset_ptr);
+
AddressSize = data.getAddressSize();
if (AddressSize != 4 && AddressSize != 8)
- return false;
+ return createError("invalid address size: %d", AddressSize);
Offset = *offset_ptr;
while (true) {
RangeListEntry Entry;
@@ -49,13 +53,14 @@ bool DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
// Check that both values were extracted correctly.
if (*offset_ptr != prev_offset + 2 * AddressSize) {
clear();
- return false;
+ return createError("invalid range list entry at offset 0x%" PRIx32,
+ prev_offset);
}
if (Entry.isEndOfListEntry())
break;
Entries.push_back(Entry);
}
- return true;
+ return Error::success();
}
void DWARFDebugRangeList::dump(raw_ostream &OS) const {
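
For context: extract() now returns llvm::Error instead of bool, so a caller that used to test the boolean has to consume the Error (report it, or deliberately drop it with consumeError). A minimal sketch of that migration follows; extractLike() is a hypothetical stand-in, not the real DWARFDebugRangeList API.

#include "llvm/Support/Error.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical stand-in for the new Error-returning extract().
static Error extractLike(bool BadOffset) {
  if (BadOffset)
    return make_error<StringError>("invalid range list offset 0x10",
                                   inconvertibleErrorCode());
  return Error::success();
}

int main() {
  // Old style: if (!RangeList.extract(Data, &Offset)) return;
  // New style: the failure reason travels with the Error and must be consumed.
  if (Error Err = extractLike(/*BadOffset=*/true)) {
    WithColor::error() << toString(std::move(Err)) << '\n';
    return 1;
  }
  return 0;
}
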
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
new file mode 100644
index 000000000000..b19c808a8fb3
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
@@ -0,0 +1,205 @@
+//===- DWARFDebugRnglists.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+template <typename... Ts>
+static Error createError(char const *Fmt, const Ts &... Vals) {
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ Stream << format(Fmt, Vals...);
+ return make_error<StringError>(Stream.str(), inconvertibleErrorCode());
+}
+
+namespace llvm { // workaround for gcc bug
+template <>
+Error DWARFListType<RangeListEntry>::createError(const char *Fmt, const char *s,
+ uint32_t Val) {
+ return ::createError(Fmt, s, Val);
+}
+}
+
+Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
+ uint32_t *OffsetPtr) {
+ Offset = *OffsetPtr;
+ SectionIndex = -1ULL;
+ // The caller should guarantee that we have at least 1 byte available, so
+ // we just assert instead of revalidating.
+ assert(*OffsetPtr < End &&
+ "not enough space to extract a rangelist encoding");
+ uint8_t Encoding = Data.getU8(OffsetPtr);
+
+ switch (Encoding) {
+ case dwarf::DW_RLE_end_of_list:
+ Value0 = Value1 = 0;
+ break;
+ // TODO: Support other encodings.
+ case dwarf::DW_RLE_base_addressx:
+ return createError("unsupported rnglists encoding DW_RLE_base_addressx "
+ "at offset 0x%" PRIx32,
+ *OffsetPtr - 1);
+ case dwarf::DW_RLE_startx_endx:
+ return createError("unsupported rnglists encoding DW_RLE_startx_endx at "
+ "offset 0x%" PRIx32,
+ *OffsetPtr - 1);
+ case dwarf::DW_RLE_startx_length:
+ return createError("unsupported rnglists encoding DW_RLE_startx_length "
+ "at offset 0x%" PRIx32,
+ *OffsetPtr - 1);
+ case dwarf::DW_RLE_offset_pair: {
+ uint32_t PreviousOffset = *OffsetPtr - 1;
+ Value0 = Data.getULEB128(OffsetPtr);
+ Value1 = Data.getULEB128(OffsetPtr);
+ if (End < *OffsetPtr)
+ return createError("read past end of table when reading "
+ "DW_RLE_offset_pair encoding at offset 0x%" PRIx32,
+ PreviousOffset);
+ break;
+ }
+ case dwarf::DW_RLE_base_address: {
+ if ((End - *OffsetPtr) < Data.getAddressSize())
+ return createError("insufficient space remaining in table for "
+ "DW_RLE_base_address encoding at offset 0x%" PRIx32,
+ *OffsetPtr - 1);
+ Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
+ break;
+ }
+ case dwarf::DW_RLE_start_end: {
+ if ((End - *OffsetPtr) < unsigned(Data.getAddressSize() * 2))
+ return createError("insufficient space remaining in table for "
+ "DW_RLE_start_end encoding "
+ "at offset 0x%" PRIx32,
+ *OffsetPtr - 1);
+ Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
+ Value1 = Data.getRelocatedAddress(OffsetPtr);
+ break;
+ }
+ case dwarf::DW_RLE_start_length: {
+ uint32_t PreviousOffset = *OffsetPtr - 1;
+ Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
+ Value1 = Data.getULEB128(OffsetPtr);
+ if (End < *OffsetPtr)
+ return createError("read past end of table when reading "
+ "DW_RLE_start_length encoding at offset 0x%" PRIx32,
+ PreviousOffset);
+ break;
+ }
+ default:
+ return createError("unknown rnglists encoding 0x%" PRIx32
+ " at offset 0x%" PRIx32,
+ uint32_t(Encoding), *OffsetPtr - 1);
+ }
+
+ EntryKind = Encoding;
+ return Error::success();
+}
+
+DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
+ llvm::Optional<BaseAddress> BaseAddr) const {
+ DWARFAddressRangesVector Res;
+ for (const RangeListEntry &RLE : Entries) {
+ if (RLE.EntryKind == dwarf::DW_RLE_end_of_list)
+ break;
+ if (RLE.EntryKind == dwarf::DW_RLE_base_address) {
+ BaseAddr = {RLE.Value0, RLE.SectionIndex};
+ continue;
+ }
+
+ DWARFAddressRange E;
+ E.SectionIndex = RLE.SectionIndex;
+ if (BaseAddr && E.SectionIndex == -1ULL)
+ E.SectionIndex = BaseAddr->SectionIndex;
+
+ switch (RLE.EntryKind) {
+ case dwarf::DW_RLE_offset_pair:
+ E.LowPC = RLE.Value0;
+ E.HighPC = RLE.Value1;
+ if (BaseAddr) {
+ E.LowPC += BaseAddr->Address;
+ E.HighPC += BaseAddr->Address;
+ }
+ break;
+ case dwarf::DW_RLE_start_end:
+ E.LowPC = RLE.Value0;
+ E.HighPC = RLE.Value1;
+ break;
+ case dwarf::DW_RLE_start_length:
+ E.LowPC = RLE.Value0;
+ E.HighPC = E.LowPC + RLE.Value1;
+ break;
+ default:
+ // Unsupported encodings should have been reported during extraction,
+ // so we should not run into any here.
+ llvm_unreachable("Unsupported range list encoding");
+ }
+ Res.push_back(E);
+ }
+ return Res;
+}
+
+void RangeListEntry::dump(raw_ostream &OS, uint8_t AddrSize,
+ uint8_t MaxEncodingStringLength,
+ uint64_t &CurrentBase, DIDumpOptions DumpOpts) const {
+ auto PrintRawEntry = [](raw_ostream &OS, const RangeListEntry &Entry,
+ uint8_t AddrSize, DIDumpOptions DumpOpts) {
+ if (DumpOpts.Verbose) {
+ DumpOpts.DisplayRawContents = true;
+ DWARFAddressRange(Entry.Value0, Entry.Value1)
+ .dump(OS, AddrSize, DumpOpts);
+ OS << " => ";
+ }
+ };
+
+ if (DumpOpts.Verbose) {
+ // Print the section offset in verbose mode.
+ OS << format("0x%8.8" PRIx32 ":", Offset);
+ auto EncodingString = dwarf::RangeListEncodingString(EntryKind);
+ // Unsupported encodings should have been reported during parsing.
+ assert(!EncodingString.empty() && "Unknown range entry encoding");
+ OS << format(" [%s%*c", EncodingString.data(),
+ MaxEncodingStringLength - EncodingString.size() + 1, ']');
+ if (EntryKind != dwarf::DW_RLE_end_of_list)
+ OS << ": ";
+ }
+
+ switch (EntryKind) {
+ case dwarf::DW_RLE_end_of_list:
+ OS << (DumpOpts.Verbose ? "" : "<End of list>");
+ break;
+ case dwarf::DW_RLE_base_address:
+ // In non-verbose mode we do not print anything for this entry.
+ CurrentBase = Value0;
+ if (!DumpOpts.Verbose)
+ return;
+ OS << format(" 0x%*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
+ break;
+ case dwarf::DW_RLE_start_length:
+ PrintRawEntry(OS, *this, AddrSize, DumpOpts);
+ DWARFAddressRange(Value0, Value0 + Value1).dump(OS, AddrSize, DumpOpts);
+ break;
+ case dwarf::DW_RLE_offset_pair:
+ PrintRawEntry(OS, *this, AddrSize, DumpOpts);
+ DWARFAddressRange(Value0 + CurrentBase, Value1 + CurrentBase)
+ .dump(OS, AddrSize, DumpOpts);
+ break;
+ case dwarf::DW_RLE_start_end:
+ DWARFAddressRange(Value0, Value1).dump(OS, AddrSize, DumpOpts);
+ break;
+ default:
+ llvm_unreachable("Unsupported range list encoding");
+ }
+ OS << "\n";
+}
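
A simplified model of the address resolution implemented by getAbsoluteRanges() above: DW_RLE_offset_pair entries are relative to the most recent DW_RLE_base_address (or the unit's base address), while DW_RLE_start_end and DW_RLE_start_length entries already carry absolute start addresses. The structs and values below are fabricated for illustration and are not the LLVM types.

#include <cstdint>
#include <cstdio>
#include <vector>

enum Kind { BaseAddress, OffsetPair, StartLength, EndOfList };

struct Entry { Kind K; uint64_t V0, V1; };

int main() {
  // A tiny encoded list: set a base, then one base-relative offset pair and
  // one absolute start+length entry.
  std::vector<Entry> List = {
      {BaseAddress, 0x400000, 0},
      {OffsetPair, 0x10, 0x20},     // becomes [0x400010, 0x400020)
      {StartLength, 0x500000, 0x8}, // already absolute: [0x500000, 0x500008)
      {EndOfList, 0, 0}};

  uint64_t Base = 0;
  for (const Entry &E : List) {
    if (E.K == EndOfList)
      break;
    if (E.K == BaseAddress) { Base = E.V0; continue; }
    uint64_t Low, High;
    if (E.K == OffsetPair) { Low = Base + E.V0; High = Base + E.V1; }
    else                   { Low = E.V0;        High = E.V0 + E.V1; }
    std::printf("[0x%llx, 0x%llx)\n", (unsigned long long)Low,
                (unsigned long long)High);
  }
  return 0;
}
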
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 91f0f8501f0c..904ceab7b286 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
-#include "SyntaxHighlighting.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
@@ -22,7 +22,9 @@
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -34,7 +36,6 @@
using namespace llvm;
using namespace dwarf;
using namespace object;
-using namespace syntax;
static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
OS << " (";
@@ -62,13 +63,11 @@ static void dumpRanges(const DWARFObject &Obj, raw_ostream &OS,
if (DumpOpts.Verbose)
SectionNames = Obj.getSectionNames();
- for (size_t I = 0; I < Ranges.size(); ++I) {
- const DWARFAddressRange &R = Ranges[I];
+ for (const DWARFAddressRange &R : Ranges) {
OS << '\n';
OS.indent(Indent);
- OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")", AddressSize * 2,
- R.LowPC, AddressSize * 2, R.HighPC);
+ R.dump(OS, AddressSize);
if (SectionNames.empty() || R.SectionIndex == -1ULL)
continue;
@@ -103,15 +102,18 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
const DWARFSection &LocSection = Obj.getLocSection();
const DWARFSection &LocDWOSection = Obj.getLocDWOSection();
uint32_t Offset = *FormValue.getAsSectionOffset();
-
if (!LocSection.Data.empty()) {
DWARFDebugLoc DebugLoc;
DWARFDataExtractor Data(Obj, LocSection, Ctx.isLittleEndian(),
Obj.getAddressSize());
auto LL = DebugLoc.parseOneLocationList(Data, &Offset);
- if (LL)
- LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, Indent);
- else
+ if (LL) {
+ uint64_t BaseAddr = 0;
+ if (Optional<BaseAddress> BA = U->getBaseAddress())
+ BaseAddr = BA->Address;
+ LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, BaseAddr,
+ Indent);
+ } else
OS << "error extracting location list.";
} else if (!LocDWOSection.Data.empty()) {
DataExtractor Data(LocDWOSection.Data, Ctx.isLittleEndian(), 0);
@@ -191,19 +193,10 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
const char BaseIndent[] = " ";
OS << BaseIndent;
OS.indent(Indent + 2);
- auto attrString = AttributeString(Attr);
- if (!attrString.empty())
- WithColor(OS, syntax::Attribute) << attrString;
- else
- WithColor(OS, syntax::Attribute).get() << format("DW_AT_Unknown_%x", Attr);
+ WithColor(OS, HighlightColor::Attribute) << formatv("{0}", Attr);
- if (DumpOpts.Verbose || DumpOpts.ShowForm) {
- auto formString = FormEncodingString(Form);
- if (!formString.empty())
- OS << " [" << formString << ']';
- else
- OS << format(" [DW_FORM_Unknown_%x]", Form);
- }
+ if (DumpOpts.Verbose || DumpOpts.ShowForm)
+ OS << formatv(" [{0}]", Form);
DWARFUnit *U = Die.getDwarfUnit();
DWARFFormValue formValue(Form);
@@ -216,9 +209,9 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
StringRef Name;
std::string File;
- auto Color = syntax::Enumerator;
+ auto Color = HighlightColor::Enumerator;
if (Attr == DW_AT_decl_file || Attr == DW_AT_call_file) {
- Color = syntax::String;
+ Color = HighlightColor::String;
if (const auto *LT = U->getContext().getLineTableForUnit(U))
if (LT->getFileNameByIndex(
formValue.getAsUnsignedConstant().getValue(),
@@ -267,8 +260,22 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
dumpApplePropertyAttribute(OS, *OptVal);
} else if (Attr == DW_AT_ranges) {
const DWARFObject &Obj = Die.getDwarfUnit()->getContext().getDWARFObj();
- dumpRanges(Obj, OS, Die.getAddressRanges(), U->getAddressByteSize(),
- sizeof(BaseIndent) + Indent + 4, DumpOpts);
+ // For DW_FORM_rnglistx we need to dump the offset separately, since
+ // we have only dumped the index so far.
+ Optional<DWARFFormValue> Value = Die.find(DW_AT_ranges);
+ if (Value && Value->getForm() == DW_FORM_rnglistx)
+ if (auto RangeListOffset =
+ U->getRnglistOffset(*Value->getAsSectionOffset())) {
+ DWARFFormValue FV(dwarf::DW_FORM_sec_offset);
+ FV.setUValue(*RangeListOffset);
+ FV.dump(OS, DumpOpts);
+ }
+ if (auto RangesOrError = Die.getAddressRanges())
+ dumpRanges(Obj, OS, RangesOrError.get(), U->getAddressByteSize(),
+ sizeof(BaseIndent) + Indent + 4, DumpOpts);
+ else
+ WithColor::error() << "decoding address ranges: "
+ << toString(RangesOrError.takeError()) << '\n';
}
OS << ")\n";
@@ -306,18 +313,37 @@ DWARFDie::find(ArrayRef<dwarf::Attribute> Attrs) const {
Optional<DWARFFormValue>
DWARFDie::findRecursively(ArrayRef<dwarf::Attribute> Attrs) const {
- if (!isValid())
- return None;
- if (auto Value = find(Attrs))
- return Value;
- if (auto Die = getAttributeValueAsReferencedDie(DW_AT_abstract_origin)) {
- if (auto Value = Die.findRecursively(Attrs))
- return Value;
- }
- if (auto Die = getAttributeValueAsReferencedDie(DW_AT_specification)) {
- if (auto Value = Die.findRecursively(Attrs))
+ std::vector<DWARFDie> Worklist;
+ Worklist.push_back(*this);
+
+ // Keep track of DIEs already seen to prevent infinite recursion.
+ // Empirically we rarely see a depth of more than 3 when dealing with valid
+ // DWARF. This corresponds to following the DW_AT_abstract_origin and
+ // DW_AT_specification just once.
+ SmallSet<DWARFDie, 3> Seen;
+
+ while (!Worklist.empty()) {
+ DWARFDie Die = Worklist.back();
+ Worklist.pop_back();
+
+ if (!Die.isValid())
+ continue;
+
+ if (Seen.count(Die))
+ continue;
+
+ Seen.insert(Die);
+
+ if (auto Value = Die.find(Attrs))
return Value;
+
+ if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin))
+ Worklist.push_back(D);
+
+ if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification))
+ Worklist.push_back(D);
}
+
return None;
}
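
The rewritten findRecursively() walks the DW_AT_abstract_origin and DW_AT_specification links with an explicit worklist plus a seen-set, so malformed DWARF containing a reference cycle can no longer recurse without bound. A minimal sketch of that traversal shape over an invented two-node graph (Node and Origin are illustrative, not DWARF types):

#include <cstdio>
#include <set>
#include <vector>

struct Node {
  int Value;          // stands in for the attribute being searched (-1 = absent)
  const Node *Origin; // stands in for DW_AT_abstract_origin / DW_AT_specification
};

static int findValue(const Node *Start) {
  std::vector<const Node *> Worklist{Start};
  std::set<const Node *> Seen;
  while (!Worklist.empty()) {
    const Node *N = Worklist.back();
    Worklist.pop_back();
    if (!N || !Seen.insert(N).second) // skip null and already-visited nodes
      continue;
    if (N->Value != -1)
      return N->Value;
    Worklist.push_back(N->Origin);
  }
  return -1;
}

int main() {
  Node A{-1, nullptr}, B{-1, &A};
  A.Origin = &B; // deliberate cycle: A -> B -> A
  B.Value = 7;
  std::printf("%d\n", findValue(&A)); // terminates and prints 7
  return 0;
}
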
@@ -363,20 +389,19 @@ bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC,
return false;
}
-DWARFAddressRangesVector DWARFDie::getAddressRanges() const {
+Expected<DWARFAddressRangesVector> DWARFDie::getAddressRanges() const {
if (isNULL())
return DWARFAddressRangesVector();
// Single range specified by low/high PC.
uint64_t LowPC, HighPC, Index;
if (getLowAndHighPC(LowPC, HighPC, Index))
- return {{LowPC, HighPC, Index}};
-
- // Multiple ranges from .debug_ranges section.
- auto RangesOffset = toSectionOffset(find(DW_AT_ranges));
- if (RangesOffset) {
- DWARFDebugRangeList RangeList;
- if (U->extractRangeList(*RangesOffset, RangeList))
- return RangeList.getAbsoluteRanges(U->getBaseAddress());
+ return DWARFAddressRangesVector{{LowPC, HighPC, Index}};
+
+ Optional<DWARFFormValue> Value = find(DW_AT_ranges);
+ if (Value) {
+ if (Value->getForm() == DW_FORM_rnglistx)
+ return U->findRnglistFromIndex(*Value->getAsSectionOffset());
+ return U->findRnglistFromOffset(*Value->getAsSectionOffset());
}
return DWARFAddressRangesVector();
}
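
getAddressRanges() now returns Expected<DWARFAddressRangesVector>, so every caller has to test the result and either use the contained value or take the Error (the hunks below show both reporting and consumeError). A generic sketch of the Expected<T> idiom, using a toy mightFail() rather than the DWARF API:

#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static Expected<int> mightFail(bool Fail) {
  if (Fail)
    return make_error<StringError>("decoding failed",
                                   inconvertibleErrorCode());
  return 42;
}

int main() {
  // Success path: test the Expected, then dereference it.
  if (Expected<int> V = mightFail(false))
    outs() << "value = " << *V << '\n';
  else
    // Failure path: the error must be consumed, e.g. reported or dropped.
    logAllUnhandledErrors(V.takeError(), errs(), "error: ");
  return 0;
}
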
@@ -386,8 +411,11 @@ void DWARFDie::collectChildrenAddressRanges(
if (isNULL())
return;
if (isSubprogramDIE()) {
- const auto &DIERanges = getAddressRanges();
- Ranges.insert(Ranges.end(), DIERanges.begin(), DIERanges.end());
+ if (auto DIERangesOrError = getAddressRanges())
+ Ranges.insert(Ranges.end(), DIERangesOrError.get().begin(),
+ DIERangesOrError.get().end());
+ else
+ llvm::consumeError(DIERangesOrError.takeError());
}
for (auto Child : children())
@@ -395,10 +423,15 @@ void DWARFDie::collectChildrenAddressRanges(
}
bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const {
- for (const auto &R : getAddressRanges()) {
+ auto RangesOrError = getAddressRanges();
+ if (!RangesOrError) {
+ llvm::consumeError(RangesOrError.takeError());
+ return false;
+ }
+
+ for (const auto &R : RangesOrError.get())
if (R.LowPC <= Address && Address < R.HighPC)
return true;
- }
return false;
}
@@ -454,25 +487,23 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent,
const uint32_t Offset = getOffset();
uint32_t offset = Offset;
if (DumpOpts.ShowParents) {
- DumpOpts.ShowParents = false;
- Indent = dumpParentChain(getParent(), OS, Indent, DumpOpts);
+ DIDumpOptions ParentDumpOpts = DumpOpts;
+ ParentDumpOpts.ShowParents = false;
+ ParentDumpOpts.ShowChildren = false;
+ Indent = dumpParentChain(getParent(), OS, Indent, ParentDumpOpts);
}
if (debug_info_data.isValidOffset(offset)) {
uint32_t abbrCode = debug_info_data.getULEB128(&offset);
if (DumpOpts.ShowAddresses)
- WithColor(OS, syntax::Address).get() << format("\n0x%8.8x: ", Offset);
+ WithColor(OS, HighlightColor::Address).get()
+ << format("\n0x%8.8x: ", Offset);
if (abbrCode) {
auto AbbrevDecl = getAbbreviationDeclarationPtr();
if (AbbrevDecl) {
- auto tagString = TagString(getTag());
- if (!tagString.empty())
- WithColor(OS, syntax::Tag).get().indent(Indent) << tagString;
- else
- WithColor(OS, syntax::Tag).get().indent(Indent)
- << format("DW_TAG_Unknown_%x", getTag());
-
+ WithColor(OS, HighlightColor::Tag).get().indent(Indent)
+ << formatv("{0}", getTag());
if (DumpOpts.Verbose)
OS << format(" [%u] %c", abbrCode,
AbbrevDecl->hasChildren() ? '*' : ' ');
@@ -493,8 +524,10 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent,
DWARFDie child = getFirstChild();
if (DumpOpts.ShowChildren && DumpOpts.RecurseDepth > 0 && child) {
DumpOpts.RecurseDepth--;
+ DIDumpOptions ChildDumpOpts = DumpOpts;
+ ChildDumpOpts.ShowParents = false;
while (child) {
- child.dump(OS, Indent + 2, DumpOpts);
+ child.dump(OS, Indent + 2, ChildDumpOpts);
child = child.getSibling();
}
}
@@ -522,12 +555,24 @@ DWARFDie DWARFDie::getSibling() const {
return DWARFDie();
}
+DWARFDie DWARFDie::getPreviousSibling() const {
+ if (isValid())
+ return U->getPreviousSibling(Die);
+ return DWARFDie();
+}
+
DWARFDie DWARFDie::getFirstChild() const {
if (isValid())
return U->getFirstChild(Die);
return DWARFDie();
}
+DWARFDie DWARFDie::getLastChild() const {
+ if (isValid())
+ return U->getLastChild(Die);
+ return DWARFDie();
+}
+
iterator_range<DWARFDie::attribute_iterator> DWARFDie::attributes() const {
return make_range(attribute_iterator(*this, false),
attribute_iterator(*this, true));
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
index c704c2901aef..a9ea26c476ca 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
@@ -258,9 +258,10 @@ bool DWARFExpression::Operation::print(raw_ostream &OS,
return true;
}
-void DWARFExpression::print(raw_ostream &OS, const MCRegisterInfo *RegInfo) {
+void DWARFExpression::print(raw_ostream &OS, const MCRegisterInfo *RegInfo,
+ bool IsEH) const {
for (auto &Op : *this) {
- if (!Op.print(OS, this, RegInfo, /* isEH */ false)) {
+ if (!Op.print(OS, this, RegInfo, IsEH)) {
uint32_t FailOffset = Op.getEndOffset();
while (FailOffset < Data.getData().size())
OS << format(" %02x", Data.getU8(&FailOffset));
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index 44886de2e3d5..1aa43c6b6517 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -8,7 +8,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "SyntaxHighlighting.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
@@ -19,6 +18,7 @@
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <cinttypes>
#include <cstdint>
@@ -26,9 +26,8 @@
using namespace llvm;
using namespace dwarf;
-using namespace syntax;
-static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
+static const DWARFFormValue::FormClass DWARF5FormClasses[] = {
DWARFFormValue::FC_Unknown, // 0x0
DWARFFormValue::FC_Address, // 0x01 DW_FORM_addr
DWARFFormValue::FC_Unknown, // 0x02 unused
@@ -57,96 +56,31 @@ static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
DWARFFormValue::FC_Exprloc, // 0x18 DW_FORM_exprloc
DWARFFormValue::FC_Flag, // 0x19 DW_FORM_flag_present
-};
-
-Optional<uint8_t>
-DWARFFormValue::getFixedByteSize(dwarf::Form Form,
- const DWARFFormParams Params) {
- switch (Form) {
- case DW_FORM_addr:
- assert(Params.Version && Params.AddrSize && "Invalid Params for form");
- return Params.AddrSize;
-
- case DW_FORM_block: // ULEB128 length L followed by L bytes.
- case DW_FORM_block1: // 1 byte length L followed by L bytes.
- case DW_FORM_block2: // 2 byte length L followed by L bytes.
- case DW_FORM_block4: // 4 byte length L followed by L bytes.
- case DW_FORM_string: // C-string with null terminator.
- case DW_FORM_sdata: // SLEB128.
- case DW_FORM_udata: // ULEB128.
- case DW_FORM_ref_udata: // ULEB128.
- case DW_FORM_indirect: // ULEB128.
- case DW_FORM_exprloc: // ULEB128 length L followed by L bytes.
- case DW_FORM_strx: // ULEB128.
- case DW_FORM_addrx: // ULEB128.
- case DW_FORM_loclistx: // ULEB128.
- case DW_FORM_rnglistx: // ULEB128.
- case DW_FORM_GNU_addr_index: // ULEB128.
- case DW_FORM_GNU_str_index: // ULEB128.
- return None;
-
- case DW_FORM_ref_addr:
- assert(Params.Version && Params.AddrSize && "Invalid Params for form");
- return Params.getRefAddrByteSize();
-
- case DW_FORM_flag:
- case DW_FORM_data1:
- case DW_FORM_ref1:
- case DW_FORM_strx1:
- case DW_FORM_addrx1:
- return 1;
-
- case DW_FORM_data2:
- case DW_FORM_ref2:
- case DW_FORM_strx2:
- case DW_FORM_addrx2:
- return 2;
-
- case DW_FORM_strx3:
- return 3;
-
- case DW_FORM_data4:
- case DW_FORM_ref4:
- case DW_FORM_ref_sup4:
- case DW_FORM_strx4:
- case DW_FORM_addrx4:
- return 4;
-
- case DW_FORM_strp:
- case DW_FORM_GNU_ref_alt:
- case DW_FORM_GNU_strp_alt:
- case DW_FORM_line_strp:
- case DW_FORM_sec_offset:
- case DW_FORM_strp_sup:
- assert(Params.Version && Params.AddrSize && "Invalid Params for form");
- return Params.getDwarfOffsetByteSize();
-
- case DW_FORM_data8:
- case DW_FORM_ref8:
- case DW_FORM_ref_sig8:
- case DW_FORM_ref_sup8:
- return 8;
-
- case DW_FORM_flag_present:
- return 0;
+ DWARFFormValue::FC_String, // 0x1a DW_FORM_strx
+ DWARFFormValue::FC_Address, // 0x1b DW_FORM_addrx
+ DWARFFormValue::FC_Reference, // 0x1c DW_FORM_ref_sup4
+ DWARFFormValue::FC_String, // 0x1d DW_FORM_strp_sup
+ DWARFFormValue::FC_Constant, // 0x1e DW_FORM_data16
+ DWARFFormValue::FC_String, // 0x1f DW_FORM_line_strp
+ DWARFFormValue::FC_Reference, // 0x20 DW_FORM_ref_sig8
+ DWARFFormValue::FC_Constant, // 0x21 DW_FORM_implicit_const
+ DWARFFormValue::FC_SectionOffset, // 0x22 DW_FORM_loclistx
+ DWARFFormValue::FC_SectionOffset, // 0x23 DW_FORM_rnglistx
+ DWARFFormValue::FC_Reference, // 0x24 DW_FORM_ref_sup8
+ DWARFFormValue::FC_String, // 0x25 DW_FORM_strx1
+ DWARFFormValue::FC_String, // 0x26 DW_FORM_strx2
+ DWARFFormValue::FC_String, // 0x27 DW_FORM_strx3
+ DWARFFormValue::FC_String, // 0x28 DW_FORM_strx4
+ DWARFFormValue::FC_Address, // 0x29 DW_FORM_addrx1
+ DWARFFormValue::FC_Address, // 0x2a DW_FORM_addrx2
+ DWARFFormValue::FC_Address, // 0x2b DW_FORM_addrx3
+ DWARFFormValue::FC_Address, // 0x2c DW_FORM_addrx4
- case DW_FORM_data16:
- return 16;
-
- case DW_FORM_implicit_const:
- // The implicit value is stored in the abbreviation as a SLEB128, and
- // there no data in debug info.
- return 0;
-
- default:
- llvm_unreachable("Handle this form in this switch statement");
- }
- return None;
-}
+};
bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData,
uint32_t *OffsetPtr,
- const DWARFFormParams Params) {
+ const dwarf::FormParams Params) {
bool Indirect = false;
do {
switch (Form) {
@@ -208,7 +142,7 @@ bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData,
case DW_FORM_GNU_ref_alt:
case DW_FORM_GNU_strp_alt:
if (Optional<uint8_t> FixedSize =
- DWARFFormValue::getFixedByteSize(Form, Params)) {
+ dwarf::getFixedFormByteSize(Form, Params)) {
*OffsetPtr += *FixedSize;
return true;
}
@@ -243,42 +177,38 @@ bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData,
}
bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
- // First, check DWARF4 form classes.
- if (Form < makeArrayRef(DWARF4FormClasses).size() &&
- DWARF4FormClasses[Form] == FC)
+ // First, check DWARF5 form classes.
+ if (Form < makeArrayRef(DWARF5FormClasses).size() &&
+ DWARF5FormClasses[Form] == FC)
return true;
- // Check more forms from DWARF4 and DWARF5 proposals.
+ // Check more forms from extensions and proposals.
switch (Form) {
- case DW_FORM_ref_sig8:
case DW_FORM_GNU_ref_alt:
return (FC == FC_Reference);
case DW_FORM_GNU_addr_index:
return (FC == FC_Address);
case DW_FORM_GNU_str_index:
case DW_FORM_GNU_strp_alt:
- case DW_FORM_strx:
- case DW_FORM_strx1:
- case DW_FORM_strx2:
- case DW_FORM_strx3:
- case DW_FORM_strx4:
return (FC == FC_String);
- case DW_FORM_implicit_const:
- return (FC == FC_Constant);
default:
break;
}
// In DWARF3 DW_FORM_data4 and DW_FORM_data8 served also as a section offset.
// Don't check for DWARF version here, as some producers may still do this
- // by mistake. Also accept DW_FORM_strp since this is .debug_str section
- // offset.
+ // by mistake. Also accept DW_FORM_[line_]strp since these are
+ // .debug_[line_]str section offsets.
return (Form == DW_FORM_data4 || Form == DW_FORM_data8 ||
- Form == DW_FORM_strp) &&
+ Form == DW_FORM_strp || Form == DW_FORM_line_strp) &&
FC == FC_SectionOffset;
}
bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
- uint32_t *OffsetPtr, DWARFFormParams FP,
+ uint32_t *OffsetPtr, dwarf::FormParams FP,
+ const DWARFContext *Ctx,
const DWARFUnit *CU) {
+ if (!Ctx && CU)
+ Ctx = &CU->getContext();
+ C = Ctx;
U = CU;
bool Indirect = false;
bool IsBlock = false;
@@ -350,6 +280,7 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
break;
case DW_FORM_udata:
case DW_FORM_ref_udata:
+ case DW_FORM_rnglistx:
Value.uval = Data.getULEB128(OffsetPtr);
break;
case DW_FORM_string:
@@ -402,8 +333,9 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
uint64_t UValue = Value.uval;
bool CURelativeOffset = false;
- raw_ostream &AddrOS =
- DumpOpts.ShowAddresses ? WithColor(OS, syntax::Address).get() : nulls();
+ raw_ostream &AddrOS = DumpOpts.ShowAddresses
+ ? WithColor(OS, HighlightColor::Address).get()
+ : nulls();
switch (Form) {
case DW_FORM_addr:
AddrOS << format("0x%016" PRIx64, UValue);
@@ -494,6 +426,11 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)UValue);
dumpString(OS);
break;
+ case DW_FORM_line_strp:
+ if (DumpOpts.Verbose)
+ OS << format(" .debug_line_str[0x%8.8x] = ", (uint32_t)UValue);
+ dumpString(OS);
+ break;
case DW_FORM_strx:
case DW_FORM_strx1:
case DW_FORM_strx2:
@@ -514,23 +451,28 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
break;
case DW_FORM_ref1:
CURelativeOffset = true;
- AddrOS << format("cu + 0x%2.2x", (uint8_t)UValue);
+ if (DumpOpts.Verbose)
+ AddrOS << format("cu + 0x%2.2x", (uint8_t)UValue);
break;
case DW_FORM_ref2:
CURelativeOffset = true;
- AddrOS << format("cu + 0x%4.4x", (uint16_t)UValue);
+ if (DumpOpts.Verbose)
+ AddrOS << format("cu + 0x%4.4x", (uint16_t)UValue);
break;
case DW_FORM_ref4:
CURelativeOffset = true;
- AddrOS << format("cu + 0x%4.4x", (uint32_t)UValue);
+ if (DumpOpts.Verbose)
+ AddrOS << format("cu + 0x%4.4x", (uint32_t)UValue);
break;
case DW_FORM_ref8:
CURelativeOffset = true;
- AddrOS << format("cu + 0x%8.8" PRIx64, UValue);
+ if (DumpOpts.Verbose)
+ AddrOS << format("cu + 0x%8.8" PRIx64, UValue);
break;
case DW_FORM_ref_udata:
CURelativeOffset = true;
- AddrOS << format("cu + 0x%" PRIx64, UValue);
+ if (DumpOpts.Verbose)
+ AddrOS << format("cu + 0x%" PRIx64, UValue);
break;
case DW_FORM_GNU_ref_alt:
AddrOS << format("<alt 0x%" PRIx64 ">", UValue);
@@ -542,6 +484,10 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
OS << "DW_FORM_indirect";
break;
+ case DW_FORM_rnglistx:
+ OS << format("indexed (0x%x) rangelist = ", (uint32_t)UValue);
+ break;
+
// Should be formatted to 64-bit for DWARF64.
case DW_FORM_sec_offset:
AddrOS << format("0x%08x", (uint32_t)UValue);
@@ -552,21 +498,23 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
break;
}
- if (CURelativeOffset && DumpOpts.Verbose) {
- OS << " => {";
- WithColor(OS, syntax::Address).get()
+ if (CURelativeOffset) {
+ if (DumpOpts.Verbose)
+ OS << " => {";
+ WithColor(OS, HighlightColor::Address).get()
<< format("0x%8.8" PRIx64, UValue + (U ? U->getOffset() : 0));
- OS << "}";
+ if (DumpOpts.Verbose)
+ OS << "}";
}
}
void DWARFFormValue::dumpString(raw_ostream &OS) const {
Optional<const char *> DbgStr = getAsCString();
if (DbgStr.hasValue()) {
- raw_ostream &COS = WithColor(OS, syntax::String);
- COS << '"';
- COS.write_escaped(DbgStr.getValue());
- COS << '"';
+ auto COS = WithColor(OS, HighlightColor::String);
+ COS.get() << '"';
+ COS.get().write_escaped(DbgStr.getValue());
+ COS.get() << '"';
}
}
@@ -576,20 +524,32 @@ Optional<const char *> DWARFFormValue::getAsCString() const {
if (Form == DW_FORM_string)
return Value.cstr;
// FIXME: Add support for DW_FORM_GNU_strp_alt
- if (Form == DW_FORM_GNU_strp_alt || U == nullptr)
+ if (Form == DW_FORM_GNU_strp_alt || C == nullptr)
return None;
uint32_t Offset = Value.uval;
+ if (Form == DW_FORM_line_strp) {
+ // .debug_line_str is tracked in the Context.
+ if (const char *Str = C->getLineStringExtractor().getCStr(&Offset))
+ return Str;
+ return None;
+ }
if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx ||
Form == DW_FORM_strx1 || Form == DW_FORM_strx2 || Form == DW_FORM_strx3 ||
Form == DW_FORM_strx4) {
uint64_t StrOffset;
- if (!U->getStringOffsetSectionItem(Offset, StrOffset))
+ if (!U || !U->getStringOffsetSectionItem(Offset, StrOffset))
return None;
Offset = StrOffset;
}
- if (const char *Str = U->getStringExtractor().getCStr(&Offset)) {
- return Str;
+ // Prefer the Unit's string extractor, because for .dwo it will point to
+ // .debug_str.dwo, while the Context's extractor always uses .debug_str.
+ if (U) {
+ if (const char *Str = U->getStringExtractor().getCStr(&Offset))
+ return Str;
+ return None;
}
+ if (const char *Str = C->getStringExtractor().getCStr(&Offset))
+ return Str;
return None;
}
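
A simplified model of the lookup order in getAsCString() above: when a unit is attached, its string extractor is used (for split DWARF this is .debug_str.dwo), and only a form value with no unit falls back to the context-wide .debug_str extractor. The maps below stand in for those sections and are invented for illustration.

#include <cstdio>
#include <map>
#include <string>

static const char *resolve(const std::map<unsigned, std::string> *UnitStrs,
                           const std::map<unsigned, std::string> &CtxStrs,
                           unsigned Offset) {
  if (UnitStrs) { // e.g. .debug_str.dwo for a split unit
    auto It = UnitStrs->find(Offset);
    return It == UnitStrs->end() ? nullptr : It->second.c_str();
  }
  auto It = CtxStrs.find(Offset); // e.g. .debug_str
  return It == CtxStrs.end() ? nullptr : It->second.c_str();
}

int main() {
  std::map<unsigned, std::string> Ctx = {{0, "main"}};
  std::map<unsigned, std::string> Unit = {{0, "main.dwo"}};
  std::printf("%s\n", resolve(&Unit, Ctx, 0));   // prints "main.dwo"
  std::printf("%s\n", resolve(nullptr, Ctx, 0)); // prints "main"
  return 0;
}
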
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
new file mode 100644
index 000000000000..559afc7559bd
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
@@ -0,0 +1,109 @@
+//===- DWARFListTable.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFListTable.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+template <typename... Ts>
+static Error createError(char const *Fmt, const Ts &... Vals) {
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ Stream << format(Fmt, Vals...);
+ return make_error<StringError>(Stream.str(), inconvertibleErrorCode());
+}
+
+Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
+ uint32_t *OffsetPtr) {
+ HeaderOffset = *OffsetPtr;
+ // Read and verify the length field.
+ if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t)))
+ return createError("section is not large enough to contain a "
+ "%s table length at offset 0x%" PRIx32,
+ SectionName.data(), *OffsetPtr);
+ // TODO: Add support for DWARF64.
+ HeaderData.Length = Data.getU32(OffsetPtr);
+ if (HeaderData.Length == 0xffffffffu)
+ return createError("DWARF64 is not supported in %s at offset 0x%" PRIx32,
+ SectionName.data(), HeaderOffset);
+ Format = dwarf::DwarfFormat::DWARF32;
+ if (HeaderData.Length + sizeof(uint32_t) < sizeof(Header))
+ return createError("%s table at offset 0x%" PRIx32
+ " has too small length (0x%" PRIx32
+ ") to contain a complete header",
+ SectionName.data(), HeaderOffset, length());
+ uint32_t End = HeaderOffset + length();
+ if (!Data.isValidOffsetForDataOfSize(HeaderOffset, End - HeaderOffset))
+ return createError("section is not large enough to contain a %s table "
+ "of length 0x%" PRIx32 " at offset 0x%" PRIx32,
+ SectionName.data(), length(), HeaderOffset);
+
+ HeaderData.Version = Data.getU16(OffsetPtr);
+ HeaderData.AddrSize = Data.getU8(OffsetPtr);
+ HeaderData.SegSize = Data.getU8(OffsetPtr);
+ HeaderData.OffsetEntryCount = Data.getU32(OffsetPtr);
+
+ // Perform basic validation of the remaining header fields.
+ if (HeaderData.Version != 5)
+ return createError("unrecognised %s table version %" PRIu16
+ " in table at offset 0x%" PRIx32,
+ SectionName.data(), HeaderData.Version, HeaderOffset);
+ if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)
+ return createError("%s table at offset 0x%" PRIx32
+ " has unsupported address size %hhu",
+ SectionName.data(), HeaderOffset, HeaderData.AddrSize);
+ if (HeaderData.SegSize != 0)
+ return createError("%s table at offset 0x%" PRIx32
+ " has unsupported segment selector size %" PRIu8,
+ SectionName.data(), HeaderOffset, HeaderData.SegSize);
+ if (End < HeaderOffset + sizeof(HeaderData) +
+ HeaderData.OffsetEntryCount * sizeof(uint32_t))
+ return createError(
+ "%s table at offset 0x%" PRIx32 " has more offset entries (%" PRIu32
+ ") than there is space for",
+ SectionName.data(), HeaderOffset, HeaderData.OffsetEntryCount);
+ Data.setAddressSize(HeaderData.AddrSize);
+ for (uint32_t I = 0; I < HeaderData.OffsetEntryCount; ++I)
+ Offsets.push_back(Data.getU32(OffsetPtr));
+ return Error::success();
+}
+
+void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
+ if (DumpOpts.Verbose)
+ OS << format("0x%8.8" PRIx32 ": ", HeaderOffset);
+ OS << format(
+ "%s list header: length = 0x%8.8" PRIx32 ", version = 0x%4.4" PRIx16 ", "
+ "addr_size = 0x%2.2" PRIx8 ", seg_size = 0x%2.2" PRIx8
+ ", offset_entry_count = "
+ "0x%8.8" PRIx32 "\n",
+ ListTypeString.data(), HeaderData.Length, HeaderData.Version,
+ HeaderData.AddrSize, HeaderData.SegSize, HeaderData.OffsetEntryCount);
+
+ if (HeaderData.OffsetEntryCount > 0) {
+ OS << "offsets: [";
+ for (const auto &Off : Offsets) {
+ OS << format("\n0x%8.8" PRIx32, Off);
+ if (DumpOpts.Verbose)
+ OS << format(" => 0x%8.8" PRIx32,
+ Off + HeaderOffset + sizeof(HeaderData));
+ }
+ OS << "\n]\n";
+ }
+}
+
+uint32_t DWARFListTableHeader::length() const {
+ if (HeaderData.Length == 0)
+ return 0;
+ // TODO: DWARF64 support.
+ return HeaderData.Length + sizeof(uint32_t);
+}
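
The table header that extract() above reads is, for DWARF32, a fixed 12-byte prefix: a 4-byte length (not counting the length field itself), a 2-byte version (5), a 1-byte address size, a 1-byte segment selector size, and a 4-byte offset entry count. That is why length() adds sizeof(uint32_t) and why the unit code later rewinds by 12 bytes to find the header. A stand-alone sketch decoding that layout from a fabricated little-endian buffer, with plain memcpy in place of DWARFDataExtractor:

#include <cstdint>
#include <cstdio>
#include <cstring>

struct ListTableHeader {
  uint32_t Length;          // unit length, excluding this field itself
  uint16_t Version;         // must be 5
  uint8_t AddrSize;         // 4 or 8
  uint8_t SegSize;          // must be 0
  uint32_t OffsetEntryCount;
};

static ListTableHeader decode(const uint8_t *P) {
  ListTableHeader H;
  std::memcpy(&H.Length, P + 0, 4); // assumes a little-endian host
  std::memcpy(&H.Version, P + 4, 2);
  H.AddrSize = P[6];
  H.SegSize = P[7];
  std::memcpy(&H.OffsetEntryCount, P + 8, 4);
  return H;
}

int main() {
  // length = 0x1c, version = 5, addr_size = 8, seg_size = 0, 2 offset entries.
  const uint8_t Bytes[12] = {0x1c, 0, 0, 0, 0x05, 0, 0x08, 0, 0x02, 0, 0, 0};
  ListTableHeader H = decode(Bytes);
  std::printf("length=0x%x version=%u addr_size=%u entries=%u\n",
              (unsigned)H.Length, (unsigned)H.Version, (unsigned)H.AddrSize,
              (unsigned)H.OffsetEntryCount);
  // Total bytes occupied by the table = Length + 4 (the length field itself).
  std::printf("table size = %u bytes\n", (unsigned)(H.Length + 4));
  return 0;
}
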
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index 206c12fa403f..00be75e1a94d 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -18,26 +18,13 @@
using namespace llvm;
-bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
- uint32_t *offset_ptr) {
- if (!DWARFUnit::extractImpl(debug_info, offset_ptr))
- return false;
- TypeHash = debug_info.getU64(offset_ptr);
- TypeOffset = debug_info.getU32(offset_ptr);
- // TypeOffset is relative to the beginning of the header,
- // so we have to account for the leading length field.
- // FIXME: The size of the length field is 12 in DWARF64.
- unsigned SizeOfLength = 4;
- return TypeOffset < getLength() + SizeOfLength;
-}
-
void DWARFTypeUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
- DWARFDie TD = getDIEForOffset(TypeOffset + getOffset());
+ DWARFDie TD = getDIEForOffset(getTypeOffset() + getOffset());
const char *Name = TD.getName(DINameKind::ShortName);
if (DumpOpts.SummarizeTypes) {
OS << "name = '" << Name << "'"
- << " type_signature = " << format("0x%016" PRIx64, TypeHash)
+ << " type_signature = " << format("0x%016" PRIx64, getTypeHash())
<< " length = " << format("0x%08x", getLength()) << '\n';
return;
}
@@ -50,8 +37,8 @@ void DWARFTypeUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
<< " addr_size = " << format("0x%02x", getAddressByteSize())
<< " name = '" << Name << "'"
- << " type_signature = " << format("0x%016" PRIx64, TypeHash)
- << " type_offset = " << format("0x%04x", TypeOffset)
+ << " type_signature = " << format("0x%016" PRIx64, getTypeHash())
+ << " type_offset = " << format("0x%04x", getTypeOffset())
<< " (next unit at " << format("0x%08x", getNextUnitOffset()) << ")\n";
if (DWARFDie TU = getUnitDIE(false))
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
index df55d7debf92..3b408857d29f 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -8,17 +8,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/WithColor.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -32,7 +33,7 @@ using namespace dwarf;
void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
const DWARFObject &D = C.getDWARFObj();
- parseImpl(C, Section, C.getDebugAbbrev(), &D.getRangeSection(),
+ parseImpl(C, D, Section, C.getDebugAbbrev(), &D.getRangeSection(),
D.getStringSection(), D.getStringOffsetSection(),
&D.getAddrSection(), D.getLineSection(), D.isLittleEndian(), false,
false);
@@ -41,22 +42,22 @@ void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
const DWARFSection &DWOSection, bool Lazy) {
const DWARFObject &D = C.getDWARFObj();
- parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), &D.getRangeDWOSection(),
+ parseImpl(C, D, DWOSection, C.getDebugAbbrevDWO(), &D.getRangeDWOSection(),
D.getStringDWOSection(), D.getStringOffsetDWOSection(),
&D.getAddrSection(), D.getLineDWOSection(), C.isLittleEndian(),
true, Lazy);
}
DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
+ const DWARFUnitHeader &Header,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
StringRef SS, const DWARFSection &SOS,
const DWARFSection *AOS, const DWARFSection &LS, bool LE,
- bool IsDWO, const DWARFUnitSectionBase &UnitSection,
- const DWARFUnitIndex::Entry *IndexEntry)
- : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS),
- LineSection(LS), StringSection(SS), StringOffsetSection(SOS),
- AddrOffsetSection(AOS), isLittleEndian(LE), isDWO(IsDWO),
- UnitSection(UnitSection), IndexEntry(IndexEntry) {
+ bool IsDWO, const DWARFUnitSectionBase &UnitSection)
+ : Context(DC), InfoSection(Section), Header(Header), Abbrev(DA),
+ RangeSection(RS), LineSection(LS), StringSection(SS),
+ StringOffsetSection(SOS), AddrOffsetSection(AOS), isLittleEndian(LE),
+ isDWO(IsDWO), UnitSection(UnitSection) {
clear();
}
@@ -92,9 +93,16 @@ bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
return true;
}
-bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
+bool DWARFUnitHeader::extract(DWARFContext &Context,
+ const DWARFDataExtractor &debug_info,
+ uint32_t *offset_ptr,
+ DWARFSectionKind SectionKind,
+ const DWARFUnitIndex *Index) {
+ Offset = *offset_ptr;
+ IndexEntry = Index ? Index->getFromOffset(*offset_ptr) : nullptr;
Length = debug_info.getU32(offset_ptr);
// FIXME: Support DWARF64.
+ unsigned SizeOfLength = 4;
FormParams.Format = DWARF32;
FormParams.Version = debug_info.getU16(offset_ptr);
if (FormParams.Version >= 5) {
@@ -102,8 +110,14 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
FormParams.AddrSize = debug_info.getU8(offset_ptr);
AbbrOffset = debug_info.getU32(offset_ptr);
} else {
- AbbrOffset = debug_info.getU32(offset_ptr);
+ AbbrOffset = debug_info.getRelocatedValue(4, offset_ptr);
FormParams.AddrSize = debug_info.getU8(offset_ptr);
+ // Fake a unit type based on the section type. This isn't perfect,
+ // but distinguishing compile and type units is generally enough.
+ if (SectionKind == DW_SECT_TYPES)
+ UnitType = DW_UT_type;
+ else
+ UnitType = DW_UT_compile;
}
if (IndexEntry) {
if (AbbrOffset)
@@ -116,12 +130,27 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
return false;
AbbrOffset = AbbrEntry->Offset;
}
-
+ if (isTypeUnit()) {
+ TypeHash = debug_info.getU64(offset_ptr);
+ TypeOffset = debug_info.getU32(offset_ptr);
+ } else if (UnitType == DW_UT_split_compile || UnitType == DW_UT_skeleton)
+ DWOId = debug_info.getU64(offset_ptr);
+
+ // Header fields are all parsed; capture the size of this unit header.
+ assert(*offset_ptr - Offset <= 255 && "unexpected header size");
+ Size = uint8_t(*offset_ptr - Offset);
+
+ // The type offset is unit-relative; it should fall after the header and
+ // before the end of the current unit.
+ bool TypeOffsetOK =
+ !isTypeUnit()
+ ? true
+ : TypeOffset >= Size && TypeOffset < getLength() + SizeOfLength;
bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
bool VersionOK = DWARFContext::isSupportedVersion(getVersion());
bool AddrSizeOK = getAddressByteSize() == 4 || getAddressByteSize() == 8;
- if (!LengthOK || !VersionOK || !AddrSizeOK)
+ if (!LengthOK || !VersionOK || !AddrSizeOK || !TypeOffsetOK)
return false;
// Keep track of the highest DWARF version we encounter across all units.
@@ -129,24 +158,31 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
return true;
}
-bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
- clear();
-
- Offset = *offset_ptr;
-
- if (debug_info.isValidOffset(*offset_ptr)) {
- if (extractImpl(debug_info, offset_ptr))
- return true;
-
- // reset the offset to where we tried to parse from if anything went wrong
- *offset_ptr = Offset;
+// Parse the rangelist table header, including the optional array of offsets
+// following it (DWARF v5 and later).
+static Expected<DWARFDebugRnglistTable>
+parseRngListTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
+ // TODO: Support DWARF64
+ // We are expected to be called with Offset 0 or pointing just past the table
+ // header, which is 12 bytes long for DWARF32.
+ if (Offset > 0) {
+ if (Offset < 12U) {
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ Stream << format(
+ "Did not detect a valid range list table with base = 0x%x", Offset);
+ return make_error<StringError>(Stream.str(), inconvertibleErrorCode());
+ }
+ Offset -= 12U;
}
-
- return false;
+ llvm::DWARFDebugRnglistTable Table;
+ if (Error E = Table.extractHeaderAndOffsets(DA, &Offset))
+ return std::move(E);
+ return Table;
}
-bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
- DWARFDebugRangeList &RangeList) const {
+Error DWARFUnit::extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const {
// Require that compile unit is extracted.
assert(!DieArray.empty());
DWARFDataExtractor RangesData(Context.getDWARFObj(), *RangeSection,
@@ -156,10 +192,7 @@ bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
}
void DWARFUnit::clear() {
- Offset = 0;
- Length = 0;
Abbrevs = nullptr;
- FormParams = DWARFFormParams({0, 0, DWARF32});
BaseAddr.reset();
RangeSectionBase = 0;
AddrOffsetSectionBase = 0;
@@ -171,10 +204,6 @@ const char *DWARFUnit::getCompilationDir() {
return dwarf::toString(getUnitDIE().find(DW_AT_comp_dir), nullptr);
}
-Optional<uint64_t> DWARFUnit::getDWOId() {
- return toUnsigned(getUnitDIE().find(DW_AT_GNU_dwo_id));
-}
-
void DWARFUnit::extractDIEsToVector(
bool AppendCUDie, bool AppendNonCUDies,
std::vector<DWARFDebugInfoEntry> &Dies) const {
@@ -183,7 +212,7 @@ void DWARFUnit::extractDIEsToVector(
// Set the offset to that of the first DIE and calculate the start of the
// next compilation unit header.
- uint32_t DIEOffset = Offset + getHeaderSize();
+ uint32_t DIEOffset = getOffset() + getHeaderSize();
uint32_t NextCUOffset = getNextUnitOffset();
DWARFDebugInfoEntry DIE;
DWARFDataExtractor DebugInfoData = getDebugInfoExtractor();
@@ -224,8 +253,9 @@ void DWARFUnit::extractDIEsToVector(
// should always terminate at or before the start of the next compilation
// unit header).
if (DIEOffset > NextCUOffset)
- fprintf(stderr, "warning: DWARF compile unit extends beyond its "
- "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), DIEOffset);
+ WithColor::warning() << format("DWARF compile unit extends beyond its "
+ "bounds cu 0x%8.8x at 0x%8.8x\n",
+ getOffset(), DIEOffset);
}
size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
@@ -242,10 +272,8 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
// If CU DIE was just parsed, copy several attribute values from it.
if (!HasCUDie) {
DWARFDie UnitDie = getUnitDIE();
- Optional<DWARFFormValue> PC = UnitDie.find({DW_AT_low_pc, DW_AT_entry_pc});
- if (Optional<uint64_t> Addr = toAddress(PC))
- setBaseAddress({*Addr, PC->getSectionIndex()});
-
+ if (Optional<uint64_t> DWOId = toUnsigned(UnitDie.find(DW_AT_GNU_dwo_id)))
+ Header.setDWOId(*DWOId);
if (!isDWO) {
assert(AddrOffsetSectionBase == 0);
assert(RangeSectionBase == 0);
@@ -263,6 +291,7 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
// which may differ from the unit's format.
uint64_t StringOffsetsContributionBase =
isDWO ? 0 : toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0);
+ auto IndexEntry = Header.getIndexEntry();
if (IndexEntry)
if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
StringOffsetsContributionBase += C->Offset;
@@ -277,6 +306,34 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
StringOffsetsTableContribution = determineStringOffsetsTableContribution(
DA, StringOffsetsContributionBase);
+ // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to
+ // describe address ranges.
+ if (getVersion() >= 5) {
+ if (isDWO)
+ setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
+ else
+ setRangesSection(&Context.getDWARFObj().getRnglistsSection(),
+ toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0));
+ if (RangeSection->Data.size()) {
+ // Parse the range list table header. Individual range lists are
+ // extracted lazily.
+ DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection,
+ isLittleEndian, 0);
+ if (auto TableOrError =
+ parseRngListTableHeader(RangesDA, RangeSectionBase))
+ RngListTable = TableOrError.get();
+ else
+ WithColor::error() << "parsing a range list table: "
+ << toString(TableOrError.takeError())
+ << '\n';
+
+ // In a split dwarf unit, there is no DW_AT_rnglists_base attribute.
+ // Adjust RangeSectionBase to point past the table header.
+ if (isDWO && RngListTable)
+ RangeSectionBase = RngListTable->getHeaderSize();
+ }
+ }
+
// Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
// skeleton CU DIE, so that DWARF users not aware of it are not broken.
}
@@ -315,8 +372,23 @@ bool DWARFUnit::parseDWO() {
DWO = std::shared_ptr<DWARFCompileUnit>(std::move(DWOContext), DWOCU);
// Share .debug_addr and .debug_ranges section with compile unit in .dwo
DWO->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
- auto DWORangesBase = UnitDie.getRangesBaseAttribute();
- DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0);
+ if (getVersion() >= 5) {
+ DWO->setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
+ DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection,
+ isLittleEndian, 0);
+ if (auto TableOrError = parseRngListTableHeader(RangesDA, RangeSectionBase))
+ DWO->RngListTable = TableOrError.get();
+ else
+ WithColor::error() << "parsing a range list table: "
+ << toString(TableOrError.takeError())
+ << '\n';
+ if (DWO->RngListTable)
+ DWO->RangeSectionBase = DWO->RngListTable->getHeaderSize();
+ } else {
+ auto DWORangesBase = UnitDie.getRangesBaseAttribute();
+ DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0);
+ }
+
return true;
}
@@ -327,16 +399,56 @@ void DWARFUnit::clearDIEs(bool KeepCUDie) {
}
}
+Expected<DWARFAddressRangesVector>
+DWARFUnit::findRnglistFromOffset(uint32_t Offset) {
+ if (getVersion() <= 4) {
+ DWARFDebugRangeList RangeList;
+ if (Error E = extractRangeList(Offset, RangeList))
+ return std::move(E);
+ return RangeList.getAbsoluteRanges(getBaseAddress());
+ }
+ if (RngListTable) {
+ DWARFDataExtractor RangesData(Context.getDWARFObj(), *RangeSection,
+ isLittleEndian, RngListTable->getAddrSize());
+ auto RangeListOrError = RngListTable->findList(RangesData, Offset);
+ if (RangeListOrError)
+ return RangeListOrError.get().getAbsoluteRanges(getBaseAddress());
+ return RangeListOrError.takeError();
+ }
+
+ return make_error<StringError>("missing or invalid range list table",
+ inconvertibleErrorCode());
+}
+
+Expected<DWARFAddressRangesVector>
+DWARFUnit::findRnglistFromIndex(uint32_t Index) {
+ if (auto Offset = getRnglistOffset(Index))
+ return findRnglistFromOffset(*Offset + RangeSectionBase);
+
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ if (RngListTable)
+ Stream << format("invalid range list table index %d", Index);
+ else
+ Stream << "missing or invalid range list table";
+ return make_error<StringError>(Stream.str(), inconvertibleErrorCode());
+}
+
void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {
DWARFDie UnitDie = getUnitDIE();
if (!UnitDie)
return;
// First, check if unit DIE describes address ranges for the whole unit.
- const auto &CUDIERanges = UnitDie.getAddressRanges();
- if (!CUDIERanges.empty()) {
- CURanges.insert(CURanges.end(), CUDIERanges.begin(), CUDIERanges.end());
- return;
- }
+ auto CUDIERangesOrError = UnitDie.getAddressRanges();
+ if (CUDIERangesOrError) {
+ if (!CUDIERangesOrError.get().empty()) {
+ CURanges.insert(CURanges.end(), CUDIERangesOrError.get().begin(),
+ CUDIERangesOrError.get().end());
+ return;
+ }
+ } else
+ WithColor::error() << "decoding address ranges: "
+ << toString(CUDIERangesOrError.takeError()) << '\n';
// This function is usually called if there is no .debug_aranges section
// in order to produce a compile unit level set of address ranges that
@@ -360,378 +472,49 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {
clearDIEs(true);
}
-// Populates a map from PC addresses to subprogram DIEs.
-//
-// This routine tries to look at the smallest amount of the debug info it can
-// to locate the DIEs. This is because many subprograms will never end up being
-// read or needed at all. We want to be as lazy as possible.
-void DWARFUnit::buildSubprogramDIEAddrMap() {
- assert(SubprogramDIEAddrMap.empty() && "Must only build this map once!");
- SmallVector<DWARFDie, 16> Worklist;
- Worklist.push_back(getUnitDIE());
- do {
- DWARFDie Die = Worklist.pop_back_val();
-
- // Queue up child DIEs to recurse through.
- // FIXME: This causes us to read a lot more debug info than we really need.
- // We should look at pruning out DIEs which cannot transitively hold
- // separate subprograms.
- for (DWARFDie Child : Die.children())
- Worklist.push_back(Child);
-
- // If handling a non-subprogram DIE, nothing else to do.
- if (!Die.isSubprogramDIE())
- continue;
-
- // For subprogram DIEs, store them, and insert relevant markers into the
- // address map. We don't care about overlap at all here as DWARF doesn't
- // meaningfully support that, so we simply will insert a range with no DIE
- // starting from the high PC. In the event there are overlaps, sorting
- // these may truncate things in surprising ways but still will allow
- // lookups to proceed.
- int DIEIndex = SubprogramDIEAddrInfos.size();
- SubprogramDIEAddrInfos.push_back({Die, (uint64_t)-1, {}});
- for (const auto &R : Die.getAddressRanges()) {
- // Ignore 0-sized ranges.
- if (R.LowPC == R.HighPC)
- continue;
-
- SubprogramDIEAddrMap.push_back({R.LowPC, DIEIndex});
- SubprogramDIEAddrMap.push_back({R.HighPC, -1});
-
- if (R.LowPC < SubprogramDIEAddrInfos.back().SubprogramBasePC)
- SubprogramDIEAddrInfos.back().SubprogramBasePC = R.LowPC;
- }
- } while (!Worklist.empty());
-
- if (SubprogramDIEAddrMap.empty()) {
- // If we found no ranges, create a no-op map so that lookups remain simple
- // but never find anything.
- SubprogramDIEAddrMap.push_back({0, -1});
- return;
- }
-
- // Next, sort the ranges and remove both exact duplicates and runs with the
- // same DIE index. We order the ranges so that non-empty ranges are
- // preferred. Because there may be ties, we also need to use stable sort.
- std::stable_sort(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(),
- [](const std::pair<uint64_t, int64_t> &LHS,
- const std::pair<uint64_t, int64_t> &RHS) {
- if (LHS.first < RHS.first)
- return true;
- if (LHS.first > RHS.first)
- return false;
-
- // For ranges that start at the same address, keep the one
- // with a DIE.
- if (LHS.second != -1 && RHS.second == -1)
- return true;
-
- return false;
- });
- SubprogramDIEAddrMap.erase(
- std::unique(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(),
- [](const std::pair<uint64_t, int64_t> &LHS,
- const std::pair<uint64_t, int64_t> &RHS) {
- // If the start addresses are exactly the same, we can
- // remove all but the first one as it is the only one that
- // will be found and used.
- //
- // If the DIE indices are the same, we can "merge" the
- // ranges by eliminating the second.
- return LHS.first == RHS.first || LHS.second == RHS.second;
- }),
- SubprogramDIEAddrMap.end());
-
- assert(SubprogramDIEAddrMap.back().second == -1 &&
- "The last interval must not have a DIE as each DIE's address range is "
- "bounded.");
-}
-
-// Build the second level of mapping from PC to DIE, specifically one that maps
-// a PC *within* a particular DWARF subprogram into a precise, maximally nested
-// inlined subroutine DIE (if any exists). We build a separate map for each
-// subprogram because many subprograms will never get queried for an address
-// and this allows us to be significantly lazier in reading the DWARF itself.
-void DWARFUnit::buildInlinedSubroutineDIEAddrMap(
- SubprogramDIEAddrInfo &SPInfo) {
- auto &AddrMap = SPInfo.InlinedSubroutineDIEAddrMap;
- uint64_t BasePC = SPInfo.SubprogramBasePC;
-
- auto SubroutineAddrMapSorter = [](const std::pair<int, int> &LHS,
- const std::pair<int, int> &RHS) {
- if (LHS.first < RHS.first)
- return true;
- if (LHS.first > RHS.first)
- return false;
-
- // For ranges that start at the same address, keep the
- // non-empty one.
- if (LHS.second != -1 && RHS.second == -1)
- return true;
-
- return false;
- };
- auto SubroutineAddrMapUniquer = [](const std::pair<int, int> &LHS,
- const std::pair<int, int> &RHS) {
- // If the start addresses are exactly the same, we can
- // remove all but the first one as it is the only one that
- // will be found and used.
- //
- // If the DIE indices are the same, we can "merge" the
- // ranges by eliminating the second.
- return LHS.first == RHS.first || LHS.second == RHS.second;
- };
-
- struct DieAndParentIntervalRange {
- DWARFDie Die;
- int ParentIntervalsBeginIdx, ParentIntervalsEndIdx;
- };
-
- SmallVector<DieAndParentIntervalRange, 16> Worklist;
- auto EnqueueChildDIEs = [&](const DWARFDie &Die, int ParentIntervalsBeginIdx,
- int ParentIntervalsEndIdx) {
- for (DWARFDie Child : Die.children())
- Worklist.push_back(
- {Child, ParentIntervalsBeginIdx, ParentIntervalsEndIdx});
- };
- EnqueueChildDIEs(SPInfo.SubprogramDIE, 0, 0);
- while (!Worklist.empty()) {
- DWARFDie Die = Worklist.back().Die;
- int ParentIntervalsBeginIdx = Worklist.back().ParentIntervalsBeginIdx;
- int ParentIntervalsEndIdx = Worklist.back().ParentIntervalsEndIdx;
- Worklist.pop_back();
-
- // If we encounter a nested subprogram, simply ignore it. We map to
- // (disjoint) subprograms before arriving here and we don't want to examine
- // any inlined subroutines of an unrelated subpragram.
- if (Die.getTag() == DW_TAG_subprogram)
- continue;
-
- // For non-subroutines, just recurse to keep searching for inlined
- // subroutines.
- if (Die.getTag() != DW_TAG_inlined_subroutine) {
- EnqueueChildDIEs(Die, ParentIntervalsBeginIdx, ParentIntervalsEndIdx);
- continue;
- }
-
- // Capture the inlined subroutine DIE that we will reference from the map.
- int DIEIndex = InlinedSubroutineDIEs.size();
- InlinedSubroutineDIEs.push_back(Die);
-
- int DieIntervalsBeginIdx = AddrMap.size();
- // First collect the PC ranges for this DIE into our subroutine interval
- // map.
- for (auto R : Die.getAddressRanges()) {
- // Clamp the PCs to be above the base.
- R.LowPC = std::max(R.LowPC, BasePC);
- R.HighPC = std::max(R.HighPC, BasePC);
- // Compute relative PCs from the subprogram base and drop down to an
- // unsigned 32-bit int to represent them within the data structure. This
- // lets us cover a 4gb single subprogram. Because subprograms may be
- // partitioned into distant parts of a binary (think hot/cold
- // partitioning) we want to preserve as much as we can here without
- // burning extra memory. Past that, we will simply truncate and lose the
- // ability to map those PCs to a DIE more precise than the subprogram.
- const uint32_t MaxRelativePC = std::numeric_limits<uint32_t>::max();
- uint32_t RelativeLowPC = (R.LowPC - BasePC) > (uint64_t)MaxRelativePC
- ? MaxRelativePC
- : (uint32_t)(R.LowPC - BasePC);
- uint32_t RelativeHighPC = (R.HighPC - BasePC) > (uint64_t)MaxRelativePC
- ? MaxRelativePC
- : (uint32_t)(R.HighPC - BasePC);
- // Ignore empty or bogus ranges.
- if (RelativeLowPC >= RelativeHighPC)
- continue;
- AddrMap.push_back({RelativeLowPC, DIEIndex});
- AddrMap.push_back({RelativeHighPC, -1});
- }
-
- // If there are no address ranges, there is nothing to do to map into them
- // and there cannot be any child subroutine DIEs with address ranges of
- // interest as those would all be required to nest within this DIE's
- // non-existent ranges, so we can immediately continue to the next DIE in
- // the worklist.
- if (DieIntervalsBeginIdx == (int)AddrMap.size())
- continue;
-
- // The PCs from this DIE should never overlap, so we can easily sort them
- // here.
- std::sort(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(),
- SubroutineAddrMapSorter);
- // Remove any dead ranges. These should only come from "empty" ranges that
- // were clobbered by some other range.
- AddrMap.erase(std::unique(AddrMap.begin() + DieIntervalsBeginIdx,
- AddrMap.end(), SubroutineAddrMapUniquer),
- AddrMap.end());
-
- // Compute the end index of this DIE's addr map intervals.
- int DieIntervalsEndIdx = AddrMap.size();
-
- assert(DieIntervalsBeginIdx != DieIntervalsEndIdx &&
- "Must not have an empty map for this layer!");
- assert(AddrMap.back().second == -1 && "Must end with an empty range!");
- assert(std::is_sorted(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(),
- less_first()) &&
- "Failed to sort this DIE's interals!");
-
- // If we have any parent intervals, walk the newly added ranges and find
- // the parent ranges they were inserted into. Both of these are sorted and
- // neither has any overlaps. We need to append new ranges to split up any
- // parent ranges these new ranges would overlap when we merge them.
- if (ParentIntervalsBeginIdx != ParentIntervalsEndIdx) {
- int ParentIntervalIdx = ParentIntervalsBeginIdx;
- for (int i = DieIntervalsBeginIdx, e = DieIntervalsEndIdx - 1; i < e;
- ++i) {
- const uint32_t IntervalStart = AddrMap[i].first;
- const uint32_t IntervalEnd = AddrMap[i + 1].first;
- const int IntervalDieIdx = AddrMap[i].second;
- if (IntervalDieIdx == -1) {
- // For empty intervals, nothing is required. This is a bit surprising
- // however. If the prior interval overlaps a parent interval and this
- // would be necessary to mark the end, we will synthesize a new end
- // that switches back to the parent DIE below. And this interval will
- // get dropped in favor of one with a DIE attached. However, we'll
- // still include this and so worst-case, it will still end the prior
- // interval.
- continue;
- }
-
- // We are walking the new ranges in order, so search forward from the
- // last point for a parent range that might overlap.
- auto ParentIntervalsRange =
- make_range(AddrMap.begin() + ParentIntervalIdx,
- AddrMap.begin() + ParentIntervalsEndIdx);
- assert(std::is_sorted(ParentIntervalsRange.begin(),
- ParentIntervalsRange.end(), less_first()) &&
- "Unsorted parent intervals can't be searched!");
- auto PI = std::upper_bound(
- ParentIntervalsRange.begin(), ParentIntervalsRange.end(),
- IntervalStart,
- [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) {
- return LHS < RHS.first;
- });
- if (PI == ParentIntervalsRange.begin() ||
- PI == ParentIntervalsRange.end())
- continue;
-
- ParentIntervalIdx = PI - AddrMap.begin();
- int32_t &ParentIntervalDieIdx = std::prev(PI)->second;
- uint32_t &ParentIntervalStart = std::prev(PI)->first;
- const uint32_t ParentIntervalEnd = PI->first;
-
- // If the new range starts exactly at the position of the parent range,
- // we need to adjust the parent range. Note that these collisions can
- // only happen with the original parent range because we will merge any
- // adjacent ranges in the child.
- if (IntervalStart == ParentIntervalStart) {
- // If there will be a tail, just shift the start of the parent
- // forward. Note that this cannot change the parent ordering.
- if (IntervalEnd < ParentIntervalEnd) {
- ParentIntervalStart = IntervalEnd;
- continue;
- }
- // Otherwise, mark this as becoming empty so we'll remove it and
- // prefer the child range.
- ParentIntervalDieIdx = -1;
+void DWARFUnit::updateAddressDieMap(DWARFDie Die) {
+ if (Die.isSubroutineDIE()) {
+ auto DIERangesOrError = Die.getAddressRanges();
+ if (DIERangesOrError) {
+ for (const auto &R : DIERangesOrError.get()) {
+ // Ignore 0-sized ranges.
+ if (R.LowPC == R.HighPC)
continue;
+ auto B = AddrDieMap.upper_bound(R.LowPC);
+ if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) {
+ // The range is a sub-range of an existing range, so we need to split
+ // the existing range.
+ if (R.HighPC < B->second.first)
+ AddrDieMap[R.HighPC] = B->second;
+ if (R.LowPC > B->first)
+ AddrDieMap[B->first].first = R.LowPC;
}
-
- // Finally, if the parent interval will need to remain as a prefix to
- // this one, insert a new interval to cover any tail.
- if (IntervalEnd < ParentIntervalEnd)
- AddrMap.push_back({IntervalEnd, ParentIntervalDieIdx});
+ AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die);
}
- }
-
- // Note that we don't need to re-sort even this DIE's address map intervals
- // after this. All of the newly added intervals actually fill in *gaps* in
- // this DIE's address map, and we know that children won't need to lookup
- // into those gaps.
-
- // Recurse through its children, giving them the interval map range of this
- // DIE to use as their parent intervals.
- EnqueueChildDIEs(Die, DieIntervalsBeginIdx, DieIntervalsEndIdx);
- }
-
- if (AddrMap.empty()) {
- AddrMap.push_back({0, -1});
- return;
+ } else
+ llvm::consumeError(DIERangesOrError.takeError());
}
-
- // Now that we've added all of the intervals needed, we need to resort and
- // unique them. Most notably, this will remove all the empty ranges that had
- // a parent range covering, etc. We only expect a single non-empty interval
- // at any given start point, so we just use std::sort. This could potentially
- // produce non-deterministic maps for invalid DWARF.
- std::sort(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapSorter);
- AddrMap.erase(
- std::unique(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapUniquer),
- AddrMap.end());
+ // Parent DIEs are added to the AddrDieMap prior to the child DIEs to
+ // simplify the logic to update AddrDieMap. The child's range will always
+ // be equal to or smaller than the parent's range. With this assumption,
+ // adding one range into the map will split at most one existing range
+ // into 3 sub-ranges.
+ for (DWARFDie Child = Die.getFirstChild(); Child; Child = Child.getSibling())
+ updateAddressDieMap(Child);
}
DWARFDie DWARFUnit::getSubroutineForAddress(uint64_t Address) {
extractDIEsIfNeeded(false);
-
- // We use a two-level mapping structure to locate subroutines for a given PC
- // address.
- //
- // First, we map the address to a subprogram. This can be done more cheaply
- // because subprograms cannot nest within each other. It also allows us to
- // avoid detailed examination of many subprograms, instead only focusing on
- // the ones which we end up actively querying.
- if (SubprogramDIEAddrMap.empty())
- buildSubprogramDIEAddrMap();
-
- assert(!SubprogramDIEAddrMap.empty() &&
- "We must always end up with a non-empty map!");
-
- auto I = std::upper_bound(
- SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), Address,
- [](uint64_t LHS, const std::pair<uint64_t, int64_t> &RHS) {
- return LHS < RHS.first;
- });
- // If we find the beginning, then the address is before the first subprogram.
- if (I == SubprogramDIEAddrMap.begin())
+ if (AddrDieMap.empty())
+ updateAddressDieMap(getUnitDIE());
+ auto R = AddrDieMap.upper_bound(Address);
+ if (R == AddrDieMap.begin())
return DWARFDie();
- // Back up to the interval containing the address and see if it
- // has a DIE associated with it.
- --I;
- if (I->second == -1)
+ // upper_bound's previous item contains Address.
+ --R;
+ if (Address >= R->second.first)
return DWARFDie();
-
- auto &SPInfo = SubprogramDIEAddrInfos[I->second];
-
- // Now that we have the subprogram for this address, we do the second level
- // mapping by building a map within a subprogram's PC range to any specific
- // inlined subroutine.
- if (SPInfo.InlinedSubroutineDIEAddrMap.empty())
- buildInlinedSubroutineDIEAddrMap(SPInfo);
-
- // We lookup within the inlined subroutine using a subprogram-relative
- // address.
- assert(Address >= SPInfo.SubprogramBasePC &&
- "Address isn't above the start of the subprogram!");
- uint32_t RelativeAddr = ((Address - SPInfo.SubprogramBasePC) >
- (uint64_t)std::numeric_limits<uint32_t>::max())
- ? std::numeric_limits<uint32_t>::max()
- : (uint32_t)(Address - SPInfo.SubprogramBasePC);
-
- auto J =
- std::upper_bound(SPInfo.InlinedSubroutineDIEAddrMap.begin(),
- SPInfo.InlinedSubroutineDIEAddrMap.end(), RelativeAddr,
- [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) {
- return LHS < RHS.first;
- });
- // If we find the beginning, the address is before any inlined subroutine so
- // return the subprogram DIE.
- if (J == SPInfo.InlinedSubroutineDIEAddrMap.begin())
- return SPInfo.SubprogramDIE;
- // Back up `J` and return the inlined subroutine if we have one or the
- // subprogram if we don't.
- --J;
- return J->second == -1 ? SPInfo.SubprogramDIE
- : InlinedSubroutineDIEs[J->second];
+ return R->second.second;
}
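To make the map manipulation above easier to follow, here is a small standalone sketch, not part of the patch; AddrMap, insertRange and lookup are invented names, and the "owner" strings stand in for DIEs. It shows the same idea: a std::map keyed by LowPC whose values hold (HighPC, owner), where inserting a nested child range splits the enclosing entry, and a lookup uses upper_bound and steps back one entry.

#include <cstdint>
#include <map>
#include <string>
#include <utility>

// LowPC -> (HighPC, name of the owning DIE).
using AddrMap = std::map<uint64_t, std::pair<uint64_t, std::string>>;

// Insert [Low, High) owned by Name. Assumes, like the comment above, that
// parents are inserted before their children and children nest inside parents.
void insertRange(AddrMap &M, uint64_t Low, uint64_t High, std::string Name) {
  auto B = M.upper_bound(Low);
  if (B != M.begin() && Low < (--B)->second.first) {
    if (High < B->second.first)
      M[High] = B->second;        // Tail of the enclosing range survives.
    if (Low > B->first)
      M[B->first].first = Low;    // Head of the enclosing range is trimmed.
  }
  M[Low] = {High, std::move(Name)};
}

// Return the innermost owner containing Addr, or "" if none.
std::string lookup(const AddrMap &M, uint64_t Addr) {
  auto R = M.upper_bound(Addr);
  if (R == M.begin())
    return "";
  --R;                            // Entry whose LowPC <= Addr.
  return Addr < R->second.first ? R->second.second : "";
}

Inserting a parent [0x100, 0x200) named "f" and then a child [0x140, 0x180) named "g" leaves three entries, [0x100, 0x140) -> f, [0x140, 0x180) -> g and [0x180, 0x200) -> f, so lookup(M, 0x150) returns "g" and lookup(M, 0x190) returns "f".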
void
@@ -745,11 +528,15 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address,
DWARFDie SubroutineDIE =
(DWO ? DWO.get() : this)->getSubroutineForAddress(Address);
- while (SubroutineDIE) {
- if (SubroutineDIE.isSubroutineDIE())
+ if (!SubroutineDIE)
+ return;
+
+ while (!SubroutineDIE.isSubprogramDIE()) {
+ if (SubroutineDIE.getTag() == DW_TAG_inlined_subroutine)
InlinedChain.push_back(SubroutineDIE);
SubroutineDIE = SubroutineDIE.getParent();
}
+ InlinedChain.push_back(SubroutineDIE);
}
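For example, with a hypothetical nesting where h is inlined into g and g into the enclosing subprogram f: for an address inside h, getSubroutineForAddress returns the DIE for h, the rewritten loop pushes h and g while walking up the parent chain, and the final push appends f, so the chain comes out innermost-first as [h, g, f].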
const DWARFUnitIndex &llvm::getDWARFUnitIndex(DWARFContext &Context,
@@ -799,6 +586,25 @@ DWARFDie DWARFUnit::getSibling(const DWARFDebugInfoEntry *Die) {
return DWARFDie();
}
+DWARFDie DWARFUnit::getPreviousSibling(const DWARFDebugInfoEntry *Die) {
+ if (!Die)
+ return DWARFDie();
+ uint32_t Depth = Die->getDepth();
+ // Unit DIEs always have a depth of zero and never have siblings.
+ if (Depth == 0)
+ return DWARFDie();
+
+ // Find the previous DIE whose depth is the same as the Die's depth.
+ for (size_t I = getDIEIndex(Die); I > 0;) {
+ --I;
+ if (DieArray[I].getDepth() == Depth - 1)
+ return DWARFDie();
+ if (DieArray[I].getDepth() == Depth)
+ return DWARFDie(this, &DieArray[I]);
+ }
+ return DWARFDie();
+}
+
DWARFDie DWARFUnit::getFirstChild(const DWARFDebugInfoEntry *Die) {
if (!Die->hasChildren())
return DWARFDie();
@@ -810,12 +616,39 @@ DWARFDie DWARFUnit::getFirstChild(const DWARFDebugInfoEntry *Die) {
return DWARFDie(this, &DieArray[I]);
}
+DWARFDie DWARFUnit::getLastChild(const DWARFDebugInfoEntry *Die) {
+ if (!Die->hasChildren())
+ return DWARFDie();
+
+ uint32_t Depth = Die->getDepth();
+ for (size_t I = getDIEIndex(Die) + 1, EndIdx = DieArray.size(); I < EndIdx;
+ ++I) {
+ if (DieArray[I].getDepth() == Depth + 1 &&
+ DieArray[I].getTag() == dwarf::DW_TAG_null)
+ return DWARFDie(this, &DieArray[I]);
+ assert(DieArray[I].getDepth() > Depth && "Not processing children?");
+ }
+ return DWARFDie();
+}
+
const DWARFAbbreviationDeclarationSet *DWARFUnit::getAbbreviations() const {
if (!Abbrevs)
- Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset);
+ Abbrevs = Abbrev->getAbbreviationDeclarationSet(Header.getAbbrOffset());
return Abbrevs;
}
+llvm::Optional<BaseAddress> DWARFUnit::getBaseAddress() {
+ if (BaseAddr)
+ return BaseAddr;
+
+ DWARFDie UnitDie = getUnitDIE();
+ Optional<DWARFFormValue> PC = UnitDie.find({DW_AT_low_pc, DW_AT_entry_pc});
+ if (Optional<uint64_t> Addr = toAddress(PC))
+ BaseAddr = {*Addr, PC->getSectionIndex()};
+
+ return BaseAddr;
+}
+
Optional<StrOffsetsContributionDescriptor>
StrOffsetsContributionDescriptor::validateContributionSize(
DWARFDataExtractor &DA) {
@@ -843,7 +676,9 @@ parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
uint64_t Size = DA.getU64(&Offset);
uint8_t Version = DA.getU16(&Offset);
(void)DA.getU16(&Offset); // padding
- return StrOffsetsContributionDescriptor(Offset, Size, Version, DWARF64);
+ // The encoded length includes the 2-byte version field and the 2-byte
+ // padding, so we need to subtract them out when we populate the descriptor.
+ return StrOffsetsContributionDescriptor(Offset, Size - 4, Version, DWARF64);
//return Optional<StrOffsetsContributionDescriptor>(Descriptor);
}
@@ -858,7 +693,10 @@ parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
return Optional<StrOffsetsContributionDescriptor>();
uint8_t Version = DA.getU16(&Offset);
(void)DA.getU16(&Offset); // padding
- return StrOffsetsContributionDescriptor(Offset, ContributionSize, Version, DWARF32);
+ // The encoded length includes the 2-byte version field and the 2-byte
+ // padding, so we need to subtract them out when we populate the descriptor.
+ return StrOffsetsContributionDescriptor(Offset, ContributionSize - 4, Version,
+ DWARF32);
//return Optional<StrOffsetsContributionDescriptor>(Descriptor);
}
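As a quick sanity check of the size adjustment above (illustrative numbers only): a DWARF32 string offsets contribution whose length field reads 0x24 (36) covers the 2-byte version, 2 bytes of padding and then the offset array, so the descriptor records 36 - 4 = 32 bytes of payload, i.e. eight 4-byte string offsets, starting at the Offset value left after the header has been parsed.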
@@ -891,6 +729,7 @@ DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
// index table (in a package file). In a .dwo file it is simply
// the length of the string offsets section.
uint64_t Size = 0;
+ auto IndexEntry = Header.getIndexEntry();
if (!IndexEntry)
Size = StringOffsetSection.Data.size();
else if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 3d473698b463..82d52c467bc0 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "SyntaxHighlighting.h"
#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
@@ -16,8 +16,9 @@
#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFSection.h"
-#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/Support/DJB.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <set>
@@ -26,7 +27,6 @@
using namespace llvm;
using namespace dwarf;
using namespace object;
-using namespace syntax;
DWARFVerifier::DieRangeInfo::address_range_iterator
DWARFVerifier::DieRangeInfo::insert(const DWARFAddressRange &R) {
@@ -171,7 +171,7 @@ bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData,
return Success;
}
-bool DWARFVerifier::verifyUnitContents(DWARFUnit Unit, uint8_t UnitType) {
+bool DWARFVerifier::verifyUnitContents(DWARFUnit &Unit, uint8_t UnitType) {
uint32_t NumUnitErrors = 0;
unsigned NumDies = Unit.getNumDIEs();
for (unsigned I = 0; I < NumDies; ++I) {
@@ -274,16 +274,17 @@ bool DWARFVerifier::handleDebugInfo() {
if (isUnitDWARF64)
break;
} else {
+ DWARFUnitHeader Header;
+ Header.extract(DCtx, DebugInfoData, &OffsetStart);
std::unique_ptr<DWARFUnit> Unit;
switch (UnitType) {
case dwarf::DW_UT_type:
case dwarf::DW_UT_split_type: {
Unit.reset(new DWARFTypeUnit(
- DCtx, DObj.getInfoSection(), DCtx.getDebugAbbrev(),
+ DCtx, DObj.getInfoSection(), Header, DCtx.getDebugAbbrev(),
&DObj.getRangeSection(), DObj.getStringSection(),
DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(),
- DObj.getLineSection(), DCtx.isLittleEndian(), false, TUSection,
- nullptr));
+ DObj.getLineSection(), DCtx.isLittleEndian(), false, TUSection));
break;
}
case dwarf::DW_UT_skeleton:
@@ -294,16 +295,14 @@ bool DWARFVerifier::handleDebugInfo() {
// verifying a compile unit in DWARF v4.
case 0: {
Unit.reset(new DWARFCompileUnit(
- DCtx, DObj.getInfoSection(), DCtx.getDebugAbbrev(),
+ DCtx, DObj.getInfoSection(), Header, DCtx.getDebugAbbrev(),
&DObj.getRangeSection(), DObj.getStringSection(),
DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(),
- DObj.getLineSection(), DCtx.isLittleEndian(), false, CUSection,
- nullptr));
+ DObj.getLineSection(), DCtx.isLittleEndian(), false, CUSection));
break;
}
default: { llvm_unreachable("Invalid UnitType."); }
}
- Unit->extract(DebugInfoData, &OffsetStart);
if (!verifyUnitContents(*Unit, UnitType))
++NumDebugInfoErrors;
}
@@ -325,8 +324,15 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
if (!Die.isValid())
return NumErrors;
- DWARFAddressRangesVector Ranges = Die.getAddressRanges();
+ auto RangesOrError = Die.getAddressRanges();
+ if (!RangesOrError) {
+ // FIXME: Report the error.
+ ++NumErrors;
+ llvm::consumeError(RangesOrError.takeError());
+ return NumErrors;
+ }
+ DWARFAddressRangesVector Ranges = RangesOrError.get();
// Build RI for this DIE and check that ranges within this DIE do not
// overlap.
DieRangeInfo RI(Die);
@@ -363,10 +369,9 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
ParentRI.Die.getTag() == DW_TAG_subprogram);
if (ShouldBeContained && !ParentRI.contains(RI)) {
++NumErrors;
- error() << "DIE address ranges are not "
- "contained in its parent's ranges:";
- Die.dump(OS, 0);
+ error() << "DIE address ranges are not contained in its parent's ranges:";
ParentRI.Die.dump(OS, 0);
+ Die.dump(OS, 2);
OS << "\n";
}
@@ -410,22 +415,27 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
ReportError("DIE has invalid DW_AT_stmt_list encoding:");
break;
case DW_AT_location: {
- Optional<ArrayRef<uint8_t>> Expr = AttrValue.Value.getAsBlock();
- if (!Expr) {
- ReportError("DIE has invalid DW_AT_location encoding:");
- break;
+ auto VerifyLocationExpr = [&](StringRef D) {
+ DWARFUnit *U = Die.getDwarfUnit();
+ DataExtractor Data(D, DCtx.isLittleEndian(), 0);
+ DWARFExpression Expression(Data, U->getVersion(),
+ U->getAddressByteSize());
+ bool Error = llvm::any_of(Expression, [](DWARFExpression::Operation &Op) {
+ return Op.isError();
+ });
+ if (Error)
+ ReportError("DIE contains invalid DWARF expression:");
+ };
+ if (Optional<ArrayRef<uint8_t>> Expr = AttrValue.Value.getAsBlock()) {
+ // Verify inlined location.
+ VerifyLocationExpr(llvm::toStringRef(*Expr));
+ } else if (auto LocOffset = AttrValue.Value.getAsSectionOffset()) {
+ // Verify location list.
+ if (auto DebugLoc = DCtx.getDebugLoc())
+ if (auto LocList = DebugLoc->getLocationListAtOffset(*LocOffset))
+ for (const auto &Entry : LocList->Entries)
+ VerifyLocationExpr({Entry.Loc.data(), Entry.Loc.size()});
}
-
- DWARFUnit *U = Die.getDwarfUnit();
- DataExtractor Data(
- StringRef(reinterpret_cast<const char *>(Expr->data()), Expr->size()),
- DCtx.isLittleEndian(), 0);
- DWARFExpression Expression(Data, U->getVersion(), U->getAddressByteSize());
- bool Error = llvm::any_of(Expression, [](DWARFExpression::Operation &Op) {
- return Op.isError();
- });
- if (Error)
- ReportError("DIE contains invalid DWARF expression:");
break;
}
@@ -669,13 +679,13 @@ bool DWARFVerifier::handleDebugLine() {
return NumDebugLineErrors == 0;
}
-unsigned DWARFVerifier::verifyAccelTable(const DWARFSection *AccelSection,
- DataExtractor *StrData,
- const char *SectionName) {
+unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection,
+ DataExtractor *StrData,
+ const char *SectionName) {
unsigned NumErrors = 0;
DWARFDataExtractor AccelSectionData(DCtx.getDWARFObj(), *AccelSection,
DCtx.isLittleEndian(), 0);
- DWARFAcceleratorTable AccelTable(AccelSectionData, *StrData);
+ AppleAcceleratorTable AccelTable(AccelSectionData, *StrData);
OS << "Verifying " << SectionName << "...\n";
@@ -773,33 +783,572 @@ unsigned DWARFVerifier::verifyAccelTable(const DWARFSection *AccelSection,
return NumErrors;
}
+unsigned
+DWARFVerifier::verifyDebugNamesCULists(const DWARFDebugNames &AccelTable) {
+ // A map from CU offset to the (first) Name Index offset which claims to index
+ // this CU.
+ DenseMap<uint32_t, uint32_t> CUMap;
+ const uint32_t NotIndexed = std::numeric_limits<uint32_t>::max();
+
+ CUMap.reserve(DCtx.getNumCompileUnits());
+ for (const auto &CU : DCtx.compile_units())
+ CUMap[CU->getOffset()] = NotIndexed;
+
+ unsigned NumErrors = 0;
+ for (const DWARFDebugNames::NameIndex &NI : AccelTable) {
+ if (NI.getCUCount() == 0) {
+ error() << formatv("Name Index @ {0:x} does not index any CU\n",
+ NI.getUnitOffset());
+ ++NumErrors;
+ continue;
+ }
+ for (uint32_t CU = 0, End = NI.getCUCount(); CU < End; ++CU) {
+ uint32_t Offset = NI.getCUOffset(CU);
+ auto Iter = CUMap.find(Offset);
+
+ if (Iter == CUMap.end()) {
+ error() << formatv(
+ "Name Index @ {0:x} references a non-existing CU @ {1:x}\n",
+ NI.getUnitOffset(), Offset);
+ ++NumErrors;
+ continue;
+ }
+
+ if (Iter->second != NotIndexed) {
+ error() << formatv("Name Index @ {0:x} references a CU @ {1:x}, but "
+ "this CU is already indexed by Name Index @ {2:x}\n",
+ NI.getUnitOffset(), Offset, Iter->second);
+ continue;
+ }
+ Iter->second = NI.getUnitOffset();
+ }
+ }
+
+ for (const auto &KV : CUMap) {
+ if (KV.second == NotIndexed)
+ warn() << formatv("CU @ {0:x} not covered by any Name Index\n", KV.first);
+ }
+
+ return NumErrors;
+}
+
+unsigned
+DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI,
+ const DataExtractor &StrData) {
+ struct BucketInfo {
+ uint32_t Bucket;
+ uint32_t Index;
+
+ constexpr BucketInfo(uint32_t Bucket, uint32_t Index)
+ : Bucket(Bucket), Index(Index) {}
+ bool operator<(const BucketInfo &RHS) const { return Index < RHS.Index; };
+ };
+
+ uint32_t NumErrors = 0;
+ if (NI.getBucketCount() == 0) {
+ warn() << formatv("Name Index @ {0:x} does not contain a hash table.\n",
+ NI.getUnitOffset());
+ return NumErrors;
+ }
+
+ // Build up a list of (Bucket, Index) pairs. We use this later to verify that
+ // each Name is reachable from the appropriate bucket.
+ std::vector<BucketInfo> BucketStarts;
+ BucketStarts.reserve(NI.getBucketCount() + 1);
+ for (uint32_t Bucket = 0, End = NI.getBucketCount(); Bucket < End; ++Bucket) {
+ uint32_t Index = NI.getBucketArrayEntry(Bucket);
+ if (Index > NI.getNameCount()) {
+ error() << formatv("Bucket {0} of Name Index @ {1:x} contains invalid "
+ "value {2}. Valid range is [0, {3}].\n",
+ Bucket, NI.getUnitOffset(), Index, NI.getNameCount());
+ ++NumErrors;
+ continue;
+ }
+ if (Index > 0)
+ BucketStarts.emplace_back(Bucket, Index);
+ }
+
+ // If there were any buckets with invalid values, skip further checks as they
+ // will likely produce many errors which will only confuse the actual root
+ // problem.
+ if (NumErrors > 0)
+ return NumErrors;
+
+ // Sort the list in the order of increasing "Index" entries.
+ array_pod_sort(BucketStarts.begin(), BucketStarts.end());
+
+ // Insert a sentinel entry at the end, so we can check that the end of the
+ // table is covered in the loop below.
+ BucketStarts.emplace_back(NI.getBucketCount(), NI.getNameCount() + 1);
+
+ // Loop invariant: NextUncovered is the (1-based) index of the first Name
+ // which is not reachable by any of the buckets we processed so far (and
+ // hasn't been reported as uncovered).
+ uint32_t NextUncovered = 1;
+ for (const BucketInfo &B : BucketStarts) {
+ // Under normal circumstances B.Index will be equal to NextUncovered, but it can
+ // be less if a bucket points to names which are already known to be in some
+ // bucket we processed earlier. In that case, we won't trigger this error,
+ // but report the mismatched hash value error instead. (We know the hash
+ // will not match because we have already verified that the name's hash
+ // puts it into the previous bucket.)
+ if (B.Index > NextUncovered) {
+ error() << formatv("Name Index @ {0:x}: Name table entries [{1}, {2}] "
+ "are not covered by the hash table.\n",
+ NI.getUnitOffset(), NextUncovered, B.Index - 1);
+ ++NumErrors;
+ }
+ uint32_t Idx = B.Index;
+
+ // The rest of the checks apply only to non-sentinel entries.
+ if (B.Bucket == NI.getBucketCount())
+ break;
+
+ // This triggers if a non-empty bucket points to a name with a mismatched
+ // hash. Clients are likely to interpret this as an empty bucket, because a
+ // mismatched hash signals the end of a bucket, but if this is indeed an
+ // empty bucket, the producer should have signalled this by marking the
+ // bucket as empty.
+ uint32_t FirstHash = NI.getHashArrayEntry(Idx);
+ if (FirstHash % NI.getBucketCount() != B.Bucket) {
+ error() << formatv(
+ "Name Index @ {0:x}: Bucket {1} is not empty but points to a "
+ "mismatched hash value {2:x} (belonging to bucket {3}).\n",
+ NI.getUnitOffset(), B.Bucket, FirstHash,
+ FirstHash % NI.getBucketCount());
+ ++NumErrors;
+ }
+
+ // This finds the end of this bucket and also verifies that all the hashes in
+ // this bucket are correct by comparing the stored hashes to the ones we
+ // compute ourselves.
+ while (Idx <= NI.getNameCount()) {
+ uint32_t Hash = NI.getHashArrayEntry(Idx);
+ if (Hash % NI.getBucketCount() != B.Bucket)
+ break;
+
+ const char *Str = NI.getNameTableEntry(Idx).getString();
+ if (caseFoldingDjbHash(Str) != Hash) {
+ error() << formatv("Name Index @ {0:x}: String ({1}) at index {2} "
+ "hashes to {3:x}, but "
+ "the Name Index hash is {4:x}\n",
+ NI.getUnitOffset(), Str, Idx,
+ caseFoldingDjbHash(Str), Hash);
+ ++NumErrors;
+ }
+
+ ++Idx;
+ }
+ NextUncovered = std::max(NextUncovered, Idx);
+ }
+ return NumErrors;
+}
+
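The hash invariant being checked above can be reproduced with a few lines of standalone C++. This is only a sketch: djbHashFolded and nameBelongsToBucket are invented names, and the ASCII tolower() folding is a simplification of the case folding the real implementation performs. The point is the property the verifier enforces for every non-empty bucket: a name's DJB hash, taken modulo the bucket count, must select the bucket that points at it.

#include <cctype>
#include <cstdint>
#include <string>

// DJB-style hash (H = H * 33 + c, seeded with 5381) over a case-folded name.
// Only ASCII folding is done here; non-ASCII folding is omitted.
uint32_t djbHashFolded(const std::string &Name) {
  uint32_t H = 5381;
  for (unsigned char C : Name)
    H = H * 33 + static_cast<uint32_t>(std::tolower(C));
  return H;
}

// The property checked for every non-empty bucket of a name index.
bool nameBelongsToBucket(const std::string &Name, uint32_t Bucket,
                         uint32_t BucketCount) {
  return djbHashFolded(Name) % BucketCount == Bucket;
}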
+unsigned DWARFVerifier::verifyNameIndexAttribute(
+ const DWARFDebugNames::NameIndex &NI, const DWARFDebugNames::Abbrev &Abbr,
+ DWARFDebugNames::AttributeEncoding AttrEnc) {
+ StringRef FormName = dwarf::FormEncodingString(AttrEnc.Form);
+ if (FormName.empty()) {
+ error() << formatv("NameIndex @ {0:x}: Abbreviation {1:x}: {2} uses an "
+ "unknown form: {3}.\n",
+ NI.getUnitOffset(), Abbr.Code, AttrEnc.Index,
+ AttrEnc.Form);
+ return 1;
+ }
+
+ if (AttrEnc.Index == DW_IDX_type_hash) {
+ if (AttrEnc.Form != dwarf::DW_FORM_data8) {
+ error() << formatv(
+ "NameIndex @ {0:x}: Abbreviation {1:x}: DW_IDX_type_hash "
+ "uses an unexpected form {2} (should be {3}).\n",
+ NI.getUnitOffset(), Abbr.Code, AttrEnc.Form, dwarf::DW_FORM_data8);
+ return 1;
+ }
+ }
+
+ // A list of known index attributes and their expected form classes.
+ // DW_IDX_type_hash is handled specially in the check above, as it has a
+ // specific form (not just a form class) we should expect.
+ struct FormClassTable {
+ dwarf::Index Index;
+ DWARFFormValue::FormClass Class;
+ StringLiteral ClassName;
+ };
+ static constexpr FormClassTable Table[] = {
+ {dwarf::DW_IDX_compile_unit, DWARFFormValue::FC_Constant, {"constant"}},
+ {dwarf::DW_IDX_type_unit, DWARFFormValue::FC_Constant, {"constant"}},
+ {dwarf::DW_IDX_die_offset, DWARFFormValue::FC_Reference, {"reference"}},
+ {dwarf::DW_IDX_parent, DWARFFormValue::FC_Constant, {"constant"}},
+ };
+
+ ArrayRef<FormClassTable> TableRef(Table);
+ auto Iter = find_if(TableRef, [AttrEnc](const FormClassTable &T) {
+ return T.Index == AttrEnc.Index;
+ });
+ if (Iter == TableRef.end()) {
+ warn() << formatv("NameIndex @ {0:x}: Abbreviation {1:x} contains an "
+ "unknown index attribute: {2}.\n",
+ NI.getUnitOffset(), Abbr.Code, AttrEnc.Index);
+ return 0;
+ }
+
+ if (!DWARFFormValue(AttrEnc.Form).isFormClass(Iter->Class)) {
+ error() << formatv("NameIndex @ {0:x}: Abbreviation {1:x}: {2} uses an "
+ "unexpected form {3} (expected form class {4}).\n",
+ NI.getUnitOffset(), Abbr.Code, AttrEnc.Index,
+ AttrEnc.Form, Iter->ClassName);
+ return 1;
+ }
+ return 0;
+}
+
+unsigned
+DWARFVerifier::verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI) {
+ if (NI.getLocalTUCount() + NI.getForeignTUCount() > 0) {
+ warn() << formatv("Name Index @ {0:x}: Verifying indexes of type units is "
+ "not currently supported.\n",
+ NI.getUnitOffset());
+ return 0;
+ }
+
+ unsigned NumErrors = 0;
+ for (const auto &Abbrev : NI.getAbbrevs()) {
+ StringRef TagName = dwarf::TagString(Abbrev.Tag);
+ if (TagName.empty()) {
+ warn() << formatv("NameIndex @ {0:x}: Abbreviation {1:x} references an "
+ "unknown tag: {2}.\n",
+ NI.getUnitOffset(), Abbrev.Code, Abbrev.Tag);
+ }
+ SmallSet<unsigned, 5> Attributes;
+ for (const auto &AttrEnc : Abbrev.Attributes) {
+ if (!Attributes.insert(AttrEnc.Index).second) {
+ error() << formatv("NameIndex @ {0:x}: Abbreviation {1:x} contains "
+ "multiple {2} attributes.\n",
+ NI.getUnitOffset(), Abbrev.Code, AttrEnc.Index);
+ ++NumErrors;
+ continue;
+ }
+ NumErrors += verifyNameIndexAttribute(NI, Abbrev, AttrEnc);
+ }
+
+ if (NI.getCUCount() > 1 && !Attributes.count(dwarf::DW_IDX_compile_unit)) {
+ error() << formatv("NameIndex @ {0:x}: Indexing multiple compile units "
+ "and abbreviation {1:x} has no {2} attribute.\n",
+ NI.getUnitOffset(), Abbrev.Code,
+ dwarf::DW_IDX_compile_unit);
+ ++NumErrors;
+ }
+ if (!Attributes.count(dwarf::DW_IDX_die_offset)) {
+ error() << formatv(
+ "NameIndex @ {0:x}: Abbreviation {1:x} has no {2} attribute.\n",
+ NI.getUnitOffset(), Abbrev.Code, dwarf::DW_IDX_die_offset);
+ ++NumErrors;
+ }
+ }
+ return NumErrors;
+}
+
+static SmallVector<StringRef, 2> getNames(const DWARFDie &DIE) {
+ SmallVector<StringRef, 2> Result;
+ if (const char *Str = DIE.getName(DINameKind::ShortName))
+ Result.emplace_back(Str);
+ else if (DIE.getTag() == dwarf::DW_TAG_namespace)
+ Result.emplace_back("(anonymous namespace)");
+
+ if (const char *Str = DIE.getName(DINameKind::LinkageName)) {
+ if (Result.empty() || Result[0] != Str)
+ Result.emplace_back(Str);
+ }
+
+ return Result;
+}
+
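As a concrete illustration (hypothetical DIEs, and "_Z1fv" is just an example mangled name): getNames on an anonymous DW_TAG_namespace DIE yields {"(anonymous namespace)"}, while a function DIE carrying DW_AT_name "f" and a distinct linkage name "_Z1fv" yields {"f", "_Z1fv"}; the possible two-entry result is what lets the completeness check below require an index entry for each of the names.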
+unsigned DWARFVerifier::verifyNameIndexEntries(
+ const DWARFDebugNames::NameIndex &NI,
+ const DWARFDebugNames::NameTableEntry &NTE) {
+ // Verifying type unit indexes not supported.
+ if (NI.getLocalTUCount() + NI.getForeignTUCount() > 0)
+ return 0;
+
+ const char *CStr = NTE.getString();
+ if (!CStr) {
+ error() << formatv(
+ "Name Index @ {0:x}: Unable to get string associated with name {1}.\n",
+ NI.getUnitOffset(), NTE.getIndex());
+ return 1;
+ }
+ StringRef Str(CStr);
+
+ unsigned NumErrors = 0;
+ unsigned NumEntries = 0;
+ uint32_t EntryID = NTE.getEntryOffset();
+ uint32_t NextEntryID = EntryID;
+ Expected<DWARFDebugNames::Entry> EntryOr = NI.getEntry(&NextEntryID);
+ for (; EntryOr; ++NumEntries, EntryID = NextEntryID,
+ EntryOr = NI.getEntry(&NextEntryID)) {
+ uint32_t CUIndex = *EntryOr->getCUIndex();
+ if (CUIndex > NI.getCUCount()) {
+ error() << formatv("Name Index @ {0:x}: Entry @ {1:x} contains an "
+ "invalid CU index ({2}).\n",
+ NI.getUnitOffset(), EntryID, CUIndex);
+ ++NumErrors;
+ continue;
+ }
+ uint32_t CUOffset = NI.getCUOffset(CUIndex);
+ uint64_t DIEOffset = CUOffset + *EntryOr->getDIEUnitOffset();
+ DWARFDie DIE = DCtx.getDIEForOffset(DIEOffset);
+ if (!DIE) {
+ error() << formatv("Name Index @ {0:x}: Entry @ {1:x} references a "
+ "non-existing DIE @ {2:x}.\n",
+ NI.getUnitOffset(), EntryID, DIEOffset);
+ ++NumErrors;
+ continue;
+ }
+ if (DIE.getDwarfUnit()->getOffset() != CUOffset) {
+ error() << formatv("Name Index @ {0:x}: Entry @ {1:x}: mismatched CU of "
+ "DIE @ {2:x}: index - {3:x}; debug_info - {4:x}.\n",
+ NI.getUnitOffset(), EntryID, DIEOffset, CUOffset,
+ DIE.getDwarfUnit()->getOffset());
+ ++NumErrors;
+ }
+ if (DIE.getTag() != EntryOr->tag()) {
+ error() << formatv("Name Index @ {0:x}: Entry @ {1:x}: mismatched Tag of "
+ "DIE @ {2:x}: index - {3}; debug_info - {4}.\n",
+ NI.getUnitOffset(), EntryID, DIEOffset, EntryOr->tag(),
+ DIE.getTag());
+ ++NumErrors;
+ }
+
+ auto EntryNames = getNames(DIE);
+ if (!is_contained(EntryNames, Str)) {
+ error() << formatv("Name Index @ {0:x}: Entry @ {1:x}: mismatched Name "
+ "of DIE @ {2:x}: index - {3}; debug_info - {4}.\n",
+ NI.getUnitOffset(), EntryID, DIEOffset, Str,
+ make_range(EntryNames.begin(), EntryNames.end()));
+ ++NumErrors;
+ }
+ }
+ handleAllErrors(EntryOr.takeError(),
+ [&](const DWARFDebugNames::SentinelError &) {
+ if (NumEntries > 0)
+ return;
+ error() << formatv("Name Index @ {0:x}: Name {1} ({2}) is "
+ "not associated with any entries.\n",
+ NI.getUnitOffset(), NTE.getIndex(), Str);
+ ++NumErrors;
+ },
+ [&](const ErrorInfoBase &Info) {
+ error()
+ << formatv("Name Index @ {0:x}: Name {1} ({2}): {3}\n",
+ NI.getUnitOffset(), NTE.getIndex(), Str,
+ Info.message());
+ ++NumErrors;
+ });
+ return NumErrors;
+}
+
+static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) {
+ Optional<DWARFFormValue> Location = Die.findRecursively(DW_AT_location);
+ if (!Location)
+ return false;
+
+ auto ContainsInterestingOperators = [&](StringRef D) {
+ DWARFUnit *U = Die.getDwarfUnit();
+ DataExtractor Data(D, DCtx.isLittleEndian(), U->getAddressByteSize());
+ DWARFExpression Expression(Data, U->getVersion(), U->getAddressByteSize());
+ return any_of(Expression, [](DWARFExpression::Operation &Op) {
+ return !Op.isError() && (Op.getCode() == DW_OP_addr ||
+ Op.getCode() == DW_OP_form_tls_address ||
+ Op.getCode() == DW_OP_GNU_push_tls_address);
+ });
+ };
+
+ if (Optional<ArrayRef<uint8_t>> Expr = Location->getAsBlock()) {
+ // Inlined location.
+ if (ContainsInterestingOperators(toStringRef(*Expr)))
+ return true;
+ } else if (Optional<uint64_t> Offset = Location->getAsSectionOffset()) {
+ // Location list.
+ if (const DWARFDebugLoc *DebugLoc = DCtx.getDebugLoc()) {
+ if (const DWARFDebugLoc::LocationList *LocList =
+ DebugLoc->getLocationListAtOffset(*Offset)) {
+ if (any_of(LocList->Entries, [&](const DWARFDebugLoc::Entry &E) {
+ return ContainsInterestingOperators({E.Loc.data(), E.Loc.size()});
+ }))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
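As a concrete case (hypothetical DIEs): a file-scope variable whose DW_AT_location is the inline expression DW_OP_addr followed by an address-sized operand contains one of the interesting operators and is treated as indexable, while a stack local described by DW_OP_fbreg with an offset contains none of them and is skipped.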
+unsigned DWARFVerifier::verifyNameIndexCompleteness(
+ const DWARFDie &Die, const DWARFDebugNames::NameIndex &NI) {
+
+ // First, check if the Die should be indexed. The code follows the DWARF v5
+ // wording as closely as possible.
+
+ // "All non-defining declarations (that is, debugging information entries
+ // with a DW_AT_declaration attribute) are excluded."
+ if (Die.find(DW_AT_declaration))
+ return 0;
+
+ // "DW_TAG_namespace debugging information entries without a DW_AT_name
+ // attribute are included with the name “(anonymous namespace)”.
+ // All other debugging information entries without a DW_AT_name attribute
+ // are excluded."
+ // "If a subprogram or inlined subroutine is included, and has a
+ // DW_AT_linkage_name attribute, there will be an additional index entry for
+ // the linkage name."
+ auto EntryNames = getNames(Die);
+ if (EntryNames.empty())
+ return 0;
+
+ // We deviate from the specification here, which says:
+ // "The name index must contain an entry for each debugging information entry
+ // that defines a named subprogram, label, variable, type, or namespace,
+ // subject to ..."
+ // Instead of whitelisting all TAGs representing a "type" or a "subprogram",
+ // to make sure we catch any missing items, we blacklist all TAGs that we
+ // know shouldn't be indexed.
+ switch (Die.getTag()) {
+ // Compile unit has a name but it shouldn't be indexed.
+ case DW_TAG_compile_unit:
+ return 0;
+
+ // Function and template parameters are not globally visible, so we shouldn't
+ // index them.
+ case DW_TAG_formal_parameter:
+ case DW_TAG_template_value_parameter:
+ case DW_TAG_template_type_parameter:
+ case DW_TAG_GNU_template_parameter_pack:
+ case DW_TAG_GNU_template_template_param:
+ return 0;
+
+ // Object members aren't globally visible.
+ case DW_TAG_member:
+ return 0;
+
+ // According to a strict reading of the specification, enumerators should not
+ // be indexed (and LLVM currently does not do that). However, this causes
+ // problems for the debuggers, so we may need to reconsider this.
+ case DW_TAG_enumerator:
+ return 0;
+
+ // Imported declarations should not be indexed according to the specification
+ // and LLVM currently does not do that.
+ case DW_TAG_imported_declaration:
+ return 0;
+
+ // "DW_TAG_subprogram, DW_TAG_inlined_subroutine, and DW_TAG_label debugging
+ // information entries without an address attribute (DW_AT_low_pc,
+ // DW_AT_high_pc, DW_AT_ranges, or DW_AT_entry_pc) are excluded."
+ case DW_TAG_subprogram:
+ case DW_TAG_inlined_subroutine:
+ case DW_TAG_label:
+ if (Die.findRecursively(
+ {DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_entry_pc}))
+ break;
+ return 0;
+
+ // "DW_TAG_variable debugging information entries with a DW_AT_location
+ // attribute that includes a DW_OP_addr or DW_OP_form_tls_address operator are
+ // included; otherwise, they are excluded."
+ //
+ // LLVM extension: We also add DW_OP_GNU_push_tls_address to this list.
+ case DW_TAG_variable:
+ if (isVariableIndexable(Die, DCtx))
+ break;
+ return 0;
+
+ default:
+ break;
+ }
+
+ // Now we know that our Die should be present in the Index. Let's check if
+ // that's the case.
+ unsigned NumErrors = 0;
+ uint64_t DieUnitOffset = Die.getOffset() - Die.getDwarfUnit()->getOffset();
+ for (StringRef Name : EntryNames) {
+ if (none_of(NI.equal_range(Name), [&](const DWARFDebugNames::Entry &E) {
+ return E.getDIEUnitOffset() == DieUnitOffset;
+ })) {
+ error() << formatv("Name Index @ {0:x}: Entry for DIE @ {1:x} ({2}) with "
+ "name {3} missing.\n",
+ NI.getUnitOffset(), Die.getOffset(), Die.getTag(),
+ Name);
+ ++NumErrors;
+ }
+ }
+ return NumErrors;
+}
+
+unsigned DWARFVerifier::verifyDebugNames(const DWARFSection &AccelSection,
+ const DataExtractor &StrData) {
+ unsigned NumErrors = 0;
+ DWARFDataExtractor AccelSectionData(DCtx.getDWARFObj(), AccelSection,
+ DCtx.isLittleEndian(), 0);
+ DWARFDebugNames AccelTable(AccelSectionData, StrData);
+
+ OS << "Verifying .debug_names...\n";
+
+ // This verifies that we can read individual name indices and their
+ // abbreviation tables.
+ if (Error E = AccelTable.extract()) {
+ error() << toString(std::move(E)) << '\n';
+ return 1;
+ }
+
+ NumErrors += verifyDebugNamesCULists(AccelTable);
+ for (const auto &NI : AccelTable)
+ NumErrors += verifyNameIndexBuckets(NI, StrData);
+ for (const auto &NI : AccelTable)
+ NumErrors += verifyNameIndexAbbrevs(NI);
+
+ // Don't attempt Entry validation if any of the previous checks found errors
+ if (NumErrors > 0)
+ return NumErrors;
+ for (const auto &NI : AccelTable)
+ for (DWARFDebugNames::NameTableEntry NTE : NI)
+ NumErrors += verifyNameIndexEntries(NI, NTE);
+
+ if (NumErrors > 0)
+ return NumErrors;
+
+ for (const std::unique_ptr<DWARFCompileUnit> &CU : DCtx.compile_units()) {
+ if (const DWARFDebugNames::NameIndex *NI =
+ AccelTable.getCUNameIndex(CU->getOffset())) {
+ for (const DWARFDebugInfoEntry &Die : CU->dies())
+ NumErrors += verifyNameIndexCompleteness(DWARFDie(CU.get(), &Die), *NI);
+ }
+ }
+ return NumErrors;
+}
+
bool DWARFVerifier::handleAccelTables() {
const DWARFObject &D = DCtx.getDWARFObj();
DataExtractor StrData(D.getStringSection(), DCtx.isLittleEndian(), 0);
unsigned NumErrors = 0;
if (!D.getAppleNamesSection().Data.empty())
NumErrors +=
- verifyAccelTable(&D.getAppleNamesSection(), &StrData, ".apple_names");
+ verifyAppleAccelTable(&D.getAppleNamesSection(), &StrData, ".apple_names");
if (!D.getAppleTypesSection().Data.empty())
NumErrors +=
- verifyAccelTable(&D.getAppleTypesSection(), &StrData, ".apple_types");
+ verifyAppleAccelTable(&D.getAppleTypesSection(), &StrData, ".apple_types");
if (!D.getAppleNamespacesSection().Data.empty())
- NumErrors += verifyAccelTable(&D.getAppleNamespacesSection(), &StrData,
+ NumErrors += verifyAppleAccelTable(&D.getAppleNamespacesSection(), &StrData,
".apple_namespaces");
if (!D.getAppleObjCSection().Data.empty())
NumErrors +=
- verifyAccelTable(&D.getAppleObjCSection(), &StrData, ".apple_objc");
+ verifyAppleAccelTable(&D.getAppleObjCSection(), &StrData, ".apple_objc");
+
+ if (!D.getDebugNamesSection().Data.empty())
+ NumErrors += verifyDebugNames(D.getDebugNamesSection(), StrData);
return NumErrors == 0;
}
-raw_ostream &DWARFVerifier::error() const {
- return WithColor(OS, syntax::Error).get() << "error: ";
-}
+raw_ostream &DWARFVerifier::error() const { return WithColor::error(OS); }
-raw_ostream &DWARFVerifier::warn() const {
- return WithColor(OS, syntax::Warning).get() << "warning: ";
-}
+raw_ostream &DWARFVerifier::warn() const { return WithColor::warning(OS); }
-raw_ostream &DWARFVerifier::note() const {
- return WithColor(OS, syntax::Note).get() << "note: ";
-}
+raw_ostream &DWARFVerifier::note() const { return WithColor::note(OS); }
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
deleted file mode 100644
index 65d66fc8f514..000000000000
--- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===- SyntaxHighlighting.cpp ---------------------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "SyntaxHighlighting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-using namespace dwarf;
-using namespace syntax;
-
-static cl::opt<cl::boolOrDefault>
- UseColor("color",
- cl::desc("use colored syntax highlighting (default=autodetect)"),
- cl::init(cl::BOU_UNSET));
-
-WithColor::WithColor(raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
- // Detect color from terminal type unless the user passed the --color option.
- if (UseColor == cl::BOU_UNSET ? OS.has_colors() : UseColor == cl::BOU_TRUE) {
- switch (Type) {
- case Address: OS.changeColor(raw_ostream::YELLOW); break;
- case String: OS.changeColor(raw_ostream::GREEN); break;
- case Tag: OS.changeColor(raw_ostream::BLUE); break;
- case Attribute: OS.changeColor(raw_ostream::CYAN); break;
- case Enumerator: OS.changeColor(raw_ostream::MAGENTA); break;
- case Macro: OS.changeColor(raw_ostream::RED); break;
- case Error: OS.changeColor(raw_ostream::RED, true); break;
- case Warning: OS.changeColor(raw_ostream::MAGENTA, true); break;
- case Note: OS.changeColor(raw_ostream::BLACK, true); break;
- }
- }
-}
-
-WithColor::~WithColor() {
- if (UseColor == cl::BOU_UNSET ? OS.has_colors() : UseColor == cl::BOU_TRUE)
- OS.resetColor();
-}
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h
deleted file mode 100644
index 686cf2c77608..000000000000
--- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//===- SyntaxHighlighting.h -------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
-#define LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
-
-namespace llvm {
-
-class raw_ostream;
-
-namespace dwarf {
-namespace syntax {
-
-// Symbolic names for various syntax elements.
-enum HighlightColor {
- Address,
- String,
- Tag,
- Attribute,
- Enumerator,
- Macro,
- Error,
- Warning,
- Note
-};
-
-/// An RAII object that temporarily switches an output stream to a
-/// specific color.
-class WithColor {
- raw_ostream &OS;
-
-public:
- /// To be used like this: WithColor(OS, syntax::String) << "text";
- WithColor(raw_ostream &OS, enum HighlightColor Type);
- ~WithColor();
-
- raw_ostream &get() { return OS; }
- operator raw_ostream &() { return OS; }
-};
-
-} // end namespace syntax
-} // end namespace dwarf
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
diff --git a/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp b/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
index 9cd22ab7d887..71609919558a 100644
--- a/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
@@ -7,11 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/DebugInfo/MSF/MSFError.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/FileOutputBuffer.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -29,7 +33,7 @@ static const uint32_t kFreePageMap0Block = 1;
static const uint32_t kFreePageMap1Block = 2;
static const uint32_t kNumReservedPages = 3;
-static const uint32_t kDefaultFreePageMap = kFreePageMap0Block;
+static const uint32_t kDefaultFreePageMap = kFreePageMap1Block;
static const uint32_t kDefaultBlockMapAddr = kNumReservedPages;
MSFBuilder::MSFBuilder(uint32_t BlockSize, uint32_t MinBlockCount, bool CanGrow,
@@ -112,11 +116,11 @@ Error MSFBuilder::allocateBlocks(uint32_t NumBlocks,
FreeBlocks.resize(NewBlockCount, true);
// If we crossed over an fpm page, we actually need to allocate 2 extra
// blocks for each FPM group crossed and mark both blocks from the group as
- // used. We may not actually use them since there are many more FPM blocks
- // present than are required to represent all blocks in a given PDB, but we
- // need to make sure they aren't allocated to a stream or something else.
- // At the end when committing the PDB, we'll go through and mark the
- // extraneous ones unused.
+ // used. FPM blocks are marked as allocated regardless of whether or not
+ // they ultimately describe the status of blocks in the file. This means
+ // that not only are extraneous blocks at the end of the main FPM marked as
+ // allocated, but also blocks from the alternate FPM are always marked as
+ // allocated.
while (NextFpmBlock < NewBlockCount) {
NewBlockCount += 2;
FreeBlocks.resize(NewBlockCount, true);
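To put concrete numbers on the FPM-group comment above (illustrative only; 4096 is just a common PDB block size): candidate FPM pages sit at blocks 1 and 2, then 4097 and 4098, 8193 and 8194, and so on, one main/alternate pair per 4096-block interval, even though a single 4096-byte FPM page can describe 4096 * 8 = 32768 blocks, which is why most of the reserved pages never carry live free-map data.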
@@ -244,20 +248,7 @@ uint32_t MSFBuilder::computeDirectoryByteSize() const {
return Size;
}
-static void finalizeFpmBlockStatus(uint32_t B, ArrayRef<ulittle32_t> &FpmBlocks,
- BitVector &Fpm) {
- if (FpmBlocks.empty() || FpmBlocks.front() != B) {
- Fpm.set(B);
- return;
- }
-
- // If the next block in the actual layout is this block, it should *not* be
- // free.
- assert(!Fpm.test(B));
- FpmBlocks = FpmBlocks.drop_front();
-}
-
-Expected<MSFLayout> MSFBuilder::build() {
+Expected<MSFLayout> MSFBuilder::generateLayout() {
SuperBlock *SB = Allocator.Allocate<SuperBlock>();
MSFLayout L;
L.SB = SB;
@@ -315,20 +306,77 @@ Expected<MSFLayout> MSFBuilder::build() {
}
}
- // FPM blocks occur in pairs at every `BlockLength` interval. While blocks of
- // this form are reserved for FPM blocks, not all blocks of this form will
- // actually be needed for FPM data because there are more blocks of this form
- // than are required to represent a PDB file with a given number of blocks.
- // So we need to find out which blocks are *actually* going to be real FPM
- // blocks, then mark the reset of the reserved blocks as unallocated.
- MSFStreamLayout FpmLayout = msf::getFpmStreamLayout(L, true);
- auto FpmBlocks = makeArrayRef(FpmLayout.Blocks);
- for (uint32_t B = kFreePageMap0Block; B < SB->NumBlocks;
- B += msf::getFpmIntervalLength(L)) {
- finalizeFpmBlockStatus(B, FpmBlocks, FreeBlocks);
- finalizeFpmBlockStatus(B + 1, FpmBlocks, FreeBlocks);
- }
L.FreePageMap = FreeBlocks;
return L;
}
+
+static void commitFpm(WritableBinaryStream &MsfBuffer, const MSFLayout &Layout,
+ BumpPtrAllocator &Allocator) {
+ auto FpmStream =
+ WritableMappedBlockStream::createFpmStream(Layout, MsfBuffer, Allocator);
+
+ // We only need to create the alt fpm stream so that it gets initialized.
+ WritableMappedBlockStream::createFpmStream(Layout, MsfBuffer, Allocator,
+ true);
+
+ uint32_t BI = 0;
+ BinaryStreamWriter FpmWriter(*FpmStream);
+ while (BI < Layout.SB->NumBlocks) {
+ uint8_t ThisByte = 0;
+ for (uint32_t I = 0; I < 8; ++I) {
+ bool IsFree =
+ (BI < Layout.SB->NumBlocks) ? Layout.FreePageMap.test(BI) : true;
+ uint8_t Mask = uint8_t(IsFree) << I;
+ ThisByte |= Mask;
+ ++BI;
+ }
+ cantFail(FpmWriter.writeObject(ThisByte));
+ }
+ assert(FpmWriter.bytesRemaining() == 0);
+}
+
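A quick worked example of the byte packing in commitFpm above (made-up flags): if blocks 0 through 7 are used, used, used, free, used, free, free, free, then bit I of the first FPM byte holds block I's free flag, giving 0b11101000 = 0xE8.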
+Expected<FileBufferByteStream> MSFBuilder::commit(StringRef Path,
+ MSFLayout &Layout) {
+ Expected<MSFLayout> L = generateLayout();
+ if (!L)
+ return L.takeError();
+
+ Layout = std::move(*L);
+
+ uint64_t FileSize = Layout.SB->BlockSize * Layout.SB->NumBlocks;
+ auto OutFileOrError = FileOutputBuffer::create(Path, FileSize);
+ if (auto EC = OutFileOrError.takeError())
+ return std::move(EC);
+
+ FileBufferByteStream Buffer(std::move(*OutFileOrError),
+ llvm::support::little);
+ BinaryStreamWriter Writer(Buffer);
+
+ if (auto EC = Writer.writeObject(*Layout.SB))
+ return std::move(EC);
+
+ commitFpm(Buffer, Layout, Allocator);
+
+ uint32_t BlockMapOffset =
+ msf::blockToOffset(Layout.SB->BlockMapAddr, Layout.SB->BlockSize);
+ Writer.setOffset(BlockMapOffset);
+ if (auto EC = Writer.writeArray(Layout.DirectoryBlocks))
+ return std::move(EC);
+
+ auto DirStream = WritableMappedBlockStream::createDirectoryStream(
+ Layout, Buffer, Allocator);
+ BinaryStreamWriter DW(*DirStream);
+ if (auto EC = DW.writeInteger<uint32_t>(Layout.StreamSizes.size()))
+ return std::move(EC);
+
+ if (auto EC = DW.writeArray(Layout.StreamSizes))
+ return std::move(EC);
+
+ for (const auto &Blocks : Layout.StreamMap) {
+ if (auto EC = DW.writeArray(Blocks))
+ return std::move(EC);
+ }
+
+ return std::move(Buffer);
+}
diff --git a/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp b/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp
index d7e1dcf31a3a..d398304375ac 100644
--- a/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp
+++ b/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp
@@ -64,15 +64,13 @@ MSFStreamLayout llvm::msf::getFpmStreamLayout(const MSFLayout &Msf,
bool IncludeUnusedFpmData,
bool AltFpm) {
MSFStreamLayout FL;
- uint32_t NumFpmIntervals = getNumFpmIntervals(Msf, IncludeUnusedFpmData);
- support::ulittle32_t FpmBlock = Msf.SB->FreeBlockMapBlock;
- assert(FpmBlock == 1 || FpmBlock == 2);
- if (AltFpm) {
- // If they requested the alternate FPM, then 2 becomes 1 and 1 becomes 2.
- FpmBlock = 3U - FpmBlock;
- }
+ uint32_t NumFpmIntervals =
+ getNumFpmIntervals(Msf, IncludeUnusedFpmData, AltFpm);
+
+ uint32_t FpmBlock = AltFpm ? Msf.alternateFpmBlock() : Msf.mainFpmBlock();
+
for (uint32_t I = 0; I < NumFpmIntervals; ++I) {
- FL.Blocks.push_back(FpmBlock);
+ FL.Blocks.push_back(support::ulittle32_t(FpmBlock));
FpmBlock += msf::getFpmIntervalLength(Msf);
}
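
After this change the FPM layout helpers take the starting block straight from the superblock: the main map sits at block 1 or 2 (whichever FreeBlockMapBlock names), the alternate at the other, and a new FPM block appears once per interval. A small sketch of how the block numbers fall out, assuming the interval length equals the superblock BlockSize, which is what getFpmIntervalLength returns in this tree.

    #include <cstdint>
    #include <vector>

    // Sketch: block numbers occupied by the main or alternate FPM, assuming
    // FreeBlockMapBlock == 1 (main map at block 1, alternate at block 2) and
    // an interval length equal to BlockSize.
    static std::vector<uint32_t> fpmBlocks(uint32_t BlockSize, uint32_t NumBlocks,
                                           bool AltFpm) {
      std::vector<uint32_t> Blocks;
      for (uint32_t B = AltFpm ? 2 : 1; B < NumBlocks; B += BlockSize)
        Blocks.push_back(B);
      return Blocks; // BlockSize=4096, main map: 1, 4097, 8193, ...
    }
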
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumInjectedSources.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumInjectedSources.cpp
new file mode 100644
index 000000000000..d7c908e04593
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumInjectedSources.cpp
@@ -0,0 +1,52 @@
+//==- DIAEnumInjectedSources.cpp - Injected Source Enumerator impl -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAInjectedSource.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+DIAEnumInjectedSources::DIAEnumInjectedSources(
+ const DIASession &PDBSession,
+ CComPtr<IDiaEnumInjectedSources> DiaEnumerator)
+ : Session(PDBSession), Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumInjectedSources::getChildCount() const {
+ LONG Count = 0;
+ return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBInjectedSource>
+DIAEnumInjectedSources::getChildAtIndex(uint32_t Index) const {
+ CComPtr<IDiaInjectedSource> Item;
+ if (S_OK != Enumerator->Item(Index, &Item))
+ return nullptr;
+
+ return std::unique_ptr<IPDBInjectedSource>(new DIAInjectedSource(Item));
+}
+
+std::unique_ptr<IPDBInjectedSource> DIAEnumInjectedSources::getNext() {
+ CComPtr<IDiaInjectedSource> Item;
+ ULONG NumFetched = 0;
+ if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+ return nullptr;
+
+ return std::unique_ptr<IPDBInjectedSource>(new DIAInjectedSource(Item));
+}
+
+void DIAEnumInjectedSources::reset() { Enumerator->Reset(); }
+
+DIAEnumInjectedSources *DIAEnumInjectedSources::clone() const {
+ CComPtr<IDiaEnumInjectedSources> EnumeratorClone;
+ if (S_OK != Enumerator->Clone(&EnumeratorClone))
+ return nullptr;
+ return new DIAEnumInjectedSources(Session, EnumeratorClone);
+}
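
All of the DIA enumerator wrappers in this patch share the same surface: a child count, random access by index, forward iteration via getNext, reset, and clone. A hedged usage sketch for the injected-source enumerator is below; it assumes an IPDBSession that exposes getInjectedSources() (as DIASession does later in this diff) and header names as laid out in the llvm/DebugInfo/PDB tree.

    #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
    #include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
    #include "llvm/DebugInfo/PDB/IPDBSession.h"
    #include "llvm/Support/raw_ostream.h"

    // Sketch: walk every injected source in a PDB session and print its name
    // and size. Returns silently if the session has no injected source table.
    static void dumpInjectedSources(llvm::pdb::IPDBSession &Session,
                                    llvm::raw_ostream &OS) {
      auto Sources = Session.getInjectedSources();
      if (!Sources)
        return;
      while (auto IS = Sources->getNext())
        OS << IS->getFileName() << " (" << IS->getCodeByteSize() << " bytes)\n";
    }
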
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSectionContribs.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSectionContribs.cpp
new file mode 100644
index 000000000000..1f405f049198
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSectionContribs.cpp
@@ -0,0 +1,54 @@
+//==- DIAEnumSectionContribs.cpp ---------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASectionContrib.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+DIAEnumSectionContribs::DIAEnumSectionContribs(
+ const DIASession &PDBSession,
+ CComPtr<IDiaEnumSectionContribs> DiaEnumerator)
+ : Session(PDBSession), Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumSectionContribs::getChildCount() const {
+ LONG Count = 0;
+ return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBSectionContrib>
+DIAEnumSectionContribs::getChildAtIndex(uint32_t Index) const {
+ CComPtr<IDiaSectionContrib> Item;
+ if (S_OK != Enumerator->Item(Index, &Item))
+ return nullptr;
+
+ return std::unique_ptr<IPDBSectionContrib>(
+ new DIASectionContrib(Session, Item));
+}
+
+std::unique_ptr<IPDBSectionContrib> DIAEnumSectionContribs::getNext() {
+ CComPtr<IDiaSectionContrib> Item;
+ ULONG NumFetched = 0;
+ if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+ return nullptr;
+
+ return std::unique_ptr<IPDBSectionContrib>(
+ new DIASectionContrib(Session, Item));
+}
+
+void DIAEnumSectionContribs::reset() { Enumerator->Reset(); }
+
+DIAEnumSectionContribs *DIAEnumSectionContribs::clone() const {
+ CComPtr<IDiaEnumSectionContribs> EnumeratorClone;
+ if (S_OK != Enumerator->Clone(&EnumeratorClone))
+ return nullptr;
+ return new DIAEnumSectionContribs(Session, EnumeratorClone);
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAInjectedSource.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAInjectedSource.cpp
new file mode 100644
index 000000000000..1d642f221d79
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAInjectedSource.cpp
@@ -0,0 +1,63 @@
+//===- DIAInjectedSource.cpp - DIA impl for IPDBInjectedSource --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAInjectedSource.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAUtils.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+DIAInjectedSource::DIAInjectedSource(CComPtr<IDiaInjectedSource> DiaSourceFile)
+ : SourceFile(DiaSourceFile) {}
+
+uint32_t DIAInjectedSource::getCrc32() const {
+ DWORD Crc;
+ return (S_OK == SourceFile->get_crc(&Crc)) ? Crc : 0;
+}
+
+uint64_t DIAInjectedSource::getCodeByteSize() const {
+ ULONGLONG Size;
+ return (S_OK == SourceFile->get_length(&Size)) ? Size : 0;
+}
+
+std::string DIAInjectedSource::getFileName() const {
+ return invokeBstrMethod(*SourceFile, &IDiaInjectedSource::get_filename);
+}
+
+std::string DIAInjectedSource::getObjectFileName() const {
+ return invokeBstrMethod(*SourceFile, &IDiaInjectedSource::get_objectFilename);
+}
+
+std::string DIAInjectedSource::getVirtualFileName() const {
+ return invokeBstrMethod(*SourceFile,
+ &IDiaInjectedSource::get_virtualFilename);
+}
+
+PDB_SourceCompression DIAInjectedSource::getCompression() const {
+ DWORD Compression = 0;
+ if (S_OK != SourceFile->get_sourceCompression(&Compression))
+ return PDB_SourceCompression::None;
+ return static_cast<PDB_SourceCompression>(Compression);
+}
+
+std::string DIAInjectedSource::getCode() const {
+ DWORD DataSize;
+ if (S_OK != SourceFile->get_source(0, &DataSize, nullptr))
+ return "";
+
+ std::vector<uint8_t> Buffer(DataSize);
+ if (S_OK != SourceFile->get_source(DataSize, &DataSize, Buffer.data()))
+ return "";
+ assert(Buffer.size() == DataSize);
+ return std::string(reinterpret_cast<const char *>(Buffer.data()),
+ Buffer.size());
+}
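
getCode() above uses the usual two-call COM idiom: query the size with a null buffer, then fetch into a buffer of exactly that size. A self-contained restatement of the pattern, with a hypothetical Fetch callback standing in for a method like IDiaInjectedSource::get_source:

    #include <cstdint>
    #include <functional>
    #include <string>
    #include <vector>

    // Sketch of the size-query-then-fetch idiom. Fetch reports the available
    // byte count when handed a null buffer and copies data otherwise; it is a
    // stand-in for a COM getter, not a real DIA call.
    static std::string fetchTwoPhase(
        const std::function<bool(uint32_t Cap, uint32_t &Size, uint8_t *Buf)> &Fetch) {
      uint32_t Size = 0;
      if (!Fetch(0, Size, nullptr))
        return "";
      std::vector<uint8_t> Buffer(Size);
      if (!Fetch(Size, Size, Buffer.data()))
        return "";
      return std::string(reinterpret_cast<const char *>(Buffer.data()), Size);
    }
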
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
index 8e4b1f8aa8c9..7d6cb254e1d1 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
@@ -11,7 +11,9 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/DebugInfo/CodeView/Formatters.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
+#include "llvm/DebugInfo/PDB/DIA/DIALineNumber.h"
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#include "llvm/DebugInfo/PDB/PDBExtras.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
@@ -178,7 +180,7 @@ void DumpDIAValue(llvm::raw_ostream &OS, int Indent, StringRef Name,
OS << "\n";
OS.indent(Indent);
Variant V = VariantFromVARIANT(Value);
- OS << V;
+ OS << Name << ": " << V;
}
}
@@ -400,6 +402,47 @@ DIARawSymbol::findChildren(PDB_SymType Type, StringRef Name,
}
std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findChildrenByAddr(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags, uint32_t Section,
+ uint32_t Offset) const {
+ llvm::SmallVector<UTF16, 32> Name16;
+ llvm::convertUTF8ToUTF16String(Name, Name16);
+
+ enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
+
+ DWORD CompareFlags = static_cast<DWORD>(Flags);
+ wchar_t *Name16Str = reinterpret_cast<wchar_t *>(Name16.data());
+
+ CComPtr<IDiaEnumSymbols> DiaEnumerator;
+ if (S_OK !=
+ Symbol->findChildrenExByAddr(EnumVal, Name16Str, CompareFlags, Section,
+ Offset, &DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findChildrenByVA(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags, uint64_t VA) const {
+ llvm::SmallVector<UTF16, 32> Name16;
+ llvm::convertUTF8ToUTF16String(Name, Name16);
+
+ enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
+
+ DWORD CompareFlags = static_cast<DWORD>(Flags);
+ wchar_t *Name16Str = reinterpret_cast<wchar_t *>(Name16.data());
+
+ CComPtr<IDiaEnumSymbols> DiaEnumerator;
+ if (S_OK !=
+ Symbol->findChildrenExByVA(EnumVal, Name16Str, CompareFlags, VA,
+ &DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
DIARawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
PDB_NameSearchFlags Flags, uint32_t RVA) const {
llvm::SmallVector<UTF16, 32> Name16;
@@ -419,6 +462,15 @@ DIARawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
}
std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findInlineFramesByAddr(uint32_t Section, uint32_t Offset) const {
+ CComPtr<IDiaEnumSymbols> DiaEnumerator;
+ if (S_OK != Symbol->findInlineFramesByAddr(Section, Offset, &DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
DIARawSymbol::findInlineFramesByRVA(uint32_t RVA) const {
CComPtr<IDiaEnumSymbols> DiaEnumerator;
if (S_OK != Symbol->findInlineFramesByRVA(RVA, &DiaEnumerator))
@@ -427,6 +479,51 @@ DIARawSymbol::findInlineFramesByRVA(uint32_t RVA) const {
return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
}
+std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findInlineFramesByVA(uint64_t VA) const {
+ CComPtr<IDiaEnumSymbols> DiaEnumerator;
+ if (S_OK != Symbol->findInlineFramesByVA(VA, &DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumLineNumbers> DIARawSymbol::findInlineeLines() const {
+ CComPtr<IDiaEnumLineNumbers> DiaEnumerator;
+ if (S_OK != Symbol->findInlineeLines(&DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumLineNumbers>(DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+DIARawSymbol::findInlineeLinesByAddr(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const {
+ CComPtr<IDiaEnumLineNumbers> DiaEnumerator;
+ if (S_OK != Symbol->findInlineeLinesByAddr(Section, Offset, Length, &DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumLineNumbers>(DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+DIARawSymbol::findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const {
+ CComPtr<IDiaEnumLineNumbers> DiaEnumerator;
+ if (S_OK != Symbol->findInlineeLinesByRVA(RVA, Length, &DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumLineNumbers>(DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+DIARawSymbol::findInlineeLinesByVA(uint64_t VA, uint32_t Length) const {
+ CComPtr<IDiaEnumLineNumbers> DiaEnumerator;
+ if (S_OK != Symbol->findInlineeLinesByVA(VA, Length, &DiaEnumerator))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumLineNumbers>(DiaEnumerator);
+}
+
void DIARawSymbol::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const {
bytes.clear();
@@ -652,6 +749,15 @@ std::string DIARawSymbol::getSourceFileName() const {
return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_sourceFileName);
}
+std::unique_ptr<IPDBLineNumber>
+DIARawSymbol::getSrcLineOnTypeDefn() const {
+ CComPtr<IDiaLineNumber> LineNumber;
+ if (FAILED(Symbol->getSrcLineOnTypeDefn(&LineNumber)) || !LineNumber)
+ return nullptr;
+
+ return llvm::make_unique<DIALineNumber>(LineNumber);
+}
+
uint32_t DIARawSymbol::getStride() const {
return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_stride);
}
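
The new findChildrenByAddr/findChildrenByVA overloads repeat the same prologue as the existing name-based searches: convert the UTF-8 name to UTF-16 and hand DIA a wide-character pointer into that buffer. A sketch of just that step, using llvm::convertUTF8ToUTF16String as the wrappers do; treating the UTF16 buffer as wchar_t assumes a 16-bit wchar_t, which holds on the Windows-only DIA path.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/ConvertUTF.h"

    // Sketch of the name-conversion prologue shared by the findChildrenBy*
    // wrappers: DIA expects wide strings, so the UTF-8 name is widened first.
    static const wchar_t *toWide(llvm::StringRef Name,
                                 llvm::SmallVectorImpl<llvm::UTF16> &Storage) {
      if (!llvm::convertUTF8ToUTF16String(Name, Storage))
        return nullptr;
      return reinterpret_cast<const wchar_t *>(Storage.data());
    }
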
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp
new file mode 100644
index 000000000000..b7dc49f53e23
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp
@@ -0,0 +1,126 @@
+//===- DIASectionContrib.cpp - DIA impl. of IPDBSectionContrib --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIASectionContrib.h"
+#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+DIASectionContrib::DIASectionContrib(const DIASession &PDBSession,
+ CComPtr<IDiaSectionContrib> DiaSection)
+ : Session(PDBSession), Section(DiaSection) {}
+
+std::unique_ptr<PDBSymbolCompiland> DIASectionContrib::getCompiland() const {
+ CComPtr<IDiaSymbol> Symbol;
+ if (FAILED(Section->get_compiland(&Symbol)))
+ return nullptr;
+
+ auto RawSymbol = llvm::make_unique<DIARawSymbol>(Session, Symbol);
+ return llvm::make_unique<PDBSymbolCompiland>(Session, std::move(RawSymbol));
+}
+
+template <typename ArgType>
+ArgType
+PrivateGetDIAValue(IDiaSectionContrib *Section,
+ HRESULT (__stdcall IDiaSectionContrib::*Method)(ArgType *)) {
+ ArgType Value;
+ if (S_OK == (Section->*Method)(&Value))
+ return static_cast<ArgType>(Value);
+
+ return ArgType();
+}
+
+uint32_t DIASectionContrib::getAddressSection() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_addressSection);
+}
+
+uint32_t DIASectionContrib::getAddressOffset() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_addressOffset);
+}
+
+uint64_t DIASectionContrib::getVirtualAddress() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_virtualAddress);
+}
+
+uint32_t DIASectionContrib::getRelativeVirtualAddress() const {
+ return PrivateGetDIAValue(Section,
+ &IDiaSectionContrib::get_relativeVirtualAddress);
+}
+
+uint32_t DIASectionContrib::getLength() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_length);
+}
+
+bool DIASectionContrib::isNotPaged() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_notPaged);
+}
+
+bool DIASectionContrib::hasCode() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_code);
+}
+
+bool DIASectionContrib::hasCode16Bit() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_code16bit);
+}
+
+bool DIASectionContrib::hasInitializedData() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_initializedData);
+}
+
+bool DIASectionContrib::hasUninitializedData() const {
+ return PrivateGetDIAValue(Section,
+ &IDiaSectionContrib::get_uninitializedData);
+}
+
+bool DIASectionContrib::isRemoved() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_remove);
+}
+
+bool DIASectionContrib::hasComdat() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_comdat);
+}
+
+bool DIASectionContrib::isDiscardable() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_discardable);
+}
+
+bool DIASectionContrib::isNotCached() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_notCached);
+}
+
+bool DIASectionContrib::isShared() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_share);
+}
+
+bool DIASectionContrib::isExecutable() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_execute);
+}
+
+bool DIASectionContrib::isReadable() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_read);
+}
+
+bool DIASectionContrib::isWritable() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_write);
+}
+
+uint32_t DIASectionContrib::getDataCrc32() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_dataCrc);
+}
+
+uint32_t DIASectionContrib::getRelocationsCrc32() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_relocationsCrc);
+}
+
+uint32_t DIASectionContrib::getCompilandId() const {
+ return PrivateGetDIAValue(Section, &IDiaSectionContrib::get_compilandId);
+}
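
Nearly every accessor in this file goes through PrivateGetDIAValue, which invokes a getter through a pointer to member function and falls back to a value-initialized result on failure. A generic restatement of that helper with illustrative types (the real DIA getters return HRESULT and use the __stdcall convention):

    // Sketch of the pointer-to-member getter pattern used by the DIA wrappers:
    // call Obj->*Method, return the fetched value on success and a default
    // otherwise. Iface and the zero "S_OK" return code are illustrative.
    template <typename Iface, typename ArgType>
    ArgType getOrDefault(Iface *Obj, long (Iface::*Method)(ArgType *)) {
      ArgType Value{};
      if ((Obj->*Method)(&Value) == 0 /* S_OK */)
        return Value;
      return ArgType();
    }
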
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
index b8aaebbf7380..d81f59400eb3 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -9,7 +9,9 @@
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumTables.h"
#include "llvm/DebugInfo/PDB/DIA/DIAError.h"
@@ -104,7 +106,7 @@ Error DIASession::createFromPdb(StringRef Path,
if (!llvm::convertUTF8ToUTF16String(Path, Path16))
return make_error<GenericError>(generic_error_code::invalid_path);
- const wchar_t *Path16Str = reinterpret_cast<const wchar_t*>(Path16.data());
+ const wchar_t *Path16Str = reinterpret_cast<const wchar_t *>(Path16.data());
HRESULT HR;
if (FAILED(HR = DiaDataSource->loadDataFromPdb(Path16Str))) {
return ErrorFromHResult(HR, "Calling loadDataFromPdb {0}", Path);
@@ -148,8 +150,8 @@ uint64_t DIASession::getLoadAddress() const {
return (success) ? LoadAddress : 0;
}
-void DIASession::setLoadAddress(uint64_t Address) {
- Session->put_loadAddress(Address);
+bool DIASession::setLoadAddress(uint64_t Address) {
+ return (S_OK == Session->put_loadAddress(Address));
}
std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() {
@@ -164,6 +166,28 @@ std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() {
return ExeSymbol;
}
+bool DIASession::addressForVA(uint64_t VA, uint32_t &Section,
+ uint32_t &Offset) const {
+ DWORD ArgSection, ArgOffset = 0;
+ if (S_OK == Session->addressForVA(VA, &ArgSection, &ArgOffset)) {
+ Section = static_cast<uint32_t>(ArgSection);
+ Offset = static_cast<uint32_t>(ArgOffset);
+ return true;
+ }
+ return false;
+}
+
+bool DIASession::addressForRVA(uint32_t RVA, uint32_t &Section,
+ uint32_t &Offset) const {
+ DWORD ArgSection, ArgOffset = 0;
+ if (S_OK == Session->addressForRVA(RVA, &ArgSection, &ArgOffset)) {
+ Section = static_cast<uint32_t>(ArgSection);
+ Offset = static_cast<uint32_t>(ArgOffset);
+ return true;
+ }
+ return false;
+}
+
std::unique_ptr<PDBSymbol> DIASession::getSymbolById(uint32_t SymbolId) const {
CComPtr<IDiaSymbol> LocatedSymbol;
if (S_OK != Session->symbolById(SymbolId, &LocatedSymbol))
@@ -190,6 +214,31 @@ DIASession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
return PDBSymbol::create(*this, std::move(RawSymbol));
}
+std::unique_ptr<PDBSymbol> DIASession::findSymbolByRVA(uint32_t RVA,
+ PDB_SymType Type) const {
+ enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
+
+ CComPtr<IDiaSymbol> Symbol;
+ if (S_OK != Session->findSymbolByRVA(RVA, EnumVal, &Symbol))
+ return nullptr;
+
+ auto RawSymbol = llvm::make_unique<DIARawSymbol>(*this, Symbol);
+ return PDBSymbol::create(*this, std::move(RawSymbol));
+}
+
+std::unique_ptr<PDBSymbol>
+DIASession::findSymbolBySectOffset(uint32_t Sect, uint32_t Offset,
+ PDB_SymType Type) const {
+ enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
+
+ CComPtr<IDiaSymbol> Symbol;
+ if (S_OK != Session->findSymbolByAddr(Sect, Offset, EnumVal, &Symbol))
+ return nullptr;
+
+ auto RawSymbol = llvm::make_unique<DIARawSymbol>(*this, Symbol);
+ return PDBSymbol::create(*this, std::move(RawSymbol));
+}
+
std::unique_ptr<IPDBEnumLineNumbers>
DIASession::findLineNumbers(const PDBSymbolCompiland &Compiland,
const IPDBSourceFile &File) const {
@@ -198,9 +247,8 @@ DIASession::findLineNumbers(const PDBSymbolCompiland &Compiland,
const DIASourceFile &RawFile = static_cast<const DIASourceFile &>(File);
CComPtr<IDiaEnumLineNumbers> LineNumbers;
- if (S_OK !=
- Session->findLines(RawCompiland.getDiaSymbol(), RawFile.getDiaFile(),
- &LineNumbers))
+ if (S_OK != Session->findLines(RawCompiland.getDiaSymbol(),
+ RawFile.getDiaFile(), &LineNumbers))
return nullptr;
return llvm::make_unique<DIAEnumLineNumbers>(LineNumbers);
@@ -209,7 +257,31 @@ DIASession::findLineNumbers(const PDBSymbolCompiland &Compiland,
std::unique_ptr<IPDBEnumLineNumbers>
DIASession::findLineNumbersByAddress(uint64_t Address, uint32_t Length) const {
CComPtr<IDiaEnumLineNumbers> LineNumbers;
- if (S_OK != Session->findLinesByVA(Address, Length, &LineNumbers))
+ if (S_OK != Session->findLinesByVA(Address, Length, &LineNumbers)) {
+ ULONGLONG LoadAddr = 0;
+ if (S_OK != Session->get_loadAddress(&LoadAddr))
+ return nullptr;
+ DWORD RVA = static_cast<DWORD>(Address - LoadAddr);
+ if (S_OK != Session->findLinesByRVA(RVA, Length, &LineNumbers))
+ return nullptr;
+ }
+ return llvm::make_unique<DIAEnumLineNumbers>(LineNumbers);
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+DIASession::findLineNumbersByRVA(uint32_t RVA, uint32_t Length) const {
+ CComPtr<IDiaEnumLineNumbers> LineNumbers;
+ if (S_OK != Session->findLinesByRVA(RVA, Length, &LineNumbers))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumLineNumbers>(LineNumbers);
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+DIASession::findLineNumbersBySectOffset(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const {
+ CComPtr<IDiaEnumLineNumbers> LineNumbers;
+ if (S_OK != Session->findLinesByAddr(Section, Offset, Length, &LineNumbers))
return nullptr;
return llvm::make_unique<DIAEnumLineNumbers>(LineNumbers);
@@ -310,3 +382,40 @@ std::unique_ptr<IPDBEnumTables> DIASession::getEnumTables() const {
return llvm::make_unique<DIAEnumTables>(DiaEnumerator);
}
+
+template <class T> static CComPtr<T> getTableEnumerator(IDiaSession &Session) {
+ CComPtr<T> Enumerator;
+ CComPtr<IDiaEnumTables> ET;
+ CComPtr<IDiaTable> Table;
+ ULONG Count = 0;
+
+ if (Session.getEnumTables(&ET) != S_OK)
+ return nullptr;
+
+ while (ET->Next(1, &Table, &Count) == S_OK && Count == 1) {
+ // There is only one table that matches the given IID.
+ if (S_OK == Table->QueryInterface(__uuidof(T), (void **)&Enumerator))
+ break;
+ Table.Release();
+ }
+ return Enumerator;
+}
+std::unique_ptr<IPDBEnumInjectedSources>
+DIASession::getInjectedSources() const {
+ CComPtr<IDiaEnumInjectedSources> Files =
+ getTableEnumerator<IDiaEnumInjectedSources>(*Session);
+ if (!Files)
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumInjectedSources>(*this, Files);
+}
+
+std::unique_ptr<IPDBEnumSectionContribs>
+DIASession::getSectionContribs() const {
+ CComPtr<IDiaEnumSectionContribs> Sections =
+ getTableEnumerator<IDiaEnumSectionContribs>(*Session);
+ if (!Sections)
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumSectionContribs>(*this, Sections);
+}
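
findLineNumbersByAddress now falls back to an RVA query when the VA-based lookup fails: it reads the session's load address and rebases the virtual address before retrying. The rebasing itself is just RVA = VA - LoadAddress; a hedged sketch using llvm::Optional to signal an address below the image base:

    #include "llvm/ADT/Optional.h"
    #include <cstdint>

    // Sketch of the VA -> RVA rebasing behind the new fallback path. Whether
    // a load address is available at all is up to the caller.
    static llvm::Optional<uint32_t> rebaseToRVA(uint64_t VA, uint64_t LoadAddress) {
      if (VA < LoadAddress)
        return llvm::None;
      return static_cast<uint32_t>(VA - LoadAddress);
    }
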
diff --git a/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp b/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp
index 4fcecb92fd15..2a677b9abe2d 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp
@@ -30,7 +30,7 @@ public:
return "Type server PDB was not found.";
case generic_error_code::dia_sdk_not_present:
return "LLVM was not compiled with support for DIA. This usually means "
- "that you are are not using MSVC, or your Visual Studio "
+ "that you are not using MSVC, or your Visual Studio "
"installation "
"is corrupt.";
case generic_error_code::invalid_path:
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp
index dabcc3447ee5..931ac7bb81db 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp
@@ -49,6 +49,10 @@ uint16_t DbiModuleDescriptor::getTypeServerIndex() const {
ModInfoFlags::TypeServerIndexShift;
}
+const SectionContrib &DbiModuleDescriptor::getSectionContrib() const {
+ return Layout->SC;
+}
+
uint16_t DbiModuleDescriptor::getModuleStreamIndex() const {
return Layout->ModDiStream;
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
index d765485bdb6d..b97f1e90bcf8 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
@@ -60,6 +60,11 @@ void DbiModuleDescriptorBuilder::setPdbFilePathNI(uint32_t NI) {
PdbFilePathNI = NI;
}
+void DbiModuleDescriptorBuilder::setFirstSectionContrib(
+ const SectionContrib &SC) {
+ Layout.SC = SC;
+}
+
void DbiModuleDescriptorBuilder::addSymbol(CVSymbol Symbol) {
Symbols.push_back(Symbol);
// Symbols written to a PDB file are required to be 4 byte aligned. The same
@@ -90,7 +95,7 @@ uint32_t DbiModuleDescriptorBuilder::calculateSerializedLength() const {
}
void DbiModuleDescriptorBuilder::finalize() {
- Layout.SC.ModuleIndex = Layout.Mod;
+ Layout.SC.Imod = Layout.Mod;
Layout.FileNameOffs = 0; // TODO: Fix this
Layout.Flags = 0; // TODO: Fix this
Layout.C11Bytes = 0;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
index 04e6664c68db..edaa783398ca 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -45,12 +45,12 @@ static Error loadSectionContribs(FixedStreamArray<ContribType> &Output,
return Error::success();
}
-DbiStream::DbiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream)
- : Pdb(File), Stream(std::move(Stream)), Header(nullptr) {}
+DbiStream::DbiStream(std::unique_ptr<BinaryStream> Stream)
+ : Stream(std::move(Stream)), Header(nullptr) {}
DbiStream::~DbiStream() = default;
-Error DbiStream::reload() {
+Error DbiStream::reload(PDBFile *Pdb) {
BinaryStreamReader Reader(*Stream);
if (Stream->getLength() < sizeof(DbiStreamHeader))
@@ -123,11 +123,11 @@ Error DbiStream::reload() {
if (auto EC = initializeSectionContributionData())
return EC;
- if (auto EC = initializeSectionHeadersData())
+ if (auto EC = initializeSectionHeadersData(Pdb))
return EC;
if (auto EC = initializeSectionMapData())
return EC;
- if (auto EC = initializeFpoRecords())
+ if (auto EC = initializeFpoRecords(Pdb))
return EC;
if (Reader.bytesRemaining() > 0)
@@ -246,7 +246,10 @@ Error DbiStream::initializeSectionContributionData() {
}
// Initializes this->SectionHeaders.
-Error DbiStream::initializeSectionHeadersData() {
+Error DbiStream::initializeSectionHeadersData(PDBFile *Pdb) {
+ if (!Pdb)
+ return Error::success();
+
if (DbgStreams.size() == 0)
return Error::success();
@@ -254,11 +257,11 @@ Error DbiStream::initializeSectionHeadersData() {
if (StreamNum == kInvalidStreamIndex)
return Error::success();
- if (StreamNum >= Pdb.getNumStreams())
+ if (StreamNum >= Pdb->getNumStreams())
return make_error<RawError>(raw_error_code::no_stream);
auto SHS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator());
+ Pdb->getMsfLayout(), Pdb->getMsfBuffer(), StreamNum, Pdb->getAllocator());
size_t StreamLen = SHS->getLength();
if (StreamLen % sizeof(object::coff_section))
@@ -276,7 +279,10 @@ Error DbiStream::initializeSectionHeadersData() {
}
// Initializes this->Fpos.
-Error DbiStream::initializeFpoRecords() {
+Error DbiStream::initializeFpoRecords(PDBFile *Pdb) {
+ if (!Pdb)
+ return Error::success();
+
if (DbgStreams.size() == 0)
return Error::success();
@@ -286,11 +292,11 @@ Error DbiStream::initializeFpoRecords() {
if (StreamNum == kInvalidStreamIndex)
return Error::success();
- if (StreamNum >= Pdb.getNumStreams())
+ if (StreamNum >= Pdb->getNumStreams())
return make_error<RawError>(raw_error_code::no_stream);
auto FS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator());
+ Pdb->getMsfLayout(), Pdb->getMsfBuffer(), StreamNum, Pdb->getAllocator());
size_t StreamLen = FS->getLength();
if (StreamLen % sizeof(object::FpoData))
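
DbiStream::reload now takes the owning PDBFile explicitly, and both initializeSectionHeadersData and initializeFpoRecords become no-ops when it is null, since those tables live in other streams of the file. Assuming the new single-argument constructor is reachable from client code, a DBI stream parsed from a bare BinaryStream could then be loaded like this:

    #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
    #include "llvm/Support/BinaryStream.h"
    #include <memory>

    // Sketch: reload a DBI stream without a surrounding PDBFile. Passing
    // nullptr skips the section-header and FPO initialization steps that need
    // access to sibling streams.
    static llvm::Error loadStandaloneDbi(std::unique_ptr<llvm::BinaryStream> Stream) {
      llvm::pdb::DbiStream Dbi(std::move(Stream));
      return Dbi.reload(/*Pdb=*/nullptr);
    }
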
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index c96553ff9b16..f6043bfd7cf9 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -27,7 +27,7 @@ using namespace llvm::pdb;
DbiStreamBuilder::DbiStreamBuilder(msf::MSFBuilder &Msf)
: Msf(Msf), Allocator(Msf.getAllocator()), Age(1), BuildNumber(0),
PdbDllVersion(0), PdbDllRbld(0), Flags(0), MachineType(PDB_Machine::x86),
- Header(nullptr), DbgStreams((int)DbgHeaderType::Max) {}
+ Header(nullptr) {}
DbiStreamBuilder::~DbiStreamBuilder() {}
@@ -37,6 +37,14 @@ void DbiStreamBuilder::setAge(uint32_t A) { Age = A; }
void DbiStreamBuilder::setBuildNumber(uint16_t B) { BuildNumber = B; }
+void DbiStreamBuilder::setBuildNumber(uint8_t Major, uint8_t Minor) {
+ BuildNumber = (uint16_t(Major) << DbiBuildNo::BuildMajorShift) &
+ DbiBuildNo::BuildMajorMask;
+ BuildNumber |= (uint16_t(Minor) << DbiBuildNo::BuildMinorShift) &
+ DbiBuildNo::BuildMinorMask;
+ BuildNumber |= DbiBuildNo::NewVersionFormatMask;
+}
+
void DbiStreamBuilder::setPdbDllVersion(uint16_t V) { PdbDllVersion = V; }
void DbiStreamBuilder::setPdbDllRbld(uint16_t R) { PdbDllRbld = R; }
@@ -45,6 +53,11 @@ void DbiStreamBuilder::setFlags(uint16_t F) { Flags = F; }
void DbiStreamBuilder::setMachineType(PDB_Machine M) { MachineType = M; }
+void DbiStreamBuilder::setMachineType(COFF::MachineTypes M) {
+ // These enums are mirrors of each other, so we can just cast the value.
+ MachineType = static_cast<pdb::PDB_Machine>(static_cast<unsigned>(M));
+}
+
void DbiStreamBuilder::setSectionMap(ArrayRef<SecMapEntry> SecMap) {
SectionMap = SecMap;
}
@@ -63,15 +76,8 @@ void DbiStreamBuilder::setPublicsStreamIndex(uint32_t Index) {
Error DbiStreamBuilder::addDbgStream(pdb::DbgHeaderType Type,
ArrayRef<uint8_t> Data) {
- if (DbgStreams[(int)Type].StreamNumber != kInvalidStreamIndex)
- return make_error<RawError>(raw_error_code::duplicate_entry,
- "The specified stream type already exists");
- auto ExpectedIndex = Msf.addStream(Data.size());
- if (!ExpectedIndex)
- return ExpectedIndex.takeError();
- uint32_t Index = std::move(*ExpectedIndex);
- DbgStreams[(int)Type].Data = Data;
- DbgStreams[(int)Type].StreamNumber = Index;
+ DbgStreams[(int)Type].emplace();
+ DbgStreams[(int)Type]->Data = Data;
return Error::success();
}
@@ -258,7 +264,7 @@ Error DbiStreamBuilder::finalize() {
H->TypeServerSize = 0;
H->SymRecordStreamIndex = SymRecordStreamIndex;
H->PublicSymbolStreamIndex = PublicsStreamIndex;
- H->MFCTypeServerIndex = kInvalidStreamIndex;
+ H->MFCTypeServerIndex = 0; // Not sure what this is, but link.exe writes 0.
H->GlobalSymbolStreamIndex = GlobalsStreamIndex;
Header = H;
@@ -266,6 +272,15 @@ Error DbiStreamBuilder::finalize() {
}
Error DbiStreamBuilder::finalizeMsfLayout() {
+ for (auto &S : DbgStreams) {
+ if (!S.hasValue())
+ continue;
+ auto ExpectedIndex = Msf.addStream(S->Data.size());
+ if (!ExpectedIndex)
+ return ExpectedIndex.takeError();
+ S->StreamNumber = *ExpectedIndex;
+ }
+
for (auto &MI : ModiList) {
if (auto EC = MI->finalizeMsfLayout())
return EC;
@@ -375,17 +390,23 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
if (auto EC = ECNamesBuilder.commit(Writer))
return EC;
- for (auto &Stream : DbgStreams)
- if (auto EC = Writer.writeInteger(Stream.StreamNumber))
+ for (auto &Stream : DbgStreams) {
+ uint16_t StreamNumber = kInvalidStreamIndex;
+ if (Stream.hasValue())
+ StreamNumber = Stream->StreamNumber;
+ if (auto EC = Writer.writeInteger(StreamNumber))
return EC;
+ }
for (auto &Stream : DbgStreams) {
- if (Stream.StreamNumber == kInvalidStreamIndex)
+ if (!Stream.hasValue())
continue;
+ assert(Stream->StreamNumber != kInvalidStreamIndex);
+
auto WritableStream = WritableMappedBlockStream::createIndexedStream(
- Layout, MsfBuffer, Stream.StreamNumber, Allocator);
+ Layout, MsfBuffer, Stream->StreamNumber, Allocator);
BinaryStreamWriter DbgStreamWriter(*WritableStream);
- if (auto EC = DbgStreamWriter.writeArray(Stream.Data))
+ if (auto EC = DbgStreamWriter.writeArray(Stream->Data))
return EC;
}
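
The new setBuildNumber(Major, Minor) overload packs both components into the single 16-bit DBI build number: minor in the low byte, major in bits 8-14, and the top bit flagging the new version format. A worked sketch with the masks written out as literals; the literal values are an assumption mirroring what DbiBuildNo is expected to define.

    #include <cstdint>

    // Sketch of the DBI build-number packing: minor in bits 0-7, major in
    // bits 8-14, bit 15 marking the new version format.
    static uint16_t packBuildNumber(uint8_t Major, uint8_t Minor) {
      uint16_t BN = (uint16_t(Major) << 8) & 0x7F00; // BuildMajorShift / Mask
      BN |= (uint16_t(Minor) << 0) & 0x00FF;         // BuildMinorShift / Mask
      BN |= 0x8000;                                  // NewVersionFormatMask
      return BN;                                     // e.g. (14, 11) -> 0x8E0B
    }
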
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
index e84f25dfeefa..58efc2256ae1 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
@@ -82,8 +82,29 @@ Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
return Error::success();
}
+static bool isAsciiString(StringRef S) {
+ return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
+}
+
+// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
+static bool gsiRecordLess(StringRef S1, StringRef S2) {
+ size_t LS = S1.size();
+ size_t RS = S2.size();
+ // Shorter strings always compare less than longer strings.
+ if (LS != RS)
+ return LS < RS;
+
+ // If either string contains non-ASCII characters, memcmp them.
+ if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
+ return memcmp(S1.data(), S2.data(), LS) < 0;
+
+ // Both strings are ASCII, so perform a case-insensitive comparison.
+ return S1.compare_lower(S2.data()) < 0;
+}
+
void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
- std::array<std::vector<PSHashRecord>, IPHR_HASH + 1> TmpBuckets;
+ std::array<std::vector<std::pair<StringRef, PSHashRecord>>, IPHR_HASH + 1>
+ TmpBuckets;
uint32_t SymOffset = RecordZeroOffset;
for (const CVSymbol &Sym : Records) {
PSHashRecord HR;
@@ -94,8 +115,7 @@ void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
// Hash the name to figure out which bucket this goes into.
StringRef Name = getSymbolName(Sym);
size_t BucketIdx = hashStringV1(Name) % IPHR_HASH;
- TmpBuckets[BucketIdx].push_back(HR); // FIXME: Does order matter?
-
+ TmpBuckets[BucketIdx].push_back(std::make_pair(Name, HR));
SymOffset += Sym.length();
}
@@ -117,8 +137,21 @@ void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
ulittle32_t ChainStartOff =
ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
HashBuckets.push_back(ChainStartOff);
- for (const auto &HR : Bucket)
- HashRecords.push_back(HR);
+
+ // Sort each bucket by memcmp of the symbol's name. It's important that
+ // we use the same sorting algorithm as is used by the reference
+ // implementation to ensure that the search for a record within a bucket
+ // can properly early-out when it detects the record won't be found. The
+ // algorithm used here corresponds to the function
+ // caseInsensitiveComparePchPchCchCch in the reference implementation.
+ llvm::sort(Bucket.begin(), Bucket.end(),
+ [](const std::pair<StringRef, PSHashRecord> &Left,
+ const std::pair<StringRef, PSHashRecord> &Right) {
+ return gsiRecordLess(Left.first, Right.first);
+ });
+
+ for (const auto &Entry : Bucket)
+ HashRecords.push_back(Entry.second);
}
}
@@ -150,14 +183,14 @@ Error GSIStreamBuilder::finalizeMsfLayout() {
PSH->finalizeBuckets(PSHZero);
GSH->finalizeBuckets(GSHZero);
- Expected<uint32_t> Idx = Msf.addStream(calculatePublicsHashStreamSize());
+ Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
if (!Idx)
return Idx.takeError();
- PSH->StreamIndex = *Idx;
- Idx = Msf.addStream(calculateGlobalsHashStreamSize());
+ GSH->StreamIndex = *Idx;
+ Idx = Msf.addStream(calculatePublicsHashStreamSize());
if (!Idx)
return Idx.takeError();
- GSH->StreamIndex = *Idx;
+ PSH->StreamIndex = *Idx;
uint32_t RecordBytes =
GSH->calculateRecordByteSize() + PSH->calculateRecordByteSize();
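
gsiRecordLess reproduces the reference implementation's bucket ordering: shorter names sort first, names containing non-ASCII bytes fall back to memcmp, and everything else is compared case-insensitively. A self-contained restatement usable with std::sort, assuming plain std::string inputs instead of StringRef:

    #include <cctype>
    #include <cstring>
    #include <string>

    // Sketch of the GSI hash-chain ordering: length first, memcmp for
    // non-ASCII data, otherwise a case-insensitive character compare.
    static bool recordLess(const std::string &L, const std::string &R) {
      if (L.size() != R.size())
        return L.size() < R.size();
      auto IsAscii = [](const std::string &S) {
        for (unsigned char C : S)
          if (C >= 0x80)
            return false;
        return true;
      };
      if (!IsAscii(L) || !IsAscii(R))
        return std::memcmp(L.data(), R.data(), L.size()) < 0;
      for (size_t I = 0; I != L.size(); ++I) {
        int D = std::tolower((unsigned char)L[I]) - std::tolower((unsigned char)R[I]);
        if (D != 0)
          return D < 0;
      }
      return false;
    }
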
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp
index 439217f91d04..cfabc9cd1ad8 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp
@@ -22,200 +22,7 @@
using namespace llvm;
using namespace llvm::pdb;
-HashTable::HashTable() : HashTable(8) {}
-
-HashTable::HashTable(uint32_t Capacity) { Buckets.resize(Capacity); }
-
-Error HashTable::load(BinaryStreamReader &Stream) {
- const Header *H;
- if (auto EC = Stream.readObject(H))
- return EC;
- if (H->Capacity == 0)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Invalid Hash Table Capacity");
- if (H->Size > maxLoad(H->Capacity))
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Invalid Hash Table Size");
-
- Buckets.resize(H->Capacity);
-
- if (auto EC = readSparseBitVector(Stream, Present))
- return EC;
- if (Present.count() != H->Size)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Present bit vector does not match size!");
-
- if (auto EC = readSparseBitVector(Stream, Deleted))
- return EC;
- if (Present.intersects(Deleted))
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Present bit vector interesects deleted!");
-
- for (uint32_t P : Present) {
- if (auto EC = Stream.readInteger(Buckets[P].first))
- return EC;
- if (auto EC = Stream.readInteger(Buckets[P].second))
- return EC;
- }
-
- return Error::success();
-}
-
-uint32_t HashTable::calculateSerializedLength() const {
- uint32_t Size = sizeof(Header);
-
- int NumBitsP = Present.find_last() + 1;
- int NumBitsD = Deleted.find_last() + 1;
-
- // Present bit set number of words, followed by that many actual words.
- Size += sizeof(uint32_t);
- Size += alignTo(NumBitsP, sizeof(uint32_t));
-
- // Deleted bit set number of words, followed by that many actual words.
- Size += sizeof(uint32_t);
- Size += alignTo(NumBitsD, sizeof(uint32_t));
-
- // One (Key, Value) pair for each entry Present.
- Size += 2 * sizeof(uint32_t) * size();
-
- return Size;
-}
-
-Error HashTable::commit(BinaryStreamWriter &Writer) const {
- Header H;
- H.Size = size();
- H.Capacity = capacity();
- if (auto EC = Writer.writeObject(H))
- return EC;
-
- if (auto EC = writeSparseBitVector(Writer, Present))
- return EC;
-
- if (auto EC = writeSparseBitVector(Writer, Deleted))
- return EC;
-
- for (const auto &Entry : *this) {
- if (auto EC = Writer.writeInteger(Entry.first))
- return EC;
- if (auto EC = Writer.writeInteger(Entry.second))
- return EC;
- }
- return Error::success();
-}
-
-void HashTable::clear() {
- Buckets.resize(8);
- Present.clear();
- Deleted.clear();
-}
-
-uint32_t HashTable::capacity() const { return Buckets.size(); }
-
-uint32_t HashTable::size() const { return Present.count(); }
-
-HashTableIterator HashTable::begin() const { return HashTableIterator(*this); }
-
-HashTableIterator HashTable::end() const {
- return HashTableIterator(*this, 0, true);
-}
-
-HashTableIterator HashTable::find(uint32_t K) {
- uint32_t H = K % capacity();
- uint32_t I = H;
- Optional<uint32_t> FirstUnused;
- do {
- if (isPresent(I)) {
- if (Buckets[I].first == K)
- return HashTableIterator(*this, I, false);
- } else {
- if (!FirstUnused)
- FirstUnused = I;
- // Insertion occurs via linear probing from the slot hint, and will be
- // inserted at the first empty / deleted location. Therefore, if we are
- // probing and find a location that is neither present nor deleted, then
- // nothing must have EVER been inserted at this location, and thus it is
- // not possible for a matching value to occur later.
- if (!isDeleted(I))
- break;
- }
- I = (I + 1) % capacity();
- } while (I != H);
-
- // The only way FirstUnused would not be set is if every single entry in the
- // table were Present. But this would violate the load factor constraints
- // that we impose, so it should never happen.
- assert(FirstUnused);
- return HashTableIterator(*this, *FirstUnused, true);
-}
-
-void HashTable::set(uint32_t K, uint32_t V) {
- auto Entry = find(K);
- if (Entry != end()) {
- assert(isPresent(Entry.index()));
- assert(Buckets[Entry.index()].first == K);
- // We're updating, no need to do anything special.
- Buckets[Entry.index()].second = V;
- return;
- }
-
- auto &B = Buckets[Entry.index()];
- assert(!isPresent(Entry.index()));
- assert(Entry.isEnd());
- B.first = K;
- B.second = V;
- Present.set(Entry.index());
- Deleted.reset(Entry.index());
-
- grow();
-
- assert(find(K) != end());
-}
-
-void HashTable::remove(uint32_t K) {
- auto Iter = find(K);
- // It wasn't here to begin with, just exit.
- if (Iter == end())
- return;
-
- assert(Present.test(Iter.index()));
- assert(!Deleted.test(Iter.index()));
- Deleted.set(Iter.index());
- Present.reset(Iter.index());
-}
-
-uint32_t HashTable::get(uint32_t K) {
- auto I = find(K);
- assert(I != end());
- return (*I).second;
-}
-
-uint32_t HashTable::maxLoad(uint32_t capacity) { return capacity * 2 / 3 + 1; }
-
-void HashTable::grow() {
- uint32_t S = size();
- if (S < maxLoad(capacity()))
- return;
- assert(capacity() != UINT32_MAX && "Can't grow Hash table!");
-
- uint32_t NewCapacity =
- (capacity() <= INT32_MAX) ? capacity() * 2 : UINT32_MAX;
-
- // Growing requires rebuilding the table and re-hashing every item. Make a
- // copy with a larger capacity, insert everything into the copy, then swap
- // it in.
- HashTable NewMap(NewCapacity);
- for (auto I : Present) {
- NewMap.set(Buckets[I].first, Buckets[I].second);
- }
-
- Buckets.swap(NewMap.Buckets);
- std::swap(Present, NewMap.Present);
- std::swap(Deleted, NewMap.Deleted);
- assert(capacity() == NewCapacity);
- assert(size() == S);
-}
-
-Error HashTable::readSparseBitVector(BinaryStreamReader &Stream,
+Error llvm::pdb::readSparseBitVector(BinaryStreamReader &Stream,
SparseBitVector<> &V) {
uint32_t NumWords;
if (auto EC = Stream.readInteger(NumWords))
@@ -237,18 +44,20 @@ Error HashTable::readSparseBitVector(BinaryStreamReader &Stream,
return Error::success();
}
-Error HashTable::writeSparseBitVector(BinaryStreamWriter &Writer,
+Error llvm::pdb::writeSparseBitVector(BinaryStreamWriter &Writer,
SparseBitVector<> &Vec) {
+ constexpr int BitsPerWord = 8 * sizeof(uint32_t);
+
int ReqBits = Vec.find_last() + 1;
- uint32_t NumWords = alignTo(ReqBits, sizeof(uint32_t)) / sizeof(uint32_t);
- if (auto EC = Writer.writeInteger(NumWords))
+ uint32_t ReqWords = alignTo(ReqBits, BitsPerWord) / BitsPerWord;
+ if (auto EC = Writer.writeInteger(ReqWords))
return joinErrors(
std::move(EC),
make_error<RawError>(raw_error_code::corrupt_file,
"Could not write linear map number of words"));
uint32_t Idx = 0;
- for (uint32_t I = 0; I != NumWords; ++I) {
+ for (uint32_t I = 0; I != ReqWords; ++I) {
uint32_t Word = 0;
for (uint32_t WordIdx = 0; WordIdx < 32; ++WordIdx, ++Idx) {
if (Vec.test(Idx))
@@ -261,48 +70,3 @@ Error HashTable::writeSparseBitVector(BinaryStreamWriter &Writer,
}
return Error::success();
}
-
-HashTableIterator::HashTableIterator(const HashTable &Map, uint32_t Index,
- bool IsEnd)
- : Map(&Map), Index(Index), IsEnd(IsEnd) {}
-
-HashTableIterator::HashTableIterator(const HashTable &Map) : Map(&Map) {
- int I = Map.Present.find_first();
- if (I == -1) {
- Index = 0;
- IsEnd = true;
- } else {
- Index = static_cast<uint32_t>(I);
- IsEnd = false;
- }
-}
-
-HashTableIterator &HashTableIterator::operator=(const HashTableIterator &R) {
- Map = R.Map;
- return *this;
-}
-
-bool HashTableIterator::operator==(const HashTableIterator &R) const {
- if (IsEnd && R.IsEnd)
- return true;
- if (IsEnd != R.IsEnd)
- return false;
-
- return (Map == R.Map) && (Index == R.Index);
-}
-
-const std::pair<uint32_t, uint32_t> &HashTableIterator::operator*() const {
- assert(Map->Present.test(Index));
- return Map->Buckets[Index];
-}
-
-HashTableIterator &HashTableIterator::operator++() {
- while (Index < Map->Buckets.size()) {
- ++Index;
- if (Map->Present.test(Index))
- return *this;
- }
-
- IsEnd = true;
- return *this;
-}
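
Only the sparse bit-vector helpers survive in this file, now as free functions in llvm::pdb; the hash table itself presumably moves to a header as a template, which is what the trait-based NamedStreamMap changes later in this diff suggest. The serialized bit-vector format is a 32-bit word count followed by that many words of bits. A sketch of producing those words from a list of set bit indices:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch of the sparse bit-vector payload: one bit per index, packed into
    // 32-bit words. On disk this is preceded by the word count.
    static std::vector<uint32_t> packBits(const std::vector<uint32_t> &SetBits) {
      uint32_t MaxBit = 0;
      for (uint32_t B : SetBits)
        MaxBit = std::max(MaxBit, B + 1);
      std::vector<uint32_t> Words((MaxBit + 31) / 32, 0);
      for (uint32_t B : SetBits)
        Words[B / 32] |= uint32_t(1) << (B % 32);
      return Words;
    }
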
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp
index 17c9392a9dd5..973a520ffca9 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp
@@ -20,20 +20,19 @@ using namespace llvm::codeview;
using namespace llvm::msf;
using namespace llvm::pdb;
-InfoStream::InfoStream(std::unique_ptr<MappedBlockStream> Stream)
- : Stream(std::move(Stream)) {}
+InfoStream::InfoStream(std::unique_ptr<BinaryStream> Stream)
+ : Stream(std::move(Stream)), Header(nullptr) {}
Error InfoStream::reload() {
BinaryStreamReader Reader(*Stream);
- const InfoStreamHeader *H;
- if (auto EC = Reader.readObject(H))
+ if (auto EC = Reader.readObject(Header))
return joinErrors(
std::move(EC),
make_error<RawError>(raw_error_code::corrupt_file,
"PDB Stream does not contain a header."));
- switch (H->Version) {
+ switch (Header->Version) {
case PdbImplVC70:
case PdbImplVC80:
case PdbImplVC110:
@@ -44,11 +43,6 @@ Error InfoStream::reload() {
"Unsupported PDB stream version.");
}
- Version = H->Version;
- Signature = H->Signature;
- Age = H->Age;
- Guid = H->Guid;
-
uint32_t Offset = Reader.getOffset();
if (auto EC = NamedStreams.load(Reader))
return EC;
@@ -92,15 +86,14 @@ Error InfoStream::reload() {
uint32_t InfoStream::getStreamSize() const { return Stream->getLength(); }
-uint32_t InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
+Expected<uint32_t> InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
uint32_t Result;
if (!NamedStreams.get(Name, Result))
- return 0;
+ return make_error<RawError>(raw_error_code::no_stream);
return Result;
}
-iterator_range<StringMapConstIterator<uint32_t>>
-InfoStream::named_streams() const {
+StringMap<uint32_t> InfoStream::named_streams() const {
return NamedStreams.entries();
}
@@ -109,14 +102,16 @@ bool InfoStream::containsIdStream() const {
}
PdbRaw_ImplVer InfoStream::getVersion() const {
- return static_cast<PdbRaw_ImplVer>(Version);
+ return static_cast<PdbRaw_ImplVer>(uint32_t(Header->Version));
}
-uint32_t InfoStream::getSignature() const { return Signature; }
+uint32_t InfoStream::getSignature() const {
+ return uint32_t(Header->Signature);
+}
-uint32_t InfoStream::getAge() const { return Age; }
+uint32_t InfoStream::getAge() const { return uint32_t(Header->Age); }
-GUID InfoStream::getGuid() const { return Guid; }
+GUID InfoStream::getGuid() const { return Header->Guid; }
uint32_t InfoStream::getNamedStreamMapByteSize() const {
return NamedStreamMapByteSize;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
index 6450ae752f96..54d6835f1121 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
@@ -25,15 +25,17 @@ using namespace llvm::pdb;
InfoStreamBuilder::InfoStreamBuilder(msf::MSFBuilder &Msf,
NamedStreamMap &NamedStreams)
- : Msf(Msf), Ver(PdbRaw_ImplVer::PdbImplVC70), Sig(-1), Age(0),
- NamedStreams(NamedStreams) {}
+ : Msf(Msf), Ver(PdbRaw_ImplVer::PdbImplVC70), Age(0),
+ NamedStreams(NamedStreams) {
+ ::memset(&Guid, 0, sizeof(Guid));
+}
void InfoStreamBuilder::setVersion(PdbRaw_ImplVer V) { Ver = V; }
-void InfoStreamBuilder::setSignature(uint32_t S) { Sig = S; }
-
void InfoStreamBuilder::setAge(uint32_t A) { Age = A; }
+void InfoStreamBuilder::setSignature(uint32_t S) { Signature = S; }
+
void InfoStreamBuilder::setGuid(GUID G) { Guid = G; }
void InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) {
@@ -41,7 +43,8 @@ void InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) {
}
Error InfoStreamBuilder::finalizeMsfLayout() {
- uint32_t Length = sizeof(InfoStreamHeader) + NamedStreams.finalize() +
+ uint32_t Length = sizeof(InfoStreamHeader) +
+ NamedStreams.calculateSerializedLength() +
(Features.size() + 1) * sizeof(uint32_t);
if (auto EC = Msf.setStreamSize(StreamPDB, Length))
return EC;
@@ -55,10 +58,10 @@ Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
BinaryStreamWriter Writer(*InfoS);
InfoStreamHeader H;
- H.Age = Age;
- H.Signature = Sig;
+ // Leave the build id fields 0 so they can be set as the last step before
+ // committing the file to disk.
+ ::memset(&H, 0, sizeof(H));
H.Version = Ver;
- H.Guid = Guid;
if (auto EC = Writer.writeObject(H))
return EC;
@@ -70,5 +73,6 @@ Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
if (auto EC = Writer.writeEnum(E))
return EC;
}
+ assert(Writer.bytesRemaining() == 0);
return Error::success();
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
index 6cdf6dde04d9..a4eaed90837d 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
#include "llvm/DebugInfo/PDB/Native/HashTable.h"
#include "llvm/DebugInfo/PDB/Native/RawError.h"
#include "llvm/Support/BinaryStreamReader.h"
@@ -26,127 +27,101 @@
using namespace llvm;
using namespace llvm::pdb;
-// FIXME: This shouldn't be necessary, but if we insert the strings in any
-// other order, cvdump cannot read the generated name map. This suggests that
-// we may be using the wrong hash function. A closer inspection of the cvdump
-// source code may reveal something, but for now this at least makes us work,
-// even if only by accident.
-static constexpr const char *OrderedStreamNames[] = {"/LinkInfo", "/names",
- "/src/headerblock"};
+NamedStreamMapTraits::NamedStreamMapTraits(NamedStreamMap &NS) : NS(&NS) {}
-NamedStreamMap::NamedStreamMap() = default;
+uint16_t NamedStreamMapTraits::hashLookupKey(StringRef S) const {
+ // In the reference implementation, this uses
+ // HASH Hasher<ULONG*, USHORT*>::hashPbCb(PB pb, size_t cb, ULONG ulMod).
+ // Here, the type HASH is a typedef of unsigned short.
+ // ** It is not a bug that we truncate the result of hashStringV1, in fact
+ // it is a bug if we do not! **
+ return static_cast<uint16_t>(hashStringV1(S));
+}
-Error NamedStreamMap::load(BinaryStreamReader &Stream) {
- Mapping.clear();
- FinalizedHashTable.clear();
- FinalizedInfo.reset();
+StringRef NamedStreamMapTraits::storageKeyToLookupKey(uint32_t Offset) const {
+ return NS->getString(Offset);
+}
+uint32_t NamedStreamMapTraits::lookupKeyToStorageKey(StringRef S) {
+ return NS->appendStringData(S);
+}
+
+NamedStreamMap::NamedStreamMap()
+ : HashTraits(*this), OffsetIndexMap(1, HashTraits) {}
+
+Error NamedStreamMap::load(BinaryStreamReader &Stream) {
uint32_t StringBufferSize;
if (auto EC = Stream.readInteger(StringBufferSize))
return joinErrors(std::move(EC),
make_error<RawError>(raw_error_code::corrupt_file,
"Expected string buffer size"));
- BinaryStreamRef StringsBuffer;
- if (auto EC = Stream.readStreamRef(StringsBuffer, StringBufferSize))
+ StringRef Buffer;
+ if (auto EC = Stream.readFixedString(Buffer, StringBufferSize))
return EC;
+ NamesBuffer.assign(Buffer.begin(), Buffer.end());
- HashTable OffsetIndexMap;
- if (auto EC = OffsetIndexMap.load(Stream))
- return EC;
-
- uint32_t NameOffset;
- uint32_t NameIndex;
- for (const auto &Entry : OffsetIndexMap) {
- std::tie(NameOffset, NameIndex) = Entry;
-
- // Compute the offset of the start of the string relative to the stream.
- BinaryStreamReader NameReader(StringsBuffer);
- NameReader.setOffset(NameOffset);
- // Pump out our c-string from the stream.
- StringRef Str;
- if (auto EC = NameReader.readCString(Str))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map name"));
-
- // Add this to a string-map from name to stream number.
- Mapping.insert({Str, NameIndex});
- }
-
- return Error::success();
+ return OffsetIndexMap.load(Stream);
}
Error NamedStreamMap::commit(BinaryStreamWriter &Writer) const {
- assert(FinalizedInfo.hasValue());
-
// The first field is the number of bytes of string data.
- if (auto EC = Writer.writeInteger(FinalizedInfo->StringDataBytes))
+ if (auto EC = Writer.writeInteger<uint32_t>(NamesBuffer.size()))
return EC;
- for (const auto &Name : OrderedStreamNames) {
- auto Item = Mapping.find(Name);
- if (Item == Mapping.end())
- continue;
- if (auto EC = Writer.writeCString(Item->getKey()))
- return EC;
- }
+ // Then the actual string data.
+ StringRef Data(NamesBuffer.data(), NamesBuffer.size());
+ if (auto EC = Writer.writeFixedString(Data))
+ return EC;
// And finally the Offset Index map.
- if (auto EC = FinalizedHashTable.commit(Writer))
+ if (auto EC = OffsetIndexMap.commit(Writer))
return EC;
return Error::success();
}
-uint32_t NamedStreamMap::finalize() {
- if (FinalizedInfo.hasValue())
- return FinalizedInfo->SerializedLength;
-
- // Build the finalized hash table.
- FinalizedHashTable.clear();
- FinalizedInfo.emplace();
+uint32_t NamedStreamMap::calculateSerializedLength() const {
+ return sizeof(uint32_t) // String data size
+ + NamesBuffer.size() // String data
+ + OffsetIndexMap.calculateSerializedLength(); // Offset Index Map
+}
- for (const auto &Name : OrderedStreamNames) {
- auto Item = Mapping.find(Name);
- if (Item == Mapping.end())
- continue;
- FinalizedHashTable.set(FinalizedInfo->StringDataBytes, Item->getValue());
- FinalizedInfo->StringDataBytes += Item->getKeyLength() + 1;
- }
+uint32_t NamedStreamMap::size() const { return OffsetIndexMap.size(); }
- // Number of bytes of string data.
- FinalizedInfo->SerializedLength += sizeof(support::ulittle32_t);
- // Followed by that many actual bytes of string data.
- FinalizedInfo->SerializedLength += FinalizedInfo->StringDataBytes;
- // Followed by the mapping from Offset to Index.
- FinalizedInfo->SerializedLength +=
- FinalizedHashTable.calculateSerializedLength();
- return FinalizedInfo->SerializedLength;
+StringRef NamedStreamMap::getString(uint32_t Offset) const {
+ assert(NamesBuffer.size() > Offset);
+ return StringRef(NamesBuffer.data() + Offset);
}
-iterator_range<StringMapConstIterator<uint32_t>>
-NamedStreamMap::entries() const {
- return make_range<StringMapConstIterator<uint32_t>>(Mapping.begin(),
- Mapping.end());
+uint32_t NamedStreamMap::hashString(uint32_t Offset) const {
+ return hashStringV1(getString(Offset));
}
-uint32_t NamedStreamMap::size() const { return Mapping.size(); }
-
bool NamedStreamMap::get(StringRef Stream, uint32_t &StreamNo) const {
- auto Iter = Mapping.find(Stream);
- if (Iter == Mapping.end())
+ auto Iter = OffsetIndexMap.find_as(Stream);
+ if (Iter == OffsetIndexMap.end())
return false;
- StreamNo = Iter->second;
+ StreamNo = (*Iter).second;
return true;
}
-void NamedStreamMap::set(StringRef Stream, uint32_t StreamNo) {
- FinalizedInfo.reset();
- Mapping[Stream] = StreamNo;
+StringMap<uint32_t> NamedStreamMap::entries() const {
+ StringMap<uint32_t> Result;
+ for (const auto &Entry : OffsetIndexMap) {
+ StringRef Stream(NamesBuffer.data() + Entry.first);
+ Result.try_emplace(Stream, Entry.second);
+ }
+ return Result;
}
-void NamedStreamMap::remove(StringRef Stream) {
- FinalizedInfo.reset();
- Mapping.erase(Stream);
+uint32_t NamedStreamMap::appendStringData(StringRef S) {
+ uint32_t Offset = NamesBuffer.size();
+ NamesBuffer.insert(NamesBuffer.end(), S.begin(), S.end());
+ NamesBuffer.push_back('\0');
+ return Offset;
+}
+
+void NamedStreamMap::set(StringRef Stream, uint32_t StreamNo) {
+ OffsetIndexMap.set_as(Stream, support::ulittle32_t(StreamNo));
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
index d23ee0a09196..a4b029596314 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
@@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
@@ -30,16 +31,60 @@ NativeRawSymbol::findChildren(PDB_SymType Type, StringRef Name,
}
std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildrenByAddr(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags, uint32_t Section, uint32_t Offset) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildrenByVA(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags, uint64_t VA) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
NativeRawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
PDB_NameSearchFlags Flags, uint32_t RVA) const {
return nullptr;
}
std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findInlineFramesByAddr(uint32_t Section,
+ uint32_t Offset) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
NativeRawSymbol::findInlineFramesByRVA(uint32_t RVA) const {
return nullptr;
}
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findInlineFramesByVA(uint64_t VA) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeRawSymbol::findInlineeLines() const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeRawSymbol::findInlineeLinesByAddr(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeRawSymbol::findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeRawSymbol::findInlineeLinesByVA(uint64_t VA, uint32_t Length) const {
+ return nullptr;
+}
+
void NativeRawSymbol::getDataBytes(SmallVector<uint8_t, 32> &bytes) const {
bytes.clear();
}
@@ -143,7 +188,7 @@ uint32_t NativeRawSymbol::getLiveRangeStartRelativeVirtualAddress() const {
}
codeview::RegisterId NativeRawSymbol::getLocalBasePointerRegisterId() const {
- return codeview::RegisterId::EAX;
+ return codeview::RegisterId::CVRegEAX;
}
uint32_t NativeRawSymbol::getLowerBoundId() const {
@@ -203,7 +248,7 @@ uint32_t NativeRawSymbol::getRank() const {
}
codeview::RegisterId NativeRawSymbol::getRegisterId() const {
- return codeview::RegisterId::EAX;
+ return codeview::RegisterId::CVRegEAX;
}
uint32_t NativeRawSymbol::getRegisterType() const {
@@ -234,6 +279,11 @@ std::string NativeRawSymbol::getSourceFileName() const {
return {};
}
+std::unique_ptr<IPDBLineNumber>
+NativeRawSymbol::getSrcLineOnTypeDefn() const {
+ return nullptr;
+}
+
uint32_t NativeRawSymbol::getStride() const {
return 0;
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
index b01c2b54796c..086da13135c5 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -165,7 +165,7 @@ SymIndexId NativeSession::findSymbolByTypeIndex(codeview::TypeIndex Index) {
uint64_t NativeSession::getLoadAddress() const { return 0; }
-void NativeSession::setLoadAddress(uint64_t Address) {}
+bool NativeSession::setLoadAddress(uint64_t Address) { return false; }
std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() {
const auto Id = static_cast<SymIndexId>(SymbolCache.size());
@@ -185,11 +185,32 @@ NativeSession::getSymbolById(uint32_t SymbolId) const {
: nullptr;
}
+bool NativeSession::addressForVA(uint64_t VA, uint32_t &Section,
+ uint32_t &Offset) const {
+ return false;
+}
+
+bool NativeSession::addressForRVA(uint32_t VA, uint32_t &Section,
+ uint32_t &Offset) const {
+ return false;
+}
+
std::unique_ptr<PDBSymbol>
NativeSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
return nullptr;
}
+std::unique_ptr<PDBSymbol>
+NativeSession::findSymbolByRVA(uint32_t RVA, PDB_SymType Type) const {
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbol>
+NativeSession::findSymbolBySectOffset(uint32_t Sect, uint32_t Offset,
+ PDB_SymType Type) const {
+ return nullptr;
+}
+
std::unique_ptr<IPDBEnumLineNumbers>
NativeSession::findLineNumbers(const PDBSymbolCompiland &Compiland,
const IPDBSourceFile &File) const {
@@ -202,6 +223,17 @@ NativeSession::findLineNumbersByAddress(uint64_t Address,
return nullptr;
}
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeSession::findLineNumbersByRVA(uint32_t RVA, uint32_t Length) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeSession::findLineNumbersBySectOffset(uint32_t Section, uint32_t Offset,
+ uint32_t Length) const {
+ return nullptr;
+}
+
std::unique_ptr<IPDBEnumSourceFiles>
NativeSession::findSourceFiles(const PDBSymbolCompiland *Compiland,
StringRef Pattern,
@@ -249,3 +281,13 @@ std::unique_ptr<IPDBEnumDataStreams> NativeSession::getDebugStreams() const {
std::unique_ptr<IPDBEnumTables> NativeSession::getEnumTables() const {
return nullptr;
}
+
+std::unique_ptr<IPDBEnumInjectedSources>
+NativeSession::getInjectedSources() const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSectionContribs>
+NativeSession::getSectionContribs() const {
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
index 15b31d821b1c..78b11937f051 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
@@ -289,8 +289,8 @@ Expected<DbiStream &> PDBFile::getPDBDbiStream() {
auto DbiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamDBI);
if (!DbiS)
return DbiS.takeError();
- auto TempDbi = llvm::make_unique<DbiStream>(*this, std::move(*DbiS));
- if (auto EC = TempDbi->reload())
+ auto TempDbi = llvm::make_unique<DbiStream>(std::move(*DbiS));
+ if (auto EC = TempDbi->reload(this))
return std::move(EC);
Dbi = std::move(TempDbi);
}
@@ -370,7 +370,10 @@ Expected<PDBStringTable &> PDBFile::getStringTable() {
if (!IS)
return IS.takeError();
- uint32_t NameStreamIndex = IS->getNamedStreamIndex("/names");
+ Expected<uint32_t> ExpectedNSI = IS->getNamedStreamIndex("/names");
+ if (!ExpectedNSI)
+ return ExpectedNSI.takeError();
+ uint32_t NameStreamIndex = *ExpectedNSI;
auto NS =
safelyCreateIndexedStream(ContainerLayout, *Buffer, NameStreamIndex);
@@ -445,7 +448,13 @@ bool PDBFile::hasPDBStringTable() {
auto IS = getPDBInfoStream();
if (!IS)
return false;
- return IS->getNamedStreamIndex("/names") < getNumStreams();
+ Expected<uint32_t> ExpectedNSI = IS->getNamedStreamIndex("/names");
+ if (!ExpectedNSI) {
+ consumeError(ExpectedNSI.takeError());
+ return false;
+ }
+ assert(*ExpectedNSI < getNumStreams());
+ return true;
}
/// Wrapper around MappedBlockStream::createIndexedStream() that checks if a
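
With this change getNamedStreamIndex() returns Expected<uint32_t> instead of a bare index, so every caller must handle the failure case. A standalone sketch of that pattern under the same llvm/Support/Error.h API (the lookupIndex/useIndex/hasIndex helpers are illustrative, not part of the sources):

#include "llvm/Support/Error.h"

using namespace llvm;

// Hypothetical lookup that can fail, standing in for getNamedStreamIndex().
Expected<uint32_t> lookupIndex(bool Present) {
  if (!Present)
    return make_error<StringError>("no such stream", inconvertibleErrorCode());
  return 42;
}

// Propagate the failure to the caller, as getStringTable() now does.
Error useIndex() {
  Expected<uint32_t> Idx = lookupIndex(true);
  if (!Idx)
    return Idx.takeError();
  // ... use *Idx ...
  return Error::success();
}

// Swallow the failure deliberately, as hasPDBStringTable() now does; an
// Expected carrying an error must not be destroyed without being checked.
bool hasIndex() {
  Expected<uint32_t> Idx = lookupIndex(false);
  if (!Idx) {
    consumeError(Idx.takeError());
    return false;
  }
  return true;
}
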
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index dee27c621fac..e164e7cf1c52 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -24,6 +24,8 @@
#include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
#include "llvm/Support/BinaryStream.h"
#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/JamCRC.h"
+#include "llvm/Support/Path.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -32,7 +34,8 @@ using namespace llvm::pdb;
using namespace llvm::support;
PDBFileBuilder::PDBFileBuilder(BumpPtrAllocator &Allocator)
- : Allocator(Allocator) {}
+ : Allocator(Allocator), InjectedSourceHashTraits(Strings),
+ InjectedSourceTable(2, InjectedSourceHashTraits) {}
PDBFileBuilder::~PDBFileBuilder() {}
@@ -80,15 +83,46 @@ GSIStreamBuilder &PDBFileBuilder::getGsiBuilder() {
return *Gsi;
}
-Error PDBFileBuilder::addNamedStream(StringRef Name, uint32_t Size) {
+Expected<uint32_t> PDBFileBuilder::allocateNamedStream(StringRef Name,
+ uint32_t Size) {
auto ExpectedStream = Msf->addStream(Size);
- if (!ExpectedStream)
- return ExpectedStream.takeError();
- NamedStreams.set(Name, *ExpectedStream);
+ if (ExpectedStream)
+ NamedStreams.set(Name, *ExpectedStream);
+ return ExpectedStream;
+}
+
+Error PDBFileBuilder::addNamedStream(StringRef Name, StringRef Data) {
+ Expected<uint32_t> ExpectedIndex = allocateNamedStream(Name, Data.size());
+ if (!ExpectedIndex)
+ return ExpectedIndex.takeError();
+ assert(NamedStreamData.count(*ExpectedIndex) == 0);
+ NamedStreamData[*ExpectedIndex] = Data;
return Error::success();
}
-Expected<msf::MSFLayout> PDBFileBuilder::finalizeMsfLayout() {
+void PDBFileBuilder::addInjectedSource(StringRef Name,
+ std::unique_ptr<MemoryBuffer> Buffer) {
+ // Stream names must be exact matches, since they get looked up in a hash
+ // table and the hash value is dependent on the exact contents of the string.
+ // link.exe lowercases a path and converts / to \, so we must do the same.
+ SmallString<64> VName;
+ sys::path::native(Name.lower(), VName);
+
+ uint32_t NI = getStringTableBuilder().insert(Name);
+ uint32_t VNI = getStringTableBuilder().insert(VName);
+
+ InjectedSourceDescriptor Desc;
+ Desc.Content = std::move(Buffer);
+ Desc.NameIndex = NI;
+ Desc.VNameIndex = VNI;
+ Desc.StreamName = "/src/files/";
+
+ Desc.StreamName += VName;
+
+ InjectedSources.push_back(std::move(Desc));
+}
+
+Error PDBFileBuilder::finalizeMsfLayout() {
if (Ipi && Ipi->getRecordCount() > 0) {
// In theory newer PDBs always have an ID stream, but by saying that we're
@@ -101,38 +135,85 @@ Expected<msf::MSFLayout> PDBFileBuilder::finalizeMsfLayout() {
uint32_t StringsLen = Strings.calculateSerializedSize();
- if (auto EC = addNamedStream("/names", StringsLen))
- return std::move(EC);
- if (auto EC = addNamedStream("/LinkInfo", 0))
- return std::move(EC);
+ Expected<uint32_t> SN = allocateNamedStream("/LinkInfo", 0);
+ if (!SN)
+ return SN.takeError();
- if (Info) {
- if (auto EC = Info->finalizeMsfLayout())
- return std::move(EC);
- }
- if (Dbi) {
- if (auto EC = Dbi->finalizeMsfLayout())
- return std::move(EC);
+ if (Gsi) {
+ if (auto EC = Gsi->finalizeMsfLayout())
+ return EC;
+ if (Dbi) {
+ Dbi->setPublicsStreamIndex(Gsi->getPublicsStreamIndex());
+ Dbi->setGlobalsStreamIndex(Gsi->getGlobalsStreamIndex());
+ Dbi->setSymbolRecordStreamIndex(Gsi->getRecordStreamIdx());
+ }
}
if (Tpi) {
if (auto EC = Tpi->finalizeMsfLayout())
- return std::move(EC);
+ return EC;
}
+ if (Dbi) {
+ if (auto EC = Dbi->finalizeMsfLayout())
+ return EC;
+ }
+ SN = allocateNamedStream("/names", StringsLen);
+ if (!SN)
+ return SN.takeError();
+
if (Ipi) {
if (auto EC = Ipi->finalizeMsfLayout())
- return std::move(EC);
+ return EC;
}
- if (Gsi) {
- if (auto EC = Gsi->finalizeMsfLayout())
- return std::move(EC);
- if (Dbi) {
- Dbi->setPublicsStreamIndex(Gsi->getPublicsStreamIndex());
- Dbi->setGlobalsStreamIndex(Gsi->getGlobalsStreamIndex());
- Dbi->setSymbolRecordStreamIndex(Gsi->getRecordStreamIdx());
+
+ // Do this last, since it relies on the named stream map being complete, and
+ // that can be updated by previous steps in the finalization.
+ if (Info) {
+ if (auto EC = Info->finalizeMsfLayout())
+ return EC;
+ }
+
+ if (!InjectedSources.empty()) {
+ for (const auto &IS : InjectedSources) {
+ JamCRC CRC(0);
+ CRC.update(makeArrayRef(IS.Content->getBufferStart(),
+ IS.Content->getBufferSize()));
+
+ SrcHeaderBlockEntry Entry;
+ ::memset(&Entry, 0, sizeof(SrcHeaderBlockEntry));
+ Entry.Size = sizeof(SrcHeaderBlockEntry);
+ Entry.FileSize = IS.Content->getBufferSize();
+ Entry.FileNI = IS.NameIndex;
+ Entry.VFileNI = IS.VNameIndex;
+ Entry.ObjNI = 1;
+ Entry.IsVirtual = 0;
+ Entry.Version =
+ static_cast<uint32_t>(PdbRaw_SrcHeaderBlockVer::SrcVerOne);
+ Entry.CRC = CRC.getCRC();
+ StringRef VName = getStringTableBuilder().getStringForId(IS.VNameIndex);
+ InjectedSourceTable.set_as(VName, std::move(Entry));
}
+
+ uint32_t SrcHeaderBlockSize =
+ sizeof(SrcHeaderBlockHeader) +
+ InjectedSourceTable.calculateSerializedLength();
+ SN = allocateNamedStream("/src/headerblock", SrcHeaderBlockSize);
+ if (!SN)
+ return SN.takeError();
+ for (const auto &IS : InjectedSources) {
+ SN = allocateNamedStream(IS.StreamName, IS.Content->getBufferSize());
+ if (!SN)
+ return SN.takeError();
+ }
+ }
+
+ // Do this last, since it relies on the named stream map being complete, and
+ // that can be updated by previous steps in the finalization.
+ if (Info) {
+ if (auto EC = Info->finalizeMsfLayout())
+ return EC;
}
- return Msf->build();
+ return Error::success();
}
Expected<uint32_t> PDBFileBuilder::getNamedStreamIndex(StringRef Name) const {
@@ -142,70 +223,55 @@ Expected<uint32_t> PDBFileBuilder::getNamedStreamIndex(StringRef Name) const {
return SN;
}
-void PDBFileBuilder::commitFpm(WritableBinaryStream &MsfBuffer,
- const MSFLayout &Layout) {
- auto FpmStream =
- WritableMappedBlockStream::createFpmStream(Layout, MsfBuffer, Allocator);
-
- // We only need to create the alt fpm stream so that it gets initialized.
- WritableMappedBlockStream::createFpmStream(Layout, MsfBuffer, Allocator,
- true);
-
- uint32_t BI = 0;
- BinaryStreamWriter FpmWriter(*FpmStream);
- while (BI < Layout.SB->NumBlocks) {
- uint8_t ThisByte = 0;
- for (uint32_t I = 0; I < 8; ++I) {
- bool IsFree =
- (BI < Layout.SB->NumBlocks) ? Layout.FreePageMap.test(BI) : true;
- uint8_t Mask = uint8_t(IsFree) << I;
- ThisByte |= Mask;
- ++BI;
- }
- cantFail(FpmWriter.writeObject(ThisByte));
- }
- assert(FpmWriter.bytesRemaining() == 0);
+void PDBFileBuilder::commitSrcHeaderBlock(WritableBinaryStream &MsfBuffer,
+ const msf::MSFLayout &Layout) {
+ assert(!InjectedSourceTable.empty());
+
+ uint32_t SN = cantFail(getNamedStreamIndex("/src/headerblock"));
+ auto Stream = WritableMappedBlockStream::createIndexedStream(
+ Layout, MsfBuffer, SN, Allocator);
+ BinaryStreamWriter Writer(*Stream);
+
+ SrcHeaderBlockHeader Header;
+ ::memset(&Header, 0, sizeof(Header));
+ Header.Version = static_cast<uint32_t>(PdbRaw_SrcHeaderBlockVer::SrcVerOne);
+ Header.Size = Writer.bytesRemaining();
+
+ cantFail(Writer.writeObject(Header));
+ cantFail(InjectedSourceTable.commit(Writer));
+
+ assert(Writer.bytesRemaining() == 0);
}
-Error PDBFileBuilder::commit(StringRef Filename) {
- assert(!Filename.empty());
- auto ExpectedLayout = finalizeMsfLayout();
- if (!ExpectedLayout)
- return ExpectedLayout.takeError();
- auto &Layout = *ExpectedLayout;
-
- uint64_t Filesize = Layout.SB->BlockSize * Layout.SB->NumBlocks;
- auto OutFileOrError = FileOutputBuffer::create(Filename, Filesize);
- if (auto E = OutFileOrError.takeError())
- return E;
- FileBufferByteStream Buffer(std::move(*OutFileOrError),
- llvm::support::little);
- BinaryStreamWriter Writer(Buffer);
-
- if (auto EC = Writer.writeObject(*Layout.SB))
- return EC;
+void PDBFileBuilder::commitInjectedSources(WritableBinaryStream &MsfBuffer,
+ const msf::MSFLayout &Layout) {
+ if (InjectedSourceTable.empty())
+ return;
- commitFpm(Buffer, Layout);
+ commitSrcHeaderBlock(MsfBuffer, Layout);
- uint32_t BlockMapOffset =
- msf::blockToOffset(Layout.SB->BlockMapAddr, Layout.SB->BlockSize);
- Writer.setOffset(BlockMapOffset);
- if (auto EC = Writer.writeArray(Layout.DirectoryBlocks))
- return EC;
+ for (const auto &IS : InjectedSources) {
+ uint32_t SN = cantFail(getNamedStreamIndex(IS.StreamName));
- auto DirStream = WritableMappedBlockStream::createDirectoryStream(
- Layout, Buffer, Allocator);
- BinaryStreamWriter DW(*DirStream);
- if (auto EC = DW.writeInteger<uint32_t>(Layout.StreamSizes.size()))
- return EC;
+ auto SourceStream = WritableMappedBlockStream::createIndexedStream(
+ Layout, MsfBuffer, SN, Allocator);
+ BinaryStreamWriter SourceWriter(*SourceStream);
+ assert(SourceWriter.bytesRemaining() == IS.Content->getBufferSize());
+ cantFail(SourceWriter.writeBytes(
+ arrayRefFromStringRef(IS.Content->getBuffer())));
+ }
+}
- if (auto EC = DW.writeArray(Layout.StreamSizes))
+Error PDBFileBuilder::commit(StringRef Filename) {
+ assert(!Filename.empty());
+ if (auto EC = finalizeMsfLayout())
return EC;
- for (const auto &Blocks : Layout.StreamMap) {
- if (auto EC = DW.writeArray(Blocks))
- return EC;
- }
+ MSFLayout Layout;
+ auto ExpectedMsfBuffer = Msf->commit(Filename, Layout);
+ if (!ExpectedMsfBuffer)
+ return ExpectedMsfBuffer.takeError();
+ FileBufferByteStream Buffer = std::move(*ExpectedMsfBuffer);
auto ExpectedSN = getNamedStreamIndex("/names");
if (!ExpectedSN)
@@ -217,6 +283,17 @@ Error PDBFileBuilder::commit(StringRef Filename) {
if (auto EC = Strings.commit(NSWriter))
return EC;
+ for (const auto &NSE : NamedStreamData) {
+ if (NSE.second.empty())
+ continue;
+
+ auto NS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, NSE.first, Allocator);
+ BinaryStreamWriter NSW(*NS);
+ if (auto EC = NSW.writeBytes(arrayRefFromStringRef(NSE.second)))
+ return EC;
+ }
+
if (Info) {
if (auto EC = Info->commit(Layout, Buffer))
return EC;
@@ -242,5 +319,22 @@ Error PDBFileBuilder::commit(StringRef Filename) {
return EC;
}
+ auto InfoStreamBlocks = Layout.StreamMap[StreamPDB];
+ assert(!InfoStreamBlocks.empty());
+ uint64_t InfoStreamFileOffset =
+ blockToOffset(InfoStreamBlocks.front(), Layout.SB->BlockSize);
+ InfoStreamHeader *H = reinterpret_cast<InfoStreamHeader *>(
+ Buffer.getBufferStart() + InfoStreamFileOffset);
+
+ commitInjectedSources(Buffer, Layout);
+
+ // Set the build id at the very end, after every other byte of the PDB
+ // has been written.
+ // FIXME: Use a hash of the PDB rather than time(nullptr) for the signature.
+ H->Age = Info->getAge();
+ H->Guid = Info->getGuid();
+ Optional<uint32_t> Sig = Info->getSignature();
+ H->Signature = Sig.hasValue() ? *Sig : time(nullptr);
+
return Buffer.commit();
}
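
Each injected-source entry above records a JamCRC of the file contents in SrcHeaderBlockEntry::CRC. A minimal sketch of that checksum step in isolation, assuming the JamCRC::update(ArrayRef<char>) overload the code above relies on:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/JamCRC.h"

#include <cstddef>
#include <cstdint>

// Compute the checksum stored for one injected-source buffer.
uint32_t checksumBuffer(const char *Data, size_t Size) {
  llvm::JamCRC CRC(/*Init=*/0);                // seeded with 0, as above
  CRC.update(llvm::makeArrayRef(Data, Size));
  return CRC.getCRC();
}
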
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
index f1c10357132b..afeea32043dd 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
@@ -122,7 +122,10 @@ Expected<uint32_t> PDBStringTable::getIDForString(StringRef Str) const {
// we iterate the entire array.
uint32_t Index = (Start + I) % Count;
+ // If we find 0, it means the item isn't in the hash table.
uint32_t ID = IDs[Index];
+ if (ID == 0)
+ return make_error<RawError>(raw_error_code::no_entry);
auto ExpectedStr = getStringForID(ID);
if (!ExpectedStr)
return ExpectedStr.takeError();
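
The early return added above works because, in the /names hash table, a stored ID of 0 marks an empty bucket, so reaching one during the linear probe proves the key was never inserted. A standalone illustration of that probe loop (not the LLVM code; the matches callback stands in for the string comparison):

#include <cstdint>
#include <vector>

// Returns the bucket index holding the key's ID, or -1 if the probe hits an
// empty bucket (ID == 0) or wraps all the way around.
int probeForKey(const std::vector<uint32_t> &IDs, uint32_t StartBucket,
                bool (*matches)(uint32_t ID)) {
  const uint32_t Count = static_cast<uint32_t>(IDs.size());
  for (uint32_t I = 0; I != Count; ++I) {
    uint32_t Index = (StartBucket + I) % Count;
    if (IDs[Index] == 0)
      return -1;                 // empty slot: the key is not in the table
    if (matches(IDs[Index]))
      return static_cast<int>(Index);
  }
  return -1;                     // every bucket is occupied by other keys
}
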
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
index ece3e00b1a87..d9dcabf3d958 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
@@ -15,23 +15,101 @@
#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/Endian.h"
+#include <map>
+
using namespace llvm;
using namespace llvm::msf;
using namespace llvm::support;
using namespace llvm::support::endian;
using namespace llvm::pdb;
+StringTableHashTraits::StringTableHashTraits(PDBStringTableBuilder &Table)
+ : Table(&Table) {}
+
+uint32_t StringTableHashTraits::hashLookupKey(StringRef S) const {
+ return Table->getIdForString(S);
+}
+
+StringRef StringTableHashTraits::storageKeyToLookupKey(uint32_t Offset) const {
+ return Table->getStringForId(Offset);
+}
+
+uint32_t StringTableHashTraits::lookupKeyToStorageKey(StringRef S) {
+ return Table->insert(S);
+}
+
uint32_t PDBStringTableBuilder::insert(StringRef S) {
return Strings.insert(S);
}
+uint32_t PDBStringTableBuilder::getIdForString(StringRef S) const {
+ return Strings.getIdForString(S);
+}
+
+StringRef PDBStringTableBuilder::getStringForId(uint32_t Id) const {
+ return Strings.getStringForId(Id);
+}
+
+// This is a precomputed list of Buckets given the specified number of
+// strings. Matching the reference algorithm exactly is not strictly
+// necessary for correctness, but it helps when comparing LLD's PDBs with
+// Microsoft's PDBs so as to eliminate superfluous differences.
+static std::map<uint32_t, uint32_t> StringsToBuckets = {
+ {1, 2},
+ {2, 4},
+ {4, 7},
+ {6, 11},
+ {9, 17},
+ {13, 26},
+ {20, 40},
+ {31, 61},
+ {46, 92},
+ {70, 139},
+ {105, 209},
+ {157, 314},
+ {236, 472},
+ {355, 709},
+ {532, 1064},
+ {799, 1597},
+ {1198, 2396},
+ {1798, 3595},
+ {2697, 5393},
+ {4045, 8090},
+ {6068, 12136},
+ {9103, 18205},
+ {13654, 27308},
+ {20482, 40963},
+ {30723, 61445},
+ {46084, 92168},
+ {69127, 138253},
+ {103690, 207380},
+ {155536, 311071},
+ {233304, 466607},
+ {349956, 699911},
+ {524934, 1049867},
+ {787401, 1574801},
+ {1181101, 2362202},
+ {1771652, 3543304},
+ {2657479, 5314957},
+ {3986218, 7972436},
+ {5979328, 11958655},
+ {8968992, 17937983},
+ {13453488, 26906975},
+ {20180232, 40360463},
+ {30270348, 60540695},
+ {45405522, 90811043},
+ {68108283, 136216565},
+ {102162424, 204324848},
+ {153243637, 306487273},
+ {229865455, 459730910},
+ {344798183, 689596366},
+ {517197275, 1034394550},
+ {775795913, 1551591826}};
+
static uint32_t computeBucketCount(uint32_t NumStrings) {
- // The /names stream is basically an on-disk open-addressing hash table.
- // Hash collisions are resolved by linear probing. We cannot make
- // utilization 100% because it will make the linear probing extremely
- // slow. But lower utilization wastes disk space. As a reasonable
- // load factor, we choose 80%. We need +1 because slot 0 is reserved.
- return (NumStrings + 1) * 1.25;
+ auto Entry = StringsToBuckets.lower_bound(NumStrings);
+ assert(Entry != StringsToBuckets.end());
+ return Entry->second;
}
uint32_t PDBStringTableBuilder::calculateHashTableSize() const {
@@ -89,8 +167,6 @@ Error PDBStringTableBuilder::writeHashTable(BinaryStreamWriter &Writer) const {
for (uint32_t I = 0; I != BucketCount; ++I) {
uint32_t Slot = (Hash + I) % BucketCount;
- if (Slot == 0)
- continue; // Skip reserved slot
if (Buckets[Slot] != 0)
continue;
Buckets[Slot] = Offset;
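
computeBucketCount() above replaces the old load-factor formula with a lower_bound lookup into the precomputed table, so the output matches the reference implementation exactly. A small standalone check of how that lookup rounds up:

#include <cassert>
#include <cstdint>
#include <map>

int main() {
  // First few entries of the table above.
  const std::map<uint32_t, uint32_t> StringsToBuckets = {
      {1, 2}, {2, 4}, {4, 7}, {6, 11}, {9, 17}};

  // lower_bound returns the first key that is >= the string count, so five
  // strings land on the {6, 11} entry and get 11 buckets.
  auto It = StringsToBuckets.lower_bound(5);
  assert(It != StringsToBuckets.end() && It->second == 11);
  return 0;
}
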
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp
index d3ef87d9009d..0680b673380a 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -152,7 +152,9 @@ FixedStreamArray<TypeIndexOffset> TpiStream::getTypeIndexOffsets() const {
return TypeIndexOffsets;
}
-HashTable &TpiStream::getHashAdjusters() { return HashAdjusters; }
+HashTable<support::ulittle32_t> &TpiStream::getHashAdjusters() {
+ return HashAdjusters;
+}
CVTypeRange TpiStream::types(bool *HadError) const {
return make_range(TypeRecords.begin(HadError), TypeRecords.end());
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
index ee752cda346e..a4e316417f96 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -113,6 +113,8 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_LocType &Loc) {
CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, IlRel, "IL rel", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, MetaData, "metadata", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, Constant, "constant", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, RegRelAliasIndir,
+ "regrelaliasindir", OS)
default:
OS << "Unknown";
}
@@ -139,6 +141,7 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, None, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, MD5, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, SHA1, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, SHA256, OS)
}
return OS;
}
@@ -254,6 +257,18 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
return OS;
}
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const PDB_SourceCompression &Compression) {
+ switch (Compression) {
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SourceCompression, None, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SourceCompression, Huffman, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SourceCompression, LZ, OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_SourceCompression, RunLengthEncoded, "RLE",
+ OS)
+ }
+ return OS;
+}
+
raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const Variant &Value) {
switch (Value.Type) {
case PDB_VariantType::Bool:
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
index b2b03fbe167b..c62796507a01 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
@@ -12,8 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/IPDBDataStream.h"
+#include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSectionContrib.h"
#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/IPDBTable.h"
@@ -29,3 +31,7 @@ IPDBRawSymbol::~IPDBRawSymbol() = default;
IPDBLineNumber::~IPDBLineNumber() = default;
IPDBTable::~IPDBTable() = default;
+
+IPDBInjectedSource::~IPDBInjectedSource() = default;
+
+IPDBSectionContrib::~IPDBSectionContrib() = default;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
index 854cf42d1bae..8798c7b9db88 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -1,4 +1,4 @@
-//===- PDBSymbolCompiland.cpp - compiland details --------*- C++ -*-===//
+//===- PDBSymbolCompiland.cpp - compiland details ---------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,11 +7,16 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+
#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h"
#include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h"
-
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Path.h"
#include <utility>
using namespace llvm;
@@ -27,20 +32,85 @@ void PDBSymbolCompiland::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
-std::string PDBSymbolCompiland::getSourceFileName() const
-{
- std::string Result = RawSymbol->getSourceFileName();
- if (!Result.empty())
- return Result;
- auto Envs = findAllChildren<PDBSymbolCompilandEnv>();
- if (!Envs)
- return std::string();
- while (auto Env = Envs->getNext()) {
+std::string PDBSymbolCompiland::getSourceFileName() const {
+ return sys::path::filename(getSourceFileFullPath()).str();
+}
+
+std::string PDBSymbolCompiland::getSourceFileFullPath() const {
+ std::string SourceFileFullPath;
+
+ // RecordedResult could be the basename, relative path or full path of the
+ // source file. Usually it is retrieved and recorded from the command that
+ // compiles this compiland.
+ //
+ // cmd FileName -> RecordedResult = .\\FileName
+ // cmd (Path)\\FileName -> RecordedResult = (Path)\\FileName
+ //
+ std::string RecordedResult = RawSymbol->getSourceFileName();
+
+ if (RecordedResult.empty()) {
+ if (auto Envs = findAllChildren<PDBSymbolCompilandEnv>()) {
+ std::string EnvWorkingDir, EnvSrc;
+
+ while (auto Env = Envs->getNext()) {
std::string Var = Env->getName();
- if (Var != "src")
- continue;
- std::string Value = Env->getValue();
- return Value;
+ if (Var == "cwd") {
+ EnvWorkingDir = Env->getValue();
+ continue;
+ }
+ if (Var == "src") {
+ EnvSrc = Env->getValue();
+ if (sys::path::is_absolute(EnvSrc))
+ return EnvSrc;
+ RecordedResult = EnvSrc;
+ continue;
+ }
+ }
+ if (!EnvWorkingDir.empty() && !EnvSrc.empty()) {
+ auto Len = EnvWorkingDir.length();
+ if (EnvWorkingDir[Len - 1] != '/' && EnvWorkingDir[Len - 1] != '\\') {
+ std::string Path = EnvWorkingDir + "\\" + EnvSrc;
+ std::replace(Path.begin(), Path.end(), '/', '\\');
+ // We will return it as full path if we can't find a better one.
+ if (sys::path::is_absolute(Path))
+ SourceFileFullPath = Path;
+ }
+ }
+ }
+ }
+
+ if (!RecordedResult.empty()) {
+ if (sys::path::is_absolute(RecordedResult))
+ return RecordedResult;
+
+ // This searches name that has same basename as the one in RecordedResult.
+ auto OneSrcFile = Session.findOneSourceFile(
+ this, RecordedResult, PDB_NameSearchFlags::NS_CaseInsensitive);
+ if (OneSrcFile)
+ return OneSrcFile->getFileName();
+ }
+
+ // At this point, we have to walk through all source files of this compiland,
+ // and determine the right source file if any that is used to generate this
+ // compiland based on language indicated in compilanddetails language field.
+ auto Details = findOneChild<PDBSymbolCompilandDetails>();
+ PDB_Lang Lang = Details ? Details->getLanguage() : PDB_Lang::Cpp;
+ auto SrcFiles = Session.getSourceFilesForCompiland(*this);
+ if (SrcFiles) {
+ bool LangC = (Lang == PDB_Lang::Cpp || Lang == PDB_Lang::C);
+ while (auto File = SrcFiles->getNext()) {
+ std::string FileName = File->getFileName();
+ auto file_extension = sys::path::extension(FileName);
+ if (StringSwitch<bool>(file_extension.lower())
+ .Case(".cpp", LangC)
+ .Case(".c", LangC)
+ .Case(".cc", LangC)
+ .Case(".cxx", LangC)
+ .Case(".asm", Lang == PDB_Lang::Masm)
+ .Default(false))
+ return File->getFileName();
}
- return std::string();
+ }
+
+ return SourceFileFullPath;
}
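
The extension check above is a typical llvm::StringSwitch use: build the switch over the lowercased extension and let each Case carry the language predicate. A small standalone sketch of the same pattern (the helper name is illustrative):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// True if the file extension is plausible for a C/C++ compiland.
bool looksLikeCxxSource(llvm::StringRef Extension) {
  return llvm::StringSwitch<bool>(Extension.lower())
      .Cases(".c", ".cc", ".cpp", ".cxx", true)
      .Default(false);
}
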
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
index 60026689c6f1..ae4a8038ccd7 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
-
+#include "llvm/DebugInfo/PDB/IPDBSectionContrib.h"
#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
@@ -24,3 +24,52 @@ PDBSymbolData::PDBSymbolData(const IPDBSession &PDBSession,
}
void PDBSymbolData::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
+
+std::unique_ptr<IPDBEnumLineNumbers> PDBSymbolData::getLineNumbers() const {
+ auto Len = RawSymbol->getLength();
+ Len = Len ? Len : 1;
+ if (auto RVA = RawSymbol->getRelativeVirtualAddress())
+ return Session.findLineNumbersByRVA(RVA, Len);
+
+ if (auto Section = RawSymbol->getAddressSection())
+ return Session.findLineNumbersBySectOffset(
+ Section, RawSymbol->getAddressOffset(), Len);
+
+ return nullptr;
+}
+
+uint32_t PDBSymbolData::getCompilandId() const {
+ if (auto Lines = getLineNumbers()) {
+ if (auto FirstLine = Lines->getNext())
+ return FirstLine->getCompilandId();
+ }
+
+ uint32_t DataSection = RawSymbol->getAddressSection();
+ uint32_t DataOffset = RawSymbol->getAddressOffset();
+ if (DataSection == 0) {
+ if (auto RVA = RawSymbol->getRelativeVirtualAddress())
+ Session.addressForRVA(RVA, DataSection, DataOffset);
+ }
+
+ if (DataSection) {
+ if (auto SecContribs = Session.getSectionContribs()) {
+ while (auto Section = SecContribs->getNext()) {
+ if (Section->getAddressSection() == DataSection &&
+ Section->getAddressOffset() <= DataOffset &&
+ (Section->getAddressOffset() + Section->getLength()) > DataOffset)
+ return Section->getCompilandId();
+ }
+ }
+ } else {
+ auto LexParentId = RawSymbol->getLexicalParentId();
+ while (auto LexParent = Session.getSymbolById(LexParentId)) {
+ if (LexParent->getSymTag() == PDB_SymType::Exe)
+ break;
+ if (LexParent->getSymTag() == PDB_SymType::Compiland)
+ return LexParentId;
+ LexParentId = LexParent->getRawSymbol().getLexicalParentId();
+ }
+ }
+
+ return 0;
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
index c8c44d97e2f7..37ca1abe86e9 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
@@ -105,3 +105,18 @@ bool PDBSymbolFunc::isDestructor() const {
return true;
return false;
}
+
+std::unique_ptr<IPDBEnumLineNumbers> PDBSymbolFunc::getLineNumbers() const {
+ auto Len = RawSymbol->getLength();
+ return Session.findLineNumbersByAddress(RawSymbol->getVirtualAddress(),
+ Len ? Len : 1);
+}
+
+uint32_t PDBSymbolFunc::getCompilandId() const {
+ if (auto Lines = getLineNumbers()) {
+ if (auto FirstLine = Lines->getNext()) {
+ return FirstLine->getCompilandId();
+ }
+ }
+ return 0;
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
index 0304c6286c8f..8fd3b49155c9 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
@@ -14,6 +14,7 @@
#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
#include <utility>
@@ -84,3 +85,21 @@ void PDBSymbolTypeFunctionSig::dump(PDBSymDumper &Dumper) const {
void PDBSymbolTypeFunctionSig::dumpRight(PDBSymDumper &Dumper) const {
Dumper.dumpRight(*this);
}
+
+bool PDBSymbolTypeFunctionSig::isCVarArgs() const {
+ auto SigArguments = getArguments();
+ if (!SigArguments)
+ return false;
+ uint32_t NumArgs = SigArguments->getChildCount();
+ if (NumArgs == 0)
+ return false;
+ auto Last = SigArguments->getChildAtIndex(NumArgs - 1);
+ if (auto Builtin = llvm::dyn_cast_or_null<PDBSymbolTypeBuiltin>(Last.get())) {
+ if (Builtin->getBuiltinType() == PDB_BuiltinType::None)
+ return true;
+ }
+
+ // Note that for a variadic template signature, this method always returns
+ // false since the parameters of the template are specialized.
+ return false;
+}
diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 0aeac55dd209..95a356d33eb4 100644
--- a/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -21,6 +21,7 @@
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/PDB/PDB.h"
#include "llvm/DebugInfo/PDB/PDBContext.h"
+#include "llvm/Demangle/Demangle.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/MachOUniversal.h"
@@ -185,14 +186,19 @@ bool findDebugBinary(const std::string &OrigPath,
return true;
}
// Try /path/to/original_binary/.debug/debuglink_name
- DebugPath = OrigRealPath;
+ DebugPath = OrigDir;
llvm::sys::path::append(DebugPath, ".debug", DebuglinkName);
if (checkFileCRC(DebugPath, CRCHash)) {
Result = DebugPath.str();
return true;
}
+#if defined(__NetBSD__)
+ // Try /usr/libdata/debug/path/to/original_binary/debuglink_name
+ DebugPath = "/usr/libdata/debug";
+#else
// Try /usr/lib/debug/path/to/original_binary/debuglink_name
DebugPath = "/usr/lib/debug";
+#endif
llvm::sys::path::append(DebugPath, llvm::sys::path::relative_path(OrigDir),
DebuglinkName);
if (checkFileCRC(DebugPath, CRCHash)) {
@@ -465,28 +471,22 @@ StringRef demanglePE32ExternCFunc(StringRef SymbolName) {
} // end anonymous namespace
-#if !defined(_MSC_VER)
-// Assume that __cxa_demangle is provided by libcxxabi (except for Windows).
-extern "C" char *__cxa_demangle(const char *mangled_name, char *output_buffer,
- size_t *length, int *status);
-#endif
-
std::string
LLVMSymbolizer::DemangleName(const std::string &Name,
const SymbolizableModule *DbiModuleDescriptor) {
-#if !defined(_MSC_VER)
// We can spoil names of symbols with C linkage, so use an heuristic
// approach to check if the name should be demangled.
if (Name.substr(0, 2) == "_Z") {
int status = 0;
- char *DemangledName = __cxa_demangle(Name.c_str(), nullptr, nullptr, &status);
+ char *DemangledName = itaniumDemangle(Name.c_str(), nullptr, nullptr, &status);
if (status != 0)
return Name;
std::string Result = DemangledName;
free(DemangledName);
return Result;
}
-#else
+
+#if defined(_MSC_VER)
if (!Name.empty() && Name.front() == '?') {
// Only do MSVC C++ demangling on symbols starting with '?'.
char DemangledName[1024] = {0};
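
With this change the symbolizer calls llvm::itaniumDemangle() from LLVMDemangle instead of declaring __cxa_demangle itself, so the same code path works on every host. A standalone sketch of that call, using the four-argument overload seen above (it returns a malloc'd buffer and sets a non-zero status on failure):

#include "llvm/Demangle/Demangle.h"

#include <cstdlib>
#include <string>

std::string demangleOrKeep(const std::string &Name) {
  if (Name.compare(0, 2, "_Z") != 0)
    return Name;                       // not an Itanium-mangled name
  int Status = 0;
  char *Buf = llvm::itaniumDemangle(Name.c_str(), nullptr, nullptr, &Status);
  if (Status != 0 || !Buf)
    return Name;                       // demangling failed, keep the original
  std::string Result(Buf);
  std::free(Buf);
  return Result;
}
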
diff --git a/contrib/llvm/lib/Demangle/Compiler.h b/contrib/llvm/lib/Demangle/Compiler.h
new file mode 100644
index 000000000000..248d6e3a7faa
--- /dev/null
+++ b/contrib/llvm/lib/Demangle/Compiler.h
@@ -0,0 +1,93 @@
+//===--- Compiler.h ---------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+// This file contains a variety of feature test macros copied from
+// include/llvm/Support/Compiler.h so that LLVMDemangle does not need to take
+// a dependency on LLVMSupport.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEMANGLE_COMPILER_H
+#define LLVM_DEMANGLE_COMPILER_H
+
+#ifdef _MSC_VER
+// snprintf is implemented in VS 2015
+#if _MSC_VER < 1900
+#define snprintf _snprintf_s
+#endif
+#endif
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#ifndef __has_cpp_attribute
+#define __has_cpp_attribute(x) 0
+#endif
+
+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#ifndef LLVM_GNUC_PREREQ
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define LLVM_GNUC_PREREQ(maj, min, patch) \
+ ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) + __GNUC_PATCHLEVEL__ >= \
+ ((maj) << 20) + ((min) << 10) + (patch))
+#elif defined(__GNUC__) && defined(__GNUC_MINOR__)
+#define LLVM_GNUC_PREREQ(maj, min, patch) \
+ ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) >= ((maj) << 20) + ((min) << 10))
+#else
+#define LLVM_GNUC_PREREQ(maj, min, patch) 0
+#endif
+#endif
+
+#if __has_attribute(used) || LLVM_GNUC_PREREQ(3, 1, 0)
+#define LLVM_ATTRIBUTE_USED __attribute__((__used__))
+#else
+#define LLVM_ATTRIBUTE_USED
+#endif
+
+#if __has_builtin(__builtin_unreachable) || LLVM_GNUC_PREREQ(4, 5, 0)
+#define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define LLVM_BUILTIN_UNREACHABLE __assume(false)
+#endif
+
+#if __has_attribute(noinline) || LLVM_GNUC_PREREQ(3, 4, 0)
+#define LLVM_ATTRIBUTE_NOINLINE __attribute__((noinline))
+#elif defined(_MSC_VER)
+#define LLVM_ATTRIBUTE_NOINLINE __declspec(noinline)
+#else
+#define LLVM_ATTRIBUTE_NOINLINE
+#endif
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED
+#else
+#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE
+#endif
+
+#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
+#define LLVM_FALLTHROUGH [[fallthrough]]
+#elif __has_cpp_attribute(gnu::fallthrough)
+#define LLVM_FALLTHROUGH [[gnu::fallthrough]]
+#elif !__cplusplus
+// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious
+// error when __has_cpp_attribute is given a scoped attribute in C mode.
+#define LLVM_FALLTHROUGH
+#elif __has_cpp_attribute(clang::fallthrough)
+#define LLVM_FALLTHROUGH [[clang::fallthrough]]
+#else
+#define LLVM_FALLTHROUGH
+#endif
+
+#endif
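
The new header mirrors a handful of Support/Compiler.h feature macros so LLVMDemangle stays dependency-free. A tiny usage sketch (hypothetical helper, only showing how LLVM_FALLTHROUGH from the header above marks an intentional fallthrough):

#include "Compiler.h"   // the header added above

// Hypothetical helper: track template bracket depth while scanning a name.
int adjustDepth(int Depth, char C) {
  switch (C) {
  case '<':
    return Depth + 1;
  case '>':
    if (Depth == 0)
      return 0;          // stray '>', ignore it
    LLVM_FALLTHROUGH;    // otherwise treat it like any other closer
  case ')':
    return Depth - 1;
  default:
    return Depth;
  }
}
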
diff --git a/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp b/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp
index 9c2258f5b933..5bfd2e6ff87e 100644
--- a/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp
+++ b/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp
@@ -1,4 +1,4 @@
-//===- ItaniumDemangle.cpp ------------------------------------------------===//
+//===------------------------- ItaniumDemangle.cpp ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,1975 +7,2280 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Demangle/Demangle.h"
-#include "llvm/Support/Compiler.h"
+// FIXME: (possibly) incomplete list of features that clang mangles that this
+// file does not yet support:
+// - C++ modules TS
-// This file exports a single function: llvm::itanium_demangle.
-// It also has no dependencies on the rest of llvm. It is implemented this way
-// so that it can be easily reused in libcxxabi.
+#include "Compiler.h"
+#include "StringView.h"
+#include "Utility.h"
+#include "llvm/Demangle/Demangle.h"
-#include <algorithm>
+#include <cassert>
#include <cctype>
+#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <numeric>
-#include <string>
+#include <utility>
#include <vector>
-#ifdef _MSC_VER
-// snprintf is implemented in VS 2015
-#if _MSC_VER < 1900
-#define snprintf _snprintf_s
-#endif
+namespace {
+// Base class of all AST nodes. The AST is built by the parser, then is
+// traversed by the printLeft/Right functions to produce a demangled string.
+class Node {
+public:
+ enum Kind : unsigned char {
+ KNodeArrayNode,
+ KDotSuffix,
+ KVendorExtQualType,
+ KQualType,
+ KConversionOperatorType,
+ KPostfixQualifiedType,
+ KElaboratedTypeSpefType,
+ KNameType,
+ KAbiTagAttr,
+ KEnableIfAttr,
+ KObjCProtoName,
+ KPointerType,
+ KReferenceType,
+ KPointerToMemberType,
+ KArrayType,
+ KFunctionType,
+ KNoexceptSpec,
+ KDynamicExceptionSpec,
+ KFunctionEncoding,
+ KLiteralOperator,
+ KSpecialName,
+ KCtorVtableSpecialName,
+ KQualifiedName,
+ KNestedName,
+ KLocalName,
+ KVectorType,
+ KParameterPack,
+ KTemplateArgumentPack,
+ KParameterPackExpansion,
+ KTemplateArgs,
+ KForwardTemplateReference,
+ KNameWithTemplateArgs,
+ KGlobalQualifiedName,
+ KStdQualifiedName,
+ KExpandedSpecialSubstitution,
+ KSpecialSubstitution,
+ KCtorDtorName,
+ KDtorName,
+ KUnnamedTypeName,
+ KClosureTypeName,
+ KStructuredBindingName,
+ KExpr,
+ KBracedExpr,
+ KBracedRangeExpr,
+ };
+
+ Kind K;
+
+ /// Three-way bool to track a cached value. Unknown is possible if this node
+ /// has an unexpanded parameter pack below it that may affect this cache.
+ enum class Cache : unsigned char { Yes, No, Unknown, };
+
+ /// Tracks if this node has a component on its right side, in which case we
+ /// need to call printRight.
+ Cache RHSComponentCache;
+
+ /// Track if this node is a (possibly qualified) array type. This can affect
+ /// how we format the output string.
+ Cache ArrayCache;
+
+ /// Track if this node is a (possibly qualified) function type. This can
+ /// affect how we format the output string.
+ Cache FunctionCache;
+
+ Node(Kind K_, Cache RHSComponentCache_ = Cache::No,
+ Cache ArrayCache_ = Cache::No, Cache FunctionCache_ = Cache::No)
+ : K(K_), RHSComponentCache(RHSComponentCache_), ArrayCache(ArrayCache_),
+ FunctionCache(FunctionCache_) {}
+
+ bool hasRHSComponent(OutputStream &S) const {
+ if (RHSComponentCache != Cache::Unknown)
+ return RHSComponentCache == Cache::Yes;
+ return hasRHSComponentSlow(S);
+ }
+
+ bool hasArray(OutputStream &S) const {
+ if (ArrayCache != Cache::Unknown)
+ return ArrayCache == Cache::Yes;
+ return hasArraySlow(S);
+ }
+
+ bool hasFunction(OutputStream &S) const {
+ if (FunctionCache != Cache::Unknown)
+ return FunctionCache == Cache::Yes;
+ return hasFunctionSlow(S);
+ }
+
+ Kind getKind() const { return K; }
+
+ virtual bool hasRHSComponentSlow(OutputStream &) const { return false; }
+ virtual bool hasArraySlow(OutputStream &) const { return false; }
+ virtual bool hasFunctionSlow(OutputStream &) const { return false; }
+
+ // Dig through "glue" nodes like ParameterPack and ForwardTemplateReference to
+ // get at a node that actually represents some concrete syntax.
+ virtual const Node *getSyntaxNode(OutputStream &) const {
+ return this;
+ }
+
+ void print(OutputStream &S) const {
+ printLeft(S);
+ if (RHSComponentCache != Cache::No)
+ printRight(S);
+ }
+
+ // Print the "left" side of this Node into OutputStream.
+ virtual void printLeft(OutputStream &) const = 0;
+
+ // Print the "right". This distinction is necessary to represent C++ types
+ // that appear on the RHS of their subtype, such as arrays or functions.
+ // Since most types don't have such a component, provide a default
+ // implementation.
+ virtual void printRight(OutputStream &) const {}
+
+ virtual StringView getBaseName() const { return StringView(); }
+
+ // Silence compiler warnings, this dtor will never be called.
+ virtual ~Node() = default;
+
+#ifndef NDEBUG
+ LLVM_DUMP_METHOD void dump() const {
+ char *Buffer = static_cast<char*>(std::malloc(1024));
+ OutputStream S(Buffer, 1024);
+ print(S);
+ S += '\0';
+ printf("Symbol dump for %p: %s\n", (const void*)this, S.getBuffer());
+ std::free(S.getBuffer());
+ }
#endif
+};
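
The printLeft/printRight split described above exists because some C++ types wrap around their inner type: for a pointer to an array, the element type prints to the left of the pointer and the array bound to its right. A tiny standalone illustration of how the two halves compose (not the demangler itself):

#include <cassert>
#include <string>

// Build the abstract declarator for "pointer to array of N Element".
std::string pointerToArray(const std::string &Element, unsigned N) {
  std::string Left = Element + " (*";                  // printed before the pointer
  std::string Right = ")[" + std::to_string(N) + "]";  // printed after it
  return Left + Right;
}

int main() {
  assert(pointerToArray("int", 5) == "int (*)[5]");
  return 0;
}
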
+
+class NodeArray {
+ Node **Elements;
+ size_t NumElements;
+
+public:
+ NodeArray() : Elements(nullptr), NumElements(0) {}
+ NodeArray(Node **Elements_, size_t NumElements_)
+ : Elements(Elements_), NumElements(NumElements_) {}
+
+ bool empty() const { return NumElements == 0; }
+ size_t size() const { return NumElements; }
+
+ Node **begin() const { return Elements; }
+ Node **end() const { return Elements + NumElements; }
+
+ Node *operator[](size_t Idx) const { return Elements[Idx]; }
+
+ void printWithComma(OutputStream &S) const {
+ bool FirstElement = true;
+ for (size_t Idx = 0; Idx != NumElements; ++Idx) {
+ size_t BeforeComma = S.getCurrentPosition();
+ if (!FirstElement)
+ S += ", ";
+ size_t AfterComma = S.getCurrentPosition();
+ Elements[Idx]->print(S);
+
+ // Elements[Idx] is an empty parameter pack expansion, we should erase the
+ // comma we just printed.
+ if (AfterComma == S.getCurrentPosition()) {
+ S.setCurrentPosition(BeforeComma);
+ continue;
+ }
-enum {
- unknown_error = -4,
- invalid_args = -3,
- invalid_mangled_name,
- memory_alloc_failure,
- success
+ FirstElement = false;
+ }
+ }
};
-enum {
- CV_const = (1 << 0),
- CV_volatile = (1 << 1),
- CV_restrict = (1 << 2),
+struct NodeArrayNode : Node {
+ NodeArray Array;
+ NodeArrayNode(NodeArray Array_) : Node(KNodeArrayNode), Array(Array_) {}
+ void printLeft(OutputStream &S) const override {
+ Array.printWithComma(S);
+ }
};
-template <class C>
-static const char *parse_type(const char *first, const char *last, C &db);
-template <class C>
-static const char *parse_encoding(const char *first, const char *last, C &db);
-template <class C>
-static const char *parse_name(const char *first, const char *last, C &db,
- bool *ends_with_template_args = 0);
-template <class C>
-static const char *parse_expression(const char *first, const char *last, C &db);
-template <class C>
-static const char *parse_template_args(const char *first, const char *last,
- C &db);
-template <class C>
-static const char *parse_operator_name(const char *first, const char *last,
- C &db);
-template <class C>
-static const char *parse_unqualified_name(const char *first, const char *last,
- C &db);
-template <class C>
-static const char *parse_decltype(const char *first, const char *last, C &db);
+class DotSuffix final : public Node {
+ const Node *Prefix;
+ const StringView Suffix;
-// <number> ::= [n] <non-negative decimal integer>
+public:
+ DotSuffix(Node *Prefix_, StringView Suffix_)
+ : Node(KDotSuffix), Prefix(Prefix_), Suffix(Suffix_) {}
+
+ void printLeft(OutputStream &s) const override {
+ Prefix->print(s);
+ s += " (";
+ s += Suffix;
+ s += ")";
+ }
+};
-static const char *parse_number(const char *first, const char *last) {
- if (first != last) {
- const char *t = first;
- if (*t == 'n')
- ++t;
- if (t != last) {
- if (*t == '0') {
- first = t + 1;
- } else if ('1' <= *t && *t <= '9') {
- first = t + 1;
- while (first != last && std::isdigit(*first))
- ++first;
- }
- }
+class VendorExtQualType final : public Node {
+ const Node *Ty;
+ StringView Ext;
+
+public:
+ VendorExtQualType(Node *Ty_, StringView Ext_)
+ : Node(KVendorExtQualType), Ty(Ty_), Ext(Ext_) {}
+
+ void printLeft(OutputStream &S) const override {
+ Ty->print(S);
+ S += " ";
+ S += Ext;
}
- return first;
+};
+
+enum FunctionRefQual : unsigned char {
+ FrefQualNone,
+ FrefQualLValue,
+ FrefQualRValue,
+};
+
+enum Qualifiers {
+ QualNone = 0,
+ QualConst = 0x1,
+ QualVolatile = 0x2,
+ QualRestrict = 0x4,
+};
+
+void addQualifiers(Qualifiers &Q1, Qualifiers Q2) {
+ Q1 = static_cast<Qualifiers>(Q1 | Q2);
}
-namespace {
-template <class Float> struct float_data;
+class QualType : public Node {
+protected:
+ const Qualifiers Quals;
+ const Node *Child;
+
+ void printQuals(OutputStream &S) const {
+ if (Quals & QualConst)
+ S += " const";
+ if (Quals & QualVolatile)
+ S += " volatile";
+ if (Quals & QualRestrict)
+ S += " restrict";
+ }
+
+public:
+ QualType(Node *Child_, Qualifiers Quals_)
+ : Node(KQualType, Child_->RHSComponentCache,
+ Child_->ArrayCache, Child_->FunctionCache),
+ Quals(Quals_), Child(Child_) {}
+
+ bool hasRHSComponentSlow(OutputStream &S) const override {
+ return Child->hasRHSComponent(S);
+ }
+ bool hasArraySlow(OutputStream &S) const override {
+ return Child->hasArray(S);
+ }
+ bool hasFunctionSlow(OutputStream &S) const override {
+ return Child->hasFunction(S);
+ }
+
+ void printLeft(OutputStream &S) const override {
+ Child->printLeft(S);
+ printQuals(S);
+ }
-template <> struct float_data<float> {
- static const size_t mangled_size = 8;
- static const size_t max_demangled_size = 24;
- static const char *spec;
+ void printRight(OutputStream &S) const override { Child->printRight(S); }
};
-const char *float_data<float>::spec = "%af";
-template <> struct float_data<double> {
- static const size_t mangled_size = 16;
- static const size_t max_demangled_size = 32;
- static const char *spec;
+class ConversionOperatorType final : public Node {
+ const Node *Ty;
+
+public:
+ ConversionOperatorType(Node *Ty_)
+ : Node(KConversionOperatorType), Ty(Ty_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "operator ";
+ Ty->print(S);
+ }
};
-const char *float_data<double>::spec = "%a";
+class PostfixQualifiedType final : public Node {
+ const Node *Ty;
+ const StringView Postfix;
-template <> struct float_data<long double> {
-#if defined(__mips__) && defined(__mips_n64) || defined(__aarch64__) || \
- defined(__wasm__)
- static const size_t mangled_size = 32;
-#elif defined(__arm__) || defined(__mips__) || defined(__hexagon__)
- static const size_t mangled_size = 16;
-#else
- static const size_t mangled_size =
- 20; // May need to be adjusted to 16 or 24 on other platforms
-#endif
- static const size_t max_demangled_size = 40;
- static const char *spec;
+public:
+ PostfixQualifiedType(Node *Ty_, StringView Postfix_)
+ : Node(KPostfixQualifiedType), Ty(Ty_), Postfix(Postfix_) {}
+
+ void printLeft(OutputStream &s) const override {
+ Ty->printLeft(s);
+ s += Postfix;
+ }
};
-const char *float_data<long double>::spec = "%LaL";
-}
+class NameType final : public Node {
+ const StringView Name;
-template <class Float, class C>
-static const char *parse_floating_number(const char *first, const char *last,
- C &db) {
- const size_t N = float_data<Float>::mangled_size;
- if (static_cast<std::size_t>(last - first) > N) {
- last = first + N;
- union {
- Float value;
- char buf[sizeof(Float)];
- };
- const char *t = first;
- char *e = buf;
- for (; t != last; ++t, ++e) {
- if (!isxdigit(*t))
- return first;
- unsigned d1 = isdigit(*t) ? static_cast<unsigned>(*t - '0')
- : static_cast<unsigned>(*t - 'a' + 10);
- ++t;
- unsigned d0 = isdigit(*t) ? static_cast<unsigned>(*t - '0')
- : static_cast<unsigned>(*t - 'a' + 10);
- *e = static_cast<char>((d1 << 4) + d0);
- }
- if (*t == 'E') {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- std::reverse(buf, e);
-#endif
- char num[float_data<Float>::max_demangled_size] = {0};
- int n = snprintf(num, sizeof(num), float_data<Float>::spec, value);
- if (static_cast<std::size_t>(n) >= sizeof(num))
- return first;
- db.names.push_back(std::string(num, static_cast<std::size_t>(n)));
- first = t + 1;
+public:
+ NameType(StringView Name_) : Node(KNameType), Name(Name_) {}
+
+ StringView getName() const { return Name; }
+ StringView getBaseName() const override { return Name; }
+
+ void printLeft(OutputStream &s) const override { s += Name; }
+};
+
+class ElaboratedTypeSpefType : public Node {
+ StringView Kind;
+ Node *Child;
+public:
+ ElaboratedTypeSpefType(StringView Kind_, Node *Child_)
+ : Node(KElaboratedTypeSpefType), Kind(Kind_), Child(Child_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += Kind;
+ S += ' ';
+ Child->print(S);
+ }
+};
+
+struct AbiTagAttr : Node {
+ Node *Base;
+ StringView Tag;
+
+ AbiTagAttr(Node* Base_, StringView Tag_)
+ : Node(KAbiTagAttr, Base_->RHSComponentCache,
+ Base_->ArrayCache, Base_->FunctionCache),
+ Base(Base_), Tag(Tag_) {}
+
+ void printLeft(OutputStream &S) const override {
+ Base->printLeft(S);
+ S += "[abi:";
+ S += Tag;
+ S += "]";
+ }
+};
+
+class EnableIfAttr : public Node {
+ NodeArray Conditions;
+public:
+ EnableIfAttr(NodeArray Conditions_)
+ : Node(KEnableIfAttr), Conditions(Conditions_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += " [enable_if:";
+ Conditions.printWithComma(S);
+ S += ']';
+ }
+};
+
+class ObjCProtoName : public Node {
+ Node *Ty;
+ StringView Protocol;
+
+ friend class PointerType;
+
+public:
+ ObjCProtoName(Node *Ty_, StringView Protocol_)
+ : Node(KObjCProtoName), Ty(Ty_), Protocol(Protocol_) {}
+
+ bool isObjCObject() const {
+ return Ty->getKind() == KNameType &&
+ static_cast<NameType *>(Ty)->getName() == "objc_object";
+ }
+
+ void printLeft(OutputStream &S) const override {
+ Ty->print(S);
+ S += "<";
+ S += Protocol;
+ S += ">";
+ }
+};
+
+class PointerType final : public Node {
+ const Node *Pointee;
+
+public:
+ PointerType(Node *Pointee_)
+ : Node(KPointerType, Pointee_->RHSComponentCache),
+ Pointee(Pointee_) {}
+
+ bool hasRHSComponentSlow(OutputStream &S) const override {
+ return Pointee->hasRHSComponent(S);
+ }
+
+ void printLeft(OutputStream &s) const override {
+ // We rewrite objc_object<SomeProtocol>* into id<SomeProtocol>.
+ if (Pointee->getKind() != KObjCProtoName ||
+ !static_cast<const ObjCProtoName *>(Pointee)->isObjCObject()) {
+ Pointee->printLeft(s);
+ if (Pointee->hasArray(s))
+ s += " ";
+ if (Pointee->hasArray(s) || Pointee->hasFunction(s))
+ s += "(";
+ s += "*";
+ } else {
+ const auto *objcProto = static_cast<const ObjCProtoName *>(Pointee);
+ s += "id<";
+ s += objcProto->Protocol;
+ s += ">";
}
}
- return first;
-}
-// <source-name> ::= <positive length number> <identifier>
+ void printRight(OutputStream &s) const override {
+ if (Pointee->getKind() != KObjCProtoName ||
+ !static_cast<const ObjCProtoName *>(Pointee)->isObjCObject()) {
+ if (Pointee->hasArray(s) || Pointee->hasFunction(s))
+ s += ")";
+ Pointee->printRight(s);
+ }
+ }
+};
-template <class C>
-static const char *parse_source_name(const char *first, const char *last,
- C &db) {
- if (first != last) {
- char c = *first;
- if (isdigit(c) && first + 1 != last) {
- const char *t = first + 1;
- size_t n = static_cast<size_t>(c - '0');
- for (c = *t; isdigit(c); c = *t) {
- n = n * 10 + static_cast<size_t>(c - '0');
- if (++t == last)
- return first;
- }
- if (static_cast<size_t>(last - t) >= n) {
- std::string r(t, n);
- if (r.substr(0, 10) == "_GLOBAL__N")
- db.names.push_back("(anonymous namespace)");
- else
- db.names.push_back(std::move(r));
- first = t + n;
- }
+enum class ReferenceKind {
+ LValue,
+ RValue,
+};
+
+// Represents either a LValue or an RValue reference type.
+class ReferenceType : public Node {
+ const Node *Pointee;
+ ReferenceKind RK;
+
+ // Dig through any refs to refs, collapsing the ReferenceTypes as we go. The
+ // rule here is rvalue ref to rvalue ref collapses to a rvalue ref, and any
+ // other combination collapses to a lvalue ref.
+ std::pair<ReferenceKind, const Node *> collapse(OutputStream &S) const {
+ auto SoFar = std::make_pair(RK, Pointee);
+ for (;;) {
+ const Node *SN = SoFar.second->getSyntaxNode(S);
+ if (SN->getKind() != KReferenceType)
+ break;
+ auto *RT = static_cast<const ReferenceType *>(SN);
+ SoFar.second = RT->Pointee;
+ SoFar.first = std::min(SoFar.first, RT->RK);
}
+ return SoFar;
}
- return first;
-}
-// <substitution> ::= S <seq-id> _
-// ::= S_
-// <substitution> ::= Sa # ::std::allocator
-// <substitution> ::= Sb # ::std::basic_string
-// <substitution> ::= Ss # ::std::basic_string < char,
-// ::std::char_traits<char>,
-// ::std::allocator<char> >
-// <substitution> ::= Si # ::std::basic_istream<char, std::char_traits<char> >
-// <substitution> ::= So # ::std::basic_ostream<char, std::char_traits<char> >
-// <substitution> ::= Sd # ::std::basic_iostream<char, std::char_traits<char> >
+public:
+ ReferenceType(Node *Pointee_, ReferenceKind RK_)
+ : Node(KReferenceType, Pointee_->RHSComponentCache),
+ Pointee(Pointee_), RK(RK_) {}
-template <class C>
-static const char *parse_substitution(const char *first, const char *last,
- C &db) {
- if (last - first >= 2) {
- if (*first == 'S') {
- switch (first[1]) {
- case 'a':
- db.names.push_back("std::allocator");
- first += 2;
- break;
- case 'b':
- db.names.push_back("std::basic_string");
- first += 2;
- break;
- case 's':
- db.names.push_back("std::string");
- first += 2;
- break;
- case 'i':
- db.names.push_back("std::istream");
- first += 2;
- break;
- case 'o':
- db.names.push_back("std::ostream");
- first += 2;
- break;
- case 'd':
- db.names.push_back("std::iostream");
- first += 2;
- break;
- case '_':
- if (!db.subs.empty()) {
- for (const auto &n : db.subs.front())
- db.names.push_back(n);
- first += 2;
- }
- break;
- default:
- if (std::isdigit(first[1]) || std::isupper(first[1])) {
- size_t sub = 0;
- const char *t = first + 1;
- if (std::isdigit(*t))
- sub = static_cast<size_t>(*t - '0');
- else
- sub = static_cast<size_t>(*t - 'A') + 10;
- for (++t; t != last && (std::isdigit(*t) || std::isupper(*t)); ++t) {
- sub *= 36;
- if (std::isdigit(*t))
- sub += static_cast<size_t>(*t - '0');
- else
- sub += static_cast<size_t>(*t - 'A') + 10;
- }
- if (t == last || *t != '_')
- return first;
- ++sub;
- if (sub < db.subs.size()) {
- for (const auto &n : db.subs[sub])
- db.names.push_back(n);
- first = t + 1;
- }
- }
- break;
- }
+ bool hasRHSComponentSlow(OutputStream &S) const override {
+ return Pointee->hasRHSComponent(S);
+ }
+
+ void printLeft(OutputStream &s) const override {
+ std::pair<ReferenceKind, const Node *> Collapsed = collapse(s);
+ Collapsed.second->printLeft(s);
+ if (Collapsed.second->hasArray(s))
+ s += " ";
+ if (Collapsed.second->hasArray(s) || Collapsed.second->hasFunction(s))
+ s += "(";
+
+ s += (Collapsed.first == ReferenceKind::LValue ? "&" : "&&");
+ }
+ void printRight(OutputStream &s) const override {
+ std::pair<ReferenceKind, const Node *> Collapsed = collapse(s);
+ if (Collapsed.second->hasArray(s) || Collapsed.second->hasFunction(s))
+ s += ")";
+ Collapsed.second->printRight(s);
+ }
+};
+
+class PointerToMemberType final : public Node {
+ const Node *ClassType;
+ const Node *MemberType;
+
+public:
+ PointerToMemberType(Node *ClassType_, Node *MemberType_)
+ : Node(KPointerToMemberType, MemberType_->RHSComponentCache),
+ ClassType(ClassType_), MemberType(MemberType_) {}
+
+ bool hasRHSComponentSlow(OutputStream &S) const override {
+ return MemberType->hasRHSComponent(S);
+ }
+
+ void printLeft(OutputStream &s) const override {
+ MemberType->printLeft(s);
+ if (MemberType->hasArray(s) || MemberType->hasFunction(s))
+ s += "(";
+ else
+ s += " ";
+ ClassType->print(s);
+ s += "::*";
+ }
+
+ void printRight(OutputStream &s) const override {
+ if (MemberType->hasArray(s) || MemberType->hasFunction(s))
+ s += ")";
+ MemberType->printRight(s);
+ }
+};
+
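+// Either a Node* or a StringView, discriminated by the Second pointer: a
+// string stores its begin/end pointers (an empty string is nudged so both
+// stay non-null), a node stores only First, and a default-constructed value
+// is empty.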
+class NodeOrString {
+ const void *First;
+ const void *Second;
+
+public:
+ /* implicit */ NodeOrString(StringView Str) {
+ const char *FirstChar = Str.begin();
+ const char *SecondChar = Str.end();
+ if (SecondChar == nullptr) {
+ assert(FirstChar == SecondChar);
+ ++FirstChar, ++SecondChar;
}
+ First = static_cast<const void *>(FirstChar);
+ Second = static_cast<const void *>(SecondChar);
}
- return first;
-}
-// <builtin-type> ::= v # void
-// ::= w # wchar_t
-// ::= b # bool
-// ::= c # char
-// ::= a # signed char
-// ::= h # unsigned char
-// ::= s # short
-// ::= t # unsigned short
-// ::= i # int
-// ::= j # unsigned int
-// ::= l # long
-// ::= m # unsigned long
-// ::= x # long long, __int64
-// ::= y # unsigned long long, __int64
-// ::= n # __int128
-// ::= o # unsigned __int128
-// ::= f # float
-// ::= d # double
-// ::= e # long double, __float80
-// ::= g # __float128
-// ::= z # ellipsis
-// ::= Dd # IEEE 754r decimal floating point (64 bits)
-// ::= De # IEEE 754r decimal floating point (128 bits)
-// ::= Df # IEEE 754r decimal floating point (32 bits)
-// ::= Dh # IEEE 754r half-precision floating point (16 bits)
-// ::= Di # char32_t
-// ::= Ds # char16_t
-// ::= Da # auto (in dependent new-expressions)
-// ::= Dc # decltype(auto)
-// ::= Dn # std::nullptr_t (i.e., decltype(nullptr))
-// ::= u <source-name> # vendor extended type
-
-template <class C>
-static const char *parse_builtin_type(const char *first, const char *last,
- C &db) {
- if (first != last) {
- switch (*first) {
- case 'v':
- db.names.push_back("void");
- ++first;
- break;
- case 'w':
- db.names.push_back("wchar_t");
- ++first;
- break;
- case 'b':
- db.names.push_back("bool");
- ++first;
- break;
- case 'c':
- db.names.push_back("char");
- ++first;
- break;
- case 'a':
- db.names.push_back("signed char");
- ++first;
- break;
- case 'h':
- db.names.push_back("unsigned char");
- ++first;
- break;
- case 's':
- db.names.push_back("short");
- ++first;
- break;
- case 't':
- db.names.push_back("unsigned short");
- ++first;
- break;
- case 'i':
- db.names.push_back("int");
- ++first;
- break;
- case 'j':
- db.names.push_back("unsigned int");
- ++first;
- break;
- case 'l':
- db.names.push_back("long");
- ++first;
- break;
- case 'm':
- db.names.push_back("unsigned long");
- ++first;
+ /* implicit */ NodeOrString(Node *N)
+ : First(static_cast<const void *>(N)), Second(nullptr) {}
+ NodeOrString() : First(nullptr), Second(nullptr) {}
+
+ bool isString() const { return Second && First; }
+ bool isNode() const { return First && !Second; }
+ bool isEmpty() const { return !First && !Second; }
+
+ StringView asString() const {
+ assert(isString());
+ return StringView(static_cast<const char *>(First),
+ static_cast<const char *>(Second));
+ }
+
+ const Node *asNode() const {
+ assert(isNode());
+ return static_cast<const Node *>(First);
+ }
+};
+
+class ArrayType final : public Node {
+ Node *Base;
+ NodeOrString Dimension;
+
+public:
+ ArrayType(Node *Base_, NodeOrString Dimension_)
+ : Node(KArrayType,
+ /*RHSComponentCache=*/Cache::Yes,
+ /*ArrayCache=*/Cache::Yes),
+ Base(Base_), Dimension(Dimension_) {}
+
+ // Incomplete array type.
+ ArrayType(Node *Base_)
+ : Node(KArrayType,
+ /*RHSComponentCache=*/Cache::Yes,
+ /*ArrayCache=*/Cache::Yes),
+ Base(Base_) {}
+
+ bool hasRHSComponentSlow(OutputStream &) const override { return true; }
+ bool hasArraySlow(OutputStream &) const override { return true; }
+
+ void printLeft(OutputStream &S) const override { Base->printLeft(S); }
+
+ void printRight(OutputStream &S) const override {
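+ // Only put a space before the first dimension; nested dimensions follow the
+ // previous ']' directly, as in "int [3][4]".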
+ if (S.back() != ']')
+ S += " ";
+ S += "[";
+ if (Dimension.isString())
+ S += Dimension.asString();
+ else if (Dimension.isNode())
+ Dimension.asNode()->print(S);
+ S += "]";
+ Base->printRight(S);
+ }
+};
+
+class FunctionType final : public Node {
+ Node *Ret;
+ NodeArray Params;
+ Qualifiers CVQuals;
+ FunctionRefQual RefQual;
+ Node *ExceptionSpec;
+
+public:
+ FunctionType(Node *Ret_, NodeArray Params_, Qualifiers CVQuals_,
+ FunctionRefQual RefQual_, Node *ExceptionSpec_)
+ : Node(KFunctionType,
+ /*RHSComponentCache=*/Cache::Yes, /*ArrayCache=*/Cache::No,
+ /*FunctionCache=*/Cache::Yes),
+ Ret(Ret_), Params(Params_), CVQuals(CVQuals_), RefQual(RefQual_),
+ ExceptionSpec(ExceptionSpec_) {}
+
+ bool hasRHSComponentSlow(OutputStream &) const override { return true; }
+ bool hasFunctionSlow(OutputStream &) const override { return true; }
+
+ // Handle C++'s ... quirky decl grammar by using the left & right
+ // distinction. Consider:
+ // int (*f(float))(char) {}
+ // f is a function that takes a float and returns a pointer to a function
+ // that takes a char and returns an int. If we're trying to print f, start
+ // by printing out the return type's left, then print our parameters, then
+ // finally print right of the return type.
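+ // For the example above, the pointer-to-function return type contributes
+ // "int (*" on the left and ")(char)" on the right, so the "(float)"
+ // parameter list printed here ends up sandwiched between the two halves.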
+ void printLeft(OutputStream &S) const override {
+ Ret->printLeft(S);
+ S += " ";
+ }
+
+ void printRight(OutputStream &S) const override {
+ S += "(";
+ Params.printWithComma(S);
+ S += ")";
+ Ret->printRight(S);
+
+ if (CVQuals & QualConst)
+ S += " const";
+ if (CVQuals & QualVolatile)
+ S += " volatile";
+ if (CVQuals & QualRestrict)
+ S += " restrict";
+
+ if (RefQual == FrefQualLValue)
+ S += " &";
+ else if (RefQual == FrefQualRValue)
+ S += " &&";
+
+ if (ExceptionSpec != nullptr) {
+ S += ' ';
+ ExceptionSpec->print(S);
+ }
+ }
+};
+
+class NoexceptSpec : public Node {
+ Node *E;
+public:
+ NoexceptSpec(Node *E_) : Node(KNoexceptSpec), E(E_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "noexcept(";
+ E->print(S);
+ S += ")";
+ }
+};
+
+class DynamicExceptionSpec : public Node {
+ NodeArray Types;
+public:
+ DynamicExceptionSpec(NodeArray Types_)
+ : Node(KDynamicExceptionSpec), Types(Types_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "throw(";
+ Types.printWithComma(S);
+ S += ')';
+ }
+};
+
+class FunctionEncoding final : public Node {
+ Node *Ret;
+ Node *Name;
+ NodeArray Params;
+ Node *Attrs;
+ Qualifiers CVQuals;
+ FunctionRefQual RefQual;
+
+public:
+ FunctionEncoding(Node *Ret_, Node *Name_, NodeArray Params_,
+ Node *Attrs_, Qualifiers CVQuals_, FunctionRefQual RefQual_)
+ : Node(KFunctionEncoding,
+ /*RHSComponentCache=*/Cache::Yes, /*ArrayCache=*/Cache::No,
+ /*FunctionCache=*/Cache::Yes),
+ Ret(Ret_), Name(Name_), Params(Params_), Attrs(Attrs_),
+ CVQuals(CVQuals_), RefQual(RefQual_) {}
+
+ Qualifiers getCVQuals() const { return CVQuals; }
+ FunctionRefQual getRefQual() const { return RefQual; }
+ NodeArray getParams() const { return Params; }
+ Node *getReturnType() const { return Ret; }
+
+ bool hasRHSComponentSlow(OutputStream &) const override { return true; }
+ bool hasFunctionSlow(OutputStream &) const override { return true; }
+
+ Node *getName() { return const_cast<Node *>(Name); }
+
+ void printLeft(OutputStream &S) const override {
+ if (Ret) {
+ Ret->printLeft(S);
+ if (!Ret->hasRHSComponent(S))
+ S += " ";
+ }
+ Name->print(S);
+ }
+
+ void printRight(OutputStream &S) const override {
+ S += "(";
+ Params.printWithComma(S);
+ S += ")";
+ if (Ret)
+ Ret->printRight(S);
+
+ if (CVQuals & QualConst)
+ S += " const";
+ if (CVQuals & QualVolatile)
+ S += " volatile";
+ if (CVQuals & QualRestrict)
+ S += " restrict";
+
+ if (RefQual == FrefQualLValue)
+ S += " &";
+ else if (RefQual == FrefQualRValue)
+ S += " &&";
+
+ if (Attrs != nullptr)
+ Attrs->print(S);
+ }
+};
+
+class LiteralOperator : public Node {
+ const Node *OpName;
+
+public:
+ LiteralOperator(Node *OpName_) : Node(KLiteralOperator), OpName(OpName_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "operator\"\" ";
+ OpName->print(S);
+ }
+};
+
+class SpecialName final : public Node {
+ const StringView Special;
+ const Node *Child;
+
+public:
+ SpecialName(StringView Special_, Node* Child_)
+ : Node(KSpecialName), Special(Special_), Child(Child_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += Special;
+ Child->print(S);
+ }
+};
+
+class CtorVtableSpecialName final : public Node {
+ const Node *FirstType;
+ const Node *SecondType;
+
+public:
+ CtorVtableSpecialName(Node *FirstType_, Node *SecondType_)
+ : Node(KCtorVtableSpecialName),
+ FirstType(FirstType_), SecondType(SecondType_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "construction vtable for ";
+ FirstType->print(S);
+ S += "-in-";
+ SecondType->print(S);
+ }
+};
+
+struct NestedName : Node {
+ Node *Qual;
+ Node *Name;
+
+ NestedName(Node *Qual_, Node *Name_)
+ : Node(KNestedName), Qual(Qual_), Name(Name_) {}
+
+ StringView getBaseName() const override { return Name->getBaseName(); }
+
+ void printLeft(OutputStream &S) const override {
+ Qual->print(S);
+ S += "::";
+ Name->print(S);
+ }
+};
+
+struct LocalName : Node {
+ Node *Encoding;
+ Node *Entity;
+
+ LocalName(Node *Encoding_, Node *Entity_)
+ : Node(KLocalName), Encoding(Encoding_), Entity(Entity_) {}
+
+ void printLeft(OutputStream &S) const override {
+ Encoding->print(S);
+ S += "::";
+ Entity->print(S);
+ }
+};
+
+class QualifiedName final : public Node {
+ // qualifier::name
+ const Node *Qualifier;
+ const Node *Name;
+
+public:
+ QualifiedName(Node* Qualifier_, Node* Name_)
+ : Node(KQualifiedName), Qualifier(Qualifier_), Name(Name_) {}
+
+ StringView getBaseName() const override { return Name->getBaseName(); }
+
+ void printLeft(OutputStream &S) const override {
+ Qualifier->print(S);
+ S += "::";
+ Name->print(S);
+ }
+};
+
+class VectorType final : public Node {
+ const Node *BaseType;
+ const NodeOrString Dimension;
+ const bool IsPixel;
+
+public:
+ VectorType(NodeOrString Dimension_)
+ : Node(KVectorType), BaseType(nullptr), Dimension(Dimension_),
+ IsPixel(true) {}
+ VectorType(Node *BaseType_, NodeOrString Dimension_)
+ : Node(KVectorType), BaseType(BaseType_),
+ Dimension(Dimension_), IsPixel(false) {}
+
+ void printLeft(OutputStream &S) const override {
+ if (IsPixel) {
+ S += "pixel vector[";
+ S += Dimension.asString();
+ S += "]";
+ } else {
+ BaseType->print(S);
+ S += " vector[";
+ if (Dimension.isNode())
+ Dimension.asNode()->print(S);
+ else if (Dimension.isString())
+ S += Dimension.asString();
+ S += "]";
+ }
+ }
+};
+
+/// An unexpanded parameter pack (either in the expression or type context). If
+/// this AST is correct, this node will have a ParameterPackExpansion node above
+/// it.
+///
+/// This node is created when some <template-args> are found that apply to an
+/// <encoding>, and is stored in the TemplateParams table. In order for this to
+/// appear in the final AST, it has to be referenced via a <template-param> (i.e.,
+/// T_).
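+///
+/// For example, for "template <class... T> void f(T...)" instantiated as
+/// f<int, double>, the mangling "_Z1fIJidEEvDpT_" stores the pack
+/// {int, double} here, and it is reached through T_ under a
+/// ParameterPackExpansion.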
+class ParameterPack final : public Node {
+ NodeArray Data;
+
+ // Set up the OutputStream for a pack expansion, unless we're already
+ // expanding one (CurrentPackMax is left at the unsigned-max sentinel while
+ // no expansion is active).
+ void initializePackExpansion(OutputStream &S) const {
+ if (S.CurrentPackMax == std::numeric_limits<unsigned>::max()) {
+ S.CurrentPackMax = static_cast<unsigned>(Data.size());
+ S.CurrentPackIndex = 0;
+ }
+ }
+
+public:
+ ParameterPack(NodeArray Data_) : Node(KParameterPack), Data(Data_) {
+ ArrayCache = FunctionCache = RHSComponentCache = Cache::Unknown;
+ if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
+ return P->ArrayCache == Cache::No;
+ }))
+ ArrayCache = Cache::No;
+ if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
+ return P->FunctionCache == Cache::No;
+ }))
+ FunctionCache = Cache::No;
+ if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
+ return P->RHSComponentCache == Cache::No;
+ }))
+ RHSComponentCache = Cache::No;
+ }
+
+ bool hasRHSComponentSlow(OutputStream &S) const override {
+ initializePackExpansion(S);
+ size_t Idx = S.CurrentPackIndex;
+ return Idx < Data.size() && Data[Idx]->hasRHSComponent(S);
+ }
+ bool hasArraySlow(OutputStream &S) const override {
+ initializePackExpansion(S);
+ size_t Idx = S.CurrentPackIndex;
+ return Idx < Data.size() && Data[Idx]->hasArray(S);
+ }
+ bool hasFunctionSlow(OutputStream &S) const override {
+ initializePackExpansion(S);
+ size_t Idx = S.CurrentPackIndex;
+ return Idx < Data.size() && Data[Idx]->hasFunction(S);
+ }
+ const Node *getSyntaxNode(OutputStream &S) const override {
+ initializePackExpansion(S);
+ size_t Idx = S.CurrentPackIndex;
+ return Idx < Data.size() ? Data[Idx]->getSyntaxNode(S) : this;
+ }
+
+ void printLeft(OutputStream &S) const override {
+ initializePackExpansion(S);
+ size_t Idx = S.CurrentPackIndex;
+ if (Idx < Data.size())
+ Data[Idx]->printLeft(S);
+ }
+ void printRight(OutputStream &S) const override {
+ initializePackExpansion(S);
+ size_t Idx = S.CurrentPackIndex;
+ if (Idx < Data.size())
+ Data[Idx]->printRight(S);
+ }
+};
+
+/// A variadic template argument. This node represents an occurrence of
+/// J<something>E in some <template-args>. It isn't itself unexpanded, unless
+/// one of its Elements is. The parser inserts a ParameterPack into the
+/// TemplateParams table if the <template-args> this pack belongs to apply to an
+/// <encoding>.
+class TemplateArgumentPack final : public Node {
+ NodeArray Elements;
+public:
+ TemplateArgumentPack(NodeArray Elements_)
+ : Node(KTemplateArgumentPack), Elements(Elements_) {}
+
+ NodeArray getElements() const { return Elements; }
+
+ void printLeft(OutputStream &S) const override {
+ Elements.printWithComma(S);
+ }
+};
+
+/// A pack expansion. Below this node, there are some unexpanded ParameterPacks
+/// which each have Child->ParameterPackSize elements.
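+///
+/// Printing re-prints Child once per pack element, advancing
+/// S.CurrentPackIndex between iterations (see printLeft below).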
+class ParameterPackExpansion final : public Node {
+ const Node *Child;
+
+public:
+ ParameterPackExpansion(Node* Child_)
+ : Node(KParameterPackExpansion), Child(Child_) {}
+
+ const Node *getChild() const { return Child; }
+
+ void printLeft(OutputStream &S) const override {
+ constexpr unsigned Max = std::numeric_limits<unsigned>::max();
+ SwapAndRestore<unsigned> SavePackIdx(S.CurrentPackIndex, Max);
+ SwapAndRestore<unsigned> SavePackMax(S.CurrentPackMax, Max);
+ size_t StreamPos = S.getCurrentPosition();
+
+ // Print the first element in the pack. If Child contains a ParameterPack,
+ // it will set up S.CurrentPackMax and print the first element.
+ Child->print(S);
+
+ // No ParameterPack was found in Child. This can occur if we've found a pack
+ // expansion on a <function-param>.
+ if (S.CurrentPackMax == Max) {
+ S += "...";
+ return;
+ }
+
+ // We found a ParameterPack, but it has no elements. Erase whatever we may
+ // have printed.
+ if (S.CurrentPackMax == 0) {
+ S.setCurrentPosition(StreamPos);
+ return;
+ }
+
+ // Else, iterate through the rest of the elements in the pack.
+ for (unsigned I = 1, E = S.CurrentPackMax; I < E; ++I) {
+ S += ", ";
+ S.CurrentPackIndex = I;
+ Child->print(S);
+ }
+ }
+};
+
+class TemplateArgs final : public Node {
+ NodeArray Params;
+
+public:
+ TemplateArgs(NodeArray Params_) : Node(KTemplateArgs), Params(Params_) {}
+
+ NodeArray getParams() { return Params; }
+
+ void printLeft(OutputStream &S) const override {
+ S += "<";
+ Params.printWithComma(S);
+ if (S.back() == '>')
+ S += " ";
+ S += ">";
+ }
+};
+
+struct ForwardTemplateReference : Node {
+ size_t Index;
+ Node *Ref = nullptr;
+
+ // True while we're printing this node. It is possible (though invalid) for
+ // a forward template reference to refer to itself via a substitution. This
+ // creates a cyclic AST, which would overflow the stack while printing. To
+ // avoid that, bail out if more than one print* function is active.
+ mutable bool Printing = false;
+
+ ForwardTemplateReference(size_t Index_)
+ : Node(KForwardTemplateReference, Cache::Unknown, Cache::Unknown,
+ Cache::Unknown),
+ Index(Index_) {}
+
+ bool hasRHSComponentSlow(OutputStream &S) const override {
+ if (Printing)
+ return false;
+ SwapAndRestore<bool> SavePrinting(Printing, true);
+ return Ref->hasRHSComponent(S);
+ }
+ bool hasArraySlow(OutputStream &S) const override {
+ if (Printing)
+ return false;
+ SwapAndRestore<bool> SavePrinting(Printing, true);
+ return Ref->hasArray(S);
+ }
+ bool hasFunctionSlow(OutputStream &S) const override {
+ if (Printing)
+ return false;
+ SwapAndRestore<bool> SavePrinting(Printing, true);
+ return Ref->hasFunction(S);
+ }
+ const Node *getSyntaxNode(OutputStream &S) const override {
+ if (Printing)
+ return this;
+ SwapAndRestore<bool> SavePrinting(Printing, true);
+ return Ref->getSyntaxNode(S);
+ }
+
+ void printLeft(OutputStream &S) const override {
+ if (Printing)
+ return;
+ SwapAndRestore<bool> SavePrinting(Printing, true);
+ Ref->printLeft(S);
+ }
+ void printRight(OutputStream &S) const override {
+ if (Printing)
+ return;
+ SwapAndRestore<bool> SavePrinting(Printing, true);
+ Ref->printRight(S);
+ }
+};
+
+struct NameWithTemplateArgs : Node {
+ // name<template_args>
+ Node *Name;
+ Node *TemplateArgs;
+
+ NameWithTemplateArgs(Node *Name_, Node *TemplateArgs_)
+ : Node(KNameWithTemplateArgs), Name(Name_), TemplateArgs(TemplateArgs_) {}
+
+ StringView getBaseName() const override { return Name->getBaseName(); }
+
+ void printLeft(OutputStream &S) const override {
+ Name->print(S);
+ TemplateArgs->print(S);
+ }
+};
+
+class GlobalQualifiedName final : public Node {
+ Node *Child;
+
+public:
+ GlobalQualifiedName(Node* Child_)
+ : Node(KGlobalQualifiedName), Child(Child_) {}
+
+ StringView getBaseName() const override { return Child->getBaseName(); }
+
+ void printLeft(OutputStream &S) const override {
+ S += "::";
+ Child->print(S);
+ }
+};
+
+struct StdQualifiedName : Node {
+ Node *Child;
+
+ StdQualifiedName(Node *Child_) : Node(KStdQualifiedName), Child(Child_) {}
+
+ StringView getBaseName() const override { return Child->getBaseName(); }
+
+ void printLeft(OutputStream &S) const override {
+ S += "std::";
+ Child->print(S);
+ }
+};
+
+enum class SpecialSubKind {
+ allocator,
+ basic_string,
+ string,
+ istream,
+ ostream,
+ iostream,
+};
+
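+// ExpandedSpecialSubstitution spells out the full template form of a special
+// <substitution> (e.g. "std::basic_istream<char, std::char_traits<char> >"),
+// while SpecialSubstitution below keeps the abbreviated spelling
+// (e.g. "std::istream").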
+class ExpandedSpecialSubstitution final : public Node {
+ SpecialSubKind SSK;
+
+public:
+ ExpandedSpecialSubstitution(SpecialSubKind SSK_)
+ : Node(KExpandedSpecialSubstitution), SSK(SSK_) {}
+
+ StringView getBaseName() const override {
+ switch (SSK) {
+ case SpecialSubKind::allocator:
+ return StringView("allocator");
+ case SpecialSubKind::basic_string:
+ return StringView("basic_string");
+ case SpecialSubKind::string:
+ return StringView("basic_string");
+ case SpecialSubKind::istream:
+ return StringView("basic_istream");
+ case SpecialSubKind::ostream:
+ return StringView("basic_ostream");
+ case SpecialSubKind::iostream:
+ return StringView("basic_iostream");
+ }
+ LLVM_BUILTIN_UNREACHABLE;
+ }
+
+ void printLeft(OutputStream &S) const override {
+ switch (SSK) {
+ case SpecialSubKind::allocator:
+ S += "std::basic_string<char, std::char_traits<char>, "
+ "std::allocator<char> >";
break;
- case 'x':
- db.names.push_back("long long");
- ++first;
+ case SpecialSubKind::basic_string:
+ case SpecialSubKind::string:
+ S += "std::basic_string<char, std::char_traits<char>, "
+ "std::allocator<char> >";
break;
- case 'y':
- db.names.push_back("unsigned long long");
- ++first;
+ case SpecialSubKind::istream:
+ S += "std::basic_istream<char, std::char_traits<char> >";
break;
- case 'n':
- db.names.push_back("__int128");
- ++first;
+ case SpecialSubKind::ostream:
+ S += "std::basic_ostream<char, std::char_traits<char> >";
break;
- case 'o':
- db.names.push_back("unsigned __int128");
- ++first;
+ case SpecialSubKind::iostream:
+ S += "std::basic_iostream<char, std::char_traits<char> >";
break;
- case 'f':
- db.names.push_back("float");
- ++first;
+ }
+ }
+};
+
+class SpecialSubstitution final : public Node {
+public:
+ SpecialSubKind SSK;
+
+ SpecialSubstitution(SpecialSubKind SSK_)
+ : Node(KSpecialSubstitution), SSK(SSK_) {}
+
+ StringView getBaseName() const override {
+ switch (SSK) {
+ case SpecialSubKind::allocator:
+ return StringView("allocator");
+ case SpecialSubKind::basic_string:
+ return StringView("basic_string");
+ case SpecialSubKind::string:
+ return StringView("string");
+ case SpecialSubKind::istream:
+ return StringView("istream");
+ case SpecialSubKind::ostream:
+ return StringView("ostream");
+ case SpecialSubKind::iostream:
+ return StringView("iostream");
+ }
+ LLVM_BUILTIN_UNREACHABLE;
+ }
+
+ void printLeft(OutputStream &S) const override {
+ switch (SSK) {
+ case SpecialSubKind::allocator:
+ S += "std::allocator";
break;
- case 'd':
- db.names.push_back("double");
- ++first;
+ case SpecialSubKind::basic_string:
+ S += "std::basic_string";
break;
- case 'e':
- db.names.push_back("long double");
- ++first;
+ case SpecialSubKind::string:
+ S += "std::string";
break;
- case 'g':
- db.names.push_back("__float128");
- ++first;
+ case SpecialSubKind::istream:
+ S += "std::istream";
break;
- case 'z':
- db.names.push_back("...");
- ++first;
+ case SpecialSubKind::ostream:
+ S += "std::ostream";
break;
- case 'u': {
- const char *t = parse_source_name(first + 1, last, db);
- if (t != first + 1)
- first = t;
- } break;
- case 'D':
- if (first + 1 != last) {
- switch (first[1]) {
- case 'd':
- db.names.push_back("decimal64");
- first += 2;
- break;
- case 'e':
- db.names.push_back("decimal128");
- first += 2;
- break;
- case 'f':
- db.names.push_back("decimal32");
- first += 2;
- break;
- case 'h':
- db.names.push_back("decimal16");
- first += 2;
- break;
- case 'i':
- db.names.push_back("char32_t");
- first += 2;
- break;
- case 's':
- db.names.push_back("char16_t");
- first += 2;
- break;
- case 'a':
- db.names.push_back("auto");
- first += 2;
- break;
- case 'c':
- db.names.push_back("decltype(auto)");
- first += 2;
- break;
- case 'n':
- db.names.push_back("std::nullptr_t");
- first += 2;
- break;
- }
- }
+ case SpecialSubKind::iostream:
+ S += "std::iostream";
break;
}
}
- return first;
-}
+};
-// <CV-qualifiers> ::= [r] [V] [K]
+class CtorDtorName final : public Node {
+ const Node *Basename;
+ const bool IsDtor;
-static const char *parse_cv_qualifiers(const char *first, const char *last,
- unsigned &cv) {
- cv = 0;
- if (first != last) {
- if (*first == 'r') {
- cv |= CV_restrict;
- ++first;
- }
- if (*first == 'V') {
- cv |= CV_volatile;
- ++first;
- }
- if (*first == 'K') {
- cv |= CV_const;
- ++first;
- }
+public:
+ CtorDtorName(Node *Basename_, bool IsDtor_)
+ : Node(KCtorDtorName), Basename(Basename_), IsDtor(IsDtor_) {}
+
+ void printLeft(OutputStream &S) const override {
+ if (IsDtor)
+ S += "~";
+ S += Basename->getBaseName();
}
- return first;
-}
+};
-// <template-param> ::= T_ # first template parameter
-// ::= T <parameter-2 non-negative number> _
+class DtorName : public Node {
+ const Node *Base;
-template <class C>
-static const char *parse_template_param(const char *first, const char *last,
- C &db) {
- if (last - first >= 2) {
- if (*first == 'T') {
- if (first[1] == '_') {
- if (db.template_param.empty())
- return first;
- if (!db.template_param.back().empty()) {
- for (auto &t : db.template_param.back().front())
- db.names.push_back(t);
- first += 2;
- } else {
- db.names.push_back("T_");
- first += 2;
- db.fix_forward_references = true;
- }
- } else if (isdigit(first[1])) {
- const char *t = first + 1;
- size_t sub = static_cast<size_t>(*t - '0');
- for (++t; t != last && isdigit(*t); ++t) {
- sub *= 10;
- sub += static_cast<size_t>(*t - '0');
- }
- if (t == last || *t != '_' || db.template_param.empty())
- return first;
- ++sub;
- if (sub < db.template_param.back().size()) {
- for (auto &temp : db.template_param.back()[sub])
- db.names.push_back(temp);
- first = t + 1;
- } else {
- db.names.push_back(std::string(first, t + 1));
- first = t + 1;
- db.fix_forward_references = true;
- }
- }
- }
+public:
+ DtorName(Node *Base_) : Node(KDtorName), Base(Base_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "~";
+ Base->printLeft(S);
}
- return first;
-}
+};
-// cc <type> <expression> # const_cast<type>
-// (expression)
-
-template <class C>
-static const char *parse_const_cast_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 'c' && first[1] == 'c') {
- const char *t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- const char *t1 = parse_expression(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto expr = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back() =
- "const_cast<" + db.names.back().move_full() + ">(" + expr + ")";
- first = t1;
- }
- }
+class UnnamedTypeName : public Node {
+ const StringView Count;
+
+public:
+ UnnamedTypeName(StringView Count_) : Node(KUnnamedTypeName), Count(Count_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "'unnamed";
+ S += Count;
+ S += "\'";
}
- return first;
-}
+};
-// dc <type> <expression> # dynamic_cast<type>
-// (expression)
-
-template <class C>
-static const char *parse_dynamic_cast_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 'd' && first[1] == 'c') {
- const char *t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- const char *t1 = parse_expression(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto expr = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back() =
- "dynamic_cast<" + db.names.back().move_full() + ">(" + expr + ")";
- first = t1;
- }
- }
+class ClosureTypeName : public Node {
+ NodeArray Params;
+ StringView Count;
+
+public:
+ ClosureTypeName(NodeArray Params_, StringView Count_)
+ : Node(KClosureTypeName), Params(Params_), Count(Count_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "\'lambda";
+ S += Count;
+ S += "\'(";
+ Params.printWithComma(S);
+ S += ")";
}
- return first;
-}
+};
-// rc <type> <expression> # reinterpret_cast<type>
-// (expression)
-
-template <class C>
-static const char *parse_reinterpret_cast_expr(const char *first,
- const char *last, C &db) {
- if (last - first >= 3 && first[0] == 'r' && first[1] == 'c') {
- const char *t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- const char *t1 = parse_expression(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto expr = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back() = "reinterpret_cast<" + db.names.back().move_full() +
- ">(" + expr + ")";
- first = t1;
- }
- }
+class StructuredBindingName : public Node {
+ NodeArray Bindings;
+public:
+ StructuredBindingName(NodeArray Bindings_)
+ : Node(KStructuredBindingName), Bindings(Bindings_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += '[';
+ Bindings.printWithComma(S);
+ S += ']';
}
- return first;
-}
+};
-// sc <type> <expression> # static_cast<type>
-// (expression)
-
-template <class C>
-static const char *parse_static_cast_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 's' && first[1] == 'c') {
- const char *t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- const char *t1 = parse_expression(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto expr = db.names.back().move_full();
- db.names.pop_back();
- db.names.back() =
- "static_cast<" + db.names.back().move_full() + ">(" + expr + ")";
- first = t1;
- }
- }
+// -- Expression Nodes --
+
+struct Expr : public Node {
+ Expr(Kind K = KExpr) : Node(K) {}
+};
+
+class BinaryExpr : public Expr {
+ const Node *LHS;
+ const StringView InfixOperator;
+ const Node *RHS;
+
+public:
+ BinaryExpr(Node *LHS_, StringView InfixOperator_, Node *RHS_)
+ : LHS(LHS_), InfixOperator(InfixOperator_), RHS(RHS_) {}
+
+ void printLeft(OutputStream &S) const override {
+ // This might be a template argument expression; if so, an extra set of
+ // parens keeps a ">" operator (e.g. the "1 > 2" in "A<(1 > 2)>") from being
+ // read as the end of the template argument list.
+ if (InfixOperator == ">")
+ S += "(";
+
+ S += "(";
+ LHS->print(S);
+ S += ") ";
+ S += InfixOperator;
+ S += " (";
+ RHS->print(S);
+ S += ")";
+
+ if (InfixOperator == ">")
+ S += ")";
}
- return first;
-}
+};
-// sp <expression> # pack expansion
+class ArraySubscriptExpr : public Expr {
+ const Node *Op1;
+ const Node *Op2;
-template <class C>
-static const char *parse_pack_expansion(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 's' && first[1] == 'p') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2)
- first = t;
+public:
+ ArraySubscriptExpr(Node *Op1_, Node *Op2_) : Op1(Op1_), Op2(Op2_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "(";
+ Op1->print(S);
+ S += ")[";
+ Op2->print(S);
+ S += "]";
}
- return first;
-}
+};
-// st <type> # sizeof (a type)
+class PostfixExpr : public Expr {
+ const Node *Child;
+ const StringView Operand;
-template <class C>
-static const char *parse_sizeof_type_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 's' && first[1] == 't') {
- const char *t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back() = "sizeof (" + db.names.back().move_full() + ")";
- first = t;
- }
+public:
+ PostfixExpr(Node *Child_, StringView Operand_)
+ : Child(Child_), Operand(Operand_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "(";
+ Child->print(S);
+ S += ")";
+ S += Operand;
}
- return first;
-}
+};
+
+class ConditionalExpr : public Expr {
+ const Node *Cond;
+ const Node *Then;
+ const Node *Else;
+
+public:
+ ConditionalExpr(Node *Cond_, Node *Then_, Node *Else_)
+ : Cond(Cond_), Then(Then_), Else(Else_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "(";
+ Cond->print(S);
+ S += ") ? (";
+ Then->print(S);
+ S += ") : (";
+ Else->print(S);
+ S += ")";
+ }
+};
+
+class MemberExpr : public Expr {
+ const Node *LHS;
+ const StringView Kind;
+ const Node *RHS;
+
+public:
+ MemberExpr(Node *LHS_, StringView Kind_, Node *RHS_)
+ : LHS(LHS_), Kind(Kind_), RHS(RHS_) {}
-// sz <expr> # sizeof (a expression)
+ void printLeft(OutputStream &S) const override {
+ LHS->print(S);
+ S += Kind;
+ RHS->print(S);
+ }
+};
+
+class EnclosingExpr : public Expr {
+ const StringView Prefix;
+ const Node *Infix;
+ const StringView Postfix;
+
+public:
+ EnclosingExpr(StringView Prefix_, Node *Infix_, StringView Postfix_)
+ : Prefix(Prefix_), Infix(Infix_), Postfix(Postfix_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += Prefix;
+ Infix->print(S);
+ S += Postfix;
+ }
+};
+
+class CastExpr : public Expr {
+ // cast_kind<to>(from)
+ const StringView CastKind;
+ const Node *To;
+ const Node *From;
+
+public:
+ CastExpr(StringView CastKind_, Node *To_, Node *From_)
+ : CastKind(CastKind_), To(To_), From(From_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += CastKind;
+ S += "<";
+ To->printLeft(S);
+ S += ">(";
+ From->printLeft(S);
+ S += ")";
+ }
+};
-template <class C>
-static const char *parse_sizeof_expr_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 's' && first[1] == 'z') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back() = "sizeof (" + db.names.back().move_full() + ")";
- first = t;
+class SizeofParamPackExpr : public Expr {
+ Node *Pack;
+
+public:
+ SizeofParamPackExpr(Node *Pack_) : Pack(Pack_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "sizeof...(";
+ ParameterPackExpansion PPE(Pack);
+ PPE.printLeft(S);
+ S += ")";
+ }
+};
+
+class CallExpr : public Expr {
+ const Node *Callee;
+ NodeArray Args;
+
+public:
+ CallExpr(Node *Callee_, NodeArray Args_) : Callee(Callee_), Args(Args_) {}
+
+ void printLeft(OutputStream &S) const override {
+ Callee->print(S);
+ S += "(";
+ Args.printWithComma(S);
+ S += ")";
+ }
+};
+
+class NewExpr : public Expr {
+ // new (expr_list) type(init_list)
+ NodeArray ExprList;
+ Node *Type;
+ NodeArray InitList;
+ bool IsGlobal; // ::operator new ?
+ bool IsArray; // new[] ?
+public:
+ NewExpr(NodeArray ExprList_, Node *Type_, NodeArray InitList_, bool IsGlobal_,
+ bool IsArray_)
+ : ExprList(ExprList_), Type(Type_), InitList(InitList_),
+ IsGlobal(IsGlobal_), IsArray(IsArray_) {}
+
+ void printLeft(OutputStream &S) const override {
+ if (IsGlobal)
+ S += "::operator ";
+ S += "new";
+ if (IsArray)
+ S += "[]";
+ S += ' ';
+ if (!ExprList.empty()) {
+ S += "(";
+ ExprList.printWithComma(S);
+ S += ")";
}
+ Type->print(S);
+ if (!InitList.empty()) {
+ S += "(";
+ InitList.printWithComma(S);
+ S += ")";
+ }
+
}
- return first;
-}
+};
-// sZ <template-param> # size of a parameter
-// pack
-
-template <class C>
-static const char *parse_sizeof_param_pack_expr(const char *first,
- const char *last, C &db) {
- if (last - first >= 3 && first[0] == 's' && first[1] == 'Z' &&
- first[2] == 'T') {
- size_t k0 = db.names.size();
- const char *t = parse_template_param(first + 2, last, db);
- size_t k1 = db.names.size();
- if (t != first + 2) {
- std::string tmp("sizeof...(");
- size_t k = k0;
- if (k != k1) {
- tmp += db.names[k].move_full();
- for (++k; k != k1; ++k)
- tmp += ", " + db.names[k].move_full();
- }
- tmp += ")";
- for (; k1 != k0; --k1)
- db.names.pop_back();
- db.names.push_back(std::move(tmp));
- first = t;
+class DeleteExpr : public Expr {
+ Node *Op;
+ bool IsGlobal;
+ bool IsArray;
+
+public:
+ DeleteExpr(Node *Op_, bool IsGlobal_, bool IsArray_)
+ : Op(Op_), IsGlobal(IsGlobal_), IsArray(IsArray_) {}
+
+ void printLeft(OutputStream &S) const override {
+ if (IsGlobal)
+ S += "::";
+ S += "delete";
+ if (IsArray)
+ S += "[] ";
+ Op->print(S);
+ }
+};
+
+class PrefixExpr : public Expr {
+ StringView Prefix;
+ Node *Child;
+
+public:
+ PrefixExpr(StringView Prefix_, Node *Child_) : Prefix(Prefix_), Child(Child_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += Prefix;
+ S += "(";
+ Child->print(S);
+ S += ")";
+ }
+};
+
+class FunctionParam : public Expr {
+ StringView Number;
+
+public:
+ FunctionParam(StringView Number_) : Number(Number_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "fp";
+ S += Number;
+ }
+};
+
+class ConversionExpr : public Expr {
+ const Node *Type;
+ NodeArray Expressions;
+
+public:
+ ConversionExpr(const Node *Type_, NodeArray Expressions_)
+ : Type(Type_), Expressions(Expressions_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "(";
+ Type->print(S);
+ S += ")(";
+ Expressions.printWithComma(S);
+ S += ")";
+ }
+};
+
+class InitListExpr : public Expr {
+ Node *Ty;
+ NodeArray Inits;
+public:
+ InitListExpr(Node *Ty_, NodeArray Inits_) : Ty(Ty_), Inits(Inits_) {}
+
+ void printLeft(OutputStream &S) const override {
+ if (Ty)
+ Ty->print(S);
+ S += '{';
+ Inits.printWithComma(S);
+ S += '}';
+ }
+};
+
+class BracedExpr : public Expr {
+ Node *Elem;
+ Node *Init;
+ bool IsArray;
+public:
+ BracedExpr(Node *Elem_, Node *Init_, bool IsArray_)
+ : Expr(KBracedExpr), Elem(Elem_), Init(Init_), IsArray(IsArray_) {}
+
+ void printLeft(OutputStream &S) const override {
+ if (IsArray) {
+ S += '[';
+ Elem->print(S);
+ S += ']';
+ } else {
+ S += '.';
+ Elem->print(S);
}
+ if (Init->getKind() != KBracedExpr && Init->getKind() != KBracedRangeExpr)
+ S += " = ";
+ Init->print(S);
}
- return first;
-}
+};
+
+class BracedRangeExpr : public Expr {
+ Node *First;
+ Node *Last;
+ Node *Init;
+public:
+ BracedRangeExpr(Node *First_, Node *Last_, Node *Init_)
+ : Expr(KBracedRangeExpr), First(First_), Last(Last_), Init(Init_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += '[';
+ First->print(S);
+ S += " ... ";
+ Last->print(S);
+ S += ']';
+ if (Init->getKind() != KBracedExpr && Init->getKind() != KBracedRangeExpr)
+ S += " = ";
+ Init->print(S);
+ }
+};
-// <function-param> ::= fp <top-level CV-qualifiers> _ # L == 0, first parameter
-// ::= fp <top-level CV-qualifiers> <parameter-2 non-negative
-// number> _ # L == 0, second and later parameters
-// ::= fL <L-1 non-negative number> p <top-level CV-qualifiers>
-// _ # L > 0, first parameter
-// ::= fL <L-1 non-negative number> p <top-level CV-qualifiers>
-// <parameter-2 non-negative number> _ # L > 0, second and
-// later parameters
-
-template <class C>
-static const char *parse_function_param(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && *first == 'f') {
- if (first[1] == 'p') {
- unsigned cv;
- const char *t = parse_cv_qualifiers(first + 2, last, cv);
- const char *t1 = parse_number(t, last);
- if (t1 != last && *t1 == '_') {
- db.names.push_back("fp" + std::string(t, t1));
- first = t1 + 1;
+struct FoldExpr : Expr {
+ Node *Pack, *Init;
+ StringView OperatorName;
+ bool IsLeftFold;
+
+ FoldExpr(bool IsLeftFold_, StringView OperatorName_, Node *Pack_, Node *Init_)
+ : Pack(Pack_), Init(Init_), OperatorName(OperatorName_),
+ IsLeftFold(IsLeftFold_) {}
+
+ void printLeft(OutputStream &S) const override {
+ auto PrintPack = [&] {
+ S += '(';
+ ParameterPackExpansion(Pack).print(S);
+ S += ')';
+ };
+
+ S += '(';
+
+ if (IsLeftFold) {
+ // init op ... op pack
+ if (Init != nullptr) {
+ Init->print(S);
+ S += ' ';
+ S += OperatorName;
+ S += ' ';
}
- } else if (first[1] == 'L') {
- unsigned cv;
- const char *t0 = parse_number(first + 2, last);
- if (t0 != last && *t0 == 'p') {
- ++t0;
- const char *t = parse_cv_qualifiers(t0, last, cv);
- const char *t1 = parse_number(t, last);
- if (t1 != last && *t1 == '_') {
- db.names.push_back("fp" + std::string(t, t1));
- first = t1 + 1;
- }
+ // ... op pack
+ S += "... ";
+ S += OperatorName;
+ S += ' ';
+ PrintPack();
+ } else { // !IsLeftFold
+ // pack op ...
+ PrintPack();
+ S += ' ';
+ S += OperatorName;
+ S += " ...";
+ // pack op ... op init
+ if (Init != nullptr) {
+ S += ' ';
+ S += OperatorName;
+ S += ' ';
+ Init->print(S);
}
}
+ S += ')';
}
- return first;
-}
+};
-// sZ <function-param> # size of a function
-// parameter pack
+class ThrowExpr : public Expr {
+ const Node *Op;
-template <class C>
-static const char *parse_sizeof_function_param_pack_expr(const char *first,
- const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 's' && first[1] == 'Z' &&
- first[2] == 'f') {
- const char *t = parse_function_param(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back() = "sizeof...(" + db.names.back().move_full() + ")";
- first = t;
- }
+public:
+ ThrowExpr(Node *Op_) : Op(Op_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "throw ";
+ Op->print(S);
}
- return first;
-}
+};
-// te <expression> # typeid (expression)
-// ti <type> # typeid (type)
-
-template <class C>
-static const char *parse_typeid_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 't' &&
- (first[1] == 'e' || first[1] == 'i')) {
- const char *t;
- if (first[1] == 'e')
- t = parse_expression(first + 2, last, db);
- else
- t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back() = "typeid(" + db.names.back().move_full() + ")";
- first = t;
- }
+class BoolExpr : public Expr {
+ bool Value;
+
+public:
+ BoolExpr(bool Value_) : Value(Value_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += Value ? StringView("true") : StringView("false");
}
- return first;
-}
+};
+
+class IntegerCastExpr : public Expr {
+ // ty(integer)
+ Node *Ty;
+ StringView Integer;
+
+public:
+ IntegerCastExpr(Node *Ty_, StringView Integer_)
+ : Ty(Ty_), Integer(Integer_) {}
+
+ void printLeft(OutputStream &S) const override {
+ S += "(";
+ Ty->print(S);
+ S += ")";
+ S += Integer;
+ }
+};
+
+class IntegerExpr : public Expr {
+ StringView Type;
+ StringView Value;
-// tw <expression> # throw expression
+public:
+ IntegerExpr(StringView Type_, StringView Value_) : Type(Type_), Value(Value_) {}
-template <class C>
-static const char *parse_throw_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 't' && first[1] == 'w') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back() = "throw " + db.names.back().move_full();
- first = t;
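+ // Short type strings (3 chars or fewer, e.g. literal suffixes like "ul")
+ // are printed after the value; longer type names are printed as a cast. A
+ // leading 'n' on the value marks a negative literal.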
+ void printLeft(OutputStream &S) const override {
+ if (Type.size() > 3) {
+ S += "(";
+ S += Type;
+ S += ")";
}
+
+ if (Value[0] == 'n') {
+ S += "-";
+ S += Value.dropFront(1);
+ } else
+ S += Value;
+
+ if (Type.size() <= 3)
+ S += Type;
}
- return first;
-}
+};
+
+template <class Float> struct FloatData;
-// ds <expression> <expression> # expr.*expr
-
-template <class C>
-static const char *parse_dot_star_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 'd' && first[1] == 's') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2) {
- const char *t1 = parse_expression(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto expr = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += ".*" + expr;
- first = t1;
+template <class Float> class FloatExpr : public Expr {
+ const StringView Contents;
+
+public:
+ FloatExpr(StringView Contents_) : Contents(Contents_) {}
+
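+ // Contents holds the literal's bytes as lowercase hex digits, most
+ // significant byte first (e.g. a float payload of "40490fdb" decodes to the
+ // value of pi). Decode into the native representation, byte-swapping on
+ // little-endian hosts, and format with the printf spec from
+ // FloatData<Float>.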
+ void printLeft(OutputStream &s) const override {
+ const char *first = Contents.begin();
+ const char *last = Contents.end() + 1;
+
+ const size_t N = FloatData<Float>::mangled_size;
+ if (static_cast<std::size_t>(last - first) > N) {
+ last = first + N;
+ union {
+ Float value;
+ char buf[sizeof(Float)];
+ };
+ const char *t = first;
+ char *e = buf;
+ for (; t != last; ++t, ++e) {
+ unsigned d1 = isdigit(*t) ? static_cast<unsigned>(*t - '0')
+ : static_cast<unsigned>(*t - 'a' + 10);
+ ++t;
+ unsigned d0 = isdigit(*t) ? static_cast<unsigned>(*t - '0')
+ : static_cast<unsigned>(*t - 'a' + 10);
+ *e = static_cast<char>((d1 << 4) + d0);
}
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ std::reverse(buf, e);
+#endif
+ char num[FloatData<Float>::max_demangled_size] = {0};
+ int n = snprintf(num, sizeof(num), FloatData<Float>::spec, value);
+ s += StringView(num, num + n);
}
}
- return first;
-}
+};
-// <simple-id> ::= <source-name> [ <template-args> ]
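+// A simple bump allocator: requests are rounded up to a multiple of 16 and
+// carved out of 4k blocks, the first of which is embedded in the allocator
+// itself; anything larger than a block gets its own malloc'd block.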
+class BumpPointerAllocator {
+ struct BlockMeta {
+ BlockMeta* Next;
+ size_t Current;
+ };
-template <class C>
-static const char *parse_simple_id(const char *first, const char *last, C &db) {
- if (first != last) {
- const char *t = parse_source_name(first, last, db);
- if (t != first) {
- const char *t1 = parse_template_args(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto args = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += std::move(args);
- }
- first = t1;
- } else
- first = t;
+ static constexpr size_t AllocSize = 4096;
+ static constexpr size_t UsableAllocSize = AllocSize - sizeof(BlockMeta);
+
+ alignas(long double) char InitialBuffer[AllocSize];
+ BlockMeta* BlockList = nullptr;
+
+ void grow() {
+ char* NewMeta = static_cast<char *>(std::malloc(AllocSize));
+ if (NewMeta == nullptr)
+ std::terminate();
+ BlockList = new (NewMeta) BlockMeta{BlockList, 0};
}
- return first;
-}
-// <unresolved-type> ::= <template-param>
-// ::= <decltype>
-// ::= <substitution>
+ void* allocateMassive(size_t NBytes) {
+ NBytes += sizeof(BlockMeta);
+ BlockMeta* NewMeta = reinterpret_cast<BlockMeta*>(std::malloc(NBytes));
+ if (NewMeta == nullptr)
+ std::terminate();
+ BlockList->Next = new (NewMeta) BlockMeta{BlockList->Next, 0};
+ return static_cast<void*>(NewMeta + 1);
+ }
-template <class C>
-static const char *parse_unresolved_type(const char *first, const char *last,
- C &db) {
- if (first != last) {
- const char *t = first;
- switch (*first) {
- case 'T': {
- size_t k0 = db.names.size();
- t = parse_template_param(first, last, db);
- size_t k1 = db.names.size();
- if (t != first && k1 == k0 + 1) {
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- } else {
- for (; k1 != k0; --k1)
- db.names.pop_back();
- }
- break;
+public:
+ BumpPointerAllocator()
+ : BlockList(new (InitialBuffer) BlockMeta{nullptr, 0}) {}
+
+ void* allocate(size_t N) {
+ N = (N + 15u) & ~15u;
+ if (N + BlockList->Current >= UsableAllocSize) {
+ if (N > UsableAllocSize)
+ return allocateMassive(N);
+ grow();
}
- case 'D':
- t = parse_decltype(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- }
- break;
- case 'S':
- t = parse_substitution(first, last, db);
- if (t != first)
- first = t;
- else {
- if (last - first > 2 && first[1] == 't') {
- t = parse_unqualified_name(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "std::");
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- }
- }
- }
- break;
+ BlockList->Current += N;
+ return static_cast<void*>(reinterpret_cast<char*>(BlockList + 1) +
+ BlockList->Current - N);
+ }
+
+ void reset() {
+ while (BlockList) {
+ BlockMeta* Tmp = BlockList;
+ BlockList = BlockList->Next;
+ if (reinterpret_cast<char*>(Tmp) != InitialBuffer)
+ std::free(Tmp);
}
+ BlockList = new (InitialBuffer) BlockMeta{nullptr, 0};
}
- return first;
-}
-// <destructor-name> ::= <unresolved-type> # e.g.,
-// ~T or ~decltype(f())
-// ::= <simple-id> # e.g.,
-// ~A<2*N>
-
-template <class C>
-static const char *parse_destructor_name(const char *first, const char *last,
- C &db) {
- if (first != last) {
- const char *t = parse_unresolved_type(first, last, db);
- if (t == first)
- t = parse_simple_id(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "~");
- first = t;
- }
- }
- return first;
-}
+ ~BumpPointerAllocator() { reset(); }
+};
-// <base-unresolved-name> ::= <simple-id> #
-// unresolved name
-// extension ::= <operator-name> #
-// unresolved operator-function-id
-// extension ::= <operator-name> <template-args> #
-// unresolved operator template-id
-// ::= on <operator-name> #
-// unresolved operator-function-id
-// ::= on <operator-name> <template-args> #
-// unresolved operator template-id
-// ::= dn <destructor-name> #
-// destructor or pseudo-destructor;
-// #
-// e.g.
-// ~X or
-// ~X<N-1>
-
-template <class C>
-static const char *parse_base_unresolved_name(const char *first,
- const char *last, C &db) {
- if (last - first >= 2) {
- if ((first[0] == 'o' || first[0] == 'd') && first[1] == 'n') {
- if (first[0] == 'o') {
- const char *t = parse_operator_name(first + 2, last, db);
- if (t != first + 2) {
- first = parse_template_args(t, last, db);
- if (first != t) {
- if (db.names.size() < 2)
- return first;
- auto args = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += std::move(args);
- }
- }
- } else {
- const char *t = parse_destructor_name(first + 2, last, db);
- if (t != first + 2)
- first = t;
- }
+template <class T, size_t N>
+class PODSmallVector {
+ static_assert(std::is_pod<T>::value,
+ "T is required to be a plain old data type");
+
+ T* First;
+ T* Last;
+ T* Cap;
+ T Inline[N];
+
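+ // Storage starts in the inline buffer; once it is exhausted, reserve()
+ // moves the elements to (or grows) a heap allocation.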
+ bool isInline() const { return First == Inline; }
+
+ void clearInline() {
+ First = Inline;
+ Last = Inline;
+ Cap = Inline + N;
+ }
+
+ void reserve(size_t NewCap) {
+ size_t S = size();
+ if (isInline()) {
+ auto* Tmp = static_cast<T*>(std::malloc(NewCap * sizeof(T)));
+ if (Tmp == nullptr)
+ std::terminate();
+ std::copy(First, Last, Tmp);
+ First = Tmp;
} else {
- const char *t = parse_simple_id(first, last, db);
- if (t == first) {
- t = parse_operator_name(first, last, db);
- if (t != first) {
- first = parse_template_args(t, last, db);
- if (first != t) {
- if (db.names.size() < 2)
- return first;
- auto args = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += std::move(args);
- }
- }
- } else
- first = t;
+ First = static_cast<T*>(std::realloc(First, NewCap * sizeof(T)));
+ if (First == nullptr)
+ std::terminate();
}
+ Last = First + S;
+ Cap = First + NewCap;
}
- return first;
-}
-// <unresolved-qualifier-level> ::= <simple-id>
+public:
+ PODSmallVector() : First(Inline), Last(First), Cap(Inline + N) {}
-template <class C>
-static const char *parse_unresolved_qualifier_level(const char *first,
- const char *last, C &db) {
- return parse_simple_id(first, last, db);
-}
+ PODSmallVector(const PODSmallVector&) = delete;
+ PODSmallVector& operator=(const PODSmallVector&) = delete;
-// <unresolved-name>
-// extension ::= srN <unresolved-type> [<template-args>]
-// <unresolved-qualifier-level>* E <base-unresolved-name>
-// ::= [gs] <base-unresolved-name> # x or
-// (with "gs") ::x
-// ::= [gs] sr <unresolved-qualifier-level>+ E
-// <base-unresolved-name>
-// # A::x,
-// N::y,
-// A<T>::z;
-// "gs"
-// means
-// leading
-// "::"
-// ::= sr <unresolved-type> <base-unresolved-name> # T::x
-// / decltype(p)::x
-// extension ::= sr <unresolved-type> <template-args>
-// <base-unresolved-name>
-// #
-// T::N::x
-// /decltype(p)::N::x
-// (ignored) ::= srN <unresolved-type> <unresolved-qualifier-level>+ E
-// <base-unresolved-name>
-
-template <class C>
-static const char *parse_unresolved_name(const char *first, const char *last,
- C &db) {
- if (last - first > 2) {
- const char *t = first;
- bool global = false;
- if (t[0] == 'g' && t[1] == 's') {
- global = true;
- t += 2;
- }
- const char *t2 = parse_base_unresolved_name(t, last, db);
- if (t2 != t) {
- if (global) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "::");
- }
- first = t2;
- } else if (last - t > 2 && t[0] == 's' && t[1] == 'r') {
- if (t[2] == 'N') {
- t += 3;
- const char *t1 = parse_unresolved_type(t, last, db);
- if (t1 == t || t1 == last)
- return first;
- t = t1;
- t1 = parse_template_args(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto args = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += std::move(args);
- t = t1;
- if (t == last) {
- db.names.pop_back();
- return first;
- }
- }
- while (*t != 'E') {
- t1 = parse_unresolved_qualifier_level(t, last, db);
- if (t1 == t || t1 == last || db.names.size() < 2)
- return first;
- auto s = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += "::" + std::move(s);
- t = t1;
- }
- ++t;
- t1 = parse_base_unresolved_name(t, last, db);
- if (t1 == t) {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- if (db.names.size() < 2)
- return first;
- auto s = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += "::" + std::move(s);
- first = t1;
- } else {
- t += 2;
- const char *t1 = parse_unresolved_type(t, last, db);
- if (t1 != t) {
- t = t1;
- t1 = parse_template_args(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto args = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += std::move(args);
- t = t1;
- }
- t1 = parse_base_unresolved_name(t, last, db);
- if (t1 == t) {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- if (db.names.size() < 2)
- return first;
- auto s = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += "::" + std::move(s);
- first = t1;
- } else {
- t1 = parse_unresolved_qualifier_level(t, last, db);
- if (t1 == t || t1 == last)
- return first;
- t = t1;
- if (global) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "::");
- }
- while (*t != 'E') {
- t1 = parse_unresolved_qualifier_level(t, last, db);
- if (t1 == t || t1 == last || db.names.size() < 2)
- return first;
- auto s = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += "::" + std::move(s);
- t = t1;
- }
- ++t;
- t1 = parse_base_unresolved_name(t, last, db);
- if (t1 == t) {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- if (db.names.size() < 2)
- return first;
- auto s = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += "::" + std::move(s);
- first = t1;
- }
- }
+ PODSmallVector(PODSmallVector&& Other) : PODSmallVector() {
+ if (Other.isInline()) {
+ std::copy(Other.begin(), Other.end(), First);
+ Last = First + Other.size();
+ Other.clear();
+ return;
}
+
+ First = Other.First;
+ Last = Other.Last;
+ Cap = Other.Cap;
+ Other.clearInline();
}
- return first;
-}
-// dt <expression> <unresolved-name> # expr.name
-
-template <class C>
-static const char *parse_dot_expr(const char *first, const char *last, C &db) {
- if (last - first >= 3 && first[0] == 'd' && first[1] == 't') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2) {
- const char *t1 = parse_unresolved_name(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back().first += "." + name;
- first = t1;
+ PODSmallVector& operator=(PODSmallVector&& Other) {
+ if (Other.isInline()) {
+ if (!isInline()) {
+ std::free(First);
+ clearInline();
}
+ std::copy(Other.begin(), Other.end(), First);
+ Last = First + Other.size();
+ Other.clear();
+ return *this;
}
- }
- return first;
-}
-// cl <expression>+ E # call
-
-template <class C>
-static const char *parse_call_expr(const char *first, const char *last, C &db) {
- if (last - first >= 4 && first[0] == 'c' && first[1] == 'l') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2) {
- if (t == last)
- return first;
- if (db.names.empty())
- return first;
- db.names.back().first += db.names.back().second;
- db.names.back().second = std::string();
- db.names.back().first.append("(");
- bool first_expr = true;
- while (*t != 'E') {
- const char *t1 = parse_expression(t, last, db);
- if (t1 == t || t1 == last)
- return first;
- if (db.names.empty())
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- if (!tmp.empty()) {
- if (db.names.empty())
- return first;
- if (!first_expr) {
- db.names.back().first.append(", ");
- first_expr = false;
- }
- db.names.back().first.append(tmp);
- }
- t = t1;
- }
- ++t;
- if (db.names.empty())
- return first;
- db.names.back().first.append(")");
- first = t;
+ if (isInline()) {
+ First = Other.First;
+ Last = Other.Last;
+ Cap = Other.Cap;
+ Other.clearInline();
+ return *this;
}
+
+ std::swap(First, Other.First);
+ std::swap(Last, Other.Last);
+ std::swap(Cap, Other.Cap);
+ Other.clear();
+ return *this;
}
- return first;
-}
-// [gs] nw <expression>* _ <type> E # new (expr-list) type
-// [gs] nw <expression>* _ <type> <initializer> # new (expr-list) type
-// (init)
-// [gs] na <expression>* _ <type> E # new[] (expr-list) type
-// [gs] na <expression>* _ <type> <initializer> # new[] (expr-list) type
-// (init)
-// <initializer> ::= pi <expression>* E # parenthesized
-// initialization
-
-template <class C>
-static const char *parse_new_expr(const char *first, const char *last, C &db) {
- if (last - first >= 4) {
- const char *t = first;
- bool parsed_gs = false;
- if (t[0] == 'g' && t[1] == 's') {
- t += 2;
- parsed_gs = true;
- }
- if (t[0] == 'n' && (t[1] == 'w' || t[1] == 'a')) {
- bool is_array = t[1] == 'a';
- t += 2;
- if (t == last)
- return first;
- bool has_expr_list = false;
- bool first_expr = true;
- while (*t != '_') {
- const char *t1 = parse_expression(t, last, db);
- if (t1 == t || t1 == last)
- return first;
- has_expr_list = true;
- if (!first_expr) {
- if (db.names.empty())
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- if (!tmp.empty()) {
- if (db.names.empty())
- return first;
- db.names.back().first.append(", ");
- db.names.back().first.append(tmp);
- first_expr = false;
- }
- }
- t = t1;
- }
- ++t;
- const char *t1 = parse_type(t, last, db);
- if (t1 == t || t1 == last)
- return first;
- t = t1;
- bool has_init = false;
- if (last - t >= 3 && t[0] == 'p' && t[1] == 'i') {
- t += 2;
- has_init = true;
- first_expr = true;
- while (*t != 'E') {
- t1 = parse_expression(t, last, db);
- if (t1 == t || t1 == last)
- return first;
- if (!first_expr) {
- if (db.names.empty())
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- if (!tmp.empty()) {
- if (db.names.empty())
- return first;
- db.names.back().first.append(", ");
- db.names.back().first.append(tmp);
- first_expr = false;
- }
- }
- t = t1;
- }
- }
- if (*t != 'E')
- return first;
- std::string init_list;
- if (has_init) {
- if (db.names.empty())
- return first;
- init_list = db.names.back().move_full();
- db.names.pop_back();
- }
- if (db.names.empty())
- return first;
- auto type = db.names.back().move_full();
- db.names.pop_back();
- std::string expr_list;
- if (has_expr_list) {
- if (db.names.empty())
- return first;
- expr_list = db.names.back().move_full();
- db.names.pop_back();
- }
- std::string r;
- if (parsed_gs)
- r = "::";
- if (is_array)
- r += "[] ";
- else
- r += " ";
- if (has_expr_list)
- r += "(" + expr_list + ") ";
- r += type;
- if (has_init)
- r += " (" + init_list + ")";
- db.names.push_back(std::move(r));
- first = t + 1;
- }
+ void push_back(const T& Elem) {
+ if (Last == Cap)
+ reserve(size() * 2);
+ *Last++ = Elem;
}
- return first;
-}
-// cv <type> <expression> # conversion with one
-// argument
-// cv <type> _ <expression>* E # conversion with a
-// different number of arguments
-
-template <class C>
-static const char *parse_conversion_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 'c' && first[1] == 'v') {
- bool try_to_parse_template_args = db.try_to_parse_template_args;
- db.try_to_parse_template_args = false;
- const char *t = parse_type(first + 2, last, db);
- db.try_to_parse_template_args = try_to_parse_template_args;
- if (t != first + 2 && t != last) {
- if (*t != '_') {
- const char *t1 = parse_expression(t, last, db);
- if (t1 == t)
- return first;
- t = t1;
- } else {
- ++t;
- if (t == last)
- return first;
- if (*t == 'E')
- db.names.emplace_back();
- else {
- bool first_expr = true;
- while (*t != 'E') {
- const char *t1 = parse_expression(t, last, db);
- if (t1 == t || t1 == last)
- return first;
- if (!first_expr) {
- if (db.names.empty())
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- if (!tmp.empty()) {
- if (db.names.empty())
- return first;
- db.names.back().first.append(", ");
- db.names.back().first.append(tmp);
- first_expr = false;
- }
- }
- t = t1;
- }
- }
- ++t;
- }
- if (db.names.size() < 2)
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- db.names.back() = "(" + db.names.back().move_full() + ")(" + tmp + ")";
- first = t;
- }
+ void pop_back() {
+ assert(Last != First && "Popping empty vector!");
+ --Last;
}
- return first;
-}
-// pt <expression> <expression> # expr->name
-
-template <class C>
-static const char *parse_arrow_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 'p' && first[1] == 't') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2) {
- const char *t1 = parse_expression(t, last, db);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += "->";
- db.names.back().first += tmp;
- first = t1;
- }
+ void dropBack(size_t Index) {
+ assert(Index <= size() && "dropBack() can't expand!");
+ Last = First + Index;
+ }
+
+ T* begin() { return First; }
+ T* end() { return Last; }
+
+ bool empty() const { return First == Last; }
+ size_t size() const { return static_cast<size_t>(Last - First); }
+ T& back() {
+ assert(Last != First && "Calling back() on empty vector!");
+ return *(Last - 1);
+ }
+ T& operator[](size_t Index) {
+ assert(Index < size() && "Invalid access!");
+ return *(begin() + Index);
+ }
+ void clear() { Last = First; }
+
+ ~PODSmallVector() {
+ if (!isInline())
+ std::free(First);
+ }
+};
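
PODSmallVector keeps its first N elements in a stack-resident inline buffer and only switches to malloc'd storage once that buffer fills. The following standalone sketch is an illustrative reduction of the same inline-then-heap growth pattern; it is not part of the committed code and all names in it are invented for the example.

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>

int main() {
  // Inline buffer of 4 ints; once it is full, spill to malloc'd storage and
  // double the capacity, mirroring what PODSmallVector::reserve() does.
  // memcpy is sufficient because the element type is POD.
  int Inline[4];
  int *First = Inline, *Last = Inline, *Cap = Inline + 4;
  for (int I = 0; I != 100; ++I) {
    if (Last == Cap) {
      std::size_t Size = static_cast<std::size_t>(Last - First);
      int *NewFirst = static_cast<int *>(std::malloc(2 * Size * sizeof(int)));
      assert(NewFirst && "allocation failed");
      std::memcpy(NewFirst, First, Size * sizeof(int));
      if (First != Inline)      // only free storage we allocated ourselves
        std::free(First);
      First = NewFirst;
      Last = NewFirst + Size;
      Cap = NewFirst + 2 * Size;
    }
    *Last++ = I;
  }
  assert(Last - First == 100 && First[99] == 99);
  if (First != Inline)
    std::free(First);
  return 0;
}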
+
+struct Db {
+ const char *First;
+ const char *Last;
+
+ // Name stack; the parser uses it to hold temporary names that have been
+ // parsed. The parser collapses multiple names into new nodes to construct
+ // the AST. Once the parser is finished, Names.size() == 1.
+ PODSmallVector<Node *, 32> Names;
+
+ // Substitution table. Itanium supports name substitutions as a means of
+ // compression. A string such as "S42_" refers to an entry in this table,
+ // indexed by the base-36 number between the 'S' and the '_'.
+ PODSmallVector<Node *, 32> Subs;
+
+ // Template parameter table. Like the above, but referenced like "T42_".
+ // This has a smaller inline size than Subs and Names so that it normally
+ // stays on the stack without spilling to the heap.
+ PODSmallVector<Node *, 8> TemplateParams;
+
+ // Set of unresolved forward <template-param> references. These can occur in a
+ // conversion operator's type, and are resolved in the enclosing <encoding>.
+ PODSmallVector<ForwardTemplateReference *, 4> ForwardTemplateRefs;
+
+ bool TryToParseTemplateArgs = true;
+ bool PermitForwardTemplateReferences = false;
+ bool ParsingLambdaParams = false;
+
+ BumpPointerAllocator ASTAllocator;
+
+ Db(const char *First_, const char *Last_) : First(First_), Last(Last_) {}
+
+ void reset(const char *First_, const char *Last_) {
+ First = First_;
+ Last = Last_;
+ Names.clear();
+ Subs.clear();
+ TemplateParams.clear();
+ ParsingLambdaParams = false;
+ TryToParseTemplateArgs = true;
+ PermitForwardTemplateReferences = false;
+ ASTAllocator.reset();
+ }
+
+ template <class T, class... Args> T *make(Args &&... args) {
+ return new (ASTAllocator.allocate(sizeof(T)))
+ T(std::forward<Args>(args)...);
+ }
+
+ template <class It> NodeArray makeNodeArray(It begin, It end) {
+ size_t sz = static_cast<size_t>(end - begin);
+ void *mem = ASTAllocator.allocate(sizeof(Node *) * sz);
+ Node **data = new (mem) Node *[sz];
+ std::copy(begin, end, data);
+ return NodeArray(data, sz);
+ }
+
+ NodeArray popTrailingNodeArray(size_t FromPosition) {
+ assert(FromPosition <= Names.size());
+ NodeArray res =
+ makeNodeArray(Names.begin() + (long)FromPosition, Names.end());
+ Names.dropBack(FromPosition);
+ return res;
+ }
+
+ bool consumeIf(StringView S) {
+ if (StringView(First, Last).startsWith(S)) {
+ First += S.size();
+ return true;
}
+ return false;
}
- return first;
-}
-// <ref-qualifier> ::= R # & ref-qualifier
-// <ref-qualifier> ::= O # && ref-qualifier
-
-// <function-type> ::= F [Y] <bare-function-type> [<ref-qualifier>] E
-
-template <class C>
-static const char *parse_function_type(const char *first, const char *last,
- C &db) {
- if (first != last && *first == 'F') {
- const char *t = first + 1;
- if (t != last) {
- if (*t == 'Y') {
- /* extern "C" */
- if (++t == last)
- return first;
- }
- const char *t1 = parse_type(t, last, db);
- if (t1 != t) {
- t = t1;
- std::string sig("(");
- int ref_qual = 0;
- while (true) {
- if (t == last) {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- if (*t == 'E') {
- ++t;
- break;
- }
- if (*t == 'v') {
- ++t;
- continue;
- }
- if (*t == 'R' && t + 1 != last && t[1] == 'E') {
- ref_qual = 1;
- ++t;
- continue;
- }
- if (*t == 'O' && t + 1 != last && t[1] == 'E') {
- ref_qual = 2;
- ++t;
- continue;
- }
- size_t k0 = db.names.size();
- t1 = parse_type(t, last, db);
- size_t k1 = db.names.size();
- if (t1 == t || t1 == last)
- return first;
- for (size_t k = k0; k < k1; ++k) {
- if (sig.size() > 1)
- sig += ", ";
- sig += db.names[k].move_full();
- }
- for (size_t k = k0; k < k1; ++k)
- db.names.pop_back();
- t = t1;
- }
- sig += ")";
- switch (ref_qual) {
- case 1:
- sig += " &";
- break;
- case 2:
- sig += " &&";
- break;
- }
- if (db.names.empty())
- return first;
- db.names.back().first += " ";
- db.names.back().second.insert(0, sig);
- first = t;
- }
+ bool consumeIf(char C) {
+ if (First != Last && *First == C) {
+ ++First;
+ return true;
}
+ return false;
}
- return first;
-}
-// <pointer-to-member-type> ::= M <class type> <member type>
+ char consume() { return First != Last ? *First++ : '\0'; }
-template <class C>
-static const char *parse_pointer_to_member_type(const char *first,
- const char *last, C &db) {
- if (first != last && *first == 'M') {
- const char *t = parse_type(first + 1, last, db);
- if (t != first + 1) {
- const char *t2 = parse_type(t, last, db);
- if (t2 != t) {
- if (db.names.size() < 2)
- return first;
- auto func = std::move(db.names.back());
- db.names.pop_back();
- auto class_type = std::move(db.names.back());
- if (!func.second.empty() && func.second.front() == '(') {
- db.names.back().first =
- std::move(func.first) + "(" + class_type.move_full() + "::*";
- db.names.back().second = ")" + std::move(func.second);
- } else {
- db.names.back().first =
- std::move(func.first) + " " + class_type.move_full() + "::*";
- db.names.back().second = std::move(func.second);
- }
- first = t2;
- }
+ char look(unsigned Lookahead = 0) {
+ if (static_cast<size_t>(Last - First) <= Lookahead)
+ return '\0';
+ return First[Lookahead];
+ }
+
+ size_t numLeft() const { return static_cast<size_t>(Last - First); }
+
+ StringView parseNumber(bool AllowNegative = false);
+ Qualifiers parseCVQualifiers();
+ bool parsePositiveInteger(size_t *Out);
+ StringView parseBareSourceName();
+
+ bool parseSeqId(size_t *Out);
+ Node *parseSubstitution();
+ Node *parseTemplateParam();
+ Node *parseTemplateArgs(bool TagTemplates = false);
+ Node *parseTemplateArg();
+
+ /// Parse the <expr> production.
+ Node *parseExpr();
+ Node *parsePrefixExpr(StringView Kind);
+ Node *parseBinaryExpr(StringView Kind);
+ Node *parseIntegerLiteral(StringView Lit);
+ Node *parseExprPrimary();
+ template <class Float> Node *parseFloatingLiteral();
+ Node *parseFunctionParam();
+ Node *parseNewExpr();
+ Node *parseConversionExpr();
+ Node *parseBracedExpr();
+ Node *parseFoldExpr();
+
+ /// Parse the <type> production.
+ Node *parseType();
+ Node *parseFunctionType();
+ Node *parseVectorType();
+ Node *parseDecltype();
+ Node *parseArrayType();
+ Node *parsePointerToMemberType();
+ Node *parseClassEnumType();
+ Node *parseQualifiedType();
+
+ Node *parseEncoding();
+ bool parseCallOffset();
+ Node *parseSpecialName();
+
+ /// Holds some extra information about a <name> that is being parsed. This
+ /// information is only pertinent if the <name> refers to an <encoding>.
+ struct NameState {
+ bool CtorDtorConversion = false;
+ bool EndsWithTemplateArgs = false;
+ Qualifiers CVQualifiers = QualNone;
+ FunctionRefQual ReferenceQualifier = FrefQualNone;
+ size_t ForwardTemplateRefsBegin;
+
+ NameState(Db *Enclosing)
+ : ForwardTemplateRefsBegin(Enclosing->ForwardTemplateRefs.size()) {}
+ };
+
+ bool resolveForwardTemplateRefs(NameState &State) {
+ size_t I = State.ForwardTemplateRefsBegin;
+ size_t E = ForwardTemplateRefs.size();
+ for (; I < E; ++I) {
+ size_t Idx = ForwardTemplateRefs[I]->Index;
+ if (Idx >= TemplateParams.size())
+ return true;
+ ForwardTemplateRefs[I]->Ref = TemplateParams[Idx];
}
+ ForwardTemplateRefs.dropBack(State.ForwardTemplateRefsBegin);
+ return false;
}
- return first;
-}
-// <array-type> ::= A <positive dimension number> _ <element type>
-// ::= A [<dimension expression>] _ <element type>
+ /// Parse the <name> production.
+ Node *parseName(NameState *State = nullptr);
+ Node *parseLocalName(NameState *State);
+ Node *parseOperatorName(NameState *State);
+ Node *parseUnqualifiedName(NameState *State);
+ Node *parseUnnamedTypeName(NameState *State);
+ Node *parseSourceName(NameState *State);
+ Node *parseUnscopedName(NameState *State);
+ Node *parseNestedName(NameState *State);
+ Node *parseCtorDtorName(Node *&SoFar, NameState *State);
+
+ Node *parseAbiTags(Node *N);
+
+ /// Parse the <unresolved-name> production.
+ Node *parseUnresolvedName();
+ Node *parseSimpleId();
+ Node *parseBaseUnresolvedName();
+ Node *parseUnresolvedType();
+ Node *parseDestructorName();
+
+ /// Top-level entry point into the parser.
+ Node *parse();
+};
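
The Names stack described above drives most of the parse methods: each successful sub-parse pushes one node, and a caller later collapses a trailing run of nodes into a single composite via popTrailingNodeArray(). The following rough analogue of that convention is an illustrative sketch only, using std::vector<std::string> in place of PODSmallVector<Node *>.

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Collapse everything pushed since `From` into one comma-separated string and
// drop those entries, playing the role of popTrailingNodeArray()/dropBack().
static std::string popTrailing(std::vector<std::string> &Names,
                               std::size_t From) {
  std::string Joined;
  for (std::size_t I = From; I < Names.size(); ++I) {
    if (I != From)
      Joined += ", ";
    Joined += Names[I];
  }
  Names.resize(From);
  return Joined;
}

int main() {
  std::vector<std::string> Names;
  std::size_t ArgsBegin = Names.size();
  Names.push_back("int");   // each successful sub-parse pushes one entry
  Names.push_back("char");
  std::string Args = popTrailing(Names, ArgsBegin);
  assert(Args == "int, char" && Names.empty());
  return 0;
}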
-template <class C>
-static const char *parse_array_type(const char *first, const char *last,
- C &db) {
- if (first != last && *first == 'A' && first + 1 != last) {
- if (first[1] == '_') {
- const char *t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- if (db.names.back().second.substr(0, 2) == " [")
- db.names.back().second.erase(0, 1);
- db.names.back().second.insert(0, " []");
- first = t;
- }
- } else if ('1' <= first[1] && first[1] <= '9') {
- const char *t = parse_number(first + 1, last);
- if (t != last && *t == '_') {
- const char *t2 = parse_type(t + 1, last, db);
- if (t2 != t + 1) {
- if (db.names.empty())
- return first;
- if (db.names.back().second.substr(0, 2) == " [")
- db.names.back().second.erase(0, 1);
- db.names.back().second.insert(0,
- " [" + std::string(first + 1, t) + "]");
- first = t2;
- }
- }
- } else {
- const char *t = parse_expression(first + 1, last, db);
- if (t != first + 1 && t != last && *t == '_') {
- const char *t2 = parse_type(++t, last, db);
- if (t2 != t) {
- if (db.names.size() < 2)
- return first;
- auto type = std::move(db.names.back());
- db.names.pop_back();
- auto expr = std::move(db.names.back());
- db.names.back().first = std::move(type.first);
- if (type.second.substr(0, 2) == " [")
- type.second.erase(0, 1);
- db.names.back().second =
- " [" + expr.move_full() + "]" + std::move(type.second);
- first = t2;
- }
- }
- }
+const char* parse_discriminator(const char* first, const char* last);
+
+// <name> ::= <nested-name> // N
+// ::= <local-name> # See Scope Encoding below // Z
+// ::= <unscoped-template-name> <template-args>
+// ::= <unscoped-name>
+//
+// <unscoped-template-name> ::= <unscoped-name>
+// ::= <substitution>
+Node *Db::parseName(NameState *State) {
+ consumeIf('L'); // extension
+
+ if (look() == 'N')
+ return parseNestedName(State);
+ if (look() == 'Z')
+ return parseLocalName(State);
+
+ // ::= <unscoped-template-name> <template-args>
+ if (look() == 'S' && look(1) != 't') {
+ Node *S = parseSubstitution();
+ if (S == nullptr)
+ return nullptr;
+ if (look() != 'I')
+ return nullptr;
+ Node *TA = parseTemplateArgs(State != nullptr);
+ if (TA == nullptr)
+ return nullptr;
+ if (State) State->EndsWithTemplateArgs = true;
+ return make<NameWithTemplateArgs>(S, TA);
}
- return first;
+
+ Node *N = parseUnscopedName(State);
+ if (N == nullptr)
+ return nullptr;
+ // ::= <unscoped-template-name> <template-args>
+ if (look() == 'I') {
+ Subs.push_back(N);
+ Node *TA = parseTemplateArgs(State != nullptr);
+ if (TA == nullptr)
+ return nullptr;
+ if (State) State->EndsWithTemplateArgs = true;
+ return make<NameWithTemplateArgs>(N, TA);
+ }
+ // ::= <unscoped-name>
+ return N;
}
-// <decltype> ::= Dt <expression> E # decltype of an id-expression or class
-// member access (C++0x)
-// ::= DT <expression> E # decltype of an expression (C++0x)
+// <local-name> := Z <function encoding> E <entity name> [<discriminator>]
+// := Z <function encoding> E s [<discriminator>]
+// := Z <function encoding> Ed [ <parameter number> ] _ <entity name>
+Node *Db::parseLocalName(NameState *State) {
+ if (!consumeIf('Z'))
+ return nullptr;
+ Node *Encoding = parseEncoding();
+ if (Encoding == nullptr || !consumeIf('E'))
+ return nullptr;
-template <class C>
-static const char *parse_decltype(const char *first, const char *last, C &db) {
- if (last - first >= 4 && first[0] == 'D') {
- switch (first[1]) {
- case 't':
- case 'T': {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2 && t != last && *t == 'E') {
- if (db.names.empty())
- return first;
- db.names.back() = "decltype(" + db.names.back().move_full() + ")";
- first = t + 1;
- }
- } break;
- }
+ if (consumeIf('s')) {
+ First = parse_discriminator(First, Last);
+ return make<LocalName>(Encoding, make<NameType>("string literal"));
}
- return first;
+
+ if (consumeIf('d')) {
+ parseNumber(true);
+ if (!consumeIf('_'))
+ return nullptr;
+ Node *N = parseName(State);
+ if (N == nullptr)
+ return nullptr;
+ return make<LocalName>(Encoding, N);
+ }
+
+ Node *Entity = parseName(State);
+ if (Entity == nullptr)
+ return nullptr;
+ First = parse_discriminator(First, Last);
+ return make<LocalName>(Encoding, Entity);
}
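
As an illustration of the <local-name> production (not part of this change), the mangled name "_ZZ3foovE3bar" decomposes as Z <encoding "3foov"> E <entity "3bar">, i.e. an entity named bar that is local to foo(). Running it through the system demangler shows the expected rendering:

#include <cxxabi.h>
#include <cstdio>
#include <cstdlib>

int main() {
  int Status = 0;
  // Z <function encoding> E <entity name>: an entity local to a function.
  char *Demangled =
      abi::__cxa_demangle("_ZZ3foovE3bar", nullptr, nullptr, &Status);
  if (Status == 0 && Demangled)
    std::printf("%s\n", Demangled); // typically prints "foo()::bar"
  std::free(Demangled);
  return 0;
}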
-// extension:
-// <vector-type> ::= Dv <positive dimension number> _
-// <extended element type>
-// ::= Dv [<dimension expression>] _ <element type>
-// <extended element type> ::= <element type>
-// ::= p # AltiVec vector pixel
+// <unscoped-name> ::= <unqualified-name>
+// ::= St <unqualified-name> # ::std::
+// extension ::= StL<unqualified-name>
+Node *Db::parseUnscopedName(NameState *State) {
+ if (consumeIf("StL") || consumeIf("St")) {
+ Node *R = parseUnqualifiedName(State);
+ if (R == nullptr)
+ return nullptr;
+ return make<StdQualifiedName>(R);
+ }
+ return parseUnqualifiedName(State);
+}
-template <class C>
-static const char *parse_vector_type(const char *first, const char *last,
- C &db) {
- if (last - first > 3 && first[0] == 'D' && first[1] == 'v') {
- if ('1' <= first[2] && first[2] <= '9') {
- const char *t = parse_number(first + 2, last);
- if (t == last || *t != '_')
- return first;
- const char *num = first + 2;
- size_t sz = static_cast<size_t>(t - num);
- if (++t != last) {
- if (*t != 'p') {
- const char *t1 = parse_type(t, last, db);
- if (t1 != t) {
- if (db.names.empty())
- return first;
- db.names.back().first += " vector[" + std::string(num, sz) + "]";
- first = t1;
- }
- } else {
- ++t;
- db.names.push_back("pixel vector[" + std::string(num, sz) + "]");
- first = t;
- }
- }
- } else {
- std::string num;
- const char *t1 = first + 2;
- if (*t1 != '_') {
- const char *t = parse_expression(t1, last, db);
- if (t != t1) {
- if (db.names.empty())
- return first;
- num = db.names.back().move_full();
- db.names.pop_back();
- t1 = t;
- }
- }
- if (t1 != last && *t1 == '_' && ++t1 != last) {
- const char *t = parse_type(t1, last, db);
- if (t != t1) {
- if (db.names.empty())
- return first;
- db.names.back().first += " vector[" + num + "]";
- first = t;
- }
- }
- }
- }
- return first;
+// <unqualified-name> ::= <operator-name> [abi-tags]
+// ::= <ctor-dtor-name>
+// ::= <source-name>
+// ::= <unnamed-type-name>
+// ::= DC <source-name>+ E # structured binding declaration
+Node *Db::parseUnqualifiedName(NameState *State) {
+ // <ctor-dtor-name>s are special-cased in parseNestedName().
+ Node *Result;
+ if (look() == 'U')
+ Result = parseUnnamedTypeName(State);
+ else if (look() >= '1' && look() <= '9')
+ Result = parseSourceName(State);
+ else if (consumeIf("DC")) {
+ size_t BindingsBegin = Names.size();
+ do {
+ Node *Binding = parseSourceName(State);
+ if (Binding == nullptr)
+ return nullptr;
+ Names.push_back(Binding);
+ } while (!consumeIf('E'));
+ Result = make<StructuredBindingName>(popTrailingNodeArray(BindingsBegin));
+ } else
+ Result = parseOperatorName(State);
+ if (Result != nullptr)
+ Result = parseAbiTags(Result);
+ return Result;
}
-// <type> ::= <builtin-type>
-// ::= <function-type>
-// ::= <class-enum-type>
-// ::= <array-type>
-// ::= <pointer-to-member-type>
-// ::= <template-param>
-// ::= <template-template-param> <template-args>
-// ::= <decltype>
-// ::= <substitution>
-// ::= <CV-qualifiers> <type>
-// ::= P <type> # pointer-to
-// ::= R <type> # reference-to
-// ::= O <type> # rvalue reference-to (C++0x)
-// ::= C <type> # complex pair (C 2000)
-// ::= G <type> # imaginary (C 2000)
-// ::= Dp <type> # pack expansion (C++0x)
-// ::= U <source-name> <type> # vendor extended type qualifier
-// extension := U <objc-name> <objc-type> # objc-type<identifier>
-// extension := <vector-type> # <vector-type> starts with Dv
-
-// <objc-name> ::= <k0 number> objcproto <k1 number> <identifier> # k0 = 9 +
-// <number of digits in k1> + k1
-// <objc-type> := <source-name> # PU<11+>objcproto 11objc_object<source-name>
-// 11objc_object -> id<source-name>
-
-template <class C>
-static const char *parse_type(const char *first, const char *last, C &db) {
- if (first != last) {
- switch (*first) {
- case 'r':
- case 'V':
- case 'K': {
- unsigned cv = 0;
- const char *t = parse_cv_qualifiers(first, last, cv);
- if (t != first) {
- bool is_function = *t == 'F';
- size_t k0 = db.names.size();
- const char *t1 = parse_type(t, last, db);
- size_t k1 = db.names.size();
- if (t1 != t) {
- if (is_function)
- db.subs.pop_back();
- db.subs.emplace_back();
- for (size_t k = k0; k < k1; ++k) {
- if (is_function) {
- auto &name = db.names[k].second;
- size_t p = name.size();
-
- if (name[p - 2] == '&' && name[p - 1] == '&')
- p -= 2;
- else if (name.back() == '&')
- p -= 1;
-
- if (cv & CV_const) {
- name.insert(p, " const");
- p += 6;
- }
- if (cv & CV_volatile) {
- name.insert(p, " volatile");
- p += 9;
- }
- if (cv & CV_restrict)
- name.insert(p, " restrict");
- } else {
- if (cv & CV_const)
- db.names[k].first.append(" const");
- if (cv & CV_volatile)
- db.names[k].first.append(" volatile");
- if (cv & CV_restrict)
- db.names[k].first.append(" restrict");
- }
- db.subs.back().push_back(db.names[k]);
- }
- first = t1;
- }
- }
- } break;
- default: {
- const char *t = parse_builtin_type(first, last, db);
- if (t != first) {
- first = t;
- } else {
- switch (*first) {
- case 'A':
- t = parse_array_type(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- first = t;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- }
- break;
- case 'C':
- t = parse_type(first + 1, last, db);
- if (t != first + 1) {
- if (db.names.empty())
- return first;
- db.names.back().first.append(" complex");
- first = t;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- }
- break;
- case 'F':
- t = parse_function_type(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- first = t;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- }
- break;
- case 'G':
- t = parse_type(first + 1, last, db);
- if (t != first + 1) {
- if (db.names.empty())
- return first;
- db.names.back().first.append(" imaginary");
- first = t;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- }
- break;
- case 'M':
- t = parse_pointer_to_member_type(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- first = t;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- }
- break;
- case 'O': {
- size_t k0 = db.names.size();
- t = parse_type(first + 1, last, db);
- size_t k1 = db.names.size();
- if (t != first + 1) {
- db.subs.emplace_back();
- for (size_t k = k0; k < k1; ++k) {
- if (db.names[k].second.substr(0, 2) == " [") {
- db.names[k].first += " (";
- db.names[k].second.insert(0, ")");
- } else if (!db.names[k].second.empty() &&
- db.names[k].second.front() == '(') {
- db.names[k].first += "(";
- db.names[k].second.insert(0, ")");
- }
- db.names[k].first.append("&&");
- db.subs.back().push_back(db.names[k]);
- }
- first = t;
- }
- break;
- }
- case 'P': {
- size_t k0 = db.names.size();
- t = parse_type(first + 1, last, db);
- size_t k1 = db.names.size();
- if (t != first + 1) {
- db.subs.emplace_back();
- for (size_t k = k0; k < k1; ++k) {
- if (db.names[k].second.substr(0, 2) == " [") {
- db.names[k].first += " (";
- db.names[k].second.insert(0, ")");
- } else if (!db.names[k].second.empty() &&
- db.names[k].second.front() == '(') {
- db.names[k].first += "(";
- db.names[k].second.insert(0, ")");
- }
- if (first[1] != 'U' ||
- db.names[k].first.substr(0, 12) != "objc_object<") {
- db.names[k].first.append("*");
- } else {
- db.names[k].first.replace(0, 11, "id");
- }
- db.subs.back().push_back(db.names[k]);
- }
- first = t;
- }
- break;
- }
- case 'R': {
- size_t k0 = db.names.size();
- t = parse_type(first + 1, last, db);
- size_t k1 = db.names.size();
- if (t != first + 1) {
- db.subs.emplace_back();
- for (size_t k = k0; k < k1; ++k) {
- if (db.names[k].second.substr(0, 2) == " [") {
- db.names[k].first += " (";
- db.names[k].second.insert(0, ")");
- } else if (!db.names[k].second.empty() &&
- db.names[k].second.front() == '(') {
- db.names[k].first += "(";
- db.names[k].second.insert(0, ")");
- }
- db.names[k].first.append("&");
- db.subs.back().push_back(db.names[k]);
- }
- first = t;
- }
- break;
- }
- case 'T': {
- size_t k0 = db.names.size();
- t = parse_template_param(first, last, db);
- size_t k1 = db.names.size();
- if (t != first) {
- db.subs.emplace_back();
- for (size_t k = k0; k < k1; ++k)
- db.subs.back().push_back(db.names[k]);
- if (db.try_to_parse_template_args && k1 == k0 + 1) {
- const char *t1 = parse_template_args(t, last, db);
- if (t1 != t) {
- auto args = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += std::move(args);
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- t = t1;
- }
- }
- first = t;
- }
- break;
- }
- case 'U':
- if (first + 1 != last) {
- t = parse_source_name(first + 1, last, db);
- if (t != first + 1) {
- const char *t2 = parse_type(t, last, db);
- if (t2 != t) {
- if (db.names.size() < 2)
- return first;
- auto type = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.back().first.substr(0, 9) != "objcproto") {
- db.names.back() = type + " " + db.names.back().move_full();
- } else {
- auto proto = db.names.back().move_full();
- db.names.pop_back();
- t = parse_source_name(proto.data() + 9,
- proto.data() + proto.size(), db);
- if (t != proto.data() + 9) {
- db.names.back() =
- type + "<" + db.names.back().move_full() + ">";
- } else {
- db.names.push_back(type + " " + proto);
- }
- }
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t2;
- }
- }
- }
- break;
- case 'S':
- if (first + 1 != last && first[1] == 't') {
- t = parse_name(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- }
- } else {
- t = parse_substitution(first, last, db);
- if (t != first) {
- first = t;
- // Parsed a substitution. If the substitution is a
- // <template-param> it might be followed by <template-args>.
- t = parse_template_args(first, last, db);
- if (t != first) {
- if (db.names.size() < 2)
- return first;
- auto template_args = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first += template_args;
- // Need to create substitution for <template-template-param>
- // <template-args>
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- }
- }
- }
- break;
- case 'D':
- if (first + 1 != last) {
- switch (first[1]) {
- case 'p': {
- size_t k0 = db.names.size();
- t = parse_type(first + 2, last, db);
- size_t k1 = db.names.size();
- if (t != first + 2) {
- db.subs.emplace_back();
- for (size_t k = k0; k < k1; ++k)
- db.subs.back().push_back(db.names[k]);
- first = t;
- return first;
- }
- break;
- }
- case 't':
- case 'T':
- t = parse_decltype(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- return first;
- }
- break;
- case 'v':
- t = parse_vector_type(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- return first;
- }
- break;
- }
- }
- LLVM_FALLTHROUGH;
- default:
- // must check for builtin-types before class-enum-types to avoid
- // ambiguities with operator-names
- t = parse_builtin_type(first, last, db);
- if (t != first) {
- first = t;
- } else {
- t = parse_name(first, last, db);
- if (t != first) {
- if (db.names.empty())
- return first;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- first = t;
- }
- }
- break;
- }
- }
- break;
- }
+// <unnamed-type-name> ::= Ut [<nonnegative number>] _
+// ::= <closure-type-name>
+//
+// <closure-type-name> ::= Ul <lambda-sig> E [ <nonnegative number> ] _
+//
+// <lambda-sig> ::= <parameter type>+ # Parameter types or "v" if the lambda has no parameters
+Node *Db::parseUnnamedTypeName(NameState *) {
+ if (consumeIf("Ut")) {
+ StringView Count = parseNumber();
+ if (!consumeIf('_'))
+ return nullptr;
+ return make<UnnamedTypeName>(Count);
+ }
+ if (consumeIf("Ul")) {
+ NodeArray Params;
+ SwapAndRestore<bool> SwapParams(ParsingLambdaParams, true);
+ if (!consumeIf("vE")) {
+ size_t ParamsBegin = Names.size();
+ do {
+ Node *P = parseType();
+ if (P == nullptr)
+ return nullptr;
+ Names.push_back(P);
+ } while (!consumeIf('E'));
+ Params = popTrailingNodeArray(ParamsBegin);
}
+ StringView Count = parseNumber();
+ if (!consumeIf('_'))
+ return nullptr;
+ return make<ClosureTypeName>(Params, Count);
}
- return first;
+ return nullptr;
+}
+
+// <source-name> ::= <positive length number> <identifier>
+Node *Db::parseSourceName(NameState *) {
+ size_t Length = 0;
+ if (parsePositiveInteger(&Length))
+ return nullptr;
+ if (numLeft() < Length || Length == 0)
+ return nullptr;
+ StringView Name(First, First + Length);
+ First += Length;
+ if (Name.startsWith("_GLOBAL__N"))
+ return make<NameType>("(anonymous namespace)");
+ return make<NameType>(Name);
}
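
A <source-name> is just a decimal length followed by that many identifier characters, with the anonymous-namespace marker special-cased as shown above. A minimal, self-contained sketch of the same scan follows; it is illustrative only, and the struct-free helper below is not the committed code.

#include <cctype>
#include <cstddef>
#include <cstdio>
#include <string>

// Parse "<decimal length><identifier>", e.g. "3foo" yields "foo" and leaves
// the cursor after the consumed characters. Returns an empty string on error.
static std::string parseSourceNameSketch(const char *&First, const char *Last) {
  std::size_t Len = 0;
  if (First == Last || !std::isdigit(static_cast<unsigned char>(*First)))
    return std::string();
  while (First != Last && std::isdigit(static_cast<unsigned char>(*First)))
    Len = Len * 10 + static_cast<std::size_t>(*First++ - '0');
  if (Len == 0 || static_cast<std::size_t>(Last - First) < Len)
    return std::string();
  std::string Name(First, First + Len);
  First += Len;
  // The real parser maps names starting with "_GLOBAL__N" to
  // "(anonymous namespace)"; this sketch just returns the raw identifier.
  return Name;
}

int main() {
  const char Mangled[] = "3foo6vector";
  const char *First = Mangled, *Last = Mangled + sizeof(Mangled) - 1;
  std::printf("%s\n", parseSourceNameSketch(First, Last).c_str()); // "foo"
  std::printf("%s\n", parseSourceNameSketch(First, Last).c_str()); // "vector"
  return 0;
}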
-// <operator-name>
-// ::= aa # &&
+// <operator-name> ::= aa # &&
// ::= ad # & (unary)
// ::= an # &
// ::= aN # &=
@@ -2024,1783 +2329,2047 @@ static const char *parse_type(const char *first, const char *last, C &db) {
// ::= rM # %=
// ::= rs # >>
// ::= rS # >>=
-// ::= v <digit> <source-name> # vendor extended
-// operator
-
-template <class C>
-static const char *parse_operator_name(const char *first, const char *last,
- C &db) {
- if (last - first >= 2) {
- switch (first[0]) {
+// ::= ss # <=> C++2a
+// ::= v <digit> <source-name> # vendor extended operator
+Node *Db::parseOperatorName(NameState *State) {
+ switch (look()) {
+ case 'a':
+ switch (look(1)) {
case 'a':
- switch (first[1]) {
- case 'a':
- db.names.push_back("operator&&");
- first += 2;
- break;
- case 'd':
- case 'n':
- db.names.push_back("operator&");
- first += 2;
- break;
- case 'N':
- db.names.push_back("operator&=");
- first += 2;
- break;
- case 'S':
- db.names.push_back("operator=");
- first += 2;
- break;
- }
- break;
- case 'c':
- switch (first[1]) {
- case 'l':
- db.names.push_back("operator()");
- first += 2;
- break;
- case 'm':
- db.names.push_back("operator,");
- first += 2;
- break;
- case 'o':
- db.names.push_back("operator~");
- first += 2;
- break;
- case 'v': {
- bool try_to_parse_template_args = db.try_to_parse_template_args;
- db.try_to_parse_template_args = false;
- const char *t = parse_type(first + 2, last, db);
- db.try_to_parse_template_args = try_to_parse_template_args;
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "operator ");
- db.parsed_ctor_dtor_cv = true;
- first = t;
- }
- } break;
- }
- break;
+ First += 2;
+ return make<NameType>("operator&&");
case 'd':
- switch (first[1]) {
- case 'a':
- db.names.push_back("operator delete[]");
- first += 2;
- break;
- case 'e':
- db.names.push_back("operator*");
- first += 2;
- break;
- case 'l':
- db.names.push_back("operator delete");
- first += 2;
- break;
- case 'v':
- db.names.push_back("operator/");
- first += 2;
- break;
- case 'V':
- db.names.push_back("operator/=");
- first += 2;
- break;
- }
- break;
+ case 'n':
+ First += 2;
+ return make<NameType>("operator&");
+ case 'N':
+ First += 2;
+ return make<NameType>("operator&=");
+ case 'S':
+ First += 2;
+ return make<NameType>("operator=");
+ }
+ return nullptr;
+ case 'c':
+ switch (look(1)) {
+ case 'l':
+ First += 2;
+ return make<NameType>("operator()");
+ case 'm':
+ First += 2;
+ return make<NameType>("operator,");
+ case 'o':
+ First += 2;
+ return make<NameType>("operator~");
+ // ::= cv <type> # (cast)
+ case 'v': {
+ First += 2;
+ SwapAndRestore<bool> SaveTemplate(TryToParseTemplateArgs, false);
+ // If we're parsing an encoding, State != nullptr and the conversion
+ // operator's <type> could have a <template-param> that refers to some
+ // <template-arg>s further ahead in the mangled name.
+ SwapAndRestore<bool> SavePermit(PermitForwardTemplateReferences,
+ PermitForwardTemplateReferences ||
+ State != nullptr);
+ Node* Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ if (State) State->CtorDtorConversion = true;
+ return make<ConversionOperatorType>(Ty);
+ }
+ }
+ return nullptr;
+ case 'd':
+ switch (look(1)) {
+ case 'a':
+ First += 2;
+ return make<NameType>("operator delete[]");
case 'e':
- switch (first[1]) {
- case 'o':
- db.names.push_back("operator^");
- first += 2;
- break;
- case 'O':
- db.names.push_back("operator^=");
- first += 2;
- break;
- case 'q':
- db.names.push_back("operator==");
- first += 2;
- break;
- }
- break;
- case 'g':
- switch (first[1]) {
- case 'e':
- db.names.push_back("operator>=");
- first += 2;
- break;
- case 't':
- db.names.push_back("operator>");
- first += 2;
- break;
- }
- break;
+ First += 2;
+ return make<NameType>("operator*");
+ case 'l':
+ First += 2;
+ return make<NameType>("operator delete");
+ case 'v':
+ First += 2;
+ return make<NameType>("operator/");
+ case 'V':
+ First += 2;
+ return make<NameType>("operator/=");
+ }
+ return nullptr;
+ case 'e':
+ switch (look(1)) {
+ case 'o':
+ First += 2;
+ return make<NameType>("operator^");
+ case 'O':
+ First += 2;
+ return make<NameType>("operator^=");
+ case 'q':
+ First += 2;
+ return make<NameType>("operator==");
+ }
+ return nullptr;
+ case 'g':
+ switch (look(1)) {
+ case 'e':
+ First += 2;
+ return make<NameType>("operator>=");
+ case 't':
+ First += 2;
+ return make<NameType>("operator>");
+ }
+ return nullptr;
+ case 'i':
+ if (look(1) == 'x') {
+ First += 2;
+ return make<NameType>("operator[]");
+ }
+ return nullptr;
+ case 'l':
+ switch (look(1)) {
+ case 'e':
+ First += 2;
+ return make<NameType>("operator<=");
+ // ::= li <source-name> # operator ""
+ case 'i': {
+ First += 2;
+ Node *SN = parseSourceName(State);
+ if (SN == nullptr)
+ return nullptr;
+ return make<LiteralOperator>(SN);
+ }
+ case 's':
+ First += 2;
+ return make<NameType>("operator<<");
+ case 'S':
+ First += 2;
+ return make<NameType>("operator<<=");
+ case 't':
+ First += 2;
+ return make<NameType>("operator<");
+ }
+ return nullptr;
+ case 'm':
+ switch (look(1)) {
case 'i':
- if (first[1] == 'x') {
- db.names.push_back("operator[]");
- first += 2;
- }
- break;
+ First += 2;
+ return make<NameType>("operator-");
+ case 'I':
+ First += 2;
+ return make<NameType>("operator-=");
case 'l':
- switch (first[1]) {
- case 'e':
- db.names.push_back("operator<=");
- first += 2;
- break;
- case 'i': {
- const char *t = parse_source_name(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "operator\"\" ");
- first = t;
- }
- } break;
- case 's':
- db.names.push_back("operator<<");
- first += 2;
- break;
- case 'S':
- db.names.push_back("operator<<=");
- first += 2;
- break;
- case 't':
- db.names.push_back("operator<");
- first += 2;
- break;
- }
- break;
+ First += 2;
+ return make<NameType>("operator*");
+ case 'L':
+ First += 2;
+ return make<NameType>("operator*=");
case 'm':
- switch (first[1]) {
- case 'i':
- db.names.push_back("operator-");
- first += 2;
- break;
- case 'I':
- db.names.push_back("operator-=");
- first += 2;
- break;
- case 'l':
- db.names.push_back("operator*");
- first += 2;
- break;
- case 'L':
- db.names.push_back("operator*=");
- first += 2;
- break;
- case 'm':
- db.names.push_back("operator--");
- first += 2;
- break;
- }
- break;
- case 'n':
- switch (first[1]) {
- case 'a':
- db.names.push_back("operator new[]");
- first += 2;
- break;
- case 'e':
- db.names.push_back("operator!=");
- first += 2;
- break;
- case 'g':
- db.names.push_back("operator-");
- first += 2;
- break;
- case 't':
- db.names.push_back("operator!");
- first += 2;
- break;
- case 'w':
- db.names.push_back("operator new");
- first += 2;
- break;
- }
- break;
+ First += 2;
+ return make<NameType>("operator--");
+ }
+ return nullptr;
+ case 'n':
+ switch (look(1)) {
+ case 'a':
+ First += 2;
+ return make<NameType>("operator new[]");
+ case 'e':
+ First += 2;
+ return make<NameType>("operator!=");
+ case 'g':
+ First += 2;
+ return make<NameType>("operator-");
+ case 't':
+ First += 2;
+ return make<NameType>("operator!");
+ case 'w':
+ First += 2;
+ return make<NameType>("operator new");
+ }
+ return nullptr;
+ case 'o':
+ switch (look(1)) {
case 'o':
- switch (first[1]) {
- case 'o':
- db.names.push_back("operator||");
- first += 2;
- break;
- case 'r':
- db.names.push_back("operator|");
- first += 2;
- break;
- case 'R':
- db.names.push_back("operator|=");
- first += 2;
- break;
- }
- break;
- case 'p':
- switch (first[1]) {
- case 'm':
- db.names.push_back("operator->*");
- first += 2;
- break;
- case 'l':
- db.names.push_back("operator+");
- first += 2;
- break;
- case 'L':
- db.names.push_back("operator+=");
- first += 2;
- break;
- case 'p':
- db.names.push_back("operator++");
- first += 2;
- break;
- case 's':
- db.names.push_back("operator+");
- first += 2;
- break;
- case 't':
- db.names.push_back("operator->");
- first += 2;
- break;
- }
- break;
- case 'q':
- if (first[1] == 'u') {
- db.names.push_back("operator?");
- first += 2;
- }
- break;
+ First += 2;
+ return make<NameType>("operator||");
case 'r':
- switch (first[1]) {
- case 'm':
- db.names.push_back("operator%");
- first += 2;
- break;
- case 'M':
- db.names.push_back("operator%=");
- first += 2;
- break;
- case 's':
- db.names.push_back("operator>>");
- first += 2;
- break;
- case 'S':
- db.names.push_back("operator>>=");
- first += 2;
- break;
- }
- break;
- case 'v':
- if (std::isdigit(first[1])) {
- const char *t = parse_source_name(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "operator ");
- first = t;
- }
- }
- break;
+ First += 2;
+ return make<NameType>("operator|");
+ case 'R':
+ First += 2;
+ return make<NameType>("operator|=");
}
- }
- return first;
-}
-
-template <class C>
-static const char *parse_integer_literal(const char *first, const char *last,
- const std::string &lit, C &db) {
- const char *t = parse_number(first, last);
- if (t != first && t != last && *t == 'E') {
- if (lit.size() > 3)
- db.names.push_back("(" + lit + ")");
- else
- db.names.emplace_back();
- if (*first == 'n') {
- db.names.back().first += '-';
- ++first;
+ return nullptr;
+ case 'p':
+ switch (look(1)) {
+ case 'm':
+ First += 2;
+ return make<NameType>("operator->*");
+ case 'l':
+ First += 2;
+ return make<NameType>("operator+");
+ case 'L':
+ First += 2;
+ return make<NameType>("operator+=");
+ case 'p':
+ First += 2;
+ return make<NameType>("operator++");
+ case 's':
+ First += 2;
+ return make<NameType>("operator+");
+ case 't':
+ First += 2;
+ return make<NameType>("operator->");
+ }
+ return nullptr;
+ case 'q':
+ if (look(1) == 'u') {
+ First += 2;
+ return make<NameType>("operator?");
+ }
+ return nullptr;
+ case 'r':
+ switch (look(1)) {
+ case 'm':
+ First += 2;
+ return make<NameType>("operator%");
+ case 'M':
+ First += 2;
+ return make<NameType>("operator%=");
+ case 's':
+ First += 2;
+ return make<NameType>("operator>>");
+ case 'S':
+ First += 2;
+ return make<NameType>("operator>>=");
+ }
+ return nullptr;
+ case 's':
+ if (look(1) == 's') {
+ First += 2;
+ return make<NameType>("operator<=>");
}
- db.names.back().first.append(first, t);
- if (lit.size() <= 3)
- db.names.back().first += lit;
- first = t + 1;
+ return nullptr;
+ // ::= v <digit> <source-name> # vendor extended operator
+ case 'v':
+ if (std::isdigit(look(1))) {
+ First += 2;
+ Node *SN = parseSourceName(State);
+ if (SN == nullptr)
+ return nullptr;
+ return make<ConversionOperatorType>(SN);
+ }
+ return nullptr;
}
- return first;
+ return nullptr;
}
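
Most of the cases above map a fixed two-letter code to a fixed spelling; only cv, li, and v<digit> need recursion. A table-driven sketch of the simple cases, using a deliberately tiny and non-exhaustive table for illustration:

#include <cstdio>
#include <cstring>

struct OperatorEntry {
  const char *Code;     // two-letter Itanium code
  const char *Spelling; // demangled spelling
};

// A few of the mappings handled by parseOperatorName(); the full set is in
// the switch above.
static const OperatorEntry Operators[] = {
    {"aa", "operator&&"}, {"pl", "operator+"},  {"mi", "operator-"},
    {"ls", "operator<<"}, {"ix", "operator[]"}, {"ss", "operator<=>"},
};

static const char *lookupOperator(const char *Code) {
  for (const OperatorEntry &E : Operators)
    if (std::strncmp(Code, E.Code, 2) == 0)
      return E.Spelling;
  return nullptr;
}

int main() {
  std::printf("%s\n", lookupOperator("pl")); // operator+
  std::printf("%s\n", lookupOperator("ss")); // operator<=>
  return 0;
}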
-// <expr-primary> ::= L <type> <value number> E #
-// integer literal
-// ::= L <type> <value float> E #
-// floating literal
-// ::= L <string type> E #
-// string literal
-// ::= L <nullptr type> E #
-// nullptr literal (i.e., "LDnE")
-// ::= L <type> <real-part float> _ <imag-part float> E #
-// complex floating point literal (C 2000)
-// ::= L <mangled-name> E #
-// external name
-
-template <class C>
-static const char *parse_expr_primary(const char *first, const char *last,
- C &db) {
- if (last - first >= 4 && *first == 'L') {
- switch (first[1]) {
- case 'w': {
- const char *t = parse_integer_literal(first + 2, last, "wchar_t", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'b':
- if (first[3] == 'E') {
- switch (first[2]) {
- case '0':
- db.names.push_back("false");
- first += 4;
- break;
- case '1':
- db.names.push_back("true");
- first += 4;
- break;
- }
- }
- break;
- case 'c': {
- const char *t = parse_integer_literal(first + 2, last, "char", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'a': {
- const char *t = parse_integer_literal(first + 2, last, "signed char", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'h': {
- const char *t =
- parse_integer_literal(first + 2, last, "unsigned char", db);
- if (t != first + 2)
- first = t;
- } break;
- case 's': {
- const char *t = parse_integer_literal(first + 2, last, "short", db);
- if (t != first + 2)
- first = t;
- } break;
- case 't': {
- const char *t =
- parse_integer_literal(first + 2, last, "unsigned short", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'i': {
- const char *t = parse_integer_literal(first + 2, last, "", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'j': {
- const char *t = parse_integer_literal(first + 2, last, "u", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'l': {
- const char *t = parse_integer_literal(first + 2, last, "l", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'm': {
- const char *t = parse_integer_literal(first + 2, last, "ul", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'x': {
- const char *t = parse_integer_literal(first + 2, last, "ll", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'y': {
- const char *t = parse_integer_literal(first + 2, last, "ull", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'n': {
- const char *t = parse_integer_literal(first + 2, last, "__int128", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'o': {
- const char *t =
- parse_integer_literal(first + 2, last, "unsigned __int128", db);
- if (t != first + 2)
- first = t;
- } break;
- case 'f': {
- const char *t = parse_floating_number<float>(first + 2, last, db);
- if (t != first + 2)
- first = t;
- } break;
- case 'd': {
- const char *t = parse_floating_number<double>(first + 2, last, db);
- if (t != first + 2)
- first = t;
- } break;
- case 'e': {
- const char *t = parse_floating_number<long double>(first + 2, last, db);
- if (t != first + 2)
- first = t;
- } break;
- case '_':
- if (first[2] == 'Z') {
- const char *t = parse_encoding(first + 3, last, db);
- if (t != first + 3 && t != last && *t == 'E')
- first = t + 1;
- }
- break;
- case 'T':
- // Invalid mangled name per
- // http://sourcerytools.com/pipermail/cxx-abi-dev/2011-August/002422.html
+// <ctor-dtor-name> ::= C1 # complete object constructor
+// ::= C2 # base object constructor
+// ::= C3 # complete object allocating constructor
+// extension ::= C5 # ?
+// ::= D0 # deleting destructor
+// ::= D1 # complete object destructor
+// ::= D2 # base object destructor
+// extension ::= D5 # ?
+Node *Db::parseCtorDtorName(Node *&SoFar, NameState *State) {
+ if (SoFar->K == Node::KSpecialSubstitution) {
+ auto SSK = static_cast<SpecialSubstitution *>(SoFar)->SSK;
+ switch (SSK) {
+ case SpecialSubKind::string:
+ case SpecialSubKind::istream:
+ case SpecialSubKind::ostream:
+ case SpecialSubKind::iostream:
+ SoFar = make<ExpandedSpecialSubstitution>(SSK);
+ default:
break;
- default: {
- // might be named type
- const char *t = parse_type(first + 1, last, db);
- if (t != first + 1 && t != last) {
- if (*t != 'E') {
- const char *n = t;
- for (; n != last && isdigit(*n); ++n)
- ;
- if (n != t && n != last && *n == 'E') {
- if (db.names.empty())
- return first;
- db.names.back() =
- "(" + db.names.back().move_full() + ")" + std::string(t, n);
- first = n + 1;
- break;
- }
- } else {
- first = t + 1;
- break;
- }
- }
}
+ }
+
+ if (consumeIf('C')) {
+ bool IsInherited = consumeIf('I');
+ if (look() != '1' && look() != '2' && look() != '3' && look() != '5')
+ return nullptr;
+ ++First;
+ if (State) State->CtorDtorConversion = true;
+ if (IsInherited) {
+ if (parseName(State) == nullptr)
+ return nullptr;
}
+ return make<CtorDtorName>(SoFar, false);
}
- return first;
+
+ if (look() == 'D' &&
+ (look(1) == '0' || look(1) == '1' || look(1) == '2' || look(1) == '5')) {
+ First += 2;
+ if (State) State->CtorDtorConversion = true;
+ return make<CtorDtorName>(SoFar, true);
+ }
+
+ return nullptr;
}
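
For a concrete view of these encodings (illustration only), "_ZN3FooC1Ev" is N 3Foo C1 E v: the complete-object constructor of Foo with no arguments, and "_ZN3FooD1Ev" is the matching complete-object destructor. The system demangler shows the rendered forms:

#include <cxxabi.h>
#include <cstdio>
#include <cstdlib>

static void demangle(const char *Mangled) {
  int Status = 0;
  char *Out = abi::__cxa_demangle(Mangled, nullptr, nullptr, &Status);
  std::printf("%s -> %s\n", Mangled, (Status == 0 && Out) ? Out : "<error>");
  std::free(Out);
}

int main() {
  demangle("_ZN3FooC1Ev"); // typically "Foo::Foo()"
  demangle("_ZN3FooD1Ev"); // typically "Foo::~Foo()"
  return 0;
}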
-static std::string base_name(std::string &s) {
- if (s.empty())
- return s;
- if (s == "std::string") {
- s = "std::basic_string<char, std::char_traits<char>, std::allocator<char> "
- ">";
- return "basic_string";
- }
- if (s == "std::istream") {
- s = "std::basic_istream<char, std::char_traits<char> >";
- return "basic_istream";
- }
- if (s == "std::ostream") {
- s = "std::basic_ostream<char, std::char_traits<char> >";
- return "basic_ostream";
- }
- if (s == "std::iostream") {
- s = "std::basic_iostream<char, std::char_traits<char> >";
- return "basic_iostream";
- }
- const char *const pf = s.data();
- const char *pe = pf + s.size();
- if (pe[-1] == '>') {
- unsigned c = 1;
- while (true) {
- if (--pe == pf)
- return std::string();
- if (pe[-1] == '<') {
- if (--c == 0) {
- --pe;
- break;
- }
- } else if (pe[-1] == '>')
- ++c;
+// <nested-name> ::= N [<CV-Qualifiers>] [<ref-qualifier>] <prefix> <unqualified-name> E
+// ::= N [<CV-Qualifiers>] [<ref-qualifier>] <template-prefix> <template-args> E
+//
+// <prefix> ::= <prefix> <unqualified-name>
+// ::= <template-prefix> <template-args>
+// ::= <template-param>
+// ::= <decltype>
+// ::= # empty
+// ::= <substitution>
+// ::= <prefix> <data-member-prefix>
+// extension ::= L
+//
+// <data-member-prefix> := <member source-name> [<template-args>] M
+//
+// <template-prefix> ::= <prefix> <template unqualified-name>
+// ::= <template-param>
+// ::= <substitution>
+Node *Db::parseNestedName(NameState *State) {
+ if (!consumeIf('N'))
+ return nullptr;
+
+ Qualifiers CVTmp = parseCVQualifiers();
+ if (State) State->CVQualifiers = CVTmp;
+
+ if (consumeIf('O')) {
+ if (State) State->ReferenceQualifier = FrefQualRValue;
+ } else if (consumeIf('R')) {
+ if (State) State->ReferenceQualifier = FrefQualLValue;
+ } else
+ if (State) State->ReferenceQualifier = FrefQualNone;
+
+ Node *SoFar = nullptr;
+ auto PushComponent = [&](Node *Comp) {
+ if (SoFar) SoFar = make<NestedName>(SoFar, Comp);
+ else SoFar = Comp;
+ if (State) State->EndsWithTemplateArgs = false;
+ };
+
+ if (consumeIf("St"))
+ SoFar = make<NameType>("std");
+
+ while (!consumeIf('E')) {
+ consumeIf('L'); // extension
+
+ // <data-member-prefix> := <member source-name> [<template-args>] M
+ if (consumeIf('M')) {
+ if (SoFar == nullptr)
+ return nullptr;
+ continue;
}
- }
- if (pe - pf <= 1)
- return std::string();
- const char *p0 = pe - 1;
- for (; p0 != pf; --p0) {
- if (*p0 == ':') {
- ++p0;
- break;
+
+ // ::= <template-param>
+ if (look() == 'T') {
+ Node *TP = parseTemplateParam();
+ if (TP == nullptr)
+ return nullptr;
+ PushComponent(TP);
+ Subs.push_back(SoFar);
+ continue;
}
- if (!isalpha(*p0) && !isdigit(*p0) && *p0 != '_') {
- return std::string();
+
+ // ::= <template-prefix> <template-args>
+ if (look() == 'I') {
+ Node *TA = parseTemplateArgs(State != nullptr);
+ if (TA == nullptr || SoFar == nullptr)
+ return nullptr;
+ SoFar = make<NameWithTemplateArgs>(SoFar, TA);
+ if (State) State->EndsWithTemplateArgs = true;
+ Subs.push_back(SoFar);
+ continue;
}
- }
- return std::string(p0, pe);
-}
-// <ctor-dtor-name> ::= C1 # complete object constructor
-// ::= C2 # base object constructor
-// ::= C3 # complete object allocating constructor
-// extension ::= C5 # ?
-// ::= D0 # deleting destructor
-// ::= D1 # complete object destructor
-// ::= D2 # base object destructor
-// extension ::= D5 # ?
+ // ::= <decltype>
+ if (look() == 'D' && (look(1) == 't' || look(1) == 'T')) {
+ Node *DT = parseDecltype();
+ if (DT == nullptr)
+ return nullptr;
+ PushComponent(DT);
+ Subs.push_back(SoFar);
+ continue;
+ }
-template <class C>
-static const char *parse_ctor_dtor_name(const char *first, const char *last,
- C &db) {
- if (last - first >= 2 && !db.names.empty()) {
- switch (first[0]) {
- case 'C':
- switch (first[1]) {
- case '1':
- case '2':
- case '3':
- case '5':
- if (db.names.empty())
- return first;
- db.names.push_back(base_name(db.names.back().first));
- first += 2;
- db.parsed_ctor_dtor_cv = true;
- break;
- }
- break;
- case 'D':
- switch (first[1]) {
- case '0':
- case '1':
- case '2':
- case '5':
- if (db.names.empty())
- return first;
- db.names.push_back("~" + base_name(db.names.back().first));
- first += 2;
- db.parsed_ctor_dtor_cv = true;
- break;
- }
- break;
+ // ::= <substitution>
+ if (look() == 'S' && look(1) != 't') {
+ Node *S = parseSubstitution();
+ if (S == nullptr)
+ return nullptr;
+ PushComponent(S);
+ if (SoFar != S)
+ Subs.push_back(S);
+ continue;
+ }
+
+ // Parse an <unqualified-name> that's actually a <ctor-dtor-name>.
+ if (look() == 'C' || (look() == 'D' && look(1) != 'C')) {
+ if (SoFar == nullptr)
+ return nullptr;
+ Node *CtorDtor = parseCtorDtorName(SoFar, State);
+ if (CtorDtor == nullptr)
+ return nullptr;
+ PushComponent(CtorDtor);
+ SoFar = parseAbiTags(SoFar);
+ if (SoFar == nullptr)
+ return nullptr;
+ Subs.push_back(SoFar);
+ continue;
}
+
+ // ::= <prefix> <unqualified-name>
+ Node *N = parseUnqualifiedName(State);
+ if (N == nullptr)
+ return nullptr;
+ PushComponent(N);
+ Subs.push_back(SoFar);
}
- return first;
+
+ if (SoFar == nullptr || Subs.empty())
+ return nullptr;
+
+ Subs.pop_back();
+ return SoFar;
}
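
Each prefix component parsed in the loop above is pushed onto Subs so that later "S<seq-id>_" references can reuse it, and the final composite name is popped again before returning. A rough illustration of how the table evolves for "N3Foo3BarE", with plain strings standing in for Node* and assuming these are the first substitution candidates in the table:

#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Subs;
  std::string SoFar;

  // "3Foo": first <unqualified-name>; the running prefix becomes "Foo".
  SoFar = "Foo";
  Subs.push_back(SoFar);            // S_  -> Foo

  // "3Bar": nested under the current prefix.
  SoFar = SoFar + "::" + "Bar";
  Subs.push_back(SoFar);            // S0_ -> Foo::Bar

  // 'E' terminates the nested name; the full name itself is not kept as a
  // substitution candidate, mirroring the Subs.pop_back() above.
  Subs.pop_back();

  assert(SoFar == "Foo::Bar");
  assert(Subs.size() == 1 && Subs[0] == "Foo");
  return 0;
}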
-// <unnamed-type-name> ::= Ut [ <nonnegative number> ] _
-// ::= <closure-type-name>
-//
-// <closure-type-name> ::= Ul <lambda-sig> E [ <nonnegative number> ] _
-//
-// <lambda-sig> ::= <parameter type>+ # Parameter types or "v" if the lambda
-// has no parameters
-
-template <class C>
-static const char *parse_unnamed_type_name(const char *first, const char *last,
- C &db) {
- if (last - first > 2 && first[0] == 'U') {
- char type = first[1];
- switch (type) {
- case 't': {
- db.names.push_back(std::string("'unnamed"));
- const char *t0 = first + 2;
- if (t0 == last) {
- db.names.pop_back();
- return first;
- }
- if (std::isdigit(*t0)) {
- const char *t1 = t0 + 1;
- while (t1 != last && std::isdigit(*t1))
- ++t1;
- db.names.back().first.append(t0, t1);
- t0 = t1;
- }
- db.names.back().first.push_back('\'');
- if (t0 == last || *t0 != '_') {
- db.names.pop_back();
- return first;
- }
- first = t0 + 1;
- } break;
- case 'l': {
- size_t lambda_pos = db.names.size();
- db.names.push_back(std::string("'lambda'("));
- const char *t0 = first + 2;
- if (first[2] == 'v') {
- db.names.back().first += ')';
- ++t0;
- } else {
- bool is_first_it = true;
- while (true) {
- long k0 = static_cast<long>(db.names.size());
- const char *t1 = parse_type(t0, last, db);
- long k1 = static_cast<long>(db.names.size());
- if (t1 == t0)
- break;
- if (k0 >= k1)
- return first;
- // If the call to parse_type above found a pack expansion
- // substitution, then multiple names could have been
- // inserted into the name table. Walk through the names,
- // appending each onto the lambda's parameter list.
- std::for_each(db.names.begin() + k0, db.names.begin() + k1,
- [&](typename C::sub_type::value_type &pair) {
- if (pair.empty())
- return;
- auto &lambda = db.names[lambda_pos].first;
- if (!is_first_it)
- lambda.append(", ");
- is_first_it = false;
- lambda.append(pair.move_full());
- });
- db.names.erase(db.names.begin() + k0, db.names.end());
- t0 = t1;
- }
- if (is_first_it) {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- if (db.names.empty() || db.names.size() - 1 != lambda_pos)
- return first;
- db.names.back().first.append(")");
- }
- if (t0 == last || *t0 != 'E') {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- ++t0;
- if (t0 == last) {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- if (std::isdigit(*t0)) {
- const char *t1 = t0 + 1;
- while (t1 != last && std::isdigit(*t1))
- ++t1;
- db.names.back().first.insert(db.names.back().first.begin() + 7, t0, t1);
- t0 = t1;
- }
- if (t0 == last || *t0 != '_') {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- first = t0 + 1;
- } break;
- }
+// <simple-id> ::= <source-name> [ <template-args> ]
+Node *Db::parseSimpleId() {
+ Node *SN = parseSourceName(/*NameState=*/nullptr);
+ if (SN == nullptr)
+ return nullptr;
+ if (look() == 'I') {
+ Node *TA = parseTemplateArgs();
+ if (TA == nullptr)
+ return nullptr;
+ return make<NameWithTemplateArgs>(SN, TA);
}
- return first;
+ return SN;
}
-// <unqualified-name> ::= <operator-name>
-// ::= <ctor-dtor-name>
-// ::= <source-name>
-// ::= <unnamed-type-name>
+// <destructor-name> ::= <unresolved-type> # e.g., ~T or ~decltype(f())
+// ::= <simple-id> # e.g., ~A<2*N>
+Node *Db::parseDestructorName() {
+ Node *Result;
+ if (std::isdigit(look()))
+ Result = parseSimpleId();
+ else
+ Result = parseUnresolvedType();
+ if (Result == nullptr)
+ return nullptr;
+ return make<DtorName>(Result);
+}
-template <class C>
-static const char *parse_unqualified_name(const char *first, const char *last,
- C &db) {
- if (first != last) {
- const char *t;
- switch (*first) {
- case 'C':
- case 'D':
- t = parse_ctor_dtor_name(first, last, db);
- if (t != first)
- first = t;
- break;
- case 'U':
- t = parse_unnamed_type_name(first, last, db);
- if (t != first)
- first = t;
- break;
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- t = parse_source_name(first, last, db);
- if (t != first)
- first = t;
- break;
- default:
- t = parse_operator_name(first, last, db);
- if (t != first)
- first = t;
- break;
- };
+// <unresolved-type> ::= <template-param>
+// ::= <decltype>
+// ::= <substitution>
+Node *Db::parseUnresolvedType() {
+ if (look() == 'T') {
+ Node *TP = parseTemplateParam();
+ if (TP == nullptr)
+ return nullptr;
+ Subs.push_back(TP);
+ return TP;
}
- return first;
+ if (look() == 'D') {
+ Node *DT = parseDecltype();
+ if (DT == nullptr)
+ return nullptr;
+ Subs.push_back(DT);
+ return DT;
+ }
+ return parseSubstitution();
}
-// <unscoped-name> ::= <unqualified-name>
-// ::= St <unqualified-name> # ::std::
-// extension ::= StL<unqualified-name>
+// <base-unresolved-name> ::= <simple-id> # unresolved name
+// extension ::= <operator-name> # unresolved operator-function-id
+// extension ::= <operator-name> <template-args> # unresolved operator template-id
+// ::= on <operator-name> # unresolved operator-function-id
+// ::= on <operator-name> <template-args> # unresolved operator template-id
+// ::= dn <destructor-name> # destructor or pseudo-destructor;
+// # e.g. ~X or ~X<N-1>
+Node *Db::parseBaseUnresolvedName() {
+ if (std::isdigit(look()))
+ return parseSimpleId();
-template <class C>
-static const char *parse_unscoped_name(const char *first, const char *last,
- C &db) {
- if (last - first >= 2) {
- const char *t0 = first;
- bool St = false;
- if (first[0] == 'S' && first[1] == 't') {
- t0 += 2;
- St = true;
- if (t0 != last && *t0 == 'L')
- ++t0;
- }
- const char *t1 = parse_unqualified_name(t0, last, db);
- if (t1 != t0) {
- if (St) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "std::");
- }
- first = t1;
- }
+ if (consumeIf("dn"))
+ return parseDestructorName();
+
+ consumeIf("on");
+
+ Node *Oper = parseOperatorName(/*NameState=*/nullptr);
+ if (Oper == nullptr)
+ return nullptr;
+ if (look() == 'I') {
+ Node *TA = parseTemplateArgs();
+ if (TA == nullptr)
+ return nullptr;
+ return make<NameWithTemplateArgs>(Oper, TA);
}
- return first;
+ return Oper;
}
-// at <type> # alignof (a type)
+// <unresolved-name>
+// extension ::= srN <unresolved-type> [<template-args>] <unresolved-qualifier-level>* E <base-unresolved-name>
+// ::= [gs] <base-unresolved-name> # x or (with "gs") ::x
+// ::= [gs] sr <unresolved-qualifier-level>+ E <base-unresolved-name>
+// # A::x, N::y, A<T>::z; "gs" means leading "::"
+// ::= sr <unresolved-type> <base-unresolved-name> # T::x / decltype(p)::x
+// extension ::= sr <unresolved-type> <template-args> <base-unresolved-name>
+// # T::N::x /decltype(p)::N::x
+// (ignored) ::= srN <unresolved-type> <unresolved-qualifier-level>+ E <base-unresolved-name>
+//
+// <unresolved-qualifier-level> ::= <simple-id>
+Node *Db::parseUnresolvedName() {
+ Node *SoFar = nullptr;
+
+ // srN <unresolved-type> [<template-args>] <unresolved-qualifier-level>* E <base-unresolved-name>
+ // srN <unresolved-type> <unresolved-qualifier-level>+ E <base-unresolved-name>
+ if (consumeIf("srN")) {
+ SoFar = parseUnresolvedType();
+ if (SoFar == nullptr)
+ return nullptr;
+
+ if (look() == 'I') {
+ Node *TA = parseTemplateArgs();
+ if (TA == nullptr)
+ return nullptr;
+ SoFar = make<NameWithTemplateArgs>(SoFar, TA);
+ }
-template <class C>
-static const char *parse_alignof_type(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 'a' && first[1] == 't') {
- const char *t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first = "alignof (" + db.names.back().move_full() + ")";
- first = t;
+ while (!consumeIf('E')) {
+ Node *Qual = parseSimpleId();
+ if (Qual == nullptr)
+ return nullptr;
+ SoFar = make<QualifiedName>(SoFar, Qual);
}
+
+ Node *Base = parseBaseUnresolvedName();
+ if (Base == nullptr)
+ return nullptr;
+ return make<QualifiedName>(SoFar, Base);
}
- return first;
-}
-// az <expression>                                     # alignof (an expression)
+ bool Global = consumeIf("gs");
-template <class C>
-static const char *parse_alignof_expr(const char *first, const char *last,
- C &db) {
- if (last - first >= 3 && first[0] == 'a' && first[1] == 'z') {
- const char *t = parse_expression(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first = "alignof (" + db.names.back().move_full() + ")";
- first = t;
+ // [gs] <base-unresolved-name> # x or (with "gs") ::x
+ if (!consumeIf("sr")) {
+ SoFar = parseBaseUnresolvedName();
+ if (SoFar == nullptr)
+ return nullptr;
+ if (Global)
+ SoFar = make<GlobalQualifiedName>(SoFar);
+ return SoFar;
+ }
+
+ // [gs] sr <unresolved-qualifier-level>+ E <base-unresolved-name>
+ if (std::isdigit(look())) {
+ do {
+ Node *Qual = parseSimpleId();
+ if (Qual == nullptr)
+ return nullptr;
+ if (SoFar)
+ SoFar = make<QualifiedName>(SoFar, Qual);
+ else if (Global)
+ SoFar = make<GlobalQualifiedName>(Qual);
+ else
+ SoFar = Qual;
+ } while (!consumeIf('E'));
+ }
+ // sr <unresolved-type> <base-unresolved-name>
+ // sr <unresolved-type> <template-args> <base-unresolved-name>
+ else {
+ SoFar = parseUnresolvedType();
+ if (SoFar == nullptr)
+ return nullptr;
+
+ if (look() == 'I') {
+ Node *TA = parseTemplateArgs();
+ if (TA == nullptr)
+ return nullptr;
+ SoFar = make<NameWithTemplateArgs>(SoFar, TA);
}
}
- return first;
+
+ assert(SoFar != nullptr);
+
+ Node *Base = parseBaseUnresolvedName();
+ if (Base == nullptr)
+ return nullptr;
+ return make<QualifiedName>(SoFar, Base);
}
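A few worked decompositions for the main branches (illustrative only, derived from the grammar comment above):
//   "srT_1x"    -> T::x      sr <unresolved-type> <base-unresolved-name>
//   "sr1A1BE1f" -> A::B::f   sr <unresolved-qualifier-level>+ E <base-unresolved-name>
//   "gs3foo"    -> ::foo     [gs] <base-unresolved-name>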
-template <class C>
-static const char *parse_noexcept_expression(const char *first,
- const char *last, C &db) {
- const char *t1 = parse_expression(first, last, db);
- if (t1 != first) {
- if (db.names.empty())
- return first;
- db.names.back().first = "noexcept (" + db.names.back().move_full() + ")";
- first = t1;
- }
- return first;
+// <abi-tags> ::= <abi-tag> [<abi-tags>]
+// <abi-tag> ::= B <source-name>
+Node *Db::parseAbiTags(Node *N) {
+ while (consumeIf('B')) {
+ StringView SN = parseBareSourceName();
+ if (SN.empty())
+ return nullptr;
+ N = make<AbiTagAttr>(N, SN);
+ }
+ return N;
}
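For instance, a function tagged with __attribute__((abi_tag("cxx11"))) should carry a B <source-name> suffix on its mangled name; a hedged sketch:
//   "1fB5cxx11" -> f[abi:cxx11]   (<source-name> "f" followed by one <abi-tag>)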
-template <class C>
-static const char *parse_prefix_expression(const char *first, const char *last,
- const std::string &op,
- C &db) {
- const char *t1 = parse_expression(first, last, db);
- if (t1 != first) {
- if (db.names.empty())
- return first;
- db.names.back().first = op + "(" + db.names.back().move_full() + ")";
- first = t1;
- }
- return first;
+// <number> ::= [n] <non-negative decimal integer>
+StringView Db::parseNumber(bool AllowNegative) {
+ const char *Tmp = First;
+ if (AllowNegative)
+ consumeIf('n');
+ if (numLeft() == 0 || !std::isdigit(*First))
+ return StringView();
+ while (numLeft() != 0 && std::isdigit(*First))
+ ++First;
+ return StringView(Tmp, First);
}
-template <class C>
-static const char *parse_binary_expression(const char *first, const char *last,
- const std::string &op,
- C &db) {
- const char *t1 = parse_expression(first, last, db);
- if (t1 != first) {
- const char *t2 = parse_expression(t1, last, db);
- if (t2 != t1) {
- if (db.names.size() < 2)
- return first;
- auto op2 = db.names.back().move_full();
- db.names.pop_back();
- auto op1 = db.names.back().move_full();
- auto &nm = db.names.back().first;
- nm.clear();
- if (op == ">")
- nm += '(';
- nm += "(" + op1 + ") " + op + " (" + op2 + ")";
- if (op == ">")
- nm += ')';
- first = t2;
- } else if (!db.names.empty())
- db.names.pop_back();
- }
- return first;
+// <positive length number> ::= [0-9]*
+bool Db::parsePositiveInteger(size_t *Out) {
+ *Out = 0;
+ if (look() < '0' || look() > '9')
+ return true;
+ while (look() >= '0' && look() <= '9') {
+ *Out *= 10;
+ *Out += static_cast<size_t>(consume() - '0');
+ }
+ return false;
}
-// <expression> ::= <unary operator-name> <expression>
-//              ::= <binary operator-name> <expression> <expression>
-//              ::= <ternary operator-name> <expression> <expression> <expression>
-//              ::= cl <expression>+ E                          # call
-//              ::= cv <type> <expression>                      # conversion with one argument
-//              ::= cv <type> _ <expression>* E                 # conversion with a different number of arguments
-//              ::= [gs] nw <expression>* _ <type> E            # new (expr-list) type
-//              ::= [gs] nw <expression>* _ <type> <initializer> # new (expr-list) type (init)
-//              ::= [gs] na <expression>* _ <type> E            # new[] (expr-list) type
-//              ::= [gs] na <expression>* _ <type> <initializer> # new[] (expr-list) type (init)
-//              ::= [gs] dl <expression>                        # delete expression
-//              ::= [gs] da <expression>                        # delete[] expression
-//              ::= pp_ <expression>                            # prefix ++
-//              ::= mm_ <expression>                            # prefix --
-//              ::= ti <type>                                   # typeid (type)
-//              ::= te <expression>                             # typeid (expression)
-//              ::= dc <type> <expression>                      # dynamic_cast<type> (expression)
-//              ::= sc <type> <expression>                      # static_cast<type> (expression)
-//              ::= cc <type> <expression>                      # const_cast<type> (expression)
-//              ::= rc <type> <expression>                      # reinterpret_cast<type> (expression)
-//              ::= st <type>                                   # sizeof (a type)
-//              ::= sz <expression>                             # sizeof (an expression)
-//              ::= at <type>                                   # alignof (a type)
-//              ::= az <expression>                             # alignof (an expression)
-//              ::= nx <expression>                             # noexcept (expression)
-//              ::= <template-param>
-//              ::= <function-param>
-//              ::= dt <expression> <unresolved-name>           # expr.name
-//              ::= pt <expression> <unresolved-name>           # expr->name
-//              ::= ds <expression> <expression>                # expr.*expr
-//              ::= sZ <template-param>                         # size of a parameter pack
-//              ::= sZ <function-param>                         # size of a function parameter pack
-//              ::= sp <expression>                             # pack expansion
-//              ::= tw <expression>                             # throw expression
-//              ::= tr                                          # throw with no operand (rethrow)
-//              ::= <unresolved-name>                           # f(p), N::f(p), ::f(p),
-//                                                              # freestanding dependent name (e.g., T::x),
-//                                                              # objectless nonstatic member reference
-//              ::= <expr-primary>
+StringView Db::parseBareSourceName() {
+ size_t Int = 0;
+ if (parsePositiveInteger(&Int) || numLeft() < Int)
+ return StringView();
+ StringView R(First, First + Int);
+ First += Int;
+ return R;
+}
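These helpers handle the length-prefixed identifiers and signed <number>s used throughout the mangling; for example:
//   "3foo" : parsePositiveInteger() reads 3, parseBareSourceName() yields "foo"
//   "n42"  : parseNumber(/*AllowNegative=*/true) returns "n42" -- the leading
//            'n' is the minus sign, as in the integer literal "Lin42E" for -42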
-template <class C>
-static const char *parse_expression(const char *first, const char *last,
- C &db) {
- if (last - first >= 2) {
- const char *t = first;
- bool parsed_gs = false;
- if (last - first >= 4 && t[0] == 'g' && t[1] == 's') {
- t += 2;
- parsed_gs = true;
- }
- switch (*t) {
- case 'L':
- first = parse_expr_primary(first, last, db);
- break;
- case 'T':
- first = parse_template_param(first, last, db);
+// <function-type> ::= [<CV-qualifiers>] [<exception-spec>] [Dx] F [Y] <bare-function-type> [<ref-qualifier>] E
+//
+// <exception-spec> ::= Do # non-throwing exception-specification (e.g., noexcept, throw())
+// ::= DO <expression> E # computed (instantiation-dependent) noexcept
+// ::= Dw <type>+ E # dynamic exception specification with instantiation-dependent types
+//
+// <ref-qualifier> ::= R # & ref-qualifier
+// <ref-qualifier> ::= O # && ref-qualifier
+Node *Db::parseFunctionType() {
+ Qualifiers CVQuals = parseCVQualifiers();
+
+ Node *ExceptionSpec = nullptr;
+ if (consumeIf("Do")) {
+ ExceptionSpec = make<NameType>("noexcept");
+ } else if (consumeIf("DO")) {
+ Node *E = parseExpr();
+ if (E == nullptr || !consumeIf('E'))
+ return nullptr;
+ ExceptionSpec = make<NoexceptSpec>(E);
+ } else if (consumeIf("Dw")) {
+ size_t SpecsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *T = parseType();
+ if (T == nullptr)
+ return nullptr;
+ Names.push_back(T);
+ }
+ ExceptionSpec =
+ make<DynamicExceptionSpec>(popTrailingNodeArray(SpecsBegin));
+ }
+
+ consumeIf("Dx"); // transaction safe
+
+ if (!consumeIf('F'))
+ return nullptr;
+ consumeIf('Y'); // extern "C"
+ Node *ReturnType = parseType();
+ if (ReturnType == nullptr)
+ return nullptr;
+
+ FunctionRefQual ReferenceQualifier = FrefQualNone;
+ size_t ParamsBegin = Names.size();
+ while (true) {
+ if (consumeIf('E'))
break;
- case 'f':
- first = parse_function_param(first, last, db);
+ if (consumeIf('v'))
+ continue;
+ if (consumeIf("RE")) {
+ ReferenceQualifier = FrefQualLValue;
break;
- case 'a':
- switch (t[1]) {
- case 'a':
- t = parse_binary_expression(first + 2, last, "&&", db);
- if (t != first + 2)
- first = t;
- break;
- case 'd':
- t = parse_prefix_expression(first + 2, last, "&", db);
- if (t != first + 2)
- first = t;
- break;
- case 'n':
- t = parse_binary_expression(first + 2, last, "&", db);
- if (t != first + 2)
- first = t;
- break;
- case 'N':
- t = parse_binary_expression(first + 2, last, "&=", db);
- if (t != first + 2)
- first = t;
- break;
- case 'S':
- t = parse_binary_expression(first + 2, last, "=", db);
- if (t != first + 2)
- first = t;
- break;
- case 't':
- first = parse_alignof_type(first, last, db);
- break;
- case 'z':
- first = parse_alignof_expr(first, last, db);
- break;
- }
+ }
+ if (consumeIf("OE")) {
+ ReferenceQualifier = FrefQualRValue;
break;
- case 'c':
- switch (t[1]) {
- case 'c':
- first = parse_const_cast_expr(first, last, db);
- break;
- case 'l':
- first = parse_call_expr(first, last, db);
- break;
- case 'm':
- t = parse_binary_expression(first + 2, last, ",", db);
- if (t != first + 2)
- first = t;
- break;
- case 'o':
- t = parse_prefix_expression(first + 2, last, "~", db);
- if (t != first + 2)
- first = t;
- break;
- case 'v':
- first = parse_conversion_expr(first, last, db);
- break;
+ }
+ Node *T = parseType();
+ if (T == nullptr)
+ return nullptr;
+ Names.push_back(T);
+ }
+
+ NodeArray Params = popTrailingNodeArray(ParamsBegin);
+ return make<FunctionType>(ReturnType, Params, CVQuals,
+ ReferenceQualifier, ExceptionSpec);
+}
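A few hand-worked cases (illustrative; the printed forms are approximate):
//   "FivE"   -> int ()            F, return type i, no parameters (v), E
//   "PFivE"  -> int (*)()         the same <function-type> behind a pointer
//   "KFvvRE" -> void () const &   leading K = member CV-qualifier, "RE" = '&'
//   "DoFvvE" -> void () noexcept  "Do" is the non-throwing exception-spec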
+
+// extension:
+// <vector-type> ::= Dv <positive dimension number> _ <extended element type>
+// ::= Dv [<dimension expression>] _ <element type>
+// <extended element type> ::= <element type>
+// ::= p # AltiVec vector pixel
+Node *Db::parseVectorType() {
+ if (!consumeIf("Dv"))
+ return nullptr;
+ if (look() >= '1' && look() <= '9') {
+ StringView DimensionNumber = parseNumber();
+ if (!consumeIf('_'))
+ return nullptr;
+ if (consumeIf('p'))
+ return make<VectorType>(DimensionNumber);
+ Node *ElemType = parseType();
+ if (ElemType == nullptr)
+ return nullptr;
+ return make<VectorType>(ElemType, DimensionNumber);
+ }
+
+ if (!consumeIf('_')) {
+ Node *DimExpr = parseExpr();
+ if (!DimExpr)
+ return nullptr;
+ if (!consumeIf('_'))
+ return nullptr;
+ Node *ElemType = parseType();
+ if (!ElemType)
+ return nullptr;
+ return make<VectorType>(ElemType, DimExpr);
+ }
+ Node *ElemType = parseType();
+ if (!ElemType)
+ return nullptr;
+ return make<VectorType>(ElemType, StringView());
+}
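For example, Clang encodes a 4-element float vector (the vector_size extension) roughly as below; the exact printed form is up to the VectorType printer:
//   "Dv4_f" : Dv, dimension number 4, '_', element type f (float)
//   a non-literal dimension would take the <dimension expression> branch instead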
+
+// <decltype> ::= Dt <expression> E # decltype of an id-expression or class member access (C++0x)
+// ::= DT <expression> E # decltype of an expression (C++0x)
+Node *Db::parseDecltype() {
+ if (!consumeIf('D'))
+ return nullptr;
+ if (!consumeIf('t') && !consumeIf('T'))
+ return nullptr;
+ Node *E = parseExpr();
+ if (E == nullptr)
+ return nullptr;
+ if (!consumeIf('E'))
+ return nullptr;
+ return make<EnclosingExpr>("decltype(", E, ")");
+}
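A familiar example: a trailing return type decltype(a + b) over the first two function parameters is encoded using the expression grammar that appears later in this file:
//   "DTplfp_fp0_E" : DT <expression> E, where the expression is
//                    pl (operator+) fp_ (1st function param) fp0_ (2nd)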
+
+// <array-type> ::= A <positive dimension number> _ <element type>
+// ::= A [<dimension expression>] _ <element type>
+Node *Db::parseArrayType() {
+ if (!consumeIf('A'))
+ return nullptr;
+
+ if (std::isdigit(look())) {
+ StringView Dimension = parseNumber();
+ if (!consumeIf('_'))
+ return nullptr;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<ArrayType>(Ty, Dimension);
+ }
+
+ if (!consumeIf('_')) {
+ Node *DimExpr = parseExpr();
+ if (DimExpr == nullptr)
+ return nullptr;
+ if (!consumeIf('_'))
+ return nullptr;
+ Node *ElementType = parseType();
+ if (ElementType == nullptr)
+ return nullptr;
+ return make<ArrayType>(ElementType, DimExpr);
+ }
+
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<ArrayType>(Ty);
+}
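Two quick illustrations of the branches above:
//   "A10_i" -> int [10]   'A', dimension number "10", '_', element type i
//   "A_i"   -> int []     no dimension, just '_' then the element type
//   e.g. void f(int (&)[10]) mangles as _Z1fRA10_i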
+
+// <pointer-to-member-type> ::= M <class type> <member type>
+Node *Db::parsePointerToMemberType() {
+ if (!consumeIf('M'))
+ return nullptr;
+ Node *ClassType = parseType();
+ if (ClassType == nullptr)
+ return nullptr;
+ Node *MemberType = parseType();
+ if (MemberType == nullptr)
+ return nullptr;
+ return make<PointerToMemberType>(ClassType, MemberType);
+}
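For instance:
//   "M1Ci"    -> int C::*        M, <class type> "1C", <member type> i
//   "M1CFvvE" -> void (C::*)()   the member type is itself a <function-type>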
+
+// <class-enum-type> ::= <name> # non-dependent type name, dependent type name, or dependent typename-specifier
+// ::= Ts <name> # dependent elaborated type specifier using 'struct' or 'class'
+// ::= Tu <name> # dependent elaborated type specifier using 'union'
+// ::= Te <name> # dependent elaborated type specifier using 'enum'
+Node *Db::parseClassEnumType() {
+ StringView ElabSpef;
+ if (consumeIf("Ts"))
+ ElabSpef = "struct";
+ else if (consumeIf("Tu"))
+ ElabSpef = "union";
+ else if (consumeIf("Te"))
+ ElabSpef = "enum";
+
+ Node *Name = parseName();
+ if (Name == nullptr)
+ return nullptr;
+
+ if (!ElabSpef.empty())
+ return make<ElaboratedTypeSpefType>(ElabSpef, Name);
+
+ return Name;
+}
+
+// <qualified-type> ::= <qualifiers> <type>
+// <qualifiers> ::= <extended-qualifier>* <CV-qualifiers>
+// <extended-qualifier> ::= U <source-name> [<template-args>] # vendor extended type qualifier
+Node *Db::parseQualifiedType() {
+ if (consumeIf('U')) {
+ StringView Qual = parseBareSourceName();
+ if (Qual.empty())
+ return nullptr;
+
+ // FIXME parse the optional <template-args> here!
+
+ // extension ::= U <objc-name> <objc-type> # objc-type<identifier>
+ if (Qual.startsWith("objcproto")) {
+ StringView ProtoSourceName = Qual.dropFront(std::strlen("objcproto"));
+ StringView Proto;
+ {
+ SwapAndRestore<const char *> SaveFirst(First, ProtoSourceName.begin()),
+ SaveLast(Last, ProtoSourceName.end());
+ Proto = parseBareSourceName();
}
+ if (Proto.empty())
+ return nullptr;
+ Node *Child = parseQualifiedType();
+ if (Child == nullptr)
+ return nullptr;
+ return make<ObjCProtoName>(Child, Proto);
+ }
+
+ Node *Child = parseQualifiedType();
+ if (Child == nullptr)
+ return nullptr;
+ return make<VendorExtQualType>(Child, Qual);
+ }
+
+ Qualifiers Quals = parseCVQualifiers();
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ if (Quals != QualNone)
+ Ty = make<QualType>(Ty, Quals);
+ return Ty;
+}
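Sketches for the two halves; the "AS1" spelling is an assumption based on how Clang usually mangles address-space qualifiers:
//   "Kc"     -> char const    <CV-qualifiers> then <type>
//   "U3AS1i" -> int carrying the vendor qualifier "AS1" (VendorExtQualType)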
+
+// <type> ::= <builtin-type>
+// ::= <qualified-type>
+// ::= <function-type>
+// ::= <class-enum-type>
+// ::= <array-type>
+// ::= <pointer-to-member-type>
+// ::= <template-param>
+// ::= <template-template-param> <template-args>
+// ::= <decltype>
+// ::= P <type> # pointer
+// ::= R <type> # l-value reference
+// ::= O <type> # r-value reference (C++11)
+// ::= C <type> # complex pair (C99)
+// ::= G <type> # imaginary (C99)
+// ::= <substitution> # See Compression below
+// extension ::= U <objc-name> <objc-type> # objc-type<identifier>
+// extension ::= <vector-type> # <vector-type> starts with Dv
+//
+// <objc-name> ::= <k0 number> objcproto <k1 number> <identifier> # k0 = 9 + <number of digits in k1> + k1
+// <objc-type> ::= <source-name> # PU<11+>objcproto 11objc_object<source-name> 11objc_object -> id<source-name>
+Node *Db::parseType() {
+ Node *Result = nullptr;
+
+ switch (look()) {
+ // ::= <qualified-type>
+ case 'r':
+ case 'V':
+ case 'K': {
+ unsigned AfterQuals = 0;
+ if (look(AfterQuals) == 'r') ++AfterQuals;
+ if (look(AfterQuals) == 'V') ++AfterQuals;
+ if (look(AfterQuals) == 'K') ++AfterQuals;
+
+ if (look(AfterQuals) == 'F' ||
+ (look(AfterQuals) == 'D' &&
+ (look(AfterQuals + 1) == 'o' || look(AfterQuals + 1) == 'O' ||
+ look(AfterQuals + 1) == 'w' || look(AfterQuals + 1) == 'x'))) {
+ Result = parseFunctionType();
break;
+ }
+ LLVM_FALLTHROUGH;
+ }
+ case 'U': {
+ Result = parseQualifiedType();
+ break;
+ }
+ // <builtin-type> ::= v # void
+ case 'v':
+ ++First;
+ return make<NameType>("void");
+ // ::= w # wchar_t
+ case 'w':
+ ++First;
+ return make<NameType>("wchar_t");
+ // ::= b # bool
+ case 'b':
+ ++First;
+ return make<NameType>("bool");
+ // ::= c # char
+ case 'c':
+ ++First;
+ return make<NameType>("char");
+ // ::= a # signed char
+ case 'a':
+ ++First;
+ return make<NameType>("signed char");
+ // ::= h # unsigned char
+ case 'h':
+ ++First;
+ return make<NameType>("unsigned char");
+ // ::= s # short
+ case 's':
+ ++First;
+ return make<NameType>("short");
+ // ::= t # unsigned short
+ case 't':
+ ++First;
+ return make<NameType>("unsigned short");
+ // ::= i # int
+ case 'i':
+ ++First;
+ return make<NameType>("int");
+ // ::= j # unsigned int
+ case 'j':
+ ++First;
+ return make<NameType>("unsigned int");
+ // ::= l # long
+ case 'l':
+ ++First;
+ return make<NameType>("long");
+ // ::= m # unsigned long
+ case 'm':
+ ++First;
+ return make<NameType>("unsigned long");
+ // ::= x # long long, __int64
+ case 'x':
+ ++First;
+ return make<NameType>("long long");
+ // ::= y # unsigned long long, __int64
+ case 'y':
+ ++First;
+ return make<NameType>("unsigned long long");
+ // ::= n # __int128
+ case 'n':
+ ++First;
+ return make<NameType>("__int128");
+ // ::= o # unsigned __int128
+ case 'o':
+ ++First;
+ return make<NameType>("unsigned __int128");
+ // ::= f # float
+ case 'f':
+ ++First;
+ return make<NameType>("float");
+ // ::= d # double
+ case 'd':
+ ++First;
+ return make<NameType>("double");
+ // ::= e # long double, __float80
+ case 'e':
+ ++First;
+ return make<NameType>("long double");
+ // ::= g # __float128
+ case 'g':
+ ++First;
+ return make<NameType>("__float128");
+ // ::= z # ellipsis
+ case 'z':
+ ++First;
+ return make<NameType>("...");
+
+ // <builtin-type> ::= u <source-name> # vendor extended type
+ case 'u': {
+ ++First;
+ StringView Res = parseBareSourceName();
+ if (Res.empty())
+ return nullptr;
+ return make<NameType>(Res);
+ }
+ case 'D':
+ switch (look(1)) {
+ // ::= Dd # IEEE 754r decimal floating point (64 bits)
case 'd':
- switch (t[1]) {
- case 'a': {
- const char *t1 = parse_expression(t + 2, last, db);
- if (t1 != t + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first =
- (parsed_gs ? std::string("::") : std::string()) + "delete[] " +
- db.names.back().move_full();
- first = t1;
- }
- } break;
- case 'c':
- first = parse_dynamic_cast_expr(first, last, db);
- break;
- case 'e':
- t = parse_prefix_expression(first + 2, last, "*", db);
- if (t != first + 2)
- first = t;
- break;
- case 'l': {
- const char *t1 = parse_expression(t + 2, last, db);
- if (t1 != t + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first =
- (parsed_gs ? std::string("::") : std::string()) + "delete " +
- db.names.back().move_full();
- first = t1;
- }
- } break;
- case 'n':
- return parse_unresolved_name(first, last, db);
- case 's':
- first = parse_dot_star_expr(first, last, db);
- break;
- case 't':
- first = parse_dot_expr(first, last, db);
- break;
- case 'v':
- t = parse_binary_expression(first + 2, last, "/", db);
- if (t != first + 2)
- first = t;
- break;
- case 'V':
- t = parse_binary_expression(first + 2, last, "/=", db);
- if (t != first + 2)
- first = t;
- break;
- }
- break;
+ First += 2;
+ return make<NameType>("decimal64");
+ // ::= De # IEEE 754r decimal floating point (128 bits)
case 'e':
- switch (t[1]) {
- case 'o':
- t = parse_binary_expression(first + 2, last, "^", db);
- if (t != first + 2)
- first = t;
- break;
- case 'O':
- t = parse_binary_expression(first + 2, last, "^=", db);
- if (t != first + 2)
- first = t;
- break;
- case 'q':
- t = parse_binary_expression(first + 2, last, "==", db);
- if (t != first + 2)
- first = t;
- break;
- }
- break;
- case 'g':
- switch (t[1]) {
- case 'e':
- t = parse_binary_expression(first + 2, last, ">=", db);
- if (t != first + 2)
- first = t;
- break;
- case 't':
- t = parse_binary_expression(first + 2, last, ">", db);
- if (t != first + 2)
- first = t;
- break;
- }
- break;
+ First += 2;
+ return make<NameType>("decimal128");
+ // ::= Df # IEEE 754r decimal floating point (32 bits)
+ case 'f':
+ First += 2;
+ return make<NameType>("decimal32");
+ // ::= Dh # IEEE 754r half-precision floating point (16 bits)
+ case 'h':
+ First += 2;
+ return make<NameType>("decimal16");
+ // ::= Di # char32_t
case 'i':
- if (t[1] == 'x') {
- const char *t1 = parse_expression(first + 2, last, db);
- if (t1 != first + 2) {
- const char *t2 = parse_expression(t1, last, db);
- if (t2 != t1) {
- if (db.names.size() < 2)
- return first;
- auto op2 = db.names.back().move_full();
- db.names.pop_back();
- auto op1 = db.names.back().move_full();
- db.names.back() = "(" + op1 + ")[" + op2 + "]";
- first = t2;
- } else if (!db.names.empty())
- db.names.pop_back();
- }
- }
- break;
- case 'l':
- switch (t[1]) {
- case 'e':
- t = parse_binary_expression(first + 2, last, "<=", db);
- if (t != first + 2)
- first = t;
- break;
- case 's':
- t = parse_binary_expression(first + 2, last, "<<", db);
- if (t != first + 2)
- first = t;
- break;
- case 'S':
- t = parse_binary_expression(first + 2, last, "<<=", db);
- if (t != first + 2)
- first = t;
- break;
- case 't':
- t = parse_binary_expression(first + 2, last, "<", db);
- if (t != first + 2)
- first = t;
- break;
- }
- break;
- case 'm':
- switch (t[1]) {
- case 'i':
- t = parse_binary_expression(first + 2, last, "-", db);
- if (t != first + 2)
- first = t;
- break;
- case 'I':
- t = parse_binary_expression(first + 2, last, "-=", db);
- if (t != first + 2)
- first = t;
- break;
- case 'l':
- t = parse_binary_expression(first + 2, last, "*", db);
- if (t != first + 2)
- first = t;
- break;
- case 'L':
- t = parse_binary_expression(first + 2, last, "*=", db);
- if (t != first + 2)
- first = t;
- break;
- case 'm':
- if (first + 2 != last && first[2] == '_') {
- t = parse_prefix_expression(first + 3, last, "--", db);
- if (t != first + 3)
- first = t;
- } else {
- const char *t1 = parse_expression(first + 2, last, db);
- if (t1 != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back() = "(" + db.names.back().move_full() + ")--";
- first = t1;
- }
- }
- break;
- }
- break;
+ First += 2;
+ return make<NameType>("char32_t");
+ // ::= Ds # char16_t
+ case 's':
+ First += 2;
+ return make<NameType>("char16_t");
+ // ::= Da # auto (in dependent new-expressions)
+ case 'a':
+ First += 2;
+ return make<NameType>("auto");
+ // ::= Dc # decltype(auto)
+ case 'c':
+ First += 2;
+ return make<NameType>("decltype(auto)");
+ // ::= Dn # std::nullptr_t (i.e., decltype(nullptr))
case 'n':
- switch (t[1]) {
- case 'a':
- case 'w':
- first = parse_new_expr(first, last, db);
- break;
- case 'e':
- t = parse_binary_expression(first + 2, last, "!=", db);
- if (t != first + 2)
- first = t;
- break;
- case 'g':
- t = parse_prefix_expression(first + 2, last, "-", db);
- if (t != first + 2)
- first = t;
- break;
- case 't':
- t = parse_prefix_expression(first + 2, last, "!", db);
- if (t != first + 2)
- first = t;
- break;
- case 'x':
- t = parse_noexcept_expression(first + 2, last, db);
- if (t != first + 2)
- first = t;
- break;
- }
- break;
- case 'o':
- switch (t[1]) {
- case 'n':
- return parse_unresolved_name(first, last, db);
- case 'o':
- t = parse_binary_expression(first + 2, last, "||", db);
- if (t != first + 2)
- first = t;
- break;
- case 'r':
- t = parse_binary_expression(first + 2, last, "|", db);
- if (t != first + 2)
- first = t;
- break;
- case 'R':
- t = parse_binary_expression(first + 2, last, "|=", db);
- if (t != first + 2)
- first = t;
- break;
- }
+ First += 2;
+ return make<NameType>("std::nullptr_t");
+
+ // ::= <decltype>
+ case 't':
+ case 'T': {
+ Result = parseDecltype();
break;
- case 'p':
- switch (t[1]) {
- case 'm':
- t = parse_binary_expression(first + 2, last, "->*", db);
- if (t != first + 2)
- first = t;
- break;
- case 'l':
- t = parse_binary_expression(first + 2, last, "+", db);
- if (t != first + 2)
- first = t;
- break;
- case 'L':
- t = parse_binary_expression(first + 2, last, "+=", db);
- if (t != first + 2)
- first = t;
- break;
- case 'p':
- if (first + 2 != last && first[2] == '_') {
- t = parse_prefix_expression(first + 3, last, "++", db);
- if (t != first + 3)
- first = t;
- } else {
- const char *t1 = parse_expression(first + 2, last, db);
- if (t1 != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back() = "(" + db.names.back().move_full() + ")++";
- first = t1;
- }
- }
- break;
- case 's':
- t = parse_prefix_expression(first + 2, last, "+", db);
- if (t != first + 2)
- first = t;
- break;
- case 't':
- first = parse_arrow_expr(first, last, db);
- break;
- }
+ }
+ // extension ::= <vector-type> # <vector-type> starts with Dv
+ case 'v': {
+ Result = parseVectorType();
break;
- case 'q':
- if (t[1] == 'u') {
- const char *t1 = parse_expression(first + 2, last, db);
- if (t1 != first + 2) {
- const char *t2 = parse_expression(t1, last, db);
- if (t2 != t1) {
- const char *t3 = parse_expression(t2, last, db);
- if (t3 != t2) {
- if (db.names.size() < 3)
- return first;
- auto op3 = db.names.back().move_full();
- db.names.pop_back();
- auto op2 = db.names.back().move_full();
- db.names.pop_back();
- auto op1 = db.names.back().move_full();
- db.names.back() = "(" + op1 + ") ? (" + op2 + ") : (" + op3 + ")";
- first = t3;
- } else {
- if (db.names.size() < 2)
- return first;
- db.names.pop_back();
- db.names.pop_back();
- }
- } else if (!db.names.empty())
- db.names.pop_back();
- }
- }
+ }
+ // ::= Dp <type> # pack expansion (C++0x)
+ case 'p': {
+ First += 2;
+ Node *Child = parseType();
+ if (!Child)
+ return nullptr;
+ Result = make<ParameterPackExpansion>(Child);
break;
- case 'r':
- switch (t[1]) {
- case 'c':
- first = parse_reinterpret_cast_expr(first, last, db);
- break;
- case 'm':
- t = parse_binary_expression(first + 2, last, "%", db);
- if (t != first + 2)
- first = t;
- break;
- case 'M':
- t = parse_binary_expression(first + 2, last, "%=", db);
- if (t != first + 2)
- first = t;
- break;
- case 's':
- t = parse_binary_expression(first + 2, last, ">>", db);
- if (t != first + 2)
- first = t;
- break;
- case 'S':
- t = parse_binary_expression(first + 2, last, ">>=", db);
- if (t != first + 2)
- first = t;
- break;
- }
+ }
+ // Exception specifier on a function type.
+ case 'o':
+ case 'O':
+ case 'w':
+ // Transaction safe function type.
+ case 'x':
+ Result = parseFunctionType();
break;
- case 's':
- switch (t[1]) {
- case 'c':
- first = parse_static_cast_expr(first, last, db);
- break;
- case 'p':
- first = parse_pack_expansion(first, last, db);
- break;
- case 'r':
- return parse_unresolved_name(first, last, db);
- case 't':
- first = parse_sizeof_type_expr(first, last, db);
- break;
- case 'z':
- first = parse_sizeof_expr_expr(first, last, db);
- break;
- case 'Z':
- if (last - t >= 3) {
- switch (t[2]) {
- case 'T':
- first = parse_sizeof_param_pack_expr(first, last, db);
- break;
- case 'f':
- first = parse_sizeof_function_param_pack_expr(first, last, db);
- break;
- }
- }
- break;
- }
+ }
+ break;
+ // ::= <function-type>
+ case 'F': {
+ Result = parseFunctionType();
+ break;
+ }
+ // ::= <array-type>
+ case 'A': {
+ Result = parseArrayType();
+ break;
+ }
+ // ::= <pointer-to-member-type>
+ case 'M': {
+ Result = parsePointerToMemberType();
+ break;
+ }
+ // ::= <template-param>
+ case 'T': {
+ // This could be an elaborate type specifier on a <class-enum-type>.
+ if (look(1) == 's' || look(1) == 'u' || look(1) == 'e') {
+ Result = parseClassEnumType();
break;
- case 't':
- switch (t[1]) {
- case 'e':
- case 'i':
- first = parse_typeid_expr(first, last, db);
- break;
- case 'r':
- db.names.push_back("throw");
- first += 2;
- break;
- case 'w':
- first = parse_throw_expr(first, last, db);
+ }
+
+ Result = parseTemplateParam();
+ if (Result == nullptr)
+ return nullptr;
+
+ // Result could be either of:
+ // <type> ::= <template-param>
+ // <type> ::= <template-template-param> <template-args>
+ //
+ // <template-template-param> ::= <template-param>
+ // ::= <substitution>
+ //
+ // If this is followed by some <template-args>, and we're permitted to
+ // parse them, take the second production.
+
+ if (TryToParseTemplateArgs && look() == 'I') {
+ Node *TA = parseTemplateArgs();
+ if (TA == nullptr)
+ return nullptr;
+ Result = make<NameWithTemplateArgs>(Result, TA);
+ }
+ break;
+ }
+ // ::= P <type> # pointer
+ case 'P': {
+ ++First;
+ Node *Ptr = parseType();
+ if (Ptr == nullptr)
+ return nullptr;
+ Result = make<PointerType>(Ptr);
+ break;
+ }
+ // ::= R <type> # l-value reference
+ case 'R': {
+ ++First;
+ Node *Ref = parseType();
+ if (Ref == nullptr)
+ return nullptr;
+ Result = make<ReferenceType>(Ref, ReferenceKind::LValue);
+ break;
+ }
+ // ::= O <type> # r-value reference (C++11)
+ case 'O': {
+ ++First;
+ Node *Ref = parseType();
+ if (Ref == nullptr)
+ return nullptr;
+ Result = make<ReferenceType>(Ref, ReferenceKind::RValue);
+ break;
+ }
+ // ::= C <type> # complex pair (C99)
+ case 'C': {
+ ++First;
+ Node *P = parseType();
+ if (P == nullptr)
+ return nullptr;
+ Result = make<PostfixQualifiedType>(P, " complex");
+ break;
+ }
+ // ::= G <type> # imaginary (C99)
+ case 'G': {
+ ++First;
+ Node *P = parseType();
+ if (P == nullptr)
+ return P;
+ Result = make<PostfixQualifiedType>(P, " imaginary");
+ break;
+ }
+ // ::= <substitution> # See Compression below
+ case 'S': {
+ if (look(1) && look(1) != 't') {
+ Node *Sub = parseSubstitution();
+ if (Sub == nullptr)
+ return nullptr;
+
+ // Sub could be either of:
+ // <type> ::= <substitution>
+ // <type> ::= <template-template-param> <template-args>
+ //
+ // <template-template-param> ::= <template-param>
+ // ::= <substitution>
+ //
+ // If this is followed by some <template-args>, and we're permitted to
+ // parse them, take the second production.
+
+ if (TryToParseTemplateArgs && look() == 'I') {
+ Node *TA = parseTemplateArgs();
+ if (TA == nullptr)
+ return nullptr;
+ Result = make<NameWithTemplateArgs>(Sub, TA);
break;
}
- break;
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- return parse_unresolved_name(first, last, db);
- }
- }
- return first;
-}
-// <template-arg> ::= <type>                   # type or template
-//                ::= X <expression> E         # expression
-//                ::= <expr-primary>           # simple expressions
-//                ::= J <template-arg>* E      # argument pack
-//                ::= LZ <encoding> E          # extension
-
-template <class C>
-static const char *parse_template_arg(const char *first, const char *last,
- C &db) {
- if (first != last) {
- const char *t;
- switch (*first) {
- case 'X':
- t = parse_expression(first + 1, last, db);
- if (t != first + 1) {
- if (t != last && *t == 'E')
- first = t + 1;
- }
- break;
- case 'J':
- t = first + 1;
- if (t == last)
- return first;
- while (*t != 'E') {
- const char *t1 = parse_template_arg(t, last, db);
- if (t1 == t)
- return first;
- t = t1;
- }
- first = t + 1;
- break;
- case 'L':
- // <expr-primary> or LZ <encoding> E
- if (first + 1 != last && first[1] == 'Z') {
- t = parse_encoding(first + 2, last, db);
- if (t != first + 2 && t != last && *t == 'E')
- first = t + 1;
- } else
- first = parse_expr_primary(first, last, db);
- break;
- default:
- // <type>
- first = parse_type(first, last, db);
- break;
+ // If all we parsed was a substitution, don't re-insert into the
+ // substitution table.
+ return Sub;
}
+ LLVM_FALLTHROUGH;
+ }
+ // ::= <class-enum-type>
+ default: {
+ Result = parseClassEnumType();
+ break;
}
- return first;
+ }
+
+ // If we parsed a type, insert it into the substitution table. Note that all
+ // <builtin-type>s and <substitution>s have already bailed out, because they
+ // don't get substitutions.
+ if (Result != nullptr)
+ Subs.push_back(Result);
+ return Result;
}
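As an end-to-end illustration of the dispatch above, the parameter type in _Z1fPKc, i.e. f(char const*), parses as:
//   'P' -> pointer, recurse for the pointee
//     'K' -> parseQualifiedType, Quals = const
//       'c' -> builtin "char" (returns early, so it never becomes a substitution)
//   The QualType and PointerType results are pushed onto Subs in inner-to-outer
//   order, matching the ABI's substitution candidates "Kc" and "PKc".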
-// <template-args> ::= I <template-arg>* E
-//     extension: the ABI says <template-arg>+
+Node *Db::parsePrefixExpr(StringView Kind) {
+ Node *E = parseExpr();
+ if (E == nullptr)
+ return nullptr;
+ return make<PrefixExpr>(Kind, E);
+}
-template <class C>
-static const char *parse_template_args(const char *first, const char *last,
- C &db) {
- if (last - first >= 2 && *first == 'I') {
- if (db.tag_templates)
- db.template_param.back().clear();
- const char *t = first + 1;
- std::string args("<");
- while (*t != 'E') {
- if (db.tag_templates)
- db.template_param.emplace_back();
- size_t k0 = db.names.size();
- const char *t1 = parse_template_arg(t, last, db);
- size_t k1 = db.names.size();
- if (db.tag_templates)
- db.template_param.pop_back();
- if (t1 == t || t1 == last)
- return first;
- if (db.tag_templates) {
- db.template_param.back().emplace_back();
- for (size_t k = k0; k < k1; ++k)
- db.template_param.back().back().push_back(db.names[k]);
- }
- for (size_t k = k0; k < k1; ++k) {
- if (args.size() > 1)
- args += ", ";
- args += db.names[k].move_full();
- }
- for (; k1 > k0; --k1)
- if (!db.names.empty())
- db.names.pop_back();
- t = t1;
- }
- first = t + 1;
- if (args.back() != '>')
- args += ">";
- else
- args += " >";
- db.names.push_back(std::move(args));
+Node *Db::parseBinaryExpr(StringView Kind) {
+ Node *LHS = parseExpr();
+ if (LHS == nullptr)
+ return nullptr;
+ Node *RHS = parseExpr();
+ if (RHS == nullptr)
+ return nullptr;
+ return make<BinaryExpr>(LHS, Kind, RHS);
+}
+
+Node *Db::parseIntegerLiteral(StringView Lit) {
+ StringView Tmp = parseNumber(true);
+ if (!Tmp.empty() && consumeIf('E'))
+ return make<IntegerExpr>(Lit, Tmp);
+ return nullptr;
+}
+
+// <CV-Qualifiers> ::= [r] [V] [K]
+Qualifiers Db::parseCVQualifiers() {
+ Qualifiers CVR = QualNone;
+ if (consumeIf('r'))
+ addQualifiers(CVR, QualRestrict);
+ if (consumeIf('V'))
+ addQualifiers(CVR, QualVolatile);
+ if (consumeIf('K'))
+ addQualifiers(CVR, QualConst);
+ return CVR;
+}
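The qualifiers must appear in r, V, K order; for example:
//   "rVK" -> restrict volatile const (all three)
//   "VKi" inside a type: _Z1fPVKi -> f(int const volatile*)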
+
+// <function-param> ::= fp <top-level CV-Qualifiers> _ # L == 0, first parameter
+// ::= fp <top-level CV-Qualifiers> <parameter-2 non-negative number> _ # L == 0, second and later parameters
+// ::= fL <L-1 non-negative number> p <top-level CV-Qualifiers> _ # L > 0, first parameter
+// ::= fL <L-1 non-negative number> p <top-level CV-Qualifiers> <parameter-2 non-negative number> _ # L > 0, second and later parameters
+Node *Db::parseFunctionParam() {
+ if (consumeIf("fp")) {
+ parseCVQualifiers();
+ StringView Num = parseNumber();
+ if (!consumeIf('_'))
+ return nullptr;
+ return make<FunctionParam>(Num);
+ }
+ if (consumeIf("fL")) {
+ if (parseNumber().empty())
+ return nullptr;
+ if (!consumeIf('p'))
+ return nullptr;
+ parseCVQualifiers();
+ StringView Num = parseNumber();
+ if (!consumeIf('_'))
+ return nullptr;
+ return make<FunctionParam>(Num);
}
- return first;
+ return nullptr;
}
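These references appear inside decltype and other dependent expressions; roughly:
//   "fp_"  -> the first parameter of the current prototype (L == 0)
//   "fp0_" -> the second parameter, "fp1_" the third, and so on
//   "fL<n>p..._" forms refer to parameters of an enclosing prototype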
-// <nested-name> ::= N [<CV-qualifiers>] [<ref-qualifier>] <prefix> <unqualified-name> E
-//               ::= N [<CV-qualifiers>] [<ref-qualifier>] <template-prefix> <template-args> E
-//
-// <prefix> ::= <prefix> <unqualified-name>
-//          ::= <template-prefix> <template-args>
-//          ::= <template-param>
-//          ::= <decltype>
-//          ::=                                 # empty
-//          ::= <substitution>
-//          ::= <prefix> <data-member-prefix>
-//  extension ::= L
-//
-// <template-prefix> ::= <prefix> <template unqualified-name>
-//                   ::= <template-param>
-//                   ::= <substitution>
+// [gs] nw <expression>* _ <type> E # new (expr-list) type
+// [gs] nw <expression>* _ <type> <initializer> # new (expr-list) type (init)
+// [gs] na <expression>* _ <type> E # new[] (expr-list) type
+// [gs] na <expression>* _ <type> <initializer> # new[] (expr-list) type (init)
+// <initializer> ::= pi <expression>* E # parenthesized initialization
+Node *Db::parseNewExpr() {
+ bool Global = consumeIf("gs");
+ bool IsArray = look(1) == 'a';
+ if (!consumeIf("nw") && !consumeIf("na"))
+ return nullptr;
+ size_t Exprs = Names.size();
+ while (!consumeIf('_')) {
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return nullptr;
+ Names.push_back(Ex);
+ }
+ NodeArray ExprList = popTrailingNodeArray(Exprs);
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return Ty;
+ if (consumeIf("pi")) {
+ size_t InitsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *Init = parseExpr();
+ if (Init == nullptr)
+ return Init;
+ Names.push_back(Init);
+ }
+ NodeArray Inits = popTrailingNodeArray(InitsBegin);
+ return make<NewExpr>(ExprList, Ty, Inits, Global, IsArray);
+ } else if (!consumeIf('E'))
+ return nullptr;
+ return make<NewExpr>(ExprList, Ty, NodeArray(), Global, IsArray);
+}
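One illustrative decomposition (a dependent new-expression; the spelling is hand-assembled from the grammar above):
//   "nw_1TpiLi1EE" -> new T(1)
//     nw, no placement exprs before '_', <type> "1T", then "pi" Li1E "E" initializer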
-template <class C>
-static const char *parse_nested_name(const char *first, const char *last, C &db,
- bool *ends_with_template_args) {
- if (first != last && *first == 'N') {
- unsigned cv;
- const char *t0 = parse_cv_qualifiers(first + 1, last, cv);
- if (t0 == last)
- return first;
- db.ref = 0;
- if (*t0 == 'R') {
- db.ref = 1;
- ++t0;
- } else if (*t0 == 'O') {
- db.ref = 2;
- ++t0;
- }
- db.names.emplace_back();
- if (last - t0 >= 2 && t0[0] == 'S' && t0[1] == 't') {
- t0 += 2;
- db.names.back().first = "std";
- }
- if (t0 == last) {
- db.names.pop_back();
- return first;
- }
- bool pop_subs = false;
- bool component_ends_with_template_args = false;
- while (*t0 != 'E') {
- component_ends_with_template_args = false;
- const char *t1;
- switch (*t0) {
- case 'S':
- if (t0 + 1 != last && t0[1] == 't')
- goto do_parse_unqualified_name;
- t1 = parse_substitution(t0, last, db);
- if (t1 != t0 && t1 != last) {
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- if (!db.names.back().first.empty()) {
- db.names.back().first += "::" + name;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- } else
- db.names.back().first = name;
- pop_subs = true;
- t0 = t1;
- } else
- return first;
- break;
- case 'T':
- t1 = parse_template_param(t0, last, db);
- if (t1 != t0 && t1 != last) {
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- if (!db.names.back().first.empty())
- db.names.back().first += "::" + name;
- else
- db.names.back().first = name;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- pop_subs = true;
- t0 = t1;
- } else
- return first;
- break;
- case 'D':
- if (t0 + 1 != last && t0[1] != 't' && t0[1] != 'T')
- goto do_parse_unqualified_name;
- t1 = parse_decltype(t0, last, db);
- if (t1 != t0 && t1 != last) {
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- if (!db.names.back().first.empty())
- db.names.back().first += "::" + name;
- else
- db.names.back().first = name;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- pop_subs = true;
- t0 = t1;
- } else
- return first;
- break;
- case 'I':
- t1 = parse_template_args(t0, last, db);
- if (t1 != t0 && t1 != last) {
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back().first += name;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- t0 = t1;
- component_ends_with_template_args = true;
- } else
- return first;
- break;
- case 'L':
- if (++t0 == last)
- return first;
- break;
- default:
- do_parse_unqualified_name:
- t1 = parse_unqualified_name(t0, last, db);
- if (t1 != t0 && t1 != last) {
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- if (!db.names.back().first.empty())
- db.names.back().first += "::" + name;
- else
- db.names.back().first = name;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- pop_subs = true;
- t0 = t1;
- } else
- return first;
- }
+// cv <type> <expression> # conversion with one argument
+// cv <type> _ <expression>* E # conversion with a different number of arguments
+Node *Db::parseConversionExpr() {
+ if (!consumeIf("cv"))
+ return nullptr;
+ Node *Ty;
+ {
+ SwapAndRestore<bool> SaveTemp(TryToParseTemplateArgs, false);
+ Ty = parseType();
+ }
+
+ if (Ty == nullptr)
+ return nullptr;
+
+ if (consumeIf('_')) {
+ size_t ExprsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *E = parseExpr();
+ if (E == nullptr)
+ return E;
+ Names.push_back(E);
}
- first = t0 + 1;
- db.cv = cv;
- if (pop_subs && !db.subs.empty())
- db.subs.pop_back();
- if (ends_with_template_args)
- *ends_with_template_args = component_ends_with_template_args;
+ NodeArray Exprs = popTrailingNodeArray(ExprsBegin);
+ return make<ConversionExpr>(Ty, Exprs);
}
- return first;
-}
-// <discriminator> := _ <non-negative number> # when number < 10
-// := __ <non-negative number> _ # when number >= 10
-// extension := decimal-digit+ # at the end of string
+ Node *E[1] = {parseExpr()};
+ if (E[0] == nullptr)
+ return nullptr;
+ return make<ConversionExpr>(Ty, makeNodeArray(E, E + 1));
+}
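Two hedged sketches of the branches:
//   "cvifp_" -> int(<param#1>)   cv <type> <expression>      (single argument)
//   "cvT__E" -> T()              cv <type> _ <expression>* E (zero arguments)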
-static const char *parse_discriminator(const char *first, const char *last) {
- // parse but ignore discriminator
- if (first != last) {
- if (*first == '_') {
- const char *t1 = first + 1;
- if (t1 != last) {
- if (std::isdigit(*t1))
- first = t1 + 1;
- else if (*t1 == '_') {
- for (++t1; t1 != last && std::isdigit(*t1); ++t1)
- ;
- if (t1 != last && *t1 == '_')
- first = t1 + 1;
- }
- }
- } else if (std::isdigit(*first)) {
- const char *t1 = first + 1;
- for (; t1 != last && std::isdigit(*t1); ++t1)
- ;
- if (t1 == last)
- first = last;
+// <expr-primary> ::= L <type> <value number> E # integer literal
+// ::= L <type> <value float> E # floating literal
+// ::= L <string type> E # string literal
+// ::= L <nullptr type> E # nullptr literal (i.e., "LDnE")
+// FIXME: ::= L <type> <real-part float> _ <imag-part float> E # complex floating point literal (C 2000)
+// ::= L <mangled-name> E # external name
+Node *Db::parseExprPrimary() {
+ if (!consumeIf('L'))
+ return nullptr;
+ switch (look()) {
+ case 'w':
+ ++First;
+ return parseIntegerLiteral("wchar_t");
+ case 'b':
+ if (consumeIf("b0E"))
+ return make<BoolExpr>(0);
+ if (consumeIf("b1E"))
+ return make<BoolExpr>(1);
+ return nullptr;
+ case 'c':
+ ++First;
+ return parseIntegerLiteral("char");
+ case 'a':
+ ++First;
+ return parseIntegerLiteral("signed char");
+ case 'h':
+ ++First;
+ return parseIntegerLiteral("unsigned char");
+ case 's':
+ ++First;
+ return parseIntegerLiteral("short");
+ case 't':
+ ++First;
+ return parseIntegerLiteral("unsigned short");
+ case 'i':
+ ++First;
+ return parseIntegerLiteral("");
+ case 'j':
+ ++First;
+ return parseIntegerLiteral("u");
+ case 'l':
+ ++First;
+ return parseIntegerLiteral("l");
+ case 'm':
+ ++First;
+ return parseIntegerLiteral("ul");
+ case 'x':
+ ++First;
+ return parseIntegerLiteral("ll");
+ case 'y':
+ ++First;
+ return parseIntegerLiteral("ull");
+ case 'n':
+ ++First;
+ return parseIntegerLiteral("__int128");
+ case 'o':
+ ++First;
+ return parseIntegerLiteral("unsigned __int128");
+ case 'f':
+ ++First;
+ return parseFloatingLiteral<float>();
+ case 'd':
+ ++First;
+ return parseFloatingLiteral<double>();
+ case 'e':
+ ++First;
+ return parseFloatingLiteral<long double>();
+ case '_':
+ if (consumeIf("_Z")) {
+ Node *R = parseEncoding();
+ if (R != nullptr && consumeIf('E'))
+ return R;
}
+ return nullptr;
+ case 'T':
+ // Invalid mangled name per
+ // http://sourcerytools.com/pipermail/cxx-abi-dev/2011-August/002422.html
+ return nullptr;
+ default: {
+ // might be named type
+ Node *T = parseType();
+ if (T == nullptr)
+ return nullptr;
+ StringView N = parseNumber();
+ if (!N.empty()) {
+ if (!consumeIf('E'))
+ return nullptr;
+ return make<IntegerCastExpr>(T, N);
+ }
+ if (consumeIf('E'))
+ return T;
+ return nullptr;
+ }
}
- return first;
}
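Common literal encodings handled here (illustrative):
//   "Li42E" -> 42     int literal; the "" prefix above means no suffix
//   "Lj1E"  -> 1u     unsigned int
//   "Lb1E"  -> true   and "Lb0E" -> false
//   "Lin5E" -> -5     the leading 'n' in the <number> encodes the minus sign
//   "L_Z<encoding>E"  -> an external name (a nested mangled entity)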
-// <local-name> := Z <function encoding> E <entity name> [<discriminator>]
-//              := Z <function encoding> E s [<discriminator>]
-//              := Z <function encoding> Ed [ <parameter number> ] _ <entity name>
-
-template <class C>
-static const char *parse_local_name(const char *first, const char *last, C &db,
- bool *ends_with_template_args) {
- if (first != last && *first == 'Z') {
- const char *t = parse_encoding(first + 1, last, db);
- if (t != first + 1 && t != last && *t == 'E' && ++t != last) {
- switch (*t) {
- case 's':
- first = parse_discriminator(t + 1, last);
- if (db.names.empty())
- return first;
- db.names.back().first.append("::string literal");
- break;
- case 'd':
- if (++t != last) {
- const char *t1 = parse_number(t, last);
- if (t1 != last && *t1 == '_') {
- t = t1 + 1;
- t1 = parse_name(t, last, db, ends_with_template_args);
- if (t1 != t) {
- if (db.names.size() < 2)
- return first;
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back().first.append("::");
- db.names.back().first.append(name);
- first = t1;
- } else if (!db.names.empty())
- db.names.pop_back();
- }
- }
- break;
- default: {
- const char *t1 = parse_name(t, last, db, ends_with_template_args);
- if (t1 != t) {
- // parse but ignore discriminator
- first = parse_discriminator(t1, last);
- if (db.names.size() < 2)
- return first;
- auto name = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back().first.append("::");
- db.names.back().first.append(name);
- } else if (!db.names.empty())
- db.names.pop_back();
- } break;
- }
+// <braced-expression> ::= <expression>
+// ::= di <field source-name> <braced-expression> # .name = expr
+// ::= dx <index expression> <braced-expression> # [expr] = expr
+// ::= dX <range begin expression> <range end expression> <braced-expression>
+Node *Db::parseBracedExpr() {
+ if (look() == 'd') {
+ switch (look(1)) {
+ case 'i': {
+ First += 2;
+ Node *Field = parseSourceName(/*NameState=*/nullptr);
+ if (Field == nullptr)
+ return nullptr;
+ Node *Init = parseBracedExpr();
+ if (Init == nullptr)
+ return nullptr;
+ return make<BracedExpr>(Field, Init, /*isArray=*/false);
+ }
+ case 'x': {
+ First += 2;
+ Node *Index = parseExpr();
+ if (Index == nullptr)
+ return nullptr;
+ Node *Init = parseBracedExpr();
+ if (Init == nullptr)
+ return nullptr;
+ return make<BracedExpr>(Index, Init, /*isArray=*/true);
+ }
+ case 'X': {
+ First += 2;
+ Node *RangeBegin = parseExpr();
+ if (RangeBegin == nullptr)
+ return nullptr;
+ Node *RangeEnd = parseExpr();
+ if (RangeEnd == nullptr)
+ return nullptr;
+ Node *Init = parseBracedExpr();
+ if (Init == nullptr)
+ return nullptr;
+ return make<BracedRangeExpr>(RangeBegin, RangeEnd, Init);
+ }
}
}
- return first;
+ return parseExpr();
}
-// <name> ::= <nested-name> // N
-// ::= <local-name> # See Scope Encoding below // Z
-// ::= <unscoped-template-name> <template-args>
-// ::= <unscoped-name>
+// (not yet in the spec)
+// <fold-expr> ::= fL <binary-operator-name> <expression> <expression>
+// ::= fR <binary-operator-name> <expression> <expression>
+// ::= fl <binary-operator-name> <expression>
+// ::= fr <binary-operator-name> <expression>
+Node *Db::parseFoldExpr() {
+ if (!consumeIf('f'))
+ return nullptr;
-// <unscoped-template-name> ::= <unscoped-name>
-// ::= <substitution>
+ char FoldKind = look();
+ bool IsLeftFold, HasInitializer;
+ HasInitializer = FoldKind == 'L' || FoldKind == 'R';
+ if (FoldKind == 'l' || FoldKind == 'L')
+ IsLeftFold = true;
+ else if (FoldKind == 'r' || FoldKind == 'R')
+ IsLeftFold = false;
+ else
+ return nullptr;
+ ++First;
+
+ // FIXME: This map is duplicated in parseOperatorName and parseExpr.
+ StringView OperatorName;
+ if (consumeIf("aa")) OperatorName = "&&";
+ else if (consumeIf("an")) OperatorName = "&";
+ else if (consumeIf("aN")) OperatorName = "&=";
+ else if (consumeIf("aS")) OperatorName = "=";
+ else if (consumeIf("cm")) OperatorName = ",";
+ else if (consumeIf("ds")) OperatorName = ".*";
+ else if (consumeIf("dv")) OperatorName = "/";
+ else if (consumeIf("dV")) OperatorName = "/=";
+ else if (consumeIf("eo")) OperatorName = "^";
+ else if (consumeIf("eO")) OperatorName = "^=";
+ else if (consumeIf("eq")) OperatorName = "==";
+ else if (consumeIf("ge")) OperatorName = ">=";
+ else if (consumeIf("gt")) OperatorName = ">";
+ else if (consumeIf("le")) OperatorName = "<=";
+ else if (consumeIf("ls")) OperatorName = "<<";
+ else if (consumeIf("lS")) OperatorName = "<<=";
+ else if (consumeIf("lt")) OperatorName = "<";
+ else if (consumeIf("mi")) OperatorName = "-";
+ else if (consumeIf("mI")) OperatorName = "-=";
+ else if (consumeIf("ml")) OperatorName = "*";
+ else if (consumeIf("mL")) OperatorName = "*=";
+ else if (consumeIf("ne")) OperatorName = "!=";
+ else if (consumeIf("oo")) OperatorName = "||";
+ else if (consumeIf("or")) OperatorName = "|";
+ else if (consumeIf("oR")) OperatorName = "|=";
+ else if (consumeIf("pl")) OperatorName = "+";
+ else if (consumeIf("pL")) OperatorName = "+=";
+ else if (consumeIf("rm")) OperatorName = "%";
+ else if (consumeIf("rM")) OperatorName = "%=";
+ else if (consumeIf("rs")) OperatorName = ">>";
+ else if (consumeIf("rS")) OperatorName = ">>=";
+ else return nullptr;
+
+ Node *Pack = parseExpr(), *Init = nullptr;
+ if (Pack == nullptr)
+ return nullptr;
+ if (HasInitializer) {
+ Init = parseExpr();
+ if (Init == nullptr)
+ return nullptr;
+ }
-template <class C>
-static const char *parse_name(const char *first, const char *last, C &db,
- bool *ends_with_template_args) {
- if (last - first >= 2) {
- const char *t0 = first;
- // extension: ignore L here
- if (*t0 == 'L')
- ++t0;
- switch (*t0) {
- case 'N': {
- const char *t1 = parse_nested_name(t0, last, db, ends_with_template_args);
- if (t1 != t0)
- first = t1;
- break;
+ if (IsLeftFold && Init)
+ std::swap(Pack, Init);
+
+ return make<FoldExpr>(IsLeftFold, OperatorName, Pack, Init);
+}
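Mapping back to C++17 fold-expressions (hand-worked; these should correspond to):
//   "flpl" <pack>  -> (... + pack)   unary left fold
//   "frpl" <pack>  -> (pack + ...)   unary right fold
//   "fLpl"/"fRpl" carry a second <expression> for the init operand of a
//   binary fold such as (init + ... + pack).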
+
+// <expression> ::= <unary operator-name> <expression>
+// ::= <binary operator-name> <expression> <expression>
+// ::= <ternary operator-name> <expression> <expression> <expression>
+// ::= cl <expression>+ E # call
+// ::= cv <type> <expression> # conversion with one argument
+// ::= cv <type> _ <expression>* E # conversion with a different number of arguments
+// ::= [gs] nw <expression>* _ <type> E # new (expr-list) type
+// ::= [gs] nw <expression>* _ <type> <initializer> # new (expr-list) type (init)
+// ::= [gs] na <expression>* _ <type> E # new[] (expr-list) type
+// ::= [gs] na <expression>* _ <type> <initializer> # new[] (expr-list) type (init)
+// ::= [gs] dl <expression> # delete expression
+// ::= [gs] da <expression> # delete[] expression
+// ::= pp_ <expression> # prefix ++
+// ::= mm_ <expression> # prefix --
+// ::= ti <type> # typeid (type)
+// ::= te <expression> # typeid (expression)
+// ::= dc <type> <expression> # dynamic_cast<type> (expression)
+// ::= sc <type> <expression> # static_cast<type> (expression)
+// ::= cc <type> <expression> # const_cast<type> (expression)
+// ::= rc <type> <expression> # reinterpret_cast<type> (expression)
+// ::= st <type> # sizeof (a type)
+// ::= sz <expression> # sizeof (an expression)
+// ::= at <type> # alignof (a type)
+// ::= az <expression> # alignof (an expression)
+// ::= nx <expression> # noexcept (expression)
+// ::= <template-param>
+// ::= <function-param>
+// ::= dt <expression> <unresolved-name> # expr.name
+// ::= pt <expression> <unresolved-name> # expr->name
+// ::= ds <expression> <expression> # expr.*expr
+// ::= sZ <template-param> # size of a parameter pack
+// ::= sZ <function-param> # size of a function parameter pack
+// ::= sP <template-arg>* E # sizeof...(T), size of a captured template parameter pack from an alias template
+// ::= sp <expression> # pack expansion
+// ::= tw <expression> # throw expression
+// ::= tr # throw with no operand (rethrow)
+// ::= <unresolved-name> # f(p), N::f(p), ::f(p),
+// # freestanding dependent name (e.g., T::x),
+// # objectless nonstatic member reference
+// ::= fL <binary-operator-name> <expression> <expression>
+// ::= fR <binary-operator-name> <expression> <expression>
+// ::= fl <binary-operator-name> <expression>
+// ::= fr <binary-operator-name> <expression>
+// ::= <expr-primary>
+Node *Db::parseExpr() {
+ bool Global = consumeIf("gs");
+ if (numLeft() < 2)
+ return nullptr;
+
+ switch (*First) {
+ case 'L':
+ return parseExprPrimary();
+ case 'T':
+ return parseTemplateParam();
+ case 'f': {
+ // Disambiguate a fold expression from a <function-param>.
+ if (look(1) == 'p' || (look(1) == 'L' && std::isdigit(look(2))))
+ return parseFunctionParam();
+ return parseFoldExpr();
+ }
+ case 'a':
+ switch (First[1]) {
+ case 'a':
+ First += 2;
+ return parseBinaryExpr("&&");
+ case 'd':
+ First += 2;
+ return parsePrefixExpr("&");
+ case 'n':
+ First += 2;
+ return parseBinaryExpr("&");
+ case 'N':
+ First += 2;
+ return parseBinaryExpr("&=");
+ case 'S':
+ First += 2;
+ return parseBinaryExpr("=");
+ case 't': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<EnclosingExpr>("alignof (", Ty, ")");
}
- case 'Z': {
- const char *t1 = parse_local_name(t0, last, db, ends_with_template_args);
- if (t1 != t0)
- first = t1;
- break;
+ case 'z': {
+ First += 2;
+ Node *Ty = parseExpr();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<EnclosingExpr>("alignof (", Ty, ")");
}
- default: {
- const char *t1 = parse_unscoped_name(t0, last, db);
- if (t1 != t0) {
- if (t1 != last &&
- *t1 == 'I') // <unscoped-template-name> <template-args>
- {
- if (db.names.empty())
- return first;
- db.subs.push_back(typename C::sub_type(1, db.names.back()));
- t0 = t1;
- t1 = parse_template_args(t0, last, db);
- if (t1 != t0) {
- if (db.names.size() < 2)
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back().first += tmp;
- first = t1;
- if (ends_with_template_args)
- *ends_with_template_args = true;
- }
- } else // <unscoped-name>
- first = t1;
- } else { // try <substitution> <template-args>
- t1 = parse_substitution(t0, last, db);
- if (t1 != t0 && t1 != last && *t1 == 'I') {
- t0 = t1;
- t1 = parse_template_args(t0, last, db);
- if (t1 != t0) {
- if (db.names.size() < 2)
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back().first += tmp;
- first = t1;
- if (ends_with_template_args)
- *ends_with_template_args = true;
- }
- }
+ }
+ return nullptr;
+ case 'c':
+ switch (First[1]) {
+ // cc <type> <expression> # const_cast<type>(expression)
+ case 'c': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return Ty;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<CastExpr>("const_cast", Ty, Ex);
+ }
+ // cl <expression>+ E # call
+ case 'l': {
+ First += 2;
+ Node *Callee = parseExpr();
+ if (Callee == nullptr)
+ return Callee;
+ size_t ExprsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *E = parseExpr();
+ if (E == nullptr)
+ return E;
+ Names.push_back(E);
}
- break;
+ return make<CallExpr>(Callee, popTrailingNodeArray(ExprsBegin));
+ }
+ case 'm':
+ First += 2;
+ return parseBinaryExpr(",");
+ case 'o':
+ First += 2;
+ return parsePrefixExpr("~");
+ case 'v':
+ return parseConversionExpr();
+ }
+ return nullptr;
+ case 'd':
+ switch (First[1]) {
+ case 'a': {
+ First += 2;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<DeleteExpr>(Ex, Global, /*is_array=*/true);
+ }
+ case 'c': {
+ First += 2;
+ Node *T = parseType();
+ if (T == nullptr)
+ return T;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<CastExpr>("dynamic_cast", T, Ex);
}
+ case 'e':
+ First += 2;
+ return parsePrefixExpr("*");
+ case 'l': {
+ First += 2;
+ Node *E = parseExpr();
+ if (E == nullptr)
+ return E;
+ return make<DeleteExpr>(E, Global, /*is_array=*/false);
+ }
+ case 'n':
+ return parseUnresolvedName();
+ case 's': {
+ First += 2;
+ Node *LHS = parseExpr();
+ if (LHS == nullptr)
+ return nullptr;
+ Node *RHS = parseExpr();
+ if (RHS == nullptr)
+ return nullptr;
+ return make<MemberExpr>(LHS, ".*", RHS);
+ }
+ case 't': {
+ First += 2;
+ Node *LHS = parseExpr();
+ if (LHS == nullptr)
+ return LHS;
+ Node *RHS = parseExpr();
+ if (RHS == nullptr)
+ return nullptr;
+ return make<MemberExpr>(LHS, ".", RHS);
+ }
+ case 'v':
+ First += 2;
+ return parseBinaryExpr("/");
+ case 'V':
+ First += 2;
+ return parseBinaryExpr("/=");
+ }
+ return nullptr;
+ case 'e':
+ switch (First[1]) {
+ case 'o':
+ First += 2;
+ return parseBinaryExpr("^");
+ case 'O':
+ First += 2;
+ return parseBinaryExpr("^=");
+ case 'q':
+ First += 2;
+ return parseBinaryExpr("==");
}
+ return nullptr;
+ case 'g':
+ switch (First[1]) {
+ case 'e':
+ First += 2;
+ return parseBinaryExpr(">=");
+ case 't':
+ First += 2;
+ return parseBinaryExpr(">");
+ }
+ return nullptr;
+ case 'i':
+ switch (First[1]) {
+ case 'x': {
+ First += 2;
+ Node *Base = parseExpr();
+ if (Base == nullptr)
+ return nullptr;
+ Node *Index = parseExpr();
+ if (Index == nullptr)
+ return Index;
+ return make<ArraySubscriptExpr>(Base, Index);
+ }
+ case 'l': {
+ First += 2;
+ size_t InitsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *E = parseBracedExpr();
+ if (E == nullptr)
+ return nullptr;
+ Names.push_back(E);
+ }
+ return make<InitListExpr>(nullptr, popTrailingNodeArray(InitsBegin));
+ }
+ }
+ return nullptr;
+ case 'l':
+ switch (First[1]) {
+ case 'e':
+ First += 2;
+ return parseBinaryExpr("<=");
+ case 's':
+ First += 2;
+ return parseBinaryExpr("<<");
+ case 'S':
+ First += 2;
+ return parseBinaryExpr("<<=");
+ case 't':
+ First += 2;
+ return parseBinaryExpr("<");
+ }
+ return nullptr;
+ case 'm':
+ switch (First[1]) {
+ case 'i':
+ First += 2;
+ return parseBinaryExpr("-");
+ case 'I':
+ First += 2;
+ return parseBinaryExpr("-=");
+ case 'l':
+ First += 2;
+ return parseBinaryExpr("*");
+ case 'L':
+ First += 2;
+ return parseBinaryExpr("*=");
+ case 'm':
+ First += 2;
+ if (consumeIf('_'))
+ return parsePrefixExpr("--");
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return nullptr;
+ return make<PostfixExpr>(Ex, "--");
+ }
+ return nullptr;
+ case 'n':
+ switch (First[1]) {
+ case 'a':
+ case 'w':
+ return parseNewExpr();
+ case 'e':
+ First += 2;
+ return parseBinaryExpr("!=");
+ case 'g':
+ First += 2;
+ return parsePrefixExpr("-");
+ case 't':
+ First += 2;
+ return parsePrefixExpr("!");
+ case 'x':
+ First += 2;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<EnclosingExpr>("noexcept (", Ex, ")");
+ }
+ return nullptr;
+ case 'o':
+ switch (First[1]) {
+ case 'n':
+ return parseUnresolvedName();
+ case 'o':
+ First += 2;
+ return parseBinaryExpr("||");
+ case 'r':
+ First += 2;
+ return parseBinaryExpr("|");
+ case 'R':
+ First += 2;
+ return parseBinaryExpr("|=");
+ }
+ return nullptr;
+ case 'p':
+ switch (First[1]) {
+ case 'm':
+ First += 2;
+ return parseBinaryExpr("->*");
+ case 'l':
+ First += 2;
+ return parseBinaryExpr("+");
+ case 'L':
+ First += 2;
+ return parseBinaryExpr("+=");
+ case 'p': {
+ First += 2;
+ if (consumeIf('_'))
+ return parsePrefixExpr("++");
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<PostfixExpr>(Ex, "++");
+ }
+ case 's':
+ First += 2;
+ return parsePrefixExpr("+");
+ case 't': {
+ First += 2;
+ Node *L = parseExpr();
+ if (L == nullptr)
+ return nullptr;
+ Node *R = parseExpr();
+ if (R == nullptr)
+ return nullptr;
+ return make<MemberExpr>(L, "->", R);
+ }
+ }
+ return nullptr;
+ case 'q':
+ if (First[1] == 'u') {
+ First += 2;
+ Node *Cond = parseExpr();
+ if (Cond == nullptr)
+ return nullptr;
+ Node *LHS = parseExpr();
+ if (LHS == nullptr)
+ return nullptr;
+ Node *RHS = parseExpr();
+ if (RHS == nullptr)
+ return nullptr;
+ return make<ConditionalExpr>(Cond, LHS, RHS);
+ }
+ return nullptr;
+ case 'r':
+ switch (First[1]) {
+ case 'c': {
+ First += 2;
+ Node *T = parseType();
+ if (T == nullptr)
+ return T;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<CastExpr>("reinterpret_cast", T, Ex);
+ }
+ case 'm':
+ First += 2;
+ return parseBinaryExpr("%");
+ case 'M':
+ First += 2;
+ return parseBinaryExpr("%=");
+ case 's':
+ First += 2;
+ return parseBinaryExpr(">>");
+ case 'S':
+ First += 2;
+ return parseBinaryExpr(">>=");
+ }
+ return nullptr;
+ case 's':
+ switch (First[1]) {
+ case 'c': {
+ First += 2;
+ Node *T = parseType();
+ if (T == nullptr)
+ return T;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<CastExpr>("static_cast", T, Ex);
+ }
+ case 'p': {
+ First += 2;
+ Node *Child = parseExpr();
+ if (Child == nullptr)
+ return nullptr;
+ return make<ParameterPackExpansion>(Child);
+ }
+ case 'r':
+ return parseUnresolvedName();
+ case 't': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return Ty;
+ return make<EnclosingExpr>("sizeof (", Ty, ")");
+ }
+ case 'z': {
+ First += 2;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<EnclosingExpr>("sizeof (", Ex, ")");
+ }
+ case 'Z':
+ First += 2;
+ if (look() == 'T') {
+ Node *R = parseTemplateParam();
+ if (R == nullptr)
+ return nullptr;
+ return make<SizeofParamPackExpr>(R);
+ } else if (look() == 'f') {
+ Node *FP = parseFunctionParam();
+ if (FP == nullptr)
+ return nullptr;
+ return make<EnclosingExpr>("sizeof... (", FP, ")");
+ }
+ return nullptr;
+ case 'P': {
+ First += 2;
+ size_t ArgsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *Arg = parseTemplateArg();
+ if (Arg == nullptr)
+ return nullptr;
+ Names.push_back(Arg);
+ }
+ return make<EnclosingExpr>(
+ "sizeof... (", make<NodeArrayNode>(popTrailingNodeArray(ArgsBegin)),
+ ")");
+ }
+ }
+ return nullptr;
+ case 't':
+ switch (First[1]) {
+ case 'e': {
+ First += 2;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return Ex;
+ return make<EnclosingExpr>("typeid (", Ex, ")");
+ }
+ case 'i': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return Ty;
+ return make<EnclosingExpr>("typeid (", Ty, ")");
+ }
+ case 'l': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ size_t InitsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *E = parseBracedExpr();
+ if (E == nullptr)
+ return nullptr;
+ Names.push_back(E);
+ }
+ return make<InitListExpr>(Ty, popTrailingNodeArray(InitsBegin));
+ }
+ case 'r':
+ First += 2;
+ return make<NameType>("throw");
+ case 'w': {
+ First += 2;
+ Node *Ex = parseExpr();
+ if (Ex == nullptr)
+ return nullptr;
+ return make<ThrowExpr>(Ex);
+ }
+ }
+ return nullptr;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ return parseUnresolvedName();
}
- return first;
+ return nullptr;
}
// <call-offset> ::= h <nv-offset> _
@@ -3811,26 +4380,15 @@ static const char *parse_name(const char *first, const char *last, C &db,
//
// <v-offset> ::= <offset number> _ <virtual offset number>
// # virtual base override, with vcall offset
-
-static const char *parse_call_offset(const char *first, const char *last) {
- if (first != last) {
- switch (*first) {
- case 'h': {
- const char *t = parse_number(first + 1, last);
- if (t != first + 1 && t != last && *t == '_')
- first = t + 1;
- } break;
- case 'v': {
- const char *t = parse_number(first + 1, last);
- if (t != first + 1 && t != last && *t == '_') {
- const char *t2 = parse_number(++t, last);
- if (t2 != t && t2 != last && *t2 == '_')
- first = t2 + 1;
- }
- } break;
- }
- }
- return first;
+bool Db::parseCallOffset() {
+  // Just scan through the call offset; we never add this information to the
+  // output.
+ if (consumeIf('h'))
+ return parseNumber(true).empty() || !consumeIf('_');
+ if (consumeIf('v'))
+ return parseNumber(true).empty() || !consumeIf('_') ||
+ parseNumber(true).empty() || !consumeIf('_');
+ return true;
}
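// Illustrative examples for the grammar above (not part of the patch, shown
// only to clarify the inverted return convention: parseCallOffset() returns
// true on failure):
//
//   "h4_"     non-virtual offset 4                     -> returns false (ok)
//   "v8_16_"  virtual base override, offset 8, vcall 16 -> returns false (ok)
//   "hx_"     no <number> after 'h'                     -> returns true (error)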
// <special-name> ::= TV <type> # virtual table
@@ -3843,471 +4401,800 @@ static const char *parse_call_offset(const char *first, const char *last) {
// # second call-offset is result adjustment
// ::= T <call-offset> <base encoding>
// # base is the nominal target function of thunk
-// ::= GV <object name> # Guard variable for one-time
-// initialization
+// ::= GV <object name> # Guard variable for one-time initialization
// # No <type>
// ::= TW <object name> # Thread-local wrapper
// ::= TH <object name> # Thread-local initialization
-// extension ::= TC <first type> <number> _ <second type> # construction
-// vtable for second-in-first
+// ::= GR <object name> _ # First temporary
+// ::= GR <object name> <seq-id> _ # Subsequent temporaries
+// extension ::= TC <first type> <number> _ <second type> # construction vtable for second-in-first
// extension ::= GR <object name> # reference temporary for object
+Node *Db::parseSpecialName() {
+ switch (look()) {
+ case 'T':
+ switch (look(1)) {
+ // TV <type> # virtual table
+ case 'V': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<SpecialName>("vtable for ", Ty);
+ }
+ // TT <type> # VTT structure (construction vtable index)
+ case 'T': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<SpecialName>("VTT for ", Ty);
+ }
+ // TI <type> # typeinfo structure
+ case 'I': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<SpecialName>("typeinfo for ", Ty);
+ }
+ // TS <type> # typeinfo name (null-terminated byte string)
+ case 'S': {
+ First += 2;
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ return make<SpecialName>("typeinfo name for ", Ty);
+ }
+ // Tc <call-offset> <call-offset> <base encoding>
+ case 'c': {
+ First += 2;
+ if (parseCallOffset() || parseCallOffset())
+ return nullptr;
+ Node *Encoding = parseEncoding();
+ if (Encoding == nullptr)
+ return nullptr;
+ return make<SpecialName>("covariant return thunk to ", Encoding);
+ }
+ // extension ::= TC <first type> <number> _ <second type>
+ // # construction vtable for second-in-first
+ case 'C': {
+ First += 2;
+ Node *FirstType = parseType();
+ if (FirstType == nullptr)
+ return nullptr;
+ if (parseNumber(true).empty() || !consumeIf('_'))
+ return nullptr;
+ Node *SecondType = parseType();
+ if (SecondType == nullptr)
+ return nullptr;
+ return make<CtorVtableSpecialName>(SecondType, FirstType);
+ }
+ // TW <object name> # Thread-local wrapper
+ case 'W': {
+ First += 2;
+ Node *Name = parseName();
+ if (Name == nullptr)
+ return nullptr;
+ return make<SpecialName>("thread-local wrapper routine for ", Name);
+ }
+ // TH <object name> # Thread-local initialization
+ case 'H': {
+ First += 2;
+ Node *Name = parseName();
+ if (Name == nullptr)
+ return nullptr;
+ return make<SpecialName>("thread-local initialization routine for ", Name);
+ }
+ // T <call-offset> <base encoding>
+ default: {
+ ++First;
+ bool IsVirt = look() == 'v';
+ if (parseCallOffset())
+ return nullptr;
+ Node *BaseEncoding = parseEncoding();
+ if (BaseEncoding == nullptr)
+ return nullptr;
+ if (IsVirt)
+ return make<SpecialName>("virtual thunk to ", BaseEncoding);
+ else
+ return make<SpecialName>("non-virtual thunk to ", BaseEncoding);
+ }
+ }
+ case 'G':
+ switch (look(1)) {
+ // GV <object name> # Guard variable for one-time initialization
+ case 'V': {
+ First += 2;
+ Node *Name = parseName();
+ if (Name == nullptr)
+ return nullptr;
+ return make<SpecialName>("guard variable for ", Name);
+ }
+ // GR <object name> # reference temporary for object
+ // GR <object name> _ # First temporary
+ // GR <object name> <seq-id> _ # Subsequent temporaries
+ case 'R': {
+ First += 2;
+ Node *Name = parseName();
+ if (Name == nullptr)
+ return nullptr;
+ size_t Count;
+ bool ParsedSeqId = !parseSeqId(&Count);
+ if (!consumeIf('_') && ParsedSeqId)
+ return nullptr;
+ return make<SpecialName>("reference temporary for ", Name);
+ }
+ }
+ }
+ return nullptr;
+}
-template <class C>
-static const char *parse_special_name(const char *first, const char *last,
- C &db) {
- if (last - first > 2) {
- const char *t;
- switch (*first) {
- case 'T':
- switch (first[1]) {
- case 'V':
- // TV <type> # virtual table
- t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "vtable for ");
- first = t;
- }
- break;
- case 'T':
- // TT <type> # VTT structure (construction vtable index)
- t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "VTT for ");
- first = t;
- }
- break;
- case 'I':
- // TI <type> # typeinfo structure
- t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "typeinfo for ");
- first = t;
- }
- break;
- case 'S':
- // TS <type> # typeinfo name (null-terminated byte string)
- t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "typeinfo name for ");
- first = t;
- }
- break;
- case 'c':
- // Tc <call-offset> <call-offset> <base encoding>
- {
- const char *t0 = parse_call_offset(first + 2, last);
- if (t0 == first + 2)
- break;
- const char *t1 = parse_call_offset(t0, last);
- if (t1 == t0)
- break;
- t = parse_encoding(t1, last, db);
- if (t != t1) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "covariant return thunk to ");
- first = t;
- }
- }
- break;
- case 'C':
- // extension ::= TC <first type> <number> _ <second type> # construction
- // vtable for second-in-first
- t = parse_type(first + 2, last, db);
- if (t != first + 2) {
- const char *t0 = parse_number(t, last);
- if (t0 != t && t0 != last && *t0 == '_') {
- const char *t1 = parse_type(++t0, last, db);
- if (t1 != t0) {
- if (db.names.size() < 2)
- return first;
- auto left = db.names.back().move_full();
- db.names.pop_back();
- if (db.names.empty())
- return first;
- db.names.back().first = "construction vtable for " +
- std::move(left) + "-in-" +
- db.names.back().move_full();
- first = t1;
- }
- }
- }
- break;
- case 'W':
- // TW <object name> # Thread-local wrapper
- t = parse_name(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "thread-local wrapper routine for ");
- first = t;
- }
- break;
- case 'H':
- // TH <object name> # Thread-local initialization
- t = parse_name(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(
- 0, "thread-local initialization routine for ");
- first = t;
- }
- break;
- default:
- // T <call-offset> <base encoding>
- {
- const char *t0 = parse_call_offset(first + 1, last);
- if (t0 == first + 1)
- break;
- t = parse_encoding(t0, last, db);
- if (t != t0) {
- if (db.names.empty())
- return first;
- if (first[1] == 'v') {
- db.names.back().first.insert(0, "virtual thunk to ");
- first = t;
- } else {
- db.names.back().first.insert(0, "non-virtual thunk to ");
- first = t;
- }
- }
- }
- break;
- }
- break;
- case 'G':
- switch (first[1]) {
- case 'V':
- // GV <object name> # Guard variable for one-time initialization
- t = parse_name(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "guard variable for ");
- first = t;
- }
- break;
- case 'R':
- // extension ::= GR <object name> # reference temporary for object
- t = parse_name(first + 2, last, db);
- if (t != first + 2) {
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "reference temporary for ");
- first = t;
- }
- break;
- }
- break;
+// <encoding> ::= <function name> <bare-function-type>
+// ::= <data name>
+// ::= <special-name>
+Node *Db::parseEncoding() {
+ if (look() == 'G' || look() == 'T')
+ return parseSpecialName();
+
+ auto IsEndOfEncoding = [&] {
+ // The set of chars that can potentially follow an <encoding> (none of which
+ // can start a <type>). Enumerating these allows us to avoid speculative
+ // parsing.
+ return numLeft() == 0 || look() == 'E' || look() == '.' || look() == '_';
+ };
+
+ NameState NameInfo(this);
+ Node *Name = parseName(&NameInfo);
+ if (Name == nullptr)
+ return nullptr;
+
+ if (resolveForwardTemplateRefs(NameInfo))
+ return nullptr;
+
+ if (IsEndOfEncoding())
+ return Name;
+
+ Node *Attrs = nullptr;
+ if (consumeIf("Ua9enable_ifI")) {
+ size_t BeforeArgs = Names.size();
+ while (!consumeIf('E')) {
+ Node *Arg = parseTemplateArg();
+ if (Arg == nullptr)
+ return nullptr;
+ Names.push_back(Arg);
}
+ Attrs = make<EnableIfAttr>(popTrailingNodeArray(BeforeArgs));
+ }
+
+ Node *ReturnType = nullptr;
+ if (!NameInfo.CtorDtorConversion && NameInfo.EndsWithTemplateArgs) {
+ ReturnType = parseType();
+ if (ReturnType == nullptr)
+ return nullptr;
}
- return first;
+
+ if (consumeIf('v'))
+ return make<FunctionEncoding>(ReturnType, Name, NodeArray(),
+ Attrs, NameInfo.CVQualifiers,
+ NameInfo.ReferenceQualifier);
+
+ size_t ParamsBegin = Names.size();
+ do {
+ Node *Ty = parseType();
+ if (Ty == nullptr)
+ return nullptr;
+ Names.push_back(Ty);
+ } while (!IsEndOfEncoding());
+
+ return make<FunctionEncoding>(ReturnType, Name,
+ popTrailingNodeArray(ParamsBegin),
+ Attrs, NameInfo.CVQualifiers,
+ NameInfo.ReferenceQualifier);
}
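// Worked examples for the encoding grammar above (illustrative, not part of
// the patch). Parameter types are parsed until IsEndOfEncoding() is hit, and a
// lone 'v' means an empty parameter list:
//
//   "1fv"    -> f()               (name "f", no parameters)
//   "3funid" -> fun(int, double)  (name "fun", then 'i' and 'd' parameters)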
-namespace {
-template <class T> class save_value {
- T &restore_;
- T original_value_;
+template <class Float>
+struct FloatData;
-public:
- save_value(T &restore) : restore_(restore), original_value_(restore) {}
+template <>
+struct FloatData<float>
+{
+ static const size_t mangled_size = 8;
+ static const size_t max_demangled_size = 24;
+ static constexpr const char* spec = "%af";
+};
- ~save_value() { restore_ = std::move(original_value_); }
+constexpr const char* FloatData<float>::spec;
- save_value(const save_value &) = delete;
- save_value &operator=(const save_value &) = delete;
+template <>
+struct FloatData<double>
+{
+ static const size_t mangled_size = 16;
+ static const size_t max_demangled_size = 32;
+ static constexpr const char* spec = "%a";
};
+
+constexpr const char* FloatData<double>::spec;
+
+template <>
+struct FloatData<long double>
+{
+#if defined(__mips__) && defined(__mips_n64) || defined(__aarch64__) || \
+ defined(__wasm__)
+ static const size_t mangled_size = 32;
+#elif defined(__arm__) || defined(__mips__) || defined(__hexagon__)
+ static const size_t mangled_size = 16;
+#else
+ static const size_t mangled_size = 20; // May need to be adjusted to 16 or 24 on other platforms
+#endif
+ static const size_t max_demangled_size = 40;
+ static constexpr const char *spec = "%LaL";
+};
+
+constexpr const char *FloatData<long double>::spec;
+
+template <class Float> Node *Db::parseFloatingLiteral() {
+ const size_t N = FloatData<Float>::mangled_size;
+ if (numLeft() <= N)
+ return nullptr;
+ StringView Data(First, First + N);
+ for (char C : Data)
+ if (!std::isxdigit(C))
+ return nullptr;
+ First += N;
+ if (!consumeIf('E'))
+ return nullptr;
+ return make<FloatExpr<Float>>(Data);
}
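// Illustrative example (not part of the patch): for Float = float,
// mangled_size is 8, so a literal is exactly 8 hex digits followed by 'E'.
// The digits carry the IEEE-754 bit pattern of the value; e.g. "40490fdbE"
// encodes 0x40490fdb, the bits of the float value ~3.1415927, which is later
// printed using the "%af" format string from FloatData<float>::spec.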
-// <encoding> ::= <function name> <bare-function-type>
-// ::= <data name>
-// ::= <special-name>
+// <seq-id> ::= <0-9A-Z>+
+bool Db::parseSeqId(size_t *Out) {
+ if (!(look() >= '0' && look() <= '9') &&
+ !(look() >= 'A' && look() <= 'Z'))
+ return true;
+
+ size_t Id = 0;
+ while (true) {
+ if (look() >= '0' && look() <= '9') {
+ Id *= 36;
+ Id += static_cast<size_t>(look() - '0');
+ } else if (look() >= 'A' && look() <= 'Z') {
+ Id *= 36;
+ Id += static_cast<size_t>(look() - 'A') + 10;
+ } else {
+ *Out = Id;
+ return false;
+ }
+ ++First;
+ }
+}
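// The <seq-id> above is a base-36 number built from digits and uppercase
// letters (0-9 = 0..9, A-Z = 10..35). Worked examples (illustrative):
//
//   "0"  -> 0            "A"  -> 10
//   "Z"  -> 35           "10" -> 1*36 + 0  = 36
//   "2A" -> 2*36 + 10 = 82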
+
+// <substitution> ::= S <seq-id> _
+// ::= S_
+// <substitution> ::= Sa # ::std::allocator
+// <substitution> ::= Sb # ::std::basic_string
+// <substitution> ::= Ss # ::std::basic_string < char,
+// ::std::char_traits<char>,
+// ::std::allocator<char> >
+// <substitution> ::= Si # ::std::basic_istream<char, std::char_traits<char> >
+// <substitution> ::= So # ::std::basic_ostream<char, std::char_traits<char> >
+// <substitution> ::= Sd # ::std::basic_iostream<char, std::char_traits<char> >
+Node *Db::parseSubstitution() {
+ if (!consumeIf('S'))
+ return nullptr;
-template <class C>
-static const char *parse_encoding(const char *first, const char *last, C &db) {
- if (first != last) {
- save_value<decltype(db.encoding_depth)> su(db.encoding_depth);
- ++db.encoding_depth;
- save_value<decltype(db.tag_templates)> sb(db.tag_templates);
- if (db.encoding_depth > 1)
- db.tag_templates = true;
- save_value<decltype(db.parsed_ctor_dtor_cv)> sp(db.parsed_ctor_dtor_cv);
- db.parsed_ctor_dtor_cv = false;
- switch (*first) {
- case 'G':
- case 'T':
- first = parse_special_name(first, last, db);
+ if (std::islower(look())) {
+ Node *SpecialSub;
+ switch (look()) {
+ case 'a':
+ ++First;
+ SpecialSub = make<SpecialSubstitution>(SpecialSubKind::allocator);
break;
- default: {
- bool ends_with_template_args = false;
- const char *t = parse_name(first, last, db, &ends_with_template_args);
- unsigned cv = db.cv;
- unsigned ref = db.ref;
- if (t != first) {
- if (t != last && *t != 'E' && *t != '.') {
- save_value<bool> sb2(db.tag_templates);
- db.tag_templates = false;
- const char *t2;
- std::string ret2;
- if (db.names.empty())
- return first;
- const std::string &nm = db.names.back().first;
- if (nm.empty())
- return first;
- if (!db.parsed_ctor_dtor_cv && ends_with_template_args) {
- t2 = parse_type(t, last, db);
- if (t2 == t)
- return first;
- if (db.names.size() < 2)
- return first;
- auto ret1 = std::move(db.names.back().first);
- ret2 = std::move(db.names.back().second);
- if (ret2.empty())
- ret1 += ' ';
- db.names.pop_back();
- if (db.names.empty())
- return first;
-
- db.names.back().first.insert(0, ret1);
- t = t2;
- }
- db.names.back().first += '(';
- if (t != last && *t == 'v') {
- ++t;
- } else {
- bool first_arg = true;
- while (true) {
- size_t k0 = db.names.size();
- t2 = parse_type(t, last, db);
- size_t k1 = db.names.size();
- if (t2 == t)
- break;
- if (k1 > k0) {
- std::string tmp;
- for (size_t k = k0; k < k1; ++k) {
- if (!tmp.empty())
- tmp += ", ";
- tmp += db.names[k].move_full();
- }
- for (size_t k = k0; k < k1; ++k) {
- if (db.names.empty())
- return first;
- db.names.pop_back();
- }
- if (!tmp.empty()) {
- if (db.names.empty())
- return first;
- if (!first_arg)
- db.names.back().first += ", ";
- else
- first_arg = false;
- db.names.back().first += tmp;
- }
- }
- t = t2;
- }
- }
- if (db.names.empty())
- return first;
- db.names.back().first += ')';
- if (cv & CV_const)
- db.names.back().first.append(" const");
- if (cv & CV_volatile)
- db.names.back().first.append(" volatile");
- if (cv & CV_restrict)
- db.names.back().first.append(" restrict");
- if (ref == 1)
- db.names.back().first.append(" &");
- else if (ref == 2)
- db.names.back().first.append(" &&");
- db.names.back().first += ret2;
- first = t;
- } else
- first = t;
- }
+ case 'b':
+ ++First;
+ SpecialSub = make<SpecialSubstitution>(SpecialSubKind::basic_string);
break;
+ case 's':
+ ++First;
+ SpecialSub = make<SpecialSubstitution>(SpecialSubKind::string);
+ break;
+ case 'i':
+ ++First;
+ SpecialSub = make<SpecialSubstitution>(SpecialSubKind::istream);
+ break;
+ case 'o':
+ ++First;
+ SpecialSub = make<SpecialSubstitution>(SpecialSubKind::ostream);
+ break;
+ case 'd':
+ ++First;
+ SpecialSub = make<SpecialSubstitution>(SpecialSubKind::iostream);
+ break;
+ default:
+ return nullptr;
}
+ // Itanium C++ ABI 5.1.2: If a name that would use a built-in <substitution>
+ // has ABI tags, the tags are appended to the substitution; the result is a
+ // substitutable component.
+ Node *WithTags = parseAbiTags(SpecialSub);
+ if (WithTags != SpecialSub) {
+ Subs.push_back(WithTags);
+ SpecialSub = WithTags;
}
+ return SpecialSub;
}
- return first;
+
+ // ::= S_
+ if (consumeIf('_')) {
+ if (Subs.empty())
+ return nullptr;
+ return Subs[0];
+ }
+
+ // ::= S <seq-id> _
+ size_t Index = 0;
+ if (parseSeqId(&Index))
+ return nullptr;
+ ++Index;
+ if (!consumeIf('_') || Index >= Subs.size())
+ return nullptr;
+ return Subs[Index];
}
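// Note the off-by-one in the numbered form above (illustrative summary, not
// part of the patch): "S_" refers to Subs[0], while "S<seq-id>_" refers to
// Subs[<seq-id> + 1], so "S0_" is Subs[1] and "SA_" is Subs[11].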
-// _block_invoke
-// _block_invoke<decimal-digit>+
-// _block_invoke_<decimal-digit>+
-
-template <class C>
-static const char *parse_block_invoke(const char *first, const char *last,
- C &db) {
- if (last - first >= 13) {
- const char test[] = "_block_invoke";
- const char *t = first;
- for (int i = 0; i < 13; ++i, ++t) {
- if (*t != test[i])
- return first;
- }
- if (t != last) {
- if (*t == '_') {
- // must have at least 1 decimal digit
- if (++t == last || !std::isdigit(*t))
- return first;
- ++t;
- }
- // parse zero or more digits
- while (t != last && isdigit(*t))
- ++t;
+// <template-param> ::= T_ # first template parameter
+// ::= T <parameter-2 non-negative number> _
+Node *Db::parseTemplateParam() {
+ if (!consumeIf('T'))
+ return nullptr;
+
+ size_t Index = 0;
+ if (!consumeIf('_')) {
+ if (parsePositiveInteger(&Index))
+ return nullptr;
+ ++Index;
+ if (!consumeIf('_'))
+ return nullptr;
+ }
+
+ // Itanium ABI 5.1.8: In a generic lambda, uses of auto in the parameter list
+ // are mangled as the corresponding artificial template type parameter.
+ if (ParsingLambdaParams)
+ return make<NameType>("auto");
+
+ // If we're in a context where this <template-param> refers to a
+ // <template-arg> further ahead in the mangled name (currently just conversion
+ // operator types), then we should only look it up in the right context.
+ if (PermitForwardTemplateReferences) {
+ ForwardTemplateRefs.push_back(make<ForwardTemplateReference>(Index));
+ return ForwardTemplateRefs.back();
+ }
+
+ if (Index >= TemplateParams.size())
+ return nullptr;
+ return TemplateParams[Index];
+}
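// The same off-by-one convention applies to template parameters
// (illustrative): "T_" is TemplateParams[0], "T0_" is TemplateParams[1],
// "T1_" is TemplateParams[2], and so on.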
+
+// <template-arg> ::= <type> # type or template
+// ::= X <expression> E # expression
+// ::= <expr-primary> # simple expressions
+// ::= J <template-arg>* E # argument pack
+// ::= LZ <encoding> E # extension
+Node *Db::parseTemplateArg() {
+ switch (look()) {
+ case 'X': {
+ ++First;
+ Node *Arg = parseExpr();
+ if (Arg == nullptr || !consumeIf('E'))
+ return nullptr;
+ return Arg;
+ }
+ case 'J': {
+ ++First;
+ size_t ArgsBegin = Names.size();
+ while (!consumeIf('E')) {
+ Node *Arg = parseTemplateArg();
+ if (Arg == nullptr)
+ return nullptr;
+ Names.push_back(Arg);
}
- if (db.names.empty())
- return first;
- db.names.back().first.insert(0, "invocation function for block in ");
- first = t;
+ NodeArray Args = popTrailingNodeArray(ArgsBegin);
+ return make<TemplateArgumentPack>(Args);
+ }
+ case 'L': {
+ // ::= LZ <encoding> E # extension
+ if (look(1) == 'Z') {
+ First += 2;
+ Node *Arg = parseEncoding();
+ if (Arg == nullptr || !consumeIf('E'))
+ return nullptr;
+ return Arg;
+ }
+ // ::= <expr-primary> # simple expressions
+ return parseExprPrimary();
+ }
+ default:
+ return parseType();
}
- return first;
}
-// extension
-// <dot-suffix> := .<anything and everything>
+// <template-args> ::= I <template-arg>* E
+// extension: the ABI says <template-arg>+
+Node *Db::parseTemplateArgs(bool TagTemplates) {
+ if (!consumeIf('I'))
+ return nullptr;
-template <class C>
-static const char *parse_dot_suffix(const char *first, const char *last,
- C &db) {
- if (first != last && *first == '.') {
- if (db.names.empty())
- return first;
- db.names.back().first += " (" + std::string(first, last) + ")";
- first = last;
+ // <template-params> refer to the innermost <template-args>. Clear out any
+ // outer args that we may have inserted into TemplateParams.
+ if (TagTemplates)
+ TemplateParams.clear();
+
+ size_t ArgsBegin = Names.size();
+ while (!consumeIf('E')) {
+ if (TagTemplates) {
+ auto OldParams = std::move(TemplateParams);
+ Node *Arg = parseTemplateArg();
+ TemplateParams = std::move(OldParams);
+ if (Arg == nullptr)
+ return nullptr;
+ Names.push_back(Arg);
+ Node *TableEntry = Arg;
+ if (Arg->getKind() == Node::KTemplateArgumentPack) {
+ TableEntry = make<ParameterPack>(
+ static_cast<TemplateArgumentPack*>(TableEntry)->getElements());
+ }
+ TemplateParams.push_back(TableEntry);
+ } else {
+ Node *Arg = parseTemplateArg();
+ if (Arg == nullptr)
+ return nullptr;
+ Names.push_back(Arg);
+ }
}
- return first;
+ return make<TemplateArgs>(popTrailingNodeArray(ArgsBegin));
+}
+
+// <discriminator> := _ <non-negative number> # when number < 10
+// := __ <non-negative number> _ # when number >= 10
+// extension := decimal-digit+ # at the end of string
+
+const char*
+parse_discriminator(const char* first, const char* last)
+{
+ // parse but ignore discriminator
+ if (first != last)
+ {
+ if (*first == '_')
+ {
+ const char* t1 = first+1;
+ if (t1 != last)
+ {
+ if (std::isdigit(*t1))
+ first = t1+1;
+ else if (*t1 == '_')
+ {
+ for (++t1; t1 != last && std::isdigit(*t1); ++t1)
+ ;
+ if (t1 != last && *t1 == '_')
+ first = t1 + 1;
+ }
+ }
+ }
+ else if (std::isdigit(*first))
+ {
+ const char* t1 = first+1;
+ for (; t1 != last && std::isdigit(*t1); ++t1)
+ ;
+ if (t1 == last)
+ first = last;
+ }
+ }
+ return first;
}
-// <block-involcaton-function> ___Z<encoding>_block_invoke
-// <block-involcaton-function> ___Z<encoding>_block_invoke<decimal-digit>+
-// <block-involcaton-function> ___Z<encoding>_block_invoke_<decimal-digit>+
-// <mangled-name> ::= _Z<encoding>
+// <mangled-name> ::= _Z <encoding>
// ::= <type>
+// extension ::= ___Z <encoding> _block_invoke
+// extension ::= ___Z <encoding> _block_invoke<decimal-digit>+
+// extension ::= ___Z <encoding> _block_invoke_<decimal-digit>+
+Node *Db::parse() {
+ if (consumeIf("_Z")) {
+ Node *Encoding = parseEncoding();
+ if (Encoding == nullptr)
+ return nullptr;
+ if (look() == '.') {
+ Encoding = make<DotSuffix>(Encoding, StringView(First, Last));
+ First = Last;
+ }
+ if (numLeft() != 0)
+ return nullptr;
+ return Encoding;
+ }
-template <class C>
-static void demangle(const char *first, const char *last, C &db, int &status) {
- if (first >= last) {
- status = invalid_mangled_name;
- return;
- }
- if (*first == '_') {
- if (last - first >= 4) {
- if (first[1] == 'Z') {
- const char *t = parse_encoding(first + 2, last, db);
- if (t != first + 2 && t != last && *t == '.')
- t = parse_dot_suffix(t, last, db);
- if (t != last)
- status = invalid_mangled_name;
- } else if (first[1] == '_' && first[2] == '_' && first[3] == 'Z') {
- const char *t = parse_encoding(first + 4, last, db);
- if (t != first + 4 && t != last) {
- const char *t1 = parse_block_invoke(t, last, db);
- if (t1 != last)
- status = invalid_mangled_name;
- } else
- status = invalid_mangled_name;
- } else
- status = invalid_mangled_name;
- } else
- status = invalid_mangled_name;
- } else {
- const char *t = parse_type(first, last, db);
- if (t != last)
- status = invalid_mangled_name;
- }
- if (status == success && db.names.empty())
- status = invalid_mangled_name;
+ if (consumeIf("___Z")) {
+ Node *Encoding = parseEncoding();
+ if (Encoding == nullptr || !consumeIf("_block_invoke"))
+ return nullptr;
+ bool RequireNumber = consumeIf('_');
+ if (parseNumber().empty() && RequireNumber)
+ return nullptr;
+ if (numLeft() != 0)
+ return nullptr;
+ return make<SpecialName>("invocation function for block in ", Encoding);
+ }
+
+ Node *Ty = parseType();
+ if (numLeft() != 0)
+ return nullptr;
+ return Ty;
}
-namespace {
-template <class StrT> struct string_pair {
- StrT first;
- StrT second;
-
- string_pair() = default;
- string_pair(StrT f) : first(std::move(f)) {}
- string_pair(StrT f, StrT s) : first(std::move(f)), second(std::move(s)) {}
- template <size_t N> string_pair(const char (&s)[N]) : first(s, N - 1) {}
-
- size_t size() const { return first.size() + second.size(); }
- bool empty() const { return first.empty() && second.empty(); }
- StrT full() const { return first + second; }
- StrT move_full() { return std::move(first) + std::move(second); }
-};
+bool initializeOutputStream(char *Buf, size_t *N, OutputStream &S,
+ size_t InitSize) {
+ size_t BufferSize;
+ if (Buf == nullptr) {
+ Buf = static_cast<char *>(std::malloc(InitSize));
+ if (Buf == nullptr)
+ return true;
+ BufferSize = InitSize;
+ } else
+ BufferSize = *N;
-struct Db {
- typedef std::vector<string_pair<std::string>> sub_type;
- typedef std::vector<sub_type> template_param_type;
- sub_type names;
- template_param_type subs;
- std::vector<template_param_type> template_param;
- unsigned cv = 0;
- unsigned ref = 0;
- unsigned encoding_depth = 0;
- bool parsed_ctor_dtor_cv = false;
- bool tag_templates = true;
- bool fix_forward_references = false;
- bool try_to_parse_template_args = true;
-
- Db() : subs(0, names), template_param(0, subs) {}
-};
+ S.reset(Buf, BufferSize);
+ return false;
}
-char *llvm::itaniumDemangle(const char *mangled_name, char *buf, size_t *n,
- int *status) {
- if (mangled_name == nullptr || (buf != nullptr && n == nullptr)) {
- if (status)
- *status = invalid_args;
+} // unnamed namespace
+
+char *llvm::itaniumDemangle(const char *MangledName, char *Buf,
+ size_t *N, int *Status) {
+ if (MangledName == nullptr || (Buf != nullptr && N == nullptr)) {
+ if (Status)
+ *Status = demangle_invalid_args;
return nullptr;
}
- size_t internal_size = buf != nullptr ? *n : 0;
- Db db;
- db.template_param.emplace_back();
- int internal_status = success;
- size_t len = std::strlen(mangled_name);
- demangle(mangled_name, mangled_name + len, db, internal_status);
- if (internal_status == success && db.fix_forward_references &&
- !db.template_param.empty() && !db.template_param.front().empty()) {
- db.fix_forward_references = false;
- db.tag_templates = false;
- db.names.clear();
- db.subs.clear();
- demangle(mangled_name, mangled_name + len, db, internal_status);
- if (db.fix_forward_references)
- internal_status = invalid_mangled_name;
- }
- if (internal_status == success) {
- size_t sz = db.names.back().size() + 1;
- if (sz > internal_size) {
- char *newbuf = static_cast<char *>(std::realloc(buf, sz));
- if (newbuf == nullptr) {
- internal_status = memory_alloc_failure;
- buf = nullptr;
- } else {
- buf = newbuf;
- if (n != nullptr)
- *n = sz;
- }
+
+ int InternalStatus = demangle_success;
+ Db Parser(MangledName, MangledName + std::strlen(MangledName));
+ OutputStream S;
+
+ Node *AST = Parser.parse();
+
+ if (AST == nullptr)
+ InternalStatus = demangle_invalid_mangled_name;
+ else if (initializeOutputStream(Buf, N, S, 1024))
+ InternalStatus = demangle_memory_alloc_failure;
+ else {
+ assert(Parser.ForwardTemplateRefs.empty());
+ AST->print(S);
+ S += '\0';
+ if (N != nullptr)
+ *N = S.getCurrentPosition();
+ Buf = S.getBuffer();
+ }
+
+ if (Status)
+ *Status = InternalStatus;
+ return InternalStatus == demangle_success ? Buf : nullptr;
+}
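// Usage sketch for the C-style entry point above (illustrative only; the
// mangled name is just an example input, and the caller frees the result,
// since passing Buf == nullptr makes the demangler malloc() the buffer):
//
//   #include "llvm/Demangle/Demangle.h"
//   #include <cstdio>
//   #include <cstdlib>
//
//   int main() {
//     int Status = 0;
//     char *Out = llvm::itaniumDemangle("_Z3fooi", nullptr, nullptr, &Status);
//     if (Status == 0 && Out)        // 0 is demangle_success
//       std::printf("%s\n", Out);    // prints "foo(int)"
//     std::free(Out);
//   }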
+
+namespace llvm {
+
+ItaniumPartialDemangler::ItaniumPartialDemangler()
+ : RootNode(nullptr), Context(new Db{nullptr, nullptr}) {}
+
+ItaniumPartialDemangler::~ItaniumPartialDemangler() {
+ delete static_cast<Db *>(Context);
+}
+
+ItaniumPartialDemangler::ItaniumPartialDemangler(
+ ItaniumPartialDemangler &&Other)
+ : RootNode(Other.RootNode), Context(Other.Context) {
+ Other.Context = Other.RootNode = nullptr;
+}
+
+ItaniumPartialDemangler &ItaniumPartialDemangler::
+operator=(ItaniumPartialDemangler &&Other) {
+ std::swap(RootNode, Other.RootNode);
+ std::swap(Context, Other.Context);
+ return *this;
+}
+
+// Demangle MangledName into an AST, storing it into this->RootNode.
+bool ItaniumPartialDemangler::partialDemangle(const char *MangledName) {
+ Db *Parser = static_cast<Db *>(Context);
+ size_t Len = std::strlen(MangledName);
+ Parser->reset(MangledName, MangledName + Len);
+ RootNode = Parser->parse();
+ return RootNode == nullptr;
+}
+
+static char *printNode(Node *RootNode, char *Buf, size_t *N) {
+ OutputStream S;
+ if (initializeOutputStream(Buf, N, S, 128))
+ return nullptr;
+ RootNode->print(S);
+ S += '\0';
+ if (N != nullptr)
+ *N = S.getCurrentPosition();
+ return S.getBuffer();
+}
+
+char *ItaniumPartialDemangler::getFunctionBaseName(char *Buf, size_t *N) const {
+ if (!isFunction())
+ return nullptr;
+
+ Node *Name = static_cast<FunctionEncoding *>(RootNode)->getName();
+
+ while (true) {
+ switch (Name->getKind()) {
+ case Node::KAbiTagAttr:
+ Name = static_cast<AbiTagAttr *>(Name)->Base;
+ continue;
+ case Node::KStdQualifiedName:
+ Name = static_cast<StdQualifiedName *>(Name)->Child;
+ continue;
+ case Node::KNestedName:
+ Name = static_cast<NestedName *>(Name)->Name;
+ continue;
+ case Node::KLocalName:
+ Name = static_cast<LocalName *>(Name)->Entity;
+ continue;
+ case Node::KNameWithTemplateArgs:
+ Name = static_cast<NameWithTemplateArgs *>(Name)->Name;
+ continue;
+ default:
+ return printNode(Name, Buf, N);
}
- if (buf != nullptr) {
- db.names.back().first += db.names.back().second;
- std::memcpy(buf, db.names.back().first.data(), sz - 1);
- buf[sz - 1] = char(0);
+ }
+}
+
+char *ItaniumPartialDemangler::getFunctionDeclContextName(char *Buf,
+ size_t *N) const {
+ if (!isFunction())
+ return nullptr;
+ Node *Name = static_cast<FunctionEncoding *>(RootNode)->getName();
+
+ OutputStream S;
+ if (initializeOutputStream(Buf, N, S, 128))
+ return nullptr;
+
+ KeepGoingLocalFunction:
+ while (true) {
+ if (Name->getKind() == Node::KAbiTagAttr) {
+ Name = static_cast<AbiTagAttr *>(Name)->Base;
+ continue;
}
- } else
- buf = nullptr;
- if (status)
- *status = internal_status;
- return buf;
+ if (Name->getKind() == Node::KNameWithTemplateArgs) {
+ Name = static_cast<NameWithTemplateArgs *>(Name)->Name;
+ continue;
+ }
+ break;
+ }
+
+ switch (Name->getKind()) {
+ case Node::KStdQualifiedName:
+ S += "std";
+ break;
+ case Node::KNestedName:
+ static_cast<NestedName *>(Name)->Qual->print(S);
+ break;
+ case Node::KLocalName: {
+ auto *LN = static_cast<LocalName *>(Name);
+ LN->Encoding->print(S);
+ S += "::";
+ Name = LN->Entity;
+ goto KeepGoingLocalFunction;
+ }
+ default:
+ break;
+ }
+ S += '\0';
+ if (N != nullptr)
+ *N = S.getCurrentPosition();
+ return S.getBuffer();
+}
+
+char *ItaniumPartialDemangler::getFunctionName(char *Buf, size_t *N) const {
+ if (!isFunction())
+ return nullptr;
+ auto *Name = static_cast<FunctionEncoding *>(RootNode)->getName();
+ return printNode(Name, Buf, N);
+}
+
+char *ItaniumPartialDemangler::getFunctionParameters(char *Buf,
+ size_t *N) const {
+ if (!isFunction())
+ return nullptr;
+ NodeArray Params = static_cast<FunctionEncoding *>(RootNode)->getParams();
+
+ OutputStream S;
+ if (initializeOutputStream(Buf, N, S, 128))
+ return nullptr;
+
+ S += '(';
+ Params.printWithComma(S);
+ S += ')';
+ S += '\0';
+ if (N != nullptr)
+ *N = S.getCurrentPosition();
+ return S.getBuffer();
+}
+
+char *ItaniumPartialDemangler::getFunctionReturnType(
+ char *Buf, size_t *N) const {
+ if (!isFunction())
+ return nullptr;
+
+ OutputStream S;
+ if (initializeOutputStream(Buf, N, S, 128))
+ return nullptr;
+
+ if (Node *Ret = static_cast<FunctionEncoding *>(RootNode)->getReturnType())
+ Ret->print(S);
+
+ S += '\0';
+ if (N != nullptr)
+ *N = S.getCurrentPosition();
+ return S.getBuffer();
+}
+
+char *ItaniumPartialDemangler::finishDemangle(char *Buf, size_t *N) const {
+ assert(RootNode != nullptr && "must call partialDemangle()");
+ return printNode(static_cast<Node *>(RootNode), Buf, N);
+}
+
+bool ItaniumPartialDemangler::hasFunctionQualifiers() const {
+ assert(RootNode != nullptr && "must call partialDemangle()");
+ if (!isFunction())
+ return false;
+ auto *E = static_cast<FunctionEncoding *>(RootNode);
+ return E->getCVQuals() != QualNone || E->getRefQual() != FrefQualNone;
+}
+
+bool ItaniumPartialDemangler::isCtorOrDtor() const {
+ Node *N = static_cast<Node *>(RootNode);
+ while (N) {
+ switch (N->getKind()) {
+ default:
+ return false;
+ case Node::KCtorDtorName:
+ return true;
+
+ case Node::KAbiTagAttr:
+ N = static_cast<AbiTagAttr *>(N)->Base;
+ break;
+ case Node::KFunctionEncoding:
+ N = static_cast<FunctionEncoding *>(N)->getName();
+ break;
+ case Node::KLocalName:
+ N = static_cast<LocalName *>(N)->Entity;
+ break;
+ case Node::KNameWithTemplateArgs:
+ N = static_cast<NameWithTemplateArgs *>(N)->Name;
+ break;
+ case Node::KNestedName:
+ N = static_cast<NestedName *>(N)->Name;
+ break;
+ case Node::KStdQualifiedName:
+ N = static_cast<StdQualifiedName *>(N)->Child;
+ break;
+ }
+ }
+ return false;
+}
+
+bool ItaniumPartialDemangler::isFunction() const {
+ assert(RootNode != nullptr && "must call partialDemangle()");
+ return static_cast<Node *>(RootNode)->getKind() == Node::KFunctionEncoding;
+}
+
+bool ItaniumPartialDemangler::isSpecialName() const {
+ assert(RootNode != nullptr && "must call partialDemangle()");
+ auto K = static_cast<Node *>(RootNode)->getKind();
+ return K == Node::KSpecialName || K == Node::KCtorVtableSpecialName;
+}
+
+bool ItaniumPartialDemangler::isData() const {
+ return !isFunction() && !isSpecialName();
+}
+
}
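// Usage sketch for ItaniumPartialDemangler (illustrative only; note that
// partialDemangle() returns true on failure, and passing Buf == nullptr asks
// the print helpers above to allocate the result with malloc()):
//
//   #include "llvm/Demangle/Demangle.h"
//   #include <cstdio>
//   #include <cstdlib>
//
//   int main() {
//     llvm::ItaniumPartialDemangler D;
//     if (D.partialDemangle("_ZN2ns3fooEi"))
//       return 1;                                                // bad input
//     if (D.isFunction()) {
//       char *Base = D.getFunctionBaseName(nullptr, nullptr);        // "foo"
//       char *Ctx = D.getFunctionDeclContextName(nullptr, nullptr);  // "ns"
//       char *Params = D.getFunctionParameters(nullptr, nullptr);    // "(int)"
//       std::printf("%s::%s%s\n", Ctx, Base, Params);
//       std::free(Base); std::free(Ctx); std::free(Params);
//     }
//   }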
diff --git a/contrib/llvm/lib/Demangle/MicrosoftDemangle.cpp b/contrib/llvm/lib/Demangle/MicrosoftDemangle.cpp
new file mode 100644
index 000000000000..596359b7d990
--- /dev/null
+++ b/contrib/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -0,0 +1,1684 @@
+//===- MicrosoftDemangle.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a demangler for MSVC-style mangled symbols.
+//
+// This file has no dependencies on the rest of LLVM so that it can be
+// easily reused in other programs such as libcxxabi.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Demangle/Demangle.h"
+
+#include "Compiler.h"
+#include "StringView.h"
+#include "Utility.h"
+
+#include <cctype>
+#include <tuple>
+
+// This memory allocator is extremely fast, but it doesn't call dtors
+// for allocated objects. That means you can't use STL containers
+// (such as std::vector) with this allocator. But it pays off --
+// the demangler is 3x faster with this allocator compared to one with
+// STL containers.
+namespace {
+class ArenaAllocator {
+ struct AllocatorNode {
+ uint8_t *Buf = nullptr;
+ size_t Used = 0;
+ AllocatorNode *Next = nullptr;
+ };
+
+public:
+ ArenaAllocator() : Head(new AllocatorNode) { Head->Buf = new uint8_t[Unit]; }
+
+ ~ArenaAllocator() {
+ while (Head) {
+ assert(Head->Buf);
+ delete[] Head->Buf;
+ AllocatorNode *Next = Head->Next;
+ delete Head;
+ Head = Next;
+ }
+ }
+
+ template <typename T, typename... Args> T *alloc(Args &&... ConstructorArgs) {
+
+ size_t Size = sizeof(T);
+ assert(Size < Unit);
+ assert(Head && Head->Buf);
+
+ size_t P = (size_t)Head->Buf + Head->Used;
+ uintptr_t AlignedP =
+ (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
+ uint8_t *PP = (uint8_t *)AlignedP;
+ size_t Adjustment = AlignedP - P;
+
+ Head->Used += Size + Adjustment;
+ if (Head->Used < Unit)
+ return new (PP) T(std::forward<Args>(ConstructorArgs)...);
+
+ AllocatorNode *NewHead = new AllocatorNode;
+ NewHead->Buf = new uint8_t[ArenaAllocator::Unit];
+ NewHead->Next = Head;
+ Head = NewHead;
+ NewHead->Used = Size;
+ return new (NewHead->Buf) T(std::forward<Args>(ConstructorArgs)...);
+ }
+
+private:
+ static constexpr size_t Unit = 4096;
+
+ AllocatorNode *Head = nullptr;
+};
+} // namespace
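// Sketch of how the arena above is meant to be used (illustrative): nodes are
// placement-new'ed into 4K slabs and never destructed, so only types with
// trivial destructors should be allocated from it.
//
//   ArenaAllocator Arena;
//   Type *T = Arena.alloc<Type>();   // default-constructed inside a slab
//   Type *Copy = T->clone(Arena);    // clone() allocates from the same arena
//   // No delete or destructor calls: memory is released when Arena dies.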
+
+static bool startsWithDigit(StringView S) {
+ return !S.empty() && std::isdigit(S.front());
+}
+
+// Writes a space if the last token does not end with a punctuation character.
+static void outputSpaceIfNecessary(OutputStream &OS) {
+ if (OS.empty())
+ return;
+
+ char C = OS.back();
+ if (isalnum(C) || C == '>')
+ OS << " ";
+}
+
+// Storage classes
+enum Qualifiers : uint8_t {
+ Q_None = 0,
+ Q_Const = 1 << 0,
+ Q_Volatile = 1 << 1,
+ Q_Far = 1 << 2,
+ Q_Huge = 1 << 3,
+ Q_Unaligned = 1 << 4,
+ Q_Restrict = 1 << 5,
+ Q_Pointer64 = 1 << 6
+};
+
+enum class StorageClass : uint8_t {
+ None,
+ PrivateStatic,
+ ProtectedStatic,
+ PublicStatic,
+ Global,
+ FunctionLocalStatic
+};
+
+enum class QualifierMangleMode { Drop, Mangle, Result };
+
+enum class PointerAffinity { Pointer, Reference };
+
+// Calling conventions
+enum class CallingConv : uint8_t {
+ None,
+ Cdecl,
+ Pascal,
+ Thiscall,
+ Stdcall,
+ Fastcall,
+ Clrcall,
+ Eabi,
+ Vectorcall,
+ Regcall,
+};
+
+enum class ReferenceKind : uint8_t { None, LValueRef, RValueRef };
+
+// Types
+enum class PrimTy : uint8_t {
+ Unknown,
+ None,
+ Function,
+ Ptr,
+ Ref,
+ MemberPtr,
+ Array,
+
+ Struct,
+ Union,
+ Class,
+ Enum,
+
+ Void,
+ Bool,
+ Char,
+ Schar,
+ Uchar,
+ Short,
+ Ushort,
+ Int,
+ Uint,
+ Long,
+ Ulong,
+ Int64,
+ Uint64,
+ Wchar,
+ Float,
+ Double,
+ Ldouble,
+};
+
+// Function classes
+enum FuncClass : uint8_t {
+ Public = 1 << 0,
+ Protected = 1 << 1,
+ Private = 1 << 2,
+ Global = 1 << 3,
+ Static = 1 << 4,
+ Virtual = 1 << 5,
+ Far = 1 << 6,
+};
+
+namespace {
+
+struct Type;
+
+// Represents a list of parameters (template params or function arguments).
+// It's represented as a linked list.
+struct ParamList {
+ bool IsVariadic = false;
+
+ Type *Current = nullptr;
+
+ ParamList *Next = nullptr;
+};
+
+// The type class. Mangled symbols are first parsed and converted to
+// this type and then converted to string.
+struct Type {
+ virtual ~Type() {}
+
+ virtual Type *clone(ArenaAllocator &Arena) const;
+
+  // Write the "first half" of a given type. This is a static function to
+  // give the code a chance to do processing that is common to a subset of
+  // subclasses.
+ static void outputPre(OutputStream &OS, Type &Ty);
+
+  // Write the "second half" of a given type. This is a static function to
+  // give the code a chance to do processing that is common to a subset of
+  // subclasses.
+ static void outputPost(OutputStream &OS, Type &Ty);
+
+ virtual void outputPre(OutputStream &OS);
+ virtual void outputPost(OutputStream &OS);
+
+ // Primitive type such as Int.
+ PrimTy Prim = PrimTy::Unknown;
+
+ Qualifiers Quals = Q_None;
+ StorageClass Storage = StorageClass::None; // storage class
+};
+
+// Represents an identifier which may be a template.
+struct Name {
+  // Name read from a MangledName string.
+ StringView Str;
+
+  // Overloaded operators are represented as special names in mangled
+  // symbols. If this is an operator name, "Operator" holds the operator
+  // string (e.g. ">>"). Otherwise, it is empty.
+ StringView Operator;
+
+ // Template parameters. Null if not a template.
+ ParamList TemplateParams;
+
+  // Nested names (e.g. "A::B::C") are represented as a linked list.
+ Name *Next = nullptr;
+};
+
+struct PointerType : public Type {
+ Type *clone(ArenaAllocator &Arena) const override;
+ void outputPre(OutputStream &OS) override;
+ void outputPost(OutputStream &OS) override;
+
+ // Represents a type X in "a pointer to X", "a reference to X",
+ // "an array of X", or "a function returning X".
+ Type *Pointee = nullptr;
+};
+
+struct MemberPointerType : public Type {
+ Type *clone(ArenaAllocator &Arena) const override;
+ void outputPre(OutputStream &OS) override;
+ void outputPost(OutputStream &OS) override;
+
+ Name *MemberName = nullptr;
+
+ // Represents a type X in "a pointer to X", "a reference to X",
+ // "an array of X", or "a function returning X".
+ Type *Pointee = nullptr;
+};
+
+struct FunctionType : public Type {
+ Type *clone(ArenaAllocator &Arena) const override;
+ void outputPre(OutputStream &OS) override;
+ void outputPost(OutputStream &OS) override;
+
+ // True if this FunctionType instance is the Pointee of a PointerType or
+ // MemberPointerType.
+ bool IsFunctionPointer = false;
+
+ Type *ReturnType = nullptr;
+ // If this is a reference, the type of reference.
+ ReferenceKind RefKind;
+
+ CallingConv CallConvention;
+ FuncClass FunctionClass;
+
+ ParamList Params;
+};
+
+struct UdtType : public Type {
+ Type *clone(ArenaAllocator &Arena) const override;
+ void outputPre(OutputStream &OS) override;
+
+ Name *UdtName = nullptr;
+};
+
+struct ArrayType : public Type {
+ Type *clone(ArenaAllocator &Arena) const override;
+ void outputPre(OutputStream &OS) override;
+ void outputPost(OutputStream &OS) override;
+
+ // Either NextDimension or ElementType will be valid.
+ ArrayType *NextDimension = nullptr;
+ uint32_t ArrayDimension = 0;
+
+ Type *ElementType = nullptr;
+};
+
+} // namespace
+
+static bool isMemberPointer(StringView MangledName) {
+ switch (MangledName.popFront()) {
+ case 'A':
+ // 'A' indicates a reference, and you cannot have a reference to a member
+ // function or member variable.
+ return false;
+ case 'P':
+ case 'Q':
+ case 'R':
+ case 'S':
+ // These 4 values indicate some kind of pointer, but we still don't know
+ // what.
+ break;
+ default:
+ assert(false && "Ty is not a pointer type!");
+ }
+
+ // If it starts with a number, then 6 indicates a non-member function
+ // pointer, and 8 indicates a member function pointer.
+ if (startsWithDigit(MangledName)) {
+ assert(MangledName[0] == '6' || MangledName[0] == '8');
+ return (MangledName[0] == '8');
+ }
+
+ // Remove ext qualifiers since those can appear on either type and are
+ // therefore not indicative.
+ MangledName.consumeFront('E'); // 64-bit
+ MangledName.consumeFront('I'); // restrict
+ MangledName.consumeFront('F'); // unaligned
+
+ assert(!MangledName.empty());
+
+ // The next value should be either ABCD (non-member) or QRST (member).
+ switch (MangledName.front()) {
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ return false;
+ case 'Q':
+ case 'R':
+ case 'S':
+ case 'T':
+ return true;
+ default:
+ assert(false);
+ }
+ return false;
+}
+
+static void outputCallingConvention(OutputStream &OS, CallingConv CC) {
+ outputSpaceIfNecessary(OS);
+
+ switch (CC) {
+ case CallingConv::Cdecl:
+ OS << "__cdecl";
+ break;
+ case CallingConv::Fastcall:
+ OS << "__fastcall";
+ break;
+ case CallingConv::Pascal:
+ OS << "__pascal";
+ break;
+ case CallingConv::Regcall:
+ OS << "__regcall";
+ break;
+ case CallingConv::Stdcall:
+ OS << "__stdcall";
+ break;
+ case CallingConv::Thiscall:
+ OS << "__thiscall";
+ break;
+ case CallingConv::Eabi:
+ OS << "__eabi";
+ break;
+ case CallingConv::Vectorcall:
+ OS << "__vectorcall";
+ break;
+ case CallingConv::Clrcall:
+ OS << "__clrcall";
+ break;
+ default:
+ break;
+ }
+}
+
+// Write a function or template parameter list.
+static void outputParameterList(OutputStream &OS, const ParamList &Params) {
+ if (!Params.Current) {
+ OS << "void";
+ return;
+ }
+
+ const ParamList *Head = &Params;
+ while (Head) {
+ Type::outputPre(OS, *Head->Current);
+ Type::outputPost(OS, *Head->Current);
+
+ Head = Head->Next;
+
+ if (Head)
+ OS << ", ";
+ }
+}
+
+static void outputTemplateParams(OutputStream &OS, const Name &TheName) {
+ if (!TheName.TemplateParams.Current)
+ return;
+
+ OS << "<";
+ outputParameterList(OS, TheName.TemplateParams);
+ OS << ">";
+}
+
+static void outputName(OutputStream &OS, const Name *TheName) {
+ if (!TheName)
+ return;
+
+ outputSpaceIfNecessary(OS);
+
+  // Print out namespaces or outer class names.
+ for (; TheName->Next; TheName = TheName->Next) {
+ OS << TheName->Str;
+ outputTemplateParams(OS, *TheName);
+ OS << "::";
+ }
+
+ // Print out a regular name.
+ if (TheName->Operator.empty()) {
+ OS << TheName->Str;
+ outputTemplateParams(OS, *TheName);
+ return;
+ }
+
+ // Print out ctor or dtor.
+ if (TheName->Operator == "ctor" || TheName->Operator == "dtor") {
+ OS << TheName->Str;
+ outputTemplateParams(OS, *TheName);
+ OS << "::";
+ if (TheName->Operator == "dtor")
+ OS << "~";
+ OS << TheName->Str;
+ outputTemplateParams(OS, *TheName);
+ return;
+ }
+
+ // Print out an overloaded operator.
+ if (!TheName->Str.empty())
+ OS << TheName->Str << "::";
+ OS << "operator" << TheName->Operator;
+}
+
+namespace {
+
+Type *Type::clone(ArenaAllocator &Arena) const {
+ return Arena.alloc<Type>(*this);
+}
+
+// Write the "first half" of a given type.
+void Type::outputPre(OutputStream &OS, Type &Ty) {
+ // Function types require custom handling of const and static so we
+ // handle them separately. All other types use the same decoration
+ // for these modifiers, so handle them here in common code.
+ if (Ty.Prim == PrimTy::Function) {
+ Ty.outputPre(OS);
+ return;
+ }
+
+ switch (Ty.Storage) {
+ case StorageClass::PrivateStatic:
+ case StorageClass::PublicStatic:
+ case StorageClass::ProtectedStatic:
+ OS << "static ";
+ default:
+ break;
+ }
+ Ty.outputPre(OS);
+
+ if (Ty.Quals & Q_Const) {
+ outputSpaceIfNecessary(OS);
+ OS << "const";
+ }
+
+ if (Ty.Quals & Q_Volatile) {
+ outputSpaceIfNecessary(OS);
+ OS << "volatile";
+ }
+
+ if (Ty.Quals & Q_Restrict) {
+ outputSpaceIfNecessary(OS);
+ OS << "__restrict";
+ }
+}
+
+// Write the "second half" of a given type.
+void Type::outputPost(OutputStream &OS, Type &Ty) { Ty.outputPost(OS); }
+
+void Type::outputPre(OutputStream &OS) {
+ switch (Prim) {
+ case PrimTy::Void:
+ OS << "void";
+ break;
+ case PrimTy::Bool:
+ OS << "bool";
+ break;
+ case PrimTy::Char:
+ OS << "char";
+ break;
+ case PrimTy::Schar:
+ OS << "signed char";
+ break;
+ case PrimTy::Uchar:
+ OS << "unsigned char";
+ break;
+ case PrimTy::Short:
+ OS << "short";
+ break;
+ case PrimTy::Ushort:
+ OS << "unsigned short";
+ break;
+ case PrimTy::Int:
+ OS << "int";
+ break;
+ case PrimTy::Uint:
+ OS << "unsigned int";
+ break;
+ case PrimTy::Long:
+ OS << "long";
+ break;
+ case PrimTy::Ulong:
+ OS << "unsigned long";
+ break;
+ case PrimTy::Int64:
+ OS << "__int64";
+ break;
+ case PrimTy::Uint64:
+ OS << "unsigned __int64";
+ break;
+ case PrimTy::Wchar:
+ OS << "wchar_t";
+ break;
+ case PrimTy::Float:
+ OS << "float";
+ break;
+ case PrimTy::Double:
+ OS << "double";
+ break;
+ case PrimTy::Ldouble:
+ OS << "long double";
+ break;
+ default:
+ assert(false && "Invalid primitive type!");
+ }
+}
+void Type::outputPost(OutputStream &OS) {}
+
+Type *PointerType::clone(ArenaAllocator &Arena) const {
+ return Arena.alloc<PointerType>(*this);
+}
+
+static void outputPointerIndicator(OutputStream &OS, PointerAffinity Affinity,
+ const Name *MemberName,
+ const Type *Pointee) {
+ // "[]" and "()" (for function parameters) take precedence over "*",
+ // so "int *x(int)" means "x is a function returning int *". We need
+  // parentheses to supersede the default precedence. (e.g. we want to
+ // emit something like "int (*x)(int)".)
+ if (Pointee->Prim == PrimTy::Function || Pointee->Prim == PrimTy::Array) {
+ OS << "(";
+ if (Pointee->Prim == PrimTy::Function) {
+ const FunctionType *FTy = static_cast<const FunctionType *>(Pointee);
+ assert(FTy->IsFunctionPointer);
+ outputCallingConvention(OS, FTy->CallConvention);
+ OS << " ";
+ }
+ }
+
+ if (MemberName) {
+ outputName(OS, MemberName);
+ OS << "::";
+ }
+
+ if (Affinity == PointerAffinity::Pointer)
+ OS << "*";
+ else
+ OS << "&";
+}
+
+void PointerType::outputPre(OutputStream &OS) {
+ Type::outputPre(OS, *Pointee);
+
+ outputSpaceIfNecessary(OS);
+
+ if (Quals & Q_Unaligned)
+ OS << "__unaligned ";
+
+ PointerAffinity Affinity = (Prim == PrimTy::Ptr) ? PointerAffinity::Pointer
+ : PointerAffinity::Reference;
+
+ outputPointerIndicator(OS, Affinity, nullptr, Pointee);
+
+ // FIXME: We should output this, but it requires updating lots of tests.
+ // if (Ty.Quals & Q_Pointer64)
+ // OS << " __ptr64";
+}
+
+void PointerType::outputPost(OutputStream &OS) {
+ if (Pointee->Prim == PrimTy::Function || Pointee->Prim == PrimTy::Array)
+ OS << ")";
+
+ Type::outputPost(OS, *Pointee);
+}
+
+Type *MemberPointerType::clone(ArenaAllocator &Arena) const {
+ return Arena.alloc<MemberPointerType>(*this);
+}
+
+void MemberPointerType::outputPre(OutputStream &OS) {
+ Type::outputPre(OS, *Pointee);
+
+ outputSpaceIfNecessary(OS);
+
+ outputPointerIndicator(OS, PointerAffinity::Pointer, MemberName, Pointee);
+
+ // FIXME: We should output this, but it requires updating lots of tests.
+ // if (Ty.Quals & Q_Pointer64)
+ // OS << " __ptr64";
+ if (Quals & Q_Restrict)
+ OS << " __restrict";
+}
+
+void MemberPointerType::outputPost(OutputStream &OS) {
+ if (Pointee->Prim == PrimTy::Function || Pointee->Prim == PrimTy::Array)
+ OS << ")";
+
+ Type::outputPost(OS, *Pointee);
+}
+
+Type *FunctionType::clone(ArenaAllocator &Arena) const {
+ return Arena.alloc<FunctionType>(*this);
+}
+
+void FunctionType::outputPre(OutputStream &OS) {
+ if (!(FunctionClass & Global)) {
+ if (FunctionClass & Static)
+ OS << "static ";
+ }
+
+ if (ReturnType) {
+ Type::outputPre(OS, *ReturnType);
+ OS << " ";
+ }
+
+ // Function pointers print the calling convention as void (__cdecl *)(params)
+ // rather than void __cdecl (*)(params). So we need to let the PointerType
+ // class handle this.
+ if (!IsFunctionPointer)
+ outputCallingConvention(OS, CallConvention);
+}
+
+void FunctionType::outputPost(OutputStream &OS) {
+ OS << "(";
+ outputParameterList(OS, Params);
+ OS << ")";
+ if (Quals & Q_Const)
+ OS << " const";
+ if (Quals & Q_Volatile)
+ OS << " volatile";
+
+ if (ReturnType)
+ Type::outputPost(OS, *ReturnType);
+ return;
+}
+
+Type *UdtType::clone(ArenaAllocator &Arena) const {
+ return Arena.alloc<UdtType>(*this);
+}
+
+void UdtType::outputPre(OutputStream &OS) {
+ switch (Prim) {
+ case PrimTy::Class:
+ OS << "class ";
+ break;
+ case PrimTy::Struct:
+ OS << "struct ";
+ break;
+ case PrimTy::Union:
+ OS << "union ";
+ break;
+ case PrimTy::Enum:
+ OS << "enum ";
+ break;
+ default:
+ assert(false && "Not a udt type!");
+ }
+
+ outputName(OS, UdtName);
+}
+
+Type *ArrayType::clone(ArenaAllocator &Arena) const {
+ return Arena.alloc<ArrayType>(*this);
+}
+
+void ArrayType::outputPre(OutputStream &OS) {
+ Type::outputPre(OS, *ElementType);
+}
+
+void ArrayType::outputPost(OutputStream &OS) {
+ if (ArrayDimension > 0)
+ OS << "[" << ArrayDimension << "]";
+ if (NextDimension)
+ Type::outputPost(OS, *NextDimension);
+ else if (ElementType)
+ Type::outputPost(OS, *ElementType);
+}
+
+} // namespace
+
+namespace {
+
+// Demangler class takes the main role in demangling symbols.
+// It has a set of functions to parse mangled symbols into Type instances.
+// It also has a set of functions to convert Type instances to strings.
+class Demangler {
+public:
+ Demangler(OutputStream &OS, StringView s) : OS(OS), MangledName(s) {}
+
+ // Call parse() first and then check whether Error is true. If it is false,
+ // call output() to write the formatted name to the given stream.
+ void parse();
+ void output();
+
+ // True if an error occurred.
+ bool Error = false;
+
+private:
+ Type *demangleVariableEncoding();
+ Type *demangleFunctionEncoding();
+
+ Qualifiers demanglePointerExtQualifiers();
+
+ // Parser functions. This is a recursive-descent parser.
+ Type *demangleType(QualifierMangleMode QMM);
+ Type *demangleBasicType();
+ UdtType *demangleClassType();
+ PointerType *demanglePointerType();
+ MemberPointerType *demangleMemberPointerType();
+ FunctionType *demangleFunctionType(bool HasThisQuals, bool IsFunctionPointer);
+
+ ArrayType *demangleArrayType();
+
+ ParamList demangleTemplateParameterList();
+ ParamList demangleFunctionParameterList();
+
+ int demangleNumber();
+ void demangleNamePiece(Name &Node, bool IsHead);
+
+ StringView demangleString(bool memorize);
+ void memorizeString(StringView s);
+ Name *demangleName();
+ void demangleOperator(Name *);
+ StringView demangleOperatorName();
+ FuncClass demangleFunctionClass();
+ CallingConv demangleCallingConvention();
+ StorageClass demangleVariableStorageClass();
+ ReferenceKind demangleReferenceKind();
+ void demangleThrowSpecification();
+
+ std::pair<Qualifiers, bool> demangleQualifiers();
+
+ // The result is written to this stream.
+ OutputStream OS;
+
+ // Mangled symbol. demangle* functions shorten this string
+ // as they parse it.
+ StringView MangledName;
+
+ // A parsed mangled symbol.
+ Type *SymbolType = nullptr;
+
+ // The main symbol name. (e.g. "ns::foo" in "int ns::foo()".)
+ Name *SymbolName = nullptr;
+
+ // Memory allocator.
+ ArenaAllocator Arena;
+
+ // A single type uses one global back-ref table for all function params.
+ // This means back-refs can even go "into" other types. Examples:
+ //
+ // // Second int* is a back-ref to first.
+ // void foo(int *, int*);
+ //
+ // // Second int* is not a back-ref to first (first is not a function param).
+ // int* foo(int*);
+ //
+ // // Second int* is a back-ref to first (ALL function types share the same
+ // // back-ref map.)
+ // using F = void(*)(int*);
+ // F G(int *);
+ Type *FunctionParamBackRefs[10];
+ size_t FunctionParamBackRefCount = 0;
+
+ // The first 10 BackReferences in a mangled name can be back-referenced by
+ // special name @[0-9]. This is storage for the first 10 BackReferences.
+ StringView BackReferences[10];
+ size_t BackRefCount = 0;
+};
+} // namespace
+
+// Parser entry point.
+void Demangler::parse() {
+ // MSVC-style mangled symbols must start with '?'.
+ if (!MangledName.consumeFront("?")) {
+ SymbolName = Arena.alloc<Name>();
+ SymbolName->Str = MangledName;
+ SymbolType = Arena.alloc<Type>();
+ SymbolType->Prim = PrimTy::Unknown;
+ }
+
+ // What follows is the main symbol name. This may include
+ // namespaces or class BackReferences.
+ SymbolName = demangleName();
+
+ // Read a variable.
+ if (startsWithDigit(MangledName)) {
+ SymbolType = demangleVariableEncoding();
+ return;
+ }
+
+ // Read a function.
+ SymbolType = demangleFunctionEncoding();
+}
+
+// <type-encoding> ::= <storage-class> <variable-type>
+// <storage-class> ::= 0 # private static member
+// ::= 1 # protected static member
+// ::= 2 # public static member
+// ::= 3 # global
+// ::= 4 # static local
+
+Type *Demangler::demangleVariableEncoding() {
+ StorageClass SC = demangleVariableStorageClass();
+
+ Type *Ty = demangleType(QualifierMangleMode::Drop);
+
+ Ty->Storage = SC;
+
+ // <variable-type> ::= <type> <cvr-qualifiers>
+ // ::= <type> <pointee-cvr-qualifiers> # pointers, references
+ switch (Ty->Prim) {
+ case PrimTy::Ptr:
+ case PrimTy::Ref:
+ case PrimTy::MemberPtr: {
+ Qualifiers ExtraChildQuals = Q_None;
+ Ty->Quals = Qualifiers(Ty->Quals | demanglePointerExtQualifiers());
+
+ bool IsMember = false;
+ std::tie(ExtraChildQuals, IsMember) = demangleQualifiers();
+
+ if (Ty->Prim == PrimTy::MemberPtr) {
+ assert(IsMember);
+ Name *BackRefName = demangleName();
+ (void)BackRefName;
+ MemberPointerType *MPTy = static_cast<MemberPointerType *>(Ty);
+ MPTy->Pointee->Quals = Qualifiers(MPTy->Pointee->Quals | ExtraChildQuals);
+ } else {
+ PointerType *PTy = static_cast<PointerType *>(Ty);
+ PTy->Pointee->Quals = Qualifiers(PTy->Pointee->Quals | ExtraChildQuals);
+ }
+
+ break;
+ }
+ default:
+ Ty->Quals = demangleQualifiers().first;
+ break;
+ }
+
+ return Ty;
+}
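+// Illustrative example (editorial note, not part of the original source): for
+// the well-known mangled name "?x@@3HA", the encoding "3HA" reaches this
+// function as '3' (global storage), 'H' (int), 'A' (no qualifiers), and the
+// symbol demangles to "int x".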
+
+// Sometimes numbers are encoded in mangled symbols. For example,
+// "int (*x)[20]" is a valid C type (x is a pointer to an array of
+// length 20), so we need some way to embed numbers as part of symbols.
+// This function parses it.
+//
+// <number> ::= [?] <non-negative integer>
+//
+// <non-negative integer> ::= <decimal digit> # when 1 <= Number <= 10
+// ::= <hex digit>+ @ # when Number == 0 or >= 10
+//
+// <hex-digit> ::= [A-P] # A = 0, B = 1, ...
+int Demangler::demangleNumber() {
+ bool neg = MangledName.consumeFront("?");
+
+ if (startsWithDigit(MangledName)) {
+ int32_t Ret = MangledName[0] - '0' + 1;
+ MangledName = MangledName.dropFront(1);
+ return neg ? -Ret : Ret;
+ }
+
+ int Ret = 0;
+ for (size_t i = 0; i < MangledName.size(); ++i) {
+ char C = MangledName[i];
+ if (C == '@') {
+ MangledName = MangledName.dropFront(i + 1);
+ return neg ? -Ret : Ret;
+ }
+ if ('A' <= C && C <= 'P') {
+ Ret = (Ret << 4) + (C - 'A');
+ continue;
+ }
+ break;
+ }
+
+ Error = true;
+ return 0;
+}
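+// Illustrative decodings of the grammar above (editorial note, derived from
+// the code):
+//   "0"    ->  1    (a single digit d encodes d + 1)
+//   "9"    -> 10
+//   "BA@"  -> 16    ('B' = 1, 'A' = 0; hex digits are terminated by '@')
+//   "?BA@" -> -16   (a leading '?' negates the value)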
+
+// Read until the next '@'.
+StringView Demangler::demangleString(bool Memorize) {
+ for (size_t i = 0; i < MangledName.size(); ++i) {
+ if (MangledName[i] != '@')
+ continue;
+ StringView ret = MangledName.substr(0, i);
+ MangledName = MangledName.dropFront(i + 1);
+
+ if (Memorize)
+ memorizeString(ret);
+ return ret;
+ }
+
+ Error = true;
+ return "";
+}
+
+// The first 10 strings can be referenced by special BackReferences ?0, ?1, ..., ?9.
+// Memorize them.
+void Demangler::memorizeString(StringView S) {
+ if (BackRefCount >= sizeof(BackReferences) / sizeof(*BackReferences))
+ return;
+ for (size_t i = 0; i < BackRefCount; ++i)
+ if (S == BackReferences[i])
+ return;
+ BackReferences[BackRefCount++] = S;
+}
+
+void Demangler::demangleNamePiece(Name &Node, bool IsHead) {
+ if (startsWithDigit(MangledName)) {
+ size_t I = MangledName[0] - '0';
+ if (I >= BackRefCount) {
+ Error = true;
+ return;
+ }
+ MangledName = MangledName.dropFront();
+ Node.Str = BackReferences[I];
+ } else if (MangledName.consumeFront("?$")) {
+ // Class template.
+ Node.Str = demangleString(false);
+ Node.TemplateParams = demangleTemplateParameterList();
+ } else if (!IsHead && MangledName.consumeFront("?A")) {
+ // Anonymous namespace starts with ?A. So does overloaded operator[],
+ // but the distinguishing factor is that namespaces themselves are not
+ // mangled, only the variables and functions inside of them are. So
+ // an anonymous namespace will never occur as the first item in the
+ // name.
+ Node.Str = "`anonymous namespace'";
+ if (!MangledName.consumeFront('@')) {
+ Error = true;
+ return;
+ }
+ } else if (MangledName.consumeFront("?")) {
+ // Overloaded operator.
+ demangleOperator(&Node);
+ } else {
+ // Non-template functions or classes.
+ Node.Str = demangleString(true);
+ }
+}
+
+// Parses a name in the form of A@B@C@@ which represents C::B::A.
+Name *Demangler::demangleName() {
+ Name *Head = nullptr;
+
+ while (!MangledName.consumeFront("@")) {
+ Name *Elem = Arena.alloc<Name>();
+
+ assert(!Error);
+ demangleNamePiece(*Elem, Head == nullptr);
+ if (Error)
+ return nullptr;
+
+ Elem->Next = Head;
+ Head = Elem;
+ if (MangledName.empty()) {
+ Error = true;
+ return nullptr;
+ }
+ }
+
+ return Head;
+}
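+// Illustrative example (editorial note): in "A@B@0@@" the trailing "0" is a
+// back-reference to the first memorized piece ("A"), so the name demangles to
+// "A::B::A".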
+
+void Demangler::demangleOperator(Name *OpName) {
+ OpName->Operator = demangleOperatorName();
+ if (!Error && !MangledName.empty() && MangledName.front() != '@')
+ demangleNamePiece(*OpName, false);
+}
+
+StringView Demangler::demangleOperatorName() {
+ SwapAndRestore<StringView> RestoreOnError(MangledName, MangledName);
+ RestoreOnError.shouldRestore(false);
+
+ switch (MangledName.popFront()) {
+ case '0':
+ return "ctor";
+ case '1':
+ return "dtor";
+ case '2':
+ return " new";
+ case '3':
+ return " delete";
+ case '4':
+ return "=";
+ case '5':
+ return ">>";
+ case '6':
+ return "<<";
+ case '7':
+ return "!";
+ case '8':
+ return "==";
+ case '9':
+ return "!=";
+ case 'A':
+ return "[]";
+ case 'C':
+ return "->";
+ case 'D':
+ return "*";
+ case 'E':
+ return "++";
+ case 'F':
+ return "--";
+ case 'G':
+ return "-";
+ case 'H':
+ return "+";
+ case 'I':
+ return "&";
+ case 'J':
+ return "->*";
+ case 'K':
+ return "/";
+ case 'L':
+ return "%";
+ case 'M':
+ return "<";
+ case 'N':
+ return "<=";
+ case 'O':
+ return ">";
+ case 'P':
+ return ">=";
+ case 'Q':
+ return ",";
+ case 'R':
+ return "()";
+ case 'S':
+ return "~";
+ case 'T':
+ return "^";
+ case 'U':
+ return "|";
+ case 'V':
+ return "&&";
+ case 'W':
+ return "||";
+ case 'X':
+ return "*=";
+ case 'Y':
+ return "+=";
+ case 'Z':
+ return "-=";
+ case '_': {
+ if (MangledName.empty())
+ break;
+
+ switch (MangledName.popFront()) {
+ case '0':
+ return "/=";
+ case '1':
+ return "%=";
+ case '2':
+ return ">>=";
+ case '3':
+ return "<<=";
+ case '4':
+ return "&=";
+ case '5':
+ return "|=";
+ case '6':
+ return "^=";
+ case 'U':
+ return " new[]";
+ case 'V':
+ return " delete[]";
+ case '_':
+ if (MangledName.consumeFront("L"))
+ return " co_await";
+ }
+ }
+ }
+
+ Error = true;
+ RestoreOnError.shouldRestore(true);
+ return "";
+}
+
+FuncClass Demangler::demangleFunctionClass() {
+ SwapAndRestore<StringView> RestoreOnError(MangledName, MangledName);
+ RestoreOnError.shouldRestore(false);
+
+ switch (MangledName.popFront()) {
+ case 'A':
+ return Private;
+ case 'B':
+ return FuncClass(Private | Far);
+ case 'C':
+ return FuncClass(Private | Static);
+ case 'D':
+ return FuncClass(Private | Static);
+ case 'E':
+ return FuncClass(Private | Virtual);
+ case 'F':
+ return FuncClass(Private | Virtual);
+ case 'I':
+ return Protected;
+ case 'J':
+ return FuncClass(Protected | Far);
+ case 'K':
+ return FuncClass(Protected | Static);
+ case 'L':
+ return FuncClass(Protected | Static | Far);
+ case 'M':
+ return FuncClass(Protected | Virtual);
+ case 'N':
+ return FuncClass(Protected | Virtual | Far);
+ case 'Q':
+ return Public;
+ case 'R':
+ return FuncClass(Public | Far);
+ case 'S':
+ return FuncClass(Public | Static);
+ case 'T':
+ return FuncClass(Public | Static | Far);
+ case 'U':
+ return FuncClass(Public | Virtual);
+ case 'V':
+ return FuncClass(Public | Virtual | Far);
+ case 'Y':
+ return Global;
+ case 'Z':
+ return FuncClass(Global | Far);
+ }
+
+ Error = true;
+ RestoreOnError.shouldRestore(true);
+ return Public;
+}
+
+CallingConv Demangler::demangleCallingConvention() {
+ switch (MangledName.popFront()) {
+ case 'A':
+ case 'B':
+ return CallingConv::Cdecl;
+ case 'C':
+ case 'D':
+ return CallingConv::Pascal;
+ case 'E':
+ case 'F':
+ return CallingConv::Thiscall;
+ case 'G':
+ case 'H':
+ return CallingConv::Stdcall;
+ case 'I':
+ case 'J':
+ return CallingConv::Fastcall;
+ case 'M':
+ case 'N':
+ return CallingConv::Clrcall;
+ case 'O':
+ case 'P':
+ return CallingConv::Eabi;
+ case 'Q':
+ return CallingConv::Vectorcall;
+ }
+
+ return CallingConv::None;
+}
+
+StorageClass Demangler::demangleVariableStorageClass() {
+ assert(std::isdigit(MangledName.front()));
+
+ switch (MangledName.popFront()) {
+ case '0':
+ return StorageClass::PrivateStatic;
+ case '1':
+ return StorageClass::ProtectedStatic;
+ case '2':
+ return StorageClass::PublicStatic;
+ case '3':
+ return StorageClass::Global;
+ case '4':
+ return StorageClass::FunctionLocalStatic;
+ }
+ Error = true;
+ return StorageClass::None;
+}
+
+std::pair<Qualifiers, bool> Demangler::demangleQualifiers() {
+
+ switch (MangledName.popFront()) {
+ // Member qualifiers
+ case 'Q':
+ return std::make_pair(Q_None, true);
+ case 'R':
+ return std::make_pair(Q_Const, true);
+ case 'S':
+ return std::make_pair(Q_Volatile, true);
+ case 'T':
+ return std::make_pair(Qualifiers(Q_Const | Q_Volatile), true);
+ // Non-Member qualifiers
+ case 'A':
+ return std::make_pair(Q_None, false);
+ case 'B':
+ return std::make_pair(Q_Const, false);
+ case 'C':
+ return std::make_pair(Q_Volatile, false);
+ case 'D':
+ return std::make_pair(Qualifiers(Q_Const | Q_Volatile), false);
+ }
+ Error = true;
+ return std::make_pair(Q_None, false);
+}
+
+// <variable-type> ::= <type> <cvr-qualifiers>
+// ::= <type> <pointee-cvr-qualifiers> # pointers, references
+Type *Demangler::demangleType(QualifierMangleMode QMM) {
+ Qualifiers Quals = Q_None;
+ bool IsMember = false;
+ bool IsMemberKnown = false;
+ if (QMM == QualifierMangleMode::Mangle) {
+ std::tie(Quals, IsMember) = demangleQualifiers();
+ IsMemberKnown = true;
+ } else if (QMM == QualifierMangleMode::Result) {
+ if (MangledName.consumeFront('?')) {
+ std::tie(Quals, IsMember) = demangleQualifiers();
+ IsMemberKnown = true;
+ }
+ }
+
+ Type *Ty = nullptr;
+ switch (MangledName.front()) {
+ case 'T': // union
+ case 'U': // struct
+ case 'V': // class
+ case 'W': // enum
+ Ty = demangleClassType();
+ break;
+ case 'A': // foo &
+ case 'P': // foo *
+ case 'Q': // foo *const
+ case 'R': // foo *volatile
+ case 'S': // foo *const volatile
+ if (!IsMemberKnown)
+ IsMember = isMemberPointer(MangledName);
+ if (IsMember)
+ Ty = demangleMemberPointerType();
+ else
+ Ty = demanglePointerType();
+ break;
+ case 'Y':
+ Ty = demangleArrayType();
+ break;
+ default:
+ Ty = demangleBasicType();
+ break;
+ }
+ Ty->Quals = Qualifiers(Ty->Quals | Quals);
+ return Ty;
+}
+
+ReferenceKind Demangler::demangleReferenceKind() {
+ if (MangledName.consumeFront('G'))
+ return ReferenceKind::LValueRef;
+ else if (MangledName.consumeFront('H'))
+ return ReferenceKind::RValueRef;
+ return ReferenceKind::None;
+}
+
+void Demangler::demangleThrowSpecification() {
+ if (MangledName.consumeFront('Z'))
+ return;
+
+ Error = true;
+}
+
+FunctionType *Demangler::demangleFunctionType(bool HasThisQuals,
+ bool IsFunctionPointer) {
+ FunctionType *FTy = Arena.alloc<FunctionType>();
+ FTy->Prim = PrimTy::Function;
+ FTy->IsFunctionPointer = IsFunctionPointer;
+
+ if (HasThisQuals) {
+ FTy->Quals = demanglePointerExtQualifiers();
+ FTy->RefKind = demangleReferenceKind();
+ FTy->Quals = Qualifiers(FTy->Quals | demangleQualifiers().first);
+ }
+
+ // Fields that appear on both member and non-member functions.
+ FTy->CallConvention = demangleCallingConvention();
+
+ // <return-type> ::= <type>
+ // ::= @ # structors (they have no declared return type)
+ bool IsStructor = MangledName.consumeFront('@');
+ if (!IsStructor)
+ FTy->ReturnType = demangleType(QualifierMangleMode::Result);
+
+ FTy->Params = demangleFunctionParameterList();
+
+ demangleThrowSpecification();
+
+ return FTy;
+}
+
+Type *Demangler::demangleFunctionEncoding() {
+ FuncClass FC = demangleFunctionClass();
+
+ bool HasThisQuals = !(FC & (Global | Static));
+ FunctionType *FTy = demangleFunctionType(HasThisQuals, false);
+ FTy->FunctionClass = FC;
+
+ return FTy;
+}
+
+// Reads a primitive type.
+Type *Demangler::demangleBasicType() {
+ Type *Ty = Arena.alloc<Type>();
+
+ switch (MangledName.popFront()) {
+ case 'X':
+ Ty->Prim = PrimTy::Void;
+ break;
+ case 'D':
+ Ty->Prim = PrimTy::Char;
+ break;
+ case 'C':
+ Ty->Prim = PrimTy::Schar;
+ break;
+ case 'E':
+ Ty->Prim = PrimTy::Uchar;
+ break;
+ case 'F':
+ Ty->Prim = PrimTy::Short;
+ break;
+ case 'G':
+ Ty->Prim = PrimTy::Ushort;
+ break;
+ case 'H':
+ Ty->Prim = PrimTy::Int;
+ break;
+ case 'I':
+ Ty->Prim = PrimTy::Uint;
+ break;
+ case 'J':
+ Ty->Prim = PrimTy::Long;
+ break;
+ case 'K':
+ Ty->Prim = PrimTy::Ulong;
+ break;
+ case 'M':
+ Ty->Prim = PrimTy::Float;
+ break;
+ case 'N':
+ Ty->Prim = PrimTy::Double;
+ break;
+ case 'O':
+ Ty->Prim = PrimTy::Ldouble;
+ break;
+ case '_': {
+ if (MangledName.empty()) {
+ Error = true;
+ return nullptr;
+ }
+ switch (MangledName.popFront()) {
+ case 'N':
+ Ty->Prim = PrimTy::Bool;
+ break;
+ case 'J':
+ Ty->Prim = PrimTy::Int64;
+ break;
+ case 'K':
+ Ty->Prim = PrimTy::Uint64;
+ break;
+ case 'W':
+ Ty->Prim = PrimTy::Wchar;
+ break;
+ default:
+ assert(false);
+ }
+ break;
+ }
+ }
+ return Ty;
+}
+
+UdtType *Demangler::demangleClassType() {
+ UdtType *UTy = Arena.alloc<UdtType>();
+
+ switch (MangledName.popFront()) {
+ case 'T':
+ UTy->Prim = PrimTy::Union;
+ break;
+ case 'U':
+ UTy->Prim = PrimTy::Struct;
+ break;
+ case 'V':
+ UTy->Prim = PrimTy::Class;
+ break;
+ case 'W':
+ if (MangledName.popFront() != '4') {
+ Error = true;
+ return nullptr;
+ }
+ UTy->Prim = PrimTy::Enum;
+ break;
+ default:
+ assert(false);
+ }
+
+ UTy->UdtName = demangleName();
+ return UTy;
+}
+
+static std::pair<Qualifiers, PointerAffinity>
+demanglePointerCVQualifiers(StringView &MangledName) {
+ switch (MangledName.popFront()) {
+ case 'A':
+ return std::make_pair(Q_None, PointerAffinity::Reference);
+ case 'P':
+ return std::make_pair(Q_None, PointerAffinity::Pointer);
+ case 'Q':
+ return std::make_pair(Q_Const, PointerAffinity::Pointer);
+ case 'R':
+ return std::make_pair(Q_Volatile, PointerAffinity::Pointer);
+ case 'S':
+ return std::make_pair(Qualifiers(Q_Const | Q_Volatile),
+ PointerAffinity::Pointer);
+ default:
+ assert(false && "Ty is not a pointer type!");
+ }
+ return std::make_pair(Q_None, PointerAffinity::Pointer);
+}
+
+// <pointer-type> ::= E? <pointer-cvr-qualifiers> <ext-qualifiers> <type>
+// # the E is required for 64-bit non-static pointers
+PointerType *Demangler::demanglePointerType() {
+ PointerType *Pointer = Arena.alloc<PointerType>();
+
+ PointerAffinity Affinity;
+ std::tie(Pointer->Quals, Affinity) = demanglePointerCVQualifiers(MangledName);
+
+ Pointer->Prim =
+ (Affinity == PointerAffinity::Pointer) ? PrimTy::Ptr : PrimTy::Ref;
+ if (MangledName.consumeFront("6")) {
+ Pointer->Pointee = demangleFunctionType(false, true);
+ return Pointer;
+ }
+
+ Qualifiers ExtQuals = demanglePointerExtQualifiers();
+ Pointer->Quals = Qualifiers(Pointer->Quals | ExtQuals);
+
+ Pointer->Pointee = demangleType(QualifierMangleMode::Mangle);
+ return Pointer;
+}
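+// Illustrative example (editorial note): "PAH" parses as 'P' (plain pointer),
+// 'A' (unqualified pointee), 'H' (int), i.e. "int *".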
+
+MemberPointerType *Demangler::demangleMemberPointerType() {
+ MemberPointerType *Pointer = Arena.alloc<MemberPointerType>();
+ Pointer->Prim = PrimTy::MemberPtr;
+
+ PointerAffinity Affinity;
+ std::tie(Pointer->Quals, Affinity) = demanglePointerCVQualifiers(MangledName);
+ assert(Affinity == PointerAffinity::Pointer);
+
+ Qualifiers ExtQuals = demanglePointerExtQualifiers();
+ Pointer->Quals = Qualifiers(Pointer->Quals | ExtQuals);
+
+ if (MangledName.consumeFront("8")) {
+ Pointer->MemberName = demangleName();
+ Pointer->Pointee = demangleFunctionType(true, true);
+ } else {
+ Qualifiers PointeeQuals = Q_None;
+ bool IsMember = false;
+ std::tie(PointeeQuals, IsMember) = demangleQualifiers();
+ assert(IsMember);
+ Pointer->MemberName = demangleName();
+
+ Pointer->Pointee = demangleType(QualifierMangleMode::Drop);
+ Pointer->Pointee->Quals = PointeeQuals;
+ }
+
+ return Pointer;
+}
+
+Qualifiers Demangler::demanglePointerExtQualifiers() {
+ Qualifiers Quals = Q_None;
+ if (MangledName.consumeFront('E'))
+ Quals = Qualifiers(Quals | Q_Pointer64);
+ if (MangledName.consumeFront('I'))
+ Quals = Qualifiers(Quals | Q_Restrict);
+ if (MangledName.consumeFront('F'))
+ Quals = Qualifiers(Quals | Q_Unaligned);
+
+ return Quals;
+}
+
+ArrayType *Demangler::demangleArrayType() {
+ assert(MangledName.front() == 'Y');
+ MangledName.popFront();
+
+ int Dimension = demangleNumber();
+ if (Dimension <= 0) {
+ Error = true;
+ return nullptr;
+ }
+
+ ArrayType *ATy = Arena.alloc<ArrayType>();
+ ArrayType *Dim = ATy;
+ for (int I = 0; I < Dimension; ++I) {
+ Dim->Prim = PrimTy::Array;
+ Dim->ArrayDimension = demangleNumber();
+ Dim->NextDimension = Arena.alloc<ArrayType>();
+ Dim = Dim->NextDimension;
+ }
+
+ if (MangledName.consumeFront("$$C")) {
+ if (MangledName.consumeFront("B"))
+ ATy->Quals = Q_Const;
+ else if (MangledName.consumeFront("C") || MangledName.consumeFront("D"))
+ ATy->Quals = Qualifiers(Q_Const | Q_Volatile);
+ else if (!MangledName.consumeFront("A"))
+ Error = true;
+ }
+
+ ATy->ElementType = demangleType(QualifierMangleMode::Drop);
+ Dim->ElementType = ATy->ElementType;
+ return ATy;
+}
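+// Illustrative example (editorial note): as parsed by the routine above,
+// "Y112H" encodes two dimensions (the leading '1' means 2), with bounds 2 and
+// 3 (the following '1' and '2'), and an int element type ('H'), i.e.
+// "int [2][3]".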
+
+// Reads a function or template parameter list.
+ParamList Demangler::demangleFunctionParameterList() {
+ // Empty parameter list.
+ if (MangledName.consumeFront('X'))
+ return {};
+
+ ParamList *Head;
+ ParamList **Current = &Head;
+ while (!Error && !MangledName.startsWith('@') &&
+ !MangledName.startsWith('Z')) {
+
+ if (startsWithDigit(MangledName)) {
+ size_t N = MangledName[0] - '0';
+ if (N >= FunctionParamBackRefCount) {
+ Error = true;
+ return {};
+ }
+ MangledName = MangledName.dropFront();
+
+ *Current = Arena.alloc<ParamList>();
+ (*Current)->Current = FunctionParamBackRefs[N]->clone(Arena);
+ Current = &(*Current)->Next;
+ continue;
+ }
+
+ size_t OldSize = MangledName.size();
+
+ *Current = Arena.alloc<ParamList>();
+ (*Current)->Current = demangleType(QualifierMangleMode::Drop);
+
+ size_t CharsConsumed = OldSize - MangledName.size();
+ assert(CharsConsumed != 0);
+
+ // Single-letter types are ignored for backreferences because memorizing
+ // them doesn't save anything.
+ if (FunctionParamBackRefCount <= 9 && CharsConsumed > 1)
+ FunctionParamBackRefs[FunctionParamBackRefCount++] = (*Current)->Current;
+
+ Current = &(*Current)->Next;
+ }
+
+ if (Error)
+ return {};
+
+ // A non-empty parameter list is terminated by either 'Z' (a variadic list)
+ // or '@' (a non-variadic one). Be careful not to consume "@Z", since in
+ // that case the following 'Z' could be a throw specifier.
+ if (MangledName.consumeFront('@'))
+ return *Head;
+
+ if (MangledName.consumeFront('Z')) {
+ Head->IsVariadic = true;
+ return *Head;
+ }
+
+ Error = true;
+ return {};
+}
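+// Illustrative examples (editorial note, assuming the well-known MSVC
+// manglings): in "?f@@YAXH@Z" the parameter list "H@" is a single int
+// terminated by '@' (non-variadic), while in "?f@@YAXHZZ" the first 'Z'
+// terminates a variadic list and the second 'Z' is the throw specification.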
+
+ParamList Demangler::demangleTemplateParameterList() {
+ ParamList *Head;
+ ParamList **Current = &Head;
+ while (!Error && !MangledName.startsWith('@')) {
+
+ // Template parameter lists don't participate in back-referencing.
+ *Current = Arena.alloc<ParamList>();
+ (*Current)->Current = demangleType(QualifierMangleMode::Drop);
+
+ Current = &(*Current)->Next;
+ }
+
+ if (Error)
+ return {};
+
+ // Template parameter lists cannot be variadic, so it can only be terminated
+ // by @.
+ if (MangledName.consumeFront('@'))
+ return *Head;
+ Error = true;
+ return {};
+}
+
+void Demangler::output() {
+ // Converts an AST to a string.
+ //
+ // Converting an AST representing a C++ type to a string is tricky due
+ // to the bad grammar of the C++ declaration inherited from C. You have
+ // to construct a string from inside to outside. For example, if a type
+ // X is a pointer to a function returning int, the order you create a
+ // string becomes something like this:
+ //
+ // (1) X is a pointer: *X
+ // (2) (1) is a function returning int: int (*X)()
+ //
+ // So you cannot construct a result just by appending strings to a result.
+ //
+ // To deal with this, we split the function into two. outputPre() writes
+ // the "first half" of type declaration, and outputPost() writes the
+ // "second half". For example, outputPre() writes a return type for a
+ // function and outputPost() writes a parameter list.
+ Type::outputPre(OS, *SymbolType);
+ outputName(OS, SymbolName);
+ Type::outputPost(OS, *SymbolType);
+
+ // Null terminate the buffer.
+ OS << '\0';
+}
+
+char *llvm::microsoftDemangle(const char *MangledName, char *Buf, size_t *N,
+ int *Status) {
+ OutputStream OS = OutputStream::create(Buf, N, 1024);
+
+ Demangler D(OS, StringView(MangledName));
+ D.parse();
+
+ if (D.Error)
+ *Status = llvm::demangle_invalid_mangled_name;
+ else
+ *Status = llvm::demangle_success;
+
+ D.output();
+ return OS.getBuffer();
+}
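+// A minimal usage sketch of the entry point above (editorial illustration,
+// not part of the original source):
+//
+//   int Status = 0;
+//   char *Result =
+//       llvm::microsoftDemangle("?f@@YAXH@Z", nullptr, nullptr, &Status);
+//   if (Status == llvm::demangle_success)
+//     std::puts(Result); // roughly "void __cdecl f(int)"
+//   std::free(Result);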
diff --git a/contrib/llvm/lib/Demangle/StringView.h b/contrib/llvm/lib/Demangle/StringView.h
new file mode 100644
index 000000000000..3416db2c2867
--- /dev/null
+++ b/contrib/llvm/lib/Demangle/StringView.h
@@ -0,0 +1,97 @@
+//===--- StringView.h -------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+// This file contains a limited version of LLVM's StringView class. It is
+// copied here so that LLVMDemangle need not take a dependency on LLVMSupport.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEMANGLE_STRINGVIEW_H
+#define LLVM_DEMANGLE_STRINGVIEW_H
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+
+class StringView {
+ const char *First;
+ const char *Last;
+
+public:
+ template <size_t N>
+ StringView(const char (&Str)[N]) : First(Str), Last(Str + N - 1) {}
+ StringView(const char *First_, const char *Last_)
+ : First(First_), Last(Last_) {}
+ StringView(const char *First_, size_t Len)
+ : First(First_), Last(First_ + Len) {}
+ StringView(const char *Str) : First(Str), Last(Str + std::strlen(Str)) {}
+ StringView() : First(nullptr), Last(nullptr) {}
+
+ StringView substr(size_t From) const {
+ return StringView(begin() + From, size() - From);
+ }
+
+ StringView substr(size_t From, size_t To) const {
+ if (To >= size())
+ To = size() - 1;
+ if (From >= size())
+ From = size() - 1;
+ return StringView(First + From, First + To);
+ }
+
+ StringView dropFront(size_t N = 1) const {
+ if (N >= size())
+ N = size();
+ return StringView(First + N, Last);
+ }
+
+ char front() const {
+ assert(!empty());
+ return *begin();
+ }
+
+ char popFront() {
+ assert(!empty());
+ return *First++;
+ }
+
+ bool consumeFront(char C) {
+ if (!startsWith(C))
+ return false;
+ *this = dropFront(1);
+ return true;
+ }
+
+ bool consumeFront(StringView S) {
+ if (!startsWith(S))
+ return false;
+ *this = dropFront(S.size());
+ return true;
+ }
+
+ bool startsWith(char C) const { return !empty() && *begin() == C; }
+
+ bool startsWith(StringView Str) const {
+ if (Str.size() > size())
+ return false;
+ return std::equal(Str.begin(), Str.end(), begin());
+ }
+
+ const char &operator[](size_t Idx) const { return *(begin() + Idx); }
+
+ const char *begin() const { return First; }
+ const char *end() const { return Last; }
+ size_t size() const { return static_cast<size_t>(Last - First); }
+ bool empty() const { return First == Last; }
+};
+
+inline bool operator==(const StringView &LHS, const StringView &RHS) {
+ return LHS.size() == RHS.size() &&
+ std::equal(LHS.begin(), LHS.end(), RHS.begin());
+}
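+// Illustrative usage (editorial note, not part of the original header): the
+// demangler consumes a view from the front, e.g.
+//
+//   StringView S("?foo@@");
+//   bool IsMsvc = S.consumeFront('?'); // true; S is now "foo@@"
+//   char C = S.popFront();             // 'f'; S is now "oo@@"
+//   StringView Rest = S.dropFront(2);  // "@@"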
+
+#endif
diff --git a/contrib/llvm/lib/Demangle/Utility.h b/contrib/llvm/lib/Demangle/Utility.h
new file mode 100644
index 000000000000..54cd99e5026b
--- /dev/null
+++ b/contrib/llvm/lib/Demangle/Utility.h
@@ -0,0 +1,188 @@
+//===--- Utility.h ----------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+// This file contains several utility classes used by the demangle library.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEMANGLE_UTILITY_H
+#define LLVM_DEMANGLE_UTILITY_H
+
+#include "StringView.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <limits>
+
+// Stream that AST nodes write their string representation into after the AST
+// has been parsed.
+class OutputStream {
+ char *Buffer;
+ size_t CurrentPosition;
+ size_t BufferCapacity;
+
+ // Ensure there are at least N more positions in the buffer.
+ void grow(size_t N) {
+ if (N + CurrentPosition >= BufferCapacity) {
+ BufferCapacity *= 2;
+ if (BufferCapacity < N + CurrentPosition)
+ BufferCapacity = N + CurrentPosition;
+ Buffer = static_cast<char *>(std::realloc(Buffer, BufferCapacity));
+ if (Buffer == nullptr)
+ std::terminate();
+ }
+ }
+
+ void writeUnsigned(uint64_t N, bool isNeg = false) {
+ // Handle special case...
+ if (N == 0) {
+ *this << '0';
+ return;
+ }
+
+ char Temp[21];
+ char *TempPtr = std::end(Temp);
+
+ while (N) {
+ *--TempPtr = '0' + char(N % 10);
+ N /= 10;
+ }
+
+ // Add negative sign...
+ if (isNeg)
+ *--TempPtr = '-';
+ this->operator<<(StringView(TempPtr, std::end(Temp)));
+ }
+
+public:
+ OutputStream(char *StartBuf, size_t Size)
+ : Buffer(StartBuf), CurrentPosition(0), BufferCapacity(Size) {}
+ OutputStream() = default;
+ void reset(char *Buffer_, size_t BufferCapacity_) {
+ CurrentPosition = 0;
+ Buffer = Buffer_;
+ BufferCapacity = BufferCapacity_;
+ }
+
+ /// Create an OutputStream from a buffer and a size. If either of these is
+ /// null, a buffer is allocated.
+ static OutputStream create(char *StartBuf, size_t *Size, size_t AllocSize) {
+ OutputStream Result;
+
+ if (!StartBuf || !Size) {
+ StartBuf = static_cast<char *>(std::malloc(AllocSize));
+ if (StartBuf == nullptr)
+ std::terminate();
+ Size = &AllocSize;
+ }
+
+ Result.reset(StartBuf, *Size);
+ return Result;
+ }
+
+ /// If a ParameterPackExpansion (or similar type) is encountered, the offset
+ /// into the pack that we're currently printing.
+ unsigned CurrentPackIndex = std::numeric_limits<unsigned>::max();
+ unsigned CurrentPackMax = std::numeric_limits<unsigned>::max();
+
+ OutputStream &operator+=(StringView R) {
+ size_t Size = R.size();
+ if (Size == 0)
+ return *this;
+ grow(Size);
+ std::memmove(Buffer + CurrentPosition, R.begin(), Size);
+ CurrentPosition += Size;
+ return *this;
+ }
+
+ OutputStream &operator+=(char C) {
+ grow(1);
+ Buffer[CurrentPosition++] = C;
+ return *this;
+ }
+
+ OutputStream &operator<<(StringView R) { return (*this += R); }
+
+ OutputStream &operator<<(char C) { return (*this += C); }
+
+ OutputStream &operator<<(long long N) {
+ if (N < 0)
+ writeUnsigned(static_cast<unsigned long long>(-N), true);
+ else
+ writeUnsigned(static_cast<unsigned long long>(N));
+ return *this;
+ }
+
+ OutputStream &operator<<(unsigned long long N) {
+ writeUnsigned(N, false);
+ return *this;
+ }
+
+ OutputStream &operator<<(long N) {
+ return this->operator<<(static_cast<long long>(N));
+ }
+
+ OutputStream &operator<<(unsigned long N) {
+ return this->operator<<(static_cast<unsigned long long>(N));
+ }
+
+ OutputStream &operator<<(int N) {
+ return this->operator<<(static_cast<long long>(N));
+ }
+
+ OutputStream &operator<<(unsigned int N) {
+ return this->operator<<(static_cast<unsigned long long>(N));
+ }
+
+ size_t getCurrentPosition() const { return CurrentPosition; }
+ void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
+
+ char back() const {
+ return CurrentPosition ? Buffer[CurrentPosition - 1] : '\0';
+ }
+
+ bool empty() const { return CurrentPosition == 0; }
+
+ char *getBuffer() { return Buffer; }
+ char *getBufferEnd() { return Buffer + CurrentPosition - 1; }
+ size_t getBufferCapacity() { return BufferCapacity; }
+};
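+// Illustrative usage (editorial note, not part of the original header):
+//
+//   OutputStream OS = OutputStream::create(nullptr, nullptr, 64);
+//   OS << "int [" << 3 << "]"; // buffer now holds "int [3]"
+//   OS << '\0';                // callers null-terminate explicitly
+//   std::free(OS.getBuffer());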
+
+template <class T> class SwapAndRestore {
+ T &Restore;
+ T OriginalValue;
+ bool ShouldRestore = true;
+
+public:
+ SwapAndRestore(T &Restore_) : SwapAndRestore(Restore_, Restore_) {}
+
+ SwapAndRestore(T &Restore_, T NewVal)
+ : Restore(Restore_), OriginalValue(Restore) {
+ Restore = std::move(NewVal);
+ }
+ ~SwapAndRestore() {
+ if (ShouldRestore)
+ Restore = std::move(OriginalValue);
+ }
+
+ void shouldRestore(bool ShouldRestore_) { ShouldRestore = ShouldRestore_; }
+
+ void restoreNow(bool Force) {
+ if (!Force && !ShouldRestore)
+ return;
+
+ Restore = std::move(OriginalValue);
+ ShouldRestore = false;
+ }
+
+ SwapAndRestore(const SwapAndRestore &) = delete;
+ SwapAndRestore &operator=(const SwapAndRestore &) = delete;
+};
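+// Illustrative usage (editorial note): the demangler constructs one of these
+// over the mangled string, disables restoration up front, and re-enables it
+// only on a parse error so that the input is rewound, e.g.
+//
+//   SwapAndRestore<StringView> RestoreOnError(MangledName, MangledName);
+//   RestoreOnError.shouldRestore(false); // keep progress on success
+//   ...
+//   RestoreOnError.shouldRestore(true);  // on error: rewind MangledName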
+
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
index c59885753a8f..ae96c7f5955f 100644
--- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -49,14 +49,13 @@ STATISTIC(NumGlobals , "Number of global vars initialized");
ExecutionEngine *(*ExecutionEngine::MCJITCtor)(
std::unique_ptr<Module> M, std::string *ErrorStr,
std::shared_ptr<MCJITMemoryManager> MemMgr,
-
- std::shared_ptr<JITSymbolResolver> Resolver,
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver,
std::unique_ptr<TargetMachine> TM) = nullptr;
ExecutionEngine *(*ExecutionEngine::OrcMCJITReplacementCtor)(
- std::string *ErrorStr, std::shared_ptr<MCJITMemoryManager> MemMgr,
- std::shared_ptr<JITSymbolResolver> Resolver,
- std::unique_ptr<TargetMachine> TM) = nullptr;
+ std::string *ErrorStr, std::shared_ptr<MCJITMemoryManager> MemMgr,
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver,
+ std::unique_ptr<TargetMachine> TM) = nullptr;
ExecutionEngine *(*ExecutionEngine::InterpCtor)(std::unique_ptr<Module> M,
std::string *ErrorStr) =nullptr;
@@ -97,14 +96,14 @@ ExecutionEngine::~ExecutionEngine() {
}
namespace {
-/// \brief Helper class which uses a value handler to automatically deletes the
+/// Helper class which uses a value handler to automatically delete the
/// memory block when the GlobalVariable is destroyed.
class GVMemoryBlock final : public CallbackVH {
GVMemoryBlock(const GlobalVariable *GV)
: CallbackVH(const_cast<GlobalVariable*>(GV)) {}
public:
- /// \brief Returns the address the GlobalVariable should be written into. The
+ /// Returns the address the GlobalVariable should be written into. The
/// GVMemoryBlock object prefixes that.
static char *Create(const GlobalVariable *GV, const DataLayout& TD) {
Type *ElTy = GV->getValueType();
@@ -215,7 +214,7 @@ void ExecutionEngine::addGlobalMapping(StringRef Name, uint64_t Addr) {
assert(!Name.empty() && "Empty GlobalMapping symbol name!");
- DEBUG(dbgs() << "JIT: Map \'" << Name << "\' to [" << Addr << "]\n";);
+ LLVM_DEBUG(dbgs() << "JIT: Map \'" << Name << "\' to [" << Addr << "]\n";);
uint64_t &CurVal = EEState.getGlobalAddressMap()[Name];
assert((!CurVal || !Addr) && "GlobalMapping already established!");
CurVal = Addr;
@@ -344,13 +343,14 @@ void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE,
unsigned PtrSize = EE->getDataLayout().getPointerSize();
Array = make_unique<char[]>((InputArgv.size()+1)*PtrSize);
- DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array.get() << "\n");
+ LLVM_DEBUG(dbgs() << "JIT: ARGV = " << (void *)Array.get() << "\n");
Type *SBytePtr = Type::getInt8PtrTy(C);
for (unsigned i = 0; i != InputArgv.size(); ++i) {
unsigned Size = InputArgv[i].size()+1;
auto Dest = make_unique<char[]>(Size);
- DEBUG(dbgs() << "JIT: ARGV[" << i << "] = " << (void*)Dest.get() << "\n");
+ LLVM_DEBUG(dbgs() << "JIT: ARGV[" << i << "] = " << (void *)Dest.get()
+ << "\n");
std::copy(InputArgv[i].begin(), InputArgv[i].end(), Dest.get());
Dest[Size-1] = 0;
@@ -502,9 +502,9 @@ EngineBuilder::setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM) {
return *this;
}
-EngineBuilder&
-EngineBuilder::setSymbolResolver(std::unique_ptr<JITSymbolResolver> SR) {
- Resolver = std::shared_ptr<JITSymbolResolver>(std::move(SR));
+EngineBuilder &
+EngineBuilder::setSymbolResolver(std::unique_ptr<LegacyJITSymbolResolver> SR) {
+ Resolver = std::shared_ptr<LegacyJITSymbolResolver>(std::move(SR));
return *this;
}
@@ -532,7 +532,6 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
// Unless the interpreter was explicitly selected or the JIT is not linked,
// try making a JIT.
if ((WhichEngine & EngineKind::JIT) && TheTM) {
- Triple TT(M->getTargetTriple());
if (!TM->getTarget().hasJIT()) {
errs() << "WARNING: This target JIT is not designed for the host"
<< " you are running. If bad things happen, please choose"
@@ -591,7 +590,7 @@ void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) {
return getPointerToGlobalIfAvailable(GV);
}
-/// \brief Converts a Constant* into a GenericValue, including handling of
+/// Converts a Constant* into a GenericValue, including handling of
/// ConstantExpr values.
GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
// If its undefined, return the garbage.
@@ -904,6 +903,9 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
Result.IntVal = cast<ConstantInt>(C)->getValue();
break;
case Type::PointerTyID:
+ while (auto *A = dyn_cast<GlobalAlias>(C)) {
+ C = A->getAliasee();
+ }
if (isa<ConstantPointerNull>(C))
Result.PointerVal = nullptr;
else if (const Function *F = dyn_cast<Function>(C))
@@ -1182,8 +1184,8 @@ void ExecutionEngine::LoadValueFromMemory(GenericValue &Result,
}
void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
- DEBUG(dbgs() << "JIT: Initializing " << Addr << " ");
- DEBUG(Init->dump());
+ LLVM_DEBUG(dbgs() << "JIT: Initializing " << Addr << " ");
+ LLVM_DEBUG(Init->dump());
if (isa<UndefValue>(Init))
return;
@@ -1230,7 +1232,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
return;
}
- DEBUG(dbgs() << "Bad Type: " << *Init->getType() << "\n");
+ LLVM_DEBUG(dbgs() << "Bad Type: " << *Init->getType() << "\n");
llvm_unreachable("Unknown constant type to initialize memory with!");
}
diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 20251c23b17c..abcdaeba8eb0 100644
--- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -14,11 +14,12 @@
#include "llvm-c/ExecutionEngine.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/CodeGenCWrappers.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/CodeGenCWrappers.h"
#include "llvm/Target/TargetOptions.h"
#include <cstring>
@@ -411,3 +412,26 @@ void LLVMDisposeMCJITMemoryManager(LLVMMCJITMemoryManagerRef MM) {
delete unwrap(MM);
}
+/*===-- JIT Event Listener functions -------------------------------------===*/
+
+
+#if !LLVM_USE_INTEL_JITEVENTS
+LLVMJITEventListenerRef LLVMCreateIntelJITEventListener(void)
+{
+ return nullptr;
+}
+#endif
+
+#if !LLVM_USE_OPROFILE
+LLVMJITEventListenerRef LLVMCreateOProfileJITEventListener(void)
+{
+ return nullptr;
+}
+#endif
+
+#if !LLVM_USE_PERF
+LLVMJITEventListenerRef LLVMCreatePerfJITEventListener(void)
+{
+ return nullptr;
+}
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp b/contrib/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
index dad099d73c96..fd4f0746f7f9 100644
--- a/contrib/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
@@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm-c/ExecutionEngine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/Object/ObjectFile.h"
@@ -235,3 +236,8 @@ JITEventListener* JITEventListener::createGDBRegistrationListener() {
}
} // namespace llvm
+
+LLVMJITEventListenerRef LLVMCreateGDBRegistrationListener(void)
+{
+ return wrap(JITEventListener::createGDBRegistrationListener());
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index cb6dd5e57283..211f5216811f 100644
--- a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "IntelJITEventsWrapper.h"
+#include "llvm-c/ExecutionEngine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Config/config.h"
@@ -100,15 +101,17 @@ void IntelJITEventListener::NotifyObjectEmitted(
const RuntimeDyld::LoadedObjectInfo &L) {
OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
- const ObjectFile &DebugObj = *DebugObjOwner.getBinary();
+ const ObjectFile *DebugObj = DebugObjOwner.getBinary();
+ if (!DebugObj)
+ return;
// Get the address of the object image for use as a unique identifier
- const void* ObjData = DebugObj.getData().data();
- std::unique_ptr<DIContext> Context = DWARFContext::create(DebugObj);
+ const void* ObjData = DebugObj->getData().data();
+ std::unique_ptr<DIContext> Context = DWARFContext::create(*DebugObj);
MethodAddressVector Functions;
// Use symbol info to iterate functions in the object.
- for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
+ for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(*DebugObj)) {
SymbolRef Sym = P.first;
std::vector<LineNumberInfo> LineInfo;
std::string SourceFileName;
@@ -238,3 +241,7 @@ JITEventListener *JITEventListener::createIntelJITEventListener(
} // namespace llvm
+LLVMJITEventListenerRef LLVMCreateIntelJITEventListener(void)
+{
+ return wrap(JITEventListener::createIntelJITEventListener());
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
index f2d36a76a315..bc8fea148749 100644
--- a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
@@ -26,7 +26,6 @@
#include <pthread.h>
#include <stdint.h>
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#include <malloc.h>
#include <stdlib.h>
#include "jitprofiling.h"
diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
index 96844439e721..9e77d160c30b 100644
--- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -974,11 +974,11 @@ void Interpreter::visitAllocaInst(AllocaInst &I) {
unsigned MemToAlloc = std::max(1U, NumElements * TypeSize);
// Allocate enough memory to hold the type...
- void *Memory = malloc(MemToAlloc);
+ void *Memory = safe_malloc(MemToAlloc);
- DEBUG(dbgs() << "Allocated Type: " << *Ty << " (" << TypeSize << " bytes) x "
- << NumElements << " (Total: " << MemToAlloc << ") at "
- << uintptr_t(Memory) << '\n');
+ LLVM_DEBUG(dbgs() << "Allocated Type: " << *Ty << " (" << TypeSize
+ << " bytes) x " << NumElements << " (Total: " << MemToAlloc
+ << ") at " << uintptr_t(Memory) << '\n');
GenericValue Result = PTOGV(Memory);
assert(Result.PointerVal && "Null pointer returned by malloc!");
@@ -1025,7 +1025,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I,
GenericValue Result;
Result.PointerVal = ((char*)getOperandValue(Ptr, SF).PointerVal) + Total;
- DEBUG(dbgs() << "GEP Index " << Total << " bytes.\n");
+ LLVM_DEBUG(dbgs() << "GEP Index " << Total << " bytes.\n");
return Result;
}
@@ -2118,7 +2118,7 @@ void Interpreter::run() {
// Track the number of dynamic instructions executed.
++NumDynamicInsts;
- DEBUG(dbgs() << "About to interpret: " << I);
+ LLVM_DEBUG(dbgs() << "About to interpret: " << I);
visit(I); // Dispatch to one of the visit* methods...
}
}
diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index f7b8a3b657ee..2c663c2e1edf 100644
--- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -39,11 +39,10 @@ static struct RegisterJIT {
extern "C" void LLVMLinkInMCJIT() {
}
-ExecutionEngine*
-MCJIT::createJIT(std::unique_ptr<Module> M,
- std::string *ErrorStr,
+ExecutionEngine *
+MCJIT::createJIT(std::unique_ptr<Module> M, std::string *ErrorStr,
std::shared_ptr<MCJITMemoryManager> MemMgr,
- std::shared_ptr<JITSymbolResolver> Resolver,
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver,
std::unique_ptr<TargetMachine> TM) {
// Try to register the program as a source of symbols to resolve against.
//
@@ -64,7 +63,7 @@ MCJIT::createJIT(std::unique_ptr<Module> M,
MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> TM,
std::shared_ptr<MCJITMemoryManager> MemMgr,
- std::shared_ptr<JITSymbolResolver> Resolver)
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver)
: ExecutionEngine(TM->createDataLayout(), std::move(M)), TM(std::move(TM)),
Ctx(nullptr), MemMgr(std::move(MemMgr)),
Resolver(*this, std::move(Resolver)), Dyld(*this->MemMgr, this->Resolver),
@@ -143,8 +142,14 @@ void MCJIT::setObjectCache(ObjectCache* NewCache) {
}
std::unique_ptr<MemoryBuffer> MCJIT::emitObject(Module *M) {
+ assert(M && "Can not emit a null module");
+
MutexGuard locked(lock);
+ // Materialize all globals in the module if they have not been
+ // materialized already.
+ cantFail(M->materializeAll());
+
// This must be a module which has already been added but not loaded to this
// MCJIT instance, since these conditions are tested by our caller,
// generateCodeForModule.
@@ -165,7 +170,7 @@ std::unique_ptr<MemoryBuffer> MCJIT::emitObject(Module *M) {
// Flush the output buffer to get the generated code into memory
std::unique_ptr<MemoryBuffer> CompiledObjBuffer(
- new ObjectMemoryBuffer(std::move(ObjBufferSV)));
+ new SmallVectorMemoryBuffer(std::move(ObjBufferSV)));
// If we have an object cache, tell it about the new object.
// Note that we're using the compiled image, not the loaded image (as below).
@@ -666,3 +671,5 @@ LinkingSymbolResolver::findSymbol(const std::string &Name) {
return nullptr;
return ClientResolver->findSymbol(Name);
}
+
+void LinkingSymbolResolver::anchor() {}
diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
index daf578f5daae..943b14942a0f 100644
--- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -14,10 +14,10 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/ObjectCache.h"
-#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
namespace llvm {
class MCJIT;
@@ -26,11 +26,11 @@ class MCJIT;
// functions across modules that it owns. It aggregates the memory manager
// that is passed in to the MCJIT constructor and defers most functionality
// to that object.
-class LinkingSymbolResolver : public JITSymbolResolver {
+class LinkingSymbolResolver : public LegacyJITSymbolResolver {
public:
LinkingSymbolResolver(MCJIT &Parent,
- std::shared_ptr<JITSymbolResolver> Resolver)
- : ParentEngine(Parent), ClientResolver(std::move(Resolver)) {}
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver)
+ : ParentEngine(Parent), ClientResolver(std::move(Resolver)) {}
JITSymbol findSymbol(const std::string &Name) override;
@@ -41,7 +41,8 @@ public:
private:
MCJIT &ParentEngine;
- std::shared_ptr<JITSymbolResolver> ClientResolver;
+ std::shared_ptr<LegacyJITSymbolResolver> ClientResolver;
+ void anchor() override;
};
// About Module states: added->loaded->finalized.
@@ -67,7 +68,7 @@ private:
class MCJIT : public ExecutionEngine {
MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm,
std::shared_ptr<MCJITMemoryManager> MemMgr,
- std::shared_ptr<JITSymbolResolver> Resolver);
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver);
typedef llvm::SmallPtrSet<Module *, 4> ModulePtrSet;
@@ -300,11 +301,10 @@ public:
MCJITCtor = createJIT;
}
- static ExecutionEngine*
- createJIT(std::unique_ptr<Module> M,
- std::string *ErrorStr,
+ static ExecutionEngine *
+ createJIT(std::unique_ptr<Module> M, std::string *ErrorStr,
std::shared_ptr<MCJITMemoryManager> MemMgr,
- std::shared_ptr<JITSymbolResolver> Resolver,
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver,
std::unique_ptr<TargetMachine> TM);
// @}
diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/ObjectBuffer.h b/contrib/llvm/lib/ExecutionEngine/MCJIT/ObjectBuffer.h
deleted file mode 100644
index 92310f3eb54a..000000000000
--- a/contrib/llvm/lib/ExecutionEngine/MCJIT/ObjectBuffer.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===--- ObjectBuffer.h - Utility class to wrap object memory ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares a wrapper class to hold the memory into which an
-// object will be generated.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
-#define LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-class ObjectMemoryBuffer : public MemoryBuffer {
-public:
- template <unsigned N>
- ObjectMemoryBuffer(SmallVector<char, N> SV)
- : SV(SV), BufferName("<in-memory object>") {
- init(this->SV.begin(), this->SV.end(), false);
- }
-
- template <unsigned N>
- ObjectMemoryBuffer(SmallVector<char, N> SV, StringRef Name)
- : SV(SV), BufferName(Name) {
- init(this->SV.begin(), this->SV.end(), false);
- }
- const char* getBufferIdentifier() const override { return BufferName.c_str(); }
-
- BufferKind getBufferKind() const override { return MemoryBuffer_Malloc; }
-
-private:
- SmallVector<char, 4096> SV;
- std::string BufferName;
-};
-
-} // namespace llvm
-
-#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 3581d6458395..6f0825fb38da 100644
--- a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -12,8 +12,10 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm-c/ExecutionEngine.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Config/config.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/OProfileWrapper.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
@@ -57,9 +59,10 @@ public:
void OProfileJITEventListener::initialize() {
if (!Wrapper->op_open_agent()) {
const std::string err_str = sys::StrError();
- DEBUG(dbgs() << "Failed to connect to OProfile agent: " << err_str << "\n");
+ LLVM_DEBUG(dbgs() << "Failed to connect to OProfile agent: " << err_str
+ << "\n");
} else {
- DEBUG(dbgs() << "Connected to OProfile agent.\n");
+ LLVM_DEBUG(dbgs() << "Connected to OProfile agent.\n");
}
}
@@ -67,10 +70,10 @@ OProfileJITEventListener::~OProfileJITEventListener() {
if (Wrapper->isAgentAvailable()) {
if (Wrapper->op_close_agent() == -1) {
const std::string err_str = sys::StrError();
- DEBUG(dbgs() << "Failed to disconnect from OProfile agent: "
- << err_str << "\n");
+ LLVM_DEBUG(dbgs() << "Failed to disconnect from OProfile agent: "
+ << err_str << "\n");
} else {
- DEBUG(dbgs() << "Disconnected from OProfile agent.\n");
+ LLVM_DEBUG(dbgs() << "Disconnected from OProfile agent.\n");
}
}
}
@@ -84,6 +87,7 @@ void OProfileJITEventListener::NotifyObjectEmitted(
OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
const ObjectFile &DebugObj = *DebugObjOwner.getBinary();
+ std::unique_ptr<DIContext> Context = DWARFContext::create(DebugObj);
// Use symbol info to iterate functions in the object.
for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
@@ -103,12 +107,34 @@ void OProfileJITEventListener::NotifyObjectEmitted(
if (Wrapper->op_write_native_code(Name.data(), Addr, (void *)Addr, Size) ==
-1) {
- DEBUG(dbgs() << "Failed to tell OProfile about native function " << Name
- << " at [" << (void *)Addr << "-" << ((char *)Addr + Size)
- << "]\n");
+ LLVM_DEBUG(dbgs() << "Failed to tell OProfile about native function "
+ << Name << " at [" << (void *)Addr << "-"
+ << ((char *)Addr + Size) << "]\n");
+ continue;
+ }
+
+ DILineInfoTable Lines = Context->getLineInfoForAddressRange(Addr, Size);
+ size_t i = 0;
+ size_t num_entries = Lines.size();
+ struct debug_line_info *debug_line;
+ debug_line = (struct debug_line_info *)calloc(
+ num_entries, sizeof(struct debug_line_info));
+
+ for (auto& It : Lines) {
+ debug_line[i].vma = (unsigned long)It.first;
+ debug_line[i].lineno = It.second.Line;
+ debug_line[i].filename =
+ const_cast<char *>(Lines.front().second.FileName.c_str());
+ ++i;
+ }
+
+ if (Wrapper->op_write_debug_line_info((void *)Addr, num_entries,
+ debug_line) == -1) {
+ LLVM_DEBUG(dbgs() << "Failed to tell OProfiler about debug object at ["
+ << (void *)Addr << "-" << ((char *)Addr + Size)
+ << "]\n");
continue;
}
- // TODO: support line number info (similar to IntelJITEventListener.cpp)
}
DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner);
@@ -135,9 +161,10 @@ void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
uint64_t Addr = *AddrOrErr;
if (Wrapper->op_unload_native_code(Addr) == -1) {
- DEBUG(dbgs()
- << "Failed to tell OProfile about unload of native function at "
- << (void*)Addr << "\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Failed to tell OProfile about unload of native function at "
+ << (void *)Addr << "\n");
continue;
}
}
@@ -156,3 +183,7 @@ JITEventListener *JITEventListener::createOProfileJITEventListener() {
} // namespace llvm
+LLVMJITEventListenerRef LLVMCreateOProfileJITEventListener(void)
+{
+ return wrap(JITEventListener::createOProfileJITEventListener());
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
index d96278a8137b..b473ac3faf4c 100644
--- a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
@@ -64,15 +64,16 @@ bool OProfileWrapper::initialize() {
// If the oprofile daemon is not running, don't load the opagent library
if (!isOProfileRunning()) {
- DEBUG(dbgs() << "OProfile daemon is not detected.\n");
+ LLVM_DEBUG(dbgs() << "OProfile daemon is not detected.\n");
return false;
}
std::string error;
if(!DynamicLibrary::LoadLibraryPermanently("libopagent.so", &error)) {
- DEBUG(dbgs()
- << "OProfile connector library libopagent.so could not be loaded: "
- << error << "\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "OProfile connector library libopagent.so could not be loaded: "
+ << error << "\n");
}
// Get the addresses of the opagent functions
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
new file mode 100644
index 000000000000..d42e7b05ba67
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -0,0 +1,343 @@
+//===----- CompileOnDemandLayer.cpp - Lazily emit IR on first call --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace {
+
+template <typename MaterializerFtor>
+class LambdaValueMaterializer final : public ValueMaterializer {
+public:
+ LambdaValueMaterializer(MaterializerFtor M) : M(std::move(M)) {}
+
+ Value *materialize(Value *V) final { return M(V); }
+
+private:
+ MaterializerFtor M;
+};
+
+template <typename MaterializerFtor>
+LambdaValueMaterializer<MaterializerFtor>
+createLambdaValueMaterializer(MaterializerFtor M) {
+ return LambdaValueMaterializer<MaterializerFtor>(std::move(M));
+}
+} // namespace
+
+static void extractAliases(MaterializationResponsibility &R, Module &M,
+ MangleAndInterner &Mangle) {
+ SymbolAliasMap Aliases;
+
+ std::vector<GlobalAlias *> ModAliases;
+ for (auto &A : M.aliases())
+ ModAliases.push_back(&A);
+
+ for (auto *A : ModAliases) {
+ Constant *Aliasee = A->getAliasee();
+ assert(A->hasName() && "Anonymous alias?");
+ assert(Aliasee->hasName() && "Anonymous aliasee");
+ std::string AliasName = A->getName();
+
+ Aliases[Mangle(AliasName)] = SymbolAliasMapEntry(
+ {Mangle(Aliasee->getName()), JITSymbolFlags::fromGlobalValue(*A)});
+
+ if (isa<Function>(Aliasee)) {
+ auto *F = cloneFunctionDecl(M, *cast<Function>(Aliasee));
+ A->replaceAllUsesWith(F);
+ A->eraseFromParent();
+ F->setName(AliasName);
+ } else if (isa<GlobalValue>(Aliasee)) {
+ auto *G = cloneGlobalVariableDecl(M, *cast<GlobalVariable>(Aliasee));
+ A->replaceAllUsesWith(G);
+ A->eraseFromParent();
+ G->setName(AliasName);
+ }
+ }
+
+ R.replace(symbolAliases(std::move(Aliases)));
+}
+
+static std::unique_ptr<Module>
+extractAndClone(Module &M, LLVMContext &NewContext, StringRef Suffix,
+ function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
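+ // Clone the selected definitions, then round-trip the clone through bitcode
+ // so that the result can be parsed back into NewContext (CloneModule always
+ // clones into the source module's own context).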
+ SmallVector<char, 1> ClonedModuleBuffer;
+
+ {
+ std::set<GlobalValue *> ClonedDefsInSrc;
+ ValueToValueMapTy VMap;
+ auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) {
+ if (ShouldCloneDefinition(GV)) {
+ ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV));
+ return true;
+ }
+ return false;
+ });
+
+ for (auto *GV : ClonedDefsInSrc) {
+ // Delete the definition and bump the linkage in the source module.
+ if (isa<Function>(GV)) {
+ auto &F = *cast<Function>(GV);
+ F.deleteBody();
+ F.setPersonalityFn(nullptr);
+ } else if (isa<GlobalVariable>(GV)) {
+ cast<GlobalVariable>(GV)->setInitializer(nullptr);
+ } else
+ llvm_unreachable("Unsupported global type");
+
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ }
+
+ BitcodeWriter BCWriter(ClonedModuleBuffer);
+
+ BCWriter.writeModule(*Tmp);
+ BCWriter.writeSymtab();
+ BCWriter.writeStrtab();
+ }
+
+ MemoryBufferRef ClonedModuleBufferRef(
+ StringRef(ClonedModuleBuffer.data(), ClonedModuleBuffer.size()),
+ "cloned module buffer");
+
+ auto ClonedModule =
+ cantFail(parseBitcodeFile(ClonedModuleBufferRef, NewContext));
+ ClonedModule->setModuleIdentifier((M.getName() + Suffix).str());
+ return ClonedModule;
+}
+
+static std::unique_ptr<Module> extractGlobals(Module &M,
+ LLVMContext &NewContext) {
+ return extractAndClone(M, NewContext, ".globals", [](const GlobalValue *GV) {
+ return isa<GlobalVariable>(GV);
+ });
+}
+
+namespace llvm {
+namespace orc {
+
+class ExtractingIRMaterializationUnit : public IRMaterializationUnit {
+public:
+ ExtractingIRMaterializationUnit(ExecutionSession &ES,
+ CompileOnDemandLayer2 &Parent,
+ std::unique_ptr<Module> M)
+ : IRMaterializationUnit(ES, std::move(M)), Parent(Parent) {}
+
+ ExtractingIRMaterializationUnit(std::unique_ptr<Module> M,
+ SymbolFlagsMap SymbolFlags,
+ SymbolNameToDefinitionMap SymbolToDefinition,
+ CompileOnDemandLayer2 &Parent)
+ : IRMaterializationUnit(std::move(M), std::move(SymbolFlags),
+ std::move(SymbolToDefinition)),
+ Parent(Parent) {}
+
+private:
+ void materialize(MaterializationResponsibility R) override {
+ // FIXME: Need a 'notify lazy-extracting/emitting' callback to tie the
+ // extracted module key, extracted module, and source module key
+ // together. This could be used, for example, to provide a specific
+ // memory manager instance to the linking layer.
+
+ auto RequestedSymbols = R.getRequestedSymbols();
+
+ // Extract the requested functions into a new module.
+ std::unique_ptr<Module> ExtractedFunctionsModule;
+ if (!RequestedSymbols.empty()) {
+ std::string Suffix;
+ std::set<const GlobalValue *> FunctionsToClone;
+ for (auto &Name : RequestedSymbols) {
+ auto I = SymbolToDefinition.find(Name);
+ assert(I != SymbolToDefinition.end() && I->second != nullptr &&
+ "Should have a non-null definition");
+ FunctionsToClone.insert(I->second);
+ Suffix += ".";
+ Suffix += *Name;
+ }
+
+ std::lock_guard<std::mutex> Lock(SourceModuleMutex);
+ ExtractedFunctionsModule =
+ extractAndClone(*M, Parent.GetAvailableContext(), Suffix,
+ [&](const GlobalValue *GV) -> bool {
+ return FunctionsToClone.count(GV);
+ });
+ }
+
+ // Build a new ExtractingIRMaterializationUnit to delegate the unrequested
+ // symbols to.
+ SymbolFlagsMap DelegatedSymbolFlags;
+ IRMaterializationUnit::SymbolNameToDefinitionMap
+ DelegatedSymbolToDefinition;
+ for (auto &KV : SymbolToDefinition) {
+ if (RequestedSymbols.count(KV.first))
+ continue;
+ DelegatedSymbolFlags[KV.first] =
+ JITSymbolFlags::fromGlobalValue(*KV.second);
+ DelegatedSymbolToDefinition[KV.first] = KV.second;
+ }
+
+ if (!DelegatedSymbolFlags.empty()) {
+ assert(DelegatedSymbolFlags.size() ==
+ DelegatedSymbolToDefinition.size() &&
+ "SymbolFlags and SymbolToDefinition should have the same number "
+ "of entries");
+ R.replace(llvm::make_unique<ExtractingIRMaterializationUnit>(
+ std::move(M), std::move(DelegatedSymbolFlags),
+ std::move(DelegatedSymbolToDefinition), Parent));
+ }
+
+ if (ExtractedFunctionsModule)
+ Parent.emitExtractedFunctionsModule(std::move(R),
+ std::move(ExtractedFunctionsModule));
+ }
+
+ void discard(const VSO &V, SymbolStringPtr Name) override {
+ // All original symbols were materialized by the CODLayer and should be
+ // final. The function bodies provided by M should never be overridden.
+ llvm_unreachable("Discard should never be called on an "
+ "ExtractingIRMaterializationUnit");
+ }
+
+ mutable std::mutex SourceModuleMutex;
+ CompileOnDemandLayer2 &Parent;
+};
+
+CompileOnDemandLayer2::CompileOnDemandLayer2(
+ ExecutionSession &ES, IRLayer &BaseLayer, JITCompileCallbackManager &CCMgr,
+ IndirectStubsManagerBuilder BuildIndirectStubsManager,
+ GetAvailableContextFunction GetAvailableContext)
+ : IRLayer(ES), BaseLayer(BaseLayer), CCMgr(CCMgr),
+ BuildIndirectStubsManager(std::move(BuildIndirectStubsManager)),
+ GetAvailableContext(std::move(GetAvailableContext)) {}
+
+Error CompileOnDemandLayer2::add(VSO &V, VModuleKey K,
+ std::unique_ptr<Module> M) {
+ return IRLayer::add(V, K, std::move(M));
+}
+
+void CompileOnDemandLayer2::emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<Module> M) {
+ auto &ES = getExecutionSession();
+ assert(M && "M should not be null");
+
+ for (auto &GV : M->global_values())
+ if (GV.hasWeakLinkage())
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+
+ MangleAndInterner Mangle(ES, M->getDataLayout());
+
+ extractAliases(R, *M, Mangle);
+
+ auto GlobalsModule = extractGlobals(*M, GetAvailableContext());
+
+ // Delete the bodies of any available_externally functions, rename the
+ // rest, and build the compile callbacks.
+ std::map<SymbolStringPtr, std::pair<JITTargetAddress, JITSymbolFlags>>
+ StubCallbacksAndLinkages;
+ auto &TargetVSO = R.getTargetVSO();
+
+ for (auto &F : M->functions()) {
+ if (F.isDeclaration())
+ continue;
+
+ if (F.hasAvailableExternallyLinkage()) {
+ F.deleteBody();
+ F.setPersonalityFn(nullptr);
+ continue;
+ }
+
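+ // Rename the body to "<name>$body", emit an external stub declaration under
+ // the original name, and point existing uses at the stub. The compile
+ // callback created below resolves the stub to the body on first call.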
+ assert(F.hasName() && "Function should have a name");
+ std::string StubUnmangledName = F.getName();
+ F.setName(F.getName() + "$body");
+ auto StubDecl = cloneFunctionDecl(*M, F);
+ StubDecl->setName(StubUnmangledName);
+ StubDecl->setPersonalityFn(nullptr);
+ StubDecl->setLinkage(GlobalValue::ExternalLinkage);
+ F.replaceAllUsesWith(StubDecl);
+
+ auto StubName = Mangle(StubUnmangledName);
+ auto BodyName = Mangle(F.getName());
+ if (auto CallbackAddr = CCMgr.getCompileCallback(
+ [BodyName, &TargetVSO, &ES]() -> JITTargetAddress {
+ if (auto Sym = lookup({&TargetVSO}, BodyName))
+ return Sym->getAddress();
+ else {
+ ES.reportError(Sym.takeError());
+ return 0;
+ }
+ })) {
+ auto Flags = JITSymbolFlags::fromGlobalValue(F);
+ Flags &= ~JITSymbolFlags::Weak;
+ StubCallbacksAndLinkages[std::move(StubName)] =
+ std::make_pair(*CallbackAddr, Flags);
+ } else {
+ ES.reportError(CallbackAddr.takeError());
+ R.failMaterialization();
+ return;
+ }
+ }
+
+ // Build the stub inits map.
+ IndirectStubsManager::StubInitsMap StubInits;
+ for (auto &KV : StubCallbacksAndLinkages)
+ StubInits[*KV.first] = KV.second;
+
+ // Build the function-body-extracting materialization unit.
+ if (auto Err = R.getTargetVSO().define(
+ llvm::make_unique<ExtractingIRMaterializationUnit>(ES, *this,
+ std::move(M)))) {
+ ES.reportError(std::move(Err));
+ R.failMaterialization();
+ return;
+ }
+
+ // Build the stubs.
+ // FIXME: Remove function bodies materialization unit if stub creation fails.
+ auto &StubsMgr = getStubsManager(TargetVSO);
+ if (auto Err = StubsMgr.createStubs(StubInits)) {
+ ES.reportError(std::move(Err));
+ R.failMaterialization();
+ return;
+ }
+
+ // Resolve and finalize stubs.
+ SymbolMap ResolvedStubs;
+ for (auto &KV : StubCallbacksAndLinkages) {
+ if (auto Sym = StubsMgr.findStub(*KV.first, false))
+ ResolvedStubs[KV.first] = Sym;
+ else
+ llvm_unreachable("Stub went missing");
+ }
+
+ R.resolve(ResolvedStubs);
+
+ BaseLayer.emit(std::move(R), std::move(K), std::move(GlobalsModule));
+}
+
+IndirectStubsManager &CompileOnDemandLayer2::getStubsManager(const VSO &V) {
+ std::lock_guard<std::mutex> Lock(CODLayerMutex);
+ StubManagersMap::iterator I = StubsMgrs.find(&V);
+ if (I == StubsMgrs.end())
+ I = StubsMgrs.insert(std::make_pair(&V, BuildIndirectStubsManager())).first;
+ return *I->second;
+}
+
+void CompileOnDemandLayer2::emitExtractedFunctionsModule(
+ MaterializationResponsibility R, std::unique_ptr<Module> M) {
+ auto K = getExecutionSession().allocateVModule();
+ BaseLayer.emit(std::move(R), std::move(K), std::move(M));
+}
+
+} // end namespace orc
+} // end namespace llvm
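
The layer above renames each requested function body to "<name>$body", defines
an externally visible stub under the original name, and installs a compile
callback that looks the body up on first call. A minimal, self-contained sketch
of that stub-plus-callback pattern (illustrative only; LazyStub and its members
are hypothetical names, not ORC API):

    #include <atomic>
    #include <cstdint>
    #include <functional>
    #include <utility>

    using TargetAddress = std::uint64_t;

    // One stub per extracted function: it starts out pointing at a compile
    // callback and caches the compiled body's address after the first call.
    class LazyStub {
    public:
      explicit LazyStub(std::function<TargetAddress()> CompileBody)
          : CompileBody(std::move(CompileBody)) {}

      TargetAddress resolve() {
        TargetAddress A = Body.load(std::memory_order_acquire);
        if (A == 0) {
          A = CompileBody(); // materialize and look up "<name>$body"
          Body.store(A, std::memory_order_release);
        }
        return A; // callers branch to this address from now on
      }

    private:
      std::atomic<TargetAddress> Body{0};
      std::function<TargetAddress()> CompileBody;
    };
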
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/Core.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/Core.cpp
new file mode 100644
index 000000000000..4325d57f73d0
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -0,0 +1,1690 @@
+//===----- Core.cpp - Core ORC APIs (MaterializationUnit, VSO, etc.) ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ExecutionEngine/Orc/OrcError.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+
+#if LLVM_ENABLE_THREADS
+#include <future>
+#endif
+
+namespace llvm {
+namespace orc {
+
+char FailedToMaterialize::ID = 0;
+char SymbolsNotFound::ID = 0;
+
+RegisterDependenciesFunction NoDependenciesToRegister =
+ RegisterDependenciesFunction();
+
+void MaterializationUnit::anchor() {}
+
+raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) {
+ if (Flags.isWeak())
+ OS << 'W';
+ else if (Flags.isCommon())
+ OS << 'C';
+ else
+ OS << 'S';
+
+ if (Flags.isExported())
+ OS << 'E';
+ else
+ OS << 'H';
+
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) {
+ OS << format("0x%016x", Sym.getAddress()) << " " << Sym.getFlags();
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) {
+ OS << "\"" << *KV.first << "\": " << KV.second;
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) {
+ OS << "{";
+ if (!Symbols.empty()) {
+ OS << " \"" << **Symbols.begin() << "\"";
+ for (auto &Sym : make_range(std::next(Symbols.begin()), Symbols.end()))
+ OS << ", \"" << *Sym << "\"";
+ }
+ OS << " }";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) {
+ OS << "{";
+ if (!Symbols.empty()) {
+ OS << " {" << *Symbols.begin() << "}";
+ for (auto &Sym : make_range(std::next(Symbols.begin()), Symbols.end()))
+ OS << ", {" << Sym << "}";
+ }
+ OS << " }";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) {
+ OS << "{";
+ if (!SymbolFlags.empty()) {
+ OS << " {\"" << *SymbolFlags.begin()->first
+ << "\": " << SymbolFlags.begin()->second << "}";
+ for (auto &KV :
+ make_range(std::next(SymbolFlags.begin()), SymbolFlags.end()))
+ OS << ", {\"" << *KV.first << "\": " << KV.second << "}";
+ }
+ OS << " }";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) {
+ OS << "{";
+ if (!Deps.empty()) {
+ OS << " { " << Deps.begin()->first->getName() << ": "
+ << Deps.begin()->second << " }";
+ for (auto &KV : make_range(std::next(Deps.begin()), Deps.end()))
+ OS << ", { " << KV.first->getName() << ": " << KV.second << " }";
+ }
+ OS << " }";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const VSOList &VSOs) {
+ OS << "[";
+ if (!VSOs.empty()) {
+ assert(VSOs.front() && "VSOList entries must not be null");
+ OS << " " << VSOs.front()->getName();
+ for (auto *V : make_range(std::next(VSOs.begin()), VSOs.end())) {
+ assert(V && "VSOList entries must not be null");
+ OS << ", " << V->getName();
+ }
+ }
+ OS << " ]";
+ return OS;
+}
+
+FailedToMaterialize::FailedToMaterialize(SymbolNameSet Symbols)
+ : Symbols(std::move(Symbols)) {
+ assert(!this->Symbols.empty() && "Can not fail to resolve an empty set");
+}
+
+std::error_code FailedToMaterialize::convertToErrorCode() const {
+ return orcError(OrcErrorCode::UnknownORCError);
+}
+
+void FailedToMaterialize::log(raw_ostream &OS) const {
+ OS << "Failed to materialize symbols: " << Symbols;
+}
+
+SymbolsNotFound::SymbolsNotFound(SymbolNameSet Symbols)
+ : Symbols(std::move(Symbols)) {
+ assert(!this->Symbols.empty() && "Can not fail to resolve an empty set");
+}
+
+std::error_code SymbolsNotFound::convertToErrorCode() const {
+ return orcError(OrcErrorCode::UnknownORCError);
+}
+
+void SymbolsNotFound::log(raw_ostream &OS) const {
+ OS << "Symbols not found: " << Symbols;
+}
+
+void ExecutionSessionBase::legacyFailQuery(AsynchronousSymbolQuery &Q,
+ Error Err) {
+ assert(!!Err && "Error should be in failure state");
+
+ bool SendErrorToQuery;
+ runSessionLocked([&]() {
+ Q.detach();
+ SendErrorToQuery = Q.canStillFail();
+ });
+
+ if (SendErrorToQuery)
+ Q.handleFailed(std::move(Err));
+ else
+ reportError(std::move(Err));
+}
+
+Expected<SymbolMap> ExecutionSessionBase::legacyLookup(
+ ExecutionSessionBase &ES, LegacyAsyncLookupFunction AsyncLookup,
+ SymbolNameSet Names, bool WaitUntilReady,
+ RegisterDependenciesFunction RegisterDependencies) {
+#if LLVM_ENABLE_THREADS
+ // In the threaded case we use promises to return the results.
+ std::promise<SymbolMap> PromisedResult;
+ std::mutex ErrMutex;
+ Error ResolutionError = Error::success();
+ std::promise<void> PromisedReady;
+ Error ReadyError = Error::success();
+ auto OnResolve = [&](Expected<SymbolMap> R) {
+ if (R)
+ PromisedResult.set_value(std::move(*R));
+ else {
+ {
+ ErrorAsOutParameter _(&ResolutionError);
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ ResolutionError = R.takeError();
+ }
+ PromisedResult.set_value(SymbolMap());
+ }
+ };
+
+ std::function<void(Error)> OnReady;
+ if (WaitUntilReady) {
+ OnReady = [&](Error Err) {
+ if (Err) {
+ ErrorAsOutParameter _(&ReadyError);
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ ReadyError = std::move(Err);
+ }
+ PromisedReady.set_value();
+ };
+ } else {
+ OnReady = [&](Error Err) {
+ if (Err)
+ ES.reportError(std::move(Err));
+ };
+ }
+
+#else
+ SymbolMap Result;
+ Error ResolutionError = Error::success();
+ Error ReadyError = Error::success();
+
+ auto OnResolve = [&](Expected<SymbolMap> R) {
+ ErrorAsOutParameter _(&ResolutionError);
+ if (R)
+ Result = std::move(*R);
+ else
+ ResolutionError = R.takeError();
+ };
+
+ std::function<void(Error)> OnReady;
+ if (WaitUntilReady) {
+ OnReady = [&](Error Err) {
+ ErrorAsOutParameter _(&ReadyError);
+ if (Err)
+ ReadyError = std::move(Err);
+ };
+ } else {
+ OnReady = [&](Error Err) {
+ if (Err)
+ ES.reportError(std::move(Err));
+ };
+ }
+#endif
+
+ auto Query = std::make_shared<AsynchronousSymbolQuery>(
+ Names, std::move(OnResolve), std::move(OnReady));
+ // FIXME: This should be run session locked along with the registration code
+ // and error reporting below.
+ SymbolNameSet UnresolvedSymbols = AsyncLookup(Query, std::move(Names));
+
+ // If the query was lodged successfully then register the dependencies,
+ // otherwise fail it with an error.
+ if (UnresolvedSymbols.empty())
+ RegisterDependencies(Query->QueryRegistrations);
+ else {
+ bool DeliverError = runSessionLocked([&]() {
+ Query->detach();
+ return Query->canStillFail();
+ });
+ auto Err = make_error<SymbolsNotFound>(std::move(UnresolvedSymbols));
+ if (DeliverError)
+ Query->handleFailed(std::move(Err));
+ else
+ ES.reportError(std::move(Err));
+ }
+
+#if LLVM_ENABLE_THREADS
+ auto ResultFuture = PromisedResult.get_future();
+ auto Result = ResultFuture.get();
+
+ {
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ if (ResolutionError) {
+ // ReadyError will never be assigned. Consume the success value.
+ cantFail(std::move(ReadyError));
+ return std::move(ResolutionError);
+ }
+ }
+
+ if (WaitUntilReady) {
+ auto ReadyFuture = PromisedReady.get_future();
+ ReadyFuture.get();
+
+ {
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ if (ReadyError)
+ return std::move(ReadyError);
+ }
+ } else
+ cantFail(std::move(ReadyError));
+
+ return std::move(Result);
+
+#else
+ if (ResolutionError) {
+ // ReadyError will never be assigned. Consume the success value.
+ cantFail(std::move(ReadyError));
+ return std::move(ResolutionError);
+ }
+
+ if (ReadyError)
+ return std::move(ReadyError);
+
+ return Result;
+#endif
+}
+
+void ExecutionSessionBase::lookup(
+ const VSOList &VSOs, const SymbolNameSet &Symbols,
+ SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
+ RegisterDependenciesFunction RegisterDependencies) {
+
+ // lookup can be re-entered recursively if running on a single thread. Run any
+ // outstanding MUs in case this query depends on them; otherwise the main
+ // thread will starve waiting for a result from an MU that it failed to run.
+ runOutstandingMUs();
+
+ auto Unresolved = std::move(Symbols);
+ std::map<VSO *, MaterializationUnitList> MUsMap;
+ auto Q = std::make_shared<AsynchronousSymbolQuery>(
+ Symbols, std::move(OnResolve), std::move(OnReady));
+ bool QueryIsFullyResolved = false;
+ bool QueryIsFullyReady = false;
+ bool QueryFailed = false;
+
+ runSessionLocked([&]() {
+ for (auto *V : VSOs) {
+ assert(V && "VSOList entries must not be null");
+ assert(!MUsMap.count(V) &&
+ "VSOList should not contain duplicate entries");
+ V->lodgeQuery(Q, Unresolved, MUsMap[V]);
+ }
+
+ if (Unresolved.empty()) {
+ // Query lodged successfully.
+
+ // Record whether this query is fully ready / resolved. We will use
+ // this to call handleFullyResolved/handleFullyReady outside the session
+ // lock.
+ QueryIsFullyResolved = Q->isFullyResolved();
+ QueryIsFullyReady = Q->isFullyReady();
+
+ // Call the register dependencies function.
+ if (RegisterDependencies && !Q->QueryRegistrations.empty())
+ RegisterDependencies(Q->QueryRegistrations);
+ } else {
+ // Query failed due to unresolved symbols.
+ QueryFailed = true;
+
+ // Disconnect the query from its dependencies.
+ Q->detach();
+
+ // Replace the MUs.
+ for (auto &KV : MUsMap)
+ for (auto &MU : KV.second)
+ KV.first->replace(std::move(MU));
+ }
+ });
+
+ if (QueryFailed) {
+ Q->handleFailed(make_error<SymbolsNotFound>(std::move(Unresolved)));
+ return;
+ } else {
+ if (QueryIsFullyResolved)
+ Q->handleFullyResolved();
+ if (QueryIsFullyReady)
+ Q->handleFullyReady();
+ }
+
+ // Move the MUs to the OutstandingMUs list, then materialize.
+ {
+ std::lock_guard<std::recursive_mutex> Lock(OutstandingMUsMutex);
+
+ for (auto &KV : MUsMap)
+ for (auto &MU : KV.second)
+ OutstandingMUs.push_back(std::make_pair(KV.first, std::move(MU)));
+ }
+
+ runOutstandingMUs();
+}
+
+Expected<SymbolMap>
+ExecutionSessionBase::lookup(const VSOList &VSOs, const SymbolNameSet &Symbols,
+ RegisterDependenciesFunction RegisterDependencies,
+ bool WaitUntilReady) {
+#if LLVM_ENABLE_THREADS
+ // In the threaded case we use promises to return the results.
+ std::promise<SymbolMap> PromisedResult;
+ std::mutex ErrMutex;
+ Error ResolutionError = Error::success();
+ std::promise<void> PromisedReady;
+ Error ReadyError = Error::success();
+ auto OnResolve = [&](Expected<SymbolMap> R) {
+ if (R)
+ PromisedResult.set_value(std::move(*R));
+ else {
+ {
+ ErrorAsOutParameter _(&ResolutionError);
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ ResolutionError = R.takeError();
+ }
+ PromisedResult.set_value(SymbolMap());
+ }
+ };
+
+ std::function<void(Error)> OnReady;
+ if (WaitUntilReady) {
+ OnReady = [&](Error Err) {
+ if (Err) {
+ ErrorAsOutParameter _(&ReadyError);
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ ReadyError = std::move(Err);
+ }
+ PromisedReady.set_value();
+ };
+ } else {
+ OnReady = [&](Error Err) {
+ if (Err)
+ reportError(std::move(Err));
+ };
+ }
+
+#else
+ SymbolMap Result;
+ Error ResolutionError = Error::success();
+ Error ReadyError = Error::success();
+
+ auto OnResolve = [&](Expected<SymbolMap> R) {
+ ErrorAsOutParameter _(&ResolutionError);
+ if (R)
+ Result = std::move(*R);
+ else
+ ResolutionError = R.takeError();
+ };
+
+ std::function<void(Error)> OnReady;
+ if (WaitUntilReady) {
+ OnReady = [&](Error Err) {
+ ErrorAsOutParameter _(&ReadyError);
+ if (Err)
+ ReadyError = std::move(Err);
+ };
+ } else {
+ OnReady = [&](Error Err) {
+ if (Err)
+ reportError(std::move(Err));
+ };
+ }
+#endif
+
+ // Perform the asynchronous lookup.
+ lookup(VSOs, Symbols, OnResolve, OnReady, RegisterDependencies);
+
+#if LLVM_ENABLE_THREADS
+ auto ResultFuture = PromisedResult.get_future();
+ auto Result = ResultFuture.get();
+
+ {
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ if (ResolutionError) {
+ // ReadyError will never be assigned. Consume the success value.
+ cantFail(std::move(ReadyError));
+ return std::move(ResolutionError);
+ }
+ }
+
+ if (WaitUntilReady) {
+ auto ReadyFuture = PromisedReady.get_future();
+ ReadyFuture.get();
+
+ {
+ std::lock_guard<std::mutex> Lock(ErrMutex);
+ if (ReadyError)
+ return std::move(ReadyError);
+ }
+ } else
+ cantFail(std::move(ReadyError));
+
+ return std::move(Result);
+
+#else
+ if (ResolutionError) {
+ // ReadyError will never be assigned. Consume the success value.
+ cantFail(std::move(ReadyError));
+ return std::move(ResolutionError);
+ }
+
+ if (ReadyError)
+ return std::move(ReadyError);
+
+ return Result;
+#endif
+}
+
+void ExecutionSessionBase::runOutstandingMUs() {
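+ // Drain the queue of (VSO, MaterializationUnit) pairs. The lock is held only
+ // while popping an entry; materialization itself is dispatched outside the
+ // lock, so dispatched work may re-enter and enqueue further MUs.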
+ while (1) {
+ std::pair<VSO *, std::unique_ptr<MaterializationUnit>> VSOAndMU;
+
+ {
+ std::lock_guard<std::recursive_mutex> Lock(OutstandingMUsMutex);
+ if (!OutstandingMUs.empty()) {
+ VSOAndMU = std::move(OutstandingMUs.back());
+ OutstandingMUs.pop_back();
+ }
+ }
+
+ if (VSOAndMU.first) {
+ assert(VSOAndMU.second && "VSO, but no MU?");
+ dispatchMaterialization(*VSOAndMU.first, std::move(VSOAndMU.second));
+ } else
+ break;
+ }
+}
+
+AsynchronousSymbolQuery::AsynchronousSymbolQuery(
+ const SymbolNameSet &Symbols, SymbolsResolvedCallback NotifySymbolsResolved,
+ SymbolsReadyCallback NotifySymbolsReady)
+ : NotifySymbolsResolved(std::move(NotifySymbolsResolved)),
+ NotifySymbolsReady(std::move(NotifySymbolsReady)) {
+ NotYetResolvedCount = NotYetReadyCount = Symbols.size();
+
+ for (auto &S : Symbols)
+ ResolvedSymbols[S] = nullptr;
+}
+
+void AsynchronousSymbolQuery::resolve(const SymbolStringPtr &Name,
+ JITEvaluatedSymbol Sym) {
+ auto I = ResolvedSymbols.find(Name);
+ assert(I != ResolvedSymbols.end() &&
+ "Resolving symbol outside the requested set");
+ assert(I->second.getAddress() == 0 && "Redundantly resolving symbol Name");
+ I->second = std::move(Sym);
+ --NotYetResolvedCount;
+}
+
+void AsynchronousSymbolQuery::handleFullyResolved() {
+ assert(NotYetResolvedCount == 0 && "Not fully resolved?");
+ assert(NotifySymbolsResolved &&
+ "NotifySymbolsResolved already called or error occurred");
+ NotifySymbolsResolved(std::move(ResolvedSymbols));
+ NotifySymbolsResolved = SymbolsResolvedCallback();
+}
+
+void AsynchronousSymbolQuery::notifySymbolReady() {
+ assert(NotYetReadyCount != 0 && "All symbols already finalized");
+ --NotYetReadyCount;
+}
+
+void AsynchronousSymbolQuery::handleFullyReady() {
+ assert(QueryRegistrations.empty() &&
+ "Query is still registered with some symbols");
+ assert(!NotifySymbolsResolved && "Resolution not applied yet");
+ NotifySymbolsReady(Error::success());
+ NotifySymbolsReady = SymbolsReadyCallback();
+}
+
+bool AsynchronousSymbolQuery::canStillFail() {
+ return (NotifySymbolsResolved || NotifySymbolsReady);
+}
+
+void AsynchronousSymbolQuery::handleFailed(Error Err) {
+ assert(QueryRegistrations.empty() && ResolvedSymbols.empty() &&
+ NotYetResolvedCount == 0 && NotYetReadyCount == 0 &&
+ "Query should already have been abandoned");
+ if (NotifySymbolsResolved) {
+ NotifySymbolsResolved(std::move(Err));
+ NotifySymbolsResolved = SymbolsResolvedCallback();
+ } else {
+ assert(NotifySymbolsReady && "Failed after both callbacks issued?");
+ NotifySymbolsReady(std::move(Err));
+ }
+ NotifySymbolsReady = SymbolsReadyCallback();
+}
+
+void AsynchronousSymbolQuery::addQueryDependence(VSO &V, SymbolStringPtr Name) {
+ bool Added = QueryRegistrations[&V].insert(std::move(Name)).second;
+ (void)Added;
+ assert(Added && "Duplicate dependence notification?");
+}
+
+void AsynchronousSymbolQuery::removeQueryDependence(
+ VSO &V, const SymbolStringPtr &Name) {
+ auto QRI = QueryRegistrations.find(&V);
+ assert(QRI != QueryRegistrations.end() && "No dependencies registered for V");
+ assert(QRI->second.count(Name) && "No dependency on Name in V");
+ QRI->second.erase(Name);
+ if (QRI->second.empty())
+ QueryRegistrations.erase(QRI);
+}
+
+void AsynchronousSymbolQuery::detach() {
+ ResolvedSymbols.clear();
+ NotYetResolvedCount = 0;
+ NotYetReadyCount = 0;
+ for (auto &KV : QueryRegistrations)
+ KV.first->detachQueryHelper(*this, KV.second);
+ QueryRegistrations.clear();
+}
+
+MaterializationResponsibility::MaterializationResponsibility(
+ VSO &V, SymbolFlagsMap SymbolFlags)
+ : V(V), SymbolFlags(std::move(SymbolFlags)) {
+ assert(!this->SymbolFlags.empty() && "Materializing nothing?");
+
+#ifndef NDEBUG
+ for (auto &KV : this->SymbolFlags)
+ KV.second |= JITSymbolFlags::Materializing;
+#endif
+}
+
+MaterializationResponsibility::~MaterializationResponsibility() {
+ assert(SymbolFlags.empty() &&
+ "All symbols should have been explicitly materialized or failed");
+}
+
+SymbolNameSet MaterializationResponsibility::getRequestedSymbols() {
+ return V.getRequestedSymbols(SymbolFlags);
+}
+
+void MaterializationResponsibility::resolve(const SymbolMap &Symbols) {
+#ifndef NDEBUG
+ for (auto &KV : Symbols) {
+ auto I = SymbolFlags.find(KV.first);
+ assert(I != SymbolFlags.end() &&
+ "Resolving symbol outside this responsibility set");
+ assert(I->second.isMaterializing() && "Duplicate resolution");
+ I->second &= ~JITSymbolFlags::Materializing;
+ if (I->second.isWeak())
+ assert(I->second == (KV.second.getFlags() | JITSymbolFlags::Weak) &&
+ "Resolving symbol with incorrect flags");
+ else
+ assert(I->second == KV.second.getFlags() &&
+ "Resolving symbol with incorrect flags");
+ }
+#endif
+
+ V.resolve(Symbols);
+}
+
+void MaterializationResponsibility::finalize() {
+#ifndef NDEBUG
+ for (auto &KV : SymbolFlags)
+ assert(!KV.second.isMaterializing() &&
+ "Failed to resolve symbol before finalization");
+#endif // NDEBUG
+
+ V.finalize(SymbolFlags);
+ SymbolFlags.clear();
+}
+
+Error MaterializationResponsibility::defineMaterializing(
+ const SymbolFlagsMap &NewSymbolFlags) {
+ // Add the given symbols to this responsibility object.
+ // It's ok if we hit a duplicate here: In that case the new version will be
+ // discarded, and the VSO::defineMaterializing method will return a duplicate
+ // symbol error.
+ for (auto &KV : NewSymbolFlags) {
+ auto I = SymbolFlags.insert(KV).first;
+ (void)I;
+#ifndef NDEBUG
+ I->second |= JITSymbolFlags::Materializing;
+#endif
+ }
+
+ return V.defineMaterializing(NewSymbolFlags);
+}
+
+void MaterializationResponsibility::failMaterialization() {
+
+ SymbolNameSet FailedSymbols;
+ for (auto &KV : SymbolFlags)
+ FailedSymbols.insert(KV.first);
+
+ V.notifyFailed(FailedSymbols);
+ SymbolFlags.clear();
+}
+
+void MaterializationResponsibility::replace(
+ std::unique_ptr<MaterializationUnit> MU) {
+ for (auto &KV : MU->getSymbols())
+ SymbolFlags.erase(KV.first);
+
+ V.replace(std::move(MU));
+}
+
+MaterializationResponsibility
+MaterializationResponsibility::delegate(const SymbolNameSet &Symbols) {
+ SymbolFlagsMap DelegatedFlags;
+
+ for (auto &Name : Symbols) {
+ auto I = SymbolFlags.find(Name);
+ assert(I != SymbolFlags.end() &&
+ "Symbol is not tracked by this MaterializationResponsibility "
+ "instance");
+
+ DelegatedFlags[Name] = std::move(I->second);
+ SymbolFlags.erase(I);
+ }
+
+ return MaterializationResponsibility(V, std::move(DelegatedFlags));
+}
+
+void MaterializationResponsibility::addDependencies(
+ const SymbolStringPtr &Name, const SymbolDependenceMap &Dependencies) {
+ assert(SymbolFlags.count(Name) &&
+ "Symbol not covered by this MaterializationResponsibility instance");
+ V.addDependencies(Name, Dependencies);
+}
+
+void MaterializationResponsibility::addDependenciesForAll(
+ const SymbolDependenceMap &Dependencies) {
+ for (auto &KV : SymbolFlags)
+ V.addDependencies(KV.first, Dependencies);
+}
+
+AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit(
+ SymbolMap Symbols)
+ : MaterializationUnit(extractFlags(Symbols)), Symbols(std::move(Symbols)) {}
+
+void AbsoluteSymbolsMaterializationUnit::materialize(
+ MaterializationResponsibility R) {
+ R.resolve(Symbols);
+ R.finalize();
+}
+
+void AbsoluteSymbolsMaterializationUnit::discard(const VSO &V,
+ SymbolStringPtr Name) {
+ assert(Symbols.count(Name) && "Symbol is not part of this MU");
+ Symbols.erase(Name);
+}
+
+SymbolFlagsMap
+AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) {
+ SymbolFlagsMap Flags;
+ for (const auto &KV : Symbols)
+ Flags[KV.first] = KV.second.getFlags();
+ return Flags;
+}
+
+ReExportsMaterializationUnit::ReExportsMaterializationUnit(
+ VSO *SourceVSO, SymbolAliasMap Aliases)
+ : MaterializationUnit(extractFlags(Aliases)), SourceVSO(SourceVSO),
+ Aliases(std::move(Aliases)) {}
+
+void ReExportsMaterializationUnit::materialize(
+ MaterializationResponsibility R) {
+
+ auto &ES = R.getTargetVSO().getExecutionSession();
+ VSO &TgtV = R.getTargetVSO();
+ VSO &SrcV = SourceVSO ? *SourceVSO : TgtV;
+
+ // Find the set of requested aliases and aliasees. Return any unrequested
+ // aliases back to the VSO so as not to prematurely materialize any aliasees.
+ auto RequestedSymbols = R.getRequestedSymbols();
+ SymbolAliasMap RequestedAliases;
+
+ for (auto &Name : RequestedSymbols) {
+ auto I = Aliases.find(Name);
+ assert(I != Aliases.end() && "Symbol not found in aliases map?");
+ RequestedAliases[Name] = std::move(I->second);
+ Aliases.erase(I);
+ }
+
+ if (!Aliases.empty()) {
+ if (SourceVSO)
+ R.replace(reexports(*SourceVSO, std::move(Aliases)));
+ else
+ R.replace(symbolAliases(std::move(Aliases)));
+ }
+
+ // The OnResolveInfo struct will hold the aliases and responsibility for each
+ // query in the list.
+ struct OnResolveInfo {
+ OnResolveInfo(MaterializationResponsibility R, SymbolAliasMap Aliases)
+ : R(std::move(R)), Aliases(std::move(Aliases)) {}
+
+ MaterializationResponsibility R;
+ SymbolAliasMap Aliases;
+ };
+
+ // Build a list of queries to issue. In each round we build the largest set of
+ // aliases that we can resolve without encountering a chain definition of the
+ // form Foo -> Bar, Bar -> Baz. Such a form would deadlock as the query would
+ // be waiting on a symbol that it itself had to resolve. Usually this will
+ // just involve one round and a single query.
+
+ std::vector<std::pair<SymbolNameSet, std::shared_ptr<OnResolveInfo>>>
+ QueryInfos;
+ while (!RequestedAliases.empty()) {
+ SymbolNameSet ResponsibilitySymbols;
+ SymbolNameSet QuerySymbols;
+ SymbolAliasMap QueryAliases;
+
+ for (auto I = RequestedAliases.begin(), E = RequestedAliases.end();
+ I != E;) {
+ auto Tmp = I++;
+
+ // Chain detected. Skip this symbol for this round.
+ if (&SrcV == &TgtV && (QueryAliases.count(Tmp->second.Aliasee) ||
+ RequestedAliases.count(Tmp->second.Aliasee)))
+ continue;
+
+ ResponsibilitySymbols.insert(Tmp->first);
+ QuerySymbols.insert(Tmp->second.Aliasee);
+ QueryAliases[Tmp->first] = std::move(Tmp->second);
+ RequestedAliases.erase(Tmp);
+ }
+ assert(!QuerySymbols.empty() && "Alias cycle detected!");
+
+ auto QueryInfo = std::make_shared<OnResolveInfo>(
+ R.delegate(ResponsibilitySymbols), std::move(QueryAliases));
+ QueryInfos.push_back(
+ make_pair(std::move(QuerySymbols), std::move(QueryInfo)));
+ }
+
+ // Issue the queries.
+ while (!QueryInfos.empty()) {
+ auto QuerySymbols = std::move(QueryInfos.back().first);
+ auto QueryInfo = std::move(QueryInfos.back().second);
+
+ QueryInfos.pop_back();
+
+ auto RegisterDependencies = [QueryInfo,
+ &SrcV](const SymbolDependenceMap &Deps) {
+ // If there were no materializing symbols, just bail out.
+ if (Deps.empty())
+ return;
+
+ // Otherwise the only deps should be on SrcV.
+ assert(Deps.size() == 1 && Deps.count(&SrcV) &&
+ "Unexpected dependencies for reexports");
+
+ auto &SrcVDeps = Deps.find(&SrcV)->second;
+ SymbolDependenceMap PerAliasDepsMap;
+ auto &PerAliasDeps = PerAliasDepsMap[&SrcV];
+
+ for (auto &KV : QueryInfo->Aliases)
+ if (SrcVDeps.count(KV.second.Aliasee)) {
+ PerAliasDeps = {KV.second.Aliasee};
+ QueryInfo->R.addDependencies(KV.first, PerAliasDepsMap);
+ }
+ };
+
+ auto OnResolve = [QueryInfo](Expected<SymbolMap> Result) {
+ if (Result) {
+ SymbolMap ResolutionMap;
+ for (auto &KV : QueryInfo->Aliases) {
+ assert(Result->count(KV.second.Aliasee) &&
+ "Result map missing entry?");
+ ResolutionMap[KV.first] = JITEvaluatedSymbol(
+ (*Result)[KV.second.Aliasee].getAddress(), KV.second.AliasFlags);
+ }
+ QueryInfo->R.resolve(ResolutionMap);
+ QueryInfo->R.finalize();
+ } else {
+ auto &ES = QueryInfo->R.getTargetVSO().getExecutionSession();
+ ES.reportError(Result.takeError());
+ QueryInfo->R.failMaterialization();
+ }
+ };
+
+ auto OnReady = [&ES](Error Err) { ES.reportError(std::move(Err)); };
+
+ ES.lookup({&SrcV}, QuerySymbols, std::move(OnResolve), std::move(OnReady),
+ std::move(RegisterDependencies));
+ }
+}
+
+void ReExportsMaterializationUnit::discard(const VSO &V, SymbolStringPtr Name) {
+ assert(Aliases.count(Name) &&
+ "Symbol not covered by this MaterializationUnit");
+ Aliases.erase(Name);
+}
+
+SymbolFlagsMap
+ReExportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) {
+ SymbolFlagsMap SymbolFlags;
+ for (auto &KV : Aliases)
+ SymbolFlags[KV.first] = KV.second.AliasFlags;
+
+ return SymbolFlags;
+}
+
+Expected<SymbolAliasMap>
+buildSimpleReexportsAliasMap(VSO &SourceV, const SymbolNameSet &Symbols) {
+ auto Flags = SourceV.lookupFlags(Symbols);
+
+ if (Flags.size() != Symbols.size()) {
+ SymbolNameSet Unresolved = Symbols;
+ for (auto &KV : Flags)
+ Unresolved.erase(KV.first);
+ return make_error<SymbolsNotFound>(std::move(Unresolved));
+ }
+
+ SymbolAliasMap Result;
+ for (auto &Name : Symbols) {
+ assert(Flags.count(Name) && "Missing entry in flags map");
+ Result[Name] = SymbolAliasMapEntry(Name, Flags[Name]);
+ }
+
+ return Result;
+}
+
+Error VSO::defineMaterializing(const SymbolFlagsMap &SymbolFlags) {
+ return ES.runSessionLocked([&]() -> Error {
+ std::vector<SymbolMap::iterator> AddedSyms;
+
+ for (auto &KV : SymbolFlags) {
+ SymbolMap::iterator EntryItr;
+ bool Added;
+
+ auto NewFlags = KV.second;
+ NewFlags |= JITSymbolFlags::Materializing;
+
+ std::tie(EntryItr, Added) = Symbols.insert(
+ std::make_pair(KV.first, JITEvaluatedSymbol(0, NewFlags)));
+
+ if (Added)
+ AddedSyms.push_back(EntryItr);
+ else {
+ // Remove any symbols already added.
+ for (auto &SI : AddedSyms)
+ Symbols.erase(SI);
+
+ // FIXME: Return all duplicates.
+ return make_error<DuplicateDefinition>(*KV.first);
+ }
+ }
+
+ return Error::success();
+ });
+}
+
+void VSO::replace(std::unique_ptr<MaterializationUnit> MU) {
+ assert(MU != nullptr && "Can not replace with a null MaterializationUnit");
+
+ auto MustRunMU =
+ ES.runSessionLocked([&, this]() -> std::unique_ptr<MaterializationUnit> {
+
+#ifndef NDEBUG
+ for (auto &KV : MU->getSymbols()) {
+ auto SymI = Symbols.find(KV.first);
+ assert(SymI != Symbols.end() && "Replacing unknown symbol");
+ assert(!SymI->second.getFlags().isLazy() &&
+ SymI->second.getFlags().isMaterializing() &&
+ "Can not replace symbol that is not materializing");
+ assert(UnmaterializedInfos.count(KV.first) == 0 &&
+ "Symbol being replaced should have no UnmaterializedInfo");
+ }
+#endif // NDEBUG
+
+ // If any symbol has pending queries against it then we need to
+ // materialize MU immediately.
+ for (auto &KV : MU->getSymbols()) {
+ auto MII = MaterializingInfos.find(KV.first);
+ if (MII != MaterializingInfos.end()) {
+ if (!MII->second.PendingQueries.empty())
+ return std::move(MU);
+ }
+ }
+
+ // Otherwise, make MU responsible for all the symbols.
+ auto UMI = std::make_shared<UnmaterializedInfo>(std::move(MU));
+ for (auto &KV : UMI->MU->getSymbols()) {
+ assert(!KV.second.isLazy() &&
+ "Lazy flag should be managed internally.");
+ assert(!KV.second.isMaterializing() &&
+ "Materializing flags should be managed internally.");
+
+ auto SymI = Symbols.find(KV.first);
+ JITSymbolFlags ReplaceFlags = KV.second;
+ ReplaceFlags |= JITSymbolFlags::Lazy;
+ SymI->second = JITEvaluatedSymbol(SymI->second.getAddress(),
+ std::move(ReplaceFlags));
+ UnmaterializedInfos[KV.first] = UMI;
+ }
+
+ return nullptr;
+ });
+
+ if (MustRunMU)
+ ES.dispatchMaterialization(*this, std::move(MustRunMU));
+}
+
+SymbolNameSet VSO::getRequestedSymbols(const SymbolFlagsMap &SymbolFlags) {
+ return ES.runSessionLocked([&]() {
+ SymbolNameSet RequestedSymbols;
+
+ for (auto &KV : SymbolFlags) {
+ assert(Symbols.count(KV.first) && "VSO does not cover this symbol?");
+ assert(Symbols[KV.first].getFlags().isMaterializing() &&
+ "getRequestedSymbols can only be called for materializing "
+ "symbols");
+ auto I = MaterializingInfos.find(KV.first);
+ if (I == MaterializingInfos.end())
+ continue;
+
+ if (!I->second.PendingQueries.empty())
+ RequestedSymbols.insert(KV.first);
+ }
+
+ return RequestedSymbols;
+ });
+}
+
+void VSO::addDependencies(const SymbolStringPtr &Name,
+ const SymbolDependenceMap &Dependencies) {
+ assert(Symbols.count(Name) && "Name not in symbol table");
+ assert((Symbols[Name].getFlags().isLazy() ||
+ Symbols[Name].getFlags().isMaterializing()) &&
+ "Symbol is not lazy or materializing");
+
+ auto &MI = MaterializingInfos[Name];
+ assert(!MI.IsFinalized && "Can not add dependencies to finalized symbol");
+
+ for (auto &KV : Dependencies) {
+ assert(KV.first && "Null VSO in dependency?");
+ auto &OtherVSO = *KV.first;
+ auto &DepsOnOtherVSO = MI.UnfinalizedDependencies[&OtherVSO];
+
+ for (auto &OtherSymbol : KV.second) {
+#ifndef NDEBUG
+ // Assert that this symbol exists and has not been finalized already.
+ auto SymI = OtherVSO.Symbols.find(OtherSymbol);
+ assert(SymI != OtherVSO.Symbols.end() &&
+ (SymI->second.getFlags().isLazy() ||
+ SymI->second.getFlags().isMaterializing()) &&
+ "Dependency on finalized symbol");
+#endif
+
+ auto &OtherMI = OtherVSO.MaterializingInfos[OtherSymbol];
+
+ if (OtherMI.IsFinalized)
+ transferFinalizedNodeDependencies(MI, Name, OtherMI);
+ else if (&OtherVSO != this || OtherSymbol != Name) {
+ OtherMI.Dependants[this].insert(Name);
+ DepsOnOtherVSO.insert(OtherSymbol);
+ }
+ }
+
+ if (DepsOnOtherVSO.empty())
+ MI.UnfinalizedDependencies.erase(&OtherVSO);
+ }
+}
+
+void VSO::resolve(const SymbolMap &Resolved) {
+ auto FullyResolvedQueries = ES.runSessionLocked([&, this]() {
+ AsynchronousSymbolQuerySet FullyResolvedQueries;
+ for (const auto &KV : Resolved) {
+ auto &Name = KV.first;
+ auto Sym = KV.second;
+
+ assert(!Sym.getFlags().isLazy() && !Sym.getFlags().isMaterializing() &&
+ "Materializing flags should be managed internally");
+
+ auto I = Symbols.find(Name);
+
+ assert(I != Symbols.end() && "Symbol not found");
+ assert(!I->second.getFlags().isLazy() &&
+ I->second.getFlags().isMaterializing() &&
+ "Symbol should be materializing");
+ assert(I->second.getAddress() == 0 && "Symbol has already been resolved");
+
+ assert((Sym.getFlags() & ~JITSymbolFlags::Weak) ==
+ (JITSymbolFlags::stripTransientFlags(I->second.getFlags()) &
+ ~JITSymbolFlags::Weak) &&
+ "Resolved flags should match the declared flags");
+
+ // Once resolved, symbols can never be weak.
+ JITSymbolFlags ResolvedFlags = Sym.getFlags();
+ ResolvedFlags &= ~JITSymbolFlags::Weak;
+ ResolvedFlags |= JITSymbolFlags::Materializing;
+ I->second = JITEvaluatedSymbol(Sym.getAddress(), ResolvedFlags);
+
+ auto &MI = MaterializingInfos[Name];
+ for (auto &Q : MI.PendingQueries) {
+ Q->resolve(Name, Sym);
+ if (Q->isFullyResolved())
+ FullyResolvedQueries.insert(Q);
+ }
+ }
+
+ return FullyResolvedQueries;
+ });
+
+ for (auto &Q : FullyResolvedQueries) {
+ assert(Q->isFullyResolved() && "Q not fully resolved");
+ Q->handleFullyResolved();
+ }
+}
+
+void VSO::finalize(const SymbolFlagsMap &Finalized) {
+ auto FullyReadyQueries = ES.runSessionLocked([&, this]() {
+ AsynchronousSymbolQuerySet ReadyQueries;
+
+ for (const auto &KV : Finalized) {
+ const auto &Name = KV.first;
+
+ auto MII = MaterializingInfos.find(Name);
+ assert(MII != MaterializingInfos.end() &&
+ "Missing MaterializingInfo entry");
+
+ auto &MI = MII->second;
+
+ // For each dependant, transfer this node's unfinalized dependencies to
+ // it. If the dependant node is fully finalized then notify any pending
+ // queries.
+ for (auto &KV : MI.Dependants) {
+ auto &DependantVSO = *KV.first;
+ for (auto &DependantName : KV.second) {
+ auto DependantMII =
+ DependantVSO.MaterializingInfos.find(DependantName);
+ assert(DependantMII != DependantVSO.MaterializingInfos.end() &&
+ "Dependant should have MaterializingInfo");
+
+ auto &DependantMI = DependantMII->second;
+
+ // Remove the dependant's dependency on this node.
+ assert(DependantMI.UnfinalizedDependencies[this].count(Name) &&
+ "Dependant does not count this symbol as a dependency?");
+ DependantMI.UnfinalizedDependencies[this].erase(Name);
+ if (DependantMI.UnfinalizedDependencies[this].empty())
+ DependantMI.UnfinalizedDependencies.erase(this);
+
+ // Transfer unfinalized dependencies from this node to the dependant.
+ DependantVSO.transferFinalizedNodeDependencies(DependantMI,
+ DependantName, MI);
+
+ // If the dependant is finalized and this node was the last of its
+ // unfinalized dependencies then notify any pending queries on the
+ // dependant node.
+ if (DependantMI.IsFinalized &&
+ DependantMI.UnfinalizedDependencies.empty()) {
+ assert(DependantMI.Dependants.empty() &&
+ "Dependants should be empty by now");
+ for (auto &Q : DependantMI.PendingQueries) {
+ Q->notifySymbolReady();
+ if (Q->isFullyReady())
+ ReadyQueries.insert(Q);
+ Q->removeQueryDependence(DependantVSO, DependantName);
+ }
+
+ // If this dependant node was fully finalized we can erase its
+ // MaterializingInfo and update its materializing state.
+ assert(DependantVSO.Symbols.count(DependantName) &&
+ "Dependant has no entry in the Symbols table");
+ auto &DependantSym = DependantVSO.Symbols[DependantName];
+ DependantSym.setFlags(static_cast<JITSymbolFlags::FlagNames>(
+ DependantSym.getFlags() & ~JITSymbolFlags::Materializing));
+ DependantVSO.MaterializingInfos.erase(DependantMII);
+ }
+ }
+ }
+ MI.Dependants.clear();
+ MI.IsFinalized = true;
+
+ if (MI.UnfinalizedDependencies.empty()) {
+ for (auto &Q : MI.PendingQueries) {
+ Q->notifySymbolReady();
+ if (Q->isFullyReady())
+ ReadyQueries.insert(Q);
+ Q->removeQueryDependence(*this, Name);
+ }
+ assert(Symbols.count(Name) &&
+ "Symbol has no entry in the Symbols table");
+ auto &Sym = Symbols[Name];
+ Sym.setFlags(static_cast<JITSymbolFlags::FlagNames>(
+ Sym.getFlags() & ~JITSymbolFlags::Materializing));
+ MaterializingInfos.erase(MII);
+ }
+ }
+
+ return ReadyQueries;
+ });
+
+ for (auto &Q : FullyReadyQueries) {
+ assert(Q->isFullyReady() && "Q is not fully ready");
+ Q->handleFullyReady();
+ }
+}
+
+void VSO::notifyFailed(const SymbolNameSet &FailedSymbols) {
+
+ // FIXME: This should fail any transitively dependant symbols too.
+
+ auto FailedQueriesToNotify = ES.runSessionLocked([&, this]() {
+ AsynchronousSymbolQuerySet FailedQueries;
+
+ for (auto &Name : FailedSymbols) {
+ auto I = Symbols.find(Name);
+ assert(I != Symbols.end() && "Symbol not present in this VSO");
+ Symbols.erase(I);
+
+ auto MII = MaterializingInfos.find(Name);
+
+ // If we have not created a MaterializingInfo for this symbol yet then
+ // there is nobody to notify.
+ if (MII == MaterializingInfos.end())
+ continue;
+
+ // Copy all the queries to the FailedQueries list, then abandon them.
+ // This has to be a copy, and the copy has to come before the abandon
+ // operation: Each Q.detach() call will reach back into this
+ // PendingQueries list to remove Q.
+ for (auto &Q : MII->second.PendingQueries)
+ FailedQueries.insert(Q);
+
+ for (auto &Q : FailedQueries)
+ Q->detach();
+
+ assert(MII->second.PendingQueries.empty() &&
+ "Queries remain after symbol was failed");
+
+ MaterializingInfos.erase(MII);
+ }
+
+ return FailedQueries;
+ });
+
+ for (auto &Q : FailedQueriesToNotify)
+ Q->handleFailed(make_error<FailedToMaterialize>(FailedSymbols));
+}
+
+void VSO::setSearchOrder(VSOList NewSearchOrder, bool SearchThisVSOFirst) {
+ if (SearchThisVSOFirst && NewSearchOrder.front() != this)
+ NewSearchOrder.insert(NewSearchOrder.begin(), this);
+
+ ES.runSessionLocked([&]() { SearchOrder = std::move(NewSearchOrder); });
+}
+
+void VSO::addToSearchOrder(VSO &V) {
+ ES.runSessionLocked([&]() { SearchOrder.push_back(&V); });
+}
+
+void VSO::replaceInSearchOrder(VSO &OldV, VSO &NewV) {
+ ES.runSessionLocked([&]() {
+ auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &OldV);
+
+ if (I != SearchOrder.end())
+ *I = &NewV;
+ });
+}
+
+void VSO::removeFromSearchOrder(VSO &V) {
+ ES.runSessionLocked([&]() {
+ auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &V);
+ if (I != SearchOrder.end())
+ SearchOrder.erase(I);
+ });
+}
+
+SymbolFlagsMap VSO::lookupFlags(const SymbolNameSet &Names) {
+ return ES.runSessionLocked([&, this]() {
+ SymbolFlagsMap Result;
+ auto Unresolved = lookupFlagsImpl(Result, Names);
+ if (FallbackDefinitionGenerator && !Unresolved.empty()) {
+ auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
+ if (!FallbackDefs.empty()) {
+ auto Unresolved2 = lookupFlagsImpl(Result, FallbackDefs);
+ (void)Unresolved2;
+ assert(Unresolved2.empty() &&
+ "All fallback defs should have been found by lookupFlagsImpl");
+ }
+ };
+ return Result;
+ });
+}
+
+SymbolNameSet VSO::lookupFlagsImpl(SymbolFlagsMap &Flags,
+ const SymbolNameSet &Names) {
+ SymbolNameSet Unresolved;
+
+ for (auto &Name : Names) {
+ auto I = Symbols.find(Name);
+
+ if (I == Symbols.end()) {
+ Unresolved.insert(Name);
+ continue;
+ }
+
+ assert(!Flags.count(Name) && "Symbol already present in Flags map");
+ Flags[Name] = JITSymbolFlags::stripTransientFlags(I->second.getFlags());
+ }
+
+ return Unresolved;
+}
+
+void VSO::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
+ SymbolNameSet &Unresolved, MaterializationUnitList &MUs) {
+ assert(Q && "Query can not be null");
+
+ lodgeQueryImpl(Q, Unresolved, MUs);
+ if (FallbackDefinitionGenerator && !Unresolved.empty()) {
+ auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
+ if (!FallbackDefs.empty()) {
+ for (auto &D : FallbackDefs)
+ Unresolved.erase(D);
+ lodgeQueryImpl(Q, FallbackDefs, MUs);
+ assert(FallbackDefs.empty() &&
+ "All fallback defs should have been found by lookupImpl");
+ }
+ }
+}
+
+void VSO::lodgeQueryImpl(
+ std::shared_ptr<AsynchronousSymbolQuery> &Q, SymbolNameSet &Unresolved,
+ std::vector<std::unique_ptr<MaterializationUnit>> &MUs) {
+ for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) {
+ auto TmpI = I++;
+ auto Name = *TmpI;
+
+ // Search for the name in Symbols. Skip it if not found.
+ auto SymI = Symbols.find(Name);
+ if (SymI == Symbols.end())
+ continue;
+
+ // If we found Name in V, remove it from the Unresolved set and add it
+ // to the added set.
+ Unresolved.erase(TmpI);
+
+ // If the symbol has an address then resolve it.
+ if (SymI->second.getAddress() != 0)
+ Q->resolve(Name, SymI->second);
+
+ // If the symbol is lazy, get the MaterializationUnit for it.
+ if (SymI->second.getFlags().isLazy()) {
+ assert(SymI->second.getAddress() == 0 &&
+ "Lazy symbol should not have a resolved address");
+ assert(!SymI->second.getFlags().isMaterializing() &&
+ "Materializing and lazy should not both be set");
+ auto UMII = UnmaterializedInfos.find(Name);
+ assert(UMII != UnmaterializedInfos.end() &&
+ "Lazy symbol should have UnmaterializedInfo");
+ auto MU = std::move(UMII->second->MU);
+ assert(MU != nullptr && "Materializer should not be null");
+
+ // Move all symbols associated with this MaterializationUnit into
+ // materializing state.
+ for (auto &KV : MU->getSymbols()) {
+ auto SymK = Symbols.find(KV.first);
+ auto Flags = SymK->second.getFlags();
+ Flags &= ~JITSymbolFlags::Lazy;
+ Flags |= JITSymbolFlags::Materializing;
+ SymK->second.setFlags(Flags);
+ UnmaterializedInfos.erase(KV.first);
+ }
+
+ // Add MU to the list of MaterializationUnits to be materialized.
+ MUs.push_back(std::move(MU));
+ } else if (!SymI->second.getFlags().isMaterializing()) {
+ // The symbol is neither lazy nor materializing. Finalize it and
+ // continue.
+ Q->notifySymbolReady();
+ continue;
+ }
+
+ // Add the query to the PendingQueries list.
+ assert(SymI->second.getFlags().isMaterializing() &&
+ "By this line the symbol should be materializing");
+ auto &MI = MaterializingInfos[Name];
+ MI.PendingQueries.push_back(Q);
+ Q->addQueryDependence(*this, Name);
+ }
+}
+
+SymbolNameSet VSO::legacyLookup(std::shared_ptr<AsynchronousSymbolQuery> Q,
+ SymbolNameSet Names) {
+ assert(Q && "Query can not be null");
+
+ ES.runOutstandingMUs();
+
+ LookupImplActionFlags ActionFlags = None;
+ std::vector<std::unique_ptr<MaterializationUnit>> MUs;
+
+ SymbolNameSet Unresolved = std::move(Names);
+ ES.runSessionLocked([&, this]() {
+ ActionFlags = lookupImpl(Q, MUs, Unresolved);
+ if (FallbackDefinitionGenerator && !Unresolved.empty()) {
+ assert(ActionFlags == None &&
+ "ActionFlags set but unresolved symbols remain?");
+ auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
+ if (!FallbackDefs.empty()) {
+ for (auto &D : FallbackDefs)
+ Unresolved.erase(D);
+ ActionFlags = lookupImpl(Q, MUs, FallbackDefs);
+ assert(FallbackDefs.empty() &&
+ "All fallback defs should have been found by lookupImpl");
+ }
+ }
+ });
+
+ assert((MUs.empty() || ActionFlags == None) &&
+ "If action flags are set, there should be no work to do (so no MUs)");
+
+ if (ActionFlags & NotifyFullyResolved)
+ Q->handleFullyResolved();
+
+ if (ActionFlags & NotifyFullyReady)
+ Q->handleFullyReady();
+
+ // FIXME: Swap back to the old code below once RuntimeDyld works with
+ // callbacks from asynchronous queries.
+ // Add MUs to the OutstandingMUs list.
+ {
+ std::lock_guard<std::recursive_mutex> Lock(ES.OutstandingMUsMutex);
+ for (auto &MU : MUs)
+ ES.OutstandingMUs.push_back(make_pair(this, std::move(MU)));
+ }
+ ES.runOutstandingMUs();
+
+ // Dispatch any required MaterializationUnits for materialization.
+ // for (auto &MU : MUs)
+ // ES.dispatchMaterialization(*this, std::move(MU));
+
+ return Unresolved;
+}
+
+VSO::LookupImplActionFlags
+VSO::lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
+ std::vector<std::unique_ptr<MaterializationUnit>> &MUs,
+ SymbolNameSet &Unresolved) {
+ LookupImplActionFlags ActionFlags = None;
+
+ for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) {
+ auto TmpI = I++;
+ auto Name = *TmpI;
+
+ // Search for the name in Symbols. Skip it if not found.
+ auto SymI = Symbols.find(Name);
+ if (SymI == Symbols.end())
+ continue;
+
+ // If we found Name in V, remove it from the Unresolved set and add it
+ // to the dependencies set.
+ Unresolved.erase(TmpI);
+
+ // If the symbol has an address then resolve it.
+ if (SymI->second.getAddress() != 0) {
+ Q->resolve(Name, SymI->second);
+ if (Q->isFullyResolved())
+ ActionFlags |= NotifyFullyResolved;
+ }
+
+ // If the symbol is lazy, get the MaterializationUnit for it.
+ if (SymI->second.getFlags().isLazy()) {
+ assert(SymI->second.getAddress() == 0 &&
+ "Lazy symbol should not have a resolved address");
+ assert(!SymI->second.getFlags().isMaterializing() &&
+ "Materializing and lazy should not both be set");
+ auto UMII = UnmaterializedInfos.find(Name);
+ assert(UMII != UnmaterializedInfos.end() &&
+ "Lazy symbol should have UnmaterializedInfo");
+ auto MU = std::move(UMII->second->MU);
+ assert(MU != nullptr && "Materializer should not be null");
+
+ // Kick all symbols associated with this MaterializationUnit into
+ // materializing state.
+ for (auto &KV : MU->getSymbols()) {
+ auto SymK = Symbols.find(KV.first);
+ auto Flags = SymK->second.getFlags();
+ Flags &= ~JITSymbolFlags::Lazy;
+ Flags |= JITSymbolFlags::Materializing;
+ SymK->second.setFlags(Flags);
+ UnmaterializedInfos.erase(KV.first);
+ }
+
+ // Add MU to the list of MaterializationUnits to be materialized.
+ MUs.push_back(std::move(MU));
+ } else if (!SymI->second.getFlags().isMaterializing()) {
+ // The symbol is neither lazy nor materializing. Finalize it and
+ // continue.
+ Q->notifySymbolReady();
+ if (Q->isFullyReady())
+ ActionFlags |= NotifyFullyReady;
+ continue;
+ }
+
+ // Add the query to the PendingQueries list.
+ assert(SymI->second.getFlags().isMaterializing() &&
+ "By this line the symbol should be materializing");
+ auto &MI = MaterializingInfos[Name];
+ MI.PendingQueries.push_back(Q);
+ Q->addQueryDependence(*this, Name);
+ }
+
+ return ActionFlags;
+}
+
+void VSO::dump(raw_ostream &OS) {
+ ES.runSessionLocked([&, this]() {
+ OS << "VSO \"" << VSOName
+ << "\" (ES: " << format("0x%016x", reinterpret_cast<uintptr_t>(&ES))
+ << "):\n"
+ << "Symbol table:\n";
+
+ for (auto &KV : Symbols) {
+ OS << " \"" << *KV.first
+ << "\": " << format("0x%016x", KV.second.getAddress());
+ if (KV.second.getFlags().isLazy() ||
+ KV.second.getFlags().isMaterializing()) {
+ OS << " (";
+ if (KV.second.getFlags().isLazy()) {
+ auto I = UnmaterializedInfos.find(KV.first);
+ assert(I != UnmaterializedInfos.end() &&
+ "Lazy symbol should have UnmaterializedInfo");
+ OS << " Lazy (MU=" << I->second->MU.get() << ")";
+ }
+ if (KV.second.getFlags().isMaterializing())
+ OS << " Materializing";
+ OS << " )\n";
+ } else
+ OS << "\n";
+ }
+
+ if (!MaterializingInfos.empty())
+ OS << " MaterializingInfos entries:\n";
+ for (auto &KV : MaterializingInfos) {
+ OS << " \"" << *KV.first << "\":\n"
+ << " IsFinalized = " << (KV.second.IsFinalized ? "true" : "false")
+ << "\n"
+ << " " << KV.second.PendingQueries.size()
+ << " pending queries: { ";
+ for (auto &Q : KV.second.PendingQueries)
+ OS << Q.get() << " ";
+ OS << "}\n Dependants:\n";
+ for (auto &KV2 : KV.second.Dependants)
+ OS << " " << KV2.first->getName() << ": " << KV2.second << "\n";
+ OS << " Unfinalized Dependencies:\n";
+ for (auto &KV2 : KV.second.UnfinalizedDependencies)
+ OS << " " << KV2.first->getName() << ": " << KV2.second << "\n";
+ }
+ });
+}
+
+VSO::VSO(ExecutionSessionBase &ES, std::string Name)
+ : ES(ES), VSOName(std::move(Name)) {
+ SearchOrder.push_back(this);
+}
+
+Error VSO::defineImpl(MaterializationUnit &MU) {
+ SymbolNameSet Duplicates;
+ SymbolNameSet MUDefsOverridden;
+
+ struct ExistingDefOverriddenEntry {
+ SymbolMap::iterator ExistingDefItr;
+ JITSymbolFlags NewFlags;
+ };
+ std::vector<ExistingDefOverriddenEntry> ExistingDefsOverridden;
+
+ for (auto &KV : MU.getSymbols()) {
+ assert(!KV.second.isLazy() && "Lazy flag should be managed internally.");
+ assert(!KV.second.isMaterializing() &&
+ "Materializing flags should be managed internally.");
+
+ SymbolMap::iterator EntryItr;
+ bool Added;
+
+ auto NewFlags = KV.second;
+ NewFlags |= JITSymbolFlags::Lazy;
+
+ std::tie(EntryItr, Added) = Symbols.insert(
+ std::make_pair(KV.first, JITEvaluatedSymbol(0, NewFlags)));
+
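+ // The symbol was already defined. A strong new definition conflicts with an
+ // existing strong or materializing definition, but overrides an existing
+ // weak (lazy) one; a weak new definition is discarded in favour of whatever
+ // is already present.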
+ if (!Added) {
+ if (KV.second.isStrong()) {
+ if (EntryItr->second.getFlags().isStrong() ||
+ (EntryItr->second.getFlags() & JITSymbolFlags::Materializing))
+ Duplicates.insert(KV.first);
+ else
+ ExistingDefsOverridden.push_back({EntryItr, NewFlags});
+ } else
+ MUDefsOverridden.insert(KV.first);
+ }
+ }
+
+ if (!Duplicates.empty()) {
+ // We need to remove the symbols we added.
+ for (auto &KV : MU.getSymbols()) {
+ if (Duplicates.count(KV.first))
+ continue;
+
+ bool Found = false;
+ for (const auto &EDO : ExistingDefsOverridden)
+ if (EDO.ExistingDefItr->first == KV.first)
+ Found = true;
+
+ if (!Found)
+ Symbols.erase(KV.first);
+ }
+
+ // FIXME: Return all duplicates.
+ return make_error<DuplicateDefinition>(**Duplicates.begin());
+ }
+
+ // Update flags on existing defs and call discard on their materializers.
+ for (auto &EDO : ExistingDefsOverridden) {
+ assert(EDO.ExistingDefItr->second.getFlags().isLazy() &&
+ !EDO.ExistingDefItr->second.getFlags().isMaterializing() &&
+ "Overridden existing def should be in the Lazy state");
+
+ EDO.ExistingDefItr->second.setFlags(EDO.NewFlags);
+
+ auto UMII = UnmaterializedInfos.find(EDO.ExistingDefItr->first);
+ assert(UMII != UnmaterializedInfos.end() &&
+ "Overridden existing def should have an UnmaterializedInfo");
+
+ UMII->second->MU->doDiscard(*this, EDO.ExistingDefItr->first);
+ }
+
+ // Discard overridden symbols provided by MU.
+ for (auto &Sym : MUDefsOverridden)
+ MU.doDiscard(*this, Sym);
+
+ return Error::success();
+}
+
+void VSO::detachQueryHelper(AsynchronousSymbolQuery &Q,
+ const SymbolNameSet &QuerySymbols) {
+ for (auto &QuerySymbol : QuerySymbols) {
+ assert(MaterializingInfos.count(QuerySymbol) &&
+ "QuerySymbol does not have MaterializingInfo");
+ auto &MI = MaterializingInfos[QuerySymbol];
+
+ auto IdenticalQuery =
+ [&](const std::shared_ptr<AsynchronousSymbolQuery> &R) {
+ return R.get() == &Q;
+ };
+
+ auto I = std::find_if(MI.PendingQueries.begin(), MI.PendingQueries.end(),
+ IdenticalQuery);
+ assert(I != MI.PendingQueries.end() &&
+ "Query Q should be in the PendingQueries list for QuerySymbol");
+ MI.PendingQueries.erase(I);
+ }
+}
+
+void VSO::transferFinalizedNodeDependencies(
+ MaterializingInfo &DependantMI, const SymbolStringPtr &DependantName,
+ MaterializingInfo &FinalizedMI) {
+ for (auto &KV : FinalizedMI.UnfinalizedDependencies) {
+ auto &DependencyVSO = *KV.first;
+ SymbolNameSet *UnfinalizedDependenciesOnDependencyVSO = nullptr;
+
+ for (auto &DependencyName : KV.second) {
+ auto &DependencyMI = DependencyVSO.MaterializingInfos[DependencyName];
+
+ // Do not add self dependencies.
+ if (&DependencyMI == &DependantMI)
+ continue;
+
+ // If we haven't looked up the dependencies for DependencyVSO yet, do it
+ // now and cache the result.
+ if (!UnfinalizedDependenciesOnDependencyVSO)
+ UnfinalizedDependenciesOnDependencyVSO =
+ &DependantMI.UnfinalizedDependencies[&DependencyVSO];
+
+ DependencyMI.Dependants[this].insert(DependantName);
+ UnfinalizedDependenciesOnDependencyVSO->insert(DependencyName);
+ }
+ }
+}
+
+VSO &ExecutionSession::createVSO(std::string Name) {
+ return runSessionLocked([&, this]() -> VSO & {
+ VSOs.push_back(std::unique_ptr<VSO>(new VSO(*this, std::move(Name))));
+ return *VSOs.back();
+ });
+}
+
+Expected<SymbolMap> lookup(const VSOList &VSOs, SymbolNameSet Names) {
+
+ if (VSOs.empty())
+ return SymbolMap();
+
+ auto &ES = (*VSOs.begin())->getExecutionSession();
+
+ return ES.lookup(VSOs, Names, NoDependenciesToRegister, true);
+}
+
+/// Look up a symbol by searching a list of VSOs.
+Expected<JITEvaluatedSymbol> lookup(const VSOList &VSOs, SymbolStringPtr Name) {
+ SymbolNameSet Names({Name});
+ if (auto ResultMap = lookup(VSOs, std::move(Names))) {
+ assert(ResultMap->size() == 1 && "Unexpected number of results");
+ assert(ResultMap->count(Name) && "Missing result for symbol");
+ return std::move(ResultMap->begin()->second);
+ } else
+ return ResultMap.takeError();
+}
+
+MangleAndInterner::MangleAndInterner(ExecutionSessionBase &ES,
+ const DataLayout &DL)
+ : ES(ES), DL(DL) {}
+
+SymbolStringPtr MangleAndInterner::operator()(StringRef Name) {
+ std::string MangledName;
+ {
+ raw_string_ostream MangledNameStream(MangledName);
+ Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
+ }
+ return ES.getSymbolStringPool().intern(MangledName);
+}
+
+} // End namespace orc.
+} // End namespace llvm.
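A minimal usage sketch for the VSO / MangleAndInterner / lookup APIs above (not part of the patch). It assumes an ExecutionSession and DataLayout are already available; the function and VSO names are illustrative and errors are simply propagated.

    // Create a VSO, intern the linker-mangled name for "main", and block
    // until that symbol has been materialized in the VSO.
    Expected<JITTargetAddress> findMainAddress(ExecutionSession &ES,
                                               const DataLayout &DL) {
      VSO &MainVSO = ES.createVSO("main");
      MangleAndInterner Mangle(ES, DL);
      if (auto Sym = lookup({&MainVSO}, Mangle("main")))
        return Sym->getAddress();
      else
        return Sym.takeError();
    }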
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index b7220dba88e9..6157677ce355 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -13,10 +13,51 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
namespace orc {
+JITTargetMachineBuilder::JITTargetMachineBuilder(Triple TT)
+ : TT(std::move(TT)) {}
+
+Expected<JITTargetMachineBuilder> JITTargetMachineBuilder::detectHost() {
+ return JITTargetMachineBuilder(Triple(sys::getProcessTriple()));
+}
+
+Expected<std::unique_ptr<TargetMachine>>
+JITTargetMachineBuilder::createTargetMachine() {
+ if (!Arch.empty()) {
+ Triple::ArchType Type = Triple::getArchTypeForLLVMName(Arch);
+
+ if (Type == Triple::UnknownArch)
+ return make_error<StringError>(std::string("Unknown arch: ") + Arch,
+ inconvertibleErrorCode());
+ }
+
+ std::string ErrMsg;
+ auto *TheTarget = TargetRegistry::lookupTarget(TT.getTriple(), ErrMsg);
+ if (!TheTarget)
+ return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
+
+ auto *TM =
+ TheTarget->createTargetMachine(TT.getTriple(), CPU, Features.getString(),
+ Options, RM, CM, OptLevel, /*JIT*/ true);
+ if (!TM)
+ return make_error<StringError>("Could not allocate target machine",
+ inconvertibleErrorCode());
+
+ return std::unique_ptr<TargetMachine>(TM);
+}
+
+JITTargetMachineBuilder &JITTargetMachineBuilder::addFeatures(
+ const std::vector<std::string> &FeatureVec) {
+ for (const auto &F : FeatureVec)
+ Features.AddFeature(F);
+ return *this;
+}
+
CtorDtorIterator::CtorDtorIterator(const GlobalVariable *GV, bool End)
: InitList(
GV ? dyn_cast_or_null<ConstantArray>(GV->getInitializer()) : nullptr),
@@ -67,7 +108,9 @@ CtorDtorIterator::Element CtorDtorIterator::operator*() const {
}
ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
- Value *Data = CS->getOperand(2);
+ Value *Data = CS->getNumOperands() == 3 ? CS->getOperand(2) : nullptr;
+ if (Data && !isa<GlobalValue>(Data))
+ Data = nullptr;
return Element(Priority->getZExtValue(), Func, Data);
}
@@ -83,20 +126,123 @@ iterator_range<CtorDtorIterator> getDestructors(const Module &M) {
CtorDtorIterator(DtorsList, true));
}
-void LocalCXXRuntimeOverrides::runDestructors() {
+void CtorDtorRunner2::add(iterator_range<CtorDtorIterator> CtorDtors) {
+ if (CtorDtors.begin() == CtorDtors.end())
+ return;
+
+ MangleAndInterner Mangle(
+ V.getExecutionSession(),
+ (*CtorDtors.begin()).Func->getParent()->getDataLayout());
+
+ for (const auto &CtorDtor : CtorDtors) {
+ assert(CtorDtor.Func && CtorDtor.Func->hasName() &&
+ "Ctor/Dtor function must be named to be runnable under the JIT");
+
+ if (CtorDtor.Data && cast<GlobalValue>(CtorDtor.Data)->isDeclaration()) {
+ // Associated data is only a declaration in this module; skip this entry.
+ continue;
+ }
+
+ CtorDtorsByPriority[CtorDtor.Priority].push_back(
+ Mangle(CtorDtor.Func->getName()));
+ }
+}
+
+Error CtorDtorRunner2::run() {
+ using CtorDtorTy = void (*)();
+
+ SymbolNameSet Names;
+
+ for (auto &KV : CtorDtorsByPriority) {
+ for (auto &Name : KV.second) {
+ auto Added = Names.insert(Name).second;
+ (void)Added;
+ assert(Added && "Ctor/Dtor names clashed");
+ }
+ }
+
+ if (auto CtorDtorMap = lookup({&V}, std::move(Names))) {
+ for (auto &KV : CtorDtorsByPriority) {
+ for (auto &Name : KV.second) {
+ assert(CtorDtorMap->count(Name) && "No entry for Name");
+ auto CtorDtor = reinterpret_cast<CtorDtorTy>(
+ static_cast<uintptr_t>((*CtorDtorMap)[Name].getAddress()));
+ CtorDtor();
+ }
+ }
+ } else
+ return CtorDtorMap.takeError();
+
+ CtorDtorsByPriority.clear();
+
+ return Error::success();
+}
+
+void LocalCXXRuntimeOverridesBase::runDestructors() {
auto& CXXDestructorDataPairs = DSOHandleOverride;
for (auto &P : CXXDestructorDataPairs)
P.first(P.second);
CXXDestructorDataPairs.clear();
}
-int LocalCXXRuntimeOverrides::CXAAtExitOverride(DestructorPtr Destructor,
- void *Arg, void *DSOHandle) {
+int LocalCXXRuntimeOverridesBase::CXAAtExitOverride(DestructorPtr Destructor,
+ void *Arg,
+ void *DSOHandle) {
auto& CXXDestructorDataPairs =
*reinterpret_cast<CXXDestructorDataPairList*>(DSOHandle);
CXXDestructorDataPairs.push_back(std::make_pair(Destructor, Arg));
return 0;
}
+Error LocalCXXRuntimeOverrides2::enable(VSO &V, MangleAndInterner &Mangle) {
+ SymbolMap RuntimeInterposes(
+ {{Mangle("__dso_handle"),
+ JITEvaluatedSymbol(toTargetAddress(&DSOHandleOverride),
+ JITSymbolFlags::Exported)},
+ {Mangle("__cxa_atexit"),
+ JITEvaluatedSymbol(toTargetAddress(&CXAAtExitOverride),
+ JITSymbolFlags::Exported)}});
+
+ return V.define(absoluteSymbols(std::move(RuntimeInterposes)));
+}
+
+DynamicLibraryFallbackGenerator::DynamicLibraryFallbackGenerator(
+ sys::DynamicLibrary Dylib, const DataLayout &DL, SymbolPredicate Allow)
+ : Dylib(std::move(Dylib)), Allow(std::move(Allow)),
+ GlobalPrefix(DL.getGlobalPrefix()) {}
+
+SymbolNameSet DynamicLibraryFallbackGenerator::
+operator()(VSO &V, const SymbolNameSet &Names) {
+ orc::SymbolNameSet Added;
+ orc::SymbolMap NewSymbols;
+
+ bool HasGlobalPrefix = (GlobalPrefix != '\0');
+
+ for (auto &Name : Names) {
+ if (!Allow(Name) || (*Name).empty())
+ continue;
+
+ if (HasGlobalPrefix && (*Name).front() != GlobalPrefix)
+ continue;
+
+ std::string Tmp((*Name).data() + (HasGlobalPrefix ? 1 : 0), (*Name).size());
+ if (void *Addr = Dylib.getAddressOfSymbol(Tmp.c_str())) {
+ Added.insert(Name);
+ NewSymbols[Name] = JITEvaluatedSymbol(
+ static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(Addr)),
+ JITSymbolFlags::Exported);
+ }
+ }
+
+ // Add any new symbols to V. Since the fallback generator is only called for
+ // symbols that are not already defined, this will never trigger a duplicate
+ // definition error, so we can wrap this call in a 'cantFail'.
+ if (!NewSymbols.empty())
+ cantFail(V.define(absoluteSymbols(std::move(NewSymbols))));
+
+ return Added;
+}
+
} // End namespace orc.
} // End namespace llvm.
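A minimal sketch (not part of the patch) of the JITTargetMachineBuilder and DynamicLibraryFallbackGenerator additions above. The VSO V is assumed to exist, the predicate admits every symbol, and attaching the generator via setFallbackDefinitionGenerator is an assumption about the VSO interface rather than something shown in this file.

    auto JTMB = cantFail(JITTargetMachineBuilder::detectHost());
    auto TM = cantFail(JTMB.createTargetMachine());
    DataLayout DL = TM->createDataLayout();

    // Resolve otherwise-undefined symbols from the host process.
    auto ProcessLib = sys::DynamicLibrary::getPermanentLibrary(nullptr);
    V.setFallbackDefinitionGenerator(DynamicLibraryFallbackGenerator(
        std::move(ProcessLib), DL, [](SymbolStringPtr) { return true; }));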
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
new file mode 100644
index 000000000000..0c17f9b7ad49
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
@@ -0,0 +1,44 @@
+//===--------------- IRCompileLayer.cpp - IR Compiling Layer --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+
+namespace llvm {
+namespace orc {
+
+IRCompileLayer2::IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
+ CompileFunction Compile)
+ : IRLayer(ES), BaseLayer(BaseLayer), Compile(std::move(Compile)) {}
+
+void IRCompileLayer2::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) {
+ std::lock_guard<std::mutex> Lock(IRLayerMutex);
+ this->NotifyCompiled = std::move(NotifyCompiled);
+}
+
+void IRCompileLayer2::emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<Module> M) {
+ assert(M && "Module must not be null");
+
+ if (auto Obj = Compile(*M)) {
+ {
+ std::lock_guard<std::mutex> Lock(IRLayerMutex);
+ if (NotifyCompiled)
+ NotifyCompiled(K, std::move(M));
+ else
+ M = nullptr;
+ }
+ BaseLayer.emit(std::move(R), std::move(K), std::move(*Obj));
+ } else {
+ R.failMaterialization();
+ getExecutionSession().reportError(Obj.takeError());
+ }
+}
+
+} // End namespace orc.
+} // End namespace llvm.
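A minimal sketch (not part of the patch) of stacking IRCompileLayer2 on an existing object layer, mirroring the wiring LLJIT uses further below. ES, ObjLayer, TM, V and M are assumed to already exist.

    IRCompileLayer2 CompileLayer(ES, ObjLayer, SimpleCompiler(*TM));
    CompileLayer.setNotifyCompiled(
        [](VModuleKey K, std::unique_ptr<Module> M) {
          // Receives ownership of the module once it has been compiled,
          // e.g. to feed an object cache or debugger integration.
        });
    cantFail(CompileLayer.add(V, ES.allocateVModule(), std::move(M)));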
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
new file mode 100644
index 000000000000..4dd3cfdfe387
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
@@ -0,0 +1,34 @@
+//===-------------- IRTransformLayer.cpp - IR Transform Layer -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+namespace orc {
+
+IRTransformLayer2::IRTransformLayer2(ExecutionSession &ES,
+ IRLayer &BaseLayer,
+ TransformFunction Transform)
+ : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
+
+void IRTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<Module> M) {
+ assert(M && "Module must not be null");
+
+ if (auto TransformedMod = Transform(std::move(M)))
+ BaseLayer.emit(std::move(R), std::move(K), std::move(*TransformedMod));
+ else {
+ R.failMaterialization();
+ getExecutionSession().reportError(TransformedMod.takeError());
+ }
+}
+
+} // End namespace orc.
+} // End namespace llvm.
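A minimal sketch (not part of the patch): an identity transform stacked on CompileLayer. Returning an Error instead of the module causes failMaterialization to be called, exactly as in emit above.

    IRTransformLayer2 TransformLayer(
        ES, CompileLayer,
        [](std::unique_ptr<Module> M) -> Expected<std::unique_ptr<Module>> {
          // Run IR passes or gather statistics here before compilation.
          return std::move(M);
        });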
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index 68397beae63a..9ca2c5cb4a55 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -13,38 +13,123 @@
#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Format.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace {
+
+class CompileCallbackMaterializationUnit : public orc::MaterializationUnit {
+public:
+ using CompileFunction = JITCompileCallbackManager::CompileFunction;
+
+ CompileCallbackMaterializationUnit(SymbolStringPtr Name,
+ CompileFunction Compile)
+ : MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}})),
+ Name(std::move(Name)), Compile(std::move(Compile)) {}
+
+private:
+ void materialize(MaterializationResponsibility R) {
+ SymbolMap Result;
+ Result[Name] = JITEvaluatedSymbol(Compile(), JITSymbolFlags::Exported);
+ R.resolve(Result);
+ R.finalize();
+ }
+
+ void discard(const VSO &V, SymbolStringPtr Name) {
+ llvm_unreachable("Discard should never occur on a LMU?");
+ }
+
+ SymbolStringPtr Name;
+ CompileFunction Compile;
+};
+
+} // namespace
+
namespace llvm {
namespace orc {
void JITCompileCallbackManager::anchor() {}
void IndirectStubsManager::anchor() {}
+Expected<JITTargetAddress>
+JITCompileCallbackManager::getCompileCallback(CompileFunction Compile) {
+ if (auto TrampolineAddr = getAvailableTrampolineAddr()) {
+ auto CallbackName = ES.getSymbolStringPool().intern(
+ std::string("cc") + std::to_string(++NextCallbackId));
+
+ std::lock_guard<std::mutex> Lock(CCMgrMutex);
+ AddrToSymbol[*TrampolineAddr] = CallbackName;
+ cantFail(CallbacksVSO.define(
+ llvm::make_unique<CompileCallbackMaterializationUnit>(
+ std::move(CallbackName), std::move(Compile))));
+ return *TrampolineAddr;
+ } else
+ return TrampolineAddr.takeError();
+}
+
+JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
+ JITTargetAddress TrampolineAddr) {
+ SymbolStringPtr Name;
+
+ {
+ std::unique_lock<std::mutex> Lock(CCMgrMutex);
+ auto I = AddrToSymbol.find(TrampolineAddr);
+
+ // If this address is not associated with a compile callback then report an
+ // error to the execution session and return ErrorHandlerAddress to the
+ // callee.
+ if (I == AddrToSymbol.end()) {
+ Lock.unlock();
+ std::string ErrMsg;
+ {
+ raw_string_ostream ErrMsgStream(ErrMsg);
+ ErrMsgStream << "No compile callback for trampoline at "
+ << format("0x%016x", TrampolineAddr);
+ }
+ ES.reportError(
+ make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode()));
+ return ErrorHandlerAddress;
+ } else
+ Name = I->second;
+ }
+
+ if (auto Sym = lookup({&CallbacksVSO}, Name))
+ return Sym->getAddress();
+ else {
+ // If anything goes wrong materializing Sym then report it to the session
+ // and return the ErrorHandlerAddress.
+ ES.reportError(Sym.takeError());
+ return ErrorHandlerAddress;
+ }
+}
+
std::unique_ptr<JITCompileCallbackManager>
-createLocalCompileCallbackManager(const Triple &T,
+createLocalCompileCallbackManager(const Triple &T, ExecutionSession &ES,
JITTargetAddress ErrorHandlerAddress) {
switch (T.getArch()) {
default: return nullptr;
case Triple::aarch64: {
typedef orc::LocalJITCompileCallbackManager<orc::OrcAArch64> CCMgrT;
- return llvm::make_unique<CCMgrT>(ErrorHandlerAddress);
+ return llvm::make_unique<CCMgrT>(ES, ErrorHandlerAddress);
}
case Triple::x86: {
typedef orc::LocalJITCompileCallbackManager<orc::OrcI386> CCMgrT;
- return llvm::make_unique<CCMgrT>(ErrorHandlerAddress);
+ return llvm::make_unique<CCMgrT>(ES, ErrorHandlerAddress);
}
case Triple::x86_64: {
if ( T.getOS() == Triple::OSType::Win32 ) {
typedef orc::LocalJITCompileCallbackManager<orc::OrcX86_64_Win32> CCMgrT;
- return llvm::make_unique<CCMgrT>(ErrorHandlerAddress);
+ return llvm::make_unique<CCMgrT>(ES, ErrorHandlerAddress);
} else {
typedef orc::LocalJITCompileCallbackManager<orc::OrcX86_64_SysV> CCMgrT;
- return llvm::make_unique<CCMgrT>(ErrorHandlerAddress);
+ return llvm::make_unique<CCMgrT>(ES, ErrorHandlerAddress);
}
}
@@ -54,7 +139,11 @@ createLocalCompileCallbackManager(const Triple &T,
std::function<std::unique_ptr<IndirectStubsManager>()>
createLocalIndirectStubsManagerBuilder(const Triple &T) {
switch (T.getArch()) {
- default: return nullptr;
+ default:
+ return [](){
+ return llvm::make_unique<
+ orc::LocalIndirectStubsManager<orc::OrcGenericABI>>();
+ };
case Triple::aarch64:
return [](){
@@ -176,7 +265,6 @@ void makeAllSymbolsExternallyAccessible(Module &M) {
Function* cloneFunctionDecl(Module &Dst, const Function &F,
ValueToValueMapTy *VMap) {
- assert(F.getParent() != &Dst && "Can't copy decl over existing function.");
Function *NewF =
Function::Create(cast<FunctionType>(F.getValueType()),
F.getLinkage(), F.getName(), &Dst);
@@ -214,7 +302,6 @@ void moveFunctionBody(Function &OrigF, ValueToValueMapTy &VMap,
GlobalVariable* cloneGlobalVariableDecl(Module &Dst, const GlobalVariable &GV,
ValueToValueMapTy *VMap) {
- assert(GV.getParent() != &Dst && "Can't copy decl over existing global var.");
GlobalVariable *NewGV = new GlobalVariable(
Dst, GV.getValueType(), GV.isConstant(),
GV.getLinkage(), nullptr, GV.getName(), nullptr,
@@ -236,8 +323,8 @@ void moveGlobalVariableInitializer(GlobalVariable &OrigGV,
assert(VMap[&OrigGV] == NewGV &&
"Incorrect global variable mapping in VMap.");
assert(NewGV->getParent() != OrigGV.getParent() &&
- "moveGlobalVariable should only be used to move initializers between "
- "modules");
+ "moveGlobalVariableInitializer should only be used to move "
+ "initializers between modules");
NewGV->setInitializer(MapValue(OrigGV.getInitializer(), VMap, RF_None,
nullptr, Materializer));
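A minimal sketch (not part of the patch) of the new callback API: request a compile callback from the manager and point an indirect stub at it. ES, the triple T, and an IndirectStubsManager ISM are assumed to exist; the callback body and stub name are placeholders.

    auto CCMgr = createLocalCompileCallbackManager(T, ES,
                                                   /*ErrorHandlerAddress*/ 0);
    auto CallbackAddr = cantFail(CCMgr->getCompileCallback(
        []() -> JITTargetAddress {
          // Compile the real function here and return its address; the stub
          // re-enters this callback until it is updated to that address.
          return 0;
        }));
    cantFail(ISM->createStub("foo$stub", CallbackAddr,
                             JITSymbolFlags::Exported));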
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
new file mode 100644
index 000000000000..52ff4efe56b2
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -0,0 +1,134 @@
+//===--------- LLJIT.cpp - An ORC-based JIT for compiling LLVM IR ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/ExecutionEngine/Orc/OrcError.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/Mangler.h"
+
+namespace llvm {
+namespace orc {
+
+Expected<std::unique_ptr<LLJIT>>
+LLJIT::Create(std::unique_ptr<ExecutionSession> ES,
+ std::unique_ptr<TargetMachine> TM, DataLayout DL) {
+ return std::unique_ptr<LLJIT>(
+ new LLJIT(std::move(ES), std::move(TM), std::move(DL)));
+}
+
+Error LLJIT::defineAbsolute(StringRef Name, JITEvaluatedSymbol Sym) {
+ auto InternedName = ES->getSymbolStringPool().intern(Name);
+ SymbolMap Symbols({{InternedName, Sym}});
+ return Main.define(absoluteSymbols(std::move(Symbols)));
+}
+
+Error LLJIT::addIRModule(VSO &V, std::unique_ptr<Module> M) {
+ assert(M && "Can not add null module");
+
+ if (auto Err = applyDataLayout(*M))
+ return Err;
+
+ auto K = ES->allocateVModule();
+ return CompileLayer.add(V, K, std::move(M));
+}
+
+Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(VSO &V,
+ StringRef Name) {
+ return llvm::orc::lookup({&V}, ES->getSymbolStringPool().intern(Name));
+}
+
+LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
+ std::unique_ptr<TargetMachine> TM, DataLayout DL)
+ : ES(std::move(ES)), Main(this->ES->createVSO("main")), TM(std::move(TM)),
+ DL(std::move(DL)),
+ ObjLinkingLayer(*this->ES,
+ [this](VModuleKey K) { return getMemoryManager(K); }),
+ CompileLayer(*this->ES, ObjLinkingLayer, SimpleCompiler(*this->TM)),
+ CtorRunner(Main), DtorRunner(Main) {}
+
+std::shared_ptr<RuntimeDyld::MemoryManager>
+LLJIT::getMemoryManager(VModuleKey K) {
+ return llvm::make_unique<SectionMemoryManager>();
+}
+
+std::string LLJIT::mangle(StringRef UnmangledName) {
+ std::string MangledName;
+ {
+ raw_string_ostream MangledNameStream(MangledName);
+ Mangler::getNameWithPrefix(MangledNameStream, UnmangledName, DL);
+ }
+ return MangledName;
+}
+
+Error LLJIT::applyDataLayout(Module &M) {
+ if (M.getDataLayout().isDefault())
+ M.setDataLayout(DL);
+
+ if (M.getDataLayout() != DL)
+ return make_error<StringError>(
+ "Added modules have incompatible data layouts",
+ inconvertibleErrorCode());
+
+ return Error::success();
+}
+
+void LLJIT::recordCtorDtors(Module &M) {
+ CtorRunner.add(getConstructors(M));
+ DtorRunner.add(getDestructors(M));
+}
+
+Expected<std::unique_ptr<LLLazyJIT>>
+LLLazyJIT::Create(std::unique_ptr<ExecutionSession> ES,
+ std::unique_ptr<TargetMachine> TM, DataLayout DL,
+ LLVMContext &Ctx) {
+ const Triple &TT = TM->getTargetTriple();
+
+ auto CCMgr = createLocalCompileCallbackManager(TT, *ES, 0);
+ if (!CCMgr)
+ return make_error<StringError>(
+ std::string("No callback manager available for ") + TT.str(),
+ inconvertibleErrorCode());
+
+ auto ISMBuilder = createLocalIndirectStubsManagerBuilder(TT);
+ if (!ISMBuilder)
+ return make_error<StringError>(
+ std::string("No indirect stubs manager builder for ") + TT.str(),
+ inconvertibleErrorCode());
+
+ return std::unique_ptr<LLLazyJIT>(
+ new LLLazyJIT(std::move(ES), std::move(TM), std::move(DL), Ctx,
+ std::move(CCMgr), std::move(ISMBuilder)));
+}
+
+Error LLLazyJIT::addLazyIRModule(VSO &V, std::unique_ptr<Module> M) {
+ assert(M && "Can not add null module");
+
+ if (auto Err = applyDataLayout(*M))
+ return Err;
+
+ makeAllSymbolsExternallyAccessible(*M);
+
+ recordCtorDtors(*M);
+
+ auto K = ES->allocateVModule();
+ return CODLayer.add(V, K, std::move(M));
+}
+
+LLLazyJIT::LLLazyJIT(
+ std::unique_ptr<ExecutionSession> ES, std::unique_ptr<TargetMachine> TM,
+ DataLayout DL, LLVMContext &Ctx,
+ std::unique_ptr<JITCompileCallbackManager> CCMgr,
+ std::function<std::unique_ptr<IndirectStubsManager>()> ISMBuilder)
+ : LLJIT(std::move(ES), std::move(TM), std::move(DL)),
+ CCMgr(std::move(CCMgr)), TransformLayer(*this->ES, CompileLayer),
+ CODLayer(*this->ES, TransformLayer, *this->CCMgr, std::move(ISMBuilder),
+ [&]() -> LLVMContext & { return Ctx; }) {}
+
+} // End namespace orc.
+} // End namespace llvm.
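A minimal end-to-end sketch (not part of the patch) of driving the new LLJIT class. It assumes the accompanying LLJIT.h exposes the main VSO via getMainVSO() and a lookup(StringRef) convenience that applies mangle(); M is a Module built elsewhere, and cantFail stands in for real error handling.

    auto ES = llvm::make_unique<ExecutionSession>();
    auto JTMB = cantFail(JITTargetMachineBuilder::detectHost());
    auto TM = cantFail(JTMB.createTargetMachine());
    DataLayout DL = TM->createDataLayout();

    auto J = cantFail(LLJIT::Create(std::move(ES), std::move(TM), DL));
    cantFail(J->addIRModule(J->getMainVSO(), std::move(M)));

    auto MainSym = cantFail(J->lookup("main"));
    auto *MainFn = reinterpret_cast<int (*)(int, char **)>(
        static_cast<uintptr_t>(MainSym.getAddress()));
    MainFn(0, nullptr);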
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/Layer.cpp
new file mode 100644
index 000000000000..b9da3b7fb8d5
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/Layer.cpp
@@ -0,0 +1,106 @@
+//===-------------------- Layer.cpp - Layer interfaces --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Layer.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+namespace orc {
+
+IRLayer::IRLayer(ExecutionSession &ES) : ES(ES) {}
+IRLayer::~IRLayer() {}
+
+Error IRLayer::add(VSO &V, VModuleKey K, std::unique_ptr<Module> M) {
+ return V.define(llvm::make_unique<BasicIRLayerMaterializationUnit>(
+ *this, std::move(K), std::move(M)));
+}
+
+IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES,
+ std::unique_ptr<Module> M)
+ : MaterializationUnit(SymbolFlagsMap()), M(std::move(M)) {
+
+ MangleAndInterner Mangle(ES, this->M->getDataLayout());
+ for (auto &G : this->M->global_values()) {
+ if (G.hasName() && !G.isDeclaration() && !G.hasLocalLinkage() &&
+ !G.hasAvailableExternallyLinkage() && !G.hasAppendingLinkage()) {
+ auto MangledName = Mangle(G.getName());
+ SymbolFlags[MangledName] = JITSymbolFlags::fromGlobalValue(G);
+ SymbolToDefinition[MangledName] = &G;
+ }
+ }
+}
+
+IRMaterializationUnit::IRMaterializationUnit(
+ std::unique_ptr<Module> M, SymbolFlagsMap SymbolFlags,
+ SymbolNameToDefinitionMap SymbolToDefinition)
+ : MaterializationUnit(std::move(SymbolFlags)), M(std::move(M)),
+ SymbolToDefinition(std::move(SymbolToDefinition)) {}
+
+void IRMaterializationUnit::discard(const VSO &V, SymbolStringPtr Name) {
+ auto I = SymbolToDefinition.find(Name);
+ assert(I != SymbolToDefinition.end() &&
+ "Symbol not provided by this MU, or previously discarded");
+ assert(!I->second->isDeclaration() &&
+ "Discard should only apply to definitions");
+ I->second->setLinkage(GlobalValue::AvailableExternallyLinkage);
+ SymbolToDefinition.erase(I);
+}
+
+BasicIRLayerMaterializationUnit::BasicIRLayerMaterializationUnit(
+ IRLayer &L, VModuleKey K, std::unique_ptr<Module> M)
+ : IRMaterializationUnit(L.getExecutionSession(), std::move(M)),
+ L(L), K(std::move(K)) {}
+
+void BasicIRLayerMaterializationUnit::materialize(
+ MaterializationResponsibility R) {
+ L.emit(std::move(R), std::move(K), std::move(M));
+}
+
+ObjectLayer::ObjectLayer(ExecutionSession &ES) : ES(ES) {}
+
+ObjectLayer::~ObjectLayer() {}
+
+Error ObjectLayer::add(VSO &V, VModuleKey K, std::unique_ptr<MemoryBuffer> O) {
+ return V.define(llvm::make_unique<BasicObjectLayerMaterializationUnit>(
+ *this, std::move(K), std::move(O)));
+}
+
+BasicObjectLayerMaterializationUnit::BasicObjectLayerMaterializationUnit(
+ ObjectLayer &L, VModuleKey K, std::unique_ptr<MemoryBuffer> O)
+ : MaterializationUnit(SymbolFlagsMap()), L(L), K(std::move(K)),
+ O(std::move(O)) {
+
+ auto &ES = L.getExecutionSession();
+ auto Obj = cantFail(
+ object::ObjectFile::createObjectFile(this->O->getMemBufferRef()));
+
+ for (auto &Sym : Obj->symbols()) {
+ if (!(Sym.getFlags() & object::BasicSymbolRef::SF_Undefined) &&
+ (Sym.getFlags() & object::BasicSymbolRef::SF_Exported)) {
+ auto InternedName =
+ ES.getSymbolStringPool().intern(cantFail(Sym.getName()));
+ SymbolFlags[InternedName] = JITSymbolFlags::fromObjectSymbol(Sym);
+ }
+ }
+}
+
+void BasicObjectLayerMaterializationUnit::materialize(
+ MaterializationResponsibility R) {
+ L.emit(std::move(R), std::move(K), std::move(O));
+}
+
+void BasicObjectLayerMaterializationUnit::discard(const VSO &V,
+ SymbolStringPtr Name) {
+ // FIXME: Support object file level discard. This could be done by building a
+ // filter to pass to the object layer along with the object itself.
+}
+
+} // End namespace orc.
+} // End namespace llvm.
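A minimal sketch (not part of the patch) of the generic add() entry points defined above. Both wrap their payload in the matching MaterializationUnit and defer real work until one of the contained symbols is looked up. CompileLayer, LinkLayer, ES, V and Mod are assumed to exist; the object file path is illustrative.

    cantFail(CompileLayer.add(V, ES.allocateVModule(), std::move(Mod)));

    auto ObjBuf = cantFail(
        errorOrToExpected(MemoryBuffer::getFile("precompiled.o")));
    cantFail(LinkLayer.add(V, ES.allocateVModule(), std::move(ObjBuf)));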
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/Legacy.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/Legacy.cpp
new file mode 100644
index 000000000000..18be9a042f7f
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/Legacy.cpp
@@ -0,0 +1,68 @@
+//===------- Legacy.cpp - Adapters for ExecutionEngine API interop --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Legacy.h"
+
+namespace llvm {
+namespace orc {
+
+void SymbolResolver::anchor() {}
+
+JITSymbolResolverAdapter::JITSymbolResolverAdapter(
+ ExecutionSession &ES, SymbolResolver &R, MaterializationResponsibility *MR)
+ : ES(ES), R(R), MR(MR) {}
+
+Expected<JITSymbolResolverAdapter::LookupResult>
+JITSymbolResolverAdapter::lookup(const LookupSet &Symbols) {
+ SymbolNameSet InternedSymbols;
+ for (auto &S : Symbols)
+ InternedSymbols.insert(ES.getSymbolStringPool().intern(S));
+
+ auto LookupFn = [&, this](std::shared_ptr<AsynchronousSymbolQuery> Q,
+ SymbolNameSet Unresolved) {
+ return R.lookup(std::move(Q), std::move(Unresolved));
+ };
+
+ auto RegisterDependencies = [&](const SymbolDependenceMap &Deps) {
+ if (MR)
+ MR->addDependenciesForAll(Deps);
+ };
+
+ auto InternedResult =
+ ES.legacyLookup(ES, std::move(LookupFn), std::move(InternedSymbols),
+ false, RegisterDependencies);
+
+ if (!InternedResult)
+ return InternedResult.takeError();
+
+ JITSymbolResolver::LookupResult Result;
+ for (auto &KV : *InternedResult)
+ Result[*KV.first] = KV.second;
+
+ return Result;
+}
+
+Expected<JITSymbolResolverAdapter::LookupFlagsResult>
+JITSymbolResolverAdapter::lookupFlags(const LookupSet &Symbols) {
+ SymbolNameSet InternedSymbols;
+ for (auto &S : Symbols)
+ InternedSymbols.insert(ES.getSymbolStringPool().intern(S));
+
+ SymbolFlagsMap SymbolFlags = R.lookupFlags(InternedSymbols);
+ LookupFlagsResult Result;
+ for (auto &KV : SymbolFlags) {
+ ResolvedStrings.insert(KV.first);
+ Result[*KV.first] = KV.second;
+ }
+
+ return Result;
+}
+
+} // End namespace orc.
+} // End namespace llvm.
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/NullResolver.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/NullResolver.cpp
index 8f2d6fd6c32b..3796e3d37bc2 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/NullResolver.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/NullResolver.cpp
@@ -14,11 +14,23 @@
namespace llvm {
namespace orc {
-JITSymbol NullResolver::findSymbol(const std::string &Name) {
+SymbolFlagsMap NullResolver::lookupFlags(const SymbolNameSet &Symbols) {
+ return SymbolFlagsMap();
+}
+
+SymbolNameSet
+NullResolver::lookup(std::shared_ptr<AsynchronousSymbolQuery> Query,
+ SymbolNameSet Symbols) {
+ assert(Symbols.empty() && "Null resolver: Symbols must be empty");
+ return Symbols;
+}
+
+JITSymbol NullLegacyResolver::findSymbol(const std::string &Name) {
llvm_unreachable("Unexpected cross-object symbol reference");
}
-JITSymbol NullResolver::findSymbolInLogicalDylib(const std::string &Name) {
+JITSymbol
+NullLegacyResolver::findSymbolInLogicalDylib(const std::string &Name) {
llvm_unreachable("Unexpected cross-object symbol reference");
}
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
new file mode 100644
index 000000000000..6980c8140fd0
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
@@ -0,0 +1,34 @@
+//===---------- ObjectTransformLayer.cpp - Object Transform Layer ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+namespace orc {
+
+ObjectTransformLayer2::ObjectTransformLayer2(ExecutionSession &ES,
+ ObjectLayer &BaseLayer,
+ TransformFunction Transform)
+ : ObjectLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
+
+void ObjectTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K,
+ std::unique_ptr<MemoryBuffer> O) {
+ assert(O && "Object must not be null");
+
+ if (auto TransformedObj = Transform(std::move(O)))
+ BaseLayer.emit(std::move(R), std::move(K), std::move(*TransformedObj));
+ else {
+ R.failMaterialization();
+ getExecutionSession().reportError(TransformedObj.takeError());
+ }
+}
+
+} // End namespace orc.
+} // End namespace llvm.
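A minimal sketch (not part of the patch): an object transform that passes buffers through unchanged, with a hook point for dumping or instrumenting them. ES and LinkLayer are assumed to exist.

    ObjectTransformLayer2 DumpLayer(
        ES, LinkLayer,
        [](std::unique_ptr<MemoryBuffer> Obj)
            -> Expected<std::unique_ptr<MemoryBuffer>> {
          // Inspect or persist Obj->getBuffer() here before linking.
          return std::move(Obj);
        });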
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp
index f945acaf95ee..d6005d24a648 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp
@@ -9,28 +9,20 @@
#include "OrcCBindingsStack.h"
#include "llvm-c/OrcBindings.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
using namespace llvm;
-LLVMSharedModuleRef LLVMOrcMakeSharedModule(LLVMModuleRef Mod) {
- return wrap(new std::shared_ptr<Module>(unwrap(Mod)));
-}
-
-void LLVMOrcDisposeSharedModuleRef(LLVMSharedModuleRef SharedMod) {
- delete unwrap(SharedMod);
-}
-
LLVMOrcJITStackRef LLVMOrcCreateInstance(LLVMTargetMachineRef TM) {
TargetMachine *TM2(unwrap(TM));
Triple T(TM2->getTargetTriple());
- auto CompileCallbackMgr = orc::createLocalCompileCallbackManager(T, 0);
auto IndirectStubsMgrBuilder =
orc::createLocalIndirectStubsManagerBuilder(T);
- OrcCBindingsStack *JITStack = new OrcCBindingsStack(
- *TM2, std::move(CompileCallbackMgr), IndirectStubsMgrBuilder);
+ OrcCBindingsStack *JITStack =
+ new OrcCBindingsStack(*TM2, std::move(IndirectStubsMgrBuilder));
return wrap(JITStack);
}
@@ -75,24 +67,24 @@ LLVMOrcErrorCode LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack,
LLVMOrcErrorCode
LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack,
- LLVMOrcModuleHandle *RetHandle,
- LLVMSharedModuleRef Mod,
+ LLVMOrcModuleHandle *RetHandle, LLVMModuleRef Mod,
LLVMOrcSymbolResolverFn SymbolResolver,
void *SymbolResolverCtx) {
OrcCBindingsStack &J = *unwrap(JITStack);
- std::shared_ptr<Module> *M(unwrap(Mod));
- return J.addIRModuleEager(*RetHandle, *M, SymbolResolver, SymbolResolverCtx);
+ std::unique_ptr<Module> M(unwrap(Mod));
+ return J.addIRModuleEager(*RetHandle, std::move(M), SymbolResolver,
+ SymbolResolverCtx);
}
LLVMOrcErrorCode
LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack,
- LLVMOrcModuleHandle *RetHandle,
- LLVMSharedModuleRef Mod,
+ LLVMOrcModuleHandle *RetHandle, LLVMModuleRef Mod,
LLVMOrcSymbolResolverFn SymbolResolver,
void *SymbolResolverCtx) {
OrcCBindingsStack &J = *unwrap(JITStack);
- std::shared_ptr<Module> *M(unwrap(Mod));
- return J.addIRModuleLazy(*RetHandle, *M, SymbolResolver, SymbolResolverCtx);
+ std::unique_ptr<Module> M(unwrap(Mod));
+ return J.addIRModuleLazy(*RetHandle, std::move(M), SymbolResolver,
+ SymbolResolverCtx);
}
LLVMOrcErrorCode
@@ -120,9 +112,27 @@ LLVMOrcErrorCode LLVMOrcGetSymbolAddress(LLVMOrcJITStackRef JITStack,
return J.findSymbolAddress(*RetAddr, SymbolName, true);
}
+LLVMOrcErrorCode LLVMOrcGetSymbolAddressIn(LLVMOrcJITStackRef JITStack,
+ LLVMOrcTargetAddress *RetAddr,
+ LLVMOrcModuleHandle H,
+ const char *SymbolName) {
+ OrcCBindingsStack &J = *unwrap(JITStack);
+ return J.findSymbolAddressIn(*RetAddr, H, SymbolName, true);
+}
+
LLVMOrcErrorCode LLVMOrcDisposeInstance(LLVMOrcJITStackRef JITStack) {
auto *J = unwrap(JITStack);
auto Err = J->shutdown();
delete J;
return Err;
}
+
+void LLVMOrcRegisterJITEventListener(LLVMOrcJITStackRef JITStack, LLVMJITEventListenerRef L)
+{
+ unwrap(JITStack)->RegisterJITEventListener(unwrap(L));
+}
+
+void LLVMOrcUnregisterJITEventListener(LLVMOrcJITStackRef JITStack, LLVMJITEventListenerRef L)
+{
+ unwrap(JITStack)->UnregisterJITEventListener(unwrap(L));
+}
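A minimal sketch (not part of the patch) of the revised C API: modules are now handed over as plain LLVMModuleRef values (ownership transfers to the JIT) and symbols can be resolved within a specific module handle. TM, Mod and MyResolver are assumed to exist; error codes are left unchecked for brevity.

    LLVMOrcJITStackRef JIT = LLVMOrcCreateInstance(TM);

    LLVMOrcModuleHandle H;
    LLVMOrcAddEagerlyCompiledIR(JIT, &H, Mod, MyResolver, NULL);

    LLVMOrcTargetAddress Addr = 0;
    LLVMOrcGetSymbolAddressIn(JIT, &Addr, H, "main");

    LLVMOrcDisposeInstance(JIT);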
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index 405970e063d8..b9f8a370d2f0 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -15,6 +15,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
@@ -33,6 +34,7 @@
#include <algorithm>
#include <cstdint>
#include <functional>
+#include <map>
#include <memory>
#include <set>
#include <string>
@@ -42,68 +44,61 @@ namespace llvm {
class OrcCBindingsStack;
-DEFINE_SIMPLE_CONVERSION_FUNCTIONS(std::shared_ptr<Module>,
- LLVMSharedModuleRef)
DEFINE_SIMPLE_CONVERSION_FUNCTIONS(OrcCBindingsStack, LLVMOrcJITStackRef)
DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef)
namespace detail {
+// FIXME: Kill this off once the Layer concept becomes an interface.
+class GenericLayer {
+public:
+ virtual ~GenericLayer() = default;
- class GenericHandle {
- public:
- virtual ~GenericHandle() = default;
-
- virtual JITSymbol findSymbolIn(const std::string &Name,
- bool ExportedSymbolsOnly) = 0;
- virtual Error removeModule() = 0;
+ virtual JITSymbol findSymbolIn(orc::VModuleKey K, const std::string &Name,
+ bool ExportedSymbolsOnly) = 0;
+ virtual Error removeModule(orc::VModuleKey K) = 0;
};
- template <typename LayerT> class GenericHandleImpl : public GenericHandle {
+ template <typename LayerT> class GenericLayerImpl : public GenericLayer {
public:
- GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleHandleT Handle)
- : Layer(Layer), Handle(std::move(Handle)) {}
+ GenericLayerImpl(LayerT &Layer) : Layer(Layer) {}
- JITSymbol findSymbolIn(const std::string &Name,
+ JITSymbol findSymbolIn(orc::VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) override {
- return Layer.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
+ return Layer.findSymbolIn(K, Name, ExportedSymbolsOnly);
}
- Error removeModule() override { return Layer.removeModule(Handle); }
+ Error removeModule(orc::VModuleKey K) override {
+ return Layer.removeModule(K);
+ }
private:
LayerT &Layer;
- typename LayerT::ModuleHandleT Handle;
};
template <>
- class GenericHandleImpl<orc::RTDyldObjectLinkingLayer>
- : public GenericHandle {
+ class GenericLayerImpl<orc::RTDyldObjectLinkingLayer> : public GenericLayer {
private:
using LayerT = orc::RTDyldObjectLinkingLayer;
public:
+ GenericLayerImpl(LayerT &Layer) : Layer(Layer) {}
- GenericHandleImpl(LayerT &Layer, typename LayerT::ObjHandleT Handle)
- : Layer(Layer), Handle(std::move(Handle)) {}
-
- JITSymbol findSymbolIn(const std::string &Name,
+ JITSymbol findSymbolIn(orc::VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) override {
- return Layer.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
+ return Layer.findSymbolIn(K, Name, ExportedSymbolsOnly);
}
- Error removeModule() override { return Layer.removeObject(Handle); }
+ Error removeModule(orc::VModuleKey K) override {
+ return Layer.removeObject(K);
+ }
private:
LayerT &Layer;
- typename LayerT::ObjHandleT Handle;
};
-
- template <typename LayerT, typename HandleT>
- std::unique_ptr<GenericHandleImpl<LayerT>>
- createGenericHandle(LayerT &Layer, HandleT Handle) {
- return llvm::make_unique<GenericHandleImpl<LayerT>>(Layer,
- std::move(Handle));
+ template <typename LayerT>
+ std::unique_ptr<GenericLayerImpl<LayerT>> createGenericLayer(LayerT &Layer) {
+ return llvm::make_unique<GenericLayerImpl<LayerT>>(Layer);
}
} // end namespace detail
@@ -126,20 +121,123 @@ private:
using OwningObject = object::OwningBinary<object::ObjectFile>;
-public:
- using ModuleHandleT = unsigned;
+ class CBindingsResolver : public orc::SymbolResolver {
+ public:
+ CBindingsResolver(OrcCBindingsStack &Stack,
+ LLVMOrcSymbolResolverFn ExternalResolver,
+ void *ExternalResolverCtx)
+ : Stack(Stack), ExternalResolver(std::move(ExternalResolver)),
+ ExternalResolverCtx(std::move(ExternalResolverCtx)) {}
+
+ orc::SymbolFlagsMap
+ lookupFlags(const orc::SymbolNameSet &Symbols) override {
+ orc::SymbolFlagsMap SymbolFlags;
+
+ for (auto &S : Symbols) {
+ if (auto Sym = findSymbol(*S))
+ SymbolFlags[S] = Sym.getFlags();
+ else if (auto Err = Sym.takeError()) {
+ Stack.reportError(std::move(Err));
+ return orc::SymbolFlagsMap();
+ }
+ }
+
+ return SymbolFlags;
+ }
+
+ orc::SymbolNameSet
+ lookup(std::shared_ptr<orc::AsynchronousSymbolQuery> Query,
+ orc::SymbolNameSet Symbols) override {
+ orc::SymbolNameSet UnresolvedSymbols;
+
+ for (auto &S : Symbols) {
+ if (auto Sym = findSymbol(*S)) {
+ if (auto Addr = Sym.getAddress()) {
+ Query->resolve(S, JITEvaluatedSymbol(*Addr, Sym.getFlags()));
+ Query->notifySymbolReady();
+ } else {
+ Stack.ES.legacyFailQuery(*Query, Addr.takeError());
+ return orc::SymbolNameSet();
+ }
+ } else if (auto Err = Sym.takeError()) {
+ Stack.ES.legacyFailQuery(*Query, std::move(Err));
+ return orc::SymbolNameSet();
+ } else
+ UnresolvedSymbols.insert(S);
+ }
+
+ if (Query->isFullyResolved())
+ Query->handleFullyResolved();
+
+ if (Query->isFullyReady())
+ Query->handleFullyReady();
+
+ return UnresolvedSymbols;
+ }
+
+ private:
+ JITSymbol findSymbol(const std::string &Name) {
+ // Search order:
+ // 1. JIT'd symbols.
+ // 2. Runtime overrides.
+ // 3. External resolver (if present).
+
+ if (auto Sym = Stack.CODLayer.findSymbol(Name, true))
+ return Sym;
+ else if (auto Err = Sym.takeError())
+ return Sym.takeError();
+ if (auto Sym = Stack.CXXRuntimeOverrides.searchOverrides(Name))
+ return Sym;
+
+ if (ExternalResolver)
+ return JITSymbol(ExternalResolver(Name.c_str(), ExternalResolverCtx),
+ JITSymbolFlags::Exported);
+
+ return JITSymbol(nullptr);
+ }
+
+ OrcCBindingsStack &Stack;
+ LLVMOrcSymbolResolverFn ExternalResolver;
+ void *ExternalResolverCtx = nullptr;
+ };
+
+public:
OrcCBindingsStack(TargetMachine &TM,
- std::unique_ptr<CompileCallbackMgr> CCMgr,
IndirectStubsManagerBuilder IndirectStubsMgrBuilder)
- : DL(TM.createDataLayout()), IndirectStubsMgr(IndirectStubsMgrBuilder()),
- CCMgr(std::move(CCMgr)),
- ObjectLayer(
- []() {
- return std::make_shared<SectionMemoryManager>();
- }),
+ : CCMgr(createLocalCompileCallbackManager(TM.getTargetTriple(), ES, 0)),
+ DL(TM.createDataLayout()), IndirectStubsMgr(IndirectStubsMgrBuilder()),
+ ObjectLayer(ES,
+ [this](orc::VModuleKey K) {
+ auto ResolverI = Resolvers.find(K);
+ assert(ResolverI != Resolvers.end() &&
+ "No resolver for module K");
+ auto Resolver = std::move(ResolverI->second);
+ Resolvers.erase(ResolverI);
+ return ObjLayerT::Resources{
+ std::make_shared<SectionMemoryManager>(), Resolver};
+ },
+ nullptr,
+ [this](orc::VModuleKey K, const object::ObjectFile &Obj,
+ const RuntimeDyld::LoadedObjectInfo &LoadedObjInfo) {
+ this->notifyFinalized(K, Obj, LoadedObjInfo);
+ },
+ [this](orc::VModuleKey K, const object::ObjectFile &Obj) {
+ this->notifyFreed(K, Obj);
+ }),
CompileLayer(ObjectLayer, orc::SimpleCompiler(TM)),
- CODLayer(CompileLayer,
+ CODLayer(ES, CompileLayer,
+ [this](orc::VModuleKey K) {
+ auto ResolverI = Resolvers.find(K);
+ assert(ResolverI != Resolvers.end() &&
+ "No resolver for module K");
+ return ResolverI->second;
+ },
+ [this](orc::VModuleKey K,
+ std::shared_ptr<orc::SymbolResolver> Resolver) {
+ assert(!Resolvers.count(K) && "Resolver already present");
+ Resolvers[K] = std::move(Resolver);
+ },
[](Function &F) { return std::set<Function *>({&F}); },
*this->CCMgr, std::move(IndirectStubsMgrBuilder), false),
CXXRuntimeOverrides(
@@ -174,15 +272,15 @@ public:
createLazyCompileCallback(JITTargetAddress &RetAddr,
LLVMOrcLazyCompileCallbackFn Callback,
void *CallbackCtx) {
- if (auto CCInfoOrErr = CCMgr->getCompileCallback()) {
- auto &CCInfo = *CCInfoOrErr;
- CCInfo.setCompileAction([=]() -> JITTargetAddress {
- return Callback(wrap(this), CallbackCtx);
- });
- RetAddr = CCInfo.getAddress();
+ auto WrappedCallback = [=]() -> JITTargetAddress {
+ return Callback(wrap(this), CallbackCtx);
+ };
+
+ if (auto CCAddr = CCMgr->getCompileCallback(std::move(WrappedCallback))) {
+ RetAddr = *CCAddr;
return LLVMOrcErrSuccess;
} else
- return mapError(CCInfoOrErr.takeError());
+ return mapError(CCAddr.takeError());
}
LLVMOrcErrorCode createIndirectStub(StringRef StubName,
@@ -195,42 +293,9 @@ public:
JITTargetAddress Addr) {
return mapError(IndirectStubsMgr->updatePointer(Name, Addr));
}
-
- std::shared_ptr<JITSymbolResolver>
- createResolver(LLVMOrcSymbolResolverFn ExternalResolver,
- void *ExternalResolverCtx) {
- return orc::createLambdaResolver(
- [this, ExternalResolver, ExternalResolverCtx](const std::string &Name)
- -> JITSymbol {
- // Search order:
- // 1. JIT'd symbols.
- // 2. Runtime overrides.
- // 3. External resolver (if present).
-
- if (auto Sym = CODLayer.findSymbol(Name, true))
- return Sym;
- else if (auto Err = Sym.takeError())
- return Sym.takeError();
-
- if (auto Sym = CXXRuntimeOverrides.searchOverrides(Name))
- return Sym;
-
- if (ExternalResolver)
- return JITSymbol(
- ExternalResolver(Name.c_str(), ExternalResolverCtx),
- JITSymbolFlags::Exported);
-
- return JITSymbol(nullptr);
- },
- [](const std::string &Name) -> JITSymbol {
- return JITSymbol(nullptr);
- });
- }
-
template <typename LayerT>
LLVMOrcErrorCode
- addIRModule(ModuleHandleT &RetHandle, LayerT &Layer,
- std::shared_ptr<Module> M,
+ addIRModule(orc::VModuleKey &RetKey, LayerT &Layer, std::unique_ptr<Module> M,
std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr,
LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
@@ -247,79 +312,73 @@ public:
for (auto Dtor : orc::getDestructors(*M))
DtorNames.push_back(mangle(Dtor.Func->getName()));
- // Create the resolver.
- auto Resolver = createResolver(ExternalResolver, ExternalResolverCtx);
-
// Add the module to the JIT.
- ModuleHandleT H;
- if (auto LHOrErr = Layer.addModule(std::move(M), std::move(Resolver)))
- H = createHandle(Layer, *LHOrErr);
- else
- return mapError(LHOrErr.takeError());
+ RetKey = ES.allocateVModule();
+ Resolvers[RetKey] = std::make_shared<CBindingsResolver>(
+ *this, ExternalResolver, ExternalResolverCtx);
+ if (auto Err = Layer.addModule(RetKey, std::move(M)))
+ return mapError(std::move(Err));
+
+ KeyLayers[RetKey] = detail::createGenericLayer(Layer);
// Run the static constructors, and save the static destructor runner for
// execution when the JIT is torn down.
- orc::CtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), H);
+ orc::CtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames),
+ RetKey);
if (auto Err = CtorRunner.runViaLayer(*this))
return mapError(std::move(Err));
- IRStaticDestructorRunners.emplace_back(std::move(DtorNames), H);
+ IRStaticDestructorRunners.emplace_back(std::move(DtorNames), RetKey);
- RetHandle = H;
return LLVMOrcErrSuccess;
}
- LLVMOrcErrorCode addIRModuleEager(ModuleHandleT &RetHandle,
- std::shared_ptr<Module> M,
+ LLVMOrcErrorCode addIRModuleEager(orc::VModuleKey &RetKey,
+ std::unique_ptr<Module> M,
LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
- return addIRModule(RetHandle, CompileLayer, std::move(M),
+ return addIRModule(RetKey, CompileLayer, std::move(M),
llvm::make_unique<SectionMemoryManager>(),
std::move(ExternalResolver), ExternalResolverCtx);
}
- LLVMOrcErrorCode addIRModuleLazy(ModuleHandleT &RetHandle,
- std::shared_ptr<Module> M,
+ LLVMOrcErrorCode addIRModuleLazy(orc::VModuleKey &RetKey,
+ std::unique_ptr<Module> M,
LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
- return addIRModule(RetHandle, CODLayer, std::move(M),
+ return addIRModule(RetKey, CODLayer, std::move(M),
llvm::make_unique<SectionMemoryManager>(),
std::move(ExternalResolver), ExternalResolverCtx);
}
- LLVMOrcErrorCode removeModule(ModuleHandleT H) {
- if (auto Err = GenericHandles[H]->removeModule())
+ LLVMOrcErrorCode removeModule(orc::VModuleKey K) {
+ // FIXME: Should the module key be released when removal fails?
+ if (auto Err = KeyLayers[K]->removeModule(K))
return mapError(std::move(Err));
- GenericHandles[H] = nullptr;
- FreeHandleIndexes.push_back(H);
+ ES.releaseVModule(K);
+ KeyLayers.erase(K);
return LLVMOrcErrSuccess;
}
- LLVMOrcErrorCode addObject(ModuleHandleT &RetHandle,
+ LLVMOrcErrorCode addObject(orc::VModuleKey &RetKey,
std::unique_ptr<MemoryBuffer> ObjBuffer,
LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
- if (auto ObjOrErr =
- object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef())) {
- auto &Obj = *ObjOrErr;
- auto OwningObj =
- std::make_shared<OwningObject>(std::move(Obj), std::move(ObjBuffer));
+ if (auto Obj = object::ObjectFile::createObjectFile(
+ ObjBuffer->getMemBufferRef())) {
- // Create the resolver.
- auto Resolver = createResolver(ExternalResolver, ExternalResolverCtx);
+ RetKey = ES.allocateVModule();
+ Resolvers[RetKey] = std::make_shared<CBindingsResolver>(
+ *this, ExternalResolver, ExternalResolverCtx);
- ModuleHandleT H;
- if (auto HOrErr = ObjectLayer.addObject(std::move(OwningObj),
- std::move(Resolver)))
- H = createHandle(ObjectLayer, *HOrErr);
- else
- return mapError(HOrErr.takeError());
+ if (auto Err = ObjectLayer.addObject(RetKey, std::move(ObjBuffer)))
+ return mapError(std::move(Err));
- RetHandle = H;
+ KeyLayers[RetKey] = detail::createGenericLayer(ObjectLayer);
return LLVMOrcErrSuccess;
} else
- return mapError(ObjOrErr.takeError());
+ return mapError(Obj.takeError());
}
JITSymbol findSymbol(const std::string &Name,
@@ -329,9 +388,10 @@ public:
return CODLayer.findSymbol(mangle(Name), ExportedSymbolsOnly);
}
- JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name,
+ JITSymbol findSymbolIn(orc::VModuleKey K, const std::string &Name,
bool ExportedSymbolsOnly) {
- return GenericHandles[H]->findSymbolIn(Name, ExportedSymbolsOnly);
+ assert(KeyLayers.count(K) && "looking up symbol in unknown module");
+ return KeyLayers[K]->findSymbolIn(K, mangle(Name), ExportedSymbolsOnly);
}
LLVMOrcErrorCode findSymbolAddress(JITTargetAddress &RetAddr,
@@ -354,26 +414,48 @@ public:
return LLVMOrcErrSuccess;
}
+ LLVMOrcErrorCode findSymbolAddressIn(JITTargetAddress &RetAddr,
+ orc::VModuleKey K,
+ const std::string &Name,
+ bool ExportedSymbolsOnly) {
+ RetAddr = 0;
+ if (auto Sym = findSymbolIn(K, Name, ExportedSymbolsOnly)) {
+ // Successful lookup, non-null symbol:
+ if (auto AddrOrErr = Sym.getAddress()) {
+ RetAddr = *AddrOrErr;
+ return LLVMOrcErrSuccess;
+ } else
+ return mapError(AddrOrErr.takeError());
+ } else if (auto Err = Sym.takeError()) {
+ // Lookup failure - report error.
+ return mapError(std::move(Err));
+ }
+ // Otherwise we had a successful lookup but got a null result. We already
+ // set RetAddr to '0' above, so just return success.
+ return LLVMOrcErrSuccess;
+ }
+
const std::string &getErrorMessage() const { return ErrMsg; }
-private:
- template <typename LayerT, typename HandleT>
- unsigned createHandle(LayerT &Layer, HandleT Handle) {
- unsigned NewHandle;
- if (!FreeHandleIndexes.empty()) {
- NewHandle = FreeHandleIndexes.back();
- FreeHandleIndexes.pop_back();
- GenericHandles[NewHandle] =
- detail::createGenericHandle(Layer, std::move(Handle));
- return NewHandle;
- } else {
- NewHandle = GenericHandles.size();
- GenericHandles.push_back(
- detail::createGenericHandle(Layer, std::move(Handle)));
+ void RegisterJITEventListener(JITEventListener *L) {
+ if (!L)
+ return;
+ EventListeners.push_back(L);
+ }
+
+ void UnregisterJITEventListener(JITEventListener *L) {
+ if (!L)
+ return;
+
+ auto I = find(reverse(EventListeners), L);
+ if (I != EventListeners.rend()) {
+ std::swap(*I, EventListeners.back());
+ EventListeners.pop_back();
}
- return NewHandle;
}
+private:
+
LLVMOrcErrorCode mapError(Error Err) {
LLVMOrcErrorCode Result = LLVMOrcErrSuccess;
handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) {
@@ -386,22 +468,44 @@ private:
return Result;
}
+ void reportError(Error Err) {
+ // FIXME: Report errors on the execution session.
+ logAllUnhandledErrors(std::move(Err), errs(), "ORC error: ");
+ };
+
+ void notifyFinalized(orc::VModuleKey K,
+ const object::ObjectFile &Obj,
+ const RuntimeDyld::LoadedObjectInfo &LoadedObjInfo) {
+ for (auto &Listener : EventListeners)
+ Listener->NotifyObjectEmitted(Obj, LoadedObjInfo);
+ }
+
+ void notifyFreed(orc::VModuleKey K, const object::ObjectFile &Obj) {
+ for (auto &Listener : EventListeners)
+ Listener->NotifyFreeingObject(Obj);
+ }
+
+ orc::ExecutionSession ES;
+ std::unique_ptr<CompileCallbackMgr> CCMgr;
+
+ std::vector<JITEventListener *> EventListeners;
+
DataLayout DL;
SectionMemoryManager CCMgrMemMgr;
std::unique_ptr<orc::IndirectStubsManager> IndirectStubsMgr;
- std::unique_ptr<CompileCallbackMgr> CCMgr;
ObjLayerT ObjectLayer;
CompileLayerT CompileLayer;
CODLayerT CODLayer;
- std::vector<std::unique_ptr<detail::GenericHandle>> GenericHandles;
- std::vector<unsigned> FreeHandleIndexes;
+ std::map<orc::VModuleKey, std::unique_ptr<detail::GenericLayer>> KeyLayers;
orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides;
std::vector<orc::CtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners;
std::string ErrMsg;
+
+ std::map<orc::VModuleKey, std::shared_ptr<orc::SymbolResolver>> Resolvers;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp
index c218cb9a523c..f4102b359a6b 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp
@@ -29,6 +29,12 @@ public:
std::string message(int condition) const override {
switch (static_cast<OrcErrorCode>(condition)) {
+ case OrcErrorCode::UnknownORCError:
+ return "Unknown ORC error";
+ case OrcErrorCode::DuplicateDefinition:
+ return "Duplicate symbol definition";
+ case OrcErrorCode::JITSymbolNotFound:
+ return "JIT symbol not found";
case OrcErrorCode::RemoteAllocatorDoesNotExist:
return "Remote allocator does not exist";
case OrcErrorCode::RemoteAllocatorIdAlreadyInUse:
@@ -45,8 +51,6 @@ public:
return "Could not negotiate RPC function";
case OrcErrorCode::RPCResponseAbandoned:
return "RPC response abandoned";
- case OrcErrorCode::JITSymbolNotFound:
- return "JIT symbol not found";
case OrcErrorCode::UnexpectedRPCCall:
return "Unexpected RPC call";
case OrcErrorCode::UnexpectedRPCResponse:
@@ -67,6 +71,7 @@ static ManagedStatic<OrcErrorCategory> OrcErrCat;
namespace llvm {
namespace orc {
+char DuplicateDefinition::ID = 0;
char JITSymbolNotFound::ID = 0;
std::error_code orcError(OrcErrorCode ErrCode) {
@@ -74,6 +79,22 @@ std::error_code orcError(OrcErrorCode ErrCode) {
return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat);
}
+
+DuplicateDefinition::DuplicateDefinition(std::string SymbolName)
+ : SymbolName(std::move(SymbolName)) {}
+
+std::error_code DuplicateDefinition::convertToErrorCode() const {
+ return orcError(OrcErrorCode::DuplicateDefinition);
+}
+
+void DuplicateDefinition::log(raw_ostream &OS) const {
+ OS << "Duplicate definition of symbol '" << SymbolName << "'";
+}
+
+const std::string &DuplicateDefinition::getSymbolName() const {
+ return SymbolName;
+}
+
JITSymbolNotFound::JITSymbolNotFound(std::string SymbolName)
: SymbolName(std::move(SymbolName)) {}
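A minimal sketch (not part of the patch) of catching the new DuplicateDefinition error from VSO::define with a typed handler; V and MU are assumed to exist.

    if (Error Err = V.define(std::move(MU)))
      handleAllErrors(std::move(Err),
                      [](const DuplicateDefinition &D) {
                        errs() << "already defined: " << D.getSymbolName()
                               << "\n";
                      },
                      [](const ErrorInfoBase &EIB) { EIB.log(errs()); });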
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
index f89f21adff41..4def579e7097 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
@@ -125,8 +125,13 @@ OrcMCJITReplacement::runFunction(Function *F,
}
void OrcMCJITReplacement::runStaticConstructorsDestructors(bool isDtors) {
- for (auto &M : LocalModules)
- ExecutionEngine::runStaticConstructorsDestructors(*M, isDtors);
+ auto &CtorDtorsMap = isDtors ? UnexecutedDestructors : UnexecutedConstructors;
+
+ for (auto &KV : CtorDtorsMap)
+ cantFail(CtorDtorRunner<LazyEmitLayerT>(std::move(KV.second), KV.first)
+ .runViaLayer(LazyEmitLayer));
+
+ CtorDtorsMap.clear();
}
} // End namespace orc.
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index 1dc8d4ac7bc5..abe89ce70af9 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -21,6 +21,7 @@
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
@@ -54,6 +55,7 @@ class ObjectCache;
namespace orc {
class OrcMCJITReplacement : public ExecutionEngine {
+
// OrcMCJITReplacement needs to do a little extra book-keeping to ensure that
// Orc's automatic finalization doesn't kick in earlier than MCJIT clients are
// expecting - see finalizeMemory.
@@ -138,18 +140,75 @@ class OrcMCJITReplacement : public ExecutionEngine {
std::shared_ptr<MCJITMemoryManager> ClientMM;
};
- class LinkingResolver : public JITSymbolResolver {
+ class LinkingORCResolver : public orc::SymbolResolver {
public:
- LinkingResolver(OrcMCJITReplacement &M) : M(M) {}
+ LinkingORCResolver(OrcMCJITReplacement &M) : M(M) {}
+
+ SymbolFlagsMap lookupFlags(const SymbolNameSet &Symbols) override {
+ SymbolFlagsMap SymbolFlags;
+
+ for (auto &S : Symbols) {
+ if (auto Sym = M.findMangledSymbol(*S)) {
+ SymbolFlags[S] = Sym.getFlags();
+ } else if (auto Err = Sym.takeError()) {
+ M.reportError(std::move(Err));
+ return SymbolFlagsMap();
+ } else {
+ if (auto Sym2 = M.ClientResolver->findSymbolInLogicalDylib(*S)) {
+ SymbolFlags[S] = Sym2.getFlags();
+ } else if (auto Err = Sym2.takeError()) {
+ M.reportError(std::move(Err));
+ return SymbolFlagsMap();
+ }
+ }
+ }
- JITSymbol findSymbol(const std::string &Name) override {
- return M.ClientResolver->findSymbol(Name);
+ return SymbolFlags;
}
- JITSymbol findSymbolInLogicalDylib(const std::string &Name) override {
- if (auto Sym = M.findMangledSymbol(Name))
- return Sym;
- return M.ClientResolver->findSymbolInLogicalDylib(Name);
+ SymbolNameSet lookup(std::shared_ptr<AsynchronousSymbolQuery> Query,
+ SymbolNameSet Symbols) override {
+ SymbolNameSet UnresolvedSymbols;
+ bool NewSymbolsResolved = false;
+
+ for (auto &S : Symbols) {
+ if (auto Sym = M.findMangledSymbol(*S)) {
+ if (auto Addr = Sym.getAddress()) {
+ Query->resolve(S, JITEvaluatedSymbol(*Addr, Sym.getFlags()));
+ Query->notifySymbolReady();
+ NewSymbolsResolved = true;
+ } else {
+ M.ES.legacyFailQuery(*Query, Addr.takeError());
+ return SymbolNameSet();
+ }
+ } else if (auto Err = Sym.takeError()) {
+ M.ES.legacyFailQuery(*Query, std::move(Err));
+ return SymbolNameSet();
+ } else {
+ if (auto Sym2 = M.ClientResolver->findSymbol(*S)) {
+ if (auto Addr = Sym2.getAddress()) {
+ Query->resolve(S, JITEvaluatedSymbol(*Addr, Sym2.getFlags()));
+ Query->notifySymbolReady();
+ NewSymbolsResolved = true;
+ } else {
+ M.ES.legacyFailQuery(*Query, Addr.takeError());
+ return SymbolNameSet();
+ }
+ } else if (auto Err = Sym2.takeError()) {
+ M.ES.legacyFailQuery(*Query, std::move(Err));
+ return SymbolNameSet();
+ } else
+ UnresolvedSymbols.insert(S);
+ }
+ }
+
+ if (NewSymbolsResolved && Query->isFullyResolved())
+ Query->handleFullyResolved();
+
+ if (NewSymbolsResolved && Query->isFullyReady())
+ Query->handleFullyReady();
+
+ return UnresolvedSymbols;
}
private:
@@ -160,26 +219,37 @@ private:
static ExecutionEngine *
createOrcMCJITReplacement(std::string *ErrorMsg,
std::shared_ptr<MCJITMemoryManager> MemMgr,
- std::shared_ptr<JITSymbolResolver> Resolver,
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver,
std::unique_ptr<TargetMachine> TM) {
return new OrcMCJITReplacement(std::move(MemMgr), std::move(Resolver),
std::move(TM));
}
+ void reportError(Error Err) {
+ logAllUnhandledErrors(std::move(Err), errs(), "MCJIT error: ");
+ }
+
public:
- OrcMCJITReplacement(
- std::shared_ptr<MCJITMemoryManager> MemMgr,
- std::shared_ptr<JITSymbolResolver> ClientResolver,
- std::unique_ptr<TargetMachine> TM)
- : ExecutionEngine(TM->createDataLayout()), TM(std::move(TM)),
- MemMgr(std::make_shared<MCJITReplacementMemMgr>(*this,
- std::move(MemMgr))),
- Resolver(std::make_shared<LinkingResolver>(*this)),
+ OrcMCJITReplacement(std::shared_ptr<MCJITMemoryManager> MemMgr,
+ std::shared_ptr<LegacyJITSymbolResolver> ClientResolver,
+ std::unique_ptr<TargetMachine> TM)
+ : ExecutionEngine(TM->createDataLayout()),
+ TM(std::move(TM)),
+ MemMgr(
+ std::make_shared<MCJITReplacementMemMgr>(*this, std::move(MemMgr))),
+ Resolver(std::make_shared<LinkingORCResolver>(*this)),
ClientResolver(std::move(ClientResolver)), NotifyObjectLoaded(*this),
NotifyFinalized(*this),
- ObjectLayer([this]() { return this->MemMgr; }, NotifyObjectLoaded,
- NotifyFinalized),
- CompileLayer(ObjectLayer, SimpleCompiler(*this->TM)),
+ ObjectLayer(
+ ES,
+ [this](VModuleKey K) {
+ return ObjectLayerT::Resources{this->MemMgr, this->Resolver};
+ },
+ NotifyObjectLoaded, NotifyFinalized),
+ CompileLayer(ObjectLayer, SimpleCompiler(*this->TM),
+ [this](VModuleKey K, std::unique_ptr<Module> M) {
+ Modules.push_back(std::move(M));
+ }),
LazyEmitLayer(CompileLayer) {}
static void Register() {
@@ -194,43 +264,63 @@ public:
} else {
assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch");
}
- auto *MPtr = M.release();
- ShouldDelete[MPtr] = true;
- auto Deleter = [this](Module *Mod) {
- auto I = ShouldDelete.find(Mod);
- if (I != ShouldDelete.end() && I->second)
- delete Mod;
- };
- LocalModules.push_back(std::shared_ptr<Module>(MPtr, std::move(Deleter)));
- cantFail(LazyEmitLayer.addModule(LocalModules.back(), Resolver));
+
+ // Rename, bump linkage and record static constructors and destructors.
+ // We have to do this before we hand over ownership of the module to the
+ // JIT.
+ std::vector<std::string> CtorNames, DtorNames;
+ {
+ unsigned CtorId = 0, DtorId = 0;
+ for (auto Ctor : orc::getConstructors(*M)) {
+ std::string NewCtorName = ("$static_ctor." + Twine(CtorId++)).str();
+ Ctor.Func->setName(NewCtorName);
+ Ctor.Func->setLinkage(GlobalValue::ExternalLinkage);
+ Ctor.Func->setVisibility(GlobalValue::HiddenVisibility);
+ CtorNames.push_back(mangle(NewCtorName));
+ }
+ for (auto Dtor : orc::getDestructors(*M)) {
+ std::string NewDtorName = ("$static_dtor." + Twine(DtorId++)).str();
+ dbgs() << "Found dtor: " << NewDtorName << "\n";
+ Dtor.Func->setName(NewDtorName);
+ Dtor.Func->setLinkage(GlobalValue::ExternalLinkage);
+ Dtor.Func->setVisibility(GlobalValue::HiddenVisibility);
+ DtorNames.push_back(mangle(NewDtorName));
+ }
+ }
+
+ auto K = ES.allocateVModule();
+
+ UnexecutedConstructors[K] = std::move(CtorNames);
+ UnexecutedDestructors[K] = std::move(DtorNames);
+
+ cantFail(LazyEmitLayer.addModule(K, std::move(M)));
}
void addObjectFile(std::unique_ptr<object::ObjectFile> O) override {
- auto Obj =
- std::make_shared<object::OwningBinary<object::ObjectFile>>(std::move(O),
- nullptr);
- cantFail(ObjectLayer.addObject(std::move(Obj), Resolver));
+ cantFail(ObjectLayer.addObject(
+ ES.allocateVModule(), MemoryBuffer::getMemBufferCopy(O->getData())));
}
void addObjectFile(object::OwningBinary<object::ObjectFile> O) override {
- auto Obj =
- std::make_shared<object::OwningBinary<object::ObjectFile>>(std::move(O));
- cantFail(ObjectLayer.addObject(std::move(Obj), Resolver));
+ std::unique_ptr<object::ObjectFile> Obj;
+ std::unique_ptr<MemoryBuffer> ObjBuffer;
+ std::tie(Obj, ObjBuffer) = O.takeBinary();
+ cantFail(ObjectLayer.addObject(ES.allocateVModule(), std::move(ObjBuffer)));
}
void addArchive(object::OwningBinary<object::Archive> A) override {
Archives.push_back(std::move(A));
}
-
+
bool removeModule(Module *M) override {
- for (auto I = LocalModules.begin(), E = LocalModules.end(); I != E; ++I) {
- if (I->get() == M) {
- ShouldDelete[M] = false;
- LocalModules.erase(I);
- return true;
- }
- }
- return false;
+ auto I = Modules.begin();
+ for (auto E = Modules.end(); I != E; ++I)
+ if (I->get() == M)
+ break;
+ if (I == Modules.end())
+ return false;
+ Modules.erase(I);
+ return true;
}
uint64_t getSymbolAddress(StringRef Name) {
@@ -238,7 +328,7 @@ public:
}
JITSymbol findSymbol(StringRef Name) {
- return findMangledSymbol(Mangle(Name));
+ return findMangledSymbol(mangle(Name));
}
void finalizeObject() override {
@@ -318,12 +408,9 @@ private:
}
std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get();
if (ChildBin->isObject()) {
- std::unique_ptr<object::ObjectFile> ChildObj(
- static_cast<object::ObjectFile*>(ChildBinOrErr->release()));
- auto Obj =
- std::make_shared<object::OwningBinary<object::ObjectFile>>(
- std::move(ChildObj), nullptr);
- cantFail(ObjectLayer.addObject(std::move(Obj), Resolver));
+ cantFail(ObjectLayer.addObject(
+ ES.allocateVModule(),
+ MemoryBuffer::getMemBufferCopy(ChildBin->getData())));
if (auto Sym = ObjectLayer.findSymbol(Name, true))
return Sym;
}
@@ -339,12 +426,11 @@ private:
NotifyObjectLoadedT(OrcMCJITReplacement &M) : M(M) {}
- void operator()(RTDyldObjectLinkingLayerBase::ObjHandleT H,
- const RTDyldObjectLinkingLayer::ObjectPtr &Obj,
+ void operator()(VModuleKey K, const object::ObjectFile &Obj,
const RuntimeDyld::LoadedObjectInfo &Info) const {
- M.UnfinalizedSections[H] = std::move(M.SectionsAllocatedSinceLastLoad);
+ M.UnfinalizedSections[K] = std::move(M.SectionsAllocatedSinceLastLoad);
M.SectionsAllocatedSinceLastLoad = SectionAddrSet();
- M.MemMgr->notifyObjectLoaded(&M, *Obj->getBinary());
+ M.MemMgr->notifyObjectLoaded(&M, Obj);
}
private:
OrcMCJITReplacement &M;
@@ -354,15 +440,16 @@ private:
public:
NotifyFinalizedT(OrcMCJITReplacement &M) : M(M) {}
- void operator()(RTDyldObjectLinkingLayerBase::ObjHandleT H) {
- M.UnfinalizedSections.erase(H);
+ void operator()(VModuleKey K, const object::ObjectFile &Obj,
+ const RuntimeDyld::LoadedObjectInfo &Info) {
+ M.UnfinalizedSections.erase(K);
}
private:
OrcMCJITReplacement &M;
};
- std::string Mangle(StringRef Name) {
+ std::string mangle(StringRef Name) {
std::string MangledName;
{
raw_string_ostream MangledNameStream(MangledName);
@@ -375,17 +462,18 @@ private:
using CompileLayerT = IRCompileLayer<ObjectLayerT, orc::SimpleCompiler>;
using LazyEmitLayerT = LazyEmittingLayer<CompileLayerT>;
+ ExecutionSession ES;
+
std::unique_ptr<TargetMachine> TM;
std::shared_ptr<MCJITReplacementMemMgr> MemMgr;
- std::shared_ptr<LinkingResolver> Resolver;
- std::shared_ptr<JITSymbolResolver> ClientResolver;
+ std::shared_ptr<LinkingORCResolver> Resolver;
+ std::shared_ptr<LegacyJITSymbolResolver> ClientResolver;
Mangler Mang;
// IMPORTANT: ShouldDelete *must* come before LocalModules: The shared_ptr
// delete blocks in LocalModules refer to the ShouldDelete map, so
// LocalModules needs to be destructed before ShouldDelete.
std::map<Module*, bool> ShouldDelete;
- std::vector<std::shared_ptr<Module>> LocalModules;
NotifyObjectLoadedT NotifyObjectLoaded;
NotifyFinalizedT NotifyFinalized;
@@ -394,19 +482,15 @@ private:
CompileLayerT CompileLayer;
LazyEmitLayerT LazyEmitLayer;
+ std::map<VModuleKey, std::vector<std::string>> UnexecutedConstructors;
+ std::map<VModuleKey, std::vector<std::string>> UnexecutedDestructors;
+
// We need to store ObjLayerT::ObjSetHandles for each of the object sets
// that have been emitted but not yet finalized so that we can forward the
// mapSectionAddress calls appropriately.
using SectionAddrSet = std::set<const void *>;
- struct ObjHandleCompare {
- bool operator()(ObjectLayerT::ObjHandleT H1,
- ObjectLayerT::ObjHandleT H2) const {
- return &*H1 < &*H2;
- }
- };
SectionAddrSet SectionsAllocatedSinceLastLoad;
- std::map<ObjectLayerT::ObjHandleT, SectionAddrSet, ObjHandleCompare>
- UnfinalizedSections;
+ std::map<VModuleKey, SectionAddrSet> UnfinalizedSections;
std::vector<object::OwningBinary<object::Archive>> Archives;
};
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
new file mode 100644
index 000000000000..71b4b73ca6d3
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -0,0 +1,177 @@
+//===-- RTDyldObjectLinkingLayer.cpp - RuntimeDyld backed ORC ObjectLayer -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+
+namespace {
+
+using namespace llvm;
+using namespace llvm::orc;
+
+class VSOSearchOrderResolver : public JITSymbolResolver {
+public:
+ VSOSearchOrderResolver(MaterializationResponsibility &MR) : MR(MR) {}
+
+ Expected<LookupResult> lookup(const LookupSet &Symbols) {
+ auto &ES = MR.getTargetVSO().getExecutionSession();
+ SymbolNameSet InternedSymbols;
+
+ for (auto &S : Symbols)
+ InternedSymbols.insert(ES.getSymbolStringPool().intern(S));
+
+ auto RegisterDependencies = [&](const SymbolDependenceMap &Deps) {
+ MR.addDependenciesForAll(Deps);
+ };
+
+ auto InternedResult =
+ MR.getTargetVSO().withSearchOrderDo([&](const VSOList &VSOs) {
+ return ES.lookup(VSOs, InternedSymbols, RegisterDependencies, false);
+ });
+
+ if (!InternedResult)
+ return InternedResult.takeError();
+
+ LookupResult Result;
+ for (auto &KV : *InternedResult)
+ Result[*KV.first] = std::move(KV.second);
+
+ return Result;
+ }
+
+ Expected<LookupFlagsResult> lookupFlags(const LookupSet &Symbols) {
+ auto &ES = MR.getTargetVSO().getExecutionSession();
+
+ SymbolNameSet InternedSymbols;
+
+ for (auto &S : Symbols)
+ InternedSymbols.insert(ES.getSymbolStringPool().intern(S));
+
+ SymbolFlagsMap InternedResult;
+ MR.getTargetVSO().withSearchOrderDo([&](const VSOList &VSOs) {
+ // An empty search order is pathological, but allowed.
+ if (VSOs.empty())
+ return;
+
+ assert(VSOs.front() && "VSOList entry can not be null");
+ InternedResult = VSOs.front()->lookupFlags(InternedSymbols);
+ });
+
+ LookupFlagsResult Result;
+ for (auto &KV : InternedResult)
+ Result[*KV.first] = std::move(KV.second);
+
+ return Result;
+ }
+
+private:
+ MaterializationResponsibility &MR;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+namespace orc {
+
+RTDyldObjectLinkingLayer2::RTDyldObjectLinkingLayer2(
+ ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager,
+ NotifyLoadedFunction NotifyLoaded, NotifyFinalizedFunction NotifyFinalized)
+ : ObjectLayer(ES), GetMemoryManager(GetMemoryManager),
+ NotifyLoaded(std::move(NotifyLoaded)),
+ NotifyFinalized(std::move(NotifyFinalized)), ProcessAllSections(false) {}
+
+void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R,
+ VModuleKey K,
+ std::unique_ptr<MemoryBuffer> O) {
+ assert(O && "Object must not be null");
+
+ auto &ES = getExecutionSession();
+
+ auto ObjFile = object::ObjectFile::createObjectFile(*O);
+ if (!ObjFile) {
+ getExecutionSession().reportError(ObjFile.takeError());
+ R.failMaterialization();
+ }
+
+ auto MemoryManager = GetMemoryManager(K);
+
+ VSOSearchOrderResolver Resolver(R);
+ auto RTDyld = llvm::make_unique<RuntimeDyld>(*MemoryManager, Resolver);
+ RTDyld->setProcessAllSections(ProcessAllSections);
+
+ {
+ std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+
+ assert(!ActiveRTDylds.count(K) &&
+ "An active RTDyld already exists for this key?");
+ ActiveRTDylds[K] = RTDyld.get();
+
+ assert(!MemMgrs.count(K) &&
+ "A memory manager already exists for this key?");
+ MemMgrs[K] = std::move(MemoryManager);
+ }
+
+ auto Info = RTDyld->loadObject(**ObjFile);
+
+ {
+ std::set<StringRef> InternalSymbols;
+ for (auto &Sym : (*ObjFile)->symbols()) {
+ if (!(Sym.getFlags() & object::BasicSymbolRef::SF_Global)) {
+ if (auto SymName = Sym.getName())
+ InternalSymbols.insert(*SymName);
+ else {
+ ES.reportError(SymName.takeError());
+ R.failMaterialization();
+ return;
+ }
+ }
+ }
+
+ SymbolMap Symbols;
+ for (auto &KV : RTDyld->getSymbolTable())
+ if (!InternalSymbols.count(KV.first))
+ Symbols[ES.getSymbolStringPool().intern(KV.first)] = KV.second;
+
+ R.resolve(Symbols);
+ }
+
+ if (NotifyLoaded)
+ NotifyLoaded(K, **ObjFile, *Info);
+
+ RTDyld->finalizeWithMemoryManagerLocking();
+
+ {
+ std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+ ActiveRTDylds.erase(K);
+ }
+
+ if (RTDyld->hasError()) {
+ ES.reportError(make_error<StringError>(RTDyld->getErrorString(),
+ inconvertibleErrorCode()));
+ R.failMaterialization();
+ return;
+ }
+
+ R.finalize();
+
+ if (NotifyFinalized)
+ NotifyFinalized(K);
+}
+
+void RTDyldObjectLinkingLayer2::mapSectionAddress(
+ VModuleKey K, const void *LocalAddress, JITTargetAddress TargetAddr) const {
+ std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+ auto ActiveRTDyldItr = ActiveRTDylds.find(K);
+
+ assert(ActiveRTDyldItr != ActiveRTDylds.end() &&
+ "No active RTDyld instance found for key");
+ ActiveRTDyldItr->second->mapSectionAddress(LocalAddress, TargetAddr);
+}
+
+} // End namespace orc.
+} // End namespace llvm.
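A sketch, not from this patch, of how a client could stand the new layer up. It assumes the header's GetMemoryManagerFunction alias is a std::function taking a VModuleKey and returning a std::shared_ptr to a RuntimeDyld::MemoryManager (SectionMemoryManager qualifies), and that the notify callbacks can be passed as empty functions.
#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include <memory>
// Sketch: one memory manager per VModuleKey, no load/finalize callbacks.
static void buildObjectLayer() {
  llvm::orc::ExecutionSession ES;
  llvm::orc::RTDyldObjectLinkingLayer2 ObjLayer(
      ES,
      [](llvm::orc::VModuleKey) {
        return std::make_shared<llvm::SectionMemoryManager>();
      },
      llvm::orc::RTDyldObjectLinkingLayer2::NotifyLoadedFunction(),
      llvm::orc::RTDyldObjectLinkingLayer2::NotifyFinalizedFunction());
  (void)ObjLayer;
}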
diff --git a/contrib/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/contrib/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
new file mode 100644
index 000000000000..7bf8120d23df
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
@@ -0,0 +1,497 @@
+//===-- PerfJITEventListener.cpp - Tell Linux's perf about JITted code ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a JITEventListener object that tells perf about JITted
+// functions, including source line information.
+//
+// Documentation for perf jit integration is available at:
+// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/jitdump-specification.txt
+// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/jit-interface.txt
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Config/config.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/SymbolSize.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errno.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <sys/mman.h> // mmap()
+#include <sys/types.h> // getpid()
+#include <time.h> // clock_gettime(), time(), localtime_r()
+#include <unistd.h> // for getpid(), read(), close()
+
+using namespace llvm;
+using namespace llvm::object;
+typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
+
+namespace {
+
+// language identifier (XXX: should we generate something better from debug
+// info?)
+#define JIT_LANG "llvm-IR"
+#define LLVM_PERF_JIT_MAGIC \
+ ((uint32_t)'J' << 24 | (uint32_t)'i' << 16 | (uint32_t)'T' << 8 | \
+ (uint32_t)'D')
+#define LLVM_PERF_JIT_VERSION 1
+
+// bit 0: set if the jitdump file is using an architecture-specific timestamp
+// clock source
+#define JITDUMP_FLAGS_ARCH_TIMESTAMP (1ULL << 0)
+
+struct LLVMPerfJitHeader;
+
+class PerfJITEventListener : public JITEventListener {
+public:
+ PerfJITEventListener();
+ ~PerfJITEventListener() {
+ if (MarkerAddr)
+ CloseMarker();
+ }
+
+ void NotifyObjectEmitted(const ObjectFile &Obj,
+ const RuntimeDyld::LoadedObjectInfo &L) override;
+ void NotifyFreeingObject(const ObjectFile &Obj) override;
+
+private:
+ bool InitDebuggingDir();
+ bool OpenMarker();
+ void CloseMarker();
+ static bool FillMachine(LLVMPerfJitHeader &hdr);
+
+ void NotifyCode(Expected<llvm::StringRef> &Symbol, uint64_t CodeAddr,
+ uint64_t CodeSize);
+ void NotifyDebug(uint64_t CodeAddr, DILineInfoTable Lines);
+
+ // process id, cached to avoid repeated lookups
+ pid_t Pid;
+
+ // base directory for output data
+ std::string JitPath;
+
+ // output data file descriptor, closed via Dumpstream
+ int DumpFd = -1;
+
+ // output data stream
+ std::unique_ptr<raw_fd_ostream> Dumpstream;
+
+ // prevent concurrent dumps from messing up the output file
+ sys::Mutex Mutex;
+
+ // perf mmap marker
+ void *MarkerAddr = NULL;
+
+ // perf support ready
+ bool SuccessfullyInitialized = false;
+
+ // identifier for functions, primarily to identify when moving them around
+ uint64_t CodeGeneration = 1;
+};
+
+// The following are POD struct definitions from the perf jit specification
+
+enum LLVMPerfJitRecordType {
+ JIT_CODE_LOAD = 0,
+ JIT_CODE_MOVE = 1, // not emitted, code isn't moved
+ JIT_CODE_DEBUG_INFO = 2,
+ JIT_CODE_CLOSE = 3, // not emitted, unnecessary
+ JIT_CODE_UNWINDING_INFO = 4, // not emitted
+
+ JIT_CODE_MAX
+};
+
+struct LLVMPerfJitHeader {
+ uint32_t Magic; // characters "JiTD"
+ uint32_t Version; // header version
+ uint32_t TotalSize; // total size of header
+ uint32_t ElfMach; // elf mach target
+ uint32_t Pad1; // reserved
+ uint32_t Pid;
+ uint64_t Timestamp; // timestamp
+ uint64_t Flags; // flags
+};
+
+// record prefix (mandatory in each record)
+struct LLVMPerfJitRecordPrefix {
+ uint32_t Id; // record type identifier
+ uint32_t TotalSize;
+ uint64_t Timestamp;
+};
+
+struct LLVMPerfJitRecordCodeLoad {
+ LLVMPerfJitRecordPrefix Prefix;
+
+ uint32_t Pid;
+ uint32_t Tid;
+ uint64_t Vma;
+ uint64_t CodeAddr;
+ uint64_t CodeSize;
+ uint64_t CodeIndex;
+};
+
+struct LLVMPerfJitDebugEntry {
+ uint64_t Addr;
+ int Lineno; // source line number starting at 1
+ int Discrim; // column discriminator, 0 is default
+ // followed by null terminated filename, \xff\0 if same as previous entry
+};
+
+struct LLVMPerfJitRecordDebugInfo {
+ LLVMPerfJitRecordPrefix Prefix;
+
+ uint64_t CodeAddr;
+ uint64_t NrEntry;
+ // followed by NrEntry LLVMPerfJitDebugEntry records
+};
+
+static inline uint64_t timespec_to_ns(const struct timespec *ts) {
+ const uint64_t NanoSecPerSec = 1000000000;
+ return ((uint64_t)ts->tv_sec * NanoSecPerSec) + ts->tv_nsec;
+}
+
+static inline uint64_t perf_get_timestamp(void) {
+ struct timespec ts;
+ int ret;
+
+ ret = clock_gettime(CLOCK_MONOTONIC, &ts);
+ if (ret)
+ return 0;
+
+ return timespec_to_ns(&ts);
+}
+
+PerfJITEventListener::PerfJITEventListener() : Pid(::getpid()) {
+ // check if clock-source is supported
+ if (!perf_get_timestamp()) {
+ errs() << "kernel does not support CLOCK_MONOTONIC\n";
+ return;
+ }
+
+ if (!InitDebuggingDir()) {
+ errs() << "could not initialize debugging directory\n";
+ return;
+ }
+
+ std::string Filename;
+ raw_string_ostream FilenameBuf(Filename);
+ FilenameBuf << JitPath << "/jit-" << Pid << ".dump";
+
+ // We need to open the file ourselves, because we need to hand the FD to
+ // OpenMarker(), and raw_fd_ostream doesn't expose the FD.
+ using sys::fs::openFileForReadWrite;
+ if (auto EC =
+ openFileForReadWrite(FilenameBuf.str(), DumpFd,
+ sys::fs::CD_CreateNew, sys::fs::OF_None)) {
+ errs() << "could not open JIT dump file " << FilenameBuf.str() << ": "
+ << EC.message() << "\n";
+ return;
+ }
+
+ Dumpstream = make_unique<raw_fd_ostream>(DumpFd, true);
+
+ LLVMPerfJitHeader Header = {0};
+ if (!FillMachine(Header))
+ return;
+
+ // signal this process emits JIT information
+ if (!OpenMarker())
+ return;
+
+ // emit dumpstream header
+ Header.Magic = LLVM_PERF_JIT_MAGIC;
+ Header.Version = LLVM_PERF_JIT_VERSION;
+ Header.TotalSize = sizeof(Header);
+ Header.Pid = Pid;
+ Header.Timestamp = perf_get_timestamp();
+ Dumpstream->write(reinterpret_cast<const char *>(&Header), sizeof(Header));
+
+ // Everything initialized, can do profiling now.
+ if (!Dumpstream->has_error())
+ SuccessfullyInitialized = true;
+}
+
+void PerfJITEventListener::NotifyObjectEmitted(
+ const ObjectFile &Obj, const RuntimeDyld::LoadedObjectInfo &L) {
+
+ if (!SuccessfullyInitialized)
+ return;
+
+ OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
+ const ObjectFile &DebugObj = *DebugObjOwner.getBinary();
+
+ // Get the address of the object image for use as a unique identifier
+ std::unique_ptr<DIContext> Context = DWARFContext::create(DebugObj);
+
+ // Use symbol info to iterate over functions in the object.
+ for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
+ SymbolRef Sym = P.first;
+ std::string SourceFileName;
+
+ Expected<SymbolRef::Type> SymTypeOrErr = Sym.getType();
+ if (!SymTypeOrErr) {
+ // There's not much we can do with errors here
+ consumeError(SymTypeOrErr.takeError());
+ continue;
+ }
+ SymbolRef::Type SymType = *SymTypeOrErr;
+ if (SymType != SymbolRef::ST_Function)
+ continue;
+
+ Expected<StringRef> Name = Sym.getName();
+ if (!Name) {
+ consumeError(Name.takeError());
+ continue;
+ }
+
+ Expected<uint64_t> AddrOrErr = Sym.getAddress();
+ if (!AddrOrErr) {
+ consumeError(AddrOrErr.takeError());
+ continue;
+ }
+ uint64_t Addr = *AddrOrErr;
+ uint64_t Size = P.second;
+
+ // According to the spec, debugging info has to come before the
+ // corresponding code load record.
+ DILineInfoTable Lines = Context->getLineInfoForAddressRange(
+ Addr, Size, FileLineInfoKind::AbsoluteFilePath);
+
+ NotifyDebug(Addr, Lines);
+ NotifyCode(Name, Addr, Size);
+ }
+
+ Dumpstream->flush();
+}
+
+void PerfJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
+ // perf currently doesn't have an interface for unloading, but munmap()ing
+ // the code section covers it, so there's nothing to do here.
+}
+
+bool PerfJITEventListener::InitDebuggingDir() {
+ time_t Time;
+ struct tm LocalTime;
+ char TimeBuffer[sizeof("YYYYMMDD")];
+ SmallString<64> Path;
+
+ // search for location to dump data to
+ if (const char *BaseDir = getenv("JITDUMPDIR"))
+ Path.append(BaseDir);
+ else if (!sys::path::home_directory(Path))
+ Path = ".";
+
+ // create debug directory
+ Path += "/.debug/jit/";
+ if (auto EC = sys::fs::create_directories(Path)) {
+ errs() << "could not create jit cache directory " << Path << ": "
+ << EC.message() << "\n";
+ return false;
+ }
+
+ // create unique directory for dump data related to this process
+ time(&Time);
+ localtime_r(&Time, &LocalTime);
+ strftime(TimeBuffer, sizeof(TimeBuffer), "%Y%m%d", &LocalTime);
+ Path += JIT_LANG "-jit-";
+ Path += TimeBuffer;
+
+ SmallString<128> UniqueDebugDir;
+
+ using sys::fs::createUniqueDirectory;
+ if (auto EC = createUniqueDirectory(Path, UniqueDebugDir)) {
+ errs() << "could not create unique jit cache directory " << UniqueDebugDir
+ << ": " << EC.message() << "\n";
+ return false;
+ }
+
+ JitPath = UniqueDebugDir.str();
+
+ return true;
+}
+
+bool PerfJITEventListener::OpenMarker() {
+ // We mmap the jitdump to create an MMAP RECORD in perf.data file. The mmap
+ // is captured either live (perf record running when we mmap) or in deferred
+ // mode, via /proc/PID/maps. The MMAP record is used as a marker of a jitdump
+ // file for more metadata about the jitted code. Perf report/annotate
+ // detect this special filename and process the jitdump file.
+ //
+ // Mapping must be PROT_EXEC to ensure it is captured by perf record
+ // even when not using -d option.
+ MarkerAddr = ::mmap(NULL, sys::Process::getPageSize(), PROT_READ | PROT_EXEC,
+ MAP_PRIVATE, DumpFd, 0);
+
+ if (MarkerAddr == MAP_FAILED) {
+ errs() << "could not mmap JIT marker\n";
+ return false;
+ }
+ return true;
+}
+
+void PerfJITEventListener::CloseMarker() {
+ if (!MarkerAddr)
+ return;
+
+ munmap(MarkerAddr, sys::Process::getPageSize());
+ MarkerAddr = nullptr;
+}
+
+bool PerfJITEventListener::FillMachine(LLVMPerfJitHeader &hdr) {
+ char id[16];
+ struct {
+ uint16_t e_type;
+ uint16_t e_machine;
+ } info;
+
+ size_t RequiredMemory = sizeof(id) + sizeof(info);
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
+ MemoryBuffer::getFileSlice("/proc/self/exe",
+ RequiredMemory,
+ 0);
+
+ // This won't guarantee that enough data was actually read from the
+ // underlying file. Instead the trailing part of the buffer would be
+ // zeroed. Given the ELF signature check below, that seems OK though:
+ // it's unlikely that the file ends just after that, and the
+ // consequence would just be that perf wouldn't recognize the
+ // signature.
+ if (auto EC = MB.getError()) {
+ errs() << "could not open /proc/self/exe: " << EC.message() << "\n";
+ return false;
+ }
+
+ memcpy(&id, (*MB)->getBufferStart(), sizeof(id));
+ memcpy(&info, (*MB)->getBufferStart() + sizeof(id), sizeof(info));
+
+ // check ELF signature
+ if (id[0] != 0x7f || id[1] != 'E' || id[2] != 'L' || id[3] != 'F') {
+ errs() << "invalid elf signature\n";
+ return false;
+ }
+
+ hdr.ElfMach = info.e_machine;
+
+ return true;
+}
+
+void PerfJITEventListener::NotifyCode(Expected<llvm::StringRef> &Symbol,
+ uint64_t CodeAddr, uint64_t CodeSize) {
+ assert(SuccessfullyInitialized);
+
+ // 0 length functions can't have samples.
+ if (CodeSize == 0)
+ return;
+
+ LLVMPerfJitRecordCodeLoad rec;
+ rec.Prefix.Id = JIT_CODE_LOAD;
+ rec.Prefix.TotalSize = sizeof(rec) + // code load record itself
+ Symbol->size() + 1 + // symbol name
+ CodeSize; // and code
+ rec.Prefix.Timestamp = perf_get_timestamp();
+
+ rec.CodeSize = CodeSize;
+ rec.Vma = 0;
+ rec.CodeAddr = CodeAddr;
+ rec.Pid = Pid;
+ rec.Tid = get_threadid();
+
+ // avoid interspersing output
+ MutexGuard Guard(Mutex);
+
+ rec.CodeIndex = CodeGeneration++; // under lock!
+
+ Dumpstream->write(reinterpret_cast<const char *>(&rec), sizeof(rec));
+ Dumpstream->write(Symbol->data(), Symbol->size() + 1);
+ Dumpstream->write(reinterpret_cast<const char *>(CodeAddr), CodeSize);
+}
+
+void PerfJITEventListener::NotifyDebug(uint64_t CodeAddr,
+ DILineInfoTable Lines) {
+ assert(SuccessfullyInitialized);
+
+ // Didn't get useful debug info.
+ if (Lines.empty())
+ return;
+
+ LLVMPerfJitRecordDebugInfo rec;
+ rec.Prefix.Id = JIT_CODE_DEBUG_INFO;
+ rec.Prefix.TotalSize = sizeof(rec); // will be increased further
+ rec.Prefix.Timestamp = perf_get_timestamp();
+ rec.CodeAddr = CodeAddr;
+ rec.NrEntry = Lines.size();
+
+ // compute total size of the record (variable due to filenames)
+ DILineInfoTable::iterator Begin = Lines.begin();
+ DILineInfoTable::iterator End = Lines.end();
+ for (DILineInfoTable::iterator It = Begin; It != End; ++It) {
+ DILineInfo &line = It->second;
+ rec.Prefix.TotalSize += sizeof(LLVMPerfJitDebugEntry);
+ rec.Prefix.TotalSize += line.FileName.size() + 1;
+ }
+
+ // The debug_entry describes the source line information. It is defined as
+ // follows in order:
+ // * uint64_t code_addr: address of function for which the debug information
+ // is generated
+ // * uint32_t line : source file line number (starting at 1)
+ // * uint32_t discrim : column discriminator, 0 is default
+ // * char name[n] : source file name in ASCII, including null termination
+
+ // avoid interspersing output
+ MutexGuard Guard(Mutex);
+
+ Dumpstream->write(reinterpret_cast<const char *>(&rec), sizeof(rec));
+
+ for (DILineInfoTable::iterator It = Begin; It != End; ++It) {
+ LLVMPerfJitDebugEntry LineInfo;
+ DILineInfo &Line = It->second;
+
+ LineInfo.Addr = It->first;
+ // The function re-created by perf is preceded by an ELF
+ // header. Need to adjust for that, otherwise the results are
+ // wrong.
+ LineInfo.Addr += 0x40;
+ LineInfo.Lineno = Line.Line;
+ LineInfo.Discrim = Line.Discriminator;
+
+ Dumpstream->write(reinterpret_cast<const char *>(&LineInfo),
+ sizeof(LineInfo));
+ Dumpstream->write(Line.FileName.c_str(), Line.FileName.size() + 1);
+ }
+}
+
+// There should be only a single event listener per process, otherwise perf gets
+// confused.
+llvm::ManagedStatic<PerfJITEventListener> PerfListener;
+
+} // end anonymous namespace
+
+namespace llvm {
+JITEventListener *JITEventListener::createPerfJITEventListener() {
+ return &*PerfListener;
+}
+
+} // namespace llvm
+
+LLVMJITEventListenerRef LLVMCreatePerfJITEventListener(void)
+{
+ return wrap(JITEventListener::createPerfJITEventListener());
+}
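Usage note, illustrative rather than part of the change: the listener is registered with an execution engine like any other JITEventListener; the jitdump file it writes is then folded into a profile with perf's jit support (typically perf record -k 1 followed by perf inject --jit). A sketch assuming the standard ExecutionEngine registration API:
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
// Sketch: attach the perf jitdump listener to an already-constructed engine.
static void attachPerfListener(llvm::ExecutionEngine &EE) {
  EE.RegisterJITEventListener(
      llvm::JITEventListener::createPerfJITEventListener());
}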
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
index 87059ef2b88f..18eb0e461921 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
@@ -47,3 +47,53 @@ ARMJITSymbolFlags llvm::ARMJITSymbolFlags::fromObjectSymbol(
Flags |= ARMJITSymbolFlags::Thumb;
return Flags;
}
+
+/// Performs lookup by calling, for each symbol, findSymbolInLogicalDylib
+/// first and, if that fails, findSymbol.
+Expected<JITSymbolResolver::LookupResult>
+LegacyJITSymbolResolver::lookup(const LookupSet &Symbols) {
+ JITSymbolResolver::LookupResult Result;
+ for (auto &Symbol : Symbols) {
+ std::string SymName = Symbol.str();
+ if (auto Sym = findSymbolInLogicalDylib(SymName)) {
+ if (auto AddrOrErr = Sym.getAddress())
+ Result[Symbol] = JITEvaluatedSymbol(*AddrOrErr, Sym.getFlags());
+ else
+ return AddrOrErr.takeError();
+ } else if (auto Err = Sym.takeError())
+ return std::move(Err);
+ else {
+ // findSymbolInLogicalDylib failed. Let's try findSymbol.
+ if (auto Sym = findSymbol(SymName)) {
+ if (auto AddrOrErr = Sym.getAddress())
+ Result[Symbol] = JITEvaluatedSymbol(*AddrOrErr, Sym.getFlags());
+ else
+ return AddrOrErr.takeError();
+ } else if (auto Err = Sym.takeError())
+ return std::move(Err);
+ else
+ return make_error<StringError>("Symbol not found: " + Symbol,
+ inconvertibleErrorCode());
+ }
+ }
+
+ return std::move(Result);
+}
+
+/// Performs flags lookup by calling findSymbolInLogicalDylib and
+/// returning the flags value for that symbol.
+Expected<JITSymbolResolver::LookupFlagsResult>
+LegacyJITSymbolResolver::lookupFlags(const LookupSet &Symbols) {
+ JITSymbolResolver::LookupFlagsResult Result;
+
+ for (auto &Symbol : Symbols) {
+ std::string SymName = Symbol.str();
+ if (auto Sym = findSymbolInLogicalDylib(SymName))
+ Result[Symbol] = Sym.getFlags();
+ else if (auto Err = Sym.takeError())
+ return std::move(Err);
+ }
+
+ return std::move(Result);
+}
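For context, illustrative only: these adapters let resolvers written against the old two-callback interface participate in the new batched lookup API. A minimal legacy-style resolver might look like the sketch below; the class name is invented, and it only searches the host process.
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include <string>
// Sketch: resolve symbols from the current process image only.
class InProcessLegacyResolver : public llvm::LegacyJITSymbolResolver {
public:
  llvm::JITSymbol findSymbolInLogicalDylib(const std::string &Name) override {
    return nullptr; // nothing JIT'd locally in this sketch
  }
  llvm::JITSymbol findSymbol(const std::string &Name) override {
    if (uint64_t Addr =
            llvm::RTDyldMemoryManager::getSymbolAddressInProcess(Name))
      return llvm::JITSymbol(Addr, llvm::JITSymbolFlags::Exported);
    return nullptr;
  }
};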
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
index 99e84b7496d4..e774af05ebdd 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -298,4 +298,6 @@ void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
return (void*)Addr;
}
+void RTDyldMemoryManager::anchor() {}
+void MCJITMemoryManager::anchor() {}
} // namespace llvm
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index c5e4dfa1e536..1189be599edd 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -122,10 +122,8 @@ void RuntimeDyldImpl::resolveRelocations() {
MutexGuard locked(lock);
// Print out the sections prior to relocation.
- DEBUG(
- for (int i = 0, e = Sections.size(); i != e; ++i)
- dumpSectionMemory(Sections[i], "before relocations");
- );
+ LLVM_DEBUG(for (int i = 0, e = Sections.size(); i != e; ++i)
+ dumpSectionMemory(Sections[i], "before relocations"););
// First, resolve relocations associated with external symbols.
if (auto Err = resolveExternalSymbols()) {
@@ -140,18 +138,15 @@ void RuntimeDyldImpl::resolveRelocations() {
// entry provides the section to which the relocation will be applied.
int Idx = it->first;
uint64_t Addr = Sections[Idx].getLoadAddress();
- DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t"
- << format("%p", (uintptr_t)Addr) << "\n");
+ LLVM_DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t"
+ << format("%p", (uintptr_t)Addr) << "\n");
resolveRelocationList(it->second, Addr);
}
Relocations.clear();
// Print out sections after relocation.
- DEBUG(
- for (int i = 0, e = Sections.size(); i != e; ++i)
- dumpSectionMemory(Sections[i], "after relocations");
- );
-
+ LLVM_DEBUG(for (int i = 0, e = Sections.size(); i != e; ++i)
+ dumpSectionMemory(Sections[i], "after relocations"););
}
void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
@@ -202,10 +197,35 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
ObjSectionToIDMap LocalSections;
// Common symbols requiring allocation, with their sizes and alignments
- CommonSymbolList CommonSymbols;
+ CommonSymbolList CommonSymbolsToAllocate;
+
+ uint64_t CommonSize = 0;
+ uint32_t CommonAlign = 0;
+
+ // First, collect all weak and common symbols. We need to know if stronger
+ // definitions occur elsewhere.
+ JITSymbolResolver::LookupFlagsResult SymbolFlags;
+ {
+ JITSymbolResolver::LookupSet Symbols;
+ for (auto &Sym : Obj.symbols()) {
+ uint32_t Flags = Sym.getFlags();
+ if ((Flags & SymbolRef::SF_Common) || (Flags & SymbolRef::SF_Weak)) {
+ // Get symbol name.
+ if (auto NameOrErr = Sym.getName())
+ Symbols.insert(*NameOrErr);
+ else
+ return NameOrErr.takeError();
+ }
+ }
+
+ if (auto FlagsResultOrErr = Resolver.lookupFlags(Symbols))
+ SymbolFlags = std::move(*FlagsResultOrErr);
+ else
+ return FlagsResultOrErr.takeError();
+ }
// Parse symbols
- DEBUG(dbgs() << "Parse symbols:\n");
+ LLVM_DEBUG(dbgs() << "Parse symbols:\n");
for (symbol_iterator I = Obj.symbol_begin(), E = Obj.symbol_end(); I != E;
++I) {
uint32_t Flags = I->getFlags();
@@ -214,106 +234,112 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
if (Flags & SymbolRef::SF_Undefined)
continue;
- if (Flags & SymbolRef::SF_Common)
- CommonSymbols.push_back(*I);
- else {
+ // Get the symbol type.
+ object::SymbolRef::Type SymType;
+ if (auto SymTypeOrErr = I->getType())
+ SymType = *SymTypeOrErr;
+ else
+ return SymTypeOrErr.takeError();
- // Get the symbol type.
- object::SymbolRef::Type SymType;
- if (auto SymTypeOrErr = I->getType())
- SymType = *SymTypeOrErr;
- else
- return SymTypeOrErr.takeError();
+ // Get symbol name.
+ StringRef Name;
+ if (auto NameOrErr = I->getName())
+ Name = *NameOrErr;
+ else
+ return NameOrErr.takeError();
- // Get symbol name.
- StringRef Name;
- if (auto NameOrErr = I->getName())
- Name = *NameOrErr;
- else
- return NameOrErr.takeError();
-
- // Compute JIT symbol flags.
- JITSymbolFlags JITSymFlags = getJITSymbolFlags(*I);
-
- // If this is a weak definition, check to see if there's a strong one.
- // If there is, skip this symbol (we won't be providing it: the strong
- // definition will). If there's no strong definition, make this definition
- // strong.
- if (JITSymFlags.isWeak()) {
- // First check whether there's already a definition in this instance.
- // FIXME: Override existing weak definitions with strong ones.
- if (GlobalSymbolTable.count(Name))
- continue;
- // Then check the symbol resolver to see if there's a definition
- // elsewhere in this logical dylib.
- if (auto Sym = Resolver.findSymbolInLogicalDylib(Name)) {
- if (Sym.getFlags().isStrongDefinition())
- continue;
- } else if (auto Err = Sym.takeError())
- return std::move(Err);
- // else
- JITSymFlags &= ~JITSymbolFlags::Weak;
- }
+ // Compute JIT symbol flags.
+ JITSymbolFlags JITSymFlags = getJITSymbolFlags(*I);
+
+ // If this is a weak definition, check to see if there's a strong one.
+ // If there is, skip this symbol (we won't be providing it: the strong
+ // definition will). If there's no strong definition, make this definition
+ // strong.
+ if (JITSymFlags.isWeak() || JITSymFlags.isCommon()) {
+ // First check whether there's already a definition in this instance.
+ // FIXME: Override existing weak definitions with strong ones.
+ if (GlobalSymbolTable.count(Name))
+ continue;
- if (Flags & SymbolRef::SF_Absolute &&
- SymType != object::SymbolRef::ST_File) {
- uint64_t Addr = 0;
- if (auto AddrOrErr = I->getAddress())
- Addr = *AddrOrErr;
- else
- return AddrOrErr.takeError();
-
- unsigned SectionID = AbsoluteSymbolSection;
-
- DEBUG(dbgs() << "\tType: " << SymType << " (absolute) Name: " << Name
- << " SID: " << SectionID << " Offset: "
- << format("%p", (uintptr_t)Addr)
- << " flags: " << Flags << "\n");
- GlobalSymbolTable[Name] =
- SymbolTableEntry(SectionID, Addr, JITSymFlags);
- } else if (SymType == object::SymbolRef::ST_Function ||
- SymType == object::SymbolRef::ST_Data ||
- SymType == object::SymbolRef::ST_Unknown ||
- SymType == object::SymbolRef::ST_Other) {
-
- section_iterator SI = Obj.section_end();
- if (auto SIOrErr = I->getSection())
- SI = *SIOrErr;
- else
- return SIOrErr.takeError();
+ // Then check whether we found flags for an existing symbol during the
+ // flags lookup earlier.
+ auto FlagsI = SymbolFlags.find(Name);
+ if (FlagsI == SymbolFlags.end() ||
+ (JITSymFlags.isWeak() && !FlagsI->second.isStrong()) ||
+ (JITSymFlags.isCommon() && FlagsI->second.isCommon())) {
+ if (JITSymFlags.isWeak())
+ JITSymFlags &= ~JITSymbolFlags::Weak;
+ if (JITSymFlags.isCommon()) {
+ JITSymFlags &= ~JITSymbolFlags::Common;
+ uint32_t Align = I->getAlignment();
+ uint64_t Size = I->getCommonSize();
+ if (!CommonAlign)
+ CommonAlign = Align;
+ CommonSize = alignTo(CommonSize, Align) + Size;
+ CommonSymbolsToAllocate.push_back(*I);
+ }
+ } else
+ continue;
+ }
- if (SI == Obj.section_end())
- continue;
+ if (Flags & SymbolRef::SF_Absolute &&
+ SymType != object::SymbolRef::ST_File) {
+ uint64_t Addr = 0;
+ if (auto AddrOrErr = I->getAddress())
+ Addr = *AddrOrErr;
+ else
+ return AddrOrErr.takeError();
+
+ unsigned SectionID = AbsoluteSymbolSection;
+
+ LLVM_DEBUG(dbgs() << "\tType: " << SymType << " (absolute) Name: " << Name
+ << " SID: " << SectionID
+ << " Offset: " << format("%p", (uintptr_t)Addr)
+ << " flags: " << Flags << "\n");
+ GlobalSymbolTable[Name] = SymbolTableEntry(SectionID, Addr, JITSymFlags);
+ } else if (SymType == object::SymbolRef::ST_Function ||
+ SymType == object::SymbolRef::ST_Data ||
+ SymType == object::SymbolRef::ST_Unknown ||
+ SymType == object::SymbolRef::ST_Other) {
+
+ section_iterator SI = Obj.section_end();
+ if (auto SIOrErr = I->getSection())
+ SI = *SIOrErr;
+ else
+ return SIOrErr.takeError();
- // Get symbol offset.
- uint64_t SectOffset;
- if (auto Err = getOffset(*I, *SI, SectOffset))
- return std::move(Err);
+ if (SI == Obj.section_end())
+ continue;
- bool IsCode = SI->isText();
- unsigned SectionID;
- if (auto SectionIDOrErr = findOrEmitSection(Obj, *SI, IsCode,
- LocalSections))
- SectionID = *SectionIDOrErr;
- else
- return SectionIDOrErr.takeError();
+ // Get symbol offset.
+ uint64_t SectOffset;
+ if (auto Err = getOffset(*I, *SI, SectOffset))
+ return std::move(Err);
+
+ bool IsCode = SI->isText();
+ unsigned SectionID;
+ if (auto SectionIDOrErr =
+ findOrEmitSection(Obj, *SI, IsCode, LocalSections))
+ SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
- DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name
- << " SID: " << SectionID << " Offset: "
- << format("%p", (uintptr_t)SectOffset)
- << " flags: " << Flags << "\n");
- GlobalSymbolTable[Name] =
+ LLVM_DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name
+ << " SID: " << SectionID
+ << " Offset: " << format("%p", (uintptr_t)SectOffset)
+ << " flags: " << Flags << "\n");
+ GlobalSymbolTable[Name] =
SymbolTableEntry(SectionID, SectOffset, JITSymFlags);
- }
}
}
// Allocate common symbols
- if (auto Err = emitCommonSymbols(Obj, CommonSymbols))
+ if (auto Err = emitCommonSymbols(Obj, CommonSymbolsToAllocate, CommonSize,
+ CommonAlign))
return std::move(Err);
// Parse and process relocations
- DEBUG(dbgs() << "Parse relocations:\n");
+ LLVM_DEBUG(dbgs() << "Parse relocations:\n");
for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
SI != SE; ++SI) {
StubMap Stubs;
@@ -336,7 +362,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
else
return SectionIDOrErr.takeError();
- DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
+ LLVM_DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
for (; I != E;)
if (auto IOrErr = processRelocationRef(SectionID, I, Obj, LocalSections, Stubs))
@@ -621,45 +647,12 @@ JITSymbolFlags RuntimeDyldImpl::getJITSymbolFlags(const BasicSymbolRef &SR) {
}
Error RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
- CommonSymbolList &CommonSymbols) {
- if (CommonSymbols.empty())
+ CommonSymbolList &SymbolsToAllocate,
+ uint64_t CommonSize,
+ uint32_t CommonAlign) {
+ if (SymbolsToAllocate.empty())
return Error::success();
- uint64_t CommonSize = 0;
- uint32_t CommonAlign = CommonSymbols.begin()->getAlignment();
- CommonSymbolList SymbolsToAllocate;
-
- DEBUG(dbgs() << "Processing common symbols...\n");
-
- for (const auto &Sym : CommonSymbols) {
- StringRef Name;
- if (auto NameOrErr = Sym.getName())
- Name = *NameOrErr;
- else
- return NameOrErr.takeError();
-
- // Skip common symbols already elsewhere.
- if (GlobalSymbolTable.count(Name)) {
- DEBUG(dbgs() << "\tSkipping already emitted common symbol '" << Name
- << "'\n");
- continue;
- }
-
- if (auto Sym = Resolver.findSymbolInLogicalDylib(Name)) {
- if (!Sym.getFlags().isCommon()) {
- DEBUG(dbgs() << "\tSkipping common symbol '" << Name
- << "' in favor of stronger definition.\n");
- continue;
- }
- }
- uint32_t Align = Sym.getAlignment();
- uint64_t Size = Sym.getCommonSize();
-
- CommonSize = alignTo(CommonSize, Align) + Size;
-
- SymbolsToAllocate.push_back(Sym);
- }
-
// Allocate memory for the section
unsigned SectionID = Sections.size();
uint8_t *Addr = MemMgr.allocateDataSection(CommonSize, CommonAlign, SectionID,
@@ -671,8 +664,9 @@ Error RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
SectionEntry("<common symbols>", Addr, CommonSize, CommonSize, 0));
memset(Addr, 0, CommonSize);
- DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID << " new addr: "
- << format("%p", Addr) << " DataSize: " << CommonSize << "\n");
+ LLVM_DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID
+ << " new addr: " << format("%p", Addr)
+ << " DataSize: " << CommonSize << "\n");
// Assign the address of each symbol
for (auto &Sym : SymbolsToAllocate) {
@@ -690,8 +684,8 @@ Error RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
Offset += AlignOffset;
}
JITSymbolFlags JITSymFlags = getJITSymbolFlags(Sym);
- DEBUG(dbgs() << "Allocating common symbol " << Name << " address "
- << format("%p", Addr) << "\n");
+ LLVM_DEBUG(dbgs() << "Allocating common symbol " << Name << " address "
+ << format("%p", Addr) << "\n");
GlobalSymbolTable[Name] =
SymbolTableEntry(SectionID, Offset, JITSymFlags);
Offset += Size;
@@ -787,21 +781,22 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
DataSize &= ~(getStubAlignment() - 1);
}
- DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: " << Name
- << " obj addr: " << format("%p", pData)
- << " new addr: " << format("%p", Addr)
- << " DataSize: " << DataSize << " StubBufSize: " << StubBufSize
- << " Allocate: " << Allocate << "\n");
+ LLVM_DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: "
+ << Name << " obj addr: " << format("%p", pData)
+ << " new addr: " << format("%p", Addr) << " DataSize: "
+ << DataSize << " StubBufSize: " << StubBufSize
+ << " Allocate: " << Allocate << "\n");
} else {
// Even if we didn't load the section, we need to record an entry for it
// to handle later processing (and by 'handle' I mean don't do anything
// with these sections).
Allocate = 0;
Addr = nullptr;
- DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: " << Name
- << " obj addr: " << format("%p", data.data()) << " new addr: 0"
- << " DataSize: " << DataSize << " StubBufSize: " << StubBufSize
- << " Allocate: " << Allocate << "\n");
+ LLVM_DEBUG(
+ dbgs() << "emitSection SectionID: " << SectionID << " Name: " << Name
+ << " obj addr: " << format("%p", data.data()) << " new addr: 0"
+ << " DataSize: " << DataSize << " StubBufSize: " << StubBufSize
+ << " Allocate: " << Allocate << "\n");
}
Sections.push_back(
@@ -978,10 +973,11 @@ void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID,
// Addr is a uint64_t because we can't assume the pointer width
// of the target is the same as that of the host. Just use a generic
// "big enough" type.
- DEBUG(dbgs() << "Reassigning address for section " << SectionID << " ("
- << Sections[SectionID].getName() << "): "
- << format("0x%016" PRIx64, Sections[SectionID].getLoadAddress())
- << " -> " << format("0x%016" PRIx64, Addr) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Reassigning address for section " << SectionID << " ("
+ << Sections[SectionID].getName() << "): "
+ << format("0x%016" PRIx64, Sections[SectionID].getLoadAddress())
+ << " -> " << format("0x%016" PRIx64, Addr) << "\n");
Sections[SectionID].setLoadAddress(Addr);
}
@@ -997,14 +993,50 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs,
}
Error RuntimeDyldImpl::resolveExternalSymbols() {
+ StringMap<JITEvaluatedSymbol> ExternalSymbolMap;
+
+ // Resolution can trigger emission of more symbols, so iterate until
+ // we've resolved *everything*.
+ {
+ JITSymbolResolver::LookupSet ResolvedSymbols;
+
+ while (true) {
+ JITSymbolResolver::LookupSet NewSymbols;
+
+ for (auto &RelocKV : ExternalSymbolRelocations) {
+ StringRef Name = RelocKV.first();
+ if (!Name.empty() && !GlobalSymbolTable.count(Name) &&
+ !ResolvedSymbols.count(Name))
+ NewSymbols.insert(Name);
+ }
+
+ if (NewSymbols.empty())
+ break;
+
+ auto NewResolverResults = Resolver.lookup(NewSymbols);
+ if (!NewResolverResults)
+ return NewResolverResults.takeError();
+
+ assert(NewResolverResults->size() == NewSymbols.size() &&
+ "Should have errored on unresolved symbols");
+
+ for (auto &RRKV : *NewResolverResults) {
+ assert(!ResolvedSymbols.count(RRKV.first) && "Redundant resolution?");
+ ExternalSymbolMap.insert(RRKV);
+ ResolvedSymbols.insert(RRKV.first);
+ }
+ }
+ }
+
while (!ExternalSymbolRelocations.empty()) {
+
StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin();
StringRef Name = i->first();
if (Name.size() == 0) {
// This is an absolute symbol, use an address of zero.
- DEBUG(dbgs() << "Resolving absolute relocations."
- << "\n");
+ LLVM_DEBUG(dbgs() << "Resolving absolute relocations."
+ << "\n");
RelocationList &Relocs = i->second;
resolveRelocationList(Relocs, 0);
} else {
@@ -1012,29 +1044,10 @@ Error RuntimeDyldImpl::resolveExternalSymbols() {
JITSymbolFlags Flags;
RTDyldSymbolTable::const_iterator Loc = GlobalSymbolTable.find(Name);
if (Loc == GlobalSymbolTable.end()) {
- // This is an external symbol, try to get its address from the symbol
- // resolver.
- // First search for the symbol in this logical dylib.
- if (auto Sym = Resolver.findSymbolInLogicalDylib(Name.data())) {
- if (auto AddrOrErr = Sym.getAddress()) {
- Addr = *AddrOrErr;
- Flags = Sym.getFlags();
- } else
- return AddrOrErr.takeError();
- } else if (auto Err = Sym.takeError())
- return Err;
-
- // If that fails, try searching for an external symbol.
- if (!Addr) {
- if (auto Sym = Resolver.findSymbol(Name.data())) {
- if (auto AddrOrErr = Sym.getAddress()) {
- Addr = *AddrOrErr;
- Flags = Sym.getFlags();
- } else
- return AddrOrErr.takeError();
- } else if (auto Err = Sym.takeError())
- return Err;
- }
+ auto RRI = ExternalSymbolMap.find(Name);
+ assert(RRI != ExternalSymbolMap.end() && "No result for symbol");
+ Addr = RRI->second.getAddress();
+ Flags = RRI->second.getFlags();
// The call to getSymbolAddress may have caused additional modules to
// be loaded, which may have added new entries to the
// ExternalSymbolRelocations map. Consequently, we need to update our
@@ -1065,8 +1078,8 @@ Error RuntimeDyldImpl::resolveExternalSymbols() {
// if the target symbol is Thumb.
Addr = modifyAddressBasedOnFlags(Addr, Flags);
- DEBUG(dbgs() << "Resolving relocations Name: " << Name << "\t"
- << format("0x%lx", Addr) << "\n");
+ LLVM_DEBUG(dbgs() << "Resolving relocations Name: " << Name << "\t"
+ << format("0x%lx", Addr) << "\n");
// This list may have been updated when we called getSymbolAddress, so
// don't change this code to get the list earlier.
RelocationList &Relocs = i->second;
@@ -1095,6 +1108,7 @@ uint64_t RuntimeDyld::LoadedObjectInfo::getSectionLoadAddress(
void RuntimeDyld::MemoryManager::anchor() {}
void JITSymbolResolver::anchor() {}
+void LegacyJITSymbolResolver::anchor() {}
RuntimeDyld::RuntimeDyld(RuntimeDyld::MemoryManager &MemMgr,
JITSymbolResolver &Resolver)
@@ -1185,6 +1199,12 @@ JITEvaluatedSymbol RuntimeDyld::getSymbol(StringRef Name) const {
return Dyld->getSymbol(Name);
}
+std::map<StringRef, JITEvaluatedSymbol> RuntimeDyld::getSymbolTable() const {
+ if (!Dyld)
+ return std::map<StringRef, JITEvaluatedSymbol>();
+ return Dyld->getSymbolTable();
+}
+
void RuntimeDyld::resolveRelocations() { Dyld->resolveRelocations(); }
void RuntimeDyld::reassignSectionAddress(unsigned SectionID, uint64_t Addr) {
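Illustrative sketch of the new getSymbolTable() accessor: it returns every emitted global definition, which is how the ORC layer above resolves a whole object in one shot. The dumping helper below is invented for demonstration.
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Support/raw_ostream.h"
// Sketch: print every symbol the given RuntimeDyld instance currently defines.
static void dumpJITSymbols(const llvm::RuntimeDyld &Dyld) {
  for (const auto &KV : Dyld.getSymbolTable())
    llvm::errs() << KV.first << " -> " << KV.second.getAddress() << "\n";
}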
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 5bc7434e703f..fa8906869b3a 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -688,12 +688,13 @@ RuntimeDyldCheckerImpl::RuntimeDyldCheckerImpl(RuntimeDyld &RTDyld,
bool RuntimeDyldCheckerImpl::check(StringRef CheckExpr) const {
CheckExpr = CheckExpr.trim();
- DEBUG(dbgs() << "RuntimeDyldChecker: Checking '" << CheckExpr << "'...\n");
+ LLVM_DEBUG(dbgs() << "RuntimeDyldChecker: Checking '" << CheckExpr
+ << "'...\n");
RuntimeDyldCheckerExprEval P(*this, ErrStream);
bool Result = P.evaluate(CheckExpr);
(void)Result;
- DEBUG(dbgs() << "RuntimeDyldChecker: '" << CheckExpr << "' "
- << (Result ? "passed" : "FAILED") << ".\n");
+ LLVM_DEBUG(dbgs() << "RuntimeDyldChecker: '" << CheckExpr << "' "
+ << (Result ? "passed" : "FAILED") << ".\n");
return Result;
}
@@ -731,7 +732,14 @@ bool RuntimeDyldCheckerImpl::checkAllRulesInBuffer(StringRef RulePrefix,
bool RuntimeDyldCheckerImpl::isSymbolValid(StringRef Symbol) const {
if (getRTDyld().getSymbol(Symbol))
return true;
- return !!getRTDyld().Resolver.findSymbol(Symbol);
+ JITSymbolResolver::LookupSet Symbols({Symbol});
+ auto Result = getRTDyld().Resolver.lookup(Symbols);
+ if (!Result) {
+ logAllUnhandledErrors(Result.takeError(), errs(), "RTDyldChecker: ");
+ return false;
+ }
+ assert(Result->count(Symbol) && "Missing symbol result");
+ return true;
}
uint64_t RuntimeDyldCheckerImpl::getSymbolLocalAddr(StringRef Symbol) const {
@@ -742,7 +750,16 @@ uint64_t RuntimeDyldCheckerImpl::getSymbolLocalAddr(StringRef Symbol) const {
uint64_t RuntimeDyldCheckerImpl::getSymbolRemoteAddr(StringRef Symbol) const {
if (auto InternalSymbol = getRTDyld().getSymbol(Symbol))
return InternalSymbol.getAddress();
- return cantFail(getRTDyld().Resolver.findSymbol(Symbol).getAddress());
+
+ JITSymbolResolver::LookupSet Symbols({Symbol});
+ auto Result = getRTDyld().Resolver.lookup(Symbols);
+ if (!Result) {
+ logAllUnhandledErrors(Result.takeError(), errs(), "RTDyldChecker: ");
+ return 0;
+ }
+ auto I = Result->find(Symbol);
+ assert(I != Result->end() && "Missing symbol result");
+ return I->second.getAddress();
}
uint64_t RuntimeDyldCheckerImpl::readMemoryAtAddr(uint64_t SrcAddr,
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 2c57eee191db..cc6729d21320 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -65,7 +65,7 @@ template <class ELFT> class DyldELFObject : public ELFObjectFile<ELFT> {
typedef Elf_Ehdr_Impl<ELFT> Elf_Ehdr;
- typedef typename ELFDataTypeTypedefHelper<ELFT>::value_type addr_type;
+ typedef typename ELFT::uint addr_type;
DyldELFObject(ELFObjectFile<ELFT> &&Obj);
@@ -148,8 +148,8 @@ template <typename ELFT>
static Expected<std::unique_ptr<DyldELFObject<ELFT>>>
createRTDyldELFObject(MemoryBufferRef Buffer, const ObjectFile &SourceObject,
const LoadedELFObjectInfo &L) {
- typedef typename ELFFile<ELFT>::Elf_Shdr Elf_Shdr;
- typedef typename ELFDataTypeTypedefHelper<ELFT>::value_type addr_type;
+ typedef typename ELFT::Shdr Elf_Shdr;
+ typedef typename ELFT::uint addr_type;
Expected<std::unique_ptr<DyldELFObject<ELFT>>> ObjOrErr =
DyldELFObject<ELFT>::create(Buffer);
@@ -273,8 +273,8 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
case ELF::R_X86_64_64: {
support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) =
Value + Addend;
- DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at "
- << format("%p\n", Section.getAddressWithOffset(Offset)));
+ LLVM_DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at "
+ << format("%p\n", Section.getAddressWithOffset(Offset)));
break;
}
case ELF::R_X86_64_32:
@@ -286,8 +286,8 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
uint32_t TruncatedAddr = (Value & 0xFFFFFFFF);
support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) =
TruncatedAddr;
- DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr) << " at "
- << format("%p\n", Section.getAddressWithOffset(Offset)));
+ LLVM_DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr) << " at "
+ << format("%p\n", Section.getAddressWithOffset(Offset)));
break;
}
case ELF::R_X86_64_PC8: {
@@ -312,6 +312,22 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
int64_t RealOffset = Value + Addend - FinalAddress;
support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) =
RealOffset;
+ LLVM_DEBUG(dbgs() << "Writing " << format("%p", RealOffset) << " at "
+ << format("%p\n", FinalAddress));
+ break;
+ }
+ case ELF::R_X86_64_GOTOFF64: {
+ // Compute Value - GOTBase.
+ uint64_t GOTBase = 0;
+ for (const auto &Section : Sections) {
+ if (Section.getName() == ".got") {
+ GOTBase = Section.getLoadAddressWithOffset(0);
+ break;
+ }
+ }
+ assert(GOTBase != 0 && "missing GOT");
+ int64_t GOTOffset = Value - GOTBase + Addend;
+ support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = GOTOffset;
break;
}
}
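For reference, the new R_X86_64_GOTOFF64 case computes the standard S + A - GOT value against the load address of the .got section. A worked example with made-up addresses:

// Illustrative numbers only, not taken from the commit.
uint64_t GOTBase = 0x4000;                        // load address of .got
uint64_t Value   = 0x7010;                        // resolved symbol address (S)
int64_t  Addend  = 8;                             // relocation addend (A)
int64_t  GOTOffset = Value - GOTBase + Addend;    // S + A - GOT = 0x3018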
@@ -326,6 +342,9 @@ void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section,
Value + Addend;
break;
}
+ // Handle R_386_PLT32 like R_386_PC32 since it should be able to
+ // reach any 32 bit address.
+ case ELF::R_386_PLT32:
case ELF::R_386_PC32: {
uint32_t FinalAddress =
Section.getLoadAddressWithOffset(Offset) & 0xFFFFFFFF;
@@ -351,12 +370,12 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
// Data should use target endian. Code should always use little endian.
bool isBE = Arch == Triple::aarch64_be;
- DEBUG(dbgs() << "resolveAArch64Relocation, LocalAddress: 0x"
- << format("%llx", Section.getAddressWithOffset(Offset))
- << " FinalAddress: 0x" << format("%llx", FinalAddress)
- << " Value: 0x" << format("%llx", Value) << " Type: 0x"
- << format("%x", Type) << " Addend: 0x" << format("%llx", Addend)
- << "\n");
+ LLVM_DEBUG(dbgs() << "resolveAArch64Relocation, LocalAddress: 0x"
+ << format("%llx", Section.getAddressWithOffset(Offset))
+ << " FinalAddress: 0x" << format("%llx", FinalAddress)
+ << " Value: 0x" << format("%llx", Value) << " Type: 0x"
+ << format("%x", Type) << " Addend: 0x"
+ << format("%llx", Addend) << "\n");
switch (Type) {
default:
@@ -471,11 +490,12 @@ void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section,
uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset) & 0xFFFFFFFF;
Value += Addend;
- DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: "
- << Section.getAddressWithOffset(Offset)
- << " FinalAddress: " << format("%p", FinalAddress) << " Value: "
- << format("%x", Value) << " Type: " << format("%x", Type)
- << " Addend: " << format("%x", Addend) << "\n");
+ LLVM_DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: "
+ << Section.getAddressWithOffset(Offset)
+ << " FinalAddress: " << format("%p", FinalAddress)
+ << " Value: " << format("%x", Value)
+ << " Type: " << format("%x", Type)
+ << " Addend: " << format("%x", Addend) << "\n");
switch (Type) {
default:
@@ -526,10 +546,11 @@ void RuntimeDyldELF::setMipsABI(const ObjectFile &Obj) {
IsMipsN64ABI = false;
return;
}
- unsigned AbiVariant;
- Obj.getPlatformFlags(AbiVariant);
- IsMipsO32ABI = AbiVariant & ELF::EF_MIPS_ABI_O32;
- IsMipsN32ABI = AbiVariant & ELF::EF_MIPS_ABI2;
+ if (auto *E = dyn_cast<ELFObjectFileBase>(&Obj)) {
+ unsigned AbiVariant = E->getPlatformFlags();
+ IsMipsO32ABI = AbiVariant & ELF::EF_MIPS_ABI_O32;
+ IsMipsN32ABI = AbiVariant & ELF::EF_MIPS_ABI2;
+ }
IsMipsN64ABI = Obj.getFileFormatName().equals("ELF64-mips");
}
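The hunk above reflects an API change: getPlatformFlags() now lives on ELFObjectFileBase and returns the ELF e_flags word instead of filling an out-parameter. A minimal sketch of the new call pattern, with illustrative variable names:

if (const auto *ELFObj = dyn_cast<ELFObjectFileBase>(&Obj)) {
  unsigned Flags = ELFObj->getPlatformFlags();   // was: Obj.getPlatformFlags(Flags)
  bool IsO32 = Flags & ELF::EF_MIPS_ABI_O32;
  bool IsN32 = Flags & ELF::EF_MIPS_ABI2;
  (void)IsO32;
  (void)IsN32;
}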
@@ -718,9 +739,11 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
writeInt16BE(LocalAddress, applyPPClo(Value + Addend) & ~3);
break;
case ELF::R_PPC64_ADDR16_HI:
+ case ELF::R_PPC64_ADDR16_HIGH:
writeInt16BE(LocalAddress, applyPPChi(Value + Addend));
break;
case ELF::R_PPC64_ADDR16_HA:
+ case ELF::R_PPC64_ADDR16_HIGHA:
writeInt16BE(LocalAddress, applyPPCha(Value + Addend));
break;
case ELF::R_PPC64_ADDR16_HIGHER:
@@ -767,8 +790,9 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
int64_t delta = static_cast<int64_t>(Value - FinalAddress + Addend);
if (SignExtend64<26>(delta) != delta)
llvm_unreachable("Relocation R_PPC64_REL24 overflow");
- // Generates a 'bl <address>' instruction
- writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC));
+ // We preserve bits other than LI field, i.e. PO and AA/LK fields.
+ uint32_t Inst = readBytesUnaligned(LocalAddress, 4);
+ writeInt32BE(LocalAddress, (Inst & 0xFC000003) | (delta & 0x03FFFFFC));
} break;
case ELF::R_PPC64_REL32: {
uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
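The R_PPC64_REL24 change above stops rebuilding the whole branch instruction and instead preserves the primary opcode and AA/LK bits, rewriting only the 26-bit LI field. A worked example with made-up values:

// Illustrative values only.
uint32_t Inst  = 0x48000001;   // existing 'bl' (PO=18, LK=1)
int64_t  delta = 0x1234;       // already verified to fit in 26 signed bits
uint32_t Patched = (Inst & 0xFC000003)                 // keep PO and AA/LK
                 | (uint32_t(delta) & 0x03FFFFFC);     // new LI field
// Patched == 0x48001235: still a 'bl', now targeting the new displacement.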
@@ -855,16 +879,16 @@ void RuntimeDyldELF::resolveBPFRelocation(const SectionEntry &Section,
break;
case ELF::R_BPF_64_64: {
write(isBE, Section.getAddressWithOffset(Offset), Value + Addend);
- DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at "
- << format("%p\n", Section.getAddressWithOffset(Offset)));
+ LLVM_DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at "
+ << format("%p\n", Section.getAddressWithOffset(Offset)));
break;
}
case ELF::R_BPF_64_32: {
Value += Addend;
assert(Value <= UINT32_MAX);
write(isBE, Section.getAddressWithOffset(Offset), static_cast<uint32_t>(Value));
- DEBUG(dbgs() << "Writing " << format("%p", Value) << " at "
- << format("%p\n", Section.getAddressWithOffset(Offset)));
+ LLVM_DEBUG(dbgs() << "Writing " << format("%p", Value) << " at "
+ << format("%p\n", Section.getAddressWithOffset(Offset)));
break;
}
}
@@ -1021,7 +1045,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
relocation_iterator RelI,
StubMap &Stubs) {
- DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
+ LLVM_DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
SectionEntry &Section = Sections[SectionID];
uint64_t Offset = RelI->getOffset();
@@ -1032,10 +1056,10 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
resolveRelocation(Section, Offset,
(uint64_t)Section.getAddressWithOffset(i->second),
RelType, 0);
- DEBUG(dbgs() << " Stub function found\n");
+ LLVM_DEBUG(dbgs() << " Stub function found\n");
} else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
// Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
+ LLVM_DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
uint8_t *StubTargetAddr = createStubFunction(
Section.getAddressWithOffset(Section.getStubOffset()));
@@ -1092,8 +1116,8 @@ RuntimeDyldELF::processRelocationRef(
else
return TargetNameOrErr.takeError();
}
- DEBUG(dbgs() << "\t\tRelType: " << RelType << " Addend: " << Addend
- << " TargetName: " << TargetName << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tRelType: " << RelType << " Addend: " << Addend
+ << " TargetName: " << TargetName << "\n");
RelocationValueRef Value;
// First search for the symbol in the local symbol table
SymbolRef::Type SymType = SymbolRef::ST_Unknown;
@@ -1134,7 +1158,7 @@ RuntimeDyldELF::processRelocationRef(
section_iterator si = *SectionOrErr;
if (si == Obj.section_end())
llvm_unreachable("Symbol section not found, bad object file format!");
- DEBUG(dbgs() << "\t\tThis is section symbol\n");
+ LLVM_DEBUG(dbgs() << "\t\tThis is section symbol\n");
bool isCode = si->isText();
if (auto SectionIDOrErr = findOrEmitSection(Obj, (*si), isCode,
ObjSectionToID))
@@ -1166,8 +1190,8 @@ RuntimeDyldELF::processRelocationRef(
uint64_t Offset = RelI->getOffset();
- DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset
- << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset
+ << "\n");
if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be)) {
if (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26) {
resolveAArch64Branch(SectionID, Value, RelI, Stubs);
@@ -1189,7 +1213,7 @@ RuntimeDyldELF::processRelocationRef(
if (RelType == ELF::R_ARM_PC24 || RelType == ELF::R_ARM_CALL ||
RelType == ELF::R_ARM_JUMP24) {
// This is an ARM branch relocation, need to use a stub function.
- DEBUG(dbgs() << "\t\tThis is an ARM branch relocation.\n");
+ LLVM_DEBUG(dbgs() << "\t\tThis is an ARM branch relocation.\n");
SectionEntry &Section = Sections[SectionID];
// Look for an existing stub.
@@ -1199,10 +1223,10 @@ RuntimeDyldELF::processRelocationRef(
Section, Offset,
reinterpret_cast<uint64_t>(Section.getAddressWithOffset(i->second)),
RelType, 0);
- DEBUG(dbgs() << " Stub function found\n");
+ LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
+ LLVM_DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
uint8_t *StubTargetAddr = createStubFunction(
Section.getAddressWithOffset(Section.getStubOffset()));
@@ -1237,7 +1261,7 @@ RuntimeDyldELF::processRelocationRef(
uint32_t Opcode = readBytesUnaligned(Placeholder, 4);
if (RelType == ELF::R_MIPS_26) {
// This is an Mips branch relocation, need to use a stub function.
- DEBUG(dbgs() << "\t\tThis is a Mips branch relocation.");
+ LLVM_DEBUG(dbgs() << "\t\tThis is a Mips branch relocation.");
SectionEntry &Section = Sections[SectionID];
// Extract the addend from the instruction.
@@ -1252,14 +1276,13 @@ RuntimeDyldELF::processRelocationRef(
if (i != Stubs.end()) {
RelocationEntry RE(SectionID, Offset, RelType, i->second);
addRelocationForSection(RE, SectionID);
- DEBUG(dbgs() << " Stub function found\n");
+ LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
+ LLVM_DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
- unsigned AbiVariant;
- O.getPlatformFlags(AbiVariant);
+ unsigned AbiVariant = Obj.getPlatformFlags();
uint8_t *StubTargetAddr = createStubFunction(
Section.getAddressWithOffset(Section.getStubOffset()), AbiVariant);
@@ -1340,7 +1363,7 @@ RuntimeDyldELF::processRelocationRef(
addRelocationForSection(RE, Value.SectionID);
} else if (RelType == ELF::R_MIPS_26) {
// This is an Mips branch relocation, need to use a stub function.
- DEBUG(dbgs() << "\t\tThis is a Mips branch relocation.");
+ LLVM_DEBUG(dbgs() << "\t\tThis is a Mips branch relocation.");
SectionEntry &Section = Sections[SectionID];
// Look up for existing stub.
@@ -1348,14 +1371,13 @@ RuntimeDyldELF::processRelocationRef(
if (i != Stubs.end()) {
RelocationEntry RE(SectionID, Offset, RelType, i->second);
addRelocationForSection(RE, SectionID);
- DEBUG(dbgs() << " Stub function found\n");
+ LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
+ LLVM_DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
- unsigned AbiVariant;
- O.getPlatformFlags(AbiVariant);
+ unsigned AbiVariant = Obj.getPlatformFlags();
uint8_t *StubTargetAddr = createStubFunction(
Section.getAddressWithOffset(Section.getStubOffset()), AbiVariant);
@@ -1412,8 +1434,7 @@ RuntimeDyldELF::processRelocationRef(
} else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
if (RelType == ELF::R_PPC64_REL24) {
// Determine ABI variant in use for this object.
- unsigned AbiVariant;
- Obj.getPlatformFlags(AbiVariant);
+ unsigned AbiVariant = Obj.getPlatformFlags();
AbiVariant &= ELF::EF_PPC64_ABI;
// A PPC branch relocation will need a stub function if the target is
// an external symbol (either Value.SymbolName is set, or SymType is
@@ -1461,10 +1482,10 @@ RuntimeDyldELF::processRelocationRef(
reinterpret_cast<uint64_t>(
Section.getAddressWithOffset(i->second)),
RelType, 0);
- DEBUG(dbgs() << " Stub function found\n");
+ LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
+ LLVM_DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
uint8_t *StubTargetAddr = createStubFunction(
Section.getAddressWithOffset(Section.getStubOffset()),
@@ -1581,7 +1602,7 @@ RuntimeDyldELF::processRelocationRef(
// parts of the stub separately. However, as things stand, we allocate
// a stub for every relocation, so using a GOT in JIT code should be
// no less space efficient than using an explicit constant pool.
- DEBUG(dbgs() << "\t\tThis is a SystemZ indirect relocation.");
+ LLVM_DEBUG(dbgs() << "\t\tThis is a SystemZ indirect relocation.");
SectionEntry &Section = Sections[SectionID];
// Look for an existing stub.
@@ -1589,10 +1610,10 @@ RuntimeDyldELF::processRelocationRef(
uintptr_t StubAddress;
if (i != Stubs.end()) {
StubAddress = uintptr_t(Section.getAddressWithOffset(i->second));
- DEBUG(dbgs() << " Stub function found\n");
+ LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
+ LLVM_DEBUG(dbgs() << " Create a new stub function\n");
uintptr_t BaseAddress = uintptr_t(Section.getAddress());
uintptr_t StubAlignment = getStubAlignment();
@@ -1643,10 +1664,10 @@ RuntimeDyldELF::processRelocationRef(
uintptr_t StubAddress;
if (i != Stubs.end()) {
StubAddress = uintptr_t(Section.getAddress()) + i->second;
- DEBUG(dbgs() << " Stub function found\n");
+ LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function (equivalent to a PLT entry).
- DEBUG(dbgs() << " Create a new stub function\n");
+ LLVM_DEBUG(dbgs() << " Create a new stub function\n");
uintptr_t BaseAddress = uintptr_t(Section.getAddress());
uintptr_t StubAlignment = getStubAlignment();
@@ -1695,6 +1716,29 @@ RuntimeDyldELF::processRelocationRef(
addRelocationForSymbol(RE, Value.SymbolName);
else
addRelocationForSection(RE, Value.SectionID);
+ } else if (RelType == ELF::R_X86_64_GOT64) {
+ // Fill in a 64-bit GOT offset.
+ uint64_t GOTOffset = allocateGOTEntries(1);
+ resolveRelocation(Sections[SectionID], Offset, GOTOffset,
+ ELF::R_X86_64_64, 0);
+
+ // Fill in the value of the symbol we're targeting into the GOT
+ RelocationEntry RE =
+ computeGOTOffsetRE(GOTOffset, Value.Offset, ELF::R_X86_64_64);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+ } else if (RelType == ELF::R_X86_64_GOTPC64) {
+ // Materialize the address of the base of the GOT relative to the PC.
+ // This doesn't create a GOT entry, but it does mean we need a GOT
+ // section.
+ (void)allocateGOTEntries(0);
+ resolveGOTOffsetRelocation(SectionID, Offset, Addend, ELF::R_X86_64_PC64);
+ } else if (RelType == ELF::R_X86_64_GOTOFF64) {
+ // GOTOFF relocations ultimately require a section difference relocation.
+ (void)allocateGOTEntries(0);
+ processSimpleRelocation(SectionID, Offset, RelType, Value);
} else if (RelType == ELF::R_X86_64_PC32) {
Value.Addend += support::ulittle32_t::ref(computePlaceholderAddress(SectionID, Offset));
processSimpleRelocation(SectionID, Offset, RelType, Value);
@@ -1866,6 +1910,7 @@ bool RuntimeDyldELF::relocationNeedsGot(const RelocationRef &R) const {
if (Arch == Triple::x86_64)
return RelTy == ELF::R_X86_64_GOTPCREL ||
RelTy == ELF::R_X86_64_GOTPCRELX ||
+ RelTy == ELF::R_X86_64_GOT64 ||
RelTy == ELF::R_X86_64_REX_GOTPCRELX;
return false;
}
@@ -1882,6 +1927,9 @@ bool RuntimeDyldELF::relocationNeedsStub(const RelocationRef &R) const {
case ELF::R_X86_64_GOTPCREL:
case ELF::R_X86_64_GOTPCRELX:
case ELF::R_X86_64_REX_GOTPCRELX:
+ case ELF::R_X86_64_GOTPC64:
+ case ELF::R_X86_64_GOT64:
+ case ELF::R_X86_64_GOTOFF64:
case ELF::R_X86_64_PC32:
case ELF::R_X86_64_PC64:
case ELF::R_X86_64_64:
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index e046a8504e9f..4d7cc36d0666 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -87,7 +87,7 @@ public:
uint8_t *getAddress() const { return Address; }
- /// \brief Return the address of this section with an offset.
+ /// Return the address of this section with an offset.
uint8_t *getAddressWithOffset(unsigned OffsetBytes) const {
assert(OffsetBytes <= AllocationSize && "Offset out of bounds!");
return Address + OffsetBytes;
@@ -98,7 +98,7 @@ public:
uint64_t getLoadAddress() const { return LoadAddress; }
void setLoadAddress(uint64_t LA) { LoadAddress = LA; }
- /// \brief Return the load address of this section with an offset.
+ /// Return the load address of this section with an offset.
uint64_t getLoadAddressWithOffset(unsigned OffsetBytes) const {
assert(OffsetBytes <= AllocationSize && "Offset out of bounds!");
return LoadAddress + OffsetBytes;
@@ -217,7 +217,7 @@ public:
}
};
-/// @brief Symbol info for RuntimeDyld.
+/// Symbol info for RuntimeDyld.
class SymbolTableEntry {
public:
SymbolTableEntry() = default;
@@ -381,13 +381,14 @@ protected:
return Addr;
}
- /// \brief Given the common symbols discovered in the object file, emit a
+ /// Given the common symbols discovered in the object file, emit a
/// new section for them and update the symbol mappings in the object and
/// symbol table.
Error emitCommonSymbols(const ObjectFile &Obj,
- CommonSymbolList &CommonSymbols);
+ CommonSymbolList &CommonSymbols, uint64_t CommonSize,
+ uint32_t CommonAlign);
- /// \brief Emits section data from the object file to the MemoryManager.
+ /// Emits section data from the object file to the MemoryManager.
/// \param IsCode if it's true then allocateCodeSection() will be
/// used for emits, else allocateDataSection() will be used.
/// \return SectionID.
@@ -395,7 +396,7 @@ protected:
const SectionRef &Section,
bool IsCode);
- /// \brief Find Section in LocalSections. If the secton is not found - emit
+ /// Find Section in LocalSections. If the section is not found, emit
/// it and store in LocalSections.
/// \param IsCode if it's true then allocateCodeSection() will be
/// used for emmits, else allocateDataSection() will be used.
@@ -404,26 +405,26 @@ protected:
const SectionRef &Section, bool IsCode,
ObjSectionToIDMap &LocalSections);
- // \brief Add a relocation entry that uses the given section.
+ // Add a relocation entry that uses the given section.
void addRelocationForSection(const RelocationEntry &RE, unsigned SectionID);
- // \brief Add a relocation entry that uses the given symbol. This symbol may
+ // Add a relocation entry that uses the given symbol. This symbol may
// be found in the global symbol table, or it may be external.
void addRelocationForSymbol(const RelocationEntry &RE, StringRef SymbolName);
- /// \brief Emits long jump instruction to Addr.
+ /// Emits long jump instruction to Addr.
/// \return Pointer to the memory area for emitting target address.
uint8_t *createStubFunction(uint8_t *Addr, unsigned AbiVariant = 0);
- /// \brief Resolves relocations from Relocs list with address from Value.
+ /// Resolves relocations from Relocs list with address from Value.
void resolveRelocationList(const RelocationList &Relocs, uint64_t Value);
- /// \brief A object file specific relocation resolver
+ /// An object file specific relocation resolver
/// \param RE The relocation to be resolved
/// \param Value Target symbol address to apply the relocation action
virtual void resolveRelocation(const RelocationEntry &RE, uint64_t Value) = 0;
- /// \brief Parses one or more object file relocations (some object files use
+ /// Parses one or more object file relocations (some object files use
/// relocation pairs) and stores it to Relocations or SymbolRelocations
/// (this depends on the object file type).
/// \return Iterator to the next relocation that needs to be parsed.
@@ -432,35 +433,35 @@ protected:
const ObjectFile &Obj, ObjSectionToIDMap &ObjSectionToID,
StubMap &Stubs) = 0;
- /// \brief Resolve relocations to external symbols.
+ /// Resolve relocations to external symbols.
Error resolveExternalSymbols();
- // \brief Compute an upper bound of the memory that is required to load all
+ // Compute an upper bound of the memory that is required to load all
// sections
Error computeTotalAllocSize(const ObjectFile &Obj,
uint64_t &CodeSize, uint32_t &CodeAlign,
uint64_t &RODataSize, uint32_t &RODataAlign,
uint64_t &RWDataSize, uint32_t &RWDataAlign);
- // \brief Compute GOT size
+ // Compute GOT size
unsigned computeGOTSize(const ObjectFile &Obj);
- // \brief Compute the stub buffer size required for a section
+ // Compute the stub buffer size required for a section
unsigned computeSectionStubBufSize(const ObjectFile &Obj,
const SectionRef &Section);
- // \brief Implementation of the generic part of the loadObject algorithm.
+ // Implementation of the generic part of the loadObject algorithm.
Expected<ObjSectionToIDMap> loadObjectImpl(const object::ObjectFile &Obj);
- // \brief Return size of Global Offset Table (GOT) entry
+ // Return size of Global Offset Table (GOT) entry
virtual size_t getGOTEntrySize() { return 0; }
- // \brief Return true if the relocation R may require allocating a GOT entry.
+ // Return true if the relocation R may require allocating a GOT entry.
virtual bool relocationNeedsGot(const RelocationRef &R) const {
return false;
}
- // \brief Return true if the relocation R may require allocating a stub.
+ // Return true if the relocation R may require allocating a stub.
virtual bool relocationNeedsStub(const RelocationRef &R) const {
return true; // Conservative answer
}
@@ -518,6 +519,21 @@ public:
return JITEvaluatedSymbol(TargetAddr, SymEntry.getFlags());
}
+ std::map<StringRef, JITEvaluatedSymbol> getSymbolTable() const {
+ std::map<StringRef, JITEvaluatedSymbol> Result;
+
+ for (auto &KV : GlobalSymbolTable) {
+ auto SectionID = KV.second.getSectionID();
+ uint64_t SectionAddr = 0;
+ if (SectionID != AbsoluteSymbolSection)
+ SectionAddr = getSectionLoadAddress(SectionID);
+ Result[KV.first()] =
+ JITEvaluatedSymbol(SectionAddr + KV.second.getOffset(), KV.second.getFlags());
+ }
+
+ return Result;
+ }
+
void resolveRelocations();
void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
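The new getSymbolTable() accessor snapshots every entry of the global symbol table with its final load address applied. A sketch of how a caller might dump it, assuming a RuntimeDyldImpl-derived instance named Dyld (hypothetical):

for (const auto &KV : Dyld.getSymbolTable())
  dbgs() << KV.first << " @ "
         << format("0x%016" PRIx64, KV.second.getAddress()) << "\n";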
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index b0561f68edb3..c5a215c83331 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -196,10 +196,10 @@ Error RuntimeDyldMachO::populateIndirectSymbolPointersSection(
assert((PTSectionSize % PTEntrySize) == 0 &&
"Pointers section does not contain a whole number of stubs?");
- DEBUG(dbgs() << "Populating pointer table section "
- << Sections[PTSectionID].getName() << ", Section ID "
- << PTSectionID << ", " << NumPTEntries << " entries, "
- << PTEntrySize << " bytes each:\n");
+ LLVM_DEBUG(dbgs() << "Populating pointer table section "
+ << Sections[PTSectionID].getName() << ", Section ID "
+ << PTSectionID << ", " << NumPTEntries << " entries, "
+ << PTEntrySize << " bytes each:\n");
for (unsigned i = 0; i < NumPTEntries; ++i) {
unsigned SymbolIndex =
@@ -210,8 +210,8 @@ Error RuntimeDyldMachO::populateIndirectSymbolPointersSection(
IndirectSymbolName = *IndirectSymbolNameOrErr;
else
return IndirectSymbolNameOrErr.takeError();
- DEBUG(dbgs() << " " << IndirectSymbolName << ": index " << SymbolIndex
- << ", PT offset: " << PTEntryOffset << "\n");
+ LLVM_DEBUG(dbgs() << " " << IndirectSymbolName << ": index " << SymbolIndex
+ << ", PT offset: " << PTEntryOffset << "\n");
RelocationEntry RE(PTSectionID, PTEntryOffset,
MachO::GENERIC_RELOC_VANILLA, 0, false, 2);
addRelocationForSymbol(RE, IndirectSymbolName);
@@ -275,8 +275,8 @@ unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(uint8_t *P,
int64_t DeltaForEH) {
typedef typename Impl::TargetPtrT TargetPtrT;
- DEBUG(dbgs() << "Processing FDE: Delta for text: " << DeltaForText
- << ", Delta for EH: " << DeltaForEH << "\n");
+ LLVM_DEBUG(dbgs() << "Processing FDE: Delta for text: " << DeltaForText
+ << ", Delta for EH: " << DeltaForEH << "\n");
uint32_t Length = readBytesUnaligned(P, 4);
P += 4;
uint8_t *Ret = P + Length;
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
index 04678f224466..dd65051edad7 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
@@ -80,9 +80,9 @@ public:
SmallString<32> RelTypeName;
RelI->getTypeName(RelTypeName);
#endif
- DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
- << " RelType: " << RelTypeName << " TargetName: " << TargetName
- << " Addend " << Addend << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
+ << " RelType: " << RelTypeName << " TargetName: "
+ << TargetName << " Addend " << Addend << "\n");
unsigned TargetSectionID = -1;
if (Section == Obj.section_end()) {
@@ -145,10 +145,11 @@ public:
: Sections[RE.Sections.SectionA].getLoadAddressWithOffset(
RE.Addend);
assert(Result <= UINT32_MAX && "relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_I386_DIR32"
- << " TargetSection: " << RE.Sections.SectionA
- << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_I386_DIR32"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result)
+ << '\n');
writeBytesUnaligned(Result, Target, 4);
break;
}
@@ -159,10 +160,11 @@ public:
Sections[RE.Sections.SectionA].getLoadAddressWithOffset(RE.Addend) -
Sections[0].getLoadAddress();
assert(Result <= UINT32_MAX && "relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_I386_DIR32NB"
- << " TargetSection: " << RE.Sections.SectionA
- << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_I386_DIR32NB"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result)
+ << '\n');
writeBytesUnaligned(Result, Target, 4);
break;
}
@@ -176,10 +178,11 @@ public:
"relocation overflow");
assert(static_cast<int64_t>(Result) >= INT32_MIN &&
"relocation underflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_I386_REL32"
- << " TargetSection: " << RE.Sections.SectionA
- << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_I386_REL32"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result)
+ << '\n');
writeBytesUnaligned(Result, Target, 4);
break;
}
@@ -187,18 +190,18 @@ public:
// 16-bit section index of the section that contains the target.
assert(static_cast<uint32_t>(RE.SectionID) <= UINT16_MAX &&
"relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_I386_SECTION Value: " << RE.SectionID
- << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_I386_SECTION Value: "
+ << RE.SectionID << '\n');
writeBytesUnaligned(RE.SectionID, Target, 2);
break;
case COFF::IMAGE_REL_I386_SECREL:
// 32-bit offset of the target from the beginning of its section.
assert(static_cast<uint64_t>(RE.Addend) <= UINT32_MAX &&
"relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_I386_SECREL Value: " << RE.Addend
- << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_I386_SECREL Value: "
+ << RE.Addend << '\n');
writeBytesUnaligned(RE.Addend, Target, 4);
break;
default:
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index 9000435764df..729ea1ec48a4 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -97,9 +97,9 @@ public:
SmallString<32> RelTypeName;
RelI->getTypeName(RelTypeName);
#endif
- DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
- << " RelType: " << RelTypeName << " TargetName: " << TargetName
- << " Addend " << Addend << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
+ << " RelType: " << RelTypeName << " TargetName: "
+ << TargetName << " Addend " << Addend << "\n");
unsigned TargetSectionID = -1;
if (Section == Obj.section_end()) {
@@ -187,10 +187,11 @@ public:
: Sections[RE.Sections.SectionA].getLoadAddressWithOffset(RE.Addend);
Result |= ISASelectionBit;
assert(Result <= UINT32_MAX && "relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_ADDR32"
- << " TargetSection: " << RE.Sections.SectionA
- << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_ADDR32"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result)
+ << '\n');
writeBytesUnaligned(Result, Target, 4);
break;
}
@@ -200,10 +201,11 @@ public:
uint64_t Result = Sections[RE.Sections.SectionA].getLoadAddress() -
Sections[0].getLoadAddress() + RE.Addend;
assert(Result <= UINT32_MAX && "relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_ADDR32NB"
- << " TargetSection: " << RE.Sections.SectionA
- << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_ADDR32NB"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result)
+ << '\n');
Result |= ISASelectionBit;
writeBytesUnaligned(Result, Target, 4);
break;
@@ -212,18 +214,18 @@ public:
// 16-bit section index of the section that contains the target.
assert(static_cast<uint32_t>(RE.SectionID) <= UINT16_MAX &&
"relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_SECTION Value: " << RE.SectionID
- << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_SECTION Value: "
+ << RE.SectionID << '\n');
writeBytesUnaligned(RE.SectionID, Target, 2);
break;
case COFF::IMAGE_REL_ARM_SECREL:
// 32-bit offset of the target from the beginning of its section.
assert(static_cast<uint64_t>(RE.Addend) <= UINT32_MAX &&
"relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_SECREL Value: " << RE.Addend
- << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_SECREL Value: " << RE.Addend
+ << '\n');
writeBytesUnaligned(RE.Addend, Target, 2);
break;
case COFF::IMAGE_REL_ARM_MOV32T: {
@@ -231,10 +233,11 @@ public:
uint64_t Result =
Sections[RE.Sections.SectionA].getLoadAddressWithOffset(RE.Addend);
assert(Result <= UINT32_MAX && "relocation overflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_MOV32T"
- << " TargetSection: " << RE.Sections.SectionA
- << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_MOV32T"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result)
+ << '\n');
// MOVW(T3): |11110|i|10|0|1|0|0|imm4|0|imm3|Rd|imm8|
// imm32 = zext imm4:i:imm3:imm8
@@ -262,9 +265,9 @@ public:
"relocation overflow");
assert(static_cast<int64_t>(RE.Addend) >= INT32_MIN &&
"relocation underflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_BRANCH20T"
- << " Value: " << static_cast<int32_t>(Value) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_BRANCH20T"
+ << " Value: " << static_cast<int32_t>(Value) << '\n');
static_cast<void>(Value);
llvm_unreachable("unimplemented relocation");
break;
@@ -277,9 +280,9 @@ public:
"relocation overflow");
assert(static_cast<int64_t>(RE.Addend) >= INT32_MIN &&
"relocation underflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_BRANCH24T"
- << " Value: " << static_cast<int32_t>(Value) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_BRANCH24T"
+ << " Value: " << static_cast<int32_t>(Value) << '\n');
static_cast<void>(Value);
llvm_unreachable("unimplemented relocation");
break;
@@ -292,9 +295,9 @@ public:
"relocation overflow");
assert(static_cast<int64_t>(RE.Addend) >= INT32_MIN &&
"relocation underflow");
- DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
- << " RelType: IMAGE_REL_ARM_BLX23T"
- << " Value: " << static_cast<int32_t>(Value) << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_BLX23T"
+ << " Value: " << static_cast<int32_t>(Value) << '\n');
static_cast<void>(Value);
llvm_unreachable("unimplemented relocation");
break;
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 7cbb43854151..2d6e5c4aea67 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -30,15 +30,33 @@ private:
// unregisteredEH frame sections with the memory manager.
SmallVector<SID, 2> UnregisteredEHFrameSections;
SmallVector<SID, 2> RegisteredEHFrameSections;
+ uint64_t ImageBase;
+
+ // Fake an __ImageBase pointer by returning the section with the lowest address
+ uint64_t getImageBase() {
+ if (!ImageBase) {
+ ImageBase = std::numeric_limits<uint64_t>::max();
+ for (const SectionEntry &Section : Sections)
+ ImageBase = std::min(ImageBase, Section.getLoadAddress());
+ }
+ return ImageBase;
+ }
+
+ void write32BitOffset(uint8_t *Target, int64_t Addend, uint64_t Delta) {
+ uint64_t Result = Addend + Delta;
+ assert(Result <= UINT32_MAX && "Relocation overflow");
+ writeBytesUnaligned(Result, Target, 4);
+ }
public:
RuntimeDyldCOFFX86_64(RuntimeDyld::MemoryManager &MM,
JITSymbolResolver &Resolver)
- : RuntimeDyldCOFF(MM, Resolver) {}
+ : RuntimeDyldCOFF(MM, Resolver), ImageBase(0) {}
- unsigned getMaxStubSize() override {
- return 6; // 2-byte jmp instruction + 32-bit relative address
- }
+ unsigned getStubAlignment() override { return 1; }
+
+ // 2-byte jmp instruction + 32-bit relative address + 64-bit absolute jump
+ unsigned getMaxStubSize() override { return 14; }
// The target location for the relocation is described by RE.SectionID and
// RE.Offset. RE.SectionID can be used to find the SectionEntry. Each
@@ -85,13 +103,17 @@ public:
}
case COFF::IMAGE_REL_AMD64_ADDR32NB: {
- // Note ADDR32NB requires a well-established notion of
- // image base. This address must be less than or equal
- // to every section's load address, and all sections must be
- // within a 32 bit offset from the base.
- //
- // For now we just set these to zero.
- writeBytesUnaligned(0, Target, 4);
+ // ADDR32NB requires an offset less than 2GB from 'ImageBase'.
+ // The MemoryManager can make sure this is always true by forcing the
+ // memory layout to be: CodeSection < ReadOnlySection < ReadWriteSection.
+ const uint64_t ImageBase = getImageBase();
+ if (Value < ImageBase || ((Value - ImageBase) > UINT32_MAX)) {
+ llvm::errs() << "IMAGE_REL_AMD64_ADDR32NB relocation requires an "
+ << "ordered section layout.\n";
+ write32BitOffset(Target, 0, 0);
+ } else {
+ write32BitOffset(Target, RE.Addend, Value - ImageBase);
+ }
break;
}
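The IMAGE_REL_AMD64_ADDR32NB case now derives a pseudo __ImageBase from the lowest section load address and writes a 32-bit image-relative offset via write32BitOffset(). A worked example with made-up addresses:

// Illustrative values only.
uint64_t ImageBase = 0x140000000ULL;   // lowest section load address
uint64_t Value     = 0x140002010ULL;   // resolved target address
int64_t  Addend    = 4;
// The target is within 32-bit range of ImageBase, so the stored field is:
uint64_t Stored = (Value - ImageBase) + Addend;   // 0x2014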
@@ -106,6 +128,52 @@ public:
}
}
+ std::tuple<uint64_t, uint64_t, uint64_t>
+ generateRelocationStub(unsigned SectionID, StringRef TargetName,
+ uint64_t Offset, uint64_t RelType, uint64_t Addend,
+ StubMap &Stubs) {
+ uintptr_t StubOffset;
+ SectionEntry &Section = Sections[SectionID];
+
+ RelocationValueRef OriginalRelValueRef;
+ OriginalRelValueRef.SectionID = SectionID;
+ OriginalRelValueRef.Offset = Offset;
+ OriginalRelValueRef.Addend = Addend;
+ OriginalRelValueRef.SymbolName = TargetName.data();
+
+ auto Stub = Stubs.find(OriginalRelValueRef);
+ if (Stub == Stubs.end()) {
+ LLVM_DEBUG(dbgs() << " Create a new stub function for "
+ << TargetName.data() << "\n");
+
+ StubOffset = Section.getStubOffset();
+ Stubs[OriginalRelValueRef] = StubOffset;
+ createStubFunction(Section.getAddressWithOffset(StubOffset));
+ Section.advanceStubOffset(getMaxStubSize());
+ } else {
+ LLVM_DEBUG(dbgs() << " Stub function found for " << TargetName.data()
+ << "\n");
+ StubOffset = Stub->second;
+ }
+
+ // FIXME: If RelType == COFF::IMAGE_REL_AMD64_ADDR32NB we should be able
+ // to ignore the __ImageBase requirement and just forward to the stub
+ // directly as an offset of this section:
+ // write32BitOffset(Section.getAddressWithOffset(Offset), 0, StubOffset);
+ // .xdata exception handlers don't accept this, though.
+
+ // Resolve original relocation to stub function.
+ const RelocationEntry RE(SectionID, Offset, RelType, Addend);
+ resolveRelocation(RE, Section.getLoadAddressWithOffset(StubOffset));
+
+ // adjust relocation info so resolution writes to the stub function
+ Addend = 0;
+ Offset = StubOffset + 6;
+ RelType = COFF::IMAGE_REL_AMD64_ADDR64;
+
+ return std::make_tuple(Offset, RelType, Addend);
+ }
+
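A sketch of how the helper above is used further down in this file for external ADDR32NB targets: the original fixup is resolved against a 14-byte stub, and the caller gets back a rewritten (Offset, RelType, Addend) that plants the real 64-bit target just past the 6-byte jmp at the start of the stub.

// Mirrors the IMAGE_REL_AMD64_ADDR32NB path in processRelocationRef below.
std::tie(Offset, RelType, Addend) = generateRelocationStub(
    SectionID, TargetName, Offset, RelType, Addend, Stubs);
// RelType is now COFF::IMAGE_REL_AMD64_ADDR64 and Offset is StubOffset + 6,
// so the usual external-symbol handling writes the absolute address into
// the stub body rather than into the original instruction.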
Expected<relocation_iterator>
processRelocationRef(unsigned SectionID,
relocation_iterator RelI,
@@ -131,6 +199,11 @@ public:
SectionEntry &Section = Sections[SectionID];
uintptr_t ObjTarget = Section.getObjAddress() + Offset;
+ Expected<StringRef> TargetNameOrErr = Symbol->getName();
+ if (!TargetNameOrErr)
+ return TargetNameOrErr.takeError();
+ StringRef TargetName = *TargetNameOrErr;
+
switch (RelType) {
case COFF::IMAGE_REL_AMD64_REL32:
@@ -142,6 +215,11 @@ public:
case COFF::IMAGE_REL_AMD64_ADDR32NB: {
uint8_t *Displacement = (uint8_t *)ObjTarget;
Addend = readBytesUnaligned(Displacement, 4);
+
+ if (IsExtern)
+ std::tie(Offset, RelType, Addend) = generateRelocationStub(
+ SectionID, TargetName, Offset, RelType, Addend, Stubs);
+
break;
}
@@ -155,14 +233,9 @@ public:
break;
}
- Expected<StringRef> TargetNameOrErr = Symbol->getName();
- if (!TargetNameOrErr)
- return TargetNameOrErr.takeError();
- StringRef TargetName = *TargetNameOrErr;
-
- DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
- << " RelType: " << RelType << " TargetName: " << TargetName
- << " Addend " << Addend << "\n");
+ LLVM_DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
+ << " RelType: " << RelType << " TargetName: "
+ << TargetName << " Addend " << Addend << "\n");
if (IsExtern) {
RelocationEntry RE(SectionID, Offset, RelType, Addend);
@@ -183,7 +256,6 @@ public:
return ++RelI;
}
- unsigned getStubAlignment() override { return 1; }
void registerEHFrames() override {
for (auto const &EHFrameSID : UnregisteredEHFrameSections) {
uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress();
@@ -194,6 +266,7 @@ public:
}
UnregisteredEHFrameSections.clear();
}
+
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override {
// Look for and record the EH frame section IDs.
@@ -202,11 +275,12 @@ public:
StringRef Name;
if (auto EC = Section.getName(Name))
return errorCodeToError(EC);
- // Note unwind info is split across .pdata and .xdata, so this
- // may not be sufficiently general for all users.
- if (Name == ".xdata") {
+
+ // Note unwind info is stored in .pdata but often points to .xdata
+ // with an IMAGE_REL_AMD64_ADDR32NB relocation. Using a memory manager
+ // that keeps sections ordered in relation to __ImageBase is necessary.
+ if (Name == ".pdata")
UnregisteredEHFrameSections.push_back(SectionPair.second);
- }
}
return Error::success();
}
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
index fe0f48e66a81..3a166b40af2d 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
@@ -55,12 +55,12 @@ RuntimeDyldELFMips::evaluateMIPS32Relocation(const SectionEntry &Section,
uint64_t Offset, uint64_t Value,
uint32_t Type) {
- DEBUG(dbgs() << "evaluateMIPS32Relocation, LocalAddress: 0x"
- << format("%llx", Section.getAddressWithOffset(Offset))
- << " FinalAddress: 0x"
- << format("%llx", Section.getLoadAddressWithOffset(Offset))
- << " Value: 0x" << format("%llx", Value) << " Type: 0x"
- << format("%x", Type) << "\n");
+ LLVM_DEBUG(dbgs() << "evaluateMIPS32Relocation, LocalAddress: 0x"
+ << format("%llx", Section.getAddressWithOffset(Offset))
+ << " FinalAddress: 0x"
+ << format("%llx", Section.getLoadAddressWithOffset(Offset))
+ << " Value: 0x" << format("%llx", Value) << " Type: 0x"
+ << format("%x", Type) << "\n");
switch (Type) {
default:
@@ -110,15 +110,16 @@ int64_t RuntimeDyldELFMips::evaluateMIPS64Relocation(
const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type,
int64_t Addend, uint64_t SymOffset, SID SectionID) {
- DEBUG(dbgs() << "evaluateMIPS64Relocation, LocalAddress: 0x"
- << format("%llx", Section.getAddressWithOffset(Offset))
- << " FinalAddress: 0x"
- << format("%llx", Section.getLoadAddressWithOffset(Offset))
- << " Value: 0x" << format("%llx", Value) << " Type: 0x"
- << format("%x", Type) << " Addend: 0x" << format("%llx", Addend)
- << " Offset: " << format("%llx" PRIx64, Offset)
- << " SID: " << format("%d", SectionID)
- << " SymOffset: " << format("%x", SymOffset) << "\n");
+ LLVM_DEBUG(dbgs() << "evaluateMIPS64Relocation, LocalAddress: 0x"
+ << format("%llx", Section.getAddressWithOffset(Offset))
+ << " FinalAddress: 0x"
+ << format("%llx", Section.getLoadAddressWithOffset(Offset))
+ << " Value: 0x" << format("%llx", Value) << " Type: 0x"
+ << format("%x", Type) << " Addend: 0x"
+ << format("%llx", Addend)
+ << " Offset: " << format("%llx" PRIx64, Offset)
+ << " SID: " << format("%d", SectionID)
+ << " SymOffset: " << format("%x", SymOffset) << "\n");
switch (Type) {
default:
@@ -307,13 +308,12 @@ void RuntimeDyldELFMips::resolveMIPSO32Relocation(const SectionEntry &Section,
uint8_t *TargetPtr = Section.getAddressWithOffset(Offset);
Value += Addend;
- DEBUG(dbgs() << "resolveMIPSO32Relocation, LocalAddress: "
- << Section.getAddressWithOffset(Offset) << " FinalAddress: "
- << format("%p", Section.getLoadAddressWithOffset(Offset))
- << " Value: " << format("%x", Value)
- << " Type: " << format("%x", Type)
- << " Addend: " << format("%x", Addend)
- << " SymOffset: " << format("%x", Offset) << "\n");
+ LLVM_DEBUG(dbgs() << "resolveMIPSO32Relocation, LocalAddress: "
+ << Section.getAddressWithOffset(Offset) << " FinalAddress: "
+ << format("%p", Section.getLoadAddressWithOffset(Offset))
+ << " Value: " << format("%x", Value) << " Type: "
+ << format("%x", Type) << " Addend: " << format("%x", Addend)
+ << " SymOffset: " << format("%x", Offset) << "\n");
Value = evaluateMIPS32Relocation(Section, Offset, Value, Type);
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h
index ce54a2717673..f53b9e6bd75a 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h
@@ -39,13 +39,13 @@ protected:
uint64_t SymOffset, SID SectionID);
private:
- /// \brief A object file specific relocation resolver
+ /// An object file specific relocation resolver
/// \param RE The relocation to be resolved
/// \param Value Target symbol address to apply the relocation action
uint64_t evaluateRelocation(const RelocationEntry &RE, uint64_t Value,
uint64_t Addend);
- /// \brief A object file specific relocation resolver
+ /// An object file specific relocation resolver
/// \param RE The relocation to be resolved
/// \param Value Target symbol address to apply the relocation action
void applyRelocation(const RelocationEntry &RE, uint64_t Value);
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
index 97cbc153b227..2a619c549cfa 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
@@ -32,18 +32,37 @@ public:
unsigned getStubAlignment() override { return 8; }
/// Extract the addend encoded in the instruction / memory location.
- int64_t decodeAddend(const RelocationEntry &RE) const {
+ Expected<int64_t> decodeAddend(const RelocationEntry &RE) const {
const SectionEntry &Section = Sections[RE.SectionID];
uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset);
unsigned NumBytes = 1 << RE.Size;
int64_t Addend = 0;
// Verify that the relocation has the correct size and alignment.
switch (RE.RelType) {
- default:
- llvm_unreachable("Unsupported relocation type!");
- case MachO::ARM64_RELOC_UNSIGNED:
- assert((NumBytes == 4 || NumBytes == 8) && "Invalid relocation size.");
+ default: {
+ std::string ErrMsg;
+ {
+ raw_string_ostream ErrStream(ErrMsg);
+ ErrStream << "Unsupported relocation type: "
+ << getRelocName(RE.RelType);
+ }
+ return make_error<StringError>(std::move(ErrMsg),
+ inconvertibleErrorCode());
+ }
+ case MachO::ARM64_RELOC_POINTER_TO_GOT:
+ case MachO::ARM64_RELOC_UNSIGNED: {
+ if (NumBytes != 4 && NumBytes != 8) {
+ std::string ErrMsg;
+ {
+ raw_string_ostream ErrStream(ErrMsg);
+ ErrStream << "Invalid relocation size for relocation "
+ << getRelocName(RE.RelType);
+ }
+ return make_error<StringError>(std::move(ErrMsg),
+ inconvertibleErrorCode());
+ }
break;
+ }
case MachO::ARM64_RELOC_BRANCH26:
case MachO::ARM64_RELOC_PAGE21:
case MachO::ARM64_RELOC_PAGEOFF12:
@@ -58,6 +77,7 @@ public:
switch (RE.RelType) {
default:
llvm_unreachable("Unsupported relocation type!");
+ case MachO::ARM64_RELOC_POINTER_TO_GOT:
case MachO::ARM64_RELOC_UNSIGNED:
// This could be an unaligned memory location.
if (NumBytes == 4)
@@ -66,9 +86,11 @@ public:
Addend = *reinterpret_cast<support::ulittle64_t *>(LocalAddress);
break;
case MachO::ARM64_RELOC_BRANCH26: {
- // Verify that the relocation points to the expected branch instruction.
+ // Verify that the relocation points to a B/BL instruction.
auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
- assert((*p & 0xFC000000) == 0x14000000 && "Expected branch instruction.");
+ assert(((*p & 0xFC000000) == 0x14000000 ||
+ (*p & 0xFC000000) == 0x94000000) &&
+ "Expected branch instruction.");
// Get the 26 bit addend encoded in the branch instruction and sign-extend
// to 64 bit. The lower 2 bits are always zeros and are therefore implicit
@@ -137,6 +159,7 @@ public:
switch (RelType) {
default:
llvm_unreachable("Unsupported relocation type!");
+ case MachO::ARM64_RELOC_POINTER_TO_GOT:
case MachO::ARM64_RELOC_UNSIGNED:
assert((NumBytes == 4 || NumBytes == 8) && "Invalid relocation size.");
break;
@@ -154,6 +177,7 @@ public:
switch (RelType) {
default:
llvm_unreachable("Unsupported relocation type!");
+ case MachO::ARM64_RELOC_POINTER_TO_GOT:
case MachO::ARM64_RELOC_UNSIGNED:
// This could be an unaligned memory location.
if (NumBytes == 4)
@@ -164,7 +188,9 @@ public:
case MachO::ARM64_RELOC_BRANCH26: {
auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
// Verify that the relocation points to the expected branch instruction.
- assert((*p & 0xFC000000) == 0x14000000 && "Expected branch instruction.");
+ assert(((*p & 0xFC000000) == 0x14000000 ||
+ (*p & 0xFC000000) == 0x94000000) &&
+ "Expected branch instruction.");
// Verify addend value.
assert((Addend & 0x3) == 0 && "Branch target is not aligned");
@@ -278,7 +304,20 @@ public:
return processSubtractRelocation(SectionID, RelI, Obj, ObjSectionToID);
RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
- RE.Addend = decodeAddend(RE);
+
+ if (RE.RelType == MachO::ARM64_RELOC_POINTER_TO_GOT) {
+ bool Valid =
+ (RE.Size == 2 && RE.IsPCRel) || (RE.Size == 3 && !RE.IsPCRel);
+ if (!Valid)
+ return make_error<StringError>("ARM64_RELOC_POINTER_TO_GOT supports "
+ "32-bit pc-rel or 64-bit absolute only",
+ inconvertibleErrorCode());
+ }
+
+ if (auto Addend = decodeAddend(RE))
+ RE.Addend = *Addend;
+ else
+ return Addend.takeError();
assert((ExplicitAddend == 0 || RE.Addend == 0) && "Relocation has "\
"ARM64_RELOC_ADDEND and embedded addend in the instruction.");
@@ -292,13 +331,17 @@ public:
return ValueOrErr.takeError();
bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
- if (!IsExtern && RE.IsPCRel)
+ if (RE.RelType == MachO::ARM64_RELOC_POINTER_TO_GOT) {
+ // We'll take care of the offset in processGOTRelocation.
+ Value.Offset = 0;
+ } else if (!IsExtern && RE.IsPCRel)
makeValueAddendPCRel(Value, RelI, 1 << RE.Size);
RE.Addend = Value.Offset;
if (RE.RelType == MachO::ARM64_RELOC_GOT_LOAD_PAGE21 ||
- RE.RelType == MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12)
+ RE.RelType == MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
+ RE.RelType == MachO::ARM64_RELOC_POINTER_TO_GOT)
processGOTRelocation(RE, Value, Stubs);
else {
if (Value.SymbolName)
@@ -311,7 +354,7 @@ public:
}
void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
- DEBUG(dumpRelocationToResolve(RE, Value));
+ LLVM_DEBUG(dumpRelocationToResolve(RE, Value));
const SectionEntry &Section = Sections[RE.SectionID];
uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset);
@@ -331,6 +374,19 @@ public:
encodeAddend(LocalAddress, 1 << RE.Size, RelType, Value + RE.Addend);
break;
}
+
+ case MachO::ARM64_RELOC_POINTER_TO_GOT: {
+ assert(((RE.Size == 2 && RE.IsPCRel) || (RE.Size == 3 && !RE.IsPCRel)) &&
+ "ARM64_RELOC_POINTER_TO_GOT only supports 32-bit pc-rel or 64-bit "
+ "absolute");
+ // Addend is the GOT entry address and RE.Offset the target of the
+ // relocation.
+ uint64_t Result =
+ RE.IsPCRel ? (RE.Addend - RE.Offset) : (Value + RE.Addend);
+ encodeAddend(LocalAddress, 1 << RE.Size, RelType, Result);
+ break;
+ }
+
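For the new ARM64_RELOC_POINTER_TO_GOT case, the pc-relative (32-bit) form stores RE.Addend - RE.Offset while the absolute (64-bit) form stores Value + RE.Addend. A worked example of the pc-relative form with made-up values:

// Illustrative values only; per the comment above, RE.Addend carries the
// GOT entry address and RE.Offset the location being fixed up.
uint64_t GOTEntryAddr = 0x2040;   // RE.Addend
uint64_t FixupOffset  = 0x1000;   // RE.Offset
int64_t  Stored = int64_t(GOTEntryAddr - FixupOffset);   // 0x1040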
case MachO::ARM64_RELOC_BRANCH26: {
assert(RE.IsPCRel && "not PCRel and ARM64_RELOC_BRANCH26 not supported");
// Check if branch is in range.
@@ -368,7 +424,7 @@ public:
writeBytesUnaligned(Value, LocalAddress, 1 << RE.Size);
break;
}
- case MachO::ARM64_RELOC_POINTER_TO_GOT:
+
case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21:
case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
llvm_unreachable("Relocation type not yet implemented!");
@@ -386,7 +442,9 @@ public:
private:
void processGOTRelocation(const RelocationEntry &RE,
RelocationValueRef &Value, StubMap &Stubs) {
- assert(RE.Size == 2);
+ assert((RE.RelType == MachO::ARM64_RELOC_POINTER_TO_GOT &&
+ (RE.Size == 2 || RE.Size == 3)) ||
+ RE.Size == 2);
SectionEntry &Section = Sections[RE.SectionID];
StubMap::const_iterator i = Stubs.find(Value);
int64_t Offset;
@@ -459,6 +517,23 @@ private:
return ++RelI;
}
+ static const char *getRelocName(uint32_t RelocType) {
+ switch (RelocType) {
+ case MachO::ARM64_RELOC_UNSIGNED: return "ARM64_RELOC_UNSIGNED";
+ case MachO::ARM64_RELOC_SUBTRACTOR: return "ARM64_RELOC_SUBTRACTOR";
+ case MachO::ARM64_RELOC_BRANCH26: return "ARM64_RELOC_BRANCH26";
+ case MachO::ARM64_RELOC_PAGE21: return "ARM64_RELOC_PAGE21";
+ case MachO::ARM64_RELOC_PAGEOFF12: return "ARM64_RELOC_PAGEOFF12";
+ case MachO::ARM64_RELOC_GOT_LOAD_PAGE21: return "ARM64_RELOC_GOT_LOAD_PAGE21";
+ case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12: return "ARM64_RELOC_GOT_LOAD_PAGEOFF12";
+ case MachO::ARM64_RELOC_POINTER_TO_GOT: return "ARM64_RELOC_POINTER_TO_GOT";
+ case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21: return "ARM64_RELOC_TLVP_LOAD_PAGE21";
+ case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12: return "ARM64_RELOC_TLVP_LOAD_PAGEOFF12";
+ case MachO::ARM64_RELOC_ADDEND: return "ARM64_RELOC_ADDEND";
+ }
+ return "Unrecognized arm64 addend";
+ }
+
};
}
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
index 990629de2f1d..64a6b2901819 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
@@ -47,6 +47,18 @@ public:
return Addr;
}
+ bool isAddrTargetThumb(unsigned SectionID, uint64_t Offset) {
+ auto TargetObjAddr = Sections[SectionID].getObjAddress() + Offset;
+ for (auto &KV : GlobalSymbolTable) {
+ auto &Entry = KV.second;
+ auto SymbolObjAddr =
+ Sections[Entry.getSectionID()].getObjAddress() + Entry.getOffset();
+ if (TargetObjAddr == SymbolObjAddr)
+ return (Entry.getFlags().getTargetFlags() & ARMJITSymbolFlags::Thumb);
+ }
+ return false;
+ }
+
Expected<int64_t> decodeAddend(const RelocationEntry &RE) const {
const SectionEntry &Section = Sections[RE.SectionID];
uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset);
@@ -161,12 +173,18 @@ public:
// the value as being a thumb stub: we don't want to mix it up with an ARM
// stub targeting the same function.
if (RE.RelType == MachO::ARM_THUMB_RELOC_BR22)
- Value.IsStubThumb = TargetIsLocalThumbFunc;
+ Value.IsStubThumb = true;
if (RE.IsPCRel)
makeValueAddendPCRel(Value, RelI,
(RE.RelType == MachO::ARM_THUMB_RELOC_BR22) ? 4 : 8);
+ // If this is a non-external branch target check whether Value points to a
+ // thumb func.
+ if (!Value.SymbolName && (RelType == MachO::ARM_RELOC_BR24 ||
+ RelType == MachO::ARM_THUMB_RELOC_BR22))
+ RE.IsTargetThumbFunc = isAddrTargetThumb(Value.SectionID, Value.Offset);
+
if (RE.RelType == MachO::ARM_RELOC_BR24 ||
RE.RelType == MachO::ARM_THUMB_RELOC_BR22)
processBranchRelocation(RE, Value, Stubs);
@@ -182,7 +200,7 @@ public:
}
void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
- DEBUG(dumpRelocationToResolve(RE, Value));
+ LLVM_DEBUG(dumpRelocationToResolve(RE, Value));
const SectionEntry &Section = Sections[RE.SectionID];
uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset);
@@ -388,11 +406,11 @@ private:
// addend = Encoded - Expected
// = Encoded - (AddrA - AddrB)
- DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB
- << ", Addend: " << Addend << ", SectionA ID: " << SectionAID
- << ", SectionAOffset: " << SectionAOffset
- << ", SectionB ID: " << SectionBID
- << ", SectionBOffset: " << SectionBOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA
+ << ", AddrB: " << AddrB << ", Addend: " << Addend
+ << ", SectionA ID: " << SectionAID << ", SectionAOffset: "
+ << SectionAOffset << ", SectionB ID: " << SectionBID
+ << ", SectionBOffset: " << SectionBOffset << "\n");
RelocationEntry R(SectionID, Offset, RelocType, Addend, SectionAID,
SectionAOffset, SectionBID, SectionBOffset, IsPCRel,
HalfDiffKindBits);
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
index c42f1751a181..d384d70b8b0f 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
@@ -97,7 +97,7 @@ public:
}
void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
- DEBUG(dumpRelocationToResolve(RE, Value));
+ LLVM_DEBUG(dumpRelocationToResolve(RE, Value));
const SectionEntry &Section = Sections[RE.SectionID];
uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset);
@@ -192,11 +192,11 @@ private:
// Compute the addend 'C' from the original expression 'A - B + C'.
Addend -= AddrA - AddrB;
- DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB
- << ", Addend: " << Addend << ", SectionA ID: " << SectionAID
- << ", SectionAOffset: " << SectionAOffset
- << ", SectionB ID: " << SectionBID
- << ", SectionBOffset: " << SectionBOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA
+ << ", AddrB: " << AddrB << ", Addend: " << Addend
+ << ", SectionA ID: " << SectionAID << ", SectionAOffset: "
+ << SectionAOffset << ", SectionB ID: " << SectionBID
+ << ", SectionBOffset: " << SectionBOffset << "\n");
RelocationEntry R(SectionID, Offset, RelocType, Addend, SectionAID,
SectionAOffset, SectionBID, SectionBOffset,
IsPCRel, Size);
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index 32fd3efddd0d..9732ea6a0cd2 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -85,7 +85,7 @@ public:
}
void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
- DEBUG(dumpRelocationToResolve(RE, Value));
+ LLVM_DEBUG(dumpRelocationToResolve(RE, Value));
const SectionEntry &Section = Sections[RE.SectionID];
uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset);
diff --git a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
index 2dc66a1502f8..05ab4a074e37 100644
--- a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
@@ -232,6 +232,8 @@ SectionMemoryManager::~SectionMemoryManager() {
SectionMemoryManager::MemoryMapper::~MemoryMapper() {}
+void SectionMemoryManager::anchor() {}
+
namespace {
// Trivial implementation of SectionMemoryManager::MemoryMapper that just calls
// into sys::Memory.
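
The newly added SectionMemoryManager::anchor() follows the common LLVM idiom of giving a class with virtual methods one deliberately out-of-line virtual function, so its vtable is emitted in a single object file rather than in every translation unit that uses the class. A sketch of the pattern with made-up names:

// Header: declare one out-of-line virtual method as the vtable's "home".
class WidgetManager {             // hypothetical class, not part of LLVM
public:
  virtual ~WidgetManager() = default;
  virtual void reserve(unsigned Bytes) { /* default no-op */ }

private:
  virtual void anchor();          // defined out of line, on purpose
};

// Exactly one .cpp file provides the definition, anchoring the vtable there.
void WidgetManager::anchor() {}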
diff --git a/contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp b/contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp
index 18dfa4e3c319..9626b8d3ffa3 100644
--- a/contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp
@@ -97,6 +97,8 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple,
Options, RelocModel, CMModel, OptLevel,
/*JIT*/ true);
Target->Options.EmulatedTLS = EmulatedTLS;
+ Target->Options.ExplicitEmulatedTLS = true;
+
assert(Target && "Could not allocate target machine!");
return Target;
}
diff --git a/contrib/llvm/lib/FuzzMutate/FuzzerCLI.cpp b/contrib/llvm/lib/FuzzMutate/FuzzerCLI.cpp
index 158edf203895..6f5a5c067a97 100644
--- a/contrib/llvm/lib/FuzzMutate/FuzzerCLI.cpp
+++ b/contrib/llvm/lib/FuzzMutate/FuzzerCLI.cpp
@@ -18,6 +18,7 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Verifier.h"
using namespace llvm;
@@ -82,8 +83,38 @@ void llvm::handleExecNameEncodedOptimizerOpts(StringRef ExecName) {
SmallVector<StringRef, 4> Opts;
NameAndArgs.second.split(Opts, '-');
for (StringRef Opt : Opts) {
- if (Opt.startswith("instcombine")) {
+ if (Opt == "instcombine") {
Args.push_back("-passes=instcombine");
+ } else if (Opt == "earlycse") {
+ Args.push_back("-passes=early-cse");
+ } else if (Opt == "simplifycfg") {
+ Args.push_back("-passes=simplify-cfg");
+ } else if (Opt == "gvn") {
+ Args.push_back("-passes=gvn");
+ } else if (Opt == "sccp") {
+ Args.push_back("-passes=sccp");
+
+ } else if (Opt == "loop_predication") {
+ Args.push_back("-passes=loop-predication");
+ } else if (Opt == "guard_widening") {
+ Args.push_back("-passes=guard-widening");
+ } else if (Opt == "loop_rotate") {
+ Args.push_back("-passes=loop(rotate)");
+ } else if (Opt == "loop_unswitch") {
+ Args.push_back("-passes=loop(unswitch)");
+ } else if (Opt == "loop_unroll") {
+ Args.push_back("-passes=unroll");
+ } else if (Opt == "loop_vectorize") {
+ Args.push_back("-passes=loop-vectorize");
+ } else if (Opt == "licm") {
+ Args.push_back("-passes=licm");
+ } else if (Opt == "indvars") {
+ Args.push_back("-passes=indvars");
+ } else if (Opt == "strength_reduce") {
+ Args.push_back("-passes=strength-reduce");
+ } else if (Opt == "irce") {
+ Args.push_back("-passes=irce");
+
} else if (Triple(Opt).getArch()) {
Args.push_back("-mtriple=" + Opt.str());
} else {
@@ -160,10 +191,19 @@ size_t llvm::writeModule(const Module &M, uint8_t *Dest, size_t MaxSize) {
std::string Buf;
{
raw_string_ostream OS(Buf);
- WriteBitcodeToFile(&M, OS);
+ WriteBitcodeToFile(M, OS);
}
if (Buf.size() > MaxSize)
return 0;
memcpy(Dest, Buf.data(), Buf.size());
return Buf.size();
}
+
+std::unique_ptr<Module> llvm::parseAndVerify(const uint8_t *Data, size_t Size,
+ LLVMContext &Context) {
+ auto M = parseModule(Data, Size, Context);
+ if (!M || verifyModule(*M, &errs()))
+ return nullptr;
+
+ return M;
+}
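
Taken together, the FuzzerCLI changes tighten the executable-name option matching from prefix to exact pass names, add a batch of loop and scalar passes, and introduce parseAndVerify() so fuzz targets can reject malformed bitcode up front. A rough usage sketch, assuming the declaration lives in FuzzMutate/FuzzerCLI.h next to the other helpers:

#include "llvm/FuzzMutate/FuzzerCLI.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

// Hypothetical libFuzzer entry point built on the new helper.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
  llvm::LLVMContext Context;
  std::unique_ptr<llvm::Module> M = llvm::parseAndVerify(Data, Size, Context);
  if (!M)
    return 0; // unparsable or unverifiable input: skip it
  // ... run the configured passes over *M ...
  return 0;
}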
diff --git a/contrib/llvm/lib/FuzzMutate/IRMutator.cpp b/contrib/llvm/lib/FuzzMutate/IRMutator.cpp
index 00b558ac4dcb..2dc7dfb880a2 100644
--- a/contrib/llvm/lib/FuzzMutate/IRMutator.cpp
+++ b/contrib/llvm/lib/FuzzMutate/IRMutator.cpp
@@ -152,10 +152,14 @@ uint64_t InstDeleterIRStrategy::getWeight(size_t CurrentSize, size_t MaxSize,
void InstDeleterIRStrategy::mutate(Function &F, RandomIRBuilder &IB) {
auto RS = makeSampler<Instruction *>(IB.Rand);
- // Avoid terminators so we don't have to worry about keeping the CFG coherent.
- for (Instruction &Inst : instructions(F))
- if (!Inst.isTerminator())
- RS.sample(&Inst, /*Weight=*/1);
+ for (Instruction &Inst : instructions(F)) {
+ // TODO: We can't handle these instructions.
+ if (Inst.isTerminator() || Inst.isEHPad() ||
+ Inst.isSwiftError() || isa<PHINode>(Inst))
+ continue;
+
+ RS.sample(&Inst, /*Weight=*/1);
+ }
if (RS.isEmpty())
return;
@@ -191,4 +195,5 @@ void InstDeleterIRStrategy::mutate(Instruction &Inst, RandomIRBuilder &IB) {
RS.sample(IB.newSource(*BB, InstsBefore, {}, Pred), /*Weight=*/1);
Inst.replaceAllUsesWith(RS.getSelection());
+ Inst.eraseFromParent();
}
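
The InstDeleter change above still funnels candidates through LLVM's makeSampler, which picks one deletable instruction uniformly from a single pass over the function. A self-contained sketch of that reservoir-style selection, independent of the LLVM helpers:

#include <cstdint>
#include <random>
#include <vector>

// Pick one element uniformly from a single pass over Items, mirroring the
// weight-1 sampling the mutator performs over deletable instructions.
template <typename T>
const T *sampleOne(const std::vector<T> &Items, std::mt19937 &Rand) {
  const T *Choice = nullptr;
  uint64_t Seen = 0;
  for (const T &Item : Items) {
    ++Seen;
    // Replace the current choice with probability 1/Seen.
    if (std::uniform_int_distribution<uint64_t>(1, Seen)(Rand) == 1)
      Choice = &Item;
  }
  return Choice; // nullptr if Items is empty
}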
diff --git a/contrib/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm/lib/IR/AsmWriter.cpp
index 0fafe82404e4..99a25a723b4a 100644
--- a/contrib/llvm/lib/IR/AsmWriter.cpp
+++ b/contrib/llvm/lib/IR/AsmWriter.cpp
@@ -7,7 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
-// This library implements the functionality defined in llvm/IR/Writer.h
+// This library implements the `print` family of functions in classes like
+// Module, Function, Value, etc. The in-memory representation of those classes
+// is converted to IR strings.
//
// Note that these routines must be extremely tolerant of various errors in the
// LLVM code, because it can be used for debugging transformations.
@@ -28,6 +30,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/Attributes.h"
@@ -56,6 +59,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
@@ -195,7 +199,7 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
!isa<GlobalVariable>(V) && !isa<Function>(V) && !isa<BasicBlock>(V);
if (auto *BA = dyn_cast<BlockAddress>(V))
ID = OM.lookup(BA->getBasicBlock()).first;
- std::sort(List.begin(), List.end(), [&](const Entry &L, const Entry &R) {
+ llvm::sort(List.begin(), List.end(), [&](const Entry &L, const Entry &R) {
const Use *LU = L.first;
const Use *RU = R.first;
if (LU == RU)
@@ -383,16 +387,6 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
}
}
-void llvm::PrintEscapedString(StringRef Name, raw_ostream &Out) {
- for (unsigned i = 0, e = Name.size(); i != e; ++i) {
- unsigned char C = Name[i];
- if (isprint(C) && C != '\\' && C != '"')
- Out << C;
- else
- Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
- }
-}
-
enum PrefixType {
GlobalPrefix,
ComdatPrefix,
@@ -430,7 +424,7 @@ void llvm::printLLVMNameWithoutPrefix(raw_ostream &OS, StringRef Name) {
// Okay, we need quotes. Output the quotes and escape any scary characters as
// needed.
OS << '"';
- PrintEscapedString(Name, OS);
+ printEscapedString(Name, OS);
OS << '"';
}
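
With the local PrintEscapedString helper removed here, quoting goes through the shared llvm::printEscapedString, which keeps printable characters other than backslash and double quote and emits everything else as a two-digit hex escape. A small sketch of the behavior, assuming the helper is exported from ADT/StringExtras.h in this LLVM version:

#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/raw_ostream.h"

void demoEscaping() {
  // Prints: my \22file\22.ll  (the embedded quotes become \22 hex escapes)
  llvm::printEscapedString("my \"file\".ll", llvm::outs());
  llvm::outs() << "\n";
}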
@@ -468,27 +462,73 @@ namespace {
class TypePrinting {
public:
- /// NamedTypes - The named types that are used by the current module.
- TypeFinder NamedTypes;
-
- /// NumberedTypes - The numbered types, along with their value.
- DenseMap<StructType*, unsigned> NumberedTypes;
+ TypePrinting(const Module *M = nullptr) : DeferredM(M) {}
- TypePrinting() = default;
TypePrinting(const TypePrinting &) = delete;
TypePrinting &operator=(const TypePrinting &) = delete;
- void incorporateTypes(const Module &M);
+ /// The named types that are used by the current module.
+ TypeFinder &getNamedTypes();
+
+ /// The numbered types, number to type mapping.
+ std::vector<StructType *> &getNumberedTypes();
+
+ bool empty();
void print(Type *Ty, raw_ostream &OS);
void printStructBody(StructType *Ty, raw_ostream &OS);
+
+private:
+ void incorporateTypes();
+
+ /// A module to process lazily when needed. Set to nullptr as soon as used.
+ const Module *DeferredM;
+
+ TypeFinder NamedTypes;
+
+ // The numbered types, along with their value.
+ DenseMap<StructType *, unsigned> Type2Number;
+
+ std::vector<StructType *> NumberedTypes;
};
} // end anonymous namespace
-void TypePrinting::incorporateTypes(const Module &M) {
- NamedTypes.run(M, false);
+TypeFinder &TypePrinting::getNamedTypes() {
+ incorporateTypes();
+ return NamedTypes;
+}
+
+std::vector<StructType *> &TypePrinting::getNumberedTypes() {
+ incorporateTypes();
+
+  // We know the number assigned to each type and that the numbering is
+  // dense. Convert the map to an index table if that hasn't been done
+  // already (judging from the sizes):
+ if (NumberedTypes.size() == Type2Number.size())
+ return NumberedTypes;
+
+ NumberedTypes.resize(Type2Number.size());
+ for (const auto &P : Type2Number) {
+ assert(P.second < NumberedTypes.size() && "Didn't get a dense numbering?");
+ assert(!NumberedTypes[P.second] && "Didn't get a unique numbering?");
+ NumberedTypes[P.second] = P.first;
+ }
+ return NumberedTypes;
+}
+
+bool TypePrinting::empty() {
+ incorporateTypes();
+ return NamedTypes.empty() && Type2Number.empty();
+}
+
+void TypePrinting::incorporateTypes() {
+ if (!DeferredM)
+ return;
+
+ NamedTypes.run(*DeferredM, false);
+ DeferredM = nullptr;
// The list of struct types we got back includes all the struct types, split
// the unnamed ones out to a numbering and remove the anonymous structs.
@@ -503,7 +543,7 @@ void TypePrinting::incorporateTypes(const Module &M) {
continue;
if (STy->getName().empty())
- NumberedTypes[STy] = NextNumber++;
+ Type2Number[STy] = NextNumber++;
else
*NextToUse++ = STy;
}
@@ -511,9 +551,8 @@ void TypePrinting::incorporateTypes(const Module &M) {
NamedTypes.erase(NextToUse, NamedTypes.end());
}
-
-/// CalcTypeName - Write the specified type to the specified raw_ostream, making
-/// use of type names or up references to shorten the type name where possible.
+/// Write the specified type to the specified raw_ostream, making use of type
+/// names or up references to shorten the type name where possible.
void TypePrinting::print(Type *Ty, raw_ostream &OS) {
switch (Ty->getTypeID()) {
case Type::VoidTyID: OS << "void"; return;
@@ -557,8 +596,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
if (!STy->getName().empty())
return PrintLLVMName(OS, STy->getName(), LocalPrefix);
- DenseMap<StructType*, unsigned>::iterator I = NumberedTypes.find(STy);
- if (I != NumberedTypes.end())
+ incorporateTypes();
+ const auto I = Type2Number.find(STy);
+ if (I != Type2Number.end())
OS << '%' << I->second;
else // Not enumerated, print the hex address.
OS << "%\"type " << STy << '\"';
@@ -637,6 +677,9 @@ private:
bool FunctionProcessed = false;
bool ShouldInitializeAllMetadata;
+ /// The summary index for which we are holding slot numbers.
+ const ModuleSummaryIndex *TheIndex = nullptr;
+
/// mMap - The slot map for the module level data.
ValueMap mMap;
unsigned mNext = 0;
@@ -653,6 +696,14 @@ private:
DenseMap<AttributeSet, unsigned> asMap;
unsigned asNext = 0;
+ /// ModulePathMap - The slot map for Module paths used in the summary index.
+ StringMap<unsigned> ModulePathMap;
+ unsigned ModulePathNext = 0;
+
+ /// GUIDMap - The slot map for GUIDs used in the summary index.
+ DenseMap<GlobalValue::GUID, unsigned> GUIDMap;
+ unsigned GUIDNext = 0;
+
public:
/// Construct from a module.
///
@@ -670,6 +721,9 @@ public:
explicit SlotTracker(const Function *F,
bool ShouldInitializeAllMetadata = false);
+ /// Construct from a module summary index.
+ explicit SlotTracker(const ModuleSummaryIndex *Index);
+
SlotTracker(const SlotTracker &) = delete;
SlotTracker &operator=(const SlotTracker &) = delete;
@@ -679,6 +733,8 @@ public:
int getGlobalSlot(const GlobalValue *V);
int getMetadataSlot(const MDNode *N);
int getAttributeGroupSlot(AttributeSet AS);
+ int getModulePathSlot(StringRef Path);
+ int getGUIDSlot(GlobalValue::GUID GUID);
/// If you'd like to deal with a function instead of just a module, use
/// this method to get its data into the SlotTracker.
@@ -710,8 +766,12 @@ public:
unsigned as_size() const { return asMap.size(); }
bool as_empty() const { return asMap.empty(); }
- /// This function does the actual initialization.
- inline void initialize();
+ /// GUID map iterators.
+ using guid_iterator = DenseMap<GlobalValue::GUID, unsigned>::iterator;
+
+ /// These functions do the actual initialization.
+ inline void initializeIfNeeded();
+ void initializeIndexIfNeeded();
// Implementation Details
private:
@@ -724,12 +784,16 @@ private:
/// CreateFunctionSlot - Insert the specified Value* into the slot table.
void CreateFunctionSlot(const Value *V);
- /// \brief Insert the specified AttributeSet into the slot table.
+ /// Insert the specified AttributeSet into the slot table.
void CreateAttributeSetSlot(AttributeSet AS);
+ inline void CreateModulePathSlot(StringRef Path);
+ void CreateGUIDSlot(GlobalValue::GUID GUID);
+
/// Add all of the module level global variables (and their initializers)
/// and function declarations, but not the contents of those functions.
void processModule();
+ void processIndex();
/// Add all of the functions arguments, basic blocks, and instructions.
void processFunction();
@@ -830,7 +894,10 @@ SlotTracker::SlotTracker(const Function *F, bool ShouldInitializeAllMetadata)
: TheModule(F ? F->getParent() : nullptr), TheFunction(F),
ShouldInitializeAllMetadata(ShouldInitializeAllMetadata) {}
-inline void SlotTracker::initialize() {
+SlotTracker::SlotTracker(const ModuleSummaryIndex *Index)
+ : TheModule(nullptr), ShouldInitializeAllMetadata(false), TheIndex(Index) {}
+
+inline void SlotTracker::initializeIfNeeded() {
if (TheModule) {
processModule();
TheModule = nullptr; ///< Prevent re-processing next time we're called.
@@ -840,6 +907,13 @@ inline void SlotTracker::initialize() {
processFunction();
}
+void SlotTracker::initializeIndexIfNeeded() {
+ if (!TheIndex)
+ return;
+ processIndex();
+ TheIndex = nullptr; ///< Prevent re-processing next time we're called.
+}
+
// Iterate through all the global variables, functions, and global
// variable initializers and create slots for them.
void SlotTracker::processModule() {
@@ -931,6 +1005,32 @@ void SlotTracker::processFunction() {
ST_DEBUG("end processFunction!\n");
}
+// Iterate through all the GUIDs in the index and create slots for them.
+void SlotTracker::processIndex() {
+ ST_DEBUG("begin processIndex!\n");
+ assert(TheIndex);
+
+ // The first block of slots are just the module ids, which start at 0 and are
+ // assigned consecutively. Since the StringMap iteration order isn't
+ // guaranteed, use a std::map to order by module ID before assigning slots.
+ std::map<uint64_t, StringRef> ModuleIdToPathMap;
+ for (auto &ModPath : TheIndex->modulePaths())
+ ModuleIdToPathMap[ModPath.second.first] = ModPath.first();
+ for (auto &ModPair : ModuleIdToPathMap)
+ CreateModulePathSlot(ModPair.second);
+
+ // Start numbering the GUIDs after the module ids.
+ GUIDNext = ModulePathNext;
+
+ for (auto &GlobalList : *TheIndex)
+ CreateGUIDSlot(GlobalList.first);
+
+ for (auto &TId : TheIndex->typeIds())
+ CreateGUIDSlot(GlobalValue::getGUID(TId.first));
+
+ ST_DEBUG("end processIndex!\n");
+}
+
void SlotTracker::processGlobalObjectMetadata(const GlobalObject &GO) {
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
GO.getAllMetadata(MDs);
@@ -977,7 +1077,7 @@ void SlotTracker::purgeFunction() {
/// getGlobalSlot - Get the slot number of a global value.
int SlotTracker::getGlobalSlot(const GlobalValue *V) {
// Check for uninitialized state and do lazy initialization.
- initialize();
+ initializeIfNeeded();
// Find the value in the module map
ValueMap::iterator MI = mMap.find(V);
@@ -987,7 +1087,7 @@ int SlotTracker::getGlobalSlot(const GlobalValue *V) {
/// getMetadataSlot - Get the slot number of a MDNode.
int SlotTracker::getMetadataSlot(const MDNode *N) {
// Check for uninitialized state and do lazy initialization.
- initialize();
+ initializeIfNeeded();
// Find the MDNode in the module map
mdn_iterator MI = mdnMap.find(N);
@@ -999,7 +1099,7 @@ int SlotTracker::getLocalSlot(const Value *V) {
assert(!isa<Constant>(V) && "Can't get a constant or global slot with this!");
// Check for uninitialized state and do lazy initialization.
- initialize();
+ initializeIfNeeded();
ValueMap::iterator FI = fMap.find(V);
return FI == fMap.end() ? -1 : (int)FI->second;
@@ -1007,13 +1107,31 @@ int SlotTracker::getLocalSlot(const Value *V) {
int SlotTracker::getAttributeGroupSlot(AttributeSet AS) {
// Check for uninitialized state and do lazy initialization.
- initialize();
+ initializeIfNeeded();
// Find the AttributeSet in the module map.
as_iterator AI = asMap.find(AS);
return AI == asMap.end() ? -1 : (int)AI->second;
}
+int SlotTracker::getModulePathSlot(StringRef Path) {
+ // Check for uninitialized state and do lazy initialization.
+ initializeIndexIfNeeded();
+
+ // Find the Module path in the map
+ auto I = ModulePathMap.find(Path);
+ return I == ModulePathMap.end() ? -1 : (int)I->second;
+}
+
+int SlotTracker::getGUIDSlot(GlobalValue::GUID GUID) {
+ // Check for uninitialized state and do lazy initialization.
+ initializeIndexIfNeeded();
+
+ // Find the GUID in the map
+ guid_iterator I = GUIDMap.find(GUID);
+ return I == GUIDMap.end() ? -1 : (int)I->second;
+}
+
/// CreateModuleSlot - Insert the specified GlobalValue* into the slot table.
void SlotTracker::CreateModuleSlot(const GlobalValue *V) {
assert(V && "Can't insert a null Value into SlotTracker!");
@@ -1074,6 +1192,16 @@ void SlotTracker::CreateAttributeSetSlot(AttributeSet AS) {
asMap[AS] = DestSlot;
}
+/// Create a new slot for the specified Module
+void SlotTracker::CreateModulePathSlot(StringRef Path) {
+ ModulePathMap[Path] = ModulePathNext++;
+}
+
+/// Create a new slot for the specified GUID
+void SlotTracker::CreateGUIDSlot(GlobalValue::GUID GUID) {
+ GUIDMap[GUID] = GUIDNext++;
+}
+
//===----------------------------------------------------------------------===//
// AsmWriter Implementation
//===----------------------------------------------------------------------===//
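
The index-aware SlotTracker pieces added above assign dense slot numbers first to module paths (ordered by module id) and then to GUIDs, so the printed summary can refer to entries as ^0, ^1, and so on. A compact sketch of that two-phase numbering, using illustrative standalone types:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Assign consecutive slots: module paths first, GUIDs after them, matching
// the order processIndex() establishes before anything is printed.
struct SlotAssignment {
  std::map<std::string, unsigned> PathSlots;
  std::map<uint64_t, unsigned> GuidSlots;
};

SlotAssignment numberIndex(const std::vector<std::string> &PathsByModuleId,
                           const std::vector<uint64_t> &Guids) {
  SlotAssignment Slots;
  unsigned Next = 0;
  for (const std::string &Path : PathsByModuleId)
    Slots.PathSlots[Path] = Next++;       // module ids come first
  for (uint64_t Guid : Guids)
    if (!Slots.GuidSlots.count(Guid))
      Slots.GuidSlots[Guid] = Next++;     // GUIDs continue the numbering
  return Slots;
}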
@@ -1277,7 +1405,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
// i8 with ConstantInt values.
if (CA->isString()) {
Out << "c\"";
- PrintEscapedString(CA->getAsString(), Out);
+ printEscapedString(CA->getAsString(), Out);
Out << '"';
return;
}
@@ -1463,7 +1591,7 @@ struct MDFieldPrinter {
void printTag(const DINode *N);
void printMacinfoType(const DIMacroNode *N);
- void printChecksumKind(const DIFile *N);
+ void printChecksum(const DIFile::ChecksumInfo<StringRef> &N);
void printString(StringRef Name, StringRef Value,
bool ShouldSkipEmpty = true);
void printMetadata(StringRef Name, const Metadata *MD,
@@ -1498,11 +1626,10 @@ void MDFieldPrinter::printMacinfoType(const DIMacroNode *N) {
Out << N->getMacinfoType();
}
-void MDFieldPrinter::printChecksumKind(const DIFile *N) {
- if (N->getChecksumKind() == DIFile::CSK_None)
- // Skip CSK_None checksum kind.
- return;
- Out << FS << "checksumkind: " << N->getChecksumKindAsString();
+void MDFieldPrinter::printChecksum(
+ const DIFile::ChecksumInfo<StringRef> &Checksum) {
+ Out << FS << "checksumkind: " << Checksum.getKindAsString();
+ printString("checksum", Checksum.Value, /* ShouldSkipEmpty */ false);
}
void MDFieldPrinter::printString(StringRef Name, StringRef Value,
@@ -1511,7 +1638,7 @@ void MDFieldPrinter::printString(StringRef Name, StringRef Value,
return;
Out << FS << Name << ": \"";
- PrintEscapedString(Value, Out);
+ printEscapedString(Value, Out);
Out << "\"";
}
@@ -1571,7 +1698,7 @@ void MDFieldPrinter::printDIFlags(StringRef Name, DINode::DIFlags Flags) {
void MDFieldPrinter::printEmissionKind(StringRef Name,
DICompileUnit::DebugEmissionKind EK) {
- Out << FS << Name << ": " << DICompileUnit::EmissionKindString(EK);
+ Out << FS << Name << ": " << DICompileUnit::emissionKindString(EK);
}
template <class IntTy, class Stringifier>
@@ -1621,10 +1748,15 @@ static void writeDILocation(raw_ostream &Out, const DILocation *DL,
}
static void writeDISubrange(raw_ostream &Out, const DISubrange *N,
- TypePrinting *, SlotTracker *, const Module *) {
+ TypePrinting *TypePrinter, SlotTracker *Machine,
+ const Module *Context) {
Out << "!DISubrange(";
- MDFieldPrinter Printer(Out);
- Printer.printInt("count", N->getCount(), /* ShouldSkipZero */ false);
+ MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ if (auto *CE = N->getCount().dyn_cast<ConstantInt*>())
+ Printer.printInt("count", CE->getSExtValue(), /* ShouldSkipZero */ false);
+ else
+ Printer.printMetadata("count", N->getCount().dyn_cast<DIVariable*>(),
+ /*ShouldSkipNull */ false);
Printer.printInt("lowerBound", N->getLowerBound());
Out << ")";
}
@@ -1634,7 +1766,13 @@ static void writeDIEnumerator(raw_ostream &Out, const DIEnumerator *N,
Out << "!DIEnumerator(";
MDFieldPrinter Printer(Out);
Printer.printString("name", N->getName(), /* ShouldSkipEmpty */ false);
- Printer.printInt("value", N->getValue(), /* ShouldSkipZero */ false);
+ if (N->isUnsigned()) {
+ auto Value = static_cast<uint64_t>(N->getValue());
+ Printer.printInt("value", Value, /* ShouldSkipZero */ false);
+ Printer.printBool("isUnsigned", true);
+ } else {
+ Printer.printInt("value", N->getValue(), /* ShouldSkipZero */ false);
+ }
Out << ")";
}
@@ -1696,6 +1834,7 @@ static void writeDICompositeType(raw_ostream &Out, const DICompositeType *N,
Printer.printMetadata("vtableHolder", N->getRawVTableHolder());
Printer.printMetadata("templateParams", N->getRawTemplateParams());
Printer.printString("identifier", N->getIdentifier());
+ Printer.printMetadata("discriminator", N->getRawDiscriminator());
Out << ")";
}
@@ -1719,8 +1858,11 @@ static void writeDIFile(raw_ostream &Out, const DIFile *N, TypePrinting *,
/* ShouldSkipEmpty */ false);
Printer.printString("directory", N->getDirectory(),
/* ShouldSkipEmpty */ false);
- Printer.printChecksumKind(N);
- Printer.printString("checksum", N->getChecksum(), /* ShouldSkipEmpty */ true);
+ // Print all values for checksum together, or not at all.
+ if (N->getChecksum())
+ Printer.printChecksum(*N->getChecksum());
+ Printer.printString("source", N->getSource().getValueOr(StringRef()),
+ /* ShouldSkipEmpty */ true);
Out << ")";
}
@@ -1778,7 +1920,7 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
Printer.printMetadata("unit", N->getRawUnit());
Printer.printMetadata("templateParams", N->getRawTemplateParams());
Printer.printMetadata("declaration", N->getRawDeclaration());
- Printer.printMetadata("variables", N->getRawVariables());
+ Printer.printMetadata("retainedNodes", N->getRawRetainedNodes());
Printer.printMetadata("thrownTypes", N->getRawThrownTypes());
Out << ")";
}
@@ -1918,6 +2060,18 @@ static void writeDILocalVariable(raw_ostream &Out, const DILocalVariable *N,
Out << ")";
}
+static void writeDILabel(raw_ostream &Out, const DILabel *N,
+ TypePrinting *TypePrinter,
+ SlotTracker *Machine, const Module *Context) {
+ Out << "!DILabel(";
+ MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
+ Printer.printString("name", N->getName());
+ Printer.printMetadata("file", N->getRawFile());
+ Printer.printInt("line", N->getLine());
+ Out << ")";
+}
+
static void writeDIExpression(raw_ostream &Out, const DIExpression *N,
TypePrinting *TypePrinter, SlotTracker *Machine,
const Module *Context) {
@@ -2028,9 +2182,9 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
if (IA->getDialect() == InlineAsm::AD_Intel)
Out << "inteldialect ";
Out << '"';
- PrintEscapedString(IA->getAsmString(), Out);
+ printEscapedString(IA->getAsmString(), Out);
Out << "\", \"";
- PrintEscapedString(IA->getConstraintString(), Out);
+ printEscapedString(IA->getConstraintString(), Out);
Out << '"';
return;
}
@@ -2109,7 +2263,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
if (const MDString *MDS = dyn_cast<MDString>(MD)) {
Out << "!\"";
- PrintEscapedString(MDS->getString(), Out);
+ printEscapedString(MDS->getString(), Out);
Out << '"';
return;
}
@@ -2128,11 +2282,12 @@ namespace {
class AssemblyWriter {
formatted_raw_ostream &Out;
- const Module *TheModule;
+ const Module *TheModule = nullptr;
+ const ModuleSummaryIndex *TheIndex = nullptr;
std::unique_ptr<SlotTracker> SlotTrackerStorage;
SlotTracker &Machine;
TypePrinting TypePrinter;
- AssemblyAnnotationWriter *AnnotationWriter;
+ AssemblyAnnotationWriter *AnnotationWriter = nullptr;
SetVector<const Comdat *> Comdats;
bool IsForDebug;
bool ShouldPreserveUseListOrder;
@@ -2140,6 +2295,7 @@ class AssemblyWriter {
SmallVector<StringRef, 8> MDNames;
/// Synchronization scope names registered with LLVMContext.
SmallVector<StringRef, 8> SSNs;
+ DenseMap<const GlobalValueSummary *, GlobalValue::GUID> SummaryToGUIDMap;
public:
/// Construct an AssemblyWriter with an external SlotTracker
@@ -2147,6 +2303,9 @@ public:
AssemblyAnnotationWriter *AAW, bool IsForDebug,
bool ShouldPreserveUseListOrder = false);
+ AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
+ const ModuleSummaryIndex *Index, bool IsForDebug);
+
void printMDNodeBody(const MDNode *MD);
void printNamedMDNode(const NamedMDNode *NMD);
@@ -2182,8 +2341,27 @@ public:
void printUseListOrder(const UseListOrder &Order);
void printUseLists(const Function *F);
+ void printModuleSummaryIndex();
+ void printSummaryInfo(unsigned Slot, const ValueInfo &VI);
+ void printSummary(const GlobalValueSummary &Summary);
+ void printAliasSummary(const AliasSummary *AS);
+ void printGlobalVarSummary(const GlobalVarSummary *GS);
+ void printFunctionSummary(const FunctionSummary *FS);
+ void printTypeIdSummary(const TypeIdSummary &TIS);
+ void printTypeTestResolution(const TypeTestResolution &TTRes);
+ void printArgs(const std::vector<uint64_t> &Args);
+ void printWPDRes(const WholeProgramDevirtResolution &WPDRes);
+ void printTypeIdInfo(const FunctionSummary::TypeIdInfo &TIDInfo);
+ void printVFuncId(const FunctionSummary::VFuncId VFId);
+ void
+ printNonConstVCalls(const std::vector<FunctionSummary::VFuncId> VCallList,
+ const char *Tag);
+ void
+ printConstVCalls(const std::vector<FunctionSummary::ConstVCall> VCallList,
+ const char *Tag);
+
private:
- /// \brief Print out metadata attachments.
+ /// Print out metadata attachments.
void printMetadataAttachments(
const SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs,
StringRef Separator);
@@ -2202,17 +2380,21 @@ private:
AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
const Module *M, AssemblyAnnotationWriter *AAW,
bool IsForDebug, bool ShouldPreserveUseListOrder)
- : Out(o), TheModule(M), Machine(Mac), AnnotationWriter(AAW),
+ : Out(o), TheModule(M), Machine(Mac), TypePrinter(M), AnnotationWriter(AAW),
IsForDebug(IsForDebug),
ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
if (!TheModule)
return;
- TypePrinter.incorporateTypes(*TheModule);
for (const GlobalObject &GO : TheModule->global_objects())
if (const Comdat *C = GO.getComdat())
Comdats.insert(C);
}
+AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
+ const ModuleSummaryIndex *Index, bool IsForDebug)
+ : Out(o), TheIndex(Index), Machine(Mac), TypePrinter(/*Module=*/nullptr),
+ IsForDebug(IsForDebug), ShouldPreserveUseListOrder(false) {}
+
void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
if (!Operand) {
Out << "<null operand!>";
@@ -2236,7 +2418,7 @@ void AssemblyWriter::writeSyncScope(const LLVMContext &Context,
Context.getSyncScopeNames(SSNs);
Out << " syncscope(\"";
- PrintEscapedString(SSNs[SSID], Out);
+ printEscapedString(SSNs[SSID], Out);
Out << "\")";
break;
}
@@ -2297,7 +2479,7 @@ void AssemblyWriter::writeOperandBundles(ImmutableCallSite CS) {
FirstBundle = false;
Out << '"';
- PrintEscapedString(BU.getTagName(), Out);
+ printEscapedString(BU.getTagName(), Out);
Out << '"';
Out << '(';
@@ -2320,7 +2502,7 @@ void AssemblyWriter::writeOperandBundles(ImmutableCallSite CS) {
}
void AssemblyWriter::printModule(const Module *M) {
- Machine.initialize();
+ Machine.initializeIfNeeded();
if (ShouldPreserveUseListOrder)
UseListOrders = predictUseListOrder(M);
@@ -2333,7 +2515,7 @@ void AssemblyWriter::printModule(const Module *M) {
if (!M->getSourceFileName().empty()) {
Out << "source_filename = \"";
- PrintEscapedString(M->getSourceFileName(), Out);
+ printEscapedString(M->getSourceFileName(), Out);
Out << "\"\n";
}
@@ -2355,7 +2537,7 @@ void AssemblyWriter::printModule(const Module *M) {
// We found a newline, print the portion of the asm string from the
// last newline up to this newline.
Out << "module asm \"";
- PrintEscapedString(Front, Out);
+ printEscapedString(Front, Out);
Out << "\"\n";
} while (!Asm.empty());
}
@@ -2414,6 +2596,428 @@ void AssemblyWriter::printModule(const Module *M) {
}
}
+void AssemblyWriter::printModuleSummaryIndex() {
+ assert(TheIndex);
+ Machine.initializeIndexIfNeeded();
+
+ Out << "\n";
+
+ // Print module path entries. To print in order, add paths to a vector
+ // indexed by module slot.
+ std::vector<std::pair<std::string, ModuleHash>> moduleVec;
+ std::string RegularLTOModuleName = "[Regular LTO]";
+ moduleVec.resize(TheIndex->modulePaths().size());
+ for (auto &ModPath : TheIndex->modulePaths())
+ moduleVec[Machine.getModulePathSlot(ModPath.first())] = std::make_pair(
+ // A module id of -1 is a special entry for a regular LTO module created
+ // during the thin link.
+ ModPath.second.first == -1u ? RegularLTOModuleName
+ : (std::string)ModPath.first(),
+ ModPath.second.second);
+
+ unsigned i = 0;
+ for (auto &ModPair : moduleVec) {
+ Out << "^" << i++ << " = module: (";
+ Out << "path: \"";
+ printEscapedString(ModPair.first, Out);
+ Out << "\", hash: (";
+ FieldSeparator FS;
+ for (auto Hash : ModPair.second)
+ Out << FS << Hash;
+ Out << "))\n";
+ }
+
+ // FIXME: Change AliasSummary to hold a ValueInfo instead of summary pointer
+ // for aliasee (then update BitcodeWriter.cpp and remove get/setAliaseeGUID).
+ for (auto &GlobalList : *TheIndex) {
+ auto GUID = GlobalList.first;
+ for (auto &Summary : GlobalList.second.SummaryList)
+ SummaryToGUIDMap[Summary.get()] = GUID;
+ }
+
+ // Print the global value summary entries.
+ for (auto &GlobalList : *TheIndex) {
+ auto GUID = GlobalList.first;
+ auto VI = TheIndex->getValueInfo(GlobalList);
+ printSummaryInfo(Machine.getGUIDSlot(GUID), VI);
+ }
+
+ // Print the TypeIdMap entries.
+ for (auto &TId : TheIndex->typeIds()) {
+ auto GUID = GlobalValue::getGUID(TId.first);
+ Out << "^" << Machine.getGUIDSlot(GUID) << " = typeid: (name: \""
+ << TId.first << "\"";
+ printTypeIdSummary(TId.second);
+ Out << ") ; guid = " << GUID << "\n";
+ }
+}
+
+static const char *
+getWholeProgDevirtResKindName(WholeProgramDevirtResolution::Kind K) {
+ switch (K) {
+ case WholeProgramDevirtResolution::Indir:
+ return "indir";
+ case WholeProgramDevirtResolution::SingleImpl:
+ return "singleImpl";
+ case WholeProgramDevirtResolution::BranchFunnel:
+ return "branchFunnel";
+ }
+ llvm_unreachable("invalid WholeProgramDevirtResolution kind");
+}
+
+static const char *getWholeProgDevirtResByArgKindName(
+ WholeProgramDevirtResolution::ByArg::Kind K) {
+ switch (K) {
+ case WholeProgramDevirtResolution::ByArg::Indir:
+ return "indir";
+ case WholeProgramDevirtResolution::ByArg::UniformRetVal:
+ return "uniformRetVal";
+ case WholeProgramDevirtResolution::ByArg::UniqueRetVal:
+ return "uniqueRetVal";
+ case WholeProgramDevirtResolution::ByArg::VirtualConstProp:
+ return "virtualConstProp";
+ }
+ llvm_unreachable("invalid WholeProgramDevirtResolution::ByArg kind");
+}
+
+static const char *getTTResKindName(TypeTestResolution::Kind K) {
+ switch (K) {
+ case TypeTestResolution::Unsat:
+ return "unsat";
+ case TypeTestResolution::ByteArray:
+ return "byteArray";
+ case TypeTestResolution::Inline:
+ return "inline";
+ case TypeTestResolution::Single:
+ return "single";
+ case TypeTestResolution::AllOnes:
+ return "allOnes";
+ }
+ llvm_unreachable("invalid TypeTestResolution kind");
+}
+
+void AssemblyWriter::printTypeTestResolution(const TypeTestResolution &TTRes) {
+ Out << "typeTestRes: (kind: " << getTTResKindName(TTRes.TheKind)
+ << ", sizeM1BitWidth: " << TTRes.SizeM1BitWidth;
+
+ // The following fields are only used if the target does not support the use
+ // of absolute symbols to store constants. Print only if non-zero.
+ if (TTRes.AlignLog2)
+ Out << ", alignLog2: " << TTRes.AlignLog2;
+ if (TTRes.SizeM1)
+ Out << ", sizeM1: " << TTRes.SizeM1;
+ if (TTRes.BitMask)
+ // BitMask is uint8_t which causes it to print the corresponding char.
+ Out << ", bitMask: " << (unsigned)TTRes.BitMask;
+ if (TTRes.InlineBits)
+ Out << ", inlineBits: " << TTRes.InlineBits;
+
+ Out << ")";
+}
+
+void AssemblyWriter::printTypeIdSummary(const TypeIdSummary &TIS) {
+ Out << ", summary: (";
+ printTypeTestResolution(TIS.TTRes);
+ if (!TIS.WPDRes.empty()) {
+ Out << ", wpdResolutions: (";
+ FieldSeparator FS;
+ for (auto &WPDRes : TIS.WPDRes) {
+ Out << FS;
+ Out << "(offset: " << WPDRes.first << ", ";
+ printWPDRes(WPDRes.second);
+ Out << ")";
+ }
+ Out << ")";
+ }
+ Out << ")";
+}
+
+void AssemblyWriter::printArgs(const std::vector<uint64_t> &Args) {
+ Out << "args: (";
+ FieldSeparator FS;
+ for (auto arg : Args) {
+ Out << FS;
+ Out << arg;
+ }
+ Out << ")";
+}
+
+void AssemblyWriter::printWPDRes(const WholeProgramDevirtResolution &WPDRes) {
+ Out << "wpdRes: (kind: ";
+ Out << getWholeProgDevirtResKindName(WPDRes.TheKind);
+
+ if (WPDRes.TheKind == WholeProgramDevirtResolution::SingleImpl)
+ Out << ", singleImplName: \"" << WPDRes.SingleImplName << "\"";
+
+ if (!WPDRes.ResByArg.empty()) {
+ Out << ", resByArg: (";
+ FieldSeparator FS;
+ for (auto &ResByArg : WPDRes.ResByArg) {
+ Out << FS;
+ printArgs(ResByArg.first);
+ Out << ", byArg: (kind: ";
+ Out << getWholeProgDevirtResByArgKindName(ResByArg.second.TheKind);
+ if (ResByArg.second.TheKind ==
+ WholeProgramDevirtResolution::ByArg::UniformRetVal ||
+ ResByArg.second.TheKind ==
+ WholeProgramDevirtResolution::ByArg::UniqueRetVal)
+ Out << ", info: " << ResByArg.second.Info;
+
+ // The following fields are only used if the target does not support the
+ // use of absolute symbols to store constants. Print only if non-zero.
+ if (ResByArg.second.Byte || ResByArg.second.Bit)
+ Out << ", byte: " << ResByArg.second.Byte
+ << ", bit: " << ResByArg.second.Bit;
+
+ Out << ")";
+ }
+ Out << ")";
+ }
+ Out << ")";
+}
+
+static const char *getSummaryKindName(GlobalValueSummary::SummaryKind SK) {
+ switch (SK) {
+ case GlobalValueSummary::AliasKind:
+ return "alias";
+ case GlobalValueSummary::FunctionKind:
+ return "function";
+ case GlobalValueSummary::GlobalVarKind:
+ return "variable";
+ }
+ llvm_unreachable("invalid summary kind");
+}
+
+void AssemblyWriter::printAliasSummary(const AliasSummary *AS) {
+ Out << ", aliasee: ";
+ // The indexes emitted for distributed backends may not include the
+ // aliasee summary (only if it is being imported directly). Handle
+ // that case by just emitting "null" as the aliasee.
+ if (AS->hasAliasee())
+ Out << "^" << Machine.getGUIDSlot(SummaryToGUIDMap[&AS->getAliasee()]);
+ else
+ Out << "null";
+}
+
+void AssemblyWriter::printGlobalVarSummary(const GlobalVarSummary *GS) {
+ // Nothing for now
+}
+
+static std::string getLinkageName(GlobalValue::LinkageTypes LT) {
+ switch (LT) {
+ case GlobalValue::ExternalLinkage:
+ return "external";
+ case GlobalValue::PrivateLinkage:
+ return "private";
+ case GlobalValue::InternalLinkage:
+ return "internal";
+ case GlobalValue::LinkOnceAnyLinkage:
+ return "linkonce";
+ case GlobalValue::LinkOnceODRLinkage:
+ return "linkonce_odr";
+ case GlobalValue::WeakAnyLinkage:
+ return "weak";
+ case GlobalValue::WeakODRLinkage:
+ return "weak_odr";
+ case GlobalValue::CommonLinkage:
+ return "common";
+ case GlobalValue::AppendingLinkage:
+ return "appending";
+ case GlobalValue::ExternalWeakLinkage:
+ return "extern_weak";
+ case GlobalValue::AvailableExternallyLinkage:
+ return "available_externally";
+ }
+ llvm_unreachable("invalid linkage");
+}
+
+// When printing linkage types in IR, ExternalLinkage is not printed at all,
+// and the other linkage types are printed with a trailing space after the
+// name.
+static std::string getLinkageNameWithSpace(GlobalValue::LinkageTypes LT) {
+ if (LT == GlobalValue::ExternalLinkage)
+ return "";
+ return getLinkageName(LT) + " ";
+}
+
+static const char *getHotnessName(CalleeInfo::HotnessType HT) {
+ switch (HT) {
+ case CalleeInfo::HotnessType::Unknown:
+ return "unknown";
+ case CalleeInfo::HotnessType::Cold:
+ return "cold";
+ case CalleeInfo::HotnessType::None:
+ return "none";
+ case CalleeInfo::HotnessType::Hot:
+ return "hot";
+ case CalleeInfo::HotnessType::Critical:
+ return "critical";
+ }
+ llvm_unreachable("invalid hotness");
+}
+
+void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) {
+ Out << ", insts: " << FS->instCount();
+
+ FunctionSummary::FFlags FFlags = FS->fflags();
+ if (FFlags.ReadNone | FFlags.ReadOnly | FFlags.NoRecurse |
+ FFlags.ReturnDoesNotAlias) {
+ Out << ", funcFlags: (";
+ Out << "readNone: " << FFlags.ReadNone;
+ Out << ", readOnly: " << FFlags.ReadOnly;
+ Out << ", noRecurse: " << FFlags.NoRecurse;
+ Out << ", returnDoesNotAlias: " << FFlags.ReturnDoesNotAlias;
+ Out << ")";
+ }
+ if (!FS->calls().empty()) {
+ Out << ", calls: (";
+ FieldSeparator IFS;
+ for (auto &Call : FS->calls()) {
+ Out << IFS;
+ Out << "(callee: ^" << Machine.getGUIDSlot(Call.first.getGUID());
+ if (Call.second.getHotness() != CalleeInfo::HotnessType::Unknown)
+ Out << ", hotness: " << getHotnessName(Call.second.getHotness());
+ else if (Call.second.RelBlockFreq)
+ Out << ", relbf: " << Call.second.RelBlockFreq;
+ Out << ")";
+ }
+ Out << ")";
+ }
+
+ if (const auto *TIdInfo = FS->getTypeIdInfo())
+ printTypeIdInfo(*TIdInfo);
+}
+
+void AssemblyWriter::printTypeIdInfo(
+ const FunctionSummary::TypeIdInfo &TIDInfo) {
+ Out << ", typeIdInfo: (";
+ FieldSeparator TIDFS;
+ if (!TIDInfo.TypeTests.empty()) {
+ Out << TIDFS;
+ Out << "typeTests: (";
+ FieldSeparator FS;
+ for (auto &GUID : TIDInfo.TypeTests) {
+ Out << FS;
+ auto Slot = Machine.getGUIDSlot(GUID);
+ if (Slot != -1)
+ Out << "^" << Slot;
+ else
+ Out << GUID;
+ }
+ Out << ")";
+ }
+ if (!TIDInfo.TypeTestAssumeVCalls.empty()) {
+ Out << TIDFS;
+ printNonConstVCalls(TIDInfo.TypeTestAssumeVCalls, "typeTestAssumeVCalls");
+ }
+ if (!TIDInfo.TypeCheckedLoadVCalls.empty()) {
+ Out << TIDFS;
+ printNonConstVCalls(TIDInfo.TypeCheckedLoadVCalls, "typeCheckedLoadVCalls");
+ }
+ if (!TIDInfo.TypeTestAssumeConstVCalls.empty()) {
+ Out << TIDFS;
+ printConstVCalls(TIDInfo.TypeTestAssumeConstVCalls,
+ "typeTestAssumeConstVCalls");
+ }
+ if (!TIDInfo.TypeCheckedLoadConstVCalls.empty()) {
+ Out << TIDFS;
+ printConstVCalls(TIDInfo.TypeCheckedLoadConstVCalls,
+ "typeCheckedLoadConstVCalls");
+ }
+ Out << ")";
+}
+
+void AssemblyWriter::printVFuncId(const FunctionSummary::VFuncId VFId) {
+ Out << "vFuncId: (";
+ auto Slot = Machine.getGUIDSlot(VFId.GUID);
+ if (Slot != -1)
+ Out << "^" << Slot;
+ else
+ Out << "guid: " << VFId.GUID;
+ Out << ", offset: " << VFId.Offset;
+ Out << ")";
+}
+
+void AssemblyWriter::printNonConstVCalls(
+ const std::vector<FunctionSummary::VFuncId> VCallList, const char *Tag) {
+ Out << Tag << ": (";
+ FieldSeparator FS;
+ for (auto &VFuncId : VCallList) {
+ Out << FS;
+ printVFuncId(VFuncId);
+ }
+ Out << ")";
+}
+
+void AssemblyWriter::printConstVCalls(
+ const std::vector<FunctionSummary::ConstVCall> VCallList, const char *Tag) {
+ Out << Tag << ": (";
+ FieldSeparator FS;
+ for (auto &ConstVCall : VCallList) {
+ Out << FS;
+ printVFuncId(ConstVCall.VFunc);
+ if (!ConstVCall.Args.empty()) {
+ Out << ", ";
+ printArgs(ConstVCall.Args);
+ }
+ }
+ Out << ")";
+}
+
+void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) {
+ GlobalValueSummary::GVFlags GVFlags = Summary.flags();
+ GlobalValue::LinkageTypes LT = (GlobalValue::LinkageTypes)GVFlags.Linkage;
+ Out << getSummaryKindName(Summary.getSummaryKind()) << ": ";
+ Out << "(module: ^" << Machine.getModulePathSlot(Summary.modulePath())
+ << ", flags: (";
+ Out << "linkage: " << getLinkageName(LT);
+ Out << ", notEligibleToImport: " << GVFlags.NotEligibleToImport;
+ Out << ", live: " << GVFlags.Live;
+ Out << ", dsoLocal: " << GVFlags.DSOLocal;
+ Out << ")";
+
+ if (Summary.getSummaryKind() == GlobalValueSummary::AliasKind)
+ printAliasSummary(cast<AliasSummary>(&Summary));
+ else if (Summary.getSummaryKind() == GlobalValueSummary::FunctionKind)
+ printFunctionSummary(cast<FunctionSummary>(&Summary));
+ else
+ printGlobalVarSummary(cast<GlobalVarSummary>(&Summary));
+
+ auto RefList = Summary.refs();
+ if (!RefList.empty()) {
+ Out << ", refs: (";
+ FieldSeparator FS;
+ for (auto &Ref : RefList) {
+ Out << FS;
+ Out << "^" << Machine.getGUIDSlot(Ref.getGUID());
+ }
+ Out << ")";
+ }
+
+ Out << ")";
+}
+
+void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
+ Out << "^" << Slot << " = gv: (";
+ if (!VI.name().empty())
+ Out << "name: \"" << VI.name() << "\"";
+ else
+ Out << "guid: " << VI.getGUID();
+ if (!VI.getSummaryList().empty()) {
+ Out << ", summaries: (";
+ FieldSeparator FS;
+ for (auto &Summary : VI.getSummaryList()) {
+ Out << FS;
+ printSummary(*Summary);
+ }
+ Out << ")";
+ }
+ Out << ")";
+ if (!VI.name().empty())
+ Out << " ; guid = " << VI.getGUID();
+ Out << "\n";
+}
+
static void printMetadataIdentifier(StringRef Name,
formatted_raw_ostream &Out) {
if (Name.empty()) {
@@ -2460,34 +3064,6 @@ void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) {
Out << "}\n";
}
-static const char *getLinkagePrintName(GlobalValue::LinkageTypes LT) {
- switch (LT) {
- case GlobalValue::ExternalLinkage:
- return "";
- case GlobalValue::PrivateLinkage:
- return "private ";
- case GlobalValue::InternalLinkage:
- return "internal ";
- case GlobalValue::LinkOnceAnyLinkage:
- return "linkonce ";
- case GlobalValue::LinkOnceODRLinkage:
- return "linkonce_odr ";
- case GlobalValue::WeakAnyLinkage:
- return "weak ";
- case GlobalValue::WeakODRLinkage:
- return "weak_odr ";
- case GlobalValue::CommonLinkage:
- return "common ";
- case GlobalValue::AppendingLinkage:
- return "appending ";
- case GlobalValue::ExternalWeakLinkage:
- return "extern_weak ";
- case GlobalValue::AvailableExternallyLinkage:
- return "available_externally ";
- }
- llvm_unreachable("invalid linkage");
-}
-
static void PrintVisibility(GlobalValue::VisibilityTypes Vis,
formatted_raw_ostream &Out) {
switch (Vis) {
@@ -2497,8 +3073,13 @@ static void PrintVisibility(GlobalValue::VisibilityTypes Vis,
}
}
-static void PrintDSOLocation(bool IsDSOLocal, formatted_raw_ostream &Out){
- if (IsDSOLocal)
+static void PrintDSOLocation(const GlobalValue &GV,
+ formatted_raw_ostream &Out) {
+  // GVs with local linkage or non-default visibility are implicitly
+  // dso_local, so we don't print it.
+ bool Implicit = GV.hasLocalLinkage() ||
+ (!GV.hasExternalWeakLinkage() && !GV.hasDefaultVisibility());
+ if (GV.isDSOLocal() && !Implicit)
Out << "dso_local ";
}
@@ -2571,8 +3152,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
if (!GV->hasInitializer() && GV->hasExternalLinkage())
Out << "external ";
- Out << getLinkagePrintName(GV->getLinkage());
- PrintDSOLocation(GV->isDSOLocal(), Out);
+ Out << getLinkageNameWithSpace(GV->getLinkage());
+ PrintDSOLocation(*GV, Out);
PrintVisibility(GV->getVisibility(), Out);
PrintDLLStorageClass(GV->getDLLStorageClass(), Out);
PrintThreadLocalModel(GV->getThreadLocalMode(), Out);
@@ -2593,7 +3174,7 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
if (GV->hasSection()) {
Out << ", section \"";
- PrintEscapedString(GV->getSection(), Out);
+ printEscapedString(GV->getSection(), Out);
Out << '"';
}
maybePrintComdat(Out, *GV);
@@ -2618,8 +3199,8 @@ void AssemblyWriter::printIndirectSymbol(const GlobalIndirectSymbol *GIS) {
WriteAsOperandInternal(Out, GIS, &TypePrinter, &Machine, GIS->getParent());
Out << " = ";
- Out << getLinkagePrintName(GIS->getLinkage());
- PrintDSOLocation(GIS->isDSOLocal(), Out);
+ Out << getLinkageNameWithSpace(GIS->getLinkage());
+ PrintDSOLocation(*GIS, Out);
PrintVisibility(GIS->getVisibility(), Out);
PrintDLLStorageClass(GIS->getDLLStorageClass(), Out);
PrintThreadLocalModel(GIS->getThreadLocalMode(), Out);
@@ -2656,39 +3237,30 @@ void AssemblyWriter::printComdat(const Comdat *C) {
}
void AssemblyWriter::printTypeIdentities() {
- if (TypePrinter.NumberedTypes.empty() &&
- TypePrinter.NamedTypes.empty())
+ if (TypePrinter.empty())
return;
Out << '\n';
- // We know all the numbers that each type is used and we know that it is a
- // dense assignment. Convert the map to an index table.
- std::vector<StructType*> NumberedTypes(TypePrinter.NumberedTypes.size());
- for (DenseMap<StructType*, unsigned>::iterator I =
- TypePrinter.NumberedTypes.begin(), E = TypePrinter.NumberedTypes.end();
- I != E; ++I) {
- assert(I->second < NumberedTypes.size() && "Didn't get a dense numbering?");
- NumberedTypes[I->second] = I->first;
- }
-
// Emit all numbered types.
- for (unsigned i = 0, e = NumberedTypes.size(); i != e; ++i) {
- Out << '%' << i << " = type ";
+ auto &NumberedTypes = TypePrinter.getNumberedTypes();
+ for (unsigned I = 0, E = NumberedTypes.size(); I != E; ++I) {
+ Out << '%' << I << " = type ";
// Make sure we print out at least one level of the type structure, so
// that we do not get %2 = type %2
- TypePrinter.printStructBody(NumberedTypes[i], Out);
+ TypePrinter.printStructBody(NumberedTypes[I], Out);
Out << '\n';
}
- for (unsigned i = 0, e = TypePrinter.NamedTypes.size(); i != e; ++i) {
- PrintLLVMName(Out, TypePrinter.NamedTypes[i]->getName(), LocalPrefix);
+ auto &NamedTypes = TypePrinter.getNamedTypes();
+ for (unsigned I = 0, E = NamedTypes.size(); I != E; ++I) {
+ PrintLLVMName(Out, NamedTypes[I]->getName(), LocalPrefix);
Out << " = type ";
// Make sure we print out at least one level of the type structure, so
// that we do not get %FILE = type %FILE
- TypePrinter.printStructBody(TypePrinter.NamedTypes[i], Out);
+ TypePrinter.printStructBody(NamedTypes[I], Out);
Out << '\n';
}
}
@@ -2730,8 +3302,8 @@ void AssemblyWriter::printFunction(const Function *F) {
} else
Out << "define ";
- Out << getLinkagePrintName(F->getLinkage());
- PrintDSOLocation(F->isDSOLocal(), Out);
+ Out << getLinkageNameWithSpace(F->getLinkage());
+ PrintDSOLocation(*F, Out);
PrintVisibility(F->getVisibility(), Out);
PrintDLLStorageClass(F->getDLLStorageClass(), Out);
@@ -2786,7 +3358,7 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes());
if (F->hasSection()) {
Out << " section \"";
- PrintEscapedString(F->getSection(), Out);
+ printEscapedString(F->getSection(), Out);
Out << '"';
}
maybePrintComdat(Out, *F);
@@ -3558,9 +4130,7 @@ static bool printWithoutType(const Value &V, raw_ostream &O,
static void printAsOperandImpl(const Value &V, raw_ostream &O, bool PrintType,
ModuleSlotTracker &MST) {
- TypePrinting TypePrinter;
- if (const Module *M = MST.getModule())
- TypePrinter.incorporateTypes(*M);
+ TypePrinting TypePrinter(MST.getModule());
if (PrintType) {
TypePrinter.print(V.getType(), O);
O << ' ';
@@ -3599,9 +4169,7 @@ static void printMetadataImpl(raw_ostream &ROS, const Metadata &MD,
bool OnlyAsOperand) {
formatted_raw_ostream OS(ROS);
- TypePrinting TypePrinter;
- if (M)
- TypePrinter.incorporateTypes(*M);
+ TypePrinting TypePrinter(M);
WriteAsOperandInternal(OS, &MD, &TypePrinter, MST.getMachine(), M,
/* FromValue */ true);
@@ -3635,6 +4203,13 @@ void Metadata::print(raw_ostream &OS, ModuleSlotTracker &MST,
printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
}
+void ModuleSummaryIndex::print(raw_ostream &ROS, bool IsForDebug) const {
+ SlotTracker SlotTable(this);
+ formatted_raw_ostream OS(ROS);
+ AssemblyWriter W(OS, SlotTable, this, IsForDebug);
+ W.printModuleSummaryIndex();
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Value::dump - allow easy printing of Values from the debugger.
LLVM_DUMP_METHOD
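
With the printer wired up in the hunk above, a summary index can be dumped as text the same way Modules and Values already are. A minimal usage sketch of the new entry point:

#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/raw_ostream.h"

// Writes the ^0 = module: (...) / ^N = gv: (...) records to stdout.
void printSummaryIndex(const llvm::ModuleSummaryIndex &Index) {
  Index.print(llvm::outs(), /*IsForDebug=*/false);
}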
@@ -3651,7 +4226,7 @@ void Module::dump() const {
/*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true);
}
-// \brief Allow printing of Comdats from the debugger.
+// Allow printing of Comdats from the debugger.
LLVM_DUMP_METHOD
void Comdat::dump() const { print(dbgs(), /*IsForDebug=*/true); }
@@ -3667,4 +4242,8 @@ void Metadata::dump(const Module *M) const {
print(dbgs(), M, /*IsForDebug=*/true);
dbgs() << '\n';
}
+
+// Allow printing of ModuleSummaryIndex from the debugger.
+LLVM_DUMP_METHOD
+void ModuleSummaryIndex::dump() const { print(dbgs(), /*IsForDebug=*/true); }
#endif
diff --git a/contrib/llvm/lib/IR/AttributeImpl.h b/contrib/llvm/lib/IR/AttributeImpl.h
index 9c7b61f67923..bb0c072e4781 100644
--- a/contrib/llvm/lib/IR/AttributeImpl.h
+++ b/contrib/llvm/lib/IR/AttributeImpl.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines various helper methods and classes used by
+/// This file defines various helper methods and classes used by
/// LLVMContextImpl for creating and managing attributes.
///
//===----------------------------------------------------------------------===//
@@ -33,7 +33,7 @@ class LLVMContext;
//===----------------------------------------------------------------------===//
/// \class
-/// \brief This class represents a single, uniqued attribute. That attribute
+/// This class represents a single, uniqued attribute. That attribute
/// could be a single enum, a tuple, or a string.
class AttributeImpl : public FoldingSetNode {
unsigned char KindID; ///< Holds the AttrEntryKind of the attribute
@@ -67,7 +67,7 @@ public:
StringRef getKindAsString() const;
StringRef getValueAsString() const;
- /// \brief Used when sorting the attributes.
+ /// Used when sorting the attributes.
bool operator<(const AttributeImpl &AI) const;
void Profile(FoldingSetNodeID &ID) const {
@@ -93,7 +93,7 @@ public:
//===----------------------------------------------------------------------===//
/// \class
-/// \brief A set of classes that contain the value of the
+/// A set of classes that contain the value of the
/// attribute object. There are three main categories: enum attribute entries,
/// represented by Attribute::AttrKind; alignment attribute entries; and string
/// attribute entries, which are for target-dependent attributes.
@@ -148,7 +148,7 @@ public:
//===----------------------------------------------------------------------===//
/// \class
-/// \brief This class represents a group of attributes that apply to one
+/// This class represents a group of attributes that apply to one
/// element: function, return type, or parameter.
class AttributeSetNode final
: public FoldingSetNode,
@@ -172,7 +172,7 @@ public:
static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
- /// \brief Return the number of attributes this AttributeList contains.
+ /// Return the number of attributes this AttributeList contains.
unsigned getNumAttributes() const { return NumAttrs; }
bool hasAttribute(Attribute::AttrKind Kind) const {
@@ -210,7 +210,7 @@ using IndexAttrPair = std::pair<unsigned, AttributeSet>;
//===----------------------------------------------------------------------===//
/// \class
-/// \brief This class represents a set of attributes that apply to the function,
+/// This class represents a set of attributes that apply to the function,
/// return type, and parameters.
class AttributeListImpl final
: public FoldingSetNode,
@@ -236,10 +236,10 @@ public:
void operator delete(void *p) { ::operator delete(p); }
- /// \brief Get the context that created this AttributeListImpl.
+ /// Get the context that created this AttributeListImpl.
LLVMContext &getContext() { return Context; }
- /// \brief Return true if the AttributeSet or the FunctionIndex has an
+ /// Return true if the AttributeSet or the FunctionIndex has an
/// enum attribute of the given kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const {
return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
diff --git a/contrib/llvm/lib/IR/Attributes.cpp b/contrib/llvm/lib/IR/Attributes.cpp
index 1b19a0474727..9e5f55d49756 100644
--- a/contrib/llvm/lib/IR/Attributes.cpp
+++ b/contrib/llvm/lib/IR/Attributes.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief This file implements the Attribute, AttributeImpl, AttrBuilder,
+// This file implements the Attribute, AttributeImpl, AttrBuilder,
// AttributeListImpl, and AttributeList classes.
//
//===----------------------------------------------------------------------===//
@@ -24,6 +24,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
@@ -38,7 +39,6 @@
#include <cstddef>
#include <cstdint>
#include <limits>
-#include <map>
#include <string>
#include <tuple>
#include <utility>
@@ -186,14 +186,14 @@ uint64_t Attribute::getValueAsInt() const {
}
StringRef Attribute::getKindAsString() const {
- if (!pImpl) return StringRef();
+ if (!pImpl) return {};
assert(isStringAttribute() &&
"Invalid attribute type to get the kind as a string!");
return pImpl->getKindAsString();
}
StringRef Attribute::getValueAsString() const {
- if (!pImpl) return StringRef();
+ if (!pImpl) return {};
assert(isStringAttribute() &&
"Invalid attribute type to get the value as a string!");
return pImpl->getValueAsString();
@@ -241,7 +241,7 @@ std::pair<unsigned, Optional<unsigned>> Attribute::getAllocSizeArgs() const {
}
std::string Attribute::getAsString(bool InAttrGrp) const {
- if (!pImpl) return "";
+ if (!pImpl) return {};
if (hasAttribute(Attribute::SanitizeAddress))
return "sanitize_address";
@@ -299,10 +299,14 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "noredzone";
if (hasAttribute(Attribute::NoReturn))
return "noreturn";
+ if (hasAttribute(Attribute::NoCfCheck))
+ return "nocf_check";
if (hasAttribute(Attribute::NoRecurse))
return "norecurse";
if (hasAttribute(Attribute::NoUnwind))
return "nounwind";
+ if (hasAttribute(Attribute::OptForFuzzing))
+ return "optforfuzzing";
if (hasAttribute(Attribute::OptimizeNone))
return "optnone";
if (hasAttribute(Attribute::OptimizeForSize))
@@ -329,6 +333,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "sspstrong";
if (hasAttribute(Attribute::SafeStack))
return "safestack";
+ if (hasAttribute(Attribute::ShadowCallStack))
+ return "shadowcallstack";
if (hasAttribute(Attribute::StrictFP))
return "strictfp";
if (hasAttribute(Attribute::StructRet))
@@ -413,7 +419,7 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
{
raw_string_ostream OS(Result);
OS << "=\"";
- PrintEscapedString(AttrVal, OS);
+ printEscapedString(AttrVal, OS);
OS << "\"";
}
return Result;
@@ -534,7 +540,7 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C,
return *this;
AttrBuilder B(AS);
- for (Attribute I : *this)
+ for (const auto I : *this)
B.addAttribute(I);
return get(C, B);
@@ -543,26 +549,21 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C,
AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
Attribute::AttrKind Kind) const {
if (!hasAttribute(Kind)) return *this;
- AttrBuilder B;
- B.addAttribute(Kind);
- return removeAttributes(C, B);
+ AttrBuilder B(*this);
+ B.removeAttribute(Kind);
+ return get(C, B);
}
AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
StringRef Kind) const {
if (!hasAttribute(Kind)) return *this;
- AttrBuilder B;
- B.addAttribute(Kind);
- return removeAttributes(C, B);
+ AttrBuilder B(*this);
+ B.removeAttribute(Kind);
+ return get(C, B);
}
AttributeSet AttributeSet::removeAttributes(LLVMContext &C,
const AttrBuilder &Attrs) const {
-
- // FIXME it is not obvious how this should work for alignment.
- // For now, say we can't pass in alignment, which no current use does.
- assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
-
AttrBuilder B(*this);
B.remove(Attrs);
return get(C, B);
@@ -638,7 +639,7 @@ AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs)
// There's memory after the node where we can store the entries in.
std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
- for (Attribute I : *this) {
+ for (const auto I : *this) {
if (!I.isStringAttribute()) {
AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
}
@@ -655,9 +656,9 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
FoldingSetNodeID ID;
SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end());
- std::sort(SortedAttrs.begin(), SortedAttrs.end());
+ llvm::sort(SortedAttrs.begin(), SortedAttrs.end());
- for (Attribute Attr : SortedAttrs)
+ for (const auto Attr : SortedAttrs)
Attr.Profile(ID);
void *InsertPoint;
@@ -720,7 +721,7 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
}
bool AttributeSetNode::hasAttribute(StringRef Kind) const {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Kind))
return true;
return false;
@@ -728,43 +729,43 @@ bool AttributeSetNode::hasAttribute(StringRef Kind) const {
Attribute AttributeSetNode::getAttribute(Attribute::AttrKind Kind) const {
if (hasAttribute(Kind)) {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Kind))
return I;
}
- return Attribute();
+ return {};
}
Attribute AttributeSetNode::getAttribute(StringRef Kind) const {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Kind))
return I;
- return Attribute();
+ return {};
}
unsigned AttributeSetNode::getAlignment() const {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Attribute::Alignment))
return I.getAlignment();
return 0;
}
unsigned AttributeSetNode::getStackAlignment() const {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Attribute::StackAlignment))
return I.getStackAlignment();
return 0;
}
uint64_t AttributeSetNode::getDereferenceableBytes() const {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Attribute::Dereferenceable))
return I.getDereferenceableBytes();
return 0;
}
uint64_t AttributeSetNode::getDereferenceableOrNullBytes() const {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Attribute::DereferenceableOrNull))
return I.getDereferenceableOrNullBytes();
return 0;
@@ -772,7 +773,7 @@ uint64_t AttributeSetNode::getDereferenceableOrNullBytes() const {
std::pair<unsigned, Optional<unsigned>>
AttributeSetNode::getAllocSizeArgs() const {
- for (Attribute I : *this)
+ for (const auto I : *this)
if (I.hasAttribute(Attribute::AllocSize))
return I.getAllocSizeArgs();
return std::make_pair(0, 0);
@@ -814,7 +815,7 @@ AttributeListImpl::AttributeListImpl(LLVMContext &C,
"Too many attributes");
static_assert(attrIdxToArrayIdx(AttributeList::FunctionIndex) == 0U,
"function should be stored in slot 0");
- for (Attribute I : Sets[0]) {
+ for (const auto I : Sets[0]) {
if (!I.isStringAttribute())
AvailableFunctionAttrs |= 1ULL << I.getKindAsEnum();
}
@@ -871,17 +872,17 @@ AttributeList::get(LLVMContext &C,
ArrayRef<std::pair<unsigned, Attribute>> Attrs) {
// If there are no attributes then return a null AttributesList pointer.
if (Attrs.empty())
- return AttributeList();
+ return {};
assert(std::is_sorted(Attrs.begin(), Attrs.end(),
[](const std::pair<unsigned, Attribute> &LHS,
const std::pair<unsigned, Attribute> &RHS) {
return LHS.first < RHS.first;
}) && "Misordered Attributes list!");
- assert(none_of(Attrs,
- [](const std::pair<unsigned, Attribute> &Pair) {
- return Pair.second.hasAttribute(Attribute::None);
- }) &&
+ assert(llvm::none_of(Attrs,
+ [](const std::pair<unsigned, Attribute> &Pair) {
+ return Pair.second.hasAttribute(Attribute::None);
+ }) &&
"Pointless attribute!");
// Create a vector of (unsigned, AttributeSetNode*) pairs from the attributes
@@ -907,7 +908,7 @@ AttributeList::get(LLVMContext &C,
ArrayRef<std::pair<unsigned, AttributeSet>> Attrs) {
// If there are no attributes then return a null AttributesList pointer.
if (Attrs.empty())
- return AttributeList();
+ return {};
assert(std::is_sorted(Attrs.begin(), Attrs.end(),
[](const std::pair<unsigned, AttributeSet> &LHS,
@@ -915,16 +916,20 @@ AttributeList::get(LLVMContext &C,
return LHS.first < RHS.first;
}) &&
"Misordered Attributes list!");
- assert(none_of(Attrs,
- [](const std::pair<unsigned, AttributeSet> &Pair) {
- return !Pair.second.hasAttributes();
- }) &&
+ assert(llvm::none_of(Attrs,
+ [](const std::pair<unsigned, AttributeSet> &Pair) {
+ return !Pair.second.hasAttributes();
+ }) &&
"Pointless attribute!");
unsigned MaxIndex = Attrs.back().first;
+ // If the MaxIndex is FunctionIndex and there are other indices in front
+ // of it, we need to use the largest of those to get the right size.
+ if (MaxIndex == FunctionIndex && Attrs.size() > 1)
+ MaxIndex = Attrs[Attrs.size() - 2].first;
SmallVector<AttributeSet, 4> AttrVec(attrIdxToArrayIdx(MaxIndex) + 1);
- for (auto Pair : Attrs)
+ for (const auto Pair : Attrs)
AttrVec[attrIdxToArrayIdx(Pair.first)] = Pair.second;
return getImpl(C, AttrVec);
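For illustration, a minimal standalone sketch (plain C++, not part of the patch) of why the MaxIndex adjustment above is needed: assuming the index scheme the earlier static_assert describes, the FunctionIndex sentinel (~0U) sorts last numerically but lives in array slot 0, while every other index shifts up by one, so sizing the vector from the sentinel alone would under-allocate. The attrIdxToArrayIdx and FunctionIndex definitions below are simplified stand-ins for the LLVM ones.

#include <cassert>
#include <vector>

static const unsigned FunctionIndex = ~0U;

// Assumed mapping: the function sentinel goes to slot 0, everything else shifts by one.
static unsigned attrIdxToArrayIdx(unsigned Index) {
  return Index == FunctionIndex ? 0 : Index + 1;
}

int main() {
  // Attribute indices sorted ascending: parameter #2, then the function sentinel.
  std::vector<unsigned> Indices = {2, FunctionIndex};
  unsigned MaxIndex = Indices.back();
  // Without the fix the vector would get attrIdxToArrayIdx(~0U) + 1 == 1 slot.
  if (MaxIndex == FunctionIndex && Indices.size() > 1)
    MaxIndex = Indices[Indices.size() - 2];
  std::vector<int> Slots(attrIdxToArrayIdx(MaxIndex) + 1);
  assert(Slots.size() == 4 && "param index 2 maps to array slot 3");
  (void)Slots;
  return 0;
}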
@@ -954,7 +959,7 @@ AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs,
// If all attribute sets were empty, we can use the empty attribute list.
if (NumSets == 0)
- return AttributeList();
+ return {};
SmallVector<AttributeSet, 8> AttrSets;
AttrSets.reserve(NumSets);
@@ -974,7 +979,7 @@ AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs,
AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
const AttrBuilder &B) {
if (!B.hasAttributes())
- return AttributeList();
+ return {};
Index = attrIdxToArrayIdx(Index);
SmallVector<AttributeSet, 8> AttrSets(Index + 1);
AttrSets[Index] = AttributeSet::get(C, B);
@@ -984,7 +989,7 @@ AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
ArrayRef<Attribute::AttrKind> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
- for (Attribute::AttrKind K : Kinds)
+ for (const auto K : Kinds)
Attrs.emplace_back(Index, Attribute::get(C, K));
return get(C, Attrs);
}
@@ -992,7 +997,7 @@ AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
ArrayRef<StringRef> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
- for (StringRef K : Kinds)
+ for (const auto K : Kinds)
Attrs.emplace_back(Index, Attribute::get(C, K));
return get(C, Attrs);
}
@@ -1000,22 +1005,22 @@ AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
AttributeList AttributeList::get(LLVMContext &C,
ArrayRef<AttributeList> Attrs) {
if (Attrs.empty())
- return AttributeList();
+ return {};
if (Attrs.size() == 1)
return Attrs[0];
unsigned MaxSize = 0;
- for (AttributeList List : Attrs)
+ for (const auto List : Attrs)
MaxSize = std::max(MaxSize, List.getNumAttrSets());
// If every list was empty, there is no point in merging the lists.
if (MaxSize == 0)
- return AttributeList();
+ return {};
SmallVector<AttributeSet, 8> NewAttrSets(MaxSize);
for (unsigned I = 0; I < MaxSize; ++I) {
AttrBuilder CurBuilder;
- for (AttributeList List : Attrs)
+ for (const auto List : Attrs)
CurBuilder.merge(List.getAttributes(I - 1));
NewAttrSets[I] = AttributeSet::get(C, CurBuilder);
}
@@ -1098,37 +1103,41 @@ AttributeList AttributeList::addParamAttribute(LLVMContext &C,
AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
Attribute::AttrKind Kind) const {
if (!hasAttribute(Index, Kind)) return *this;
- AttrBuilder B;
- B.addAttribute(Kind);
- return removeAttributes(C, Index, B);
+
+ Index = attrIdxToArrayIdx(Index);
+ SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
+ assert(Index < AttrSets.size());
+
+ AttrSets[Index] = AttrSets[Index].removeAttribute(C, Kind);
+
+ return getImpl(C, AttrSets);
}
AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
StringRef Kind) const {
if (!hasAttribute(Index, Kind)) return *this;
- AttrBuilder B;
- B.addAttribute(Kind);
- return removeAttributes(C, Index, B);
+
+ Index = attrIdxToArrayIdx(Index);
+ SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
+ assert(Index < AttrSets.size());
+
+ AttrSets[Index] = AttrSets[Index].removeAttribute(C, Kind);
+
+ return getImpl(C, AttrSets);
}
AttributeList
AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
const AttrBuilder &AttrsToRemove) const {
if (!pImpl)
- return AttributeList();
-
- // FIXME it is not obvious how this should work for alignment.
- // For now, say we can't pass in alignment, which no current use does.
- assert(!AttrsToRemove.hasAlignmentAttr() && "Attempt to change alignment!");
+ return {};
Index = attrIdxToArrayIdx(Index);
SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
if (Index >= AttrSets.size())
AttrSets.resize(Index + 1);
- AttrBuilder B(AttrSets[Index]);
- B.remove(AttrsToRemove);
- AttrSets[Index] = AttributeSet::get(C, B);
+ AttrSets[Index] = AttrSets[Index].removeAttributes(C, AttrsToRemove);
return getImpl(C, AttrSets);
}
@@ -1136,7 +1145,7 @@ AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
AttributeList AttributeList::removeAttributes(LLVMContext &C,
unsigned WithoutIndex) const {
if (!pImpl)
- return AttributeList();
+ return {};
WithoutIndex = attrIdxToArrayIdx(WithoutIndex);
if (WithoutIndex >= getNumAttrSets())
return *this;
@@ -1270,7 +1279,7 @@ std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const {
AttributeSet AttributeList::getAttributes(unsigned Index) const {
Index = attrIdxToArrayIdx(Index);
if (!pImpl || Index >= getNumAttrSets())
- return AttributeSet();
+ return {};
return pImpl->begin()[Index];
}
@@ -1310,12 +1319,12 @@ LLVM_DUMP_METHOD void AttributeList::dump() const {
// FIXME: Remove this ctor, use AttributeSet.
AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) {
AttributeSet AS = AL.getAttributes(Index);
- for (const Attribute &A : AS)
+ for (const auto &A : AS)
addAttribute(A);
}
AttrBuilder::AttrBuilder(AttributeSet AS) {
- for (const Attribute &A : AS)
+ for (const auto &A : AS)
addAttribute(A);
}
@@ -1386,7 +1395,7 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeList A, uint64_t Index) {
}
AttrBuilder &AttrBuilder::removeAttribute(StringRef A) {
- std::map<std::string, std::string>::iterator I = TargetDepAttrs.find(A);
+ auto I = TargetDepAttrs.find(A);
if (I != TargetDepAttrs.end())
TargetDepAttrs.erase(I);
return *this;
@@ -1526,7 +1535,7 @@ bool AttrBuilder::hasAttributes() const {
bool AttrBuilder::hasAttributes(AttributeList AL, uint64_t Index) const {
AttributeSet AS = AL.getAttributes(Index);
- for (Attribute Attr : AS) {
+ for (const auto Attr : AS) {
if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
if (contains(Attr.getKindAsEnum()))
return true;
@@ -1560,7 +1569,7 @@ bool AttrBuilder::operator==(const AttrBuilder &B) {
// AttributeFuncs Function Definitions
//===----------------------------------------------------------------------===//
-/// \brief Which attributes cannot be applied to a type.
+/// Which attributes cannot be applied to a type.
AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
AttrBuilder Incompatible;
@@ -1592,7 +1601,7 @@ static bool isEqual(const Function &Caller, const Function &Callee) {
Callee.getFnAttribute(AttrClass::getKind());
}
-/// \brief Compute the logical AND of the attributes of the caller and the
+/// Compute the logical AND of the attributes of the caller and the
/// callee.
///
/// This function sets the caller's attribute to false if the callee's attribute
@@ -1604,7 +1613,7 @@ static void setAND(Function &Caller, const Function &Callee) {
AttrClass::set(Caller, AttrClass::getKind(), false);
}
-/// \brief Compute the logical OR of the attributes of the caller and the
+/// Compute the logical OR of the attributes of the caller and the
/// callee.
///
/// This function sets the caller's attribute to true if the callee's attribute
@@ -1616,7 +1625,7 @@ static void setOR(Function &Caller, const Function &Callee) {
AttrClass::set(Caller, AttrClass::getKind(), true);
}
-/// \brief If the inlined function had a higher stack protection level than the
+/// If the inlined function had a higher stack protection level than the
/// calling function, then bump up the caller's stack protection level.
static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
// If upgrading the SSP attribute, clear out the old SSP Attributes first.
@@ -1640,7 +1649,7 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
Caller.addFnAttr(Attribute::StackProtect);
}
-/// \brief If the inlined function required stack probes, then ensure that
+/// If the inlined function required stack probes, then ensure that
/// the calling function has those too.
static void adjustCallerStackProbes(Function &Caller, const Function &Callee) {
if (!Caller.hasFnAttribute("probe-stack") &&
@@ -1649,7 +1658,7 @@ static void adjustCallerStackProbes(Function &Caller, const Function &Callee) {
}
}
-/// \brief If the inlined function defines the size of guard region
+/// If the inlined function defines the size of guard region
/// on the stack, then ensure that the calling function defines a guard region
/// that is no larger.
static void
@@ -1673,6 +1682,33 @@ adjustCallerStackProbeSize(Function &Caller, const Function &Callee) {
}
}
+/// If the inlined function defines a min legal vector width, then ensure
+/// the calling function has the same or larger min legal vector width. This
+/// function is called after the inlining decision has been made so we have to
+/// merge the attribute this way. Heuristics that would use
+/// min-legal-vector-width to determine inline compatibility would need to be
+/// handled as part of inline cost analysis.
+static void
+adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) {
+ if (Callee.hasFnAttribute("min-legal-vector-width")) {
+ uint64_t CalleeVectorWidth;
+ Callee.getFnAttribute("min-legal-vector-width")
+ .getValueAsString()
+ .getAsInteger(0, CalleeVectorWidth);
+ if (Caller.hasFnAttribute("min-legal-vector-width")) {
+ uint64_t CallerVectorWidth;
+ Caller.getFnAttribute("min-legal-vector-width")
+ .getValueAsString()
+ .getAsInteger(0, CallerVectorWidth);
+ if (CallerVectorWidth < CalleeVectorWidth) {
+ Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width"));
+ }
+ } else {
+ Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width"));
+ }
+ }
+}
+
#define GET_ATTR_COMPAT_FUNC
#include "AttributesCompatFunc.inc"
diff --git a/contrib/llvm/lib/IR/AutoUpgrade.cpp b/contrib/llvm/lib/IR/AutoUpgrade.cpp
index c56a022c6705..ef62a23b5358 100644
--- a/contrib/llvm/lib/IR/AutoUpgrade.cpp
+++ b/contrib/llvm/lib/IR/AutoUpgrade.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
@@ -73,11 +74,36 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
if (Name=="ssse3.pabs.b.128" || // Added in 6.0
Name=="ssse3.pabs.w.128" || // Added in 6.0
Name=="ssse3.pabs.d.128" || // Added in 6.0
+ Name.startswith("fma4.vfmadd.s") || // Added in 7.0
+ Name.startswith("fma.vfmadd.") || // Added in 7.0
+ Name.startswith("fma.vfmsub.") || // Added in 7.0
+ Name.startswith("fma.vfmaddsub.") || // Added in 7.0
+ Name.startswith("fma.vfmsubadd.") || // Added in 7.0
+ Name.startswith("fma.vfnmadd.") || // Added in 7.0
+ Name.startswith("fma.vfnmsub.") || // Added in 7.0
+ Name.startswith("avx512.mask.vfmadd.") || // Added in 7.0
+ Name.startswith("avx512.mask.vfnmadd.") || // Added in 7.0
+ Name.startswith("avx512.mask.vfnmsub.") || // Added in 7.0
+ Name.startswith("avx512.mask3.vfmadd.") || // Added in 7.0
+ Name.startswith("avx512.maskz.vfmadd.") || // Added in 7.0
+ Name.startswith("avx512.mask3.vfmsub.") || // Added in 7.0
+ Name.startswith("avx512.mask3.vfnmsub.") || // Added in 7.0
+ Name.startswith("avx512.mask.vfmaddsub.") || // Added in 7.0
+ Name.startswith("avx512.maskz.vfmaddsub.") || // Added in 7.0
+ Name.startswith("avx512.mask3.vfmaddsub.") || // Added in 7.0
+ Name.startswith("avx512.mask3.vfmsubadd.") || // Added in 7.0
Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
+ Name.startswith("avx512.kunpck") || //added in 6.0
Name.startswith("avx2.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pabs.") || // Added in 6.0
Name.startswith("avx512.broadcastm") || // Added in 6.0
+ Name == "sse.sqrt.ss" || // Added in 7.0
+ Name == "sse2.sqrt.sd" || // Added in 7.0
+ Name.startswith("avx512.mask.sqrt.p") || // Added in 7.0
+ Name.startswith("avx.sqrt.p") || // Added in 7.0
+ Name.startswith("sse2.sqrt.p") || // Added in 7.0
+ Name.startswith("sse.sqrt.p") || // Added in 7.0
Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
Name.startswith("sse2.pcmpeq.") || // Added in 3.1
Name.startswith("sse2.pcmpgt.") || // Added in 3.1
@@ -107,6 +133,14 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name == "sse2.pminu.b" || // Added in 3.9
Name == "sse41.pminuw" || // Added in 3.9
Name == "sse41.pminud" || // Added in 3.9
+ Name == "avx512.kand.w" || // Added in 7.0
+ Name == "avx512.kandn.w" || // Added in 7.0
+ Name == "avx512.knot.w" || // Added in 7.0
+ Name == "avx512.kor.w" || // Added in 7.0
+ Name == "avx512.kxor.w" || // Added in 7.0
+ Name == "avx512.kxnor.w" || // Added in 7.0
+ Name == "avx512.kortestc.w" || // Added in 7.0
+ Name == "avx512.kortestz.w" || // Added in 7.0
Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0
Name.startswith("avx2.pmax") || // Added in 3.9
Name.startswith("avx2.pmin") || // Added in 3.9
@@ -145,8 +179,37 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx512.mask.pmull.") || // Added in 4.0
Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0
Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0
+ Name == "avx512.mask.cvtudq2ps.128" || // Added in 7.0
+ Name == "avx512.mask.cvtudq2ps.256" || // Added in 7.0
+ Name == "avx512.mask.cvtqq2pd.128" || // Added in 7.0
+ Name == "avx512.mask.cvtqq2pd.256" || // Added in 7.0
+ Name == "avx512.mask.cvtuqq2pd.128" || // Added in 7.0
+ Name == "avx512.mask.cvtuqq2pd.256" || // Added in 7.0
+ Name == "avx512.mask.cvtdq2ps.128" || // Added in 7.0
+ Name == "avx512.mask.cvtdq2ps.256" || // Added in 7.0
+ Name == "avx512.mask.cvtpd2dq.256" || // Added in 7.0
+ Name == "avx512.mask.cvtpd2ps.256" || // Added in 7.0
+ Name == "avx512.mask.cvttpd2dq.256" || // Added in 7.0
+ Name == "avx512.mask.cvttps2dq.128" || // Added in 7.0
+ Name == "avx512.mask.cvttps2dq.256" || // Added in 7.0
+ Name == "avx512.mask.cvtps2pd.128" || // Added in 7.0
+ Name == "avx512.mask.cvtps2pd.256" || // Added in 7.0
+ Name == "avx512.cvtusi2sd" || // Added in 7.0
+ Name.startswith("avx512.mask.permvar.") || // Added in 7.0
+ Name.startswith("avx512.mask.permvar.") || // Added in 7.0
+ Name == "sse2.pmulu.dq" || // Added in 7.0
+ Name == "sse41.pmuldq" || // Added in 7.0
+ Name == "avx2.pmulu.dq" || // Added in 7.0
+ Name == "avx2.pmul.dq" || // Added in 7.0
+ Name == "avx512.pmulu.dq.512" || // Added in 7.0
+ Name == "avx512.pmul.dq.512" || // Added in 7.0
Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0
Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0
+ Name.startswith("avx512.mask.pmul.hr.sw.") || // Added in 7.0
+ Name.startswith("avx512.mask.pmulh.w.") || // Added in 7.0
+ Name.startswith("avx512.mask.pmulhu.w.") || // Added in 7.0
+ Name.startswith("avx512.mask.pmaddw.d.") || // Added in 7.0
+ Name.startswith("avx512.mask.pmaddubs.w.") || // Added in 7.0
Name.startswith("avx512.mask.packsswb.") || // Added in 5.0
Name.startswith("avx512.mask.packssdw.") || // Added in 5.0
Name.startswith("avx512.mask.packuswb.") || // Added in 5.0
@@ -155,31 +218,12 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx512.mask.cmp.d") || // Added in 5.0
Name.startswith("avx512.mask.cmp.q") || // Added in 5.0
Name.startswith("avx512.mask.cmp.w") || // Added in 5.0
+ Name.startswith("avx512.mask.cmp.p") || // Added in 7.0
Name.startswith("avx512.mask.ucmp.") || // Added in 5.0
- Name == "avx512.mask.add.pd.128" || // Added in 4.0
- Name == "avx512.mask.add.pd.256" || // Added in 4.0
- Name == "avx512.mask.add.ps.128" || // Added in 4.0
- Name == "avx512.mask.add.ps.256" || // Added in 4.0
- Name == "avx512.mask.div.pd.128" || // Added in 4.0
- Name == "avx512.mask.div.pd.256" || // Added in 4.0
- Name == "avx512.mask.div.ps.128" || // Added in 4.0
- Name == "avx512.mask.div.ps.256" || // Added in 4.0
- Name == "avx512.mask.mul.pd.128" || // Added in 4.0
- Name == "avx512.mask.mul.pd.256" || // Added in 4.0
- Name == "avx512.mask.mul.ps.128" || // Added in 4.0
- Name == "avx512.mask.mul.ps.256" || // Added in 4.0
- Name == "avx512.mask.sub.pd.128" || // Added in 4.0
- Name == "avx512.mask.sub.pd.256" || // Added in 4.0
- Name == "avx512.mask.sub.ps.128" || // Added in 4.0
- Name == "avx512.mask.sub.ps.256" || // Added in 4.0
- Name == "avx512.mask.max.pd.128" || // Added in 5.0
- Name == "avx512.mask.max.pd.256" || // Added in 5.0
- Name == "avx512.mask.max.ps.128" || // Added in 5.0
- Name == "avx512.mask.max.ps.256" || // Added in 5.0
- Name == "avx512.mask.min.pd.128" || // Added in 5.0
- Name == "avx512.mask.min.pd.256" || // Added in 5.0
- Name == "avx512.mask.min.ps.128" || // Added in 5.0
- Name == "avx512.mask.min.ps.256" || // Added in 5.0
+ Name.startswith("avx512.cvtb2mask.") || // Added in 7.0
+ Name.startswith("avx512.cvtw2mask.") || // Added in 7.0
+ Name.startswith("avx512.cvtd2mask.") || // Added in 7.0
+ Name.startswith("avx512.cvtq2mask.") || // Added in 7.0
Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
Name.startswith("avx512.mask.psll.d") || // Added in 4.0
Name.startswith("avx512.mask.psll.q") || // Added in 4.0
@@ -203,9 +247,45 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
+ Name.startswith("avx512.mask.pternlog.") || // Added in 7.0
+ Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpmadd52") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpmadd52") || // Added in 7.0
+ Name.startswith("avx512.mask.vpermi2var.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpermt2var.") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpermt2var.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpdpbusd.") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpdpbusd.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpdpbusds.") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpdpbusds.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpdpwssd.") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpdpwssd.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpdpwssds.") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpdpwssds.") || // Added in 7.0
+ Name.startswith("avx512.mask.dbpsadbw.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpshld.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpshrd.") || // Added in 7.0
+ Name.startswith("avx512.mask.add.p") || // Added in 7.0. 128/256 in 4.0
+ Name.startswith("avx512.mask.sub.p") || // Added in 7.0. 128/256 in 4.0
+ Name.startswith("avx512.mask.mul.p") || // Added in 7.0. 128/256 in 4.0
+ Name.startswith("avx512.mask.div.p") || // Added in 7.0. 128/256 in 4.0
+ Name.startswith("avx512.mask.max.p") || // Added in 7.0. 128/256 in 5.0
+ Name.startswith("avx512.mask.min.p") || // Added in 7.0. 128/256 in 5.0
+ Name.startswith("avx512.mask.fpclass.p") || // Added in 7.0
+ Name.startswith("avx512.mask.prorv.") || // Added in 7.0
+ Name.startswith("avx512.mask.pror.") || // Added in 7.0
+ Name.startswith("avx512.mask.prolv.") || // Added in 7.0
+ Name.startswith("avx512.mask.prol.") || // Added in 7.0
+ Name == "sse.cvtsi2ss" || // Added in 7.0
+ Name == "sse.cvtsi642ss" || // Added in 7.0
+ Name == "sse2.cvtsi2sd" || // Added in 7.0
+ Name == "sse2.cvtsi642sd" || // Added in 7.0
+ Name == "sse2.cvtss2sd" || // Added in 7.0
Name == "sse2.cvtdq2pd" || // Added in 3.9
+ Name == "sse2.cvtdq2ps" || // Added in 7.0
Name == "sse2.cvtps2pd" || // Added in 3.9
Name == "avx.cvtdq2.pd.256" || // Added in 3.9
+ Name == "avx.cvtdq2.ps.256" || // Added in 7.0
Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
Name.startswith("avx.vinsertf128.") || // Added in 3.7
Name == "avx2.vinserti128" || // Added in 3.7
@@ -229,10 +309,14 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx512.mask.store.w.") || // Added in 3.9
Name.startswith("avx512.mask.store.d.") || // Added in 3.9
Name.startswith("avx512.mask.store.q.") || // Added in 3.9
+ Name == "avx512.mask.store.ss" || // Added in 7.0
Name.startswith("avx512.mask.loadu.") || // Added in 3.9
Name.startswith("avx512.mask.load.") || // Added in 3.9
+ Name.startswith("avx512.mask.expand.load.") || // Added in 7.0
+ Name.startswith("avx512.mask.compress.store.") || // Added in 7.0
Name == "sse42.crc32.64.8" || // Added in 3.4
Name.startswith("avx.vbroadcast.s") || // Added in 3.5
+ Name.startswith("avx512.vbroadcast.s") || // Added in 7.0
Name.startswith("avx512.mask.palignr.") || // Added in 3.9
Name.startswith("avx512.mask.valign.") || // Added in 4.0
Name.startswith("sse2.psll.dq") || // Added in 3.7
@@ -472,6 +556,17 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return true;
}
}
+ if (Name.startswith("invariant.group.barrier")) {
+ // Rename invariant.group.barrier to launder.invariant.group
+ auto Args = F->getFunctionType()->params();
+ Type* ObjectPtr[1] = {Args[0]};
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::launder_invariant_group, ObjectPtr);
+ return true;
+
+ }
+
break;
}
case 'm': {
@@ -517,6 +612,37 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return true;
}
}
+ // Updating the memory intrinsics (memcpy/memmove/memset) that have an
+ // alignment parameter to embedding the alignment as an attribute of
+ // the pointer args.
+ if (Name.startswith("memcpy.") && F->arg_size() == 5) {
+ rename(F);
+ // Get the types of dest, src, and len
+ ArrayRef<Type *> ParamTypes = F->getFunctionType()->params().slice(0, 3);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::memcpy,
+ ParamTypes);
+ return true;
+ }
+ if (Name.startswith("memmove.") && F->arg_size() == 5) {
+ rename(F);
+ // Get the types of dest, src, and len
+ ArrayRef<Type *> ParamTypes = F->getFunctionType()->params().slice(0, 3);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::memmove,
+ ParamTypes);
+ return true;
+ }
+ if (Name.startswith("memset.") && F->arg_size() == 5) {
+ rename(F);
+ // Get the types of dest, and len
+ const auto *FT = F->getFunctionType();
+ Type *ParamTypes[2] = {
+ FT->getParamType(0), // Dest
+ FT->getParamType(2) // len
+ };
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::memset,
+ ParamTypes);
+ return true;
+ }
break;
}
case 'n': {
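For illustration, a minimal standalone sketch (plain C++, not part of the patch) of how the upgraded memory-intrinsic declarations above re-derive their overload types: the old five-operand form carries (dest, src/val, len, align, volatile), and the new declaration is keyed only on the pointer and length types, with the alignment expected to move onto the pointer arguments as attributes. The helper name and string-based type list are invented for the example.

#include <cassert>
#include <string>
#include <vector>

using TypeList = std::vector<std::string>;

static TypeList upgradedOverloadTypes(const std::string &Name,
                                      const TypeList &OldParams) {
  assert(OldParams.size() == 5 && "old form: dest, src/val, len, align, volatile");
  if (Name.rfind("memset.", 0) == 0)
    return {OldParams[0], OldParams[2]};              // dest, len
  return {OldParams[0], OldParams[1], OldParams[2]};  // dest, src, len
}

int main() {
  TypeList OldCpy = {"i8*", "i8*", "i64", "i32", "i1"};
  TypeList NewCpy = upgradedOverloadTypes("memcpy.p0i8.p0i8.i64", OldCpy);
  assert(NewCpy.size() == 3 && NewCpy[2] == "i64");
  TypeList NewSet = upgradedOverloadTypes("memset.p0i8.i64",
                                          {"i8*", "i8", "i64", "i32", "i1"});
  assert(NewSet.size() == 2 && NewSet[0] == "i8*");
  return 0;
}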
@@ -700,7 +826,7 @@ static Value *getX86MaskVec(IRBuilder<> &Builder, Value *Mask,
static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask,
Value *Op0, Value *Op1) {
- // If the mask is all ones just emit the align operation.
+ // If the mask is all ones just emit the first operation.
if (const auto *C = dyn_cast<Constant>(Mask))
if (C->isAllOnesValue())
return Op0;
@@ -709,6 +835,21 @@ static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask,
return Builder.CreateSelect(Mask, Op0, Op1);
}
+static Value *EmitX86ScalarSelect(IRBuilder<> &Builder, Value *Mask,
+ Value *Op0, Value *Op1) {
+ // If the mask is all ones just emit the first operation.
+ if (const auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Op0;
+
+ llvm::VectorType *MaskTy =
+ llvm::VectorType::get(Builder.getInt1Ty(),
+ Mask->getType()->getIntegerBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+ Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
+ return Builder.CreateSelect(Mask, Op0, Op1);
+}
+
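For illustration, a minimal standalone sketch (plain C++, not part of the patch) of the scalar-select semantics EmitX86ScalarSelect implements: the old scalar masked intrinsics carry an i8 mask of which only bit 0 matters, so apart from the all-ones shortcut the select reduces to a single-bit choice between the two scalar results. The helper name below is invented for the example.

#include <cassert>
#include <cstdint>

static double emitScalarSelect(uint8_t Mask, double Op0, double Op1) {
  if (Mask == 0xFF)               // all-ones mask: the first operand wins outright
    return Op0;
  return (Mask & 1) ? Op0 : Op1;  // otherwise only bit 0 participates
}

int main() {
  assert(emitScalarSelect(0x01, 2.0, 3.0) == 2.0);
  assert(emitScalarSelect(0x00, 2.0, 3.0) == 3.0);
  return 0;
}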
// Handle autoupgrade for masked PALIGNR and VALIGND/Q intrinsics.
// PALIGNR handles large immediates by shifting while VALIGN masks the immediate
// so we need to handle both cases. VALIGN also doesn't have 128-bit lanes.
@@ -825,12 +966,44 @@ static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI,
return Res;
}
+static Value *upgradePMULDQ(IRBuilder<> &Builder, CallInst &CI, bool IsSigned) {
+ Type *Ty = CI.getType();
+
+ // Arguments have a vXi32 type so cast to vXi64.
+ Value *LHS = Builder.CreateBitCast(CI.getArgOperand(0), Ty);
+ Value *RHS = Builder.CreateBitCast(CI.getArgOperand(1), Ty);
+
+ if (IsSigned) {
+ // Shift left then arithmetic shift right.
+ Constant *ShiftAmt = ConstantInt::get(Ty, 32);
+ LHS = Builder.CreateShl(LHS, ShiftAmt);
+ LHS = Builder.CreateAShr(LHS, ShiftAmt);
+ RHS = Builder.CreateShl(RHS, ShiftAmt);
+ RHS = Builder.CreateAShr(RHS, ShiftAmt);
+ } else {
+ // Clear the upper bits.
+ Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
+ LHS = Builder.CreateAnd(LHS, Mask);
+ RHS = Builder.CreateAnd(RHS, Mask);
+ }
+
+ Value *Res = Builder.CreateMul(LHS, RHS);
+
+ if (CI.getNumArgOperands() == 4)
+ Res = EmitX86Select(Builder, CI.getArgOperand(3), Res, CI.getArgOperand(2));
+
+ return Res;
+}
+
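For illustration, a minimal standalone sketch (plain C++, not part of the patch) of the per-lane arithmetic upgradePMULDQ emits: each 64-bit result lane is the product of the low 32 bits of the two inputs, sign-extended for pmuldq (shift left then arithmetic shift right) and zero-extended for pmuludq (mask off the upper bits). The helper name is invented for the example.

#include <cassert>
#include <cstdint>

static uint64_t pmuldqLane(uint64_t A, uint64_t B, bool IsSigned) {
  if (IsSigned) {
    // Shift left then arithmetic shift right == sign-extend the low 32 bits.
    int64_t SA = (int64_t)(A << 32) >> 32;
    int64_t SB = (int64_t)(B << 32) >> 32;
    return (uint64_t)(SA * SB);
  }
  // Clear the upper bits == zero-extend the low 32 bits.
  return (A & 0xffffffffULL) * (B & 0xffffffffULL);
}

int main() {
  assert(pmuldqLane(0xFFFFFFFFFFFFFFFFull, 2, /*IsSigned=*/true) ==
         (uint64_t)-2);             // low 32 bits are -1, times 2
  assert(pmuldqLane(0xFFFFFFFFull, 2, /*IsSigned=*/false) ==
         0x1FFFFFFFEull);           // 4294967295 * 2
  return 0;
}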
// Applying mask on vector of i1's and make sure result is at least 8 bits wide.
-static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder,Value *Vec, Value *Mask,
- unsigned NumElts) {
- const auto *C = dyn_cast<Constant>(Mask);
- if (!C || !C->isAllOnesValue())
- Vec = Builder.CreateAnd(Vec, getX86MaskVec(Builder, Mask, NumElts));
+static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec,
+ Value *Mask) {
+ unsigned NumElts = Vec->getType()->getVectorNumElements();
+ if (Mask) {
+ const auto *C = dyn_cast<Constant>(Mask);
+ if (!C || !C->isAllOnesValue())
+ Vec = Builder.CreateAnd(Vec, getX86MaskVec(Builder, Mask, NumElts));
+ }
if (NumElts < 8) {
uint32_t Indices[8];
@@ -871,14 +1044,13 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
Value *Mask = CI.getArgOperand(CI.getNumArgOperands() - 1);
- return ApplyX86MaskOn1BitsVec(Builder, Cmp, Mask, NumElts);
+ return ApplyX86MaskOn1BitsVec(Builder, Cmp, Mask);
}
// Replace a masked intrinsic with an older unmasked intrinsic.
static Value *UpgradeX86MaskedShift(IRBuilder<> &Builder, CallInst &CI,
Intrinsic::ID IID) {
- Function *F = CI.getCalledFunction();
- Function *Intrin = Intrinsic::getDeclaration(F->getParent(), IID);
+ Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID);
Value *Rep = Builder.CreateCall(Intrin,
{ CI.getArgOperand(0), CI.getArgOperand(1) });
return EmitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2));
@@ -907,6 +1079,321 @@ static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) {
return Builder.CreateSExt(Mask, ReturnOp, "vpmovm2");
}
+// Replace intrinsic with unmasked version and a select.
+static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
+ CallInst &CI, Value *&Rep) {
+ Name = Name.substr(12); // Remove avx512.mask.
+
+ unsigned VecWidth = CI.getType()->getPrimitiveSizeInBits();
+ unsigned EltWidth = CI.getType()->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (Name.startswith("max.p")) {
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_sse_max_ps;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_sse2_max_pd;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_max_ps_256;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_max_pd_256;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("min.p")) {
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_sse_min_ps;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_sse2_min_pd;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_min_ps_256;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_min_pd_256;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("pshuf.b.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_ssse3_pshuf_b_128;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_pshuf_b;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_pshuf_b_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("pmul.hr.sw.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_ssse3_pmul_hr_sw_128;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_pmul_hr_sw;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_pmul_hr_sw_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("pmulh.w.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_sse2_pmulh_w;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_pmulh_w;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_pmulh_w_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("pmulhu.w.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_sse2_pmulhu_w;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_pmulhu_w;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_pmulhu_w_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("pmaddw.d.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_sse2_pmadd_wd;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_pmadd_wd;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_pmaddw_d_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("pmaddubs.w.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_ssse3_pmadd_ub_sw_128;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_pmadd_ub_sw;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_pmaddubs_w_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("packsswb.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_sse2_packsswb_128;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_packsswb;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_packsswb_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("packssdw.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_sse2_packssdw_128;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_packssdw;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_packssdw_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("packuswb.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_sse2_packuswb_128;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_packuswb;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_packuswb_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("packusdw.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_sse41_packusdw;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx2_packusdw;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_packusdw_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("vpermilvar.")) {
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_vpermilvar_ps;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_vpermilvar_pd;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_vpermilvar_ps_256;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_vpermilvar_pd_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_vpermilvar_ps_512;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_vpermilvar_pd_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name == "cvtpd2dq.256") {
+ IID = Intrinsic::x86_avx_cvt_pd2dq_256;
+ } else if (Name == "cvtpd2ps.256") {
+ IID = Intrinsic::x86_avx_cvt_pd2_ps_256;
+ } else if (Name == "cvttpd2dq.256") {
+ IID = Intrinsic::x86_avx_cvtt_pd2dq_256;
+ } else if (Name == "cvttps2dq.128") {
+ IID = Intrinsic::x86_sse2_cvttps2dq;
+ } else if (Name == "cvttps2dq.256") {
+ IID = Intrinsic::x86_avx_cvtt_ps2dq_256;
+ } else if (Name.startswith("permvar.")) {
+ bool IsFloat = CI.getType()->isFPOrFPVectorTy();
+ if (VecWidth == 256 && EltWidth == 32 && IsFloat)
+ IID = Intrinsic::x86_avx2_permps;
+ else if (VecWidth == 256 && EltWidth == 32 && !IsFloat)
+ IID = Intrinsic::x86_avx2_permd;
+ else if (VecWidth == 256 && EltWidth == 64 && IsFloat)
+ IID = Intrinsic::x86_avx512_permvar_df_256;
+ else if (VecWidth == 256 && EltWidth == 64 && !IsFloat)
+ IID = Intrinsic::x86_avx512_permvar_di_256;
+ else if (VecWidth == 512 && EltWidth == 32 && IsFloat)
+ IID = Intrinsic::x86_avx512_permvar_sf_512;
+ else if (VecWidth == 512 && EltWidth == 32 && !IsFloat)
+ IID = Intrinsic::x86_avx512_permvar_si_512;
+ else if (VecWidth == 512 && EltWidth == 64 && IsFloat)
+ IID = Intrinsic::x86_avx512_permvar_df_512;
+ else if (VecWidth == 512 && EltWidth == 64 && !IsFloat)
+ IID = Intrinsic::x86_avx512_permvar_di_512;
+ else if (VecWidth == 128 && EltWidth == 16)
+ IID = Intrinsic::x86_avx512_permvar_hi_128;
+ else if (VecWidth == 256 && EltWidth == 16)
+ IID = Intrinsic::x86_avx512_permvar_hi_256;
+ else if (VecWidth == 512 && EltWidth == 16)
+ IID = Intrinsic::x86_avx512_permvar_hi_512;
+ else if (VecWidth == 128 && EltWidth == 8)
+ IID = Intrinsic::x86_avx512_permvar_qi_128;
+ else if (VecWidth == 256 && EltWidth == 8)
+ IID = Intrinsic::x86_avx512_permvar_qi_256;
+ else if (VecWidth == 512 && EltWidth == 8)
+ IID = Intrinsic::x86_avx512_permvar_qi_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("dbpsadbw.")) {
+ if (VecWidth == 128)
+ IID = Intrinsic::x86_avx512_dbpsadbw_128;
+ else if (VecWidth == 256)
+ IID = Intrinsic::x86_avx512_dbpsadbw_256;
+ else if (VecWidth == 512)
+ IID = Intrinsic::x86_avx512_dbpsadbw_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("vpshld.")) {
+ if (VecWidth == 128 && Name[7] == 'q')
+ IID = Intrinsic::x86_avx512_vpshld_q_128;
+ else if (VecWidth == 128 && Name[7] == 'd')
+ IID = Intrinsic::x86_avx512_vpshld_d_128;
+ else if (VecWidth == 128 && Name[7] == 'w')
+ IID = Intrinsic::x86_avx512_vpshld_w_128;
+ else if (VecWidth == 256 && Name[7] == 'q')
+ IID = Intrinsic::x86_avx512_vpshld_q_256;
+ else if (VecWidth == 256 && Name[7] == 'd')
+ IID = Intrinsic::x86_avx512_vpshld_d_256;
+ else if (VecWidth == 256 && Name[7] == 'w')
+ IID = Intrinsic::x86_avx512_vpshld_w_256;
+ else if (VecWidth == 512 && Name[7] == 'q')
+ IID = Intrinsic::x86_avx512_vpshld_q_512;
+ else if (VecWidth == 512 && Name[7] == 'd')
+ IID = Intrinsic::x86_avx512_vpshld_d_512;
+ else if (VecWidth == 512 && Name[7] == 'w')
+ IID = Intrinsic::x86_avx512_vpshld_w_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("vpshrd.")) {
+ if (VecWidth == 128 && Name[7] == 'q')
+ IID = Intrinsic::x86_avx512_vpshrd_q_128;
+ else if (VecWidth == 128 && Name[7] == 'd')
+ IID = Intrinsic::x86_avx512_vpshrd_d_128;
+ else if (VecWidth == 128 && Name[7] == 'w')
+ IID = Intrinsic::x86_avx512_vpshrd_w_128;
+ else if (VecWidth == 256 && Name[7] == 'q')
+ IID = Intrinsic::x86_avx512_vpshrd_q_256;
+ else if (VecWidth == 256 && Name[7] == 'd')
+ IID = Intrinsic::x86_avx512_vpshrd_d_256;
+ else if (VecWidth == 256 && Name[7] == 'w')
+ IID = Intrinsic::x86_avx512_vpshrd_w_256;
+ else if (VecWidth == 512 && Name[7] == 'q')
+ IID = Intrinsic::x86_avx512_vpshrd_q_512;
+ else if (VecWidth == 512 && Name[7] == 'd')
+ IID = Intrinsic::x86_avx512_vpshrd_d_512;
+ else if (VecWidth == 512 && Name[7] == 'w')
+ IID = Intrinsic::x86_avx512_vpshrd_w_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("prorv.")) {
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prorv_d_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prorv_d_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prorv_d_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prorv_q_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prorv_q_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prorv_q_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("prolv.")) {
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prolv_d_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prolv_d_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prolv_d_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prolv_q_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prolv_q_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prolv_q_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("pror.")) {
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pror_d_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pror_d_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pror_d_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pror_q_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pror_q_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pror_q_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else if (Name.startswith("prol.")) {
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prol_d_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prol_d_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_prol_d_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prol_q_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prol_q_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_prol_q_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+ } else
+ return false;
+
+ SmallVector<Value *, 4> Args(CI.arg_operands().begin(),
+ CI.arg_operands().end());
+ Args.pop_back();
+ Args.pop_back();
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID),
+ Args);
+ unsigned NumArgs = CI.getNumArgOperands();
+ Rep = EmitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep,
+ CI.getArgOperand(NumArgs - 2));
+ return true;
+}
+
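For illustration, a minimal standalone model (plain C++, not part of the patch) of the general shape of the mask-to-select rewrite above: the masked intrinsic's two trailing operands are treated as (passthru, mask), the unmasked operation runs on the remaining operands, and the mask then blends the result with the passthru lane by lane. The integer-vector types and helper names are invented for the example.

#include <cassert>
#include <cstdint>
#include <functional>
#include <vector>

using Vec = std::vector<int32_t>;

static Vec maskToSelect(const std::function<Vec(const Vec &, const Vec &)> &UnmaskedOp,
                        const Vec &A, const Vec &B, const Vec &Passthru,
                        uint32_t Mask) {
  Vec R = UnmaskedOp(A, B);            // call the unmasked form first
  for (size_t I = 0; I < R.size(); ++I)
    if (!((Mask >> I) & 1))            // lane not selected by the mask
      R[I] = Passthru[I];              // keep the passthru value instead
  return R;
}

int main() {
  auto Add = [](const Vec &X, const Vec &Y) {
    Vec R(X.size());
    for (size_t I = 0; I < X.size(); ++I) R[I] = X[I] + Y[I];
    return R;
  };
  Vec Out = maskToSelect(Add, {1, 2, 3, 4}, {10, 20, 30, 40},
                         {0, 0, 0, 0}, /*Mask=*/0b0101);
  assert((Out == Vec{11, 0, 33, 0}));
  return 0;
}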
+/// Upgrade comment in call to inline asm that represents an objc retain release
+/// marker.
+void llvm::UpgradeInlineAsmString(std::string *AsmStr) {
+ size_t Pos;
+ if (AsmStr->find("mov\tfp") == 0 &&
+ AsmStr->find("objc_retainAutoreleaseReturnValue") != std::string::npos &&
+ (Pos = AsmStr->find("# marker")) != std::string::npos) {
+ AsmStr->replace(Pos, 1, ";");
+ }
+ return;
+}
+
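For illustration, a minimal standalone sketch (plain C++, not part of the patch) of the string rewrite UpgradeInlineAsmString performs: when the asm string looks like the ObjC retainAutoreleaseReturnValue marker, the single '#' character that introduces the marker comment is replaced with ';'. The example marker text below is a plausible stand-in, not a string taken from the patch.

#include <cassert>
#include <string>

static void upgradeInlineAsmString(std::string &AsmStr) {
  size_t Pos;
  if (AsmStr.find("mov\tfp") == 0 &&
      AsmStr.find("objc_retainAutoreleaseReturnValue") != std::string::npos &&
      (Pos = AsmStr.find("# marker")) != std::string::npos)
    AsmStr.replace(Pos, 1, ";");   // swap only the '#' for ';'
}

int main() {
  std::string S = "mov\tfp, fp\t\t# marker for objc_retainAutoreleaseReturnValue";
  upgradeInlineAsmString(S);
  assert(S == "mov\tfp, fp\t\t; marker for objc_retainAutoreleaseReturnValue");
  return 0;
}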
/// Upgrade a call to an old intrinsic. All argument and return casting must be
/// provided to seamlessly integrate with existing context.
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -1015,6 +1502,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
return;
}
+ if (IsX86 && Name == "avx512.mask.store.ss") {
+ Value *Mask = Builder.CreateAnd(CI->getArgOperand(2), Builder.getInt8(1));
+ UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
+ Mask, false);
+
+ // Remove intrinsic.
+ CI->eraseFromParent();
+ return;
+ }
+
if (IsX86 && (Name.startswith("avx512.mask.store"))) {
// "avx512.mask.storeu." or "avx512.mask.store."
bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
@@ -1043,6 +1540,39 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
ExtTy->getPrimitiveSizeInBits();
Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
Rep = Builder.CreateVectorSplat(NumElts, Rep);
+ } else if (IsX86 && (Name == "sse.sqrt.ss" ||
+ Name == "sse2.sqrt.sd")) {
+ Value *Vec = CI->getArgOperand(0);
+ Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0);
+ Function *Intr = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::sqrt, Elt0->getType());
+ Elt0 = Builder.CreateCall(Intr, Elt0);
+ Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0);
+ } else if (IsX86 && (Name.startswith("avx.sqrt.p") ||
+ Name.startswith("sse2.sqrt.p") ||
+ Name.startswith("sse.sqrt.p"))) {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::sqrt,
+ CI->getType()),
+ {CI->getArgOperand(0)});
+ } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p"))) {
+ if (CI->getNumArgOperands() == 4 &&
+ (!isa<ConstantInt>(CI->getArgOperand(3)) ||
+ cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
+ Intrinsic::ID IID = Name[18] == 's' ? Intrinsic::x86_avx512_sqrt_ps_512
+ : Intrinsic::x86_avx512_sqrt_pd_512;
+
+ Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(3) };
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(),
+ IID), Args);
+ } else {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::sqrt,
+ CI->getType()),
+ {CI->getArgOperand(0)});
+ }
+ Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("avx512.ptestm") ||
Name.startswith("avx512.ptestnm"))) {
Value *Op0 = CI->getArgOperand(0);
@@ -1054,14 +1584,76 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
ICmpInst::Predicate Pred =
Name.startswith("avx512.ptestm") ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
Rep = Builder.CreateICmp(Pred, Rep, Zero);
- unsigned NumElts = Op0->getType()->getVectorNumElements();
- Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask, NumElts);
+ Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask);
} else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
unsigned NumElts =
CI->getArgOperand(1)->getType()->getVectorNumElements();
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
+ } else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
+ unsigned NumElts = CI->getType()->getScalarSizeInBits();
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts);
+ uint32_t Indices[64];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+
+ // First extract half of each vector. This gives better codegen than
+ // doing it in a single shuffle.
+ LHS = Builder.CreateShuffleVector(LHS, LHS,
+ makeArrayRef(Indices, NumElts / 2));
+ RHS = Builder.CreateShuffleVector(RHS, RHS,
+ makeArrayRef(Indices, NumElts / 2));
+ // Concat the vectors.
+ // NOTE: Operands have to be swapped to match intrinsic definition.
+ Rep = Builder.CreateShuffleVector(RHS, LHS,
+ makeArrayRef(Indices, NumElts));
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (IsX86 && Name == "avx512.kand.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateAnd(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (IsX86 && Name == "avx512.kandn.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ LHS = Builder.CreateNot(LHS);
+ Rep = Builder.CreateAnd(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (IsX86 && Name == "avx512.kor.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateOr(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (IsX86 && Name == "avx512.kxor.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateXor(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (IsX86 && Name == "avx512.kxnor.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ LHS = Builder.CreateNot(LHS);
+ Rep = Builder.CreateXor(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (IsX86 && Name == "avx512.knot.w") {
+ Rep = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Rep = Builder.CreateNot(Rep);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (IsX86 &&
+ (Name == "avx512.kortestz.w" || Name == "avx512.kortestc.w")) {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateOr(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, Builder.getInt16Ty());
+ Value *C;
+ if (Name[14] == 'c')
+ C = ConstantInt::getAllOnesValue(Builder.getInt16Ty());
+ else
+ C = ConstantInt::getNullValue(Builder.getInt16Ty());
+ Rep = Builder.CreateICmpEQ(Rep, C);
+ Rep = Builder.CreateZExt(Rep, Builder.getInt32Ty());
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
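For illustration, a minimal standalone sketch (plain C++, not part of the patch) of the mask-register logic the avx512.k* upgrades above expand to once a 16-bit mask is modelled as a plain integer: kand/kandn/kxnor are ordinary bitwise operations, and kortestz/kortestc test the OR of the two masks against zero and all-ones respectively. The helper names are invented for the example.

#include <cassert>
#include <cstdint>

static uint16_t kand(uint16_t A, uint16_t B)  { return A & B; }
static uint16_t kandn(uint16_t A, uint16_t B) { return (uint16_t)(~A) & B; }
static uint16_t kxnor(uint16_t A, uint16_t B) { return (uint16_t)~(A ^ B); }
static uint32_t kortestz(uint16_t A, uint16_t B) { return (uint16_t)(A | B) == 0x0000; }
static uint32_t kortestc(uint16_t A, uint16_t B) { return (uint16_t)(A | B) == 0xFFFF; }

int main() {
  assert(kandn(0x00FF, 0x0F0F) == 0x0F00);          // ~a & b
  assert(kxnor(0x1234, 0x1234) == 0xFFFF);          // equal inputs give all-ones
  assert(kortestz(0, 0) == 1 && kortestz(1, 0) == 0);
  assert(kortestc(0xFF00, 0x00FF) == 1);            // OR covers every bit
  return 0;
}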
@@ -1102,12 +1694,75 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
bool CmpEq = Name[16] == 'e';
Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 0 : 6, true);
- } else if (IsX86 && Name.startswith("avx512.mask.cmp")) {
+ } else if (IsX86 && Name.startswith("avx512.mask.fpclass.p")) {
+ Type *OpTy = CI->getArgOperand(0)->getType();
+ unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+ unsigned EltWidth = OpTy->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_fpclass_ps_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_fpclass_ps_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_fpclass_ps_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_fpclass_pd_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_fpclass_pd_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_fpclass_pd_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getOperand(0), CI->getArgOperand(1) });
+ Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.cmp.p")) {
+ Type *OpTy = CI->getArgOperand(0)->getType();
+ unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+ unsigned EltWidth = OpTy->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_cmp_ps_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_cmp_ps_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_cmp_ps_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_cmp_pd_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_cmp_pd_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_cmp_pd_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ SmallVector<Value *, 4> Args;
+ Args.push_back(CI->getArgOperand(0));
+ Args.push_back(CI->getArgOperand(1));
+ Args.push_back(CI->getArgOperand(2));
+ if (CI->getNumArgOperands() == 5)
+ Args.push_back(CI->getArgOperand(4));
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ Args);
+ Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(3));
+ } else if (IsX86 && Name.startswith("avx512.mask.cmp.") &&
+ Name[16] != 'p') {
+ // Integer compare intrinsics.
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
- } else if (IsX86 && Name.startswith("avx512.mask.ucmp")) {
+ } else if (IsX86 && Name.startswith("avx512.mask.ucmp.")) {
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
Rep = upgradeMaskedCompare(Builder, *CI, Imm, false);
+ } else if (IsX86 && (Name.startswith("avx512.cvtb2mask.") ||
+ Name.startswith("avx512.cvtw2mask.") ||
+ Name.startswith("avx512.cvtd2mask.") ||
+ Name.startswith("avx512.cvtq2mask."))) {
+ Value *Op = CI->getArgOperand(0);
+ Value *Zero = llvm::Constant::getNullValue(Op->getType());
+ Rep = Builder.CreateICmp(ICmpInst::ICMP_SLT, Op, Zero);
+ Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, nullptr);
} else if(IsX86 && (Name == "ssse3.pabs.b.128" ||
Name == "ssse3.pabs.w.128" ||
Name == "ssse3.pabs.d.128" ||
@@ -1138,35 +1793,67 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx2.pminu") ||
Name.startswith("avx512.mask.pminu"))) {
Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_ULT);
+ } else if (IsX86 && (Name == "sse2.pmulu.dq" ||
+ Name == "avx2.pmulu.dq" ||
+ Name == "avx512.pmulu.dq.512" ||
+ Name.startswith("avx512.mask.pmulu.dq."))) {
+ Rep = upgradePMULDQ(Builder, *CI, /*Signed*/false);
+ } else if (IsX86 && (Name == "sse41.pmuldq" ||
+ Name == "avx2.pmul.dq" ||
+ Name == "avx512.pmul.dq.512" ||
+ Name.startswith("avx512.mask.pmul.dq."))) {
+ Rep = upgradePMULDQ(Builder, *CI, /*Signed*/true);
+ } else if (IsX86 && (Name == "sse.cvtsi2ss" ||
+ Name == "sse2.cvtsi2sd" ||
+ Name == "sse.cvtsi642ss" ||
+ Name == "sse2.cvtsi642sd")) {
+ Rep = Builder.CreateSIToFP(CI->getArgOperand(1),
+ CI->getType()->getVectorElementType());
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+ } else if (IsX86 && Name == "avx512.cvtusi2sd") {
+ Rep = Builder.CreateUIToFP(CI->getArgOperand(1),
+ CI->getType()->getVectorElementType());
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+ } else if (IsX86 && Name == "sse2.cvtss2sd") {
+ Rep = Builder.CreateExtractElement(CI->getArgOperand(1), (uint64_t)0);
+ Rep = Builder.CreateFPExt(Rep, CI->getType()->getVectorElementType());
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
} else if (IsX86 && (Name == "sse2.cvtdq2pd" ||
- Name == "sse2.cvtps2pd" ||
+ Name == "sse2.cvtdq2ps" ||
Name == "avx.cvtdq2.pd.256" ||
- Name == "avx.cvt.ps2.pd.256" ||
+ Name == "avx.cvtdq2.ps.256" ||
Name.startswith("avx512.mask.cvtdq2pd.") ||
- Name.startswith("avx512.mask.cvtudq2pd."))) {
- // Lossless i32/float to double conversion.
- // Extract the bottom elements if necessary and convert to double vector.
- Value *Src = CI->getArgOperand(0);
- VectorType *SrcTy = cast<VectorType>(Src->getType());
- VectorType *DstTy = cast<VectorType>(CI->getType());
+ Name.startswith("avx512.mask.cvtudq2pd.") ||
+ Name == "avx512.mask.cvtdq2ps.128" ||
+ Name == "avx512.mask.cvtdq2ps.256" ||
+ Name == "avx512.mask.cvtudq2ps.128" ||
+ Name == "avx512.mask.cvtudq2ps.256" ||
+ Name == "avx512.mask.cvtqq2pd.128" ||
+ Name == "avx512.mask.cvtqq2pd.256" ||
+ Name == "avx512.mask.cvtuqq2pd.128" ||
+ Name == "avx512.mask.cvtuqq2pd.256" ||
+ Name == "sse2.cvtps2pd" ||
+ Name == "avx.cvt.ps2.pd.256" ||
+ Name == "avx512.mask.cvtps2pd.128" ||
+ Name == "avx512.mask.cvtps2pd.256")) {
+ Type *DstTy = CI->getType();
Rep = CI->getArgOperand(0);
- unsigned NumDstElts = DstTy->getNumElements();
- if (NumDstElts < SrcTy->getNumElements()) {
+ unsigned NumDstElts = DstTy->getVectorNumElements();
+ if (NumDstElts < Rep->getType()->getVectorNumElements()) {
assert(NumDstElts == 2 && "Unexpected vector size");
uint32_t ShuffleMask[2] = { 0, 1 };
- Rep = Builder.CreateShuffleVector(Rep, UndefValue::get(SrcTy),
- ShuffleMask);
+ Rep = Builder.CreateShuffleVector(Rep, Rep, ShuffleMask);
}
- bool SInt2Double = (StringRef::npos != Name.find("cvtdq2"));
- bool UInt2Double = (StringRef::npos != Name.find("cvtudq2"));
- if (SInt2Double)
- Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
- else if (UInt2Double)
- Rep = Builder.CreateUIToFP(Rep, DstTy, "cvtudq2pd");
- else
+ bool IsPS2PD = (StringRef::npos != Name.find("ps2"));
+ bool IsUnsigned = (StringRef::npos != Name.find("cvtu"));
+ if (IsPS2PD)
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
+ else if (IsUnsigned)
+ Rep = Builder.CreateUIToFP(Rep, DstTy, "cvt");
+ else
+ Rep = Builder.CreateSIToFP(Rep, DstTy, "cvt");
if (CI->getNumArgOperands() == 3)
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
@@ -1179,6 +1866,36 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0),
CI->getArgOperand(1),CI->getArgOperand(2),
/*Aligned*/true);
+ } else if (IsX86 && Name.startswith("avx512.mask.expand.load.")) {
+ Type *ResultTy = CI->getType();
+ Type *PtrTy = ResultTy->getVectorElementType();
+
+ // Cast the pointer to element type.
+ Value *Ptr = Builder.CreateBitCast(CI->getOperand(0),
+ llvm::PointerType::getUnqual(PtrTy));
+
+ Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2),
+ ResultTy->getVectorNumElements());
+
+ Function *ELd = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::masked_expandload,
+ ResultTy);
+ Rep = Builder.CreateCall(ELd, { Ptr, MaskVec, CI->getOperand(1) });
+ } else if (IsX86 && Name.startswith("avx512.mask.compress.store.")) {
+ Type *ResultTy = CI->getArgOperand(1)->getType();
+ Type *PtrTy = ResultTy->getVectorElementType();
+
+ // Cast the pointer to element type.
+ Value *Ptr = Builder.CreateBitCast(CI->getOperand(0),
+ llvm::PointerType::getUnqual(PtrTy));
+
+ Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2),
+ ResultTy->getVectorNumElements());
+
+ Function *CSt = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::masked_compressstore,
+ ResultTy);
+ Rep = Builder.CreateCall(CSt, { CI->getArgOperand(1), Ptr, MaskVec });
} else if (IsX86 && Name.startswith("xop.vpcom")) {
Intrinsic::ID intID;
if (Name.endswith("ub"))
@@ -1237,7 +1954,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)});
Rep = Builder.CreateZExt(Rep, CI->getType(), "");
- } else if (IsX86 && Name.startswith("avx.vbroadcast.s")) {
+ } else if (IsX86 && (Name.startswith("avx.vbroadcast.s") ||
+ Name.startswith("avx512.vbroadcast.s"))) {
// Replace broadcasts with a series of insertelements.
Type *VecTy = CI->getType();
Type *EltTy = VecTy->getVectorElementType();
@@ -1720,135 +2438,103 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateMul(CI->getArgOperand(0), CI->getArgOperand(1));
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
- } else if (IsX86 && (Name.startswith("avx512.mask.add.p"))) {
- Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
+ } else if (IsX86 && Name.startswith("avx512.mask.add.p")) {
+ if (Name.endswith(".512")) {
+ Intrinsic::ID IID;
+ if (Name[17] == 's')
+ IID = Intrinsic::x86_avx512_add_ps_512;
+ else
+ IID = Intrinsic::x86_avx512_add_pd_512;
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4) });
+ } else {
+ Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
+ }
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.mask.div.p")) {
- Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1));
+ if (Name.endswith(".512")) {
+ Intrinsic::ID IID;
+ if (Name[17] == 's')
+ IID = Intrinsic::x86_avx512_div_ps_512;
+ else
+ IID = Intrinsic::x86_avx512_div_pd_512;
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4) });
+ } else {
+ Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1));
+ }
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.mask.mul.p")) {
- Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1));
- Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
- CI->getArgOperand(2));
- } else if (IsX86 && Name.startswith("avx512.mask.sub.p")) {
- Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
- Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
- CI->getArgOperand(2));
- } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
- Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::ctlz,
- CI->getType()),
- { CI->getArgOperand(0), Builder.getInt1(false) });
- Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
- CI->getArgOperand(1));
- } else if (IsX86 && (Name.startswith("avx512.mask.max.p") ||
- Name.startswith("avx512.mask.min.p"))) {
- bool IsMin = Name[13] == 'i';
- VectorType *VecTy = cast<VectorType>(CI->getType());
- unsigned VecWidth = VecTy->getPrimitiveSizeInBits();
- unsigned EltWidth = VecTy->getScalarSizeInBits();
- Intrinsic::ID IID;
- if (!IsMin && VecWidth == 128 && EltWidth == 32)
- IID = Intrinsic::x86_sse_max_ps;
- else if (!IsMin && VecWidth == 128 && EltWidth == 64)
- IID = Intrinsic::x86_sse2_max_pd;
- else if (!IsMin && VecWidth == 256 && EltWidth == 32)
- IID = Intrinsic::x86_avx_max_ps_256;
- else if (!IsMin && VecWidth == 256 && EltWidth == 64)
- IID = Intrinsic::x86_avx_max_pd_256;
- else if (IsMin && VecWidth == 128 && EltWidth == 32)
- IID = Intrinsic::x86_sse_min_ps;
- else if (IsMin && VecWidth == 128 && EltWidth == 64)
- IID = Intrinsic::x86_sse2_min_pd;
- else if (IsMin && VecWidth == 256 && EltWidth == 32)
- IID = Intrinsic::x86_avx_min_ps_256;
- else if (IsMin && VecWidth == 256 && EltWidth == 64)
- IID = Intrinsic::x86_avx_min_pd_256;
- else
- llvm_unreachable("Unexpected intrinsic");
+ if (Name.endswith(".512")) {
+ Intrinsic::ID IID;
+ if (Name[17] == 's')
+ IID = Intrinsic::x86_avx512_mul_ps_512;
+ else
+ IID = Intrinsic::x86_avx512_mul_pd_512;
- Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
- { CI->getArgOperand(0), CI->getArgOperand(1) });
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4) });
+ } else {
+ Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1));
+ }
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
- } else if (IsX86 && Name.startswith("avx512.mask.pshuf.b.")) {
- VectorType *VecTy = cast<VectorType>(CI->getType());
- Intrinsic::ID IID;
- if (VecTy->getPrimitiveSizeInBits() == 128)
- IID = Intrinsic::x86_ssse3_pshuf_b_128;
- else if (VecTy->getPrimitiveSizeInBits() == 256)
- IID = Intrinsic::x86_avx2_pshuf_b;
- else if (VecTy->getPrimitiveSizeInBits() == 512)
- IID = Intrinsic::x86_avx512_pshuf_b_512;
- else
- llvm_unreachable("Unexpected intrinsic");
+ } else if (IsX86 && Name.startswith("avx512.mask.sub.p")) {
+ if (Name.endswith(".512")) {
+ Intrinsic::ID IID;
+ if (Name[17] == 's')
+ IID = Intrinsic::x86_avx512_sub_ps_512;
+ else
+ IID = Intrinsic::x86_avx512_sub_pd_512;
- Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
- { CI->getArgOperand(0), CI->getArgOperand(1) });
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4) });
+ } else {
+ Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
+ }
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
- } else if (IsX86 && (Name.startswith("avx512.mask.pmul.dq.") ||
- Name.startswith("avx512.mask.pmulu.dq."))) {
- bool IsUnsigned = Name[16] == 'u';
- VectorType *VecTy = cast<VectorType>(CI->getType());
+ } else if (IsX86 && Name.startswith("avx512.mask.max.p") &&
+ Name.drop_front(18) == ".512") {
Intrinsic::ID IID;
- if (!IsUnsigned && VecTy->getPrimitiveSizeInBits() == 128)
- IID = Intrinsic::x86_sse41_pmuldq;
- else if (!IsUnsigned && VecTy->getPrimitiveSizeInBits() == 256)
- IID = Intrinsic::x86_avx2_pmul_dq;
- else if (!IsUnsigned && VecTy->getPrimitiveSizeInBits() == 512)
- IID = Intrinsic::x86_avx512_pmul_dq_512;
- else if (IsUnsigned && VecTy->getPrimitiveSizeInBits() == 128)
- IID = Intrinsic::x86_sse2_pmulu_dq;
- else if (IsUnsigned && VecTy->getPrimitiveSizeInBits() == 256)
- IID = Intrinsic::x86_avx2_pmulu_dq;
- else if (IsUnsigned && VecTy->getPrimitiveSizeInBits() == 512)
- IID = Intrinsic::x86_avx512_pmulu_dq_512;
+ if (Name[17] == 's')
+ IID = Intrinsic::x86_avx512_max_ps_512;
else
- llvm_unreachable("Unexpected intrinsic");
+ IID = Intrinsic::x86_avx512_max_pd_512;
Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
- { CI->getArgOperand(0), CI->getArgOperand(1) });
+ { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4) });
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
- } else if (IsX86 && Name.startswith("avx512.mask.pack")) {
- bool IsUnsigned = Name[16] == 'u';
- bool IsDW = Name[18] == 'd';
- VectorType *VecTy = cast<VectorType>(CI->getType());
+ } else if (IsX86 && Name.startswith("avx512.mask.min.p") &&
+ Name.drop_front(18) == ".512") {
Intrinsic::ID IID;
- if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
- IID = Intrinsic::x86_sse2_packsswb_128;
- else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
- IID = Intrinsic::x86_avx2_packsswb;
- else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
- IID = Intrinsic::x86_avx512_packsswb_512;
- else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
- IID = Intrinsic::x86_sse2_packssdw_128;
- else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
- IID = Intrinsic::x86_avx2_packssdw;
- else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
- IID = Intrinsic::x86_avx512_packssdw_512;
- else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
- IID = Intrinsic::x86_sse2_packuswb_128;
- else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
- IID = Intrinsic::x86_avx2_packuswb;
- else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
- IID = Intrinsic::x86_avx512_packuswb_512;
- else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
- IID = Intrinsic::x86_sse41_packusdw;
- else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
- IID = Intrinsic::x86_avx2_packusdw;
- else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
- IID = Intrinsic::x86_avx512_packusdw_512;
+ if (Name[17] == 's')
+ IID = Intrinsic::x86_avx512_min_ps_512;
else
- llvm_unreachable("Unexpected intrinsic");
+ IID = Intrinsic::x86_avx512_min_pd_512;
Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
- { CI->getArgOperand(0), CI->getArgOperand(1) });
+ { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4) });
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::ctlz,
+ CI->getType()),
+ { CI->getArgOperand(0), Builder.getInt1(false) });
+ Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
} else if (IsX86 && Name.startswith("avx512.mask.psll")) {
bool IsImmediate = Name[16] == 'i' ||
(Name.size() > 18 && Name[18] == 'i');
@@ -2055,28 +2741,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = upgradeMaskedMove(Builder, *CI);
} else if (IsX86 && Name.startswith("avx512.cvtmask2")) {
Rep = UpgradeMaskToInt(Builder, *CI);
- } else if (IsX86 && Name.startswith("avx512.mask.vpermilvar.")) {
- Intrinsic::ID IID;
- if (Name.endswith("ps.128"))
- IID = Intrinsic::x86_avx_vpermilvar_ps;
- else if (Name.endswith("pd.128"))
- IID = Intrinsic::x86_avx_vpermilvar_pd;
- else if (Name.endswith("ps.256"))
- IID = Intrinsic::x86_avx_vpermilvar_ps_256;
- else if (Name.endswith("pd.256"))
- IID = Intrinsic::x86_avx_vpermilvar_pd_256;
- else if (Name.endswith("ps.512"))
- IID = Intrinsic::x86_avx512_vpermilvar_ps_512;
- else if (Name.endswith("pd.512"))
- IID = Intrinsic::x86_avx512_vpermilvar_pd_512;
- else
- llvm_unreachable("Unexpected vpermilvar intrinsic");
-
- Function *Intrin = Intrinsic::getDeclaration(F->getParent(), IID);
- Rep = Builder.CreateCall(Intrin,
- { CI->getArgOperand(0), CI->getArgOperand(1) });
- Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
- CI->getArgOperand(2));
} else if (IsX86 && Name.endswith(".movntdqa")) {
Module *M = F->getParent();
MDNode *Node = MDNode::get(
@@ -2110,6 +2774,416 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
}
+ } else if (IsX86 && (Name.startswith("fma.vfmadd.") ||
+ Name.startswith("fma.vfmsub.") ||
+ Name.startswith("fma.vfnmadd.") ||
+ Name.startswith("fma.vfnmsub."))) {
+ bool NegMul = Name[6] == 'n';
+ bool NegAcc = NegMul ? Name[8] == 's' : Name[7] == 's';
+ bool IsScalar = NegMul ? Name[12] == 's' : Name[11] == 's';
+
+ Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+
+ if (IsScalar) {
+ Ops[0] = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
+ Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+ Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+ }
+
+ if (NegMul && !IsScalar)
+ Ops[0] = Builder.CreateFNeg(Ops[0]);
+ if (NegMul && IsScalar)
+ Ops[1] = Builder.CreateFNeg(Ops[1]);
+ if (NegAcc)
+ Ops[2] = Builder.CreateFNeg(Ops[2]);
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::fma,
+ Ops[0]->getType()),
+ Ops);
+
+ if (IsScalar)
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep,
+ (uint64_t)0);
+ } else if (IsX86 && Name.startswith("fma4.vfmadd.s")) {
+ Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+
+ Ops[0] = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
+ Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+ Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::fma,
+ Ops[0]->getType()),
+ Ops);
+
+ Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()),
+ Rep, (uint64_t)0);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.s") ||
+ Name.startswith("avx512.maskz.vfmadd.s") ||
+ Name.startswith("avx512.mask3.vfmadd.s") ||
+ Name.startswith("avx512.mask3.vfmsub.s") ||
+ Name.startswith("avx512.mask3.vfnmsub.s"))) {
+ bool IsMask3 = Name[11] == '3';
+ bool IsMaskZ = Name[11] == 'z';
+ // Drop the "avx512.mask." to make it easier.
+ Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12);
+ bool NegMul = Name[2] == 'n';
+ bool NegAcc = NegMul ? Name[4] == 's' : Name[3] == 's';
+
+ Value *A = CI->getArgOperand(0);
+ Value *B = CI->getArgOperand(1);
+ Value *C = CI->getArgOperand(2);
+
+ if (NegMul && (IsMask3 || IsMaskZ))
+ A = Builder.CreateFNeg(A);
+ if (NegMul && !(IsMask3 || IsMaskZ))
+ B = Builder.CreateFNeg(B);
+ if (NegAcc)
+ C = Builder.CreateFNeg(C);
+
+ A = Builder.CreateExtractElement(A, (uint64_t)0);
+ B = Builder.CreateExtractElement(B, (uint64_t)0);
+ C = Builder.CreateExtractElement(C, (uint64_t)0);
+
+ if (!isa<ConstantInt>(CI->getArgOperand(4)) ||
+ cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4) {
+ Value *Ops[] = { A, B, C, CI->getArgOperand(4) };
+
+ Intrinsic::ID IID;
+ if (Name.back() == 'd')
+ IID = Intrinsic::x86_avx512_vfmadd_f64;
+ else
+ IID = Intrinsic::x86_avx512_vfmadd_f32;
+ Function *FMA = Intrinsic::getDeclaration(CI->getModule(), IID);
+ Rep = Builder.CreateCall(FMA, Ops);
+ } else {
+ Function *FMA = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::fma,
+ A->getType());
+ Rep = Builder.CreateCall(FMA, { A, B, C });
+ }
+
+ Value *PassThru = IsMaskZ ? Constant::getNullValue(Rep->getType()) :
+ IsMask3 ? C : A;
+
+ // For Mask3 with NegAcc, we need to create a new extractelement that
+ // avoids the negation above.
+ if (NegAcc && IsMask3)
+ PassThru = Builder.CreateExtractElement(CI->getArgOperand(2),
+ (uint64_t)0);
+
+ Rep = EmitX86ScalarSelect(Builder, CI->getArgOperand(3),
+ Rep, PassThru);
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(IsMask3 ? 2 : 0),
+ Rep, (uint64_t)0);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.p") ||
+ Name.startswith("avx512.mask.vfnmadd.p") ||
+ Name.startswith("avx512.mask.vfnmsub.p") ||
+ Name.startswith("avx512.mask3.vfmadd.p") ||
+ Name.startswith("avx512.mask3.vfmsub.p") ||
+ Name.startswith("avx512.mask3.vfnmsub.p") ||
+ Name.startswith("avx512.maskz.vfmadd.p"))) {
+ bool IsMask3 = Name[11] == '3';
+ bool IsMaskZ = Name[11] == 'z';
+ // Drop the "avx512.mask." to make it easier.
+ Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12);
+ bool NegMul = Name[2] == 'n';
+ bool NegAcc = NegMul ? Name[4] == 's' : Name[3] == 's';
+
+ Value *A = CI->getArgOperand(0);
+ Value *B = CI->getArgOperand(1);
+ Value *C = CI->getArgOperand(2);
+
+ if (NegMul && (IsMask3 || IsMaskZ))
+ A = Builder.CreateFNeg(A);
+ if (NegMul && !(IsMask3 || IsMaskZ))
+ B = Builder.CreateFNeg(B);
+ if (NegAcc)
+ C = Builder.CreateFNeg(C);
+
+ if (CI->getNumArgOperands() == 5 &&
+ (!isa<ConstantInt>(CI->getArgOperand(4)) ||
+ cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4)) {
+ Intrinsic::ID IID;
+ // Check the character before ".512" in string.
+ if (Name[Name.size()-5] == 's')
+ IID = Intrinsic::x86_avx512_vfmadd_ps_512;
+ else
+ IID = Intrinsic::x86_avx512_vfmadd_pd_512;
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { A, B, C, CI->getArgOperand(4) });
+ } else {
+ Function *FMA = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::fma,
+ A->getType());
+ Rep = Builder.CreateCall(FMA, { A, B, C });
+ }
+
+ Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) :
+ IsMask3 ? CI->getArgOperand(2) :
+ CI->getArgOperand(0);
+
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("fma.vfmaddsub.p") ||
+ Name.startswith("fma.vfmsubadd.p"))) {
+ bool IsSubAdd = Name[7] == 's';
+ int NumElts = CI->getType()->getVectorNumElements();
+
+ Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+
+ Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma,
+ Ops[0]->getType());
+ Value *Odd = Builder.CreateCall(FMA, Ops);
+ Ops[2] = Builder.CreateFNeg(Ops[2]);
+ Value *Even = Builder.CreateCall(FMA, Ops);
+
+ if (IsSubAdd)
+ std::swap(Even, Odd);
+
+ SmallVector<uint32_t, 32> Idxs(NumElts);
+ for (int i = 0; i != NumElts; ++i)
+ Idxs[i] = i + (i % 2) * NumElts;
+
+ Rep = Builder.CreateShuffleVector(Even, Odd, Idxs);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vfmaddsub.p") ||
+ Name.startswith("avx512.mask3.vfmaddsub.p") ||
+ Name.startswith("avx512.maskz.vfmaddsub.p") ||
+ Name.startswith("avx512.mask3.vfmsubadd.p"))) {
+ bool IsMask3 = Name[11] == '3';
+ bool IsMaskZ = Name[11] == 'z';
+ // Drop the "avx512.mask." to make it easier.
+ Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12);
+ bool IsSubAdd = Name[3] == 's';
+ if (CI->getNumArgOperands() == 5 &&
+ (!isa<ConstantInt>(CI->getArgOperand(4)) ||
+ cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4)) {
+ Intrinsic::ID IID;
+ // Check the character before ".512" in string.
+ if (Name[Name.size()-5] == 's')
+ IID = Intrinsic::x86_avx512_vfmaddsub_ps_512;
+ else
+ IID = Intrinsic::x86_avx512_vfmaddsub_pd_512;
+
+ Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(4) };
+ if (IsSubAdd)
+ Ops[2] = Builder.CreateFNeg(Ops[2]);
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ Ops);
+ } else {
+ int NumElts = CI->getType()->getVectorNumElements();
+
+ Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+
+ Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma,
+ Ops[0]->getType());
+ Value *Odd = Builder.CreateCall(FMA, Ops);
+ Ops[2] = Builder.CreateFNeg(Ops[2]);
+ Value *Even = Builder.CreateCall(FMA, Ops);
+
+ if (IsSubAdd)
+ std::swap(Even, Odd);
+
+ SmallVector<uint32_t, 32> Idxs(NumElts);
+ for (int i = 0; i != NumElts; ++i)
+ Idxs[i] = i + (i % 2) * NumElts;
+
+ Rep = Builder.CreateShuffleVector(Even, Odd, Idxs);
+ }
+
+ Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) :
+ IsMask3 ? CI->getArgOperand(2) :
+ CI->getArgOperand(0);
+
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("avx512.mask.pternlog.") ||
+ Name.startswith("avx512.maskz.pternlog."))) {
+ bool ZeroMask = Name[11] == 'z';
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ unsigned EltWidth = CI->getType()->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pternlog_d_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pternlog_d_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pternlog_d_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pternlog_q_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pternlog_q_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pternlog_q_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3) };
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+ : CI->getArgOperand(0);
+ Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vpmadd52") ||
+ Name.startswith("avx512.maskz.vpmadd52"))) {
+ bool ZeroMask = Name[11] == 'z';
+ bool High = Name[20] == 'h' || Name[21] == 'h';
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_128;
+ else if (VecWidth == 256 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_256;
+ else if (VecWidth == 512 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_512;
+ else if (VecWidth == 128 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_128;
+ else if (VecWidth == 256 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_256;
+ else if (VecWidth == 512 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+ : CI->getArgOperand(0);
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vpermi2var.") ||
+ Name.startswith("avx512.mask.vpermt2var.") ||
+ Name.startswith("avx512.maskz.vpermt2var."))) {
+ bool ZeroMask = Name[11] == 'z';
+ bool IndexForm = Name[17] == 'i';
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ unsigned EltWidth = CI->getType()->getScalarSizeInBits();
+ bool IsFloat = CI->getType()->isFPOrFPVectorTy();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32 && IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_ps_128;
+ else if (VecWidth == 128 && EltWidth == 32 && !IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_d_128;
+ else if (VecWidth == 128 && EltWidth == 64 && IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_pd_128;
+ else if (VecWidth == 128 && EltWidth == 64 && !IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_q_128;
+ else if (VecWidth == 256 && EltWidth == 32 && IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_ps_256;
+ else if (VecWidth == 256 && EltWidth == 32 && !IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_d_256;
+ else if (VecWidth == 256 && EltWidth == 64 && IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_pd_256;
+ else if (VecWidth == 256 && EltWidth == 64 && !IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_q_256;
+ else if (VecWidth == 512 && EltWidth == 32 && IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_ps_512;
+ else if (VecWidth == 512 && EltWidth == 32 && !IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_d_512;
+ else if (VecWidth == 512 && EltWidth == 64 && IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_pd_512;
+ else if (VecWidth == 512 && EltWidth == 64 && !IsFloat)
+ IID = Intrinsic::x86_avx512_vpermi2var_q_512;
+ else if (VecWidth == 128 && EltWidth == 16)
+ IID = Intrinsic::x86_avx512_vpermi2var_hi_128;
+ else if (VecWidth == 256 && EltWidth == 16)
+ IID = Intrinsic::x86_avx512_vpermi2var_hi_256;
+ else if (VecWidth == 512 && EltWidth == 16)
+ IID = Intrinsic::x86_avx512_vpermi2var_hi_512;
+ else if (VecWidth == 128 && EltWidth == 8)
+ IID = Intrinsic::x86_avx512_vpermi2var_qi_128;
+ else if (VecWidth == 256 && EltWidth == 8)
+ IID = Intrinsic::x86_avx512_vpermi2var_qi_256;
+ else if (VecWidth == 512 && EltWidth == 8)
+ IID = Intrinsic::x86_avx512_vpermi2var_qi_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+
+ // If this isn't index form we need to swap operand 0 and 1.
+ if (!IndexForm)
+ std::swap(Args[0], Args[1]);
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+ : Builder.CreateBitCast(CI->getArgOperand(1),
+ CI->getType());
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vpdpbusd.") ||
+ Name.startswith("avx512.maskz.vpdpbusd.") ||
+ Name.startswith("avx512.mask.vpdpbusds.") ||
+ Name.startswith("avx512.maskz.vpdpbusds."))) {
+ bool ZeroMask = Name[11] == 'z';
+ bool IsSaturating = Name[ZeroMask ? 21 : 20] == 's';
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && !IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpbusd_128;
+ else if (VecWidth == 256 && !IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpbusd_256;
+ else if (VecWidth == 512 && !IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpbusd_512;
+ else if (VecWidth == 128 && IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpbusds_128;
+ else if (VecWidth == 256 && IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpbusds_256;
+ else if (VecWidth == 512 && IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpbusds_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+ : CI->getArgOperand(0);
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vpdpwssd.") ||
+ Name.startswith("avx512.maskz.vpdpwssd.") ||
+ Name.startswith("avx512.mask.vpdpwssds.") ||
+ Name.startswith("avx512.maskz.vpdpwssds."))) {
+ bool ZeroMask = Name[11] == 'z';
+ bool IsSaturating = Name[ZeroMask ? 21 : 20] == 's';
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && !IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpwssd_128;
+ else if (VecWidth == 256 && !IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpwssd_256;
+ else if (VecWidth == 512 && !IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpwssd_512;
+ else if (VecWidth == 128 && IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpwssds_128;
+ else if (VecWidth == 256 && IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpwssds_256;
+ else if (VecWidth == 512 && IsSaturating)
+ IID = Intrinsic::x86_avx512_vpdpwssds_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+ : CI->getArgOperand(0);
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+ } else if (IsX86 && Name.startswith("avx512.mask.") &&
+ upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
+ // Rep will be updated by the call in the condition.
} else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
Value *Arg = CI->getArgOperand(0);
Value *Neg = Builder.CreateNeg(Arg, "neg");
@@ -2164,14 +3238,17 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
return;
}
- CallInst *NewCall = nullptr;
- switch (NewFn->getIntrinsicID()) {
- default: {
+ const auto &DefaultCase = [&NewFn, &CI]() -> void {
// Handle generic mangling change, but nothing else
assert(
(CI->getCalledFunction()->getName() != NewFn->getName()) &&
"Unknown function for CallInst upgrade and isn't just a name change");
CI->setCalledFunction(NewFn);
+ };
+ CallInst *NewCall = nullptr;
+ switch (NewFn->getIntrinsicID()) {
+ default: {
+ DefaultCase();
return;
}
@@ -2312,6 +3389,35 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
NewCall = Builder.CreateCall(NewFn, Args);
break;
}
+
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset: {
+ // We have to make sure that the call signature is what we're expecting.
+ // We only want to change the old signatures by removing the alignment arg:
+ // @llvm.mem[cpy|move]...(i8*, i8*, i[32|64], i32, i1)
+ // -> @llvm.mem[cpy|move]...(i8*, i8*, i[32|64], i1)
+ // @llvm.memset...(i8*, i8, i[32|64], i32, i1)
+ // -> @llvm.memset...(i8*, i8, i[32|64], i1)
+ // Note: i8*'s in the above can be any pointer type
+ if (CI->getNumArgOperands() != 5) {
+ DefaultCase();
+ return;
+ }
+ // Remove alignment argument (3), and add alignment attributes to the
+ // dest/src pointers.
+ Value *Args[4] = {CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(4)};
+ NewCall = Builder.CreateCall(NewFn, Args);
+ auto *MemCI = cast<MemIntrinsic>(NewCall);
+ // All mem intrinsics support dest alignment.
+ const ConstantInt *Align = cast<ConstantInt>(CI->getArgOperand(3));
+ MemCI->setDestAlignment(Align->getZExtValue());
+ // Memcpy/Memmove also support source alignment.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MemCI))
+ MTI->setSourceAlignment(Align->getZExtValue());
+ break;
+ }
}
assert(NewCall && "Should have either set this variable or returned through "
"the default case");
@@ -2432,6 +3538,30 @@ bool llvm::UpgradeDebugInfo(Module &M) {
return Modified;
}
+bool llvm::UpgradeRetainReleaseMarker(Module &M) {
+ bool Changed = false;
+ NamedMDNode *ModRetainReleaseMarker =
+ M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker");
+ if (ModRetainReleaseMarker) {
+ MDNode *Op = ModRetainReleaseMarker->getOperand(0);
+ if (Op) {
+ MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(0));
+ if (ID) {
+ SmallVector<StringRef, 4> ValueComp;
+ ID->getString().split(ValueComp, "#");
+ if (ValueComp.size() == 2) {
+ std::string NewValue = ValueComp[0].str() + ";" + ValueComp[1].str();
+ Metadata *Ops[1] = {MDString::get(M.getContext(), NewValue)};
+ ModRetainReleaseMarker->setOperand(0,
+ MDNode::get(M.getContext(), Ops));
+ Changed = true;
+ }
+ }
+ }
+ }
+ return Changed;
+}
+
bool llvm::UpgradeModuleFlags(Module &M) {
NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
if (!ModFlags)
diff --git a/contrib/llvm/lib/IR/BasicBlock.cpp b/contrib/llvm/lib/IR/BasicBlock.cpp
index 938c40182b92..7c3e5862d1cd 100644
--- a/contrib/llvm/lib/IR/BasicBlock.cpp
+++ b/contrib/llvm/lib/IR/BasicBlock.cpp
@@ -90,6 +90,24 @@ void BasicBlock::setParent(Function *parent) {
InstList.setSymTabObject(&Parent, parent);
}
+iterator_range<filter_iterator<BasicBlock::const_iterator,
+ std::function<bool(const Instruction &)>>>
+BasicBlock::instructionsWithoutDebug() const {
+ std::function<bool(const Instruction &)> Fn = [](const Instruction &I) {
+ return !isa<DbgInfoIntrinsic>(I);
+ };
+ return make_filter_range(*this, Fn);
+}
+
+iterator_range<filter_iterator<BasicBlock::iterator,
+ std::function<bool(Instruction &)>>>
+BasicBlock::instructionsWithoutDebug() {
+ std::function<bool(Instruction &)> Fn = [](Instruction &I) {
+ return !isa<DbgInfoIntrinsic>(I);
+ };
+ return make_filter_range(*this, Fn);
+}
+
void BasicBlock::removeFromParent() {
getParent()->getBasicBlockList().remove(getIterator());
}
@@ -461,3 +479,9 @@ Optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
}
return Optional<uint64_t>();
}
+
+BasicBlock::iterator llvm::skipDebugIntrinsics(BasicBlock::iterator It) {
+ while (isa<DbgInfoIntrinsic>(It))
+ ++It;
+ return It;
+}
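
The two helpers added to BasicBlock above, instructionsWithoutDebug() and skipDebugIntrinsics(), are the only new public entry points in this hunk. A minimal sketch of how client code might use them; the function names and counting logic below are illustrative only and are not part of the patch:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instruction.h"
    #include <iterator>

    using namespace llvm;

    // Count the "real" instructions in a function; llvm.dbg.* intrinsics are
    // filtered out by instructionsWithoutDebug().
    static unsigned countNonDebugInstructions(const Function &F) {
      unsigned Count = 0;
      for (const BasicBlock &BB : F) {
        auto R = BB.instructionsWithoutDebug();
        Count += std::distance(R.begin(), R.end());
      }
      return Count;
    }

    // First non-debug instruction of a block. A well-formed block always ends
    // in a terminator, which is not a debug intrinsic, so skipDebugIntrinsics()
    // cannot walk past the end.
    static Instruction &firstRealInstruction(BasicBlock &BB) {
      return *skipDebugIntrinsics(BB.begin());
    }
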
diff --git a/contrib/llvm/lib/IR/Comdat.cpp b/contrib/llvm/lib/IR/Comdat.cpp
index c735f9b2eb1e..3b1f7d62cdae 100644
--- a/contrib/llvm/lib/IR/Comdat.cpp
+++ b/contrib/llvm/lib/IR/Comdat.cpp
@@ -7,13 +7,16 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the Comdat class.
+// This file implements the Comdat class (including the C bindings).
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Comdat.h"
+#include "llvm-c/Comdat.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/Module.h"
using namespace llvm;
@@ -22,3 +25,54 @@ Comdat::Comdat(Comdat &&C) : Name(C.Name), SK(C.SK) {}
Comdat::Comdat() = default;
StringRef Comdat::getName() const { return Name->first(); }
+
+LLVMComdatRef LLVMGetOrInsertComdat(LLVMModuleRef M, const char *Name) {
+ return wrap(unwrap(M)->getOrInsertComdat(Name));
+}
+
+LLVMComdatRef LLVMGetComdat(LLVMValueRef V) {
+ GlobalObject *G = unwrap<GlobalObject>(V);
+ return wrap(G->getComdat());
+}
+
+void LLVMSetComdat(LLVMValueRef V, LLVMComdatRef C) {
+ GlobalObject *G = unwrap<GlobalObject>(V);
+ G->setComdat(unwrap(C));
+}
+
+LLVMComdatSelectionKind LLVMGetComdatSelectionKind(LLVMComdatRef C) {
+ switch (unwrap(C)->getSelectionKind()) {
+ case Comdat::Any:
+ return LLVMAnyComdatSelectionKind;
+ case Comdat::ExactMatch:
+ return LLVMExactMatchComdatSelectionKind;
+ case Comdat::Largest:
+ return LLVMLargestComdatSelectionKind;
+ case Comdat::NoDuplicates:
+ return LLVMNoDuplicatesComdatSelectionKind;
+ case Comdat::SameSize:
+ return LLVMSameSizeComdatSelectionKind;
+ }
+ llvm_unreachable("Invalid Comdat SelectionKind!");
+}
+
+void LLVMSetComdatSelectionKind(LLVMComdatRef C, LLVMComdatSelectionKind kind) {
+ Comdat *Cd = unwrap(C);
+ switch (kind) {
+ case LLVMAnyComdatSelectionKind:
+ Cd->setSelectionKind(Comdat::Any);
+ break;
+ case LLVMExactMatchComdatSelectionKind:
+ Cd->setSelectionKind(Comdat::ExactMatch);
+ break;
+ case LLVMLargestComdatSelectionKind:
+ Cd->setSelectionKind(Comdat::Largest);
+ break;
+ case LLVMNoDuplicatesComdatSelectionKind:
+ Cd->setSelectionKind(Comdat::NoDuplicates);
+ break;
+ case LLVMSameSizeComdatSelectionKind:
+ Cd->setSelectionKind(Comdat::SameSize);
+ break;
+ }
+}
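
The Comdat changes above are mostly new C bindings. A hedged usage sketch, valid as C or C++; the module and symbol names "m" and "g" are placeholders, not taken from the patch:

    #include "llvm-c/Comdat.h"
    #include "llvm-c/Core.h"

    /* Create a global, give it an "any"-selection comdat, and read the kind
       back through the new accessors. */
    static void attachComdat(void) {
      LLVMModuleRef M = LLVMModuleCreateWithName("m");
      LLVMValueRef G = LLVMAddGlobal(M, LLVMInt32Type(), "g");

      LLVMComdatRef C = LLVMGetOrInsertComdat(M, "g");
      LLVMSetComdatSelectionKind(C, LLVMAnyComdatSelectionKind);
      LLVMSetComdat(G, C);

      LLVMComdatSelectionKind Kind = LLVMGetComdatSelectionKind(LLVMGetComdat(G));
      (void)Kind; /* expected: LLVMAnyComdatSelectionKind */

      LLVMDisposeModule(M);
    }
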
diff --git a/contrib/llvm/lib/IR/ConstantFold.cpp b/contrib/llvm/lib/IR/ConstantFold.cpp
index adb6724fc9c0..90a8366d1696 100644
--- a/contrib/llvm/lib/IR/ConstantFold.cpp
+++ b/contrib/llvm/lib/IR/ConstantFold.cpp
@@ -71,7 +71,7 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
/// This function determines which opcode to use to fold two constant cast
/// expressions together. It uses CastInst::isEliminableCastPair to determine
/// the opcode. Consequently its just a wrapper around that function.
-/// @brief Determine if it is valid to fold a cast of a cast
+/// Determine if it is valid to fold a cast of a cast
static unsigned
foldConstantCastPair(
unsigned opc, ///< opcode of the second cast constant expression
@@ -321,7 +321,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
if (ByteStart == 0 && ByteSize*8 == SrcBitSize)
return CE->getOperand(0);
- // If extracting something completely in the input, if if the input is a
+ // If extracting something completely in the input, if the input is a
// multiple of 8 bits, recurse.
if ((SrcBitSize&7) == 0 && (ByteStart+ByteSize)*8 <= SrcBitSize)
return ExtractConstantBytes(CE->getOperand(0), ByteStart, ByteSize);
@@ -545,7 +545,11 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
opc != Instruction::AddrSpaceCast &&
// Do not fold bitcast (gep) with inrange index, as this loses
// information.
- !cast<GEPOperator>(CE)->getInRangeIndex().hasValue()) {
+ !cast<GEPOperator>(CE)->getInRangeIndex().hasValue() &&
+ // Do not fold if the gep type is a vector, as bitcasting
+ // operand 0 of a vector gep will result in a bitcast between
+ // different sizes.
+ !CE->getType()->isVectorTy()) {
// If all of the indexes in the GEP are null values, there is no pointer
// adjustment going on. We might as well cast the source pointer.
bool isAllNull = true;
@@ -678,13 +682,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
const APInt &api = CI->getValue();
APFloat apf(DestTy->getFltSemantics(),
APInt::getNullValue(DestTy->getPrimitiveSizeInBits()));
- if (APFloat::opOverflow &
- apf.convertFromAPInt(api, opc==Instruction::SIToFP,
- APFloat::rmNearestTiesToEven)) {
- // Undefined behavior invoked - the destination type can't represent
- // the input constant.
- return UndefValue::get(DestTy);
- }
+ apf.convertFromAPInt(api, opc==Instruction::SIToFP,
+ APFloat::rmNearestTiesToEven);
return ConstantFP::get(V->getContext(), apf);
}
return nullptr;
@@ -1009,8 +1008,17 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
- // TODO: UNDEF handling for binary float instructions.
- return nullptr;
+ // [any flop] undef, undef -> undef
+ if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
+ return C1;
+ // [any flop] C, undef -> NaN
+ // [any flop] undef, C -> NaN
+ // We could potentially specialize NaN/Inf constants vs. 'normal'
+ // constants (possibly differently depending on opcode and operand). This
+ // would allow returning undef sometimes. But it is always safe to fold to
+ // NaN because we can choose the undef operand as NaN, and any FP opcode
+ // with a NaN operand will propagate NaN.
+ return ConstantFP::getNaN(C1->getType());
case Instruction::BinaryOpsEnd:
llvm_unreachable("Invalid BinaryOp");
}
@@ -1219,9 +1227,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx);
// If any element of a divisor vector is zero, the whole op is undef.
- if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
- Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
- RHS->isNullValue())
+ if (Instruction::isIntDivRem(Opcode) && RHS->isNullValue())
return UndefValue::get(VTy);
Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
@@ -1494,7 +1500,12 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
assert(isa<ConstantPointerNull>(V2) && "Canonicalization guarantee!");
// GlobalVals can never be null unless they have external weak linkage.
// We don't try to evaluate aliases here.
- if (!GV->hasExternalWeakLinkage() && !isa<GlobalAlias>(GV))
+ // NOTE: We should not be doing this constant folding if null pointer
+ // is considered valid for the function. But currently there is no way to
+ // query it from the Constant type.
+ if (!GV->hasExternalWeakLinkage() && !isa<GlobalAlias>(GV) &&
+ !NullPointerIsDefined(nullptr /* F */,
+ GV->getType()->getAddressSpace()))
return ICmpInst::ICMP_NE;
}
} else if (const BlockAddress *BA = dyn_cast<BlockAddress>(V1)) {
@@ -1546,8 +1557,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
// If the cast is not actually changing bits, and the second operand is a
// null pointer, do the comparison with the pre-casted value.
- if (V2->isNullValue() &&
- (CE1->getType()->isPointerTy() || CE1->getType()->isIntegerTy())) {
+ if (V2->isNullValue() && CE1->getType()->isIntOrPtrTy()) {
if (CE1->getOpcode() == Instruction::ZExt) isSigned = false;
if (CE1->getOpcode() == Instruction::SExt) isSigned = true;
return evaluateICmpRelation(CE1Op0,
@@ -1724,7 +1734,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
if (C1->isNullValue()) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(C2))
// Don't try to evaluate aliases. External weak GV can be null.
- if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
+ if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage() &&
+ !NullPointerIsDefined(nullptr /* F */,
+ GV->getType()->getAddressSpace())) {
if (pred == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(C1->getContext());
else if (pred == ICmpInst::ICMP_NE)
@@ -1734,7 +1746,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
} else if (C2->isNullValue()) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(C1))
// Don't try to evaluate aliases. External weak GV can be null.
- if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
+ if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage() &&
+ !NullPointerIsDefined(nullptr /* F */,
+ GV->getType()->getAddressSpace())) {
if (pred == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(C1->getContext());
else if (pred == ICmpInst::ICMP_NE)
diff --git a/contrib/llvm/lib/IR/ConstantRange.cpp b/contrib/llvm/lib/IR/ConstantRange.cpp
index 48d16f334ba3..39a0b13c4e0c 100644
--- a/contrib/llvm/lib/IR/ConstantRange.cpp
+++ b/contrib/llvm/lib/IR/ConstantRange.cpp
@@ -22,6 +22,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/APInt.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
@@ -190,8 +191,7 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
return CR0.inverse().unionWith(CR1.inverse()).inverse();
};
- assert(BinOp >= Instruction::BinaryOpsBegin &&
- BinOp < Instruction::BinaryOpsEnd && "Binary operators only!");
+ assert(Instruction::isBinaryOp(BinOp) && "Binary operators only!");
assert((NoWrapKind == OBO::NoSignedWrap ||
NoWrapKind == OBO::NoUnsignedWrap ||
@@ -255,6 +255,64 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
APInt::getSignedMinValue(BitWidth) + SignedMin));
}
return Result;
+ case Instruction::Mul: {
+ if (NoWrapKind == (OBO::NoSignedWrap | OBO::NoUnsignedWrap)) {
+ return SubsetIntersect(
+ makeGuaranteedNoWrapRegion(BinOp, Other, OBO::NoSignedWrap),
+ makeGuaranteedNoWrapRegion(BinOp, Other, OBO::NoUnsignedWrap));
+ }
+
+ // Equivalent to calling makeGuaranteedNoWrapRegion() on [V, V+1).
+ const bool Unsigned = NoWrapKind == OBO::NoUnsignedWrap;
+ const auto makeSingleValueRegion = [Unsigned,
+ BitWidth](APInt V) -> ConstantRange {
+ // Handle the special cases 0, -1 and 1; see the comment at the end of this
+ // lambda for why -1 and 1 need to be special-cased.
+ if (V == 0 || V.isOneValue())
+ return ConstantRange(BitWidth, true);
+
+ APInt MinValue, MaxValue;
+ if (Unsigned) {
+ MinValue = APInt::getMinValue(BitWidth);
+ MaxValue = APInt::getMaxValue(BitWidth);
+ } else {
+ MinValue = APInt::getSignedMinValue(BitWidth);
+ MaxValue = APInt::getSignedMaxValue(BitWidth);
+ }
+ // e.g. Returning [-127, 127], represented as [-127, -128).
+ if (!Unsigned && V.isAllOnesValue())
+ return ConstantRange(-MaxValue, MinValue);
+
+ APInt Lower, Upper;
+ if (!Unsigned && V.isNegative()) {
+ Lower = APIntOps::RoundingSDiv(MaxValue, V, APInt::Rounding::UP);
+ Upper = APIntOps::RoundingSDiv(MinValue, V, APInt::Rounding::DOWN);
+ } else if (Unsigned) {
+ Lower = APIntOps::RoundingUDiv(MinValue, V, APInt::Rounding::UP);
+ Upper = APIntOps::RoundingUDiv(MaxValue, V, APInt::Rounding::DOWN);
+ } else {
+ Lower = APIntOps::RoundingSDiv(MinValue, V, APInt::Rounding::UP);
+ Upper = APIntOps::RoundingSDiv(MaxValue, V, APInt::Rounding::DOWN);
+ }
+ if (Unsigned) {
+ Lower = Lower.zextOrSelf(BitWidth);
+ Upper = Upper.zextOrSelf(BitWidth);
+ } else {
+ Lower = Lower.sextOrSelf(BitWidth);
+ Upper = Upper.sextOrSelf(BitWidth);
+ }
+ // The ConstantRange ctor takes a half-open interval [Lower, Upper + 1).
+ // Upper + 1 is guaranteed not to overflow, because |divisor| > 1. 0, -1,
+ // and 1 are already handled as special cases.
+ return ConstantRange(Lower, Upper + 1);
+ };
+
+ if (Unsigned)
+ return makeSingleValueRegion(Other.getUnsignedMax());
+
+ return SubsetIntersect(makeSingleValueRegion(Other.getSignedMin()),
+ makeSingleValueRegion(Other.getSignedMax()));
+ }
}
}
@@ -358,7 +416,7 @@ bool ConstantRange::contains(const ConstantRange &Other) const {
ConstantRange ConstantRange::subtract(const APInt &Val) const {
assert(Val.getBitWidth() == getBitWidth() && "Wrong bit width");
// If the set is empty or full, don't modify the endpoints.
- if (Lower == Upper)
+ if (Lower == Upper)
return *this;
return ConstantRange(Lower - Val, Upper - Val);
}
@@ -368,7 +426,7 @@ ConstantRange ConstantRange::difference(const ConstantRange &CR) const {
}
ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
- assert(getBitWidth() == CR.getBitWidth() &&
+ assert(getBitWidth() == CR.getBitWidth() &&
"ConstantRange types don't agree!");
// Handle common cases.
@@ -442,7 +500,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
}
ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
- assert(getBitWidth() == CR.getBitWidth() &&
+ assert(getBitWidth() == CR.getBitWidth() &&
"ConstantRange types don't agree!");
if ( isFullSet() || CR.isEmptySet()) return *this;
@@ -664,8 +722,7 @@ ConstantRange ConstantRange::sextOrTrunc(uint32_t DstTySize) const {
ConstantRange ConstantRange::binaryOp(Instruction::BinaryOps BinOp,
const ConstantRange &Other) const {
- assert(BinOp >= Instruction::BinaryOpsBegin &&
- BinOp < Instruction::BinaryOpsEnd && "Binary operators only!");
+ assert(Instruction::isBinaryOp(BinOp) && "Binary operators only!");
switch (BinOp) {
case Instruction::Add:
@@ -797,7 +854,7 @@ ConstantRange::multiply(const ConstantRange &Other) const {
this_max = getSignedMax().sext(getBitWidth() * 2);
Other_min = Other.getSignedMin().sext(getBitWidth() * 2);
Other_max = Other.getSignedMax().sext(getBitWidth() * 2);
-
+
auto L = {this_min * Other_min, this_min * Other_max,
this_max * Other_min, this_max * Other_max};
auto Compare = [](const APInt &A, const APInt &B) { return A.slt(B); };
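
The Instruction::Mul case added to makeGuaranteedNoWrapRegion earlier in this file computes, for a known multiplier range, the set of values the other operand may take without wrapping. A small sketch of what it yields; the numbers are worked out by hand from the rounding-division logic above and are not taken from a test in the patch:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Operator.h"

    using namespace llvm;

    // For an i8 'mul nsw X, 3', X must lie in [-42, 43), i.e. -42..42:
    // -42 * 3 = -126 and 42 * 3 = 126 still fit in i8, while -43 or 43 would
    // overflow the signed range [-128, 127].
    static ConstantRange noWrapMulByThree() {
      ConstantRange Three(APInt(/*numBits=*/8, 3)); // the single value 3
      return ConstantRange::makeGuaranteedNoWrapRegion(
          Instruction::Mul, Three, OverflowingBinaryOperator::NoSignedWrap);
    }
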
diff --git a/contrib/llvm/lib/IR/Constants.cpp b/contrib/llvm/lib/IR/Constants.cpp
index dccba779deb3..2351e7e4a389 100644
--- a/contrib/llvm/lib/IR/Constants.cpp
+++ b/contrib/llvm/lib/IR/Constants.cpp
@@ -202,6 +202,68 @@ bool Constant::isNotMinSignedValue() const {
return false;
}
+bool Constant::isFiniteNonZeroFP() const {
+ if (auto *CFP = dyn_cast<ConstantFP>(this))
+ return CFP->getValueAPF().isFiniteNonZero();
+ if (!getType()->isVectorTy())
+ return false;
+ for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
+ if (!CFP || !CFP->getValueAPF().isFiniteNonZero())
+ return false;
+ }
+ return true;
+}
+
+bool Constant::isNormalFP() const {
+ if (auto *CFP = dyn_cast<ConstantFP>(this))
+ return CFP->getValueAPF().isNormal();
+ if (!getType()->isVectorTy())
+ return false;
+ for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
+ if (!CFP || !CFP->getValueAPF().isNormal())
+ return false;
+ }
+ return true;
+}
+
+bool Constant::hasExactInverseFP() const {
+ if (auto *CFP = dyn_cast<ConstantFP>(this))
+ return CFP->getValueAPF().getExactInverse(nullptr);
+ if (!getType()->isVectorTy())
+ return false;
+ for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
+ if (!CFP || !CFP->getValueAPF().getExactInverse(nullptr))
+ return false;
+ }
+ return true;
+}
+
+bool Constant::isNaN() const {
+ if (auto *CFP = dyn_cast<ConstantFP>(this))
+ return CFP->isNaN();
+ if (!getType()->isVectorTy())
+ return false;
+ for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
+ if (!CFP || !CFP->isNaN())
+ return false;
+ }
+ return true;
+}
+
+bool Constant::containsUndefElement() const {
+ if (!getType()->isVectorTy())
+ return false;
+ for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i)
+ if (isa<UndefValue>(getAggregateElement(i)))
+ return true;
+
+ return false;
+}
+
/// Constructor to create a '0' constant of arbitrary type.
Constant *Constant::getNullValue(Type *Ty) {
switch (Ty->getTypeID()) {
@@ -635,6 +697,17 @@ Constant *ConstantFP::get(Type *Ty, double V) {
return C;
}
+Constant *ConstantFP::get(Type *Ty, const APFloat &V) {
+ ConstantFP *C = get(Ty->getContext(), V);
+ assert(C->getType() == Ty->getScalarType() &&
+ "ConstantFP type doesn't match the type implied by its value!");
+
+ // For vectors, broadcast the value.
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ return ConstantVector::getSplat(VTy->getNumElements(), C);
+
+ return C;
+}
Constant *ConstantFP::get(Type *Ty, StringRef Str) {
LLVMContext &Context = Ty->getContext();
@@ -646,7 +719,7 @@ Constant *ConstantFP::get(Type *Ty, StringRef Str) {
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
return ConstantVector::getSplat(VTy->getNumElements(), C);
- return C;
+ return C;
}
Constant *ConstantFP::getNaN(Type *Ty, bool Negative, unsigned Type) {
@@ -699,7 +772,7 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
else if (&V.getSemantics() == &APFloat::IEEEquad())
Ty = Type::getFP128Ty(Context);
else {
- assert(&V.getSemantics() == &APFloat::PPCDoubleDouble() &&
+ assert(&V.getSemantics() == &APFloat::PPCDoubleDouble() &&
"Unknown FP format");
Ty = Type::getPPC_FP128Ty(Context);
}
@@ -952,7 +1025,7 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
// Create a ConstantAggregateZero value if all elements are zeros.
bool isZero = true;
bool isUndef = false;
-
+
if (!V.empty()) {
isUndef = isa<UndefValue>(V[0]);
isZero = V[0]->isNullValue();
@@ -1213,17 +1286,17 @@ bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) {
}
case Type::X86_FP80TyID:
return &Val2.getSemantics() == &APFloat::IEEEhalf() ||
- &Val2.getSemantics() == &APFloat::IEEEsingle() ||
+ &Val2.getSemantics() == &APFloat::IEEEsingle() ||
&Val2.getSemantics() == &APFloat::IEEEdouble() ||
&Val2.getSemantics() == &APFloat::x87DoubleExtended();
case Type::FP128TyID:
return &Val2.getSemantics() == &APFloat::IEEEhalf() ||
- &Val2.getSemantics() == &APFloat::IEEEsingle() ||
+ &Val2.getSemantics() == &APFloat::IEEEsingle() ||
&Val2.getSemantics() == &APFloat::IEEEdouble() ||
&Val2.getSemantics() == &APFloat::IEEEquad();
case Type::PPC_FP128TyID:
return &Val2.getSemantics() == &APFloat::IEEEhalf() ||
- &Val2.getSemantics() == &APFloat::IEEEsingle() ||
+ &Val2.getSemantics() == &APFloat::IEEEsingle() ||
&Val2.getSemantics() == &APFloat::IEEEdouble() ||
&Val2.getSemantics() == &APFloat::PPCDoubleDouble();
}
@@ -1710,8 +1783,7 @@ Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy,
Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags, Type *OnlyIfReducedTy) {
// Check the operands for consistency first.
- assert(Opcode >= Instruction::BinaryOpsBegin &&
- Opcode < Instruction::BinaryOpsEnd &&
+ assert(Instruction::isBinaryOp(Opcode) &&
"Invalid opcode in binary constant expression");
assert(C1->getType() == C2->getType() &&
"Operand types in binary constant expression should match");
@@ -1733,8 +1805,8 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
"Tried to create a floating-point operation on a "
"non-floating-point type!");
break;
- case Instruction::UDiv:
- case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
assert(C1->getType() == C2->getType() && "Op types should be identical!");
assert(C1->getType()->isIntOrIntVectorTy() &&
"Tried to create an arithmetic operation on a non-arithmetic type!");
@@ -1744,8 +1816,8 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
assert(C1->getType()->isFPOrFPVectorTy() &&
"Tried to create an arithmetic operation on a non-arithmetic type!");
break;
- case Instruction::URem:
- case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::SRem:
assert(C1->getType() == C2->getType() && "Op types should be identical!");
assert(C1->getType()->isIntOrIntVectorTy() &&
"Tried to create an arithmetic operation on a non-arithmetic type!");
@@ -1793,7 +1865,7 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) {
Constant *GEPIdx = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
Constant *GEP = getGetElementPtr(
Ty, Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx);
- return getPtrToInt(GEP,
+ return getPtrToInt(GEP,
Type::getInt64Ty(Ty->getContext()));
}
@@ -2199,22 +2271,49 @@ Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2, bool isExact) {
isExact ? PossiblyExactOperator::IsExact : 0);
}
-Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) {
- switch (Opcode) {
- default:
- // Doesn't have an identity.
- return nullptr;
-
- case Instruction::Add:
- case Instruction::Or:
- case Instruction::Xor:
- return Constant::getNullValue(Ty);
+Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty,
+ bool AllowRHSConstant) {
+ assert(Instruction::isBinaryOp(Opcode) && "Only binops allowed");
+
+ // Commutative opcodes: it does not matter if AllowRHSConstant is set.
+ if (Instruction::isCommutative(Opcode)) {
+ switch (Opcode) {
+ case Instruction::Add: // X + 0 = X
+ case Instruction::Or: // X | 0 = X
+ case Instruction::Xor: // X ^ 0 = X
+ return Constant::getNullValue(Ty);
+ case Instruction::Mul: // X * 1 = X
+ return ConstantInt::get(Ty, 1);
+ case Instruction::And: // X & -1 = X
+ return Constant::getAllOnesValue(Ty);
+ case Instruction::FAdd: // X + -0.0 = X
+ // TODO: If the fadd has 'nsz', should we return +0.0?
+ return ConstantFP::getNegativeZero(Ty);
+ case Instruction::FMul: // X * 1.0 = X
+ return ConstantFP::get(Ty, 1.0);
+ default:
+ llvm_unreachable("Every commutative binop has an identity constant");
+ }
+ }
- case Instruction::Mul:
- return ConstantInt::get(Ty, 1);
+ // Non-commutative opcodes: AllowRHSConstant must be set.
+ if (!AllowRHSConstant)
+ return nullptr;
- case Instruction::And:
- return Constant::getAllOnesValue(Ty);
+ switch (Opcode) {
+ case Instruction::Sub: // X - 0 = X
+ case Instruction::Shl: // X << 0 = X
+ case Instruction::LShr: // X >>u 0 = X
+ case Instruction::AShr: // X >> 0 = X
+ case Instruction::FSub: // X - 0.0 = X
+ return Constant::getNullValue(Ty);
+ case Instruction::SDiv: // X / 1 = X
+ case Instruction::UDiv: // X /u 1 = X
+ return ConstantInt::get(Ty, 1);
+ case Instruction::FDiv: // X / 1.0 = X
+ return ConstantFP::get(Ty, 1.0);
+ default:
+ return nullptr;
}
}
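
The rewritten getBinOpIdentity above gains an AllowRHSConstant flag so that non-commutative identities (X - 0, X << 0, X / 1, and so on) can be reported when the constant is known to sit on the right-hand side. A hedged usage sketch; the i32 type setup is illustrative:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    static void binOpIdentityExamples(LLVMContext &Ctx) {
      Type *I32 = Type::getInt32Ty(Ctx);

      // Commutative: X + 0 == X, regardless of which side the constant is on.
      Constant *AddId =
          ConstantExpr::getBinOpIdentity(Instruction::Add, I32,
                                         /*AllowRHSConstant=*/false);

      // Non-commutative: X - 0 == X only with the constant on the RHS, so an
      // identity is reported only when AllowRHSConstant is true.
      Constant *SubId =
          ConstantExpr::getBinOpIdentity(Instruction::Sub, I32,
                                         /*AllowRHSConstant=*/true);
      Constant *NoId =
          ConstantExpr::getBinOpIdentity(Instruction::Sub, I32,
                                         /*AllowRHSConstant=*/false); // nullptr
      (void)AddId; (void)SubId; (void)NoId;
    }
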
@@ -2354,7 +2453,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) {
void ConstantDataSequential::destroyConstantImpl() {
// Remove the constant from the StringMap.
- StringMap<ConstantDataSequential*> &CDSConstants =
+ StringMap<ConstantDataSequential*> &CDSConstants =
getType()->getContext().pImpl->CDSConstants;
StringMap<ConstantDataSequential*>::iterator Slot =
@@ -2371,7 +2470,7 @@ void ConstantDataSequential::destroyConstantImpl() {
assert((*Entry) == this && "Hash mismatch in ConstantDataSequential");
getContext().pImpl->CDSConstants.erase(Slot);
} else {
- // Otherwise, there are multiple entries linked off the bucket, unlink the
+ // Otherwise, there are multiple entries linked off the bucket, unlink the
// node we care about but keep the bucket around.
for (ConstantDataSequential *Node = *Entry; ;
Entry = &Node->Next, Node = *Entry) {
@@ -2389,40 +2488,6 @@ void ConstantDataSequential::destroyConstantImpl() {
Next = nullptr;
}
-/// get() constructors - Return a constant with array type with an element
-/// count and element type matching the ArrayRef passed in. Note that this
-/// can return a ConstantAggregateZero object.
-Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint8_t> Elts) {
- Type *Ty = ArrayType::get(Type::getInt8Ty(Context), Elts.size());
- const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(Data, Elts.size() * 1), Ty);
-}
-Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
- Type *Ty = ArrayType::get(Type::getInt16Ty(Context), Elts.size());
- const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(Data, Elts.size() * 2), Ty);
-}
-Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
- Type *Ty = ArrayType::get(Type::getInt32Ty(Context), Elts.size());
- const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(Data, Elts.size() * 4), Ty);
-}
-Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
- Type *Ty = ArrayType::get(Type::getInt64Ty(Context), Elts.size());
- const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(Data, Elts.size() * 8), Ty);
-}
-Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<float> Elts) {
- Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size());
- const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(Data, Elts.size() * 4), Ty);
-}
-Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) {
- Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
- const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(Data, Elts.size() * 8), Ty);
-}
-
/// getFP() constructors - Return a constant with array type with an element
/// count and element type of float with precision matching the number of
/// bits in the ArrayRef passed in. (i.e. half for 16bits, float for 32bits,
diff --git a/contrib/llvm/lib/IR/ConstantsContext.h b/contrib/llvm/lib/IR/ConstantsContext.h
index 6585304e7674..e9f31e4ded68 100644
--- a/contrib/llvm/lib/IR/ConstantsContext.h
+++ b/contrib/llvm/lib/IR/ConstantsContext.h
@@ -695,7 +695,9 @@ public:
return nullptr;
}
- void dump() const { DEBUG(dbgs() << "Constant.cpp: ConstantUniqueMap\n"); }
+ void dump() const {
+ LLVM_DEBUG(dbgs() << "Constant.cpp: ConstantUniqueMap\n");
+ }
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/IR/Core.cpp b/contrib/llvm/lib/IR/Core.cpp
index 743e3710fd68..bea4dee15c13 100644
--- a/contrib/llvm/lib/IR/Core.cpp
+++ b/contrib/llvm/lib/IR/Core.cpp
@@ -234,6 +234,15 @@ void LLVMSetModuleIdentifier(LLVMModuleRef M, const char *Ident, size_t Len) {
unwrap(M)->setModuleIdentifier(StringRef(Ident, Len));
}
+const char *LLVMGetSourceFileName(LLVMModuleRef M, size_t *Len) {
+ auto &Str = unwrap(M)->getSourceFileName();
+ *Len = Str.length();
+ return Str.c_str();
+}
+
+void LLVMSetSourceFileName(LLVMModuleRef M, const char *Name, size_t Len) {
+ unwrap(M)->setSourceFileName(StringRef(Name, Len));
+}
/*--.. Data layout .........................................................--*/
const char *LLVMGetDataLayoutStr(LLVMModuleRef M) {
@@ -257,6 +266,111 @@ void LLVMSetTarget(LLVMModuleRef M, const char *Triple) {
unwrap(M)->setTargetTriple(Triple);
}
+/*--.. Module flags ........................................................--*/
+struct LLVMOpaqueModuleFlagEntry {
+ LLVMModuleFlagBehavior Behavior;
+ const char *Key;
+ size_t KeyLen;
+ LLVMMetadataRef Metadata;
+};
+
+static Module::ModFlagBehavior
+map_to_llvmModFlagBehavior(LLVMModuleFlagBehavior Behavior) {
+ switch (Behavior) {
+ case LLVMModuleFlagBehaviorError:
+ return Module::ModFlagBehavior::Error;
+ case LLVMModuleFlagBehaviorWarning:
+ return Module::ModFlagBehavior::Warning;
+ case LLVMModuleFlagBehaviorRequire:
+ return Module::ModFlagBehavior::Require;
+ case LLVMModuleFlagBehaviorOverride:
+ return Module::ModFlagBehavior::Override;
+ case LLVMModuleFlagBehaviorAppend:
+ return Module::ModFlagBehavior::Append;
+ case LLVMModuleFlagBehaviorAppendUnique:
+ return Module::ModFlagBehavior::AppendUnique;
+ }
+ llvm_unreachable("Unknown LLVMModuleFlagBehavior");
+}
+
+static LLVMModuleFlagBehavior
+map_from_llvmModFlagBehavior(Module::ModFlagBehavior Behavior) {
+ switch (Behavior) {
+ case Module::ModFlagBehavior::Error:
+ return LLVMModuleFlagBehaviorError;
+ case Module::ModFlagBehavior::Warning:
+ return LLVMModuleFlagBehaviorWarning;
+ case Module::ModFlagBehavior::Require:
+ return LLVMModuleFlagBehaviorRequire;
+ case Module::ModFlagBehavior::Override:
+ return LLVMModuleFlagBehaviorOverride;
+ case Module::ModFlagBehavior::Append:
+ return LLVMModuleFlagBehaviorAppend;
+ case Module::ModFlagBehavior::AppendUnique:
+ return LLVMModuleFlagBehaviorAppendUnique;
+ default:
+ llvm_unreachable("Unhandled Flag Behavior");
+ }
+}
+
+LLVMModuleFlagEntry *LLVMCopyModuleFlagsMetadata(LLVMModuleRef M, size_t *Len) {
+ SmallVector<Module::ModuleFlagEntry, 8> MFEs;
+ unwrap(M)->getModuleFlagsMetadata(MFEs);
+
+ LLVMOpaqueModuleFlagEntry *Result = static_cast<LLVMOpaqueModuleFlagEntry *>(
+ safe_malloc(MFEs.size() * sizeof(LLVMOpaqueModuleFlagEntry)));
+ for (unsigned i = 0; i < MFEs.size(); ++i) {
+ const auto &ModuleFlag = MFEs[i];
+ Result[i].Behavior = map_from_llvmModFlagBehavior(ModuleFlag.Behavior);
+ Result[i].Key = ModuleFlag.Key->getString().data();
+ Result[i].KeyLen = ModuleFlag.Key->getString().size();
+ Result[i].Metadata = wrap(ModuleFlag.Val);
+ }
+ *Len = MFEs.size();
+ return Result;
+}
+
+void LLVMDisposeModuleFlagsMetadata(LLVMModuleFlagEntry *Entries) {
+ free(Entries);
+}
+
+LLVMModuleFlagBehavior
+LLVMModuleFlagEntriesGetFlagBehavior(LLVMModuleFlagEntry *Entries,
+ unsigned Index) {
+ LLVMOpaqueModuleFlagEntry MFE =
+ static_cast<LLVMOpaqueModuleFlagEntry>(Entries[Index]);
+ return MFE.Behavior;
+}
+
+const char *LLVMModuleFlagEntriesGetKey(LLVMModuleFlagEntry *Entries,
+ unsigned Index, size_t *Len) {
+ LLVMOpaqueModuleFlagEntry MFE =
+ static_cast<LLVMOpaqueModuleFlagEntry>(Entries[Index]);
+ *Len = MFE.KeyLen;
+ return MFE.Key;
+}
+
+LLVMMetadataRef LLVMModuleFlagEntriesGetMetadata(LLVMModuleFlagEntry *Entries,
+ unsigned Index) {
+ LLVMOpaqueModuleFlagEntry MFE =
+ static_cast<LLVMOpaqueModuleFlagEntry>(Entries[Index]);
+ return MFE.Metadata;
+}
+
+LLVMMetadataRef LLVMGetModuleFlag(LLVMModuleRef M,
+ const char *Key, size_t KeyLen) {
+ return wrap(unwrap(M)->getModuleFlag({Key, KeyLen}));
+}
+
+void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior,
+ const char *Key, size_t KeyLen,
+ LLVMMetadataRef Val) {
+ unwrap(M)->addModuleFlag(map_to_llvmModFlagBehavior(Behavior),
+ {Key, KeyLen}, unwrap(Val));
+}
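/* Usage sketch for the module-flag C API above, not part of this change.
 * LLVMValueAsMetadata, LLVMConstInt and LLVMInt32TypeInContext are assumed
 * to be available from llvm-c/Core.h. */
#include <llvm-c/Core.h>
#include <stdio.h>

static void list_module_flags(LLVMModuleRef M) {
  LLVMContextRef Ctx = LLVMGetModuleContext(M);
  LLVMMetadataRef Two =
      LLVMValueAsMetadata(LLVMConstInt(LLVMInt32TypeInContext(Ctx), 2, 0));
  /* Keys are passed with an explicit length; "Dwarf Version" is 13 bytes. */
  LLVMAddModuleFlag(M, LLVMModuleFlagBehaviorWarning, "Dwarf Version", 13, Two);

  size_t Len = 0;
  LLVMModuleFlagEntry *Flags = LLVMCopyModuleFlagsMetadata(M, &Len);
  for (unsigned I = 0; I < Len; ++I) {
    size_t KeyLen = 0;
    const char *Key = LLVMModuleFlagEntriesGetKey(Flags, I, &KeyLen);
    printf("module flag: %.*s\n", (int)KeyLen, Key);
  }
  LLVMDisposeModuleFlagsMetadata(Flags);
}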
+
+/*--.. Printing modules ....................................................--*/
+
void LLVMDumpModule(LLVMModuleRef M) {
unwrap(M)->print(errs(), nullptr,
/*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true);
@@ -295,10 +409,44 @@ char *LLVMPrintModuleToString(LLVMModuleRef M) {
}
/*--.. Operations on inline assembler ......................................--*/
+void LLVMSetModuleInlineAsm2(LLVMModuleRef M, const char *Asm, size_t Len) {
+ unwrap(M)->setModuleInlineAsm(StringRef(Asm, Len));
+}
+
void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) {
unwrap(M)->setModuleInlineAsm(StringRef(Asm));
}
+void LLVMAppendModuleInlineAsm(LLVMModuleRef M, const char *Asm, size_t Len) {
+ unwrap(M)->appendModuleInlineAsm(StringRef(Asm, Len));
+}
+
+const char *LLVMGetModuleInlineAsm(LLVMModuleRef M, size_t *Len) {
+ auto &Str = unwrap(M)->getModuleInlineAsm();
+ *Len = Str.length();
+ return Str.c_str();
+}
+
+LLVMValueRef LLVMGetInlineAsm(LLVMTypeRef Ty,
+ char *AsmString, size_t AsmStringSize,
+ char *Constraints, size_t ConstraintsSize,
+ LLVMBool HasSideEffects, LLVMBool IsAlignStack,
+ LLVMInlineAsmDialect Dialect) {
+ InlineAsm::AsmDialect AD;
+ switch (Dialect) {
+ case LLVMInlineAsmDialectATT:
+ AD = InlineAsm::AD_ATT;
+ break;
+ case LLVMInlineAsmDialectIntel:
+ AD = InlineAsm::AD_Intel;
+ break;
+ }
+ return wrap(InlineAsm::get(unwrap<FunctionType>(Ty),
+ StringRef(AsmString, AsmStringSize),
+ StringRef(Constraints, ConstraintsSize),
+ HasSideEffects, IsAlignStack, AD));
+}
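/* Usage sketch for the new LLVMGetInlineAsm entry point, not part of this
 * change.  LLVMFunctionType and LLVMVoidTypeInContext are pre-existing
 * llvm-c/Core.h calls. */
#include <llvm-c/Core.h>

static LLVMValueRef make_nop_asm(LLVMContextRef Ctx) {
  /* A side-effecting "nop" with a void() type and an empty constraint list;
   * the result can be called like any other function value. */
  LLVMTypeRef FnTy = LLVMFunctionType(LLVMVoidTypeInContext(Ctx), NULL, 0, 0);
  return LLVMGetInlineAsm(FnTy, "nop", 3, "", 0,
                          /*HasSideEffects=*/1, /*IsAlignStack=*/0,
                          LLVMInlineAsmDialectATT);
}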
+
/*--.. Operations on module contexts ......................................--*/
LLVMContextRef LLVMGetModuleContext(LLVMModuleRef M) {
@@ -648,6 +796,16 @@ LLVMValueKind LLVMGetValueKind(LLVMValueRef Val) {
}
}
+const char *LLVMGetValueName2(LLVMValueRef Val, size_t *Length) {
+ auto *V = unwrap(Val);
+ *Length = V->getName().size();
+ return V->getName().data();
+}
+
+void LLVMSetValueName2(LLVMValueRef Val, const char *Name, size_t NameLen) {
+ unwrap(Val)->setName(StringRef(Name, NameLen));
+}
+
const char *LLVMGetValueName(LLVMValueRef Val) {
return unwrap(Val)->getName().data();
}
@@ -1521,8 +1679,9 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
break;
case LLVMLinkOnceODRAutoHideLinkage:
- DEBUG(errs() << "LLVMSetLinkage(): LLVMLinkOnceODRAutoHideLinkage is no "
- "longer supported.");
+ LLVM_DEBUG(
+ errs() << "LLVMSetLinkage(): LLVMLinkOnceODRAutoHideLinkage is no "
+ "longer supported.");
break;
case LLVMWeakAnyLinkage:
GV->setLinkage(GlobalValue::WeakAnyLinkage);
@@ -1546,19 +1705,21 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
GV->setLinkage(GlobalValue::PrivateLinkage);
break;
case LLVMDLLImportLinkage:
- DEBUG(errs()
- << "LLVMSetLinkage(): LLVMDLLImportLinkage is no longer supported.");
+ LLVM_DEBUG(
+ errs()
+ << "LLVMSetLinkage(): LLVMDLLImportLinkage is no longer supported.");
break;
case LLVMDLLExportLinkage:
- DEBUG(errs()
- << "LLVMSetLinkage(): LLVMDLLExportLinkage is no longer supported.");
+ LLVM_DEBUG(
+ errs()
+ << "LLVMSetLinkage(): LLVMDLLExportLinkage is no longer supported.");
break;
case LLVMExternalWeakLinkage:
GV->setLinkage(GlobalValue::ExternalWeakLinkage);
break;
case LLVMGhostLinkage:
- DEBUG(errs()
- << "LLVMSetLinkage(): LLVMGhostLinkage is no longer supported.");
+ LLVM_DEBUG(
+ errs() << "LLVMSetLinkage(): LLVMGhostLinkage is no longer supported.");
break;
case LLVMCommonLinkage:
GV->setLinkage(GlobalValue::CommonLinkage);
@@ -1596,6 +1757,31 @@ void LLVMSetDLLStorageClass(LLVMValueRef Global, LLVMDLLStorageClass Class) {
static_cast<GlobalValue::DLLStorageClassTypes>(Class));
}
+LLVMUnnamedAddr LLVMGetUnnamedAddress(LLVMValueRef Global) {
+ switch (unwrap<GlobalValue>(Global)->getUnnamedAddr()) {
+ case GlobalVariable::UnnamedAddr::None:
+ return LLVMNoUnnamedAddr;
+ case GlobalVariable::UnnamedAddr::Local:
+ return LLVMLocalUnnamedAddr;
+ case GlobalVariable::UnnamedAddr::Global:
+ return LLVMGlobalUnnamedAddr;
+ }
+ llvm_unreachable("Unknown UnnamedAddr kind!");
+}
+
+void LLVMSetUnnamedAddress(LLVMValueRef Global, LLVMUnnamedAddr UnnamedAddr) {
+ GlobalValue *GV = unwrap<GlobalValue>(Global);
+
+ switch (UnnamedAddr) {
+ case LLVMNoUnnamedAddr:
+ return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::None);
+ case LLVMLocalUnnamedAddr:
+ return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local);
+ case LLVMGlobalUnnamedAddr:
+ return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::Global);
+ }
+}
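/* Small sketch of the unnamed_addr accessors above, not part of this change.
 * LLVMAddGlobal and LLVMInt32TypeInContext are pre-existing llvm-c/Core.h
 * calls. */
#include <llvm-c/Core.h>
#include <assert.h>

static void mark_unnamed(LLVMModuleRef M) {
  LLVMTypeRef I32 = LLVMInt32TypeInContext(LLVMGetModuleContext(M));
  LLVMValueRef G = LLVMAddGlobal(M, I32, "g");
  LLVMSetUnnamedAddress(G, LLVMGlobalUnnamedAddr); /* prints as unnamed_addr */
  assert(LLVMGetUnnamedAddress(G) == LLVMGlobalUnnamedAddr);
}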
+
LLVMBool LLVMHasUnnamedAddr(LLVMValueRef Global) {
return unwrap<GlobalValue>(Global)->hasGlobalUnnamedAddr();
}
@@ -1779,6 +1965,51 @@ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
unwrap<Constant>(Aliasee), unwrap(M)));
}
+LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M,
+ const char *Name, size_t NameLen) {
+ return wrap(unwrap(M)->getNamedAlias(Name));
+}
+
+LLVMValueRef LLVMGetFirstGlobalAlias(LLVMModuleRef M) {
+ Module *Mod = unwrap(M);
+ Module::alias_iterator I = Mod->alias_begin();
+ if (I == Mod->alias_end())
+ return nullptr;
+ return wrap(&*I);
+}
+
+LLVMValueRef LLVMGetLastGlobalAlias(LLVMModuleRef M) {
+ Module *Mod = unwrap(M);
+ Module::alias_iterator I = Mod->alias_end();
+ if (I == Mod->alias_begin())
+ return nullptr;
+ return wrap(&*--I);
+}
+
+LLVMValueRef LLVMGetNextGlobalAlias(LLVMValueRef GA) {
+ GlobalAlias *Alias = unwrap<GlobalAlias>(GA);
+ Module::alias_iterator I(Alias);
+ if (++I == Alias->getParent()->alias_end())
+ return nullptr;
+ return wrap(&*I);
+}
+
+LLVMValueRef LLVMGetPreviousGlobalAlias(LLVMValueRef GA) {
+ GlobalAlias *Alias = unwrap<GlobalAlias>(GA);
+ Module::alias_iterator I(Alias);
+ if (I == Alias->getParent()->alias_begin())
+ return nullptr;
+ return wrap(&*--I);
+}
+
+LLVMValueRef LLVMAliasGetAliasee(LLVMValueRef Alias) {
+ return wrap(unwrap<GlobalAlias>(Alias)->getAliasee());
+}
+
+void LLVMAliasSetAliasee(LLVMValueRef Alias, LLVMValueRef Aliasee) {
+ unwrap<GlobalAlias>(Alias)->setAliasee(unwrap<Constant>(Aliasee));
+}
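/* Iteration sketch for the new global-alias accessors above, not part of this
 * change.  LLVMGetValueName2 is added elsewhere in the same revision. */
#include <llvm-c/Core.h>
#include <stdio.h>

static void walk_aliases(LLVMModuleRef M) {
  for (LLVMValueRef GA = LLVMGetFirstGlobalAlias(M); GA;
       GA = LLVMGetNextGlobalAlias(GA)) {
    size_t Len = 0;
    const char *Name = LLVMGetValueName2(GA, &Len);
    printf("alias %.*s -> aliasee %p\n", (int)Len, Name,
           (void *)LLVMAliasGetAliasee(GA));
  }
}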
+
/*--.. Operations on functions .............................................--*/
LLVMValueRef LLVMAddFunction(LLVMModuleRef M, const char *Name,
@@ -2160,12 +2391,15 @@ LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst) {
return nullptr;
}
-/*--.. Call and invoke instructions ........................................--*/
-
unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) {
+ if (FuncletPadInst *FPI = dyn_cast<FuncletPadInst>(unwrap(Instr))) {
+ return FPI->getNumArgOperands();
+ }
return CallSite(unwrap<Instruction>(Instr)).getNumArgOperands();
}
+/*--.. Call and invoke instructions ........................................--*/
+
unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) {
return CallSite(unwrap<Instruction>(Instr)).getCallingConv();
}
@@ -2248,6 +2482,11 @@ LLVMBasicBlockRef LLVMGetNormalDest(LLVMValueRef Invoke) {
}
LLVMBasicBlockRef LLVMGetUnwindDest(LLVMValueRef Invoke) {
+ if (CleanupReturnInst *CRI = dyn_cast<CleanupReturnInst>(unwrap(Invoke))) {
+ return wrap(CRI->getUnwindDest());
+ } else if (CatchSwitchInst *CSI = dyn_cast<CatchSwitchInst>(unwrap(Invoke))) {
+ return wrap(CSI->getUnwindDest());
+ }
return wrap(unwrap<InvokeInst>(Invoke)->getUnwindDest());
}
@@ -2256,6 +2495,11 @@ void LLVMSetNormalDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
}
void LLVMSetUnwindDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
+ if (CleanupReturnInst *CRI = dyn_cast<CleanupReturnInst>(unwrap(Invoke))) {
+ return CRI->setUnwindDest(unwrap(B));
+ } else if (CatchSwitchInst *CSI = dyn_cast<CatchSwitchInst>(unwrap(Invoke))) {
+ return CSI->setUnwindDest(unwrap(B));
+ }
unwrap<InvokeInst>(Invoke)->setUnwindDest(unwrap(B));
}
@@ -2477,10 +2721,53 @@ LLVMValueRef LLVMBuildLandingPad(LLVMBuilderRef B, LLVMTypeRef Ty,
return wrap(unwrap(B)->CreateLandingPad(unwrap(Ty), NumClauses, Name));
}
+LLVMValueRef LLVMBuildCatchPad(LLVMBuilderRef B, LLVMValueRef ParentPad,
+ LLVMValueRef *Args, unsigned NumArgs,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateCatchPad(unwrap(ParentPad),
+ makeArrayRef(unwrap(Args), NumArgs),
+ Name));
+}
+
+LLVMValueRef LLVMBuildCleanupPad(LLVMBuilderRef B, LLVMValueRef ParentPad,
+ LLVMValueRef *Args, unsigned NumArgs,
+ const char *Name) {
+ if (ParentPad == nullptr) {
+ Type *Ty = Type::getTokenTy(unwrap(B)->getContext());
+ ParentPad = wrap(Constant::getNullValue(Ty));
+ }
+ return wrap(unwrap(B)->CreateCleanupPad(unwrap(ParentPad),
+ makeArrayRef(unwrap(Args), NumArgs),
+ Name));
+}
+
LLVMValueRef LLVMBuildResume(LLVMBuilderRef B, LLVMValueRef Exn) {
return wrap(unwrap(B)->CreateResume(unwrap(Exn)));
}
+LLVMValueRef LLVMBuildCatchSwitch(LLVMBuilderRef B, LLVMValueRef ParentPad,
+ LLVMBasicBlockRef UnwindBB,
+ unsigned NumHandlers, const char *Name) {
+ if (ParentPad == nullptr) {
+ Type *Ty = Type::getTokenTy(unwrap(B)->getContext());
+ ParentPad = wrap(Constant::getNullValue(Ty));
+ }
+ return wrap(unwrap(B)->CreateCatchSwitch(unwrap(ParentPad), unwrap(UnwindBB),
+ NumHandlers, Name));
+}
+
+LLVMValueRef LLVMBuildCatchRet(LLVMBuilderRef B, LLVMValueRef CatchPad,
+ LLVMBasicBlockRef BB) {
+ return wrap(unwrap(B)->CreateCatchRet(unwrap<CatchPadInst>(CatchPad),
+ unwrap(BB)));
+}
+
+LLVMValueRef LLVMBuildCleanupRet(LLVMBuilderRef B, LLVMValueRef CatchPad,
+ LLVMBasicBlockRef BB) {
+ return wrap(unwrap(B)->CreateCleanupRet(unwrap<CleanupPadInst>(CatchPad),
+ unwrap(BB)));
+}
+
LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef B) {
return wrap(unwrap(B)->CreateUnreachable());
}
@@ -2515,6 +2802,40 @@ void LLVMSetCleanup(LLVMValueRef LandingPad, LLVMBool Val) {
unwrap<LandingPadInst>(LandingPad)->setCleanup(Val);
}
+void LLVMAddHandler(LLVMValueRef CatchSwitch, LLVMBasicBlockRef Dest) {
+ unwrap<CatchSwitchInst>(CatchSwitch)->addHandler(unwrap(Dest));
+}
+
+unsigned LLVMGetNumHandlers(LLVMValueRef CatchSwitch) {
+ return unwrap<CatchSwitchInst>(CatchSwitch)->getNumHandlers();
+}
+
+void LLVMGetHandlers(LLVMValueRef CatchSwitch, LLVMBasicBlockRef *Handlers) {
+ CatchSwitchInst *CSI = unwrap<CatchSwitchInst>(CatchSwitch);
+ for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
+ E = CSI->handler_end(); I != E; ++I)
+ *Handlers++ = wrap(*I);
+}
+
+LLVMValueRef LLVMGetParentCatchSwitch(LLVMValueRef CatchPad) {
+ return wrap(unwrap<CatchPadInst>(CatchPad)->getCatchSwitch());
+}
+
+void LLVMSetParentCatchSwitch(LLVMValueRef CatchPad, LLVMValueRef CatchSwitch) {
+ unwrap<CatchPadInst>(CatchPad)
+ ->setCatchSwitch(unwrap<CatchSwitchInst>(CatchSwitch));
+}
+
+/*--.. Funclets ...........................................................--*/
+
+LLVMValueRef LLVMGetArgOperand(LLVMValueRef Funclet, unsigned i) {
+ return wrap(unwrap<FuncletPadInst>(Funclet)->getArgOperand(i));
+}
+
+void LLVMSetArgOperand(LLVMValueRef Funclet, unsigned i, LLVMValueRef value) {
+ unwrap<FuncletPadInst>(Funclet)->setArgOperand(i, unwrap(value));
+}
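/* Rough sketch wiring the new EH-pad builders together, not part of this
 * change.  The enclosing function is assumed to already carry a suitable EH
 * personality; block creation and positioning use pre-existing llvm-c API. */
#include <llvm-c/Core.h>
#include <stddef.h>

static void build_catch(LLVMBuilderRef B, LLVMBasicBlockRef DispatchBB,
                        LLVMBasicBlockRef HandlerBB, LLVMBasicBlockRef ContBB) {
  LLVMPositionBuilderAtEnd(B, DispatchBB);
  /* A NULL parent pad is replaced with a "none" token constant (see above). */
  LLVMValueRef CS = LLVMBuildCatchSwitch(B, NULL, /*UnwindBB=*/NULL, 1, "cs");
  LLVMAddHandler(CS, HandlerBB);

  LLVMPositionBuilderAtEnd(B, HandlerBB);
  LLVMValueRef CP = LLVMBuildCatchPad(B, CS, /*Args=*/NULL, 0, "cp");
  LLVMBuildCatchRet(B, CP, ContBB);
}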
+
/*--.. Arithmetic ..........................................................--*/
LLVMValueRef LLVMBuildAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
diff --git a/contrib/llvm/lib/IR/DIBuilder.cpp b/contrib/llvm/lib/IR/DIBuilder.cpp
index a00c595d01c5..5c5477f4f40f 100644
--- a/contrib/llvm/lib/IR/DIBuilder.cpp
+++ b/contrib/llvm/lib/IR/DIBuilder.cpp
@@ -14,6 +14,7 @@
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/IRBuilder.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Constants.h"
@@ -30,9 +31,9 @@ cl::opt<bool>
llvm::cl::desc("Use llvm.dbg.addr for all local variables"),
cl::init(false), cl::Hidden);
-DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes)
- : M(m), VMContext(M.getContext()), CUNode(nullptr),
- DeclareFn(nullptr), ValueFn(nullptr),
+DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU)
+ : M(m), VMContext(M.getContext()), CUNode(CU),
+ DeclareFn(nullptr), ValueFn(nullptr), LabelFn(nullptr),
AllowUnresolvedNodes(AllowUnresolvedNodes) {}
void DIBuilder::trackIfUnresolved(MDNode *N) {
@@ -46,18 +47,23 @@ void DIBuilder::trackIfUnresolved(MDNode *N) {
}
void DIBuilder::finalizeSubprogram(DISubprogram *SP) {
- MDTuple *Temp = SP->getVariables().get();
+ MDTuple *Temp = SP->getRetainedNodes().get();
if (!Temp || !Temp->isTemporary())
return;
- SmallVector<Metadata *, 4> Variables;
+ SmallVector<Metadata *, 16> RetainedNodes;
auto PV = PreservedVariables.find(SP);
if (PV != PreservedVariables.end())
- Variables.append(PV->second.begin(), PV->second.end());
+ RetainedNodes.append(PV->second.begin(), PV->second.end());
- DINodeArray AV = getOrCreateArray(Variables);
- TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
+ auto PL = PreservedLabels.find(SP);
+ if (PL != PreservedLabels.end())
+ RetainedNodes.append(PL->second.begin(), PL->second.end());
+
+ DINodeArray Node = getOrCreateArray(RetainedNodes);
+
+ TempMDTuple(Temp)->replaceAllUsesWith(Node.get());
}
void DIBuilder::finalize() {
@@ -204,8 +210,9 @@ DIImportedEntity *DIBuilder::createImportedDeclaration(DIScope *Context,
}
DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory,
- DIFile::ChecksumKind CSKind, StringRef Checksum) {
- return DIFile::get(VMContext, Filename, Directory, CSKind, Checksum);
+ Optional<DIFile::ChecksumInfo<StringRef>> CS,
+ Optional<StringRef> Source) {
+ return DIFile::get(VMContext, Filename, Directory, CS, Source);
}
DIMacro *DIBuilder::createMacro(DIMacroFile *Parent, unsigned LineNumber,
@@ -233,9 +240,10 @@ DIMacroFile *DIBuilder::createTempMacroFile(DIMacroFile *Parent,
return MF;
}
-DIEnumerator *DIBuilder::createEnumerator(StringRef Name, int64_t Val) {
+DIEnumerator *DIBuilder::createEnumerator(StringRef Name, int64_t Val,
+ bool IsUnsigned) {
assert(!Name.empty() && "Unable to create enumerator without name");
- return DIEnumerator::get(VMContext, Val, Name);
+ return DIEnumerator::get(VMContext, Val, IsUnsigned, Name);
}
DIBasicType *DIBuilder::createUnspecifiedType(StringRef Name) {
@@ -310,10 +318,14 @@ DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
uint64_t BaseOffset,
+ uint32_t VBPtrOffset,
DINode::DIFlags Flags) {
assert(Ty && "Unable to create inheritance");
+ Metadata *ExtraData = ConstantAsMetadata::get(
+ ConstantInt::get(IntegerType::get(VMContext, 32), VBPtrOffset));
return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
- 0, Ty, BaseTy, 0, 0, BaseOffset, None, Flags);
+ 0, Ty, BaseTy, 0, 0, BaseOffset, None,
+ Flags, ExtraData);
}
DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
@@ -333,6 +345,19 @@ static ConstantAsMetadata *getConstantOrNull(Constant *C) {
return nullptr;
}
+DIDerivedType *DIBuilder::createVariantMemberType(DIScope *Scope, StringRef Name,
+ DIFile *File, unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ uint64_t OffsetInBits,
+ Constant *Discriminant,
+ DINode::DIFlags Flags, DIType *Ty) {
+ return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
+ LineNumber, getNonCompileUnitScope(Scope), Ty,
+ SizeInBits, AlignInBits, OffsetInBits, None, Flags,
+ getConstantOrNull(Discriminant));
+}
+
DIDerivedType *DIBuilder::createBitFieldMemberType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t OffsetInBits, uint64_t StorageOffsetInBits,
@@ -458,6 +483,18 @@ DICompositeType *DIBuilder::createUnionType(
return R;
}
+DICompositeType *DIBuilder::createVariantPart(
+ DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, DINode::DIFlags Flags,
+ DIDerivedType *Discriminator, DINodeArray Elements, StringRef UniqueIdentifier) {
+ auto *R = DICompositeType::get(
+ VMContext, dwarf::DW_TAG_variant_part, Name, File, LineNumber,
+ getNonCompileUnitScope(Scope), nullptr, SizeInBits, AlignInBits, 0, Flags,
+ Elements, 0, nullptr, nullptr, UniqueIdentifier, Discriminator);
+ trackIfUnresolved(R);
+ return R;
+}
+
DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes,
DINode::DIFlags Flags,
unsigned CC) {
@@ -467,11 +504,12 @@ DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes,
DICompositeType *DIBuilder::createEnumerationType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint32_t AlignInBits, DINodeArray Elements,
- DIType *UnderlyingType, StringRef UniqueIdentifier) {
+ DIType *UnderlyingType, StringRef UniqueIdentifier, bool IsFixed) {
auto *CTy = DICompositeType::get(
VMContext, dwarf::DW_TAG_enumeration_type, Name, File, LineNumber,
getNonCompileUnitScope(Scope), UnderlyingType, SizeInBits, AlignInBits, 0,
- DINode::FlagZero, Elements, 0, nullptr, nullptr, UniqueIdentifier);
+ IsFixed ? DINode::FlagFixedEnum : DINode::FlagZero, Elements, 0, nullptr,
+ nullptr, UniqueIdentifier);
AllEnumTypes.push_back(CTy);
trackIfUnresolved(CTy);
return CTy;
@@ -497,10 +535,14 @@ DICompositeType *DIBuilder::createVectorType(uint64_t Size,
return R;
}
-static DIType *createTypeWithFlags(LLVMContext &Context, DIType *Ty,
+DISubprogram *DIBuilder::createArtificialSubprogram(DISubprogram *SP) {
+ auto NewSP = SP->cloneWithFlags(SP->getFlags() | DINode::FlagArtificial);
+ return MDNode::replaceWithDistinct(std::move(NewSP));
+}
+
+static DIType *createTypeWithFlags(const DIType *Ty,
DINode::DIFlags FlagsToSet) {
- auto NewTy = Ty->clone();
- NewTy->setFlags(NewTy->getFlags() | FlagsToSet);
+ auto NewTy = Ty->cloneWithFlags(Ty->getFlags() | FlagsToSet);
return MDNode::replaceWithUniqued(std::move(NewTy));
}
@@ -508,7 +550,7 @@ DIType *DIBuilder::createArtificialType(DIType *Ty) {
// FIXME: Restrict this to the nodes where it's valid.
if (Ty->isArtificial())
return Ty;
- return createTypeWithFlags(VMContext, Ty, DINode::FlagArtificial);
+ return createTypeWithFlags(Ty, DINode::FlagArtificial);
}
DIType *DIBuilder::createObjectPointerType(DIType *Ty) {
@@ -516,7 +558,7 @@ DIType *DIBuilder::createObjectPointerType(DIType *Ty) {
if (Ty->isObjectPointer())
return Ty;
DINode::DIFlags Flags = DINode::FlagObjectPointer | DINode::FlagArtificial;
- return createTypeWithFlags(VMContext, Ty, Flags);
+ return createTypeWithFlags(Ty, Flags);
}
void DIBuilder::retainType(DIScope *T) {
@@ -582,6 +624,10 @@ DISubrange *DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
return DISubrange::get(VMContext, Count, Lo);
}
+DISubrange *DIBuilder::getOrCreateSubrange(int64_t Lo, Metadata *CountNode) {
+ return DISubrange::get(VMContext, CountNode, Lo);
+}
+
static void checkGlobalVariableScope(DIScope *Context) {
#ifndef NDEBUG
if (auto *CT =
@@ -666,6 +712,26 @@ DILocalVariable *DIBuilder::createParameterVariable(
/* AlignInBits */0);
}
+DILabel *DIBuilder::createLabel(
+ DIScope *Scope, StringRef Name, DIFile *File,
+ unsigned LineNo, bool AlwaysPreserve) {
+ DIScope *Context = getNonCompileUnitScope(Scope);
+
+ auto *Node =
+ DILabel::get(VMContext, cast_or_null<DILocalScope>(Context), Name,
+ File, LineNo);
+
+ if (AlwaysPreserve) {
+    /// The optimizer may remove labels. If there is an interest in
+    /// preserving label info in such a situation, then append it to the
+    /// list of retained nodes of the DISubprogram.
+ DISubprogram *Fn = getDISubprogram(Scope);
+ assert(Fn && "Missing subprogram for label");
+ PreservedLabels[Fn].emplace_back(Node);
+ }
+ return Node;
+}
+
DIExpression *DIBuilder::createExpression(ArrayRef<uint64_t> Addr) {
return DIExpression::get(VMContext, Addr);
}
@@ -788,6 +854,18 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
return insertDeclare(Storage, VarInfo, Expr, DL, InsertAtEnd, InsertBefore);
}
+Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+ Instruction *InsertBefore) {
+ return insertLabel(
+ LabelInfo, DL, InsertBefore ? InsertBefore->getParent() : nullptr,
+ InsertBefore);
+}
+
+Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+ BasicBlock *InsertAtEnd) {
+ return insertLabel(LabelInfo, DL, InsertAtEnd, nullptr);
+}
+
Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V,
DILocalVariable *VarInfo,
DIExpression *Expr,
@@ -873,6 +951,24 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(
return B.CreateCall(ValueFn, Args);
}
+Instruction *DIBuilder::insertLabel(
+ DILabel *LabelInfo, const DILocation *DL,
+ BasicBlock *InsertBB, Instruction *InsertBefore) {
+ assert(LabelInfo && "empty or invalid DILabel* passed to dbg.label");
+ assert(DL && "Expected debug loc");
+ assert(DL->getScope()->getSubprogram() ==
+ LabelInfo->getScope()->getSubprogram() &&
+ "Expected matching subprograms");
+ if (!LabelFn)
+ LabelFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_label);
+
+ trackIfUnresolved(LabelInfo);
+ Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
+
+ IRBuilder<> B = getIRBForDbgInsertion(DL, InsertBB, InsertBefore);
+ return B.CreateCall(LabelFn, Args);
+}
+
void DIBuilder::replaceVTableHolder(DICompositeType *&T,
DIType *VTableHolder) {
{
diff --git a/contrib/llvm/lib/IR/DataLayout.cpp b/contrib/llvm/lib/IR/DataLayout.cpp
index f4dddeb30d0b..62c67127276e 100644
--- a/contrib/llvm/lib/IR/DataLayout.cpp
+++ b/contrib/llvm/lib/IR/DataLayout.cpp
@@ -129,13 +129,15 @@ LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const {
PointerAlignElem
PointerAlignElem::get(uint32_t AddressSpace, unsigned ABIAlign,
- unsigned PrefAlign, uint32_t TypeByteWidth) {
+ unsigned PrefAlign, uint32_t TypeByteWidth,
+ uint32_t IndexWidth) {
assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!");
PointerAlignElem retval;
retval.AddressSpace = AddressSpace;
retval.ABIAlign = ABIAlign;
retval.PrefAlign = PrefAlign;
retval.TypeByteWidth = TypeByteWidth;
+ retval.IndexWidth = IndexWidth;
return retval;
}
@@ -144,7 +146,8 @@ PointerAlignElem::operator==(const PointerAlignElem &rhs) const {
return (ABIAlign == rhs.ABIAlign
&& AddressSpace == rhs.AddressSpace
&& PrefAlign == rhs.PrefAlign
- && TypeByteWidth == rhs.TypeByteWidth);
+ && TypeByteWidth == rhs.TypeByteWidth
+ && IndexWidth == rhs.IndexWidth);
}
//===----------------------------------------------------------------------===//
@@ -181,6 +184,7 @@ void DataLayout::reset(StringRef Desc) {
BigEndian = false;
AllocaAddrSpace = 0;
StackNaturalAlign = 0;
+ ProgramAddrSpace = 0;
ManglingMode = MM_None;
NonIntegralAddressSpaces.clear();
@@ -189,7 +193,7 @@ void DataLayout::reset(StringRef Desc) {
setAlignment((AlignTypeEnum)E.AlignType, E.ABIAlign, E.PrefAlign,
E.TypeBitWidth);
}
- setPointerAlignment(0, 8, 8, 8);
+ setPointerAlignment(0, 8, 8, 8, 8);
parseSpecifier(Desc);
}
@@ -221,6 +225,13 @@ static unsigned inBytes(unsigned Bits) {
return Bits / 8;
}
+static unsigned getAddrSpace(StringRef R) {
+ unsigned AddrSpace = getInt(R);
+ if (!isUInt<24>(AddrSpace))
+ report_fatal_error("Invalid address space, must be a 24-bit integer");
+ return AddrSpace;
+}
+
void DataLayout::parseSpecifier(StringRef Desc) {
StringRepresentation = Desc;
while (!Desc.empty()) {
@@ -287,6 +298,10 @@ void DataLayout::parseSpecifier(StringRef Desc) {
report_fatal_error(
"Pointer ABI alignment must be a power of 2");
+      // Size of the index used in GEP for address calculation.
+      // The parameter is optional; by default it equals the pointer size.
+ unsigned IndexSize = PointerMemSize;
+
// Preferred alignment.
unsigned PointerPrefAlign = PointerABIAlign;
if (!Rest.empty()) {
@@ -295,10 +310,17 @@ void DataLayout::parseSpecifier(StringRef Desc) {
if (!isPowerOf2_64(PointerPrefAlign))
report_fatal_error(
"Pointer preferred alignment must be a power of 2");
- }
+ // Now read the index. It is the second optional parameter here.
+ if (!Rest.empty()) {
+ Split = split(Rest, ':');
+ IndexSize = inBytes(getInt(Tok));
+ if (!IndexSize)
+ report_fatal_error("Invalid index size of 0 bytes");
+ }
+ }
setPointerAlignment(AddrSpace, PointerABIAlign, PointerPrefAlign,
- PointerMemSize);
+ PointerMemSize, IndexSize);
break;
}
case 'i':
@@ -358,10 +380,12 @@ void DataLayout::parseSpecifier(StringRef Desc) {
StackNaturalAlign = inBytes(getInt(Tok));
break;
}
+ case 'P': { // Function address space.
+ ProgramAddrSpace = getAddrSpace(Tok);
+ break;
+ }
case 'A': { // Default stack/alloca address space.
- AllocaAddrSpace = getInt(Tok);
- if (!isUInt<24>(AllocaAddrSpace))
- report_fatal_error("Invalid address space, must be a 24bit integer");
+ AllocaAddrSpace = getAddrSpace(Tok);
break;
}
case 'm':
@@ -408,6 +432,7 @@ bool DataLayout::operator==(const DataLayout &Other) const {
bool Ret = BigEndian == Other.BigEndian &&
AllocaAddrSpace == Other.AllocaAddrSpace &&
StackNaturalAlign == Other.StackNaturalAlign &&
+ ProgramAddrSpace == Other.ProgramAddrSpace &&
ManglingMode == Other.ManglingMode &&
LegalIntWidths == Other.LegalIntWidths &&
Alignments == Other.Alignments && Pointers == Other.Pointers;
@@ -467,8 +492,8 @@ DataLayout::findPointerLowerBound(uint32_t AddressSpace) {
}
void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
- unsigned PrefAlign,
- uint32_t TypeByteWidth) {
+ unsigned PrefAlign, uint32_t TypeByteWidth,
+ uint32_t IndexWidth) {
if (PrefAlign < ABIAlign)
report_fatal_error(
"Preferred alignment cannot be less than the ABI alignment");
@@ -476,11 +501,12 @@ void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
PointersTy::iterator I = findPointerLowerBound(AddrSpace);
if (I == Pointers.end() || I->AddressSpace != AddrSpace) {
Pointers.insert(I, PointerAlignElem::get(AddrSpace, ABIAlign, PrefAlign,
- TypeByteWidth));
+ TypeByteWidth, IndexWidth));
} else {
I->ABIAlign = ABIAlign;
I->PrefAlign = PrefAlign;
I->TypeByteWidth = TypeByteWidth;
+ I->IndexWidth = IndexWidth;
}
}
@@ -570,10 +596,8 @@ const StructLayout *DataLayout::getStructLayout(StructType *Ty) const {
// Otherwise, create the struct layout. Because it is variable length, we
// malloc it, then use placement new.
int NumElts = Ty->getNumElements();
- StructLayout *L =
- (StructLayout *)malloc(sizeof(StructLayout)+(NumElts-1) * sizeof(uint64_t));
- if (L == nullptr)
- report_bad_alloc_error("Allocation of StructLayout elements failed.");
+ StructLayout *L = (StructLayout *)
+ safe_malloc(sizeof(StructLayout)+(NumElts-1) * sizeof(uint64_t));
// Set SL before calling StructLayout's ctor. The ctor could cause other
// entries to be added to TheMap, invalidating our reference.
@@ -618,6 +642,22 @@ unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const {
return getPointerSizeInBits(cast<PointerType>(Ty)->getAddressSpace());
}
+unsigned DataLayout::getIndexSize(unsigned AS) const {
+ PointersTy::const_iterator I = findPointerLowerBound(AS);
+ if (I == Pointers.end() || I->AddressSpace != AS) {
+ I = findPointerLowerBound(0);
+ assert(I->AddressSpace == 0);
+ }
+ return I->IndexWidth;
+}
+
+unsigned DataLayout::getIndexTypeSizeInBits(Type *Ty) const {
+ assert(Ty->isPtrOrPtrVectorTy() &&
+ "This should only be called with a pointer or pointer vector type");
+ Ty = Ty->getScalarType();
+ return getIndexSizeInBits(cast<PointerType>(Ty)->getAddressSpace());
+}
+
/*!
\param abi_or_pref Flag that determines which alignment is returned. true
returns the ABI alignment, false returns the preferred alignment.
@@ -701,13 +741,13 @@ unsigned DataLayout::getPreferredTypeAlignmentShift(Type *Ty) const {
IntegerType *DataLayout::getIntPtrType(LLVMContext &C,
unsigned AddressSpace) const {
- return IntegerType::get(C, getPointerSizeInBits(AddressSpace));
+ return IntegerType::get(C, getIndexSizeInBits(AddressSpace));
}
Type *DataLayout::getIntPtrType(Type *Ty) const {
assert(Ty->isPtrOrPtrVectorTy() &&
"Expected a pointer or pointer vector type.");
- unsigned NumBits = getPointerTypeSizeInBits(Ty);
+ unsigned NumBits = getIndexTypeSizeInBits(Ty);
IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
return VectorType::get(IntTy, VecTy->getNumElements());
@@ -726,6 +766,16 @@ unsigned DataLayout::getLargestLegalIntTypeSizeInBits() const {
return Max != LegalIntWidths.end() ? *Max : 0;
}
+Type *DataLayout::getIndexType(Type *Ty) const {
+ assert(Ty->isPtrOrPtrVectorTy() &&
+ "Expected a pointer or pointer vector type.");
+ unsigned NumBits = getIndexTypeSizeInBits(Ty);
+ IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+ return VectorType::get(IntTy, VecTy->getNumElements());
+ return IntTy;
+}
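/* The optional fourth field of the "p" datalayout specifier sets the GEP
 * index width separately from the pointer size; getIntPtrType and the
 * getIndexType helper above then report that width.  A quick sketch through
 * the C API (LLVMCreateTargetData is from llvm-c/Target.h), not part of this
 * change. */
#include <llvm-c/Target.h>

static LLVMTargetDataRef fat_pointer_layout(void) {
  /* p:<size>:<abi>:<pref>:<idx> -- 64-bit pointers indexed with 32 bits. */
  return LLVMCreateTargetData("e-p:64:64:64:32-i64:64-n32:64");
}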
+
int64_t DataLayout::getIndexedOffsetInType(Type *ElemTy,
ArrayRef<Value *> Indices) const {
int64_t Result = 0;
diff --git a/contrib/llvm/lib/IR/DebugInfo.cpp b/contrib/llvm/lib/IR/DebugInfo.cpp
index 7fff7526b926..77585ee30cd8 100644
--- a/contrib/llvm/lib/IR/DebugInfo.cpp
+++ b/contrib/llvm/lib/IR/DebugInfo.cpp
@@ -61,49 +61,60 @@ void DebugInfoFinder::reset() {
}
void DebugInfoFinder::processModule(const Module &M) {
- for (auto *CU : M.debug_compile_units()) {
- addCompileUnit(CU);
- for (auto DIG : CU->getGlobalVariables()) {
- if (!addGlobalVariable(DIG))
- continue;
- auto *GV = DIG->getVariable();
- processScope(GV->getScope());
- processType(GV->getType().resolve());
- }
- for (auto *ET : CU->getEnumTypes())
- processType(ET);
- for (auto *RT : CU->getRetainedTypes())
- if (auto *T = dyn_cast<DIType>(RT))
- processType(T);
- else
- processSubprogram(cast<DISubprogram>(RT));
- for (auto *Import : CU->getImportedEntities()) {
- auto *Entity = Import->getEntity().resolve();
- if (auto *T = dyn_cast<DIType>(Entity))
- processType(T);
- else if (auto *SP = dyn_cast<DISubprogram>(Entity))
- processSubprogram(SP);
- else if (auto *NS = dyn_cast<DINamespace>(Entity))
- processScope(NS->getScope());
- else if (auto *M = dyn_cast<DIModule>(Entity))
- processScope(M->getScope());
- }
- }
+ for (auto *CU : M.debug_compile_units())
+ processCompileUnit(CU);
for (auto &F : M.functions()) {
if (auto *SP = cast_or_null<DISubprogram>(F.getSubprogram()))
processSubprogram(SP);
// There could be subprograms from inlined functions referenced from
// instructions only. Walk the function to find them.
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
- if (!I.getDebugLoc())
- continue;
- processLocation(M, I.getDebugLoc().get());
- }
- }
+ for (const BasicBlock &BB : F)
+ for (const Instruction &I : BB)
+ processInstruction(M, I);
}
}
+void DebugInfoFinder::processCompileUnit(DICompileUnit *CU) {
+ if (!addCompileUnit(CU))
+ return;
+ for (auto DIG : CU->getGlobalVariables()) {
+ if (!addGlobalVariable(DIG))
+ continue;
+ auto *GV = DIG->getVariable();
+ processScope(GV->getScope());
+ processType(GV->getType().resolve());
+ }
+ for (auto *ET : CU->getEnumTypes())
+ processType(ET);
+ for (auto *RT : CU->getRetainedTypes())
+ if (auto *T = dyn_cast<DIType>(RT))
+ processType(T);
+ else
+ processSubprogram(cast<DISubprogram>(RT));
+ for (auto *Import : CU->getImportedEntities()) {
+ auto *Entity = Import->getEntity().resolve();
+ if (auto *T = dyn_cast<DIType>(Entity))
+ processType(T);
+ else if (auto *SP = dyn_cast<DISubprogram>(Entity))
+ processSubprogram(SP);
+ else if (auto *NS = dyn_cast<DINamespace>(Entity))
+ processScope(NS->getScope());
+ else if (auto *M = dyn_cast<DIModule>(Entity))
+ processScope(M->getScope());
+ }
+}
+
+void DebugInfoFinder::processInstruction(const Module &M,
+ const Instruction &I) {
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(&I))
+ processDeclare(M, DDI);
+ else if (auto *DVI = dyn_cast<DbgValueInst>(&I))
+ processValue(M, DVI);
+
+ if (auto DbgLoc = I.getDebugLoc())
+ processLocation(M, DbgLoc.get());
+}
+
void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
if (!Loc)
return;
@@ -165,6 +176,15 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) {
if (!addSubprogram(SP))
return;
processScope(SP->getScope().resolve());
+  // Some users, e.g. CloneFunctionInto / CloneModule, need to set up a
+  // ValueMap containing identity mappings for all of the DICompileUnits (not
+  // just DISubprograms) referenced from anywhere within the Function being
+  // cloned, prior to calling MapMetadata / RemapInstruction, to avoid
+  // duplicating them later, since DICompileUnits are also directly referenced
+  // by the llvm.dbg.cu list. Therefore we need to collect DICompileUnits here
+  // as well. Also, DICompileUnits may reference DISubprograms too and
+  // therefore need to be at least looked through.
+ processCompileUnit(SP->getUnit());
processType(SP->getType());
for (auto *Element : SP->getTemplateParams()) {
if (auto *TType = dyn_cast<DITemplateTypeParameter>(Element)) {
@@ -293,7 +313,7 @@ static MDNode *stripDebugLocFromLoopID(MDNode *N) {
bool llvm::stripDebugInfo(Function &F) {
bool Changed = false;
- if (F.getMetadata(LLVMContext::MD_dbg)) {
+ if (F.hasMetadata(LLVMContext::MD_dbg)) {
Changed = true;
F.setSubprogram(nullptr);
}
@@ -349,12 +369,7 @@ bool llvm::StripDebugInfo(Module &M) {
Changed |= stripDebugInfo(F);
for (auto &GV : M.globals()) {
- SmallVector<MDNode *, 1> MDs;
- GV.getMetadata(LLVMContext::MD_dbg, MDs);
- if (!MDs.empty()) {
- GV.eraseMetadata(LLVMContext::MD_dbg);
- Changed = true;
- }
+ Changed |= GV.eraseMetadata(LLVMContext::MD_dbg);
}
if (GVMaterializer *Materializer = M.getMaterializer())
@@ -548,7 +563,7 @@ void DebugTypeInfoRemoval::traverse(MDNode *N) {
// parts of the graph.
auto prune = [](MDNode *Parent, MDNode *Child) {
if (auto *MDS = dyn_cast<DISubprogram>(Parent))
- return Child == MDS->getVariables().get();
+ return Child == MDS->getRetainedNodes().get();
return false;
};
@@ -654,10 +669,10 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
SmallVector<MDNode *, 8> Ops;
for (MDNode *Op : NMD.operands())
Ops.push_back(remap(Op));
-
+
if (!Changed)
continue;
-
+
NMD.clearOperands();
for (auto *Op : Ops)
if (Op)
@@ -675,7 +690,8 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
void Instruction::applyMergedLocation(const DILocation *LocA,
const DILocation *LocB) {
- setDebugLoc(DILocation::getMergedLocation(LocA, LocB, this));
+ setDebugLoc(DILocation::getMergedLocation(LocA, LocB,
+ DILocation::WithGeneratedLocation));
}
//===----------------------------------------------------------------------===//
@@ -692,6 +708,18 @@ case LLVMDWARFSourceLanguage##NAME: return ID;
llvm_unreachable("Unhandled Tag");
}
+template <typename DIT> DIT *unwrapDI(LLVMMetadataRef Ref) {
+ return (DIT *)(Ref ? unwrap<MDNode>(Ref) : nullptr);
+}
+
+static DINode::DIFlags map_from_llvmDIFlags(LLVMDIFlags Flags) {
+ return static_cast<DINode::DIFlags>(Flags);
+}
+
+static LLVMDIFlags map_to_llvmDIFlags(DINode::DIFlags Flags) {
+ return static_cast<LLVMDIFlags>(Flags);
+}
+
unsigned LLVMDebugMetadataVersion() {
return DEBUG_METADATA_VERSION;
}
@@ -727,7 +755,7 @@ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(
unsigned RuntimeVer, const char *SplitName, size_t SplitNameLen,
LLVMDWARFEmissionKind Kind, unsigned DWOId, LLVMBool SplitDebugInlining,
LLVMBool DebugInfoForProfiling) {
- auto File = unwrap<DIFile>(FileRef);
+ auto File = unwrapDI<DIFile>(FileRef);
return wrap(unwrap(Builder)->createCompileUnit(
map_from_llvmDWARFsourcelanguage(Lang), File,
@@ -747,9 +775,581 @@ LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename,
}
LLVMMetadataRef
+LLVMDIBuilderCreateModule(LLVMDIBuilderRef Builder, LLVMMetadataRef ParentScope,
+ const char *Name, size_t NameLen,
+ const char *ConfigMacros, size_t ConfigMacrosLen,
+ const char *IncludePath, size_t IncludePathLen,
+ const char *ISysRoot, size_t ISysRootLen) {
+ return wrap(unwrap(Builder)->createModule(
+ unwrapDI<DIScope>(ParentScope), StringRef(Name, NameLen),
+ StringRef(ConfigMacros, ConfigMacrosLen),
+ StringRef(IncludePath, IncludePathLen),
+ StringRef(ISysRoot, ISysRootLen)));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateNameSpace(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef ParentScope,
+ const char *Name, size_t NameLen,
+ LLVMBool ExportSymbols) {
+ return wrap(unwrap(Builder)->createNameSpace(
+ unwrapDI<DIScope>(ParentScope), StringRef(Name, NameLen), ExportSymbols));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateFunction(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, const char *LinkageName, size_t LinkageNameLen,
+ LLVMMetadataRef File, unsigned LineNo, LLVMMetadataRef Ty,
+ LLVMBool IsLocalToUnit, LLVMBool IsDefinition,
+ unsigned ScopeLine, LLVMDIFlags Flags, LLVMBool IsOptimized) {
+ return wrap(unwrap(Builder)->createFunction(
+ unwrapDI<DIScope>(Scope), {Name, NameLen}, {LinkageName, LinkageNameLen},
+ unwrapDI<DIFile>(File), LineNo, unwrapDI<DISubroutineType>(Ty),
+ IsLocalToUnit, IsDefinition, ScopeLine, map_from_llvmDIFlags(Flags),
+ IsOptimized, nullptr, nullptr, nullptr));
+}
+
+
+LLVMMetadataRef LLVMDIBuilderCreateLexicalBlock(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope,
+ LLVMMetadataRef File, unsigned Line, unsigned Col) {
+ return wrap(unwrap(Builder)->createLexicalBlock(unwrapDI<DIScope>(Scope),
+ unwrapDI<DIFile>(File),
+ Line, Col));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef File,
+ unsigned Discriminator) {
+ return wrap(unwrap(Builder)->createLexicalBlockFile(unwrapDI<DIScope>(Scope),
+ unwrapDI<DIFile>(File),
+ Discriminator));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedModuleFromNamespace(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef NS,
+ LLVMMetadataRef File,
+ unsigned Line) {
+ return wrap(unwrap(Builder)->createImportedModule(unwrapDI<DIScope>(Scope),
+ unwrapDI<DINamespace>(NS),
+ unwrapDI<DIFile>(File),
+ Line));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedModuleFromAlias(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef ImportedEntity,
+ LLVMMetadataRef File,
+ unsigned Line) {
+ return wrap(unwrap(Builder)->createImportedModule(
+ unwrapDI<DIScope>(Scope),
+ unwrapDI<DIImportedEntity>(ImportedEntity),
+ unwrapDI<DIFile>(File), Line));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedModuleFromModule(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef M,
+ LLVMMetadataRef File,
+ unsigned Line) {
+ return wrap(unwrap(Builder)->createImportedModule(unwrapDI<DIScope>(Scope),
+ unwrapDI<DIModule>(M),
+ unwrapDI<DIFile>(File),
+ Line));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateImportedDeclaration(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ LLVMMetadataRef Decl,
+ LLVMMetadataRef File,
+ unsigned Line,
+ const char *Name, size_t NameLen) {
+ return wrap(unwrap(Builder)->createImportedDeclaration(
+ unwrapDI<DIScope>(Scope),
+ unwrapDI<DINode>(Decl),
+ unwrapDI<DIFile>(File), Line, {Name, NameLen}));
+}
+
+LLVMMetadataRef
LLVMDIBuilderCreateDebugLocation(LLVMContextRef Ctx, unsigned Line,
unsigned Column, LLVMMetadataRef Scope,
LLVMMetadataRef InlinedAt) {
return wrap(DILocation::get(*unwrap(Ctx), Line, Column, unwrap(Scope),
unwrap(InlinedAt)));
}
+
+unsigned LLVMDILocationGetLine(LLVMMetadataRef Location) {
+ return unwrapDI<DILocation>(Location)->getLine();
+}
+
+unsigned LLVMDILocationGetColumn(LLVMMetadataRef Location) {
+ return unwrapDI<DILocation>(Location)->getColumn();
+}
+
+LLVMMetadataRef LLVMDILocationGetScope(LLVMMetadataRef Location) {
+ return wrap(unwrapDI<DILocation>(Location)->getScope());
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateEnumerationType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, LLVMMetadataRef *Elements,
+ unsigned NumElements, LLVMMetadataRef ClassTy) {
+  auto Elts = unwrap(Builder)->getOrCreateArray({unwrap(Elements),
+                                                 NumElements});
+  return wrap(unwrap(Builder)->createEnumerationType(
+      unwrapDI<DIScope>(Scope), {Name, NameLen}, unwrapDI<DIFile>(File),
+      LineNumber, SizeInBits, AlignInBits, Elts, unwrapDI<DIType>(ClassTy)));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateUnionType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef *Elements, unsigned NumElements, unsigned RunTimeLang,
+ const char *UniqueId, size_t UniqueIdLen) {
+ auto Elts = unwrap(Builder)->getOrCreateArray({unwrap(Elements),
+ NumElements});
+ return wrap(unwrap(Builder)->createUnionType(
+ unwrapDI<DIScope>(Scope), {Name, NameLen}, unwrapDI<DIFile>(File),
+ LineNumber, SizeInBits, AlignInBits, map_from_llvmDIFlags(Flags),
+ Elts, RunTimeLang, {UniqueId, UniqueIdLen}));
+}
+
+
+LLVMMetadataRef
+LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef Builder, uint64_t Size,
+ uint32_t AlignInBits, LLVMMetadataRef Ty,
+ LLVMMetadataRef *Subscripts,
+ unsigned NumSubscripts) {
+ auto Subs = unwrap(Builder)->getOrCreateArray({unwrap(Subscripts),
+ NumSubscripts});
+ return wrap(unwrap(Builder)->createArrayType(Size, AlignInBits,
+ unwrapDI<DIType>(Ty), Subs));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateVectorType(LLVMDIBuilderRef Builder, uint64_t Size,
+ uint32_t AlignInBits, LLVMMetadataRef Ty,
+ LLVMMetadataRef *Subscripts,
+ unsigned NumSubscripts) {
+ auto Subs = unwrap(Builder)->getOrCreateArray({unwrap(Subscripts),
+ NumSubscripts});
+ return wrap(unwrap(Builder)->createVectorType(Size, AlignInBits,
+ unwrapDI<DIType>(Ty), Subs));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef Builder, const char *Name,
+ size_t NameLen, uint64_t SizeInBits,
+ LLVMDWARFTypeEncoding Encoding) {
+ return wrap(unwrap(Builder)->createBasicType({Name, NameLen},
+ SizeInBits, Encoding));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreatePointerType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef PointeeTy,
+ uint64_t SizeInBits, uint32_t AlignInBits, unsigned AddressSpace,
+ const char *Name, size_t NameLen) {
+ return wrap(unwrap(Builder)->createPointerType(unwrapDI<DIType>(PointeeTy),
+ SizeInBits, AlignInBits,
+ AddressSpace, {Name, NameLen}));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateStructType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef DerivedFrom, LLVMMetadataRef *Elements,
+ unsigned NumElements, unsigned RunTimeLang, LLVMMetadataRef VTableHolder,
+ const char *UniqueId, size_t UniqueIdLen) {
+ auto Elts = unwrap(Builder)->getOrCreateArray({unwrap(Elements),
+ NumElements});
+ return wrap(unwrap(Builder)->createStructType(
+ unwrapDI<DIScope>(Scope), {Name, NameLen}, unwrapDI<DIFile>(File),
+ LineNumber, SizeInBits, AlignInBits, map_from_llvmDIFlags(Flags),
+ unwrapDI<DIType>(DerivedFrom), Elts, RunTimeLang,
+ unwrapDI<DIType>(VTableHolder), {UniqueId, UniqueIdLen}));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateMemberType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNo, uint64_t SizeInBits,
+ uint32_t AlignInBits, uint64_t OffsetInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef Ty) {
+ return wrap(unwrap(Builder)->createMemberType(unwrapDI<DIScope>(Scope),
+ {Name, NameLen}, unwrapDI<DIFile>(File), LineNo, SizeInBits, AlignInBits,
+ OffsetInBits, map_from_llvmDIFlags(Flags), unwrapDI<DIType>(Ty)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateUnspecifiedType(LLVMDIBuilderRef Builder, const char *Name,
+ size_t NameLen) {
+ return wrap(unwrap(Builder)->createUnspecifiedType({Name, NameLen}));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateStaticMemberType(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNumber,
+ LLVMMetadataRef Type, LLVMDIFlags Flags, LLVMValueRef ConstantVal,
+ uint32_t AlignInBits) {
+ return wrap(unwrap(Builder)->createStaticMemberType(
+ unwrapDI<DIScope>(Scope), {Name, NameLen},
+ unwrapDI<DIFile>(File), LineNumber, unwrapDI<DIType>(Type),
+ map_from_llvmDIFlags(Flags), unwrap<Constant>(ConstantVal),
+ AlignInBits));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateObjCIVar(LLVMDIBuilderRef Builder,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNo,
+ uint64_t SizeInBits, uint32_t AlignInBits,
+ uint64_t OffsetInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef Ty, LLVMMetadataRef PropertyNode) {
+ return wrap(unwrap(Builder)->createObjCIVar(
+ {Name, NameLen}, unwrapDI<DIFile>(File), LineNo,
+ SizeInBits, AlignInBits, OffsetInBits,
+ map_from_llvmDIFlags(Flags), unwrapDI<DIType>(Ty),
+ unwrapDI<MDNode>(PropertyNode)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateObjCProperty(LLVMDIBuilderRef Builder,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNo,
+ const char *GetterName, size_t GetterNameLen,
+ const char *SetterName, size_t SetterNameLen,
+ unsigned PropertyAttributes,
+ LLVMMetadataRef Ty) {
+ return wrap(unwrap(Builder)->createObjCProperty(
+ {Name, NameLen}, unwrapDI<DIFile>(File), LineNo,
+ {GetterName, GetterNameLen}, {SetterName, SetterNameLen},
+ PropertyAttributes, unwrapDI<DIType>(Ty)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Type) {
+ return wrap(unwrap(Builder)->createObjectPointerType(unwrapDI<DIType>(Type)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Builder, LLVMMetadataRef Type,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNo,
+ LLVMMetadataRef Scope) {
+ return wrap(unwrap(Builder)->createTypedef(
+ unwrapDI<DIType>(Type), {Name, NameLen},
+ unwrapDI<DIFile>(File), LineNo,
+ unwrapDI<DIScope>(Scope)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateInheritance(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Ty, LLVMMetadataRef BaseTy,
+ uint64_t BaseOffset, uint32_t VBPtrOffset,
+ LLVMDIFlags Flags) {
+ return wrap(unwrap(Builder)->createInheritance(
+ unwrapDI<DIType>(Ty), unwrapDI<DIType>(BaseTy),
+ BaseOffset, VBPtrOffset, map_from_llvmDIFlags(Flags)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateForwardDecl(
+ LLVMDIBuilderRef Builder, unsigned Tag, const char *Name,
+ size_t NameLen, LLVMMetadataRef Scope, LLVMMetadataRef File, unsigned Line,
+ unsigned RuntimeLang, uint64_t SizeInBits, uint32_t AlignInBits,
+ const char *UniqueIdentifier, size_t UniqueIdentifierLen) {
+ return wrap(unwrap(Builder)->createForwardDecl(
+ Tag, {Name, NameLen}, unwrapDI<DIScope>(Scope),
+ unwrapDI<DIFile>(File), Line, RuntimeLang, SizeInBits,
+ AlignInBits, {UniqueIdentifier, UniqueIdentifierLen}));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateReplaceableCompositeType(
+ LLVMDIBuilderRef Builder, unsigned Tag, const char *Name,
+ size_t NameLen, LLVMMetadataRef Scope, LLVMMetadataRef File, unsigned Line,
+ unsigned RuntimeLang, uint64_t SizeInBits, uint32_t AlignInBits,
+ LLVMDIFlags Flags, const char *UniqueIdentifier,
+ size_t UniqueIdentifierLen) {
+ return wrap(unwrap(Builder)->createReplaceableCompositeType(
+ Tag, {Name, NameLen}, unwrapDI<DIScope>(Scope),
+ unwrapDI<DIFile>(File), Line, RuntimeLang, SizeInBits,
+ AlignInBits, map_from_llvmDIFlags(Flags),
+ {UniqueIdentifier, UniqueIdentifierLen}));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateQualifiedType(LLVMDIBuilderRef Builder, unsigned Tag,
+ LLVMMetadataRef Type) {
+ return wrap(unwrap(Builder)->createQualifiedType(Tag,
+ unwrapDI<DIType>(Type)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateReferenceType(LLVMDIBuilderRef Builder, unsigned Tag,
+ LLVMMetadataRef Type) {
+ return wrap(unwrap(Builder)->createReferenceType(Tag,
+ unwrapDI<DIType>(Type)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateNullPtrType(LLVMDIBuilderRef Builder) {
+ return wrap(unwrap(Builder)->createNullPtrType());
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateMemberPointerType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef PointeeType,
+ LLVMMetadataRef ClassType,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ LLVMDIFlags Flags) {
+ return wrap(unwrap(Builder)->createMemberPointerType(
+ unwrapDI<DIType>(PointeeType),
+ unwrapDI<DIType>(ClassType), AlignInBits, SizeInBits,
+ map_from_llvmDIFlags(Flags)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateBitFieldMemberType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t OffsetInBits,
+ uint64_t StorageOffsetInBits,
+ LLVMDIFlags Flags, LLVMMetadataRef Type) {
+ return wrap(unwrap(Builder)->createBitFieldMemberType(
+ unwrapDI<DIScope>(Scope), {Name, NameLen},
+ unwrapDI<DIFile>(File), LineNumber,
+ SizeInBits, OffsetInBits, StorageOffsetInBits,
+ map_from_llvmDIFlags(Flags), unwrapDI<DIType>(Type)));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateClassType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope, const char *Name, size_t NameLen,
+ LLVMMetadataRef File, unsigned LineNumber, uint64_t SizeInBits,
+ uint32_t AlignInBits, uint64_t OffsetInBits, LLVMDIFlags Flags,
+ LLVMMetadataRef DerivedFrom,
+ LLVMMetadataRef *Elements, unsigned NumElements,
+ LLVMMetadataRef VTableHolder, LLVMMetadataRef TemplateParamsNode,
+ const char *UniqueIdentifier, size_t UniqueIdentifierLen) {
+ auto Elts = unwrap(Builder)->getOrCreateArray({unwrap(Elements),
+ NumElements});
+ return wrap(unwrap(Builder)->createClassType(
+ unwrapDI<DIScope>(Scope), {Name, NameLen},
+ unwrapDI<DIFile>(File), LineNumber,
+ SizeInBits, AlignInBits, OffsetInBits,
+ map_from_llvmDIFlags(Flags), unwrapDI<DIType>(DerivedFrom),
+ Elts, unwrapDI<DIType>(VTableHolder),
+ unwrapDI<MDNode>(TemplateParamsNode),
+ {UniqueIdentifier, UniqueIdentifierLen}));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateArtificialType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Type) {
+ return wrap(unwrap(Builder)->createArtificialType(unwrapDI<DIType>(Type)));
+}
+
+const char *LLVMDITypeGetName(LLVMMetadataRef DType, size_t *Length) {
+ StringRef Str = unwrap<DIType>(DType)->getName();
+ *Length = Str.size();
+ return Str.data();
+}
+
+uint64_t LLVMDITypeGetSizeInBits(LLVMMetadataRef DType) {
+ return unwrapDI<DIType>(DType)->getSizeInBits();
+}
+
+uint64_t LLVMDITypeGetOffsetInBits(LLVMMetadataRef DType) {
+ return unwrapDI<DIType>(DType)->getOffsetInBits();
+}
+
+uint32_t LLVMDITypeGetAlignInBits(LLVMMetadataRef DType) {
+ return unwrapDI<DIType>(DType)->getAlignInBits();
+}
+
+unsigned LLVMDITypeGetLine(LLVMMetadataRef DType) {
+ return unwrapDI<DIType>(DType)->getLine();
+}
+
+LLVMDIFlags LLVMDITypeGetFlags(LLVMMetadataRef DType) {
+ return map_to_llvmDIFlags(unwrapDI<DIType>(DType)->getFlags());
+}
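/* Sketch combining LLVMDIBuilderCreateBasicType with the DIType query helpers
 * above, not part of this change.  LLVMCreateDIBuilder and
 * LLVMDisposeDIBuilder are assumed available from llvm-c/DebugInfo.h; 0x05 is
 * DW_ATE_signed. */
#include <llvm-c/DebugInfo.h>
#include <assert.h>

static void describe_int_type(LLVMModuleRef M) {
  LLVMDIBuilderRef DIB = LLVMCreateDIBuilder(M);
  LLVMMetadataRef IntTy =
      LLVMDIBuilderCreateBasicType(DIB, "int", 3, 32, /*DW_ATE_signed*/ 0x05);
  size_t Len = 0;
  const char *Name = LLVMDITypeGetName(IntTy, &Len);
  assert(Len == 3 && LLVMDITypeGetSizeInBits(IntTy) == 32);
  (void)Name;
  LLVMDisposeDIBuilder(DIB);
}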
+
+LLVMMetadataRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef *Types,
+ size_t Length) {
+ return wrap(
+ unwrap(Builder)->getOrCreateTypeArray({unwrap(Types), Length}).get());
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef File,
+ LLVMMetadataRef *ParameterTypes,
+ unsigned NumParameterTypes,
+ LLVMDIFlags Flags) {
+ auto Elts = unwrap(Builder)->getOrCreateTypeArray({unwrap(ParameterTypes),
+ NumParameterTypes});
+ return wrap(unwrap(Builder)->createSubroutineType(
+ Elts, map_from_llvmDIFlags(Flags)));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Builder,
+ int64_t *Addr, size_t Length) {
+ return wrap(unwrap(Builder)->createExpression(ArrayRef<int64_t>(Addr,
+ Length)));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateConstantValueExpression(LLVMDIBuilderRef Builder,
+ int64_t Value) {
+ return wrap(unwrap(Builder)->createConstantValueExpression(Value));
+}
+
+LLVMMetadataRef
+LLVMDIBuilderCreateGlobalVariableExpression(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ const char *Name, size_t NameLen,
+ const char *Linkage, size_t LinkLen,
+ LLVMMetadataRef File,
+ unsigned LineNo,
+ LLVMMetadataRef Ty,
+ LLVMBool LocalToUnit,
+ LLVMMetadataRef Expr,
+ LLVMMetadataRef Decl,
+ uint32_t AlignInBits) {
+ return wrap(unwrap(Builder)->createGlobalVariableExpression(
+ unwrapDI<DIScope>(Scope), {Name, NameLen}, {Linkage, LinkLen},
+ unwrapDI<DIFile>(File), LineNo, unwrapDI<DIType>(Ty),
+ LocalToUnit, unwrap<DIExpression>(Expr),
+ unwrapDI<MDNode>(Decl), AlignInBits));
+}
+
+LLVMMetadataRef LLVMTemporaryMDNode(LLVMContextRef Ctx, LLVMMetadataRef *Data,
+ size_t Count) {
+ return wrap(
+ MDTuple::getTemporary(*unwrap(Ctx), {unwrap(Data), Count}).release());
+}
+
+void LLVMDisposeTemporaryMDNode(LLVMMetadataRef TempNode) {
+ MDNode::deleteTemporary(unwrapDI<MDNode>(TempNode));
+}
+
+void LLVMMetadataReplaceAllUsesWith(LLVMMetadataRef TargetMetadata,
+ LLVMMetadataRef Replacement) {
+ auto *Node = unwrapDI<MDNode>(TargetMetadata);
+ Node->replaceAllUsesWith(unwrap<Metadata>(Replacement));
+ MDNode::deleteTemporary(Node);
+}
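// Illustrative sketch (editorial, not part of this change): the temporary-node
// helpers above support the usual two-pass pattern for cyclic debug metadata.
// A client hands out a temporary node as a placeholder and later replaces all
// of its uses with the real node; LLVMMetadataReplaceAllUsesWith also deletes
// the temporary, so it must not be passed to LLVMDisposeTemporaryMDNode again.
//
//   LLVMMetadataRef Placeholder = LLVMTemporaryMDNode(Ctx, nullptr, 0);
//   // ... build members/types that refer to Placeholder ...
//   LLVMMetadataRef Real = /* e.g. LLVMDIBuilderCreateClassType(...) */;
//   LLVMMetadataReplaceAllUsesWith(Placeholder, Real); // RAUW + delete temp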
+
+LLVMMetadataRef
+LLVMDIBuilderCreateTempGlobalVariableFwdDecl(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Scope,
+ const char *Name, size_t NameLen,
+ const char *Linkage, size_t LnkLen,
+ LLVMMetadataRef File,
+ unsigned LineNo,
+ LLVMMetadataRef Ty,
+ LLVMBool LocalToUnit,
+ LLVMMetadataRef Decl,
+ uint32_t AlignInBits) {
+ return wrap(unwrap(Builder)->createTempGlobalVariableFwdDecl(
+ unwrapDI<DIScope>(Scope), {Name, NameLen}, {Linkage, LnkLen},
+ unwrapDI<DIFile>(File), LineNo, unwrapDI<DIType>(Ty),
+ LocalToUnit, unwrapDI<MDNode>(Decl), AlignInBits));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDeclareBefore(
+ LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMValueRef Instr) {
+ return wrap(unwrap(Builder)->insertDeclare(
+ unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+ unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
+ unwrap<Instruction>(Instr)));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
+ LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
+ return wrap(unwrap(Builder)->insertDeclare(
+ unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+ unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
+ unwrap(Block)));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
+ LLVMValueRef Val,
+ LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr,
+ LLVMMetadataRef DebugLoc,
+ LLVMValueRef Instr) {
+ return wrap(unwrap(Builder)->insertDbgValueIntrinsic(
+ unwrap(Val), unwrap<DILocalVariable>(VarInfo),
+ unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
+ unwrap<Instruction>(Instr)));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(LLVMDIBuilderRef Builder,
+ LLVMValueRef Val,
+ LLVMMetadataRef VarInfo,
+ LLVMMetadataRef Expr,
+ LLVMMetadataRef DebugLoc,
+ LLVMBasicBlockRef Block) {
+ return wrap(unwrap(Builder)->insertDbgValueIntrinsic(
+ unwrap(Val), unwrap<DILocalVariable>(VarInfo),
+ unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
+ unwrap(Block)));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateAutoVariable(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, LLVMMetadataRef File, unsigned LineNo, LLVMMetadataRef Ty,
+ LLVMBool AlwaysPreserve, LLVMDIFlags Flags, uint32_t AlignInBits) {
+ return wrap(unwrap(Builder)->createAutoVariable(
+ unwrap<DIScope>(Scope), {Name, NameLen}, unwrap<DIFile>(File),
+ LineNo, unwrap<DIType>(Ty), AlwaysPreserve,
+ map_from_llvmDIFlags(Flags), AlignInBits));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateParameterVariable(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
+ size_t NameLen, unsigned ArgNo, LLVMMetadataRef File, unsigned LineNo,
+ LLVMMetadataRef Ty, LLVMBool AlwaysPreserve, LLVMDIFlags Flags) {
+ return wrap(unwrap(Builder)->createParameterVariable(
+ unwrap<DIScope>(Scope), {Name, NameLen}, ArgNo, unwrap<DIFile>(File),

+ LineNo, unwrap<DIType>(Ty), AlwaysPreserve,
+ map_from_llvmDIFlags(Flags)));
+}
+
+LLVMMetadataRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef Builder,
+ int64_t Lo, int64_t Count) {
+ return wrap(unwrap(Builder)->getOrCreateSubrange(Lo, Count));
+}
+
+LLVMMetadataRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef *Data,
+ size_t Length) {
+ Metadata **DataValue = unwrap(Data);
+ return wrap(unwrap(Builder)->getOrCreateArray({DataValue, Length}).get());
+}
+
+LLVMMetadataRef LLVMGetSubprogram(LLVMValueRef Func) {
+ return wrap(unwrap<Function>(Func)->getSubprogram());
+}
+
+void LLVMSetSubprogram(LLVMValueRef Func, LLVMMetadataRef SP) {
+ unwrap<Function>(Func)->setSubprogram(unwrap<DISubprogram>(SP));
+}
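// Illustrative sketch (editorial, not part of this change): one way the new
// LLVM-C debug-info entry points above could be wired together to describe a
// local variable. The helper name and its parameters are hypothetical; the
// Builder, Scope, File, Ty, DebugLoc and Block arguments are assumed to have
// been created through the corresponding LLVMCreate*/LLVMDIBuilder* calls.
static void EmitLocalVarDebugInfo(LLVMDIBuilderRef Builder,
                                  LLVMMetadataRef Scope, LLVMMetadataRef File,
                                  LLVMMetadataRef Ty, LLVMMetadataRef DebugLoc,
                                  LLVMValueRef Storage, LLVMBasicBlockRef Block,
                                  unsigned Line) {
  // Describe the source-level variable "x"...
  LLVMMetadataRef Var = LLVMDIBuilderCreateAutoVariable(
      Builder, Scope, "x", 1, File, Line, Ty,
      /*AlwaysPreserve=*/1, LLVMDIFlagZero, /*AlignInBits=*/0);
  // ...with an empty location expression...
  LLVMMetadataRef Empty = LLVMDIBuilderCreateExpression(Builder, nullptr, 0);
  // ...and attach a llvm.dbg.declare to the alloca at the end of Block.
  LLVMDIBuilderInsertDeclareAtEnd(Builder, Storage, Var, Empty, DebugLoc,
                                  Block);
}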
diff --git a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp
index 75ddd47b2591..910e8c2fb74f 100644
--- a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -68,16 +68,16 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line,
Storage, Context.pImpl->DILocations);
}
-const DILocation *
-DILocation::getMergedLocation(const DILocation *LocA, const DILocation *LocB,
- const Instruction *ForInst) {
+const DILocation *DILocation::getMergedLocation(const DILocation *LocA,
+ const DILocation *LocB,
+ bool GenerateLocation) {
if (!LocA || !LocB)
return nullptr;
if (LocA == LocB || !LocA->canDiscriminate(*LocB))
return LocA;
- if (!dyn_cast_or_null<CallInst>(ForInst))
+ if (!GenerateLocation)
return nullptr;
SmallPtrSet<DILocation *, 5> InlinedLocationsA;
@@ -249,17 +249,26 @@ void GenericDINode::recalculateHash() {
DISubrange *DISubrange::getImpl(LLVMContext &Context, int64_t Count, int64_t Lo,
StorageType Storage, bool ShouldCreate) {
- DEFINE_GETIMPL_LOOKUP(DISubrange, (Count, Lo));
- DEFINE_GETIMPL_STORE_NO_OPS(DISubrange, (Count, Lo));
+ auto *CountNode = ConstantAsMetadata::get(
+ ConstantInt::getSigned(Type::getInt64Ty(Context), Count));
+ return getImpl(Context, CountNode, Lo, Storage, ShouldCreate);
+}
+
+DISubrange *DISubrange::getImpl(LLVMContext &Context, Metadata *CountNode,
+ int64_t Lo, StorageType Storage,
+ bool ShouldCreate) {
+ DEFINE_GETIMPL_LOOKUP(DISubrange, (CountNode, Lo));
+ Metadata *Ops[] = { CountNode };
+ DEFINE_GETIMPL_STORE(DISubrange, (CountNode, Lo), Ops);
}
DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, int64_t Value,
- MDString *Name, StorageType Storage,
- bool ShouldCreate) {
+ bool IsUnsigned, MDString *Name,
+ StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIEnumerator, (Value, Name));
+ DEFINE_GETIMPL_LOOKUP(DIEnumerator, (Value, IsUnsigned, Name));
Metadata *Ops[] = {Name};
- DEFINE_GETIMPL_STORE(DIEnumerator, (Value), Ops);
+ DEFINE_GETIMPL_STORE(DIEnumerator, (Value, IsUnsigned), Ops);
}
DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
@@ -274,6 +283,19 @@ DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
Ops);
}
+Optional<DIBasicType::Signedness> DIBasicType::getSignedness() const {
+ switch (getEncoding()) {
+ case dwarf::DW_ATE_signed:
+ case dwarf::DW_ATE_signed_char:
+ return Signedness::Signed;
+ case dwarf::DW_ATE_unsigned:
+ case dwarf::DW_ATE_unsigned_char:
+ return Signedness::Unsigned;
+ default:
+ return None;
+ }
+}
+
DIDerivedType *DIDerivedType::getImpl(
LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
@@ -296,17 +318,18 @@ DICompositeType *DICompositeType::getImpl(
unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
uint32_t AlignInBits, uint64_t OffsetInBits, DIFlags Flags,
Metadata *Elements, unsigned RuntimeLang, Metadata *VTableHolder,
- Metadata *TemplateParams, MDString *Identifier, StorageType Storage,
- bool ShouldCreate) {
+ Metadata *TemplateParams, MDString *Identifier, Metadata *Discriminator,
+ StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
// Keep this in sync with buildODRType.
DEFINE_GETIMPL_LOOKUP(
DICompositeType, (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, Identifier));
+ VTableHolder, TemplateParams, Identifier, Discriminator));
Metadata *Ops[] = {File, Scope, Name, BaseType,
- Elements, VTableHolder, TemplateParams, Identifier};
+ Elements, VTableHolder, TemplateParams, Identifier,
+ Discriminator};
DEFINE_GETIMPL_STORE(DICompositeType, (Tag, Line, RuntimeLang, SizeInBits,
AlignInBits, OffsetInBits, Flags),
Ops);
@@ -317,7 +340,7 @@ DICompositeType *DICompositeType::buildODRType(
Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType,
uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
DIFlags Flags, Metadata *Elements, unsigned RuntimeLang,
- Metadata *VTableHolder, Metadata *TemplateParams) {
+ Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator) {
assert(!Identifier.getString().empty() && "Expected valid identifier");
if (!Context.isODRUniquingDebugTypes())
return nullptr;
@@ -326,7 +349,7 @@ DICompositeType *DICompositeType::buildODRType(
return CT = DICompositeType::getDistinct(
Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, &Identifier);
+ VTableHolder, TemplateParams, &Identifier, Discriminator);
// Only mutate CT if it's a forward declaration and the new operands aren't.
assert(CT->getRawIdentifier() == &Identifier && "Wrong ODR identifier?");
@@ -337,7 +360,8 @@ DICompositeType *DICompositeType::buildODRType(
CT->mutate(Tag, Line, RuntimeLang, SizeInBits, AlignInBits, OffsetInBits,
Flags);
Metadata *Ops[] = {File, Scope, Name, BaseType,
- Elements, VTableHolder, TemplateParams, &Identifier};
+ Elements, VTableHolder, TemplateParams, &Identifier,
+ Discriminator};
assert((std::end(Ops) - std::begin(Ops)) == (int)CT->getNumOperands() &&
"Mismatched number of operands");
for (unsigned I = 0, E = CT->getNumOperands(); I != E; ++I)
@@ -351,7 +375,7 @@ DICompositeType *DICompositeType::getODRType(
Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType,
uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
DIFlags Flags, Metadata *Elements, unsigned RuntimeLang,
- Metadata *VTableHolder, Metadata *TemplateParams) {
+ Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator) {
assert(!Identifier.getString().empty() && "Expected valid identifier");
if (!Context.isODRUniquingDebugTypes())
return nullptr;
@@ -360,7 +384,7 @@ DICompositeType *DICompositeType::getODRType(
CT = DICompositeType::getDistinct(
Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder,
- TemplateParams, &Identifier);
+ TemplateParams, &Identifier, Discriminator);
return CT;
}
@@ -383,34 +407,39 @@ DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, DIFlags Flags,
// FIXME: Implement this string-enum correspondence with a .def file and macros,
// so that the association is explicit rather than implied.
-static const char *ChecksumKindName[DIFile::CSK_Last + 1] = {
- "CSK_None",
+static const char *ChecksumKindName[DIFile::CSK_Last] = {
"CSK_MD5",
"CSK_SHA1"
};
-DIFile::ChecksumKind DIFile::getChecksumKind(StringRef CSKindStr) {
- return StringSwitch<DIFile::ChecksumKind>(CSKindStr)
- .Case("CSK_MD5", DIFile::CSK_MD5)
- .Case("CSK_SHA1", DIFile::CSK_SHA1)
- .Default(DIFile::CSK_None);
+StringRef DIFile::getChecksumKindAsString(ChecksumKind CSKind) {
+ assert(CSKind <= DIFile::CSK_Last && "Invalid checksum kind");
+ // The first space was originally the CSK_None variant, which is now
+ // obsolete, but the space is still reserved in ChecksumKind, so we account
+ // for it here.
+ return ChecksumKindName[CSKind - 1];
}
-StringRef DIFile::getChecksumKindAsString() const {
- assert(CSKind <= DIFile::CSK_Last && "Invalid checksum kind");
- return ChecksumKindName[CSKind];
+Optional<DIFile::ChecksumKind> DIFile::getChecksumKind(StringRef CSKindStr) {
+ return StringSwitch<Optional<DIFile::ChecksumKind>>(CSKindStr)
+ .Case("CSK_MD5", DIFile::CSK_MD5)
+ .Case("CSK_SHA1", DIFile::CSK_SHA1)
+ .Default(None);
}
DIFile *DIFile::getImpl(LLVMContext &Context, MDString *Filename,
- MDString *Directory, DIFile::ChecksumKind CSKind,
- MDString *Checksum, StorageType Storage,
+ MDString *Directory,
+ Optional<DIFile::ChecksumInfo<MDString *>> CS,
+ Optional<MDString *> Source, StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Filename) && "Expected canonical MDString");
assert(isCanonical(Directory) && "Expected canonical MDString");
- assert(isCanonical(Checksum) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIFile, (Filename, Directory, CSKind, Checksum));
- Metadata *Ops[] = {Filename, Directory, Checksum};
- DEFINE_GETIMPL_STORE(DIFile, (CSKind), Ops);
+ assert((!CS || isCanonical(CS->Value)) && "Expected canonical MDString");
+ assert((!Source || isCanonical(*Source)) && "Expected canonical MDString");
+ DEFINE_GETIMPL_LOOKUP(DIFile, (Filename, Directory, CS, Source));
+ Metadata *Ops[] = {Filename, Directory, CS ? CS->Value : nullptr,
+ Source.getValueOr(nullptr)};
+ DEFINE_GETIMPL_STORE(DIFile, (CS, Source), Ops);
}
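// Editorial note (not part of this change): with the checksum and source
// fields folded into Optional values, a DIFile in textual IR can now take the
// form
//   !DIFile(filename: "a.c", directory: "/tmp",
//           checksumkind: CSK_MD5, checksum: "595f44fec1e92a71d3e9e77456ba80d1",
//           source: "int x;\0A")
// where both the checksum pair and the embedded source text are optional.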
DICompileUnit *DICompileUnit::getImpl(
@@ -446,7 +475,7 @@ DICompileUnit::getEmissionKind(StringRef Str) {
.Default(None);
}
-const char *DICompileUnit::EmissionKindString(DebugEmissionKind EK) {
+const char *DICompileUnit::emissionKindString(DebugEmissionKind EK) {
switch (EK) {
case NoDebug: return "NoDebug";
case FullDebug: return "FullDebug";
@@ -473,7 +502,7 @@ DISubprogram *DISubprogram::getImpl(
bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
int ThisAdjustment, DIFlags Flags, bool IsOptimized, Metadata *Unit,
- Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables,
+ Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes,
Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
assert(isCanonical(LinkageName) && "Expected canonical MDString");
@@ -481,10 +510,10 @@ DISubprogram *DISubprogram::getImpl(
DISubprogram, (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
IsDefinition, ScopeLine, ContainingType, Virtuality,
VirtualIndex, ThisAdjustment, Flags, IsOptimized, Unit,
- TemplateParams, Declaration, Variables, ThrownTypes));
+ TemplateParams, Declaration, RetainedNodes, ThrownTypes));
SmallVector<Metadata *, 11> Ops = {
- File, Scope, Name, LinkageName, Type, Unit,
- Declaration, Variables, ContainingType, TemplateParams, ThrownTypes};
+ File, Scope, Name, LinkageName, Type, Unit,
+ Declaration, RetainedNodes, ContainingType, TemplateParams, ThrownTypes};
if (!ThrownTypes) {
Ops.pop_back();
if (!TemplateParams) {
@@ -637,6 +666,18 @@ Optional<uint64_t> DIVariable::getSizeInBits() const {
return None;
}
+DILabel *DILabel::getImpl(LLVMContext &Context, Metadata *Scope,
+ MDString *Name, Metadata *File, unsigned Line,
+ StorageType Storage,
+ bool ShouldCreate) {
+ assert(Scope && "Expected scope");
+ assert(isCanonical(Name) && "Expected canonical MDString");
+ DEFINE_GETIMPL_LOOKUP(DILabel,
+ (Scope, Name, File, Line));
+ Metadata *Ops[] = {Scope, Name, File};
+ DEFINE_GETIMPL_STORE(DILabel, (Line), Ops);
+}
+
DIExpression *DIExpression::getImpl(LLVMContext &Context,
ArrayRef<uint64_t> Elements,
StorageType Storage, bool ShouldCreate) {
@@ -695,8 +736,19 @@ bool DIExpression::isValid() const {
case dwarf::DW_OP_plus:
case dwarf::DW_OP_minus:
case dwarf::DW_OP_mul:
+ case dwarf::DW_OP_div:
+ case dwarf::DW_OP_mod:
+ case dwarf::DW_OP_or:
+ case dwarf::DW_OP_and:
+ case dwarf::DW_OP_xor:
+ case dwarf::DW_OP_shl:
+ case dwarf::DW_OP_shr:
+ case dwarf::DW_OP_shra:
case dwarf::DW_OP_deref:
case dwarf::DW_OP_xderef:
+ case dwarf::DW_OP_lit0:
+ case dwarf::DW_OP_not:
+ case dwarf::DW_OP_dup:
break;
}
}
@@ -756,31 +808,94 @@ DIExpression *DIExpression::prepend(const DIExpression *Expr, bool DerefBefore,
SmallVector<uint64_t, 8> Ops;
if (DerefBefore)
Ops.push_back(dwarf::DW_OP_deref);
-
+
appendOffset(Ops, Offset);
if (DerefAfter)
Ops.push_back(dwarf::DW_OP_deref);
- if (Expr)
- for (auto Op : Expr->expr_ops()) {
- // A DW_OP_stack_value comes at the end, but before a DW_OP_LLVM_fragment.
- if (StackValue) {
- if (Op.getOp() == dwarf::DW_OP_stack_value)
- StackValue = false;
- else if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
- Ops.push_back(dwarf::DW_OP_stack_value);
- StackValue = false;
- }
+ return prependOpcodes(Expr, Ops, StackValue);
+}
+
+DIExpression *DIExpression::prependOpcodes(const DIExpression *Expr,
+ SmallVectorImpl<uint64_t> &Ops,
+ bool StackValue) {
+ assert(Expr && "Can't prepend ops to this expression");
+
+ // If there are no ops to prepend, do not even add the DW_OP_stack_value.
+ if (Ops.empty())
+ StackValue = false;
+ for (auto Op : Expr->expr_ops()) {
+ // A DW_OP_stack_value comes at the end, but before a DW_OP_LLVM_fragment.
+ if (StackValue) {
+ if (Op.getOp() == dwarf::DW_OP_stack_value)
+ StackValue = false;
+ else if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
+ Ops.push_back(dwarf::DW_OP_stack_value);
+ StackValue = false;
}
- Ops.push_back(Op.getOp());
- for (unsigned I = 0; I < Op.getNumArgs(); ++I)
- Ops.push_back(Op.getArg(I));
}
+ Op.appendToVector(Ops);
+ }
if (StackValue)
Ops.push_back(dwarf::DW_OP_stack_value);
return DIExpression::get(Expr->getContext(), Ops);
}
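// Editorial worked example (not part of this change): prepend() routes through
// prependOpcodes(), so a DW_OP_stack_value requested by the caller lands after
// the prepended ops but still before any trailing fragment, e.g.
//   Expr = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
//   prepend(Expr, /*DerefBefore=*/true, /*Offset=*/0, /*DerefAfter=*/false,
//           /*StackValue=*/true)
//     ==> !DIExpression(DW_OP_deref, DW_OP_stack_value,
//                       DW_OP_LLVM_fragment, 0, 32)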
+DIExpression *DIExpression::append(const DIExpression *Expr,
+ ArrayRef<uint64_t> Ops) {
+ assert(Expr && !Ops.empty() && "Can't append ops to this expression");
+
+ // Copy Expr's current op list.
+ SmallVector<uint64_t, 16> NewOps;
+ for (auto Op : Expr->expr_ops()) {
+ // Append new opcodes before DW_OP_{stack_value, LLVM_fragment}.
+ if (Op.getOp() == dwarf::DW_OP_stack_value ||
+ Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
+ NewOps.append(Ops.begin(), Ops.end());
+
+ // Ensure that the new opcodes are only appended once.
+ Ops = None;
+ }
+ Op.appendToVector(NewOps);
+ }
+
+ NewOps.append(Ops.begin(), Ops.end());
+ return DIExpression::get(Expr->getContext(), NewOps);
+}
+
+DIExpression *DIExpression::appendToStack(const DIExpression *Expr,
+ ArrayRef<uint64_t> Ops) {
+ assert(Expr && !Ops.empty() && "Can't append ops to this expression");
+ assert(none_of(Ops,
+ [](uint64_t Op) {
+ return Op == dwarf::DW_OP_stack_value ||
+ Op == dwarf::DW_OP_LLVM_fragment;
+ }) &&
+ "Can't append this op");
+
+ // Append a DW_OP_deref after Expr's current op list if it's non-empty and
+ // has no DW_OP_stack_value.
+ //
+ // Match .* DW_OP_stack_value (DW_OP_LLVM_fragment A B)?.
+ Optional<FragmentInfo> FI = Expr->getFragmentInfo();
+ unsigned DropUntilStackValue = FI.hasValue() ? 3 : 0;
+ ArrayRef<uint64_t> ExprOpsBeforeFragment =
+ Expr->getElements().drop_back(DropUntilStackValue);
+ bool NeedsDeref = (Expr->getNumElements() > DropUntilStackValue) &&
+ (ExprOpsBeforeFragment.back() != dwarf::DW_OP_stack_value);
+ bool NeedsStackValue = NeedsDeref || ExprOpsBeforeFragment.empty();
+
+ // Append a DW_OP_deref after Expr's current op list if needed, then append
+ // the new ops, and finally ensure that a single DW_OP_stack_value is present.
+ SmallVector<uint64_t, 16> NewOps;
+ if (NeedsDeref)
+ NewOps.push_back(dwarf::DW_OP_deref);
+ NewOps.append(Ops.begin(), Ops.end());
+ if (NeedsStackValue)
+ NewOps.push_back(dwarf::DW_OP_stack_value);
+ return DIExpression::append(Expr, NewOps);
+}
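// Editorial worked example (not part of this change): appendToStack() ensures
// a single trailing DW_OP_stack_value and only dereferences a prior memory
// location when one is actually described, e.g.
//   appendToStack(!DIExpression(), {DW_OP_constu, 4, DW_OP_plus})
//     ==> !DIExpression(DW_OP_constu, 4, DW_OP_plus, DW_OP_stack_value)
//   appendToStack(!DIExpression(DW_OP_plus_uconst, 8),
//                 {DW_OP_constu, 4, DW_OP_plus})
//     ==> !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref,
//                       DW_OP_constu, 4, DW_OP_plus, DW_OP_stack_value)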
+
Optional<DIExpression *> DIExpression::createFragmentExpression(
const DIExpression *Expr, unsigned OffsetInBits, unsigned SizeInBits) {
SmallVector<uint64_t, 8> Ops;
@@ -800,17 +915,15 @@ Optional<DIExpression *> DIExpression::createFragmentExpression(
case dwarf::DW_OP_LLVM_fragment: {
// Make the new offset point into the existing fragment.
uint64_t FragmentOffsetInBits = Op.getArg(0);
- // Op.getArg(0) is FragmentOffsetInBits.
- // Op.getArg(1) is FragmentSizeInBits.
- assert((OffsetInBits + SizeInBits <= Op.getArg(0) + Op.getArg(1)) &&
+ uint64_t FragmentSizeInBits = Op.getArg(1);
+ (void)FragmentSizeInBits;
+ assert((OffsetInBits + SizeInBits <= FragmentSizeInBits) &&
"new fragment outside of original fragment");
OffsetInBits += FragmentOffsetInBits;
continue;
}
}
- Ops.push_back(Op.getOp());
- for (unsigned I = 0; I < Op.getNumArgs(); ++I)
- Ops.push_back(Op.getArg(I));
+ Op.appendToVector(Ops);
}
}
Ops.push_back(dwarf::DW_OP_LLVM_fragment);
@@ -883,4 +996,3 @@ DIMacroFile *DIMacroFile::getImpl(LLVMContext &Context, unsigned MIType,
Metadata *Ops[] = { File, Elements };
DEFINE_GETIMPL_STORE(DIMacroFile, (MIType, Line), Ops);
}
-
diff --git a/contrib/llvm/lib/IR/DebugLoc.cpp b/contrib/llvm/lib/IR/DebugLoc.cpp
index 0a494119c3fe..36f3e179a2c0 100644
--- a/contrib/llvm/lib/IR/DebugLoc.cpp
+++ b/contrib/llvm/lib/IR/DebugLoc.cpp
@@ -9,6 +9,7 @@
#include "llvm/IR/DebugLoc.h"
#include "LLVMContextImpl.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugInfo.h"
using namespace llvm;
@@ -99,19 +100,7 @@ DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void DebugLoc::dump() const {
- if (!Loc)
- return;
-
- dbgs() << getLine();
- if (getCol() != 0)
- dbgs() << ',' << getCol();
- if (DebugLoc InlinedAtDL = DebugLoc(getInlinedAt())) {
- dbgs() << " @ ";
- InlinedAtDL.dump();
- } else
- dbgs() << "\n";
-}
+LLVM_DUMP_METHOD void DebugLoc::dump() const { print(dbgs()); }
#endif
void DebugLoc::print(raw_ostream &OS) const {
diff --git a/contrib/llvm/lib/IR/DiagnosticHandler.cpp b/contrib/llvm/lib/IR/DiagnosticHandler.cpp
index fb1ac438ffbe..8f972785cf91 100644
--- a/contrib/llvm/lib/IR/DiagnosticHandler.cpp
+++ b/contrib/llvm/lib/IR/DiagnosticHandler.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
namespace {
-/// \brief Regular expression corresponding to the value given in one of the
+/// Regular expression corresponding to the value given in one of the
/// -pass-remarks* command line flags. Passes whose name matches this regexp
/// will emit a diagnostic when calling the associated diagnostic function
/// (emitOptimizationRemark, emitOptimizationRemarkMissed or
diff --git a/contrib/llvm/lib/IR/DiagnosticInfo.cpp b/contrib/llvm/lib/IR/DiagnosticInfo.cpp
index 946df1a836ce..5ddb1196b072 100644
--- a/contrib/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/contrib/llvm/lib/IR/DiagnosticInfo.cpp
@@ -35,6 +35,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ScopedPrinter.h"
#include <atomic>
#include <cassert>
#include <memory>
@@ -144,7 +145,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V
else if (auto *I = dyn_cast<Instruction>(V))
Loc = I->getDebugLoc();
- // Only include names that correspond to user variables. FIXME: we should use
+ // Only include names that correspond to user variables. FIXME: We should use
// debug info if available to get the name of the user variable.
if (isa<llvm::Argument>(V) || isa<GlobalValue>(V))
Val = GlobalValue::dropLLVMManglingEscape(V->getName());
@@ -167,6 +168,9 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, StringRef S)
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, int N)
: Key(Key), Val(itostr(N)) {}
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, float N)
+ : Key(Key), Val(llvm::to_string(N)) {}
+
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, long N)
: Key(Key), Val(itostr(N)) {}
diff --git a/contrib/llvm/lib/IR/DomTreeUpdater.cpp b/contrib/llvm/lib/IR/DomTreeUpdater.cpp
new file mode 100644
index 000000000000..f035a86eddae
--- /dev/null
+++ b/contrib/llvm/lib/IR/DomTreeUpdater.cpp
@@ -0,0 +1,534 @@
+//===- DomTreeUpdater.cpp - DomTree/Post DomTree Updater --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the DomTreeUpdater class, which provides a uniform way
+// to update dominator tree related data structures.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/DomTreeUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/GenericDomTree.h"
+#include <algorithm>
+#include <functional>
+
+namespace llvm {
+
+bool DomTreeUpdater::isUpdateValid(
+ const DominatorTree::UpdateType Update) const {
+ const auto *From = Update.getFrom();
+ const auto *To = Update.getTo();
+ const auto Kind = Update.getKind();
+
+ // Discard updates by inspecting the current state of successors of From.
+ // Since isUpdateValid() must be called *after* the Terminator of From is
+ // altered we can determine if the update is unnecessary for batch updates
+ // or invalid for a single update.
+ const bool HasEdge = llvm::any_of(
+ successors(From), [To](const BasicBlock *B) { return B == To; });
+
+ // If the IR does not match the update,
+ // 1. In batch updates, this update is unnecessary.
+ // 2. When called by insertEdge*()/deleteEdge*(), this update is invalid.
+ // Edge does not exist in IR.
+ if (Kind == DominatorTree::Insert && !HasEdge)
+ return false;
+
+ // Edge exists in IR.
+ if (Kind == DominatorTree::Delete && HasEdge)
+ return false;
+
+ return true;
+}
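// Editorial note (not part of this change): because isUpdateValid() inspects
// the CFG as it currently stands, the caller must mutate the terminator first.
// For example, once the branch From->To has been erased, {Delete, From, To}
// is valid, while {Insert, From, To} would be rejected since the edge is
// absent from the IR.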
+
+bool DomTreeUpdater::isSelfDominance(
+ const DominatorTree::UpdateType Update) const {
+ // Won't affect DomTree and PostDomTree.
+ return Update.getFrom() == Update.getTo();
+}
+
+bool DomTreeUpdater::applyLazyUpdate(DominatorTree::UpdateKind Kind,
+ BasicBlock *From, BasicBlock *To) {
+ assert((DT || PDT) &&
+ "applyLazyUpdate() called with both DT and PDT being nullptr.");
+ assert(Strategy == DomTreeUpdater::UpdateStrategy::Lazy &&
+ "applyLazyUpdate() must only be called when using the Lazy strategy.");
+ // Analyze pending updates to determine if the update is unnecessary.
+ const DominatorTree::UpdateType Update = {Kind, From, To};
+ const DominatorTree::UpdateType Invert = {Kind != DominatorTree::Insert
+ ? DominatorTree::Insert
+ : DominatorTree::Delete,
+ From, To};
+ // Only check duplicates in updates that are not applied by both trees.
+ auto I =
+ PendUpdates.begin() + std::max(PendDTUpdateIndex, PendPDTUpdateIndex);
+ const auto E = PendUpdates.end();
+
+ assert(I <= E && "Iterator out of range.");
+
+ for (; I != E; ++I) {
+ if (Update == *I)
+ return false; // Discard duplicate updates.
+
+ if (Invert == *I) {
+ // Update and Invert are both valid (equivalent to a no-op). Remove
+ // Invert from PendUpdates and discard the Update.
+ PendUpdates.erase(I);
+ return false;
+ }
+ }
+
+ PendUpdates.push_back(Update); // Save the valid update.
+ return true;
+}
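// Editorial worked example (not part of this change): with the Lazy strategy
// the queue is self-cleaning. Submitting {Insert, A, B} and later, before any
// flush, {Delete, A, B} erases the earlier Insert instead of queueing the
// Delete, and resubmitting an update already in the unapplied range is simply
// discarded.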
+
+void DomTreeUpdater::applyDomTreeUpdates() {
+ // No pending DomTreeUpdates.
+ if (Strategy != UpdateStrategy::Lazy || !DT)
+ return;
+
+ // Only apply updates that have not already been applied by the DomTree.
+ if (hasPendingDomTreeUpdates()) {
+ const auto I = PendUpdates.begin() + PendDTUpdateIndex;
+ const auto E = PendUpdates.end();
+ assert(I < E && "Iterator range invalid; there should be DomTree updates.");
+ DT->applyUpdates(ArrayRef<DominatorTree::UpdateType>(I, E));
+ PendDTUpdateIndex = PendUpdates.size();
+ }
+}
+
+void DomTreeUpdater::flush() {
+ applyDomTreeUpdates();
+ applyPostDomTreeUpdates();
+ dropOutOfDateUpdates();
+}
+
+void DomTreeUpdater::applyPostDomTreeUpdates() {
+ // No pending PostDomTreeUpdates.
+ if (Strategy != UpdateStrategy::Lazy || !PDT)
+ return;
+
+ // Only apply updates that have not already been applied by the PostDomTree.
+ if (hasPendingPostDomTreeUpdates()) {
+ const auto I = PendUpdates.begin() + PendPDTUpdateIndex;
+ const auto E = PendUpdates.end();
+ assert(I < E &&
+ "Iterator range invalid; there should be PostDomTree updates.");
+ PDT->applyUpdates(ArrayRef<DominatorTree::UpdateType>(I, E));
+ PendPDTUpdateIndex = PendUpdates.size();
+ }
+}
+
+void DomTreeUpdater::tryFlushDeletedBB() {
+ if (!hasPendingUpdates())
+ forceFlushDeletedBB();
+}
+
+bool DomTreeUpdater::forceFlushDeletedBB() {
+ if (DeletedBBs.empty())
+ return false;
+
+ for (auto *BB : DeletedBBs) {
+ // After calling deleteBB or callbackDeleteBB under Lazy UpdateStrategy,
+ // validateDeleteBB() removes all instructions of DelBB and adds an
+ // UnreachableInst as its terminator. So we check whether the BasicBlock to
+ // delete only has an UnreachableInst inside.
+ assert(BB->getInstList().size() == 1 &&
+ isa<UnreachableInst>(BB->getTerminator()) &&
+ "DelBB has been modified while awaiting deletion.");
+ BB->removeFromParent();
+ eraseDelBBNode(BB);
+ delete BB;
+ }
+ DeletedBBs.clear();
+ Callbacks.clear();
+ return true;
+}
+
+bool DomTreeUpdater::recalculate(Function &F) {
+ if (!DT && !PDT)
+ return false;
+
+ if (Strategy == UpdateStrategy::Eager) {
+ if (DT)
+ DT->recalculate(F);
+ if (PDT)
+ PDT->recalculate(F);
+ return true;
+ }
+
+ // Prevent forceFlushDeletedBB() from erasing DomTree or PostDomTree nodes.
+ IsRecalculatingDomTree = IsRecalculatingPostDomTree = true;
+
+ // Because all trees are going to be up-to-date after recalculation,
+ // flush awaiting deleted BasicBlocks.
+ if (forceFlushDeletedBB() || hasPendingUpdates()) {
+ if (DT)
+ DT->recalculate(F);
+ if (PDT)
+ PDT->recalculate(F);
+
+ // Resume forceFlushDeletedBB() to erase DomTree or PostDomTree nodes.
+ IsRecalculatingDomTree = IsRecalculatingPostDomTree = false;
+ PendDTUpdateIndex = PendPDTUpdateIndex = PendUpdates.size();
+ dropOutOfDateUpdates();
+ return true;
+ }
+
+ // Resume forceFlushDeletedBB() to erase DomTree or PostDomTree nodes.
+ IsRecalculatingDomTree = IsRecalculatingPostDomTree = false;
+ return false;
+}
+
+bool DomTreeUpdater::hasPendingUpdates() const {
+ return hasPendingDomTreeUpdates() || hasPendingPostDomTreeUpdates();
+}
+
+bool DomTreeUpdater::hasPendingDomTreeUpdates() const {
+ if (!DT)
+ return false;
+ return PendUpdates.size() != PendDTUpdateIndex;
+}
+
+bool DomTreeUpdater::hasPendingPostDomTreeUpdates() const {
+ if (!PDT)
+ return false;
+ return PendUpdates.size() != PendPDTUpdateIndex;
+}
+
+bool DomTreeUpdater::isBBPendingDeletion(llvm::BasicBlock *DelBB) const {
+ if (Strategy == UpdateStrategy::Eager || DeletedBBs.empty())
+ return false;
+ return DeletedBBs.count(DelBB) != 0;
+}
+
+// The DT and PDT require that the nodes referenced by pending updates are
+// not deleted before those updates are applied. BasicBlock deletions must
+// therefore be deferred while the UpdateStrategy is Lazy. Under the Eager
+// strategy, the BasicBlock is deleted immediately.
+void DomTreeUpdater::deleteBB(BasicBlock *DelBB) {
+ validateDeleteBB(DelBB);
+ if (Strategy == UpdateStrategy::Lazy) {
+ DeletedBBs.insert(DelBB);
+ return;
+ }
+
+ DelBB->removeFromParent();
+ eraseDelBBNode(DelBB);
+ delete DelBB;
+}
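// Illustrative sketch (editorial, not part of this change): typical use of the
// Lazy strategy when a pass removes a now-unreachable block. The names DT,
// PDT, BB and Succ are assumed to come from the surrounding pass.
//   DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy);
//   Succ->removePredecessor(BB);
//   BB->getTerminator()->eraseFromParent();    // edge BB->Succ gone from IR
//   DTU.applyUpdates({{DominatorTree::Delete, BB, Succ}});
//   DTU.deleteBB(BB);                          // erasure deferred
//   DominatorTree &NewDT = DTU.getDomTree();   // applies pending DT updates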
+
+void DomTreeUpdater::callbackDeleteBB(
+ BasicBlock *DelBB, std::function<void(BasicBlock *)> Callback) {
+ validateDeleteBB(DelBB);
+ if (Strategy == UpdateStrategy::Lazy) {
+ Callbacks.push_back(CallBackOnDeletion(DelBB, Callback));
+ DeletedBBs.insert(DelBB);
+ return;
+ }
+
+ DelBB->removeFromParent();
+ eraseDelBBNode(DelBB);
+ Callback(DelBB);
+ delete DelBB;
+}
+
+void DomTreeUpdater::eraseDelBBNode(BasicBlock *DelBB) {
+ if (DT && !IsRecalculatingDomTree)
+ if (DT->getNode(DelBB))
+ DT->eraseNode(DelBB);
+
+ if (PDT && !IsRecalculatingPostDomTree)
+ if (PDT->getNode(DelBB))
+ PDT->eraseNode(DelBB);
+}
+
+void DomTreeUpdater::validateDeleteBB(BasicBlock *DelBB) {
+ assert(DelBB && "Invalid push_back of nullptr DelBB.");
+ assert(pred_empty(DelBB) && "DelBB has one or more predecessors.");
+ // DelBB is unreachable and all its instructions are dead.
+ while (!DelBB->empty()) {
+ Instruction &I = DelBB->back();
+ // Replace used instructions with an arbitrary value (undef).
+ if (!I.use_empty())
+ I.replaceAllUsesWith(llvm::UndefValue::get(I.getType()));
+ DelBB->getInstList().pop_back();
+ }
+ // Make sure DelBB has a valid terminator instruction. As long as DelBB is a
+ // Child of Function F it must contain valid IR.
+ new UnreachableInst(DelBB->getContext(), DelBB);
+}
+
+void DomTreeUpdater::applyUpdates(ArrayRef<DominatorTree::UpdateType> Updates,
+ bool ForceRemoveDuplicates) {
+ if (!DT && !PDT)
+ return;
+
+ if (Strategy == UpdateStrategy::Lazy || ForceRemoveDuplicates) {
+ SmallVector<DominatorTree::UpdateType, 8> Seen;
+ for (const auto U : Updates)
+ // For the Lazy UpdateStrategy, avoid passing duplicates to applyLazyUpdate()
+ // to save analysis work.
+ if (llvm::none_of(
+ Seen,
+ [U](const DominatorTree::UpdateType S) { return S == U; }) &&
+ isUpdateValid(U) && !isSelfDominance(U)) {
+ Seen.push_back(U);
+ if (Strategy == UpdateStrategy::Lazy)
+ applyLazyUpdate(U.getKind(), U.getFrom(), U.getTo());
+ }
+ if (Strategy == UpdateStrategy::Lazy)
+ return;
+
+ if (DT)
+ DT->applyUpdates(Seen);
+ if (PDT)
+ PDT->applyUpdates(Seen);
+ return;
+ }
+
+ if (DT)
+ DT->applyUpdates(Updates);
+ if (PDT)
+ PDT->applyUpdates(Updates);
+}
+
+DominatorTree &DomTreeUpdater::getDomTree() {
+ assert(DT && "Invalid acquisition of a null DomTree");
+ applyDomTreeUpdates();
+ dropOutOfDateUpdates();
+ return *DT;
+}
+
+PostDominatorTree &DomTreeUpdater::getPostDomTree() {
+ assert(PDT && "Invalid acquisition of a null PostDomTree");
+ applyPostDomTreeUpdates();
+ dropOutOfDateUpdates();
+ return *PDT;
+}
+
+void DomTreeUpdater::insertEdge(BasicBlock *From, BasicBlock *To) {
+
+#ifndef NDEBUG
+ assert(isUpdateValid({DominatorTree::Insert, From, To}) &&
+ "Inserted edge does not appear in the CFG");
+#endif
+
+ if (!DT && !PDT)
+ return;
+
+ // Won't affect DomTree and PostDomTree; discard update.
+ if (From == To)
+ return;
+
+ if (Strategy == UpdateStrategy::Eager) {
+ if (DT)
+ DT->insertEdge(From, To);
+ if (PDT)
+ PDT->insertEdge(From, To);
+ return;
+ }
+
+ applyLazyUpdate(DominatorTree::Insert, From, To);
+}
+
+void DomTreeUpdater::insertEdgeRelaxed(BasicBlock *From, BasicBlock *To) {
+ if (From == To)
+ return;
+
+ if (!DT && !PDT)
+ return;
+
+ if (!isUpdateValid({DominatorTree::Insert, From, To}))
+ return;
+
+ if (Strategy == UpdateStrategy::Eager) {
+ if (DT)
+ DT->insertEdge(From, To);
+ if (PDT)
+ PDT->insertEdge(From, To);
+ return;
+ }
+
+ applyLazyUpdate(DominatorTree::Insert, From, To);
+}
+
+void DomTreeUpdater::deleteEdge(BasicBlock *From, BasicBlock *To) {
+
+#ifndef NDEBUG
+ assert(isUpdateValid({DominatorTree::Delete, From, To}) &&
+ "Deleted edge still exists in the CFG!");
+#endif
+
+ if (!DT && !PDT)
+ return;
+
+ // Won't affect DomTree and PostDomTree; discard update.
+ if (From == To)
+ return;
+
+ if (Strategy == UpdateStrategy::Eager) {
+ if (DT)
+ DT->deleteEdge(From, To);
+ if (PDT)
+ PDT->deleteEdge(From, To);
+ return;
+ }
+
+ applyLazyUpdate(DominatorTree::Delete, From, To);
+}
+
+void DomTreeUpdater::deleteEdgeRelaxed(BasicBlock *From, BasicBlock *To) {
+ if (From == To)
+ return;
+
+ if (!DT && !PDT)
+ return;
+
+ if (!isUpdateValid({DominatorTree::Delete, From, To}))
+ return;
+
+ if (Strategy == UpdateStrategy::Eager) {
+ if (DT)
+ DT->deleteEdge(From, To);
+ if (PDT)
+ PDT->deleteEdge(From, To);
+ return;
+ }
+
+ applyLazyUpdate(DominatorTree::Delete, From, To);
+}
+
+void DomTreeUpdater::dropOutOfDateUpdates() {
+ if (Strategy == DomTreeUpdater::UpdateStrategy::Eager)
+ return;
+
+ tryFlushDeletedBB();
+
+ // Drop all updates applied by both trees.
+ if (!DT)
+ PendDTUpdateIndex = PendUpdates.size();
+ if (!PDT)
+ PendPDTUpdateIndex = PendUpdates.size();
+
+ const size_t dropIndex = std::min(PendDTUpdateIndex, PendPDTUpdateIndex);
+ const auto B = PendUpdates.begin();
+ const auto E = PendUpdates.begin() + dropIndex;
+ assert(B <= E && "Iterator out of range.");
+ PendUpdates.erase(B, E);
+ // Calculate current index.
+ PendDTUpdateIndex -= dropIndex;
+ PendPDTUpdateIndex -= dropIndex;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DomTreeUpdater::dump() const {
+ raw_ostream &OS = llvm::dbgs();
+
+ OS << "Available Trees: ";
+ if (DT || PDT) {
+ if (DT)
+ OS << "DomTree ";
+ if (PDT)
+ OS << "PostDomTree ";
+ OS << "\n";
+ } else
+ OS << "None\n";
+
+ OS << "UpdateStrategy: ";
+ if (Strategy == UpdateStrategy::Eager) {
+ OS << "Eager\n";
+ return;
+ } else
+ OS << "Lazy\n";
+ int Index = 0;
+
+ auto printUpdates =
+ [&](ArrayRef<DominatorTree::UpdateType>::const_iterator begin,
+ ArrayRef<DominatorTree::UpdateType>::const_iterator end) {
+ if (begin == end)
+ OS << " None\n";
+ Index = 0;
+ for (auto It = begin, ItEnd = end; It != ItEnd; ++It) {
+ auto U = *It;
+ OS << " " << Index << " : ";
+ ++Index;
+ if (U.getKind() == DominatorTree::Insert)
+ OS << "Insert, ";
+ else
+ OS << "Delete, ";
+ BasicBlock *From = U.getFrom();
+ if (From) {
+ auto S = From->getName();
+ if (!From->hasName())
+ S = "(no name)";
+ OS << S << "(" << From << "), ";
+ } else {
+ OS << "(badref), ";
+ }
+ BasicBlock *To = U.getTo();
+ if (To) {
+ auto S = To->getName();
+ if (!To->hasName())
+ S = "(no_name)";
+ OS << S << "(" << To << ")\n";
+ } else {
+ OS << "(badref)\n";
+ }
+ }
+ };
+
+ if (DT) {
+ const auto I = PendUpdates.begin() + PendDTUpdateIndex;
+ assert(PendUpdates.begin() <= I && I <= PendUpdates.end() &&
+ "Iterator out of range.");
+ OS << "Applied but not cleared DomTreeUpdates:\n";
+ printUpdates(PendUpdates.begin(), I);
+ OS << "Pending DomTreeUpdates:\n";
+ printUpdates(I, PendUpdates.end());
+ }
+
+ if (PDT) {
+ const auto I = PendUpdates.begin() + PendPDTUpdateIndex;
+ assert(PendUpdates.begin() <= I && I <= PendUpdates.end() &&
+ "Iterator out of range.");
+ OS << "Applied but not cleared PostDomTreeUpdates:\n";
+ printUpdates(PendUpdates.begin(), I);
+ OS << "Pending PostDomTreeUpdates:\n";
+ printUpdates(I, PendUpdates.end());
+ }
+
+ OS << "Pending DeletedBBs:\n";
+ Index = 0;
+ for (auto BB : DeletedBBs) {
+ OS << " " << Index << " : ";
+ ++Index;
+ if (BB->hasName())
+ OS << BB->getName() << "(";
+ else
+ OS << "(no_name)(";
+ OS << BB << ")\n";
+ }
+
+ OS << "Pending Callbacks:\n";
+ Index = 0;
+ for (auto BB : Callbacks) {
+ OS << " " << Index << " : ";
+ ++Index;
+ if (BB->hasName())
+ OS << BB->getName() << "(";
+ else
+ OS << "(no_name)(";
+ OS << BB << ")\n";
+ }
+}
+#endif
+} // namespace llvm
diff --git a/contrib/llvm/lib/IR/Dominators.cpp b/contrib/llvm/lib/IR/Dominators.cpp
index ad448a3f240c..d8971e05f476 100644
--- a/contrib/llvm/lib/IR/Dominators.cpp
+++ b/contrib/llvm/lib/IR/Dominators.cpp
@@ -17,7 +17,9 @@
#include "llvm/IR/Dominators.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
@@ -27,16 +29,17 @@
#include <algorithm>
using namespace llvm;
-// Always verify dominfo if expensive checking is enabled.
-#ifdef EXPENSIVE_CHECKS
-bool llvm::VerifyDomInfo = true;
-#else
bool llvm::VerifyDomInfo = false;
-#endif
static cl::opt<bool, true>
VerifyDomInfoX("verify-dom-info", cl::location(VerifyDomInfo), cl::Hidden,
cl::desc("Verify dominator info (time consuming)"));
+#ifdef EXPENSIVE_CHECKS
+static constexpr bool ExpensiveChecksEnabled = true;
+#else
+static constexpr bool ExpensiveChecksEnabled = false;
+#endif
+
bool BasicBlockEdge::isSingleEdge() const {
const TerminatorInst *TI = Start->getTerminator();
unsigned NumEdgesToEnd = 0;
@@ -87,9 +90,11 @@ template void llvm::DomTreeBuilder::ApplyUpdates<DomTreeBuilder::BBPostDomTree>(
DomTreeBuilder::BBPostDomTree &DT, DomTreeBuilder::BBUpdates);
template bool llvm::DomTreeBuilder::Verify<DomTreeBuilder::BBDomTree>(
- const DomTreeBuilder::BBDomTree &DT);
+ const DomTreeBuilder::BBDomTree &DT,
+ DomTreeBuilder::BBDomTree::VerificationLevel VL);
template bool llvm::DomTreeBuilder::Verify<DomTreeBuilder::BBPostDomTree>(
- const DomTreeBuilder::BBPostDomTree &DT);
+ const DomTreeBuilder::BBPostDomTree &DT,
+ DomTreeBuilder::BBPostDomTree::VerificationLevel VL);
bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &) {
@@ -302,31 +307,6 @@ bool DominatorTree::isReachableFromEntry(const Use &U) const {
return isReachableFromEntry(I->getParent());
}
-void DominatorTree::verifyDomTree() const {
- // Perform the expensive checks only when VerifyDomInfo is set.
- if (VerifyDomInfo && !verify()) {
- errs() << "\n~~~~~~~~~~~\n\t\tDomTree verification failed!\n~~~~~~~~~~~\n";
- print(errs());
- abort();
- }
-
- Function &F = *getRoot()->getParent();
-
- DominatorTree OtherDT;
- OtherDT.recalculate(F);
- if (compare(OtherDT)) {
- errs() << "DominatorTree for function " << F.getName()
- << " is not up to date!\nComputed:\n";
- print(errs());
- errs() << "\nActual:\n";
- OtherDT.print(errs());
- errs() << "\nCFG:\n";
- F.print(errs());
- errs().flush();
- abort();
- }
-}
-
//===----------------------------------------------------------------------===//
// DominatorTreeAnalysis and related pass implementations
//===----------------------------------------------------------------------===//
@@ -357,8 +337,9 @@ PreservedAnalyses DominatorTreePrinterPass::run(Function &F,
PreservedAnalyses DominatorTreeVerifierPass::run(Function &F,
FunctionAnalysisManager &AM) {
- AM.getResult<DominatorTreeAnalysis>(F).verifyDomTree();
-
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ assert(DT.verify());
+ (void)DT;
return PreservedAnalyses::all();
}
@@ -381,11 +362,203 @@ bool DominatorTreeWrapperPass::runOnFunction(Function &F) {
}
void DominatorTreeWrapperPass::verifyAnalysis() const {
- if (VerifyDomInfo)
- DT.verifyDomTree();
+ if (VerifyDomInfo)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full));
+ else if (ExpensiveChecksEnabled)
+ assert(DT.verify(DominatorTree::VerificationLevel::Basic));
}
void DominatorTreeWrapperPass::print(raw_ostream &OS, const Module *) const {
DT.print(OS);
}
+//===----------------------------------------------------------------------===//
+// DeferredDominance Implementation
+//===----------------------------------------------------------------------===//
+//
+// The implementation details of the DeferredDominance class which allows
+// one to queue updates to a DominatorTree.
+//
+//===----------------------------------------------------------------------===//
+
+/// Queues multiple updates and discards duplicates.
+void DeferredDominance::applyUpdates(
+ ArrayRef<DominatorTree::UpdateType> Updates) {
+ SmallVector<DominatorTree::UpdateType, 8> Seen;
+ for (auto U : Updates)
+ // Avoid passing duplicates to applyUpdate() to save analysis work.
+ if (std::none_of(Seen.begin(), Seen.end(),
+ [U](DominatorTree::UpdateType S) { return S == U; })) {
+ Seen.push_back(U);
+ applyUpdate(U.getKind(), U.getFrom(), U.getTo());
+ }
+}
+
+/// Helper method for a single edge insertion. It's almost always better
+/// to batch updates and call applyUpdates to quickly remove duplicate edges.
+/// This is best used when there is only a single insertion needed to update
+/// Dominators.
+void DeferredDominance::insertEdge(BasicBlock *From, BasicBlock *To) {
+ applyUpdate(DominatorTree::Insert, From, To);
+}
+
+/// Helper method for a single edge deletion. It's almost always better
+/// to batch updates and call applyUpdates to quickly remove duplicate edges.
+/// This is best used when there is only a single deletion needed to update
+/// Dominators.
+void DeferredDominance::deleteEdge(BasicBlock *From, BasicBlock *To) {
+ applyUpdate(DominatorTree::Delete, From, To);
+}
+
+/// Delays the deletion of a basic block until a flush() event.
+void DeferredDominance::deleteBB(BasicBlock *DelBB) {
+ assert(DelBB && "Invalid push_back of nullptr DelBB.");
+ assert(pred_empty(DelBB) && "DelBB has one or more predecessors.");
+ // DelBB is unreachable and all its instructions are dead.
+ while (!DelBB->empty()) {
+ Instruction &I = DelBB->back();
+ // Replace used instructions with an arbitrary value (undef).
+ if (!I.use_empty())
+ I.replaceAllUsesWith(llvm::UndefValue::get(I.getType()));
+ DelBB->getInstList().pop_back();
+ }
+ // Make sure DelBB has a valid terminator instruction. As long as DelBB is a
+ // Child of Function F it must contain valid IR.
+ new UnreachableInst(DelBB->getContext(), DelBB);
+ DeletedBBs.insert(DelBB);
+}
+
+/// Returns true if DelBB is awaiting deletion at a flush() event.
+bool DeferredDominance::pendingDeletedBB(BasicBlock *DelBB) {
+ if (DeletedBBs.empty())
+ return false;
+ return DeletedBBs.count(DelBB) != 0;
+}
+
+/// Returns true if pending DT updates are queued for a flush() event.
+bool DeferredDominance::pending() { return !PendUpdates.empty(); }
+
+/// Flushes all pending updates and block deletions. Returns a
+/// correct DominatorTree reference to be used by the caller for analysis.
+DominatorTree &DeferredDominance::flush() {
+ // Updates to DT must happen before blocks are deleted below. Otherwise the
+ // DT traversal will encounter badref blocks and assert.
+ if (!PendUpdates.empty()) {
+ DT.applyUpdates(PendUpdates);
+ PendUpdates.clear();
+ }
+ flushDelBB();
+ return DT;
+}
+
+/// Drops all internal state and forces a (slow) recalculation of the
+/// DominatorTree based on the current state of the LLVM IR in F. This should
+/// only be used in corner cases such as the Entry block of F being deleted.
+void DeferredDominance::recalculate(Function &F) {
+ // Pending block deletions (flushDelBB) must be performed before the
+ // recalculation: the IR must be in a consistent state before the DT
+ // traversal algorithm computes the actual DT.
+ if (flushDelBB() || !PendUpdates.empty()) {
+ DT.recalculate(F);
+ PendUpdates.clear();
+ }
+}
+
+/// Debug method to help view the state of pending updates.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DeferredDominance::dump() const {
+ raw_ostream &OS = llvm::dbgs();
+ OS << "PendUpdates:\n";
+ int I = 0;
+ for (auto U : PendUpdates) {
+ OS << " " << I << " : ";
+ ++I;
+ if (U.getKind() == DominatorTree::Insert)
+ OS << "Insert, ";
+ else
+ OS << "Delete, ";
+ BasicBlock *From = U.getFrom();
+ if (From) {
+ auto S = From->getName();
+ if (!From->hasName())
+ S = "(no name)";
+ OS << S << "(" << From << "), ";
+ } else {
+ OS << "(badref), ";
+ }
+ BasicBlock *To = U.getTo();
+ if (To) {
+ auto S = To->getName();
+ if (!To->hasName())
+ S = "(no_name)";
+ OS << S << "(" << To << ")\n";
+ } else {
+ OS << "(badref)\n";
+ }
+ }
+ OS << "DeletedBBs:\n";
+ I = 0;
+ for (auto BB : DeletedBBs) {
+ OS << " " << I << " : ";
+ ++I;
+ if (BB->hasName())
+ OS << BB->getName() << "(";
+ else
+ OS << "(no_name)(";
+ OS << BB << ")\n";
+ }
+}
+#endif
+
+/// Apply an update (Kind, From, To) to the internal queued updates. The
+/// update is only added when determined to be necessary. Checks for
+/// self-domination, unnecessary updates, duplicate requests, and balanced
+/// pairs of requests are all performed. Returns true if the update is
+/// queued and false if it is discarded.
+bool DeferredDominance::applyUpdate(DominatorTree::UpdateKind Kind,
+ BasicBlock *From, BasicBlock *To) {
+ if (From == To)
+ return false; // Cannot dominate self; discard update.
+
+ // Discard updates by inspecting the current state of successors of From.
+ // Since applyUpdate() must be called *after* the Terminator of From is
+ // altered we can determine if the update is unnecessary.
+ bool HasEdge = std::any_of(succ_begin(From), succ_end(From),
+ [To](BasicBlock *B) { return B == To; });
+ if (Kind == DominatorTree::Insert && !HasEdge)
+ return false; // Unnecessary Insert: edge does not exist in IR.
+ if (Kind == DominatorTree::Delete && HasEdge)
+ return false; // Unnecessary Delete: edge still exists in IR.
+
+ // Analyze pending updates to determine if the update is unnecessary.
+ DominatorTree::UpdateType Update = {Kind, From, To};
+ DominatorTree::UpdateType Invert = {Kind != DominatorTree::Insert
+ ? DominatorTree::Insert
+ : DominatorTree::Delete,
+ From, To};
+ for (auto I = PendUpdates.begin(), E = PendUpdates.end(); I != E; ++I) {
+ if (Update == *I)
+ return false; // Discard duplicate updates.
+ if (Invert == *I) {
+ // Update and Invert are both valid (equivalent to a no-op). Remove
+ // Invert from PendUpdates and discard the Update.
+ PendUpdates.erase(I);
+ return false;
+ }
+ }
+ PendUpdates.push_back(Update); // Save the valid update.
+ return true;
+}
+
+/// Performs all pending basic block deletions. We have to defer the deletion
+/// of these blocks until after the DominatorTree updates are applied. The
+/// internal workings of the DominatorTree code expect every update's From
+/// and To blocks to exist and to be a member of the same Function.
+bool DeferredDominance::flushDelBB() {
+ if (DeletedBBs.empty())
+ return false;
+ for (auto *BB : DeletedBBs)
+ BB->eraseFromParent();
+ DeletedBBs.clear();
+ return true;
+}
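// Illustrative sketch (editorial, not part of this change): how a transform
// might queue dominator updates through DeferredDominance and only pay for a
// DT update when the tree is needed again. The pass-specific values DT, From,
// To and DeadBB are assumed to exist in the caller.
static void exampleDeferredDominanceUse(DominatorTree &DT, BasicBlock *From,
                                        BasicBlock *To, BasicBlock *DeadBB) {
  DeferredDominance DDT(DT);
  // CFG edits happen first; the corresponding updates are merely queued.
  DDT.deleteEdge(From, To);
  DDT.deleteBB(DeadBB); // DeadBB now ends in 'unreachable'; erased at flush
  // ... more rewrites and queued updates ...
  DominatorTree &UpToDate = DDT.flush(); // applies updates, then erases DeadBB
  (void)UpToDate;
}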
diff --git a/contrib/llvm/lib/IR/Function.cpp b/contrib/llvm/lib/IR/Function.cpp
index 7063f6f40a30..aba329b80508 100644
--- a/contrib/llvm/lib/IR/Function.cpp
+++ b/contrib/llvm/lib/IR/Function.cpp
@@ -21,7 +21,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -56,6 +55,7 @@
#include <string>
using namespace llvm;
+using ProfileCount = Function::ProfileCount;
// Explicit instantiations of SymbolTableListTraits since some of the methods
// are not in the public header file...
@@ -79,7 +79,8 @@ bool Argument::hasNonNullAttr() const {
if (getParent()->hasParamAttribute(getArgNo(), Attribute::NonNull))
return true;
else if (getDereferenceableBytes() > 0 &&
- getType()->getPointerAddressSpace() == 0)
+ !NullPointerIsDefined(getParent(),
+ getType()->getPointerAddressSpace()))
return true;
return false;
}
@@ -194,6 +195,14 @@ LLVMContext &Function::getContext() const {
return getType()->getContext();
}
+unsigned Function::getInstructionCount() {
+ unsigned NumInstrs = 0;
+ for (BasicBlock &BB : BasicBlocks)
+ NumInstrs += std::distance(BB.instructionsWithoutDebug().begin(),
+ BB.instructionsWithoutDebug().end());
+ return NumInstrs;
+}
+
void Function::removeFromParent() {
getParent()->getFunctionList().remove(getIterator());
}
@@ -479,13 +488,13 @@ void Function::copyAttributesFrom(const Function *Src) {
static const char * const IntrinsicNameTable[] = {
"not_intrinsic",
#define GET_INTRINSIC_NAME_TABLE
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_INTRINSIC_NAME_TABLE
};
/// Table of per-target intrinsic name tables.
#define GET_INTRINSIC_TARGET_DATA
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_INTRINSIC_TARGET_DATA
/// Find the segment of \c IntrinsicNameTable for intrinsics with the same
@@ -508,7 +517,7 @@ static ArrayRef<const char *> findTargetSubtable(StringRef Name) {
return makeArrayRef(&IntrinsicNameTable[1] + TI.Offset, TI.Count);
}
-/// \brief This does the actual lookup of an intrinsic ID which
+/// This does the actual lookup of an intrinsic ID which
/// matches the given function name.
Intrinsic::ID Function::lookupIntrinsicID(StringRef Name) {
ArrayRef<const char *> NameTable = findTargetSubtable(Name);
@@ -522,9 +531,11 @@ Intrinsic::ID Function::lookupIntrinsicID(StringRef Name) {
Intrinsic::ID ID = static_cast<Intrinsic::ID>(Idx + Adjust);
// If the intrinsic is not overloaded, require an exact match. If it is
- // overloaded, require a prefix match.
- bool IsPrefixMatch = Name.size() > strlen(NameTable[Idx]);
- return IsPrefixMatch == isOverloaded(ID) ? ID : Intrinsic::not_intrinsic;
+ // overloaded, require either exact or prefix match.
+ const auto MatchSize = strlen(NameTable[Idx]);
+ assert(Name.size() >= MatchSize && "Expected either exact or prefix match");
+ bool IsExactMatch = Name.size() == MatchSize;
+ return IsExactMatch || isOverloaded(ID) ? ID : Intrinsic::not_intrinsic;
}
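// Editorial note (not part of this change): the table stores unmangled names,
// so the overloaded entry "llvm.memcpy" now accepts both the bare name and
// mangled uses such as "llvm.memcpy.p0i8.p0i8.i64", while a non-overloaded
// intrinsic such as "llvm.va_start" must still match exactly.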
void Function::recalculateIntrinsicID() {
@@ -548,10 +559,7 @@ void Function::recalculateIntrinsicID() {
/// which can't be confused with its prefix. This ensures we don't have
/// collisions between two unrelated function types. Otherwise, you might
/// parse ffXX as f(fXX) or f(fX)X. (X is a placeholder for any other type.)
-/// Manglings of integers, floats, and vectors ('i', 'f', and 'v' prefix in most
-/// cases) fall back to the MVT codepath, where they could be mangled to
-/// 'x86mmx', for example; matching on derived types is not sufficient to mangle
-/// everything.
+///
static std::string getMangledTypeStr(Type* Ty) {
std::string Result;
if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
@@ -579,11 +587,26 @@ static std::string getMangledTypeStr(Type* Ty) {
Result += "vararg";
// Ensure nested function types are distinguishable.
Result += "f";
- } else if (isa<VectorType>(Ty))
+ } else if (isa<VectorType>(Ty)) {
Result += "v" + utostr(Ty->getVectorNumElements()) +
getMangledTypeStr(Ty->getVectorElementType());
- else if (Ty)
- Result += EVT::getEVT(Ty).getEVTString();
+ } else if (Ty) {
+ switch (Ty->getTypeID()) {
+ default: llvm_unreachable("Unhandled type");
+ case Type::VoidTyID: Result += "isVoid"; break;
+ case Type::MetadataTyID: Result += "Metadata"; break;
+ case Type::HalfTyID: Result += "f16"; break;
+ case Type::FloatTyID: Result += "f32"; break;
+ case Type::DoubleTyID: Result += "f64"; break;
+ case Type::X86_FP80TyID: Result += "f80"; break;
+ case Type::FP128TyID: Result += "f128"; break;
+ case Type::PPC_FP128TyID: Result += "ppcf128"; break;
+ case Type::X86_MMXTyID: Result += "x86mmx"; break;
+ case Type::IntegerTyID:
+ Result += "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
+ break;
+ }
+ }
return Result;
}
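// Editorial worked example (not part of this change): with the explicit switch
// above, overload suffixes are produced directly from the IR types, e.g.
//   i64        -> "i64"
//   fp128      -> "f128"
//   <4 x i32>  -> "v4i32"
// so an overloaded intrinsic such as llvm.ctlz on i64 is still named
// "llvm.ctlz.i64", without going through EVT::getEVT().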
@@ -651,7 +674,8 @@ enum IIT_Info {
IIT_V1024 = 37,
IIT_STRUCT6 = 38,
IIT_STRUCT7 = 39,
- IIT_STRUCT8 = 40
+ IIT_STRUCT8 = 40,
+ IIT_F128 = 41
};
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
@@ -686,6 +710,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
case IIT_F64:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Double, 0));
return;
+ case IIT_F128:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Quad, 0));
+ return;
case IIT_I1:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 1));
return;
@@ -818,7 +845,7 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
}
#define GET_INTRINSIC_GENERATOR_GLOBAL
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_INTRINSIC_GENERATOR_GLOBAL
void Intrinsic::getIntrinsicInfoTableEntries(ID id,
@@ -870,6 +897,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::Half: return Type::getHalfTy(Context);
case IITDescriptor::Float: return Type::getFloatTy(Context);
case IITDescriptor::Double: return Type::getDoubleTy(Context);
+ case IITDescriptor::Quad: return Type::getFP128Ty(Context);
case IITDescriptor::Integer:
return IntegerType::get(Context, D.Integer_Width);
@@ -955,7 +983,7 @@ FunctionType *Intrinsic::getType(LLVMContext &Context,
bool Intrinsic::isOverloaded(ID id) {
#define GET_INTRINSIC_OVERLOAD_TABLE
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_INTRINSIC_OVERLOAD_TABLE
}
@@ -973,7 +1001,7 @@ bool Intrinsic::isLeaf(ID id) {
/// This defines the "Intrinsic::getAttributes(ID id)" method.
#define GET_INTRINSIC_ATTRIBUTES
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_INTRINSIC_ATTRIBUTES
Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) {
@@ -986,12 +1014,12 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) {
// This defines the "Intrinsic::getIntrinsicForGCCBuiltin()" method.
#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
// This defines the "Intrinsic::getIntrinsicForMSBuiltin()" method.
#define GET_LLVM_INTRINSIC_FOR_MS_BUILTIN
-#include "llvm/IR/Intrinsics.gen"
+#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN
bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
@@ -1012,6 +1040,7 @@ bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor>
case IITDescriptor::Half: return !Ty->isHalfTy();
case IITDescriptor::Float: return !Ty->isFloatTy();
case IITDescriptor::Double: return !Ty->isDoubleTy();
+ case IITDescriptor::Quad: return !Ty->isFP128Ty();
case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width);
case IITDescriptor::Vector: {
VectorType *VT = dyn_cast<VectorType>(Ty);
@@ -1320,26 +1349,43 @@ void Function::setValueSubclassDataBit(unsigned Bit, bool On) {
setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit));
}
-void Function::setEntryCount(uint64_t Count,
+void Function::setEntryCount(ProfileCount Count,
const DenseSet<GlobalValue::GUID> *S) {
+ assert(Count.hasValue());
+#if !defined(NDEBUG)
+ auto PrevCount = getEntryCount();
+ assert(!PrevCount.hasValue() || PrevCount.getType() == Count.getType());
+#endif
MDBuilder MDB(getContext());
- setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count, S));
+ setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createFunctionEntryCount(Count.getCount(), Count.isSynthetic(), S));
}
-Optional<uint64_t> Function::getEntryCount() const {
+void Function::setEntryCount(uint64_t Count, Function::ProfileCountType Type,
+ const DenseSet<GlobalValue::GUID> *Imports) {
+ setEntryCount(ProfileCount(Count, Type), Imports);
+}
+
+ProfileCount Function::getEntryCount() const {
MDNode *MD = getMetadata(LLVMContext::MD_prof);
if (MD && MD->getOperand(0))
- if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0)))
+ if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0))) {
if (MDS->getString().equals("function_entry_count")) {
ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(1));
uint64_t Count = CI->getValue().getZExtValue();
// A value of -1 is used for SamplePGO when there were no samples.
// Treat this the same as unknown.
if (Count == (uint64_t)-1)
- return None;
- return Count;
+ return ProfileCount::getInvalid();
+ return ProfileCount(Count, PCT_Real);
+ } else if (MDS->getString().equals("synthetic_function_entry_count")) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(1));
+ uint64_t Count = CI->getValue().getZExtValue();
+ return ProfileCount(Count, PCT_Synthetic);
}
- return None;
+ }
+ return ProfileCount::getInvalid();
}
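For illustration only (not part of the committed diff), a hedged sketch of the new ProfileCount-based entry-count API, using only names visible in this hunk; the count value and F are placeholders:

    F->setEntryCount(1000, Function::PCT_Real);      // emits the "function_entry_count" !prof node
    Function::ProfileCount PC = F->getEntryCount();
    if (PC.hasValue() && !PC.isSynthetic())
      errs() << F->getName() << " entry count: " << PC.getCount() << '\n';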
DenseSet<GlobalValue::GUID> Function::getImportGUIDs() const {
@@ -1362,11 +1408,27 @@ void Function::setSectionPrefix(StringRef Prefix) {
Optional<StringRef> Function::getSectionPrefix() const {
if (MDNode *MD = getMetadata(LLVMContext::MD_section_prefix)) {
- assert(dyn_cast<MDString>(MD->getOperand(0))
+ assert(cast<MDString>(MD->getOperand(0))
->getString()
.equals("function_section_prefix") &&
"Metadata not match");
- return dyn_cast<MDString>(MD->getOperand(1))->getString();
+ return cast<MDString>(MD->getOperand(1))->getString();
}
return None;
}
+
+bool Function::nullPointerIsDefined() const {
+ return getFnAttribute("null-pointer-is-valid")
+ .getValueAsString()
+ .equals("true");
+}
+
+bool llvm::NullPointerIsDefined(const Function *F, unsigned AS) {
+ if (F && F->nullPointerIsDefined())
+ return true;
+
+ if (AS != 0)
+ return true;
+
+ return false;
+}
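For illustration only (not part of the committed diff), a hedged sketch of how a caller might use the new helper; LI is assumed to be a LoadInst* in scope:

    bool NullMayBeValid =
        NullPointerIsDefined(LI->getFunction(), LI->getPointerAddressSpace());
    // True for any non-zero address space, or in address space 0 when the function
    // carries "null-pointer-is-valid"="true"; otherwise folding on null is permitted.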
diff --git a/contrib/llvm/lib/IR/Globals.cpp b/contrib/llvm/lib/IR/Globals.cpp
index da1b6c5e0c91..20b2334a626f 100644
--- a/contrib/llvm/lib/IR/Globals.cpp
+++ b/contrib/llvm/lib/IR/Globals.cpp
@@ -281,6 +281,24 @@ Optional<ConstantRange> GlobalValue::getAbsoluteSymbolRange() const {
return getConstantRangeFromMetadata(*MD);
}
+bool GlobalValue::canBeOmittedFromSymbolTable() const {
+ if (!hasLinkOnceODRLinkage())
+ return false;
+
+ // We assume that anyone who sets global unnamed_addr on a non-constant
+ // knows what they're doing.
+ if (hasGlobalUnnamedAddr())
+ return true;
+
+ // If it is a non constant variable, it needs to be uniqued across shared
+ // objects.
+ if (auto *Var = dyn_cast<GlobalVariable>(this))
+ if (!Var->isConstant())
+ return false;
+
+ return hasAtLeastLocalUnnamedAddr();
+}
+
//===----------------------------------------------------------------------===//
// GlobalVariable Implementation
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/IR/IRBuilder.cpp b/contrib/llvm/lib/IR/IRBuilder.cpp
index 027c0255bcec..405a56bfb31d 100644
--- a/contrib/llvm/lib/IR/IRBuilder.cpp
+++ b/contrib/llvm/lib/IR/IRBuilder.cpp
@@ -1,4 +1,4 @@
-//===---- IRBuilder.cpp - Builder for LLVM Instrs -------------------------===//
+//===- IRBuilder.cpp - Builder for LLVM Instrs ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,11 +13,27 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/IRBuilder.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
using namespace llvm;
/// CreateGlobalString - Make a new global variable with an initializer that
@@ -29,11 +45,10 @@ GlobalVariable *IRBuilderBase::CreateGlobalString(StringRef Str,
unsigned AddressSpace) {
Constant *StrConstant = ConstantDataArray::getString(Context, Str);
Module &M = *BB->getParent()->getParent();
- GlobalVariable *GV = new GlobalVariable(M, StrConstant->getType(),
- true, GlobalValue::PrivateLinkage,
- StrConstant, Name, nullptr,
- GlobalVariable::NotThreadLocal,
- AddressSpace);
+ auto *GV = new GlobalVariable(M, StrConstant->getType(), true,
+ GlobalValue::PrivateLinkage, StrConstant, Name,
+ nullptr, GlobalVariable::NotThreadLocal,
+ AddressSpace);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
return GV;
}
@@ -44,10 +59,10 @@ Type *IRBuilderBase::getCurrentFunctionReturnType() const {
}
Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) {
- PointerType *PT = cast<PointerType>(Ptr->getType());
+ auto *PT = cast<PointerType>(Ptr->getType());
if (PT->getElementType()->isIntegerTy(8))
return Ptr;
-
+
// Otherwise, we need to insert a bitcast.
PT = getInt8PtrTy(PT->getAddressSpace());
BitCastInst *BCI = new BitCastInst(Ptr, PT, "");
@@ -58,11 +73,14 @@ Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) {
static CallInst *createCallHelper(Value *Callee, ArrayRef<Value *> Ops,
IRBuilderBase *Builder,
- const Twine& Name="") {
+ const Twine &Name = "",
+ Instruction *FMFSource = nullptr) {
CallInst *CI = CallInst::Create(Callee, Ops, Name);
+ if (FMFSource)
+ CI->copyFastMathFlags(FMFSource);
Builder->GetInsertBlock()->getInstList().insert(Builder->GetInsertPoint(),CI);
Builder->SetInstDebugLocation(CI);
- return CI;
+ return CI;
}
static InvokeInst *createInvokeHelper(Value *Invokee, BasicBlock *NormalDest,
@@ -83,40 +101,81 @@ CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
MDNode *NoAliasTag) {
Ptr = getCastedInt8PtrValue(Ptr);
- Value *Ops[] = { Ptr, Val, Size, getInt32(Align), getInt1(isVolatile) };
+ Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)};
Type *Tys[] = { Ptr->getType(), Size->getType() };
Module *M = BB->getParent()->getParent();
Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
-
+
+ CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+ if (Align > 0)
+ cast<MemSetInst>(CI)->setDestAlignment(Align);
+
+ // Set the TBAA info if present.
+ if (TBAATag)
+ CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+ if (ScopeTag)
+ CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+ if (NoAliasTag)
+ CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
+ return CI;
+}
+
+CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(
+ Value *Ptr, Value *Val, Value *Size, unsigned Align, uint32_t ElementSize,
+ MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) {
+ assert(Align >= ElementSize &&
+ "Pointer alignment must be at least element size.");
+
+ Ptr = getCastedInt8PtrValue(Ptr);
+ Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)};
+ Type *Tys[] = {Ptr->getType(), Size->getType()};
+ Module *M = BB->getParent()->getParent();
+ Value *TheFn = Intrinsic::getDeclaration(
+ M, Intrinsic::memset_element_unordered_atomic, Tys);
+
CallInst *CI = createCallHelper(TheFn, Ops, this);
-
+
+ cast<AtomicMemSetInst>(CI)->setDestAlignment(Align);
+
// Set the TBAA info if present.
if (TBAATag)
CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
if (ScopeTag)
CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
-
+
if (NoAliasTag)
CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
-
+
return CI;
}
CallInst *IRBuilderBase::
-CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
- bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag,
- MDNode *ScopeTag, MDNode *NoAliasTag) {
+CreateMemCpy(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign,
+ Value *Size, bool isVolatile, MDNode *TBAATag,
+ MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) {
+ assert((DstAlign == 0 || isPowerOf2_32(DstAlign)) && "Must be 0 or a power of 2");
+ assert((SrcAlign == 0 || isPowerOf2_32(SrcAlign)) && "Must be 0 or a power of 2");
Dst = getCastedInt8PtrValue(Dst);
Src = getCastedInt8PtrValue(Src);
- Value *Ops[] = { Dst, Src, Size, getInt32(Align), getInt1(isVolatile) };
+ Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
Module *M = BB->getParent()->getParent();
Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
-
+
CallInst *CI = createCallHelper(TheFn, Ops, this);
-
+
+ auto* MCI = cast<MemCpyInst>(CI);
+ if (DstAlign > 0)
+ MCI->setDestAlignment(DstAlign);
+ if (SrcAlign > 0)
+ MCI->setSourceAlignment(SrcAlign);
+
// Set the TBAA info if present.
if (TBAATag)
CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
@@ -124,14 +183,14 @@ CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
// Set the TBAA Struct info if present.
if (TBAAStructTag)
CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
-
+
if (ScopeTag)
CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
-
+
if (NoAliasTag)
CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
-
- return CI;
+
+ return CI;
}
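For illustration only (not part of the committed diff): the alignment is no longer an explicit operand of the memcpy intrinsic; it is attached to each pointer via setDestAlignment/setSourceAlignment, so callers now pass separate destination and source alignments. A hedged call-site sketch, with Dst, Src, Len, and InsertPt assumed in scope and the alignment values purely illustrative:

    IRBuilder<> B(InsertPt);
    B.CreateMemCpy(Dst, /*DstAlign=*/4, Src, /*SrcAlign=*/1, B.getInt64(Len),
                   /*isVolatile=*/false);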
CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(
@@ -154,8 +213,9 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(
CallInst *CI = createCallHelper(TheFn, Ops, this);
// Set the alignment of the pointer args.
- CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), DstAlign));
- CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), SrcAlign));
+ auto *AMCI = cast<AtomicMemCpyInst>(CI);
+ AMCI->setDestAlignment(DstAlign);
+ AMCI->setSourceAlignment(SrcAlign);
// Set the TBAA info if present.
if (TBAATag)
@@ -175,30 +235,78 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(
}
CallInst *IRBuilderBase::
-CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
- bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
+CreateMemMove(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign,
+ Value *Size, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
MDNode *NoAliasTag) {
+ assert((DstAlign == 0 || isPowerOf2_32(DstAlign)) && "Must be 0 or a power of 2");
+ assert((SrcAlign == 0 || isPowerOf2_32(SrcAlign)) && "Must be 0 or a power of 2");
Dst = getCastedInt8PtrValue(Dst);
Src = getCastedInt8PtrValue(Src);
-
- Value *Ops[] = { Dst, Src, Size, getInt32(Align), getInt1(isVolatile) };
+
+ Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
Module *M = BB->getParent()->getParent();
Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memmove, Tys);
-
+
CallInst *CI = createCallHelper(TheFn, Ops, this);
-
+
+ auto *MMI = cast<MemMoveInst>(CI);
+ if (DstAlign > 0)
+ MMI->setDestAlignment(DstAlign);
+ if (SrcAlign > 0)
+ MMI->setSourceAlignment(SrcAlign);
+
// Set the TBAA info if present.
if (TBAATag)
CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-
+
if (ScopeTag)
CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
-
+
if (NoAliasTag)
CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
-
- return CI;
+
+ return CI;
+}
+
+CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove(
+ Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size,
+ uint32_t ElementSize, MDNode *TBAATag, MDNode *TBAAStructTag,
+ MDNode *ScopeTag, MDNode *NoAliasTag) {
+ assert(DstAlign >= ElementSize &&
+ "Pointer alignment must be at least element size");
+ assert(SrcAlign >= ElementSize &&
+ "Pointer alignment must be at least element size");
+ Dst = getCastedInt8PtrValue(Dst);
+ Src = getCastedInt8PtrValue(Src);
+
+ Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)};
+ Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
+ Module *M = BB->getParent()->getParent();
+ Value *TheFn = Intrinsic::getDeclaration(
+ M, Intrinsic::memmove_element_unordered_atomic, Tys);
+
+ CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+ // Set the alignment of the pointer args.
+ CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), DstAlign));
+ CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), SrcAlign));
+
+ // Set the TBAA info if present.
+ if (TBAATag)
+ CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+ // Set the TBAA Struct info if present.
+ if (TBAAStructTag)
+ CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
+
+ if (ScopeTag)
+ CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+ if (NoAliasTag)
+ CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
+ return CI;
}
static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
@@ -351,7 +459,7 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
return createCallHelper(FnAssume, Ops, this);
}
-/// \brief Create a call to a Masked Load intrinsic.
+/// Create a call to a Masked Load intrinsic.
/// \p Ptr - base pointer for the load
/// \p Align - alignment of the source location
/// \p Mask - vector of booleans which indicates what vector lanes should
@@ -362,7 +470,7 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
Value *Mask, Value *PassThru,
const Twine &Name) {
- PointerType *PtrTy = cast<PointerType>(Ptr->getType());
+ auto *PtrTy = cast<PointerType>(Ptr->getType());
Type *DataTy = PtrTy->getElementType();
assert(DataTy->isVectorTy() && "Ptr should point to a vector");
assert(Mask && "Mask should not be all-ones (null)");
@@ -374,7 +482,7 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
OverloadedTypes, Name);
}
-/// \brief Create a call to a Masked Store intrinsic.
+/// Create a call to a Masked Store intrinsic.
/// \p Val - data to be stored,
/// \p Ptr - base pointer for the store
/// \p Align - alignment of the destination location
@@ -382,7 +490,7 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
unsigned Align, Value *Mask) {
- PointerType *PtrTy = cast<PointerType>(Ptr->getType());
+ auto *PtrTy = cast<PointerType>(Ptr->getType());
Type *DataTy = PtrTy->getElementType();
assert(DataTy->isVectorTy() && "Ptr should point to a vector");
assert(Mask && "Mask should not be all-ones (null)");
@@ -403,7 +511,7 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
return createCallHelper(TheFn, Ops, this, Name);
}
-/// \brief Create a call to a Masked Gather intrinsic.
+/// Create a call to a Masked Gather intrinsic.
/// \p Ptrs - vector of pointers for loading
/// \p Align - alignment for one element
/// \p Mask - vector of booleans which indicates what vector lanes should
@@ -435,7 +543,7 @@ CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
Name);
}
-/// \brief Create a call to a Masked Scatter intrinsic.
+/// Create a call to a Masked Scatter intrinsic.
/// \p Data - data to be stored,
/// \p Ptrs - the vector of pointers, where the \p Data elements should be
/// stored
@@ -496,7 +604,7 @@ static CallInst *CreateGCStatepointCallCommon(
ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs,
const Twine &Name) {
// Extract out the type of the callee.
- PointerType *FuncPtrType = cast<PointerType>(ActualCallee->getType());
+ auto *FuncPtrType = cast<PointerType>(ActualCallee->getType());
assert(isa<FunctionType>(FuncPtrType->getElementType()) &&
"actual callee must be a callable value");
@@ -507,7 +615,7 @@ static CallInst *CreateGCStatepointCallCommon(
Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint,
ArgTypes);
- std::vector<llvm::Value *> Args =
+ std::vector<Value *> Args =
getStatepointArgs(*Builder, ID, NumPatchBytes, ActualCallee, Flags,
CallArgs, TransitionArgs, DeoptArgs, GCArgs);
return createCallHelper(FnStatepoint, Args, Builder, Name);
@@ -547,7 +655,7 @@ static InvokeInst *CreateGCStatepointInvokeCommon(
uint32_t Flags, ArrayRef<T0> InvokeArgs, ArrayRef<T1> TransitionArgs,
ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs, const Twine &Name) {
// Extract out the type of the callee.
- PointerType *FuncPtrType = cast<PointerType>(ActualInvokee->getType());
+ auto *FuncPtrType = cast<PointerType>(ActualInvokee->getType());
assert(isa<FunctionType>(FuncPtrType->getElementType()) &&
"actual callee must be a callable value");
@@ -556,7 +664,7 @@ static InvokeInst *CreateGCStatepointInvokeCommon(
Function *FnStatepoint = Intrinsic::getDeclaration(
M, Intrinsic::experimental_gc_statepoint, {FuncPtrType});
- std::vector<llvm::Value *> Args =
+ std::vector<Value *> Args =
getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee, Flags,
InvokeArgs, TransitionArgs, DeoptArgs, GCArgs);
return createInvokeHelper(FnStatepoint, NormalDest, UnwindDest, Args, Builder,
@@ -625,7 +733,25 @@ CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint,
CallInst *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID,
Value *LHS, Value *RHS,
const Twine &Name) {
- Module *M = BB->getParent()->getParent();
- Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() });
+ Module *M = BB->getModule();
+ Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() });
return createCallHelper(Fn, { LHS, RHS }, this, Name);
}
+
+CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID,
+ Instruction *FMFSource,
+ const Twine &Name) {
+ Module *M = BB->getModule();
+ Function *Fn = Intrinsic::getDeclaration(M, ID);
+ return createCallHelper(Fn, {}, this, Name);
+}
+
+CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID,
+ ArrayRef<Value *> Args,
+ Instruction *FMFSource,
+ const Twine &Name) {
+ assert(!Args.empty() && "Expected at least one argument to intrinsic");
+ Module *M = BB->getModule();
+ Function *Fn = Intrinsic::getDeclaration(M, ID, { Args.front()->getType() });
+ return createCallHelper(Fn, Args, this, Name, FMFSource);
+}
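For illustration only (not part of the committed diff), a hedged sketch of the new CreateIntrinsic overload that threads fast-math flags from an existing instruction; X and FDiv are placeholders for a value and an FP instruction in scope:

    CallInst *Fabs =
        Builder.CreateIntrinsic(Intrinsic::fabs, {X}, /*FMFSource=*/FDiv, "fabs");
    // createCallHelper copies FDiv's fast-math flags onto the new call via FMFSource.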
diff --git a/contrib/llvm/lib/IR/IRPrintingPasses.cpp b/contrib/llvm/lib/IR/IRPrintingPasses.cpp
index 3b32814bed5c..befe1d9ffb1c 100644
--- a/contrib/llvm/lib/IR/IRPrintingPasses.cpp
+++ b/contrib/llvm/lib/IR/IRPrintingPasses.cpp
@@ -127,13 +127,13 @@ public:
char PrintModulePassWrapper::ID = 0;
INITIALIZE_PASS(PrintModulePassWrapper, "print-module",
- "Print module to stderr", false, false)
+ "Print module to stderr", false, true)
char PrintFunctionPassWrapper::ID = 0;
INITIALIZE_PASS(PrintFunctionPassWrapper, "print-function",
- "Print function to stderr", false, false)
+ "Print function to stderr", false, true)
char PrintBasicBlockPass::ID = 0;
INITIALIZE_PASS(PrintBasicBlockPass, "print-bb", "Print BB to stderr", false,
- false)
+ true)
ModulePass *llvm::createPrintModulePass(llvm::raw_ostream &OS,
const std::string &Banner,
@@ -150,3 +150,11 @@ BasicBlockPass *llvm::createPrintBasicBlockPass(llvm::raw_ostream &OS,
const std::string &Banner) {
return new PrintBasicBlockPass(OS, Banner);
}
+
+bool llvm::isIRPrintingPass(Pass *P) {
+ const char *PID = (const char*)P->getPassID();
+
+ return (PID == &PrintModulePassWrapper::ID)
+ || (PID == &PrintFunctionPassWrapper::ID)
+ || (PID == &PrintBasicBlockPass::ID);
+}
diff --git a/contrib/llvm/lib/IR/Instruction.cpp b/contrib/llvm/lib/IR/Instruction.cpp
index 5f2a6146ad81..508db9bcaf19 100644
--- a/contrib/llvm/lib/IR/Instruction.cpp
+++ b/contrib/llvm/lib/IR/Instruction.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
@@ -589,6 +590,18 @@ bool Instruction::mayThrow() const {
return isa<ResumeInst>(this);
}
+bool Instruction::isSafeToRemove() const {
+ return (!isa<CallInst>(this) || !this->mayHaveSideEffects()) &&
+ !isa<TerminatorInst>(this);
+}
+
+const Instruction *Instruction::getNextNonDebugInstruction() const {
+ for (const Instruction *I = getNextNode(); I; I = I->getNextNode())
+ if (!isa<DbgInfoIntrinsic>(I))
+ return I;
+ return nullptr;
+}
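For illustration only (not part of the committed diff), the intended usage pattern, assuming I is an Instruction* in scope:

    for (const Instruction *Cur = I->getNextNonDebugInstruction(); Cur;
         Cur = Cur->getNextNonDebugInstruction()) {
      // Cur is never a DbgInfoIntrinsic here.
    }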
+
bool Instruction::isAssociative() const {
unsigned Opcode = getOpcode();
if (isAssociative(Opcode))
@@ -597,7 +610,8 @@ bool Instruction::isAssociative() const {
switch (Opcode) {
case FMul:
case FAdd:
- return cast<FPMathOperator>(this)->isFast();
+ return cast<FPMathOperator>(this)->hasAllowReassoc() &&
+ cast<FPMathOperator>(this)->hasNoSignedZeros();
default:
return false;
}
diff --git a/contrib/llvm/lib/IR/Instructions.cpp b/contrib/llvm/lib/IR/Instructions.cpp
index 490fcbce7439..e0ad0d1ea1f1 100644
--- a/contrib/llvm/lib/IR/Instructions.cpp
+++ b/contrib/llvm/lib/IR/Instructions.cpp
@@ -45,6 +45,22 @@
using namespace llvm;
//===----------------------------------------------------------------------===//
+// AllocaInst Class
+//===----------------------------------------------------------------------===//
+
+Optional<uint64_t>
+AllocaInst::getAllocationSizeInBits(const DataLayout &DL) const {
+ uint64_t Size = DL.getTypeAllocSizeInBits(getAllocatedType());
+ if (isArrayAllocation()) {
+ auto C = dyn_cast<ConstantInt>(getArraySize());
+ if (!C)
+ return None;
+ Size *= C->getZExtValue();
+ }
+ return Size;
+}
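For illustration only (not part of the committed diff), a hedged sketch showing that callers must handle the None case for dynamically sized allocas; AI and DL are assumed in scope:

    if (Optional<uint64_t> SizeInBits = AI->getAllocationSizeInBits(DL))
      errs() << "alloca occupies " << (*SizeInBits / 8) << " bytes\n";
    else
      errs() << "alloca has a non-constant array size\n";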
+
+//===----------------------------------------------------------------------===//
// CallSite Class
//===----------------------------------------------------------------------===//
@@ -319,31 +335,32 @@ void CallInst::init(Value *Func, const Twine &NameStr) {
setName(NameStr);
}
-CallInst::CallInst(Value *Func, const Twine &Name,
- Instruction *InsertBefore)
- : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
- ->getElementType())->getReturnType(),
- Instruction::Call,
- OperandTraits<CallInst>::op_end(this) - 1,
- 1, InsertBefore) {
+CallInst::CallInst(Value *Func, const Twine &Name, Instruction *InsertBefore)
+ : CallBase<CallInst>(
+ cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType())
+ ->getReturnType(),
+ Instruction::Call,
+ OperandTraits<CallBase<CallInst>>::op_end(this) - 1, 1,
+ InsertBefore) {
init(Func, Name);
}
-CallInst::CallInst(Value *Func, const Twine &Name,
- BasicBlock *InsertAtEnd)
- : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
- ->getElementType())->getReturnType(),
- Instruction::Call,
- OperandTraits<CallInst>::op_end(this) - 1,
- 1, InsertAtEnd) {
+CallInst::CallInst(Value *Func, const Twine &Name, BasicBlock *InsertAtEnd)
+ : CallBase<CallInst>(
+ cast<FunctionType>(
+ cast<PointerType>(Func->getType())->getElementType())
+ ->getReturnType(),
+ Instruction::Call,
+ OperandTraits<CallBase<CallInst>>::op_end(this) - 1, 1, InsertAtEnd) {
init(Func, Name);
}
CallInst::CallInst(const CallInst &CI)
- : Instruction(CI.getType(), Instruction::Call,
- OperandTraits<CallInst>::op_end(this) - CI.getNumOperands(),
- CI.getNumOperands()),
- Attrs(CI.Attrs), FTy(CI.FTy) {
+ : CallBase<CallInst>(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call,
+ OperandTraits<CallBase<CallInst>>::op_end(this) -
+ CI.getNumOperands(),
+ CI.getNumOperands()) {
setTailCallKind(CI.getTailCallKind());
setCallingConv(CI.getCallingConv());
@@ -367,125 +384,14 @@ CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB,
return NewCI;
}
-Value *CallInst::getReturnedArgOperand() const {
- unsigned Index;
-
- if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
- return getArgOperand(Index - AttributeList::FirstArgIndex);
- if (const Function *F = getCalledFunction())
- if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
- Index)
- return getArgOperand(Index - AttributeList::FirstArgIndex);
-
- return nullptr;
-}
-
-void CallInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Kind);
- setAttributes(PAL);
-}
-
-void CallInst::addAttribute(unsigned i, Attribute Attr) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Attr);
- setAttributes(PAL);
-}
-
-void CallInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
-}
-
-void CallInst::addParamAttr(unsigned ArgNo, Attribute Attr) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
- setAttributes(PAL);
-}
-
-void CallInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
-}
-
-void CallInst::removeAttribute(unsigned i, StringRef Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
-}
-void CallInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
-}
-void CallInst::removeParamAttr(unsigned ArgNo, StringRef Kind) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
-}
-void CallInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
- setAttributes(PAL);
-}
-void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
- setAttributes(PAL);
-}
-bool CallInst::hasRetAttr(Attribute::AttrKind Kind) const {
- if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
- return true;
- // Look at the callee, if available.
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
- return false;
-}
-bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
- assert(i < getNumArgOperands() && "Param index out of bounds!");
- if (Attrs.hasParamAttribute(i, Kind))
- return true;
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasParamAttribute(i, Kind);
- return false;
-}
-
-bool CallInst::dataOperandHasImpliedAttr(unsigned i,
- Attribute::AttrKind Kind) const {
- // There are getNumOperands() - 1 data operands. The last operand is the
- // callee.
- assert(i < getNumOperands() && "Data operand index out of bounds!");
-
- // The attribute A can either be directly specified, if the operand in
- // question is a call argument; or be indirectly implied by the kind of its
- // containing operand bundle, if the operand is a bundle operand.
-
- if (i == AttributeList::ReturnIndex)
- return hasRetAttr(Kind);
-
- // FIXME: Avoid these i - 1 calculations and update the API to use zero-based
- // indices.
- if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i - 1, Kind);
-
- assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
- "Must be either a call argument or an operand bundle!");
- return bundleOperandHasAttr(i - 1, Kind);
-}
/// IsConstantOne - Return true only if val is constant int 1
static bool IsConstantOne(Value *val) {
@@ -721,11 +627,10 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
}
InvokeInst::InvokeInst(const InvokeInst &II)
- : TerminatorInst(II.getType(), Instruction::Invoke,
- OperandTraits<InvokeInst>::op_end(this) -
- II.getNumOperands(),
- II.getNumOperands()),
- Attrs(II.Attrs), FTy(II.FTy) {
+ : CallBase<InvokeInst>(II.Attrs, II.FTy, II.getType(), Instruction::Invoke,
+ OperandTraits<CallBase<InvokeInst>>::op_end(this) -
+ II.getNumOperands(),
+ II.getNumOperands()) {
setCallingConv(II.getCallingConv());
std::copy(II.op_begin(), II.op_end(), op_begin());
std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(),
@@ -747,109 +652,6 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
return NewII;
}
-Value *InvokeInst::getReturnedArgOperand() const {
- unsigned Index;
-
- if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
- return getArgOperand(Index - AttributeList::FirstArgIndex);
- if (const Function *F = getCalledFunction())
- if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
- Index)
- return getArgOperand(Index - AttributeList::FirstArgIndex);
-
- return nullptr;
-}
-
-bool InvokeInst::hasRetAttr(Attribute::AttrKind Kind) const {
- if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
- return true;
-
- // Look at the callee, if available.
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
- return false;
-}
-
-bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
- assert(i < getNumArgOperands() && "Param index out of bounds!");
-
- if (Attrs.hasParamAttribute(i, Kind))
- return true;
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasParamAttribute(i, Kind);
- return false;
-}
-
-bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
- Attribute::AttrKind Kind) const {
- // There are getNumOperands() - 3 data operands. The last three operands are
- // the callee and the two successor basic blocks.
- assert(i < (getNumOperands() - 2) && "Data operand index out of bounds!");
-
- // The attribute A can either be directly specified, if the operand in
- // question is an invoke argument; or be indirectly implied by the kind of its
- // containing operand bundle, if the operand is a bundle operand.
-
- if (i == AttributeList::ReturnIndex)
- return hasRetAttr(Kind);
-
- // FIXME: Avoid these i - 1 calculations and update the API to use zero-based
- // indices.
- if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i - 1, Kind);
-
- assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
- "Must be either an invoke argument or an operand bundle!");
- return bundleOperandHasAttr(i - 1, Kind);
-}
-
-void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Kind);
- setAttributes(PAL);
-}
-
-void InvokeInst::addAttribute(unsigned i, Attribute Attr) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Attr);
- setAttributes(PAL);
-}
-
-void InvokeInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
-}
-
-void InvokeInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
-}
-
-void InvokeInst::removeAttribute(unsigned i, StringRef Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
-}
-
-void InvokeInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
-}
-
-void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
- setAttributes(PAL);
-}
-
-void InvokeInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
- setAttributes(PAL);
-}
LandingPadInst *InvokeInst::getLandingPadInst() const {
return cast<LandingPadInst>(getUnwindDest()->getFirstNonPHI());
@@ -1872,7 +1674,7 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
return false;
}
-int ShuffleVectorInst::getMaskValue(Constant *Mask, unsigned i) {
+int ShuffleVectorInst::getMaskValue(const Constant *Mask, unsigned i) {
assert(i < Mask->getType()->getVectorNumElements() && "Index out of range");
if (auto *CDS = dyn_cast<ConstantDataSequential>(Mask))
return CDS->getElementAsInteger(i);
@@ -1882,7 +1684,7 @@ int ShuffleVectorInst::getMaskValue(Constant *Mask, unsigned i) {
return cast<ConstantInt>(C)->getZExtValue();
}
-void ShuffleVectorInst::getShuffleMask(Constant *Mask,
+void ShuffleVectorInst::getShuffleMask(const Constant *Mask,
SmallVectorImpl<int> &Result) {
unsigned NumElts = Mask->getType()->getVectorNumElements();
@@ -1898,6 +1700,108 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask,
}
}
+bool ShuffleVectorInst::isSingleSourceMask(ArrayRef<int> Mask) {
+ assert(!Mask.empty() && "Shuffle mask must contain elements");
+ bool UsesLHS = false;
+ bool UsesRHS = false;
+ for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ assert(Mask[i] >= 0 && Mask[i] < (NumElts * 2) &&
+ "Out-of-bounds shuffle mask element");
+ UsesLHS |= (Mask[i] < NumElts);
+ UsesRHS |= (Mask[i] >= NumElts);
+ if (UsesLHS && UsesRHS)
+ return false;
+ }
+ assert((UsesLHS ^ UsesRHS) && "Should have selected from exactly 1 source");
+ return true;
+}
+
+bool ShuffleVectorInst::isIdentityMask(ArrayRef<int> Mask) {
+ if (!isSingleSourceMask(Mask))
+ return false;
+ for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ if (Mask[i] != i && Mask[i] != (NumElts + i))
+ return false;
+ }
+ return true;
+}
+
+bool ShuffleVectorInst::isReverseMask(ArrayRef<int> Mask) {
+ if (!isSingleSourceMask(Mask))
+ return false;
+ for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ if (Mask[i] != (NumElts - 1 - i) && Mask[i] != (NumElts + NumElts - 1 - i))
+ return false;
+ }
+ return true;
+}
+
+bool ShuffleVectorInst::isZeroEltSplatMask(ArrayRef<int> Mask) {
+ if (!isSingleSourceMask(Mask))
+ return false;
+ for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ if (Mask[i] != 0 && Mask[i] != NumElts)
+ return false;
+ }
+ return true;
+}
+
+bool ShuffleVectorInst::isSelectMask(ArrayRef<int> Mask) {
+ // Select is differentiated from identity. It requires using both sources.
+ if (isSingleSourceMask(Mask))
+ return false;
+ for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ if (Mask[i] != i && Mask[i] != (NumElts + i))
+ return false;
+ }
+ return true;
+}
+
+bool ShuffleVectorInst::isTransposeMask(ArrayRef<int> Mask) {
+ // Example masks that will return true:
+ // v1 = <a, b, c, d>
+ // v2 = <e, f, g, h>
+ // trn1 = shufflevector v1, v2 <0, 4, 2, 6> = <a, e, c, g>
+ // trn2 = shufflevector v1, v2 <1, 5, 3, 7> = <b, f, d, h>
+
+ // 1. The number of elements in the mask must be a power-of-2 and at least 2.
+ int NumElts = Mask.size();
+ if (NumElts < 2 || !isPowerOf2_32(NumElts))
+ return false;
+
+ // 2. The first element of the mask must be either a 0 or a 1.
+ if (Mask[0] != 0 && Mask[0] != 1)
+ return false;
+
+ // 3. The difference between the first 2 elements must be equal to the
+ // number of elements in the mask.
+ if ((Mask[1] - Mask[0]) != NumElts)
+ return false;
+
+ // 4. The difference between consecutive even-numbered and odd-numbered
+ // elements must be equal to 2.
+ for (int i = 2; i < NumElts; ++i) {
+ int MaskEltVal = Mask[i];
+ if (MaskEltVal == -1)
+ return false;
+ int MaskEltPrevVal = Mask[i - 2];
+ if (MaskEltVal - MaskEltPrevVal != 2)
+ return false;
+ }
+ return true;
+}
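For illustration only (not part of the committed diff), concrete masks matching the comment above:

    SmallVector<int, 4> Trn1 = {0, 4, 2, 6};  // <a, e, c, g> from two 4-element sources
    SmallVector<int, 4> Id   = {0, 1, 2, 3};  // identity of the first source
    assert(ShuffleVectorInst::isTransposeMask(Trn1));
    assert(!ShuffleVectorInst::isTransposeMask(Id));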
+
+
//===----------------------------------------------------------------------===//
// InsertValueInst Class
//===----------------------------------------------------------------------===//
@@ -2295,7 +2199,7 @@ bool CastInst::isLosslessCast() const {
/// # bitcast i32* %x to i8*
/// # bitcast <2 x i32> %x to <4 x i16>
/// # ptrtoint i32* %x to i32 ; on 32-bit plaforms only
-/// @brief Determine if the described cast is a no-op.
+/// Determine if the described cast is a no-op.
bool CastInst::isNoopCast(Instruction::CastOps Opcode,
Type *SrcTy,
Type *DestTy,
@@ -2387,7 +2291,7 @@ unsigned CastInst::isEliminableCastPair(
{ 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp
{ 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP |
{ 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc |
- { 99,99,99, 2, 2,99,99,10, 2,99,99, 4, 0}, // FPExt |
+ { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt |
{ 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt |
{ 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr |
{ 5, 5, 5, 6, 6, 5, 5, 6, 6,16, 5, 1,14}, // BitCast |
@@ -2481,12 +2385,6 @@ unsigned CastInst::isEliminableCastPair(
case 9:
// zext, sext -> zext, because sext can't sign extend after zext
return Instruction::ZExt;
- case 10:
- // fpext followed by ftrunc is allowed if the bit size returned to is
- // the same as the original, in which case its just a bitcast
- if (SrcTy == DstTy)
- return Instruction::BitCast;
- return 0; // If the types are not the same we can't eliminate it.
case 11: {
// inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
if (!MidIntPtrTy)
@@ -2669,7 +2567,7 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
return CreatePointerBitCastOrAddrSpaceCast(S, Ty, Name, InsertAtEnd);
}
-/// @brief Create a BitCast or a PtrToInt cast instruction
+/// Create a BitCast or a PtrToInt cast instruction
CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
const Twine &Name,
Instruction *InsertBefore) {
@@ -3437,6 +3335,29 @@ ICmpInst::Predicate ICmpInst::getUnsignedPredicate(Predicate pred) {
}
}
+CmpInst::Predicate CmpInst::getFlippedStrictnessPredicate(Predicate pred) {
+ switch (pred) {
+ default: llvm_unreachable("Unknown or unsupported cmp predicate!");
+ case ICMP_SGT: return ICMP_SGE;
+ case ICMP_SLT: return ICMP_SLE;
+ case ICMP_SGE: return ICMP_SGT;
+ case ICMP_SLE: return ICMP_SLT;
+ case ICMP_UGT: return ICMP_UGE;
+ case ICMP_ULT: return ICMP_ULE;
+ case ICMP_UGE: return ICMP_UGT;
+ case ICMP_ULE: return ICMP_ULT;
+
+ case FCMP_OGT: return FCMP_OGE;
+ case FCMP_OLT: return FCMP_OLE;
+ case FCMP_OGE: return FCMP_OGT;
+ case FCMP_OLE: return FCMP_OLT;
+ case FCMP_UGT: return FCMP_UGE;
+ case FCMP_ULT: return FCMP_ULE;
+ case FCMP_UGE: return FCMP_UGT;
+ case FCMP_ULE: return FCMP_ULT;
+ }
+}
+
CmpInst::Predicate CmpInst::getSwappedPredicate(Predicate pred) {
switch (pred) {
default: llvm_unreachable("Unknown cmp predicate!");
@@ -3467,6 +3388,20 @@ CmpInst::Predicate CmpInst::getSwappedPredicate(Predicate pred) {
}
}
+CmpInst::Predicate CmpInst::getNonStrictPredicate(Predicate pred) {
+ switch (pred) {
+ case ICMP_SGT: return ICMP_SGE;
+ case ICMP_SLT: return ICMP_SLE;
+ case ICMP_UGT: return ICMP_UGE;
+ case ICMP_ULT: return ICMP_ULE;
+ case FCMP_OGT: return FCMP_OGE;
+ case FCMP_OLT: return FCMP_OLE;
+ case FCMP_UGT: return FCMP_UGE;
+ case FCMP_ULT: return FCMP_ULE;
+ default: return pred;
+ }
+}
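For illustration only (not part of the committed diff), quick sanity checks on the two new predicate helpers:

    assert(CmpInst::getNonStrictPredicate(CmpInst::ICMP_SGT) == CmpInst::ICMP_SGE);
    assert(CmpInst::getNonStrictPredicate(CmpInst::FCMP_OGE) == CmpInst::FCMP_OGE); // already non-strict
    assert(CmpInst::getFlippedStrictnessPredicate(CmpInst::ICMP_ULE) == CmpInst::ICMP_ULT);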
+
CmpInst::Predicate CmpInst::getSignedPredicate(Predicate pred) {
assert(CmpInst::isUnsigned(pred) && "Call only with signed predicates!");
diff --git a/contrib/llvm/lib/IR/IntrinsicInst.cpp b/contrib/llvm/lib/IR/IntrinsicInst.cpp
index 67bd5b69bb0f..787889934d82 100644
--- a/contrib/llvm/lib/IR/IntrinsicInst.cpp
+++ b/contrib/llvm/lib/IR/IntrinsicInst.cpp
@@ -24,6 +24,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
@@ -44,10 +45,19 @@ Value *DbgInfoIntrinsic::getVariableLocation(bool AllowNullOp) const {
return V->getValue();
// When the value goes to null, it gets replaced by an empty MDNode.
- assert(!cast<MDNode>(MD)->getNumOperands() && "Expected an empty MDNode");
+ assert((isa<DbgLabelInst>(this)
+ || !cast<MDNode>(MD)->getNumOperands())
+ && "DbgValueInst Expected an empty MDNode");
+
return nullptr;
}
+Optional<uint64_t> DbgInfoIntrinsic::getFragmentSizeInBits() const {
+ if (auto Fragment = getExpression()->getFragmentInfo())
+ return Fragment->SizeInBits;
+ return getVariable()->getSizeInBits();
+}
+
int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
StringRef Name) {
assert(Name.startswith("llvm."));
diff --git a/contrib/llvm/lib/IR/LLVMContext.cpp b/contrib/llvm/lib/IR/LLVMContext.cpp
index c8b7c10a9a41..62d9e387162e 100644
--- a/contrib/llvm/lib/IR/LLVMContext.cpp
+++ b/contrib/llvm/lib/IR/LLVMContext.cpp
@@ -332,8 +332,12 @@ void LLVMContext::setDiscardValueNames(bool Discard) {
pImpl->DiscardValueNames = Discard;
}
-OptBisect &LLVMContext::getOptBisect() {
- return pImpl->getOptBisect();
+OptPassGate &LLVMContext::getOptPassGate() const {
+ return pImpl->getOptPassGate();
+}
+
+void LLVMContext::setOptPassGate(OptPassGate& OPG) {
+ pImpl->setOptPassGate(OPG);
}
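For illustration only (not part of the committed diff), a hedged sketch of how an embedder might install its own gate; MyOptPassGate is a hypothetical OptPassGate subclass whose interface is not shown in this patch:

    static MyOptPassGate MyGate;      // hypothetical subclass; must outlive users of Context
    Context.setOptPassGate(MyGate);
    OptPassGate &Gate = Context.getOptPassGate();  // returns MyGate from now on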
const DiagnosticHandler *LLVMContext::getDiagHandlerPtr() const {
diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.cpp b/contrib/llvm/lib/IR/LLVMContextImpl.cpp
index 4b44a6b69cad..3c34ca55c224 100644
--- a/contrib/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/contrib/llvm/lib/IR/LLVMContextImpl.cpp
@@ -48,6 +48,14 @@ LLVMContextImpl::~LLVMContextImpl() {
while (!OwnedModules.empty())
delete *OwnedModules.begin();
+#ifndef NDEBUG
+ // Check for metadata references from leaked Instructions.
+ for (auto &Pair : InstructionMetadata)
+ Pair.first->dump();
+ assert(InstructionMetadata.empty() &&
+ "Instructions with metadata have been leaked");
+#endif
+
// Drop references for MDNodes. Do this before Values get deleted to avoid
// unnecessary RAUW when nodes are still unresolved.
for (auto *I : DistinctMDNodes)
@@ -155,7 +163,7 @@ void Module::dropTriviallyDeadConstantArrays() {
namespace llvm {
-/// \brief Make MDOperand transparent for hashing.
+/// Make MDOperand transparent for hashing.
///
/// This overload of an implementation detail of the hashing library makes
/// MDOperand hash to the same value as a \a Metadata pointer.
@@ -222,8 +230,8 @@ void LLVMContextImpl::getSyncScopeNames(
/// Singleton instance of the OptBisect class.
///
-/// This singleton is accessed via the LLVMContext::getOptBisect() function. It
-/// provides a mechanism to disable passes and individual optimizations at
+/// This singleton is accessed via the LLVMContext::getOptPassGate() function.
+/// It provides a mechanism to disable passes and individual optimizations at
/// compile time based on a command line option (-opt-bisect-limit) in order to
/// perform a bisecting search for optimization-related problems.
///
@@ -233,6 +241,12 @@ void LLVMContextImpl::getSyncScopeNames(
/// enabled in order to enable a consistent bisect count.
static ManagedStatic<OptBisect> OptBisector;
-OptBisect &LLVMContextImpl::getOptBisect() {
- return *OptBisector;
+OptPassGate &LLVMContextImpl::getOptPassGate() const {
+ if (!OPG)
+ OPG = &(*OptBisector);
+ return *OPG;
+}
+
+void LLVMContextImpl::setOptPassGate(OptPassGate& OPG) {
+ this->OPG = &OPG;
}
diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.h b/contrib/llvm/lib/IR/LLVMContextImpl.h
index f41acfa8ea9c..d5046d644187 100644
--- a/contrib/llvm/lib/IR/LLVMContextImpl.h
+++ b/contrib/llvm/lib/IR/LLVMContextImpl.h
@@ -202,7 +202,7 @@ struct FunctionTypeKeyInfo {
}
};
-/// \brief Structure for hashing arbitrary MDNode operands.
+/// Structure for hashing arbitrary MDNode operands.
class MDNodeOpsKey {
ArrayRef<Metadata *> RawOps;
ArrayRef<MDOperand> Ops;
@@ -257,7 +257,7 @@ template <class NodeTy> struct MDNodeSubsetEqualImpl {
}
};
-/// \brief DenseMapInfo for MDTuple.
+/// DenseMapInfo for MDTuple.
///
/// Note that we don't need the is-function-local bit, since that's implicit in
/// the operands.
@@ -274,7 +274,7 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey {
}
};
-/// \brief DenseMapInfo for DILocation.
+/// DenseMapInfo for DILocation.
template <> struct MDNodeKeyImpl<DILocation> {
unsigned Line;
unsigned Column;
@@ -298,7 +298,7 @@ template <> struct MDNodeKeyImpl<DILocation> {
}
};
-/// \brief DenseMapInfo for GenericDINode.
+/// DenseMapInfo for GenericDINode.
template <> struct MDNodeKeyImpl<GenericDINode> : MDNodeOpsKey {
unsigned Tag;
MDString *Header;
@@ -321,31 +321,50 @@ template <> struct MDNodeKeyImpl<GenericDINode> : MDNodeOpsKey {
};
template <> struct MDNodeKeyImpl<DISubrange> {
- int64_t Count;
+ Metadata *CountNode;
int64_t LowerBound;
- MDNodeKeyImpl(int64_t Count, int64_t LowerBound)
- : Count(Count), LowerBound(LowerBound) {}
+ MDNodeKeyImpl(Metadata *CountNode, int64_t LowerBound)
+ : CountNode(CountNode), LowerBound(LowerBound) {}
MDNodeKeyImpl(const DISubrange *N)
- : Count(N->getCount()), LowerBound(N->getLowerBound()) {}
+ : CountNode(N->getRawCountNode()),
+ LowerBound(N->getLowerBound()) {}
bool isKeyOf(const DISubrange *RHS) const {
- return Count == RHS->getCount() && LowerBound == RHS->getLowerBound();
+ if (LowerBound != RHS->getLowerBound())
+ return false;
+
+ if (auto *RHSCount = RHS->getCount().dyn_cast<ConstantInt*>())
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode))
+ if (RHSCount->getSExtValue() ==
+ cast<ConstantInt>(MD->getValue())->getSExtValue())
+ return true;
+
+ return CountNode == RHS->getRawCountNode();
}
- unsigned getHashValue() const { return hash_combine(Count, LowerBound); }
+ unsigned getHashValue() const {
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode))
+ return hash_combine(cast<ConstantInt>(MD->getValue())->getSExtValue(),
+ LowerBound);
+ return hash_combine(CountNode, LowerBound);
+ }
};
template <> struct MDNodeKeyImpl<DIEnumerator> {
int64_t Value;
MDString *Name;
+ bool IsUnsigned;
- MDNodeKeyImpl(int64_t Value, MDString *Name) : Value(Value), Name(Name) {}
+ MDNodeKeyImpl(int64_t Value, bool IsUnsigned, MDString *Name)
+ : Value(Value), Name(Name), IsUnsigned(IsUnsigned) {}
MDNodeKeyImpl(const DIEnumerator *N)
- : Value(N->getValue()), Name(N->getRawName()) {}
+ : Value(N->getValue()), Name(N->getRawName()),
+ IsUnsigned(N->isUnsigned()) {}
bool isKeyOf(const DIEnumerator *RHS) const {
- return Value == RHS->getValue() && Name == RHS->getRawName();
+ return Value == RHS->getValue() && IsUnsigned == RHS->isUnsigned() &&
+ Name == RHS->getRawName();
}
unsigned getHashValue() const { return hash_combine(Value, Name); }
@@ -484,18 +503,20 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
Metadata *VTableHolder;
Metadata *TemplateParams;
MDString *Identifier;
+ Metadata *Discriminator;
MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
uint32_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
Metadata *Elements, unsigned RuntimeLang,
Metadata *VTableHolder, Metadata *TemplateParams,
- MDString *Identifier)
+ MDString *Identifier, Metadata *Discriminator)
: Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
AlignInBits(AlignInBits), Flags(Flags), Elements(Elements),
RuntimeLang(RuntimeLang), VTableHolder(VTableHolder),
- TemplateParams(TemplateParams), Identifier(Identifier) {}
+ TemplateParams(TemplateParams), Identifier(Identifier),
+ Discriminator(Discriminator) {}
MDNodeKeyImpl(const DICompositeType *N)
: Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
@@ -504,7 +525,8 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
Flags(N->getFlags()), Elements(N->getRawElements()),
RuntimeLang(N->getRuntimeLang()), VTableHolder(N->getRawVTableHolder()),
TemplateParams(N->getRawTemplateParams()),
- Identifier(N->getRawIdentifier()) {}
+ Identifier(N->getRawIdentifier()),
+ Discriminator(N->getRawDiscriminator()) {}
bool isKeyOf(const DICompositeType *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -517,7 +539,8 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
RuntimeLang == RHS->getRuntimeLang() &&
VTableHolder == RHS->getRawVTableHolder() &&
TemplateParams == RHS->getRawTemplateParams() &&
- Identifier == RHS->getRawIdentifier();
+ Identifier == RHS->getRawIdentifier() &&
+ Discriminator == RHS->getRawDiscriminator();
}
unsigned getHashValue() const {
@@ -551,26 +574,29 @@ template <> struct MDNodeKeyImpl<DISubroutineType> {
template <> struct MDNodeKeyImpl<DIFile> {
MDString *Filename;
MDString *Directory;
- DIFile::ChecksumKind CSKind;
- MDString *Checksum;
+ Optional<DIFile::ChecksumInfo<MDString *>> Checksum;
+ Optional<MDString *> Source;
MDNodeKeyImpl(MDString *Filename, MDString *Directory,
- DIFile::ChecksumKind CSKind, MDString *Checksum)
- : Filename(Filename), Directory(Directory), CSKind(CSKind),
- Checksum(Checksum) {}
+ Optional<DIFile::ChecksumInfo<MDString *>> Checksum,
+ Optional<MDString *> Source)
+ : Filename(Filename), Directory(Directory), Checksum(Checksum),
+ Source(Source) {}
MDNodeKeyImpl(const DIFile *N)
: Filename(N->getRawFilename()), Directory(N->getRawDirectory()),
- CSKind(N->getChecksumKind()), Checksum(N->getRawChecksum()) {}
+ Checksum(N->getRawChecksum()), Source(N->getRawSource()) {}
bool isKeyOf(const DIFile *RHS) const {
return Filename == RHS->getRawFilename() &&
Directory == RHS->getRawDirectory() &&
- CSKind == RHS->getChecksumKind() &&
- Checksum == RHS->getRawChecksum();
+ Checksum == RHS->getRawChecksum() &&
+ Source == RHS->getRawSource();
}
unsigned getHashValue() const {
- return hash_combine(Filename, Directory, CSKind, Checksum);
+ return hash_combine(
+ Filename, Directory, Checksum ? Checksum->Kind : 0,
+ Checksum ? Checksum->Value : nullptr, Source.getValueOr(nullptr));
}
};
@@ -593,7 +619,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Metadata *Unit;
Metadata *TemplateParams;
Metadata *Declaration;
- Metadata *Variables;
+ Metadata *RetainedNodes;
Metadata *ThrownTypes;
MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName,
@@ -602,7 +628,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Metadata *ContainingType, unsigned Virtuality,
unsigned VirtualIndex, int ThisAdjustment, unsigned Flags,
bool IsOptimized, Metadata *Unit, Metadata *TemplateParams,
- Metadata *Declaration, Metadata *Variables,
+ Metadata *Declaration, Metadata *RetainedNodes,
Metadata *ThrownTypes)
: Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit),
@@ -611,7 +637,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
VirtualIndex(VirtualIndex), ThisAdjustment(ThisAdjustment),
Flags(Flags), IsOptimized(IsOptimized), Unit(Unit),
TemplateParams(TemplateParams), Declaration(Declaration),
- Variables(Variables), ThrownTypes(ThrownTypes) {}
+ RetainedNodes(RetainedNodes), ThrownTypes(ThrownTypes) {}
MDNodeKeyImpl(const DISubprogram *N)
: Scope(N->getRawScope()), Name(N->getRawName()),
LinkageName(N->getRawLinkageName()), File(N->getRawFile()),
@@ -622,7 +648,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
ThisAdjustment(N->getThisAdjustment()), Flags(N->getFlags()),
IsOptimized(N->isOptimized()), Unit(N->getRawUnit()),
TemplateParams(N->getRawTemplateParams()),
- Declaration(N->getRawDeclaration()), Variables(N->getRawVariables()),
+ Declaration(N->getRawDeclaration()), RetainedNodes(N->getRawRetainedNodes()),
ThrownTypes(N->getRawThrownTypes()) {}
bool isKeyOf(const DISubprogram *RHS) const {
@@ -640,7 +666,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Unit == RHS->getUnit() &&
TemplateParams == RHS->getRawTemplateParams() &&
Declaration == RHS->getRawDeclaration() &&
- Variables == RHS->getRawVariables() &&
+ RetainedNodes == RHS->getRawRetainedNodes() &&
ThrownTypes == RHS->getRawThrownTypes();
}
@@ -922,6 +948,29 @@ template <> struct MDNodeKeyImpl<DILocalVariable> {
}
};
+template <> struct MDNodeKeyImpl<DILabel> {
+ Metadata *Scope;
+ MDString *Name;
+ Metadata *File;
+ unsigned Line;
+
+ MDNodeKeyImpl(Metadata *Scope, MDString *Name, Metadata *File, unsigned Line)
+ : Scope(Scope), Name(Name), File(File), Line(Line) {}
+ MDNodeKeyImpl(const DILabel *N)
+ : Scope(N->getRawScope()), Name(N->getRawName()), File(N->getRawFile()),
+ Line(N->getLine()) {}
+
+ bool isKeyOf(const DILabel *RHS) const {
+ return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
+ File == RHS->getRawFile() && Line == RHS->getLine();
+ }
+
+ /// Using name and line to get hash value. It should already be mostly unique.
+ unsigned getHashValue() const {
+ return hash_combine(Scope, Name, Line);
+ }
+};
+
template <> struct MDNodeKeyImpl<DIExpression> {
ArrayRef<uint64_t> Elements;
@@ -1058,7 +1107,7 @@ template <> struct MDNodeKeyImpl<DIMacroFile> {
}
};
-/// \brief DenseMapInfo for MDNode subclasses.
+/// DenseMapInfo for MDNode subclasses.
template <class NodeTy> struct MDNodeInfo {
using KeyTy = MDNodeKeyImpl<NodeTy>;
using SubsetEqualTy = MDNodeSubsetEqualImpl<NodeTy>;
@@ -1095,7 +1144,7 @@ template <class NodeTy> struct MDNodeInfo {
#define HANDLE_MDNODE_LEAF(CLASS) using CLASS##Info = MDNodeInfo<CLASS>;
#include "llvm/IR/Metadata.def"
-/// \brief Map-like storage for metadata attachments.
+/// Map-like storage for metadata attachments.
class MDAttachmentMap {
SmallVector<std::pair<unsigned, TrackingMDNodeRef>, 2> Attachments;
@@ -1103,27 +1152,27 @@ public:
bool empty() const { return Attachments.empty(); }
size_t size() const { return Attachments.size(); }
- /// \brief Get a particular attachment (if any).
+ /// Get a particular attachment (if any).
MDNode *lookup(unsigned ID) const;
- /// \brief Set an attachment to a particular node.
+ /// Set an attachment to a particular node.
///
/// Set the \c ID attachment to \c MD, replacing the current attachment at \c
/// ID (if anyway).
void set(unsigned ID, MDNode &MD);
- /// \brief Remove an attachment.
+ /// Remove an attachment.
///
/// Remove the attachment at \c ID, if any.
- void erase(unsigned ID);
+ bool erase(unsigned ID);
- /// \brief Copy out all the attachments.
+ /// Copy out all the attachments.
///
/// Copies all the current attachments into \c Result, sorting by attachment
/// ID. This function does \em not clear \c Result.
void getAll(SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const;
- /// \brief Erase matching attachments.
+ /// Erase matching attachments.
///
/// Erases all attachments matching the \c shouldRemove predicate.
template <class PredTy> void remove_if(PredTy shouldRemove) {
@@ -1148,10 +1197,14 @@ public:
/// Appends all attachments with the given ID to \c Result in insertion order.
/// If the global has no attachments with the given ID, or if ID is invalid,
/// leaves Result unchanged.
- void get(unsigned ID, SmallVectorImpl<MDNode *> &Result);
+ void get(unsigned ID, SmallVectorImpl<MDNode *> &Result) const;
+
+ /// Returns the first attachment with the given ID or nullptr if no such
+ /// attachment exists.
+ MDNode *lookup(unsigned ID) const;
void insert(unsigned ID, MDNode &MD);
- void erase(unsigned ID);
+ bool erase(unsigned ID);
/// Appends all attachments for the global to \c Result, sorting by attachment
/// ID. Attachments with the same ID appear in insertion order. This function
@@ -1288,7 +1341,7 @@ public:
int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx);
int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx);
- /// \brief A set of interned tags for operand bundles. The StringMap maps
+ /// A set of interned tags for operand bundles. The StringMap maps
/// bundle tags to their IDs.
///
/// \see LLVMContext::getOperandBundleTagID
@@ -1329,9 +1382,18 @@ public:
/// Destroy the ConstantArrays if they are not used.
void dropTriviallyDeadConstantArrays();
- /// \brief Access the object which manages optimization bisection for failure
- /// analysis.
- OptBisect &getOptBisect();
+ mutable OptPassGate *OPG = nullptr;
+
+ /// Access the object which can disable optional passes and individual
+ /// optimizations at compile time.
+ OptPassGate &getOptPassGate() const;
+
+ /// Set the object which can disable optional passes and individual
+ /// optimizations at compile time.
+ ///
+ /// The lifetime of the object must be guaranteed to extend as long as the
+ /// LLVMContext is used by compilation.
+ void setOptPassGate(OptPassGate&);
};
} // end namespace llvm
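A minimal sketch of how a client might use the new hook; the NoLoopPasses gate and its name filter are hypothetical, and OptPassGate is assumed to be declared alongside OptBisect in llvm/IR/OptBisect.h in this release:

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/OptBisect.h"
    #include "llvm/Pass.h"
    using namespace llvm;

    // Hypothetical gate: skip any optional pass run on a Function whose pass
    // name mentions "Loop"; every other unit keeps the default (run the pass).
    struct NoLoopPasses : public OptPassGate {
      bool shouldRunPass(const Pass *P, const Function &) override {
        return !P->getPassName().contains("Loop");
      }
    };

    // Per the comment above, the gate must outlive the context that uses it.
    static NoLoopPasses Gate;

    static void installGate(LLVMContext &Ctx) {
      Ctx.setOptPassGate(Gate);
    }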
diff --git a/contrib/llvm/lib/IR/LegacyPassManager.cpp b/contrib/llvm/lib/IR/LegacyPassManager.cpp
index 8bd9ed6ef0fa..54d602d926e5 100644
--- a/contrib/llvm/lib/IR/LegacyPassManager.cpp
+++ b/contrib/llvm/lib/IR/LegacyPassManager.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManagers.h"
@@ -28,7 +30,6 @@
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
-#include <map>
#include <unordered_set>
using namespace llvm;
using namespace llvm::legacy;
@@ -86,7 +87,7 @@ static cl::opt<bool>
PrintModuleScope("print-module-scope",
cl::desc("When printing IR for print-[before|after]{-all} "
"always print a module IR"),
- cl::init(false));
+ cl::init(false), cl::Hidden);
static cl::list<std::string>
PrintFuncsList("filter-print-funcs", cl::value_desc("function names"),
@@ -134,8 +135,60 @@ bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
return PassDebugging >= Executions;
}
+unsigned PMDataManager::initSizeRemarkInfo(Module &M) {
+ // Only calculate getInstructionCount if the size-info remark is requested.
+ return M.getInstructionCount();
+}
+
+void PMDataManager::emitInstrCountChangedRemark(Pass *P, Module &M,
+ unsigned CountBefore) {
+ // We need a function containing at least one basic block in order to output
+ // remarks. Since it's possible that the first function in the module doesn't
+ // actually contain a basic block, we have to go and find one that's suitable
+ // for emitting remarks.
+ auto It = std::find_if(M.begin(), M.end(),
+ [](const Function &Fn) { return !Fn.empty(); });
+
+ // Didn't find a function. Quit.
+ if (It == M.end())
+ return;
+
+ // We found a function containing at least one basic block.
+ Function *F = &*It;
+
+ // How many instructions are in the module now?
+ unsigned CountAfter = M.getInstructionCount();
+
+ // If there was no change, don't emit a remark.
+ if (CountBefore == CountAfter)
+ return;
+ // If it's a pass manager, don't emit a remark. (This hinges on the assumption
+ // that the only passes that return non-null with getAsPMDataManager are pass
+ // managers.) The reason we have to do this is to avoid emitting remarks for
+ // CGSCC passes.
+ if (P->getAsPMDataManager())
+ return;
+ // Compute a possibly negative delta between the instruction count before
+ // running P, and after running P.
+ int64_t Delta =
+ static_cast<int64_t>(CountAfter) - static_cast<int64_t>(CountBefore);
+
+ BasicBlock &BB = *F->begin();
+ OptimizationRemarkAnalysis R("size-info", "IRSizeChange",
+ DiagnosticLocation(), &BB);
+ // FIXME: Move ore namespace to DiagnosticInfo so that we can use it. This
+ // would let us use NV instead of DiagnosticInfoOptimizationBase::Argument.
+ R << DiagnosticInfoOptimizationBase::Argument("Pass", P->getPassName())
+ << ": IR instruction count changed from "
+ << DiagnosticInfoOptimizationBase::Argument("IRInstrsBefore", CountBefore)
+ << " to "
+ << DiagnosticInfoOptimizationBase::Argument("IRInstrsAfter", CountAfter)
+ << "; Delta: "
+ << DiagnosticInfoOptimizationBase::Argument("DeltaInstrCount", Delta);
+ F->getContext().diagnose(R); // Not using ORE for layering reasons.
+}
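A hedged sketch of what this helper measures, written directly against the new Module::getInstructionCount(); the Transform callback is hypothetical. When the size-info analysis remark is enabled, for instance with opt's -pass-remarks-analysis=size-info, each pass that changes the count emits one IRSizeChange remark carrying these two values and their delta.

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Run an arbitrary transformation and report the signed change in the
    // module's IR instruction count, mirroring the bookkeeping above.
    static int64_t instrCountDelta(Module &M,
                                   function_ref<void(Module &)> Transform) {
      unsigned Before = M.getInstructionCount(); // added to Module in this patch
      Transform(M);
      unsigned After = M.getInstructionCount();
      return static_cast<int64_t>(After) - static_cast<int64_t>(Before);
    }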
void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
if (!V && !M)
@@ -355,8 +408,8 @@ public:
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
ModulePass *MP = getContainedPass(Index);
MP->dumpPassStructure(Offset + 1);
- std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
- OnTheFlyManagers.find(MP);
+ MapVector<Pass *, FunctionPassManagerImpl *>::const_iterator I =
+ OnTheFlyManagers.find(MP);
if (I != OnTheFlyManagers.end())
I->second->dumpPassStructure(Offset + 2);
dumpLastUses(MP, Offset+1);
@@ -375,7 +428,7 @@ public:
private:
/// Collection of on the fly FPPassManagers. These managers manage
/// function passes that are required by module passes.
- std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
+ MapVector<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
};
char MPPassManager::ID = 0;
@@ -486,7 +539,11 @@ public:
Timer *&T = TimingData[P];
if (!T) {
StringRef PassName = P->getPassName();
- T = new Timer(PassName, PassName, TG);
+ StringRef PassArgument;
+ if (const PassInfo *PI = Pass::lookupPassInfo(P->getPassID()))
+ PassArgument = PI->getPassArgument();
+ T = new Timer(PassArgument.empty() ? PassName : PassArgument, PassName,
+ TG);
}
return T;
}
@@ -585,7 +642,7 @@ AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
// of dependencies.
AnalysisUsage AU;
P->getAnalysisUsage(AU);
-
+
AUFoldingSetNode* Node = nullptr;
FoldingSetNodeID ID;
AUFoldingSetNode::Profile(ID, AU);
@@ -1284,7 +1341,10 @@ bool BBPassManager::runOnFunction(Function &F) {
return false;
bool Changed = doInitialization(F);
+ Module &M = *F.getParent();
+ unsigned InstrCount = 0;
+ bool EmitICRemark = M.shouldEmitInstrCountChangedRemark();
for (BasicBlock &BB : F)
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
BasicBlockPass *BP = getContainedPass(Index);
@@ -1299,8 +1359,11 @@ bool BBPassManager::runOnFunction(Function &F) {
// If the pass crashes, remember this.
PassManagerPrettyStackEntry X(BP, BB);
TimeRegion PassTimer(getPassTimer(BP));
-
+ if (EmitICRemark)
+ InstrCount = initSizeRemarkInfo(M);
LocalChanged |= BP->runOnBasicBlock(BB);
+ if (EmitICRemark)
+ emitInstrCountChangedRemark(BP, M, InstrCount);
}
Changed |= LocalChanged;
@@ -1500,10 +1563,12 @@ bool FPPassManager::runOnFunction(Function &F) {
return false;
bool Changed = false;
-
+ Module &M = *F.getParent();
// Collect inherited analysis from Module level pass manager.
populateInheritedAnalysis(TPM->activeStack);
+ unsigned InstrCount = 0;
+ bool EmitICRemark = M.shouldEmitInstrCountChangedRemark();
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
FunctionPass *FP = getContainedPass(Index);
bool LocalChanged = false;
@@ -1516,8 +1581,11 @@ bool FPPassManager::runOnFunction(Function &F) {
{
PassManagerPrettyStackEntry X(FP, F);
TimeRegion PassTimer(getPassTimer(FP));
-
+ if (EmitICRemark)
+ InstrCount = initSizeRemarkInfo(M);
LocalChanged |= FP->runOnFunction(F);
+ if (EmitICRemark)
+ emitInstrCountChangedRemark(FP, M, InstrCount);
}
Changed |= LocalChanged;
@@ -1581,6 +1649,8 @@ MPPassManager::runOnModule(Module &M) {
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
Changed |= getContainedPass(Index)->doInitialization(M);
+ unsigned InstrCount = 0;
+ bool EmitICRemark = M.shouldEmitInstrCountChangedRemark();
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
ModulePass *MP = getContainedPass(Index);
bool LocalChanged = false;
@@ -1594,7 +1664,11 @@ MPPassManager::runOnModule(Module &M) {
PassManagerPrettyStackEntry X(MP, M);
TimeRegion PassTimer(getPassTimer(MP));
+ if (EmitICRemark)
+ InstrCount = initSizeRemarkInfo(M);
LocalChanged |= MP->runOnModule(M);
+ if (EmitICRemark)
+ emitInstrCountChangedRemark(MP, M, InstrCount);
}
Changed |= LocalChanged;
diff --git a/contrib/llvm/lib/IR/MDBuilder.cpp b/contrib/llvm/lib/IR/MDBuilder.cpp
index 6d77a8f2d601..1bb23c0330f3 100644
--- a/contrib/llvm/lib/IR/MDBuilder.cpp
+++ b/contrib/llvm/lib/IR/MDBuilder.cpp
@@ -58,10 +58,14 @@ MDNode *MDBuilder::createUnpredictable() {
}
MDNode *MDBuilder::createFunctionEntryCount(
- uint64_t Count, const DenseSet<GlobalValue::GUID> *Imports) {
+ uint64_t Count, bool Synthetic,
+ const DenseSet<GlobalValue::GUID> *Imports) {
Type *Int64Ty = Type::getInt64Ty(Context);
SmallVector<Metadata *, 8> Ops;
- Ops.push_back(createString("function_entry_count"));
+ if (Synthetic)
+ Ops.push_back(createString("synthetic_function_entry_count"));
+ else
+ Ops.push_back(createString("function_entry_count"));
Ops.push_back(createConstant(ConstantInt::get(Int64Ty, Count)));
if (Imports) {
SmallVector<GlobalValue::GUID, 2> OrderID(Imports->begin(), Imports->end());
@@ -129,7 +133,7 @@ MDNode *MDBuilder::createTBAARoot(StringRef Name) {
return MDNode::get(Context, createString(Name));
}
-/// \brief Return metadata for a non-root TBAA node with the given name,
+/// Return metadata for a non-root TBAA node with the given name,
/// parent in the TBAA tree, and value for 'pointsToConstantMemory'.
MDNode *MDBuilder::createTBAANode(StringRef Name, MDNode *Parent,
bool isConstant) {
@@ -149,7 +153,7 @@ MDNode *MDBuilder::createAliasScope(StringRef Name, MDNode *Domain) {
return MDNode::get(Context, {createString(Name), Domain});
}
-/// \brief Return metadata for a tbaa.struct node with the given
+/// Return metadata for a tbaa.struct node with the given
/// struct field descriptions.
MDNode *MDBuilder::createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {
SmallVector<Metadata *, 4> Vals(Fields.size() * 3);
@@ -162,7 +166,7 @@ MDNode *MDBuilder::createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {
return MDNode::get(Context, Vals);
}
-/// \brief Return metadata for a TBAA struct node in the type DAG
+/// Return metadata for a TBAA struct node in the type DAG
/// with the given name, a list of pairs (offset, field type in the type DAG).
MDNode *MDBuilder::createTBAAStructTypeNode(
StringRef Name, ArrayRef<std::pair<MDNode *, uint64_t>> Fields) {
@@ -176,7 +180,7 @@ MDNode *MDBuilder::createTBAAStructTypeNode(
return MDNode::get(Context, Ops);
}
-/// \brief Return metadata for a TBAA scalar type node with the
+/// Return metadata for a TBAA scalar type node with the
/// given name, an offset and a parent in the TBAA type DAG.
MDNode *MDBuilder::createTBAAScalarTypeNode(StringRef Name, MDNode *Parent,
uint64_t Offset) {
@@ -185,7 +189,7 @@ MDNode *MDBuilder::createTBAAScalarTypeNode(StringRef Name, MDNode *Parent,
{createString(Name), Parent, createConstant(Off)});
}
-/// \brief Return metadata for a TBAA tag node with the given
+/// Return metadata for a TBAA tag node with the given
/// base type, access type and offset relative to the base type.
MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
uint64_t Offset, bool IsConstant) {
@@ -228,6 +232,33 @@ MDNode *MDBuilder::createTBAAAccessTag(MDNode *BaseType, MDNode *AccessType,
return MDNode::get(Context, {BaseType, AccessType, OffsetNode, SizeNode});
}
+MDNode *MDBuilder::createMutableTBAAAccessTag(MDNode *Tag) {
+ MDNode *BaseType = cast<MDNode>(Tag->getOperand(0));
+ MDNode *AccessType = cast<MDNode>(Tag->getOperand(1));
+ Metadata *OffsetNode = Tag->getOperand(2);
+ uint64_t Offset = mdconst::extract<ConstantInt>(OffsetNode)->getZExtValue();
+
+ bool NewFormat = isa<MDNode>(AccessType->getOperand(0));
+
+ // See if the tag is already mutable.
+ unsigned ImmutabilityFlagOp = NewFormat ? 4 : 3;
+ if (Tag->getNumOperands() <= ImmutabilityFlagOp)
+ return Tag;
+
+ // If Tag is already mutable then return it.
+ Metadata *ImmutabilityFlagNode = Tag->getOperand(ImmutabilityFlagOp);
+ if (!mdconst::extract<ConstantInt>(ImmutabilityFlagNode)->getValue())
+ return Tag;
+
+ // Otherwise, create another node.
+ if (!NewFormat)
+ return createTBAAStructTagNode(BaseType, AccessType, Offset);
+
+ Metadata *SizeNode = Tag->getOperand(3);
+ uint64_t Size = mdconst::extract<ConstantInt>(SizeNode)->getZExtValue();
+ return createTBAAAccessTag(BaseType, AccessType, Offset, Size);
+}
+
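A usage sketch under stated assumptions (the helper and both instructions are hypothetical, not part of this change): a pass that copies a !tbaa tag but drops its constant-memory bit before re-attaching it.

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // Clone one instruction's TBAA access tag as a mutable tag on another.
    static void copyTBAAAsMutable(Instruction &From, Instruction &To) {
      if (MDNode *Tag = From.getMetadata(LLVMContext::MD_tbaa)) {
        MDBuilder MDB(From.getContext());
        To.setMetadata(LLVMContext::MD_tbaa,
                       MDB.createMutableTBAAAccessTag(Tag));
      }
    }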
MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) {
SmallVector<Metadata *, 2> Vals(2);
Vals[0] = createString("loop_header_weight");
diff --git a/contrib/llvm/lib/IR/Mangler.cpp b/contrib/llvm/lib/IR/Mangler.cpp
index 03723bfd2ddb..be3086cfcf05 100644
--- a/contrib/llvm/lib/IR/Mangler.cpp
+++ b/contrib/llvm/lib/IR/Mangler.cpp
@@ -44,6 +44,9 @@ static void getNameWithPrefixImpl(raw_ostream &OS, const Twine &GVName,
return;
}
+ if (DL.doNotMangleLeadingQuestionMark() && Name[0] == '?')
+ Prefix = '\0';
+
if (PrefixTy == Private)
OS << DL.getPrivateGlobalPrefix();
else if (PrefixTy == LinkerPrivate)
@@ -135,8 +138,13 @@ void Mangler::getNameWithPrefix(raw_ostream &OS, const GlobalValue *GV,
// Mangle functions with Microsoft calling conventions specially. Only do
// this mangling for x86_64 vectorcall and 32-bit x86.
const Function *MSFunc = dyn_cast<Function>(GV);
- if (Name.startswith("\01"))
- MSFunc = nullptr; // Don't mangle when \01 is present.
+
+ // Don't add byte count suffixes when '\01' or '?' are in the first
+ // character.
+ if (Name.startswith("\01") ||
+ (DL.doNotMangleLeadingQuestionMark() && Name.startswith("?")))
+ MSFunc = nullptr;
+
CallingConv::ID CC =
MSFunc ? MSFunc->getCallingConv() : (unsigned)CallingConv::C;
if (!DL.hasMicrosoftFastStdCallMangling() &&
@@ -204,3 +212,13 @@ void llvm::emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV,
OS << ",data";
}
}
+
+void llvm::emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV,
+ const Triple &T, Mangler &M) {
+ if (!T.isKnownWindowsMSVCEnvironment())
+ return;
+
+ OS << " /INCLUDE:";
+ M.getNameWithPrefix(OS, GV, false);
+}
+
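A hedged sketch of the intended use (the UsedGlobals collection, helper, and symbol name are illustrative): each global in llvm.used becomes one /INCLUDE: directive when targeting the known-MSVC environment.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/Triple.h"
    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/Mangler.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>
    using namespace llvm;

    // Collect linker directives for a set of "used" globals, e.g. producing
    // " /INCLUDE:?f@@YAXXZ" for an MSVC-mangled function f.
    static std::string collectUsedDirectives(ArrayRef<GlobalValue *> UsedGlobals,
                                             const Triple &TT, Mangler &Mang) {
      std::string Flags;
      raw_string_ostream OS(Flags);
      for (GlobalValue *GV : UsedGlobals)
        emitLinkerFlagsForUsedCOFF(OS, GV, TT, Mang);
      return OS.str();
    }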
diff --git a/contrib/llvm/lib/IR/Metadata.cpp b/contrib/llvm/lib/IR/Metadata.cpp
index a148ab65fc83..83a22d95bd81 100644
--- a/contrib/llvm/lib/IR/Metadata.cpp
+++ b/contrib/llvm/lib/IR/Metadata.cpp
@@ -237,7 +237,7 @@ void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) {
// Copy out uses since UseMap will get touched below.
using UseTy = std::pair<void *, std::pair<OwnerTy, uint64_t>>;
SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end());
- std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
+ llvm::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
return L.second.second < R.second.second;
});
for (const auto &Pair : Uses) {
@@ -290,7 +290,7 @@ void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) {
// Copy out uses since UseMap could get touched below.
using UseTy = std::pair<void *, std::pair<OwnerTy, uint64_t>>;
SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end());
- std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
+ llvm::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
return L.second.second < R.second.second;
});
UseMap.clear();
@@ -329,12 +329,20 @@ bool ReplaceableMetadataImpl::isReplaceable(const Metadata &MD) {
return dyn_cast<ValueAsMetadata>(&MD);
}
-static Function *getLocalFunction(Value *V) {
+static DISubprogram *getLocalFunctionMetadata(Value *V) {
assert(V && "Expected value");
- if (auto *A = dyn_cast<Argument>(V))
- return A->getParent();
- if (BasicBlock *BB = cast<Instruction>(V)->getParent())
- return BB->getParent();
+ if (auto *A = dyn_cast<Argument>(V)) {
+ if (auto *Fn = A->getParent())
+ return Fn->getSubprogram();
+ return nullptr;
+ }
+
+ if (BasicBlock *BB = cast<Instruction>(V)->getParent()) {
+ if (auto *Fn = BB->getParent())
+ return Fn->getSubprogram();
+ return nullptr;
+ }
+
return nullptr;
}
@@ -410,9 +418,9 @@ void ValueAsMetadata::handleRAUW(Value *From, Value *To) {
delete MD;
return;
}
- if (getLocalFunction(From) && getLocalFunction(To) &&
- getLocalFunction(From) != getLocalFunction(To)) {
- // Function changed.
+ if (getLocalFunctionMetadata(From) && getLocalFunctionMetadata(To) &&
+ getLocalFunctionMetadata(From) != getLocalFunctionMetadata(To)) {
+ // DISubprogram changed.
MD->replaceAllUsesWith(nullptr);
delete MD;
return;
@@ -1102,14 +1110,14 @@ void MDAttachmentMap::set(unsigned ID, MDNode &MD) {
std::make_tuple(&MD));
}
-void MDAttachmentMap::erase(unsigned ID) {
+bool MDAttachmentMap::erase(unsigned ID) {
if (empty())
- return;
+ return false;
// Common case is one/last value.
if (Attachments.back().first == ID) {
Attachments.pop_back();
- return;
+ return true;
}
for (auto I = Attachments.begin(), E = std::prev(Attachments.end()); I != E;
@@ -1117,8 +1125,10 @@ void MDAttachmentMap::erase(unsigned ID) {
if (I->first == ID) {
*I = std::move(Attachments.back());
Attachments.pop_back();
- return;
+ return true;
}
+
+ return false;
}
MDNode *MDAttachmentMap::lookup(unsigned ID) const {
@@ -1141,29 +1151,31 @@ void MDGlobalAttachmentMap::insert(unsigned ID, MDNode &MD) {
Attachments.push_back({ID, TrackingMDNodeRef(&MD)});
}
+MDNode *MDGlobalAttachmentMap::lookup(unsigned ID) const {
+ for (const auto &A : Attachments)
+ if (A.MDKind == ID)
+ return A.Node;
+ return nullptr;
+}
+
void MDGlobalAttachmentMap::get(unsigned ID,
- SmallVectorImpl<MDNode *> &Result) {
- for (auto A : Attachments)
+ SmallVectorImpl<MDNode *> &Result) const {
+ for (const auto &A : Attachments)
if (A.MDKind == ID)
Result.push_back(A.Node);
}
-void MDGlobalAttachmentMap::erase(unsigned ID) {
- auto Follower = Attachments.begin();
- for (auto Leader = Attachments.begin(), E = Attachments.end(); Leader != E;
- ++Leader) {
- if (Leader->MDKind != ID) {
- if (Follower != Leader)
- *Follower = std::move(*Leader);
- ++Follower;
- }
- }
- Attachments.resize(Follower - Attachments.begin());
+bool MDGlobalAttachmentMap::erase(unsigned ID) {
+ auto I = std::remove_if(Attachments.begin(), Attachments.end(),
+ [ID](const Attachment &A) { return A.MDKind == ID; });
+ bool Changed = I != Attachments.end();
+ Attachments.erase(I, Attachments.end());
+ return Changed;
}
void MDGlobalAttachmentMap::getAll(
SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const {
- for (auto &A : Attachments)
+ for (const auto &A : Attachments)
Result.emplace_back(A.MDKind, A.Node);
// Sort the resulting array so it is stable with respect to metadata IDs. We
@@ -1390,15 +1402,16 @@ void GlobalObject::addMetadata(StringRef Kind, MDNode &MD) {
addMetadata(getContext().getMDKindID(Kind), MD);
}
-void GlobalObject::eraseMetadata(unsigned KindID) {
+bool GlobalObject::eraseMetadata(unsigned KindID) {
// Nothing to unset.
if (!hasMetadata())
- return;
+ return false;
auto &Store = getContext().pImpl->GlobalObjectMetadata[this];
- Store.erase(KindID);
+ bool Changed = Store.erase(KindID);
if (Store.empty())
clearMetadata();
+ return Changed;
}
void GlobalObject::getAllMetadata(
@@ -1429,11 +1442,9 @@ void GlobalObject::setMetadata(StringRef Kind, MDNode *N) {
}
MDNode *GlobalObject::getMetadata(unsigned KindID) const {
- SmallVector<MDNode *, 1> MDs;
- getMetadata(KindID, MDs);
- if (MDs.empty())
- return nullptr;
- return MDs[0];
+ if (hasMetadata())
+ return getContext().pImpl->GlobalObjectMetadata[this].lookup(KindID);
+ return nullptr;
}
MDNode *GlobalObject::getMetadata(StringRef Kind) const {
diff --git a/contrib/llvm/lib/IR/Module.cpp b/contrib/llvm/lib/IR/Module.cpp
index c230a50044c7..f18024063533 100644
--- a/contrib/llvm/lib/IR/Module.cpp
+++ b/contrib/llvm/lib/IR/Module.cpp
@@ -464,6 +464,13 @@ unsigned Module::getCodeViewFlag() const {
return cast<ConstantInt>(Val->getValue())->getZExtValue();
}
+unsigned Module::getInstructionCount() {
+ unsigned NumInstrs = 0;
+ for (Function &F : FunctionList)
+ NumInstrs += F.getInstructionCount();
+ return NumInstrs;
+}
+
Comdat *Module::getOrInsertComdat(StringRef Name) {
auto &Entry = *ComdatSymTab.insert(std::make_pair(Name, Comdat())).first;
Entry.second.Name = &Entry;
@@ -510,6 +517,15 @@ void Module::setOwnedMemoryBuffer(std::unique_ptr<MemoryBuffer> MB) {
OwnedMemoryBuffer = std::move(MB);
}
+bool Module::getRtLibUseGOT() const {
+ auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("RtLibUseGOT"));
+ return Val && (cast<ConstantInt>(Val->getValue())->getZExtValue() > 0);
+}
+
+void Module::setRtLibUseGOT() {
+ addModuleFlag(ModFlagBehavior::Max, "RtLibUseGOT", 1);
+}
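A minimal round-trip sketch of the new accessors (the caller is hypothetical): the setter stores the flag as module-level metadata ("RtLibUseGOT", Max behavior, value 1) and the getter reads it back as a boolean.

    #include "llvm/IR/Module.h"
    using namespace llvm;

    static bool demoRtLibUseGOT(Module &M) {
      M.setRtLibUseGOT();        // front end requests GOT-indirect libcalls
      return M.getRtLibUseGOT(); // consumers (e.g. lowering) now see true
    }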
+
GlobalVariable *llvm::collectUsedGlobalVariables(
const Module &M, SmallPtrSetImpl<GlobalValue *> &Set, bool CompilerUsed) {
const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used";
diff --git a/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp b/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp
index 51c4bae3332e..4c4466f9a902 100644
--- a/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -13,9 +13,23 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+FunctionSummary FunctionSummary::ExternalNode =
+ FunctionSummary::makeDummyFunctionSummary({});
+bool ValueInfo::isDSOLocal() const {
+ // Need to check all summaries are local in case of hash collisions.
+ return getSummaryList().size() &&
+ llvm::all_of(getSummaryList(),
+ [](const std::unique_ptr<GlobalValueSummary> &Summary) {
+ return Summary->isDSOLocal();
+ });
+}
+
// Collect for the given module the list of function it defines
// (GUID -> Summary).
void ModuleSummaryIndex::collectDefinedFunctionsForModule(
@@ -69,3 +83,267 @@ bool ModuleSummaryIndex::isGUIDLive(GlobalValue::GUID GUID) const {
return true;
return false;
}
+
+// TODO: write a graphviz dumper for SCCs (see ModuleSummaryIndex::exportToDot)
+// then delete this function and update its tests
+LLVM_DUMP_METHOD
+void ModuleSummaryIndex::dumpSCCs(raw_ostream &O) {
+ for (scc_iterator<ModuleSummaryIndex *> I =
+ scc_begin<ModuleSummaryIndex *>(this);
+ !I.isAtEnd(); ++I) {
+ O << "SCC (" << utostr(I->size()) << " node" << (I->size() == 1 ? "" : "s")
+ << ") {\n";
+ for (const ValueInfo V : *I) {
+ FunctionSummary *F = nullptr;
+ if (V.getSummaryList().size())
+ F = cast<FunctionSummary>(V.getSummaryList().front().get());
+ O << " " << (F == nullptr ? "External" : "") << " " << utostr(V.getGUID())
+ << (I.hasLoop() ? " (has loop)" : "") << "\n";
+ }
+ O << "}\n";
+ }
+}
+
+namespace {
+struct Attributes {
+ void add(const Twine &Name, const Twine &Value,
+ const Twine &Comment = Twine());
+ std::string getAsString() const;
+
+ std::vector<std::string> Attrs;
+ std::string Comments;
+};
+
+struct Edge {
+ uint64_t SrcMod;
+ int Hotness;
+ GlobalValue::GUID Src;
+ GlobalValue::GUID Dst;
+};
+}
+
+void Attributes::add(const Twine &Name, const Twine &Value,
+ const Twine &Comment) {
+ std::string A = Name.str();
+ A += "=\"";
+ A += Value.str();
+ A += "\"";
+ Attrs.push_back(A);
+ if (!Comment.isTriviallyEmpty()) {
+ if (Comments.empty())
+ Comments = " // ";
+ else
+ Comments += ", ";
+ Comments += Comment.str();
+ }
+}
+
+std::string Attributes::getAsString() const {
+ if (Attrs.empty())
+ return "";
+
+ std::string Ret = "[";
+ for (auto &A : Attrs)
+ Ret += A + ",";
+ Ret.pop_back();
+ Ret += "];";
+ Ret += Comments;
+ return Ret;
+}
+
+static std::string linkageToString(GlobalValue::LinkageTypes LT) {
+ switch (LT) {
+ case GlobalValue::ExternalLinkage:
+ return "extern";
+ case GlobalValue::AvailableExternallyLinkage:
+ return "av_ext";
+ case GlobalValue::LinkOnceAnyLinkage:
+ return "linkonce";
+ case GlobalValue::LinkOnceODRLinkage:
+ return "linkonce_odr";
+ case GlobalValue::WeakAnyLinkage:
+ return "weak";
+ case GlobalValue::WeakODRLinkage:
+ return "weak_odr";
+ case GlobalValue::AppendingLinkage:
+ return "appending";
+ case GlobalValue::InternalLinkage:
+ return "internal";
+ case GlobalValue::PrivateLinkage:
+ return "private";
+ case GlobalValue::ExternalWeakLinkage:
+ return "extern_weak";
+ case GlobalValue::CommonLinkage:
+ return "common";
+ }
+
+ return "<unknown>";
+}
+
+static std::string fflagsToString(FunctionSummary::FFlags F) {
+ auto FlagValue = [](unsigned V) { return V ? '1' : '0'; };
+ char FlagRep[] = {FlagValue(F.ReadNone), FlagValue(F.ReadOnly),
+ FlagValue(F.NoRecurse), FlagValue(F.ReturnDoesNotAlias), 0};
+
+ return FlagRep;
+}
+
+// Get string representation of function instruction count and flags.
+static std::string getSummaryAttributes(GlobalValueSummary* GVS) {
+ auto *FS = dyn_cast_or_null<FunctionSummary>(GVS);
+ if (!FS)
+ return "";
+
+ return std::string("inst: ") + std::to_string(FS->instCount()) +
+ ", ffl: " + fflagsToString(FS->fflags());
+}
+
+static std::string getNodeVisualName(const ValueInfo &VI) {
+ return VI.name().empty() ? std::string("@") + std::to_string(VI.getGUID())
+ : VI.name().str();
+}
+
+static std::string getNodeLabel(const ValueInfo &VI, GlobalValueSummary *GVS) {
+ if (isa<AliasSummary>(GVS))
+ return getNodeVisualName(VI);
+
+ std::string Attrs = getSummaryAttributes(GVS);
+ std::string Label =
+ getNodeVisualName(VI) + "|" + linkageToString(GVS->linkage());
+ if (!Attrs.empty())
+ Label += std::string(" (") + Attrs + ")";
+ Label += "}";
+
+ return Label;
+}
+
+// Write definition of external node, which doesn't have any
+// specific module associated with it. Typically this is a function
+// or variable defined in a native object file or library.
+static void defineExternalNode(raw_ostream &OS, const char *Pfx,
+ const ValueInfo &VI) {
+ auto StrId = std::to_string(VI.getGUID());
+ OS << " " << StrId << " [label=\"" << getNodeVisualName(VI)
+ << "\"]; // defined externally\n";
+}
+
+void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
+ std::vector<Edge> CrossModuleEdges;
+ DenseMap<GlobalValue::GUID, std::vector<uint64_t>> NodeMap;
+ StringMap<GVSummaryMapTy> ModuleToDefinedGVS;
+ collectDefinedGVSummariesPerModule(ModuleToDefinedGVS);
+
+ // Get node identifier in form MXXX_<GUID>. The MXXX prefix is required,
+ // because we may have multiple linkonce function summaries.
+ auto NodeId = [](uint64_t ModId, GlobalValue::GUID Id) {
+ return ModId == (uint64_t)-1 ? std::to_string(Id)
+ : std::string("M") + std::to_string(ModId) +
+ "_" + std::to_string(Id);
+ };
+
+ auto DrawEdge = [&](const char *Pfx, int SrcMod, GlobalValue::GUID SrcId,
+ int DstMod, GlobalValue::GUID DstId, int TypeOrHotness) {
+ // 0 corresponds to alias edge, 1 to ref edge, 2 to call with unknown
+ // hotness, ...
+ TypeOrHotness += 2;
+ static const char *EdgeAttrs[] = {
+ " [style=dotted]; // alias",
+ " [style=dashed]; // ref",
+ " // call (hotness : Unknown)",
+ " [color=blue]; // call (hotness : Cold)",
+ " // call (hotness : None)",
+ " [color=brown]; // call (hotness : Hot)",
+ " [style=bold,color=red]; // call (hotness : Critical)"};
+
+ assert(static_cast<size_t>(TypeOrHotness) <
+ sizeof(EdgeAttrs) / sizeof(EdgeAttrs[0]));
+ OS << Pfx << NodeId(SrcMod, SrcId) << " -> " << NodeId(DstMod, DstId)
+ << EdgeAttrs[TypeOrHotness] << "\n";
+ };
+
+ OS << "digraph Summary {\n";
+ for (auto &ModIt : ModuleToDefinedGVS) {
+ auto ModId = getModuleId(ModIt.first());
+ OS << " // Module: " << ModIt.first() << "\n";
+ OS << " subgraph cluster_" << std::to_string(ModId) << " {\n";
+ OS << " style = filled;\n";
+ OS << " color = lightgrey;\n";
+ OS << " label = \"" << sys::path::filename(ModIt.first()) << "\";\n";
+ OS << " node [style=filled,fillcolor=lightblue];\n";
+
+ auto &GVSMap = ModIt.second;
+ auto Draw = [&](GlobalValue::GUID IdFrom, GlobalValue::GUID IdTo, int Hotness) {
+ if (!GVSMap.count(IdTo)) {
+ CrossModuleEdges.push_back({ModId, Hotness, IdFrom, IdTo});
+ return;
+ }
+ DrawEdge(" ", ModId, IdFrom, ModId, IdTo, Hotness);
+ };
+
+ for (auto &SummaryIt : GVSMap) {
+ NodeMap[SummaryIt.first].push_back(ModId);
+ auto Flags = SummaryIt.second->flags();
+ Attributes A;
+ if (isa<FunctionSummary>(SummaryIt.second)) {
+ A.add("shape", "record", "function");
+ } else if (isa<AliasSummary>(SummaryIt.second)) {
+ A.add("style", "dotted,filled", "alias");
+ A.add("shape", "box");
+ } else {
+ A.add("shape", "Mrecord", "variable");
+ }
+
+ auto VI = getValueInfo(SummaryIt.first);
+ A.add("label", getNodeLabel(VI, SummaryIt.second));
+ if (!Flags.Live)
+ A.add("fillcolor", "red", "dead");
+ else if (Flags.NotEligibleToImport)
+ A.add("fillcolor", "yellow", "not eligible to import");
+
+ OS << " " << NodeId(ModId, SummaryIt.first) << " " << A.getAsString()
+ << "\n";
+ }
+ OS << " // Edges:\n";
+
+ for (auto &SummaryIt : GVSMap) {
+ auto *GVS = SummaryIt.second;
+ for (auto &R : GVS->refs())
+ Draw(SummaryIt.first, R.getGUID(), -1);
+
+ if (auto *AS = dyn_cast_or_null<AliasSummary>(SummaryIt.second)) {
+ auto AliaseeOrigId = AS->getAliasee().getOriginalName();
+ auto AliaseeId = getGUIDFromOriginalID(AliaseeOrigId);
+
+ Draw(SummaryIt.first, AliaseeId ? AliaseeId : AliaseeOrigId, -2);
+ continue;
+ }
+
+ if (auto *FS = dyn_cast_or_null<FunctionSummary>(SummaryIt.second))
+ for (auto &CGEdge : FS->calls())
+ Draw(SummaryIt.first, CGEdge.first.getGUID(),
+ static_cast<int>(CGEdge.second.Hotness));
+ }
+ OS << " }\n";
+ }
+
+ OS << " // Cross-module edges:\n";
+ for (auto &E : CrossModuleEdges) {
+ auto &ModList = NodeMap[E.Dst];
+ if (ModList.empty()) {
+ defineExternalNode(OS, " ", getValueInfo(E.Dst));
+ // Add fake module to the list to draw an edge to an external node
+ // in the loop below.
+ ModList.push_back(-1);
+ }
+ for (auto DstMod : ModList)
+ // The edge representing call or ref is drawn to every module where target
+ // symbol is defined. When target is a linkonce symbol there can be
+ // multiple edges representing a single call or ref, both intra-module and
+ // cross-module. Since all intra-module edges were already drawn above, we
+ // skip them here.
+ if (DstMod != E.SrcMod)
+ DrawEdge(" ", E.SrcMod, E.Src, DstMod, E.Dst, E.Hotness);
+ }
+
+ OS << "}";
+}
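A hedged driver sketch (the file name, open flag, and helper are assumptions, not part of this change) for writing the graph produced above:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Support/FileSystem.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Dump a summary index as Graphviz input; render with: dot -Tsvg summary.dot
    static void dumpIndexToDot(const ModuleSummaryIndex &Index, StringRef Path) {
      std::error_code EC;
      raw_fd_ostream OS(Path, EC, sys::fs::F_Text);
      if (!EC)
        Index.exportToDot(OS);
    }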
diff --git a/contrib/llvm/lib/IR/Operator.cpp b/contrib/llvm/lib/IR/Operator.cpp
index 7d819f3aae8d..5b4c7524b672 100644
--- a/contrib/llvm/lib/IR/Operator.cpp
+++ b/contrib/llvm/lib/IR/Operator.cpp
@@ -35,8 +35,8 @@ Type *GEPOperator::getResultElementType() const {
bool GEPOperator::accumulateConstantOffset(const DataLayout &DL,
APInt &Offset) const {
assert(Offset.getBitWidth() ==
- DL.getPointerSizeInBits(getPointerAddressSpace()) &&
- "The offset must have exactly as many bits as our pointer.");
+ DL.getIndexSizeInBits(getPointerAddressSpace()) &&
+ "The offset bit width does not match DL specification.");
for (gep_type_iterator GTI = gep_type_begin(this), GTE = gep_type_end(this);
GTI != GTE; ++GTI) {
diff --git a/contrib/llvm/lib/IR/OptBisect.cpp b/contrib/llvm/lib/IR/OptBisect.cpp
index dc7dcd2e4a97..c79e1fc2b0b4 100644
--- a/contrib/llvm/lib/IR/OptBisect.cpp
+++ b/contrib/llvm/lib/IR/OptBisect.cpp
@@ -36,7 +36,7 @@ static cl::opt<int> OptBisectLimit("opt-bisect-limit", cl::Hidden,
cl::Optional,
cl::desc("Maximum optimization to perform"));
-OptBisect::OptBisect() {
+OptBisect::OptBisect() : OptPassGate() {
BisectEnabled = OptBisectLimit != std::numeric_limits<int>::max();
}
@@ -92,19 +92,28 @@ static std::string getDescription(const CallGraphSCC &SCC) {
return Desc;
}
-// Force instantiations.
-template bool OptBisect::shouldRunPass(const Pass *, const Module &);
-template bool OptBisect::shouldRunPass(const Pass *, const Function &);
-template bool OptBisect::shouldRunPass(const Pass *, const BasicBlock &);
-template bool OptBisect::shouldRunPass(const Pass *, const Loop &);
-template bool OptBisect::shouldRunPass(const Pass *, const CallGraphSCC &);
-template bool OptBisect::shouldRunPass(const Pass *, const Region &);
-
-template <class UnitT>
-bool OptBisect::shouldRunPass(const Pass *P, const UnitT &U) {
- if (!BisectEnabled)
- return true;
- return checkPass(P->getPassName(), getDescription(U));
+bool OptBisect::shouldRunPass(const Pass *P, const Module &U) {
+ return !BisectEnabled || checkPass(P->getPassName(), getDescription(U));
+}
+
+bool OptBisect::shouldRunPass(const Pass *P, const Function &U) {
+ return !BisectEnabled || checkPass(P->getPassName(), getDescription(U));
+}
+
+bool OptBisect::shouldRunPass(const Pass *P, const BasicBlock &U) {
+ return !BisectEnabled || checkPass(P->getPassName(), getDescription(U));
+}
+
+bool OptBisect::shouldRunPass(const Pass *P, const Region &U) {
+ return !BisectEnabled || checkPass(P->getPassName(), getDescription(U));
+}
+
+bool OptBisect::shouldRunPass(const Pass *P, const Loop &U) {
+ return !BisectEnabled || checkPass(P->getPassName(), getDescription(U));
+}
+
+bool OptBisect::shouldRunPass(const Pass *P, const CallGraphSCC &U) {
+ return !BisectEnabled || checkPass(P->getPassName(), getDescription(U));
}
bool OptBisect::checkPass(const StringRef PassName,
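The bisection workflow driven by the opt-bisect-limit option above is unchanged by this refactoring: a run such as opt -O2 -opt-bisect-limit=300 input.ll -o output.bc (file names illustrative) still disables every optional pass past the given index; the new OptPassGate interface simply lets other gates be plugged in through LLVMContext::setOptPassGate.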
diff --git a/contrib/llvm/lib/IR/Pass.cpp b/contrib/llvm/lib/IR/Pass.cpp
index 5e0b59476c4b..a1dc17882493 100644
--- a/contrib/llvm/lib/IR/Pass.cpp
+++ b/contrib/llvm/lib/IR/Pass.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Pass.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
@@ -56,7 +57,7 @@ PassManagerType ModulePass::getPotentialPassManagerType() const {
}
bool ModulePass::skipModule(Module &M) const {
- return !M.getContext().getOptBisect().shouldRunPass(this, M);
+ return !M.getContext().getOptPassGate().shouldRunPass(this, M);
}
bool Pass::mustPreserveAnalysisID(char &AID) const {
@@ -155,12 +156,12 @@ PassManagerType FunctionPass::getPotentialPassManagerType() const {
}
bool FunctionPass::skipFunction(const Function &F) const {
- if (!F.getContext().getOptBisect().shouldRunPass(this, F))
+ if (!F.getContext().getOptPassGate().shouldRunPass(this, F))
return true;
if (F.hasFnAttribute(Attribute::OptimizeNone)) {
- DEBUG(dbgs() << "Skipping pass '" << getPassName() << "' on function "
- << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping pass '" << getPassName() << "' on function "
+ << F.getName() << "\n");
return true;
}
return false;
@@ -189,13 +190,13 @@ bool BasicBlockPass::skipBasicBlock(const BasicBlock &BB) const {
const Function *F = BB.getParent();
if (!F)
return false;
- if (!F->getContext().getOptBisect().shouldRunPass(this, BB))
+ if (!F->getContext().getOptPassGate().shouldRunPass(this, BB))
return true;
if (F->hasFnAttribute(Attribute::OptimizeNone)) {
// Report this only once per function.
if (&BB == &F->getEntryBlock())
- DEBUG(dbgs() << "Skipping pass '" << getPassName()
- << "' on function " << F->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping pass '" << getPassName()
+ << "' on function " << F->getName() << "\n");
return true;
}
return false;
diff --git a/contrib/llvm/lib/IR/ProfileSummary.cpp b/contrib/llvm/lib/IR/ProfileSummary.cpp
index 2b24d1251121..491fe834df9a 100644
--- a/contrib/llvm/lib/IR/ProfileSummary.cpp
+++ b/contrib/llvm/lib/IR/ProfileSummary.cpp
@@ -69,18 +69,16 @@ Metadata *ProfileSummary::getDetailedSummaryMD(LLVMContext &Context) {
// "SampleProfile"). The rest of the elements of the outer MDTuple are specific
// to the kind of profile summary as returned by getFormatSpecificMD.
Metadata *ProfileSummary::getMD(LLVMContext &Context) {
- std::vector<Metadata *> Components;
- Components.push_back(getKeyValMD(Context, "ProfileFormat", KindStr[PSK]));
-
- Components.push_back(getKeyValMD(Context, "TotalCount", getTotalCount()));
- Components.push_back(getKeyValMD(Context, "MaxCount", getMaxCount()));
- Components.push_back(
- getKeyValMD(Context, "MaxInternalCount", getMaxInternalCount()));
- Components.push_back(
- getKeyValMD(Context, "MaxFunctionCount", getMaxFunctionCount()));
- Components.push_back(getKeyValMD(Context, "NumCounts", getNumCounts()));
- Components.push_back(getKeyValMD(Context, "NumFunctions", getNumFunctions()));
- Components.push_back(getDetailedSummaryMD(Context));
+ Metadata *Components[] = {
+ getKeyValMD(Context, "ProfileFormat", KindStr[PSK]),
+ getKeyValMD(Context, "TotalCount", getTotalCount()),
+ getKeyValMD(Context, "MaxCount", getMaxCount()),
+ getKeyValMD(Context, "MaxInternalCount", getMaxInternalCount()),
+ getKeyValMD(Context, "MaxFunctionCount", getMaxFunctionCount()),
+ getKeyValMD(Context, "NumCounts", getNumCounts()),
+ getKeyValMD(Context, "NumFunctions", getNumFunctions()),
+ getDetailedSummaryMD(Context),
+ };
return MDTuple::get(Context, Components);
}
@@ -144,12 +142,8 @@ static bool getSummaryFromMD(MDTuple *MD, SummaryEntryVector &Summary) {
}
ProfileSummary *ProfileSummary::getFromMD(Metadata *MD) {
- if (!MD)
- return nullptr;
- if (!isa<MDTuple>(MD))
- return nullptr;
- MDTuple *Tuple = cast<MDTuple>(MD);
- if (Tuple->getNumOperands() != 8)
+ MDTuple *Tuple = dyn_cast_or_null<MDTuple>(MD);
+ if (!Tuple || Tuple->getNumOperands() != 8)
return nullptr;
auto &FormatMD = Tuple->getOperand(0);
@@ -185,7 +179,7 @@ ProfileSummary *ProfileSummary::getFromMD(Metadata *MD) {
SummaryEntryVector Summary;
if (!getSummaryFromMD(dyn_cast<MDTuple>(Tuple->getOperand(7)), Summary))
return nullptr;
- return new ProfileSummary(SummaryKind, Summary, TotalCount, MaxCount,
- MaxInternalCount, MaxFunctionCount, NumCounts,
- NumFunctions);
+ return new ProfileSummary(SummaryKind, std::move(Summary), TotalCount,
+ MaxCount, MaxInternalCount, MaxFunctionCount,
+ NumCounts, NumFunctions);
}
diff --git a/contrib/llvm/lib/IR/SafepointIRVerifier.cpp b/contrib/llvm/lib/IR/SafepointIRVerifier.cpp
index 04deb434cec2..6f73126be738 100644
--- a/contrib/llvm/lib/IR/SafepointIRVerifier.cpp
+++ b/contrib/llvm/lib/IR/SafepointIRVerifier.cpp
@@ -59,23 +59,162 @@ using namespace llvm;
static cl::opt<bool> PrintOnly("safepoint-ir-verifier-print-only",
cl::init(false));
-static void Verify(const Function &F, const DominatorTree &DT);
+namespace {
+
+/// This CFGDeadness analysis finds dead blocks and edges. The algorithm starts
+/// with the set of blocks unreachable from the entry block and then propagates
+/// deadness through foldable conditional branches without modifying the CFG.
+/// GVN performs a similar propagation, but it changes the CFG by splitting
+/// critical edges. Most passes rely on SimplifyCFG to clean up dead blocks,
+/// but in some cases, such as verification or loop passes, that is not
+/// possible.
+class CFGDeadness {
+ const DominatorTree *DT = nullptr;
+ SetVector<const BasicBlock *> DeadBlocks;
+ SetVector<const Use *> DeadEdges; // Contains all dead edges from live blocks.
+
+public:
+ /// Return the edge that coresponds to the predecessor.
+ static const Use& getEdge(const_pred_iterator &PredIt) {
+ auto &PU = PredIt.getUse();
+ return PU.getUser()->getOperandUse(PU.getOperandNo());
+ }
+
+ /// Return true if there is at least one live edge that corresponds to the
+ /// basic block InBB listed in the phi node.
+ bool hasLiveIncomingEdge(const PHINode *PN, const BasicBlock *InBB) const {
+ assert(!isDeadBlock(InBB) && "block must be live");
+ const BasicBlock* BB = PN->getParent();
+ bool Listed = false;
+ for (const_pred_iterator PredIt(BB), End(BB, true); PredIt != End; ++PredIt) {
+ if (InBB == *PredIt) {
+ if (!isDeadEdge(&getEdge(PredIt)))
+ return true;
+ Listed = true;
+ }
+ }
+ assert(Listed && "basic block is not found among incoming blocks");
+ return false;
+ }
+
+
+ bool isDeadBlock(const BasicBlock *BB) const {
+ return DeadBlocks.count(BB);
+ }
+
+ bool isDeadEdge(const Use *U) const {
+ assert(dyn_cast<Instruction>(U->getUser())->isTerminator() &&
+ "edge must be operand of terminator");
+ assert(cast_or_null<BasicBlock>(U->get()) &&
+ "edge must refer to basic block");
+ assert(!isDeadBlock(dyn_cast<Instruction>(U->getUser())->getParent()) &&
+ "isDeadEdge() must be applied to edge from live block");
+ return DeadEdges.count(U);
+ }
+
+ bool hasLiveIncomingEdges(const BasicBlock *BB) const {
+ // Check if all incoming edges are dead.
+ for (const_pred_iterator PredIt(BB), End(BB, true); PredIt != End; ++PredIt) {
+ auto &PU = PredIt.getUse();
+ const Use &U = PU.getUser()->getOperandUse(PU.getOperandNo());
+ if (!isDeadBlock(*PredIt) && !isDeadEdge(&U))
+ return true; // Found a live edge.
+ }
+ return false;
+ }
+
+ void processFunction(const Function &F, const DominatorTree &DT) {
+ this->DT = &DT;
+
+ // Start with all blocks unreachable from entry.
+ for (const BasicBlock &BB : F)
+ if (!DT.isReachableFromEntry(&BB))
+ DeadBlocks.insert(&BB);
+
+ // Top-down walk of the blocks in reverse post-order.
+ ReversePostOrderTraversal<const Function *> RPOT(&F);
+ for (const BasicBlock *BB : RPOT) {
+ const TerminatorInst *TI = BB->getTerminator();
+ assert(TI && "blocks must be well formed");
+
+ // For conditional branches, we can perform simple conditional propagation on
+ // the condition value itself.
+ const BranchInst *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional() || !isa<Constant>(BI->getCondition()))
+ continue;
+
+ // If a branch has two identical successors, we cannot declare either dead.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ continue;
+
+ addDeadEdge(BI->getOperandUse(Cond->getZExtValue() ? 1 : 2));
+ }
+ }
+
+protected:
+ void addDeadBlock(const BasicBlock *BB) {
+ SmallVector<const BasicBlock *, 4> NewDead;
+ SmallSetVector<const BasicBlock *, 4> DF;
+
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ const BasicBlock *D = NewDead.pop_back_val();
+ if (isDeadBlock(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(const_cast<BasicBlock*>(D), Dom);
+ // There is no need to mark the in and out edges dead as well;
+ // marking the blocks themselves dead is enough for the rest of
+ // the analysis.
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
+ for (BasicBlock *B : Dom)
+ for (BasicBlock *S : successors(B))
+ if (!isDeadBlock(S) && !hasLiveIncomingEdges(S))
+ NewDead.push_back(S);
+ }
+ }
+
+ void addDeadEdge(const Use &DeadEdge) {
+ if (!DeadEdges.insert(&DeadEdge))
+ return;
+
+ BasicBlock *BB = cast_or_null<BasicBlock>(DeadEdge.get());
+ if (hasLiveIncomingEdges(BB))
+ return;
+
+ addDeadBlock(BB);
+ }
+};
+} // namespace
+
+static void Verify(const Function &F, const DominatorTree &DT,
+ const CFGDeadness &CD);
namespace {
+
struct SafepointIRVerifier : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
- DominatorTree DT;
SafepointIRVerifier() : FunctionPass(ID) {
initializeSafepointIRVerifierPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
- DT.recalculate(F);
- Verify(F, DT);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ CFGDeadness CD;
+ CD.processFunction(F, DT);
+ Verify(F, DT, CD);
return false; // no modifications
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(DominatorTreeWrapperPass::ID);
AU.setPreservesAll();
}
@@ -95,9 +234,10 @@ FunctionPass *llvm::createSafepointIRVerifierPass() {
}
INITIALIZE_PASS_BEGIN(SafepointIRVerifier, "verify-safepoint-ir",
- "Safepoint IR Verifier", false, true)
+ "Safepoint IR Verifier", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SafepointIRVerifier, "verify-safepoint-ir",
- "Safepoint IR Verifier", false, true)
+ "Safepoint IR Verifier", false, false)
static bool isGCPointerType(Type *T) {
if (auto *PT = dyn_cast<PointerType>(T))
@@ -292,6 +432,7 @@ class InstructionVerifier;
/// considered to be unrelocated and no false alarm will happen.
class GCPtrTracker {
const Function &F;
+ const CFGDeadness &CD;
SpecificBumpPtrAllocator<BasicBlockState> BSAllocator;
DenseMap<const BasicBlock *, BasicBlockState *> BlockMap;
// This set contains defs of unrelocated pointers that are proved to be legal
@@ -302,7 +443,12 @@ class GCPtrTracker {
DenseSet<const Value *> PoisonedDefs;
public:
- GCPtrTracker(const Function &F, const DominatorTree &DT);
+ GCPtrTracker(const Function &F, const DominatorTree &DT,
+ const CFGDeadness &CD);
+
+ bool hasLiveIncomingEdge(const PHINode *PN, const BasicBlock *InBB) const {
+ return CD.hasLiveIncomingEdge(PN, InBB);
+ }
BasicBlockState *getBasicBlockState(const BasicBlock *BB);
const BasicBlockState *getBasicBlockState(const BasicBlock *BB) const;
@@ -318,6 +464,11 @@ public:
static void verifyFunction(GCPtrTracker &&Tracker,
InstructionVerifier &Verifier);
+ /// Returns true for reachable and live blocks.
+ bool isMapped(const BasicBlock *BB) const {
+ return BlockMap.find(BB) != BlockMap.end();
+ }
+
private:
/// Returns true if the instruction may be safely skipped during verification.
bool instructionMayBeSkipped(const Instruction *I) const;
@@ -372,14 +523,17 @@ private:
};
} // end anonymous namespace
-GCPtrTracker::GCPtrTracker(const Function &F, const DominatorTree &DT) : F(F) {
- // First, calculate Contribution of each BB.
- for (const BasicBlock &BB : F) {
- BasicBlockState *BBS = new (BSAllocator.Allocate()) BasicBlockState;
- for (const auto &I : BB)
- transferInstruction(I, BBS->Cleared, BBS->Contribution);
- BlockMap[&BB] = BBS;
- }
+GCPtrTracker::GCPtrTracker(const Function &F, const DominatorTree &DT,
+ const CFGDeadness &CD) : F(F), CD(CD) {
+ // Calculate Contribution of each live BB.
+ // Allocate BB states for live blocks.
+ for (const BasicBlock &BB : F)
+ if (!CD.isDeadBlock(&BB)) {
+ BasicBlockState *BBS = new (BSAllocator.Allocate()) BasicBlockState;
+ for (const auto &I : BB)
+ transferInstruction(I, BBS->Cleared, BBS->Contribution);
+ BlockMap[&BB] = BBS;
+ }
// Initialize AvailableIn/Out sets of each BB using only information about
// dominating BBs.
@@ -396,9 +550,7 @@ GCPtrTracker::GCPtrTracker(const Function &F, const DominatorTree &DT) : F(F) {
BasicBlockState *GCPtrTracker::getBasicBlockState(const BasicBlock *BB) {
auto it = BlockMap.find(BB);
- assert(it != BlockMap.end() &&
- "No such BB in BlockMap! Probably BB from another function");
- return it->second;
+ return it != BlockMap.end() ? it->second : nullptr;
}
const BasicBlockState *GCPtrTracker::getBasicBlockState(
@@ -419,6 +571,9 @@ void GCPtrTracker::verifyFunction(GCPtrTracker &&Tracker,
ReversePostOrderTraversal<const Function *> RPOT(&Tracker.F);
for (const BasicBlock *BB : RPOT) {
BasicBlockState *BBS = Tracker.getBasicBlockState(BB);
+ if (!BBS)
+ continue;
+
// We destructively modify AvailableIn as we traverse the block instruction
// by instruction.
AvailableValueSet &AvailableSet = BBS->AvailableIn;
@@ -448,11 +603,17 @@ void GCPtrTracker::recalculateBBsStates() {
// The AvailableIn and AvailableOut sets decrease as we iterate.
while (!Worklist.empty()) {
const BasicBlock *BB = Worklist.pop_back_val();
- BasicBlockState *BBS = BlockMap[BB];
+ BasicBlockState *BBS = getBasicBlockState(BB);
+ if (!BBS)
+ continue; // Ignore dead successors.
size_t OldInCount = BBS->AvailableIn.size();
- for (const BasicBlock *PBB : predecessors(BB))
- set_intersect(BBS->AvailableIn, BlockMap[PBB]->AvailableOut);
+ for (const_pred_iterator PredIt(BB), End(BB, true); PredIt != End; ++PredIt) {
+ const BasicBlock *PBB = *PredIt;
+ BasicBlockState *PBBS = getBasicBlockState(PBB);
+ if (PBBS && !CD.isDeadEdge(&CFGDeadness::getEdge(PredIt)))
+ set_intersect(BBS->AvailableIn, PBBS->AvailableOut);
+ }
assert(OldInCount >= BBS->AvailableIn.size() && "invariant!");
@@ -491,6 +652,10 @@ bool GCPtrTracker::removeValidUnrelocatedDefs(const BasicBlock *BB,
bool HasUnrelocatedInputs = false;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
const BasicBlock *InBB = PN->getIncomingBlock(i);
+ if (!isMapped(InBB) ||
+ !CD.hasLiveIncomingEdge(PN, InBB))
+ continue; // Skip dead block or dead edge.
+
const Value *InValue = PN->getIncomingValue(i);
if (isNotExclusivelyConstantDerived(InValue)) {
@@ -535,16 +700,16 @@ bool GCPtrTracker::removeValidUnrelocatedDefs(const BasicBlock *BB,
Contribution.erase(&I);
PoisonedDefs.erase(&I);
ValidUnrelocatedDefs.insert(&I);
- DEBUG(dbgs() << "Removing urelocated " << I << " from Contribution of "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Removing urelocated " << I
+ << " from Contribution of " << BB->getName() << "\n");
ContributionChanged = true;
} else if (PoisonedPointerDef) {
// Mark pointer as poisoned, remove its def from Contribution and trigger
// update of all successors.
Contribution.erase(&I);
PoisonedDefs.insert(&I);
- DEBUG(dbgs() << "Removing poisoned " << I << " from Contribution of "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Removing poisoned " << I << " from Contribution of "
+ << BB->getName() << "\n");
ContributionChanged = true;
} else {
bool Cleared = false;
@@ -560,15 +725,18 @@ void GCPtrTracker::gatherDominatingDefs(const BasicBlock *BB,
const DominatorTree &DT) {
DomTreeNode *DTN = DT[const_cast<BasicBlock *>(BB)];
+ assert(DTN && "Unreachable blocks are ignored");
while (DTN->getIDom()) {
DTN = DTN->getIDom();
- const auto &Defs = BlockMap[DTN->getBlock()]->Contribution;
+ auto BBS = getBasicBlockState(DTN->getBlock());
+ assert(BBS && "immediate dominator cannot be dead for a live block");
+ const auto &Defs = BBS->Contribution;
Result.insert(Defs.begin(), Defs.end());
// If this block is 'Cleared', then nothing LiveIn to this block can be
// available after this block completes. Note: This turns out to be
// really important for reducing memory consumption of the initial available
// sets and thus peak memory usage by this verifier.
- if (BlockMap[DTN->getBlock()]->Cleared)
+ if (BBS->Cleared)
return;
}
@@ -594,11 +762,11 @@ void GCPtrTracker::transferBlock(const BasicBlock *BB, BasicBlockState &BBS,
AvailableOut = std::move(Temp);
}
- DEBUG(dbgs() << "Transfered block " << BB->getName() << " from ";
- PrintValueSet(dbgs(), AvailableIn.begin(), AvailableIn.end());
- dbgs() << " to ";
- PrintValueSet(dbgs(), AvailableOut.begin(), AvailableOut.end());
- dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "Transfered block " << BB->getName() << " from ";
+ PrintValueSet(dbgs(), AvailableIn.begin(), AvailableIn.end());
+ dbgs() << " to ";
+ PrintValueSet(dbgs(), AvailableOut.begin(), AvailableOut.end());
+ dbgs() << "\n";);
}
void GCPtrTracker::transferInstruction(const Instruction &I, bool &Cleared,
@@ -617,10 +785,15 @@ void InstructionVerifier::verifyInstruction(
if (containsGCPtrType(PN->getType()))
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
const BasicBlock *InBB = PN->getIncomingBlock(i);
+ const BasicBlockState *InBBS = Tracker->getBasicBlockState(InBB);
+ if (!InBBS ||
+ !Tracker->hasLiveIncomingEdge(PN, InBB))
+ continue; // Skip dead block or dead edge.
+
const Value *InValue = PN->getIncomingValue(i);
if (isNotExclusivelyConstantDerived(InValue) &&
- !Tracker->getBasicBlockState(InBB)->AvailableOut.count(InValue))
+ !InBBS->AvailableOut.count(InValue))
reportInvalidUse(*InValue, *PN);
}
} else if (isa<CmpInst>(I) &&
@@ -697,12 +870,14 @@ void InstructionVerifier::reportInvalidUse(const Value &V,
AnyInvalidUses = true;
}
-static void Verify(const Function &F, const DominatorTree &DT) {
- DEBUG(dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n");
+static void Verify(const Function &F, const DominatorTree &DT,
+ const CFGDeadness &CD) {
+ LLVM_DEBUG(dbgs() << "Verifying gc pointers in function: " << F.getName()
+ << "\n");
if (PrintOnly)
dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n";
- GCPtrTracker Tracker(F, DT);
+ GCPtrTracker Tracker(F, DT, CD);
// We now have all the information we need to decide if the use of a heap
// reference is legal or not, given our safepoint semantics.
diff --git a/contrib/llvm/lib/IR/Type.cpp b/contrib/llvm/lib/IR/Type.cpp
index 20e9c2b5fff2..83016496ff7e 100644
--- a/contrib/llvm/lib/IR/Type.cpp
+++ b/contrib/llvm/lib/IR/Type.cpp
@@ -60,9 +60,9 @@ bool Type::isIntegerTy(unsigned Bitwidth) const {
bool Type::canLosslesslyBitCastTo(Type *Ty) const {
// Identity cast means no change so return true
- if (this == Ty)
+ if (this == Ty)
return true;
-
+
// They are not convertible unless they are at least first class types
if (!this->isFirstClassType() || !Ty->isFirstClassType())
return false;
@@ -240,7 +240,7 @@ PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) {
IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) {
assert(NumBits >= MIN_INT_BITS && "bitwidth too small");
assert(NumBits <= MAX_INT_BITS && "bitwidth too large");
-
+
// Check for the built-in integer types
switch (NumBits) {
case 1: return cast<IntegerType>(Type::getInt1Ty(C));
@@ -252,12 +252,12 @@ IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) {
default:
break;
}
-
+
IntegerType *&Entry = C.pImpl->IntegerTypes[NumBits];
if (!Entry)
Entry = new (C.pImpl->TypeAllocator) IntegerType(C, NumBits);
-
+
return Entry;
}
@@ -333,7 +333,7 @@ bool FunctionType::isValidArgumentType(Type *ArgTy) {
// Primitive Constructors.
-StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
+StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
bool isPacked) {
LLVMContextImpl *pImpl = Context.pImpl;
AnonStructTypeKeyInfo::KeyTy Key(ETypes, isPacked);
@@ -355,7 +355,7 @@ StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
void StructType::setBody(ArrayRef<Type*> Elements, bool isPacked) {
assert(isOpaque() && "Struct body already set!");
-
+
setSubclassData(getSubclassData() | SCDB_HasBody);
if (isPacked)
setSubclassData(getSubclassData() | SCDB_Packed);
@@ -391,7 +391,7 @@ void StructType::setName(StringRef Name) {
}
return;
}
-
+
// Look up the entry for the name.
auto IterBool =
getContext().pImpl->NamedStructTypes.insert(std::make_pair(Name, this));
@@ -402,7 +402,7 @@ void StructType::setName(StringRef Name) {
TempStr.push_back('.');
raw_svector_ostream TmpStream(TempStr);
unsigned NameSize = Name.size();
-
+
do {
TempStr.resize(NameSize + 1);
TmpStream << getContext().pImpl->NamedStructTypesUniqueID++;
@@ -569,7 +569,7 @@ ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) {
assert(isValidElementType(ElementType) && "Invalid type for array element!");
LLVMContextImpl *pImpl = ElementType->getContext().pImpl;
- ArrayType *&Entry =
+ ArrayType *&Entry =
pImpl->ArrayTypes[std::make_pair(ElementType, NumElements)];
if (!Entry)
@@ -617,9 +617,9 @@ bool VectorType::isValidElementType(Type *ElemTy) {
PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
assert(EltTy && "Can't get a pointer to <null> type!");
assert(isValidElementType(EltTy) && "Invalid type for pointer element!");
-
+
LLVMContextImpl *CImpl = EltTy->getContext().pImpl;
-
+
// Since AddressSpace #0 is the common case, we special case it.
PointerType *&Entry = AddressSpace == 0 ? CImpl->PointerTypes[EltTy]
: CImpl->ASPointerTypes[std::make_pair(EltTy, AddressSpace)];
diff --git a/contrib/llvm/lib/IR/TypeFinder.cpp b/contrib/llvm/lib/IR/TypeFinder.cpp
index b39678a013fb..e9af78c71bfd 100644
--- a/contrib/llvm/lib/IR/TypeFinder.cpp
+++ b/contrib/llvm/lib/IR/TypeFinder.cpp
@@ -33,18 +33,16 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
OnlyNamed = onlyNamed;
// Get types from global variables.
- for (Module::const_global_iterator I = M.global_begin(),
- E = M.global_end(); I != E; ++I) {
- incorporateType(I->getType());
- if (I->hasInitializer())
- incorporateValue(I->getInitializer());
+ for (const auto &G : M.globals()) {
+ incorporateType(G.getType());
+ if (G.hasInitializer())
+ incorporateValue(G.getInitializer());
}
// Get types from aliases.
- for (Module::const_alias_iterator I = M.alias_begin(),
- E = M.alias_end(); I != E; ++I) {
- incorporateType(I->getType());
- if (const Value *Aliasee = I->getAliasee())
+ for (const auto &A : M.aliases()) {
+ incorporateType(A.getType());
+ if (const Value *Aliasee = A.getAliasee())
incorporateValue(Aliasee);
}
@@ -57,9 +55,8 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
incorporateValue(U.get());
// First incorporate the arguments.
- for (Function::const_arg_iterator AI = FI.arg_begin(), AE = FI.arg_end();
- AI != AE; ++AI)
- incorporateValue(&*AI);
+ for (const auto &A : FI.args())
+ incorporateValue(&A);
for (const BasicBlock &BB : FI)
for (const Instruction &I : BB) {
@@ -68,26 +65,21 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
// Incorporate non-instruction operand types. (We are incorporating all
// instructions with this loop.)
- for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
- OI != OE; ++OI)
- if (*OI && !isa<Instruction>(OI))
- incorporateValue(*OI);
+ for (const auto &O : I.operands())
+ if (&*O && !isa<Instruction>(&*O))
+ incorporateValue(&*O);
// Incorporate types hiding in metadata.
I.getAllMetadataOtherThanDebugLoc(MDForInst);
- for (unsigned i = 0, e = MDForInst.size(); i != e; ++i)
- incorporateMDNode(MDForInst[i].second);
-
+ for (const auto &MD : MDForInst)
+ incorporateMDNode(MD.second);
MDForInst.clear();
}
}
- for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
- E = M.named_metadata_end(); I != E; ++I) {
- const NamedMDNode *NMD = &*I;
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
- incorporateMDNode(NMD->getOperand(i));
- }
+ for (const auto &NMD : M.named_metadata())
+ for (const auto &MDOp : NMD.operands())
+ incorporateMDNode(MDOp);
}
void TypeFinder::clear() {
@@ -150,9 +142,8 @@ void TypeFinder::incorporateValue(const Value *V) {
// Look in operands for types.
const User *U = cast<User>(V);
- for (Constant::const_op_iterator I = U->op_begin(),
- E = U->op_end(); I != E;++I)
- incorporateValue(*I);
+ for (const auto &I : U->operands())
+ incorporateValue(&*I);
}
/// incorporateMDNode - This method is used to walk the operands of an MDNode to
@@ -163,8 +154,7 @@ void TypeFinder::incorporateMDNode(const MDNode *V) {
return;
// Look in operands for types.
- for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i) {
- Metadata *Op = V->getOperand(i);
+ for (Metadata *Op : V->operands()) {
if (!Op)
continue;
if (auto *N = dyn_cast<MDNode>(Op)) {
diff --git a/contrib/llvm/lib/IR/Value.cpp b/contrib/llvm/lib/IR/Value.cpp
index 163c785f5d76..295d6ecf0db0 100644
--- a/contrib/llvm/lib/IR/Value.cpp
+++ b/contrib/llvm/lib/IR/Value.cpp
@@ -39,6 +39,10 @@
using namespace llvm;
+static cl::opt<unsigned> NonGlobalValueMaxNameSize(
+ "non-global-value-max-name-size", cl::Hidden, cl::init(1024),
+ cl::desc("Maximum size for the name of non-global values."));
+
//===----------------------------------------------------------------------===//
// Value Class
//===----------------------------------------------------------------------===//
@@ -244,6 +248,11 @@ void Value::setNameImpl(const Twine &NewName) {
if (getName() == NameRef)
return;
+ // Cap the size of non-GlobalValue names.
+ if (NameRef.size() > NonGlobalValueMaxNameSize && !isa<GlobalValue>(this))
+ NameRef =
+ NameRef.substr(0, std::max(1u, (unsigned)NonGlobalValueMaxNameSize));
+
assert(!getType()->isVoidTy() && "Cannot assign a name to void values!");
// Get the symbol table to update for this object.
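
The hunk above adds a hidden -non-global-value-max-name-size option (default 1024) and truncates over-long names of non-GlobalValue values before they reach the symbol table. As a rough standalone illustration of that truncation rule only — the function and parameter names below are invented and nothing from LLVM's own classes is used:

    #include <algorithm>
    #include <string>

    // Illustrative sketch of the capping rule from setNameImpl above.
    // MaxNameSize stands in for -non-global-value-max-name-size.
    std::string capLocalValueName(std::string Name, bool IsGlobalValue,
                                  unsigned MaxNameSize = 1024) {
      // Global values keep their full name; only local values are capped,
      // and the result is never shorter than one character.
      if (!IsGlobalValue && Name.size() > MaxNameSize)
        Name.resize(std::max(1u, MaxNameSize));
      return Name;
    }
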
@@ -456,41 +465,12 @@ void Value::replaceUsesOutsideBlock(Value *New, BasicBlock *BB) {
}
}
-void Value::replaceUsesExceptBlockAddr(Value *New) {
- SmallSetVector<Constant *, 4> Constants;
- use_iterator UI = use_begin(), E = use_end();
- for (; UI != E;) {
- Use &U = *UI;
- ++UI;
-
- if (isa<BlockAddress>(U.getUser()))
- continue;
-
- // Must handle Constants specially, we cannot call replaceUsesOfWith on a
- // constant because they are uniqued.
- if (auto *C = dyn_cast<Constant>(U.getUser())) {
- if (!isa<GlobalValue>(C)) {
- // Save unique users to avoid processing operand replacement
- // more than once.
- Constants.insert(C);
- continue;
- }
- }
-
- U.set(New);
- }
-
- // Process operand replacement of saved constants.
- for (auto *C : Constants)
- C->handleOperandChange(this, New);
-}
-
namespace {
// Various metrics for how much to strip off of pointers.
enum PointerStripKind {
PSK_ZeroIndices,
PSK_ZeroIndicesAndAliases,
- PSK_ZeroIndicesAndAliasesAndBarriers,
+ PSK_ZeroIndicesAndAliasesAndInvariantGroups,
PSK_InBoundsConstantIndices,
PSK_InBounds
};
@@ -509,7 +489,7 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) {
if (auto *GEP = dyn_cast<GEPOperator>(V)) {
switch (StripKind) {
case PSK_ZeroIndicesAndAliases:
- case PSK_ZeroIndicesAndAliasesAndBarriers:
+ case PSK_ZeroIndicesAndAliasesAndInvariantGroups:
case PSK_ZeroIndices:
if (!GEP->hasAllZeroIndices())
return V;
@@ -537,11 +517,12 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) {
V = RV;
continue;
}
- // The result of invariant.group.barrier must alias it's argument,
+ // The result of launder.invariant.group must alias its argument,
// but it can't be marked with returned attribute, that's why it needs
// special case.
- if (StripKind == PSK_ZeroIndicesAndAliasesAndBarriers &&
- CS.getIntrinsicID() == Intrinsic::invariant_group_barrier) {
+ if (StripKind == PSK_ZeroIndicesAndAliasesAndInvariantGroups &&
+ (CS.getIntrinsicID() == Intrinsic::launder_invariant_group ||
+ CS.getIntrinsicID() == Intrinsic::strip_invariant_group)) {
V = CS.getArgOperand(0);
continue;
}
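
The comment above explains why llvm.launder.invariant.group and llvm.strip.invariant.group need a special case: their result aliases their argument but cannot carry the returned attribute, so the stripping loop has to look through the call manually. A simplified, LLVM-free sketch of that kind of look-through loop, with a hypothetical Node type and kind names standing in for Value and the intrinsic IDs:

    // Hypothetical node standing in for a Value in a def-use chain.
    struct Node {
      enum Kind { Plain, Bitcast, LaunderInvariantGroup, StripInvariantGroup } K;
      const Node *Operand; // what the cast or intrinsic was applied to
    };

    // Walk through casts; only look through the invariant.group intrinsics
    // when asked (mirrors PSK_ZeroIndicesAndAliasesAndInvariantGroups).
    const Node *stripCasts(const Node *V, bool LookThroughInvariantGroups) {
      while (V && V->Operand) {
        if (V->K == Node::Bitcast) {
          V = V->Operand;
          continue;
        }
        if (LookThroughInvariantGroups &&
            (V->K == Node::LaunderInvariantGroup ||
             V->K == Node::StripInvariantGroup)) {
          V = V->Operand; // the result must alias argument 0
          continue;
        }
        break;
      }
      return V;
    }
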
@@ -567,8 +548,8 @@ const Value *Value::stripInBoundsConstantOffsets() const {
return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
}
-const Value *Value::stripPointerCastsAndBarriers() const {
- return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndAliasesAndBarriers>(
+const Value *Value::stripPointerCastsAndInvariantGroups() const {
+ return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndAliasesAndInvariantGroups>(
this);
}
@@ -578,9 +559,9 @@ Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
if (!getType()->isPointerTy())
return this;
- assert(Offset.getBitWidth() == DL.getPointerSizeInBits(cast<PointerType>(
+ assert(Offset.getBitWidth() == DL.getIndexSizeInBits(cast<PointerType>(
getType())->getAddressSpace()) &&
- "The offset must have exactly as many bits as our pointer.");
+ "The offset bit width does not match the DL specification.");
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
@@ -676,6 +657,10 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const {
unsigned Align = 0;
if (auto *GO = dyn_cast<GlobalObject>(this)) {
+ // Don't make any assumptions about function pointer alignment. Some
+ // targets use the LSBs to store additional information.
+ if (isa<Function>(GO))
+ return 0;
Align = GO->getAlignment();
if (Align == 0) {
if (auto *GVar = dyn_cast<GlobalVariable>(GO)) {
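
The early return added above refuses to report an alignment for Functions because some targets keep extra state in the low bits of a code address (the ARM Thumb bit is the usual example), so callers must not assume those bits are padding. A small standalone sketch of why a tagged low bit breaks alignment assumptions; the addresses and the flag layout are made up for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Pretend a target encodes an ISA/mode flag in bit 0 of a code address.
      std::uintptr_t RawAddr = 0x1000;      // 16-byte aligned code address
      std::uintptr_t Tagged  = RawAddr | 1; // low bit used as a mode flag

      // Treating the tagged pointer as "aligned" and masking it away would
      // lose the flag, which is why getPointerAlignment now returns 0 for
      // function pointers.
      std::printf("looks aligned=%d flag=%d\n",
                  int((Tagged & 0xF) == 0), int(Tagged & 1));
    }
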
diff --git a/contrib/llvm/lib/IR/ValueSymbolTable.cpp b/contrib/llvm/lib/IR/ValueSymbolTable.cpp
index 0da1990c3a3f..0a7f2803cd4c 100644
--- a/contrib/llvm/lib/IR/ValueSymbolTable.cpp
+++ b/contrib/llvm/lib/IR/ValueSymbolTable.cpp
@@ -14,6 +14,7 @@
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -74,7 +75,8 @@ void ValueSymbolTable::reinsertValue(Value* V) {
// Try inserting the name, assuming it won't conflict.
if (vmap.insert(V->getValueName())) {
- //DEBUG(dbgs() << " Inserted value: " << V->getValueName() << ": " << *V << "\n");
+ // LLVM_DEBUG(dbgs() << " Inserted value: " << V->getValueName() << ": " <<
+ // *V << "\n");
return;
}
@@ -89,7 +91,7 @@ void ValueSymbolTable::reinsertValue(Value* V) {
}
void ValueSymbolTable::removeValueName(ValueName *V) {
- //DEBUG(dbgs() << " Removing Value: " << V->getKeyData() << "\n");
+ // LLVM_DEBUG(dbgs() << " Removing Value: " << V->getKeyData() << "\n");
// Remove the value from the symbol table.
vmap.remove(V);
}
@@ -101,7 +103,7 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) {
// In the common case, the name is not already in the symbol table.
auto IterBool = vmap.insert(std::make_pair(Name, V));
if (IterBool.second) {
- //DEBUG(dbgs() << " Inserted value: " << Entry.getKeyData() << ": "
+ // LLVM_DEBUG(dbgs() << " Inserted value: " << Entry.getKeyData() << ": "
// << *V << "\n");
return &*IterBool.first;
}
diff --git a/contrib/llvm/lib/IR/Verifier.cpp b/contrib/llvm/lib/IR/Verifier.cpp
index 1754f7d45011..e5231bb78a36 100644
--- a/contrib/llvm/lib/IR/Verifier.cpp
+++ b/contrib/llvm/lib/IR/Verifier.cpp
@@ -55,6 +55,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -206,7 +207,7 @@ private:
template <typename... Ts> void WriteTs() {}
public:
- /// \brief A check failed, so printout out the condition and the message.
+ /// A check failed, so print out the condition and the message.
///
/// This provides a nice place to put a breakpoint if you want to see why
/// something is not correct.
@@ -216,7 +217,7 @@ public:
Broken = true;
}
- /// \brief A check failed (with values to print).
+ /// A check failed (with values to print).
///
/// This calls the Message-only version so that the above is easier to set a
/// breakpoint on.
@@ -254,14 +255,14 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
DominatorTree DT;
- /// \brief When verifying a basic block, keep track of all of the
+ /// When verifying a basic block, keep track of all of the
/// instructions we have seen so far.
///
/// This allows us to do efficient dominance checks for the case when an
/// instruction has an operand that is an instruction in the same block.
SmallPtrSet<Instruction *, 16> InstsInThisBlock;
- /// \brief Keep track of the metadata nodes that have been checked already.
+ /// Keep track of the metadata nodes that have been checked already.
SmallPtrSet<const Metadata *, 32> MDNodes;
/// Keep track which DISubprogram is attached to which function.
@@ -270,10 +271,10 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// Track all DICompileUnits visited.
SmallPtrSet<const Metadata *, 2> CUVisited;
- /// \brief The result type for a landingpad.
+ /// The result type for a landingpad.
Type *LandingPadResultTy;
- /// \brief Whether we've seen a call to @llvm.localescape in this function
+ /// Whether we've seen a call to @llvm.localescape in this function
/// already.
bool SawFrameEscape;
@@ -408,6 +409,7 @@ private:
void visitModuleFlag(const MDNode *Op,
DenseMap<const MDString *, const MDNode *> &SeenIDs,
SmallVectorImpl<const MDNode *> &Requirements);
+ void visitModuleFlagCGProfileEntry(const MDOperand &MDO);
void visitFunction(const Function &F);
void visitBasicBlock(BasicBlock &BB);
void visitRangeMetadata(Instruction &I, MDNode *Range, Type *Ty);
@@ -466,6 +468,7 @@ private:
void visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS);
void visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI);
void visitDbgIntrinsic(StringRef Kind, DbgInfoIntrinsic &DII);
+ void visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI);
void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI);
void visitAtomicRMWInst(AtomicRMWInst &RMWI);
void visitFenceInst(FenceInst &FI);
@@ -565,10 +568,24 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
if (GV.isDeclarationForLinker())
Assert(!GV.hasComdat(), "Declaration may not be in a Comdat!", &GV);
- if (GV.hasDLLImportStorageClass())
+ if (GV.hasDLLImportStorageClass()) {
Assert(!GV.isDSOLocal(),
"GlobalValue with DLLImport Storage is dso_local!", &GV);
+ Assert((GV.isDeclaration() && GV.hasExternalLinkage()) ||
+ GV.hasAvailableExternallyLinkage(),
+ "Global is marked as dllimport, but not external", &GV);
+ }
+
+ if (GV.hasLocalLinkage())
+ Assert(GV.isDSOLocal(),
+ "GlobalValue with private or internal linkage must be dso_local!",
+ &GV);
+
+ if (!GV.hasDefaultVisibility() && !GV.hasExternalWeakLinkage())
+ Assert(GV.isDSOLocal(),
+ "GlobalValue with non default visibility must be dso_local!", &GV);
+
forEachUser(&GV, GlobalValueVisited, [&](const Value *V) -> bool {
if (const Instruction *I = dyn_cast<Instruction>(V)) {
if (!I->getParent() || !I->getParent()->getParent())
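
The verifier hunk above tightens the dso_local rules: a dllimport global may not be dso_local and must be external, a global with private or internal linkage must be dso_local, and a global with non-default visibility (unless extern_weak) must also be dso_local. A small self-contained predicate capturing those three rules; the struct and its boolean fields are invented stand-ins, not the GlobalValue API:

    // Illustrative check only; field names are hypothetical.
    struct GVProps {
      bool DLLImport;
      bool DSOLocal;
      bool LocalLinkage;       // private or internal
      bool DefaultVisibility;
      bool ExternWeak;
    };

    bool satisfiesDSOLocalRules(const GVProps &GV) {
      if (GV.DLLImport && GV.DSOLocal)
        return false; // dllimport storage is never dso_local
      if (GV.LocalLinkage && !GV.DSOLocal)
        return false; // private/internal linkage implies dso_local
      if (!GV.DefaultVisibility && !GV.ExternWeak && !GV.DSOLocal)
        return false; // hidden/protected visibility implies dso_local
      return true;
    }
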
@@ -655,11 +672,6 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
}
}
- Assert(!GV.hasDLLImportStorageClass() ||
- (GV.isDeclaration() && GV.hasExternalLinkage()) ||
- GV.hasAvailableExternallyLinkage(),
- "Global is marked as dllimport, but not external", &GV);
-
// Visit any debug info attachments.
SmallVector<MDNode *, 1> MDs;
GV.getMetadata(LLVMContext::MD_dbg, MDs);
@@ -858,7 +870,12 @@ void Verifier::visitDIScope(const DIScope &N) {
void Verifier::visitDISubrange(const DISubrange &N) {
AssertDI(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N);
- AssertDI(N.getCount() >= -1, "invalid subrange count", &N);
+ auto Count = N.getCount();
+ AssertDI(Count, "Count must either be a signed constant or a DIVariable",
+ &N);
+ AssertDI(!Count.is<ConstantInt*>() ||
+ Count.get<ConstantInt*>()->getSExtValue() >= -1,
+ "invalid subrange count", &N);
}
void Verifier::visitDIEnumerator(const DIEnumerator &N) {
@@ -905,9 +922,12 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
}
}
+/// Detect mutually exclusive flags.
static bool hasConflictingReferenceFlags(unsigned Flags) {
- return (Flags & DINode::FlagLValueReference) &&
- (Flags & DINode::FlagRValueReference);
+ return ((Flags & DINode::FlagLValueReference) &&
+ (Flags & DINode::FlagRValueReference)) ||
+ ((Flags & DINode::FlagTypePassByValue) &&
+ (Flags & DINode::FlagTypePassByReference));
}
void Verifier::visitTemplateParams(const MDNode &N, const Metadata &RawParams) {
@@ -927,7 +947,8 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
N.getTag() == dwarf::DW_TAG_structure_type ||
N.getTag() == dwarf::DW_TAG_union_type ||
N.getTag() == dwarf::DW_TAG_enumeration_type ||
- N.getTag() == dwarf::DW_TAG_class_type,
+ N.getTag() == dwarf::DW_TAG_class_type ||
+ N.getTag() == dwarf::DW_TAG_variant_part,
"invalid tag", &N);
AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
@@ -940,6 +961,14 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
N.getRawVTableHolder());
AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
"invalid reference flags", &N);
+
+ if (N.isVector()) {
+ const DINodeArray Elements = N.getElements();
+ AssertDI(Elements.size() == 1 &&
+ Elements[0]->getTag() == dwarf::DW_TAG_subrange_type,
+ "invalid vector, expected one element of type subrange", &N);
+ }
+
if (auto *Params = N.getRawTemplateParams())
visitTemplateParams(N, *Params);
@@ -948,6 +977,11 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
AssertDI(N.getFile() && !N.getFile()->getFilename().empty(),
"class/union requires a filename", &N, N.getFile());
}
+
+ if (auto *D = N.getRawDiscriminator()) {
+ AssertDI(isa<DIDerivedType>(D) && N.getTag() == dwarf::DW_TAG_variant_part,
+ "discriminator can only appear on variant part");
+ }
}
void Verifier::visitDISubroutineType(const DISubroutineType &N) {
@@ -964,8 +998,23 @@ void Verifier::visitDISubroutineType(const DISubroutineType &N) {
void Verifier::visitDIFile(const DIFile &N) {
AssertDI(N.getTag() == dwarf::DW_TAG_file_type, "invalid tag", &N);
- AssertDI((N.getChecksumKind() != DIFile::CSK_None ||
- N.getChecksum().empty()), "invalid checksum kind", &N);
+ Optional<DIFile::ChecksumInfo<StringRef>> Checksum = N.getChecksum();
+ if (Checksum) {
+ AssertDI(Checksum->Kind <= DIFile::ChecksumKind::CSK_Last,
+ "invalid checksum kind", &N);
+ size_t Size;
+ switch (Checksum->Kind) {
+ case DIFile::CSK_MD5:
+ Size = 32;
+ break;
+ case DIFile::CSK_SHA1:
+ Size = 40;
+ break;
+ }
+ AssertDI(Checksum->Value.size() == Size, "invalid checksum length", &N);
+ AssertDI(Checksum->Value.find_if_not(llvm::isHexDigit) == StringRef::npos,
+ "invalid checksum", &N);
+ }
}
void Verifier::visitDICompileUnit(const DICompileUnit &N) {
@@ -1038,12 +1087,13 @@ void Verifier::visitDISubprogram(const DISubprogram &N) {
if (auto *S = N.getRawDeclaration())
AssertDI(isa<DISubprogram>(S) && !cast<DISubprogram>(S)->isDefinition(),
"invalid subprogram declaration", &N, S);
- if (auto *RawVars = N.getRawVariables()) {
- auto *Vars = dyn_cast<MDTuple>(RawVars);
- AssertDI(Vars, "invalid variable list", &N, RawVars);
- for (Metadata *Op : Vars->operands()) {
- AssertDI(Op && isa<DILocalVariable>(Op), "invalid local variable", &N,
- Vars, Op);
+ if (auto *RawNode = N.getRawRetainedNodes()) {
+ auto *Node = dyn_cast<MDTuple>(RawNode);
+ AssertDI(Node, "invalid retained nodes list", &N, RawNode);
+ for (Metadata *Op : Node->operands()) {
+ AssertDI(Op && (isa<DILocalVariable>(Op) || isa<DILabel>(Op)),
+ "invalid retained nodes, expected DILocalVariable or DILabel",
+ &N, Node, Op);
}
}
AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
@@ -1175,6 +1225,17 @@ void Verifier::visitDILocalVariable(const DILocalVariable &N) {
"local variable requires a valid scope", &N, N.getRawScope());
}
+void Verifier::visitDILabel(const DILabel &N) {
+ if (auto *S = N.getRawScope())
+ AssertDI(isa<DIScope>(S), "invalid scope", &N, S);
+ if (auto *F = N.getRawFile())
+ AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+
+ AssertDI(N.getTag() == dwarf::DW_TAG_label, "invalid tag", &N);
+ AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
+ "label requires a valid scope", &N, N.getRawScope());
+}
+
void Verifier::visitDIExpression(const DIExpression &N) {
AssertDI(N.isValid(), "invalid expression", &N);
}
@@ -1351,12 +1412,35 @@ Verifier::visitModuleFlag(const MDNode *Op,
Assert(M.getNamedMetadata("llvm.linker.options"),
"'Linker Options' named metadata no longer supported");
}
+
+ if (ID->getString() == "CG Profile") {
+ for (const MDOperand &MDO : cast<MDNode>(Op->getOperand(2))->operands())
+ visitModuleFlagCGProfileEntry(MDO);
+ }
+}
+
+void Verifier::visitModuleFlagCGProfileEntry(const MDOperand &MDO) {
+ auto CheckFunction = [&](const MDOperand &FuncMDO) {
+ if (!FuncMDO)
+ return;
+ auto F = dyn_cast<ValueAsMetadata>(FuncMDO);
+ Assert(F && isa<Function>(F->getValue()), "expected a Function or null",
+ FuncMDO);
+ };
+ auto Node = dyn_cast_or_null<MDNode>(MDO);
+ Assert(Node && Node->getNumOperands() == 3, "expected a MDNode triple", MDO);
+ CheckFunction(Node->getOperand(0));
+ CheckFunction(Node->getOperand(1));
+ auto Count = dyn_cast_or_null<ConstantAsMetadata>(Node->getOperand(2));
+ Assert(Count && Count->getType()->isIntegerTy(),
+ "expected an integer constant", Node->getOperand(2));
}
/// Return true if this attribute kind only applies to functions.
static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
switch (Kind) {
case Attribute::NoReturn:
+ case Attribute::NoCfCheck:
case Attribute::NoUnwind:
case Attribute::NoInline:
case Attribute::AlwaysInline:
@@ -1365,6 +1449,7 @@ static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
case Attribute::StackProtectReq:
case Attribute::StackProtectStrong:
case Attribute::SafeStack:
+ case Attribute::ShadowCallStack:
case Attribute::NoRedZone:
case Attribute::NoImplicitFloat:
case Attribute::Naked:
@@ -1382,6 +1467,7 @@ static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
case Attribute::Builtin:
case Attribute::NoBuiltin:
case Attribute::Cold:
+ case Attribute::OptForFuzzing:
case Attribute::OptimizeNone:
case Attribute::JumpTable:
case Attribute::Convergent:
@@ -1692,8 +1778,11 @@ void Verifier::verifyFunctionMetadata(
"expected string with name of the !prof annotation", MD);
MDString *MDS = cast<MDString>(MD->getOperand(0));
StringRef ProfName = MDS->getString();
- Assert(ProfName.equals("function_entry_count"),
- "first operand should be 'function_entry_count'", MD);
+ Assert(ProfName.equals("function_entry_count") ||
+ ProfName.equals("synthetic_function_entry_count"),
+ "first operand should be 'function_entry_count'"
+ " or 'synthetic_function_entry_count'",
+ MD);
// Check second operand.
Assert(MD->getOperand(1) != nullptr, "second operand should not be null",
@@ -2151,11 +2240,6 @@ void Verifier::visitFunction(const Function &F) {
Assert(false, "Invalid user of intrinsic instruction!", U);
}
- Assert(!F.hasDLLImportStorageClass() ||
- (F.isDeclaration() && F.hasExternalLinkage()) ||
- F.hasAvailableExternallyLinkage(),
- "Function is marked as dllimport, but not external.", &F);
-
auto *N = F.getSubprogram();
HasDebugInfo = (N != nullptr);
if (!HasDebugInfo)
@@ -2209,7 +2293,7 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
if (isa<PHINode>(BB.front())) {
SmallVector<BasicBlock*, 8> Preds(pred_begin(&BB), pred_end(&BB));
SmallVector<std::pair<BasicBlock*, Value*>, 8> Values;
- std::sort(Preds.begin(), Preds.end());
+ llvm::sort(Preds.begin(), Preds.end());
for (const PHINode &PN : BB.phis()) {
// Ensure that PHI nodes have at least one entry!
Assert(PN.getNumIncomingValues() != 0,
@@ -2227,7 +2311,7 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
Values.push_back(
std::make_pair(PN.getIncomingBlock(i), PN.getIncomingValue(i)));
- std::sort(Values.begin(), Values.end());
+ llvm::sort(Values.begin(), Values.end());
for (unsigned i = 0, e = Values.size(); i != e; ++i) {
// Check to make sure that if there is more than one entry for a
@@ -2819,17 +2903,20 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
Function *F = CI.getParent()->getParent();
FunctionType *CallerTy = F->getFunctionType();
FunctionType *CalleeTy = CI.getFunctionType();
- Assert(CallerTy->getNumParams() == CalleeTy->getNumParams(),
- "cannot guarantee tail call due to mismatched parameter counts", &CI);
+ if (!CI.getCalledFunction() || !CI.getCalledFunction()->isIntrinsic()) {
+ Assert(CallerTy->getNumParams() == CalleeTy->getNumParams(),
+ "cannot guarantee tail call due to mismatched parameter counts",
+ &CI);
+ for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
+ Assert(
+ isTypeCongruent(CallerTy->getParamType(I), CalleeTy->getParamType(I)),
+ "cannot guarantee tail call due to mismatched parameter types", &CI);
+ }
+ }
Assert(CallerTy->isVarArg() == CalleeTy->isVarArg(),
"cannot guarantee tail call due to mismatched varargs", &CI);
Assert(isTypeCongruent(CallerTy->getReturnType(), CalleeTy->getReturnType()),
"cannot guarantee tail call due to mismatched return types", &CI);
- for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
- Assert(
- isTypeCongruent(CallerTy->getParamType(I), CalleeTy->getParamType(I)),
- "cannot guarantee tail call due to mismatched parameter types", &CI);
- }
// - The calling conventions of the caller and callee must match.
Assert(F->getCallingConv() == CI.getCallingConv(),
@@ -2865,7 +2952,7 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
// Check the return.
ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next);
- Assert(Ret, "musttail call must be precede a ret with an optional bitcast",
+ Assert(Ret, "musttail call must precede a ret with an optional bitcast",
&CI);
Assert(!Ret->getReturnValue() || Ret->getReturnValue() == RetVal,
"musttail call result must be returned", Ret);
@@ -3119,8 +3206,7 @@ void Verifier::visitLoadInst(LoadInst &LI) {
"Load cannot have Release ordering", &LI);
Assert(LI.getAlignment() != 0,
"Atomic load must specify explicit alignment", &LI);
- Assert(ElTy->isIntegerTy() || ElTy->isPointerTy() ||
- ElTy->isFloatingPointTy(),
+ Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
"atomic load operand must have integer, pointer, or floating point "
"type!",
ElTy, &LI);
@@ -3148,8 +3234,7 @@ void Verifier::visitStoreInst(StoreInst &SI) {
"Store cannot have Acquire ordering", &SI);
Assert(SI.getAlignment() != 0,
"Atomic store must specify explicit alignment", &SI);
- Assert(ElTy->isIntegerTy() || ElTy->isPointerTy() ||
- ElTy->isFloatingPointTy(),
+ Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
"atomic store operand must have integer, pointer, or floating point "
"type!",
ElTy, &SI);
@@ -3240,9 +3325,8 @@ void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
PointerType *PTy = dyn_cast<PointerType>(CXI.getOperand(0)->getType());
Assert(PTy, "First cmpxchg operand must be a pointer.", &CXI);
Type *ElTy = PTy->getElementType();
- Assert(ElTy->isIntegerTy() || ElTy->isPointerTy(),
- "cmpxchg operand must have integer or pointer type",
- ElTy, &CXI);
+ Assert(ElTy->isIntOrPtrTy(),
+ "cmpxchg operand must have integer or pointer type", ElTy, &CXI);
checkAtomicMemAccessSize(ElTy, &CXI);
Assert(ElTy == CXI.getOperand(1)->getType(),
"Expected value type does not match pointer operand type!", &CXI,
@@ -4014,96 +4098,36 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
case Intrinsic::dbg_value: // llvm.dbg.value
visitDbgIntrinsic("value", cast<DbgInfoIntrinsic>(*CS.getInstruction()));
break;
+ case Intrinsic::dbg_label: // llvm.dbg.label
+ visitDbgLabelIntrinsic("label", cast<DbgLabelInst>(*CS.getInstruction()));
+ break;
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset: {
- ConstantInt *AlignCI = dyn_cast<ConstantInt>(CS.getArgOperand(3));
- Assert(AlignCI,
- "alignment argument of memory intrinsics must be a constant int",
- CS);
- const APInt &AlignVal = AlignCI->getValue();
- Assert(AlignCI->isZero() || AlignVal.isPowerOf2(),
- "alignment argument of memory intrinsics must be a power of 2", CS);
- Assert(isa<ConstantInt>(CS.getArgOperand(4)),
- "isvolatile argument of memory intrinsics must be a constant int",
- CS);
- break;
- }
- case Intrinsic::memcpy_element_unordered_atomic: {
- const AtomicMemCpyInst *MI = cast<AtomicMemCpyInst>(CS.getInstruction());
-
- ConstantInt *ElementSizeCI =
- dyn_cast<ConstantInt>(MI->getRawElementSizeInBytes());
- Assert(ElementSizeCI,
- "element size of the element-wise unordered atomic memory "
- "intrinsic must be a constant int",
- CS);
- const APInt &ElementSizeVal = ElementSizeCI->getValue();
- Assert(ElementSizeVal.isPowerOf2(),
- "element size of the element-wise atomic memory intrinsic "
- "must be a power of 2",
- CS);
-
- if (auto *LengthCI = dyn_cast<ConstantInt>(MI->getLength())) {
- uint64_t Length = LengthCI->getZExtValue();
- uint64_t ElementSize = MI->getElementSizeInBytes();
- Assert((Length % ElementSize) == 0,
- "constant length must be a multiple of the element size in the "
- "element-wise atomic memory intrinsic",
- CS);
- }
-
- auto IsValidAlignment = [&](uint64_t Alignment) {
- return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
+ const auto *MI = cast<MemIntrinsic>(CS.getInstruction());
+ auto IsValidAlignment = [&](unsigned Alignment) -> bool {
+ return Alignment == 0 || isPowerOf2_32(Alignment);
};
- uint64_t DstAlignment = CS.getParamAlignment(0),
- SrcAlignment = CS.getParamAlignment(1);
- Assert(IsValidAlignment(DstAlignment),
- "incorrect alignment of the destination argument", CS);
- Assert(IsValidAlignment(SrcAlignment),
- "incorrect alignment of the source argument", CS);
- break;
- }
- case Intrinsic::memmove_element_unordered_atomic: {
- auto *MI = cast<AtomicMemMoveInst>(CS.getInstruction());
-
- ConstantInt *ElementSizeCI =
- dyn_cast<ConstantInt>(MI->getRawElementSizeInBytes());
- Assert(ElementSizeCI,
- "element size of the element-wise unordered atomic memory "
- "intrinsic must be a constant int",
- CS);
- const APInt &ElementSizeVal = ElementSizeCI->getValue();
- Assert(ElementSizeVal.isPowerOf2(),
- "element size of the element-wise atomic memory intrinsic "
- "must be a power of 2",
+ Assert(IsValidAlignment(MI->getDestAlignment()),
+ "alignment of arg 0 of memory intrinsic must be 0 or a power of 2",
CS);
-
- if (auto *LengthCI = dyn_cast<ConstantInt>(MI->getLength())) {
- uint64_t Length = LengthCI->getZExtValue();
- uint64_t ElementSize = MI->getElementSizeInBytes();
- Assert((Length % ElementSize) == 0,
- "constant length must be a multiple of the element size in the "
- "element-wise atomic memory intrinsic",
+ if (const auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Assert(IsValidAlignment(MTI->getSourceAlignment()),
+ "alignment of arg 1 of memory intrinsic must be 0 or a power of 2",
CS);
}
-
- auto IsValidAlignment = [&](uint64_t Alignment) {
- return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
- };
- uint64_t DstAlignment = CS.getParamAlignment(0),
- SrcAlignment = CS.getParamAlignment(1);
- Assert(IsValidAlignment(DstAlignment),
- "incorrect alignment of the destination argument", CS);
- Assert(IsValidAlignment(SrcAlignment),
- "incorrect alignment of the source argument", CS);
+ Assert(isa<ConstantInt>(CS.getArgOperand(3)),
+ "isvolatile argument of memory intrinsics must be a constant int",
+ CS);
break;
}
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
case Intrinsic::memset_element_unordered_atomic: {
- auto *MI = cast<AtomicMemSetInst>(CS.getInstruction());
+ const auto *AMI = cast<AtomicMemIntrinsic>(CS.getInstruction());
ConstantInt *ElementSizeCI =
- dyn_cast<ConstantInt>(MI->getRawElementSizeInBytes());
+ dyn_cast<ConstantInt>(AMI->getRawElementSizeInBytes());
Assert(ElementSizeCI,
"element size of the element-wise unordered atomic memory "
"intrinsic must be a constant int",
@@ -4114,9 +4138,9 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"must be a power of 2",
CS);
- if (auto *LengthCI = dyn_cast<ConstantInt>(MI->getLength())) {
+ if (auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength())) {
uint64_t Length = LengthCI->getZExtValue();
- uint64_t ElementSize = MI->getElementSizeInBytes();
+ uint64_t ElementSize = AMI->getElementSizeInBytes();
Assert((Length % ElementSize) == 0,
"constant length must be a multiple of the element size in the "
"element-wise atomic memory intrinsic",
@@ -4126,9 +4150,14 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
auto IsValidAlignment = [&](uint64_t Alignment) {
return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
};
- uint64_t DstAlignment = CS.getParamAlignment(0);
+ uint64_t DstAlignment = AMI->getDestAlignment();
Assert(IsValidAlignment(DstAlignment),
"incorrect alignment of the destination argument", CS);
+ if (const auto *AMT = dyn_cast<AtomicMemTransferInst>(AMI)) {
+ uint64_t SrcAlignment = AMT->getSourceAlignment();
+ Assert(IsValidAlignment(SrcAlignment),
+ "incorrect alignment of the source argument", CS);
+ }
break;
}
case Intrinsic::gcroot:
@@ -4428,7 +4457,7 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
};
}
-/// \brief Carefully grab the subprogram from a local scope.
+/// Carefully grab the subprogram from a local scope.
///
/// This carefully grabs the subprogram from a local scope, avoiding the
/// built-in assertions that would typically fire.
@@ -4485,8 +4514,8 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgInfoIntrinsic &DII) {
// The scopes for variables and !dbg attachments must agree.
DILocalVariable *Var = DII.getVariable();
DILocation *Loc = DII.getDebugLoc();
- Assert(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment",
- &DII, BB, F);
+ AssertDI(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment",
+ &DII, BB, F);
DISubprogram *VarSP = getSubprogram(Var->getRawScope());
DISubprogram *LocSP = getSubprogram(Loc->getRawScope());
@@ -4501,7 +4530,40 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgInfoIntrinsic &DII) {
verifyFnArgs(DII);
}
+void Verifier::visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI) {
+ AssertDI(isa<DILabel>(DLI.getRawVariable()),
+ "invalid llvm.dbg." + Kind + " intrinsic variable", &DLI,
+ DLI.getRawVariable());
+
+ // Ignore broken !dbg attachments; they're checked elsewhere.
+ if (MDNode *N = DLI.getDebugLoc().getAsMDNode())
+ if (!isa<DILocation>(N))
+ return;
+
+ BasicBlock *BB = DLI.getParent();
+ Function *F = BB ? BB->getParent() : nullptr;
+
+ // The scopes for variables and !dbg attachments must agree.
+ DILabel *Label = DLI.getLabel();
+ DILocation *Loc = DLI.getDebugLoc();
+ Assert(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment",
+ &DLI, BB, F);
+
+ DISubprogram *LabelSP = getSubprogram(Label->getRawScope());
+ DISubprogram *LocSP = getSubprogram(Loc->getRawScope());
+ if (!LabelSP || !LocSP)
+ return;
+
+ AssertDI(LabelSP == LocSP, "mismatched subprogram between llvm.dbg." + Kind +
+ " label and !dbg attachment",
+ &DLI, BB, F, Label, Label->getScope()->getSubprogram(), Loc,
+ Loc->getScope()->getSubprogram());
+}
+
void Verifier::verifyFragmentExpression(const DbgInfoIntrinsic &I) {
+ if (dyn_cast<DbgLabelInst>(&I))
+ return;
+
DILocalVariable *V = dyn_cast_or_null<DILocalVariable>(I.getRawVariable());
DIExpression *E = dyn_cast_or_null<DIExpression>(I.getRawExpression());
diff --git a/contrib/llvm/lib/IRReader/IRReader.cpp b/contrib/llvm/lib/IRReader/IRReader.cpp
index 999f11deb15a..36bbf719bb61 100644
--- a/contrib/llvm/lib/IRReader/IRReader.cpp
+++ b/contrib/llvm/lib/IRReader/IRReader.cpp
@@ -68,7 +68,8 @@ std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename,
std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
LLVMContext &Context,
- bool UpgradeDebugInfo) {
+ bool UpgradeDebugInfo,
+ StringRef DataLayoutString) {
NamedRegionTimer T(TimeIRParsingName, TimeIRParsingDescription,
TimeIRParsingGroupName, TimeIRParsingGroupDescription,
TimePassesIsEnabled);
@@ -83,15 +84,19 @@ std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
});
return nullptr;
}
+ if (!DataLayoutString.empty())
+ ModuleOrErr.get()->setDataLayout(DataLayoutString);
return std::move(ModuleOrErr.get());
}
- return parseAssembly(Buffer, Err, Context, nullptr, UpgradeDebugInfo);
+ return parseAssembly(Buffer, Err, Context, nullptr, UpgradeDebugInfo,
+ DataLayoutString);
}
std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
LLVMContext &Context,
- bool UpgradeDebugInfo) {
+ bool UpgradeDebugInfo,
+ StringRef DataLayoutString) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = FileOrErr.getError()) {
@@ -101,7 +106,7 @@ std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
}
return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context,
- UpgradeDebugInfo);
+ UpgradeDebugInfo, DataLayoutString);
}
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/LTO/Caching.cpp b/contrib/llvm/lib/LTO/Caching.cpp
index dd47eb584b7f..089e77e742eb 100644
--- a/contrib/llvm/lib/LTO/Caching.cpp
+++ b/contrib/llvm/lib/LTO/Caching.cpp
@@ -19,6 +19,12 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/raw_ostream.h"
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
using namespace llvm;
using namespace llvm::lto;
@@ -33,16 +39,32 @@ Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
SmallString<64> EntryPath;
sys::path::append(EntryPath, CacheDirectoryPath, "llvmcache-" + Key);
// First, see if we have a cache hit.
- ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
- MemoryBuffer::getFile(EntryPath);
- if (MBOrErr) {
- AddBuffer(Task, std::move(*MBOrErr), EntryPath);
- return AddStreamFn();
+ int FD;
+ SmallString<64> ResultPath;
+ std::error_code EC = sys::fs::openFileForRead(
+ Twine(EntryPath), FD, sys::fs::OF_UpdateAtime, &ResultPath);
+ if (!EC) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getOpenFile(FD, EntryPath,
+ /*FileSize*/ -1,
+ /*RequiresNullTerminator*/ false);
+ close(FD);
+ if (MBOrErr) {
+ AddBuffer(Task, std::move(*MBOrErr));
+ return AddStreamFn();
+ }
+ EC = MBOrErr.getError();
}
- if (MBOrErr.getError() != errc::no_such_file_or_directory)
+ // On Windows we can fail to open a cache file with a permission denied
+ // error. This generally means that another process has requested to delete
+ // the file while it is still open, but it could also mean that another
+ // process has opened the file without the sharing permissions we need.
+ // Since the file is probably being deleted we handle it in the same way as
+ // if the file did not exist at all.
+ if (EC != errc::no_such_file_or_directory && EC != errc::permission_denied)
report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
- ": " + MBOrErr.getError().message() + "\n");
+ ": " + EC.message() + "\n");
// This native object stream is responsible for committing the resulting
// file to the cache and calling AddBuffer to add it to the link.
@@ -103,7 +125,7 @@ Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
TempFile.TmpName + " to " + EntryPath + ": " +
toString(std::move(E)) + "\n");
- AddBuffer(Task, std::move(*MBOrErr), EntryPath);
+ AddBuffer(Task, std::move(*MBOrErr));
}
};
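
The Caching.cpp change above switches the cache probe from MemoryBuffer::getFile to an explicit openFileForRead (so the entry's access time can be refreshed) and, per the comment, treats a permission-denied error on Windows like a missing file. A rough sketch of that decision flow using only the standard library; readCachedObject and probeCache are invented stand-ins for the real buffer loading and cache lookup:

    #include <cerrno>
    #include <fstream>
    #include <optional>
    #include <sstream>
    #include <string>
    #include <system_error>

    // Hypothetical stand-in for MemoryBuffer::getOpenFile: read the whole
    // file, or report why it could not be opened.
    static std::optional<std::string> readCachedObject(const std::string &Path,
                                                       std::error_code &EC) {
      std::ifstream In(Path, std::ios::binary);
      if (!In) {
        EC = std::error_code(errno, std::generic_category());
        return std::nullopt;
      }
      std::ostringstream SS;
      SS << In.rdbuf();
      return SS.str();
    }

    // Returns true on a cache hit; a missing file or (on Windows) a
    // permission-denied error is treated as a plain cache miss.
    bool probeCache(const std::string &EntryPath, std::string &Bytes) {
      std::error_code EC;
      if (auto Buf = readCachedObject(EntryPath, EC)) {
        Bytes = std::move(*Buf);
        return true;
      }
      if (EC == std::errc::no_such_file_or_directory ||
          EC == std::errc::permission_denied)
        return false; // fall through and produce the object from scratch
      throw std::system_error(EC, "Failed to open cache file " + EntryPath);
    }
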
diff --git a/contrib/llvm/lib/LTO/LTO.cpp b/contrib/llvm/lib/LTO/LTO.cpp
index 64e5186255bd..68d210cb7d73 100644
--- a/contrib/llvm/lib/LTO/LTO.cpp
+++ b/contrib/llvm/lib/LTO/LTO.cpp
@@ -12,11 +12,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/LTO/LTO.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -50,6 +52,10 @@ using namespace object;
#define DEBUG_TYPE "lto"
+static cl::opt<bool>
+ DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
+ cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
+
// The values are (type identifier, summary) pairs.
typedef DenseMap<
GlobalValue::GUID,
@@ -132,6 +138,7 @@ static void computeCacheKey(
AddString(Conf.AAPipeline);
AddString(Conf.OverrideTriple);
AddString(Conf.DefaultTriple);
+ AddString(Conf.DwoDir);
// Include the hash for the current module
auto ModHash = Index.getModuleHash(ModuleID);
@@ -149,7 +156,7 @@ static void computeCacheKey(
AddUint64(Entry.second.size());
for (auto &Fn : Entry.second)
- AddUint64(Fn.first);
+ AddUint64(Fn);
}
// Include the hash for the resolved ODR.
@@ -177,8 +184,11 @@ static void computeCacheKey(
auto AddUsedThings = [&](GlobalValueSummary *GS) {
if (!GS) return;
- for (const ValueInfo &VI : GS->refs())
+ AddUnsigned(GS->isLive());
+ for (const ValueInfo &VI : GS->refs()) {
+ AddUnsigned(VI.isDSOLocal());
AddUsedCfiGlobal(VI.getGUID());
+ }
if (auto *FS = dyn_cast<FunctionSummary>(GS)) {
for (auto &TT : FS->type_tests())
UsedTypeIds.insert(TT);
@@ -190,8 +200,10 @@ static void computeCacheKey(
UsedTypeIds.insert(TT.VFunc.GUID);
for (auto &TT : FS->type_checked_load_const_vcalls())
UsedTypeIds.insert(TT.VFunc.GUID);
- for (auto &ET : FS->calls())
+ for (auto &ET : FS->calls()) {
+ AddUnsigned(ET.first.isDSOLocal());
AddUsedCfiGlobal(ET.first.getGUID());
+ }
}
};
@@ -209,7 +221,7 @@ static void computeCacheKey(
// so we need to collect their used resolutions as well.
for (auto &ImpM : ImportList)
for (auto &ImpF : ImpM.second)
- AddUsedThings(Index.findSummaryInModule(ImpF.first, ImpM.first()));
+ AddUsedThings(Index.findSummaryInModule(ImpF, ImpM.first()));
auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) {
AddString(TId);
@@ -388,7 +400,8 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
Ctx(Conf), CombinedModule(llvm::make_unique<Module>("ld-temp.o", Ctx)),
Mover(llvm::make_unique<IRMover>(*CombinedModule)) {}
-LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) : Backend(Backend) {
+LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
+ : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
if (!Backend)
this->Backend =
createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
@@ -415,11 +428,27 @@ void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
- auto &GlobalRes = GlobalResolutions[Sym.getName()];
+ StringRef Name = Sym.getName();
+ Triple TT(RegularLTO.CombinedModule->getTargetTriple());
+ // Strip the __imp_ prefix from COFF dllimport symbols (similar to the
+ // way they are handled by lld), otherwise we can end up with two
+ // global resolutions (one with the prefix and one for a copy without it).
+ if (TT.isOSBinFormatCOFF() && Name.startswith("__imp_"))
+ Name = Name.substr(strlen("__imp_"));
+ auto &GlobalRes = GlobalResolutions[Name];
GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr();
if (Res.Prevailing) {
- assert(GlobalRes.IRName.empty() &&
+ assert(!GlobalRes.Prevailing &&
"Multiple prevailing defs are not allowed");
+ GlobalRes.Prevailing = true;
+ GlobalRes.IRName = Sym.getIRName();
+ } else if (!GlobalRes.Prevailing && GlobalRes.IRName.empty()) {
+ // Sometimes it can be two copies of symbol in a module and prevailing
+ // symbol can have no IR name. That might happen if symbol is defined in
+ // module level inline asm block. In case we have multiple modules with
+ // the same symbol we want to use IR name of the prevailing symbol.
+ // Otherwise, if we haven't seen a prevailing symbol, set the name so that
+ // we can later use it to check if there is any prevailing copy in IR.
GlobalRes.IRName = Sym.getIRName();
}
@@ -639,7 +668,8 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
}
// Set the 'local' flag based on the linker resolution for this symbol.
- GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit);
+ if (Res.FinalDefinitionInLinkageUnit)
+ GV->setDSOLocal(true);
}
// Common resolution: collect the maximum size/alignment over all commons.
// We also record if we see an instance of a common as prevailing, so that
@@ -744,20 +774,52 @@ unsigned LTO::getMaxTasks() const {
Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
// Compute "dead" symbols, we don't want to import/export these!
DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
+ DenseMap<GlobalValue::GUID, PrevailingType> GUIDPrevailingResolutions;
for (auto &Res : GlobalResolutions) {
- if (Res.second.VisibleOutsideSummary &&
- // IRName will be defined if we have seen the prevailing copy of
- // this value. If not, no need to preserve any ThinLTO copies.
- !Res.second.IRName.empty())
+ // Normally a resolution carries the IR name of the symbol; if it does not,
+ // we can do nothing here. See the comments in the GlobalResolution struct.
+ if (Res.second.IRName.empty())
+ continue;
+
+ GlobalValue::GUID GUID = GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
+
+ if (Res.second.VisibleOutsideSummary && Res.second.Prevailing)
GUIDPreservedSymbols.insert(GlobalValue::getGUID(
GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
+
+ GUIDPrevailingResolutions[GUID] =
+ Res.second.Prevailing ? PrevailingType::Yes : PrevailingType::No;
+ }
+
+ auto isPrevailing = [&](GlobalValue::GUID G) {
+ auto It = GUIDPrevailingResolutions.find(G);
+ if (It == GUIDPrevailingResolutions.end())
+ return PrevailingType::Unknown;
+ return It->second;
+ };
+ computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols, isPrevailing);
+
+ // Setup output file to emit statistics.
+ std::unique_ptr<ToolOutputFile> StatsFile = nullptr;
+ if (!Conf.StatsFile.empty()) {
+ EnableStatistics(false);
+ std::error_code EC;
+ StatsFile =
+ llvm::make_unique<ToolOutputFile>(Conf.StatsFile, EC, sys::fs::F_None);
+ if (EC)
+ return errorCodeToError(EC);
+ StatsFile->keep();
}
- computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols);
+ Error Result = runRegularLTO(AddStream);
+ if (!Result)
+ Result = runThinLTO(AddStream, Cache);
+
+ if (StatsFile)
+ PrintStatisticsJSON(StatsFile->os());
- if (auto E = runRegularLTO(AddStream))
- return E;
- return runThinLTO(AddStream, Cache);
+ return Result;
}
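
The rewritten LTO::run above records, per GUID, whether the linker chose that copy as prevailing, and hands computeDeadSymbols a callback that answers Yes, No, or Unknown for symbols it never resolved. A reduced sketch of that callback pattern over a plain map; the enum, alias, and function names below are invented for illustration:

    #include <cstdint>
    #include <functional>
    #include <unordered_map>

    enum class PrevailingType { Yes, No, Unknown };
    using GUID = std::uint64_t;

    // Build the query callback the dead-symbol computation consumes. Symbols
    // the linker never resolved (e.g. defined only in native objects) stay
    // Unknown rather than being treated as non-prevailing.
    std::function<PrevailingType(GUID)>
    makeIsPrevailing(const std::unordered_map<GUID, PrevailingType> &Res) {
      return [&Res](GUID G) {
        auto It = Res.find(G);
        return It == Res.end() ? PrevailingType::Unknown : It->second;
      };
    }
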
Error LTO::runRegularLTO(AddStreamFn AddStream) {
@@ -801,7 +863,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
if (!Conf.CodeGenOnly) {
for (const auto &R : GlobalResolutions) {
- if (R.second.IRName.empty())
+ if (!R.second.isPrevailingIRSymbol())
continue;
if (R.second.Partition != 0 &&
R.second.Partition != GlobalResolution::External)
@@ -810,7 +872,8 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
GlobalValue *GV =
RegularLTO.CombinedModule->getNamedValue(R.second.IRName);
// Ignore symbols defined in other partitions.
- if (!GV || GV->hasLocalLinkage())
+ // Also skip declarations, which are not allowed to have internal linkage.
+ if (!GV || GV->hasLocalLinkage() || GV->isDeclaration())
continue;
GV->setUnnamedAddr(R.second.UnnamedAddr ? GlobalValue::UnnamedAddr::Global
: GlobalValue::UnnamedAddr::None);
@@ -1003,20 +1066,19 @@ namespace {
class WriteIndexesThinBackend : public ThinBackendProc {
std::string OldPrefix, NewPrefix;
bool ShouldEmitImportsFiles;
-
- std::string LinkedObjectsFileName;
- std::unique_ptr<llvm::raw_fd_ostream> LinkedObjectsFile;
+ raw_fd_ostream *LinkedObjectsFile;
+ lto::IndexWriteCallback OnWrite;
public:
WriteIndexesThinBackend(
Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
std::string OldPrefix, std::string NewPrefix, bool ShouldEmitImportsFiles,
- std::string LinkedObjectsFileName)
+ raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
OldPrefix(OldPrefix), NewPrefix(NewPrefix),
ShouldEmitImportsFiles(ShouldEmitImportsFiles),
- LinkedObjectsFileName(LinkedObjectsFileName) {}
+ LinkedObjectsFile(LinkedObjectsFile), OnWrite(OnWrite) {}
Error start(
unsigned Task, BitcodeModule BM,
@@ -1028,30 +1090,29 @@ public:
std::string NewModulePath =
getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
- std::error_code EC;
- if (!LinkedObjectsFileName.empty()) {
- if (!LinkedObjectsFile) {
- LinkedObjectsFile = llvm::make_unique<raw_fd_ostream>(
- LinkedObjectsFileName, EC, sys::fs::OpenFlags::F_None);
- if (EC)
- return errorCodeToError(EC);
- }
+ if (LinkedObjectsFile)
*LinkedObjectsFile << NewModulePath << '\n';
- }
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
ImportList, ModuleToSummariesForIndex);
+ std::error_code EC;
raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
sys::fs::OpenFlags::F_None);
if (EC)
return errorCodeToError(EC);
WriteIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
- if (ShouldEmitImportsFiles)
- return errorCodeToError(
- EmitImportsFiles(ModulePath, NewModulePath + ".imports", ImportList));
+ if (ShouldEmitImportsFiles) {
+ EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
+ ModuleToSummariesForIndex);
+ if (EC)
+ return errorCodeToError(EC);
+ }
+
+ if (OnWrite)
+ OnWrite(ModulePath);
return Error::success();
}
@@ -1059,16 +1120,15 @@ public:
};
} // end anonymous namespace
-ThinBackend lto::createWriteIndexesThinBackend(std::string OldPrefix,
- std::string NewPrefix,
- bool ShouldEmitImportsFiles,
- std::string LinkedObjectsFile) {
+ThinBackend lto::createWriteIndexesThinBackend(
+ std::string OldPrefix, std::string NewPrefix, bool ShouldEmitImportsFiles,
+ raw_fd_ostream *LinkedObjectsFile, IndexWriteCallback OnWrite) {
return [=](Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache) {
return llvm::make_unique<WriteIndexesThinBackend>(
Conf, CombinedIndex, ModuleToDefinedGVSummaries, OldPrefix, NewPrefix,
- ShouldEmitImportsFiles, LinkedObjectsFile);
+ ShouldEmitImportsFiles, LinkedObjectsFile, OnWrite);
};
}
@@ -1102,6 +1162,9 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache) {
ThinLTO.ModuleMap.size());
StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
+ if (DumpThinCGSCCs)
+ ThinLTO.CombinedIndex.dumpSCCs(outs());
+
if (Conf.OptLevel > 0)
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
ImportLists, ExportLists);
@@ -1112,13 +1175,10 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache) {
// undefined references during the final link.
std::set<GlobalValue::GUID> ExportedGUIDs;
for (auto &Res : GlobalResolutions) {
- // First check if the symbol was flagged as having external references.
- if (Res.second.Partition != GlobalResolution::External)
- continue;
- // IRName will be defined if we have seen the prevailing copy of
- // this value. If not, no need to mark as exported from a ThinLTO
- // partition (and we can't get the GUID).
- if (Res.second.IRName.empty())
+ // If the symbol does not have external references or it is not prevailing,
+ // there is no need to mark it as exported from a ThinLTO partition.
+ if (Res.second.Partition != GlobalResolution::External ||
+ !Res.second.isPrevailingIRSymbol())
continue;
auto GUID = GlobalValue::getGUID(
GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
@@ -1175,6 +1235,8 @@ Expected<std::unique_ptr<ToolOutputFile>>
lto::setupOptimizationRemarks(LLVMContext &Context,
StringRef LTORemarksFilename,
bool LTOPassRemarksWithHotness, int Count) {
+ if (LTOPassRemarksWithHotness)
+ Context.setDiagnosticsHotnessRequested(true);
if (LTORemarksFilename.empty())
return nullptr;
@@ -1189,8 +1251,6 @@ lto::setupOptimizationRemarks(LLVMContext &Context,
return errorCodeToError(EC);
Context.setDiagnosticsOutputFile(
llvm::make_unique<yaml::Output>(DiagnosticFile->os()));
- if (LTOPassRemarksWithHotness)
- Context.setDiagnosticsHotnessRequested(true);
DiagnosticFile->keep();
return std::move(DiagnosticFile);
}
diff --git a/contrib/llvm/lib/LTO/LTOBackend.cpp b/contrib/llvm/lib/LTO/LTOBackend.cpp
index 501d6284117b..eadbb410bd5a 100644
--- a/contrib/llvm/lib/LTO/LTOBackend.cpp
+++ b/contrib/llvm/lib/LTO/LTOBackend.cpp
@@ -30,6 +30,10 @@
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Target/TargetMachine.h"
@@ -72,17 +76,19 @@ Error Config::addSaveTemps(std::string OutputFileName,
// user hasn't requested using the input module's path, emit to a file
// named from the provided OutputFileName with the Task ID appended.
if (M.getModuleIdentifier() == "ld-temp.o" || !UseInputModulePath) {
- PathPrefix = OutputFileName + utostr(Task);
+ PathPrefix = OutputFileName;
+ if (Task != (unsigned)-1)
+ PathPrefix += utostr(Task) + ".";
} else
- PathPrefix = M.getModuleIdentifier();
- std::string Path = PathPrefix + "." + PathSuffix + ".bc";
+ PathPrefix = M.getModuleIdentifier() + ".";
+ std::string Path = PathPrefix + PathSuffix + ".bc";
std::error_code EC;
raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::F_None);
// Because -save-temps is a debugging feature, we report the error
// directly and exit.
if (EC)
reportOpenError(Path, EC.message());
- WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false);
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false);
return true;
};
};
@@ -103,6 +109,12 @@ Error Config::addSaveTemps(std::string OutputFileName,
if (EC)
reportOpenError(Path, EC.message());
WriteIndexToFile(Index, OS);
+
+ Path = OutputFileName + "index.dot";
+ raw_fd_ostream OSDot(Path, EC, sys::fs::OpenFlags::F_None);
+ if (EC)
+ reportOpenError(Path, EC.message());
+ Index.exportToDot(OSDot);
return true;
};
@@ -132,7 +144,9 @@ createTargetMachine(Config &Conf, const Target *TheTarget, Module &M) {
}
static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
- unsigned OptLevel, bool IsThinLTO) {
+ unsigned OptLevel, bool IsThinLTO,
+ ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
Optional<PGOOptions> PGOOpt;
if (!Conf.SampleProfile.empty())
PGOOpt = PGOOptions("", "", Conf.SampleProfile, false, true);
@@ -182,9 +196,10 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
}
if (IsThinLTO)
- MPM = PB.buildThinLTODefaultPipeline(OL, Conf.DebugPassManager);
+ MPM = PB.buildThinLTODefaultPipeline(OL, Conf.DebugPassManager,
+ ImportSummary);
else
- MPM = PB.buildLTODefaultPipeline(OL, Conf.DebugPassManager);
+ MPM = PB.buildLTODefaultPipeline(OL, Conf.DebugPassManager, ExportSummary);
MPM.run(Mod, MAM);
// FIXME (davide): verify the output.
@@ -267,7 +282,8 @@ bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline,
Conf.DisableVerify);
else if (Conf.UseNewPM)
- runNewPMPasses(Conf, Mod, TM, Conf.OptLevel, IsThinLTO);
+ runNewPMPasses(Conf, Mod, TM, Conf.OptLevel, IsThinLTO, ExportSummary,
+ ImportSummary);
else
runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary);
return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod);
@@ -278,11 +294,36 @@ void codegen(Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
if (Conf.PreCodeGenModuleHook && !Conf.PreCodeGenModuleHook(Task, Mod))
return;
+ std::unique_ptr<ToolOutputFile> DwoOut;
+ SmallString<1024> DwoFile(Conf.DwoPath);
+ if (!Conf.DwoDir.empty()) {
+ std::error_code EC;
+ if (auto EC = llvm::sys::fs::create_directories(Conf.DwoDir))
+ report_fatal_error("Failed to create directory " + Conf.DwoDir + ": " +
+ EC.message());
+
+ DwoFile = Conf.DwoDir;
+ sys::path::append(DwoFile, std::to_string(Task) + ".dwo");
+ }
+
+ if (!DwoFile.empty()) {
+ std::error_code EC;
+ TM->Options.MCOptions.SplitDwarfFile = DwoFile.str().str();
+ DwoOut = llvm::make_unique<ToolOutputFile>(DwoFile, EC, sys::fs::F_None);
+ if (EC)
+ report_fatal_error("Failed to open " + DwoFile + ": " + EC.message());
+ }
+
auto Stream = AddStream(Task);
legacy::PassManager CodeGenPasses;
- if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS, Conf.CGFileType))
+ if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
+ DwoOut ? &DwoOut->os() : nullptr,
+ Conf.CGFileType))
report_fatal_error("Failed to setup codegen");
CodeGenPasses.run(Mod);
+
+ if (DwoOut)
+ DwoOut->keep();
}
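
The codegen() changes above route split-DWARF output to a per-task .dwo file: when Conf.DwoDir is set the directory is created and the debug info lands in <DwoDir>/<Task>.dwo, which is then handed to addPassesToEmitFile as a second stream. A minimal sketch of just the path selection, assuming nothing beyond the standard library; pickDwoPath and its parameters are illustrative names:

    #include <filesystem>
    #include <string>

    // Mirrors how the .dwo path is chosen in codegen() above. Returns an
    // empty path when split DWARF output was not requested at all.
    std::filesystem::path pickDwoPath(const std::string &DwoDir,
                                      const std::string &DwoPath,
                                      unsigned Task) {
      if (!DwoDir.empty()) {
        std::filesystem::create_directories(DwoDir); // like the DwoDir case
        return std::filesystem::path(DwoDir) / (std::to_string(Task) + ".dwo");
      }
      return DwoPath; // fall back to an explicitly configured file, if any
    }
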
void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream,
@@ -303,7 +344,7 @@ void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream,
// FIXME: Provide a more direct way to do this in LLVM.
SmallString<0> BC;
raw_svector_ostream BCOS(BC);
- WriteBitcodeToFile(MPart.get(), BCOS);
+ WriteBitcodeToFile(*MPart, BCOS);
// Enqueue the task
CodegenThreadPool.async(
@@ -348,14 +389,15 @@ Expected<const Target *> initAndLookupTarget(Config &C, Module &Mod) {
}
-static void
+static Error
finalizeOptimizationRemarks(std::unique_ptr<ToolOutputFile> DiagOutputFile) {
// Make sure we flush the diagnostic remarks file in case the linker doesn't
// call the global destructors before exiting.
if (!DiagOutputFile)
- return;
+ return Error::success();
DiagOutputFile->keep();
DiagOutputFile->os().flush();
+ return Error::success();
}
Error lto::backend(Config &C, AddStreamFn AddStream,
@@ -377,10 +419,8 @@ Error lto::backend(Config &C, AddStreamFn AddStream,
if (!C.CodeGenOnly) {
if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false,
- /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr)) {
- finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
- return Error::success();
- }
+ /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr))
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
if (ParallelCodeGenParallelismLevel == 1) {
@@ -389,8 +429,28 @@ Error lto::backend(Config &C, AddStreamFn AddStream,
splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel,
std::move(Mod));
}
- finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+}
+
+static void dropDeadSymbols(Module &Mod, const GVSummaryMapTy &DefinedGlobals,
+ const ModuleSummaryIndex &Index) {
+ std::vector<GlobalValue*> DeadGVs;
+ for (auto &GV : Mod.global_values())
+ if (GlobalValueSummary *GVS = DefinedGlobals.lookup(GV.getGUID()))
+ if (!Index.isGlobalValueLive(GVS)) {
+ DeadGVs.push_back(&GV);
+ convertToDeclaration(GV);
+ }
+
+ // Now that all dead bodies have been dropped, delete the actual objects
+ // themselves when possible.
+ for (GlobalValue *GV : DeadGVs) {
+ GV->removeDeadConstantUsers();
+ // Might reference something defined in native object (i.e. dropped a
+ // non-prevailing IR def, but we need to keep the declaration).
+ if (GV->use_empty())
+ GV->eraseFromParent();
+ }
}
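
dropDeadSymbols above works in two phases: globals that the summary index marked dead are first demoted to declarations, and only those that end up unused are then erased, so a declaration survives when a native object may still reference the symbol. The same two-phase idea in a small standalone form, with a made-up Global record instead of the IR classes:

    #include <algorithm>
    #include <vector>

    // Hypothetical stand-in for a module's global values.
    struct Global {
      bool Live;    // liveness verdict from the summary index
      bool HasUses; // still referenced after dead bodies were dropped?
      bool HasBody;
    };

    // Phase 1: strip bodies of dead globals (convertToDeclaration analogue).
    // Phase 2: erase the ones nothing references any more.
    void dropDead(std::vector<Global> &Globals) {
      for (Global &G : Globals)
        if (!G.Live)
          G.HasBody = false; // demote to a declaration

      Globals.erase(std::remove_if(Globals.begin(), Globals.end(),
                                   [](const Global &G) {
                                     return !G.Live && !G.HasUses;
                                   }),
                    Globals.end());
    }
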
Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
@@ -404,27 +464,36 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
std::unique_ptr<TargetMachine> TM = createTargetMachine(Conf, *TOrErr, Mod);
+ // Setup optimization remarks.
+ auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ Mod.getContext(), Conf.RemarksFilename, Conf.RemarksWithHotness, Task);
+ if (!DiagFileOrErr)
+ return DiagFileOrErr.takeError();
+ auto DiagnosticOutputFile = std::move(*DiagFileOrErr);
+
if (Conf.CodeGenOnly) {
codegen(Conf, TM.get(), AddStream, Task, Mod);
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(Task, Mod))
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
renameModuleForThinLTO(Mod, CombinedIndex);
+ dropDeadSymbols(Mod, DefinedGlobals, CombinedIndex);
+
thinLTOResolveWeakForLinkerModule(Mod, DefinedGlobals);
if (Conf.PostPromoteModuleHook && !Conf.PostPromoteModuleHook(Task, Mod))
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
if (!DefinedGlobals.empty())
thinLTOInternalizeModule(Mod, DefinedGlobals);
if (Conf.PostInternalizeModuleHook &&
!Conf.PostInternalizeModuleHook(Task, Mod))
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
auto ModuleLoader = [&](StringRef Identifier) {
assert(Mod.getContext().isODRUniquingDebugTypes() &&
@@ -441,12 +510,12 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
return Err;
if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod))
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true,
/*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex))
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
codegen(Conf, TM.get(), AddStream, Task, Mod);
- return Error::success();
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
diff --git a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp
index c7306df95d3d..ffe9af74cdca 100644
--- a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -220,7 +220,7 @@ bool LTOCodeGenerator::writeMergedModules(StringRef Path) {
}
// write bitcode to it
- WriteBitcodeToFile(MergedModule.get(), Out.os(), ShouldEmbedUselists);
+ WriteBitcodeToFile(*MergedModule, Out.os(), ShouldEmbedUselists);
Out.os().close();
if (Out.os().has_error()) {
diff --git a/contrib/llvm/lib/LTO/LTOModule.cpp b/contrib/llvm/lib/LTO/LTOModule.cpp
index 626d2f5dc813..20fc0943539f 100644
--- a/contrib/llvm/lib/LTO/LTOModule.cpp
+++ b/contrib/llvm/lib/LTO/LTOModule.cpp
@@ -14,9 +14,7 @@
#include "llvm/LTO/legacy/LTOModule.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/ObjectUtils.h"
#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
@@ -39,6 +37,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include <system_error>
using namespace llvm;
@@ -57,11 +56,7 @@ LTOModule::~LTOModule() {}
bool LTOModule::isBitcodeFile(const void *Mem, size_t Length) {
Expected<MemoryBufferRef> BCData = IRObjectFile::findBitcodeInMemBuffer(
MemoryBufferRef(StringRef((const char *)Mem, Length), "<mem>"));
- if (!BCData) {
- consumeError(BCData.takeError());
- return false;
- }
- return true;
+ return !errorToBool(BCData.takeError());
}
bool LTOModule::isBitcodeFile(StringRef Path) {
@@ -72,11 +67,7 @@ bool LTOModule::isBitcodeFile(StringRef Path) {
Expected<MemoryBufferRef> BCData = IRObjectFile::findBitcodeInMemBuffer(
BufferOrErr.get()->getMemBufferRef());
- if (!BCData) {
- consumeError(BCData.takeError());
- return false;
- }
- return true;
+ return !errorToBool(BCData.takeError());
}
bool LTOModule::isThinLTO() {
@@ -92,10 +83,8 @@ bool LTOModule::isBitcodeForTarget(MemoryBuffer *Buffer,
StringRef TriplePrefix) {
Expected<MemoryBufferRef> BCOrErr =
IRObjectFile::findBitcodeInMemBuffer(Buffer->getMemBufferRef());
- if (!BCOrErr) {
- consumeError(BCOrErr.takeError());
+ if (errorToBool(BCOrErr.takeError()))
return false;
- }
LLVMContext Context;
ErrorOr<std::string> TripleOrErr =
expectedToErrorOrAndEmitErrors(Context, getBitcodeTargetTriple(*BCOrErr));
@@ -107,10 +96,8 @@ bool LTOModule::isBitcodeForTarget(MemoryBuffer *Buffer,
std::string LTOModule::getProducerString(MemoryBuffer *Buffer) {
Expected<MemoryBufferRef> BCOrErr =
IRObjectFile::findBitcodeInMemBuffer(Buffer->getMemBufferRef());
- if (!BCOrErr) {
- consumeError(BCOrErr.takeError());
+ if (errorToBool(BCOrErr.takeError()))
return "";
- }
LLVMContext Context;
ErrorOr<std::string> ProducerOrErr = expectedToErrorOrAndEmitErrors(
Context, getBitcodeProducerString(*BCOrErr));
@@ -220,7 +207,7 @@ LTOModule::makeLTOModule(MemoryBufferRef Buffer, const TargetOptions &options,
std::string errMsg;
const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
if (!march)
- return std::unique_ptr<LTOModule>(nullptr);
+ return make_error_code(object::object_error::arch_not_found);
// construct LTOModule, hand over ownership of module and target
SubtargetFeatures Features;
@@ -456,7 +443,7 @@ void LTOModule::addDefinedSymbol(StringRef Name, const GlobalValue *def,
attr |= LTO_SYMBOL_SCOPE_HIDDEN;
else if (def->hasProtectedVisibility())
attr |= LTO_SYMBOL_SCOPE_PROTECTED;
- else if (canBeOmittedFromSymbolTable(def))
+ else if (def->canBeOmittedFromSymbolTable())
attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
else
attr |= LTO_SYMBOL_SCOPE_DEFAULT;
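
The errorToBool() rewrites above rely on errorToBool consuming the Error and returning true exactly when an error was present, so "!errorToBool(E.takeError())" reads as "succeeded". A small self-contained sketch of the idiom (hypothetical helper, not from this file):

#include "llvm/Support/Error.h"
using namespace llvm;

static Expected<int> mightFail(bool Fail) {
  if (Fail)
    return make_error<StringError>("lookup failed", inconvertibleErrorCode());
  return 42;
}

static bool succeeded(bool Fail) {
  Expected<int> V = mightFail(Fail);
  // errorToBool() consumes the error (avoiding the unchecked-Error assertion)
  // and returns true iff one was present; negate it to get "success".
  return !errorToBool(V.takeError());
}
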
diff --git a/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index abcd8905ad35..90d0f9bdb885 100644
--- a/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -23,7 +23,7 @@
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
-#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
@@ -39,6 +39,7 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SHA1.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
@@ -54,6 +55,12 @@
#include <numeric>
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
using namespace llvm;
#define DEBUG_TYPE "thinlto"
@@ -82,7 +89,7 @@ static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
if (EC)
report_fatal_error(Twine("Failed to open ") + SaveTempPath +
" to save optimized bitcode\n");
- WriteBitcodeToFile(&TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+ WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
}
static const GlobalValueSummary *
@@ -267,14 +274,14 @@ std::unique_ptr<MemoryBuffer> codegenModule(Module &TheModule,
PM.add(createObjCARCContractPass());
// Setup the codegen now.
- if (TM.addPassesToEmitFile(PM, OS, TargetMachine::CGFT_ObjectFile,
+ if (TM.addPassesToEmitFile(PM, OS, nullptr, TargetMachine::CGFT_ObjectFile,
/* DisableVerify */ true))
report_fatal_error("Failed to setup codegen");
// Run codegen now. resulting binary is in OutputBuffer.
PM.run(TheModule);
}
- return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
+ return make_unique<SmallVectorMemoryBuffer>(std::move(OutputBuffer));
}
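
The ObjectMemoryBuffer to SmallVectorMemoryBuffer switch above keeps the same zero-copy hand-off: codegen streams into a SmallString and the storage is then wrapped as a MemoryBuffer. A reduced sketch of that pattern (illustrative content):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/SmallVectorMemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static std::unique_ptr<MemoryBuffer> captureAsBuffer() {
  SmallString<0> Storage;
  {
    raw_svector_ostream OS(Storage);
    OS << "object bytes would be streamed here";
  }
  // Takes ownership of the SmallVector storage; the data is not copied.
  return make_unique<SmallVectorMemoryBuffer>(std::move(Storage));
}
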
/// Manage caching for a single Module.
@@ -390,7 +397,18 @@ public:
ErrorOr<std::unique_ptr<MemoryBuffer>> tryLoadingBuffer() {
if (EntryPath.empty())
return std::error_code();
- return MemoryBuffer::getFile(EntryPath);
+ int FD;
+ SmallString<64> ResultPath;
+ std::error_code EC = sys::fs::openFileForRead(
+ Twine(EntryPath), FD, sys::fs::OF_UpdateAtime, &ResultPath);
+ if (EC)
+ return EC;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getOpenFile(FD, EntryPath,
+ /*FileSize*/ -1,
+ /*RequiresNullTerminator*/ false);
+ close(FD);
+ return MBOrErr;
}
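
The new tryLoadingBuffer opens the entry explicitly so it can pass OF_UpdateAtime (keeping access times useful for pruning old cache entries) and then maps the already-open descriptor instead of re-opening by path. A self-contained sketch of the same sequence, with simplified error handling:

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#if !defined(_MSC_VER) && !defined(__MINGW32__)
#include <unistd.h>
#else
#include <io.h>
#endif
using namespace llvm;

static ErrorOr<std::unique_ptr<MemoryBuffer>> loadCacheEntry(StringRef Path) {
  int FD;
  SmallString<64> RealPath;
  if (std::error_code EC = sys::fs::openFileForRead(
          Path, FD, sys::fs::OF_UpdateAtime, &RealPath))
    return EC;
  auto MBOrErr = MemoryBuffer::getOpenFile(FD, Path, /*FileSize=*/-1,
                                           /*RequiresNullTerminator=*/false);
  close(FD); // the buffer owns its copy/mapping; the descriptor is done
  return MBOrErr;
}
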
// Cache the Produced object file
@@ -400,9 +418,12 @@ public:
// Write to a temporary to avoid race condition
SmallString<128> TempFilename;
+ SmallString<128> CachePath(EntryPath);
int TempFD;
- std::error_code EC =
- sys::fs::createTemporaryFile("Thin", "tmp.o", TempFD, TempFilename);
+ llvm::sys::path::remove_filename(CachePath);
+ sys::path::append(TempFilename, CachePath, "Thin-%%%%%%.tmp.o");
+ std::error_code EC =
+ sys::fs::createUniqueFile(TempFilename, TempFD, TempFilename);
if (EC) {
errs() << "Error: " << EC.message() << "\n";
report_fatal_error("ThinLTO: Can't get a temporary file");
@@ -411,16 +432,10 @@ public:
raw_fd_ostream OS(TempFD, /* ShouldClose */ true);
OS << OutputBuffer.getBuffer();
}
- // Rename to final destination (hopefully race condition won't matter here)
+ // Rename temp file to final destination; rename is atomic
EC = sys::fs::rename(TempFilename, EntryPath);
- if (EC) {
+ if (EC)
sys::fs::remove(TempFilename);
- raw_fd_ostream OS(EntryPath, EC, sys::fs::F_None);
- if (EC)
- report_fatal_error(Twine("Failed to open ") + EntryPath +
- " to save cached entry\n");
- OS << OutputBuffer.getBuffer();
- }
}
};
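
The write() path above follows the usual write-to-a-temporary-then-rename recipe so concurrent ThinLTO link jobs never observe a half-written cache entry, and the temporary is created next to the final path so the rename stays on one filesystem. A minimal sketch of that recipe (illustrative helper, simplified):

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static std::error_code writeCacheEntry(StringRef EntryPath, StringRef Bytes) {
  // Place the temporary in the cache directory itself so rename() is atomic.
  SmallString<128> Dir(EntryPath);
  sys::path::remove_filename(Dir);
  SmallString<128> TempPath;
  sys::path::append(TempPath, Dir, "Thin-%%%%%%.tmp.o");
  int FD;
  if (std::error_code EC = sys::fs::createUniqueFile(TempPath, FD, TempPath))
    return EC;
  {
    raw_fd_ostream OS(FD, /*shouldClose=*/true);
    OS << Bytes;
  }
  // Atomic publish; on failure drop the temporary (another process may have
  // already written the same entry).
  if (std::error_code EC = sys::fs::rename(TempPath, EntryPath)) {
    sys::fs::remove(TempPath);
    return EC;
  }
  return std::error_code();
}
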
@@ -476,9 +491,9 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
raw_svector_ostream OS(OutputBuffer);
ProfileSummaryInfo PSI(TheModule);
auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI);
- WriteBitcodeToFile(&TheModule, OS, true, &Index);
+ WriteBitcodeToFile(TheModule, OS, true, &Index);
}
- return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
+ return make_unique<SmallVectorMemoryBuffer>(std::move(OutputBuffer));
}
return codegenModule(TheModule, TM);
@@ -592,7 +607,7 @@ std::unique_ptr<TargetMachine> TargetMachineBuilder::create() const {
*/
std::unique_ptr<ModuleSummaryIndex> ThinLTOCodeGenerator::linkCombinedIndex() {
std::unique_ptr<ModuleSummaryIndex> CombinedIndex =
- llvm::make_unique<ModuleSummaryIndex>();
+ llvm::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
uint64_t NextModuleId = 0;
for (auto &ModuleBuffer : Modules) {
if (Error Err = readModuleSummaryIndex(ModuleBuffer.getMemBuffer(),
@@ -607,6 +622,32 @@ std::unique_ptr<ModuleSummaryIndex> ThinLTOCodeGenerator::linkCombinedIndex() {
return CombinedIndex;
}
+static void internalizeAndPromoteInIndex(
+ const StringMap<FunctionImporter::ExportSetTy> &ExportLists,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ ModuleSummaryIndex &Index) {
+ auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
+ const auto &ExportList = ExportLists.find(ModuleIdentifier);
+ return (ExportList != ExportLists.end() &&
+ ExportList->second.count(GUID)) ||
+ GUIDPreservedSymbols.count(GUID);
+ };
+
+ thinLTOInternalizeAndPromoteInIndex(Index, isExported);
+}
+
+static void computeDeadSymbolsInIndex(
+ ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+  // We have no symbol resolution available, and we can't do any better for now
+  // in the case where the prevailing symbol is in a native object. This can be
+  // refined with linker information in the future.
+ auto isPrevailing = [&](GlobalValue::GUID G) {
+ return PrevailingType::Unknown;
+ };
+ computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
+}
+
/**
* Perform promotion and renaming of exported internal functions.
* Index is updated to reflect linkage changes from weak resolution.
@@ -625,7 +666,7 @@ void ThinLTOCodeGenerator::promote(Module &TheModule,
PreservedSymbols, Triple(TheModule.getTargetTriple()));
// Compute "dead" symbols, we don't want to import/export these!
- computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbolsInIndex(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
@@ -642,13 +683,7 @@ void ThinLTOCodeGenerator::promote(Module &TheModule,
// Promote the exported values in the index, so that they are promoted
// in the module.
- auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
- const auto &ExportList = ExportLists.find(ModuleIdentifier);
- return (ExportList != ExportLists.end() &&
- ExportList->second.count(GUID)) ||
- GUIDPreservedSymbols.count(GUID);
- };
- thinLTOInternalizeAndPromoteInIndex(Index, isExported);
+ internalizeAndPromoteInIndex(ExportLists, GUIDPreservedSymbols, Index);
promoteModule(TheModule, Index);
}
@@ -670,7 +705,7 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
PreservedSymbols, Triple(TheModule.getTargetTriple()));
// Compute "dead" symbols, we don't want to import/export these!
- computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbolsInIndex(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
@@ -723,8 +758,14 @@ void ThinLTOCodeGenerator::emitImports(StringRef ModulePath,
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
ExportLists);
+ std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+ llvm::gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
+ ImportLists[ModulePath],
+ ModuleToSummariesForIndex);
+
std::error_code EC;
- if ((EC = EmitImportsFiles(ModulePath, OutputName, ImportLists[ModulePath])))
+ if ((EC =
+ EmitImportsFiles(ModulePath, OutputName, ModuleToSummariesForIndex)))
report_fatal_error(Twine("Failed to open ") + OutputName +
" to save imports lists\n");
}
@@ -747,7 +788,7 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule,
Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
// Compute "dead" symbols, we don't want to import/export these!
- computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbolsInIndex(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
@@ -762,13 +803,7 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule,
return;
// Internalization
- auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
- const auto &ExportList = ExportLists.find(ModuleIdentifier);
- return (ExportList != ExportLists.end() &&
- ExportList->second.count(GUID)) ||
- GUIDPreservedSymbols.count(GUID);
- };
- thinLTOInternalizeAndPromoteInIndex(Index, isExported);
+ internalizeAndPromoteInIndex(ExportLists, GUIDPreservedSymbols, Index);
thinLTOInternalizeModule(TheModule,
ModuleToDefinedGVSummaries[ModuleIdentifier]);
}
@@ -899,7 +934,7 @@ void ThinLTOCodeGenerator::run() {
computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple);
// Compute "dead" symbols, we don't want to import/export these!
- computeDeadSymbols(*Index, GUIDPreservedSymbols);
+ computeDeadSymbolsInIndex(*Index, GUIDPreservedSymbols);
// Collect the import/export lists for all modules from the call-graph in the
// combined index.
@@ -918,17 +953,10 @@ void ThinLTOCodeGenerator::run() {
// impacts the caching.
resolveWeakForLinkerInIndex(*Index, ResolvedODR);
- auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
- const auto &ExportList = ExportLists.find(ModuleIdentifier);
- return (ExportList != ExportLists.end() &&
- ExportList->second.count(GUID)) ||
- GUIDPreservedSymbols.count(GUID);
- };
-
// Use global summary-based analysis to identify symbols that can be
// internalized (because they aren't exported or preserved as per callback).
// Changes are made in the index, consumed in the ThinLTO backends.
- thinLTOInternalizeAndPromoteInIndex(*Index, isExported);
+ internalizeAndPromoteInIndex(ExportLists, GUIDPreservedSymbols, *Index);
// Make sure that every module has an entry in the ExportLists and
// ResolvedODR maps to enable threaded access to these maps below.
@@ -943,12 +971,12 @@ void ThinLTOCodeGenerator::run() {
std::vector<int> ModulesOrdering;
ModulesOrdering.resize(Modules.size());
std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
- std::sort(ModulesOrdering.begin(), ModulesOrdering.end(),
- [&](int LeftIndex, int RightIndex) {
- auto LSize = Modules[LeftIndex].getBuffer().size();
- auto RSize = Modules[RightIndex].getBuffer().size();
- return LSize > RSize;
- });
+ llvm::sort(ModulesOrdering.begin(), ModulesOrdering.end(),
+ [&](int LeftIndex, int RightIndex) {
+ auto LSize = Modules[LeftIndex].getBuffer().size();
+ auto RSize = Modules[RightIndex].getBuffer().size();
+ return LSize > RSize;
+ });
// Parallel optimizer + codegen
{
@@ -971,9 +999,9 @@ void ThinLTOCodeGenerator::run() {
{
auto ErrOrBuffer = CacheEntry.tryLoadingBuffer();
- DEBUG(dbgs() << "Cache " << (ErrOrBuffer ? "hit" : "miss") << " '"
- << CacheEntryPath << "' for buffer " << count << " "
- << ModuleIdentifier << "\n");
+ LLVM_DEBUG(dbgs() << "Cache " << (ErrOrBuffer ? "hit" : "miss")
+ << " '" << CacheEntryPath << "' for buffer "
+ << count << " " << ModuleIdentifier << "\n");
if (ErrOrBuffer) {
// Cache Hit!
@@ -1020,15 +1048,15 @@ void ThinLTOCodeGenerator::run() {
if (SavedObjectsDirectoryPath.empty()) {
// We need to generate a memory buffer for the linker.
if (!CacheEntryPath.empty()) {
- // Cache is enabled, reload from the cache
- // We do this to lower memory pressuree: the buffer is on the heap
- // and releasing it frees memory that can be used for the next input
- // file. The final binary link will read from the VFS cache
- // (hopefully!) or from disk if the memory pressure wasn't too high.
+ // When cache is enabled, reload from the cache if possible.
+ // Releasing the buffer from the heap and reloading it from the
+ // cache file with mmap helps us to lower memory pressure.
+ // The freed memory can be used for the next input file.
+ // The final binary link will read from the VFS cache (hopefully!)
+ // or from disk (if the memory pressure was too high).
auto ReloadedBufferOrErr = CacheEntry.tryLoadingBuffer();
if (auto EC = ReloadedBufferOrErr.getError()) {
- // On error, keeping the preexisting buffer and printing a
- // diagnostic is more friendly than just crashing.
+ // On error, keep the preexisting buffer and print a diagnostic.
errs() << "error: can't reload cached file '" << CacheEntryPath
<< "': " << EC.message() << "\n";
} else {
diff --git a/contrib/llvm/lib/Linker/IRMover.cpp b/contrib/llvm/lib/Linker/IRMover.cpp
index f7170e714b9b..738dec8e1f29 100644
--- a/contrib/llvm/lib/Linker/IRMover.cpp
+++ b/contrib/llvm/lib/Linker/IRMover.cpp
@@ -95,6 +95,12 @@ void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) {
for (StructType *Ty : SpeculativeDstOpaqueTypes)
DstResolvedOpaqueTypes.erase(Ty);
} else {
+    // SrcTy and DstTy are recursively isomorphic. We clear the names of SrcTy
+    // and all its descendants to reduce the amount of renaming in the LLVM
+    // context. Renaming occurs because we load all source modules into the same
+    // context, and a declaration with an existing name gets renamed (e.g. Foo ->
+    // Foo.42). As a result we may get several different types in the destination
+    // module which are in fact the same.
for (Type *Ty : SpeculativeTypes)
if (auto *STy = dyn_cast<StructType>(Ty))
if (STy->hasName())
@@ -160,7 +166,6 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
if (PointerType *PT = dyn_cast<PointerType>(DstTy)) {
if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace())
return false;
-
} else if (FunctionType *FT = dyn_cast<FunctionType>(DstTy)) {
if (FT->isVarArg() != cast<FunctionType>(SrcTy)->isVarArg())
return false;
@@ -235,18 +240,27 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
// These are types that LLVM itself will unique.
bool IsUniqued = !isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral();
-#ifndef NDEBUG
if (!IsUniqued) {
+ StructType *STy = cast<StructType>(Ty);
+    // This is actually a type from the destination module. This can be reached
+ // when this type is loaded in another module, added to DstStructTypesSet,
+ // and then we reach the same type in another module where it has not been
+ // added to MappedTypes. (PR37684)
+ if (STy->getContext().isODRUniquingDebugTypes() && !STy->isOpaque() &&
+ DstStructTypesSet.hasType(STy))
+ return *Entry = STy;
+
+#ifndef NDEBUG
for (auto &Pair : MappedTypes) {
assert(!(Pair.first != Ty && Pair.second == Ty) &&
"mapping to a source type");
}
- }
#endif
- if (!IsUniqued && !Visited.insert(cast<StructType>(Ty)).second) {
- StructType *DTy = StructType::create(Ty->getContext());
- return *Entry = DTy;
+ if (!Visited.insert(STy).second) {
+ StructType *DTy = StructType::create(Ty->getContext());
+ return *Entry = DTy;
+ }
}
// If this is not a recursive type, then just map all of the elements and
@@ -676,6 +690,14 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
return NewGV;
}
+static StringRef getTypeNamePrefix(StringRef Name) {
+ size_t DotPos = Name.rfind('.');
+ return (DotPos == 0 || DotPos == StringRef::npos || Name.back() == '.' ||
+ !isdigit(static_cast<unsigned char>(Name[DotPos + 1])))
+ ? Name
+ : Name.substr(0, DotPos);
+}
+
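
Read against the conditions in getTypeNamePrefix, the helper only strips a trailing ".<digits>" collision suffix; everything else is returned unchanged. A few hand-checked cases, for illustration only:

// getTypeNamePrefix("Foo.42")  == "Foo"      (".<digits>" suffix stripped)
// getTypeNamePrefix("Foo")     == "Foo"      (no dot)
// getTypeNamePrefix("Foo.")    == "Foo."     (trailing dot)
// getTypeNamePrefix(".42")     == ".42"      (dot at position 0)
// getTypeNamePrefix("Foo.bar") == "Foo.bar"  (suffix is not numeric)
// getTypeNamePrefix("Foo.4.2") == "Foo.4"    (only the last ".<digits>" goes)
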
/// Loop over all of the linked values to compute type mappings. For example,
/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct
/// types 'Foo' but one got renamed when the module was loaded into the same
@@ -722,15 +744,12 @@ void IRLinker::computeTypeMapping() {
continue;
}
- // Check to see if there is a dot in the name followed by a digit.
- size_t DotPos = ST->getName().rfind('.');
- if (DotPos == 0 || DotPos == StringRef::npos ||
- ST->getName().back() == '.' ||
- !isdigit(static_cast<unsigned char>(ST->getName()[DotPos + 1])))
+ auto STTypePrefix = getTypeNamePrefix(ST->getName());
+    if (STTypePrefix.size() == ST->getName().size())
continue;
// Check to see if the destination module has a struct with the prefix name.
- StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos));
+ StructType *DST = DstM.getTypeByName(STTypePrefix);
if (!DST)
continue;
@@ -928,7 +947,7 @@ Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
if (DoneLinkingBodies)
return nullptr;
- NewGV = copyGlobalValueProto(SGV, ShouldLink);
+ NewGV = copyGlobalValueProto(SGV, ShouldLink || ForAlias);
if (ShouldLink || !ForAlias)
forceRenaming(NewGV, SGV->getName());
}
@@ -1040,14 +1059,10 @@ void IRLinker::prepareCompileUnitsForImport() {
ValueMap.MD()[CU->getRawEnumTypes()].reset(nullptr);
ValueMap.MD()[CU->getRawMacros()].reset(nullptr);
ValueMap.MD()[CU->getRawRetainedTypes()].reset(nullptr);
- // If we ever start importing global variable defs, we'll need to
- // add their DIGlobalVariable to the globals list on the imported
- // DICompileUnit. Confirm none are imported, and then we can
- // map the list of global variables to nullptr.
- assert(none_of(
- ValuesToLink,
- [](const GlobalValue *GV) { return isa<GlobalVariable>(GV); }) &&
- "Unexpected importing of a GlobalVariable definition");
+ // We import global variables only temporarily in order for instcombine
+ // and globalopt to perform constant folding and static constructor
+    // evaluation. After that elim-avail-extern will convert imported globals
+ // back to declarations, so we don't need debug info for them.
ValueMap.MD()[CU->getRawGlobalVariables()].reset(nullptr);
// Imported entities only need to be mapped in if they have local
diff --git a/contrib/llvm/lib/MC/ELFObjectWriter.cpp b/contrib/llvm/lib/MC/ELFObjectWriter.cpp
index 989d4bb4eb9c..db531f75c87c 100644
--- a/contrib/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/ELFObjectWriter.cpp
@@ -43,6 +43,7 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/StringSaver.h"
@@ -68,9 +69,14 @@ namespace {
using SectionIndexMapTy = DenseMap<const MCSectionELF *, uint32_t>;
class ELFObjectWriter;
+struct ELFWriter;
+
+bool isDwoSection(const MCSectionELF &Sec) {
+ return Sec.getSectionName().endswith(".dwo");
+}
class SymbolTableWriter {
- ELFObjectWriter &EWriter;
+ ELFWriter &EWriter;
bool Is64Bit;
// indexes we are going to write to .symtab_shndx.
@@ -84,7 +90,7 @@ class SymbolTableWriter {
template <typename T> void write(T Value);
public:
- SymbolTableWriter(ELFObjectWriter &EWriter, bool Is64Bit);
+ SymbolTableWriter(ELFWriter &EWriter, bool Is64Bit);
void writeSymbol(uint32_t name, uint8_t info, uint64_t value, uint64_t size,
uint8_t other, uint32_t shndx, bool Reserved);
@@ -92,7 +98,16 @@ public:
ArrayRef<uint32_t> getShndxIndexes() const { return ShndxIndexes; }
};
-class ELFObjectWriter : public MCObjectWriter {
+struct ELFWriter {
+ ELFObjectWriter &OWriter;
+ support::endian::Writer W;
+
+ enum DwoMode {
+ AllSections,
+ NonDwoOnly,
+ DwoOnly,
+ } Mode;
+
static uint64_t SymbolValue(const MCSymbol &Sym, const MCAsmLayout &Layout);
static bool isInSymtab(const MCAsmLayout &Layout, const MCSymbolELF &Symbol,
bool Used, bool Renamed);
@@ -117,13 +132,6 @@ class ELFObjectWriter : public MCObjectWriter {
}
};
- /// The target specific ELF writer instance.
- std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
-
- DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
-
- DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
-
/// @}
/// @name Symbol Table Data
/// @{
@@ -144,14 +152,8 @@ class ELFObjectWriter : public MCObjectWriter {
unsigned addToSectionTable(const MCSectionELF *Sec);
// TargetObjectWriter wrappers.
- bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
- bool hasRelocationAddend() const {
- return TargetObjectWriter->hasRelocationAddend();
- }
- unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
- const MCFixup &Fixup, bool IsPCRel) const {
- return TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel);
- }
+ bool is64Bit() const;
+ bool hasRelocationAddend() const;
void align(unsigned Alignment);
@@ -160,33 +162,20 @@ class ELFObjectWriter : public MCObjectWriter {
bool ZLibStyle, unsigned Alignment);
public:
- ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
- raw_pwrite_stream &OS, bool IsLittleEndian)
- : MCObjectWriter(OS, IsLittleEndian),
- TargetObjectWriter(std::move(MOTW)) {}
-
- ~ELFObjectWriter() override = default;
+ ELFWriter(ELFObjectWriter &OWriter, raw_pwrite_stream &OS,
+ bool IsLittleEndian, DwoMode Mode)
+ : OWriter(OWriter),
+ W(OS, IsLittleEndian ? support::little : support::big), Mode(Mode) {}
- void reset() override {
- Renames.clear();
- Relocations.clear();
- StrTabBuilder.clear();
- SectionTable.clear();
- MCObjectWriter::reset();
- }
-
- void WriteWord(uint64_t W) {
+ void WriteWord(uint64_t Word) {
if (is64Bit())
- write64(W);
+ W.write<uint64_t>(Word);
else
- write32(W);
+ W.write<uint32_t>(Word);
}
template <typename T> void write(T Val) {
- if (IsLittleEndian)
- support::endian::Writer<support::little>(getStream()).write(Val);
- else
- support::endian::Writer<support::big>(getStream()).write(Val);
+ W.write(Val);
}
void writeHeader(const MCAssembler &Asm);
@@ -198,15 +187,6 @@ public:
using SectionOffsetsTy =
std::map<const MCSectionELF *, std::pair<uint64_t, uint64_t>>;
- bool shouldRelocateWithSymbol(const MCAssembler &Asm,
- const MCSymbolRefExpr *RefA,
- const MCSymbol *Sym, uint64_t C,
- unsigned Type) const;
-
- void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, uint64_t &FixedValue) override;
-
// Map from a signature symbol to the group section index
using RevGroupMapTy = DenseMap<const MCSymbol *, unsigned>;
@@ -220,14 +200,13 @@ public:
const RevGroupMapTy &RevGroupMap,
SectionOffsetsTy &SectionOffsets);
+ void writeAddrsigSection();
+
MCSectionELF *createRelocationSection(MCContext &Ctx,
const MCSectionELF &Sec);
const MCSectionELF *createStringTable(MCContext &Ctx);
- void executePostLayoutBinding(MCAssembler &Asm,
- const MCAsmLayout &Layout) override;
-
void writeSectionHeader(const MCAsmLayout &Layout,
const SectionIndexMapTy &SectionIndexMap,
const SectionOffsetsTy &SectionOffsets);
@@ -242,26 +221,126 @@ public:
void writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec);
- using MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl;
+ uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout);
+ void writeSection(const SectionIndexMapTy &SectionIndexMap,
+ uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
+ const MCSectionELF &Section);
+};
+
+class ELFObjectWriter : public MCObjectWriter {
+ /// The target specific ELF writer instance.
+ std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
+
+ DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
+
+ DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
+
+ bool EmitAddrsigSection = false;
+ std::vector<const MCSymbol *> AddrsigSyms;
+
+ bool hasRelocationAddend() const;
+
+ bool shouldRelocateWithSymbol(const MCAssembler &Asm,
+ const MCSymbolRefExpr *RefA,
+ const MCSymbolELF *Sym, uint64_t C,
+ unsigned Type) const;
+
+public:
+ ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW)
+ : TargetObjectWriter(std::move(MOTW)) {}
+
+ void reset() override {
+ Relocations.clear();
+ Renames.clear();
+ MCObjectWriter::reset();
+ }
+
bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
const MCSymbol &SymA,
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
- void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
- void writeSection(const SectionIndexMapTy &SectionIndexMap,
- uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
- const MCSectionELF &Section);
+ virtual bool checkRelocation(MCContext &Ctx, SMLoc Loc,
+ const MCSectionELF *From,
+ const MCSectionELF *To) {
+ return true;
+ }
+
+ void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue) override;
+
+ void executePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) override;
+
+ void emitAddrsigSection() override { EmitAddrsigSection = true; }
+ void addAddrsigSymbol(const MCSymbol *Sym) override {
+ AddrsigSyms.push_back(Sym);
+ }
+
+ friend struct ELFWriter;
+};
+
+class ELFSingleObjectWriter : public ELFObjectWriter {
+ raw_pwrite_stream &OS;
+ bool IsLittleEndian;
+
+public:
+ ELFSingleObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+ raw_pwrite_stream &OS, bool IsLittleEndian)
+ : ELFObjectWriter(std::move(MOTW)), OS(OS),
+ IsLittleEndian(IsLittleEndian) {}
+
+ uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override {
+ return ELFWriter(*this, OS, IsLittleEndian, ELFWriter::AllSections)
+ .writeObject(Asm, Layout);
+ }
+
+ friend struct ELFWriter;
+};
+
+class ELFDwoObjectWriter : public ELFObjectWriter {
+ raw_pwrite_stream &OS, &DwoOS;
+ bool IsLittleEndian;
+
+public:
+ ELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+ raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
+ bool IsLittleEndian)
+ : ELFObjectWriter(std::move(MOTW)), OS(OS), DwoOS(DwoOS),
+ IsLittleEndian(IsLittleEndian) {}
+
+ virtual bool checkRelocation(MCContext &Ctx, SMLoc Loc,
+ const MCSectionELF *From,
+ const MCSectionELF *To) override {
+ if (isDwoSection(*From)) {
+ Ctx.reportError(Loc, "A dwo section may not contain relocations");
+ return false;
+ }
+ if (To && isDwoSection(*To)) {
+ Ctx.reportError(Loc, "A relocation may not refer to a dwo section");
+ return false;
+ }
+ return true;
+ }
+
+ uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override {
+ uint64_t Size = ELFWriter(*this, OS, IsLittleEndian, ELFWriter::NonDwoOnly)
+ .writeObject(Asm, Layout);
+ Size += ELFWriter(*this, DwoOS, IsLittleEndian, ELFWriter::DwoOnly)
+ .writeObject(Asm, Layout);
+ return Size;
+ }
};
} // end anonymous namespace
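
The support::endian::Writer member that ELFWriter now threads through all of the emission code replaces the old write16/write32/write64 helpers with a single stream-plus-byte-order object. A short sketch of how it is used, with illustrative values:

#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// One Writer bound to a stream and an endianness; typed writes are byte-swapped
// as needed, and raw bytes can still go straight to the underlying stream.
static void emitHeaderFragment(raw_ostream &OS, bool IsLittleEndian) {
  support::endian::Writer W(OS, IsLittleEndian ? support::little
                                               : support::big);
  W.write<uint16_t>(0x0001);     // a 16-bit field, e.g. e_type
  W.write<uint32_t>(0x00000001); // a 32-bit field, e.g. e_version
  W.OS << "raw bytes";           // untyped output bypasses byte swapping
}
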
-void ELFObjectWriter::align(unsigned Alignment) {
- uint64_t Padding = OffsetToAlignment(getStream().tell(), Alignment);
- WriteZeros(Padding);
+void ELFWriter::align(unsigned Alignment) {
+ uint64_t Padding = OffsetToAlignment(W.OS.tell(), Alignment);
+ W.OS.write_zeros(Padding);
}
-unsigned ELFObjectWriter::addToSectionTable(const MCSectionELF *Sec) {
+unsigned ELFWriter::addToSectionTable(const MCSectionELF *Sec) {
SectionTable.push_back(Sec);
StrTabBuilder.add(Sec->getSectionName());
return SectionTable.size();
@@ -278,7 +357,7 @@ template <typename T> void SymbolTableWriter::write(T Value) {
EWriter.write(Value);
}
-SymbolTableWriter::SymbolTableWriter(ELFObjectWriter &EWriter, bool Is64Bit)
+SymbolTableWriter::SymbolTableWriter(ELFWriter &EWriter, bool Is64Bit)
: EWriter(EWriter), Is64Bit(Is64Bit), NumWritten(0) {}
void SymbolTableWriter::writeSymbol(uint32_t name, uint8_t info, uint64_t value,
@@ -317,8 +396,16 @@ void SymbolTableWriter::writeSymbol(uint32_t name, uint8_t info, uint64_t value,
++NumWritten;
}
+bool ELFWriter::is64Bit() const {
+ return OWriter.TargetObjectWriter->is64Bit();
+}
+
+bool ELFWriter::hasRelocationAddend() const {
+ return OWriter.hasRelocationAddend();
+}
+
// Emit the ELF header.
-void ELFObjectWriter::writeHeader(const MCAssembler &Asm) {
+void ELFWriter::writeHeader(const MCAssembler &Asm) {
// ELF Header
// ----------
//
@@ -327,51 +414,54 @@ void ELFObjectWriter::writeHeader(const MCAssembler &Asm) {
// emitWord method behaves differently for ELF32 and ELF64, writing
// 4 bytes in the former and 8 in the latter.
- writeBytes(ELF::ElfMagic); // e_ident[EI_MAG0] to e_ident[EI_MAG3]
+ W.OS << ELF::ElfMagic; // e_ident[EI_MAG0] to e_ident[EI_MAG3]
- write8(is64Bit() ? ELF::ELFCLASS64 : ELF::ELFCLASS32); // e_ident[EI_CLASS]
+ W.OS << char(is64Bit() ? ELF::ELFCLASS64 : ELF::ELFCLASS32); // e_ident[EI_CLASS]
// e_ident[EI_DATA]
- write8(isLittleEndian() ? ELF::ELFDATA2LSB : ELF::ELFDATA2MSB);
+ W.OS << char(W.Endian == support::little ? ELF::ELFDATA2LSB
+ : ELF::ELFDATA2MSB);
- write8(ELF::EV_CURRENT); // e_ident[EI_VERSION]
+ W.OS << char(ELF::EV_CURRENT); // e_ident[EI_VERSION]
// e_ident[EI_OSABI]
- write8(TargetObjectWriter->getOSABI());
- write8(0); // e_ident[EI_ABIVERSION]
+ W.OS << char(OWriter.TargetObjectWriter->getOSABI());
+ W.OS << char(0); // e_ident[EI_ABIVERSION]
- WriteZeros(ELF::EI_NIDENT - ELF::EI_PAD);
+ W.OS.write_zeros(ELF::EI_NIDENT - ELF::EI_PAD);
- write16(ELF::ET_REL); // e_type
+ W.write<uint16_t>(ELF::ET_REL); // e_type
- write16(TargetObjectWriter->getEMachine()); // e_machine = target
+ W.write<uint16_t>(OWriter.TargetObjectWriter->getEMachine()); // e_machine = target
- write32(ELF::EV_CURRENT); // e_version
+ W.write<uint32_t>(ELF::EV_CURRENT); // e_version
WriteWord(0); // e_entry, no entry point in .o file
WriteWord(0); // e_phoff, no program header for .o
WriteWord(0); // e_shoff = sec hdr table off in bytes
// e_flags = whatever the target wants
- write32(Asm.getELFHeaderEFlags());
+ W.write<uint32_t>(Asm.getELFHeaderEFlags());
// e_ehsize = ELF header size
- write16(is64Bit() ? sizeof(ELF::Elf64_Ehdr) : sizeof(ELF::Elf32_Ehdr));
+ W.write<uint16_t>(is64Bit() ? sizeof(ELF::Elf64_Ehdr)
+ : sizeof(ELF::Elf32_Ehdr));
- write16(0); // e_phentsize = prog header entry size
- write16(0); // e_phnum = # prog header entries = 0
+ W.write<uint16_t>(0); // e_phentsize = prog header entry size
+ W.write<uint16_t>(0); // e_phnum = # prog header entries = 0
// e_shentsize = Section header entry size
- write16(is64Bit() ? sizeof(ELF::Elf64_Shdr) : sizeof(ELF::Elf32_Shdr));
+ W.write<uint16_t>(is64Bit() ? sizeof(ELF::Elf64_Shdr)
+ : sizeof(ELF::Elf32_Shdr));
// e_shnum = # of section header ents
- write16(0);
+ W.write<uint16_t>(0);
// e_shstrndx = Section # of '.shstrtab'
assert(StringTableIndex < ELF::SHN_LORESERVE);
- write16(StringTableIndex);
+ W.write<uint16_t>(StringTableIndex);
}
-uint64_t ELFObjectWriter::SymbolValue(const MCSymbol &Sym,
- const MCAsmLayout &Layout) {
+uint64_t ELFWriter::SymbolValue(const MCSymbol &Sym,
+ const MCAsmLayout &Layout) {
if (Sym.isCommon() && Sym.isExternal())
return Sym.getCommonAlignment();
@@ -385,45 +475,6 @@ uint64_t ELFObjectWriter::SymbolValue(const MCSymbol &Sym,
return Res;
}
-void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
- // The presence of symbol versions causes undefined symbols and
- // versions declared with @@@ to be renamed.
- for (const std::pair<StringRef, const MCSymbol *> &P : Asm.Symvers) {
- StringRef AliasName = P.first;
- const auto &Symbol = cast<MCSymbolELF>(*P.second);
- size_t Pos = AliasName.find('@');
- assert(Pos != StringRef::npos);
-
- StringRef Prefix = AliasName.substr(0, Pos);
- StringRef Rest = AliasName.substr(Pos);
- StringRef Tail = Rest;
- if (Rest.startswith("@@@"))
- Tail = Rest.substr(Symbol.isUndefined() ? 2 : 1);
-
- auto *Alias =
- cast<MCSymbolELF>(Asm.getContext().getOrCreateSymbol(Prefix + Tail));
- Asm.registerSymbol(*Alias);
- const MCExpr *Value = MCSymbolRefExpr::create(&Symbol, Asm.getContext());
- Alias->setVariableValue(Value);
-
- // Aliases defined with .symvar copy the binding from the symbol they alias.
- // This is the first place we are able to copy this information.
- Alias->setExternal(Symbol.isExternal());
- Alias->setBinding(Symbol.getBinding());
-
- if (!Symbol.isUndefined() && !Rest.startswith("@@@"))
- continue;
-
- // FIXME: produce a better error message.
- if (Symbol.isUndefined() && Rest.startswith("@@") &&
- !Rest.startswith("@@@"))
- report_fatal_error("A @@ version cannot be undefined");
-
- Renames.insert(std::make_pair(&Symbol, Alias));
- }
-}
-
static uint8_t mergeTypeForSet(uint8_t origType, uint8_t newType) {
uint8_t Type = newType;
@@ -459,9 +510,8 @@ static uint8_t mergeTypeForSet(uint8_t origType, uint8_t newType) {
return Type;
}
-void ELFObjectWriter::writeSymbol(SymbolTableWriter &Writer,
- uint32_t StringIndex, ELFSymbolData &MSD,
- const MCAsmLayout &Layout) {
+void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex,
+ ELFSymbolData &MSD, const MCAsmLayout &Layout) {
const auto &Symbol = cast<MCSymbolELF>(*MSD.Symbol);
const MCSymbolELF *Base =
cast_or_null<MCSymbolELF>(Layout.getBaseSymbol(Symbol));
@@ -502,108 +552,6 @@ void ELFObjectWriter::writeSymbol(SymbolTableWriter &Writer,
IsReserved);
}
-// It is always valid to create a relocation with a symbol. It is preferable
-// to use a relocation with a section if that is possible. Using the section
-// allows us to omit some local symbols from the symbol table.
-bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
- const MCSymbolRefExpr *RefA,
- const MCSymbol *S, uint64_t C,
- unsigned Type) const {
- const auto *Sym = cast_or_null<MCSymbolELF>(S);
- // A PCRel relocation to an absolute value has no symbol (or section). We
- // represent that with a relocation to a null section.
- if (!RefA)
- return false;
-
- MCSymbolRefExpr::VariantKind Kind = RefA->getKind();
- switch (Kind) {
- default:
- break;
- // The .odp creation emits a relocation against the symbol ".TOC." which
- // create a R_PPC64_TOC relocation. However the relocation symbol name
- // in final object creation should be NULL, since the symbol does not
- // really exist, it is just the reference to TOC base for the current
- // object file. Since the symbol is undefined, returning false results
- // in a relocation with a null section which is the desired result.
- case MCSymbolRefExpr::VK_PPC_TOCBASE:
- return false;
-
- // These VariantKind cause the relocation to refer to something other than
- // the symbol itself, like a linker generated table. Since the address of
- // symbol is not relevant, we cannot replace the symbol with the
- // section and patch the difference in the addend.
- case MCSymbolRefExpr::VK_GOT:
- case MCSymbolRefExpr::VK_PLT:
- case MCSymbolRefExpr::VK_GOTPCREL:
- case MCSymbolRefExpr::VK_PPC_GOT_LO:
- case MCSymbolRefExpr::VK_PPC_GOT_HI:
- case MCSymbolRefExpr::VK_PPC_GOT_HA:
- return true;
- }
-
- // An undefined symbol is not in any section, so the relocation has to point
- // to the symbol itself.
- assert(Sym && "Expected a symbol");
- if (Sym->isUndefined())
- return true;
-
- unsigned Binding = Sym->getBinding();
- switch(Binding) {
- default:
- llvm_unreachable("Invalid Binding");
- case ELF::STB_LOCAL:
- break;
- case ELF::STB_WEAK:
- // If the symbol is weak, it might be overridden by a symbol in another
- // file. The relocation has to point to the symbol so that the linker
- // can update it.
- return true;
- case ELF::STB_GLOBAL:
- // Global ELF symbols can be preempted by the dynamic linker. The relocation
- // has to point to the symbol for a reason analogous to the STB_WEAK case.
- return true;
- }
-
- // If a relocation points to a mergeable section, we have to be careful.
- // If the offset is zero, a relocation with the section will encode the
- // same information. With a non-zero offset, the situation is different.
- // For example, a relocation can point 42 bytes past the end of a string.
- // If we change such a relocation to use the section, the linker would think
- // that it pointed to another string and subtracting 42 at runtime will
- // produce the wrong value.
- if (Sym->isInSection()) {
- auto &Sec = cast<MCSectionELF>(Sym->getSection());
- unsigned Flags = Sec.getFlags();
- if (Flags & ELF::SHF_MERGE) {
- if (C != 0)
- return true;
-
- // It looks like gold has a bug (http://sourceware.org/PR16794) and can
- // only handle section relocations to mergeable sections if using RELA.
- if (!hasRelocationAddend())
- return true;
- }
-
- // Most TLS relocations use a got, so they need the symbol. Even those that
- // are just an offset (@tpoff), require a symbol in gold versions before
- // 5efeedf61e4fe720fd3e9a08e6c91c10abb66d42 (2014-09-26) which fixed
- // http://sourceware.org/PR16773.
- if (Flags & ELF::SHF_TLS)
- return true;
- }
-
- // If the symbol is a thumb function the final relocation must set the lowest
- // bit. With a symbol that is done by just having the symbol have that bit
- // set, so we would lose the bit if we relocated with the section.
- // FIXME: We could use the section but add the bit to the relocation value.
- if (Asm.isThumbFunc(Sym))
- return true;
-
- if (TargetObjectWriter->needsRelocateWithSymbol(*Sym, Type))
- return true;
- return false;
-}
-
// True if the assembler knows nothing about the final value of the symbol.
// This doesn't cover the comdat issues, since in those cases the assembler
// can at least know that all symbols in the section will move together.
@@ -624,118 +572,8 @@ static bool isWeak(const MCSymbolELF &Sym) {
}
}
-void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
- uint64_t &FixedValue) {
- MCAsmBackend &Backend = Asm.getBackend();
- bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
- MCFixupKindInfo::FKF_IsPCRel;
- const MCSectionELF &FixupSection = cast<MCSectionELF>(*Fragment->getParent());
- uint64_t C = Target.getConstant();
- uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
- MCContext &Ctx = Asm.getContext();
-
- if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
- // Let A, B and C being the components of Target and R be the location of
- // the fixup. If the fixup is not pcrel, we want to compute (A - B + C).
- // If it is pcrel, we want to compute (A - B + C - R).
-
- // In general, ELF has no relocations for -B. It can only represent (A + C)
- // or (A + C - R). If B = R + K and the relocation is not pcrel, we can
- // replace B to implement it: (A - R - K + C)
- if (IsPCRel) {
- Ctx.reportError(
- Fixup.getLoc(),
- "No relocation available to represent this relative expression");
- return;
- }
-
- const auto &SymB = cast<MCSymbolELF>(RefB->getSymbol());
-
- if (SymB.isUndefined()) {
- Ctx.reportError(Fixup.getLoc(),
- Twine("symbol '") + SymB.getName() +
- "' can not be undefined in a subtraction expression");
- return;
- }
-
- assert(!SymB.isAbsolute() && "Should have been folded");
- const MCSection &SecB = SymB.getSection();
- if (&SecB != &FixupSection) {
- Ctx.reportError(Fixup.getLoc(),
- "Cannot represent a difference across sections");
- return;
- }
-
- uint64_t SymBOffset = Layout.getSymbolOffset(SymB);
- uint64_t K = SymBOffset - FixupOffset;
- IsPCRel = true;
- C -= K;
- }
-
- // We either rejected the fixup or folded B into C at this point.
- const MCSymbolRefExpr *RefA = Target.getSymA();
- const auto *SymA = RefA ? cast<MCSymbolELF>(&RefA->getSymbol()) : nullptr;
-
- bool ViaWeakRef = false;
- if (SymA && SymA->isVariable()) {
- const MCExpr *Expr = SymA->getVariableValue();
- if (const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr)) {
- if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF) {
- SymA = cast<MCSymbolELF>(&Inner->getSymbol());
- ViaWeakRef = true;
- }
- }
- }
-
- unsigned Type = getRelocType(Ctx, Target, Fixup, IsPCRel);
- uint64_t OriginalC = C;
- bool RelocateWithSymbol = shouldRelocateWithSymbol(Asm, RefA, SymA, C, Type);
- if (!RelocateWithSymbol && SymA && !SymA->isUndefined())
- C += Layout.getSymbolOffset(*SymA);
-
- uint64_t Addend = 0;
- if (hasRelocationAddend()) {
- Addend = C;
- C = 0;
- }
-
- FixedValue = C;
-
- if (!RelocateWithSymbol) {
- const MCSection *SecA =
- (SymA && !SymA->isUndefined()) ? &SymA->getSection() : nullptr;
- auto *ELFSec = cast_or_null<MCSectionELF>(SecA);
- const auto *SectionSymbol =
- ELFSec ? cast<MCSymbolELF>(ELFSec->getBeginSymbol()) : nullptr;
- if (SectionSymbol)
- SectionSymbol->setUsedInReloc();
- ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend, SymA,
- OriginalC);
- Relocations[&FixupSection].push_back(Rec);
- return;
- }
-
- const auto *RenamedSymA = SymA;
- if (SymA) {
- if (const MCSymbolELF *R = Renames.lookup(SymA))
- RenamedSymA = R;
-
- if (ViaWeakRef)
- RenamedSymA->setIsWeakrefUsedInReloc();
- else
- RenamedSymA->setUsedInReloc();
- }
- ELFRelocationEntry Rec(FixupOffset, RenamedSymA, Type, Addend, SymA,
- OriginalC);
- Relocations[&FixupSection].push_back(Rec);
-}
-
-bool ELFObjectWriter::isInSymtab(const MCAsmLayout &Layout,
- const MCSymbolELF &Symbol, bool Used,
- bool Renamed) {
+bool ELFWriter::isInSymtab(const MCAsmLayout &Layout, const MCSymbolELF &Symbol,
+ bool Used, bool Renamed) {
if (Symbol.isVariable()) {
const MCExpr *Expr = Symbol.getVariableValue();
if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(Expr)) {
@@ -768,7 +606,7 @@ bool ELFObjectWriter::isInSymtab(const MCAsmLayout &Layout,
return true;
}
-void ELFObjectWriter::computeSymbolTable(
+void ELFWriter::computeSymbolTable(
MCAssembler &Asm, const MCAsmLayout &Layout,
const SectionIndexMapTy &SectionIndexMap, const RevGroupMapTy &RevGroupMap,
SectionOffsetsTy &SectionOffsets) {
@@ -783,7 +621,7 @@ void ELFObjectWriter::computeSymbolTable(
SymbolTableIndex = addToSectionTable(SymtabSection);
align(SymtabSection->getAlignment());
- uint64_t SecStart = getStream().tell();
+ uint64_t SecStart = W.OS.tell();
// The first entry is the undefined symbol entry.
Writer.writeSymbol(0, 0, 0, 0, 0, 0, false);
@@ -800,7 +638,7 @@ void ELFObjectWriter::computeSymbolTable(
bool isSignature = Symbol.isSignature();
if (!isInSymtab(Layout, Symbol, Used || WeakrefUsed || isSignature,
- Renames.count(&Symbol)))
+ OWriter.Renames.count(&Symbol)))
continue;
if (Symbol.isTemporary() && Symbol.isUndefined()) {
@@ -830,6 +668,8 @@ void ELFObjectWriter::computeSymbolTable(
} else {
const MCSectionELF &Section =
static_cast<const MCSectionELF &>(Symbol.getSection());
+ if (Mode == NonDwoOnly && isDwoSection(Section))
+ continue;
MSD.SectionIndex = SectionIndexMap.lookup(&Section);
assert(MSD.SectionIndex && "Invalid section index!");
if (MSD.SectionIndex >= ELF::SHN_LORESERVE)
@@ -899,7 +739,7 @@ void ELFObjectWriter::computeSymbolTable(
assert(MSD.Symbol->getBinding() != ELF::STB_LOCAL);
}
- uint64_t SecEnd = getStream().tell();
+ uint64_t SecEnd = W.OS.tell();
SectionOffsets[SymtabSection] = std::make_pair(SecStart, SecEnd);
ArrayRef<uint32_t> ShndxIndexes = Writer.getShndxIndexes();
@@ -909,19 +749,23 @@ void ELFObjectWriter::computeSymbolTable(
}
assert(SymtabShndxSectionIndex != 0);
- SecStart = getStream().tell();
+ SecStart = W.OS.tell();
const MCSectionELF *SymtabShndxSection =
SectionTable[SymtabShndxSectionIndex - 1];
for (uint32_t Index : ShndxIndexes)
write(Index);
- SecEnd = getStream().tell();
+ SecEnd = W.OS.tell();
SectionOffsets[SymtabShndxSection] = std::make_pair(SecStart, SecEnd);
}
-MCSectionELF *
-ELFObjectWriter::createRelocationSection(MCContext &Ctx,
- const MCSectionELF &Sec) {
- if (Relocations[&Sec].empty())
+void ELFWriter::writeAddrsigSection() {
+ for (const MCSymbol *Sym : OWriter.AddrsigSyms)
+ encodeULEB128(Sym->getIndex(), W.OS);
+}
+
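
The .llvm_addrsig payload written above is just a sequence of ULEB128-encoded symbol table indices. For reference, a few encodings, hand-checked against the LEB128 rules:

// encodeULEB128(0x00,   OS) emits 0x00            (one byte)
// encodeULEB128(0x7F,   OS) emits 0x7F            (largest single-byte value)
// encodeULEB128(0x80,   OS) emits 0x80 0x01       (low 7 bits first; the high
//                                                  bit marks "more bytes follow")
// encodeULEB128(624485, OS) emits 0xE5 0x8E 0x26  (the classic DWARF example)
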
+MCSectionELF *ELFWriter::createRelocationSection(MCContext &Ctx,
+ const MCSectionELF &Sec) {
+ if (OWriter.Relocations[&Sec].empty())
return nullptr;
const StringRef SectionName = Sec.getSectionName();
@@ -946,7 +790,7 @@ ELFObjectWriter::createRelocationSection(MCContext &Ctx,
}
// Include the debug info compression header.
-bool ELFObjectWriter::maybeWriteCompression(
+bool ELFWriter::maybeWriteCompression(
uint64_t Size, SmallVectorImpl<char> &CompressedContents, bool ZLibStyle,
unsigned Alignment) {
if (ZLibStyle) {
@@ -975,13 +819,13 @@ bool ELFObjectWriter::maybeWriteCompression(
const StringRef Magic = "ZLIB";
if (Size <= Magic.size() + sizeof(Size) + CompressedContents.size())
return false;
- write(ArrayRef<char>(Magic.begin(), Magic.size()));
- writeBE64(Size);
+ W.OS << Magic;
+ support::endian::write(W.OS, Size, support::big);
return true;
}
-void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
- const MCAsmLayout &Layout) {
+void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
+ const MCAsmLayout &Layout) {
MCSectionELF &Section = static_cast<MCSectionELF &>(Sec);
StringRef SectionName = Section.getSectionName();
@@ -995,7 +839,7 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
MAI->compressDebugSections() != DebugCompressionType::None;
if (!CompressionEnabled || !SectionName.startswith(".debug_") ||
SectionName == ".debug_frame") {
- Asm.writeSectionData(&Section, Layout);
+ Asm.writeSectionData(W.OS, &Section, Layout);
return;
}
@@ -1005,24 +849,21 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
SmallVector<char, 128> UncompressedData;
raw_svector_ostream VecOS(UncompressedData);
- raw_pwrite_stream &OldStream = getStream();
- setStream(VecOS);
- Asm.writeSectionData(&Section, Layout);
- setStream(OldStream);
+ Asm.writeSectionData(VecOS, &Section, Layout);
SmallVector<char, 128> CompressedContents;
if (Error E = zlib::compress(
StringRef(UncompressedData.data(), UncompressedData.size()),
CompressedContents)) {
consumeError(std::move(E));
- getStream() << UncompressedData;
+ W.OS << UncompressedData;
return;
}
bool ZlibStyle = MAI->compressDebugSections() == DebugCompressionType::Z;
if (!maybeWriteCompression(UncompressedData.size(), CompressedContents,
ZlibStyle, Sec.getAlignment())) {
- getStream() << UncompressedData;
+ W.OS << UncompressedData;
return;
}
@@ -1032,30 +873,28 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
else
// Add "z" prefix to section name. This is zlib-gnu style.
MC.renameELFSection(&Section, (".z" + SectionName.drop_front(1)).str());
- getStream() << CompressedContents;
+ W.OS << CompressedContents;
}
-void ELFObjectWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type,
- uint64_t Flags, uint64_t Address,
- uint64_t Offset, uint64_t Size,
- uint32_t Link, uint32_t Info,
- uint64_t Alignment,
- uint64_t EntrySize) {
- write32(Name); // sh_name: index into string table
- write32(Type); // sh_type
+void ELFWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Size, uint32_t Link, uint32_t Info,
+ uint64_t Alignment, uint64_t EntrySize) {
+ W.write<uint32_t>(Name); // sh_name: index into string table
+ W.write<uint32_t>(Type); // sh_type
WriteWord(Flags); // sh_flags
WriteWord(Address); // sh_addr
WriteWord(Offset); // sh_offset
WriteWord(Size); // sh_size
- write32(Link); // sh_link
- write32(Info); // sh_info
+ W.write<uint32_t>(Link); // sh_link
+ W.write<uint32_t>(Info); // sh_info
WriteWord(Alignment); // sh_addralign
WriteWord(EntrySize); // sh_entsize
}
-void ELFObjectWriter::writeRelocations(const MCAssembler &Asm,
+void ELFWriter::writeRelocations(const MCAssembler &Asm,
const MCSectionELF &Sec) {
- std::vector<ELFRelocationEntry> &Relocs = Relocations[&Sec];
+ std::vector<ELFRelocationEntry> &Relocs = OWriter.Relocations[&Sec];
// We record relocations by pushing to the end of a vector. Reverse the vector
// to get the relocations in the order they were created.
@@ -1064,7 +903,7 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm,
std::reverse(Relocs.begin(), Relocs.end());
// Sort the relocation entries. MIPS needs this.
- TargetObjectWriter->sortRelocs(Asm, Relocs);
+ OWriter.TargetObjectWriter->sortRelocs(Asm, Relocs);
for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
const ELFRelocationEntry &Entry = Relocs[e - i - 1];
@@ -1072,13 +911,13 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm,
if (is64Bit()) {
write(Entry.Offset);
- if (TargetObjectWriter->getEMachine() == ELF::EM_MIPS) {
+ if (OWriter.TargetObjectWriter->getEMachine() == ELF::EM_MIPS) {
write(uint32_t(Index));
- write(TargetObjectWriter->getRSsym(Entry.Type));
- write(TargetObjectWriter->getRType3(Entry.Type));
- write(TargetObjectWriter->getRType2(Entry.Type));
- write(TargetObjectWriter->getRType(Entry.Type));
+ write(OWriter.TargetObjectWriter->getRSsym(Entry.Type));
+ write(OWriter.TargetObjectWriter->getRType3(Entry.Type));
+ write(OWriter.TargetObjectWriter->getRType2(Entry.Type));
+ write(OWriter.TargetObjectWriter->getRType(Entry.Type));
} else {
struct ELF::Elf64_Rela ERE64;
ERE64.setSymbolAndType(Index, Entry.Type);
@@ -1096,15 +935,17 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm,
if (hasRelocationAddend())
write(uint32_t(Entry.Addend));
- if (TargetObjectWriter->getEMachine() == ELF::EM_MIPS) {
- if (uint32_t RType = TargetObjectWriter->getRType2(Entry.Type)) {
+ if (OWriter.TargetObjectWriter->getEMachine() == ELF::EM_MIPS) {
+ if (uint32_t RType =
+ OWriter.TargetObjectWriter->getRType2(Entry.Type)) {
write(uint32_t(Entry.Offset));
ERE32.setSymbolAndType(0, RType);
write(ERE32.r_info);
write(uint32_t(0));
}
- if (uint32_t RType = TargetObjectWriter->getRType3(Entry.Type)) {
+ if (uint32_t RType =
+ OWriter.TargetObjectWriter->getRType3(Entry.Type)) {
write(uint32_t(Entry.Offset));
ERE32.setSymbolAndType(0, RType);
@@ -1116,15 +957,15 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm,
}
}
-const MCSectionELF *ELFObjectWriter::createStringTable(MCContext &Ctx) {
+const MCSectionELF *ELFWriter::createStringTable(MCContext &Ctx) {
const MCSectionELF *StrtabSection = SectionTable[StringTableIndex - 1];
- StrTabBuilder.write(getStream());
+ StrTabBuilder.write(W.OS);
return StrtabSection;
}
-void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
- uint32_t GroupSymbolIndex, uint64_t Offset,
- uint64_t Size, const MCSectionELF &Section) {
+void ELFWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
+ uint32_t GroupSymbolIndex, uint64_t Offset,
+ uint64_t Size, const MCSectionELF &Section) {
uint64_t sh_link = 0;
uint64_t sh_info = 0;
@@ -1146,12 +987,13 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
}
case ELF::SHT_SYMTAB:
- case ELF::SHT_DYNSYM:
sh_link = StringTableIndex;
sh_info = LastLocalSymbolIndex;
break;
case ELF::SHT_SYMTAB_SHNDX:
+ case ELF::SHT_LLVM_CALL_GRAPH_PROFILE:
+ case ELF::SHT_LLVM_ADDRSIG:
sh_link = SymbolTableIndex;
break;
@@ -1173,7 +1015,7 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
Section.getEntrySize());
}
-void ELFObjectWriter::writeSectionHeader(
+void ELFWriter::writeSectionHeader(
const MCAsmLayout &Layout, const SectionIndexMapTy &SectionIndexMap,
const SectionOffsetsTy &SectionOffsets) {
const unsigned NumSections = SectionTable.size();
@@ -1204,8 +1046,9 @@ void ELFObjectWriter::writeSectionHeader(
}
}
-void ELFObjectWriter::writeObject(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
+uint64_t ELFWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) {
+ uint64_t StartOffset = W.OS.tell();
+
MCContext &Ctx = Asm.getContext();
MCSectionELF *StrtabSection =
Ctx.getELFSection(".strtab", ELF::SHT_STRTAB, 0);
@@ -1225,16 +1068,20 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm,
std::vector<MCSectionELF *> Relocations;
for (MCSection &Sec : Asm) {
MCSectionELF &Section = static_cast<MCSectionELF &>(Sec);
+ if (Mode == NonDwoOnly && isDwoSection(Section))
+ continue;
+ if (Mode == DwoOnly && !isDwoSection(Section))
+ continue;
align(Section.getAlignment());
// Remember the offset into the file for this section.
- uint64_t SecStart = getStream().tell();
+ uint64_t SecStart = W.OS.tell();
const MCSymbolELF *SignatureSymbol = Section.getGroup();
writeSectionData(Asm, Section, Layout);
- uint64_t SecEnd = getStream().tell();
+ uint64_t SecEnd = W.OS.tell();
SectionOffsets[&Section] = std::make_pair(SecStart, SecEnd);
MCSectionELF *RelSection = createRelocationSection(Ctx, Section);
@@ -1262,11 +1109,19 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm,
}
}
+ MCSectionELF *CGProfileSection = nullptr;
+ if (!Asm.CGProfile.empty()) {
+ CGProfileSection = Ctx.getELFSection(".llvm.call-graph-profile",
+ ELF::SHT_LLVM_CALL_GRAPH_PROFILE,
+ ELF::SHF_EXCLUDE, 16, "");
+ SectionIndexMap[CGProfileSection] = addToSectionTable(CGProfileSection);
+ }
+
for (MCSectionELF *Group : Groups) {
align(Group->getAlignment());
// Remember the offset into the file for this section.
- uint64_t SecStart = getStream().tell();
+ uint64_t SecStart = W.OS.tell();
const MCSymbol *SignatureSymbol = Group->getGroup();
assert(SignatureSymbol);
@@ -1276,65 +1131,364 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm,
write(SecIndex);
}
- uint64_t SecEnd = getStream().tell();
+ uint64_t SecEnd = W.OS.tell();
SectionOffsets[Group] = std::make_pair(SecStart, SecEnd);
}
- // Compute symbol table information.
- computeSymbolTable(Asm, Layout, SectionIndexMap, RevGroupMap, SectionOffsets);
+ if (Mode == DwoOnly) {
+ // dwo files don't have symbol tables or relocations, but they do have
+ // string tables.
+ StrTabBuilder.finalize();
+ } else {
+ MCSectionELF *AddrsigSection;
+ if (OWriter.EmitAddrsigSection) {
+ AddrsigSection = Ctx.getELFSection(".llvm_addrsig", ELF::SHT_LLVM_ADDRSIG,
+ ELF::SHF_EXCLUDE);
+ addToSectionTable(AddrsigSection);
+ }
+
+ // Compute symbol table information.
+ computeSymbolTable(Asm, Layout, SectionIndexMap, RevGroupMap,
+ SectionOffsets);
- for (MCSectionELF *RelSection : Relocations) {
- align(RelSection->getAlignment());
+ for (MCSectionELF *RelSection : Relocations) {
+ align(RelSection->getAlignment());
- // Remember the offset into the file for this section.
- uint64_t SecStart = getStream().tell();
+ // Remember the offset into the file for this section.
+ uint64_t SecStart = W.OS.tell();
- writeRelocations(Asm,
- cast<MCSectionELF>(*RelSection->getAssociatedSection()));
+ writeRelocations(Asm,
+ cast<MCSectionELF>(*RelSection->getAssociatedSection()));
- uint64_t SecEnd = getStream().tell();
- SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd);
+ uint64_t SecEnd = W.OS.tell();
+ SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd);
+ }
+
+ if (OWriter.EmitAddrsigSection) {
+ uint64_t SecStart = W.OS.tell();
+ writeAddrsigSection();
+ uint64_t SecEnd = W.OS.tell();
+ SectionOffsets[AddrsigSection] = std::make_pair(SecStart, SecEnd);
+ }
+ }
+
+ if (CGProfileSection) {
+ uint64_t SecStart = W.OS.tell();
+ for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
+ W.write<uint32_t>(CGPE.From->getSymbol().getIndex());
+ W.write<uint32_t>(CGPE.To->getSymbol().getIndex());
+ W.write<uint64_t>(CGPE.Count);
+ }
+ uint64_t SecEnd = W.OS.tell();
+ SectionOffsets[CGProfileSection] = std::make_pair(SecStart, SecEnd);
}
{
- uint64_t SecStart = getStream().tell();
+ uint64_t SecStart = W.OS.tell();
const MCSectionELF *Sec = createStringTable(Ctx);
- uint64_t SecEnd = getStream().tell();
+ uint64_t SecEnd = W.OS.tell();
SectionOffsets[Sec] = std::make_pair(SecStart, SecEnd);
}
uint64_t NaturalAlignment = is64Bit() ? 8 : 4;
align(NaturalAlignment);
- const uint64_t SectionHeaderOffset = getStream().tell();
+ const uint64_t SectionHeaderOffset = W.OS.tell();
// ... then the section header table ...
writeSectionHeader(Layout, SectionIndexMap, SectionOffsets);
- uint16_t NumSections = (SectionTable.size() + 1 >= ELF::SHN_LORESERVE)
- ? (uint16_t)ELF::SHN_UNDEF
- : SectionTable.size() + 1;
- if (sys::IsLittleEndianHost != IsLittleEndian)
- sys::swapByteOrder(NumSections);
+ uint16_t NumSections = support::endian::byte_swap<uint16_t>(
+ (SectionTable.size() + 1 >= ELF::SHN_LORESERVE) ? (uint16_t)ELF::SHN_UNDEF
+ : SectionTable.size() + 1,
+ W.Endian);
unsigned NumSectionsOffset;
+ auto &Stream = static_cast<raw_pwrite_stream &>(W.OS);
if (is64Bit()) {
- uint64_t Val = SectionHeaderOffset;
- if (sys::IsLittleEndianHost != IsLittleEndian)
- sys::swapByteOrder(Val);
- getStream().pwrite(reinterpret_cast<char *>(&Val), sizeof(Val),
- offsetof(ELF::Elf64_Ehdr, e_shoff));
+ uint64_t Val =
+ support::endian::byte_swap<uint64_t>(SectionHeaderOffset, W.Endian);
+ Stream.pwrite(reinterpret_cast<char *>(&Val), sizeof(Val),
+ offsetof(ELF::Elf64_Ehdr, e_shoff));
NumSectionsOffset = offsetof(ELF::Elf64_Ehdr, e_shnum);
} else {
- uint32_t Val = SectionHeaderOffset;
- if (sys::IsLittleEndianHost != IsLittleEndian)
- sys::swapByteOrder(Val);
- getStream().pwrite(reinterpret_cast<char *>(&Val), sizeof(Val),
- offsetof(ELF::Elf32_Ehdr, e_shoff));
+ uint32_t Val =
+ support::endian::byte_swap<uint32_t>(SectionHeaderOffset, W.Endian);
+ Stream.pwrite(reinterpret_cast<char *>(&Val), sizeof(Val),
+ offsetof(ELF::Elf32_Ehdr, e_shoff));
NumSectionsOffset = offsetof(ELF::Elf32_Ehdr, e_shnum);
}
- getStream().pwrite(reinterpret_cast<char *>(&NumSections),
- sizeof(NumSections), NumSectionsOffset);
+ Stream.pwrite(reinterpret_cast<char *>(&NumSections), sizeof(NumSections),
+ NumSectionsOffset);
+
+ return W.OS.tell() - StartOffset;
+}
+
+bool ELFObjectWriter::hasRelocationAddend() const {
+ return TargetObjectWriter->hasRelocationAddend();
+}
+
+void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ // The presence of symbol versions causes undefined symbols and
+ // versions declared with @@@ to be renamed.
+ for (const std::pair<StringRef, const MCSymbol *> &P : Asm.Symvers) {
+ StringRef AliasName = P.first;
+ const auto &Symbol = cast<MCSymbolELF>(*P.second);
+ size_t Pos = AliasName.find('@');
+ assert(Pos != StringRef::npos);
+
+ StringRef Prefix = AliasName.substr(0, Pos);
+ StringRef Rest = AliasName.substr(Pos);
+ StringRef Tail = Rest;
+ if (Rest.startswith("@@@"))
+ Tail = Rest.substr(Symbol.isUndefined() ? 2 : 1);
+
+ auto *Alias =
+ cast<MCSymbolELF>(Asm.getContext().getOrCreateSymbol(Prefix + Tail));
+ Asm.registerSymbol(*Alias);
+ const MCExpr *Value = MCSymbolRefExpr::create(&Symbol, Asm.getContext());
+ Alias->setVariableValue(Value);
+
+ // Aliases defined with .symver copy the binding from the symbol they alias.
+ // This is the first place we are able to copy this information.
+ Alias->setExternal(Symbol.isExternal());
+ Alias->setBinding(Symbol.getBinding());
+
+ if (!Symbol.isUndefined() && !Rest.startswith("@@@"))
+ continue;
+
+ // FIXME: produce a better error message.
+ if (Symbol.isUndefined() && Rest.startswith("@@") &&
+ !Rest.startswith("@@@"))
+ report_fatal_error("A @@ version cannot be undefined");
+
+ if (Renames.count(&Symbol) && Renames[&Symbol] != Alias)
+ report_fatal_error(llvm::Twine("Multiple symbol versions defined for ") +
+ Symbol.getName());
+
+ Renames.insert(std::make_pair(&Symbol, Alias));
+ }
+
+ for (const MCSymbol *&Sym : AddrsigSyms) {
+ if (const MCSymbol *R = Renames.lookup(cast<MCSymbolELF>(Sym)))
+ Sym = R;
+ Sym->setUsedInReloc();
+ }
+}
+
+// It is always valid to create a relocation with a symbol. It is preferable
+// to use a relocation with a section if that is possible. Using the section
+// allows us to omit some local symbols from the symbol table.
+bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
+ const MCSymbolRefExpr *RefA,
+ const MCSymbolELF *Sym,
+ uint64_t C,
+ unsigned Type) const {
+ // A PCRel relocation to an absolute value has no symbol (or section). We
+ // represent that with a relocation to a null section.
+ if (!RefA)
+ return false;
+
+ MCSymbolRefExpr::VariantKind Kind = RefA->getKind();
+ switch (Kind) {
+ default:
+ break;
+ // The .odp creation emits a relocation against the symbol ".TOC." which
+ // creates an R_PPC64_TOC relocation. However, the relocation symbol name
+ // in final object creation should be NULL, since the symbol does not
+ // really exist, it is just the reference to TOC base for the current
+ // object file. Since the symbol is undefined, returning false results
+ // in a relocation with a null section which is the desired result.
+ case MCSymbolRefExpr::VK_PPC_TOCBASE:
+ return false;
+
+ // These VariantKinds cause the relocation to refer to something other than
+ // the symbol itself, like a linker-generated table. Since the address of the
+ // symbol is not relevant, we cannot replace the symbol with the
+ // section and patch the difference in the addend.
+ case MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_PLT:
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ case MCSymbolRefExpr::VK_PPC_GOT_LO:
+ case MCSymbolRefExpr::VK_PPC_GOT_HI:
+ case MCSymbolRefExpr::VK_PPC_GOT_HA:
+ return true;
+ }
+
+ // An undefined symbol is not in any section, so the relocation has to point
+ // to the symbol itself.
+ assert(Sym && "Expected a symbol");
+ if (Sym->isUndefined())
+ return true;
+
+ unsigned Binding = Sym->getBinding();
+ switch(Binding) {
+ default:
+ llvm_unreachable("Invalid Binding");
+ case ELF::STB_LOCAL:
+ break;
+ case ELF::STB_WEAK:
+ // If the symbol is weak, it might be overridden by a symbol in another
+ // file. The relocation has to point to the symbol so that the linker
+ // can update it.
+ return true;
+ case ELF::STB_GLOBAL:
+ // Global ELF symbols can be preempted by the dynamic linker. The relocation
+ // has to point to the symbol for a reason analogous to the STB_WEAK case.
+ return true;
+ }
+
+ // If a relocation points to a mergeable section, we have to be careful.
+ // If the offset is zero, a relocation with the section will encode the
+ // same information. With a non-zero offset, the situation is different.
+ // For example, a relocation can point 42 bytes past the end of a string.
+ // If we change such a relocation to use the section, the linker would think
+ // that it pointed to another string and subtracting 42 at runtime will
+ // produce the wrong value.
+ if (Sym->isInSection()) {
+ auto &Sec = cast<MCSectionELF>(Sym->getSection());
+ unsigned Flags = Sec.getFlags();
+ if (Flags & ELF::SHF_MERGE) {
+ if (C != 0)
+ return true;
+
+ // It looks like gold has a bug (http://sourceware.org/PR16794) and can
+ // only handle section relocations to mergeable sections if using RELA.
+ if (!hasRelocationAddend())
+ return true;
+ }
+
+ // Most TLS relocations use a GOT, so they need the symbol. Even those that
+ // are just an offset (@tpoff) require a symbol in gold versions before
+ // 5efeedf61e4fe720fd3e9a08e6c91c10abb66d42 (2014-09-26) which fixed
+ // http://sourceware.org/PR16773.
+ if (Flags & ELF::SHF_TLS)
+ return true;
+ }
+
+ // If the symbol is a thumb function the final relocation must set the lowest
+ // bit. With a symbol that is done by just having the symbol have that bit
+ // set, so we would lose the bit if we relocated with the section.
+ // FIXME: We could use the section but add the bit to the relocation value.
+ if (Asm.isThumbFunc(Sym))
+ return true;
+
+ if (TargetObjectWriter->needsRelocateWithSymbol(*Sym, Type))
+ return true;
+ return false;
+}
+
+void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ MCAsmBackend &Backend = Asm.getBackend();
+ bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsPCRel;
+ const MCSectionELF &FixupSection = cast<MCSectionELF>(*Fragment->getParent());
+ uint64_t C = Target.getConstant();
+ uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ MCContext &Ctx = Asm.getContext();
+
+ if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
+ // Let A, B and C be the components of Target and R be the location of
+ // the fixup. If the fixup is not pcrel, we want to compute (A - B + C).
+ // If it is pcrel, we want to compute (A - B + C - R).
+
+ // In general, ELF has no relocations for -B. It can only represent (A + C)
+ // or (A + C - R). If B = R + K and the relocation is not pcrel, we can
+ // replace B to implement it: (A - R - K + C)
+ if (IsPCRel) {
+ Ctx.reportError(
+ Fixup.getLoc(),
+ "No relocation available to represent this relative expression");
+ return;
+ }
+
+ const auto &SymB = cast<MCSymbolELF>(RefB->getSymbol());
+
+ if (SymB.isUndefined()) {
+ Ctx.reportError(Fixup.getLoc(),
+ Twine("symbol '") + SymB.getName() +
+ "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ assert(!SymB.isAbsolute() && "Should have been folded");
+ const MCSection &SecB = SymB.getSection();
+ if (&SecB != &FixupSection) {
+ Ctx.reportError(Fixup.getLoc(),
+ "Cannot represent a difference across sections");
+ return;
+ }
+
+ uint64_t SymBOffset = Layout.getSymbolOffset(SymB);
+ uint64_t K = SymBOffset - FixupOffset;
+ IsPCRel = true;
+ C -= K;
+ }
+
+ // We either rejected the fixup or folded B into C at this point.
+ const MCSymbolRefExpr *RefA = Target.getSymA();
+ const auto *SymA = RefA ? cast<MCSymbolELF>(&RefA->getSymbol()) : nullptr;
+
+ bool ViaWeakRef = false;
+ if (SymA && SymA->isVariable()) {
+ const MCExpr *Expr = SymA->getVariableValue();
+ if (const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr)) {
+ if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF) {
+ SymA = cast<MCSymbolELF>(&Inner->getSymbol());
+ ViaWeakRef = true;
+ }
+ }
+ }
+
+ unsigned Type = TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel);
+ uint64_t OriginalC = C;
+ bool RelocateWithSymbol = shouldRelocateWithSymbol(Asm, RefA, SymA, C, Type);
+ if (!RelocateWithSymbol && SymA && !SymA->isUndefined())
+ C += Layout.getSymbolOffset(*SymA);
+
+ uint64_t Addend = 0;
+ if (hasRelocationAddend()) {
+ Addend = C;
+ C = 0;
+ }
+
+ FixedValue = C;
+
+ const MCSectionELF *SecA = (SymA && SymA->isInSection())
+ ? cast<MCSectionELF>(&SymA->getSection())
+ : nullptr;
+ if (!checkRelocation(Ctx, Fixup.getLoc(), &FixupSection, SecA))
+ return;
+
+ if (!RelocateWithSymbol) {
+ const auto *SectionSymbol =
+ SecA ? cast<MCSymbolELF>(SecA->getBeginSymbol()) : nullptr;
+ if (SectionSymbol)
+ SectionSymbol->setUsedInReloc();
+ ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend, SymA,
+ OriginalC);
+ Relocations[&FixupSection].push_back(Rec);
+ return;
+ }
+
+ const auto *RenamedSymA = SymA;
+ if (SymA) {
+ if (const MCSymbolELF *R = Renames.lookup(SymA))
+ RenamedSymA = R;
+
+ if (ViaWeakRef)
+ RenamedSymA->setIsWeakrefUsedInReloc();
+ else
+ RenamedSymA->setUsedInReloc();
+ }
+ ELFRelocationEntry Rec(FixupOffset, RenamedSymA, Type, Addend, SymA,
+ OriginalC);
+ Relocations[&FixupSection].push_back(Rec);
}
bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
@@ -1353,6 +1507,14 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
std::unique_ptr<MCObjectWriter>
llvm::createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian) {
- return llvm::make_unique<ELFObjectWriter>(std::move(MOTW), OS,
- IsLittleEndian);
+ return llvm::make_unique<ELFSingleObjectWriter>(std::move(MOTW), OS,
+ IsLittleEndian);
+}
+
+std::unique_ptr<MCObjectWriter>
+llvm::createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+ raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
+ bool IsLittleEndian) {
+ return llvm::make_unique<ELFDwoObjectWriter>(std::move(MOTW), OS, DwoOS,
+ IsLittleEndian);
}
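
[Editor's note: illustrative sketch, not part of the patch.] The rewritten ELFWriter above drops the manual sys::IsLittleEndianHost / sys::swapByteOrder dance in favour of support::endian::byte_swap, and back-patches fixed ELF header fields through raw_pwrite_stream::pwrite. A minimal stand-alone version of that pattern, assuming a raw_pwrite_stream that already holds an emitted Elf64_Ehdr, could look like:

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/raw_ostream.h"
#include <cstddef>

// Patch e_shoff in a previously written ELF64 header once the section header
// table offset is known, converting to the object's byte order first.
static void patchSectionHeaderOffset(llvm::raw_pwrite_stream &OS,
                                     uint64_t SectionHeaderOffset,
                                     llvm::support::endianness Endian) {
  uint64_t Val =
      llvm::support::endian::byte_swap<uint64_t>(SectionHeaderOffset, Endian);
  OS.pwrite(reinterpret_cast<char *>(&Val), sizeof(Val),
            offsetof(llvm::ELF::Elf64_Ehdr, e_shoff));
}
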
diff --git a/contrib/llvm/lib/MC/MCAsmBackend.cpp b/contrib/llvm/lib/MC/MCAsmBackend.cpp
index b4a4d0a89966..92d3a8a2645f 100644
--- a/contrib/llvm/lib/MC/MCAsmBackend.cpp
+++ b/contrib/llvm/lib/MC/MCAsmBackend.cpp
@@ -11,20 +11,54 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCCodePadder.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
using namespace llvm;
-MCAsmBackend::MCAsmBackend() : CodePadder(new MCCodePadder()) {}
-
-MCAsmBackend::MCAsmBackend(std::unique_ptr<MCCodePadder> TargetCodePadder)
- : CodePadder(std::move(TargetCodePadder)) {}
+MCAsmBackend::MCAsmBackend(support::endianness Endian)
+ : CodePadder(new MCCodePadder()), Endian(Endian) {}
MCAsmBackend::~MCAsmBackend() = default;
+std::unique_ptr<MCObjectWriter>
+MCAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+ auto TW = createObjectTargetWriter();
+ switch (TW->getFormat()) {
+ case Triple::ELF:
+ return createELFObjectWriter(cast<MCELFObjectTargetWriter>(std::move(TW)), OS,
+ Endian == support::little);
+ case Triple::MachO:
+ return createMachObjectWriter(cast<MCMachObjectTargetWriter>(std::move(TW)),
+ OS, Endian == support::little);
+ case Triple::COFF:
+ return createWinCOFFObjectWriter(
+ cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS);
+ case Triple::Wasm:
+ return createWasmObjectWriter(cast<MCWasmObjectTargetWriter>(std::move(TW)),
+ OS);
+ default:
+ llvm_unreachable("unexpected object format");
+ }
+}
+
+std::unique_ptr<MCObjectWriter>
+MCAsmBackend::createDwoObjectWriter(raw_pwrite_stream &OS,
+ raw_pwrite_stream &DwoOS) const {
+ auto TW = createObjectTargetWriter();
+ if (TW->getFormat() != Triple::ELF)
+ report_fatal_error("dwo only supported with ELF");
+ return createELFDwoObjectWriter(cast<MCELFObjectTargetWriter>(std::move(TW)),
+ OS, DwoOS, Endian == support::little);
+}
+
Optional<MCFixupKind> MCAsmBackend::getFixupKind(StringRef Name) const {
return None;
}
@@ -50,7 +84,15 @@ const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"FK_SecRel_1", 0, 8, 0},
{"FK_SecRel_2", 0, 16, 0},
{"FK_SecRel_4", 0, 32, 0},
- {"FK_SecRel_8", 0, 64, 0}};
+ {"FK_SecRel_8", 0, 64, 0},
+ {"FK_Data_Add_1", 0, 8, 0},
+ {"FK_Data_Add_2", 0, 16, 0},
+ {"FK_Data_Add_4", 0, 32, 0},
+ {"FK_Data_Add_8", 0, 64, 0},
+ {"FK_Data_Sub_1", 0, 8, 0},
+ {"FK_Data_Sub_2", 0, 16, 0},
+ {"FK_Data_Sub_4", 0, 32, 0},
+ {"FK_Data_Sub_8", 0, 64, 0}};
assert((size_t)Kind <= array_lengthof(Builtins) && "Unknown fixup kind");
return Builtins[Kind];
@@ -58,7 +100,8 @@ const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
bool MCAsmBackend::fixupNeedsRelaxationAdvanced(
const MCFixup &Fixup, bool Resolved, uint64_t Value,
- const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const {
+ const MCRelaxableFragment *DF, const MCAsmLayout &Layout,
+ const bool WasForced) const {
if (!Resolved)
return true;
return fixupNeedsRelaxation(Fixup, Value, DF, Layout);
@@ -84,4 +127,4 @@ void MCAsmBackend::handleCodePaddingInstructionEnd(const MCInst &Inst) {
bool MCAsmBackend::relaxFragment(MCPaddingFragment *PF, MCAsmLayout &Layout) {
return CodePadder->relaxFragment(PF, Layout);
-}
\ No newline at end of file
+}
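
[Editor's note: usage sketch, not part of the patch.] With the new createObjectWriter / createDwoObjectWriter hooks above, a streamer no longer wires up a format-specific writer itself; it asks the backend, which dispatches on the target writer's object format. This is also why MCAsmBackend now takes a support::endianness in its constructor: the writer it creates needs it. A hypothetical caller (the helper name is invented for illustration):

#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

// Ask the backend for a writer matching its object format (ELF, Mach-O,
// COFF or Wasm); endianness now comes from the backend itself.
static std::unique_ptr<llvm::MCObjectWriter>
makeWriterFor(const llvm::MCAsmBackend &Backend, llvm::raw_pwrite_stream &OS) {
  return Backend.createObjectWriter(OS);
}
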
diff --git a/contrib/llvm/lib/MC/MCAsmInfo.cpp b/contrib/llvm/lib/MC/MCAsmInfo.cpp
index f05904048e0b..30f22d2d68f4 100644
--- a/contrib/llvm/lib/MC/MCAsmInfo.cpp
+++ b/contrib/llvm/lib/MC/MCAsmInfo.cpp
@@ -17,9 +17,18 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+enum DefaultOnOff { Default, Enable, Disable };
+static cl::opt<DefaultOnOff> DwarfExtendedLoc(
+ "dwarf-extended-loc", cl::Hidden,
+ cl::desc("Disable emission of the extended flags in .loc directives."),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"), clEnumVal(Disable, "Disabled")),
+ cl::init(Default));
+
MCAsmInfo::MCAsmInfo() {
SeparatorString = ";";
CommentString = "#";
@@ -41,6 +50,8 @@ MCAsmInfo::MCAsmInfo() {
Data64bitsDirective = "\t.quad\t";
GlobalDirective = "\t.globl\t";
WeakDirective = "\t.weak\t";
+ if (DwarfExtendedLoc != Default)
+ SupportsExtendedDwarfLocDirective = DwarfExtendedLoc == Enable;
// FIXME: Clang's logic should be synced with the logic used to initialize
// this member and the two implementations should be merged.
diff --git a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
index 85104484fd40..d8fb875b67c6 100644
--- a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
+++ b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
@@ -41,6 +41,15 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
// At least MSVC inline-asm does AShr.
UseLogicalShr = false;
+
+ // If this is a COFF target, assume that it supports associative comdats. It's
+ // part of the spec.
+ HasCOFFAssociativeComdats = true;
+
+ // We can generate constants in comdat sections that can be shared,
+ // but in order not to create null typed symbols, we actually need to
+ // make them global symbols as well.
+ HasCOFFComdatConstants = true;
}
void MCAsmInfoMicrosoft::anchor() {}
@@ -49,4 +58,12 @@ MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default;
void MCAsmInfoGNUCOFF::anchor() {}
-MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() = default;
+MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() {
+ // If this is a GNU environment (mingw or cygwin), don't use associative
+ // comdats for jump tables, unwind information, and other data associated with
+ // a function.
+ HasCOFFAssociativeComdats = false;
+
+ // We don't create constants in comdat sections for MinGW.
+ HasCOFFComdatConstants = false;
+}
diff --git a/contrib/llvm/lib/MC/MCAsmMacro.cpp b/contrib/llvm/lib/MC/MCAsmMacro.cpp
new file mode 100644
index 000000000000..7e89c03c6c6b
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAsmMacro.cpp
@@ -0,0 +1,42 @@
+//===- MCAsmMacro.cpp - Assembly Macros -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmMacro.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void MCAsmMacroParameter::dump(raw_ostream &OS) const {
+ OS << "\"" << Name << "\"";
+ if (Required)
+ OS << ":req";
+ if (Vararg)
+ OS << ":vararg";
+ if (!Value.empty()) {
+ OS << " = ";
+ bool first = true;
+ for (const AsmToken &T : Value) {
+ if (!first)
+ OS << ", ";
+ first = false;
+ OS << T.getString();
+ }
+ }
+ OS << "\n";
+}
+
+void MCAsmMacro::dump(raw_ostream &OS) const {
+ OS << "Macro " << Name << ":\n";
+ OS << " Parameters:\n";
+ for (const MCAsmMacroParameter &P : Parameters) {
+ OS << " ";
+ P.dump();
+ }
+ OS << " (BEGIN BODY)" << Body << "(END BODY)\n";
+}
diff --git a/contrib/llvm/lib/MC/MCAsmStreamer.cpp b/contrib/llvm/lib/MC/MCAsmStreamer.cpp
index 6f045a4b10ba..92f615180561 100644
--- a/contrib/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCAsmStreamer.cpp
@@ -7,12 +7,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCContext.h"
@@ -21,6 +23,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
@@ -30,6 +33,7 @@
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
#include <cctype>
using namespace llvm;
@@ -41,12 +45,12 @@ class MCAsmStreamer final : public MCStreamer {
formatted_raw_ostream &OS;
const MCAsmInfo *MAI;
std::unique_ptr<MCInstPrinter> InstPrinter;
- std::unique_ptr<MCCodeEmitter> Emitter;
- std::unique_ptr<MCAsmBackend> AsmBackend;
+ std::unique_ptr<MCAssembler> Assembler;
SmallString<128> ExplicitCommentToEmit;
SmallString<128> CommentToEmit;
raw_svector_ostream CommentStream;
+ raw_null_ostream NullStream;
unsigned IsVerboseAsm : 1;
unsigned ShowInst : 1;
@@ -59,18 +63,24 @@ class MCAsmStreamer final : public MCStreamer {
public:
MCAsmStreamer(MCContext &Context, std::unique_ptr<formatted_raw_ostream> os,
bool isVerboseAsm, bool useDwarfDirectory,
- MCInstPrinter *printer, MCCodeEmitter *emitter,
- MCAsmBackend *asmbackend, bool showInst)
+ MCInstPrinter *printer, std::unique_ptr<MCCodeEmitter> emitter,
+ std::unique_ptr<MCAsmBackend> asmbackend, bool showInst)
: MCStreamer(Context), OSOwner(std::move(os)), OS(*OSOwner),
- MAI(Context.getAsmInfo()), InstPrinter(printer), Emitter(emitter),
- AsmBackend(asmbackend), CommentStream(CommentToEmit),
- IsVerboseAsm(isVerboseAsm), ShowInst(showInst),
- UseDwarfDirectory(useDwarfDirectory) {
+ MAI(Context.getAsmInfo()), InstPrinter(printer),
+ Assembler(llvm::make_unique<MCAssembler>(
+ Context, std::move(asmbackend), std::move(emitter),
+ (asmbackend) ? asmbackend->createObjectWriter(NullStream)
+ : nullptr)),
+ CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
+ ShowInst(showInst), UseDwarfDirectory(useDwarfDirectory) {
assert(InstPrinter);
if (IsVerboseAsm)
InstPrinter->setCommentStream(CommentStream);
}
+ MCAssembler &getAssembler() { return *Assembler; }
+ MCAssembler *getAssemblerPtr() override { return nullptr; }
+
inline void EmitEOL() {
// Dump Explicit Comments here.
emitExplicitComments();
@@ -86,26 +96,24 @@ public:
void EmitCommentsAndEOL();
- /// isVerboseAsm - Return true if this streamer supports verbose assembly at
- /// all.
+ /// Return true if this streamer supports verbose assembly at all.
bool isVerboseAsm() const override { return IsVerboseAsm; }
- /// hasRawTextSupport - We support EmitRawText.
+ /// Do we support EmitRawText?
bool hasRawTextSupport() const override { return true; }
- /// AddComment - Add a comment that can be emitted to the generated .s
- /// file if applicable as a QoI issue to make the output of the compiler
- /// more readable. This only affects the MCAsmStreamer, and only when
- /// verbose assembly output is enabled.
+ /// Add a comment that can be emitted to the generated .s file to make the
+ /// output of the compiler more readable. This only affects the MCAsmStreamer
+ /// and only when verbose assembly output is enabled.
void AddComment(const Twine &T, bool EOL = true) override;
- /// AddEncodingComment - Add a comment showing the encoding of an instruction.
- /// If PrintSchedInfo - is true then the comment sched:[x:y] should
- // be added to output if it's being supported by target
+ /// Add a comment showing the encoding of an instruction.
+ /// If PrintSchedInfo is true, then the comment sched:[x:y] will be added to
+ /// the output if supported by the target.
void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &,
bool PrintSchedInfo);
- /// GetCommentOS - Return a raw_ostream that comments can be written to.
+ /// Return a raw_ostream that comments can be written to.
/// Unlike AddComment, you are required to terminate comments with \n if you
/// use this method.
raw_ostream &GetCommentOS() override {
@@ -119,7 +127,7 @@ public:
void addExplicitComment(const Twine &T) override;
void emitExplicitComments() override;
- /// AddBlankLine - Emit a blank line to a .s file to pretty it up.
+ /// Emit a blank line to a .s file to pretty it up.
void AddBlankLine() override {
EmitEOL();
}
@@ -154,13 +162,15 @@ public:
void EmitCOFFSymbolType(int Type) override;
void EndCOFFSymbolDef() override;
void EmitCOFFSafeSEH(MCSymbol const *Symbol) override;
+ void EmitCOFFSymbolIndex(MCSymbol const *Symbol) override;
void EmitCOFFSectionIndex(MCSymbol const *Symbol) override;
void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
+ void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
void emitELFSize(MCSymbol *Symbol, const MCExpr *Value) override;
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
- /// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol.
+ /// Emit a local common (.lcomm) symbol.
///
/// @param Symbol - The common symbol to emit.
/// @param Size - The size of the common symbol.
@@ -169,7 +179,8 @@ public:
unsigned ByteAlignment) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
- uint64_t Size = 0, unsigned ByteAlignment = 0) override;
+ uint64_t Size = 0, unsigned ByteAlignment = 0,
+ SMLoc Loc = SMLoc()) override;
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
@@ -198,8 +209,6 @@ public:
void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc = SMLoc()) override;
- void emitFill(uint64_t NumValues, int64_t Size, int64_t Expr) override;
-
void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
SMLoc Loc = SMLoc()) override;
@@ -215,9 +224,16 @@ public:
SMLoc Loc) override;
void EmitFileDirective(StringRef Filename) override;
- unsigned EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
- StringRef Filename,
- unsigned CUID = 0) override;
+ Expected<unsigned> tryEmitDwarfFileDirective(unsigned FileNo,
+ StringRef Directory,
+ StringRef Filename,
+ MD5::MD5Result *Checksum = 0,
+ Optional<StringRef> Source = None,
+ unsigned CUID = 0) override;
+ void emitDwarfFile0Directive(StringRef Directory, StringRef Filename,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned CUID = 0) override;
void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa, unsigned Discriminator,
@@ -290,6 +306,9 @@ public:
SMLoc Loc) override;
void EmitWinEHHandlerData(SMLoc Loc) override;
+ void emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To, uint64_t Count) override;
+
void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
bool PrintSchedInfo) override;
@@ -298,11 +317,15 @@ public:
void EmitBundleUnlock() override;
bool EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc Loc) override;
+ const MCExpr *Expr, SMLoc Loc,
+ const MCSubtargetInfo &STI) override;
+
+ void EmitAddrsig() override;
+ void EmitAddrsigSym(const MCSymbol *Sym) override;
- /// EmitRawText - If this file is backed by an assembly streamer, this dumps
- /// the specified string in the output .s file. This capability is
- /// indicated by the hasRawTextSupport() predicate.
+ /// If this file is backed by an assembly streamer, this dumps the specified
+ /// string in the output .s file. This capability is indicated by the
+ /// hasRawTextSupport() predicate.
void EmitRawTextImpl(StringRef String) override;
void FinishImpl() override;
@@ -310,11 +333,6 @@ public:
} // end anonymous namespace.
-/// AddComment - Add a comment that can be emitted to the generated .s
-/// file if applicable as a QoI issue to make the output of the compiler
-/// more readable. This only affects the MCAsmStreamer, and only when
-/// verbose assembly output is enabled.
-/// By deafult EOL is set to true so that each comment goes on its own line.
void MCAsmStreamer::AddComment(const Twine &T, bool EOL) {
if (!IsVerboseAsm) return;
@@ -536,11 +554,19 @@ void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
}
void MCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
- Symbol->print(OS, MAI);
- OS << " = ";
- Value->print(OS, MAI);
+ // Do not emit a .set on inlined target assignments.
+ bool EmitSet = true;
+ if (auto *E = dyn_cast<MCTargetExpr>(Value))
+ if (E->inlineAssignedExpr())
+ EmitSet = false;
+ if (EmitSet) {
+ OS << ".set ";
+ Symbol->print(OS, MAI);
+ OS << ", ";
+ Value->print(OS, MAI);
- EmitEOL();
+ EmitEOL();
+ }
MCStreamer::EmitAssignment(Symbol, Value);
}
@@ -576,7 +602,7 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_ELF_TypeObject: OS << "object"; break;
case MCSA_ELF_TypeTLS: OS << "tls_object"; break;
case MCSA_ELF_TypeCommon: OS << "common"; break;
- case MCSA_ELF_TypeNoType: OS << "no_type"; break;
+ case MCSA_ELF_TypeNoType: OS << "notype"; break;
case MCSA_ELF_TypeGnuUniqueObject: OS << "gnu_unique_object"; break;
}
EmitEOL();
@@ -661,6 +687,12 @@ void MCAsmStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {
EmitEOL();
}
+void MCAsmStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) {
+ OS << "\t.symidx\t";
+ Symbol->print(OS, MAI);
+ EmitEOL();
+}
+
void MCAsmStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) {
OS << "\t.secidx\t";
Symbol->print(OS, MAI);
@@ -675,6 +707,16 @@ void MCAsmStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {
EmitEOL();
}
+void MCAsmStreamer::EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {
+ OS << "\t.rva\t";
+ Symbol->print(OS, MAI);
+ if (Offset > 0)
+ OS << '+' << Offset;
+ else if (Offset < 0)
+ OS << '-' << -Offset;
+ EmitEOL();
+}
+
void MCAsmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
assert(MAI->hasDotTypeDotSizeDirective());
OS << "\t.size\t";
@@ -699,10 +741,6 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
EmitEOL();
}
-/// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol.
-///
-/// @param Symbol - The common symbol to emit.
-/// @param Size - The size of the common symbol.
void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlign) {
OS << "\t.lcomm\t";
@@ -726,14 +764,18 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
}
void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment,
+ SMLoc Loc) {
if (Symbol)
AssignFragment(Symbol, &Section->getDummyFragment());
// Note: a .zerofill directive does not switch sections.
OS << ".zerofill ";
+ assert(Section->getVariant() == MCSection::SV_MachO &&
+ ".zerofill is a Mach-O specific directive");
// This is a mach-o specific directive.
+
const MCSectionMachO *MOSection = ((const MCSectionMachO*)Section);
OS << MOSection->getSegmentName() << "," << MOSection->getSectionName();
@@ -756,7 +798,11 @@ void MCAsmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
assert(Symbol && "Symbol shouldn't be NULL!");
// Instead of using the Section we'll just use the shortcut.
+
+ assert(Section->getVariant() == MCSection::SV_MachO &&
+ ".tbss is a Mach-O specific directive");
// This is a mach-o specific directive and section.
+
OS << ".tbss ";
Symbol->print(OS, MAI);
OS << ", " << Size;
@@ -780,7 +826,7 @@ static void PrintQuotedString(StringRef Data, raw_ostream &OS) {
continue;
}
- if (isprint((unsigned char)C)) {
+ if (isPrint((unsigned char)C)) {
OS << (char)C;
continue;
}
@@ -915,7 +961,7 @@ void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
EmitULEB128IntValue(IntValue);
return;
}
- OS << ".uleb128 ";
+ OS << "\t.uleb128 ";
Value->print(OS, MAI);
EmitEOL();
}
@@ -926,7 +972,7 @@ void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
EmitSLEB128IntValue(IntValue);
return;
}
- OS << ".sleb128 ";
+ OS << "\t.sleb128 ";
Value->print(OS, MAI);
EmitEOL();
}
@@ -992,14 +1038,6 @@ void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
MCStreamer::emitFill(NumBytes, FillValue);
}
-void MCAsmStreamer::emitFill(uint64_t NumValues, int64_t Size, int64_t Expr) {
- if (NumValues == 0)
- return;
-
- const MCExpr *E = MCConstantExpr::create(NumValues, getContext());
- emitFill(*E, Size, Expr);
-}
-
void MCAsmStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
int64_t Expr, SMLoc Loc) {
// FIXME: Emit location directives
@@ -1086,20 +1124,12 @@ void MCAsmStreamer::EmitFileDirective(StringRef Filename) {
EmitEOL();
}
-unsigned MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo,
- StringRef Directory,
- StringRef Filename,
- unsigned CUID) {
- assert(CUID == 0);
-
- MCDwarfLineTable &Table = getContext().getMCDwarfLineTable(CUID);
- unsigned NumFiles = Table.getMCDwarfFiles().size();
- FileNo = Table.getFile(Directory, Filename, FileNo);
- if (FileNo == 0)
- return 0;
- if (NumFiles == Table.getMCDwarfFiles().size())
- return FileNo;
-
+static void printDwarfFileDirective(unsigned FileNo, StringRef Directory,
+ StringRef Filename,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ bool UseDwarfDirectory,
+ raw_svector_ostream &OS) {
SmallString<128> FullPathName;
if (!UseDwarfDirectory && !Directory.empty()) {
@@ -1113,51 +1143,102 @@ unsigned MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo,
}
}
- SmallString<128> Str;
- raw_svector_ostream OS1(Str);
- OS1 << "\t.file\t" << FileNo << ' ';
+ OS << "\t.file\t" << FileNo << ' ';
if (!Directory.empty()) {
- PrintQuotedString(Directory, OS1);
- OS1 << ' ';
+ PrintQuotedString(Directory, OS);
+ OS << ' ';
}
- PrintQuotedString(Filename, OS1);
- if (MCTargetStreamer *TS = getTargetStreamer()) {
+ PrintQuotedString(Filename, OS);
+ if (Checksum)
+ OS << " md5 0x" << Checksum->digest();
+ if (Source) {
+ OS << " source ";
+ PrintQuotedString(*Source, OS);
+ }
+}
+
+Expected<unsigned> MCAsmStreamer::tryEmitDwarfFileDirective(
+ unsigned FileNo, StringRef Directory, StringRef Filename,
+ MD5::MD5Result *Checksum, Optional<StringRef> Source, unsigned CUID) {
+ assert(CUID == 0 && "multiple CUs not supported by MCAsmStreamer");
+
+ MCDwarfLineTable &Table = getContext().getMCDwarfLineTable(CUID);
+ unsigned NumFiles = Table.getMCDwarfFiles().size();
+ Expected<unsigned> FileNoOrErr =
+ Table.tryGetFile(Directory, Filename, Checksum, Source, FileNo);
+ if (!FileNoOrErr)
+ return FileNoOrErr.takeError();
+ FileNo = FileNoOrErr.get();
+ if (NumFiles == Table.getMCDwarfFiles().size())
+ return FileNo;
+
+ SmallString<128> Str;
+ raw_svector_ostream OS1(Str);
+ printDwarfFileDirective(FileNo, Directory, Filename, Checksum, Source,
+ UseDwarfDirectory, OS1);
+
+ if (MCTargetStreamer *TS = getTargetStreamer())
TS->emitDwarfFileDirective(OS1.str());
- } else {
+ else
EmitRawText(OS1.str());
- }
return FileNo;
}
+void MCAsmStreamer::emitDwarfFile0Directive(StringRef Directory,
+ StringRef Filename,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned CUID) {
+ assert(CUID == 0);
+ // .file 0 is new for DWARF v5.
+ if (getContext().getDwarfVersion() < 5)
+ return;
+ // Inform MCDwarf about the root file.
+ getContext().setMCLineTableRootFile(CUID, Directory, Filename, Checksum,
+ Source);
+
+ SmallString<128> Str;
+ raw_svector_ostream OS1(Str);
+ printDwarfFileDirective(0, Directory, Filename, Checksum, Source,
+ UseDwarfDirectory, OS1);
+
+ if (MCTargetStreamer *TS = getTargetStreamer())
+ TS->emitDwarfFileDirective(OS1.str());
+ else
+ EmitRawText(OS1.str());
+}
+
void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa,
unsigned Discriminator,
StringRef FileName) {
OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
- if (Flags & DWARF2_FLAG_BASIC_BLOCK)
- OS << " basic_block";
- if (Flags & DWARF2_FLAG_PROLOGUE_END)
- OS << " prologue_end";
- if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
- OS << " epilogue_begin";
-
- unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
- if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
- OS << " is_stmt ";
+ if (MAI->supportsExtendedDwarfLocDirective()) {
+ if (Flags & DWARF2_FLAG_BASIC_BLOCK)
+ OS << " basic_block";
+ if (Flags & DWARF2_FLAG_PROLOGUE_END)
+ OS << " prologue_end";
+ if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
+ OS << " epilogue_begin";
+
+ unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
+ if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
+ OS << " is_stmt ";
+
+ if (Flags & DWARF2_FLAG_IS_STMT)
+ OS << "1";
+ else
+ OS << "0";
+ }
- if (Flags & DWARF2_FLAG_IS_STMT)
- OS << "1";
- else
- OS << "0";
+ if (Isa)
+ OS << " isa " << Isa;
+ if (Discriminator)
+ OS << " discriminator " << Discriminator;
}
- if (Isa)
- OS << " isa " << Isa;
- if (Discriminator)
- OS << " discriminator " << Discriminator;
-
if (IsVerboseAsm) {
OS.PadToColumn(MAI->getCommentColumn());
OS << MAI->getCommentString() << ' ' << FileName << ':'
@@ -1606,6 +1687,17 @@ void MCAsmStreamer::EmitWinCFIEndProlog(SMLoc Loc) {
EmitEOL();
}
+void MCAsmStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To,
+ uint64_t Count) {
+ OS << "\t.cg_profile ";
+ From->getSymbol().print(OS, MAI);
+ OS << ", ";
+ To->getSymbol().print(OS, MAI);
+ OS << ", " << Count;
+ EmitEOL();
+}
+
void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
const MCSubtargetInfo &STI,
bool PrintSchedInfo) {
@@ -1613,7 +1705,12 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
- Emitter->encodeInstruction(Inst, VecOS, Fixups, STI);
+
+ // If we have no code emitter, don't emit code.
+ if (!getAssembler().getEmitterPtr())
+ return;
+
+ getAssembler().getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);
// If we are showing fixups, create symbolic markers in the encoded
// representation. We do this by making a per-bit map to the fixup item index,
@@ -1625,7 +1722,8 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
MCFixup &F = Fixups[i];
- const MCFixupKindInfo &Info = AsmBackend->getFixupKindInfo(F.getKind());
+ const MCFixupKindInfo &Info =
+ getAssembler().getBackend().getFixupKindInfo(F.getKind());
for (unsigned j = 0; j != Info.TargetSize; ++j) {
unsigned Index = F.getOffset() * 8 + Info.TargetOffset + j;
assert(Index < Code.size() * 8 && "Invalid offset in fixup!");
@@ -1689,7 +1787,8 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
MCFixup &F = Fixups[i];
- const MCFixupKindInfo &Info = AsmBackend->getFixupKindInfo(F.getKind());
+ const MCFixupKindInfo &Info =
+ getAssembler().getBackend().getFixupKindInfo(F.getKind());
OS << " fixup " << char('A' + i) << " - " << "offset: " << F.getOffset()
<< ", value: " << *F.getValue() << ", kind: " << Info.Name << "\n";
}
@@ -1702,8 +1801,7 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
"Cannot emit contents before setting section!");
// Show the encoding in a comment if we have a code emitter.
- if (Emitter)
- AddEncodingComment(Inst, STI, PrintSchedInfo);
+ AddEncodingComment(Inst, STI, PrintSchedInfo);
// Show the MCInst if enabled.
if (ShowInst) {
@@ -1749,7 +1847,8 @@ void MCAsmStreamer::EmitBundleUnlock() {
}
bool MCAsmStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc) {
+ const MCExpr *Expr, SMLoc,
+ const MCSubtargetInfo &STI) {
OS << "\t.reloc ";
Offset.print(OS, MAI);
OS << ", " << Name;
@@ -1761,6 +1860,17 @@ bool MCAsmStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
return false;
}
+void MCAsmStreamer::EmitAddrsig() {
+ OS << "\t.addrsig";
+ EmitEOL();
+}
+
+void MCAsmStreamer::EmitAddrsigSym(const MCSymbol *Sym) {
+ OS << "\t.addrsig_sym ";
+ Sym->print(OS, MAI);
+ EmitEOL();
+}
+
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
@@ -1792,8 +1902,11 @@ void MCAsmStreamer::FinishImpl() {
MCStreamer *llvm::createAsmStreamer(MCContext &Context,
std::unique_ptr<formatted_raw_ostream> OS,
bool isVerboseAsm, bool useDwarfDirectory,
- MCInstPrinter *IP, MCCodeEmitter *CE,
- MCAsmBackend *MAB, bool ShowInst) {
+ MCInstPrinter *IP,
+ std::unique_ptr<MCCodeEmitter> &&CE,
+ std::unique_ptr<MCAsmBackend> &&MAB,
+ bool ShowInst) {
return new MCAsmStreamer(Context, std::move(OS), isVerboseAsm,
- useDwarfDirectory, IP, CE, MAB, ShowInst);
+ useDwarfDirectory, IP, std::move(CE), std::move(MAB),
+ ShowInst);
}
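
[Editor's note: usage sketch, not part of the patch; the symbol names are made up.] The new emitCGProfileEntry override above prints one .cg_profile edge per call. Driving it through the generic MCStreamer interface would look roughly like:

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"

// Record one call-graph-profile edge; with the asm streamer this prints
//   .cg_profile caller, callee, 1024
static void emitOneEdge(llvm::MCStreamer &S, llvm::MCContext &Ctx) {
  const llvm::MCSymbolRefExpr *From =
      llvm::MCSymbolRefExpr::create(Ctx.getOrCreateSymbol("caller"), Ctx);
  const llvm::MCSymbolRefExpr *To =
      llvm::MCSymbolRefExpr::create(Ctx.getOrCreateSymbol("callee"), Ctx);
  S.emitCGProfileEntry(From, To, /*Count=*/1024);
}
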
diff --git a/contrib/llvm/lib/MC/MCAssembler.cpp b/contrib/llvm/lib/MC/MCAssembler.cpp
index bd881b4d6e85..1470e026d985 100644
--- a/contrib/llvm/lib/MC/MCAssembler.cpp
+++ b/contrib/llvm/lib/MC/MCAssembler.cpp
@@ -83,9 +83,12 @@ STATISTIC(PaddingFragmentsBytes,
/* *** */
-MCAssembler::MCAssembler(MCContext &Context, MCAsmBackend &Backend,
- MCCodeEmitter &Emitter, MCObjectWriter &Writer)
- : Context(Context), Backend(Backend), Emitter(Emitter), Writer(Writer),
+MCAssembler::MCAssembler(MCContext &Context,
+ std::unique_ptr<MCAsmBackend> Backend,
+ std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> Writer)
+ : Context(Context), Backend(std::move(Backend)),
+ Emitter(std::move(Emitter)), Writer(std::move(Writer)),
BundleAlignSize(0), RelaxAll(false), SubsectionsViaSymbols(false),
IncrementalLinkerCompatible(false), ELFHeaderEFlags(0) {
VersionInfo.Major = 0; // Major version == 0 for "none specified"
@@ -110,9 +113,12 @@ void MCAssembler::reset() {
VersionInfo.Major = 0;
// reset objects owned by us
- getBackend().reset();
- getEmitter().reset();
- getWriter().reset();
+ if (getBackendPtr())
+ getBackendPtr()->reset();
+ if (getEmitterPtr())
+ getEmitterPtr()->reset();
+ if (getWriterPtr())
+ getWriterPtr()->reset();
getLOHContainer().reset();
}
@@ -191,7 +197,8 @@ const MCSymbol *MCAssembler::getAtom(const MCSymbol &S) const {
bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
const MCFixup &Fixup, const MCFragment *DF,
- MCValue &Target, uint64_t &Value) const {
+ MCValue &Target, uint64_t &Value,
+ bool &WasForced) const {
++stats::evaluateFixup;
// FIXME: This code has some duplication with recordRelocation. We should
@@ -203,6 +210,7 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
const MCExpr *Expr = Fixup.getValue();
MCContext &Ctx = getContext();
Value = 0;
+ WasForced = false;
if (!Expr->evaluateAsRelocatable(Target, &Layout, &Fixup)) {
Ctx.reportError(Fixup.getLoc(), "expected relocatable expression");
return true;
@@ -215,10 +223,11 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
}
}
- bool IsPCRel = Backend.getFixupKindInfo(
- Fixup.getKind()).Flags & MCFixupKindInfo::FKF_IsPCRel;
+ assert(getBackendPtr() && "Expected assembler backend");
+ bool IsPCRel = getBackendPtr()->getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsPCRel;
- bool IsResolved;
+ bool IsResolved = false;
if (IsPCRel) {
if (Target.getSymB()) {
IsResolved = false;
@@ -229,8 +238,8 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
const MCSymbol &SA = A->getSymbol();
if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) {
IsResolved = false;
- } else {
- IsResolved = getWriter().isSymbolRefDifferenceFullyResolvedImpl(
+ } else if (auto *Writer = getWriterPtr()) {
+ IsResolved = Writer->isSymbolRefDifferenceFullyResolvedImpl(
*this, SA, *DF, false, true);
}
}
@@ -251,8 +260,8 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
Value -= Layout.getSymbolOffset(Sym);
}
- bool ShouldAlignPC = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
+ bool ShouldAlignPC = getBackend().getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
assert((ShouldAlignPC ? IsPCRel : true) &&
"FKF_IsAlignedDownTo32Bits is only allowed on PC-relative fixups!");
@@ -266,14 +275,17 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
}
// Let the backend force a relocation if needed.
- if (IsResolved && Backend.shouldForceRelocation(*this, Fixup, Target))
+ if (IsResolved && getBackend().shouldForceRelocation(*this, Fixup, Target)) {
IsResolved = false;
+ WasForced = true;
+ }
return IsResolved;
}
uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
const MCFragment &F) const {
+ assert(getBackendPtr() && "Requires assembler backend");
switch (F.getKind()) {
case MCFragment::FT_Data:
return cast<MCDataFragment>(F).getContents().size();
@@ -283,10 +295,13 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
return cast<MCCompactEncodedInstFragment>(F).getContents().size();
case MCFragment::FT_Fill: {
auto &FF = cast<MCFillFragment>(F);
- int64_t Size = 0;
- if (!FF.getSize().evaluateAsAbsolute(Size, Layout))
+ int64_t NumValues = 0;
+ if (!FF.getNumValues().evaluateAsAbsolute(NumValues, Layout)) {
getContext().reportError(FF.getLoc(),
"expected assembly-time absolute expression");
+ return 0;
+ }
+ int64_t Size = NumValues * FF.getValueSize();
if (Size < 0) {
getContext().reportError(FF.getLoc(), "invalid number of bytes");
return 0;
@@ -411,17 +426,18 @@ void MCAsmLayout::layoutFragment(MCFragment *F) {
if (Assembler.isBundlingEnabled() && F->hasInstructions()) {
assert(isa<MCEncodedFragment>(F) &&
"Only MCEncodedFragment implementations have instructions");
- uint64_t FSize = Assembler.computeFragmentSize(*this, *F);
+ MCEncodedFragment *EF = cast<MCEncodedFragment>(F);
+ uint64_t FSize = Assembler.computeFragmentSize(*this, *EF);
if (!Assembler.getRelaxAll() && FSize > Assembler.getBundleAlignSize())
report_fatal_error("Fragment can't be larger than a bundle size");
- uint64_t RequiredBundlePadding = computeBundlePadding(Assembler, F,
- F->Offset, FSize);
+ uint64_t RequiredBundlePadding =
+ computeBundlePadding(Assembler, EF, EF->Offset, FSize);
if (RequiredBundlePadding > UINT8_MAX)
report_fatal_error("Padding cannot exceed 255 bytes");
- F->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding));
- F->Offset += RequiredBundlePadding;
+ EF->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding));
+ EF->Offset += RequiredBundlePadding;
}
}
@@ -435,18 +451,20 @@ void MCAssembler::registerSymbol(const MCSymbol &Symbol, bool *Created) {
}
}
-void MCAssembler::writeFragmentPadding(const MCFragment &F, uint64_t FSize,
- MCObjectWriter *OW) const {
+void MCAssembler::writeFragmentPadding(raw_ostream &OS,
+ const MCEncodedFragment &EF,
+ uint64_t FSize) const {
+ assert(getBackendPtr() && "Expected assembler backend");
// Should NOP padding be written out before this fragment?
- unsigned BundlePadding = F.getBundlePadding();
+ unsigned BundlePadding = EF.getBundlePadding();
if (BundlePadding > 0) {
assert(isBundlingEnabled() &&
"Writing bundle padding with disabled bundling");
- assert(F.hasInstructions() &&
+ assert(EF.hasInstructions() &&
"Writing bundle padding for a fragment without instructions");
unsigned TotalLength = BundlePadding + static_cast<unsigned>(FSize);
- if (F.alignToBundleEnd() && TotalLength > getBundleAlignSize()) {
+ if (EF.alignToBundleEnd() && TotalLength > getBundleAlignSize()) {
// If the padding itself crosses a bundle boundary, it must be emitted
// in 2 pieces, since even nop instructions must not cross boundaries.
// v--------------v <- BundleAlignSize
@@ -456,30 +474,31 @@ void MCAssembler::writeFragmentPadding(const MCFragment &F, uint64_t FSize,
// ----------------------------
// ^-------------------^ <- TotalLength
unsigned DistanceToBoundary = TotalLength - getBundleAlignSize();
- if (!getBackend().writeNopData(DistanceToBoundary, OW))
- report_fatal_error("unable to write NOP sequence of " +
- Twine(DistanceToBoundary) + " bytes");
+ if (!getBackend().writeNopData(OS, DistanceToBoundary))
+ report_fatal_error("unable to write NOP sequence of " +
+ Twine(DistanceToBoundary) + " bytes");
BundlePadding -= DistanceToBoundary;
}
- if (!getBackend().writeNopData(BundlePadding, OW))
+ if (!getBackend().writeNopData(OS, BundlePadding))
report_fatal_error("unable to write NOP sequence of " +
Twine(BundlePadding) + " bytes");
}
}
-/// \brief Write the fragment \p F to the output file.
-static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment &F) {
- MCObjectWriter *OW = &Asm.getWriter();
-
+/// Write the fragment \p F to the output file.
+static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
+ const MCAsmLayout &Layout, const MCFragment &F) {
// FIXME: Embed in fragments instead?
uint64_t FragmentSize = Asm.computeFragmentSize(Layout, F);
- Asm.writeFragmentPadding(F, FragmentSize, OW);
+ support::endianness Endian = Asm.getBackend().Endian;
+
+ if (const MCEncodedFragment *EF = dyn_cast<MCEncodedFragment>(&F))
+ Asm.writeFragmentPadding(OS, *EF, FragmentSize);
// This variable (and its dummy usage) is to participate in the assert at
// the end of the function.
- uint64_t Start = OW->getStream().tell();
+ uint64_t Start = OS.tell();
(void) Start;
++stats::EmittedFragments;
@@ -506,7 +525,7 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
// bytes left to fill use the Value and ValueSize to fill the rest.
// If we are aligning with nops, ask that target to emit the right data.
if (AF.hasEmitNops()) {
- if (!Asm.getBackend().writeNopData(Count, OW))
+ if (!Asm.getBackend().writeNopData(OS, Count))
report_fatal_error("unable to write nop sequence of " +
Twine(Count) + " bytes");
break;
@@ -516,10 +535,16 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
for (uint64_t i = 0; i != Count; ++i) {
switch (AF.getValueSize()) {
default: llvm_unreachable("Invalid size!");
- case 1: OW->write8 (uint8_t (AF.getValue())); break;
- case 2: OW->write16(uint16_t(AF.getValue())); break;
- case 4: OW->write32(uint32_t(AF.getValue())); break;
- case 8: OW->write64(uint64_t(AF.getValue())); break;
+ case 1: OS << char(AF.getValue()); break;
+ case 2:
+ support::endian::write<uint16_t>(OS, AF.getValue(), Endian);
+ break;
+ case 4:
+ support::endian::write<uint32_t>(OS, AF.getValue(), Endian);
+ break;
+ case 8:
+ support::endian::write<uint64_t>(OS, AF.getValue(), Endian);
+ break;
}
}
break;
@@ -527,47 +552,60 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
case MCFragment::FT_Data:
++stats::EmittedDataFragments;
- OW->writeBytes(cast<MCDataFragment>(F).getContents());
+ OS << cast<MCDataFragment>(F).getContents();
break;
case MCFragment::FT_Relaxable:
++stats::EmittedRelaxableFragments;
- OW->writeBytes(cast<MCRelaxableFragment>(F).getContents());
+ OS << cast<MCRelaxableFragment>(F).getContents();
break;
case MCFragment::FT_CompactEncodedInst:
++stats::EmittedCompactEncodedInstFragments;
- OW->writeBytes(cast<MCCompactEncodedInstFragment>(F).getContents());
+ OS << cast<MCCompactEncodedInstFragment>(F).getContents();
break;
case MCFragment::FT_Fill: {
++stats::EmittedFillFragments;
const MCFillFragment &FF = cast<MCFillFragment>(F);
- uint8_t V = FF.getValue();
+ uint64_t V = FF.getValue();
+ unsigned VSize = FF.getValueSize();
const unsigned MaxChunkSize = 16;
char Data[MaxChunkSize];
- memcpy(Data, &V, 1);
- for (unsigned I = 1; I < MaxChunkSize; ++I)
- Data[I] = Data[0];
-
- uint64_t Size = FragmentSize;
- for (unsigned ChunkSize = MaxChunkSize; ChunkSize; ChunkSize /= 2) {
- StringRef Ref(Data, ChunkSize);
- for (uint64_t I = 0, E = Size / ChunkSize; I != E; ++I)
- OW->writeBytes(Ref);
- Size = Size % ChunkSize;
+ // Duplicate V into Data as a byte vector to reduce the number of writes
+ // done. As such, do the endian conversion here.
+ for (unsigned I = 0; I != VSize; ++I) {
+ unsigned index = Endian == support::little ? I : (VSize - I - 1);
+ Data[I] = uint8_t(V >> (index * 8));
}
+ for (unsigned I = VSize; I < MaxChunkSize; ++I)
+ Data[I] = Data[I - VSize];
+
+ // NumPerChunk is the number of copies of the value that fit in one chunk.
+ const unsigned NumPerChunk = MaxChunkSize / VSize;
+ // ChunkSize is the largest multiple of VSize that fits in Data.
+ const unsigned ChunkSize = VSize * NumPerChunk;
+
+ // Do copies by chunk.
+ StringRef Ref(Data, ChunkSize);
+ for (uint64_t I = 0, E = FragmentSize / ChunkSize; I != E; ++I)
+ OS << Ref;
+
+ // do remainder if needed.
+ unsigned TrailingCount = FragmentSize % ChunkSize;
+ if (TrailingCount)
+ OS.write(Data, TrailingCount);
break;
}
case MCFragment::FT_LEB: {
const MCLEBFragment &LF = cast<MCLEBFragment>(F);
- OW->writeBytes(LF.getContents());
+ OS << LF.getContents();
break;
}
case MCFragment::FT_Padding: {
- if (!Asm.getBackend().writeNopData(FragmentSize, OW))
+ if (!Asm.getBackend().writeNopData(OS, FragmentSize))
report_fatal_error("unable to write nop sequence of " +
Twine(FragmentSize) + " bytes");
break;
@@ -575,7 +613,7 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
case MCFragment::FT_SymbolId: {
const MCSymbolIdFragment &SF = cast<MCSymbolIdFragment>(F);
- OW->write32(SF.getSymbol()->getIndex());
+ support::endian::write<uint32_t>(OS, SF.getSymbol()->getIndex(), Endian);
break;
}
@@ -584,41 +622,43 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCOrgFragment &OF = cast<MCOrgFragment>(F);
for (uint64_t i = 0, e = FragmentSize; i != e; ++i)
- OW->write8(uint8_t(OF.getValue()));
+ OS << char(OF.getValue());
break;
}
case MCFragment::FT_Dwarf: {
const MCDwarfLineAddrFragment &OF = cast<MCDwarfLineAddrFragment>(F);
- OW->writeBytes(OF.getContents());
+ OS << OF.getContents();
break;
}
case MCFragment::FT_DwarfFrame: {
const MCDwarfCallFrameFragment &CF = cast<MCDwarfCallFrameFragment>(F);
- OW->writeBytes(CF.getContents());
+ OS << CF.getContents();
break;
}
case MCFragment::FT_CVInlineLines: {
const auto &OF = cast<MCCVInlineLineTableFragment>(F);
- OW->writeBytes(OF.getContents());
+ OS << OF.getContents();
break;
}
case MCFragment::FT_CVDefRange: {
const auto &DRF = cast<MCCVDefRangeFragment>(F);
- OW->writeBytes(DRF.getContents());
+ OS << DRF.getContents();
break;
}
case MCFragment::FT_Dummy:
llvm_unreachable("Should not have been added");
}
- assert(OW->getStream().tell() - Start == FragmentSize &&
+ assert(OS.tell() - Start == FragmentSize &&
"The stream should advance by fragment size");
}
-void MCAssembler::writeSectionData(const MCSection *Sec,
+void MCAssembler::writeSectionData(raw_ostream &OS, const MCSection *Sec,
const MCAsmLayout &Layout) const {
+ assert(getBackendPtr() && "Expected assembler backend");
+
// Ignore virtual sections.
if (Sec->isVirtualSection()) {
assert(Layout.getSectionFileSize(Sec) == 0 && "Invalid size for section!");
@@ -661,14 +701,13 @@ void MCAssembler::writeSectionData(const MCSection *Sec,
return;
}
- uint64_t Start = getWriter().getStream().tell();
+ uint64_t Start = OS.tell();
(void)Start;
for (const MCFragment &F : *Sec)
- writeFragment(*this, Layout, F);
+ writeFragment(OS, *this, Layout, F);
- assert(getWriter().getStream().tell() - Start ==
- Layout.getSectionAddressSize(Sec));
+ assert(OS.tell() - Start == Layout.getSectionAddressSize(Sec));
}
std::tuple<MCValue, uint64_t, bool>
@@ -677,17 +716,39 @@ MCAssembler::handleFixup(const MCAsmLayout &Layout, MCFragment &F,
// Evaluate the fixup.
MCValue Target;
uint64_t FixedValue;
- bool IsResolved = evaluateFixup(Layout, Fixup, &F, Target, FixedValue);
+ bool WasForced;
+ bool IsResolved = evaluateFixup(Layout, Fixup, &F, Target, FixedValue,
+ WasForced);
if (!IsResolved) {
// The fixup was unresolved, we need a relocation. Inform the object
// writer of the relocation, and give it an opportunity to adjust the
// fixup value if need be.
- getWriter().recordRelocation(*this, Layout, &F, Fixup, Target, FixedValue);
+ if (Target.getSymA() && Target.getSymB() &&
+ getBackend().requiresDiffExpressionRelocations()) {
+ // The fixup represents the difference between two symbols, which the
+ // backend has indicated must be resolved at link time. Split up the fixup
+ // into two relocations, one for the add, and one for the sub, and emit
+ // both of these. The constant will be associated with the add half of the
+ // expression.
+ MCFixup FixupAdd = MCFixup::createAddFor(Fixup);
+ MCValue TargetAdd =
+ MCValue::get(Target.getSymA(), nullptr, Target.getConstant());
+ getWriter().recordRelocation(*this, Layout, &F, FixupAdd, TargetAdd,
+ FixedValue);
+ MCFixup FixupSub = MCFixup::createSubFor(Fixup);
+ MCValue TargetSub = MCValue::get(Target.getSymB());
+ getWriter().recordRelocation(*this, Layout, &F, FixupSub, TargetSub,
+ FixedValue);
+ } else {
+ getWriter().recordRelocation(*this, Layout, &F, Fixup, Target,
+ FixedValue);
+ }
}
return std::make_tuple(Target, FixedValue, IsResolved);
}
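When the backend requires difference expressions to be kept as relocations, the fixup on "A - B + C" above is split into two records: an add relocation against A that carries the constant, and a sub relocation against B. A hedged sketch of that pairing follows, with placeholder types; Reloc and splitDiffFixup are illustrative only, not LLVM API.

    #include <cstdint>
    #include <string>
    #include <vector>

    // Illustrative relocation record for a paired add/sub relocation.
    struct Reloc {
      std::string Sym;   // symbol the relocation refers to
      bool IsSub;        // true for the subtrahend half
      int64_t Addend;    // constant, kept with the add half
    };

    // Split a fixup on "A - B + C" into an add and a sub relocation.
    static void splitDiffFixup(std::vector<Reloc> &Relocs, const std::string &A,
                               const std::string &B, int64_t C) {
      Relocs.push_back({A, /*IsSub=*/false, C});
      Relocs.push_back({B, /*IsSub=*/true, /*Addend=*/0});
    }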
void MCAssembler::layout(MCAsmLayout &Layout) {
+ assert(getBackendPtr() && "Expected assembler backend");
DEBUG_WITH_TYPE("mc-dump", {
errs() << "assembler backend - pre-layout\n--\n";
dump(); });
@@ -747,12 +808,17 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
continue;
ArrayRef<MCFixup> Fixups;
MutableArrayRef<char> Contents;
+ const MCSubtargetInfo *STI = nullptr;
if (auto *FragWithFixups = dyn_cast<MCDataFragment>(&Frag)) {
Fixups = FragWithFixups->getFixups();
Contents = FragWithFixups->getContents();
+ STI = FragWithFixups->getSubtargetInfo();
+ assert(!FragWithFixups->hasInstructions() || STI != nullptr);
} else if (auto *FragWithFixups = dyn_cast<MCRelaxableFragment>(&Frag)) {
Fixups = FragWithFixups->getFixups();
Contents = FragWithFixups->getContents();
+ STI = FragWithFixups->getSubtargetInfo();
+ assert(!FragWithFixups->hasInstructions() || STI != nullptr);
} else if (auto *FragWithFixups = dyn_cast<MCCVDefRangeFragment>(&Frag)) {
Fixups = FragWithFixups->getFixups();
Contents = FragWithFixups->getContents();
@@ -765,7 +831,7 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
std::tie(Target, FixedValue, IsResolved) =
handleFixup(Layout, Frag, Fixup);
getBackend().applyFixup(*this, Fixup, Target, Contents, FixedValue,
- IsResolved);
+ IsResolved, STI);
}
}
}
@@ -776,35 +842,33 @@ void MCAssembler::Finish() {
MCAsmLayout Layout(*this);
layout(Layout);
- raw_ostream &OS = getWriter().getStream();
- uint64_t StartOffset = OS.tell();
-
// Write the object file.
- getWriter().writeObject(*this, Layout);
-
- stats::ObjectBytes += OS.tell() - StartOffset;
+ stats::ObjectBytes += getWriter().writeObject(*this, Layout);
}
bool MCAssembler::fixupNeedsRelaxation(const MCFixup &Fixup,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const {
+ assert(getBackendPtr() && "Expected assembler backend");
MCValue Target;
uint64_t Value;
- bool Resolved = evaluateFixup(Layout, Fixup, DF, Target, Value);
+ bool WasForced;
+ bool Resolved = evaluateFixup(Layout, Fixup, DF, Target, Value, WasForced);
if (Target.getSymA() &&
Target.getSymA()->getKind() == MCSymbolRefExpr::VK_X86_ABS8 &&
Fixup.getKind() == FK_Data_1)
return false;
return getBackend().fixupNeedsRelaxationAdvanced(Fixup, Resolved, Value, DF,
- Layout);
+ Layout, WasForced);
}
bool MCAssembler::fragmentNeedsRelaxation(const MCRelaxableFragment *F,
const MCAsmLayout &Layout) const {
+ assert(getBackendPtr() && "Expected assembler backend");
// If this inst doesn't ever need relaxation, ignore it. This occurs when we
// are intentionally pushing out inst fragments, or because we relaxed a
// previous instruction to one that doesn't need relaxation.
- if (!getBackend().mayNeedRelaxation(F->getInst()))
+ if (!getBackend().mayNeedRelaxation(F->getInst(), *F->getSubtargetInfo()))
return false;
for (const MCFixup &Fixup : F->getFixups())
@@ -816,6 +880,8 @@ bool MCAssembler::fragmentNeedsRelaxation(const MCRelaxableFragment *F,
bool MCAssembler::relaxInstruction(MCAsmLayout &Layout,
MCRelaxableFragment &F) {
+ assert(getEmitterPtr() &&
+ "Expected CodeEmitter defined for relaxInstruction");
if (!fragmentNeedsRelaxation(&F, Layout))
return false;
@@ -827,7 +893,7 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout,
// Relax the fragment.
MCInst Relaxed;
- getBackend().relaxInstruction(F.getInst(), F.getSubtargetInfo(), Relaxed);
+ getBackend().relaxInstruction(F.getInst(), *F.getSubtargetInfo(), Relaxed);
// Encode the new instruction.
//
@@ -836,7 +902,7 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout,
SmallVector<MCFixup, 4> Fixups;
SmallString<256> Code;
raw_svector_ostream VecOS(Code);
- getEmitter().encodeInstruction(Relaxed, VecOS, Fixups, F.getSubtargetInfo());
+ getEmitter().encodeInstruction(Relaxed, VecOS, Fixups, *F.getSubtargetInfo());
// Update the fragment.
F.setInst(Relaxed);
@@ -848,6 +914,7 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout,
bool MCAssembler::relaxPaddingFragment(MCAsmLayout &Layout,
MCPaddingFragment &PF) {
+ assert(getBackendPtr() && "Expected assembler backend");
uint64_t OldSize = PF.getSize();
if (!getBackend().relaxFragment(&PF, Layout))
return false;
@@ -868,10 +935,14 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
SmallString<8> &Data = LF.getContents();
Data.clear();
raw_svector_ostream OSE(Data);
+ // The compiler can generate EH table assembly that is impossible to assemble
+ // without either adding padding to an LEB fragment or adding extra padding
+ // to a later alignment fragment. To accommodate such tables, relaxation can
+ // only increase an LEB fragment size here, not decrease it. See PR35809.
if (LF.isSigned())
- encodeSLEB128(Value, OSE);
+ encodeSLEB128(Value, OSE, OldSize);
else
- encodeULEB128(Value, OSE);
+ encodeULEB128(Value, OSE, OldSize);
return OldSize != LF.getContents().size();
}
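The extra argument passed to encodeULEB128/encodeSLEB128 above is a minimum size: it keeps a relaxed LEB fragment from shrinking by padding the encoding out to its previous length. A minimal sketch of the unsigned case, assuming the usual 0x80 continuation-byte convention; encodeULEB128Padded is an illustrative name and std::string stands in for the output stream.

    #include <cstdint>
    #include <string>

    // ULEB128 encoder with a minimum size: keep emitting continuation bytes
    // (and a final 0x00) until at least PadTo bytes have been written.
    static unsigned encodeULEB128Padded(uint64_t Value, std::string &OS,
                                        unsigned PadTo = 0) {
      unsigned Count = 0;
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0 || Count + 1 < PadTo)
          Byte |= 0x80; // more bytes follow
        OS.push_back(char(Byte));
        ++Count;
      } while (Value != 0 || Count < PadTo);
      return Count;
    }

    // Example: encodeULEB128Padded(5, OS, 3) emits 0x85 0x80 0x00, which still
    // decodes to 5 but occupies the three bytes the fragment already had.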
@@ -988,6 +1059,7 @@ bool MCAssembler::layoutOnce(MCAsmLayout &Layout) {
}
void MCAssembler::finishLayout(MCAsmLayout &Layout) {
+ assert(getBackendPtr() && "Expected assembler backend");
// The layout is done. Mark every fragment as valid.
for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
MCSection &Section = *Layout.getSectionOrder()[i];
@@ -996,3 +1068,27 @@ void MCAssembler::finishLayout(MCAsmLayout &Layout) {
}
getBackend().finishLayout(*this, Layout);
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MCAssembler::dump() const {
+ raw_ostream &OS = errs();
+
+ OS << "<MCAssembler\n";
+ OS << " Sections:[\n ";
+ for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+ if (it != begin()) OS << ",\n ";
+ it->dump();
+ }
+ OS << "],\n";
+ OS << " Symbols:[";
+
+ for (const_symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) {
+ if (it != symbol_begin()) OS << ",\n ";
+ OS << "(";
+ it->dump();
+ OS << ", Index:" << it->getIndex() << ", ";
+ OS << ")";
+ }
+ OS << "]>\n";
+}
+#endif
diff --git a/contrib/llvm/lib/MC/MCCodeView.cpp b/contrib/llvm/lib/MC/MCCodeView.cpp
index 5fd5bde9f1eb..155fd7eeb576 100644
--- a/contrib/llvm/lib/MC/MCCodeView.cpp
+++ b/contrib/llvm/lib/MC/MCCodeView.cpp
@@ -472,6 +472,19 @@ void CodeViewContext::encodeInlineLineTable(MCAsmLayout &Layout,
if (Locs.empty())
return;
+ // Check that the locations are all in the same section.
+#ifndef NDEBUG
+ const MCSection *FirstSec = &Locs.front().getLabel()->getSection();
+ for (const MCCVLineEntry &Loc : Locs) {
+ if (&Loc.getLabel()->getSection() != FirstSec) {
+ errs() << ".cv_loc " << Loc.getFunctionId() << ' ' << Loc.getFileNum()
+ << ' ' << Loc.getLine() << ' ' << Loc.getColumn()
+ << " is in the wrong section\n";
+ llvm_unreachable(".cv_loc crosses sections");
+ }
+ }
+#endif
+
// Make an artificial start location using the function start and the inlinee
// lines start location information. All deltas start relative to this
// location.
@@ -576,7 +589,7 @@ void CodeViewContext::encodeInlineLineTable(MCAsmLayout &Layout,
if (!LocAfter.empty()) {
// Only try to compute this difference if we're in the same section.
const MCCVLineEntry &Loc = LocAfter[0];
- if (&Loc.getLabel()->getSection(false) == &LastLabel->getSection(false))
+ if (&Loc.getLabel()->getSection() == &LastLabel->getSection())
LocAfterLength = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
}
@@ -619,7 +632,7 @@ void CodeViewContext::encodeDefRange(MCAsmLayout &Layout,
}
unsigned NumGaps = J - I - 1;
- support::endian::Writer<support::little> LEWriter(OS);
+ support::endian::Writer LEWriter(OS, support::little);
unsigned Bias = 0;
// We must split the range into chunks of MaxDefRange, this is a fundamental
diff --git a/contrib/llvm/lib/MC/MCContext.cpp b/contrib/llvm/lib/MC/MCContext.cpp
index 5c25e902bbe7..606da2526890 100644
--- a/contrib/llvm/lib/MC/MCContext.cpp
+++ b/contrib/llvm/lib/MC/MCContext.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCContext.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
@@ -104,6 +105,7 @@ void MCContext::reset() {
MachOUniquingMap.clear();
ELFUniquingMap.clear();
COFFUniquingMap.clear();
+ WasmUniquingMap.clear();
NextID.clear();
AllowTemporaryLabels = true;
@@ -490,8 +492,10 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind K,
const Twine &Group, unsigned UniqueID,
const char *BeginSymName) {
MCSymbolWasm *GroupSym = nullptr;
- if (!Group.isTriviallyEmpty() && !Group.str().empty())
+ if (!Group.isTriviallyEmpty() && !Group.str().empty()) {
GroupSym = cast<MCSymbolWasm>(getOrCreateSymbol(Group));
+ GroupSym->setComdat(true);
+ }
return getWasmSection(Section, K, GroupSym, UniqueID, BeginSymName);
}
@@ -512,13 +516,18 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind,
StringRef CachedName = Entry.first.SectionName;
- MCSymbol *Begin = nullptr;
- if (BeginSymName)
- Begin = createTempSymbol(BeginSymName, false);
+ MCSymbol *Begin = createSymbol(CachedName, false, false);
+ cast<MCSymbolWasm>(Begin)->setType(wasm::WASM_SYMBOL_TYPE_SECTION);
MCSectionWasm *Result = new (WasmAllocator.Allocate())
MCSectionWasm(CachedName, Kind, GroupSym, UniqueID, Begin);
Entry.second = Result;
+
+ auto *F = new MCDataFragment();
+ Result->getFragmentList().insert(Result->begin(), F);
+ F->setParent(Result);
+ Begin->setFragment(F);
+
return Result;
}
@@ -526,28 +535,61 @@ MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) {
return *new (MCSubtargetAllocator.Allocate()) MCSubtargetInfo(STI);
}
+void MCContext::addDebugPrefixMapEntry(const std::string &From,
+ const std::string &To) {
+ DebugPrefixMap.insert(std::make_pair(From, To));
+}
+
+void MCContext::RemapDebugPaths() {
+ const auto &DebugPrefixMap = this->DebugPrefixMap;
+ const auto RemapDebugPath = [&DebugPrefixMap](std::string &Path) {
+ for (const auto &Entry : DebugPrefixMap)
+ if (StringRef(Path).startswith(Entry.first)) {
+ std::string RemappedPath =
+ (Twine(Entry.second) + Path.substr(Entry.first.size())).str();
+ Path.swap(RemappedPath);
+ }
+ };
+
+ // Remap compilation directory.
+ std::string CompDir = CompilationDir.str();
+ RemapDebugPath(CompDir);
+ CompilationDir = CompDir;
+
+ // Remap MCDwarfDirs in all compilation units.
+ for (auto &CUIDTablePair : MCDwarfLineTablesCUMap)
+ for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs())
+ RemapDebugPath(Dir);
+}
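The remapping above rewrites a leading path prefix according to a -fdebug-prefix-map style table. A small sketch under the same assumption of plain string-prefix matching; remapDebugPath and the sample mapping are illustrative only.

    #include <map>
    #include <string>

    // If a path starts with a mapped "From" prefix, replace that prefix with
    // the corresponding "To". Later entries may remap the result again, as in
    // the loop above.
    static void remapDebugPath(const std::map<std::string, std::string> &Map,
                               std::string &Path) {
      for (const auto &Entry : Map)
        if (Path.compare(0, Entry.first.size(), Entry.first) == 0)
          Path = Entry.second + Path.substr(Entry.first.size());
    }

    // Usage: a map of {"/src/build" -> "/usr/src"} turns "/src/build/a.c"
    // into "/usr/src/a.c".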
+
//===----------------------------------------------------------------------===//
// Dwarf Management
//===----------------------------------------------------------------------===//
-/// getDwarfFile - takes a file name an number to place in the dwarf file and
+/// getDwarfFile - takes a file name and number to place in the dwarf file and
/// directory tables. If the file number has already been allocated it is an
/// error and zero is returned and the client reports the error, else the
/// allocated file number is returned. The file numbers may be in any order.
-unsigned MCContext::getDwarfFile(StringRef Directory, StringRef FileName,
- unsigned FileNumber, unsigned CUID) {
+Expected<unsigned> MCContext::getDwarfFile(StringRef Directory,
+ StringRef FileName,
+ unsigned FileNumber,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned CUID) {
MCDwarfLineTable &Table = MCDwarfLineTablesCUMap[CUID];
- return Table.getFile(Directory, FileName, FileNumber);
+ return Table.tryGetFile(Directory, FileName, Checksum, Source, FileNumber);
}
/// isValidDwarfFileNumber - takes a dwarf file number and returns true if it
/// currently is assigned and false otherwise.
bool MCContext::isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID) {
- const SmallVectorImpl<MCDwarfFile> &MCDwarfFiles = getMCDwarfFiles(CUID);
- if (FileNumber == 0 || FileNumber >= MCDwarfFiles.size())
+ const MCDwarfLineTable &LineTable = getMCDwarfLineTable(CUID);
+ if (FileNumber == 0)
+ return getDwarfVersion() >= 5 && LineTable.hasRootFile();
+ if (FileNumber >= LineTable.getMCDwarfFiles().size())
return false;
- return !MCDwarfFiles[FileNumber].Name.empty();
+ return !LineTable.getMCDwarfFiles()[FileNumber].Name.empty();
}
/// Remove empty sections from SectionStartEndSyms, to avoid generating
@@ -563,6 +605,11 @@ CodeViewContext &MCContext::getCVContext() {
return *CVContext.get();
}
+void MCContext::clearCVLocSeen() {
+ if (CVContext)
+ CVContext->clearCVLocSeen();
+}
+
//===----------------------------------------------------------------------===//
// Error Reporting
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index ef1d8335e1bd..30e0bb562644 100644
--- a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -130,7 +130,7 @@ void LLVMDisasmDispose(LLVMDisasmContextRef DCR){
delete DC;
}
-/// \brief Emits the comments that are stored in \p DC comment stream.
+/// Emits the comments that are stored in \p DC comment stream.
/// Each comment in the comment stream must end with a newline.
static void emitComments(LLVMDisasmContext *DC,
formatted_raw_ostream &FormattedOS) {
@@ -158,7 +158,7 @@ static void emitComments(LLVMDisasmContext *DC,
DC->CommentsToEmit.clear();
}
-/// \brief Gets latency information for \p Inst from the itinerary
+/// Gets latency information for \p Inst from the itinerary
/// scheduling model, based on \p DC information.
/// \return The maximum expected latency over all the operands or -1
/// if no information is available.
@@ -184,7 +184,7 @@ static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
return Latency;
}
-/// \brief Gets latency information for \p Inst, based on \p DC information.
+/// Gets latency information for \p Inst, based on \p DC information.
/// \return The maximum expected latency over all the definitions or -1
/// if no information is available.
static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
@@ -209,7 +209,7 @@ static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
return NoInformationAvailable;
// Compute output latency.
- int Latency = 0;
+ int16_t Latency = 0;
for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
DefIdx != DefEnd; ++DefIdx) {
// Lookup the definition's write latency in SubtargetInfo.
@@ -221,7 +221,7 @@ static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
return Latency;
}
-/// \brief Emits latency information in DC->CommentStream for \p Inst, based
+/// Emits latency information in DC->CommentStream for \p Inst, based
/// on the information available in \p DC.
static void emitLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
int Latency = getLatency(DC, Inst);
diff --git a/contrib/llvm/lib/MC/MCDwarf.cpp b/contrib/llvm/lib/MC/MCDwarf.cpp
index 9e5d9ff73c76..6131fcd658b2 100644
--- a/contrib/llvm/lib/MC/MCDwarf.cpp
+++ b/contrib/llvm/lib/MC/MCDwarf.cpp
@@ -11,7 +11,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -28,6 +28,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
@@ -45,6 +46,29 @@
using namespace llvm;
+/// Manage the .debug_line_str section contents, if we use it.
+class llvm::MCDwarfLineStr {
+ MCSymbol *LineStrLabel = nullptr;
+ StringTableBuilder LineStrings{StringTableBuilder::DWARF};
+ bool UseRelocs = false;
+
+public:
+ /// Construct an instance that can emit .debug_line_str (for use in a normal
+ /// v5 line table).
+ explicit MCDwarfLineStr(MCContext &Ctx) {
+ UseRelocs = Ctx.getAsmInfo()->doesDwarfUseRelocationsAcrossSections();
+ if (UseRelocs)
+ LineStrLabel =
+ Ctx.getObjectFileInfo()->getDwarfLineStrSection()->getBeginSymbol();
+ }
+
+ /// Emit a reference to the string.
+ void emitRef(MCStreamer *MCOS, StringRef Path);
+
+ /// Emit the .debug_line_str section if appropriate.
+ void emitSection(MCStreamer *MCOS);
+};
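MCDwarfLineStr records each path in a string table and later emits either a relocation (section start plus offset) or the raw offset. Below is a toy stand-in for that offset bookkeeping, assuming strings are laid out in insertion order and NUL-terminated; the real StringTableBuilder may deduplicate and tail-merge, so the actual offsets can differ.

    #include <cstddef>
    #include <string>

    // Toy string table: add() records a string and returns the byte offset it
    // will have once the table is written out in insertion order.
    class LineStrTable {
      std::string Data;
    public:
      size_t add(const std::string &S) {
        size_t Offset = Data.size();
        Data += S;
        Data.push_back('\0'); // strings are NUL-terminated in .debug_line_str
        return Offset;
      }
      const std::string &contents() const { return Data; }
    };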
+
static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) {
unsigned MinInsnLength = Context.getAsmInfo()->getMinInstAlignment();
if (MinInsnLength == 1)
@@ -108,6 +132,18 @@ static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS,
}
//
+// This helper routine returns an expression of Start + IntVal .
+//
+static inline const MCExpr *
+makeStartPlusIntExpr(MCContext &Ctx, const MCSymbol &Start, int IntVal) {
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ const MCExpr *LHS = MCSymbolRefExpr::create(&Start, Variant, Ctx);
+ const MCExpr *RHS = MCConstantExpr::create(IntVal, Ctx);
+ const MCExpr *Res = MCBinaryExpr::create(MCBinaryExpr::Add, LHS, RHS, Ctx);
+ return Res;
+}
+
+//
// This emits the Dwarf line table for the specified section from the entries
// in the LineSection.
//
@@ -205,22 +241,35 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS,
if (LineTables.empty())
return;
+ // In a v5 non-split line table, put the strings in a separate section.
+ Optional<MCDwarfLineStr> LineStr;
+ if (context.getDwarfVersion() >= 5)
+ LineStr = MCDwarfLineStr(context);
+
// Switch to the section where the table will be emitted into.
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
// Handle the rest of the Compile Units.
- for (const auto &CUIDTablePair : LineTables)
- CUIDTablePair.second.EmitCU(MCOS, Params);
+ for (const auto &CUIDTablePair : LineTables) {
+ CUIDTablePair.second.EmitCU(MCOS, Params, LineStr);
+ }
+
+ if (LineStr)
+ LineStr->emitSection(MCOS);
}
-void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS,
- MCDwarfLineTableParams Params) const {
- MCOS.EmitLabel(Header.Emit(&MCOS, Params, None).second);
+void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS, MCDwarfLineTableParams Params,
+ MCSection *Section) const {
+ if (Header.MCDwarfFiles.empty())
+ return;
+ Optional<MCDwarfLineStr> NoLineStr(None);
+ MCOS.SwitchSection(Section);
+ MCOS.EmitLabel(Header.Emit(&MCOS, Params, None, NoLineStr).second);
}
std::pair<MCSymbol *, MCSymbol *>
-MCDwarfLineTableHeader::Emit(MCStreamer *MCOS,
- MCDwarfLineTableParams Params) const {
+MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
+ Optional<MCDwarfLineStr> &LineStr) const {
static const char StandardOpcodeLengths[] = {
0, // length of DW_LNS_copy
1, // length of DW_LNS_advance_pc
@@ -237,8 +286,10 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS,
};
assert(array_lengthof(StandardOpcodeLengths) >=
(Params.DWARF2LineOpcodeBase - 1U));
- return Emit(MCOS, Params, makeArrayRef(StandardOpcodeLengths,
- Params.DWARF2LineOpcodeBase - 1));
+ return Emit(
+ MCOS, Params,
+ makeArrayRef(StandardOpcodeLengths, Params.DWARF2LineOpcodeBase - 1),
+ LineStr);
}
static const MCExpr *forceExpAbs(MCStreamer &OS, const MCExpr* Expr) {
@@ -257,12 +308,31 @@ static void emitAbsValue(MCStreamer &OS, const MCExpr *Value, unsigned Size) {
OS.EmitValue(ABS, Size);
}
-static void
-emitV2FileDirTables(MCStreamer *MCOS,
- const SmallVectorImpl<std::string> &MCDwarfDirs,
- const SmallVectorImpl<MCDwarfFile> &MCDwarfFiles) {
+void MCDwarfLineStr::emitSection(MCStreamer *MCOS) {
+ // Switch to the .debug_line_str section.
+ MCOS->SwitchSection(
+ MCOS->getContext().getObjectFileInfo()->getDwarfLineStrSection());
+ // Emit the strings without perturbing the offsets we used.
+ LineStrings.finalizeInOrder();
+ SmallString<0> Data;
+ Data.resize(LineStrings.getSize());
+ LineStrings.write((uint8_t *)Data.data());
+ MCOS->EmitBinaryData(Data.str());
+}
+
+void MCDwarfLineStr::emitRef(MCStreamer *MCOS, StringRef Path) {
+ int RefSize = 4; // FIXME: Support DWARF-64
+ size_t Offset = LineStrings.add(Path);
+ if (UseRelocs) {
+ MCContext &Ctx = MCOS->getContext();
+ MCOS->EmitValue(makeStartPlusIntExpr(Ctx, *LineStrLabel, Offset), RefSize);
+ } else
+ MCOS->EmitIntValue(Offset, RefSize);
+}
+
+void MCDwarfLineTableHeader::emitV2FileDirTables(MCStreamer *MCOS) const {
// First the directory table.
- for (auto Dir : MCDwarfDirs) {
+ for (auto &Dir : MCDwarfDirs) {
MCOS->EmitBytes(Dir); // The DirectoryName, and...
MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
}
@@ -280,46 +350,101 @@ emitV2FileDirTables(MCStreamer *MCOS,
MCOS->EmitIntValue(0, 1); // Terminate the file list.
}
-static void
-emitV5FileDirTables(MCStreamer *MCOS,
- const SmallVectorImpl<std::string> &MCDwarfDirs,
- const SmallVectorImpl<MCDwarfFile> &MCDwarfFiles,
- StringRef CompilationDir) {
- // The directory format, which is just inline null-terminated strings.
+static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
+ bool EmitMD5, bool HasSource,
+ Optional<MCDwarfLineStr> &LineStr) {
+ assert(!DwarfFile.Name.empty());
+ if (LineStr)
+ LineStr->emitRef(MCOS, DwarfFile.Name);
+ else {
+ MCOS->EmitBytes(DwarfFile.Name); // FileName and...
+ MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ }
+ MCOS->EmitULEB128IntValue(DwarfFile.DirIndex); // Directory number.
+ if (EmitMD5) {
+ MD5::MD5Result *Cksum = DwarfFile.Checksum;
+ MCOS->EmitBinaryData(
+ StringRef(reinterpret_cast<const char *>(Cksum->Bytes.data()),
+ Cksum->Bytes.size()));
+ }
+ if (HasSource) {
+ if (LineStr)
+ LineStr->emitRef(MCOS, DwarfFile.Source.getValueOr(StringRef()));
+ else {
+ MCOS->EmitBytes(
+ DwarfFile.Source.getValueOr(StringRef())); // Source and...
+ MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ }
+ }
+}
+
+void MCDwarfLineTableHeader::emitV5FileDirTables(
+ MCStreamer *MCOS, Optional<MCDwarfLineStr> &LineStr,
+ StringRef CtxCompilationDir) const {
+ // The directory format, which is just a list of the directory paths. In a
+ // non-split object, these are references to .debug_line_str; in a split
+ // object, they are inline strings.
MCOS->EmitIntValue(1, 1);
MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_path);
- MCOS->EmitULEB128IntValue(dwarf::DW_FORM_string);
- // Then the list of directory paths. CompilationDir comes first.
+ MCOS->EmitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
+ : dwarf::DW_FORM_string);
MCOS->EmitULEB128IntValue(MCDwarfDirs.size() + 1);
- MCOS->EmitBytes(CompilationDir);
- MCOS->EmitBytes(StringRef("\0", 1));
- for (auto Dir : MCDwarfDirs) {
- MCOS->EmitBytes(Dir); // The DirectoryName, and...
- MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ // Try not to emit an empty compilation directory.
+ const StringRef CompDir =
+ CompilationDir.empty() ? CtxCompilationDir : StringRef(CompilationDir);
+ if (LineStr) {
+ // Record path strings, emit references here.
+ LineStr->emitRef(MCOS, CompDir);
+ for (const auto &Dir : MCDwarfDirs)
+ LineStr->emitRef(MCOS, Dir);
+ } else {
+ // The list of directory paths. Compilation directory comes first.
+ MCOS->EmitBytes(CompDir);
+ MCOS->EmitBytes(StringRef("\0", 1));
+ for (const auto &Dir : MCDwarfDirs) {
+ MCOS->EmitBytes(Dir); // The DirectoryName, and...
+ MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ }
}
// The file format, which is the inline null-terminated filename and a
// directory index. We don't track file size/timestamp so don't emit them
- // in the v5 table.
- // FIXME: Arrange to emit MD5 signatures for the source files.
- MCOS->EmitIntValue(2, 1);
+ // in the v5 table. Emit MD5 checksums and source if we have them.
+ uint64_t Entries = 2;
+ if (HasAllMD5)
+ Entries += 1;
+ if (HasSource)
+ Entries += 1;
+ MCOS->EmitIntValue(Entries, 1);
MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_path);
- MCOS->EmitULEB128IntValue(dwarf::DW_FORM_string);
+ MCOS->EmitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
+ : dwarf::DW_FORM_string);
MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_directory_index);
MCOS->EmitULEB128IntValue(dwarf::DW_FORM_udata);
- // Then the list of file names. These start at 1 for some reason.
- MCOS->EmitULEB128IntValue(MCDwarfFiles.size() - 1);
- for (unsigned i = 1; i < MCDwarfFiles.size(); ++i) {
- assert(!MCDwarfFiles[i].Name.empty());
- MCOS->EmitBytes(MCDwarfFiles[i].Name); // FileName and...
- MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
- MCOS->EmitULEB128IntValue(MCDwarfFiles[i].DirIndex); // Directory number.
+ if (HasAllMD5) {
+ MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_MD5);
+ MCOS->EmitULEB128IntValue(dwarf::DW_FORM_data16);
+ }
+ if (HasSource) {
+ MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_LLVM_source);
+ MCOS->EmitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
+ : dwarf::DW_FORM_string);
}
+ // Then the counted list of files. The root file is file #0, then emit the
+ // files as provided by .file directives. To accommodate assembler source
+ // written for DWARF v4 but trying to emit v5, if we didn't see a root file
+ // explicitly, replicate file #1.
+ MCOS->EmitULEB128IntValue(MCDwarfFiles.size());
+ emitOneV5FileEntry(MCOS, RootFile.Name.empty() ? MCDwarfFiles[1] : RootFile,
+ HasAllMD5, HasSource, LineStr);
+ for (unsigned i = 1; i < MCDwarfFiles.size(); ++i)
+ emitOneV5FileEntry(MCOS, MCDwarfFiles[i], HasAllMD5, HasSource, LineStr);
}
std::pair<MCSymbol *, MCSymbol *>
MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
- ArrayRef<char> StandardOpcodeLengths) const {
+ ArrayRef<char> StandardOpcodeLengths,
+ Optional<MCDwarfLineStr> &LineStr) const {
MCContext &context = MCOS->getContext();
// Create a symbol at the beginning of the line table.
@@ -384,9 +509,9 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
// Put out the directory and file tables. The formats vary depending on
// the version.
if (LineTableVersion >= 5)
- emitV5FileDirTables(MCOS, MCDwarfDirs, MCDwarfFiles, CompilationDir);
+ emitV5FileDirTables(MCOS, LineStr, context.getCompilationDir());
else
- emitV2FileDirTables(MCOS, MCDwarfDirs, MCDwarfFiles);
+ emitV2FileDirTables(MCOS);
// This is the end of the prologue, so set the value of the symbol at the
// end of the prologue (that was used in a previous expression).
@@ -396,8 +521,9 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
}
void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS,
- MCDwarfLineTableParams Params) const {
- MCSymbol *LineEndSym = Header.Emit(MCOS, Params).second;
+ MCDwarfLineTableParams Params,
+ Optional<MCDwarfLineStr> &LineStr) const {
+ MCSymbol *LineEndSym = Header.Emit(MCOS, Params, LineStr).second;
// Put out the line tables.
for (const auto &LineSec : MCLineSections.getMCLineEntries())
@@ -408,14 +534,20 @@ void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS,
MCOS->EmitLabel(LineEndSym);
}
-unsigned MCDwarfLineTable::getFile(StringRef &Directory, StringRef &FileName,
- unsigned FileNumber) {
- return Header.getFile(Directory, FileName, FileNumber);
+Expected<unsigned> MCDwarfLineTable::tryGetFile(StringRef &Directory,
+ StringRef &FileName,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned FileNumber) {
+ return Header.tryGetFile(Directory, FileName, Checksum, Source, FileNumber);
}
-unsigned MCDwarfLineTableHeader::getFile(StringRef &Directory,
- StringRef &FileName,
- unsigned FileNumber) {
+Expected<unsigned>
+MCDwarfLineTableHeader::tryGetFile(StringRef &Directory,
+ StringRef &FileName,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> &Source,
+ unsigned FileNumber) {
if (Directory == CompilationDir)
Directory = "";
if (FileName.empty()) {
@@ -423,6 +555,12 @@ unsigned MCDwarfLineTableHeader::getFile(StringRef &Directory,
Directory = "";
}
assert(!FileName.empty());
+ // Keep track of whether any or all files have an MD5 checksum.
+ // If any files have embedded source, they all must.
+ if (MCDwarfFiles.empty()) {
+ trackMD5Usage(Checksum);
+ HasSource = (Source != None);
+ }
if (FileNumber == 0) {
// File numbers start with 1 and/or after any file numbers
// allocated by inline-assembler .file directives.
@@ -441,9 +579,15 @@ unsigned MCDwarfLineTableHeader::getFile(StringRef &Directory,
// Get the new MCDwarfFile slot for this FileNumber.
MCDwarfFile &File = MCDwarfFiles[FileNumber];
- // It is an error to use see the same number more than once.
+ // It is an error to see the same number more than once.
if (!File.Name.empty())
- return 0;
+ return make_error<StringError>("file number already allocated",
+ inconvertibleErrorCode());
+
+ // If any files have embedded source, they all must.
+ if (HasSource != (Source != None))
+ return make_error<StringError>("inconsistent use of embedded source",
+ inconvertibleErrorCode());
if (Directory.empty()) {
// Separate the directory part from the basename of the FileName.
@@ -478,6 +622,11 @@ unsigned MCDwarfLineTableHeader::getFile(StringRef &Directory,
File.Name = FileName;
File.DirIndex = DirIndex;
+ File.Checksum = Checksum;
+ trackMD5Usage(Checksum);
+ File.Source = Source;
+ if (Source)
+ HasSource = true;
// return the allocated FileNumber.
return FileNumber;
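tryGetFile now returns Expected&lt;unsigned&gt; instead of a 0 sentinel, so callers have to consume the error explicitly. A minimal sketch of one way a caller might do that; resolveFileNumber and the error handling shown are illustrative, not how the parser necessarily reports it.

    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Unwrap the Expected<unsigned> from tryGetFile/getDwarfFile, reporting and
    // consuming any error. 0 is returned on failure, as it is never a valid
    // pre-v5 file number.
    static unsigned resolveFileNumber(Expected<unsigned> FileNumOrErr) {
      if (!FileNumOrErr) {
        errs() << "error: " << toString(FileNumOrErr.takeError()) << "\n";
        return 0;
      }
      return *FileNumOrErr;
    }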
@@ -1653,6 +1802,8 @@ void MCDwarfFrameEmitter::EncodeAdvanceLoc(MCContext &Context,
// Scale the address delta by the minimum instruction length.
AddrDelta = ScaleAddrDelta(Context, AddrDelta);
+ support::endianness E =
+ Context.getAsmInfo()->isLittleEndian() ? support::little : support::big;
if (AddrDelta == 0) {
} else if (isUIntN(6, AddrDelta)) {
uint8_t Opcode = dwarf::DW_CFA_advance_loc | AddrDelta;
@@ -1662,16 +1813,10 @@ void MCDwarfFrameEmitter::EncodeAdvanceLoc(MCContext &Context,
OS << uint8_t(AddrDelta);
} else if (isUInt<16>(AddrDelta)) {
OS << uint8_t(dwarf::DW_CFA_advance_loc2);
- if (Context.getAsmInfo()->isLittleEndian())
- support::endian::Writer<support::little>(OS).write<uint16_t>(AddrDelta);
- else
- support::endian::Writer<support::big>(OS).write<uint16_t>(AddrDelta);
+ support::endian::write<uint16_t>(OS, AddrDelta, E);
} else {
assert(isUInt<32>(AddrDelta));
OS << uint8_t(dwarf::DW_CFA_advance_loc4);
- if (Context.getAsmInfo()->isLittleEndian())
- support::endian::Writer<support::little>(OS).write<uint32_t>(AddrDelta);
- else
- support::endian::Writer<support::big>(OS).write<uint32_t>(AddrDelta);
+ support::endian::write<uint32_t>(OS, AddrDelta, E);
}
}
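support::endian::write&lt;uint16_t&gt;(OS, AddrDelta, E) above writes the value in the byte order selected at run time. A minimal equivalent for the 16-bit case, with a local Endian enum standing in for support::endianness.

    #include <cstdint>
    #include <string>

    enum class Endian { Little, Big };

    // Byte-order aware 16-bit write: least-significant byte first for little
    // endian, most-significant byte first for big endian.
    static void write16(std::string &OS, uint16_t Value, Endian E) {
      if (E == Endian::Little) {
        OS.push_back(char(Value & 0xff));
        OS.push_back(char(Value >> 8));
      } else {
        OS.push_back(char(Value >> 8));
        OS.push_back(char(Value & 0xff));
      }
    }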
diff --git a/contrib/llvm/lib/MC/MCELFStreamer.cpp b/contrib/llvm/lib/MC/MCELFStreamer.cpp
index 6b1c589f0389..95b48e6abc74 100644
--- a/contrib/llvm/lib/MC/MCELFStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCELFStreamer.cpp
@@ -41,9 +41,10 @@ using namespace llvm;
MCELFStreamer::MCELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCObjectStreamer(Context, std::move(TAB), OS, std::move(Emitter)) {}
+ : MCObjectStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)) {}
bool MCELFStreamer::isBundleLocked() const {
return getCurrentSectionOnly()->isBundleLocked();
@@ -68,13 +69,8 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF,
if (RequiredBundlePadding > 0) {
SmallString<256> Code;
raw_svector_ostream VecOS(Code);
- {
- auto OW = Assembler.getBackend().createObjectWriter(VecOS);
-
- EF->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding));
-
- Assembler.writeFragmentPadding(*EF, FSize, OW.get());
- }
+ EF->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding));
+ Assembler.writeFragmentPadding(VecOS, *EF, FSize);
DF->getContents().append(Code.begin(), Code.end());
}
@@ -87,7 +83,8 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF,
DF->getContents().size());
DF->getFixups().push_back(EF->getFixups()[i]);
}
- DF->setHasInstructions(true);
+ if (DF->getSubtargetInfo() == nullptr && EF->getSubtargetInfo())
+ DF->setHasInstructions(*EF->getSubtargetInfo());
DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
}
@@ -192,17 +189,6 @@ static unsigned CombineSymbolTypes(unsigned T1, unsigned T2) {
bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
auto *Symbol = cast<MCSymbolELF>(S);
- // Indirect symbols are handled differently, to match how 'as' handles
- // them. This makes writing matching .o files easier.
- if (Attribute == MCSA_IndirectSymbol) {
- // Note that we intentionally cannot use the symbol data here; this is
- // important for matching the string table that 'as' generates.
- IndirectSymbolData ISD;
- ISD.Symbol = Symbol;
- ISD.Section = getCurrentSectionOnly();
- getAssembler().getIndirectSymbols().push_back(ISD);
- return true;
- }
// Adding a symbol attribute always introduces the symbol, note that an
// important side effect of calling registerSymbol here is to register
@@ -370,6 +356,12 @@ void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
ValueSize, MaxBytesToEmit);
}
+void MCELFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To,
+ uint64_t Count) {
+ getAssembler().CGProfile.push_back({From, To, Count});
+}
+
void MCELFStreamer::EmitIdent(StringRef IdentString) {
MCSection *Comment = getAssembler().getContext().getELFSection(
".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
@@ -419,6 +411,8 @@ void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
case MCSymbolRefExpr::VK_PPC_TPREL_LO:
case MCSymbolRefExpr::VK_PPC_TPREL_HI:
case MCSymbolRefExpr::VK_PPC_TPREL_HA:
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGH:
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGHA:
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA:
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST:
@@ -426,6 +420,8 @@ void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
case MCSymbolRefExpr::VK_PPC_DTPREL_HI:
case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGH:
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHA:
case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA:
case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST:
@@ -462,6 +458,37 @@ void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
}
}
+void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
+ const MCSymbol *S = &SRE->getSymbol();
+ if (S->isTemporary()) {
+ if (!S->isInSection()) {
+ getContext().reportError(
+ SRE->getLoc(), Twine("Reference to undefined temporary symbol ") +
+ "`" + S->getName() + "`");
+ return;
+ }
+ S = S->getSection().getBeginSymbol();
+ S->setUsedInReloc();
+ SRE =
+ MCSymbolRefExpr::create(S, SRE->getKind(), getContext(), SRE->getLoc());
+ return;
+ }
+ // Not a temporary; reference it as a weak undefined symbol.
+ bool Created;
+ getAssembler().registerSymbol(*S, &Created);
+ if (Created) {
+ cast<MCSymbolELF>(S)->setBinding(ELF::STB_WEAK);
+ cast<MCSymbolELF>(S)->setExternal(true);
+ }
+}
+
+void MCELFStreamer::finalizeCGProfile() {
+ for (MCAssembler::CGProfileEntry &E : getAssembler().CGProfile) {
+ finalizeCGProfileEntry(E.From);
+ finalizeCGProfileEntry(E.To);
+ }
+}
+
void MCELFStreamer::EmitInstToFragment(const MCInst &Inst,
const MCSubtargetInfo &STI) {
this->MCObjectStreamer::EmitInstToFragment(Inst, STI);
@@ -471,6 +498,15 @@ void MCELFStreamer::EmitInstToFragment(const MCInst &Inst,
fixSymbolsInTLSFixups(F.getFixups()[i].getValue());
}
+// A fragment can only have one Subtarget, and when bundling is enabled we
+// sometimes need to use the same fragment. We give an error if there
+// are conflicting Subtargets.
+static void CheckBundleSubtargets(const MCSubtargetInfo *OldSTI,
+ const MCSubtargetInfo *NewSTI) {
+ if (OldSTI && NewSTI && OldSTI != NewSTI)
+ report_fatal_error("A Bundle can only have one Subtarget.");
+}
+
void MCELFStreamer::EmitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
MCAssembler &Assembler = getAssembler();
@@ -486,7 +522,7 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst,
//
// If bundling is disabled, append the encoded instruction to the current data
// fragment (or create a new such fragment if the current fragment is not a
- // data fragment).
+ // data fragment, or the Subtarget has changed).
//
// If bundling is enabled:
// - If we're not in a bundle-locked group, emit the instruction into a
@@ -501,19 +537,23 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst,
if (Assembler.isBundlingEnabled()) {
MCSection &Sec = *getCurrentSectionOnly();
- if (Assembler.getRelaxAll() && isBundleLocked())
+ if (Assembler.getRelaxAll() && isBundleLocked()) {
// If the -mc-relax-all flag is used and we are bundle-locked, we re-use
// the current bundle group.
DF = BundleGroups.back();
+ CheckBundleSubtargets(DF->getSubtargetInfo(), &STI);
+ }
else if (Assembler.getRelaxAll() && !isBundleLocked())
// When not in a bundle-locked group and the -mc-relax-all flag is used,
// we create a new temporary fragment which will be later merged into
// the current fragment.
DF = new MCDataFragment();
- else if (isBundleLocked() && !Sec.isBundleGroupBeforeFirstInst())
+ else if (isBundleLocked() && !Sec.isBundleGroupBeforeFirstInst()) {
// If we are bundle-locked, we re-use the current fragment.
// The bundle-locking directive ensures this is a new data fragment.
DF = cast<MCDataFragment>(getCurrentFragment());
+ CheckBundleSubtargets(DF->getSubtargetInfo(), &STI);
+ }
else if (!isBundleLocked() && Fixups.size() == 0) {
// Optimize memory usage by emitting the instruction to a
// MCCompactEncodedInstFragment when not in a bundle-locked group and
@@ -521,6 +561,7 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst,
MCCompactEncodedInstFragment *CEIF = new MCCompactEncodedInstFragment();
insert(CEIF);
CEIF->getContents().append(Code.begin(), Code.end());
+ CEIF->setHasInstructions(STI);
return;
} else {
DF = new MCDataFragment();
@@ -538,7 +579,7 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst,
// to be turned off.
Sec.setBundleGroupBeforeFirstInst(false);
} else {
- DF = getOrCreateDataFragment();
+ DF = getOrCreateDataFragment(&STI);
}
// Add the fixups and data.
@@ -546,12 +587,12 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst,
Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
DF->getFixups().push_back(Fixups[i]);
}
- DF->setHasInstructions(true);
+ DF->setHasInstructions(STI);
DF->getContents().append(Code.begin(), Code.end());
if (Assembler.isBundlingEnabled() && Assembler.getRelaxAll()) {
if (!isBundleLocked()) {
- mergeFragment(getOrCreateDataFragment(), DF);
+ mergeFragment(getOrCreateDataFragment(&STI), DF);
delete DF;
}
}
@@ -611,7 +652,7 @@ void MCELFStreamer::EmitBundleUnlock() {
// FIXME: Use more separate fragments for nested groups.
if (!isBundleLocked()) {
- mergeFragment(getOrCreateDataFragment(), DF);
+ mergeFragment(getOrCreateDataFragment(DF->getSubtargetInfo()), DF);
BundleGroups.pop_back();
delete DF;
}
@@ -627,6 +668,7 @@ void MCELFStreamer::FinishImpl() {
MCSection *CurSection = getCurrentSectionOnly();
setSectionAlignmentForBundling(getAssembler(), CurSection);
+ finalizeCGProfile();
EmitFrames(nullptr);
this->MCObjectStreamer::FinishImpl();
@@ -641,7 +683,8 @@ void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
}
void MCELFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment,
+ SMLoc Loc) {
llvm_unreachable("ELF doesn't support this directive");
}
@@ -652,11 +695,11 @@ void MCELFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
MCStreamer *llvm::createELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll) {
MCELFStreamer *S =
- new MCELFStreamer(Context, std::move(MAB), OS, std::move(CE));
+ new MCELFStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
diff --git a/contrib/llvm/lib/MC/MCExpr.cpp b/contrib/llvm/lib/MC/MCExpr.cpp
index f8fff4414f49..0694a8fa620e 100644
--- a/contrib/llvm/lib/MC/MCExpr.cpp
+++ b/contrib/llvm/lib/MC/MCExpr.cpp
@@ -10,6 +10,8 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
@@ -73,7 +75,10 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
case MCUnaryExpr::Not: OS << '~'; break;
case MCUnaryExpr::Plus: OS << '+'; break;
}
+ bool Binary = UE.getSubExpr()->getKind() == MCExpr::Binary;
+ if (Binary) OS << "(";
UE.getSubExpr()->print(OS, MAI);
+ if (Binary) OS << ")";
return;
}
@@ -234,6 +239,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_LO: return "l";
case VK_PPC_HI: return "h";
case VK_PPC_HA: return "ha";
+ case VK_PPC_HIGH: return "high";
+ case VK_PPC_HIGHA: return "higha";
case VK_PPC_HIGHER: return "higher";
case VK_PPC_HIGHERA: return "highera";
case VK_PPC_HIGHEST: return "highest";
@@ -250,6 +257,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_TPREL_LO: return "tprel@l";
case VK_PPC_TPREL_HI: return "tprel@h";
case VK_PPC_TPREL_HA: return "tprel@ha";
+ case VK_PPC_TPREL_HIGH: return "tprel@high";
+ case VK_PPC_TPREL_HIGHA: return "tprel@higha";
case VK_PPC_TPREL_HIGHER: return "tprel@higher";
case VK_PPC_TPREL_HIGHERA: return "tprel@highera";
case VK_PPC_TPREL_HIGHEST: return "tprel@highest";
@@ -257,6 +266,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_DTPREL_LO: return "dtprel@l";
case VK_PPC_DTPREL_HI: return "dtprel@h";
case VK_PPC_DTPREL_HA: return "dtprel@ha";
+ case VK_PPC_DTPREL_HIGH: return "dtprel@high";
+ case VK_PPC_DTPREL_HIGHA: return "dtprel@higha";
case VK_PPC_DTPREL_HIGHER: return "dtprel@higher";
case VK_PPC_DTPREL_HIGHERA: return "dtprel@highera";
case VK_PPC_DTPREL_HIGHEST: return "dtprel@highest";
@@ -298,6 +309,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_AMDGPU_GOTPCREL32_HI: return "gotpcrel32@hi";
case VK_AMDGPU_REL32_LO: return "rel32@lo";
case VK_AMDGPU_REL32_HI: return "rel32@hi";
+ case VK_AMDGPU_REL64: return "rel64";
}
llvm_unreachable("Invalid variant kind");
}
@@ -337,6 +349,8 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("l", VK_PPC_LO)
.Case("h", VK_PPC_HI)
.Case("ha", VK_PPC_HA)
+ .Case("high", VK_PPC_HIGH)
+ .Case("higha", VK_PPC_HIGHA)
.Case("higher", VK_PPC_HIGHER)
.Case("highera", VK_PPC_HIGHERA)
.Case("highest", VK_PPC_HIGHEST)
@@ -355,6 +369,8 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("tprel@l", VK_PPC_TPREL_LO)
.Case("tprel@h", VK_PPC_TPREL_HI)
.Case("tprel@ha", VK_PPC_TPREL_HA)
+ .Case("tprel@high", VK_PPC_TPREL_HIGH)
+ .Case("tprel@higha", VK_PPC_TPREL_HIGHA)
.Case("tprel@higher", VK_PPC_TPREL_HIGHER)
.Case("tprel@highera", VK_PPC_TPREL_HIGHERA)
.Case("tprel@highest", VK_PPC_TPREL_HIGHEST)
@@ -362,6 +378,8 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("dtprel@l", VK_PPC_DTPREL_LO)
.Case("dtprel@h", VK_PPC_DTPREL_HI)
.Case("dtprel@ha", VK_PPC_DTPREL_HA)
+ .Case("dtprel@high", VK_PPC_DTPREL_HIGH)
+ .Case("dtprel@higha", VK_PPC_DTPREL_HIGHA)
.Case("dtprel@higher", VK_PPC_DTPREL_HIGHER)
.Case("dtprel@highera", VK_PPC_DTPREL_HIGHERA)
.Case("dtprel@highest", VK_PPC_DTPREL_HIGHEST)
@@ -399,10 +417,13 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("lo8", VK_AVR_LO8)
.Case("hi8", VK_AVR_HI8)
.Case("hlo8", VK_AVR_HLO8)
+ .Case("function", VK_WebAssembly_FUNCTION)
+ .Case("typeindex", VK_WebAssembly_TYPEINDEX)
.Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO)
.Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI)
.Case("rel32@lo", VK_AMDGPU_REL32_LO)
.Case("rel32@hi", VK_AMDGPU_REL32_HI)
+ .Case("rel64", VK_AMDGPU_REL64)
.Default(VK_Invalid);
}
@@ -438,6 +459,10 @@ bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const {
return evaluateAsAbsolute(Res, &Asm, nullptr, nullptr);
}
+bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm) const {
+ return evaluateAsAbsolute(Res, Asm, nullptr, nullptr);
+}
+
bool MCExpr::evaluateKnownAbsolute(int64_t &Res,
const MCAsmLayout &Layout) const {
return evaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, nullptr,
@@ -473,7 +498,7 @@ bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
return IsRelocatable && Value.isAbsolute();
}
-/// \brief Helper method for \see EvaluateSymbolAdd().
+/// Helper method for \see EvaluateSymbolAdd().
static void AttemptToFoldSymbolOffsetDifference(
const MCAssembler *Asm, const MCAsmLayout *Layout,
const SectionAddrMap *Addrs, bool InSet, const MCSymbolRefExpr *&A,
@@ -491,7 +516,7 @@ static void AttemptToFoldSymbolOffsetDifference(
return;
if (SA.getFragment() == SB.getFragment() && !SA.isVariable() &&
- !SB.isVariable()) {
+ !SA.isUnset() && !SB.isVariable() && !SB.isUnset()) {
Addend += (SA.getOffset() - SB.getOffset());
// Pointers to Thumb symbols need to have their low-bit set to allow
@@ -530,7 +555,7 @@ static void AttemptToFoldSymbolOffsetDifference(
A = B = nullptr;
}
-/// \brief Evaluate the result of an add between (conceptually) two MCValues.
+/// Evaluate the result of an add between (conceptually) two MCValues.
///
/// This routine conceptually attempts to construct an MCValue:
/// Result = (Result_A - Result_B + Result_Cst)
@@ -566,8 +591,12 @@ EvaluateSymbolicAdd(const MCAssembler *Asm, const MCAsmLayout *Layout,
assert((!Layout || Asm) &&
"Must have an assembler object if layout is given!");
- // If we have a layout, we can fold resolved differences.
- if (Asm) {
+ // If we have a layout, we can fold resolved differences. Do not do this if
+ // the backend requires this to be emitted as individual relocations, unless
+ // the InSet flag is set to get the current difference anyway (used for
+ // example to calculate symbol sizes).
+ if (Asm &&
+ (InSet || !Asm->getBackend().requiresDiffExpressionRelocations())) {
// First, fold out any differences which are fully resolved. By
// reassociating terms in
// Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst).
@@ -749,11 +778,13 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
// Apple as.
int64_t LHS = LHSValue.getConstant(), RHS = RHSValue.getConstant();
int64_t Result = 0;
- switch (ABE->getOpcode()) {
+ auto Op = ABE->getOpcode();
+ switch (Op) {
case MCBinaryExpr::AShr: Result = LHS >> RHS; break;
case MCBinaryExpr::Add: Result = LHS + RHS; break;
case MCBinaryExpr::And: Result = LHS & RHS; break;
case MCBinaryExpr::Div:
+ case MCBinaryExpr::Mod:
// Handle division by zero. gas just emits a warning and keeps going,
// we try to be stricter.
// FIXME: Currently the caller of this function has no way to understand
@@ -762,7 +793,10 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
// change this code to emit a better diagnostic.
if (RHS == 0)
return false;
- Result = LHS / RHS;
+ if (ABE->getOpcode() == MCBinaryExpr::Div)
+ Result = LHS / RHS;
+ else
+ Result = LHS % RHS;
break;
case MCBinaryExpr::EQ: Result = LHS == RHS; break;
case MCBinaryExpr::GT: Result = LHS > RHS; break;
@@ -772,7 +806,6 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
case MCBinaryExpr::LShr: Result = uint64_t(LHS) >> uint64_t(RHS); break;
case MCBinaryExpr::LT: Result = LHS < RHS; break;
case MCBinaryExpr::LTE: Result = LHS <= RHS; break;
- case MCBinaryExpr::Mod: Result = LHS % RHS; break;
case MCBinaryExpr::Mul: Result = LHS * RHS; break;
case MCBinaryExpr::NE: Result = LHS != RHS; break;
case MCBinaryExpr::Or: Result = LHS | RHS; break;
@@ -781,7 +814,21 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
case MCBinaryExpr::Xor: Result = LHS ^ RHS; break;
}
- Res = MCValue::get(Result);
+ switch (Op) {
+ default:
+ Res = MCValue::get(Result);
+ break;
+ case MCBinaryExpr::EQ:
+ case MCBinaryExpr::GT:
+ case MCBinaryExpr::GTE:
+ case MCBinaryExpr::LT:
+ case MCBinaryExpr::LTE:
+ case MCBinaryExpr::NE:
+ // A comparison operator returns a -1 if true and 0 if false.
+ Res = MCValue::get(Result ? -1 : 0);
+ break;
+ }
+
return true;
}
}
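The extra switch above normalizes comparison operators to the assembler convention of -1 for true and 0 for false (matching gas), rather than C++'s 1/0. A one-line sketch of that convention:

    #include <cstdint>

    // Assembler-style truth value: -1 (all bits set) for true, 0 for false.
    static int64_t cmpResult(bool Cond) { return Cond ? -1 : 0; }
    // Example: (2 < 3) evaluates to -1, so ((2 < 3) & 4) evaluates to 4.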
diff --git a/contrib/llvm/lib/MC/MCFragment.cpp b/contrib/llvm/lib/MC/MCFragment.cpp
index 1aed50aaeb77..0ebcf21a422e 100644
--- a/contrib/llvm/lib/MC/MCFragment.cpp
+++ b/contrib/llvm/lib/MC/MCFragment.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -188,7 +189,7 @@ uint64_t MCAsmLayout::getSectionFileSize(const MCSection *Sec) const {
}
uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler,
- const MCFragment *F,
+ const MCEncodedFragment *F,
uint64_t FOffset, uint64_t FSize) {
uint64_t BundleSize = Assembler.getBundleAlignSize();
assert(BundleSize > 0 &&
@@ -235,10 +236,9 @@ void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) { V->destroy(); }
MCFragment::~MCFragment() = default;
MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
- uint8_t BundlePadding, MCSection *Parent)
- : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false),
- BundlePadding(BundlePadding), Parent(Parent), Atom(nullptr),
- Offset(~UINT64_C(0)) {
+ MCSection *Parent)
+ : Kind(Kind), HasInstructions(HasInstructions), Parent(Parent),
+ Atom(nullptr), Offset(~UINT64_C(0)) {
if (Parent && !isDummy())
Parent->getFragmentList().push_back(this);
}
@@ -332,10 +332,11 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
case MCFragment::FT_Dummy: OS << "MCDummyFragment"; break;
}
- OS << "<MCFragment " << (const void*) this << " LayoutOrder:" << LayoutOrder
- << " Offset:" << Offset
- << " HasInstructions:" << hasInstructions()
- << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">";
+ OS << "<MCFragment " << (const void *)this << " LayoutOrder:" << LayoutOrder
+ << " Offset:" << Offset << " HasInstructions:" << hasInstructions();
+ if (const MCEncodedFragment *EF = dyn_cast<MCEncodedFragment>(this))
+ OS << " BundlePadding:" << static_cast<unsigned>(EF->getBundlePadding());
+ OS << ">";
switch (getKind()) {
case MCFragment::FT_Align: {
@@ -387,7 +388,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
case MCFragment::FT_Fill: {
const MCFillFragment *FF = cast<MCFillFragment>(this);
OS << " Value:" << static_cast<unsigned>(FF->getValue())
- << " Size:" << FF->getSize();
+ << " ValueSize:" << static_cast<unsigned>(FF->getValueSize())
+ << " NumValues:" << FF->getNumValues();
break;
}
case MCFragment::FT_Relaxable: {
@@ -463,26 +465,4 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
}
OS << ">";
}
-
-LLVM_DUMP_METHOD void MCAssembler::dump() const{
- raw_ostream &OS = errs();
-
- OS << "<MCAssembler\n";
- OS << " Sections:[\n ";
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if (it != begin()) OS << ",\n ";
- it->dump();
- }
- OS << "],\n";
- OS << " Symbols:[";
-
- for (const_symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) {
- if (it != symbol_begin()) OS << ",\n ";
- OS << "(";
- it->dump();
- OS << ", Index:" << it->getIndex() << ", ";
- OS << ")";
- }
- OS << "]>\n";
-}
#endif
diff --git a/contrib/llvm/lib/MC/MCInst.cpp b/contrib/llvm/lib/MC/MCInst.cpp
index f6d1d3cffca0..f9b71caaf91c 100644
--- a/contrib/llvm/lib/MC/MCInst.cpp
+++ b/contrib/llvm/lib/MC/MCInst.cpp
@@ -8,8 +8,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCInst.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -35,6 +37,23 @@ void MCOperand::print(raw_ostream &OS) const {
OS << ">";
}
+bool MCOperand::evaluateAsConstantImm(int64_t &Imm) const {
+ if (isImm()) {
+ Imm = getImm();
+ return true;
+ }
+ return false;
+}
+
+bool MCOperand::isBareSymbolRef() const {
+ assert(isExpr() &&
+ "isBareSymbolRef expects only expressions");
+ const MCExpr *Expr = getExpr();
+ MCExpr::ExprKind Kind = getExpr()->getKind();
+ return Kind == MCExpr::SymbolRef &&
+ cast<MCSymbolRefExpr>(Expr)->getKind() == MCSymbolRefExpr::VK_None;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCOperand::dump() const {
print(dbgs());
diff --git a/contrib/llvm/lib/MC/MCInstrAnalysis.cpp b/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
index 280b5cf68c98..8223f3a5c66f 100644
--- a/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
+++ b/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
@@ -8,6 +8,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCInstrAnalysis.h"
+
+#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -15,6 +17,13 @@
using namespace llvm;
+bool MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
+ const MCInst &Inst,
+ APInt &Writes) const {
+ Writes.clearAllBits();
+ return false;
+}
+
bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const {
if (Inst.getNumOperands() == 0 ||
diff --git a/contrib/llvm/lib/MC/MCLabel.cpp b/contrib/llvm/lib/MC/MCLabel.cpp
index db25a46fce18..c376c83274ef 100644
--- a/contrib/llvm/lib/MC/MCLabel.cpp
+++ b/contrib/llvm/lib/MC/MCLabel.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCLabel.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp b/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp
index 97f95418e054..2f8581470ea6 100644
--- a/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp
+++ b/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp
@@ -36,7 +36,7 @@ void MCLOHDirective::emit_impl(raw_ostream &OutStream,
void MCLOHDirective::emit(MachObjectWriter &ObjWriter,
const MCAsmLayout &Layout) const {
- raw_ostream &OutStream = ObjWriter.getStream();
+ raw_ostream &OutStream = ObjWriter.W.OS;
emit_impl(OutStream, ObjWriter, Layout);
}
diff --git a/contrib/llvm/lib/MC/MCMachOStreamer.cpp b/contrib/llvm/lib/MC/MCMachOStreamer.cpp
index 3969143bb2c7..43e69605787c 100644
--- a/contrib/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCMachOStreamer.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
@@ -63,9 +64,11 @@ private:
public:
MCMachOStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter,
bool DWARFMustBeAtTheEnd, bool label)
- : MCObjectStreamer(Context, std::move(MAB), OS, std::move(Emitter)),
+ : MCObjectStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter)),
LabelSections(label), DWARFMustBeAtTheEnd(DWARFMustBeAtTheEnd),
CreatedADWARFSection(false) {}
@@ -99,7 +102,8 @@ public:
void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
- uint64_t Size = 0, unsigned ByteAlignment = 0) override;
+ uint64_t Size = 0, unsigned ByteAlignment = 0,
+ SMLoc Loc = SMLoc()) override;
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
@@ -410,9 +414,18 @@ void MCMachOStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
}
void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) {
- // On darwin all virtual sections have zerofill type.
- assert(Section->isVirtualSection() && "Section does not have zerofill type!");
+ uint64_t Size, unsigned ByteAlignment,
+ SMLoc Loc) {
+ // On darwin all virtual sections have zerofill type. Disallow the usage of
+  // .zerofill in non-virtual sections. If something similar is needed, use
+ // .space or .zero.
+ if (!Section->isVirtualSection()) {
+ getContext().reportError(
+ Loc, "The usage of .zerofill is restricted to sections of "
+ "ZEROFILL type. Use .zero or .space instead.");
+ return; // Early returning here shouldn't harm. EmitZeros should work on any
+ // section.
+ }
PushSection();
SwitchSection(Section);
@@ -447,6 +460,7 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst,
Fixup.setOffset(Fixup.getOffset() + DF->getContents().size());
DF->getFixups().push_back(Fixup);
}
+ DF->setHasInstructions(STI);
DF->getContents().append(Code.begin(), Code.end());
}
@@ -485,12 +499,12 @@ void MCMachOStreamer::FinishImpl() {
MCStreamer *llvm::createMachOStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll, bool DWARFMustBeAtTheEnd,
bool LabelSections) {
MCMachOStreamer *S =
- new MCMachOStreamer(Context, std::move(MAB), OS, std::move(CE),
+ new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE),
DWARFMustBeAtTheEnd, LabelSections);
const Triple &Target = Context.getObjectFileInfo()->getTargetTriple();
S->EmitVersionForTarget(Target);
diff --git a/contrib/llvm/lib/MC/MCNullStreamer.cpp b/contrib/llvm/lib/MC/MCNullStreamer.cpp
index ccf658e1d135..a96dec184441 100644
--- a/contrib/llvm/lib/MC/MCNullStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCNullStreamer.cpp
@@ -30,7 +30,8 @@ namespace {
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override {}
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
- uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
+ uint64_t Size = 0, unsigned ByteAlignment = 0,
+ SMLoc Loc = SMLoc()) override {}
void EmitGPRel32Value(const MCExpr *Value) override {}
void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
void EmitCOFFSymbolStorageClass(int StorageClass) override {}
diff --git a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
index 328f000f37c9..29d34a8c1e3e 100644
--- a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -135,6 +135,10 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
// "__DATA/__datacoal_nt" => section "__DATA/__data"
Triple::ArchType ArchTy = T.getArch();
+ ConstDataSection // .const_data
+ = Ctx->getMachOSection("__DATA", "__const", 0,
+ SectionKind::getReadOnlyWithRel());
+
if (ArchTy == Triple::ppc || ArchTy == Triple::ppc64) {
TextCoalSection
= Ctx->getMachOSection("__TEXT", "__textcoal_nt",
@@ -147,15 +151,14 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
SectionKind::getReadOnly());
DataCoalSection = Ctx->getMachOSection(
"__DATA", "__datacoal_nt", MachO::S_COALESCED, SectionKind::getData());
+ ConstDataCoalSection = DataCoalSection;
} else {
TextCoalSection = TextSection;
ConstTextCoalSection = ReadOnlySection;
DataCoalSection = DataSection;
+ ConstDataCoalSection = ConstDataSection;
}
- ConstDataSection // .const_data
- = Ctx->getMachOSection("__DATA", "__const", 0,
- SectionKind::getReadOnlyWithRel());
DataCommonSection
= Ctx->getMachOSection("__DATA","__common",
MachO::S_ZEROFILL,
@@ -201,6 +204,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
}
// Debug Information.
+ DwarfDebugNamesSection =
+ Ctx->getMachOSection("__DWARF", "__debug_names", MachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata(), "debug_names_begin");
DwarfAccelNamesSection =
Ctx->getMachOSection("__DWARF", "__apple_names", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata(), "names_begin");
@@ -228,6 +234,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
DwarfLineSection =
Ctx->getMachOSection("__DWARF", "__debug_line", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata(), "section_line");
+ DwarfLineStrSection =
+ Ctx->getMachOSection("__DWARF", "__debug_line_str", MachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata(), "section_line_str");
DwarfFrameSection =
Ctx->getMachOSection("__DWARF", "__debug_frame", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata());
@@ -258,6 +267,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
DwarfRangesSection =
Ctx->getMachOSection("__DWARF", "__debug_ranges", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata(), "debug_range");
+ DwarfRnglistsSection =
+ Ctx->getMachOSection("__DWARF", "__debug_rnglists", MachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata(), "debug_range");
DwarfMacinfoSection =
Ctx->getMachOSection("__DWARF", "__debug_macinfo", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata(), "debug_macinfo");
@@ -520,8 +532,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
// MIPS .debug_* sections should have SHT_MIPS_DWARF section type
// to distinguish among sections that contain DWARF and ECOFF debug formats.
// Sections with ECOFF debug format are obsoleted and marked by SHT_PROGBITS.
- if (T.getArch() == Triple::mips || T.getArch() == Triple::mipsel ||
- T.getArch() == Triple::mips64 || T.getArch() == Triple::mips64el)
+ if (T.isMIPS())
DebugSecType = ELF::SHT_MIPS_DWARF;
// Debug Info Sections.
@@ -529,6 +540,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
Ctx->getELFSection(".debug_abbrev", DebugSecType, 0);
DwarfInfoSection = Ctx->getELFSection(".debug_info", DebugSecType, 0);
DwarfLineSection = Ctx->getELFSection(".debug_line", DebugSecType, 0);
+ DwarfLineStrSection =
+ Ctx->getELFSection(".debug_line_str", DebugSecType,
+ ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
DwarfFrameSection = Ctx->getELFSection(".debug_frame", DebugSecType, 0);
DwarfPubNamesSection =
Ctx->getELFSection(".debug_pubnames", DebugSecType, 0);
@@ -552,6 +566,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
// DWARF5 Experimental Debug Info
// Accelerator Tables
+ DwarfDebugNamesSection =
+ Ctx->getELFSection(".debug_names", ELF::SHT_PROGBITS, 0);
DwarfAccelNamesSection =
Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0);
DwarfAccelObjCSection =
@@ -565,6 +581,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
DwarfStrOffSection =
Ctx->getELFSection(".debug_str_offsets", DebugSecType, 0);
DwarfAddrSection = Ctx->getELFSection(".debug_addr", DebugSecType, 0);
+ DwarfRnglistsSection = Ctx->getELFSection(".debug_rnglists", DebugSecType, 0);
// Fission Sections
DwarfInfoDWOSection =
@@ -582,6 +599,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
Ctx->getELFSection(".debug_loc.dwo", DebugSecType, 0);
DwarfStrOffDWOSection =
Ctx->getELFSection(".debug_str_offsets.dwo", DebugSecType, 0);
+ DwarfRnglistsDWOSection =
+ Ctx->getELFSection(".debug_rnglists.dwo", DebugSecType, 0);
// DWP Sections
DwarfCUIndexSection =
@@ -679,7 +698,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata(), "section_line");
-
+ DwarfLineStrSection = Ctx->getCOFFSection(
+ ".debug_line_str",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata(), "section_line_str");
DwarfFrameSection = Ctx->getCOFFSection(
".debug_frame",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -785,6 +808,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
+ DwarfDebugNamesSection = Ctx->getCOFFSection(
+ ".debug_names",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata(), "debug_names_begin");
DwarfAccelNamesSection = Ctx->getCOFFSection(
".apple_names",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -821,6 +849,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
SXDataSection = Ctx->getCOFFSection(".sxdata", COFF::IMAGE_SCN_LNK_INFO,
SectionKind::getMetadata());
+ GFIDsSection = Ctx->getCOFFSection(".gfids$y",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+
TLSDataSection = Ctx->getCOFFSection(
".tls$", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ |
COFF::IMAGE_SCN_MEM_WRITE,
@@ -833,22 +866,29 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
}
void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
- // TODO: Set the section types and flags.
TextSection = Ctx->getWasmSection(".text", SectionKind::getText());
DataSection = Ctx->getWasmSection(".data", SectionKind::getData());
- // TODO: Set the section types and flags.
- DwarfLineSection = Ctx->getWasmSection(".debug_line", SectionKind::getMetadata());
- DwarfStrSection = Ctx->getWasmSection(".debug_str", SectionKind::getMetadata());
- DwarfLocSection = Ctx->getWasmSection(".debug_loc", SectionKind::getMetadata());
- DwarfAbbrevSection = Ctx->getWasmSection(".debug_abbrev", SectionKind::getMetadata(), "section_abbrev");
+ DwarfLineSection =
+ Ctx->getWasmSection(".debug_line", SectionKind::getMetadata());
+ DwarfLineStrSection =
+ Ctx->getWasmSection(".debug_line_str", SectionKind::getMetadata());
+ DwarfStrSection =
+ Ctx->getWasmSection(".debug_str", SectionKind::getMetadata());
+ DwarfLocSection =
+ Ctx->getWasmSection(".debug_loc", SectionKind::getMetadata());
+ DwarfAbbrevSection =
+ Ctx->getWasmSection(".debug_abbrev", SectionKind::getMetadata());
DwarfARangesSection = Ctx->getWasmSection(".debug_aranges", SectionKind::getMetadata());
- DwarfRangesSection = Ctx->getWasmSection(".debug_ranges", SectionKind::getMetadata(), "debug_range");
- DwarfMacinfoSection = Ctx->getWasmSection(".debug_macinfo", SectionKind::getMetadata(), "debug_macinfo");
+ DwarfRangesSection =
+ Ctx->getWasmSection(".debug_ranges", SectionKind::getMetadata());
+ DwarfMacinfoSection =
+ Ctx->getWasmSection(".debug_macinfo", SectionKind::getMetadata());
DwarfAddrSection = Ctx->getWasmSection(".debug_addr", SectionKind::getMetadata());
DwarfCUIndexSection = Ctx->getWasmSection(".debug_cu_index", SectionKind::getMetadata());
DwarfTUIndexSection = Ctx->getWasmSection(".debug_tu_index", SectionKind::getMetadata());
- DwarfInfoSection = Ctx->getWasmSection(".debug_info", SectionKind::getMetadata(), "section_info");
+ DwarfInfoSection =
+ Ctx->getWasmSection(".debug_info", SectionKind::getMetadata());
DwarfFrameSection = Ctx->getWasmSection(".debug_frame", SectionKind::getMetadata());
DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata());
DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata());
@@ -913,3 +953,24 @@ MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const {
return Ctx->getELFSection(".debug_types", ELF::SHT_PROGBITS, ELF::SHF_GROUP,
0, utostr(Hash));
}
+
+MCSection *
+MCObjectFileInfo::getStackSizesSection(const MCSection &TextSec) const {
+ if (Env != IsELF)
+ return StackSizesSection;
+
+ const MCSectionELF &ElfSec = static_cast<const MCSectionELF &>(TextSec);
+ unsigned Flags = ELF::SHF_LINK_ORDER;
+ StringRef GroupName;
+ if (const MCSymbol *Group = ElfSec.getGroup()) {
+ GroupName = Group->getName();
+ Flags |= ELF::SHF_GROUP;
+ }
+
+ const MCSymbol *Link = TextSec.getBeginSymbol();
+ auto It = StackSizesUniquing.insert({Link, StackSizesUniquing.size()});
+ unsigned UniqueID = It.first->second;
+
+ return Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, Flags, 0,
+ GroupName, UniqueID, cast<MCSymbolELF>(Link));
+}
diff --git a/contrib/llvm/lib/MC/MCObjectStreamer.cpp b/contrib/llvm/lib/MC/MCObjectStreamer.cpp
index aecb3844622b..4b6dad5ce8f3 100644
--- a/contrib/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCObjectStreamer.cpp
@@ -25,16 +25,24 @@ using namespace llvm;
MCObjectStreamer::MCObjectStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCStreamer(Context), ObjectWriter(TAB->createObjectWriter(OS)),
- TAB(std::move(TAB)), Emitter(std::move(Emitter)),
- Assembler(llvm::make_unique<MCAssembler>(Context, *this->TAB,
- *this->Emitter, *ObjectWriter)),
+ : MCStreamer(Context),
+ Assembler(llvm::make_unique<MCAssembler>(
+ Context, std::move(TAB), std::move(Emitter), std::move(OW))),
EmitEHFrame(true), EmitDebugFrame(false) {}
MCObjectStreamer::~MCObjectStreamer() {}
+// AssemblerPtr is used for evaluation of expressions and causes
+// differences between asm and object outputs. Return nullptr in
+// inline asm mode to limit divergence to assembly inputs.
+MCAssembler *MCObjectStreamer::getAssemblerPtr() {
+ if (getUseAssemblerInfoForParsing())
+ return Assembler.get();
+ return nullptr;
+}
+
void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) {
if (PendingLabels.empty())
return;
@@ -51,17 +59,35 @@ void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) {
PendingLabels.clear();
}
+// As a compile-time optimization, avoid allocating and evaluating an MCExpr
+// tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment.
+static Optional<uint64_t> absoluteSymbolDiff(const MCSymbol *Hi,
+ const MCSymbol *Lo) {
+ assert(Hi && Lo);
+ if (!Hi->getFragment() || Hi->getFragment() != Lo->getFragment() ||
+ Hi->isVariable() || Lo->isVariable())
+ return None;
+
+ return Hi->getOffset() - Lo->getOffset();
+}
+
void MCObjectStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi,
const MCSymbol *Lo,
unsigned Size) {
- // If not assigned to the same (valid) fragment, fallback.
- if (!Hi->getFragment() || Hi->getFragment() != Lo->getFragment() ||
- Hi->isVariable() || Lo->isVariable()) {
- MCStreamer::emitAbsoluteSymbolDiff(Hi, Lo, Size);
+ if (Optional<uint64_t> Diff = absoluteSymbolDiff(Hi, Lo)) {
+ EmitIntValue(*Diff, Size);
return;
}
+ MCStreamer::emitAbsoluteSymbolDiff(Hi, Lo, Size);
+}
- EmitIntValue(Hi->getOffset() - Lo->getOffset(), Size);
+void MCObjectStreamer::emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi,
+ const MCSymbol *Lo) {
+ if (Optional<uint64_t> Diff = absoluteSymbolDiff(Hi, Lo)) {
+ EmitULEB128IntValue(*Diff);
+ return;
+ }
+ MCStreamer::emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
}
void MCObjectStreamer::reset() {
@@ -94,12 +120,24 @@ MCFragment *MCObjectStreamer::getCurrentFragment() const {
return nullptr;
}
-MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() {
- MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
+static bool CanReuseDataFragment(const MCDataFragment &F,
+ const MCAssembler &Assembler,
+ const MCSubtargetInfo *STI) {
+ if (!F.hasInstructions())
+ return true;
// When bundling is enabled, we don't want to add data to a fragment that
// already has instructions (see MCELFStreamer::EmitInstToData for details)
- if (!F || (Assembler->isBundlingEnabled() && !Assembler->getRelaxAll() &&
- F->hasInstructions())) {
+ if (Assembler.isBundlingEnabled())
+ return Assembler.getRelaxAll();
+ // If the subtarget is changed mid fragment we start a new fragment to record
+ // the new STI.
+ return !STI || F.getSubtargetInfo() == STI;
+}
+
+MCDataFragment *
+MCObjectStreamer::getOrCreateDataFragment(const MCSubtargetInfo *STI) {
+ MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
+ if (!F || !CanReuseDataFragment(*F, *Assembler, STI)) {
F = new MCDataFragment();
insert(F);
}
@@ -137,7 +175,7 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
// Avoid fixups when possible.
int64_t AbsValue;
- if (Value->evaluateAsAbsolute(AbsValue, getAssembler())) {
+ if (Value->evaluateAsAbsolute(AbsValue, getAssemblerPtr())) {
if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) {
getContext().reportError(
Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
@@ -199,7 +237,7 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc, MCFragment *F) {
void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
int64_t IntValue;
- if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
+ if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
EmitULEB128IntValue(IntValue);
return;
}
@@ -208,7 +246,7 @@ void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value) {
int64_t IntValue;
- if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
+ if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
EmitSLEB128IntValue(IntValue);
return;
}
@@ -229,13 +267,14 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
flushPendingLabels(nullptr);
+ getContext().clearCVLocSeen();
getContext().clearDwarfLocSeen();
bool Created = getAssembler().registerSection(*Section);
int64_t IntSubsection = 0;
if (Subsection &&
- !Subsection->evaluateAsAbsolute(IntSubsection, getAssembler()))
+ !Subsection->evaluateAsAbsolute(IntSubsection, getAssemblerPtr()))
report_fatal_error("Cannot evaluate subsection number");
if (IntSubsection < 0 || IntSubsection > 8192)
report_fatal_error("Subsection number out of range");
@@ -274,7 +313,7 @@ void MCObjectStreamer::EmitInstructionImpl(const MCInst &Inst,
// If this instruction doesn't need relaxation, just emit it as data.
MCAssembler &Assembler = getAssembler();
- if (!Assembler.getBackend().mayNeedRelaxation(Inst)) {
+ if (!Assembler.getBackend().mayNeedRelaxation(Inst, STI)) {
EmitInstToData(Inst, STI);
return;
}
@@ -288,7 +327,7 @@ void MCObjectStreamer::EmitInstructionImpl(const MCInst &Inst,
(Assembler.isBundlingEnabled() && Sec->isBundleLocked())) {
MCInst Relaxed;
getAssembler().getBackend().relaxInstruction(Inst, STI, Relaxed);
- while (getAssembler().getBackend().mayNeedRelaxation(Relaxed))
+ while (getAssembler().getBackend().mayNeedRelaxation(Relaxed, STI))
getAssembler().getBackend().relaxInstruction(Relaxed, STI, Relaxed);
EmitInstToData(Relaxed, STI);
return;
@@ -381,7 +420,7 @@ void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
}
const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel);
int64_t Res;
- if (AddrDelta->evaluateAsAbsolute(Res, getAssembler())) {
+ if (AddrDelta->evaluateAsAbsolute(Res, getAssemblerPtr())) {
MCDwarfLineAddr::Emit(this, Assembler->getDWARFLinetableParams(), LineDelta,
Res);
return;
@@ -393,7 +432,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
const MCSymbol *Label) {
const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel);
int64_t Res;
- if (AddrDelta->evaluateAsAbsolute(Res, getAssembler())) {
+ if (AddrDelta->evaluateAsAbsolute(Res, getAssemblerPtr())) {
MCDwarfFrameEmitter::EmitAdvanceLoc(*this, Res);
return;
}
@@ -553,7 +592,8 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
}
bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc Loc) {
+ const MCExpr *Expr, SMLoc Loc,
+ const MCSubtargetInfo &STI) {
int64_t OffsetValue;
if (!Offset.evaluateAsAbsolute(OffsetValue))
llvm_unreachable("Offset is not absolute");
@@ -561,7 +601,7 @@ bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
if (OffsetValue < 0)
llvm_unreachable("Offset is negative");
- MCDataFragment *DF = getOrCreateDataFragment();
+ MCDataFragment *DF = getOrCreateDataFragment(&STI);
flushPendingLabels(DF, DF->getContents().size());
Optional<MCFixupKind> MaybeKind = Assembler->getBackend().getFixupKind(Name);
@@ -583,32 +623,55 @@ void MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
flushPendingLabels(DF, DF->getContents().size());
assert(getCurrentSectionOnly() && "need a section");
- insert(new MCFillFragment(FillValue, NumBytes, Loc));
+ insert(new MCFillFragment(FillValue, 1, NumBytes, Loc));
}
void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
int64_t Expr, SMLoc Loc) {
int64_t IntNumValues;
- if (!NumValues.evaluateAsAbsolute(IntNumValues, getAssembler())) {
- getContext().reportError(Loc, "expected absolute expression");
+ // Do additional checking now if we can resolve the value.
+ if (NumValues.evaluateAsAbsolute(IntNumValues, getAssemblerPtr())) {
+ if (IntNumValues < 0) {
+ getContext().getSourceManager()->PrintMessage(
+ Loc, SourceMgr::DK_Warning,
+ "'.fill' directive with negative repeat count has no effect");
+ return;
+ }
+ // Emit now if we can for better errors.
+ int64_t NonZeroSize = Size > 4 ? 4 : Size;
+ Expr &= ~0ULL >> (64 - NonZeroSize * 8);
+ for (uint64_t i = 0, e = IntNumValues; i != e; ++i) {
+ EmitIntValue(Expr, NonZeroSize);
+ if (NonZeroSize < Size)
+ EmitIntValue(0, Size - NonZeroSize);
+ }
return;
}
- if (IntNumValues < 0) {
- getContext().getSourceManager()->PrintMessage(
- Loc, SourceMgr::DK_Warning,
- "'.fill' directive with negative repeat count has no effect");
- return;
- }
+ // Otherwise emit as fragment.
+ MCDataFragment *DF = getOrCreateDataFragment();
+ flushPendingLabels(DF, DF->getContents().size());
- MCStreamer::emitFill(IntNumValues, Size, Expr);
+ assert(getCurrentSectionOnly() && "need a section");
+ insert(new MCFillFragment(Expr, Size, NumValues, Loc));
}
void MCObjectStreamer::EmitFileDirective(StringRef Filename) {
getAssembler().addFileName(Filename);
}
+void MCObjectStreamer::EmitAddrsig() {
+ getAssembler().getWriter().emitAddrsigSection();
+}
+
+void MCObjectStreamer::EmitAddrsigSym(const MCSymbol *Sym) {
+ getAssembler().registerSymbol(*Sym);
+ getAssembler().getWriter().addAddrsigSymbol(Sym);
+}
+
void MCObjectStreamer::FinishImpl() {
+ getContext().RemapDebugPaths();
+
// If we are generating dwarf for assembly source files dump out the sections.
if (getContext().getGenDwarfForAssembly())
MCGenDwarfInfo::Emit(this);
@@ -616,6 +679,6 @@ void MCObjectStreamer::FinishImpl() {
// Dump out the dwarf file & directory tables and line tables.
MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams());
- flushPendingLabels(nullptr);
+ flushPendingLabels();
getAssembler().Finish();
}
diff --git a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
index ce3b70bed740..39a760826d96 100644
--- a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -50,6 +50,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MD5.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
@@ -79,11 +80,11 @@ static cl::opt<unsigned> AsmMacroMaxNestingDepth(
namespace {
-/// \brief Helper types for tracking macro definitions.
+/// Helper types for tracking macro definitions.
typedef std::vector<AsmToken> MCAsmMacroArgument;
typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
-/// \brief Helper class for storing information about an active macro
+/// Helper class for storing information about an active macro
/// instantiation.
struct MacroInstantiation {
/// The location of the instantiation.
@@ -103,13 +104,13 @@ public:
};
struct ParseStatementInfo {
- /// \brief The parsed operands from the last parsed statement.
+ /// The parsed operands from the last parsed statement.
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;
- /// \brief The opcode from the last parsed instruction.
+ /// The opcode from the last parsed instruction.
unsigned Opcode = ~0U;
- /// \brief Was there an error parsing the inline assembly?
+ /// Was there an error parsing the inline assembly?
bool ParseError = false;
SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;
@@ -119,7 +120,7 @@ struct ParseStatementInfo {
: AsmRewrites(rewrites) {}
};
-/// \brief The concrete assembly parser instance.
+/// The concrete assembly parser instance.
class AsmParser : public MCAsmParser {
private:
AsmLexer Lexer;
@@ -138,21 +139,21 @@ private:
AsmCond TheCondState;
std::vector<AsmCond> TheCondStack;
- /// \brief maps directive names to handler methods in parser
+ /// maps directive names to handler methods in parser
/// extensions. Extensions register themselves in this map by calling
/// addDirectiveHandler.
StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
- /// \brief Stack of active macro instantiations.
+ /// Stack of active macro instantiations.
std::vector<MacroInstantiation*> ActiveMacros;
- /// \brief List of bodies of anonymous macros.
+ /// List of bodies of anonymous macros.
std::deque<MCAsmMacro> MacroLikeBodies;
/// Boolean tracking whether macro substitution is enabled.
unsigned MacrosEnabledFlag : 1;
- /// \brief Keeps track of how many .macro's have been instantiated.
+ /// Keeps track of how many .macro's have been instantiated.
unsigned NumOfMacroInstantiations;
/// The values from the last parsed cpp hash file line comment if any.
@@ -164,26 +165,21 @@ private:
};
CppHashInfoTy CppHashInfo;
- /// \brief List of forward directional labels for diagnosis at the end.
+ /// List of forward directional labels for diagnosis at the end.
SmallVector<std::tuple<SMLoc, CppHashInfoTy, MCSymbol *>, 4> DirLabels;
- /// When generating dwarf for assembly source files we need to calculate the
- /// logical line number based on the last parsed cpp hash file line comment
- /// and current line. Since this is slow and messes up the SourceMgr's
- /// cache we save the last info we queried with SrcMgr.FindLineNumber().
- SMLoc LastQueryIDLoc;
- unsigned LastQueryBuffer;
- unsigned LastQueryLine;
-
/// AssemblerDialect. ~0U means an unset value; use the value provided by MAI.
unsigned AssemblerDialect = ~0U;
- /// \brief is Darwin compatibility enabled?
+ /// is Darwin compatibility enabled?
bool IsDarwin = false;
- /// \brief Are we parsing ms-style inline assembly?
+ /// Are we parsing ms-style inline assembly?
bool ParsingInlineAsm = false;
+ /// Did we already inform the user about inconsistent MD5 usage?
+ bool ReportedInconsistentMD5 = false;
+
public:
AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI, unsigned CB);
@@ -250,11 +246,11 @@ public:
SMLoc &EndLoc) override;
bool parseAbsoluteExpression(int64_t &Res) override;
- /// \brief Parse a floating point expression using the float \p Semantics
+ /// Parse a floating point expression using the float \p Semantics
/// and set \p Res to the value.
bool parseRealValue(const fltSemantics &Semantics, APInt &Res);
- /// \brief Parse an identifier or string (as a quoted identifier)
+ /// Parse an identifier or string (as a quoted identifier)
/// and set \p Res to the identifier contents.
bool parseIdentifier(StringRef &Res) override;
void eatToEndOfStatement() override;
@@ -278,28 +274,28 @@ private:
ArrayRef<MCAsmMacroArgument> A, bool EnableAtPseudoVariable,
SMLoc L);
- /// \brief Are macros enabled in the parser?
+ /// Are macros enabled in the parser?
bool areMacrosEnabled() {return MacrosEnabledFlag;}
- /// \brief Control a flag in the parser that enables or disables macros.
+ /// Control a flag in the parser that enables or disables macros.
void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
- /// \brief Are we inside a macro instantiation?
+ /// Are we inside a macro instantiation?
bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
- /// \brief Handle entry to macro instantiation.
+ /// Handle entry to macro instantiation.
///
/// \param M The macro.
/// \param NameLoc Instantiation location.
bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
- /// \brief Handle exit from macro instantiation.
+ /// Handle exit from macro instantiation.
void handleMacroExit();
- /// \brief Extract AsmTokens for a macro argument.
+ /// Extract AsmTokens for a macro argument.
bool parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg);
- /// \brief Parse all macro arguments for a given macro.
+ /// Parse all macro arguments for a given macro.
bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
void printMacroInstantiations();
@@ -310,15 +306,20 @@ private:
}
static void DiagHandler(const SMDiagnostic &Diag, void *Context);
- /// \brief Enter the specified file. This returns true on failure.
+ /// Should we emit DWARF describing this assembler source? (Returns false if
+ /// the source has .file directives, which means we don't want to generate
+ /// info describing the assembler source itself.)
+ bool enabledGenDwarfForAssembly();
+
+ /// Enter the specified file. This returns true on failure.
bool enterIncludeFile(const std::string &Filename);
- /// \brief Process the specified file for the .incbin directive.
+ /// Process the specified file for the .incbin directive.
/// This returns true on failure.
bool processIncbinFile(const std::string &Filename, int64_t Skip = 0,
const MCExpr *Count = nullptr, SMLoc Loc = SMLoc());
- /// \brief Reset the current lexer position to that given by \p Loc. The
+ /// Reset the current lexer position to that given by \p Loc. The
/// current token is not set; clients should ensure Lex() is called
/// subsequently.
///
@@ -326,17 +327,17 @@ private:
/// location.
void jumpToLoc(SMLoc Loc, unsigned InBuffer = 0);
- /// \brief Parse up to the end of statement and a return the contents from the
+  /// Parse up to the end of statement and return the contents from the
/// current token until the end of the statement; the current token on exit
/// will be either the EndOfStatement or EOF.
StringRef parseStringToEndOfStatement() override;
- /// \brief Parse until the end of a statement or a comma is encountered,
+ /// Parse until the end of a statement or a comma is encountered,
/// return the contents from the current token up to the end or comma.
StringRef parseStringToComma();
bool parseAssignment(StringRef Name, bool allow_redef,
- bool NoDeadStrip = false);
+ bool NoDeadStrip = false, bool AllowExtendedExpr = false);
unsigned getBinOpPrecedence(AsmToken::TokenKind K,
MCBinaryExpr::Opcode &Kind);
@@ -505,10 +506,12 @@ private:
DK_ERROR,
DK_WARNING,
DK_PRINT,
+ DK_ADDRSIG,
+ DK_ADDRSIG_SYM,
DK_END
};
- /// \brief Maps directive name --> DirectiveKind enum, for
+ /// Maps directive name --> DirectiveKind enum, for
/// directives parsed by this class.
StringMap<DirectiveKind> DirectiveKindMap;
@@ -597,7 +600,7 @@ private:
// .sleb128 (Signed=true) and .uleb128 (Signed=false)
bool parseDirectiveLEB128(bool Signed);
- /// \brief Parse a directive like ".globl" which
+ /// Parse a directive like ".globl" which
/// accepts a single symbol (which should be a label or an external).
bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);
@@ -653,6 +656,10 @@ private:
// .print <double-quotes-string>
bool parseDirectivePrint(SMLoc DirectiveLoc);
+ // Directives to support address-significance tables.
+ bool parseDirectiveAddrsig();
+ bool parseDirectiveAddrsigSym();
+
void initializeDirectiveKindMap();
};
@@ -693,7 +700,10 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
PlatformParser.reset(createELFAsmParser());
break;
case MCObjectFileInfo::IsWasm:
- llvm_unreachable("Wasm parsing not supported yet");
+    // TODO: WASM will need its own MCAsmParserExtension implementation, but
+    // for now we can re-use the ELF one, since the directives are currently
+    // the same.
+ PlatformParser.reset(createELFAsmParser());
break;
}
@@ -773,7 +783,7 @@ bool AsmParser::processIncbinFile(const std::string &Filename, int64_t Skip,
Bytes = Bytes.drop_front(Skip);
if (Count) {
int64_t Res;
- if (!Count->evaluateAsAbsolute(Res))
+ if (!Count->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
return Error(Loc, "expected absolute expression");
if (Res < 0)
return Warning(Loc, "negative count has no effect");
@@ -823,6 +833,19 @@ const AsmToken &AsmParser::Lex() {
return *tok;
}
+bool AsmParser::enabledGenDwarfForAssembly() {
+ // Check whether the user specified -g.
+ if (!getContext().getGenDwarfForAssembly())
+ return false;
+ // If we haven't encountered any .file directives (which would imply that
+ // the assembler source was produced with debug info already) then emit one
+ // describing the assembler source file itself.
+ if (getContext().getGenDwarfFileNumber() == 0)
+ getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective(
+ 0, StringRef(), getContext().getMainFileName()));
+ return true;
+}
+
bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// Create the initial section, if requested.
if (!NoInitialTextSection)
@@ -836,7 +859,9 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
SmallVector<AsmRewrite, 4> AsmStrRewrites;
// If we are generating dwarf for assembly source files save the initial text
- // section and generate a .file directive.
+ // section. (Don't use enabledGenDwarfForAssembly() here, as we aren't
+ // emitting any actual debug info yet and haven't had a chance to parse any
+ // embedded .file directives.)
if (getContext().getGenDwarfForAssembly()) {
MCSection *Sec = getStreamer().getCurrentSectionOnly();
if (!Sec->getBeginSymbol()) {
@@ -847,8 +872,6 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
bool InsertResult = getContext().addGenDwarfSection(Sec);
assert(InsertResult && ".text section should not have debug info yet");
(void)InsertResult;
- getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective(
- 0, StringRef(), getContext().getMainFileName()));
}
// While we have input, parse each statement.
@@ -942,7 +965,7 @@ bool AsmParser::checkForValidSection() {
return false;
}
-/// \brief Throw away the rest of the line for testing purposes.
+/// Throw away the rest of the line for testing purposes.
void AsmParser::eatToEndOfStatement() {
while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
Lexer.Lex();
@@ -973,7 +996,7 @@ StringRef AsmParser::parseStringToComma() {
return StringRef(Start, End - Start);
}
-/// \brief Parse a paren expression and return it.
+/// Parse a paren expression and return it.
/// NOTE: This assumes the leading '(' has already been consumed.
///
/// parenexpr ::= expr)
@@ -988,7 +1011,7 @@ bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
return false;
}
-/// \brief Parse a bracket expression and return it.
+/// Parse a bracket expression and return it.
/// NOTE: This assumes the leading '[' has already been consumed.
///
/// bracketexpr ::= expr]
@@ -1002,7 +1025,7 @@ bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
return false;
}
-/// \brief Parse a primary expression and return it.
+/// Parse a primary expression and return it.
/// primaryexpr ::= (parenexpr
/// primaryexpr ::= symbol
/// primaryexpr ::= number
@@ -1098,13 +1121,17 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
// If this is an absolute variable reference, substitute it now to preserve
// semantics in the face of reassignment.
- if (Sym->isVariable() &&
- isa<MCConstantExpr>(Sym->getVariableValue(/*SetUsed*/ false))) {
- if (Variant)
- return Error(EndLoc, "unexpected modifier on variable reference");
-
- Res = Sym->getVariableValue(/*SetUsed*/ false);
- return false;
+ if (Sym->isVariable()) {
+ auto V = Sym->getVariableValue(/*SetUsed*/ false);
+ bool DoInline = isa<MCConstantExpr>(V);
+ if (auto TV = dyn_cast<MCTargetExpr>(V))
+ DoInline = TV->inlineAssignedExpr();
+ if (DoInline) {
+ if (Variant)
+ return Error(EndLoc, "unexpected modifier on variable reference");
+ Res = Sym->getVariableValue(/*SetUsed*/ false);
+ return false;
+ }
}
// Otherwise create a symbol ref.
@@ -1294,7 +1321,7 @@ AsmParser::applyModifierToExpr(const MCExpr *E,
/// the End argument will be filled with the last location pointed to the '>'
/// character.
-/// There is a gap between the AltMacro's documentation and the single quote implementation.
+/// There is a gap between the AltMacro's documentation and the single quote implementation.
/// GCC does not fully support this feature and so we will not support it.
/// TODO: Adding single quote as a string.
bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
@@ -1314,7 +1341,7 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
return false;
}
-/// \brief creating a string without the escape characters '!'.
+/// creating a string without the escape characters '!'.
void AsmParser::altMacroString(StringRef AltMacroStr,std::string &Res) {
for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) {
if (AltMacroStr[Pos] == '!')
@@ -1323,7 +1350,7 @@ void AsmParser::altMacroString(StringRef AltMacroStr,std::string &Res) {
}
}
-/// \brief Parse an expression and return it.
+/// Parse an expression and return it.
///
/// expr ::= expr &&,|| expr -> lowest.
/// expr ::= expr |,^,&,! expr
@@ -1363,7 +1390,8 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
Lex();
}
- // Try to constant fold it up front, if possible.
+  // Try to constant fold it up front, if possible. Do not consult the
+  // assembler here.
int64_t Value;
if (Res->evaluateAsAbsolute(Value))
Res = MCConstantExpr::create(Value, getContext());
@@ -1404,7 +1432,7 @@ bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
if (parseExpression(Expr))
return true;
- if (!Expr->evaluateAsAbsolute(Res))
+ if (!Expr->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
return Error(StartLoc, "expected absolute expression");
return false;
@@ -1571,7 +1599,7 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K,
: getGNUBinOpPrecedence(K, Kind, ShouldUseLogicalShr);
}
-/// \brief Parse all binary operators with precedence >= 'Precedence'.
+/// Parse all binary operators with precedence >= 'Precedence'.
/// Res contains the LHS of the expression on input.
bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
SMLoc &EndLoc) {
@@ -1783,7 +1811,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If we are generating dwarf for assembly source files then gather the
// info to make a dwarf label entry for this label if needed.
- if (getContext().getGenDwarfForAssembly())
+ if (enabledGenDwarfForAssembly())
MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
IDLoc);
@@ -1798,7 +1826,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// identifier '=' ... -> assignment statement
Lex();
- return parseAssignment(IDVal, true);
+ return parseAssignment(IDVal, true, /*NoDeadStrip*/ false, /*AllowExtendedExpr*/true);
default: // Normal instruction or directive.
break;
@@ -2105,6 +2133,10 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
return parseDirectiveDS(IDVal, 12);
case DK_PRINT:
return parseDirectivePrint(IDLoc);
+ case DK_ADDRSIG:
+ return parseDirectiveAddrsig();
+ case DK_ADDRSIG_SYM:
+ return parseDirectiveAddrsigSym();
}
return Error(IDLoc, "unknown directive");
@@ -2152,7 +2184,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If we are generating dwarf for the current section then generate a .loc
// directive for the instruction.
- if (!ParseHadError && getContext().getGenDwarfForAssembly() &&
+ if (!ParseHadError && enabledGenDwarfForAssembly() &&
getContext().getGenDwarfSectionSyms().count(
getStreamer().getCurrentSectionOnly())) {
unsigned Line;
@@ -2170,20 +2202,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
0, StringRef(), CppHashInfo.Filename);
getContext().setGenDwarfFileNumber(FileNumber);
- // Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's
- // cache with the different Loc from the call above we save the last
- // info we queried here with SrcMgr.FindLineNumber().
- unsigned CppHashLocLineNo;
- if (LastQueryIDLoc == CppHashInfo.Loc &&
- LastQueryBuffer == CppHashInfo.Buf)
- CppHashLocLineNo = LastQueryLine;
- else {
- CppHashLocLineNo =
- SrcMgr.FindLineNumber(CppHashInfo.Loc, CppHashInfo.Buf);
- LastQueryLine = CppHashLocLineNo;
- LastQueryIDLoc = CppHashInfo.Loc;
- LastQueryBuffer = CppHashInfo.Buf;
- }
+ unsigned CppHashLocLineNo =
+ SrcMgr.FindLineNumber(CppHashInfo.Loc, CppHashInfo.Buf);
Line = CppHashInfo.LineNumber - 1 + (Line - CppHashLocLineNo);
}
@@ -2205,7 +2225,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
}
// Parse and erase curly braces marking block start/end
-bool
+bool
AsmParser::parseCurlyBlockScope(SmallVectorImpl<AsmRewrite> &AsmStrRewrites) {
// Identify curly brace marking block start/end
if (Lexer.isNot(AsmToken::LCurly) && Lexer.isNot(AsmToken::RCurly))
@@ -2248,7 +2268,7 @@ bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) {
return false;
}
-/// \brief will use the last parsed cpp hash line filename comment
+/// will use the last parsed cpp hash line filename comment
/// for the Filename and LineNo if any in the diagnostic.
void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
const AsmParser *Parser = static_cast<const AsmParser *>(Context);
@@ -2613,7 +2633,8 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
Lex();
if (parseExpression(AbsoluteExp, EndLoc))
return false;
- if (!AbsoluteExp->evaluateAsAbsolute(Value))
+ if (!AbsoluteExp->evaluateAsAbsolute(Value,
+ getStreamer().getAssemblerPtr()))
return Error(StrLoc, "expected absolute expression");
const char *StrChar = StrLoc.getPointer();
const char *EndChar = EndLoc.getPointer();
@@ -2745,11 +2766,11 @@ void AsmParser::handleMacroExit() {
}
bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
- bool NoDeadStrip) {
+ bool NoDeadStrip, bool AllowExtendedExpr) {
MCSymbol *Sym;
const MCExpr *Value;
if (MCParserUtils::parseAssignmentExpression(Name, allow_redef, *this, Sym,
- Value))
+ Value, AllowExtendedExpr))
return true;
if (!Sym) {
@@ -2913,8 +2934,9 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
if (parseExpression(Offset))
return true;
- if (check(!Offset->evaluateAsAbsolute(OffsetValue), OffsetLoc,
- "expression is not a constant value") ||
+ if (check(!Offset->evaluateAsAbsolute(OffsetValue,
+ getStreamer().getAssemblerPtr()),
+ OffsetLoc, "expression is not a constant value") ||
check(OffsetValue < 0, OffsetLoc, "expression is negative") ||
parseToken(AsmToken::Comma, "expected comma") ||
check(getTok().isNot(AsmToken::Identifier), "expected relocation name"))
@@ -2939,7 +2961,9 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
"unexpected token in .reloc directive"))
return true;
- if (getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc))
+ const MCTargetAsmParser &MCT = getTargetParser();
+ const MCSubtargetInfo &STI = MCT.getSTI();
+ if (getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc, STI))
return Error(NameLoc, "unknown relocation name");
return false;
@@ -2970,6 +2994,25 @@ bool AsmParser::parseDirectiveValue(StringRef IDVal, unsigned Size) {
return false;
}
+static bool parseHexOcta(AsmParser &Asm, uint64_t &hi, uint64_t &lo) {
+ if (Asm.getTok().isNot(AsmToken::Integer) &&
+ Asm.getTok().isNot(AsmToken::BigNum))
+ return Asm.TokError("unknown token in expression");
+ SMLoc ExprLoc = Asm.getTok().getLoc();
+ APInt IntValue = Asm.getTok().getAPIntVal();
+ Asm.Lex();
+ if (!IntValue.isIntN(128))
+ return Asm.Error(ExprLoc, "out of range literal value");
+ if (!IntValue.isIntN(64)) {
+ hi = IntValue.getHiBits(IntValue.getBitWidth() - 64).getZExtValue();
+ lo = IntValue.getLoBits(64).getZExtValue();
+ } else {
+ hi = 0;
+ lo = IntValue.getZExtValue();
+ }
+ return false;
+}
+
/// ParseDirectiveOctaValue
/// ::= .octa [ hexconstant (, hexconstant)* ]
@@ -2977,21 +3020,9 @@ bool AsmParser::parseDirectiveOctaValue(StringRef IDVal) {
auto parseOp = [&]() -> bool {
if (checkForValidSection())
return true;
- if (getTok().isNot(AsmToken::Integer) && getTok().isNot(AsmToken::BigNum))
- return TokError("unknown token in expression");
- SMLoc ExprLoc = getTok().getLoc();
- APInt IntValue = getTok().getAPIntVal();
uint64_t hi, lo;
- Lex();
- if (!IntValue.isIntN(128))
- return Error(ExprLoc, "out of range literal value");
- if (!IntValue.isIntN(64)) {
- hi = IntValue.getHiBits(IntValue.getBitWidth() - 64).getZExtValue();
- lo = IntValue.getLoBits(64).getZExtValue();
- } else {
- hi = 0;
- lo = IntValue.getZExtValue();
- }
+ if (parseHexOcta(*this, hi, lo))
+ return true;
if (MAI.isLittleEndian()) {
getStreamer().EmitIntValue(lo, 8);
getStreamer().EmitIntValue(hi, 8);
@@ -3248,21 +3279,20 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
}
/// parseDirectiveFile
-/// ::= .file [number] filename
-/// ::= .file number directory filename
+/// ::= .file filename
+/// ::= .file number [directory] filename [md5 checksum] [source source-text]
bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
// FIXME: I'm not sure what this is.
int64_t FileNumber = -1;
- SMLoc FileNumberLoc = getLexer().getLoc();
if (getLexer().is(AsmToken::Integer)) {
FileNumber = getTok().getIntVal();
Lex();
- if (FileNumber < 1)
- return TokError("file number less than one");
+ if (FileNumber < 0)
+ return TokError("negative file number");
}
- std::string Path = getTok().getString();
+ std::string Path;
// Usually the directory and filename together, otherwise just the directory.
// Allow the strings to have escaped octal character sequence.
@@ -3285,20 +3315,79 @@ bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
Filename = Path;
}
- if (parseToken(AsmToken::EndOfStatement,
- "unexpected token in '.file' directive"))
- return true;
+ uint64_t MD5Hi, MD5Lo;
+ bool HasMD5 = false;
+
+ Optional<StringRef> Source;
+ bool HasSource = false;
+ std::string SourceString;
+
+ while (!parseOptionalToken(AsmToken::EndOfStatement)) {
+ StringRef Keyword;
+ if (check(getTok().isNot(AsmToken::Identifier),
+ "unexpected token in '.file' directive") ||
+ parseIdentifier(Keyword))
+ return true;
+ if (Keyword == "md5") {
+ HasMD5 = true;
+ if (check(FileNumber == -1,
+ "MD5 checksum specified, but no file number") ||
+ parseHexOcta(*this, MD5Hi, MD5Lo))
+ return true;
+ } else if (Keyword == "source") {
+ HasSource = true;
+ if (check(FileNumber == -1,
+ "source specified, but no file number") ||
+ check(getTok().isNot(AsmToken::String),
+ "unexpected token in '.file' directive") ||
+ parseEscapedString(SourceString))
+ return true;
+ } else {
+ return TokError("unexpected token in '.file' directive");
+ }
+ }
+
+ // In case there is a -g option as well as debug info from directive .file,
+  // In case there is a -g option as well as debug info from the .file directive,
+  // we turn off the -g option and directly use the existing debug info instead.
+ if (Ctx.getGenDwarfForAssembly()) {
+ Ctx.getMCDwarfLineTable(0).resetRootFile();
+ Ctx.setGenDwarfForAssembly(false);
+ }
if (FileNumber == -1)
getStreamer().EmitFileDirective(Filename);
else {
- // If there is -g option as well as debug info from directive file,
- // we turn off -g option, directly use the existing debug info instead.
- if (getContext().getGenDwarfForAssembly())
- getContext().setGenDwarfForAssembly(false);
- else if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename) ==
- 0)
- return Error(FileNumberLoc, "file number already allocated");
+ MD5::MD5Result *CKMem = nullptr;
+ if (HasMD5) {
+ CKMem = (MD5::MD5Result *)Ctx.allocate(sizeof(MD5::MD5Result), 1);
+ for (unsigned i = 0; i != 8; ++i) {
+ CKMem->Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
+ CKMem->Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
+ }
+ }
+ if (HasSource) {
+ char *SourceBuf = static_cast<char *>(Ctx.allocate(SourceString.size()));
+ memcpy(SourceBuf, SourceString.data(), SourceString.size());
+ Source = StringRef(SourceBuf, SourceString.size());
+ }
+ if (FileNumber == 0) {
+ if (Ctx.getDwarfVersion() < 5)
+ return Warning(DirectiveLoc, "file 0 not supported prior to DWARF-5");
+ getStreamer().emitDwarfFile0Directive(Directory, Filename, CKMem, Source);
+ } else {
+ Expected<unsigned> FileNumOrErr = getStreamer().tryEmitDwarfFileDirective(
+ FileNumber, Directory, Filename, CKMem, Source);
+ if (!FileNumOrErr)
+ return Error(DirectiveLoc, toString(FileNumOrErr.takeError()));
+ FileNumber = FileNumOrErr.get();
+ }
+ // Alert the user if there are some .file directives with MD5 and some not.
+ // But only do that once.
+ if (!ReportedInconsistentMD5 && !Ctx.isDwarfMD5UsageConsistent(0)) {
+ ReportedInconsistentMD5 = true;
+ return Warning(DirectiveLoc, "inconsistent use of MD5 checksums");
+ }
}
return false;
@@ -3332,7 +3421,7 @@ bool AsmParser::parseDirectiveLoc() {
int64_t FileNumber = 0, LineNumber = 0;
SMLoc Loc = getTok().getLoc();
if (parseIntToken(FileNumber, "unexpected token in '.loc' directive") ||
- check(FileNumber < 1, Loc,
+ check(FileNumber < 1 && Ctx.getDwarfVersion() < 5, Loc,
"file number less than one in '.loc' directive") ||
check(!getContext().isValidDwarfFileNumber(FileNumber), Loc,
"unassigned file number in '.loc' directive"))
@@ -3816,7 +3905,7 @@ bool AsmParser::parseDirectiveCFIEndProc() {
return false;
}
-/// \brief parse register name or number.
+/// parse register name or number.
bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
SMLoc DirectiveLoc) {
unsigned RegNo;
@@ -4211,7 +4300,10 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
const char *BodyEnd = EndToken.getLoc().getPointer();
StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
- getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
+ MCAsmMacro Macro(Name, Body, std::move(Parameters));
+ DEBUG_WITH_TYPE("asm-macros", dbgs() << "Defining new macro:\n";
+ Macro.dump());
+ getContext().defineMacro(Name, std::move(Macro));
return false;
}
@@ -4374,6 +4466,8 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
getContext().undefineMacro(Name);
+ DEBUG_WITH_TYPE("asm-macros", dbgs()
+ << "Un-defining macro: " << Name << "\n");
return false;
}
@@ -5207,6 +5301,8 @@ void AsmParser::initializeDirectiveKindMap() {
DirectiveKindMap[".ds.w"] = DK_DS_W;
DirectiveKindMap[".ds.x"] = DK_DS_X;
DirectiveKindMap[".print"] = DK_PRINT;
+ DirectiveKindMap[".addrsig"] = DK_ADDRSIG;
+ DirectiveKindMap[".addrsig_sym"] = DK_ADDRSIG_SYM;
}
MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
@@ -5221,7 +5317,8 @@ MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
}
if (Lexer.is(AsmToken::Identifier) &&
- (getTok().getIdentifier() == ".rept" ||
+ (getTok().getIdentifier() == ".rep" ||
+ getTok().getIdentifier() == ".rept" ||
getTok().getIdentifier() == ".irp" ||
getTok().getIdentifier() == ".irpc")) {
++NestLevel;
@@ -5283,7 +5380,7 @@ bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc, StringRef Dir) {
return true;
int64_t Count;
- if (!CountExpr->evaluateAsAbsolute(Count)) {
+ if (!CountExpr->evaluateAsAbsolute(Count, getStreamer().getAssemblerPtr())) {
return Error(CountLoc, "unexpected token in '" + Dir + "' directive");
}
@@ -5446,6 +5543,21 @@ bool AsmParser::parseDirectivePrint(SMLoc DirectiveLoc) {
return false;
}
+bool AsmParser::parseDirectiveAddrsig() {
+ getStreamer().EmitAddrsig();
+ return false;
+}
+
+bool AsmParser::parseDirectiveAddrsigSym() {
+ StringRef Name;
+ if (check(parseIdentifier(Name),
+ "expected identifier in '.addrsig_sym' directive"))
+ return true;
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+ getStreamer().EmitAddrsigSym(Sym);
+ return false;
+}
+
// We are comparing pointers, but the pointers are relative to a single string.
// Thus, this should always be deterministic.
static int rewritesSort(const AsmRewrite *AsmRewriteA,
@@ -5727,14 +5839,17 @@ static bool isSymbolUsedInExpression(const MCSymbol *Sym, const MCExpr *Value) {
bool parseAssignmentExpression(StringRef Name, bool allow_redef,
MCAsmParser &Parser, MCSymbol *&Sym,
- const MCExpr *&Value) {
+ const MCExpr *&Value, bool AllowExtendedExpr) {
// FIXME: Use better location, we should use proper tokens.
SMLoc EqualLoc = Parser.getTok().getLoc();
-
- if (Parser.parseExpression(Value)) {
- return Parser.TokError("missing expression");
- }
+ SMLoc EndLoc;
+ if (AllowExtendedExpr) {
+ if (Parser.getTargetParser().parseAssignmentExpression(Value, EndLoc)) {
+ return Parser.TokError("missing expression");
+ }
+ } else if (Parser.parseExpression(Value, EndLoc))
+ return Parser.TokError("missing expression");
// Note: we don't count b as used in "a = b". This is to allow
// a = b
@@ -5780,7 +5895,7 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef,
} // end namespace MCParserUtils
} // end namespace llvm
-/// \brief Create an MCAsmParser instance.
+/// Create an MCAsmParser instance.
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
MCStreamer &Out, const MCAsmInfo &MAI,
unsigned CB) {
diff --git a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
index 687e0cc1faa5..388304a72395 100644
--- a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
@@ -65,9 +65,11 @@ class COFFAsmParser : public MCAsmParserExtension {
addDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type");
addDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef");
addDirectiveHandler<&COFFAsmParser::ParseDirectiveSecRel32>(".secrel32");
- addDirectiveHandler<&COFFAsmParser::ParseDirectiveSecIdx>(".secidx");
+ addDirectiveHandler<&COFFAsmParser::ParseDirectiveSymIdx>(".symidx");
addDirectiveHandler<&COFFAsmParser::ParseDirectiveSafeSEH>(".safeseh");
+ addDirectiveHandler<&COFFAsmParser::ParseDirectiveSecIdx>(".secidx");
addDirectiveHandler<&COFFAsmParser::ParseDirectiveLinkOnce>(".linkonce");
+ addDirectiveHandler<&COFFAsmParser::ParseDirectiveRVA>(".rva");
// Win64 EH directives.
addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>(
@@ -130,8 +132,10 @@ class COFFAsmParser : public MCAsmParserExtension {
bool ParseDirectiveSecRel32(StringRef, SMLoc);
bool ParseDirectiveSecIdx(StringRef, SMLoc);
bool ParseDirectiveSafeSEH(StringRef, SMLoc);
+ bool ParseDirectiveSymIdx(StringRef, SMLoc);
bool parseCOMDATType(COFF::COMDATType &Type);
bool ParseDirectiveLinkOnce(StringRef, SMLoc);
+ bool ParseDirectiveRVA(StringRef, SMLoc);
// Win64 EH directives.
bool ParseSEHDirectiveStartProc(StringRef, SMLoc);
@@ -490,6 +494,37 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
return false;
}
+bool COFFAsmParser::ParseDirectiveRVA(StringRef, SMLoc) {
+ auto parseOp = [&]() -> bool {
+ StringRef SymbolID;
+ if (getParser().parseIdentifier(SymbolID))
+ return TokError("expected identifier in directive");
+
+ int64_t Offset = 0;
+ SMLoc OffsetLoc;
+ if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) {
+ OffsetLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(Offset))
+ return true;
+ }
+
+ if (Offset < std::numeric_limits<int32_t>::min() ||
+ Offset > std::numeric_limits<int32_t>::max())
+ return Error(OffsetLoc, "invalid '.rva' directive offset, can't be less "
+ "than -2147483648 or greater than "
+ "2147483647");
+
+ MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
+
+ getStreamer().EmitCOFFImgRel32(Symbol, Offset);
+ return false;
+ };
+
+ if (getParser().parseMany(parseOp))
+ return addErrorSuffix(" in directive");
+ return false;
+}
+
bool COFFAsmParser::ParseDirectiveSafeSEH(StringRef, SMLoc) {
StringRef SymbolID;
if (getParser().parseIdentifier(SymbolID))
@@ -520,6 +555,21 @@ bool COFFAsmParser::ParseDirectiveSecIdx(StringRef, SMLoc) {
return false;
}
+bool COFFAsmParser::ParseDirectiveSymIdx(StringRef, SMLoc) {
+ StringRef SymbolID;
+ if (getParser().parseIdentifier(SymbolID))
+ return TokError("expected identifier in directive");
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
+
+ Lex();
+ getStreamer().EmitCOFFSymbolIndex(Symbol);
+ return false;
+}
+
/// ::= [ identifier ]
bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) {
StringRef TypeId = getTok().getIdentifier();
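
Illustrative sketch (not from the diff): with the new handlers above, lines such as ".rva func + 16" and ".symidx func" boil down to the two new streamer hooks; Out, Ctx and the symbol name are hypothetical here:

    MCSymbol *Func = Ctx.getOrCreateSymbol("func");
    Out.EmitCOFFImgRel32(Func, /*Offset=*/16); // .rva func + 16 -> IMGREL32 fixup
    Out.EmitCOFFSymbolIndex(Func);             // .symidx func  -> symbol table index
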
diff --git a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
index 5bbf49290f17..e6fc1fac81ba 100644
--- a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -40,7 +40,7 @@ using namespace llvm;
namespace {
-/// \brief Implementation of directive handling which is shared across all
+/// Implementation of directive handling which is shared across all
/// Darwin targets.
class DarwinAsmParser : public MCAsmParserExtension {
template<bool (DarwinAsmParser::*HandlerMethod)(StringRef, SMLoc)>
@@ -888,6 +888,7 @@ bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) {
Lex();
StringRef Section;
+ SMLoc SectionLoc = getLexer().getLoc();
if (getParser().parseIdentifier(Section))
return TokError("expected section name after comma in '.zerofill' "
"directive");
@@ -896,9 +897,10 @@ bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) {
// the section but with no symbol.
if (getLexer().is(AsmToken::EndOfStatement)) {
// Create the zerofill section but no symbol
- getStreamer().EmitZerofill(getContext().getMachOSection(
- Segment, Section, MachO::S_ZEROFILL,
- 0, SectionKind::getBSS()));
+ getStreamer().EmitZerofill(
+ getContext().getMachOSection(Segment, Section, MachO::S_ZEROFILL, 0,
+ SectionKind::getBSS()),
+ /*Symbol=*/nullptr, /*Size=*/0, /*ByteAlignment=*/0, SectionLoc);
return false;
}
@@ -957,7 +959,7 @@ bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) {
getStreamer().EmitZerofill(getContext().getMachOSection(
Segment, Section, MachO::S_ZEROFILL,
0, SectionKind::getBSS()),
- Sym, Size, 1 << Pow2Alignment);
+ Sym, Size, 1 << Pow2Alignment, SectionLoc);
return false;
}
diff --git a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 17f0bf845785..67e3512cc5bd 100644
--- a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -85,6 +85,7 @@ public:
addDirectiveHandler<
&ELFAsmParser::ParseDirectiveSymbolAttribute>(".hidden");
addDirectiveHandler<&ELFAsmParser::ParseDirectiveSubsection>(".subsection");
+ addDirectiveHandler<&ELFAsmParser::ParseDirectiveCGProfile>(".cg_profile");
}
// FIXME: Part of this logic is duplicated in the MCELFStreamer. What is
@@ -149,6 +150,7 @@ public:
bool ParseDirectiveWeakref(StringRef, SMLoc);
bool ParseDirectiveSymbolAttribute(StringRef, SMLoc);
bool ParseDirectiveSubsection(StringRef, SMLoc);
+ bool ParseDirectiveCGProfile(StringRef, SMLoc);
private:
bool ParseSectionName(StringRef &SectionName);
@@ -380,7 +382,6 @@ bool ELFAsmParser::ParseDirectivePopSection(StringRef, SMLoc) {
return false;
}
-// FIXME: This is a work in progress.
bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) {
return ParseSectionArguments(/*IsPush=*/false, loc);
}
@@ -480,6 +481,34 @@ static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back();
}
+// Return a set of section flags based on the section name that can then
+// be augmented later; otherwise return 0 if we don't have any reasonable
+// defaults.
+static unsigned defaultSectionFlags(StringRef SectionName) {
+
+ if (hasPrefix(SectionName, ".rodata.cst"))
+ return ELF::SHF_ALLOC | ELF::SHF_MERGE;
+
+ if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1")
+ return ELF::SHF_ALLOC;
+
+ if (SectionName == ".fini" || SectionName == ".init" ||
+ hasPrefix(SectionName, ".text."))
+ return ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
+
+ if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" ||
+ hasPrefix(SectionName, ".bss.") ||
+ hasPrefix(SectionName, ".init_array.") ||
+ hasPrefix(SectionName, ".fini_array.") ||
+ hasPrefix(SectionName, ".preinit_array."))
+ return ELF::SHF_ALLOC | ELF::SHF_WRITE;
+
+ if (hasPrefix(SectionName, ".tdata.") || hasPrefix(SectionName, ".tbss."))
+ return ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS;
+
+ return 0;
+}
+
bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
StringRef SectionName;
@@ -489,27 +518,13 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
StringRef TypeName;
int64_t Size = 0;
StringRef GroupName;
- unsigned Flags = 0;
const MCExpr *Subsection = nullptr;
bool UseLastGroup = false;
MCSymbolELF *Associated = nullptr;
int64_t UniqueID = ~0;
- // Set the defaults first.
- if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1")
- Flags |= ELF::SHF_ALLOC;
- if (SectionName == ".fini" || SectionName == ".init" ||
- hasPrefix(SectionName, ".text."))
- Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
- if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" ||
- hasPrefix(SectionName, ".bss.") ||
- hasPrefix(SectionName, ".init_array.") ||
- hasPrefix(SectionName, ".fini_array.") ||
- hasPrefix(SectionName, ".preinit_array."))
- Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE;
- if (hasPrefix(SectionName, ".tdata.") ||
- hasPrefix(SectionName, ".tbss."))
- Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS;
+ // Set the default section flags first in case no others are given.
+ unsigned Flags = defaultSectionFlags(SectionName);
if (getLexer().is(AsmToken::Comma)) {
Lex();
@@ -537,6 +552,12 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
if (extraFlags == -1U)
return TokError("unknown flag");
+
+ // If we found additional section flags on a known section then give a
+ // warning.
+ if (Flags && Flags != extraFlags)
+ Warning(loc, "setting incorrect section attributes for " + SectionName);
+
Flags |= extraFlags;
bool Mergeable = Flags & ELF::SHF_MERGE;
@@ -608,6 +629,10 @@ EndStmt:
Type = ELF::SHT_X86_64_UNWIND;
else if (TypeName == "llvm_odrtab")
Type = ELF::SHT_LLVM_ODRTAB;
+ else if (TypeName == "llvm_linker_options")
+ Type = ELF::SHT_LLVM_LINKER_OPTIONS;
+ else if (TypeName == "llvm_call_graph_profile")
+ Type = ELF::SHT_LLVM_CALL_GRAPH_PROFILE;
else if (TypeName.getAsInteger(0, Type))
return TokError("unknown section type");
}
@@ -838,6 +863,47 @@ bool ELFAsmParser::ParseDirectiveSubsection(StringRef, SMLoc) {
return false;
}
+/// ParseDirectiveCGProfile
+/// ::= .cg_profile identifier, identifier, <number>
+bool ELFAsmParser::ParseDirectiveCGProfile(StringRef, SMLoc) {
+ StringRef From;
+ SMLoc FromLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(From))
+ return TokError("expected identifier in directive");
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("expected a comma");
+ Lex();
+
+ StringRef To;
+ SMLoc ToLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(To))
+ return TokError("expected identifier in directive");
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("expected a comma");
+ Lex();
+
+ int64_t Count;
+ if (getParser().parseIntToken(
+ Count, "expected integer count in '.cg_profile' directive"))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ MCSymbol *FromSym = getContext().getOrCreateSymbol(From);
+ MCSymbol *ToSym = getContext().getOrCreateSymbol(To);
+
+ getStreamer().emitCGProfileEntry(
+ MCSymbolRefExpr::create(FromSym, MCSymbolRefExpr::VK_None, getContext(),
+ FromLoc),
+ MCSymbolRefExpr::create(ToSym, MCSymbolRefExpr::VK_None, getContext(),
+ ToLoc),
+ Count);
+ return false;
+}
+
namespace llvm {
MCAsmParserExtension *createELFAsmParser() {
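
Rough sketch (not part of the diff): a directive such as ".cg_profile caller, callee, 85" ends up calling the new streamer hook with two symbol references and a count; the symbol names and the Out/Ctx objects below are hypothetical:

    MCSymbol *From = Ctx.getOrCreateSymbol("caller");
    MCSymbol *To = Ctx.getOrCreateSymbol("callee");
    Out.emitCGProfileEntry(
        MCSymbolRefExpr::create(From, MCSymbolRefExpr::VK_None, Ctx),
        MCSymbolRefExpr::create(To, MCSymbolRefExpr::VK_None, Ctx),
        /*Count=*/85);
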
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
index 8f845ee1d76f..75cd318e4fa3 100644
--- a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
@@ -10,6 +10,8 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -34,3 +36,94 @@ SMLoc AsmToken::getEndLoc() const {
SMRange AsmToken::getLocRange() const {
return SMRange(getLoc(), getEndLoc());
}
+
+void AsmToken::dump(raw_ostream &OS) const {
+ switch (Kind) {
+ case AsmToken::Error:
+ OS << "error";
+ break;
+ case AsmToken::Identifier:
+ OS << "identifier: " << getString();
+ break;
+ case AsmToken::Integer:
+ OS << "int: " << getString();
+ break;
+ case AsmToken::Real:
+ OS << "real: " << getString();
+ break;
+ case AsmToken::String:
+ OS << "string: " << getString();
+ break;
+
+ case AsmToken::Amp: OS << "Amp"; break;
+ case AsmToken::AmpAmp: OS << "AmpAmp"; break;
+ case AsmToken::At: OS << "At"; break;
+ case AsmToken::BackSlash: OS << "BackSlash"; break;
+ case AsmToken::BigNum: OS << "BigNum"; break;
+ case AsmToken::Caret: OS << "Caret"; break;
+ case AsmToken::Colon: OS << "Colon"; break;
+ case AsmToken::Comma: OS << "Comma"; break;
+ case AsmToken::Comment: OS << "Comment"; break;
+ case AsmToken::Dollar: OS << "Dollar"; break;
+ case AsmToken::Dot: OS << "Dot"; break;
+ case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
+ case AsmToken::Eof: OS << "Eof"; break;
+ case AsmToken::Equal: OS << "Equal"; break;
+ case AsmToken::EqualEqual: OS << "EqualEqual"; break;
+ case AsmToken::Exclaim: OS << "Exclaim"; break;
+ case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
+ case AsmToken::Greater: OS << "Greater"; break;
+ case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
+ case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
+ case AsmToken::Hash: OS << "Hash"; break;
+ case AsmToken::HashDirective: OS << "HashDirective"; break;
+ case AsmToken::LBrac: OS << "LBrac"; break;
+ case AsmToken::LCurly: OS << "LCurly"; break;
+ case AsmToken::LParen: OS << "LParen"; break;
+ case AsmToken::Less: OS << "Less"; break;
+ case AsmToken::LessEqual: OS << "LessEqual"; break;
+ case AsmToken::LessGreater: OS << "LessGreater"; break;
+ case AsmToken::LessLess: OS << "LessLess"; break;
+ case AsmToken::Minus: OS << "Minus"; break;
+ case AsmToken::Percent: OS << "Percent"; break;
+ case AsmToken::Pipe: OS << "Pipe"; break;
+ case AsmToken::PipePipe: OS << "PipePipe"; break;
+ case AsmToken::Plus: OS << "Plus"; break;
+ case AsmToken::RBrac: OS << "RBrac"; break;
+ case AsmToken::RCurly: OS << "RCurly"; break;
+ case AsmToken::RParen: OS << "RParen"; break;
+ case AsmToken::Slash: OS << "Slash"; break;
+ case AsmToken::Space: OS << "Space"; break;
+ case AsmToken::Star: OS << "Star"; break;
+ case AsmToken::Tilde: OS << "Tilde"; break;
+ case AsmToken::PercentCall16: OS << "PercentCall16"; break;
+ case AsmToken::PercentCall_Hi: OS << "PercentCall_Hi"; break;
+ case AsmToken::PercentCall_Lo: OS << "PercentCall_Lo"; break;
+ case AsmToken::PercentDtprel_Hi: OS << "PercentDtprel_Hi"; break;
+ case AsmToken::PercentDtprel_Lo: OS << "PercentDtprel_Lo"; break;
+ case AsmToken::PercentGot: OS << "PercentGot"; break;
+ case AsmToken::PercentGot_Disp: OS << "PercentGot_Disp"; break;
+ case AsmToken::PercentGot_Hi: OS << "PercentGot_Hi"; break;
+ case AsmToken::PercentGot_Lo: OS << "PercentGot_Lo"; break;
+ case AsmToken::PercentGot_Ofst: OS << "PercentGot_Ofst"; break;
+ case AsmToken::PercentGot_Page: OS << "PercentGot_Page"; break;
+ case AsmToken::PercentGottprel: OS << "PercentGottprel"; break;
+ case AsmToken::PercentGp_Rel: OS << "PercentGp_Rel"; break;
+ case AsmToken::PercentHi: OS << "PercentHi"; break;
+ case AsmToken::PercentHigher: OS << "PercentHigher"; break;
+ case AsmToken::PercentHighest: OS << "PercentHighest"; break;
+ case AsmToken::PercentLo: OS << "PercentLo"; break;
+ case AsmToken::PercentNeg: OS << "PercentNeg"; break;
+ case AsmToken::PercentPcrel_Hi: OS << "PercentPcrel_Hi"; break;
+ case AsmToken::PercentPcrel_Lo: OS << "PercentPcrel_Lo"; break;
+ case AsmToken::PercentTlsgd: OS << "PercentTlsgd"; break;
+ case AsmToken::PercentTlsldm: OS << "PercentTlsldm"; break;
+ case AsmToken::PercentTprel_Hi: OS << "PercentTprel_Hi"; break;
+ case AsmToken::PercentTprel_Lo: OS << "PercentTprel_Lo"; break;
+ }
+
+ // Print the token string.
+ OS << " (\"";
+ OS.write_escaped(getString());
+ OS << "\")";
+}
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
index 6a4c74cd57fe..d439734e76fc 100644
--- a/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
@@ -10,6 +10,7 @@
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -85,7 +86,6 @@ bool MCAsmParser::TokError(const Twine &Msg, SMRange Range) {
}
bool MCAsmParser::Error(SMLoc L, const Twine &Msg, SMRange Range) {
- HadError = true;
MCPendingError PErr;
PErr.Loc = L;
diff --git a/contrib/llvm/lib/MC/MCSchedule.cpp b/contrib/llvm/lib/MC/MCSchedule.cpp
index f3919427bf05..929bd7f6046c 100644
--- a/contrib/llvm/lib/MC/MCSchedule.cpp
+++ b/contrib/llvm/lib/MC/MCSchedule.cpp
@@ -12,6 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include <type_traits>
using namespace llvm;
@@ -31,4 +35,118 @@ const MCSchedModel MCSchedModel::Default = {DefaultIssueWidth,
nullptr,
0,
0,
+ nullptr,
nullptr};
+
+int MCSchedModel::computeInstrLatency(const MCSubtargetInfo &STI,
+ const MCSchedClassDesc &SCDesc) {
+ int Latency = 0;
+ for (unsigned DefIdx = 0, DefEnd = SCDesc.NumWriteLatencyEntries;
+ DefIdx != DefEnd; ++DefIdx) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ STI.getWriteLatencyEntry(&SCDesc, DefIdx);
+ // Early exit if we found an invalid latency.
+ if (WLEntry->Cycles < 0)
+ return WLEntry->Cycles;
+ Latency = std::max(Latency, static_cast<int>(WLEntry->Cycles));
+ }
+ return Latency;
+}
+
+int MCSchedModel::computeInstrLatency(const MCSubtargetInfo &STI,
+ unsigned SchedClass) const {
+ const MCSchedClassDesc &SCDesc = *getSchedClassDesc(SchedClass);
+ if (!SCDesc.isValid())
+ return 0;
+ if (!SCDesc.isVariant())
+ return MCSchedModel::computeInstrLatency(STI, SCDesc);
+
+ llvm_unreachable("unsupported variant scheduling class");
+}
+
+int MCSchedModel::computeInstrLatency(const MCSubtargetInfo &STI,
+ const MCInstrInfo &MCII,
+ const MCInst &Inst) const {
+ unsigned SchedClass = MCII.get(Inst.getOpcode()).getSchedClass();
+ const MCSchedClassDesc *SCDesc = getSchedClassDesc(SchedClass);
+ if (!SCDesc->isValid())
+ return 0;
+
+ unsigned CPUID = getProcessorID();
+ while (SCDesc->isVariant()) {
+ SchedClass = STI.resolveVariantSchedClass(SchedClass, &Inst, CPUID);
+ SCDesc = getSchedClassDesc(SchedClass);
+ }
+
+ if (SchedClass)
+ return MCSchedModel::computeInstrLatency(STI, *SCDesc);
+
+ llvm_unreachable("unsupported variant scheduling class");
+}
+
+double
+MCSchedModel::getReciprocalThroughput(const MCSubtargetInfo &STI,
+ const MCSchedClassDesc &SCDesc) {
+ Optional<double> Throughput;
+ const MCSchedModel &SM = STI.getSchedModel();
+ const MCWriteProcResEntry *I = STI.getWriteProcResBegin(&SCDesc);
+ const MCWriteProcResEntry *E = STI.getWriteProcResEnd(&SCDesc);
+ for (; I != E; ++I) {
+ if (!I->Cycles)
+ continue;
+ unsigned NumUnits = SM.getProcResource(I->ProcResourceIdx)->NumUnits;
+ double Temp = NumUnits * 1.0 / I->Cycles;
+ Throughput = Throughput ? std::min(Throughput.getValue(), Temp) : Temp;
+ }
+ if (Throughput.hasValue())
+ return 1.0 / Throughput.getValue();
+
+ // If no throughput value was calculated, assume that we can execute at the
+ // maximum issue width scaled by number of micro-ops for the schedule class.
+ return ((double)SCDesc.NumMicroOps) / SM.IssueWidth;
+}
+
+double
+MCSchedModel::getReciprocalThroughput(const MCSubtargetInfo &STI,
+ const MCInstrInfo &MCII,
+ const MCInst &Inst) const {
+ unsigned SchedClass = MCII.get(Inst.getOpcode()).getSchedClass();
+ const MCSchedClassDesc *SCDesc = getSchedClassDesc(SchedClass);
+
+ // If there's no valid class, assume that the instruction executes/completes
+ // at the maximum issue width.
+ if (!SCDesc->isValid())
+ return 1.0 / IssueWidth;
+
+ unsigned CPUID = getProcessorID();
+ while (SCDesc->isVariant()) {
+ SchedClass = STI.resolveVariantSchedClass(SchedClass, &Inst, CPUID);
+ SCDesc = getSchedClassDesc(SchedClass);
+ }
+
+ if (SchedClass)
+ return MCSchedModel::getReciprocalThroughput(STI, *SCDesc);
+
+ llvm_unreachable("unsupported variant scheduling class");
+}
+
+double
+MCSchedModel::getReciprocalThroughput(unsigned SchedClass,
+ const InstrItineraryData &IID) {
+ Optional<double> Throughput;
+ const InstrStage *I = IID.beginStage(SchedClass);
+ const InstrStage *E = IID.endStage(SchedClass);
+ for (; I != E; ++I) {
+ if (!I->getCycles())
+ continue;
+ double Temp = countPopulation(I->getUnits()) * 1.0 / I->getCycles();
+ Throughput = Throughput ? std::min(Throughput.getValue(), Temp) : Temp;
+ }
+ if (Throughput.hasValue())
+ return 1.0 / Throughput.getValue();
+
+ // If there are no execution resources specified for this class, then assume
+ // that it can execute at the maximum default issue width.
+ return 1.0 / DefaultIssueWidth;
+}
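
Minimal usage sketch for the new scheduling queries (illustrative, not from the diff); STI, MCII and Inst are assumed to be set up elsewhere:

    const MCSchedModel &SM = STI.getSchedModel();
    // Latency of the instruction's (resolved) scheduling class, in cycles.
    int Latency = SM.computeInstrLatency(STI, MCII, Inst);
    // Reciprocal throughput; smaller means the instruction can issue more often.
    double RThroughput = SM.getReciprocalThroughput(STI, MCII, Inst);
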
diff --git a/contrib/llvm/lib/MC/MCSection.cpp b/contrib/llvm/lib/MC/MCSection.cpp
index d141dd6627c4..97bc65387dd5 100644
--- a/contrib/llvm/lib/MC/MCSection.cpp
+++ b/contrib/llvm/lib/MC/MCSection.cpp
@@ -9,6 +9,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCSymbol.h"
diff --git a/contrib/llvm/lib/MC/MCSectionCOFF.cpp b/contrib/llvm/lib/MC/MCSectionCOFF.cpp
index 72a7fc36a460..c861963eec8a 100644
--- a/contrib/llvm/lib/MC/MCSectionCOFF.cpp
+++ b/contrib/llvm/lib/MC/MCSectionCOFF.cpp
@@ -69,35 +69,40 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
OS << '"';
if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
- OS << ",";
+ if (COMDATSymbol)
+ OS << ",";
+ else
+ OS << "\n\t.linkonce\t";
switch (Selection) {
case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES:
- OS << "one_only,";
+ OS << "one_only";
break;
case COFF::IMAGE_COMDAT_SELECT_ANY:
- OS << "discard,";
+ OS << "discard";
break;
case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE:
- OS << "same_size,";
+ OS << "same_size";
break;
case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH:
- OS << "same_contents,";
+ OS << "same_contents";
break;
case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE:
- OS << "associative,";
+ OS << "associative";
break;
case COFF::IMAGE_COMDAT_SELECT_LARGEST:
- OS << "largest,";
+ OS << "largest";
break;
case COFF::IMAGE_COMDAT_SELECT_NEWEST:
- OS << "newest,";
+ OS << "newest";
break;
default:
assert(false && "unsupported COFF selection type");
break;
}
- assert(COMDATSymbol);
- COMDATSymbol->print(OS, &MAI);
+ if (COMDATSymbol) {
+ OS << ",";
+ COMDATSymbol->print(OS, &MAI);
+ }
}
OS << '\n';
}
diff --git a/contrib/llvm/lib/MC/MCSectionELF.cpp b/contrib/llvm/lib/MC/MCSectionELF.cpp
index bf1fcb03273c..4d77d05cc505 100644
--- a/contrib/llvm/lib/MC/MCSectionELF.cpp
+++ b/contrib/llvm/lib/MC/MCSectionELF.cpp
@@ -148,6 +148,10 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
OS << "0x7000001e";
else if (Type == ELF::SHT_LLVM_ODRTAB)
OS << "llvm_odrtab";
+ else if (Type == ELF::SHT_LLVM_LINKER_OPTIONS)
+ OS << "llvm_linker_options";
+ else if (Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE)
+ OS << "llvm_call_graph_profile";
else
report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) +
" for section " + getSectionName());
diff --git a/contrib/llvm/lib/MC/MCStreamer.cpp b/contrib/llvm/lib/MC/MCStreamer.cpp
index db6f81deacf6..8dd4b61be68f 100644
--- a/contrib/llvm/lib/MC/MCStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCStreamer.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCStreamer.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -74,7 +75,8 @@ void MCTargetStreamer::emitValue(const MCExpr *Value) {
void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
MCStreamer::MCStreamer(MCContext &Ctx)
- : Context(Ctx), CurrentWinFrameInfo(nullptr) {
+ : Context(Ctx), CurrentWinFrameInfo(nullptr),
+ UseAssemblerInfoForParsing(false) {
SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
}
@@ -120,20 +122,16 @@ void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
EmitBytes(StringRef(buf, Size));
}
-/// EmitULEB128Value - Special case of EmitULEB128Value that avoids the
+/// EmitULEB128IntValue - Special case of EmitULEB128Value that avoids the
/// client having to pass in a MCExpr for constant integers.
-void MCStreamer::EmitPaddedULEB128IntValue(uint64_t Value, unsigned PadTo) {
+void MCStreamer::EmitULEB128IntValue(uint64_t Value) {
SmallString<128> Tmp;
raw_svector_ostream OSE(Tmp);
- encodeULEB128(Value, OSE, PadTo);
+ encodeULEB128(Value, OSE);
EmitBytes(OSE.str());
}
-void MCStreamer::EmitULEB128IntValue(uint64_t Value) {
- EmitPaddedULEB128IntValue(Value, 0);
-}
-
-/// EmitSLEB128Value - Special case of EmitSLEB128Value that avoids the
+/// EmitSLEB128IntValue - Special case of EmitSLEB128Value that avoids the
/// client having to pass in a MCExpr for constant integers.
void MCStreamer::EmitSLEB128IntValue(int64_t Value) {
SmallString<128> Tmp;
@@ -187,25 +185,28 @@ void MCStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
emitFill(*MCConstantExpr::create(NumBytes, getContext()), FillValue);
}
-void MCStreamer::emitFill(uint64_t NumValues, int64_t Size, int64_t Expr) {
- int64_t NonZeroSize = Size > 4 ? 4 : Size;
- Expr &= ~0ULL >> (64 - NonZeroSize * 8);
- for (uint64_t i = 0, e = NumValues; i != e; ++i) {
- EmitIntValue(Expr, NonZeroSize);
- if (NonZeroSize < Size)
- EmitIntValue(0, Size - NonZeroSize);
- }
-}
-
/// The implementation in this class just redirects to emitFill.
void MCStreamer::EmitZeros(uint64_t NumBytes) {
emitFill(NumBytes, 0);
}
-unsigned MCStreamer::EmitDwarfFileDirective(unsigned FileNo,
- StringRef Directory,
- StringRef Filename, unsigned CUID) {
- return getContext().getDwarfFile(Directory, Filename, FileNo, CUID);
+Expected<unsigned>
+MCStreamer::tryEmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+ StringRef Filename,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned CUID) {
+ return getContext().getDwarfFile(Directory, Filename, FileNo, Checksum,
+ Source, CUID);
+}
+
+void MCStreamer::emitDwarfFile0Directive(StringRef Directory,
+ StringRef Filename,
+ MD5::MD5Result *Checksum,
+ Optional<StringRef> Source,
+ unsigned CUID) {
+ getContext().setMCLineTableRootFile(CUID, Directory, Filename, Checksum,
+ Source);
}
void MCStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
@@ -660,6 +661,10 @@ void MCStreamer::EmitWinEHHandlerData(SMLoc Loc) {
getContext().reportError(Loc, "Chained unwind areas can't have handlers!");
}
+void MCStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To, uint64_t Count) {
+}
+
static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID,
MCSection *MainCFISec,
const MCSection *TextSec) {
@@ -668,16 +673,31 @@ static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID,
return MainCFISec;
const auto *TextSecCOFF = cast<MCSectionCOFF>(TextSec);
+ auto *MainCFISecCOFF = cast<MCSectionCOFF>(MainCFISec);
unsigned UniqueID = TextSecCOFF->getOrAssignWinCFISectionID(NextWinCFIID);
// If this section is COMDAT, this unwind section should be COMDAT associative
// with its group.
const MCSymbol *KeySym = nullptr;
- if (TextSecCOFF->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)
+ if (TextSecCOFF->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
KeySym = TextSecCOFF->getCOMDATSymbol();
- return Context.getAssociativeCOFFSection(cast<MCSectionCOFF>(MainCFISec),
- KeySym, UniqueID);
+ // In a GNU environment, we can't use associative comdats. Instead, do what
+    // GCC does, which is to make a plain comdat selectany section named like
+ // ".[px]data$_Z3foov".
+ if (!Context.getAsmInfo()->hasCOFFAssociativeComdats()) {
+ std::string SectionName =
+ (MainCFISecCOFF->getSectionName() + "$" +
+ TextSecCOFF->getSectionName().split('$').second)
+ .str();
+ return Context.getCOFFSection(
+ SectionName,
+ MainCFISecCOFF->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT,
+ MainCFISecCOFF->getKind(), "", COFF::IMAGE_COMDAT_SELECT_ANY);
+ }
+ }
+
+ return Context.getAssociativeCOFFSection(MainCFISecCOFF, KeySym, UniqueID);
}
MCSection *MCStreamer::getAssociatedPDataSection(const MCSection *TextSec) {
@@ -803,11 +823,15 @@ void MCStreamer::EmitWinCFIEndProlog(SMLoc Loc) {
void MCStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {
}
+void MCStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) {}
+
void MCStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) {
}
void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {}
+void MCStreamer::EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {}
+
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
@@ -826,10 +850,11 @@ void MCStreamer::EmitWindowsUnwindTables() {
}
void MCStreamer::Finish() {
- if (!DwarfFrameInfos.empty() && !DwarfFrameInfos.back().End)
- getContext().reportError(SMLoc(), "Unfinished frame!");
- if (!WinFrameInfos.empty() && !WinFrameInfos.back()->End)
+ if ((!DwarfFrameInfos.empty() && !DwarfFrameInfos.back().End) ||
+ (!WinFrameInfos.empty() && !WinFrameInfos.back()->End)) {
getContext().reportError(SMLoc(), "Unfinished frame!");
+ return;
+ }
MCTargetStreamer *TS = getTargetStreamer();
if (TS)
@@ -908,6 +933,16 @@ void MCStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
EmitSymbolValue(SetLabel, Size);
}
+void MCStreamer::emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi,
+ const MCSymbol *Lo) {
+ // Get the Hi-Lo expression.
+ const MCExpr *Diff =
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(Hi, Context),
+ MCSymbolRefExpr::create(Lo, Context), Context);
+
+ EmitULEB128Value(Diff);
+}
+
void MCStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {}
void MCStreamer::EmitThumbFunc(MCSymbol *Func) {}
void MCStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
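
To make the GNU fallback in getWinCFISection above concrete, a worked example with illustrative names (not from the diff):

    // MainCFISecCOFF->getSectionName() == ".xdata"
    // TextSecCOFF->getSectionName()    == ".text$_Z3foov"
    // No associative COMDATs available (e.g. MinGW), so the unwind section is
    //   ".xdata$_Z3foov", created with characteristics | IMAGE_SCN_LNK_COMDAT
    //   and selection IMAGE_COMDAT_SELECT_ANY.
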
diff --git a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
index 8b9b076382e2..f6167826fae2 100644
--- a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
+++ b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
@@ -51,8 +51,6 @@ MCSubtargetInfo::MCSubtargetInfo(
InitMCProcessorInfo(CPU, FS);
}
-/// ToggleFeature - Toggle a feature and returns the re-computed feature
-/// bits. This version does not change the implied bits.
FeatureBitset MCSubtargetInfo::ToggleFeature(uint64_t FB) {
FeatureBits.flip(FB);
return FeatureBits;
@@ -63,8 +61,6 @@ FeatureBitset MCSubtargetInfo::ToggleFeature(const FeatureBitset &FB) {
return FeatureBits;
}
-/// ToggleFeature - Toggle a feature and returns the re-computed feature
-/// bits. This version will also change all implied bits.
FeatureBitset MCSubtargetInfo::ToggleFeature(StringRef FS) {
SubtargetFeatures::ToggleFeature(FeatureBits, FS, ProcFeatures);
return FeatureBits;
@@ -118,7 +114,6 @@ MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths);
}
-/// Initialize an InstrItineraryData instance.
void MCSubtargetInfo::initInstrItins(InstrItineraryData &InstrItins) const {
InstrItins = InstrItineraryData(getSchedModel(), Stages, OperandCycles,
ForwardingPaths);
diff --git a/contrib/llvm/lib/MC/MCSymbol.cpp b/contrib/llvm/lib/MC/MCSymbol.cpp
index 9abaaef2fe84..5502c658f565 100644
--- a/contrib/llvm/lib/MC/MCSymbol.cpp
+++ b/contrib/llvm/lib/MC/MCSymbol.cpp
@@ -9,6 +9,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
diff --git a/contrib/llvm/lib/MC/MCValue.cpp b/contrib/llvm/lib/MC/MCValue.cpp
index 32a6adbf224e..7e03913aa680 100644
--- a/contrib/llvm/lib/MC/MCValue.cpp
+++ b/contrib/llvm/lib/MC/MCValue.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCValue.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp
index 301f30d4f6ec..59082a160caf 100644
--- a/contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp
+++ b/contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp
@@ -7,9 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWasmObjectWriter.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/MC/MCWasmStreamer.cpp b/contrib/llvm/lib/MC/MCWasmStreamer.cpp
index d9cefbd3994f..0e5932214047 100644
--- a/contrib/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCWasmStreamer.cpp
@@ -45,7 +45,8 @@ void MCWasmStreamer::mergeFragment(MCDataFragment *DF, MCDataFragment *EF) {
DF->getContents().size());
DF->getFixups().push_back(EF->getFixups()[i]);
}
- DF->setHasInstructions(true);
+ if (DF->getSubtargetInfo() == nullptr && EF->getSubtargetInfo())
+ DF->setHasInstructions(*EF->getSubtargetInfo());
DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
}
@@ -66,6 +67,7 @@ void MCWasmStreamer::ChangeSection(MCSection *Section,
Asm.registerSymbol(*Grp);
this->MCObjectStreamer::ChangeSection(Section, Subsection);
+ Asm.registerSymbol(*Section->getBeginSymbol());
}
void MCWasmStreamer::EmitWeakReference(MCSymbol *Alias,
@@ -81,9 +83,9 @@ bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
auto *Symbol = cast<MCSymbolWasm>(S);
- // Adding a symbol attribute always introduces the symbol, note that an
- // important side effect of calling registerSymbol here is to register
- // the symbol with the assembler.
+ // Adding a symbol attribute always introduces the symbol; note that an
+ // important side effect of calling registerSymbol here is to register the
+ // symbol with the assembler.
getAssembler().registerSymbol(*Symbol);
switch (Attribute) {
@@ -113,11 +115,11 @@ bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
break;
case MCSA_ELF_TypeFunction:
- Symbol->setIsFunction(true);
+ Symbol->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
break;
case MCSA_ELF_TypeObject:
- Symbol->setIsFunction(false);
+ Symbol->setType(wasm::WASM_SYMBOL_TYPE_DATA);
break;
default:
@@ -156,17 +158,8 @@ void MCWasmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
}
void MCWasmStreamer::EmitIdent(StringRef IdentString) {
- MCSection *Comment = getAssembler().getContext().getWasmSection(
- ".comment", SectionKind::getMetadata());
- PushSection();
- SwitchSection(Comment);
- if (!SeenIdent) {
- EmitIntValue(0, 1);
- SeenIdent = true;
- }
- EmitBytes(IdentString);
- EmitIntValue(0, 1);
- PopSection();
+  // TODO(sbc): Add the ident section once we support mergeable string
+  // sections in the object format.
}
void MCWasmStreamer::EmitInstToFragment(const MCInst &Inst,
@@ -191,7 +184,7 @@ void MCWasmStreamer::EmitInstToData(const MCInst &Inst,
Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
DF->getFixups().push_back(Fixups[i]);
}
- DF->setHasInstructions(true);
+ DF->setHasInstructions(STI);
DF->getContents().append(Code.begin(), Code.end());
}
@@ -203,11 +196,11 @@ void MCWasmStreamer::FinishImpl() {
MCStreamer *llvm::createWasmStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll) {
MCWasmStreamer *S =
- new MCWasmStreamer(Context, std::move(MAB), OS, std::move(CE));
+ new MCWasmStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
@@ -222,7 +215,8 @@ void MCWasmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
}
void MCWasmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment,
+ SMLoc Loc) {
llvm_unreachable("Wasm doesn't support this directive");
}
diff --git a/contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp b/contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 8582d9adafb8..7b1dc7abf708 100644
--- a/contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
@@ -44,8 +45,8 @@ using namespace llvm;
MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
std::unique_ptr<MCCodeEmitter> CE,
- raw_pwrite_stream &OS)
- : MCObjectStreamer(Context, std::move(MAB), OS, std::move(CE)),
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCObjectStreamer(Context, std::move(MAB), std::move(OW), std::move(CE)),
CurSymbol(nullptr) {}
void MCWinCOFFStreamer::EmitInstToData(const MCInst &Inst,
@@ -62,7 +63,7 @@ void MCWinCOFFStreamer::EmitInstToData(const MCInst &Inst,
Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
DF->getFixups().push_back(Fixups[i]);
}
-
+ DF->setHasInstructions(STI);
DF->getContents().append(Code.begin(), Code.end());
}
@@ -193,6 +194,17 @@ void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {
<< COFF::SCT_COMPLEX_TYPE_SHIFT);
}
+void MCWinCOFFStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) {
+ MCSection *Sec = getCurrentSectionOnly();
+ getAssembler().registerSection(*Sec);
+ if (Sec->getAlignment() < 4)
+ Sec->setAlignment(4);
+
+ new MCSymbolIdFragment(Symbol, getCurrentSectionOnly());
+
+ getAssembler().registerSymbol(*Symbol);
+}
+
void MCWinCOFFStreamer::EmitCOFFSectionIndex(const MCSymbol *Symbol) {
visitUsedSymbol(*Symbol);
MCDataFragment *DF = getOrCreateDataFragment();
@@ -220,6 +232,25 @@ void MCWinCOFFStreamer::EmitCOFFSecRel32(const MCSymbol *Symbol,
DF->getContents().resize(DF->getContents().size() + 4, 0);
}
+void MCWinCOFFStreamer::EmitCOFFImgRel32(const MCSymbol *Symbol,
+ int64_t Offset) {
+ visitUsedSymbol(*Symbol);
+ MCDataFragment *DF = getOrCreateDataFragment();
+ // Create Symbol A for the relocation relative reference.
+ const MCExpr *MCE = MCSymbolRefExpr::create(
+ Symbol, MCSymbolRefExpr::VK_COFF_IMGREL32, getContext());
+ // Add the constant offset, if given.
+ if (Offset)
+ MCE = MCBinaryExpr::createAdd(
+ MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ // Build the imgrel relocation.
+ MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
+ // Record the relocation.
+ DF->getFixups().push_back(Fixup);
+ // Emit 4 bytes (zeros) to the object file.
+ DF->getContents().resize(DF->getContents().size() + 4, 0);
+}
+
void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
auto *Symbol = cast<MCSymbolCOFF>(S);
@@ -267,7 +298,8 @@ void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
}
void MCWinCOFFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment,
+ SMLoc Loc) {
llvm_unreachable("not implemented");
}
diff --git a/contrib/llvm/lib/MC/MachObjectWriter.cpp b/contrib/llvm/lib/MC/MachObjectWriter.cpp
index c7eaa76ace3c..a464af1d42a7 100644
--- a/contrib/llvm/lib/MC/MachObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/MachObjectWriter.cpp
@@ -141,24 +141,29 @@ void MachObjectWriter::writeHeader(MachO::HeaderFileType Type,
// struct mach_header (28 bytes) or
// struct mach_header_64 (32 bytes)
- uint64_t Start = getStream().tell();
+ uint64_t Start = W.OS.tell();
(void) Start;
- write32(is64Bit() ? MachO::MH_MAGIC_64 : MachO::MH_MAGIC);
+ W.write<uint32_t>(is64Bit() ? MachO::MH_MAGIC_64 : MachO::MH_MAGIC);
- write32(TargetObjectWriter->getCPUType());
- write32(TargetObjectWriter->getCPUSubtype());
+ W.write<uint32_t>(TargetObjectWriter->getCPUType());
+ W.write<uint32_t>(TargetObjectWriter->getCPUSubtype());
- write32(Type);
- write32(NumLoadCommands);
- write32(LoadCommandsSize);
- write32(Flags);
+ W.write<uint32_t>(Type);
+ W.write<uint32_t>(NumLoadCommands);
+ W.write<uint32_t>(LoadCommandsSize);
+ W.write<uint32_t>(Flags);
if (is64Bit())
- write32(0); // reserved
+ W.write<uint32_t>(0); // reserved
- assert(
- getStream().tell() - Start ==
- (is64Bit() ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header)));
+ assert(W.OS.tell() - Start == (is64Bit() ? sizeof(MachO::mach_header_64)
+ : sizeof(MachO::mach_header)));
+}
+
+void MachObjectWriter::writeWithPadding(StringRef Str, uint64_t Size) {
+ assert(Size >= Str.size());
+ W.OS << Str;
+ W.OS.write_zeros(Size - Str.size());
}
/// writeSegmentLoadCommand - Write a segment load command.
@@ -172,38 +177,37 @@ void MachObjectWriter::writeSegmentLoadCommand(
// struct segment_command (56 bytes) or
// struct segment_command_64 (72 bytes)
- uint64_t Start = getStream().tell();
+ uint64_t Start = W.OS.tell();
(void) Start;
unsigned SegmentLoadCommandSize =
is64Bit() ? sizeof(MachO::segment_command_64):
sizeof(MachO::segment_command);
- write32(is64Bit() ? MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT);
- write32(SegmentLoadCommandSize +
+ W.write<uint32_t>(is64Bit() ? MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT);
+ W.write<uint32_t>(SegmentLoadCommandSize +
NumSections * (is64Bit() ? sizeof(MachO::section_64) :
sizeof(MachO::section)));
- assert(Name.size() <= 16);
- writeBytes(Name, 16);
+ writeWithPadding(Name, 16);
if (is64Bit()) {
- write64(VMAddr); // vmaddr
- write64(VMSize); // vmsize
- write64(SectionDataStartOffset); // file offset
- write64(SectionDataSize); // file size
+ W.write<uint64_t>(VMAddr); // vmaddr
+ W.write<uint64_t>(VMSize); // vmsize
+ W.write<uint64_t>(SectionDataStartOffset); // file offset
+ W.write<uint64_t>(SectionDataSize); // file size
} else {
- write32(VMAddr); // vmaddr
- write32(VMSize); // vmsize
- write32(SectionDataStartOffset); // file offset
- write32(SectionDataSize); // file size
+ W.write<uint32_t>(VMAddr); // vmaddr
+ W.write<uint32_t>(VMSize); // vmsize
+ W.write<uint32_t>(SectionDataStartOffset); // file offset
+ W.write<uint32_t>(SectionDataSize); // file size
}
// maxprot
- write32(MaxProt);
+ W.write<uint32_t>(MaxProt);
// initprot
- write32(InitProt);
- write32(NumSections);
- write32(0); // flags
+ W.write<uint32_t>(InitProt);
+ W.write<uint32_t>(NumSections);
+ W.write<uint32_t>(0); // flags
- assert(getStream().tell() - Start == SegmentLoadCommandSize);
+ assert(W.OS.tell() - Start == SegmentLoadCommandSize);
}
void MachObjectWriter::writeSection(const MCAsmLayout &Layout,
@@ -223,31 +227,31 @@ void MachObjectWriter::writeSection(const MCAsmLayout &Layout,
// struct section (68 bytes) or
// struct section_64 (80 bytes)
- uint64_t Start = getStream().tell();
+ uint64_t Start = W.OS.tell();
(void) Start;
- writeBytes(Section.getSectionName(), 16);
- writeBytes(Section.getSegmentName(), 16);
+ writeWithPadding(Section.getSectionName(), 16);
+ writeWithPadding(Section.getSegmentName(), 16);
if (is64Bit()) {
- write64(VMAddr); // address
- write64(SectionSize); // size
+ W.write<uint64_t>(VMAddr); // address
+ W.write<uint64_t>(SectionSize); // size
} else {
- write32(VMAddr); // address
- write32(SectionSize); // size
+ W.write<uint32_t>(VMAddr); // address
+ W.write<uint32_t>(SectionSize); // size
}
- write32(FileOffset);
+ W.write<uint32_t>(FileOffset);
assert(isPowerOf2_32(Section.getAlignment()) && "Invalid alignment!");
- write32(Log2_32(Section.getAlignment()));
- write32(NumRelocations ? RelocationsStart : 0);
- write32(NumRelocations);
- write32(Flags);
- write32(IndirectSymBase.lookup(&Sec)); // reserved1
- write32(Section.getStubSize()); // reserved2
+ W.write<uint32_t>(Log2_32(Section.getAlignment()));
+ W.write<uint32_t>(NumRelocations ? RelocationsStart : 0);
+ W.write<uint32_t>(NumRelocations);
+ W.write<uint32_t>(Flags);
+ W.write<uint32_t>(IndirectSymBase.lookup(&Sec)); // reserved1
+ W.write<uint32_t>(Section.getStubSize()); // reserved2
if (is64Bit())
- write32(0); // reserved3
+ W.write<uint32_t>(0); // reserved3
- assert(getStream().tell() - Start ==
+ assert(W.OS.tell() - Start ==
(is64Bit() ? sizeof(MachO::section_64) : sizeof(MachO::section)));
}
@@ -257,17 +261,17 @@ void MachObjectWriter::writeSymtabLoadCommand(uint32_t SymbolOffset,
uint32_t StringTableSize) {
// struct symtab_command (24 bytes)
- uint64_t Start = getStream().tell();
+ uint64_t Start = W.OS.tell();
(void) Start;
- write32(MachO::LC_SYMTAB);
- write32(sizeof(MachO::symtab_command));
- write32(SymbolOffset);
- write32(NumSymbols);
- write32(StringTableOffset);
- write32(StringTableSize);
+ W.write<uint32_t>(MachO::LC_SYMTAB);
+ W.write<uint32_t>(sizeof(MachO::symtab_command));
+ W.write<uint32_t>(SymbolOffset);
+ W.write<uint32_t>(NumSymbols);
+ W.write<uint32_t>(StringTableOffset);
+ W.write<uint32_t>(StringTableSize);
- assert(getStream().tell() - Start == sizeof(MachO::symtab_command));
+ assert(W.OS.tell() - Start == sizeof(MachO::symtab_command));
}
void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol,
@@ -280,31 +284,31 @@ void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol,
uint32_t NumIndirectSymbols) {
// struct dysymtab_command (80 bytes)
- uint64_t Start = getStream().tell();
+ uint64_t Start = W.OS.tell();
(void) Start;
- write32(MachO::LC_DYSYMTAB);
- write32(sizeof(MachO::dysymtab_command));
- write32(FirstLocalSymbol);
- write32(NumLocalSymbols);
- write32(FirstExternalSymbol);
- write32(NumExternalSymbols);
- write32(FirstUndefinedSymbol);
- write32(NumUndefinedSymbols);
- write32(0); // tocoff
- write32(0); // ntoc
- write32(0); // modtaboff
- write32(0); // nmodtab
- write32(0); // extrefsymoff
- write32(0); // nextrefsyms
- write32(IndirectSymbolOffset);
- write32(NumIndirectSymbols);
- write32(0); // extreloff
- write32(0); // nextrel
- write32(0); // locreloff
- write32(0); // nlocrel
-
- assert(getStream().tell() - Start == sizeof(MachO::dysymtab_command));
+ W.write<uint32_t>(MachO::LC_DYSYMTAB);
+ W.write<uint32_t>(sizeof(MachO::dysymtab_command));
+ W.write<uint32_t>(FirstLocalSymbol);
+ W.write<uint32_t>(NumLocalSymbols);
+ W.write<uint32_t>(FirstExternalSymbol);
+ W.write<uint32_t>(NumExternalSymbols);
+ W.write<uint32_t>(FirstUndefinedSymbol);
+ W.write<uint32_t>(NumUndefinedSymbols);
+ W.write<uint32_t>(0); // tocoff
+ W.write<uint32_t>(0); // ntoc
+ W.write<uint32_t>(0); // modtaboff
+ W.write<uint32_t>(0); // nmodtab
+ W.write<uint32_t>(0); // extrefsymoff
+ W.write<uint32_t>(0); // nextrefsyms
+ W.write<uint32_t>(IndirectSymbolOffset);
+ W.write<uint32_t>(NumIndirectSymbols);
+ W.write<uint32_t>(0); // extreloff
+ W.write<uint32_t>(0); // nextrel
+ W.write<uint32_t>(0); // locreloff
+ W.write<uint32_t>(0); // nlocrel
+
+ assert(W.OS.tell() - Start == sizeof(MachO::dysymtab_command));
}
MachObjectWriter::MachSymbolData *
@@ -384,33 +388,33 @@ void MachObjectWriter::writeNlist(MachSymbolData &MSD,
// struct nlist (12 bytes)
- write32(MSD.StringIndex);
- write8(Type);
- write8(SectionIndex);
+ W.write<uint32_t>(MSD.StringIndex);
+ W.OS << char(Type);
+ W.OS << char(SectionIndex);
// The Mach-O streamer uses the lowest 16-bits of the flags for the 'desc'
// value.
bool EncodeAsAltEntry =
IsAlias && cast<MCSymbolMachO>(OrigSymbol).isAltEntry();
- write16(cast<MCSymbolMachO>(Symbol)->getEncodedFlags(EncodeAsAltEntry));
+ W.write<uint16_t>(cast<MCSymbolMachO>(Symbol)->getEncodedFlags(EncodeAsAltEntry));
if (is64Bit())
- write64(Address);
+ W.write<uint64_t>(Address);
else
- write32(Address);
+ W.write<uint32_t>(Address);
}
void MachObjectWriter::writeLinkeditLoadCommand(uint32_t Type,
uint32_t DataOffset,
uint32_t DataSize) {
- uint64_t Start = getStream().tell();
+ uint64_t Start = W.OS.tell();
(void) Start;
- write32(Type);
- write32(sizeof(MachO::linkedit_data_command));
- write32(DataOffset);
- write32(DataSize);
+ W.write<uint32_t>(Type);
+ W.write<uint32_t>(sizeof(MachO::linkedit_data_command));
+ W.write<uint32_t>(DataOffset);
+ W.write<uint32_t>(DataSize);
- assert(getStream().tell() - Start == sizeof(MachO::linkedit_data_command));
+ assert(W.OS.tell() - Start == sizeof(MachO::linkedit_data_command));
}
static unsigned ComputeLinkerOptionsLoadCommandSize(
@@ -426,23 +430,23 @@ void MachObjectWriter::writeLinkerOptionsLoadCommand(
const std::vector<std::string> &Options)
{
unsigned Size = ComputeLinkerOptionsLoadCommandSize(Options, is64Bit());
- uint64_t Start = getStream().tell();
+ uint64_t Start = W.OS.tell();
(void) Start;
- write32(MachO::LC_LINKER_OPTION);
- write32(Size);
- write32(Options.size());
+ W.write<uint32_t>(MachO::LC_LINKER_OPTION);
+ W.write<uint32_t>(Size);
+ W.write<uint32_t>(Options.size());
uint64_t BytesWritten = sizeof(MachO::linker_option_command);
for (const std::string &Option : Options) {
// Write each string, including the null byte.
- writeBytes(Option, Option.size() + 1);
+ W.OS << Option << '\0';
BytesWritten += Option.size() + 1;
}
// Pad to a multiple of the pointer size.
- writeBytes("", OffsetToAlignment(BytesWritten, is64Bit() ? 8 : 4));
+ W.OS.write_zeros(OffsetToAlignment(BytesWritten, is64Bit() ? 8 : 4));
- assert(getStream().tell() - Start == Size);
+ assert(W.OS.tell() - Start == Size);
}
void MachObjectWriter::recordRelocation(MCAssembler &Asm,
@@ -593,8 +597,8 @@ void MachObjectWriter::computeSymbolTable(
}
// External and undefined symbols are required to be in lexicographic order.
- std::sort(ExternalSymbolData.begin(), ExternalSymbolData.end());
- std::sort(UndefinedSymbolData.begin(), UndefinedSymbolData.end());
+ llvm::sort(ExternalSymbolData.begin(), ExternalSymbolData.end());
+ llvm::sort(UndefinedSymbolData.begin(), UndefinedSymbolData.end());
// Set the symbol indices.
Index = 0;
@@ -611,7 +615,7 @@ void MachObjectWriter::computeSymbolTable(
// Set the Index and the IsExtern bit.
unsigned Index = Rel.Sym->getIndex();
assert(isInt<24>(Index));
- if (IsLittleEndian)
+ if (W.Endian == support::little)
Rel.MRE.r_word1 = (Rel.MRE.r_word1 & (~0U << 24)) | Index | (1 << 27);
else
Rel.MRE.r_word1 = (Rel.MRE.r_word1 & 0xff) | Index << 8 | (1 << 4);
@@ -731,8 +735,10 @@ static MachO::LoadCommandType getLCFromMCVM(MCVersionMinType Type) {
llvm_unreachable("Invalid mc version min type");
}
-void MachObjectWriter::writeObject(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
+uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ uint64_t StartOffset = W.OS.tell();
+
// Compute symbol table information and bind symbol indices.
computeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
UndefinedSymbolData);
@@ -847,19 +853,19 @@ void MachObjectWriter::writeObject(MCAssembler &Asm,
(VersionInfo.Major << 16);
if (VersionInfo.EmitBuildVersion) {
// FIXME: Currently empty tools. Add clang version in the future.
- write32(MachO::LC_BUILD_VERSION);
- write32(sizeof(MachO::build_version_command));
- write32(VersionInfo.TypeOrPlatform.Platform);
- write32(EncodedVersion);
- write32(0); // SDK version.
- write32(0); // Empty tools list.
+ W.write<uint32_t>(MachO::LC_BUILD_VERSION);
+ W.write<uint32_t>(sizeof(MachO::build_version_command));
+ W.write<uint32_t>(VersionInfo.TypeOrPlatform.Platform);
+ W.write<uint32_t>(EncodedVersion);
+ W.write<uint32_t>(0); // SDK version.
+ W.write<uint32_t>(0); // Empty tools list.
} else {
MachO::LoadCommandType LCType
= getLCFromMCVM(VersionInfo.TypeOrPlatform.Type);
- write32(LCType);
- write32(sizeof(MachO::version_min_command));
- write32(EncodedVersion);
- write32(0); // reserved.
+ W.write<uint32_t>(LCType);
+ W.write<uint32_t>(sizeof(MachO::version_min_command));
+ W.write<uint32_t>(EncodedVersion);
+ W.write<uint32_t>(0); // reserved.
}
}
@@ -919,14 +925,14 @@ void MachObjectWriter::writeObject(MCAssembler &Asm,
// Write the actual section data.
for (const MCSection &Sec : Asm) {
- Asm.writeSectionData(&Sec, Layout);
+ Asm.writeSectionData(W.OS, &Sec, Layout);
uint64_t Pad = getPaddingSize(&Sec, Layout);
- WriteZeros(Pad);
+ W.OS.write_zeros(Pad);
}
// Write the extra padding.
- WriteZeros(SectionDataPadding);
+ W.OS.write_zeros(SectionDataPadding);
// Write the relocation entries.
for (const MCSection &Sec : Asm) {
@@ -934,8 +940,8 @@ void MachObjectWriter::writeObject(MCAssembler &Asm,
// (approximately, the exact algorithm is more complicated than this).
std::vector<RelAndSymbol> &Relocs = Relocations[&Sec];
for (const RelAndSymbol &Rel : make_range(Relocs.rbegin(), Relocs.rend())) {
- write32(Rel.MRE.r_word0);
- write32(Rel.MRE.r_word1);
+ W.write<uint32_t>(Rel.MRE.r_word0);
+ W.write<uint32_t>(Rel.MRE.r_word1);
}
}
@@ -945,26 +951,31 @@ void MachObjectWriter::writeObject(MCAssembler &Asm,
it != ie; ++it) {
const DataRegionData *Data = &(*it);
uint64_t Start = getSymbolAddress(*Data->Start, Layout);
- uint64_t End = getSymbolAddress(*Data->End, Layout);
- DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind
- << " start: " << Start << "(" << Data->Start->getName() << ")"
- << " end: " << End << "(" << Data->End->getName() << ")"
- << " size: " << End - Start
- << "\n");
- write32(Start);
- write16(End - Start);
- write16(Data->Kind);
+ uint64_t End;
+ if (Data->End)
+ End = getSymbolAddress(*Data->End, Layout);
+ else
+ report_fatal_error("Data region not terminated");
+
+ LLVM_DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind
+ << " start: " << Start << "(" << Data->Start->getName()
+ << ")"
+ << " end: " << End << "(" << Data->End->getName() << ")"
+ << " size: " << End - Start << "\n");
+ W.write<uint32_t>(Start);
+ W.write<uint16_t>(End - Start);
+ W.write<uint16_t>(Data->Kind);
}
// Write out the loh commands, if there is one.
if (LOHSize) {
#ifndef NDEBUG
- unsigned Start = getStream().tell();
+ unsigned Start = W.OS.tell();
#endif
Asm.getLOHContainer().emit(*this, Layout);
// Pad to a multiple of the pointer size.
- writeBytes("", OffsetToAlignment(LOHRawSize, is64Bit() ? 8 : 4));
- assert(getStream().tell() - Start == LOHSize);
+ W.OS.write_zeros(OffsetToAlignment(LOHRawSize, is64Bit() ? 8 : 4));
+ assert(W.OS.tell() - Start == LOHSize);
}
// Write the symbol table data, if used.
@@ -983,12 +994,12 @@ void MachObjectWriter::writeObject(MCAssembler &Asm,
uint32_t Flags = MachO::INDIRECT_SYMBOL_LOCAL;
if (it->Symbol->isAbsolute())
Flags |= MachO::INDIRECT_SYMBOL_ABS;
- write32(Flags);
+ W.write<uint32_t>(Flags);
continue;
}
}
- write32(it->Symbol->getIndex());
+ W.write<uint32_t>(it->Symbol->getIndex());
}
// FIXME: Check that offsets match computed ones.
@@ -1000,8 +1011,10 @@ void MachObjectWriter::writeObject(MCAssembler &Asm,
writeNlist(Entry, Layout);
// Write the string table.
- StringTable.write(getStream());
+ StringTable.write(W.OS);
}
+
+ return W.OS.tell() - StartOffset;
}
std::unique_ptr<MCObjectWriter>
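
The MachObjectWriter changes are largely a mechanical move from the old write32()/writeBytes() members to the endian-aware support::endian::Writer; a self-contained sketch of the pattern (illustrative, not from the diff):

    #include "llvm/Support/EndianStream.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void writeMagic(raw_ostream &OS, bool Is64Bit) {
      support::endian::Writer W(OS, support::little);
      W.write<uint32_t>(Is64Bit ? 0xfeedfacfU : 0xfeedfaceU); // MH_MAGIC(_64)
      W.OS.write_zeros(4); // e.g. the reserved field of the 64-bit header
    }
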
diff --git a/contrib/llvm/lib/MC/StringTableBuilder.cpp b/contrib/llvm/lib/MC/StringTableBuilder.cpp
index 531bc930c89b..de40a7728d3f 100644
--- a/contrib/llvm/lib/MC/StringTableBuilder.cpp
+++ b/contrib/llvm/lib/MC/StringTableBuilder.cpp
@@ -31,6 +31,7 @@ void StringTableBuilder::initSize() {
// correct.
switch (K) {
case RAW:
+ case DWARF:
Size = 0;
break;
case MachO:
@@ -116,6 +117,7 @@ tailcall:
}
void StringTableBuilder::finalize() {
+ assert(K != DWARF);
finalizeStringTable(/*Optimize=*/true);
}
diff --git a/contrib/llvm/lib/MC/SubtargetFeature.cpp b/contrib/llvm/lib/MC/SubtargetFeature.cpp
index b68e88ca5725..b69af24b531e 100644
--- a/contrib/llvm/lib/MC/SubtargetFeature.cpp
+++ b/contrib/llvm/lib/MC/SubtargetFeature.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
diff --git a/contrib/llvm/lib/MC/WasmObjectWriter.cpp b/contrib/llvm/lib/MC/WasmObjectWriter.cpp
index 66236e3abfab..5a979d36e81b 100644
--- a/contrib/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/WasmObjectWriter.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
@@ -38,14 +39,21 @@ using namespace llvm;
namespace {
+// When we create the indirect function table we start at 1, so that there is
+// an empty slot at 0 and therefore calling a null function pointer will trap.
+static const uint32_t kInitialTableOffset = 1;
+
// For patching purposes, we need to remember where each section starts, both
// for patching up the section size field, and for patching up references to
// locations within the section.
struct SectionBookkeeping {
// Where the size of the section is written.
uint64_t SizeOffset;
- // Where the contents of the section starts (after the header).
+ // Where the section header ends (without custom section name).
+ uint64_t PayloadOffset;
+ // Where the contents of the section starts.
uint64_t ContentsOffset;
+ uint32_t Index;
};
// The signature of a wasm function, in a struct capable of being used as a
@@ -107,35 +115,24 @@ struct WasmDataSegment {
SmallVector<char, 4> Data;
};
-// A wasm import to be written into the import section.
-struct WasmImport {
- StringRef ModuleName;
- StringRef FieldName;
- unsigned Kind;
- int32_t Type;
- bool IsMutable;
-};
-
// A wasm function to be written into the function section.
struct WasmFunction {
int32_t Type;
const MCSymbolWasm *Sym;
};
-// A wasm export to be written into the export section.
-struct WasmExport {
- StringRef FieldName;
- unsigned Kind;
- uint32_t Index;
-};
-
// A wasm global to be written into the global section.
struct WasmGlobal {
- wasm::ValType Type;
- bool IsMutable;
- bool HasImport;
+ wasm::WasmGlobalType Type;
uint64_t InitialValue;
- uint32_t ImportIndex;
+};
+
+// Information about a single item which is part of a COMDAT. For each data
+// segment or function which is in the COMDAT, there is a corresponding
+// WasmComdatEntry.
+struct WasmComdatEntry {
+ unsigned Kind;
+ uint32_t Index;
};
// Information about a single relocation.
@@ -157,6 +154,8 @@ struct WasmRelocationEntry {
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
+ case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32:
+ case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32:
return true;
default:
return false;
@@ -164,8 +163,8 @@ struct WasmRelocationEntry {
}
void print(raw_ostream &Out) const {
- Out << "Off=" << Offset << ", Sym=" << *Symbol << ", Addend=" << Addend
- << ", Type=" << Type
+ Out << wasm::relocTypetoString(Type)
+ << " Off=" << Offset << ", Sym=" << *Symbol << ", Addend=" << Addend
<< ", FixupSection=" << FixupSection->getSectionName();
}
@@ -174,6 +173,21 @@ struct WasmRelocationEntry {
#endif
};
+static const uint32_t INVALID_INDEX = -1;
+
+struct WasmCustomSection {
+
+ StringRef Name;
+ MCSectionWasm *Section;
+
+ uint32_t OutputContentsOffset;
+ uint32_t OutputIndex;
+
+ WasmCustomSection(StringRef Name, MCSectionWasm *Section)
+ : Name(Name), Section(Section), OutputContentsOffset(0),
+ OutputIndex(INVALID_INDEX) {}
+};
+
#if !defined(NDEBUG)
raw_ostream &operator<<(raw_ostream &OS, const WasmRelocationEntry &Rel) {
Rel.print(OS);
@@ -182,38 +196,48 @@ raw_ostream &operator<<(raw_ostream &OS, const WasmRelocationEntry &Rel) {
#endif
class WasmObjectWriter : public MCObjectWriter {
- /// Helper struct for containing some precomputed information on symbols.
- struct WasmSymbolData {
- const MCSymbolWasm *Symbol;
- StringRef Name;
-
- // Support lexicographic sorting.
- bool operator<(const WasmSymbolData &RHS) const { return Name < RHS.Name; }
- };
+ support::endian::Writer W;
/// The target specific Wasm writer instance.
std::unique_ptr<MCWasmObjectTargetWriter> TargetObjectWriter;
// Relocations for fixing up references in the code section.
std::vector<WasmRelocationEntry> CodeRelocations;
+ uint32_t CodeSectionIndex;
// Relocations for fixing up references in the data section.
std::vector<WasmRelocationEntry> DataRelocations;
+ uint32_t DataSectionIndex;
// Index values to use for fixing up call_indirect type indices.
// Maps function symbols to the index of the type of the function
DenseMap<const MCSymbolWasm *, uint32_t> TypeIndices;
// Maps function symbols to the table element index space. Used
// for TABLE_INDEX relocation types (i.e. address taken functions).
- DenseMap<const MCSymbolWasm *, uint32_t> IndirectSymbolIndices;
- // Maps function/global symbols to the function/global index space.
- DenseMap<const MCSymbolWasm *, uint32_t> SymbolIndices;
+ DenseMap<const MCSymbolWasm *, uint32_t> TableIndices;
+ // Maps function/global symbols to the function/global/section index space.
+ DenseMap<const MCSymbolWasm *, uint32_t> WasmIndices;
+ // Maps data symbols to the Wasm segment and offset/size within the segment.
+ DenseMap<const MCSymbolWasm *, wasm::WasmDataReference> DataLocations;
+
+ // Stores output data (index, relocations, content offset) for custom
+ // sections.
+ std::vector<WasmCustomSection> CustomSections;
+ // Relocations for fixing up references in the custom sections.
+ DenseMap<const MCSectionWasm *, std::vector<WasmRelocationEntry>>
+ CustomSectionsRelocations;
+
+ // Map from section to defining function symbol.
+ DenseMap<const MCSection *, const MCSymbol *> SectionFunctions;
DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo>
FunctionTypeIndices;
SmallVector<WasmFunctionType, 4> FunctionTypes;
SmallVector<WasmGlobal, 4> Globals;
+ SmallVector<WasmDataSegment, 4> DataSegments;
+ unsigned NumFunctionImports = 0;
unsigned NumGlobalImports = 0;
+ uint32_t SectionCount = 0;
// TargetObjectWriter wrappers.
bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
@@ -221,30 +245,34 @@ class WasmObjectWriter : public MCObjectWriter {
return TargetObjectWriter->getRelocType(Target, Fixup);
}
- void startSection(SectionBookkeeping &Section, unsigned SectionId,
- const char *Name = nullptr);
+ void startSection(SectionBookkeeping &Section, unsigned SectionId);
+ void startCustomSection(SectionBookkeeping &Section, StringRef Name);
void endSection(SectionBookkeeping &Section);
public:
WasmObjectWriter(std::unique_ptr<MCWasmObjectTargetWriter> MOTW,
raw_pwrite_stream &OS)
- : MCObjectWriter(OS, /*IsLittleEndian=*/true),
- TargetObjectWriter(std::move(MOTW)) {}
+ : W(OS, support::little), TargetObjectWriter(std::move(MOTW)) {}
-private:
~WasmObjectWriter() override;
+private:
void reset() override {
CodeRelocations.clear();
DataRelocations.clear();
TypeIndices.clear();
- SymbolIndices.clear();
- IndirectSymbolIndices.clear();
+ WasmIndices.clear();
+ TableIndices.clear();
+ DataLocations.clear();
+ CustomSectionsRelocations.clear();
FunctionTypeIndices.clear();
FunctionTypes.clear();
Globals.clear();
- MCObjectWriter::reset();
+ DataSegments.clear();
+ SectionFunctions.clear();
+ NumFunctionImports = 0;
NumGlobalImports = 0;
+ MCObjectWriter::reset();
}
void writeHeader(const MCAssembler &Asm);
@@ -256,45 +284,46 @@ private:
void executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) override;
- void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+ uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
void writeString(const StringRef Str) {
- encodeULEB128(Str.size(), getStream());
- writeBytes(Str);
+ encodeULEB128(Str.size(), W.OS);
+ W.OS << Str;
}
void writeValueType(wasm::ValType Ty) {
- encodeSLEB128(int32_t(Ty), getStream());
+ W.OS << static_cast<char>(Ty);
}
void writeTypeSection(ArrayRef<WasmFunctionType> FunctionTypes);
- void writeImportSection(ArrayRef<WasmImport> Imports, uint32_t DataSize,
+ void writeImportSection(ArrayRef<wasm::WasmImport> Imports, uint32_t DataSize,
uint32_t NumElements);
void writeFunctionSection(ArrayRef<WasmFunction> Functions);
void writeGlobalSection();
- void writeExportSection(ArrayRef<WasmExport> Exports);
+ void writeExportSection(ArrayRef<wasm::WasmExport> Exports);
void writeElemSection(ArrayRef<uint32_t> TableElems);
void writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout,
ArrayRef<WasmFunction> Functions);
- void writeDataSection(ArrayRef<WasmDataSegment> Segments);
- void writeNameSection(ArrayRef<WasmFunction> Functions,
- ArrayRef<WasmImport> Imports,
- uint32_t NumFuncImports);
- void writeCodeRelocSection();
- void writeDataRelocSection();
+ void writeDataSection();
+ void writeRelocSection(uint32_t SectionIndex, StringRef Name,
+ ArrayRef<WasmRelocationEntry> Relocations);
void writeLinkingMetaDataSection(
- ArrayRef<WasmDataSegment> Segments, uint32_t DataSize,
- const SmallVector<std::pair<StringRef, uint32_t>, 4> &SymbolFlags,
- const SmallVector<std::pair<uint16_t, uint32_t>, 2> &InitFuncs);
+ ArrayRef<wasm::WasmSymbolInfo> SymbolInfos,
+ ArrayRef<std::pair<uint16_t, uint32_t>> InitFuncs,
+ const std::map<StringRef, std::vector<WasmComdatEntry>> &Comdats);
+ void writeCustomSections(const MCAssembler &Asm, const MCAsmLayout &Layout);
+ void writeCustomRelocSections();
+ void
+ updateCustomSectionRelocations(const SmallVector<WasmFunction, 4> &Functions,
+ const MCAsmLayout &Layout);
uint32_t getProvisionalValue(const WasmRelocationEntry &RelEntry);
void applyRelocations(ArrayRef<WasmRelocationEntry> Relocations,
uint64_t ContentsOffset);
- void writeRelocations(ArrayRef<WasmRelocationEntry> Relocations);
uint32_t getRelocationIndexValue(const WasmRelocationEntry &RelEntry);
- uint32_t getFunctionType(const MCSymbolWasm& Symbol);
- uint32_t registerFunctionType(const MCSymbolWasm& Symbol);
+ uint32_t getFunctionType(const MCSymbolWasm &Symbol);
+ uint32_t registerFunctionType(const MCSymbolWasm &Symbol);
};
} // end anonymous namespace
@@ -303,55 +332,75 @@ WasmObjectWriter::~WasmObjectWriter() {}
// Write out a section header and a patchable section size field.
void WasmObjectWriter::startSection(SectionBookkeeping &Section,
- unsigned SectionId,
- const char *Name) {
- assert((Name != nullptr) == (SectionId == wasm::WASM_SEC_CUSTOM) &&
- "Only custom sections can have names");
-
- DEBUG(dbgs() << "startSection " << SectionId << ": " << Name << "\n");
- encodeULEB128(SectionId, getStream());
+ unsigned SectionId) {
+ LLVM_DEBUG(dbgs() << "startSection " << SectionId << "\n");
+ W.OS << char(SectionId);
- Section.SizeOffset = getStream().tell();
+ Section.SizeOffset = W.OS.tell();
// The section size. We don't know the size yet, so reserve enough space
// for any 32-bit value; we'll patch it later.
- encodeULEB128(UINT32_MAX, getStream());
+ encodeULEB128(UINT32_MAX, W.OS);
// The position where the section starts, for measuring its size.
- Section.ContentsOffset = getStream().tell();
+ Section.ContentsOffset = W.OS.tell();
+ Section.PayloadOffset = W.OS.tell();
+ Section.Index = SectionCount++;
+}
+
+void WasmObjectWriter::startCustomSection(SectionBookkeeping &Section,
+ StringRef Name) {
+ LLVM_DEBUG(dbgs() << "startCustomSection " << Name << "\n");
+ startSection(Section, wasm::WASM_SEC_CUSTOM);
+
+ // The position where the section header ends, for measuring its size.
+ Section.PayloadOffset = W.OS.tell();
// Custom sections in wasm also have a string identifier.
- if (SectionId == wasm::WASM_SEC_CUSTOM) {
- assert(Name);
- writeString(StringRef(Name));
- }
+ writeString(Name);
+
+ // The position where the custom section starts.
+ Section.ContentsOffset = W.OS.tell();
}
// Now that the section is complete and we know how big it is, patch up the
// section size field at the start of the section.
void WasmObjectWriter::endSection(SectionBookkeeping &Section) {
- uint64_t Size = getStream().tell() - Section.ContentsOffset;
+ uint64_t Size = W.OS.tell() - Section.PayloadOffset;
if (uint32_t(Size) != Size)
report_fatal_error("section size does not fit in a uint32_t");
- DEBUG(dbgs() << "endSection size=" << Size << "\n");
+ LLVM_DEBUG(dbgs() << "endSection size=" << Size << "\n");
// Write the final section size to the payload_len field, which follows
// the section id byte.
uint8_t Buffer[16];
unsigned SizeLen = encodeULEB128(Size, Buffer, 5);
assert(SizeLen == 5);
- getStream().pwrite((char *)Buffer, SizeLen, Section.SizeOffset);
+ static_cast<raw_pwrite_stream &>(W.OS).pwrite((char *)Buffer, SizeLen,
+ Section.SizeOffset);
}
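// Editor's illustration (standalone, not LLVM code): the size field is reserved
// as a 5-byte padded ULEB128 (encodeULEB128(UINT32_MAX, ...)) so that the real
// size, once known, can be patched in place at SizeOffset without shifting any
// later bytes.
#include <cstdint>
#include <cstdio>

static void encodePaddedULEB128(uint32_t Value, uint8_t Out[5]) {
  for (int I = 0; I < 5; ++I) {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (I != 4)
      Byte |= 0x80; // continuation bit keeps the encoding exactly 5 bytes wide
    Out[I] = Byte;
  }
}

int main() {
  uint8_t Placeholder[5], Patched[5];
  encodePaddedULEB128(UINT32_MAX, Placeholder); // ff ff ff ff 0f
  encodePaddedULEB128(624485, Patched);         // e5 8e a6 80 00
  for (int I = 0; I < 5; ++I)
    std::printf("%02x -> %02x\n", Placeholder[I], Patched[I]);
  return 0;
}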
// Emit the Wasm header.
void WasmObjectWriter::writeHeader(const MCAssembler &Asm) {
- writeBytes(StringRef(wasm::WasmMagic, sizeof(wasm::WasmMagic)));
- writeLE32(wasm::WasmVersion);
+ W.OS.write(wasm::WasmMagic, sizeof(wasm::WasmMagic));
+ W.write<uint32_t>(wasm::WasmVersion);
}
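// Editor's note (sketch): the two writes above produce the fixed 8-byte module
// header, i.e. the "\0asm" magic followed by the 32-bit little-endian version.
static const unsigned char ExpectedWasmHeader[] = {
    0x00, 0x61, 0x73, 0x6d, // "\0asm"
    0x01, 0x00, 0x00, 0x00, // wasm::WasmVersion == 1, little-endian
};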
void WasmObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
+ // Build a map of sections to the function that defines them, for use
+ // in recordRelocation.
+ for (const MCSymbol &S : Asm.symbols()) {
+ const auto &WS = static_cast<const MCSymbolWasm &>(S);
+ if (WS.isDefined() && WS.isFunction() && !WS.isVariable()) {
+ const auto &Sec = static_cast<const MCSectionWasm &>(S.getSection());
+ auto Pair = SectionFunctions.insert(std::make_pair(&Sec, &S));
+ if (!Pair.second)
+ report_fatal_error("section already has a defining function: " +
+ Sec.getSectionName());
+ }
+ }
}
void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
@@ -428,24 +477,54 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
// be negative and don't wrap.
FixedValue = 0;
- if (SymA)
- SymA->setUsedInReloc();
-
+ unsigned Type = getRelocType(Target, Fixup);
assert(!IsPCRel);
assert(SymA);
- unsigned Type = getRelocType(Target, Fixup);
+ // Absolute offset within a section or a function.
+ // Currently only supported for metadata sections.
+ // See: test/MC/WebAssembly/blockaddress.ll
+ if (Type == wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32 ||
+ Type == wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32) {
+ if (!FixupSection.getKind().isMetadata())
+ report_fatal_error("relocations for function or section offsets are "
+ "only supported in metadata sections");
+
+ const MCSymbol *SectionSymbol = nullptr;
+ const MCSection &SecA = SymA->getSection();
+ if (SecA.getKind().isText())
+ SectionSymbol = SectionFunctions.find(&SecA)->second;
+ else
+ SectionSymbol = SecA.getBeginSymbol();
+ if (!SectionSymbol)
+ report_fatal_error("section symbol is required for relocation");
+
+ C += Layout.getSymbolOffset(*SymA);
+ SymA = cast<MCSymbolWasm>(SectionSymbol);
+ }
+
+ // Relocations other than R_WEBASSEMBLY_TYPE_INDEX_LEB are required to be
+ // against a named symbol.
+ if (Type != wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB) {
+ if (SymA->getName().empty())
+ report_fatal_error("relocations against un-named temporaries are not yet "
+ "supported by wasm");
+
+ SymA->setUsedInReloc();
+ }
WasmRelocationEntry Rec(FixupOffset, SymA, C, Type, &FixupSection);
- DEBUG(dbgs() << "WasmReloc: " << Rec << "\n");
+ LLVM_DEBUG(dbgs() << "WasmReloc: " << Rec << "\n");
- if (FixupSection.isWasmData())
+ if (FixupSection.isWasmData()) {
DataRelocations.push_back(Rec);
- else if (FixupSection.getKind().isText())
+ } else if (FixupSection.getKind().isText()) {
CodeRelocations.push_back(Rec);
- else if (!FixupSection.getKind().isMetadata())
- // TODO(sbc): Add support for debug sections.
+ } else if (FixupSection.getKind().isMetadata()) {
+ CustomSectionsRelocations[&FixupSection].push_back(Rec);
+ } else {
llvm_unreachable("unexpected section type");
+ }
}
// Write X as an (unsigned) LEB value at offset Offset in Stream, padded
@@ -485,34 +564,59 @@ static const MCSymbolWasm* ResolveSymbol(const MCSymbolWasm& Symbol) {
}
// Compute a value to write into the code at the location covered
-// by RelEntry. This value isn't used by the static linker, since
-// we have addends; it just serves to make the code more readable
-// and to make standalone wasm modules directly usable.
+// by RelEntry. This value isn't used by the static linker; it just serves
+// to make the object format more readable and more likely to be directly
+// usable.
uint32_t
WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) {
- const MCSymbolWasm *Sym = ResolveSymbol(*RelEntry.Symbol);
-
- // For undefined symbols, use a hopefully invalid value.
- if (!Sym->isDefined(/*SetUsed=*/false))
- return UINT32_MAX;
-
- uint32_t GlobalIndex = SymbolIndices[Sym];
- const WasmGlobal& Global = Globals[GlobalIndex - NumGlobalImports];
- uint64_t Address = Global.InitialValue + RelEntry.Addend;
-
- // Ignore overflow. LLVM allows address arithmetic to silently wrap.
- uint32_t Value = Address;
-
- return Value;
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: {
+ // Provisional value is table address of the resolved symbol itself
+ const MCSymbolWasm *Sym = ResolveSymbol(*RelEntry.Symbol);
+ assert(Sym->isFunction());
+ return TableIndices[Sym];
+ }
+ case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
+ // Provisional value is same as the index
+ return getRelocationIndexValue(RelEntry);
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
+ // Provisional value is function/global Wasm index
+ if (!WasmIndices.count(RelEntry.Symbol))
+ report_fatal_error("symbol not found in wasm index space: " +
+ RelEntry.Symbol->getName());
+ return WasmIndices[RelEntry.Symbol];
+ case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32:
+ case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32: {
+ const auto &Section =
+ static_cast<const MCSectionWasm &>(RelEntry.Symbol->getSection());
+ return Section.getSectionOffset() + RelEntry.Addend;
+ }
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: {
+ // Provisional value is address of the global
+ const MCSymbolWasm *Sym = ResolveSymbol(*RelEntry.Symbol);
+ // For undefined symbols, use zero
+ if (!Sym->isDefined())
+ return 0;
+ const wasm::WasmDataReference &Ref = DataLocations[Sym];
+ const WasmDataSegment &Segment = DataSegments[Ref.Segment];
+ // Ignore overflow. LLVM allows address arithmetic to silently wrap.
+ return Segment.Offset + Ref.Offset + RelEntry.Addend;
+ }
+ default:
+ llvm_unreachable("invalid relocation type");
+ }
}
static void addData(SmallVectorImpl<char> &DataBytes,
MCSectionWasm &DataSection) {
- DEBUG(errs() << "addData: " << DataSection.getSectionName() << "\n");
+ LLVM_DEBUG(errs() << "addData: " << DataSection.getSectionName() << "\n");
DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment()));
- size_t LastFragmentSize = 0;
for (const MCFragment &Frag : DataSection) {
if (Frag.hasInstructions())
report_fatal_error("only data supported in data sections");
@@ -528,121 +632,70 @@ static void addData(SmallVectorImpl<char> &DataBytes,
Align->getMaxBytesToEmit());
DataBytes.resize(Size, Value);
} else if (auto *Fill = dyn_cast<MCFillFragment>(&Frag)) {
- int64_t Size;
- if (!Fill->getSize().evaluateAsAbsolute(Size))
+ int64_t NumValues;
+ if (!Fill->getNumValues().evaluateAsAbsolute(NumValues))
llvm_unreachable("The fill should be an assembler constant");
- DataBytes.insert(DataBytes.end(), Size, Fill->getValue());
+ DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues,
+ Fill->getValue());
} else {
const auto &DataFrag = cast<MCDataFragment>(Frag);
const SmallVectorImpl<char> &Contents = DataFrag.getContents();
DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
- LastFragmentSize = Contents.size();
}
}
- // Don't allow empty segments, or segments that end with zero-sized
- // fragment, otherwise the linker cannot map symbols to a unique
- // data segment. This can be triggered by zero-sized structs
- // See: test/MC/WebAssembly/bss.ll
- if (LastFragmentSize == 0)
- DataBytes.resize(DataBytes.size() + 1);
- DEBUG(dbgs() << "addData -> " << DataBytes.size() << "\n");
+ LLVM_DEBUG(dbgs() << "addData -> " << DataBytes.size() << "\n");
}
-uint32_t WasmObjectWriter::getRelocationIndexValue(
- const WasmRelocationEntry &RelEntry) {
- switch (RelEntry.Type) {
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
- if (!IndirectSymbolIndices.count(RelEntry.Symbol))
- report_fatal_error("symbol not found in table index space: " +
- RelEntry.Symbol->getName());
- return IndirectSymbolIndices[RelEntry.Symbol];
- case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
- case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
- if (!SymbolIndices.count(RelEntry.Symbol))
- report_fatal_error("symbol not found in function/global index space: " +
- RelEntry.Symbol->getName());
- return SymbolIndices[RelEntry.Symbol];
- case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
+uint32_t
+WasmObjectWriter::getRelocationIndexValue(const WasmRelocationEntry &RelEntry) {
+ if (RelEntry.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB) {
if (!TypeIndices.count(RelEntry.Symbol))
report_fatal_error("symbol not found in type index space: " +
RelEntry.Symbol->getName());
return TypeIndices[RelEntry.Symbol];
- default:
- llvm_unreachable("invalid relocation type");
}
+
+ return RelEntry.Symbol->getIndex();
}
// Apply the portions of the relocation records that we can handle ourselves
// directly.
void WasmObjectWriter::applyRelocations(
ArrayRef<WasmRelocationEntry> Relocations, uint64_t ContentsOffset) {
- raw_pwrite_stream &Stream = getStream();
+ auto &Stream = static_cast<raw_pwrite_stream &>(W.OS);
for (const WasmRelocationEntry &RelEntry : Relocations) {
uint64_t Offset = ContentsOffset +
RelEntry.FixupSection->getSectionOffset() +
RelEntry.Offset;
- DEBUG(dbgs() << "applyRelocation: " << RelEntry << "\n");
+ LLVM_DEBUG(dbgs() << "applyRelocation: " << RelEntry << "\n");
+ uint32_t Value = getProvisionalValue(RelEntry);
+
switch (RelEntry.Type) {
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
- case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: {
- uint32_t Index = getRelocationIndexValue(RelEntry);
- WritePatchableSLEB(Stream, Index, Offset);
- break;
- }
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: {
- uint32_t Index = getRelocationIndexValue(RelEntry);
- WriteI32(Stream, Index, Offset);
- break;
- }
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: {
- uint32_t Value = getProvisionalValue(RelEntry);
- WritePatchableSLEB(Stream, Value, Offset);
- break;
- }
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB: {
- uint32_t Value = getProvisionalValue(RelEntry);
+ case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
WritePatchableLEB(Stream, Value, Offset);
break;
- }
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32: {
- uint32_t Value = getProvisionalValue(RelEntry);
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
+ case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32:
+ case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32:
WriteI32(Stream, Value, Offset);
break;
- }
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
+ WritePatchableSLEB(Stream, Value, Offset);
+ break;
default:
llvm_unreachable("invalid relocation type");
}
}
}
-// Write out the portions of the relocation records that the linker will
-// need to handle.
-void WasmObjectWriter::writeRelocations(
- ArrayRef<WasmRelocationEntry> Relocations) {
- raw_pwrite_stream &Stream = getStream();
- for (const WasmRelocationEntry& RelEntry : Relocations) {
-
- uint64_t Offset = RelEntry.Offset +
- RelEntry.FixupSection->getSectionOffset();
- uint32_t Index = getRelocationIndexValue(RelEntry);
-
- encodeULEB128(RelEntry.Type, Stream);
- encodeULEB128(Offset, Stream);
- encodeULEB128(Index, Stream);
- if (RelEntry.hasAddend())
- encodeSLEB128(RelEntry.Addend, Stream);
- }
-}
-
void WasmObjectWriter::writeTypeSection(
ArrayRef<WasmFunctionType> FunctionTypes) {
if (FunctionTypes.empty())
@@ -651,14 +704,14 @@ void WasmObjectWriter::writeTypeSection(
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_TYPE);
- encodeULEB128(FunctionTypes.size(), getStream());
+ encodeULEB128(FunctionTypes.size(), W.OS);
for (const WasmFunctionType &FuncTy : FunctionTypes) {
- encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream());
- encodeULEB128(FuncTy.Params.size(), getStream());
+ W.OS << char(wasm::WASM_TYPE_FUNC);
+ encodeULEB128(FuncTy.Params.size(), W.OS);
for (wasm::ValType Ty : FuncTy.Params)
writeValueType(Ty);
- encodeULEB128(FuncTy.Returns.size(), getStream());
+ encodeULEB128(FuncTy.Returns.size(), W.OS);
for (wasm::ValType Ty : FuncTy.Returns)
writeValueType(Ty);
}
@@ -666,7 +719,7 @@ void WasmObjectWriter::writeTypeSection(
endSection(Section);
}
-void WasmObjectWriter::writeImportSection(ArrayRef<WasmImport> Imports,
+void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports,
uint32_t DataSize,
uint32_t NumElements) {
if (Imports.empty())
@@ -677,29 +730,28 @@ void WasmObjectWriter::writeImportSection(ArrayRef<WasmImport> Imports,
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_IMPORT);
- encodeULEB128(Imports.size(), getStream());
- for (const WasmImport &Import : Imports) {
- writeString(Import.ModuleName);
- writeString(Import.FieldName);
-
- encodeULEB128(Import.Kind, getStream());
+ encodeULEB128(Imports.size(), W.OS);
+ for (const wasm::WasmImport &Import : Imports) {
+ writeString(Import.Module);
+ writeString(Import.Field);
+ W.OS << char(Import.Kind);
switch (Import.Kind) {
case wasm::WASM_EXTERNAL_FUNCTION:
- encodeULEB128(Import.Type, getStream());
+ encodeULEB128(Import.SigIndex, W.OS);
break;
case wasm::WASM_EXTERNAL_GLOBAL:
- encodeSLEB128(int32_t(Import.Type), getStream());
- encodeULEB128(int32_t(Import.IsMutable), getStream());
+ W.OS << char(Import.Global.Type);
+ W.OS << char(Import.Global.Mutable ? 1 : 0);
break;
case wasm::WASM_EXTERNAL_MEMORY:
- encodeULEB128(0, getStream()); // flags
- encodeULEB128(NumPages, getStream()); // initial
+ encodeULEB128(0, W.OS); // flags
+ encodeULEB128(NumPages, W.OS); // initial
break;
case wasm::WASM_EXTERNAL_TABLE:
- encodeSLEB128(int32_t(Import.Type), getStream());
- encodeULEB128(0, getStream()); // flags
- encodeULEB128(NumElements, getStream()); // initial
+ W.OS << char(Import.Table.ElemType);
+ encodeULEB128(0, W.OS); // flags
+ encodeULEB128(NumElements, W.OS); // initial
break;
default:
llvm_unreachable("unsupported import kind");
@@ -716,9 +768,9 @@ void WasmObjectWriter::writeFunctionSection(ArrayRef<WasmFunction> Functions) {
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_FUNCTION);
- encodeULEB128(Functions.size(), getStream());
+ encodeULEB128(Functions.size(), W.OS);
for (const WasmFunction &Func : Functions)
- encodeULEB128(Func.Type, getStream());
+ encodeULEB128(Func.Type, W.OS);
endSection(Section);
}
@@ -730,38 +782,31 @@ void WasmObjectWriter::writeGlobalSection() {
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_GLOBAL);
- encodeULEB128(Globals.size(), getStream());
+ encodeULEB128(Globals.size(), W.OS);
for (const WasmGlobal &Global : Globals) {
- writeValueType(Global.Type);
- write8(Global.IsMutable);
+ writeValueType(static_cast<wasm::ValType>(Global.Type.Type));
+ W.OS << char(Global.Type.Mutable);
- if (Global.HasImport) {
- assert(Global.InitialValue == 0);
- write8(wasm::WASM_OPCODE_GET_GLOBAL);
- encodeULEB128(Global.ImportIndex, getStream());
- } else {
- assert(Global.ImportIndex == 0);
- write8(wasm::WASM_OPCODE_I32_CONST);
- encodeSLEB128(Global.InitialValue, getStream()); // offset
- }
- write8(wasm::WASM_OPCODE_END);
+ W.OS << char(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(Global.InitialValue, W.OS);
+ W.OS << char(wasm::WASM_OPCODE_END);
}
endSection(Section);
}
-void WasmObjectWriter::writeExportSection(ArrayRef<WasmExport> Exports) {
+void WasmObjectWriter::writeExportSection(ArrayRef<wasm::WasmExport> Exports) {
if (Exports.empty())
return;
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_EXPORT);
- encodeULEB128(Exports.size(), getStream());
- for (const WasmExport &Export : Exports) {
- writeString(Export.FieldName);
- encodeSLEB128(Export.Kind, getStream());
- encodeULEB128(Export.Index, getStream());
+ encodeULEB128(Exports.size(), W.OS);
+ for (const wasm::WasmExport &Export : Exports) {
+ writeString(Export.Name);
+ W.OS << char(Export.Kind);
+ encodeULEB128(Export.Index, W.OS);
}
endSection(Section);
@@ -774,17 +819,17 @@ void WasmObjectWriter::writeElemSection(ArrayRef<uint32_t> TableElems) {
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_ELEM);
- encodeULEB128(1, getStream()); // number of "segments"
- encodeULEB128(0, getStream()); // the table index
+ encodeULEB128(1, W.OS); // number of "segments"
+ encodeULEB128(0, W.OS); // the table index
// init expr for starting offset
- write8(wasm::WASM_OPCODE_I32_CONST);
- encodeSLEB128(0, getStream());
- write8(wasm::WASM_OPCODE_END);
+ W.OS << char(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(kInitialTableOffset, W.OS);
+ W.OS << char(wasm::WASM_OPCODE_END);
- encodeULEB128(TableElems.size(), getStream());
+ encodeULEB128(TableElems.size(), W.OS);
for (uint32_t Elem : TableElems)
- encodeULEB128(Elem, getStream());
+ encodeULEB128(Elem, W.OS);
endSection(Section);
}
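// Editor's illustration (hypothetical values, not emitted by this patch): with
// kInitialTableOffset = 1 the segment written above starts populating the table
// at slot 1, so slot 0 stays empty and an indirect call through a null function
// pointer traps. For two functions with wasm indices 5 and 7 the section body is:
static const unsigned char ExampleElemSectionBody[] = {
    0x01,             // one element segment
    0x00,             // table index 0
    0x41, 0x01, 0x0b, // init expr: i32.const 1, end
    0x02,             // two table elements
    0x05, 0x07,       // function indices, landing in table slots 1 and 2
};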
@@ -797,8 +842,9 @@ void WasmObjectWriter::writeCodeSection(const MCAssembler &Asm,
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_CODE);
+ CodeSectionIndex = Section.Index;
- encodeULEB128(Functions.size(), getStream());
+ encodeULEB128(Functions.size(), W.OS);
for (const WasmFunction &Func : Functions) {
auto &FuncSection = static_cast<MCSectionWasm &>(Func.Sym->getSection());
@@ -807,9 +853,9 @@ void WasmObjectWriter::writeCodeSection(const MCAssembler &Asm,
if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout))
report_fatal_error(".size expression must be evaluatable");
- encodeULEB128(Size, getStream());
- FuncSection.setSectionOffset(getStream().tell() - Section.ContentsOffset);
- Asm.writeSectionData(&FuncSection, Layout);
+ encodeULEB128(Size, W.OS);
+ FuncSection.setSectionOffset(W.OS.tell() - Section.ContentsOffset);
+ Asm.writeSectionData(W.OS, &FuncSection, Layout);
}
// Apply fixups.
@@ -818,23 +864,24 @@ void WasmObjectWriter::writeCodeSection(const MCAssembler &Asm,
endSection(Section);
}
-void WasmObjectWriter::writeDataSection(ArrayRef<WasmDataSegment> Segments) {
- if (Segments.empty())
+void WasmObjectWriter::writeDataSection() {
+ if (DataSegments.empty())
return;
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_DATA);
-
- encodeULEB128(Segments.size(), getStream()); // count
-
- for (const WasmDataSegment & Segment : Segments) {
- encodeULEB128(0, getStream()); // memory index
- write8(wasm::WASM_OPCODE_I32_CONST);
- encodeSLEB128(Segment.Offset, getStream()); // offset
- write8(wasm::WASM_OPCODE_END);
- encodeULEB128(Segment.Data.size(), getStream()); // size
- Segment.Section->setSectionOffset(getStream().tell() - Section.ContentsOffset);
- writeBytes(Segment.Data); // data
+ DataSectionIndex = Section.Index;
+
+ encodeULEB128(DataSegments.size(), W.OS); // count
+
+ for (const WasmDataSegment &Segment : DataSegments) {
+ encodeULEB128(0, W.OS); // memory index
+ W.OS << char(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(Segment.Offset, W.OS); // offset
+ W.OS << char(wasm::WASM_OPCODE_END);
+ encodeULEB128(Segment.Data.size(), W.OS); // size
+ Segment.Section->setSectionOffset(W.OS.tell() - Section.ContentsOffset);
+ W.OS << Segment.Data; // data
}
// Apply fixups.
@@ -843,115 +890,117 @@ void WasmObjectWriter::writeDataSection(ArrayRef<WasmDataSegment> Segments) {
endSection(Section);
}
-void WasmObjectWriter::writeNameSection(
- ArrayRef<WasmFunction> Functions,
- ArrayRef<WasmImport> Imports,
- unsigned NumFuncImports) {
- uint32_t TotalFunctions = NumFuncImports + Functions.size();
- if (TotalFunctions == 0)
- return;
-
- SectionBookkeeping Section;
- startSection(Section, wasm::WASM_SEC_CUSTOM, "name");
- SectionBookkeeping SubSection;
- startSection(SubSection, wasm::WASM_NAMES_FUNCTION);
-
- encodeULEB128(TotalFunctions, getStream());
- uint32_t Index = 0;
- for (const WasmImport &Import : Imports) {
- if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
- encodeULEB128(Index, getStream());
- writeString(Import.FieldName);
- ++Index;
- }
- }
- for (const WasmFunction &Func : Functions) {
- encodeULEB128(Index, getStream());
- writeString(Func.Sym->getName());
- ++Index;
- }
-
- endSection(SubSection);
- endSection(Section);
-}
-
-void WasmObjectWriter::writeCodeRelocSection() {
+void WasmObjectWriter::writeRelocSection(
+ uint32_t SectionIndex, StringRef Name,
+ ArrayRef<WasmRelocationEntry> Relocations) {
// See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
// for descriptions of the reloc sections.
- if (CodeRelocations.empty())
+ if (Relocations.empty())
return;
SectionBookkeeping Section;
- startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE");
+ startCustomSection(Section, std::string("reloc.") + Name.str());
- encodeULEB128(wasm::WASM_SEC_CODE, getStream());
- encodeULEB128(CodeRelocations.size(), getStream());
+ encodeULEB128(SectionIndex, W.OS);
+ encodeULEB128(Relocations.size(), W.OS);
+ for (const WasmRelocationEntry& RelEntry : Relocations) {
+ uint64_t Offset = RelEntry.Offset +
+ RelEntry.FixupSection->getSectionOffset();
+ uint32_t Index = getRelocationIndexValue(RelEntry);
- writeRelocations(CodeRelocations);
+ W.OS << char(RelEntry.Type);
+ encodeULEB128(Offset, W.OS);
+ encodeULEB128(Index, W.OS);
+ if (RelEntry.hasAddend())
+ encodeSLEB128(RelEntry.Addend, W.OS);
+ }
endSection(Section);
}
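// Editor's sketch (not the patch's code) of the per-entry layout produced by the
// loop above; encodeULEB128/encodeSLEB128 are the LLVM helpers already used here.
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

static void writeOneRelocEntry(llvm::raw_ostream &OS, uint8_t Type,
                               uint64_t Offset, uint32_t Index, bool HasAddend,
                               int64_t Addend) {
  OS << char(Type);                  // relocation type, a single byte
  llvm::encodeULEB128(Offset, OS);   // offset within the target section
  llvm::encodeULEB128(Index, OS);    // symbol-table index (type index for TYPE_INDEX_LEB)
  if (HasAddend)
    llvm::encodeSLEB128(Addend, OS); // addend, only for types where hasAddend() is true
}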
-void WasmObjectWriter::writeDataRelocSection() {
- // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
- // for descriptions of the reloc sections.
-
- if (DataRelocations.empty())
- return;
-
- SectionBookkeeping Section;
- startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA");
-
- encodeULEB128(wasm::WASM_SEC_DATA, getStream());
- encodeULEB128(DataRelocations.size(), getStream());
-
- writeRelocations(DataRelocations);
-
- endSection(Section);
+void WasmObjectWriter::writeCustomRelocSections() {
+ for (const auto &Sec : CustomSections) {
+ auto &Relocations = CustomSectionsRelocations[Sec.Section];
+ writeRelocSection(Sec.OutputIndex, Sec.Name, Relocations);
+ }
}
void WasmObjectWriter::writeLinkingMetaDataSection(
- ArrayRef<WasmDataSegment> Segments, uint32_t DataSize,
- const SmallVector<std::pair<StringRef, uint32_t>, 4> &SymbolFlags,
- const SmallVector<std::pair<uint16_t, uint32_t>, 2> &InitFuncs) {
+ ArrayRef<wasm::WasmSymbolInfo> SymbolInfos,
+ ArrayRef<std::pair<uint16_t, uint32_t>> InitFuncs,
+ const std::map<StringRef, std::vector<WasmComdatEntry>> &Comdats) {
SectionBookkeeping Section;
- startSection(Section, wasm::WASM_SEC_CUSTOM, "linking");
- SectionBookkeeping SubSection;
+ startCustomSection(Section, "linking");
+ encodeULEB128(wasm::WasmMetadataVersion, W.OS);
- if (SymbolFlags.size() != 0) {
- startSection(SubSection, wasm::WASM_SYMBOL_INFO);
- encodeULEB128(SymbolFlags.size(), getStream());
- for (auto Pair: SymbolFlags) {
- writeString(Pair.first);
- encodeULEB128(Pair.second, getStream());
+ SectionBookkeeping SubSection;
+ if (SymbolInfos.size() != 0) {
+ startSection(SubSection, wasm::WASM_SYMBOL_TABLE);
+ encodeULEB128(SymbolInfos.size(), W.OS);
+ for (const wasm::WasmSymbolInfo &Sym : SymbolInfos) {
+ encodeULEB128(Sym.Kind, W.OS);
+ encodeULEB128(Sym.Flags, W.OS);
+ switch (Sym.Kind) {
+ case wasm::WASM_SYMBOL_TYPE_FUNCTION:
+ case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+ encodeULEB128(Sym.ElementIndex, W.OS);
+ if ((Sym.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0)
+ writeString(Sym.Name);
+ break;
+ case wasm::WASM_SYMBOL_TYPE_DATA:
+ writeString(Sym.Name);
+ if ((Sym.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) {
+ encodeULEB128(Sym.DataRef.Segment, W.OS);
+ encodeULEB128(Sym.DataRef.Offset, W.OS);
+ encodeULEB128(Sym.DataRef.Size, W.OS);
+ }
+ break;
+ case wasm::WASM_SYMBOL_TYPE_SECTION: {
+ const uint32_t SectionIndex =
+ CustomSections[Sym.ElementIndex].OutputIndex;
+ encodeULEB128(SectionIndex, W.OS);
+ break;
+ }
+ default:
+ llvm_unreachable("unexpected kind");
+ }
}
endSection(SubSection);
}
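// Editor's example (hypothetical, assuming the linking constants in
// BinaryFormat/Wasm.h: function kind == 0, hidden flag == 0x4): one entry as
// written by the loop above for a defined, hidden function "f" with wasm index 3.
static const unsigned char ExampleFunctionSymbolEntry[] = {
    0x00,      // Sym.Kind  = WASM_SYMBOL_TYPE_FUNCTION
    0x04,      // Sym.Flags = WASM_SYMBOL_VISIBILITY_HIDDEN (defined, so no UNDEFINED bit)
    0x03,      // Sym.ElementIndex: wasm function index
    0x01, 'f', // name (ULEB length + bytes), written only because the symbol is defined
};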
- if (DataSize > 0) {
- startSection(SubSection, wasm::WASM_DATA_SIZE);
- encodeULEB128(DataSize, getStream());
- endSection(SubSection);
- }
-
- if (Segments.size()) {
+ if (DataSegments.size()) {
startSection(SubSection, wasm::WASM_SEGMENT_INFO);
- encodeULEB128(Segments.size(), getStream());
- for (const WasmDataSegment &Segment : Segments) {
+ encodeULEB128(DataSegments.size(), W.OS);
+ for (const WasmDataSegment &Segment : DataSegments) {
writeString(Segment.Name);
- encodeULEB128(Segment.Alignment, getStream());
- encodeULEB128(Segment.Flags, getStream());
+ encodeULEB128(Segment.Alignment, W.OS);
+ encodeULEB128(Segment.Flags, W.OS);
}
endSection(SubSection);
}
if (!InitFuncs.empty()) {
startSection(SubSection, wasm::WASM_INIT_FUNCS);
- encodeULEB128(InitFuncs.size(), getStream());
+ encodeULEB128(InitFuncs.size(), W.OS);
for (auto &StartFunc : InitFuncs) {
- encodeULEB128(StartFunc.first, getStream()); // priority
- encodeULEB128(StartFunc.second, getStream()); // function index
+ encodeULEB128(StartFunc.first, W.OS); // priority
+ encodeULEB128(StartFunc.second, W.OS); // function index
+ }
+ endSection(SubSection);
+ }
+
+ if (Comdats.size()) {
+ startSection(SubSection, wasm::WASM_COMDAT_INFO);
+ encodeULEB128(Comdats.size(), W.OS);
+ for (const auto &C : Comdats) {
+ writeString(C.first);
+ encodeULEB128(0, W.OS); // flags for future use
+ encodeULEB128(C.second.size(), W.OS);
+ for (const WasmComdatEntry &Entry : C.second) {
+ encodeULEB128(Entry.Kind, W.OS);
+ encodeULEB128(Entry.Index, W.OS);
+ }
}
endSection(SubSection);
}
@@ -959,6 +1008,27 @@ void WasmObjectWriter::writeLinkingMetaDataSection(
endSection(Section);
}
+void WasmObjectWriter::writeCustomSections(const MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ for (auto &CustomSection : CustomSections) {
+ SectionBookkeeping Section;
+ auto *Sec = CustomSection.Section;
+ startCustomSection(Section, CustomSection.Name);
+
+ Sec->setSectionOffset(W.OS.tell() - Section.ContentsOffset);
+ Asm.writeSectionData(W.OS, Sec, Layout);
+
+ CustomSection.OutputContentsOffset = Section.ContentsOffset;
+ CustomSection.OutputIndex = Section.Index;
+
+ endSection(Section);
+
+ // Apply fixups.
+ auto &Relocations = CustomSectionsRelocations[CustomSection.Section];
+ applyRelocations(Relocations, CustomSection.OutputContentsOffset);
+ }
+}
+
uint32_t WasmObjectWriter::getFunctionType(const MCSymbolWasm& Symbol) {
assert(Symbol.isFunction());
assert(TypeIndices.count(&Symbol));
@@ -979,94 +1049,56 @@ uint32_t WasmObjectWriter::registerFunctionType(const MCSymbolWasm& Symbol) {
FunctionTypes.push_back(F);
TypeIndices[&Symbol] = Pair.first->second;
- DEBUG(dbgs() << "registerFunctionType: " << Symbol << " new:" << Pair.second << "\n");
- DEBUG(dbgs() << " -> type index: " << Pair.first->second << "\n");
+ LLVM_DEBUG(dbgs() << "registerFunctionType: " << Symbol
+ << " new:" << Pair.second << "\n");
+ LLVM_DEBUG(dbgs() << " -> type index: " << Pair.first->second << "\n");
return Pair.first->second;
}
-void WasmObjectWriter::writeObject(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
- DEBUG(dbgs() << "WasmObjectWriter::writeObject\n");
+static bool isInSymtab(const MCSymbolWasm &Sym) {
+ if (Sym.isUsedInReloc())
+ return true;
+
+ if (Sym.isComdat() && !Sym.isDefined())
+ return false;
+
+ if (Sym.isTemporary() && Sym.getName().empty())
+ return false;
+
+ if (Sym.isTemporary() && Sym.isData() && !Sym.getSize())
+ return false;
+
+ if (Sym.isSection())
+ return false;
+
+ return true;
+}
+
+uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ uint64_t StartOffset = W.OS.tell();
+
+ LLVM_DEBUG(dbgs() << "WasmObjectWriter::writeObject\n");
MCContext &Ctx = Asm.getContext();
- wasm::ValType PtrType = is64Bit() ? wasm::ValType::I64 : wasm::ValType::I32;
// Collect information from the available symbols.
SmallVector<WasmFunction, 4> Functions;
SmallVector<uint32_t, 4> TableElems;
- SmallVector<WasmImport, 4> Imports;
- SmallVector<WasmExport, 4> Exports;
- SmallVector<std::pair<StringRef, uint32_t>, 4> SymbolFlags;
+ SmallVector<wasm::WasmImport, 4> Imports;
+ SmallVector<wasm::WasmExport, 4> Exports;
+ SmallVector<wasm::WasmSymbolInfo, 4> SymbolInfos;
SmallVector<std::pair<uint16_t, uint32_t>, 2> InitFuncs;
- unsigned NumFuncImports = 0;
- SmallVector<WasmDataSegment, 4> DataSegments;
+ std::map<StringRef, std::vector<WasmComdatEntry>> Comdats;
uint32_t DataSize = 0;
- // In the special .global_variables section, we've encoded global
- // variables used by the function. Translate them into the Globals
- // list.
- MCSectionWasm *GlobalVars =
- Ctx.getWasmSection(".global_variables", SectionKind::getMetadata());
- if (!GlobalVars->getFragmentList().empty()) {
- if (GlobalVars->getFragmentList().size() != 1)
- report_fatal_error("only one .global_variables fragment supported");
- const MCFragment &Frag = *GlobalVars->begin();
- if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
- report_fatal_error("only data supported in .global_variables");
- const auto &DataFrag = cast<MCDataFragment>(Frag);
- if (!DataFrag.getFixups().empty())
- report_fatal_error("fixups not supported in .global_variables");
- const SmallVectorImpl<char> &Contents = DataFrag.getContents();
- for (const uint8_t *p = (const uint8_t *)Contents.data(),
- *end = (const uint8_t *)Contents.data() + Contents.size();
- p != end; ) {
- WasmGlobal G;
- if (end - p < 3)
- report_fatal_error("truncated global variable encoding");
- G.Type = wasm::ValType(int8_t(*p++));
- G.IsMutable = bool(*p++);
- G.HasImport = bool(*p++);
- if (G.HasImport) {
- G.InitialValue = 0;
-
- WasmImport Import;
- Import.ModuleName = (const char *)p;
- const uint8_t *nul = (const uint8_t *)memchr(p, '\0', end - p);
- if (!nul)
- report_fatal_error("global module name must be nul-terminated");
- p = nul + 1;
- nul = (const uint8_t *)memchr(p, '\0', end - p);
- if (!nul)
- report_fatal_error("global base name must be nul-terminated");
- Import.FieldName = (const char *)p;
- p = nul + 1;
-
- Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
- Import.Type = int32_t(G.Type);
-
- G.ImportIndex = NumGlobalImports;
- ++NumGlobalImports;
-
- Imports.push_back(Import);
- } else {
- unsigned n;
- G.InitialValue = decodeSLEB128(p, &n);
- G.ImportIndex = 0;
- if ((ptrdiff_t)n > end - p)
- report_fatal_error("global initial value must be valid SLEB128");
- p += n;
- }
- Globals.push_back(G);
- }
- }
-
// For now, always emit the memory import, since loads and stores are not
// valid without it. In the future, we could perhaps be more clever and omit
// it if there are no loads or stores.
MCSymbolWasm *MemorySym =
cast<MCSymbolWasm>(Ctx.getOrCreateSymbol("__linear_memory"));
- WasmImport MemImport;
- MemImport.ModuleName = MemorySym->getModuleName();
- MemImport.FieldName = MemorySym->getName();
+ wasm::WasmImport MemImport;
+ MemImport.Module = MemorySym->getModuleName();
+ MemImport.Field = MemorySym->getName();
MemImport.Kind = wasm::WASM_EXTERNAL_MEMORY;
Imports.push_back(MemImport);
@@ -1075,20 +1107,21 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
// it if there are no indirect calls.
MCSymbolWasm *TableSym =
cast<MCSymbolWasm>(Ctx.getOrCreateSymbol("__indirect_function_table"));
- WasmImport TableImport;
- TableImport.ModuleName = TableSym->getModuleName();
- TableImport.FieldName = TableSym->getName();
+ wasm::WasmImport TableImport;
+ TableImport.Module = TableSym->getModuleName();
+ TableImport.Field = TableSym->getName();
TableImport.Kind = wasm::WASM_EXTERNAL_TABLE;
- TableImport.Type = wasm::WASM_TYPE_ANYFUNC;
+ TableImport.Table.ElemType = wasm::WASM_TYPE_ANYFUNC;
Imports.push_back(TableImport);
- // Populate FunctionTypeIndices and Imports.
+ // Populate FunctionTypeIndices, and Imports and WasmIndices for undefined
+ // symbols. This must be done before populating WasmIndices for defined
+ // symbols.
for (const MCSymbol &S : Asm.symbols()) {
const auto &WS = static_cast<const MCSymbolWasm &>(S);
// Register types for all functions, including those with private linkage
- // (making them
- // because wasm always needs a type signature.
+ // (because wasm always needs a type signature).
if (WS.isFunction())
registerFunctionType(WS);
@@ -1096,56 +1129,84 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
continue;
// If the symbol is not defined in this translation unit, import it.
- if (!WS.isDefined(/*SetUsed=*/false) || WS.isVariable()) {
- WasmImport Import;
- Import.ModuleName = WS.getModuleName();
- Import.FieldName = WS.getName();
-
+ if (!WS.isDefined() && !WS.isComdat()) {
if (WS.isFunction()) {
+ wasm::WasmImport Import;
+ Import.Module = WS.getModuleName();
+ Import.Field = WS.getName();
Import.Kind = wasm::WASM_EXTERNAL_FUNCTION;
- Import.Type = getFunctionType(WS);
- SymbolIndices[&WS] = NumFuncImports;
- ++NumFuncImports;
- } else {
+ Import.SigIndex = getFunctionType(WS);
+ Imports.push_back(Import);
+ WasmIndices[&WS] = NumFunctionImports++;
+ } else if (WS.isGlobal()) {
+ if (WS.isWeak())
+ report_fatal_error("undefined global symbol cannot be weak");
+
+ wasm::WasmImport Import;
+ Import.Module = WS.getModuleName();
+ Import.Field = WS.getName();
Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
- Import.Type = int32_t(PtrType);
- Import.IsMutable = false;
- SymbolIndices[&WS] = NumGlobalImports;
-
- // If this global is the stack pointer, make it mutable.
- if (WS.getName() == "__stack_pointer")
- Import.IsMutable = true;
-
- ++NumGlobalImports;
+ Import.Global = WS.getGlobalType();
+ Imports.push_back(Import);
+ WasmIndices[&WS] = NumGlobalImports++;
}
-
- Imports.push_back(Import);
}
}
+ // Populate DataSegments and CustomSections, which must be done before
+ // populating DataLocations.
for (MCSection &Sec : Asm) {
auto &Section = static_cast<MCSectionWasm &>(Sec);
- if (!Section.isWasmData())
- continue;
+ StringRef SectionName = Section.getSectionName();
// .init_array sections are handled specially elsewhere.
- if (cast<MCSectionWasm>(Sec).getSectionName().startswith(".init_array"))
+ if (SectionName.startswith(".init_array"))
continue;
- DataSize = alignTo(DataSize, Section.getAlignment());
- DataSegments.emplace_back();
- WasmDataSegment &Segment = DataSegments.back();
- Segment.Name = Section.getSectionName();
- Segment.Offset = DataSize;
- Segment.Section = &Section;
- addData(Segment.Data, Section);
- Segment.Alignment = Section.getAlignment();
- Segment.Flags = 0;
- DataSize += Segment.Data.size();
- Section.setMemoryOffset(Segment.Offset);
+ // Code is handled separately
+ if (Section.getKind().isText())
+ continue;
+
+ if (Section.isWasmData()) {
+ uint32_t SegmentIndex = DataSegments.size();
+ DataSize = alignTo(DataSize, Section.getAlignment());
+ DataSegments.emplace_back();
+ WasmDataSegment &Segment = DataSegments.back();
+ Segment.Name = SectionName;
+ Segment.Offset = DataSize;
+ Segment.Section = &Section;
+ addData(Segment.Data, Section);
+ Segment.Alignment = Section.getAlignment();
+ Segment.Flags = 0;
+ DataSize += Segment.Data.size();
+ Section.setSegmentIndex(SegmentIndex);
+
+ if (const MCSymbolWasm *C = Section.getGroup()) {
+ Comdats[C->getName()].emplace_back(
+ WasmComdatEntry{wasm::WASM_COMDAT_DATA, SegmentIndex});
+ }
+ } else {
+ // Create custom sections
+ assert(Sec.getKind().isMetadata());
+
+ StringRef Name = SectionName;
+
+ // For user-defined custom sections, strip the prefix
+ if (Name.startswith(".custom_section."))
+ Name = Name.substr(strlen(".custom_section."));
+
+ MCSymbol* Begin = Sec.getBeginSymbol();
+ if (Begin) {
+ WasmIndices[cast<MCSymbolWasm>(Begin)] = CustomSections.size();
+ if (SectionName != Begin->getName())
+ report_fatal_error("section name and begin symbol should match: " +
+ Twine(SectionName));
+ }
+ CustomSections.emplace_back(Name, &Section);
+ }
}
- // Handle regular defined and undefined symbols.
+ // Populate WasmIndices and DataLocations for defined symbols.
for (const MCSymbol &S : Asm.symbols()) {
// Ignore unnamed temporary symbols, which aren't ever exported, imported,
// or used in relocations.
@@ -1153,27 +1214,21 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
continue;
const auto &WS = static_cast<const MCSymbolWasm &>(S);
- DEBUG(dbgs() << "MCSymbol: '" << S << "'"
- << " isDefined=" << S.isDefined() << " isExternal="
- << S.isExternal() << " isTemporary=" << S.isTemporary()
- << " isFunction=" << WS.isFunction()
- << " isWeak=" << WS.isWeak()
- << " isHidden=" << WS.isHidden()
- << " isVariable=" << WS.isVariable() << "\n");
-
- if (WS.isWeak() || WS.isHidden()) {
- uint32_t Flags = (WS.isWeak() ? wasm::WASM_SYMBOL_BINDING_WEAK : 0) |
- (WS.isHidden() ? wasm::WASM_SYMBOL_VISIBILITY_HIDDEN : 0);
- SymbolFlags.emplace_back(WS.getName(), Flags);
- }
+ LLVM_DEBUG(
+ dbgs() << "MCSymbol: " << toString(WS.getType()) << " '" << S << "'"
+ << " isDefined=" << S.isDefined() << " isExternal="
+ << S.isExternal() << " isTemporary=" << S.isTemporary()
+ << " isWeak=" << WS.isWeak() << " isHidden=" << WS.isHidden()
+ << " isVariable=" << WS.isVariable() << "\n");
if (WS.isVariable())
continue;
-
- unsigned Index;
+ if (WS.isComdat() && !WS.isDefined())
+ continue;
if (WS.isFunction()) {
- if (WS.isDefined(/*SetUsed=*/false)) {
+ unsigned Index;
+ if (WS.isDefined()) {
if (WS.getOffset() != 0)
report_fatal_error(
"function sections must contain one function each");
@@ -1182,27 +1237,34 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
report_fatal_error(
"function symbols must have a size set with .size");
- // A definition. Take the next available index.
- Index = NumFuncImports + Functions.size();
-
- // Prepare the function.
+ // A definition. Write out the function body.
+ Index = NumFunctionImports + Functions.size();
WasmFunction Func;
Func.Type = getFunctionType(WS);
Func.Sym = &WS;
- SymbolIndices[&WS] = Index;
+ WasmIndices[&WS] = Index;
Functions.push_back(Func);
+
+ auto &Section = static_cast<MCSectionWasm &>(WS.getSection());
+ if (const MCSymbolWasm *C = Section.getGroup()) {
+ Comdats[C->getName()].emplace_back(
+ WasmComdatEntry{wasm::WASM_COMDAT_FUNCTION, Index});
+ }
} else {
// An import; the index was assigned above.
- Index = SymbolIndices.find(&WS)->second;
+ Index = WasmIndices.find(&WS)->second;
}
- DEBUG(dbgs() << " -> function index: " << Index << "\n");
- } else {
+ LLVM_DEBUG(dbgs() << " -> function index: " << Index << "\n");
+ } else if (WS.isData()) {
if (WS.isTemporary() && !WS.getSize())
continue;
- if (!WS.isDefined(/*SetUsed=*/false))
+ if (!WS.isDefined()) {
+ LLVM_DEBUG(dbgs() << " -> segment index: -1"
+ << "\n");
continue;
+ }
if (!WS.getSize())
report_fatal_error("data symbols must have a size set with .size: " +
@@ -1212,90 +1274,113 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
if (!WS.getSize()->evaluateAsAbsolute(Size, Layout))
report_fatal_error(".size expression must be evaluatable");
- // For each global, prepare a corresponding wasm global holding its
- // address. For externals these will also be named exports.
- Index = NumGlobalImports + Globals.size();
auto &DataSection = static_cast<MCSectionWasm &>(WS.getSection());
-
- WasmGlobal Global;
- Global.Type = PtrType;
- Global.IsMutable = false;
- Global.HasImport = false;
- Global.InitialValue = DataSection.getMemoryOffset() + Layout.getSymbolOffset(WS);
- Global.ImportIndex = 0;
- SymbolIndices[&WS] = Index;
- DEBUG(dbgs() << " -> global index: " << Index << "\n");
- Globals.push_back(Global);
- }
-
- // If the symbol is visible outside this translation unit, export it.
- if (WS.isDefined(/*SetUsed=*/false)) {
- WasmExport Export;
- Export.FieldName = WS.getName();
- Export.Index = Index;
- if (WS.isFunction())
- Export.Kind = wasm::WASM_EXTERNAL_FUNCTION;
- else
- Export.Kind = wasm::WASM_EXTERNAL_GLOBAL;
- DEBUG(dbgs() << " -> export " << Exports.size() << "\n");
- Exports.push_back(Export);
- if (!WS.isExternal())
- SymbolFlags.emplace_back(WS.getName(), wasm::WASM_SYMBOL_BINDING_LOCAL);
+ assert(DataSection.isWasmData());
+
+ // For each data symbol, export it in the symtab as a reference to the
+ // corresponding Wasm data segment.
+ wasm::WasmDataReference Ref = wasm::WasmDataReference{
+ DataSection.getSegmentIndex(),
+ static_cast<uint32_t>(Layout.getSymbolOffset(WS)),
+ static_cast<uint32_t>(Size)};
+ DataLocations[&WS] = Ref;
+ LLVM_DEBUG(dbgs() << " -> segment index: " << Ref.Segment << "\n");
+ } else if (WS.isGlobal()) {
+ // A "true" Wasm global (currently just __stack_pointer)
+ if (WS.isDefined())
+ report_fatal_error("don't yet support defined globals");
+
+ // An import; the index was assigned above
+ LLVM_DEBUG(dbgs() << " -> global index: "
+ << WasmIndices.find(&WS)->second << "\n");
+ } else {
+ assert(WS.isSection());
}
}
- // Handle weak aliases. We need to process these in a separate pass because
- // we need to have processed the target of the alias before the alias itself
- // and the symbols are not necessarily ordered in this way.
+ // Populate WasmIndices and DataLocations for aliased symbols. We need to
+ // process these in a separate pass because we need to have processed the
+ // target of the alias before the alias itself and the symbols are not
+ // necessarily ordered in this way.
for (const MCSymbol &S : Asm.symbols()) {
if (!S.isVariable())
continue;
- assert(S.isDefined(/*SetUsed=*/false));
+ assert(S.isDefined());
// Find the target symbol of this weak alias and export that index
const auto &WS = static_cast<const MCSymbolWasm &>(S);
const MCSymbolWasm *ResolvedSym = ResolveSymbol(WS);
- DEBUG(dbgs() << WS.getName() << ": weak alias of '" << *ResolvedSym << "'\n");
- assert(SymbolIndices.count(ResolvedSym) > 0);
- uint32_t Index = SymbolIndices.find(ResolvedSym)->second;
- DEBUG(dbgs() << " -> index:" << Index << "\n");
-
- WasmExport Export;
- Export.FieldName = WS.getName();
- Export.Index = Index;
- if (WS.isFunction())
- Export.Kind = wasm::WASM_EXTERNAL_FUNCTION;
- else
- Export.Kind = wasm::WASM_EXTERNAL_GLOBAL;
- DEBUG(dbgs() << " -> export " << Exports.size() << "\n");
- Exports.push_back(Export);
+ LLVM_DEBUG(dbgs() << WS.getName() << ": weak alias of '" << *ResolvedSym
+ << "'\n");
- if (!WS.isExternal())
- SymbolFlags.emplace_back(WS.getName(), wasm::WASM_SYMBOL_BINDING_LOCAL);
+ if (WS.isFunction()) {
+ assert(WasmIndices.count(ResolvedSym) > 0);
+ uint32_t WasmIndex = WasmIndices.find(ResolvedSym)->second;
+ WasmIndices[&WS] = WasmIndex;
+ LLVM_DEBUG(dbgs() << " -> index:" << WasmIndex << "\n");
+ } else if (WS.isData()) {
+ assert(DataLocations.count(ResolvedSym) > 0);
+ const wasm::WasmDataReference &Ref =
+ DataLocations.find(ResolvedSym)->second;
+ DataLocations[&WS] = Ref;
+ LLVM_DEBUG(dbgs() << " -> index:" << Ref.Segment << "\n");
+ } else {
+ report_fatal_error("don't yet support global aliases");
+ }
+ }
+
+ // Finally, populate the symbol table itself, in its "natural" order.
+ for (const MCSymbol &S : Asm.symbols()) {
+ const auto &WS = static_cast<const MCSymbolWasm &>(S);
+ if (!isInSymtab(WS)) {
+ WS.setIndex(INVALID_INDEX);
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "adding to symtab: " << WS << "\n");
+
+ uint32_t Flags = 0;
+ if (WS.isWeak())
+ Flags |= wasm::WASM_SYMBOL_BINDING_WEAK;
+ if (WS.isHidden())
+ Flags |= wasm::WASM_SYMBOL_VISIBILITY_HIDDEN;
+ if (!WS.isExternal() && WS.isDefined())
+ Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL;
+ if (WS.isUndefined())
+ Flags |= wasm::WASM_SYMBOL_UNDEFINED;
+
+ wasm::WasmSymbolInfo Info;
+ Info.Name = WS.getName();
+ Info.Kind = WS.getType();
+ Info.Flags = Flags;
+ if (!WS.isData()) {
+ assert(WasmIndices.count(&WS) > 0);
+ Info.ElementIndex = WasmIndices.find(&WS)->second;
+ } else if (WS.isDefined()) {
+ assert(DataLocations.count(&WS) > 0);
+ Info.DataRef = DataLocations.find(&WS)->second;
+ }
+ WS.setIndex(SymbolInfos.size());
+ SymbolInfos.emplace_back(Info);
}
{
auto HandleReloc = [&](const WasmRelocationEntry &Rel) {
- // Functions referenced by a relocation need to prepared to be called
- // indirectly.
- const MCSymbolWasm& WS = *Rel.Symbol;
- if (WS.isFunction() && IndirectSymbolIndices.count(&WS) == 0) {
- switch (Rel.Type) {
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: {
- uint32_t Index = SymbolIndices.find(&WS)->second;
- IndirectSymbolIndices[&WS] = TableElems.size();
- DEBUG(dbgs() << " -> adding to table: " << TableElems.size() << "\n");
- TableElems.push_back(Index);
- registerFunctionType(WS);
- break;
- }
- default:
- break;
- }
+ // Functions referenced by a relocation need to be put in the table. This is
+ // purely to make the object file's provisional values readable, and is
+ // ignored by the linker, which re-calculates the relocations itself.
+ if (Rel.Type != wasm::R_WEBASSEMBLY_TABLE_INDEX_I32 &&
+ Rel.Type != wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB)
+ return;
+ assert(Rel.Symbol->isFunction());
+ const MCSymbolWasm &WS = *ResolveSymbol(*Rel.Symbol);
+ uint32_t FunctionIndex = WasmIndices.find(&WS)->second;
+ uint32_t TableIndex = TableElems.size() + kInitialTableOffset;
+ if (TableIndices.try_emplace(&WS, TableIndex).second) {
+ LLVM_DEBUG(dbgs() << " -> adding " << WS.getName()
+ << " to table: " << TableIndex << "\n");
+ TableElems.push_back(FunctionIndex);
+ registerFunctionType(WS);
}
};
@@ -1314,21 +1399,35 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
continue;
if (WS.getFragmentList().empty())
continue;
- if (WS.getFragmentList().size() != 2)
+
+ // init_array is expected to contain a single non-empty data fragment
+ if (WS.getFragmentList().size() != 3)
report_fatal_error("only one .init_array section fragment supported");
- const MCFragment &AlignFrag = *WS.begin();
+
+ auto IT = WS.begin();
+ const MCFragment &EmptyFrag = *IT;
+ if (EmptyFrag.getKind() != MCFragment::FT_Data)
+ report_fatal_error(".init_array section should be aligned");
+
+ IT = std::next(IT);
+ const MCFragment &AlignFrag = *IT;
if (AlignFrag.getKind() != MCFragment::FT_Align)
report_fatal_error(".init_array section should be aligned");
if (cast<MCAlignFragment>(AlignFrag).getAlignment() != (is64Bit() ? 8 : 4))
report_fatal_error(".init_array section should be aligned for pointers");
- const MCFragment &Frag = *std::next(WS.begin());
+
+ const MCFragment &Frag = *std::next(IT);
if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
report_fatal_error("only data supported in .init_array section");
+
uint16_t Priority = UINT16_MAX;
- if (WS.getSectionName().size() != 11) {
- if (WS.getSectionName()[11] != '.')
+ unsigned PrefixLength = strlen(".init_array");
+ if (WS.getSectionName().size() > PrefixLength) {
+ if (WS.getSectionName()[PrefixLength] != '.')
report_fatal_error(".init_array section priority should start with '.'");
- if (WS.getSectionName().substr(12).getAsInteger(10, Priority))
+ if (WS.getSectionName()
+ .substr(PrefixLength + 1)
+ .getAsInteger(10, Priority))
report_fatal_error("invalid .init_array section priority");
}
const auto &DataFrag = cast<MCDataFragment>(Frag);
@@ -1347,11 +1446,10 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
report_fatal_error("fixups in .init_array should be symbol references");
if (Sym->getKind() != MCSymbolRefExpr::VK_WebAssembly_FUNCTION)
report_fatal_error("symbols in .init_array should be for functions");
- auto I = SymbolIndices.find(cast<MCSymbolWasm>(&Sym->getSymbol()));
- if (I == SymbolIndices.end())
- report_fatal_error("symbols in .init_array should be defined");
- uint32_t Index = I->second;
- InitFuncs.push_back(std::make_pair(Priority, Index));
+ if (Sym->getSymbol().getIndex() == INVALID_INDEX)
+ report_fatal_error("symbols in .init_array should exist in symbtab");
+ InitFuncs.push_back(
+ std::make_pair(Priority, Sym->getSymbol().getIndex()));
}
}
@@ -1367,22 +1465,19 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
writeExportSection(Exports);
writeElemSection(TableElems);
writeCodeSection(Asm, Layout, Functions);
- writeDataSection(DataSegments);
- writeNameSection(Functions, Imports, NumFuncImports);
- writeCodeRelocSection();
- writeDataRelocSection();
- writeLinkingMetaDataSection(DataSegments, DataSize, SymbolFlags,
- InitFuncs);
+ writeDataSection();
+ writeCustomSections(Asm, Layout);
+ writeLinkingMetaDataSection(SymbolInfos, InitFuncs, Comdats);
+ writeRelocSection(CodeSectionIndex, "CODE", CodeRelocations);
+ writeRelocSection(DataSectionIndex, "DATA", DataRelocations);
+ writeCustomRelocSections();
// TODO: Translate the .comment section to the output.
- // TODO: Translate debug sections to the output.
+ return W.OS.tell() - StartOffset;
}
std::unique_ptr<MCObjectWriter>
llvm::createWasmObjectWriter(std::unique_ptr<MCWasmObjectTargetWriter> MOTW,
raw_pwrite_stream &OS) {
- // FIXME: Can't use make_unique<WasmObjectWriter>(...) as WasmObjectWriter's
- // destructor is private. Is that necessary?
- return std::unique_ptr<MCObjectWriter>(
- new WasmObjectWriter(std::move(MOTW), OS));
+ return llvm::make_unique<WasmObjectWriter>(std::move(MOTW), OS);
}
diff --git a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 9f1db46939c7..9ffecd99df68 100644
--- a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -125,6 +125,8 @@ public:
class WinCOFFObjectWriter : public MCObjectWriter {
public:
+ support::endian::Writer W;
+
using symbols = std::vector<std::unique_ptr<COFFSymbol>>;
using sections = std::vector<std::unique_ptr<COFFSection>>;
@@ -204,7 +206,7 @@ public:
void assignSectionNumbers();
void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout);
- void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+ uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
};
} // end anonymous namespace
@@ -225,7 +227,7 @@ void COFFSymbol::set_name_offset(uint32_t Offset) {
WinCOFFObjectWriter::WinCOFFObjectWriter(
std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS)
- : MCObjectWriter(OS, true), TargetObjectWriter(std::move(MOTW)) {
+ : W(OS, support::little), TargetObjectWriter(std::move(MOTW)) {
Header.Machine = TargetObjectWriter->getMachine();
}
@@ -472,40 +474,40 @@ bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) {
void WinCOFFObjectWriter::WriteFileHeader(const COFF::header &Header) {
if (UseBigObj) {
- writeLE16(COFF::IMAGE_FILE_MACHINE_UNKNOWN);
- writeLE16(0xFFFF);
- writeLE16(COFF::BigObjHeader::MinBigObjectVersion);
- writeLE16(Header.Machine);
- writeLE32(Header.TimeDateStamp);
- writeBytes(StringRef(COFF::BigObjMagic, sizeof(COFF::BigObjMagic)));
- writeLE32(0);
- writeLE32(0);
- writeLE32(0);
- writeLE32(0);
- writeLE32(Header.NumberOfSections);
- writeLE32(Header.PointerToSymbolTable);
- writeLE32(Header.NumberOfSymbols);
+ W.write<uint16_t>(COFF::IMAGE_FILE_MACHINE_UNKNOWN);
+ W.write<uint16_t>(0xFFFF);
+ W.write<uint16_t>(COFF::BigObjHeader::MinBigObjectVersion);
+ W.write<uint16_t>(Header.Machine);
+ W.write<uint32_t>(Header.TimeDateStamp);
+ W.OS.write(COFF::BigObjMagic, sizeof(COFF::BigObjMagic));
+ W.write<uint32_t>(0);
+ W.write<uint32_t>(0);
+ W.write<uint32_t>(0);
+ W.write<uint32_t>(0);
+ W.write<uint32_t>(Header.NumberOfSections);
+ W.write<uint32_t>(Header.PointerToSymbolTable);
+ W.write<uint32_t>(Header.NumberOfSymbols);
} else {
- writeLE16(Header.Machine);
- writeLE16(static_cast<int16_t>(Header.NumberOfSections));
- writeLE32(Header.TimeDateStamp);
- writeLE32(Header.PointerToSymbolTable);
- writeLE32(Header.NumberOfSymbols);
- writeLE16(Header.SizeOfOptionalHeader);
- writeLE16(Header.Characteristics);
+ W.write<uint16_t>(Header.Machine);
+ W.write<uint16_t>(static_cast<int16_t>(Header.NumberOfSections));
+ W.write<uint32_t>(Header.TimeDateStamp);
+ W.write<uint32_t>(Header.PointerToSymbolTable);
+ W.write<uint32_t>(Header.NumberOfSymbols);
+ W.write<uint16_t>(Header.SizeOfOptionalHeader);
+ W.write<uint16_t>(Header.Characteristics);
}
}
void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol &S) {
- writeBytes(StringRef(S.Data.Name, COFF::NameSize));
- writeLE32(S.Data.Value);
+ W.OS.write(S.Data.Name, COFF::NameSize);
+ W.write<uint32_t>(S.Data.Value);
if (UseBigObj)
- writeLE32(S.Data.SectionNumber);
+ W.write<uint32_t>(S.Data.SectionNumber);
else
- writeLE16(static_cast<int16_t>(S.Data.SectionNumber));
- writeLE16(S.Data.Type);
- write8(S.Data.StorageClass);
- write8(S.Data.NumberOfAuxSymbols);
+ W.write<uint16_t>(static_cast<int16_t>(S.Data.SectionNumber));
+ W.write<uint16_t>(S.Data.Type);
+ W.OS << char(S.Data.StorageClass);
+ W.OS << char(S.Data.NumberOfAuxSymbols);
WriteAuxiliarySymbols(S.Aux);
}
@@ -514,46 +516,45 @@ void WinCOFFObjectWriter::WriteAuxiliarySymbols(
for (const AuxSymbol &i : S) {
switch (i.AuxType) {
case ATFunctionDefinition:
- writeLE32(i.Aux.FunctionDefinition.TagIndex);
- writeLE32(i.Aux.FunctionDefinition.TotalSize);
- writeLE32(i.Aux.FunctionDefinition.PointerToLinenumber);
- writeLE32(i.Aux.FunctionDefinition.PointerToNextFunction);
- WriteZeros(sizeof(i.Aux.FunctionDefinition.unused));
+ W.write<uint32_t>(i.Aux.FunctionDefinition.TagIndex);
+ W.write<uint32_t>(i.Aux.FunctionDefinition.TotalSize);
+ W.write<uint32_t>(i.Aux.FunctionDefinition.PointerToLinenumber);
+ W.write<uint32_t>(i.Aux.FunctionDefinition.PointerToNextFunction);
+ W.OS.write_zeros(sizeof(i.Aux.FunctionDefinition.unused));
if (UseBigObj)
- WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
+ W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
case ATbfAndefSymbol:
- WriteZeros(sizeof(i.Aux.bfAndefSymbol.unused1));
- writeLE16(i.Aux.bfAndefSymbol.Linenumber);
- WriteZeros(sizeof(i.Aux.bfAndefSymbol.unused2));
- writeLE32(i.Aux.bfAndefSymbol.PointerToNextFunction);
- WriteZeros(sizeof(i.Aux.bfAndefSymbol.unused3));
+ W.OS.write_zeros(sizeof(i.Aux.bfAndefSymbol.unused1));
+ W.write<uint16_t>(i.Aux.bfAndefSymbol.Linenumber);
+ W.OS.write_zeros(sizeof(i.Aux.bfAndefSymbol.unused2));
+ W.write<uint32_t>(i.Aux.bfAndefSymbol.PointerToNextFunction);
+ W.OS.write_zeros(sizeof(i.Aux.bfAndefSymbol.unused3));
if (UseBigObj)
- WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
+ W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
case ATWeakExternal:
- writeLE32(i.Aux.WeakExternal.TagIndex);
- writeLE32(i.Aux.WeakExternal.Characteristics);
- WriteZeros(sizeof(i.Aux.WeakExternal.unused));
+ W.write<uint32_t>(i.Aux.WeakExternal.TagIndex);
+ W.write<uint32_t>(i.Aux.WeakExternal.Characteristics);
+ W.OS.write_zeros(sizeof(i.Aux.WeakExternal.unused));
if (UseBigObj)
- WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
+ W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
case ATFile:
- writeBytes(
- StringRef(reinterpret_cast<const char *>(&i.Aux),
- UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size));
+ W.OS.write(reinterpret_cast<const char *>(&i.Aux),
+ UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size);
break;
case ATSectionDefinition:
- writeLE32(i.Aux.SectionDefinition.Length);
- writeLE16(i.Aux.SectionDefinition.NumberOfRelocations);
- writeLE16(i.Aux.SectionDefinition.NumberOfLinenumbers);
- writeLE32(i.Aux.SectionDefinition.CheckSum);
- writeLE16(static_cast<int16_t>(i.Aux.SectionDefinition.Number));
- write8(i.Aux.SectionDefinition.Selection);
- WriteZeros(sizeof(i.Aux.SectionDefinition.unused));
- writeLE16(static_cast<int16_t>(i.Aux.SectionDefinition.Number >> 16));
+ W.write<uint32_t>(i.Aux.SectionDefinition.Length);
+ W.write<uint16_t>(i.Aux.SectionDefinition.NumberOfRelocations);
+ W.write<uint16_t>(i.Aux.SectionDefinition.NumberOfLinenumbers);
+ W.write<uint32_t>(i.Aux.SectionDefinition.CheckSum);
+ W.write<uint16_t>(static_cast<int16_t>(i.Aux.SectionDefinition.Number));
+ W.OS << char(i.Aux.SectionDefinition.Selection);
+ W.OS.write_zeros(sizeof(i.Aux.SectionDefinition.unused));
+ W.write<uint16_t>(static_cast<int16_t>(i.Aux.SectionDefinition.Number >> 16));
if (UseBigObj)
- WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
+ W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
}
}
@@ -567,10 +568,10 @@ void WinCOFFObjectWriter::writeSectionHeaders() {
std::vector<COFFSection *> Arr;
for (auto &Section : Sections)
Arr.push_back(Section.get());
- std::sort(Arr.begin(), Arr.end(),
- [](const COFFSection *A, const COFFSection *B) {
- return A->Number < B->Number;
- });
+ llvm::sort(Arr.begin(), Arr.end(),
+ [](const COFFSection *A, const COFFSection *B) {
+ return A->Number < B->Number;
+ });
for (auto &Section : Arr) {
if (Section->Number == -1)
@@ -579,23 +580,23 @@ void WinCOFFObjectWriter::writeSectionHeaders() {
COFF::section &S = Section->Header;
if (Section->Relocations.size() >= 0xffff)
S.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
- writeBytes(StringRef(S.Name, COFF::NameSize));
- writeLE32(S.VirtualSize);
- writeLE32(S.VirtualAddress);
- writeLE32(S.SizeOfRawData);
- writeLE32(S.PointerToRawData);
- writeLE32(S.PointerToRelocations);
- writeLE32(S.PointerToLineNumbers);
- writeLE16(S.NumberOfRelocations);
- writeLE16(S.NumberOfLineNumbers);
- writeLE32(S.Characteristics);
+ W.OS.write(S.Name, COFF::NameSize);
+ W.write<uint32_t>(S.VirtualSize);
+ W.write<uint32_t>(S.VirtualAddress);
+ W.write<uint32_t>(S.SizeOfRawData);
+ W.write<uint32_t>(S.PointerToRawData);
+ W.write<uint32_t>(S.PointerToRelocations);
+ W.write<uint32_t>(S.PointerToLineNumbers);
+ W.write<uint16_t>(S.NumberOfRelocations);
+ W.write<uint16_t>(S.NumberOfLineNumbers);
+ W.write<uint32_t>(S.Characteristics);
}
}
void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
- writeLE32(R.VirtualAddress);
- writeLE32(R.SymbolTableIndex);
- writeLE16(R.Type);
+ W.write<uint32_t>(R.VirtualAddress);
+ W.write<uint32_t>(R.SymbolTableIndex);
+ W.write<uint16_t>(R.Type);
}
// Write MCSec's contents. What this function does is essentially
@@ -608,18 +609,10 @@ uint32_t WinCOFFObjectWriter::writeSectionContents(MCAssembler &Asm,
// to CRC the data before we dump it into the object file.
SmallVector<char, 128> Buf;
raw_svector_ostream VecOS(Buf);
- raw_pwrite_stream &OldStream = getStream();
-
- // Redirect the output stream to our buffer and fill our buffer with
- // the section data.
- setStream(VecOS);
- Asm.writeSectionData(&MCSec, Layout);
-
- // Reset the stream back to what it was before.
- setStream(OldStream);
+ Asm.writeSectionData(VecOS, &MCSec, Layout);
// Write the section contents to the object file.
- getStream() << Buf;
+ W.OS << Buf;
// Calculate our CRC with an initial value of '0', this is not how
// JamCRC is specified but it aligns with the expected output.
@@ -637,13 +630,13 @@ void WinCOFFObjectWriter::writeSection(MCAssembler &Asm,
// Write the section contents.
if (Sec.Header.PointerToRawData != 0) {
- assert(getStream().tell() <= Sec.Header.PointerToRawData &&
+ assert(W.OS.tell() <= Sec.Header.PointerToRawData &&
"Section::PointerToRawData is insane!");
- unsigned PaddingSize = Sec.Header.PointerToRawData - getStream().tell();
+ unsigned PaddingSize = Sec.Header.PointerToRawData - W.OS.tell();
assert(PaddingSize < 4 &&
"Should only need at most three bytes of padding!");
- WriteZeros(PaddingSize);
+ W.OS.write_zeros(PaddingSize);
uint32_t CRC = writeSectionContents(Asm, Layout, MCSec);
@@ -662,7 +655,7 @@ void WinCOFFObjectWriter::writeSection(MCAssembler &Asm,
return;
}
- assert(getStream().tell() == Sec.Header.PointerToRelocations &&
+ assert(W.OS.tell() == Sec.Header.PointerToRelocations &&
"Section::PointerToRelocations is insane!");
if (Sec.Relocations.size() >= 0xffff) {
@@ -697,12 +690,14 @@ void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
const MCAssembler &Asm, const MCSymbol &SymA, const MCFragment &FB,
bool InSet, bool IsPCRel) const {
- // MS LINK expects to be able to replace all references to a function with a
- // thunk to implement their /INCREMENTAL feature. Make sure we don't optimize
- // away any relocations to functions.
+ // Don't drop relocations between functions, even if they are in the same text
+ // section. Multiple Visual C++ linker features depend on having the
+ // relocations present. The /INCREMENTAL flag will cause these relocations to
+ // point to thunks, and the /GUARD:CF flag assumes that it can use relocations
+ // to approximate the set of all address-taken functions. LLD's implementation
+ // of /GUARD:CF also relies on the existence of these relocations.
uint16_t Type = cast<MCSymbolCOFF>(SymA).getType();
- if (Asm.isIncrementalLinkerCompatible() &&
- (Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION)
+ if ((Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION)
return false;
return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB,
InSet, IsPCRel);
@@ -906,7 +901,7 @@ void WinCOFFObjectWriter::assignSectionNumbers() {
// Assign file offsets to COFF object file structures.
void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm,
const MCAsmLayout &Layout) {
- unsigned Offset = getInitialOffset();
+ unsigned Offset = W.OS.tell();
Offset += UseBigObj ? COFF::Header32Size : COFF::Header16Size;
Offset += COFF::SectionSize * Header.NumberOfSections;
@@ -967,8 +962,10 @@ void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm,
Header.PointerToSymbolTable = Offset;
}
-void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
+uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ uint64_t StartOffset = W.OS.tell();
+
if (Sections.size() > INT32_MAX)
report_fatal_error(
"PE COFF object files can't have more than 2147483647 sections");
@@ -1064,7 +1061,7 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
for (; I != IE && J != JE; ++I, ++J)
writeSection(Asm, Layout, **I, *J);
- assert(getStream().tell() == Header.PointerToSymbolTable &&
+ assert(W.OS.tell() == Header.PointerToSymbolTable &&
"Header::PointerToSymbolTable is insane!");
// Write a symbol table.
@@ -1073,7 +1070,9 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
WriteSymbol(*Symbol);
// Write a string table, which completes the entire COFF file.
- Strings.write(getStream());
+ Strings.write(W.OS);
+
+ return W.OS.tell() - StartOffset;
}
MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_)
diff --git a/contrib/llvm/lib/Object/Archive.cpp b/contrib/llvm/lib/Object/Archive.cpp
index b17eefd220b8..8ec115a5566c 100644
--- a/contrib/llvm/lib/Object/Archive.cpp
+++ b/contrib/llvm/lib/Object/Archive.cpp
@@ -175,15 +175,19 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const {
"the end of the string table for archive member "
"header at offset " + Twine(ArchiveOffset));
}
- const char *addr = Parent->getStringTable().begin() + StringOffset;
// GNU long file names end with a "/\n".
if (Parent->kind() == Archive::K_GNU ||
Parent->kind() == Archive::K_GNU64) {
- StringRef::size_type End = StringRef(addr).find('\n');
- return StringRef(addr, End - 1);
+ size_t End = Parent->getStringTable().find('\n', /*From=*/StringOffset);
+ if (End == StringRef::npos || End < 1 ||
+ Parent->getStringTable()[End - 1] != '/') {
+ return malformedError("string table at long name offset " +
+ Twine(StringOffset) + " not terminated");
+ }
+ return Parent->getStringTable().slice(StringOffset, End - 1);
}
- return addr;
+ return Parent->getStringTable().begin() + StringOffset;
}
if (Name.startswith("#1/")) {
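To illustrate the GNU long-name lookup above: the archive string table is a sequence of names, each terminated by "/\n", and a member's extended name field stores a byte offset into that table. A standalone sketch of the same slice logic (helper name assumed):

#include "llvm/ADT/StringRef.h"

// Given a table such as "longdirname/foo.o/\nbar.o/\n", offset 0 resolves to
// "longdirname/foo.o" and offset 19 resolves to "bar.o".
static llvm::StringRef lookupGNULongName(llvm::StringRef StrTab, size_t Offset) {
  size_t End = StrTab.find('\n', Offset);
  if (End == llvm::StringRef::npos || End < 1 || StrTab[End - 1] != '/')
    return llvm::StringRef();          // malformed; the real code reports an error
  return StrTab.slice(Offset, End - 1);
}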
diff --git a/contrib/llvm/lib/Object/ArchiveWriter.cpp b/contrib/llvm/lib/Object/ArchiveWriter.cpp
index b3b812daae2e..ea17b2220a0b 100644
--- a/contrib/llvm/lib/Object/ArchiveWriter.cpp
+++ b/contrib/llvm/lib/Object/ArchiveWriter.cpp
@@ -35,15 +35,6 @@
using namespace llvm;
-// The SYM64 format is used when an archive's member offsets are larger than
-// 32-bits can hold. The need for this shift in format is detected by
-// writeArchive. To test this we need to generate a file with a member that has
-// an offset larger than 32-bits but this demands a very slow test. To speed
-// the test up we use this flag to pretend like the cutoff happens before
-// 32-bits and instead happens at some much smaller value.
-static cl::opt<int> Sym64Threshold("sym64-threshold", cl::Hidden,
- cl::init(32));
-
NewArchiveMember::NewArchiveMember(MemoryBufferRef BufRef)
: Buf(MemoryBuffer::getMemBuffer(BufRef, false)),
MemberName(BufRef.getBufferIdentifier()) {}
@@ -145,10 +136,8 @@ static bool isBSDLike(object::Archive::Kind Kind) {
template <class T>
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val) {
- if (isBSDLike(Kind))
- support::endian::Writer<support::little>(Out).write(Val);
- else
- support::endian::Writer<support::big>(Out).write(Val);
+ support::endian::write(Out, Val,
+ isBSDLike(Kind) ? support::little : support::big);
}
static void printRestOfMemberHeader(
@@ -216,7 +205,7 @@ static std::string computeRelativePath(StringRef From, StringRef To) {
for (auto ToE = sys::path::end(To); ToI != ToE; ++ToI)
sys::path::append(Relative, *ToI);
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
// Replace backslashes with slashes so that the path is portable between *nix
// and Windows.
std::replace(Relative.begin(), Relative.end(), '\\', '/');
@@ -305,8 +294,7 @@ static bool isArchiveSymbol(const object::BasicSymbolRef &S) {
return false;
if (!(Symflags & object::SymbolRef::SF_Global))
return false;
- if (Symflags & object::SymbolRef::SF_Undefined &&
- !(Symflags & object::SymbolRef::SF_Indirect))
+ if (Symflags & object::SymbolRef::SF_Undefined)
return false;
return true;
}
@@ -490,6 +478,19 @@ Error llvm::writeArchive(StringRef ArcName,
// We assume 32-bit symbols to see if 32-bit symbols are possible or not.
MaxOffset += M.Symbols.size() * 4;
}
+
+ // The SYM64 format is used when an archive's member offsets are larger than
+ // 32-bits can hold. The need for this shift in format is detected by
+ // writeArchive. To test this we need to generate a file with a member that
+ // has an offset larger than 32-bits but this demands a very slow test. To
+ // speed the test up we use this environment variable to pretend that the
+ // cutoff happens before 32-bits and instead happens at some much smaller
+ // value.
+ const char *Sym64Env = std::getenv("SYM64_THRESHOLD");
+ int Sym64Threshold = 32;
+ if (Sym64Env)
+ StringRef(Sym64Env).getAsInteger(10, Sym64Threshold);
+
// If LastOffset isn't going to fit in a 32-bit variable we need to switch
// to 64-bit. Note that the file can be larger than 4GB as long as the last
// member starts before the 4GB offset.
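A small sketch of the cutoff implied by the comment above (the exact comparison is an assumption made for illustration): lowering SYM64_THRESHOLD to, say, 10 makes any archive whose member offsets reach 2^10 bytes use the 64-bit SYM64 symbol table, which keeps the regression test fast.

#include <cstdint>

// Returns true when offsets no longer fit below the 2^Threshold boundary and the
// archive writer should emit a SYM64 symbol table.
static bool needsSym64(uint64_t LastMemberOffset, unsigned Sym64Threshold = 32) {
  return (LastMemberOffset >> Sym64Threshold) != 0;
}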
diff --git a/contrib/llvm/lib/Object/Binary.cpp b/contrib/llvm/lib/Object/Binary.cpp
index c4565db459e6..d7c25921ec36 100644
--- a/contrib/llvm/lib/Object/Binary.cpp
+++ b/contrib/llvm/lib/Object/Binary.cpp
@@ -75,6 +75,9 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
return MachOUniversalBinary::create(Buffer);
case file_magic::windows_resource:
return WindowsResource::createWindowsResource(Buffer);
+ case file_magic::pdb:
+ // PDB does not support the Binary interface.
+ return errorCodeToError(object_error::invalid_file_type);
case file_magic::unknown:
case file_magic::coff_cl_gl_object:
// Unrecognized object file format.
diff --git a/contrib/llvm/lib/Object/COFFImportFile.cpp b/contrib/llvm/lib/Object/COFFImportFile.cpp
index 93631f1ad811..dc11cc4bcffe 100644
--- a/contrib/llvm/lib/Object/COFFImportFile.cpp
+++ b/contrib/llvm/lib/Object/COFFImportFile.cpp
@@ -91,7 +91,15 @@ static void writeStringTable(std::vector<uint8_t> &B,
}
static ImportNameType getNameType(StringRef Sym, StringRef ExtName,
- MachineTypes Machine) {
+ MachineTypes Machine, bool MinGW) {
+ // A decorated stdcall function in MSVC is exported with the
+ // type IMPORT_NAME, and the exported function name includes the
+ // leading underscore. In MinGW, on the other hand, a decorated
+ // stdcall function still omits the underscore (IMPORT_NAME_NOPREFIX).
+ // See the comment in isDecorated in COFFModuleDefinition.cpp for more
+ // details.
+ if (ExtName.startswith("_") && ExtName.contains('@') && !MinGW)
+ return IMPORT_NAME;
if (Sym != ExtName)
return IMPORT_NAME_UNDECORATE;
if (Machine == IMAGE_FILE_MACHINE_I386 && Sym.startswith("_"))
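A minimal restatement of the decision order above as a standalone sketch (the enum and helper are illustrative, not the library's API): with MinGW=false, a decorated stdcall export such as "_Func@8" keeps its name verbatim, while with MinGW=true the new rule is skipped and the existing i386 rule strips the leading underscore instead.

#include "llvm/ADT/StringRef.h"

enum class ImportNameKind { Name, NoPrefix, Undecorate };

static ImportNameKind pickNameType(llvm::StringRef Sym, llvm::StringRef ExtName,
                                   bool IsI386, bool MinGW) {
  if (ExtName.startswith("_") && ExtName.contains('@') && !MinGW)
    return ImportNameKind::Name;        // MSVC: keep the decorated stdcall name
  if (Sym != ExtName)
    return ImportNameKind::Undecorate;
  if (IsI386 && Sym.startswith("_"))
    return ImportNameKind::NoPrefix;    // leading underscore re-added at load time
  return ImportNameKind::Name;
}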
@@ -538,7 +546,12 @@ NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
u16(0),
IMAGE_SYM_CLASS_WEAK_EXTERNAL,
1},
- {{{2, 0, 0, 0, 3, 0, 0, 0}}, u32(0), u16(0), u16(0), uint8_t(0), 0},
+ {{{2, 0, 0, 0, IMAGE_WEAK_EXTERN_SEARCH_ALIAS, 0, 0, 0}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_NULL,
+ 0},
};
SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t);
@@ -558,7 +571,7 @@ NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
Error writeImportLibrary(StringRef ImportName, StringRef Path,
ArrayRef<COFFShortExport> Exports,
- MachineTypes Machine, bool MakeWeakAliases) {
+ MachineTypes Machine, bool MinGW) {
std::vector<NewArchiveMember> Members;
ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine);
@@ -576,12 +589,6 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
if (E.Private)
continue;
- if (E.isWeak() && MakeWeakAliases) {
- Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false));
- Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true));
- continue;
- }
-
ImportType ImportType = IMPORT_CODE;
if (E.Data)
ImportType = IMPORT_DATA;
@@ -589,7 +596,7 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
ImportType = IMPORT_CONST;
StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName;
- ImportNameType NameType = getNameType(SymbolName, E.Name, Machine);
+ ImportNameType NameType = getNameType(SymbolName, E.Name, Machine, MinGW);
Expected<std::string> Name = E.ExtName.empty()
? SymbolName
: replace(SymbolName, E.Name, E.ExtName);
@@ -597,6 +604,12 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
if (!Name)
return Name.takeError();
+ if (!E.AliasTarget.empty() && *Name != E.AliasTarget) {
+ Members.push_back(OF.createWeakExternal(E.AliasTarget, *Name, false));
+ Members.push_back(OF.createWeakExternal(E.AliasTarget, *Name, true));
+ continue;
+ }
+
Members.push_back(
OF.createShortImport(*Name, E.Ordinal, ImportType, NameType));
}
diff --git a/contrib/llvm/lib/Object/COFFModuleDefinition.cpp b/contrib/llvm/lib/Object/COFFModuleDefinition.cpp
index a571354648d6..c703071b86e0 100644
--- a/contrib/llvm/lib/Object/COFFModuleDefinition.cpp
+++ b/contrib/llvm/lib/Object/COFFModuleDefinition.cpp
@@ -37,6 +37,7 @@ enum Kind {
Identifier,
Comma,
Equal,
+ EqualEqual,
KwBase,
KwConstant,
KwData,
@@ -104,9 +105,10 @@ public:
}
case '=':
Buf = Buf.drop_front();
- // GNU dlltool accepts both = and ==.
- if (Buf.startswith("="))
+ if (Buf.startswith("=")) {
Buf = Buf.drop_front();
+ return Token(EqualEqual, "==");
+ }
return Token(Equal, "=");
case ',':
Buf = Buf.drop_front();
@@ -282,6 +284,13 @@ private:
E.Private = true;
continue;
}
+ if (Tok.K == EqualEqual) {
+ read();
+ E.AliasTarget = Tok.Value;
+ if (Machine == IMAGE_FILE_MACHINE_I386 && !isDecorated(E.AliasTarget, MingwDef))
+ E.AliasTarget = std::string("_").append(E.AliasTarget);
+ continue;
+ }
unget();
Info.Exports.push_back(E);
return Error::success();
diff --git a/contrib/llvm/lib/Object/COFFObjectFile.cpp b/contrib/llvm/lib/Object/COFFObjectFile.cpp
index b544fa5c1470..d72da3187e07 100644
--- a/contrib/llvm/lib/Object/COFFObjectFile.cpp
+++ b/contrib/llvm/lib/Object/COFFObjectFile.cpp
@@ -217,10 +217,10 @@ uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const {
if (Symb.isExternal() || Symb.isWeakExternal())
Result |= SymbolRef::SF_Global;
- if (Symb.isWeakExternal()) {
+ if (const coff_aux_weak_external *AWE = Symb.getWeakExternal()) {
Result |= SymbolRef::SF_Weak;
- // We use indirect to allow the archiver to write weak externs
- Result |= SymbolRef::SF_Indirect;
+ if (AWE->Characteristics != COFF::IMAGE_WEAK_EXTERN_SEARCH_ALIAS)
+ Result |= SymbolRef::SF_Undefined;
}
if (Symb.getSectionNumber() == COFF::IMAGE_SYM_ABSOLUTE)
@@ -235,7 +235,7 @@ uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const {
if (Symb.isCommon())
Result |= SymbolRef::SF_Common;
- if (Symb.isAnyUndefined())
+ if (Symb.isUndefined())
Result |= SymbolRef::SF_Undefined;
return Result;
@@ -910,6 +910,12 @@ Triple::ArchType COFFObjectFile::getArch() const {
}
}
+Expected<uint64_t> COFFObjectFile::getStartAddress() const {
+ if (PE32Header)
+ return PE32Header->AddressOfEntryPoint;
+ return 0;
+}
+
iterator_range<import_directory_iterator>
COFFObjectFile::import_directories() const {
return make_range(import_directory_begin(), import_directory_end());
@@ -944,7 +950,7 @@ COFFObjectFile::getPE32PlusHeader(const pe32plus_header *&Res) const {
std::error_code
COFFObjectFile::getDataDirectory(uint32_t Index,
const data_directory *&Res) const {
- // Error if if there's no data directory or the index is out of range.
+ // Error if there's no data directory or the index is out of range.
if (!DataDirectory) {
Res = nullptr;
return object_error::parse_failed;
@@ -973,6 +979,21 @@ std::error_code COFFObjectFile::getSection(int32_t Index,
return object_error::parse_failed;
}
+std::error_code COFFObjectFile::getSection(StringRef SectionName,
+ const coff_section *&Result) const {
+ Result = nullptr;
+ StringRef SecName;
+ for (const SectionRef &Section : sections()) {
+ if (std::error_code E = Section.getName(SecName))
+ return E;
+ if (SecName == SectionName) {
+ Result = getCOFFSection(Section);
+ return std::error_code();
+ }
+ }
+ return object_error::parse_failed;
+}
+
std::error_code COFFObjectFile::getString(uint32_t Offset,
StringRef &Result) const {
if (StringTableSize <= 4)
@@ -1147,13 +1168,10 @@ COFFObjectFile::getCOFFRelocation(const RelocationRef &Reloc) const {
return toRel(Reloc.getRawDataRefImpl());
}
-iterator_range<const coff_relocation *>
+ArrayRef<coff_relocation>
COFFObjectFile::getRelocations(const coff_section *Sec) const {
- const coff_relocation *I = getFirstReloc(Sec, Data, base());
- const coff_relocation *E = I;
- if (I)
- E += getNumberOfRelocations(Sec, Data, base());
- return make_range(I, E);
+ return {getFirstReloc(Sec, Data, base()),
+ getNumberOfRelocations(Sec, Data, base())};
}
#define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(reloc_type) \
diff --git a/contrib/llvm/lib/Object/ELF.cpp b/contrib/llvm/lib/Object/ELF.cpp
index 5906dc5f5307..2eefb7ef13a3 100644
--- a/contrib/llvm/lib/Object/ELF.cpp
+++ b/contrib/llvm/lib/Object/ELF.cpp
@@ -125,13 +125,6 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
break;
}
break;
- case ELF::EM_WEBASSEMBLY:
- switch (Type) {
-#include "llvm/BinaryFormat/ELFRelocs/WebAssembly.def"
- default:
- break;
- }
- break;
case ELF::EM_AMDGPU:
switch (Type) {
#include "llvm/BinaryFormat/ELFRelocs/AMDGPU.def"
@@ -154,6 +147,50 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
#undef ELF_RELOC
+uint32_t llvm::object::getELFRelrRelocationType(uint32_t Machine) {
+ switch (Machine) {
+ case ELF::EM_X86_64:
+ return ELF::R_X86_64_RELATIVE;
+ case ELF::EM_386:
+ case ELF::EM_IAMCU:
+ return ELF::R_386_RELATIVE;
+ case ELF::EM_MIPS:
+ break;
+ case ELF::EM_AARCH64:
+ return ELF::R_AARCH64_RELATIVE;
+ case ELF::EM_ARM:
+ return ELF::R_ARM_RELATIVE;
+ case ELF::EM_ARC_COMPACT:
+ case ELF::EM_ARC_COMPACT2:
+ return ELF::R_ARC_RELATIVE;
+ case ELF::EM_AVR:
+ break;
+ case ELF::EM_HEXAGON:
+ return ELF::R_HEX_RELATIVE;
+ case ELF::EM_LANAI:
+ break;
+ case ELF::EM_PPC:
+ break;
+ case ELF::EM_PPC64:
+ return ELF::R_PPC64_RELATIVE;
+ case ELF::EM_RISCV:
+ return ELF::R_RISCV_RELATIVE;
+ case ELF::EM_S390:
+ return ELF::R_390_RELATIVE;
+ case ELF::EM_SPARC:
+ case ELF::EM_SPARC32PLUS:
+ case ELF::EM_SPARCV9:
+ return ELF::R_SPARC_RELATIVE;
+ case ELF::EM_AMDGPU:
+ break;
+ case ELF::EM_BPF:
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
switch (Machine) {
case ELF::EM_ARM:
@@ -202,9 +239,14 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
STRINGIFY_ENUM_CASE(ELF, SHT_PREINIT_ARRAY);
STRINGIFY_ENUM_CASE(ELF, SHT_GROUP);
STRINGIFY_ENUM_CASE(ELF, SHT_SYMTAB_SHNDX);
+ STRINGIFY_ENUM_CASE(ELF, SHT_RELR);
STRINGIFY_ENUM_CASE(ELF, SHT_ANDROID_REL);
STRINGIFY_ENUM_CASE(ELF, SHT_ANDROID_RELA);
+ STRINGIFY_ENUM_CASE(ELF, SHT_ANDROID_RELR);
STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_ODRTAB);
+ STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_LINKER_OPTIONS);
+ STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_CALL_GRAPH_PROFILE);
+ STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_ADDRSIG);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_verdef);
@@ -217,6 +259,85 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
template <class ELFT>
Expected<std::vector<typename ELFT::Rela>>
+ELFFile<ELFT>::decode_relrs(Elf_Relr_Range relrs) const {
+ // This function decodes the contents of an SHT_RELR packed relocation
+ // section.
+ //
+ // Proposal for adding SHT_RELR sections to generic-abi is here:
+ // https://groups.google.com/forum/#!topic/generic-abi/bX460iggiKg
+ //
+ // The encoded sequence of Elf64_Relr entries in a SHT_RELR section looks
+ // like [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBBB1 ... ]
+ //
+ // i.e. start with an address, followed by any number of bitmaps. The address
+ // entry encodes 1 relocation. The subsequent bitmap entries encode up to 63
+ // relocations each, at subsequent offsets following the last address entry.
+ //
+ // The bitmap entries must have 1 in the least significant bit. The assumption
+ // here is that an address cannot have 1 in lsb. Odd addresses are not
+ // supported.
+ //
+ // Excluding the least significant bit in the bitmap, each non-zero bit in
+ // the bitmap represents a relocation to be applied to a corresponding machine
+ // word that follows the base address word. The second least significant bit
+ // represents the machine word immediately following the initial address, and
+ // each bit that follows represents the next word, in linear order. As such,
+ // a single bitmap can encode up to 31 relocations in a 32-bit object, and
+ // 63 relocations in a 64-bit object.
+ //
+ // This encoding has a couple of interesting properties:
+ // 1. Looking at any entry, it is clear whether it's an address or a bitmap:
+ // even means address, odd means bitmap.
+ // 2. Just a simple list of addresses is a valid encoding.
+
+ Elf_Rela Rela;
+ Rela.r_info = 0;
+ Rela.r_addend = 0;
+ Rela.setType(getRelrRelocationType(), false);
+ std::vector<Elf_Rela> Relocs;
+
+ // Word type: uint32_t for Elf32, and uint64_t for Elf64.
+ typedef typename ELFT::uint Word;
+
+ // Word size in number of bytes.
+ const size_t WordSize = sizeof(Word);
+
+ // Number of bits used for the relocation offsets bitmap.
+ // This many relative relocations can be encoded in a single entry.
+ const size_t NBits = 8*WordSize - 1;
+
+ Word Base = 0;
+ for (const Elf_Relr &R : relrs) {
+ Word Entry = R;
+ if ((Entry&1) == 0) {
+ // Even entry: encodes the offset for next relocation.
+ Rela.r_offset = Entry;
+ Relocs.push_back(Rela);
+ // Set base offset for subsequent bitmap entries.
+ Base = Entry + WordSize;
+ continue;
+ }
+
+ // Odd entry: encodes bitmap for relocations starting at base.
+ Word Offset = Base;
+ while (Entry != 0) {
+ Entry >>= 1;
+ if ((Entry&1) != 0) {
+ Rela.r_offset = Offset;
+ Relocs.push_back(Rela);
+ }
+ Offset += WordSize;
+ }
+
+ // Advance base offset by NBits words.
+ Base += NBits * WordSize;
+ }
+
+ return Relocs;
+}
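A self-contained 64-bit worked example of the decoding loop above (values chosen purely for illustration): the two entries {0x10000, 0xB} decode to relocations at 0x10000 (the address entry), 0x10008 (bitmap bit 1) and 0x10018 (bitmap bit 3).

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t Relr[] = {0x10000, 0xB}; // 0xB = 0b1011: marker bit plus bits 1 and 3
  std::vector<uint64_t> Offsets;
  uint64_t Base = 0;
  for (uint64_t Entry : Relr) {
    if ((Entry & 1) == 0) {              // even entry: a plain relocation address
      Offsets.push_back(Entry);
      Base = Entry + 8;
      continue;
    }
    uint64_t Offset = Base;              // odd entry: bitmap of the following words
    for (Entry >>= 1; Entry != 0; Entry >>= 1, Offset += 8)
      if (Entry & 1)
        Offsets.push_back(Offset);
    Base += 63 * 8;                      // the next bitmap starts 63 words later
  }
  for (uint64_t O : Offsets)
    std::printf("0x%llx\n", (unsigned long long)O); // prints 0x10000 0x10008 0x10018
}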
+
+template <class ELFT>
+Expected<std::vector<typename ELFT::Rela>>
ELFFile<ELFT>::android_relas(const Elf_Shdr *Sec) const {
// This function reads relocations in Android's packed relocation format,
// which is based on SLEB128 and delta encoding.
@@ -299,6 +420,144 @@ ELFFile<ELFT>::android_relas(const Elf_Shdr *Sec) const {
return Relocs;
}
+template <class ELFT>
+const char *ELFFile<ELFT>::getDynamicTagAsString(unsigned Arch,
+ uint64_t Type) const {
+#define DYNAMIC_STRINGIFY_ENUM(tag, value) \
+ case value: \
+ return #tag;
+
+#define DYNAMIC_TAG(n, v)
+ switch (Arch) {
+ case ELF::EM_HEXAGON:
+ switch (Type) {
+#define HEXAGON_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value)
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef HEXAGON_DYNAMIC_TAG
+ }
+
+ case ELF::EM_MIPS:
+ switch (Type) {
+#define MIPS_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value)
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef MIPS_DYNAMIC_TAG
+ }
+
+ case ELF::EM_PPC64:
+ switch (Type) {
+#define PPC64_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value)
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef PPC64_DYNAMIC_TAG
+ }
+ }
+#undef DYNAMIC_TAG
+ switch (Type) {
+// Now handle all dynamic tags except the architecture specific ones
+#define MIPS_DYNAMIC_TAG(name, value)
+#define HEXAGON_DYNAMIC_TAG(name, value)
+#define PPC64_DYNAMIC_TAG(name, value)
+// Also ignore marker tags such as DT_HIOS (maps to DT_VERNEEDNUM), etc.
+#define DYNAMIC_TAG_MARKER(name, value)
+#define DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value)
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef DYNAMIC_TAG
+#undef MIPS_DYNAMIC_TAG
+#undef HEXAGON_DYNAMIC_TAG
+#undef PPC64_DYNAMIC_TAG
+#undef DYNAMIC_TAG_MARKER
+#undef DYNAMIC_STRINGIFY_ENUM
+ default:
+ return "unknown";
+ }
+}
+
+template <class ELFT>
+const char *ELFFile<ELFT>::getDynamicTagAsString(uint64_t Type) const {
+ return getDynamicTagAsString(getHeader()->e_machine, Type);
+}
+
+template <class ELFT>
+Expected<typename ELFT::DynRange> ELFFile<ELFT>::dynamicEntries() const {
+ ArrayRef<Elf_Dyn> Dyn;
+ size_t DynSecSize = 0;
+
+ auto ProgramHeadersOrError = program_headers();
+ if (!ProgramHeadersOrError)
+ return ProgramHeadersOrError.takeError();
+
+ for (const Elf_Phdr &Phdr : *ProgramHeadersOrError) {
+ if (Phdr.p_type == ELF::PT_DYNAMIC) {
+ Dyn = makeArrayRef(
+ reinterpret_cast<const Elf_Dyn *>(base() + Phdr.p_offset),
+ Phdr.p_filesz / sizeof(Elf_Dyn));
+ DynSecSize = Phdr.p_filesz;
+ break;
+ }
+ }
+
+ // If we can't find the dynamic section in the program headers, we just fall
+ // back on the sections.
+ if (Dyn.empty()) {
+ auto SectionsOrError = sections();
+ if (!SectionsOrError)
+ return SectionsOrError.takeError();
+
+ for (const Elf_Shdr &Sec : *SectionsOrError) {
+ if (Sec.sh_type == ELF::SHT_DYNAMIC) {
+ Expected<ArrayRef<Elf_Dyn>> DynOrError =
+ getSectionContentsAsArray<Elf_Dyn>(&Sec);
+ if (!DynOrError)
+ return DynOrError.takeError();
+ Dyn = *DynOrError;
+ DynSecSize = Sec.sh_size;
+ break;
+ }
+ }
+
+ if (!Dyn.data())
+ return ArrayRef<Elf_Dyn>();
+ }
+
+ if (Dyn.empty())
+ return createError("invalid empty dynamic section");
+
+ if (DynSecSize % sizeof(Elf_Dyn) != 0)
+ return createError("malformed dynamic section");
+
+ if (Dyn.back().d_tag != ELF::DT_NULL)
+ return createError("dynamic sections must be DT_NULL terminated");
+
+ return Dyn;
+}
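A hedged usage sketch of the two new helpers (a 64-bit little-endian instantiation is assumed; error handling is reduced to an early return for brevity): walk the dynamic table and print every tag by name.

#include "llvm/Object/ELF.h"
#include "llvm/Support/raw_ostream.h"

static void dumpDynamicTags(const llvm::object::ELFFile<llvm::object::ELF64LE> &Elf) {
  auto EntriesOrErr = Elf.dynamicEntries();
  if (!EntriesOrErr) {
    llvm::consumeError(EntriesOrErr.takeError()); // real code would report this
    return;
  }
  for (const auto &Dyn : *EntriesOrErr)
    llvm::outs() << Elf.getDynamicTagAsString(Dyn.d_tag) << "\n";
}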
+
+template <class ELFT>
+Expected<const uint8_t *> ELFFile<ELFT>::toMappedAddr(uint64_t VAddr) const {
+ auto ProgramHeadersOrError = program_headers();
+ if (!ProgramHeadersOrError)
+ return ProgramHeadersOrError.takeError();
+
+ llvm::SmallVector<Elf_Phdr *, 4> LoadSegments;
+
+ for (const Elf_Phdr &Phdr : *ProgramHeadersOrError)
+ if (Phdr.p_type == ELF::PT_LOAD)
+ LoadSegments.push_back(const_cast<Elf_Phdr *>(&Phdr));
+
+ const Elf_Phdr *const *I =
+ std::upper_bound(LoadSegments.begin(), LoadSegments.end(), VAddr,
+ [](uint64_t VAddr, const Elf_Phdr_Impl<ELFT> *Phdr) {
+ return VAddr < Phdr->p_vaddr;
+ });
+
+ if (I == LoadSegments.begin())
+ return createError("Virtual address is not in any segment");
+ --I;
+ const Elf_Phdr &Phdr = **I;
+ uint64_t Delta = VAddr - Phdr.p_vaddr;
+ if (Delta >= Phdr.p_filesz)
+ return createError("Virtual address is not in any segment");
+ return base() + Phdr.p_offset + Delta;
+}
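An illustrative standalone version of the segment lookup above, with assumed example values: two PT_LOAD segments map vaddr 0x1000 (filesz 0x400) to offset 0x0 and vaddr 0x2000 (filesz 0x800) to offset 0x400; querying vaddr 0x2010 selects the second segment and yields file offset 0x410.

#include <algorithm>
#include <cstdint>
#include <vector>

struct LoadSeg { uint64_t VAddr, FileSz, Offset; }; // assumed shape, sorted by VAddr

static int64_t virtualToFileOffset(const std::vector<LoadSeg> &Loads, uint64_t VAddr) {
  auto I = std::upper_bound(Loads.begin(), Loads.end(), VAddr,
                            [](uint64_t V, const LoadSeg &S) { return V < S.VAddr; });
  if (I == Loads.begin())
    return -1;                           // address precedes every PT_LOAD segment
  const LoadSeg &S = *std::prev(I);
  uint64_t Delta = VAddr - S.VAddr;
  if (Delta >= S.FileSz)
    return -1;                           // not backed by file contents
  return static_cast<int64_t>(S.Offset + Delta);
}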
+
template class llvm::object::ELFFile<ELF32LE>;
template class llvm::object::ELFFile<ELF32BE>;
template class llvm::object::ELFFile<ELF64LE>;
diff --git a/contrib/llvm/lib/Object/ELFObjectFile.cpp b/contrib/llvm/lib/Object/ELFObjectFile.cpp
index 0aad1c89a2d8..e806c8f28b15 100644
--- a/contrib/llvm/lib/Object/ELFObjectFile.cpp
+++ b/contrib/llvm/lib/Object/ELFObjectFile.cpp
@@ -76,8 +76,7 @@ ObjectFile::createELFObjectFile(MemoryBufferRef Obj) {
SubtargetFeatures ELFObjectFileBase::getMIPSFeatures() const {
SubtargetFeatures Features;
- unsigned PlatformFlags;
- getPlatformFlags(PlatformFlags);
+ unsigned PlatformFlags = getPlatformFlags();
switch (PlatformFlags & ELF::EF_MIPS_ARCH) {
case ELF::EF_MIPS_ARCH_1:
@@ -239,12 +238,25 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
return Features;
}
+SubtargetFeatures ELFObjectFileBase::getRISCVFeatures() const {
+ SubtargetFeatures Features;
+ unsigned PlatformFlags = getPlatformFlags();
+
+ if (PlatformFlags & ELF::EF_RISCV_RVC) {
+ Features.AddFeature("c");
+ }
+
+ return Features;
+}
+
SubtargetFeatures ELFObjectFileBase::getFeatures() const {
switch (getEMachine()) {
case ELF::EM_MIPS:
return getMIPSFeatures();
case ELF::EM_ARM:
return getARMFeatures();
+ case ELF::EM_RISCV:
+ return getRISCVFeatures();
default:
return SubtargetFeatures();
}
diff --git a/contrib/llvm/lib/Object/IRSymtab.cpp b/contrib/llvm/lib/Object/IRSymtab.cpp
index 2d8d3f7c0878..344d565349c0 100644
--- a/contrib/llvm/lib/Object/IRSymtab.cpp
+++ b/contrib/llvm/lib/Object/IRSymtab.cpp
@@ -15,7 +15,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/ObjectUtils.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalAlias.h"
@@ -42,6 +42,12 @@
using namespace llvm;
using namespace irsymtab;
+static const char *LibcallRoutineNames[] = {
+#define HANDLE_LIBCALL(code, name) name,
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+};
+
namespace {
const char *getExpectedProducerName() {
@@ -226,13 +232,19 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
setStr(Sym.IRName, GV->getName());
- if (Used.count(GV))
+ bool IsBuiltinFunc = false;
+
+ for (const char *LibcallName : LibcallRoutineNames)
+ if (GV->getName() == LibcallName)
+ IsBuiltinFunc = true;
+
+ if (Used.count(GV) || IsBuiltinFunc)
Sym.Flags |= 1 << storage::Symbol::FB_used;
if (GV->isThreadLocal())
Sym.Flags |= 1 << storage::Symbol::FB_tls;
if (GV->hasGlobalUnnamedAddr())
Sym.Flags |= 1 << storage::Symbol::FB_unnamed_addr;
- if (canBeOmittedFromSymbolTable(GV))
+ if (GV->canBeOmittedFromSymbolTable())
Sym.Flags |= 1 << storage::Symbol::FB_may_omit;
Sym.Flags |= unsigned(GV->getVisibility()) << storage::Symbol::FB_visibility;
diff --git a/contrib/llvm/lib/Object/MachOObjectFile.cpp b/contrib/llvm/lib/Object/MachOObjectFile.cpp
index 2e3415618e5f..e422903f2805 100644
--- a/contrib/llvm/lib/Object/MachOObjectFile.cpp
+++ b/contrib/llvm/lib/Object/MachOObjectFile.cpp
@@ -107,7 +107,8 @@ getSectionPtr(const MachOObjectFile &O, MachOObjectFile::LoadCommandInfo L,
}
static const char *getPtr(const MachOObjectFile &O, size_t Offset) {
- return O.getData().substr(Offset, 1).data();
+ assert(Offset <= O.getData().size());
+ return O.getData().data() + Offset;
}
static MachO::nlist_base
@@ -1011,7 +1012,43 @@ static Error checkThreadCommand(const MachOObjectFile &Obj,
CmdName + " command");
}
} else if (cputype == MachO::CPU_TYPE_X86_64) {
- if (flavor == MachO::x86_THREAD_STATE64) {
+ if (flavor == MachO::x86_THREAD_STATE) {
+ if (count != MachO::x86_THREAD_STATE_COUNT)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " count not x86_THREAD_STATE_COUNT for "
+ "flavor number " + Twine(nflavor) + " which is "
+ "a x86_THREAD_STATE flavor in " + CmdName +
+ " command");
+ if (state + sizeof(MachO::x86_thread_state_t) > end)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " x86_THREAD_STATE extends past end of "
+ "command in " + CmdName + " command");
+ state += sizeof(MachO::x86_thread_state_t);
+ } else if (flavor == MachO::x86_FLOAT_STATE) {
+ if (count != MachO::x86_FLOAT_STATE_COUNT)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " count not x86_FLOAT_STATE_COUNT for "
+ "flavor number " + Twine(nflavor) + " which is "
+ "a x86_FLOAT_STATE flavor in " + CmdName +
+ " command");
+ if (state + sizeof(MachO::x86_float_state_t) > end)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " x86_FLOAT_STATE extends past end of "
+ "command in " + CmdName + " command");
+ state += sizeof(MachO::x86_float_state_t);
+ } else if (flavor == MachO::x86_EXCEPTION_STATE) {
+ if (count != MachO::x86_EXCEPTION_STATE_COUNT)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " count not x86_EXCEPTION_STATE_COUNT for "
+ "flavor number " + Twine(nflavor) + " which is "
+ "a x86_EXCEPTION_STATE flavor in " + CmdName +
+ " command");
+ if (state + sizeof(MachO::x86_exception_state_t) > end)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " x86_EXCEPTION_STATE extends past end of "
+ "command in " + CmdName + " command");
+ state += sizeof(MachO::x86_exception_state_t);
+ } else if (flavor == MachO::x86_THREAD_STATE64) {
if (count != MachO::x86_THREAD_STATE64_COUNT)
return malformedError("load command " + Twine(LoadCommandIndex) +
" count not x86_THREAD_STATE64_COUNT for "
@@ -1023,6 +1060,18 @@ static Error checkThreadCommand(const MachOObjectFile &Obj,
" x86_THREAD_STATE64 extends past end of "
"command in " + CmdName + " command");
state += sizeof(MachO::x86_thread_state64_t);
+ } else if (flavor == MachO::x86_EXCEPTION_STATE64) {
+ if (count != MachO::x86_EXCEPTION_STATE64_COUNT)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " count not x86_EXCEPTION_STATE64_COUNT for "
+ "flavor number " + Twine(nflavor) + " which is "
+ "a x86_EXCEPTION_STATE64 flavor in " + CmdName +
+ " command");
+ if (state + sizeof(MachO::x86_exception_state64_t) > end)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " x86_EXCEPTION_STATE64 extends past end of "
+ "command in " + CmdName + " command");
+ state += sizeof(MachO::x86_exception_state64_t);
} else {
return malformedError("load command " + Twine(LoadCommandIndex) +
" unknown flavor (" + Twine(flavor) + ") for "
@@ -1659,6 +1708,10 @@ void MachOObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
Expected<StringRef> MachOObjectFile::getSymbolName(DataRefImpl Symb) const {
StringRef StringTable = getStringTableData();
MachO::nlist_base Entry = getSymbolTableEntryBase(*this, Symb);
+ if (Entry.n_strx == 0)
+ // An n_strx value of 0 indicates that no name is associated with a
+ // particular symbol table entry.
+ return StringRef();
const char *Start = &StringTable.data()[Entry.n_strx];
if (Start < getData().begin() || Start >= getData().end()) {
return malformedError("bad string index: " + Twine(Entry.n_strx) +
@@ -1886,6 +1939,27 @@ uint64_t MachOObjectFile::getSectionAlignment(DataRefImpl Sec) const {
return uint64_t(1) << Align;
}
+Expected<SectionRef> MachOObjectFile::getSection(unsigned SectionIndex) const {
+ if (SectionIndex < 1 || SectionIndex > Sections.size())
+ return malformedError("bad section index: " + Twine((int)SectionIndex));
+
+ DataRefImpl DRI;
+ DRI.d.a = SectionIndex - 1;
+ return SectionRef(DRI, this);
+}
+
+Expected<SectionRef> MachOObjectFile::getSection(StringRef SectionName) const {
+ StringRef SecName;
+ for (const SectionRef &Section : sections()) {
+ if (std::error_code E = Section.getName(SecName))
+ return errorCodeToError(E);
+ if (SecName == SectionName) {
+ return Section;
+ }
+ }
+ return errorCodeToError(object_error::parse_failed);
+}
+
bool MachOObjectFile::isSectionCompressed(DataRefImpl Sec) const {
return false;
}
@@ -1916,8 +1990,10 @@ unsigned MachOObjectFile::getSectionID(SectionRef Sec) const {
}
bool MachOObjectFile::isSectionVirtual(DataRefImpl Sec) const {
- // FIXME: Unimplemented.
- return false;
+ uint32_t Flags = getSectionFlags(*this, Sec);
+ unsigned SectionType = Flags & MachO::SECTION_TYPE;
+ return SectionType == MachO::S_ZEROFILL ||
+ SectionType == MachO::S_GB_ZEROFILL;
}
bool MachOObjectFile::isSectionBitcode(DataRefImpl Sec) const {
diff --git a/contrib/llvm/lib/Object/ModuleSymbolTable.cpp b/contrib/llvm/lib/Object/ModuleSymbolTable.cpp
index f0d70aefd426..b353ef3c835b 100644
--- a/contrib/llvm/lib/Object/ModuleSymbolTable.cpp
+++ b/contrib/llvm/lib/Object/ModuleSymbolTable.cpp
@@ -68,9 +68,9 @@ void ModuleSymbolTable::addModule(Module *M) {
});
}
-void ModuleSymbolTable::CollectAsmSymbols(
- const Module &M,
- function_ref<void(StringRef, BasicSymbolRef::Flags)> AsmSymbol) {
+static void
+initializeRecordStreamer(const Module &M,
+ function_ref<void(RecordStreamer &)> Init) {
StringRef InlineAsm = M.getModuleInlineAsm();
if (InlineAsm.empty())
return;
@@ -119,36 +119,53 @@ void ModuleSymbolTable::CollectAsmSymbols(
if (Parser->Run(false))
return;
- Streamer.flushSymverDirectives();
-
- for (auto &KV : Streamer) {
- StringRef Key = KV.first();
- RecordStreamer::State Value = KV.second;
- // FIXME: For now we just assume that all asm symbols are executable.
- uint32_t Res = BasicSymbolRef::SF_Executable;
- switch (Value) {
- case RecordStreamer::NeverSeen:
- llvm_unreachable("NeverSeen should have been replaced earlier");
- case RecordStreamer::DefinedGlobal:
- Res |= BasicSymbolRef::SF_Global;
- break;
- case RecordStreamer::Defined:
- break;
- case RecordStreamer::Global:
- case RecordStreamer::Used:
- Res |= BasicSymbolRef::SF_Undefined;
- Res |= BasicSymbolRef::SF_Global;
- break;
- case RecordStreamer::DefinedWeak:
- Res |= BasicSymbolRef::SF_Weak;
- Res |= BasicSymbolRef::SF_Global;
- break;
- case RecordStreamer::UndefinedWeak:
- Res |= BasicSymbolRef::SF_Weak;
- Res |= BasicSymbolRef::SF_Undefined;
+ Init(Streamer);
+}
+
+void ModuleSymbolTable::CollectAsmSymbols(
+ const Module &M,
+ function_ref<void(StringRef, BasicSymbolRef::Flags)> AsmSymbol) {
+ initializeRecordStreamer(M, [&](RecordStreamer &Streamer) {
+ Streamer.flushSymverDirectives();
+
+ for (auto &KV : Streamer) {
+ StringRef Key = KV.first();
+ RecordStreamer::State Value = KV.second;
+ // FIXME: For now we just assume that all asm symbols are executable.
+ uint32_t Res = BasicSymbolRef::SF_Executable;
+ switch (Value) {
+ case RecordStreamer::NeverSeen:
+ llvm_unreachable("NeverSeen should have been replaced earlier");
+ case RecordStreamer::DefinedGlobal:
+ Res |= BasicSymbolRef::SF_Global;
+ break;
+ case RecordStreamer::Defined:
+ break;
+ case RecordStreamer::Global:
+ case RecordStreamer::Used:
+ Res |= BasicSymbolRef::SF_Undefined;
+ Res |= BasicSymbolRef::SF_Global;
+ break;
+ case RecordStreamer::DefinedWeak:
+ Res |= BasicSymbolRef::SF_Weak;
+ Res |= BasicSymbolRef::SF_Global;
+ break;
+ case RecordStreamer::UndefinedWeak:
+ Res |= BasicSymbolRef::SF_Weak;
+ Res |= BasicSymbolRef::SF_Undefined;
+ }
+ AsmSymbol(Key, BasicSymbolRef::Flags(Res));
}
- AsmSymbol(Key, BasicSymbolRef::Flags(Res));
- }
+ });
+}
+
+void ModuleSymbolTable::CollectAsmSymvers(
+ const Module &M, function_ref<void(StringRef, StringRef)> AsmSymver) {
+ initializeRecordStreamer(M, [&](RecordStreamer &Streamer) {
+ for (auto &KV : Streamer.symverAliases())
+ for (auto &Alias : KV.second)
+ AsmSymver(KV.first->getName(), Alias);
+ });
}
void ModuleSymbolTable::printSymbolName(raw_ostream &OS, Symbol S) const {
diff --git a/contrib/llvm/lib/Object/Object.cpp b/contrib/llvm/lib/Object/Object.cpp
index 1d2859cfbe9d..5fd823e0117e 100644
--- a/contrib/llvm/lib/Object/Object.cpp
+++ b/contrib/llvm/lib/Object/Object.cpp
@@ -228,7 +228,7 @@ uint64_t LLVMGetRelocationType(LLVMRelocationIteratorRef RI) {
const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) {
SmallVector<char, 0> ret;
(*unwrap(RI))->getTypeName(ret);
- char *str = static_cast<char*>(malloc(ret.size()));
+ char *str = static_cast<char*>(safe_malloc(ret.size()));
std::copy(ret.begin(), ret.end(), str);
return str;
}
diff --git a/contrib/llvm/lib/Object/ObjectFile.cpp b/contrib/llvm/lib/Object/ObjectFile.cpp
index 652a2b2497ef..db0ff220c4d8 100644
--- a/contrib/llvm/lib/Object/ObjectFile.cpp
+++ b/contrib/llvm/lib/Object/ObjectFile.cpp
@@ -119,6 +119,7 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type) {
case file_magic::archive:
case file_magic::macho_universal_binary:
case file_magic::windows_resource:
+ case file_magic::pdb:
return errorCodeToError(object_error::invalid_file_type);
case file_magic::elf:
case file_magic::elf_relocatable:
diff --git a/contrib/llvm/lib/Object/RecordStreamer.cpp b/contrib/llvm/lib/Object/RecordStreamer.cpp
index 74130901d325..1f57867dd21a 100644
--- a/contrib/llvm/lib/Object/RecordStreamer.cpp
+++ b/contrib/llvm/lib/Object/RecordStreamer.cpp
@@ -107,7 +107,8 @@ bool RecordStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
}
void RecordStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment,
+ SMLoc Loc) {
markDefined(*Symbol);
}
@@ -128,6 +129,11 @@ void RecordStreamer::emitELFSymverDirective(StringRef AliasName,
SymverAliasMap[Aliasee].push_back(AliasName);
}
+iterator_range<RecordStreamer::const_symver_iterator>
+RecordStreamer::symverAliases() {
+ return {SymverAliasMap.begin(), SymverAliasMap.end()};
+}
+
void RecordStreamer::flushSymverDirectives() {
// Mapping from mangled name to GV.
StringMap<const GlobalValue *> MangledNameMap;
@@ -216,7 +222,10 @@ void RecordStreamer::flushSymverDirectives() {
// TODO: Handle "@@@". Depending on SymbolAttribute value it needs to be
// converted into @ or @@.
const MCExpr *Value = MCSymbolRefExpr::create(Aliasee, getContext());
- EmitAssignment(Alias, Value);
+ if (IsDefined)
+ markDefined(*Alias);
+ // Don't use EmitAssignment override as it always marks alias as defined.
+ MCStreamer::EmitAssignment(Alias, Value);
if (Attr != MCSA_Invalid)
EmitSymbolAttribute(Alias, Attr);
}
diff --git a/contrib/llvm/lib/Object/RecordStreamer.h b/contrib/llvm/lib/Object/RecordStreamer.h
index 60b2d3ec3e8e..3d5ae59b58fe 100644
--- a/contrib/llvm/lib/Object/RecordStreamer.h
+++ b/contrib/llvm/lib/Object/RecordStreamer.h
@@ -47,25 +47,31 @@ private:
public:
RecordStreamer(MCContext &Context, const Module &M);
- using const_iterator = StringMap<State>::const_iterator;
-
- const_iterator begin();
- const_iterator end();
void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
bool) override;
void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) override;
+ unsigned ByteAlignment, SMLoc Loc = SMLoc()) override;
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
/// Record .symver aliases for later processing.
void emitELFSymverDirective(StringRef AliasName,
const MCSymbol *Aliasee) override;
+
// Emit ELF .symver aliases and ensure they have the same binding as the
// defined symbol they alias with.
void flushSymverDirectives();
+
+ // Symbols iterators
+ using const_iterator = StringMap<State>::const_iterator;
+ const_iterator begin();
+ const_iterator end();
+
+ // SymverAliasMap iterators
+ using const_symver_iterator = decltype(SymverAliasMap)::const_iterator;
+ iterator_range<const_symver_iterator> symverAliases();
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Object/SymbolSize.cpp b/contrib/llvm/lib/Object/SymbolSize.cpp
index dd49d5f116b3..004fb1b07546 100644
--- a/contrib/llvm/lib/Object/SymbolSize.cpp
+++ b/contrib/llvm/lib/Object/SymbolSize.cpp
@@ -66,6 +66,10 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) {
Addresses.push_back(
{O.symbol_end(), Address + Size, 0, getSectionID(O, Sec)});
}
+
+ if (Addresses.empty())
+ return Ret;
+
array_pod_sort(Addresses.begin(), Addresses.end(), compareAddress);
// Compute the size as the gap to the next symbol
diff --git a/contrib/llvm/lib/Object/SymbolicFile.cpp b/contrib/llvm/lib/Object/SymbolicFile.cpp
index 2e7f2cc0d1d9..3e998a2682b8 100644
--- a/contrib/llvm/lib/Object/SymbolicFile.cpp
+++ b/contrib/llvm/lib/Object/SymbolicFile.cpp
@@ -52,6 +52,7 @@ SymbolicFile::createSymbolicFile(MemoryBufferRef Object, file_magic Type,
case file_magic::coff_cl_gl_object:
case file_magic::macho_universal_binary:
case file_magic::windows_resource:
+ case file_magic::pdb:
return errorCodeToError(object_error::invalid_file_type);
case file_magic::elf:
case file_magic::elf_executable:
diff --git a/contrib/llvm/lib/Object/WasmObjectFile.cpp b/contrib/llvm/lib/Object/WasmObjectFile.cpp
index 48f98df6f34d..4d4c887b2d97 100644
--- a/contrib/llvm/lib/Object/WasmObjectFile.cpp
+++ b/contrib/llvm/lib/Object/WasmObjectFile.cpp
@@ -8,8 +8,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/SubtargetFeature.h"
@@ -33,6 +35,23 @@
using namespace llvm;
using namespace object;
+void WasmSymbol::print(raw_ostream &Out) const {
+ Out << "Name=" << Info.Name
+ << ", Kind=" << toString(wasm::WasmSymbolType(Info.Kind))
+ << ", Flags=" << Info.Flags;
+ if (!isTypeData()) {
+ Out << ", ElemIndex=" << Info.ElementIndex;
+ } else if (isDefined()) {
+ Out << ", Segment=" << Info.DataRef.Segment;
+ Out << ", Offset=" << Info.DataRef.Offset;
+ Out << ", Size=" << Info.DataRef.Size;
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WasmSymbol::dump() const { print(dbgs()); }
+#endif
+
Expected<std::unique_ptr<WasmObjectFile>>
ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
Error Err = Error::success();
@@ -48,112 +67,119 @@ ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
#define VARUINT7_MAX (1<<7)
#define VARUINT1_MAX (1)
-static uint8_t readUint8(const uint8_t *&Ptr) { return *Ptr++; }
+static uint8_t readUint8(WasmObjectFile::ReadContext &Ctx) {
+ if (Ctx.Ptr == Ctx.End)
+ report_fatal_error("EOF while reading uint8");
+ return *Ctx.Ptr++;
+}
-static uint32_t readUint32(const uint8_t *&Ptr) {
- uint32_t Result = support::endian::read32le(Ptr);
- Ptr += sizeof(Result);
+static uint32_t readUint32(WasmObjectFile::ReadContext &Ctx) {
+ if (Ctx.Ptr + 4 > Ctx.End)
+ report_fatal_error("EOF while reading uint32");
+ uint32_t Result = support::endian::read32le(Ctx.Ptr);
+ Ctx.Ptr += 4;
return Result;
}
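The conversions in this hunk all follow one bounds-check pattern, shown here standalone with a minimal assumed ReadContext: every read first verifies that the requested bytes end before Ctx.End, so a truncated object produces a fatal error instead of a read past the buffer.

#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>

struct ReadContext { const uint8_t *Ptr; const uint8_t *End; }; // assumed shape

static uint32_t readUint32Checked(ReadContext &Ctx) {
  if (Ctx.Ptr + 4 > Ctx.End)
    llvm::report_fatal_error("EOF while reading uint32");
  uint32_t Result = llvm::support::endian::read32le(Ctx.Ptr);
  Ctx.Ptr += 4;
  return Result;
}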
-static int32_t readFloat32(const uint8_t *&Ptr) {
+static int32_t readFloat32(WasmObjectFile::ReadContext &Ctx) {
int32_t Result = 0;
- memcpy(&Result, Ptr, sizeof(Result));
- Ptr += sizeof(Result);
+ memcpy(&Result, Ctx.Ptr, sizeof(Result));
+ Ctx.Ptr += sizeof(Result);
return Result;
}
-static int64_t readFloat64(const uint8_t *&Ptr) {
+static int64_t readFloat64(WasmObjectFile::ReadContext &Ctx) {
int64_t Result = 0;
- memcpy(&Result, Ptr, sizeof(Result));
- Ptr += sizeof(Result);
+ memcpy(&Result, Ctx.Ptr, sizeof(Result));
+ Ctx.Ptr += sizeof(Result);
return Result;
}
-static uint64_t readULEB128(const uint8_t *&Ptr) {
+static uint64_t readULEB128(WasmObjectFile::ReadContext &Ctx) {
unsigned Count;
- uint64_t Result = decodeULEB128(Ptr, &Count);
- Ptr += Count;
+ const char* Error = nullptr;
+ uint64_t Result = decodeULEB128(Ctx.Ptr, &Count, Ctx.End, &Error);
+ if (Error)
+ report_fatal_error(Error);
+ Ctx.Ptr += Count;
return Result;
}
-static StringRef readString(const uint8_t *&Ptr) {
- uint32_t StringLen = readULEB128(Ptr);
- StringRef Return = StringRef(reinterpret_cast<const char *>(Ptr), StringLen);
- Ptr += StringLen;
+static StringRef readString(WasmObjectFile::ReadContext &Ctx) {
+ uint32_t StringLen = readULEB128(Ctx);
+ if (Ctx.Ptr + StringLen > Ctx.End)
+ report_fatal_error("EOF while reading string");
+ StringRef Return =
+ StringRef(reinterpret_cast<const char *>(Ctx.Ptr), StringLen);
+ Ctx.Ptr += StringLen;
return Return;
}
-static int64_t readLEB128(const uint8_t *&Ptr) {
+static int64_t readLEB128(WasmObjectFile::ReadContext &Ctx) {
unsigned Count;
- uint64_t Result = decodeSLEB128(Ptr, &Count);
- Ptr += Count;
+ const char* Error = nullptr;
+ uint64_t Result = decodeSLEB128(Ctx.Ptr, &Count, Ctx.End, &Error);
+ if (Error)
+ report_fatal_error(Error);
+ Ctx.Ptr += Count;
return Result;
}
-static uint8_t readVaruint1(const uint8_t *&Ptr) {
- int64_t result = readLEB128(Ptr);
- assert(result <= VARUINT1_MAX && result >= 0);
- return result;
-}
-
-static int8_t readVarint7(const uint8_t *&Ptr) {
- int64_t result = readLEB128(Ptr);
- assert(result <= VARINT7_MAX && result >= VARINT7_MIN);
- return result;
-}
-
-static uint8_t readVaruint7(const uint8_t *&Ptr) {
- uint64_t result = readULEB128(Ptr);
- assert(result <= VARUINT7_MAX);
+static uint8_t readVaruint1(WasmObjectFile::ReadContext &Ctx) {
+ int64_t result = readLEB128(Ctx);
+ if (result > VARUINT1_MAX || result < 0)
+ report_fatal_error("LEB is outside Varuint1 range");
return result;
}
-static int32_t readVarint32(const uint8_t *&Ptr) {
- int64_t result = readLEB128(Ptr);
- assert(result <= INT32_MAX && result >= INT32_MIN);
+static int32_t readVarint32(WasmObjectFile::ReadContext &Ctx) {
+ int64_t result = readLEB128(Ctx);
+ if (result > INT32_MAX || result < INT32_MIN)
+ report_fatal_error("LEB is outside Varint32 range");
return result;
}
-static uint32_t readVaruint32(const uint8_t *&Ptr) {
- uint64_t result = readULEB128(Ptr);
- assert(result <= UINT32_MAX);
+static uint32_t readVaruint32(WasmObjectFile::ReadContext &Ctx) {
+ uint64_t result = readULEB128(Ctx);
+ if (result > UINT32_MAX)
+ report_fatal_error("LEB is outside Varuint32 range");
return result;
}
-static int64_t readVarint64(const uint8_t *&Ptr) {
- return readLEB128(Ptr);
+static int64_t readVarint64(WasmObjectFile::ReadContext &Ctx) {
+ return readLEB128(Ctx);
}
-static uint8_t readOpcode(const uint8_t *&Ptr) {
- return readUint8(Ptr);
+static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) {
+ return readUint8(Ctx);
}
-static Error readInitExpr(wasm::WasmInitExpr &Expr, const uint8_t *&Ptr) {
- Expr.Opcode = readOpcode(Ptr);
+static Error readInitExpr(wasm::WasmInitExpr &Expr,
+ WasmObjectFile::ReadContext &Ctx) {
+ Expr.Opcode = readOpcode(Ctx);
switch (Expr.Opcode) {
case wasm::WASM_OPCODE_I32_CONST:
- Expr.Value.Int32 = readVarint32(Ptr);
+ Expr.Value.Int32 = readVarint32(Ctx);
break;
case wasm::WASM_OPCODE_I64_CONST:
- Expr.Value.Int64 = readVarint64(Ptr);
+ Expr.Value.Int64 = readVarint64(Ctx);
break;
case wasm::WASM_OPCODE_F32_CONST:
- Expr.Value.Float32 = readFloat32(Ptr);
+ Expr.Value.Float32 = readFloat32(Ctx);
break;
case wasm::WASM_OPCODE_F64_CONST:
- Expr.Value.Float64 = readFloat64(Ptr);
+ Expr.Value.Float64 = readFloat64(Ctx);
break;
case wasm::WASM_OPCODE_GET_GLOBAL:
- Expr.Value.Global = readULEB128(Ptr);
+ Expr.Value.Global = readULEB128(Ctx);
break;
default:
return make_error<GenericBinaryError>("Invalid opcode in init_expr",
object_error::parse_failed);
}
- uint8_t EndOpcode = readOpcode(Ptr);
+ uint8_t EndOpcode = readOpcode(Ctx);
if (EndOpcode != wasm::WASM_OPCODE_END) {
return make_error<GenericBinaryError>("Invalid init_expr",
object_error::parse_failed);
@@ -161,42 +187,46 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr, const uint8_t *&Ptr) {
return Error::success();
}
-static wasm::WasmLimits readLimits(const uint8_t *&Ptr) {
+static wasm::WasmLimits readLimits(WasmObjectFile::ReadContext &Ctx) {
wasm::WasmLimits Result;
- Result.Flags = readVaruint1(Ptr);
- Result.Initial = readVaruint32(Ptr);
+ Result.Flags = readVaruint1(Ctx);
+ Result.Initial = readVaruint32(Ctx);
if (Result.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
- Result.Maximum = readVaruint32(Ptr);
+ Result.Maximum = readVaruint32(Ctx);
return Result;
}
-static wasm::WasmTable readTable(const uint8_t *&Ptr) {
+static wasm::WasmTable readTable(WasmObjectFile::ReadContext &Ctx) {
wasm::WasmTable Table;
- Table.ElemType = readVarint7(Ptr);
- Table.Limits = readLimits(Ptr);
+ Table.ElemType = readUint8(Ctx);
+ Table.Limits = readLimits(Ctx);
return Table;
}
-static Error readSection(WasmSection &Section, const uint8_t *&Ptr,
- const uint8_t *Start, const uint8_t *Eof) {
- Section.Offset = Ptr - Start;
- Section.Type = readVaruint7(Ptr);
- uint32_t Size = readVaruint32(Ptr);
+static Error readSection(WasmSection &Section,
+ WasmObjectFile::ReadContext &Ctx) {
+ Section.Offset = Ctx.Ptr - Ctx.Start;
+ Section.Type = readUint8(Ctx);
+ LLVM_DEBUG(dbgs() << "readSection type=" << Section.Type << "\n");
+ uint32_t Size = readVaruint32(Ctx);
if (Size == 0)
return make_error<StringError>("Zero length section",
object_error::parse_failed);
- if (Ptr + Size > Eof)
+ if (Ctx.Ptr + Size > Ctx.End)
return make_error<StringError>("Section too large",
object_error::parse_failed);
- Section.Content = ArrayRef<uint8_t>(Ptr, Size);
- Ptr += Size;
+ if (Section.Type == wasm::WASM_SEC_CUSTOM) {
+ const uint8_t *NameStart = Ctx.Ptr;
+ Section.Name = readString(Ctx);
+ Size -= Ctx.Ptr - NameStart;
+ }
+ Section.Content = ArrayRef<uint8_t>(Ctx.Ptr, Size);
+ Ctx.Ptr += Size;
return Error::success();
}
WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
: ObjectFile(Binary::ID_Wasm, Buffer) {
- LinkingData.DataSize = 0;
-
ErrorAsOutParameter ErrAsOutParam(&Err);
Header.Magic = getData().substr(0, 4);
if (Header.Magic != StringRef("\0asm", 4)) {
@@ -205,16 +235,18 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
return;
}
- const uint8_t *Eof = getPtr(getData().size());
- const uint8_t *Ptr = getPtr(4);
+ ReadContext Ctx;
+ Ctx.Start = getPtr(0);
+ Ctx.Ptr = Ctx.Start + 4;
+ Ctx.End = Ctx.Start + getData().size();
- if (Ptr + 4 > Eof) {
+ if (Ctx.Ptr + 4 > Ctx.End) {
Err = make_error<StringError>("Missing version number",
object_error::parse_failed);
return;
}
- Header.Version = readUint32(Ptr);
+ Header.Version = readUint32(Ctx);
if (Header.Version != wasm::WasmVersion) {
Err = make_error<StringError>("Bad version number",
object_error::parse_failed);
@@ -222,8 +254,8 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
}
WasmSection Sec;
- while (Ptr < Eof) {
- if ((Err = readSection(Sec, Ptr, getPtr(0), Eof)))
+ while (Ctx.Ptr < Ctx.End) {
+ if ((Err = readSection(Sec, Ctx)))
return;
if ((Err = parseSection(Sec)))
return;
@@ -233,344 +265,476 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
}
Error WasmObjectFile::parseSection(WasmSection &Sec) {
- const uint8_t* Start = Sec.Content.data();
- const uint8_t* End = Start + Sec.Content.size();
+ ReadContext Ctx;
+ Ctx.Start = Sec.Content.data();
+ Ctx.End = Ctx.Start + Sec.Content.size();
+ Ctx.Ptr = Ctx.Start;
switch (Sec.Type) {
case wasm::WASM_SEC_CUSTOM:
- return parseCustomSection(Sec, Start, End);
+ return parseCustomSection(Sec, Ctx);
case wasm::WASM_SEC_TYPE:
- return parseTypeSection(Start, End);
+ return parseTypeSection(Ctx);
case wasm::WASM_SEC_IMPORT:
- return parseImportSection(Start, End);
+ return parseImportSection(Ctx);
case wasm::WASM_SEC_FUNCTION:
- return parseFunctionSection(Start, End);
+ return parseFunctionSection(Ctx);
case wasm::WASM_SEC_TABLE:
- return parseTableSection(Start, End);
+ return parseTableSection(Ctx);
case wasm::WASM_SEC_MEMORY:
- return parseMemorySection(Start, End);
+ return parseMemorySection(Ctx);
case wasm::WASM_SEC_GLOBAL:
- return parseGlobalSection(Start, End);
+ return parseGlobalSection(Ctx);
case wasm::WASM_SEC_EXPORT:
- return parseExportSection(Start, End);
+ return parseExportSection(Ctx);
case wasm::WASM_SEC_START:
- return parseStartSection(Start, End);
+ return parseStartSection(Ctx);
case wasm::WASM_SEC_ELEM:
- return parseElemSection(Start, End);
+ return parseElemSection(Ctx);
case wasm::WASM_SEC_CODE:
- return parseCodeSection(Start, End);
+ return parseCodeSection(Ctx);
case wasm::WASM_SEC_DATA:
- return parseDataSection(Start, End);
+ return parseDataSection(Ctx);
default:
return make_error<GenericBinaryError>("Bad section type",
object_error::parse_failed);
}
}
-Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) {
- while (Ptr < End) {
- uint8_t Type = readVarint7(Ptr);
- uint32_t Size = readVaruint32(Ptr);
- const uint8_t *SubSectionEnd = Ptr + Size;
+Error WasmObjectFile::parseNameSection(ReadContext &Ctx) {
+ llvm::DenseSet<uint64_t> Seen;
+ if (Functions.size() != FunctionTypes.size()) {
+ return make_error<GenericBinaryError>("Names must come after code section",
+ object_error::parse_failed);
+ }
+
+ while (Ctx.Ptr < Ctx.End) {
+ uint8_t Type = readUint8(Ctx);
+ uint32_t Size = readVaruint32(Ctx);
+ const uint8_t *SubSectionEnd = Ctx.Ptr + Size;
switch (Type) {
case wasm::WASM_NAMES_FUNCTION: {
- uint32_t Count = readVaruint32(Ptr);
+ uint32_t Count = readVaruint32(Ctx);
while (Count--) {
- uint32_t Index = readVaruint32(Ptr);
- StringRef Name = readString(Ptr);
- if (!Name.empty())
- Symbols.emplace_back(Name,
- WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME,
- Sections.size(), Index);
+ uint32_t Index = readVaruint32(Ctx);
+ if (!Seen.insert(Index).second)
+ return make_error<GenericBinaryError>("Function named more than once",
+ object_error::parse_failed);
+ StringRef Name = readString(Ctx);
+ if (!isValidFunctionIndex(Index) || Name.empty())
+ return make_error<GenericBinaryError>("Invalid name entry",
+ object_error::parse_failed);
+ DebugNames.push_back(wasm::WasmFunctionName{Index, Name});
+ if (isDefinedFunctionIndex(Index))
+ getDefinedFunction(Index).DebugName = Name;
}
break;
}
// Ignore local names for now
case wasm::WASM_NAMES_LOCAL:
default:
- Ptr += Size;
+ Ctx.Ptr += Size;
break;
}
- if (Ptr != SubSectionEnd)
+ if (Ctx.Ptr != SubSectionEnd)
return make_error<GenericBinaryError>("Name sub-section ended prematurely",
object_error::parse_failed);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Name section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-void WasmObjectFile::populateSymbolTable() {
- // Add imports to symbol table
- size_t GlobalIndex = 0;
- size_t FunctionIndex = 0;
- for (const wasm::WasmImport& Import : Imports) {
- switch (Import.Kind) {
- case wasm::WASM_EXTERNAL_GLOBAL:
- assert(Import.Global.Type == wasm::WASM_TYPE_I32);
- SymbolMap.try_emplace(Import.Field, Symbols.size());
- Symbols.emplace_back(Import.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT,
- ImportSection, GlobalIndex++);
- DEBUG(dbgs() << "Adding import: " << Symbols.back()
- << " sym index:" << Symbols.size() << "\n");
- break;
- case wasm::WASM_EXTERNAL_FUNCTION:
- SymbolMap.try_emplace(Import.Field, Symbols.size());
- Symbols.emplace_back(Import.Field,
- WasmSymbol::SymbolType::FUNCTION_IMPORT,
- ImportSection, FunctionIndex++, Import.SigIndex);
- DEBUG(dbgs() << "Adding import: " << Symbols.back()
- << " sym index:" << Symbols.size() << "\n");
- break;
- default:
- break;
- }
+Error WasmObjectFile::parseLinkingSection(ReadContext &Ctx) {
+ HasLinkingSection = true;
+ if (Functions.size() != FunctionTypes.size()) {
+ return make_error<GenericBinaryError>(
+ "Linking data must come after code section", object_error::parse_failed);
}
- // Add exports to symbol table
- for (const wasm::WasmExport& Export : Exports) {
- if (Export.Kind == wasm::WASM_EXTERNAL_FUNCTION ||
- Export.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
- WasmSymbol::SymbolType ExportType =
- Export.Kind == wasm::WASM_EXTERNAL_FUNCTION
- ? WasmSymbol::SymbolType::FUNCTION_EXPORT
- : WasmSymbol::SymbolType::GLOBAL_EXPORT;
- auto Pair = SymbolMap.try_emplace(Export.Name, Symbols.size());
- if (Pair.second) {
- Symbols.emplace_back(Export.Name, ExportType,
- ExportSection, Export.Index);
- DEBUG(dbgs() << "Adding export: " << Symbols.back()
- << " sym index:" << Symbols.size() << "\n");
- } else {
- uint32_t SymIndex = Pair.first->second;
- const WasmSymbol &OldSym = Symbols[SymIndex];
- WasmSymbol NewSym(Export.Name, ExportType, ExportSection, Export.Index);
- NewSym.setAltIndex(OldSym.ElementIndex);
- Symbols[SymIndex] = NewSym;
-
- DEBUG(dbgs() << "Replacing existing symbol: " << NewSym
- << " sym index:" << SymIndex << "\n");
- }
- }
+ LinkingData.Version = readVaruint32(Ctx);
+ if (LinkingData.Version != wasm::WasmMetadataVersion) {
+ return make_error<GenericBinaryError>(
+ "Unexpected metadata version: " + Twine(LinkingData.Version) +
+ " (Expected: " + Twine(wasm::WasmMetadataVersion) + ")",
+ object_error::parse_failed);
}
-}
-
-Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr,
- const uint8_t *End) {
- HasLinkingSection = true;
-
- // Only populate the symbol table with imports and exports if the object
- // has a linking section (i.e. its a relocatable object file). Otherwise
- // the global might not represent symbols at all.
- populateSymbolTable();
- while (Ptr < End) {
- uint8_t Type = readVarint7(Ptr);
- uint32_t Size = readVaruint32(Ptr);
- const uint8_t *SubSectionEnd = Ptr + Size;
+ const uint8_t *OrigEnd = Ctx.End;
+ while (Ctx.Ptr < OrigEnd) {
+ Ctx.End = OrigEnd;
+ uint8_t Type = readUint8(Ctx);
+ uint32_t Size = readVaruint32(Ctx);
+ LLVM_DEBUG(dbgs() << "readSubsection type=" << int(Type) << " size=" << Size
+ << "\n");
+ Ctx.End = Ctx.Ptr + Size;
switch (Type) {
- case wasm::WASM_SYMBOL_INFO: {
- uint32_t Count = readVaruint32(Ptr);
- while (Count--) {
- StringRef Symbol = readString(Ptr);
- DEBUG(dbgs() << "reading syminfo: " << Symbol << "\n");
- uint32_t Flags = readVaruint32(Ptr);
- auto iter = SymbolMap.find(Symbol);
- if (iter == SymbolMap.end()) {
- return make_error<GenericBinaryError>(
- "Invalid symbol name in linking section: " + Symbol,
- object_error::parse_failed);
- }
- uint32_t SymIndex = iter->second;
- assert(SymIndex < Symbols.size());
- Symbols[SymIndex].Flags = Flags;
- DEBUG(dbgs() << "Set symbol flags index:"
- << SymIndex << " name:"
- << Symbols[SymIndex].Name << " expected:"
- << Symbol << " flags: " << Flags << "\n");
- }
- break;
- }
- case wasm::WASM_DATA_SIZE:
- LinkingData.DataSize = readVaruint32(Ptr);
+ case wasm::WASM_SYMBOL_TABLE:
+ if (Error Err = parseLinkingSectionSymtab(Ctx))
+ return Err;
break;
case wasm::WASM_SEGMENT_INFO: {
- uint32_t Count = readVaruint32(Ptr);
+ uint32_t Count = readVaruint32(Ctx);
if (Count > DataSegments.size())
return make_error<GenericBinaryError>("Too many segment names",
object_error::parse_failed);
for (uint32_t i = 0; i < Count; i++) {
- DataSegments[i].Data.Name = readString(Ptr);
- DataSegments[i].Data.Alignment = readVaruint32(Ptr);
- DataSegments[i].Data.Flags = readVaruint32(Ptr);
+ DataSegments[i].Data.Name = readString(Ctx);
+ DataSegments[i].Data.Alignment = readVaruint32(Ctx);
+ DataSegments[i].Data.Flags = readVaruint32(Ctx);
}
break;
}
case wasm::WASM_INIT_FUNCS: {
- uint32_t Count = readVaruint32(Ptr);
+ uint32_t Count = readVaruint32(Ctx);
LinkingData.InitFunctions.reserve(Count);
for (uint32_t i = 0; i < Count; i++) {
wasm::WasmInitFunc Init;
- Init.Priority = readVaruint32(Ptr);
- Init.FunctionIndex = readVaruint32(Ptr);
- if (!isValidFunctionIndex(Init.FunctionIndex))
- return make_error<GenericBinaryError>("Invalid function index: " +
- Twine(Init.FunctionIndex),
+ Init.Priority = readVaruint32(Ctx);
+ Init.Symbol = readVaruint32(Ctx);
+ if (!isValidFunctionSymbol(Init.Symbol))
+ return make_error<GenericBinaryError>("Invalid function symbol: " +
+ Twine(Init.Symbol),
object_error::parse_failed);
LinkingData.InitFunctions.emplace_back(Init);
}
break;
}
+ case wasm::WASM_COMDAT_INFO:
+ if (Error Err = parseLinkingSectionComdat(Ctx))
+ return Err;
+ break;
default:
- Ptr += Size;
+ Ctx.Ptr += Size;
break;
}
- if (Ptr != SubSectionEnd)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>(
"Linking sub-section ended prematurely", object_error::parse_failed);
}
- if (Ptr != End)
+ if (Ctx.Ptr != OrigEnd)
return make_error<GenericBinaryError>("Linking section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-WasmSection* WasmObjectFile::findCustomSectionByName(StringRef Name) {
- for (WasmSection& Section : Sections) {
- if (Section.Type == wasm::WASM_SEC_CUSTOM && Section.Name == Name)
- return &Section;
+Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
+ LinkingData.SymbolTable.reserve(Count);
+ Symbols.reserve(Count);
+ StringSet<> SymbolNames;
+
+ std::vector<wasm::WasmImport *> ImportedGlobals;
+ std::vector<wasm::WasmImport *> ImportedFunctions;
+ ImportedGlobals.reserve(Imports.size());
+ ImportedFunctions.reserve(Imports.size());
+ for (auto &I : Imports) {
+ if (I.Kind == wasm::WASM_EXTERNAL_FUNCTION)
+ ImportedFunctions.emplace_back(&I);
+ else if (I.Kind == wasm::WASM_EXTERNAL_GLOBAL)
+ ImportedGlobals.emplace_back(&I);
}
- return nullptr;
-}
-WasmSection* WasmObjectFile::findSectionByType(uint32_t Type) {
- assert(Type != wasm::WASM_SEC_CUSTOM);
- for (WasmSection& Section : Sections) {
- if (Section.Type == Type)
- return &Section;
+ while (Count--) {
+ wasm::WasmSymbolInfo Info;
+ const wasm::WasmSignature *FunctionType = nullptr;
+ const wasm::WasmGlobalType *GlobalType = nullptr;
+
+ Info.Kind = readUint8(Ctx);
+ Info.Flags = readVaruint32(Ctx);
+ bool IsDefined = (Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0;
+
+ switch (Info.Kind) {
+ case wasm::WASM_SYMBOL_TYPE_FUNCTION:
+ Info.ElementIndex = readVaruint32(Ctx);
+ if (!isValidFunctionIndex(Info.ElementIndex) ||
+ IsDefined != isDefinedFunctionIndex(Info.ElementIndex))
+ return make_error<GenericBinaryError>("invalid function symbol index",
+ object_error::parse_failed);
+ if (IsDefined) {
+ Info.Name = readString(Ctx);
+ unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions;
+ FunctionType = &Signatures[FunctionTypes[FuncIndex]];
+ wasm::WasmFunction &Function = Functions[FuncIndex];
+ if (Function.SymbolName.empty())
+ Function.SymbolName = Info.Name;
+ } else {
+ wasm::WasmImport &Import = *ImportedFunctions[Info.ElementIndex];
+ FunctionType = &Signatures[Import.SigIndex];
+ Info.Name = Import.Field;
+ Info.Module = Import.Module;
+ }
+ break;
+
+ case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+ Info.ElementIndex = readVaruint32(Ctx);
+ if (!isValidGlobalIndex(Info.ElementIndex) ||
+ IsDefined != isDefinedGlobalIndex(Info.ElementIndex))
+ return make_error<GenericBinaryError>("invalid global symbol index",
+ object_error::parse_failed);
+ if (!IsDefined &&
+ (Info.Flags & wasm::WASM_SYMBOL_BINDING_MASK) ==
+ wasm::WASM_SYMBOL_BINDING_WEAK)
+ return make_error<GenericBinaryError>("undefined weak global symbol",
+ object_error::parse_failed);
+ if (IsDefined) {
+ Info.Name = readString(Ctx);
+ unsigned GlobalIndex = Info.ElementIndex - NumImportedGlobals;
+ wasm::WasmGlobal &Global = Globals[GlobalIndex];
+ GlobalType = &Global.Type;
+ if (Global.SymbolName.empty())
+ Global.SymbolName = Info.Name;
+ } else {
+ wasm::WasmImport &Import = *ImportedGlobals[Info.ElementIndex];
+ Info.Name = Import.Field;
+ GlobalType = &Import.Global;
+ }
+ break;
+
+ case wasm::WASM_SYMBOL_TYPE_DATA:
+ Info.Name = readString(Ctx);
+ if (IsDefined) {
+ uint32_t Index = readVaruint32(Ctx);
+ if (Index >= DataSegments.size())
+ return make_error<GenericBinaryError>("invalid data symbol index",
+ object_error::parse_failed);
+ uint32_t Offset = readVaruint32(Ctx);
+ uint32_t Size = readVaruint32(Ctx);
+ if (Offset + Size > DataSegments[Index].Data.Content.size())
+ return make_error<GenericBinaryError>("invalid data symbol offset",
+ object_error::parse_failed);
+ Info.DataRef = wasm::WasmDataReference{Index, Offset, Size};
+ }
+ break;
+
+ case wasm::WASM_SYMBOL_TYPE_SECTION: {
+ if ((Info.Flags & wasm::WASM_SYMBOL_BINDING_MASK) !=
+ wasm::WASM_SYMBOL_BINDING_LOCAL)
+ return make_error<GenericBinaryError>(
+ "Section symbols must have local binding",
+ object_error::parse_failed);
+ Info.ElementIndex = readVaruint32(Ctx);
+ // Use somewhat unique section name as symbol name.
+ StringRef SectionName = Sections[Info.ElementIndex].Name;
+ Info.Name = SectionName;
+ break;
+ }
+
+ default:
+ return make_error<GenericBinaryError>("Invalid symbol type",
+ object_error::parse_failed);
+ }
+
+ if ((Info.Flags & wasm::WASM_SYMBOL_BINDING_MASK) !=
+ wasm::WASM_SYMBOL_BINDING_LOCAL &&
+ !SymbolNames.insert(Info.Name).second)
+ return make_error<GenericBinaryError>("Duplicate symbol name " +
+ Twine(Info.Name),
+ object_error::parse_failed);
+ LinkingData.SymbolTable.emplace_back(Info);
+ Symbols.emplace_back(LinkingData.SymbolTable.back(), FunctionType,
+ GlobalType);
+ LLVM_DEBUG(dbgs() << "Adding symbol: " << Symbols.back() << "\n");
}
- return nullptr;
-}
-
-Error WasmObjectFile::parseRelocSection(StringRef Name, const uint8_t *Ptr,
- const uint8_t *End) {
- uint8_t SectionCode = readVarint7(Ptr);
- WasmSection* Section = nullptr;
- if (SectionCode == wasm::WASM_SEC_CUSTOM) {
- StringRef Name = readString(Ptr);
- Section = findCustomSectionByName(Name);
- } else {
- Section = findSectionByType(SectionCode);
+
+ return Error::success();
+}
+
+Error WasmObjectFile::parseLinkingSectionComdat(ReadContext &Ctx) {
+ uint32_t ComdatCount = readVaruint32(Ctx);
+ StringSet<> ComdatSet;
+ for (unsigned ComdatIndex = 0; ComdatIndex < ComdatCount; ++ComdatIndex) {
+ StringRef Name = readString(Ctx);
+ if (Name.empty() || !ComdatSet.insert(Name).second)
+ return make_error<GenericBinaryError>("Bad/duplicate COMDAT name " + Twine(Name),
+ object_error::parse_failed);
+ LinkingData.Comdats.emplace_back(Name);
+ uint32_t Flags = readVaruint32(Ctx);
+ if (Flags != 0)
+ return make_error<GenericBinaryError>("Unsupported COMDAT flags",
+ object_error::parse_failed);
+
+ uint32_t EntryCount = readVaruint32(Ctx);
+ while (EntryCount--) {
+ unsigned Kind = readVaruint32(Ctx);
+ unsigned Index = readVaruint32(Ctx);
+ switch (Kind) {
+ default:
+ return make_error<GenericBinaryError>("Invalid COMDAT entry type",
+ object_error::parse_failed);
+ case wasm::WASM_COMDAT_DATA:
+ if (Index >= DataSegments.size())
+ return make_error<GenericBinaryError>("COMDAT data index out of range",
+ object_error::parse_failed);
+ if (DataSegments[Index].Data.Comdat != UINT32_MAX)
+ return make_error<GenericBinaryError>("Data segment in two COMDATs",
+ object_error::parse_failed);
+ DataSegments[Index].Data.Comdat = ComdatIndex;
+ break;
+ case wasm::WASM_COMDAT_FUNCTION:
+ if (!isDefinedFunctionIndex(Index))
+ return make_error<GenericBinaryError>("COMDAT function index out of range",
+ object_error::parse_failed);
+ if (getDefinedFunction(Index).Comdat != UINT32_MAX)
+ return make_error<GenericBinaryError>("Function in two COMDATs",
+ object_error::parse_failed);
+ getDefinedFunction(Index).Comdat = ComdatIndex;
+ break;
+ }
+ }
}
- if (!Section)
- return make_error<GenericBinaryError>("Invalid section code",
+ return Error::success();
+}
+
+Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
+ uint32_t SectionIndex = readVaruint32(Ctx);
+ if (SectionIndex >= Sections.size())
+ return make_error<GenericBinaryError>("Invalid section index",
object_error::parse_failed);
- uint32_t RelocCount = readVaruint32(Ptr);
+ WasmSection& Section = Sections[SectionIndex];
+ uint32_t RelocCount = readVaruint32(Ctx);
+ uint32_t EndOffset = Section.Content.size();
while (RelocCount--) {
- wasm::WasmRelocation Reloc;
- memset(&Reloc, 0, sizeof(Reloc));
- Reloc.Type = readVaruint32(Ptr);
- Reloc.Offset = readVaruint32(Ptr);
- Reloc.Index = readVaruint32(Ptr);
+ wasm::WasmRelocation Reloc = {};
+ Reloc.Type = readVaruint32(Ctx);
+ Reloc.Offset = readVaruint32(Ctx);
+ Reloc.Index = readVaruint32(Ctx);
switch (Reloc.Type) {
case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ if (!isValidFunctionSymbol(Reloc.Index))
+ return make_error<GenericBinaryError>("Bad relocation function index",
+ object_error::parse_failed);
+ break;
case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
+ if (Reloc.Index >= Signatures.size())
+ return make_error<GenericBinaryError>("Bad relocation type index",
+ object_error::parse_failed);
+ break;
case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
+ if (!isValidGlobalSymbol(Reloc.Index))
+ return make_error<GenericBinaryError>("Bad relocation global index",
+ object_error::parse_failed);
break;
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
- Reloc.Addend = readVarint32(Ptr);
+ if (!isValidDataSymbol(Reloc.Index))
+ return make_error<GenericBinaryError>("Bad relocation data index",
+ object_error::parse_failed);
+ Reloc.Addend = readVarint32(Ctx);
+ break;
+ case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32:
+ if (!isValidFunctionSymbol(Reloc.Index))
+ return make_error<GenericBinaryError>("Bad relocation function index",
+ object_error::parse_failed);
+ Reloc.Addend = readVarint32(Ctx);
+ break;
+ case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32:
+ if (!isValidSectionSymbol(Reloc.Index))
+ return make_error<GenericBinaryError>("Bad relocation section index",
+ object_error::parse_failed);
+ Reloc.Addend = readVarint32(Ctx);
break;
default:
return make_error<GenericBinaryError>("Bad relocation type: " +
Twine(Reloc.Type),
object_error::parse_failed);
}
- Section->Relocations.push_back(Reloc);
+
+ // Relocations must fit inside the section, and must appear in order. They
+ // also shouldn't overlap a function/element boundary, but we don't bother
+ // to check that.
+ uint64_t Size = 5;
+ if (Reloc.Type == wasm::R_WEBASSEMBLY_TABLE_INDEX_I32 ||
+ Reloc.Type == wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32 ||
+ Reloc.Type == wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32 ||
+ Reloc.Type == wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32)
+ Size = 4;
+ if (Reloc.Offset + Size > EndOffset)
+ return make_error<GenericBinaryError>("Bad relocation offset",
+ object_error::parse_failed);
+
+ Section.Relocations.push_back(Reloc);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Reloc section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseCustomSection(WasmSection &Sec,
- const uint8_t *Ptr, const uint8_t *End) {
- Sec.Name = readString(Ptr);
+Error WasmObjectFile::parseCustomSection(WasmSection &Sec, ReadContext &Ctx) {
if (Sec.Name == "name") {
- if (Error Err = parseNameSection(Ptr, End))
+ if (Error Err = parseNameSection(Ctx))
return Err;
} else if (Sec.Name == "linking") {
- if (Error Err = parseLinkingSection(Ptr, End))
+ if (Error Err = parseLinkingSection(Ctx))
return Err;
} else if (Sec.Name.startswith("reloc.")) {
- if (Error Err = parseRelocSection(Sec.Name, Ptr, End))
+ if (Error Err = parseRelocSection(Sec.Name, Ctx))
return Err;
}
return Error::success();
}
-Error WasmObjectFile::parseTypeSection(const uint8_t *Ptr, const uint8_t *End) {
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseTypeSection(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
Signatures.reserve(Count);
while (Count--) {
wasm::WasmSignature Sig;
Sig.ReturnType = wasm::WASM_TYPE_NORESULT;
- int8_t Form = readVarint7(Ptr);
+ uint8_t Form = readUint8(Ctx);
if (Form != wasm::WASM_TYPE_FUNC) {
return make_error<GenericBinaryError>("Invalid signature type",
object_error::parse_failed);
}
- uint32_t ParamCount = readVaruint32(Ptr);
+ uint32_t ParamCount = readVaruint32(Ctx);
Sig.ParamTypes.reserve(ParamCount);
while (ParamCount--) {
- uint32_t ParamType = readVarint7(Ptr);
+ uint32_t ParamType = readUint8(Ctx);
Sig.ParamTypes.push_back(ParamType);
}
- uint32_t ReturnCount = readVaruint32(Ptr);
+ uint32_t ReturnCount = readVaruint32(Ctx);
if (ReturnCount) {
if (ReturnCount != 1) {
return make_error<GenericBinaryError>(
"Multiple return types not supported", object_error::parse_failed);
}
- Sig.ReturnType = readVarint7(Ptr);
+ Sig.ReturnType = readUint8(Ctx);
}
Signatures.push_back(Sig);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Type section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End) {
- ImportSection = Sections.size();
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
Imports.reserve(Count);
for (uint32_t i = 0; i < Count; i++) {
wasm::WasmImport Im;
- Im.Module = readString(Ptr);
- Im.Field = readString(Ptr);
- Im.Kind = readUint8(Ptr);
+ Im.Module = readString(Ctx);
+ Im.Field = readString(Ctx);
+ Im.Kind = readUint8(Ctx);
switch (Im.Kind) {
case wasm::WASM_EXTERNAL_FUNCTION:
NumImportedFunctions++;
- Im.SigIndex = readVaruint32(Ptr);
+ Im.SigIndex = readVaruint32(Ctx);
break;
case wasm::WASM_EXTERNAL_GLOBAL:
NumImportedGlobals++;
- Im.Global.Type = readVarint7(Ptr);
- Im.Global.Mutable = readVaruint1(Ptr);
+ Im.Global.Type = readUint8(Ctx);
+ Im.Global.Mutable = readVaruint1(Ctx);
break;
case wasm::WASM_EXTERNAL_MEMORY:
- Im.Memory = readLimits(Ptr);
+ Im.Memory = readLimits(Ctx);
break;
case wasm::WASM_EXTERNAL_TABLE:
- Im.Table = readTable(Ptr);
+ Im.Table = readTable(Ctx);
if (Im.Table.ElemType != wasm::WASM_TYPE_ANYFUNC)
return make_error<GenericBinaryError>("Invalid table element type",
object_error::parse_failed);
@@ -581,90 +745,95 @@ Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End)
}
Imports.push_back(Im);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Import section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseFunctionSection(const uint8_t *Ptr, const uint8_t *End) {
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseFunctionSection(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
FunctionTypes.reserve(Count);
+ uint32_t NumTypes = Signatures.size();
while (Count--) {
- FunctionTypes.push_back(readVaruint32(Ptr));
+ uint32_t Type = readVaruint32(Ctx);
+ if (Type >= NumTypes)
+ return make_error<GenericBinaryError>("Invalid function type",
+ object_error::parse_failed);
+ FunctionTypes.push_back(Type);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Function section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End) {
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseTableSection(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
Tables.reserve(Count);
while (Count--) {
- Tables.push_back(readTable(Ptr));
+ Tables.push_back(readTable(Ctx));
if (Tables.back().ElemType != wasm::WASM_TYPE_ANYFUNC) {
return make_error<GenericBinaryError>("Invalid table element type",
object_error::parse_failed);
}
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Table section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseMemorySection(const uint8_t *Ptr, const uint8_t *End) {
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseMemorySection(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
Memories.reserve(Count);
while (Count--) {
- Memories.push_back(readLimits(Ptr));
+ Memories.push_back(readLimits(Ctx));
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Memory section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseGlobalSection(const uint8_t *Ptr, const uint8_t *End) {
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseGlobalSection(ReadContext &Ctx) {
+ GlobalSection = Sections.size();
+ uint32_t Count = readVaruint32(Ctx);
Globals.reserve(Count);
while (Count--) {
wasm::WasmGlobal Global;
- Global.Type = readVarint7(Ptr);
- Global.Mutable = readVaruint1(Ptr);
- if (Error Err = readInitExpr(Global.InitExpr, Ptr))
+ Global.Index = NumImportedGlobals + Globals.size();
+ Global.Type.Type = readUint8(Ctx);
+ Global.Type.Mutable = readVaruint1(Ctx);
+ if (Error Err = readInitExpr(Global.InitExpr, Ctx))
return Err;
Globals.push_back(Global);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Global section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) {
- ExportSection = Sections.size();
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseExportSection(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
Exports.reserve(Count);
for (uint32_t i = 0; i < Count; i++) {
wasm::WasmExport Ex;
- Ex.Name = readString(Ptr);
- Ex.Kind = readUint8(Ptr);
- Ex.Index = readVaruint32(Ptr);
+ Ex.Name = readString(Ctx);
+ Ex.Kind = readUint8(Ctx);
+ Ex.Index = readVaruint32(Ctx);
switch (Ex.Kind) {
case wasm::WASM_EXTERNAL_FUNCTION:
- if (Ex.Index >= FunctionTypes.size() + NumImportedFunctions)
+ if (!isValidFunctionIndex(Ex.Index))
return make_error<GenericBinaryError>("Invalid function export",
object_error::parse_failed);
break;
- case wasm::WASM_EXTERNAL_GLOBAL: {
- if (Ex.Index >= Globals.size() + NumImportedGlobals)
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ if (!isValidGlobalIndex(Ex.Index))
return make_error<GenericBinaryError>("Invalid global export",
object_error::parse_failed);
break;
- }
case wasm::WASM_EXTERNAL_MEMORY:
case wasm::WASM_EXTERNAL_TABLE:
break;
@@ -674,27 +843,65 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End)
}
Exports.push_back(Ex);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Export section ended prematurely",
object_error::parse_failed);
return Error::success();
}
bool WasmObjectFile::isValidFunctionIndex(uint32_t Index) const {
- return Index < FunctionTypes.size() + NumImportedFunctions;
+ return Index < NumImportedFunctions + FunctionTypes.size();
+}
+
+bool WasmObjectFile::isDefinedFunctionIndex(uint32_t Index) const {
+ return Index >= NumImportedFunctions && isValidFunctionIndex(Index);
+}
+
+bool WasmObjectFile::isValidGlobalIndex(uint32_t Index) const {
+ return Index < NumImportedGlobals + Globals.size();
+}
+
+bool WasmObjectFile::isDefinedGlobalIndex(uint32_t Index) const {
+ return Index >= NumImportedGlobals && isValidGlobalIndex(Index);
}
-Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) {
- StartFunction = readVaruint32(Ptr);
+bool WasmObjectFile::isValidFunctionSymbol(uint32_t Index) const {
+ return Index < Symbols.size() && Symbols[Index].isTypeFunction();
+}
+
+bool WasmObjectFile::isValidGlobalSymbol(uint32_t Index) const {
+ return Index < Symbols.size() && Symbols[Index].isTypeGlobal();
+}
+
+bool WasmObjectFile::isValidDataSymbol(uint32_t Index) const {
+ return Index < Symbols.size() && Symbols[Index].isTypeData();
+}
+
+bool WasmObjectFile::isValidSectionSymbol(uint32_t Index) const {
+ return Index < Symbols.size() && Symbols[Index].isTypeSection();
+}
+
+wasm::WasmFunction &WasmObjectFile::getDefinedFunction(uint32_t Index) {
+ assert(isDefinedFunctionIndex(Index));
+ return Functions[Index - NumImportedFunctions];
+}
+
+wasm::WasmGlobal &WasmObjectFile::getDefinedGlobal(uint32_t Index) {
+ assert(isDefinedGlobalIndex(Index));
+ return Globals[Index - NumImportedGlobals];
+}
+
+Error WasmObjectFile::parseStartSection(ReadContext &Ctx) {
+ StartFunction = readVaruint32(Ctx);
if (!isValidFunctionIndex(StartFunction))
return make_error<GenericBinaryError>("Invalid start function",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseCodeSection(const uint8_t *Ptr, const uint8_t *End) {
- const uint8_t *CodeSectionStart = Ptr;
- uint32_t FunctionCount = readVaruint32(Ptr);
+Error WasmObjectFile::parseCodeSection(ReadContext &Ctx) {
+ CodeSection = Sections.size();
+ uint32_t FunctionCount = readVaruint32(Ctx);
if (FunctionCount != FunctionTypes.size()) {
return make_error<GenericBinaryError>("Invalid function count",
object_error::parse_failed);
@@ -702,83 +909,93 @@ Error WasmObjectFile::parseCodeSection(const uint8_t *Ptr, const uint8_t *End) {
while (FunctionCount--) {
wasm::WasmFunction Function;
- const uint8_t *FunctionStart = Ptr;
- uint32_t Size = readVaruint32(Ptr);
- const uint8_t *FunctionEnd = Ptr + Size;
+ const uint8_t *FunctionStart = Ctx.Ptr;
+ uint32_t Size = readVaruint32(Ctx);
+ const uint8_t *FunctionEnd = Ctx.Ptr + Size;
- Function.CodeSectionOffset = FunctionStart - CodeSectionStart;
+ Function.CodeOffset = Ctx.Ptr - FunctionStart;
+ Function.Index = NumImportedFunctions + Functions.size();
+ Function.CodeSectionOffset = FunctionStart - Ctx.Start;
Function.Size = FunctionEnd - FunctionStart;
- uint32_t NumLocalDecls = readVaruint32(Ptr);
+ uint32_t NumLocalDecls = readVaruint32(Ctx);
Function.Locals.reserve(NumLocalDecls);
while (NumLocalDecls--) {
wasm::WasmLocalDecl Decl;
- Decl.Count = readVaruint32(Ptr);
- Decl.Type = readVarint7(Ptr);
+ Decl.Count = readVaruint32(Ctx);
+ Decl.Type = readUint8(Ctx);
Function.Locals.push_back(Decl);
}
- uint32_t BodySize = FunctionEnd - Ptr;
- Function.Body = ArrayRef<uint8_t>(Ptr, BodySize);
- Ptr += BodySize;
- assert(Ptr == FunctionEnd);
+ uint32_t BodySize = FunctionEnd - Ctx.Ptr;
+ Function.Body = ArrayRef<uint8_t>(Ctx.Ptr, BodySize);
+ // This will be set later when reading in the linking metadata section.
+ Function.Comdat = UINT32_MAX;
+ Ctx.Ptr += BodySize;
+ assert(Ctx.Ptr == FunctionEnd);
Functions.push_back(Function);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Code section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseElemSection(const uint8_t *Ptr, const uint8_t *End) {
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseElemSection(ReadContext &Ctx) {
+ uint32_t Count = readVaruint32(Ctx);
ElemSegments.reserve(Count);
while (Count--) {
wasm::WasmElemSegment Segment;
- Segment.TableIndex = readVaruint32(Ptr);
+ Segment.TableIndex = readVaruint32(Ctx);
if (Segment.TableIndex != 0) {
return make_error<GenericBinaryError>("Invalid TableIndex",
object_error::parse_failed);
}
- if (Error Err = readInitExpr(Segment.Offset, Ptr))
+ if (Error Err = readInitExpr(Segment.Offset, Ctx))
return Err;
- uint32_t NumElems = readVaruint32(Ptr);
+ uint32_t NumElems = readVaruint32(Ctx);
while (NumElems--) {
- Segment.Functions.push_back(readVaruint32(Ptr));
+ Segment.Functions.push_back(readVaruint32(Ctx));
}
ElemSegments.push_back(Segment);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Elem section ended prematurely",
object_error::parse_failed);
return Error::success();
}
-Error WasmObjectFile::parseDataSection(const uint8_t *Ptr, const uint8_t *End) {
- const uint8_t *Start = Ptr;
- uint32_t Count = readVaruint32(Ptr);
+Error WasmObjectFile::parseDataSection(ReadContext &Ctx) {
+ DataSection = Sections.size();
+ uint32_t Count = readVaruint32(Ctx);
DataSegments.reserve(Count);
while (Count--) {
WasmSegment Segment;
- Segment.Data.MemoryIndex = readVaruint32(Ptr);
- if (Error Err = readInitExpr(Segment.Data.Offset, Ptr))
+ Segment.Data.MemoryIndex = readVaruint32(Ctx);
+ if (Error Err = readInitExpr(Segment.Data.Offset, Ctx))
return Err;
- uint32_t Size = readVaruint32(Ptr);
- Segment.Data.Content = ArrayRef<uint8_t>(Ptr, Size);
+ uint32_t Size = readVaruint32(Ctx);
+ if (Size > (size_t)(Ctx.End - Ctx.Ptr))
+ return make_error<GenericBinaryError>("Invalid segment size",
+ object_error::parse_failed);
+ Segment.Data.Content = ArrayRef<uint8_t>(Ctx.Ptr, Size);
+ // The rest of these Data fields are set later, when reading in the linking
+ // metadata section.
Segment.Data.Alignment = 0;
Segment.Data.Flags = 0;
- Segment.SectionOffset = Ptr - Start;
- Ptr += Size;
+ Segment.Data.Comdat = UINT32_MAX;
+ Segment.SectionOffset = Ctx.Ptr - Ctx.Start;
+ Ctx.Ptr += Size;
DataSegments.push_back(Segment);
}
- if (Ptr != End)
+ if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("Data section ended prematurely",
object_error::parse_failed);
return Error::success();
}
const uint8_t *WasmObjectFile::getPtr(size_t Offset) const {
- return reinterpret_cast<const uint8_t *>(getData().substr(Offset, 1).data());
+ return reinterpret_cast<const uint8_t *>(getData().data() + Offset);
}
const wasm::WasmObjectHeader &WasmObjectFile::getHeader() const {
@@ -791,32 +1008,17 @@ uint32_t WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const {
uint32_t Result = SymbolRef::SF_None;
const WasmSymbol &Sym = getWasmSymbol(Symb);
- DEBUG(dbgs() << "getSymbolFlags: ptr=" << &Sym << " " << Sym << "\n");
- if (Sym.isWeak())
+ LLVM_DEBUG(dbgs() << "getSymbolFlags: ptr=" << &Sym << " " << Sym << "\n");
+ if (Sym.isBindingWeak())
Result |= SymbolRef::SF_Weak;
- if (!Sym.isLocal())
+ if (!Sym.isBindingLocal())
Result |= SymbolRef::SF_Global;
if (Sym.isHidden())
Result |= SymbolRef::SF_Hidden;
-
- switch (Sym.Type) {
- case WasmSymbol::SymbolType::FUNCTION_IMPORT:
- Result |= SymbolRef::SF_Undefined | SymbolRef::SF_Executable;
- break;
- case WasmSymbol::SymbolType::FUNCTION_EXPORT:
- Result |= SymbolRef::SF_Executable;
- break;
- case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
- Result |= SymbolRef::SF_Executable;
- Result |= SymbolRef::SF_FormatSpecific;
- break;
- case WasmSymbol::SymbolType::GLOBAL_IMPORT:
+ if (!Sym.isDefined())
Result |= SymbolRef::SF_Undefined;
- break;
- case WasmSymbol::SymbolType::GLOBAL_EXPORT:
- break;
- }
-
+ if (Sym.isTypeFunction())
+ Result |= SymbolRef::SF_Executable;
return Result;
}
@@ -841,7 +1043,7 @@ const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const {
}
Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const {
- return getWasmSymbol(Symb).Name;
+ return getWasmSymbol(Symb).Info.Name;
}
Expected<uint64_t> WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const {
@@ -849,20 +1051,20 @@ Expected<uint64_t> WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const {
}
uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol& Sym) const {
- switch (Sym.Type) {
- case WasmSymbol::SymbolType::FUNCTION_IMPORT:
- case WasmSymbol::SymbolType::GLOBAL_IMPORT:
- case WasmSymbol::SymbolType::FUNCTION_EXPORT:
- case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
- return Sym.ElementIndex;
- case WasmSymbol::SymbolType::GLOBAL_EXPORT: {
- uint32_t GlobalIndex = Sym.ElementIndex - NumImportedGlobals;
- assert(GlobalIndex < Globals.size());
- const wasm::WasmGlobal& Global = Globals[GlobalIndex];
- // WasmSymbols correspond only to I32_CONST globals
- assert(Global.InitExpr.Opcode == wasm::WASM_OPCODE_I32_CONST);
- return Global.InitExpr.Value.Int32;
+ switch (Sym.Info.Kind) {
+ case wasm::WASM_SYMBOL_TYPE_FUNCTION:
+ case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+ return Sym.Info.ElementIndex;
+ case wasm::WASM_SYMBOL_TYPE_DATA: {
+ // The value of a data symbol is the segment offset, plus the symbol
+ // offset within the segment.
+ uint32_t SegmentIndex = Sym.Info.DataRef.Segment;
+ const wasm::WasmDataSegment &Segment = DataSegments[SegmentIndex].Data;
+ assert(Segment.Offset.Opcode == wasm::WASM_OPCODE_I32_CONST);
+ return Segment.Offset.Value.Int32 + Sym.Info.DataRef.Offset;
}
+ case wasm::WASM_SYMBOL_TYPE_SECTION:
+ return 0;
}
llvm_unreachable("invalid symbol type");
}
@@ -885,14 +1087,15 @@ Expected<SymbolRef::Type>
WasmObjectFile::getSymbolType(DataRefImpl Symb) const {
const WasmSymbol &Sym = getWasmSymbol(Symb);
- switch (Sym.Type) {
- case WasmSymbol::SymbolType::FUNCTION_IMPORT:
- case WasmSymbol::SymbolType::FUNCTION_EXPORT:
- case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
+ switch (Sym.Info.Kind) {
+ case wasm::WASM_SYMBOL_TYPE_FUNCTION:
return SymbolRef::ST_Function;
- case WasmSymbol::SymbolType::GLOBAL_IMPORT:
- case WasmSymbol::SymbolType::GLOBAL_EXPORT:
+ case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+ return SymbolRef::ST_Other;
+ case wasm::WASM_SYMBOL_TYPE_DATA:
return SymbolRef::ST_Data;
+ case wasm::WASM_SYMBOL_TYPE_SECTION:
+ return SymbolRef::ST_Debug;
}
llvm_unreachable("Unknown WasmSymbol::SymbolType");
@@ -901,8 +1104,28 @@ WasmObjectFile::getSymbolType(DataRefImpl Symb) const {
Expected<section_iterator>
WasmObjectFile::getSymbolSection(DataRefImpl Symb) const {
+ const WasmSymbol& Sym = getWasmSymbol(Symb);
+ if (Sym.isUndefined())
+ return section_end();
+
DataRefImpl Ref;
- Ref.d.a = getWasmSymbol(Symb).Section;
+ switch (Sym.Info.Kind) {
+ case wasm::WASM_SYMBOL_TYPE_FUNCTION:
+ Ref.d.a = CodeSection;
+ break;
+ case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+ Ref.d.a = GlobalSection;
+ break;
+ case wasm::WASM_SYMBOL_TYPE_DATA:
+ Ref.d.a = DataSection;
+ break;
+ case wasm::WASM_SYMBOL_TYPE_SECTION: {
+ Ref.d.a = Sym.Info.ElementIndex;
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown WasmSymbol::SymbolType");
+ }
return section_iterator(SectionRef(Ref, this));
}
@@ -1004,10 +1227,14 @@ uint64_t WasmObjectFile::getRelocationOffset(DataRefImpl Ref) const {
return Rel.Offset;
}
-symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
- llvm_unreachable("not yet implemented");
- SymbolRef Ref;
- return symbol_iterator(Ref);
+symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Ref) const {
+ const wasm::WasmRelocation &Rel = getWasmRelocation(Ref);
+ if (Rel.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB)
+ return symbol_end();
+ DataRefImpl Sym;
+ Sym.d.a = Rel.Index;
+ Sym.d.b = 0;
+ return symbol_iterator(SymbolRef(Sym, this));
}
uint64_t WasmObjectFile::getRelocationType(DataRefImpl Ref) const {
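[Editorial note, not part of the patch] The WasmObjectFile.cpp changes above replace raw "const uint8_t *&Ptr" readers with a ReadContext carrying Start/Ptr/End and explicit bounds checks (readUint8, readString, readULEB128 with an end pointer and error out-parameter). As a hedged illustration of that end-checked ULEB128 pattern only, using invented names and plain exceptions rather than the patch's report_fatal_error/llvm::Error reporting:

    #include <cstdint>
    #include <stdexcept>

    // Decode an unsigned LEB128 value, refusing to read past End.
    static uint64_t decodeULEB128Checked(const uint8_t *&Ptr, const uint8_t *End) {
      uint64_t Value = 0;
      unsigned Shift = 0;
      for (;;) {
        if (Ptr == End)
          throw std::runtime_error("EOF while reading ULEB128");
        uint8_t Byte = *Ptr++;
        Value |= uint64_t(Byte & 0x7f) << Shift;   // low 7 bits carry payload
        if ((Byte & 0x80) == 0)                    // high bit clear: last byte
          return Value;
        Shift += 7;
        if (Shift >= 64)
          throw std::runtime_error("ULEB128 value too large");
      }
    }

The sketch only shows the bounds-checking idea; the actual readers in the patch also track the section start so offsets can be recorded.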
diff --git a/contrib/llvm/lib/Object/WindowsResource.cpp b/contrib/llvm/lib/Object/WindowsResource.cpp
index 271224ec6312..1b7282f13db0 100644
--- a/contrib/llvm/lib/Object/WindowsResource.cpp
+++ b/contrib/llvm/lib/Object/WindowsResource.cpp
@@ -334,7 +334,7 @@ private:
void writeDirectoryTree();
void writeDirectoryStringTable();
void writeFirstSectionRelocations();
- std::unique_ptr<MemoryBuffer> OutputBuffer;
+ std::unique_ptr<WritableMemoryBuffer> OutputBuffer;
char *BufferStart;
uint64_t CurrentOffset = 0;
COFF::MachineTypes MachineType;
@@ -360,7 +360,7 @@ WindowsResourceCOFFWriter::WindowsResourceCOFFWriter(
Data(Parser.getData()), StringTable(Parser.getStringTable()) {
performFileLayout();
- OutputBuffer = MemoryBuffer::getNewMemBuffer(FileSize);
+ OutputBuffer = WritableMemoryBuffer::getNewMemBuffer(FileSize);
}
void WindowsResourceCOFFWriter::performFileLayout() {
@@ -425,7 +425,7 @@ static std::time_t getTime() {
}
std::unique_ptr<MemoryBuffer> WindowsResourceCOFFWriter::write() {
- BufferStart = const_cast<char *>(OutputBuffer->getBufferStart());
+ BufferStart = OutputBuffer->getBufferStart();
writeCOFFHeader();
writeFirstSectionHeader();
diff --git a/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp b/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp
index 937b8dc029fa..9351ef96beb2 100644
--- a/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp
@@ -562,14 +562,16 @@ void MappingTraits<COFFYAML::Section>::mapping(IO &IO, COFFYAML::Section &Sec) {
IO.mapOptional("VirtualSize", Sec.Header.VirtualSize, 0U);
IO.mapOptional("Alignment", Sec.Alignment, 0U);
- // If this is a .debug$S .debug$T, or .debug$H section parse the semantic
- // representation of the symbols/types. If it is any other kind of section,
- // just deal in raw bytes.
+  // If this is a .debug$S, .debug$T, .debug$P, or .debug$H section, parse the
+ // semantic representation of the symbols/types. If it is any other kind
+ // of section, just deal in raw bytes.
IO.mapOptional("SectionData", Sec.SectionData);
if (Sec.Name == ".debug$S")
IO.mapOptional("Subsections", Sec.DebugS);
else if (Sec.Name == ".debug$T")
IO.mapOptional("Types", Sec.DebugT);
+ else if (Sec.Name == ".debug$P")
+ IO.mapOptional("PrecompTypes", Sec.DebugP);
else if (Sec.Name == ".debug$H")
IO.mapOptional("GlobalHashes", Sec.DebugH);
diff --git a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
index 199a65a2870e..f67a0db690eb 100644
--- a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
+++ b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
@@ -40,6 +40,7 @@ using namespace llvm::CodeViewYAML::detail;
using namespace llvm::yaml;
LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex)
+LLVM_YAML_IS_SEQUENCE_VECTOR(LocalVariableAddrGap)
// We only need to declare these, the definitions are in CodeViewYAMLTypes.cpp
LLVM_YAML_DECLARE_SCALAR_TRAITS(APSInt, QuotingType::None)
@@ -181,6 +182,24 @@ void ScalarEnumerationTraits<FrameCookieKind>::enumeration(
}
namespace llvm {
+namespace yaml {
+template <> struct MappingTraits<LocalVariableAddrRange> {
+ static void mapping(IO &io, LocalVariableAddrRange &Range) {
+ io.mapRequired("OffsetStart", Range.OffsetStart);
+ io.mapRequired("ISectStart", Range.ISectStart);
+ io.mapRequired("Range", Range.Range);
+ }
+};
+template <> struct MappingTraits<LocalVariableAddrGap> {
+ static void mapping(IO &io, LocalVariableAddrGap &Gap) {
+ io.mapRequired("GapStartOffset", Gap.GapStartOffset);
+ io.mapRequired("Range", Gap.Range);
+ }
+};
+} // namespace yaml
+} // namespace llvm
+
+namespace llvm {
namespace CodeViewYAML {
namespace detail {
@@ -353,32 +372,50 @@ template <> void SymbolRecordImpl<LocalSym>::map(IO &IO) {
}
template <> void SymbolRecordImpl<DefRangeSym>::map(IO &IO) {
- // TODO: Print the subfields
+ IO.mapRequired("Program", Symbol.Program);
+ IO.mapRequired("Range", Symbol.Range);
+ IO.mapRequired("Gaps", Symbol.Gaps);
}
template <> void SymbolRecordImpl<DefRangeSubfieldSym>::map(IO &IO) {
- // TODO: Print the subfields
+ IO.mapRequired("Program", Symbol.Program);
+ IO.mapRequired("OffsetInParent", Symbol.OffsetInParent);
+ IO.mapRequired("Range", Symbol.Range);
+ IO.mapRequired("Gaps", Symbol.Gaps);
}
template <> void SymbolRecordImpl<DefRangeRegisterSym>::map(IO &IO) {
- // TODO: Print the subfields
+ IO.mapRequired("Register", Symbol.Hdr.Register);
+ IO.mapRequired("MayHaveNoName", Symbol.Hdr.MayHaveNoName);
+ IO.mapRequired("Range", Symbol.Range);
+ IO.mapRequired("Gaps", Symbol.Gaps);
}
template <> void SymbolRecordImpl<DefRangeFramePointerRelSym>::map(IO &IO) {
- // TODO: Print the subfields
+ IO.mapRequired("Offset", Symbol.Offset);
+ IO.mapRequired("Range", Symbol.Range);
+ IO.mapRequired("Gaps", Symbol.Gaps);
}
template <> void SymbolRecordImpl<DefRangeSubfieldRegisterSym>::map(IO &IO) {
- // TODO: Print the subfields
+ IO.mapRequired("Register", Symbol.Hdr.Register);
+ IO.mapRequired("MayHaveNoName", Symbol.Hdr.MayHaveNoName);
+ IO.mapRequired("OffsetInParent", Symbol.Hdr.OffsetInParent);
+ IO.mapRequired("Range", Symbol.Range);
+ IO.mapRequired("Gaps", Symbol.Gaps);
}
template <>
void SymbolRecordImpl<DefRangeFramePointerRelFullScopeSym>::map(IO &IO) {
- // TODO: Print the subfields
+ IO.mapRequired("Register", Symbol.Offset);
}
template <> void SymbolRecordImpl<DefRangeRegisterRelSym>::map(IO &IO) {
- // TODO: Print the subfields
+ IO.mapRequired("Register", Symbol.Hdr.Register);
+ IO.mapRequired("Flags", Symbol.Hdr.Flags);
+ IO.mapRequired("BasePointerOffset", Symbol.Hdr.BasePointerOffset);
+ IO.mapRequired("Range", Symbol.Range);
+ IO.mapRequired("Gaps", Symbol.Gaps);
}
template <> void SymbolRecordImpl<BlockSym>::map(IO &IO) {
diff --git a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypeHashing.cpp b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypeHashing.cpp
index bbbd7c067720..ed117059560f 100644
--- a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypeHashing.cpp
+++ b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypeHashing.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/ObjectYAML/CodeViewYAMLTypeHashing.h"
+
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/BinaryStreamWriter.h"
@@ -46,16 +48,17 @@ StringRef ScalarTraits<GlobalHash>::input(StringRef Scalar, void *Ctx,
DebugHSection llvm::CodeViewYAML::fromDebugH(ArrayRef<uint8_t> DebugH) {
assert(DebugH.size() >= 8);
- assert((DebugH.size() - 8) % 20 == 0);
+ assert((DebugH.size() - 8) % 8 == 0);
BinaryStreamReader Reader(DebugH, llvm::support::little);
DebugHSection DHS;
cantFail(Reader.readInteger(DHS.Magic));
cantFail(Reader.readInteger(DHS.Version));
cantFail(Reader.readInteger(DHS.HashAlgorithm));
+
while (Reader.bytesRemaining() != 0) {
ArrayRef<uint8_t> S;
- cantFail(Reader.readBytes(S, 20));
+ cantFail(Reader.readBytes(S, 8));
DHS.Hashes.emplace_back(S);
}
assert(Reader.bytesRemaining() == 0);
@@ -64,19 +67,20 @@ DebugHSection llvm::CodeViewYAML::fromDebugH(ArrayRef<uint8_t> DebugH) {
ArrayRef<uint8_t> llvm::CodeViewYAML::toDebugH(const DebugHSection &DebugH,
BumpPtrAllocator &Alloc) {
- uint32_t Size = 8 + 20 * DebugH.Hashes.size();
+ uint32_t Size = 8 + 8 * DebugH.Hashes.size();
uint8_t *Data = Alloc.Allocate<uint8_t>(Size);
MutableArrayRef<uint8_t> Buffer(Data, Size);
BinaryStreamWriter Writer(Buffer, llvm::support::little);
+
cantFail(Writer.writeInteger(DebugH.Magic));
cantFail(Writer.writeInteger(DebugH.Version));
cantFail(Writer.writeInteger(DebugH.HashAlgorithm));
- SmallString<20> Hash;
+ SmallString<8> Hash;
for (const auto &H : DebugH.Hashes) {
Hash.clear();
raw_svector_ostream OS(Hash);
H.Hash.writeAsBinary(OS);
- assert((Hash.size() == 20) && "Invalid hash size!");
+ assert((Hash.size() == 8) && "Invalid hash size!");
cantFail(Writer.writeFixedString(Hash));
}
assert(Writer.bytesRemaining() == 0);
diff --git a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
index ba4ad9382ce5..791b115dc492 100644
--- a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
+++ b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
@@ -595,6 +595,17 @@ template <> void LeafRecordImpl<MethodOverloadListRecord>::map(IO &IO) {
IO.mapRequired("Methods", Record.Methods);
}
+template <> void LeafRecordImpl<PrecompRecord>::map(IO &IO) {
+ IO.mapRequired("StartTypeIndex", Record.StartTypeIndex);
+ IO.mapRequired("TypesCount", Record.TypesCount);
+ IO.mapRequired("Signature", Record.Signature);
+ IO.mapRequired("PrecompFilePath", Record.PrecompFilePath);
+}
+
+template <> void LeafRecordImpl<EndPrecompRecord>::map(IO &IO) {
+ IO.mapRequired("Signature", Record.Signature);
+}
+
template <> void MemberRecordImpl<OneMethodRecord>::map(IO &IO) {
MappingTraits<OneMethodRecord>::mapping(IO, Record);
}
@@ -763,14 +774,16 @@ void MappingTraits<MemberRecord>::mapping(IO &IO, MemberRecord &Obj) {
}
std::vector<LeafRecord>
-llvm::CodeViewYAML::fromDebugT(ArrayRef<uint8_t> DebugT) {
- ExitOnError Err("Invalid .debug$T section!");
- BinaryStreamReader Reader(DebugT, support::little);
+llvm::CodeViewYAML::fromDebugT(ArrayRef<uint8_t> DebugTorP,
+ StringRef SectionName) {
+ ExitOnError Err("Invalid " + std::string(SectionName) + " section!");
+ BinaryStreamReader Reader(DebugTorP, support::little);
CVTypeArray Types;
uint32_t Magic;
Err(Reader.readInteger(Magic));
- assert(Magic == COFF::DEBUG_SECTION_MAGIC && "Invalid .debug$T section!");
+ assert(Magic == COFF::DEBUG_SECTION_MAGIC &&
+ "Invalid .debug$T or .debug$P section!");
std::vector<LeafRecord> Result;
Err(Reader.readArray(Types, Reader.bytesRemaining()));
@@ -782,7 +795,8 @@ llvm::CodeViewYAML::fromDebugT(ArrayRef<uint8_t> DebugT) {
}
ArrayRef<uint8_t> llvm::CodeViewYAML::toDebugT(ArrayRef<LeafRecord> Leafs,
- BumpPtrAllocator &Alloc) {
+ BumpPtrAllocator &Alloc,
+ StringRef SectionName) {
AppendingTypeTableBuilder TS(Alloc);
uint32_t Size = sizeof(uint32_t);
for (const auto &Leaf : Leafs) {
@@ -793,7 +807,8 @@ ArrayRef<uint8_t> llvm::CodeViewYAML::toDebugT(ArrayRef<LeafRecord> Leafs,
uint8_t *ResultBuffer = Alloc.Allocate<uint8_t>(Size);
MutableArrayRef<uint8_t> Output(ResultBuffer, Size);
BinaryStreamWriter Writer(Output, support::little);
- ExitOnError Err("Error writing type record to .debug$T section");
+ ExitOnError Err("Error writing type record to " + std::string(SectionName) +
+ " section");
Err(Writer.writeInteger<uint32_t>(COFF::DEBUG_SECTION_MAGIC));
for (const auto &R : TS.records()) {
Err(Writer.writeBytes(R));
diff --git a/contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp
index c49c2249cdfe..f23fa1237600 100644
--- a/contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp
+++ b/contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief The DWARF component of yaml2obj. Provided as library code for tests.
+/// The DWARF component of yaml2obj. Provided as library code for tests.
///
//===----------------------------------------------------------------------===//
@@ -132,7 +132,7 @@ void DWARFYAML::EmitPubSection(raw_ostream &OS,
}
namespace {
-/// \brief An extension of the DWARFYAML::ConstVisitor which writes compile
+/// An extension of the DWARFYAML::ConstVisitor which writes compile
/// units and DIEs to a stream.
class DumpVisitor : public DWARFYAML::ConstVisitor {
raw_ostream &OS;
@@ -149,7 +149,6 @@ protected:
writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
}
-
}
void onStartDIE(const DWARFYAML::Unit &CU,
@@ -308,11 +307,50 @@ EmitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc,
OutputBuffers[Sec] = MemoryBuffer::getMemBufferCopy(Data);
}
+namespace {
+class DIEFixupVisitor : public DWARFYAML::Visitor {
+ uint64_t Length;
+
+public:
+ DIEFixupVisitor(DWARFYAML::Data &DI) : DWARFYAML::Visitor(DI) {}
+
+private:
+ virtual void onStartCompileUnit(DWARFYAML::Unit &CU) { Length = 7; }
+
+ virtual void onEndCompileUnit(DWARFYAML::Unit &CU) {
+ CU.Length.setLength(Length);
+ }
+
+ virtual void onStartDIE(DWARFYAML::Unit &CU, DWARFYAML::Entry &DIE) {
+ Length += getULEB128Size(DIE.AbbrCode);
+ }
+
+ virtual void onValue(const uint8_t U) { Length += 1; }
+ virtual void onValue(const uint16_t U) { Length += 2; }
+ virtual void onValue(const uint32_t U) { Length += 4; }
+ virtual void onValue(const uint64_t U, const bool LEB = false) {
+ if (LEB)
+ Length += getULEB128Size(U);
+ else
+ Length += 8;
+ }
+ virtual void onValue(const int64_t S, const bool LEB = false) {
+ if (LEB)
+ Length += getSLEB128Size(S);
+ else
+ Length += 8;
+ }
+ virtual void onValue(const StringRef String) { Length += String.size() + 1; }
+
+ virtual void onValue(const MemoryBufferRef MBR) {
+ Length += MBR.getBufferSize();
+ }
+};
+} // namespace
+
Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
-DWARFYAML::EmitDebugSections(StringRef YAMLString,
+DWARFYAML::EmitDebugSections(StringRef YAMLString, bool ApplyFixups,
bool IsLittleEndian) {
- StringMap<std::unique_ptr<MemoryBuffer>> DebugSections;
-
yaml::Input YIn(YAMLString);
DWARFYAML::Data DI;
@@ -321,6 +359,12 @@ DWARFYAML::EmitDebugSections(StringRef YAMLString,
if (YIn.error())
return errorCodeToError(YIn.error());
+ if (ApplyFixups) {
+ DIEFixupVisitor DIFixer(DI);
+ DIFixer.traverseDebugInfo();
+ }
+
+ StringMap<std::unique_ptr<MemoryBuffer>> DebugSections;
EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugInfo, "debug_info",
DebugSections);
EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugLine, "debug_line",
diff --git a/contrib/llvm/lib/ObjectYAML/DWARFVisitor.h b/contrib/llvm/lib/ObjectYAML/DWARFVisitor.h
index 81ef412eb7e6..5489031dc331 100644
--- a/contrib/llvm/lib/ObjectYAML/DWARFVisitor.h
+++ b/contrib/llvm/lib/ObjectYAML/DWARFVisitor.h
@@ -26,7 +26,7 @@ struct Entry;
struct FormValue;
struct AttributeAbbrev;
-/// \brief A class to visits DWARFYAML Compile Units and DIEs in preorder.
+/// A class that visits DWARFYAML Compile Units and DIEs in preorder.
///
/// Extensions of this class can either maintain const or non-const references
/// to the DWARFYAML::Data object.
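For reference on the DIEFixupVisitor added to DWARFEmitter.cpp above: it recomputes each compile unit's length by visiting its DIEs, starting from 7 because that is the number of DWARF32 unit-header bytes covered by the length field (2-byte version + 4-byte abbreviation offset + 1-byte address size), then adding each DIE's ULEB128 abbreviation code and the encoded size of every attribute value. A hedged arithmetic sketch of the same bookkeeping for an assumed one-DIE unit:

    #include "llvm/Support/LEB128.h"
    #include <cstdint>

    // Assumed example: a unit whose only DIE carries abbreviation code 1 and
    // a single 4-byte (data4-style) attribute value.
    uint64_t exampleUnitLength() {
      uint64_t Length = 7;               // version(2) + abbrev offset(4) + addr size(1)
      Length += llvm::getULEB128Size(1); // the DIE's abbreviation code, 1 byte
      Length += 4;                       // the uint32_t attribute payload
      return Length;                     // 12, the value setLength() would record
    }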
diff --git a/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp b/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp
index 7e7f3d1fdded..f916b5d5f392 100644
--- a/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -50,6 +50,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_PT>::enumeration(
ECase(PT_SHLIB);
ECase(PT_PHDR);
ECase(PT_TLS);
+ ECase(PT_GNU_EH_FRAME);
#undef ECase
IO.enumFallback<Hex32>(Value);
}
@@ -369,8 +370,39 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCase(EF_RISCV_RVE);
break;
case ELF::EM_AMDGPU:
- BCaseMask(EF_AMDGPU_ARCH_R600, EF_AMDGPU_ARCH);
- BCaseMask(EF_AMDGPU_ARCH_GCN, EF_AMDGPU_ARCH);
+ BCaseMask(EF_AMDGPU_MACH_NONE, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_R600, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_R630, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_RS880, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_RV670, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_RV710, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_RV730, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_RV770, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_CEDAR, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_CYPRESS, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_JUNIPER, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_REDWOOD, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_SUMO, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_BARTS, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_CAICOS, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_CAYMAN, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_R600_TURKS, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX600, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX601, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX700, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX701, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX702, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX703, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX704, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX801, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX802, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX803, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX810, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
+ BCase(EF_AMDGPU_XNACK);
break;
case ELF::EM_X86_64:
break;
@@ -404,10 +436,15 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
ECase(SHT_PREINIT_ARRAY);
ECase(SHT_GROUP);
ECase(SHT_SYMTAB_SHNDX);
+ ECase(SHT_RELR);
ECase(SHT_LOOS);
ECase(SHT_ANDROID_REL);
ECase(SHT_ANDROID_RELA);
+ ECase(SHT_ANDROID_RELR);
ECase(SHT_LLVM_ODRTAB);
+ ECase(SHT_LLVM_LINKER_OPTIONS);
+ ECase(SHT_LLVM_CALL_GRAPH_PROFILE);
+ ECase(SHT_LLVM_ADDRSIG);
ECase(SHT_GNU_ATTRIBUTES);
ECase(SHT_GNU_HASH);
ECase(SHT_GNU_verdef);
diff --git a/contrib/llvm/lib/ObjectYAML/WasmYAML.cpp b/contrib/llvm/lib/ObjectYAML/WasmYAML.cpp
index b2411395dc0f..3c20bb74d501 100644
--- a/contrib/llvm/lib/ObjectYAML/WasmYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/WasmYAML.cpp
@@ -57,10 +57,11 @@ static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) {
static void sectionMapping(IO &IO, WasmYAML::LinkingSection &Section) {
commonSectionMapping(IO, Section);
IO.mapRequired("Name", Section.Name);
- IO.mapRequired("DataSize", Section.DataSize);
- IO.mapOptional("SymbolInfo", Section.SymbolInfos);
+ IO.mapRequired("Version", Section.Version);
+ IO.mapOptional("SymbolTable", Section.SymbolTable);
IO.mapOptional("SegmentInfo", Section.SegmentInfos);
IO.mapOptional("InitFunctions", Section.InitFunctions);
+ IO.mapOptional("Comdats", Section.Comdats);
}
static void sectionMapping(IO &IO, WasmYAML::CustomSection &Section) {
@@ -236,7 +237,7 @@ void ScalarEnumerationTraits<WasmYAML::SectionType>::enumeration(
void MappingTraits<WasmYAML::Signature>::mapping(
IO &IO, WasmYAML::Signature &Signature) {
- IO.mapOptional("Index", Signature.Index);
+ IO.mapRequired("Index", Signature.Index);
IO.mapRequired("ReturnType", Signature.ReturnType);
IO.mapRequired("ParamTypes", Signature.ParamTypes);
}
@@ -248,6 +249,7 @@ void MappingTraits<WasmYAML::Table>::mapping(IO &IO, WasmYAML::Table &Table) {
void MappingTraits<WasmYAML::Function>::mapping(IO &IO,
WasmYAML::Function &Function) {
+ IO.mapRequired("Index", Function.Index);
IO.mapRequired("Locals", Function.Locals);
IO.mapRequired("Body", Function.Body);
}
@@ -323,6 +325,7 @@ void MappingTraits<WasmYAML::Export>::mapping(IO &IO,
void MappingTraits<WasmYAML::Global>::mapping(IO &IO,
WasmYAML::Global &Global) {
+ IO.mapRequired("Index", Global.Index);
IO.mapRequired("Type", Global.Type);
IO.mapRequired("Mutable", Global.Mutable);
IO.mapRequired("InitExpr", Global.InitExpr);
@@ -363,13 +366,50 @@ void MappingTraits<WasmYAML::DataSegment>::mapping(
void MappingTraits<WasmYAML::InitFunction>::mapping(
IO &IO, WasmYAML::InitFunction &Init) {
IO.mapRequired("Priority", Init.Priority);
- IO.mapRequired("FunctionIndex", Init.FunctionIndex);
+ IO.mapRequired("Symbol", Init.Symbol);
+}
+
+void ScalarEnumerationTraits<WasmYAML::ComdatKind>::enumeration(
+ IO &IO, WasmYAML::ComdatKind &Kind) {
+#define ECase(X) IO.enumCase(Kind, #X, wasm::WASM_COMDAT_##X);
+ ECase(FUNCTION);
+ ECase(DATA);
+#undef ECase
+}
+
+void MappingTraits<WasmYAML::ComdatEntry>::mapping(
+ IO &IO, WasmYAML::ComdatEntry &ComdatEntry) {
+ IO.mapRequired("Kind", ComdatEntry.Kind);
+ IO.mapRequired("Index", ComdatEntry.Index);
+}
+
+void MappingTraits<WasmYAML::Comdat>::mapping(
+ IO &IO, WasmYAML::Comdat &Comdat) {
+ IO.mapRequired("Name", Comdat.Name);
+ IO.mapRequired("Entries", Comdat.Entries);
}
void MappingTraits<WasmYAML::SymbolInfo>::mapping(IO &IO,
WasmYAML::SymbolInfo &Info) {
+ IO.mapRequired("Index", Info.Index);
+ IO.mapRequired("Kind", Info.Kind);
IO.mapRequired("Name", Info.Name);
IO.mapRequired("Flags", Info.Flags);
+ if (Info.Kind == wasm::WASM_SYMBOL_TYPE_FUNCTION) {
+ IO.mapRequired("Function", Info.ElementIndex);
+ } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_GLOBAL) {
+ IO.mapRequired("Global", Info.ElementIndex);
+ } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_DATA) {
+ if ((Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) {
+ IO.mapRequired("Segment", Info.DataRef.Segment);
+ IO.mapOptional("Offset", Info.DataRef.Offset, 0u);
+ IO.mapRequired("Size", Info.DataRef.Size);
+ }
+ } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_SECTION) {
+ IO.mapRequired("Section", Info.ElementIndex);
+ } else {
+ llvm_unreachable("unsupported symbol kind");
+ }
}
void ScalarBitSetTraits<WasmYAML::LimitFlags>::bitset(
@@ -391,9 +431,20 @@ void ScalarBitSetTraits<WasmYAML::SymbolFlags>::bitset(
BCaseMask(BINDING_MASK, BINDING_LOCAL);
//BCaseMask(VISIBILITY_MASK, VISIBILITY_DEFAULT);
BCaseMask(VISIBILITY_MASK, VISIBILITY_HIDDEN);
+ BCaseMask(UNDEFINED, UNDEFINED);
#undef BCaseMask
}
+void ScalarEnumerationTraits<WasmYAML::SymbolKind>::enumeration(
+ IO &IO, WasmYAML::SymbolKind &Kind) {
+#define ECase(X) IO.enumCase(Kind, #X, wasm::WASM_SYMBOL_TYPE_##X);
+ ECase(FUNCTION);
+ ECase(DATA);
+ ECase(GLOBAL);
+ ECase(SECTION);
+#undef ECase
+}
+
void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration(
IO &IO, WasmYAML::ValueType &Type) {
#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X);
diff --git a/contrib/llvm/lib/Option/Arg.cpp b/contrib/llvm/lib/Option/Arg.cpp
index e581fee8bf38..4ce40e3ab26c 100644
--- a/contrib/llvm/lib/Option/Arg.cpp
+++ b/contrib/llvm/lib/Option/Arg.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallString.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
diff --git a/contrib/llvm/lib/Option/ArgList.cpp b/contrib/llvm/lib/Option/ArgList.cpp
index cbccc1935d3c..8a7d59d24366 100644
--- a/contrib/llvm/lib/Option/ArgList.cpp
+++ b/contrib/llvm/lib/Option/ArgList.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
diff --git a/contrib/llvm/lib/Option/OptTable.cpp b/contrib/llvm/lib/Option/OptTable.cpp
index c1bb05e817f0..022b9d5d933e 100644
--- a/contrib/llvm/lib/Option/OptTable.cpp
+++ b/contrib/llvm/lib/Option/OptTable.cpp
@@ -219,7 +219,7 @@ OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const {
std::vector<std::string> Result;
for (StringRef Val : Candidates)
- if (Val.startswith(Arg))
+ if (Val.startswith(Arg) && Arg.compare(Val))
Result.push_back(Val);
return Result;
}
@@ -240,13 +240,76 @@ OptTable::findByPrefix(StringRef Cur, unsigned short DisableFlags) const {
std::string S = std::string(In.Prefixes[I]) + std::string(In.Name) + "\t";
if (In.HelpText)
S += In.HelpText;
- if (StringRef(S).startswith(Cur))
+ if (StringRef(S).startswith(Cur) && S.compare(std::string(Cur) + "\t"))
Ret.push_back(S);
}
}
return Ret;
}
+unsigned OptTable::findNearest(StringRef Option, std::string &NearestString,
+ unsigned FlagsToInclude, unsigned FlagsToExclude,
+ unsigned MinimumLength) const {
+ assert(!Option.empty());
+
+ // Consider each option as a candidate, finding the closest match.
+ unsigned BestDistance = UINT_MAX;
+ for (const Info &CandidateInfo :
+ ArrayRef<Info>(OptionInfos).drop_front(FirstSearchableIndex)) {
+ StringRef CandidateName = CandidateInfo.Name;
+
+ // Ignore option candidates with empty names, such as "--", or names
+ // that do not meet the minimum length.
+ if (CandidateName.empty() || CandidateName.size() < MinimumLength)
+ continue;
+
+ // If FlagsToInclude were specified, ignore options that don't include
+ // those flags.
+ if (FlagsToInclude && !(CandidateInfo.Flags & FlagsToInclude))
+ continue;
+ // Ignore options that contain the FlagsToExclude.
+ if (CandidateInfo.Flags & FlagsToExclude)
+ continue;
+
+ // Ignore positional argument option candidates (which do not
+ // have prefixes).
+ if (!CandidateInfo.Prefixes)
+ continue;
+ // Find the most appropriate prefix. For example, if a user asks for
+ // "--helm", suggest "--help" over "-help".
+ StringRef Prefix = CandidateInfo.Prefixes[0];
+ for (int P = 1; CandidateInfo.Prefixes[P]; P++) {
+ if (Option.startswith(CandidateInfo.Prefixes[P]))
+ Prefix = CandidateInfo.Prefixes[P];
+ }
+
+ // Check if the candidate ends with a character commonly used when
+ // delimiting an option from its value, such as '=' or ':'. If it does,
+ // attempt to split the given option based on that delimiter.
+ std::string Delimiter = "";
+ char Last = CandidateName.back();
+ if (Last == '=' || Last == ':')
+ Delimiter = std::string(1, Last);
+
+ StringRef LHS, RHS;
+ if (Delimiter.empty())
+ LHS = Option;
+ else
+ std::tie(LHS, RHS) = Option.split(Last);
+
+ std::string NormalizedName =
+ (LHS.drop_front(Prefix.size()) + Delimiter).str();
+ unsigned Distance =
+ CandidateName.edit_distance(NormalizedName, /*AllowReplacements=*/true,
+ /*MaxEditDistance=*/BestDistance);
+ if (Distance < BestDistance) {
+ BestDistance = Distance;
+ NearestString = (Prefix + CandidateName + RHS).str();
+ }
+ }
+ return BestDistance;
+}
+
bool OptTable::addValues(const char *Option, const char *Values) {
for (size_t I = FirstSearchableIndex, E = OptionInfos.size(); I < E; I++) {
Info &In = OptionInfos[I];
@@ -474,12 +537,9 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title,
// Render help text into a map of group-name to a list of (option, help)
// pairs.
- using helpmap_ty = std::map<std::string, std::vector<OptionInfo>>;
- helpmap_ty GroupedOptionHelp;
-
- for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
- unsigned Id = i + 1;
+ std::map<std::string, std::vector<OptionInfo>> GroupedOptionHelp;
+ for (unsigned Id = 1, e = getNumOptions() + 1; Id != e; ++Id) {
// FIXME: Split out option groups.
if (getOptionKind(Id) == Option::GroupClass)
continue;
@@ -506,11 +566,10 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title,
}
}
- for (helpmap_ty::iterator it = GroupedOptionHelp .begin(),
- ie = GroupedOptionHelp.end(); it != ie; ++it) {
- if (it != GroupedOptionHelp .begin())
+ for (auto& OptionGroup : GroupedOptionHelp) {
+ if (OptionGroup.first != GroupedOptionHelp.begin()->first)
OS << "\n";
- PrintHelpOptionList(OS, it->first, it->second);
+ PrintHelpOptionList(OS, OptionGroup.first, OptionGroup.second);
}
OS.flush();
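The new OptTable::findNearest above normalizes a misspelled option against each candidate's best-matching prefix and value delimiter before measuring edit distance, and returns that distance along with the closest rendered spelling. A hedged usage sketch for a driver emitting a "did you mean" hint; the distance threshold of 2 and the explicit filter and minimum-length arguments are illustrative assumptions:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Option/OptTable.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    // Illustrative helper: print a spelling suggestion for an unknown flag.
    void suggestFlag(const llvm::opt::OptTable &Opts, llvm::StringRef Unknown) {
      std::string Nearest;
      unsigned Dist = Opts.findNearest(Unknown, Nearest,
                                       /*FlagsToInclude=*/0,
                                       /*FlagsToExclude=*/0,
                                       /*MinimumLength=*/4);
      // Only suggest close matches; the threshold here is an assumption.
      if (Dist <= 2 && !Nearest.empty())
        llvm::errs() << "unknown argument '" << Unknown << "'; did you mean '"
                     << Nearest << "'?\n";
    }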
diff --git a/contrib/llvm/lib/Option/Option.cpp b/contrib/llvm/lib/Option/Option.cpp
index bf9f040bde52..f9d8a5e54043 100644
--- a/contrib/llvm/lib/Option/Option.cpp
+++ b/contrib/llvm/lib/Option/Option.cpp
@@ -9,6 +9,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
diff --git a/contrib/llvm/lib/Passes/PassBuilder.cpp b/contrib/llvm/lib/Passes/PassBuilder.cpp
index 21003c0be7e1..eb04dcc8b6ef 100644
--- a/contrib/llvm/lib/Passes/PassBuilder.cpp
+++ b/contrib/llvm/lib/Passes/PassBuilder.cpp
@@ -41,6 +41,7 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PhiValues.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/RegionInfo.h"
@@ -59,7 +60,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/Regex.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/GCOVProfiler.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/Transforms/IPO/CalledValuePropagation.h"
@@ -79,13 +81,15 @@
#include "llvm/Transforms/IPO/LowerTypeTests.h"
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
+#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
-#include "llvm/Transforms/PGOInstrumentation.h"
-#include "llvm/Transforms/SampleProfile.h"
+#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Scalar/ADCE.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
#include "llvm/Transforms/Scalar/BDCE.h"
@@ -101,6 +105,8 @@
#include "llvm/Transforms/Scalar/GuardWidening.h"
#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
@@ -116,6 +122,7 @@
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
@@ -144,12 +151,10 @@
#include "llvm/Transforms/Utils/LowerInvoke.h"
#include "llvm/Transforms/Utils/Mem2Reg.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
-#include "llvm/Transforms/Utils/SimplifyInstructions.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-
using namespace llvm;
static cl::opt<unsigned> MaxDevirtIterations("pm-max-devirt-iterations",
@@ -176,6 +181,15 @@ static cl::opt<bool> EnableGVNSink(
"enable-npm-gvn-sink", cl::init(false), cl::Hidden,
cl::desc("Enable the GVN sinking pass for the new PM (default = off)"));
+static cl::opt<bool> EnableUnrollAndJam(
+ "enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
+ cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));
+
+static cl::opt<bool> EnableSyntheticCounts(
+ "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Run synthetic function entry count generation "
+ "pass"));
+
static Regex DefaultAliasRegex(
"^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
@@ -196,7 +210,7 @@ static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) {
namespace {
-/// \brief No-op module pass which does nothing.
+/// No-op module pass which does nothing.
struct NoOpModulePass {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &) {
return PreservedAnalyses::all();
@@ -204,7 +218,7 @@ struct NoOpModulePass {
static StringRef name() { return "NoOpModulePass"; }
};
-/// \brief No-op module analysis.
+/// No-op module analysis.
class NoOpModuleAnalysis : public AnalysisInfoMixin<NoOpModuleAnalysis> {
friend AnalysisInfoMixin<NoOpModuleAnalysis>;
static AnalysisKey Key;
@@ -215,7 +229,7 @@ public:
static StringRef name() { return "NoOpModuleAnalysis"; }
};
-/// \brief No-op CGSCC pass which does nothing.
+/// No-op CGSCC pass which does nothing.
struct NoOpCGSCCPass {
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &,
LazyCallGraph &, CGSCCUpdateResult &UR) {
@@ -224,7 +238,7 @@ struct NoOpCGSCCPass {
static StringRef name() { return "NoOpCGSCCPass"; }
};
-/// \brief No-op CGSCC analysis.
+/// No-op CGSCC analysis.
class NoOpCGSCCAnalysis : public AnalysisInfoMixin<NoOpCGSCCAnalysis> {
friend AnalysisInfoMixin<NoOpCGSCCAnalysis>;
static AnalysisKey Key;
@@ -237,7 +251,7 @@ public:
static StringRef name() { return "NoOpCGSCCAnalysis"; }
};
-/// \brief No-op function pass which does nothing.
+/// No-op function pass which does nothing.
struct NoOpFunctionPass {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
return PreservedAnalyses::all();
@@ -245,7 +259,7 @@ struct NoOpFunctionPass {
static StringRef name() { return "NoOpFunctionPass"; }
};
-/// \brief No-op function analysis.
+/// No-op function analysis.
class NoOpFunctionAnalysis : public AnalysisInfoMixin<NoOpFunctionAnalysis> {
friend AnalysisInfoMixin<NoOpFunctionAnalysis>;
static AnalysisKey Key;
@@ -256,7 +270,7 @@ public:
static StringRef name() { return "NoOpFunctionAnalysis"; }
};
-/// \brief No-op loop pass which does nothing.
+/// No-op loop pass which does nothing.
struct NoOpLoopPass {
PreservedAnalyses run(Loop &L, LoopAnalysisManager &,
LoopStandardAnalysisResults &, LPMUpdater &) {
@@ -265,7 +279,7 @@ struct NoOpLoopPass {
static StringRef name() { return "NoOpLoopPass"; }
};
-/// \brief No-op loop analysis.
+/// No-op loop analysis.
class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> {
friend AnalysisInfoMixin<NoOpLoopAnalysis>;
static AnalysisKey Key;
@@ -358,6 +372,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(JumpThreadingPass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(SimplifyCFGPass());
+ if (Level == O3)
+ FPM.addPass(AggressiveInstCombinePass());
FPM.addPass(InstCombinePass());
if (!isOptimizingForSize(Level))
@@ -381,13 +397,21 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// Add the primary loop simplification pipeline.
// FIXME: Currently this is split into two loop pass pipelines because we run
- // some function passes in between them. These can and should be replaced by
- // loop pass equivalenst but those aren't ready yet. Specifically,
- // `SimplifyCFGPass` and `InstCombinePass` are used. We have
- // `LoopSimplifyCFGPass` which isn't yet powerful enough, and the closest to
- // the other we have is `LoopInstSimplify`.
+ // some function passes in between them. These can and should be removed
+ // and/or replaced by scheduling the loop pass equivalents in the correct
+ // positions. But those equivalent passes aren't powerful enough yet.
+ // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
+ // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough to
+ // fully replace `SimplifyCFGPass`, and the closest to the other we have is
+ // `LoopInstSimplify`.
LoopPassManager LPM1(DebugLogging), LPM2(DebugLogging);
+ // Simplify the loop body. We do this initially to clean up after other loop
+ // passes run, either when iterating on a loop or on inner loops with
+ // implications on the outer loop.
+ LPM1.addPass(LoopInstSimplifyPass());
+ LPM1.addPass(LoopSimplifyCFGPass());
+
// Rotate Loop - disable header duplication at -Oz
LPM1.addPass(LoopRotatePass(Level != Oz));
LPM1.addPass(LICMPass());
@@ -581,7 +605,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
true));
}
- // Interprocedural constant propagation now that basic cleanup has occured
+ // Interprocedural constant propagation now that basic cleanup has occurred
// and prior to optimizing globals.
// FIXME: This position in the pipeline hasn't been carefully considered in
// years, it should be re-analyzed.
@@ -622,6 +646,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
MPM.addPass(PGOIndirectCallPromotion(false, false));
}
+ // Synthesize function entry counts for non-PGO compilation.
+ if (EnableSyntheticCounts && !PGOOpt)
+ MPM.addPass(SyntheticCountsPropagation());
+
// Require the GlobalsAA analysis for the module so we can query it within
// the CGSCC pipeline.
MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
@@ -776,6 +804,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
+ // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+ if (EnableUnrollAndJam) {
+ OptimizePM.addPass(
+ createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
+ }
OptimizePM.addPass(LoopUnrollPass(Level));
OptimizePM.addPass(InstCombinePass());
OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
@@ -792,7 +825,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
OptimizePM.addPass(LoopSinkPass());
// And finally clean up LCSSA form before generating code.
- OptimizePM.addPass(InstSimplifierPass());
+ OptimizePM.addPass(InstSimplifyPass());
// This hoists/decomposes div/rem ops. It should run after other sink/hoist
// passes to avoid re-sinking, but before SimplifyCFG because it can allow
@@ -811,6 +844,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
// Add the core optimizing pipeline.
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
+ MPM.addPass(CGProfilePass());
+
// Now we need to do some global optimization transforms.
// FIXME: It would seem like these should come first in the optimization
// pipeline and maybe be the bottom of the canonicalization pipeline? Weird
@@ -831,6 +866,10 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// Force any function attributes we want the rest of the pipeline to observe.
MPM.addPass(ForceFunctionAttrsPass());
+ // Apply module pipeline start EP callback.
+ for (auto &C : PipelineStartEPCallbacks)
+ C(MPM);
+
if (PGOOpt && PGOOpt->SamplePGOSupport)
MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
@@ -857,6 +896,10 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
if (PGOOpt && PGOOpt->SamplePGOSupport)
MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
+ // Apply module pipeline start EP callback.
+ for (auto &C : PipelineStartEPCallbacks)
+ C(MPM);
+
// If we are planning to perform ThinLTO later, we don't bloat the code with
// unrolling/vectorization/... now. Just simplify the module as much as we
// can.
@@ -879,15 +922,28 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
return MPM;
}
-ModulePassManager
-PassBuilder::buildThinLTODefaultPipeline(OptimizationLevel Level,
- bool DebugLogging) {
- // FIXME: The summary index is not hooked in the new pass manager yet.
- // When it's going to be hooked, enable WholeProgramDevirt and LowerTypeTest
- // here.
-
+ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
+ OptimizationLevel Level, bool DebugLogging,
+ const ModuleSummaryIndex *ImportSummary) {
ModulePassManager MPM(DebugLogging);
+ if (ImportSummary) {
+ // These passes import type identifier resolutions for whole-program
+ // devirtualization and CFI. They must run early because other passes may
+ // disturb the specific instruction patterns that these passes look for,
+ // creating dependencies on resolutions that may not appear in the summary.
+ //
+ // For example, GVN may transform the pattern assume(type.test) appearing in
+ // two basic blocks into assume(phi(type.test, type.test)), which would
+ // transform a dependency on a WPD resolution into a dependency on a type
+ // identifier resolution for CFI.
+ //
+ // Also, WPD has access to more precise information than ICP and can
+ // devirtualize more effectively, so it should operate on the IR first.
+ MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
+ MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
+ }
+
// Force any function attributes we want the rest of the pipeline to observe.
MPM.addPass(ForceFunctionAttrsPass());
@@ -918,8 +974,9 @@ PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level,
return buildPerModuleDefaultPipeline(Level, DebugLogging);
}
-ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
- bool DebugLogging) {
+ModulePassManager
+PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
+ ModuleSummaryIndex *ExportSummary) {
assert(Level != O0 && "Must request optimizations for the default pipeline!");
ModulePassManager MPM(DebugLogging);
@@ -969,11 +1026,15 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// Run whole program optimization of virtual call when the list of callees
// is fixed.
- MPM.addPass(WholeProgramDevirtPass());
+ MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
// Stop here at -O1.
- if (Level == 1)
+ if (Level == 1) {
+ // The LowerTypeTestsPass needs to run to lower type metadata and the
+ // type.test intrinsics. The pass does nothing if CFI is disabled.
+ MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
return MPM;
+ }
// Optimize globals to try and fold them into constants.
MPM.addPass(GlobalOptPass());
@@ -993,6 +1054,8 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// function pointers. When this happens, we often have to resolve varargs
// calls, etc, so let instcombine do this.
FunctionPassManager PeepholeFPM(DebugLogging);
+ if (Level == O3)
+ PeepholeFPM.addPass(AggressiveInstCombinePass());
PeepholeFPM.addPass(InstCombinePass());
invokePeepholeEPCallbacks(PeepholeFPM, Level);
@@ -1080,12 +1143,7 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
// to be run at link time if CFI is enabled. This pass does nothing if
// CFI is disabled.
- // Enable once we add support for the summary in the new PM.
-#if 0
- MPM.addPass(LowerTypeTestsPass(Summary ? PassSummaryAction::Export :
- PassSummaryAction::None,
- Summary));
-#endif
+ MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
// Add late LTO optimization passes.
// Delete basic blocks, which optimization passes may have killed.
@@ -1397,12 +1455,12 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
} else if (Matches[1] == "thinlto-pre-link") {
MPM.addPass(buildThinLTOPreLinkDefaultPipeline(L, DebugLogging));
} else if (Matches[1] == "thinlto") {
- MPM.addPass(buildThinLTODefaultPipeline(L, DebugLogging));
+ MPM.addPass(buildThinLTODefaultPipeline(L, DebugLogging, nullptr));
} else if (Matches[1] == "lto-pre-link") {
MPM.addPass(buildLTOPreLinkDefaultPipeline(L, DebugLogging));
} else {
assert(Matches[1] == "lto" && "Not one of the matched options!");
- MPM.addPass(buildLTODefaultPipeline(L, DebugLogging));
+ MPM.addPass(buildLTODefaultPipeline(L, DebugLogging, nullptr));
}
return true;
}
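Among the changes above, the per-module and ThinLTO pre-link pipelines now invoke PipelineStartEPCallbacks before anything else is scheduled. A hedged sketch of registering such a callback; running GlobalDCEPass at that point is purely illustrative:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/IPO/GlobalDCE.h"

    // Illustrative: schedule a module pass at the pipeline-start extension
    // point that the default pipelines above now honor.
    void registerEarlyGlobalDCE(llvm::PassBuilder &PB) {
      PB.registerPipelineStartEPCallback([](llvm::ModulePassManager &MPM) {
        MPM.addPass(llvm::GlobalDCEPass());
      });
    }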
diff --git a/contrib/llvm/lib/Passes/PassPlugin.cpp b/contrib/llvm/lib/Passes/PassPlugin.cpp
new file mode 100644
index 000000000000..bf38fdb842e7
--- /dev/null
+++ b/contrib/llvm/lib/Passes/PassPlugin.cpp
@@ -0,0 +1,52 @@
+//===- lib/Passes/PassPlugin.cpp - Load Plugins for New PM Passes ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cstdint>
+
+using namespace llvm;
+
+Expected<PassPlugin> PassPlugin::Load(const std::string &Filename) {
+ std::string Error;
+ auto Library =
+ sys::DynamicLibrary::getPermanentLibrary(Filename.c_str(), &Error);
+ if (!Library.isValid())
+ return make_error<StringError>(Twine("Could not load library '") +
+ Filename + "': " + Error,
+ inconvertibleErrorCode());
+
+ PassPlugin P{Filename, Library};
+ intptr_t getDetailsFn =
+ (intptr_t)Library.SearchForAddressOfSymbol("llvmGetPassPluginInfo");
+
+ if (!getDetailsFn)
+ // If the symbol isn't found, this is probably a legacy plugin, which is an
+ // error
+ return make_error<StringError>(Twine("Plugin entry point not found in '") +
+ Filename + "'. Is this a legacy plugin?",
+ inconvertibleErrorCode());
+
+ P.Info = reinterpret_cast<decltype(llvmGetPassPluginInfo) *>(getDetailsFn)();
+
+ if (P.Info.APIVersion != LLVM_PLUGIN_API_VERSION)
+ return make_error<StringError>(
+ Twine("Wrong API version on plugin '") + Filename + "'. Got version " +
+ Twine(P.Info.APIVersion) + ", supported version is " +
+ Twine(LLVM_PLUGIN_API_VERSION) + ".",
+ inconvertibleErrorCode());
+
+ if (!P.Info.RegisterPassBuilderCallbacks)
+ return make_error<StringError>(Twine("Empty entry callback in plugin '") +
+ Filename + "'.",
+ inconvertibleErrorCode());
+
+ return P;
+}
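PassPlugin::Load above resolves the C entry point llvmGetPassPluginInfo, checks its API version, and requires a non-null RegisterPassBuilderCallbacks hook. A hedged sketch of the counterpart a plugin library would export; the plugin name, version string, and empty callback body are illustrative:

    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Passes/PassPlugin.h"
    #include "llvm/Support/Compiler.h"

    // Illustrative plugin entry point; PassPlugin::Load looks this symbol up
    // and validates APIVersion against LLVM_PLUGIN_API_VERSION.
    extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo
    llvmGetPassPluginInfo() {
      return {LLVM_PLUGIN_API_VERSION, "ExamplePlugin", "v0.1",
              [](llvm::PassBuilder &PB) {
                // A real plugin would register parsing or extension-point
                // callbacks on PB here; left empty in this sketch.
              }};
    }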
diff --git a/contrib/llvm/lib/Passes/PassRegistry.def b/contrib/llvm/lib/Passes/PassRegistry.def
index 4d9045aedfce..6ae93a476968 100644
--- a/contrib/llvm/lib/Passes/PassRegistry.def
+++ b/contrib/llvm/lib/Passes/PassRegistry.def
@@ -40,6 +40,7 @@ MODULE_ALIAS_ANALYSIS("globals-aa", GlobalsAA())
#endif
MODULE_PASS("always-inline", AlwaysInlinerPass())
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
+MODULE_PASS("cg-profile", CGProfilePass())
MODULE_PASS("constmerge", ConstantMergePass())
MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass())
MODULE_PASS("deadargelim", DeadArgumentEliminationPass())
@@ -55,7 +56,7 @@ MODULE_PASS("instrprof", InstrProfiling())
MODULE_PASS("internalize", InternalizePass())
MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass())
MODULE_PASS("ipsccp", IPSCCPPass())
-MODULE_PASS("lowertypetests", LowerTypeTestsPass())
+MODULE_PASS("lowertypetests", LowerTypeTestsPass(nullptr, nullptr))
MODULE_PASS("name-anon-globals", NameAnonGlobalPass())
MODULE_PASS("no-op-module", NoOpModulePass())
MODULE_PASS("partial-inliner", PartialInlinerPass())
@@ -73,7 +74,8 @@ MODULE_PASS("rewrite-symbols", RewriteSymbolPass())
MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass())
MODULE_PASS("sample-profile", SampleProfileLoaderPass())
MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass())
-MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
+MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation())
+MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass(nullptr, nullptr))
MODULE_PASS("verify", VerifierPass())
#undef MODULE_PASS
@@ -110,6 +112,7 @@ FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis())
FUNCTION_ANALYSIS("da", DependenceAnalysis())
FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis())
FUNCTION_ANALYSIS("memoryssa", MemorySSAAnalysis())
+FUNCTION_ANALYSIS("phi-values", PhiValuesAnalysis())
FUNCTION_ANALYSIS("regions", RegionInfoAnalysis())
FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis())
FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis())
@@ -138,6 +141,7 @@ FUNCTION_ALIAS_ANALYSIS("type-based-aa", TypeBasedAA())
FUNCTION_PASS("aa-eval", AAEvaluator())
FUNCTION_PASS("adce", ADCEPass())
FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
+FUNCTION_PASS("aggressive-instcombine", AggressiveInstCombinePass())
FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
FUNCTION_PASS("bdce", BDCEPass())
FUNCTION_PASS("bounds-checking", BoundsCheckingPass())
@@ -156,7 +160,7 @@ FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/false)
FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/true))
FUNCTION_PASS("gvn-hoist", GVNHoistPass())
FUNCTION_PASS("instcombine", InstCombinePass())
-FUNCTION_PASS("instsimplify", InstSimplifierPass())
+FUNCTION_PASS("instsimplify", InstSimplifyPass())
FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
FUNCTION_PASS("float2int", Float2IntPass())
FUNCTION_PASS("no-op-function", NoOpFunctionPass())
@@ -192,6 +196,7 @@ FUNCTION_PASS("print<demanded-bits>", DemandedBitsPrinterPass(dbgs()))
FUNCTION_PASS("print<domfrontier>", DominanceFrontierPrinterPass(dbgs()))
FUNCTION_PASS("print<loops>", LoopPrinterPass(dbgs()))
FUNCTION_PASS("print<memoryssa>", MemorySSAPrinterPass(dbgs()))
+FUNCTION_PASS("print<phi-values>", PhiValuesPrinterPass(dbgs()))
FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(dbgs()))
FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(dbgs()))
FUNCTION_PASS("reassociate", ReassociatePass())
@@ -236,6 +241,8 @@ LOOP_PASS("loop-deletion", LoopDeletionPass())
LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass())
LOOP_PASS("strength-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
+LOOP_PASS("irce", IRCEPass())
+LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass())
LOOP_PASS("unroll-full", LoopFullUnrollPass())
LOOP_PASS("unswitch", SimpleLoopUnswitchPass())
LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
diff --git a/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 8dbd58632f0e..b3c2b182e76c 100644
--- a/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -83,7 +83,7 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) {
return Counter::getZero();
// Group the terms by counter ID.
- std::sort(Terms.begin(), Terms.end(), [](const Term &LHS, const Term &RHS) {
+ llvm::sort(Terms.begin(), Terms.end(), [](const Term &LHS, const Term &RHS) {
return LHS.CounterID < RHS.CounterID;
});
@@ -207,8 +207,10 @@ Error CoverageMapping::loadFunctionRecord(
else
OrigFuncName = getFuncNameWithoutPrefix(OrigFuncName, Record.Filenames[0]);
- // Don't load records for functions we've already seen.
- if (!FunctionNames.insert(OrigFuncName).second)
+ // Don't load records for (filenames, function) pairs we've already seen.
+ auto FilenamesHash = hash_combine_range(Record.Filenames.begin(),
+ Record.Filenames.end());
+ if (!RecordProvenance[FilenamesHash].insert(hash_value(OrigFuncName)).second)
return Error::success();
CounterMappingContext Ctx(Record.Expressions);
@@ -292,7 +294,7 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
namespace {
-/// \brief Distributes functions into instantiation sets.
+/// Distributes functions into instantiation sets.
///
/// An instantiation set is a collection of functions that have the same source
/// code, i.e., template function specializations.
@@ -344,7 +346,7 @@ class SegmentBuilder {
else
Segments.emplace_back(StartLoc.first, StartLoc.second, IsRegionEntry);
- DEBUG({
+ LLVM_DEBUG({
const auto &Last = Segments.back();
dbgs() << "Segment at " << Last.Line << ":" << Last.Col
<< " (count = " << Last.Count << ")"
@@ -457,8 +459,8 @@ class SegmentBuilder {
/// Sort a nested sequence of regions from a single file.
static void sortNestedRegions(MutableArrayRef<CountedRegion> Regions) {
- std::sort(Regions.begin(), Regions.end(), [](const CountedRegion &LHS,
- const CountedRegion &RHS) {
+ llvm::sort(Regions.begin(), Regions.end(), [](const CountedRegion &LHS,
+ const CountedRegion &RHS) {
if (LHS.startLoc() != RHS.startLoc())
return LHS.startLoc() < RHS.startLoc();
if (LHS.endLoc() != RHS.endLoc())
@@ -522,7 +524,7 @@ public:
sortNestedRegions(Regions);
ArrayRef<CountedRegion> CombinedRegions = combineRegions(Regions);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Combined regions:\n";
for (const auto &CR : CombinedRegions)
dbgs() << " " << CR.LineStart << ":" << CR.ColumnStart << " -> "
@@ -537,8 +539,8 @@ public:
const auto &L = Segments[I - 1];
const auto &R = Segments[I];
if (!(L.Line < R.Line) && !(L.Line == R.Line && L.Col < R.Col)) {
- DEBUG(dbgs() << " ! Segment " << L.Line << ":" << L.Col
- << " followed by " << R.Line << ":" << R.Col << "\n");
+ LLVM_DEBUG(dbgs() << " ! Segment " << L.Line << ":" << L.Col
+ << " followed by " << R.Line << ":" << R.Col << "\n");
assert(false && "Coverage segments not unique or sorted");
}
}
@@ -555,7 +557,7 @@ std::vector<StringRef> CoverageMapping::getUniqueSourceFiles() const {
for (const auto &Function : getCoveredFunctions())
Filenames.insert(Filenames.end(), Function.Filenames.begin(),
Function.Filenames.end());
- std::sort(Filenames.begin(), Filenames.end());
+ llvm::sort(Filenames.begin(), Filenames.end());
auto Last = std::unique(Filenames.begin(), Filenames.end());
Filenames.erase(Last, Filenames.end());
return Filenames;
@@ -611,7 +613,7 @@ CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const {
}
}
- DEBUG(dbgs() << "Emitting segments for file: " << Filename << "\n");
+ LLVM_DEBUG(dbgs() << "Emitting segments for file: " << Filename << "\n");
FileCoverage.Segments = SegmentBuilder::buildSegments(Regions);
return FileCoverage;
@@ -652,7 +654,8 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const {
FunctionCoverage.Expansions.emplace_back(CR, Function);
}
- DEBUG(dbgs() << "Emitting segments for function: " << Function.Name << "\n");
+ LLVM_DEBUG(dbgs() << "Emitting segments for function: " << Function.Name
+ << "\n");
FunctionCoverage.Segments = SegmentBuilder::buildSegments(Regions);
return FunctionCoverage;
@@ -670,8 +673,8 @@ CoverageData CoverageMapping::getCoverageForExpansion(
ExpansionCoverage.Expansions.emplace_back(CR, Expansion.Function);
}
- DEBUG(dbgs() << "Emitting segments for expansion of file " << Expansion.FileID
- << "\n");
+ LLVM_DEBUG(dbgs() << "Emitting segments for expansion of file "
+ << Expansion.FileID << "\n");
ExpansionCoverage.Segments = SegmentBuilder::buildSegments(Regions);
return ExpansionCoverage;
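In loadFunctionRecord above, deduplication is now keyed by the hash of the record's filename list combined with the hash of the original function name, so identically named functions coming from different file sets are no longer dropped. A hedged sketch of forming such a filename-set key with the same hashing utilities:

    #include "llvm/ADT/Hashing.h"
    #include "llvm/ADT/StringRef.h"
    #include <vector>

    // Illustrative: the provenance key is the combined hash of a record's
    // filename list; pairing it with hash_value(OrigFuncName) reproduces the
    // (filenames, function) dedup scheme used above.
    llvm::hash_code filenamesKey(const std::vector<llvm::StringRef> &Filenames) {
      return llvm::hash_combine_range(Filenames.begin(), Filenames.end());
    }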
diff --git a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index 649cf507357e..ee48256bc2e5 100644
--- a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -147,7 +147,7 @@ Error RawCoverageMappingReader::readCounter(Counter &C) {
static const unsigned EncodingExpansionRegionBit = 1
<< Counter::EncodingTagBits;
-/// \brief Read the sub-array of regions for the given inferred file id.
+/// Read the sub-array of regions for the given inferred file id.
/// \param NumFileIDs the number of file ids that are defined for this
/// function.
Error RawCoverageMappingReader::readMappingRegionsSubArray(
@@ -228,7 +228,7 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
ColumnEnd = std::numeric_limits<unsigned>::max();
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Counter in file " << InferredFileID << " " << LineStart << ":"
<< ColumnStart << " -> " << (LineStart + NumLines) << ":"
<< ColumnEnd << ", ";
diff --git a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index 49e82e481059..bb3f4f854e04 100644
--- a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -35,7 +35,7 @@ void CoverageFilenamesSectionWriter::write(raw_ostream &OS) {
namespace {
-/// \brief Gather only the expressions that are used by the mapping
+/// Gather only the expressions that are used by the mapping
/// regions in this function.
class CounterExpressionsMinimizer {
ArrayRef<CounterExpression> Expressions;
@@ -74,7 +74,7 @@ public:
ArrayRef<CounterExpression> getExpressions() const { return UsedExpressions; }
- /// \brief Adjust the given counter to correctly transition from the old
+ /// Adjust the given counter to correctly transition from the old
/// expression ids to the new expression ids.
Counter adjust(Counter C) const {
if (C.isExpression())
@@ -85,7 +85,7 @@ public:
} // end anonymous namespace
-/// \brief Encode the counter.
+/// Encode the counter.
///
/// The encoding uses the following format:
/// Low 2 bits - Tag:
diff --git a/contrib/llvm/lib/ProfileData/GCOV.cpp b/contrib/llvm/lib/ProfileData/GCOV.cpp
index d6e44389f2be..c9155439ec46 100644
--- a/contrib/llvm/lib/ProfileData/GCOV.cpp
+++ b/contrib/llvm/lib/ProfileData/GCOV.cpp
@@ -14,6 +14,7 @@
#include "llvm/ProfileData/GCOV.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
@@ -592,7 +593,7 @@ void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename,
SmallVector<StringRef, 4> Filenames;
for (const auto &LI : LineInfo)
Filenames.push_back(LI.first());
- std::sort(Filenames.begin(), Filenames.end());
+ llvm::sort(Filenames.begin(), Filenames.end());
for (StringRef Filename : Filenames) {
auto AllLines = LineConsumer(Filename);
diff --git a/contrib/llvm/lib/ProfileData/InstrProf.cpp b/contrib/llvm/lib/ProfileData/InstrProf.cpp
index 8ab5df59f538..544a77ec20a5 100644
--- a/contrib/llvm/lib/ProfileData/InstrProf.cpp
+++ b/contrib/llvm/lib/ProfileData/InstrProf.cpp
@@ -355,11 +355,26 @@ Error InstrProfSymtab::create(Module &M, bool InLTO) {
}
}
}
-
+ Sorted = false;
finalizeSymtab();
return Error::success();
}
+uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) {
+ finalizeSymtab();
+ auto Result =
+ std::lower_bound(AddrToMD5Map.begin(), AddrToMD5Map.end(), Address,
+ [](const std::pair<uint64_t, uint64_t> &LHS,
+ uint64_t RHS) { return LHS.first < RHS; });
+ // Raw function pointer collected by value profiler may be from
+ // external functions that are not instrumented. They won't have
+ // mapping data to be used by the deserializer. Force the value to
+ // be 0 in this case.
+ if (Result != AddrToMD5Map.end() && Result->first == Address)
+ return (uint64_t)Result->second;
+ return 0;
+}
+
Error collectPGOFuncNameStrings(ArrayRef<std::string> NameStrs,
bool doCompression, std::string &Result) {
assert(!NameStrs.empty() && "No name data to emit");
@@ -461,7 +476,6 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
while (P < EndP && *P == 0)
P++;
}
- Symtab.finalizeSymtab();
return Error::success();
}
@@ -561,32 +575,19 @@ void InstrProfRecord::scale(uint64_t Weight,
// Map indirect call target name hash to name string.
uint64_t InstrProfRecord::remapValue(uint64_t Value, uint32_t ValueKind,
- ValueMapType *ValueMap) {
- if (!ValueMap)
+ InstrProfSymtab *SymTab) {
+ if (!SymTab)
return Value;
- switch (ValueKind) {
- case IPVK_IndirectCallTarget: {
- auto Result =
- std::lower_bound(ValueMap->begin(), ValueMap->end(), Value,
- [](const std::pair<uint64_t, uint64_t> &LHS,
- uint64_t RHS) { return LHS.first < RHS; });
- // Raw function pointer collected by value profiler may be from
- // external functions that are not instrumented. They won't have
- // mapping data to be used by the deserializer. Force the value to
- // be 0 in this case.
- if (Result != ValueMap->end() && Result->first == Value)
- Value = (uint64_t)Result->second;
- else
- Value = 0;
- break;
- }
- }
+
+ if (ValueKind == IPVK_IndirectCallTarget)
+ return SymTab->getFunctionHashFromAddress(Value);
+
return Value;
}
void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
InstrProfValueData *VData, uint32_t N,
- ValueMapType *ValueMap) {
+ InstrProfSymtab *ValueMap) {
for (uint32_t I = 0; I < N; I++) {
VData[I].Value = remapValue(VData[I].Value, ValueKind, ValueMap);
}
@@ -602,7 +603,7 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
#include "llvm/ProfileData/InstrProfData.inc"
/*!
- * \brief ValueProfRecordClosure Interface implementation for InstrProfRecord
+ * ValueProfRecordClosure Interface implementation for InstrProfRecord
* class. These C wrappers are used as adaptors so that C++ code can be
* invoked as callbacks.
*/
@@ -666,13 +667,13 @@ ValueProfData::serializeFrom(const InstrProfRecord &Record) {
}
void ValueProfRecord::deserializeTo(InstrProfRecord &Record,
- InstrProfRecord::ValueMapType *VMap) {
+ InstrProfSymtab *SymTab) {
Record.reserveSites(Kind, NumValueSites);
InstrProfValueData *ValueData = getValueProfRecordValueData(this);
for (uint64_t VSite = 0; VSite < NumValueSites; ++VSite) {
uint8_t ValueDataCount = this->SiteCountArray[VSite];
- Record.addValueData(Kind, VSite, ValueData, ValueDataCount, VMap);
+ Record.addValueData(Kind, VSite, ValueData, ValueDataCount, SymTab);
ValueData += ValueDataCount;
}
}
@@ -706,13 +707,13 @@ void ValueProfRecord::swapBytes(support::endianness Old,
}
void ValueProfData::deserializeTo(InstrProfRecord &Record,
- InstrProfRecord::ValueMapType *VMap) {
+ InstrProfSymtab *SymTab) {
if (NumValueKinds == 0)
return;
ValueProfRecord *VR = getFirstValueProfRecord(this);
for (uint32_t K = 0; K < NumValueKinds; K++) {
- VR->deserializeTo(Record, VMap);
+ VR->deserializeTo(Record, SymTab);
VR = getValueProfRecordNext(VR);
}
}
@@ -925,8 +926,7 @@ bool needsComdatForCounter(const Function &F, const Module &M) {
if (F.hasComdat())
return true;
- Triple TT(M.getTargetTriple());
- if (!TT.isOSBinFormatELF() && !TT.isOSBinFormatWasm())
+ if (!Triple(M.getTargetTriple()).supportsCOMDAT())
return false;
// See createPGOFuncNameVar for more details. To avoid link errors, profile
diff --git a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp
index 23c9a2676b9e..3b704158a5c5 100644
--- a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -129,7 +129,7 @@ bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) {
StringRef buffer = Buffer.getBufferStart();
return count == 0 ||
std::all_of(buffer.begin(), buffer.begin() + count,
- [](char c) { return ::isprint(c) || ::isspace(c); });
+ [](char c) { return isPrint(c) || ::isspace(c); });
}
// Read the profile variant flag from the header: ":FE" means this is a FE
@@ -200,9 +200,13 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
std::pair<StringRef, StringRef> VD = Line->rsplit(':');
uint64_t TakenCount, Value;
if (ValueKind == IPVK_IndirectCallTarget) {
- if (Error E = Symtab->addFuncName(VD.first))
- return E;
- Value = IndexedInstrProf::ComputeHash(VD.first);
+ if (InstrProfSymtab::isExternalSymbol(VD.first)) {
+ Value = 0;
+ } else {
+ if (Error E = Symtab->addFuncName(VD.first))
+ return E;
+ Value = IndexedInstrProf::ComputeHash(VD.first);
+ }
} else {
READ_NUM(VD.first, Value);
}
@@ -227,14 +231,13 @@ Error TextInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) {
++Line;
// If we hit EOF while looking for a name, we're done.
if (Line.is_at_end()) {
- Symtab->finalizeSymtab();
return error(instrprof_error::eof);
}
// Read the function name.
Record.Name = *Line++;
if (Error E = Symtab->addFuncName(Record.Name))
- return E;
+ return error(std::move(E));
// Read the function hash.
if (Line.is_at_end())
@@ -265,11 +268,8 @@ Error TextInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) {
// Check if value profile data exists and read it if so.
if (Error E = readValueProfileData(Record))
- return E;
+ return error(std::move(E));
- // This is needed to avoid two pass parsing because llvm-profdata
- // does dumping while reading.
- Symtab->finalizeSymtab();
return success();
}
@@ -331,7 +331,6 @@ Error RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) {
continue;
Symtab.mapAddress(FPtr, I->NameRef);
}
- Symtab.finalizeSymtab();
return success();
}
@@ -439,7 +438,7 @@ Error RawInstrProfReader<IntPtrT>::readValueProfilingData(
// Note that besides deserialization, this also performs the conversion for
// indirect call targets. The function pointers from the raw profile are
// remapped into function name hashes.
- VDataPtrOrErr.get()->deserializeTo(Record, &Symtab->getAddrHashMap());
+ VDataPtrOrErr.get()->deserializeTo(Record, Symtab.get());
CurValueDataSize = VDataPtrOrErr.get()->getSize();
return success();
}
@@ -449,23 +448,23 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(NamedInstrProfRecord &Record)
if (atEnd())
// At this point, ValueDataStart field points to the next header.
if (Error E = readNextHeader(getNextHeaderPos()))
- return E;
+ return error(std::move(E));
// Read name and set it in Record.
if (Error E = readName(Record))
- return E;
+ return error(std::move(E));
// Read FuncHash and set it in Record.
if (Error E = readFuncHash(Record))
- return E;
+ return error(std::move(E));
// Read raw counts and set Record.
if (Error E = readRawCounts(Record))
- return E;
+ return error(std::move(E));
// Read value data and set Record.
if (Error E = readValueProfilingData(Record))
- return E;
+ return error(std::move(E));
// Iterate.
advanceData();
diff --git a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp
index ce3f8806e12e..18b9deec158f 100644
--- a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -48,9 +48,10 @@ namespace llvm {
// back patching.
class ProfOStream {
public:
- ProfOStream(raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
+ ProfOStream(raw_fd_ostream &FD)
+ : IsFDOStream(true), OS(FD), LE(FD, support::little) {}
ProfOStream(raw_string_ostream &STR)
- : IsFDOStream(false), OS(STR), LE(STR) {}
+ : IsFDOStream(false), OS(STR), LE(STR, support::little) {}
uint64_t tell() { return OS.tell(); }
void write(uint64_t V) { LE.write<uint64_t>(V); }
@@ -85,7 +86,7 @@ public:
// true. Otherwise, \c OS will be an raw_string_ostream.
bool IsFDOStream;
raw_ostream &OS;
- support::endian::Writer<support::little> LE;
+ support::endian::Writer LE;
};
class InstrProfRecordWriterTrait {
@@ -112,7 +113,7 @@ public:
EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
using namespace support;
- endian::Writer<little> LE(Out);
+ endian::Writer LE(Out, little);
offset_type N = K.size();
LE.write<offset_type>(N);
@@ -139,7 +140,7 @@ public:
void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, offset_type) {
using namespace support;
- endian::Writer<little> LE(Out);
+ endian::Writer LE(Out, little);
for (const auto &ProfileData : *V) {
const InstrProfRecord &ProfRecord = ProfileData.second;
SummaryBuilder->addRecord(ProfRecord);
@@ -361,7 +362,8 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash,
std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S);
for (uint32_t I = 0; I < ND; I++) {
if (VK == IPVK_IndirectCallTarget)
- OS << Symtab.getFuncName(VD[I].Value) << ":" << VD[I].Count << "\n";
+ OS << Symtab.getFuncNameOrExternalSymbol(VD[I].Value) << ":"
+ << VD[I].Count << "\n";
else
OS << VD[I].Value << ":" << VD[I].Count << "\n";
}
@@ -379,7 +381,6 @@ Error InstrProfWriter::writeText(raw_fd_ostream &OS) {
if (shouldEncodeData(I.getValue()))
if (Error E = Symtab.addFuncName(I.getKey()))
return E;
- Symtab.finalizeSymtab();
for (const auto &I : FunctionData)
if (shouldEncodeData(I.getValue()))
diff --git a/contrib/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/contrib/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
index 5fa1e2cf7d1e..62f00d693c68 100644
--- a/contrib/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
+++ b/contrib/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
@@ -27,8 +27,8 @@ using namespace llvm;
static const uint32_t DefaultCutoffsData[] = {
10000, /* 1% */
100000, /* 10% */
- 200000, 300000, 400000, 500000, 600000, 500000, 600000, 700000,
- 800000, 900000, 950000, 990000, 999000, 999900, 999990, 999999};
+ 200000, 300000, 400000, 500000, 600000, 700000, 800000,
+ 900000, 950000, 990000, 999000, 999900, 999990, 999999};
const ArrayRef<uint32_t> ProfileSummaryBuilder::DefaultCutoffs =
DefaultCutoffsData;
@@ -58,7 +58,7 @@ void SampleProfileSummaryBuilder::addRecord(
void ProfileSummaryBuilder::computeDetailedSummary() {
if (DetailedSummaryCutoffs.empty())
return;
- std::sort(DetailedSummaryCutoffs.begin(), DetailedSummaryCutoffs.end());
+ llvm::sort(DetailedSummaryCutoffs.begin(), DetailedSummaryCutoffs.end());
auto Iter = CountFrequencies.begin();
const auto End = CountFrequencies.end();
diff --git a/contrib/llvm/lib/ProfileData/SampleProf.cpp b/contrib/llvm/lib/ProfileData/SampleProf.cpp
index eafdd2154b7b..30438ba7962a 100644
--- a/contrib/llvm/lib/ProfileData/SampleProf.cpp
+++ b/contrib/llvm/lib/ProfileData/SampleProf.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -86,7 +88,7 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
LLVM_DUMP_METHOD void LineLocation::dump() const { print(dbgs()); }
#endif
-/// \brief Print the sample record to the stream \p OS indented by \p Indent.
+/// Print the sample record to the stream \p OS indented by \p Indent.
void SampleRecord::print(raw_ostream &OS, unsigned Indent) const {
OS << NumSamples;
if (hasCalls()) {
@@ -107,7 +109,7 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
-/// \brief Print the samples collected for a function on stream \p OS.
+/// Print the samples collected for a function on stream \p OS.
void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size()
<< " sampled lines\n";
@@ -150,6 +152,32 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
+unsigned FunctionSamples::getOffset(const DILocation *DIL) {
+ return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
+ 0xffff;
+}
+
+const FunctionSamples *
+FunctionSamples::findFunctionSamples(const DILocation *DIL) const {
+ assert(DIL);
+ SmallVector<std::pair<LineLocation, StringRef>, 10> S;
+
+ const DILocation *PrevDIL = DIL;
+ for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
+ S.push_back(std::make_pair(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()),
+ PrevDIL->getScope()->getSubprogram()->getLinkageName()));
+ PrevDIL = DIL;
+ }
+ if (S.size() == 0)
+ return this;
+ const FunctionSamples *FS = this;
+ for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) {
+ FS = FS->findFunctionSamplesAt(S[i].first, S[i].second);
+ }
+ return FS;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); }
#endif
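// A small illustrative helper mirroring FunctionSamples::getOffset above
// (the name is illustrative): sample line offsets are taken relative to the
// enclosing subprogram's start line and truncated to 16 bits, so a sample on
// source line 105 inside a function declared at line 100 is keyed by offset 5.
static unsigned sampleLineOffset(unsigned SampleLine, unsigned SubprogramLine) {
  return (SampleLine - SubprogramLine) & 0xffff;
}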
diff --git a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp
index 44547e3dffa0..79335e67cd98 100644
--- a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -43,7 +43,7 @@
using namespace llvm;
using namespace sampleprof;
-/// \brief Dump the function profile for \p FName.
+/// Dump the function profile for \p FName.
///
/// \param FName Name of the function to print.
/// \param OS Stream to emit the output to.
@@ -52,13 +52,13 @@ void SampleProfileReader::dumpFunctionProfile(StringRef FName,
OS << "Function: " << FName << ": " << Profiles[FName];
}
-/// \brief Dump all the function profiles found on stream \p OS.
+/// Dump all the function profiles found on stream \p OS.
void SampleProfileReader::dump(raw_ostream &OS) {
for (const auto &I : Profiles)
dumpFunctionProfile(I.getKey(), OS);
}
-/// \brief Parse \p Input as function head.
+/// Parse \p Input as function head.
///
/// Parse one line of \p Input, and update function name in \p FName,
/// function's total sample count in \p NumSamples, function's entry
@@ -79,10 +79,10 @@ static bool ParseHead(const StringRef &Input, StringRef &FName,
return true;
}
-/// \brief Returns true if line offset \p L is legal (only has 16 bits).
+/// Returns true if line offset \p L is legal (only has 16 bits).
static bool isOffsetLegal(unsigned L) { return (L & 0xffff) == L; }
-/// \brief Parse \p Input as line sample.
+/// Parse \p Input as line sample.
///
/// \param Input input line.
/// \param IsCallsite true if the line represents an inlined callsite.
@@ -127,19 +127,52 @@ static bool ParseLine(const StringRef &Input, bool &IsCallsite, uint32_t &Depth,
if (Rest.substr(0, n3).getAsInteger(10, NumSamples))
return false;
}
+ // Find call targets and their sample counts.
+ // Note: In some cases, there are symbols in the profile which are not
+ // mangled. To accommodate such cases, use colon + integer pairs as the
+ // anchor points.
+ // An example:
+ // _M_construct<char *>:1000 string_view<std::allocator<char> >:437
+ // ":1000" and ":437" are used as anchor points so the string above will
+ // be interpreted as
+ // target: _M_construct<char *>
+ // count: 1000
+ // target: string_view<std::allocator<char> >
+ // count: 437
while (n3 != StringRef::npos) {
n3 += Rest.substr(n3).find_first_not_of(' ');
Rest = Rest.substr(n3);
- n3 = Rest.find(' ');
- StringRef pair = Rest;
- if (n3 != StringRef::npos) {
- pair = Rest.substr(0, n3);
- }
- size_t n4 = pair.find(':');
- uint64_t count;
- if (pair.substr(n4 + 1).getAsInteger(10, count))
+ n3 = Rest.find_first_of(':');
+ if (n3 == StringRef::npos || n3 == 0)
return false;
- TargetCountMap[pair.substr(0, n4)] = count;
+
+ StringRef Target;
+ uint64_t count, n4;
+ while (true) {
+ // Get the segment after the current colon.
+ StringRef AfterColon = Rest.substr(n3 + 1);
+ // Get the target symbol before the current colon.
+ Target = Rest.substr(0, n3);
+ // Check if the word after the current colon is an integer.
+ n4 = AfterColon.find_first_of(' ');
+ n4 = (n4 != StringRef::npos) ? n3 + n4 + 1 : Rest.size();
+ StringRef WordAfterColon = Rest.substr(n3 + 1, n4 - n3 - 1);
+ if (!WordAfterColon.getAsInteger(10, count))
+ break;
+
+ // Try to find the next colon.
+ uint64_t n5 = AfterColon.find_first_of(':');
+ if (n5 == StringRef::npos)
+ return false;
+ n3 += n5 + 1;
+ }
+
+ // An anchor point is found. Save the {target, count} pair
+ TargetCountMap[Target] = count;
+ if (n4 == Rest.size())
+ break;
+ // Change n3 to the next blank space after colon + integer pair.
+ n3 = n4;
}
} else {
IsCallsite = true;
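// A standalone sketch of the anchor-point scheme described above, using plain
// std::string instead of StringRef (the helper name is illustrative): a
// "target:count" pair is recognized only when the word following a colon is a
// pure integer, so target names containing spaces still parse correctly.
#include <cctype>
#include <cstdint>
#include <map>
#include <string>

static std::map<std::string, uint64_t> parseTargetCounts(const std::string &S) {
  std::map<std::string, uint64_t> Result;
  size_t Pos = 0;
  while (Pos < S.size()) {
    while (Pos < S.size() && S[Pos] == ' ')
      ++Pos;                                  // skip blanks between pairs
    if (Pos >= S.size())
      break;
    size_t Start = Pos;
    size_t Colon = S.find(':', Pos);
    while (Colon != std::string::npos) {
      size_t WordEnd = S.find(' ', Colon + 1);
      if (WordEnd == std::string::npos)
        WordEnd = S.size();
      std::string Word = S.substr(Colon + 1, WordEnd - Colon - 1);
      bool AllDigits = !Word.empty();
      for (char C : Word)
        AllDigits = AllDigits && std::isdigit(static_cast<unsigned char>(C));
      if (AllDigits) {                        // found the anchor point
        Result[S.substr(Start, Colon - Start)] = std::stoull(Word);
        Pos = WordEnd;
        break;
      }
      Colon = S.find(':', Colon + 1);         // colon inside the target name
    }
    if (Colon == std::string::npos)
      break;                                  // malformed trailing text
  }
  return Result;
}
// parseTargetCounts("_M_construct<char *>:1000 string_view<std::allocator<char> >:437")
// yields {"_M_construct<char *>" -> 1000, "string_view<std::allocator<char> >" -> 437}.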
@@ -151,7 +184,7 @@ static bool ParseLine(const StringRef &Input, bool &IsCallsite, uint32_t &Depth,
return true;
}
-/// \brief Load samples from a text file.
+/// Load samples from a text file.
///
/// See the documentation at the top of the file for an explanation of
/// the expected format.
@@ -286,16 +319,33 @@ ErrorOr<StringRef> SampleProfileReaderBinary::readString() {
return Str;
}
-ErrorOr<StringRef> SampleProfileReaderBinary::readStringFromTable() {
+template <typename T>
+inline ErrorOr<uint32_t> SampleProfileReaderBinary::readStringIndex(T &Table) {
std::error_code EC;
auto Idx = readNumber<uint32_t>();
if (std::error_code EC = Idx.getError())
return EC;
- if (*Idx >= NameTable.size())
+ if (*Idx >= Table.size())
return sampleprof_error::truncated_name_table;
+ return *Idx;
+}
+
+ErrorOr<StringRef> SampleProfileReaderRawBinary::readStringFromTable() {
+ auto Idx = readStringIndex(NameTable);
+ if (std::error_code EC = Idx.getError())
+ return EC;
+
return NameTable[*Idx];
}
+ErrorOr<StringRef> SampleProfileReaderCompactBinary::readStringFromTable() {
+ auto Idx = readStringIndex(NameTable);
+ if (std::error_code EC = Idx.getError())
+ return EC;
+
+ return StringRef(NameTable[*Idx]);
+}
+
std::error_code
SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) {
auto NumSamples = readNumber<uint64_t>();
@@ -396,6 +446,48 @@ std::error_code SampleProfileReaderBinary::read() {
return sampleprof_error::success;
}
+std::error_code SampleProfileReaderRawBinary::verifySPMagic(uint64_t Magic) {
+ if (Magic == SPMagic())
+ return sampleprof_error::success;
+ return sampleprof_error::bad_magic;
+}
+
+std::error_code
+SampleProfileReaderCompactBinary::verifySPMagic(uint64_t Magic) {
+ if (Magic == SPMagic(SPF_Compact_Binary))
+ return sampleprof_error::success;
+ return sampleprof_error::bad_magic;
+}
+
+std::error_code SampleProfileReaderRawBinary::readNameTable() {
+ auto Size = readNumber<uint32_t>();
+ if (std::error_code EC = Size.getError())
+ return EC;
+ NameTable.reserve(*Size);
+ for (uint32_t I = 0; I < *Size; ++I) {
+ auto Name(readString());
+ if (std::error_code EC = Name.getError())
+ return EC;
+ NameTable.push_back(*Name);
+ }
+
+ return sampleprof_error::success;
+}
+
+std::error_code SampleProfileReaderCompactBinary::readNameTable() {
+ auto Size = readNumber<uint64_t>();
+ if (std::error_code EC = Size.getError())
+ return EC;
+ NameTable.reserve(*Size);
+ for (uint32_t I = 0; I < *Size; ++I) {
+ auto FID = readNumber<uint64_t>();
+ if (std::error_code EC = FID.getError())
+ return EC;
+ NameTable.push_back(std::to_string(*FID));
+ }
+ return sampleprof_error::success;
+}
+
std::error_code SampleProfileReaderBinary::readHeader() {
Data = reinterpret_cast<const uint8_t *>(Buffer->getBufferStart());
End = Data + Buffer->getBufferSize();
@@ -404,8 +496,8 @@ std::error_code SampleProfileReaderBinary::readHeader() {
auto Magic = readNumber<uint64_t>();
if (std::error_code EC = Magic.getError())
return EC;
- else if (*Magic != SPMagic())
- return sampleprof_error::bad_magic;
+ else if (std::error_code EC = verifySPMagic(*Magic))
+ return EC;
// Read the version number.
auto Version = readNumber<uint64_t>();
@@ -417,18 +509,8 @@ std::error_code SampleProfileReaderBinary::readHeader() {
if (std::error_code EC = readSummary())
return EC;
- // Read the name table.
- auto Size = readNumber<uint32_t>();
- if (std::error_code EC = Size.getError())
+ if (std::error_code EC = readNameTable())
return EC;
- NameTable.reserve(*Size);
- for (uint32_t I = 0; I < *Size; ++I) {
- auto Name(readString());
- if (std::error_code EC = Name.getError())
- return EC;
- NameTable.push_back(*Name);
- }
-
return sampleprof_error::success;
}
@@ -488,13 +570,20 @@ std::error_code SampleProfileReaderBinary::readSummary() {
return sampleprof_error::success;
}
-bool SampleProfileReaderBinary::hasFormat(const MemoryBuffer &Buffer) {
+bool SampleProfileReaderRawBinary::hasFormat(const MemoryBuffer &Buffer) {
const uint8_t *Data =
reinterpret_cast<const uint8_t *>(Buffer.getBufferStart());
uint64_t Magic = decodeULEB128(Data);
return Magic == SPMagic();
}
+bool SampleProfileReaderCompactBinary::hasFormat(const MemoryBuffer &Buffer) {
+ const uint8_t *Data =
+ reinterpret_cast<const uint8_t *>(Buffer.getBufferStart());
+ uint64_t Magic = decodeULEB128(Data);
+ return Magic == SPMagic(SPF_Compact_Binary);
+}
+
std::error_code SampleProfileReaderGCC::skipNextWord() {
uint32_t dummy;
if (!GcovBuffer.readInt(dummy))
@@ -717,7 +806,7 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
return sampleprof_error::success;
}
-/// \brief Read a GCC AutoFDO profile.
+/// Read a GCC AutoFDO profile.
///
/// This format is generated by the Linux Perf conversion tool at
/// https://github.com/google/autofdo.
@@ -738,7 +827,7 @@ bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) {
return Magic == "adcg*704";
}
-/// \brief Prepare a memory buffer for the contents of \p Filename.
+/// Prepare a memory buffer for the contents of \p Filename.
///
/// \returns an error code indicating the status of the buffer.
static ErrorOr<std::unique_ptr<MemoryBuffer>>
@@ -755,7 +844,7 @@ setupMemoryBuffer(const Twine &Filename) {
return std::move(Buffer);
}
-/// \brief Create a sample profile reader based on the format of the input file.
+/// Create a sample profile reader based on the format of the input file.
///
/// \param Filename The file to open.
///
@@ -770,7 +859,7 @@ SampleProfileReader::create(const Twine &Filename, LLVMContext &C) {
return create(BufferOrError.get(), C);
}
-/// \brief Create a sample profile reader based on the format of the input data.
+/// Create a sample profile reader based on the format of the input data.
///
/// \param B The memory buffer to create the reader from (assumes ownership).
///
@@ -780,8 +869,10 @@ SampleProfileReader::create(const Twine &Filename, LLVMContext &C) {
ErrorOr<std::unique_ptr<SampleProfileReader>>
SampleProfileReader::create(std::unique_ptr<MemoryBuffer> &B, LLVMContext &C) {
std::unique_ptr<SampleProfileReader> Reader;
- if (SampleProfileReaderBinary::hasFormat(*B))
- Reader.reset(new SampleProfileReaderBinary(std::move(B), C));
+ if (SampleProfileReaderRawBinary::hasFormat(*B))
+ Reader.reset(new SampleProfileReaderRawBinary(std::move(B), C));
+ else if (SampleProfileReaderCompactBinary::hasFormat(*B))
+ Reader.reset(new SampleProfileReaderCompactBinary(std::move(B), C));
else if (SampleProfileReaderGCC::hasFormat(*B))
Reader.reset(new SampleProfileReaderGCC(std::move(B), C));
else if (SampleProfileReaderText::hasFormat(*B))
diff --git a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp
index 59c4885fcdbe..b4de30118b8b 100644
--- a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
@@ -63,7 +64,7 @@ SampleProfileWriter::write(const StringMap<FunctionSamples> &ProfileMap) {
return sampleprof_error::success;
}
-/// \brief Write samples to a text file.
+/// Write samples to a text file.
///
/// Note: it may be tempting to implement this in terms of
/// FunctionSamples::print(). Please don't. The dump functionality is intended
@@ -144,13 +145,61 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
}
}
-std::error_code SampleProfileWriterBinary::writeHeader(
- const StringMap<FunctionSamples> &ProfileMap) {
+void SampleProfileWriterBinary::stablizeNameTable(std::set<StringRef> &V) {
+ // Sort the names to make NameTable deterministic.
+ for (const auto &I : NameTable)
+ V.insert(I.first);
+ int i = 0;
+ for (const StringRef &N : V)
+ NameTable[N] = i++;
+}
+
+std::error_code SampleProfileWriterRawBinary::writeNameTable() {
+ auto &OS = *OutputStream;
+ std::set<StringRef> V;
+ stablizeNameTable(V);
+
+ // Write out the name table.
+ encodeULEB128(NameTable.size(), OS);
+ for (auto N : V) {
+ OS << N;
+ encodeULEB128(0, OS);
+ }
+ return sampleprof_error::success;
+}
+
+std::error_code SampleProfileWriterCompactBinary::writeNameTable() {
auto &OS = *OutputStream;
+ std::set<StringRef> V;
+ stablizeNameTable(V);
+ // Write out the name table.
+ encodeULEB128(NameTable.size(), OS);
+ for (auto N : V) {
+ encodeULEB128(MD5Hash(N), OS);
+ }
+ return sampleprof_error::success;
+}
+
+std::error_code SampleProfileWriterRawBinary::writeMagicIdent() {
+ auto &OS = *OutputStream;
// Write file magic identifier.
encodeULEB128(SPMagic(), OS);
encodeULEB128(SPVersion(), OS);
+ return sampleprof_error::success;
+}
+
+std::error_code SampleProfileWriterCompactBinary::writeMagicIdent() {
+ auto &OS = *OutputStream;
+ // Write file magic identifier.
+ encodeULEB128(SPMagic(SPF_Compact_Binary), OS);
+ encodeULEB128(SPVersion(), OS);
+ return sampleprof_error::success;
+}
+
+std::error_code SampleProfileWriterBinary::writeHeader(
+ const StringMap<FunctionSamples> &ProfileMap) {
+ writeMagicIdent();
computeSummary(ProfileMap);
if (auto EC = writeSummary())
@@ -162,20 +211,7 @@ std::error_code SampleProfileWriterBinary::writeHeader(
addNames(I.second);
}
- // Sort the names to make NameTable is deterministic.
- std::set<StringRef> V;
- for (const auto &I : NameTable)
- V.insert(I.first);
- int i = 0;
- for (const StringRef &N : V)
- NameTable[N] = i++;
-
- // Write out the name table.
- encodeULEB128(NameTable.size(), OS);
- for (auto N : V) {
- OS << N;
- encodeULEB128(0, OS);
- }
+ writeNameTable();
return sampleprof_error::success;
}
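// A brief sketch of the compact name-table encoding added above: instead of
// writing each function name as a string, the compact writer emits the 64-bit
// MD5 hash of the name, and the compact reader keys profiles by the decimal
// form of that hash (the helper name below is illustrative).
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"
#include <string>

static std::string compactNameKey(llvm::StringRef FuncName) {
  return std::to_string(llvm::MD5Hash(FuncName));
}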
@@ -239,7 +275,7 @@ std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) {
return sampleprof_error::success;
}
-/// \brief Write samples of a top-level function to a binary file.
+/// Write samples of a top-level function to a binary file.
///
/// \returns true if the samples were written successfully, false otherwise.
std::error_code SampleProfileWriterBinary::write(const FunctionSamples &S) {
@@ -247,7 +283,7 @@ std::error_code SampleProfileWriterBinary::write(const FunctionSamples &S) {
return writeBody(S);
}
-/// \brief Create a sample profile file writer based on the specified format.
+/// Create a sample profile file writer based on the specified format.
///
/// \param Filename The file to create.
///
@@ -258,7 +294,7 @@ ErrorOr<std::unique_ptr<SampleProfileWriter>>
SampleProfileWriter::create(StringRef Filename, SampleProfileFormat Format) {
std::error_code EC;
std::unique_ptr<raw_ostream> OS;
- if (Format == SPF_Binary)
+ if (Format == SPF_Binary || Format == SPF_Compact_Binary)
OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_None));
else
OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_Text));
@@ -268,7 +304,7 @@ SampleProfileWriter::create(StringRef Filename, SampleProfileFormat Format) {
return create(OS, Format);
}
-/// \brief Create a sample profile stream writer based on the specified format.
+/// Create a sample profile stream writer based on the specified format.
///
/// \param OS The output stream to store the profile data to.
///
@@ -282,7 +318,9 @@ SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
std::unique_ptr<SampleProfileWriter> Writer;
if (Format == SPF_Binary)
- Writer.reset(new SampleProfileWriterBinary(OS));
+ Writer.reset(new SampleProfileWriterRawBinary(OS));
+ else if (Format == SPF_Compact_Binary)
+ Writer.reset(new SampleProfileWriterCompactBinary(OS));
else if (Format == SPF_Text)
Writer.reset(new SampleProfileWriterText(OS));
else if (Format == SPF_GCC)
diff --git a/contrib/llvm/lib/Support/AMDGPUMetadata.cpp b/contrib/llvm/lib/Support/AMDGPUMetadata.cpp
index ddb25935e0ef..a04bfc2ea299 100644
--- a/contrib/llvm/lib/Support/AMDGPUMetadata.cpp
+++ b/contrib/llvm/lib/Support/AMDGPUMetadata.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU metadata definitions and in-memory representations.
+/// AMDGPU metadata definitions and in-memory representations.
///
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Support/APFloat.cpp b/contrib/llvm/lib/Support/APFloat.cpp
index 3489feb93a02..24005c1890c9 100644
--- a/contrib/llvm/lib/Support/APFloat.cpp
+++ b/contrib/llvm/lib/Support/APFloat.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -3031,27 +3032,29 @@ double IEEEFloat::convertToDouble() const {
/// does not support these bit patterns:
/// exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity")
/// exponent = all 1's, integer bit 0, significand nonzero ("pseudoNaN")
-/// exponent = 0, integer bit 1 ("pseudodenormal")
/// exponent!=0 nor all 1's, integer bit 0 ("unnormal")
-/// At the moment, the first two are treated as NaNs, the second two as Normal.
+/// exponent = 0, integer bit 1 ("pseudodenormal")
+/// At the moment, the first three are treated as NaNs, the last one as Normal.
void IEEEFloat::initFromF80LongDoubleAPInt(const APInt &api) {
assert(api.getBitWidth()==80);
uint64_t i1 = api.getRawData()[0];
uint64_t i2 = api.getRawData()[1];
uint64_t myexponent = (i2 & 0x7fff);
uint64_t mysignificand = i1;
+ uint8_t myintegerbit = mysignificand >> 63;
initialize(&semX87DoubleExtended);
assert(partCount()==2);
sign = static_cast<unsigned int>(i2>>15);
- if (myexponent==0 && mysignificand==0) {
+ if (myexponent == 0 && mysignificand == 0) {
// exponent, significand meaningless
category = fcZero;
} else if (myexponent==0x7fff && mysignificand==0x8000000000000000ULL) {
// exponent, significand meaningless
category = fcInfinity;
- } else if (myexponent==0x7fff && mysignificand!=0x8000000000000000ULL) {
+ } else if ((myexponent == 0x7fff && mysignificand != 0x8000000000000000ULL) ||
+ (myexponent != 0x7fff && myexponent != 0 && myintegerbit == 0)) {
// exponent meaningless
category = fcNaN;
significandParts()[0] = mysignificand;
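// An illustrative restatement of the x87 80-bit classification above (enum and
// helper names are illustrative): the explicit integer bit makes several
// encodings invalid, and with this change unnormals are treated as NaN along
// with pseudo-infinities and pseudo-NaNs, while pseudodenormals stay normal.
#include <cstdint>

enum class F80Class { Zero, Infinity, NaN, Normal };

static F80Class classifyF80(uint16_t Exponent, uint64_t Significand) {
  const bool IntegerBit = (Significand >> 63) & 1;
  if (Exponent == 0 && Significand == 0)
    return F80Class::Zero;
  if (Exponent == 0x7fff && Significand == 0x8000000000000000ULL)
    return F80Class::Infinity;
  if ((Exponent == 0x7fff && Significand != 0x8000000000000000ULL) ||
      (Exponent != 0x7fff && Exponent != 0 && !IntegerBit))
    return F80Class::NaN;    // pseudo-infinity, pseudo-NaN, NaN, unnormal
  return F80Class::Normal;   // normals, denormals, pseudodenormals
}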
@@ -4440,8 +4443,10 @@ APFloat::APFloat(const fltSemantics &Semantics, StringRef S)
APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics,
roundingMode RM, bool *losesInfo) {
- if (&getSemantics() == &ToSemantics)
+ if (&getSemantics() == &ToSemantics) {
+ *losesInfo = false;
return opOK;
+ }
if (usesLayout<IEEEFloat>(getSemantics()) &&
usesLayout<IEEEFloat>(ToSemantics))
return U.IEEE.convert(ToSemantics, RM, losesInfo);
diff --git a/contrib/llvm/lib/Support/APInt.cpp b/contrib/llvm/lib/Support/APInt.cpp
index 1ea6319acfad..1fae0e9b8d6d 100644
--- a/contrib/llvm/lib/Support/APInt.cpp
+++ b/contrib/llvm/lib/Support/APInt.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -33,8 +34,7 @@ using namespace llvm;
/// A utility function for allocating memory, checking for allocation failures,
/// and ensuring the contents are zeroed.
inline static uint64_t* getClearedMemory(unsigned numWords) {
- uint64_t * result = new uint64_t[numWords];
- assert(result && "APInt memory allocation fails!");
+ uint64_t *result = new uint64_t[numWords];
memset(result, 0, numWords * sizeof(uint64_t));
return result;
}
@@ -42,9 +42,7 @@ inline static uint64_t* getClearedMemory(unsigned numWords) {
/// A utility function for allocating memory and checking for allocation
/// failure. The content is not zeroed.
inline static uint64_t* getMemory(unsigned numWords) {
- uint64_t * result = new uint64_t[numWords];
- assert(result && "APInt memory allocation fails!");
- return result;
+ return new uint64_t[numWords];
}
/// A utility function that converts a character to a digit.
@@ -170,7 +168,7 @@ void APInt::Profile(FoldingSetNodeID& ID) const {
ID.AddInteger(U.pVal[i]);
}
-/// @brief Prefix increment operator. Increments the APInt by one.
+/// Prefix increment operator. Increments the APInt by one.
APInt& APInt::operator++() {
if (isSingleWord())
++U.VAL;
@@ -179,7 +177,7 @@ APInt& APInt::operator++() {
return clearUnusedBits();
}
-/// @brief Prefix decrement operator. Decrements the APInt by one.
+/// Prefix decrement operator. Decrements the APInt by one.
APInt& APInt::operator--() {
if (isSingleWord())
--U.VAL;
@@ -190,7 +188,7 @@ APInt& APInt::operator--() {
/// Adds the RHS APint to this APInt.
/// @returns this, after addition of RHS.
-/// @brief Addition assignment operator.
+/// Addition assignment operator.
APInt& APInt::operator+=(const APInt& RHS) {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord())
@@ -210,7 +208,7 @@ APInt& APInt::operator+=(uint64_t RHS) {
/// Subtracts the RHS APInt from this APInt
/// @returns this, after subtraction
-/// @brief Subtraction assignment operator.
+/// Subtraction assignment operator.
APInt& APInt::operator-=(const APInt& RHS) {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord())
@@ -328,7 +326,7 @@ void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) {
U.pVal[word] = WORD_MAX;
}
-/// @brief Toggle every bit to its opposite value.
+/// Toggle every bit to its opposite value.
void APInt::flipAllBitsSlowCase() {
tcComplement(U.pVal, getNumWords());
clearUnusedBits();
@@ -336,7 +334,7 @@ void APInt::flipAllBitsSlowCase() {
/// Toggle a given bit to its opposite value whose position is given
/// as "bitPosition".
-/// @brief Toggles a given bit to its opposite value.
+/// Toggles a given bit to its opposite value.
void APInt::flipBit(unsigned bitPosition) {
assert(bitPosition < BitWidth && "Out of the bit-width range!");
if ((*this)[bitPosition]) clearBit(bitPosition);
@@ -428,11 +426,12 @@ APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const {
unsigned NumSrcWords = getNumWords();
unsigned NumDstWords = Result.getNumWords();
+ uint64_t *DestPtr = Result.isSingleWord() ? &Result.U.VAL : Result.U.pVal;
for (unsigned word = 0; word < NumDstWords; ++word) {
uint64_t w0 = U.pVal[loWord + word];
uint64_t w1 =
(loWord + word + 1) < NumSrcWords ? U.pVal[loWord + word + 1] : 0;
- Result.U.pVal[word] = (w0 >> loBit) | (w1 << (APINT_BITS_PER_WORD - loBit));
+ DestPtr[word] = (w0 >> loBit) | (w1 << (APINT_BITS_PER_WORD - loBit));
}
return Result.clearUnusedBits();
@@ -909,13 +908,13 @@ APInt APInt::sextOrSelf(unsigned width) const {
}
/// Arithmetic right-shift this APInt by shiftAmt.
-/// @brief Arithmetic right-shift function.
+/// Arithmetic right-shift function.
void APInt::ashrInPlace(const APInt &shiftAmt) {
ashrInPlace((unsigned)shiftAmt.getLimitedValue(BitWidth));
}
/// Arithmetic right-shift this APInt by shiftAmt.
-/// @brief Arithmetic right-shift function.
+/// Arithmetic right-shift function.
void APInt::ashrSlowCase(unsigned ShiftAmt) {
// Don't bother performing a no-op shift.
if (!ShiftAmt)
@@ -924,7 +923,7 @@ void APInt::ashrSlowCase(unsigned ShiftAmt) {
// Save the original sign bit for later.
bool Negative = isNegative();
- // WordShift is the inter-part shift; BitShift is is intra-part shift.
+ // WordShift is the inter-part shift; BitShift is intra-part shift.
unsigned WordShift = ShiftAmt / APINT_BITS_PER_WORD;
unsigned BitShift = ShiftAmt % APINT_BITS_PER_WORD;
@@ -958,19 +957,19 @@ void APInt::ashrSlowCase(unsigned ShiftAmt) {
}
/// Logical right-shift this APInt by shiftAmt.
-/// @brief Logical right-shift function.
+/// Logical right-shift function.
void APInt::lshrInPlace(const APInt &shiftAmt) {
lshrInPlace((unsigned)shiftAmt.getLimitedValue(BitWidth));
}
/// Logical right-shift this APInt by shiftAmt.
-/// @brief Logical right-shift function.
+/// Logical right-shift function.
void APInt::lshrSlowCase(unsigned ShiftAmt) {
tcShiftRight(U.pVal, getNumWords(), ShiftAmt);
}
/// Left-shift this APInt by shiftAmt.
-/// @brief Left-shift function.
+/// Left-shift function.
APInt &APInt::operator<<=(const APInt &shiftAmt) {
// It's undefined behavior in C to shift by BitWidth or greater.
*this <<= (unsigned)shiftAmt.getLimitedValue(BitWidth);
@@ -1254,18 +1253,20 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
// The DEBUG macros here tend to be spam in the debug output if you're not
// debugging this code. Disable them unless KNUTH_DEBUG is defined.
-#pragma push_macro("DEBUG")
+#pragma push_macro("LLVM_DEBUG")
#ifndef KNUTH_DEBUG
-#undef DEBUG
-#define DEBUG(X) do {} while (false)
+#undef LLVM_DEBUG
+#define LLVM_DEBUG(X) \
+ do { \
+ } while (false)
#endif
- DEBUG(dbgs() << "KnuthDiv: m=" << m << " n=" << n << '\n');
- DEBUG(dbgs() << "KnuthDiv: original:");
- DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
- DEBUG(dbgs() << " by");
- DEBUG(for (int i = n; i >0; i--) dbgs() << " " << v[i-1]);
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: m=" << m << " n=" << n << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: original:");
+ LLVM_DEBUG(for (int i = m + n; i >= 0; i--) dbgs() << " " << u[i]);
+ LLVM_DEBUG(dbgs() << " by");
+ LLVM_DEBUG(for (int i = n; i > 0; i--) dbgs() << " " << v[i - 1]);
+ LLVM_DEBUG(dbgs() << '\n');
// D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of
// u and v by d. Note that we have taken Knuth's advice here to use a power
// of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of
@@ -1291,16 +1292,16 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
}
u[m+n] = u_carry;
- DEBUG(dbgs() << "KnuthDiv: normal:");
- DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
- DEBUG(dbgs() << " by");
- DEBUG(for (int i = n; i >0; i--) dbgs() << " " << v[i-1]);
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: normal:");
+ LLVM_DEBUG(for (int i = m + n; i >= 0; i--) dbgs() << " " << u[i]);
+ LLVM_DEBUG(dbgs() << " by");
+ LLVM_DEBUG(for (int i = n; i > 0; i--) dbgs() << " " << v[i - 1]);
+ LLVM_DEBUG(dbgs() << '\n');
// D2. [Initialize j.] Set j to m. This is the loop counter over the places.
int j = m;
do {
- DEBUG(dbgs() << "KnuthDiv: quotient digit #" << j << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: quotient digit #" << j << '\n');
// D3. [Calculate q'.].
// Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
// Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
@@ -1310,7 +1311,7 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
// value qp is one too large, and it eliminates all cases where qp is two
// too large.
uint64_t dividend = Make_64(u[j+n], u[j+n-1]);
- DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n');
uint64_t qp = dividend / v[n-1];
uint64_t rp = dividend % v[n-1];
if (qp == b || qp*v[n-2] > b*rp + u[j+n-2]) {
@@ -1319,7 +1320,7 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
if (rp < b && (qp == b || qp*v[n-2] > b*rp + u[j+n-2]))
qp--;
}
- DEBUG(dbgs() << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n');
// D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with
// (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation
@@ -1335,15 +1336,15 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p);
u[j+i] = Lo_32(subres);
borrow = Hi_32(p) - Hi_32(subres);
- DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j+i]
- << ", borrow = " << borrow << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j + i]
+ << ", borrow = " << borrow << '\n');
}
bool isNeg = u[j+n] < borrow;
u[j+n] -= Lo_32(borrow);
- DEBUG(dbgs() << "KnuthDiv: after subtraction:");
- DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: after subtraction:");
+ LLVM_DEBUG(for (int i = m + n; i >= 0; i--) dbgs() << " " << u[i]);
+ LLVM_DEBUG(dbgs() << '\n');
// D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
// negative, go to step D6; otherwise go on to step D7.
@@ -1364,16 +1365,16 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
}
u[j+n] += carry;
}
- DEBUG(dbgs() << "KnuthDiv: after correction:");
- DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
- DEBUG(dbgs() << "\nKnuthDiv: digit result = " << q[j] << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: after correction:");
+ LLVM_DEBUG(for (int i = m + n; i >= 0; i--) dbgs() << " " << u[i]);
+ LLVM_DEBUG(dbgs() << "\nKnuthDiv: digit result = " << q[j] << '\n');
- // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3.
+ // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3.
} while (--j >= 0);
- DEBUG(dbgs() << "KnuthDiv: quotient:");
- DEBUG(for (int i = m; i >=0; i--) dbgs() <<" " << q[i]);
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "KnuthDiv: quotient:");
+ LLVM_DEBUG(for (int i = m; i >= 0; i--) dbgs() << " " << q[i]);
+ LLVM_DEBUG(dbgs() << '\n');
// D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired
// remainder may be obtained by dividing u[...] by d. If r is non-null we
@@ -1384,23 +1385,23 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
// shift right here.
if (shift) {
uint32_t carry = 0;
- DEBUG(dbgs() << "KnuthDiv: remainder:");
+ LLVM_DEBUG(dbgs() << "KnuthDiv: remainder:");
for (int i = n-1; i >= 0; i--) {
r[i] = (u[i] >> shift) | carry;
carry = u[i] << (32 - shift);
- DEBUG(dbgs() << " " << r[i]);
+ LLVM_DEBUG(dbgs() << " " << r[i]);
}
} else {
for (int i = n-1; i >= 0; i--) {
r[i] = u[i];
- DEBUG(dbgs() << " " << r[i]);
+ LLVM_DEBUG(dbgs() << " " << r[i]);
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
-#pragma pop_macro("DEBUG")
+#pragma pop_macro("LLVM_DEBUG")
}
void APInt::divide(const WordType *LHS, unsigned lhsWords, const WordType *RHS,
@@ -1734,25 +1735,25 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
// Check the degenerate cases
if (lhsWords == 0) {
- Quotient = 0; // 0 / Y ===> 0
- Remainder = 0; // 0 % Y ===> 0
+ Quotient = APInt(BitWidth, 0); // 0 / Y ===> 0
+ Remainder = APInt(BitWidth, 0); // 0 % Y ===> 0
return;
}
if (rhsBits == 1) {
- Quotient = LHS; // X / 1 ===> X
- Remainder = 0; // X % 1 ===> 0
+ Quotient = LHS; // X / 1 ===> X
+ Remainder = APInt(BitWidth, 0); // X % 1 ===> 0
}
if (lhsWords < rhsWords || LHS.ult(RHS)) {
- Remainder = LHS; // X % Y ===> X, iff X < Y
- Quotient = 0; // X / Y ===> 0, iff X < Y
+ Remainder = LHS; // X % Y ===> X, iff X < Y
+ Quotient = APInt(BitWidth, 0); // X / Y ===> 0, iff X < Y
return;
}
if (LHS == RHS) {
- Quotient = 1; // X / X ===> 1
- Remainder = 0; // X % X ===> 0;
+ Quotient = APInt(BitWidth, 1); // X / X ===> 1
+ Remainder = APInt(BitWidth, 0); // X % X ===> 0;
return;
}
@@ -1800,25 +1801,26 @@ void APInt::udivrem(const APInt &LHS, uint64_t RHS, APInt &Quotient,
// Check the degenerate cases
if (lhsWords == 0) {
- Quotient = 0; // 0 / Y ===> 0
- Remainder = 0; // 0 % Y ===> 0
+ Quotient = APInt(BitWidth, 0); // 0 / Y ===> 0
+ Remainder = 0; // 0 % Y ===> 0
return;
}
if (RHS == 1) {
- Quotient = LHS; // X / 1 ===> X
- Remainder = 0; // X % 1 ===> 0
+ Quotient = LHS; // X / 1 ===> X
+ Remainder = 0; // X % 1 ===> 0
+ return;
}
if (LHS.ult(RHS)) {
- Remainder = LHS.getZExtValue(); // X % Y ===> X, iff X < Y
- Quotient = 0; // X / Y ===> 0, iff X < Y
+ Remainder = LHS.getZExtValue(); // X % Y ===> X, iff X < Y
+ Quotient = APInt(BitWidth, 0); // X / Y ===> 0, iff X < Y
return;
}
if (LHS == RHS) {
- Quotient = 1; // X / X ===> 1
- Remainder = 0; // X % X ===> 0;
+ Quotient = APInt(BitWidth, 1); // X / X ===> 1
+ Remainder = 0; // X % X ===> 0;
return;
}
@@ -2657,3 +2659,51 @@ void APInt::tcSetLeastSignificantBits(WordType *dst, unsigned parts,
while (i < parts)
dst[i++] = 0;
}
+
+APInt llvm::APIntOps::RoundingUDiv(const APInt &A, const APInt &B,
+ APInt::Rounding RM) {
+ // Currently udivrem always rounds down.
+ switch (RM) {
+ case APInt::Rounding::DOWN:
+ case APInt::Rounding::TOWARD_ZERO:
+ return A.udiv(B);
+ case APInt::Rounding::UP: {
+ APInt Quo, Rem;
+ APInt::udivrem(A, B, Quo, Rem);
+ if (Rem == 0)
+ return Quo;
+ return Quo + 1;
+ }
+ }
+ llvm_unreachable("Unknown APInt::Rounding enum");
+}
+
+APInt llvm::APIntOps::RoundingSDiv(const APInt &A, const APInt &B,
+ APInt::Rounding RM) {
+ switch (RM) {
+ case APInt::Rounding::DOWN:
+ case APInt::Rounding::UP: {
+ APInt Quo, Rem;
+ APInt::sdivrem(A, B, Quo, Rem);
+ if (Rem == 0)
+ return Quo;
+ // This algorithm deals with arbitrary rounding mode used by sdivrem.
+ // We want to check whether the non-integer part of the mathematical value
+ // is negative or not. If the non-integer part is negative, we need to round
+ // down from Quo; otherwise, if it's positive or 0, we return Quo, as it's
+ // already rounded down.
+ if (RM == APInt::Rounding::DOWN) {
+ if (Rem.isNegative() != B.isNegative())
+ return Quo - 1;
+ return Quo;
+ }
+ if (Rem.isNegative() != B.isNegative())
+ return Quo;
+ return Quo + 1;
+ }
+ // Currently sdiv rounds towards zero.
+ case APInt::Rounding::TOWARD_ZERO:
+ return A.sdiv(B);
+ }
+ llvm_unreachable("Unknown APInt::Rounding enum");
+}
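// A plain-integer illustration of the rounding rules implemented above (the
// helper is a sketch, not the APInt API): C++ integer division truncates
// toward zero, so DOWN and UP only need an adjustment when the remainder is
// nonzero. For example, -7/2 is -3 under TOWARD_ZERO and UP but -4 under DOWN,
// while 7/2 is 3 under TOWARD_ZERO and DOWN but 4 under UP.
#include <cstdint>

static int64_t roundingSDivSketch(int64_t A, int64_t B, bool RoundUp) {
  int64_t Quo = A / B; // truncates toward zero
  if (A % B == 0)
    return Quo;
  bool NegativeQuotient = (A < 0) != (B < 0);
  if (RoundUp)
    return NegativeQuotient ? Quo : Quo + 1;
  return NegativeQuotient ? Quo - 1 : Quo; // round toward negative infinity
}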
diff --git a/contrib/llvm/lib/Support/ARMAttributeParser.cpp b/contrib/llvm/lib/Support/ARMAttributeParser.cpp
index e39bddc4e8f2..1f98ac2f40ba 100644
--- a/contrib/llvm/lib/Support/ARMAttributeParser.cpp
+++ b/contrib/llvm/lib/Support/ARMAttributeParser.cpp
@@ -705,4 +705,3 @@ void ARMAttributeParser::Parse(ArrayRef<uint8_t> Section, bool isLittle) {
}
}
}
-
diff --git a/contrib/llvm/lib/Support/BinaryStreamRef.cpp b/contrib/llvm/lib/Support/BinaryStreamRef.cpp
index 60a03fe9930f..bdc0f54bf25a 100644
--- a/contrib/llvm/lib/Support/BinaryStreamRef.cpp
+++ b/contrib/llvm/lib/Support/BinaryStreamRef.cpp
@@ -127,5 +127,5 @@ WritableBinaryStreamRef::operator BinaryStreamRef() const {
return BinaryStreamRef(*BorrowedImpl, ViewOffset, Length);
}
-/// \brief For buffered streams, commits changes to the backing store.
+/// For buffered streams, commits changes to the backing store.
Error WritableBinaryStreamRef::commit() { return BorrowedImpl->commit(); }
diff --git a/contrib/llvm/lib/Support/BranchProbability.cpp b/contrib/llvm/lib/Support/BranchProbability.cpp
index 44ad110d456a..31dee9561f49 100644
--- a/contrib/llvm/lib/Support/BranchProbability.cpp
+++ b/contrib/llvm/lib/Support/BranchProbability.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/BranchProbability.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/Support/COM.cpp b/contrib/llvm/lib/Support/COM.cpp
index cf3a133fd9b4..2e3ff66843d3 100644
--- a/contrib/llvm/lib/Support/COM.cpp
+++ b/contrib/llvm/lib/Support/COM.cpp
@@ -13,11 +13,11 @@
#include "llvm/Support/COM.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/COM.inc"
-#elif LLVM_ON_WIN32
+#elif _WIN32
#include "Windows/COM.inc"
#endif
diff --git a/contrib/llvm/lib/Support/CachePruning.cpp b/contrib/llvm/lib/Support/CachePruning.cpp
index 141573c2a1c7..7326c4fc91fb 100644
--- a/contrib/llvm/lib/Support/CachePruning.cpp
+++ b/contrib/llvm/lib/Support/CachePruning.cpp
@@ -146,7 +146,7 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
if (Policy.Expiration == seconds(0) &&
Policy.MaxSizePercentageOfAvailableSpace == 0 &&
Policy.MaxSizeBytes == 0 && Policy.MaxSizeFiles == 0) {
- DEBUG(dbgs() << "No pruning settings set, exit early\n");
+ LLVM_DEBUG(dbgs() << "No pruning settings set, exit early\n");
// Nothing will be pruned, early exit
return false;
}
@@ -173,9 +173,9 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
const auto TimeStampModTime = FileStatus.getLastModificationTime();
auto TimeStampAge = CurrentTime - TimeStampModTime;
if (TimeStampAge <= *Policy.Interval) {
- DEBUG(dbgs() << "Timestamp file too recent ("
- << duration_cast<seconds>(TimeStampAge).count()
- << "s old), do not prune.\n");
+ LLVM_DEBUG(dbgs() << "Timestamp file too recent ("
+ << duration_cast<seconds>(TimeStampAge).count()
+ << "s old), do not prune.\n");
return false;
}
}
@@ -207,7 +207,7 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
// there.
ErrorOr<sys::fs::basic_file_status> StatusOrErr = File->status();
if (!StatusOrErr) {
- DEBUG(dbgs() << "Ignore " << File->path() << " (can't stat)\n");
+ LLVM_DEBUG(dbgs() << "Ignore " << File->path() << " (can't stat)\n");
continue;
}
@@ -215,8 +215,9 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
const auto FileAccessTime = StatusOrErr->getLastAccessedTime();
auto FileAge = CurrentTime - FileAccessTime;
if (Policy.Expiration != seconds(0) && FileAge > Policy.Expiration) {
- DEBUG(dbgs() << "Remove " << File->path() << " ("
- << duration_cast<seconds>(FileAge).count() << "s old)\n");
+ LLVM_DEBUG(dbgs() << "Remove " << File->path() << " ("
+ << duration_cast<seconds>(FileAge).count()
+ << "s old)\n");
sys::fs::remove(File->path());
continue;
}
@@ -235,9 +236,9 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
// Update size
TotalSize -= FileAndSize->first;
NumFiles--;
- DEBUG(dbgs() << " - Remove " << FileAndSize->second << " (size "
- << FileAndSize->first << "), new occupancy is " << TotalSize
- << "%\n");
+ LLVM_DEBUG(dbgs() << " - Remove " << FileAndSize->second << " (size "
+ << FileAndSize->first << "), new occupancy is "
+ << TotalSize << "%\n");
++FileAndSize;
};
@@ -263,9 +264,10 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
AvailableSpace * Policy.MaxSizePercentageOfAvailableSpace / 100ull,
Policy.MaxSizeBytes);
- DEBUG(dbgs() << "Occupancy: " << ((100 * TotalSize) / AvailableSpace)
- << "% target is: " << Policy.MaxSizePercentageOfAvailableSpace
- << "%, " << Policy.MaxSizeBytes << " bytes\n");
+ LLVM_DEBUG(dbgs() << "Occupancy: " << ((100 * TotalSize) / AvailableSpace)
+ << "% target is: "
+ << Policy.MaxSizePercentageOfAvailableSpace << "%, "
+ << Policy.MaxSizeBytes << " bytes\n");
// Remove the oldest accessed files first, till we get below the threshold.
while (TotalSize > TotalSizeTarget && FileAndSize != FileSizes.rend())
diff --git a/contrib/llvm/lib/Support/Chrono.cpp b/contrib/llvm/lib/Support/Chrono.cpp
index 84f5aab6fc45..a2626a89eb63 100644
--- a/contrib/llvm/lib/Support/Chrono.cpp
+++ b/contrib/llvm/lib/Support/Chrono.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Chrono.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,7 +32,7 @@ static inline struct tm getStructTM(TimePoint<> TP) {
assert(LT);
(void)LT;
#endif
-#if defined(LLVM_ON_WIN32)
+#if defined(_WIN32)
int Error = ::localtime_s(&Storage, &OurTime);
assert(!Error);
(void)Error;
diff --git a/contrib/llvm/lib/Support/CodeGenCoverage.cpp b/contrib/llvm/lib/Support/CodeGenCoverage.cpp
index ebfe65a398c3..f0a53db4e32a 100644
--- a/contrib/llvm/lib/Support/CodeGenCoverage.cpp
+++ b/contrib/llvm/lib/Support/CodeGenCoverage.cpp
@@ -12,7 +12,7 @@
#include "llvm/Support/CodeGenCoverage.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -22,7 +22,7 @@
#if LLVM_ON_UNIX
#include <unistd.h>
-#elif LLVM_ON_WIN32
+#elif _WIN32
#include <windows.h>
#endif
@@ -38,12 +38,17 @@ void CodeGenCoverage::setCovered(uint64_t RuleID) {
RuleCoverage[RuleID] = true;
}
-bool CodeGenCoverage::isCovered(uint64_t RuleID) {
+bool CodeGenCoverage::isCovered(uint64_t RuleID) const {
if (RuleCoverage.size() <= RuleID)
return false;
return RuleCoverage[RuleID];
}
+iterator_range<CodeGenCoverage::const_covered_iterator>
+CodeGenCoverage::covered() const {
+ return RuleCoverage.set_bits();
+}
+
bool CodeGenCoverage::parse(MemoryBuffer &Buffer, StringRef BackendName) {
const char *CurPtr = Buffer.getBufferStart();
@@ -88,7 +93,7 @@ bool CodeGenCoverage::emit(StringRef CoveragePrefix,
std::string Pid =
#if LLVM_ON_UNIX
llvm::to_string(::getpid());
-#elif LLVM_ON_WIN32
+#elif _WIN32
llvm::to_string(::GetCurrentProcessId());
#else
"";
diff --git a/contrib/llvm/lib/Support/CommandLine.cpp b/contrib/llvm/lib/Support/CommandLine.cpp
index d95b791972c8..a1e659a01c8e 100644
--- a/contrib/llvm/lib/Support/CommandLine.cpp
+++ b/contrib/llvm/lib/Support/CommandLine.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h"
#include "llvm/Support/ConvertUTF.h"
@@ -973,7 +974,7 @@ static bool ExpandResponseFile(StringRef FName, StringSaver &Saver,
return true;
}
-/// \brief Expand response files on a command line recursively using the given
+/// Expand response files on a command line recursively using the given
/// StringSaver and tokenization strategy.
bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
SmallVectorImpl<const char *> &Argv,
@@ -1080,7 +1081,10 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
SmallVector<const char *, 20> newArgv(argv, argv + argc);
BumpPtrAllocator A;
StringSaver Saver(A);
- ExpandResponseFiles(Saver, TokenizeGNUCommandLine, newArgv);
+ ExpandResponseFiles(Saver,
+ Triple(sys::getProcessTriple()).isOSWindows() ?
+ cl::TokenizeWindowsCommandLine : cl::TokenizeGNUCommandLine,
+ newArgv);
argv = &newArgv[0];
argc = static_cast<int>(newArgv.size());
@@ -1266,8 +1270,15 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// If this is a named positional argument, just remember that it is the
// active one...
- if (Handler->getFormattingFlag() == cl::Positional)
+ if (Handler->getFormattingFlag() == cl::Positional) {
+ if ((Handler->getMiscFlags() & PositionalEatsArgs) && !Value.empty()) {
+ Handler->error("This argument does not take a value.\n"
+ "\tInstead, it consumes any positional arguments until "
+ "the next recognized option.", *Errs);
+ ErrorParsing = true;
+ }
ActivePositionalArg = Handler;
+ }
else
ErrorParsing |= ProvideOption(Handler, ArgName, Value, argc, argv, i);
}
@@ -1371,9 +1382,9 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// Now that we know if -debug is specified, we can use it.
// Note that if ReadResponseFiles == true, this must be done before the
// memory allocated for the expanded command line is free()d below.
- DEBUG(dbgs() << "Args: ";
- for (int i = 0; i < argc; ++i) dbgs() << argv[i] << ' ';
- dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << "Args: ";
+ for (int i = 0; i < argc; ++i) dbgs() << argv[i] << ' ';
+ dbgs() << '\n';);
// Free all of the memory allocated to the map. Command line options may only
// be processed once!
@@ -1392,15 +1403,15 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// Option Base class implementation
//
-bool Option::error(const Twine &Message, StringRef ArgName) {
+bool Option::error(const Twine &Message, StringRef ArgName, raw_ostream &Errs) {
if (!ArgName.data())
ArgName = ArgStr;
if (ArgName.empty())
- errs() << HelpStr; // Be nice for positional arguments
+ Errs << HelpStr; // Be nice for positional arguments
else
- errs() << GlobalParser->ProgramName << ": for the -" << ArgName;
+ Errs << GlobalParser->ProgramName << ": for the -" << ArgName;
- errs() << " option: " << Message << "\n";
+ Errs << " option: " << Message << "\n";
return true;
}
@@ -1470,8 +1481,12 @@ void alias::printOptionInfo(size_t GlobalWidth) const {
size_t basic_parser_impl::getOptionWidth(const Option &O) const {
size_t Len = O.ArgStr.size();
auto ValName = getValueName();
- if (!ValName.empty())
- Len += getValueStr(O, ValName).size() + 3;
+ if (!ValName.empty()) {
+ size_t FormattingLen = 3;
+ if (O.getMiscFlags() & PositionalEatsArgs)
+ FormattingLen = 6;
+ Len += getValueStr(O, ValName).size() + FormattingLen;
+ }
return Len + 6;
}
@@ -1484,8 +1499,13 @@ void basic_parser_impl::printOptionInfo(const Option &O,
outs() << " -" << O.ArgStr;
auto ValName = getValueName();
- if (!ValName.empty())
- outs() << "=<" << getValueStr(O, ValName) << '>';
+ if (!ValName.empty()) {
+ if (O.getMiscFlags() & PositionalEatsArgs) {
+ outs() << " <" << getValueStr(O, ValName) << ">...";
+ } else {
+ outs() << "=<" << getValueStr(O, ValName) << '>';
+ }
+ }
Option::printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
}
diff --git a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
index bd38dd88201f..fd5d097d2b7e 100644
--- a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
+++ b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/CrashRecoveryContext.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
@@ -47,7 +47,7 @@ public:
CurrentContext->set(Next);
}
- /// \brief Called when the separate crash-recovery thread was finished, to
+ /// Called when the separate crash-recovery thread was finished, to
/// indicate that we don't need to clear the thread-local CurrentContext.
void setSwitchedThread() {
#if defined(LLVM_ENABLE_THREADS) && LLVM_ENABLE_THREADS != 0
@@ -189,7 +189,7 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
#else // !_MSC_VER
-#if defined(LLVM_ON_WIN32)
+#if defined(_WIN32)
// This is a non-MSVC compiler, probably mingw gcc or clang without
// -fms-extensions. Use vectored exception handling (VEH).
//
@@ -272,7 +272,7 @@ static void uninstallExceptionOrSignalHandlers() {
}
}
-#else // !LLVM_ON_WIN32
+#else // !_WIN32
// Generic POSIX implementation.
//
@@ -342,7 +342,7 @@ static void uninstallExceptionOrSignalHandlers() {
sigaction(Signals[i], &PrevActions[i], nullptr);
}
-#endif // !LLVM_ON_WIN32
+#endif // !_WIN32
bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
// If crash recovery is disabled, do nothing.
diff --git a/contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp
index f1a334bfc7be..b82aec1423f5 100644
--- a/contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp
+++ b/contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp
@@ -124,13 +124,13 @@ private:
/// ExecuteOneTest - Execute a single test predicate on the change set \p S.
bool ExecuteOneTest(const changeset_ty &S) {
// Check dependencies invariant.
- DEBUG({
- for (changeset_ty::const_iterator it = S.begin(),
- ie = S.end(); it != ie; ++it)
- for (succ_iterator_ty it2 = succ_begin(*it),
- ie2 = succ_end(*it); it2 != ie2; ++it2)
- assert(S.count(*it2) && "Attempt to run invalid changeset!");
- });
+ LLVM_DEBUG({
+ for (changeset_ty::const_iterator it = S.begin(), ie = S.end(); it != ie;
+ ++it)
+ for (succ_iterator_ty it2 = succ_begin(*it), ie2 = succ_end(*it);
+ it2 != ie2; ++it2)
+ assert(S.count(*it2) && "Attempt to run invalid changeset!");
+ });
return DDA.ExecuteOneTest(S);
}
@@ -224,60 +224,68 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl(
PredClosure[*it2].insert(*it);
// Dump useful debug info.
- DEBUG({
- llvm::errs() << "-- DAGDeltaAlgorithmImpl --\n";
- llvm::errs() << "Changes: [";
- for (changeset_ty::const_iterator it = Changes.begin(),
- ie = Changes.end(); it != ie; ++it) {
- if (it != Changes.begin()) llvm::errs() << ", ";
- llvm::errs() << *it;
-
- if (succ_begin(*it) != succ_end(*it)) {
- llvm::errs() << "(";
- for (succ_iterator_ty it2 = succ_begin(*it),
- ie2 = succ_end(*it); it2 != ie2; ++it2) {
- if (it2 != succ_begin(*it)) llvm::errs() << ", ";
- llvm::errs() << "->" << *it2;
- }
- llvm::errs() << ")";
+ LLVM_DEBUG({
+ llvm::errs() << "-- DAGDeltaAlgorithmImpl --\n";
+ llvm::errs() << "Changes: [";
+ for (changeset_ty::const_iterator it = Changes.begin(), ie = Changes.end();
+ it != ie; ++it) {
+ if (it != Changes.begin())
+ llvm::errs() << ", ";
+ llvm::errs() << *it;
+
+ if (succ_begin(*it) != succ_end(*it)) {
+ llvm::errs() << "(";
+ for (succ_iterator_ty it2 = succ_begin(*it), ie2 = succ_end(*it);
+ it2 != ie2; ++it2) {
+ if (it2 != succ_begin(*it))
+ llvm::errs() << ", ";
+ llvm::errs() << "->" << *it2;
}
+ llvm::errs() << ")";
}
- llvm::errs() << "]\n";
-
- llvm::errs() << "Roots: [";
- for (std::vector<change_ty>::const_iterator it = Roots.begin(),
- ie = Roots.end(); it != ie; ++it) {
- if (it != Roots.begin()) llvm::errs() << ", ";
- llvm::errs() << *it;
+ }
+ llvm::errs() << "]\n";
+
+ llvm::errs() << "Roots: [";
+ for (std::vector<change_ty>::const_iterator it = Roots.begin(),
+ ie = Roots.end();
+ it != ie; ++it) {
+ if (it != Roots.begin())
+ llvm::errs() << ", ";
+ llvm::errs() << *it;
+ }
+ llvm::errs() << "]\n";
+
+ llvm::errs() << "Predecessor Closure:\n";
+ for (changeset_ty::const_iterator it = Changes.begin(), ie = Changes.end();
+ it != ie; ++it) {
+ llvm::errs() << format(" %-4d: [", *it);
+ for (pred_closure_iterator_ty it2 = pred_closure_begin(*it),
+ ie2 = pred_closure_end(*it);
+ it2 != ie2; ++it2) {
+ if (it2 != pred_closure_begin(*it))
+ llvm::errs() << ", ";
+ llvm::errs() << *it2;
}
llvm::errs() << "]\n";
+ }
- llvm::errs() << "Predecessor Closure:\n";
- for (changeset_ty::const_iterator it = Changes.begin(),
- ie = Changes.end(); it != ie; ++it) {
- llvm::errs() << format(" %-4d: [", *it);
- for (pred_closure_iterator_ty it2 = pred_closure_begin(*it),
- ie2 = pred_closure_end(*it); it2 != ie2; ++it2) {
- if (it2 != pred_closure_begin(*it)) llvm::errs() << ", ";
- llvm::errs() << *it2;
- }
- llvm::errs() << "]\n";
- }
-
- llvm::errs() << "Successor Closure:\n";
- for (changeset_ty::const_iterator it = Changes.begin(),
- ie = Changes.end(); it != ie; ++it) {
- llvm::errs() << format(" %-4d: [", *it);
- for (succ_closure_iterator_ty it2 = succ_closure_begin(*it),
- ie2 = succ_closure_end(*it); it2 != ie2; ++it2) {
- if (it2 != succ_closure_begin(*it)) llvm::errs() << ", ";
- llvm::errs() << *it2;
- }
- llvm::errs() << "]\n";
+ llvm::errs() << "Successor Closure:\n";
+ for (changeset_ty::const_iterator it = Changes.begin(), ie = Changes.end();
+ it != ie; ++it) {
+ llvm::errs() << format(" %-4d: [", *it);
+ for (succ_closure_iterator_ty it2 = succ_closure_begin(*it),
+ ie2 = succ_closure_end(*it);
+ it2 != ie2; ++it2) {
+ if (it2 != succ_closure_begin(*it))
+ llvm::errs() << ", ";
+ llvm::errs() << *it2;
}
+ llvm::errs() << "]\n";
+ }
- llvm::errs() << "\n\n";
- });
+ llvm::errs() << "\n\n";
+ });
}
bool DAGDeltaAlgorithmImpl::GetTestResult(const changeset_ty &Changes,
@@ -312,10 +320,10 @@ DAGDeltaAlgorithmImpl::Run() {
// Invariant: CurrentSet intersect Required == {}
// Invariant: Required == (Required union succ*(Required))
while (!CurrentSet.empty()) {
- DEBUG({
- llvm::errs() << "DAG_DD - " << CurrentSet.size() << " active changes, "
- << Required.size() << " required changes\n";
- });
+ LLVM_DEBUG({
+ llvm::errs() << "DAG_DD - " << CurrentSet.size() << " active changes, "
+ << Required.size() << " required changes\n";
+ });
// Minimize the current set of changes.
DeltaActiveSetHelper Helper(*this, Required);
diff --git a/contrib/llvm/lib/Support/DJB.cpp b/contrib/llvm/lib/Support/DJB.cpp
new file mode 100644
index 000000000000..905dcf1b7e81
--- /dev/null
+++ b/contrib/llvm/lib/Support/DJB.cpp
@@ -0,0 +1,86 @@
+//===-- Support/DJB.cpp ---DJB Hash -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for the DJ Bernstein hash function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DJB.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Unicode.h"
+
+using namespace llvm;
+
+static UTF32 chopOneUTF32(StringRef &Buffer) {
+ UTF32 C;
+ const UTF8 *const Begin8Const =
+ reinterpret_cast<const UTF8 *>(Buffer.begin());
+ const UTF8 *Begin8 = Begin8Const;
+ UTF32 *Begin32 = &C;
+
+ // In lenient mode we will always end up with a "reasonable" value in C for
+ // non-empty input.
+ assert(!Buffer.empty());
+ ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()),
+ &Begin32, &C + 1, lenientConversion);
+ Buffer = Buffer.drop_front(Begin8 - Begin8Const);
+ return C;
+}
+
+static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
+ const UTF32 *Begin32 = &C;
+ UTF8 *Begin8 = Storage.begin();
+
+ // The case-folded output should always be a valid unicode character, so use
+ // strict mode here.
+ ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
+ Storage.end(), strictConversion);
+ assert(CR == conversionOK && "Case folding produced invalid char?");
+ (void)CR;
+ return StringRef(reinterpret_cast<char *>(Storage.begin()),
+ Begin8 - Storage.begin());
+}
+
+static UTF32 foldCharDwarf(UTF32 C) {
+ // DWARF v5 addition to the unicode folding rules.
+ // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
+ // Above" into "i".
+ if (C == 0x130 || C == 0x131)
+ return 'i';
+ return sys::unicode::foldCharSimple(C);
+}
+
+static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
+ UTF32 C = chopOneUTF32(Buffer);
+
+ C = foldCharDwarf(C);
+
+ std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage;
+ StringRef Folded = toUTF8(C, Storage);
+ return djbHash(Folded, H);
+}
+
+uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
+ while (!Buffer.empty()) {
+ unsigned char C = Buffer.front();
+ if (LLVM_LIKELY(C <= 0x7f)) {
+ // US-ASCII, encoded as a single byte in UTF-8.
+ // This is by far the most common case, so handle this specially.
+ if (C >= 'A' && C <= 'Z')
+ C = 'a' + (C - 'A'); // fold uppercase into lowercase
+ H = (H << 5) + H + C;
+ Buffer = Buffer.drop_front();
+ continue;
+ }
+ H = caseFoldingDjbHashCharSlow(Buffer, H);
+ }
+ return H;
+}
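A minimal sketch of how the new case-folding DJB hash could be exercised (hypothetical standalone driver; 5381 is the conventional DJB seed and is passed explicitly here rather than relying on the header's default argument):

  #include "llvm/Support/DJB.h"
  #include "llvm/Support/raw_ostream.h"
  #include <cstdint>

  int main() {
    // Case-folded hashing treats "FooBar" and "foobar" identically.
    uint32_t A = llvm::caseFoldingDjbHash("FooBar", 5381);
    uint32_t B = llvm::caseFoldingDjbHash("foobar", 5381);
    llvm::outs() << (A == B) << "\n"; // expected: 1
    // The plain hash is case-sensitive, so these generally differ.
    llvm::outs() << (llvm::djbHash("FooBar", 5381) == llvm::djbHash("foobar", 5381))
                 << "\n";
    return 0;
  }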
diff --git a/contrib/llvm/lib/Support/Debug.cpp b/contrib/llvm/lib/Support/Debug.cpp
index 9132911479a1..1a70017fee32 100644
--- a/contrib/llvm/lib/Support/Debug.cpp
+++ b/contrib/llvm/lib/Support/Debug.cpp
@@ -11,15 +11,16 @@
// code, without it being enabled all of the time, and without having to add
// command line options to enable it.
//
-// In particular, just wrap your code with the DEBUG() macro, and it will be
-// enabled automatically if you specify '-debug' on the command-line.
+// In particular, just wrap your code with the LLVM_DEBUG() macro, and it will
+// be enabled automatically if you specify '-debug' on the command-line.
// Alternatively, you can also use the SET_DEBUG_TYPE("foo") macro to specify
// that your debug code belongs to class "foo". Then, on the command line, you
// can specify '-debug-only=foo' to enable JUST the debug information for the
// foo class.
//
// When compiling without assertions, the -debug-* options and all code in
-// DEBUG() statements disappears, so it does not affect the runtime of the code.
+// LLVM_DEBUG() statements disappear, so they do not affect the runtime of the
+// code.
//
//===----------------------------------------------------------------------===//
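For reference, the usage pattern this comment describes looks roughly like the sketch below (the pass name "mypass" is made up):

  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"

  #define DEBUG_TYPE "mypass" // selects this code with -debug-only=mypass

  void runOnSomething() {
    // Printed only in builds with assertions and only when -debug (or
    // -debug-only=mypass) is given; compiled away otherwise.
    LLVM_DEBUG(llvm::dbgs() << "mypass: visiting a node\n");
  }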
diff --git a/contrib/llvm/lib/Support/DebugCounter.cpp b/contrib/llvm/lib/Support/DebugCounter.cpp
index 1d46de04ee6a..5a9cecfc56d4 100644
--- a/contrib/llvm/lib/Support/DebugCounter.cpp
+++ b/contrib/llvm/lib/Support/DebugCounter.cpp
@@ -45,7 +45,7 @@ private:
// Create our command line option.
static DebugCounterList DebugCounterOption(
- "debug-counter",
+ "debug-counter", cl::Hidden,
cl::desc("Comma separated list of debug counter skip and count"),
cl::CommaSeparated, cl::ZeroOrMore, cl::location(DebugCounter::instance()));
@@ -66,7 +66,7 @@ void DebugCounter::push_back(const std::string &Val) {
}
// Now we have counter=value.
// First, process value.
- long CounterVal;
+ int64_t CounterVal;
if (CounterPair.second.getAsInteger(0, CounterVal)) {
errs() << "DebugCounter Error: " << CounterPair.second
<< " is not a number\n";
@@ -76,26 +76,24 @@ void DebugCounter::push_back(const std::string &Val) {
// add it to the counter values.
if (CounterPair.first.endswith("-skip")) {
auto CounterName = CounterPair.first.drop_back(5);
- unsigned CounterID = RegisteredCounters.idFor(CounterName);
+ unsigned CounterID = getCounterId(CounterName);
if (!CounterID) {
errs() << "DebugCounter Error: " << CounterName
<< " is not a registered counter\n";
return;
}
-
- auto Res = Counters.insert({CounterID, {0, -1}});
- Res.first->second.first = CounterVal;
+ Counters[CounterID].Skip = CounterVal;
+ Counters[CounterID].IsSet = true;
} else if (CounterPair.first.endswith("-count")) {
auto CounterName = CounterPair.first.drop_back(6);
- unsigned CounterID = RegisteredCounters.idFor(CounterName);
+ unsigned CounterID = getCounterId(CounterName);
if (!CounterID) {
errs() << "DebugCounter Error: " << CounterName
<< " is not a registered counter\n";
return;
}
-
- auto Res = Counters.insert({CounterID, {0, -1}});
- Res.first->second.second = CounterVal;
+ Counters[CounterID].StopAfter = CounterVal;
+ Counters[CounterID].IsSet = true;
} else {
errs() << "DebugCounter Error: " << CounterPair.first
<< " does not end with -skip or -count\n";
@@ -106,7 +104,8 @@ void DebugCounter::print(raw_ostream &OS) const {
OS << "Counters and values:\n";
for (const auto &KV : Counters)
OS << left_justify(RegisteredCounters[KV.first], 32) << ": {"
- << KV.second.first << "," << KV.second.second << "}\n";
+ << KV.second.Count << "," << KV.second.Skip << ","
+ << KV.second.StopAfter << "}\n";
}
LLVM_DUMP_METHOD void DebugCounter::dump() const {
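A sketch of the counter side of this interface, as a transform would typically use it (the counter name "my-xform" is illustrative):

  #include "llvm/Support/DebugCounter.h"
  using namespace llvm;

  DEBUG_COUNTER(MyXformCounter, "my-xform",
                "Controls which candidate transformations are applied");

  bool tryTransform() {
    // With e.g. -debug-counter=my-xform-skip=2,my-xform-count=1 the first two
    // eligible calls are skipped, the third executes, and the rest are skipped.
    if (!DebugCounter::shouldExecute(MyXformCounter))
      return false;
    // ... perform the transformation ...
    return true;
  }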
diff --git a/contrib/llvm/lib/Support/DynamicLibrary.cpp b/contrib/llvm/lib/Support/DynamicLibrary.cpp
index d8422115eae8..530e92d99a90 100644
--- a/contrib/llvm/lib/Support/DynamicLibrary.cpp
+++ b/contrib/llvm/lib/Support/DynamicLibrary.cpp
@@ -49,7 +49,7 @@ public:
}
bool AddLibrary(void *Handle, bool IsProcess = false, bool CanClose = true) {
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
assert((Handle == this ? IsProcess : !IsProcess) && "Bad Handle.");
#endif
@@ -61,7 +61,7 @@ public:
}
Handles.push_back(Handle);
} else {
-#ifndef LLVM_ON_WIN32
+#ifndef _WIN32
if (Process) {
if (CanClose)
DLClose(Process);
@@ -121,7 +121,7 @@ static llvm::ManagedStatic<DynamicLibrary::HandleSet> OpenedHandles;
static llvm::ManagedStatic<llvm::sys::SmartMutex<true>> SymbolsMutex;
}
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/DynamicLibrary.inc"
diff --git a/contrib/llvm/lib/Support/Error.cpp b/contrib/llvm/lib/Support/Error.cpp
index c43a1fa813e2..83345bf6edb9 100644
--- a/contrib/llvm/lib/Support/Error.cpp
+++ b/contrib/llvm/lib/Support/Error.cpp
@@ -112,6 +112,10 @@ std::error_code StringError::convertToErrorCode() const {
return EC;
}
+Error createStringError(std::error_code EC, char const *Msg) {
+ return make_error<StringError>(Msg, EC);
+}
+
void report_fatal_error(Error Err, bool GenCrashDiag) {
assert(Err && "report_fatal_error called with success value");
std::string ErrMsg;
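A brief sketch of the new helper in use (hypothetical function; inconvertibleErrorCode() is the usual choice when no meaningful std::error_code exists):

  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"

  static llvm::Error checkPath(llvm::StringRef Path) {
    if (Path.empty())
      return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                     "path must not be empty");
    return llvm::Error::success();
  }

  // if (llvm::Error E = checkPath(""))
  //   llvm::errs() << llvm::toString(std::move(E)) << "\n";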
diff --git a/contrib/llvm/lib/Support/ErrorHandling.cpp b/contrib/llvm/lib/Support/ErrorHandling.cpp
index fb8ae4c1cd5e..21712c5c039e 100644
--- a/contrib/llvm/lib/Support/ErrorHandling.cpp
+++ b/contrib/llvm/lib/Support/ErrorHandling.cpp
@@ -175,6 +175,39 @@ void llvm::report_bad_alloc_error(const char *Reason, bool GenCrashDiag) {
#endif
}
+#ifdef LLVM_ENABLE_EXCEPTIONS
+// Do not set custom new handler if exceptions are enabled. In this case OOM
+// errors are handled by throwing 'std::bad_alloc'.
+void llvm::install_out_of_memory_new_handler() {
+}
+#else
+// Causes a crash on allocation failure. It is called prior to the handler set
+// by 'install_bad_alloc_error_handler'.
+// 'install_bad_alloc_error_handler'.
+static void out_of_memory_new_handler() {
+ llvm::report_bad_alloc_error("Allocation failed");
+}
+
+// Installs a new handler that causes a crash on allocation failure. It does
+// not need to be called explicitly if this file is linked into the
+// application, because in that case it is called during construction of
+// 'new_handler_installer'.
+void llvm::install_out_of_memory_new_handler() {
+ static bool out_of_memory_new_handler_installed = false;
+ if (!out_of_memory_new_handler_installed) {
+ std::set_new_handler(out_of_memory_new_handler);
+ out_of_memory_new_handler_installed = true;
+ }
+}
+
+// Static object that causes installation of 'out_of_memory_new_handler' before
+// execution of 'main'.
+static class NewHandlerInstaller {
+public:
+ NewHandlerInstaller() {
+ install_out_of_memory_new_handler();
+ }
+} new_handler_installer;
+#endif
+
void llvm::llvm_unreachable_internal(const char *msg, const char *file,
unsigned line) {
// This code intentionally doesn't call the ErrorHandler callback, because
@@ -210,7 +243,7 @@ void LLVMResetFatalErrorHandler() {
remove_fatal_error_handler();
}
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include <winerror.h>
diff --git a/contrib/llvm/lib/Support/FileOutputBuffer.cpp b/contrib/llvm/lib/Support/FileOutputBuffer.cpp
index c4ff563e5f44..1214b5a0ba1f 100644
--- a/contrib/llvm/lib/Support/FileOutputBuffer.cpp
+++ b/contrib/llvm/lib/Support/FileOutputBuffer.cpp
@@ -82,9 +82,11 @@ public:
size_t getBufferSize() const override { return Buffer.size(); }
Error commit() override {
+ using namespace sys::fs;
int FD;
std::error_code EC;
- if (auto EC = openFileForWrite(FinalPath, FD, fs::F_None, Mode))
+ if (auto EC =
+ openFileForWrite(FinalPath, FD, CD_CreateAlways, OF_None, Mode))
return errorCodeToError(EC);
raw_fd_ostream OS(FD, /*shouldClose=*/true, /*unbuffered=*/true);
OS << StringRef((const char *)Buffer.base(), Buffer.size());
@@ -108,24 +110,30 @@ createInMemoryBuffer(StringRef Path, size_t Size, unsigned Mode) {
}
static Expected<std::unique_ptr<OnDiskBuffer>>
-createOnDiskBuffer(StringRef Path, size_t Size, unsigned Mode) {
+createOnDiskBuffer(StringRef Path, size_t Size, bool InitExisting,
+ unsigned Mode) {
Expected<fs::TempFile> FileOrErr =
fs::TempFile::create(Path + ".tmp%%%%%%%", Mode);
if (!FileOrErr)
return FileOrErr.takeError();
fs::TempFile File = std::move(*FileOrErr);
-#ifndef LLVM_ON_WIN32
- // On Windows, CreateFileMapping (the mmap function on Windows)
- // automatically extends the underlying file. We don't need to
- // extend the file beforehand. _chsize (ftruncate on Windows) is
- // pretty slow just like it writes specified amount of bytes,
- // so we should avoid calling that function.
- if (auto EC = fs::resize_file(File.FD, Size)) {
- consumeError(File.discard());
- return errorCodeToError(EC);
- }
+ if (InitExisting) {
+ if (auto EC = sys::fs::copy_file(Path, File.FD))
+ return errorCodeToError(EC);
+ } else {
+#ifndef _WIN32
+ // On Windows, CreateFileMapping (the mmap function on Windows)
+ // automatically extends the underlying file. We don't need to
+ // extend the file beforehand. _chsize (ftruncate on Windows) is
+ // pretty slow because it actually writes out the specified number of bytes,
+ // so we should avoid calling that function.
+ if (auto EC = fs::resize_file(File.FD, Size)) {
+ consumeError(File.discard());
+ return errorCodeToError(EC);
+ }
#endif
+ }
// Mmap it.
std::error_code EC;
@@ -149,6 +157,15 @@ FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
fs::file_status Stat;
fs::status(Path, Stat);
+ if ((Flags & F_modify) && Size == size_t(-1)) {
+ if (Stat.type() == fs::file_type::regular_file)
+ Size = Stat.getSize();
+ else if (Stat.type() == fs::file_type::file_not_found)
+ return errorCodeToError(errc::no_such_file_or_directory);
+ else
+ return errorCodeToError(errc::invalid_argument);
+ }
+
// Usually, we want to create OnDiskBuffer to create a temporary file in
// the same directory as the destination file and atomically replaces it
// by rename(2).
@@ -163,7 +180,7 @@ FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
case fs::file_type::regular_file:
case fs::file_type::file_not_found:
case fs::file_type::status_error:
- return createOnDiskBuffer(Path, Size, Mode);
+ return createOnDiskBuffer(Path, Size, !!(Flags & F_modify), Mode);
default:
return createInMemoryBuffer(Path, Size, Mode);
}
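A sketch of how the new F_modify mode might be driven; per the hunk above, passing Size == size_t(-1) together with F_modify picks up the existing file's size (the function and path are illustrative):

  #include "llvm/Support/FileOutputBuffer.h"
  using namespace llvm;

  Error patchFirstByte(StringRef Path) {
    Expected<std::unique_ptr<FileOutputBuffer>> BufOrErr =
        FileOutputBuffer::create(Path, size_t(-1), FileOutputBuffer::F_modify);
    if (!BufOrErr)
      return BufOrErr.takeError();
    std::unique_ptr<FileOutputBuffer> &Buf = *BufOrErr;
    if (Buf->getBufferSize() > 0)
      Buf->getBufferStart()[0] = 0; // edit the mapped copy in place
    return Buf->commit();           // write the result back to Path
  }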
diff --git a/contrib/llvm/lib/Support/FoldingSet.cpp b/contrib/llvm/lib/Support/FoldingSet.cpp
index 942379549039..ec7d57586e8b 100644
--- a/contrib/llvm/lib/Support/FoldingSet.cpp
+++ b/contrib/llvm/lib/Support/FoldingSet.cpp
@@ -214,11 +214,8 @@ static void **GetBucketFor(unsigned Hash, void **Buckets, unsigned NumBuckets) {
/// AllocateBuckets - Allocated initialized bucket memory.
static void **AllocateBuckets(unsigned NumBuckets) {
- void **Buckets = static_cast<void**>(calloc(NumBuckets+1, sizeof(void*)));
-
- if (Buckets == nullptr)
- report_bad_alloc_error("Allocation of Buckets failed.");
-
+ void **Buckets = static_cast<void**>(safe_calloc(NumBuckets + 1,
+ sizeof(void*)));
// Set the very last bucket to be a non-null "pointer".
Buckets[NumBuckets] = reinterpret_cast<void*>(-1);
return Buckets;
diff --git a/contrib/llvm/lib/Support/GraphWriter.cpp b/contrib/llvm/lib/Support/GraphWriter.cpp
index fd7fab08278e..9335daffc3e2 100644
--- a/contrib/llvm/lib/Support/GraphWriter.cpp
+++ b/contrib/llvm/lib/Support/GraphWriter.cpp
@@ -66,7 +66,7 @@ std::string llvm::DOT::EscapeString(const std::string &Label) {
return Str;
}
-/// \brief Get a color string for this node number. Simply round-robin selects
+/// Get a color string for this node number. Simply round-robin selects
/// from a reasonable number of colors.
StringRef llvm::DOT::getColorString(unsigned ColorNumber) {
static const int NumColors = 20;
@@ -91,20 +91,18 @@ std::string llvm::createGraphFilename(const Twine &Name, int &FD) {
}
// Execute the graph viewer. Return true if there were errors.
-static bool ExecGraphViewer(StringRef ExecPath, std::vector<const char *> &args,
+static bool ExecGraphViewer(StringRef ExecPath, std::vector<StringRef> &args,
StringRef Filename, bool wait,
std::string &ErrMsg) {
- assert(args.back() == nullptr);
if (wait) {
- if (sys::ExecuteAndWait(ExecPath, args.data(), nullptr, {}, 0, 0,
- &ErrMsg)) {
+ if (sys::ExecuteAndWait(ExecPath, args, None, {}, 0, 0, &ErrMsg)) {
errs() << "Error: " << ErrMsg << "\n";
return true;
}
sys::fs::remove(Filename);
errs() << " done. \n";
} else {
- sys::ExecuteNoWait(ExecPath, args.data(), nullptr, {}, 0, &ErrMsg);
+ sys::ExecuteNoWait(ExecPath, args, None, {}, 0, &ErrMsg);
errs() << "Remember to erase graph file: " << Filename << "\n";
}
return false;
@@ -158,22 +156,20 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
#ifdef __APPLE__
wait &= !ViewBackground;
if (S.TryFindProgram("open", ViewerPath)) {
- std::vector<const char *> args;
- args.push_back(ViewerPath.c_str());
+ std::vector<StringRef> args;
+ args.push_back(ViewerPath);
if (wait)
args.push_back("-W");
- args.push_back(Filename.c_str());
- args.push_back(nullptr);
+ args.push_back(Filename);
errs() << "Trying 'open' program... ";
if (!ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg))
return false;
}
#endif
if (S.TryFindProgram("xdg-open", ViewerPath)) {
- std::vector<const char *> args;
- args.push_back(ViewerPath.c_str());
- args.push_back(Filename.c_str());
- args.push_back(nullptr);
+ std::vector<StringRef> args;
+ args.push_back(ViewerPath);
+ args.push_back(Filename);
errs() << "Trying 'xdg-open' program... ";
if (!ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg))
return false;
@@ -181,10 +177,9 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
// Graphviz
if (S.TryFindProgram("Graphviz", ViewerPath)) {
- std::vector<const char *> args;
- args.push_back(ViewerPath.c_str());
- args.push_back(Filename.c_str());
- args.push_back(nullptr);
+ std::vector<StringRef> args;
+ args.push_back(ViewerPath);
+ args.push_back(Filename);
errs() << "Running 'Graphviz' program... ";
return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg);
@@ -192,15 +187,13 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
// xdot
if (S.TryFindProgram("xdot|xdot.py", ViewerPath)) {
- std::vector<const char *> args;
- args.push_back(ViewerPath.c_str());
- args.push_back(Filename.c_str());
+ std::vector<StringRef> args;
+ args.push_back(ViewerPath);
+ args.push_back(Filename);
args.push_back("-f");
args.push_back(getProgramName(program));
- args.push_back(nullptr);
-
errs() << "Running 'xdot.py' program... ";
return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg);
}
@@ -221,7 +214,7 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
Viewer = VK_Ghostview;
if (!Viewer && S.TryFindProgram("xdg-open", ViewerPath))
Viewer = VK_XDGOpen;
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
if (!Viewer && S.TryFindProgram("cmd", ViewerPath)) {
Viewer = VK_CmdStart;
}
@@ -235,18 +228,17 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
std::string OutputFilename =
Filename + (Viewer == VK_CmdStart ? ".pdf" : ".ps");
- std::vector<const char *> args;
- args.push_back(GeneratorPath.c_str());
+ std::vector<StringRef> args;
+ args.push_back(GeneratorPath);
if (Viewer == VK_CmdStart)
args.push_back("-Tpdf");
else
args.push_back("-Tps");
args.push_back("-Nfontname=Courier");
args.push_back("-Gsize=7.5,10");
- args.push_back(Filename.c_str());
+ args.push_back(Filename);
args.push_back("-o");
- args.push_back(OutputFilename.c_str());
- args.push_back(nullptr);
+ args.push_back(OutputFilename);
errs() << "Running '" << GeneratorPath << "' program... ";
@@ -258,31 +250,30 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
std::string StartArg;
args.clear();
- args.push_back(ViewerPath.c_str());
+ args.push_back(ViewerPath);
switch (Viewer) {
case VK_OSXOpen:
args.push_back("-W");
- args.push_back(OutputFilename.c_str());
+ args.push_back(OutputFilename);
break;
case VK_XDGOpen:
wait = false;
- args.push_back(OutputFilename.c_str());
+ args.push_back(OutputFilename);
break;
case VK_Ghostview:
args.push_back("--spartan");
- args.push_back(OutputFilename.c_str());
+ args.push_back(OutputFilename);
break;
case VK_CmdStart:
args.push_back("/S");
args.push_back("/C");
StartArg =
(StringRef("start ") + (wait ? "/WAIT " : "") + OutputFilename).str();
- args.push_back(StartArg.c_str());
+ args.push_back(StartArg);
break;
case VK_None:
llvm_unreachable("Invalid viewer");
}
- args.push_back(nullptr);
ErrMsg.clear();
return ExecGraphViewer(ViewerPath, args, OutputFilename, wait, ErrMsg);
@@ -290,13 +281,12 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
// dotty
if (S.TryFindProgram("dotty", ViewerPath)) {
- std::vector<const char *> args;
- args.push_back(ViewerPath.c_str());
- args.push_back(Filename.c_str());
- args.push_back(nullptr);
+ std::vector<StringRef> args;
+ args.push_back(ViewerPath);
+ args.push_back(Filename);
// Dotty spawns another app and doesn't wait until it returns
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
wait = false;
#endif
errs() << "Running 'dotty' program... ";
diff --git a/contrib/llvm/lib/Support/Host.cpp b/contrib/llvm/lib/Support/Host.cpp
index 6e65b5e6c807..2c718dd3f5a8 100644
--- a/contrib/llvm/lib/Support/Host.cpp
+++ b/contrib/llvm/lib/Support/Host.cpp
@@ -18,7 +18,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -30,7 +30,7 @@
#ifdef LLVM_ON_UNIX
#include "Unix/Host.inc"
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/Host.inc"
#endif
#ifdef _MSC_VER
@@ -65,8 +65,7 @@ static std::unique_ptr<llvm::MemoryBuffer>
return std::move(*Text);
}
-StringRef sys::detail::getHostCPUNameForPowerPC(
- const StringRef &ProcCpuinfoContent) {
+StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
// Access to the Processor Version Register (PVR) on PowerPC is privileged,
// and so we must use an operating-system interface to determine the current
// processor type. On Linux, this is exposed through the /proc/cpuinfo file.
@@ -145,8 +144,7 @@ StringRef sys::detail::getHostCPUNameForPowerPC(
.Default(generic);
}
-StringRef sys::detail::getHostCPUNameForARM(
- const StringRef &ProcCpuinfoContent) {
+StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
// The cpuid register on arm is not accessible from user space. On Linux,
// it is exposed through the /proc/cpuinfo file.
@@ -250,8 +248,7 @@ StringRef sys::detail::getHostCPUNameForARM(
return "generic";
}
-StringRef sys::detail::getHostCPUNameForS390x(
- const StringRef &ProcCpuinfoContent) {
+StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) {
// STIDP is a privileged operation, so use /proc/cpuinfo instead.
// The "processor 0:" line comes after a fair amount of other information,
@@ -654,9 +651,11 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
// Goldmont:
case 0x5c: // Apollo Lake
case 0x5f: // Denverton
- case 0x7a: // Gemini Lake
*Type = X86::INTEL_GOLDMONT;
break; // "goldmont"
+ case 0x7a:
+ *Type = X86::INTEL_GOLDMONT_PLUS;
+ break;
case 0x57:
*Type = X86::INTEL_KNL; // knl
break;
@@ -841,9 +840,9 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
*Subtype = X86::AMDFAM15H_BDVER3;
break; // "bdver3"; 30h-3Fh: Steamroller
}
- if (Model >= 0x10 && Model <= 0x1f) {
+ if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) {
*Subtype = X86::AMDFAM15H_BDVER2;
- break; // "bdver2"; 10h-1Fh: Piledriver
+ break; // "bdver2"; 02h, 10h-1Fh: Piledriver
}
if (Model <= 0x0f) {
*Subtype = X86::AMDFAM15H_BDVER1;
@@ -1062,19 +1061,19 @@ StringRef sys::getHostCPUName() {
#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__))
StringRef sys::getHostCPUName() {
std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
- const StringRef& Content = P ? P->getBuffer() : "";
+ StringRef Content = P ? P->getBuffer() : "";
return detail::getHostCPUNameForPowerPC(Content);
}
#elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
StringRef sys::getHostCPUName() {
std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
- const StringRef& Content = P ? P->getBuffer() : "";
+ StringRef Content = P ? P->getBuffer() : "";
return detail::getHostCPUNameForARM(Content);
}
#elif defined(__linux__) && defined(__s390x__)
StringRef sys::getHostCPUName() {
std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
- const StringRef& Content = P ? P->getBuffer() : "";
+ StringRef Content = P ? P->getBuffer() : "";
return detail::getHostCPUNameForS390x(Content);
}
#else
@@ -1206,6 +1205,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
!getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ Features["sahf"] = HasExtLeaf1 && ((ECX >> 0) & 1);
Features["lzcnt"] = HasExtLeaf1 && ((ECX >> 5) & 1);
Features["sse4a"] = HasExtLeaf1 && ((ECX >> 6) & 1);
Features["prfchw"] = HasExtLeaf1 && ((ECX >> 8) & 1);
@@ -1215,9 +1215,12 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1);
Features["mwaitx"] = HasExtLeaf1 && ((ECX >> 29) & 1);
+ // Miscellaneous memory-related features, detected via the 0x80000008 leaf
+ // of the CPUID instruction.
bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 &&
!getX86CpuIDAndInfo(0x80000008, &EAX, &EBX, &ECX, &EDX);
- Features["clzero"] = HasExtLeaf8 && ((EBX >> 0) & 1);
+ Features["clzero"] = HasExtLeaf8 && ((EBX >> 0) & 1);
+ Features["wbnoinvd"] = HasExtLeaf8 && ((EBX >> 9) & 1);
bool HasLeaf7 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
@@ -1228,6 +1231,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
// AVX2 is only supported if we have the OS save support from AVX.
Features["avx2"] = HasLeaf7 && ((EBX >> 5) & 1) && HasAVXSave;
Features["bmi2"] = HasLeaf7 && ((EBX >> 8) & 1);
+ Features["invpcid"] = HasLeaf7 && ((EBX >> 10) & 1);
Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1);
// AVX512 is only supported if the OS supports the context save for it.
Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
@@ -1247,6 +1251,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["prefetchwt1"] = HasLeaf7 && ((ECX >> 0) & 1);
Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save;
Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
+ Features["waitpkg"] = HasLeaf7 && ((ECX >> 5) & 1);
Features["avx512vbmi2"] = HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save;
Features["shstk"] = HasLeaf7 && ((ECX >> 7) & 1);
Features["gfni"] = HasLeaf7 && ((ECX >> 8) & 1);
@@ -1255,7 +1260,22 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["avx512vnni"] = HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save;
Features["avx512bitalg"] = HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save;
Features["avx512vpopcntdq"] = HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save;
- Features["ibt"] = HasLeaf7 && ((EDX >> 20) & 1);
+ Features["rdpid"] = HasLeaf7 && ((ECX >> 22) & 1);
+ Features["cldemote"] = HasLeaf7 && ((ECX >> 25) & 1);
+ Features["movdiri"] = HasLeaf7 && ((ECX >> 27) & 1);
+ Features["movdir64b"] = HasLeaf7 && ((ECX >> 28) & 1);
+
+ // There are two CPUID leaves which contain information associated with the
+ // pconfig instruction:
+ // EAX=0x7, ECX=0x0 indicates the availability of the instruction (via the
+ // 18th bit of EDX), while the EAX=0x1b leaf returns information on the
+ // availability of specific pconfig leaves.
+ // The target feature here only refers to the first of these two.
+ // Users might need to check for the availability of specific pconfig
+ // leaves using cpuid, since that information is ignored while
+ // detecting features using the "-march=native" flag.
+ // For more info, see X86 ISA docs.
+ Features["pconfig"] = HasLeaf7 && ((EDX >> 18) & 1);
bool HasLeafD = MaxLevel >= 0xd &&
!getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
@@ -1265,6 +1285,11 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["xsavec"] = HasLeafD && ((EAX >> 1) & 1) && HasAVXSave;
Features["xsaves"] = HasLeafD && ((EAX >> 3) & 1) && HasAVXSave;
+ bool HasLeaf14 = MaxLevel >= 0x14 &&
+ !getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX);
+
+ Features["ptwrite"] = HasLeaf14 && ((EBX >> 4) & 1);
+
return true;
}
#elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
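The newly recognized bits surface through the existing query API; a small sketch of dumping what the host reports:

  #include "llvm/ADT/StringMap.h"
  #include "llvm/Support/Host.h"
  #include "llvm/Support/raw_ostream.h"

  void dumpHostFeatures() {
    llvm::outs() << "cpu: " << llvm::sys::getHostCPUName() << "\n";
    llvm::StringMap<bool> Features;
    if (llvm::sys::getHostCPUFeatures(Features))
      for (const auto &F : Features)
        if (F.second) // e.g. "wbnoinvd", "waitpkg", "pconfig" on newer parts
          llvm::outs() << F.first() << "\n";
  }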
diff --git a/contrib/llvm/lib/Support/InitLLVM.cpp b/contrib/llvm/lib/Support/InitLLVM.cpp
new file mode 100644
index 000000000000..c008d0455c99
--- /dev/null
+++ b/contrib/llvm/lib/Support/InitLLVM.cpp
@@ -0,0 +1,52 @@
+//===-- InitLLVM.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Signals.h"
+#include <string>
+
+#ifdef _WIN32
+#include "Windows/WindowsSupport.h"
+#endif
+
+using namespace llvm;
+using namespace llvm::sys;
+
+InitLLVM::InitLLVM(int &Argc, const char **&Argv) : StackPrinter(Argc, Argv) {
+ sys::PrintStackTraceOnErrorSignal(Argv[0]);
+
+#ifdef _WIN32
+ // We use UTF-8 as the internal character encoding. On Windows,
+ // arguments passed to main() may not be encoded in UTF-8. In order
+ // to reliably detect the encoding of command line arguments, we use a
+ // Windows API to obtain arguments, convert them to UTF-8, and then
+ // write them back to the Argv vector.
+ //
+ // There's probably another way to do the same thing (e.g. using
+ // wmain() instead of main()), but this way seems less intrusive
+ // than that.
+ std::string Banner = std::string(Argv[0]) + ": ";
+ ExitOnError ExitOnErr(Banner);
+
+ ExitOnErr(errorCodeToError(windows::GetCommandLineArguments(Args, Alloc)));
+
+ // GetCommandLineArguments doesn't terminate the vector with a
+ // nullptr. Do it to make it compatible with the real argv.
+ Args.push_back(nullptr);
+
+ Argc = Args.size() - 1;
+ Argv = Args.data();
+#endif
+}
+
+InitLLVM::~InitLLVM() { llvm_shutdown(); }
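Typical use from a tool's entry point (sketch):

  #include "llvm/Support/InitLLVM.h"
  #include "llvm/Support/raw_ostream.h"

  int main(int argc, const char **argv) {
    // Installs the stack-trace-on-crash handler, converts argv to UTF-8 on
    // Windows, and calls llvm_shutdown() on destruction.
    llvm::InitLLVM X(argc, argv);
    llvm::outs() << "running " << argv[0] << "\n";
    return 0;
  }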
diff --git a/contrib/llvm/lib/Support/JSON.cpp b/contrib/llvm/lib/Support/JSON.cpp
new file mode 100644
index 000000000000..a5dae7a7c2e0
--- /dev/null
+++ b/contrib/llvm/lib/Support/JSON.cpp
@@ -0,0 +1,693 @@
+//=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Format.h"
+#include <cctype>
+
+namespace llvm {
+namespace json {
+
+Value &Object::operator[](const ObjectKey &K) {
+ return try_emplace(K, nullptr).first->getSecond();
+}
+Value &Object::operator[](ObjectKey &&K) {
+ return try_emplace(std::move(K), nullptr).first->getSecond();
+}
+Value *Object::get(StringRef K) {
+ auto I = find(K);
+ if (I == end())
+ return nullptr;
+ return &I->second;
+}
+const Value *Object::get(StringRef K) const {
+ auto I = find(K);
+ if (I == end())
+ return nullptr;
+ return &I->second;
+}
+llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
+ if (auto *V = get(K))
+ return V->getAsNull();
+ return llvm::None;
+}
+llvm::Optional<bool> Object::getBoolean(StringRef K) const {
+ if (auto *V = get(K))
+ return V->getAsBoolean();
+ return llvm::None;
+}
+llvm::Optional<double> Object::getNumber(StringRef K) const {
+ if (auto *V = get(K))
+ return V->getAsNumber();
+ return llvm::None;
+}
+llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
+ if (auto *V = get(K))
+ return V->getAsInteger();
+ return llvm::None;
+}
+llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
+ if (auto *V = get(K))
+ return V->getAsString();
+ return llvm::None;
+}
+const json::Object *Object::getObject(StringRef K) const {
+ if (auto *V = get(K))
+ return V->getAsObject();
+ return nullptr;
+}
+json::Object *Object::getObject(StringRef K) {
+ if (auto *V = get(K))
+ return V->getAsObject();
+ return nullptr;
+}
+const json::Array *Object::getArray(StringRef K) const {
+ if (auto *V = get(K))
+ return V->getAsArray();
+ return nullptr;
+}
+json::Array *Object::getArray(StringRef K) {
+ if (auto *V = get(K))
+ return V->getAsArray();
+ return nullptr;
+}
+bool operator==(const Object &LHS, const Object &RHS) {
+ if (LHS.size() != RHS.size())
+ return false;
+ for (const auto &L : LHS) {
+ auto R = RHS.find(L.first);
+ if (R == RHS.end() || L.second != R->second)
+ return false;
+ }
+ return true;
+}
+
+Array::Array(std::initializer_list<Value> Elements) {
+ V.reserve(Elements.size());
+ for (const Value &V : Elements) {
+ emplace_back(nullptr);
+ back().moveFrom(std::move(V));
+ }
+}
+
+Value::Value(std::initializer_list<Value> Elements)
+ : Value(json::Array(Elements)) {}
+
+void Value::copyFrom(const Value &M) {
+ Type = M.Type;
+ switch (Type) {
+ case T_Null:
+ case T_Boolean:
+ case T_Double:
+ case T_Integer:
+ memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
+ break;
+ case T_StringRef:
+ create<StringRef>(M.as<StringRef>());
+ break;
+ case T_String:
+ create<std::string>(M.as<std::string>());
+ break;
+ case T_Object:
+ create<json::Object>(M.as<json::Object>());
+ break;
+ case T_Array:
+ create<json::Array>(M.as<json::Array>());
+ break;
+ }
+}
+
+void Value::moveFrom(const Value &&M) {
+ Type = M.Type;
+ switch (Type) {
+ case T_Null:
+ case T_Boolean:
+ case T_Double:
+ case T_Integer:
+ memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
+ break;
+ case T_StringRef:
+ create<StringRef>(M.as<StringRef>());
+ break;
+ case T_String:
+ create<std::string>(std::move(M.as<std::string>()));
+ M.Type = T_Null;
+ break;
+ case T_Object:
+ create<json::Object>(std::move(M.as<json::Object>()));
+ M.Type = T_Null;
+ break;
+ case T_Array:
+ create<json::Array>(std::move(M.as<json::Array>()));
+ M.Type = T_Null;
+ break;
+ }
+}
+
+void Value::destroy() {
+ switch (Type) {
+ case T_Null:
+ case T_Boolean:
+ case T_Double:
+ case T_Integer:
+ break;
+ case T_StringRef:
+ as<StringRef>().~StringRef();
+ break;
+ case T_String:
+ as<std::string>().~basic_string();
+ break;
+ case T_Object:
+ as<json::Object>().~Object();
+ break;
+ case T_Array:
+ as<json::Array>().~Array();
+ break;
+ }
+}
+
+bool operator==(const Value &L, const Value &R) {
+ if (L.kind() != R.kind())
+ return false;
+ switch (L.kind()) {
+ case Value::Null:
+ return *L.getAsNull() == *R.getAsNull();
+ case Value::Boolean:
+ return *L.getAsBoolean() == *R.getAsBoolean();
+ case Value::Number:
+ return *L.getAsNumber() == *R.getAsNumber();
+ case Value::String:
+ return *L.getAsString() == *R.getAsString();
+ case Value::Array:
+ return *L.getAsArray() == *R.getAsArray();
+ case Value::Object:
+ return *L.getAsObject() == *R.getAsObject();
+ }
+ llvm_unreachable("Unknown value kind");
+}
+
+namespace {
+// Simple recursive-descent JSON parser.
+class Parser {
+public:
+ Parser(StringRef JSON)
+ : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
+
+ bool checkUTF8() {
+ size_t ErrOffset;
+ if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
+ return true;
+ P = Start + ErrOffset; // For line/column calculation.
+ return parseError("Invalid UTF-8 sequence");
+ }
+
+ bool parseValue(Value &Out);
+
+ bool assertEnd() {
+ eatWhitespace();
+ if (P == End)
+ return true;
+ return parseError("Text after end of document");
+ }
+
+ Error takeError() {
+ assert(Err);
+ return std::move(*Err);
+ }
+
+private:
+ void eatWhitespace() {
+ while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
+ ++P;
+ }
+
+ // On invalid syntax, parseX() functions return false and set Err.
+ bool parseNumber(char First, Value &Out);
+ bool parseString(std::string &Out);
+ bool parseUnicode(std::string &Out);
+ bool parseError(const char *Msg); // always returns false
+
+ char next() { return P == End ? 0 : *P++; }
+ char peek() { return P == End ? 0 : *P; }
+ static bool isNumber(char C) {
+ return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
+ C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
+ C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
+ }
+
+ Optional<Error> Err;
+ const char *Start, *P, *End;
+};
+
+bool Parser::parseValue(Value &Out) {
+ eatWhitespace();
+ if (P == End)
+ return parseError("Unexpected EOF");
+ switch (char C = next()) {
+ // Bare null/true/false are easy - first char identifies them.
+ case 'n':
+ Out = nullptr;
+ return (next() == 'u' && next() == 'l' && next() == 'l') ||
+ parseError("Invalid JSON value (null?)");
+ case 't':
+ Out = true;
+ return (next() == 'r' && next() == 'u' && next() == 'e') ||
+ parseError("Invalid JSON value (true?)");
+ case 'f':
+ Out = false;
+ return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
+ parseError("Invalid JSON value (false?)");
+ case '"': {
+ std::string S;
+ if (parseString(S)) {
+ Out = std::move(S);
+ return true;
+ }
+ return false;
+ }
+ case '[': {
+ Out = Array{};
+ Array &A = *Out.getAsArray();
+ eatWhitespace();
+ if (peek() == ']') {
+ ++P;
+ return true;
+ }
+ for (;;) {
+ A.emplace_back(nullptr);
+ if (!parseValue(A.back()))
+ return false;
+ eatWhitespace();
+ switch (next()) {
+ case ',':
+ eatWhitespace();
+ continue;
+ case ']':
+ return true;
+ default:
+ return parseError("Expected , or ] after array element");
+ }
+ }
+ }
+ case '{': {
+ Out = Object{};
+ Object &O = *Out.getAsObject();
+ eatWhitespace();
+ if (peek() == '}') {
+ ++P;
+ return true;
+ }
+ for (;;) {
+ if (next() != '"')
+ return parseError("Expected object key");
+ std::string K;
+ if (!parseString(K))
+ return false;
+ eatWhitespace();
+ if (next() != ':')
+ return parseError("Expected : after object key");
+ eatWhitespace();
+ if (!parseValue(O[std::move(K)]))
+ return false;
+ eatWhitespace();
+ switch (next()) {
+ case ',':
+ eatWhitespace();
+ continue;
+ case '}':
+ return true;
+ default:
+ return parseError("Expected , or } after object property");
+ }
+ }
+ }
+ default:
+ if (isNumber(C))
+ return parseNumber(C, Out);
+ return parseError("Invalid JSON value");
+ }
+}
+
+bool Parser::parseNumber(char First, Value &Out) {
+ // Read the number into a string. (Must be null-terminated for strto*).
+ SmallString<24> S;
+ S.push_back(First);
+ while (isNumber(peek()))
+ S.push_back(next());
+ char *End;
+ // Try first to parse as integer, and if so preserve full 64 bits.
+ // strtoll returns long long >= 64 bits, so check it's in range too.
+ auto I = std::strtoll(S.c_str(), &End, 10);
+ if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
+ I <= std::numeric_limits<int64_t>::max()) {
+ Out = int64_t(I);
+ return true;
+ }
+ // If it's not an integer, fall back to parsing it as a double.
+ Out = std::strtod(S.c_str(), &End);
+ return End == S.end() || parseError("Invalid JSON value (number?)");
+}
+
+bool Parser::parseString(std::string &Out) {
+ // leading quote was already consumed.
+ for (char C = next(); C != '"'; C = next()) {
+ if (LLVM_UNLIKELY(P == End))
+ return parseError("Unterminated string");
+ if (LLVM_UNLIKELY((C & 0x1f) == C))
+ return parseError("Control character in string");
+ if (LLVM_LIKELY(C != '\\')) {
+ Out.push_back(C);
+ continue;
+ }
+ // Handle escape sequence.
+ switch (C = next()) {
+ case '"':
+ case '\\':
+ case '/':
+ Out.push_back(C);
+ break;
+ case 'b':
+ Out.push_back('\b');
+ break;
+ case 'f':
+ Out.push_back('\f');
+ break;
+ case 'n':
+ Out.push_back('\n');
+ break;
+ case 'r':
+ Out.push_back('\r');
+ break;
+ case 't':
+ Out.push_back('\t');
+ break;
+ case 'u':
+ if (!parseUnicode(Out))
+ return false;
+ break;
+ default:
+ return parseError("Invalid escape sequence");
+ }
+ }
+ return true;
+}
+
+static void encodeUtf8(uint32_t Rune, std::string &Out) {
+ if (Rune < 0x80) {
+ Out.push_back(Rune & 0x7F);
+ } else if (Rune < 0x800) {
+ uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
+ uint8_t SecondByte = 0x80 | (Rune & 0x3F);
+ Out.push_back(FirstByte);
+ Out.push_back(SecondByte);
+ } else if (Rune < 0x10000) {
+ uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
+ uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
+ uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
+ Out.push_back(FirstByte);
+ Out.push_back(SecondByte);
+ Out.push_back(ThirdByte);
+ } else if (Rune < 0x110000) {
+ uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
+ uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
+ uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
+ uint8_t FourthByte = 0x80 | (Rune & 0x3F);
+ Out.push_back(FirstByte);
+ Out.push_back(SecondByte);
+ Out.push_back(ThirdByte);
+ Out.push_back(FourthByte);
+ } else {
+ llvm_unreachable("Invalid codepoint");
+ }
+}
+
+// Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
+// May parse several sequential escapes to ensure proper surrogate handling.
+// We do not use ConvertUTF.h, as it can't accept and replace unpaired surrogates.
+// These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
+bool Parser::parseUnicode(std::string &Out) {
+ // Invalid UTF is not a JSON error (RFC 8259, section 8.2). It gets replaced by U+FFFD.
+ auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
+ // Decodes 4 hex digits from the stream into Out, returns false on error.
+ auto Parse4Hex = [this](uint16_t &Out) -> bool {
+ Out = 0;
+ char Bytes[] = {next(), next(), next(), next()};
+ for (unsigned char C : Bytes) {
+ if (!std::isxdigit(C))
+ return parseError("Invalid \\u escape sequence");
+ Out <<= 4;
+ Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
+ }
+ return true;
+ };
+ uint16_t First; // UTF-16 code unit from the first \u escape.
+ if (!Parse4Hex(First))
+ return false;
+
+ // We loop to allow proper surrogate-pair error handling.
+ while (true) {
+ // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
+ if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
+ encodeUtf8(First, Out);
+ return true;
+ }
+
+ // Case 2: it's an (unpaired) trailing surrogate.
+ if (LLVM_UNLIKELY(First >= 0xDC00)) {
+ Invalid();
+ return true;
+ }
+
+ // Case 3: it's a leading surrogate. We expect a trailing one next.
+ // Case 3a: there's no trailing \u escape. Don't advance in the stream.
+ if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
+ Invalid(); // Leading surrogate was unpaired.
+ return true;
+ }
+ P += 2;
+ uint16_t Second;
+ if (!Parse4Hex(Second))
+ return false;
+ // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
+ if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
+ Invalid(); // Leading surrogate was unpaired.
+ First = Second; // Second escape still needs to be processed.
+ continue;
+ }
+ // Case 3c: a valid surrogate pair encoding an astral codepoint.
+ encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
+ return true;
+ }
+}
+
+bool Parser::parseError(const char *Msg) {
+ int Line = 1;
+ const char *StartOfLine = Start;
+ for (const char *X = Start; X < P; ++X) {
+ if (*X == 0x0A) {
+ ++Line;
+ StartOfLine = X + 1;
+ }
+ }
+ Err.emplace(
+ llvm::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
+ return false;
+}
+} // namespace
+
+Expected<Value> parse(StringRef JSON) {
+ Parser P(JSON);
+ Value E = nullptr;
+ if (P.checkUTF8())
+ if (P.parseValue(E))
+ if (P.assertEnd())
+ return std::move(E);
+ return P.takeError();
+}
+char ParseError::ID = 0;
+
+static std::vector<const Object::value_type *> sortedElements(const Object &O) {
+ std::vector<const Object::value_type *> Elements;
+ for (const auto &E : O)
+ Elements.push_back(&E);
+ llvm::sort(Elements.begin(), Elements.end(),
+ [](const Object::value_type *L, const Object::value_type *R) {
+ return L->first < R->first;
+ });
+ return Elements;
+}
+
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
+ // Fast-path for ASCII, which is valid UTF-8.
+ if (LLVM_LIKELY(isASCII(S)))
+ return true;
+
+ const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
+ if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
+ return true;
+
+ if (ErrOffset)
+ *ErrOffset = Rest - Data;
+ return false;
+}
+
+std::string fixUTF8(llvm::StringRef S) {
+ // This isn't particularly efficient, but is only for error-recovery.
+ std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
+ const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
+ UTF32 *Out32 = Codepoints.data();
+ ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
+ lenientConversion);
+ Codepoints.resize(Out32 - Codepoints.data());
+ std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
+ const UTF32 *In32 = Codepoints.data();
+ UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
+ ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
+ strictConversion);
+ Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
+ return Res;
+}
+
+} // namespace json
+} // namespace llvm
+
+static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
+ OS << '\"';
+ for (unsigned char C : S) {
+ if (C == 0x22 || C == 0x5C)
+ OS << '\\';
+ if (C >= 0x20) {
+ OS << C;
+ continue;
+ }
+ OS << '\\';
+ switch (C) {
+ // A few characters are common enough to make short escapes worthwhile.
+ case '\t':
+ OS << 't';
+ break;
+ case '\n':
+ OS << 'n';
+ break;
+ case '\r':
+ OS << 'r';
+ break;
+ default:
+ OS << 'u';
+ llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
+ break;
+ }
+ }
+ OS << '\"';
+}
+
+enum IndenterAction {
+ Indent,
+ Outdent,
+ Newline,
+ Space,
+};
+
+// Prints JSON. The indenter can be used to control formatting.
+template <typename Indenter>
+void llvm::json::Value::print(raw_ostream &OS, const Indenter &I) const {
+ switch (Type) {
+ case T_Null:
+ OS << "null";
+ break;
+ case T_Boolean:
+ OS << (as<bool>() ? "true" : "false");
+ break;
+ case T_Double:
+ OS << format("%.*g", std::numeric_limits<double>::max_digits10,
+ as<double>());
+ break;
+ case T_Integer:
+ OS << as<int64_t>();
+ break;
+ case T_StringRef:
+ quote(OS, as<StringRef>());
+ break;
+ case T_String:
+ quote(OS, as<std::string>());
+ break;
+ case T_Object: {
+ bool Comma = false;
+ OS << '{';
+ I(Indent);
+ for (const auto *P : sortedElements(as<json::Object>())) {
+ if (Comma)
+ OS << ',';
+ Comma = true;
+ I(Newline);
+ quote(OS, P->first);
+ OS << ':';
+ I(Space);
+ P->second.print(OS, I);
+ }
+ I(Outdent);
+ if (Comma)
+ I(Newline);
+ OS << '}';
+ break;
+ }
+ case T_Array: {
+ bool Comma = false;
+ OS << '[';
+ I(Indent);
+ for (const auto &E : as<json::Array>()) {
+ if (Comma)
+ OS << ',';
+ Comma = true;
+ I(Newline);
+ E.print(OS, I);
+ }
+ I(Outdent);
+ if (Comma)
+ I(Newline);
+ OS << ']';
+ break;
+ }
+ }
+}
+
+void llvm::format_provider<llvm::json::Value>::format(
+ const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
+ if (Options.empty()) {
+ OS << E;
+ return;
+ }
+ unsigned IndentAmount = 0;
+ if (Options.getAsInteger(/*Radix=*/10, IndentAmount))
+ llvm_unreachable("json::Value format options should be an integer");
+ unsigned IndentLevel = 0;
+ E.print(OS, [&](IndenterAction A) {
+ switch (A) {
+ case Newline:
+ OS << '\n';
+ OS.indent(IndentLevel);
+ break;
+ case Space:
+ OS << ' ';
+ break;
+ case Indent:
+ IndentLevel += IndentAmount;
+ break;
+ case Outdent:
+ IndentLevel -= IndentAmount;
+ break;
+ };
+ });
+}
+
+llvm::raw_ostream &llvm::json::operator<<(raw_ostream &OS, const Value &E) {
+ E.print(OS, [](IndenterAction A) { /*ignore*/ });
+ return OS;
+}
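A sketch of the parse/inspect/print round trip this file provides (the input literal is illustrative):

  #include "llvm/Support/JSON.h"
  #include "llvm/Support/raw_ostream.h"

  void demo() {
    llvm::Expected<llvm::json::Value> V =
        llvm::json::parse(R"({"name": "widget", "count": 3})");
    if (!V) {
      llvm::errs() << llvm::toString(V.takeError()) << "\n";
      return;
    }
    if (const llvm::json::Object *O = V->getAsObject()) {
      if (llvm::Optional<llvm::StringRef> Name = O->getString("name"))
        llvm::outs() << *Name << "\n";
      if (llvm::Optional<int64_t> Count = O->getInteger("count"))
        llvm::outs() << *Count << "\n";
    }
    llvm::outs() << *V << "\n"; // compact form via operator<<
  }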
diff --git a/contrib/llvm/lib/Support/Locale.cpp b/contrib/llvm/lib/Support/Locale.cpp
index e24a28be4306..e57d377c9ab5 100644
--- a/contrib/llvm/lib/Support/Locale.cpp
+++ b/contrib/llvm/lib/Support/Locale.cpp
@@ -1,6 +1,5 @@
#include "llvm/Support/Locale.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Unicode.h"
namespace llvm {
@@ -8,7 +7,7 @@ namespace sys {
namespace locale {
int columnWidth(StringRef Text) {
-#if LLVM_ON_WIN32
+#if _WIN32
return Text.size();
#else
return llvm::sys::unicode::columnWidthUTF8(Text);
@@ -16,7 +15,7 @@ int columnWidth(StringRef Text) {
}
bool isPrint(int UCS) {
-#if LLVM_ON_WIN32
+#if _WIN32
// Restrict characters that we'll try to print to the lower part of ASCII
// except for the control characters (0x20 - 0x7E). In general one can not
// reliably output code points U+0080 and higher using narrow character C/C++
diff --git a/contrib/llvm/lib/Support/LockFileManager.cpp b/contrib/llvm/lib/Support/LockFileManager.cpp
index ec951f33a36a..77baf7ac4bdd 100644
--- a/contrib/llvm/lib/Support/LockFileManager.cpp
+++ b/contrib/llvm/lib/Support/LockFileManager.cpp
@@ -24,7 +24,7 @@
#include <sys/types.h>
#include <system_error>
#include <tuple>
-#if LLVM_ON_WIN32
+#if _WIN32
#include <windows.h>
#endif
#if LLVM_ON_UNIX
@@ -43,7 +43,7 @@
using namespace llvm;
-/// \brief Attempt to read the lock file with the given name, if it exists.
+/// Attempt to read the lock file with the given name, if it exists.
///
/// \param LockFileName The name of the lock file to read.
///
@@ -123,21 +123,33 @@ bool LockFileManager::processStillExecuting(StringRef HostID, int PID) {
namespace {
-/// An RAII helper object for cleanups.
-class RAIICleanup {
- std::function<void()> Fn;
- bool Canceled = false;
-
+/// An RAII helper object to ensure that the unique lock file is removed.
+///
+/// Ensures that if there is an error or a signal before we finish acquiring the
+/// lock, the unique file will be removed. And if we successfully take the lock,
+/// the signal handler is left in place so that signals while the lock is held
+/// will remove the unique lock file. The caller should ensure there is a
+/// matching call to sys::DontRemoveFileOnSignal when the lock is released.
+class RemoveUniqueLockFileOnSignal {
+ StringRef Filename;
+ bool RemoveImmediately;
public:
- RAIICleanup(std::function<void()> Fn) : Fn(Fn) {}
+ RemoveUniqueLockFileOnSignal(StringRef Name)
+ : Filename(Name), RemoveImmediately(true) {
+ sys::RemoveFileOnSignal(Filename, nullptr);
+ }
- ~RAIICleanup() {
- if (Canceled)
+ ~RemoveUniqueLockFileOnSignal() {
+ if (!RemoveImmediately) {
+ // Leave the signal handler enabled. It will be removed when the lock is
+ // released.
return;
- Fn();
+ }
+ sys::fs::remove(Filename);
+ sys::DontRemoveFileOnSignal(Filename);
}
- void cancel() { Canceled = true; }
+ void lockAcquired() { RemoveImmediately = false; }
};
} // end anonymous namespace
@@ -160,22 +172,16 @@ LockFileManager::LockFileManager(StringRef FileName)
return;
// Create a lock file that is unique to this instance.
- Expected<sys::fs::TempFile> Temp =
- sys::fs::TempFile::create(LockFileName + "-%%%%%%%%");
- if (!Temp) {
- std::error_code EC = errorToErrorCode(Temp.takeError());
- std::string S("failed to create unique file with prefix ");
- S.append(LockFileName.str());
+ UniqueLockFileName = LockFileName;
+ UniqueLockFileName += "-%%%%%%%%";
+ int UniqueLockFileID;
+ if (std::error_code EC = sys::fs::createUniqueFile(
+ UniqueLockFileName, UniqueLockFileID, UniqueLockFileName)) {
+ std::string S("failed to create unique file ");
+ S.append(UniqueLockFileName.str());
setError(EC, S);
return;
}
- UniqueLockFile = std::move(*Temp);
-
- // Make sure we discard the temporary file on exit.
- RAIICleanup RemoveTempFile([&]() {
- if (Error E = UniqueLockFile->discard())
- setError(errorToErrorCode(std::move(E)));
- });
// Write our process ID to our unique lock file.
{
@@ -185,46 +191,54 @@ LockFileManager::LockFileManager(StringRef FileName)
return;
}
- raw_fd_ostream Out(UniqueLockFile->FD, /*shouldClose=*/false);
+ raw_fd_ostream Out(UniqueLockFileID, /*shouldClose=*/true);
Out << HostID << ' ';
#if LLVM_ON_UNIX
Out << getpid();
#else
Out << "1";
#endif
- Out.flush();
+ Out.close();
if (Out.has_error()) {
// We failed to write out PID, so report the error, remove the
// unique lock file, and fail.
std::string S("failed to write to ");
- S.append(UniqueLockFile->TmpName);
+ S.append(UniqueLockFileName.str());
setError(Out.error(), S);
+ sys::fs::remove(UniqueLockFileName);
return;
}
}
+ // Clean up the unique file on signal, which also releases the lock if it is
+ // held since the .lock symlink will point to a nonexistent file.
+ RemoveUniqueLockFileOnSignal RemoveUniqueFile(UniqueLockFileName);
+
while (true) {
// Create a link from the lock file name. If this succeeds, we're done.
std::error_code EC =
- sys::fs::create_link(UniqueLockFile->TmpName, LockFileName);
+ sys::fs::create_link(UniqueLockFileName, LockFileName);
if (!EC) {
- RemoveTempFile.cancel();
+ RemoveUniqueFile.lockAcquired();
return;
}
if (EC != errc::file_exists) {
std::string S("failed to create link ");
raw_string_ostream OSS(S);
- OSS << LockFileName.str() << " to " << UniqueLockFile->TmpName;
+ OSS << LockFileName.str() << " to " << UniqueLockFileName.str();
setError(EC, OSS.str());
return;
}
// Someone else managed to create the lock file first. Read the process ID
// from the lock file.
- if ((Owner = readLockFile(LockFileName)))
- return; // RemoveTempFile will delete out our unique lock file.
+ if ((Owner = readLockFile(LockFileName))) {
+ // Wipe out our unique lock file (it's useless now)
+ sys::fs::remove(UniqueLockFileName);
+ return;
+ }
if (!sys::fs::exists(LockFileName)) {
// The previous owner released the lock file before we could read it.
@@ -236,7 +250,7 @@ LockFileManager::LockFileManager(StringRef FileName)
// ownership.
if ((EC = sys::fs::remove(LockFileName))) {
std::string S("failed to remove lockfile ");
- S.append(LockFileName.str());
+ S.append(UniqueLockFileName.str());
setError(EC, S);
return;
}
@@ -271,14 +285,17 @@ LockFileManager::~LockFileManager() {
// Since we own the lock, remove the lock file and our own unique lock file.
sys::fs::remove(LockFileName);
- consumeError(UniqueLockFile->discard());
+ sys::fs::remove(UniqueLockFileName);
+ // The unique file is now gone, so remove it from the signal handler. This
+ // matches a sys::RemoveFileOnSignal() in LockFileManager().
+ sys::DontRemoveFileOnSignal(UniqueLockFileName);
}
LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
if (getState() != LFS_Shared)
return Res_Success;
-#if LLVM_ON_WIN32
+#if _WIN32
unsigned long Interval = 1;
#else
struct timespec Interval;
@@ -293,7 +310,7 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
// finish up and remove the lock file.
// FIXME: Should we hook in to system APIs to get a notification when the
// lock file is deleted?
-#if LLVM_ON_WIN32
+#if _WIN32
Sleep(Interval);
#else
nanosleep(&Interval, nullptr);
@@ -312,7 +329,7 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
return Res_OwnerDied;
// Exponentially increase the time we wait for the lock to be removed.
-#if LLVM_ON_WIN32
+#if _WIN32
Interval *= 2;
#else
Interval.tv_sec *= 2;
@@ -323,7 +340,7 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
}
#endif
} while (
-#if LLVM_ON_WIN32
+#if _WIN32
Interval < MaxSeconds * 1000
#else
Interval.tv_sec < (time_t)MaxSeconds
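The external contract is unchanged by this rework; a sketch of the usual locking pattern around an output file (function and path are illustrative):

  #include "llvm/Support/LockFileManager.h"

  bool buildWithLock(llvm::StringRef OutputPath) {
    llvm::LockFileManager Lock(OutputPath);
    switch (Lock) {
    case llvm::LockFileManager::LFS_Owned:
      // We hold <OutputPath>.lock; produce the output, then let the
      // destructor release the lock.
      return true;
    case llvm::LockFileManager::LFS_Shared:
      // Another process owns the lock; wait for it to finish.
      return Lock.waitForUnlock() == llvm::LockFileManager::Res_Success;
    case llvm::LockFileManager::LFS_Error:
      return false;
    }
    return false;
  }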
diff --git a/contrib/llvm/lib/Support/MD5.cpp b/contrib/llvm/lib/Support/MD5.cpp
index a53172279236..9b02f62912fa 100644
--- a/contrib/llvm/lib/Support/MD5.cpp
+++ b/contrib/llvm/lib/Support/MD5.cpp
@@ -74,7 +74,7 @@
using namespace llvm;
-/// \brief This processes one or more 64-byte data blocks, but does NOT update
+/// This processes one or more 64-byte data blocks, but does NOT update
/// the bit counters. There are no alignment requirements.
const uint8_t *MD5::body(ArrayRef<uint8_t> Data) {
const uint8_t *ptr;
@@ -229,7 +229,7 @@ void MD5::update(StringRef Str) {
update(SVal);
}
-/// \brief Finish the hash and place the resulting hash into \p result.
+/// Finish the hash and place the resulting hash into \p result.
/// \param Result is assumed to be a minimum of 16-bytes in size.
void MD5::final(MD5Result &Result) {
unsigned long used, free;
diff --git a/contrib/llvm/lib/Support/ManagedStatic.cpp b/contrib/llvm/lib/Support/ManagedStatic.cpp
index fb7cd070c42d..1c884dc70fc9 100644
--- a/contrib/llvm/lib/Support/ManagedStatic.cpp
+++ b/contrib/llvm/lib/Support/ManagedStatic.cpp
@@ -28,9 +28,6 @@ static void initializeMutex() {
}
static sys::Mutex* getManagedStaticMutex() {
- // We need to use a function local static here, since this can get called
- // during a static constructor and we need to guarantee that it's initialized
- // correctly.
llvm::call_once(mutex_init_flag, initializeMutex);
return ManagedStaticMutex;
}
diff --git a/contrib/llvm/lib/Support/Memory.cpp b/contrib/llvm/lib/Support/Memory.cpp
index f9a4903ad015..c245eedd2c16 100644
--- a/contrib/llvm/lib/Support/Memory.cpp
+++ b/contrib/llvm/lib/Support/Memory.cpp
@@ -13,13 +13,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Memory.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Valgrind.h"
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Memory.inc"
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/Memory.inc"
#endif
diff --git a/contrib/llvm/lib/Support/MemoryBuffer.cpp b/contrib/llvm/lib/Support/MemoryBuffer.cpp
index c709fc416df6..4428c2f24e32 100644
--- a/contrib/llvm/lib/Support/MemoryBuffer.cpp
+++ b/contrib/llvm/lib/Support/MemoryBuffer.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
#include <cassert>
#include <cerrno>
#include <cstring>
@@ -139,15 +140,6 @@ MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) {
return nullptr;
}
-std::unique_ptr<MemoryBuffer>
-MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) {
- auto SB = WritableMemoryBuffer::getNewUninitMemBuffer(Size, BufferName);
- if (!SB)
- return nullptr;
- memset(SB->getBufferStart(), 0, Size);
- return std::move(SB);
-}
-
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize,
bool RequiresNullTerminator) {
@@ -171,7 +163,7 @@ MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize,
//===----------------------------------------------------------------------===//
namespace {
-/// \brief Memory maps a file descriptor using sys::fs::mapped_file_region.
+/// Memory maps a file descriptor using sys::fs::mapped_file_region.
///
/// This handles converting the offset into a legal offset on the platform.
template<typename MB>
@@ -193,10 +185,8 @@ class MemoryBufferMMapFile : public MB {
public:
MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len,
uint64_t Offset, std::error_code &EC)
- : MFR(FD,
- MB::Writable ? sys::fs::mapped_file_region::priv
- : sys::fs::mapped_file_region::readonly,
- getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) {
+ : MFR(FD, MB::Mapmode, getLegalMapSize(Len, Offset),
+ getLegalMapOffset(Offset), EC) {
if (!EC) {
const char *Start = getStart(Len, Offset);
MemoryBuffer::init(Start, Start + Len, RequiresNullTerminator);
@@ -226,7 +216,7 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) {
// Read into Buffer until we hit EOF.
do {
Buffer.reserve(Buffer.size() + ChunkSize);
- ReadBytes = sys::RetryAfterSignal(-1, read, FD, Buffer.end(), ChunkSize);
+ ReadBytes = sys::RetryAfterSignal(-1, ::read, FD, Buffer.end(), ChunkSize);
if (ReadBytes == -1)
return std::error_code(errno, std::generic_category());
Buffer.set_size(Buffer.size() + ReadBytes);
@@ -254,7 +244,7 @@ static ErrorOr<std::unique_ptr<MB>>
getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) {
int FD;
- std::error_code EC = sys::fs::openFileForRead(Filename, FD);
+ std::error_code EC = sys::fs::openFileForRead(Filename, FD, sys::fs::OF_None);
if (EC)
return EC;
@@ -306,6 +296,15 @@ WritableMemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName
return std::unique_ptr<WritableMemoryBuffer>(Ret);
}
+std::unique_ptr<WritableMemoryBuffer>
+WritableMemoryBuffer::getNewMemBuffer(size_t Size, const Twine &BufferName) {
+ auto SB = WritableMemoryBuffer::getNewUninitMemBuffer(Size, BufferName);
+ if (!SB)
+ return nullptr;
+ memset(SB->getBufferStart(), 0, Size);
+ return SB;
+}
+
static bool shouldUseMmap(int FD,
size_t FileSize,
size_t MapSize,
@@ -361,6 +360,59 @@ static bool shouldUseMmap(int FD,
return true;
}
+static ErrorOr<std::unique_ptr<WriteThroughMemoryBuffer>>
+getReadWriteFile(const Twine &Filename, uint64_t FileSize, uint64_t MapSize,
+ uint64_t Offset) {
+ int FD;
+ std::error_code EC = sys::fs::openFileForReadWrite(
+ Filename, FD, sys::fs::CD_OpenExisting, sys::fs::OF_None);
+
+ if (EC)
+ return EC;
+
+ // Default is to map the full file.
+ if (MapSize == uint64_t(-1)) {
+ // If we don't know the file size, use fstat to find out. fstat on an open
+ // file descriptor is cheaper than stat on a random path.
+ if (FileSize == uint64_t(-1)) {
+ sys::fs::file_status Status;
+ std::error_code EC = sys::fs::status(FD, Status);
+ if (EC)
+ return EC;
+
+ // If this is not a file or a block device (e.g. it's a named pipe
+ // or character device), we can't mmap it, so error out.
+ sys::fs::file_type Type = Status.type();
+ if (Type != sys::fs::file_type::regular_file &&
+ Type != sys::fs::file_type::block_file)
+ return make_error_code(errc::invalid_argument);
+
+ FileSize = Status.getSize();
+ }
+ MapSize = FileSize;
+ }
+
+ std::unique_ptr<WriteThroughMemoryBuffer> Result(
+ new (NamedBufferAlloc(Filename))
+ MemoryBufferMMapFile<WriteThroughMemoryBuffer>(false, FD, MapSize,
+ Offset, EC));
+ if (EC)
+ return EC;
+ return std::move(Result);
+}
+
+ErrorOr<std::unique_ptr<WriteThroughMemoryBuffer>>
+WriteThroughMemoryBuffer::getFile(const Twine &Filename, int64_t FileSize) {
+ return getReadWriteFile(Filename, FileSize, FileSize, 0);
+}
+
+/// Map a subrange of the specified file as a WritableMemoryBuffer.
+ErrorOr<std::unique_ptr<WriteThroughMemoryBuffer>>
+WriteThroughMemoryBuffer::getFileSlice(const Twine &Filename, uint64_t MapSize,
+ uint64_t Offset) {
+ return getReadWriteFile(Filename, -1, MapSize, Offset);
+}
+
template <typename MB>
static ErrorOr<std::unique_ptr<MB>>
getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
@@ -466,7 +518,7 @@ ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() {
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFileAsStream(const Twine &Filename) {
int FD;
- std::error_code EC = sys::fs::openFileForRead(Filename, FD);
+ std::error_code EC = sys::fs::openFileForRead(Filename, FD, sys::fs::OF_None);
if (EC)
return EC;
ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
@@ -480,3 +532,6 @@ MemoryBufferRef MemoryBuffer::getMemBufferRef() const {
StringRef Identifier = getBufferIdentifier();
return MemoryBufferRef(Data, Identifier);
}
+
+void MemoryBuffer::anchor() {}
+void SmallVectorMemoryBuffer::anchor() {}
diff --git a/contrib/llvm/lib/Support/Mutex.cpp b/contrib/llvm/lib/Support/Mutex.cpp
index b1d5e7c0d991..7138c7a4b984 100644
--- a/contrib/llvm/lib/Support/Mutex.cpp
+++ b/contrib/llvm/lib/Support/Mutex.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Mutex.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorHandling.h"
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only TRULY operating system
@@ -47,10 +47,7 @@ MutexImpl::MutexImpl( bool recursive)
{
// Declare the pthread_mutex data structures
pthread_mutex_t* mutex =
- static_cast<pthread_mutex_t*>(malloc(sizeof(pthread_mutex_t)));
-
- if (mutex == nullptr)
- report_bad_alloc_error("Mutex allocation failed");
+ static_cast<pthread_mutex_t*>(safe_malloc(sizeof(pthread_mutex_t)));
pthread_mutexattr_t attr;
@@ -119,9 +116,9 @@ MutexImpl::tryacquire()
#elif defined(LLVM_ON_UNIX)
#include "Unix/Mutex.inc"
-#elif defined( LLVM_ON_WIN32)
+#elif defined( _WIN32)
#include "Windows/Mutex.inc"
#else
-#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in Support/Mutex.cpp
+#warning Neither LLVM_ON_UNIX nor _WIN32 was set in Support/Mutex.cpp
#endif
#endif
diff --git a/contrib/llvm/lib/Support/NativeFormatting.cpp b/contrib/llvm/lib/Support/NativeFormatting.cpp
index b951a88a38db..85b4bfb81568 100644
--- a/contrib/llvm/lib/Support/NativeFormatting.cpp
+++ b/contrib/llvm/lib/Support/NativeFormatting.cpp
@@ -14,6 +14,8 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Format.h"
+#include <float.h>
+
using namespace llvm;
template<typename T, std::size_t N>
diff --git a/contrib/llvm/lib/Support/Parallel.cpp b/contrib/llvm/lib/Support/Parallel.cpp
index 010e42916f95..1844003b9d3d 100644
--- a/contrib/llvm/lib/Support/Parallel.cpp
+++ b/contrib/llvm/lib/Support/Parallel.cpp
@@ -9,6 +9,9 @@
#include "llvm/Support/Parallel.h"
#include "llvm/Config/llvm-config.h"
+
+#if LLVM_ENABLE_THREADS
+
#include "llvm/Support/Threading.h"
#include <atomic>
@@ -19,7 +22,7 @@ using namespace llvm;
namespace {
-/// \brief An abstract class that takes closures and runs them asynchronously.
+/// An abstract class that takes closures and runs them asynchronously.
class Executor {
public:
virtual ~Executor() = default;
@@ -28,19 +31,8 @@ public:
static Executor *getDefaultExecutor();
};
-#if !LLVM_ENABLE_THREADS
-class SyncExecutor : public Executor {
-public:
- virtual void add(std::function<void()> F) { F(); }
-};
-
-Executor *Executor::getDefaultExecutor() {
- static SyncExecutor Exec;
- return &Exec;
-}
-
-#elif defined(_MSC_VER)
-/// \brief An Executor that runs tasks via ConcRT.
+#if defined(_MSC_VER)
+/// An Executor that runs tasks via ConcRT.
class ConcRTExecutor : public Executor {
struct Taskish {
Taskish(std::function<void()> Task) : Task(Task) {}
@@ -67,7 +59,7 @@ Executor *Executor::getDefaultExecutor() {
}
#else
-/// \brief An implementation of an Executor that runs closures on a thread pool
+/// An implementation of an Executor that runs closures on a thread pool
/// in filo order.
class ThreadPoolExecutor : public Executor {
public:
@@ -127,7 +119,6 @@ Executor *Executor::getDefaultExecutor() {
#endif
}
-#if LLVM_ENABLE_THREADS
void parallel::detail::TaskGroup::spawn(std::function<void()> F) {
L.inc();
Executor::getDefaultExecutor()->add([&, F] {
@@ -135,4 +126,4 @@ void parallel::detail::TaskGroup::spawn(std::function<void()> F) {
L.dec();
});
}
-#endif
+#endif // LLVM_ENABLE_THREADS
diff --git a/contrib/llvm/lib/Support/Path.cpp b/contrib/llvm/lib/Support/Path.cpp
index f229f23a4f84..a806da23ec50 100644
--- a/contrib/llvm/lib/Support/Path.cpp
+++ b/contrib/llvm/lib/Support/Path.cpp
@@ -13,6 +13,7 @@
#include "llvm/Support/Path.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
@@ -37,7 +38,7 @@ namespace {
using llvm::sys::path::Style;
inline Style real_style(Style style) {
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
return (style == Style::posix) ? Style::posix : Style::windows;
#else
return (style == Style::windows) ? Style::windows : Style::posix;
@@ -90,10 +91,9 @@ namespace {
return path.substr(0, end);
}
+ // Returns the position of the first character of the filename in str. For
+ // '/', it returns the position of the '/'.
size_t filename_pos(StringRef str, Style style) {
- if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
- return 0;
-
if (str.size() > 0 && is_separator(str[str.size() - 1], style))
return str.size() - 1;
@@ -110,6 +110,8 @@ namespace {
return pos + 1;
}
+ // Returns the position of the root directory in str. If there is no root
+ // directory in str, it returns StringRef::npos.
size_t root_dir_start(StringRef str, Style style) {
// case "c:/"
if (real_style(style) == Style::windows) {
@@ -117,10 +119,6 @@ namespace {
return 2;
}
- // case "//"
- if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
- return StringRef::npos;
-
// case "//net"
if (str.size() > 3 && is_separator(str[0], style) && str[0] == str[1] &&
!is_separator(str[2], style)) {
@@ -134,22 +132,29 @@ namespace {
return StringRef::npos;
}
+ // Returns the position past the end of the "parent path" of path. The parent
+ // path will not end in '/', unless the parent is the root directory. If the
+ // path has no parent, 0 is returned.
size_t parent_path_end(StringRef path, Style style) {
size_t end_pos = filename_pos(path, style);
bool filename_was_sep =
path.size() > 0 && is_separator(path[end_pos], style);
- // Skip separators except for root dir.
- size_t root_dir_pos = root_dir_start(path.substr(0, end_pos), style);
-
- while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
+ // Skip separators until we reach root dir (or the start of the string).
+ size_t root_dir_pos = root_dir_start(path, style);
+ while (end_pos > 0 &&
+ (root_dir_pos == StringRef::npos || end_pos > root_dir_pos) &&
is_separator(path[end_pos - 1], style))
--end_pos;
- if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
- return StringRef::npos;
+ if (end_pos == root_dir_pos && !filename_was_sep) {
+ // We've reached the root dir and the input path was *not* ending in a
+ // sequence of slashes. Include the root dir in the parent path.
+ return root_dir_pos + 1;
+ }
+ // Otherwise, just include before the last slash.
return end_pos;
}
} // end unnamed namespace
@@ -164,7 +169,7 @@ static std::error_code
createUniqueEntity(const Twine &Model, int &ResultFD,
SmallVectorImpl<char> &ResultPath, bool MakeAbsolute,
unsigned Mode, FSEntity Type,
- sys::fs::OpenFlags Flags = sys::fs::F_None) {
+ sys::fs::OpenFlags Flags = sys::fs::OF_None) {
SmallString<128> ModelStorage;
Model.toVector(ModelStorage);
@@ -196,8 +201,8 @@ retry_random_path:
switch (Type) {
case FS_File: {
if (std::error_code EC =
- sys::fs::openFileForWrite(Twine(ResultPath.begin()), ResultFD,
- Flags | sys::fs::F_Excl, Mode)) {
+ sys::fs::openFileForReadWrite(Twine(ResultPath.begin()), ResultFD,
+ sys::fs::CD_CreateNew, Flags, Mode)) {
if (EC == errc::file_exists)
goto retry_random_path;
return EC;
@@ -281,8 +286,8 @@ const_iterator &const_iterator::operator++() {
++Position;
}
- // Treat trailing '/' as a '.'.
- if (Position == Path.size()) {
+ // Treat trailing '/' as a '.', unless it is the root dir.
+ if (Position == Path.size() && Component != "/") {
--Position;
Component = ".";
return *this;
@@ -321,23 +326,23 @@ reverse_iterator rend(StringRef Path) {
}
reverse_iterator &reverse_iterator::operator++() {
- // If we're at the end and the previous char was a '/', return '.' unless
- // we are the root path.
size_t root_dir_pos = root_dir_start(Path, S);
- if (Position == Path.size() && Path.size() > root_dir_pos + 1 &&
- is_separator(Path[Position - 1], S)) {
- --Position;
- Component = ".";
- return *this;
- }
// Skip separators unless it's the root directory.
size_t end_pos = Position;
-
while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
is_separator(Path[end_pos - 1], S))
--end_pos;
+ // Treat trailing '/' as a '.', unless it is the root dir.
+ if (Position == Path.size() && !Path.empty() &&
+ is_separator(Path.back(), S) &&
+ (root_dir_pos == StringRef::npos || end_pos - 1 > root_dir_pos)) {
+ --Position;
+ Component = ".";
+ return *this;
+ }
+
// Find next separator.
size_t start_pos = filename_pos(Path.substr(0, end_pos), S);
Component = Path.slice(start_pos, end_pos);
@@ -751,51 +756,64 @@ std::error_code getUniqueID(const Twine Path, UniqueID &Result) {
std::error_code createUniqueFile(const Twine &Model, int &ResultFd,
SmallVectorImpl<char> &ResultPath,
- unsigned Mode, sys::fs::OpenFlags Flags) {
+ unsigned Mode) {
+ return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File);
+}
+
+static std::error_code createUniqueFile(const Twine &Model, int &ResultFd,
+ SmallVectorImpl<char> &ResultPath,
+ unsigned Mode, OpenFlags Flags) {
return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File,
Flags);
}
std::error_code createUniqueFile(const Twine &Model,
- SmallVectorImpl<char> &ResultPath) {
- int Dummy;
- return createUniqueEntity(Model, Dummy, ResultPath, false, 0, FS_Name);
+ SmallVectorImpl<char> &ResultPath,
+ unsigned Mode) {
+ int FD;
+ auto EC = createUniqueFile(Model, FD, ResultPath, Mode);
+ if (EC)
+ return EC;
+ // FD is only needed to avoid race conditions. Close it right away.
+ close(FD);
+ return EC;
}
static std::error_code
createTemporaryFile(const Twine &Model, int &ResultFD,
- llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type,
- sys::fs::OpenFlags Flags) {
+ llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
SmallString<128> Storage;
StringRef P = Model.toNullTerminatedStringRef(Storage);
assert(P.find_first_of(separators(Style::native)) == StringRef::npos &&
"Model must be a simple filename.");
// Use P.begin() so that createUniqueEntity doesn't need to recreate Storage.
return createUniqueEntity(P.begin(), ResultFD, ResultPath, true,
- owner_read | owner_write, Type, Flags);
+ owner_read | owner_write, Type);
}
static std::error_code
createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD,
- llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type,
- sys::fs::OpenFlags Flags = sys::fs::F_None) {
+ llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
const char *Middle = Suffix.empty() ? "-%%%%%%" : "-%%%%%%.";
return createTemporaryFile(Prefix + Middle + Suffix, ResultFD, ResultPath,
- Type, Flags);
+ Type);
}
std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
int &ResultFD,
- SmallVectorImpl<char> &ResultPath,
- sys::fs::OpenFlags Flags) {
- return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File,
- Flags);
+ SmallVectorImpl<char> &ResultPath) {
+ return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File);
}
std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
SmallVectorImpl<char> &ResultPath) {
- int Dummy;
- return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name);
+ int FD;
+ auto EC = createTemporaryFile(Prefix, Suffix, FD, ResultPath);
+ if (EC)
+ return EC;
+ // FD is only needed to avoid race conditions. Close it right away.
+ close(FD);
+ return EC;
}
@@ -804,8 +822,22 @@ std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
std::error_code createUniqueDirectory(const Twine &Prefix,
SmallVectorImpl<char> &ResultPath) {
int Dummy;
- return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath,
- true, 0, FS_Dir);
+ return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath, true, 0,
+ FS_Dir);
+}
+
+std::error_code
+getPotentiallyUniqueFileName(const Twine &Model,
+ SmallVectorImpl<char> &ResultPath) {
+ int Dummy;
+ return createUniqueEntity(Model, Dummy, ResultPath, false, 0, FS_Name);
+}
+
+std::error_code
+getPotentiallyUniqueTempFileName(const Twine &Prefix, StringRef Suffix,
+ SmallVectorImpl<char> &ResultPath) {
+ int Dummy;
+ return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name);
}
static std::error_code make_absolute(const Twine &current_directory,
@@ -895,15 +927,7 @@ std::error_code create_directories(const Twine &Path, bool IgnoreExisting,
return create_directory(P, IgnoreExisting, Perms);
}
-std::error_code copy_file(const Twine &From, const Twine &To) {
- int ReadFD, WriteFD;
- if (std::error_code EC = openFileForRead(From, ReadFD))
- return EC;
- if (std::error_code EC = openFileForWrite(To, WriteFD, F_None)) {
- close(ReadFD);
- return EC;
- }
-
+static std::error_code copy_file_internal(int ReadFD, int WriteFD) {
const size_t BufSize = 4096;
char *Buf = new char[BufSize];
int BytesRead = 0, BytesWritten = 0;
@@ -920,8 +944,6 @@ std::error_code copy_file(const Twine &From, const Twine &To) {
if (BytesWritten < 0)
break;
}
- close(ReadFD);
- close(WriteFD);
delete[] Buf;
if (BytesRead < 0 || BytesWritten < 0)
@@ -929,6 +951,36 @@ std::error_code copy_file(const Twine &From, const Twine &To) {
return std::error_code();
}
+std::error_code copy_file(const Twine &From, const Twine &To) {
+ int ReadFD, WriteFD;
+ if (std::error_code EC = openFileForRead(From, ReadFD, OF_None))
+ return EC;
+ if (std::error_code EC =
+ openFileForWrite(To, WriteFD, CD_CreateAlways, OF_None)) {
+ close(ReadFD);
+ return EC;
+ }
+
+ std::error_code EC = copy_file_internal(ReadFD, WriteFD);
+
+ close(ReadFD);
+ close(WriteFD);
+
+ return EC;
+}
+
+std::error_code copy_file(const Twine &From, int ToFD) {
+ int ReadFD;
+ if (std::error_code EC = openFileForRead(From, ReadFD, OF_None))
+ return EC;
+
+ std::error_code EC = copy_file_internal(ReadFD, ToFD);
+
+ close(ReadFD);
+
+ return EC;
+}
+
ErrorOr<MD5::MD5Result> md5_contents(int FD) {
MD5 Hash;
@@ -951,7 +1003,7 @@ ErrorOr<MD5::MD5Result> md5_contents(int FD) {
ErrorOr<MD5::MD5Result> md5_contents(const Twine &Path) {
int FD;
- if (auto EC = openFileForRead(Path, FD))
+ if (auto EC = openFileForRead(Path, FD, OF_None))
return EC;
auto Result = md5_contents(FD);
@@ -1048,7 +1100,7 @@ ErrorOr<perms> getPermissions(const Twine &Path) {
#if defined(LLVM_ON_UNIX)
#include "Unix/Path.inc"
#endif
-#if defined(LLVM_ON_WIN32)
+#if defined(_WIN32)
#include "Windows/Path.inc"
#endif
@@ -1070,7 +1122,7 @@ Error TempFile::discard() {
Done = true;
std::error_code RemoveEC;
// On windows closing will remove the file.
-#ifndef LLVM_ON_WIN32
+#ifndef _WIN32
// Always try to close and remove.
if (!TmpName.empty()) {
RemoveEC = fs::remove(TmpName);
@@ -1094,14 +1146,15 @@ Error TempFile::keep(const Twine &Name) {
assert(!Done);
Done = true;
// Always try to close and rename.
-#ifdef LLVM_ON_WIN32
- // If we cant't cancel the delete don't rename.
- std::error_code RenameEC = cancelDeleteOnClose(FD);
+#ifdef _WIN32
+ // If we can't cancel the delete don't rename.
+ auto H = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+ std::error_code RenameEC = setDeleteDisposition(H, false);
if (!RenameEC)
RenameEC = rename_fd(FD, Name);
// If we can't rename, discard the temporary file.
if (RenameEC)
- removeFD(FD);
+ setDeleteDisposition(H, true);
#else
std::error_code RenameEC = fs::rename(TmpName, Name);
// If we can't rename, discard the temporary file.
@@ -1126,8 +1179,9 @@ Error TempFile::keep() {
assert(!Done);
Done = true;
-#ifdef LLVM_ON_WIN32
- if (std::error_code EC = cancelDeleteOnClose(FD))
+#ifdef _WIN32
+ auto H = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+ if (std::error_code EC = setDeleteDisposition(H, false))
return errorCodeToError(EC);
#else
sys::DontRemoveFileOnSignal(TmpName);
@@ -1147,12 +1201,12 @@ Error TempFile::keep() {
Expected<TempFile> TempFile::create(const Twine &Model, unsigned Mode) {
int FD;
SmallString<128> ResultPath;
- if (std::error_code EC = createUniqueFile(Model, FD, ResultPath, Mode,
- sys::fs::F_RW | sys::fs::F_Delete))
+ if (std::error_code EC =
+ createUniqueFile(Model, FD, ResultPath, Mode, OF_Delete))
return errorCodeToError(EC);
TempFile Ret(ResultPath, FD);
-#ifndef LLVM_ON_WIN32
+#ifndef _WIN32
if (sys::RemoveFileOnSignal(ResultPath)) {
// Make sure we delete the file when RemoveFileOnSignal fails.
consumeError(Ret.discard());
diff --git a/contrib/llvm/lib/Support/PrettyStackTrace.cpp b/contrib/llvm/lib/Support/PrettyStackTrace.cpp
index a18e9cc50040..f5b6e6f3652d 100644
--- a/contrib/llvm/lib/Support/PrettyStackTrace.cpp
+++ b/contrib/llvm/lib/Support/PrettyStackTrace.cpp
@@ -88,7 +88,11 @@ extern "C" {
CRASH_REPORTER_CLIENT_HIDDEN
struct crashreporter_annotations_t gCRAnnotations
__attribute__((section("__DATA," CRASHREPORTER_ANNOTATIONS_SECTION)))
+#if CRASHREPORTER_ANNOTATIONS_VERSION < 5
= { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0, 0, 0 };
+#else
+ = { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0, 0, 0, 0 };
+#endif
}
#elif defined(__APPLE__) && HAVE_CRASHREPORTER_INFO
extern "C" const char *__crashreporter_info__
@@ -114,9 +118,9 @@ static void CrashHandler(void *) {
if (!TmpStr.empty()) {
#ifdef HAVE_CRASHREPORTERCLIENT_H
// Cast to void to avoid warning.
- (void)CRSetCrashLogMessage(std::string(TmpStr.str()).c_str());
+ (void)CRSetCrashLogMessage(TmpStr.c_str());
#elif HAVE_CRASHREPORTER_INFO
- __crashreporter_info__ = strdup(std::string(TmpStr.str()).c_str());
+ __crashreporter_info__ = strdup(TmpStr.c_str());
#endif
errs() << TmpStr.str();
}
diff --git a/contrib/llvm/lib/Support/Process.cpp b/contrib/llvm/lib/Support/Process.cpp
index 1c8cc6e83ad1..3f5a9d722ca0 100644
--- a/contrib/llvm/lib/Support/Process.cpp
+++ b/contrib/llvm/lib/Support/Process.cpp
@@ -14,7 +14,7 @@
#include "llvm/Support/Process.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
@@ -93,6 +93,6 @@ bool Process::AreCoreFilesPrevented() {
#ifdef LLVM_ON_UNIX
#include "Unix/Process.inc"
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/Process.inc"
#endif
diff --git a/contrib/llvm/lib/Support/Program.cpp b/contrib/llvm/lib/Support/Program.cpp
index 4212323bc0e1..63cdcdaabee9 100644
--- a/contrib/llvm/lib/Support/Program.cpp
+++ b/contrib/llvm/lib/Support/Program.cpp
@@ -13,7 +13,7 @@
#include "llvm/Support/Program.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include <system_error>
using namespace llvm;
using namespace sys;
@@ -23,17 +23,19 @@ using namespace sys;
//=== independent code.
//===----------------------------------------------------------------------===//
-static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
- const char **Env, ArrayRef<Optional<StringRef>> Redirects,
+static bool Execute(ProcessInfo &PI, StringRef Program,
+ ArrayRef<StringRef> Args, Optional<ArrayRef<StringRef>> Env,
+ ArrayRef<Optional<StringRef>> Redirects,
unsigned MemoryLimit, std::string *ErrMsg);
-int sys::ExecuteAndWait(StringRef Program, const char **Args, const char **Envp,
+int sys::ExecuteAndWait(StringRef Program, ArrayRef<StringRef> Args,
+ Optional<ArrayRef<StringRef>> Env,
ArrayRef<Optional<StringRef>> Redirects,
unsigned SecondsToWait, unsigned MemoryLimit,
std::string *ErrMsg, bool *ExecutionFailed) {
assert(Redirects.empty() || Redirects.size() == 3);
ProcessInfo PI;
- if (Execute(PI, Program, Args, Envp, Redirects, MemoryLimit, ErrMsg)) {
+ if (Execute(PI, Program, Args, Env, Redirects, MemoryLimit, ErrMsg)) {
if (ExecutionFailed)
*ExecutionFailed = false;
ProcessInfo Result = Wait(
@@ -47,8 +49,8 @@ int sys::ExecuteAndWait(StringRef Program, const char **Args, const char **Envp,
return -1;
}
-ProcessInfo sys::ExecuteNoWait(StringRef Program, const char **Args,
- const char **Envp,
+ProcessInfo sys::ExecuteNoWait(StringRef Program, ArrayRef<StringRef> Args,
+ Optional<ArrayRef<StringRef>> Env,
ArrayRef<Optional<StringRef>> Redirects,
unsigned MemoryLimit, std::string *ErrMsg,
bool *ExecutionFailed) {
@@ -56,17 +58,26 @@ ProcessInfo sys::ExecuteNoWait(StringRef Program, const char **Args,
ProcessInfo PI;
if (ExecutionFailed)
*ExecutionFailed = false;
- if (!Execute(PI, Program, Args, Envp, Redirects, MemoryLimit, ErrMsg))
+ if (!Execute(PI, Program, Args, Env, Redirects, MemoryLimit, ErrMsg))
if (ExecutionFailed)
*ExecutionFailed = true;
return PI;
}
+bool sys::commandLineFitsWithinSystemLimits(StringRef Program,
+ ArrayRef<const char *> Args) {
+ SmallVector<StringRef, 8> StringRefArgs;
+ StringRefArgs.reserve(Args.size());
+ for (const char *A : Args)
+ StringRefArgs.emplace_back(A);
+ return commandLineFitsWithinSystemLimits(Program, StringRefArgs);
+}
+
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Program.inc"
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/Program.inc"
#endif
diff --git a/contrib/llvm/lib/Support/RWMutex.cpp b/contrib/llvm/lib/Support/RWMutex.cpp
index 83c6d1d52b4c..8b6d74e49f31 100644
--- a/contrib/llvm/lib/Support/RWMutex.cpp
+++ b/contrib/llvm/lib/Support/RWMutex.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/RWMutex.h"
#include "llvm/Config/config.h"
@@ -49,7 +50,7 @@ RWMutexImpl::RWMutexImpl()
{
// Declare the pthread_rwlock data structures
pthread_rwlock_t* rwlock =
- static_cast<pthread_rwlock_t*>(malloc(sizeof(pthread_rwlock_t)));
+ static_cast<pthread_rwlock_t*>(safe_malloc(sizeof(pthread_rwlock_t)));
#ifdef __APPLE__
// Workaround a bug/mis-feature in Darwin's pthread_rwlock_init.
@@ -116,9 +117,9 @@ RWMutexImpl::writer_release()
#elif defined(LLVM_ON_UNIX)
#include "Unix/RWMutex.inc"
-#elif defined( LLVM_ON_WIN32)
+#elif defined( _WIN32)
#include "Windows/RWMutex.inc"
#else
-#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in Support/Mutex.cpp
+#warning Neither LLVM_ON_UNIX nor _WIN32 was set in Support/Mutex.cpp
#endif
#endif
diff --git a/contrib/llvm/lib/Support/RandomNumberGenerator.cpp b/contrib/llvm/lib/Support/RandomNumberGenerator.cpp
index 47d20159200b..f1f22af82a81 100644
--- a/contrib/llvm/lib/Support/RandomNumberGenerator.cpp
+++ b/contrib/llvm/lib/Support/RandomNumberGenerator.cpp
@@ -17,7 +17,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/WindowsSupport.h"
#else
#include "Unix/Unix.h"
@@ -36,10 +36,8 @@ static cl::opt<unsigned long long>
cl::desc("Seed for the random number generator"), cl::init(0));
RandomNumberGenerator::RandomNumberGenerator(StringRef Salt) {
- DEBUG(
- if (Seed == 0)
- dbgs() << "Warning! Using unseeded random number generator.\n"
- );
+ LLVM_DEBUG(if (Seed == 0) dbgs()
+ << "Warning! Using unseeded random number generator.\n");
// Combine seed and salts using std::seed_seq.
// Data: Seed-low, Seed-high, Salt
@@ -63,7 +61,7 @@ RandomNumberGenerator::result_type RandomNumberGenerator::operator()() {
// Get random vector of specified size
std::error_code llvm::getRandomBytes(void *Buffer, size_t Size) {
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
HCRYPTPROV hProvider;
if (CryptAcquireContext(&hProvider, 0, 0, PROV_RSA_FULL,
CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) {
diff --git a/contrib/llvm/lib/Support/Regex.cpp b/contrib/llvm/lib/Support/Regex.cpp
index b1087fd8853c..48caab131526 100644
--- a/contrib/llvm/lib/Support/Regex.cpp
+++ b/contrib/llvm/lib/Support/Regex.cpp
@@ -12,11 +12,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Regex.h"
-#include "regex_impl.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include <string>
+
+// Important this comes last because it defines "_REGEX_H_". At least on
+// Darwin, if included before any header that (transitively) includes
+// xlocale.h, this will cause trouble, because of missing regex-related types.
+#include "regex_impl.h"
+
using namespace llvm;
Regex::Regex() : preg(nullptr), error(REG_BADPAT) {}
@@ -25,7 +30,7 @@ Regex::Regex(StringRef regex, unsigned Flags) {
unsigned flags = 0;
preg = new llvm_regex();
preg->re_endp = regex.end();
- if (Flags & IgnoreCase)
+ if (Flags & IgnoreCase)
flags |= REG_ICASE;
if (Flags & Newline)
flags |= REG_NEWLINE;
@@ -51,9 +56,9 @@ Regex::~Regex() {
bool Regex::isValid(std::string &Error) const {
if (!error)
return true;
-
+
size_t len = llvm_regerror(error, preg, nullptr, 0);
-
+
Error.resize(len - 1);
llvm_regerror(error, preg, &Error[0], len);
return false;
@@ -91,7 +96,7 @@ bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
if (Matches) { // match position requested
Matches->clear();
-
+
for (unsigned i = 0; i != nmatch; ++i) {
if (pm[i].rm_so == -1) {
// this group didn't match
diff --git a/contrib/llvm/lib/Support/SHA1.cpp b/contrib/llvm/lib/Support/SHA1.cpp
index 20f41c5ff447..3007a78d5e22 100644
--- a/contrib/llvm/lib/Support/SHA1.cpp
+++ b/contrib/llvm/lib/Support/SHA1.cpp
@@ -1,4 +1,4 @@
-//======- SHA1.h - Private copy of the SHA1 implementation ---*- C++ -* ======//
+//====- SHA1.cpp - Private copy of the SHA1 implementation ---*- C++ -* ======//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/lib/Support/Signals.cpp b/contrib/llvm/lib/Support/Signals.cpp
index 661f4d649cdd..6534ff69b84c 100644
--- a/contrib/llvm/lib/Support/Signals.cpp
+++ b/contrib/llvm/lib/Support/Signals.cpp
@@ -15,7 +15,7 @@
#include "llvm/Support/Signals.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
@@ -36,19 +36,55 @@
using namespace llvm;
-static cl::opt<bool>
+// Use explicit storage to avoid accessing cl::opt in a signal handler.
+static bool DisableSymbolicationFlag = false;
+static cl::opt<bool, true>
DisableSymbolication("disable-symbolication",
cl::desc("Disable symbolizing crash backtraces."),
- cl::init(false), cl::Hidden);
-
-static ManagedStatic<std::vector<std::pair<void (*)(void *), void *>>>
- CallBacksToRun;
+ cl::location(DisableSymbolicationFlag), cl::Hidden);
+
+// Callbacks to run in signal handler must be lock-free because a signal handler
+// could be running as we add new callbacks. We don't add unbounded numbers of
+// callbacks, an array is therefore sufficient.
+struct CallbackAndCookie {
+ sys::SignalHandlerCallback Callback;
+ void *Cookie;
+ enum class Status { Empty, Initializing, Initialized, Executing };
+ std::atomic<Status> Flag;
+};
+static constexpr size_t MaxSignalHandlerCallbacks = 8;
+static CallbackAndCookie CallBacksToRun[MaxSignalHandlerCallbacks];
+
+// Signal-safe.
void sys::RunSignalHandlers() {
- if (!CallBacksToRun.isConstructed())
+ for (size_t I = 0; I < MaxSignalHandlerCallbacks; ++I) {
+ auto &RunMe = CallBacksToRun[I];
+ auto Expected = CallbackAndCookie::Status::Initialized;
+ auto Desired = CallbackAndCookie::Status::Executing;
+ if (!RunMe.Flag.compare_exchange_strong(Expected, Desired))
+ continue;
+ (*RunMe.Callback)(RunMe.Cookie);
+ RunMe.Callback = nullptr;
+ RunMe.Cookie = nullptr;
+ RunMe.Flag.store(CallbackAndCookie::Status::Empty);
+ }
+}
+
+// Signal-safe.
+static void insertSignalHandler(sys::SignalHandlerCallback FnPtr,
+ void *Cookie) {
+ for (size_t I = 0; I < MaxSignalHandlerCallbacks; ++I) {
+ auto &SetMe = CallBacksToRun[I];
+ auto Expected = CallbackAndCookie::Status::Empty;
+ auto Desired = CallbackAndCookie::Status::Initializing;
+ if (!SetMe.Flag.compare_exchange_strong(Expected, Desired))
+ continue;
+ SetMe.Callback = FnPtr;
+ SetMe.Cookie = Cookie;
+ SetMe.Flag.store(CallbackAndCookie::Status::Initialized);
return;
- for (auto &I : *CallBacksToRun)
- I.first(I.second);
- CallBacksToRun->clear();
+ }
+ report_fatal_error("too many signal callbacks already registered");
}
static bool findModulesAndOffsets(void **StackTrace, int Depth,
@@ -64,16 +100,11 @@ static FormattedNumber format_ptr(void *PC) {
return format_hex((uint64_t)PC, PtrWidth);
}
-static bool printSymbolizedStackTrace(StringRef Argv0,
- void **StackTrace, int Depth,
- llvm::raw_ostream &OS)
- LLVM_ATTRIBUTE_USED;
-
/// Helper that launches llvm-symbolizer and symbolizes a backtrace.
-static bool printSymbolizedStackTrace(StringRef Argv0,
- void **StackTrace, int Depth,
- llvm::raw_ostream &OS) {
- if (DisableSymbolication)
+LLVM_ATTRIBUTE_USED
+static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
+ int Depth, llvm::raw_ostream &OS) {
+ if (DisableSymbolicationFlag)
return false;
// Don't recursively invoke the llvm-symbolizer binary.
@@ -123,17 +154,18 @@ static bool printSymbolizedStackTrace(StringRef Argv0,
}
}
- Optional<StringRef> Redirects[] = {InputFile.str(), OutputFile.str(), llvm::None};
- const char *Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining",
-#ifdef LLVM_ON_WIN32
- // Pass --relative-address on Windows so that we don't
- // have to add ImageBase from PE file.
- // FIXME: Make this the default for llvm-symbolizer.
- "--relative-address",
+ Optional<StringRef> Redirects[] = {StringRef(InputFile),
+ StringRef(OutputFile), llvm::None};
+ StringRef Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining",
+#ifdef _WIN32
+ // Pass --relative-address on Windows so that we don't
+ // have to add ImageBase from PE file.
+ // FIXME: Make this the default for llvm-symbolizer.
+ "--relative-address",
#endif
- "--demangle", nullptr};
+ "--demangle"};
int RunResult =
- sys::ExecuteAndWait(LLVMSymbolizerPath, Args, nullptr, Redirects);
+ sys::ExecuteAndWait(LLVMSymbolizerPath, Args, None, Redirects);
if (RunResult != 0)
return false;
@@ -180,6 +212,6 @@ static bool printSymbolizedStackTrace(StringRef Argv0,
#ifdef LLVM_ON_UNIX
#include "Unix/Signals.inc"
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/Signals.inc"
#endif
diff --git a/contrib/llvm/lib/Support/SmallPtrSet.cpp b/contrib/llvm/lib/Support/SmallPtrSet.cpp
index 119bb871d4c0..fed4a17d6635 100644
--- a/contrib/llvm/lib/Support/SmallPtrSet.cpp
+++ b/contrib/llvm/lib/Support/SmallPtrSet.cpp
@@ -32,9 +32,7 @@ void SmallPtrSetImplBase::shrink_and_clear() {
NumNonEmpty = NumTombstones = 0;
// Install the new array. Clear all the buckets to empty.
- CurArray = (const void**)malloc(sizeof(void*) * CurArraySize);
- if (CurArray == nullptr)
- report_bad_alloc_error("Allocation of SmallPtrSet bucket array failed.");
+ CurArray = (const void**)safe_malloc(sizeof(void*) * CurArraySize);
memset(CurArray, -1, CurArraySize*sizeof(void*));
}
@@ -100,9 +98,7 @@ void SmallPtrSetImplBase::Grow(unsigned NewSize) {
bool WasSmall = isSmall();
// Install the new array. Clear all the buckets to empty.
- const void **NewBuckets = (const void**) malloc(sizeof(void*) * NewSize);
- if (NewBuckets == nullptr)
- report_bad_alloc_error("Allocation of SmallPtrSet bucket array failed.");
+ const void **NewBuckets = (const void**) safe_malloc(sizeof(void*) * NewSize);
// Reset member only if memory was allocated successfully
CurArray = NewBuckets;
@@ -132,9 +128,7 @@ SmallPtrSetImplBase::SmallPtrSetImplBase(const void **SmallStorage,
CurArray = SmallArray;
// Otherwise, allocate new heap space (unless we were the same size)
} else {
- CurArray = (const void**)malloc(sizeof(void*) * that.CurArraySize);
- if (CurArray == nullptr)
- report_bad_alloc_error("Allocation of SmallPtrSet bucket array failed.");
+ CurArray = (const void**)safe_malloc(sizeof(void*) * that.CurArraySize);
}
// Copy over the that array.
@@ -163,16 +157,12 @@ void SmallPtrSetImplBase::CopyFrom(const SmallPtrSetImplBase &RHS) {
// Otherwise, allocate new heap space (unless we were the same size)
} else if (CurArraySize != RHS.CurArraySize) {
if (isSmall())
- CurArray = (const void**)malloc(sizeof(void*) * RHS.CurArraySize);
+ CurArray = (const void**)safe_malloc(sizeof(void*) * RHS.CurArraySize);
else {
- const void **T = (const void**)realloc(CurArray,
+ const void **T = (const void**)safe_realloc(CurArray,
sizeof(void*) * RHS.CurArraySize);
- if (!T)
- free(CurArray);
CurArray = T;
}
- if (CurArray == nullptr)
- report_bad_alloc_error("Allocation of SmallPtrSet bucket array failed.");
}
CopyHelper(RHS);
diff --git a/contrib/llvm/lib/Support/SmallVector.cpp b/contrib/llvm/lib/Support/SmallVector.cpp
index 74313151c762..1070c6672edc 100644
--- a/contrib/llvm/lib/Support/SmallVector.cpp
+++ b/contrib/llvm/lib/Support/SmallVector.cpp
@@ -14,31 +14,53 @@
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
+// Check that no bytes are wasted and everything is well-aligned.
+namespace {
+struct Struct16B {
+ alignas(16) void *X;
+};
+struct Struct32B {
+ alignas(32) void *X;
+};
+}
+static_assert(sizeof(SmallVector<void *, 0>) ==
+ sizeof(unsigned) * 2 + sizeof(void *),
+ "wasted space in SmallVector size 0");
+static_assert(alignof(SmallVector<Struct16B, 0>) >= alignof(Struct16B),
+ "wrong alignment for 16-byte aligned T");
+static_assert(alignof(SmallVector<Struct32B, 0>) >= alignof(Struct32B),
+ "wrong alignment for 32-byte aligned T");
+static_assert(sizeof(SmallVector<Struct16B, 0>) >= alignof(Struct16B),
+ "missing padding for 16-byte aligned T");
+static_assert(sizeof(SmallVector<Struct32B, 0>) >= alignof(Struct32B),
+ "missing padding for 32-byte aligned T");
+static_assert(sizeof(SmallVector<void *, 1>) ==
+ sizeof(unsigned) * 2 + sizeof(void *) * 2,
+ "wasted space in SmallVector size 1");
+
/// grow_pod - This is an implementation of the grow() method which only works
/// on POD-like datatypes and is out of line to reduce code duplication.
-void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes,
+void SmallVectorBase::grow_pod(void *FirstEl, size_t MinCapacity,
size_t TSize) {
- size_t CurSizeBytes = size_in_bytes();
- size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow.
- if (NewCapacityInBytes < MinSizeInBytes)
- NewCapacityInBytes = MinSizeInBytes;
+ // Ensure we can fit the new capacity in 32 bits.
+ if (MinCapacity > UINT32_MAX)
+ report_bad_alloc_error("SmallVector capacity overflow during allocation");
+
+ size_t NewCapacity = 2 * capacity() + 1; // Always grow.
+ NewCapacity =
+ std::min(std::max(NewCapacity, MinCapacity), size_t(UINT32_MAX));
void *NewElts;
if (BeginX == FirstEl) {
- NewElts = malloc(NewCapacityInBytes);
- if (NewElts == nullptr)
- report_bad_alloc_error("Allocation of SmallVector element failed.");
+ NewElts = safe_malloc(NewCapacity * TSize);
// Copy the elements over. No need to run dtors on PODs.
- memcpy(NewElts, this->BeginX, CurSizeBytes);
+ memcpy(NewElts, this->BeginX, size() * TSize);
} else {
// If this wasn't grown from the inline copy, grow the allocated space.
- NewElts = realloc(this->BeginX, NewCapacityInBytes);
- if (NewElts == nullptr)
- report_bad_alloc_error("Reallocation of SmallVector element failed.");
+ NewElts = safe_realloc(this->BeginX, NewCapacity * TSize);
}
- this->EndX = (char*)NewElts+CurSizeBytes;
this->BeginX = NewElts;
- this->CapacityX = (char*)this->BeginX + NewCapacityInBytes;
+ this->Capacity = NewCapacity;
}
diff --git a/contrib/llvm/lib/Support/SourceMgr.cpp b/contrib/llvm/lib/Support/SourceMgr.cpp
index a8f6208a558c..bc15fd4e4014 100644
--- a/contrib/llvm/lib/Support/SourceMgr.cpp
+++ b/contrib/llvm/lib/Support/SourceMgr.cpp
@@ -28,6 +28,7 @@
#include <algorithm>
#include <cassert>
#include <cstddef>
+#include <limits>
#include <memory>
#include <string>
#include <utility>
@@ -36,24 +37,6 @@ using namespace llvm;
static const size_t TabStop = 8;
-namespace {
-
- struct LineNoCacheTy {
- const char *LastQuery;
- unsigned LastQueryBufferID;
- unsigned LineNoOfQuery;
- };
-
-} // end anonymous namespace
-
-static LineNoCacheTy *getCache(void *Ptr) {
- return (LineNoCacheTy*)Ptr;
-}
-
-SourceMgr::~SourceMgr() {
- delete getCache(LineNoCache);
-}
-
unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
SMLoc IncludeLoc,
std::string &IncludedFile) {
@@ -85,46 +68,85 @@ unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
return 0;
}
-std::pair<unsigned, unsigned>
-SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const {
- if (!BufferID)
- BufferID = FindBufferContainingLoc(Loc);
- assert(BufferID && "Invalid Location!");
+template <typename T>
+unsigned SourceMgr::SrcBuffer::getLineNumber(const char *Ptr) const {
+
+ // Ensure OffsetCache is allocated and populated with offsets of all the
+ // '\n' bytes.
+ std::vector<T> *Offsets = nullptr;
+ if (OffsetCache.isNull()) {
+ Offsets = new std::vector<T>();
+ OffsetCache = Offsets;
+ size_t Sz = Buffer->getBufferSize();
+ assert(Sz <= std::numeric_limits<T>::max());
+ StringRef S = Buffer->getBuffer();
+ for (size_t N = 0; N < Sz; ++N) {
+ if (S[N] == '\n') {
+ Offsets->push_back(static_cast<T>(N));
+ }
+ }
+ } else {
+ Offsets = OffsetCache.get<std::vector<T> *>();
+ }
- const MemoryBuffer *Buff = getMemoryBuffer(BufferID);
+ const char *BufStart = Buffer->getBufferStart();
+ assert(Ptr >= BufStart && Ptr <= Buffer->getBufferEnd());
+ ptrdiff_t PtrDiff = Ptr - BufStart;
+ assert(PtrDiff >= 0 && static_cast<size_t>(PtrDiff) <= std::numeric_limits<T>::max());
+ T PtrOffset = static_cast<T>(PtrDiff);
- // Count the number of \n's between the start of the file and the specified
- // location.
- unsigned LineNo = 1;
+ // std::lower_bound returns the first EOL offset that's not-less-than
+ // PtrOffset, meaning the EOL that _ends the line_ that PtrOffset is on
+ // (including if PtrOffset refers to the EOL itself). If there's no such
+ // EOL, returns end().
+ auto EOL = std::lower_bound(Offsets->begin(), Offsets->end(), PtrOffset);
- const char *BufStart = Buff->getBufferStart();
- const char *Ptr = BufStart;
+ // Lines count from 1, so add 1 to the distance from the 0th line.
+ return (1 + (EOL - Offsets->begin()));
+}
- // If we have a line number cache, and if the query is to a later point in the
- // same file, start searching from the last query location. This optimizes
- // for the case when multiple diagnostics come out of one file in order.
- if (LineNoCacheTy *Cache = getCache(LineNoCache))
- if (Cache->LastQueryBufferID == BufferID &&
- Cache->LastQuery <= Loc.getPointer()) {
- Ptr = Cache->LastQuery;
- LineNo = Cache->LineNoOfQuery;
- }
+SourceMgr::SrcBuffer::SrcBuffer(SourceMgr::SrcBuffer &&Other)
+ : Buffer(std::move(Other.Buffer)),
+ OffsetCache(Other.OffsetCache),
+ IncludeLoc(Other.IncludeLoc) {
+ Other.OffsetCache = nullptr;
+}
- // Scan for the location being queried, keeping track of the number of lines
- // we see.
- for (; SMLoc::getFromPointer(Ptr) != Loc; ++Ptr)
- if (*Ptr == '\n') ++LineNo;
+SourceMgr::SrcBuffer::~SrcBuffer() {
+ if (!OffsetCache.isNull()) {
+ if (OffsetCache.is<std::vector<uint8_t>*>())
+ delete OffsetCache.get<std::vector<uint8_t>*>();
+ else if (OffsetCache.is<std::vector<uint16_t>*>())
+ delete OffsetCache.get<std::vector<uint16_t>*>();
+ else if (OffsetCache.is<std::vector<uint32_t>*>())
+ delete OffsetCache.get<std::vector<uint32_t>*>();
+ else
+ delete OffsetCache.get<std::vector<uint64_t>*>();
+ OffsetCache = nullptr;
+ }
+}
- // Allocate the line number cache if it doesn't exist.
- if (!LineNoCache)
- LineNoCache = new LineNoCacheTy();
+std::pair<unsigned, unsigned>
+SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const {
+ if (!BufferID)
+ BufferID = FindBufferContainingLoc(Loc);
+ assert(BufferID && "Invalid Location!");
- // Update the line # cache.
- LineNoCacheTy &Cache = *getCache(LineNoCache);
- Cache.LastQueryBufferID = BufferID;
- Cache.LastQuery = Ptr;
- Cache.LineNoOfQuery = LineNo;
-
+ auto &SB = getBufferInfo(BufferID);
+ const char *Ptr = Loc.getPointer();
+
+ size_t Sz = SB.Buffer->getBufferSize();
+ unsigned LineNo;
+ if (Sz <= std::numeric_limits<uint8_t>::max())
+ LineNo = SB.getLineNumber<uint8_t>(Ptr);
+ else if (Sz <= std::numeric_limits<uint16_t>::max())
+ LineNo = SB.getLineNumber<uint16_t>(Ptr);
+ else if (Sz <= std::numeric_limits<uint32_t>::max())
+ LineNo = SB.getLineNumber<uint32_t>(Ptr);
+ else
+ LineNo = SB.getLineNumber<uint64_t>(Ptr);
+
+ const char *BufStart = SB.Buffer->getBufferStart();
size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r");
if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0;
return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs);
@@ -247,7 +269,7 @@ SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
: SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind),
Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()),
FixIts(Hints.begin(), Hints.end()) {
- std::sort(FixIts.begin(), FixIts.end());
+ llvm::sort(FixIts.begin(), FixIts.end());
}
static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
diff --git a/contrib/llvm/lib/Support/Statistic.cpp b/contrib/llvm/lib/Support/Statistic.cpp
index 544ae2d0983c..d57300a75d1d 100644
--- a/contrib/llvm/lib/Support/Statistic.cpp
+++ b/contrib/llvm/lib/Support/Statistic.cpp
@@ -52,11 +52,14 @@ static bool Enabled;
static bool PrintOnExit;
namespace {
-/// StatisticInfo - This class is used in a ManagedStatic so that it is created
-/// on demand (when the first statistic is bumped) and destroyed only when
-/// llvm_shutdown is called. We print statistics from the destructor.
+/// This class is used in a ManagedStatic so that it is created on demand (when
+/// the first statistic is bumped) and destroyed only when llvm_shutdown is
+/// called. We print statistics from the destructor.
+/// This class is also used to look up statistic values from applications that
+/// use LLVM.
class StatisticInfo {
- std::vector<const Statistic*> Stats;
+ std::vector<Statistic*> Stats;
+
friend void llvm::PrintStatistics();
friend void llvm::PrintStatistics(raw_ostream &OS);
friend void llvm::PrintStatisticsJSON(raw_ostream &OS);
@@ -64,14 +67,24 @@ class StatisticInfo {
/// Sort statistics by debugtype,name,description.
void sort();
public:
+ using const_iterator = std::vector<Statistic *>::const_iterator;
+
StatisticInfo();
~StatisticInfo();
- void addStatistic(const Statistic *S) {
+ void addStatistic(Statistic *S) {
Stats.push_back(S);
}
+
+ const_iterator begin() const { return Stats.begin(); }
+ const_iterator end() const { return Stats.end(); }
+ iterator_range<const_iterator> statistics() const {
+ return {begin(), end()};
+ }
+
+ void reset();
};
-}
+} // end anonymous namespace
static ManagedStatic<StatisticInfo> StatInfo;
static ManagedStatic<sys::SmartMutex<true> > StatLock;
@@ -81,17 +94,24 @@ static ManagedStatic<sys::SmartMutex<true> > StatLock;
void Statistic::RegisterStatistic() {
// If stats are enabled, inform StatInfo that this statistic should be
// printed.
- sys::SmartScopedLock<true> Writer(*StatLock);
- if (!Initialized) {
+ // llvm_shutdown calls destructors while holding the ManagedStatic mutex.
+ // These destructors end up calling PrintStatistics, which takes StatLock.
+ // Since dereferencing StatInfo and StatLock can require taking the
+ // ManagedStatic mutex, doing so with StatLock held would lead to a lock
+ // order inversion. To avoid that, we dereference the ManagedStatics first,
+ // and only take StatLock afterwards.
+ if (!Initialized.load(std::memory_order_relaxed)) {
+ sys::SmartMutex<true> &Lock = *StatLock;
+ StatisticInfo &SI = *StatInfo;
+ sys::SmartScopedLock<true> Writer(Lock);
+ // Check Initialized again after acquiring the lock.
+ if (Initialized.load(std::memory_order_relaxed))
+ return;
if (Stats || Enabled)
- StatInfo->addStatistic(this);
+ SI.addStatistic(this);
- TsanHappensBefore(this);
- sys::MemoryFence();
// Remember we have been registered.
- TsanIgnoreWritesBegin();
- Initialized = true;
- TsanIgnoreWritesEnd();
+ Initialized.store(true, std::memory_order_release);
}
}
@@ -128,6 +148,28 @@ void StatisticInfo::sort() {
});
}
+void StatisticInfo::reset() {
+ sys::SmartScopedLock<true> Writer(*StatLock);
+
+ // Tell each statistic that it isn't registered so it has to register
+ // again. We're holding the lock so it won't be able to do so until we're
+ // finished. Once we've forced it to re-register (after we return), then zero
+ // the value.
+ for (auto *Stat : Stats) {
+ // Value updates to a statistic that complete before this statement in the
+ // iteration for that statistic will be lost as intended.
+ Stat->Initialized = false;
+ Stat->Value = 0;
+ }
+
+ // Clear the registration list and release the lock once we're done. Any
+ // pending updates from other threads will safely take effect after we return.
+ // That might not be what the user wants if they're measuring a compilation
+ // but it's their responsibility to prevent concurrent compilations to make
+ // a single compilation measurable.
+ Stats.clear();
+}
+
void llvm::PrintStatistics(raw_ostream &OS) {
StatisticInfo &Stats = *StatInfo;
@@ -159,6 +201,7 @@ void llvm::PrintStatistics(raw_ostream &OS) {
}
void llvm::PrintStatisticsJSON(raw_ostream &OS) {
+ sys::SmartScopedLock<true> Reader(*StatLock);
StatisticInfo &Stats = *StatInfo;
Stats.sort();
@@ -184,7 +227,8 @@ void llvm::PrintStatisticsJSON(raw_ostream &OS) {
}
void llvm::PrintStatistics() {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
+#if LLVM_ENABLE_STATS
+ sys::SmartScopedLock<true> Reader(*StatLock);
StatisticInfo &Stats = *StatInfo;
// Statistics not enabled?
@@ -209,3 +253,16 @@ void llvm::PrintStatistics() {
}
#endif
}
+
+const std::vector<std::pair<StringRef, unsigned>> llvm::GetStatistics() {
+ sys::SmartScopedLock<true> Reader(*StatLock);
+ std::vector<std::pair<StringRef, unsigned>> ReturnStats;
+
+ for (const auto &Stat : StatInfo->statistics())
+ ReturnStats.emplace_back(Stat->getName(), Stat->getValue());
+ return ReturnStats;
+}
+
+void llvm::ResetStatistics() {
+ StatInfo->reset();
+}
diff --git a/contrib/llvm/lib/Support/StringExtras.cpp b/contrib/llvm/lib/Support/StringExtras.cpp
index 21157a14086d..386d74a47983 100644
--- a/contrib/llvm/lib/Support/StringExtras.cpp
+++ b/contrib/llvm/lib/Support/StringExtras.cpp
@@ -58,6 +58,33 @@ void llvm::SplitString(StringRef Source,
}
}
+void llvm::printEscapedString(StringRef Name, raw_ostream &Out) {
+ for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+ unsigned char C = Name[i];
+ if (isPrint(C) && C != '\\' && C != '"')
+ Out << C;
+ else
+ Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+ }
+}
+
+void llvm::printHTMLEscaped(StringRef String, raw_ostream &Out) {
+ for (char C : String) {
+ if (C == '&')
+ Out << "&amp;";
+ else if (C == '<')
+ Out << "&lt;";
+ else if (C == '>')
+ Out << "&gt;";
+ else if (C == '\"')
+ Out << "&quot;";
+ else if (C == '\'')
+ Out << "&apos;";
+ else
+ Out << C;
+ }
+}
+
void llvm::printLowerCase(StringRef String, raw_ostream &Out) {
for (const char C : String)
Out << toLower(C);
diff --git a/contrib/llvm/lib/Support/StringMap.cpp b/contrib/llvm/lib/Support/StringMap.cpp
index 4341da2d97bd..c1f707ce50a5 100644
--- a/contrib/llvm/lib/Support/StringMap.cpp
+++ b/contrib/llvm/lib/Support/StringMap.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DJB.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
@@ -32,7 +33,7 @@ static unsigned getMinBucketToReserveForEntries(unsigned NumEntries) {
StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
ItemSize = itemSize;
-
+
// If a size is specified, initialize the table with that many buckets.
if (InitSize) {
// The table will grow when the number of entries reach 3/4 of the number of
@@ -41,7 +42,7 @@ StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
init(getMinBucketToReserveForEntries(InitSize));
return;
}
-
+
// Otherwise, initialize it with zero buckets to avoid the allocation.
TheTable = nullptr;
NumBuckets = 0;
@@ -56,13 +57,10 @@ void StringMapImpl::init(unsigned InitSize) {
unsigned NewNumBuckets = InitSize ? InitSize : 16;
NumItems = 0;
NumTombstones = 0;
-
- TheTable = (StringMapEntryBase **)calloc(NewNumBuckets+1,
- sizeof(StringMapEntryBase **) +
- sizeof(unsigned));
- if (TheTable == nullptr)
- report_bad_alloc_error("Allocation of StringMap table failed.");
+ TheTable = static_cast<StringMapEntryBase **>(
+ safe_calloc(NewNumBuckets+1,
+ sizeof(StringMapEntryBase **) + sizeof(unsigned)));
// Set the member only if TheTable was successfully allocated
NumBuckets = NewNumBuckets;
@@ -83,7 +81,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
init(16);
HTSize = NumBuckets;
}
- unsigned FullHashValue = HashString(Name);
+ unsigned FullHashValue = djbHash(Name, 0);
unsigned BucketNo = FullHashValue & (HTSize-1);
unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
@@ -99,11 +97,11 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
HashTable[FirstTombstone] = FullHashValue;
return FirstTombstone;
}
-
+
HashTable[BucketNo] = FullHashValue;
return BucketNo;
}
-
+
if (BucketItem == getTombstoneVal()) {
// Skip over tombstones. However, remember the first one we see.
if (FirstTombstone == -1) FirstTombstone = BucketNo;
@@ -112,7 +110,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
// case here is that we are only looking at the buckets (for item info
// being non-null and for the full hash value) not at the items. This
// is important for cache locality.
-
+
// Do the comparison like this because Name isn't necessarily
// null-terminated!
char *ItemStr = (char*)BucketItem+ItemSize;
@@ -121,10 +119,10 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
return BucketNo;
}
}
-
+
// Okay, we didn't find the item. Probe to the next bucket.
BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
-
+
// Use quadratic probing, it has fewer clumping artifacts than linear
// probing and has good cache behavior in the common case.
++ProbeAmt;
@@ -137,7 +135,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
int StringMapImpl::FindKey(StringRef Key) const {
unsigned HTSize = NumBuckets;
if (HTSize == 0) return -1; // Really empty table?
- unsigned FullHashValue = HashString(Key);
+ unsigned FullHashValue = djbHash(Key, 0);
unsigned BucketNo = FullHashValue & (HTSize-1);
unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
@@ -147,7 +145,7 @@ int StringMapImpl::FindKey(StringRef Key) const {
// If we found an empty bucket, this key isn't in the table yet, return.
if (LLVM_LIKELY(!BucketItem))
return -1;
-
+
if (BucketItem == getTombstoneVal()) {
// Ignore tombstones.
} else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
@@ -155,7 +153,7 @@ int StringMapImpl::FindKey(StringRef Key) const {
// case here is that we are only looking at the buckets (for item info
// being non-null and for the full hash value) not at the items. This
// is important for cache locality.
-
+
// Do the comparison like this because NameStart isn't necessarily
// null-terminated!
char *ItemStr = (char*)BucketItem+ItemSize;
@@ -164,10 +162,10 @@ int StringMapImpl::FindKey(StringRef Key) const {
return BucketNo;
}
}
-
+
// Okay, we didn't find the item. Probe to the next bucket.
BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
-
+
// Use quadratic probing, it has fewer clumping artifacts than linear
// probing and has good cache behavior in the common case.
++ProbeAmt;
@@ -188,7 +186,7 @@ void StringMapImpl::RemoveKey(StringMapEntryBase *V) {
StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) {
int Bucket = FindKey(Key);
if (Bucket == -1) return nullptr;
-
+
StringMapEntryBase *Result = TheTable[Bucket];
TheTable[Bucket] = getTombstoneVal();
--NumItems;
@@ -219,12 +217,8 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
unsigned NewBucketNo = BucketNo;
// Allocate one extra bucket which will always be non-empty. This allows the
// iterators to stop at end.
- StringMapEntryBase **NewTableArray =
- (StringMapEntryBase **)calloc(NewSize+1, sizeof(StringMapEntryBase *) +
- sizeof(unsigned));
-
- if (NewTableArray == nullptr)
- report_bad_alloc_error("Allocation of StringMap hash table failed.");
+ auto NewTableArray = static_cast<StringMapEntryBase **>(
+ safe_calloc(NewSize+1, sizeof(StringMapEntryBase *) + sizeof(unsigned)));
unsigned *NewHashArray = (unsigned *)(NewTableArray + NewSize + 1);
NewTableArray[NewSize] = (StringMapEntryBase*)2;
@@ -244,13 +238,13 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
NewBucketNo = NewBucket;
continue;
}
-
+
// Otherwise probe for a spot.
unsigned ProbeSize = 1;
do {
NewBucket = (NewBucket + ProbeSize++) & (NewSize-1);
} while (NewTableArray[NewBucket]);
-
+
// Finally found a slot. Fill it in.
NewTableArray[NewBucket] = Bucket;
NewHashArray[NewBucket] = FullHash;
@@ -258,9 +252,9 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
NewBucketNo = NewBucket;
}
}
-
+
free(TheTable);
-
+
TheTable = NewTableArray;
NumBuckets = NewSize;
NumTombstones = 0;
diff --git a/contrib/llvm/lib/Support/StringSaver.cpp b/contrib/llvm/lib/Support/StringSaver.cpp
index 335fce3a7bbd..1ded2bdb09de 100644
--- a/contrib/llvm/lib/Support/StringSaver.cpp
+++ b/contrib/llvm/lib/Support/StringSaver.cpp
@@ -17,3 +17,10 @@ StringRef StringSaver::save(StringRef S) {
P[S.size()] = '\0';
return StringRef(P, S.size());
}
+
+StringRef UniqueStringSaver::save(StringRef S) {
+ auto R = Unique.insert(S);
+ if (R.second) // cache miss, need to actually save the string
+ *R.first = Strings.save(S); // safe replacement with equal value
+ return *R.first;
+}
diff --git a/contrib/llvm/lib/Support/TarWriter.cpp b/contrib/llvm/lib/Support/TarWriter.cpp
index abc46d076576..5b4d554befe4 100644
--- a/contrib/llvm/lib/Support/TarWriter.cpp
+++ b/contrib/llvm/lib/Support/TarWriter.cpp
@@ -159,8 +159,10 @@ static void writeUstarHeader(raw_fd_ostream &OS, StringRef Prefix,
// Creates a TarWriter instance and returns it.
Expected<std::unique_ptr<TarWriter>> TarWriter::create(StringRef OutputPath,
StringRef BaseDir) {
+ using namespace sys::fs;
int FD;
- if (std::error_code EC = openFileForWrite(OutputPath, FD, sys::fs::F_None))
+ if (std::error_code EC =
+ openFileForWrite(OutputPath, FD, CD_CreateAlways, OF_None))
return make_error<StringError>("cannot open " + OutputPath, EC);
return std::unique_ptr<TarWriter>(new TarWriter(FD, BaseDir));
}
diff --git a/contrib/llvm/lib/Support/TargetParser.cpp b/contrib/llvm/lib/Support/TargetParser.cpp
index b96ca084e9bf..2c167a4d086c 100644
--- a/contrib/llvm/lib/Support/TargetParser.cpp
+++ b/contrib/llvm/lib/Support/TargetParser.cpp
@@ -433,6 +433,17 @@ unsigned llvm::AArch64::getDefaultExtensions(StringRef CPU, ArchKind AK) {
.Default(AArch64::AEK_INVALID);
}
+AArch64::ArchKind llvm::AArch64::getCPUArchKind(StringRef CPU) {
+ if (CPU == "generic")
+ return AArch64::ArchKind::ARMV8A;
+
+ return StringSwitch<AArch64::ArchKind>(CPU)
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
+ .Case(NAME, AArch64::ArchKind:: ID)
+#include "llvm/Support/AArch64TargetParser.def"
+ .Default(AArch64::ArchKind::INVALID);
+}
+
bool llvm::AArch64::getExtensionFeatures(unsigned Extensions,
std::vector<StringRef> &Features) {
@@ -480,6 +491,8 @@ bool llvm::AArch64::getArchFeatures(AArch64::ArchKind AK,
Features.push_back("+v8.2a");
if (AK == AArch64::ArchKind::ARMV8_3A)
Features.push_back("+v8.3a");
+ if (AK == AArch64::ArchKind::ARMV8_4A)
+ Features.push_back("+v8.4a");
return AK != AArch64::ArchKind::INVALID;
}
@@ -581,10 +594,11 @@ static StringRef getArchSynonym(StringRef Arch) {
.Case("v7r", "v7-r")
.Case("v7m", "v7-m")
.Case("v7em", "v7e-m")
- .Cases("v8", "v8a", "aarch64", "arm64", "v8-a")
+ .Cases("v8", "v8a", "v8l", "aarch64", "arm64", "v8-a")
.Case("v8.1a", "v8.1-a")
.Case("v8.2a", "v8.2-a")
.Case("v8.3a", "v8.3-a")
+ .Case("v8.4a", "v8.4-a")
.Case("v8r", "v8-r")
.Case("v8m.base", "v8-m.base")
.Case("v8m.main", "v8-m.main")
@@ -689,6 +703,20 @@ ARM::ArchKind llvm::ARM::parseCPUArch(StringRef CPU) {
return ARM::ArchKind::INVALID;
}
+void llvm::ARM::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
+ for (const CpuNames<ARM::ArchKind> &Arch : CPUNames) {
+ if (Arch.ArchID != ARM::ArchKind::INVALID)
+ Values.push_back(Arch.getName());
+ }
+}
+
+void llvm::AArch64::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
+ for (const CpuNames<AArch64::ArchKind> &Arch : AArch64CPUNames) {
+ if (Arch.ArchID != AArch64::ArchKind::INVALID)
+ Values.push_back(Arch.getName());
+ }
+}
+
// ARM, Thumb, AArch64
ARM::ISAKind ARM::parseArchISA(StringRef Arch) {
return StringSwitch<ARM::ISAKind>(Arch)
@@ -738,6 +766,7 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
case ARM::ArchKind::ARMV8_1A:
case ARM::ArchKind::ARMV8_2A:
case ARM::ArchKind::ARMV8_3A:
+ case ARM::ArchKind::ARMV8_4A:
return ARM::ProfileKind::A;
case ARM::ArchKind::ARMV2:
case ARM::ArchKind::ARMV2A:
@@ -800,6 +829,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
case ARM::ArchKind::ARMV8_1A:
case ARM::ArchKind::ARMV8_2A:
case ARM::ArchKind::ARMV8_3A:
+ case ARM::ArchKind::ARMV8_4A:
case ARM::ArchKind::ARMV8R:
case ARM::ArchKind::ARMV8MBaseline:
case ARM::ArchKind::ARMV8MMainline:
@@ -868,10 +898,10 @@ AArch64::ArchKind AArch64::parseArch(StringRef Arch) {
return ArchKind::INVALID;
}
-unsigned llvm::AArch64::parseArchExt(StringRef ArchExt) {
+AArch64::ArchExtKind llvm::AArch64::parseArchExt(StringRef ArchExt) {
for (const auto A : AArch64ARCHExtNames) {
if (ArchExt == A.getName())
- return A.ID;
+ return static_cast<ArchExtKind>(A.ID);
}
return AArch64::AEK_INVALID;
}
@@ -903,3 +933,7 @@ ARM::ProfileKind AArch64::parseArchProfile(StringRef Arch) {
unsigned llvm::AArch64::parseArchVersion(StringRef Arch) {
return ARM::parseArchVersion(Arch);
}
+
+bool llvm::AArch64::isX18ReservedByDefault(const Triple &TT) {
+ return TT.isOSDarwin() || TT.isOSFuchsia() || TT.isOSWindows();
+}
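
The new getCPUArchKind above relies on the usual .def X-macro trick: each AARCH64_CPU_NAME entry expands to one StringSwitch case. A self-contained sketch of the resulting shape (the enum and CPU names here are illustrative, not the real table):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"

    enum class Arch { V8A, V8_1A, V8_2A, Invalid };

    // One .Case per CPU entry; the real list comes from AArch64TargetParser.def.
    Arch classifyCPU(llvm::StringRef CPU) {
      return llvm::StringSwitch<Arch>(CPU)
          .Case("cortex-a53", Arch::V8A)
          .Case("cortex-a75", Arch::V8_2A)
          .Case("thunderx2t99", Arch::V8_1A)
          .Default(Arch::Invalid);
    }
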
diff --git a/contrib/llvm/lib/Support/ThreadLocal.cpp b/contrib/llvm/lib/Support/ThreadLocal.cpp
index 9a75c02b351f..f6e4a652302c 100644
--- a/contrib/llvm/lib/Support/ThreadLocal.cpp
+++ b/contrib/llvm/lib/Support/ThreadLocal.cpp
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/ThreadLocal.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
//===----------------------------------------------------------------------===//
@@ -41,8 +41,8 @@ void ThreadLocalImpl::removeInstance() {
}
#elif defined(LLVM_ON_UNIX)
#include "Unix/ThreadLocal.inc"
-#elif defined( LLVM_ON_WIN32)
+#elif defined( _WIN32)
#include "Windows/ThreadLocal.inc"
#else
-#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 set in Support/ThreadLocal.cpp
+#warning Neither LLVM_ON_UNIX nor _WIN32 set in Support/ThreadLocal.cpp
#endif
diff --git a/contrib/llvm/lib/Support/Threading.cpp b/contrib/llvm/lib/Support/Threading.cpp
index 473c84808af1..fcb1030e1ab4 100644
--- a/contrib/llvm/lib/Support/Threading.cpp
+++ b/contrib/llvm/lib/Support/Threading.cpp
@@ -37,7 +37,7 @@ bool llvm::llvm_is_multithreaded() {
}
#if LLVM_ENABLE_THREADS == 0 || \
- (!defined(LLVM_ON_WIN32) && !defined(HAVE_PTHREAD_H))
+ (!defined(_WIN32) && !defined(HAVE_PTHREAD_H))
// Support for non-Win32, non-pthread implementation.
void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData,
unsigned RequestedStackSize) {
@@ -89,7 +89,7 @@ unsigned llvm::hardware_concurrency() {
#ifdef LLVM_ON_UNIX
#include "Unix/Threading.inc"
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/Threading.inc"
#endif
diff --git a/contrib/llvm/lib/Support/Timer.cpp b/contrib/llvm/lib/Support/Timer.cpp
index 0c85faecca84..61d3b6c6e319 100644
--- a/contrib/llvm/lib/Support/Timer.cpp
+++ b/contrib/llvm/lib/Support/Timer.cpp
@@ -22,6 +22,8 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"
+#include <limits>
+
using namespace llvm;
// This ugly hack is brought to you courtesy of constructor/destructor ordering
@@ -234,6 +236,15 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description)
TimerGroupList = this;
}
+TimerGroup::TimerGroup(StringRef Name, StringRef Description,
+ const StringMap<TimeRecord> &Records)
+ : TimerGroup(Name, Description) {
+ TimersToPrint.reserve(Records.size());
+ for (const auto &P : Records)
+ TimersToPrint.emplace_back(P.getValue(), P.getKey(), P.getKey());
+ assert(TimersToPrint.size() == Records.size() && "Size mismatch");
+}
+
TimerGroup::~TimerGroup() {
// If the timer group is destroyed before the timers it owns, accumulate and
// print the timing data.
@@ -284,7 +295,7 @@ void TimerGroup::addTimer(Timer &T) {
void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
// Sort the timers in descending order by amount of time taken.
- std::sort(TimersToPrint.begin(), TimersToPrint.end());
+ llvm::sort(TimersToPrint.begin(), TimersToPrint.end());
TimeRecord Total;
for (const PrintRecord &Record : TimersToPrint)
@@ -336,10 +347,14 @@ void TimerGroup::prepareToPrintList() {
// reset them.
for (Timer *T = FirstTimer; T; T = T->Next) {
if (!T->hasTriggered()) continue;
+ bool WasRunning = T->isRunning();
+ if (WasRunning)
+ T->stopTimer();
+
TimersToPrint.emplace_back(T->Time, T->Name, T->Description);
- // Clear out the time.
- T->clear();
+ if (WasRunning)
+ T->startTimer();
}
}
@@ -363,13 +378,17 @@ void TimerGroup::printAll(raw_ostream &OS) {
void TimerGroup::printJSONValue(raw_ostream &OS, const PrintRecord &R,
const char *suffix, double Value) {
assert(yaml::needsQuotes(Name) == yaml::QuotingType::None &&
- "TimerGroup name needs no quotes");
+ "TimerGroup name should not need quotes");
assert(yaml::needsQuotes(R.Name) == yaml::QuotingType::None &&
- "Timer name needs no quotes");
- OS << "\t\"time." << Name << '.' << R.Name << suffix << "\": " << Value;
+ "Timer name should not need quotes");
+ constexpr auto max_digits10 = std::numeric_limits<double>::max_digits10;
+ OS << "\t\"time." << Name << '.' << R.Name << suffix
+ << "\": " << format("%.*e", max_digits10 - 1, Value);
}
const char *TimerGroup::printJSONValues(raw_ostream &OS, const char *delim) {
+ sys::SmartScopedLock<true> L(*TimerLock);
+
prepareToPrintList();
for (const PrintRecord &R : TimersToPrint) {
OS << delim;
@@ -381,6 +400,10 @@ const char *TimerGroup::printJSONValues(raw_ostream &OS, const char *delim) {
printJSONValue(OS, R, ".user", T.getUserTime());
OS << delim;
printJSONValue(OS, R, ".sys", T.getSystemTime());
+ if (T.getMemUsed()) {
+ OS << delim;
+ printJSONValue(OS, R, ".mem", T.getMemUsed());
+ }
}
TimersToPrint.clear();
return delim;
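
The switch to max_digits10 digits in printJSONValue matters because consumers re-parse the emitted numbers. A standalone sketch (plain snprintf/strtod, assumed equivalent to the format() call above) of the round-trip property it relies on:

    #include <cassert>
    #include <cstdio>
    #include <cstdlib>
    #include <limits>

    int main() {
      double Value = 0.1 + 0.2;  // not exactly representable in binary
      constexpr int Prec = std::numeric_limits<double>::max_digits10 - 1;  // 16
      char Buf[64];
      std::snprintf(Buf, sizeof(Buf), "%.*e", Prec, Value);  // 17 significant digits
      assert(std::strtod(Buf, nullptr) == Value && "value survives the round trip");
      return 0;
    }
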
diff --git a/contrib/llvm/lib/Support/Triple.cpp b/contrib/llvm/lib/Support/Triple.cpp
index 4f0a30042b76..b14d6492b1ed 100644
--- a/contrib/llvm/lib/Support/Triple.cpp
+++ b/contrib/llvm/lib/Support/Triple.cpp
@@ -168,6 +168,7 @@ StringRef Triple::getVendorTypeName(VendorType Kind) {
case AMD: return "amd";
case Mesa: return "mesa";
case SUSE: return "suse";
+ case OpenEmbedded: return "oe";
}
llvm_unreachable("Invalid VendorType!");
@@ -232,9 +233,7 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
case MSVC: return "msvc";
case Itanium: return "itanium";
case Cygnus: return "cygnus";
- case AMDOpenCL: return "amdopencl";
case CoreCLR: return "coreclr";
- case OpenCL: return "opencl";
case Simulator: return "simulator";
}
@@ -384,7 +383,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
// FIXME: Do we need to support these?
.Cases("i786", "i886", "i986", Triple::x86)
.Cases("amd64", "x86_64", "x86_64h", Triple::x86_64)
- .Cases("powerpc", "ppc32", Triple::ppc)
+ .Cases("powerpc", "ppc", "ppc32", Triple::ppc)
.Cases("powerpc64", "ppu", "ppc64", Triple::ppc64)
.Cases("powerpc64le", "ppc64le", Triple::ppc64le)
.Case("xscale", Triple::arm)
@@ -465,6 +464,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("amd", Triple::AMD)
.Case("mesa", Triple::Mesa)
.Case("suse", Triple::SUSE)
+ .Case("oe", Triple::OpenEmbedded)
.Default(Triple::UnknownVendor);
}
@@ -523,9 +523,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
.StartsWith("msvc", Triple::MSVC)
.StartsWith("itanium", Triple::Itanium)
.StartsWith("cygnus", Triple::Cygnus)
- .StartsWith("amdopencl", Triple::AMDOpenCL)
.StartsWith("coreclr", Triple::CoreCLR)
- .StartsWith("opencl", Triple::OpenCL)
.StartsWith("simulator", Triple::Simulator)
.Default(Triple::UnknownEnvironment);
}
@@ -594,6 +592,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
return Triple::ARMSubArch_v8_2a;
case ARM::ArchKind::ARMV8_3A:
return Triple::ARMSubArch_v8_3a;
+ case ARM::ArchKind::ARMV8_4A:
+ return Triple::ARMSubArch_v8_4a;
case ARM::ArchKind::ARMV8R:
return Triple::ARMSubArch_v8r;
case ARM::ArchKind::ARMV8MBaseline:
@@ -670,8 +670,6 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
case Triple::tce:
case Triple::tcele:
case Triple::thumbeb:
- case Triple::wasm32:
- case Triple::wasm64:
case Triple::xcore:
return Triple::ELF;
@@ -680,11 +678,15 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
if (T.isOSDarwin())
return Triple::MachO;
return Triple::ELF;
+
+ case Triple::wasm32:
+ case Triple::wasm64:
+ return Triple::Wasm;
}
llvm_unreachable("unknown architecture");
}
-/// \brief Construct a triple from the string representation provided.
+/// Construct a triple from the string representation provided.
///
/// This stores the string representation and parses the various pieces into
/// enum members.
@@ -713,7 +715,7 @@ Triple::Triple(const Twine &Str)
ObjectFormat = getDefaultFormat(*this);
}
-/// \brief Construct a triple from string representations of the architecture,
+/// Construct a triple from string representations of the architecture,
/// vendor, and OS.
///
/// This joins each argument into a canonical string representation and parses
@@ -729,7 +731,7 @@ Triple::Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr)
ObjectFormat = getDefaultFormat(*this);
}
-/// \brief Construct a triple from string representations of the architecture,
+/// Construct a triple from string representations of the architecture,
/// vendor, OS, and environment.
///
/// This joins each argument into a canonical string representation and parses
diff --git a/contrib/llvm/lib/Support/Twine.cpp b/contrib/llvm/lib/Support/Twine.cpp
index d17cd4e66439..4726c8ab7494 100644
--- a/contrib/llvm/lib/Support/Twine.cpp
+++ b/contrib/llvm/lib/Support/Twine.cpp
@@ -9,6 +9,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/Support/UnicodeCaseFold.cpp b/contrib/llvm/lib/Support/UnicodeCaseFold.cpp
new file mode 100644
index 000000000000..b18d49dbafb0
--- /dev/null
+++ b/contrib/llvm/lib/Support/UnicodeCaseFold.cpp
@@ -0,0 +1,742 @@
+//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//
+//
+// This file was generated by utils/unicode-case-fold.py from the Unicode
+// case folding database at
+// http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt
+//
+// To regenerate this file, run:
+// utils/unicode-case-fold.py \
+// "http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt" \
+// > lib/Support/UnicodeCaseFold.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Unicode.h"
+
+int llvm::sys::unicode::foldCharSimple(int C) {
+ if (C < 0x0041)
+ return C;
+ // 26 characters
+ if (C <= 0x005a)
+ return C + 32;
+ // MICRO SIGN
+ if (C == 0x00b5)
+ return 0x03bc;
+ if (C < 0x00c0)
+ return C;
+ // 23 characters
+ if (C <= 0x00d6)
+ return C + 32;
+ if (C < 0x00d8)
+ return C;
+ // 7 characters
+ if (C <= 0x00de)
+ return C + 32;
+ if (C < 0x0100)
+ return C;
+ // 24 characters
+ if (C <= 0x012e)
+ return C | 1;
+ if (C < 0x0132)
+ return C;
+ // 3 characters
+ if (C <= 0x0136)
+ return C | 1;
+ if (C < 0x0139)
+ return C;
+ // 8 characters
+ if (C <= 0x0147 && C % 2 == 1)
+ return C + 1;
+ if (C < 0x014a)
+ return C;
+ // 23 characters
+ if (C <= 0x0176)
+ return C | 1;
+ // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ if (C == 0x0178)
+ return 0x00ff;
+ if (C < 0x0179)
+ return C;
+ // 3 characters
+ if (C <= 0x017d && C % 2 == 1)
+ return C + 1;
+ // LATIN SMALL LETTER LONG S
+ if (C == 0x017f)
+ return 0x0073;
+ // LATIN CAPITAL LETTER B WITH HOOK
+ if (C == 0x0181)
+ return 0x0253;
+ if (C < 0x0182)
+ return C;
+ // 2 characters
+ if (C <= 0x0184)
+ return C | 1;
+ // LATIN CAPITAL LETTER OPEN O
+ if (C == 0x0186)
+ return 0x0254;
+ // LATIN CAPITAL LETTER C WITH HOOK
+ if (C == 0x0187)
+ return 0x0188;
+ if (C < 0x0189)
+ return C;
+ // 2 characters
+ if (C <= 0x018a)
+ return C + 205;
+ // LATIN CAPITAL LETTER D WITH TOPBAR
+ if (C == 0x018b)
+ return 0x018c;
+ // LATIN CAPITAL LETTER REVERSED E
+ if (C == 0x018e)
+ return 0x01dd;
+ // LATIN CAPITAL LETTER SCHWA
+ if (C == 0x018f)
+ return 0x0259;
+ // LATIN CAPITAL LETTER OPEN E
+ if (C == 0x0190)
+ return 0x025b;
+ // LATIN CAPITAL LETTER F WITH HOOK
+ if (C == 0x0191)
+ return 0x0192;
+ // LATIN CAPITAL LETTER G WITH HOOK
+ if (C == 0x0193)
+ return 0x0260;
+ // LATIN CAPITAL LETTER GAMMA
+ if (C == 0x0194)
+ return 0x0263;
+ // LATIN CAPITAL LETTER IOTA
+ if (C == 0x0196)
+ return 0x0269;
+ // LATIN CAPITAL LETTER I WITH STROKE
+ if (C == 0x0197)
+ return 0x0268;
+ // LATIN CAPITAL LETTER K WITH HOOK
+ if (C == 0x0198)
+ return 0x0199;
+ // LATIN CAPITAL LETTER TURNED M
+ if (C == 0x019c)
+ return 0x026f;
+ // LATIN CAPITAL LETTER N WITH LEFT HOOK
+ if (C == 0x019d)
+ return 0x0272;
+ // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+ if (C == 0x019f)
+ return 0x0275;
+ if (C < 0x01a0)
+ return C;
+ // 3 characters
+ if (C <= 0x01a4)
+ return C | 1;
+ // LATIN LETTER YR
+ if (C == 0x01a6)
+ return 0x0280;
+ // LATIN CAPITAL LETTER TONE TWO
+ if (C == 0x01a7)
+ return 0x01a8;
+ // LATIN CAPITAL LETTER ESH
+ if (C == 0x01a9)
+ return 0x0283;
+ // LATIN CAPITAL LETTER T WITH HOOK
+ if (C == 0x01ac)
+ return 0x01ad;
+ // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+ if (C == 0x01ae)
+ return 0x0288;
+ // LATIN CAPITAL LETTER U WITH HORN
+ if (C == 0x01af)
+ return 0x01b0;
+ if (C < 0x01b1)
+ return C;
+ // 2 characters
+ if (C <= 0x01b2)
+ return C + 217;
+ if (C < 0x01b3)
+ return C;
+ // 2 characters
+ if (C <= 0x01b5 && C % 2 == 1)
+ return C + 1;
+ // LATIN CAPITAL LETTER EZH
+ if (C == 0x01b7)
+ return 0x0292;
+ if (C < 0x01b8)
+ return C;
+ // 2 characters
+ if (C <= 0x01bc && C % 4 == 0)
+ return C + 1;
+ // LATIN CAPITAL LETTER DZ WITH CARON
+ if (C == 0x01c4)
+ return 0x01c6;
+ // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
+ if (C == 0x01c5)
+ return 0x01c6;
+ // LATIN CAPITAL LETTER LJ
+ if (C == 0x01c7)
+ return 0x01c9;
+ // LATIN CAPITAL LETTER L WITH SMALL LETTER J
+ if (C == 0x01c8)
+ return 0x01c9;
+ // LATIN CAPITAL LETTER NJ
+ if (C == 0x01ca)
+ return 0x01cc;
+ if (C < 0x01cb)
+ return C;
+ // 9 characters
+ if (C <= 0x01db && C % 2 == 1)
+ return C + 1;
+ if (C < 0x01de)
+ return C;
+ // 9 characters
+ if (C <= 0x01ee)
+ return C | 1;
+ // LATIN CAPITAL LETTER DZ
+ if (C == 0x01f1)
+ return 0x01f3;
+ if (C < 0x01f2)
+ return C;
+ // 2 characters
+ if (C <= 0x01f4)
+ return C | 1;
+ // LATIN CAPITAL LETTER HWAIR
+ if (C == 0x01f6)
+ return 0x0195;
+ // LATIN CAPITAL LETTER WYNN
+ if (C == 0x01f7)
+ return 0x01bf;
+ if (C < 0x01f8)
+ return C;
+ // 20 characters
+ if (C <= 0x021e)
+ return C | 1;
+ // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
+ if (C == 0x0220)
+ return 0x019e;
+ if (C < 0x0222)
+ return C;
+ // 9 characters
+ if (C <= 0x0232)
+ return C | 1;
+ // LATIN CAPITAL LETTER A WITH STROKE
+ if (C == 0x023a)
+ return 0x2c65;
+ // LATIN CAPITAL LETTER C WITH STROKE
+ if (C == 0x023b)
+ return 0x023c;
+ // LATIN CAPITAL LETTER L WITH BAR
+ if (C == 0x023d)
+ return 0x019a;
+ // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
+ if (C == 0x023e)
+ return 0x2c66;
+ // LATIN CAPITAL LETTER GLOTTAL STOP
+ if (C == 0x0241)
+ return 0x0242;
+ // LATIN CAPITAL LETTER B WITH STROKE
+ if (C == 0x0243)
+ return 0x0180;
+ // LATIN CAPITAL LETTER U BAR
+ if (C == 0x0244)
+ return 0x0289;
+ // LATIN CAPITAL LETTER TURNED V
+ if (C == 0x0245)
+ return 0x028c;
+ if (C < 0x0246)
+ return C;
+ // 5 characters
+ if (C <= 0x024e)
+ return C | 1;
+ // COMBINING GREEK YPOGEGRAMMENI
+ if (C == 0x0345)
+ return 0x03b9;
+ if (C < 0x0370)
+ return C;
+ // 2 characters
+ if (C <= 0x0372)
+ return C | 1;
+ // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+ if (C == 0x0376)
+ return 0x0377;
+ // GREEK CAPITAL LETTER YOT
+ if (C == 0x037f)
+ return 0x03f3;
+ // GREEK CAPITAL LETTER ALPHA WITH TONOS
+ if (C == 0x0386)
+ return 0x03ac;
+ if (C < 0x0388)
+ return C;
+ // 3 characters
+ if (C <= 0x038a)
+ return C + 37;
+ // GREEK CAPITAL LETTER OMICRON WITH TONOS
+ if (C == 0x038c)
+ return 0x03cc;
+ if (C < 0x038e)
+ return C;
+ // 2 characters
+ if (C <= 0x038f)
+ return C + 63;
+ if (C < 0x0391)
+ return C;
+ // 17 characters
+ if (C <= 0x03a1)
+ return C + 32;
+ if (C < 0x03a3)
+ return C;
+ // 9 characters
+ if (C <= 0x03ab)
+ return C + 32;
+ // GREEK SMALL LETTER FINAL SIGMA
+ if (C == 0x03c2)
+ return 0x03c3;
+ // GREEK CAPITAL KAI SYMBOL
+ if (C == 0x03cf)
+ return 0x03d7;
+ // GREEK BETA SYMBOL
+ if (C == 0x03d0)
+ return 0x03b2;
+ // GREEK THETA SYMBOL
+ if (C == 0x03d1)
+ return 0x03b8;
+ // GREEK PHI SYMBOL
+ if (C == 0x03d5)
+ return 0x03c6;
+ // GREEK PI SYMBOL
+ if (C == 0x03d6)
+ return 0x03c0;
+ if (C < 0x03d8)
+ return C;
+ // 12 characters
+ if (C <= 0x03ee)
+ return C | 1;
+ // GREEK KAPPA SYMBOL
+ if (C == 0x03f0)
+ return 0x03ba;
+ // GREEK RHO SYMBOL
+ if (C == 0x03f1)
+ return 0x03c1;
+ // GREEK CAPITAL THETA SYMBOL
+ if (C == 0x03f4)
+ return 0x03b8;
+ // GREEK LUNATE EPSILON SYMBOL
+ if (C == 0x03f5)
+ return 0x03b5;
+ // GREEK CAPITAL LETTER SHO
+ if (C == 0x03f7)
+ return 0x03f8;
+ // GREEK CAPITAL LUNATE SIGMA SYMBOL
+ if (C == 0x03f9)
+ return 0x03f2;
+ // GREEK CAPITAL LETTER SAN
+ if (C == 0x03fa)
+ return 0x03fb;
+ if (C < 0x03fd)
+ return C;
+ // 3 characters
+ if (C <= 0x03ff)
+ return C + -130;
+ if (C < 0x0400)
+ return C;
+ // 16 characters
+ if (C <= 0x040f)
+ return C + 80;
+ if (C < 0x0410)
+ return C;
+ // 32 characters
+ if (C <= 0x042f)
+ return C + 32;
+ if (C < 0x0460)
+ return C;
+ // 17 characters
+ if (C <= 0x0480)
+ return C | 1;
+ if (C < 0x048a)
+ return C;
+ // 27 characters
+ if (C <= 0x04be)
+ return C | 1;
+ // CYRILLIC LETTER PALOCHKA
+ if (C == 0x04c0)
+ return 0x04cf;
+ if (C < 0x04c1)
+ return C;
+ // 7 characters
+ if (C <= 0x04cd && C % 2 == 1)
+ return C + 1;
+ if (C < 0x04d0)
+ return C;
+ // 48 characters
+ if (C <= 0x052e)
+ return C | 1;
+ if (C < 0x0531)
+ return C;
+ // 38 characters
+ if (C <= 0x0556)
+ return C + 48;
+ if (C < 0x10a0)
+ return C;
+ // 38 characters
+ if (C <= 0x10c5)
+ return C + 7264;
+ if (C < 0x10c7)
+ return C;
+ // 2 characters
+ if (C <= 0x10cd && C % 6 == 5)
+ return C + 7264;
+ if (C < 0x13f8)
+ return C;
+ // 6 characters
+ if (C <= 0x13fd)
+ return C + -8;
+ // CYRILLIC SMALL LETTER ROUNDED VE
+ if (C == 0x1c80)
+ return 0x0432;
+ // CYRILLIC SMALL LETTER LONG-LEGGED DE
+ if (C == 0x1c81)
+ return 0x0434;
+ // CYRILLIC SMALL LETTER NARROW O
+ if (C == 0x1c82)
+ return 0x043e;
+ if (C < 0x1c83)
+ return C;
+ // 2 characters
+ if (C <= 0x1c84)
+ return C + -6210;
+ // CYRILLIC SMALL LETTER THREE-LEGGED TE
+ if (C == 0x1c85)
+ return 0x0442;
+ // CYRILLIC SMALL LETTER TALL HARD SIGN
+ if (C == 0x1c86)
+ return 0x044a;
+ // CYRILLIC SMALL LETTER TALL YAT
+ if (C == 0x1c87)
+ return 0x0463;
+ // CYRILLIC SMALL LETTER UNBLENDED UK
+ if (C == 0x1c88)
+ return 0xa64b;
+ if (C < 0x1e00)
+ return C;
+ // 75 characters
+ if (C <= 0x1e94)
+ return C | 1;
+ // LATIN SMALL LETTER LONG S WITH DOT ABOVE
+ if (C == 0x1e9b)
+ return 0x1e61;
+ // LATIN CAPITAL LETTER SHARP S
+ if (C == 0x1e9e)
+ return 0x00df;
+ if (C < 0x1ea0)
+ return C;
+ // 48 characters
+ if (C <= 0x1efe)
+ return C | 1;
+ if (C < 0x1f08)
+ return C;
+ // 8 characters
+ if (C <= 0x1f0f)
+ return C + -8;
+ if (C < 0x1f18)
+ return C;
+ // 6 characters
+ if (C <= 0x1f1d)
+ return C + -8;
+ if (C < 0x1f28)
+ return C;
+ // 8 characters
+ if (C <= 0x1f2f)
+ return C + -8;
+ if (C < 0x1f38)
+ return C;
+ // 8 characters
+ if (C <= 0x1f3f)
+ return C + -8;
+ if (C < 0x1f48)
+ return C;
+ // 6 characters
+ if (C <= 0x1f4d)
+ return C + -8;
+ if (C < 0x1f59)
+ return C;
+ // 4 characters
+ if (C <= 0x1f5f && C % 2 == 1)
+ return C + -8;
+ if (C < 0x1f68)
+ return C;
+ // 8 characters
+ if (C <= 0x1f6f)
+ return C + -8;
+ if (C < 0x1f88)
+ return C;
+ // 8 characters
+ if (C <= 0x1f8f)
+ return C + -8;
+ if (C < 0x1f98)
+ return C;
+ // 8 characters
+ if (C <= 0x1f9f)
+ return C + -8;
+ if (C < 0x1fa8)
+ return C;
+ // 8 characters
+ if (C <= 0x1faf)
+ return C + -8;
+ if (C < 0x1fb8)
+ return C;
+ // 2 characters
+ if (C <= 0x1fb9)
+ return C + -8;
+ if (C < 0x1fba)
+ return C;
+ // 2 characters
+ if (C <= 0x1fbb)
+ return C + -74;
+ // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+ if (C == 0x1fbc)
+ return 0x1fb3;
+ // GREEK PROSGEGRAMMENI
+ if (C == 0x1fbe)
+ return 0x03b9;
+ if (C < 0x1fc8)
+ return C;
+ // 4 characters
+ if (C <= 0x1fcb)
+ return C + -86;
+ // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+ if (C == 0x1fcc)
+ return 0x1fc3;
+ if (C < 0x1fd8)
+ return C;
+ // 2 characters
+ if (C <= 0x1fd9)
+ return C + -8;
+ if (C < 0x1fda)
+ return C;
+ // 2 characters
+ if (C <= 0x1fdb)
+ return C + -100;
+ if (C < 0x1fe8)
+ return C;
+ // 2 characters
+ if (C <= 0x1fe9)
+ return C + -8;
+ if (C < 0x1fea)
+ return C;
+ // 2 characters
+ if (C <= 0x1feb)
+ return C + -112;
+ // GREEK CAPITAL LETTER RHO WITH DASIA
+ if (C == 0x1fec)
+ return 0x1fe5;
+ if (C < 0x1ff8)
+ return C;
+ // 2 characters
+ if (C <= 0x1ff9)
+ return C + -128;
+ if (C < 0x1ffa)
+ return C;
+ // 2 characters
+ if (C <= 0x1ffb)
+ return C + -126;
+ // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+ if (C == 0x1ffc)
+ return 0x1ff3;
+ // OHM SIGN
+ if (C == 0x2126)
+ return 0x03c9;
+ // KELVIN SIGN
+ if (C == 0x212a)
+ return 0x006b;
+ // ANGSTROM SIGN
+ if (C == 0x212b)
+ return 0x00e5;
+ // TURNED CAPITAL F
+ if (C == 0x2132)
+ return 0x214e;
+ if (C < 0x2160)
+ return C;
+ // 16 characters
+ if (C <= 0x216f)
+ return C + 16;
+ // ROMAN NUMERAL REVERSED ONE HUNDRED
+ if (C == 0x2183)
+ return 0x2184;
+ if (C < 0x24b6)
+ return C;
+ // 26 characters
+ if (C <= 0x24cf)
+ return C + 26;
+ if (C < 0x2c00)
+ return C;
+ // 47 characters
+ if (C <= 0x2c2e)
+ return C + 48;
+ // LATIN CAPITAL LETTER L WITH DOUBLE BAR
+ if (C == 0x2c60)
+ return 0x2c61;
+ // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
+ if (C == 0x2c62)
+ return 0x026b;
+ // LATIN CAPITAL LETTER P WITH STROKE
+ if (C == 0x2c63)
+ return 0x1d7d;
+ // LATIN CAPITAL LETTER R WITH TAIL
+ if (C == 0x2c64)
+ return 0x027d;
+ if (C < 0x2c67)
+ return C;
+ // 3 characters
+ if (C <= 0x2c6b && C % 2 == 1)
+ return C + 1;
+ // LATIN CAPITAL LETTER ALPHA
+ if (C == 0x2c6d)
+ return 0x0251;
+ // LATIN CAPITAL LETTER M WITH HOOK
+ if (C == 0x2c6e)
+ return 0x0271;
+ // LATIN CAPITAL LETTER TURNED A
+ if (C == 0x2c6f)
+ return 0x0250;
+ // LATIN CAPITAL LETTER TURNED ALPHA
+ if (C == 0x2c70)
+ return 0x0252;
+ if (C < 0x2c72)
+ return C;
+ // 2 characters
+ if (C <= 0x2c75 && C % 3 == 2)
+ return C + 1;
+ if (C < 0x2c7e)
+ return C;
+ // 2 characters
+ if (C <= 0x2c7f)
+ return C + -10815;
+ if (C < 0x2c80)
+ return C;
+ // 50 characters
+ if (C <= 0x2ce2)
+ return C | 1;
+ if (C < 0x2ceb)
+ return C;
+ // 2 characters
+ if (C <= 0x2ced && C % 2 == 1)
+ return C + 1;
+ if (C < 0x2cf2)
+ return C;
+ // 2 characters
+ if (C <= 0xa640 && C % 31054 == 11506)
+ return C + 1;
+ if (C < 0xa642)
+ return C;
+ // 22 characters
+ if (C <= 0xa66c)
+ return C | 1;
+ if (C < 0xa680)
+ return C;
+ // 14 characters
+ if (C <= 0xa69a)
+ return C | 1;
+ if (C < 0xa722)
+ return C;
+ // 7 characters
+ if (C <= 0xa72e)
+ return C | 1;
+ if (C < 0xa732)
+ return C;
+ // 31 characters
+ if (C <= 0xa76e)
+ return C | 1;
+ if (C < 0xa779)
+ return C;
+ // 2 characters
+ if (C <= 0xa77b && C % 2 == 1)
+ return C + 1;
+ // LATIN CAPITAL LETTER INSULAR G
+ if (C == 0xa77d)
+ return 0x1d79;
+ if (C < 0xa77e)
+ return C;
+ // 5 characters
+ if (C <= 0xa786)
+ return C | 1;
+ // LATIN CAPITAL LETTER SALTILLO
+ if (C == 0xa78b)
+ return 0xa78c;
+ // LATIN CAPITAL LETTER TURNED H
+ if (C == 0xa78d)
+ return 0x0265;
+ if (C < 0xa790)
+ return C;
+ // 2 characters
+ if (C <= 0xa792)
+ return C | 1;
+ if (C < 0xa796)
+ return C;
+ // 10 characters
+ if (C <= 0xa7a8)
+ return C | 1;
+ // LATIN CAPITAL LETTER H WITH HOOK
+ if (C == 0xa7aa)
+ return 0x0266;
+ // LATIN CAPITAL LETTER REVERSED OPEN E
+ if (C == 0xa7ab)
+ return 0x025c;
+ // LATIN CAPITAL LETTER SCRIPT G
+ if (C == 0xa7ac)
+ return 0x0261;
+ // LATIN CAPITAL LETTER L WITH BELT
+ if (C == 0xa7ad)
+ return 0x026c;
+ // LATIN CAPITAL LETTER SMALL CAPITAL I
+ if (C == 0xa7ae)
+ return 0x026a;
+ // LATIN CAPITAL LETTER TURNED K
+ if (C == 0xa7b0)
+ return 0x029e;
+ // LATIN CAPITAL LETTER TURNED T
+ if (C == 0xa7b1)
+ return 0x0287;
+ // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
+ if (C == 0xa7b2)
+ return 0x029d;
+ // LATIN CAPITAL LETTER CHI
+ if (C == 0xa7b3)
+ return 0xab53;
+ if (C < 0xa7b4)
+ return C;
+ // 2 characters
+ if (C <= 0xa7b6)
+ return C | 1;
+ if (C < 0xab70)
+ return C;
+ // 80 characters
+ if (C <= 0xabbf)
+ return C + -38864;
+ if (C < 0xff21)
+ return C;
+ // 26 characters
+ if (C <= 0xff3a)
+ return C + 32;
+ if (C < 0x10400)
+ return C;
+ // 40 characters
+ if (C <= 0x10427)
+ return C + 40;
+ if (C < 0x104b0)
+ return C;
+ // 36 characters
+ if (C <= 0x104d3)
+ return C + 40;
+ if (C < 0x10c80)
+ return C;
+ // 51 characters
+ if (C <= 0x10cb2)
+ return C + 64;
+ if (C < 0x118a0)
+ return C;
+ // 32 characters
+ if (C <= 0x118bf)
+ return C + 32;
+ if (C < 0x1e900)
+ return C;
+ // 34 characters
+ if (C <= 0x1e921)
+ return C + 34;
+
+ return C;
+}
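
A small usage sketch for the generated table (the helper below is hypothetical, not an LLVM API): two code points match case-insensitively if they fold to the same character under this simple, one-to-one folding.

    #include "llvm/Support/Unicode.h"

    static bool equalsCaseFolded(int A, int B) {
      return llvm::sys::unicode::foldCharSimple(A) ==
             llvm::sys::unicode::foldCharSimple(B);
    }

    // Per the table, 'K', 'k', and U+212A KELVIN SIGN all fold to 0x6b, so
    // equalsCaseFolded('K', 0x212A) returns true.
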
diff --git a/contrib/llvm/lib/Support/Unix/Host.inc b/contrib/llvm/lib/Support/Unix/Host.inc
index 5580e63893c6..b65f84bf4444 100644
--- a/contrib/llvm/lib/Support/Unix/Host.inc
+++ b/contrib/llvm/lib/Support/Unix/Host.inc
@@ -64,5 +64,5 @@ std::string sys::getDefaultTargetTriple() {
TargetTripleString = EnvTriple;
#endif
- return Triple::normalize(TargetTripleString);
+ return TargetTripleString;
}
diff --git a/contrib/llvm/lib/Support/Unix/Memory.inc b/contrib/llvm/lib/Support/Unix/Memory.inc
index 848548d18177..adbfff2f59a5 100644
--- a/contrib/llvm/lib/Support/Unix/Memory.inc
+++ b/contrib/llvm/lib/Support/Unix/Memory.inc
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "Unix.h"
+#include "llvm/Config/config.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
@@ -24,6 +25,10 @@
#include <mach/mach.h>
#endif
+#ifdef __Fuchsia__
+#include <zircon/syscalls.h>
+#endif
+
#if defined(__mips__)
# if defined(__OpenBSD__)
# include <mips64/sysarch.h>
@@ -32,7 +37,7 @@
# endif
#endif
-#ifdef __APPLE__
+#if defined(__APPLE__)
extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
#else
extern "C" void __clear_cache(void *, void*);
@@ -205,6 +210,11 @@ void Memory::InvalidateInstructionCache(const void *Addr,
sys_icache_invalidate(const_cast<void *>(Addr), Len);
# endif
+#elif defined(__Fuchsia__)
+
+ zx_status_t Status = zx_cache_flush(Addr, Len, ZX_CACHE_FLUSH_INSN);
+ assert(Status == ZX_OK && "cannot invalidate instruction cache");
+
#else
# if (defined(__POWERPC__) || defined (__ppc__) || \
diff --git a/contrib/llvm/lib/Support/Unix/Path.inc b/contrib/llvm/lib/Support/Unix/Path.inc
index 2ecb97316c87..7ad57d892ff1 100644
--- a/contrib/llvm/lib/Support/Unix/Path.inc
+++ b/contrib/llvm/lib/Support/Unix/Path.inc
@@ -31,23 +31,8 @@
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif
-#if HAVE_DIRENT_H
-# include <dirent.h>
-# define NAMLEN(dirent) strlen((dirent)->d_name)
-#else
-# define dirent direct
-# define NAMLEN(dirent) (dirent)->d_namlen
-# if HAVE_SYS_NDIR_H
-# include <sys/ndir.h>
-# endif
-# if HAVE_SYS_DIR_H
-# include <sys/dir.h>
-# endif
-# if HAVE_NDIR_H
-# include <ndir.h>
-# endif
-#endif
+#include <dirent.h>
#include <pwd.h>
#ifdef __APPLE__
@@ -108,6 +93,9 @@ using namespace llvm;
namespace llvm {
namespace sys {
namespace fs {
+
+const file_t kInvalidFile = -1;
+
#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__minix) || defined(__FreeBSD_kernel__) || defined(__linux__) || \
defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX)
@@ -380,6 +368,12 @@ static bool is_local_impl(struct STATVFS &Vfs) {
#elif defined(__CYGWIN__)
// Cygwin doesn't expose this information; would need to use Win32 API.
return false;
+#elif defined(__Fuchsia__)
+ // Fuchsia doesn't yet support remote filesystem mounts.
+ return true;
+#elif defined(__HAIKU__)
+ // Haiku doesn't expose this information.
+ return false;
#elif defined(__sun)
// statvfs::f_basetype contains a null-terminated FSType name of the mounted target
StringRef fstype(Vfs.f_basetype);
@@ -530,7 +524,7 @@ static void expandTildeExpr(SmallVectorImpl<char> &Path) {
}
static std::error_code fillStatus(int StatRet, const struct stat &Status,
- file_status &Result) {
+ file_status &Result) {
if (StatRet != 0) {
std::error_code ec(errno, std::generic_category());
if (ec == errc::no_such_file_or_directory)
@@ -643,7 +637,8 @@ std::error_code mapped_file_region::init(int FD, uint64_t Offset,
mapped_file_region::mapped_file_region(int fd, mapmode mode, size_t length,
uint64_t offset, std::error_code &ec)
- : Size(length), Mapping() {
+ : Size(length), Mapping(), Mode(mode) {
+ (void)Mode;
ec = init(fd, offset, mode);
if (ec)
Mapping = nullptr;
@@ -702,7 +697,7 @@ std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
if (cur_dir == nullptr && errno != 0) {
return std::error_code(errno, std::generic_category());
} else if (cur_dir != nullptr) {
- StringRef name(cur_dir->d_name, NAMLEN(cur_dir));
+ StringRef name(cur_dir->d_name);
if ((name.size() == 1 && name[0] == '.') ||
(name.size() == 2 && name[0] == '.' && name[1] == '.'))
return directory_iterator_increment(it);
@@ -729,21 +724,83 @@ static bool hasProcSelfFD() {
}
#endif
-std::error_code openFileForRead(const Twine &Name, int &ResultFD,
- SmallVectorImpl<char> *RealPath) {
- SmallString<128> Storage;
- StringRef P = Name.toNullTerminatedStringRef(Storage);
- int OpenFlags = O_RDONLY;
+static int nativeOpenFlags(CreationDisposition Disp, OpenFlags Flags,
+ FileAccess Access) {
+ int Result = 0;
+ if (Access == FA_Read)
+ Result |= O_RDONLY;
+ else if (Access == FA_Write)
+ Result |= O_WRONLY;
+ else if (Access == (FA_Read | FA_Write))
+ Result |= O_RDWR;
+
+  // This is for compatibility with old code that assumed F_Append implied it
+ // would open an existing file. See Windows/Path.inc for a longer comment.
+ if (Flags & F_Append)
+ Disp = CD_OpenAlways;
+
+ if (Disp == CD_CreateNew) {
+ Result |= O_CREAT; // Create if it doesn't exist.
+ Result |= O_EXCL; // Fail if it does.
+ } else if (Disp == CD_CreateAlways) {
+ Result |= O_CREAT; // Create if it doesn't exist.
+ Result |= O_TRUNC; // Truncate if it does.
+ } else if (Disp == CD_OpenAlways) {
+ Result |= O_CREAT; // Create if it doesn't exist.
+ } else if (Disp == CD_OpenExisting) {
+ // Nothing special, just don't add O_CREAT and we get these semantics.
+ }
+
+ if (Flags & F_Append)
+ Result |= O_APPEND;
+
#ifdef O_CLOEXEC
- OpenFlags |= O_CLOEXEC;
+ if (!(Flags & OF_ChildInherit))
+ Result |= O_CLOEXEC;
#endif
- if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags)) < 0)
+
+ return Result;
+}
+
+std::error_code openFile(const Twine &Name, int &ResultFD,
+ CreationDisposition Disp, FileAccess Access,
+ OpenFlags Flags, unsigned Mode) {
+ int OpenFlags = nativeOpenFlags(Disp, Flags, Access);
+
+ SmallString<128> Storage;
+ StringRef P = Name.toNullTerminatedStringRef(Storage);
+ if ((ResultFD = sys::RetryAfterSignal(-1, ::open, P.begin(), OpenFlags, Mode)) <
+ 0)
return std::error_code(errno, std::generic_category());
#ifndef O_CLOEXEC
- int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
- (void)r;
- assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
+ if (!(Flags & OF_ChildInherit)) {
+ int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
+ (void)r;
+ assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
+ }
#endif
+ return std::error_code();
+}
+
+Expected<int> openNativeFile(const Twine &Name, CreationDisposition Disp,
+ FileAccess Access, OpenFlags Flags,
+ unsigned Mode) {
+
+ int FD;
+ std::error_code EC = openFile(Name, FD, Disp, Access, Flags, Mode);
+ if (EC)
+ return errorCodeToError(EC);
+ return FD;
+}
+
+std::error_code openFileForRead(const Twine &Name, int &ResultFD,
+ OpenFlags Flags,
+ SmallVectorImpl<char> *RealPath) {
+ std::error_code EC =
+ openFile(Name, ResultFD, CD_OpenExisting, FA_Read, Flags, 0666);
+ if (EC)
+ return EC;
+
// Attempt to get the real name of the file, if the user asked
if(!RealPath)
return std::error_code();
@@ -763,6 +820,9 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
if (CharCount > 0)
RealPath->append(Buffer, Buffer + CharCount);
} else {
+ SmallString<128> Storage;
+ StringRef P = Name.toNullTerminatedStringRef(Storage);
+
// Use ::realpath to get the real path name
if (::realpath(P.begin(), Buffer) != nullptr)
RealPath->append(Buffer, Buffer + strlen(Buffer));
@@ -771,41 +831,18 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
return std::error_code();
}
-std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
- sys::fs::OpenFlags Flags, unsigned Mode) {
- // Verify that we don't have both "append" and "excl".
- assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
- "Cannot specify both 'excl' and 'append' file creation flags!");
-
- int OpenFlags = O_CREAT;
-
-#ifdef O_CLOEXEC
- OpenFlags |= O_CLOEXEC;
-#endif
-
- if (Flags & F_RW)
- OpenFlags |= O_RDWR;
- else
- OpenFlags |= O_WRONLY;
-
- if (Flags & F_Append)
- OpenFlags |= O_APPEND;
- else
- OpenFlags |= O_TRUNC;
-
- if (Flags & F_Excl)
- OpenFlags |= O_EXCL;
+Expected<file_t> openNativeFileForRead(const Twine &Name, OpenFlags Flags,
+ SmallVectorImpl<char> *RealPath) {
+ file_t ResultFD;
+ std::error_code EC = openFileForRead(Name, ResultFD, Flags, RealPath);
+ if (EC)
+ return errorCodeToError(EC);
+ return ResultFD;
+}
- SmallString<128> Storage;
- StringRef P = Name.toNullTerminatedStringRef(Storage);
- if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags, Mode)) < 0)
- return std::error_code(errno, std::generic_category());
-#ifndef O_CLOEXEC
- int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
- (void)r;
- assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
-#endif
- return std::error_code();
+void closeFile(file_t &F) {
+ ::close(F);
+ F = kInvalidFile;
}
template <typename T>
@@ -860,12 +897,12 @@ std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
return real_path(Storage, dest, false);
}
- int fd;
- std::error_code EC = openFileForRead(path, fd, &dest);
-
- if (EC)
- return EC;
- ::close(fd);
+ SmallString<128> Storage;
+ StringRef P = path.toNullTerminatedStringRef(Storage);
+ char Buffer[PATH_MAX];
+ if (::realpath(P.begin(), Buffer) == nullptr)
+ return std::error_code(errno, std::generic_category());
+ dest.append(Buffer, Buffer + strlen(Buffer));
return std::error_code();
}
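
To tie this back to the TarWriter change earlier in the patch, a sketch of a caller choosing a disposition under the new API (illustrative wrapper; the POSIX mapping is the one nativeOpenFlags implements above: CD_CreateAlways -> O_CREAT|O_TRUNC, CD_CreateNew -> O_CREAT|O_EXCL, CD_OpenAlways -> O_CREAT, CD_OpenExisting -> neither):

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/FileSystem.h"
    #include <system_error>

    // Create-or-truncate an output file; OF_None requests no extra open flags.
    static std::error_code openOutputFile(const llvm::Twine &Path, int &FD) {
      using namespace llvm::sys::fs;
      return openFileForWrite(Path, FD, CD_CreateAlways, OF_None);
    }
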
diff --git a/contrib/llvm/lib/Support/Unix/Process.inc b/contrib/llvm/lib/Support/Unix/Process.inc
index e43650d707e3..fa515d44f3f2 100644
--- a/contrib/llvm/lib/Support/Unix/Process.inc
+++ b/contrib/llvm/lib/Support/Unix/Process.inc
@@ -14,6 +14,7 @@
#include "Unix.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/MutexGuard.h"
@@ -78,7 +79,7 @@ unsigned Process::getPageSize() {
#elif defined(HAVE_SYSCONF)
static long page_size = ::sysconf(_SC_PAGE_SIZE);
#else
-#warning Cannot get the page size on this machine
+#error Cannot get the page size on this machine
#endif
return static_cast<unsigned>(page_size);
}
@@ -172,15 +173,6 @@ Optional<std::string> Process::GetEnv(StringRef Name) {
return std::string(Val);
}
-std::error_code
-Process::GetArgumentVector(SmallVectorImpl<const char *> &ArgsOut,
- ArrayRef<const char *> ArgsIn,
- SpecificBumpPtrAllocator<char> &) {
- ArgsOut.append(ArgsIn.begin(), ArgsIn.end());
-
- return std::error_code();
-}
-
namespace {
class FDCloser {
public:
@@ -207,7 +199,7 @@ std::error_code Process::FixupStandardFileDescriptors() {
for (int StandardFD : StandardFDs) {
struct stat st;
errno = 0;
- if (RetryAfterSignal(-1, fstat, StandardFD, &st) < 0) {
+ if (RetryAfterSignal(-1, ::fstat, StandardFD, &st) < 0) {
assert(errno && "expected errno to be set if fstat failed!");
// fstat should return EBADF if the file descriptor is closed.
if (errno != EBADF)
@@ -219,7 +211,7 @@ std::error_code Process::FixupStandardFileDescriptors() {
assert(errno == EBADF && "expected errno to have EBADF at this point!");
if (NullFD < 0) {
- if ((NullFD = RetryAfterSignal(-1, open, "/dev/null", O_RDWR)) < 0)
+ if ((NullFD = RetryAfterSignal(-1, ::open, "/dev/null", O_RDWR)) < 0)
return std::error_code(errno, std::generic_category());
}
@@ -369,6 +361,21 @@ static bool terminalHasColors(int fd) {
// Return true if we found a color capabilities for the current terminal.
if (HasColors)
return true;
+#else
+ // When the terminfo database is not available, check if the current terminal
+  // is one of the terminals known to support ANSI color escape codes.
+ if (const char *TermStr = std::getenv("TERM")) {
+ return StringSwitch<bool>(TermStr)
+ .Case("ansi", true)
+ .Case("cygwin", true)
+ .Case("linux", true)
+ .StartsWith("screen", true)
+ .StartsWith("xterm", true)
+ .StartsWith("vt100", true)
+ .StartsWith("rxvt", true)
+ .EndsWith("color", true)
+ .Default(false);
+ }
#endif
// Otherwise, be conservative.
diff --git a/contrib/llvm/lib/Support/Unix/Program.inc b/contrib/llvm/lib/Support/Unix/Program.inc
index 4f791991f3e8..d0abc3763e82 100644
--- a/contrib/llvm/lib/Support/Unix/Program.inc
+++ b/contrib/llvm/lib/Support/Unix/Program.inc
@@ -23,6 +23,7 @@
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/StringSaver.h"
#include "llvm/Support/raw_ostream.h"
#if HAVE_SYS_STAT_H
#include <sys/stat.h>
@@ -164,8 +165,18 @@ static void SetMemoryLimits(unsigned size) {
}
-static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
- const char **Envp, ArrayRef<Optional<StringRef>> Redirects,
+static std::vector<const char *>
+toNullTerminatedCStringArray(ArrayRef<StringRef> Strings, StringSaver &Saver) {
+ std::vector<const char *> Result;
+ for (StringRef S : Strings)
+ Result.push_back(Saver.save(S).data());
+ Result.push_back(nullptr);
+ return Result;
+}
+
+static bool Execute(ProcessInfo &PI, StringRef Program,
+ ArrayRef<StringRef> Args, Optional<ArrayRef<StringRef>> Env,
+ ArrayRef<Optional<StringRef>> Redirects,
unsigned MemoryLimit, std::string *ErrMsg) {
if (!llvm::sys::fs::exists(Program)) {
if (ErrMsg)
@@ -174,6 +185,18 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
return false;
}
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ std::vector<const char *> ArgVector, EnvVector;
+ const char **Argv = nullptr;
+ const char **Envp = nullptr;
+ ArgVector = toNullTerminatedCStringArray(Args, Saver);
+ Argv = ArgVector.data();
+ if (Env) {
+ EnvVector = toNullTerminatedCStringArray(*Env, Saver);
+ Envp = EnvVector.data();
+ }
+
// If this OS has posix_spawn and there is no memory limit being implied, use
// posix_spawn. It is more efficient than fork/exec.
#ifdef HAVE_POSIX_SPAWN
@@ -227,7 +250,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
// positive.
pid_t PID = 0;
int Err = posix_spawn(&PID, Program.str().c_str(), FileActions,
- /*attrp*/nullptr, const_cast<char **>(Args),
+ /*attrp*/ nullptr, const_cast<char **>(Argv),
const_cast<char **>(Envp));
if (FileActions)
@@ -237,6 +260,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
return !MakeErrMsg(ErrMsg, "posix_spawn failed", Err);
PI.Pid = PID;
+ PI.Process = PID;
return true;
}
@@ -279,12 +303,10 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
// Execute!
std::string PathStr = Program;
if (Envp != nullptr)
- execve(PathStr.c_str(),
- const_cast<char **>(Args),
+ execve(PathStr.c_str(), const_cast<char **>(Argv),
const_cast<char **>(Envp));
else
- execv(PathStr.c_str(),
- const_cast<char **>(Args));
+ execv(PathStr.c_str(), const_cast<char **>(Argv));
// If the execve() failed, we should exit. Follow Unix protocol and
// return 127 if the executable was not found, and 126 otherwise.
// Use _exit rather than exit so that atexit functions and static
@@ -300,6 +322,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
}
PI.Pid = child;
+ PI.Process = child;
return true;
}
@@ -404,14 +427,14 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
return WaitResult;
}
- std::error_code sys::ChangeStdinToBinary(){
+std::error_code sys::ChangeStdinToBinary() {
// Do nothing, as Unix doesn't differentiate between text and binary.
- return std::error_code();
+ return std::error_code();
}
- std::error_code sys::ChangeStdoutToBinary(){
+std::error_code sys::ChangeStdoutToBinary() {
// Do nothing, as Unix doesn't differentiate between text and binary.
- return std::error_code();
+ return std::error_code();
}
std::error_code
@@ -432,29 +455,38 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
}
bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program,
- ArrayRef<const char *> Args) {
+ ArrayRef<StringRef> Args) {
static long ArgMax = sysconf(_SC_ARG_MAX);
+  // POSIX requires _POSIX_ARG_MAX to be 4096, which is the lowest possible
+  // value for ARG_MAX on a POSIX-compliant system.
+ static long ArgMin = _POSIX_ARG_MAX;
+
+  // This is the same baseline used by xargs.
+ long EffectiveArgMax = 128 * 1024;
+
+ if (EffectiveArgMax > ArgMax)
+ EffectiveArgMax = ArgMax;
+ else if (EffectiveArgMax < ArgMin)
+ EffectiveArgMax = ArgMin;
// System says no practical limit.
if (ArgMax == -1)
return true;
// Conservatively account for space required by environment variables.
- long HalfArgMax = ArgMax / 2;
+ long HalfArgMax = EffectiveArgMax / 2;
size_t ArgLength = Program.size() + 1;
- for (const char* Arg : Args) {
- size_t length = strlen(Arg);
-
+ for (StringRef Arg : Args) {
// Ensure that we do not exceed the MAX_ARG_STRLEN constant on Linux, which
// does not have a constant unlike what the man pages would have you
// believe. Since this limit is pretty high, perform the check
// unconditionally rather than trying to be aggressive and limiting it to
// Linux only.
- if (length >= (32 * 4096))
+ if (Arg.size() >= (32 * 4096))
return false;
- ArgLength += length + 1;
+ ArgLength += Arg.size() + 1;
if (ArgLength > size_t(HalfArgMax)) {
return false;
}
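
With Execute now taking StringRefs, callers no longer build null-terminated char* arrays themselves; that conversion happens inside via the saver shown above. A minimal caller sketch (assuming the ExecuteAndWait overload that matches the new Execute signature; program path and arguments are illustrative):

    #include "llvm/ADT/Optional.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Program.h"

    int runEcho() {
      llvm::StringRef Program = "/bin/echo";
      llvm::StringRef Args[] = {"echo", "hello"};  // Args[0] is the program name
      return llvm::sys::ExecuteAndWait(Program, Args, /*Env=*/llvm::None);
    }
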
diff --git a/contrib/llvm/lib/Support/Unix/Signals.inc b/contrib/llvm/lib/Support/Unix/Signals.inc
index aaf760c5b616..de26695d64ea 100644
--- a/contrib/llvm/lib/Support/Unix/Signals.inc
+++ b/contrib/llvm/lib/Support/Unix/Signals.inc
@@ -11,9 +11,31 @@
// Unix signals occurring while your program is running.
//
//===----------------------------------------------------------------------===//
+//
+// This file is extremely careful to only do signal-safe things while in a
+// signal handler. In particular, memory allocation and acquiring a mutex
+// while in a signal handler should never occur. ManagedStatic isn't usable from
+// a signal handler for 2 reasons:
+//
+// 1. Creating a new one allocates.
+// 2. The signal handler could fire while llvm_shutdown is being processed, in
+// which case the ManagedStatic is in an unknown state because it could
+// already have been destroyed, or be in the process of being destroyed.
+//
+// Modifying the behavior of the signal handlers (such as registering new ones)
+// can acquire a mutex, but all this guarantees is that the signal handler
+// behavior is only modified by one thread at a time. A signal handler can still
+// fire while this occurs!
+//
+// Adding work to a signal handler requires lock-freedom (and we assume atomics
+// are always lock-free) because the signal handler could fire while new work is
+// being added.
+//
+//===----------------------------------------------------------------------===//
#include "Unix.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Config/config.h"
#include "llvm/Demangle/Demangle.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
@@ -59,24 +81,133 @@ using namespace llvm;
static RETSIGTYPE SignalHandler(int Sig); // defined below.
-static ManagedStatic<sys::SmartMutex<true> > SignalsMutex;
+/// The function to call if ctrl-c is pressed.
+using InterruptFunctionType = void (*)();
+static std::atomic<InterruptFunctionType> InterruptFunction =
+ ATOMIC_VAR_INIT(nullptr);
+
+namespace {
+/// Signal-safe removal of files.
+/// Inserting and erasing from the list isn't signal-safe, but removal of files
+/// themselves is signal-safe. Memory is freed when the head is freed, so
+/// deletion is not signal-safe either.
+class FileToRemoveList {
+ std::atomic<char *> Filename = ATOMIC_VAR_INIT(nullptr);
+ std::atomic<FileToRemoveList *> Next = ATOMIC_VAR_INIT(nullptr);
+
+ FileToRemoveList() = default;
+ // Not signal-safe.
+ FileToRemoveList(const std::string &str) : Filename(strdup(str.c_str())) {}
+
+public:
+ // Not signal-safe.
+ ~FileToRemoveList() {
+ if (FileToRemoveList *N = Next.exchange(nullptr))
+ delete N;
+ if (char *F = Filename.exchange(nullptr))
+ free(F);
+ }
+
+ // Not signal-safe.
+ static void insert(std::atomic<FileToRemoveList *> &Head,
+ const std::string &Filename) {
+ // Insert the new file at the end of the list.
+ FileToRemoveList *NewHead = new FileToRemoveList(Filename);
+ std::atomic<FileToRemoveList *> *InsertionPoint = &Head;
+ FileToRemoveList *OldHead = nullptr;
+ while (!InsertionPoint->compare_exchange_strong(OldHead, NewHead)) {
+ InsertionPoint = &OldHead->Next;
+ OldHead = nullptr;
+ }
+ }
+
+ // Not signal-safe.
+ static void erase(std::atomic<FileToRemoveList *> &Head,
+ const std::string &Filename) {
+ // Use a lock to avoid concurrent erase: the comparison would access
+ // free'd memory.
+ static ManagedStatic<sys::SmartMutex<true>> Lock;
+ sys::SmartScopedLock<true> Writer(*Lock);
+
+ for (FileToRemoveList *Current = Head.load(); Current;
+ Current = Current->Next.load()) {
+ if (char *OldFilename = Current->Filename.load()) {
+ if (OldFilename != Filename)
+ continue;
+ // Leave an empty filename.
+ OldFilename = Current->Filename.exchange(nullptr);
+ // The filename might have become null between the time we
+ // compared it and we exchanged it.
+ if (OldFilename)
+ free(OldFilename);
+ }
+ }
+ }
-/// InterruptFunction - The function to call if ctrl-c is pressed.
-static void (*InterruptFunction)() = nullptr;
+ // Signal-safe.
+ static void removeAllFiles(std::atomic<FileToRemoveList *> &Head) {
+    // If cleanup were to occur while we're removing files, we'd have a bad time.
+    // Make sure we're OK by preventing cleanup from doing anything while we're
+    // removing files. If cleanup races with us and we win, we'll have a leak,
+ // but we won't crash.
+ FileToRemoveList *OldHead = Head.exchange(nullptr);
+
+ for (FileToRemoveList *currentFile = OldHead; currentFile;
+ currentFile = currentFile->Next.load()) {
+      // If erasing was occurring while we're trying to remove files, we'd look
+ // at free'd data. Take away the path and put it back when done.
+ if (char *path = currentFile->Filename.exchange(nullptr)) {
+ // Get the status so we can determine if it's a file or directory. If we
+ // can't stat the file, ignore it.
+ struct stat buf;
+ if (stat(path, &buf) != 0)
+ continue;
+
+ // If this is not a regular file, ignore it. We want to prevent removal
+ // of special files like /dev/null, even if the compiler is being run
+ // with the super-user permissions.
+ if (!S_ISREG(buf.st_mode))
+ continue;
+
+ // Otherwise, remove the file. We ignore any errors here as there is
+ // nothing else we can do.
+ unlink(path);
+
+ // We're done removing the file, erasing can safely proceed.
+ currentFile->Filename.exchange(path);
+ }
+ }
-static ManagedStatic<std::vector<std::string>> FilesToRemove;
+ // We're done removing files, cleanup can safely proceed.
+ Head.exchange(OldHead);
+ }
+};
+static std::atomic<FileToRemoveList *> FilesToRemove = ATOMIC_VAR_INIT(nullptr);
+
+/// Clean up the list in a signal-friendly manner.
+/// Recall that signals can fire during llvm_shutdown. If this occurs we should
+/// either clean something up or nothing at all, but we shouldn't crash!
+struct FilesToRemoveCleanup {
+ // Not signal-safe.
+ ~FilesToRemoveCleanup() {
+ FileToRemoveList *Head = FilesToRemove.exchange(nullptr);
+ if (Head)
+ delete Head;
+ }
+};
+} // namespace
static StringRef Argv0;
-// IntSigs - Signals that represent requested termination. There's no bug
-// or failure, or if there is, it's not our direct responsibility. For whatever
-// reason, our continued execution is no longer desirable.
+// Signals that represent requested termination. There's no bug or failure, or
+// if there is, it's not our direct responsibility. For whatever reason, our
+// continued execution is no longer desirable.
static const int IntSigs[] = {
SIGHUP, SIGINT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2
};
-// KillSigs - Signals that represent that we have a bug, and our prompt
-// termination has been ordered.
+// Signals that represent that we have a bug, and our prompt termination has
+// been ordered.
static const int KillSigs[] = {
SIGILL, SIGTRAP, SIGABRT, SIGFPE, SIGBUS, SIGSEGV, SIGQUIT
#ifdef SIGSYS
@@ -93,30 +224,12 @@ static const int KillSigs[] = {
#endif
};
-static unsigned NumRegisteredSignals = 0;
+static std::atomic<unsigned> NumRegisteredSignals = ATOMIC_VAR_INIT(0);
static struct {
struct sigaction SA;
int SigNo;
} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)];
-
-static void RegisterHandler(int Signal) {
- assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) &&
- "Out of space for signal handlers!");
-
- struct sigaction NewHandler;
-
- NewHandler.sa_handler = SignalHandler;
- NewHandler.sa_flags = SA_NODEFER | SA_RESETHAND | SA_ONSTACK;
- sigemptyset(&NewHandler.sa_mask);
-
- // Install the new handler, save the old one in RegisteredSignalInfo.
- sigaction(Signal, &NewHandler,
- &RegisteredSignalInfo[NumRegisteredSignals].SA);
- RegisteredSignalInfo[NumRegisteredSignals].SigNo = Signal;
- ++NumRegisteredSignals;
-}
-
#if defined(HAVE_SIGALTSTACK)
// Hold onto both the old and new alternate signal stack so that it's not
// reported as a leak. We don't make any attempt to remove our alt signal
@@ -138,7 +251,7 @@ static void CreateSigAltStack() {
return;
stack_t AltStack = {};
- AltStack.ss_sp = reinterpret_cast<char *>(malloc(AltStackSize));
+ AltStack.ss_sp = static_cast<char *>(safe_malloc(AltStackSize));
NewAltStackPointer = AltStack.ss_sp; // Save to avoid reporting a leak.
AltStack.ss_size = AltStackSize;
if (sigaltstack(&AltStack, &OldAltStack) != 0)
@@ -148,64 +261,59 @@ static void CreateSigAltStack() {
static void CreateSigAltStack() {}
#endif
-static void RegisterHandlers() {
- sys::SmartScopedLock<true> Guard(*SignalsMutex);
+static void RegisterHandlers() { // Not signal-safe.
+ // The mutex prevents other threads from registering handlers while we're
+ // doing it. We also have to protect the handlers and their count because
+  // a signal handler could fire while we're registering handlers.
+ static ManagedStatic<sys::SmartMutex<true>> SignalHandlerRegistrationMutex;
+ sys::SmartScopedLock<true> Guard(*SignalHandlerRegistrationMutex);
// If the handlers are already registered, we're done.
- if (NumRegisteredSignals != 0) return;
+ if (NumRegisteredSignals.load() != 0)
+ return;
// Create an alternate stack for signal handling. This is necessary for us to
// be able to reliably handle signals due to stack overflow.
CreateSigAltStack();
- for (auto S : IntSigs) RegisterHandler(S);
- for (auto S : KillSigs) RegisterHandler(S);
+ auto registerHandler = [&](int Signal) {
+ unsigned Index = NumRegisteredSignals.load();
+ assert(Index < array_lengthof(RegisteredSignalInfo) &&
+ "Out of space for signal handlers!");
+
+ struct sigaction NewHandler;
+
+ NewHandler.sa_handler = SignalHandler;
+ NewHandler.sa_flags = SA_NODEFER | SA_RESETHAND | SA_ONSTACK;
+ sigemptyset(&NewHandler.sa_mask);
+
+ // Install the new handler, save the old one in RegisteredSignalInfo.
+ sigaction(Signal, &NewHandler, &RegisteredSignalInfo[Index].SA);
+ RegisteredSignalInfo[Index].SigNo = Signal;
+ ++NumRegisteredSignals;
+ };
+
+ for (auto S : IntSigs)
+ registerHandler(S);
+ for (auto S : KillSigs)
+ registerHandler(S);
}
static void UnregisterHandlers() {
// Restore all of the signal handlers to how they were before we showed up.
- for (unsigned i = 0, e = NumRegisteredSignals; i != e; ++i)
+ for (unsigned i = 0, e = NumRegisteredSignals.load(); i != e; ++i) {
sigaction(RegisteredSignalInfo[i].SigNo,
&RegisteredSignalInfo[i].SA, nullptr);
- NumRegisteredSignals = 0;
+ --NumRegisteredSignals;
+ }
}
-
-/// RemoveFilesToRemove - Process the FilesToRemove list. This function
-/// should be called with the SignalsMutex lock held.
-/// NB: This must be an async signal safe function. It cannot allocate or free
-/// memory, even in debug builds.
+/// Process the FilesToRemove list.
static void RemoveFilesToRemove() {
- // Avoid constructing ManagedStatic in the signal handler.
- // If FilesToRemove is not constructed, there are no files to remove.
- if (!FilesToRemove.isConstructed())
- return;
-
- // We avoid iterators in case of debug iterators that allocate or release
- // memory.
- std::vector<std::string>& FilesToRemoveRef = *FilesToRemove;
- for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i) {
- const char *path = FilesToRemoveRef[i].c_str();
-
- // Get the status so we can determine if it's a file or directory. If we
- // can't stat the file, ignore it.
- struct stat buf;
- if (stat(path, &buf) != 0)
- continue;
-
- // If this is not a regular file, ignore it. We want to prevent removal of
- // special files like /dev/null, even if the compiler is being run with the
- // super-user permissions.
- if (!S_ISREG(buf.st_mode))
- continue;
-
- // Otherwise, remove the file. We ignore any errors here as there is nothing
- // else we can do.
- unlink(path);
- }
+ FileToRemoveList::removeAllFiles(FilesToRemove);
}
-// SignalHandler - The signal handler that runs.
+// The signal handler that runs.
static RETSIGTYPE SignalHandler(int Sig) {
// Restore the signal behavior to default, so that the program actually
// crashes when we return and the signal reissues. This also ensures that if
@@ -219,20 +327,13 @@ static RETSIGTYPE SignalHandler(int Sig) {
sigprocmask(SIG_UNBLOCK, &SigMask, nullptr);
{
- unique_lock<sys::SmartMutex<true>> Guard(*SignalsMutex);
RemoveFilesToRemove();
if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig)
!= std::end(IntSigs)) {
- if (InterruptFunction) {
- void (*IF)() = InterruptFunction;
- Guard.unlock();
- InterruptFunction = nullptr;
- IF(); // run the interrupt function.
- return;
- }
+ if (auto OldInterruptFunction = InterruptFunction.exchange(nullptr))
+ return OldInterruptFunction();
- Guard.unlock();
raise(Sig); // Execute the default handler.
return;
}
@@ -252,45 +353,36 @@ static RETSIGTYPE SignalHandler(int Sig) {
}
void llvm::sys::RunInterruptHandlers() {
- sys::SmartScopedLock<true> Guard(*SignalsMutex);
RemoveFilesToRemove();
}
void llvm::sys::SetInterruptFunction(void (*IF)()) {
- {
- sys::SmartScopedLock<true> Guard(*SignalsMutex);
- InterruptFunction = IF;
- }
+ InterruptFunction.exchange(IF);
RegisterHandlers();
}
-// RemoveFileOnSignal - The public API
+// The public API
bool llvm::sys::RemoveFileOnSignal(StringRef Filename,
std::string* ErrMsg) {
- {
- sys::SmartScopedLock<true> Guard(*SignalsMutex);
- FilesToRemove->push_back(Filename);
- }
-
+ // Ensure that cleanup will occur as soon as one file is added.
+ static ManagedStatic<FilesToRemoveCleanup> FilesToRemoveCleanup;
+ *FilesToRemoveCleanup;
+ FileToRemoveList::insert(FilesToRemove, Filename.str());
RegisterHandlers();
return false;
}
-// DontRemoveFileOnSignal - The public API
+// The public API
void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) {
- sys::SmartScopedLock<true> Guard(*SignalsMutex);
- std::vector<std::string>::reverse_iterator RI =
- find(reverse(*FilesToRemove), Filename);
- std::vector<std::string>::iterator I = FilesToRemove->end();
- if (RI != FilesToRemove->rend())
- I = FilesToRemove->erase(RI.base()-1);
+ FileToRemoveList::erase(FilesToRemove, Filename.str());
}
-/// AddSignalHandler - Add a function to be called when a signal is delivered
-/// to the process. The handler can have a cookie passed to it to identify
-/// what instance of the handler it is.
-void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
- CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie));
+/// Add a function to be called when a signal is delivered to the process. The
+/// handler can have a cookie passed to it to identify what instance of the
+/// handler it is.
+void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
+ void *Cookie) { // Signal-safe.
+ insertSignalHandler(FnPtr, Cookie);
RegisterHandlers();
}
@@ -383,8 +475,8 @@ static int unwindBacktrace(void **StackTrace, int MaxEntries) {
}
#endif
-// PrintStackTrace - In the case of a program crash or fault, print out a stack
-// trace so that the user has an indication of why and where we died.
+// In the case of a program crash or fault, print out a stack trace so that the
+// user has an indication of why and where we died.
//
// On glibc systems we have the 'backtrace' function, which works nicely, but
// doesn't demangle symbols.
@@ -463,8 +555,8 @@ static void PrintStackTraceSignalHandler(void *) {
void llvm::sys::DisableSystemDialogsOnCrash() {}
-/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
-/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
+/// When an error signal (such as SIGABRT or SIGSEGV) is delivered to the
+/// process, print a stack trace and then exit.
void llvm::sys::PrintStackTraceOnErrorSignal(StringRef Argv0,
bool DisableCrashReporting) {
::Argv0 = Argv0;
diff --git a/contrib/llvm/lib/Support/Unix/ThreadLocal.inc b/contrib/llvm/lib/Support/Unix/ThreadLocal.inc
index 31c3f3835b29..a6564f0fa281 100644
--- a/contrib/llvm/lib/Support/Unix/ThreadLocal.inc
+++ b/contrib/llvm/lib/Support/Unix/ThreadLocal.inc
@@ -16,6 +16,8 @@
//=== is guaranteed to work on *all* UNIX variants.
//===----------------------------------------------------------------------===//
+#include "llvm/Config/config.h"
+
#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_GETSPECIFIC)
#include <cassert>
diff --git a/contrib/llvm/lib/Support/Unix/Threading.inc b/contrib/llvm/lib/Support/Unix/Threading.inc
index 7369cff8466c..2d49ce1ad747 100644
--- a/contrib/llvm/lib/Support/Unix/Threading.inc
+++ b/contrib/llvm/lib/Support/Unix/Threading.inc
@@ -21,8 +21,8 @@
#include <pthread.h>
-#if defined(__FreeBSD__)
-#include <pthread_np.h> // For pthread_getthreadid_np()
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
+#include <pthread_np.h> // For pthread_getthreadid_np() / pthread_set_name_np()
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -98,8 +98,6 @@ uint64_t llvm::get_threadid() {
return uint64_t(gettid());
#elif defined(__linux__)
return uint64_t(syscall(SYS_gettid));
-#elif defined(LLVM_ON_WIN32)
- return uint64_t(::GetCurrentThreadId());
#else
return uint64_t(pthread_self());
#endif
@@ -119,6 +117,8 @@ static constexpr uint32_t get_max_thread_name_length_impl() {
#endif
#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
return 16;
+#elif defined(__OpenBSD__)
+ return 32;
#else
return 0;
#endif
@@ -138,8 +138,9 @@ void llvm::set_thread_name(const Twine &Name) {
// terminated, but additionally the end of a long thread name will usually
// be more unique than the beginning, since a common pattern is for similar
// threads to share a common prefix.
+ // Note that the name length includes the null terminator.
if (get_max_thread_name_length() > 0)
- NameStr = NameStr.take_back(get_max_thread_name_length());
+ NameStr = NameStr.take_back(get_max_thread_name_length() - 1);
(void)NameStr;
#if defined(__linux__)
#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
@@ -147,7 +148,7 @@ void llvm::set_thread_name(const Twine &Name) {
::pthread_setname_np(::pthread_self(), NameStr.data());
#endif
#endif
-#elif defined(__FreeBSD__)
+#elif defined(__FreeBSD__) || defined(__OpenBSD__)
::pthread_set_name_np(::pthread_self(), NameStr.data());
#elif defined(__NetBSD__)
::pthread_setname_np(::pthread_self(), "%s",
@@ -175,7 +176,7 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
if (kp == nullptr || (error != 0 && errno == ENOMEM)) {
// Add extra space in case threads are added before next call.
len += sizeof(*kp) + len / 10;
- nkp = (struct kinfo_proc *)realloc(kp, len);
+ nkp = (struct kinfo_proc *)::realloc(kp, len);
if (nkp == nullptr) {
free(kp);
return;
@@ -203,7 +204,6 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
Name.append(buf, buf + strlen(buf));
#elif defined(__linux__)
-#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
#if HAVE_PTHREAD_GETNAME_NP
constexpr uint32_t len = get_max_thread_name_length_impl();
char Buffer[len] = {'\0'}; // FIXME: working around MSan false positive.
@@ -211,5 +211,4 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
Name.append(Buffer, Buffer + strlen(Buffer));
#endif
#endif
-#endif
}
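To illustrate the truncation above, a small sketch assuming OpenBSD's 32-byte limit (which counts the terminating NUL):

// With a 32-byte limit only the last 31 characters survive, keeping the more
// distinctive suffix of a long thread name.
llvm::StringRef Name = "llvm-worker-thread-for-module-17"; // 32 characters
llvm::StringRef Kept = Name.take_back(32 - 1);             // last 31 characters
// Kept == "lvm-worker-thread-for-module-17"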
diff --git a/contrib/llvm/lib/Support/Unix/Unix.h b/contrib/llvm/lib/Support/Unix/Unix.h
index 239a6d60aaef..0c5d4de556d5 100644
--- a/contrib/llvm/lib/Support/Unix/Unix.h
+++ b/contrib/llvm/lib/Support/Unix/Unix.h
@@ -56,7 +56,7 @@
/// This function builds an error message into \p ErrMsg using the \p prefix
/// string and the Unix error number given by \p errnum. If errnum is -1 (the
/// default), then the value of errno is used.
-/// @brief Make an error message
+/// Make an error message
///
/// If the error number can be converted to a string, it will be
/// separated from prefix by ": ".
diff --git a/contrib/llvm/lib/Support/Unix/Watchdog.inc b/contrib/llvm/lib/Support/Unix/Watchdog.inc
index 5d89c0e51b11..f4253391d952 100644
--- a/contrib/llvm/lib/Support/Unix/Watchdog.inc
+++ b/contrib/llvm/lib/Support/Unix/Watchdog.inc
@@ -11,6 +11,8 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Config/config.h"
+
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
diff --git a/contrib/llvm/lib/Support/VersionTuple.cpp b/contrib/llvm/lib/Support/VersionTuple.cpp
new file mode 100644
index 000000000000..3f219bfbedfa
--- /dev/null
+++ b/contrib/llvm/lib/Support/VersionTuple.cpp
@@ -0,0 +1,110 @@
+//===- VersionTuple.cpp - Version Number Handling ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VersionTuple class, which represents a version in
+// the form major[.minor[.subminor]].
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Support/VersionTuple.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+std::string VersionTuple::getAsString() const {
+ std::string Result;
+ {
+ llvm::raw_string_ostream Out(Result);
+ Out << *this;
+ }
+ return Result;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &Out, const VersionTuple &V) {
+ Out << V.getMajor();
+ if (Optional<unsigned> Minor = V.getMinor())
+ Out << '.' << *Minor;
+ if (Optional<unsigned> Subminor = V.getSubminor())
+ Out << '.' << *Subminor;
+ if (Optional<unsigned> Build = V.getBuild())
+ Out << '.' << *Build;
+ return Out;
+}
+
+static bool parseInt(StringRef &input, unsigned &value) {
+ assert(value == 0);
+ if (input.empty())
+ return true;
+
+ char next = input[0];
+ input = input.substr(1);
+ if (next < '0' || next > '9')
+ return true;
+ value = (unsigned)(next - '0');
+
+ while (!input.empty()) {
+ next = input[0];
+ if (next < '0' || next > '9')
+ return false;
+ input = input.substr(1);
+ value = value * 10 + (unsigned)(next - '0');
+ }
+
+ return false;
+}
+
+bool VersionTuple::tryParse(StringRef input) {
+ unsigned major = 0, minor = 0, micro = 0, build = 0;
+
+ // Parse the major version, [0-9]+
+ if (parseInt(input, major))
+ return true;
+
+ if (input.empty()) {
+ *this = VersionTuple(major);
+ return false;
+ }
+
+ // If we're not done, parse the minor version, \.[0-9]+
+ if (input[0] != '.')
+ return true;
+ input = input.substr(1);
+ if (parseInt(input, minor))
+ return true;
+
+ if (input.empty()) {
+ *this = VersionTuple(major, minor);
+ return false;
+ }
+
+ // If we're not done, parse the micro version, \.[0-9]+
+ if (input[0] != '.')
+ return true;
+ input = input.substr(1);
+ if (parseInt(input, micro))
+ return true;
+
+ if (input.empty()) {
+ *this = VersionTuple(major, minor, micro);
+ return false;
+ }
+
+ // If we're not done, parse the build version, \.[0-9]+
+ if (input[0] != '.')
+ return true;
+ input = input.substr(1);
+ if (parseInt(input, build))
+ return true;
+
+ // If we have characters left over, it's an error.
+ if (!input.empty())
+ return true;
+
+ *this = VersionTuple(major, minor, micro, build);
+ return false;
+}
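A hedged usage sketch for the parser above; note the inverted convention that tryParse returns false on success:

llvm::VersionTuple V;
if (!V.tryParse("10.0.1")) {
  // Parsed successfully: V.getMajor() == 10, *V.getMinor() == 0,
  // *V.getSubminor() == 1, and V.getBuild() is empty.
}
bool Failed = V.tryParse("10..1"); // true: an empty minor component is rejected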
diff --git a/contrib/llvm/lib/Support/Watchdog.cpp b/contrib/llvm/lib/Support/Watchdog.cpp
index 724aa001f16e..be55e3122e70 100644
--- a/contrib/llvm/lib/Support/Watchdog.cpp
+++ b/contrib/llvm/lib/Support/Watchdog.cpp
@@ -12,12 +12,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Watchdog.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Watchdog.inc"
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/Watchdog.inc"
#endif
diff --git a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
index 083ea902eeb2..1d47f0848a6d 100644
--- a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
+++ b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "WindowsSupport.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/raw_ostream.h"
#include <psapi.h>
diff --git a/contrib/llvm/lib/Support/Windows/Host.inc b/contrib/llvm/lib/Support/Windows/Host.inc
index 90a6fb316703..58c4dc5d678f 100644
--- a/contrib/llvm/lib/Support/Windows/Host.inc
+++ b/contrib/llvm/lib/Support/Windows/Host.inc
@@ -30,5 +30,5 @@ std::string sys::getDefaultTargetTriple() {
Triple = EnvTriple;
#endif
- return Triple::normalize(Triple);
+ return Triple;
}
diff --git a/contrib/llvm/lib/Support/Windows/Path.inc b/contrib/llvm/lib/Support/Windows/Path.inc
index f81790b17df5..f425d607af47 100644
--- a/contrib/llvm/lib/Support/Windows/Path.inc
+++ b/contrib/llvm/lib/Support/Windows/Path.inc
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/WindowsError.h"
#include <fcntl.h>
#include <io.h>
@@ -45,6 +46,7 @@ typedef int errno_t;
using namespace llvm;
using llvm::sys::windows::UTF8ToUTF16;
+using llvm::sys::windows::CurCPToUTF16;
using llvm::sys::windows::UTF16ToUTF8;
using llvm::sys::path::widenPath;
@@ -121,6 +123,8 @@ std::error_code widenPath(const Twine &Path8,
namespace fs {
+const file_t kInvalidFile = INVALID_HANDLE_VALUE;
+
std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
SmallVector<wchar_t, MAX_PATH> PathName;
DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.capacity());
@@ -400,56 +404,6 @@ static std::error_code setDeleteDisposition(HANDLE Handle, bool Delete) {
return std::error_code();
}
-static std::error_code removeFD(int FD) {
- HANDLE Handle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
- return setDeleteDisposition(Handle, true);
-}
-
-/// In order to handle temporary files we want the following properties
-///
-/// * The temporary file is deleted on crashes
-/// * We can use (read, rename, etc) the temporary file.
-/// * We can cancel the delete to keep the file.
-///
-/// Using FILE_DISPOSITION_INFO with DeleteFile=true will create a file that is
-/// deleted on close, but it has a few problems:
-///
-/// * The file cannot be used. An attempt to open or rename the file will fail.
-/// This makes the temporary file almost useless, as it cannot be part of
-/// any other CreateFileW call in the current or in another process.
-/// * It is not atomic. A crash just after CreateFileW or just after canceling
-/// the delete will leave the file on disk.
-///
-/// Using FILE_FLAG_DELETE_ON_CLOSE solves the first issues and the first part
-/// of the second one, but there is no way to cancel it in place. What works is
-/// to create a second handle to prevent the deletion, close the first one and
-/// then clear DeleteFile with SetFileInformationByHandle. This requires
-/// changing the handle and file descriptor the caller uses.
-static std::error_code cancelDeleteOnClose(int &FD) {
- HANDLE Handle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
- SmallVector<wchar_t, MAX_PATH> Name;
- if (std::error_code EC = realPathFromHandle(Handle, Name))
- return EC;
- HANDLE NewHandle =
- ::CreateFileW(Name.data(), GENERIC_READ | GENERIC_WRITE | DELETE,
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
- NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
- if (NewHandle == INVALID_HANDLE_VALUE)
- return mapWindowsError(::GetLastError());
- if (close(FD))
- return mapWindowsError(::GetLastError());
-
- if (std::error_code EC = setDeleteDisposition(NewHandle, false))
- return EC;
-
- FD = ::_open_osfhandle(intptr_t(NewHandle), 0);
- if (FD == -1) {
- ::CloseHandle(NewHandle);
- return mapWindowsError(ERROR_INVALID_HANDLE);
- }
- return std::error_code();
-}
-
static std::error_code rename_internal(HANDLE FromHandle, const Twine &To,
bool ReplaceIfExists) {
SmallVector<wchar_t, 0> ToWide;
@@ -822,8 +776,9 @@ std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time) {
std::error_code mapped_file_region::init(int FD, uint64_t Offset,
mapmode Mode) {
- HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
- if (FileHandle == INVALID_HANDLE_VALUE)
+ this->Mode = Mode;
+ HANDLE OrigFileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+ if (OrigFileHandle == INVALID_HANDLE_VALUE)
return make_error_code(errc::bad_file_descriptor);
DWORD flprotect;
@@ -834,7 +789,7 @@ std::error_code mapped_file_region::init(int FD, uint64_t Offset,
}
HANDLE FileMappingHandle =
- ::CreateFileMappingW(FileHandle, 0, flprotect,
+ ::CreateFileMappingW(OrigFileHandle, 0, flprotect,
Hi_32(Size),
Lo_32(Size),
0);
@@ -872,9 +827,20 @@ std::error_code mapped_file_region::init(int FD, uint64_t Offset,
Size = mbi.RegionSize;
}
- // Close all the handles except for the view. It will keep the other handles
- // alive.
+ // Close the file mapping handle, as it's kept alive by the file mapping. But
+ // neither the file mapping nor the file mapping handle keep the file handle
+ // alive, so we need to keep a reference to the file in case all other handles
+ // are closed and the file is deleted, which may cause invalid data to be read
+ // from the file.
::CloseHandle(FileMappingHandle);
+ if (!::DuplicateHandle(::GetCurrentProcess(), OrigFileHandle,
+ ::GetCurrentProcess(), &FileHandle, 0, 0,
+ DUPLICATE_SAME_ACCESS)) {
+ std::error_code ec = mapWindowsError(GetLastError());
+ ::UnmapViewOfFile(Mapping);
+ return ec;
+ }
+
return std::error_code();
}
@@ -887,8 +853,20 @@ mapped_file_region::mapped_file_region(int fd, mapmode mode, size_t length,
}
mapped_file_region::~mapped_file_region() {
- if (Mapping)
+ if (Mapping) {
::UnmapViewOfFile(Mapping);
+
+ if (Mode == mapmode::readwrite) {
+ // There is a Windows kernel bug, the exact trigger conditions of which
+ // are not well understood. When triggered, dirty pages are not properly
+ // flushed and subsequent process's attempts to read a file can return
+ // invalid data. Calling FlushFileBuffers on the write handle is
+ // sufficient to ensure that this bug is not triggered.
+ ::FlushFileBuffers(FileHandle);
+ }
+
+ ::CloseHandle(FileHandle);
+ }
}
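The destructor's flush relies on the handle duplicated in init(); a minimal sketch of that duplicate-and-flush pattern (plain Win32, error handling elided, names hypothetical):

// Hold our own reference to the file so the mapping never outlives it, and
// flush dirty pages at teardown to avoid the kernel bug described above.
HANDLE Dup;
::DuplicateHandle(::GetCurrentProcess(), OrigFileHandle,
                  ::GetCurrentProcess(), &Dup, 0, FALSE,
                  DUPLICATE_SAME_ACCESS);
// ... use the mapped view ...
::FlushFileBuffers(Dup); // only needed for read-write mappings
::CloseHandle(Dup);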
size_t mapped_file_region::size() const {
@@ -1017,35 +995,82 @@ ErrorOr<basic_file_status> directory_entry::status() const {
return Status;
}
-static std::error_code directoryRealPath(const Twine &Name,
- SmallVectorImpl<char> &RealPath) {
- SmallVector<wchar_t, 128> PathUTF16;
+static std::error_code nativeFileToFd(Expected<HANDLE> H, int &ResultFD,
+ OpenFlags Flags) {
+ int CrtOpenFlags = 0;
+ if (Flags & OF_Append)
+ CrtOpenFlags |= _O_APPEND;
- if (std::error_code EC = widenPath(Name, PathUTF16))
- return EC;
+ if (Flags & OF_Text)
+ CrtOpenFlags |= _O_TEXT;
- HANDLE H =
- ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
- NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
- if (H == INVALID_HANDLE_VALUE)
- return mapWindowsError(GetLastError());
- std::error_code EC = realPathFromHandle(H, RealPath);
- ::CloseHandle(H);
- return EC;
+ ResultFD = -1;
+ if (!H)
+ return errorToErrorCode(H.takeError());
+
+ ResultFD = ::_open_osfhandle(intptr_t(*H), CrtOpenFlags);
+ if (ResultFD == -1) {
+ ::CloseHandle(*H);
+ return mapWindowsError(ERROR_INVALID_HANDLE);
+ }
+ return std::error_code();
}
-std::error_code openFileForRead(const Twine &Name, int &ResultFD,
- SmallVectorImpl<char> *RealPath) {
+static DWORD nativeDisposition(CreationDisposition Disp, OpenFlags Flags) {
+ // This is a compatibility hack. Really we should respect the creation
+ // disposition, but a lot of old code relied on the implicit assumption that
+ // OF_Append implied it would open an existing file. Since the disposition is
+ // now explicit and defaults to CD_CreateAlways, this assumption would cause
+ // any usage of OF_Append to append to a new file, even if the file already
+ // existed. A better solution might have two new creation dispositions:
+ // CD_AppendAlways and CD_AppendNew. This would also address the problem of
+ // OF_Append being used on a read-only descriptor, which doesn't make sense.
+ if (Flags & OF_Append)
+ return OPEN_ALWAYS;
+
+ switch (Disp) {
+ case CD_CreateAlways:
+ return CREATE_ALWAYS;
+ case CD_CreateNew:
+ return CREATE_NEW;
+ case CD_OpenAlways:
+ return OPEN_ALWAYS;
+ case CD_OpenExisting:
+ return OPEN_EXISTING;
+ }
+ llvm_unreachable("unreachable!");
+}
+
+static DWORD nativeAccess(FileAccess Access, OpenFlags Flags) {
+ DWORD Result = 0;
+ if (Access & FA_Read)
+ Result |= GENERIC_READ;
+ if (Access & FA_Write)
+ Result |= GENERIC_WRITE;
+ if (Flags & OF_Delete)
+ Result |= DELETE;
+ if (Flags & OF_UpdateAtime)
+ Result |= FILE_WRITE_ATTRIBUTES;
+ return Result;
+}
+
+static std::error_code openNativeFileInternal(const Twine &Name,
+ file_t &ResultFile, DWORD Disp,
+ DWORD Access, DWORD Flags,
+ bool Inherit = false) {
SmallVector<wchar_t, 128> PathUTF16;
-
if (std::error_code EC = widenPath(Name, PathUTF16))
return EC;
+ SECURITY_ATTRIBUTES SA;
+ SA.nLength = sizeof(SA);
+ SA.lpSecurityDescriptor = nullptr;
+ SA.bInheritHandle = Inherit;
+
HANDLE H =
- ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
- NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+ ::CreateFileW(PathUTF16.begin(), Access,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, &SA,
+ Disp, Flags, NULL);
if (H == INVALID_HANDLE_VALUE) {
DWORD LastError = ::GetLastError();
std::error_code EC = mapWindowsError(LastError);
@@ -1058,82 +1083,96 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
return make_error_code(errc::is_a_directory);
return EC;
}
+ ResultFile = H;
+ return std::error_code();
+}
- int FD = ::_open_osfhandle(intptr_t(H), 0);
- if (FD == -1) {
- ::CloseHandle(H);
- return mapWindowsError(ERROR_INVALID_HANDLE);
+Expected<file_t> openNativeFile(const Twine &Name, CreationDisposition Disp,
+ FileAccess Access, OpenFlags Flags,
+ unsigned Mode) {
+ // Verify that we don't have both "append" and "excl".
+ assert((!(Disp == CD_CreateNew) || !(Flags & OF_Append)) &&
+ "Cannot specify both 'CreateNew' and 'Append' file creation flags!");
+
+ DWORD NativeDisp = nativeDisposition(Disp, Flags);
+ DWORD NativeAccess = nativeAccess(Access, Flags);
+
+ bool Inherit = false;
+ if (Flags & OF_ChildInherit)
+ Inherit = true;
+
+ file_t Result;
+ std::error_code EC = openNativeFileInternal(
+ Name, Result, NativeDisp, NativeAccess, FILE_ATTRIBUTE_NORMAL, Inherit);
+ if (EC)
+ return errorCodeToError(EC);
+
+ if (Flags & OF_UpdateAtime) {
+ FILETIME FileTime;
+ SYSTEMTIME SystemTime;
+ GetSystemTime(&SystemTime);
+ if (SystemTimeToFileTime(&SystemTime, &FileTime) == 0 ||
+ SetFileTime(Result, NULL, &FileTime, NULL) == 0) {
+ DWORD LastError = ::GetLastError();
+ ::CloseHandle(Result);
+ return errorCodeToError(mapWindowsError(LastError));
+ }
}
- // Fetch the real name of the file, if the user asked
- if (RealPath)
- realPathFromHandle(H, *RealPath);
-
- ResultFD = FD;
- return std::error_code();
+ if (Flags & OF_Delete) {
+ if ((EC = setDeleteDisposition(Result, true))) {
+ ::CloseHandle(Result);
+ return errorCodeToError(EC);
+ }
+ }
+ return Result;
}
-std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
- sys::fs::OpenFlags Flags, unsigned Mode) {
- // Verify that we don't have both "append" and "excl".
- assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
- "Cannot specify both 'excl' and 'append' file creation flags!");
+std::error_code openFile(const Twine &Name, int &ResultFD,
+ CreationDisposition Disp, FileAccess Access,
+ OpenFlags Flags, unsigned int Mode) {
+ Expected<file_t> Result = openNativeFile(Name, Disp, Access, Flags);
+ if (!Result)
+ return errorToErrorCode(Result.takeError());
- SmallVector<wchar_t, 128> PathUTF16;
+ return nativeFileToFd(*Result, ResultFD, Flags);
+}
- if (std::error_code EC = widenPath(Name, PathUTF16))
+static std::error_code directoryRealPath(const Twine &Name,
+ SmallVectorImpl<char> &RealPath) {
+ file_t File;
+ std::error_code EC = openNativeFileInternal(
+ Name, File, OPEN_EXISTING, GENERIC_READ, FILE_FLAG_BACKUP_SEMANTICS);
+ if (EC)
return EC;
- DWORD CreationDisposition;
- if (Flags & F_Excl)
- CreationDisposition = CREATE_NEW;
- else if (Flags & F_Append)
- CreationDisposition = OPEN_ALWAYS;
- else
- CreationDisposition = CREATE_ALWAYS;
-
- DWORD Access = GENERIC_WRITE;
- DWORD Attributes = FILE_ATTRIBUTE_NORMAL;
- if (Flags & F_RW)
- Access |= GENERIC_READ;
- if (Flags & F_Delete) {
- Access |= DELETE;
- Attributes |= FILE_FLAG_DELETE_ON_CLOSE;
- }
-
- HANDLE H =
- ::CreateFileW(PathUTF16.data(), Access,
- FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
- NULL, CreationDisposition, Attributes, NULL);
+ EC = realPathFromHandle(File, RealPath);
+ ::CloseHandle(File);
+ return EC;
+}
- if (H == INVALID_HANDLE_VALUE) {
- DWORD LastError = ::GetLastError();
- std::error_code EC = mapWindowsError(LastError);
- // Provide a better error message when trying to open directories.
- // This only runs if we failed to open the file, so there is probably
- // no performances issues.
- if (LastError != ERROR_ACCESS_DENIED)
- return EC;
- if (is_directory(Name))
- return make_error_code(errc::is_a_directory);
- return EC;
- }
+std::error_code openFileForRead(const Twine &Name, int &ResultFD,
+ OpenFlags Flags,
+ SmallVectorImpl<char> *RealPath) {
+ Expected<HANDLE> NativeFile = openNativeFileForRead(Name, Flags, RealPath);
+ return nativeFileToFd(std::move(NativeFile), ResultFD, OF_None);
+}
- int OpenFlags = 0;
- if (Flags & F_Append)
- OpenFlags |= _O_APPEND;
+Expected<file_t> openNativeFileForRead(const Twine &Name, OpenFlags Flags,
+ SmallVectorImpl<char> *RealPath) {
+ Expected<file_t> Result =
+ openNativeFile(Name, CD_OpenExisting, FA_Read, Flags);
- if (Flags & F_Text)
- OpenFlags |= _O_TEXT;
+ // Fetch the real name of the file, if the user asked
+ if (Result && RealPath)
+ realPathFromHandle(*Result, *RealPath);
- int FD = ::_open_osfhandle(intptr_t(H), OpenFlags);
- if (FD == -1) {
- ::CloseHandle(H);
- return mapWindowsError(ERROR_INVALID_HANDLE);
- }
+ return Result;
+}
- ResultFD = FD;
- return std::error_code();
+void closeFile(file_t &F) {
+ ::CloseHandle(F);
+ F = kInvalidFile;
}
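A hedged sketch of how a caller would use the new disposition-based entry point (the path is illustrative, and the 0666 mode is effectively ignored on Windows):

static std::error_code appendToLog(const llvm::Twine &Path) {
  using namespace llvm::sys::fs;
  int FD;
  // OF_Append forces OPEN_ALWAYS per the compatibility note above, so the
  // file is created if missing and appended to otherwise.
  if (std::error_code EC =
          openFile(Path, FD, CD_OpenAlways, FA_Write, OF_Append, 0666))
    return EC;
  // ... write through FD, then close it ...
  return std::error_code();
}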
std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
@@ -1204,7 +1243,8 @@ std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
return directoryRealPath(path, dest);
int fd;
- if (std::error_code EC = llvm::sys::fs::openFileForRead(path, fd, &dest))
+ if (std::error_code EC =
+ llvm::sys::fs::openFileForRead(path, fd, OF_None, &dest))
return EC;
::close(fd);
return std::error_code();
@@ -1279,23 +1319,26 @@ void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) {
} // end namespace path
namespace windows {
-std::error_code UTF8ToUTF16(llvm::StringRef utf8,
- llvm::SmallVectorImpl<wchar_t> &utf16) {
- if (!utf8.empty()) {
- int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8.begin(),
- utf8.size(), utf16.begin(), 0);
-
- if (len == 0)
+std::error_code CodePageToUTF16(unsigned codepage,
+ llvm::StringRef original,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
+ if (!original.empty()) {
+ int len = ::MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, original.begin(),
+ original.size(), utf16.begin(), 0);
+
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
utf16.reserve(len + 1);
utf16.set_size(len);
- len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8.begin(),
- utf8.size(), utf16.begin(), utf16.size());
+ len = ::MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, original.begin(),
+ original.size(), utf16.begin(), utf16.size());
- if (len == 0)
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
}
// Make utf16 null terminated.
@@ -1305,32 +1348,44 @@ std::error_code UTF8ToUTF16(llvm::StringRef utf8,
return std::error_code();
}
+std::error_code UTF8ToUTF16(llvm::StringRef utf8,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
+ return CodePageToUTF16(CP_UTF8, utf8, utf16);
+}
+
+std::error_code CurCPToUTF16(llvm::StringRef curcp,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
+ return CodePageToUTF16(CP_ACP, curcp, utf16);
+}
+
static
std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16,
size_t utf16_len,
- llvm::SmallVectorImpl<char> &utf8) {
+ llvm::SmallVectorImpl<char> &converted) {
if (utf16_len) {
// Get length.
- int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.begin(),
+ int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, converted.begin(),
0, NULL, NULL);
- if (len == 0)
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
- utf8.reserve(len);
- utf8.set_size(len);
+ converted.reserve(len);
+ converted.set_size(len);
// Now do the actual conversion.
- len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.data(),
- utf8.size(), NULL, NULL);
+ len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, converted.data(),
+ converted.size(), NULL, NULL);
- if (len == 0)
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
}
- // Make utf8 null terminated.
- utf8.push_back(0);
- utf8.pop_back();
+ // Make the new string null terminated.
+ converted.push_back(0);
+ converted.pop_back();
return std::error_code();
}
@@ -1341,8 +1396,8 @@ std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
}
std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
- llvm::SmallVectorImpl<char> &utf8) {
- return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8);
+ llvm::SmallVectorImpl<char> &curcp) {
+ return UTF16ToCodePage(CP_ACP, utf16, utf16_len, curcp);
}
} // end namespace windows
diff --git a/contrib/llvm/lib/Support/Windows/Process.inc b/contrib/llvm/lib/Support/Windows/Process.inc
index 3fe9f89f1ef5..30126568769c 100644
--- a/contrib/llvm/lib/Support/Windows/Process.inc
+++ b/contrib/llvm/lib/Support/Windows/Process.inc
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/WindowsError.h"
#include <malloc.h>
@@ -24,14 +25,7 @@
#include <psapi.h>
#include <shellapi.h>
-#ifdef __MINGW32__
- #if (HAVE_LIBPSAPI != 1)
- #error "libpsapi.a should be present"
- #endif
- #if (HAVE_LIBSHELL32 != 1)
- #error "libshell32.a should be present"
- #endif
-#else
+#if !defined(__MINGW32__)
#pragma comment(lib, "psapi.lib")
#pragma comment(lib, "shell32.lib")
#endif
@@ -146,39 +140,38 @@ Optional<std::string> Process::GetEnv(StringRef Name) {
return std::string(Res.data());
}
-static void AllocateAndPush(const SmallVectorImpl<char> &S,
- SmallVectorImpl<const char *> &Vector,
- SpecificBumpPtrAllocator<char> &Allocator) {
- char *Buffer = Allocator.Allocate(S.size() + 1);
- ::memcpy(Buffer, S.data(), S.size());
- Buffer[S.size()] = '\0';
- Vector.push_back(Buffer);
+static const char *AllocateString(const SmallVectorImpl<char> &S,
+ BumpPtrAllocator &Alloc) {
+ char *Buf = reinterpret_cast<char *>(Alloc.Allocate(S.size() + 1, 1));
+ ::memcpy(Buf, S.data(), S.size());
+ Buf[S.size()] = '\0';
+ return Buf;
}
/// Convert Arg from UTF-16 to UTF-8 and push it onto Args.
-static std::error_code
-ConvertAndPushArg(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
- SpecificBumpPtrAllocator<char> &Allocator) {
+static std::error_code ConvertAndPushArg(const wchar_t *Arg,
+ SmallVectorImpl<const char *> &Args,
+ BumpPtrAllocator &Alloc) {
SmallVector<char, MAX_PATH> ArgString;
if (std::error_code ec = windows::UTF16ToUTF8(Arg, wcslen(Arg), ArgString))
return ec;
- AllocateAndPush(ArgString, Args, Allocator);
+ Args.push_back(AllocateString(ArgString, Alloc));
return std::error_code();
}
-/// \brief Perform wildcard expansion of Arg, or just push it into Args if it
+/// Perform wildcard expansion of Arg, or just push it into Args if it
/// doesn't have wildcards or doesn't match any files.
-static std::error_code
-WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
- SpecificBumpPtrAllocator<char> &Allocator) {
+static std::error_code WildcardExpand(const wchar_t *Arg,
+ SmallVectorImpl<const char *> &Args,
+ BumpPtrAllocator &Alloc) {
if (!wcspbrk(Arg, L"*?")) {
// Arg does not contain any wildcard characters. This is the common case.
- return ConvertAndPushArg(Arg, Args, Allocator);
+ return ConvertAndPushArg(Arg, Args, Alloc);
}
if (wcscmp(Arg, L"/?") == 0 || wcscmp(Arg, L"-?") == 0) {
// Don't wildcard expand /?. Always treat it as an option.
- return ConvertAndPushArg(Arg, Args, Allocator);
+ return ConvertAndPushArg(Arg, Args, Alloc);
}
// Extract any directory part of the argument.
@@ -195,7 +188,7 @@ WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
WIN32_FIND_DATAW FileData;
HANDLE FindHandle = FindFirstFileW(Arg, &FileData);
if (FindHandle == INVALID_HANDLE_VALUE) {
- return ConvertAndPushArg(Arg, Args, Allocator);
+ return ConvertAndPushArg(Arg, Args, Alloc);
}
std::error_code ec;
@@ -208,7 +201,7 @@ WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
// Append FileName to Dir, and remove it afterwards.
llvm::sys::path::append(Dir, StringRef(FileName.data(), FileName.size()));
- AllocateAndPush(Dir, Args, Allocator);
+ Args.push_back(AllocateString(Dir, Alloc));
Dir.resize(DirSize);
} while (FindNextFileW(FindHandle, &FileData));
@@ -216,56 +209,65 @@ WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
return ec;
}
-static std::error_code
-ExpandShortFileName(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
- SpecificBumpPtrAllocator<char> &Allocator) {
- SmallVector<wchar_t, MAX_PATH> LongPath;
- DWORD Length = GetLongPathNameW(Arg, LongPath.data(), LongPath.capacity());
+static std::error_code GetExecutableName(SmallVectorImpl<char> &Filename) {
+ // The first argument may contain just the name of the executable (e.g.,
+ // "clang") rather than the full path, so swap it with the full path.
+ wchar_t ModuleName[MAX_PATH];
+ size_t Length = ::GetModuleFileNameW(NULL, ModuleName, MAX_PATH);
+ if (Length == 0 || Length == MAX_PATH) {
+ return mapWindowsError(GetLastError());
+ }
+
+ // If the first argument is a shortened (8.3) name (which is possible even
+ // if we got the module name), the driver will have trouble distinguishing it
+ // (e.g., clang.exe v. clang++.exe), so expand it now.
+ Length = GetLongPathNameW(ModuleName, ModuleName, MAX_PATH);
if (Length == 0)
return mapWindowsError(GetLastError());
- if (Length > LongPath.capacity()) {
+ if (Length > MAX_PATH) {
// We're not going to try to deal with paths longer than MAX_PATH, so we'll
// treat this as an error. GetLastError() returns ERROR_SUCCESS, which
// isn't useful, so we'll hardcode an appropriate error value.
return mapWindowsError(ERROR_INSUFFICIENT_BUFFER);
}
- LongPath.set_size(Length);
- return ConvertAndPushArg(LongPath.data(), Args, Allocator);
+
+ std::error_code EC = windows::UTF16ToUTF8(ModuleName, Length, Filename);
+ if (EC)
+ return EC;
+
+ StringRef Base = sys::path::filename(Filename.data());
+ Filename.assign(Base.begin(), Base.end());
+ return std::error_code();
}
std::error_code
-Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
- ArrayRef<const char *>,
- SpecificBumpPtrAllocator<char> &ArgAllocator) {
+windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args,
+ BumpPtrAllocator &Alloc) {
int ArgCount;
- wchar_t **UnicodeCommandLine =
- CommandLineToArgvW(GetCommandLineW(), &ArgCount);
+ std::unique_ptr<wchar_t *[], decltype(&LocalFree)> UnicodeCommandLine{
+ CommandLineToArgvW(GetCommandLineW(), &ArgCount), &LocalFree};
if (!UnicodeCommandLine)
return mapWindowsError(::GetLastError());
- Args.reserve(ArgCount);
- std::error_code ec;
+ std::error_code EC;
- // The first argument may contain just the name of the executable (e.g.,
- // "clang") rather than the full path, so swap it with the full path.
- wchar_t ModuleName[MAX_PATH];
- int Length = ::GetModuleFileNameW(NULL, ModuleName, MAX_PATH);
- if (0 < Length && Length < MAX_PATH)
- UnicodeCommandLine[0] = ModuleName;
-
- // If the first argument is a shortened (8.3) name (which is possible even
- // if we got the module name), the driver will have trouble distinguishing it
- // (e.g., clang.exe v. clang++.exe), so expand it now.
- ec = ExpandShortFileName(UnicodeCommandLine[0], Args, ArgAllocator);
+ Args.reserve(ArgCount);
- for (int i = 1; i < ArgCount && !ec; ++i) {
- ec = WildcardExpand(UnicodeCommandLine[i], Args, ArgAllocator);
- if (ec)
- break;
+ for (int I = 0; I < ArgCount; ++I) {
+ EC = WildcardExpand(UnicodeCommandLine[I], Args, Alloc);
+ if (EC)
+ return EC;
}
- LocalFree(UnicodeCommandLine);
- return ec;
+ SmallVector<char, MAX_PATH> Arg0(Args[0], Args[0] + strlen(Args[0]));
+ SmallVector<char, MAX_PATH> Filename;
+ sys::path::remove_filename(Arg0);
+ EC = GetExecutableName(Filename);
+ if (EC)
+ return EC;
+ sys::path::append(Arg0, Filename);
+ Args[0] = AllocateString(Arg0, Alloc);
+ return std::error_code();
}
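A hedged sketch of a call site for the new helper (the fatal-error handling is illustrative):

// Fetch UTF-8 argv on Windows regardless of the console code page; Args[0]
// comes back as the full, long-form executable name.
llvm::BumpPtrAllocator Alloc;
llvm::SmallVector<const char *, 20> Args;
if (std::error_code EC =
        llvm::sys::windows::GetCommandLineArguments(Args, Alloc))
  llvm::report_fatal_error("could not read the command line");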
std::error_code Process::FixupStandardFileDescriptors() {
diff --git a/contrib/llvm/lib/Support/Windows/Program.inc b/contrib/llvm/lib/Support/Windows/Program.inc
index 52921cd6a203..cb68c5b10e52 100644
--- a/contrib/llvm/lib/Support/Windows/Program.inc
+++ b/contrib/llvm/lib/Support/Windows/Program.inc
@@ -16,12 +16,14 @@
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/WindowsError.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdio>
#include <fcntl.h>
#include <io.h>
#include <malloc.h>
+#include <numeric>
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only Win32 specific code
@@ -30,7 +32,7 @@
namespace llvm {
-ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
+ProcessInfo::ProcessInfo() : Pid(0), Process(0), ReturnCode(0) {}
ErrorOr<std::string> sys::findProgramByName(StringRef Name,
ArrayRef<StringRef> Paths) {
@@ -145,112 +147,11 @@ static HANDLE RedirectIO(Optional<StringRef> Path, int fd,
return h;
}
-/// ArgNeedsQuotes - Check whether argument needs to be quoted when calling
-/// CreateProcess.
-static bool ArgNeedsQuotes(const char *Str) {
- return Str[0] == '\0' || strpbrk(Str, "\t \"&\'()*<>\\`^|") != 0;
}
-/// CountPrecedingBackslashes - Returns the number of backslashes preceding Cur
-/// in the C string Start.
-static unsigned int CountPrecedingBackslashes(const char *Start,
- const char *Cur) {
- unsigned int Count = 0;
- --Cur;
- while (Cur >= Start && *Cur == '\\') {
- ++Count;
- --Cur;
- }
- return Count;
-}
-
-/// EscapePrecedingEscapes - Append a backslash to Dst for every backslash
-/// preceding Cur in the Start string. Assumes Dst has enough space.
-static char *EscapePrecedingEscapes(char *Dst, const char *Start,
- const char *Cur) {
- unsigned PrecedingEscapes = CountPrecedingBackslashes(Start, Cur);
- while (PrecedingEscapes > 0) {
- *Dst++ = '\\';
- --PrecedingEscapes;
- }
- return Dst;
-}
-
-/// ArgLenWithQuotes - Check whether argument needs to be quoted when calling
-/// CreateProcess and returns length of quoted arg with escaped quotes
-static unsigned int ArgLenWithQuotes(const char *Str) {
- const char *Start = Str;
- bool Quoted = ArgNeedsQuotes(Str);
- unsigned int len = Quoted ? 2 : 0;
-
- while (*Str != '\0') {
- if (*Str == '\"') {
- // We need to add a backslash, but ensure that it isn't escaped.
- unsigned PrecedingEscapes = CountPrecedingBackslashes(Start, Str);
- len += PrecedingEscapes + 1;
- }
- // Note that we *don't* need to escape runs of backslashes that don't
- // precede a double quote! See MSDN:
- // http://msdn.microsoft.com/en-us/library/17w5ykft%28v=vs.85%29.aspx
-
- ++len;
- ++Str;
- }
-
- if (Quoted) {
- // Make sure the closing quote doesn't get escaped by a trailing backslash.
- unsigned PrecedingEscapes = CountPrecedingBackslashes(Start, Str);
- len += PrecedingEscapes + 1;
- }
-
- return len;
-}
-
-}
-
-static std::unique_ptr<char[]> flattenArgs(const char **Args) {
- // First, determine the length of the command line.
- unsigned len = 0;
- for (unsigned i = 0; Args[i]; i++) {
- len += ArgLenWithQuotes(Args[i]) + 1;
- }
-
- // Now build the command line.
- std::unique_ptr<char[]> command(new char[len+1]);
- char *p = command.get();
-
- for (unsigned i = 0; Args[i]; i++) {
- const char *arg = Args[i];
- const char *start = arg;
-
- bool needsQuoting = ArgNeedsQuotes(arg);
- if (needsQuoting)
- *p++ = '"';
-
- while (*arg != '\0') {
- if (*arg == '\"') {
- // Escape all preceding escapes (if any), and then escape the quote.
- p = EscapePrecedingEscapes(p, start, arg);
- *p++ = '\\';
- }
-
- *p++ = *arg++;
- }
-
- if (needsQuoting) {
- // Make sure our quote doesn't get escaped by a trailing backslash.
- p = EscapePrecedingEscapes(p, start, arg);
- *p++ = '"';
- }
- *p++ = ' ';
- }
-
- *p = 0;
- return command;
-}
-
-static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
- const char **Envp, ArrayRef<Optional<StringRef>> Redirects,
+static bool Execute(ProcessInfo &PI, StringRef Program,
+ ArrayRef<StringRef> Args, Optional<ArrayRef<StringRef>> Env,
+ ArrayRef<Optional<StringRef>> Redirects,
unsigned MemoryLimit, std::string *ErrMsg) {
if (!sys::fs::can_execute(Program)) {
if (ErrMsg)
@@ -269,18 +170,18 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
// Windows wants a command line, not an array of args, to pass to the new
// process. We have to concatenate them all, while quoting the args that
// have embedded spaces (or are empty).
- std::unique_ptr<char[]> command = flattenArgs(Args);
+ std::string Command = flattenWindowsCommandLine(Args);
// The pointer to the environment block for the new process.
std::vector<wchar_t> EnvBlock;
- if (Envp) {
+ if (Env) {
// An environment block consists of a null-terminated block of
// null-terminated strings. Convert the array of environment variables to
// an environment block by concatenating them.
- for (unsigned i = 0; Envp[i]; ++i) {
+ for (const auto E : *Env) {
SmallVector<wchar_t, MAX_PATH> EnvString;
- if (std::error_code ec = windows::UTF8ToUTF16(Envp[i], EnvString)) {
+ if (std::error_code ec = windows::UTF8ToUTF16(E, EnvString)) {
SetLastError(ec.value());
MakeErrMsg(ErrMsg, "Unable to convert environment variable to UTF-16");
return false;
@@ -352,7 +253,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
}
SmallVector<wchar_t, MAX_PATH> CommandUtf16;
- if (std::error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) {
+ if (std::error_code ec = windows::UTF8ToUTF16(Command, CommandUtf16)) {
SetLastError(ec.value());
MakeErrMsg(ErrMsg,
std::string("Unable to convert command-line to UTF-16"));
@@ -380,7 +281,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
}
PI.Pid = pi.dwProcessId;
- PI.ProcessHandle = pi.hProcess;
+ PI.Process = pi.hProcess;
// Make sure these get closed no matter what.
ScopedCommonHandle hThread(pi.hThread);
@@ -413,11 +314,67 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **Args,
return true;
}
+static bool argNeedsQuotes(StringRef Arg) {
+ if (Arg.empty())
+ return true;
+ return StringRef::npos != Arg.find_first_of("\t \"&\'()*<>\\`^|");
+}
+
+static std::string quoteSingleArg(StringRef Arg) {
+ std::string Result;
+ Result.push_back('"');
+
+ while (!Arg.empty()) {
+ size_t FirstNonBackslash = Arg.find_first_not_of('\\');
+ size_t BackslashCount = FirstNonBackslash;
+ if (FirstNonBackslash == StringRef::npos) {
+ // The entire remainder of the argument is backslashes. Escape all of
+ // them and just early out.
+ BackslashCount = Arg.size();
+ Result.append(BackslashCount * 2, '\\');
+ break;
+ }
+
+ if (Arg[FirstNonBackslash] == '\"') {
+ // This is an embedded quote. Escape all preceding backslashes, then
+ // add one additional backslash to escape the quote.
+ Result.append(BackslashCount * 2 + 1, '\\');
+ Result.push_back('\"');
+ } else {
+ // This is just a normal character. Don't escape any of the preceding
+ // backslashes, just append them as they are and then append the
+ // character.
+ Result.append(BackslashCount, '\\');
+ Result.push_back(Arg[FirstNonBackslash]);
+ }
+
+ // Drop all the backslashes, plus the following character.
+ Arg = Arg.drop_front(FirstNonBackslash + 1);
+ }
+
+ Result.push_back('"');
+ return Result;
+}
+
namespace llvm {
+std::string sys::flattenWindowsCommandLine(ArrayRef<StringRef> Args) {
+ std::string Command;
+ for (StringRef Arg : Args) {
+ if (argNeedsQuotes(Arg))
+ Command += quoteSingleArg(Arg);
+ else
+ Command += Arg;
+
+ Command.push_back(' ');
+ }
+
+ return Command;
+}
+
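Worked examples of the quoting rules above (outputs shown as the raw characters that end up in the command line):

//   he llo     ->  "he llo"        space forces quoting
//   a"b        ->  "a\"b"          embedded quote escaped with one backslash
//   c:\path\   ->  "c:\path\\"     trailing backslashes doubled so they don't
//                                  escape the closing quote
std::string Cmd = llvm::sys::flattenWindowsCommandLine({"clang", "he llo"});
// Cmd is now: clang "he llo"   (each argument is followed by a space)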
ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
bool WaitUntilChildTerminates, std::string *ErrMsg) {
assert(PI.Pid && "invalid pid to wait on, process not started?");
- assert(PI.ProcessHandle &&
+ assert((PI.Process && PI.Process != INVALID_HANDLE_VALUE) &&
"invalid process handle to wait on, process not started?");
DWORD milliSecondsToWait = 0;
if (WaitUntilChildTerminates)
@@ -426,20 +383,20 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
milliSecondsToWait = SecondsToWait * 1000;
ProcessInfo WaitResult = PI;
- DWORD WaitStatus = WaitForSingleObject(PI.ProcessHandle, milliSecondsToWait);
+ DWORD WaitStatus = WaitForSingleObject(PI.Process, milliSecondsToWait);
if (WaitStatus == WAIT_TIMEOUT) {
if (SecondsToWait) {
- if (!TerminateProcess(PI.ProcessHandle, 1)) {
+ if (!TerminateProcess(PI.Process, 1)) {
if (ErrMsg)
MakeErrMsg(ErrMsg, "Failed to terminate timed-out program");
// -2 indicates a crash or timeout as opposed to failure to execute.
WaitResult.ReturnCode = -2;
- CloseHandle(PI.ProcessHandle);
+ CloseHandle(PI.Process);
return WaitResult;
}
- WaitForSingleObject(PI.ProcessHandle, INFINITE);
- CloseHandle(PI.ProcessHandle);
+ WaitForSingleObject(PI.Process, INFINITE);
+ CloseHandle(PI.Process);
} else {
// Non-blocking wait.
return ProcessInfo();
@@ -448,10 +405,10 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
// Get its exit status.
DWORD status;
- BOOL rc = GetExitCodeProcess(PI.ProcessHandle, &status);
+ BOOL rc = GetExitCodeProcess(PI.Process, &status);
DWORD err = GetLastError();
if (err != ERROR_INVALID_HANDLE)
- CloseHandle(PI.ProcessHandle);
+ CloseHandle(PI.Process);
if (!rc) {
SetLastError(err);
@@ -495,7 +452,7 @@ std::error_code
llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
WindowsEncodingMethod Encoding) {
std::error_code EC;
- llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text);
+ llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::F_Text);
if (EC)
return EC;
@@ -536,19 +493,13 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
}
bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program,
- ArrayRef<const char *> Args) {
+ ArrayRef<StringRef> Args) {
// The documented max length of the command line passed to CreateProcess.
static const size_t MaxCommandStringLength = 32768;
- // Account for the trailing space for the program path and the
- // trailing NULL of the last argument.
- size_t ArgLength = ArgLenWithQuotes(Program.str().c_str()) + 2;
- for (const char* Arg : Args) {
- // Account for the trailing space for every arg
- ArgLength += ArgLenWithQuotes(Arg) + 1;
- if (ArgLength > MaxCommandStringLength) {
- return false;
- }
- }
- return true;
+ SmallVector<StringRef, 8> FullArgs;
+ FullArgs.push_back(Program);
+ FullArgs.append(Args.begin(), Args.end());
+ std::string Result = flattenWindowsCommandLine(FullArgs);
+ return (Result.size() + 1) <= MaxCommandStringLength;
}
}
diff --git a/contrib/llvm/lib/Support/Windows/RWMutex.inc b/contrib/llvm/lib/Support/Windows/RWMutex.inc
index ac60c2fc05be..5eb9351eee52 100644
--- a/contrib/llvm/lib/Support/Windows/RWMutex.inc
+++ b/contrib/llvm/lib/Support/Windows/RWMutex.inc
@@ -74,10 +74,10 @@ static bool loadSRW() {
sys::RWMutexImpl::RWMutexImpl() {
if (loadSRW()) {
- data_ = calloc(1, sizeof(SRWLOCK));
+ data_ = safe_calloc(1, sizeof(SRWLOCK));
fpInitializeSRWLock(static_cast<PSRWLOCK>(data_));
} else {
- data_ = calloc(1, sizeof(CRITICAL_SECTION));
+ data_ = safe_calloc(1, sizeof(CRITICAL_SECTION));
InitializeCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
}
}
diff --git a/contrib/llvm/lib/Support/Windows/Signals.inc b/contrib/llvm/lib/Support/Windows/Signals.inc
index 21dd2dd13754..41eb5e593aa5 100644
--- a/contrib/llvm/lib/Support/Windows/Signals.inc
+++ b/contrib/llvm/lib/Support/Windows/Signals.inc
@@ -10,6 +10,7 @@
// This file provides the Win32 specific implementation of the Signals class.
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
@@ -36,9 +37,6 @@
#ifdef _MSC_VER
#pragma comment(lib, "psapi.lib")
#elif __MINGW32__
- #if (HAVE_LIBPSAPI != 1)
- #error "libpsapi.a should be present"
- #endif
// The version of g++ that comes with MinGW does *not* properly understand
// the ll format specifier for printf. However, MinGW passes the format
// specifiers on to the MSVCRT entirely, and the CRT understands the ll
@@ -193,7 +191,7 @@ using namespace llvm;
static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep);
static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType);
-// InterruptFunction - The function to call if ctrl-c is pressed.
+// The function to call if ctrl-c is pressed.
static void (*InterruptFunction)() = 0;
static std::vector<std::string> *FilesToRemove = NULL;
@@ -390,9 +388,9 @@ namespace llvm {
//===----------------------------------------------------------------------===//
#ifdef _MSC_VER
-/// AvoidMessageBoxHook - Emulates hitting "retry" from an "abort, retry,
-/// ignore" CRT debug report dialog. "retry" raises an exception which
-/// ultimately triggers our stack dumper.
+/// Emulates hitting "retry" from an "abort, retry, ignore" CRT debug report
+/// dialog. "retry" raises an exception which ultimately triggers our stack
+/// dumper.
static LLVM_ATTRIBUTE_UNUSED int
AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
// Set *Return to the retry code for the return value of _CrtDbgReport:
@@ -450,7 +448,7 @@ static void RegisterHandler() {
// else multi-threading problems will ensue.
}
-// RemoveFileOnSignal - The public API
+// The public API
bool sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) {
RegisterHandler();
@@ -469,7 +467,7 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) {
return false;
}
-// DontRemoveFileOnSignal - The public API
+// The public API
void sys::DontRemoveFileOnSignal(StringRef Filename) {
if (FilesToRemove == NULL)
return;
@@ -503,8 +501,8 @@ void sys::DisableSystemDialogsOnCrash() {
_set_error_mode(_OUT_TO_STDERR);
}
-/// PrintStackTraceOnErrorSignal - When an error signal (such as SIBABRT or
-/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
+/// When an error signal (such as SIGABRT or SIGSEGV) is delivered to the
+/// process, print a stack trace and then exit.
void sys::PrintStackTraceOnErrorSignal(StringRef Argv0,
bool DisableCrashReporting) {
::Argv0 = Argv0;
@@ -536,10 +534,14 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
StackFrame.AddrPC.Offset = Context.Eip;
StackFrame.AddrStack.Offset = Context.Esp;
StackFrame.AddrFrame.Offset = Context.Ebp;
-#elif defined(_M_ARM64) || defined(_M_ARM)
+#elif defined(_M_ARM64)
StackFrame.AddrPC.Offset = Context.Pc;
StackFrame.AddrStack.Offset = Context.Sp;
StackFrame.AddrFrame.Offset = Context.Fp;
+#elif defined(_M_ARM)
+ StackFrame.AddrPC.Offset = Context.Pc;
+ StackFrame.AddrStack.Offset = Context.Sp;
+ StackFrame.AddrFrame.Offset = Context.R11;
#endif
StackFrame.AddrPC.Mode = AddrModeFlat;
StackFrame.AddrStack.Mode = AddrModeFlat;
@@ -556,11 +558,12 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) {
}
-/// AddSignalHandler - Add a function to be called when a signal is delivered
-/// to the process. The handler can have a cookie passed to it to identify
-/// what instance of the handler it is.
-void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
- CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie));
+/// Add a function to be called when a signal is delivered to the process. The
+/// handler can have a cookie passed to it to identify what instance of the
+/// handler it is.
+void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
+ void *Cookie) {
+ insertSignalHandler(FnPtr, Cookie);
RegisterHandler();
LeaveCriticalSection(&CriticalSection);
}
@@ -594,7 +597,7 @@ void llvm::sys::RunInterruptHandlers() {
Cleanup();
}
-/// \brief Find the Windows Registry Key for a given location.
+/// Find the Windows Registry Key for a given location.
///
/// \returns a valid HKEY if the location exists, else NULL.
static HKEY FindWERKey(const llvm::Twine &RegistryLocation) {
@@ -607,7 +610,7 @@ static HKEY FindWERKey(const llvm::Twine &RegistryLocation) {
return Key;
}
-/// \brief Populate ResultDirectory with the value for "DumpFolder" for a given
+/// Populate ResultDirectory with the value for "DumpFolder" for a given
/// Windows Registry key.
///
/// \returns true if a valid value for DumpFolder exists, false otherwise.
@@ -648,7 +651,7 @@ static bool GetDumpFolder(HKEY Key,
return true;
}
-/// \brief Populate ResultType with a valid MINIDUMP_TYPE based on the value of
+/// Populate ResultType with a valid MINIDUMP_TYPE based on the value of
/// "DumpType" for a given Windows Registry key.
///
/// According to
@@ -695,7 +698,7 @@ static bool GetDumpType(HKEY Key, MINIDUMP_TYPE &ResultType) {
return true;
}
-/// \brief Write a Windows dump file containing process information that can be
+/// Write a Windows dump file containing process information that can be
/// used for post-mortem debugging.
///
/// \returns zero error code if a mini dump created, actual error code
@@ -819,7 +822,11 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
StackFrame.AddrPC.Mode = AddrModeFlat;
StackFrame.AddrStack.Offset = ep->ContextRecord->Sp;
StackFrame.AddrStack.Mode = AddrModeFlat;
+#if defined(_M_ARM64)
StackFrame.AddrFrame.Offset = ep->ContextRecord->Fp;
+#else
+ StackFrame.AddrFrame.Offset = ep->ContextRecord->R11;
+#endif
StackFrame.AddrFrame.Mode = AddrModeFlat;
#endif
diff --git a/contrib/llvm/lib/Support/Windows/WindowsSupport.h b/contrib/llvm/lib/Support/Windows/WindowsSupport.h
index d4599dca044e..c2fd6bb982d4 100644
--- a/contrib/llvm/lib/Support/Windows/WindowsSupport.h
+++ b/contrib/llvm/lib/Support/Windows/WindowsSupport.h
@@ -247,18 +247,12 @@ inline FILETIME toFILETIME(TimePoint<> TP) {
return Time;
}
-namespace path {
-std::error_code widenPath(const Twine &Path8,
- SmallVectorImpl<wchar_t> &Path16);
-} // end namespace path
-
namespace windows {
-std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
-std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
- SmallVectorImpl<char> &utf8);
-/// Convert from UTF16 to the current code page used in the system
-std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
- SmallVectorImpl<char> &utf8);
+// Returns command line arguments. Unlike arguments given to main(),
+// this function guarantees that the returned arguments are encoded in
+// UTF-8 regardless of the current code page setting.
+std::error_code GetCommandLineArguments(SmallVectorImpl<const char *> &Args,
+ BumpPtrAllocator &Alloc);
} // end namespace windows
} // end namespace sys
} // end namespace llvm.
diff --git a/contrib/llvm/lib/Support/WithColor.cpp b/contrib/llvm/lib/Support/WithColor.cpp
new file mode 100644
index 000000000000..d2e13f0e86de
--- /dev/null
+++ b/contrib/llvm/lib/Support/WithColor.cpp
@@ -0,0 +1,90 @@
+//===- WithColor.cpp ------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+cl::OptionCategory llvm::ColorCategory("Color Options");
+
+static cl::opt<cl::boolOrDefault>
+ UseColor("color", cl::cat(ColorCategory),
+ cl::desc("Use colors in output (default=autodetect)"),
+ cl::init(cl::BOU_UNSET));
+
+bool WithColor::colorsEnabled(raw_ostream &OS) {
+ if (UseColor == cl::BOU_UNSET)
+ return OS.has_colors();
+ return UseColor == cl::BOU_TRUE;
+}
+
+WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
+ // Detect color from terminal type unless the user passed the --color option.
+ if (colorsEnabled(OS)) {
+ switch (Color) {
+ case HighlightColor::Address:
+ OS.changeColor(raw_ostream::YELLOW);
+ break;
+ case HighlightColor::String:
+ OS.changeColor(raw_ostream::GREEN);
+ break;
+ case HighlightColor::Tag:
+ OS.changeColor(raw_ostream::BLUE);
+ break;
+ case HighlightColor::Attribute:
+ OS.changeColor(raw_ostream::CYAN);
+ break;
+ case HighlightColor::Enumerator:
+ OS.changeColor(raw_ostream::MAGENTA);
+ break;
+ case HighlightColor::Macro:
+ OS.changeColor(raw_ostream::RED);
+ break;
+ case HighlightColor::Error:
+ OS.changeColor(raw_ostream::RED, true);
+ break;
+ case HighlightColor::Warning:
+ OS.changeColor(raw_ostream::MAGENTA, true);
+ break;
+ case HighlightColor::Note:
+ OS.changeColor(raw_ostream::BLACK, true);
+ break;
+ }
+ }
+}
+
+raw_ostream &WithColor::error() { return error(errs()); }
+
+raw_ostream &WithColor::warning() { return warning(errs()); }
+
+raw_ostream &WithColor::note() { return note(errs()); }
+
+raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix) {
+ if (!Prefix.empty())
+ OS << Prefix << ": ";
+ return WithColor(OS, HighlightColor::Error).get() << "error: ";
+}
+
+raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix) {
+ if (!Prefix.empty())
+ OS << Prefix << ": ";
+ return WithColor(OS, HighlightColor::Warning).get() << "warning: ";
+}
+
+raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix) {
+ if (!Prefix.empty())
+ OS << Prefix << ": ";
+ return WithColor(OS, HighlightColor::Note).get() << "note: ";
+}
+
+WithColor::~WithColor() {
+ if (colorsEnabled(OS))
+ OS.resetColor();
+}
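A brief usage sketch of the new WithColor helpers (the messages and tool prefix are invented; the calls are the ones defined above):
    // Prefixed diagnostics on errs(), colored when the stream supports it.
    llvm::WithColor::error(llvm::errs(), "llvm-foo") << "cannot open input file\n";
    llvm::WithColor::warning() << "section is empty\n";
    // Ad-hoc highlighting; the color is reset when the WithColor object is destroyed.
    llvm::WithColor(llvm::outs(), llvm::HighlightColor::Address).get() << "0x4004d0\n";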
diff --git a/contrib/llvm/lib/Support/YAMLParser.cpp b/contrib/llvm/lib/Support/YAMLParser.cpp
index e2f21a56a810..354b7d0740de 100644
--- a/contrib/llvm/lib/Support/YAMLParser.cpp
+++ b/contrib/llvm/lib/Support/YAMLParser.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -167,7 +168,7 @@ using TokenQueueT = BumpPtrList<Token>;
namespace {
-/// @brief This struct is used to track simple keys.
+/// This struct is used to track simple keys.
///
/// Simple keys are handled by creating an entry in SimpleKeys for each Token
/// which could legally be the start of a simple key. When peekNext is called,
@@ -190,7 +191,7 @@ struct SimpleKey {
} // end anonymous namespace
-/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
+/// The Unicode scalar value of a UTF-8 minimal well-formed code unit
/// subsequence and the subsequence's length in code units (uint8_t).
/// A length of 0 represents an error.
using UTF8Decoded = std::pair<uint32_t, unsigned>;
@@ -248,7 +249,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
namespace llvm {
namespace yaml {
-/// @brief Scans YAML tokens from a MemoryBuffer.
+/// Scans YAML tokens from a MemoryBuffer.
class Scanner {
public:
Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
@@ -256,10 +257,10 @@ public:
Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
std::error_code *EC = nullptr);
- /// @brief Parse the next token and return it without popping it.
+ /// Parse the next token and return it without popping it.
Token &peekNext();
- /// @brief Parse the next token and pop it from the queue.
+ /// Parse the next token and pop it from the queue.
Token getNext();
void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
@@ -286,7 +287,7 @@ public:
setError(Message, Current);
}
- /// @brief Returns true if an error occurred while parsing.
+ /// Returns true if an error occurred while parsing.
bool failed() {
return Failed;
}
@@ -298,7 +299,7 @@ private:
return StringRef(Current, End - Current);
}
- /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
+ /// Decode a UTF-8 minimal well-formed code unit subsequence starting
/// at \a Position.
///
/// If the UTF-8 code units starting at Position do not form a well-formed
@@ -328,7 +329,7 @@ private:
// l-
// A production matching complete line(s).
- /// @brief Skip a single nb-char[27] starting at Position.
+ /// Skip a single nb-char[27] starting at Position.
///
/// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
/// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
@@ -337,7 +338,7 @@ private:
/// nb-char.
StringRef::iterator skip_nb_char(StringRef::iterator Position);
- /// @brief Skip a single b-break[28] starting at Position.
+ /// Skip a single b-break[28] starting at Position.
///
/// A b-break is 0xD 0xA | 0xD | 0xA
///
@@ -353,7 +354,7 @@ private:
/// s-space.
StringRef::iterator skip_s_space(StringRef::iterator Position);
- /// @brief Skip a single s-white[33] starting at Position.
+ /// Skip a single s-white[33] starting at Position.
///
/// A s-white is 0x20 | 0x9
///
@@ -361,7 +362,7 @@ private:
/// s-white.
StringRef::iterator skip_s_white(StringRef::iterator Position);
- /// @brief Skip a single ns-char[34] starting at Position.
+ /// Skip a single ns-char[34] starting at Position.
///
/// A ns-char is nb-char - s-white
///
@@ -371,7 +372,7 @@ private:
using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
- /// @brief Skip minimal well-formed code unit subsequences until Func
+ /// Skip minimal well-formed code unit subsequences until Func
/// returns its input.
///
/// @returns The code unit after the last minimal well-formed code unit
@@ -383,20 +384,20 @@ private:
/// input.
void advanceWhile(SkipWhileFunc Func);
- /// @brief Scan ns-uri-char[39]s starting at Cur.
+ /// Scan ns-uri-char[39]s starting at Cur.
///
/// This updates Cur and Column while scanning.
void scan_ns_uri_char();
- /// @brief Consume a minimal well-formed code unit subsequence starting at
+ /// Consume a minimal well-formed code unit subsequence starting at
/// \a Cur. Return false if it is not the same Unicode scalar value as
/// \a Expected. This updates \a Column.
bool consume(uint32_t Expected);
- /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
+ /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
void skip(uint32_t Distance);
- /// @brief Return true if the minimal well-formed code unit subsequence at
+ /// Return true if the minimal well-formed code unit subsequence at
/// Pos is whitespace or a new line
bool isBlankOrBreak(StringRef::iterator Position);
@@ -405,77 +406,77 @@ private:
/// Return false if the code unit at the current position isn't a line break.
bool consumeLineBreakIfPresent();
- /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
+ /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
, unsigned AtColumn
, bool IsRequired);
- /// @brief Remove simple keys that can no longer be valid simple keys.
+ /// Remove simple keys that can no longer be valid simple keys.
///
/// Invalid simple keys are not on the current line or are further than 1024
/// columns back.
void removeStaleSimpleKeyCandidates();
- /// @brief Remove all simple keys on FlowLevel \a Level.
+ /// Remove all simple keys on FlowLevel \a Level.
void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
- /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
+ /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
/// tokens if needed.
bool unrollIndent(int ToColumn);
- /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
+ /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
/// if needed.
bool rollIndent( int ToColumn
, Token::TokenKind Kind
, TokenQueueT::iterator InsertPoint);
- /// @brief Skip a single-line comment when the comment starts at the current
+ /// Skip a single-line comment when the comment starts at the current
/// position of the scanner.
void skipComment();
- /// @brief Skip whitespace and comments until the start of the next token.
+ /// Skip whitespace and comments until the start of the next token.
void scanToNextToken();
- /// @brief Must be the first token generated.
+ /// Must be the first token generated.
bool scanStreamStart();
- /// @brief Generate tokens needed to close out the stream.
+ /// Generate tokens needed to close out the stream.
bool scanStreamEnd();
- /// @brief Scan a %BLAH directive.
+ /// Scan a %BLAH directive.
bool scanDirective();
- /// @brief Scan a ... or ---.
+ /// Scan a ... or ---.
bool scanDocumentIndicator(bool IsStart);
- /// @brief Scan a [ or { and generate the proper flow collection start token.
+ /// Scan a [ or { and generate the proper flow collection start token.
bool scanFlowCollectionStart(bool IsSequence);
- /// @brief Scan a ] or } and generate the proper flow collection end token.
+ /// Scan a ] or } and generate the proper flow collection end token.
bool scanFlowCollectionEnd(bool IsSequence);
- /// @brief Scan the , that separates entries in a flow collection.
+ /// Scan the , that separates entries in a flow collection.
bool scanFlowEntry();
- /// @brief Scan the - that starts block sequence entries.
+ /// Scan the - that starts block sequence entries.
bool scanBlockEntry();
- /// @brief Scan an explicit ? indicating a key.
+ /// Scan an explicit ? indicating a key.
bool scanKey();
- /// @brief Scan an explicit : indicating a value.
+ /// Scan an explicit : indicating a value.
bool scanValue();
- /// @brief Scan a quoted scalar.
+ /// Scan a quoted scalar.
bool scanFlowScalar(bool IsDoubleQuoted);
- /// @brief Scan an unquoted scalar.
+ /// Scan an unquoted scalar.
bool scanPlainScalar();
- /// @brief Scan an Alias or Anchor starting with * or &.
+ /// Scan an Alias or Anchor starting with * or &.
bool scanAliasOrAnchor(bool IsAlias);
- /// @brief Scan a block scalar starting with | or >.
+ /// Scan a block scalar starting with | or >.
bool scanBlockScalar(bool IsLiteral);
/// Scan a chomping indicator in a block scalar header.
@@ -502,57 +503,57 @@ private:
bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
bool &IsDone);
- /// @brief Scan a tag of the form !stuff.
+ /// Scan a tag of the form !stuff.
bool scanTag();
- /// @brief Dispatch to the next scanning function based on \a *Cur.
+ /// Dispatch to the next scanning function based on \a *Cur.
bool fetchMoreTokens();
- /// @brief The SourceMgr used for diagnostics and buffer management.
+ /// The SourceMgr used for diagnostics and buffer management.
SourceMgr &SM;
- /// @brief The original input.
+ /// The original input.
MemoryBufferRef InputBuffer;
- /// @brief The current position of the scanner.
+ /// The current position of the scanner.
StringRef::iterator Current;
- /// @brief The end of the input (one past the last character).
+ /// The end of the input (one past the last character).
StringRef::iterator End;
- /// @brief Current YAML indentation level in spaces.
+ /// Current YAML indentation level in spaces.
int Indent;
- /// @brief Current column number in Unicode code points.
+ /// Current column number in Unicode code points.
unsigned Column;
- /// @brief Current line number.
+ /// Current line number.
unsigned Line;
- /// @brief How deep we are in flow style containers. 0 Means at block level.
+ /// How deep we are in flow style containers. 0 Means at block level.
unsigned FlowLevel;
- /// @brief Are we at the start of the stream?
+ /// Are we at the start of the stream?
bool IsStartOfStream;
- /// @brief Can the next token be the start of a simple key?
+ /// Can the next token be the start of a simple key?
bool IsSimpleKeyAllowed;
- /// @brief True if an error has occurred.
+ /// True if an error has occurred.
bool Failed;
- /// @brief Should colors be used when printing out the diagnostic messages?
+ /// Should colors be used when printing out the diagnostic messages?
bool ShowColors;
- /// @brief Queue of tokens. This is required to queue up tokens while looking
+ /// Queue of tokens. This is required to queue up tokens while looking
/// for the end of a simple key. And for cases where a single character
/// can produce multiple tokens (e.g. BlockEnd).
TokenQueueT TokenQueue;
- /// @brief Indentation levels.
+ /// Indentation levels.
SmallVector<int, 4> Indents;
- /// @brief Potential simple keys.
+ /// Potential simple keys.
SmallVector<SimpleKey, 4> SimpleKeys;
std::error_code *EC;
@@ -687,7 +688,7 @@ bool yaml::scanTokens(StringRef Input) {
return true;
}
-std::string yaml::escape(StringRef Input) {
+std::string yaml::escape(StringRef Input, bool EscapePrintable) {
std::string EscapedInput;
for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
if (*i == '\\')
@@ -734,6 +735,9 @@ std::string yaml::escape(StringRef Input) {
EscapedInput += "\\L";
else if (UnicodeScalarValue.first == 0x2029)
EscapedInput += "\\P";
+ else if (!EscapePrintable &&
+ sys::unicode::isPrintable(UnicodeScalarValue.first))
+ EscapedInput += StringRef(i, UnicodeScalarValue.second);
else {
std::string HexStr = utohexstr(UnicodeScalarValue.first);
if (HexStr.size() <= 2)
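A hedged illustration of the new EscapePrintable flag (the input string is made up; the behavior follows the branch added above):
    // U+00E9 is printable: with EscapePrintable=true it is hex-escaped as before,
    // with EscapePrintable=false it is now copied through verbatim.
    std::string Escaped  = llvm::yaml::escape("caf\xc3\xa9", /*EscapePrintable=*/true);
    std::string Verbatim = llvm::yaml::escape("caf\xc3\xa9", /*EscapePrintable=*/false);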
diff --git a/contrib/llvm/lib/Support/YAMLTraits.cpp b/contrib/llvm/lib/Support/YAMLTraits.cpp
index f8a80ba87873..d6345efd00cd 100644
--- a/contrib/llvm/lib/Support/YAMLTraits.cpp
+++ b/contrib/llvm/lib/Support/YAMLTraits.cpp
@@ -638,39 +638,22 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
const char *Base = S.data();
const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
- const char QuoteChar = MustQuote == QuotingType::Single ? '\'' : '"';
-
output(Quote); // Starting quote.
- // When using single-quoted strings, any single quote ' must be doubled to be
- // escaped.
- // When using double-quoted strings, print \x + hex for non-printable ASCII
- // characters, and escape double quotes.
- while (j < End) {
- if (S[j] == QuoteChar) { // Escape quotes.
- output(StringRef(&Base[i], j - i)); // "flush".
- if (MustQuote == QuotingType::Double) { // Print it as \"
- output(StringLiteral("\\"));
- output(StringRef(Quote, 1));
- } else { // Single
- output(StringLiteral("''")); // Print it as ''
- }
- i = j + 1;
- } else if (MustQuote == QuotingType::Double &&
- !sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) {
- // If we're double quoting non-printable characters, we prefer printing
- // them as "\x" + their hex representation. Note that special casing is
- // needed for UTF-8, where a byte may be part of a UTF-8 sequence and
- // appear as non-printable, in which case we want to print the correct
- // unicode character and not its hex representation.
- output(StringRef(&Base[i], j - i)); // "flush"
- output(StringLiteral("\\x"));
-
- // Output the byte 0x0F as \x0f.
- auto FormattedHex = format_hex_no_prefix(S[j], 2);
- Out << FormattedHex;
- Column += 4; // one for the '\', one for the 'x', and two for the hex
+ // When using double-quoted strings (and only in that case), non-printable characters may be
+ // present, and will be escaped using a variety of unicode-scalar and special short-form
+ // escapes. This is handled in yaml::escape.
+ if (MustQuote == QuotingType::Double) {
+ output(yaml::escape(Base, /* EscapePrintable= */ false));
+ this->outputUpToEndOfLine(Quote);
+ return;
+ }
+ // When using single-quoted strings, any single quote ' must be doubled to be escaped.
+ while (j < End) {
+ if (S[j] == '\'') { // Escape quotes.
+ output(StringRef(&Base[i], j - i)); // "flush".
+ output(StringLiteral("''")); // Print it as ''
i = j + 1;
}
++j;
diff --git a/contrib/llvm/lib/Support/circular_raw_ostream.cpp b/contrib/llvm/lib/Support/circular_raw_ostream.cpp
index ca0d30db388c..e768f17cd00d 100644
--- a/contrib/llvm/lib/Support/circular_raw_ostream.cpp
+++ b/contrib/llvm/lib/Support/circular_raw_ostream.cpp
@@ -33,7 +33,7 @@ void circular_raw_ostream::write_impl(const char *Ptr, size_t Size) {
Cur = BufferArray;
Filled = true;
}
- }
+ }
}
void circular_raw_ostream::flushBufferWithBanner() {
diff --git a/contrib/llvm/lib/Support/raw_ostream.cpp b/contrib/llvm/lib/Support/raw_ostream.cpp
index e02611103080..038ad00bd608 100644
--- a/contrib/llvm/lib/Support/raw_ostream.cpp
+++ b/contrib/llvm/lib/Support/raw_ostream.cpp
@@ -41,9 +41,6 @@
#if defined(HAVE_UNISTD_H)
# include <unistd.h>
#endif
-#if defined(HAVE_SYS_UIO_H) && defined(HAVE_WRITEV)
-# include <sys/uio.h>
-#endif
#if defined(__CYGWIN__)
#include <io.h>
@@ -62,7 +59,7 @@
#endif
#endif
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
#include "Windows/WindowsSupport.h"
#endif
@@ -78,9 +75,6 @@ raw_ostream::~raw_ostream() {
delete [] OutBufStart;
}
-// An out of line virtual method to provide a home for the class vtable.
-void raw_ostream::handle() {}
-
size_t raw_ostream::preferred_buffer_size() const {
// BUFSIZ is intended to be a reasonable default.
return BUFSIZ;
@@ -166,7 +160,7 @@ raw_ostream &raw_ostream::write_escaped(StringRef Str,
*this << '\\' << '"';
break;
default:
- if (std::isprint(c)) {
+ if (isPrint(c)) {
*this << c;
break;
}
@@ -442,7 +436,7 @@ raw_ostream &raw_ostream::operator<<(const FormattedBytes &FB) {
// Print the ASCII char values for each byte on this line
for (uint8_t Byte : Line) {
- if (isprint(Byte))
+ if (isPrint(Byte))
*this << static_cast<char>(Byte);
else
*this << '.';
@@ -458,25 +452,39 @@ raw_ostream &raw_ostream::operator<<(const FormattedBytes &FB) {
return *this;
}
-/// indent - Insert 'NumSpaces' spaces.
-raw_ostream &raw_ostream::indent(unsigned NumSpaces) {
- static const char Spaces[] = " "
- " "
- " ";
+template <char C>
+static raw_ostream &write_padding(raw_ostream &OS, unsigned NumChars) {
+ static const char Chars[] = {C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
+ C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
+ C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
+ C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
+ C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C};
// Usually the indentation is small, handle it with a fastpath.
- if (NumSpaces < array_lengthof(Spaces))
- return write(Spaces, NumSpaces);
-
- while (NumSpaces) {
- unsigned NumToWrite = std::min(NumSpaces,
- (unsigned)array_lengthof(Spaces)-1);
- write(Spaces, NumToWrite);
- NumSpaces -= NumToWrite;
+ if (NumChars < array_lengthof(Chars))
+ return OS.write(Chars, NumChars);
+
+ while (NumChars) {
+ unsigned NumToWrite = std::min(NumChars,
+ (unsigned)array_lengthof(Chars)-1);
+ OS.write(Chars, NumToWrite);
+ NumChars -= NumToWrite;
}
- return *this;
+ return OS;
+}
+
+/// indent - Insert 'NumSpaces' spaces.
+raw_ostream &raw_ostream::indent(unsigned NumSpaces) {
+ return write_padding<' '>(*this, NumSpaces);
}
+/// write_zeros - Insert 'NumZeros' nulls.
+raw_ostream &raw_ostream::write_zeros(unsigned NumZeros) {
+ return write_padding<'\0'>(*this, NumZeros);
+}
+
+void raw_ostream::anchor() {}
+
//===----------------------------------------------------------------------===//
// Formatted Output
//===----------------------------------------------------------------------===//
@@ -490,29 +498,56 @@ void format_object_base::home() {
//===----------------------------------------------------------------------===//
static int getFD(StringRef Filename, std::error_code &EC,
+ sys::fs::CreationDisposition Disp, sys::fs::FileAccess Access,
sys::fs::OpenFlags Flags) {
+ assert((Access & sys::fs::FA_Write) &&
+ "Cannot make a raw_ostream from a read-only descriptor!");
+
// Handle "-" as stdout. Note that when we do this, we consider ourself
// the owner of stdout and may set the "binary" flag globally based on Flags.
if (Filename == "-") {
EC = std::error_code();
// If user requested binary then put stdout into binary mode if
// possible.
- if (!(Flags & sys::fs::F_Text))
+ if (!(Flags & sys::fs::OF_Text))
sys::ChangeStdoutToBinary();
return STDOUT_FILENO;
}
int FD;
- EC = sys::fs::openFileForWrite(Filename, FD, Flags);
+ if (Access & sys::fs::FA_Read)
+ EC = sys::fs::openFileForReadWrite(Filename, FD, Disp, Flags);
+ else
+ EC = sys::fs::openFileForWrite(Filename, FD, Disp, Flags);
if (EC)
return -1;
return FD;
}
+raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC)
+ : raw_fd_ostream(Filename, EC, sys::fs::CD_CreateAlways, sys::fs::FA_Write,
+ sys::fs::OF_None) {}
+
+raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::CreationDisposition Disp)
+ : raw_fd_ostream(Filename, EC, Disp, sys::fs::FA_Write, sys::fs::OF_None) {}
+
+raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::FileAccess Access)
+ : raw_fd_ostream(Filename, EC, sys::fs::CD_CreateAlways, Access,
+ sys::fs::OF_None) {}
+
+raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::OpenFlags Flags)
+ : raw_fd_ostream(Filename, EC, sys::fs::CD_CreateAlways, sys::fs::FA_Write,
+ Flags) {}
+
raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::CreationDisposition Disp,
+ sys::fs::FileAccess Access,
sys::fs::OpenFlags Flags)
- : raw_fd_ostream(getFD(Filename, EC, Flags), true) {}
+ : raw_fd_ostream(getFD(Filename, EC, Disp, Access, Flags), true) {}
/// FD is the file descriptor that this writes to. If ShouldClose is true, this
/// closes the file when the stream is destroyed.
@@ -534,7 +569,7 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
// Get the starting position.
off_t loc = ::lseek(FD, 0, SEEK_CUR);
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
// MSVCRT's _lseek(SEEK_CUR) doesn't return -1 for pipes.
sys::fs::file_status Status;
std::error_code EC = status(FD, Status);
@@ -587,7 +622,7 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
// It is observed that Linux returns EINVAL for a very large write (>2G).
// Make it a reasonably small value.
MaxWriteSize = 1024 * 1024 * 1024;
-#elif defined(LLVM_ON_WIN32)
+#elif defined(_WIN32)
// Writing a large size of output to Windows console returns ENOMEM. It seems
// that, prior to Windows 8, WriteFile() is redirecting to WriteConsole(), and
// the latter has a size limit (66000 bytes or less, depending on heap usage).
@@ -640,7 +675,7 @@ void raw_fd_ostream::close() {
uint64_t raw_fd_ostream::seek(uint64_t off) {
assert(SupportsSeeking && "Stream does not support seeking!");
flush();
-#ifdef LLVM_ON_WIN32
+#ifdef _WIN32
pos = ::_lseeki64(FD, off, SEEK_SET);
#elif defined(HAVE_LSEEK64)
pos = ::lseek64(FD, off, SEEK_SET);
@@ -730,6 +765,8 @@ bool raw_fd_ostream::has_colors() const {
return sys::Process::FileDescriptorHasColors(FD);
}
+void raw_fd_ostream::anchor() {}
+
//===----------------------------------------------------------------------===//
// outs(), errs(), nulls()
//===----------------------------------------------------------------------===//
@@ -807,3 +844,5 @@ uint64_t raw_null_ostream::current_pos() const {
void raw_null_ostream::pwrite_impl(const char *Ptr, size_t Size,
uint64_t Offset) {}
+
+void raw_pwrite_stream::anchor() {}
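To recap the reworked raw_fd_ostream surface informally (the file name is a placeholder; the enums and helpers are the ones introduced above):
    std::error_code EC;
    // Same effect as raw_fd_ostream("output.txt", EC): create or truncate the
    // file and open it write-only.
    llvm::raw_fd_ostream OS("output.txt", EC, llvm::sys::fs::CD_CreateAlways,
                            llvm::sys::fs::FA_Write, llvm::sys::fs::OF_None);
    if (!EC) {
      OS.indent(4) << "padded line\n"; // spaces come from the shared write_padding<' '>
      OS.write_zeros(16);              // new helper: emit 16 NUL bytes
    }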
diff --git a/contrib/llvm/lib/Support/regcomp.c b/contrib/llvm/lib/Support/regcomp.c
index 354e359f676b..12669ab75d1a 100644
--- a/contrib/llvm/lib/Support/regcomp.c
+++ b/contrib/llvm/lib/Support/regcomp.c
@@ -36,6 +36,7 @@
*/
#include <sys/types.h>
+#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
@@ -47,12 +48,6 @@
#include "regex2.h"
#include "llvm/Config/config.h"
-#if HAVE_STDINT_H
-#include <stdint.h>
-#else
-/* Pessimistically bound memory use */
-#define SIZE_MAX UINT_MAX
-#endif
/* character-class table */
static struct cclass {
diff --git a/contrib/llvm/lib/Support/regengine.inc b/contrib/llvm/lib/Support/regengine.inc
index 62d8c267f22f..41787aff1242 100644
--- a/contrib/llvm/lib/Support/regengine.inc
+++ b/contrib/llvm/lib/Support/regengine.inc
@@ -1013,7 +1013,7 @@ pchar(int ch)
{
static char pbuf[10];
- if (isprint(ch) || ch == ' ')
+ if (isPrint(ch) || ch == ' ')
(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
else
(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
diff --git a/contrib/llvm/lib/Support/xxhash.cpp b/contrib/llvm/lib/Support/xxhash.cpp
index a7d990bf6a4b..df643f9bd639 100644
--- a/contrib/llvm/lib/Support/xxhash.cpp
+++ b/contrib/llvm/lib/Support/xxhash.cpp
@@ -71,12 +71,12 @@ static uint64_t mergeRound(uint64_t Acc, uint64_t Val) {
uint64_t llvm::xxHash64(StringRef Data) {
size_t Len = Data.size();
uint64_t Seed = 0;
- const char *P = Data.data();
- const char *const BEnd = P + Len;
+ const unsigned char *P = Data.bytes_begin();
+ const unsigned char *const BEnd = Data.bytes_end();
uint64_t H64;
if (Len >= 32) {
- const char *const Limit = BEnd - 32;
+ const unsigned char *const Limit = BEnd - 32;
uint64_t V1 = Seed + PRIME64_1 + PRIME64_2;
uint64_t V2 = Seed + PRIME64_2;
uint64_t V3 = Seed + 0;
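The hasher's entry point is untouched by this signedness cleanup; a trivial call still looks like the following (header path assumed from the usual LLVM layout):
    #include "llvm/Support/xxhash.h"
    uint64_t Hash = llvm::xxHash64(llvm::StringRef("some bytes"));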
diff --git a/contrib/llvm/lib/TableGen/Error.cpp b/contrib/llvm/lib/TableGen/Error.cpp
index b4830178a269..e6171c71efc0 100644
--- a/contrib/llvm/lib/TableGen/Error.cpp
+++ b/contrib/llvm/lib/TableGen/Error.cpp
@@ -15,6 +15,7 @@
#include "llvm/TableGen/Error.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Signals.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdlib>
@@ -51,9 +52,7 @@ void PrintWarning(const char *Loc, const Twine &Msg) {
SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Warning, Msg);
}
-void PrintWarning(const Twine &Msg) {
- errs() << "warning:" << Msg << "\n";
-}
+void PrintWarning(const Twine &Msg) { WithColor::warning() << Msg << "\n"; }
void PrintError(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg) {
PrintMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
@@ -63,9 +62,7 @@ void PrintError(const char *Loc, const Twine &Msg) {
SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Error, Msg);
}
-void PrintError(const Twine &Msg) {
- errs() << "error:" << Msg << "\n";
-}
+void PrintError(const Twine &Msg) { WithColor::error() << Msg << "\n"; }
void PrintFatalError(const Twine &Msg) {
PrintError(Msg);
diff --git a/contrib/llvm/lib/TableGen/JSONBackend.cpp b/contrib/llvm/lib/TableGen/JSONBackend.cpp
new file mode 100644
index 000000000000..36cb2208a294
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/JSONBackend.cpp
@@ -0,0 +1,189 @@
+//===- JSONBackend.cpp - Generate a JSON dump of all records. -*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This TableGen back end generates a machine-readable representation
+// of all the classes and records defined by the input, in JSON format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include "llvm/Support/JSON.h"
+
+#define DEBUG_TYPE "json-emitter"
+
+using namespace llvm;
+
+namespace {
+
+class JSONEmitter {
+private:
+ RecordKeeper &Records;
+
+ json::Value translateInit(const Init &I);
+ json::Array listSuperclasses(const Record &R);
+
+public:
+ JSONEmitter(RecordKeeper &R);
+
+ void run(raw_ostream &OS);
+};
+
+} // end anonymous namespace
+
+JSONEmitter::JSONEmitter(RecordKeeper &R) : Records(R) {}
+
+json::Value JSONEmitter::translateInit(const Init &I) {
+
+ // Init subclasses that we return as JSON primitive values of one
+ // kind or another.
+
+ if (isa<UnsetInit>(&I)) {
+ return nullptr;
+ } else if (auto *Bit = dyn_cast<BitInit>(&I)) {
+ return Bit->getValue() ? 1 : 0;
+ } else if (auto *Bits = dyn_cast<BitsInit>(&I)) {
+ json::Array array;
+ for (unsigned i = 0, limit = Bits->getNumBits(); i < limit; i++)
+ array.push_back(translateInit(*Bits->getBit(i)));
+ return std::move(array);
+ } else if (auto *Int = dyn_cast<IntInit>(&I)) {
+ return Int->getValue();
+ } else if (auto *Str = dyn_cast<StringInit>(&I)) {
+ return Str->getValue();
+ } else if (auto *Code = dyn_cast<CodeInit>(&I)) {
+ return Code->getValue();
+ } else if (auto *List = dyn_cast<ListInit>(&I)) {
+ json::Array array;
+ for (auto val : *List)
+ array.push_back(translateInit(*val));
+ return std::move(array);
+ }
+
+ // Init subclasses that we return as JSON objects containing a
+ // 'kind' discriminator. For these, we also provide the same
+ // translation back into TableGen input syntax that -print-records
+ // would give.
+
+ json::Object obj;
+ obj["printable"] = I.getAsString();
+
+ if (auto *Def = dyn_cast<DefInit>(&I)) {
+ obj["kind"] = "def";
+ obj["def"] = Def->getDef()->getName();
+ return std::move(obj);
+ } else if (auto *Var = dyn_cast<VarInit>(&I)) {
+ obj["kind"] = "var";
+ obj["var"] = Var->getName();
+ return std::move(obj);
+ } else if (auto *VarBit = dyn_cast<VarBitInit>(&I)) {
+ if (auto *Var = dyn_cast<VarInit>(VarBit->getBitVar())) {
+ obj["kind"] = "varbit";
+ obj["var"] = Var->getName();
+ obj["index"] = VarBit->getBitNum();
+ return std::move(obj);
+ }
+ } else if (auto *Dag = dyn_cast<DagInit>(&I)) {
+ obj["kind"] = "dag";
+ obj["operator"] = translateInit(*Dag->getOperator());
+ if (auto name = Dag->getName())
+ obj["name"] = name->getAsUnquotedString();
+ json::Array args;
+ for (unsigned i = 0, limit = Dag->getNumArgs(); i < limit; ++i) {
+ json::Array arg;
+ arg.push_back(translateInit(*Dag->getArg(i)));
+ if (auto argname = Dag->getArgName(i))
+ arg.push_back(argname->getAsUnquotedString());
+ else
+ arg.push_back(nullptr);
+ args.push_back(std::move(arg));
+ }
+ obj["args"] = std::move(args);
+ return std::move(obj);
+ }
+
+ // Final fallback: anything that gets past here is simply given a
+ // kind field of 'complex', and the only other field is the standard
+ // 'printable' representation.
+
+ assert(!I.isConcrete());
+ obj["kind"] = "complex";
+ return std::move(obj);
+}
+
+void JSONEmitter::run(raw_ostream &OS) {
+ json::Object root;
+
+ root["!tablegen_json_version"] = 1;
+
+ // Prepare the arrays that will list the instances of every class.
+ // We mostly fill those in by iterating over the superclasses of
+ // each def, but we also want to ensure we store an empty list for a
+ // class with no instances at all, so we do a preliminary iteration
+ // over the classes, invoking std::map::operator[] to default-
+ // construct the array for each one.
+ std::map<std::string, json::Array> instance_lists;
+ for (const auto &C : Records.getClasses()) {
+ auto &Name = C.second->getNameInitAsString();
+ (void)instance_lists[Name];
+ }
+
+ // Main iteration over the defs.
+ for (const auto &D : Records.getDefs()) {
+ auto &Name = D.second->getNameInitAsString();
+ auto &Def = *D.second;
+
+ json::Object obj;
+ json::Array fields;
+
+ for (const RecordVal &RV : Def.getValues()) {
+ if (!Def.isTemplateArg(RV.getNameInit())) {
+ auto Name = RV.getNameInitAsString();
+ if (RV.getPrefix())
+ fields.push_back(Name);
+ obj[Name] = translateInit(*RV.getValue());
+ }
+ }
+
+ obj["!fields"] = std::move(fields);
+
+ json::Array superclasses;
+ for (const auto &SuperPair : Def.getSuperClasses())
+ superclasses.push_back(SuperPair.first->getNameInitAsString());
+ obj["!superclasses"] = std::move(superclasses);
+
+ obj["!name"] = Name;
+ obj["!anonymous"] = Def.isAnonymous();
+
+ root[Name] = std::move(obj);
+
+ // Add this def to the instance list for each of its superclasses.
+ for (const auto &SuperPair : Def.getSuperClasses()) {
+ auto SuperName = SuperPair.first->getNameInitAsString();
+ instance_lists[SuperName].push_back(Name);
+ }
+ }
+
+ // Make a JSON object from the std::map of instance lists.
+ json::Object instanceof;
+ for (auto kv: instance_lists)
+ instanceof[kv.first] = std::move(kv.second);
+ root["!instanceof"] = std::move(instanceof);
+
+ // Done. Write the output.
+ OS << json::Value(std::move(root)) << "\n";
+}
+
+namespace llvm {
+
+void EmitJSON(RecordKeeper &RK, raw_ostream &OS) { JSONEmitter(RK).run(OS); }
+} // end namespace llvm
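As a usage sketch, a TableGen driver would reach the new backend roughly like this (the callback wiring is assumed; only EmitJSON is declared above):
    // Dump every record of the parsed input as one JSON object.
    bool EmitJSONCallback(llvm::raw_ostream &OS, llvm::RecordKeeper &Records) {
      llvm::EmitJSON(Records, OS); // writes {"!tablegen_json_version":1, ...}
      return false;                // false signals success to TableGenMain
    }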
diff --git a/contrib/llvm/lib/TableGen/Main.cpp b/contrib/llvm/lib/TableGen/Main.cpp
index be35f894cccd..3a0701626089 100644
--- a/contrib/llvm/lib/TableGen/Main.cpp
+++ b/contrib/llvm/lib/TableGen/Main.cpp
@@ -52,7 +52,7 @@ static int reportError(const char *ProgName, Twine Msg) {
return 1;
}
-/// \brief Create a dependency file for `-d` option.
+/// Create a dependency file for `-d` option.
///
/// This functionality is really only for the benefit of the build system.
/// It is similar to GCC's `-M*` family of options.
diff --git a/contrib/llvm/lib/TableGen/Record.cpp b/contrib/llvm/lib/TableGen/Record.cpp
index 2c5b745433b8..43d178caef30 100644
--- a/contrib/llvm/lib/TableGen/Record.cpp
+++ b/contrib/llvm/lib/TableGen/Record.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
@@ -63,6 +64,8 @@ bool RecTy::typeIsConvertibleTo(const RecTy *RHS) const {
return Kind == RHS->getRecTyKind();
}
+bool RecTy::typeIsA(const RecTy *RHS) const { return this == RHS; }
+
bool BitRecTy::typeIsConvertibleTo(const RecTy *RHS) const{
if (RecTy::typeIsConvertibleTo(RHS) || RHS->getRecTyKind() == IntRecTyKind)
return true;
@@ -92,15 +95,31 @@ bool BitsRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
return (kind == BitRecTyKind && Size == 1) || (kind == IntRecTyKind);
}
+bool BitsRecTy::typeIsA(const RecTy *RHS) const {
+ if (const BitsRecTy *RHSb = dyn_cast<BitsRecTy>(RHS))
+ return RHSb->Size == Size;
+ return false;
+}
+
bool IntRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
RecTyKind kind = RHS->getRecTyKind();
return kind==BitRecTyKind || kind==BitsRecTyKind || kind==IntRecTyKind;
}
+bool CodeRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
+ RecTyKind Kind = RHS->getRecTyKind();
+ return Kind == CodeRecTyKind || Kind == StringRecTyKind;
+}
+
std::string StringRecTy::getAsString() const {
return "string";
}
+bool StringRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
+ RecTyKind Kind = RHS->getRecTyKind();
+ return Kind == StringRecTyKind || Kind == CodeRecTyKind;
+}
+
std::string ListRecTy::getAsString() const {
return "list<" + Ty->getAsString() + ">";
}
@@ -111,58 +130,152 @@ bool ListRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
return false;
}
+bool ListRecTy::typeIsA(const RecTy *RHS) const {
+ if (const ListRecTy *RHSl = dyn_cast<ListRecTy>(RHS))
+ return getElementType()->typeIsA(RHSl->getElementType());
+ return false;
+}
+
std::string DagRecTy::getAsString() const {
return "dag";
}
-RecordRecTy *RecordRecTy::get(Record *R) {
- return dyn_cast<RecordRecTy>(R->getDefInit()->getType());
+static void ProfileRecordRecTy(FoldingSetNodeID &ID,
+ ArrayRef<Record *> Classes) {
+ ID.AddInteger(Classes.size());
+ for (Record *R : Classes)
+ ID.AddPointer(R);
+}
+
+RecordRecTy *RecordRecTy::get(ArrayRef<Record *> UnsortedClasses) {
+ if (UnsortedClasses.empty()) {
+ static RecordRecTy AnyRecord(0);
+ return &AnyRecord;
+ }
+
+ FoldingSet<RecordRecTy> &ThePool =
+ UnsortedClasses[0]->getRecords().RecordTypePool;
+
+ SmallVector<Record *, 4> Classes(UnsortedClasses.begin(),
+ UnsortedClasses.end());
+ llvm::sort(Classes.begin(), Classes.end(),
+ [](Record *LHS, Record *RHS) {
+ return LHS->getNameInitAsString() < RHS->getNameInitAsString();
+ });
+
+ FoldingSetNodeID ID;
+ ProfileRecordRecTy(ID, Classes);
+
+ void *IP = nullptr;
+ if (RecordRecTy *Ty = ThePool.FindNodeOrInsertPos(ID, IP))
+ return Ty;
+
+#ifndef NDEBUG
+ // Check for redundancy.
+ for (unsigned i = 0; i < Classes.size(); ++i) {
+ for (unsigned j = 0; j < Classes.size(); ++j) {
+ assert(i == j || !Classes[i]->isSubClassOf(Classes[j]));
+ }
+ assert(&Classes[0]->getRecords() == &Classes[i]->getRecords());
+ }
+#endif
+
+ void *Mem = Allocator.Allocate(totalSizeToAlloc<Record *>(Classes.size()),
+ alignof(RecordRecTy));
+ RecordRecTy *Ty = new(Mem) RecordRecTy(Classes.size());
+ std::uninitialized_copy(Classes.begin(), Classes.end(),
+ Ty->getTrailingObjects<Record *>());
+ ThePool.InsertNode(Ty, IP);
+ return Ty;
+}
+
+void RecordRecTy::Profile(FoldingSetNodeID &ID) const {
+ ProfileRecordRecTy(ID, getClasses());
}
std::string RecordRecTy::getAsString() const {
- return Rec->getName();
+ if (NumClasses == 1)
+ return getClasses()[0]->getNameInitAsString();
+
+ std::string Str = "{";
+ bool First = true;
+ for (Record *R : getClasses()) {
+ if (!First)
+ Str += ", ";
+ First = false;
+ Str += R->getNameInitAsString();
+ }
+ Str += "}";
+ return Str;
+}
+
+bool RecordRecTy::isSubClassOf(Record *Class) const {
+ return llvm::any_of(getClasses(), [Class](Record *MySuperClass) {
+ return MySuperClass == Class ||
+ MySuperClass->isSubClassOf(Class);
+ });
}
bool RecordRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
+ if (this == RHS)
+ return true;
+
const RecordRecTy *RTy = dyn_cast<RecordRecTy>(RHS);
if (!RTy)
return false;
- if (RTy->getRecord() == Rec || Rec->isSubClassOf(RTy->getRecord()))
- return true;
+ return llvm::all_of(RTy->getClasses(), [this](Record *TargetClass) {
+ return isSubClassOf(TargetClass);
+ });
+}
- for (const auto &SCPair : RTy->getRecord()->getSuperClasses())
- if (Rec->isSubClassOf(SCPair.first))
- return true;
+bool RecordRecTy::typeIsA(const RecTy *RHS) const {
+ return typeIsConvertibleTo(RHS);
+}
- return false;
+static RecordRecTy *resolveRecordTypes(RecordRecTy *T1, RecordRecTy *T2) {
+ SmallVector<Record *, 4> CommonSuperClasses;
+ SmallVector<Record *, 4> Stack;
+
+ Stack.insert(Stack.end(), T1->classes_begin(), T1->classes_end());
+
+ while (!Stack.empty()) {
+ Record *R = Stack.back();
+ Stack.pop_back();
+
+ if (T2->isSubClassOf(R)) {
+ CommonSuperClasses.push_back(R);
+ } else {
+ R->getDirectSuperClasses(Stack);
+ }
+ }
+
+ return RecordRecTy::get(CommonSuperClasses);
}
RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
+ if (T1 == T2)
+ return T1;
+
+ if (RecordRecTy *RecTy1 = dyn_cast<RecordRecTy>(T1)) {
+ if (RecordRecTy *RecTy2 = dyn_cast<RecordRecTy>(T2))
+ return resolveRecordTypes(RecTy1, RecTy2);
+ }
+
if (T1->typeIsConvertibleTo(T2))
return T2;
if (T2->typeIsConvertibleTo(T1))
return T1;
- // If one is a Record type, check superclasses
- if (RecordRecTy *RecTy1 = dyn_cast<RecordRecTy>(T1)) {
- // See if T2 inherits from a type T1 also inherits from
- for (const auto &SuperPair1 : RecTy1->getRecord()->getSuperClasses()) {
- RecordRecTy *SuperRecTy1 = RecordRecTy::get(SuperPair1.first);
- RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
- if (NewType1)
- return NewType1;
- }
- }
- if (RecordRecTy *RecTy2 = dyn_cast<RecordRecTy>(T2)) {
- // See if T1 inherits from a type T2 also inherits from
- for (const auto &SuperPair2 : RecTy2->getRecord()->getSuperClasses()) {
- RecordRecTy *SuperRecTy2 = RecordRecTy::get(SuperPair2.first);
- RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
- if (NewType2)
- return NewType2;
+ if (ListRecTy *ListTy1 = dyn_cast<ListRecTy>(T1)) {
+ if (ListRecTy *ListTy2 = dyn_cast<ListRecTy>(T2)) {
+ RecTy* NewType = resolveTypes(ListTy1->getElementType(),
+ ListTy2->getElementType());
+ if (NewType)
+ return NewType->getListTy();
}
}
+
return nullptr;
}
@@ -181,17 +294,11 @@ UnsetInit *UnsetInit::get() {
return &TheInit;
}
-Init *UnsetInit::convertInitializerTo(RecTy *Ty) const {
- if (auto *BRT = dyn_cast<BitsRecTy>(Ty)) {
- SmallVector<Init *, 16> NewBits(BRT->getNumBits());
-
- for (unsigned i = 0; i != BRT->getNumBits(); ++i)
- NewBits[i] = UnsetInit::get();
-
- return BitsInit::get(NewBits);
- }
+Init *UnsetInit::getCastTo(RecTy *Ty) const {
+ return const_cast<UnsetInit *>(this);
+}
- // All other types can just be returned.
+Init *UnsetInit::convertInitializerTo(RecTy *Ty) const {
return const_cast<UnsetInit *>(this);
}
@@ -287,6 +394,14 @@ BitsInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
return BitsInit::get(NewBits);
}
+bool BitsInit::isConcrete() const {
+ for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
+ if (!getBit(i)->isConcrete())
+ return false;
+ }
+ return true;
+}
+
std::string BitsInit::getAsString() const {
std::string Result = "{ ";
for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
@@ -299,54 +414,35 @@ std::string BitsInit::getAsString() const {
return Result + " }";
}
-// Fix bit initializer to preserve the behavior that bit reference from a unset
-// bits initializer will resolve into VarBitInit to keep the field name and bit
-// number used in targets with fixed insn length.
-static Init *fixBitInit(const RecordVal *RV, Init *Before, Init *After) {
- if (RV || !isa<UnsetInit>(After))
- return After;
- return Before;
-}
-
// resolveReferences - If there are any field references that refer to fields
// that have been filled in, we can propagate the values now.
-Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
+Init *BitsInit::resolveReferences(Resolver &R) const {
bool Changed = false;
SmallVector<Init *, 16> NewBits(getNumBits());
- Init *CachedInit = nullptr;
- Init *CachedBitVar = nullptr;
- bool CachedBitVarChanged = false;
+ Init *CachedBitVarRef = nullptr;
+ Init *CachedBitVarResolved = nullptr;
for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
Init *CurBit = getBit(i);
- Init *CurBitVar = CurBit->getBitVar();
-
- NewBits[i] = CurBit;
+ Init *NewBit = CurBit;
- if (CurBitVar == CachedBitVar) {
- if (CachedBitVarChanged) {
- Init *Bit = CachedInit->getBit(CurBit->getBitNum());
- NewBits[i] = fixBitInit(RV, CurBit, Bit);
+ if (VarBitInit *CurBitVar = dyn_cast<VarBitInit>(CurBit)) {
+ if (CurBitVar->getBitVar() != CachedBitVarRef) {
+ CachedBitVarRef = CurBitVar->getBitVar();
+ CachedBitVarResolved = CachedBitVarRef->resolveReferences(R);
}
- continue;
- }
- CachedBitVar = CurBitVar;
- CachedBitVarChanged = false;
-
- Init *B;
- do {
- B = CurBitVar;
- CurBitVar = CurBitVar->resolveReferences(R, RV);
- CachedBitVarChanged |= B != CurBitVar;
- Changed |= B != CurBitVar;
- } while (B != CurBitVar);
- CachedInit = CurBitVar;
-
- if (CachedBitVarChanged) {
- Init *Bit = CurBitVar->getBit(CurBit->getBitNum());
- NewBits[i] = fixBitInit(RV, CurBit, Bit);
+
+ NewBit = CachedBitVarResolved->getBit(CurBitVar->getBitNum());
+ } else {
+ // getBit(0) implicitly converts int and bits<1> values to bit.
+ NewBit = CurBit->resolveReferences(R)->getBit(0);
}
+
+ if (isa<UnsetInit>(NewBit) && R.keepUnsetBits())
+ NewBit = CurBit;
+ NewBits[i] = NewBit;
+ Changed |= CurBit != NewBit;
}
if (Changed)
@@ -433,6 +529,8 @@ StringInit *StringInit::get(StringRef V) {
Init *StringInit::convertInitializerTo(RecTy *Ty) const {
if (isa<StringRecTy>(Ty))
return const_cast<StringInit *>(this);
+ if (isa<CodeRecTy>(Ty))
+ return CodeInit::get(getValue());
return nullptr;
}
@@ -440,6 +538,8 @@ Init *StringInit::convertInitializerTo(RecTy *Ty) const {
Init *CodeInit::convertInitializerTo(RecTy *Ty) const {
if (isa<CodeRecTy>(Ty))
return const_cast<CodeInit *>(this);
+ if (isa<StringRecTy>(Ty))
+ return StringInit::get(getValue());
return nullptr;
}
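A small illustration of the now-symmetric conversion (call sites are assumed; CodeRecTy::get() and StringRecTy::get() are the usual singleton accessors, not shown in this hunk):
    // string -> code and code -> string both succeed instead of returning nullptr.
    llvm::Init *AsCode =
        llvm::StringInit::get("return 0;")->convertInitializerTo(llvm::CodeRecTy::get());
    llvm::Init *AsString =
        llvm::CodeInit::get("return 0;")->convertInitializerTo(llvm::StringRecTy::get());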
@@ -464,6 +564,9 @@ ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
if (ListInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
return I;
+ assert(Range.empty() || !isa<TypedInit>(Range[0]) ||
+ cast<TypedInit>(Range[0])->getType()->typeIsConvertibleTo(EltTy));
+
void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *>(Range.size()),
alignof(ListInit));
ListInit *I = new(Mem) ListInit(Range.size(), EltTy);
@@ -501,7 +604,7 @@ Init *ListInit::convertInitializerTo(RecTy *Ty) const {
if (!Changed)
return const_cast<ListInit*>(this);
- return ListInit::get(Elements, Ty);
+ return ListInit::get(Elements, ElementType);
}
return nullptr;
@@ -515,7 +618,7 @@ Init *ListInit::convertInitListSlice(ArrayRef<unsigned> Elements) const {
return nullptr;
Vals.push_back(getElement(Element));
}
- return ListInit::get(Vals, getType());
+ return ListInit::get(Vals, getElementType());
}
Record *ListInit::getElementAsRecord(unsigned i) const {
@@ -526,38 +629,28 @@ Record *ListInit::getElementAsRecord(unsigned i) const {
return DI->getDef();
}
-Init *ListInit::resolveReferences(Record &R, const RecordVal *RV) const {
+Init *ListInit::resolveReferences(Resolver &R) const {
SmallVector<Init*, 8> Resolved;
Resolved.reserve(size());
bool Changed = false;
for (Init *CurElt : getValues()) {
- Init *E;
-
- do {
- E = CurElt;
- CurElt = CurElt->resolveReferences(R, RV);
- Changed |= E != CurElt;
- } while (E != CurElt);
+ Init *E = CurElt->resolveReferences(R);
+ Changed |= E != CurElt;
Resolved.push_back(E);
}
if (Changed)
- return ListInit::get(Resolved, getType());
+ return ListInit::get(Resolved, getElementType());
return const_cast<ListInit *>(this);
}
-Init *ListInit::resolveListElementReference(Record &R, const RecordVal *IRV,
- unsigned Elt) const {
- if (Elt >= size())
- return nullptr; // Out of range reference.
- Init *E = getElement(Elt);
- // If the element is set to some value, or if we are resolving a reference
- // to a specific variable and that variable is explicitly unset, then
- // replace the VarListElementInit with it.
- if (IRV || !isa<UnsetInit>(E))
- return E;
- return nullptr;
+bool ListInit::isConcrete() const {
+ for (Init *Element : *this) {
+ if (!Element->isConcrete())
+ return false;
+ }
+ return true;
}
std::string ListInit::getAsString() const {
@@ -571,24 +664,6 @@ std::string ListInit::getAsString() const {
return Result + "]";
}
-Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV,
- unsigned Elt) const {
- Init *Resolved = resolveReferences(R, IRV);
- OpInit *OResolved = dyn_cast<OpInit>(Resolved);
- if (OResolved) {
- Resolved = OResolved->Fold(&R, nullptr);
- }
-
- if (Resolved != this) {
- TypedInit *Typed = cast<TypedInit>(Resolved);
- if (Init *New = Typed->resolveListElementReference(R, IRV, Elt))
- return New;
- return VarListElementInit::get(Typed, Elt);
- }
-
- return nullptr;
-}
-
Init *OpInit::getBit(unsigned Bit) const {
if (getType() == BitRecTy::get())
return const_cast<OpInit*>(this);
@@ -621,7 +696,7 @@ void UnOpInit::Profile(FoldingSetNodeID &ID) const {
ProfileUnOpInit(ID, getOpcode(), getOperand(), getType());
}
-Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const {
switch (getOpcode()) {
case CAST:
if (isa<StringRecTy>(getType())) {
@@ -633,60 +708,42 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
if (IntInit *LHSi = dyn_cast<IntInit>(LHS))
return StringInit::get(LHSi->getAsString());
- } else {
+ } else if (isa<RecordRecTy>(getType())) {
if (StringInit *Name = dyn_cast<StringInit>(LHS)) {
- // From TGParser::ParseIDValue
- if (CurRec) {
- if (const RecordVal *RV = CurRec->getValue(Name)) {
- if (RV->getType() != getType())
- PrintFatalError("type mismatch in cast");
- return VarInit::get(Name, RV->getType());
- }
-
- Init *TemplateArgName = QualifyName(*CurRec, CurMultiClass, Name,
- ":");
-
- if (CurRec->isTemplateArg(TemplateArgName)) {
- const RecordVal *RV = CurRec->getValue(TemplateArgName);
- assert(RV && "Template arg doesn't exist??");
-
- if (RV->getType() != getType())
- PrintFatalError("type mismatch in cast");
-
- return VarInit::get(TemplateArgName, RV->getType());
- }
- }
-
- if (CurMultiClass) {
- Init *MCName = QualifyName(CurMultiClass->Rec, CurMultiClass, Name,
- "::");
-
- if (CurMultiClass->Rec.isTemplateArg(MCName)) {
- const RecordVal *RV = CurMultiClass->Rec.getValue(MCName);
- assert(RV && "Template arg doesn't exist??");
-
- if (RV->getType() != getType())
- PrintFatalError("type mismatch in cast");
-
- return VarInit::get(MCName, RV->getType());
+ assert(CurRec && "NULL pointer");
+ Record *D;
+
+ // Self-references are allowed, but their resolution is delayed until
+ // the final resolve to ensure that we get the correct type for them.
+ if (Name == CurRec->getNameInit()) {
+ if (!IsFinal)
+ break;
+ D = CurRec;
+ } else {
+ D = CurRec->getRecords().getDef(Name->getValue());
+ if (!D) {
+ if (IsFinal)
+ PrintFatalError(CurRec->getLoc(),
+ Twine("Undefined reference to record: '") +
+ Name->getValue() + "'\n");
+ break;
}
}
- assert(CurRec && "NULL pointer");
- if (Record *D = (CurRec->getRecords()).getDef(Name->getValue()))
- return DefInit::get(D);
-
- PrintFatalError(CurRec->getLoc(),
- "Undefined reference:'" + Name->getValue() + "'\n");
- }
- if (isa<IntRecTy>(getType())) {
- if (BitsInit *BI = dyn_cast<BitsInit>(LHS)) {
- if (Init *NewInit = BI->convertInitializerTo(IntRecTy::get()))
- return NewInit;
- break;
+ DefInit *DI = DefInit::get(D);
+ if (!DI->getType()->typeIsA(getType())) {
+ PrintFatalError(CurRec->getLoc(),
+ Twine("Expected type '") +
+ getType()->getAsString() + "', got '" +
+ DI->getType()->getAsString() + "' in: " +
+ getAsString() + "\n");
}
+ return DI;
}
}
+
+ if (Init *NewInit = LHS->convertInitializerTo(getType()))
+ return NewInit;
break;
case HEAD:
@@ -701,10 +758,15 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
assert(!LHSl->empty() && "Empty list in tail");
// Note the +1. We can't just pass the result of getValues()
// directly.
- return ListInit::get(LHSl->getValues().slice(1), LHSl->getType());
+ return ListInit::get(LHSl->getValues().slice(1), LHSl->getElementType());
}
break;
+ case SIZE:
+ if (ListInit *LHSl = dyn_cast<ListInit>(LHS))
+ return IntInit::get(LHSl->size());
+ break;
+
case EMPTY:
if (ListInit *LHSl = dyn_cast<ListInit>(LHS))
return IntInit::get(LHSl->empty());
@@ -715,12 +777,13 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
return const_cast<UnOpInit *>(this);
}
-Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
- Init *lhs = LHS->resolveReferences(R, RV);
+Init *UnOpInit::resolveReferences(Resolver &R) const {
+ Init *lhs = LHS->resolveReferences(R);
- if (LHS != lhs)
- return (UnOpInit::get(getOpcode(), lhs, getType()))->Fold(&R, nullptr);
- return Fold(&R, nullptr);
+ if (LHS != lhs || (R.isFinal() && getOpcode() == CAST))
+ return (UnOpInit::get(getOpcode(), lhs, getType()))
+ ->Fold(R.getCurrentRecord(), R.isFinal());
+ return const_cast<UnOpInit *>(this);
}
std::string UnOpInit::getAsString() const {
@@ -729,6 +792,7 @@ std::string UnOpInit::getAsString() const {
case CAST: Result = "!cast<" + getType()->getAsString() + ">"; break;
case HEAD: Result = "!head"; break;
case TAIL: Result = "!tail"; break;
+ case SIZE: Result = "!size"; break;
case EMPTY: Result = "!empty"; break;
}
return Result + "(" + LHS->getAsString() + ")";
@@ -770,7 +834,15 @@ static StringInit *ConcatStringInits(const StringInit *I0,
return StringInit::get(Concat);
}
-Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+Init *BinOpInit::getStrConcat(Init *I0, Init *I1) {
+ // Shortcut for the common case of concatenating two strings.
+ if (const StringInit *I0s = dyn_cast<StringInit>(I0))
+ if (const StringInit *I1s = dyn_cast<StringInit>(I1))
+ return ConcatStringInits(I0s, I1s);
+ return BinOpInit::get(BinOpInit::STRCONCAT, I0, I1, StringRecTy::get());
+}
+
+Init *BinOpInit::Fold(Record *CurRec) const {
switch (getOpcode()) {
case CONCAT: {
DagInit *LHSs = dyn_cast<DagInit>(LHS);
@@ -778,8 +850,13 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
if (LHSs && RHSs) {
DefInit *LOp = dyn_cast<DefInit>(LHSs->getOperator());
DefInit *ROp = dyn_cast<DefInit>(RHSs->getOperator());
- if (!LOp || !ROp || LOp->getDef() != ROp->getDef())
- PrintFatalError("Concated Dag operators do not match!");
+ if (!LOp || !ROp)
+ break;
+ if (LOp->getDef() != ROp->getDef()) {
+ PrintFatalError(Twine("Concatenated Dag operators do not match: '") +
+ LHSs->getAsString() + "' vs. '" + RHSs->getAsString() +
+ "'");
+ }
SmallVector<Init*, 8> Args;
SmallVector<StringInit*, 8> ArgNames;
for (unsigned i = 0, e = LHSs->getNumArgs(); i != e; ++i) {
@@ -801,8 +878,7 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
SmallVector<Init *, 8> Args;
Args.insert(Args.end(), LHSs->begin(), LHSs->end());
Args.insert(Args.end(), RHSs->begin(), RHSs->end());
- return ListInit::get(
- Args, cast<ListRecTy>(LHSs->getType())->getElementType());
+ return ListInit::get(Args, LHSs->getElementType());
}
break;
}
@@ -813,23 +889,43 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
return ConcatStringInits(LHSs, RHSs);
break;
}
- case EQ: {
+ case EQ:
+ case NE:
+ case LE:
+ case LT:
+ case GE:
+ case GT: {
// try to fold eq comparison for 'bit' and 'int', otherwise fallback
// to string objects.
IntInit *L =
- dyn_cast_or_null<IntInit>(LHS->convertInitializerTo(IntRecTy::get()));
+ dyn_cast_or_null<IntInit>(LHS->convertInitializerTo(IntRecTy::get()));
IntInit *R =
- dyn_cast_or_null<IntInit>(RHS->convertInitializerTo(IntRecTy::get()));
+ dyn_cast_or_null<IntInit>(RHS->convertInitializerTo(IntRecTy::get()));
- if (L && R)
- return IntInit::get(L->getValue() == R->getValue());
+ if (L && R) {
+ bool Result;
+ switch (getOpcode()) {
+ case EQ: Result = L->getValue() == R->getValue(); break;
+ case NE: Result = L->getValue() != R->getValue(); break;
+ case LE: Result = L->getValue() <= R->getValue(); break;
+ case LT: Result = L->getValue() < R->getValue(); break;
+ case GE: Result = L->getValue() >= R->getValue(); break;
+ case GT: Result = L->getValue() > R->getValue(); break;
+ default: llvm_unreachable("unhandled comparison");
+ }
+ return BitInit::get(Result);
+ }
- StringInit *LHSs = dyn_cast<StringInit>(LHS);
- StringInit *RHSs = dyn_cast<StringInit>(RHS);
+ if (getOpcode() == EQ || getOpcode() == NE) {
+ StringInit *LHSs = dyn_cast<StringInit>(LHS);
+ StringInit *RHSs = dyn_cast<StringInit>(RHS);
- // Make sure we've resolved
- if (LHSs && RHSs)
- return IntInit::get(LHSs->getValue() == RHSs->getValue());
+ // Make sure we've resolved
+ if (LHSs && RHSs) {
+ bool Equal = LHSs->getValue() == RHSs->getValue();
+ return BitInit::get(getOpcode() == EQ ? Equal : !Equal);
+ }
+ }
break;
}
@@ -863,13 +959,14 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
return const_cast<BinOpInit *>(this);
}
-Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
- Init *lhs = LHS->resolveReferences(R, RV);
- Init *rhs = RHS->resolveReferences(R, RV);
+Init *BinOpInit::resolveReferences(Resolver &R) const {
+ Init *lhs = LHS->resolveReferences(R);
+ Init *rhs = RHS->resolveReferences(R);
if (LHS != lhs || RHS != rhs)
- return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))->Fold(&R,nullptr);
- return Fold(&R, nullptr);
+ return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))
+ ->Fold(R.getCurrentRecord());
+ return const_cast<BinOpInit *>(this);
}
std::string BinOpInit::getAsString() const {
@@ -883,6 +980,11 @@ std::string BinOpInit::getAsString() const {
case SRA: Result = "!sra"; break;
case SRL: Result = "!srl"; break;
case EQ: Result = "!eq"; break;
+ case NE: Result = "!ne"; break;
+ case LE: Result = "!le"; break;
+ case LT: Result = "!lt"; break;
+ case GE: Result = "!ge"; break;
+ case GT: Result = "!gt"; break;
case LISTCONCAT: Result = "!listconcat"; break;
case STRCONCAT: Result = "!strconcat"; break;
}
@@ -919,102 +1021,61 @@ void TernOpInit::Profile(FoldingSetNodeID &ID) const {
ProfileTernOpInit(ID, getOpcode(), getLHS(), getMHS(), getRHS(), getType());
}
-static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
- Record *CurRec, MultiClass *CurMultiClass);
-
-static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg,
- RecTy *Type, Record *CurRec,
- MultiClass *CurMultiClass) {
- // If this is a dag, recurse
- if (auto *TArg = dyn_cast<TypedInit>(Arg))
- if (isa<DagRecTy>(TArg->getType()))
- return ForeachHelper(LHS, Arg, RHSo, Type, CurRec, CurMultiClass);
-
- SmallVector<Init *, 8> NewOperands;
- NewOperands.reserve(RHSo->getNumOperands());
- for (unsigned i = 0, e = RHSo->getNumOperands(); i < e; ++i) {
- if (auto *RHSoo = dyn_cast<OpInit>(RHSo->getOperand(i))) {
- if (Init *Result = EvaluateOperation(RHSoo, LHS, Arg,
- Type, CurRec, CurMultiClass))
- NewOperands.push_back(Result);
- else
- NewOperands.push_back(Arg);
- } else if (LHS->getAsString() == RHSo->getOperand(i)->getAsString()) {
- NewOperands.push_back(Arg);
- } else {
- NewOperands.push_back(RHSo->getOperand(i));
- }
- }
-
- // Now run the operator and use its result as the new leaf
- const OpInit *NewOp = RHSo->clone(NewOperands);
- Init *NewVal = NewOp->Fold(CurRec, CurMultiClass);
- return (NewVal != NewOp) ? NewVal : nullptr;
+static Init *ForeachApply(Init *LHS, Init *MHSe, Init *RHS, Record *CurRec) {
+ MapResolver R(CurRec);
+ R.set(LHS, MHSe);
+ return RHS->resolveReferences(R);
}
-static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
- Record *CurRec, MultiClass *CurMultiClass) {
- OpInit *RHSo = dyn_cast<OpInit>(RHS);
-
- if (!RHSo)
- PrintFatalError(CurRec->getLoc(), "!foreach requires an operator\n");
+static Init *ForeachDagApply(Init *LHS, DagInit *MHSd, Init *RHS,
+ Record *CurRec) {
+ bool Change = false;
+ Init *Val = ForeachApply(LHS, MHSd->getOperator(), RHS, CurRec);
+ if (Val != MHSd->getOperator())
+ Change = true;
- TypedInit *LHSt = dyn_cast<TypedInit>(LHS);
+ SmallVector<std::pair<Init *, StringInit *>, 8> NewArgs;
+ for (unsigned int i = 0; i < MHSd->getNumArgs(); ++i) {
+ Init *Arg = MHSd->getArg(i);
+ Init *NewArg;
+ StringInit *ArgName = MHSd->getArgName(i);
- if (!LHSt)
- PrintFatalError(CurRec->getLoc(), "!foreach requires typed variable\n");
-
- DagInit *MHSd = dyn_cast<DagInit>(MHS);
- if (MHSd && isa<DagRecTy>(Type)) {
- Init *Val = MHSd->getOperator();
- if (Init *Result = EvaluateOperation(RHSo, LHS, Val,
- Type, CurRec, CurMultiClass))
- Val = Result;
-
- SmallVector<std::pair<Init *, StringInit*>, 8> args;
- for (unsigned int i = 0; i < MHSd->getNumArgs(); ++i) {
- Init *Arg = MHSd->getArg(i);
- StringInit *ArgName = MHSd->getArgName(i);
+ if (DagInit *Argd = dyn_cast<DagInit>(Arg))
+ NewArg = ForeachDagApply(LHS, Argd, RHS, CurRec);
+ else
+ NewArg = ForeachApply(LHS, Arg, RHS, CurRec);
- // Process args
- if (Init *Result = EvaluateOperation(RHSo, LHS, Arg, Type,
- CurRec, CurMultiClass))
- Arg = Result;
+ NewArgs.push_back(std::make_pair(NewArg, ArgName));
+ if (Arg != NewArg)
+ Change = true;
+ }
- // TODO: Process arg names
- args.push_back(std::make_pair(Arg, ArgName));
- }
+ if (Change)
+ return DagInit::get(Val, nullptr, NewArgs);
+ return MHSd;
+}
- return DagInit::get(Val, nullptr, args);
- }
+// Applies RHS to all elements of MHS, using LHS as a temp variable.
+static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
+ Record *CurRec) {
+ if (DagInit *MHSd = dyn_cast<DagInit>(MHS))
+ return ForeachDagApply(LHS, MHSd, RHS, CurRec);
- ListInit *MHSl = dyn_cast<ListInit>(MHS);
- if (MHSl && isa<ListRecTy>(Type)) {
- SmallVector<Init *, 8> NewOperands;
+ if (ListInit *MHSl = dyn_cast<ListInit>(MHS)) {
SmallVector<Init *, 8> NewList(MHSl->begin(), MHSl->end());
for (Init *&Item : NewList) {
- NewOperands.clear();
- for(unsigned i = 0; i < RHSo->getNumOperands(); ++i) {
- // First, replace the foreach variable with the list item
- if (LHS->getAsString() == RHSo->getOperand(i)->getAsString())
- NewOperands.push_back(Item);
- else
- NewOperands.push_back(RHSo->getOperand(i));
- }
-
- // Now run the operator and use its result as the new list item
- const OpInit *NewOp = RHSo->clone(NewOperands);
- Init *NewItem = NewOp->Fold(CurRec, CurMultiClass);
- if (NewItem != NewOp)
+ Init *NewItem = ForeachApply(LHS, Item, RHS, CurRec);
+ if (NewItem != Item)
Item = NewItem;
}
- return ListInit::get(NewList, MHSl->getType());
+ return ListInit::get(NewList, cast<ListRecTy>(Type)->getElementType());
}
+
return nullptr;
}
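As a rough sketch of what the rewritten helpers implement (names are illustrative): !foreach binds a temporary variable and applies its expression to every list element, or to the operator and arguments of a dag:

    class AddOne<list<int> xs> {
      list<int> ys = !foreach(x, xs, !add(x, 1));
    }
    def D : AddOne<[1, 2, 3]>;    // ys = [2, 3, 4]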
-Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+Init *TernOpInit::Fold(Record *CurRec) const {
switch (getOpcode()) {
case SUBST: {
DefInit *LHSd = dyn_cast<DefInit>(LHS);
@@ -1060,154 +1121,244 @@ Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
}
case FOREACH: {
- if (Init *Result = ForeachHelper(LHS, MHS, RHS, getType(),
- CurRec, CurMultiClass))
+ if (Init *Result = ForeachHelper(LHS, MHS, RHS, getType(), CurRec))
return Result;
break;
}
case IF: {
- IntInit *LHSi = dyn_cast<IntInit>(LHS);
- if (Init *I = LHS->convertInitializerTo(IntRecTy::get()))
- LHSi = dyn_cast<IntInit>(I);
- if (LHSi) {
+ if (IntInit *LHSi = dyn_cast_or_null<IntInit>(
+ LHS->convertInitializerTo(IntRecTy::get()))) {
if (LHSi->getValue())
return MHS;
return RHS;
}
break;
}
+
+ case DAG: {
+ ListInit *MHSl = dyn_cast<ListInit>(MHS);
+ ListInit *RHSl = dyn_cast<ListInit>(RHS);
+ bool MHSok = MHSl || isa<UnsetInit>(MHS);
+ bool RHSok = RHSl || isa<UnsetInit>(RHS);
+
+ if (isa<UnsetInit>(MHS) && isa<UnsetInit>(RHS))
+ break; // Typically prevented by the parser, but might happen with template args
+
+ if (MHSok && RHSok && (!MHSl || !RHSl || MHSl->size() == RHSl->size())) {
+ SmallVector<std::pair<Init *, StringInit *>, 8> Children;
+ unsigned Size = MHSl ? MHSl->size() : RHSl->size();
+ for (unsigned i = 0; i != Size; ++i) {
+ Init *Node = MHSl ? MHSl->getElement(i) : UnsetInit::get();
+ Init *Name = RHSl ? RHSl->getElement(i) : UnsetInit::get();
+ if (!isa<StringInit>(Name) && !isa<UnsetInit>(Name))
+ return const_cast<TernOpInit *>(this);
+ Children.emplace_back(Node, dyn_cast<StringInit>(Name));
+ }
+ return DagInit::get(LHS, nullptr, Children);
+ }
+ break;
+ }
}
return const_cast<TernOpInit *>(this);
}
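A usage sketch of the new !dag operator (the operator def and names below are illustrative): it assembles a dag from an operator, a list of argument values, and a parallel list of argument names, where either list may be unset:

    def ops;                                   // dag operator node
    def d0 {
      dag d = !dag(ops, [1, 2], ["a", "b"]);   // same as (ops 1:$a, 2:$b)
    }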
-Init *TernOpInit::resolveReferences(Record &R,
- const RecordVal *RV) const {
- Init *lhs = LHS->resolveReferences(R, RV);
+Init *TernOpInit::resolveReferences(Resolver &R) const {
+ Init *lhs = LHS->resolveReferences(R);
if (getOpcode() == IF && lhs != LHS) {
- IntInit *Value = dyn_cast<IntInit>(lhs);
- if (Init *I = lhs->convertInitializerTo(IntRecTy::get()))
- Value = dyn_cast<IntInit>(I);
- if (Value) {
+ if (IntInit *Value = dyn_cast_or_null<IntInit>(
+ lhs->convertInitializerTo(IntRecTy::get()))) {
// Short-circuit
- if (Value->getValue()) {
- Init *mhs = MHS->resolveReferences(R, RV);
- return (TernOpInit::get(getOpcode(), lhs, mhs,
- RHS, getType()))->Fold(&R, nullptr);
- }
- Init *rhs = RHS->resolveReferences(R, RV);
- return (TernOpInit::get(getOpcode(), lhs, MHS,
- rhs, getType()))->Fold(&R, nullptr);
+ if (Value->getValue())
+ return MHS->resolveReferences(R);
+ return RHS->resolveReferences(R);
}
}
- Init *mhs = MHS->resolveReferences(R, RV);
- Init *rhs = RHS->resolveReferences(R, RV);
+ Init *mhs = MHS->resolveReferences(R);
+ Init *rhs;
+
+ if (getOpcode() == FOREACH) {
+ ShadowResolver SR(R);
+ SR.addShadow(lhs);
+ rhs = RHS->resolveReferences(SR);
+ } else {
+ rhs = RHS->resolveReferences(R);
+ }
if (LHS != lhs || MHS != mhs || RHS != rhs)
- return (TernOpInit::get(getOpcode(), lhs, mhs, rhs,
- getType()))->Fold(&R, nullptr);
- return Fold(&R, nullptr);
+ return (TernOpInit::get(getOpcode(), lhs, mhs, rhs, getType()))
+ ->Fold(R.getCurrentRecord());
+ return const_cast<TernOpInit *>(this);
}
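One observable effect of the short-circuit above, sketched with illustrative names: once the !if condition resolves, only the chosen branch is resolved further, so the other branch may contain references that would never fold on their own:

    class Pick<bit c, list<int> xs> {
      int v = !if(c, 0, !head(xs));   // with c == 1, !head(xs) is never folded
    }
    def P : Pick<1, []>;              // v = 0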
std::string TernOpInit::getAsString() const {
std::string Result;
+ bool UnquotedLHS = false;
switch (getOpcode()) {
case SUBST: Result = "!subst"; break;
- case FOREACH: Result = "!foreach"; break;
+ case FOREACH: Result = "!foreach"; UnquotedLHS = true; break;
case IF: Result = "!if"; break;
+ case DAG: Result = "!dag"; break;
}
- return Result + "(" + LHS->getAsString() + ", " + MHS->getAsString() + ", " +
- RHS->getAsString() + ")";
+ return (Result + "(" +
+ (UnquotedLHS ? LHS->getAsUnquotedString() : LHS->getAsString()) +
+ ", " + MHS->getAsString() + ", " + RHS->getAsString() + ")");
+}
+
+static void ProfileFoldOpInit(FoldingSetNodeID &ID, Init *A, Init *B,
+ Init *Start, Init *List, Init *Expr,
+ RecTy *Type) {
+ ID.AddPointer(Start);
+ ID.AddPointer(List);
+ ID.AddPointer(A);
+ ID.AddPointer(B);
+ ID.AddPointer(Expr);
+ ID.AddPointer(Type);
}
-RecTy *TypedInit::getFieldType(StringInit *FieldName) const {
- if (RecordRecTy *RecordType = dyn_cast<RecordRecTy>(getType()))
- if (RecordVal *Field = RecordType->getRecord()->getValue(FieldName))
- return Field->getType();
- return nullptr;
-}
+FoldOpInit *FoldOpInit::get(Init *Start, Init *List, Init *A, Init *B,
+ Init *Expr, RecTy *Type) {
+ static FoldingSet<FoldOpInit> ThePool;
-Init *
-TypedInit::convertInitializerTo(RecTy *Ty) const {
- if (isa<IntRecTy>(Ty)) {
- if (getType()->typeIsConvertibleTo(Ty))
- return const_cast<TypedInit *>(this);
- return nullptr;
- }
+ FoldingSetNodeID ID;
+ ProfileFoldOpInit(ID, Start, List, A, B, Expr, Type);
- if (isa<StringRecTy>(Ty)) {
- if (isa<StringRecTy>(getType()))
- return const_cast<TypedInit *>(this);
- return nullptr;
- }
+ void *IP = nullptr;
+ if (FoldOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
- if (isa<CodeRecTy>(Ty)) {
- if (isa<CodeRecTy>(getType()))
- return const_cast<TypedInit *>(this);
- return nullptr;
- }
+ FoldOpInit *I = new (Allocator) FoldOpInit(Start, List, A, B, Expr, Type);
+ ThePool.InsertNode(I, IP);
+ return I;
+}
- if (isa<BitRecTy>(Ty)) {
- // Accept variable if it is already of bit type!
- if (isa<BitRecTy>(getType()))
- return const_cast<TypedInit *>(this);
- if (auto *BitsTy = dyn_cast<BitsRecTy>(getType())) {
- // Accept only bits<1> expression.
- if (BitsTy->getNumBits() == 1)
- return const_cast<TypedInit *>(this);
- return nullptr;
- }
- // Ternary !if can be converted to bit, but only if both sides are
- // convertible to a bit.
- if (const auto *TOI = dyn_cast<TernOpInit>(this)) {
- if (TOI->getOpcode() == TernOpInit::TernaryOp::IF &&
- TOI->getMHS()->convertInitializerTo(BitRecTy::get()) &&
- TOI->getRHS()->convertInitializerTo(BitRecTy::get()))
- return const_cast<TypedInit *>(this);
- return nullptr;
+void FoldOpInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileFoldOpInit(ID, Start, List, A, B, Expr, getType());
+}
+
+Init *FoldOpInit::Fold(Record *CurRec) const {
+ if (ListInit *LI = dyn_cast<ListInit>(List)) {
+ Init *Accum = Start;
+ for (Init *Elt : *LI) {
+ MapResolver R(CurRec);
+ R.set(A, Accum);
+ R.set(B, Elt);
+ Accum = Expr->resolveReferences(R);
}
- return nullptr;
+ return Accum;
}
+ return const_cast<FoldOpInit *>(this);
+}
- if (auto *BRT = dyn_cast<BitsRecTy>(Ty)) {
- if (BRT->getNumBits() == 1 && isa<BitRecTy>(getType()))
- return BitsInit::get(const_cast<TypedInit *>(this));
+Init *FoldOpInit::resolveReferences(Resolver &R) const {
+ Init *NewStart = Start->resolveReferences(R);
+ Init *NewList = List->resolveReferences(R);
+ ShadowResolver SR(R);
+ SR.addShadow(A);
+ SR.addShadow(B);
+ Init *NewExpr = Expr->resolveReferences(SR);
- if (getType()->typeIsConvertibleTo(BRT)) {
- SmallVector<Init *, 16> NewBits(BRT->getNumBits());
+ if (Start == NewStart && List == NewList && Expr == NewExpr)
+ return const_cast<FoldOpInit *>(this);
- for (unsigned i = 0; i != BRT->getNumBits(); ++i)
- NewBits[i] = VarBitInit::get(const_cast<TypedInit *>(this), i);
- return BitsInit::get(NewBits);
- }
+ return get(NewStart, NewList, A, B, NewExpr, getType())
+ ->Fold(R.getCurrentRecord());
+}
- return nullptr;
- }
+Init *FoldOpInit::getBit(unsigned Bit) const {
+ return VarBitInit::get(const_cast<FoldOpInit *>(this), Bit);
+}
- if (auto *DLRT = dyn_cast<ListRecTy>(Ty)) {
- if (auto *SLRT = dyn_cast<ListRecTy>(getType()))
- if (SLRT->getElementType()->typeIsConvertibleTo(DLRT->getElementType()))
- return const_cast<TypedInit *>(this);
- return nullptr;
- }
+std::string FoldOpInit::getAsString() const {
+ return (Twine("!foldl(") + Start->getAsString() + ", " + List->getAsString() +
+ ", " + A->getAsUnquotedString() + ", " + B->getAsUnquotedString() +
+ ", " + Expr->getAsString() + ")")
+ .str();
+}
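A usage sketch of the new !foldl operator, matching the getAsString form above (names are illustrative):

    class Sum<list<int> xs> {
      int total = !foldl(0, xs, acc, x, !add(acc, x));
    }
    def S : Sum<[1, 2, 3]>;           // total = 6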
- if (auto *DRT = dyn_cast<DagRecTy>(Ty)) {
- if (getType()->typeIsConvertibleTo(DRT))
- return const_cast<TypedInit *>(this);
- return nullptr;
+static void ProfileIsAOpInit(FoldingSetNodeID &ID, RecTy *CheckType,
+ Init *Expr) {
+ ID.AddPointer(CheckType);
+ ID.AddPointer(Expr);
+}
+
+IsAOpInit *IsAOpInit::get(RecTy *CheckType, Init *Expr) {
+ static FoldingSet<IsAOpInit> ThePool;
+
+ FoldingSetNodeID ID;
+ ProfileIsAOpInit(ID, CheckType, Expr);
+
+ void *IP = nullptr;
+ if (IsAOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ IsAOpInit *I = new (Allocator) IsAOpInit(CheckType, Expr);
+ ThePool.InsertNode(I, IP);
+ return I;
+}
+
+void IsAOpInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileIsAOpInit(ID, CheckType, Expr);
+}
+
+Init *IsAOpInit::Fold() const {
+ if (TypedInit *TI = dyn_cast<TypedInit>(Expr)) {
+ // Is the expression type known to be (a subclass of) the desired type?
+ if (TI->getType()->typeIsConvertibleTo(CheckType))
+ return IntInit::get(1);
+
+ if (isa<RecordRecTy>(CheckType)) {
+ // If the target type is not a subclass of the expression type, or if
+ // the expression has fully resolved to a record, we know that it can't
+ // be of the required type.
+ if (!CheckType->typeIsConvertibleTo(TI->getType()) || isa<DefInit>(Expr))
+ return IntInit::get(0);
+ } else {
+ // We treat non-record types as not castable.
+ return IntInit::get(0);
+ }
}
+ return const_cast<IsAOpInit *>(this);
+}
- if (auto *SRRT = dyn_cast<RecordRecTy>(Ty)) {
- // Ensure that this is compatible with Rec.
- if (RecordRecTy *DRRT = dyn_cast<RecordRecTy>(getType()))
- if (DRRT->getRecord()->isSubClassOf(SRRT->getRecord()) ||
- DRRT->getRecord() == SRRT->getRecord())
- return const_cast<TypedInit *>(this);
- return nullptr;
+Init *IsAOpInit::resolveReferences(Resolver &R) const {
+ Init *NewExpr = Expr->resolveReferences(R);
+ if (Expr != NewExpr)
+ return get(CheckType, NewExpr)->Fold();
+ return const_cast<IsAOpInit *>(this);
+}
+
+Init *IsAOpInit::getBit(unsigned Bit) const {
+ return VarBitInit::get(const_cast<IsAOpInit *>(this), Bit);
+}
+
+std::string IsAOpInit::getAsString() const {
+ return (Twine("!isa<") + CheckType->getAsString() + ">(" +
+ Expr->getAsString() + ")")
+ .str();
+}
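A usage sketch of !isa (the class hierarchy below is illustrative): the operator folds to 1 or 0 as soon as the expression's type, or its fully resolved record, decides the question:

    class Reg;
    class GPR : Reg;
    def R0 : GPR;
    class Check<Reg r> {
      bit isGPR = !isa<GPR>(r);       // folds to 1 for Check<R0>
    }
    def C0 : Check<R0>;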
+
+RecTy *TypedInit::getFieldType(StringInit *FieldName) const {
+ if (RecordRecTy *RecordType = dyn_cast<RecordRecTy>(getType())) {
+ for (Record *Rec : RecordType->getClasses()) {
+ if (RecordVal *Field = Rec->getValue(FieldName))
+ return Field->getType();
+ }
}
+ return nullptr;
+}
+
+Init *
+TypedInit::convertInitializerTo(RecTy *Ty) const {
+ if (getType() == Ty || getType()->typeIsA(Ty))
+ return const_cast<TypedInit *>(this);
+
+ if (isa<BitRecTy>(getType()) && isa<BitsRecTy>(Ty) &&
+ cast<BitsRecTy>(Ty)->getNumBits() == 1)
+ return BitsInit::get({const_cast<TypedInit *>(this)});
return nullptr;
}
@@ -1228,6 +1379,24 @@ Init *TypedInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
return BitsInit::get(NewBits);
}
+Init *TypedInit::getCastTo(RecTy *Ty) const {
+ // Handle the common case quickly
+ if (getType() == Ty || getType()->typeIsA(Ty))
+ return const_cast<TypedInit *>(this);
+
+ if (Init *Converted = convertInitializerTo(Ty)) {
+ assert(!isa<TypedInit>(Converted) ||
+ cast<TypedInit>(Converted)->getType()->typeIsA(Ty));
+ return Converted;
+ }
+
+ if (!getType()->typeIsConvertibleTo(Ty))
+ return nullptr;
+
+ return UnOpInit::get(UnOpInit::CAST, const_cast<TypedInit *>(this), Ty)
+ ->Fold(nullptr);
+}
+
Init *TypedInit::convertInitListSlice(ArrayRef<unsigned> Elements) const {
ListRecTy *T = dyn_cast<ListRecTy>(getType());
if (!T) return nullptr; // Cannot subscript a non-list variable.
@@ -1240,7 +1409,7 @@ Init *TypedInit::convertInitListSlice(ArrayRef<unsigned> Elements) const {
for (unsigned Element : Elements)
ListInits.push_back(VarListElementInit::get(const_cast<TypedInit *>(this),
Element));
- return ListInit::get(ListInits, T);
+ return ListInit::get(ListInits, T->getElementType());
}
@@ -1272,55 +1441,9 @@ Init *VarInit::getBit(unsigned Bit) const {
return VarBitInit::get(const_cast<VarInit*>(this), Bit);
}
-Init *VarInit::resolveListElementReference(Record &R,
- const RecordVal *IRV,
- unsigned Elt) const {
- if (R.isTemplateArg(getNameInit())) return nullptr;
- if (IRV && IRV->getNameInit() != getNameInit()) return nullptr;
-
- RecordVal *RV = R.getValue(getNameInit());
- assert(RV && "Reference to a non-existent variable?");
- ListInit *LI = dyn_cast<ListInit>(RV->getValue());
- if (!LI)
- return VarListElementInit::get(cast<TypedInit>(RV->getValue()), Elt);
-
- if (Elt >= LI->size())
- return nullptr; // Out of range reference.
- Init *E = LI->getElement(Elt);
- // If the element is set to some value, or if we are resolving a reference
- // to a specific variable and that variable is explicitly unset, then
- // replace the VarListElementInit with it.
- if (IRV || !isa<UnsetInit>(E))
- return E;
- return nullptr;
-}
-
-RecTy *VarInit::getFieldType(StringInit *FieldName) const {
- if (RecordRecTy *RTy = dyn_cast<RecordRecTy>(getType()))
- if (const RecordVal *RV = RTy->getRecord()->getValue(FieldName))
- return RV->getType();
- return nullptr;
-}
-
-Init *VarInit::getFieldInit(Record &R, const RecordVal *RV,
- StringInit *FieldName) const {
- if (isa<RecordRecTy>(getType()))
- if (const RecordVal *Val = R.getValue(VarName)) {
- if (RV != Val && (RV || isa<UnsetInit>(Val->getValue())))
- return nullptr;
- Init *TheInit = Val->getValue();
- assert(TheInit != this && "Infinite loop detected!");
- if (Init *I = TheInit->getFieldInit(R, RV, FieldName))
- return I;
- return nullptr;
- }
- return nullptr;
-}
-
-Init *VarInit::resolveReferences(Record &R, const RecordVal *RV) const {
- if (RecordVal *Val = R.getValue(VarName))
- if (RV == Val || (!RV && !isa<UnsetInit>(Val->getValue())))
- return Val->getValue();
+Init *VarInit::resolveReferences(Resolver &R) const {
+ if (Init *Val = R.resolve(VarName))
+ return Val;
return const_cast<VarInit *>(this);
}
@@ -1336,19 +1459,12 @@ VarBitInit *VarBitInit::get(TypedInit *T, unsigned B) {
return I;
}
-Init *VarBitInit::convertInitializerTo(RecTy *Ty) const {
- if (isa<BitRecTy>(Ty))
- return const_cast<VarBitInit *>(this);
-
- return nullptr;
-}
-
std::string VarBitInit::getAsString() const {
return TI->getAsString() + "{" + utostr(Bit) + "}";
}
-Init *VarBitInit::resolveReferences(Record &R, const RecordVal *RV) const {
- Init *I = TI->resolveReferences(R, RV);
+Init *VarBitInit::resolveReferences(Resolver &R) const {
+ Init *I = TI->resolveReferences(R);
if (TI != I)
return I->getBit(getBitNum());
@@ -1371,11 +1487,16 @@ std::string VarListElementInit::getAsString() const {
return TI->getAsString() + "[" + utostr(Element) + "]";
}
-Init *
-VarListElementInit::resolveReferences(Record &R, const RecordVal *RV) const {
- if (Init *I = getVariable()->resolveListElementReference(R, RV,
- getElementNum()))
- return I;
+Init *VarListElementInit::resolveReferences(Resolver &R) const {
+ Init *NewTI = TI->resolveReferences(R);
+ if (ListInit *List = dyn_cast<ListInit>(NewTI)) {
+ // Leave out-of-bounds array references as-is. This can happen without
+ // being an error, e.g. in the untaken "branch" of an !if expression.
+ if (getElementNum() < List->size())
+ return List->getElement(getElementNum());
+ }
+ if (NewTI != TI && isa<TypedInit>(NewTI))
+ return VarListElementInit::get(cast<TypedInit>(NewTI), getElementNum());
return const_cast<VarListElementInit *>(this);
}
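A sketch of the element-access behaviour this implements (illustrative names): an index into a not-yet-known list stays symbolic, and an out-of-range index is tolerated rather than rejected, which matters in untaken !if branches:

    class Third<list<int> xs> {
      int t = xs[2];                  // stays symbolic until xs is known
    }
    def T : Third<[10, 20, 30]>;      // t = 30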
@@ -1385,20 +1506,8 @@ Init *VarListElementInit::getBit(unsigned Bit) const {
return VarBitInit::get(const_cast<VarListElementInit*>(this), Bit);
}
-Init *VarListElementInit:: resolveListElementReference(Record &R,
- const RecordVal *RV,
- unsigned Elt) const {
- if (Init *Result = TI->resolveListElementReference(R, RV, Element)) {
- if (TypedInit *TInit = dyn_cast<TypedInit>(Result)) {
- if (Init *Result2 = TInit->resolveListElementReference(R, RV, Elt))
- return Result2;
- return VarListElementInit::get(TInit, Elt);
- }
- return Result;
- }
-
- return nullptr;
-}
+DefInit::DefInit(Record *D)
+ : TypedInit(IK_DefInit, D->getType()), Def(D) {}
DefInit *DefInit::get(Record *R) {
return R->getDefInit();
@@ -1406,7 +1515,7 @@ DefInit *DefInit::get(Record *R) {
Init *DefInit::convertInitializerTo(RecTy *Ty) const {
if (auto *RRT = dyn_cast<RecordRecTy>(Ty))
- if (getDef()->isSubClassOf(RRT->getRecord()))
+ if (getType()->typeIsConvertibleTo(RRT))
return const_cast<DefInit *>(this);
return nullptr;
}
@@ -1417,15 +1526,134 @@ RecTy *DefInit::getFieldType(StringInit *FieldName) const {
return nullptr;
}
-Init *DefInit::getFieldInit(Record &R, const RecordVal *RV,
- StringInit *FieldName) const {
- return Def->getValue(FieldName)->getValue();
-}
-
std::string DefInit::getAsString() const {
return Def->getName();
}
+static void ProfileVarDefInit(FoldingSetNodeID &ID,
+ Record *Class,
+ ArrayRef<Init *> Args) {
+ ID.AddInteger(Args.size());
+ ID.AddPointer(Class);
+
+ for (Init *I : Args)
+ ID.AddPointer(I);
+}
+
+VarDefInit *VarDefInit::get(Record *Class, ArrayRef<Init *> Args) {
+ static FoldingSet<VarDefInit> ThePool;
+
+ FoldingSetNodeID ID;
+ ProfileVarDefInit(ID, Class, Args);
+
+ void *IP = nullptr;
+ if (VarDefInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *>(Args.size()),
+ alignof(VarDefInit));
+ VarDefInit *I = new(Mem) VarDefInit(Class, Args.size());
+ std::uninitialized_copy(Args.begin(), Args.end(),
+ I->getTrailingObjects<Init *>());
+ ThePool.InsertNode(I, IP);
+ return I;
+}
+
+void VarDefInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileVarDefInit(ID, Class, args());
+}
+
+DefInit *VarDefInit::instantiate() {
+ if (!Def) {
+ RecordKeeper &Records = Class->getRecords();
+ auto NewRecOwner = make_unique<Record>(Records.getNewAnonymousName(),
+ Class->getLoc(), Records,
+ /*IsAnonymous=*/true);
+ Record *NewRec = NewRecOwner.get();
+
+ // Copy values from class to instance
+ for (const RecordVal &Val : Class->getValues())
+ NewRec->addValue(Val);
+
+ // Substitute and resolve template arguments
+ ArrayRef<Init *> TArgs = Class->getTemplateArgs();
+ MapResolver R(NewRec);
+
+ for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+ if (i < args_size())
+ R.set(TArgs[i], getArg(i));
+ else
+ R.set(TArgs[i], NewRec->getValue(TArgs[i])->getValue());
+
+ NewRec->removeValue(TArgs[i]);
+ }
+
+ NewRec->resolveReferences(R);
+
+ // Add superclasses.
+ ArrayRef<std::pair<Record *, SMRange>> SCs = Class->getSuperClasses();
+ for (const auto &SCPair : SCs)
+ NewRec->addSuperClass(SCPair.first, SCPair.second);
+
+ NewRec->addSuperClass(Class,
+ SMRange(Class->getLoc().back(),
+ Class->getLoc().back()));
+
+ // Resolve internal references and store in record keeper
+ NewRec->resolveReferences();
+ Records.addDef(std::move(NewRecOwner));
+
+ Def = DefInit::get(NewRec);
+ }
+
+ return Def;
+}
+
+Init *VarDefInit::resolveReferences(Resolver &R) const {
+ TrackUnresolvedResolver UR(&R);
+ bool Changed = false;
+ SmallVector<Init *, 8> NewArgs;
+ NewArgs.reserve(args_size());
+
+ for (Init *Arg : args()) {
+ Init *NewArg = Arg->resolveReferences(UR);
+ NewArgs.push_back(NewArg);
+ Changed |= NewArg != Arg;
+ }
+
+ if (Changed) {
+ auto New = VarDefInit::get(Class, NewArgs);
+ if (!UR.foundUnresolved())
+ return New->instantiate();
+ return New;
+ }
+ return const_cast<VarDefInit *>(this);
+}
+
+Init *VarDefInit::Fold() const {
+ if (Def)
+ return Def;
+
+ TrackUnresolvedResolver R;
+ for (Init *Arg : args())
+ Arg->resolveReferences(R);
+
+ if (!R.foundUnresolved())
+ return const_cast<VarDefInit *>(this)->instantiate();
+ return const_cast<VarDefInit *>(this);
+}
+
+std::string VarDefInit::getAsString() const {
+ std::string Result = Class->getNameInitAsString() + "<";
+ const char *sep = "";
+ for (Init *Arg : args()) {
+ Result += sep;
+ sep = ", ";
+ Result += Arg->getAsString();
+ }
+ return Result + ">";
+}
+
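VarDefInit backs anonymous record instantiation in value position; a sketch with illustrative names:

    class Pair<int a, int b> {
      int fst = a;
      int snd = b;
    }
    def User {
      Pair p = Pair<3, 4>;            // instantiates an anonymous Pair def
    }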
FieldInit *FieldInit::get(Init *R, StringInit *FN) {
using Key = std::pair<Init *, StringInit *>;
static DenseMap<Key, FieldInit*> ThePool;
@@ -1443,32 +1671,25 @@ Init *FieldInit::getBit(unsigned Bit) const {
return VarBitInit::get(const_cast<FieldInit*>(this), Bit);
}
-Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV,
- unsigned Elt) const {
- if (Init *ListVal = Rec->getFieldInit(R, RV, FieldName))
- if (ListInit *LI = dyn_cast<ListInit>(ListVal)) {
- if (Elt >= LI->size()) return nullptr;
- Init *E = LI->getElement(Elt);
-
- // If the element is set to some value, or if we are resolving a
- // reference to a specific variable and that variable is explicitly
- // unset, then replace the VarListElementInit with it.
- if (RV || !isa<UnsetInit>(E))
- return E;
- }
- return nullptr;
+Init *FieldInit::resolveReferences(Resolver &R) const {
+ Init *NewRec = Rec->resolveReferences(R);
+ if (NewRec != Rec)
+ return FieldInit::get(NewRec, FieldName)->Fold(R.getCurrentRecord());
+ return const_cast<FieldInit *>(this);
}
-Init *FieldInit::resolveReferences(Record &R, const RecordVal *RV) const {
- Init *NewRec = RV ? Rec->resolveReferences(R, RV) : Rec;
-
- if (Init *BitsVal = NewRec->getFieldInit(R, RV, FieldName)) {
- Init *BVR = BitsVal->resolveReferences(R, RV);
- return BVR->isComplete() ? BVR : const_cast<FieldInit *>(this);
+Init *FieldInit::Fold(Record *CurRec) const {
+ if (DefInit *DI = dyn_cast<DefInit>(Rec)) {
+ Record *Def = DI->getDef();
+ if (Def == CurRec)
+ PrintFatalError(CurRec->getLoc(),
+ Twine("Attempting to access field '") +
+ FieldName->getAsUnquotedString() + "' of '" +
+ Rec->getAsString() + "' is a forbidden self-reference");
+ Init *FieldVal = Def->getValue(FieldName)->getValue();
+ if (FieldVal->isComplete())
+ return FieldVal;
}
-
- if (NewRec != Rec)
- return FieldInit::get(NewRec, FieldName);
return const_cast<FieldInit *>(this);
}
@@ -1528,30 +1749,33 @@ void DagInit::Profile(FoldingSetNodeID &ID) const {
ProfileDagInit(ID, Val, ValName, makeArrayRef(getTrailingObjects<Init *>(), NumArgs), makeArrayRef(getTrailingObjects<StringInit *>(), NumArgNames));
}
-Init *DagInit::convertInitializerTo(RecTy *Ty) const {
- if (isa<DagRecTy>(Ty))
- return const_cast<DagInit *>(this);
-
- return nullptr;
-}
-
-Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const {
+Init *DagInit::resolveReferences(Resolver &R) const {
SmallVector<Init*, 8> NewArgs;
NewArgs.reserve(arg_size());
bool ArgsChanged = false;
for (const Init *Arg : getArgs()) {
- Init *NewArg = Arg->resolveReferences(R, RV);
+ Init *NewArg = Arg->resolveReferences(R);
NewArgs.push_back(NewArg);
ArgsChanged |= NewArg != Arg;
}
- Init *Op = Val->resolveReferences(R, RV);
+ Init *Op = Val->resolveReferences(R);
if (Op != Val || ArgsChanged)
return DagInit::get(Op, ValName, NewArgs, getArgNames());
return const_cast<DagInit *>(this);
}
+bool DagInit::isConcrete() const {
+ if (!Val->isConcrete())
+ return false;
+ for (const Init *Elt : getArgs()) {
+ if (!Elt->isConcrete())
+ return false;
+ }
+ return true;
+}
+
std::string DagInit::getAsString() const {
std::string Result = "(" + Val->getAsString();
if (ValName)
@@ -1573,7 +1797,7 @@ std::string DagInit::getAsString() const {
RecordVal::RecordVal(Init *N, RecTy *T, bool P)
: Name(N), TyAndPrefix(T, P) {
- Value = UnsetInit::get()->convertInitializerTo(T);
+ setValue(UnsetInit::get());
assert(Value && "Cannot create unset value for current type!");
}
@@ -1581,6 +1805,28 @@ StringRef RecordVal::getName() const {
return cast<StringInit>(getNameInit())->getValue();
}
+bool RecordVal::setValue(Init *V) {
+ if (V) {
+ Value = V->getCastTo(getType());
+ if (Value) {
+ assert(!isa<TypedInit>(Value) ||
+ cast<TypedInit>(Value)->getType()->typeIsA(getType()));
+ if (BitsRecTy *BTy = dyn_cast<BitsRecTy>(getType())) {
+ if (!isa<BitsInit>(Value)) {
+ SmallVector<Init *, 64> Bits;
+ Bits.reserve(BTy->getNumBits());
+ for (unsigned i = 0, e = BTy->getNumBits(); i < e; ++i)
+ Bits.push_back(Value->getBit(i));
+ Value = BitsInit::get(Bits);
+ }
+ }
+ }
+ return Value == nullptr;
+ }
+ Value = nullptr;
+ return false;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RecordVal::dump() const { errs() << *this; }
#endif
@@ -1597,31 +1843,26 @@ void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
unsigned Record::LastID = 0;
-void Record::init() {
- checkName();
-
- // Every record potentially has a def at the top. This value is
- // replaced with the top-level def name at instantiation time.
- addValue(RecordVal(StringInit::get("NAME"), StringRecTy::get(), false));
-}
-
void Record::checkName() {
// Ensure the record name has string type.
const TypedInit *TypedName = cast<const TypedInit>(Name);
if (!isa<StringRecTy>(TypedName->getType()))
- PrintFatalError(getLoc(), "Record name is not a string!");
+ PrintFatalError(getLoc(), Twine("Record name '") + Name->getAsString() +
+ "' is not a string!");
+}
+
+RecordRecTy *Record::getType() {
+ SmallVector<Record *, 4> DirectSCs;
+ getDirectSuperClasses(DirectSCs);
+ return RecordRecTy::get(DirectSCs);
}
DefInit *Record::getDefInit() {
if (!TheInit)
- TheInit = new(Allocator) DefInit(this, new(Allocator) RecordRecTy(this));
+ TheInit = new(Allocator) DefInit(this);
return TheInit;
}
-StringRef Record::getName() const {
- return cast<StringInit>(Name)->getValue();
-}
-
void Record::setName(Init *NewName) {
Name = NewName;
checkName();
@@ -1638,28 +1879,57 @@ void Record::setName(Init *NewName) {
// this. See TGParser::ParseDef and TGParser::ParseDefm.
}
-void Record::resolveReferencesTo(const RecordVal *RV) {
+void Record::getDirectSuperClasses(SmallVectorImpl<Record *> &Classes) const {
+ ArrayRef<std::pair<Record *, SMRange>> SCs = getSuperClasses();
+ while (!SCs.empty()) {
+ // Superclasses are in reverse preorder, so 'back' is a direct superclass,
+ // and its transitive superclasses are directly preceding it.
+ Record *SC = SCs.back().first;
+ SCs = SCs.drop_back(1 + SC->getSuperClasses().size());
+ Classes.push_back(SC);
+ }
+}
+
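My reading of the stored order, sketched for illustration:

    class A;
    class B : A;
    class C : B;
    // C's superclass list is stored as [A, B]: the direct superclass B is
    // last, immediately preceded by its own transitive superclasses (here A).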
+void Record::resolveReferences(Resolver &R, const RecordVal *SkipVal) {
for (RecordVal &Value : Values) {
- if (RV == &Value) // Skip resolve the same field as the given one
+ if (SkipVal == &Value) // Skip resolve the same field as the given one
continue;
- if (Init *V = Value.getValue())
- if (Value.setValue(V->resolveReferences(*this, RV)))
- PrintFatalError(getLoc(), "Invalid value is found when setting '" +
- Value.getNameInitAsString() +
- "' after resolving references" +
- (RV ? " against '" + RV->getNameInitAsString() +
- "' of (" + RV->getValue()->getAsUnquotedString() +
- ")"
- : "") + "\n");
+ if (Init *V = Value.getValue()) {
+ Init *VR = V->resolveReferences(R);
+ if (Value.setValue(VR)) {
+ std::string Type;
+ if (TypedInit *VRT = dyn_cast<TypedInit>(VR))
+ Type =
+ (Twine("of type '") + VRT->getType()->getAsString() + "' ").str();
+ PrintFatalError(getLoc(), Twine("Invalid value ") + Type +
+ "is found when setting '" +
+ Value.getNameInitAsString() +
+ "' of type '" +
+ Value.getType()->getAsString() +
+ "' after resolving references: " +
+ VR->getAsUnquotedString() + "\n");
+ }
+ }
}
Init *OldName = getNameInit();
- Init *NewName = Name->resolveReferences(*this, RV);
+ Init *NewName = Name->resolveReferences(R);
if (NewName != OldName) {
// Re-register with RecordKeeper.
setName(NewName);
}
}
+void Record::resolveReferences() {
+ RecordResolver R(*this);
+ R.setFinal(true);
+ resolveReferences(R);
+}
+
+void Record::resolveReferencesTo(const RecordVal *RV) {
+ RecordValResolver R(*this, RV);
+ resolveReferences(R, RV);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Record::dump() const { errs() << *this; }
#endif
@@ -1769,8 +2039,10 @@ int64_t Record::getValueAsInt(StringRef FieldName) const {
if (IntInit *II = dyn_cast<IntInit>(R->getValue()))
return II->getValue();
- PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
- FieldName + "' does not have an int initializer!");
+ PrintFatalError(getLoc(), Twine("Record `") + getName() + "', field `" +
+ FieldName +
+ "' does not have an int initializer: " +
+ R->getValue()->getAsString());
}
std::vector<int64_t>
@@ -1781,8 +2053,10 @@ Record::getValueAsListOfInts(StringRef FieldName) const {
if (IntInit *II = dyn_cast<IntInit>(I))
Ints.push_back(II->getValue());
else
- PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
- FieldName + "' does not have a list of ints initializer!");
+ PrintFatalError(getLoc(),
+ Twine("Record `") + getName() + "', field `" + FieldName +
+ "' does not have a list of ints initializer: " +
+ I->getAsString());
}
return Ints;
}
@@ -1795,8 +2069,10 @@ Record::getValueAsListOfStrings(StringRef FieldName) const {
if (StringInit *SI = dyn_cast<StringInit>(I))
Strings.push_back(SI->getValue());
else
- PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
- FieldName + "' does not have a list of strings initializer!");
+ PrintFatalError(getLoc(),
+ Twine("Record `") + getName() + "', field `" + FieldName +
+ "' does not have a list of strings initializer: " +
+ I->getAsString());
}
return Strings;
}
@@ -1855,15 +2131,6 @@ DagInit *Record::getValueAsDag(StringRef FieldName) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void MultiClass::dump() const {
- errs() << "Record:\n";
- Rec.dump();
-
- errs() << "Defs:\n";
- for (const auto &Proto : DefPrototypes)
- Proto->dump();
-}
-
LLVM_DUMP_METHOD void RecordKeeper::dump() const { errs() << *this; }
#endif
@@ -1878,6 +2145,12 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
return OS;
}
+/// getNewAnonymousName - Generate a unique anonymous name that can be used as
+/// an identifier.
+Init *RecordKeeper::getNewAnonymousName() {
+ return StringInit::get("anonymous_" + utostr(AnonCounter++));
+}
+
std::vector<Record *>
RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) const {
Record *Class = getClass(ClassName);
@@ -1892,25 +2165,70 @@ RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) const {
return Defs;
}
-static Init *GetStrConcat(Init *I0, Init *I1) {
- // Shortcut for the common case of concatenating two strings.
- if (const StringInit *I0s = dyn_cast<StringInit>(I0))
- if (const StringInit *I1s = dyn_cast<StringInit>(I1))
- return ConcatStringInits(I0s, I1s);
- return BinOpInit::get(BinOpInit::STRCONCAT, I0, I1, StringRecTy::get());
+Init *MapResolver::resolve(Init *VarName) {
+ auto It = Map.find(VarName);
+ if (It == Map.end())
+ return nullptr;
+
+ Init *I = It->second.V;
+
+ if (!It->second.Resolved && Map.size() > 1) {
+ // Resolve mutual references among the mapped variables, but prevent
+ // infinite recursion.
+ Map.erase(It);
+ I = I->resolveReferences(*this);
+ Map[VarName] = {I, true};
+ }
+
+ return I;
+}
+
+Init *RecordResolver::resolve(Init *VarName) {
+ Init *Val = Cache.lookup(VarName);
+ if (Val)
+ return Val;
+
+ for (Init *S : Stack) {
+ if (S == VarName)
+ return nullptr; // prevent infinite recursion
+ }
+
+ if (RecordVal *RV = getCurrentRecord()->getValue(VarName)) {
+ if (!isa<UnsetInit>(RV->getValue())) {
+ Val = RV->getValue();
+ Stack.push_back(VarName);
+ Val = Val->resolveReferences(*this);
+ Stack.pop_back();
+ }
+ }
+
+ Cache[VarName] = Val;
+ return Val;
}
-Init *llvm::QualifyName(Record &CurRec, MultiClass *CurMultiClass,
- Init *Name, StringRef Scoper) {
- Init *NewName = GetStrConcat(CurRec.getNameInit(), StringInit::get(Scoper));
- NewName = GetStrConcat(NewName, Name);
- if (CurMultiClass && Scoper != "::") {
- Init *Prefix = GetStrConcat(CurMultiClass->Rec.getNameInit(),
- StringInit::get("::"));
- NewName = GetStrConcat(Prefix, NewName);
+Init *TrackUnresolvedResolver::resolve(Init *VarName) {
+ Init *I = nullptr;
+
+ if (R) {
+ I = R->resolve(VarName);
+ if (I && !FoundUnresolved) {
+ // Do not recurse into the resolved initializer, as that would change
+ // the behavior of the resolver we're delegating, but do check to see
+ // if there are unresolved variables remaining.
+ TrackUnresolvedResolver Sub;
+ I->resolveReferences(Sub);
+ FoundUnresolved |= Sub.FoundUnresolved;
+ }
}
- if (BinOpInit *BinOp = dyn_cast<BinOpInit>(NewName))
- NewName = BinOp->Fold(&CurRec, CurMultiClass);
- return NewName;
+ if (!I)
+ FoundUnresolved = true;
+ return I;
+}
+
+Init *HasReferenceResolver::resolve(Init *VarName)
+{
+ if (VarName == VarNameToTrack)
+ Found = true;
+ return nullptr;
}
diff --git a/contrib/llvm/lib/TableGen/TGLexer.cpp b/contrib/llvm/lib/TableGen/TGLexer.cpp
index 5d6f7c23e0b6..652be6e8dbbf 100644
--- a/contrib/llvm/lib/TableGen/TGLexer.cpp
+++ b/contrib/llvm/lib/TableGen/TGLexer.cpp
@@ -56,7 +56,7 @@ int TGLexer::getNextChar() {
// a random nul in the file. Disambiguate that here.
if (CurPtr-1 != CurBuf.end())
return 0; // Just whitespace.
-
+
// If this is the end of an included file, pop the parent file off the
// include stack.
SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
@@ -66,9 +66,9 @@ int TGLexer::getNextChar() {
CurPtr = ParentIncludeLoc.getPointer();
return getNextChar();
}
-
+
// Otherwise, return end of file.
- --CurPtr; // Another call to lex will return EOF again.
+ --CurPtr; // Another call to lex will return EOF again.
return EOF;
}
case '\n':
@@ -80,7 +80,7 @@ int TGLexer::getNextChar() {
*CurPtr != CurChar)
++CurPtr; // Eat the two char newline sequence.
return '\n';
- }
+ }
}
int TGLexer::peekNextChar(int Index) {
@@ -115,7 +115,7 @@ tgtok::TokKind TGLexer::LexToken() {
case '=': return tgtok::equal;
case '?': return tgtok::question;
case '#': return tgtok::paste;
-
+
case 0:
case ' ':
case '\t':
@@ -154,7 +154,7 @@ tgtok::TokKind TGLexer::LexToken() {
switch (NextNextChar) {
default:
break;
- case '0': case '1':
+ case '0': case '1':
if (NextChar == 'b')
return LexNumber();
LLVM_FALLTHROUGH;
@@ -184,24 +184,24 @@ tgtok::TokKind TGLexer::LexToken() {
/// LexString - Lex "[^"]*"
tgtok::TokKind TGLexer::LexString() {
const char *StrStart = CurPtr;
-
+
CurStrVal = "";
-
+
while (*CurPtr != '"') {
// If we hit the end of the buffer, report an error.
if (*CurPtr == 0 && CurPtr == CurBuf.end())
return ReturnError(StrStart, "End of file in string literal");
-
+
if (*CurPtr == '\n' || *CurPtr == '\r')
return ReturnError(StrStart, "End of line in string literal");
-
+
if (*CurPtr != '\\') {
CurStrVal += *CurPtr++;
continue;
}
++CurPtr;
-
+
switch (*CurPtr) {
case '\\': case '\'': case '"':
// These turn into their literal character.
@@ -215,7 +215,7 @@ tgtok::TokKind TGLexer::LexString() {
CurStrVal += '\n';
++CurPtr;
break;
-
+
case '\n':
case '\r':
return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
@@ -229,7 +229,7 @@ tgtok::TokKind TGLexer::LexString() {
return ReturnError(CurPtr, "invalid escape in string literal");
}
}
-
+
++CurPtr;
return tgtok::StrVal;
}
@@ -237,10 +237,10 @@ tgtok::TokKind TGLexer::LexString() {
tgtok::TokKind TGLexer::LexVarName() {
if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
return ReturnError(TokStart, "Invalid variable name");
-
+
// Otherwise, we're ok, consume the rest of the characters.
const char *VarNameStart = CurPtr++;
-
+
while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
++CurPtr;
@@ -276,6 +276,7 @@ tgtok::TokKind TGLexer::LexIdentifier() {
.Case("def", tgtok::Def)
.Case("foreach", tgtok::Foreach)
.Case("defm", tgtok::Defm)
+ .Case("defset", tgtok::Defset)
.Case("multiclass", tgtok::MultiClass)
.Case("field", tgtok::Field)
.Case("let", tgtok::Let)
@@ -308,7 +309,7 @@ bool TGLexer::LexInclude() {
PrintError(getLoc(), "Could not find include file '" + Filename + "'");
return true;
}
-
+
DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
if (Found != Dependencies.end()) {
PrintError(getLoc(),
@@ -347,7 +348,7 @@ void TGLexer::SkipBCPLComment() {
bool TGLexer::SkipCComment() {
++CurPtr; // skip the star.
unsigned CommentDepth = 1;
-
+
while (true) {
int CurChar = getNextChar();
switch (CurChar) {
@@ -357,7 +358,7 @@ bool TGLexer::SkipCComment() {
case '*':
// End of the comment?
if (CurPtr[0] != '/') break;
-
+
++CurPtr; // End the */.
if (--CommentDepth == 0)
return false;
@@ -383,7 +384,7 @@ tgtok::TokKind TGLexer::LexNumber() {
const char *NumStart = CurPtr;
while (isxdigit(CurPtr[0]))
++CurPtr;
-
+
// Requires at least one hex digit.
if (CurPtr == NumStart)
return ReturnError(TokStart, "Invalid hexadecimal number");
@@ -422,7 +423,7 @@ tgtok::TokKind TGLexer::LexNumber() {
else if (CurPtr[-1] == '+')
return tgtok::plus;
}
-
+
while (isdigit(CurPtr[0]))
++CurPtr;
CurIntVal = strtoll(TokStart, nullptr, 10);
@@ -439,9 +440,9 @@ tgtok::TokKind TGLexer::LexBracket() {
while (true) {
int Char = getNextChar();
if (Char == EOF) break;
-
+
if (Char != '}') continue;
-
+
Char = getNextChar();
if (Char == EOF) break;
if (Char == ']') {
@@ -449,7 +450,7 @@ tgtok::TokKind TGLexer::LexBracket() {
return tgtok::CodeFragment;
}
}
-
+
return ReturnError(CodeStart-2, "Unterminated Code Block");
}
@@ -457,19 +458,27 @@ tgtok::TokKind TGLexer::LexBracket() {
tgtok::TokKind TGLexer::LexExclaim() {
if (!isalpha(*CurPtr))
return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
-
+
const char *Start = CurPtr++;
while (isalpha(*CurPtr))
++CurPtr;
-
+
// Check to see which operator this is.
tgtok::TokKind Kind =
StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
.Case("eq", tgtok::XEq)
+ .Case("ne", tgtok::XNe)
+ .Case("le", tgtok::XLe)
+ .Case("lt", tgtok::XLt)
+ .Case("ge", tgtok::XGe)
+ .Case("gt", tgtok::XGt)
.Case("if", tgtok::XIf)
+ .Case("isa", tgtok::XIsA)
.Case("head", tgtok::XHead)
.Case("tail", tgtok::XTail)
+ .Case("size", tgtok::XSize)
.Case("con", tgtok::XConcat)
+ .Case("dag", tgtok::XDag)
.Case("add", tgtok::XADD)
.Case("and", tgtok::XAND)
.Case("or", tgtok::XOR)
@@ -479,6 +488,7 @@ tgtok::TokKind TGLexer::LexExclaim() {
.Case("cast", tgtok::XCast)
.Case("empty", tgtok::XEmpty)
.Case("subst", tgtok::XSubst)
+ .Case("foldl", tgtok::XFoldl)
.Case("foreach", tgtok::XForEach)
.Case("listconcat", tgtok::XListConcat)
.Case("strconcat", tgtok::XStrConcat)
diff --git a/contrib/llvm/lib/TableGen/TGLexer.h b/contrib/llvm/lib/TableGen/TGLexer.h
index b5b58161878b..2c80743e3a68 100644
--- a/contrib/llvm/lib/TableGen/TGLexer.h
+++ b/contrib/llvm/lib/TableGen/TGLexer.h
@@ -30,7 +30,7 @@ namespace tgtok {
enum TokKind {
// Markers
Eof, Error,
-
+
// Tokens with no info.
minus, plus, // - +
l_square, r_square, // [ ]
@@ -44,11 +44,12 @@ namespace tgtok {
// Keywords.
Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List,
- MultiClass, String,
+ MultiClass, String, Defset,
// !keywords.
XConcat, XADD, XAND, XOR, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast,
- XSubst, XForEach, XHead, XTail, XEmpty, XIf, XEq,
+ XSubst, XForEach, XFoldl, XHead, XTail, XSize, XEmpty, XIf, XEq, XIsA, XDag,
+ XNe, XLe, XLt, XGe, XGt,
// Integer value.
IntVal,
@@ -56,7 +57,7 @@ namespace tgtok {
// Binary constant. Note that these are sized according to the number of
// bits given.
BinaryIntVal,
-
+
// String valued tokens.
Id, StrVal, VarName, CodeFragment
};
@@ -65,7 +66,7 @@ namespace tgtok {
/// TGLexer - TableGen Lexer class.
class TGLexer {
SourceMgr &SrcMgr;
-
+
const char *CurPtr;
StringRef CurBuf;
@@ -95,11 +96,11 @@ public:
const DependenciesMapTy &getDependencies() const {
return Dependencies;
}
-
+
tgtok::TokKind getCode() const { return CurCode; }
const std::string &getCurStrVal() const {
- assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
+ assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
"This token doesn't have a string value");
return CurStrVal;
@@ -115,13 +116,13 @@ public:
}
SMLoc getLoc() const;
-
+
private:
/// LexToken - Read the next token and return its code.
tgtok::TokKind LexToken();
-
+
tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
-
+
int getNextChar();
int peekNextChar(int Index);
void SkipBCPLComment();
@@ -134,7 +135,7 @@ private:
tgtok::TokKind LexBracket();
tgtok::TokKind LexExclaim();
};
-
+
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/TableGen/TGParser.cpp b/contrib/llvm/lib/TableGen/TGParser.cpp
index b492cf9495c0..1d1f3603c83c 100644
--- a/contrib/llvm/lib/TableGen/TGParser.cpp
+++ b/contrib/llvm/lib/TableGen/TGParser.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
@@ -68,6 +69,75 @@ LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
} // end namespace llvm
+static bool checkBitsConcrete(Record &R, const RecordVal &RV) {
+ BitsInit *BV = cast<BitsInit>(RV.getValue());
+ for (unsigned i = 0, e = BV->getNumBits(); i != e; ++i) {
+ Init *Bit = BV->getBit(i);
+ bool IsReference = false;
+ if (auto VBI = dyn_cast<VarBitInit>(Bit)) {
+ if (auto VI = dyn_cast<VarInit>(VBI->getBitVar())) {
+ if (R.getValue(VI->getName()))
+ IsReference = true;
+ }
+ } else if (isa<VarInit>(Bit)) {
+ IsReference = true;
+ }
+ if (!(IsReference || Bit->isConcrete()))
+ return false;
+ }
+ return true;
+}
+
+static void checkConcrete(Record &R) {
+ for (const RecordVal &RV : R.getValues()) {
+ // HACK: Disable this check for variables declared with 'field'. This is
+ // done merely because existing targets have legitimate cases of
+ // non-concrete variables in helper defs. Ideally, we'd introduce a
+ // 'maybe' or 'optional' modifier instead of this.
+ if (RV.getPrefix())
+ continue;
+
+ if (Init *V = RV.getValue()) {
+ bool Ok = isa<BitsInit>(V) ? checkBitsConcrete(R, RV) : V->isConcrete();
+ if (!Ok) {
+ PrintError(R.getLoc(),
+ Twine("Initializer of '") + RV.getNameInitAsString() +
+ "' in '" + R.getNameInitAsString() +
+ "' could not be fully resolved: " +
+ RV.getValue()->getAsString());
+ }
+ }
+ }
+}
+
+/// Return an Init with a qualifier prefix referring
+/// to CurRec's name.
+static Init *QualifyName(Record &CurRec, MultiClass *CurMultiClass,
+ Init *Name, StringRef Scoper) {
+ Init *NewName =
+ BinOpInit::getStrConcat(CurRec.getNameInit(), StringInit::get(Scoper));
+ NewName = BinOpInit::getStrConcat(NewName, Name);
+ if (CurMultiClass && Scoper != "::") {
+ Init *Prefix = BinOpInit::getStrConcat(CurMultiClass->Rec.getNameInit(),
+ StringInit::get("::"));
+ NewName = BinOpInit::getStrConcat(Prefix, NewName);
+ }
+
+ if (BinOpInit *BinOp = dyn_cast<BinOpInit>(NewName))
+ NewName = BinOp->Fold(&CurRec);
+ return NewName;
+}
+
+/// Return the qualified version of the implicit 'NAME' template argument.
+static Init *QualifiedNameOfImplicitName(Record &Rec,
+ MultiClass *MC = nullptr) {
+ return QualifyName(Rec, MC, StringInit::get("NAME"), MC ? "::" : ":");
+}
+
+static Init *QualifiedNameOfImplicitName(MultiClass *MC) {
+ return QualifiedNameOfImplicitName(MC->Rec, MC);
+}
+
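The implicit NAME argument is what gives multiclass-produced defs their prefix; a sketch with illustrative names:

    multiclass MyMC {
      def _lo;                        // becomes <defm name>_lo
      def _hi;
    }
    defm Foo : MyMC;                  // produces Foo_lo and Foo_hi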
bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) {
if (!CurRec)
CurRec = &CurMultiClass->Rec;
@@ -104,7 +174,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
if (BitList.empty())
if (VarInit *VI = dyn_cast<VarInit>(V))
if (VI->getNameInit() == ValName && !AllowSelfAssignment)
- return true;
+ return Error(Loc, "Recursion / self-assignment forbidden");
// If we are assigning to a subset of the bits in the value... then we must be
// assigning to a field of BitsRecTy, which must have a BitsInit
@@ -117,13 +187,10 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
"' is not a bits type");
// Convert the incoming value to a bits type of the appropriate size...
- Init *BI = V->convertInitializerTo(BitsRecTy::get(BitList.size()));
+ Init *BI = V->getCastTo(BitsRecTy::get(BitList.size()));
if (!BI)
return Error(Loc, "Initializer is not compatible with bit range");
- // We should have a BitsInit type now.
- BitsInit *BInit = cast<BitsInit>(BI);
-
SmallVector<Init *, 16> NewBits(CurVal->getNumBits());
// Loop over bits, assigning values as appropriate.
@@ -132,7 +199,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
if (NewBits[Bit])
return Error(Loc, "Cannot set bit #" + Twine(Bit) + " of value '" +
ValName->getAsUnquotedString() + "' more than once");
- NewBits[Bit] = BInit->getBit(i);
+ NewBits[Bit] = BI->getBit(i);
}
for (unsigned i = 0, e = CurVal->getNumBits(); i != e; ++i)
@@ -147,10 +214,12 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
if (BitsInit *BI = dyn_cast<BitsInit>(V))
InitType = (Twine("' of type bit initializer with length ") +
Twine(BI->getNumBits())).str();
+ else if (TypedInit *TI = dyn_cast<TypedInit>(V))
+ InitType = (Twine("' of type '") + TI->getType()->getAsString()).str();
return Error(Loc, "Value '" + ValName->getAsUnquotedString() +
- "' of type '" + RV->getType()->getAsString() +
- "' is incompatible with initializer '" + V->getAsString() +
- InitType + "'");
+ "' of type '" + RV->getType()->getAsString() +
+ "' is incompatible with initializer '" +
+ V->getAsString() + InitType + "'");
}
return false;
}
@@ -173,27 +242,36 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
// Loop over all of the template arguments, setting them to the specified
// value or leaving them as the default if necessary.
+ MapResolver R(CurRec);
+
for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
if (i < SubClass.TemplateArgs.size()) {
// If a value is specified for this template arg, set it now.
if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i],
None, SubClass.TemplateArgs[i]))
return true;
-
- // Resolve it next.
- CurRec->resolveReferencesTo(CurRec->getValue(TArgs[i]));
-
- // Now remove it.
- CurRec->removeValue(TArgs[i]);
-
} else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) {
return Error(SubClass.RefRange.Start,
"Value not specified for template argument #" +
Twine(i) + " (" + TArgs[i]->getAsUnquotedString() +
") of subclass '" + SC->getNameInitAsString() + "'!");
}
+
+ R.set(TArgs[i], CurRec->getValue(TArgs[i])->getValue());
+
+ CurRec->removeValue(TArgs[i]);
}
+ Init *Name;
+ if (CurRec->isClass())
+ Name =
+ VarInit::get(QualifiedNameOfImplicitName(*CurRec), StringRecTy::get());
+ else
+ Name = CurRec->getNameInit();
+ R.set(QualifiedNameOfImplicitName(*SC), Name);
+
+ CurRec->resolveReferences(R);
+
// Since everything went well, we can now set the "superclass" list for the
// current record.
ArrayRef<std::pair<Record *, SMRange>> SCs = SC->getSuperClasses();
@@ -211,157 +289,189 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
return false;
}
+bool TGParser::AddSubClass(RecordsEntry &Entry, SubClassReference &SubClass) {
+ if (Entry.Rec)
+ return AddSubClass(Entry.Rec.get(), SubClass);
+
+ for (auto &E : Entry.Loop->Entries) {
+ if (AddSubClass(E, SubClass))
+ return true;
+ }
+
+ return false;
+}
+
/// AddSubMultiClass - Add SubMultiClass as a subclass to
/// CurMC, resolving its template args as SubMultiClass's
/// template arguments.
bool TGParser::AddSubMultiClass(MultiClass *CurMC,
SubMultiClassReference &SubMultiClass) {
MultiClass *SMC = SubMultiClass.MC;
- Record *CurRec = &CurMC->Rec;
-
- // Add all of the values in the subclass into the current class.
- for (const auto &SMCVal : SMC->Rec.getValues())
- if (AddValue(CurRec, SubMultiClass.RefRange.Start, SMCVal))
- return true;
-
- unsigned newDefStart = CurMC->DefPrototypes.size();
-
- // Add all of the defs in the subclass into the current multiclass.
- for (const std::unique_ptr<Record> &R : SMC->DefPrototypes) {
- // Clone the def and add it to the current multiclass
- auto NewDef = make_unique<Record>(*R);
-
- // Add all of the values in the superclass into the current def.
- for (const auto &MCVal : CurRec->getValues())
- if (AddValue(NewDef.get(), SubMultiClass.RefRange.Start, MCVal))
- return true;
-
- CurMC->DefPrototypes.push_back(std::move(NewDef));
- }
ArrayRef<Init *> SMCTArgs = SMC->Rec.getTemplateArgs();
-
- // Ensure that an appropriate number of template arguments are
- // specified.
if (SMCTArgs.size() < SubMultiClass.TemplateArgs.size())
return Error(SubMultiClass.RefRange.Start,
"More template args specified than expected");
- // Loop over all of the template arguments, setting them to the specified
- // value or leaving them as the default if necessary.
+ // Prepare the mapping of template argument name to value, filling in default
+ // values if necessary.
+ SubstStack TemplateArgs;
for (unsigned i = 0, e = SMCTArgs.size(); i != e; ++i) {
if (i < SubMultiClass.TemplateArgs.size()) {
- // If a value is specified for this template arg, set it in the
- // superclass now.
- if (SetValue(CurRec, SubMultiClass.RefRange.Start, SMCTArgs[i],
- None, SubMultiClass.TemplateArgs[i]))
- return true;
-
- // Resolve it next.
- CurRec->resolveReferencesTo(CurRec->getValue(SMCTArgs[i]));
+ TemplateArgs.emplace_back(SMCTArgs[i], SubMultiClass.TemplateArgs[i]);
+ } else {
+ Init *Default = SMC->Rec.getValue(SMCTArgs[i])->getValue();
+ if (!Default->isComplete()) {
+ return Error(SubMultiClass.RefRange.Start,
+ "value not specified for template argument #" + Twine(i) +
+ " (" + SMCTArgs[i]->getAsUnquotedString() +
+ ") of multiclass '" + SMC->Rec.getNameInitAsString() +
+ "'");
+ }
+ TemplateArgs.emplace_back(SMCTArgs[i], Default);
+ }
+ }
- // Now remove it.
- CurRec->removeValue(SMCTArgs[i]);
+ TemplateArgs.emplace_back(
+ QualifiedNameOfImplicitName(SMC),
+ VarInit::get(QualifiedNameOfImplicitName(CurMC), StringRecTy::get()));
- // If a value is specified for this template arg, set it in the
- // new defs now.
- for (const auto &Def :
- makeArrayRef(CurMC->DefPrototypes).slice(newDefStart)) {
- if (SetValue(Def.get(), SubMultiClass.RefRange.Start, SMCTArgs[i],
- None, SubMultiClass.TemplateArgs[i]))
- return true;
+ // Add all of the defs in the subclass into the current multiclass.
+ return resolve(SMC->Entries, TemplateArgs, false, &CurMC->Entries);
+}
- // Resolve it next.
- Def->resolveReferencesTo(Def->getValue(SMCTArgs[i]));
+/// Add a record or foreach loop to the current context (global record keeper,
+/// current inner-most foreach loop, or multiclass).
+bool TGParser::addEntry(RecordsEntry E) {
+ assert(!E.Rec || !E.Loop);
- // Now remove it
- Def->removeValue(SMCTArgs[i]);
- }
- } else if (!CurRec->getValue(SMCTArgs[i])->getValue()->isComplete()) {
- return Error(SubMultiClass.RefRange.Start,
- "Value not specified for template argument #" +
- Twine(i) + " (" + SMCTArgs[i]->getAsUnquotedString() +
- ") of subclass '" + SMC->Rec.getNameInitAsString() + "'!");
- }
+ if (!Loops.empty()) {
+ Loops.back()->Entries.push_back(std::move(E));
+ return false;
}
- return false;
-}
+ if (E.Loop) {
+ SubstStack Stack;
+ return resolve(*E.Loop, Stack, CurMultiClass == nullptr,
+ CurMultiClass ? &CurMultiClass->Entries : nullptr);
+ }
-/// ProcessForeachDefs - Given a record, apply all of the variable
-/// values in all surrounding foreach loops, creating new records for
-/// each combination of values.
-bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc) {
- if (Loops.empty())
+ if (CurMultiClass) {
+ CurMultiClass->Entries.push_back(std::move(E));
return false;
+ }
- // We want to instantiate a new copy of CurRec for each combination
- // of nested loop iterator values. We don't want top instantiate
- // any copies until we have values for each loop iterator.
- IterSet IterVals;
- return ProcessForeachDefs(CurRec, Loc, IterVals);
+ return addDefOne(std::move(E.Rec));
}
-/// ProcessForeachDefs - Given a record, a loop and a loop iterator,
-/// apply each of the variable values in this loop and then process
-/// subloops.
-bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
- // Recursively build a tuple of iterator values.
- if (IterVals.size() != Loops.size()) {
- assert(IterVals.size() < Loops.size());
- ForeachLoop &CurLoop = Loops[IterVals.size()];
- ListInit *List = dyn_cast<ListInit>(CurLoop.ListValue);
- if (!List) {
- Error(Loc, "Loop list is not a list");
- return true;
- }
+/// Resolve the entries in \p Loop, going over inner loops recursively
+/// and making the given substitutions of (name, value) pairs.
+///
+/// The resulting records are stored in \p Dest if non-null. Otherwise, they
+/// are added to the global record keeper.
+bool TGParser::resolve(const ForeachLoop &Loop, SubstStack &Substs,
+ bool Final, std::vector<RecordsEntry> *Dest,
+ SMLoc *Loc) {
+ MapResolver R;
+ for (const auto &S : Substs)
+ R.set(S.first, S.second);
+ Init *List = Loop.ListValue->resolveReferences(R);
+ auto LI = dyn_cast<ListInit>(List);
+ if (!LI) {
+ if (!Final) {
+ Dest->emplace_back(make_unique<ForeachLoop>(Loop.Loc, Loop.IterVar,
+ List));
+ return resolve(Loop.Entries, Substs, Final, &Dest->back().Loop->Entries,
+ Loc);
+ }
+
+ PrintError(Loop.Loc, Twine("attempting to loop over '") +
+ List->getAsString() + "', expected a list");
+ return true;
+ }
- // Process each value.
- for (unsigned i = 0; i < List->size(); ++i) {
- Init *ItemVal = List->resolveListElementReference(*CurRec, nullptr, i);
- IterVals.push_back(IterRecord(CurLoop.IterVar, ItemVal));
- if (ProcessForeachDefs(CurRec, Loc, IterVals))
- return true;
- IterVals.pop_back();
- }
- return false;
+ bool Error = false;
+ for (auto Elt : *LI) {
+ Substs.emplace_back(Loop.IterVar->getNameInit(), Elt);
+ Error = resolve(Loop.Entries, Substs, Final, Dest);
+ Substs.pop_back();
+ if (Error)
+ break;
}
+ return Error;
+}
- // This is the bottom of the recursion. We have all of the iterator values
- // for this point in the iteration space. Instantiate a new record to
- // reflect this combination of values.
- auto IterRec = make_unique<Record>(*CurRec);
+/// Resolve the entries in \p Source, going over loops recursively and
+/// making the given substitutions of (name, value) pairs.
+///
+/// The resulting records are stored in \p Dest if non-null. Otherwise, they
+/// are added to the global record keeper.
+bool TGParser::resolve(const std::vector<RecordsEntry> &Source,
+ SubstStack &Substs, bool Final,
+ std::vector<RecordsEntry> *Dest, SMLoc *Loc) {
+ bool Error = false;
+ for (auto &E : Source) {
+ if (E.Loop) {
+ Error = resolve(*E.Loop, Substs, Final, Dest);
+ } else {
+ auto Rec = make_unique<Record>(*E.Rec);
+ if (Loc)
+ Rec->appendLoc(*Loc);
- // Set the iterator values now.
- for (IterRecord &IR : IterVals) {
- VarInit *IterVar = IR.IterVar;
- TypedInit *IVal = dyn_cast<TypedInit>(IR.IterValue);
- if (!IVal)
- return Error(Loc, "foreach iterator value is untyped");
+ MapResolver R(Rec.get());
+ for (const auto &S : Substs)
+ R.set(S.first, S.second);
+ Rec->resolveReferences(R);
- IterRec->addValue(RecordVal(IterVar->getNameInit(), IVal->getType(), false));
+ if (Dest)
+ Dest->push_back(std::move(Rec));
+ else
+ Error = addDefOne(std::move(Rec));
+ }
+ if (Error)
+ break;
+ }
+ return Error;
+}
- if (SetValue(IterRec.get(), Loc, IterVar->getNameInit(), None, IVal))
- return Error(Loc, "when instantiating this def");
+/// Resolve the record fully and add it to the record keeper.
+bool TGParser::addDefOne(std::unique_ptr<Record> Rec) {
+ if (Record *Prev = Records.getDef(Rec->getNameInitAsString())) {
+ if (!Rec->isAnonymous()) {
+ PrintError(Rec->getLoc(),
+ "def already exists: " + Rec->getNameInitAsString());
+ PrintNote(Prev->getLoc(), "location of previous definition");
+ return true;
+ }
+ Rec->setName(Records.getNewAnonymousName());
+ }
- // Resolve it next.
- IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getNameInit()));
+ Rec->resolveReferences();
+ checkConcrete(*Rec);
- // Remove it.
- IterRec->removeValue(IterVar->getNameInit());
+ if (!isa<StringInit>(Rec->getNameInit())) {
+ PrintError(Rec->getLoc(), Twine("record name '") +
+ Rec->getNameInit()->getAsString() +
+ "' could not be fully resolved");
+ return true;
}
- if (Records.getDef(IterRec->getNameInitAsString())) {
- // If this record is anonymous, it's no problem, just generate a new name
- if (!IterRec->isAnonymous())
- return Error(Loc, "def already exists: " +IterRec->getNameInitAsString());
-
- IterRec->setName(GetNewAnonymousName());
+ // If ObjectBody has template arguments, it's an error.
+ assert(Rec->getTemplateArgs().empty() && "How'd this get template args?");
+
+ for (DefsetRecord *Defset : Defsets) {
+ DefInit *I = Rec->getDefInit();
+ if (!I->getType()->typeIsA(Defset->EltTy)) {
+ PrintError(Rec->getLoc(), Twine("adding record of incompatible type '") +
+ I->getType()->getAsString() +
+ "' to defset");
+ PrintNote(Defset->Loc, "location of defset declaration");
+ return true;
+ }
+ Defset->Elements.push_back(I);
}
- Record *IterRecSave = IterRec.get(); // Keep a copy before release.
- Records.addDef(std::move(IterRec));
- IterRecSave->resolveReferences();
+ Records.addDef(std::move(Rec));
return false;
}
@@ -371,19 +481,14 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
/// isObjectStart - Return true if this is a valid first token for an Object.
static bool isObjectStart(tgtok::TokKind K) {
- return K == tgtok::Class || K == tgtok::Def ||
- K == tgtok::Defm || K == tgtok::Let ||
- K == tgtok::MultiClass || K == tgtok::Foreach;
-}
-
-/// GetNewAnonymousName - Generate a unique anonymous name that can be used as
-/// an identifier.
-Init *TGParser::GetNewAnonymousName() {
- return StringInit::get("anonymous_" + utostr(AnonCounter++));
+ return K == tgtok::Class || K == tgtok::Def || K == tgtok::Defm ||
+ K == tgtok::Let || K == tgtok::MultiClass || K == tgtok::Foreach ||
+ K == tgtok::Defset;
}
-/// ParseObjectName - If an object name is specified, return it. Otherwise,
-/// return 0.
+/// ParseObjectName - If a valid object name is specified, return it. If no
+/// name is specified, return the unset initializer. Return nullptr on parse
+/// error.
/// ObjectName ::= Value [ '#' Value ]*
/// ObjectName ::= /*empty*/
///
@@ -395,7 +500,7 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) {
// These are all of the tokens that can begin an object body.
// Some of these can also begin values but we disallow those cases
// because they are unlikely to be useful.
- return nullptr;
+ return UnsetInit::get();
default:
break;
}
@@ -404,17 +509,20 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) {
if (CurMultiClass)
CurRec = &CurMultiClass->Rec;
- RecTy *Type = nullptr;
- if (CurRec) {
- const TypedInit *CurRecName = dyn_cast<TypedInit>(CurRec->getNameInit());
- if (!CurRecName) {
- TokError("Record name is not typed!");
- return nullptr;
- }
- Type = CurRecName->getType();
+ Init *Name = ParseValue(CurRec, StringRecTy::get(), ParseNameMode);
+ if (!Name)
+ return nullptr;
+
+ if (CurMultiClass) {
+ Init *NameStr = QualifiedNameOfImplicitName(CurMultiClass);
+ HasReferenceResolver R(NameStr);
+ Name->resolveReferences(R);
+ if (!R.found())
+ Name = BinOpInit::getStrConcat(VarInit::get(NameStr, StringRecTy::get()),
+ Name);
}
- return ParseValue(CurRec, Type, ParseNameMode);
+ return Name;
}
/// ParseClassID - Parse and resolve a reference to a class name. This returns
@@ -679,6 +787,7 @@ RecTy *TGParser::ParseType() {
case tgtok::Dag: Lex.Lex(); return DagRecTy::get();
case tgtok::Id:
if (Record *R = ParseClassID()) return RecordRecTy::get(R);
+ TokError("unknown class name");
return nullptr;
case tgtok::Bits: {
if (Lex.Lex() != tgtok::less) { // Eat 'bits'
@@ -723,33 +832,29 @@ Init *TGParser::ParseIDValue(Record *CurRec, StringInit *Name, SMLoc NameLoc,
if (CurRec) {
if (const RecordVal *RV = CurRec->getValue(Name))
return VarInit::get(Name, RV->getType());
-
- Init *TemplateArgName = QualifyName(*CurRec, CurMultiClass, Name, ":");
-
- if (CurMultiClass)
- TemplateArgName = QualifyName(CurMultiClass->Rec, CurMultiClass, Name,
- "::");
-
- if (CurRec->isTemplateArg(TemplateArgName)) {
- const RecordVal *RV = CurRec->getValue(TemplateArgName);
- assert(RV && "Template arg doesn't exist??");
- return VarInit::get(TemplateArgName, RV->getType());
- }
}
- if (CurMultiClass) {
- Init *MCName = QualifyName(CurMultiClass->Rec, CurMultiClass, Name, "::");
+ if ((CurRec && CurRec->isClass()) || CurMultiClass) {
+ Init *TemplateArgName;
+ if (CurMultiClass) {
+ TemplateArgName =
+ QualifyName(CurMultiClass->Rec, CurMultiClass, Name, "::");
+ } else
+ TemplateArgName = QualifyName(*CurRec, CurMultiClass, Name, ":");
- if (CurMultiClass->Rec.isTemplateArg(MCName)) {
- const RecordVal *RV = CurMultiClass->Rec.getValue(MCName);
+ Record *TemplateRec = CurMultiClass ? &CurMultiClass->Rec : CurRec;
+ if (TemplateRec->isTemplateArg(TemplateArgName)) {
+ const RecordVal *RV = TemplateRec->getValue(TemplateArgName);
assert(RV && "Template arg doesn't exist??");
- return VarInit::get(MCName, RV->getType());
+ return VarInit::get(TemplateArgName, RV->getType());
+ } else if (Name->getValue() == "NAME") {
+ return VarInit::get(TemplateArgName, StringRecTy::get());
}
}
// If this is in a foreach loop, make sure it's not a loop iterator
for (const auto &L : Loops) {
- VarInit *IterVar = dyn_cast<VarInit>(L.IterVar);
+ VarInit *IterVar = dyn_cast<VarInit>(L->IterVar);
if (IterVar && IterVar->getNameInit() == Name)
return IterVar;
}
@@ -757,15 +862,17 @@ Init *TGParser::ParseIDValue(Record *CurRec, StringInit *Name, SMLoc NameLoc,
if (Mode == ParseNameMode)
return Name;
- if (Record *D = Records.getDef(Name->getValue()))
- return DefInit::get(D);
+ if (Init *I = Records.getGlobal(Name->getValue()))
+ return I;
- if (Mode == ParseValueMode) {
- Error(NameLoc, "Variable not defined: '" + Name->getValue() + "'");
- return nullptr;
- }
+ // Allow self-references of concrete defs, but delay the lookup so that we
+ // get the correct type.
+ if (CurRec && !CurRec->isClass() && !CurMultiClass &&
+ CurRec->getNameInit() == Name)
+ return UnOpInit::get(UnOpInit::CAST, Name, CurRec->getType());
- return Name;
+ Error(NameLoc, "Variable not defined: '" + Name->getValue() + "'");
+ return nullptr;
}
/// ParseOperation - Parse an operator. This returns null on error.
@@ -779,6 +886,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
return nullptr;
case tgtok::XHead:
case tgtok::XTail:
+ case tgtok::XSize:
case tgtok::XEmpty:
case tgtok::XCast: { // Value ::= !unop '(' Value ')'
UnOpInit::UnaryOp Code;
@@ -806,6 +914,11 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
Lex.Lex(); // eat the operation
Code = UnOpInit::TAIL;
break;
+ case tgtok::XSize:
+ Lex.Lex();
+ Code = UnOpInit::SIZE;
+ Type = IntRecTy::get();
+ break;
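+ // Illustrative use of !size: !size([1, 2, 3]) evaluates to the integer 3.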
case tgtok::XEmpty:
Lex.Lex(); // eat the operation
Code = UnOpInit::EMPTY;
@@ -840,12 +953,15 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
}
}
- if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL) {
+ if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL ||
+ Code == UnOpInit::SIZE) {
if (!LHSl && !LHSt) {
TokError("expected list type argument in unary operator");
return nullptr;
}
+ }
+ if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL) {
if (LHSl && LHSl->empty()) {
TokError("empty list argument in unary operator");
return nullptr;
@@ -876,7 +992,34 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
return nullptr;
}
Lex.Lex(); // eat the ')'
- return (UnOpInit::get(Code, LHS, Type))->Fold(CurRec, CurMultiClass);
+ return (UnOpInit::get(Code, LHS, Type))->Fold(CurRec);
+ }
+
+ case tgtok::XIsA: {
+ // Value ::= !isa '<' Type '>' '(' Value ')'
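+ // Illustrative use: !isa<Register>(Op) yields the bit 1 when the type of Op
+ // is (a subclass of) Register, and 0 otherwise; Register and Op are
+ // placeholder names here.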
+ Lex.Lex(); // eat the operation
+
+ RecTy *Type = ParseOperatorType();
+ if (!Type)
+ return nullptr;
+
+ if (Lex.getCode() != tgtok::l_paren) {
+ TokError("expected '(' after type of !isa");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the '('
+
+ Init *LHS = ParseValue(CurRec);
+ if (!LHS)
+ return nullptr;
+
+ if (Lex.getCode() != tgtok::r_paren) {
+ TokError("expected ')' in !isa");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the ')'
+
+ return (IsAOpInit::get(Type, LHS))->Fold();
}
case tgtok::XConcat:
@@ -887,6 +1030,11 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
case tgtok::XSRL:
case tgtok::XSHL:
case tgtok::XEq:
+ case tgtok::XNe:
+ case tgtok::XLe:
+ case tgtok::XLt:
+ case tgtok::XGe:
+ case tgtok::XGt:
case tgtok::XListConcat:
case tgtok::XStrConcat: { // Value ::= !binop '(' Value ',' Value ')'
tgtok::TokKind OpTok = Lex.getCode();
@@ -894,28 +1042,72 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
Lex.Lex(); // eat the operation
BinOpInit::BinaryOp Code;
- RecTy *Type = nullptr;
-
switch (OpTok) {
default: llvm_unreachable("Unhandled code!");
- case tgtok::XConcat: Code = BinOpInit::CONCAT;Type = DagRecTy::get(); break;
- case tgtok::XADD: Code = BinOpInit::ADD; Type = IntRecTy::get(); break;
- case tgtok::XAND: Code = BinOpInit::AND; Type = IntRecTy::get(); break;
- case tgtok::XOR: Code = BinOpInit::OR; Type = IntRecTy::get(); break;
- case tgtok::XSRA: Code = BinOpInit::SRA; Type = IntRecTy::get(); break;
- case tgtok::XSRL: Code = BinOpInit::SRL; Type = IntRecTy::get(); break;
- case tgtok::XSHL: Code = BinOpInit::SHL; Type = IntRecTy::get(); break;
- case tgtok::XEq: Code = BinOpInit::EQ; Type = BitRecTy::get(); break;
+ case tgtok::XConcat: Code = BinOpInit::CONCAT; break;
+ case tgtok::XADD: Code = BinOpInit::ADD; break;
+ case tgtok::XAND: Code = BinOpInit::AND; break;
+ case tgtok::XOR: Code = BinOpInit::OR; break;
+ case tgtok::XSRA: Code = BinOpInit::SRA; break;
+ case tgtok::XSRL: Code = BinOpInit::SRL; break;
+ case tgtok::XSHL: Code = BinOpInit::SHL; break;
+ case tgtok::XEq: Code = BinOpInit::EQ; break;
+ case tgtok::XNe: Code = BinOpInit::NE; break;
+ case tgtok::XLe: Code = BinOpInit::LE; break;
+ case tgtok::XLt: Code = BinOpInit::LT; break;
+ case tgtok::XGe: Code = BinOpInit::GE; break;
+ case tgtok::XGt: Code = BinOpInit::GT; break;
+ case tgtok::XListConcat: Code = BinOpInit::LISTCONCAT; break;
+ case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break;
+ }
+
+ RecTy *Type = nullptr;
+ RecTy *ArgType = nullptr;
+ switch (OpTok) {
+ default:
+ llvm_unreachable("Unhandled code!");
+ case tgtok::XConcat:
+ Type = DagRecTy::get();
+ ArgType = DagRecTy::get();
+ break;
+ case tgtok::XAND:
+ case tgtok::XOR:
+ case tgtok::XSRA:
+ case tgtok::XSRL:
+ case tgtok::XSHL:
+ case tgtok::XADD:
+ Type = IntRecTy::get();
+ ArgType = IntRecTy::get();
+ break;
+ case tgtok::XEq:
+ case tgtok::XNe:
+ Type = BitRecTy::get();
+ // ArgType for Eq / Ne is not known at this point
+ break;
+ case tgtok::XLe:
+ case tgtok::XLt:
+ case tgtok::XGe:
+ case tgtok::XGt:
+ Type = BitRecTy::get();
+ ArgType = IntRecTy::get();
+ break;
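+ // Illustrative uses of the comparison operators: !lt(1, 2) and !ge(3, 3)
+ // both evaluate to the bit 1.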
case tgtok::XListConcat:
- Code = BinOpInit::LISTCONCAT;
// We don't know the list type until we parse the first argument
+ ArgType = ItemType;
break;
case tgtok::XStrConcat:
- Code = BinOpInit::STRCONCAT;
Type = StringRecTy::get();
+ ArgType = StringRecTy::get();
break;
}
+ if (Type && ItemType && !Type->typeIsConvertibleTo(ItemType)) {
+ Error(OpLoc, Twine("expected value of type '") +
+ ItemType->getAsString() + "', got '" +
+ Type->getAsString() + "'");
+ return nullptr;
+ }
+
if (Lex.getCode() != tgtok::l_paren) {
TokError("expected '(' after binary operator");
return nullptr;
@@ -924,14 +1116,52 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
SmallVector<Init*, 2> InitList;
- InitList.push_back(ParseValue(CurRec));
- if (!InitList.back()) return nullptr;
+ for (;;) {
+ SMLoc InitLoc = Lex.getLoc();
+ InitList.push_back(ParseValue(CurRec, ArgType));
+ if (!InitList.back()) return nullptr;
- while (Lex.getCode() == tgtok::comma) {
- Lex.Lex(); // eat the ','
+ // All BinOps require their arguments to be of compatible types.
+ TypedInit *TI = dyn_cast<TypedInit>(InitList.back());
+ if (!ArgType) {
+ ArgType = TI->getType();
- InitList.push_back(ParseValue(CurRec));
- if (!InitList.back()) return nullptr;
+ switch (Code) {
+ case BinOpInit::LISTCONCAT:
+ if (!isa<ListRecTy>(ArgType)) {
+ Error(InitLoc, Twine("expected a list, got value of type '") +
+ ArgType->getAsString() + "'");
+ return nullptr;
+ }
+ break;
+ case BinOpInit::EQ:
+ case BinOpInit::NE:
+ if (!ArgType->typeIsConvertibleTo(IntRecTy::get()) &&
+ !ArgType->typeIsConvertibleTo(StringRecTy::get())) {
+ Error(InitLoc, Twine("expected int, bits, or string; got value of "
+ "type '") + ArgType->getAsString() + "'");
+ return nullptr;
+ }
+ break;
+ default: llvm_unreachable("other ops have fixed argument types");
+ }
+ } else {
+ RecTy *Resolved = resolveTypes(ArgType, TI->getType());
+ if (!Resolved) {
+ Error(InitLoc, Twine("expected value of type '") +
+ ArgType->getAsString() + "', got '" +
+ TI->getType()->getAsString() + "'");
+ return nullptr;
+ }
+ if (Code != BinOpInit::ADD && Code != BinOpInit::AND &&
+ Code != BinOpInit::OR && Code != BinOpInit::SRA &&
+ Code != BinOpInit::SRL && Code != BinOpInit::SHL)
+ ArgType = Resolved;
+ }
+
+ if (Lex.getCode() != tgtok::comma)
+ break;
+ Lex.Lex(); // eat the ','
}
if (Lex.getCode() != tgtok::r_paren) {
@@ -940,40 +1170,142 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
}
Lex.Lex(); // eat the ')'
- // If we are doing !listconcat, we should know the type by now
- if (OpTok == tgtok::XListConcat) {
- if (VarInit *Arg0 = dyn_cast<VarInit>(InitList[0]))
- Type = Arg0->getType();
- else if (ListInit *Arg0 = dyn_cast<ListInit>(InitList[0]))
- Type = Arg0->getType();
- else {
- InitList[0]->print(errs());
- Error(OpLoc, "expected a list");
- return nullptr;
- }
- }
+ if (Code == BinOpInit::LISTCONCAT)
+ Type = ArgType;
// We allow multiple operands to associative operators like !strconcat as
// shorthand for nesting them.
- if (Code == BinOpInit::STRCONCAT || Code == BinOpInit::LISTCONCAT) {
+ if (Code == BinOpInit::STRCONCAT || Code == BinOpInit::LISTCONCAT ||
+ Code == BinOpInit::CONCAT || Code == BinOpInit::ADD ||
+ Code == BinOpInit::AND || Code == BinOpInit::OR) {
while (InitList.size() > 2) {
Init *RHS = InitList.pop_back_val();
- RHS = (BinOpInit::get(Code, InitList.back(), RHS, Type))
- ->Fold(CurRec, CurMultiClass);
+ RHS = (BinOpInit::get(Code, InitList.back(), RHS, Type))->Fold(CurRec);
InitList.back() = RHS;
}
}
if (InitList.size() == 2)
return (BinOpInit::get(Code, InitList[0], InitList[1], Type))
- ->Fold(CurRec, CurMultiClass);
+ ->Fold(CurRec);
Error(OpLoc, "expected two operands to operator");
return nullptr;
}
+ case tgtok::XForEach: { // Value ::= !foreach '(' Id ',' Value ',' Value ')'
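+ // Illustrative use: !foreach(x, [1, 2, 3], !add(x, 10)) evaluates to
+ // [11, 12, 13]; with a dag as the second operand, the expression is applied
+ // to each operand of the dag instead.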
+ SMLoc OpLoc = Lex.getLoc();
+ Lex.Lex(); // eat the operation
+ if (Lex.getCode() != tgtok::l_paren) {
+ TokError("expected '(' after !foreach");
+ return nullptr;
+ }
+
+ if (Lex.Lex() != tgtok::Id) { // eat the '('
+ TokError("first argument of !foreach must be an identifier");
+ return nullptr;
+ }
+
+ Init *LHS = StringInit::get(Lex.getCurStrVal());
+
+ if (CurRec && CurRec->getValue(LHS)) {
+ TokError((Twine("iteration variable '") + LHS->getAsString() +
+ "' already defined")
+ .str());
+ return nullptr;
+ }
+
+ if (Lex.Lex() != tgtok::comma) { // eat the id
+ TokError("expected ',' in ternary operator");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the ','
+
+ Init *MHS = ParseValue(CurRec);
+ if (!MHS)
+ return nullptr;
+
+ if (Lex.getCode() != tgtok::comma) {
+ TokError("expected ',' in ternary operator");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the ','
+
+ TypedInit *MHSt = dyn_cast<TypedInit>(MHS);
+ if (!MHSt) {
+ TokError("could not get type of !foreach input");
+ return nullptr;
+ }
+
+ RecTy *InEltType = nullptr;
+ RecTy *OutEltType = nullptr;
+ bool IsDAG = false;
+
+ if (ListRecTy *InListTy = dyn_cast<ListRecTy>(MHSt->getType())) {
+ InEltType = InListTy->getElementType();
+ if (ItemType) {
+ if (ListRecTy *OutListTy = dyn_cast<ListRecTy>(ItemType)) {
+ OutEltType = OutListTy->getElementType();
+ } else {
+ Error(OpLoc,
+ "expected value of type '" + Twine(ItemType->getAsString()) +
+ "', but got !foreach of list type");
+ return nullptr;
+ }
+ }
+ } else if (DagRecTy *InDagTy = dyn_cast<DagRecTy>(MHSt->getType())) {
+ InEltType = InDagTy;
+ if (ItemType && !isa<DagRecTy>(ItemType)) {
+ Error(OpLoc,
+ "expected value of type '" + Twine(ItemType->getAsString()) +
+ "', but got !foreach of dag type");
+ return nullptr;
+ }
+ IsDAG = true;
+ } else {
+ TokError("!foreach must have list or dag input");
+ return nullptr;
+ }
+
+ // We need to create a temporary record to provide a scope for the iteration
+ // variable while parsing top-level !foreach expressions.
+ std::unique_ptr<Record> ParseRecTmp;
+ Record *ParseRec = CurRec;
+ if (!ParseRec) {
+ ParseRecTmp = make_unique<Record>(".parse", ArrayRef<SMLoc>{}, Records);
+ ParseRec = ParseRecTmp.get();
+ }
+
+ ParseRec->addValue(RecordVal(LHS, InEltType, false));
+ Init *RHS = ParseValue(ParseRec, OutEltType);
+ ParseRec->removeValue(LHS);
+ if (!RHS)
+ return nullptr;
+
+ if (Lex.getCode() != tgtok::r_paren) {
+ TokError("expected ')' in binary operator");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the ')'
+
+ RecTy *OutType;
+ if (IsDAG) {
+ OutType = InEltType;
+ } else {
+ TypedInit *RHSt = dyn_cast<TypedInit>(RHS);
+ if (!RHSt) {
+ TokError("could not get type of !foreach result");
+ return nullptr;
+ }
+ OutType = RHSt->getType()->getListTy();
+ }
+
+ return (TernOpInit::get(TernOpInit::FOREACH, LHS, MHS, RHS, OutType))
+ ->Fold(CurRec);
+ }
+
+ case tgtok::XDag:
case tgtok::XIf:
- case tgtok::XForEach:
case tgtok::XSubst: { // Value ::= !ternop '(' Value ',' Value ',' Value ')'
TernOpInit::TernaryOp Code;
RecTy *Type = nullptr;
@@ -982,12 +1314,14 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
Lex.Lex(); // eat the operation
switch (LexCode) {
default: llvm_unreachable("Unhandled code!");
+ case tgtok::XDag:
+ Code = TernOpInit::DAG;
+ Type = DagRecTy::get();
+ ItemType = nullptr;
+ break;
case tgtok::XIf:
Code = TernOpInit::IF;
break;
- case tgtok::XForEach:
- Code = TernOpInit::FOREACH;
- break;
case tgtok::XSubst:
Code = TernOpInit::SUBST;
break;
@@ -1007,6 +1341,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
}
Lex.Lex(); // eat the ','
+ SMLoc MHSLoc = Lex.getLoc();
Init *MHS = ParseValue(CurRec, ItemType);
if (!MHS)
return nullptr;
@@ -1017,6 +1352,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
}
Lex.Lex(); // eat the ','
+ SMLoc RHSLoc = Lex.getLoc();
Init *RHS = ParseValue(CurRec, ItemType);
if (!RHS)
return nullptr;
@@ -1029,6 +1365,36 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
switch (LexCode) {
default: llvm_unreachable("Unhandled code!");
+ case tgtok::XDag: {
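+ // Illustrative use: !dag(op, [a, b], ["x", "y"]) builds the dag (op a:$x, b:$y),
+ // where op, a and b are placeholder records; either the child list or the
+ // name list may be the unset value '?', but not both.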
+ TypedInit *MHSt = dyn_cast<TypedInit>(MHS);
+ if (!MHSt && !isa<UnsetInit>(MHS)) {
+ Error(MHSLoc, "could not determine type of the child list in !dag");
+ return nullptr;
+ }
+ if (MHSt && !isa<ListRecTy>(MHSt->getType())) {
+ Error(MHSLoc, Twine("expected list of children, got type '") +
+ MHSt->getType()->getAsString() + "'");
+ return nullptr;
+ }
+
+ TypedInit *RHSt = dyn_cast<TypedInit>(RHS);
+ if (!RHSt && !isa<UnsetInit>(RHS)) {
+ Error(RHSLoc, "could not determine type of the name list in !dag");
+ return nullptr;
+ }
+ if (RHSt && StringRecTy::get()->getListTy() != RHSt->getType()) {
+ Error(RHSLoc, Twine("expected list<string>, got type '") +
+ RHSt->getType()->getAsString() + "'");
+ return nullptr;
+ }
+
+ if (!MHSt && !RHSt) {
+ Error(MHSLoc,
+ "cannot have both unset children and unset names in !dag");
+ return nullptr;
+ }
+ break;
+ }
case tgtok::XIf: {
RecTy *MHSTy = nullptr;
RecTy *RHSTy = nullptr;
@@ -1058,23 +1424,12 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
return nullptr;
}
- if (MHSTy->typeIsConvertibleTo(RHSTy)) {
- Type = RHSTy;
- } else if (RHSTy->typeIsConvertibleTo(MHSTy)) {
- Type = MHSTy;
- } else {
- TokError("inconsistent types for !if");
- return nullptr;
- }
- break;
- }
- case tgtok::XForEach: {
- TypedInit *MHSt = dyn_cast<TypedInit>(MHS);
- if (!MHSt) {
- TokError("could not get type for !foreach");
+ Type = resolveTypes(MHSTy, RHSTy);
+ if (!Type) {
+ TokError(Twine("inconsistent types '") + MHSTy->getAsString() +
+ "' and '" + RHSTy->getAsString() + "' for !if");
return nullptr;
}
- Type = MHSt->getType();
break;
}
case tgtok::XSubst: {
@@ -1087,8 +1442,133 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
break;
}
}
- return (TernOpInit::get(Code, LHS, MHS, RHS, Type))->Fold(CurRec,
- CurMultiClass);
+ return (TernOpInit::get(Code, LHS, MHS, RHS, Type))->Fold(CurRec);
+ }
+
+ case tgtok::XFoldl: {
+ // Value ::= !foldl '(' Value ',' Value ',' Id ',' Id ',' Value ')'
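+ // Illustrative use: !foldl(0, [1, 2, 3], acc, x, !add(acc, x)) evaluates to 6,
+ // folding !add over the list with start value 0.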
+ Lex.Lex(); // eat the operation
+ if (Lex.getCode() != tgtok::l_paren) {
+ TokError("expected '(' after !foldl");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the '('
+
+ Init *StartUntyped = ParseValue(CurRec);
+ if (!StartUntyped)
+ return nullptr;
+
+ TypedInit *Start = dyn_cast<TypedInit>(StartUntyped);
+ if (!Start) {
+ TokError(Twine("could not get type of !foldl start: '") +
+ StartUntyped->getAsString() + "'");
+ return nullptr;
+ }
+
+ if (Lex.getCode() != tgtok::comma) {
+ TokError("expected ',' in !foldl");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the ','
+
+ Init *ListUntyped = ParseValue(CurRec);
+ if (!ListUntyped)
+ return nullptr;
+
+ TypedInit *List = dyn_cast<TypedInit>(ListUntyped);
+ if (!List) {
+ TokError(Twine("could not get type of !foldl list: '") +
+ ListUntyped->getAsString() + "'");
+ return nullptr;
+ }
+
+ ListRecTy *ListType = dyn_cast<ListRecTy>(List->getType());
+ if (!ListType) {
+ TokError(Twine("!foldl list must be a list, but is of type '") +
+ List->getType()->getAsString() + "'");
+ return nullptr;
+ }
+
+ if (Lex.getCode() != tgtok::comma) {
+ TokError("expected ',' in !foldl");
+ return nullptr;
+ }
+
+ if (Lex.Lex() != tgtok::Id) { // eat the ','
+ TokError("third argument of !foldl must be an identifier");
+ return nullptr;
+ }
+
+ Init *A = StringInit::get(Lex.getCurStrVal());
+ if (CurRec && CurRec->getValue(A)) {
+ TokError((Twine("left !foldl variable '") + A->getAsString() +
+ "' already defined")
+ .str());
+ return nullptr;
+ }
+
+ if (Lex.Lex() != tgtok::comma) { // eat the id
+ TokError("expected ',' in !foldl");
+ return nullptr;
+ }
+
+ if (Lex.Lex() != tgtok::Id) { // eat the ','
+ TokError("fourth argument of !foldl must be an identifier");
+ return nullptr;
+ }
+
+ Init *B = StringInit::get(Lex.getCurStrVal());
+ if (CurRec && CurRec->getValue(B)) {
+ TokError((Twine("right !foldl variable '") + B->getAsString() +
+ "' already defined")
+ .str());
+ return nullptr;
+ }
+
+ if (Lex.Lex() != tgtok::comma) { // eat the id
+ TokError("expected ',' in !foldl");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the ','
+
+ // We need to create a temporary record to provide a scope for the
+ // accumulator and iteration variables while parsing a top-level !foldl.
+ std::unique_ptr<Record> ParseRecTmp;
+ Record *ParseRec = CurRec;
+ if (!ParseRec) {
+ ParseRecTmp = make_unique<Record>(".parse", ArrayRef<SMLoc>{}, Records);
+ ParseRec = ParseRecTmp.get();
+ }
+
+ ParseRec->addValue(RecordVal(A, Start->getType(), false));
+ ParseRec->addValue(RecordVal(B, ListType->getElementType(), false));
+ Init *ExprUntyped = ParseValue(ParseRec);
+ ParseRec->removeValue(A);
+ ParseRec->removeValue(B);
+ if (!ExprUntyped)
+ return nullptr;
+
+ TypedInit *Expr = dyn_cast<TypedInit>(ExprUntyped);
+ if (!Expr) {
+ TokError("could not get type of !foldl expression");
+ return nullptr;
+ }
+
+ if (Expr->getType() != Start->getType()) {
+ TokError(Twine("!foldl expression must be of same type as start (") +
+ Start->getType()->getAsString() + "), but is of type " +
+ Expr->getType()->getAsString());
+ return nullptr;
+ }
+
+ if (Lex.getCode() != tgtok::r_paren) {
+ TokError("expected ')' in fold operator");
+ return nullptr;
+ }
+ Lex.Lex(); // eat the ')'
+
+ return FoldOpInit::get(Start, List, A, B, Expr, Start->getType())
+ ->Fold(CurRec);
}
}
}
@@ -1204,60 +1684,49 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
return nullptr;
}
- SubClassReference SCRef;
- ParseValueList(SCRef.TemplateArgs, CurRec, Class);
- if (SCRef.TemplateArgs.empty()) return nullptr;
+ SmallVector<Init *, 8> Args;
+ ParseValueList(Args, CurRec, Class);
+ if (Args.empty()) return nullptr;
if (Lex.getCode() != tgtok::greater) {
TokError("expected '>' at end of value list");
return nullptr;
}
Lex.Lex(); // eat the '>'
- SMLoc EndLoc = Lex.getLoc();
-
- // Create the new record, set it as CurRec temporarily.
- auto NewRecOwner = llvm::make_unique<Record>(GetNewAnonymousName(), NameLoc,
- Records, /*IsAnonymous=*/true);
- Record *NewRec = NewRecOwner.get(); // Keep a copy since we may release.
- SCRef.RefRange = SMRange(NameLoc, EndLoc);
- SCRef.Rec = Class;
- // Add info about the subclass to NewRec.
- if (AddSubClass(NewRec, SCRef))
- return nullptr;
- if (!CurMultiClass) {
- NewRec->resolveReferences();
- Records.addDef(std::move(NewRecOwner));
- } else {
- // This needs to get resolved once the multiclass template arguments are
- // known before any use.
- NewRec->setResolveFirst(true);
- // Otherwise, we're inside a multiclass, add it to the multiclass.
- CurMultiClass->DefPrototypes.push_back(std::move(NewRecOwner));
-
- // Copy the template arguments for the multiclass into the def.
- for (Init *TArg : CurMultiClass->Rec.getTemplateArgs()) {
- const RecordVal *RV = CurMultiClass->Rec.getValue(TArg);
- assert(RV && "Template arg doesn't exist?");
- NewRec->addValue(*RV);
- }
+ // Typecheck the template arguments list
+ ArrayRef<Init *> ExpectedArgs = Class->getTemplateArgs();
+ if (ExpectedArgs.size() < Args.size()) {
+ Error(NameLoc,
+ "More template args specified than expected");
+ return nullptr;
+ }
- // We can't return the prototype def here, instead return:
- // !cast<ItemType>(!strconcat(NAME, AnonName)).
- const RecordVal *MCNameRV = CurMultiClass->Rec.getValue("NAME");
- assert(MCNameRV && "multiclass record must have a NAME");
+ for (unsigned i = 0, e = ExpectedArgs.size(); i != e; ++i) {
+ RecordVal *ExpectedArg = Class->getValue(ExpectedArgs[i]);
+ if (i < Args.size()) {
+ if (TypedInit *TI = dyn_cast<TypedInit>(Args[i])) {
+ RecTy *ExpectedType = ExpectedArg->getType();
+ if (!TI->getType()->typeIsConvertibleTo(ExpectedType)) {
+ Error(NameLoc,
+ "Value specified for template argument #" + Twine(i) + " (" +
+ ExpectedArg->getNameInitAsString() + ") is of type '" +
+ TI->getType()->getAsString() + "', expected '" +
+ ExpectedType->getAsString() + "': " + TI->getAsString());
+ return nullptr;
+ }
+ continue;
+ }
+ } else if (ExpectedArg->getValue()->isComplete())
+ continue;
- return UnOpInit::get(UnOpInit::CAST,
- BinOpInit::get(BinOpInit::STRCONCAT,
- VarInit::get(MCNameRV->getName(),
- MCNameRV->getType()),
- NewRec->getNameInit(),
- StringRecTy::get()),
- Class->getDefInit()->getType());
+ Error(NameLoc,
+ "Value not specified for template argument #" + Twine(i) + " (" +
+ ExpectedArgs[i]->getAsUnquotedString() + ")");
+ return nullptr;
}
- // The result of the expression is a reference to the new record.
- return DefInit::get(NewRec);
+ return VarDefInit::get(Class, Args)->Fold();
}
case tgtok::l_brace: { // Value ::= '{' ValueList '}'
SMLoc BraceLoc = Lex.getLoc();
@@ -1299,7 +1768,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
// Fallthrough to try convert this to a bit.
}
// All other values must be convertible to just a single bit.
- Init *Bit = Vals[i]->convertInitializerTo(BitRecTy::get());
+ Init *Bit = Vals[i]->getCastTo(BitRecTy::get());
if (!Bit) {
Error(BraceLoc, "Element #" + Twine(i) + " (" + Vals[i]->getAsString() +
") is not convertable to a bit");
@@ -1360,18 +1829,16 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
RecTy *EltTy = nullptr;
for (Init *V : Vals) {
TypedInit *TArg = dyn_cast<TypedInit>(V);
- if (!TArg) {
- TokError("Untyped list element");
- return nullptr;
- }
- if (EltTy) {
- EltTy = resolveTypes(EltTy, TArg->getType());
- if (!EltTy) {
- TokError("Incompatible types in list elements");
- return nullptr;
+ if (TArg) {
+ if (EltTy) {
+ EltTy = resolveTypes(EltTy, TArg->getType());
+ if (!EltTy) {
+ TokError("Incompatible types in list elements");
+ return nullptr;
+ }
+ } else {
+ EltTy = TArg->getType();
}
- } else {
- EltTy = TArg->getType();
}
}
@@ -1396,7 +1863,9 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
// Make sure the deduced type is compatible with the given type
if (GivenListTy) {
if (!EltTy->typeIsConvertibleTo(GivenListTy->getElementType())) {
- TokError("Element type mismatch for list");
+ TokError(Twine("Element type mismatch for list: element type '") +
+ EltTy->getAsString() + "' not convertible to '" +
+ GivenListTy->getElementType()->getAsString() + "'");
return nullptr;
}
}
@@ -1443,9 +1912,12 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
case tgtok::XHead:
case tgtok::XTail:
+ case tgtok::XSize:
case tgtok::XEmpty:
case tgtok::XCast: // Value ::= !unop '(' Value ')'
+ case tgtok::XIsA:
case tgtok::XConcat:
+ case tgtok::XDag:
case tgtok::XADD:
case tgtok::XAND:
case tgtok::XOR:
@@ -1453,9 +1925,15 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
case tgtok::XSRL:
case tgtok::XSHL:
case tgtok::XEq:
+ case tgtok::XNe:
+ case tgtok::XLe:
+ case tgtok::XLt:
+ case tgtok::XGe:
+ case tgtok::XGt:
case tgtok::XListConcat:
case tgtok::XStrConcat: // Value ::= !binop '(' Value ',' Value ')'
case tgtok::XIf:
+ case tgtok::XFoldl:
case tgtok::XForEach:
case tgtok::XSubst: { // Value ::= !ternop '(' Value ',' Value ',' Value ')'
return ParseOperation(CurRec, ItemType);
@@ -1481,7 +1959,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
switch (Lex.getCode()) {
default: return Result;
case tgtok::l_brace: {
- if (Mode == ParseNameMode || Mode == ParseForeachMode)
+ if (Mode == ParseNameMode)
// This is the beginning of the object body.
return Result;
@@ -1539,7 +2017,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
Result->getAsString() + "'");
return nullptr;
}
- Result = FieldInit::get(Result, FieldName);
+ Result = FieldInit::get(Result, FieldName)->Fold(CurRec);
Lex.Lex(); // eat field name
break;
}
@@ -1557,13 +2035,20 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
}
if (LHS->getType() != StringRecTy::get()) {
- LHS = UnOpInit::get(UnOpInit::CAST, LHS, StringRecTy::get());
+ LHS = dyn_cast<TypedInit>(
+ UnOpInit::get(UnOpInit::CAST, LHS, StringRecTy::get())
+ ->Fold(CurRec));
+ if (!LHS) {
+ Error(PasteLoc, Twine("can't cast '") + LHS->getAsString() +
+ "' to string");
+ return nullptr;
+ }
}
TypedInit *RHS = nullptr;
Lex.Lex(); // Eat the '#'.
- switch (Lex.getCode()) {
+ switch (Lex.getCode()) {
case tgtok::colon:
case tgtok::semi:
case tgtok::l_brace:
@@ -1576,7 +2061,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
break;
default:
- Init *RHSResult = ParseValue(CurRec, ItemType, ParseNameMode);
+ Init *RHSResult = ParseValue(CurRec, nullptr, ParseNameMode);
RHS = dyn_cast<TypedInit>(RHSResult);
if (!RHS) {
Error(PasteLoc, "RHS of paste is not typed!");
@@ -1584,14 +2069,20 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
}
if (RHS->getType() != StringRecTy::get()) {
- RHS = UnOpInit::get(UnOpInit::CAST, RHS, StringRecTy::get());
+ RHS = dyn_cast<TypedInit>(
+ UnOpInit::get(UnOpInit::CAST, RHS, StringRecTy::get())
+ ->Fold(CurRec));
+ if (!RHS) {
+ Error(PasteLoc, Twine("can't cast '") + RHS->getAsString() +
+ "' to string");
+ return nullptr;
+ }
}
break;
}
- Result = BinOpInit::get(BinOpInit::STRCONCAT, LHS, RHS,
- StringRecTy::get())->Fold(CurRec, CurMultiClass);
+ Result = BinOpInit::getStrConcat(LHS, RHS);
break;
}
}
@@ -1720,8 +2211,14 @@ Init *TGParser::ParseDeclaration(Record *CurRec,
return nullptr;
}
+ std::string Str = Lex.getCurStrVal();
+ if (Str == "NAME") {
+ TokError("'" + Str + "' is a reserved variable name");
+ return nullptr;
+ }
+
SMLoc IdLoc = Lex.getLoc();
- Init *DeclName = StringInit::get(Lex.getCurStrVal());
+ Init *DeclName = StringInit::get(Str);
Lex.Lex();
if (ParsingTemplateArgs) {
@@ -1758,11 +2255,11 @@ Init *TGParser::ParseDeclaration(Record *CurRec,
/// the name of the declared object or a NULL Init on error. Return
/// the name of the parsed initializer list through ForeachListName.
///
-/// ForeachDeclaration ::= ID '=' '[' ValueList ']'
/// ForeachDeclaration ::= ID '=' '{' RangeList '}'
/// ForeachDeclaration ::= ID '=' RangePiece
+/// ForeachDeclaration ::= ID '=' Value
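+/// For illustration, the Value form allows "foreach v = SomeList in ..." where
+/// SomeList is any value of list type, for instance a previously declared
+/// defset (SomeList is a hypothetical name).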
///
-VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) {
+VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) {
if (Lex.getCode() != tgtok::Id) {
TokError("Expected identifier in foreach declaration");
return nullptr;
@@ -1782,24 +2279,6 @@ VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) {
SmallVector<unsigned, 16> Ranges;
switch (Lex.getCode()) {
- default: TokError("Unknown token when expecting a range list"); return nullptr;
- case tgtok::l_square: { // '[' ValueList ']'
- Init *List = ParseSimpleValue(nullptr, nullptr, ParseForeachMode);
- ForeachListValue = dyn_cast<ListInit>(List);
- if (!ForeachListValue) {
- TokError("Expected a Value list");
- return nullptr;
- }
- RecTy *ValueType = ForeachListValue->getType();
- ListRecTy *ListType = dyn_cast<ListRecTy>(ValueType);
- if (!ListType) {
- TokError("Value list is not of list type");
- return nullptr;
- }
- IterType = ListType->getElementType();
- break;
- }
-
case tgtok::IntVal: { // RangePiece.
if (ParseRangePiece(Ranges))
return nullptr;
@@ -1816,6 +2295,25 @@ VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) {
Lex.Lex();
break;
}
+
+ default: {
+ SMLoc ValueLoc = Lex.getLoc();
+ Init *I = ParseValue(nullptr);
+ TypedInit *TI = dyn_cast<TypedInit>(I);
+ if (!TI || !isa<ListRecTy>(TI->getType())) {
+ std::string Type;
+ if (TI)
+ Type = (Twine("' of type '") + TI->getType()->getAsString()).str();
+ Error(ValueLoc, "expected a list, got '" + I->getAsString() + Type + "'");
+ if (CurMultiClass)
+ PrintNote({}, "references to multiclass template arguments cannot be "
+ "resolved at this time");
+ return nullptr;
+ }
+ ForeachListValue = I;
+ IterType = cast<ListRecTy>(TI->getType())->getElementType();
+ break;
+ }
}
if (!Ranges.empty()) {
@@ -1857,9 +2355,15 @@ bool TGParser::ParseTemplateArgList(Record *CurRec) {
Lex.Lex(); // eat the ','
// Read the following declarations.
+ SMLoc Loc = Lex.getLoc();
TemplArg = ParseDeclaration(CurRec, true/*templateargs*/);
if (!TemplArg)
return true;
+
+ if (TheRecToAddTo->isTemplateArg(TemplArg))
+ return Error(Loc, "template argument with the same name has already been "
+ "defined");
+
TheRecToAddTo->addTemplateArg(TemplArg);
}
@@ -1945,7 +2449,7 @@ bool TGParser::ParseBody(Record *CurRec) {
return false;
}
-/// \brief Apply the current let bindings to \a CurRec.
+/// Apply the current let bindings to \a CurRec.
/// \returns true on error, false otherwise.
bool TGParser::ApplyLetStack(Record *CurRec) {
for (SmallVectorImpl<LetRecord> &LetInfo : LetStack)
@@ -1955,6 +2459,18 @@ bool TGParser::ApplyLetStack(Record *CurRec) {
return false;
}
+bool TGParser::ApplyLetStack(RecordsEntry &Entry) {
+ if (Entry.Rec)
+ return ApplyLetStack(Entry.Rec.get());
+
+ for (auto &E : Entry.Loop->Entries) {
+ if (ApplyLetStack(E))
+ return true;
+ }
+
+ return false;
+}
+
/// ParseObjectBody - Parse the body of a def or class. This consists of an
/// optional ClassList followed by a Body. CurRec is the current def or class
/// that is being parsed.
@@ -2002,67 +2518,67 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
Lex.Lex(); // Eat the 'def' token.
// Parse ObjectName and make a record for it.
- std::unique_ptr<Record> CurRecOwner;
+ std::unique_ptr<Record> CurRec;
Init *Name = ParseObjectName(CurMultiClass);
- if (Name)
- CurRecOwner = make_unique<Record>(Name, DefLoc, Records);
+ if (!Name)
+ return true;
+
+ if (isa<UnsetInit>(Name))
+ CurRec = make_unique<Record>(Records.getNewAnonymousName(), DefLoc, Records,
+ /*Anonymous=*/true);
else
- CurRecOwner = llvm::make_unique<Record>(GetNewAnonymousName(), DefLoc,
- Records, /*IsAnonymous=*/true);
- Record *CurRec = CurRecOwner.get(); // Keep a copy since we may release.
+ CurRec = make_unique<Record>(Name, DefLoc, Records);
- if (!CurMultiClass && Loops.empty()) {
- // Top-level def definition.
+ if (ParseObjectBody(CurRec.get()))
+ return true;
- // Ensure redefinition doesn't happen.
- if (Records.getDef(CurRec->getNameInitAsString()))
- return Error(DefLoc, "def '" + CurRec->getNameInitAsString()+
- "' already defined");
- Records.addDef(std::move(CurRecOwner));
+ return addEntry(std::move(CurRec));
+}
- if (ParseObjectBody(CurRec))
- return true;
- } else if (CurMultiClass) {
- // Parse the body before adding this prototype to the DefPrototypes vector.
- // That way implicit definitions will be added to the DefPrototypes vector
- // before this object, instantiated prior to defs derived from this object,
- // and this available for indirect name resolution when defs derived from
- // this object are instantiated.
- if (ParseObjectBody(CurRec))
- return true;
+/// ParseDefset - Parse a defset statement.
+///
+/// Defset ::= DEFSET Type Id '=' '{' ObjectList '}'
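+///
+/// Illustrative use in a .td file, assuming a class Inst defined elsewhere:
+///   defset list<Inst> MyInsts = {
+///     def add_i32 : Inst;
+///     def sub_i32 : Inst;
+///   }
+/// Afterwards MyInsts is a global list<Inst> containing both records.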
+///
+bool TGParser::ParseDefset() {
+ assert(Lex.getCode() == tgtok::Defset);
+ Lex.Lex(); // Eat the 'defset' token
- // Otherwise, a def inside a multiclass, add it to the multiclass.
- for (const auto &Proto : CurMultiClass->DefPrototypes)
- if (Proto->getNameInit() == CurRec->getNameInit())
- return Error(DefLoc, "def '" + CurRec->getNameInitAsString() +
- "' already defined in this multiclass!");
- CurMultiClass->DefPrototypes.push_back(std::move(CurRecOwner));
- } else if (ParseObjectBody(CurRec)) {
+ DefsetRecord Defset;
+ Defset.Loc = Lex.getLoc();
+ RecTy *Type = ParseType();
+ if (!Type)
return true;
- }
-
- if (!CurMultiClass) // Def's in multiclasses aren't really defs.
- // See Record::setName(). This resolve step will see any new name
- // for the def that might have been created when resolving
- // inheritance, values and arguments above.
- CurRec->resolveReferences();
+ if (!isa<ListRecTy>(Type))
+ return Error(Defset.Loc, "expected list type");
+ Defset.EltTy = cast<ListRecTy>(Type)->getElementType();
- // If ObjectBody has template arguments, it's an error.
- assert(CurRec->getTemplateArgs().empty() && "How'd this get template args?");
+ if (Lex.getCode() != tgtok::Id)
+ return TokError("expected identifier");
+ StringInit *DeclName = StringInit::get(Lex.getCurStrVal());
+ if (Records.getGlobal(DeclName->getValue()))
+ return TokError("def or global variable of this name already exists");
+
+ if (Lex.Lex() != tgtok::equal) // Eat the identifier
+ return TokError("expected '='");
+ if (Lex.Lex() != tgtok::l_brace) // Eat the '='
+ return TokError("expected '{'");
+ SMLoc BraceLoc = Lex.getLoc();
+ Lex.Lex(); // Eat the '{'
+
+ Defsets.push_back(&Defset);
+ bool Err = ParseObjectList(nullptr);
+ Defsets.pop_back();
+ if (Err)
+ return true;
- if (CurMultiClass) {
- // Copy the template arguments for the multiclass into the def.
- for (Init *TArg : CurMultiClass->Rec.getTemplateArgs()) {
- const RecordVal *RV = CurMultiClass->Rec.getValue(TArg);
- assert(RV && "Template arg doesn't exist?");
- CurRec->addValue(*RV);
- }
+ if (Lex.getCode() != tgtok::r_brace) {
+ TokError("expected '}' at end of defset");
+ return Error(BraceLoc, "to match this '{'");
}
+ Lex.Lex(); // Eat the '}'
- if (ProcessForeachDefs(CurRec, DefLoc))
- return Error(DefLoc, "Could not process loops for def" +
- CurRec->getNameInitAsString());
-
+ Records.addExtraGlobal(DeclName->getValue(),
+ ListInit::get(Defset.Elements, Defset.EltTy));
return false;
}
@@ -2073,12 +2589,13 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
/// Foreach ::= FOREACH Declaration IN Object
///
bool TGParser::ParseForeach(MultiClass *CurMultiClass) {
+ SMLoc Loc = Lex.getLoc();
assert(Lex.getCode() == tgtok::Foreach && "Unknown tok");
Lex.Lex(); // Eat the 'for' token.
// Make a temporary object to record items associated with the for
// loop.
- ListInit *ListValue = nullptr;
+ Init *ListValue = nullptr;
VarInit *IterName = ParseForeachDeclaration(ListValue);
if (!IterName)
return TokError("expected declaration in for");
@@ -2088,7 +2605,7 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) {
Lex.Lex(); // Eat the in
// Create a loop object and remember it.
- Loops.push_back(ForeachLoop(IterName, ListValue));
+ Loops.push_back(llvm::make_unique<ForeachLoop>(Loc, IterName, ListValue));
if (Lex.getCode() != tgtok::l_brace) {
// FOREACH Declaration IN Object
@@ -2110,10 +2627,11 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) {
Lex.Lex(); // Eat the }
}
- // We've processed everything in this loop.
+ // Resolve the loop or store it for later resolution.
+ std::unique_ptr<ForeachLoop> Loop = std::move(Loops.back());
Loops.pop_back();
- return false;
+ return addEntry(std::move(Loop));
}
/// ParseClass - Parse a tblgen class definition.
@@ -2130,7 +2648,7 @@ bool TGParser::ParseClass() {
Record *CurRec = Records.getClass(Lex.getCurStrVal());
if (CurRec) {
// If the body was previously defined, this is an error.
- if (CurRec->getValues().size() > 1 || // Account for NAME.
+ if (!CurRec->getValues().empty() ||
!CurRec->getSuperClasses().empty() ||
!CurRec->getTemplateArgs().empty())
return TokError("Class '" + CurRec->getNameInitAsString() +
@@ -2138,7 +2656,8 @@ bool TGParser::ParseClass() {
} else {
// If this is the first reference to this class, create and add it.
auto NewRec =
- llvm::make_unique<Record>(Lex.getCurStrVal(), Lex.getLoc(), Records);
+ llvm::make_unique<Record>(Lex.getCurStrVal(), Lex.getLoc(), Records,
+ /*Class=*/true);
CurRec = NewRec.get();
Records.addClass(std::move(NewRec));
}
@@ -2149,7 +2668,6 @@ bool TGParser::ParseClass() {
if (ParseTemplateArgList(CurRec))
return true;
- // Finally, parse the object body.
return ParseObjectBody(CurRec);
}
@@ -2318,7 +2836,8 @@ bool TGParser::ParseMultiClass() {
while (Lex.getCode() != tgtok::r_brace) {
switch (Lex.getCode()) {
default:
- return TokError("expected 'let', 'def' or 'defm' in multiclass body");
+ return TokError("expected 'let', 'def', 'defm' or 'foreach' in "
+ "multiclass body");
case tgtok::Let:
case tgtok::Def:
case tgtok::Defm:
@@ -2335,207 +2854,31 @@ bool TGParser::ParseMultiClass() {
return false;
}
-Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto,
- Init *&DefmPrefix,
- SMRange DefmPrefixRange,
- ArrayRef<Init *> TArgs,
- ArrayRef<Init *> TemplateVals) {
- // We need to preserve DefProto so it can be reused for later
- // instantiations, so create a new Record to inherit from it.
-
- // Add in the defm name. If the defm prefix is empty, give each
- // instantiated def a unique name. Otherwise, if "#NAME#" exists in the
- // name, substitute the prefix for #NAME#. Otherwise, use the defm name
- // as a prefix.
-
- bool IsAnonymous = false;
- if (!DefmPrefix) {
- DefmPrefix = GetNewAnonymousName();
- IsAnonymous = true;
- }
-
- Init *DefName = DefProto->getNameInit();
- StringInit *DefNameString = dyn_cast<StringInit>(DefName);
-
- if (DefNameString) {
- // We have a fully expanded string so there are no operators to
- // resolve. We should concatenate the given prefix and name.
- DefName =
- BinOpInit::get(BinOpInit::STRCONCAT,
- UnOpInit::get(UnOpInit::CAST, DefmPrefix,
- StringRecTy::get())->Fold(DefProto, &MC),
- DefName, StringRecTy::get())->Fold(DefProto, &MC);
- }
-
- // Make a trail of SMLocs from the multiclass instantiations.
- SmallVector<SMLoc, 4> Locs(1, DefmPrefixRange.Start);
- Locs.append(DefProto->getLoc().begin(), DefProto->getLoc().end());
- auto CurRec = make_unique<Record>(DefName, Locs, Records, IsAnonymous);
-
- SubClassReference Ref;
- Ref.RefRange = DefmPrefixRange;
- Ref.Rec = DefProto;
- AddSubClass(CurRec.get(), Ref);
-
- // Set the value for NAME. We don't resolve references to it 'til later,
- // though, so that uses in nested multiclass names don't get
- // confused.
- if (SetValue(CurRec.get(), Ref.RefRange.Start, StringInit::get("NAME"), None,
- DefmPrefix, /*AllowSelfAssignment*/true)) {
- Error(DefmPrefixRange.Start, "Could not resolve " +
- CurRec->getNameInitAsString() + ":NAME to '" +
- DefmPrefix->getAsUnquotedString() + "'");
- return nullptr;
- }
-
- // If the DefNameString didn't resolve, we probably have a reference to
- // NAME and need to replace it. We need to do at least this much greedily,
- // otherwise nested multiclasses will end up with incorrect NAME expansions.
- if (!DefNameString) {
- RecordVal *DefNameRV = CurRec->getValue("NAME");
- CurRec->resolveReferencesTo(DefNameRV);
- }
-
- if (!CurMultiClass) {
- // Now that we're at the top level, resolve all NAME references
- // in the resultant defs that weren't in the def names themselves.
- RecordVal *DefNameRV = CurRec->getValue("NAME");
- CurRec->resolveReferencesTo(DefNameRV);
-
- // Check if the name is a complex pattern.
- // If so, resolve it.
- DefName = CurRec->getNameInit();
- DefNameString = dyn_cast<StringInit>(DefName);
-
- // OK the pattern is more complex than simply using NAME.
- // Let's use the heavy weaponery.
- if (!DefNameString) {
- ResolveMulticlassDefArgs(MC, CurRec.get(), DefmPrefixRange.Start,
- Lex.getLoc(), TArgs, TemplateVals,
- false/*Delete args*/);
- DefName = CurRec->getNameInit();
- DefNameString = dyn_cast<StringInit>(DefName);
-
- if (!DefNameString)
- DefName = DefName->convertInitializerTo(StringRecTy::get());
-
- // We ran out of options here...
- DefNameString = dyn_cast<StringInit>(DefName);
- if (!DefNameString) {
- PrintFatalError(CurRec->getLoc()[CurRec->getLoc().size() - 1],
- DefName->getAsUnquotedString() + " is not a string.");
- return nullptr;
- }
-
- CurRec->setName(DefName);
- }
-
- // Now that NAME references are resolved and we're at the top level of
- // any multiclass expansions, add the record to the RecordKeeper. If we are
- // currently in a multiclass, it means this defm appears inside a
- // multiclass and its name won't be fully resolvable until we see
- // the top-level defm. Therefore, we don't add this to the
- // RecordKeeper at this point. If we did we could get duplicate
- // defs as more than one probably refers to NAME or some other
- // common internal placeholder.
-
- // Ensure redefinition doesn't happen.
- if (Records.getDef(CurRec->getNameInitAsString())) {
- Error(DefmPrefixRange.Start, "def '" + CurRec->getNameInitAsString() +
- "' already defined, instantiating defm with subdef '" +
- DefProto->getNameInitAsString() + "'");
- return nullptr;
- }
-
- Record *CurRecSave = CurRec.get(); // Keep a copy before we release.
- Records.addDef(std::move(CurRec));
- return CurRecSave;
- }
-
- // FIXME This is bad but the ownership transfer to caller is pretty messy.
- // The unique_ptr in this function at least protects the exits above.
- return CurRec.release();
-}
-
-bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, Record *CurRec,
- SMLoc DefmPrefixLoc, SMLoc SubClassLoc,
- ArrayRef<Init *> TArgs,
- ArrayRef<Init *> TemplateVals,
- bool DeleteArgs) {
- // Loop over all of the template arguments, setting them to the specified
- // value or leaving them as the default if necessary.
- for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
- // Check if a value is specified for this temp-arg.
- if (i < TemplateVals.size()) {
- // Set it now.
- if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], None, TemplateVals[i]))
- return true;
-
- // Resolve it next.
- CurRec->resolveReferencesTo(CurRec->getValue(TArgs[i]));
-
- if (DeleteArgs)
- // Now remove it.
- CurRec->removeValue(TArgs[i]);
-
- } else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) {
- return Error(SubClassLoc, "value not specified for template argument #" +
- Twine(i) + " (" + TArgs[i]->getAsUnquotedString() +
- ") of multiclassclass '" + MC.Rec.getNameInitAsString() +
- "'");
- }
- }
- return false;
-}
-
-bool TGParser::ResolveMulticlassDef(MultiClass &MC,
- Record *CurRec,
- Record *DefProto,
- SMLoc DefmPrefixLoc) {
- // If the mdef is inside a 'let' expression, add to each def.
- if (ApplyLetStack(CurRec))
- return Error(DefmPrefixLoc, "when instantiating this defm");
-
- // Don't create a top level definition for defm inside multiclasses,
- // instead, only update the prototypes and bind the template args
- // with the new created definition.
- if (!CurMultiClass)
- return false;
- for (const auto &Proto : CurMultiClass->DefPrototypes)
- if (Proto->getNameInit() == CurRec->getNameInit())
- return Error(DefmPrefixLoc, "defm '" + CurRec->getNameInitAsString() +
- "' already defined in this multiclass!");
- CurMultiClass->DefPrototypes.push_back(std::unique_ptr<Record>(CurRec));
-
- // Copy the template arguments for the multiclass into the new def.
- for (Init * TA : CurMultiClass->Rec.getTemplateArgs()) {
- const RecordVal *RV = CurMultiClass->Rec.getValue(TA);
- assert(RV && "Template arg doesn't exist?");
- CurRec->addValue(*RV);
- }
-
- return false;
-}
-
/// ParseDefm - Parse the instantiation of a multiclass.
///
/// DefMInst ::= DEFM ID ':' DefmSubClassRef ';'
///
bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
assert(Lex.getCode() == tgtok::Defm && "Unexpected token!");
- SMLoc DefmLoc = Lex.getLoc();
- Init *DefmPrefix = nullptr;
+ Lex.Lex(); // eat the defm
- if (Lex.Lex() == tgtok::Id) { // eat the defm.
- DefmPrefix = ParseObjectName(CurMultiClass);
+ Init *DefmName = ParseObjectName(CurMultiClass);
+ if (!DefmName)
+ return true;
+ if (isa<UnsetInit>(DefmName)) {
+ DefmName = Records.getNewAnonymousName();
+ if (CurMultiClass)
+ DefmName = BinOpInit::getStrConcat(
+ VarInit::get(QualifiedNameOfImplicitName(CurMultiClass),
+ StringRecTy::get()),
+ DefmName);
}
- SMLoc DefmPrefixEndLoc = Lex.getLoc();
if (Lex.getCode() != tgtok::colon)
return TokError("expected ':' after defm identifier");
// Keep track of the new generated record definitions.
- std::vector<Record*> NewRecDefs;
+ std::vector<RecordsEntry> NewEntries;
// This record also inherits from a regular class (non-multiclass)?
bool InheritFromClass = false;
@@ -2562,37 +2905,28 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
return Error(SubClassLoc,
"more template args specified than multiclass expects");
- // Loop over all the def's in the multiclass, instantiating each one.
- for (const std::unique_ptr<Record> &DefProto : MC->DefPrototypes) {
- // The record name construction goes as follow:
- // - If the def name is a string, prepend the prefix.
- // - If the def name is a more complex pattern, use that pattern.
- // As a result, the record is instantiated before resolving
- // arguments, as it would make its name a string.
- Record *CurRec = InstantiateMulticlassDef(*MC, DefProto.get(), DefmPrefix,
- SMRange(DefmLoc,
- DefmPrefixEndLoc),
- TArgs, TemplateVals);
- if (!CurRec)
- return true;
-
- // Now that the record is instantiated, we can resolve arguments.
- if (ResolveMulticlassDefArgs(*MC, CurRec, DefmLoc, SubClassLoc,
- TArgs, TemplateVals, true/*Delete args*/))
- return Error(SubClassLoc, "could not instantiate def");
-
- if (ResolveMulticlassDef(*MC, CurRec, DefProto.get(), DefmLoc))
- return Error(SubClassLoc, "could not instantiate def");
-
- // Defs that can be used by other definitions should be fully resolved
- // before any use.
- if (DefProto->isResolveFirst() && !CurMultiClass) {
- CurRec->resolveReferences();
- CurRec->setResolveFirst(false);
+ SubstStack Substs;
+ for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+ if (i < TemplateVals.size()) {
+ Substs.emplace_back(TArgs[i], TemplateVals[i]);
+ } else {
+ Init *Default = MC->Rec.getValue(TArgs[i])->getValue();
+ if (!Default->isComplete()) {
+ return Error(SubClassLoc,
+ "value not specified for template argument #" +
+ Twine(i) + " (" + TArgs[i]->getAsUnquotedString() +
+ ") of multiclass '" + MC->Rec.getNameInitAsString() +
+ "'");
+ }
+ Substs.emplace_back(TArgs[i], Default);
}
- NewRecDefs.push_back(CurRec);
}
+ Substs.emplace_back(QualifiedNameOfImplicitName(MC), DefmName);
+
+ if (resolve(MC->Entries, Substs, CurMultiClass == nullptr, &NewEntries,
+ &SubClassLoc))
+ return true;
if (Lex.getCode() != tgtok::comma) break;
Lex.Lex(); // eat ','.
@@ -2622,12 +2956,9 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
// Get the expanded definition prototypes and teach them about
// the record values the current class to inherit has
- for (Record *CurRec : NewRecDefs) {
+ for (auto &E : NewEntries) {
// Add it.
- if (AddSubClass(CurRec, SubClass))
- return true;
-
- if (ApplyLetStack(CurRec))
+ if (AddSubClass(E, SubClass))
return true;
}
@@ -2637,12 +2968,12 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
}
}
- if (!CurMultiClass)
- for (Record *CurRec : NewRecDefs)
- // See Record::setName(). This resolve step will see any new
- // name for the def that might have been created when resolving
- // inheritance, values and arguments above.
- CurRec->resolveReferences();
+ for (auto &E : NewEntries) {
+ if (ApplyLetStack(E))
+ return true;
+
+ addEntry(std::move(E));
+ }
if (Lex.getCode() != tgtok::semi)
return TokError("expected ';' at end of defm");
@@ -2661,13 +2992,26 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
bool TGParser::ParseObject(MultiClass *MC) {
switch (Lex.getCode()) {
default:
- return TokError("Expected class, def, defm, multiclass or let definition");
+ return TokError("Expected class, def, defm, defset, multiclass, let or "
+ "foreach");
case tgtok::Let: return ParseTopLevelLet(MC);
case tgtok::Def: return ParseDef(MC);
case tgtok::Foreach: return ParseForeach(MC);
case tgtok::Defm: return ParseDefm(MC);
- case tgtok::Class: return ParseClass();
- case tgtok::MultiClass: return ParseMultiClass();
+ case tgtok::Defset:
+ if (MC)
+ return TokError("defset is not allowed inside multiclass");
+ return ParseDefset();
+ case tgtok::Class:
+ if (MC)
+ return TokError("class is not allowed inside multiclass");
+ if (!Loops.empty())
+ return TokError("class is not allowed inside foreach loop");
+ return ParseClass();
+ case tgtok::MultiClass:
+ if (!Loops.empty())
+ return TokError("multiclass is not allowed inside foreach loop");
+ return ParseMultiClass();
}
}
@@ -2691,3 +3035,31 @@ bool TGParser::ParseFile() {
return TokError("Unexpected input at top level");
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RecordsEntry::dump() const {
+ if (Loop)
+ Loop->dump();
+ if (Rec)
+ Rec->dump();
+}
+
+LLVM_DUMP_METHOD void ForeachLoop::dump() const {
+ errs() << "foreach " << IterVar->getAsString() << " = "
+ << ListValue->getAsString() << " in {\n";
+
+ for (const auto &E : Entries)
+ E.dump();
+
+ errs() << "}\n";
+}
+
+LLVM_DUMP_METHOD void MultiClass::dump() const {
+ errs() << "Record:\n";
+ Rec.dump();
+
+ errs() << "Defs:\n";
+ for (const auto &E : Entries)
+ E.dump();
+}
+#endif
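The new ParseDefm path above builds a substitution stack that pairs each multiclass template argument with either the value supplied at the defm site or the argument's default, and errors out when a required default is incomplete. Below is a minimal standalone sketch of that pattern only; it is not the TGParser API, and Value/buildSubsts are made-up stand-ins for TableGen's Init values.

#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for TableGen Init values and template arguments.
struct Value { std::string Text; bool Complete; };
using Subst = std::pair<std::string, Value>; // (argument name, bound value)

// Build the substitution stack: explicit values first, then defaults.
// Defaults is assumed to be the same length as ArgNames. Returns nothing if
// a trailing argument has no usable (complete) default.
std::optional<std::vector<Subst>>
buildSubsts(const std::vector<std::string> &ArgNames,
            const std::vector<Value> &Provided,
            const std::vector<Value> &Defaults) {
  std::vector<Subst> Substs;
  for (size_t I = 0, E = ArgNames.size(); I != E; ++I) {
    if (I < Provided.size()) {
      Substs.emplace_back(ArgNames[I], Provided[I]);
    } else if (Defaults[I].Complete) {
      Substs.emplace_back(ArgNames[I], Defaults[I]);
    } else {
      std::cerr << "value not specified for template argument #" << I
                << " (" << ArgNames[I] << ")\n";
      return std::nullopt;
    }
  }
  return Substs;
}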
diff --git a/contrib/llvm/lib/TableGen/TGParser.h b/contrib/llvm/lib/TableGen/TGParser.h
index 1b2966c9f6c9..0a28b3a03aa1 100644
--- a/contrib/llvm/lib/TableGen/TGParser.h
+++ b/contrib/llvm/lib/TableGen/TGParser.h
@@ -27,6 +27,7 @@ namespace llvm {
class RecordKeeper;
class RecTy;
class Init;
+ struct ForeachLoop;
struct MultiClass;
struct SubClassReference;
struct SubMultiClassReference;
@@ -41,16 +42,49 @@ namespace llvm {
}
};
+ /// RecordsEntry - Can be either a record or a foreach loop.
+ struct RecordsEntry {
+ std::unique_ptr<Record> Rec;
+ std::unique_ptr<ForeachLoop> Loop;
+
+ void dump() const;
+
+ RecordsEntry() {}
+ RecordsEntry(std::unique_ptr<Record> Rec) : Rec(std::move(Rec)) {}
+ RecordsEntry(std::unique_ptr<ForeachLoop> Loop)
+ : Loop(std::move(Loop)) {}
+ };
+
/// ForeachLoop - Record the iteration state associated with a for loop.
/// This is used to instantiate items in the loop body.
struct ForeachLoop {
+ SMLoc Loc;
VarInit *IterVar;
- ListInit *ListValue;
+ Init *ListValue;
+ std::vector<RecordsEntry> Entries;
- ForeachLoop(VarInit *IVar, ListInit *LValue)
- : IterVar(IVar), ListValue(LValue) {}
+ void dump() const;
+
+ ForeachLoop(SMLoc Loc, VarInit *IVar, Init *LValue)
+ : Loc(Loc), IterVar(IVar), ListValue(LValue) {}
};
+ struct DefsetRecord {
+ SMLoc Loc;
+ RecTy *EltTy;
+ SmallVector<Init *, 16> Elements;
+ };
+
+struct MultiClass {
+ Record Rec; // Placeholder for template args and Name.
+ std::vector<RecordsEntry> Entries;
+
+ void dump() const;
+
+ MultiClass(StringRef Name, SMLoc Loc, RecordKeeper &Records) :
+ Rec(Name, Loc, Records) {}
+};
+
class TGParser {
TGLexer Lex;
std::vector<SmallVector<LetRecord, 4>> LetStack;
@@ -58,8 +92,9 @@ class TGParser {
/// Loops - Keep track of any foreach loops we are within.
///
- typedef std::vector<ForeachLoop> LoopVector;
- LoopVector Loops;
+ std::vector<std::unique_ptr<ForeachLoop>> Loops;
+
+ SmallVector<DefsetRecord *, 2> Defsets;
/// CurMultiClass - If we are parsing a 'multiclass' definition, this is the
/// current value.
@@ -68,8 +103,6 @@ class TGParser {
// Record tracker
RecordKeeper &Records;
- unsigned AnonCounter;
-
// A "named boolean" indicating how to parse identifiers. Usually
// identifiers map to some existing object but in special cases
// (e.g. parsing def names) no such object exists yet because we are
@@ -79,12 +112,11 @@ class TGParser {
ParseValueMode, // We are parsing a value we expect to look up.
ParseNameMode, // We are parsing a name of an object that does not yet
// exist.
- ParseForeachMode // We are parsing a foreach init.
};
public:
TGParser(SourceMgr &SrcMgr, RecordKeeper &records)
- : Lex(SrcMgr), CurMultiClass(nullptr), Records(records), AnonCounter(0) {}
+ : Lex(SrcMgr), CurMultiClass(nullptr), Records(records) {}
/// ParseFile - Main entrypoint for parsing a tblgen file. These parser
/// routines return true on error, or false on success.
@@ -107,44 +139,28 @@ private: // Semantic analysis methods.
ArrayRef<unsigned> BitList, Init *V,
bool AllowSelfAssignment = false);
bool AddSubClass(Record *Rec, SubClassReference &SubClass);
+ bool AddSubClass(RecordsEntry &Entry, SubClassReference &SubClass);
bool AddSubMultiClass(MultiClass *CurMC,
SubMultiClassReference &SubMultiClass);
- Init *GetNewAnonymousName();
-
- // IterRecord: Map an iterator name to a value.
- struct IterRecord {
- VarInit *IterVar;
- Init *IterValue;
- IterRecord(VarInit *Var, Init *Val) : IterVar(Var), IterValue(Val) {}
- };
-
- // IterSet: The set of all iterator values at some point in the
- // iteration space.
- typedef std::vector<IterRecord> IterSet;
+ using SubstStack = SmallVector<std::pair<Init *, Init *>, 8>;
- bool ProcessForeachDefs(Record *CurRec, SMLoc Loc);
- bool ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals);
+ bool addEntry(RecordsEntry E);
+ bool resolve(const ForeachLoop &Loop, SubstStack &Stack, bool Final,
+ std::vector<RecordsEntry> *Dest, SMLoc *Loc = nullptr);
+ bool resolve(const std::vector<RecordsEntry> &Source, SubstStack &Substs,
+ bool Final, std::vector<RecordsEntry> *Dest,
+ SMLoc *Loc = nullptr);
+ bool addDefOne(std::unique_ptr<Record> Rec);
private: // Parser methods.
bool ParseObjectList(MultiClass *MC = nullptr);
bool ParseObject(MultiClass *MC);
bool ParseClass();
bool ParseMultiClass();
- Record *InstantiateMulticlassDef(MultiClass &MC, Record *DefProto,
- Init *&DefmPrefix, SMRange DefmPrefixRange,
- ArrayRef<Init *> TArgs,
- ArrayRef<Init *> TemplateVals);
- bool ResolveMulticlassDefArgs(MultiClass &MC, Record *DefProto,
- SMLoc DefmPrefixLoc, SMLoc SubClassLoc,
- ArrayRef<Init *> TArgs,
- ArrayRef<Init *> TemplateVals, bool DeleteArgs);
- bool ResolveMulticlassDef(MultiClass &MC,
- Record *CurRec,
- Record *DefProto,
- SMLoc DefmPrefixLoc);
bool ParseDefm(MultiClass *CurMultiClass);
bool ParseDef(MultiClass *CurMultiClass);
+ bool ParseDefset();
bool ParseForeach(MultiClass *CurMultiClass);
bool ParseTopLevelLet(MultiClass *CurMultiClass);
void ParseLetList(SmallVectorImpl<LetRecord> &Result);
@@ -155,7 +171,7 @@ private: // Parser methods.
bool ParseTemplateArgList(Record *CurRec);
Init *ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs);
- VarInit *ParseForeachDeclaration(ListInit *&ForeachListValue);
+ VarInit *ParseForeachDeclaration(Init *&ForeachListValue);
SubClassReference ParseSubClassReference(Record *CurRec, bool isDefm);
SubMultiClassReference ParseSubMultiClassReference(MultiClass *CurMC);
@@ -182,6 +198,7 @@ private: // Parser methods.
Record *ParseClassID();
MultiClass *ParseMultiClassID();
bool ApplyLetStack(Record *CurRec);
+ bool ApplyLetStack(RecordsEntry &Entry);
};
} // end namespace llvm
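RecordsEntry above is an either/or owner: exactly one of Rec or Loop is populated, and dump() forwards to whichever is set. A small self-contained illustration of that ownership shape, with placeholder Record and ForeachLoop types rather than the real TableGen classes:

#include <iostream>
#include <memory>

struct Record      { void dump() const { std::cout << "record\n"; } };
struct ForeachLoop { void dump() const { std::cout << "foreach loop\n"; } };

// Exactly one of Rec or Loop is set; dump() dispatches to whichever one it is.
struct Entry {
  std::unique_ptr<Record> Rec;
  std::unique_ptr<ForeachLoop> Loop;

  explicit Entry(std::unique_ptr<Record> R) : Rec(std::move(R)) {}
  explicit Entry(std::unique_ptr<ForeachLoop> L) : Loop(std::move(L)) {}

  void dump() const {
    if (Rec)
      Rec->dump();
    if (Loop)
      Loop->dump();
  }
};

int main() {
  Entry A(std::make_unique<Record>());
  Entry B(std::make_unique<ForeachLoop>());
  A.dump();
  B.dump();
}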
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
index 75fb937de9bf..a69d38144c78 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -26,8 +26,32 @@ def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
+def FeatureSM4 : SubtargetFeature<
+ "sm4", "HasSM4", "true",
+ "Enable SM3 and SM4 support", [FeatureNEON]>;
+
+def FeatureSHA2 : SubtargetFeature<
+ "sha2", "HasSHA2", "true",
+ "Enable SHA1 and SHA256 support", [FeatureNEON]>;
+
+def FeatureSHA3 : SubtargetFeature<
+ "sha3", "HasSHA3", "true",
+ "Enable SHA512 and SHA3 support", [FeatureNEON, FeatureSHA2]>;
+
+def FeatureAES : SubtargetFeature<
+ "aes", "HasAES", "true",
+ "Enable AES support", [FeatureNEON]>;
+
+// Crypto has been split up and any combination is now valid (see the
+// crypto definitions above). Also, crypto is now context-sensitive:
+// it has a different meaning for e.g. Armv8.4 than it has for Armv8.2.
+// Therefore, we rely on Clang, the user-interfacing tool, to pass on the
+// appropriate crypto options. But here in the backend, crypto has very little
+// meaning anymore. We kept the Crypto definition here for backward
+// compatibility; it now implies the SHA2 and AES features, which was the
+// "traditional" meaning of Crypto.
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
- "Enable cryptographic instructions", [FeatureNEON]>;
+ "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
@@ -76,6 +100,10 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
"Reserve X18, making it unavailable "
"as a GPR">;
+def FeatureReserveX20 : SubtargetFeature<"reserve-x20", "ReserveX20", "true",
+ "Reserve X20, making it unavailable "
+ "as a GPR">;
+
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
@@ -91,6 +119,11 @@ def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
"CustomAsCheapAsMove", "true",
"Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
+ "ExynosAsCheapAsMove", "true",
+ "Use Exynos specific code in TargetInstrInfo::isAsCheapAsAMove()",
+ [FeatureCustomCheapAsMoveHandling]>;
+
def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
@@ -115,10 +148,18 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
"CPU fuses arithmetic + cbz/cbnz operations">;
+def FeatureFuseAddress : SubtargetFeature<
+ "fuse-address", "HasFuseAddress", "true",
+ "CPU fuses address generation and memory operations">;
+
def FeatureFuseAES : SubtargetFeature<
"fuse-aes", "HasFuseAES", "true",
"CPU fuses AES crypto operations">;
+def FeatureFuseCCSelect : SubtargetFeature<
+ "fuse-csel", "HasFuseCCSelect", "true",
+ "CPU fuses conditional select operations">;
+
def FeatureFuseLiterals : SubtargetFeature<
"fuse-literals", "HasFuseLiterals", "true",
"CPU fuses literal generation operations">;
@@ -149,6 +190,12 @@ def FeatureLSLFast : SubtargetFeature<
"lsl-fast", "HasLSLFast", "true",
"CPU has a fastpath logical shift of up to 3 places">;
+def FeatureAggressiveFMA :
+ SubtargetFeature<"aggressive-fma",
+ "HasAggressiveFMA",
+ "true",
+ "Enable Aggressive FMA for floating-point.">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -162,6 +209,9 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
"Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC]>;
+def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
+ "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -193,7 +243,8 @@ include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
include "AArch64SchedKryo.td"
-include "AArch64SchedM1.td"
+include "AArch64SchedExynosM1.td"
+include "AArch64SchedExynosM3.td"
include "AArch64SchedThunderX.td"
include "AArch64SchedThunderX2T99.td"
@@ -294,7 +345,6 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
- FeatureSlowMisaligned128Store,
FeatureZCRegMove,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround
@@ -305,7 +355,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
[FeatureSlowPaired128,
FeatureCRC,
FeatureCrypto,
- FeatureCustomCheapAsMoveHandling,
+ FeatureExynosCheapAsMoveHandling,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
@@ -316,11 +366,11 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
FeatureZCZeroing]>;
def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M2/M3 processors",
+ "Samsung Exynos-M2 processors",
[FeatureSlowPaired128,
FeatureCRC,
FeatureCrypto,
- FeatureCustomCheapAsMoveHandling,
+ FeatureExynosCheapAsMoveHandling,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
@@ -329,6 +379,23 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
FeatureSlowMisaligned128Store,
FeatureZCZeroing]>;
+def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
+ "Samsung Exynos-M3 processors",
+ [FeatureCRC,
+ FeatureCrypto,
+ FeatureExynosCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureFuseAddress,
+ FeatureFuseAES,
+ FeatureFuseCCSelect,
+ FeatureFuseLiterals,
+ FeatureLSLFast,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureZCZeroing]>;
+
def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
FeatureCRC,
@@ -376,6 +443,7 @@ def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
"ThunderX2T99",
"Cavium ThunderX2 processors", [
+ FeatureAggressiveFMA,
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
@@ -449,7 +517,8 @@ def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
-def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
+def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
+def : ProcessorModel<"exynos-m4", ExynosM3Model, [ProcExynosM3]>;
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
@@ -469,12 +538,14 @@ def GenericAsmParserVariant : AsmParserVariant {
int Variant = 0;
string Name = "generic";
string BreakCharacters = ".";
+ string TokenizingCharacters = "[]*!/";
}
def AppleAsmParserVariant : AsmParserVariant {
int Variant = 1;
string Name = "apple-neon";
string BreakCharacters = ".";
+ string TokenizingCharacters = "[]*!/";
}
//===----------------------------------------------------------------------===//
@@ -504,4 +575,5 @@ def AArch64 : Target {
let InstructionSet = AArch64InstrInfo;
let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
+ let AllowRegisterRenaming = 1;
}
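Per the comment above, FeatureCrypto is now only a convenience that implies FeatureSHA2 and FeatureAES. SubtargetFeature implications form a dependency graph that gets transitively closed when a CPU or a command-line flag enables a feature; below is a rough standalone sketch of that closure using just the feature names from this .td file. Everything else is illustrative and not the real SubtargetFeatures machinery.

#include <map>
#include <set>
#include <string>
#include <vector>

// Implied-feature edges mirroring the .td definitions above.
const std::map<std::string, std::vector<std::string>> Implies = {
    {"crypto", {"neon", "sha2", "aes"}},
    {"sha3",   {"neon", "sha2"}},
    {"sha2",   {"neon"}},
    {"sm4",    {"neon"}},
    {"aes",    {"neon"}},
    {"neon",   {"fp-armv8"}},
};

// Enable a feature and everything it transitively implies.
void enable(const std::string &F, std::set<std::string> &Enabled) {
  if (!Enabled.insert(F).second)
    return; // already enabled, avoid revisiting
  auto It = Implies.find(F);
  if (It == Implies.end())
    return;
  for (const auto &Dep : It->second)
    enable(Dep, Enabled);
}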
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
index 7de5d0ef66b1..30232afaf024 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -116,7 +116,7 @@ INITIALIZE_PASS(AArch64A53Fix835769, "aarch64-fix-cortex-a53-835769-pass",
bool
AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
- DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
bool Changed = false;
TII = F.getSubtarget().getInstrInfo();
@@ -190,7 +190,8 @@ static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
bool
AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+ LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB
+ << " - scanning instructions...\n");
// First, scan the basic block, looking for a sequence of 2 instructions
// that match the conditions under which the erratum may trigger.
@@ -206,17 +207,17 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
for (auto &MI : MBB) {
MachineInstr *CurrInstr = &MI;
- DEBUG(dbgs() << " Examining: " << MI);
+ LLVM_DEBUG(dbgs() << " Examining: " << MI);
if (PrevInstr) {
- DEBUG(dbgs() << " PrevInstr: " << *PrevInstr
- << " CurrInstr: " << *CurrInstr
- << " isFirstInstructionInSequence(PrevInstr): "
- << isFirstInstructionInSequence(PrevInstr) << "\n"
- << " isSecondInstructionInSequence(CurrInstr): "
- << isSecondInstructionInSequence(CurrInstr) << "\n");
+ LLVM_DEBUG(dbgs() << " PrevInstr: " << *PrevInstr
+ << " CurrInstr: " << *CurrInstr
+ << " isFirstInstructionInSequence(PrevInstr): "
+ << isFirstInstructionInSequence(PrevInstr) << "\n"
+ << " isSecondInstructionInSequence(CurrInstr): "
+ << isSecondInstructionInSequence(CurrInstr) << "\n");
if (isFirstInstructionInSequence(PrevInstr) &&
isSecondInstructionInSequence(CurrInstr)) {
- DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
+ LLVM_DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
Sequences.push_back(CurrInstr);
}
}
@@ -225,8 +226,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
++Idx;
}
- DEBUG(dbgs() << "Scan complete, " << Sequences.size()
- << " occurrences of pattern found.\n");
+ LLVM_DEBUG(dbgs() << "Scan complete, " << Sequences.size()
+ << " occurrences of pattern found.\n");
// Then update the basic block, inserting nops between the detected sequences.
for (auto &MI : Sequences) {
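The mechanical DEBUG -> LLVM_DEBUG rewrite in this and the following files keeps the old behaviour: the statement compiles away in release (NDEBUG) builds and is otherwise gated on -debug-only=<DEBUG_TYPE>. A minimal usage sketch, assuming an LLVM source tree; the pass name here is hypothetical.

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "example-pass" // hypothetical pass name

using namespace llvm;

static void reportScan(unsigned Count) {
  // Prints only in asserts-enabled builds, and only when this pass is
  // selected with -debug-only=example-pass (or -debug for everything).
  LLVM_DEBUG(dbgs() << "Scan complete, " << Count << " occurrences found.\n");
}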
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 38a7e331bb97..a95476b91187 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -315,7 +315,7 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
return false;
bool Changed = false;
- DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
MRI = &F.getRegInfo();
TRI = F.getRegInfo().getTargetRegisterInfo();
@@ -330,7 +330,8 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+ LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB
+ << " - scanning instructions...\n");
// First, scan the basic block producing a set of chains.
@@ -343,7 +344,8 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
for (auto &MI : MBB)
scanInstruction(&MI, Idx++, ActiveChains, AllChains);
- DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");
+ LLVM_DEBUG(dbgs() << "Scan complete, " << AllChains.size()
+ << " chains created.\n");
// Group the chains into disjoint sets based on their liveness range. This is
// a poor-man's version of graph coloring. Ideally we'd create an interference
@@ -360,7 +362,7 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
for (auto &J : AllChains)
if (I != J && I->rangeOverlapsWith(*J))
EC.unionSets(I.get(), J.get());
- DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
+ LLVM_DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
// Now we assume that every member of an equivalence class interferes
// with every other member of that class, and with no members of other classes.
@@ -375,9 +377,9 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
// Now we have a set of sets, order them by start address so
// we can iterate over them sequentially.
- std::sort(V.begin(), V.end(),
- [](const std::vector<Chain*> &A,
- const std::vector<Chain*> &B) {
+ llvm::sort(V.begin(), V.end(),
+ [](const std::vector<Chain*> &A,
+ const std::vector<Chain*> &B) {
return A.front()->startsBefore(B.front());
});
@@ -440,7 +442,7 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
MachineBasicBlock &MBB,
int &Parity) {
bool Changed = false;
- DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
+ LLVM_DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
// Sort by descending size order so that we allocate the most important
// sets first.
@@ -451,7 +453,7 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
// change them to!
// Final tie-break with instruction order so pass output is stable (i.e. not
// dependent on malloc'd pointer values).
- std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
+ llvm::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
if (G1->size() != G2->size())
return G1->size() > G2->size();
if (G1->requiresFixup() != G2->requiresFixup())
@@ -470,16 +472,18 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
// But if we really don't care, use the chain's preferred color.
C = G->getPreferredColor();
- DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
- << ColorNames[(int)C] << "\n");
+ LLVM_DEBUG(dbgs() << " - Parity=" << Parity
+ << ", Color=" << ColorNames[(int)C] << "\n");
// If we'll need a fixup FMOV, don't bother. Testing has shown that this
// happens infrequently and when it does it has at least a 50% chance of
// slowing code down instead of speeding it up.
if (G->requiresFixup() && C != G->getPreferredColor()) {
C = G->getPreferredColor();
- DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
- "color remains " << ColorNames[(int)C] << "\n");
+ LLVM_DEBUG(dbgs() << " - " << G->str()
+ << " - not worthwhile changing; "
+ "color remains "
+ << ColorNames[(int)C] << "\n");
}
Changed |= colorChain(G, C, MBB);
@@ -528,17 +532,17 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
MachineBasicBlock &MBB) {
bool Changed = false;
- DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
- << ColorNames[(int)C] << ")\n");
+ LLVM_DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
+ << ColorNames[(int)C] << ")\n");
// Try and obtain a free register of the right class. Without a register
// to play with we cannot continue.
int Reg = scavengeRegister(G, C, MBB);
if (Reg == -1) {
- DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
+ LLVM_DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
return false;
}
- DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n");
std::map<unsigned, unsigned> Substs;
for (MachineInstr &I : *G) {
@@ -586,11 +590,11 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
assert(Substs.size() == 0 && "No substitutions should be left active!");
if (G->getKill()) {
- DEBUG(dbgs() << " - Kill instruction seen.\n");
+ LLVM_DEBUG(dbgs() << " - Kill instruction seen.\n");
} else {
// We didn't have a kill instruction, but we didn't seem to need to change
// the destination register anyway.
- DEBUG(dbgs() << " - Destination register not changed.\n");
+ LLVM_DEBUG(dbgs() << " - Destination register not changed.\n");
}
return Changed;
}
@@ -611,8 +615,8 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// unit.
unsigned DestReg = MI->getOperand(0).getReg();
- DEBUG(dbgs() << "New chain started for register " << printReg(DestReg, TRI)
- << " at " << *MI);
+ LLVM_DEBUG(dbgs() << "New chain started for register "
+ << printReg(DestReg, TRI) << " at " << *MI);
auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
@@ -631,8 +635,8 @@ void AArch64A57FPLoadBalancing::scanInstruction(
maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
- DEBUG(dbgs() << "Chain found for accumulator register "
- << printReg(AccumReg, TRI) << " in MI " << *MI);
+ LLVM_DEBUG(dbgs() << "Chain found for accumulator register "
+ << printReg(AccumReg, TRI) << " in MI " << *MI);
// For simplicity we only chain together sequences of MULs/MLAs where the
// accumulator register is killed on each instruction. This means we don't
@@ -641,7 +645,7 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// FIXME: We could extend to handle the non-kill cases for more coverage.
if (MI->getOperand(3).isKill()) {
// Add to chain.
- DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
+ LLVM_DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
// Handle cases where the destination is not the same as the accumulator.
if (DestReg != AccumReg) {
@@ -651,13 +655,14 @@ void AArch64A57FPLoadBalancing::scanInstruction(
return;
}
- DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
- << "marked <kill>!\n");
+ LLVM_DEBUG(
+ dbgs() << "Cannot add to chain because accumulator operand wasn't "
+ << "marked <kill>!\n");
maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
}
- DEBUG(dbgs() << "Creating new chain for dest register "
- << printReg(DestReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Creating new chain for dest register "
+ << printReg(DestReg, TRI) << "\n");
auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
AllChains.push_back(std::move(G));
@@ -685,8 +690,8 @@ maybeKillChain(MachineOperand &MO, unsigned Idx,
// If this is a KILL of a current chain, record it.
if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
- DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI)
+ << "\n");
ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
}
ActiveChains.erase(MO.getReg());
@@ -696,8 +701,8 @@ maybeKillChain(MachineOperand &MO, unsigned Idx,
for (auto I = ActiveChains.begin(), E = ActiveChains.end();
I != E;) {
if (MO.clobbersPhysReg(I->first)) {
- DEBUG(dbgs() << "Kill (regmask) seen for chain "
- << printReg(I->first, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Kill (regmask) seen for chain "
+ << printReg(I->first, TRI) << "\n");
I->second->setKill(MI, Idx, /*Immutable=*/true);
ActiveChains.erase(I++);
} else
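The std::sort -> llvm::sort changes above are about determinism: the comparator already ends with an instruction-order tie-break so the pass output never depends on pointer values, and llvm::sort (in expensive-checks builds) additionally shuffles the range first so any missing tie-break shows up in tests. A standalone sketch of that comparator shape, with made-up Chain fields rather than the pass's real Chain class:

#include <algorithm>
#include <vector>

struct Chain {
  unsigned Size;          // number of instructions in the chain
  bool     RequiresFixup; // recoloring would need an extra FMOV
  unsigned FirstIndex;    // position of the chain's first instruction
};

// Largest chains get colored first; every remaining tie is broken by
// instruction order so the result never depends on heap addresses.
static void orderChains(std::vector<Chain> &Chains) {
  std::sort(Chains.begin(), Chains.end(), [](const Chain &A, const Chain &B) {
    if (A.Size != B.Size)
      return A.Size > B.Size;
    if (A.RequiresFixup != B.RequiresFixup)
      return B.RequiresFixup; // fixup-free chains sort first
    return A.FirstIndex < B.FirstIndex;
  });
}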
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 338daecb49e5..22b0c1e3b471 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -277,7 +277,7 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(AArch64::COPY), Dst)
.addReg(Src, getKillRegState(IsKill));
- DEBUG(dbgs() << " adding copy: " << *MIB);
+ LLVM_DEBUG(dbgs() << " adding copy: " << *MIB);
++NumCopiesInserted;
return MIB;
}
@@ -286,7 +286,7 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
- DEBUG(dbgs() << "Scalar transform: " << MI);
+ LLVM_DEBUG(dbgs() << "Scalar transform: " << MI);
MachineBasicBlock *MBB = MI.getParent();
unsigned OldOpc = MI.getOpcode();
@@ -391,7 +391,7 @@ bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
bool Changed = false;
- DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
if (skipFunction(mf.getFunction()))
return false;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 6704fa27c86e..52819dedc23d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -71,7 +71,7 @@ public:
StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
- /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+ /// Wrapper for MCInstLowering.lowerOperand() for the
/// tblgen'erated pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
return MCInstLowering.lowerOperand(MO, MCOp);
@@ -88,7 +88,7 @@ public:
void EmitSled(const MachineInstr &MI, SledKind Kind);
- /// \brief tblgen'erated driver function for lowering simple MI->MC
+ /// tblgen'erated driver function for lowering simple MI->MC
/// pseudo instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -131,7 +131,7 @@ private:
AArch64FunctionInfo *AArch64FI = nullptr;
- /// \brief Emit the LOHs contained in AArch64FI.
+ /// Emit the LOHs contained in AArch64FI.
void EmitLOHs();
/// Emit instruction to set float register to zero.
@@ -210,29 +210,6 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
SM.serializeToStackMapSection();
}
-
- if (TT.isOSBinFormatCOFF()) {
- const auto &TLOF =
- static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
-
- std::string Flags;
- raw_string_ostream OS(Flags);
-
- for (const auto &Function : M)
- TLOF.emitLinkerFlagsForGlobal(OS, &Function);
- for (const auto &Global : M.globals())
- TLOF.emitLinkerFlagsForGlobal(OS, &Global);
- for (const auto &Alias : M.aliases())
- TLOF.emitLinkerFlagsForGlobal(OS, &Alias);
-
- OS.flush();
-
- // Output collected flags
- if (!Flags.empty()) {
- OutStreamer->SwitchSection(TLOF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
- }
- }
}
void AArch64AsmPrinter::EmitLOHs() {
@@ -265,9 +242,7 @@ MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
Twine(getFunctionNumber()) + "_" + Twine(CPID));
- return OutContext.getOrCreateSymbol(
- Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
- Twine(getFunctionNumber()) + "_" + Twine(CPID));
+ return AsmPrinter::GetCPISymbol(CPID);
}
void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
index 08152c0d83d9..26d532555e78 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -40,6 +39,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -155,6 +155,12 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
+ if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) {
+ Size = VA.getLocVT().getSizeInBits() / 8;
+ ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(Size * 8), ValVReg)
+ ->getOperand(0)
+ .getReg();
+ }
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOStore, Size, 0);
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
@@ -187,6 +193,9 @@ void AArch64CallLowering::splitToValueTypes(
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
+ if (OrigArg.Ty->isVoidTy())
+ return;
+
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
@@ -226,9 +235,14 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
bool Success = true;
if (VReg) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // We zero-extend i1s to i8.
+ if (MRI.getType(VReg).getSizeInBits() == 1)
+ VReg = MIRBuilder.buildZExt(LLT::scalar(8), VReg)->getOperand(0).getReg();
+
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
- MachineRegisterInfo &MRI = MF.getRegInfo();
auto &DL = F.getParent()->getDataLayout();
ArgInfo OrigArg{VReg, Val->getType()};
@@ -369,8 +383,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(),
- Callee.getReg(), 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 93a68449de8d..30492003df14 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -345,3 +345,22 @@ def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
(sequence "X%u", 9, 15))>;
+def CSR_AArch64_StackProbe_Windows
+ : CalleeSavedRegs<(add (sequence "X%u", 0, 15),
+ (sequence "X%u", 18, 28), FP, SP,
+ (sequence "Q%u", 0, 31))>;
+
+// Variants of the standard calling conventions for shadow call stack.
+// These all preserve x18 in addition to any other registers.
+def CSR_AArch64_NoRegs_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>;
+def CSR_AArch64_AllRegs_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>;
+def CSR_AArch64_CXX_TLS_Darwin_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_CXX_TLS_Darwin, X18)>;
+def CSR_AArch64_AAPCS_SwiftError_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>;
+def CSR_AArch64_RT_MostRegs_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>;
+def CSR_AArch64_AAPCS_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index 0a9167edcdb3..720323f81d29 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -380,8 +380,8 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo,
static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
LOHInfo &Info) {
if (Info.LastADRP != nullptr) {
- DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n" << '\t' << MI << '\t'
- << *Info.LastADRP);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n"
+ << '\t' << MI << '\t' << *Info.LastADRP);
AFI.addLOHDirective(MCLOH_AdrpAdrp, {&MI, Info.LastADRP});
++NumADRPSimpleCandidate;
}
@@ -390,48 +390,52 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
if (Info.IsCandidate) {
switch (Info.Type) {
case MCLOH_AdrpAdd:
- DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n" << '\t' << MI << '\t'
- << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n"
+ << '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0});
++NumADRSimpleCandidate;
break;
case MCLOH_AdrpLdr:
if (supportLoadFromLiteral(*Info.MI0)) {
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n" << '\t' << MI << '\t'
- << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n"
+ << '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdr, {&MI, Info.MI0});
++NumADRPToLDR;
}
break;
case MCLOH_AdrpAddLdr:
- DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0});
++NumADDToLDR;
break;
case MCLOH_AdrpAddStr:
if (Info.MI1 != nullptr) {
- DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0});
++NumADDToSTR;
}
break;
case MCLOH_AdrpLdrGotLdr:
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdrGotLdr, {&MI, Info.MI1, Info.MI0});
++NumLDRToLDR;
break;
case MCLOH_AdrpLdrGotStr:
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotStr:\n" << '\t' << MI << '\t'
- << *Info.MI1 << '\t' << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotStr:\n"
+ << '\t' << MI << '\t' << *Info.MI1 << '\t'
+ << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdrGotStr, {&MI, Info.MI1, Info.MI0});
++NumLDRToSTR;
break;
case MCLOH_AdrpLdrGot:
- DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGot:\n" << '\t' << MI << '\t'
- << *Info.MI0);
+ LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGot:\n"
+ << '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpLdrGot, {&MI, Info.MI0});
break;
case MCLOH_AdrpAdrp:
@@ -485,8 +489,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n"
- << "Looking in function " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n"
+ << "Looking in function " << MF.getName() << '\n');
LOHInfo LOHInfos[N_GPR_REGS];
AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 30cefbad884c..5ae787409ae8 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -201,10 +201,10 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
I->readsRegister(AArch64::NZCV, TRI))
return false;
}
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(DefMI.print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(DefMI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(MI.print(dbgs()));
NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
NewBr = convertToCondBr(MI);
@@ -260,10 +260,10 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
I->readsRegister(AArch64::NZCV, TRI))
return false;
}
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(DefMI.print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(DefMI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(MI.print(dbgs()));
NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
NewBr = convertToCondBr(MI);
@@ -275,10 +275,10 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
(void)NewCmp; (void)NewBr;
assert(NewCmp && NewBr && "Expected new instructions.");
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(NewCmp->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(NewBr->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(NewCmp->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(NewBr->print(dbgs()));
// If this was a flag setting version of the instruction, we use the original
// instruction by just clearing the dead flag on the implicit-def of NZCV.
@@ -293,8 +293,9 @@ bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(
+ dbgs() << "********** AArch64 Conditional Branch Tuning **********\n"
+ << "********** Function: " << MF.getName() << '\n');
TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI = MF.getSubtarget().getRegisterInfo();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index d14bde33d94e..5064762b9f77 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -173,13 +173,14 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
case AArch64::ADDSXri: {
unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm());
if (!I->getOperand(2).isImm()) {
- DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
return nullptr;
} else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) {
- DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I
+ << '\n');
return nullptr;
} else if (!MRI->use_empty(I->getOperand(0).getReg())) {
- DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
return nullptr;
}
return &*I;
@@ -207,7 +208,8 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
return nullptr;
}
}
- DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB) << '\n');
+ LLVM_DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB)
+ << '\n');
return nullptr;
}
@@ -325,8 +327,8 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
}
bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
if (skipFunction(MF.getFunction()))
return false;
@@ -384,15 +386,15 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm();
const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm();
- DEBUG(dbgs() << "Head branch:\n");
- DEBUG(dbgs() << "\tcondition: "
- << AArch64CC::getCondCodeName(HeadCmp) << '\n');
- DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
+ LLVM_DEBUG(dbgs() << "Head branch:\n");
+ LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(HeadCmp)
+ << '\n');
+ LLVM_DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
- DEBUG(dbgs() << "True branch:\n");
- DEBUG(dbgs() << "\tcondition: "
- << AArch64CC::getCondCodeName(TrueCmp) << '\n');
- DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
+ LLVM_DEBUG(dbgs() << "True branch:\n");
+ LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(TrueCmp)
+ << '\n');
+ LLVM_DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) ||
(HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) &&
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index b0bda7c43c15..8176b6fb269d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -311,7 +311,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
return &*I;
}
++NumCmpTermRejs;
- DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+ LLVM_DEBUG(dbgs() << "Flags not used by terminator: " << *I);
return nullptr;
}
@@ -329,7 +329,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
// Check that the immediate operand is within range, ccmp wants a uimm5.
// Rd = SUBSri Rn, imm, shift
if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
- DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+ LLVM_DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
++NumImmRangeRejs;
return nullptr;
}
@@ -340,7 +340,8 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::ADDSXrr:
if (isDeadDef(I->getOperand(0).getReg()))
return &*I;
- DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+ LLVM_DEBUG(dbgs() << "Can't convert compare with live destination: "
+ << *I);
++NumLiveDstRejs;
return nullptr;
case AArch64::FCMPSrr:
@@ -358,18 +359,19 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
// The ccmp doesn't produce exactly the same flags as the original
// compare, so reject the transform if there are uses of the flags
// besides the terminators.
- DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+ LLVM_DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
++NumMultNZCVUses;
return nullptr;
}
if (PRI.Defined || PRI.Clobbered) {
- DEBUG(dbgs() << "Not convertible compare: " << *I);
+ LLVM_DEBUG(dbgs() << "Not convertible compare: " << *I);
++NumUnknNZCVDefs;
return nullptr;
}
}
- DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB) << '\n');
+ LLVM_DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB)
+ << '\n');
return nullptr;
}
@@ -383,7 +385,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to
// get right.
if (!MBB->livein_empty()) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
return false;
}
@@ -392,18 +394,18 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// Check all instructions, except the terminators. It is assumed that
// terminators never have side effects or define any used register values.
for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
if (++InstrCount > BlockInstrLimit && !Stress) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
- << BlockInstrLimit << " instructions.\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
+ << BlockInstrLimit << " instructions.\n");
return false;
}
// There shouldn't normally be any phis in a single-predecessor block.
if (I.isPHI()) {
- DEBUG(dbgs() << "Can't hoist: " << I);
+ LLVM_DEBUG(dbgs() << "Can't hoist: " << I);
return false;
}
@@ -411,20 +413,20 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// speculate GOT or constant pool loads that are guaranteed not to trap,
// but we don't support that for now.
if (I.mayLoad()) {
- DEBUG(dbgs() << "Won't speculate load: " << I);
+ LLVM_DEBUG(dbgs() << "Won't speculate load: " << I);
return false;
}
// We never speculate stores, so an AA pointer isn't necessary.
bool DontMoveAcrossStore = true;
if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
- DEBUG(dbgs() << "Can't speculate: " << I);
+ LLVM_DEBUG(dbgs() << "Can't speculate: " << I);
return false;
}
// Only CmpMI is allowed to clobber the flags.
if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) {
- DEBUG(dbgs() << "Clobbers flags: " << I);
+ LLVM_DEBUG(dbgs() << "Clobbers flags: " << I);
return false;
}
}
@@ -458,9 +460,9 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
return false;
// The CFG topology checks out.
- DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
- << printMBBReference(*CmpBB) << " -> "
- << printMBBReference(*Tail) << '\n');
+ LLVM_DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
+ << printMBBReference(*CmpBB) << " -> "
+ << printMBBReference(*Tail) << '\n');
++NumConsidered;
// Tail is allowed to have many predecessors, but we can't handle PHIs yet.
@@ -470,13 +472,13 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// always be safe to sink the ccmp down to immediately before the CmpBB
// terminators.
if (!trivialTailPHIs()) {
- DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle phis in Tail.\n");
++NumPhiRejs;
return false;
}
if (!Tail->livein_empty()) {
- DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
++NumPhysRejs;
return false;
}
@@ -484,13 +486,13 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// CmpBB should never have PHIs since Head is its only predecessor.
// FIXME: Clean them up if it happens.
if (!CmpBB->empty() && CmpBB->front().isPHI()) {
- DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
++NumPhi2Rejs;
return false;
}
if (!CmpBB->livein_empty()) {
- DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+ LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
++NumPhysRejs;
return false;
}
@@ -499,7 +501,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
HeadCond.clear();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) {
- DEBUG(dbgs() << "Head branch not analyzable.\n");
+ LLVM_DEBUG(dbgs() << "Head branch not analyzable.\n");
++NumHeadBranchRejs;
return false;
}
@@ -507,13 +509,14 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// This is weird, probably some sort of degenerate CFG, or an edge to a
// landing pad.
if (!TBB || HeadCond.empty()) {
- DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+ LLVM_DEBUG(
+ dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
++NumHeadBranchRejs;
return false;
}
if (!parseCond(HeadCond, HeadCmpBBCC)) {
- DEBUG(dbgs() << "Unsupported branch type on Head\n");
+ LLVM_DEBUG(dbgs() << "Unsupported branch type on Head\n");
++NumHeadBranchRejs;
return false;
}
@@ -527,19 +530,20 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
CmpBBCond.clear();
TBB = FBB = nullptr;
if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
- DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+ LLVM_DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
++NumCmpBranchRejs;
return false;
}
if (!TBB || CmpBBCond.empty()) {
- DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+ LLVM_DEBUG(
+ dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
++NumCmpBranchRejs;
return false;
}
if (!parseCond(CmpBBCond, CmpBBTailCC)) {
- DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+ LLVM_DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
++NumCmpBranchRejs;
return false;
}
@@ -547,9 +551,10 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
if (TBB != Tail)
CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
- DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
- << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Head->CmpBB on "
+ << AArch64CC::getCondCodeName(HeadCmpBBCC)
+ << ", CmpBB->Tail on "
+ << AArch64CC::getCondCodeName(CmpBBTailCC) << '\n');
CmpMI = findConvertibleCompare(CmpBB);
if (!CmpMI)
@@ -563,9 +568,9 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
}
void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
- DEBUG(dbgs() << "Merging " << printMBBReference(*CmpBB) << " into "
- << printMBBReference(*Head) << ":\n"
- << *CmpBB);
+ LLVM_DEBUG(dbgs() << "Merging " << printMBBReference(*CmpBB) << " into "
+ << printMBBReference(*Head) << ":\n"
+ << *CmpBB);
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
// Update the CFG first.
@@ -710,7 +715,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
RemovedBlocks.push_back(CmpBB);
CmpBB->eraseFromParent();
- DEBUG(dbgs() << "Result:\n" << *Head);
+ LLVM_DEBUG(dbgs() << "Result:\n" << *Head);
++NumConverted;
}
@@ -860,13 +865,13 @@ bool AArch64ConditionalCompares::shouldConvert() {
// If code size is the main concern
if (MinSize) {
int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
- DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
+ LLVM_DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
// If we are minimizing the code size, do the conversion whatever
// the cost is.
if (CodeSizeDelta < 0)
return true;
if (CodeSizeDelta > 0) {
- DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
+ LLVM_DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
return false;
}
// CodeSizeDelta == 0, continue with the regular heuristics
@@ -885,24 +890,24 @@ bool AArch64ConditionalCompares::shouldConvert() {
Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth;
unsigned CmpBBDepth =
Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth;
- DEBUG(dbgs() << "Head depth: " << HeadDepth
- << "\nCmpBB depth: " << CmpBBDepth << '\n');
+ LLVM_DEBUG(dbgs() << "Head depth: " << HeadDepth
+ << "\nCmpBB depth: " << CmpBBDepth << '\n');
if (CmpBBDepth > HeadDepth + DelayLimit) {
- DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
- << " cycles.\n");
+ LLVM_DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
+ << " cycles.\n");
return false;
}
// Check the resource depth at the bottom of CmpBB - these instructions will
// be speculated.
unsigned ResDepth = Trace.getResourceDepth(true);
- DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
+ LLVM_DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
// Heuristic: The speculatively executed instructions must all be able to
// merge into the Head block. The Head critical path should dominate the
// resource cost of the speculated instructions.
if (ResDepth > HeadDepth) {
- DEBUG(dbgs() << "Too many instructions to speculate.\n");
+ LLVM_DEBUG(dbgs() << "Too many instructions to speculate.\n");
return false;
}
return true;
@@ -922,8 +927,8 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
}
bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
if (skipFunction(MF.getFunction()))
return false;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 8e7e740da6f6..2ba10d25e939 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -136,18 +136,21 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
// We need to skip this instruction because while it appears to have a
// dead def it uses a frame index which might expand into a multi
// instruction sequence during EPI.
- DEBUG(dbgs() << " Ignoring, operand is frame index\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, operand is frame index\n");
continue;
}
if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) {
// It is not allowed to write to the same register (not even the zero
// register) twice in a single instruction.
- DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Ignoring, XZR or WZR already used by the instruction\n");
continue;
}
if (shouldSkip(MI, MF)) {
- DEBUG(dbgs() << " Ignoring, Atomic instruction with acquire semantics using WZR/XZR\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, Atomic instruction with acquire "
+ "semantics using WZR/XZR\n");
continue;
}
@@ -163,30 +166,30 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
(!MO.isDead() && !MRI->use_nodbg_empty(Reg)))
continue;
assert(!MO.isImplicit() && "Unexpected implicit def!");
- DEBUG(dbgs() << " Dead def operand #" << I << " in:\n ";
- MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n ";
+ MI.print(dbgs()));
// Be careful not to change the register if it's a tied operand.
if (MI.isRegTiedToUseOperand(I)) {
- DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
continue;
}
const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF);
unsigned NewReg;
if (RC == nullptr) {
- DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
continue;
} else if (RC->contains(AArch64::WZR))
NewReg = AArch64::WZR;
else if (RC->contains(AArch64::XZR))
NewReg = AArch64::XZR;
else {
- DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
continue;
}
- DEBUG(dbgs() << " Replacing with zero register. New:\n ");
+ LLVM_DEBUG(dbgs() << " Replacing with zero register. New:\n ");
MO.setReg(NewReg);
MO.setIsDead();
- DEBUG(MI.print(dbgs()));
+ LLVM_DEBUG(MI.print(dbgs()));
++NumDeadDefsReplaced;
Changed = true;
// Only replace one dead register, see check for zero register above.
@@ -204,7 +207,7 @@ bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
MRI = &MF.getRegInfo();
- DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+ LLVM_DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
Changed = false;
for (auto &MBB : MF)
processMachineBasicBlock(MBB);
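
A hedged sketch, outside the diff, of the zero-register choice the dead-definitions pass above makes: a dead GPR def is retargeted to WZR or XZR depending on which zero register its register class contains, and anything else is skipped. The booleans stand in for RC->contains(AArch64::WZR/XZR).

#include <optional>

enum class ZeroReg { WZR, XZR };

static std::optional<ZeroReg> pickZeroReg(bool ClassHasWZR, bool ClassHasXZR) {
  if (ClassHasWZR)
    return ZeroReg::WZR;   // 32-bit GPR class
  if (ClassHasXZR)
    return ZeroReg::XZR;   // 64-bit GPR class
  return std::nullopt;     // not a GPR class; the pass leaves the def alone
}
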
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index c3842785f2be..9226a9dd879b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -66,6 +66,11 @@ private:
MachineBasicBlock::iterator &NextMBBI);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
+ bool expandMOVImmSimple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned BitSize,
+ unsigned OneChunks,
+ unsigned ZeroChunks);
bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
@@ -83,7 +88,7 @@ char AArch64ExpandPseudo::ID = 0;
INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
AARCH64_EXPAND_PSEUDO_NAME, false, false)
-/// \brief Transfer implicit operands on the pseudo instruction to the
+/// Transfer implicit operands on the pseudo instruction to the
/// instructions created from the expansion.
static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
MachineInstrBuilder &DefMI) {
@@ -99,7 +104,7 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
}
}
-/// \brief Helper function which extracts the specified 16-bit chunk from a
+/// Helper function which extracts the specified 16-bit chunk from a
/// 64-bit value.
static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
assert(ChunkIdx < 4 && "Out of range chunk index specified!");
@@ -107,58 +112,7 @@ static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
}
-/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
-/// value. Indices correspond to element numbers in a v4i16.
-static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
- assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
- const unsigned ShiftAmt = ToIdx * 16;
-
- // Replicate the source chunk to the destination position.
- const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
- // Clear the destination chunk.
- Imm &= ~(0xFFFFLL << ShiftAmt);
- // Insert the replicated chunk.
- return Imm | Chunk;
-}
-
-/// \brief Helper function which tries to materialize a 64-bit value with an
-/// ORR + MOVK instruction sequence.
-static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const AArch64InstrInfo *TII, unsigned ChunkIdx) {
- assert(ChunkIdx < 4 && "Out of range chunk index specified!");
- const unsigned ShiftAmt = ChunkIdx * 16;
-
- uint64_t Encoding;
- if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
- // Create the ORR-immediate instruction.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .add(MI.getOperand(0))
- .addReg(AArch64::XZR)
- .addImm(Encoding);
-
- // Create the MOVK instruction.
- const unsigned Imm16 = getChunk(UImm, ChunkIdx);
- const unsigned DstReg = MI.getOperand(0).getReg();
- const bool DstIsDead = MI.getOperand(0).isDead();
- MachineInstrBuilder MIB1 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addImm(Imm16)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
-
- transferImpOps(MI, MIB, MIB1);
- MI.eraseFromParent();
- return true;
- }
-
- return false;
-}
-
-/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
+/// Check whether the given 16-bit chunk replicated to full 64-bit width
/// can be materialized with an ORR instruction.
static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
@@ -166,7 +120,7 @@ static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
}
-/// \brief Check for identical 16-bit chunks within the constant and if so
+/// Check for identical 16-bit chunks within the constant and if so
/// materialize them with a single ORR instruction. The remaining one or two
/// 16-bit chunks will be materialized with MOVK instructions.
///
@@ -260,7 +214,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
return false;
}
-/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
+/// Check whether this chunk matches the pattern '1...0...'. This pattern
/// starts a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
static bool isStartChunk(uint64_t Chunk) {
@@ -270,7 +224,7 @@ static bool isStartChunk(uint64_t Chunk) {
return isMask_64(~Chunk);
}
-/// \brief Check whether this chunk matches the pattern '0...1...' This pattern
+/// Check whether this chunk matches the pattern '0...1...' This pattern
/// ends a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
static bool isEndChunk(uint64_t Chunk) {
@@ -280,7 +234,7 @@ static bool isEndChunk(uint64_t Chunk) {
return isMask_64(Chunk);
}
-/// \brief Clear or set all bits in the chunk at the given index.
+/// Clear or set all bits in the chunk at the given index.
static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
const uint64_t Mask = 0xFFFF;
@@ -294,7 +248,7 @@ static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
return Imm;
}
-/// \brief Check whether the constant contains a sequence of contiguous ones,
+/// Check whether the constant contains a sequence of contiguous ones,
/// which might be interrupted by one or two chunks. If so, materialize the
/// sequence of contiguous ones with an ORR instruction.
/// Materialize the chunks which are either interrupting the sequence or outside
@@ -423,7 +377,7 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
return true;
}
-/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
/// real move-immediate instructions to synthesize the immediate.
bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -440,7 +394,22 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
- // Try a MOVI instruction (aka ORR-immediate with the zero register).
+ // Scan the immediate and count the number of 16-bit chunks which are either
+ // all ones or all zeros.
+ unsigned OneChunks = 0;
+ unsigned ZeroChunks = 0;
+ for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+ const unsigned Chunk = (Imm >> Shift) & Mask;
+ if (Chunk == Mask)
+ OneChunks++;
+ else if (Chunk == 0)
+ ZeroChunks++;
+ }
+
+ // FIXME: Prefer MOVZ/MOVN over ORR because of the rules for the "mov"
+ // alias.
+
+ // Try a single ORR.
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
@@ -455,74 +424,69 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
- // Scan the immediate and count the number of 16-bit chunks which are either
- // all ones or all zeros.
- unsigned OneChunks = 0;
- unsigned ZeroChunks = 0;
+ // Two instruction sequences.
+ //
+ // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
+ // fastest sequence with fast literal generation.
+ if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2)
+ return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+
+ assert(BitSize == 64 && "All 32-bit immediates can be expanded with a"
+ "MOVZ/MOVK pair");
+
+ // Try other two-instruction sequences.
+
+ // 64-bit ORR followed by MOVK.
+ // We try to construct the ORR immediate in three different ways: either we
+ // zero out the chunk which will be replaced, we fill the chunk which will
+ // be replaced with ones, or we take the bit pattern from the other half of
+ // the 64-bit immediate. This is comprehensive because of the way ORR
+ // immediates are constructed.
for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
- const unsigned Chunk = (Imm >> Shift) & Mask;
- if (Chunk == Mask)
- OneChunks++;
- else if (Chunk == 0)
- ZeroChunks++;
- }
+ uint64_t ShiftedMask = (0xFFFFULL << Shift);
+ uint64_t ZeroChunk = UImm & ~ShiftedMask;
+ uint64_t OneChunk = UImm | ShiftedMask;
+ uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
+ uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
+ if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
+ AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
+ AArch64_AM::processLogicalImmediate(ReplicateChunk,
+ BitSize, Encoding)) {
+ // Create the ORR-immediate instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+ .add(MI.getOperand(0))
+ .addReg(AArch64::XZR)
+ .addImm(Encoding);
+
+ // Create the MOVK instruction.
+ const unsigned Imm16 = getChunk(UImm, Shift / 16);
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
- // Since we can't materialize the constant with a single ORR instruction,
- // let's see whether we can materialize 3/4 of the constant with an ORR
- // instruction and use an additional MOVK instruction to materialize the
- // remaining 1/4.
- //
- // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
- //
- // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
- // we would create the following instruction sequence:
- //
- // ORR x0, xzr, |A|X|A|X|
- // MOVK x0, |B|, LSL #16
- //
- // Only look at 64-bit constants which can't be materialized with a single
- // instruction e.g. which have less than either three all zero or all one
- // chunks.
- //
- // Ignore 32-bit constants here, they always can be materialized with a
- // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
- // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
- // Thus we fall back to the default code below which in the best case creates
- // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
- //
- if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
- // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
- // identical?
- if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 3 into element 1.
- uint64_t OrrImm = replicateChunk(UImm, 3, 1);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
- return true;
-
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 1 into element 3.
- OrrImm = replicateChunk(UImm, 1, 3);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
- return true;
-
- // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
- // identical?
- } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 2 into element 0.
- uint64_t OrrImm = replicateChunk(UImm, 2, 0);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
- return true;
-
- // See if we can come up with a constant which can be materialized with
- // ORR-immediate by replicating element 1 into element 3.
- OrrImm = replicateChunk(UImm, 0, 2);
- if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
- return true;
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
}
}
+ // FIXME: Add more two-instruction sequences.
+
+ // Three instruction sequences.
+ //
+ // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
+ // the fastest sequence with fast literal generation. (If neither MOVK is
+ // part of a fast literal generation pair, it could be slower than the
+ // four-instruction sequence, but we won't worry about that for now.)
+ if (OneChunks || ZeroChunks)
+ return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+
// Check for identical 16-bit chunks within the constant and if so materialize
// them with a single ORR instruction. The remaining one or two 16-bit chunks
// will be materialized with MOVK instructions.
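
A self-contained sketch, under the same definitions as expandMOVImm above, of the 16-bit chunk scan that drives the choice between the two-, three- and four-instruction sequences; countChunks is a hypothetical helper, not part of the patch.

#include <cstdint>

struct ChunkCounts { unsigned Ones = 0, Zeros = 0; };

static ChunkCounts countChunks(uint64_t Imm, unsigned BitSize) {
  const uint64_t Mask = 0xFFFF;
  ChunkCounts C;
  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
    const uint64_t Chunk = (Imm >> Shift) & Mask;
    if (Chunk == Mask)
      ++C.Ones;
    else if (Chunk == 0)
      ++C.Zeros;
  }
  return C;
}
// With BitSize == 64, (BitSize / 16) - 2 == 2, so an immediate with at least
// two all-one or two all-zero chunks fits a MOVZ/MOVN plus one MOVK; a single
// such chunk still allows MOVZ/MOVN plus two MOVKs before the general
// four-instruction fallback in expandMOVImmSimple.
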
@@ -537,6 +501,23 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
return true;
+ // We found no possible two or three instruction sequence; use the general
+ // four-instruction sequence.
+ return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
+/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
+bool AArch64ExpandPseudo::expandMOVImmSimple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned BitSize,
+ unsigned OneChunks,
+ unsigned ZeroChunks) {
+ MachineInstr &MI = *MBBI;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ uint64_t Imm = MI.getOperand(1).getImm();
+ const unsigned Mask = 0xFFFF;
+
// Use a MOVZ or MOVN instruction to set the high bits, followed by one or
// more MOVK instructions to insert additional 16-bit portions into the
// lower bits.
@@ -778,7 +759,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
return true;
}
-/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -911,6 +892,16 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::ADDlowTLS:
+ // Produce a plain ADD
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addImm(0);
+ MI.eraseFromParent();
+ return true;
+
case AArch64::MOVbaseTLS: {
unsigned DstReg = MI.getOperand(0).getReg();
auto SysReg = AArch64SysReg::TPIDR_EL0;
@@ -980,7 +971,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return false;
}
-/// \brief Iterate over the instructions in basic block MBB and expand any
+/// Iterate over the instructions in basic block MBB and expand any
/// pseudo instructions. Return true if anything was modified.
bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
@@ -1004,7 +995,7 @@ bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
return Modified;
}
-/// \brief Returns an instance of the pseudo instruction expansion pass.
+/// Returns an instance of the pseudo instruction expansion pass.
FunctionPass *llvm::createAArch64ExpandPseudoPass() {
return new AArch64ExpandPseudo();
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index 0d00dab598d5..bc9a5ca97fea 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -10,7 +10,7 @@
/// that may inhibit the HW prefetching. This is done in two steps. Before
/// ISel, we mark strided loads (i.e. those that will likely benefit from
/// prefetching) with metadata. Then, after opcodes have been finalized, we
-/// insert MOVs and re-write loads to prevent unintnentional tag collisions.
+/// insert MOVs and re-write loads to prevent unintentional tag collisions.
// ===---------------------------------------------------------------------===//
#include "AArch64.h"
@@ -60,7 +60,7 @@ STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
STATISTIC(NumCollisionsAvoided,
"Number of HW prefetch tag collisions avoided");
STATISTIC(NumCollisionsNotAvoided,
- "Number of HW prefetch tag collisions not avoided due to lack of regsiters");
+ "Number of HW prefetch tag collisions not avoided due to lack of registers");
DEBUG_COUNTER(FixCounter, "falkor-hwpf",
"Controls which tag collisions are avoided");
@@ -169,7 +169,7 @@ bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
MDNode::get(LoadI->getContext(), {}));
++NumStridedLoadsMarked;
- DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
+ LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
MadeChange = true;
}
}
@@ -190,6 +190,7 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -730,10 +731,10 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
continue;
bool Fixed = false;
- DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
+ LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
if (!DebugCounter::shouldExecute(FixCounter)) {
- DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
+ LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
continue;
}
@@ -758,8 +759,8 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
if (TagMap.count(NewTag))
continue;
- DEBUG(dbgs() << "Changing base reg to: " << printReg(ScratchReg, TRI)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Changing base reg to: "
+ << printReg(ScratchReg, TRI) << '\n');
// Rewrite:
// Xd = LOAD Xb, off
@@ -777,8 +778,8 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
// If the load does a pre/post increment, then insert a MOV after as
// well to update the real base register.
if (LdI.IsPrePost) {
- DEBUG(dbgs() << "Doing post MOV of incremented reg: "
- << printReg(ScratchReg, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
+ << printReg(ScratchReg, TRI) << '\n');
MI.getOperand(0).setReg(
ScratchReg); // Change tied operand pre/post update dest.
BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
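
A small sketch of the DebugCounter gate used by the Falkor fix above; DEBUG_COUNTER and DebugCounter::shouldExecute come from llvm/Support/DebugCounter.h, and the counter name here is hypothetical. Counters can be driven from the command line (typically -debug-counter=<name>-skip=N,<name>-count=M, though the exact spelling may vary by LLVM version) to bisect which transformations are allowed to fire.

#include "llvm/Support/DebugCounter.h"

using namespace llvm;

DEBUG_COUNTER(ExampleCounter, "my-example-counter", // hypothetical name
              "Controls which candidate rewrites are performed");

static bool mayRewrite() {
  // Returns false once the counter's budget is exhausted, letting a bad
  // transformation be narrowed down without rebuilding the compiler.
  return DebugCounter::shouldExecute(ExampleCounter);
}
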
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 974f968ec2c4..43a3ae77a170 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -35,7 +35,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
@@ -66,6 +65,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
@@ -307,7 +307,7 @@ public:
#include "AArch64GenCallingConv.inc"
-/// \brief Check if the sign-/zero-extend will be a noop.
+/// Check if the sign-/zero-extend will be a noop.
static bool isIntExtFree(const Instruction *I) {
assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
"Unexpected integer extend instruction.");
@@ -326,7 +326,7 @@ static bool isIntExtFree(const Instruction *I) {
return false;
}
-/// \brief Determine the implicit scale factor that is applied by a memory
+/// Determine the implicit scale factor that is applied by a memory
/// operation for a given value type.
static unsigned getImplicitScaleFactor(MVT VT) {
switch (VT.SimpleTy) {
@@ -535,7 +535,7 @@ unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) {
return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true);
}
-/// \brief Check if the multiply is by a power-of-2 constant.
+/// Check if the multiply is by a power-of-2 constant.
static bool isMulPowOf2(const Value *I) {
if (const auto *MI = dyn_cast<MulOperator>(I)) {
if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0)))
@@ -964,7 +964,7 @@ bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
return TLI.isTypeLegal(VT);
}
-/// \brief Determine if the value type is supported by FastISel.
+/// Determine if the value type is supported by FastISel.
///
/// FastISel for AArch64 can handle more value types than are legal. This adds
/// simple value type such as i1, i8, and i16.
@@ -1524,7 +1524,7 @@ unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
IsZExt);
}
-/// \brief This method is a wrapper to simplify add emission.
+/// This method is a wrapper to simplify add emission.
///
/// First try to emit an add with an immediate operand using emitAddSub_ri. If
/// that fails, then try to materialize the immediate into a register and use
@@ -2254,7 +2254,7 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
}
}
-/// \brief Try to emit a combined compare-and-branch instruction.
+/// Try to emit a combined compare-and-branch instruction.
bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
const CmpInst *CI = cast<CmpInst>(BI->getCondition());
@@ -2607,7 +2607,7 @@ bool AArch64FastISel::selectCmp(const Instruction *I) {
return true;
}
-/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false'
+/// Optimize selects of i1 if one of the operands has a 'true' or 'false'
/// value.
bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
if (!SI->getType()->isIntegerTy(1))
@@ -3322,7 +3322,7 @@ bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
return true;
}
-/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
const Instruction *I,
@@ -3457,7 +3457,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Small memcpy's are common enough that we want to do them without a call
// if possible.
uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue();
- unsigned Alignment = MTI->getAlignment();
+ unsigned Alignment = MinAlign(MTI->getDestAlignment(),
+ MTI->getSourceAlignment());
if (isMemCpySmall(Len, Alignment)) {
Address Dest, Src;
if (!computeAddress(MTI->getRawDest(), Dest) ||
@@ -3477,7 +3478,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
- return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+ return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -3493,7 +3494,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// address spaces.
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
}
case Intrinsic::sin:
case Intrinsic::cos:
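
A worked sketch of the conservative alignment now fed to the small-memcpy fast path above: MinAlign (llvm/Support/MathExtras.h) yields the largest power of two dividing both the destination and source alignments. The local reimplementation below is only for illustration.

#include <cassert>
#include <cstdint>

static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B)); // lowest set bit of A|B
}

int main() {
  assert(minAlign(8, 4) == 4);   // copy is only guaranteed 4-byte aligned
  assert(minAlign(16, 16) == 16);
  assert(minAlign(8, 2) == 2);
  return 0;
}
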
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 789200b28445..6dc5d19862a9 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -140,8 +140,19 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
cl::desc("enable use of redzone on AArch64"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ ReverseCSRRestoreSeq("reverse-csr-restore-seq",
+ cl::desc("reverse the CSR restore sequence"),
+ cl::init(false), cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+/// This is the biggest offset to the stack pointer we can encode in aarch64
+/// instructions (without using a separate calculation and a temp register).
+/// Note that the exception here are vector stores/loads which cannot encode any
+/// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
+static const unsigned DefaultSafeSPDisplacement = 255;
+
/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
@@ -151,7 +162,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
// realistically that's not a big deal at this stage of the game.
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue() || MI.isPseudo() ||
+ if (MI.isDebugInstr() || MI.isPseudo() ||
MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::ADDSXri)
continue;
@@ -167,7 +178,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
}
}
}
- return 255;
+ return DefaultSafeSPDisplacement;
}
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
@@ -191,11 +202,25 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
// Retain behavior of always omitting the FP for leaf functions when possible.
- return (MFI.hasCalls() &&
- MF.getTarget().Options.DisableFramePointerElim(MF)) ||
- MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
- MFI.hasStackMap() || MFI.hasPatchPoint() ||
- RegInfo->needsStackRealignment(MF);
+ if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF))
+ return true;
+ if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ RegInfo->needsStackRealignment(MF))
+ return true;
+ // With large callframes around we may need to use FP to access the scavenging
+ // emergency spillslot.
+ //
+ // Unfortunately some calls to hasFP() like machine verifier ->
+ // getReservedReg() -> hasFP in the middle of global isel are too early
+ // to know the max call frame size. Hopefully conservatively returning "true"
+ // in those cases is fine.
+ // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
+ if (!MFI.isMaxCallFrameSizeComputed() ||
+ MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
+ return true;
+
+ return false;
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -349,7 +374,8 @@ static bool windowsRequiresStackProbe(MachineFunction &MF,
F.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
- return StackSizeInBytes >= StackProbeSize;
+ return (StackSizeInBytes >= StackProbeSize) &&
+ !F.hasFnAttribute("no-stack-arg-probe");
}
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
@@ -388,6 +414,14 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+ // Ignore instructions that do not operate on SP, i.e. shadow call stack
+ // instructions.
+ while (MBBI->getOpcode() == AArch64::STRXpost ||
+ MBBI->getOpcode() == AArch64::LDRXpre) {
+ assert(MBBI->getOperand(0).getReg() != AArch64::SP);
+ ++MBBI;
+ }
+
unsigned NewOpc;
bool NewIsUnscaled = false;
switch (MBBI->getOpcode()) {
@@ -455,6 +489,14 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
unsigned LocalStackSize) {
unsigned Opc = MI.getOpcode();
+
+ // Ignore instructions that do not operate on SP, i.e. shadow call stack
+ // instructions.
+ if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre) {
+ assert(MI.getOperand(0).getReg() != AArch64::SP);
+ return;
+ }
+
(void)Opc;
assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi ||
Opc == AArch64::STRXui || Opc == AArch64::STRDui ||
@@ -472,6 +514,38 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
}
+static void adaptForLdStOpt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator FirstSPPopI,
+ MachineBasicBlock::iterator LastPopI) {
+ // Sometimes (when we restore in the same order as we save), we can end up
+ // with code like this:
+ //
+ // ldp x26, x25, [sp]
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // add sp, sp, #64
+ //
+ // In this case, it is always better to put the first ldp at the end, so
+ // that the load-store optimizer can run and merge the ldp and the add into
+ // a post-index ldp.
+ // If we managed to grab the first pop instruction, move it to the end.
+ if (ReverseCSRRestoreSeq)
+ MBB.splice(FirstSPPopI, &MBB, LastPopI);
+ // We should end up with something like this now:
+ //
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // ldp x26, x25, [sp]
+ // add sp, sp, #64
+ //
+ // and the load-store optimizer can merge the last two instructions into:
+ //
+ // ldp x26, x25, [sp], #64
+ //
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -485,6 +559,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
bool needsFrameMoves = MMI.hasDebugInfo() || F.needsUnwindTableEntry();
bool HasFP = hasFP(MF);
+ // At this point, we're going to decide whether or not the function uses a
+ // redzone. In most cases, the function doesn't have a redzone so let's
+ // assume that's false and set it to true in the case that there's a redzone.
+ AFI->setHasRedZone(false);
+
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
@@ -505,9 +584,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
return;
// REDZONE: If the stack size is less than 128 bytes, we don't need
// to actually allocate.
- if (canUseRedZone(MF))
+ if (canUseRedZone(MF)) {
+ AFI->setHasRedZone(true);
++NumRedZoneFunctions;
- else {
+ } else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
@@ -823,14 +903,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ uint64_t AfterCSRPopSize = ArgumentPopSize;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
-
- if (!CombineSPBump && PrologueSaveSize != 0)
- convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
+ // Assume we can't combine the last pop with the sp restore.
+
+ if (!CombineSPBump && PrologueSaveSize != 0) {
+ MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
+ // Converting the last ldp to a post-index ldp is valid only if the last
+ // ldp's offset is 0.
+ const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
+ // If the offset is 0, convert it to a post-index ldp.
+ if (OffsetOp.getImm() == 0) {
+ convertCalleeSaveRestoreToSPPrePostIncDec(MBB, Pop, DL, TII,
+ PrologueSaveSize);
+ } else {
+ // If not, make sure to emit an add after the last ldp.
+ // We're doing this by transfering the size to be restored from the
+ // adjustment *before* the CSR pops to the adjustment *after* the CSR
+ // pops.
+ AfterCSRPopSize += PrologueSaveSize;
+ }
+ }
// Move past the restores of the callee-saved registers.
+ // If we plan on combining the sp bump of the local stack size and the callee
+ // save stack size, we might need to adjust the CSR save and restore offsets.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
@@ -845,7 +943,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- NumBytes + ArgumentPopSize, TII,
+ NumBytes + AfterCSRPopSize, TII,
MachineInstr::FrameDestroy);
return;
}
@@ -857,19 +955,27 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
// stack pointer (but we may need to pop stack args for fastcc).
- if (RedZone && ArgumentPopSize == 0)
+ if (RedZone && AfterCSRPopSize == 0)
return;
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
- StackRestoreBytes += ArgumentPopSize;
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ StackRestoreBytes += AfterCSRPopSize;
+
// If we were able to combine the local stack pop with the argument pop,
// then we're done.
- if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+ bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
+
+ // If we're done after this, make sure to help the load store optimizer.
+ if (Done)
+ adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
+
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ if (Done)
return;
+
NumBytes = 0;
}
@@ -888,9 +994,24 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
// code in the prologue.
- if (ArgumentPopSize)
- emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- ArgumentPopSize, TII, MachineInstr::FrameDestroy);
+ if (AfterCSRPopSize) {
+ // Find an insertion point for the first ldp so that it goes before the
+ // shadow call stack epilog instruction. This ensures that the restore of
+ // lr from x18 is placed after the restore from sp.
+ auto FirstSPPopI = MBB.getFirstTerminator();
+ while (FirstSPPopI != Begin) {
+ auto Prev = std::prev(FirstSPPopI);
+ if (Prev->getOpcode() != AArch64::LDRXpre ||
+ Prev->getOperand(0).getReg() == AArch64::SP)
+ break;
+ FirstSPPopI = Prev;
+ }
+
+ adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
+
+ emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
+ AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
+ }
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -938,20 +1059,36 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
// the CSR area.
assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
UseFP = true;
- } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF) &&
- !RegInfo->needsStackRealignment(MF)) {
- // Use SP or FP, whichever gives us the best chance of the offset
- // being in range for direct access. If the FPOffset is positive,
- // that'll always be best, as the SP will be even further away.
+ } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
// If the FPOffset is negative, we have to keep in mind that the
// available offset range for negative offsets is smaller than for
- // positive ones. If we have variable sized objects, we're stuck with
- // using the FP regardless, though, as the SP offset is unknown
- // and we don't have a base pointer available. If an offset is
+ // positive ones. If an offset is
// available via the FP and the SP, use whichever is closest.
- if (PreferFP || MFI.hasVarSizedObjects() || FPOffset >= 0 ||
- (FPOffset >= -256 && Offset > -FPOffset))
+ bool FPOffsetFits = FPOffset >= -256;
+ PreferFP |= Offset > -FPOffset;
+
+ if (MFI.hasVarSizedObjects()) {
+ // If we have variable sized objects, we can use either FP or BP, as the
+ // SP offset is unknown. We can use the base pointer if we have one and
+ // FP is not preferred. If not, we're stuck with using FP.
+ bool CanUseBP = RegInfo->hasBasePointer(MF);
+ if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
+ UseFP = PreferFP;
+ else if (!CanUseBP) // Can't use BP. Forced to use FP.
+ UseFP = true;
+ // else we can use BP and FP, but the offset from FP won't fit.
+ // That will make us scavenge registers which we can probably avoid by
+ // using BP. If it won't fit for BP either, we'll scavenge anyway.
+ } else if (FPOffset >= 0) {
+ // Use SP or FP, whichever gives us the best chance of the offset
+ // being in range for direct access. If the FPOffset is positive,
+ // that'll always be best, as the SP will be even further away.
UseFP = true;
+ } else {
+ // We have the choice between FP and (SP or BP).
+ if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
+ UseFP = true;
+ }
}
}
@@ -968,6 +1105,8 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
if (RegInfo->hasBasePointer(MF))
FrameReg = RegInfo->getBaseRegister();
else {
+ assert(!MFI.hasVarSizedObjects() &&
+ "Can't use SP when we have var sized objects.");
FrameReg = AArch64::SP;
// If we're using the red zone for this function, the SP won't actually
// be adjusted, so the offsets will be negative. They're also all
@@ -1015,7 +1154,8 @@ struct RegPairInfo {
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {
+ const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
+ bool &NeedShadowCallStackProlog) {
if (CSI.empty())
return;
@@ -1049,6 +1189,15 @@ static void computeCalleeSaveRegisterPairs(
RPI.Reg2 = NextReg;
}
+ // If either of the registers to be saved is the lr register, it means that
+ // we also need to save lr in the shadow call stack.
+ if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
+ MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
+ if (!MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
+ report_fatal_error("Must reserve x18 to use shadow call stack");
+ NeedShadowCallStackProlog = true;
+ }
+
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
@@ -1099,9 +1248,24 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ bool NeedShadowCallStackProlog = false;
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
+ NeedShadowCallStackProlog);
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (NeedShadowCallStackProlog) {
+ // Shadow call stack prolog: str x30, [x18], #8
+ BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
+ .addReg(AArch64::X18, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::X18)
+ .addImm(8)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // This instruction also makes x18 live-in to the entry block.
+ MBB.addLiveIn(AArch64::X18);
+ }
+
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
RegPairInfo RPI = *RPII;
@@ -1123,13 +1287,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
else
StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
- DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
- if (RPI.isPaired())
- dbgs() << ", " << printReg(Reg2, TRI);
- dbgs() << ") -> fi#(" << RPI.FrameIdx;
- if (RPI.isPaired())
- dbgs() << ", " << RPI.FrameIdx+1;
- dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
@@ -1165,11 +1327,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
if (MI != MBB.end())
DL = MI->getDebugLoc();
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ bool NeedShadowCallStackProlog = false;
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
+ NeedShadowCallStackProlog);
- for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
- ++RPII) {
- RegPairInfo RPI = *RPII;
+ auto EmitMI = [&](const RegPairInfo &RPI) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -1186,13 +1348,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
else
LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
- DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
- if (RPI.isPaired())
- dbgs() << ", " << printReg(Reg2, TRI);
- dbgs() << ") -> fi#(" << RPI.FrameIdx;
- if (RPI.isPaired())
- dbgs() << ", " << RPI.FrameIdx+1;
- dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
@@ -1208,7 +1368,25 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
MachineMemOperand::MOLoad, 8, 8));
+ };
+
+ if (ReverseCSRRestoreSeq)
+ for (const RegPairInfo &RPI : reverse(RegPairs))
+ EmitMI(RPI);
+ else
+ for (const RegPairInfo &RPI : RegPairs)
+ EmitMI(RPI);
+
+ if (NeedShadowCallStackProlog) {
+ // Shadow call stack epilog: ldr x30, [x18, #-8]!
+ BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
+ .addReg(AArch64::X18, RegState::Define)
+ .addReg(AArch64::LR, RegState::Define)
+ .addReg(AArch64::X18)
+ .addImm(-8)
+ .setMIFlag(MachineInstr::FrameDestroy);
}
+
return true;
}
@@ -1283,10 +1461,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
- DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
- for (unsigned Reg : SavedRegs.set_bits())
- dbgs() << ' ' << printReg(Reg, RegInfo);
- dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+ for (unsigned Reg
+ : SavedRegs.set_bits()) dbgs()
+ << ' ' << printReg(Reg, RegInfo);
+ dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
unsigned NumRegsSpilled = SavedRegs.count();
@@ -1295,7 +1474,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
- DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+ LLVM_DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
bool BigStack = (CFSize > EstimatedStackSizeLimit);
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
@@ -1309,8 +1488,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// here.
if (BigStack) {
if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
- DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
- << " to get a scratch register.\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
+ << " to get a scratch register.\n");
SavedRegs.set(UnspilledCSGPR);
// MachO's compact unwind format relies on all registers being stored in
// pairs, so if we need to spill one extra for BigStack, then we need to
@@ -1330,8 +1509,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned Align = TRI->getSpillAlignment(RC);
int FI = MFI.CreateStackObject(Size, Align, false);
RS->addScavengingFrameIndex(FI);
- DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
- << " as the emergency spill slot.\n");
+ LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+ << " as the emergency spill slot.\n");
}
}
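
A condensed sketch, not part of the patch, of the base-register decision the reworked resolveFrameIndexReference above makes once stack re-alignment is ruled out; FPOffsetFits stands for the FP offset lying in the signed 9-bit unscaled range (>= -256).

enum class BaseReg { FP, SPorBP };

static BaseReg pickBase(bool HasVarSizedObjects, bool CanUseBP,
                        bool FPOffsetFits, bool PreferFP, long FPOffset) {
  if (HasVarSizedObjects) {
    // SP offset is unknown here: take BP when it exists and FP is not the
    // better fit; without a BP we are stuck with FP.
    if (FPOffsetFits && CanUseBP)
      return PreferFP ? BaseReg::FP : BaseReg::SPorBP;
    return CanUseBP ? BaseReg::SPorBP : BaseReg::FP;
  }
  if (FPOffset >= 0)
    return BaseReg::FP; // FP can never be further away than SP in this case.
  // Negative FP offset: prefer FP only when it both fits and is closer.
  return (FPOffsetFits && PreferFP) ? BaseReg::FP : BaseReg::SPorBP;
}
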
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 55a256867fab..104e52b5f1f3 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -53,7 +53,7 @@ public:
std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
- /// \brief Can this function use the red zone for local allocations.
+ /// Can this function use the red zone for local allocations.
bool canUseRedZone(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 0b10246b0cc8..c1a9ee333b62 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -168,6 +168,7 @@ public:
bool tryBitfieldExtractOpFromSExt(SDNode *N);
bool tryBitfieldInsertOp(SDNode *N);
bool tryBitfieldInsertInZeroOp(SDNode *N);
+ bool tryShiftAmountMod(SDNode *N);
bool tryReadRegister(SDNode *N);
bool tryWriteRegister(SDNode *N);
@@ -336,7 +337,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
}
}
-/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// Determine whether it is worth it to fold SHL into the addressing
/// mode.
static bool isWorthFoldingSHL(SDValue V) {
assert(V.getOpcode() == ISD::SHL && "invalid opcode");
@@ -360,7 +361,7 @@ static bool isWorthFoldingSHL(SDValue V) {
return true;
}
-/// \brief Determine whether it is worth to fold V into an extended register.
+/// Determine whether it is worth to fold V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
@@ -743,14 +744,16 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
if (!GAN)
return true;
- const GlobalValue *GV = GAN->getGlobal();
- unsigned Alignment = GV->getAlignment();
- Type *Ty = GV->getValueType();
- if (Alignment == 0 && Ty->isSized())
- Alignment = DL.getABITypeAlignment(Ty);
+ if (GAN->getOffset() % Size == 0) {
+ const GlobalValue *GV = GAN->getGlobal();
+ unsigned Alignment = GV->getAlignment();
+ Type *Ty = GV->getValueType();
+ if (Alignment == 0 && Ty->isSized())
+ Alignment = DL.getABITypeAlignment(Ty);
- if (Alignment >= Size)
- return true;
+ if (Alignment >= Size)
+ return true;
+ }
}
if (CurDAG->isBaseWithConstantOffset(N)) {
@@ -824,7 +827,7 @@ static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
return SDValue(Node, 0);
}
-/// \brief Check if the given SHL node (\p N), can be used to form an
+/// Check if the given SHL node (\p N), can be used to form an
/// extended register for an addressing mode.
bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
bool WantExtend, SDValue &Offset,
@@ -1512,7 +1515,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// Because of simplify-demanded-bits in DAGCombine, the mask may have been
// simplified. Try to undo that
- AndImm |= (1 << NumberOfIgnoredLowBits) - 1;
+ AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
if (AndImm & (AndImm + 1))
@@ -1551,8 +1554,9 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// Bail out on large immediates. This happens when no proper
// combining/constant folding was performed.
if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
- DEBUG((dbgs() << N
- << ": Found large shift immediate, this should not happen\n"));
+ LLVM_DEBUG(
+ (dbgs() << N
+ << ": Found large shift immediate, this should not happen\n"));
return false;
}
@@ -1681,7 +1685,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// later find more redundancy.
Opd0 = N->getOperand(0).getOperand(0);
TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
- VT = Opd0->getValueType(0);
+ VT = Opd0.getValueType();
assert(VT == MVT::i64 && "the promoted type should be i64");
} else if (BiggerPattern) {
// Let's pretend a 0 shift left has been performed.
@@ -1694,8 +1698,9 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// Missing combines/constant folding may have left us with strange
// constants.
if (ShlImm >= VT.getSizeInBits()) {
- DEBUG((dbgs() << N
- << ": Found large shift immediate, this should not happen\n"));
+ LLVM_DEBUG(
+ (dbgs() << N
+ << ": Found large shift immediate, this should not happen\n"));
return false;
}
@@ -2301,7 +2306,7 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
continue;
// Check the second part of the pattern
- EVT VT = OrOpd1->getValueType(0);
+ EVT VT = OrOpd1Val.getValueType();
assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
// Compute the Known Zero for the candidate of the first operand.
@@ -2437,6 +2442,111 @@ bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
return true;
}
+/// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
+/// variable shift/rotate instructions.
+bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc;
+ switch (N->getOpcode()) {
+ case ISD::ROTR:
+ Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
+ break;
+ case ISD::SHL:
+ Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
+ break;
+ case ISD::SRL:
+ Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
+ break;
+ case ISD::SRA:
+ Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
+ break;
+ default:
+ return false;
+ }
+
+ uint64_t Size;
+ uint64_t Bits;
+ if (VT == MVT::i32) {
+ Bits = 5;
+ Size = 32;
+ } else if (VT == MVT::i64) {
+ Bits = 6;
+ Size = 64;
+ } else
+ return false;
+
+ SDValue ShiftAmt = N->getOperand(1);
+ SDLoc DL(N);
+ SDValue NewShiftAmt;
+
+ // Skip over an extend of the shift amount.
+ if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
+ ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
+ ShiftAmt = ShiftAmt->getOperand(0);
+
+ if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
+ SDValue Add0 = ShiftAmt->getOperand(0);
+ SDValue Add1 = ShiftAmt->getOperand(1);
+ uint64_t Add0Imm;
+ uint64_t Add1Imm;
+ // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
+ // to avoid the ADD/SUB.
+ if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
+ NewShiftAmt = Add0;
+ // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
+ // generate a NEG instead of a SUB of a constant.
+ else if (ShiftAmt->getOpcode() == ISD::SUB &&
+ isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
+ (Add0Imm % Size == 0)) {
+ unsigned NegOpc;
+ unsigned ZeroReg;
+ EVT SubVT = ShiftAmt->getValueType(0);
+ if (SubVT == MVT::i32) {
+ NegOpc = AArch64::SUBWrr;
+ ZeroReg = AArch64::WZR;
+ } else {
+ assert(SubVT == MVT::i64);
+ NegOpc = AArch64::SUBXrr;
+ ZeroReg = AArch64::XZR;
+ }
+ SDValue Zero =
+ CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
+ MachineSDNode *Neg =
+ CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
+ NewShiftAmt = SDValue(Neg, 0);
+ } else
+ return false;
+ } else {
+ // If the shift amount is masked with an AND, check that the mask covers the
+ // bits that are implicitly ANDed off by the above opcodes and if so, skip
+ // the AND.
+ uint64_t MaskImm;
+ if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+ return false;
+
+ if (countTrailingOnes(MaskImm) < Bits)
+ return false;
+
+ NewShiftAmt = ShiftAmt->getOperand(0);
+ }
+
+ // Narrow/widen the shift amount to match the size of the shift operation.
+ if (VT == MVT::i32)
+ NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
+ else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
+ SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
+ MachineSDNode *Ext = CurDAG->getMachineNode(
+ AArch64::SUBREG_TO_REG, DL, VT,
+ CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
+ NewShiftAmt = SDValue(Ext, 0);
+ }
+
+ SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+}
+
bool
AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
unsigned RegWidth) {
@@ -2653,14 +2763,9 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
}
void AArch64DAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: ");
- DEBUG(Node->dump(CurDAG));
- DEBUG(errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
@@ -2708,6 +2813,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
if (tryBitfieldInsertInZeroOp(Node))
return;
+ LLVM_FALLTHROUGH;
+ case ISD::ROTR:
+ case ISD::SHL:
+ if (tryShiftAmountMod(Node))
+ return;
break;
case ISD::SIGN_EXTEND:
@@ -2757,9 +2867,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
Node->getOperand(0));
- DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
- DEBUG(Extract->dumpr(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+ LLVM_DEBUG(Extract->dumpr(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
ReplaceNode(Node, Extract.getNode());
return;
}
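
A quick standalone check, under the assumptions tryShiftAmountMod above relies on, that AArch64 variable shifts only consume the low log2(size) bits of the amount register, which is why a covering AND mask or an ADD/SUB of a multiple of the size can be folded away.

#include <cassert>
#include <cstdint>

// Models LSLV on a 64-bit register: the hardware uses only amount bits 5:0.
static uint64_t lslv64(uint64_t X, uint64_t Amt) { return X << (Amt & 63); }

int main() {
  const uint64_t X = 0x123456789abcdef0ULL;
  assert(lslv64(X, 5) == lslv64(X, 5 + 64));   // adding 64 changes nothing
  assert(lslv64(X, 5 & 0xFF) == lslv64(X, 5)); // mask covering bits 5:0 is redundant
  return 0;
}
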
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 233d6be247c2..0c72f2ebee18 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -38,7 +38,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -70,6 +69,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -198,6 +198,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
@@ -253,7 +255,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ if (Subtarget->isTargetWindows())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -463,7 +469,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
@@ -567,9 +579,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
- MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
- MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+ setTargetDAGCombine(ISD::GlobalAddress);
+
+ // In case of strict alignment, avoid an excessive number of byte wide stores.
+ MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemset = Subtarget->requiresStrictAlign()
+ ? MaxStoresPerMemsetOptSize : 32;
+
+ MaxGluedStoresPerMemcpy = 4;
+ MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
+ ? MaxStoresPerMemcpyOptSize : 16;
+
+ MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
setStackPointerRegisterToSaveRestore(AArch64::SP);
@@ -691,9 +713,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
- setOperationAction(ISD::MULHS, VT, Expand);
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
+ setOperationAction(ISD::MULHS, VT, Custom);
+ setOperationAction(ISD::MULHU, VT, Custom);
+ } else {
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ }
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
@@ -715,24 +742,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
+
+ setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
- if (VT == MVT::v2f32 || VT == MVT::v4f16) {
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
-
- setOperationAction(ISD::STORE, VT, Promote);
- AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
- } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
+ assert(VT.isVector() && "VT should be a vector type");
- setOperationAction(ISD::STORE, VT, Promote);
- AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
+ if (VT.isFloatingPoint()) {
+ MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
+ setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
+ setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
}
// Mark vector float intrinsics as expand.
@@ -1431,7 +1454,8 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
static bool isLegalArithImmed(uint64_t C) {
// Matches AArch64DAGToDAGISel::SelectArithImmed().
bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
- DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n"));
+ LLVM_DEBUG(dbgs() << "Is imm " << C
+ << " legal: " << (IsLegal ? "yes\n" : "no\n"));
return IsLegal;
}
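Concretely, the check above accepts any constant that fits in 12 bits, or a 12-bit value shifted left by 12 (a few illustrative cases, not part of the patch):

// isLegalArithImmed(0xFFF)     -> true   (12-bit immediate, LSL #0)
// isLegalArithImmed(0xABC000)  -> true   (12-bit immediate, LSL #12)
// isLegalArithImmed(0x1001)    -> false  (needs 13 bits)
// isLegalArithImmed(0x1000001) -> false  (not a shifted 12-bit value)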
@@ -2474,6 +2498,26 @@ static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
return false;
}
+SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+ SelectionDAG &DAG) const {
+ // The rounding mode is in bits 23:22 of the FPCR.
+ // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2,
+ // 2->3, 3->0. The formula we use to implement this is
+ // ((FPCR + (1 << 22)) >> 22) & 3, so that the shift + and get folded into
+ // a bitfield extract.
+ SDLoc dl(Op);
+
+ SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
+ DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
+ MVT::i64));
+ SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
+ SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
+ DAG.getConstant(1U << 22, dl, MVT::i32));
+ SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
+ DAG.getConstant(22, dl, MVT::i32));
+ return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+ DAG.getConstant(3, dl, MVT::i32));
+}
+
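The mapping implemented above can be checked with a tiny host-side helper (a minimal sketch, assuming the FPCR RMode encoding RN=0, RP=1, RM=2, RZ=3 described in the comment; not code from the patch):

// Maps the FPCR rounding-mode field (bits 23:22) to the FLT_ROUNDS encoding.
static unsigned fpcrToFltRounds(unsigned FPCR) {
  // ((FPCR + (1 << 22)) >> 22) & 3 maps RMode 0->1, 1->2, 2->3, 3->0.
  return ((FPCR + (1u << 22)) >> 22) & 3;
}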
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
@@ -2543,6 +2587,66 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
+// Lower vector multiply high (ISD::MULHS and ISD::MULHU).
+static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
+ // Multiplications are only custom-lowered for 128-bit vectors so that
+ // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not
+ // legal.
+ EVT VT = Op.getValueType();
+ assert(VT.is128BitVector() && VT.isInteger() &&
+ "unexpected type for custom-lowering ISD::MULH{U,S}");
+
+ SDValue V0 = Op.getOperand(0);
+ SDValue V1 = Op.getOperand(1);
+
+ SDLoc DL(Op);
+
+ EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+ // We turn (V0 mulhs/mulhu V1) to:
+ //
+ // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)),
+ // (extract_subvector (ExtractVT V128:V1, (i64 0))))),
+ // (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)),
+ //              (extract_subvector (ExtractVT V128:V1, (i64 VMull2Idx))))))
+ //
+ // Where ExtractVT is a subvector type with half the number of elements, and
+ // VMull2Idx is the index of the middle element (the start of the high half).
+ //
+ // The extract and multiply of the vector's high part will be matched against
+ // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64}, which in turn will
+ // issue a {s,u}mull2 instruction.
+ //
+ // This basically multiplies the lower subvector with '{s,u}mull', the high
+ // subvector with '{s,u}mull2', and shuffles the high parts of both results
+ // into the resulting vector.
+ unsigned Mull2VectorIdx = VT.getVectorNumElements () / 2;
+ SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
+ SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
+
+ SDValue VMullV0 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
+ SDValue VMullV1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
+
+ SDValue VMull2V0 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
+ SDValue VMull2V1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
+
+ unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
+ : AArch64ISD::UMULL;
+
+ EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
+ SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
+ SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
+
+ Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
+ Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
+
+ return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
+}
+
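For a concrete case, a v8i16 mulhs runs through the code above as follows (an illustrative sketch of the resulting DAG, not code from the patch):

//   V0, V1 : v8i16, Mull2VectorIdx = 4, ExtractVT = v4i16, MullVT = v4i32
//   Mull  = SMULL(extract_subvector(V0, 0), extract_subvector(V1, 0)) : v4i32
//   Mull2 = SMULL(extract_subvector(V0, 4), extract_subvector(V1, 4)) : v4i32
//           // the second SMULL is matched as smull2
//   Res   = UZP2(bitcast(Mull, v8i16), bitcast(Mull2, v8i16))
//           // uzp2 keeps the odd lanes, i.e. the high half of each product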
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2571,10 +2675,72 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+ EVT VT, EVT MemVT,
+ SelectionDAG &DAG) {
+ assert(VT.isVector() && "VT should be a vector type");
+ assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+ SDValue Value = ST->getValue();
+
+ // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
+ // extracts the word lane which represents the v4i8 subvector. It optimizes
+ // the store to:
+ //
+ // xtn v0.8b, v0.8h
+ // str s0, [x0]
+
+ SDValue Undef = DAG.getUNDEF(MVT::i16);
+ SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+ {Undef, Undef, Undef, Undef});
+
+ SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+ Value, UndefVec);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+ Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+ SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+ return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+ ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar, plain or truncating.
+// Currently we only custom-lower truncating stores from vector v4i16 to
+// v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc Dl(Op);
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+ assert (StoreNode && "Can only custom lower store nodes");
+
+ SDValue Value = StoreNode->getValue();
+
+ EVT VT = Value.getValueType();
+ EVT MemVT = StoreNode->getMemoryVT();
+
+ assert (VT.isVector() && "Can only custom lower vector store types");
+
+ unsigned AS = StoreNode->getAddressSpace();
+ unsigned Align = StoreNode->getAlignment();
+ if (Align < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ return scalarizeVectorStore(StoreNode, DAG);
+ }
+
+ if (StoreNode->isTruncatingStore()) {
+ return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
- DEBUG(dbgs() << "Custom lowering: ");
- DEBUG(Op.dump());
+ LLVM_DEBUG(dbgs() << "Custom lowering: ");
+ LLVM_DEBUG(Op.dump());
switch (Op.getOpcode()) {
default:
@@ -2673,10 +2839,17 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
+ case ISD::FLT_ROUNDS_:
+ return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
+ case ISD::MULHS:
+ case ISD::MULHU:
+ return LowerMULH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::STORE:
+ return LowerSTORE(Op, DAG);
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
@@ -2685,6 +2858,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
return LowerVECREDUCE(Op, DAG);
+ case ISD::ATOMIC_LOAD_SUB:
+ return LowerATOMIC_LOAD_SUB(Op, DAG);
+ case ISD::ATOMIC_LOAD_AND:
+ return LowerATOMIC_LOAD_AND(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
}
@@ -3667,7 +3846,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
- return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+ return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
+ N->getOffset(), Flag);
}
SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
@@ -3693,7 +3873,7 @@ SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
- DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
@@ -3706,7 +3886,7 @@ SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
- DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
const unsigned char MO_NC = AArch64II::MO_NC;
@@ -3722,7 +3902,7 @@ SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
- DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
@@ -3742,8 +3922,9 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
- assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
- "unexpected offset in global node");
+ if (OpFlags != AArch64II::MO_NO_FLAG)
+ assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+ "unexpected offset in global node");
// This also catches the large code model case for Darwin.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
@@ -3764,7 +3945,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
return Result;
}
-/// \brief Convert a TLS address reference into the correct sequence of loads
+/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
@@ -3968,16 +4149,77 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
+SDValue
+AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
+
+ SDValue Chain = DAG.getEntryNode();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc DL(Op);
+
+ SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
+
+ // Load the ThreadLocalStoragePointer from the TEB
+ // A pointer to the TLS array is located at offset 0x58 from the TEB.
+ SDValue TLSArray =
+ DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
+ TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
+ Chain = TLSArray.getValue(1);
+
+ // Load the TLS index from the C runtime;
+ // This does the same as getAddr(), but without having a GlobalAddressSDNode.
+ // This also does the same as LOADgot, but using a generic i32 load,
+ // while LOADgot only loads i64.
+ SDValue TLSIndexHi =
+ DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
+ SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
+ "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
+ SDValue TLSIndex =
+ DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
+ TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
+ Chain = TLSIndex.getValue(1);
+
+ // The pointer to the thread's TLS data area is at the TLS index, scaled by
+ // 8, as an offset into the TLS array.
+ TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
+ SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
+ DAG.getConstant(3, DL, PtrVT));
+ SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
+ DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
+ MachinePointerInfo());
+ Chain = TLS.getValue(1);
+
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GA->getGlobal();
+ SDValue TGAHi = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
+ SDValue TGALo = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+ // Add the offset from the start of the .tls section (section base).
+ SDValue Addr =
+ SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
+ DAG.getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
+ return Addr;
+}
+
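The address built by this sequence corresponds to the following pointer arithmetic (a minimal sketch under the documented Windows AArch64 TEB layout; SectionOffset stands in for the variable's offset within the .tls section):

static char *windowsTLSAddr(char *TEB /* x18 */,
                            unsigned TlsIndex /* _tls_index */,
                            unsigned long long SectionOffset) {
  // ThreadLocalStoragePointer lives at TEB + 0x58.
  char **TLSArray = *reinterpret_cast<char ***>(TEB + 0x58);
  char *TLSBase = TLSArray[TlsIndex]; // index scaled by 8 (one pointer per slot)
  return TLSBase + SectionOffset;     // the ADDXri + ADDlow pair above
}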
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
+ if (Subtarget->isTargetWindows())
+ return LowerWindowsGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
}
@@ -4778,9 +5020,13 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
.Case("sp", AArch64::SP)
.Case("x18", AArch64::X18)
.Case("w18", AArch64::W18)
+ .Case("x20", AArch64::X20)
+ .Case("w20", AArch64::W20)
.Default(0);
- if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
- !Subtarget->isX18Reserved())
+ if (((Reg == AArch64::X18 || Reg == AArch64::W18) &&
+ !Subtarget->isX18Reserved()) ||
+ ((Reg == AArch64::X20 || Reg == AArch64::W20) &&
+ !Subtarget->isX20Reserved()))
Reg = 0;
if (Reg)
return Reg;
@@ -4920,10 +5166,8 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
bool AArch64TargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
- DEBUG(dbgs() << "Skipping offset folding global address: ");
- DEBUG(GA->dump());
- DEBUG(dbgs() << "AArch64 doesn't support folding offsets into global "
- "addresses\n");
+ // Offsets are folded in the DAG combine rather than here so that we can
+ // intelligently choose an offset based on the uses.
return false;
}
@@ -4932,7 +5176,8 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// FIXME: We should be able to handle f128 as well with a clever lowering.
if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 ||
(VT == MVT::f16 && Subtarget->hasFullFP16()))) {
- DEBUG(dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
+ LLVM_DEBUG(
+ dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
return true;
}
@@ -4953,14 +5198,17 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
}
if (IsLegal) {
- DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal << "\n");
+ LLVM_DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal
+ << "\n");
return true;
}
if (!FPType.empty())
- DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal << "\n");
+ LLVM_DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal
+ << "\n");
else
- DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal << ": unsupported fp type\n");
+ LLVM_DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal
+ << ": unsupported fp type\n");
return false;
}
@@ -5004,7 +5252,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
+ Flags.setAllowReassociation(true);
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -5014,7 +5262,6 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
-
if (!Reciprocal) {
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
@@ -5044,7 +5291,7 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
+ Flags.setAllowReassociation(true);
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
@@ -5419,7 +5666,7 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
- DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
+ LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
@@ -5455,10 +5702,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(V.getOperand(1))) {
- DEBUG(dbgs() << "Reshuffle failed: "
- "a shuffle can only come from building a vector from "
- "various elements of other vectors, provided their "
- "indices are constant\n");
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: "
+ "a shuffle can only come from building a vector from "
+ "various elements of other vectors, provided their "
+ "indices are constant\n");
return SDValue();
}
@@ -5475,8 +5723,9 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
}
if (Sources.size() > 2) {
- DEBUG(dbgs() << "Reshuffle failed: currently only do something sane when at "
- "most two source vectors are involved\n");
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: currently only do something sane when at "
+ "most two source vectors are involved\n");
return SDValue();
}
@@ -5522,7 +5771,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
- DEBUG(dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
return SDValue();
}
@@ -5568,10 +5818,9 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
}
// Final sanity check before we try to actually produce a shuffle.
- DEBUG(
- for (auto Src : Sources)
- assert(Src.ShuffleVec.getValueType() == ShuffleVT);
- );
+ LLVM_DEBUG(for (auto Src
+ : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
@@ -5604,7 +5853,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
- DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
+ LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
return SDValue();
}
@@ -5616,12 +5865,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[1], Mask);
SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
- DEBUG(
- dbgs() << "Reshuffle, creating node: ";
- Shuffle.dump();
- dbgs() << "Reshuffle, creating node: ";
- V.dump();
- );
+ LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
+ dbgs() << "Reshuffle, creating node: "; V.dump(););
return V;
}
@@ -6256,96 +6501,235 @@ static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
return false;
}
-SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
- SelectionDAG &DAG) const {
- BuildVectorSDNode *BVN =
- dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
- SDValue LHS = Op.getOperand(0);
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+// Try 64-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
- if (!BVN)
- return Op;
+ if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
- APInt CnstBits(VT.getSizeInBits(), 0);
- APInt UndefBits(VT.getSizeInBits(), 0);
- if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
- // We only have BIC vector immediate instruction, which is and-not.
- CnstBits = ~CnstBits;
-
- // We make use of a little bit of goto ickiness in order to avoid having to
- // duplicate the immediate matching logic for the undef toggled case.
- bool SecondTry = false;
- AttemptModImm:
-
- if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
- CnstBits = CnstBits.zextOrTrunc(64);
- uint64_t CnstVal = CnstBits.getZExtValue();
-
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ return SDValue();
+}
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+// Try 32-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits,
+ const SDValue *LHS = nullptr) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ bool isAdvSIMDModImm = false;
+ uint64_t Shift;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
+ Shift = 0;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
+ Shift = 8;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
+ Shift = 16;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
+ Shift = 24;
+ }
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov;
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ if (LHS)
+ Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+ else
+ Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try 16-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits,
+ const SDValue *LHS = nullptr) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ bool isAdvSIMDModImm = false;
+ uint64_t Shift;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
+ Shift = 0;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
+ Shift = 8;
+ }
+
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov;
+
+ if (LHS)
+ Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+ else
+ Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try 32-bit splatted SIMD immediate with shifted ones.
+static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
+ SelectionDAG &DAG, const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ bool isAdvSIMDModImm = false;
+ uint64_t Shift;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
+ Shift = 264;
+ }
+ else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
+ Shift = 272;
+ }
+
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try 8-bit splatted SIMD immediate.
+static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+
+ if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
+
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try FP splatted SIMD immediate.
+static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
+ const APInt &Bits) {
+ if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
+ uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
+ EVT VT = Op.getValueType();
+ bool isWide = (VT.getSizeInBits() == 128);
+ MVT MovTy;
+ bool isAdvSIMDModImm = false;
+
+ if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
+ MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
+ }
+ else if (isWide &&
+ (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
+ Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
+ MovTy = MVT::v2f64;
+ }
+
+ if (isAdvSIMDModImm) {
+ SDLoc dl(Op);
+ SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
+ DAG.getConstant(Value, dl, MVT::i32));
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
+ }
+
+ return SDValue();
+}
- if (SecondTry)
- goto FailedModImm;
- SecondTry = true;
- CnstBits = ~UndefBits;
- goto AttemptModImm;
+SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ if (!BVN) {
+ // AND commutes, so try swapping the operands.
+ LHS = Op.getOperand(1);
+ BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
}
+ if (!BVN)
+ return Op;
-// We can always fall back to a non-immediate AND.
-FailedModImm:
+ APInt DefBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, DefBits, UndefBits)) {
+ SDValue NewOp;
+
+ // We only have BIC vector immediate instruction, which is and-not.
+ DefBits = ~DefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG,
+ DefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG,
+ DefBits, &LHS)))
+ return NewOp;
+
+ UndefBits = ~UndefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG,
+ UndefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG,
+ UndefBits, &LHS)))
+ return NewOp;
+ }
+
+ // We can always fall back to a non-immediate AND.
return Op;
}
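As an example of the rewritten path: for (and v4i32 X, splat 0xFFFFFF00), DefBits is inverted to a splat of 0x000000FF, which tryAdvSIMDModImm32 recognises as mod-imm type 1 (8-bit value, LSL #0), so the node becomes roughly (illustrative values only):

//   (NVCAST v4i32 (BICi v4i32 X, #0xFF, #0))   // i.e. roughly "bic Vd.4s, #0xff"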
@@ -6439,10 +6823,10 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
Shift.getOperand(1));
- DEBUG(dbgs() << "aarch64-lower: transformed: \n");
- DEBUG(N->dump(&DAG));
- DEBUG(dbgs() << "into: \n");
- DEBUG(ResultSLI->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+ LLVM_DEBUG(N->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "into: \n");
+ LLVM_DEBUG(ResultSLI->dump(&DAG));
++NumShiftInserts;
return ResultSLI;
@@ -6456,96 +6840,38 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
return Res;
}
- BuildVectorSDNode *BVN =
- dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
- SDValue LHS = Op.getOperand(1);
- SDLoc dl(Op);
EVT VT = Op.getValueType();
- // OR commutes, so try swapping the operands.
+ SDValue LHS = Op.getOperand(0);
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
if (!BVN) {
- LHS = Op.getOperand(0);
- BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ // OR commutes, so try swapping the operands.
+ LHS = Op.getOperand(1);
+ BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
}
if (!BVN)
return Op;
- APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
- if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
- // We make use of a little bit of goto ickiness in order to avoid having to
- // duplicate the immediate matching logic for the undef toggled case.
- bool SecondTry = false;
- AttemptModImm:
-
- if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
- CnstBits = CnstBits.zextOrTrunc(64);
- uint64_t CnstVal = CnstBits.getZExtValue();
-
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ if (resolveBuildVector(BVN, DefBits, UndefBits)) {
+ SDValue NewOp;
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
- }
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
+ DefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
+ DefBits, &LHS)))
+ return NewOp;
- if (SecondTry)
- goto FailedModImm;
- SecondTry = true;
- CnstBits = UndefBits;
- goto AttemptModImm;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
+ UndefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
+ UndefBits, &LHS)))
+ return NewOp;
}
-// We can always fall back to a non-immediate OR.
-FailedModImm:
+ // We can always fall back to a non-immediate OR.
return Op;
}
@@ -6573,226 +6899,71 @@ static SDValue NormalizeBuildVector(SDValue Op,
return DAG.getBuildVector(VT, dl, Ops);
}
-SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
+static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- Op = NormalizeBuildVector(Op, DAG);
- BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
- APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
- if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
- // We make use of a little bit of goto ickiness in order to avoid having to
- // duplicate the immediate matching logic for the undef toggled case.
- bool SecondTry = false;
- AttemptModImm:
-
- if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
- CnstBits = CnstBits.zextOrTrunc(64);
- uint64_t CnstVal = CnstBits.getZExtValue();
-
- // Certain magic vector constants (used to express things like NOT
- // and NEG) are passed through unmodified. This allows codegen patterns
- // for these operations to match. Special-purpose patterns will lower
- // these immediates to MOVIs if it proves necessary.
- if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
- return Op;
-
- // The many faces of MOVI...
- if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
- if (VT.getSizeInBits() == 128) {
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- // Support the V64 version via subregister insertion.
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(264, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(272, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
- SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- // The few faces of FMOV...
- if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
- SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
- VT.getSizeInBits() == 128) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
- SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
- DAG.getConstant(CnstVal, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- // The many faces of MVNI...
- CnstVal = ~CnstVal;
- if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(16, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(24, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
-
- if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ if (resolveBuildVector(BVN, DefBits, UndefBits)) {
+ SDValue NewOp;
+ if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
+ return NewOp;
+
+ DefBits = ~DefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
+ return NewOp;
+
+ DefBits = UndefBits;
+ if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
+ return NewOp;
+
+ DefBits = ~UndefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
+ return NewOp;
+ }
- if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(8, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+ return SDValue();
+}
- if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(264, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
- }
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
- if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
- CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
- MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
- SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
- DAG.getConstant(CnstVal, dl, MVT::i32),
- DAG.getConstant(272, dl, MVT::i32));
- return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+ // Try to build a simple constant vector.
+ Op = NormalizeBuildVector(Op, DAG);
+ if (VT.isInteger()) {
+ // Certain vector constants, used to express things like logical NOT and
+ // arithmetic NEG, are passed through unmodified. This allows special
+ // patterns for these operations to match, which will lower these constants
+ // to whatever is proven necessary.
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ if (BVN->isConstant())
+ if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
+ unsigned BitSize = VT.getVectorElementType().getSizeInBits();
+ APInt Val(BitSize,
+ Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
+ if (Val.isNullValue() || Val.isAllOnesValue())
+ return Op;
}
- }
-
- if (SecondTry)
- goto FailedModImm;
- SecondTry = true;
- CnstBits = UndefBits;
- goto AttemptModImm;
}
-FailedModImm:
+
+ if (SDValue V = ConstantBuildVector(Op, DAG))
+ return V;
// Scan through the operands to find some interesting properties we can
// exploit:
@@ -6805,16 +6976,21 @@ FailedModImm:
// select the values we'll be overwriting for the non-constant
// lanes such that we can directly materialize the vector
// some other way (MOVI, e.g.), we can be sneaky.
+ // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
+ SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
bool usesOnlyOneConstantValue = true;
bool isConstant = true;
+ bool AllLanesExtractElt = true;
unsigned NumConstantLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
+ if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ AllLanesExtractElt = false;
if (V.isUndef())
continue;
if (i > 0)
@@ -6837,23 +7013,86 @@ FailedModImm:
}
if (!Value.getNode()) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
return DAG.getUNDEF(VT);
}
if (isOnlyLowElement) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
- "SCALAR_TO_VECTOR node\n");
+ LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
+ "SCALAR_TO_VECTOR node\n");
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
}
+ if (AllLanesExtractElt) {
+ SDNode *Vector = nullptr;
+ bool Even = false;
+ bool Odd = false;
+ // Check whether the extract elements match the Even pattern <0,2,4,...> or
+ // the Odd pattern <1,3,5,...>.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ const SDNode *N = V.getNode();
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ break;
+ SDValue N0 = N->getOperand(0);
+
+ // All elements are extracted from the same vector.
+ if (!Vector) {
+ Vector = N0.getNode();
+ // Check that the type of EXTRACT_VECTOR_ELT matches the type of
+ // BUILD_VECTOR.
+ if (VT.getVectorElementType() !=
+ N0.getValueType().getVectorElementType())
+ break;
+ } else if (Vector != N0.getNode()) {
+ Odd = false;
+ Even = false;
+ break;
+ }
+
+ // Extracted values are either at Even indices <0,2,4,...> or at Odd
+ // indices <1,3,5,...>.
+ uint64_t Val = N->getConstantOperandVal(1);
+ if (Val == 2 * i) {
+ Even = true;
+ continue;
+ }
+ if (Val - 1 == 2 * i) {
+ Odd = true;
+ continue;
+ }
+
+ // Something does not match: abort.
+ Odd = false;
+ Even = false;
+ break;
+ }
+ if (Even || Odd) {
+ SDValue LHS =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
+ DAG.getConstant(0, dl, MVT::i64));
+ SDValue RHS =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
+ DAG.getConstant(NumElts, dl, MVT::i64));
+
+ if (Even && !Odd)
+ return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
+ RHS);
+ if (Odd && !Even)
+ return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
+ RHS);
+ }
+ }
+
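For instance, with V : v8i16 the Even pattern handled above turns (an illustrative sketch):

//   build_vector (extract_elt V, 0), (extract_elt V, 2),
//                (extract_elt V, 4), (extract_elt V, 6)      : v4i16
// into
//   UZP1 (extract_subvector V, 0), (extract_subvector V, 4)  : v4i16
// and the Odd pattern <1,3,5,7> similarly becomes UZP2.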
// Use DUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
if (usesOnlyOneValue) {
if (!isConstant) {
if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Value.getValueType() != VT) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
}
@@ -6862,8 +7101,9 @@ FailedModImm:
SDValue Lane = Value.getOperand(1);
Value = Value.getOperand(0);
if (Value.getValueSizeInBits() == 64) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
- "widening it\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
+ "widening it\n");
Value = WidenVector(Value, DAG);
}
@@ -6876,17 +7116,16 @@ FailedModImm:
EVT EltTy = VT.getVectorElementType();
assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
"Unsupported floating-point vector type");
- DEBUG(dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
- "BITCASTS, and try again\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
+ "BITCASTS, and try again\n");
MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
- DEBUG(
- dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
- Val.dump();
- );
+ LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
+ Val.dump(););
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -6898,24 +7137,32 @@ FailedModImm:
// is better than the default, which will perform a separate initialization
// for each lane.
if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
- SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+ // Firstly, try to materialize the splat constant.
+ SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
+ Val = ConstantBuildVector(Vec, DAG);
+ if (!Val) {
+ // Otherwise, materialize the constant and splat it.
+ Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+ DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
+ }
+
// Now insert the non-constant lanes.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
- if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
+ if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
- }
}
return Val;
}
// This will generate a load from the constant pool.
if (isConstant) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
- "expansion\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
+ "expansion\n");
return SDValue();
}
@@ -6932,8 +7179,9 @@ FailedModImm:
// shuffle is valid for the target) and materialization element by element
// on the stack followed by a load for everything else.
if (!isConstant && !usesOnlyOneValue) {
- DEBUG(dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
- "of INSERT_VECTOR_ELT\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
+ "of INSERT_VECTOR_ELT\n");
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
@@ -6950,14 +7198,12 @@ FailedModImm:
// extended (i32) and it is safe to cast them to the vector type by ignoring
// the upper bits of the lowest lane (e.g. v8i8, v4i16).
if (!Op0.isUndef()) {
- DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
+ LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
++i;
}
- DEBUG(
- if (i < NumElts)
- dbgs() << "Creating nodes for the other vector elements:\n";
- );
+ LLVM_DEBUG(if (i < NumElts) dbgs()
+ << "Creating nodes for the other vector elements:\n";);
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
@@ -6968,8 +7214,9 @@ FailedModImm:
return Vec;
}
- DEBUG(dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
- "better alternative\n");
+ LLVM_DEBUG(
+ dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
+ "better alternative\n");
return SDValue();
}
@@ -7310,8 +7557,21 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
- if (LHS.getValueType().getVectorElementType() == MVT::f16)
- return SDValue();
+ const bool FullFP16 =
+ static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+
+ // Make v4f16 (only) fcmp operations utilise vector instructions;
+ // v8f16 support will be a little more complicated.
+ if (LHS.getValueType().getVectorElementType() == MVT::f16) {
+ if (!FullFP16 && LHS.getValueType().getVectorNumElements() == 4) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
+ SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
+ DAG.ReplaceAllUsesWith(Op, NewSetcc);
+ CmpVT = MVT::v4i32;
+ } else
+ return SDValue();
+ }
assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
LHS.getValueType().getVectorElementType() == MVT::f64);
@@ -7386,6 +7646,111 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
}
}
+SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ if (!Subtarget.hasLSE())
+ return SDValue();
+
+ // LSE has an atomic load-add instruction, but not a load-sub.
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue RHS = Op.getOperand(2);
+ AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
+ RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
+ Op.getOperand(0), Op.getOperand(1), RHS,
+ AN->getMemOperand());
+}
+
+SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ if (!Subtarget.hasLSE())
+ return SDValue();
+
+ // LSE has an atomic load-clear instruction, but not a load-and.
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue RHS = Op.getOperand(2);
+ AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
+ RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
+ Op.getOperand(0), Op.getOperand(1), RHS,
+ AN->getMemOperand());
+}
+
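A minimal standalone sketch (not part of the patch) of the two's-complement identities the two lowerings above rely on, with plain C++ integers standing in for the DAG nodes; LDCLR computes old & ~operand, so passing ~y yields old & y:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t Xs[] = {0, 1, 0xdeadbeef, ~0ull};
    uint64_t Ys[] = {0, 7, 0x12345678, ~0ull};
    for (uint64_t x : Xs)
      for (uint64_t y : Ys) {
        assert(x - y == x + (0 - y));   // load-sub lowered as load-add of -y
        assert((x & y) == (x & ~(~y))); // load-and lowered as load-clear of ~y
      }
    return 0;
  }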
+SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
+ SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
+
+ const uint32_t *Mask =
+ Subtarget->getRegisterInfo()->getWindowsStackProbePreservedMask();
+
+ Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
+ DAG.getConstant(4, dl, MVT::i64));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
+ Chain =
+ DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
+ Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
+ DAG.getRegisterMask(Mask), Chain.getValue(1));
+ // To match the actual intent better, we should read the output from X15 here
+ // again (instead of potentially spilling it to the stack), but rereading Size
+ // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
+ // here.
+
+ Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
+ DAG.getConstant(4, dl, MVT::i64));
+ return Chain;
+}
+
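A small sketch of the X15 scaling around the __chkstk call above: the probe helper takes the allocation size in 16-byte units, hence the SRL by 4 before the call and the SHL by 4 afterwards. This assumes the requested size is already a 16-byte multiple, as AArch64 keeps SP 16-byte aligned:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t SizeInBytes = 4096 + 32;    // assumed already 16-byte aligned
    uint64_t Units = SizeInBytes >> 4;   // value passed in X15
    assert((Units << 4) == SizeInBytes); // size recovered after the call
    return 0;
  }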
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() &&
+ "Only Windows alloca probing supported");
+ SDLoc dl(Op);
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ EVT VT = Node->getValueType(0);
+
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+ "no-stack-arg-probe")) {
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+ Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
+
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+}
+
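The over-aligned case above rounds SP down by masking with -Align; a tiny sketch of that arithmetic (Align assumed to be a power of two):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t SP = 0x7fff0038, Align = 32;
    uint64_t Aligned = SP & -(uint64_t)Align; // clear the low log2(Align) bits
    assert(Aligned % Align == 0 && Aligned <= SP && SP - Aligned < Align);
    return 0;
  }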
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -7491,6 +7856,33 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
+bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ // If we're reducing the load width in order to avoid having to use an extra
+ // instruction to do the extension, then it's probably a good idea.
+ if (ExtTy != ISD::NON_EXTLOAD)
+ return true;
+ // Don't reduce load width if it would prevent us from combining a shift into
+ // the offset.
+ MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
+ assert(Mem);
+ const SDValue &Base = Mem->getBasePtr();
+ if (Base.getOpcode() == ISD::ADD &&
+ Base.getOperand(1).getOpcode() == ISD::SHL &&
+ Base.getOperand(1).hasOneUse() &&
+ Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
+ // The shift can be combined if it matches the size of the value being
+ // loaded (and so reducing the width would make it not match).
+ uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
+ uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
+ if (ShiftAmount == Log2_32(LoadBytes))
+ return false;
+ }
+ // We have no reason to disallow reducing the load width, so allow it.
+ return true;
+}
+
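A worked example of the check above: an i64 load from base + (index << 3) can use the scaled register-offset form (ldr x0, [x1, x2, lsl #3]) only while the shift equals log2 of the load size, so narrowing the load would lose that match. Sketch with a floor-log2 stand-in for llvm::Log2_32:

  #include <cassert>
  #include <cstdint>

  static unsigned log2u(uint64_t V) { // floor(log2(V)), V > 0
    unsigned R = 0;
    while (V >>= 1) ++R;
    return R;
  }

  int main() {
    uint64_t LoadBytes = 8;   // an i64 load
    uint64_t ShiftAmount = 3; // address is base + (index << 3)
    assert(ShiftAmount == log2u(LoadBytes)); // shift folds into the addressing mode
    return 0;
  }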
// Truncations from 64-bit GPR to 32-bit GPR is free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
@@ -7666,7 +8058,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
-/// \brief Lower an interleaved load into a ldN intrinsic.
+/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
@@ -7778,7 +8170,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
return true;
}
-/// \brief Lower an interleaved store into a stN intrinsic.
+/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
@@ -7836,8 +8228,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
- unsigned NumOpElts =
- dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
+ unsigned NumOpElts = Op0->getType()->getVectorNumElements();
// Convert to the corresponding integer vector.
Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
@@ -7952,15 +8343,16 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
- DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n");
+ LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
+ << ": avoid UB for INT64_MIN\n");
return false;
}
// Same encoding for add/sub, just flip the sign.
Immed = std::abs(Immed);
bool IsLegal = ((Immed >> 12) == 0 ||
((Immed & 0xfff) == 0 && Immed >> 24 == 0));
- DEBUG(dbgs() << "Is " << Immed << " legal add imm: " <<
- (IsLegal ? "yes" : "no") << "\n");
+ LLVM_DEBUG(dbgs() << "Is " << Immed
+ << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
return IsLegal;
}
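A short sketch of the add/sub immediate rule above (12 bits, optionally shifted left by 12), checked on a few values:

  #include <cassert>
  #include <cstdint>
  #include <cstdlib>

  static bool isLegalAddImm(int64_t Immed) {
    if (Immed == INT64_MIN) return false; // std::abs would overflow
    Immed = std::abs(Immed);
    return (Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && (Immed >> 24) == 0);
  }

  int main() {
    assert(isLegalAddImm(0xfff));     // fits in 12 bits
    assert(isLegalAddImm(0x123000));  // 12 bits shifted left by 12
    assert(!isLegalAddImm(0x123456)); // needs bits in both halves
    return 0;
  }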
@@ -8021,6 +8413,11 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
+bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
+ // Consider splitting large offset of struct or array.
+ return true;
+}
+
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
@@ -8105,6 +8502,14 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return Shift < 3;
}
+bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
@@ -8747,10 +9152,12 @@ static SDValue performBitcastCombine(SDNode *N,
// If the source type has twice the number of elements as our destination
// type, we know this is an extract of the high or low half of the vector.
EVT SVT = Source->getValueType(0);
- if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+ if (!SVT.isVector() ||
+ SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
return SDValue();
- DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
+ LLVM_DEBUG(
+ dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
// Create the simplified form to just extract the low or high half of the
// vector directly rather than bothering with the bitcasts.
@@ -8838,7 +9245,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
if (!RHSTy.isVector())
return SDValue();
- DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
+ LLVM_DEBUG(
+ dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
RHSTy.getVectorNumElements() * 2);
@@ -8851,7 +9259,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
static SDValue tryCombineFixedPointConvert(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- // Wait 'til after everything is legalized to try this. That way we have
+ // Wait until after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -8953,26 +9361,26 @@ static bool isEssentiallyExtractSubvector(SDValue N) {
N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
}
-/// \brief Helper structure to keep track of ISD::SET_CC operands.
+/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
const SDValue *Opnd0;
const SDValue *Opnd1;
ISD::CondCode CC;
};
-/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
+/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
const SDValue *Cmp;
AArch64CC::CondCode CC;
};
-/// \brief Helper structure to keep track of SetCC information.
+/// Helper structure to keep track of SetCC information.
union SetCCInfo {
GenericSetCCInfo Generic;
AArch64SetCCInfo AArch64;
};
-/// \brief Helper structure to be able to read SetCC information. If set to
+/// Helper structure to be able to read SetCC information. If set to
/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
@@ -8980,7 +9388,7 @@ struct SetCCInfoAndKind {
bool IsAArch64;
};
-/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
+/// Check whether or not \p Op is a SET_CC operation, either a generic or
/// an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
@@ -10422,12 +10830,65 @@ static SDValue performNVCASTCombine(SDNode *N) {
return SDValue();
}
+// If all users of the globaladdr are of the form (globaladdr + constant), find
+// the smallest constant, fold it into the globaladdr's offset and rewrite the
+// globaladdr as (globaladdr + constant) - constant.
+static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget,
+ const TargetMachine &TM) {
+ auto *GN = dyn_cast<GlobalAddressSDNode>(N);
+ if (!GN || Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
+ AArch64II::MO_NO_FLAG)
+ return SDValue();
+
+ uint64_t MinOffset = -1ull;
+ for (SDNode *N : GN->uses()) {
+ if (N->getOpcode() != ISD::ADD)
+ return SDValue();
+ auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
+ if (!C)
+ C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ MinOffset = std::min(MinOffset, C->getZExtValue());
+ }
+ uint64_t Offset = MinOffset + GN->getOffset();
+
+ // Require that the new offset is larger than the existing one. Otherwise, we
+ // can end up oscillating between two possible DAGs, for example,
+ // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
+ if (Offset <= uint64_t(GN->getOffset()))
+ return SDValue();
+
+ // Check whether folding this offset is legal. It must not go out of bounds of
+ // the referenced object to avoid violating the code model, and must be
+ // smaller than 2^21 because this is the largest offset expressible in all
+ // object formats.
+ //
+ // This check also prevents us from folding negative offsets, which will end
+ // up being treated in the same way as large positive ones. They could also
+ // cause code model violations, and aren't really common enough to matter.
+ if (Offset >= (1 << 21))
+ return SDValue();
+
+ const GlobalValue *GV = GN->getGlobal();
+ Type *T = GV->getValueType();
+ if (!T->isSized() ||
+ Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
+ return SDValue();
+
+ SDLoc DL(GN);
+ SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
+ return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
+ DAG.getConstant(MinOffset, DL, MVT::i64));
+}
+
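A small arithmetic sketch of the rewrite above: with hypothetical uses G + 24 and G + 40, MinOffset is 24, the node becomes (G + 24) - 24, and every user still computes the same address once its own ADD is reapplied:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t G = 0x210000; // hypothetical global address
    uint64_t UseOffsets[] = {24, 40};
    uint64_t MinOffset = std::min(UseOffsets[0], UseOffsets[1]);
    uint64_t Rewritten = (G + MinOffset) - MinOffset;
    for (uint64_t Off : UseOffsets)
      assert(Rewritten + Off == G + Off); // value preserved for each use
    return 0;
  }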
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default:
- DEBUG(dbgs() << "Custom combining: skipping\n");
+ LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
case ISD::ADD:
case ISD::SUB:
@@ -10509,6 +10970,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
break;
}
+ case ISD::GlobalAddress:
+ return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
}
return SDValue();
}
@@ -10669,11 +11132,79 @@ static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
return std::make_pair(Lo, Hi);
}
+// Create an even/odd pair of X registers holding integer value V.
+static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
+ SDLoc dl(V.getNode());
+ SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
+ SDValue VHi = DAG.getAnyExtOrTrunc(
+ DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
+ dl, MVT::i64);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(VLo, VHi);
+ SDValue RegClass =
+ DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
+ SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
+ SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+ return SDValue(
+ DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
+}
+
static void ReplaceCMP_SWAP_128Results(SDNode *N,
- SmallVectorImpl<SDValue> & Results,
- SelectionDAG &DAG) {
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
+
+ if (Subtarget->hasLSE()) {
+ // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
+ // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
+ SDValue Ops[] = {
+ createGPRPairNode(DAG, N->getOperand(2)), // Compare value
+ createGPRPairNode(DAG, N->getOperand(3)), // Store value
+ N->getOperand(1), // Ptr
+ N->getOperand(0), // Chain in
+ };
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+
+ unsigned Opcode;
+ switch (MemOp[0]->getOrdering()) {
+ case AtomicOrdering::Monotonic:
+ Opcode = AArch64::CASPX;
+ break;
+ case AtomicOrdering::Acquire:
+ Opcode = AArch64::CASPAX;
+ break;
+ case AtomicOrdering::Release:
+ Opcode = AArch64::CASPLX;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ Opcode = AArch64::CASPALX;
+ break;
+ default:
+ llvm_unreachable("Unexpected ordering!");
+ }
+
+ MachineSDNode *CmpSwap = DAG.getMachineNode(
+ Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
+ CmpSwap->setMemRefs(MemOp, MemOp + 1);
+
+ unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(SubReg1, SubReg2);
+ Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(SDValue(CmpSwap, 1)); // Chain out
+ return;
+ }
+
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
@@ -10732,7 +11263,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
// Let normal code take care of it by not adding anything to Results.
return;
case ISD::ATOMIC_CMP_SWAP:
- ReplaceCMP_SWAP_128Results(N, Results, DAG);
+ ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
}
}
@@ -10996,6 +11527,10 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
return OptSize && !VT.isVector();
}
+bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+ return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
+}
+
unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
@@ -11003,3 +11538,8 @@ AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}
+
+void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
+ MF.getFrameInfo().computeMaxCallFrameSize(MF);
+ TargetLoweringBase::finalizeLowering(MF);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 8d78b5b6b5b4..592845640a44 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -309,6 +309,9 @@ public:
MachineFunction &MF,
unsigned Intrinsic) const override;
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override;
+
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
@@ -332,6 +335,8 @@ public:
bool isLegalAddImmediate(int64_t) const override;
bool isLegalICmpImmediate(int64_t) const override;
+ bool shouldConsiderGEPOffsetSplit() const override;
+
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const override;
@@ -342,7 +347,7 @@ public:
unsigned AS,
Instruction *I = nullptr) const override;
- /// \brief Return the cost of the scaling factor used in the addressing
+ /// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
@@ -357,14 +362,19 @@ public:
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
- /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+ /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N) const override;
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+ /// with this index.
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
@@ -433,9 +443,35 @@ public:
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
- bool hasAndNotCompare(SDValue) const override {
- // 'bics'
- return true;
+ bool hasAndNotCompare(SDValue V) const override {
+ // We can use bics for any scalar.
+ return V.getValueType().isScalarInteger();
+ }
+
+ bool hasAndNot(SDValue Y) const override {
+ EVT VT = Y.getValueType();
+
+ if (!VT.isVector())
+ return hasAndNotCompare(Y);
+
+ return VT.getSizeInBits() >= 64; // vector 'bic'
+ }
+
+ bool shouldTransformSignedTruncationCheck(EVT XVT,
+ unsigned KeptBits) const override {
+ // For vectors, we don't have a preference.
+ if (XVT.isVector())
+ return false;
+
+ auto VTIsOk = [](EVT VT) -> bool {
+ return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64;
+ };
+
+ // We are ok with KeptBitsVT being byte/word/dword, which is what SXT supports.
+ // XVT will be larger than KeptBitsVT.
+ MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
+ return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
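For context on the hook above, a "signed truncation check" asks whether a value survives truncation to KeptBits followed by sign extension, which AArch64 can implement with SXTB/SXTH/SXTW plus a compare when KeptBits is 8, 16, or 32. A tiny illustration for KeptBits = 8 (hypothetical values):

  #include <cassert>
  #include <cstdint>

  int main() {
    int64_t a = 100, b = 300;
    assert(int64_t(int8_t(a)) == a); // fits in i8: check succeeds
    assert(int64_t(int8_t(b)) != b); // does not fit in i8: check fails
    return 0;
  }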
bool hasBitPreservingFPLogic(EVT VT) const override {
@@ -456,6 +492,9 @@ public:
return true;
}
+ /// Enable aggressive FMA fusion on targets that want it.
+ bool enableAggressiveFMAFusion(EVT VT) const override;
+
/// Returns the size of the platform's va_list object.
unsigned getVaListSizeInBits(const DataLayout &DL) const override;
@@ -476,12 +515,12 @@ public:
CallingConv::ID CallConv,
bool isVarArg) const override;
private:
- bool isExtFreeImpl(const Instruction *Ext) const override;
-
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
+ bool isExtFreeImpl(const Instruction *Ext) const override;
+
void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
@@ -502,6 +541,8 @@ private:
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
@@ -545,12 +586,14 @@ private:
SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
+ SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
SelectionDAG &DAG) const;
+ SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -569,6 +612,7 @@ private:
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -592,6 +636,12 @@ private:
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
+ SDValue &Size,
+ SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
@@ -647,6 +697,8 @@ private:
SelectionDAG &DAG) const override;
bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
+
+ void finalizeLowering(MachineFunction &MF) const override;
};
namespace AArch64 {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 153bcf75cbcd..35cd7735ceb7 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -409,13 +409,18 @@ let Predicates = [HasLSE] in {
defm : LDOPregister_patterns<"LDADD", "atomic_load_add">;
defm : LDOPregister_patterns<"LDSET", "atomic_load_or">;
defm : LDOPregister_patterns<"LDEOR", "atomic_load_xor">;
+ defm : LDOPregister_patterns<"LDCLR", "atomic_load_clr">;
defm : LDOPregister_patterns<"LDSMAX", "atomic_load_max">;
defm : LDOPregister_patterns<"LDSMIN", "atomic_load_min">;
defm : LDOPregister_patterns<"LDUMAX", "atomic_load_umax">;
defm : LDOPregister_patterns<"LDUMIN", "atomic_load_umin">;
defm : LDOPregister_patterns<"SWP", "atomic_swap">;
+ defm : CASregister_patterns<"CAS", "atomic_cmp_swap">;
+
+ // These two patterns are only needed for global isel; selection dag isel
+ // converts atomic load-sub into a sub and an atomic load-add, and likewise
+ // for and -> clr.
defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;
defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;
- defm : CASregister_patterns<"CAS", "atomic_cmp_swap">;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 80c5092a4eed..1060c64f7b5d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -167,7 +167,7 @@ def ExtendOperandLSL64 : AsmOperandClass {
// 8-bit floating-point immediate encodings.
def FPImmOperand : AsmOperandClass {
let Name = "FPImm";
- let ParserMethod = "tryParseFPImm";
+ let ParserMethod = "tryParseFPImm<true>";
let DiagnosticType = "InvalidFPImm";
}
@@ -179,20 +179,40 @@ def CondCode : AsmOperandClass {
// A 32-bit register parsed as 64-bit
def GPR32as64Operand : AsmOperandClass {
let Name = "GPR32as64";
+ let ParserMethod =
+ "tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSubReg>";
}
def GPR32as64 : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32as64Operand;
}
+// A 64-bit register parsed as 32-bit
+def GPR64as32Operand : AsmOperandClass {
+ let Name = "GPR64as32";
+ let ParserMethod =
+ "tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSuperReg>";
+}
+def GPR64as32 : RegisterOperand<GPR64, "printGPR64as32"> {
+ let ParserMatchClass = GPR64as32Operand;
+}
+
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
-// Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
-def SImm10s8Operand : AsmOperandClass {
- let Name = "SImm10s8";
- let DiagnosticType = "InvalidMemoryIndexedSImm10";
+class UImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
+ let Name = "UImm" # Width # "s" # Scale;
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale # "UImm" # Width;
+ let RenderMethod = "addImmScaledOperands<" # Scale # ">";
+ let PredicateMethod = "isUImmScaled<" # Width # ", " # Scale # ">";
+}
+
+class SImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
+ let Name = "SImm" # Width # "s" # Scale;
+ let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm" # Width;
+ let RenderMethod = "addImmScaledOperands<" # Scale # ">";
+ let PredicateMethod = "isSImmScaled<" # Width # ", " # Scale # ">";
}
//===----------------------------------------------------------------------===//
@@ -221,31 +241,66 @@ def adrlabel : Operand<i64> {
let ParserMatchClass = AdrOperand;
}
+class SImmOperand<int width> : AsmOperandClass {
+ let Name = "SImm" # width;
+ let DiagnosticType = "InvalidMemoryIndexedSImm" # width;
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isSImm<" # width # ">";
+}
+
+// Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
+def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>;
def simm10Scaled : Operand<i64> {
let ParserMatchClass = SImm10s8Operand;
let DecoderMethod = "DecodeSImm<10>";
let PrintMethod = "printImmScale<8>";
}
-// simm9 predicate - True if the immediate is in the range [-256, 255].
-def SImm9Operand : AsmOperandClass {
- let Name = "SImm9";
- let DiagnosticType = "InvalidMemoryIndexedSImm9";
+// uimm6 predicate - True if the immediate is in the range [0, 63].
+def UImm6Operand : AsmOperandClass {
+ let Name = "UImm6";
+ let DiagnosticType = "InvalidImm0_63";
+}
+
+def uimm6 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
+ let ParserMatchClass = UImm6Operand;
}
+
+def SImm9Operand : SImmOperand<9>;
def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
let ParserMatchClass = SImm9Operand;
+ let DecoderMethod = "DecodeSImm<9>";
+}
+
+def SImm8Operand : SImmOperand<8>;
+def simm8 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -128 && Imm < 127; }]> {
+ let ParserMatchClass = SImm8Operand;
+ let DecoderMethod = "DecodeSImm<8>";
+}
+
+def SImm6Operand : SImmOperand<6>;
+def simm6_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -32 && Imm < 32; }]> {
+ let ParserMatchClass = SImm6Operand;
+ let DecoderMethod = "DecodeSImm<6>";
+}
+
+def SImm5Operand : SImmOperand<5>;
+def simm5_64b : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -16 && Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
+}
+
+def simm5_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -16 && Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
}
// simm7sN predicate - True if the immediate is a multiple of N in the range
// [-64 * N, 63 * N].
-class SImm7Scaled<int Scale> : AsmOperandClass {
- let Name = "SImm7s" # Scale;
- let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7";
-}
-def SImm7s4Operand : SImm7Scaled<4>;
-def SImm7s8Operand : SImm7Scaled<8>;
-def SImm7s16Operand : SImm7Scaled<16>;
+def SImm7s4Operand : SImmScaledMemoryIndexed<7, 4>;
+def SImm7s8Operand : SImmScaledMemoryIndexed<7, 8>;
+def SImm7s16Operand : SImmScaledMemoryIndexed<7, 16>;
def simm7s4 : Operand<i32> {
let ParserMatchClass = SImm7s4Operand;
@@ -268,9 +323,107 @@ def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+// uimm5sN predicate - True if the immediate is a multiple of N in the range
+// [0 * N, 31 * N].
+def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
+def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
+def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
+
+def uimm5s2 : Operand<i64>, ImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> {
+ let ParserMatchClass = UImm5s2Operand;
+ let PrintMethod = "printImmScale<2>";
+}
+def uimm5s4 : Operand<i64>, ImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> {
+ let ParserMatchClass = UImm5s4Operand;
+ let PrintMethod = "printImmScale<4>";
+}
+def uimm5s8 : Operand<i64>, ImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> {
+ let ParserMatchClass = UImm5s8Operand;
+ let PrintMethod = "printImmScale<8>";
+}
+
+// uimm6sN predicate - True if the immediate is a multiple of N in the range
+// [0 * N, 63 * N].
+def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>;
+def UImm6s2Operand : UImmScaledMemoryIndexed<6, 2>;
+def UImm6s4Operand : UImmScaledMemoryIndexed<6, 4>;
+def UImm6s8Operand : UImmScaledMemoryIndexed<6, 8>;
+
+def uimm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
+ let ParserMatchClass = UImm6s1Operand;
+}
+def uimm6s2 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >= 0 && Imm < (64*2) && ((Imm % 2) == 0); }]> {
+ let PrintMethod = "printImmScale<2>";
+ let ParserMatchClass = UImm6s2Operand;
+}
+def uimm6s4 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >= 0 && Imm < (64*4) && ((Imm % 4) == 0); }]> {
+ let PrintMethod = "printImmScale<4>";
+ let ParserMatchClass = UImm6s4Operand;
+}
+def uimm6s8 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >= 0 && Imm < (64*8) && ((Imm % 8) == 0); }]> {
+ let PrintMethod = "printImmScale<8>";
+ let ParserMatchClass = UImm6s8Operand;
+}
+
+// simm6sN predicate - True if the immediate is a multiple of N in the range
+// [-32 * N, 31 * N].
+def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
+def simm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -32 && Imm < 32; }]> {
+ let ParserMatchClass = SImm6s1Operand;
+ let DecoderMethod = "DecodeSImm<6>";
+}
+
+// simm4sN predicate - True if the immediate is a multiple of N in the range
+// [-8 * N, 7 * N].
+def SImm4s1Operand : SImmScaledMemoryIndexed<4, 1>;
+def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>;
+def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>;
+def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>;
+def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>;
+
+def simm4s1 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-8 && Imm <= 7; }]> {
+ let ParserMatchClass = SImm4s1Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
+def simm4s2 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> {
+ let PrintMethod = "printImmScale<2>";
+ let ParserMatchClass = SImm4s2Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
+def simm4s3 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> {
+ let PrintMethod = "printImmScale<3>";
+ let ParserMatchClass = SImm4s3Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
+def simm4s4 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> {
+ let PrintMethod = "printImmScale<4>";
+ let ParserMatchClass = SImm4s4Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+def simm4s16 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> {
+ let PrintMethod = "printImmScale<16>";
+ let ParserMatchClass = SImm4s16Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
+
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
+ let RenderMethod = "addImmOperands";
let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
}
@@ -489,27 +642,35 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{
let DiagnosticType = "LogicalSecondSource" in {
def LogicalImm32Operand : AsmOperandClass {
let Name = "LogicalImm32";
+ let PredicateMethod = "isLogicalImm<int32_t>";
+ let RenderMethod = "addLogicalImmOperands<int32_t>";
}
def LogicalImm64Operand : AsmOperandClass {
let Name = "LogicalImm64";
+ let PredicateMethod = "isLogicalImm<int64_t>";
+ let RenderMethod = "addLogicalImmOperands<int64_t>";
}
def LogicalImm32NotOperand : AsmOperandClass {
let Name = "LogicalImm32Not";
+ let PredicateMethod = "isLogicalImm<int32_t>";
+ let RenderMethod = "addLogicalImmNotOperands<int32_t>";
}
def LogicalImm64NotOperand : AsmOperandClass {
let Name = "LogicalImm64Not";
+ let PredicateMethod = "isLogicalImm<int64_t>";
+ let RenderMethod = "addLogicalImmNotOperands<int64_t>";
}
}
def logical_imm32 : Operand<i32>, IntImmLeaf<i32, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 32);
}], logical_imm32_XFORM> {
- let PrintMethod = "printLogicalImm32";
+ let PrintMethod = "printLogicalImm<int32_t>";
let ParserMatchClass = LogicalImm32Operand;
}
def logical_imm64 : Operand<i64>, IntImmLeaf<i64, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 64);
}], logical_imm64_XFORM> {
- let PrintMethod = "printLogicalImm64";
+ let PrintMethod = "printLogicalImm<int64_t>";
let ParserMatchClass = LogicalImm64Operand;
}
def logical_imm32_not : Operand<i32> {
@@ -672,11 +833,13 @@ def move_vec_shift : Operand<i32> {
let DiagnosticType = "AddSubSecondSource" in {
def AddSubImmOperand : AsmOperandClass {
let Name = "AddSubImm";
- let ParserMethod = "tryParseAddSubImm";
+ let ParserMethod = "tryParseImmWithOptionalShift";
+ let RenderMethod = "addImmWithOptionalShiftOperands<12>";
}
def AddSubImmNegOperand : AsmOperandClass {
let Name = "AddSubImmNeg";
- let ParserMethod = "tryParseAddSubImm";
+ let ParserMethod = "tryParseImmWithOptionalShift";
+ let RenderMethod = "addImmNegWithOptionalShiftOperands<12>";
}
}
// An ADD/SUB immediate shifter operand:
@@ -797,52 +960,48 @@ def fpimm0 : FPImmLeaf<fAny, [{
}]>;
// Vector lane operands
-class AsmVectorIndex<string Suffix> : AsmOperandClass {
- let Name = "VectorIndex" # Suffix;
- let DiagnosticType = "InvalidIndex" # Suffix;
-}
-def VectorIndex1Operand : AsmVectorIndex<"1">;
-def VectorIndexBOperand : AsmVectorIndex<"B">;
-def VectorIndexHOperand : AsmVectorIndex<"H">;
-def VectorIndexSOperand : AsmVectorIndex<"S">;
-def VectorIndexDOperand : AsmVectorIndex<"D">;
-
-def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) == 1;
-}]> {
- let ParserMatchClass = VectorIndex1Operand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 16;
-}]> {
- let ParserMatchClass = VectorIndexBOperand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 8;
-}]> {
- let ParserMatchClass = VectorIndexHOperand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
+class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
+ let Name = NamePrefix # "IndexRange" # Min # "_" # Max;
+ let DiagnosticType = "Invalid" # Name;
+ let PredicateMethod = "isVectorIndex<" # Min # ", " # Max # ">";
+ let RenderMethod = "addVectorIndexOperands";
}
-def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 4;
-}]> {
- let ParserMatchClass = VectorIndexSOperand;
- let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
- return ((uint64_t)Imm) < 2;
-}]> {
- let ParserMatchClass = VectorIndexDOperand;
+
+class AsmVectorIndexOpnd<AsmOperandClass mc, code pred>
+ : Operand<i64>, ImmLeaf<i64, pred> {
+ let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i64imm);
}
+def VectorIndex1Operand : AsmVectorIndex<1, 1>;
+def VectorIndexBOperand : AsmVectorIndex<0, 15>;
+def VectorIndexHOperand : AsmVectorIndex<0, 7>;
+def VectorIndexSOperand : AsmVectorIndex<0, 3>;
+def VectorIndexDOperand : AsmVectorIndex<0, 1>;
+
+def VectorIndex1 : AsmVectorIndexOpnd<VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB : AsmVectorIndexOpnd<VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH : AsmVectorIndexOpnd<VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS : AsmVectorIndexOpnd<VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD : AsmVectorIndexOpnd<VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+
+def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
+def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
+def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">;
+def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
+def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
+
+def sve_elm_idx_extdup_b
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
+def sve_elm_idx_extdup_h
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
+def sve_elm_idx_extdup_s
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def sve_elm_idx_extdup_d
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def sve_elm_idx_extdup_q
+ : AsmVectorIndexOpnd<SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
@@ -1224,6 +1383,7 @@ def am_brcond : Operand<OtherVT> {
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
+ let OperandType = "OPERAND_PCREL";
}
class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
@@ -1279,18 +1439,20 @@ def am_tbrcond : Operand<OtherVT> {
let EncoderMethod = "getTestBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget14Operand;
+ let OperandType = "OPERAND_PCREL";
}
// AsmOperand classes to emit (or not) special diagnostics
def TBZImm0_31Operand : AsmOperandClass {
let Name = "TBZImm0_31";
let PredicateMethod = "isImmInRange<0,31>";
- let RenderMethod = "addImm0_31Operands";
+ let RenderMethod = "addImmOperands";
}
def TBZImm32_63Operand : AsmOperandClass {
let Name = "Imm32_63";
let PredicateMethod = "isImmInRange<32,63>";
let DiagnosticType = "InvalidImm0_63";
+ let RenderMethod = "addImmOperands";
}
class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
@@ -1355,11 +1517,13 @@ def am_b_target : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
+ let OperandType = "OPERAND_PCREL";
}
def am_bl_target : Operand<i64> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
+ let OperandType = "OPERAND_PCREL";
}
class BImm<bit op, dag iops, string asm, list<dag> pattern>
@@ -1458,6 +1622,30 @@ class SignAuthTwoOperand<bits<4> opc, string asm,
let Inst{4-0} = Rd;
}
+// Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions
+class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops>
+ : I<(outs), iops, asm, ops, "", []>,
+ Sched<[WriteI, ReadI, ReadI]> {
+ let Uses = [NZCV];
+ bits<5> Rn;
+ let Inst{31} = sf;
+ let Inst{30-15} = 0b0111010000000000;
+ let Inst{14} = sz;
+ let Inst{13-10} = 0b0010;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = 0b01101;
+}
+
+class FlagRotate<dag iops, string asm, string ops>
+ : BaseFlagManipulation<0b1, 0b0, iops, asm, ops> {
+ bits<6> imm;
+ bits<4> mask;
+ let Inst{20-15} = imm;
+ let Inst{13-10} = 0b0001;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = mask;
+}
+
//---
// Basic two-operand data processing instructions.
//---
@@ -2579,7 +2767,7 @@ class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let DecoderMethod = "DecodeUnsignedLdStInstruction";
}
-multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
@@ -2591,7 +2779,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
-multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs),
@@ -2647,10 +2835,11 @@ def am_ldrlit : Operand<iPTR> {
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
+ let OperandType = "OPERAND_PCREL";
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+class LoadLiteral<bits<2> opc, bit V, RegisterOperand regtype, string asm>
: I<(outs regtype:$Rt), (ins am_ldrlit:$label),
asm, "\t$Rt, $label", "", []>,
Sched<[WriteLD]> {
@@ -2761,7 +2950,7 @@ def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
ro_Xextend128>;
-class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -2783,11 +2972,11 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
+class ROInstAlias<string asm, RegisterOperand regtype, Instruction INST>
: InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
(INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
-multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm,
@@ -2814,7 +3003,7 @@ multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
@@ -2839,7 +3028,7 @@ multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -2861,7 +3050,7 @@ class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -2886,7 +3075,7 @@ multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
@@ -2911,7 +3100,7 @@ multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -2933,7 +3122,7 @@ class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -2958,7 +3147,7 @@ multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
@@ -2983,7 +3172,7 @@ multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3005,7 +3194,7 @@ class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3030,7 +3219,7 @@ multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
@@ -3055,7 +3244,7 @@ multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
@@ -3077,7 +3266,7 @@ class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
let Inst{4-0} = Rt;
}
-multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
@@ -3102,7 +3291,7 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
@@ -3216,7 +3405,33 @@ class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let DecoderMethod = "DecodeSignedLdStInstruction";
}
-multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+// Armv8.4 LDAPR & STLR with immediate offset instructions
+multiclass BaseLoadUnscaleV84<string asm, bits<2> sz, bits<2> opc,
+ RegisterOperand regtype > {
+ def i : BaseLoadStoreUnscale<sz, 0, opc, (outs regtype:$Rt),
+ (ins GPR64sp:$Rn, simm9:$offset), asm, []>,
+ Sched<[WriteST]> {
+ let Inst{29} = 0;
+ let Inst{24} = 1;
+ }
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass BaseStoreUnscaleV84<string asm, bits<2> sz, bits<2> opc,
+ RegisterOperand regtype > {
+ def i : BaseLoadStoreUnscale<sz, 0, opc, (outs),
+ (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+ asm, []>,
+ Sched<[WriteST]> {
+ let Inst{29} = 0;
+ let Inst{24} = 1;
+ }
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before LoadUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
@@ -3227,7 +3442,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
-multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before StoreUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
@@ -3324,7 +3539,7 @@ class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
@@ -3333,7 +3548,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback),
@@ -3370,16 +3585,16 @@ class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback", []>,
- Sched<[WriteLD, WriteI]>;
+ Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback),
@@ -3387,7 +3602,7 @@ class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
- Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+ Sched<[WriteAdr, WriteST]>;
} // hasSideEffects = 0
@@ -3417,7 +3632,7 @@ class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
let DecoderMethod = "DecodePairLdStInstruction";
}
-multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+multiclass LoadPairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
def i : BaseLoadStorePairOffset<opc, V, 1,
@@ -3431,7 +3646,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
}
-multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+multiclass StorePairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
@@ -3468,7 +3683,7 @@ class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
@@ -3476,7 +3691,7 @@ class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+class StorePairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
@@ -3509,7 +3724,7 @@ class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
let hasSideEffects = 0 in {
let mayStore = 0, mayLoad = 1 in
-class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
@@ -3517,7 +3732,7 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
-class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+class StorePairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
@@ -4559,11 +4774,24 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
}
class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
- string kind2> :
- BaseSIMDThreeSameVector<Q, U, 0b100, 0b10010, V128, asm, kind1, [] > {
+ string kind2, RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType,
+ SDPatternOperator OpNode> :
+ BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+ [(set (AccumType RegType:$dst),
+ (OpNode (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
}
+multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+ v2i32, v8i8, OpNode>;
+ def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+ v4i32, v16i8, OpNode>;
+}
+
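// A minimal sketch of how the new dot-product multiclass would typically be
// instantiated, assuming a HasDotProd predicate and the usual NEON dot-product
// intrinsics (names assumed for illustration):
let Predicates = [HasDotProd] in {
  defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
  defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
}
// Each defm expands to a 64-bit (.2s/.8b) and a 128-bit (.4s/.16b) variant.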
// All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
@@ -5492,7 +5720,7 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b", []>;
- let Predicates = [HasCrypto] in {
+ let Predicates = [HasAES] in {
def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
V128, V64, V64,
asm, ".1q", ".1d", ".1d", []>;
@@ -5911,10 +6139,10 @@ multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
- (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
+ (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, []>;
def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
- (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
+ (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
asm, []>;
}
@@ -6993,14 +7221,31 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
// ARMv8.2 Index Dot product instructions
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
- string lhs_kind, string rhs_kind> :
- BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, V128, V128, V128, VectorIndexS,
- asm, "", dst_kind, lhs_kind, rhs_kind, []> {
+ string lhs_kind, string rhs_kind,
+ RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType,
+ SDPatternOperator OpNode> :
+ BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
+ VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
+ [(set (AccumType RegType:$dst),
+ (AccumType (OpNode (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType (bitconvert (AccumType
+ (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))))]> {
bits<2> idx;
let Inst{21} = idx{0}; // L
let Inst{11} = idx{1}; // H
}
+multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", V64,
+ v2i32, v8i8, OpNode>;
+ def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", V128,
+ v4i32, v16i8, OpNode>;
+}
+
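// Likewise, a minimal sketch of instantiating the by-element (indexed) form,
// assuming the same predicate and intrinsics; the index operand always selects
// a .4b slice of a V128 register:
let Predicates = [HasDotProd] in {
  defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
  defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
}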
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -7765,7 +8010,6 @@ multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
}
-
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm, []> {
let Inst{21-16} = imm{5-0};
@@ -8468,14 +8712,14 @@ class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
// The immediate form of AdvSIMD post-indexed addressing is encoded with
// register post-index addressing from the zero register.
-multiclass SIMDLdStAliases<string asm, string layout, string Count,
+multiclass SIMDLdStAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
// "ld1\t$Vt, [$Rn], #16"
// may get mapped to
// (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
@@ -8485,7 +8729,7 @@ multiclass SIMDLdStAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
@@ -8495,7 +8739,7 @@ multiclass SIMDLdStAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
- (!cast<Instruction>(NAME # Count # "v" # layout)
+ (!cast<Instruction>(BaseName # Count # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
@@ -8504,14 +8748,14 @@ multiclass SIMDLdStAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
- (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
}
-multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
- int Offset64, bits<4> opcode> {
+multiclass BaseSIMDLdN<string BaseName, string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
(outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
@@ -8573,18 +8817,18 @@ multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
}
// Only ld1/st1 has a v1d version.
-multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
- int Offset64, bits<4> opcode> {
+multiclass BaseSIMDStN<string BaseName, string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
@@ -8645,18 +8889,18 @@ multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
- defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
- defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
}
-multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+multiclass BaseSIMDLd1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
- : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
+ : BaseSIMDLdN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// LD1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
@@ -8671,12 +8915,12 @@ multiclass BaseSIMDLd1<string Count, string asm, string veclist,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
}
-multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+multiclass BaseSIMDSt1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
- : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
+ : BaseSIMDStN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// ST1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
@@ -8691,45 +8935,45 @@ multiclass BaseSIMDSt1<string Count, string asm, string veclist,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
}
- defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
}
multiclass SIMDLd1Multiple<string asm> {
- defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>;
- defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
- defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
- defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+ defm One : BaseSIMDLd1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDLd1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDLd1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDLd1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
}
multiclass SIMDSt1Multiple<string asm> {
- defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>;
- defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
- defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
- defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+ defm One : BaseSIMDSt1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDSt1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDSt1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDSt1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
}
multiclass SIMDLd2Multiple<string asm> {
- defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+ defm Two : BaseSIMDLdN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
}
multiclass SIMDSt2Multiple<string asm> {
- defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+ defm Two : BaseSIMDStN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
}
multiclass SIMDLd3Multiple<string asm> {
- defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+ defm Three : BaseSIMDLdN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
}
multiclass SIMDSt3Multiple<string asm> {
- defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+ defm Three : BaseSIMDStN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
}
multiclass SIMDLd4Multiple<string asm> {
- defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+ defm Four : BaseSIMDLdN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
}
multiclass SIMDSt4Multiple<string asm> {
- defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+ defm Four : BaseSIMDStN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
}
//---
@@ -8769,7 +9013,7 @@ class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
- Operand listtype>
+ DAGOperand listtype>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
(outs listtype:$Vt), (ins GPR64sp:$Rn),
[]> {
@@ -8781,7 +9025,7 @@ class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
}
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
- string asm, Operand listtype, Operand GPR64pi>
+ string asm, DAGOperand listtype, DAGOperand GPR64pi>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
"$Rn = $wback",
(outs GPR64sp:$wback, listtype:$Vt),
@@ -8794,14 +9038,14 @@ class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
let Inst{11-10} = size;
}
-multiclass SIMDLdrAliases<string asm, string layout, string Count,
+multiclass SIMDLdrAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1r { v0.8b }, [x1], #1"
// "ld1r.8b\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
@@ -8811,7 +9055,7 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
- (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
@@ -8821,7 +9065,7 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
- (!cast<Instruction>(NAME # "v" # layout)
+ (!cast<Instruction>(BaseName # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
@@ -8830,7 +9074,7 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
- (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ (!cast<Instruction>(BaseName # "v" # layout # "_POST")
GPR64sp:$Rn,
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
@@ -8839,55 +9083,55 @@ multiclass SIMDLdrAliases<string asm, string layout, string Count,
multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
int Offset1, int Offset2, int Offset4, int Offset8> {
def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count # "8b")>;
+ !cast<DAGOperand>("VecList" # Count # "8b")>;
def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count #"16b")>;
+ !cast<DAGOperand>("VecList" # Count #"16b")>;
def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count #"4h")>;
+ !cast<DAGOperand>("VecList" # Count #"4h")>;
def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count #"8h")>;
+ !cast<DAGOperand>("VecList" # Count #"8h")>;
def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count #"2s")>;
+ !cast<DAGOperand>("VecList" # Count #"2s")>;
def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count #"4s")>;
+ !cast<DAGOperand>("VecList" # Count #"4s")>;
def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count #"1d")>;
+ !cast<DAGOperand>("VecList" # Count #"1d")>;
def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count #"2d")>;
+ !cast<DAGOperand>("VecList" # Count #"2d")>;
def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count # "8b"),
- !cast<Operand>("GPR64pi" # Offset1)>;
+ !cast<DAGOperand>("VecList" # Count # "8b"),
+ !cast<DAGOperand>("GPR64pi" # Offset1)>;
def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
- !cast<Operand>("VecList" # Count # "16b"),
- !cast<Operand>("GPR64pi" # Offset1)>;
+ !cast<DAGOperand>("VecList" # Count # "16b"),
+ !cast<DAGOperand>("GPR64pi" # Offset1)>;
def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count # "4h"),
- !cast<Operand>("GPR64pi" # Offset2)>;
+ !cast<DAGOperand>("VecList" # Count # "4h"),
+ !cast<DAGOperand>("GPR64pi" # Offset2)>;
def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
- !cast<Operand>("VecList" # Count # "8h"),
- !cast<Operand>("GPR64pi" # Offset2)>;
+ !cast<DAGOperand>("VecList" # Count # "8h"),
+ !cast<DAGOperand>("GPR64pi" # Offset2)>;
def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count # "2s"),
- !cast<Operand>("GPR64pi" # Offset4)>;
+ !cast<DAGOperand>("VecList" # Count # "2s"),
+ !cast<DAGOperand>("GPR64pi" # Offset4)>;
def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
- !cast<Operand>("VecList" # Count # "4s"),
- !cast<Operand>("GPR64pi" # Offset4)>;
+ !cast<DAGOperand>("VecList" # Count # "4s"),
+ !cast<DAGOperand>("GPR64pi" # Offset4)>;
def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count # "1d"),
- !cast<Operand>("GPR64pi" # Offset8)>;
+ !cast<DAGOperand>("VecList" # Count # "1d"),
+ !cast<DAGOperand>("GPR64pi" # Offset8)>;
def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
- !cast<Operand>("VecList" # Count # "2d"),
- !cast<Operand>("GPR64pi" # Offset8)>;
+ !cast<DAGOperand>("VecList" # Count # "2d"),
+ !cast<DAGOperand>("GPR64pi" # Offset8)>;
- defm : SIMDLdrAliases<asm, "8b", Count, Offset1, 64>;
- defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
- defm : SIMDLdrAliases<asm, "4h", Count, Offset2, 64>;
- defm : SIMDLdrAliases<asm, "8h", Count, Offset2, 128>;
- defm : SIMDLdrAliases<asm, "2s", Count, Offset4, 64>;
- defm : SIMDLdrAliases<asm, "4s", Count, Offset4, 128>;
- defm : SIMDLdrAliases<asm, "1d", Count, Offset8, 64>;
- defm : SIMDLdrAliases<asm, "2d", Count, Offset8, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "8b", Count, Offset1, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "16b", Count, Offset1, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "4h", Count, Offset2, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "8h", Count, Offset2, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "2s", Count, Offset4, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "4s", Count, Offset4, 128>;
+ defm : SIMDLdrAliases<NAME, asm, "1d", Count, Offset8, 64>;
+ defm : SIMDLdrAliases<NAME, asm, "2d", Count, Offset8, 128>;
}
class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
@@ -9245,31 +9489,31 @@ multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
}
multiclass SIMDLdSt1SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
}
multiclass SIMDLdSt2SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
}
multiclass SIMDLdSt3SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
}
multiclass SIMDLdSt4SingleAliases<string asm> {
- defm : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
- defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
- defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
- defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+ defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
+ defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
+ defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+ defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
}
} // end of 'let Predicates = [HasNEON]'
@@ -9280,9 +9524,9 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
let Predicates = [HasNEON, HasRDM] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype, string asm,
+ RegisterOperand regtype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
+ : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
pattern> {
}
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
@@ -9291,7 +9535,7 @@ multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
- (v4i16 V64:$Rm)))))]>;
+ (v4i16 V64:$Rm)))))]>;
def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
@@ -9355,28 +9599,28 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
let Inst{21} = idx{0};
}
- // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
+ // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
- // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
+ // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
// got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
+ (i32 (vector_extract
(v4i32 (insert_subvector
- (undef),
- (v2i32 (int_aarch64_neon_sqrdmulh
+ (undef),
+ (v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
- (v2i32 (AArch64duplane32
+ (v2i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i32 0))),
(i64 0))))),
(EXTRACT_SUBREG
(v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
- (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
+ (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ FPR32Op:$Rd,
+ ssub)),
V64:$Rn,
- V128:$Rm,
+ V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
@@ -9397,26 +9641,26 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
- (v4i32 (int_aarch64_neon_sqrdmulh
+ (i32 (vector_extract
+ (v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
- (v4i32 (AArch64duplane32
+ (v4i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i64 0))))),
(EXTRACT_SUBREG
(v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
- (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
+ (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ FPR32Op:$Rd,
+ ssub)),
V128:$Rn,
- V128:$Rm,
+ V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo,
- VectorIndexH, asm, ".h", "", "", ".h",
+ VectorIndexH, asm, ".h", "", "", ".h",
[]> {
bits<3> idx;
let Inst{11} = idx{2};
@@ -9676,7 +9920,6 @@ multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
// Crypto extensions
//----------------------------------------------------------------------------
-let Predicates = [HasCrypto] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
list<dag> pat>
@@ -9766,7 +10009,103 @@ class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
: SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
-} // end of 'let Predicates = [HasCrypto]'
+
+// Armv8.2-A Crypto extensions
+class BaseCryptoV82<dag oops, dag iops, string asm, string asmops, string cst,
+ list<dag> pattern>
+ : I <oops, iops, asm, asmops, cst, pattern>, Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ let Inst{31-25} = 0b1100111;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+class CryptoRRTied<bits<1>op0, bits<2>op1, string asm, string asmops>
+ : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, asmops,
+ "$Vm = $Vd", []> {
+ let Inst{31-25} = 0b1100111;
+ let Inst{24-21} = 0b0110;
+ let Inst{20-15} = 0b000001;
+ let Inst{14} = op0;
+ let Inst{13-12} = 0b00;
+ let Inst{11-10} = op1;
+}
+class CryptoRRTied_2D<bits<1>op0, bits<2>op1, string asm>
+ : CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d}">;
+class CryptoRRTied_4S<bits<1>op0, bits<2>op1, string asm>
+ : CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s}">;
+
+class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm,
+ string asmops, string cst>
+ : BaseCryptoV82<oops, iops, asm , asmops, cst, []> {
+ bits<5> Vm;
+ let Inst{24-21} = 0b0011;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0b1;
+ let Inst{14} = op0;
+ let Inst{13-12} = 0b00;
+ let Inst{11-10} = op1;
+}
+class CryptoRRR_2D<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "">;
+class CryptoRRRTied_2D<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "$Vd = $Vdst">;
+class CryptoRRR_4S<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "">;
+class CryptoRRRTied_4S<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
+ "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "$Vd = $Vdst">;
+class CryptoRRRTied<bits<1> op0, bits<2>op1, string asm>
+ : CryptoRRR<op0, op1, (outs FPR128:$Vdst), (ins FPR128:$Vd, FPR128:$Vn, V128:$Vm),
+ asm, "{\t$Vd, $Vn, $Vm.2d}", "$Vd = $Vdst">;
+
+class CryptoRRRR<bits<2>op0, string asm, string asmops>
+ : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm,
+ asmops, "", []> {
+ bits<5> Vm;
+ bits<5> Va;
+ let Inst{24-23} = 0b00;
+ let Inst{22-21} = op0;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0b0;
+ let Inst{14-10} = Va;
+}
+class CryptoRRRR_16B<bits<2>op0, string asm>
+ : CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b}"> {
+}
+class CryptoRRRR_4S<bits<2>op0, string asm>
+ : CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s}"> {
+}
+
+class CryptoRRRi6<string asm>
+ : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm,
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm}", "", []> {
+ bits<6> imm;
+ bits<5> Vm;
+ let Inst{24-21} = 0b0100;
+ let Inst{20-16} = Vm;
+ let Inst{15-10} = imm;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+class CryptoRRRi2Tied<bits<1>op0, bits<2>op1, string asm>
+ : BaseCryptoV82<(outs V128:$Vdst),
+ (ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm),
+ asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm}", "$Vd = $Vdst", []> {
+ bits<2> imm;
+ bits<5> Vm;
+ let Inst{24-21} = 0b0010;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0b1;
+ let Inst{14} = op0;
+ let Inst{13-12} = imm;
+ let Inst{11-10} = op1;
+}
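// A minimal sketch of how these v8.2 crypto classes are meant to be used by
// the SHA-3/SM3/SM4 instruction definitions; the predicates are real subtarget
// features, but the opcode bits shown here are placeholders for illustration:
let Predicates = [HasSHA3] in
  def SHA512H : CryptoRRRTied<0b0, 0b00, "sha512h">;
let Predicates = [HasSM4] in
  def SM3TT1A : CryptoRRRi2Tied<0b0, 0b00, "sm3tt1a">;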
//----------------------------------------------------------------------------
// v8.1 atomic instructions extension:
@@ -9910,7 +10249,7 @@ class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
let Predicates = [HasLSE];
}
-multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
+multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in
def B : BaseLDOPregister<op, order, "b", GPR32>;
@@ -9927,15 +10266,15 @@ multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
let Predicates = [HasLSE] in
multiclass LDOPregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag SrcRHS, dag DstRHS> {
- def : Pat<(!cast<SDNode>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
}
@@ -9974,15 +10313,15 @@ multiclass LDOPregister_patterns_mod<string inst, string op, string mod> {
let Predicates = [HasLSE] in
multiclass CASregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag OLD, dag NEW> {
- def : Pat<(!cast<SDNode>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<SDNode>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 40836b00b9e6..230480cf1cea 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -675,9 +674,13 @@ static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
if (!Subtarget.hasCustomCheapAsMoveHandling())
return MI.isAsCheapAsAMove();
- if (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
- isExynosShiftLeftFast(MI))
- return true;
+
+ if (Subtarget.hasExynosCheapAsMoveHandling()) {
+ if (isExynosResetFast(MI) || isExynosShiftLeftFast(MI))
+ return true;
+ else
+ return MI.isAsCheapAsAMove();
+ }
switch (MI.getOpcode()) {
default:
@@ -736,6 +739,77 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
+bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
+ unsigned Reg, Imm, Shift;
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ // MOV Rd, SP
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm())
+ return false;
+
+ Reg = MI.getOperand(1).getReg();
+ Imm = MI.getOperand(2).getImm();
+ return ((Reg == AArch64::WSP || Reg == AArch64::SP) && Imm == 0);
+
+ // Literal
+ case AArch64::ADR:
+ case AArch64::ADRP:
+ return true;
+
+ // MOVI Vd, #0
+ case AArch64::MOVID:
+ case AArch64::MOVIv8b_ns:
+ case AArch64::MOVIv2d_ns:
+ case AArch64::MOVIv16b_ns:
+ Imm = MI.getOperand(1).getImm();
+ return (Imm == 0);
+
+ // MOVI Vd, #0
+ case AArch64::MOVIv2i32:
+ case AArch64::MOVIv4i16:
+ case AArch64::MOVIv4i32:
+ case AArch64::MOVIv8i16:
+ Imm = MI.getOperand(1).getImm();
+ Shift = MI.getOperand(2).getImm();
+ return (Imm == 0 && Shift == 0);
+
+ // MOV Rd, Imm
+ case AArch64::MOVNWi:
+ case AArch64::MOVNXi:
+
+ // MOV Rd, Imm
+ case AArch64::MOVZWi:
+ case AArch64::MOVZXi:
+ return true;
+
+ // MOV Rd, Imm
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ if (!MI.getOperand(1).isReg())
+ return false;
+
+ Reg = MI.getOperand(1).getReg();
+ Imm = MI.getOperand(2).getImm();
+ return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Imm == 0);
+
+ // MOV Rd, Rm
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ if (!MI.getOperand(1).isReg())
+ return false;
+
+ Reg = MI.getOperand(1).getReg();
+ Imm = MI.getOperand(3).getImm();
+ Shift = AArch64_AM::getShiftValue(Imm);
+ return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Shift == 0);
+ }
+}
+
bool AArch64InstrInfo::isExynosShiftLeftFast(const MachineInstr &MI) const {
unsigned Imm, Shift;
AArch64_AM::ShiftExtendType Ext;
@@ -1135,7 +1209,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
return true;
}
-/// \brief Return the opcode that does not set flags when possible - otherwise
+/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
@@ -1574,7 +1648,7 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1612,7 +1686,7 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1640,7 +1714,7 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
-bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1664,7 +1738,7 @@ bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
// Return true if this instruction simply renames a general register without
// modifying bits.
-bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1694,7 +1768,7 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
// Return true if this instruction simply renames a general register without
// modifying bits.
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1763,7 +1837,7 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
/// Return true if this load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
@@ -1822,27 +1896,27 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
}
/// Check all MachineMemOperands for a hint to suppress pairing.
-bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOSuppressPair;
});
}
/// Set a flag on the first MachineMemOperand to suppress pairing.
-void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
+void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
if (MI.memoperands_empty())
return;
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
}
/// Check all MachineMemOperands for a hint that the load/store is strided.
-bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOStridedAccess;
});
}
-bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
+bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default:
return false;
@@ -1867,8 +1941,124 @@ bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
}
}
-bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const {
- return isUnscaledLdSt(MI.getOpcode());
+bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ // Scaled instructions.
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ case AArch64::STRXui:
+ case AArch64::STRWui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::LDRXui:
+ case AArch64::LDRWui:
+ case AArch64::LDRSWui:
+ // Unscaled instructions.
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
+ return true;
+ }
+}
+
+unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
+ bool &Is64Bit) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no flag setting equivalent!");
+ // 32-bit cases:
+ case AArch64::ADDWri:
+ Is64Bit = false;
+ return AArch64::ADDSWri;
+ case AArch64::ADDWrr:
+ Is64Bit = false;
+ return AArch64::ADDSWrr;
+ case AArch64::ADDWrs:
+ Is64Bit = false;
+ return AArch64::ADDSWrs;
+ case AArch64::ADDWrx:
+ Is64Bit = false;
+ return AArch64::ADDSWrx;
+ case AArch64::ANDWri:
+ Is64Bit = false;
+ return AArch64::ANDSWri;
+ case AArch64::ANDWrr:
+ Is64Bit = false;
+ return AArch64::ANDSWrr;
+ case AArch64::ANDWrs:
+ Is64Bit = false;
+ return AArch64::ANDSWrs;
+ case AArch64::BICWrr:
+ Is64Bit = false;
+ return AArch64::BICSWrr;
+ case AArch64::BICWrs:
+ Is64Bit = false;
+ return AArch64::BICSWrs;
+ case AArch64::SUBWri:
+ Is64Bit = false;
+ return AArch64::SUBSWri;
+ case AArch64::SUBWrr:
+ Is64Bit = false;
+ return AArch64::SUBSWrr;
+ case AArch64::SUBWrs:
+ Is64Bit = false;
+ return AArch64::SUBSWrs;
+ case AArch64::SUBWrx:
+ Is64Bit = false;
+ return AArch64::SUBSWrx;
+ // 64-bit cases:
+ case AArch64::ADDXri:
+ Is64Bit = true;
+ return AArch64::ADDSXri;
+ case AArch64::ADDXrr:
+ Is64Bit = true;
+ return AArch64::ADDSXrr;
+ case AArch64::ADDXrs:
+ Is64Bit = true;
+ return AArch64::ADDSXrs;
+ case AArch64::ADDXrx:
+ Is64Bit = true;
+ return AArch64::ADDSXrx;
+ case AArch64::ANDXri:
+ Is64Bit = true;
+ return AArch64::ANDSXri;
+ case AArch64::ANDXrr:
+ Is64Bit = true;
+ return AArch64::ANDSXrr;
+ case AArch64::ANDXrs:
+ Is64Bit = true;
+ return AArch64::ANDSXrs;
+ case AArch64::BICXrr:
+ Is64Bit = true;
+ return AArch64::BICSXrr;
+ case AArch64::BICXrs:
+ Is64Bit = true;
+ return AArch64::BICSXrs;
+ case AArch64::SUBXri:
+ Is64Bit = true;
+ return AArch64::SUBSXri;
+ case AArch64::SUBXrr:
+ Is64Bit = true;
+ return AArch64::SUBSXrr;
+ case AArch64::SUBXrs:
+ Is64Bit = true;
+ return AArch64::SUBSXrs;
+ case AArch64::SUBXrx:
+ Is64Bit = true;
+ return AArch64::SUBSXrx;
+ }
}
// Is this a candidate for ld/st merging or pairing? For example, we don't
@@ -2592,6 +2782,16 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov1d;
Offset = false;
+ } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, MBBI, DL, get(AArch64::STPXi))
+ .addReg(TRI->getSubReg(SrcReg, AArch64::sube64),
+ getKillRegState(isKill))
+ .addReg(TRI->getSubReg(SrcReg, AArch64::subo64),
+ getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ return;
}
break;
case 24:
@@ -2690,6 +2890,16 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov1d;
Offset = false;
+ } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, MBBI, DL, get(AArch64::LDPXi))
+ .addReg(TRI->getSubReg(DestReg, AArch64::sube64),
+ getDefRegState(true))
+ .addReg(TRI->getSubReg(DestReg, AArch64::subo64),
+ getDefRegState(true))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ return;
}
break;
case 24:
@@ -4432,7 +4642,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
DelInstrs.push_back(&Root);
}
-/// \brief Replace csincr-branch sequence by simple conditional branch
+/// Replace csincr-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
@@ -4690,213 +4900,377 @@ AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
+ /// \p MachineOutlinerThunk implies that the function is being created from
+ /// a sequence of instructions ending in a call. The outlined function is
+ /// called with a BL instruction, and the outlined function tail-calls the
+ /// original call destination.
+ ///
+ /// That is,
+ ///
+ /// I1 OUTLINED_FUNCTION:
+ /// I2 --> BL OUTLINED_FUNCTION I1
+ /// BL f I2
+ /// B f
+ /// * Call construction overhead: 1 (BL)
+ /// * Frame construction overhead: 0
+ /// * Requires stack fixups? No
+ ///
enum MachineOutlinerClass {
MachineOutlinerDefault, /// Emit a save, restore, call, and return.
MachineOutlinerTailCall, /// Only emit a branch.
- MachineOutlinerNoLRSave /// Emit a call and return.
+ MachineOutlinerNoLRSave, /// Emit a call and return.
+ MachineOutlinerThunk, /// Emit a call and tail-call.
};
-bool AArch64InstrInfo::canOutlineWithoutLRSave(
- MachineBasicBlock::iterator &CallInsertionPt) const {
- // Was LR saved in the function containing this basic block?
- MachineBasicBlock &MBB = *(CallInsertionPt->getParent());
- LiveRegUnits LRU(getRegisterInfo());
- LRU.addLiveOuts(MBB);
-
- // Get liveness information from the end of the block to the end of the
- // prospective outlined region.
- std::for_each(MBB.rbegin(),
- (MachineBasicBlock::reverse_iterator)CallInsertionPt,
- [&LRU](MachineInstr &MI) { LRU.stepBackward(MI); });
-
- // If the link register is available at this point, then we can safely outline
- // the region without saving/restoring LR. Otherwise, we must emit a save and
- // restore.
- return LRU.available(AArch64::LR);
-}
+enum MachineOutlinerMBBFlags {
+ LRUnavailableSomewhere = 0x2,
+ HasCalls = 0x4
+};
-AArch64GenInstrInfo::MachineOutlinerInfo
-AArch64InstrInfo::getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const {
+outliner::OutlinedFunction
+AArch64InstrInfo::getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+ unsigned SequenceSize = std::accumulate(
+ RepeatedSequenceLocs[0].front(),
+ std::next(RepeatedSequenceLocs[0].back()),
+ 0, [this](unsigned Sum, const MachineInstr &MI) {
+ return Sum + getInstSizeInBytes(MI);
+ });
+
+ // Compute liveness information for each candidate.
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
+ [&TRI](outliner::Candidate &C) { C.initLRU(TRI); });
+
+ // According to the AArch64 Procedure Call Standard, the following are
+ // undefined on entry/exit from a function call:
+ //
+ // * Registers x16, x17, (and thus w16, w17)
+ // * Condition codes (and thus the NZCV register)
+ //
+ // Because of this, we can't outline any sequence of instructions where one
+ // of these registers is live into/across it. Thus, we need to delete those
+ // candidates.
+ auto CantGuaranteeValueAcrossCall = [](outliner::Candidate &C) {
+ LiveRegUnits LRU = C.LRU;
+ return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
+ !LRU.available(AArch64::NZCV));
+ };
+
+ // Erase every candidate that violates the restrictions above. (It could be
+ // true that we have viable candidates, so it's not worth bailing out in
+ // the case that, say, 1 out of 20 candidates violate the restrictions.)
+ RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
+ RepeatedSequenceLocs.end(),
+ CantGuaranteeValueAcrossCall),
+ RepeatedSequenceLocs.end());
+
+ // If the sequence is empty, we're done.
+ if (RepeatedSequenceLocs.empty())
+ return outliner::OutlinedFunction();
+
+ // At this point, we have only "safe" candidates to outline. Figure out
+ // frame + call instruction information.
+
+ unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
+
+ // Helper lambda which sets call information for every candidate.
+ auto SetCandidateCallInfo =
+ [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(CallID, NumBytesForCall);
+ };
- unsigned CallID = MachineOutlinerDefault;
unsigned FrameID = MachineOutlinerDefault;
- unsigned NumInstrsForCall = 3;
- unsigned NumInstrsToCreateFrame = 1;
-
- auto DoesntNeedLRSave =
- [this](std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>
- &I) { return canOutlineWithoutLRSave(I.second); };
+ unsigned NumBytesToCreateFrame = 4;
// If the last instruction in any candidate is a terminator, then we should
// tail call all of the candidates.
- if (RepeatedSequenceLocs[0].second->isTerminator()) {
- CallID = MachineOutlinerTailCall;
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) {
FrameID = MachineOutlinerTailCall;
- NumInstrsForCall = 1;
- NumInstrsToCreateFrame = 0;
+ NumBytesToCreateFrame = 0;
+ SetCandidateCallInfo(MachineOutlinerTailCall, 4);
+ }
+
+ else if (LastInstrOpcode == AArch64::BL || LastInstrOpcode == AArch64::BLR) {
+ // FIXME: Do we need to check if the code after this uses the value of LR?
+ FrameID = MachineOutlinerThunk;
+ NumBytesToCreateFrame = 0;
+ SetCandidateCallInfo(MachineOutlinerThunk, 4);
+ }
+
+ // Make sure that LR isn't live on entry to this candidate. The only
+ // instructions that use LR that could possibly appear in a repeated sequence
+ // are calls. Therefore, we only have to check and see if LR is dead on entry
+ // to (or exit from) some candidate.
+ else if (std::all_of(RepeatedSequenceLocs.begin(),
+ RepeatedSequenceLocs.end(),
+ [](outliner::Candidate &C) {
+ return C.LRU.available(AArch64::LR);
+ })) {
+ FrameID = MachineOutlinerNoLRSave;
+ NumBytesToCreateFrame = 4;
+ SetCandidateCallInfo(MachineOutlinerNoLRSave, 4);
}
- else if (std::all_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
- DoesntNeedLRSave)) {
- CallID = MachineOutlinerNoLRSave;
- FrameID = MachineOutlinerNoLRSave;
- NumInstrsForCall = 1;
- NumInstrsToCreateFrame = 1;
+ // LR is live, so we need to save it to the stack.
+ else {
+ FrameID = MachineOutlinerDefault;
+ NumBytesToCreateFrame = 4;
+ SetCandidateCallInfo(MachineOutlinerDefault, 12);
}
// Check if the range contains a call. These require a save + restore of the
// link register.
- if (std::any_of(RepeatedSequenceLocs[0].first, RepeatedSequenceLocs[0].second,
+ if (std::any_of(RepeatedSequenceLocs[0].front(),
+ RepeatedSequenceLocs[0].back(),
[](const MachineInstr &MI) { return MI.isCall(); }))
- NumInstrsToCreateFrame += 2; // Save + restore the link register.
+ NumBytesToCreateFrame += 8; // Save + restore the link register.
// Handle the last instruction separately. If this is a tail call, then the
// last instruction is a call. We don't want to save + restore in this case.
// However, it could be possible that the last instruction is a call without
// it being valid to tail call this sequence. We should consider this as well.
- else if (RepeatedSequenceLocs[0].second->isCall() &&
- FrameID != MachineOutlinerTailCall)
- NumInstrsToCreateFrame += 2;
+ else if (FrameID != MachineOutlinerThunk &&
+ FrameID != MachineOutlinerTailCall &&
+ RepeatedSequenceLocs[0].back()->isCall())
+ NumBytesToCreateFrame += 8;
- return MachineOutlinerInfo(NumInstrsForCall, NumInstrsToCreateFrame, CallID,
- FrameID);
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
+ NumBytesToCreateFrame, FrameID);
}
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
const Function &F = MF.getFunction();
- // If F uses a redzone, then don't outline from it because it might mess up
- // the stack.
- if (!F.hasFnAttribute(Attribute::NoRedZone))
+ // Can F be deduplicated by the linker? If it can, don't outline from it.
+ if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
return false;
- // If anyone is using the address of this function, don't outline from it.
- if (F.hasAddressTaken())
+ // Don't outline from functions with section markings; the program could
+ // expect that all the code is in the named section.
+ // FIXME: Allow outlining from multiple functions with the same section
+ // marking.
+ if (F.hasSection())
return false;
- // Can F be deduplicated by the linker? If it can, don't outline from it.
- if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
+ // Outlining from functions with redzones is unsafe since the outliner may
+ // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
+ // outline from it.
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (!AFI || AFI->hasRedZone().getValueOr(true))
return false;
+ // It's safe to outline from MF.
return true;
}
-AArch64GenInstrInfo::MachineOutlinerInstrType
-AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
+unsigned
+AArch64InstrInfo::getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const {
+ unsigned Flags = 0x0;
+ // Check if there's a call inside this MachineBasicBlock. If there is, then
+ // set a flag.
+ if (std::any_of(MBB.begin(), MBB.end(),
+ [](MachineInstr &MI) { return MI.isCall(); }))
+ Flags |= MachineOutlinerMBBFlags::HasCalls;
+
+ // Check if LR is available through all of the MBB. If it's not, then set
+ // a flag.
+ assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
+ "Suitable Machine Function for outlining must track liveness");
+ LiveRegUnits LRU(getRegisterInfo());
+ LRU.addLiveOuts(MBB);
+
+ std::for_each(MBB.rbegin(),
+ MBB.rend(),
+ [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
- MachineFunction *MF = MI.getParent()->getParent();
+ if (!LRU.available(AArch64::LR))
+ Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
+
+ return Flags;
+}
+
+outliner::InstrType
+AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
+ unsigned Flags) const {
+ MachineInstr &MI = *MIT;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
// Don't outline LOHs.
if (FuncInfo->getLOHRelated().count(&MI))
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Don't allow debug values to impact outlining type.
- if (MI.isDebugValue() || MI.isIndirectDebugValue())
- return MachineOutlinerInstrType::Invisible;
-
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+ // At this point, KILL instructions don't really tell us much so we can go
+ // ahead and skip over them.
+ if (MI.isKill())
+ return outliner::InstrType::Invisible;
+
// Is this a terminator for a basic block?
if (MI.isTerminator()) {
// Is this the end of a function?
if (MI.getParent()->succ_empty())
- return MachineOutlinerInstrType::Legal;
-
+ return outliner::InstrType::Legal;
+
// It's not, so don't outline it.
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
}
- // Outline calls without stack parameters or aggregate parameters.
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return outliner::InstrType::Illegal;
+
+ // If it uses LR or W30 explicitly, then don't touch it.
+ if (MOP.isReg() && !MOP.isImplicit() &&
+ (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
+ return outliner::InstrType::Illegal;
+ }
+
+ // Special cases for instructions that can always be outlined, but will fail
+ // the later tests, e.g. ADRPs, which are PC-relative and use LR, but can
+ // always be outlined because they don't require a *specific* value to be in LR.
+ if (MI.getOpcode() == AArch64::ADRP)
+ return outliner::InstrType::Legal;
+
+ // If MI is a call we might be able to outline it. We don't want to outline
+ // any calls that rely on the position of items on the stack. When we outline
+ // something containing a call, we have to emit a save and restore of LR in
+ // the outlined function. Currently, this always happens by saving LR to the
+ // stack. Thus, if we outline, say, half the parameters for a function call
+ // plus the call, then we'll break the callee's expectations for the layout
+ // of the stack.
+ //
+ // FIXME: Allow calls to functions which construct a stack frame, as long
+ // as they don't access arguments on the stack.
+ // FIXME: Figure out some way to analyze functions defined in other modules.
+ // We should be able to compute the memory usage based on the IR calling
+ // convention, even if we can't see the definition.
if (MI.isCall()) {
- const Module *M = MF->getFunction().getParent();
- assert(M && "No module?");
-
// Get the function associated with the call. Look at each operand and find
// the one that represents the callee and get its name.
- Function *Callee = nullptr;
+ const Function *Callee = nullptr;
for (const MachineOperand &MOP : MI.operands()) {
- if (MOP.isSymbol()) {
- Callee = M->getFunction(MOP.getSymbolName());
- break;
- }
-
- else if (MOP.isGlobal()) {
- Callee = M->getFunction(MOP.getGlobal()->getGlobalIdentifier());
+ if (MOP.isGlobal()) {
+ Callee = dyn_cast<Function>(MOP.getGlobal());
break;
}
}
- // Only handle functions that we have information about.
+ // Never outline calls to mcount. There isn't any rule that would require
+ // this, but the Linux kernel's "ftrace" feature depends on it.
+ if (Callee && Callee->getName() == "\01_mcount")
+ return outliner::InstrType::Illegal;
+
+ // If we don't know anything about the callee, assume it depends on the
+ // stack layout of the caller. In that case, it's only legal to outline
+ // as a tail-call. Whitelist the call instructions we know about so we
+ // don't get unexpected results with call pseudo-instructions.
+ auto UnknownCallOutlineType = outliner::InstrType::Illegal;
+ if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
+ UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
+
if (!Callee)
- return MachineOutlinerInstrType::Illegal;
+ return UnknownCallOutlineType;
// We have a function we have information about. Check if it's something we
// can safely outline.
-
- // If the callee is vararg, it passes parameters on the stack. Don't touch
- // it.
- // FIXME: Functions like printf are very common and we should be able to
- // outline them.
- if (Callee->isVarArg())
- return MachineOutlinerInstrType::Illegal;
-
- // Check if any of the arguments are a pointer to a struct. We don't want
- // to outline these since they might be loaded in two instructions.
- for (Argument &Arg : Callee->args()) {
- if (Arg.getType()->isPointerTy() &&
- Arg.getType()->getPointerElementType()->isAggregateType())
- return MachineOutlinerInstrType::Illegal;
- }
-
- // If the thing we're calling doesn't access memory at all, then we're good
- // to go.
- if (Callee->doesNotAccessMemory())
- return MachineOutlinerInstrType::Legal;
-
- // It accesses memory. Get the machine function for the callee to see if
- // it's safe to outline.
MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
// We don't know what's going on with the callee at all. Don't touch it.
if (!CalleeMF)
- return MachineOutlinerInstrType::Illegal;
+ return UnknownCallOutlineType;
- // Does it pass anything on the stack? If it does, don't outline it.
- if (CalleeMF->getInfo<AArch64FunctionInfo>()->getBytesInStackArgArea() != 0)
- return MachineOutlinerInstrType::Illegal;
+ // Check if we know anything about the callee saves on the function. If we
+ // don't, then don't touch it, since that implies that we haven't
+ // computed anything about its stack frame yet.
+ MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
+ MFI.getNumObjects() > 0)
+ return UnknownCallOutlineType;
- // It doesn't, so it's safe to outline and we're done.
- return MachineOutlinerInstrType::Legal;
+ // At this point, we can say that CalleeMF ought to not pass anything on the
+ // stack. Therefore, we can outline it.
+ return outliner::InstrType::Legal;
}
// Don't outline positions.
if (MI.isPosition())
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Don't touch the link register or W30.
if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
- return MachineOutlinerInstrType::Illegal;
-
- // Make sure none of the operands are un-outlinable.
- for (const MachineOperand &MOP : MI.operands()) {
- if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
- MOP.isTargetIndex())
- return MachineOutlinerInstrType::Illegal;
-
- // Don't outline anything that uses the link register.
- if (MOP.isReg() && getRegisterInfo().regsOverlap(MOP.getReg(), AArch64::LR))
- return MachineOutlinerInstrType::Illegal;
- }
+ return outliner::InstrType::Illegal;
// Does this use the stack?
if (MI.modifiesRegister(AArch64::SP, &RI) ||
MI.readsRegister(AArch64::SP, &RI)) {
-
+ // True if there is no chance that any outlined candidate from this range
+ // could require stack fixups. That is, both
+ // * LR is available in the range (No save/restore around call)
+ // * The range doesn't include calls (No save/restore in outlined frame)
+ // are true.
+ // FIXME: This is very restrictive; the flags check the whole block,
+ // not just the bit we will try to outline.
+ bool MightNeedStackFixUp =
+ (Flags & (MachineOutlinerMBBFlags::LRUnavailableSomewhere |
+ MachineOutlinerMBBFlags::HasCalls));
+
+ // If this instruction is in a range where it *never* needs to be fixed
+ // up, then we can *always* outline it. This is true even if it's not
+ // possible to fix that instruction up.
+ //
+ // Why? Consider two equivalent instructions I1, I2 where both I1 and I2
+ // use SP. Suppose that I1 sits within a range that definitely doesn't
+ // need stack fixups, while I2 sits in a range that does.
+ //
+ // First, I1 can be outlined as long as we *never* fix up the stack in
+ // any sequence containing it. I1 is already a safe instruction in the
+ // original program, so as long as we don't modify it we're good to go.
+ // So this leaves us with showing that outlining I2 won't break our
+ // program.
+ //
+ // Suppose I1 and I2 belong to equivalent candidate sequences. When we
+ // look at I2, we need to see if it can be fixed up. Suppose I2 (and
+ // thus I1) cannot be fixed up. Then I2 will be assigned a unique
+ // integer label; thus, I2 cannot belong to any candidate sequence (a
+ // contradiction). Suppose I2 can be fixed up. Then I1 can be fixed up
+ // as well, so we're good. Thus, I1 is always safe to outline.
+ //
+ // This gives us two things: first off, it buys us some more instructions
+ // for our search space by deeming stack instructions illegal only when
+ // they can't be fixed up AND we might have to fix them up. Second off,
+ // this allows us to catch tricky instructions like, say,
+ // %xi = ADDXri %sp, n, 0. We can't safely outline these since they might
+ // be paired with later SUBXris, which might *not* end up being outlined.
+ // If we mess with the stack to save something and an ADDXri then messes
+ // with it *after*, we aren't going to restore the right thing from the
+ // stack if we don't outline the corresponding SUBXri as well. ADDXris and
+ // SUBXris are extremely common in prologue/epilogue code, so supporting
+ // them in the outliner can be a pretty big win!
+ if (!MightNeedStackFixUp)
+ return outliner::InstrType::Legal;
+
+ // Any modification of SP will break our code to save/restore LR.
+ // FIXME: We could handle some instructions which add a constant offset to
+ // SP, with a bit more work.
+ if (MI.modifiesRegister(AArch64::SP, &RI))
+ return outliner::InstrType::Illegal;
+
+ // At this point, we have a stack instruction that we might need to fix
+ // up. We'll handle it if it's a load or store.
if (MI.mayLoadOrStore()) {
unsigned Base; // Filled with the base register of MI.
int64_t Offset; // Filled with the offset of MI.
@@ -4905,7 +5279,7 @@ AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
// Does it allow us to offset the base register and is the base SP?
if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) ||
Base != AArch64::SP)
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Find the minimum/maximum offset for this instruction and check if
// fixing it up would be in range.
@@ -4918,17 +5292,19 @@ AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
// to a MIR test, it really ought to be checked.
Offset += 16; // Update the offset to what it would be if we outlined.
if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// It's in range, so we can outline it.
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
}
+ // FIXME: Add handling for instructions like "add x0, sp, #8".
+
// We can't fix it up, so don't outline it.
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
}
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
}
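A hedged worked example of the offset fix-up check in the load/store case above, using assumed properties of a scaled 64-bit load (LDRXui-style: Scale 8, offset range [0, 4095]):

#include <cassert>
#include <cstdint>

int main() {
  int64_t Scale = 8, MinOffset = 0, MaxOffset = 4095; // assumed for LDRXui
  int64_t Offset = 8;  // original SP-relative offset of the access
  Offset += 16;        // the 16-byte LR spill added when the code is outlined
  // 24 is still inside [0 * 8, 4095 * 8], so the access stays outlinable; an
  // offset already at the top of the range would be pushed out and rejected.
  assert(Offset >= MinOffset * Scale && Offset <= MaxOffset * Scale);
  return 0;
}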
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
@@ -4959,15 +5335,36 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
}
}
-void AArch64InstrInfo::insertOutlinerEpilogue(
+void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {
+ const outliner::OutlinedFunction &OF) const {
+ // For thunk outlining, rewrite the last instruction from a call to a
+ // tail-call.
+ if (OF.FrameConstructionID == MachineOutlinerThunk) {
+ MachineInstr *Call = &*--MBB.instr_end();
+ unsigned TailOpcode;
+ if (Call->getOpcode() == AArch64::BL) {
+ TailOpcode = AArch64::TCRETURNdi;
+ } else {
+ assert(Call->getOpcode() == AArch64::BLR);
+ TailOpcode = AArch64::TCRETURNri;
+ }
+ MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
+ .add(Call->getOperand(0))
+ .addImm(0);
+ MBB.insert(MBB.end(), TC);
+ Call->eraseFromParent();
+ }
// Is there a call in the outlined range?
- if (std::any_of(MBB.instr_begin(), MBB.instr_end(),
- [](MachineInstr &MI) { return MI.isCall(); })) {
+ auto IsNonTailCall = [](MachineInstr &MI) {
+ return MI.isCall() && !MI.isReturn();
+ };
+ if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
+ assert(OF.FrameConstructionID != MachineOutlinerDefault &&
+ "Can only fix up stack references once");
fixupPostOutline(MBB);
// LR has to be a live in so that we can save it.
@@ -4976,7 +5373,8 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
MachineBasicBlock::iterator It = MBB.begin();
MachineBasicBlock::iterator Et = MBB.end();
- if (MInfo.FrameConstructionID == MachineOutlinerTailCall)
+ if (OF.FrameConstructionID == MachineOutlinerTailCall ||
+ OF.FrameConstructionID == MachineOutlinerThunk)
Et = std::prev(MBB.end());
// Insert a save before the outlined region
@@ -4987,6 +5385,25 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
.addImm(-16);
It = MBB.insert(It, STRXpre);
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const MCRegisterInfo *MRI = STI.getRegisterInfo();
+ unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
+
+ // Add a CFI saying the stack was moved 16 B down.
+ int64_t StackPosEntry =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
+ BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(StackPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ // Add a CFI saying that the LR that we want to find is now 16 B higher than
+ // before.
+ int64_t LRPosEntry =
+ MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
+ BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(LRPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+
// Insert a restore before the terminator for the function.
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
@@ -4997,7 +5414,8 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
}
// If this is a tail call outlined function, then there's already a return.
- if (MInfo.FrameConstructionID == MachineOutlinerTailCall)
+ if (OF.FrameConstructionID == MachineOutlinerTailCall ||
+ OF.FrameConstructionID == MachineOutlinerThunk)
return;
// It's not a tail call, so we have to insert the return ourselves.
@@ -5006,7 +5424,7 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
MBB.insert(MBB.end(), ret);
// Did we have to modify the stack by saving the link register?
- if (MInfo.FrameConstructionID == MachineOutlinerNoLRSave)
+ if (OF.FrameConstructionID == MachineOutlinerNoLRSave)
return;
// We modified the stack.
@@ -5014,30 +5432,31 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
fixupPostOutline(MBB);
}
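For orientation, a hedged mock-up (AArch64 assembly, name assumed; CFI directives omitted) of the frame built above for a default outlined function whose body contains a non-tail call:

OUTLINED_FUNCTION_0:                  // hypothetical outlined function
        str     x30, [sp, #-16]!      // save LR; the inner bl clobbers it
        // ... the outlined instructions, including the inner bl ...
        ldr     x30, [sp], #16        // restore LR
        ret                           // appended: not a tail-call/thunk frame

The two CFI_INSTRUCTIONs inserted above additionally describe the stack adjustment and the saved LR so unwinders can still find the caller.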
-void AArch64InstrInfo::insertOutlinerPrologue(
- MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {}
-
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
- MachineFunction &MF, const MachineOutlinerInfo &MInfo) const {
+ MachineFunction &MF, const outliner::Candidate &C) const {
// Are we tail calling?
- if (MInfo.CallConstructionID == MachineOutlinerTailCall) {
+ if (C.CallConstructionID == MachineOutlinerTailCall) {
// If yes, then we can just branch to the label.
- It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::B))
- .addGlobalAddress(M.getNamedValue(MF.getName())));
+ It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
+ .addGlobalAddress(M.getNamedValue(MF.getName()))
+ .addImm(0));
return It;
}
// Are we saving the link register?
- if (MInfo.CallConstructionID == MachineOutlinerNoLRSave) {
+ if (C.CallConstructionID == MachineOutlinerNoLRSave ||
+ C.CallConstructionID == MachineOutlinerThunk) {
// No, so just insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
return It;
}
+ // We want to return the spot where we inserted the call.
+ MachineBasicBlock::iterator CallPt;
+
// We have a default call. Save the link register.
MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
@@ -5050,7 +5469,7 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
// Insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
-
+ CallPt = It;
It++;
// Restore the link register.
@@ -5061,5 +5480,5 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
.addImm(16);
It = MBB.insert(It, LDRXpost);
- return It;
+ return CallPt;
}
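On the caller side, a hedged mock-up of what insertOutlinedCall emits for each CallConstructionID (AArch64 assembly; the outlined function name is assumed):

// MachineOutlinerTailCall: TCRETURNdi prints as a plain branch.
        b       OUTLINED_FUNCTION_0
// MachineOutlinerNoLRSave and MachineOutlinerThunk: a bare call.
        bl      OUTLINED_FUNCTION_0
// MachineOutlinerDefault: wrap the call in an LR save/restore.
        str     x30, [sp, #-16]!
        bl      OUTLINED_FUNCTION_0
        ldr     x30, [sp], #16

In the default case the iterator handed back now points at the bl itself (CallPt) rather than at the trailing ldr.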
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 2f10bef1e474..0e5953f6216d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -64,165 +64,51 @@ public:
/// Returns true if there is a shiftable register and that the shift value
/// is non-zero.
- bool hasShiftedReg(const MachineInstr &MI) const;
+ static bool hasShiftedReg(const MachineInstr &MI);
/// Returns true if there is an extendable register and that the extending
/// value is non-zero.
- bool hasExtendedReg(const MachineInstr &MI) const;
+ static bool hasExtendedReg(const MachineInstr &MI);
- /// \brief Does this instruction set its full destination register to zero?
- bool isGPRZero(const MachineInstr &MI) const;
+ /// Does this instruction set its full destination register to zero?
+ static bool isGPRZero(const MachineInstr &MI);
- /// \brief Does this instruction rename a GPR without modifying bits?
- bool isGPRCopy(const MachineInstr &MI) const;
+ /// Does this instruction rename a GPR without modifying bits?
+ static bool isGPRCopy(const MachineInstr &MI);
- /// \brief Does this instruction rename an FPR without modifying bits?
- bool isFPRCopy(const MachineInstr &MI) const;
+ /// Does this instruction rename an FPR without modifying bits?
+ static bool isFPRCopy(const MachineInstr &MI);
/// Return true if this load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
- bool isScaledAddr(const MachineInstr &MI) const;
+ static bool isScaledAddr(const MachineInstr &MI);
/// Return true if pairing the given load or store is hinted to be
/// unprofitable.
- bool isLdStPairSuppressed(const MachineInstr &MI) const;
+ static bool isLdStPairSuppressed(const MachineInstr &MI);
/// Return true if the given load or store is a strided memory access.
- bool isStridedAccess(const MachineInstr &MI) const;
+ static bool isStridedAccess(const MachineInstr &MI);
/// Return true if this is an unscaled load/store.
- bool isUnscaledLdSt(unsigned Opc) const;
-
- /// Return true if this is an unscaled load/store.
- bool isUnscaledLdSt(MachineInstr &MI) const;
-
- static bool isPairableLdStInst(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- return false;
- // Scaled instructions.
- case AArch64::STRSui:
- case AArch64::STRDui:
- case AArch64::STRQui:
- case AArch64::STRXui:
- case AArch64::STRWui:
- case AArch64::LDRSui:
- case AArch64::LDRDui:
- case AArch64::LDRQui:
- case AArch64::LDRXui:
- case AArch64::LDRWui:
- case AArch64::LDRSWui:
- // Unscaled instructions.
- case AArch64::STURSi:
- case AArch64::STURDi:
- case AArch64::STURQi:
- case AArch64::STURWi:
- case AArch64::STURXi:
- case AArch64::LDURSi:
- case AArch64::LDURDi:
- case AArch64::LDURQi:
- case AArch64::LDURWi:
- case AArch64::LDURXi:
- case AArch64::LDURSWi:
- return true;
- }
+ static bool isUnscaledLdSt(unsigned Opc);
+ static bool isUnscaledLdSt(MachineInstr &MI) {
+ return isUnscaledLdSt(MI.getOpcode());
}
- /// \brief Return the opcode that set flags when possible. The caller is
+ /// Return true if the given load or store may be paired with another.
+ static bool isPairableLdStInst(const MachineInstr &MI);
+
+ /// Return the opcode that sets flags when possible. The caller is
/// responsible for ensuring the opc has a flag setting equivalent.
- static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) {
- switch (Opc) {
- default:
- llvm_unreachable("Opcode has no flag setting equivalent!");
- // 32-bit cases:
- case AArch64::ADDWri:
- Is64Bit = false;
- return AArch64::ADDSWri;
- case AArch64::ADDWrr:
- Is64Bit = false;
- return AArch64::ADDSWrr;
- case AArch64::ADDWrs:
- Is64Bit = false;
- return AArch64::ADDSWrs;
- case AArch64::ADDWrx:
- Is64Bit = false;
- return AArch64::ADDSWrx;
- case AArch64::ANDWri:
- Is64Bit = false;
- return AArch64::ANDSWri;
- case AArch64::ANDWrr:
- Is64Bit = false;
- return AArch64::ANDSWrr;
- case AArch64::ANDWrs:
- Is64Bit = false;
- return AArch64::ANDSWrs;
- case AArch64::BICWrr:
- Is64Bit = false;
- return AArch64::BICSWrr;
- case AArch64::BICWrs:
- Is64Bit = false;
- return AArch64::BICSWrs;
- case AArch64::SUBWri:
- Is64Bit = false;
- return AArch64::SUBSWri;
- case AArch64::SUBWrr:
- Is64Bit = false;
- return AArch64::SUBSWrr;
- case AArch64::SUBWrs:
- Is64Bit = false;
- return AArch64::SUBSWrs;
- case AArch64::SUBWrx:
- Is64Bit = false;
- return AArch64::SUBSWrx;
- // 64-bit cases:
- case AArch64::ADDXri:
- Is64Bit = true;
- return AArch64::ADDSXri;
- case AArch64::ADDXrr:
- Is64Bit = true;
- return AArch64::ADDSXrr;
- case AArch64::ADDXrs:
- Is64Bit = true;
- return AArch64::ADDSXrs;
- case AArch64::ADDXrx:
- Is64Bit = true;
- return AArch64::ADDSXrx;
- case AArch64::ANDXri:
- Is64Bit = true;
- return AArch64::ANDSXri;
- case AArch64::ANDXrr:
- Is64Bit = true;
- return AArch64::ANDSXrr;
- case AArch64::ANDXrs:
- Is64Bit = true;
- return AArch64::ANDSXrs;
- case AArch64::BICXrr:
- Is64Bit = true;
- return AArch64::BICSXrr;
- case AArch64::BICXrs:
- Is64Bit = true;
- return AArch64::BICSXrs;
- case AArch64::SUBXri:
- Is64Bit = true;
- return AArch64::SUBSXri;
- case AArch64::SUBXrr:
- Is64Bit = true;
- return AArch64::SUBSXrr;
- case AArch64::SUBXrs:
- Is64Bit = true;
- return AArch64::SUBSXrs;
- case AArch64::SUBXrx:
- Is64Bit = true;
- return AArch64::SUBSXrx;
- }
- }
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit);
/// Return true if this is a load/store that can be potentially paired/merged.
bool isCandidateToMergeOrPair(MachineInstr &MI) const;
/// Hint that pairing the given load or store is unprofitable.
- void suppressLdStPair(MachineInstr &MI) const;
+ static void suppressLdStPair(MachineInstr &MI);
bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
int64_t &Offset,
@@ -235,7 +121,7 @@ public:
/// Return the immediate offset of the base register in a load/store \p LdSt.
MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const;
- /// \brief Returns true if opcode \p Opc is a memory operation. If it is, set
+ /// Returns true if opcode \p Opc is a memory operation. If it is, set
/// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
///
/// For unscaled instructions, \p Scale is set to 1.
@@ -350,24 +236,22 @@ public:
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
getSerializableMachineMemOperandTargetFlags() const override;
- bool
- canOutlineWithoutLRSave(MachineBasicBlock::iterator &CallInsertionPt) const;
bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
bool OutlineFromLinkOnceODRs) const override;
- MachineOutlinerInfo getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const override;
- AArch64GenInstrInfo::MachineOutlinerInstrType
- getOutliningType(MachineInstr &MI) const override;
- void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
- void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
+ outliner::InstrType
+ getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
+ unsigned getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const override;
+ void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const override;
MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ const outliner::Candidate &C) const override;
+ /// Returns true if the instruction sets a register to an immediate value
+ /// that can be executed more efficiently.
+ bool isExynosResetFast(const MachineInstr &MI) const;
/// Returns true if the instruction has a shift left that can be executed
/// more efficiently.
bool isExynosShiftLeftFast(const MachineInstr &MI) const;
@@ -376,7 +260,7 @@ public:
bool isFalkorShiftExtFast(const MachineInstr &MI) const;
private:
- /// \brief Sets the offsets on outlined instructions in \p MBB which use SP
+ /// Sets the offsets on outlined instructions in \p MBB which use SP
/// so that they will be valid post-outlining.
///
/// \param MBB A \p MachineBasicBlock in an outlined function.
@@ -406,14 +290,14 @@ bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const AArch64InstrInfo *TII);
-/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal.
+/// Use to report the frame offset status in isAArch64FrameOffsetLegal.
enum AArch64FrameOffsetStatus {
AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal.
AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly.
};
-/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// Check if the @p Offset is a valid frame offset for @p MI.
/// The returned value reports the validity of the frame offset for @p MI.
/// It uses the values defined by AArch64FrameOffsetStatus for that.
/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 040011d858e7..d6b8bb5d89c7 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -20,12 +20,22 @@ def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
+ AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasSM4 : Predicate<"Subtarget->hasSM4()">,
+ AssemblerPredicate<"FeatureSM4", "sm4">;
+def HasSHA3 : Predicate<"Subtarget->hasSHA3()">,
+ AssemblerPredicate<"FeatureSHA3", "sha3">;
+def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
+ AssemblerPredicate<"FeatureSHA2", "sha2">;
+def HasAES : Predicate<"Subtarget->hasAES()">,
+ AssemblerPredicate<"FeatureAES", "aes">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
AssemblerPredicate<"FeatureDotProd", "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
@@ -396,6 +406,15 @@ def MOVaddrEXT
[(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
texternalsym:$low))]>,
Sched<[WriteAdrAdr]>;
+// Normally AArch64addlow either gets folded into a following ldr/str,
+// or together with an adrp into MOVaddr above. For cases with TLS, it
+// might appear without either of them, so allow lowering it into a plain
+// add.
+def ADDlowTLS
+ : Pseudo<(outs GPR64:$dst), (ins GPR64:$src, i64imm:$low),
+ [(set GPR64:$dst, (AArch64addlow GPR64:$src,
+ tglobaltlsaddr:$low))]>,
+ Sched<[WriteAdr]>;
} // isReMaterializable, isCodeGenOnly
@@ -420,6 +439,7 @@ def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
+def : InstAlias<"csdb", (HINT 20)>;
// v8.2a Statistical Profiling extension
def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
@@ -439,20 +459,46 @@ def DSB : CRmSystemI<barrier_op, 0b100, "dsb",
def ISB : CRmSystemI<barrier_op, 0b110, "isb",
[(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
+
+def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
+ let CRm = 0b0010;
+ let Inst{12} = 0;
+ let Predicates = [HasV8_4a];
+}
}
// ARMv8.2 Dot Product
let Predicates = [HasDotProd] in {
-def UDOT2S : BaseSIMDThreeSameVectorDot<0, 1, "udot", ".2s", ".8b">;
-def SDOT2S : BaseSIMDThreeSameVectorDot<0, 0, "sdot", ".2s", ".8b">;
-def UDOT4S : BaseSIMDThreeSameVectorDot<1, 1, "udot", ".4s", ".16b">;
-def SDOT4S : BaseSIMDThreeSameVectorDot<1, 0, "sdot", ".4s", ".16b">;
-def UDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 1, "udot", ".2s", ".8b", ".4b">;
-def SDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 0, "sdot", ".2s", ".8b", ".4b">;
-def UDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 1, "udot", ".4s", ".16b", ".4b">;
-def SDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 0, "sdot", ".4s", ".16b", ".4b">;
+defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
+defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
+defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
+defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
}
+// Armv8.2-A Crypto extensions
+let Predicates = [HasSHA3] in {
+def SHA512H : CryptoRRRTied<0b0, 0b00, "sha512h">;
+def SHA512H2 : CryptoRRRTied<0b0, 0b01, "sha512h2">;
+def SHA512SU0 : CryptoRRTied_2D<0b0, 0b00, "sha512su0">;
+def SHA512SU1 : CryptoRRRTied_2D<0b0, 0b10, "sha512su1">;
+def RAX1 : CryptoRRR_2D<0b0,0b11, "rax1">;
+def EOR3 : CryptoRRRR_16B<0b00, "eor3">;
+def BCAX : CryptoRRRR_16B<0b01, "bcax">;
+def XAR : CryptoRRRi6<"xar">;
+} // HasSHA3
+
+let Predicates = [HasSM4] in {
+def SM3TT1A : CryptoRRRi2Tied<0b0, 0b00, "sm3tt1a">;
+def SM3TT1B : CryptoRRRi2Tied<0b0, 0b01, "sm3tt1b">;
+def SM3TT2A : CryptoRRRi2Tied<0b0, 0b10, "sm3tt2a">;
+def SM3TT2B : CryptoRRRi2Tied<0b0, 0b11, "sm3tt2b">;
+def SM3SS1 : CryptoRRRR_4S<0b10, "sm3ss1">;
+def SM3PARTW1 : CryptoRRRTied_4S<0b1, 0b00, "sm3partw1">;
+def SM3PARTW2 : CryptoRRRTied_4S<0b1, 0b01, "sm3partw2">;
+def SM4ENCKEY : CryptoRRR_4S<0b1, 0b10, "sm4ekey">;
+def SM4E : CryptoRRTied_4S<0b0, 0b01, "sm4e">;
+} // HasSM4
+
let Predicates = [HasRCPC] in {
// v8.3 Release Consistent Processor Consistent support, optional in v8.2.
def LDAPRB : RCPCLoad<0b00, "ldaprb", GPR32>;
@@ -470,31 +516,34 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
null_frag>;
-let Predicates = [HasV8_3a] in {
- // v8.3a Pointer Authentication
- let Uses = [LR], Defs = [LR] in {
- def PACIAZ : SystemNoOperands<0b000, "paciaz">;
- def PACIBZ : SystemNoOperands<0b010, "pacibz">;
- def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
- def AUTIBZ : SystemNoOperands<0b110, "autibz">;
- }
- let Uses = [LR, SP], Defs = [LR] in {
- def PACIASP : SystemNoOperands<0b001, "paciasp">;
- def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
- def AUTIASP : SystemNoOperands<0b101, "autiasp">;
- def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
- }
- let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
- def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
- def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
- def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
- def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
- }
+// v8.3a Pointer Authentication
+// These instructions inhabit part of the hint space and so can be used for
+// armv8 targets
+let Uses = [LR], Defs = [LR] in {
+ def PACIAZ : SystemNoOperands<0b000, "paciaz">;
+ def PACIBZ : SystemNoOperands<0b010, "pacibz">;
+ def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
+ def AUTIBZ : SystemNoOperands<0b110, "autibz">;
+}
+let Uses = [LR, SP], Defs = [LR] in {
+ def PACIASP : SystemNoOperands<0b001, "paciasp">;
+ def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
+ def AUTIASP : SystemNoOperands<0b101, "autiasp">;
+ def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
+}
+let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
+ def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
+ def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
+ def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
+ def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
+}
- let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
- def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
- }
+let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
+ def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
+}
+// These pointer authentication instructions require armv8.3a
+let Predicates = [HasV8_3a] in {
multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
@@ -524,7 +573,7 @@ let Predicates = [HasV8_3a] in {
def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
- let isReturn = 1 in {
+ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def RETAA : AuthReturn<0b010, 0, "retaa">;
def RETAB : AuthReturn<0b010, 1, "retab">;
def ERETAA : AuthReturn<0b100, 0, "eretaa">;
@@ -541,7 +590,18 @@ let Predicates = [HasV8_3a] in {
let Inst{31} = 0;
}
-} // HasV8_3A
+} // HasV8_3a
+
+// v8.4 Flag manipulation instructions
+let Predicates = [HasV8_4a] in {
+def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> {
+ let Inst{20-5} = 0b0000001000000000;
+}
+def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">;
+def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">;
+def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif",
+ "{\t$Rn, $imm, $mask}">;
+} // HasV8_4a
def : InstAlias<"clrex", (CLREX 0xf)>;
def : InstAlias<"isb", (ISB 0xf)>;
@@ -560,6 +620,9 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
let Predicates = [HasPerfMon] in
def : Pat<(readcyclecounter), (MRS 0xdce8)>;
+// FPCR register
+def : Pat<(i64 (int_aarch64_get_fpcr)), (MRS 0xda20)>;
+
// Generic system instructions
def SYSxt : SystemXtI<0, "sys">;
def SYSLxt : SystemLXtI<1, "sysl">;
@@ -678,6 +741,9 @@ def trunc_imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
+def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
+ GISDNodeXFormEquiv<trunc_imm>;
+
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
@@ -1327,6 +1393,7 @@ def ADRP : ADRI<1, "adrp", adrplabel,
// page address of a constant pool entry, block address
def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+def : Pat<(AArch64adrp texternalsym:$sym), (ADRP texternalsym:$sym)>;
//===----------------------------------------------------------------------===//
// Unconditional branch (register) instructions.
@@ -1410,7 +1477,9 @@ def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
//===----------------------------------------------------------------------===//
// Exception generation instructions.
//===----------------------------------------------------------------------===//
+let isTrap = 1 in {
def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
+}
def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
@@ -1429,39 +1498,39 @@ def : InstAlias<"dcps3", (DCPS3 0)>;
//===----------------------------------------------------------------------===//
// Pair (indexed, offset)
-defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
-defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
-defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
-defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
-defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">;
+defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">;
+defm LDPS : LoadPairOffset<0b00, 1, FPR32Op, simm7s4, "ldp">;
+defm LDPD : LoadPairOffset<0b01, 1, FPR64Op, simm7s8, "ldp">;
+defm LDPQ : LoadPairOffset<0b10, 1, FPR128Op, simm7s16, "ldp">;
-defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (pre-indexed)
-def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
-def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
-def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
-def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
-def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
-def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (post-indexed)
-def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
-def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
-def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
-def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
-def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
-def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (no allocate)
-defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
-defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
-defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
-defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
-defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32z, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64z, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;
//---
// (register offset)
@@ -1474,11 +1543,11 @@ defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
// Floating-point
-defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>;
-defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>;
-defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>;
-defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>;
-defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01, 1, 0b01, FPR16Op, "ldr", f16, load>;
+defm LDRS : Load32RO<0b10, 1, 0b01, FPR32Op, "ldr", f32, load>;
+defm LDRD : Load64RO<0b11, 1, 0b01, FPR64Op, "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128Op, "ldr", f128, load>;
// Load sign-extended half-word
defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
@@ -1640,26 +1709,26 @@ let AddedComplexity = 10 in {
//---
// (unsigned immediate)
//---
-defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
- [(set GPR64:$Rt,
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64z, uimm12s8, "ldr",
+ [(set GPR64z:$Rt,
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
-defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
- [(set GPR32:$Rt,
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
+ [(set GPR32z:$Rt,
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
-defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
- [(set FPR8:$Rt,
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
+ [(set FPR8Op:$Rt,
(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
-defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
- [(set (f16 FPR16:$Rt),
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
+ [(set (f16 FPR16Op:$Rt),
(load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
-defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
- [(set (f32 FPR32:$Rt),
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32Op, uimm12s4, "ldr",
+ [(set (f32 FPR32Op:$Rt),
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
-defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
- [(set (f64 FPR64:$Rt),
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64Op, uimm12s8, "ldr",
+ [(set (f64 FPR64Op:$Rt),
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
-defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
- [(set (f128 FPR128:$Rt),
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
+ [(set (f128 FPR128Op:$Rt),
(load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
// For regular load, we do not have any alignment requirement.
@@ -1814,14 +1883,14 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
//---
// (literal)
-def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
-def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
-def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
-def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
-def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+def LDRWl : LoadLiteral<0b00, 0, GPR32z, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64z, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32Op, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64Op, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128Op, "ldr">;
// load sign-extended word
-def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+def LDRSWl : LoadLiteral<0b10, 0, GPR64z, "ldrsw">;
// prefetch
def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
@@ -1829,26 +1898,26 @@ def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
//---
// (unscaled immediate)
-defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
- [(set GPR64:$Rt,
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64z, "ldur",
+ [(set GPR64z:$Rt,
(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
- [(set GPR32:$Rt,
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
+ [(set GPR32z:$Rt,
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
- [(set FPR8:$Rt,
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
+ [(set FPR8Op:$Rt,
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
- [(set FPR16:$Rt,
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
+ [(set FPR16Op:$Rt,
(load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
- [(set (f32 FPR32:$Rt),
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
+ [(set (f32 FPR32Op:$Rt),
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
- [(set (f64 FPR64:$Rt),
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64Op, "ldur",
+ [(set (f64 FPR64Op:$Rt),
(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
-defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
- [(set (f128 FPR128:$Rt),
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128Op, "ldur",
+ [(set (f128 FPR128Op:$Rt),
(load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURHH
@@ -1968,15 +2037,15 @@ def : InstAlias<"ldr $Rt, [$Rn, $offset]",
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
(LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+ (LDURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+ (LDURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+ (LDURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+ (LDURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
- (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+ (LDURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
// zextload -> i64
def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
@@ -2052,53 +2121,53 @@ defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
//---
// (immediate pre-indexed)
-def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
-def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
-def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">;
-def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
-def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
-def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
-def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32z, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64z, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
// load sign-extended half-word
-def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
-def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;
// load sign-extended byte
-def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
-def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;
// load zero-extended byte
-def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
-def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32z, "ldrh">;
// load sign-extended word
-def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
//---
// (immediate post-indexed)
-def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
-def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
-def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">;
-def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
-def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
-def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
-def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32z, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64z, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
// load sign-extended half-word
-def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
-def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;
// load sign-extended byte
-def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
-def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;
// load zero-extended byte
-def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
-def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32z, "ldrh">;
// load sign-extended word
-def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
//===----------------------------------------------------------------------===//
// Store instructions.
@@ -2106,32 +2175,32 @@ def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
// Pair (indexed, offset)
// FIXME: Use dedicated range-checked addressing mode operand here.
-defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
-defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
-defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
-defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
-defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
+defm STPW : StorePairOffset<0b00, 0, GPR32z, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64z, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32Op, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64Op, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (pre-indexed)
-def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
-def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
-def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
-def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
-def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32z, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64z, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (pre-indexed)
-def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
-def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
-def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
-def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
-def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32z, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64z, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (no allocate)
-defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
-defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
-defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
-defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
-defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32z, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64z, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
//---
// (Register offset)
@@ -2144,11 +2213,11 @@ defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;
// Floating-point
-defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>;
-defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>;
-defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>;
-defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
-defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
+defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>;
+defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
+defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
+defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str", f128, store>;
let Predicates = [UseSTRQro], AddedComplexity = 10 in {
def : Pat<(store (f128 FPR128:$Rt),
@@ -2239,12 +2308,11 @@ multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
let AddedComplexity = 19 in {
defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
- defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
- defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
- defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro16, store, v8f16, f16, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro32, store, v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store, v4f32, f32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro64, store, v2i64, i64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro64, store, v2f64, f64, dsub, STRDroW, STRDroX>;
}
//---
@@ -2255,19 +2323,19 @@ defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str",
defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
[(store GPR32z:$Rt,
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
-defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
- [(store FPR8:$Rt,
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
+ [(store FPR8Op:$Rt,
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
-defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
- [(store (f16 FPR16:$Rt),
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
+ [(store (f16 FPR16Op:$Rt),
(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
-defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
- [(store (f32 FPR32:$Rt),
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32Op, uimm12s4, "str",
+ [(store (f32 FPR32Op:$Rt),
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
-defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
- [(store (f64 FPR64:$Rt),
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64Op, uimm12s8, "str",
+ [(store (f64 FPR64Op:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
-defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128Op, uimm12s16, "str", []>;
defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh",
[(truncstorei16 GPR32z:$Rt,
@@ -2278,8 +2346,16 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb",
(am_indexed8 GPR64sp:$Rn,
uimm12s1:$offset))]>;
-// Match all store 64 bits width whose type is compatible with FPR64
let AddedComplexity = 10 in {
+
+// Match all store 64 bits width whose type is compatible with FPR64
+def : Pat<(store (v1i64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1f64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2298,14 +2374,12 @@ let Predicates = [IsLE] in {
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
-def : Pat<(store (v1f64 FPR64:$Rt),
- (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
- (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt),
- (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
- (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
// Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v4f32 FPR128:$Rt),
@@ -2330,9 +2404,6 @@ let Predicates = [IsLE] in {
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
-def : Pat<(store (f128 FPR128:$Rt),
- (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
- (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
// truncstore i64
def : Pat<(truncstorei32 GPR64:$Rt,
@@ -2346,37 +2417,81 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
} // AddedComplexity = 10
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreLane0Pat<Operand UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR> {
+ def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, IndexType:$offset)>;
+}
+
+let AddedComplexity = 19 in {
+ defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, hsub, uimm12s2, STRHui>;
+ defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, hsub, uimm12s2, STRHui>;
+ defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, ssub, uimm12s4, STRSui>;
+ defm : VecStoreLane0Pat<am_indexed32, store, v4f32, f32, ssub, uimm12s4, STRSui>;
+ defm : VecStoreLane0Pat<am_indexed64, store, v2i64, i64, dsub, uimm12s8, STRDui>;
+ defm : VecStoreLane0Pat<am_indexed64, store, v2f64, f64, dsub, uimm12s8, STRDui>;
+}
+
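A minimal standalone sketch (not part of the patch) of the store shape the VecStoreLane0Pat defs above target: storing only lane 0 of a 128-bit vector. With these patterns the backend can use a plain subregister store such as STRSui instead of first extracting the lane into a scalar register. Function names are illustrative.

    #include <arm_neon.h>
    // Lane-0 extract feeding a plain store; matches the f32/ssub pattern.
    void store_lane0_f32(float32x4_t v, float *p) {
      *p = vgetq_lane_f32(v, 0);
    }
    // 16-bit case; corresponds to the truncstorei16/hsub variant.
    void store_lane0_u16(uint16x8_t v, uint16_t *p) {
      *p = vgetq_lane_u16(v, 0);
    }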
//---
// (unscaled immediate)
-defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
- [(store GPR64:$Rt,
+defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64z, "stur",
+ [(store GPR64z:$Rt,
(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
- [(store GPR32:$Rt,
+defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
+ [(store GPR32z:$Rt,
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
- [(store FPR8:$Rt,
+defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
+ [(store FPR8Op:$Rt,
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
- [(store (f16 FPR16:$Rt),
+defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
+ [(store (f16 FPR16Op:$Rt),
(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
- [(store (f32 FPR32:$Rt),
+defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32Op, "stur",
+ [(store (f32 FPR32Op:$Rt),
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
- [(store (f64 FPR64:$Rt),
+defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64Op, "stur",
+ [(store (f64 FPR64Op:$Rt),
(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
- [(store (f128 FPR128:$Rt),
+defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128Op, "stur",
+ [(store (f128 FPR128Op:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
- [(truncstorei16 GPR32:$Rt,
+defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32z, "sturh",
+ [(truncstorei16 GPR32z:$Rt,
(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
-defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
- [(truncstorei8 GPR32:$Rt,
+defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32z, "sturb",
+ [(truncstorei8 GPR32z:$Rt,
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+// Armv8.4 LDAPR & STLR with Immediate Offset instructions
+let Predicates = [HasV8_4a] in {
+defm STLURB : BaseStoreUnscaleV84<"stlurb", 0b00, 0b00, GPR32>;
+defm STLURH : BaseStoreUnscaleV84<"stlurh", 0b01, 0b00, GPR32>;
+defm STLURW : BaseStoreUnscaleV84<"stlur", 0b10, 0b00, GPR32>;
+defm STLURX : BaseStoreUnscaleV84<"stlur", 0b11, 0b00, GPR64>;
+defm LDAPURB : BaseLoadUnscaleV84<"ldapurb", 0b00, 0b01, GPR32>;
+defm LDAPURSBW : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b11, GPR32>;
+defm LDAPURSBX : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b10, GPR64>;
+defm LDAPURH : BaseLoadUnscaleV84<"ldapurh", 0b01, 0b01, GPR32>;
+defm LDAPURSHW : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b11, GPR32>;
+defm LDAPURSHX : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b10, GPR64>;
+defm LDAPUR : BaseLoadUnscaleV84<"ldapur", 0b10, 0b01, GPR32>;
+defm LDAPURSW : BaseLoadUnscaleV84<"ldapursw", 0b10, 0b10, GPR64>;
+defm LDAPURX : BaseLoadUnscaleV84<"ldapur", 0b11, 0b01, GPR64>;
+}
+
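A minimal sketch of source that can exercise the new Armv8.4 load-acquire forms defined above, assuming a compiler targeting armv8.4-a; whether the LDAPUR encoding is actually chosen depends on the compiler and subtarget.

    #include <atomic>
    int load_acquire(const std::atomic<int> *p) {
      // Acquire load at a small base+offset address; on an Armv8.4 target
      // this may be selected to the unscaled-immediate LDAPUR form.
      return p[1].load(std::memory_order_acquire);
    }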
// Match all store 64 bits width whose type is compatible with FPR64
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+let AddedComplexity = 10 in {
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2395,12 +2510,11 @@ let Predicates = [IsLE] in {
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
-def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
// Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v4f32 FPR128:$Rt),
@@ -2429,6 +2543,8 @@ let Predicates = [IsLE] in {
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
+} // AddedComplexity = 10
+
// unscaled i64 truncating stores
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -2437,6 +2553,22 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
+ ValueType VTy, ValueType STy,
+ SubRegIndex SubRegIdx, Instruction STR> {
+ defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
+}
+
+let AddedComplexity = 19 in {
+ defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
+ defm : VecStoreULane0Pat<store, v8f16, f16, hsub, STURHi>;
+ defm : VecStoreULane0Pat<store, v4i32, i32, ssub, STURSi>;
+ defm : VecStoreULane0Pat<store, v4f32, f32, ssub, STURSi>;
+ defm : VecStoreULane0Pat<store, v2i64, i64, dsub, STURDi>;
+ defm : VecStoreULane0Pat<store, v2f64, f64, dsub, STURDi>;
+}
+
//---
// STR mnemonics fall back to STUR for negative or unaligned offsets.
def : InstAlias<"str $Rt, [$Rn, $offset]",
@@ -2444,15 +2576,15 @@ def : InstAlias<"str $Rt, [$Rn, $offset]",
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+ (STURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+ (STURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+ (STURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+ (STURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
- (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+ (STURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
def : InstAlias<"strb $Rt, [$Rn, $offset]",
(STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
@@ -2469,16 +2601,16 @@ defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
//---
// (immediate pre-indexed)
-def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>;
-def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>;
-def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>;
-def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>;
-def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>;
-def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>;
-def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32z, "str", pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64z, "str", pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16Op, "str", pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32Op, "str", pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64Op, "str", pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128Op, "str", pre_store, f128>;
-def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>;
-def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32z, "strb", pre_truncsti8, i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32z, "strh", pre_truncsti16, i32>;
// truncstore i64
def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
@@ -2523,16 +2655,16 @@ def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
//---
// (immediate post-indexed)
-def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>;
-def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>;
-def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>;
-def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>;
-def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>;
-def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>;
-def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32z, "str", post_store, i32>;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64z, "str", post_store, i64>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, untyped>;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16Op, "str", post_store, f16>;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32Op, "str", post_store, f32>;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64Op, "str", post_store, f64>;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128Op, "str", post_store, f128>;
-def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
-def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32z, "strb", post_truncsti8, i32>;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32z, "strh", post_truncsti16, i32>;
// truncstore i64
def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
@@ -3073,6 +3205,14 @@ defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
+let Predicates = [HasNEON] in {
+foreach VT = [ v2f32, v4f32, v2f64 ] in
+def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
+}
+let Predicates = [HasNEON, HasFullFP16] in {
+foreach VT = [ v4f16, v8f16 ] in
+def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
+}
defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
@@ -3382,6 +3522,11 @@ defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FABD64 FPR64:$Rn, FPR64:$Rm)>;
+let Predicates = [HasFullFP16] in {
+def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 f16:$Rn, f16:$Rm)>;
+}
+def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>;
+def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>;
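With these scalar patterns a plain |a - b| can now select to a single fabd rather than an fsub followed by fabs. A minimal sketch, function names illustrative:

    #include <cmath>
    float  fabd32(float a, float b)   { return std::fabs(a - b); }  // fabd s-form
    double fabd64(double a, double b) { return std::fabs(a - b); }  // fabd d-form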
defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
int_aarch64_neon_facge>;
defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
@@ -3526,6 +3671,8 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
(FCVTPUv1i64 FPR64:$Rn)>;
+def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))),
+ (FRECPEv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
(FRECPEv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
@@ -3557,11 +3704,15 @@ def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))),
+ (FRECPXv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
(FRECPXv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
(FRECPXv1i64 FPR64:$Rn)>;
+def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))),
+ (FRSQRTEv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
(FRSQRTEv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
@@ -3744,6 +3895,25 @@ defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+// Patterns for smull2/umull2.
+multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm))),
+ (INST8B V128:$Rn, V128:$Rm)>;
+ def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm))),
+ (INST4H V128:$Rn, V128:$Rm)>;
+ def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm))),
+ (INST2S V128:$Rn, V128:$Rm)>;
+}
+
+defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
+ SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
+defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
+ UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
+
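The Neon_mul_high_patterns above match a widening multiply of the high halves of two 128-bit vectors, selecting it to a single smull2/umull2. A minimal intrinsic-level sketch of code they apply to:

    #include <arm_neon.h>
    // Widening multiply of the high 4 lanes; should select SMULL2.
    int32x4_t widen_mul_high_s16(int16x8_t a, int16x8_t b) {
      return vmull_high_s16(a, b);
    }
    // Unsigned counterpart; should select UMULL2.
    uint64x2_t widen_mul_high_u32(uint32x4_t a, uint32x4_t b) {
      return vmull_high_u32(a, b);
    }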
// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
@@ -4103,12 +4273,18 @@ def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
+
def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
@@ -4122,6 +4298,7 @@ def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+
def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
@@ -4592,10 +4769,8 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
-def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
-def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
-
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4617,6 +4792,7 @@ def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
(MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// EDIT per word: 2s & 4s with MSL shifter
def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
[(set (v2i32 V64:$Rd),
@@ -4629,13 +4805,16 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
"movi", ".8b",
[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
"movi", ".16b",
[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+}
// AdvSIMD MVNI
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4658,12 +4837,14 @@ def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
(MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
// EDIT per word: 2s & 4s with MSL shifter
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
[(set (v2i32 V64:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
[(set (v4i32 V128:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+}
//----------------------------------------------------------------------------
// AdvSIMD indexed element
@@ -4850,20 +5031,55 @@ def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
vecshiftR64:$imm)),
(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
- (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
(UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
-def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
- (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
vecshiftR64:$imm)),
(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+ (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
vecshiftR64:$imm)),
(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+ (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+
+// Patterns for FP16 intrinsics - these require a reg copy to/from, as i16s are not supported.
+
+def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
+ (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
+ (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
+ (and FPR32:$Rn, (i32 65535)),
+ vecshiftR16:$imm)),
+ (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)),
+ (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
+ (UCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
+ (i32 (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)),
+ (FCVTZSh FPR16:$Rn, vecshiftR32:$imm),
+ hsub))>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR64:$imm)),
+ (i64 (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)),
+ (FCVTZSh FPR16:$Rn, vecshiftR64:$imm),
+ hsub))>;
+def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR32:$imm)),
+ (i32 (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)),
+ (FCVTZUh FPR16:$Rn, vecshiftR32:$imm),
+ hsub))>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)),
+ (i64 (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)),
+ (FCVTZUh FPR16:$Rn, vecshiftR64:$imm),
+ hsub))>;
defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>;
defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
@@ -5425,10 +5641,12 @@ defm ST4 : SIMDLdSt4SingleAliases<"st4">;
// Crypto extensions
//----------------------------------------------------------------------------
+let Predicates = [HasAES] in {
def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>;
def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+}
// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
// for AES fusion on some CPUs.
@@ -5455,6 +5673,7 @@ def : Pat<(v16i8 (int_aarch64_crypto_aesimc
(v16i8 V128:$src2)))))>,
Requires<[HasFuseAES]>;
+let Predicates = [HasSHA2] in {
def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
@@ -5466,6 +5685,7 @@ def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1
def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>;
def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>;
def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
+}
//----------------------------------------------------------------------------
// Compiler-pseudos
@@ -5614,6 +5834,7 @@ def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -5785,7 +6006,7 @@ def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
- (v2i32 (REV64v4i16 FPR64:$src))>;
+ (v2i32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
@@ -5794,7 +6015,6 @@ def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
}
@@ -5807,18 +6027,16 @@ def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
(v4i16 (REV16v8i8 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
-def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))),
- (v4i16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
(v4i16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
}
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
@@ -5828,20 +6046,17 @@ let Predicates = [IsBE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
- (v4f16 (REV64v4i16 FPR64:$src))>;
-def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
- (v4f16 (REV64v4i16 FPR64:$src))>;
+ (v4f16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))),
(v4f16 (REV16v8i8 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
- (v4f16 (REV64v4i16 FPR64:$src))>;
+ (v4f16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
}
-
-
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -5933,7 +6148,7 @@ def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
- (v2f32 (REV64v4i16 FPR64:$src))>;
+ (v2f32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
@@ -6076,7 +6291,6 @@ def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
@@ -6093,15 +6307,13 @@ def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
(v8i16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
-def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))),
- (v8i16 (REV32v8i16 FPR128:$src))>;
}
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
@@ -6115,8 +6327,6 @@ def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
(v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
-def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
- (v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
(v8f16 (REV16v16i8 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
@@ -6124,6 +6334,7 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
}
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -6179,20 +6390,25 @@ def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
// A 64-bit subvector insert to the first 128-bit vector position
// is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+multiclass InsertSubvectorUndef<ValueType Ty> {
+ def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+}
+
+defm : InsertSubvectorUndef<i32>;
+defm : InsertSubvectorUndef<i64>;
// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
// or v2f32.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 7d2ec1be2888..4d7ca2349ed1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -92,6 +92,8 @@ private:
return selectAddrModeIndexed(Root, Width / 8);
}
+ void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
+
const AArch64TargetMachine &TM;
const AArch64Subtarget &STI;
const AArch64InstrInfo &TII;
@@ -172,7 +174,7 @@ static bool unsupportedBinOp(const MachineInstr &I,
const AArch64RegisterInfo &TRI) {
LLT Ty = MRI.getType(I.getOperand(0).getReg());
if (!Ty.isValid()) {
- DEBUG(dbgs() << "Generic binop register should be typed\n");
+ LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
return true;
}
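The DEBUG(...) to LLVM_DEBUG(...) changes throughout this file are a mechanical rename of the debug-print macro; behaviour is unchanged. A minimal usage sketch, assuming the LLVM support headers are on the include path (function name illustrative):

    #define DEBUG_TYPE "aarch64-isel"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    static void traceSize(unsigned Size) {
      // Prints only in asserts-enabled builds when -debug (or
      // -debug-only=aarch64-isel) is passed; same behaviour as DEBUG(...).
      LLVM_DEBUG(llvm::dbgs() << "selected size " << Size << '\n');
    }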
@@ -180,7 +182,7 @@ static bool unsupportedBinOp(const MachineInstr &I,
for (auto &MO : I.operands()) {
// FIXME: Support non-register operands.
if (!MO.isReg()) {
- DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
+ LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
return true;
}
@@ -189,18 +191,18 @@ static bool unsupportedBinOp(const MachineInstr &I,
// bank out of the minimal class for the register.
// Either way, this needs to be documented (and possibly verified).
if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- DEBUG(dbgs() << "Generic inst has physical register operand\n");
+ LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
return true;
}
const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
if (!OpBank) {
- DEBUG(dbgs() << "Generic register has no bank or class\n");
+ LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
return true;
}
if (PrevOpBank && OpBank != PrevOpBank) {
- DEBUG(dbgs() << "Generic inst operands have different banks\n");
+ LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
return true;
}
PrevOpBank = OpBank;
@@ -376,7 +378,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
const TargetRegisterClass *RC = getRegClassForTypeOnBank(
MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true);
if (!RC) {
- DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
+ LLVM_DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
return false;
}
@@ -410,8 +412,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
// we hit another of its use or its defs.
// Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
I.setDesc(TII.get(AArch64::COPY));
@@ -612,11 +614,11 @@ bool AArch64InstructionSelector::selectCompareBranch(
else
return false;
- auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
- .addUse(LHS)
- .addMBB(DestMBB);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
+ .addUse(LHS)
+ .addMBB(DestMBB)
+ .constrainAllUses(TII, TRI, RBI);
- constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
I.eraseFromParent();
return true;
}
@@ -684,13 +686,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
if (!DefRC) {
if (!DefTy.isValid()) {
- DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
+ LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
return false;
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
if (!DefRC) {
- DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
+ LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
}
}
@@ -708,7 +710,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (I.getNumOperands() != I.getNumExplicitOperands()) {
- DEBUG(dbgs() << "Generic instruction has unexpected implicit operands\n");
+ LLVM_DEBUG(
+ dbgs() << "Generic instruction has unexpected implicit operands\n");
return false;
}
@@ -724,8 +727,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
// We shouldn't need this on AArch64, but it would be implemented as an
// EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the
// bit being tested is < 32.
- DEBUG(dbgs() << "G_BRCOND has type: " << Ty
- << ", expected at most 32-bits");
+ LLVM_DEBUG(dbgs() << "G_BRCOND has type: " << Ty
+ << ", expected at most 32-bits");
return false;
}
@@ -765,15 +768,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
// FIXME: Redundant check, but even less readable when factored out.
if (isFP) {
if (Ty != s32 && Ty != s64) {
- DEBUG(dbgs() << "Unable to materialize FP " << Ty
- << " constant, expected: " << s32 << " or " << s64
- << '\n');
+ LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
+ << " constant, expected: " << s32 << " or " << s64
+ << '\n');
return false;
}
if (RB.getID() != AArch64::FPRRegBankID) {
- DEBUG(dbgs() << "Unable to materialize FP " << Ty
- << " constant on bank: " << RB << ", expected: FPR\n");
+ LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
+ << " constant on bank: " << RB
+ << ", expected: FPR\n");
return false;
}
@@ -784,15 +788,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
} else {
// s32 and s64 are covered by tablegen.
if (Ty != p0) {
- DEBUG(dbgs() << "Unable to materialize integer " << Ty
- << " constant, expected: " << s32 << ", " << s64 << ", or "
- << p0 << '\n');
+ LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
+ << " constant, expected: " << s32 << ", " << s64
+ << ", or " << p0 << '\n');
return false;
}
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "Unable to materialize integer " << Ty
- << " constant on bank: " << RB << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
+ << " constant on bank: " << RB
+ << ", expected: GPR\n");
return false;
}
}
@@ -818,7 +823,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
.addUse(DefGPRReg);
if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
return false;
}
@@ -873,7 +878,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
unsigned DstSize = DstTy.getSizeInBits();
- (void)DstSize;
// Larger inserts are vectors, same-size ones should be something else by
// now (split up or turned into COPYs).
if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
@@ -907,8 +911,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_FRAME_INDEX: {
// allocas and G_FRAME_INDEX are only supported in addrspace(0).
if (Ty != LLT::pointer(0, 64)) {
- DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
- << ", expected: " << LLT::pointer(0, 64) << '\n');
+ LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
+ << ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
I.setDesc(TII.get(AArch64::ADDXri));
@@ -976,20 +980,20 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE: {
- LLT MemTy = Ty;
LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
if (PtrTy != LLT::pointer(0, 64)) {
- DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
- << ", expected: " << LLT::pointer(0, 64) << '\n');
+ LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
+ << ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
+ unsigned MemSizeInBits = MemOp.getSize() * 8;
// FIXME: PR36018: Volatile loads in some cases are incorrectly selected by
// folding with an extend. Until we have a G_SEXTLOAD solution bail out if
@@ -1011,7 +1015,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
const unsigned NewOpc =
- selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemTy.getSizeInBits());
+ selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
if (NewOpc == I.getOpcode())
return false;
@@ -1024,7 +1028,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
- const unsigned Size = MemTy.getSizeInBits() / 8;
+ const unsigned Size = MemSizeInBits / 8;
const unsigned Scale = Log2_32(Size);
if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
@@ -1065,13 +1069,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
return false;
}
if (Ty != LLT::scalar(64)) {
- DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
- << ", expected: " << LLT::scalar(64) << '\n');
+ LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
+ << ", expected: " << LLT::scalar(64) << '\n');
return false;
}
@@ -1137,7 +1141,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
- DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
+ LLVM_DEBUG(
+ dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
return false;
}
@@ -1154,7 +1159,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
return false;
}
@@ -1168,7 +1173,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
SrcRC == &AArch64::GPR64RegClass) {
I.getOperand(1).setSubReg(AArch64::sub_32);
} else {
- DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
+ LLVM_DEBUG(
+ dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
return false;
}
@@ -1191,26 +1197,28 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
if (RBDst.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
+ << ", expected: GPR\n");
return false;
}
const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
if (RBSrc.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
+ << ", expected: GPR\n");
return false;
}
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
if (DstSize == 0) {
- DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
return false;
}
if (DstSize != 64 && DstSize > 32) {
- DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
- << ", expected: 32 or 64\n");
+ LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
+ << ", expected: 32 or 64\n");
return false;
}
// At this point G_ANYEXT is just like a plain COPY, but we need
@@ -1238,8 +1246,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
- << ", expected: GPR\n");
+ LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
+ << ", expected: GPR\n");
return false;
}
@@ -1247,8 +1255,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (DstTy == LLT::scalar(64)) {
// FIXME: Can we avoid manually doing this?
if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
+ << " operand\n");
return false;
}
@@ -1316,8 +1324,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_SELECT: {
if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
- DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
- << ", expected: " << LLT::scalar(1) << '\n');
+ LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
+ << ", expected: " << LLT::scalar(1) << '\n');
return false;
}
@@ -1355,8 +1363,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
case TargetOpcode::G_ICMP: {
if (Ty != LLT::scalar(32)) {
- DEBUG(dbgs() << "G_ICMP result has type: " << Ty
- << ", expected: " << LLT::scalar(32) << '\n');
+ LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
+ << ", expected: " << LLT::scalar(32) << '\n');
return false;
}
@@ -1402,8 +1410,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_FCMP: {
if (Ty != LLT::scalar(32)) {
- DEBUG(dbgs() << "G_FCMP result has type: " << Ty
- << ", expected: " << LLT::scalar(32) << '\n');
+ LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty
+ << ", expected: " << LLT::scalar(32) << '\n');
return false;
}
@@ -1465,8 +1473,23 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_VASTART:
return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
: selectVaStartAAPCS(I, MF, MRI);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ if (!I.getOperand(0).isIntrinsicID())
+ return false;
+ if (I.getOperand(0).getIntrinsicID() != Intrinsic::trap)
+ return false;
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::BRK))
+ .addImm(1);
+ I.eraseFromParent();
+ return true;
case TargetOpcode::G_IMPLICIT_DEF:
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+ RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
return true;
}
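The new G_INTRINSIC_W_SIDE_EFFECTS case in the hunk above selects llvm.trap to a BRK with immediate 1, and G_IMPLICIT_DEF now constrains its destination to a concrete register class. A minimal sketch of source that exercises the trap path (function name illustrative):

    [[noreturn]] static void fail_fast() {
      // Lowers to llvm.trap; with the case above GlobalISel selects brk #0x1
      // directly instead of falling back to SelectionDAG.
      __builtin_trap();
    }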
@@ -1624,6 +1647,15 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
}};
}
+void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
+ const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
+ assert(CstVal && "Expected constant value");
+ MIB.addImm(CstVal.getValue());
+}
+
namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 05df51202229..9b8c0a34efba 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -23,110 +23,8 @@
#include "llvm/IR/Type.h"
using namespace llvm;
-
-/// FIXME: The following static functions are SizeChangeStrategy functions
-/// that are meant to temporarily mimic the behaviour of the old legalization
-/// based on doubling/halving non-legal types as closely as possible. This is
-/// not entirly possible as only legalizing the types that are exactly a power
-/// of 2 times the size of the legal types would require specifying all those
-/// sizes explicitly.
-/// In practice, not specifying those isn't a problem, and the below functions
-/// should disappear quickly as we add support for legalizing non-power-of-2
-/// sized types further.
-static void
-addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
- const LegalizerInfo::SizeAndActionsVec &v) {
- for (unsigned i = 0; i < v.size(); ++i) {
- result.push_back(v[i]);
- if (i + 1 < v[i].first && i + 1 < v.size() &&
- v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
- }
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_narrow_128_ToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 2);
- LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
- {2, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- assert(Largest + 1 < 128);
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- result.push_back({128, LegalizerInfo::NarrowScalar});
- result.push_back({129, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_16(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar},
- {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 9);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16_narrowToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::NarrowScalar});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16_32(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 33);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported},
- {32, LegalizerInfo::WidenScalar}, {33, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
+using namespace LegalizeActions;
+using namespace LegalityPredicates;
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
using namespace TargetOpcode;
@@ -137,255 +35,356 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
+ const LLT s256 = LLT::scalar(256);
+ const LLT s512 = LLT::scalar(512);
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s8 = LLT::vector(8, 8);
+ const LLT v4s8 = LLT::vector(4, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s16 = LLT::vector(4, 16);
+ const LLT v2s16 = LLT::vector(2, 16);
const LLT v2s32 = LLT::vector(2, 32);
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
- for (auto Ty : {p0, s1, s8, s16, s32, s64})
- setAction({G_IMPLICIT_DEF, Ty}, Legal);
-
- for (auto Ty : {s16, s32, s64, p0})
- setAction({G_PHI, Ty}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1_8);
-
- for (auto Ty : { s32, s64 })
- setAction({G_BSWAP, Ty}, Legal);
-
- for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
- // These operations naturally get the right answer when used on
- // GPR32, even if the actual type is narrower.
- for (auto Ty : {s32, s64, v2s32, v4s32, v2s64})
- setAction({BinOp, Ty}, Legal);
-
- if (BinOp != G_ADD)
- setLegalizeScalarToDifferentSizeStrategy(BinOp, 0,
- widen_1_8_16_narrowToLargest);
- }
-
- setAction({G_GEP, p0}, Legal);
- setAction({G_GEP, 1, s64}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_GEP, 1, widen_1_8_16_32);
-
- setAction({G_PTR_MASK, p0}, Legal);
-
- for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1_8_16);
- }
-
- for (unsigned BinOp : {G_SREM, G_UREM})
- for (auto Ty : { s1, s8, s16, s32, s64 })
- setAction({BinOp, Ty}, Lower);
-
- for (unsigned Op : {G_SMULO, G_UMULO}) {
- setAction({Op, 0, s64}, Lower);
- setAction({Op, 1, s1}, Legal);
- }
-
- for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
- for (auto Ty : { s32, s64 })
- setAction({Op, Ty}, Legal);
-
- setAction({Op, 1, s1}, Legal);
- }
-
- for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV})
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Legal);
-
- for (unsigned BinOp : {G_FREM, G_FPOW}) {
- setAction({BinOp, s32}, Libcall);
- setAction({BinOp, s64}, Libcall);
- }
-
- for (auto Ty : {s32, s64, p0}) {
- setAction({G_INSERT, Ty}, Legal);
- setAction({G_INSERT, 1, Ty}, Legal);
- }
- setLegalizeScalarToDifferentSizeStrategy(G_INSERT, 0,
- widen_1_8_16_narrowToLargest);
- for (auto Ty : {s1, s8, s16}) {
- setAction({G_INSERT, 1, Ty}, Legal);
- // FIXME: Can't widen the sources because that violates the constraints on
- // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
- }
-
- for (auto Ty : {s1, s8, s16, s32, s64, p0})
- setAction({G_EXTRACT, Ty}, Legal);
-
- for (auto Ty : {s32, s64})
- setAction({G_EXTRACT, 1, Ty}, Legal);
-
- for (unsigned MemOp : {G_LOAD, G_STORE}) {
- for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
- setAction({MemOp, Ty}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
- widen_1_narrow_128_ToLargest);
-
- // And everything's fine in addrspace 0.
- setAction({MemOp, 1, p0}, Legal);
- }
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ .legalFor({p0, s1, s8, s16, s32, s64})
+ .clampScalar(0, s1, s64)
+ .widenScalarToNextPow2(0, 8);
+
+ getActionDefinitionsBuilder(G_PHI)
+ .legalFor({p0, s16, s32, s64})
+ .clampScalar(0, s16, s64)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({s32, s64})
+ .clampScalar(0, s16, s64)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL})
+ .legalFor({s32, s64, v2s32, v4s32, v2s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64)
+ .moreElementsToNextPow2(0);
+
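A standalone sketch (plain C++, not LLVM code) of what the clampScalar(0, s32, s64) plus widenScalarToNextPow2(0) chain used in the rules above roughly does to a requested scalar bit width: clamp it into [32, 64], then round up to the next power of two.

    #include <cstdint>
    #include <iostream>

    static uint32_t nextPow2(uint32_t n) {
      uint32_t p = 1;
      while (p < n) p <<= 1;
      return p;
    }

    static uint32_t legalizeScalarWidth(uint32_t bits) {
      if (bits < 32) bits = 32;   // clampScalar lower bound
      if (bits > 64) bits = 64;   // clampScalar upper bound
      return nextPow2(bits);      // widenScalarToNextPow2
    }

    int main() {
      for (uint32_t b : {1u, 8u, 24u, 33u, 64u})
        std::cout << b << " bits -> " << legalizeScalarWidth(b) << " bits\n";
      return 0;
    }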
+ getActionDefinitionsBuilder(G_GEP)
+ .legalFor({{p0, s64}})
+ .clampScalar(1, s64, s64);
+
+ getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0});
+
+ getActionDefinitionsBuilder({G_LSHR, G_ASHR, G_SDIV, G_UDIV})
+ .legalFor({s32, s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder({G_SREM, G_UREM})
+ .lowerFor({s1, s8, s16, s32, s64});
+
+ getActionDefinitionsBuilder({G_SMULO, G_UMULO})
+ .lowerFor({{s64, s1}});
+
+ getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
+
+ getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO})
+ .legalFor({{s32, s1}, {s64, s1}});
+
+ getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV})
+ .legalFor({s32, s64});
+
+ getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
+
+ getActionDefinitionsBuilder(G_INSERT)
+ .unsupportedIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits();
+ })
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+ const LLT &Ty1 = Query.Types[1];
+ if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0)
+ return false;
+ return isPowerOf2_32(Ty1.getSizeInBits()) &&
+ (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8);
+ })
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ .maxScalarIf(typeInSet(0, {s32}), 1, s16)
+ .maxScalarIf(typeInSet(0, {s64}), 1, s32)
+ .widenScalarToNextPow2(1);
+
+ getActionDefinitionsBuilder(G_EXTRACT)
+ .unsupportedIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits();
+ })
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+ const LLT &Ty1 = Query.Types[1];
+ if (Ty1 != s32 && Ty1 != s64)
+ return false;
+ if (Ty1 == p0)
+ return true;
+ return isPowerOf2_32(Ty0.getSizeInBits()) &&
+ (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
+ })
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1)
+ .maxScalarIf(typeInSet(1, {s32}), 0, s16)
+ .maxScalarIf(typeInSet(1, {s64}), 0, s32)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
+ .legalForTypesWithMemSize({{s32, p0, 8},
+ {s32, p0, 16},
+ {s32, p0, 32},
+ {s64, p0, 64},
+ {p0, p0, 64},
+ {v2s32, p0, 64}})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ // TODO: We could support sum-of-pow2's but the lowering code doesn't know
+ // how to do that yet.
+ .unsupportedIfMemSizeNotPow2()
+ // Lower anything left over into G_*EXT and G_LOAD
+ .lower();
+
+ getActionDefinitionsBuilder(G_LOAD)
+ .legalForTypesWithMemSize({{s8, p0, 8},
+ {s16, p0, 16},
+ {s32, p0, 32},
+ {s64, p0, 64},
+ {p0, p0, 64},
+ {v2s32, p0, 64}})
+ // These extends are also legal
+ .legalForTypesWithMemSize({{s32, p0, 8},
+ {s32, p0, 16}})
+ .clampScalar(0, s8, s64)
+ .widenScalarToNextPow2(0)
+ // TODO: We could support sum-of-pow2's but the lowering code doesn't know
+ // how to do that yet.
+ .unsupportedIfMemSizeNotPow2()
+ // Lower any any-extending loads left into G_ANYEXT and G_LOAD
+ .lowerIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+ })
+ .clampNumElements(0, v2s32, v2s32);
+
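Rules in a chain are tried in order, so the extending loads declared legal above (s32 from 8- or 16-bit memory) never reach the lowerIf() step; anything else whose register width disagrees with the memory size is split into a plain G_LOAD plus G_ANYEXT. The tested condition, as a standalone sketch:

    // Sketch only: e.g. a 64-bit register loaded from a 2-byte memory operand
    // (64 != 16) is not in the legal tables and gets the G_LOAD + G_ANYEXT split.
    static bool needsAnyExtLowering(unsigned RegBits, unsigned MemBytes) {
      return RegBits != MemBytes * 8;
    }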
+ getActionDefinitionsBuilder(G_STORE)
+ .legalForTypesWithMemSize({{s8, p0, 8},
+ {s16, p0, 16},
+ {s32, p0, 32},
+ {s64, p0, 64},
+ {p0, p0, 64},
+ {v2s32, p0, 64}})
+ .clampScalar(0, s8, s64)
+ .widenScalarToNextPow2(0)
+ // TODO: We could support sum-of-pow2's but the lowering code doesn't know
+ // how to do that yet.
+ .unsupportedIfMemSizeNotPow2()
+ .lowerIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].isScalar() &&
+ Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+ })
+ .clampNumElements(0, v2s32, v2s32);
// Constants
- for (auto Ty : {s32, s64}) {
- setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
- setAction({TargetOpcode::G_FCONSTANT, Ty}, Legal);
- }
-
- setAction({G_CONSTANT, p0}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_FCONSTANT, 0, widen_16);
-
- setAction({G_ICMP, 1, s32}, Legal);
- setAction({G_ICMP, 1, s64}, Legal);
- setAction({G_ICMP, 1, p0}, Legal);
-
- setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_FCMP, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1, widen_1_8_16);
-
- setAction({G_ICMP, s32}, Legal);
- setAction({G_FCMP, s32}, Legal);
- setAction({G_FCMP, 1, s32}, Legal);
- setAction({G_FCMP, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({p0, s32, s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({s32, s64})
+ .clampScalar(0, s32, s64);
+
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
+ .clampScalar(0, s32, s32)
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1);
+
+ getActionDefinitionsBuilder(G_FCMP)
+ .legalFor({{s32, s32}, {s32, s64}})
+ .clampScalar(0, s32, s32)
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1);
// Extensions
- for (auto Ty : { s1, s8, s16, s32, s64 }) {
- setAction({G_ZEXT, Ty}, Legal);
- setAction({G_SEXT, Ty}, Legal);
- setAction({G_ANYEXT, Ty}, Legal);
- }
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
+ .legalForCartesianProduct({s8, s16, s32, s64}, {s1, s8, s16, s32});
// FP conversions
- for (auto Ty : { s16, s32 }) {
- setAction({G_FPTRUNC, Ty}, Legal);
- setAction({G_FPEXT, 1, Ty}, Legal);
- }
-
- for (auto Ty : { s32, s64 }) {
- setAction({G_FPTRUNC, 1, Ty}, Legal);
- setAction({G_FPEXT, Ty}, Legal);
- }
+ getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
+ {{s16, s32}, {s16, s64}, {s32, s64}});
+ getActionDefinitionsBuilder(G_FPEXT).legalFor(
+ {{s32, s16}, {s64, s16}, {s64, s32}});
// Conversions
- for (auto Ty : { s32, s64 }) {
- setAction({G_FPTOSI, 0, Ty}, Legal);
- setAction({G_FPTOUI, 0, Ty}, Legal);
- setAction({G_SITOFP, 1, Ty}, Legal);
- setAction({G_UITOFP, 1, Ty}, Legal);
- }
- setLegalizeScalarToDifferentSizeStrategy(G_FPTOSI, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_FPTOUI, 0, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_SITOFP, 1, widen_1_8_16);
- setLegalizeScalarToDifferentSizeStrategy(G_UITOFP, 1, widen_1_8_16);
-
- for (auto Ty : { s32, s64 }) {
- setAction({G_FPTOSI, 1, Ty}, Legal);
- setAction({G_FPTOUI, 1, Ty}, Legal);
- setAction({G_SITOFP, 0, Ty}, Legal);
- setAction({G_UITOFP, 0, Ty}, Legal);
- }
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1);
+
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
// Control-flow
- for (auto Ty : {s1, s8, s16, s32})
- setAction({G_BRCOND, Ty}, Legal);
- setAction({G_BRINDIRECT, p0}, Legal);
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
+ getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
// Select
- setLegalizeScalarToDifferentSizeStrategy(G_SELECT, 0, widen_1_8_16);
-
- for (auto Ty : {s32, s64, p0})
- setAction({G_SELECT, Ty}, Legal);
-
- setAction({G_SELECT, 1, s1}, Legal);
+ getActionDefinitionsBuilder(G_SELECT)
+ .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
// Pointer-handling
- setAction({G_FRAME_INDEX, p0}, Legal);
- setAction({G_GLOBAL_VALUE, p0}, Legal);
-
- for (auto Ty : {s1, s8, s16, s32, s64})
- setAction({G_PTRTOINT, 0, Ty}, Legal);
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
- setAction({G_PTRTOINT, 1, p0}, Legal);
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
+ .maxScalar(0, s64)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
- setAction({G_INTTOPTR, 0, p0}, Legal);
- setAction({G_INTTOPTR, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_INTTOPTR)
+ .unsupportedIf([&](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
+ })
+ .legalFor({{p0, s64}});
// Casts for 32 and 64-bit width type are just copies.
// Same for 128-bit width type, except they are on the FPR bank.
- for (auto Ty : {s1, s8, s16, s32, s64, s128}) {
- setAction({G_BITCAST, 0, Ty}, Legal);
- setAction({G_BITCAST, 1, Ty}, Legal);
- }
-
- // For the sake of copying bits around, the type does not really
- // matter as long as it fits a register.
- for (int EltSize = 8; EltSize <= 64; EltSize *= 2) {
- setAction({G_BITCAST, 0, LLT::vector(128/EltSize, EltSize)}, Legal);
- setAction({G_BITCAST, 1, LLT::vector(128/EltSize, EltSize)}, Legal);
- if (EltSize >= 64)
- continue;
-
- setAction({G_BITCAST, 0, LLT::vector(64/EltSize, EltSize)}, Legal);
- setAction({G_BITCAST, 1, LLT::vector(64/EltSize, EltSize)}, Legal);
- if (EltSize >= 32)
- continue;
-
- setAction({G_BITCAST, 0, LLT::vector(32/EltSize, EltSize)}, Legal);
- setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
- }
+ getActionDefinitionsBuilder(G_BITCAST)
+ // FIXME: This is wrong since G_BITCAST is not allowed to change the
+ // number of bits but it's what the previous code described and fixing
+ // it breaks tests.
+ .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
+ v8s16, v4s16, v2s16, v4s32, v2s32, v2s64});
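The FIXME above concerns the invariant that a bitcast may reinterpret bits but never change how many there are; a tightened rule would add a check along these lines (names illustrative):

    // Sketch only: s64 <-> v2s32 keeps 64 bits and is a genuine bitcast,
    // while s32 <-> s64 changes the bit count and should be rejected.
    static bool bitcastPreservesSize(unsigned DstBits, unsigned SrcBits) {
      return DstBits == SrcBits;
    }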
- setAction({G_VASTART, p0}, Legal);
+ getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
// va_list must be a pointer, but most sized types are pretty easy to handle
// as the destination.
- setAction({G_VAARG, 1, p0}, Legal);
-
- for (auto Ty : {s8, s16, s32, s64, p0})
- setAction({G_VAARG, Ty}, Custom);
+ getActionDefinitionsBuilder(G_VAARG)
+ .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
+ .clampScalar(0, s8, s64)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
if (ST.hasLSE()) {
- for (auto Ty : {s8, s16, s32, s64}) {
- setAction({G_ATOMIC_CMPXCHG_WITH_SUCCESS, Ty}, Lower);
- setAction({G_ATOMIC_CMPXCHG, Ty}, Legal);
- }
- setAction({G_ATOMIC_CMPXCHG, 1, p0}, Legal);
-
- for (unsigned Op :
- {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
- G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
- G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) {
- for (auto Ty : {s8, s16, s32, s64}) {
- setAction({Op, Ty}, Legal);
- }
- setAction({Op, 1, p0}, Legal);
- }
+ getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
+ .lowerIf(all(
+ typeInSet(0, {s8, s16, s32, s64}), typeIs(1, s1), typeIs(2, p0),
+ atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic)));
+
+ getActionDefinitionsBuilder(
+ {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
+ G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
+ G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX, G_ATOMIC_CMPXCHG})
+ .legalIf(all(
+ typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
+ atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic)));
}
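The all(), typeInSet(), typeIs() and atomicOrderingAtLeastOrStrongerThan() combinators simply AND per-operand predicates together. The combined test for the LSE read-modify-write ops, as a standalone sketch (the linear ordering ranking below is a simplifying assumption; LLVM uses isAtLeastOrStrongerThan):

    enum class Ordering {
      NotAtomic, Unordered, Monotonic, Acquire, Release,
      AcquireRelease, SequentiallyConsistent
    };

    // Sketch only: value type in {s8, s16, s32, s64}, pointer in address
    // space 0, ordering at least monotonic, mirroring the legalIf() above.
    static bool isLegalLSEAtomicRMW(unsigned ValBits, bool PtrIsAddrSpace0,
                                    Ordering O) {
      const bool SizeOK =
          ValBits == 8 || ValBits == 16 || ValBits == 32 || ValBits == 64;
      return SizeOK && PtrIsAddrSpace0 && O >= Ordering::Monotonic;
    }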
// Merge/Unmerge
- for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES})
- for (int Sz : {8, 16, 32, 64, 128, 192, 256, 384, 512}) {
- LLT ScalarTy = LLT::scalar(Sz);
- setAction({Op, ScalarTy}, Legal);
- setAction({Op, 1, ScalarTy}, Legal);
- if (Sz < 32)
- continue;
- for (int EltSize = 8; EltSize <= 64; EltSize *= 2) {
- if (EltSize >= Sz)
- continue;
- LLT VecTy = LLT::vector(Sz / EltSize, EltSize);
- setAction({Op, VecTy}, Legal);
- setAction({Op, 1, VecTy}, Legal);
+ for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
+ unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
+ unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
+
+ auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) {
+ const LLT &Ty = Query.Types[TypeIdx];
+ if (Ty.isVector()) {
+ const LLT &EltTy = Ty.getElementType();
+ if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
+ return true;
+ if (!isPowerOf2_32(EltTy.getSizeInBits()))
+ return true;
}
- }
+ return false;
+ };
+ auto scalarize =
+ [](const LegalityQuery &Query, unsigned TypeIdx) {
+ const LLT &Ty = Query.Types[TypeIdx];
+ return std::make_pair(TypeIdx, Ty.getElementType());
+ };
+
+ // FIXME: This rule is horrible, but specifies the same as what we had
+ // before with the particularly strange definitions removed (e.g.
+ // s8 = G_MERGE_VALUES s32, s32).
+ // Part of the complexity comes from these ops being extremely flexible. For
+ // example, you can build/decompose vectors with it, concatenate vectors,
+ // etc. and in addition to this you can also bitcast with it at the same
+ // time. We've been considering breaking it up into multiple ops to make it
+ // more manageable throughout the backend.
+ getActionDefinitionsBuilder(Op)
+ // Break up vectors with weird elements into scalars
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
+ [=](const LegalityQuery &Query) { return scalarize(Query, 0); })
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
+ [=](const LegalityQuery &Query) { return scalarize(Query, 1); })
+ // Clamp the big scalar to s8-s512 and make it either a power of 2, 192,
+ // or 384.
+ .clampScalar(BigTyIdx, s8, s512)
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT &Ty = Query.Types[BigTyIdx];
+ return !isPowerOf2_32(Ty.getSizeInBits()) &&
+ Ty.getSizeInBits() % 64 != 0;
+ },
+ [=](const LegalityQuery &Query) {
+ // Pick the next power of 2, or a multiple of 64 over 128.
+ // Whichever is smaller.
+ const LLT &Ty = Query.Types[BigTyIdx];
+ unsigned NewSizeInBits = 1
+ << Log2_32_Ceil(Ty.getSizeInBits() + 1);
+ if (NewSizeInBits >= 256) {
+ unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
+ if (RoundedTo < NewSizeInBits)
+ NewSizeInBits = RoundedTo;
+ }
+ return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
+ })
+ // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
+ // worth considering the multiples of 64 since 2*192 and 2*384 are not
+ // valid.
+ .clampScalar(LitTyIdx, s8, s256)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8)
+ // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384,
+ // s512, <X x s8>, <X x s16>, <X x s32>, or <X x s64>.
+ // At this point it's simple enough to accept the legal types.
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &BigTy = Query.Types[BigTyIdx];
+ const LLT &LitTy = Query.Types[LitTyIdx];
+ if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
+ return false;
+ if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
+ return false;
+ return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0;
+ })
+ // Any vectors left are the wrong size. Scalarize them.
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 0, Query.Types[0].getElementType());
+ })
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 1, Query.Types[1].getElementType());
+ });
+ }
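The widenScalarIf() step in the rule above only fires for big-scalar sizes that are neither a power of two nor a multiple of 64; "the next power of 2, or a multiple of 64 over 128, whichever is smaller" then works out as in this standalone sketch:

    // Sketch only: mirrors the NewSizeInBits computation above.
    // e.g. nextBigScalarSize(96) == 128, nextBigScalarSize(130) == 192.
    static unsigned nextBigScalarSize(unsigned Bits) {
      unsigned Pow2 = 1;
      while (Pow2 <= Bits) // smallest power of two strictly greater than Bits
        Pow2 <<= 1;
      if (Pow2 >= 256) {
        unsigned Mult64 = (Bits / 64 + 1) * 64; // alignTo<64>(Bits + 1)
        if (Mult64 < Pow2)
          return Mult64;
      }
      return Pow2;
    }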
computeTables();
+ verify(*ST.getInstrInfo());
}
bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 8a29456430b9..4a19ecd69103 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -98,8 +98,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
- // Track which registers have been modified and used.
- BitVector ModifiedRegs, UsedRegs;
+ // Track which register units have been modified and used.
+ LiveRegUnits ModifiedRegUnits, UsedRegUnits;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
@@ -702,16 +702,17 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
.add(BaseRegOp)
.addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+ .setMemRefs(I->mergeMemRefsWith(*MergeMI))
+ .setMIFlags(I->mergeFlagsWith(*MergeMI));
(void)MIB;
- DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(MergeMI->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(MergeMI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
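Throughout this file the DEBUG() macro is replaced by LLVM_DEBUG(); the behaviour is the same: the statement compiles away in builds without asserts and is otherwise gated by the file's DEBUG_TYPE. A minimal sketch of the pattern (the debug tag shown is an assumption, not taken from this file):

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    #define DEBUG_TYPE "ldst-opt-example" // assumed tag; must precede LLVM_DEBUG uses

    static void reportMerge(int Offset) {
      // Printed only in asserts builds, and only when enabled with
      // -debug or -debug-only=ldst-opt-example.
      LLVM_DEBUG(dbgs() << "merging pair at offset " << Offset << '\n');
    }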
@@ -818,15 +819,17 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
.add(RegOp1)
.add(BaseRegOp)
.addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*Paired));
+ .setMemRefs(I->mergeMemRefsWith(*Paired))
+ .setMIFlags(I->mergeFlagsWith(*Paired));
(void)MIB;
- DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(Paired->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(
+ dbgs() << "Creating pair load/store. Replacing instructions:\n ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Paired->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
if (SExtIdx != -1) {
// Generate the sign extension for the proper result of the ldp.
// I.e., with X1, that would be:
@@ -840,8 +843,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
// Update the result of LDP to use the W instead of the X variant.
DstMO.setReg(DstRegW);
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Make the machine verifier happy by providing a definition for
// the X register.
// Insert this definition right after the generated LDP, i.e., before
@@ -858,12 +861,12 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
.addImm(0)
.addImm(31);
(void)MIBSXTW;
- DEBUG(dbgs() << " Extend operand:\n ");
- DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Extend operand:\n ");
+ LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
} else {
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
@@ -901,9 +904,9 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
break;
}
}
- DEBUG(dbgs() << "Remove load instruction:\n ");
- DEBUG(LoadI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
+ LLVM_DEBUG(LoadI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
LoadI->eraseFromParent();
return NextI;
}
@@ -913,7 +916,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
.add(StMO)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+ .setMIFlags(LoadI->getFlags());
} else {
// FIXME: Currently we disable this transformation in big-endian targets as
// performance and correctness are verified only in little-endian.
@@ -954,7 +958,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
DestReg)
.add(StMO)
- .addImm(AndMaskEncoded);
+ .addImm(AndMaskEncoded)
+ .setMIFlags(LoadI->getFlags());
} else {
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
@@ -962,7 +967,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
DestReg)
.add(StMO)
.addImm(Immr)
- .addImm(Imms);
+ .addImm(Imms)
+ .setMIFlags(LoadI->getFlags());
}
}
@@ -974,48 +980,21 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
break;
}
- DEBUG(dbgs() << "Promoting load by replacing :\n ");
- DEBUG(StoreI->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(LoadI->print(dbgs()));
- DEBUG(dbgs() << " with instructions:\n ");
- DEBUG(StoreI->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG((BitExtMI)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
+ LLVM_DEBUG(StoreI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(LoadI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instructions:\n ");
+ LLVM_DEBUG(StoreI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG((BitExtMI)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions.
LoadI->eraseFromParent();
return NextI;
}
-/// trackRegDefsUses - Remember what registers the specified instruction uses
-/// and modifies.
-static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
- BitVector &UsedRegs,
- const TargetRegisterInfo *TRI) {
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isRegMask())
- ModifiedRegs.setBitsNotInMask(MO.getRegMask());
-
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- if (MO.isDef()) {
- // WZR/XZR are not modified even when used as a destination register.
- if (Reg != AArch64::WZR && Reg != AArch64::XZR)
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- ModifiedRegs.set(*AI);
- } else {
- assert(MO.isUse() && "Reg operand not a def and not a use?!?");
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- UsedRegs.set(*AI);
- }
- }
-}
-
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
// Convert the byte-offset used by unscaled into an "element" offset used
// by the scaled pair load/store instructions.
@@ -1073,10 +1052,10 @@ bool AArch64LoadStoreOpt::findMatchingStore(
if (MBBI == B)
return false;
- // Track which registers have been modified and used between the first insn
- // and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
unsigned Count = 0;
do {
@@ -1095,7 +1074,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
BaseReg == getLdStBaseOp(MI).getReg() &&
isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
- !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+ ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
StoreI = MBBI;
return true;
}
@@ -1103,12 +1082,12 @@ bool AArch64LoadStoreOpt::findMatchingStore(
if (MI.isCall())
return false;
- // Update modified / uses register lists.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ // Update modified / uses register units.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg))
return false;
// If we encounter a store aliased with the load, return early.
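The BitVector trackers are replaced by LiveRegUnits, which tracks register units rather than whole registers, so "register not touched" is now expressed as available() instead of a negated bit test. The query pattern, as a sketch (Insns and Reg are assumed inputs, not code from the pass):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/CodeGen/LiveRegUnits.h"
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    using namespace llvm;

    static bool regUntouched(ArrayRef<const MachineInstr *> Insns, MCPhysReg Reg,
                             const TargetRegisterInfo *TRI,
                             LiveRegUnits &Modified, LiveRegUnits &Used) {
      Modified.clear();
      Used.clear();
      for (const MachineInstr *MI : Insns)
        LiveRegUnits::accumulateUsedDefed(*MI, Modified, Used, TRI);
      // available() is true when none of Reg's units were defined or used.
      return Modified.available(Reg) && Used.available(Reg);
    }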
@@ -1186,10 +1165,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
- // Track which registers have been modified and used between the first insn
- // (inclusive) and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
@@ -1224,7 +1203,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// If the unscaled offset isn't a multiple of the MemSize, we can't
// pair the operations together: bail and keep looking.
if (MIOffset % MemSize) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1244,7 +1224,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// the stored value is the same (i.e., WZR).
if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
(IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1254,7 +1235,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// immediate offset of merging these instructions is out of range for
// a pairwise instruction, bail and keep looking.
if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1262,7 +1244,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// can't express the offset of the unscaled input, bail and keep
// looking.
if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
+ UsedRegUnits, TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1271,7 +1254,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
MemInsns.push_back(&MI);
continue;
}
@@ -1280,8 +1264,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// the two instructions and none of the instructions between the second
// and first alias with the second, we can combine the second into the
// first.
- if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
- !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
+ if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
+ !(MI.mayLoad() &&
+ !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
!mayAlias(MI, MemInsns, AA)) {
Flags.setMergeForward(false);
return MBBI;
@@ -1291,8 +1276,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// between the two instructions and none of the instructions between the
// first and the second alias with the first, we can combine the first
// into the second.
- if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
- !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
+ if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
+ !(MayLoad &&
+ !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
!mayAlias(FirstMI, MemInsns, AA)) {
Flags.setMergeForward(true);
return MBBI;
@@ -1307,12 +1293,12 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
if (MI.isCall())
return E;
- // Update modified / uses register lists.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ // Update modified / uses register units.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg))
return E;
// Update list of instructions that read/write memory.
@@ -1352,7 +1338,8 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
.add(getLdStRegOp(*I))
.add(getLdStBaseOp(*I))
.addImm(Value)
- .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end())
+ .setMIFlags(I->mergeFlagsWith(*Update));
} else {
// Paired instruction.
int Scale = getMemScale(*I);
@@ -1362,24 +1349,25 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
.add(getLdStRegOp(*I, 1))
.add(getLdStBaseOp(*I))
.addImm(Value / Scale)
- .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end())
+ .setMIFlags(I->mergeFlagsWith(*Update));
}
(void)MIB;
if (IsPreIdx) {
++NumPreFolded;
- DEBUG(dbgs() << "Creating pre-indexed load/store.");
+ LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
} else {
++NumPostFolded;
- DEBUG(dbgs() << "Creating post-indexed load/store.");
+ LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
}
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(Update->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Update->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
// Erase the old instructions for the block.
I->eraseFromParent();
@@ -1466,10 +1454,10 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
return E;
}
- // Track which registers have been modified and used between the first insn
- // (inclusive) and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
++MBBI;
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
MachineInstr &MI = *MBBI;
@@ -1484,11 +1472,12 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
return MBBI;
// Update the status of what the instruction clobbered and used.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
return E;
}
return E;
@@ -1517,10 +1506,10 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
- // Track which registers have been modified and used between the first insn
- // (inclusive) and the second insn.
- ModifiedRegs.reset();
- UsedRegs.reset();
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
unsigned Count = 0;
do {
--MBBI;
@@ -1536,11 +1525,12 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return MBBI;
// Update the status of what the instruction clobbered and used.
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
- if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
return E;
} while (MBBI != B && Count < Limit);
return E;
@@ -1767,11 +1757,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TRI = Subtarget->getRegisterInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- // Resize the modified and used register bitfield trackers. We do this once
- // per function and then clear the bitfield each time we optimize a load or
- // store.
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ // Resize the modified and used register unit trackers. We do this once
+ // per function and then clear the register units each time we optimize a load
+ // or store.
+ ModifiedRegUnits.init(*TRI);
+ UsedRegUnits.init(*TRI);
bool Modified = false;
bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 65dae03a24db..6c0263585933 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -18,13 +18,13 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -173,11 +173,20 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO,
MCSymbol *Sym) const {
- MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
- const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+ AArch64MCExpr::VariantKind RefKind = AArch64MCExpr::VK_NONE;
+ if (MO.getTargetFlags() & AArch64II::MO_TLS) {
+ if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF)
+ RefKind = AArch64MCExpr::VK_SECREL_LO12;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+ AArch64II::MO_HI12)
+ RefKind = AArch64MCExpr::VK_SECREL_HI12;
+ }
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
if (!MO.isJTI() && MO.getOffset())
Expr = MCBinaryExpr::createAdd(
Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ Expr = AArch64MCExpr::create(Expr, RefKind, Ctx);
return MCOperand::createExpr(Expr);
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 9f354c009461..798340f8fed8 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -48,33 +49,33 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// determineCalleeSaves().
bool HasStackFrame = false;
- /// \brief Amount of stack frame size, not including callee-saved registers.
+ /// Amount of stack frame size, not including callee-saved registers.
unsigned LocalStackSize;
- /// \brief Amount of stack frame size used for saving callee-saved registers.
+ /// Amount of stack frame size used for saving callee-saved registers.
unsigned CalleeSavedStackSize;
- /// \brief Number of TLS accesses using the special (combinable)
+ /// Number of TLS accesses using the special (combinable)
/// _TLS_MODULE_BASE_ symbol.
unsigned NumLocalDynamicTLSAccesses = 0;
- /// \brief FrameIndex for start of varargs area for arguments passed on the
+ /// FrameIndex for start of varargs area for arguments passed on the
/// stack.
int VarArgsStackIndex = 0;
- /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// FrameIndex for start of varargs area for arguments passed in
/// general purpose registers.
int VarArgsGPRIndex = 0;
- /// \brief Size of the varargs area for arguments passed in general purpose
+ /// Size of the varargs area for arguments passed in general purpose
/// registers.
unsigned VarArgsGPRSize = 0;
- /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// FrameIndex for start of varargs area for arguments passed in
/// floating-point registers.
int VarArgsFPRIndex = 0;
- /// \brief Size of the varargs area for arguments passed in floating-point
+ /// Size of the varargs area for arguments passed in floating-point
/// registers.
unsigned VarArgsFPRSize = 0;
@@ -90,11 +91,22 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// other stack allocations.
bool CalleeSaveStackHasFreeSpace = false;
+ /// Has a value when it is known whether or not the function uses a
+ /// redzone, and no value otherwise.
+ /// Initialized during frame lowering, unless the function has the noredzone
+ /// attribute, in which case it is set to false at construction.
+ Optional<bool> HasRedZone;
+
public:
AArch64FunctionInfo() = default;
explicit AArch64FunctionInfo(MachineFunction &MF) {
(void)MF;
+
+ // If we already know that the function doesn't have a redzone, set
+ // HasRedZone here.
+ if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
+ HasRedZone = false;
}
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
@@ -132,6 +144,9 @@ public:
return NumLocalDynamicTLSAccesses;
}
+ Optional<bool> hasRedZone() const { return HasRedZone; }
+ void setHasRedZone(bool s) { HasRedZone = s; }
+
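HasRedZone is deliberately a tri-state: no value until frame lowering makes the decision, then cached for later queries. A standalone sketch of that caching pattern using std::optional (llvm::Optional plays the same role in the actual class):

    #include <optional>

    struct FuncInfoSketch {
      std::optional<bool> HasRedZone; // no value until the decision is made
    };

    // The first call caches the frame-lowering decision; later calls reuse it.
    static bool hasRedZone(FuncInfoSketch &FI, bool ComputedDecision) {
      if (!FI.HasRedZone)
        FI.HasRedZone = ComputedDecision;
      return *FI.HasRedZone;
    }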
int getVarArgsStackIndex() const { return VarArgsStackIndex; }
void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
index 6930c816b5ae..bc0168e783be 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -20,135 +20,262 @@ using namespace llvm;
namespace {
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
-/// together. Given SecondMI, when FirstMI is unspecified, then check if
-/// SecondMI may be part of a fused pair at all.
-static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
- const TargetSubtargetInfo &TSI,
- const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
- const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
+// Fuse CMN, CMP, TST followed by Bcc.
+static bool isArithmeticBccPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ if (SecondMI.getOpcode() == AArch64::Bcc) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
- // Assume wildcards for unspecified instrs.
+ switch (FirstMI->getOpcode()) {
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ return true;
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXrs:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSXrs:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
+ // Shift value can be 0 making these behave like the "rr" variant...
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ }
+ }
+ return false;
+}
+
+// Fuse ALU operations followed by CBZ/CBNZ.
+static bool isArithmeticCbzPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
+
+ switch (FirstMI->getOpcode()) {
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::EORWri:
+ case AArch64::EORWrr:
+ case AArch64::EORXri:
+ case AArch64::EORXrr:
+ case AArch64::ORRWri:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXri:
+ case AArch64::ORRXrr:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ return true;
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ // Shift value can be 0 making these behave like the "rr" variant...
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ }
+ }
+ return false;
+}
+
+// Fuse AES crypto encoding or decoding.
+static bool isAESPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
unsigned FirstOpcode =
FirstMI ? FirstMI->getOpcode()
: static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
unsigned SecondOpcode = SecondMI.getOpcode();
- if (ST.hasArithmeticBccFusion())
- // Fuse CMN, CMP, TST followed by Bcc.
- if (SecondOpcode == AArch64::Bcc)
- switch (FirstOpcode) {
- default:
- return false;
- case AArch64::ADDSWri:
- case AArch64::ADDSWrr:
- case AArch64::ADDSXri:
- case AArch64::ADDSXrr:
- case AArch64::ANDSWri:
- case AArch64::ANDSWrr:
- case AArch64::ANDSXri:
- case AArch64::ANDSXrr:
- case AArch64::SUBSWri:
- case AArch64::SUBSWrr:
- case AArch64::SUBSXri:
- case AArch64::SUBSXrr:
- case AArch64::BICSWrr:
- case AArch64::BICSXrr:
- return true;
- case AArch64::ADDSWrs:
- case AArch64::ADDSXrs:
- case AArch64::ANDSWrs:
- case AArch64::ANDSXrs:
+ // AES encode.
+ if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::AESErr) &&
+ (SecondOpcode == AArch64::AESMCrr ||
+ SecondOpcode == AArch64::AESMCrrTied))
+ return true;
+ // AES decode.
+ else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::AESDrr) &&
+ (SecondOpcode == AArch64::AESIMCrr ||
+ SecondOpcode == AArch64::AESIMCrrTied))
+ return true;
+
+ return false;
+}
+
+// Fuse literal generation.
+static bool isLiteralsPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ // PC relative address.
+ if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::ADRP) &&
+ SecondOpcode == AArch64::ADDXri)
+ return true;
+ // 32 bit immediate.
+ else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ FirstOpcode == AArch64::MOVZWi) &&
+ (SecondOpcode == AArch64::MOVKWi &&
+ SecondMI.getOperand(3).getImm() == 16))
+ return true;
+ // Lower half of 64 bit immediate.
+  else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+            FirstOpcode == AArch64::MOVZXi) &&
+ (SecondOpcode == AArch64::MOVKXi &&
+ SecondMI.getOperand(3).getImm() == 16))
+ return true;
+ // Upper half of 64 bit immediate.
+ else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ (FirstOpcode == AArch64::MOVKXi &&
+ FirstMI->getOperand(3).getImm() == 32)) &&
+ (SecondOpcode == AArch64::MOVKXi &&
+ SecondMI.getOperand(3).getImm() == 48))
+ return true;
+
+ return false;
+}
+
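isLiteralsPair() keys off the MOVK shift amount (operand 3 being 16, 32 or 48) because that is how a 64-bit immediate is materialized 16 bits at a time. A standalone sketch of the four fields such a MOVZ/MOVK sequence carries (illustrative only, not part of the fusion check):

    #include <array>
    #include <cstdint>

    // movz x0, #f[0]            ; bits [15:0]
    // movk x0, #f[1], lsl #16   ; bits [31:16]  (fused with the movz)
    // movk x0, #f[2], lsl #32   ; bits [47:32]
    // movk x0, #f[3], lsl #48   ; bits [63:48]  (fused with the lsl #32 movk)
    static std::array<uint16_t, 4> movzMovkFields(uint64_t Imm) {
      return {static_cast<uint16_t>(Imm), static_cast<uint16_t>(Imm >> 16),
              static_cast<uint16_t>(Imm >> 32), static_cast<uint16_t>(Imm >> 48)};
    }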
+// Fuse address generation and loads or stores.
+static bool isAddressLdStPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ switch (SecondOpcode) {
+ case AArch64::STRBBui:
+ case AArch64::STRBui:
+ case AArch64::STRDui:
+ case AArch64::STRHHui:
+ case AArch64::STRHui:
+ case AArch64::STRQui:
+ case AArch64::STRSui:
+ case AArch64::STRWui:
+ case AArch64::STRXui:
+ case AArch64::LDRBBui:
+ case AArch64::LDRBui:
+ case AArch64::LDRDui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRHui:
+ case AArch64::LDRQui:
+ case AArch64::LDRSui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWui:
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
+
+ switch (FirstMI->getOpcode()) {
+ case AArch64::ADR:
+ return (SecondMI.getOperand(2).getImm() == 0);
+ case AArch64::ADRP:
+ return true;
+ }
+ }
+ return false;
+}
+
+// Fuse compare and conditional select.
+static bool isCCSelectPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ // 32 bits
+ if (SecondOpcode == AArch64::CSELWr) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
+
+ if (FirstMI->definesRegister(AArch64::WZR))
+ switch (FirstMI->getOpcode()) {
case AArch64::SUBSWrs:
- case AArch64::SUBSXrs:
- case AArch64::BICSWrs:
- case AArch64::BICSXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !II.hasShiftedReg(*FirstMI);
- case AArch64::INSTRUCTION_LIST_END:
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ case AArch64::SUBSWrx:
+ return (!AArch64InstrInfo::hasExtendedReg(*FirstMI));
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSWri:
return true;
}
+ }
+ // 64 bits
+ else if (SecondOpcode == AArch64::CSELXr) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
- if (ST.hasArithmeticCbzFusion())
- // Fuse ALU operations followed by CBZ/CBNZ.
- if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
- SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
- switch (FirstOpcode) {
- default:
- return false;
- case AArch64::ADDWri:
- case AArch64::ADDWrr:
- case AArch64::ADDXri:
- case AArch64::ADDXrr:
- case AArch64::ANDWri:
- case AArch64::ANDWrr:
- case AArch64::ANDXri:
- case AArch64::ANDXrr:
- case AArch64::EORWri:
- case AArch64::EORWrr:
- case AArch64::EORXri:
- case AArch64::EORXrr:
- case AArch64::ORRWri:
- case AArch64::ORRWrr:
- case AArch64::ORRXri:
- case AArch64::ORRXrr:
- case AArch64::SUBWri:
- case AArch64::SUBWrr:
- case AArch64::SUBXri:
- case AArch64::SUBXrr:
- return true;
- case AArch64::ADDWrs:
- case AArch64::ADDXrs:
- case AArch64::ANDWrs:
- case AArch64::ANDXrs:
- case AArch64::SUBWrs:
- case AArch64::SUBXrs:
- case AArch64::BICWrs:
- case AArch64::BICXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !II.hasShiftedReg(*FirstMI);
- case AArch64::INSTRUCTION_LIST_END:
+ if (FirstMI->definesRegister(AArch64::XZR))
+ switch (FirstMI->getOpcode()) {
+ case AArch64::SUBSXrs:
+ return (!AArch64InstrInfo::hasShiftedReg(*FirstMI));
+ case AArch64::SUBSXrx:
+ case AArch64::SUBSXrx64:
+ return (!AArch64InstrInfo::hasExtendedReg(*FirstMI));
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSXri:
return true;
}
+ }
+ return false;
+}
- if (ST.hasFuseAES())
- // Fuse AES crypto operations.
- switch(SecondOpcode) {
- // AES encode.
- case AArch64::AESMCrr:
- case AArch64::AESMCrrTied:
- return FirstOpcode == AArch64::AESErr ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- // AES decode.
- case AArch64::AESIMCrr:
- case AArch64::AESIMCrrTied:
- return FirstOpcode == AArch64::AESDrr ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- }
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
- if (ST.hasFuseLiterals())
- // Fuse literal generation operations.
- switch (SecondOpcode) {
- // PC relative address.
- case AArch64::ADDXri:
- return FirstOpcode == AArch64::ADRP ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- // 32 bit immediate.
- case AArch64::MOVKWi:
- return (FirstOpcode == AArch64::MOVZWi &&
- SecondMI.getOperand(3).getImm() == 16) ||
- FirstOpcode == AArch64::INSTRUCTION_LIST_END;
- // Lower and upper half of 64 bit immediate.
- case AArch64::MOVKXi:
- return FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
- (FirstOpcode == AArch64::MOVZXi &&
- SecondMI.getOperand(3).getImm() == 16) ||
- (FirstOpcode == AArch64::MOVKXi &&
- FirstMI->getOperand(3).getImm() == 32 &&
- SecondMI.getOperand(3).getImm() == 48);
- }
+ if (ST.hasArithmeticBccFusion() && isArithmeticBccPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasArithmeticCbzFusion() && isArithmeticCbzPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
+ return true;
return false;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index ee6703aed1e2..ccf646575296 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -164,10 +164,10 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
LiveIntervals &LIs = G.getMetadata().LIS;
if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
- DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
- << '\n');
- DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
- << '\n');
+ LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
+ << '\n');
+ LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
+ << '\n');
return false;
}
@@ -247,14 +247,14 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
// Do some Chain management
if (Chains.count(Ra)) {
if (Rd != Ra) {
- DEBUG(dbgs() << "Moving acc chain from " << printReg(Ra, TRI) << " to "
- << printReg(Rd, TRI) << '\n';);
+ LLVM_DEBUG(dbgs() << "Moving acc chain from " << printReg(Ra, TRI)
+ << " to " << printReg(Rd, TRI) << '\n';);
Chains.remove(Ra);
Chains.insert(Rd);
}
} else {
- DEBUG(dbgs() << "Creating new acc chain for " << printReg(Rd, TRI)
- << '\n';);
+ LLVM_DEBUG(dbgs() << "Creating new acc chain for " << printReg(Rd, TRI)
+ << '\n';);
Chains.insert(Rd);
}
@@ -279,7 +279,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
assert(edge != G.invalidEdgeId() &&
"PBQP error ! The edge should exist !");
- DEBUG(dbgs() << "Refining constraint !\n";);
+ LLVM_DEBUG(dbgs() << "Refining constraint !\n";);
if (G.getEdgeNode1Id(edge) == node2) {
std::swap(node1, node2);
@@ -329,7 +329,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
LiveIntervals &LIs = G.getMetadata().LIS;
TRI = MF.getSubtarget().getRegisterInfo();
- DEBUG(MF.dump());
+ LLVM_DEBUG(MF.dump());
for (const auto &MBB: MF) {
Chains.clear(); // FIXME: really needed ? Could not work at MF level ?
@@ -340,8 +340,8 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
for (auto r : Chains) {
SmallVector<unsigned, 8> toDel;
if(regJustKilledBefore(LIs, r, MI)) {
- DEBUG(dbgs() << "Killing chain " << printReg(r, TRI) << " at ";
- MI.print(dbgs()););
+ LLVM_DEBUG(dbgs() << "Killing chain " << printReg(r, TRI) << " at ";
+ MI.print(dbgs()););
toDel.push_back(r);
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
index a8dc6e74ef6a..01d8a35bbc23 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -119,7 +119,7 @@ public:
/// Iterate over the functions and promote the interesting constants into
/// global variables with module scope.
bool runOnModule(Module &M) override {
- DEBUG(dbgs() << getPassName() << '\n');
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
if (skipModule(M))
return false;
bool Changed = false;
@@ -380,9 +380,9 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User,
(IPI.first->getParent() != NewPt->getParent() &&
DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
// No need to insert this point. Just record the dominated use.
- DEBUG(dbgs() << "Insertion point dominated by:\n");
- DEBUG(IPI.first->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Insertion point dominated by:\n");
+ LLVM_DEBUG(IPI.first->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
IPI.second.emplace_back(User, OpNo);
return true;
}
@@ -408,9 +408,9 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
// Instructions are in the same block.
// By construction, NewPt is dominating the other.
// Indeed, isDominated returned false with the exact same arguments.
- DEBUG(dbgs() << "Merge insertion point with:\n");
- DEBUG(IPI->first->print(dbgs()));
- DEBUG(dbgs() << "\nat considered insertion point.\n");
+ LLVM_DEBUG(dbgs() << "Merge insertion point with:\n");
+ LLVM_DEBUG(IPI->first->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\nat considered insertion point.\n");
appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
@@ -430,11 +430,11 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
}
// else, CommonDominator is the block of NewBB, hence NewBB is the last
// possible insertion point in that block.
- DEBUG(dbgs() << "Merge insertion point with:\n");
- DEBUG(IPI->first->print(dbgs()));
- DEBUG(dbgs() << '\n');
- DEBUG(NewPt->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Merge insertion point with:\n");
+ LLVM_DEBUG(IPI->first->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(NewPt->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
@@ -443,15 +443,15 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
void AArch64PromoteConstant::computeInsertionPoint(
Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) {
- DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
- DEBUG(User->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
+ LLVM_DEBUG(User->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
Instruction *InsertionPoint = findInsertionPoint(*User, OpNo);
- DEBUG(dbgs() << "Considered insertion point:\n");
- DEBUG(InsertionPoint->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Considered insertion point:\n");
+ LLVM_DEBUG(InsertionPoint->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
if (isDominated(InsertionPoint, User, OpNo, InsertPts))
return;
@@ -460,7 +460,7 @@ void AArch64PromoteConstant::computeInsertionPoint(
if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts))
return;
- DEBUG(dbgs() << "Keep considered insertion point\n");
+ LLVM_DEBUG(dbgs() << "Keep considered insertion point\n");
// It is definitely useful by its own
InsertPts[InsertionPoint].emplace_back(User, OpNo);
@@ -476,9 +476,9 @@ static void ensurePromotedGV(Function &F, Constant &C,
*F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr,
"_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
PC.GV->setInitializer(&C);
- DEBUG(dbgs() << "Global replacement: ");
- DEBUG(PC.GV->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Global replacement: ");
+ LLVM_DEBUG(PC.GV->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
++NumPromoted;
}
@@ -495,10 +495,10 @@ void AArch64PromoteConstant::insertDefinitions(Function &F,
// Create the load of the global variable.
IRBuilder<> Builder(IPI.first);
LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV);
- DEBUG(dbgs() << "**********\n");
- DEBUG(dbgs() << "New def: ");
- DEBUG(LoadedCst->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "**********\n");
+ LLVM_DEBUG(dbgs() << "New def: ");
+ LLVM_DEBUG(LoadedCst->print(dbgs()));
+ LLVM_DEBUG(dbgs() << '\n');
// Update the dominated uses.
for (auto Use : IPI.second) {
@@ -507,11 +507,11 @@ void AArch64PromoteConstant::insertDefinitions(Function &F,
findInsertionPoint(*Use.first, Use.second)) &&
"Inserted definition does not dominate all its uses!");
#endif
- DEBUG({
- dbgs() << "Use to update " << Use.second << ":";
- Use.first->print(dbgs());
- dbgs() << '\n';
- });
+ LLVM_DEBUG({
+ dbgs() << "Use to update " << Use.second << ":";
+ Use.first->print(dbgs());
+ dbgs() << '\n';
+ });
Use.first->setOperand(Use.second, LoadedCst);
++NumPromotedUses;
}
@@ -523,7 +523,7 @@ void AArch64PromoteConstant::promoteConstants(
PromotionCacheTy &PromotionCache) {
// Promote the constants.
for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
- DEBUG(dbgs() << "** Compute insertion points **\n");
+ LLVM_DEBUG(dbgs() << "** Compute insertion points **\n");
auto First = U;
Constant *C = First->C;
InsertionPoints InsertPts;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index e5822b114324..fcb0b36a9f6d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -55,6 +55,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
@@ -72,10 +73,10 @@ class AArch64RedundantCopyElimination : public MachineFunctionPass {
// DomBBClobberedRegs is used when computing known values in the dominating
// BB.
- BitVector DomBBClobberedRegs;
+ LiveRegUnits DomBBClobberedRegs, DomBBUsedRegs;
// OptBBClobberedRegs is used when optimizing away redundant copies/moves.
- BitVector OptBBClobberedRegs;
+ LiveRegUnits OptBBClobberedRegs, OptBBUsedRegs;
public:
static char ID;
@@ -109,28 +110,6 @@ char AArch64RedundantCopyElimination::ID = 0;
INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
"AArch64 redundant copy elimination pass", false, false)
-/// Remember what registers the specified instruction modifies.
-static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs,
- const TargetRegisterInfo *TRI) {
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isRegMask()) {
- ClobberedRegs.setBitsNotInMask(MO.getRegMask());
- continue;
- }
-
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- if (!MO.isDef())
- continue;
-
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- ClobberedRegs.set(*AI);
- }
-}
-
/// It's possible to determine the value of a register based on a dominating
/// condition. To do so, this function checks to see if the basic block \p MBB
/// is the target of a conditional branch \p CondBr with an equality comparison.
@@ -182,7 +161,8 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// Registers clobbered in PredMBB between CondBr instruction and current
// instruction being checked in loop.
- DomBBClobberedRegs.reset();
+ DomBBClobberedRegs.clear();
+ DomBBUsedRegs.clear();
// Find compare instruction that sets NZCV used by CondBr.
MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator();
@@ -212,7 +192,7 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// register of the compare is not modified (including a self-clobbering
// compare) between the compare and conditional branch we know the value
// of the 1st source operand.
- if (PredI.getOperand(2).isImm() && !DomBBClobberedRegs[SrcReg] &&
+ if (PredI.getOperand(2).isImm() && DomBBClobberedRegs.available(SrcReg) &&
SrcReg != DstReg) {
// We've found the instruction that sets NZCV.
int32_t KnownImm = PredI.getOperand(2).getImm();
@@ -232,7 +212,7 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// The destination register must not be modified between the NZCV setting
// instruction and the conditional branch.
- if (DomBBClobberedRegs[DstReg])
+ if (!DomBBClobberedRegs.available(DstReg))
return Res;
FirstUse = PredI;
@@ -276,7 +256,7 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
// The destination register of the NZCV setting instruction must not be
// modified before the conditional branch.
- if (DomBBClobberedRegs[DstReg])
+ if (!DomBBClobberedRegs.available(DstReg))
return false;
// We've found the instruction that sets NZCV whose DstReg == 0.
@@ -290,8 +270,9 @@ bool AArch64RedundantCopyElimination::knownRegValInBlock(
if (PredI.definesRegister(AArch64::NZCV))
return false;
- // Track clobbered registers.
- trackRegDefs(PredI, DomBBClobberedRegs, TRI);
+ // Track clobbered and used registers.
+ LiveRegUnits::accumulateUsedDefed(PredI, DomBBClobberedRegs, DomBBUsedRegs,
+ TRI);
}
return false;
}
@@ -330,8 +311,9 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
if (!knownRegValInBlock(*Itr, MBB, KnownRegs, FirstUse))
continue;
- // Reset the clobber list.
- OptBBClobberedRegs.reset();
+ // Reset the clobbered and used register units.
+ OptBBClobberedRegs.clear();
+ OptBBUsedRegs.clear();
// Look backward in PredMBB for COPYs from the known reg to find other
// registers that are known to be a constant value.
@@ -343,11 +325,12 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
MCPhysReg CopyDstReg = PredI->getOperand(0).getReg();
MCPhysReg CopySrcReg = PredI->getOperand(1).getReg();
for (auto &KnownReg : KnownRegs) {
- if (OptBBClobberedRegs[KnownReg.Reg])
+ if (!OptBBClobberedRegs.available(KnownReg.Reg))
continue;
// If we have X = COPY Y, and Y is known to be zero, then now X is
// known to be zero.
- if (CopySrcReg == KnownReg.Reg && !OptBBClobberedRegs[CopyDstReg]) {
+ if (CopySrcReg == KnownReg.Reg &&
+ OptBBClobberedRegs.available(CopyDstReg)) {
KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm));
if (SeenFirstUse)
FirstUse = PredI;
@@ -355,7 +338,8 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
}
// If we have X = COPY Y, and X is known to be zero, then now Y is
// known to be zero.
- if (CopyDstReg == KnownReg.Reg && !OptBBClobberedRegs[CopySrcReg]) {
+ if (CopyDstReg == KnownReg.Reg &&
+ OptBBClobberedRegs.available(CopySrcReg)) {
KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm));
if (SeenFirstUse)
FirstUse = PredI;
@@ -368,10 +352,11 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
if (PredI == PredMBB->begin())
break;
- trackRegDefs(*PredI, OptBBClobberedRegs, TRI);
+ LiveRegUnits::accumulateUsedDefed(*PredI, OptBBClobberedRegs,
+ OptBBUsedRegs, TRI);
// Stop if all of the known-zero regs have been clobbered.
if (all_of(KnownRegs, [&](RegImm KnownReg) {
- return OptBBClobberedRegs[KnownReg.Reg];
+ return !OptBBClobberedRegs.available(KnownReg.Reg);
}))
break;
}
@@ -427,9 +412,9 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
}
if (IsCopy)
- DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
+ LLVM_DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
else
- DEBUG(dbgs() << "Remove redundant Move : " << *MI);
+ LLVM_DEBUG(dbgs() << "Remove redundant Move : " << *MI);
MI->eraseFromParent();
Changed = true;
@@ -473,8 +458,8 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
// Clear kills in the range where changes were made. This is conservative,
// but should be okay since kill markers are being phased out.
- DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
- << "\tLastChange: " << *LastChange);
+ LLVM_DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
+ << "\tLastChange: " << *LastChange);
for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end()))
MMI.clearKillInfo();
for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
@@ -490,10 +475,12 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
- // Resize the clobber register bitfield trackers. We do this once per
+ // Resize the clobbered and used register unit trackers. We do this once per
// function.
- DomBBClobberedRegs.resize(TRI->getNumRegs());
- OptBBClobberedRegs.resize(TRI->getNumRegs());
+ DomBBClobberedRegs.init(*TRI);
+ DomBBUsedRegs.init(*TRI);
+ OptBBClobberedRegs.init(*TRI);
+ OptBBUsedRegs.init(*TRI);
bool Changed = false;
for (MachineBasicBlock &MBB : MF)
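
The hunks above replace per-register BitVector clobber tracking with LiveRegUnits, which works on register units (so sub-registers and aliases are covered automatically) and is queried with available() rather than indexed bit by bit. A condensed sketch of that pattern, using only the calls visible in the patch; the helper name scanForClobber is illustrative and not part of the patch.

#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

// Returns true if no unit of Reg is written while walking MBB from the top.
static bool scanForClobber(MachineBasicBlock &MBB, MCPhysReg Reg,
                           const TargetRegisterInfo *TRI) {
  LiveRegUnits Clobbered, Used;
  Clobbered.init(*TRI); // size both trackers to the register-unit universe
  Used.init(*TRI);
  for (MachineInstr &MI : MBB) {
    // Accumulate everything MI defines or uses, including regmask operands.
    LiveRegUnits::accumulateUsedDefed(MI, Clobbered, Used, TRI);
    if (!Clobbered.available(Reg)) // some unit of Reg has been written
      return false;
  }
  return true;
}
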
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 88dd297e0079..a7c2c1b8125b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -72,24 +72,41 @@ const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
return nullptr;
}
+const TargetRegisterClass *
+AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const {
+ // edge case for GPR/FPR register classes
+ if (RC == &AArch64::GPR32allRegClass && Idx == AArch64::hsub)
+ return &AArch64::FPR32RegClass;
+ else if (RC == &AArch64::GPR64allRegClass && Idx == AArch64::hsub)
+ return &AArch64::FPR64RegClass;
+
+ // Forward to TableGen's default version.
+ return AArch64GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
+ bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
if (CC == CallingConv::GHC)
// This is academic because all GHC calls are (supposed to be) tail calls
- return CSR_AArch64_NoRegs_RegMask;
+ return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
- return CSR_AArch64_AllRegs_RegMask;
+ return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask;
if (CC == CallingConv::CXX_FAST_TLS)
- return CSR_AArch64_CXX_TLS_Darwin_RegMask;
+ return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask
+ : CSR_AArch64_CXX_TLS_Darwin_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
- return CSR_AArch64_AAPCS_SwiftError_RegMask;
+ return SCS ? CSR_AArch64_AAPCS_SwiftError_SCS_RegMask
+ : CSR_AArch64_AAPCS_SwiftError_RegMask;
if (CC == CallingConv::PreserveMost)
- return CSR_AArch64_RT_MostRegs_RegMask;
+ return SCS ? CSR_AArch64_RT_MostRegs_SCS_RegMask
+ : CSR_AArch64_RT_MostRegs_RegMask;
else
- return CSR_AArch64_AAPCS_RegMask;
+ return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
}
const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
@@ -114,6 +131,10 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
return CSR_AArch64_AAPCS_ThisReturn_RegMask;
}
+const uint32_t *AArch64RegisterInfo::getWindowsStackProbePreservedMask() const {
+ return CSR_AArch64_StackProbe_Windows_RegMask;
+}
+
BitVector
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -129,6 +150,9 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
markSuperRegs(Reserved, AArch64::W18); // Platform register
+ if (MF.getSubtarget<AArch64Subtarget>().isX20Reserved())
+ markSuperRegs(Reserved, AArch64::W20); // Platform register
+
if (hasBasePointer(MF))
markSuperRegs(Reserved, AArch64::W19);
@@ -151,12 +175,15 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
case AArch64::X18:
case AArch64::W18:
return MF.getSubtarget<AArch64Subtarget>().isX18Reserved();
+ case AArch64::X19:
+ case AArch64::W19:
+ return hasBasePointer(MF);
+ case AArch64::X20:
+ case AArch64::W20:
+ return MF.getSubtarget<AArch64Subtarget>().isX20Reserved();
case AArch64::FP:
case AArch64::W29:
return TFI->hasFP(MF) || TT.isOSDarwin();
- case AArch64::W19:
- case AArch64::X19:
- return hasBasePointer(MF);
}
return false;
@@ -225,11 +252,13 @@ bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
bool
AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
- // to the stack pointer, so only put the emergency spill slot next to the
- // FP when there's no better way to access it (SP or base pointer).
- return MFI.hasVarSizedObjects() && !hasBasePointer(MF);
+ // This function indicates whether the emergency spillslot should be placed
+ // close to the beginning of the stackframe (closer to FP) or the end
+ // (closer to SP).
+ //
+ // The beginning works most reliably if we have a frame pointer.
+ const AArch64FrameLowering &TFI = *getFrameLowering(MF);
+ return TFI.hasFP(MF);
}
bool AArch64RegisterInfo::requiresFrameIndexScavenging(
@@ -422,6 +451,8 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
- (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
- MF.getSubtarget<AArch64Subtarget>()
.isX18Reserved() // X18 reserved as platform register
+ - MF.getSubtarget<AArch64Subtarget>()
+ .isX20Reserved() // X20 reserved as platform register
- hasBasePointer(MF); // X19
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
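
In getCallPreservedMask the patch threads one ShadowCallStack attribute check through every calling-convention case: when the function carries the attribute, the _SCS_ variant of the corresponding callee-saved regmask is returned instead. Condensed into a single illustrative helper (the function name is not from the patch; the CSR_* symbols are the generated tables from AArch64GenRegisterInfo.inc referenced in the hunks above):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Illustrative reduction of the default AAPCS case above: the attribute
// only switches which generated callee-saved regmask table is handed back.
static const uint32_t *pickAAPCSMask(const MachineFunction &MF) {
  bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
  return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
}
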
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 8ce893516fe2..57000d37090d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -46,6 +46,10 @@ public:
return 5;
}
+ const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const override;
+
// Calls involved in thread-local variable lookup save more registers than
// normal calls, so they need a different mask to represent this.
const uint32_t *getTLSCallPreservedMask() const;
@@ -61,6 +65,9 @@ public:
const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID) const;
+ /// Stack probing calls preserve different CSRs to the normal CC.
+ const uint32_t *getWindowsStackProbePreservedMask() const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isConstantPhysReg(unsigned PhysReg) const override;
const TargetRegisterClass *
@@ -69,6 +76,8 @@ public:
const TargetRegisterClass *
getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
bool useFPForScavengingIndex(const MachineFunction &MF) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 9023c3dd8c25..7a653e117fd1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -131,6 +131,9 @@ def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
// Condition code register.
def NZCV : AArch64Reg<0, "nzcv">;
+// First fault status register
+def FFR : AArch64Reg<0, "ffr">, DwarfRegNum<[47]>;
+
// GPR register classes with the intersections of GPR32/GPR32sp and
// GPR64/GPR64sp for use by the coalescer.
def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
@@ -168,6 +171,7 @@ def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;
def GPR64spPlus0Operand : AsmOperandClass {
let Name = "GPR64sp0";
let RenderMethod = "addRegOperands";
+ let PredicateMethod = "isGPR64<AArch64::GPR64spRegClassID>";
let ParserMethod = "tryParseGPR64sp0Operand";
}
@@ -489,25 +493,25 @@ def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
-class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+class TypedVecListAsmOperand<int count, string vecty, int lanes, int eltsize>
: AsmOperandClass {
- let Name = "TypedVectorList" # count # "_" # lanes # kind;
+ let Name = "TypedVectorList" # count # "_" # lanes # eltsize;
let PredicateMethod
- = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
- let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
+ = "isTypedVectorList<RegKind::NeonVector, " # count # ", " # lanes # ", " # eltsize # ">";
+ let RenderMethod = "addVectorListOperands<" # vecty # ", " # count # ">";
}
-class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string eltsize>
: RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
- # kind # "'>">;
+ # eltsize # "'>">;
multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
// With implicit types (probably on instruction instead). E.g. { v0, v1 }
def _64AsmOperand : AsmOperandClass {
let Name = NAME # "64";
- let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
- let RenderMethod = "addVectorList64Operands<" # count # ">";
+ let PredicateMethod = "isImplicitlyTypedVectorList<RegKind::NeonVector, " # count # ">";
+ let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_DReg, " # count # ">";
}
def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
@@ -516,8 +520,8 @@ multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
def _128AsmOperand : AsmOperandClass {
let Name = NAME # "128";
- let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
- let RenderMethod = "addVectorList128Operands<" # count # ">";
+ let PredicateMethod = "isImplicitlyTypedVectorList<RegKind::NeonVector, " # count # ">";
+ let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_QReg, " # count # ">";
}
def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
@@ -527,25 +531,25 @@ multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
// 64-bit register lists with explicit type.
// { v0.8b, v1.8b }
- def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+ def _8bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 8, 8>;
def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
}
// { v0.4h, v1.4h }
- def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+ def _4hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 4, 16>;
def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
}
// { v0.2s, v1.2s }
- def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+ def _2sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 2, 32>;
def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
}
// { v0.1d, v1.1d }
- def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+ def _1dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 1, 64>;
def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
}
@@ -553,49 +557,49 @@ multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
// 128-bit register lists with explicit type
// { v0.16b, v1.16b }
- def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+ def _16bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 16, 8>;
def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
}
// { v0.8h, v1.8h }
- def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+ def _8hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 8, 16>;
def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
}
// { v0.4s, v1.4s }
- def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+ def _4sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 4, 32>;
def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
}
// { v0.2d, v1.2d }
- def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+ def _2dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 2, 64>;
def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
}
// { v0.b, v1.b }
- def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+ def _bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 8>;
def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
}
// { v0.h, v1.h }
- def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+ def _hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 16>;
def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
}
// { v0.s, v1.s }
- def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+ def _sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 32>;
def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
}
// { v0.d, v1.d }
- def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+ def _dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 64>;
def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
}
@@ -608,13 +612,32 @@ defm VecListTwo : VectorList<2, DD, QQ>;
defm VecListThree : VectorList<3, DDD, QQQ>;
defm VecListFour : VectorList<4, DDDD, QQQQ>;
+class FPRAsmOperand<string RC> : AsmOperandClass {
+ let Name = "FPRAsmOperand" # RC;
+ let PredicateMethod = "isGPR64<AArch64::" # RC # "RegClassID>";
+ let RenderMethod = "addRegOperands";
+}
// Register operand versions of the scalar FP registers.
-def FPR16Op : RegisterOperand<FPR16, "printOperand">;
-def FPR32Op : RegisterOperand<FPR32, "printOperand">;
-def FPR64Op : RegisterOperand<FPR64, "printOperand">;
-def FPR128Op : RegisterOperand<FPR128, "printOperand">;
+def FPR8Op : RegisterOperand<FPR8, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR8">;
+}
+
+def FPR16Op : RegisterOperand<FPR16, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR16">;
+}
+def FPR32Op : RegisterOperand<FPR32, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR32">;
+}
+
+def FPR64Op : RegisterOperand<FPR64, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR64">;
+}
+
+def FPR128Op : RegisterOperand<FPR128, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR128">;
+}
//===----------------------------------------------------------------------===//
// ARMv8.1a atomic CASP register operands
@@ -769,14 +792,14 @@ def PPR_3b : PPRClass<7>; // Restricted 3 bit SVE predicate register class.
class PPRAsmOperand <string name, string RegClass, int Width>: AsmOperandClass {
let Name = "SVE" # name # "Reg";
- let PredicateMethod = "isSVEVectorRegOfWidth<"
+ let PredicateMethod = "isSVEPredicateVectorRegOfWidth<"
# Width # ", " # "AArch64::" # RegClass # "RegClassID>";
let DiagnosticType = "InvalidSVE" # name # "Reg";
let RenderMethod = "addRegOperands";
let ParserMethod = "tryParseSVEPredicateVector";
}
-def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", -1>;
+def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", 0>;
def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>;
def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>;
def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>;
@@ -788,7 +811,7 @@ def PPR16 : PPRRegOp<"h", PPRAsmOp16, PPR>;
def PPR32 : PPRRegOp<"s", PPRAsmOp32, PPR>;
def PPR64 : PPRRegOp<"d", PPRAsmOp64, PPR>;
-def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", -1>;
+def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>;
def PPRAsmOp3b8 : PPRAsmOperand<"Predicate3bB", "PPR_3b", 8>;
def PPRAsmOp3b16 : PPRAsmOperand<"Predicate3bH", "PPR_3b", 16>;
def PPRAsmOp3b32 : PPRAsmOperand<"Predicate3bS", "PPR_3b", 32>;
@@ -812,16 +835,39 @@ def ZPR : RegisterClass<"AArch64",
let Size = 128;
}
-class ZPRAsmOperand <string name, int Width>: AsmOperandClass {
+// SVE restricted 4 bit scalable vector register class
+def ZPR_4b : RegisterClass<"AArch64",
+ [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
+ nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32,
+ nxv1f64, nxv2f64],
+ 128, (sequence "Z%u", 0, 15)> {
+ let Size = 128;
+}
+
+// SVE restricted 3 bit scalable vector register class
+def ZPR_3b : RegisterClass<"AArch64",
+ [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
+ nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32,
+ nxv1f64, nxv2f64],
+ 128, (sequence "Z%u", 0, 7)> {
+ let Size = 128;
+}
+
+class ZPRAsmOperand<string name, int Width, string RegClassSuffix = "">
+ : AsmOperandClass {
let Name = "SVE" # name # "Reg";
- let PredicateMethod = "isSVEVectorRegOfWidth<"
- # Width # ", AArch64::ZPRRegClassID>";
+ let PredicateMethod = "isSVEDataVectorRegOfWidth<"
+ # Width # ", AArch64::ZPR"
+ # RegClassSuffix # "RegClassID>";
let RenderMethod = "addRegOperands";
- let ParserMethod = "tryParseSVEDataVector<"
- # !if(!eq(Width, -1), "false", "true") # ">";
+ let DiagnosticType = "InvalidZPR" # RegClassSuffix # Width;
+ let ParserMethod = "tryParseSVEDataVector<false, "
+ # !if(!eq(Width, 0), "false", "true") # ">";
}
-def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", -1>;
+def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", 0>;
def ZPRAsmOp8 : ZPRAsmOperand<"VectorB", 8>;
def ZPRAsmOp16 : ZPRAsmOperand<"VectorH", 16>;
def ZPRAsmOp32 : ZPRAsmOperand<"VectorS", 32>;
@@ -834,3 +880,217 @@ def ZPR16 : ZPRRegOp<"h", ZPRAsmOp16, ZPR>;
def ZPR32 : ZPRRegOp<"s", ZPRAsmOp32, ZPR>;
def ZPR64 : ZPRRegOp<"d", ZPRAsmOp64, ZPR>;
def ZPR128 : ZPRRegOp<"q", ZPRAsmOp128, ZPR>;
+
+def ZPRAsmOp3b8 : ZPRAsmOperand<"Vector3bB", 8, "_3b">;
+def ZPRAsmOp3b16 : ZPRAsmOperand<"Vector3bH", 16, "_3b">;
+def ZPRAsmOp3b32 : ZPRAsmOperand<"Vector3bS", 32, "_3b">;
+
+def ZPR3b8 : ZPRRegOp<"b", ZPRAsmOp3b8, ZPR_3b>;
+def ZPR3b16 : ZPRRegOp<"h", ZPRAsmOp3b16, ZPR_3b>;
+def ZPR3b32 : ZPRRegOp<"s", ZPRAsmOp3b32, ZPR_3b>;
+
+def ZPRAsmOp4b16 : ZPRAsmOperand<"Vector4bH", 16, "_4b">;
+def ZPRAsmOp4b32 : ZPRAsmOperand<"Vector4bS", 32, "_4b">;
+def ZPRAsmOp4b64 : ZPRAsmOperand<"Vector4bD", 64, "_4b">;
+
+def ZPR4b16 : ZPRRegOp<"h", ZPRAsmOp4b16, ZPR_4b>;
+def ZPR4b32 : ZPRRegOp<"s", ZPRAsmOp4b32, ZPR_4b>;
+def ZPR4b64 : ZPRRegOp<"d", ZPRAsmOp4b64, ZPR_4b>;
+
+class FPRasZPR<int Width> : AsmOperandClass{
+ let Name = "FPR" # Width # "asZPR";
+ let PredicateMethod = "isFPRasZPR<AArch64::FPR" # Width # "RegClassID>";
+ let RenderMethod = "addFPRasZPRRegOperands<" # Width # ">";
+}
+
+class FPRasZPROperand<int Width> : RegisterOperand<ZPR> {
+ let ParserMatchClass = FPRasZPR<Width>;
+ let PrintMethod = "printZPRasFPR<" # Width # ">";
+}
+
+def FPR8asZPR : FPRasZPROperand<8>;
+def FPR16asZPR : FPRasZPROperand<16>;
+def FPR32asZPR : FPRasZPROperand<32>;
+def FPR64asZPR : FPRasZPROperand<64>;
+def FPR128asZPR : FPRasZPROperand<128>;
+
+let Namespace = "AArch64" in {
+ def zsub0 : SubRegIndex<128, -1>;
+ def zsub1 : SubRegIndex<128, -1>;
+ def zsub2 : SubRegIndex<128, -1>;
+ def zsub3 : SubRegIndex<128, -1>;
+}
+
+// Pairs, triples, and quads of SVE vector registers.
+def ZSeqPairs : RegisterTuples<[zsub0, zsub1], [(rotl ZPR, 0), (rotl ZPR, 1)]>;
+def ZSeqTriples : RegisterTuples<[zsub0, zsub1, zsub2], [(rotl ZPR, 0), (rotl ZPR, 1), (rotl ZPR, 2)]>;
+def ZSeqQuads : RegisterTuples<[zsub0, zsub1, zsub2, zsub3], [(rotl ZPR, 0), (rotl ZPR, 1), (rotl ZPR, 2), (rotl ZPR, 3)]>;
+
+def ZPR2 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqPairs)> {
+ let Size = 256;
+}
+def ZPR3 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqTriples)> {
+ let Size = 384;
+}
+def ZPR4 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqQuads)> {
+ let Size = 512;
+}
+
+class ZPRVectorList<int ElementWidth, int NumRegs> : AsmOperandClass {
+ let Name = "SVEVectorList" # NumRegs # ElementWidth;
+ let ParserMethod = "tryParseVectorList<RegKind::SVEDataVector>";
+ let PredicateMethod =
+ "isTypedVectorList<RegKind::SVEDataVector, " #NumRegs #", 0, " #ElementWidth #">";
+ let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_ZReg, " # NumRegs # ">";
+}
+
+def Z_b : RegisterOperand<ZPR, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 1>;
+}
+
+def Z_h : RegisterOperand<ZPR, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 1>;
+}
+
+def Z_s : RegisterOperand<ZPR, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 1>;
+}
+
+def Z_d : RegisterOperand<ZPR, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 1>;
+}
+
+def ZZ_b : RegisterOperand<ZPR2, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 2>;
+}
+
+def ZZ_h : RegisterOperand<ZPR2, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 2>;
+}
+
+def ZZ_s : RegisterOperand<ZPR2, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 2>;
+}
+
+def ZZ_d : RegisterOperand<ZPR2, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 2>;
+}
+
+def ZZZ_b : RegisterOperand<ZPR3, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 3>;
+}
+
+def ZZZ_h : RegisterOperand<ZPR3, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 3>;
+}
+
+def ZZZ_s : RegisterOperand<ZPR3, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 3>;
+}
+
+def ZZZ_d : RegisterOperand<ZPR3, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 3>;
+}
+
+def ZZZZ_b : RegisterOperand<ZPR4, "printTypedVectorList<0,'b'>"> {
+ let ParserMatchClass = ZPRVectorList<8, 4>;
+}
+
+def ZZZZ_h : RegisterOperand<ZPR4, "printTypedVectorList<0,'h'>"> {
+ let ParserMatchClass = ZPRVectorList<16, 4>;
+}
+
+def ZZZZ_s : RegisterOperand<ZPR4, "printTypedVectorList<0,'s'>"> {
+ let ParserMatchClass = ZPRVectorList<32, 4>;
+}
+
+def ZZZZ_d : RegisterOperand<ZPR4, "printTypedVectorList<0,'d'>"> {
+ let ParserMatchClass = ZPRVectorList<64, 4>;
+}
+
+class ZPRExtendAsmOperand<string ShiftExtend, int RegWidth, int Scale,
+ bit ScaleAlwaysSame = 0b0> : AsmOperandClass {
+ let Name = "ZPRExtend" # ShiftExtend # RegWidth # Scale
+ # !if(ScaleAlwaysSame, "Only", "");
+
+ let PredicateMethod = "isSVEDataVectorRegWithShiftExtend<"
+ # RegWidth # ", AArch64::ZPRRegClassID, "
+ # "AArch64_AM::" # ShiftExtend # ", "
+ # Scale # ", "
+ # !if(ScaleAlwaysSame, "true", "false")
+ # ">";
+ let DiagnosticType = "InvalidZPR" # RegWidth # ShiftExtend # Scale;
+ let RenderMethod = "addRegOperands";
+ let ParserMethod = "tryParseSVEDataVector<true, true>";
+}
+
+class ZPRExtendRegisterOperand<bit SignExtend, bit IsLSL, string Repr,
+ int RegWidth, int Scale, string Suffix = "">
+ : RegisterOperand<ZPR> {
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ZPR" # RegWidth # "AsmOpndExt" # Repr # Scale # Suffix);
+ let PrintMethod = "printRegWithShiftExtend<"
+ # !if(SignExtend, "true", "false") # ", "
+ # Scale # ", "
+ # !if(IsLSL, "'x'", "'w'") # ", "
+ # !if(!eq(RegWidth, 32), "'s'", "'d'") # ">";
+}
+
+foreach RegWidth = [32, 64] in {
+ // UXTW(8|16|32|64)
+ def ZPR#RegWidth#AsmOpndExtUXTW8Only : ZPRExtendAsmOperand<"UXTW", RegWidth, 8, 0b1>;
+ def ZPR#RegWidth#AsmOpndExtUXTW8 : ZPRExtendAsmOperand<"UXTW", RegWidth, 8>;
+ def ZPR#RegWidth#AsmOpndExtUXTW16 : ZPRExtendAsmOperand<"UXTW", RegWidth, 16>;
+ def ZPR#RegWidth#AsmOpndExtUXTW32 : ZPRExtendAsmOperand<"UXTW", RegWidth, 32>;
+ def ZPR#RegWidth#AsmOpndExtUXTW64 : ZPRExtendAsmOperand<"UXTW", RegWidth, 64>;
+
+ def ZPR#RegWidth#ExtUXTW8Only : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 8, "Only">;
+ def ZPR#RegWidth#ExtUXTW8 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 8>;
+ def ZPR#RegWidth#ExtUXTW16 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 16>;
+ def ZPR#RegWidth#ExtUXTW32 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 32>;
+ def ZPR#RegWidth#ExtUXTW64 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 64>;
+
+ // SXTW(8|16|32|64)
+ def ZPR#RegWidth#AsmOpndExtSXTW8Only : ZPRExtendAsmOperand<"SXTW", RegWidth, 8, 0b1>;
+ def ZPR#RegWidth#AsmOpndExtSXTW8 : ZPRExtendAsmOperand<"SXTW", RegWidth, 8>;
+ def ZPR#RegWidth#AsmOpndExtSXTW16 : ZPRExtendAsmOperand<"SXTW", RegWidth, 16>;
+ def ZPR#RegWidth#AsmOpndExtSXTW32 : ZPRExtendAsmOperand<"SXTW", RegWidth, 32>;
+ def ZPR#RegWidth#AsmOpndExtSXTW64 : ZPRExtendAsmOperand<"SXTW", RegWidth, 64>;
+
+ def ZPR#RegWidth#ExtSXTW8Only : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 8, "Only">;
+ def ZPR#RegWidth#ExtSXTW8 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 8>;
+ def ZPR#RegWidth#ExtSXTW16 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 16>;
+ def ZPR#RegWidth#ExtSXTW32 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 32>;
+ def ZPR#RegWidth#ExtSXTW64 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 64>;
+
+ // LSL(8|16|32|64)
+ def ZPR#RegWidth#AsmOpndExtLSL8 : ZPRExtendAsmOperand<"LSL", RegWidth, 8>;
+ def ZPR#RegWidth#AsmOpndExtLSL16 : ZPRExtendAsmOperand<"LSL", RegWidth, 16>;
+ def ZPR#RegWidth#AsmOpndExtLSL32 : ZPRExtendAsmOperand<"LSL", RegWidth, 32>;
+ def ZPR#RegWidth#AsmOpndExtLSL64 : ZPRExtendAsmOperand<"LSL", RegWidth, 64>;
+ def ZPR#RegWidth#ExtLSL8 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 8>;
+ def ZPR#RegWidth#ExtLSL16 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 16>;
+ def ZPR#RegWidth#ExtLSL32 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 32>;
+ def ZPR#RegWidth#ExtLSL64 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 64>;
+}
+
+class GPR64ShiftExtendAsmOperand <string AsmOperandName, int Scale, string RegClass> : AsmOperandClass {
+ let Name = AsmOperandName # Scale;
+ let PredicateMethod = "isGPR64WithShiftExtend<AArch64::"#RegClass#"RegClassID, " # Scale # ">";
+ let DiagnosticType = "Invalid" # AsmOperandName # Scale;
+ let RenderMethod = "addRegOperands";
+ let ParserMethod = "tryParseGPROperand<true>";
+}
+
+class GPR64ExtendRegisterOperand<string Name, int Scale, RegisterClass RegClass> : RegisterOperand<RegClass>{
+ let ParserMatchClass = !cast<AsmOperandClass>(Name);
+ let PrintMethod = "printRegWithShiftExtend<false, " # Scale # ", 'x', 0>";
+}
+
+foreach Scale = [8, 16, 32, 64] in {
+ def GPR64shiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64shifted", Scale, "GPR64">;
+ def GPR64shifted # Scale : GPR64ExtendRegisterOperand<"GPR64shiftedAsmOpnd" # Scale, Scale, GPR64>;
+
+ def GPR64NoXZRshiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64NoXZRshifted", Scale, "GPR64common">;
+ def GPR64NoXZRshifted # Scale : GPR64ExtendRegisterOperand<"GPR64NoXZRshiftedAsmOpnd" # Scale, Scale, GPR64common>;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index e1851875abc5..af555f6d2266 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -700,7 +700,7 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
if (!AAII)
return false;
- SchedModel.init(ST.getSchedModel(), &ST, AAII);
+ SchedModel.init(&ST);
if (!SchedModel.hasInstrSchedModel())
return false;
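
The one-line change above tracks an upstream TargetSchedModel::init signature change: the model is now initialized from the subtarget alone, which supplies the MCSchedModel and instruction info itself, replacing the three-argument form used previously. A minimal sketch of the new pattern; the wrapper name is illustrative.

#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
using namespace llvm;

// Illustrative check mirroring the pass's early exit above.
static bool hasPerInstrSchedModel(const TargetSubtargetInfo &ST) {
  TargetSchedModel SchedModel;
  SchedModel.init(&ST); // the subtarget provides the MCSchedModel and TII
  return SchedModel.hasInstrSchedModel();
}
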
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bcd7b60875a2..16e6ddda6398 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -12,12 +12,975 @@
//===----------------------------------------------------------------------===//
let Predicates = [HasSVE] in {
+
+ def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
+ def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
+ def RDFFR_P : sve_int_rdffr_unpred<"rdffr">;
+ def SETFFR : sve_int_setffr<"setffr">;
+ def WRFFR : sve_int_wrffr<"wrffr">;
+
defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">;
defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">;
+ defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd">;
+ defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd">;
+ defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">;
+ defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">;
+
+ def AND_ZZZ : sve_int_bin_cons_log<0b00, "and">;
+ def ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">;
+ def EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">;
+ def BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">;
+
+ defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">;
+ defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">;
+ defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr">;
+
+ defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr">;
+ defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor">;
+ defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and">;
+ defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic">;
+
+ defm ADD_ZI : sve_int_arith_imm0<0b000, "add">;
+ defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">;
+ defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr">;
+ defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd">;
+ defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd">;
+ defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub">;
+ defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub">;
+
+ defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad">;
+ defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb">;
+ defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla">;
+ defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls">;
+
+ // SVE predicated integer reductions.
+ defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">;
+ defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">;
+ defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">;
+ defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv">;
+ defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv">;
+ defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv">;
+ defm ORV_VPZ : sve_int_reduce_2<0b000, "orv">;
+ defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv">;
+ defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv">;
+
+ defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn">;
+ defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon">;
+ defm AND_ZI : sve_int_log_imm<0b10, "and", "bic">;
+
+ defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", simm8>;
+ defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", simm8>;
+ defm UMAX_ZI : sve_int_arith_imm1<0b01, "umax", imm0_255>;
+ defm UMIN_ZI : sve_int_arith_imm1<0b11, "umin", imm0_255>;
+
+ defm MUL_ZI : sve_int_arith_imm2<"mul">;
+ defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul">;
+ defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh">;
+ defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh">;
+
+ defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv">;
+ defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv">;
+ defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
+ defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">;
+
+ defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">;
+ defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">;
+
+ defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">;
+ defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">;
+
+ defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
+ defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
+ defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth">;
+ defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
+ defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
+ defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
+ defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">;
+ defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">;
+
+ defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">;
+ defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">;
+ defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">;
+ defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">;
+ defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">;
+ defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
+ defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;
+
+ defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax">;
+ defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax">;
+ defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin">;
+ defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin">;
+ defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd">;
+ defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd">;
+
+ defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe">;
+ defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte">;
+
+ defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
+ defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
+ defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
+ defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
+ defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
+ defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
+ defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
+ defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
+
+ defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd">;
+ defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub">;
+ defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul">;
+ defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr">;
+ defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">;
+ defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">;
+ defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax">;
+ defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin">;
+ defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd">;
+ defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">;
+ defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx">;
+ defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">;
+ defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">;
+
+ defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">;
+ defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">;
+ defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">;
+ defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">;
+ defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">;
+ defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;
+
+ defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
+
+ defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd">;
+ defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla">;
+
+ defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla">;
+ defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls">;
+ defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla">;
+ defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls">;
+
+ defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad">;
+ defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb">;
+ defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad">;
+ defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb">;
+
+ defm FTMAD_ZZI : sve_fp_ftmad<"ftmad">;
+
+ defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla">;
+ defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls">;
+
+ defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla">;
+ defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul">;
+
+ // SVE floating point reductions.
+ defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda">;
+ defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv">;
+ defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv">;
+ defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv">;
+ defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv">;
+ defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv">;
+
+ // Splat immediate (unpredicated)
+ defm DUP_ZI : sve_int_dup_imm<"dup">;
+ defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
+ defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;
+
+ // Splat immediate (predicated)
+ defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
+ defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
+ defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
+
+ // Splat scalar register (unpredicated, GPR or vector + element index)
+ defm DUP_ZR : sve_int_perm_dup_r<"dup">;
+ defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
+
+ // Splat scalar register (predicated)
+ defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
+ defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;
+
+ // Select elements from either vector (predicated)
+ defm SEL_ZPZZ : sve_int_sel_vvv<"sel">;
+
+ defm SPLICE_ZPZ : sve_int_perm_splice<"splice">;
+ defm COMPACT_ZPZ : sve_int_perm_compact<"compact">;
+ defm INSR_ZR : sve_int_perm_insrs<"insr">;
+ defm INSR_ZV : sve_int_perm_insrv<"insr">;
+ def EXT_ZZI : sve_int_perm_extract_i<"ext">;
+
+ defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">;
+ defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">;
+ defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">;
+ defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">;
+
+ defm REV_PP : sve_int_perm_reverse_p<"rev">;
+ defm REV_ZZ : sve_int_perm_reverse_z<"rev">;
+
+ defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">;
+ defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">;
+ defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">;
+ defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">;
+
+ def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">;
+ def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">;
+
+ def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>;
+ def FEXPA_ZZ_S : sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>;
+ def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>;
+
+ def AND_PPzPP : sve_int_pred_log<0b0000, "and">;
+ def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">;
+ def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">;
+ def SEL_PPPP : sve_int_pred_log<0b0011, "sel">;
+ def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">;
+ def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">;
+ def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">;
+ def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">;
+ def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">;
+ def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">;
+ def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">;
+ def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">;
+ def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">;
+ def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">;
+ def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">;
+
+ defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">;
+ defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">;
+ defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta">;
+ defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb">;
+ defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta">;
+ defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb">;
+
+ defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta">;
+ defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb">;
+ defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta">;
+ defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb">;
+
+ // continuous load with reg+immediate
+ defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
+ defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
+ defm LD1B_S_IMM : sve_mem_cld_si<0b0010, "ld1b", Z_s, ZPR32>;
+ defm LD1B_D_IMM : sve_mem_cld_si<0b0011, "ld1b", Z_d, ZPR64>;
+ defm LD1SW_D_IMM : sve_mem_cld_si<0b0100, "ld1sw", Z_d, ZPR64>;
+ defm LD1H_IMM : sve_mem_cld_si<0b0101, "ld1h", Z_h, ZPR16>;
+ defm LD1H_S_IMM : sve_mem_cld_si<0b0110, "ld1h", Z_s, ZPR32>;
+ defm LD1H_D_IMM : sve_mem_cld_si<0b0111, "ld1h", Z_d, ZPR64>;
+ defm LD1SH_D_IMM : sve_mem_cld_si<0b1000, "ld1sh", Z_d, ZPR64>;
+ defm LD1SH_S_IMM : sve_mem_cld_si<0b1001, "ld1sh", Z_s, ZPR32>;
+ defm LD1W_IMM : sve_mem_cld_si<0b1010, "ld1w", Z_s, ZPR32>;
+ defm LD1W_D_IMM : sve_mem_cld_si<0b1011, "ld1w", Z_d, ZPR64>;
+ defm LD1SB_D_IMM : sve_mem_cld_si<0b1100, "ld1sb", Z_d, ZPR64>;
+ defm LD1SB_S_IMM : sve_mem_cld_si<0b1101, "ld1sb", Z_s, ZPR32>;
+ defm LD1SB_H_IMM : sve_mem_cld_si<0b1110, "ld1sb", Z_h, ZPR16>;
+ defm LD1D_IMM : sve_mem_cld_si<0b1111, "ld1d", Z_d, ZPR64>;
+
+ // LD1R loads (splat scalar to vector)
+ defm LD1RB_IMM : sve_mem_ld_dup<0b00, 0b00, "ld1rb", Z_b, ZPR8, uimm6s1>;
+ defm LD1RB_H_IMM : sve_mem_ld_dup<0b00, 0b01, "ld1rb", Z_h, ZPR16, uimm6s1>;
+ defm LD1RB_S_IMM : sve_mem_ld_dup<0b00, 0b10, "ld1rb", Z_s, ZPR32, uimm6s1>;
+ defm LD1RB_D_IMM : sve_mem_ld_dup<0b00, 0b11, "ld1rb", Z_d, ZPR64, uimm6s1>;
+ defm LD1RSW_IMM : sve_mem_ld_dup<0b01, 0b00, "ld1rsw", Z_d, ZPR64, uimm6s4>;
+ defm LD1RH_IMM : sve_mem_ld_dup<0b01, 0b01, "ld1rh", Z_h, ZPR16, uimm6s2>;
+ defm LD1RH_S_IMM : sve_mem_ld_dup<0b01, 0b10, "ld1rh", Z_s, ZPR32, uimm6s2>;
+ defm LD1RH_D_IMM : sve_mem_ld_dup<0b01, 0b11, "ld1rh", Z_d, ZPR64, uimm6s2>;
+ defm LD1RSH_D_IMM : sve_mem_ld_dup<0b10, 0b00, "ld1rsh", Z_d, ZPR64, uimm6s2>;
+ defm LD1RSH_S_IMM : sve_mem_ld_dup<0b10, 0b01, "ld1rsh", Z_s, ZPR32, uimm6s2>;
+ defm LD1RW_IMM : sve_mem_ld_dup<0b10, 0b10, "ld1rw", Z_s, ZPR32, uimm6s4>;
+ defm LD1RW_D_IMM : sve_mem_ld_dup<0b10, 0b11, "ld1rw", Z_d, ZPR64, uimm6s4>;
+ defm LD1RSB_D_IMM : sve_mem_ld_dup<0b11, 0b00, "ld1rsb", Z_d, ZPR64, uimm6s1>;
+ defm LD1RSB_S_IMM : sve_mem_ld_dup<0b11, 0b01, "ld1rsb", Z_s, ZPR32, uimm6s1>;
+ defm LD1RSB_H_IMM : sve_mem_ld_dup<0b11, 0b10, "ld1rsb", Z_h, ZPR16, uimm6s1>;
+ defm LD1RD_IMM : sve_mem_ld_dup<0b11, 0b11, "ld1rd", Z_d, ZPR64, uimm6s8>;
+
+ // LD1RQ loads (load quadword-vector and splat to scalable vector)
+ defm LD1RQ_B_IMM : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
+ defm LD1RQ_H_IMM : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
+ defm LD1RQ_W_IMM : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
+ defm LD1RQ_D_IMM : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
+ defm LD1RQ_B : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm LD1RQ_H : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // continuous load with reg+reg addressing.
+ defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
+ defm LD1B_S : sve_mem_cld_ss<0b0010, "ld1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
+ defm LD1B_D : sve_mem_cld_ss<0b0011, "ld1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
+ defm LD1SW_D : sve_mem_cld_ss<0b0100, "ld1sw", Z_d, ZPR64, GPR64NoXZRshifted32>;
+ defm LD1H : sve_mem_cld_ss<0b0101, "ld1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm LD1H_S : sve_mem_cld_ss<0b0110, "ld1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
+ defm LD1H_D : sve_mem_cld_ss<0b0111, "ld1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
+ defm LD1SH_D : sve_mem_cld_ss<0b1000, "ld1sh", Z_d, ZPR64, GPR64NoXZRshifted16>;
+ defm LD1SH_S : sve_mem_cld_ss<0b1001, "ld1sh", Z_s, ZPR32, GPR64NoXZRshifted16>;
+ defm LD1W : sve_mem_cld_ss<0b1010, "ld1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm LD1W_D : sve_mem_cld_ss<0b1011, "ld1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
+ defm LD1SB_D : sve_mem_cld_ss<0b1100, "ld1sb", Z_d, ZPR64, GPR64NoXZRshifted8>;
+ defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
+ defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
+ defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // non-faulting continuous load with reg+immediate
+ defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
+ defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
+ defm LDNF1B_S_IMM : sve_mem_cldnf_si<0b0010, "ldnf1b", Z_s, ZPR32>;
+ defm LDNF1B_D_IMM : sve_mem_cldnf_si<0b0011, "ldnf1b", Z_d, ZPR64>;
+ defm LDNF1SW_D_IMM : sve_mem_cldnf_si<0b0100, "ldnf1sw", Z_d, ZPR64>;
+ defm LDNF1H_IMM : sve_mem_cldnf_si<0b0101, "ldnf1h", Z_h, ZPR16>;
+ defm LDNF1H_S_IMM : sve_mem_cldnf_si<0b0110, "ldnf1h", Z_s, ZPR32>;
+ defm LDNF1H_D_IMM : sve_mem_cldnf_si<0b0111, "ldnf1h", Z_d, ZPR64>;
+ defm LDNF1SH_D_IMM : sve_mem_cldnf_si<0b1000, "ldnf1sh", Z_d, ZPR64>;
+ defm LDNF1SH_S_IMM : sve_mem_cldnf_si<0b1001, "ldnf1sh", Z_s, ZPR32>;
+ defm LDNF1W_IMM : sve_mem_cldnf_si<0b1010, "ldnf1w", Z_s, ZPR32>;
+ defm LDNF1W_D_IMM : sve_mem_cldnf_si<0b1011, "ldnf1w", Z_d, ZPR64>;
+ defm LDNF1SB_D_IMM : sve_mem_cldnf_si<0b1100, "ldnf1sb", Z_d, ZPR64>;
+ defm LDNF1SB_S_IMM : sve_mem_cldnf_si<0b1101, "ldnf1sb", Z_s, ZPR32>;
+ defm LDNF1SB_H_IMM : sve_mem_cldnf_si<0b1110, "ldnf1sb", Z_h, ZPR16>;
+ defm LDNF1D_IMM : sve_mem_cldnf_si<0b1111, "ldnf1d", Z_d, ZPR64>;
+
+ // First-faulting loads with reg+reg addressing.
+ defm LDFF1B : sve_mem_cldff_ss<0b0000, "ldff1b", Z_b, ZPR8, GPR64shifted8>;
+ defm LDFF1B_H : sve_mem_cldff_ss<0b0001, "ldff1b", Z_h, ZPR16, GPR64shifted8>;
+ defm LDFF1B_S : sve_mem_cldff_ss<0b0010, "ldff1b", Z_s, ZPR32, GPR64shifted8>;
+ defm LDFF1B_D : sve_mem_cldff_ss<0b0011, "ldff1b", Z_d, ZPR64, GPR64shifted8>;
+ defm LDFF1SW_D : sve_mem_cldff_ss<0b0100, "ldff1sw", Z_d, ZPR64, GPR64shifted32>;
+ defm LDFF1H : sve_mem_cldff_ss<0b0101, "ldff1h", Z_h, ZPR16, GPR64shifted16>;
+ defm LDFF1H_S : sve_mem_cldff_ss<0b0110, "ldff1h", Z_s, ZPR32, GPR64shifted16>;
+ defm LDFF1H_D : sve_mem_cldff_ss<0b0111, "ldff1h", Z_d, ZPR64, GPR64shifted16>;
+ defm LDFF1SH_D : sve_mem_cldff_ss<0b1000, "ldff1sh", Z_d, ZPR64, GPR64shifted16>;
+ defm LDFF1SH_S : sve_mem_cldff_ss<0b1001, "ldff1sh", Z_s, ZPR32, GPR64shifted16>;
+ defm LDFF1W : sve_mem_cldff_ss<0b1010, "ldff1w", Z_s, ZPR32, GPR64shifted32>;
+ defm LDFF1W_D : sve_mem_cldff_ss<0b1011, "ldff1w", Z_d, ZPR64, GPR64shifted32>;
+ defm LDFF1SB_D : sve_mem_cldff_ss<0b1100, "ldff1sb", Z_d, ZPR64, GPR64shifted8>;
+ defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
+ defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
+ defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;
+
+ // LD(2|3|4) structured loads with reg+immediate
+ defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
+ defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
+ defm LD4B_IMM : sve_mem_eld_si<0b00, 0b11, ZZZZ_b, "ld4b", simm4s4>;
+ defm LD2H_IMM : sve_mem_eld_si<0b01, 0b01, ZZ_h, "ld2h", simm4s2>;
+ defm LD3H_IMM : sve_mem_eld_si<0b01, 0b10, ZZZ_h, "ld3h", simm4s3>;
+ defm LD4H_IMM : sve_mem_eld_si<0b01, 0b11, ZZZZ_h, "ld4h", simm4s4>;
+ defm LD2W_IMM : sve_mem_eld_si<0b10, 0b01, ZZ_s, "ld2w", simm4s2>;
+ defm LD3W_IMM : sve_mem_eld_si<0b10, 0b10, ZZZ_s, "ld3w", simm4s3>;
+ defm LD4W_IMM : sve_mem_eld_si<0b10, 0b11, ZZZZ_s, "ld4w", simm4s4>;
+ defm LD2D_IMM : sve_mem_eld_si<0b11, 0b01, ZZ_d, "ld2d", simm4s2>;
+ defm LD3D_IMM : sve_mem_eld_si<0b11, 0b10, ZZZ_d, "ld3d", simm4s3>;
+ defm LD4D_IMM : sve_mem_eld_si<0b11, 0b11, ZZZZ_d, "ld4d", simm4s4>;
+
+ // LD(2|3|4) structured loads (register + register)
+ def LD2B : sve_mem_eld_ss<0b00, 0b01, ZZ_b, "ld2b", GPR64NoXZRshifted8>;
+ def LD3B : sve_mem_eld_ss<0b00, 0b10, ZZZ_b, "ld3b", GPR64NoXZRshifted8>;
+ def LD4B : sve_mem_eld_ss<0b00, 0b11, ZZZZ_b, "ld4b", GPR64NoXZRshifted8>;
+ def LD2H : sve_mem_eld_ss<0b01, 0b01, ZZ_h, "ld2h", GPR64NoXZRshifted16>;
+ def LD3H : sve_mem_eld_ss<0b01, 0b10, ZZZ_h, "ld3h", GPR64NoXZRshifted16>;
+ def LD4H : sve_mem_eld_ss<0b01, 0b11, ZZZZ_h, "ld4h", GPR64NoXZRshifted16>;
+ def LD2W : sve_mem_eld_ss<0b10, 0b01, ZZ_s, "ld2w", GPR64NoXZRshifted32>;
+ def LD3W : sve_mem_eld_ss<0b10, 0b10, ZZZ_s, "ld3w", GPR64NoXZRshifted32>;
+ def LD4W : sve_mem_eld_ss<0b10, 0b11, ZZZZ_s, "ld4w", GPR64NoXZRshifted32>;
+ def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
+ def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
+ def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;
+
+ // Gathers using unscaled 32-bit offsets, e.g.
+ // ld1h z0.s, p0/z, [x0, z0.s, uxtw]
+ defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+
+ // Gathers using scaled 32-bit offsets, e.g.
+ // ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
+ defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+ defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+
+ // Gathers using scaled 32-bit pointers with offset, e.g.
+ // ld1h z0.s, p0/z, [z0.s, #16]
+ defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>;
+ defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>;
+ defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>;
+ defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>;
+ defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>;
+ defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>;
+
+ // Gathers using scaled 64-bit pointers with offset, e.g.
+ // ld1h z0.d, p0/z, [z0.d, #16]
+ defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>;
+ defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>;
+ defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>;
+ defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>;
+ defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>;
+ defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>;
+ defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>;
+ defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>;
+
+ // Gathers using unscaled 64-bit offsets, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d]
+ defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">;
+ defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">;
+ defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">;
+ defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">;
+ defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">;
+ defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">;
+ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">;
+ defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">;
+
+ // Gathers using scaled 64-bit offsets, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
+ defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>;
+ defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>;
+
+  // Gathers using unscaled 32-bit offsets unpacked in 64-bit elements, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d, uxtw]
+ defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+
+  // Gathers using scaled 32-bit offsets unpacked in 64-bit elements, e.g.
+ // ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
+ defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+  defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+  defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+
+ // Non-temporal contiguous loads (register + immediate)
+ defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
+ defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
+ defm LDNT1W_ZRI : sve_mem_cldnt_si<0b10, "ldnt1w", Z_s, ZPR32>;
+ defm LDNT1D_ZRI : sve_mem_cldnt_si<0b11, "ldnt1d", Z_d, ZPR64>;
+
+ // Non-temporal contiguous loads (register + register)
+ defm LDNT1B_ZRR : sve_mem_cldnt_ss<0b00, "ldnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm LDNT1H_ZRR : sve_mem_cldnt_ss<0b01, "ldnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm LDNT1W_ZRR : sve_mem_cldnt_ss<0b10, "ldnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm LDNT1D_ZRR : sve_mem_cldnt_ss<0b11, "ldnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+  // Contiguous store with immediates
+ defm ST1B_IMM : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
+ defm ST1B_H_IMM : sve_mem_cst_si<0b00, 0b01, "st1b", Z_h, ZPR16>;
+ defm ST1B_S_IMM : sve_mem_cst_si<0b00, 0b10, "st1b", Z_s, ZPR32>;
+ defm ST1B_D_IMM : sve_mem_cst_si<0b00, 0b11, "st1b", Z_d, ZPR64>;
+ defm ST1H_IMM : sve_mem_cst_si<0b01, 0b01, "st1h", Z_h, ZPR16>;
+ defm ST1H_S_IMM : sve_mem_cst_si<0b01, 0b10, "st1h", Z_s, ZPR32>;
+ defm ST1H_D_IMM : sve_mem_cst_si<0b01, 0b11, "st1h", Z_d, ZPR64>;
+ defm ST1W_IMM : sve_mem_cst_si<0b10, 0b10, "st1w", Z_s, ZPR32>;
+ defm ST1W_D_IMM : sve_mem_cst_si<0b10, 0b11, "st1w", Z_d, ZPR64>;
+ defm ST1D_IMM : sve_mem_cst_si<0b11, 0b11, "st1d", Z_d, ZPR64>;
+
+  // Contiguous store with reg+reg addressing.
+ defm ST1B : sve_mem_cst_ss<0b0000, "st1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm ST1B_H : sve_mem_cst_ss<0b0001, "st1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
+ defm ST1B_S : sve_mem_cst_ss<0b0010, "st1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
+ defm ST1B_D : sve_mem_cst_ss<0b0011, "st1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
+ defm ST1H : sve_mem_cst_ss<0b0101, "st1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm ST1H_S : sve_mem_cst_ss<0b0110, "st1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
+ defm ST1H_D : sve_mem_cst_ss<0b0111, "st1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
+ defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
+ defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // Scatters using unscaled 32-bit offsets, e.g.
+ // st1h z0.s, p0, [x0, z0.s, uxtw]
+ // and unpacked:
+ // st1h z0.d, p0, [x0, z0.d, uxtw]
+ defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+
+ // Scatters using scaled 32-bit offsets, e.g.
+ // st1h z0.s, p0, [x0, z0.s, uxtw #1]
+ // and unpacked:
+ // st1h z0.d, p0, [x0, z0.d, uxtw #1]
+ defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+ defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+
+ // Scatters using 32/64-bit pointers with offset, e.g.
+ // st1h z0.s, p0, [z0.s, #16]
+ // st1h z0.d, p0, [z0.d, #16]
+ defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>;
+ defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>;
+ defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>;
+ defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>;
+ defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>;
+ defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>;
+ defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>;
+
+ // Scatters using unscaled 64-bit offsets, e.g.
+ // st1h z0.d, p0, [x0, z0.d]
+ defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">;
+ defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">;
+ defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">;
+ defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">;
+
+ // Scatters using scaled 64-bit offsets, e.g.
+ // st1h z0.d, p0, [x0, z0.d, lsl #1]
+ defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>;
+ defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>;
+ defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>;
+
+ // ST(2|3|4) structured stores (register + immediate)
+ defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
+ defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
+ defm ST4B_IMM : sve_mem_est_si<0b00, 0b11, ZZZZ_b, "st4b", simm4s4>;
+ defm ST2H_IMM : sve_mem_est_si<0b01, 0b01, ZZ_h, "st2h", simm4s2>;
+ defm ST3H_IMM : sve_mem_est_si<0b01, 0b10, ZZZ_h, "st3h", simm4s3>;
+ defm ST4H_IMM : sve_mem_est_si<0b01, 0b11, ZZZZ_h, "st4h", simm4s4>;
+ defm ST2W_IMM : sve_mem_est_si<0b10, 0b01, ZZ_s, "st2w", simm4s2>;
+ defm ST3W_IMM : sve_mem_est_si<0b10, 0b10, ZZZ_s, "st3w", simm4s3>;
+ defm ST4W_IMM : sve_mem_est_si<0b10, 0b11, ZZZZ_s, "st4w", simm4s4>;
+ defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>;
+ defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>;
+ defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>;
+
+ // ST(2|3|4) structured stores (register + register)
+ def ST2B : sve_mem_est_ss<0b00, 0b01, ZZ_b, "st2b", GPR64NoXZRshifted8>;
+ def ST3B : sve_mem_est_ss<0b00, 0b10, ZZZ_b, "st3b", GPR64NoXZRshifted8>;
+ def ST4B : sve_mem_est_ss<0b00, 0b11, ZZZZ_b, "st4b", GPR64NoXZRshifted8>;
+ def ST2H : sve_mem_est_ss<0b01, 0b01, ZZ_h, "st2h", GPR64NoXZRshifted16>;
+ def ST3H : sve_mem_est_ss<0b01, 0b10, ZZZ_h, "st3h", GPR64NoXZRshifted16>;
+ def ST4H : sve_mem_est_ss<0b01, 0b11, ZZZZ_h, "st4h", GPR64NoXZRshifted16>;
+ def ST2W : sve_mem_est_ss<0b10, 0b01, ZZ_s, "st2w", GPR64NoXZRshifted32>;
+ def ST3W : sve_mem_est_ss<0b10, 0b10, ZZZ_s, "st3w", GPR64NoXZRshifted32>;
+ def ST4W : sve_mem_est_ss<0b10, 0b11, ZZZZ_s, "st4w", GPR64NoXZRshifted32>;
+ def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>;
+ def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>;
+ def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>;
+
+ // Non-temporal contiguous stores (register + immediate)
+ defm STNT1B_ZRI : sve_mem_cstnt_si<0b00, "stnt1b", Z_b, ZPR8>;
+ defm STNT1H_ZRI : sve_mem_cstnt_si<0b01, "stnt1h", Z_h, ZPR16>;
+ defm STNT1W_ZRI : sve_mem_cstnt_si<0b10, "stnt1w", Z_s, ZPR32>;
+ defm STNT1D_ZRI : sve_mem_cstnt_si<0b11, "stnt1d", Z_d, ZPR64>;
+
+ // Non-temporal contiguous stores (register + register)
+ defm STNT1B_ZRR : sve_mem_cstnt_ss<0b00, "stnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
+ defm STNT1H_ZRR : sve_mem_cstnt_ss<0b01, "stnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
+ defm STNT1W_ZRR : sve_mem_cstnt_ss<0b10, "stnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
+ defm STNT1D_ZRR : sve_mem_cstnt_ss<0b11, "stnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+
+ // Fill/Spill
+ defm LDR_ZXI : sve_mem_z_fill<"ldr">;
+ defm LDR_PXI : sve_mem_p_fill<"ldr">;
+ defm STR_ZXI : sve_mem_z_spill<"str">;
+ defm STR_PXI : sve_mem_p_spill<"str">;
+
+ // Contiguous prefetch (register + immediate)
+ defm PRFB_PRI : sve_mem_prfm_si<0b00, "prfb">;
+ defm PRFH_PRI : sve_mem_prfm_si<0b01, "prfh">;
+ defm PRFW_PRI : sve_mem_prfm_si<0b10, "prfw">;
+ defm PRFD_PRI : sve_mem_prfm_si<0b11, "prfd">;
+
+ // Contiguous prefetch (register + register)
+ def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
+ def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
+ def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
+ def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
+
+ // Gather prefetch using scaled 32-bit offsets, e.g.
+ // prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
+ defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
+ defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
+ defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
+ defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;
+
+ // Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
+ // prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
+ defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
+ defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
+ defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
+ defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+
+ // Gather prefetch using scaled 64-bit offsets, e.g.
+ // prfh pldl1keep, p0, [x0, z0.d, lsl #1]
+ defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
+ defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
+ defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
+ defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;
+
+ // Gather prefetch using 32/64-bit pointers with offset, e.g.
+ // prfh pldl1keep, p0, [z0.s, #16]
+ // prfh pldl1keep, p0, [z0.d, #16]
+ defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
+ defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
+ defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
+ defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
+
+ defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
+ defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
+ defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
+ defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
+
+ defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
+ defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
+ defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
+ defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
+
+ defm TBL_ZZZ : sve_int_perm_tbl<"tbl">;
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">;
+ defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1">;
+ defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2">;
+ defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1">;
+ defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2">;
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">;
+ defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1">;
+ defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2">;
+ defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1">;
+ defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2">;
+
+ defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs">;
+ defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi">;
+ defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge">;
+ defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt">;
+ defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq">;
+ defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne">;
+
+ defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq">;
+ defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne">;
+ defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge">;
+ defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt">;
+ defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt">;
+ defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple">;
+ defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs">;
+ defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi">;
+ defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo">;
+ defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls">;
+
+ defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge">;
+ defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt">;
+ defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt">;
+ defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple">;
+ defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq">;
+ defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne">;
+ defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs">;
+ defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi">;
+ defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo">;
+ defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls">;
+
+ defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge">;
+ defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt">;
+ defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq">;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne">;
+ defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo">;
+ defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge">;
+ defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt">;
+
+ defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
+ defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
+ defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
+ defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
+ defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
+ defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;
+
+ def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">;
+ def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
+ def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;
+
+ defm CNTB_XPiI : sve_int_count<0b000, "cntb">;
+ defm CNTH_XPiI : sve_int_count<0b010, "cnth">;
+ defm CNTW_XPiI : sve_int_count<0b100, "cntw">;
+ defm CNTD_XPiI : sve_int_count<0b110, "cntd">;
+ defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">;
+
+ defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
+ defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
+ defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
+ defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
+ defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
+ defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
+ defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
+ defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;
+
+ defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb">;
+ defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb">;
+ defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb">;
+ defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb">;
+ defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb">;
+ defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb">;
+ defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb">;
+ defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb">;
+
+ defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch">;
+ defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch">;
+ defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech">;
+ defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech">;
+ defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch">;
+ defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch">;
+ defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech">;
+ defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech">;
+
+ defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw">;
+ defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw">;
+ defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw">;
+ defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw">;
+ defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw">;
+ defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw">;
+ defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw">;
+ defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw">;
+
+ defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd">;
+ defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd">;
+ defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd">;
+ defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd">;
+ defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd">;
+ defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd">;
+ defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd">;
+ defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd">;
+
+ defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16>;
+ defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16>;
+ defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16>;
+ defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16>;
+ defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
+ defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
+ defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32>;
+ defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32>;
+ defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32>;
+ defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32>;
+ defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
+ defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
+ defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64>;
+ defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64>;
+ defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64>;
+ defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64>;
+ defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
+ defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;
+
+ defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp">;
+ defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp">;
+ defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp">;
+ defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp">;
+ defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp">;
+ defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp">;
+ defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp">;
+ defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp">;
+ defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
+ defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;
+
+ defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp">;
+ defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp">;
+ defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp">;
+ defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp">;
+ defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
+ defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
+
+ defm INDEX_RR : sve_int_index_rr<"index">;
+ defm INDEX_IR : sve_int_index_ir<"index">;
+ defm INDEX_RI : sve_int_index_ri<"index">;
+ defm INDEX_II : sve_int_index_ii<"index">;
+
+ // Unpredicated shifts
+ defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
+ defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
+ defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;
+
+ defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
+ defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
+ defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
+
+ // Predicated shifts
+ defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b000, "asr">;
+ defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b001, "lsr">;
+ defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b011, "lsl">;
+ defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b100, "asrd">;
+
+ defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">;
+ defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">;
+ defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">;
+ defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">;
+ defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">;
+ defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">;
+
+ defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">;
+ defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">;
+ defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">;
+
+ def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16>;
+ def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32>;
+ def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16>;
+ def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32>;
+ def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32>;
+ def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16>;
+ def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16>;
+ def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32>;
+ def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16>;
+ def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32>;
+ def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16>;
+ def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64>;
+ def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32>;
+ def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64>;
+ def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64>;
+ def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64>;
+ def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16>;
+ def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32>;
+ def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16>;
+ def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16>;
+ def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32>;
+ def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16>;
+ def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64>;
+ def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64>;
+ def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32>;
+ def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32>;
+ def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64>;
+ def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32>;
+ def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64>;
+ def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32>;
+ def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64>;
+ def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64>;
+ def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64>;
+ def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64>;
+
+ defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">;
+ defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">;
+ defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm">;
+ defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz">;
+ defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta">;
+ defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx">;
+ defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti">;
+ defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx">;
+ defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt">;
+
+ // InstAliases
+ def : InstAlias<"mov $Zd, $Zn",
+ (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
+ def : InstAlias<"mov $Pd, $Pg/m, $Pn",
+ (SEL_PPPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pd), 1>;
+ def : InstAlias<"mov $Pd, $Pn",
+ (ORR_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
+ def : InstAlias<"mov $Pd, $Pg/z, $Pn",
+ (AND_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
+
+ def : InstAlias<"movs $Pd, $Pn",
+ (ORRS_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
+ def : InstAlias<"movs $Pd, $Pg/z, $Pn",
+ (ANDS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
+
+ def : InstAlias<"not $Pd, $Pg/z, $Pn",
+ (EOR_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
+
+ def : InstAlias<"nots $Pd, $Pg/z, $Pn",
+ (EORS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
+
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHI_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
+ (CMPHS_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
+ (CMPGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
+ (FACGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
+ (FACGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
+ (FACGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
+ (FACGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
+ (FACGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
+ (FACGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
+ (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
}
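
Note on the defm pattern above: each defm line instantiates a multiclass (defined in SVEInstrFormats.td, which is not part of this hunk), so a single defm expands into a family of records named after the defm plus the suffixes used inside the multiclass. A minimal, purely illustrative TableGen sketch of that mechanism, with hypothetical class and record names rather than the real SVE ones:

    class DemoInst<string asm> {
      string AsmString = asm;
    }
    multiclass demo_gather<string asm> {
      // One record per extend mode; the defm name is prepended to each def.
      def _SXTW : DemoInst<asm # " $Zt, $Pg/z, [$Rn, $Zm, sxtw]">;
      def _UXTW : DemoInst<asm # " $Zt, $Pg/z, [$Rn, $Zm, uxtw]">;
    }
    // Expands into the records GLD1H_DEMO_SXTW and GLD1H_DEMO_UXTW.
    defm GLD1H_DEMO : demo_gather<"ld1h">;
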
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
index 90ebd78f4ab9..f253a4f3e25a 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -222,19 +222,19 @@ def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
-def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev2d$")>;
def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
-def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev2d_POST$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td
index 91b6ffcd7083..ecc68aed1550 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td
@@ -1,4 +1,4 @@
-//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
+//=- AArch64SchedExynosM1.td - Samsung Exynos M1 Sched Defs --*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the machine model for Samsung Exynos-M1 to support
+// This file defines the machine model for the Samsung Exynos M1 to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
@@ -32,6 +32,8 @@ def ExynosM1Model : SchedMachineModel {
// Define each kind of processor resource and number available on the Exynos-M1,
// which has 9 pipelines, each with its own queue with out-of-order dispatch.
+let SchedModel = ExynosM1Model in {
+
def M1UnitA : ProcResource<2>; // Simple integer
def M1UnitC : ProcResource<1>; // Simple and complex integer
def M1UnitD : ProcResource<1>; // Integer division (inside C, serialized)
@@ -54,14 +56,10 @@ let Super = M1PipeF1 in {
def M1UnitFST : ProcResource<1>; // FP store
}
-let SchedModel = ExynosM1Model in {
- def M1UnitALU : ProcResGroup<[M1UnitA,
- M1UnitC]>; // All integer
- def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
- M1UnitNAL1]>; // All simple vector
-}
-
-let SchedModel = ExynosM1Model in {
+def M1UnitALU : ProcResGroup<[M1UnitA,
+ M1UnitC]>; // All integer
+def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
+ M1UnitNAL1]>; // All simple vector
//===----------------------------------------------------------------------===//
// Predicates.
@@ -109,7 +107,7 @@ def M1WriteLC : SchedWriteRes<[M1UnitL,
def M1WriteLD : SchedWriteRes<[M1UnitL,
M1UnitA]> { let Latency = 6;
let NumMicroOps = 2;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [2, 1]; }
def M1WriteLH : SchedWriteRes<[]> { let Latency = 5;
let NumMicroOps = 0; }
def M1WriteLX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>,
@@ -321,19 +319,19 @@ def M1WriteVLDC : SchedWriteRes<[M1UnitL,
def M1WriteVLDD : SchedWriteRes<[M1UnitL,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 2;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [2, 1]; }
def M1WriteVLDE : SchedWriteRes<[M1UnitL,
M1UnitNALU]> { let Latency = 6;
let NumMicroOps = 2; }
def M1WriteVLDF : SchedWriteRes<[M1UnitL,
M1UnitL]> { let Latency = 10;
let NumMicroOps = 2;
- let ResourceCycles = [5]; }
+ let ResourceCycles = [1, 1]; }
def M1WriteVLDG : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [2, 1, 1]; }
def M1WriteVLDH : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU]> { let Latency = 6;
@@ -342,27 +340,27 @@ def M1WriteVLDI : SchedWriteRes<[M1UnitL,
M1UnitL,
M1UnitL]> { let Latency = 12;
let NumMicroOps = 3;
- let ResourceCycles = [6]; }
+ let ResourceCycles = [2, 2, 2]; }
def M1WriteVLDJ : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
M1UnitNALU]> { let Latency = 9;
let NumMicroOps = 4;
- let ResourceCycles = [4]; }
+ let ResourceCycles = [2, 1, 1, 1]; }
def M1WriteVLDK : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
M1UnitNALU,
M1UnitNALU]> { let Latency = 9;
let NumMicroOps = 5;
- let ResourceCycles = [4]; }
+ let ResourceCycles = [2, 1, 1, 1, 1]; }
def M1WriteVLDL : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
M1UnitL,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 5;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [1, 1, 1, 1, 1]; }
def M1WriteVLDM : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU,
@@ -370,13 +368,13 @@ def M1WriteVLDM : SchedWriteRes<[M1UnitL,
M1UnitNALU,
M1UnitNALU]> { let Latency = 7;
let NumMicroOps = 6;
- let ResourceCycles = [2]; }
+ let ResourceCycles = [1, 1, 1, 1, 1, 1]; }
def M1WriteVLDN : SchedWriteRes<[M1UnitL,
M1UnitL,
M1UnitL,
M1UnitL]> { let Latency = 14;
let NumMicroOps = 4;
- let ResourceCycles = [7]; }
+ let ResourceCycles = [2, 1, 2, 1]; }
def M1WriteVSTA : WriteSequence<[WriteVST], 2>;
def M1WriteVSTB : WriteSequence<[WriteVST], 3>;
def M1WriteVSTC : WriteSequence<[WriteVST], 4>;
@@ -384,14 +382,14 @@ def M1WriteVSTD : SchedWriteRes<[M1UnitS,
M1UnitFST,
M1UnitFST]> { let Latency = 7;
let NumMicroOps = 2;
- let ResourceCycles = [7]; }
+ let ResourceCycles = [7, 1, 1]; }
def M1WriteVSTE : SchedWriteRes<[M1UnitS,
M1UnitFST,
M1UnitS,
M1UnitFST,
M1UnitFST]> { let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [8]; }
+ let ResourceCycles = [7, 1, 1, 1, 1]; }
def M1WriteVSTF : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
@@ -400,7 +398,7 @@ def M1WriteVSTF : SchedWriteRes<[M1UnitNALU,
M1UnitFST,
M1UnitFST]> { let Latency = 15;
let NumMicroOps = 5;
- let ResourceCycles = [15]; }
+ let ResourceCycles = [1, 7, 1, 7, 1, 1, 1]; }
def M1WriteVSTG : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
@@ -411,14 +409,14 @@ def M1WriteVSTG : SchedWriteRes<[M1UnitNALU,
M1UnitFST,
M1UnitFST]> { let Latency = 16;
let NumMicroOps = 6;
- let ResourceCycles = [16]; }
+ let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1]; }
def M1WriteVSTH : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
M1UnitFST,
M1UnitFST]> { let Latency = 14;
let NumMicroOps = 4;
- let ResourceCycles = [14]; }
+ let ResourceCycles = [1, 7, 1, 7, 1]; }
def M1WriteVSTI : SchedWriteRes<[M1UnitNALU,
M1UnitS,
M1UnitFST,
@@ -431,7 +429,7 @@ def M1WriteVSTI : SchedWriteRes<[M1UnitNALU,
M1UnitFST,
M1UnitFST]> { let Latency = 17;
let NumMicroOps = 7;
- let ResourceCycles = [17]; }
+ let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; }
// Branch instructions
def : InstRW<[M1WriteB1], (instrs Bcc)>;
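
Note on the ResourceCycles edits in the Exynos M1 hunks above: in LLVM scheduling models, ResourceCycles is positional, with one entry per resource listed in the SchedWriteRes and in the same order, rather than a single combined count; the widened lists express that. A minimal sketch with a hypothetical write name, reusing two of the M1 units defined in this file:

    def M1WriteDemo : SchedWriteRes<[M1UnitL, M1UnitNALU]> {
      let Latency        = 7;
      let NumMicroOps    = 2;
      let ResourceCycles = [2, 1];   // 2 cycles on M1UnitL, 1 cycle on M1UnitNALU
    }
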
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
new file mode 100644
index 000000000000..5e5369a5a7fe
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -0,0 +1,860 @@
+//=- AArch64SchedExynosM3.td - Samsung Exynos M3 Sched Defs --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Samsung Exynos M3 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Exynos-M3 is an advanced superscalar microprocessor with a 6-wide
+// in-order stage for decode and dispatch and a wider issue stage.
+// The execution units, loads, and stores are out-of-order.
+
+def ExynosM3Model : SchedMachineModel {
+ let IssueWidth = 6; // Up to 6 uops per cycle.
+ let MicroOpBufferSize = 228; // ROB size.
+ let LoopMicroOpBufferSize = 40; // Based on the instruction queue size.
+ let LoadLatency = 4; // Optimistic load cases.
+ let MispredictPenalty = 16; // Minimum branch misprediction penalty.
+ let CompleteModel = 1; // Use the default model otherwise.
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on the Exynos-M3,
+// which has 12 pipelines, each with its own queue with out-of-order dispatch.
+
+let SchedModel = ExynosM3Model in {
+
+def M3UnitA : ProcResource<2>; // Simple integer
+def M3UnitC : ProcResource<2>; // Simple and complex integer
+def M3UnitD : ProcResource<1>; // Integer division (inside C0, serialized)
+def M3UnitB : ProcResource<2>; // Branch
+def M3UnitL : ProcResource<2>; // Load
+def M3UnitS : ProcResource<1>; // Store
+def M3PipeF0 : ProcResource<1>; // FP #0
+let Super = M3PipeF0 in {
+ def M3UnitFMAC0 : ProcResource<1>; // FP multiplication
+ def M3UnitFADD0 : ProcResource<1>; // Simple FP
+ def M3UnitFCVT0 : ProcResource<1>; // FP conversion
+ def M3UnitFSQR : ProcResource<2>; // FP square root (serialized)
+ def M3UnitNALU0 : ProcResource<1>; // Simple vector
+ def M3UnitNMSC : ProcResource<1>; // FP and vector miscellanea
+ def M3UnitNSHT0 : ProcResource<1>; // Vector shifting
+ def M3UnitNSHF0 : ProcResource<1>; // Vector shuffling
+}
+def M3PipeF1 : ProcResource<1>; // FP #1
+let Super = M3PipeF1 in {
+ def M3UnitFMAC1 : ProcResource<1>; // FP multiplication
+ def M3UnitFADD1 : ProcResource<1>; // Simple FP
+ def M3UnitFDIV0 : ProcResource<2>; // FP division (serialized)
+ def M3UnitFCVT1 : ProcResource<1>; // FP conversion
+ def M3UnitFST0 : ProcResource<1>; // FP store
+ def M3UnitNALU1 : ProcResource<1>; // Simple vector
+ def M3UnitNCRY0 : ProcResource<1>; // Cryptographic
+ def M3UnitNMUL : ProcResource<1>; // Vector multiplication
+ def M3UnitNSHT1 : ProcResource<1>; // Vector shifting
+ def M3UnitNSHF1 : ProcResource<1>; // Vector shuffling
+}
+def M3PipeF2 : ProcResource<1>; // FP #2
+let Super = M3PipeF2 in {
+ def M3UnitFMAC2 : ProcResource<1>; // FP multiplication
+ def M3UnitFADD2 : ProcResource<1>; // Simple FP
+ def M3UnitFDIV1 : ProcResource<2>; // FP division (serialized)
+ def M3UnitFST1 : ProcResource<1>; // FP store
+ def M3UnitNALU2 : ProcResource<1>; // Simple vector
+ def M3UnitNCRY1 : ProcResource<1>; // Cryptographic
+ def M3UnitNSHT2 : ProcResource<1>; // Vector shifting
+ def M3UnitNSHF2 : ProcResource<1>; // Vector shuffling
+}
+
+
+def M3UnitALU : ProcResGroup<[M3UnitA,
+ M3UnitC]>;
+def M3UnitFMAC : ProcResGroup<[M3UnitFMAC0,
+ M3UnitFMAC1,
+ M3UnitFMAC2]>;
+def M3UnitFADD : ProcResGroup<[M3UnitFADD0,
+ M3UnitFADD1,
+ M3UnitFADD2]>;
+def M3UnitFDIV : ProcResGroup<[M3UnitFDIV0,
+ M3UnitFDIV1]>;
+def M3UnitFCVT : ProcResGroup<[M3UnitFCVT0,
+ M3UnitFCVT1]>;
+def M3UnitFST : ProcResGroup<[M3UnitFST0,
+ M3UnitFST1]>;
+def M3UnitNALU : ProcResGroup<[M3UnitNALU0,
+ M3UnitNALU1,
+ M3UnitNALU2]>;
+def M3UnitNCRY : ProcResGroup<[M3UnitNCRY0,
+ M3UnitNCRY1]>;
+def M3UnitNSHT : ProcResGroup<[M3UnitNSHT0,
+ M3UnitNSHT1,
+ M3UnitNSHT2]>;
+def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0,
+ M3UnitNSHF1,
+ M3UnitNSHF2]>;
+
+//===----------------------------------------------------------------------===//
+// Predicates.
+
+def M3BranchLinkFastPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(0).getReg() != AArch64::LR}]>;
+def M3ResetFastPred : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>;
+def M3RotateRightFastPred : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
+ MI->getOpcode() == AArch64::EXTRXrri) &&
+ MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+ MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
+def M3ShiftLeftFastPred : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>;
+
+//===----------------------------------------------------------------------===//
+// Coarse scheduling model.
+
+def M3WriteZ0 : SchedWriteRes<[]> { let Latency = 0;
+ let NumMicroOps = 1; }
+
+def M3WriteA1 : SchedWriteRes<[M3UnitALU]> { let Latency = 1; }
+def M3WriteAA : SchedWriteRes<[M3UnitALU]> { let Latency = 2;
+ let ResourceCycles = [2]; }
+def M3WriteAB : SchedWriteRes<[M3UnitALU,
+ M3UnitC]> { let Latency = 1;
+ let NumMicroOps = 2; }
+def M3WriteAC : SchedWriteRes<[M3UnitALU,
+ M3UnitALU,
+ M3UnitC]> { let Latency = 2;
+ let NumMicroOps = 3; }
+def M3WriteAD : SchedWriteRes<[M3UnitALU,
+ M3UnitC]> { let Latency = 2;
+ let NumMicroOps = 2; }
+def M3WriteC1 : SchedWriteRes<[M3UnitC]> { let Latency = 1; }
+def M3WriteC2 : SchedWriteRes<[M3UnitC]> { let Latency = 2; }
+def M3WriteAX : SchedWriteVariant<[SchedVar<M3ResetFastPred, [M3WriteZ0]>,
+ SchedVar<M3ShiftLeftFastPred, [M3WriteA1]>,
+ SchedVar<NoSchedPred, [M3WriteAA]>]>;
+def M3WriteAY : SchedWriteVariant<[SchedVar<M3RotateRightFastPred, [M3WriteA1]>,
+ SchedVar<NoSchedPred, [M3WriteAA]>]>;
+
+def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; }
+def M3WriteBX : SchedWriteVariant<[SchedVar<M3BranchLinkFastPred, [M3WriteAB]>,
+ SchedVar<NoSchedPred, [M3WriteAC]>]>;
+
+def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; }
+def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; }
+def M3WriteLA : SchedWriteRes<[M3UnitL,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 1; }
+def M3WriteLB : SchedWriteRes<[M3UnitA,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteLC : SchedWriteRes<[M3UnitA,
+ M3UnitL,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteLD : SchedWriteRes<[M3UnitA,
+ M3UnitL]> { let Latency = 4;
+ let NumMicroOps = 2; }
+def M3WriteLH : SchedWriteRes<[]> { let Latency = 5;
+ let NumMicroOps = 0; }
+
+def M3WriteLX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteL5]>,
+ SchedVar<NoSchedPred, [M3WriteLB]>]>;
+
+def M3WriteS1 : SchedWriteRes<[M3UnitS]> { let Latency = 1; }
+def M3WriteSA : SchedWriteRes<[M3UnitA,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 2;
+ let NumMicroOps = 2; }
+def M3WriteSB : SchedWriteRes<[M3UnitA,
+ M3UnitS]> { let Latency = 1;
+ let NumMicroOps = 2; }
+def M3WriteSC : SchedWriteRes<[M3UnitA,
+ M3UnitS]> { let Latency = 2;
+ let NumMicroOps = 2; }
+
+def M3WriteSX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
+ SchedVar<NoSchedPred, [M3WriteSB]>]>;
+def M3WriteSY : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
+ SchedVar<NoSchedPred, [M3WriteSC]>]>;
+
+def M3ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+
+// Branch instructions.
+def : SchedAlias<WriteBr, M3WriteZ0>;
+def : WriteRes<WriteBrReg, [M3UnitC]> { let Latency = 1; }
+
+// Arithmetic and logical integer instructions.
+def : WriteRes<WriteI, [M3UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [M3UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIEReg, [M3UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIS, [M3UnitALU]> { let Latency = 1; }
+
+// Move instructions.
+def : WriteRes<WriteImm, [M3UnitALU]> { let Latency = 1; }
+
+// Divide and multiply instructions.
+def : WriteRes<WriteID32, [M3UnitC,
+ M3UnitD]> { let Latency = 12;
+ let ResourceCycles = [1, 12]; }
+def : WriteRes<WriteID64, [M3UnitC,
+ M3UnitD]> { let Latency = 21;
+ let ResourceCycles = [1, 21]; }
+def : WriteRes<WriteIM32, [M3UnitC]> { let Latency = 3; }
+def : WriteRes<WriteIM64, [M3UnitC]> { let Latency = 4;
+ let ResourceCycles = [2]; }
+
+// Miscellaneous instructions.
+def : WriteRes<WriteExtr, [M3UnitALU,
+ M3UnitALU]> { let Latency = 1;
+ let NumMicroOps = 2; }
+
+// Addressing modes.
+def : WriteRes<WriteAdr, []> { let Latency = 1;
+ let NumMicroOps = 0; }
+def : SchedAlias<ReadAdrBase, M3ReadAdrBase>;
+
+// Load instructions.
+def : SchedAlias<WriteLD, M3WriteL4>;
+def : WriteRes<WriteLDHi, []> { let Latency = 4;
+ let NumMicroOps = 0; }
+def : SchedAlias<WriteLDIdx, M3WriteLX>;
+
+// Store instructions.
+def : SchedAlias<WriteST, M3WriteS1>;
+def : SchedAlias<WriteSTP, M3WriteS1>;
+def : SchedAlias<WriteSTX, M3WriteS1>;
+def : SchedAlias<WriteSTIdx, M3WriteSX>;
+
+// FP data instructions.
+def : WriteRes<WriteF, [M3UnitFADD]> { let Latency = 2; }
+def : WriteRes<WriteFCmp, [M3UnitNMSC]> { let Latency = 2; }
+def : WriteRes<WriteFDiv, [M3UnitFDIV]> { let Latency = 12;
+ let ResourceCycles = [12]; }
+def : WriteRes<WriteFMul, [M3UnitFMAC]> { let Latency = 4; }
+
+// FP miscellaneous instructions.
+// TODO: Conversion between register files is much different.
+def : WriteRes<WriteFCvt, [M3UnitFCVT]> { let Latency = 3; }
+def : WriteRes<WriteFImm, [M3UnitNALU]> { let Latency = 1; }
+def : WriteRes<WriteFCopy, [M3UnitNALU]> { let Latency = 1; }
+
+// FP load instructions.
+def : SchedAlias<WriteVLD, M3WriteL5>;
+
+// FP store instructions.
+def : WriteRes<WriteVST, [M3UnitS,
+ M3UnitFST]> { let Latency = 1;
+ let NumMicroOps = 1; }
+
+// ASIMD FP instructions.
+def : WriteRes<WriteV, [M3UnitNALU]> { let Latency = 3; }
+
+// Other miscellaneous instructions.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Generic fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+// TODO: The forwarding for 32 bits actually saves 2 cycles.
+def : ReadAdvance<ReadIMA, 3, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model.
+
+def M3WriteNEONA : SchedWriteRes<[M3UnitNSHF,
+ M3UnitFADD]> { let Latency = 3;
+ let NumMicroOps = 2; }
+def M3WriteNEONB : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST]> { let Latency = 10;
+ let NumMicroOps = 2; }
+def M3WriteNEOND : SchedWriteRes<[M3UnitNSHF,
+ M3UnitFST]> { let Latency = 6;
+ let NumMicroOps = 2; }
+def M3WriteNEONH : SchedWriteRes<[M3UnitNALU,
+ M3UnitS]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteNEONI : SchedWriteRes<[M3UnitNSHF,
+ M3UnitS]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteNEONV : SchedWriteRes<[M3UnitFDIV0,
+ M3UnitFDIV1]> { let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [8, 8]; }
+def M3WriteNEONW : SchedWriteRes<[M3UnitFDIV0,
+ M3UnitFDIV1]> { let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [13, 13]; }
+def M3WriteNEONX : SchedWriteRes<[M3UnitFSQR,
+ M3UnitFSQR]> { let Latency = 18;
+ let NumMicroOps = 2;
+ let ResourceCycles = [19, 19]; }
+def M3WriteNEONY : SchedWriteRes<[M3UnitFSQR,
+ M3UnitFSQR]> { let Latency = 25;
+ let NumMicroOps = 2;
+ let ResourceCycles = [26, 26]; }
+def M3WriteNEONZ : SchedWriteRes<[M3UnitNMSC,
+ M3UnitNMSC]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteFADD2 : SchedWriteRes<[M3UnitFADD]> { let Latency = 2; }
+def M3WriteFCVT2 : SchedWriteRes<[M3UnitFCVT]> { let Latency = 2; }
+def M3WriteFCVT3 : SchedWriteRes<[M3UnitFCVT]> { let Latency = 3; }
+def M3WriteFCVT3A : SchedWriteRes<[M3UnitFCVT0]> { let Latency = 3; }
+def M3WriteFCVT4A : SchedWriteRes<[M3UnitFCVT0]> { let Latency = 4; }
+def M3WriteFCVT4 : SchedWriteRes<[M3UnitFCVT]> { let Latency = 4; }
+def M3WriteFDIV10 : SchedWriteRes<[M3UnitFDIV]> { let Latency = 7;
+ let ResourceCycles = [8]; }
+def M3WriteFDIV12 : SchedWriteRes<[M3UnitFDIV]> { let Latency = 12;
+ let ResourceCycles = [13]; }
+def M3WriteFMAC3 : SchedWriteRes<[M3UnitFMAC]> { let Latency = 3; }
+def M3WriteFMAC4 : SchedWriteRes<[M3UnitFMAC]> { let Latency = 4; }
+def M3WriteFMAC5 : SchedWriteRes<[M3UnitFMAC]> { let Latency = 5; }
+def M3WriteFSQR17 : SchedWriteRes<[M3UnitFSQR]> { let Latency = 18;
+ let ResourceCycles = [19]; }
+def M3WriteFSQR25 : SchedWriteRes<[M3UnitFSQR]> { let Latency = 25;
+ let ResourceCycles = [26]; }
+def M3WriteNALU1 : SchedWriteRes<[M3UnitNALU]> { let Latency = 1; }
+def M3WriteNCRY1A : SchedWriteRes<[M3UnitNCRY0]> { let Latency = 1; }
+def M3WriteNCRY3A : SchedWriteRes<[M3UnitNCRY0]> { let Latency = 3; }
+def M3WriteNCRY5A : SchedWriteRes<[M3UnitNCRY]> { let Latency = 5; }
+def M3WriteNMSC1 : SchedWriteRes<[M3UnitNMSC]> { let Latency = 1; }
+def M3WriteNMSC2 : SchedWriteRes<[M3UnitNMSC]> { let Latency = 2; }
+def M3WriteNMSC3 : SchedWriteRes<[M3UnitNMSC]> { let Latency = 3; }
+def M3WriteNMUL3 : SchedWriteRes<[M3UnitNMUL]> { let Latency = 3; }
+def M3WriteNSHF1 : SchedWriteRes<[M3UnitNSHF]> { let Latency = 1; }
+def M3WriteNSHF3 : SchedWriteRes<[M3UnitNSHF]> { let Latency = 3; }
+def M3WriteNSHT1 : SchedWriteRes<[M3UnitNSHT]> { let Latency = 1; }
+def M3WriteNSHT2 : SchedWriteRes<[M3UnitNSHT]> { let Latency = 2; }
+def M3WriteNSHT3 : SchedWriteRes<[M3UnitNSHT]> { let Latency = 3; }
+def M3WriteVLDA : SchedWriteRes<[M3UnitL,
+ M3UnitL]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def M3WriteVLDB : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 6;
+ let NumMicroOps = 3; }
+def M3WriteVLDC : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 6;
+ let NumMicroOps = 4; }
+def M3WriteVLDD : SchedWriteRes<[M3UnitL,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 1]; }
+def M3WriteVLDE : SchedWriteRes<[M3UnitL,
+ M3UnitNALU]> { let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 1]; }
+def M3WriteVLDF : SchedWriteRes<[M3UnitL,
+ M3UnitL]> { let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [5, 5]; }
+def M3WriteVLDG : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1, 1]; }
+def M3WriteVLDH : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1, 1]; }
+def M3WriteVLDI : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [6, 6, 6]; }
+def M3WriteVLDJ : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2, 1, 1, 1]; }
+def M3WriteVLDK : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [4, 1, 1, 1, 1]; }
+def M3WriteVLDL : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitL,
+ M3UnitNALU]> { let Latency = 6;
+ let NumMicroOps = 5;
+ let ResourceCycles = [6, 1, 1, 6, 1]; }
+def M3WriteVLDM : SchedWriteRes<[M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU,
+ M3UnitL,
+ M3UnitNALU,
+ M3UnitNALU]> { let Latency = 7;
+ let NumMicroOps = 6;
+ let ResourceCycles = [6, 1, 1, 6, 1, 1]; }
+def M3WriteVLDN : SchedWriteRes<[M3UnitL,
+ M3UnitL,
+ M3UnitL,
+ M3UnitL]> { let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [6, 6, 6, 6]; }
+def M3WriteVSTA : WriteSequence<[WriteVST], 2>;
+def M3WriteVSTB : WriteSequence<[WriteVST], 3>;
+def M3WriteVSTC : WriteSequence<[WriteVST], 4>;
+def M3WriteVSTD : SchedWriteRes<[M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 3, 1, 3]; }
+def M3WriteVSTE : SchedWriteRes<[M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1, 3, 1, 3, 1, 3]; }
+def M3WriteVSTF : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 15;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1, 3, 3, 1, 3, 1, 3]; }
+def M3WriteVSTG : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 16;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1, 3, 3, 1, 3, 1, 3, 1, 3]; }
+def M3WriteVSTH : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 14;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1, 3, 3, 1, 3]; }
+def M3WriteVSTI : SchedWriteRes<[M3UnitNALU,
+ M3UnitFST,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST,
+ M3UnitS,
+ M3UnitFST]> { let Latency = 17;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1, 3, 3, 1, 3, 1, 3, 1, 3]; }
+
+// Special cases.
+def M3WriteAES : SchedWriteRes<[M3UnitNCRY]> { let Latency = 1; }
+def M3ReadAES : SchedReadAdvance<1, [M3WriteAES]>;
+def M3ReadFMAC : SchedReadAdvance<1, [M3WriteFMAC4,
+ M3WriteFMAC5]>;
+def M3WriteMOVI : SchedWriteVariant<[SchedVar<M3ResetFastPred, [M3WriteZ0]>,
+ SchedVar<NoSchedPred, [M3WriteNALU1]>]>;
+def M3ReadNMUL : SchedReadAdvance<1, [M3WriteNMUL3]>;
+
+// Branch instructions
+def : InstRW<[M3WriteB1], (instrs Bcc)>;
+def : InstRW<[M3WriteA1], (instrs BL)>;
+def : InstRW<[M3WriteBX], (instrs BLR)>;
+def : InstRW<[M3WriteC1], (instregex "^CBN?Z[WX]")>;
+def : InstRW<[M3WriteAD], (instregex "^TBN?Z[WX]")>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[M3WriteA1], (instrs COPY)>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?Xrx64")>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]$")>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|BIC|SUB)S[WX]r[sx]$")>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|EOR|ORR|SUB)[WX]ri")>;
+
+// Move instructions.
+def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>;
+def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>;
+
+// Divide and multiply instructions.
+
+// Miscellaneous instructions.
+def : InstRW<[M3WriteAY], (instrs EXTRWrri, EXTRXrri)>;
+
+// Load instructions.
+def : InstRW<[M3WriteLD,
+ WriteLDHi,
+ WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>;
+def : InstRW<[M3WriteLX,
+ ReadAdrBase], (instregex "^PRFMro[WX]")>;
+
+// Store instructions.
+
+// FP data instructions.
+def : InstRW<[M3WriteNSHF1], (instregex "^FABS[DS]r")>;
+def : InstRW<[M3WriteFADD2], (instregex "^F(ADD|SUB)[DS]rr")>;
+def : InstRW<[M3WriteFDIV10], (instrs FDIVSrr)>;
+def : InstRW<[M3WriteFDIV12], (instrs FDIVDrr)>;
+def : InstRW<[M3WriteNMSC1], (instregex "^F(MAX|MIN).+rr")>;
+def : InstRW<[M3WriteFMAC3], (instregex "^FN?MUL[DS]rr")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FNEG[DS]r")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FRINT.+r")>;
+def : InstRW<[M3WriteNEONH], (instregex "^FCSEL[DS]rrr")>;
+def : InstRW<[M3WriteFSQR17], (instrs FSQRTSr)>;
+def : InstRW<[M3WriteFSQR25], (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+def : InstRW<[M3WriteFCVT3], (instregex "^FCVT[DHS][DHS]r")>;
+def : InstRW<[M3WriteFCVT4A], (instregex "^[SU]CVTF[SU][XW][DHS]ri")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FCVT[AMNPZ][SU]U[XW][DHS]r")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FCVTZ[SU][dhs]")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOV[DS][ir]")>;
+def : InstRW<[M3WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev1")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^FRECPXv1")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^F(RECP|RSQRT)S(16|32|64)")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOV[WX][DS]r")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOV[DS][WX]r")>;
+def : InstRW<[M3WriteNEONI], (instregex "^FMOV(DX|XD)Highr")>;
+
+// FP load instructions.
+def : InstRW<[WriteVLD], (instregex "^LDR[DSQ]l")>;
+def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>;
+def : InstRW<[WriteVLD,
+ WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>;
+def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>;
+def : InstRW<[M3WriteLX,
+ ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>;
+def : InstRW<[M3WriteLB,
+ ReadAdrBase], (instregex "^LDRQro[WX]")>;
+def : InstRW<[WriteVLD,
+ M3WriteLH], (instregex "^LDN?P[DS]i")>;
+def : InstRW<[M3WriteLA,
+ M3WriteLH], (instregex "^LDN?PQi")>;
+def : InstRW<[M3WriteLB,
+ M3WriteLH,
+ WriteAdr], (instregex "^LDP[DS](post|pre)")>;
+def : InstRW<[M3WriteLC,
+ M3WriteLH,
+ WriteAdr], (instregex "^LDPQ(post|pre)")>;
+
+// FP store instructions.
+def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>;
+def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>;
+def : InstRW<[M3WriteSY,
+ ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>;
+def : InstRW<[M3WriteSA,
+ ReadAdrBase], (instregex "^STRQro[WX]")>;
+def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "^STP[DS](post|pre)")>;
+def : InstRW<[M3WriteSA,
+ WriteAdr], (instregex "^STPQ(post|pre)")>;
+
+// ASIMD instructions.
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^(SQ)?(ABS|NEG)v")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Pv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]H(ADD|SUB)v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU](ADD|SUB)[LW]V?v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^R?(ADD|SUB)HN2?v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]Q(ADD|SUB)v")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^(SU|US)QADDv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Vv")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M3WriteNALU1], (instregex "^CMTSTv")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^[SU](MIN|MAX)v")>;
+def : InstRW<[M3WriteNMSC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^(MUL|SQR?DMULH)v")>;
+def : InstRW<[M3WriteNMUL3,
+ M3ReadNMUL], (instregex "^ML[AS]v")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^[SU]ML[AS]Lv")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^SQDML[AS]L")>;
+def : InstRW<[M3WriteNMUL3], (instregex "^(S|U|SQD)MULLv")>;
+def : InstRW<[M3WriteNMSC3], (instregex "^[SU]ADALPv")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^[SU]R?SRAv")>;
+def : InstRW<[M3WriteNSHT1], (instregex "^SHL[dv]")>;
+def : InstRW<[M3WriteNSHT1], (instregex "^[SU]SH[LR][dv]")>;
+def : InstRW<[M3WriteNSHT1], (instregex "^S[RS]I[dv]")>;
+def : InstRW<[M3WriteNSHT2], (instregex "^[SU]?SHLLv")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^(([SU]Q)?R)?SHRU?N[bhsv]")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^[SU]RSH[LR][dv]")>;
+def : InstRW<[M3WriteNSHT3], (instregex "^[SU]QR?SHLU?[bdhsv]")>;
+
+// ASIMD FP instructions.
+def : InstRW<[M3WriteNSHF1], (instregex "^FABSv")>;
+def : InstRW<[M3WriteFADD2], (instregex "^F(ABD|ADD|SUB)v")>;
+def : InstRW<[M3WriteNEONA], (instregex "^FADDP")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
+def : InstRW<[M3WriteFCVT3], (instregex "^FCVT(L|N|XN)v")>;
+def : InstRW<[M3WriteFCVT2], (instregex "^FCVT[AMNPZ][SU]v")>;
+def : InstRW<[M3WriteFCVT2], (instregex "^[SU]CVTFv")>;
+def : InstRW<[M3WriteFDIV10], (instrs FDIVv2f32)>;
+def : InstRW<[M3WriteNEONV], (instrs FDIVv4f32)>;
+def : InstRW<[M3WriteNEONW], (instrs FDIVv2f64)>;
+def : InstRW<[M3WriteNMSC1], (instregex "^F(MAX|MIN)(NM)?v")>;
+def : InstRW<[M3WriteNMSC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[M3WriteNEONZ], (instregex "^F(MAX|MIN)(NM)?Vv")>;
+def : InstRW<[M3WriteFMAC3], (instregex "^FMULX?v.[fi]")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^FML[AS]v.f")>;
+def : InstRW<[M3WriteFMAC5,
+ M3ReadFMAC], (instregex "^FML[AS]v.i")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FNEGv")>;
+def : InstRW<[M3WriteFCVT3A], (instregex "^FRINT[AIMNPXZ]v")>;
+def : InstRW<[M3WriteFSQR17], (instrs FSQRTv2f32)>;
+def : InstRW<[M3WriteNEONX], (instrs FSQRTv4f32)>;
+def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>;
+
+// ASIMD miscellaneous instructions.
+def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^[SU]?Q?XTU?Nv")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^CPY")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^INSv.+lane")>;
+def : InstRW<[M3WriteMOVI], (instregex "^MOVI")>;
+def : InstRW<[M3WriteNALU1], (instregex "^FMOVv")>;
+def : InstRW<[M3WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev[248]")>;
+def : InstRW<[M3WriteFMAC4,
+ M3ReadFMAC], (instregex "^F(RECP|RSQRT)Sv")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^REV(16|32|64)v")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^TB[LX]v")>;
+def : InstRW<[M3WriteNEOND], (instregex "^[SU]MOVv")>;
+def : InstRW<[M3WriteNSHF3], (instregex "^INSv.+gpr")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^(TRN|UZP|ZIP)[12]v")>;
+
+// ASIMD load instructions.
+def : InstRW<[M3WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDD], (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDD,
+ WriteAdr], (instregex "LD1i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDE], (instregex "LD1i(64)$")>;
+def : InstRW<[M3WriteVLDE,
+ WriteAdr], (instregex "LD1i(64)_POST")>;
+
+def : InstRW<[M3WriteL5], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteL5,
+ WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVLDF], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDG], (instregex "LD2i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDG,
+ WriteAdr], (instregex "LD2i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDH], (instregex "LD2i(64)$")>;
+def : InstRW<[M3WriteVLDH,
+ WriteAdr], (instregex "LD2i(64)_POST")>;
+
+def : InstRW<[M3WriteVLDA], (instregex "LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVLDI], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDJ], (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDJ,
+ WriteAdr], (instregex "LD3i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDL], (instregex "LD3i(64)$")>;
+def : InstRW<[M3WriteVLDL,
+ WriteAdr], (instregex "LD3i(64)_POST")>;
+
+def : InstRW<[M3WriteVLDB], (instregex "LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDB], (instregex "LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVLDN], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVLDK], (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[M3WriteVLDK,
+ WriteAdr], (instregex "LD4i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVLDM], (instregex "LD4i(64)$")>;
+def : InstRW<[M3WriteVLDM,
+ WriteAdr], (instregex "LD4i(64)_POST")>;
+
+def : InstRW<[M3WriteVLDC], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVLDC], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST")>;
+
+// ASIMD store instructions.
+def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M3WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[M3WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTD], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST1i(8|16|32|64)_POST")>;
+
+def : InstRW<[M3WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVSTE], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTE,
+ WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTD], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST2i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVSTD], (instregex "ST2i(64)$")>;
+def : InstRW<[M3WriteVSTD,
+ WriteAdr], (instregex "ST2i(64)_POST")>;
+
+def : InstRW<[M3WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVSTG], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTG,
+ WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTH], (instregex "ST3i(8|16|32)$")>;
+def : InstRW<[M3WriteVSTH,
+ WriteAdr], (instregex "ST3i(8|16|32)_POST")>;
+def : InstRW<[M3WriteVSTF], (instregex "ST3i(64)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST3i(64)_POST")>;
+
+def : InstRW<[M3WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[M3WriteVSTI], (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M3WriteVSTI,
+ WriteAdr], (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[M3WriteVSTF], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[M3WriteVSTF,
+ WriteAdr], (instregex "ST4i(8|16|32|64)_POST")>;
+
+// Cryptography instructions.
+def : InstRW<[M3WriteAES], (instregex "^AES[DE]")>;
+def : InstRW<[M3WriteAES,
+ M3ReadAES], (instregex "^AESI?MC")>;
+
+def : InstRW<[M3WriteNCRY3A], (instregex "^PMULL?v")>;
+
+def : InstRW<[M3WriteNCRY1A], (instregex "^SHA1([CHMP]|SU[01])")>;
+def : InstRW<[M3WriteNCRY1A], (instregex "^SHA256SU0")>;
+def : InstRW<[M3WriteNCRY5A], (instregex "^SHA256(H2?|SU1)")>;
+
+// CRC instructions.
+def : InstRW<[M3WriteC2], (instregex "^CRC32")>;
+
+} // SchedModel = ExynosM3Model
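
For context, every entry in the Exynos M3 model above follows the same two-step TableGen pattern: a SchedWriteRes describes which functional units an instruction occupies, its latency, and its micro-op count, and a separate InstRW record binds that write to the instructions matched by a regular expression. The sketch below restates one pairing from the hunk above with annotations; the def names are the ones defined there, but the comments are illustrative and not part of the change.

def M3WriteVLDD : SchedWriteRes<[M3UnitL, M3UnitNALU]> {
  let Latency        = 7;        // cycles until the loaded value is available
  let NumMicroOps    = 2;        // decodes into two micro-ops
  let ResourceCycles = [2, 1];   // load unit busy 2 cycles, NEON ALU busy 1
}
def : InstRW<[M3WriteVLDD], (instregex "LD1i(8|16|32)$")>;
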
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index 7277198b585f..84825458e47c 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -25,6 +25,9 @@ def FalkorModel : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 0aeb1f3e3058..ff14e639d1a5 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -32,8 +32,12 @@
//===----------------------------------------------------------------------===//
// Define 0 micro-op types
-def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> {
- let Latency = 3;
+def FalkorWr_LdInc_none_2cyc : SchedWriteRes<[]> {
+ let Latency = 2;
+ let NumMicroOps = 0;
+}
+def FalkorWr_StInc_none_2cyc : SchedWriteRes<[]> {
+ let Latency = 2;
let NumMicroOps = 0;
}
def FalkorWr_none_3cyc : SchedWriteRes<[]> {
@@ -514,8 +518,8 @@ def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr
def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
-def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>;
-def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>;
+def FalkorReadIncLd : SchedReadAdvance<1, [FalkorWr_LdInc_none_2cyc]>;
+def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_StInc_none_2cyc]>;
// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast
// -----------------------------------------------------------------------------
@@ -776,99 +780,99 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
// SIMD Load Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instrs LD2i64_POST)>;
def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD1i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instrs LD3i64_POST)>;
def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
(instrs LD4i64_POST)>;
def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD2i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
(instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
(instrs LD3Threev2d_POST)>;
def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
(instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD3i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
(instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
(instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
(instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
(instrs LD4Fourv2d_POST)>;
def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
(instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD4i(8|16|32)_POST$")>;
def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
(instregex "^LD3Threev(8b|4h|2s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
(instregex "^LD3Threev(8b|4h|2s)_POST$")>;
def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
@@ -877,10 +881,10 @@ def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(16b|8h|4s)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD3Threev(16b|8h|4s)_POST$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd],
(instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
// Arithmetic and Logical Instructions
@@ -965,17 +969,17 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>;
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(Q|D|S|H|B)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(Q|D|S|H|B)(post|pre)$")>;
def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt],
(instregex "^STR(D|S|H|B)ro(W|X)$")>;
def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STPQi$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STPQ(post|pre)$")>;
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(D|S)(i)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(D|S)(post|pre)$")>;
def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt],
(instregex "^STRQro(W|X)$")>;
@@ -988,7 +992,7 @@ def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt]
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>;
def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>;
@@ -1087,7 +1091,7 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>;
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDUR(Q|D|S|H|B)i$")>;
@@ -1101,9 +1105,9 @@ def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDNP(D|S)i$")>;
def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDP(D|S)i$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "LDP(D|S)(pre|post)$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDPQ(pre|post)$")>;
// FP Data Processing Instructions
@@ -1165,11 +1169,11 @@ def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDNP(W|X)i$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDP(W|X)i$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
(instregex "^LDP(W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(BB|HH|W|X)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
(instregex "^LDR(BB|HH|W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
(instregex "^LDR(BB|HH|W|X)ro(W|X)$")>;
@@ -1182,11 +1186,11 @@ def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>;
def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
(instrs LDPSWi)>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
(instregex "^LDPSW(post|pre)$")>;
def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
(instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd],
+def : InstRW<[FalkorWr_LdInc_none_2cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd],
(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd],
(instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
@@ -1273,11 +1277,11 @@ def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadInc
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(W|X)i$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
(instregex "^STP(W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(BB|HH|W|X)ui$")>;
-def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+def : InstRW<[FalkorWr_StInc_none_2cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
(instregex "^STR(BB|HH|W|X)(post|pre)$")>;
def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt],
(instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td
index ce2afd499afb..68de3e077c96 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -29,6 +29,9 @@ def KryoModel : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
index 585688aae279..fbbd3850d0fd 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -27,6 +27,9 @@ def ThunderXT8XModel : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 22f272edd680..bee3392b6d3b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -27,8 +27,13 @@ def ThunderX2T99Model : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
+let SchedModel = ThunderX2T99Model in {
+
// Define the issue ports.
// Port 0: ALU, FP/SIMD.
@@ -49,8 +54,6 @@ def THX2T99P4 : ProcResource<1>;
// Port 5: Load/store.
def THX2T99P5 : ProcResource<1>;
-let SchedModel = ThunderX2T99Model in {
-
// Define groups for the functional units on each issue port. Each group
// created will be used by a WriteRes later on.
//
@@ -359,13 +362,10 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
-}
//===----------------------------------------------------------------------===//
// 3. Instruction Tables.
-let SchedModel = ThunderX2T99Model in {
-
//---
// 3.1 Branch Instructions
//---
@@ -391,7 +391,7 @@ def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
def : WriteRes<WriteAtomic, []> {
- let Unsupported = 1;
+ let Latency = 4;
let NumMicroOps = 2;
}
@@ -416,63 +416,63 @@ def : InstRW<[THX2T99Write_1Cyc_I2],
// Address generation
def : WriteRes<WriteI, [THX2T99I012]> {
let Latency = 1;
- let ResourceCycles = [1, 3];
+ let ResourceCycles = [1];
let NumMicroOps = 2;
}
def : InstRW<[WriteI],
(instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
"AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
- "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
"BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
"EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
"ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
- "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
- "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
- "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
- "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
- "CSNEG?(W|X)r(i|r|s|x)")>;
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
def : InstRW<[WriteI], (instrs COPY)>;
// ALU, extend and/or shift
def : WriteRes<WriteISReg, [THX2T99I012]> {
let Latency = 2;
- let ResourceCycles = [2, 3];
+ let ResourceCycles = [2];
let NumMicroOps = 2;
}
def : InstRW<[WriteISReg],
(instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
"AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
- "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
"BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
"EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
"ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
- "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
- "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
- "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
- "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
- "CSNEG?(W|X)r(i|r|s|x)")>;
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
def : WriteRes<WriteIEReg, [THX2T99I012]> {
let Latency = 1;
- let ResourceCycles = [1, 3];
+ let ResourceCycles = [1];
let NumMicroOps = 2;
}
def : InstRW<[WriteIEReg],
(instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
"AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
- "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
"BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
"EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
"ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
- "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
- "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
- "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
- "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
- "CSNEG?(W|X)r(i|r|s|x)")>;
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
// Move immed
def : WriteRes<WriteImm, [THX2T99I012]> {
@@ -500,14 +500,14 @@ def : WriteRes<WriteIS, [THX2T99I012]> {
// Latency range of 13-23/13-39.
def : WriteRes<WriteID32, [THX2T99I1]> {
let Latency = 39;
- let ResourceCycles = [13, 39];
+ let ResourceCycles = [39];
let NumMicroOps = 4;
}
// Divide, X-form
def : WriteRes<WriteID64, [THX2T99I1]> {
let Latency = 23;
- let ResourceCycles = [13, 23];
+ let ResourceCycles = [23];
let NumMicroOps = 4;
}
@@ -1147,7 +1147,7 @@ def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>;
def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>;
def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>;
def : InstRW<[THX2T99XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
-def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSrr")>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>;
// FP divide, D-form
// FP square root, D-form
@@ -1155,7 +1155,7 @@ def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>;
def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>;
def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>;
def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
-def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDrr")>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>;
// FP multiply
// FP multiply accumulate
@@ -1252,17 +1252,17 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
def : WriteRes<WriteV, [THX2T99F01]> {
let Latency = 7;
let NumMicroOps = 4;
- let ResourceCycles = [4, 23];
+ let ResourceCycles = [4];
}
// ASIMD arith, reduce, 4H/4S
// ASIMD arith, reduce, 8B/8H
// ASIMD arith, reduce, 16B
-// ASIMD logical (MOV, MVN, ORN, ORR)
+// ASIMD logical (MVN (alias for NOT), ORN, ORR)
def : InstRW<[THX2T99Write_5Cyc_F01],
- (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv",
- "^ORRv", "^ORNv", "^NOTv")>;
+ (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+
// ASIMD arith, reduce
def : InstRW<[THX2T99Write_10Cyc_F01],
(instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
@@ -1513,7 +1513,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
// ASIMD move, integer immed
-def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv")>;
// ASIMD move, FP immed
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 571e61d7083c..fc7b5984fe3e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -91,9 +91,9 @@ bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB)
if (SCDesc->isValid() && !SCDesc->isVariant()) {
unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc);
if (ResLenWithSTP > ResLength) {
- DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
- << " resources " << ResLength << " -> " << ResLenWithSTP
- << "\n");
+ LLVM_DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
+ << " resources " << ResLength << " -> " << ResLenWithSTP
+ << "\n");
return false;
}
}
@@ -127,14 +127,14 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
TRI = ST.getRegisterInfo();
MRI = &MF.getRegInfo();
- SchedModel.init(ST.getSchedModel(), &ST, TII);
+ SchedModel.init(&ST);
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
- DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
if (!SchedModel.hasInstrSchedModel()) {
- DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
+ LLVM_DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
return false;
}
@@ -156,7 +156,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
break;
// Otherwise, continue unpairing the stores in this block.
- DEBUG(dbgs() << "Unpairing store " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Unpairing store " << MI << "\n");
SuppressSTP = true;
TII->suppressLdStPair(MI);
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 39b764434388..04bb90d30d6d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
@@ -82,6 +83,12 @@ void AArch64Subtarget::initializeProperties() {
PrefFunctionAlignment = 4;
PrefLoopAlignment = 3;
break;
+ case ExynosM3:
+ MaxInterleaveFactor = 4;
+ MaxJumpTableSize = 20;
+ PrefFunctionAlignment = 5;
+ PrefLoopAlignment = 4;
+ break;
case Falkor:
MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
@@ -145,7 +152,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS),
- ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian),
+ ReserveX18(AArch64::isX18ReservedByDefault(TT)), IsLittle(LittleEndian),
TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
TLInfo(TM, *this) {
@@ -253,3 +260,13 @@ std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
}
+
+void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
+ // We usually compute max call frame size after ISel. Do the computation now
+ // if the .mir file didn't specify it. Note that this will probably give you
+ // bogus values after PEI has eliminated the callframe setup/destroy pseudo
+ // instructions; specify it explicitly if you need it to be correct.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isMaxCallFrameSizeComputed())
+ MFI.computeMaxCallFrameSize(MF);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 9245b2f396b7..5af4c0dd9c19 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -48,6 +48,7 @@ public:
CortexA75,
Cyclone,
ExynosM1,
+ ExynosM3,
Falkor,
Kryo,
Saphira,
@@ -65,6 +66,7 @@ protected:
bool HasV8_1aOps = false;
bool HasV8_2aOps = false;
bool HasV8_3aOps = false;
+ bool HasV8_4aOps = false;
bool HasFPARMv8 = false;
bool HasNEON = false;
@@ -77,9 +79,18 @@ protected:
bool HasPerfMon = false;
bool HasFullFP16 = false;
bool HasSPE = false;
+
+ // ARMv8.4 Crypto extensions
+ bool HasSM4 = true;
+ bool HasSHA3 = true;
+
+ bool HasSHA2 = true;
+ bool HasAES = true;
+
bool HasLSLFast = false;
bool HasSVE = false;
bool HasRCPC = false;
+ bool HasAggressiveFMA = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -101,6 +112,7 @@ protected:
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
bool CustomAsCheapAsMove = false;
+ bool ExynosAsCheapAsMove = false;
bool UsePostRAScheduler = false;
bool Misaligned128StoreIsSlow = false;
bool Paired128IsSlow = false;
@@ -108,7 +120,9 @@ protected:
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasArithmeticBccFusion = false;
bool HasArithmeticCbzFusion = false;
+ bool HasFuseAddress = false;
bool HasFuseAES = false;
+ bool HasFuseCCSelect = false;
bool HasFuseLiterals = false;
bool DisableLatencySchedHeuristic = false;
bool UseRSqrt = false;
@@ -126,6 +140,9 @@ protected:
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
+ // ReserveX20 - X20 is not available as a general purpose register.
+ bool ReserveX20 = false;
+
bool IsLittle;
/// TargetTriple - What processor and OS we're targeting.
@@ -193,6 +210,7 @@ public:
bool hasV8_1aOps() const { return HasV8_1aOps; }
bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasV8_3aOps() const { return HasV8_3aOps; }
+ bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -211,6 +229,7 @@ public:
}
bool isX18Reserved() const { return ReserveX18; }
+ bool isX20Reserved() const { return ReserveX20; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
@@ -219,11 +238,16 @@ public:
bool hasLSE() const { return HasLSE; }
bool hasRAS() const { return HasRAS; }
bool hasRDM() const { return HasRDM; }
+ bool hasSM4() const { return HasSM4; }
+ bool hasSHA3() const { return HasSHA3; }
+ bool hasSHA2() const { return HasSHA2; }
+ bool hasAES() const { return HasAES; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
}
bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool hasExynosCheapAsMoveHandling() const { return ExynosAsCheapAsMove; }
bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
bool isPaired128Slow() const { return Paired128IsSlow; }
bool isSTRQroSlow() const { return STRQroIsSlow; }
@@ -232,13 +256,15 @@ public:
}
bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+ bool hasFuseAddress() const { return HasFuseAddress; }
bool hasFuseAES() const { return HasFuseAES; }
+ bool hasFuseCCSelect() const { return HasFuseCCSelect; }
bool hasFuseLiterals() const { return HasFuseLiterals; }
- /// \brief Return true if the CPU supports any kind of instruction fusion.
+ /// Return true if the CPU supports any kind of instruction fusion.
bool hasFusion() const {
return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
- hasFuseAES() || hasFuseLiterals();
+ hasFuseAES() || hasFuseCCSelect() || hasFuseLiterals();
}
bool useRSqrt() const { return UseRSqrt; }
@@ -269,6 +295,7 @@ public:
bool hasLSLFast() const { return HasLSLFast; }
bool hasSVE() const { return HasSVE; }
bool hasRCPC() const { return HasRCPC; }
+ bool hasAggressiveFMA() const { return HasAggressiveFMA; }
bool isLittleEndian() const { return IsLittle; }
@@ -326,6 +353,8 @@ public:
return false;
}
}
+
+ void mirFileLoaded(MachineFunction &MF) const override;
};
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 66b7e02ceb99..8acd32533eea 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -143,6 +143,23 @@ class ISB<string name, bits<4> encoding> : SearchableTable{
def : ISB<"sy", 0xf>;
//===----------------------------------------------------------------------===//
+// TSB (Trace synchronization barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class TSB<string name, bits<4> encoding> : SearchableTable{
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding;
+ let Encoding = encoding;
+
+ code Requires = [{ {AArch64::HasV8_4aOps} }];
+}
+
+def : TSB<"csync", 0>;
+
+//===----------------------------------------------------------------------===//
// PRFM (prefetch) instruction options.
//===----------------------------------------------------------------------===//
@@ -175,6 +192,87 @@ def : PRFM<"pstl3keep", 0x14>;
def : PRFM<"pstl3strm", 0x15>;
//===----------------------------------------------------------------------===//
+// SVE Prefetch instruction options.
+//===----------------------------------------------------------------------===//
+
+class SVEPRFM<string name, bits<4> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding;
+ let Encoding = encoding;
+ code Requires = [{ {} }];
+}
+
+let Requires = [{ {AArch64::FeatureSVE} }] in {
+def : SVEPRFM<"pldl1keep", 0x00>;
+def : SVEPRFM<"pldl1strm", 0x01>;
+def : SVEPRFM<"pldl2keep", 0x02>;
+def : SVEPRFM<"pldl2strm", 0x03>;
+def : SVEPRFM<"pldl3keep", 0x04>;
+def : SVEPRFM<"pldl3strm", 0x05>;
+def : SVEPRFM<"pstl1keep", 0x08>;
+def : SVEPRFM<"pstl1strm", 0x09>;
+def : SVEPRFM<"pstl2keep", 0x0a>;
+def : SVEPRFM<"pstl2strm", 0x0b>;
+def : SVEPRFM<"pstl3keep", 0x0c>;
+def : SVEPRFM<"pstl3strm", 0x0d>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Predicate patterns
+//===----------------------------------------------------------------------===//
+
+class SVEPREDPAT<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : SVEPREDPAT<"pow2", 0x00>;
+def : SVEPREDPAT<"vl1", 0x01>;
+def : SVEPREDPAT<"vl2", 0x02>;
+def : SVEPREDPAT<"vl3", 0x03>;
+def : SVEPREDPAT<"vl4", 0x04>;
+def : SVEPREDPAT<"vl5", 0x05>;
+def : SVEPREDPAT<"vl6", 0x06>;
+def : SVEPREDPAT<"vl7", 0x07>;
+def : SVEPREDPAT<"vl8", 0x08>;
+def : SVEPREDPAT<"vl16", 0x09>;
+def : SVEPREDPAT<"vl32", 0x0a>;
+def : SVEPREDPAT<"vl64", 0x0b>;
+def : SVEPREDPAT<"vl128", 0x0c>;
+def : SVEPREDPAT<"vl256", 0x0d>;
+def : SVEPREDPAT<"mul4", 0x1d>;
+def : SVEPREDPAT<"mul3", 0x1e>;
+def : SVEPREDPAT<"all", 0x1f>;
+
+//===----------------------------------------------------------------------===//
+// Exact FP Immediates.
+//
+// These definitions are used to create a lookup table with FP Immediates that
+// is used for a few instructions that only accept a limited set of exact FP
+// immediate values.
+//===----------------------------------------------------------------------===//
+class ExactFPImm<string name, string repr, bits<4> enum > : SearchableTable {
+ let SearchableFields = ["Enum", "Repr"];
+ let EnumValueField = "Enum";
+
+ string Name = name;
+ bits<4> Enum = enum;
+ string Repr = repr;
+}
+
+def : ExactFPImm<"zero", "0.0", 0x0>;
+def : ExactFPImm<"half", "0.5", 0x1>;
+def : ExactFPImm<"one", "1.0", 0x2>;
+def : ExactFPImm<"two", "2.0", 0x3>;
+
+//===----------------------------------------------------------------------===//
// PState instruction options.
//===----------------------------------------------------------------------===//
@@ -197,7 +295,9 @@ def : PState<"PAN", 0b00100>;
// v8.2a "User Access Override" extension-specific PStates
let Requires = [{ {AArch64::HasV8_2aOps} }] in
def : PState<"UAO", 0b00011>;
-
+// v8.4a timing insensitivity of data processing instructions
+let Requires = [{ {AArch64::HasV8_4aOps} }] in
+def : PState<"DIT", 0b11010>;
//===----------------------------------------------------------------------===//
// PSB instruction options.
@@ -230,6 +330,7 @@ class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
bit NeedsReg = needsreg;
+ code Requires = [{ {} }];
}
def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
@@ -265,6 +366,59 @@ def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
+// Armv8.4-A Outer Shareable TLB Maintenance instructions:
+let Requires = [{ {AArch64::HasV8_4aOps} }] in {
+// op1 CRn CRm op2
+def : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>;
+def : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>;
+def : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>;
+def : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>;
+def : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>;
+def : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>;
+def : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>;
+def : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>;
+def : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>;
+def : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>;
+def : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>;
+def : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>;
+def : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>;
+def : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>;
+def : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>;
+def : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>;
+
+// Armv8.4-A TLB Range Maintenance instructions:
+// op1 CRn CRm op2
+def : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>;
+def : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>;
+def : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>;
+def : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>;
+def : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>;
+def : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>;
+def : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>;
+def : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>;
+def : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>;
+def : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>;
+def : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>;
+def : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>;
+def : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>;
+def : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>;
+def : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>;
+def : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>;
+def : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>;
+def : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>;
+def : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>;
+def : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>;
+def : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>;
+def : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>;
+def : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>;
+def : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>;
+def : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>;
+def : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>;
+def : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>;
+def : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>;
+def : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>;
+def : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>;
+}
//===----------------------------------------------------------------------===//
// MRS/MSR (system register read/write) instruction options.
@@ -420,7 +574,7 @@ def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>;
def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>;
def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>;
def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>;
-def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>;
+def : ROSysReg<"ICH_ELRSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>;
// v8.1a "Limited Ordering Regions" extension-specific system register
// Op0 Op1 CRn CRm Op2
@@ -1037,6 +1191,126 @@ def : RWSysReg<"APGAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b000>;
def : RWSysReg<"APGAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b001>;
}
+let Requires = [{ {AArch64::HasV8_4aOps} }] in {
+
+// v8.4a "Virtualization secure second stage translation" registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"VSTCR_EL2" , 0b11, 0b100, 0b0010, 0b0110, 0b010>;
+def : RWSysReg<"VSTTBR_EL2", 0b11, 0b100, 0b0010, 0b0110, 0b000>;
+
+// v8.4a "Virtualization timer" registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"CNTHVS_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0100, 0b000>;
+def : RWSysReg<"CNTHVS_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0100, 0b010>;
+def : RWSysReg<"CNTHVS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0100, 0b001>;
+def : RWSysReg<"CNTHPS_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b000>;
+def : RWSysReg<"CNTHPS_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b010>;
+def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>;
+
+// v8.4a "Virtualization debug state" registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>;
+
+// v8.4a RAS registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>;
+def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>;
+def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>;
+def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>;
+def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>;
+def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>;
+
+// v8.4a MPAM registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"MPAM0_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b001>;
+def : RWSysReg<"MPAM1_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM2_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM3_EL3", 0b11, 0b110, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM1_EL12", 0b11, 0b101, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAMHCR_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b000>;
+def : RWSysReg<"MPAMVPMV_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b001>;
+def : RWSysReg<"MPAMVPM0_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b000>;
+def : RWSysReg<"MPAMVPM1_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b001>;
+def : RWSysReg<"MPAMVPM2_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b010>;
+def : RWSysReg<"MPAMVPM3_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b011>;
+def : RWSysReg<"MPAMVPM4_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b100>;
+def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b101>;
+def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>;
+def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>;
+def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>;
+
+// v8.4a Activity monitor registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>;
+def : ROSysReg<"AMCFGR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b001>;
+def : ROSysReg<"AMCGCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b010>;
+def : RWSysReg<"AMUSERENR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b011>;
+def : RWSysReg<"AMCNTENCLR0_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b100>;
+def : RWSysReg<"AMCNTENSET0_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b101>;
+def : RWSysReg<"AMEVCNTR00_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b000>;
+def : RWSysReg<"AMEVCNTR01_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b001>;
+def : RWSysReg<"AMEVCNTR02_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b010>;
+def : RWSysReg<"AMEVCNTR03_EL0", 0b11, 0b011, 0b1101, 0b0100, 0b011>;
+def : ROSysReg<"AMEVTYPER00_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b000>;
+def : ROSysReg<"AMEVTYPER01_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b001>;
+def : ROSysReg<"AMEVTYPER02_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b010>;
+def : ROSysReg<"AMEVTYPER03_EL0", 0b11, 0b011, 0b1101, 0b0110, 0b011>;
+def : RWSysReg<"AMCNTENCLR1_EL0", 0b11, 0b011, 0b1101, 0b0011, 0b000>;
+def : RWSysReg<"AMCNTENSET1_EL0", 0b11, 0b011, 0b1101, 0b0011, 0b001>;
+def : RWSysReg<"AMEVCNTR10_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b000>;
+def : RWSysReg<"AMEVCNTR11_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b001>;
+def : RWSysReg<"AMEVCNTR12_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b010>;
+def : RWSysReg<"AMEVCNTR13_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b011>;
+def : RWSysReg<"AMEVCNTR14_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b100>;
+def : RWSysReg<"AMEVCNTR15_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b101>;
+def : RWSysReg<"AMEVCNTR16_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b110>;
+def : RWSysReg<"AMEVCNTR17_EL0", 0b11, 0b011, 0b1101, 0b1100, 0b111>;
+def : RWSysReg<"AMEVCNTR18_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b000>;
+def : RWSysReg<"AMEVCNTR19_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b001>;
+def : RWSysReg<"AMEVCNTR110_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b010>;
+def : RWSysReg<"AMEVCNTR111_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b011>;
+def : RWSysReg<"AMEVCNTR112_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b100>;
+def : RWSysReg<"AMEVCNTR113_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b101>;
+def : RWSysReg<"AMEVCNTR114_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b110>;
+def : RWSysReg<"AMEVCNTR115_EL0", 0b11, 0b011, 0b1101, 0b1101, 0b111>;
+def : RWSysReg<"AMEVTYPER10_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b000>;
+def : RWSysReg<"AMEVTYPER11_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b001>;
+def : RWSysReg<"AMEVTYPER12_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b010>;
+def : RWSysReg<"AMEVTYPER13_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b011>;
+def : RWSysReg<"AMEVTYPER14_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b100>;
+def : RWSysReg<"AMEVTYPER15_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b101>;
+def : RWSysReg<"AMEVTYPER16_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b110>;
+def : RWSysReg<"AMEVTYPER17_EL0", 0b11, 0b011, 0b1101, 0b1110, 0b111>;
+def : RWSysReg<"AMEVTYPER18_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b000>;
+def : RWSysReg<"AMEVTYPER19_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b001>;
+def : RWSysReg<"AMEVTYPER110_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b010>;
+def : RWSysReg<"AMEVTYPER111_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b011>;
+def : RWSysReg<"AMEVTYPER112_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b100>;
+def : RWSysReg<"AMEVTYPER113_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b101>;
+def : RWSysReg<"AMEVTYPER114_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b110>;
+def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>;
+
+// v8.4a Trace Extension registers
+//
+// Please note that the 8.4 spec also defines these registers:
+// TRCIDR1, ID_DFR0_EL1, ID_AA64DFR0_EL1, MDSCR_EL1, MDCR_EL2, and MDCR_EL3,
+// but they are already defined above.
+//
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRFCR_EL1", 0b11, 0b000, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>;
+
+// v8.4a Timing insensitivity of data processing instructions
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"DIT", 0b11, 0b011, 0b0100, 0b0010, 0b101>;
+
+// v8.4a Enhanced Support for Nested Virtualization
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"VNCR_EL2", 0b11, 0b100, 0b0010, 0b0010, 0b000>;
+
+} // HasV8_4aOps
+
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcCyclone} }] in
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 5d00dc58a5ab..01a997e5aed7 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -36,6 +35,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include <memory>
@@ -243,6 +243,18 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
getEffectiveCodeModel(TT, CM, JIT), OL),
TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) {
initAsmInfo();
+
+ if (TT.isOSBinFormatMachO()) {
+ this->Options.TrapUnreachable = true;
+ this->Options.NoTrapAfterNoreturn = true;
+ }
+
+ // Enable GlobalISel at or below EnableGlobalISelAtO.
+ if (getOptLevel() <= EnableGlobalISelAtO)
+ setGlobalISel(true);
+
+ // AArch64 supports the MachineOutliner.
+ setMachineOutliner(true);
}
AArch64TargetMachine::~AArch64TargetMachine() = default;
@@ -340,8 +352,6 @@ public:
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
-
- bool isGlobalISelEnabled() const override;
};
} // end anonymous namespace
@@ -387,7 +397,7 @@ void AArch64PassConfig::addIRPasses() {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or
// multiple GEPs with single index.
- addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+ addPass(createSeparateConstOffsetFromGEPPass(true));
// Call EarlyCSE pass to find and remove subexpressions in the lowered
// result.
addPass(createEarlyCSEPass());
@@ -455,10 +465,6 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
return false;
}
-bool AArch64PassConfig::isGlobalISelEnabled() const {
- return TM->getOptLevel() <= EnableGlobalISelAtO;
-}
-
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
index f081d7caba67..9077eb7902fd 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
class AArch64TargetMachine;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1820ad959fcb..d75fef7b0171 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -38,7 +38,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
-/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
@@ -54,7 +54,7 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) {
return (64 - LZ + 15) / 16;
}
-/// \brief Calculate the cost of materializing the given constant.
+/// Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
@@ -277,7 +277,7 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// same as the second operand. In this case, we will generate a "long"
// version of the widening instruction.
if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
- if (I->getOpcode() == Cast->getOpcode() &&
+ if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
return 0;
}
@@ -493,32 +493,70 @@ int AArch64TTIImpl::getArithmeticInstrCost(
int ISD = TLI->InstructionOpcodeToISD(Opcode);
- if (ISD == ISD::SDIV &&
- Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
- Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
- // On AArch64, scalar signed division by constants power-of-two are
- // normally expanded to the sequence ADD + CMP + SELECT + SRA.
- // The OperandValue properties many not be same as that of previous
- // operation; conservatively assume OP_None.
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- return Cost;
- }
-
switch (ISD) {
default:
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo);
+ case ISD::SDIV:
+ if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+ // On AArch64, scalar signed division by a power-of-two constant is
+ // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+ // The OperandValue properties may not be the same as those of the
+ // previous operation; conservatively assume OP_None.
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ return Cost;
+ }
+ LLVM_FALLTHROUGH;
+ case ISD::UDIV:
+ if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
+ auto VT = TLI->getValueType(DL, Ty);
+ if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
+ // Vector signed division by a constant is expanded to the
+ // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
+ // to MULHS + SUB + SRL + ADD + SRL.
+ int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
+ Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
+ Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
+ Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
+ }
+ }
+
+ Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ if (Ty->isVectorTy()) {
+ // On AArch64, vector divisions are not supported natively and are
+ // expanded into scalar divisions of each pair of elements.
+ Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
+ Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
+ Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ // TODO: if one of the arguments is scalar, then it's not necessary to
+ // double the cost of handling the vector elements.
+ Cost += Cost;
+ }
+ return Cost;
+
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
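A self-contained sketch of the cost arithmetic introduced in the hunk above, using placeholder per-instruction costs rather than values from the AArch64 cost tables; the two helper names are illustrative, not LLVM APIs.

// Scalar signed division by a power of two is priced as the sum of the four
// instructions it expands to (the patch queries Add, Sub, Select and AShr).
static int sdivPow2Cost(int Add, int Sub, int Select, int AShr) {
  return Add + Sub + Select + AShr;
}

// Vector division by a uniform constant, when a high-multiply is legal,
// is priced with the fixed formula used above.
static int divByUniformConstCost(int Mul, int Add, int Shr) {
  return Mul * 2 + Add * 2 + Shr * 2 + 1;
}
// With unit costs: sdivPow2Cost(1, 1, 1, 1) == 4 and
// divByUniformConstCost(1, 1, 1) == 7.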
@@ -596,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
- Ty->getVectorNumElements() < 8) {
- // We scalarize the loads/stores because there is not v.4b register and we
- // have to promote the elements to v.4h.
- unsigned NumVecElts = Ty->getVectorNumElements();
- unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
- // We generate 2 instructions per vector element.
- return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+ unsigned ProfitableNumElements;
+ if (Opcode == Instruction::Store)
+ // We use a custom trunc store lowering so v.4b should be profitable.
+ ProfitableNumElements = 4;
+ else
+ // We scalarize the loads because there is no v.4b register and we
+ // have to promote the elements to v.2.
+ ProfitableNumElements = 8;
+
+ if (Ty->getVectorNumElements() < ProfitableNumElements) {
+ unsigned NumVecElts = Ty->getVectorNumElements();
+ unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+ // We generate 2 instructions per vector element.
+ return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ }
}
return LT.first;
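A standalone restatement of the small-i8-vector branch above: i8 vectors narrower than the profitable width (4 elements for stores, which now have a custom truncating-store lowering, 8 for loads) are scalarized, and the model charges two instructions per element plus an amortization factor. The function below is an illustration of that arithmetic, not the LLVM implementation.

static unsigned smallI8VectorMemCost(unsigned NumVecElts, bool IsStore) {
  unsigned ProfitableNumElements = IsStore ? 4 : 8;
  if (NumVecElts >= ProfitableNumElements)
    return 0; // wide enough: not the scalarized case sketched here
  unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
  // Two instructions per element, further scaled for amortization.
  return NumVectorizableInstsToAmortize * NumVecElts * 2;
}
// e.g. a v2i8 load costs smallI8VectorMemCost(2, false) == 16, while a v4i8
// store falls outside the scalarized case and is charged normally.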
@@ -690,14 +736,14 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
};
int StridedLoads = countStridedLoads(L, SE);
- DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
- << " strided loads\n");
+ LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+ << " strided loads\n");
// Pick the largest power of 2 unroll count that won't result in too many
// strided loads.
if (StridedLoads) {
UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
- DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
- << '\n');
+ LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
+ << UP.MaxCount << '\n');
}
}
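The MaxCount clamp above picks the largest power of two that keeps the number of strided loads after unrolling within the prefetcher's budget. A self-contained sketch of that calculation, with a plain re-implementation of Log2_32 rather than the LLVM helper; the budget value in the example is assumed, not the option's default.

#include <cstdint>

static unsigned log2_32(uint32_t V) { // floor(log2), 0 for V <= 1
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}

static unsigned falkorMaxUnroll(unsigned MaxStridedLoads, unsigned StridedLoads) {
  return 1u << log2_32(MaxStridedLoads / StridedLoads);
}
// e.g. with a budget of 7 strided loads and 3 detected in the loop,
// falkorMaxUnroll(7, 3) == 2, so the loop is unrolled at most twice.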
@@ -868,3 +914,73 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
}
return false;
}
+
+int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
+ bool IsPairwiseForm) {
+
+ if (IsPairwiseForm)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ MVT MTy = LT.second;
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Horizontal adds can use the 'addv' instruction. We model the cost of these
+ // instructions as normal vector adds. This is the only arithmetic vector
+ // reduction operation for which we have an instruction.
+ static const CostTblEntry CostTblNoPairwise[]{
+ {ISD::ADD, MVT::v8i8, 1},
+ {ISD::ADD, MVT::v16i8, 1},
+ {ISD::ADD, MVT::v4i16, 1},
+ {ISD::ADD, MVT::v8i16, 1},
+ {ISD::ADD, MVT::v4i32, 1},
+ };
+
+ if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+}
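The table above keys on the ISD opcode and the legalized MVT: only non-pairwise integer ADD reductions on those five types take the 'addv' fast path, everything else falls back to the generic model. A self-contained sketch of that decision; type names are plain strings here purely for illustration.

#include <cstring>

// Returns the modelled cost, or -1 to signal a fallback to the generic model.
static int addvReductionCost(bool IsPairwise, const char *LegalizedVT,
                             int LegalizationFactor) {
  if (IsPairwise)
    return -1;
  static const char *const Supported[] = {"v8i8", "v16i8", "v4i16", "v8i16",
                                          "v4i32"};
  for (const char *VT : Supported)
    if (std::strcmp(VT, LegalizedVT) == 0)
      return LegalizationFactor * 1; // one addv per legalized part
  return -1;
}
// e.g. addvReductionCost(false, "v4i32", 1) == 1, while a pairwise form or a
// v2i64 reduction defers to BaseT::getArithmeticReductionCost.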
+
+int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ if (Kind == TTI::SK_Transpose || Kind == TTI::SK_Select ||
+ Kind == TTI::SK_PermuteSingleSrc) {
+ static const CostTblEntry ShuffleTbl[] = {
+ // Transpose shuffle kinds can be performed with 'trn1/trn2' and
+ // 'zip1/zip2' instructions.
+ { TTI::SK_Transpose, MVT::v8i8, 1 },
+ { TTI::SK_Transpose, MVT::v16i8, 1 },
+ { TTI::SK_Transpose, MVT::v4i16, 1 },
+ { TTI::SK_Transpose, MVT::v8i16, 1 },
+ { TTI::SK_Transpose, MVT::v2i32, 1 },
+ { TTI::SK_Transpose, MVT::v4i32, 1 },
+ { TTI::SK_Transpose, MVT::v2i64, 1 },
+ { TTI::SK_Transpose, MVT::v2f32, 1 },
+ { TTI::SK_Transpose, MVT::v4f32, 1 },
+ { TTI::SK_Transpose, MVT::v2f64, 1 },
+ // Select shuffle kinds.
+ // TODO: handle vXi8/vXi16.
+ { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
+ { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
+ { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
+ { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
+ { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
+ { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
+ // PermuteSingleSrc shuffle kinds.
+ // TODO: handle vXi8/vXi16.
+ { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
+ { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
+ { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
+ { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
+ };
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
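The per-entry cost in the shuffle table above is for one legalized shuffle; LT.first then scales it by the number of legal parts the type splits into. A small illustration of that arithmetic (the v8i32 figures are assumed for the example, not quoted from the table).

static int shuffleCost(int LegalizationFactor, int PerPartCost) {
  return LegalizationFactor * PerPartCost;
}
// e.g. if v8i32 legalizes to two v4i32 parts and a v4i32 transpose costs 1,
// the modelled cost is shuffleCost(2, 1) == 2.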
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 08c693ff38a8..c056a7d2428b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -166,6 +166,11 @@ public:
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
+
+ int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm);
+
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
/// @}
};
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index ac9ff51f69f1..a51c41d70915 100644
--- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -66,6 +66,12 @@ enum class RegKind {
SVEPredicateVector
};
+enum RegConstraintEqualityTy {
+ EqualsReg,
+ EqualsSuperReg,
+ EqualsSubReg
+};
+
class AArch64AsmParser : public MCTargetAsmParser {
private:
StringRef Mnemonic; ///< Instruction mnemonic.
@@ -85,19 +91,18 @@ private:
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
- int tryParseRegister();
- int tryMatchVectorRegister(StringRef &Kind, bool expected);
bool parseRegister(OperandVector &Operands);
bool parseSymbolicImmVal(const MCExpr *&ImmVal);
- bool parseVectorList(OperandVector &Operands);
+ bool parseNeonVectorList(OperandVector &Operands);
+ bool parseOptionalMulOperand(OperandVector &Operands);
bool parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode);
- bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands);
+ bool showMatchError(SMLoc Loc, unsigned ErrCode, uint64_t ErrorInfo,
+ OperandVector &Operands);
bool parseDirectiveArch(SMLoc L);
bool parseDirectiveCPU(SMLoc L);
- bool parseDirectiveWord(unsigned Size, SMLoc L);
bool parseDirectiveInst(SMLoc L);
bool parseDirectiveTLSDescCall(SMLoc L);
@@ -121,25 +126,36 @@ private:
/// }
- OperandMatchResultTy tryParseSVERegister(int &Reg, StringRef &Kind,
- RegKind MatchKind);
+ OperandMatchResultTy tryParseScalarRegister(unsigned &Reg);
+ OperandMatchResultTy tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
+ RegKind MatchKind);
OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+ template <bool IsSVEPrefetch = false>
OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+ template<bool AddFPZeroAsLiteral>
OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
- OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
+ OperandMatchResultTy tryParseImmWithOptionalShift(OperandVector &Operands);
OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
bool tryParseNeonVectorRegister(OperandVector &Operands);
+ OperandMatchResultTy tryParseVectorIndex(OperandVector &Operands);
OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
- template <bool ParseSuffix>
+ template <bool ParseShiftExtend,
+ RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg>
+ OperandMatchResultTy tryParseGPROperand(OperandVector &Operands);
+ template <bool ParseShiftExtend, bool ParseSuffix>
OperandMatchResultTy tryParseSVEDataVector(OperandVector &Operands);
OperandMatchResultTy tryParseSVEPredicateVector(OperandVector &Operands);
+ template <RegKind VectorKind>
+ OperandMatchResultTy tryParseVectorList(OperandVector &Operands,
+ bool ExpectMatch = false);
+ OperandMatchResultTy tryParseSVEPattern(OperandVector &Operands);
public:
enum AArch64MatchResultTy {
@@ -158,10 +174,19 @@ public:
if (S.getTargetStreamer() == nullptr)
new AArch64TargetStreamer(S);
+ // Alias .hword/.word/.xword to the target-independent .2byte/.4byte/.8byte
+ // directives as they have the same form and semantics:
+ /// ::= (.hword | .word | .xword ) [ expression (, expression)* ]
+ Parser.addAliasForDirective(".hword", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+ Parser.addAliasForDirective(".xword", ".8byte");
+
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
+ bool regsEqual(const MCParsedAsmOperand &Op1,
+ const MCParsedAsmOperand &Op2) const override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
@@ -204,18 +229,45 @@ private:
bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
};
+ // Separate shift/extend operand.
+ struct ShiftExtendOp {
+ AArch64_AM::ShiftExtendType Type;
+ unsigned Amount;
+ bool HasExplicitAmount;
+ };
+
struct RegOp {
unsigned RegNum;
RegKind Kind;
-
int ElementWidth;
+
+ // The register may be allowed as a different register class,
+ // e.g. for GPR64as32 or GPR32as64.
+ RegConstraintEqualityTy EqualityTy;
+
+ // In some cases the shift/extend needs to be explicitly parsed together
+ // with the register, rather than as a separate operand. This is needed
+ // for addressing modes where the instruction as a whole dictates the
+ // scaling/extend, rather than specific bits in the instruction.
+ // By parsing them as a single operand, we avoid the need to pass an
+ // extra operand in all CodeGen patterns (because all operands need to
+ // have an associated value), and we avoid the need to update TableGen to
+ // accept operands that have no associated bits in the instruction.
+ //
+ // An added benefit of parsing them together is that the assembler
+ // can give a sensible diagnostic if the scaling is not correct.
+ //
+ // The default is 'lsl #0' (HasExplicitAmount = false) if no
+ // ShiftExtend is specified.
+ ShiftExtendOp ShiftExtend;
};
struct VectorListOp {
unsigned RegNum;
unsigned Count;
unsigned NumElements;
- unsigned ElementKind;
+ unsigned ElementWidth;
+ RegKind RegisterKind;
};
struct VectorIndexOp {
@@ -236,7 +288,8 @@ private:
};
struct FPImmOp {
- unsigned Val; // Encoded 8-bit representation.
+ uint64_t Val; // APFloat value bitcasted to uint64_t.
+ bool IsExact; // describes whether parsed value was exact.
};
struct BarrierOp {
@@ -269,12 +322,6 @@ private:
unsigned Val;
};
- struct ShiftExtendOp {
- AArch64_AM::ShiftExtendType Type;
- unsigned Amount;
- bool HasExplicitAmount;
- };
-
struct ExtendOp {
unsigned Val;
};
@@ -388,9 +435,14 @@ public:
return CondCode.Code;
}
- unsigned getFPImm() const {
- assert(Kind == k_FPImm && "Invalid access!");
- return FPImm.Val;
+ APFloat getFPImm() const {
+ assert (Kind == k_FPImm && "Invalid access!");
+ return APFloat(APFloat::IEEEdouble(), APInt(64, FPImm.Val, true));
+ }
+
+ bool getFPImmIsExact() const {
+ assert (Kind == k_FPImm && "Invalid access!");
+ return FPImm.IsExact;
}
unsigned getBarrier() const {
@@ -408,6 +460,11 @@ public:
return Reg.RegNum;
}
+ RegConstraintEqualityTy getRegEqualityTy() const {
+ assert(Kind == k_Register && "Invalid access!");
+ return Reg.EqualityTy;
+ }
+
unsigned getVectorListStart() const {
assert(Kind == k_VectorList && "Invalid access!");
return VectorList.RegNum;
@@ -454,66 +511,88 @@ public:
}
AArch64_AM::ShiftExtendType getShiftExtendType() const {
- assert(Kind == k_ShiftExtend && "Invalid access!");
- return ShiftExtend.Type;
+ if (Kind == k_ShiftExtend)
+ return ShiftExtend.Type;
+ if (Kind == k_Register)
+ return Reg.ShiftExtend.Type;
+ llvm_unreachable("Invalid access!");
}
unsigned getShiftExtendAmount() const {
- assert(Kind == k_ShiftExtend && "Invalid access!");
- return ShiftExtend.Amount;
+ if (Kind == k_ShiftExtend)
+ return ShiftExtend.Amount;
+ if (Kind == k_Register)
+ return Reg.ShiftExtend.Amount;
+ llvm_unreachable("Invalid access!");
}
bool hasShiftExtendAmount() const {
- assert(Kind == k_ShiftExtend && "Invalid access!");
- return ShiftExtend.HasExplicitAmount;
+ if (Kind == k_ShiftExtend)
+ return ShiftExtend.HasExplicitAmount;
+ if (Kind == k_Register)
+ return Reg.ShiftExtend.HasExplicitAmount;
+ llvm_unreachable("Invalid access!");
}
bool isImm() const override { return Kind == k_Immediate; }
bool isMem() const override { return false; }
- bool isSImm9() const {
+
+ bool isUImm6() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
- return (Val >= -256 && Val < 256);
+ return (Val >= 0 && Val < 64);
}
- bool isSImm10s8() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= -4096 && Val < 4089 && (Val & 7) == 0);
+
+ template <int Width> bool isSImm() const { return isSImmScaled<Width, 1>(); }
+
+ template <int Bits, int Scale> DiagnosticPredicate isSImmScaled() const {
+ return isImmScaled<Bits, Scale>(true);
}
- bool isSImm7s4() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
+
+ template <int Bits, int Scale> DiagnosticPredicate isUImmScaled() const {
+ return isImmScaled<Bits, Scale>(false);
}
- bool isSImm7s8() const {
+
+ template <int Bits, int Scale>
+ DiagnosticPredicate isImmScaled(bool Signed) const {
if (!isImm())
- return false;
+ return DiagnosticPredicateTy::NoMatch;
+
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
- return false;
+ return DiagnosticPredicateTy::NoMatch;
+
+ int64_t MinVal, MaxVal;
+ if (Signed) {
+ int64_t Shift = Bits - 1;
+ MinVal = (int64_t(1) << Shift) * -Scale;
+ MaxVal = ((int64_t(1) << Shift) - 1) * Scale;
+ } else {
+ MinVal = 0;
+ MaxVal = ((int64_t(1) << Bits) - 1) * Scale;
+ }
+
int64_t Val = MCE->getValue();
- return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
+ if (Val >= MinVal && Val <= MaxVal && (Val % Scale) == 0)
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
}
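The templated predicate above folds the removed sImm7/sImm9/sImm10 checks into one range test: a <Bits, Scale> immediate must be a multiple of Scale inside the signed (or unsigned) range implied by Bits. A standalone restatement of that test, for illustration only:

#include <cstdint>

static bool isImmScaled(int64_t Val, int Bits, int Scale, bool Signed) {
  int64_t MinVal, MaxVal;
  if (Signed) {
    int64_t Shift = Bits - 1;
    MinVal = (int64_t(1) << Shift) * -Scale;
    MaxVal = ((int64_t(1) << Shift) - 1) * Scale;
  } else {
    MinVal = 0;
    MaxVal = ((int64_t(1) << Bits) - 1) * Scale;
  }
  return Val >= MinVal && Val <= MaxVal && (Val % Scale) == 0;
}
// e.g. the old isSImm7s16 predicate corresponds to isImmScaled(Val, 7, 16, true):
// multiples of 16 in [-1024, 1008].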
- bool isSImm7s16() const {
+
+ DiagnosticPredicate isSVEPattern() const {
if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ return DiagnosticPredicateTy::NoMatch;
+ auto *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
- return false;
+ return DiagnosticPredicateTy::NoMatch;
int64_t Val = MCE->getValue();
- return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
+ if (Val >= 0 && Val < 32)
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
}
bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const {
@@ -535,7 +614,9 @@ public:
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
- ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) {
+ ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_HI12) {
// Note that we don't range-check the addend. It's adjusted modulo page
// size when converted, so there is no "out of range" condition when using
// @pageoff.
@@ -572,48 +653,47 @@ public:
return (Val >= N && Val <= M);
}
- bool isLogicalImm32() const {
+ // NOTE: Also used for isLogicalImmNot as anything that can be represented as
+ // a logical immediate can always be represented when inverted.
+ template <typename T>
+ bool isLogicalImm() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
+
int64_t Val = MCE->getValue();
- if (Val >> 32 != 0 && Val >> 32 != ~0LL)
+ int64_t SVal = typename std::make_signed<T>::type(Val);
+ int64_t UVal = typename std::make_unsigned<T>::type(Val);
+ if (Val != SVal && Val != UVal)
return false;
- Val &= 0xFFFFFFFF;
- return AArch64_AM::isLogicalImmediate(Val, 32);
- }
- bool isLogicalImm64() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
+ return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8);
}
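The width check added above is the part that lets one template replace the separate 32-bit and 64-bit predicates: before asking whether a value is a valid logical immediate for a T-sized operation, it verifies the value is representable in T either as a signed or as an unsigned number. A standalone sketch of just that check (the logical-immediate encoding itself is not reproduced here):

#include <cstdint>
#include <type_traits>

template <typename T> static bool fitsInT(int64_t Val) {
  int64_t SVal = typename std::make_signed<T>::type(Val);
  int64_t UVal = typename std::make_unsigned<T>::type(Val);
  return Val == SVal || Val == UVal;
}
// fitsInT<int32_t>(0xFFFFFFFF) and fitsInT<int32_t>(-1) are both true;
// fitsInT<int32_t>(0x1FFFFFFFF) is false.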
- bool isLogicalImm32Not() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
- return AArch64_AM::isLogicalImmediate(Val, 32);
- }
+ bool isShiftedImm() const { return Kind == k_ShiftedImm; }
- bool isLogicalImm64Not() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64);
- }
+ /// Returns the immediate value as a pair of (imm, shift) if the immediate is
+ /// a shifted immediate with shift amount 'Shift' or '0', or if it is an
+ /// unshifted immediate that is a multiple of 1 << 'Shift'.
+ template <unsigned Width>
+ Optional<std::pair<int64_t, unsigned> > getShiftedVal() const {
+ if (isShiftedImm() && Width == getShiftedImmShift())
+ if (auto *CE = dyn_cast<MCConstantExpr>(getShiftedImmVal()))
+ return std::make_pair(CE->getValue(), Width);
+
+ if (isImm())
+ if (auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
+ int64_t Val = CE->getValue();
+ if ((Val != 0) && (uint64_t(Val >> Width) << Width) == uint64_t(Val))
+ return std::make_pair(Val >> Width, Width);
+ else
+ return std::make_pair(Val, 0u);
+ }
- bool isShiftedImm() const { return Kind == k_ShiftedImm; }
+ return {};
+ }
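A self-contained sketch of the decomposition getShiftedVal<12> performs on a plain constant: a non-zero value that is an exact multiple of 1 << 12 is reported as (imm >> 12, 12), anything else as (imm, 0). The shifted-immediate fast path is omitted; the helper name is illustrative.

#include <cstdint>
#include <utility>

static std::pair<int64_t, unsigned> shiftedVal12(int64_t Val) {
  const unsigned Width = 12;
  if (Val != 0 && (uint64_t(Val >> Width) << Width) == uint64_t(Val))
    return {Val >> Width, Width};
  return {Val, 0};
}
// e.g. 0x5000 -> (0x5, 12); 0x5001 -> (0x5001, 0).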
bool isAddSubImm() const {
if (!isShiftedImm() && !isImm())
@@ -646,12 +726,14 @@ public:
|| ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
|| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
|| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
- || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12;
+ || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12
+ || ELFRefKind == AArch64MCExpr::VK_SECREL_HI12
+ || ELFRefKind == AArch64MCExpr::VK_SECREL_LO12;
}
- // If it's a constant, it should be a real immediate in range:
- if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
- return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+ // If it's a constant, it should be a real immediate in range.
+ if (auto ShiftedVal = getShiftedVal<12>())
+ return ShiftedVal->first >= 0 && ShiftedVal->first <= 0xfff;
// If it's an expression, we hope for the best and let the fixup/relocation
// code deal with it.
@@ -662,20 +744,56 @@ public:
if (!isShiftedImm() && !isImm())
return false;
- const MCExpr *Expr;
+ // Otherwise it should be a real negative immediate in range.
+ if (auto ShiftedVal = getShiftedVal<12>())
+ return ShiftedVal->first < 0 && -ShiftedVal->first <= 0xfff;
- // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
- if (isShiftedImm()) {
- unsigned Shift = ShiftedImm.ShiftAmount;
- Expr = ShiftedImm.Val;
- if (Shift != 0 && Shift != 12)
- return false;
- } else
- Expr = getImm();
+ return false;
+ }
+
+ // Signed value in the range -128 to +127. For element widths of
+ // 16 bits or higher it may also be a signed multiple of 256 in the
+ // range -32768 to +32512.
+ // For element-width of 8 bits a range of -128 to 255 is accepted,
+ // since a copy of a byte can be either signed/unsigned.
+ template <typename T>
+ DiagnosticPredicate isSVECpyImm() const {
+ if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
+ return DiagnosticPredicateTy::NoMatch;
+
+ bool IsByte =
+ std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ if (auto ShiftedImm = getShiftedVal<8>())
+ if (!(IsByte && ShiftedImm->second) &&
+ AArch64_AM::isSVECpyImm<T>(uint64_t(ShiftedImm->first)
+ << ShiftedImm->second))
+ return DiagnosticPredicateTy::Match;
- // Otherwise it should be a real negative immediate in range:
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
- return CE != nullptr && CE->getValue() < 0 && -CE->getValue() <= 0xfff;
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ // Unsigned value in the range 0 to 255. For element widths of
+ // 16 bits or higher it may also be a multiple of 256 in the
+ // range 0 to 65280.
+ template <typename T> DiagnosticPredicate isSVEAddSubImm() const {
+ if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
+ return DiagnosticPredicateTy::NoMatch;
+
+ bool IsByte =
+ std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ if (auto ShiftedImm = getShiftedVal<8>())
+ if (!(IsByte && ShiftedImm->second) &&
+ AArch64_AM::isSVEAddSubImm<T>(ShiftedImm->first
+ << ShiftedImm->second))
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <typename T> DiagnosticPredicate isSVEPreferredLogicalImm() const {
+ if (isLogicalImm<T>() && !isSVECpyImm<T>())
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NoMatch;
}
bool isCondCode() const { return Kind == k_CondCode; }
@@ -792,7 +910,11 @@ public:
return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth);
}
- bool isFPImm() const { return Kind == k_FPImm; }
+ bool isFPImm() const {
+ return Kind == k_FPImm &&
+ AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt()) != -1;
+ }
+
bool isBarrier() const { return Kind == k_Barrier; }
bool isSysReg() const { return Kind == k_SysReg; }
@@ -810,6 +932,7 @@ public:
bool isSystemPStateFieldWithImm0_1() const {
if (!isSysReg()) return false;
return (SysReg.PStateField == AArch64PState::PAN ||
+ SysReg.PStateField == AArch64PState::DIT ||
SysReg.PStateField == AArch64PState::UAO);
}
@@ -840,6 +963,8 @@ public:
RegKind RK;
switch (Class) {
case AArch64::ZPRRegClassID:
+ case AArch64::ZPR_3bRegClassID:
+ case AArch64::ZPR_4bRegClassID:
RK = RegKind::SVEDataVector;
break;
case AArch64::PPRRegClassID:
@@ -854,10 +979,56 @@ public:
AArch64MCRegisterClasses[Class].contains(getReg());
}
+ template <unsigned Class> bool isFPRasZPR() const {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
+ AArch64MCRegisterClasses[Class].contains(getReg());
+ }
+
+ template <int ElementWidth, unsigned Class>
+ DiagnosticPredicate isSVEPredicateVectorRegOfWidth() const {
+ if (Kind != k_Register || Reg.Kind != RegKind::SVEPredicateVector)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (isSVEVectorReg<Class>() &&
+ (ElementWidth == 0 || Reg.ElementWidth == ElementWidth))
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
template <int ElementWidth, unsigned Class>
- bool isSVEVectorRegOfWidth() const {
- return isSVEVectorReg<Class>() &&
- (ElementWidth == -1 || Reg.ElementWidth == ElementWidth);
+ DiagnosticPredicate isSVEDataVectorRegOfWidth() const {
+ if (Kind != k_Register || Reg.Kind != RegKind::SVEDataVector)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (isSVEVectorReg<Class>() &&
+ (ElementWidth == 0 || Reg.ElementWidth == ElementWidth))
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <int ElementWidth, unsigned Class,
+ AArch64_AM::ShiftExtendType ShiftExtendTy, int ShiftWidth,
+ bool ShiftWidthAlwaysSame>
+ DiagnosticPredicate isSVEDataVectorRegWithShiftExtend() const {
+ auto VectorMatch = isSVEDataVectorRegOfWidth<ElementWidth, Class>();
+ if (!VectorMatch.isMatch())
+ return DiagnosticPredicateTy::NoMatch;
+
+ // Give a more specific diagnostic when the user has explicitly typed in
+ // a shift-amount that does not match what is expected, but for which
+ // there is also an unscaled addressing mode (e.g. sxtw/uxtw).
+ bool MatchShift = getShiftExtendAmount() == Log2_32(ShiftWidth / 8);
+ if (!MatchShift && (ShiftExtendTy == AArch64_AM::UXTW ||
+ ShiftExtendTy == AArch64_AM::SXTW) &&
+ !ShiftWidthAlwaysSame && hasShiftExtendAmount() && ShiftWidth == 8)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (MatchShift && ShiftExtendTy == getShiftExtendType())
+ return DiagnosticPredicateTy::Match;
+
+ return DiagnosticPredicateTy::NearMatch;
}
bool isGPR32as64() const {
@@ -865,6 +1036,11 @@ public:
AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
}
+ bool isGPR64as32() const {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
+ AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum);
+ }
+
bool isWSeqPair() const {
return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
@@ -877,58 +1053,65 @@ public:
Reg.RegNum);
}
- bool isGPR64sp0() const {
- return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
- AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
- }
-
template<int64_t Angle, int64_t Remainder>
- bool isComplexRotation() const {
- if (!isImm()) return false;
+ DiagnosticPredicate isComplexRotation() const {
+ if (!isImm()) return DiagnosticPredicateTy::NoMatch;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
+ if (!CE) return DiagnosticPredicateTy::NoMatch;
uint64_t Value = CE->getValue();
- return (Value % Angle == Remainder && Value <= 270);
+ if (Value % Angle == Remainder && Value <= 270)
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <unsigned RegClassID> bool isGPR64() const {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
+ AArch64MCRegisterClasses[RegClassID].contains(getReg());
+ }
+
+ template <unsigned RegClassID, int ExtWidth>
+ DiagnosticPredicate isGPR64WithShiftExtend() const {
+ if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (isGPR64<RegClassID>() && getShiftExtendType() == AArch64_AM::LSL &&
+ getShiftExtendAmount() == Log2_32(ExtWidth / 8))
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
}
/// Is this a vector list with the type implicit (presumably attached to the
/// instruction itself)?
- template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+ template <RegKind VectorKind, unsigned NumRegs>
+ bool isImplicitlyTypedVectorList() const {
return Kind == k_VectorList && VectorList.Count == NumRegs &&
- !VectorList.ElementKind;
+ VectorList.NumElements == 0 &&
+ VectorList.RegisterKind == VectorKind;
}
- template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+ template <RegKind VectorKind, unsigned NumRegs, unsigned NumElements,
+ unsigned ElementWidth>
bool isTypedVectorList() const {
if (Kind != k_VectorList)
return false;
if (VectorList.Count != NumRegs)
return false;
- if (VectorList.ElementKind != ElementKind)
+ if (VectorList.RegisterKind != VectorKind)
+ return false;
+ if (VectorList.ElementWidth != ElementWidth)
return false;
return VectorList.NumElements == NumElements;
}
- bool isVectorIndex1() const {
- return Kind == k_VectorIndex && VectorIndex.Val == 1;
- }
-
- bool isVectorIndexB() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 16;
- }
-
- bool isVectorIndexH() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 8;
- }
-
- bool isVectorIndexS() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 4;
- }
-
- bool isVectorIndexD() const {
- return Kind == k_VectorIndex && VectorIndex.Val < 2;
+ template <int Min, int Max>
+ DiagnosticPredicate isVectorIndex() const {
+ if (Kind != k_VectorIndex)
+ return DiagnosticPredicateTy::NoMatch;
+ if (VectorIndex.Val >= Min && VectorIndex.Val <= Max)
+ return DiagnosticPredicateTy::Match;
+ return DiagnosticPredicateTy::NearMatch;
}
bool isToken() const override { return Kind == k_Token; }
@@ -949,6 +1132,39 @@ public:
ST == AArch64_AM::ASR || ST == AArch64_AM::ROR ||
ST == AArch64_AM::MSL);
}
+
+ template <unsigned ImmEnum> DiagnosticPredicate isExactFPImm() const {
+ if (Kind != k_FPImm)
+ return DiagnosticPredicateTy::NoMatch;
+
+ if (getFPImmIsExact()) {
+ // Lookup the immediate from table of supported immediates.
+ auto *Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmEnum);
+ assert(Desc && "Unknown enum value");
+
+ // Calculate its FP value.
+ APFloat RealVal(APFloat::IEEEdouble());
+ if (RealVal.convertFromString(Desc->Repr, APFloat::rmTowardZero) !=
+ APFloat::opOK)
+ llvm_unreachable("FP immediate is not exact");
+
+ if (getFPImm().bitwiseIsEqual(RealVal))
+ return DiagnosticPredicateTy::Match;
+ }
+
+ return DiagnosticPredicateTy::NearMatch;
+ }
+
+ template <unsigned ImmA, unsigned ImmB>
+ DiagnosticPredicate isExactFPImm() const {
+ DiagnosticPredicate Res = DiagnosticPredicateTy::NoMatch;
+ if ((Res = isExactFPImm<ImmA>()))
+ return DiagnosticPredicateTy::Match;
+ if ((Res = isExactFPImm<ImmB>()))
+ return DiagnosticPredicateTy::Match;
+ return Res;
+ }
+
bool isExtend() const {
if (!isShiftExtend())
return false;
@@ -1081,7 +1297,7 @@ public:
// ambiguity in the matcher.
template<int Width>
bool isSImm9OffsetFB() const {
- return isSImm9() && !isUImm12Offset<Width / 8>();
+ return isSImm<9>() && !isUImm12Offset<Width / 8>();
}
bool isAdrpLabel() const {
@@ -1143,6 +1359,33 @@ public:
Inst.addOperand(MCOperand::createReg(Reg));
}
+ void addGPR64as32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(
+ AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(getReg()));
+
+ const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+ uint32_t Reg = RI->getRegClass(AArch64::GPR64RegClassID).getRegister(
+ RI->getEncodingValue(getReg()));
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ template <int Width>
+ void addFPRasZPRRegOperands(MCInst &Inst, unsigned N) const {
+ unsigned Base;
+ switch (Width) {
+ case 8: Base = AArch64::B0; break;
+ case 16: Base = AArch64::H0; break;
+ case 32: Base = AArch64::S0; break;
+ case 64: Base = AArch64::D0; break;
+ case 128: Base = AArch64::Q0; break;
+ default:
+ llvm_unreachable("Unsupported width");
+ }
+ Inst.addOperand(MCOperand::createReg(AArch64::Z0 + getReg() - Base));
+ }
+
void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
assert(
@@ -1162,55 +1405,45 @@ public:
Inst.addOperand(MCOperand::createReg(getReg()));
}
- template <unsigned NumRegs>
- void addVectorList64Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- static const unsigned FirstRegs[] = { AArch64::D0,
- AArch64::D0_D1,
- AArch64::D0_D1_D2,
- AArch64::D0_D1_D2_D3 };
- unsigned FirstReg = FirstRegs[NumRegs - 1];
-
- Inst.addOperand(
- MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
- }
-
- template <unsigned NumRegs>
- void addVectorList128Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- static const unsigned FirstRegs[] = { AArch64::Q0,
- AArch64::Q0_Q1,
- AArch64::Q0_Q1_Q2,
- AArch64::Q0_Q1_Q2_Q3 };
- unsigned FirstReg = FirstRegs[NumRegs - 1];
-
- Inst.addOperand(
- MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
- }
-
- void addVectorIndex1Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
- }
+ enum VecListIndexType {
+ VecListIdx_DReg = 0,
+ VecListIdx_QReg = 1,
+ VecListIdx_ZReg = 2,
+ };
- void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
+ template <VecListIndexType RegTy, unsigned NumRegs>
+ void addVectorListOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
- }
-
- void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+ static const unsigned FirstRegs[][5] = {
+ /* DReg */ { AArch64::Q0,
+ AArch64::D0, AArch64::D0_D1,
+ AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 },
+ /* QReg */ { AArch64::Q0,
+ AArch64::Q0, AArch64::Q0_Q1,
+ AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 },
+ /* ZReg */ { AArch64::Z0,
+ AArch64::Z0, AArch64::Z0_Z1,
+ AArch64::Z0_Z1_Z2, AArch64::Z0_Z1_Z2_Z3 }
+ };
+
+ assert((RegTy != VecListIdx_ZReg || NumRegs <= 4) &&
+ " NumRegs must be <= 4 for ZRegs");
+
+ unsigned FirstReg = FirstRegs[(unsigned)RegTy][NumRegs];
+ Inst.addOperand(MCOperand::createReg(FirstReg + getVectorListStart() -
+ FirstRegs[(unsigned)RegTy][0]));
+ }
+
+ void addVectorIndexOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(getVectorIndex()));
}
- void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+ template <unsigned ImmIs0, unsigned ImmIs1>
+ void addExactFPImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
- }
-
- void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+ assert(bool(isExactFPImm<ImmIs0, ImmIs1>()) && "Invalid operand");
+ Inst.addOperand(MCOperand::createImm(bool(isExactFPImm<ImmIs1>())));
}
void addImmOperands(MCInst &Inst, unsigned N) const {
@@ -1221,9 +1454,13 @@ public:
addExpr(Inst, getImm());
}
- void addAddSubImmOperands(MCInst &Inst, unsigned N) const {
+ template <int Shift>
+ void addImmWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- if (isShiftedImm()) {
+ if (auto ShiftedVal = getShiftedVal<Shift>()) {
+ Inst.addOperand(MCOperand::createImm(ShiftedVal->first));
+ Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
+ } else if (isShiftedImm()) {
addExpr(Inst, getShiftedImmVal());
Inst.addOperand(MCOperand::createImm(getShiftedImmShift()));
} else {
@@ -1232,16 +1469,14 @@ public:
}
}
- void addAddSubImmNegOperands(MCInst &Inst, unsigned N) const {
+ template <int Shift>
+ void addImmNegWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
-
- const MCExpr *MCE = isShiftedImm() ? getShiftedImmVal() : getImm();
- const MCConstantExpr *CE = cast<MCConstantExpr>(MCE);
- int64_t Val = -CE->getValue();
- unsigned ShiftAmt = isShiftedImm() ? ShiftedImm.ShiftAmount : 0;
-
- Inst.addOperand(MCOperand::createImm(Val));
- Inst.addOperand(MCOperand::createImm(ShiftAmt));
+ if (auto ShiftedVal = getShiftedVal<Shift>()) {
+ Inst.addOperand(MCOperand::createImm(-ShiftedVal->first));
+ Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
+ } else
+ llvm_unreachable("Not a shifted negative immediate");
}
void addCondCodeOperands(MCInst &Inst, unsigned N) const {
@@ -1274,155 +1509,34 @@ public:
Inst.addOperand(MCOperand::createImm(MCE->getValue() / Scale));
}
- void addSImm9Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addSImm10s8Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 8));
- }
-
- void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 4));
- }
-
- void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 8));
- }
-
- void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16));
- }
-
- void addImm0_1Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_7Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_8Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_15Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_16Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- assert(MCE && "Invalid constant immediate operand!");
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_31Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_31Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_32Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_63Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm1_63Operands(MCInst &Inst, unsigned N) const {
+ void addUImm6Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(MCE->getValue()));
}
- void addImm1_64Operands(MCInst &Inst, unsigned N) const {
+ template <int Scale>
+ void addImmScaledOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_127Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_255Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addImm32_63Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(MCE->getValue()));
- }
-
- void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- uint64_t encoding =
- AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32);
- Inst.addOperand(MCOperand::createImm(encoding));
- }
-
- void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
- Inst.addOperand(MCOperand::createImm(encoding));
+ Inst.addOperand(MCOperand::createImm(MCE->getValue() / Scale));
}
- void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const {
+ template <typename T>
+ void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
- uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32);
+ typename std::make_unsigned<T>::type Val = MCE->getValue();
+ uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
- void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const {
+ template <typename T>
+ void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- uint64_t encoding =
- AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64);
+ typename std::make_unsigned<T>::type Val = ~MCE->getValue();
+ uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
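
For reference, a minimal stand-alone sketch (not part of the patch) of why the templated addLogicalImmOperands/addLogicalImmNotOperands above can drop the explicit 32-bit mask: assigning the immediate to std::make_unsigned<T>::type truncates it to the register width. The real encoding is still done by AArch64_AM::encodeLogicalImmediate, which this sketch deliberately does not call.

    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    // Truncation step only; mirrors what the removed "& 0xFFFFFFFF" did for
    // the 32-bit variants and is a no-op for the 64-bit one.
    template <typename T> uint64_t truncateForLogicalImm(int64_t Imm) {
      typename std::make_unsigned<T>::type Val = Imm;
      return Val;
    }

    int main() {
      std::cout << std::hex
                << truncateForLogicalImm<int32_t>(~int64_t(0xF)) << '\n'  // fffffff0
                << truncateForLogicalImm<int64_t>(~int64_t(0xF)) << '\n'; // fffffffffffffff0
    }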
@@ -1477,7 +1591,8 @@ public:
void addFPImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getFPImm()));
+ Inst.addOperand(MCOperand::createImm(
+ AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt())));
}
void addBarrierOperands(MCInst &Inst, unsigned N) const {
@@ -1611,35 +1726,49 @@ public:
}
static std::unique_ptr<AArch64Operand>
- CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) {
+ CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx,
+ RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg,
+ AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
+ unsigned ShiftAmount = 0,
+ unsigned HasExplicitAmount = false) {
auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
Op->Reg.RegNum = RegNum;
Op->Reg.Kind = Kind;
+ Op->Reg.ElementWidth = 0;
+ Op->Reg.EqualityTy = EqTy;
+ Op->Reg.ShiftExtend.Type = ExtTy;
+ Op->Reg.ShiftExtend.Amount = ShiftAmount;
+ Op->Reg.ShiftExtend.HasExplicitAmount = HasExplicitAmount;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
static std::unique_ptr<AArch64Operand>
- CreateReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
- SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
- Op->Reg.RegNum = RegNum;
+ CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
+ SMLoc S, SMLoc E, MCContext &Ctx,
+ AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
+ unsigned ShiftAmount = 0,
+ unsigned HasExplicitAmount = false) {
+ assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector ||
+ Kind == RegKind::SVEPredicateVector) &&
+ "Invalid vector kind");
+ auto Op = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount,
+ HasExplicitAmount);
Op->Reg.ElementWidth = ElementWidth;
- Op->Reg.Kind = Kind;
- Op->StartLoc = S;
- Op->EndLoc = E;
return Op;
}
static std::unique_ptr<AArch64Operand>
CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
- char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) {
+ unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.NumElements = NumElements;
- Op->VectorList.ElementKind = ElementKind;
+ Op->VectorList.ElementWidth = ElementWidth;
+ Op->VectorList.RegisterKind = RegisterKind;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
@@ -1684,10 +1813,11 @@ public:
return Op;
}
- static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S,
- MCContext &Ctx) {
+ static std::unique_ptr<AArch64Operand>
+ CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) {
auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx);
- Op->FPImm.Val = Val;
+ Op->FPImm.Val = Val.bitcastToAPInt().getSExtValue();
+ Op->FPImm.IsExact = IsExact;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
@@ -1775,8 +1905,10 @@ public:
void AArch64Operand::print(raw_ostream &OS) const {
switch (Kind) {
case k_FPImm:
- OS << "<fpimm " << getFPImm() << "("
- << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+ OS << "<fpimm " << getFPImm().bitcastToAPInt().getZExtValue();
+ if (!getFPImmIsExact())
+ OS << " (inexact)";
+ OS << ">";
break;
case k_Barrier: {
StringRef Name = getBarrierName();
@@ -1799,9 +1931,6 @@ void AArch64Operand::print(raw_ostream &OS) const {
case k_CondCode:
OS << "<condcode " << getCondCode() << ">";
break;
- case k_Register:
- OS << "<register " << getReg() << ">";
- break;
case k_VectorList: {
OS << "<vectorlist ";
unsigned Reg = getVectorListStart();
@@ -1833,6 +1962,11 @@ void AArch64Operand::print(raw_ostream &OS) const {
case k_PSBHint:
OS << getPSBHintName();
break;
+ case k_Register:
+ OS << "<register " << getReg() << ">";
+ if (!getShiftExtendAmount() && !hasShiftExtendAmount())
+ break;
+ LLVM_FALLTHROUGH;
case k_ShiftExtend:
OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
<< getShiftExtendAmount();
@@ -1887,29 +2021,65 @@ static unsigned MatchNeonVectorRegName(StringRef Name) {
.Default(0);
}
-static bool isValidVectorKind(StringRef Name) {
- return StringSwitch<bool>(Name.lower())
- .Case(".8b", true)
- .Case(".16b", true)
- .Case(".4h", true)
- .Case(".8h", true)
- .Case(".2s", true)
- .Case(".4s", true)
- .Case(".1d", true)
- .Case(".2d", true)
- .Case(".1q", true)
- // Accept the width neutral ones, too, for verbose syntax. If those
- // aren't used in the right places, the token operand won't match so
- // all will work out.
- .Case(".b", true)
- .Case(".h", true)
- .Case(".s", true)
- .Case(".d", true)
- // Needed for fp16 scalar pairwise reductions
- .Case(".2h", true)
- // another special case for the ARMv8.2a dot product operand
- .Case(".4b", true)
- .Default(false);
+/// Returns an optional pair of (#elements, element-width) if Suffix
+/// is a valid vector kind. Where the number of elements in a vector
+/// or the vector width is implicit or explicitly unknown (but still a
+/// valid suffix kind), 0 is used.
+static Optional<std::pair<int, int>> parseVectorKind(StringRef Suffix,
+ RegKind VectorKind) {
+ std::pair<int, int> Res = {-1, -1};
+
+ switch (VectorKind) {
+ case RegKind::NeonVector:
+ Res =
+ StringSwitch<std::pair<int, int>>(Suffix.lower())
+ .Case("", {0, 0})
+ .Case(".1d", {1, 64})
+ .Case(".1q", {1, 128})
+ // '.2h' needed for fp16 scalar pairwise reductions
+ .Case(".2h", {2, 16})
+ .Case(".2s", {2, 32})
+ .Case(".2d", {2, 64})
+ // '.4b' is another special case for the ARMv8.2a dot product
+ // operand
+ .Case(".4b", {4, 8})
+ .Case(".4h", {4, 16})
+ .Case(".4s", {4, 32})
+ .Case(".8b", {8, 8})
+ .Case(".8h", {8, 16})
+ .Case(".16b", {16, 8})
+ // Accept the width neutral ones, too, for verbose syntax. If those
+ // aren't used in the right places, the token operand won't match so
+ // all will work out.
+ .Case(".b", {0, 8})
+ .Case(".h", {0, 16})
+ .Case(".s", {0, 32})
+ .Case(".d", {0, 64})
+ .Default({-1, -1});
+ break;
+ case RegKind::SVEPredicateVector:
+ case RegKind::SVEDataVector:
+ Res = StringSwitch<std::pair<int, int>>(Suffix.lower())
+ .Case("", {0, 0})
+ .Case(".b", {0, 8})
+ .Case(".h", {0, 16})
+ .Case(".s", {0, 32})
+ .Case(".d", {0, 64})
+ .Case(".q", {0, 128})
+ .Default({-1, -1});
+ break;
+ default:
+ llvm_unreachable("Unsupported RegKind");
+ }
+
+ if (Res == std::make_pair(-1, -1))
+ return Optional<std::pair<int, int>>();
+
+ return Optional<std::pair<int, int>>(Res);
+}
+
+static bool isValidVectorKind(StringRef Suffix, RegKind VectorKind) {
+ return parseVectorKind(Suffix, VectorKind).hasValue();
}
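
As a rough illustration (not taken from the patch) of the suffix-to-shape mapping parseVectorKind now returns, where 0 marks an implicit element count or width; the real code uses StringSwitch and llvm::Optional rather than this stand-alone table.

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>

    int main() {
      // A few Neon entries mirrored from parseVectorKind:
      // suffix -> {#elements, element width in bits}.
      std::map<std::string, std::pair<int, int>> NeonKinds = {
          {".8b", {8, 8}},  {".4h", {4, 16}}, {".4s", {4, 32}},
          {".2d", {2, 64}}, {".b", {0, 8}}}; // width-neutral: element count 0
      for (const auto &K : NeonKinds)
        std::cout << K.first << " -> {" << K.second.first << ", "
                  << K.second.second << "}\n";
    }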
static unsigned matchSVEDataVectorRegName(StringRef Name) {
@@ -1970,40 +2140,12 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) {
.Default(0);
}
-static bool isValidSVEKind(StringRef Name) {
- return StringSwitch<bool>(Name.lower())
- .Case(".b", true)
- .Case(".h", true)
- .Case(".s", true)
- .Case(".d", true)
- .Case(".q", true)
- .Default(false);
-}
-
-static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
- char &ElementKind) {
- assert(isValidVectorKind(Name));
-
- ElementKind = Name.lower()[Name.size() - 1];
- NumElements = 0;
-
- if (Name.size() == 2)
- return;
-
- // Parse the lane count
- Name = Name.drop_front();
- while (isdigit(Name.front())) {
- NumElements = 10 * NumElements + (Name.front() - '0');
- Name = Name.drop_front();
- }
-}
-
bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
StartLoc = getLoc();
- RegNo = tryParseRegister();
+ auto Res = tryParseScalarRegister(RegNo);
EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
- return (RegNo == (unsigned)-1);
+ return Res != MatchOperand_Success;
}
// Matches a register name or register alias previously defined by '.req'
@@ -2024,6 +2166,15 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
return Kind == RegKind::Scalar ? RegNum : 0;
if (!RegNum) {
+ // Handle a few common aliases of registers.
+ if (auto RegNum = StringSwitch<unsigned>(Name.lower())
+ .Case("fp", AArch64::FP)
+ .Case("lr", AArch64::LR)
+ .Case("x31", AArch64::XZR)
+ .Case("w31", AArch64::WZR)
+ .Default(0))
+ return Kind == RegKind::Scalar ? RegNum : 0;
+
// Check for aliases registered via .req. Canonicalize to lower case.
// That's more consistent since register names are case insensitive, and
// it's how the original entry was passed in from MC/MCParser/AsmParser.
@@ -2038,65 +2189,24 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
return RegNum;
}
-/// tryParseRegister - Try to parse a register name. The token must be an
+/// tryParseScalarRegister - Try to parse a register name. The token must be an
/// Identifier when called, and if it is a register name the token is eaten and
/// the register is added to the operand list.
-int AArch64AsmParser::tryParseRegister() {
+OperandMatchResultTy
+AArch64AsmParser::tryParseScalarRegister(unsigned &RegNum) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
- return -1;
+ return MatchOperand_NoMatch;
std::string lowerCase = Tok.getString().lower();
- unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
-
- // Also handle a few aliases of registers.
- if (RegNum == 0)
- RegNum = StringSwitch<unsigned>(lowerCase)
- .Case("fp", AArch64::FP)
- .Case("lr", AArch64::LR)
- .Case("x31", AArch64::XZR)
- .Case("w31", AArch64::WZR)
- .Default(0);
-
- if (RegNum == 0)
- return -1;
+ unsigned Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
+ if (Reg == 0)
+ return MatchOperand_NoMatch;
+ RegNum = Reg;
Parser.Lex(); // Eat identifier token.
- return RegNum;
-}
-
-/// tryMatchVectorRegister - Try to parse a vector register name with optional
-/// kind specifier. If it is a register specifier, eat the token and return it.
-int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
- MCAsmParser &Parser = getParser();
- if (Parser.getTok().isNot(AsmToken::Identifier)) {
- TokError("vector register expected");
- return -1;
- }
-
- StringRef Name = Parser.getTok().getString();
- // If there is a kind specifier, it's separated from the register name by
- // a '.'.
- size_t Start = 0, Next = Name.find('.');
- StringRef Head = Name.slice(Start, Next);
- unsigned RegNum = matchRegisterNameAlias(Head, RegKind::NeonVector);
-
- if (RegNum) {
- if (Next != StringRef::npos) {
- Kind = Name.slice(Next, StringRef::npos);
- if (!isValidVectorKind(Kind)) {
- TokError("invalid vector kind qualifier");
- return -1;
- }
- }
- Parser.Lex(); // Eat the register token.
- return RegNum;
- }
-
- if (expected)
- TokError("vector register expected");
- return -1;
+ return MatchOperand_Success;
}
/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
@@ -2130,11 +2240,32 @@ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
}
/// tryParsePrefetch - Try to parse a prefetch operand.
+template <bool IsSVEPrefetch>
OperandMatchResultTy
AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const AsmToken &Tok = Parser.getTok();
+
+ auto LookupByName = [](StringRef N) {
+ if (IsSVEPrefetch) {
+ if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByName(N))
+ return Optional<unsigned>(Res->Encoding);
+ } else if (auto Res = AArch64PRFM::lookupPRFMByName(N))
+ return Optional<unsigned>(Res->Encoding);
+ return Optional<unsigned>();
+ };
+
+ auto LookupByEncoding = [](unsigned E) {
+ if (IsSVEPrefetch) {
+ if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByEncoding(E))
+ return Optional<StringRef>(Res->Name);
+ } else if (auto Res = AArch64PRFM::lookupPRFMByEncoding(E))
+ return Optional<StringRef>(Res->Name);
+ return Optional<StringRef>();
+ };
+ unsigned MaxVal = IsSVEPrefetch ? 15 : 31;
+
// Either an identifier for named values or a 5-bit immediate.
// Eat optional hash.
if (parseOptionalToken(AsmToken::Hash) ||
@@ -2149,31 +2280,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
unsigned prfop = MCE->getValue();
- if (prfop > 31) {
- TokError("prefetch operand out of range, [0,31] expected");
+ if (prfop > MaxVal) {
+ TokError("prefetch operand out of range, [0," + utostr(MaxVal) +
+ "] expected");
return MatchOperand_ParseFail;
}
- auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue());
+ auto PRFM = LookupByEncoding(MCE->getValue());
Operands.push_back(AArch64Operand::CreatePrefetch(
- prfop, PRFM ? PRFM->Name : "", S, getContext()));
+ prfop, PRFM.getValueOr(""), S, getContext()));
return MatchOperand_Success;
}
if (Tok.isNot(AsmToken::Identifier)) {
- TokError("pre-fetch hint expected");
+ TokError("prefetch hint expected");
return MatchOperand_ParseFail;
}
- auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString());
+ auto PRFM = LookupByName(Tok.getString());
if (!PRFM) {
- TokError("pre-fetch hint expected");
+ TokError("prefetch hint expected");
return MatchOperand_ParseFail;
}
Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreatePrefetch(
- PRFM->Encoding, Tok.getString(), S, getContext()));
+ *PRFM, Tok.getString(), S, getContext()));
return MatchOperand_Success;
}
@@ -2258,17 +2390,21 @@ AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
SMLoc S = getLoc();
const MCExpr *Expr;
- parseOptionalToken(AsmToken::Hash);
- if (getParser().parseExpression(Expr))
- return MatchOperand_ParseFail;
+ const AsmToken &Tok = getParser().getTok();
+ if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
- SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
- Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
- return MatchOperand_Success;
+ return MatchOperand_Success;
+ }
+ return MatchOperand_NoMatch;
}
/// tryParseFPImm - A floating point immediate expression operand.
+template<bool AddFPZeroAsLiteral>
OperandMatchResultTy
AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
@@ -2280,50 +2416,50 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
bool isNegative = parseOptionalToken(AsmToken::Minus);
const AsmToken &Tok = Parser.getTok();
- if (Tok.is(AsmToken::Real) || Tok.is(AsmToken::Integer)) {
- int64_t Val;
- if (Tok.is(AsmToken::Integer) && !isNegative && Tok.getString().startswith("0x")) {
- Val = Tok.getIntVal();
- if (Val > 255 || Val < 0) {
- TokError("encoded floating point value out of range");
- return MatchOperand_ParseFail;
- }
- } else {
- APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
- if (isNegative)
- RealVal.changeSign();
+ if (!Tok.is(AsmToken::Real) && !Tok.is(AsmToken::Integer)) {
+ if (!Hash)
+ return MatchOperand_NoMatch;
+ TokError("invalid floating point immediate");
+ return MatchOperand_ParseFail;
+ }
- uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
- Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
-
- // Check for out of range values. As an exception we let Zero through,
- // but as tokens instead of an FPImm so that it can be matched by the
- // appropriate alias if one exists.
- if (RealVal.isPosZero()) {
- Parser.Lex(); // Eat the token.
- Operands.push_back(AArch64Operand::CreateToken("#0", false, S, getContext()));
- Operands.push_back(AArch64Operand::CreateToken(".0", false, S, getContext()));
- return MatchOperand_Success;
- } else if (Val == -1) {
- TokError("expected compatible register or floating-point constant");
- return MatchOperand_ParseFail;
- }
+ // Parse hexadecimal representation.
+ if (Tok.is(AsmToken::Integer) && Tok.getString().startswith("0x")) {
+ if (Tok.getIntVal() > 255 || isNegative) {
+ TokError("encoded floating point value out of range");
+ return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat the token.
- Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
- return MatchOperand_Success;
+
+ APFloat F((double)AArch64_AM::getFPImmFloat(Tok.getIntVal()));
+ Operands.push_back(
+ AArch64Operand::CreateFPImm(F, true, S, getContext()));
+ } else {
+ // Parse FP representation.
+ APFloat RealVal(APFloat::IEEEdouble());
+ auto Status =
+ RealVal.convertFromString(Tok.getString(), APFloat::rmTowardZero);
+ if (isNegative)
+ RealVal.changeSign();
+
+ if (AddFPZeroAsLiteral && RealVal.isPosZero()) {
+ Operands.push_back(
+ AArch64Operand::CreateToken("#0", false, S, getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateToken(".0", false, S, getContext()));
+ } else
+ Operands.push_back(AArch64Operand::CreateFPImm(
+ RealVal, Status == APFloat::opOK, S, getContext()));
}
- if (!Hash)
- return MatchOperand_NoMatch;
+ Parser.Lex(); // Eat the token.
- TokError("invalid floating point immediate");
- return MatchOperand_ParseFail;
+ return MatchOperand_Success;
}
-/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
+/// tryParseImmWithOptionalShift - Parse immediate operand, optionally with
+/// a shift suffix, for example '#1, lsl #12'.
OperandMatchResultTy
-AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
@@ -2337,18 +2473,9 @@ AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
if (parseSymbolicImmVal(Imm))
return MatchOperand_ParseFail;
else if (Parser.getTok().isNot(AsmToken::Comma)) {
- uint64_t ShiftAmount = 0;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
- if (MCE) {
- int64_t Val = MCE->getValue();
- if (Val > 0xfff && (Val & 0xfff) == 0) {
- Imm = MCConstantExpr::create(Val >> 12, getContext());
- ShiftAmount = 12;
- }
- }
SMLoc E = Parser.getTok().getLoc();
- Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
- getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateImm(Imm, S, E, getContext()));
return MatchOperand_Success;
}
@@ -2380,6 +2507,13 @@ AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
}
Parser.Lex(); // Eat the number
+ // Just in case the optional lsl #0 is used for immediates other than zero.
+ if (ShiftAmount == 0 && Imm != 0) {
+ SMLoc E = Parser.getTok().getLoc();
+ Operands.push_back(AArch64Operand::CreateImm(Imm, S, E, getContext()));
+ return MatchOperand_Success;
+ }
+
SMLoc E = Parser.getTok().getLoc();
Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
S, E, getContext()));
@@ -2408,6 +2542,22 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
.Case("al", AArch64CC::AL)
.Case("nv", AArch64CC::NV)
.Default(AArch64CC::Invalid);
+
+ if (CC == AArch64CC::Invalid &&
+ getSTI().getFeatureBits()[AArch64::FeatureSVE])
+ CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
+ .Case("none", AArch64CC::EQ)
+ .Case("any", AArch64CC::NE)
+ .Case("nlast", AArch64CC::HS)
+ .Case("last", AArch64CC::LO)
+ .Case("first", AArch64CC::MI)
+ .Case("nfrst", AArch64CC::PL)
+ .Case("pmore", AArch64CC::HI)
+ .Case("plast", AArch64CC::LS)
+ .Case("tcont", AArch64CC::GE)
+ .Case("tstop", AArch64CC::LT)
+ .Default(AArch64CC::Invalid);
+
return CC;
}
@@ -2515,6 +2665,10 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
Str += "ARMv8.1a";
else if (FBS[AArch64::HasV8_2aOps])
Str += "ARMv8.2a";
+ else if (FBS[AArch64::HasV8_3aOps])
+ Str += "ARMv8.3a";
+ else if (FBS[AArch64::HasV8_4aOps])
+ Str += "ARMv8.4a";
else
Str += "(unknown)";
}
@@ -2625,9 +2779,11 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
+ if (Mnemonic == "tsb" && Tok.isNot(AsmToken::Identifier)) {
+ TokError("'csync' operand expected");
+ return MatchOperand_ParseFail;
// Can be either a #imm style literal or an option name
- if (parseOptionalToken(AsmToken::Hash) ||
- Tok.is(AsmToken::Integer)) {
+ } else if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
// Immediate operand.
const MCExpr *ImmVal;
SMLoc ExprLoc = getLoc();
@@ -2653,18 +2809,23 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
+ auto TSB = AArch64TSB::lookupTSBByName(Tok.getString());
// The only valid named option for ISB is 'sy'
auto DB = AArch64DB::lookupDBByName(Tok.getString());
if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) {
TokError("'sy' or #imm operand expected");
return MatchOperand_ParseFail;
- } else if (!DB) {
+ // The only valid named option for TSB is 'csync'
+ } else if (Mnemonic == "tsb" && (!TSB || TSB->Encoding != AArch64TSB::csync)) {
+ TokError("'csync' operand expected");
+ return MatchOperand_ParseFail;
+ } else if (!DB && !TSB) {
TokError("invalid barrier option name");
return MatchOperand_ParseFail;
}
Operands.push_back(AArch64Operand::CreateBarrier(
- DB->Encoding, Tok.getString(), getLoc(), getContext()));
+ DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(), getContext()));
Parser.Lex(); // Consume the option
return MatchOperand_Success;
@@ -2708,12 +2869,20 @@ bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
SMLoc S = getLoc();
// Check for a vector register specifier first.
StringRef Kind;
- int64_t Reg = tryMatchVectorRegister(Kind, false);
- if (Reg == -1)
+ unsigned Reg;
+ OperandMatchResultTy Res =
+ tryParseVectorRegister(Reg, Kind, RegKind::NeonVector);
+ if (Res != MatchOperand_Success)
+ return true;
+
+ const auto &KindRes = parseVectorKind(Kind, RegKind::NeonVector);
+ if (!KindRes)
return true;
+
+ unsigned ElementWidth = KindRes->second;
Operands.push_back(
- AArch64Operand::CreateReg(Reg, RegKind::NeonVector, S, getLoc(),
- getContext()));
+ AArch64Operand::CreateVectorReg(Reg, RegKind::NeonVector, ElementWidth,
+ S, getLoc(), getContext()));
// If there was an explicit qualifier, that goes on as a literal text
// operand.
@@ -2721,36 +2890,41 @@ bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateToken(Kind, false, S, getContext()));
- // If there is an index specifier following the register, parse that too.
+ return tryParseVectorIndex(Operands) == MatchOperand_ParseFail;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseVectorIndex(OperandVector &Operands) {
SMLoc SIdx = getLoc();
if (parseOptionalToken(AsmToken::LBrac)) {
const MCExpr *ImmVal;
if (getParser().parseExpression(ImmVal))
- return false;
+ return MatchOperand_NoMatch;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
if (!MCE) {
TokError("immediate value expected for vector index");
- return false;
+ return MatchOperand_ParseFail;
}
SMLoc E = getLoc();
if (parseToken(AsmToken::RBrac, "']' expected"))
- return false;
+ return MatchOperand_ParseFail;
Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
E, getContext()));
+ return MatchOperand_Success;
}
- return false;
+ return MatchOperand_NoMatch;
}
-// tryParseSVEDataVectorRegister - Try to parse a SVE vector register name with
+// tryParseVectorRegister - Try to parse a vector register name with
// optional kind specifier. If it is a register specifier, eat the token
// and return it.
OperandMatchResultTy
-AArch64AsmParser::tryParseSVERegister(int &Reg, StringRef &Kind,
- RegKind MatchKind) {
+AArch64AsmParser::tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
+ RegKind MatchKind) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -2767,8 +2941,8 @@ AArch64AsmParser::tryParseSVERegister(int &Reg, StringRef &Kind,
if (RegNum) {
if (Next != StringRef::npos) {
Kind = Name.slice(Next, StringRef::npos);
- if (!isValidSVEKind(Kind)) {
- TokError("invalid sve vector kind qualifier");
+ if (!isValidVectorKind(Kind, MatchKind)) {
+ TokError("invalid vector kind qualifier");
return MatchOperand_ParseFail;
}
}
@@ -2787,45 +2961,64 @@ AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) {
// Check for a SVE predicate register specifier first.
const SMLoc S = getLoc();
StringRef Kind;
- int RegNum = -1;
- auto Res = tryParseSVERegister(RegNum, Kind, RegKind::SVEPredicateVector);
+ unsigned RegNum;
+ auto Res = tryParseVectorRegister(RegNum, Kind, RegKind::SVEPredicateVector);
if (Res != MatchOperand_Success)
return Res;
- unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower())
- .Case("", -1)
- .Case(".b", 8)
- .Case(".h", 16)
- .Case(".s", 32)
- .Case(".d", 64)
- .Case(".q", 128)
- .Default(0);
-
- if (!ElementWidth)
+ const auto &KindRes = parseVectorKind(Kind, RegKind::SVEPredicateVector);
+ if (!KindRes)
return MatchOperand_NoMatch;
+ unsigned ElementWidth = KindRes->second;
+ Operands.push_back(AArch64Operand::CreateVectorReg(
+ RegNum, RegKind::SVEPredicateVector, ElementWidth, S,
+ getLoc(), getContext()));
+
+ // Not all predicates are followed by a '/m' or '/z'.
+ MCAsmParser &Parser = getParser();
+ if (Parser.getTok().isNot(AsmToken::Slash))
+ return MatchOperand_Success;
+
+ // But when they do they shouldn't have an element type suffix.
+ if (!Kind.empty()) {
+ Error(S, "not expecting size suffix");
+ return MatchOperand_ParseFail;
+ }
+
+ // Add a literal slash as operand
Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::SVEPredicateVector,
- ElementWidth, S, getLoc(), getContext()));
+ AArch64Operand::CreateToken("/", false, getLoc(), getContext()));
+ Parser.Lex(); // Eat the slash.
+
+ // Zeroing or merging?
+ auto Pred = Parser.getTok().getString().lower();
+ if (Pred != "z" && Pred != "m") {
+ Error(getLoc(), "expecting 'm' or 'z' predication");
+ return MatchOperand_ParseFail;
+ }
+
+ // Add zero/merge token.
+ const char *ZM = Pred == "z" ? "z" : "m";
+ Operands.push_back(
+ AArch64Operand::CreateToken(ZM, false, getLoc(), getContext()));
+
+ Parser.Lex(); // Eat zero/merge token.
return MatchOperand_Success;
}
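
To make the zeroing/merging handling above concrete, here is a hypothetical pair of SVE operand strings (illustrative, not from the patch) that now parse into a predicate register, a '/' token, and a 'z' or 'm' token.

    #include <iostream>

    int main() {
      // Predication qualifiers accepted by tryParseSVEPredicateVector:
      const char *Examples[] = {
          "mov z0.b, p0/z, #1", // '/z': zeroing, inactive lanes cleared
          "mov z0.b, p0/m, #1", // '/m': merging, inactive lanes preserved
      };
      for (const char *E : Examples)
        std::cout << E << '\n';
    }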
-/// parseRegister - Parse a non-vector register operand.
+/// parseRegister - Parse a register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
- SMLoc S = getLoc();
- // Try for a vector (neon) register.
+ // Try for a Neon vector register.
if (!tryParseNeonVectorRegister(Operands))
return false;
- // Try for a scalar register.
- int64_t Reg = tryParseRegister();
- if (Reg == -1)
- return true;
- Operands.push_back(AArch64Operand::CreateReg(Reg, RegKind::Scalar, S,
- getLoc(), getContext()));
+ // Otherwise try for a scalar register.
+ if (tryParseGPROperand<false>(Operands) == MatchOperand_Success)
+ return false;
- return false;
+ return true;
}
bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
@@ -2876,6 +3069,8 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
.Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
.Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
.Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+ .Case("secrel_lo12", AArch64MCExpr::VK_SECREL_LO12)
+ .Case("secrel_hi12", AArch64MCExpr::VK_SECREL_HI12)
.Default(AArch64MCExpr::VK_INVALID);
if (RefKind == AArch64MCExpr::VK_INVALID)
@@ -2896,33 +3091,74 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
return false;
}
-/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
-bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+template <RegKind VectorKind>
+OperandMatchResultTy
+AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
+ bool ExpectMatch) {
MCAsmParser &Parser = getParser();
- assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+ if (!Parser.getTok().is(AsmToken::LCurly))
+ return MatchOperand_NoMatch;
+
+ // Wrapper around parse function
+ auto ParseVector = [this, &Parser](unsigned &Reg, StringRef &Kind, SMLoc Loc,
+ bool NoMatchIsError) {
+ auto RegTok = Parser.getTok();
+ auto ParseRes = tryParseVectorRegister(Reg, Kind, VectorKind);
+ if (ParseRes == MatchOperand_Success) {
+ if (parseVectorKind(Kind, VectorKind))
+ return ParseRes;
+ llvm_unreachable("Expected a valid vector kind");
+ }
+
+ if (RegTok.isNot(AsmToken::Identifier) ||
+ ParseRes == MatchOperand_ParseFail ||
+ (ParseRes == MatchOperand_NoMatch && NoMatchIsError)) {
+ Error(Loc, "vector register expected");
+ return MatchOperand_ParseFail;
+ }
+
+ return MatchOperand_NoMatch;
+ };
+
SMLoc S = getLoc();
+ auto LCurly = Parser.getTok();
Parser.Lex(); // Eat left bracket token.
+
StringRef Kind;
- int64_t FirstReg = tryMatchVectorRegister(Kind, true);
- if (FirstReg == -1)
- return true;
+ unsigned FirstReg;
+ auto ParseRes = ParseVector(FirstReg, Kind, getLoc(), ExpectMatch);
+
+ // Put back the original left bracket if there was no match, so that
+ // different types of list-operands can be matched (e.g. SVE, Neon).
+ if (ParseRes == MatchOperand_NoMatch)
+ Parser.getLexer().UnLex(LCurly);
+
+ if (ParseRes != MatchOperand_Success)
+ return ParseRes;
+
int64_t PrevReg = FirstReg;
unsigned Count = 1;
if (parseOptionalToken(AsmToken::Minus)) {
SMLoc Loc = getLoc();
StringRef NextKind;
- int64_t Reg = tryMatchVectorRegister(NextKind, true);
- if (Reg == -1)
- return true;
+
+ unsigned Reg;
+ ParseRes = ParseVector(Reg, NextKind, getLoc(), true);
+ if (ParseRes != MatchOperand_Success)
+ return ParseRes;
+
// Any Kind suffixes must match on all regs in the list.
- if (Kind != NextKind)
- return Error(Loc, "mismatched register size suffix");
+ if (Kind != NextKind) {
+ Error(Loc, "mismatched register size suffix");
+ return MatchOperand_ParseFail;
+ }
unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
if (Space == 0 || Space > 3) {
- return Error(Loc, "invalid number of vectors");
+ Error(Loc, "invalid number of vectors");
+ return MatchOperand_ParseFail;
}
Count += Space;
@@ -2931,17 +3167,23 @@ bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
while (parseOptionalToken(AsmToken::Comma)) {
SMLoc Loc = getLoc();
StringRef NextKind;
- int64_t Reg = tryMatchVectorRegister(NextKind, true);
- if (Reg == -1)
- return true;
+ unsigned Reg;
+ ParseRes = ParseVector(Reg, NextKind, getLoc(), true);
+ if (ParseRes != MatchOperand_Success)
+ return ParseRes;
+
// Any Kind suffixes must match on all regs in the list.
- if (Kind != NextKind)
- return Error(Loc, "mismatched register size suffix");
+ if (Kind != NextKind) {
+ Error(Loc, "mismatched register size suffix");
+ return MatchOperand_ParseFail;
+ }
// Registers must be incremental (with wraparound at 31)
if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
- (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
- return Error(Loc, "registers must be sequential");
+ (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) {
+ Error(Loc, "registers must be sequential");
+ return MatchOperand_ParseFail;
+ }
PrevReg = Reg;
++Count;
@@ -2949,83 +3191,146 @@ bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
}
if (parseToken(AsmToken::RCurly, "'}' expected"))
- return true;
+ return MatchOperand_ParseFail;
- if (Count > 4)
- return Error(S, "invalid number of vectors");
+ if (Count > 4) {
+ Error(S, "invalid number of vectors");
+ return MatchOperand_ParseFail;
+ }
unsigned NumElements = 0;
- char ElementKind = 0;
- if (!Kind.empty())
- parseValidVectorKind(Kind, NumElements, ElementKind);
+ unsigned ElementWidth = 0;
+ if (!Kind.empty()) {
+ if (const auto &VK = parseVectorKind(Kind, VectorKind))
+ std::tie(NumElements, ElementWidth) = *VK;
+ }
Operands.push_back(AArch64Operand::CreateVectorList(
- FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+ FirstReg, Count, NumElements, ElementWidth, VectorKind, S, getLoc(),
+ getContext()));
- // If there is an index specifier following the list, parse that too.
- SMLoc SIdx = getLoc();
- if (parseOptionalToken(AsmToken::LBrac)) { // Eat left bracket token.
- const MCExpr *ImmVal;
- if (getParser().parseExpression(ImmVal))
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
- if (!MCE) {
- TokError("immediate value expected for vector index");
- return false;
- }
+ return MatchOperand_Success;
+}
- SMLoc E = getLoc();
- if (parseToken(AsmToken::RBrac, "']' expected"))
- return false;
+/// parseNeonVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool AArch64AsmParser::parseNeonVectorList(OperandVector &Operands) {
+ auto ParseRes = tryParseVectorList<RegKind::NeonVector>(Operands, true);
+ if (ParseRes != MatchOperand_Success)
+ return true;
- Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
- E, getContext()));
- }
- return false;
+ return tryParseVectorIndex(Operands) == MatchOperand_ParseFail;
}
OperandMatchResultTy
AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
- if (!Tok.is(AsmToken::Identifier))
- return MatchOperand_NoMatch;
+ SMLoc StartLoc = getLoc();
- unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), RegKind::Scalar);
-
- MCContext &Ctx = getContext();
- const MCRegisterInfo *RI = Ctx.getRegisterInfo();
- if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum))
- return MatchOperand_NoMatch;
-
- SMLoc S = getLoc();
- Parser.Lex(); // Eat register
+ unsigned RegNum;
+ OperandMatchResultTy Res = tryParseScalarRegister(RegNum);
+ if (Res != MatchOperand_Success)
+ return Res;
if (!parseOptionalToken(AsmToken::Comma)) {
- Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext()));
return MatchOperand_Success;
}
parseOptionalToken(AsmToken::Hash);
- if (Parser.getTok().isNot(AsmToken::Integer)) {
+ if (getParser().getTok().isNot(AsmToken::Integer)) {
Error(getLoc(), "index must be absent or #0");
return MatchOperand_ParseFail;
}
const MCExpr *ImmVal;
- if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+ if (getParser().parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
cast<MCConstantExpr>(ImmVal)->getValue() != 0) {
Error(getLoc(), "index must be absent or #0");
return MatchOperand_ParseFail;
}
- Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext()));
return MatchOperand_Success;
}
+template <bool ParseShiftExtend, RegConstraintEqualityTy EqTy>
+OperandMatchResultTy
+AArch64AsmParser::tryParseGPROperand(OperandVector &Operands) {
+ SMLoc StartLoc = getLoc();
+
+ unsigned RegNum;
+ OperandMatchResultTy Res = tryParseScalarRegister(RegNum);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ // No shift/extend is the default.
+ if (!ParseShiftExtend || getParser().getTok().isNot(AsmToken::Comma)) {
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext(), EqTy));
+ return MatchOperand_Success;
+ }
+
+ // Eat the comma
+ getParser().Lex();
+
+ // Match the shift
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
+ Res = tryParseOptionalShiftExtend(ExtOpnd);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ auto Ext = static_cast<AArch64Operand*>(ExtOpnd.back().get());
+ Operands.push_back(AArch64Operand::CreateReg(
+ RegNum, RegKind::Scalar, StartLoc, Ext->getEndLoc(), getContext(), EqTy,
+ Ext->getShiftExtendType(), Ext->getShiftExtendAmount(),
+ Ext->hasShiftExtendAmount()));
+
+ return MatchOperand_Success;
+}
+
+bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+
+ // Some SVE instructions have a decoration after the immediate, i.e.
+ // "mul vl". We parse them here and add tokens, which must be present in the
+ // asm string in the tablegen instruction.
+ bool NextIsVL = Parser.getLexer().peekTok().getString().equals_lower("vl");
+ bool NextIsHash = Parser.getLexer().peekTok().is(AsmToken::Hash);
+ if (!Parser.getTok().getString().equals_lower("mul") ||
+ !(NextIsVL || NextIsHash))
+ return true;
+
+ Operands.push_back(
+ AArch64Operand::CreateToken("mul", false, getLoc(), getContext()));
+ Parser.Lex(); // Eat the "mul"
+
+ if (NextIsVL) {
+ Operands.push_back(
+ AArch64Operand::CreateToken("vl", false, getLoc(), getContext()));
+ Parser.Lex(); // Eat the "vl"
+ return false;
+ }
+
+ if (NextIsHash) {
+ Parser.Lex(); // Eat the #
+ SMLoc S = getLoc();
+
+ // Parse immediate operand.
+ const MCExpr *ImmVal;
+ if (!Parser.parseExpression(ImmVal))
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal)) {
+ Operands.push_back(AArch64Operand::CreateImm(
+ MCConstantExpr::create(MCE->getValue(), getContext()), S, getLoc(),
+ getContext()));
+ return MatchOperand_Success;
+ }
+ }
+
+ return Error(getLoc(), "expected 'vl' or '#<imm>'");
+}
+
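
For context, a hypothetical illustration (not taken from the patch) of the SVE assembly syntax parseOptionalMulOperand accepts, covering both the "mul vl" decoration and the "mul #<imm>" multiplier.

    #include <iostream>

    int main() {
      // Operand forms handled by parseOptionalMulOperand:
      const char *Examples[] = {
          "ld1d { z0.d }, p0/z, [x0, #1, mul vl]", // "mul vl" offset decoration
          "cntb x0, all, mul #4",                  // "mul #<imm>" multiplier
      };
      for (const char *E : Examples)
        std::cout << E << '\n';
    }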
/// parseOperand - Parse a arm instruction operand. For now this parses the
/// operand regardless of the mnemonic.
bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
@@ -3069,7 +3374,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
return parseOperand(Operands, false, false);
}
case AsmToken::LCurly:
- return parseVectorList(Operands);
+ return parseNeonVectorList(Operands);
case AsmToken::Identifier: {
// If we're expecting a Condition Code operand, then just parse that.
if (isCondCode)
@@ -3079,6 +3384,11 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
if (!parseRegister(Operands))
return false;
+ // See if this is a "mul vl" decoration or "mul #<int>" operand used
+ // by SVE instructions.
+ if (!parseOptionalMulOperand(Operands))
+ return false;
+
// This could be an optional "shift" or "extend" operand.
OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
// We can only continue if no tokens were eaten.
@@ -3122,7 +3432,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
- Mnemonic != "fcmlt")
+ Mnemonic != "fcmlt" && Mnemonic != "fcmne")
return TokError("unexpected floating point literal");
else if (IntVal != 0 || isNegative)
return TokError("expected floating-point constant #0.0");
@@ -3193,6 +3503,30 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
}
}
+bool AArch64AsmParser::regsEqual(const MCParsedAsmOperand &Op1,
+ const MCParsedAsmOperand &Op2) const {
+ auto &AOp1 = static_cast<const AArch64Operand&>(Op1);
+ auto &AOp2 = static_cast<const AArch64Operand&>(Op2);
+ if (AOp1.getRegEqualityTy() == RegConstraintEqualityTy::EqualsReg &&
+ AOp2.getRegEqualityTy() == RegConstraintEqualityTy::EqualsReg)
+ return MCTargetAsmParser::regsEqual(Op1, Op2);
+
+ assert(AOp1.isScalarReg() && AOp2.isScalarReg() &&
+ "Testing equality of non-scalar registers not supported");
+
+ // Check if the registers match their sub/super register classes.
+ if (AOp1.getRegEqualityTy() == EqualsSuperReg)
+ return getXRegFromWReg(Op1.getReg()) == Op2.getReg();
+ if (AOp1.getRegEqualityTy() == EqualsSubReg)
+ return getWRegFromXReg(Op1.getReg()) == Op2.getReg();
+ if (AOp2.getRegEqualityTy() == EqualsSuperReg)
+ return getXRegFromWReg(Op2.getReg()) == Op1.getReg();
+ if (AOp2.getRegEqualityTy() == EqualsSubReg)
+ return getWRegFromXReg(Op2.getReg()) == Op1.getReg();
+
+ return false;
+}
+
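
A toy sketch of the W/X pairing behind the EqualsSubReg and EqualsSuperReg tied-operand checks; the xFromW helper below is illustrative only, whereas the real code works on MC register numbers via getXRegFromWReg/getWRegFromXReg.

    #include <cassert>
    #include <string>

    // Name-level stand-in for the sub/super register relation.
    static std::string xFromW(const std::string &W) { return "x" + W.substr(1); }

    int main() {
      // With EqualsSuperReg, a "w3" operand satisfies a constraint tied to "x3",
      // because its 64-bit super-register is the tied destination register.
      assert(xFromW("w3") == "x3");
      return 0;
    }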
/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
/// operands.
bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
@@ -3451,7 +3785,39 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
"is also a source");
break;
}
+ case AArch64::STXRB:
+ case AArch64::STXRH:
+ case AArch64::STXRW:
+ case AArch64::STXRX:
+ case AArch64::STLXRB:
+ case AArch64::STLXRH:
+ case AArch64::STLXRW:
+ case AArch64::STLXRX: {
+ unsigned Rs = Inst.getOperand(0).getReg();
+ unsigned Rt = Inst.getOperand(1).getReg();
+ unsigned Rn = Inst.getOperand(2).getReg();
+ if (RI->isSubRegisterEq(Rt, Rs) ||
+ (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
+ return Error(Loc[0],
+ "unpredictable STXR instruction, status is also a source");
+ break;
+ }
+ case AArch64::STXPW:
+ case AArch64::STXPX:
+ case AArch64::STLXPW:
+ case AArch64::STLXPX: {
+ unsigned Rs = Inst.getOperand(0).getReg();
+ unsigned Rt1 = Inst.getOperand(1).getReg();
+ unsigned Rt2 = Inst.getOperand(2).getReg();
+ unsigned Rn = Inst.getOperand(3).getReg();
+ if (RI->isSubRegisterEq(Rt1, Rs) || RI->isSubRegisterEq(Rt2, Rs) ||
+ (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
+ return Error(Loc[0],
+ "unpredictable STXP instruction, status is also a source");
+ break;
}
+ }
+
// Now check immediate ranges. Separate from the above as there is overlap
// in the instructions being checked and this keeps the nested conditionals
@@ -3488,7 +3854,9 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
- ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) &&
+ ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_HI12) &&
(Inst.getOpcode() == AArch64::ADDXri ||
Inst.getOpcode() == AArch64::ADDWri))
return false;
@@ -3512,8 +3880,23 @@ static std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS,
unsigned VariantID = 0);
bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
+ uint64_t ErrorInfo,
OperandVector &Operands) {
switch (ErrCode) {
+ case Match_InvalidTiedOperand: {
+ RegConstraintEqualityTy EqTy =
+ static_cast<const AArch64Operand &>(*Operands[ErrorInfo])
+ .getRegEqualityTy();
+ switch (EqTy) {
+ case RegConstraintEqualityTy::EqualsSubReg:
+ return Error(Loc, "operand must be 64-bit form of destination register");
+ case RegConstraintEqualityTy::EqualsSuperReg:
+ return Error(Loc, "operand must be 32-bit form of destination register");
+ case RegConstraintEqualityTy::EqualsReg:
+ return Error(Loc, "operand must match destination register");
+ }
+ llvm_unreachable("Unknown RegConstraintEqualityTy");
+ }
case Match_MissingFeature:
return Error(Loc,
"instruction requires a CPU feature not currently enabled");
@@ -3547,9 +3930,27 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
case Match_InvalidFPImm:
return Error(Loc,
"expected compatible register or floating-point constant");
+ case Match_InvalidMemoryIndexedSImm6:
+ return Error(Loc, "index must be an integer in range [-32, 31].");
+ case Match_InvalidMemoryIndexedSImm5:
+ return Error(Loc, "index must be an integer in range [-16, 15].");
+ case Match_InvalidMemoryIndexed1SImm4:
+ return Error(Loc, "index must be an integer in range [-8, 7].");
+ case Match_InvalidMemoryIndexed2SImm4:
+ return Error(Loc, "index must be a multiple of 2 in range [-16, 14].");
+ case Match_InvalidMemoryIndexed3SImm4:
+ return Error(Loc, "index must be a multiple of 3 in range [-24, 21].");
+ case Match_InvalidMemoryIndexed4SImm4:
+ return Error(Loc, "index must be a multiple of 4 in range [-32, 28].");
+ case Match_InvalidMemoryIndexed16SImm4:
+ return Error(Loc, "index must be a multiple of 16 in range [-128, 112].");
+ case Match_InvalidMemoryIndexed1SImm6:
+ return Error(Loc, "index must be an integer in range [-32, 31].");
+ case Match_InvalidMemoryIndexedSImm8:
+ return Error(Loc, "index must be an integer in range [-128, 127].");
case Match_InvalidMemoryIndexedSImm9:
return Error(Loc, "index must be an integer in range [-256, 255].");
- case Match_InvalidMemoryIndexedSImm10:
+ case Match_InvalidMemoryIndexed8SImm10:
return Error(Loc, "index must be a multiple of 8 in range [-4096, 4088].");
case Match_InvalidMemoryIndexed4SImm7:
return Error(Loc, "index must be a multiple of 4 in range [-256, 252].");
@@ -3557,6 +3958,20 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "index must be a multiple of 8 in range [-512, 504].");
case Match_InvalidMemoryIndexed16SImm7:
return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008].");
+ case Match_InvalidMemoryIndexed8UImm5:
+ return Error(Loc, "index must be a multiple of 8 in range [0, 248].");
+ case Match_InvalidMemoryIndexed4UImm5:
+ return Error(Loc, "index must be a multiple of 4 in range [0, 124].");
+ case Match_InvalidMemoryIndexed2UImm5:
+ return Error(Loc, "index must be a multiple of 2 in range [0, 62].");
+ case Match_InvalidMemoryIndexed8UImm6:
+ return Error(Loc, "index must be a multiple of 8 in range [0, 504].");
+ case Match_InvalidMemoryIndexed4UImm6:
+ return Error(Loc, "index must be a multiple of 4 in range [0, 252].");
+ case Match_InvalidMemoryIndexed2UImm6:
+ return Error(Loc, "index must be a multiple of 2 in range [0, 126].");
+ case Match_InvalidMemoryIndexed1UImm6:
+ return Error(Loc, "index must be in range [0, 63].");
case Match_InvalidMemoryWExtend8:
return Error(Loc,
"expected 'uxtw' or 'sxtw' with optional shift of #0");
@@ -3621,16 +4036,44 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "immediate must be an integer in range [1, 32].");
case Match_InvalidImm1_64:
return Error(Loc, "immediate must be an integer in range [1, 64].");
- case Match_InvalidIndex1:
+ case Match_InvalidSVEAddSubImm8:
+ return Error(Loc, "immediate must be an integer in range [0, 255]"
+ " with a shift amount of 0");
+ case Match_InvalidSVEAddSubImm16:
+ case Match_InvalidSVEAddSubImm32:
+ case Match_InvalidSVEAddSubImm64:
+ return Error(Loc, "immediate must be an integer in range [0, 255] or a "
+ "multiple of 256 in range [256, 65280]");
+ case Match_InvalidSVECpyImm8:
+ return Error(Loc, "immediate must be an integer in range [-128, 255]"
+ " with a shift amount of 0");
+ case Match_InvalidSVECpyImm16:
+ return Error(Loc, "immediate must be an integer in range [-128, 127] or a "
+ "multiple of 256 in range [-32768, 65280]");
+ case Match_InvalidSVECpyImm32:
+ case Match_InvalidSVECpyImm64:
+ return Error(Loc, "immediate must be an integer in range [-128, 127] or a "
+ "multiple of 256 in range [-32768, 32512]");
+ case Match_InvalidIndexRange1_1:
return Error(Loc, "expected lane specifier '[1]'");
- case Match_InvalidIndexB:
+ case Match_InvalidIndexRange0_15:
return Error(Loc, "vector lane must be an integer in range [0, 15].");
- case Match_InvalidIndexH:
+ case Match_InvalidIndexRange0_7:
return Error(Loc, "vector lane must be an integer in range [0, 7].");
- case Match_InvalidIndexS:
+ case Match_InvalidIndexRange0_3:
return Error(Loc, "vector lane must be an integer in range [0, 3].");
- case Match_InvalidIndexD:
+ case Match_InvalidIndexRange0_1:
return Error(Loc, "vector lane must be an integer in range [0, 1].");
+ case Match_InvalidSVEIndexRange0_63:
+ return Error(Loc, "vector lane must be an integer in range [0, 63].");
+ case Match_InvalidSVEIndexRange0_31:
+ return Error(Loc, "vector lane must be an integer in range [0, 31].");
+ case Match_InvalidSVEIndexRange0_15:
+ return Error(Loc, "vector lane must be an integer in range [0, 15].");
+ case Match_InvalidSVEIndexRange0_7:
+ return Error(Loc, "vector lane must be an integer in range [0, 7].");
+ case Match_InvalidSVEIndexRange0_3:
+ return Error(Loc, "vector lane must be an integer in range [0, 3].");
case Match_InvalidLabel:
return Error(Loc, "expected label or encodable integer pc offset");
case Match_MRS:
@@ -3647,6 +4090,84 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
ComputeAvailableFeatures(STI->getFeatureBits()));
return Error(Loc, "unrecognized instruction mnemonic" + Suggestion);
}
+ case Match_InvalidGPR64shifted8:
+ return Error(Loc, "register must be x0..x30 or xzr, without shift");
+ case Match_InvalidGPR64shifted16:
+ return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #1'");
+ case Match_InvalidGPR64shifted32:
+ return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #2'");
+ case Match_InvalidGPR64shifted64:
+ return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #3'");
+ case Match_InvalidGPR64NoXZRshifted8:
+ return Error(Loc, "register must be x0..x30 without shift");
+ case Match_InvalidGPR64NoXZRshifted16:
+ return Error(Loc, "register must be x0..x30 with required shift 'lsl #1'");
+ case Match_InvalidGPR64NoXZRshifted32:
+ return Error(Loc, "register must be x0..x30 with required shift 'lsl #2'");
+ case Match_InvalidGPR64NoXZRshifted64:
+ return Error(Loc, "register must be x0..x30 with required shift 'lsl #3'");
+ case Match_InvalidZPR32UXTW8:
+ case Match_InvalidZPR32SXTW8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw)'");
+ case Match_InvalidZPR32UXTW16:
+ case Match_InvalidZPR32SXTW16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #1'");
+ case Match_InvalidZPR32UXTW32:
+ case Match_InvalidZPR32SXTW32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #2'");
+ case Match_InvalidZPR32UXTW64:
+ case Match_InvalidZPR32SXTW64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #3'");
+ case Match_InvalidZPR64UXTW8:
+ case Match_InvalidZPR64SXTW8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (uxtw|sxtw)'");
+ case Match_InvalidZPR64UXTW16:
+ case Match_InvalidZPR64SXTW16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #1'");
+ case Match_InvalidZPR64UXTW32:
+ case Match_InvalidZPR64SXTW32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #2'");
+ case Match_InvalidZPR64UXTW64:
+ case Match_InvalidZPR64SXTW64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #3'");
+ case Match_InvalidZPR32LSL8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s'");
+ case Match_InvalidZPR32LSL16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #1'");
+ case Match_InvalidZPR32LSL32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #2'");
+ case Match_InvalidZPR32LSL64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #3'");
+ case Match_InvalidZPR64LSL8:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d'");
+ case Match_InvalidZPR64LSL16:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #1'");
+ case Match_InvalidZPR64LSL32:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #2'");
+ case Match_InvalidZPR64LSL64:
+ return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #3'");
+ case Match_InvalidZPR0:
+ return Error(Loc, "expected register without element width sufix");
+ case Match_InvalidZPR8:
+ case Match_InvalidZPR16:
+ case Match_InvalidZPR32:
+ case Match_InvalidZPR64:
+ case Match_InvalidZPR128:
+ return Error(Loc, "invalid element width");
+ case Match_InvalidZPR_3b8:
+ return Error(Loc, "Invalid restricted vector register, expected z0.b..z7.b");
+ case Match_InvalidZPR_3b16:
+ return Error(Loc, "Invalid restricted vector register, expected z0.h..z7.h");
+ case Match_InvalidZPR_3b32:
+ return Error(Loc, "Invalid restricted vector register, expected z0.s..z7.s");
+ case Match_InvalidZPR_4b16:
+ return Error(Loc, "Invalid restricted vector register, expected z0.h..z15.h");
+ case Match_InvalidZPR_4b32:
+ return Error(Loc, "Invalid restricted vector register, expected z0.s..z15.s");
+ case Match_InvalidZPR_4b64:
+ return Error(Loc, "Invalid restricted vector register, expected z0.d..z15.d");
+ case Match_InvalidSVEPattern:
+ return Error(Loc, "invalid predicate pattern");
case Match_InvalidSVEPredicateAnyReg:
case Match_InvalidSVEPredicateBReg:
case Match_InvalidSVEPredicateHReg:
@@ -3659,6 +4180,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
case Match_InvalidSVEPredicate3bSReg:
case Match_InvalidSVEPredicate3bDReg:
return Error(Loc, "restricted predicate has range [0, 7].");
+ case Match_InvalidSVEExactFPImmOperandHalfOne:
+ return Error(Loc, "Invalid floating point constant, expected 0.5 or 1.0.");
+ case Match_InvalidSVEExactFPImmOperandHalfTwo:
+ return Error(Loc, "Invalid floating point constant, expected 0.5 or 2.0.");
+ case Match_InvalidSVEExactFPImmOperandZeroOne:
+ return Error(Loc, "Invalid floating point constant, expected 0.0 or 1.0.");
default:
llvm_unreachable("unexpected error code!");
}
@@ -4012,7 +4539,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, Msg);
}
case Match_MnemonicFail:
- return showMatchError(IDLoc, MatchResult, Operands);
+ return showMatchError(IDLoc, MatchResult, ErrorInfo, Operands);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
@@ -4031,8 +4558,9 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
MatchResult = Match_InvalidSuffix;
- return showMatchError(ErrorLoc, MatchResult, Operands);
+ return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
}
+ case Match_InvalidTiedOperand:
case Match_InvalidMemoryIndexed1:
case Match_InvalidMemoryIndexed2:
case Match_InvalidMemoryIndexed4:
@@ -4058,11 +4586,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidMemoryXExtend32:
case Match_InvalidMemoryXExtend64:
case Match_InvalidMemoryXExtend128:
+ case Match_InvalidMemoryIndexed1SImm4:
+ case Match_InvalidMemoryIndexed2SImm4:
+ case Match_InvalidMemoryIndexed3SImm4:
+ case Match_InvalidMemoryIndexed4SImm4:
+ case Match_InvalidMemoryIndexed1SImm6:
+ case Match_InvalidMemoryIndexed16SImm4:
case Match_InvalidMemoryIndexed4SImm7:
case Match_InvalidMemoryIndexed8SImm7:
case Match_InvalidMemoryIndexed16SImm7:
+ case Match_InvalidMemoryIndexed8UImm5:
+ case Match_InvalidMemoryIndexed4UImm5:
+ case Match_InvalidMemoryIndexed2UImm5:
+ case Match_InvalidMemoryIndexed1UImm6:
+ case Match_InvalidMemoryIndexed2UImm6:
+ case Match_InvalidMemoryIndexed4UImm6:
+ case Match_InvalidMemoryIndexed8UImm6:
+ case Match_InvalidMemoryIndexedSImm6:
+ case Match_InvalidMemoryIndexedSImm5:
+ case Match_InvalidMemoryIndexedSImm8:
case Match_InvalidMemoryIndexedSImm9:
- case Match_InvalidMemoryIndexedSImm10:
+ case Match_InvalidMemoryIndexed8SImm10:
case Match_InvalidImm0_1:
case Match_InvalidImm0_7:
case Match_InvalidImm0_15:
@@ -4075,15 +4619,73 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidImm1_16:
case Match_InvalidImm1_32:
case Match_InvalidImm1_64:
- case Match_InvalidIndex1:
- case Match_InvalidIndexB:
- case Match_InvalidIndexH:
- case Match_InvalidIndexS:
- case Match_InvalidIndexD:
+ case Match_InvalidSVEAddSubImm8:
+ case Match_InvalidSVEAddSubImm16:
+ case Match_InvalidSVEAddSubImm32:
+ case Match_InvalidSVEAddSubImm64:
+ case Match_InvalidSVECpyImm8:
+ case Match_InvalidSVECpyImm16:
+ case Match_InvalidSVECpyImm32:
+ case Match_InvalidSVECpyImm64:
+ case Match_InvalidIndexRange1_1:
+ case Match_InvalidIndexRange0_15:
+ case Match_InvalidIndexRange0_7:
+ case Match_InvalidIndexRange0_3:
+ case Match_InvalidIndexRange0_1:
+ case Match_InvalidSVEIndexRange0_63:
+ case Match_InvalidSVEIndexRange0_31:
+ case Match_InvalidSVEIndexRange0_15:
+ case Match_InvalidSVEIndexRange0_7:
+ case Match_InvalidSVEIndexRange0_3:
case Match_InvalidLabel:
case Match_InvalidComplexRotationEven:
case Match_InvalidComplexRotationOdd:
+ case Match_InvalidGPR64shifted8:
+ case Match_InvalidGPR64shifted16:
+ case Match_InvalidGPR64shifted32:
+ case Match_InvalidGPR64shifted64:
+ case Match_InvalidGPR64NoXZRshifted8:
+ case Match_InvalidGPR64NoXZRshifted16:
+ case Match_InvalidGPR64NoXZRshifted32:
+ case Match_InvalidGPR64NoXZRshifted64:
+ case Match_InvalidZPR32UXTW8:
+ case Match_InvalidZPR32UXTW16:
+ case Match_InvalidZPR32UXTW32:
+ case Match_InvalidZPR32UXTW64:
+ case Match_InvalidZPR32SXTW8:
+ case Match_InvalidZPR32SXTW16:
+ case Match_InvalidZPR32SXTW32:
+ case Match_InvalidZPR32SXTW64:
+ case Match_InvalidZPR64UXTW8:
+ case Match_InvalidZPR64SXTW8:
+ case Match_InvalidZPR64UXTW16:
+ case Match_InvalidZPR64SXTW16:
+ case Match_InvalidZPR64UXTW32:
+ case Match_InvalidZPR64SXTW32:
+ case Match_InvalidZPR64UXTW64:
+ case Match_InvalidZPR64SXTW64:
+ case Match_InvalidZPR32LSL8:
+ case Match_InvalidZPR32LSL16:
+ case Match_InvalidZPR32LSL32:
+ case Match_InvalidZPR32LSL64:
+ case Match_InvalidZPR64LSL8:
+ case Match_InvalidZPR64LSL16:
+ case Match_InvalidZPR64LSL32:
+ case Match_InvalidZPR64LSL64:
+ case Match_InvalidZPR0:
+ case Match_InvalidZPR8:
+ case Match_InvalidZPR16:
+ case Match_InvalidZPR32:
+ case Match_InvalidZPR64:
+ case Match_InvalidZPR128:
+ case Match_InvalidZPR_3b8:
+ case Match_InvalidZPR_3b16:
+ case Match_InvalidZPR_3b32:
+ case Match_InvalidZPR_4b16:
+ case Match_InvalidZPR_4b32:
+ case Match_InvalidZPR_4b64:
case Match_InvalidSVEPredicateAnyReg:
+ case Match_InvalidSVEPattern:
case Match_InvalidSVEPredicateBReg:
case Match_InvalidSVEPredicateHReg:
case Match_InvalidSVEPredicateSReg:
@@ -4093,6 +4695,9 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidSVEPredicate3bHReg:
case Match_InvalidSVEPredicate3bSReg:
case Match_InvalidSVEPredicate3bDReg:
+ case Match_InvalidSVEExactFPImmOperandHalfOne:
+ case Match_InvalidSVEExactFPImmOperandHalfTwo:
+ case Match_InvalidSVEExactFPImmOperandZeroOne:
case Match_MSR:
case Match_MRS: {
if (ErrorInfo >= Operands.size())
@@ -4102,7 +4707,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
- return showMatchError(ErrorLoc, MatchResult, Operands);
+ return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
}
}
@@ -4122,12 +4727,6 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
parseDirectiveArch(Loc);
else if (IDVal == ".cpu")
parseDirectiveCPU(Loc);
- else if (IDVal == ".hword")
- parseDirectiveWord(2, Loc);
- else if (IDVal == ".word")
- parseDirectiveWord(4, Loc);
- else if (IDVal == ".xword")
- parseDirectiveWord(8, Loc);
else if (IDVal == ".tlsdesccall")
parseDirectiveTLSDescCall(Loc);
else if (IDVal == ".ltorg" || IDVal == ".pool")
@@ -4150,7 +4749,11 @@ static const struct {
const char *Name;
const FeatureBitset Features;
} ExtensionMap[] = {
- { "crc", {AArch64::FeatureCRC} },
+ { "crc", {AArch64::FeatureCRC} },
+ { "sm4", {AArch64::FeatureSM4} },
+ { "sha3", {AArch64::FeatureSHA3} },
+ { "sha2", {AArch64::FeatureSHA2} },
+ { "aes", {AArch64::FeatureAES} },
{ "crypto", {AArch64::FeatureCrypto} },
{ "fp", {AArch64::FeatureFPARMv8} },
{ "simd", {AArch64::FeatureNEON} },
@@ -4164,6 +4767,54 @@ static const struct {
{ "profile", {} },
};
+static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
+ SmallVector<StringRef, 4> &RequestedExtensions) {
+ const bool NoCrypto =
+ (std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
+ "nocrypto") != std::end(RequestedExtensions));
+ const bool Crypto =
+ (std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
+ "crypto") != std::end(RequestedExtensions));
+
+ if (!NoCrypto && Crypto) {
+ switch (ArchKind) {
+ default:
+ // Map 'generic' (and others) to sha2 and aes, because
+ // that was the traditional meaning of crypto.
+ case AArch64::ArchKind::ARMV8_1A:
+ case AArch64::ArchKind::ARMV8_2A:
+ case AArch64::ArchKind::ARMV8_3A:
+ RequestedExtensions.push_back("sha2");
+ RequestedExtensions.push_back("aes");
+ break;
+ case AArch64::ArchKind::ARMV8_4A:
+ RequestedExtensions.push_back("sm4");
+ RequestedExtensions.push_back("sha3");
+ RequestedExtensions.push_back("sha2");
+ RequestedExtensions.push_back("aes");
+ break;
+ }
+ } else if (NoCrypto) {
+ switch (ArchKind) {
+ default:
+ // Map 'generic' (and others) to sha2 and aes, because
+ // that was the traditional meaning of crypto.
+ case AArch64::ArchKind::ARMV8_1A:
+ case AArch64::ArchKind::ARMV8_2A:
+ case AArch64::ArchKind::ARMV8_3A:
+ RequestedExtensions.push_back("nosha2");
+ RequestedExtensions.push_back("noaes");
+ break;
+ case AArch64::ArchKind::ARMV8_4A:
+ RequestedExtensions.push_back("nosm4");
+ RequestedExtensions.push_back("nosha3");
+ RequestedExtensions.push_back("nosha2");
+ RequestedExtensions.push_back("noaes");
+ break;
+ }
+ }
+}
+
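Editor's note: a hedged sketch (not part of the patch) of what ExpandCryptoAEK above does to the extension list before the feature loop runs; the helper is file-static, so this is an illustration rather than a real call site.

    SmallVector<StringRef, 4> Exts = {"crypto"};
    ExpandCryptoAEK(AArch64::ArchKind::ARMV8_4A, Exts);
    // Exts is now {"crypto", "sm4", "sha3", "sha2", "aes"}; the "crypto" entry
    // itself is still resolved through the existing ExtensionMap. For
    // ARMV8_1A/2A/3A (and the default case) only "sha2" and "aes" are
    // appended, preserving the traditional meaning of "+crypto".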
/// parseDirectiveArch
/// ::= .arch token
bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
@@ -4194,6 +4845,8 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
if (!ExtensionString.empty())
ExtensionString.split(RequestedExtensions, '+');
+ ExpandCryptoAEK(ID, RequestedExtensions);
+
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
bool EnableFeature = true;
@@ -4253,6 +4906,8 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
STI.setDefaultFeatures(CPU, "");
CurLoc = incrementLoc(CurLoc, CPU.size());
+ ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions);
+
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
// Advance source location past '+'.
@@ -4292,22 +4947,6 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
return false;
}
-/// parseDirectiveWord
-/// ::= .word [ expression (, expression)* ]
-bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
- auto parseOp = [&]() -> bool {
- const MCExpr *Value;
- if (getParser().parseExpression(Value))
- return true;
- getParser().getStreamer().EmitValue(Value, Size, L);
- return false;
- };
-
- if (parseMany(parseOp))
- return true;
- return false;
-}
-
/// parseDirectiveInst
/// ::= .inst opcode [, ...]
bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
@@ -4418,46 +5057,50 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat the '.req' token.
SMLoc SRegLoc = getLoc();
- int RegNum = tryParseRegister();
RegKind RegisterKind = RegKind::Scalar;
+ unsigned RegNum;
+ OperandMatchResultTy ParseRes = tryParseScalarRegister(RegNum);
- if (RegNum == -1) {
+ if (ParseRes != MatchOperand_Success) {
StringRef Kind;
RegisterKind = RegKind::NeonVector;
- RegNum = tryMatchVectorRegister(Kind, false);
- if (!Kind.empty())
+ ParseRes = tryParseVectorRegister(RegNum, Kind, RegKind::NeonVector);
+
+ if (ParseRes == MatchOperand_ParseFail)
+ return true;
+
+ if (ParseRes == MatchOperand_Success && !Kind.empty())
return Error(SRegLoc, "vector register without type specifier expected");
}
- if (RegNum == -1) {
+ if (ParseRes != MatchOperand_Success) {
StringRef Kind;
RegisterKind = RegKind::SVEDataVector;
- OperandMatchResultTy Res =
- tryParseSVERegister(RegNum, Kind, RegKind::SVEDataVector);
+ ParseRes =
+ tryParseVectorRegister(RegNum, Kind, RegKind::SVEDataVector);
- if (Res == MatchOperand_ParseFail)
+ if (ParseRes == MatchOperand_ParseFail)
return true;
- if (Res == MatchOperand_Success && !Kind.empty())
+ if (ParseRes == MatchOperand_Success && !Kind.empty())
return Error(SRegLoc,
"sve vector register without type specifier expected");
}
- if (RegNum == -1) {
+ if (ParseRes != MatchOperand_Success) {
StringRef Kind;
RegisterKind = RegKind::SVEPredicateVector;
- OperandMatchResultTy Res =
- tryParseSVERegister(RegNum, Kind, RegKind::SVEPredicateVector);
+ ParseRes = tryParseVectorRegister(RegNum, Kind, RegKind::SVEPredicateVector);
- if (Res == MatchOperand_ParseFail)
+ if (ParseRes == MatchOperand_ParseFail)
return true;
- if (Res == MatchOperand_Success && !Kind.empty())
+ if (ParseRes == MatchOperand_Success && !Kind.empty())
return Error(SRegLoc,
"sve predicate register without type specifier expected");
}
- if (RegNum == -1)
+ if (ParseRes != MatchOperand_Success)
return Error(SRegLoc, "register name or alias expected");
// Shouldn't be anything else.
@@ -4519,7 +5162,7 @@ AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
BE->getOpcode() != MCBinaryExpr::Sub)
return false;
- // See if the addend is is a constant, otherwise there's more going
+ // See if the addend is a constant, otherwise there's more going
// on here than we can deal with.
auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
if (!AddendExpr)
@@ -4620,10 +5263,11 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- int FirstReg = tryParseRegister();
- if (FirstReg == -1) {
+ unsigned FirstReg;
+ OperandMatchResultTy Res = tryParseScalarRegister(FirstReg);
+ if (Res != MatchOperand_Success)
return MatchOperand_ParseFail;
- }
+
const MCRegisterClass &WRegClass =
AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
const MCRegisterClass &XRegClass =
@@ -4646,19 +5290,18 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- SMLoc M = getLoc();
if (getParser().getTok().isNot(AsmToken::Comma)) {
- Error(M, "expected comma");
+ Error(getLoc(), "expected comma");
return MatchOperand_ParseFail;
}
// Eat the comma
getParser().Lex();
SMLoc E = getLoc();
- int SecondReg = tryParseRegister();
- if (SecondReg ==-1) {
+ unsigned SecondReg;
+ Res = tryParseScalarRegister(SecondReg);
+ if (Res != MatchOperand_Success)
return MatchOperand_ParseFail;
- }
if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 ||
(isXReg && !XRegClass.contains(SecondReg)) ||
@@ -4683,16 +5326,16 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
return MatchOperand_Success;
}
-template <bool ParseSuffix>
+template <bool ParseShiftExtend, bool ParseSuffix>
OperandMatchResultTy
AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
const SMLoc S = getLoc();
// Check for a SVE vector register specifier first.
- int RegNum = -1;
+ unsigned RegNum;
StringRef Kind;
OperandMatchResultTy Res =
- tryParseSVERegister(RegNum, Kind, RegKind::SVEDataVector);
+ tryParseVectorRegister(RegNum, Kind, RegKind::SVEDataVector);
if (Res != MatchOperand_Success)
return Res;
@@ -4700,20 +5343,81 @@ AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
if (ParseSuffix && Kind.empty())
return MatchOperand_NoMatch;
- unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower())
- .Case("", -1)
- .Case(".b", 8)
- .Case(".h", 16)
- .Case(".s", 32)
- .Case(".d", 64)
- .Case(".q", 128)
- .Default(0);
- if (!ElementWidth)
+ const auto &KindRes = parseVectorKind(Kind, RegKind::SVEDataVector);
+ if (!KindRes)
+ return MatchOperand_NoMatch;
+
+ unsigned ElementWidth = KindRes->second;
+
+ // No shift/extend is the default.
+ if (!ParseShiftExtend || getParser().getTok().isNot(AsmToken::Comma)) {
+ Operands.push_back(AArch64Operand::CreateVectorReg(
+ RegNum, RegKind::SVEDataVector, ElementWidth, S, S, getContext()));
+
+ OperandMatchResultTy Res = tryParseVectorIndex(Operands);
+ if (Res == MatchOperand_ParseFail)
+ return MatchOperand_ParseFail;
+ return MatchOperand_Success;
+ }
+
+ // Eat the comma
+ getParser().Lex();
+
+ // Match the shift
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
+ Res = tryParseOptionalShiftExtend(ExtOpnd);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ auto Ext = static_cast<AArch64Operand *>(ExtOpnd.back().get());
+ Operands.push_back(AArch64Operand::CreateVectorReg(
+ RegNum, RegKind::SVEDataVector, ElementWidth, S, Ext->getEndLoc(),
+ getContext(), Ext->getShiftExtendType(), Ext->getShiftExtendAmount(),
+ Ext->hasShiftExtendAmount()));
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+
+ SMLoc SS = getLoc();
+ const AsmToken &TokE = Parser.getTok();
+ bool IsHash = TokE.is(AsmToken::Hash);
+
+ if (!IsHash && TokE.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
+ int64_t Pattern;
+ if (IsHash) {
+ Parser.Lex(); // Eat hash
+
+ // Parse the immediate operand.
+ const MCExpr *ImmVal;
+ SS = getLoc();
+ if (Parser.parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+
+ auto *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE)
+ return MatchOperand_ParseFail;
+
+ Pattern = MCE->getValue();
+ } else {
+ // Parse the pattern
+ auto Pat = AArch64SVEPredPattern::lookupSVEPREDPATByName(TokE.getString());
+ if (!Pat)
+ return MatchOperand_NoMatch;
+
+ Parser.Lex();
+ Pattern = Pat->Encoding;
+ assert(Pattern >= 0 && Pattern < 32);
+ }
+
Operands.push_back(
- AArch64Operand::CreateReg(RegNum, RegKind::SVEDataVector, ElementWidth,
- S, S, getContext()));
+ AArch64Operand::CreateImm(MCConstantExpr::create(Pattern, getContext()),
+ SS, getLoc(), getContext()));
return MatchOperand_Success;
}
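Editor's note: the pattern parser above accepts either a named SVE predicate pattern or a '#'-prefixed constant expression; both branches push a single constant immediate operand. An illustrative pair of inputs (a sketch; the particular mnemonic is the editor's choice):

    // "cntb x0, mul4" -> identifier branch: lookupSVEPREDPATByName("mul4"),
    //                    Pat->Encoding becomes the immediate (asserted < 32).
    // "cntb x0, #7"   -> '#' branch: parseExpression must yield an
    //                    MCConstantExpr; its value is used directly.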
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 30438a159fbc..cef0ff346448 100644
--- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -55,6 +55,9 @@ static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -87,13 +90,28 @@ static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
const void *Decoder);
static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decode);
+ const void *Decoder);
+static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decode);
-LLVM_ATTRIBUTE_UNUSED static DecodeStatus
-DecodePPR_3bRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address,
- const void *Decode);
+ const void *Decoder);
+static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
uint64_t Address,
@@ -188,9 +206,18 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
const void *Decoder);
+static DecodeStatus DecodeSVELogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
template<int Bits>
static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
uint64_t Address, const void *Decoder);
+template <int ElementWidth>
+static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -389,6 +416,17 @@ static const unsigned GPR64DecoderTable[] = {
AArch64::LR, AArch64::XZR
};
+static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 30)
+ return Fail;
+
+ unsigned Register = GPR64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
@@ -467,6 +505,91 @@ static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
+static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 15)
+ return Fail;
+ return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 7)
+ return Fail;
+ return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const unsigned ZZDecoderTable[] = {
+ AArch64::Z0_Z1, AArch64::Z1_Z2, AArch64::Z2_Z3, AArch64::Z3_Z4,
+ AArch64::Z4_Z5, AArch64::Z5_Z6, AArch64::Z6_Z7, AArch64::Z7_Z8,
+ AArch64::Z8_Z9, AArch64::Z9_Z10, AArch64::Z10_Z11, AArch64::Z11_Z12,
+ AArch64::Z12_Z13, AArch64::Z13_Z14, AArch64::Z14_Z15, AArch64::Z15_Z16,
+ AArch64::Z16_Z17, AArch64::Z17_Z18, AArch64::Z18_Z19, AArch64::Z19_Z20,
+ AArch64::Z20_Z21, AArch64::Z21_Z22, AArch64::Z22_Z23, AArch64::Z23_Z24,
+ AArch64::Z24_Z25, AArch64::Z25_Z26, AArch64::Z26_Z27, AArch64::Z27_Z28,
+ AArch64::Z28_Z29, AArch64::Z29_Z30, AArch64::Z30_Z31, AArch64::Z31_Z0
+};
+
+static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void* Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = ZZDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
+static const unsigned ZZZDecoderTable[] = {
+ AArch64::Z0_Z1_Z2, AArch64::Z1_Z2_Z3, AArch64::Z2_Z3_Z4,
+ AArch64::Z3_Z4_Z5, AArch64::Z4_Z5_Z6, AArch64::Z5_Z6_Z7,
+ AArch64::Z6_Z7_Z8, AArch64::Z7_Z8_Z9, AArch64::Z8_Z9_Z10,
+ AArch64::Z9_Z10_Z11, AArch64::Z10_Z11_Z12, AArch64::Z11_Z12_Z13,
+ AArch64::Z12_Z13_Z14, AArch64::Z13_Z14_Z15, AArch64::Z14_Z15_Z16,
+ AArch64::Z15_Z16_Z17, AArch64::Z16_Z17_Z18, AArch64::Z17_Z18_Z19,
+ AArch64::Z18_Z19_Z20, AArch64::Z19_Z20_Z21, AArch64::Z20_Z21_Z22,
+ AArch64::Z21_Z22_Z23, AArch64::Z22_Z23_Z24, AArch64::Z23_Z24_Z25,
+ AArch64::Z24_Z25_Z26, AArch64::Z25_Z26_Z27, AArch64::Z26_Z27_Z28,
+ AArch64::Z27_Z28_Z29, AArch64::Z28_Z29_Z30, AArch64::Z29_Z30_Z31,
+ AArch64::Z30_Z31_Z0, AArch64::Z31_Z0_Z1
+};
+
+static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void* Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = ZZZDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
+static const unsigned ZZZZDecoderTable[] = {
+ AArch64::Z0_Z1_Z2_Z3, AArch64::Z1_Z2_Z3_Z4, AArch64::Z2_Z3_Z4_Z5,
+ AArch64::Z3_Z4_Z5_Z6, AArch64::Z4_Z5_Z6_Z7, AArch64::Z5_Z6_Z7_Z8,
+ AArch64::Z6_Z7_Z8_Z9, AArch64::Z7_Z8_Z9_Z10, AArch64::Z8_Z9_Z10_Z11,
+ AArch64::Z9_Z10_Z11_Z12, AArch64::Z10_Z11_Z12_Z13, AArch64::Z11_Z12_Z13_Z14,
+ AArch64::Z12_Z13_Z14_Z15, AArch64::Z13_Z14_Z15_Z16, AArch64::Z14_Z15_Z16_Z17,
+ AArch64::Z15_Z16_Z17_Z18, AArch64::Z16_Z17_Z18_Z19, AArch64::Z17_Z18_Z19_Z20,
+ AArch64::Z18_Z19_Z20_Z21, AArch64::Z19_Z20_Z21_Z22, AArch64::Z20_Z21_Z22_Z23,
+ AArch64::Z21_Z22_Z23_Z24, AArch64::Z22_Z23_Z24_Z25, AArch64::Z23_Z24_Z25_Z26,
+ AArch64::Z24_Z25_Z26_Z27, AArch64::Z25_Z26_Z27_Z28, AArch64::Z26_Z27_Z28_Z29,
+ AArch64::Z27_Z28_Z29_Z30, AArch64::Z28_Z29_Z30_Z31, AArch64::Z29_Z30_Z31_Z0,
+ AArch64::Z30_Z31_Z0_Z1, AArch64::Z31_Z0_Z1_Z2
+};
+
+static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void* Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = ZZZZDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
static const unsigned PPRDecoderTable[] = {
AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3,
AArch64::P4, AArch64::P5, AArch64::P6, AArch64::P7,
@@ -1060,6 +1183,14 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
case AArch64::LDRHHpost:
case AArch64::STRWpost:
case AArch64::LDRWpost:
+ case AArch64::STLURBi:
+ case AArch64::STLURHi:
+ case AArch64::STLURWi:
+ case AArch64::LDAPURBi:
+ case AArch64::LDAPURSBWi:
+ case AArch64::LDAPURHi:
+ case AArch64::LDAPURSHWi:
+ case AArch64::LDAPURi:
DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
break;
case AArch64::LDURSBXi:
@@ -1082,6 +1213,11 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
case AArch64::STRXpost:
case AArch64::LDRSWpost:
case AArch64::LDRXpost:
+ case AArch64::LDAPURSWi:
+ case AArch64::LDAPURSHXi:
+ case AArch64::LDAPURSBXi:
+ case AArch64::STLURXi:
+ case AArch64::LDAPURXi:
DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
break;
case AArch64::LDURQi:
@@ -1649,6 +1785,23 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
RegNo, Addr, Decoder);
}
+static DecodeStatus DecodeSVELogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Zdn = fieldFromInstruction(insn, 0, 5);
+ unsigned imm = fieldFromInstruction(insn, 5, 13);
+ if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
+ return Fail;
+
+ // Except for DUPM, the destination Zdn is also the tied source, so the same
+ // register operand is added twice to the instruction.
+ DecodeZPRRegisterClass(Inst, Zdn, Addr, Decoder);
+ if (Inst.getOpcode() != AArch64::DUPM_ZI)
+ DecodeZPRRegisterClass(Inst, Zdn, Addr, Decoder);
+ Inst.addOperand(MCOperand::createImm(imm));
+ return Success;
+}
+
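Editor's note: a worked sketch of the fields the decoder above pulls out of the instruction word (values are illustrative):

    // bits [4:0]   Zdn -> e.g. 3 selects Z3
    // bits [17:5]  imm -> 13-bit logical-immediate pattern, validated for a
    //                     64-bit element; an all-ones pattern such as 0x1fff
    //                     fails isValidDecodeLogicalImmediate and the decoder
    //                     returns Fail.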
template<int Bits>
static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
uint64_t Address, const void *Decoder) {
@@ -1663,3 +1816,22 @@ static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
return Success;
}
+// Decode 8-bit signed/unsigned immediate for a given element width.
+template <int ElementWidth>
+static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ unsigned Val = (uint8_t)Imm;
+ unsigned Shift = (Imm & 0x100) ? 8 : 0;
+ if (ElementWidth == 8 && Shift)
+ return Fail;
+ Inst.addOperand(MCOperand::createImm(Val));
+ Inst.addOperand(MCOperand::createImm(Shift));
+ return Success;
+}
+
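Editor's note: an illustrative decode of the 9-bit imm8_opt_lsl field handled above (not part of the patch); the value/shift pair produced here is the same pair consumed by printImm8OptLsl and getImm8OptLsl elsewhere in this change.

    // Imm = 0x1a3 -> operands (0xa3, shift 8)   bit 8 selects LSL #8
    // Imm = 0x05c -> operands (0x5c, shift 0)   no shift
    // ElementWidth == 8 with bit 8 set returns Fail, since an 8-bit element
    // cannot absorb an LSL #8.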
+// Decode a uimm4 operand with assembly range 1-16, stored in the field as 0-15.
+static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ Inst.addOperand(MCOperand::createImm(Imm + 1));
+ return Success;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 19d0ba2e1c41..6e64fc9347b9 100644
--- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -99,8 +99,8 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg
SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
&ReferenceName);
- CommentStream << format("0x%llx",
- 0xfffffffffffff000LL & (Address + Value));
+ CommentStream << format("0x%llx", (0xfffffffffffff000LL & Address) +
+ Value * 0x1000);
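                  // Editor's note on the corrected expression above
                  // (illustrative numbers): the ADRP operand is a page count,
                  // so the target is page(PC) + Value * 0x1000. With
                  // Address = 0x1234 and Value = 2, the old form gave
                  // 0xfffffffffffff000 & 0x1236 = 0x1000, while the new form
                  // gives 0x1000 + 2 * 0x1000 = 0x3000.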
} else if (MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::LDRXui ||
MI.getOpcode() == AArch64::LDRXl ||
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index bdf71b095fda..26e41215afc6 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -282,6 +282,13 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
+ // The TSB instruction is specified with one operand, but 'csync' is not
+ // encoded, so printing is handled as a special case here:
+ if (Opcode == AArch64::TSB) {
+ O << "\ttsb\tcsync";
+ return;
+ }
+
if (!printAliasInstr(MI, STI, O))
printInstruction(MI, STI, O);
@@ -907,20 +914,13 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
}
}
-void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- uint64_t Val = MI->getOperand(OpNum).getImm();
- O << "#0x";
- O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32));
-}
-
-void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+template <typename T>
+void AArch64InstPrinter::printLogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
uint64_t Val = MI->getOperand(OpNum).getImm();
O << "#0x";
- O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64));
+ O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 8 * sizeof(T)));
}
void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
@@ -976,12 +976,9 @@ void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
O << " #" << ShiftVal;
}
-void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
- raw_ostream &O, char SrcRegKind,
- unsigned Width) {
- unsigned SignExtend = MI->getOperand(OpNum).getImm();
- unsigned DoShift = MI->getOperand(OpNum + 1).getImm();
-
+static void printMemExtendImpl(bool SignExtend, bool DoShift,
+ unsigned Width, char SrcRegKind,
+ raw_ostream &O) {
// sxtw, sxtx, uxtw or lsl (== uxtx)
bool IsLSL = !SignExtend && SrcRegKind == 'x';
if (IsLSL)
@@ -993,6 +990,32 @@ void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
O << " #" << Log2_32(Width / 8);
}
+void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O, char SrcRegKind,
+ unsigned Width) {
+ bool SignExtend = MI->getOperand(OpNum).getImm();
+ bool DoShift = MI->getOperand(OpNum + 1).getImm();
+ printMemExtendImpl(SignExtend, DoShift, Width, SrcRegKind, O);
+}
+
+template <bool SignExtend, int ExtWidth, char SrcRegKind, char Suffix>
+void AArch64InstPrinter::printRegWithShiftExtend(const MCInst *MI,
+ unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printOperand(MI, OpNum, STI, O);
+ if (Suffix == 's' || Suffix == 'd')
+ O << '.' << Suffix;
+ else
+ assert(Suffix == 0 && "Unsupported suffix size");
+
+ bool DoShift = ExtWidth != 8;
+ if (SignExtend || DoShift || SrcRegKind == 'w') {
+ O << ", ";
+ printMemExtendImpl(SignExtend, DoShift, ExtWidth, SrcRegKind, O);
+ }
+}
+
void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1045,15 +1068,22 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
O << ']';
}
+template <bool IsSVEPrefetch>
void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned prfop = MI->getOperand(OpNum).getImm();
- auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop);
- if (PRFM)
+ if (IsSVEPrefetch) {
+ if (auto PRFM = AArch64SVEPRFM::lookupSVEPRFMByEncoding(prfop)) {
+ O << PRFM->Name;
+ return;
+ }
+ } else if (auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop)) {
O << PRFM->Name;
- else
- O << '#' << formatImm(prfop);
+ return;
+ }
+
+ O << '#' << formatImm(prfop);
}
void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
@@ -1118,6 +1148,41 @@ static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
case AArch64::Q31:
Reg = AArch64::Q0;
break;
+ case AArch64::Z0: Reg = AArch64::Z1; break;
+ case AArch64::Z1: Reg = AArch64::Z2; break;
+ case AArch64::Z2: Reg = AArch64::Z3; break;
+ case AArch64::Z3: Reg = AArch64::Z4; break;
+ case AArch64::Z4: Reg = AArch64::Z5; break;
+ case AArch64::Z5: Reg = AArch64::Z6; break;
+ case AArch64::Z6: Reg = AArch64::Z7; break;
+ case AArch64::Z7: Reg = AArch64::Z8; break;
+ case AArch64::Z8: Reg = AArch64::Z9; break;
+ case AArch64::Z9: Reg = AArch64::Z10; break;
+ case AArch64::Z10: Reg = AArch64::Z11; break;
+ case AArch64::Z11: Reg = AArch64::Z12; break;
+ case AArch64::Z12: Reg = AArch64::Z13; break;
+ case AArch64::Z13: Reg = AArch64::Z14; break;
+ case AArch64::Z14: Reg = AArch64::Z15; break;
+ case AArch64::Z15: Reg = AArch64::Z16; break;
+ case AArch64::Z16: Reg = AArch64::Z17; break;
+ case AArch64::Z17: Reg = AArch64::Z18; break;
+ case AArch64::Z18: Reg = AArch64::Z19; break;
+ case AArch64::Z19: Reg = AArch64::Z20; break;
+ case AArch64::Z20: Reg = AArch64::Z21; break;
+ case AArch64::Z21: Reg = AArch64::Z22; break;
+ case AArch64::Z22: Reg = AArch64::Z23; break;
+ case AArch64::Z23: Reg = AArch64::Z24; break;
+ case AArch64::Z24: Reg = AArch64::Z25; break;
+ case AArch64::Z25: Reg = AArch64::Z26; break;
+ case AArch64::Z26: Reg = AArch64::Z27; break;
+ case AArch64::Z27: Reg = AArch64::Z28; break;
+ case AArch64::Z28: Reg = AArch64::Z29; break;
+ case AArch64::Z29: Reg = AArch64::Z30; break;
+ case AArch64::Z30: Reg = AArch64::Z31; break;
+ // Vector lists can wrap around.
+ case AArch64::Z31:
+ Reg = AArch64::Z0;
+ break;
}
}
return Reg;
@@ -1152,12 +1217,15 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
// list).
unsigned NumRegs = 1;
if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::ZPR2RegClassID).contains(Reg) ||
MRI.getRegClass(AArch64::QQRegClassID).contains(Reg))
NumRegs = 2;
else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::ZPR3RegClassID).contains(Reg) ||
MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg))
NumRegs = 3;
else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(AArch64::ZPR4RegClassID).contains(Reg) ||
MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg))
NumRegs = 4;
@@ -1166,6 +1234,8 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
Reg = FirstReg;
else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0))
Reg = FirstReg;
+ else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::zsub0))
+ Reg = FirstReg;
// If it's a D-reg, we need to promote it to the equivalent Q-reg before
// printing (otherwise getRegisterName fails).
@@ -1176,7 +1246,11 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
}
for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
- O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+ if (MRI.getRegClass(AArch64::ZPRRegClassID).contains(Reg))
+ O << getRegisterName(Reg) << LayoutSuffix;
+ else
+ O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+
if (i + 1 != NumRegs)
O << ", ";
}
@@ -1262,6 +1336,9 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
if (Opcode == AArch64::ISB) {
auto ISB = AArch64ISB::lookupISBByEncoding(Val);
Name = ISB ? ISB->Name : "";
+ } else if (Opcode == AArch64::TSB) {
+ auto TSB = AArch64TSB::lookupTSBByEncoding(Val);
+ Name = TSB ? TSB->Name : "";
} else {
auto DB = AArch64DB::lookupDBByEncoding(Val);
Name = DB ? DB->Name : "";
@@ -1340,6 +1417,16 @@ void AArch64InstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo,
O << "#" << (Val * Angle) + Remainder;
}
+void AArch64InstPrinter::printSVEPattern(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ if (auto Pat = AArch64SVEPredPattern::lookupSVEPREDPATByEncoding(Val))
+ O << Pat->Name;
+ else
+ O << '#' << formatImm(Val);
+}
+
template <char suffix>
void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
@@ -1359,4 +1446,101 @@ void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
O << getRegisterName(Reg);
if (suffix != 0)
O << '.' << suffix;
-}
\ No newline at end of file
+}
+
+template <typename T>
+void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) {
+ typename std::make_unsigned<T>::type HexValue = Value;
+
+ if (getPrintImmHex())
+ O << '#' << formatHex((uint64_t)HexValue);
+ else
+ O << '#' << formatDec(Value);
+
+ if (CommentStream) {
+ // Emit the comment in the opposite base to the one used for the operand.
+ if (getPrintImmHex())
+ *CommentStream << '=' << formatDec(HexValue) << '\n';
+ else
+ *CommentStream << '=' << formatHex((uint64_t)Value) << '\n';
+ }
+}
+
+template <typename T>
+void AArch64InstPrinter::printImm8OptLsl(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned UnscaledVal = MI->getOperand(OpNum).getImm();
+ unsigned Shift = MI->getOperand(OpNum + 1).getImm();
+ assert(AArch64_AM::getShiftType(Shift) == AArch64_AM::LSL &&
+ "Unexepected shift type!");
+
+ // #0 lsl #8 is never pretty printed
+ if ((UnscaledVal == 0) && (AArch64_AM::getShiftValue(Shift) != 0)) {
+ O << '#' << formatImm(UnscaledVal);
+ printShifter(MI, OpNum + 1, STI, O);
+ return;
+ }
+
+ T Val;
+ if (std::is_signed<T>())
+ Val = (int8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift));
+ else
+ Val = (uint8_t)UnscaledVal * (1 << AArch64_AM::getShiftValue(Shift));
+
+ printImmSVE(Val, O);
+}
+
+template <typename T>
+void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ typedef typename std::make_signed<T>::type SignedT;
+ typedef typename std::make_unsigned<T>::type UnsignedT;
+
+ uint64_t Val = MI->getOperand(OpNum).getImm();
+ UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64);
+
+ // Prefer the default format for 16-bit values, hex otherwise.
+ if ((int16_t)PrintVal == (SignedT)PrintVal)
+ printImmSVE((T)PrintVal, O);
+ else if ((uint16_t)PrintVal == PrintVal)
+ printImmSVE(PrintVal, O);
+ else
+ O << '#' << formatHex((uint64_t)PrintVal);
+}
+
+template <int Width>
+void AArch64InstPrinter::printZPRasFPR(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Base;
+ switch (Width) {
+ case 8: Base = AArch64::B0; break;
+ case 16: Base = AArch64::H0; break;
+ case 32: Base = AArch64::S0; break;
+ case 64: Base = AArch64::D0; break;
+ case 128: Base = AArch64::Q0; break;
+ default:
+ llvm_unreachable("Unsupported width");
+ }
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ O << getRegisterName(Reg - AArch64::Z0 + Base);
+}
+
+template <unsigned ImmIs0, unsigned ImmIs1>
+void AArch64InstPrinter::printExactFPImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto *Imm0Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs0);
+ auto *Imm1Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmIs1);
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ O << "#" << (Val ? Imm1Desc->Repr : Imm0Desc->Repr);
+}
+
+void AArch64InstPrinter::printGPR64as32(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ O << getRegisterName(getWRegFromXReg(Reg));
+}
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 76f20f042cef..8dc9264f94a1 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "../Utils/AArch64BaseInfo.h"
namespace llvm {
@@ -56,6 +57,7 @@ protected:
raw_ostream &O);
void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ template <typename T> void printImmSVE(T Value, raw_ostream &O);
void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
raw_ostream &O);
template <int Amount>
@@ -70,10 +72,9 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O);
void printAddSubImm(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printLogicalImm32(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printLogicalImm64(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O);
+ template <typename T>
+ void printLogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printShifter(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printShiftedRegister(const MCInst *MI, unsigned OpNum,
@@ -90,7 +91,9 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O) {
printMemExtend(MI, OpNum, O, SrcRegKind, Width);
}
-
+ template <bool SignedExtend, int ExtWidth, char SrcRegKind, char Suffix>
+ void printRegWithShiftExtend(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printCondCode(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInverseCondCode(const MCInst *MI, unsigned OpNum,
@@ -121,6 +124,7 @@ protected:
void printImmScale(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool IsSVEPrefetch = false>
void printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -165,9 +169,25 @@ protected:
void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O);
+ template <typename T>
+ void printImm8OptLsl(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <typename T>
+ void printSVELogicalImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSVEPattern(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
template <char = 0>
void printSVERegOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printGPR64as32(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <int Width>
+ void printZPRasFPR(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template <unsigned ImmIs0, unsigned ImmIs1>
+ void printExactFPImm(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
};
class AArch64AppleInstPrinter : public AArch64InstPrinter {
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 3e5ef4df4706..62644ab2f457 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -213,7 +213,8 @@ static inline uint64_t ror(uint64_t elt, unsigned size) {
static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
uint64_t &Encoding) {
if (Imm == 0ULL || Imm == ~0ULL ||
- (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U)))
+ (RegSize != 64 &&
+ (Imm >> RegSize != 0 || Imm == (~0ULL >> (64 - RegSize)))))
return false;
// First, determine the element size.
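Editor's note: a worked example of the tightened guard above; all-ones is never a valid logical immediate, for any register size.

    // RegSize = 16, Imm = 0xffff:
    //   old test: Imm == ~0U                  -> false, 0xffff slipped through
    //   new test: Imm == ~0ULL >> (64 - 16)   -> 0xffff == 0xffff, rejected
    // 64-bit all-ones is still caught by the separate Imm == ~0ULL check.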
@@ -753,6 +754,67 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
return (EncVal << 32) | EncVal;
}
+/// Returns true if Imm is the concatenation of a repeating pattern of type T.
+template <typename T>
+static inline bool isSVEMaskOfIdenticalElements(int64_t Imm) {
+ union {
+ int64_t Whole;
+ T Parts[sizeof(int64_t)/sizeof(T)];
+ } Vec { Imm };
+
+ return all_of(Vec.Parts, [Vec](T Elem) { return Elem == Vec.Parts[0]; });
+}
+
+/// Returns true if Imm is valid for CPY/DUP.
+template <typename T>
+static inline bool isSVECpyImm(int64_t Imm) {
+ bool IsImm8 = int8_t(Imm) == Imm;
+ bool IsImm16 = int16_t(Imm & ~0xff) == Imm;
+
+ if (std::is_same<int8_t, typename std::make_signed<T>::type>::value)
+ return IsImm8 || uint8_t(Imm) == Imm;
+
+ if (std::is_same<int16_t, typename std::make_signed<T>::type>::value)
+ return IsImm8 || IsImm16 || uint16_t(Imm & ~0xff) == Imm;
+
+ return IsImm8 || IsImm16;
+}
+
+/// Returns true if Imm is valid for ADD/SUB.
+template <typename T>
+static inline bool isSVEAddSubImm(int64_t Imm) {
+ bool IsInt8t =
+ std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm);
+}
+
+/// Return true if Imm is valid for DUPM and has no single CPY/DUP equivalent.
+static inline bool isSVEMoveMaskPreferredLogicalImmediate(int64_t Imm) {
+ union {
+ int64_t D;
+ int32_t S[2];
+ int16_t H[4];
+ int8_t B[8];
+ } Vec = { Imm };
+
+ if (isSVECpyImm<int64_t>(Vec.D))
+ return false;
+
+ if (isSVEMaskOfIdenticalElements<int32_t>(Imm) &&
+ isSVECpyImm<int32_t>(Vec.S[0]))
+ return false;
+
+ if (isSVEMaskOfIdenticalElements<int16_t>(Imm) &&
+ isSVECpyImm<int16_t>(Vec.H[0]))
+ return false;
+
+ if (isSVEMaskOfIdenticalElements<int8_t>(Imm) &&
+ isSVECpyImm<int8_t>(Vec.B[0]))
+ return false;
+
+ return isLogicalImmediate(Vec.D, 64);
+}
+
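Editor's note: a sketch of how the helpers above combine; the two example immediates are the editor's, not taken from the patch.

    assert(!isSVEMoveMaskPreferredLogicalImmediate(0x3333333333333333));
    // Every byte is 0x33 and 0x33 is a valid int8 cpy immediate, so a plain
    // CPY/DUP is preferred over DUPM.
    assert(isSVEMoveMaskPreferredLogicalImmediate(0x0000ffff0000ffffLL));
    // Identical 32-bit elements, but 0xffff is neither an int8 nor an
    // int8<<8 cpy immediate, and the value is a valid 64-bit logical
    // immediate, so DUPM is the preferred encoding.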
inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) {
for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16)
if ((Value & ~(0xffffULL << Shift)) == 0)
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 4d1d3fd57353..856946555198 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -33,11 +33,9 @@ class AArch64AsmBackend : public MCAsmBackend {
Triple TheTriple;
public:
- bool IsLittleEndian;
-
-public:
AArch64AsmBackend(const Target &T, const Triple &TT, bool IsLittleEndian)
- : MCAsmBackend(), TheTriple(TT), IsLittleEndian(IsLittleEndian) {}
+ : MCAsmBackend(IsLittleEndian ? support::little : support::big),
+ TheTriple(TT) {}
unsigned getNumFixupKinds() const override {
return AArch64::NumTargetFixupKinds;
@@ -75,15 +73,17 @@ public:
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
@@ -97,7 +97,7 @@ public:
} // end anonymous namespace
-/// \brief The number of bytes the fixup may change.
+/// The number of bytes the fixup may change.
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
default:
@@ -248,7 +248,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
/// getFixupKindContainereSizeInBytes - The number of bytes of the
/// container involved in big endian or 0 if the item is little endian
unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const {
- if (IsLittleEndian)
+ if (Endian == support::little)
return 0;
switch (Kind) {
@@ -287,7 +287,8 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
@@ -323,7 +324,8 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
}
}
-bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
return false;
}
@@ -344,16 +346,16 @@ void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
-bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
- OW->WriteZeros(Count % 4);
+ OS.write_zeros(Count % 4);
// We are properly aligned, so write NOPs as requested.
Count /= 4;
for (uint64_t i = 0; i != Count; ++i)
- OW->write32(0xd503201f);
+ support::endian::write<uint32_t>(OS, 0xd503201f, Endian);
return true;
}
@@ -381,20 +383,20 @@ namespace {
namespace CU {
-/// \brief Compact unwind encoding values.
+/// Compact unwind encoding values.
enum CompactUnwindEncodings {
- /// \brief A "frameless" leaf function, where no non-volatile registers are
+ /// A "frameless" leaf function, where no non-volatile registers are
/// saved. The return remains in LR throughout the function.
UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
- /// \brief No compact unwind encoding available. Instead the low 23-bits of
+ /// No compact unwind encoding available. Instead the low 23-bits of
/// the compact unwind encoding is the offset of the DWARF FDE in the
/// __eh_frame section. This mode is never used in object files. It is only
/// generated by the linker in final linked images, which have only DWARF info
/// for a function.
UNWIND_ARM64_MODE_DWARF = 0x03000000,
- /// \brief This is a standard arm64 prologue where FP/LR are immediately
+ /// This is a standard arm64 prologue where FP/LR are immediately
/// pushed on the stack, then SP is copied to FP. If there are any
/// non-volatile register saved, they are copied into the stack fame in pairs
/// in a contiguous ranger right below the saved FP/LR pair. Any subset of the
@@ -402,7 +404,7 @@ enum CompactUnwindEncodings {
/// in register number order.
UNWIND_ARM64_MODE_FRAME = 0x04000000,
- /// \brief Frame register pair encodings.
+ /// Frame register pair encodings.
UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
@@ -420,7 +422,7 @@ enum CompactUnwindEncodings {
class DarwinAArch64AsmBackend : public AArch64AsmBackend {
const MCRegisterInfo &MRI;
- /// \brief Encode compact unwind stack adjustment for frameless functions.
+ /// Encode compact unwind stack adjustment for frameless functions.
/// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
/// The stack size always needs to be 16 byte aligned.
uint32_t encodeStackAdjustment(uint32_t StackSize) const {
@@ -432,13 +434,13 @@ public:
const MCRegisterInfo &MRI)
: AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64,
MachO::CPU_SUBTYPE_ARM64_ALL);
}
- /// \brief Generate the compact unwind encoding from the CFI directives.
+ /// Generate the compact unwind encoding from the CFI directives.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
if (Instrs.empty())
@@ -457,9 +459,17 @@ public:
return CU::UNWIND_ARM64_MODE_DWARF;
case MCCFIInstruction::OpDefCfa: {
// Defines a frame pointer.
- assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
- AArch64::FP &&
- "Invalid frame pointer!");
+ unsigned XReg =
+ getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true));
+
+ // Other CFA registers than FP are not supported by compact unwind.
+ // Fallback on DWARF.
+ // FIXME: When opt-remarks are supported in MC, add a remark to notify
+ // the user.
+ if (XReg != AArch64::FP)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+
+ assert(XReg == AArch64::FP && "Invalid frame pointer!");
assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
const MCCFIInstruction &LRPush = Instrs[++i];
@@ -583,9 +593,9 @@ public:
: AArch64AsmBackend(T, TT, IsLittleEndian), OSABI(OSABI),
IsILP32(IsILP32) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAArch64ELFObjectWriter(OSABI, IsILP32);
}
};
@@ -597,9 +607,9 @@ public:
COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple)
: AArch64AsmBackend(T, TheTriple, /*IsLittleEndian*/ true) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAArch64WinCOFFObjectWriter(OS);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAArch64WinCOFFObjectWriter();
}
};
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 2d90e67960f8..a11e396217af 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -31,7 +31,7 @@ namespace {
class AArch64ELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, bool IsILP32);
+ AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32);
~AArch64ELFObjectWriter() override = default;
@@ -43,9 +43,7 @@ protected:
} // end anonymous namespace
-AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
- bool IsLittleEndian,
- bool IsILP32)
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32)
: MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
/*HasRelocationAddend*/ true),
IsILP32(IsILP32) {}
@@ -429,10 +427,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("Unimplemented fixup -> relocation");
}
-std::unique_ptr<MCObjectWriter>
-llvm::createAArch64ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian, bool IsILP32) {
- auto MOTW =
- llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsLittleEndian, IsILP32);
- return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) {
+ return llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32);
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 8ee627d50df2..c0ef8b670286 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -27,6 +27,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -87,9 +88,10 @@ public:
friend class AArch64TargetELFStreamer;
AArch64ELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
MappingSymbolCounter(0), LastEMS(EMS_None) {}
void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
@@ -209,11 +211,11 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- AArch64ELFStreamer *S =
- new AArch64ELFStreamer(Context, std::move(TAB), OS, std::move(Emitter));
+ AArch64ELFStreamer *S = new AArch64ELFStreamer(
+ Context, std::move(TAB), std::move(OW), std::move(Emitter));
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index 19b188aa1c61..d5b009ec30d1 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -20,7 +20,7 @@ namespace llvm {
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 12b5a27b7699..ebb49121c1bf 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -101,7 +101,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
HasIdentDirective = true;
}
-AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
+AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
@@ -112,14 +112,23 @@ AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
CodePointerSize = 8;
-}
-AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
CommentString = ";";
ExceptionsType = ExceptionHandling::WinEH;
}
AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+
+ Data16bitsDirective = "\t.hword\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = "\t.xword\t";
+
+ AlignmentIsInBytes = false;
+ SupportsDebugInformation = true;
+ CodePointerSize = 8;
+
CommentString = "//";
ExceptionsType = ExceptionHandling::DwarfCFI;
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index afde87b40929..e8570b1c2887 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -34,15 +34,11 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
explicit AArch64MCAsmInfoELF(const Triple &T);
};
-struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF {
- explicit AArch64MCAsmInfoCOFF();
-};
-
-struct AArch64MCAsmInfoMicrosoftCOFF : public AArch64MCAsmInfoCOFF {
+struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
explicit AArch64MCAsmInfoMicrosoftCOFF();
};
-struct AArch64MCAsmInfoGNUCOFF : public AArch64MCAsmInfoCOFF {
+struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
explicit AArch64MCAsmInfoGNUCOFF();
};
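With the shared AArch64MCAsmInfoCOFF base removed, the Microsoft and GNU COFF variants derive from MCAsmInfoMicrosoft and MCAsmInfoGNUCOFF directly and each carries its own copy of the common settings. A hedged sketch of how an AsmInfo factory might pick between the two from the triple; the actual selection logic lives elsewhere in the target and is not part of this hunk, only the two class names above are taken from the patch:

    // Sketch only: choose a COFF AsmInfo variant from the triple. The real
    // factory in AArch64MCTargetDesc.cpp may differ.
    static MCAsmInfo *pickCOFFAsmInfo(const Triple &T) {
      if (T.isWindowsMSVCEnvironment())
        return new AArch64MCAsmInfoMicrosoftCOFF(); // ';' comments, WinEH
      return new AArch64MCAsmInfoGNUCOFF();         // "//" comments, DWARF CFI
    }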
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 33698d2b8c38..41cad48f7aea 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -163,6 +163,13 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint32_t getImm8OptLsl(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getSVEIncDecImm(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
const MCSubtargetInfo &STI) const;
@@ -276,7 +283,8 @@ AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
AArch64MCExpr::VariantKind RefKind = A64E->getKind();
if (RefKind == AArch64MCExpr::VK_TPREL_HI12 ||
- RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+ RefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+ RefKind == AArch64MCExpr::VK_SECREL_HI12)
ShiftVal = 12;
}
return ShiftVal == 0 ? 0 : (1 << ShiftVal);
@@ -508,6 +516,34 @@ AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
return MO.getImm() - 8;
}
+uint32_t
+AArch64MCCodeEmitter::getImm8OptLsl(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Test shift
+ auto ShiftOpnd = MI.getOperand(OpIdx + 1).getImm();
+ assert(AArch64_AM::getShiftType(ShiftOpnd) == AArch64_AM::LSL &&
+ "Unexpected shift type for imm8_opt_lsl immediate.");
+
+ unsigned ShiftVal = AArch64_AM::getShiftValue(ShiftOpnd);
+ assert((ShiftVal == 0 || ShiftVal == 8) &&
+ "Unexpected shift value for imm8_opt_lsl immediate.");
+
+ // Test immediate
+ auto Immediate = MI.getOperand(OpIdx).getImm();
+ return (Immediate & 0xff) | (ShiftVal == 0 ? 0 : (1 << ShiftVal));
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSVEIncDecImm(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value!");
+ // Normalize 1-16 range to 0-15.
+ return MO.getImm() - 1;
+}
+
/// getMoveVecShifterOpValue - Return the encoded value for the vector move
/// shifter (MSL).
uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue(
@@ -571,7 +607,7 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
- support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+ support::endian::write<uint32_t>(OS, Binary, support::little);
++MCNumEmitted; // Keep track of the # of mi's emitted.
}
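The new getImm8OptLsl hook packs the 8-bit immediate into the low byte of the operand field and, for the LSL #8 form, sets the bit above it; this mirrors the return expression in the hunk. A standalone worked example of that packing:

    // Standalone illustration of the getImm8OptLsl packing shown above: the
    // low 8 bits carry the immediate, and an LSL #8 shift sets bit 8.
    #include <cassert>
    #include <cstdint>

    static uint32_t packImm8OptLsl(int64_t Immediate, unsigned ShiftVal) {
      assert(ShiftVal == 0 || ShiftVal == 8);
      return (Immediate & 0xff) | (ShiftVal == 0 ? 0 : (1u << ShiftVal));
    }

    int main() {
      assert(packImm8OptLsl(0x7f, 0) == 0x07f); // no shift: just the byte
      assert(packImm8OptLsl(0x7f, 8) == 0x17f); // LSL #8: bit 8 set as well
      return 0;
    }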
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index f606d272bcb0..cd937935ddbf 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -70,6 +70,8 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:";
case VK_TLSDESC: return "";
case VK_TLSDESC_PAGE: return ":tlsdesc:";
+ case VK_SECREL_LO12: return ":secrel_lo12:";
+ case VK_SECREL_HI12: return ":secrel_hi12:";
default:
llvm_unreachable("Invalid ELF symbol kind");
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 3dbf0f84a665..b6bf254d3835 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -35,6 +35,7 @@ public:
VK_GOTTPREL = 0x005,
VK_TPREL = 0x006,
VK_TLSDESC = 0x007,
+ VK_SECREL = 0x008,
VK_SymLocBits = 0x00f,
// Variants specifying which part of the final address calculation is
@@ -98,6 +99,8 @@ public:
VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF,
VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,
+ VK_SECREL_LO12 = VK_SECREL | VK_PAGEOFF,
+ VK_SECREL_HI12 = VK_SECREL | VK_HI12,
VK_INVALID = 0xfff
};
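VK_SECREL takes a new symbol-location value (0x008) under the VK_SymLocBits mask, and the two usable variants combine it with the existing address-part bits for the low and high 12 bits of a section-relative offset. A small sketch showing how the symbol-location part can be recovered from a composed kind, using only the mask values visible in this hunk (the address-part bits are defined above the low nibble elsewhere in the enum):

    // Sketch: recover the symbol-location nibble from a composed VariantKind
    // using the VK_SymLocBits mask from the enum above.
    #include <cstdint>

    constexpr uint32_t VK_SECREL     = 0x008;
    constexpr uint32_t VK_SymLocBits = 0x00f;

    constexpr bool isSecRel(uint32_t Kind) {
      return (Kind & VK_SymLocBits) == VK_SECREL;
    }
    // Both the :secrel_lo12: and :secrel_hi12: kinds satisfy isSecRel(),
    // since they only add address-part bits outside the low nibble.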
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index c3458d625b83..4ceda7e122f4 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -103,36 +104,61 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createAArch64ELFStreamer(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll);
+ return createAArch64ELFStreamer(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
}
static MCStreamer *createMachOStreamer(MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll,
bool DWARFMustBeAtTheEnd) {
- return createMachOStreamer(Ctx, std::move(TAB), OS, std::move(Emitter),
- RelaxAll, DWARFMustBeAtTheEnd,
+ return createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
+ std::move(Emitter), RelaxAll, DWARFMustBeAtTheEnd,
/*LabelSections*/ true);
}
static MCStreamer *
createWinCOFFStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
bool IncrementalLinkerCompatible) {
- return createAArch64WinCOFFStreamer(Ctx, std::move(TAB), OS,
+ return createAArch64WinCOFFStreamer(Ctx, std::move(TAB), std::move(OW),
std::move(Emitter), RelaxAll,
IncrementalLinkerCompatible);
}
+namespace {
+
+class AArch64MCInstrAnalysis : public MCInstrAnalysis {
+public:
+ AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ // Search for a PC-relative argument.
+ // This will handle instructions like bcc (where the first argument is the
+ // condition code) and cbz (where it is a register).
+ const auto &Desc = Info->get(Inst.getOpcode());
+ for (unsigned i = 0, e = Inst.getNumOperands(); i != e; i++) {
+ if (Desc.OpInfo[i].OperandType == MCOI::OPERAND_PCREL) {
+ int64_t Imm = Inst.getOperand(i).getImm() * 4;
+ Target = Addr + Imm;
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+} // end anonymous namespace
+
static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
- return new MCInstrAnalysis(Info);
+ return new AArch64MCInstrAnalysis(Info);
}
// Force static initialization.
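The new AArch64MCInstrAnalysis::evaluateBranch walks the operand descriptors looking for a PC-relative operand and scales it by 4 (AArch64 instructions are four bytes), so disassembler-style tools can resolve branch targets for b, bl, cbz and conditional branches alike. A minimal usage sketch, assuming the usual LLVM MC headers and an analysis object obtained from the factory above:

    // Sketch only: resolve a branch target through the MCInstrAnalysis object
    // returned by createAArch64InstrAnalysis() above. `Inst` is assumed to be
    // a decoded branch located at address `Addr` with byte size `Size`.
    void printBranchTarget(const MCInstrAnalysis &MIA, const MCInst &Inst,
                           uint64_t Addr, uint64_t Size, raw_ostream &OS) {
      uint64_t Target;
      if (MIA.evaluateBranch(Inst, Addr, Size, Target)) {
        OS << "branch target: 0x";
        OS.write_hex(Target);
        OS << "\n";
      } else {
        OS << "not a resolvable PC-relative branch\n";
      }
    }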
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index a5720e0e8b87..63f50778ccdb 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -26,7 +26,7 @@ class MCContext;
class MCInstrInfo;
class MCInstPrinter;
class MCRegisterInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCStreamer;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -53,16 +53,13 @@ MCAsmBackend *createAArch64beAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createAArch64ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian, bool IsILP32);
+std::unique_ptr<MCObjectTargetWriter>
+createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32);
-std::unique_ptr<MCObjectWriter>
-createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype);
-std::unique_ptr<MCObjectWriter>
-createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS);
+std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter();
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 55151c2b8d21..1021cdeeb3be 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -306,39 +306,24 @@ void AArch64MachObjectWriter::recordRelocation(
bool CanUseLocalRelocation =
canUseLocalRelocation(Section, *Symbol, Log2Size);
if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
+ // Make sure that the symbol is actually in a section here. If it isn't,
+ // emit an error and exit.
+ if (!Symbol->isInSection()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + Symbol->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ return;
+ }
const MCSection &Sec = Symbol->getSection();
if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
Symbol->setUsedInReloc();
}
const MCSymbol *Base = Asm.getAtom(*Symbol);
-
- // If the symbol is a variable and we weren't able to get a Base for it
- // (i.e., it's not in the symbol table associated with a section) resolve
- // the relocation based its expansion instead.
- if (Symbol->isVariable() && !Base) {
- // If the evaluation is an absolute value, just use that directly
- // to keep things easy.
- int64_t Res;
- if (Symbol->getVariableValue()->evaluateAsAbsolute(
- Res, Layout, Writer->getSectionAddressMap())) {
- FixedValue = Res;
- return;
- }
-
- // FIXME: Will the Target we already have ever have any data in it
- // we need to preserve and merge with the new Target? How about
- // the FixedValue?
- if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout,
- &Fixup)) {
- Asm.getContext().reportError(Fixup.getLoc(),
- "unable to resolve variable '" +
- Symbol->getName() + "'");
- return;
- }
- return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
- FixedValue);
- }
+ // If the symbol is a variable it can either be in a section and
+ // we have a base or it is absolute and should have been expanded.
+ assert(!Symbol->isVariable() || Base);
// Relocations inside debug sections always use local relocations when
// possible. This seems to be done because the debugger doesn't fully
@@ -377,19 +362,8 @@ void AArch64MachObjectWriter::recordRelocation(
Value -= Writer->getFragmentAddress(Fragment, Layout) +
Fixup.getOffset() + (1ULL << Log2Size);
} else {
- // Resolve constant variables.
- if (Symbol->isVariable()) {
- int64_t Res;
- if (Symbol->getVariableValue()->evaluateAsAbsolute(
- Res, Layout, Writer->getSectionAddressMap())) {
- FixedValue = Res;
- return;
- }
- }
- Asm.getContext().reportError(Fixup.getLoc(),
- "unsupported relocation of variable '" +
- Symbol->getName() + "'");
- return;
+ llvm_unreachable(
+ "This constant variable should have been expanded during evaluation");
}
}
@@ -430,10 +404,7 @@ void AArch64MachObjectWriter::recordRelocation(
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType,
- uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) {
+ return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype);
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index d06c5e8862ae..7ea7d5f2a20e 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,6 +8,7 @@
//===---------------------------------------------------------------------===//
#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmBackend.h"
@@ -46,6 +47,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
bool IsCrossSection, const MCAsmBackend &MAB) const {
auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None
: Target.getSymA()->getKind();
+ const MCExpr *Expr = Fixup.getValue();
switch (static_cast<unsigned>(Fixup.getKind())) {
default: {
@@ -73,6 +75,13 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
return COFF::IMAGE_REL_ARM64_SECREL;
case AArch64::fixup_aarch64_add_imm12:
+ if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+ AArch64MCExpr::VariantKind RefKind = A64E->getKind();
+ if (RefKind == AArch64MCExpr::VK_SECREL_LO12)
+ return COFF::IMAGE_REL_ARM64_SECREL_LOW12A;
+ if (RefKind == AArch64MCExpr::VK_SECREL_HI12)
+ return COFF::IMAGE_REL_ARM64_SECREL_HIGH12A;
+ }
return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A;
case AArch64::fixup_aarch64_ldst_imm12_scale1:
@@ -80,11 +89,25 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
case AArch64::fixup_aarch64_ldst_imm12_scale4:
case AArch64::fixup_aarch64_ldst_imm12_scale8:
case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+ AArch64MCExpr::VariantKind RefKind = A64E->getKind();
+ if (RefKind == AArch64MCExpr::VK_SECREL_LO12)
+ return COFF::IMAGE_REL_ARM64_SECREL_LOW12L;
+ }
return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L;
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ return COFF::IMAGE_REL_ARM64_REL21;
+
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21;
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ return COFF::IMAGE_REL_ARM64_BRANCH14;
+
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ return COFF::IMAGE_REL_ARM64_BRANCH19;
+
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
return COFF::IMAGE_REL_ARM64_BRANCH26;
@@ -97,10 +120,8 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
namespace llvm {
-std::unique_ptr<MCObjectWriter>
-createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) {
- auto MOTW = llvm::make_unique<AArch64WinCOFFObjectWriter>();
- return createWinCOFFObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter() {
+ return llvm::make_unique<AArch64WinCOFFObjectWriter>();
}
} // end namespace llvm
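For the add-immediate and scaled load/store fixups the writer now inspects the AArch64MCExpr variant to distinguish the new :secrel_lo12:/:secrel_hi12: forms from the usual page-offset relocations. A condensed sketch of that decision for the add-immediate case, using only kinds and relocation names that appear in the hunk (assumes the AArch64MCExpr.h and BinaryFormat/COFF.h headers):

    // Sketch: relocation choice for fixup_aarch64_add_imm12, condensed from
    // the switch above. `RefKind` is the AArch64MCExpr variant attached to
    // the fixup value; any other kind falls through to the page offset.
    static unsigned relocForAddImm12(AArch64MCExpr::VariantKind RefKind) {
      if (RefKind == AArch64MCExpr::VK_SECREL_LO12)
        return COFF::IMAGE_REL_ARM64_SECREL_LOW12A;
      if (RefKind == AArch64MCExpr::VK_SECREL_HI12)
        return COFF::IMAGE_REL_ARM64_SECREL_HIGH12A;
      return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A;
    }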
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index c88363d2c250..9871dc553bed 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -10,6 +10,7 @@
#include "AArch64WinCOFFStreamer.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
@@ -21,8 +22,8 @@ public:
AArch64WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
std::unique_ptr<MCCodeEmitter> CE,
- raw_pwrite_stream &OS)
- : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void FinishImpl() override;
};
@@ -37,10 +38,10 @@ void AArch64WinCOFFStreamer::FinishImpl() {
namespace llvm {
MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IncrementalLinkerCompatible) {
auto *S = new AArch64WinCOFFStreamer(Context, std::move(MAB),
- std::move(Emitter), OS);
+ std::move(Emitter), std::move(OW));
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
index b67a19e883e9..c05422163584 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -35,7 +35,7 @@ namespace llvm {
MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IncrementalLinkerCompatible);
} // end llvm namespace
diff --git a/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td b/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 15c1275f259d..17b3f6041279 100644
--- a/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11,6 +11,934 @@
//
//===----------------------------------------------------------------------===//
+def SVEPatternOperand : AsmOperandClass {
+ let Name = "SVEPattern";
+ let ParserMethod = "tryParseSVEPattern";
+ let PredicateMethod = "isSVEPattern";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidSVEPattern";
+}
+
+def sve_pred_enum : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 32);
+ }]> {
+
+ let PrintMethod = "printSVEPattern";
+ let ParserMatchClass = SVEPatternOperand;
+}
+
+def SVEPrefetchOperand : AsmOperandClass {
+ let Name = "SVEPrefetch";
+ let ParserMethod = "tryParsePrefetch<true>";
+ let PredicateMethod = "isPrefetch";
+ let RenderMethod = "addPrefetchOperands";
+}
+
+def sve_prfop : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) <= 15);
+ }]> {
+ let PrintMethod = "printPrefetchOp<true>";
+ let ParserMatchClass = SVEPrefetchOperand;
+}
+
+class SVELogicalImmOperand<int Width> : AsmOperandClass {
+ let Name = "SVELogicalImm" # Width;
+ let DiagnosticType = "LogicalSecondSource";
+ let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
+ let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
+}
+
+def sve_logical_imm8 : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmOperand<8>;
+ let PrintMethod = "printLogicalImm<int8_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int8_t>(Val);
+ }];
+}
+
+def sve_logical_imm16 : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmOperand<16>;
+ let PrintMethod = "printLogicalImm<int16_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val);
+ }];
+}
+
+def sve_logical_imm32 : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmOperand<32>;
+ let PrintMethod = "printLogicalImm<int32_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val);
+ }];
+}
+
+class SVEPreferredLogicalImmOperand<int Width> : AsmOperandClass {
+ let Name = "SVEPreferredLogicalImm" # Width;
+ let PredicateMethod = "isSVEPreferredLogicalImm<int" # Width # "_t>";
+ let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
+}
+
+def sve_preferred_logical_imm16 : Operand<i64> {
+ let ParserMatchClass = SVEPreferredLogicalImmOperand<16>;
+ let PrintMethod = "printSVELogicalImm<int16_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val) &&
+ AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
+ }];
+}
+
+def sve_preferred_logical_imm32 : Operand<i64> {
+ let ParserMatchClass = SVEPreferredLogicalImmOperand<32>;
+ let PrintMethod = "printSVELogicalImm<int32_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val) &&
+ AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
+ }];
+}
+
+def sve_preferred_logical_imm64 : Operand<i64> {
+ let ParserMatchClass = SVEPreferredLogicalImmOperand<64>;
+ let PrintMethod = "printSVELogicalImm<int64_t>";
+
+ let MCOperandPredicate = [{
+ if (!MCOp.isImm())
+ return false;
+ int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
+ return AArch64_AM::isSVEMaskOfIdenticalElements<int64_t>(Val) &&
+ AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
+ }];
+}
+
+class SVELogicalImmNotOperand<int Width> : AsmOperandClass {
+ let Name = "SVELogicalImm" # Width # "Not";
+ let DiagnosticType = "LogicalSecondSource";
+ let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
+ let RenderMethod = "addLogicalImmNotOperands<int" # Width # "_t>";
+}
+
+def sve_logical_imm8_not : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmNotOperand<8>;
+}
+
+def sve_logical_imm16_not : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmNotOperand<16>;
+}
+
+def sve_logical_imm32_not : Operand<i64> {
+ let ParserMatchClass = SVELogicalImmNotOperand<32>;
+}
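The sve_logical_imm* operands accept a 64-bit logical immediate only when the decoded value is a splat of identical 8-, 16- or 32-bit elements, per the MCOperandPredicate blocks above. A standalone sketch of the "identical elements" test for the 16-bit case; isSVEMaskOfIdenticalElements itself is defined elsewhere in the target, so this is an illustration rather than the actual helper:

    // Illustration of the "mask of identical elements" check used by the
    // MCOperandPredicate blocks above, specialised for 16-bit elements.
    #include <cassert>
    #include <cstdint>

    static bool hasIdentical16BitElements(uint64_t Val) {
      uint16_t First = static_cast<uint16_t>(Val);
      for (int i = 1; i < 4; ++i)
        if (static_cast<uint16_t>(Val >> (16 * i)) != First)
          return false;
      return true;
    }

    int main() {
      assert(hasIdentical16BitElements(0x00ff00ff00ff00ffULL));  // 0x00ff splat
      assert(!hasIdentical16BitElements(0x00ff00ff00ff0000ULL)); // mixed elements
      return 0;
    }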
+
+class SVEShiftedImmOperand<int ElementWidth, string Infix, string Predicate>
+ : AsmOperandClass {
+ let Name = "SVE" # Infix # "Imm" # ElementWidth;
+ let DiagnosticType = "Invalid" # Name;
+ let RenderMethod = "addImmWithOptionalShiftOperands<8>";
+ let ParserMethod = "tryParseImmWithOptionalShift";
+ let PredicateMethod = Predicate;
+}
+
+def SVECpyImmOperand8 : SVEShiftedImmOperand<8, "Cpy", "isSVECpyImm<int8_t>">;
+def SVECpyImmOperand16 : SVEShiftedImmOperand<16, "Cpy", "isSVECpyImm<int16_t>">;
+def SVECpyImmOperand32 : SVEShiftedImmOperand<32, "Cpy", "isSVECpyImm<int32_t>">;
+def SVECpyImmOperand64 : SVEShiftedImmOperand<64, "Cpy", "isSVECpyImm<int64_t>">;
+
+def SVEAddSubImmOperand8 : SVEShiftedImmOperand<8, "AddSub", "isSVEAddSubImm<int8_t>">;
+def SVEAddSubImmOperand16 : SVEShiftedImmOperand<16, "AddSub", "isSVEAddSubImm<int16_t>">;
+def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<int32_t>">;
+def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;
+
+class imm8_opt_lsl<int ElementWidth, string printType,
+ AsmOperandClass OpndClass, code Predicate>
+ : Operand<i32>, ImmLeaf<i32, Predicate> {
+ let EncoderMethod = "getImm8OptLsl";
+ let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
+ let PrintMethod = "printImm8OptLsl<" # printType # ">";
+ let ParserMatchClass = OpndClass;
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8, [{
+ return AArch64_AM::isSVECpyImm<int8_t>(Imm);
+}]>;
+def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{
+ return AArch64_AM::isSVECpyImm<int16_t>(Imm);
+}]>;
+def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{
+ return AArch64_AM::isSVECpyImm<int32_t>(Imm);
+}]>;
+def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{
+ return AArch64_AM::isSVECpyImm<int64_t>(Imm);
+}]>;
+
+def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{
+ return AArch64_AM::isSVEAddSubImm<int8_t>(Imm);
+}]>;
+def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{
+ return AArch64_AM::isSVEAddSubImm<int16_t>(Imm);
+}]>;
+def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{
+ return AArch64_AM::isSVEAddSubImm<int32_t>(Imm);
+}]>;
+def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{
+ return AArch64_AM::isSVEAddSubImm<int64_t>(Imm);
+}]>;
+
+class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
+ let Name = "SVEExactFPImmOperand" # Suffix;
+ let DiagnosticType = "Invalid" # Name;
+ let ParserMethod = "tryParseFPImm<false>";
+ let PredicateMethod = "isExactFPImm<" # ValA # ", " # ValB # ">";
+ let RenderMethod = "addExactFPImmOperands<" # ValA # ", " # ValB # ">";
+}
+
+class SVEExactFPImmOperand<string Suffix, string ValA, string ValB> : Operand<i32> {
+ let PrintMethod = "printExactFPImm<" # ValA # ", " # ValB # ">";
+ let ParserMatchClass = SVEExactFPImm<Suffix, ValA, ValB>;
+}
+
+def sve_fpimm_half_one
+ : SVEExactFPImmOperand<"HalfOne", "AArch64ExactFPImm::half",
+ "AArch64ExactFPImm::one">;
+def sve_fpimm_half_two
+ : SVEExactFPImmOperand<"HalfTwo", "AArch64ExactFPImm::half",
+ "AArch64ExactFPImm::two">;
+def sve_fpimm_zero_one
+ : SVEExactFPImmOperand<"ZeroOne", "AArch64ExactFPImm::zero",
+ "AArch64ExactFPImm::one">;
+
+def sve_incdec_imm : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let ParserMatchClass = Imm1_16Operand;
+ let EncoderMethod = "getSVEIncDecImm";
+ let DecoderMethod = "DecodeSVEIncDecImm";
+}
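sve_incdec_imm accepts the assembly range 1-16 and, per the getSVEIncDecImm encoder earlier in this patch, stores the value biased down by one so it fits the 4-bit imm4 field; the decoder applies the inverse. A tiny worked example of the round trip, with illustrative helper names rather than the names used by the generated code:

    // Round trip for the SVE inc/dec multiplier: assembly accepts 1-16, the
    // 4-bit field stores 0-15 (see getSVEIncDecImm above).
    #include <cassert>
    #include <cstdint>

    static uint32_t encodeIncDecImm(int64_t Imm)   { return uint32_t(Imm - 1); }
    static int64_t  decodeIncDecImm(uint32_t Field) { return int64_t(Field) + 1; }

    int main() {
      for (int64_t Imm = 1; Imm <= 16; ++Imm) {
        assert(encodeIncDecImm(Imm) <= 15);
        assert(decodeIncDecImm(encodeIncDecImm(Imm)) == Imm);
      }
      return 0;
    }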
+
+//===----------------------------------------------------------------------===//
+// SVE PTrue - These are used extensively throughout the pattern matching so
+// it's important we define them first.
+//===----------------------------------------------------------------------===//
+
+class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty>
+: I<(outs pprty:$Pd), (ins sve_pred_enum:$pattern),
+ asm, "\t$Pd, $pattern",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<5> pattern;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b011;
+ let Inst{18-17} = opc{2-1};
+ let Inst{16} = opc{0};
+ let Inst{15-10} = 0b111000;
+ let Inst{9-5} = pattern;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pd;
+
+ let Defs = !if(!eq (opc{0}, 1), [NZCV], []);
+}
+
+multiclass sve_int_ptrue<bits<3> opc, string asm> {
+ def _B : sve_int_ptrue<0b00, opc, asm, PPR8>;
+ def _H : sve_int_ptrue<0b01, opc, asm, PPR16>;
+ def _S : sve_int_ptrue<0b10, opc, asm, PPR32>;
+ def _D : sve_int_ptrue<0b11, opc, asm, PPR64>;
+
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _B) PPR8:$Pd, 0b11111), 1>;
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _H) PPR16:$Pd, 0b11111), 1>;
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _S) PPR32:$Pd, 0b11111), 1>;
+ def : InstAlias<asm # "\t$Pd",
+ (!cast<Instruction>(NAME # _D) PPR64:$Pd, 0b11111), 1>;
+}
+
+let Predicates = [HasSVE] in {
+ defm PTRUE : sve_int_ptrue<0b000, "ptrue">;
+ defm PTRUES : sve_int_ptrue<0b001, "ptrues">;
+}
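The InstAlias definitions in sve_int_ptrue supply 0b11111 as the default pattern, so a bare "ptrue Pd" form uses the all-elements pattern. Because the class fixes every other field, the encoding is easy to reconstruct by hand; a sketch that packs the 32-bit word from the bit layout given above (field packing only, not the real emitter):

    // Sketch: pack a PTRUE/PTRUES encoding from the sve_int_ptrue layout above.
    // sz8_64 selects the element size, opc is 0b000 for ptrue and 0b001 for
    // ptrues, and pattern defaults to 0b11111 via the InstAlias definitions.
    #include <cstdint>

    static uint32_t encodePTrue(uint32_t sz8_64, uint32_t opc, uint32_t pattern,
                                uint32_t Pd) {
      uint32_t Inst = 0;
      Inst |= 0b00100101u << 24;        // Inst{31-24}
      Inst |= (sz8_64 & 0x3) << 22;     // Inst{23-22}
      Inst |= 0b011u << 19;             // Inst{21-19}
      Inst |= ((opc >> 1) & 0x3) << 17; // Inst{18-17} = opc{2-1}
      Inst |= (opc & 0x1) << 16;        // Inst{16}    = opc{0}
      Inst |= 0b111000u << 10;          // Inst{15-10}
      Inst |= (pattern & 0x1f) << 5;    // Inst{9-5}
      Inst |= (Pd & 0xf);               // Inst{4} = 0, Inst{3-0} = Pd
      return Inst;                      // e.g. (0,0,31,0) -> 0x2518e3e0
    }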
+
+
+//===----------------------------------------------------------------------===//
+// SVE Predicate Count Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_count_r<bits<2> sz8_64, bits<5> opc, string asm,
+ RegisterOperand dty, PPRRegOp pprty, RegisterOperand sty>
+: I<(outs dty:$Rdn), (ins pprty:$Pg, sty:$_Rdn),
+ asm, "\t$Rdn, $Pg",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rdn;
+ bits<4> Pg;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b101;
+ let Inst{18-16} = opc{4-2};
+ let Inst{15-11} = 0b10001;
+ let Inst{10-9} = opc{1-0};
+ let Inst{8-5} = Pg;
+ let Inst{4-0} = Rdn;
+
+ // Signed 32bit forms require their GPR operand printed.
+ let AsmString = !if(!eq(opc{4,2-0}, 0b0000),
+ !strconcat(asm, "\t$Rdn, $Pg, $_Rdn"),
+ !strconcat(asm, "\t$Rdn, $Pg"));
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_count_r_s32<bits<5> opc, string asm> {
+ def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64as32>;
+ def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64as32>;
+ def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64as32>;
+ def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64as32>;
+}
+
+multiclass sve_int_count_r_u32<bits<5> opc, string asm> {
+ def _B : sve_int_count_r<0b00, opc, asm, GPR32z, PPR8, GPR32z>;
+ def _H : sve_int_count_r<0b01, opc, asm, GPR32z, PPR16, GPR32z>;
+ def _S : sve_int_count_r<0b10, opc, asm, GPR32z, PPR32, GPR32z>;
+ def _D : sve_int_count_r<0b11, opc, asm, GPR32z, PPR64, GPR32z>;
+}
+
+multiclass sve_int_count_r_x64<bits<5> opc, string asm> {
+ def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
+ def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
+ def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
+ def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64z>;
+}
+
+class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg),
+ asm, "\t$Zdn, $Pg",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b101;
+ let Inst{18-16} = opc{4-2};
+ let Inst{15-11} = 0b10000;
+ let Inst{10-9} = opc{1-0};
+ let Inst{8-5} = Pg;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_count_v<bits<5> opc, string asm> {
+ def _H : sve_int_count_v<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_count_v<0b10, opc, asm, ZPR32>;
+ def _D : sve_int_count_v<0b11, opc, asm, ZPR64>;
+}
+
+class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
+ PPRRegOp pprty>
+: I<(outs GPR64:$Rd), (ins PPRAny:$Pg, pprty:$Pn),
+ asm, "\t$Rd, $Pg, $Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<4> Pn;
+ bits<5> Rd;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b100;
+ let Inst{18-16} = opc{3-1};
+ let Inst{15-14} = 0b10;
+ let Inst{13-10} = Pg;
+ let Inst{9} = opc{0};
+ let Inst{8-5} = Pn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass sve_int_pcount_pred<bits<4> opc, string asm> {
+ def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>;
+ def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>;
+ def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>;
+ def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Element Count Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_count<bits<3> opc, string asm>
+: I<(outs GPR64:$Rd), (ins sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Rd, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rd;
+ bits<4> imm4;
+ bits<5> pattern;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{2-1};
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = imm4;
+ let Inst{15-11} = 0b11100;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Rd;
+}
+
+multiclass sve_int_count<bits<3> opc, string asm> {
+ def NAME : sve_int_count<opc, asm>;
+
+ def : InstAlias<asm # "\t$Rd, $pattern",
+ (!cast<Instruction>(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rd",
+ (!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>;
+}
+
+class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Zdn, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> pattern;
+ bits<4> imm4;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{4-3};
+ let Inst{21} = 0b1;
+ let Inst{20} = opc{2};
+ let Inst{19-16} = imm4;
+ let Inst{15-12} = 0b1100;
+ let Inst{11-10} = opc{1-0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty> {
+ def NAME : sve_int_countvlv<opc, asm, zprty>;
+
+ def : InstAlias<asm # "\t$Zdn, $pattern",
+ (!cast<Instruction>(NAME) zprty:$Zdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Zdn",
+ (!cast<Instruction>(NAME) zprty:$Zdn, 0b11111, 1), 2>;
+}
+
+class sve_int_pred_pattern_a<bits<3> opc, string asm>
+: I<(outs GPR64:$Rdn), (ins GPR64:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Rdn, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rdn;
+ bits<5> pattern;
+ bits<4> imm4;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{2-1};
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm4;
+ let Inst{15-11} = 0b11100;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Rdn;
+
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_pred_pattern_a<bits<3> opc, string asm> {
+ def NAME : sve_int_pred_pattern_a<opc, asm>;
+
+ def : InstAlias<asm # "\t$Rdn, $pattern",
+ (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rdn",
+ (!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
+}
+
+class sve_int_pred_pattern_b<bits<5> opc, string asm, RegisterOperand dt,
+ RegisterOperand st>
+: I<(outs dt:$Rdn), (ins st:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
+ asm, "\t$Rdn, $pattern, mul $imm4",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rdn;
+ bits<5> pattern;
+ bits<4> imm4;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{4-3};
+ let Inst{21} = 0b1;
+ let Inst{20} = opc{2};
+ let Inst{19-16} = imm4;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-10} = opc{1-0};
+ let Inst{9-5} = pattern;
+ let Inst{4-0} = Rdn;
+
+ // Signed 32bit forms require their GPR operand printed.
+ let AsmString = !if(!eq(opc{2,0}, 0b00),
+ !strconcat(asm, "\t$Rdn, $_Rdn, $pattern, mul $imm4"),
+ !strconcat(asm, "\t$Rdn, $pattern, mul $imm4"));
+
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_pred_pattern_b_s32<bits<5> opc, string asm> {
+ def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64as32>;
+
+ def : InstAlias<asm # "\t$Rd, $Rn, $pattern",
+ (!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rd, $Rn",
+ (!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, 0b11111, 1), 2>;
+}
+
+multiclass sve_int_pred_pattern_b_u32<bits<5> opc, string asm> {
+ def NAME : sve_int_pred_pattern_b<opc, asm, GPR32z, GPR32z>;
+
+ def : InstAlias<asm # "\t$Rdn, $pattern",
+ (!cast<Instruction>(NAME) GPR32z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rdn",
+ (!cast<Instruction>(NAME) GPR32z:$Rdn, 0b11111, 1), 2>;
+}
+
+multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> {
+ def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64z>;
+
+ def : InstAlias<asm # "\t$Rdn, $pattern",
+ (!cast<Instruction>(NAME) GPR64z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rdn",
+ (!cast<Instruction>(NAME) GPR64z:$Rdn, 0b11111, 1), 2>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Permute - Cross Lane Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
+ asm, "\t$Zd, $Rn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b100000001110;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_dup_r<string asm> {
+ def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>;
+ def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>;
+ def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>;
+ def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>;
+
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Rn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, GPR64sp:$Rn), 1>;
+}
+
+class sve_int_perm_dup_i<bits<5> tsz, Operand immtype, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$idx),
+ asm, "\t$Zd, $Zn$idx",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<7> idx;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = {?,?}; // imm3h
+ let Inst{21} = 0b1;
+ let Inst{20-16} = tsz;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_dup_i<string asm> {
+ def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_b, asm, ZPR8> {
+ let Inst{23-22} = idx{5-4};
+ let Inst{20-17} = idx{3-0};
+ }
+ def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_h, asm, ZPR16> {
+ let Inst{23-22} = idx{4-3};
+ let Inst{20-18} = idx{2-0};
+ }
+ def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_s, asm, ZPR32> {
+ let Inst{23-22} = idx{3-2};
+ let Inst{20-19} = idx{1-0};
+ }
+ def _D : sve_int_perm_dup_i<{?,1,0,0,0}, sve_elm_idx_extdup_d, asm, ZPR64> {
+ let Inst{23-22} = idx{2-1};
+ let Inst{20} = idx{0};
+ }
+ def _Q : sve_int_perm_dup_i<{1,0,0,0,0}, sve_elm_idx_extdup_q, asm, ZPR128> {
+ let Inst{23-22} = idx{1-0};
+ }
+
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_b:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_h:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_s:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, sve_elm_idx_extdup_d:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Zn$idx",
+ (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, ZPR128:$Zn, sve_elm_idx_extdup_q:$idx), 1>;
+ def : InstAlias<"mov $Zd, $Bn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, FPR8asZPR:$Bn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Hn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, FPR16asZPR:$Hn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Sn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, FPR32asZPR:$Sn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Dn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>;
+ def : InstAlias<"mov $Zd, $Qn",
+ (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
+}
+
+class sve_int_perm_tbl<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterOperand VecList>
+: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b001100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_tbl<string asm> {
+ def _B : sve_int_perm_tbl<0b00, asm, ZPR8, Z_b>;
+ def _H : sve_int_perm_tbl<0b01, asm, ZPR16, Z_h>;
+ def _S : sve_int_perm_tbl<0b10, asm, ZPR32, Z_s>;
+ def _D : sve_int_perm_tbl<0b11, asm, ZPR64, Z_d>;
+
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
+}
+
+class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn),
+ asm, "\t$Zd, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b111000001110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_reverse_z<string asm> {
+ def _B : sve_int_perm_reverse_z<0b00, asm, ZPR8>;
+ def _H : sve_int_perm_reverse_z<0b01, asm, ZPR16>;
+ def _S : sve_int_perm_reverse_z<0b10, asm, ZPR32>;
+ def _D : sve_int_perm_reverse_z<0b11, asm, ZPR64>;
+}
+
+class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty>
+: I<(outs pprty:$Pd), (ins pprty:$Pn),
+ asm, "\t$Pd, $Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-9} = 0b1101000100000;
+ let Inst{8-5} = Pn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pd;
+}
+
+multiclass sve_int_perm_reverse_p<string asm> {
+ def _B : sve_int_perm_reverse_p<0b00, asm, PPR8>;
+ def _H : sve_int_perm_reverse_p<0b01, asm, PPR16>;
+ def _S : sve_int_perm_reverse_p<0b10, asm, PPR32>;
+ def _D : sve_int_perm_reverse_p<0b11, asm, PPR64>;
+}
+
+class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
+ asm, "\t$Zd, $Zn",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz16_64;
+ let Inst{21-18} = 0b1100;
+ let Inst{17-16} = opc;
+ let Inst{15-10} = 0b001110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_unpk<bits<2> opc, string asm> {
+ def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
+ def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
+ def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
+}
+
+class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Rm),
+ asm, "\t$Zdn, $Rm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rm;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b100100001110;
+ let Inst{9-5} = Rm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_insrs<string asm> {
+ def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>;
+ def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>;
+ def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>;
+ def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>;
+}
+
+class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
+ asm, "\t$Zdn, $Vm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Vm;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-10} = 0b110100001110;
+ let Inst{9-5} = Vm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_insrv<string asm> {
+ def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Permute - Extract Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_extract_i<string asm>
+: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, imm0_255:$imm8),
+ asm, "\t$Zdn, $_Zdn, $Zm, $imm8",
+ "", []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bits<8> imm8;
+ let Inst{31-21} = 0b00000101001;
+ let Inst{20-16} = imm8{7-3};
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = imm8{2-0};
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Vector Select Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_sel_vvv<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins PPRAny:$Pg, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Pg, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b11;
+ let Inst{13-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_sel_vvv<string asm> {
+ def _B : sve_int_sel_vvv<0b00, asm, ZPR8>;
+ def _H : sve_int_sel_vvv<0b01, asm, ZPR16>;
+ def _S : sve_int_sel_vvv<0b10, asm, ZPR32>;
+ def _D : sve_int_sel_vvv<0b11, asm, ZPR64>;
+
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, ZPR16:$Zn, ZPR16:$Zd), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, ZPR32:$Zn, ZPR32:$Zd), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Zn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, ZPR64:$Zn, ZPR64:$Zd), 1>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Predicate Logical Operations Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_pred_log<bits<4> opc, string asm>
+: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
+ asm, "\t$Pd, $Pg/z, $Pn, $Pm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pg;
+ bits<4> Pm;
+ bits<4> Pn;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = opc{3-2};
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = Pm;
+ let Inst{15-14} = 0b01;
+ let Inst{13-10} = Pg;
+ let Inst{9} = opc{1};
+ let Inst{8-5} = Pn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ // SEL has no predication qualifier.
+ let AsmString = !if(!eq(opc, 0b0011),
+ !strconcat(asm, "\t$Pd, $Pg, $Pn, $Pm"),
+ !strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm"));
+
+ let Defs = !if(!eq (opc{2}, 1), [NZCV], []);
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Logical Mask Immediate Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_log_imm<bits<2> opc, string asm>
+: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, logical_imm64:$imms13),
+ asm, "\t$Zdn, $_Zdn, $imms13",
+ "", []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<13> imms13;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = opc;
+ let Inst{21-18} = 0b0000;
+ let Inst{17-5} = imms13;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+ let DecoderMethod = "DecodeSVELogicalImmInstruction";
+}
+
+multiclass sve_int_log_imm<bits<2> opc, string asm, string alias> {
+ def NAME : sve_int_log_imm<opc, asm>;
+
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>;
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16:$imm), 3>;
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32:$imm), 2>;
+
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8_not:$imm), 0>;
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16_not:$imm), 0>;
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32_not:$imm), 0>;
+ def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
+ (!cast<Instruction>(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>;
+}
+
+class sve_int_dup_mask_imm<string asm>
+: I<(outs ZPR64:$Zd), (ins logical_imm64:$imms),
+ asm, "\t$Zd, $imms",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<13> imms;
+ let Inst{31-18} = 0b00000101110000;
+ let Inst{17-5} = imms;
+ let Inst{4-0} = Zd;
+
+ let isReMaterializable = 1;
+ let DecoderMethod = "DecodeSVELogicalImmInstruction";
+}
+
+multiclass sve_int_dup_mask_imm<string asm> {
+ def NAME : sve_int_dup_mask_imm<asm>;
+
+ def : InstAlias<"dupm $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR8:$Zd, sve_logical_imm8:$imm), 4>;
+ def : InstAlias<"dupm $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zd, sve_logical_imm16:$imm), 3>;
+ def : InstAlias<"dupm $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zd, sve_logical_imm32:$imm), 2>;
+
+ // All Zd.b forms have a CPY/DUP equivalent, hence no byte alias here.
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR16:$Zd, sve_preferred_logical_imm16:$imm), 7>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Unpredicated Group.
//===----------------------------------------------------------------------===//
@@ -41,6 +969,408 @@ multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm> {
}
//===----------------------------------------------------------------------===//
+// SVE Floating Point Arithmetic - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty,
+ Operand imm_ty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, imm_ty:$i1),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $i1",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bit i1;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b011;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-6} = 0b0000;
+ let Inst{5} = i1;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> {
+ def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
+ def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
+ def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
+}
+
+class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_2op_p_zds<bits<4> opc, string asm> {
+ def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
+}
+
+class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm0_7:$imm3),
+ asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bits<3> imm3;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b010;
+ let Inst{18-16} = imm3;
+ let Inst{15-10} = 0b100000;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_ftmad<string asm> {
+ def _H : sve_fp_ftmad<0b01, asm, ZPR16>;
+ def _S : sve_fp_ftmad<0b10, asm, ZPR32>;
+ def _D : sve_fp_ftmad<0b11, asm, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Arithmetic - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
+ def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Fused Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zda, $Pg/m, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zda;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm> {
+ def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
+}
+
+class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
+ asm, "\t$Zdn, $Pg/m, $Zm, $Za",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Za;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Za;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm> {
+ def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Multiply-Add - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
+ ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand itype>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty1:$Zn, zprty2:$Zm, itype:$iop),
+ asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-11} = 0;
+ let Inst{10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm> {
+ def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{22} = iop{2};
+ let Inst{20-19} = iop{1-0};
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD> {
+ bits<4> Zm;
+ bit iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Multiply - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fmul_by_indexed_elem<bits<2> sz, string asm, ZPRRegOp zprty,
+ ZPRRegOp zprty2, Operand itype>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),
+ asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_fp_fmul_by_indexed_elem<string asm> {
+ def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{22} = iop{2};
+ let Inst{20-19} = iop{1-0};
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD> {
+ bits<4> Zm;
+ bit iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Complex Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm,
+ complexrotateop:$imm),
+ asm, "\t$Zda, $Pg/m, $Zn, $Zm, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zm;
+ bits<2> imm;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0;
+ let Inst{14-13} = imm;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_fcmla<string asm> {
+ def _H : sve_fp_fcmla<0b01, asm, ZPR16>;
+ def _S : sve_fp_fcmla<0b10, asm, ZPR32>;
+ def _D : sve_fp_fcmla<0b11, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Complex Multiply-Add - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
+ ZPRRegOp zprty,
+ ZPRRegOp zprty2, Operand itype>
+: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty2:$Zm, itype:$iop,
+ complexrotateop:$imm),
+ asm, "\t$Zda, $Zn, $Zm$iop, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<2> imm;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-12} = 0b0001;
+ let Inst{11-10} = imm;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_fp_fcmla_by_indexed_elem<string asm> {
+ def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD> {
+ bits<4> Zm;
+ bits<1> iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Complex Addition Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm,
+ complexrotateopodd:$imm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bits<3> Pg;
+ bit imm;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21-17} = 0;
+ let Inst{16} = imm;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_fp_fcadd<string asm> {
+ def _H : sve_fp_fcadd<0b01, asm, ZPR16>;
+ def _S : sve_fp_fcadd<0b10, asm, ZPR32>;
+ def _D : sve_fp_fcadd<0b11, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Stack Allocation Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_arith_vl<bit opc, string asm>
+: I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6),
+ asm, "\t$Rd, $Rn, $imm6",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> imm6;
+ let Inst{31-23} = 0b000001000;
+ let Inst{22} = opc;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rn;
+ let Inst{15-11} = 0b01010;
+ let Inst{10-5} = imm6;
+ let Inst{4-0} = Rd;
+}
+
+class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
+: I<(outs GPR64:$Rd), (ins simm6_32b:$imm6),
+ asm, "\t$Rd, $imm6",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rd;
+ bits<6> imm6;
+ let Inst{31-23} = 0b000001001;
+ let Inst{22} = op;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = opc2{4-0};
+ let Inst{15-11} = 0b01010;
+ let Inst{10-5} = imm6;
+ let Inst{4-0} = Rd;
+}
+
+//===----------------------------------------------------------------------===//
// SVE Permute - In Lane Group
//===----------------------------------------------------------------------===//
@@ -71,6 +1401,1442 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm> {
}
//===----------------------------------------------------------------------===//
+// SVE Floating Point Unary Operations Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
+ RegisterOperand o_zprtype>
+: I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = opc{6-5};
+ let Inst{21} = 0b0;
+ let Inst{20-16} = opc{4-0};
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm> {
+ def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16>;
+ def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32>;
+ def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Unary Operations - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_u_zd<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn),
+ asm, "\t$Zd, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b001;
+ let Inst{18-16} = opc;
+ let Inst{15-10} = 0b001100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_fp_2op_u_zd<bits<3> opc, string asm> {
+ def _H : sve_fp_2op_u_zd<0b01, opc, asm, ZPR16>;
+ def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>;
+ def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Arithmetic - Binary Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-19} = fmt;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_bin_pred_log<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
+}
+
+// Special case for divides which are not defined for 8b/16b elements.
+multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm> {
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
+ asm, "\t$Zdn, $Pg/m, $Zm, $Za",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Za;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b11;
+ let Inst{13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Za;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_mladdsub_vvv_pred<bits<1> opc, string asm> {
+ def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>;
+ def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>;
+ def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>;
+}
+
+class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zda, $Pg/m, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zda;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b01;
+ let Inst{13} = opc;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm> {
+ def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
+ def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
+ def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Group
+//===----------------------------------------------------------------------===//
+
+class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm,
+ "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-23} = 0b010001001;
+ let Inst{22} = sz;
+ let Inst{21} = 0;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0;
+ let Inst{10} = U;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_intx_dot<bit opc, string asm> {
+ def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
+ def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Group - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2,
+ ZPRRegOp zprty3, Operand itype>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
+ asm, "\t$Zda, $Zn, $Zm$iop",
+ "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-23} = 0b010001001;
+ let Inst{22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-11} = 0;
+ let Inst{10} = U;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+}
+
+multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
+ def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+ bits<2> iop;
+ bits<3> Zm;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+ bits<1> iop;
+ bits<4> Zm;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Arithmetic - Unary Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-20} = 0b01;
+ let Inst{19} = opc{0};
+ let Inst{18-16} = opc{3-1};
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> {
+ def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> {
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm> {
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> {
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> {
+ def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+}
+
+multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> {
+ def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Wide Immediate - Unpredicated Group
+//===----------------------------------------------------------------------===//
+class sve_int_dup_imm<bits<2> sz8_64, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins immtype:$imm),
+ asm, "\t$Zd, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<9> imm;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-14} = 0b11100011;
+ let Inst{13} = imm{8}; // sh
+ let Inst{12-5} = imm{7-0}; // imm8
+ let Inst{4-0} = Zd;
+
+ let isReMaterializable = 1;
+}
+
+multiclass sve_int_dup_imm<string asm> {
+ def _B : sve_int_dup_imm<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8>;
+ def _H : sve_int_dup_imm<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16>;
+ def _S : sve_int_dup_imm<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32>;
+ def _D : sve_int_dup_imm<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64>;
+
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, cpy_imm8_opt_lsl_i8:$imm), 1>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, cpy_imm8_opt_lsl_i16:$imm), 1>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, cpy_imm8_opt_lsl_i32:$imm), 1>;
+ def : InstAlias<"mov $Zd, $imm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;
+
+ def : InstAlias<"fmov $Zd, #0.0",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
+ def : InstAlias<"fmov $Zd, #0.0",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
+ def : InstAlias<"fmov $Zd, #0.0",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
+}
+
+class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins fpimmtype:$imm8),
+ asm, "\t$Zd, $imm8",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<8> imm8;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-14} = 0b11100111;
+ let Inst{13} = 0b0;
+ let Inst{12-5} = imm8;
+ let Inst{4-0} = Zd;
+
+ let isReMaterializable = 1;
+}
+
+multiclass sve_int_dup_fpimm<string asm> {
+ def _H : sve_int_dup_fpimm<0b01, fpimm16, asm, ZPR16>;
+ def _S : sve_int_dup_fpimm<0b10, fpimm32, asm, ZPR32>;
+ def _D : sve_int_dup_fpimm<0b11, fpimm64, asm, ZPR64>;
+
+ def : InstAlias<"fmov $Zd, $imm8",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, fpimm16:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $imm8",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, fpimm32:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $imm8",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, fpimm64:$imm8), 1>;
+}
+
+class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
+ asm, "\t$Zdn, $_Zdn, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<9> imm;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-19} = 0b100;
+ let Inst{18-16} = opc;
+ let Inst{15-14} = 0b11;
+ let Inst{13} = imm{8}; // sh
+ let Inst{12-5} = imm{7-0}; // imm8
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_arith_imm0<bits<3> opc, string asm> {
+ def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
+ def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
+ def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
+ def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
+}
+
+class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
+ asm, "\t$Zdn, $_Zdn, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<8> imm;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-16} = opc;
+ let Inst{15-13} = 0b110;
+ let Inst{12-5} = imm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_arith_imm1<bits<2> opc, string asm, Operand immtype> {
+ def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, immtype>;
+ def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, immtype>;
+ def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, immtype>;
+ def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, immtype>;
+}
+
+multiclass sve_int_arith_imm2<string asm> {
+ def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>;
+ def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>;
+ def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
+ def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Bitwise Logical - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_log<bits<2> opc, string asm>
+: I<(outs ZPR64:$Zd), (ins ZPR64:$Zn, ZPR64:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{1-0};
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b001100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Wide Immediate - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPRAny:$Pg, fpimmtype:$imm8),
+ asm, "\t$Zd, $Pg/m, $imm8",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pg;
+ bits<5> Zd;
+ bits<8> imm8;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz;
+ let Inst{21-20} = 0b01;
+ let Inst{19-16} = Pg;
+ let Inst{15-13} = 0b110;
+ let Inst{12-5} = imm8;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_dup_fpimm_pred<string asm> {
+ def _H : sve_int_dup_fpimm_pred<0b01, fpimm16, asm, ZPR16>;
+ def _S : sve_int_dup_fpimm_pred<0b10, fpimm32, asm, ZPR32>;
+ def _D : sve_int_dup_fpimm_pred<0b11, fpimm64, asm, ZPR64>;
+
+ def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, fpimm16:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
+ def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
+}
+
+class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
+ ZPRRegOp zprty, string pred_qual, dag iops>
+: I<(outs zprty:$Zd), iops,
+ asm, "\t$Zd, $Pg"#pred_qual#", $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<4> Pg;
+ bits<9> imm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-20} = 0b01;
+ let Inst{19-16} = Pg;
+ let Inst{15} = 0b0;
+ let Inst{14} = m;
+ let Inst{13} = imm{8}; // sh
+ let Inst{12-5} = imm{7-0}; // imm8
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_dup_imm_pred_merge<string asm> {
+ let Constraints = "$Zd = $_Zd" in {
+ def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
+ def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
+ def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
+ def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
+ }
+
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $imm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+
+ def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
+ def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
+ def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
+}
+
+multiclass sve_int_dup_imm_pred_zero<string asm> {
+ def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
+ def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
+ def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
+ def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
+
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
+ def : InstAlias<"mov $Zd, $Pg/z, $imm",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Compare - Vectors Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
+ PPRRegOp pprty, ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty1:$Zn, zprty2:$Zm),
+ asm, "\t$Pd, $Pg/z, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00100100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = opc{2};
+ let Inst{14} = cmp_1;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve_int_cmp_0<bits<3> opc, string asm> {
+ def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
+ def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
+ def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
+ def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
+}
+
+multiclass sve_int_cmp_0_wide<bits<3> opc, string asm> {
+ def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
+ def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
+ def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
+}
+
+multiclass sve_int_cmp_1_wide<bits<3> opc, string asm> {
+ def _B : sve_int_cmp<0b1, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
+ def _H : sve_int_cmp<0b1, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
+ def _S : sve_int_cmp<0b1, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Compare - Signed Immediate Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty,
+ Operand immtype>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm5),
+ asm, "\t$Pd, $Pg/z, $Zn, $imm5",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> imm5;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = imm5;
+ let Inst{15} = opc{2};
+ let Inst{14} = 0b0;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve_int_scmp_vi<bits<3> opc, string asm> {
+ def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
+ def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
+ def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
+ def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Compare - Unsigned Immediate Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm7),
+ asm, "\t$Pd, $Pg/z, $Zn, $imm7",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<7> imm7;
+ let Inst{31-24} = 0b00100100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 1;
+ let Inst{20-14} = imm7;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve_int_ucmp_vi<bits<2> opc, string asm> {
+ def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
+ def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
+ def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
+ def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Fast Reduction Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty, RegisterClass dstRegClass>
+: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Vd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zn;
+ bits<5> Vd;
+ bits<3> Pg;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b000;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Vd;
+}
+
+multiclass sve_fp_fast_red<bits<3> opc, string asm> {
+ def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>;
+ def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>;
+ def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Accumulating Reduction Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty, RegisterClass dstRegClass>
+: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm),
+ asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
+ "",
+ []>,
+ Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b011;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Vdn;
+
+ let Constraints = "$Vdn = $_Vdn";
+}
+
+multiclass sve_fp_2op_p_vd<bits<3> opc, string asm> {
+ def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>;
+ def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>;
+ def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Compare - Vectors Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_3op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Pd, $Pg/z, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = opc{2};
+ let Inst{14} = 0b1;
+ let Inst{13} = opc{1};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+}
+
+multiclass sve_fp_3op_p_pd<bits<3> opc, string asm> {
+ def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
+ def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
+ def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Compare - with Zero Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_2op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
+ ZPRRegOp zprty>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Pd, $Pg/z, $Zn, #0.0",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01100101;
+ let Inst{23-22} = sz;
+ let Inst{21-18} = 0b0100;
+ let Inst{17-16} = opc{2-1};
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc{0};
+ let Inst{3-0} = Pd;
+}
+
+multiclass sve_fp_2op_p_pd<bits<3> opc, string asm> {
+ def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
+ def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
+ def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Index Generation Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ Operand imm_ty>
+: I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
+ asm, "\t$Zd, $imm5, $imm5b",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> imm5;
+ bits<5> imm5b;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = imm5b;
+ let Inst{15-10} = 0b010000;
+ let Inst{9-5} = imm5;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_ii<string asm> {
+ def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>;
+ def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>;
+ def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
+ def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
+}
+
+class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType, Operand imm_ty>
+: I<(outs zprty:$Zd), (ins imm_ty:$imm5, srcRegType:$Rm),
+ asm, "\t$Zd, $imm5, $Rm",
+ "", []>, Sched<[]> {
+ bits<5> Rm;
+ bits<5> Zd;
+ bits<5> imm5;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b010010;
+ let Inst{9-5} = imm5;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_ir<string asm> {
+ def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>;
+ def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>;
+ def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
+ def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
+}
+
+class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType, Operand imm_ty>
+: I<(outs zprty:$Zd), (ins srcRegType:$Rn, imm_ty:$imm5),
+ asm, "\t$Zd, $Rn, $imm5",
+ "", []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zd;
+ bits<5> imm5;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = imm5;
+ let Inst{15-10} = 0b010001;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_ri<string asm> {
+ def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>;
+ def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>;
+ def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
+ def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
+}
+
+class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zd), (ins srcRegType:$Rn, srcRegType:$Rm),
+ asm, "\t$Zd, $Rn, $Rm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b010011;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_index_rr<string asm> {
+ def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
+ def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
+ def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
+ def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
+}
+//===----------------------------------------------------------------------===//
+// SVE Bitwise Shift - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<3> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<6> imm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = tsz8_64{3-2};
+ let Inst{21-19} = 0b000;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-8} = tsz8_64{1-0};
+ let Inst{7-5} = imm{2-0}; // imm3
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_bin_pred_shift_imm_left<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+ let Inst{8} = imm{3};
+ }
+ def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ let Inst{9-8} = imm{4-3};
+ }
+ def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ let Inst{22} = imm{5};
+ let Inst{9-8} = imm{4-3};
+ }
+}
+
+multiclass sve_int_bin_pred_shift_imm_right<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+ let Inst{8} = imm{3};
+ }
+ def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ let Inst{9-8} = imm{4-3};
+ }
+ def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ let Inst{22} = imm{5};
+ let Inst{9-8} = imm{4-3};
+ }
+}
+
+class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
+ string asm, ZPRRegOp zprty, ZPRRegOp zprty2>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-20} = 0b01;
+ let Inst{19} = wide;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_bin_pred_shift<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>;
+ def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>;
+ def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>;
+ def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>;
+}
+
+multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm> {
+ def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
+ def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;
+ def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Shift - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_shift_wide<bits<2> sz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, ZPR64:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-12} = 0b1000;
+ let Inst{11-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
+ def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
+ def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
+}
+
+class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<6> imm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = tsz8_64{3-2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-12} = 0b1001;
+ let Inst{11-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
+ def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
+ def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
+//===----------------------------------------------------------------------===//
+// SVE Memory - Store Group
+//===----------------------------------------------------------------------===//
+
+class sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
+ RegisterOperand VecList>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = esz;
+ let Inst{20} = 0;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
+ RegisterOperand listty, ZPRRegOp zprty>
+{
+ def NAME : sve_mem_cst_si<msz, esz, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
+ asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20} = 1;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype> {
+ def NAME : sve_mem_est_si<sz, nregs, VecList, asm, immtype>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_est_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, RegisterOperand gprty>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+class sve_mem_cst_ss_base<bits<4> dtype, string asm,
+ RegisterOperand listty, RegisterOperand gprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-21} = dtype;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b010;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cst_ss<bits<4> dtype, string asm,
+ RegisterOperand listty, ZPRRegOp zprty,
+ RegisterOperand gprty> {
+ def NAME : sve_mem_cst_ss_base<dtype, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand VecList>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22-20} = 0b001;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty> {
+ def NAME : sve_mem_cstnt_si<msz, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_cstnt_ss_base<bits<2> msz, string asm, RegisterOperand listty,
+ RegisterOperand gprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def NAME : sve_mem_cstnt_ss_base<msz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
+ RegisterOperand VecList, RegisterOperand zprext>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-22} = opc;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b1;
+ let Inst{14} = xs;
+ let Inst{13} = 0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_sst_sv_32_scaled<bits<3> opc, string asm,
+ RegisterOperand listty,
+ ZPRRegOp zprty,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd > {
+ def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, listty, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, listty, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_sst_sv_32_unscaled<bits<3> opc, string asm,
+ RegisterOperand listty,
+ ZPRRegOp zprty,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, listty, uxtw_opnd>;
+ def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, listty, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
+ RegisterOperand zprext>
+: I<(outs), (ins Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = msz;
+ let Inst{22} = 0b0;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
+ RegisterOperand zprext> {
+ def "" : sve_mem_sst_sv2<msz, 1, asm, zprext>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+
+}
+
+multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm> {
+ def "" : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+}
+
+class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
+ RegisterOperand VecList, Operand imm_ty>
+: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5),
+ asm, "\t$Zt, $Pg, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> imm5;
+ bits<5> Zn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-23} = opc{2-1};
+ let Inst{22} = 0b1;
+ let Inst{21} = opc{0};
+ let Inst{20-16} = imm5;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_sst_vi_ptrs<bits<3> opc, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, Operand imm_ty> {
+ def _IMM : sve_mem_sst_vi<opc, asm, zprty, listty, imm_ty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
+ (!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>;
+}
+
+class sve_mem_z_spill<string asm>
+: I<(outs), (ins ZPRAny:$Zt, GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Zt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1110010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b010;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_z_spill<string asm> {
+ def NAME : sve_mem_z_spill<asm>;
+
+ def : InstAlias<asm # "\t$Zt, [$Rn]",
+ (!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_p_spill<string asm>
+: I<(outs), (ins PPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Pt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pt;
+ bits<5> Rn;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1110010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pt;
+
+ let mayStore = 1;
+}
+
+multiclass sve_mem_p_spill<string asm> {
+ def NAME : sve_mem_p_spill<asm>;
+
+ def : InstAlias<asm # "\t$Pt, [$Rn]",
+ (!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
+}
+
+//===----------------------------------------------------------------------===//
// SVE Permute - Predicates Group
//===----------------------------------------------------------------------===//
@@ -100,4 +2866,1254 @@ multiclass sve_int_perm_bin_perm_pp<bits<3> opc, string asm> {
def _H : sve_int_perm_bin_perm_pp<opc, 0b01, asm, PPR16>;
def _S : sve_int_perm_bin_perm_pp<opc, 0b10, asm, PPR32>;
def _D : sve_int_perm_bin_perm_pp<opc, 0b11, asm, PPR64>;
-}
\ No newline at end of file
+}
+
+class sve_int_perm_punpk<bit opc, string asm>
+: I<(outs PPR16:$Pd), (ins PPR8:$Pn),
+ asm, "\t$Pd, $Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pn;
+ let Inst{31-17} = 0b000001010011000;
+ let Inst{16} = opc;
+ let Inst{15-9} = 0b0100000;
+ let Inst{8-5} = Pn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pd;
+}
+
+class sve_int_rdffr_pred<bit s, string asm>
+: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
+ asm, "\t$Pd, $Pg/z",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<4> Pg;
+ let Inst{31-23} = 0b001001010;
+ let Inst{22} = s;
+ let Inst{21-9} = 0b0110001111000;
+ let Inst{8-5} = Pg;
+ let Inst{4} = 0;
+ let Inst{3-0} = Pd;
+
+ let Defs = !if(!eq (s, 1), [NZCV], []);
+ let Uses = [FFR];
+}
+
+class sve_int_rdffr_unpred<string asm> : I<
+ (outs PPR8:$Pd), (ins),
+ asm, "\t$Pd",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ let Inst{31-4} = 0b0010010100011001111100000000;
+ let Inst{3-0} = Pd;
+
+ let Uses = [FFR];
+}
+
+class sve_int_wrffr<string asm>
+: I<(outs), (ins PPR8:$Pn),
+ asm, "\t$Pn",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pn;
+ let Inst{31-9} = 0b00100101001010001001000;
+ let Inst{8-5} = Pn;
+ let Inst{4-0} = 0b00000;
+
+ let hasSideEffects = 1;
+ let Defs = [FFR];
+}
+
+class sve_int_setffr<string asm>
+: I<(outs), (ins),
+ asm, "",
+ "",
+ []>, Sched<[]> {
+ let Inst{31-0} = 0b00100101001011001001000000000000;
+
+ let hasSideEffects = 1;
+ let Defs = [FFR];
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Permute Vector - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_clast_rz<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass rt>
+: I<(outs rt:$Rdn), (ins PPR3bAny:$Pg, rt:$_Rdn, zprty:$Zm),
+ asm, "\t$Rdn, $Pg, $_Rdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b11000;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Rdn;
+
+ let Constraints = "$Rdn = $_Rdn";
+}
+
+multiclass sve_int_perm_clast_rz<bit ab, string asm> {
+ def _B : sve_int_perm_clast_rz<0b00, ab, asm, ZPR8, GPR32>;
+ def _H : sve_int_perm_clast_rz<0b01, ab, asm, ZPR16, GPR32>;
+ def _S : sve_int_perm_clast_rz<0b10, ab, asm, ZPR32, GPR32>;
+ def _D : sve_int_perm_clast_rz<0b11, ab, asm, ZPR64, GPR64>;
+}
+
+class sve_int_perm_clast_vz<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass rt>
+: I<(outs rt:$Vdn), (ins PPR3bAny:$Pg, rt:$_Vdn, zprty:$Zm),
+ asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10101;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Vdn;
+
+ let Constraints = "$Vdn = $_Vdn";
+}
+
+multiclass sve_int_perm_clast_vz<bit ab, string asm> {
+ def _B : sve_int_perm_clast_vz<0b00, ab, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_clast_vz<0b01, ab, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
+}
+
+class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10100;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_clast_zz<bit ab, string asm> {
+ def _B : sve_int_perm_clast_zz<0b00, ab, asm, ZPR8>;
+ def _H : sve_int_perm_clast_zz<0b01, ab, asm, ZPR16>;
+ def _S : sve_int_perm_clast_zz<0b10, ab, asm, ZPR32>;
+ def _D : sve_int_perm_clast_zz<0b11, ab, asm, ZPR64>;
+}
+
+class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass resultRegType>
+: I<(outs resultRegType:$Rd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Rd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10000;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass sve_int_perm_last_r<bit ab, string asm> {
+ def _B : sve_int_perm_last_r<0b00, ab, asm, ZPR8, GPR32>;
+ def _H : sve_int_perm_last_r<0b01, ab, asm, ZPR16, GPR32>;
+ def _S : sve_int_perm_last_r<0b10, ab, asm, ZPR32, GPR32>;
+ def _D : sve_int_perm_last_r<0b11, ab, asm, ZPR64, GPR64>;
+}
+
+class sve_int_perm_last_v<bits<2> sz8_64, bit ab, string asm,
+ ZPRRegOp zprty, RegisterClass dstRegtype>
+: I<(outs dstRegtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Vd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-17} = 0b10001;
+ let Inst{16} = ab;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Vd;
+}
+
+multiclass sve_int_perm_last_v<bit ab, string asm> {
+ def _B : sve_int_perm_last_v<0b00, ab, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_last_v<0b01, ab, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>;
+}
+
+class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-13} = 0b101100100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve_int_perm_splice<string asm> {
+ def _B : sve_int_perm_splice<0b00, asm, ZPR8>;
+ def _H : sve_int_perm_splice<0b01, asm, ZPR16>;
+ def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
+ def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
+}
+
+class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-18} = 0b1001;
+ let Inst{17-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_perm_rev_rbit<string asm> {
+ def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>;
+ def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>;
+ def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>;
+ def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>;
+}
+
+multiclass sve_int_perm_rev_revb<string asm> {
+ def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>;
+ def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>;
+ def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>;
+}
+
+multiclass sve_int_perm_rev_revh<string asm> {
+ def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>;
+ def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>;
+}
+
+multiclass sve_int_perm_rev_revw<string asm> {
+ def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>;
+}
+
+class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegType>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegType:$Rn),
+ asm, "\t$Zd, $Pg/m, $Rn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-13} = 0b101000101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_perm_cpy_r<string asm> {
+ def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
+ def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
+ def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
+ def _D : sve_int_perm_cpy_r<0b11, asm, ZPR64, GPR64sp>;
+
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Rn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
+}
+
+class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
+ RegisterClass srcRegtype>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegtype:$Vn),
+ asm, "\t$Zd, $Pg/m, $Vn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-13} = 0b100000100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve_int_perm_cpy_v<string asm> {
+ def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
+ def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
+ def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
+ def _D : sve_int_perm_cpy_v<0b11, asm, ZPR64, FPR64>;
+
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, FPR8:$Vn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, FPR16:$Vn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
+ def : InstAlias<"mov $Zd, $Pg/m, $Vn",
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
+}
+
+class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Zd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-23} = 0b000001011;
+ let Inst{22} = sz;
+ let Inst{21-13} = 0b100001100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_compact<string asm> {
+ def _S : sve_int_perm_compact<0b0, asm, ZPR32>;
+ def _D : sve_int_perm_compact<0b1, asm, ZPR64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - Contiguous Load Group
+//===----------------------------------------------------------------------===//
+
+class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
+ RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-21} = dtype;
+ let Inst{20} = nf;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Uses = !if(!eq(nf, 1), [FFR], []);
+ let Defs = !if(!eq(nf, 1), [FFR], []);
+}
+
+multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
+ RegisterOperand listty, ZPRRegOp zprty> {
+ def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty>
+: sve_mem_cld_si_base<dtype, 0, asm, listty, zprty>;
+
+class sve_mem_cldnt_si_base<bits<2> msz, string asm, RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = msz;
+ let Inst{22-20} = 0b000;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_cldnt_si<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty> {
+ def NAME : sve_mem_cldnt_si_base<msz, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_cldnt_ss_base<bits<2> msz, string asm, RegisterOperand VecList,
+ RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b110;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_cldnt_ss<bits<2> msz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def NAME : sve_mem_cldnt_ss_base<msz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-20} = 0;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty> {
+ def NAME : sve_mem_ldqr_si<sz, asm, listty>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4), 0>;
+}
+
+class sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand VecList,
+ RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def NAME : sve_mem_ldqr_ss<sz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+class sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
+ RegisterOperand VecList, Operand immtype>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm6]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<6> imm6;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = dtypeh;
+ let Inst{22} = 1;
+ let Inst{21-16} = imm6;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = dtypel;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
+ RegisterOperand zlistty, ZPRRegOp zprty, Operand immtype> {
+ def NAME : sve_mem_ld_dup<dtypeh, dtypel, asm, zlistty, immtype>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm6]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zlistty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm,
+ RegisterOperand VecList>
+: I<(outs VecList:$Zt), iops,
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-21} = dtype;
+ let Inst{20-16} = Rm;
+ let Inst{15-14} = 0b01;
+ let Inst{13} = ff;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Uses = !if(!eq(ff, 1), [FFR], []);
+ let Defs = !if(!eq(ff, 1), [FFR], []);
+}
+
+multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def "" : sve_mem_cld_ss_base<dtype, 0, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+}
+
+multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty> {
+ def _REAL : sve_mem_cld_ss_base<dtype, 1, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 1>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
+}
+
+multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty>
+: sve_mem_cld_si_base<dtype, 1, asm, listty, zprty>;
+
+class sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20} = 0;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, Operand immtype> {
+ def NAME : sve_mem_eld_si<sz, nregs, VecList, asm, immtype>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_eld_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
+ string asm, RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = nregs;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b110;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - 32-bit Gather and Unsized Contiguous Group
+//===----------------------------------------------------------------------===//
+
+// bit xs is '1' if offsets are signed
+// bit scaled is '1' if the offsets are scaled
+class sve_mem_32b_gld_sv<bits<4> opc, bit xs, bit scaled, string asm,
+ RegisterOperand zprext>
+: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22} = xs;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 0, 1, asm, uxtw_opnd>;
+ def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 1, 1, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_REAL : sve_mem_32b_gld_sv<opc, 0, 0, asm, uxtw_opnd>;
+ def _SXTW_REAL : sve_mem_32b_gld_sv<opc, 1, 0, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+
+class sve_mem_32b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
+: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
+ asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zt;
+ bits<5> imm5;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22-21} = 0b01;
+ let Inst{20-16} = imm5;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty> {
+ def _IMM_REAL : sve_mem_32b_gld_vi<opc, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+}
+
+class sve_mem_prfm_si<bits<2> msz, string asm>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, simm6s1:$imm6),
+ asm, "\t$prfop, $Pg, [$Rn, $imm6, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<6> imm6;
+ bits<4> prfop;
+ let Inst{31-22} = 0b1000010111;
+ let Inst{21-16} = imm6;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = msz;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_prfm_si<bits<2> msz, string asm> {
+ def NAME : sve_mem_prfm_si<msz, asm>;
+
+ def : InstAlias<asm # "\t$prfop, $Pg, [$Rn]",
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_prfm_ss<bits<3> opc, string asm, RegisterOperand gprty>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$prfop, $Pg, [$Rn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rm;
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<4> prfop;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = opc{2-1};
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0b1;
+ let Inst{14} = opc{0};
+ let Inst{13} = 0b0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
+ RegisterOperand zprext>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$prfop, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<4> prfop;
+ let Inst{31-23} = 0b100001000;
+ let Inst{22} = xs;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-13} = msz;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
+}
+
+class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
+ asm, "\t$prfop, $Pg, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> imm5;
+ bits<4> prfop;
+ let Inst{31-25} = 0b1000010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = imm5;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+}
+
+multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+ def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+}
+
+class sve_mem_z_fill<string asm>
+: I<(outs ZPRAny:$Zt), (ins GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Zt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Rn;
+ bits<5> Zt;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1000010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b010;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_z_fill<string asm> {
+ def NAME : sve_mem_z_fill<asm>;
+
+ def : InstAlias<asm # "\t$Zt, [$Rn]",
+ (!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
+}
+
+class sve_mem_p_fill<string asm>
+: I<(outs PPRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9),
+ asm, "\t$Pt, [$Rn, $imm9, mul vl]",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pt;
+ bits<5> Rn;
+ bits<9> imm9;
+ let Inst{31-22} = 0b1000010110;
+ let Inst{21-16} = imm9{8-3};
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = imm9{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Pt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_p_fill<string asm> {
+ def NAME : sve_mem_p_fill<asm>;
+
+ def : InstAlias<asm # "\t$Pt, [$Rn]",
+ (!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - 64-bit Gather Group
+//===----------------------------------------------------------------------===//
+
+// bit xs is '1' if offsets are signed
+// bit scaled is '1' if the offsets are scaled
+// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
+class sve_mem_64b_gld_sv<bits<4> opc, bit xs, bit scaled, bit lsl, string asm,
+ RegisterOperand zprext>
+: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1100010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22} = xs;
+ let Inst{21} = scaled;
+ let Inst{20-16} = Zm;
+ let Inst{15} = lsl;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 0, 1, 0, asm, uxtw_opnd>;
+ def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 0, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_REAL : sve_mem_64b_gld_sv<opc, 0, 0, 0, asm, uxtw_opnd>;
+ def _SXTW_REAL : sve_mem_64b_gld_sv<opc, 1, 0, 0, asm, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+}
+
+multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
+ RegisterOperand zprext> {
+ def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+}
+
+multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm> {
+ def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+}
+
+class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
+: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
+ asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zt;
+ bits<5> imm5;
+ let Inst{31-25} = 0b1100010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22-21} = 0b01;
+ let Inst{20-16} = imm5;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+ let Defs = !if(!eq(opc{0}, 1), [FFR], []);
+ let Uses = !if(!eq(opc{0}, 1), [FFR], []);
+}
+
+multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty> {
+ def _IMM_REAL : sve_mem_64b_gld_vi<opc, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
+ (!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+}
+
+// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
+class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
+ RegisterOperand zprext>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
+ asm, "\t$prfop, $Pg, [$Rn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Zm;
+ bits<4> prfop;
+ let Inst{31-23} = 0b110001000;
+ let Inst{22} = xs;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15} = lsl;
+ let Inst{14-13} = msz;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd> {
+ def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
+}
+
+multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
+ RegisterOperand zprext> {
+ def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
+}
+
+
+class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
+: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
+ asm, "\t$prfop, $Pg, [$Zn, $imm5]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> imm5;
+ bits<4> prfop;
+ let Inst{31-25} = 0b1100010;
+ let Inst{24-23} = msz;
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = imm5;
+ let Inst{15-13} = 0b111;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = prfop;
+
+ let hasSideEffects = 1;
+}
+
+multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+ def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
+
+ def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Compute Vector Address Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_misc_0_a<bits<2> opc, bits<2> msz, string asm,
+ ZPRRegOp zprty, RegisterOperand zprext>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprext:$Zm),
+ asm, "\t$Zd, [$Zn, $Zm]",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-12} = 0b1010;
+ let Inst{11-10} = msz;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_uxtw<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtUXTW8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtUXTW16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtUXTW32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtUXTW64>;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_sxtw<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtSXTW8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtSXTW16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtSXTW32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtSXTW64>;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_32_lsl<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR32, ZPR32ExtLSL8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR32, ZPR32ExtLSL16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR32, ZPR32ExtLSL32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR32, ZPR32ExtLSL64>;
+}
+
+multiclass sve_int_bin_cons_misc_0_a_64_lsl<bits<2> opc, string asm> {
+ def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtLSL8>;
+ def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtLSL16>;
+ def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtLSL32>;
+ def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtLSL64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Misc - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_misc_0_b<bits<2> sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b101100;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_misc_0_b<string asm> {
+ def _H : sve_int_bin_cons_misc_0_b<0b01, asm, ZPR16>;
+ def _S : sve_int_bin_cons_misc_0_b<0b10, asm, ZPR32>;
+ def _D : sve_int_bin_cons_misc_0_b<0b11, asm, ZPR64>;
+}
+
+class sve_int_bin_cons_misc_0_c<bits<8> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn),
+ asm, "\t$Zd, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{7-6};
+ let Inst{21} = 0b1;
+ let Inst{20-16} = opc{5-1};
+ let Inst{15-11} = 0b10111;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Reduction Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
+ ZPRRegOp zprty, RegisterClass regtype>
+: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Vd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Vd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_32;
+ let Inst{21} = 0b0;
+ let Inst{20-19} = fmt;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Vd;
+}
+
+multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
+ def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
+ def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
+}
+
+multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
+ def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
+ def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
+ def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>;
+}
+
+multiclass sve_int_reduce_1<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>;
+ def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>;
+ def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>;
+ def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>;
+}
+
+multiclass sve_int_reduce_2<bits<3> opc, string asm> {
+ def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>;
+ def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>;
+ def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>;
+ def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index e65ba1f2401d..23cc21ce2e7c 100644
--- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -53,6 +53,14 @@ namespace llvm {
#include "AArch64GenSystemOperands.inc"
}
}
+
+namespace llvm {
+ namespace AArch64TSB {
+#define GET_TSB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
namespace llvm {
namespace AArch64PRFM {
#define GET_PRFM_IMPL
@@ -61,6 +69,27 @@ namespace llvm {
}
namespace llvm {
+ namespace AArch64SVEPRFM {
+#define GET_SVEPRFM_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64SVEPredPattern {
+#define GET_SVEPREDPAT_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
+ namespace AArch64ExactFPImm {
+#define GET_EXACTFPIMM_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
namespace AArch64PState {
#define GET_PSTATE_IMPL
#include "AArch64GenSystemOperands.inc"
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index c1c799b7b349..2874c4ab42ea 100644
--- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -285,6 +285,8 @@ struct SysAlias {
struct SysAliasReg : SysAlias {
bool NeedsReg;
SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
+ SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) : SysAlias(N, E, F),
+ NeedsReg(R) {};
};
namespace AArch64AT{
@@ -327,6 +329,14 @@ namespace AArch64ISB {
#include "AArch64GenSystemOperands.inc"
}
+namespace AArch64TSB {
+ struct TSB : SysAlias {
+ using SysAlias::SysAlias;
+ };
+ #define GET_TSB_DECL
+ #include "AArch64GenSystemOperands.inc"
+}
+
namespace AArch64PRFM {
struct PRFM : SysAlias {
using SysAlias::SysAlias;
@@ -335,6 +345,33 @@ namespace AArch64PRFM {
#include "AArch64GenSystemOperands.inc"
}
+namespace AArch64SVEPRFM {
+ struct SVEPRFM : SysAlias {
+ using SysAlias::SysAlias;
+ };
+#define GET_SVEPRFM_DECL
+#include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64SVEPredPattern {
+ struct SVEPREDPAT {
+ const char *Name;
+ uint16_t Encoding;
+ };
+#define GET_SVEPREDPAT_DECL
+#include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64ExactFPImm {
+ struct ExactFPImm {
+ const char *Name;
+ int Enum;
+ const char *Repr;
+ };
+#define GET_EXACTFPIMM_DECL
+#include "AArch64GenSystemOperands.inc"
+}
+
namespace AArch64PState {
struct PState : SysAlias{
using SysAlias::SysAlias;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0ddc43ad5033..796766d94622 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -11,7 +11,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -50,9 +49,9 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIDebuggerInsertNopsPass();
-FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
@@ -74,6 +73,14 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
+FunctionPass *createAMDGPULowerKernelArgumentsPass();
+void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPULowerKernelArgumentsID;
+
+ModulePass *createAMDGPULowerKernelAttributesPass();
+void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
+extern char &AMDGPULowerKernelAttributesID;
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -134,6 +141,9 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
+void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
+extern char &AMDGPUPerfHintAnalysisID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -144,7 +154,7 @@ FunctionPass *createAMDGPUISelDag(
TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default);
ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
-ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+ModulePass *createR600OpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
ModulePass* createAMDGPUUnifyMetadataPass();
@@ -169,12 +179,12 @@ extern char &SIMemoryLegalizerID;
void initializeSIDebuggerInsertNopsPass(PassRegistry&);
extern char &SIDebuggerInsertNopsID;
-void initializeSIInsertWaitsPass(PassRegistry&);
-extern char &SIInsertWaitsID;
-
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
+void initializeSIFormMemoryClausesPass(PassRegistry&);
+extern char &SIFormMemoryClausesID;
+
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;
@@ -222,8 +232,11 @@ struct AMDGPUAS {
MAX_COMMON_ADDRESS = 5,
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+ CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
LOCAL_ADDRESS = 3, ///< Address space for local memory.
+
+ CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
+
 /// Address space for direct addressable parameter memory (CONST0)
PARAM_D_ADDRESS = 6,
 /// Address space for indirect addressable parameter memory (VTX1)
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
index c02d0a131041..16c2a366db28 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -7,58 +7,30 @@
//
//===------------------------------------------------------------===//
+include "llvm/TableGen/SearchableTable.td"
include "llvm/Target/Target.td"
+include "AMDGPUFeatures.td"
//===------------------------------------------------------------===//
// Subtarget Features (device properties)
//===------------------------------------------------------------===//
-def FeatureFP64 : SubtargetFeature<"fp64",
- "FP64",
- "true",
- "Enable double precision operations"
->;
-
-def FeatureFMA : SubtargetFeature<"fmaf",
- "FMA",
- "true",
- "Enable single precision FMA (not as fast as mul+add, but fused)"
->;
-
def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
"FastFMAF32",
"true",
"Assuming f32 fma is at least as fast as mul + add"
>;
-def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
- "HalfRate64Ops",
- "true",
- "Most fp64 instructions are half rate instead of quarter"
->;
-
-def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
- "R600ALUInst",
- "false",
- "Older version of ALU instructions encoding"
->;
-
-def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
- "HasVertexCache",
+def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128",
+ "MIMG_R128",
"true",
- "Specify use of dedicated vertex cache"
+ "Support 128-bit texture resources"
>;
-def FeatureCaymanISA : SubtargetFeature<"caymanISA",
- "CaymanISA",
- "true",
- "Use Cayman ISA"
->;
-
-def FeatureCFALUBug : SubtargetFeature<"cfalubug",
- "CFALUBug",
+def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
+ "HalfRate64Ops",
"true",
- "GPU has CF_ALU bug"
+ "Most fp64 instructions are half rate instead of quarter"
>;
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
@@ -121,6 +93,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
"Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
>;
+def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
+ "HasFmaMixInsts",
+ "true",
+ "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -140,27 +118,6 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
-class SubtargetFeatureFetchLimit <string Value> :
- SubtargetFeature <"fetch"#Value,
- "TexVTXClauseSize",
- Value,
- "Limit the maximum number of fetches in a clause to "#Value
->;
-
-def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
-def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
-
-class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
- "wavefrontsize"#Value,
- "WavefrontSize",
- !cast<string>(Value),
- "The number of threads per wavefront"
->;
-
-def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
-def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
-def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
-
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -171,19 +128,6 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
-class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
- "localmemorysize"#Value,
- "LocalMemorySize",
- !cast<string>(Value),
- "The size of local memory in bytes"
->;
-
-def FeatureGCN : SubtargetFeature<"gcn",
- "IsGCN",
- "true",
- "GCN or newer GPU"
->;
-
def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
"GCN3Encoding",
"true",
@@ -244,6 +188,12 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores",
"Has store scalar memory instructions"
>;
+def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics",
+ "HasScalarAtomics",
+ "true",
+ "Has atomic scalar memory instructions"
+>;
+
def FeatureSDWA : SubtargetFeature<"sdwa",
"HasSDWA",
"true",
@@ -292,6 +242,27 @@ def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"Support clamp for integer destination"
>;
+def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
+ "HasUnpackedD16VMem",
+ "true",
+ "Has unpacked d16 vmem instructions"
+>;
+
+def FeatureDLInsts : SubtargetFeature<"dl-insts",
+ "HasDLInsts",
+ "true",
+ "Has deep learning instructions"
+>;
+
+def FeatureD16PreservesUnusedBits : SubtargetFeature<
+ "d16-preserves-unused-bits",
+ "D16PreservesUnusedBits",
+ "true",
+ "If present, then instructions defined by HasD16LoadStore predicate preserve "
+ "unused bits. Otherwise instructions defined by HasD16LoadStore predicate "
+ "zero unused bits."
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -329,12 +300,6 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
[FeatureFP64FP16Denormals]
>;
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
- "DX10Clamp",
- "true",
- "clamp modifier clamps NaNs to 0.0"
->;
-
def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
"FPExceptions",
"true",
@@ -377,12 +342,6 @@ def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
"Dump MachineInstrs in the CodeEmitter"
>;
-def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
- "EnablePromoteAlloca",
- "true",
- "Enable promote alloca pass"
->;
-
// XXX - This should probably be removed once enabled by default
def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
"EnableLoadStoreOpt",
@@ -408,6 +367,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
"Enable SI Machine Scheduler"
>;
+def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
+ "EnableDS128",
+ "true",
+ "Use ds_{read|write}_b128"
+>;
+
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
// all OS-es on VI and newer hardware to avoid assertion failures due
// to missing ADDR64 variants of MUBUF instructions.
@@ -440,46 +405,30 @@ def FeatureDisable : SubtargetFeature<"",
"Dummy feature to disable assembler instructions"
>;
-class SubtargetFeatureGeneration <string Value,
- list<SubtargetFeature> Implies> :
- SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
- Value#" GPU generation", Implies>;
-
-def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
-def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
-def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
-
-def FeatureR600 : SubtargetFeatureGeneration<"R600",
- [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
->;
-
-def FeatureR700 : SubtargetFeatureGeneration<"R700",
- [FeatureFetchLimit16, FeatureLocalMemorySize0]
->;
-
-def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
- [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+def FeatureGCN : SubtargetFeature<"gcn",
+ "IsGCN",
+ "true",
+ "GCN or newer GPU"
>;
-def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16, FeatureWavefrontSize64,
- FeatureLocalMemorySize32768]
->;
+class GCNSubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>;
-def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize32768,
+def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN,
FeatureLDSBankCount32, FeatureMovrel]
>;
-def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize65536,
+def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel]
>;
-def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
- [FeatureFP64, FeatureLocalMemorySize65536,
+def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
@@ -489,7 +438,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
]
>;
-def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
+def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
@@ -498,7 +447,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
- FeatureAddNoCarryInsts
+ FeatureAddNoCarryInsts, FeatureScalarAtomics
]
>;
@@ -534,7 +483,8 @@ def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
[FeatureSeaIslands,
- FeatureLDSBankCount16]>;
+ FeatureLDSBankCount16,
+ FeatureFastFMAF32]>;
def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
[FeatureSeaIslands,
@@ -544,26 +494,24 @@ def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
[FeatureSeaIslands,
FeatureLDSBankCount32]>;
-def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
- [FeatureVolcanicIslands,
- FeatureLDSBankCount32,
- FeatureSGPRInitBug]>;
-
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
[FeatureVolcanicIslands,
FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32,
- FeatureXNACK]>;
+ FeatureXNACK,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
- FeatureSGPRInitBug]>;
+ FeatureSGPRInitBug,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
[FeatureVolcanicIslands,
- FeatureLDSBankCount32]>;
+ FeatureLDSBankCount32,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
[FeatureVolcanicIslands,
@@ -573,14 +521,28 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
[FeatureGFX9,
FeatureMadMixInsts,
- FeatureLDSBankCount32
- ]>;
+ FeatureLDSBankCount32,
+ FeatureD16PreservesUnusedBits]>;
def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
[FeatureGFX9,
FeatureMadMixInsts,
- FeatureLDSBankCount32
- ]>;
+ FeatureLDSBankCount32,
+ FeatureXNACK,
+ FeatureD16PreservesUnusedBits]>;
+
+def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+ [FeatureGFX9,
+ FeatureLDSBankCount32,
+ FeatureFmaMixInsts,
+ FeatureD16PreservesUnusedBits]>;
+
+def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+ [FeatureGFX9,
+ HalfRate64Ops,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts]>;
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
@@ -593,13 +555,6 @@ def FeatureDebuggerInsertNops : SubtargetFeature<
"Insert one nop instruction for each high level source statement"
>;
-def FeatureDebuggerReserveRegs : SubtargetFeature<
- "amdgpu-debugger-reserve-regs",
- "DebuggerReserveRegs",
- "true",
- "Reserve registers for debugger usage"
->;
-
def FeatureDebuggerEmitPrologue : SubtargetFeature<
"amdgpu-debugger-emit-prologue",
"DebuggerEmitPrologue",
@@ -675,6 +630,7 @@ def AMDGPU : Target {
SDWA9AsmParserVariant,
DPPAsmParserVariant];
let AssemblyWriters = [AMDGPUAsmWriter];
+ let AllowRegisterRenaming = 1;
}
// Dummy Instruction itineraries for pseudo instructions
@@ -685,8 +641,6 @@ def NullALU : InstrItinClass;
// Predicate helper class
//===----------------------------------------------------------------------===//
-def TruePredicate : Predicate<"true">;
-
def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
@@ -715,6 +669,13 @@ def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<"FeatureGFX9Insts">;
+def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
+ AssemblerPredicate<"FeatureUnpackedD16VMem">;
+def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
+ AssemblerPredicate<"!FeatureUnpackedD16VMem">;
+
+def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
+ AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
@@ -733,6 +694,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<"FeatureVOP3P">;
+def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
+ AssemblerPredicate<"!FeatureVOP3P">;
+
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
@@ -748,38 +712,35 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
AssemblerPredicate<"FeatureMadMixInsts">;
-def EnableLateCFGStructurize : Predicate<
- "EnableLateStructurizeCFG">;
+def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
+ AssemblerPredicate<"FeatureScalarAtomics">;
-// Exists to help track down where SubtargetPredicate isn't set rather
-// than letting tablegen crash with an unhelpful error.
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
-
-class PredicateControl {
- Predicate SubtargetPredicate = InvalidPred;
- Predicate SIAssemblerPredicate = isSICI;
- Predicate VIAssemblerPredicate = isVI;
- list<Predicate> AssemblerPredicates = [];
- Predicate AssemblerPredicate = TruePredicate;
- list<Predicate> OtherPredicates = [];
- list<Predicate> Predicates = !listconcat([SubtargetPredicate,
- AssemblerPredicate],
- AssemblerPredicates,
- OtherPredicates);
-}
+def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
+def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
+def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
+ AssemblerPredicate<"FeatureVGPRIndexMode">;
+def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
+ AssemblerPredicate<"FeatureMovrel">;
+
+def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
+ AssemblerPredicate<"FeatureFmaMixInsts">;
-class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
- PredicateControl;
+def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
+ AssemblerPredicate<"FeatureDLInsts">;
+def EnableLateCFGStructurize : Predicate<
+ "EnableLateStructurizeCFG">;
+
// Include AMDGPU TD files
-include "R600Schedule.td"
-include "R600Processors.td"
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
include "AMDGPUIntrinsics.td"
+include "SIIntrinsics.td"
include "AMDGPURegisterInfo.td"
include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
+include "SIInstrInfo.td"
include "AMDGPUCallingConv.td"
+include "AMDGPUSearchableTables.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 392b011e387c..ef4b69d09d9f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -61,7 +61,7 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
/* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
};
static const AliasResult ASAliasRulesGenIsZero[6][6] = {
- /* Flat Global Constant Group Region Private */
+ /* Flat Global Region Group Constant Private */
/* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
/* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
/* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias},
@@ -72,9 +72,9 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
assert(AS.MAX_COMMON_ADDRESS <= 5);
if (AS.FLAT_ADDRESS == 0) {
assert(AS.GLOBAL_ADDRESS == 1 &&
- AS.REGION_ADDRESS == 4 &&
+ AS.REGION_ADDRESS == 2 &&
AS.LOCAL_ADDRESS == 3 &&
- AS.CONSTANT_ADDRESS == 2 &&
+ AS.CONSTANT_ADDRESS == 4 &&
AS.PRIVATE_ADDRESS == 5);
ASAliasRules = &ASAliasRulesGenIsZero;
} else {
@@ -115,7 +115,8 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
bool OrLocal) {
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
- if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+ if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
+ Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
return true;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index c27425443abc..d4bbb2c1eb8d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -14,6 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -30,13 +33,18 @@ static cl::opt<bool> StressCalls(
class AMDGPUAlwaysInline : public ModulePass {
bool GlobalOpt;
+ void recursivelyVisitUsers(GlobalValue &GV,
+ SmallPtrSetImpl<Function *> &FuncsToAlwaysInline);
public:
static char ID;
AMDGPUAlwaysInline(bool GlobalOpt = false) :
ModulePass(ID), GlobalOpt(GlobalOpt) { }
bool runOnModule(Module &M) override;
- StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
};
} // End anonymous namespace
@@ -46,15 +54,53 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
char AMDGPUAlwaysInline::ID = 0;
+void AMDGPUAlwaysInline::recursivelyVisitUsers(
+ GlobalValue &GV,
+ SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
+ SmallVector<User *, 16> Stack;
+
+ SmallPtrSet<const Value *, 8> Visited;
+
+ for (User *U : GV.users())
+ Stack.push_back(U);
+
+ while (!Stack.empty()) {
+ User *U = Stack.pop_back_val();
+ if (!Visited.insert(U).second)
+ continue;
+
+ if (Instruction *I = dyn_cast<Instruction>(U)) {
+ Function *F = I->getParent()->getParent();
+ if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+ FuncsToAlwaysInline.insert(F);
+ Stack.push_back(F);
+ }
+
+ // No need to look at further users, but we do need to inline any callers.
+ continue;
+ }
+
+ for (User *UU : U->users())
+ Stack.push_back(UU);
+ }
+}
+
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M);
+
std::vector<GlobalAlias*> AliasesToRemove;
- std::vector<Function *> FuncsToClone;
+
+ SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
+ SmallPtrSet<Function *, 8> FuncsToNoInline;
for (GlobalAlias &A : M.aliases()) {
if (Function* F = dyn_cast<Function>(A.getAliasee())) {
A.replaceAllUsesWith(F);
AliasesToRemove.push_back(&A);
}
+
+ // FIXME: If the aliasee isn't a function, it's some kind of constant expr
+ // cast that won't be inlined through.
}
if (GlobalOpt) {
@@ -63,31 +109,51 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
}
}
- auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline;
- auto IncompatAttr
- = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
-
- for (Function &F : M) {
- if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
- !F.hasFnAttribute(IncompatAttr))
- FuncsToClone.push_back(&F);
- }
-
- for (Function *F : FuncsToClone) {
- ValueToValueMapTy VMap;
- Function *NewFunc = CloneFunction(F, VMap);
- NewFunc->setLinkage(GlobalValue::InternalLinkage);
- F->replaceAllUsesWith(NewFunc);
+ // Always force inlining of any function that uses an LDS global address. This
+ // is something of a workaround because we don't have a way of supporting LDS
+ // objects defined in functions. LDS is always allocated by a kernel, and it
+ // is difficult to manage LDS usage if a function may be used by multiple
+ // kernels.
+ //
+ // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
+ // should only appear when IPO passes manage to move LDS defined in a kernel
+ // into a single user function.
+
+ for (GlobalVariable &GV : M.globals()) {
+ // TODO: Region address
+ unsigned AS = GV.getType()->getAddressSpace();
+ if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS)
+ continue;
+
+ recursivelyVisitUsers(GV, FuncsToAlwaysInline);
}
- for (Function &F : M) {
- if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) {
- F.addFnAttr(NewAttr);
+ if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
+ auto IncompatAttr
+ = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
+
+ for (Function &F : M) {
+ if (!F.isDeclaration() && !F.use_empty() &&
+ !F.hasFnAttribute(IncompatAttr)) {
+ if (StressCalls) {
+ if (!FuncsToAlwaysInline.count(&F))
+ FuncsToNoInline.insert(&F);
+ } else
+ FuncsToAlwaysInline.insert(&F);
+ }
}
}
- return false;
+
+ for (Function *F : FuncsToAlwaysInline)
+ F->addFnAttr(Attribute::AlwaysInline);
+
+ for (Function *F : FuncsToNoInline)
+ F->addFnAttr(Attribute::NoInline);
+
+ return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
}
ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
return new AMDGPUAlwaysInline(GlobalOpt);
}
+
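
Editor's note: the recursivelyVisitUsers walk added above is a plain worklist traversal of the use graph. Every instruction that (transitively) reaches the LDS global pins its enclosing non-entry function for always-inlining, and that function is pushed back on the stack so its own callers are processed too; entry functions (kernels) are deliberately left unmarked, matching the isEntryFunctionCC check. A minimal standalone sketch of the same pattern, using made-up toy types in place of LLVM's Function/User classes (illustrative only, not the pass's interface):

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Toy stand-ins: a "function" is just a name, Callers maps a function to the
// functions that call it, and EntryPoints plays the role of kernel entry
// functions, which the pass never marks for inlining.
using ToyFunc = std::string;

void collectFuncsToAlwaysInline(
    const ToyFunc &LDSUser,
    const std::map<ToyFunc, std::vector<ToyFunc>> &Callers,
    const std::set<ToyFunc> &EntryPoints,
    std::set<ToyFunc> &ToAlwaysInline) {
  std::vector<ToyFunc> Stack{LDSUser};
  std::set<ToyFunc> Visited;

  while (!Stack.empty()) {
    ToyFunc F = Stack.back();
    Stack.pop_back();
    if (!Visited.insert(F).second) // already handled
      continue;

    if (!EntryPoints.count(F)) {
      // A non-entry function that (transitively) touches the LDS object must
      // be inlined, so its callers need to be visited as well.
      ToAlwaysInline.insert(F);
      auto It = Callers.find(F);
      if (It != Callers.end())
        for (const ToyFunc &Caller : It->second)
          Stack.push_back(Caller);
    }
  }
}

int main() {
  // kernel calls helper1, helper1 calls helper2, helper2 uses the LDS global.
  std::map<ToyFunc, std::vector<ToyFunc>> Callers = {
      {"helper2", {"helper1"}}, {"helper1", {"kernel"}}};
  std::set<ToyFunc> Entries = {"kernel"};
  std::set<ToyFunc> Marked;
  collectFuncsToAlwaysInline("helper2", Callers, Entries, Marked);
  for (const auto &F : Marked)
    std::cout << F << "\n"; // prints helper1 and helper2; "kernel" stays unmarked
}
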
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index ce17202f3414..1a70833a4472 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -219,7 +219,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
}
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasFlat = ST.hasFlatAddressSpace();
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index dcca3a2fab96..7465cf22b5a4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -55,9 +55,6 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
<< " DispatchID: " << FI.second.DispatchID
<< " FlatScratchInit: " << FI.second.FlatScratchInit
<< " PrivateSegmentSize: " << FI.second.PrivateSegmentSize
- << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX
- << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY
- << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ
<< " WorkGroupIDX: " << FI.second.WorkGroupIDX
<< " WorkGroupIDY: " << FI.second.WorkGroupIDY
<< " WorkGroupIDZ: " << FI.second.WorkGroupIDZ
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index bf9635549a8c..f0e6d1b83f15 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -18,7 +18,7 @@ namespace llvm {
class Function;
class raw_ostream;
-class SISubtarget;
+class GCNSubtarget;
class TargetMachine;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -111,9 +111,6 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor DispatchID;
ArgDescriptor FlatScratchInit;
ArgDescriptor PrivateSegmentSize;
- ArgDescriptor GridWorkGroupCountX;
- ArgDescriptor GridWorkGroupCountY;
- ArgDescriptor GridWorkGroupCountZ;
// System SGPRs in kernels.
ArgDescriptor WorkGroupIDX;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index fda6252f46e3..e62e5d52ad74 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===//
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,7 +21,9 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "R600AsmPrinter.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
@@ -32,7 +34,6 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -65,7 +67,7 @@ using namespace llvm::AMDGPU;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
- const SISubtarget& ST = F.getSubtarget<SISubtarget>();
+ const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
// TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals =
@@ -88,7 +90,7 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm,
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
- createAMDGPUAsmPrinterPass);
+ llvm::createR600AsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
createAMDGPUAsmPrinterPass);
}
@@ -114,7 +116,8 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
@@ -127,10 +130,6 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
readPALMetadata(M);
- // Deprecated notes are not emitted for code object v3.
- if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits()))
- return;
-
// HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
@@ -142,7 +141,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
}
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
- if (TM.getTargetTriple().getArch() != Triple::amdgcn)
+ // TODO: Add metadata to code object v3.
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
// Following code requires TargetStreamer to be present.
@@ -189,37 +190,82 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
}
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
- const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
- if (!MFI->isEntryFunction())
+ const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ if (!MFI.isEntryFunction())
+ return;
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA)
return;
- const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- amd_kernel_code_t KernelCode;
- if (STM.isAmdCodeObjectV2(*MF)) {
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ const Function &F = MF->getFunction();
+ if (STM.isAmdCodeObjectV2(F) &&
+ (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
+ amd_kernel_code_t KernelCode;
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
-
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
- HSAMetadataStream.emitKernel(MF->getFunction(),
- getHSACodeProps(*MF, CurrentProgramInfo),
- getHSADebugProps(*MF, CurrentProgramInfo));
+ HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
+}
+
+void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
+ const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ if (!MFI.isEntryFunction())
+ return;
+ if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
+ TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
+
+ auto &Streamer = getTargetStreamer()->getStreamer();
+ auto &Context = Streamer.getContext();
+ auto &ObjectFileInfo = *Context.getObjectFileInfo();
+ auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
+
+ Streamer.PushSection();
+ Streamer.SwitchSection(&ReadOnlySection);
+
+ // CP microcode requires the kernel descriptor to be allocated on 64 byte
+ // alignment.
+ Streamer.EmitValueToAlignment(64, 0, 1, 0);
+ if (ReadOnlySection.getAlignment() < 64)
+ ReadOnlySection.setAlignment(64);
+
+ SmallString<128> KernelName;
+ getNameWithPrefix(KernelName, &MF->getFunction());
+ getTargetStreamer()->EmitAmdhsaKernelDescriptor(
+ *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ CurrentProgramInfo.NumVGPRsForWavesPerEU,
+ CurrentProgramInfo.NumSGPRsForWavesPerEU -
+ IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(),
+ CurrentProgramInfo.VCCUsed,
+ CurrentProgramInfo.FlatUsed),
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ hasXNACK(*getSTI()));
+
+ Streamer.PopSection();
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+ if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ AsmPrinter::EmitFunctionEntryLabel();
+ return;
+ }
+
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) {
SmallString<128> SymbolName;
getNameWithPrefix(SymbolName, &MF->getFunction()),
getTargetStreamer()->EmitAMDGPUSymbolType(
SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
if (STI.dumpCode()) {
// Disassemble function name label to text.
DisasmLines.push_back(MF->getName().str() + ":");
@@ -231,7 +277,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
}
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
- const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -283,11 +329,66 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize) {
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+ OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+ false);
+}
+
+uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
+ const MachineFunction &MF) const {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ uint16_t KernelCodeProperties = 0;
+
+ if (MFI.hasPrivateSegmentBuffer()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ }
+ if (MFI.hasDispatchPtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ }
+ if (MFI.hasQueuePtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+ }
+ if (MFI.hasKernargSegmentPtr()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+ }
+ if (MFI.hasDispatchID()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+ }
+ if (MFI.hasFlatScratchInit()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+ }
+
+ return KernelCodeProperties;
+}
+
+amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
+ const MachineFunction &MF,
+ const SIProgramInfo &PI) const {
+ amdhsa::kernel_descriptor_t KernelDescriptor;
+ memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
+
+ assert(isUInt<32>(PI.ScratchSize));
+ assert(isUInt<32>(PI.ComputePGMRSrc1));
+ assert(isUInt<32>(PI.ComputePGMRSrc2));
+
+ KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
+ KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
+ KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
+ KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
+ KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+
+ return KernelDescriptor;
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -301,32 +402,29 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SetupMachineFunction(MF);
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Context = getObjFileLowering().getContext();
- if (!STM.isAmdHsaOS()) {
+ // FIXME: This should be an explicit check for Mesa.
+ if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
MCSectionELF *ConfigSection =
Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
OutStreamer->SwitchSection(ConfigSection);
}
- if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- if (MFI->isEntryFunction()) {
- getSIProgramInfo(CurrentProgramInfo, MF);
- } else {
- auto I = CallGraphResourceInfo.insert(
- std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
- SIFunctionResourceInfo &Info = I.first->second;
- assert(I.second && "should only be called once per function");
- Info = analyzeResourceUsage(MF);
- }
-
- if (STM.isAmdPalOS())
- EmitPALMetadata(MF, CurrentProgramInfo);
- if (!STM.isAmdHsaOS()) {
- EmitProgramInfoSI(MF, CurrentProgramInfo);
- }
+ if (MFI->isEntryFunction()) {
+ getSIProgramInfo(CurrentProgramInfo, MF);
} else {
- EmitProgramInfoR600(MF);
+ auto I = CallGraphResourceInfo.insert(
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = I.first->second;
+ assert(I.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF);
+ }
+
+ if (STM.isAmdPalOS())
+ EmitPALMetadata(MF, CurrentProgramInfo);
+ else if (!STM.isAmdHsaOS()) {
+ EmitProgramInfoSI(MF, CurrentProgramInfo);
}
DisasmLines.clear();
@@ -340,84 +438,74 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer->SwitchSection(CommentSection);
- if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- if (!MFI->isEntryFunction()) {
- OutStreamer->emitRawComment(" Function info:", false);
- SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
- emitCommonFunctionComments(
- Info.NumVGPR,
- Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
- Info.PrivateSegmentSize,
- getFunctionCodeSize(MF));
- return false;
- }
-
- OutStreamer->emitRawComment(" Kernel info:", false);
- emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
- CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize,
- getFunctionCodeSize(MF));
-
- OutStreamer->emitRawComment(
- " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
- OutStreamer->emitRawComment(
- " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
- OutStreamer->emitRawComment(
- " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
- " bytes/workgroup (compile time only)", false);
-
- OutStreamer->emitRawComment(
- " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
- OutStreamer->emitRawComment(
- " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
-
- OutStreamer->emitRawComment(
- " NumSGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
- OutStreamer->emitRawComment(
- " NumVGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
-
- OutStreamer->emitRawComment(
- " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
- false);
- OutStreamer->emitRawComment(
- " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
- false);
-
- if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
- OutStreamer->emitRawComment(
- " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
- OutStreamer->emitRawComment(
- " DebuggerPrivateSegmentBufferSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
- }
+ if (!MFI->isEntryFunction()) {
+ OutStreamer->emitRawComment(" Function info:", false);
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
+ emitCommonFunctionComments(
+ Info.NumVGPR,
+ Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
+ Info.PrivateSegmentSize,
+ getFunctionCodeSize(MF), MFI);
+ return false;
+ }
+ OutStreamer->emitRawComment(" Kernel info:", false);
+ emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize,
+ getFunctionCodeSize(MF), MFI);
+
+ OutStreamer->emitRawComment(
+ " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
+ OutStreamer->emitRawComment(
+ " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
+ OutStreamer->emitRawComment(
+ " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
+
+ OutStreamer->emitRawComment(
+ " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+ OutStreamer->emitRawComment(
+ " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+
+ OutStreamer->emitRawComment(
+ " NumSGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(
+ " NumVGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+
+ OutStreamer->emitRawComment(
+ " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
+
+ if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:USER_SGPR: " +
- Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
- Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
- Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
- Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
- Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
- Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
- false);
- } else {
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- OutStreamer->emitRawComment(
- Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+ " DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
}
+
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+ Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
+ false);
}
if (STM.dumpCode()) {
@@ -440,67 +528,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
- unsigned MaxGPR = 0;
- bool killPixel = false;
- const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
- const R600RegisterInfo *RI = STM.getRegisterInfo();
- const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::KILLGT)
- killPixel = true;
- unsigned numOperands = MI.getNumOperands();
- for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
- const MachineOperand &MO = MI.getOperand(op_idx);
- if (!MO.isReg())
- continue;
- unsigned HWReg = RI->getHWRegIndex(MO.getReg());
-
- // Register with value > 127 aren't GPR
- if (HWReg > 127)
- continue;
- MaxGPR = std::max(MaxGPR, HWReg);
- }
- }
- }
-
- unsigned RsrcReg;
- if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
- // Evergreen / Northern Islands
- switch (MF.getFunction().getCallingConv()) {
- default: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
- case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
- case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
- case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
- }
- } else {
- // R600 / R700
- switch (MF.getFunction().getCallingConv()) {
- default: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
- case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
- case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
- }
- }
-
- OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
- S_STACK_SIZE(MFI->CFStackSize), 4);
- OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
- OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
-
- if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
- }
-}
-
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = STM.getInstrInfo();
uint64_t CodeSize = 0;
@@ -510,7 +539,7 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
// TODO: CodeSize should account for multiple functions.
// TODO: Should we count size of debug info?
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
CodeSize += TII->getInstSizeInBytes(MI);
@@ -531,30 +560,10 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
return false;
}
-static unsigned getNumExtraSGPRs(const SISubtarget &ST,
- bool VCCUsed,
- bool FlatScrUsed) {
- unsigned ExtraSGPRs = 0;
- if (VCCUsed)
- ExtraSGPRs = 2;
-
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
- if (FlatScrUsed)
- ExtraSGPRs = 4;
- } else {
- if (ST.isXNACKEnabled())
- ExtraSGPRs = 4;
-
- if (FlatScrUsed)
- ExtraSGPRs = 6;
- }
-
- return ExtraSGPRs;
-}
-
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
- const SISubtarget &ST) const {
- return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
+ const GCNSubtarget &ST) const {
+ return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(),
+ UsesVCC, UsesFlatScratch);
}
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
@@ -562,7 +571,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
SIFunctionResourceInfo Info;
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -586,6 +595,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
@@ -649,7 +660,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
continue;
case AMDGPU::NoRegister:
- assert(MI.isDebugValue());
+ assert(MI.isDebugInstr());
continue;
case AMDGPU::VCC:
@@ -663,6 +674,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::FLAT_SCR_HI:
continue;
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
@@ -742,8 +758,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// conservative guesses.
// 48 SGPRs - vcc, - flat_scr, -xnack
- int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
- ST.hasFlatAddressSpace());
+ int MaxSGPRGuess =
+ 47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true,
+ ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
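
Editor's note: the "48 SGPRs - vcc, - flat_scr, -xnack" guess above subtracts whatever IsaInfo::getNumExtraSGPRs reserves; the rule is the same as the static getNumExtraSGPRs helper this patch deletes earlier in this file. A hedged, self-contained sketch of that reservation logic, where the enum and boolean parameters stand in for the real subtarget and feature-bit queries:

#include <iostream>

enum class Generation { SouthernIslands, SeaIslands, VolcanicIslands };

// Mirrors the removed helper: VCC costs 2 SGPRs; flat scratch costs 4 SGPRs
// before Volcanic Islands and 6 from VI on, where XNACK alone already costs 4.
unsigned numExtraSGPRs(Generation Gen, bool VCCUsed, bool FlatScrUsed,
                       bool XNACKEnabled) {
  unsigned ExtraSGPRs = 0;
  if (VCCUsed)
    ExtraSGPRs = 2;

  if (Gen < Generation::VolcanicIslands) {
    if (FlatScrUsed)
      ExtraSGPRs = 4;
  } else {
    if (XNACKEnabled)
      ExtraSGPRs = 4;
    if (FlatScrUsed)
      ExtraSGPRs = 6;
  }
  return ExtraSGPRs;
}

int main() {
  // VI target with flat scratch in use: 6 reserved SGPRs, so the conservative
  // guess for an unknown callee is 47 - 6 = 41 addressable SGPRs.
  unsigned Extra = numExtraSGPRs(Generation::VolcanicIslands,
                                 /*VCCUsed=*/true, /*FlatScrUsed=*/true,
                                 /*XNACKEnabled=*/false);
  std::cout << "extra = " << Extra << ", guess = " << (47 - Extra) << "\n";
}
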
@@ -798,15 +815,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MF.getFunction().getContext().diagnose(DiagStackSize);
}
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = STM.getInstrInfo();
const SIRegisterInfo *RI = &TII->getRegisterInfo();
- unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
- ProgInfo.VCCUsed,
- ProgInfo.FlatUsed);
- unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
+ // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
+ // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
+ // unified.
+ unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+ STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -827,7 +845,19 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Account for extra SGPRs and VGPRs reserved for debugger use.
ProgInfo.NumSGPR += ExtraSGPRs;
- ProgInfo.NumVGPR += ExtraVGPRs;
+
+ // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
+ // dispatch registers are function args.
+ unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
+ for (auto &Arg : MF.getFunction().args()) {
+ unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
+ if (Arg.hasAttribute(Attribute::InReg))
+ WaveDispatchNumSGPR += NumRegs;
+ else
+ WaveDispatchNumVGPR += NumRegs;
+ }
+ ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
+ ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
@@ -875,19 +905,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx.diagnose(Diag);
}
- // SGPRBlocks is actual number of SGPR blocks minus 1.
- ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
- STM.getSGPREncodingGranule());
- ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;
-
- // VGPRBlocks is actual number of VGPR blocks minus 1.
- ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
- STM.getVGPREncodingGranule());
- ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;
-
- // Record first reserved VGPR and number of reserved VGPRs.
- ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0;
- ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);
+ ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
+ STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU);
+ ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
+ STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU);
// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
@@ -909,7 +930,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = STM.enableDX10Clamp();
unsigned LDSAlignShift;
- if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
+ if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
@@ -954,7 +975,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
- S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
+ // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
+ S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
@@ -981,7 +1003,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
@@ -1002,26 +1024,21 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
- unsigned Rsrc2Val = 0;
if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
- if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
- Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0);
- }
- if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
- OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
- Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
- }
- if (Rsrc2Val) {
- OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4);
- OutStreamer->EmitIntValue(Rsrc2Val, 4);
}
}
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
+ OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+ OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
+ OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
+ OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ }
+
OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
@@ -1114,8 +1131,12 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIProgramInfo &CurrentProgramInfo,
const MachineFunction &MF) const {
+ const Function &F = MF.getFunction();
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
@@ -1151,21 +1172,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasFlatScratchInit())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
- if (MFI->hasGridWorkgroupCountX()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
- }
-
- if (MFI->hasGridWorkgroupCountY()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
- }
-
- if (MFI->hasGridWorkgroupCountZ()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
- }
-
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
@@ -1175,20 +1181,17 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
- // FIXME: Should use getKernArgSize
- Out.kernarg_segment_byte_size =
- STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
+ unsigned MaxKernArgAlign;
+ Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
- Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
- Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
Out.kernarg_segment_alignment = std::max((size_t)4,
- countTrailingZeros(MFI->getMaxKernArgAlign()));
+ countTrailingZeros(MaxKernArgAlign));
if (STM.debuggerEmitPrologue()) {
Out.debug_wavefront_private_segment_offset_sgpr =
@@ -1198,55 +1201,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
}
}
-AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
-
- HSACodeProps.mKernargSegmentSize =
- STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset());
- HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
- HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mKernargSegmentAlign =
- std::max(uint32_t(4), MFI.getMaxKernArgAlign());
- HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
- HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR;
- HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR;
- HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
- HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
- HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
- HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
- HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
-
- return HSACodeProps;
-}
-
-AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
-
- if (!STM.debuggerSupported())
- return HSADebugProps;
-
- HSADebugProps.mDebuggerABIVersion.push_back(1);
- HSADebugProps.mDebuggerABIVersion.push_back(0);
- HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount;
- HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst;
-
- if (STM.debuggerEmitPrologue()) {
- HSADebugProps.mPrivateSegmentBufferSGPR =
- ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
- HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
- ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- }
-
- return HSADebugProps;
-}
-
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
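
Editor's note: getSIProgramInfo now gets the PGM_RSRC1 register-block fields from IsaInfo::getNumSGPRBlocks / getNumVGPRBlocks, but the rounding is the same as the inline code removed earlier in this file's diff: round the per-wave register count up to the encoding granule, then encode "number of blocks minus one". A small sketch of that computation, with granule values chosen for illustration rather than read from a real subtarget:

#include <cstdint>
#include <iostream>

// Round Value up to the next multiple of Align (alignTo in MathExtras.h).
uint32_t alignToGranule(uint32_t Value, uint32_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// PGM_RSRC1 encodes "number of granule-sized register blocks minus one".
uint32_t numRegisterBlocks(uint32_t NumRegs, uint32_t Granule) {
  return alignToGranule(NumRegs, Granule) / Granule - 1;
}

int main() {
  // Illustrative granules of 8 SGPRs and 4 VGPRs per block.
  std::cout << "SGPR blocks for 42 SGPRs: " << numRegisterBlocks(42, 8) << "\n"; // 5
  std::cout << "VGPR blocks for 23 VGPRs: " << numRegisterBlocks(23, 4) << "\n"; // 5
}

The same function also clamps NumSGPR/NumVGPR to the registers needed to receive wave dispatch arguments, counting (bits + 31) / 32 32-bit registers per function argument as shown in the hunk above.
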
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 51d48a0c7320..22982d912c70 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Assembly printer class.
+/// AMDGPU Assembly printer class.
//
//===----------------------------------------------------------------------===//
@@ -17,9 +17,11 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
-#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUHSAMetadataStreamer.h"
+#include "SIProgramInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include <cstddef>
#include <cstdint>
#include <limits>
@@ -29,9 +31,10 @@
namespace llvm {
+class AMDGPUMachineFunction;
class AMDGPUTargetStreamer;
class MCOperand;
-class SISubtarget;
+class GCNSubtarget;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
@@ -47,68 +50,7 @@ private:
bool HasDynamicallySizedStack = false;
bool HasRecursion = false;
- int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
- };
-
- // Track resource usage for kernels / entry functions.
- struct SIProgramInfo {
- // Fields set in PGM_RSRC1 pm4 packet.
- uint32_t VGPRBlocks = 0;
- uint32_t SGPRBlocks = 0;
- uint32_t Priority = 0;
- uint32_t FloatMode = 0;
- uint32_t Priv = 0;
- uint32_t DX10Clamp = 0;
- uint32_t DebugMode = 0;
- uint32_t IEEEMode = 0;
- uint64_t ScratchSize = 0;
-
- uint64_t ComputePGMRSrc1 = 0;
-
- // Fields set in PGM_RSRC2 pm4 packet.
- uint32_t LDSBlocks = 0;
- uint32_t ScratchBlocks = 0;
-
- uint64_t ComputePGMRSrc2 = 0;
-
- uint32_t NumVGPR = 0;
- uint32_t NumSGPR = 0;
- uint32_t LDSSize = 0;
- bool FlatUsed = false;
-
- // Number of SGPRs that meets number of waves per execution unit request.
- uint32_t NumSGPRsForWavesPerEU = 0;
-
- // Number of VGPRs that meets number of waves per execution unit request.
- uint32_t NumVGPRsForWavesPerEU = 0;
-
- // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
- // fixed VGPR number reserved.
- uint16_t ReservedVGPRFirst = 0;
-
- // The number of consecutive VGPRs reserved.
- uint16_t ReservedVGPRCount = 0;
-
- // Fixed SGPR number used to hold wave scratch offset for entire kernel
- // execution, or std::numeric_limits<uint16_t>::max() if the register is not
- // used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
- // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
- // is not used or not known.
- uint16_t DebuggerPrivateSegmentBufferSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Whether there is recursion, dynamic allocas, indirect calls or some other
- // reason there may be statically unknown stack usage.
- bool DynamicCallStack = false;
-
- // Bonus information for debugging.
- bool VCCUsed = false;
-
- SIProgramInfo() = default;
+ int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
};
SIProgramInfo CurrentProgramInfo;
@@ -128,16 +70,8 @@ private:
unsigned &NumSGPR,
unsigned &NumVGPR) const;
- AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
- AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
-
- /// \brief Emit register usage information so that the GPU driver
+ /// Emit register usage information so that the GPU driver
/// can correctly setup the GPU state.
- void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void EmitPALMetadata(const MachineFunction &MF,
@@ -145,7 +79,15 @@ private:
void emitCommonFunctionComments(uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize);
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction* MFI);
+
+ uint16_t getAmdhsaKernelCodeProperties(
+ const MachineFunction &MF) const;
+
+ amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor(
+ const MachineFunction &MF,
+ const SIProgramInfo &PI) const;
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
@@ -160,16 +102,16 @@ public:
bool doFinalization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
- /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
+ /// Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
/// pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
- /// \brief Lower the specified LLVM Constant to an MCExpr.
+ /// Lower the specified LLVM Constant to an MCExpr.
/// The AsmPrinter::lowerConstantof does not know how to lower
/// addrspacecast, therefore they should be lowered by this function.
const MCExpr *lowerConstant(const Constant *CV) override;
- /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
+ /// tblgen'erated driver function for lowering simple MI->MC pseudo
/// instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -179,6 +121,8 @@ public:
void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+
void EmitFunctionEntryLabel() override;
void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5a9138731934..18c7df0d94f2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,6 +20,7 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -32,13 +33,17 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
+ // FIXME: Add support for non-void returns.
+ if (Val)
+ return false;
+
MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
return true;
}
unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Type *ParamTy,
- unsigned Offset) const {
+ uint64_t Offset) const {
MachineFunction &MF = MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -61,7 +66,8 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
}
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
- Type *ParamTy, unsigned Offset,
+ Type *ParamTy, uint64_t Offset,
+ unsigned Align,
unsigned DstReg) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
@@ -69,7 +75,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- unsigned Align = DL.getABITypeAlignment(ParamTy);
unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
MachineMemOperand *MMO =
@@ -84,12 +89,16 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
+ // AMDGPU_GS and AMDGPU_HS are not supported yet.
+ if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
+ F.getCallingConv() == CallingConv::AMDGPU_HS)
+ return false;
MachineFunction &MF = MIRBuilder.getMF();
- const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
+ const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
SmallVector<CCValAssign, 16> ArgLocs;
@@ -116,7 +125,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
- const LLT P2 = LLT::pointer(2, 64);
+ const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
unsigned VReg = MRI.createGenericVirtualRegister(P2);
MRI.addLiveIn(InputPtrReg, VReg);
MIRBuilder.getMBB().addLiveIn(InputPtrReg);
@@ -136,49 +145,106 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ // The infrastructure for normal calling convention lowering is essentially
+ // useless for kernels. We want to avoid any kind of legalization or argument
+ // splitting.
+ if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+ unsigned i = 0;
+ const unsigned KernArgBaseAlign = 16;
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
+ uint64_t ExplicitArgOffset = 0;
+
+ // TODO: Align down to dword alignment and extract bits for extending loads.
+ for (auto &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+ if (AllocSize == 0)
+ continue;
+
+ unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+
+ unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
+ ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
+ lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
+ ++i;
+ }
+
+ return true;
+ }
+
unsigned NumArgs = F.arg_size();
Function::const_arg_iterator CurOrigArg = F.arg_begin();
const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+ unsigned PSInputNum = 0;
+ BitVector Skipped(NumArgs);
for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
// We can only handle simple value types at the moment.
- if (!ValEVT.isSimple())
- return false;
- MVT ValVT = ValEVT.getSimpleVT();
ISD::ArgFlagsTy Flags;
ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()};
setArgFlags(OrigArg, i + 1, DL, F);
Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
+
+ if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+ !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
+ PSInputNum <= 15) {
+ if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
+ Skipped.set(i);
+ ++PSInputNum;
+ continue;
+ }
+
+ Info->markPSInputAllocated(PSInputNum);
+ if (!CurOrigArg->use_empty())
+ Info->markPSInputEnabled(PSInputNum);
+
+ ++PSInputNum;
+ }
+
CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
/*IsVarArg=*/false);
- bool Res =
- AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
- // Fail if we don't know how to handle this type.
- if (Res)
- return false;
+ if (ValEVT.isVector()) {
+ EVT ElemVT = ValEVT.getVectorElementType();
+ if (!ValEVT.isSimple())
+ return false;
+ MVT ValVT = ElemVT.getSimpleVT();
+ bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
+ OrigArg.Flags, CCInfo);
+ if (!Res)
+ return false;
+ } else {
+ MVT ValVT = ValEVT.getSimpleVT();
+ if (!ValEVT.isSimple())
+ return false;
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
+
+ // Fail if we don't know how to handle this type.
+ if (Res)
+ return false;
+ }
}
Function::const_arg_iterator Arg = F.arg_begin();
- if (F.getCallingConv() == CallingConv::AMDGPU_VS) {
- for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
- CCValAssign &VA = ArgLocs[i];
- MRI.addLiveIn(VA.getLocReg(), VRegs[i]);
+ if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
+ F.getCallingConv() == CallingConv::AMDGPU_PS) {
+ for (unsigned i = 0, OrigArgIdx = 0;
+ OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
+ if (Skipped.test(OrigArgIdx))
+ continue;
+ CCValAssign &VA = ArgLocs[i++];
+ MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]);
MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
- MIRBuilder.buildCopy(VRegs[i], VA.getLocReg());
+ MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg());
}
return true;
}
- for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
- // FIXME: We should be getting DebugInfo from the arguments some how.
- CCValAssign &VA = ArgLocs[i];
- lowerParameter(MIRBuilder, Arg->getType(),
- VA.getLocMemOffset() +
- Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]);
- }
-
- return true;
+ return false;
}
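
Editor's note: the AMDGPU_KERNEL path added to lowerFormalArguments sidesteps normal calling-convention lowering and computes each kernarg's byte offset itself: align a running offset to the argument's ABI alignment, then take MinAlign(16, offset) as the alignment the load may assume relative to the 16-byte-aligned kernarg base. A hedged sketch of that bookkeeping with a made-up argument list and a base offset of 0 (the real base offset comes from getExplicitKernelArgOffset):

#include <cstdint>
#include <iostream>
#include <vector>

// alignTo / MinAlign as defined in llvm/Support/MathExtras.h.
uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}
uint64_t minAlign(uint64_t A, uint64_t B) { return (A | B) & (1 + ~(A | B)); }

struct ToyArg { const char *Name; uint64_t Size; uint64_t ABIAlign; };

int main() {
  // Hypothetical kernel signature (i32, i32, i64); base offset assumed 0.
  const uint64_t KernArgBaseAlign = 16, BaseOffset = 0;
  std::vector<ToyArg> Args = {{"a", 4, 4}, {"b", 4, 4}, {"c", 8, 8}};

  uint64_t ExplicitArgOffset = 0;
  for (const ToyArg &Arg : Args) {
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Arg.ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Arg.ABIAlign) + Arg.Size;
    // The load may only assume the alignment the 16-byte kernarg base
    // guarantees at this offset.
    uint64_t LoadAlign = minAlign(KernArgBaseAlign, ArgOffset);
    std::cout << Arg.Name << ": offset " << ArgOffset
              << ", load align " << LoadAlign << "\n";
  }
  // Prints: a: offset 0, load align 16; b: offset 4, load align 4;
  //         c: offset 8, load align 8.
}
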
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 251cb7a2c440..f51cb6abbf65 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -26,10 +26,11 @@ class AMDGPUCallLowering: public CallLowering {
AMDGPUAS AMDGPUASI;
unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- unsigned Offset) const;
+ uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- unsigned Offset, unsigned DstReg) const;
+ uint64_t Offset, unsigned Align,
+ unsigned DstReg) const;
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c1c066fd1404..68bc7fdd9961 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -85,22 +85,6 @@ def RetCC_SI_Shader : CallingConv<[
]>>
]>;
-// Calling convention for R600
-def CC_R600 : CallingConv<[
- CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
- T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
- T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
- T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
- T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
- T30_XYZW, T31_XYZW, T32_XYZW
- ]>>>
-]>;
-
-// Calling convention for compute kernels
-def CC_AMDGPU_Kernel : CallingConv<[
- CCCustom<"allocateKernArg">
-]>;
-
def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
(sequence "VGPR%u", 24, 255)
>;
@@ -127,7 +111,7 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
@@ -144,30 +128,16 @@ def RetCC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>
]>;
def CC_AMDGPU : CallingConv<[
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() >="
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!AMDGPU::isShader(State.getCallingConv())",
- CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() < "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!AMDGPU::isShader(State.getCallingConv())",
- CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
+ CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
+ CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
- CCDelegateTo<CC_AMDGPU_Func>>,
- CCIf<"static_cast<const AMDGPUSubtarget&>"
- "(State.getMachineFunction().getSubtarget()).getGeneration() < "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS",
- CCDelegateTo<CC_R600>>
+ CCDelegateTo<CC_AMDGPU_Func>>
]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b17b67167666..5713b7b7f9a8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -17,8 +17,10 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
@@ -48,15 +50,22 @@ using namespace llvm;
namespace {
+static cl::opt<bool> WidenLoads(
+ "amdgpu-codegenprepare-widen-constant-loads",
+ cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(true));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
- const SISubtarget *ST = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ AssumptionCache *AC = nullptr;
DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
bool HasUnsafeFPMath = false;
AMDGPUAS AMDGPUASI;
- /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+ /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
///
/// \returns Binary operation \p V.
@@ -80,7 +89,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// false otherwise.
bool needsPromotionToI32(const Type *T) const;
- /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+ /// Promotes uniform binary operation \p I to equivalent 32 bit binary
/// operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -93,7 +102,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// false otherwise.
bool promoteUniformOpToI32(BinaryOperator &I) const;
- /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+ /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
/// than or equal 16. Promotion is done by sign or zero extending operands to
@@ -102,7 +111,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformOpToI32(ICmpInst &I) const;
- /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+ /// Promotes uniform 'select' operation \p I to 32 bit 'select'
/// operation.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -113,7 +122,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformOpToI32(SelectInst &I) const;
- /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
+ /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
/// intrinsic.
///
/// \details \p I's base element bit width must be greater than 1 and less
@@ -125,7 +134,17 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
///
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
- /// \brief Widen a scalar load.
+
+ /// Expands 24 bit div or rem.
+ Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const;
+
+ /// Expands 32 bit div or rem.
+ Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den) const;
+
+ /// Widen a scalar load.
///
/// \details \p Widen scalar load for uniform, small type loads from constant
// memory / to a full 32-bits and then truncate the input to allow a scalar
@@ -157,6 +176,7 @@ public:
StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DivergenceAnalysis>();
AU.setPreservesAll();
}
@@ -250,7 +270,9 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
"I does not need promotion to i32");
if (I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::UDiv)
+ I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SRem ||
+ I.getOpcode() == Instruction::URem)
return false;
IRBuilder<> Builder(&I);
@@ -372,13 +394,18 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)
- return false;
+ return HasDenormals;
+
+ if (UnsafeDiv)
+ return true;
+
+ bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
// Reciprocal f32 is handled separately without denormals.
- return UnsafeDiv || CNum->isExactlyValue(+1.0);
+ return HasDenormals ^ IsOne;
}
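Editor's note: the rewritten shouldKeepFDivF32 above is easiest to read as a three-input decision. A minimal host-side model of its return value (illustration only; the helper name is hypothetical) is:

  // true = keep the plain fdiv, false = safe to emit llvm.amdgcn.fdiv.fast.
  static bool keepFDivF32Model(bool NumIsConstFP, bool NumIsPlusMinusOne,
                               bool UnsafeDiv, bool HasFP32Denormals) {
    if (!NumIsConstFP)
      return HasFP32Denormals; // non-constant numerator: only denormal support
                               // forces the full-precision fdiv
    if (UnsafeDiv)
      return true;             // unsafe math: the fdiv gets folded to rcp+mul later
    // Reciprocal (numerator +/-1.0) is handled separately without denormals;
    // with denormals enabled the roles flip.
    return HasFP32Denormals ^ NumIsPlusMinusOne;
  }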
// Insert an intrinsic for fast fdiv for safe math situations where we can
@@ -404,7 +431,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FMF.allowReciprocal();
// With UnsafeDiv node will be optimized to just rcp and mul.
- if (ST->hasFP32Denormals() || UnsafeDiv)
+ if (UnsafeDiv)
return false;
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
@@ -418,6 +445,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *NewFDiv = nullptr;
+ bool HasDenormals = ST->hasFP32Denormals();
if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
NewFDiv = UndefValue::get(VT);
@@ -428,7 +456,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *DenEltI = Builder.CreateExtractElement(Den, I);
Value *NewElt;
- if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+ if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
} else {
NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -437,7 +465,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
} else {
- if (!shouldKeepFDivF32(Num, UnsafeDiv))
+ if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
NewFDiv = Builder.CreateCall(Decl, { Num, Den });
}
@@ -447,7 +475,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FDiv.eraseFromParent();
}
- return true;
+ return !!NewFDiv;
}
static bool hasUnsafeFPMath(const Function &F) {
@@ -455,18 +483,324 @@ static bool hasUnsafeFPMath(const Function &F) {
return Attr.getValueAsString() == "true";
}
+static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
+ Value *LHS, Value *RHS) {
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *I64Ty = Builder.getInt64Ty();
+
+ Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
+ Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
+ Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
+ Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
+ Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
+ Hi = Builder.CreateTrunc(Hi, I32Ty);
+ return std::make_pair(Lo, Hi);
+}
+
+static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
+ return getMul64(Builder, LHS, RHS).second;
+}
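Editor's note: the IR built by getMul64/getMulHu is just a 32x32->64 widening multiply split into halves. A host-side equivalent (sketch, not part of the patch):

  #include <cstdint>
  #include <utility>

  // Zero-extend, multiply, then take the low/high 32-bit halves of the product.
  static std::pair<uint32_t, uint32_t> mul64(uint32_t LHS, uint32_t RHS) {
    uint64_t Product = static_cast<uint64_t>(LHS) * RHS; // zext + mul
    return {static_cast<uint32_t>(Product),              // trunc        -> Lo
            static_cast<uint32_t>(Product >> 32)};       // lshr 32+trunc -> Hi
  }

  static uint32_t mulhu(uint32_t LHS, uint32_t RHS) {
    return mul64(LHS, RHS).second; // only the high half is needed
  }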
+
+// The significand of a float is wide enough to exactly represent any
+// 24-bit signed integer.
+Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const {
+ assert(Num->getType()->isIntegerTy(32));
+
+ const DataLayout &DL = Mod->getDataLayout();
+ unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
+ if (LHSSignBits < 9)
+ return nullptr;
+
+ unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
+ if (RHSSignBits < 9)
+ return nullptr;
+
+
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = 32 - SignBits;
+ if (IsSigned)
+ ++DivBits;
+
+ Type *Ty = Num->getType();
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *F32Ty = Builder.getFloatTy();
+ ConstantInt *One = Builder.getInt32(1);
+ Value *JQ = One;
+
+ if (IsSigned) {
+ // char|short jq = ia ^ ib;
+ JQ = Builder.CreateXor(Num, Den);
+
+ // jq = jq >> (bitsize - 2)
+ JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
+
+ // jq = jq | 0x1
+ JQ = Builder.CreateOr(JQ, One);
+ }
+
+ // int ia = (int)LHS;
+ Value *IA = Num;
+
+  // int ib = (int)RHS;
+ Value *IB = Den;
+
+ // float fa = (float)ia;
+ Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
+ : Builder.CreateUIToFP(IA, F32Ty);
+
+ // float fb = (float)ib;
+ Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
+ : Builder.CreateUIToFP(IB,F32Ty);
+
+ Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+ Value *FQM = Builder.CreateFMul(FA, RCP);
+
+ // fq = trunc(fqm);
+ CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
+ FQ->copyFastMathFlags(Builder.getFastMathFlags());
+
+ // float fqneg = -fq;
+ Value *FQNeg = Builder.CreateFNeg(FQ);
+
+ // float fr = mad(fqneg, fb, fa);
+ Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+ { FQNeg, FB, FA }, FQ);
+
+ // int iq = (int)fq;
+ Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
+ : Builder.CreateFPToUI(FQ, I32Ty);
+
+ // fr = fabs(fr);
+ FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);
+
+ // fb = fabs(fb);
+ FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);
+
+ // int cv = fr >= fb;
+ Value *CV = Builder.CreateFCmpOGE(FR, FB);
+
+ // jq = (cv ? jq : 0);
+ JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
+
+ // dst = iq + jq;
+ Value *Div = Builder.CreateAdd(IQ, JQ);
+
+ Value *Res = Div;
+ if (!IsDiv) {
+    // Rem needs compensation; it's easier to recompute it.
+ Value *Rem = Builder.CreateMul(Div, Den);
+ Res = Builder.CreateSub(Num, Rem);
+ }
+
+  // Truncate to the number of bits this divide actually needs.
+ if (IsSigned) {
+ Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
+ Res = Builder.CreateSExt(Res, Ty);
+ } else {
+ ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+ Res = Builder.CreateAnd(Res, TruncMask);
+ }
+
+ return Res;
+}
+
+Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den) const {
+ Instruction::BinaryOps Opc = I.getOpcode();
+ assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
+ Opc == Instruction::SRem || Opc == Instruction::SDiv);
+
+ FastMathFlags FMF;
+ FMF.setFast();
+ Builder.setFastMathFlags(FMF);
+
+ if (isa<Constant>(Den))
+ return nullptr; // Keep it for optimization
+
+ bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
+ bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
+
+ Type *Ty = Num->getType();
+ Type *I32Ty = Builder.getInt32Ty();
+ Type *F32Ty = Builder.getFloatTy();
+
+ if (Ty->getScalarSizeInBits() < 32) {
+ if (IsSigned) {
+ Num = Builder.CreateSExt(Num, I32Ty);
+ Den = Builder.CreateSExt(Den, I32Ty);
+ } else {
+ Num = Builder.CreateZExt(Num, I32Ty);
+ Den = Builder.CreateZExt(Den, I32Ty);
+ }
+ }
+
+ if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
+ Res = Builder.CreateTrunc(Res, Ty);
+ return Res;
+ }
+
+ ConstantInt *Zero = Builder.getInt32(0);
+ ConstantInt *One = Builder.getInt32(1);
+ ConstantInt *MinusOne = Builder.getInt32(~0);
+
+ Value *Sign = nullptr;
+ if (IsSigned) {
+ ConstantInt *K31 = Builder.getInt32(31);
+ Value *LHSign = Builder.CreateAShr(Num, K31);
+ Value *RHSign = Builder.CreateAShr(Den, K31);
+ // Remainder sign is the same as LHS
+ Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+
+ Num = Builder.CreateAdd(Num, LHSign);
+ Den = Builder.CreateAdd(Den, RHSign);
+
+ Num = Builder.CreateXor(Num, LHSign);
+ Den = Builder.CreateXor(Den, RHSign);
+ }
+
+ // RCP = URECIP(Den) = 2^32 / Den + e
+ // e is rounding error.
+ Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
+ Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
+ Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
+ Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
+ Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
+
+  // RCP_LO, RCP_HI = mul(RCP, Den)
+ Value *RCP_LO, *RCP_HI;
+ std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
+
+ // NEG_RCP_LO = -RCP_LO
+ Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
+
+ // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+ Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
+ Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
+
+ // Calculate the rounding error from the URECIP instruction
+ // E = mulhu(ABS_RCP_LO, RCP)
+ Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
+
+ // RCP_A_E = RCP + E
+ Value *RCP_A_E = Builder.CreateAdd(RCP, E);
+
+ // RCP_S_E = RCP - E
+ Value *RCP_S_E = Builder.CreateSub(RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+ Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
+
+ // Quotient = mulhu(Tmp0, Num)
+ Value *Quotient = getMulHu(Builder, Tmp0, Num);
+
+ // Num_S_Remainder = Quotient * Den
+ Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+
+ // Remainder = Num - Num_S_Remainder
+ Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+
+ // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+ Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
+ Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+
+ // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+ Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
+ Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
+ MinusOne, Zero);
+
+ // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+ Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
+ Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+
+ Value *Res;
+ if (IsDiv) {
+ // Quotient_A_One = Quotient + 1
+ Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+
+ // Quotient_S_One = Quotient - 1
+ Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+
+ // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+ Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+
+ // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+ Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
+ } else {
+ // Remainder_S_Den = Remainder - Den
+ Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+
+ // Remainder_A_Den = Remainder + Den
+ Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+
+ // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+ Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+
+ // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+ Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+ }
+
+ if (IsSigned) {
+ Res = Builder.CreateXor(Res, Sign);
+ Res = Builder.CreateSub(Res, Sign);
+ }
+
+ Res = Builder.CreateTrunc(Res, Ty);
+
+ return Res;
+}
+
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I) && promoteUniformOpToI32(I))
+ return true;
+
bool Changed = false;
+ Instruction::BinaryOps Opc = I.getOpcode();
+ Type *Ty = I.getType();
+ Value *NewDiv = nullptr;
+ if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
+ Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
+ Ty->getScalarSizeInBits() <= 32) {
+ Value *Num = I.getOperand(0);
+ Value *Den = I.getOperand(1);
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
- if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
- DA->isUniform(&I))
- Changed |= promoteUniformOpToI32(I);
+ if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ NewDiv = UndefValue::get(VT);
+
+ for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
+ Value *NumEltN = Builder.CreateExtractElement(Num, N);
+ Value *DenEltN = Builder.CreateExtractElement(Den, N);
+ Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+ if (!NewElt)
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
+ }
+ } else {
+ NewDiv = expandDivRem32(Builder, I, Num, Den);
+ }
+
+ if (NewDiv) {
+ I.replaceAllUsesWith(NewDiv);
+ I.eraseFromParent();
+ Changed = true;
+ }
+ }
return Changed;
}
-bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
- if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+ if (!WidenLoads)
+ return false;
+
+ if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
canWidenScalarExtLoad(I)) {
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
@@ -474,7 +808,28 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
Type *I32Ty = Builder.getInt32Ty();
Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
- Value *WidenLoad = Builder.CreateLoad(BitCast);
+ LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
+ WidenLoad->copyMetadata(I);
+
+ // If we have range metadata, we need to convert the type, and not make
+ // assumptions about the high bits.
+ if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
+ ConstantInt *Lower =
+ mdconst::extract<ConstantInt>(Range->getOperand(0));
+
+ if (Lower->getValue().isNullValue()) {
+ WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
+ } else {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
+ // Don't make assumptions about the high bits.
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
+ };
+
+ WidenLoad->setMetadata(LLVMContext::MD_range,
+ MDNode::get(Mod->getContext(), LowAndHigh));
+ }
+ }
int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
Type *IntNTy = Builder.getIntNTy(TySize);
@@ -540,10 +895,12 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (!TPC)
return false;
- const TargetMachine &TM = TPC->getTM<TargetMachine>();
- ST = &TM.getSubtarget<SISubtarget>(F);
+ const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
+ ST = &TM.getSubtarget<GCNSubtarget>(F);
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DA = &getAnalysis<DivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
+ AMDGPUASI = TM.getAMDGPUAS();
bool MadeChange = false;
@@ -560,6 +917,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
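Editor's note: the new expandDivRem24 path above leans on the fact that a 24-bit integer is exactly representable in f32. The sketch below (illustration only; it assumes an exact 1.0f/FB where the hardware uses an approximate rcp, which the |fr| >= |fb| correction is there to absorb) mirrors the signed sequence the pass emits as IR plus amdgcn intrinsics:

  #include <cassert>
  #include <cmath>
  #include <cstdint>

  // Host-side reference of the signed 24-bit divide. Inputs must fit in 24 bits.
  static int32_t div24(int32_t Num, int32_t Den) {
    assert(Den != 0 && Num >= -(1 << 23) && Num < (1 << 23) &&
           Den >= -(1 << 23) && Den < (1 << 23));
    int32_t JQ = ((Num ^ Den) < 0) ? -1 : 1; // sign of the quotient; the IR
                                             // forms this as (ashr (xor) 30) | 1
    float FA = static_cast<float>(Num);
    float FB = static_cast<float>(Den);
    float FQ = std::trunc(FA * (1.0f / FB)); // tentative quotient
    float FR = std::fma(-FQ, FB, FA);        // remainder left by the tentative quotient
    int32_t IQ = static_cast<int32_t>(FQ);
    // If |fr| >= |fb| the tentative quotient is short by one step of JQ.
    return IQ + (std::fabs(FR) >= std::fabs(FB) ? JQ : 0);
  }
  // For the rem flavor the pass recomputes Num - div(Num, Den) * Den.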
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
new file mode 100644
index 000000000000..b375cae9018e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -0,0 +1,60 @@
+//===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+ "FP64",
+ "true",
+ "Enable double precision operations"
+>;
+
+def FeatureFMA : SubtargetFeature<"fmaf",
+ "FMA",
+ "true",
+ "Enable single precision FMA (not as fast as mul+add, but fused)"
+>;
+
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes"
+>;
+
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront"
+>;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
+class SubtargetFeatureGeneration <string Value, string Subtarget,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
+ Value#" GPU generation", Implies>;
+
+def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
+ "DX10Clamp",
+ "true",
+ "clamp modifier clamps NaNs to 0.0"
+>;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+ "EnablePromoteAlloca",
+ "true",
+ "Enable promote alloca pass"
+>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 91fe921bfeec..ee836bf8a631 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
+/// Interface to describe a layout of a stack frame on an AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -19,7 +19,7 @@
namespace llvm {
-/// \brief Information about the stack frame layout on the AMDGPU targets.
+/// Information about the stack frame layout on the AMDGPU targets.
///
/// It holds the direction of the stack growth, the known stack alignment on
/// entry to each function, and the offset to the locals area.
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
new file mode 100644
index 000000000000..ba735390f679
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -0,0 +1,138 @@
+//===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains patterns that should only be used by GlobalISel. For
+// example, patterns for V_* instructions that have S_* equivalents.
+// SelectionDAG does not support selecting V_* instructions.
+//===----------------------------------------------------------------------===//
+
+include "AMDGPU.td"
+
+def sd_vsrc0 : ComplexPattern<i32, 1, "">;
+def gi_vsrc0 :
+ GIComplexOperandMatcher<s32, "selectVSRC0">,
+ GIComplexPatternEquiv<sd_vsrc0>;
+
+def sd_vcsrc : ComplexPattern<i32, 1, "">;
+def gi_vcsrc :
+ GIComplexOperandMatcher<s32, "selectVCSRC">,
+ GIComplexPatternEquiv<sd_vcsrc>;
+
+def gi_vop3mods0 :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods0">,
+ GIComplexPatternEquiv<VOP3Mods0>;
+
+def gi_vop3mods :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods">,
+ GIComplexPatternEquiv<VOP3Mods>;
+
+def gi_vop3omods :
+ GIComplexOperandMatcher<s32, "selectVOP3OMods">,
+ GIComplexPatternEquiv<VOP3OMods>;
+
+class GISelSop2Pat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt SReg_32:$src0), (src1_vt SReg_32:$src1))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop2Pat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vsrc0 src0_vt:$src0)), (src1_vt VGPR_32:$src1))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop2CommutePat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src1_vt VGPR_32:$src1), (src0_vt (sd_vsrc0 src0_vt:$src0)))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop3Pat2 <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))),
+ (inst src0_vt:$src0, src1_vt:$src1)
+>;
+
+class GISelVop3Pat2CommutePat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))),
+ (inst src0_vt:$src1, src1_vt:$src0)
+>;
+
+class GISelVop3Pat2ModsPat <
+ SDPatternOperator node,
+ Instruction inst,
+ ValueType dst_vt,
+ ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat <
+
+ (dst_vt (node (src0_vt (VOP3Mods0 src0_vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omods)),
+ (src1_vt (VOP3Mods src1_vt:$src1, i32:$src1_modifiers)))),
+ (inst i32:$src0_modifiers, src0_vt:$src0,
+ i32:$src1_modifiers, src1_vt:$src1, $clamp, $omods)
+>;
+
+multiclass GISelVop2IntrPat <
+ SDPatternOperator node, Instruction inst,
+ ValueType dst_vt, ValueType src_vt = dst_vt> {
+
+ def : GISelVop2Pat <node, inst, dst_vt, src_vt>;
+
+  // FIXME: Intrinsics aren't marked as commutable, so we need to add an explicit
+  // pattern to handle commuting. This is another reason why legalizing to a
+  // generic machine instruction may be better than matching the intrinsic
+  // directly.
+ def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>;
+}
+
+def : GISelSop2Pat <or, S_OR_B32, i32>;
+def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
+
+def : GISelSop2Pat <sra, S_ASHR_I32, i32>;
+let AddedComplexity = 100 in {
+let SubtargetPredicate = isSICI in {
+def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>;
+}
+def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
+}
+def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
+
+// FIXME: Select directly to _e32 so we don't need to deal with modifiers.
+// FIXME: We can't re-use SelectionDAG patterns here because they match
+// against a custom SDNode and we would need to create a generic machine
+// instruction that is equivalent to the custom SDNode. This would also require
+// us to custom legalize the intrinsic to the new generic machine instruction,
+// but I can't get custom legalizing of the intrinsic to work and I'm not sure if
+// this is even supported yet.
+defm : GISelVop2IntrPat <
+ int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>;
+
+defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
+def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
+defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
+def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index bf7deb500d1a..3a58c6c6a29f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -16,41 +16,89 @@ namespace AMDGPU {
enum PartialMappingIdx {
None = - 1,
- PM_SGPR32 = 0,
- PM_SGPR64 = 1,
- PM_VGPR32 = 2,
- PM_VGPR64 = 3
+ PM_SGPR1 = 0,
+ PM_SGPR16 = 4,
+ PM_SGPR32 = 5,
+ PM_SGPR64 = 6,
+ PM_SGPR128 = 7,
+ PM_SGPR256 = 8,
+ PM_SGPR512 = 9,
+ PM_VGPR1 = 10,
+ PM_VGPR16 = 14,
+ PM_VGPR32 = 15,
+ PM_VGPR64 = 16,
+ PM_VGPR128 = 17,
+ PM_VGPR256 = 18,
+ PM_VGPR512 = 19,
+ PM_SGPR96 = 20,
+ PM_VGPR96 = 21
};
const RegisterBankInfo::PartialMapping PartMappings[] {
// StartIdx, Length, RegBank
+ {0, 1, SCCRegBank},
+ {0, 16, SGPRRegBank},
{0, 32, SGPRRegBank},
{0, 64, SGPRRegBank},
+ {0, 128, SGPRRegBank},
+ {0, 256, SGPRRegBank},
+ {0, 512, SGPRRegBank},
+ {0, 1, SGPRRegBank},
+ {0, 16, VGPRRegBank},
{0, 32, VGPRRegBank},
- {0, 64, VGPRRegBank}
+ {0, 64, VGPRRegBank},
+ {0, 128, VGPRRegBank},
+ {0, 256, VGPRRegBank},
+ {0, 512, VGPRRegBank},
+ {0, 96, SGPRRegBank},
+ {0, 96, VGPRRegBank},
};
const RegisterBankInfo::ValueMapping ValMappings[] {
- // SGPR 32-bit
{&PartMappings[0], 1},
- // SGPR 64-bit
+ {nullptr, 0},
+ {nullptr, 0},
+ {nullptr, 0},
{&PartMappings[1], 1},
- // VGPR 32-bit
{&PartMappings[2], 1},
- // VGPR 64-bit
- {&PartMappings[3], 1}
+ {&PartMappings[3], 1},
+ {&PartMappings[4], 1},
+ {&PartMappings[5], 1},
+ {&PartMappings[6], 1},
+ {&PartMappings[7], 1},
+ {nullptr, 0},
+ {nullptr, 0},
+ {nullptr, 0},
+ {&PartMappings[8], 1},
+ {&PartMappings[9], 1},
+ {&PartMappings[10], 1},
+ {&PartMappings[11], 1},
+ {&PartMappings[12], 1},
+ {&PartMappings[13], 1},
+ {&PartMappings[14], 1},
+ {&PartMappings[15], 1}
};
enum ValueMappingIdx {
SGPRStartIdx = 0,
- VGPRStartIdx = 2
+ VGPRStartIdx = 10
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
unsigned Size) {
- assert(Size % 32 == 0);
- unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx;
- Idx += (Size / 32) - 1;
+ unsigned Idx;
+ switch (Size) {
+ case 1:
+ Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1;
+ break;
+ case 96:
+ Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96;
+ break;
+ default:
+ Idx = BankID == AMDGPU::VGPRRegBankID ? VGPRStartIdx : SGPRStartIdx;
+ Idx += Log2_32_Ceil(Size);
+ break;
+ }
return &ValMappings[Idx];
}
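Editor's note: the replacement indexing can be sanity-checked against the tables above. A minimal sketch of the default case (using llvm::Log2_32_Ceil, with the start indices hard-coded for illustration):

  #include "llvm/Support/MathExtras.h"

  // Power-of-two sizes 16..512 land on PM_* entries 4..9 (SGPR) or 14..19 (VGPR).
  static unsigned valueMappingIdx(bool IsVGPR, unsigned Size) {
    unsigned StartIdx = IsVGPR ? 10 /*VGPRStartIdx*/ : 0 /*SGPRStartIdx*/;
    return StartIdx + llvm::Log2_32_Ceil(Size);
  }
  // valueMappingIdx(false, 32) == 5 (PM_SGPR32); valueMappingIdx(true, 64) == 16 (PM_VGPR64).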
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 463e700f13b7..01ef346f74ee 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -8,13 +8,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU HSA Metadata Streamer.
+/// AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIProgramInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
@@ -196,6 +200,57 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
return Dims;
}
+Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
+ const Function &F = MF.getFunction();
+
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
+
+ unsigned MaxKernArgAlign;
+ HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
+ MaxKernArgAlign);
+ HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
+ HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
+ HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
+ HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
+ HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
+ HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
+ HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
+ HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
+ HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
+ HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
+ HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
+
+ return HSACodeProps;
+}
+
+Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
+
+ if (!STM.debuggerSupported())
+ return HSADebugProps;
+
+ HSADebugProps.mDebuggerABIVersion.push_back(1);
+ HSADebugProps.mDebuggerABIVersion.push_back(0);
+
+ if (STM.debuggerEmitPrologue()) {
+ HSADebugProps.mPrivateSegmentBufferSGPR =
+ ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
+ HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
+ ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ }
+
+ return HSADebugProps;
+}
+
void MetadataStreamer::emitVersion() {
auto &Version = HSAMetadata.mVersion;
@@ -255,32 +310,7 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) {
for (auto &Arg : Func.args())
emitKernelArg(Arg);
- // TODO: What about other languages?
- if (!Func.getParent()->getNamedMetadata("opencl.ocl.version"))
- return;
-
- auto &DL = Func.getParent()->getDataLayout();
- auto Int64Ty = Type::getInt64Ty(Func.getContext());
-
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
-
- auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
- AMDGPUASI.GLOBAL_ADDRESS);
- auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
- if (CallsPrintf)
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
- if (Func.hasFnAttribute("calls-enqueue-kernel")) {
- if (!CallsPrintf) {
- // Emit a dummy argument so that the remaining hidden arguments
- // have a fixed position relative to the first hidden argument.
- // This is to facilitate library code to access hidden arguments.
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
- }
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
- }
+ emitHiddenKernelArgs(Func);
}
void MetadataStreamer::emitKernelArg(const Argument &Arg) {
@@ -320,13 +350,26 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
if (Node && ArgNo < Node->getNumOperands())
TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
- emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
- getValueKind(Arg.getType(), TypeQual, BaseTypeName), Name,
- TypeName, BaseTypeName, AccQual, TypeQual);
+ Type *Ty = Arg.getType();
+ const DataLayout &DL = Func->getParent()->getDataLayout();
+
+ unsigned PointeeAlign = 0;
+ if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+ if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ PointeeAlign = Arg.getParamAlignment();
+ if (PointeeAlign == 0)
+ PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+ }
+ }
+
+ emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName),
+ PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
}
void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
- ValueKind ValueKind, StringRef Name,
+ ValueKind ValueKind,
+ unsigned PointeeAlign,
+ StringRef Name,
StringRef TypeName, StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
@@ -338,12 +381,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
Arg.mAlign = DL.getABITypeAlignment(Ty);
Arg.mValueKind = ValueKind;
Arg.mValueType = getValueType(Ty, BaseTypeName);
-
- if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
- auto ElTy = PtrTy->getElementType();
- if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized())
- Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy);
- }
+ Arg.mPointeeAlign = PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty))
Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
@@ -366,6 +404,48 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
}
}
+void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
+ int HiddenArgNumBytes =
+ getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
+
+ if (!HiddenArgNumBytes)
+ return;
+
+ auto &DL = Func.getParent()->getDataLayout();
+ auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+ if (HiddenArgNumBytes >= 8)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+ if (HiddenArgNumBytes >= 16)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+ if (HiddenArgNumBytes >= 24)
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+
+ auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
+ AMDGPUASI.GLOBAL_ADDRESS);
+
+ // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+ // "none" argument.
+ if (HiddenArgNumBytes >= 32) {
+ if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ else
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+
+ // Emit "default queue" and "completion action" arguments if enqueue kernel is
+ // used, otherwise emit dummy "none" arguments.
+ if (HiddenArgNumBytes >= 48) {
+ if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+ } else {
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+ }
+}
+
void MetadataStreamer::begin(const Module &Mod) {
AMDGPUASI = getAMDGPUAS(Mod);
emitVersion();
@@ -383,13 +463,14 @@ void MetadataStreamer::end() {
verify(HSAMetadataString);
}
-void MetadataStreamer::emitKernel(
- const Function &Func,
- const Kernel::CodeProps::Metadata &CodeProps,
- const Kernel::DebugProps::Metadata &DebugProps) {
+void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
+ auto &Func = MF.getFunction();
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
return;
+ auto CodeProps = getHSACodeProps(MF, ProgramInfo);
+ auto DebugProps = getHSADebugProps(MF, ProgramInfo);
+
HSAMetadata.mKernels.push_back(Kernel::Metadata());
auto &Kernel = HSAMetadata.mKernels.back();
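Editor's note: the hidden-argument emission now keys off the "amdgpu-implicitarg-num-bytes" attribute, where each 8-byte threshold unlocks one more slot. A compact model of the thresholds in emitHiddenKernelArgs (illustrative only; the helper name is hypothetical):

  #include <string>
  #include <vector>

  // Which hidden kernarg kinds get described for a given attribute value.
  static std::vector<std::string> hiddenArgKinds(int HiddenArgNumBytes,
                                                 bool UsesPrintf,
                                                 bool EnqueuesKernel) {
    std::vector<std::string> Kinds;
    if (HiddenArgNumBytes >= 8)  Kinds.push_back("HiddenGlobalOffsetX");
    if (HiddenArgNumBytes >= 16) Kinds.push_back("HiddenGlobalOffsetY");
    if (HiddenArgNumBytes >= 24) Kinds.push_back("HiddenGlobalOffsetZ");
    if (HiddenArgNumBytes >= 32) // printf buffer, or a placeholder to keep offsets fixed
      Kinds.push_back(UsesPrintf ? "HiddenPrintfBuffer" : "HiddenNone");
    if (HiddenArgNumBytes >= 48) { // default queue + completion action, or placeholders
      Kinds.push_back(EnqueuesKernel ? "HiddenDefaultQueue" : "HiddenNone");
      Kinds.push_back(EnqueuesKernel ? "HiddenCompletionAction" : "HiddenNone");
    }
    return Kinds;
  }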
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index bd6515521a74..3424c956d781 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU HSA Metadata Streamer.
+/// AMDGPU HSA Metadata Streamer.
///
//
//===----------------------------------------------------------------------===//
@@ -28,6 +28,7 @@ class DataLayout;
class Function;
class MDNode;
class Module;
+struct SIProgramInfo;
class Type;
namespace AMDGPU {
@@ -55,6 +56,13 @@ private:
std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
+ Kernel::CodeProps::Metadata getHSACodeProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+ Kernel::DebugProps::Metadata getHSADebugProps(
+ const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
+
void emitVersion();
void emitPrintf(const Module &Mod);
@@ -68,10 +76,13 @@ private:
void emitKernelArg(const Argument &Arg);
void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
+ unsigned PointeeAlign = 0,
StringRef Name = "", StringRef TypeName = "",
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
+ void emitHiddenKernelArgs(const Function &Func);
+
public:
MetadataStreamer() = default;
~MetadataStreamer() = default;
@@ -84,9 +95,7 @@ public:
void end();
- void emitKernel(const Function &Func,
- const Kernel::CodeProps::Metadata &CodeProps,
- const Kernel::DebugProps::Metadata &DebugProps);
+ void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo);
};
} // end namespace HSAMD
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f4776adb069c..f25f4d4693ea 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief Defines an instruction selector for the AMDGPU target.
+/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -16,6 +16,7 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
@@ -24,15 +25,16 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -43,6 +45,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
@@ -68,7 +71,7 @@ namespace {
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
- const AMDGPUSubtarget *Subtarget;
+ const GCNSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
bool EnableLateStructurizeCFG;
@@ -83,6 +86,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AMDGPUArgumentUsageInfo>();
+ AU.addRequired<AMDGPUPerfHintAnalysis>();
+ AU.addRequired<DivergenceAnalysis>();
SelectionDAGISel::getAnalysisUsage(AU);
}
@@ -98,20 +103,12 @@ private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
- bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
- const R600InstrInfo *TII);
- bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- bool isConstantLoad(const MemSDNode *N, int cbID) const;
bool isUniformBr(const SDNode *N) const;
SDNode *glueCopyToM0(SDNode *N) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
- bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
- bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
- SDValue& Offset);
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
@@ -162,6 +159,7 @@ private:
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
+ SDValue Expand32BitAddress(SDValue Addr) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
bool &Imm) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -216,7 +214,7 @@ private:
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
- void SelectFMAD(SDNode *N);
+ void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
protected:
@@ -225,9 +223,18 @@ protected:
};
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
+ const R600Subtarget *Subtarget;
+ AMDGPUAS AMDGPUASI;
+
+ bool isConstantLoad(const MemSDNode *N, int cbID) const;
+ bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+ bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+ SDValue& Offset);
public:
explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
- AMDGPUDAGToDAGISel(TM, OptLevel) {}
+ AMDGPUDAGToDAGISel(TM, OptLevel) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
+ }
void Select(SDNode *N) override;
@@ -235,6 +242,11 @@ public:
SDValue &Offset) override;
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+protected:
+ // Include the pieces autogenerated from the target description.
+#include "R600GenDAGISel.inc"
};
} // end anonymous namespace
@@ -242,17 +254,19 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
-/// \brief This pass converts a legalized DAG into a AMDGPU-specific
+/// This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
CodeGenOpt::Level OptLevel) {
return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
-/// \brief This pass converts a legalized DAG into a R600-specific
+/// This pass converts a legalized DAG into an R600-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
CodeGenOpt::Level OptLevel) {
@@ -260,7 +274,7 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
+ Subtarget = &MF.getSubtarget<GCNSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -276,8 +290,7 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
}
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
- const SIInstrInfo *TII
- = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
return TII->isInlineConstant(C->getAPIntValue());
@@ -288,7 +301,7 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
return false;
}
-/// \brief Determine the register class for \p OpNo
+/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \OpNo or NULL if the register class cannot be
/// determined.
@@ -303,7 +316,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
const SIRegisterInfo *TRI
- = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
return TRI->getPhysRegClass(Reg);
}
@@ -394,7 +407,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
SDLoc DL(N);
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
@@ -420,10 +432,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
+ unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
- RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
- MVT::i32);
+ RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
if (NOps != NumVectorElts) {
// Fill in the missing undef elements if this was a scalar_to_vector.
@@ -431,9 +442,10 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
- CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+ CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
}
@@ -450,7 +462,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
if (isa<AtomicSDNode>(N) ||
- (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
+ (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
N = glueCopyToM0(N);
switch (Opc) {
@@ -487,9 +502,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BUILD_VECTOR: {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
-
- if (VT == MVT::v2i16 || VT == MVT::v2f16) {
- if (Opc == ISD::BUILD_VECTOR) {
+ if (VT.getScalarSizeInBits() == 16) {
+ if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
uint32_t LHSVal, RHSVal;
if (getConstantValue(N->getOperand(0), LHSVal) &&
getConstantValue(N->getOperand(1), RHSVal)) {
@@ -559,7 +573,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
return;
}
case ISD::LOAD:
- case ISD::STORE: {
+ case ISD::STORE:
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: {
N = glueCopyToM0(N);
break;
}
@@ -619,7 +635,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBRCOND(N);
return;
case ISD::FMAD:
- SelectFMAD(N);
+ case ISD::FMA:
+ SelectFMAD_FMA(N);
return;
case AMDGPUISD::ATOMIC_CMP_SWAP:
SelectATOMIC_CMP_SWAP(N);
@@ -629,15 +646,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectCode(N);
}
-bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
- if (!N->readMem())
- return false;
- if (CbId == -1)
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
-
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
-}
-
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
const Instruction *Term = BB->getTerminator();
@@ -653,26 +661,6 @@ StringRef AMDGPUDAGToDAGISel::getPassName() const {
// Complex Patterns
//===----------------------------------------------------------------------===//
-bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
- SDValue& IntPtr) {
- if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
- IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
- true);
- return true;
- }
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
- SDValue& BaseReg, SDValue &Offset) {
- if (!isa<ConstantSDNode>(Addr)) {
- BaseReg = Addr;
- Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
- return true;
- }
- return false;
-}
-
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) {
return false;
@@ -684,11 +672,11 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDLoc DL(Addr);
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
@@ -759,12 +747,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
if (ProduceCarry) {
// Replace the carry-use
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
}
// Replace the remaining uses.
- CurDAG->ReplaceAllUsesWith(N, RegSequence);
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, RegSequence);
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
@@ -1410,7 +1397,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return false;
SDLoc SL(ByteOffsetNode);
- AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+ GCNSubtarget::Generation Gen = Subtarget->getGeneration();
int64_t ByteOffset = C->getSExtValue();
int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
@@ -1435,19 +1422,45 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return true;
}
+SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
+ if (Addr.getValueType() != MVT::i32)
+ return Addr;
+
+ // Zero-extend a 32-bit address.
+ SDLoc SL(Addr);
+
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned AddrHiVal = Info->get32BitAddressHighBits();
+ SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
+
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+ Addr,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+ };
+
+ return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
+ Ops), 0);
+}
+
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
SDValue &Offset, bool &Imm) const {
SDLoc SL(Addr);
+
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
if (SelectSMRDOffset(N1, Offset, Imm)) {
- SBase = N0;
+ SBase = Expand32BitAddress(N0);
return true;
}
}
- SBase = Addr;
+ SBase = Expand32BitAddress(Addr);
Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
Imm = true;
return true;
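Editor's note: numerically, the REG_SEQUENCE built by Expand32BitAddress just pairs the 32-bit pointer with the per-function high bits. A host-side sketch of the resulting 64-bit SMEM base (names hypothetical):

  #include <cstdint>

  // sub0 holds the 32-bit pointer, sub1 holds the high bits taken from
  // SIMachineFunctionInfo::get32BitAddressHighBits().
  static uint64_t expand32BitAddress(uint32_t Addr32, uint32_t AddrHiBits) {
    return (static_cast<uint64_t>(AddrHiBits) << 32) | Addr32;
  }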
@@ -1651,7 +1664,7 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
return true;
if (VT == MVT::i64) {
- auto ST = static_cast<const SISubtarget *>(Subtarget);
+ auto ST = static_cast<const GCNSubtarget *>(Subtarget);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
@@ -1674,15 +1687,39 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
SDLoc SL(N);
+ if (!UseSCCBr) {
+ // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
+ // analyzed what generates the vcc value, so we do not know whether vcc
+ // bits for disabled lanes are 0. Thus we need to mask out bits for
+ // disabled lanes.
+ //
+ // For the case that we select S_CBRANCH_SCC1 and it gets
+ // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
+    // SIInstrInfo::moveToVALU which inserts the S_AND.
+ //
+ // We could add an analysis of what generates the vcc value here and omit
+    // the S_AND when it is unnecessary. But it would be better to add a separate
+ // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
+ // catches both cases.
+ Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
+ CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
+ Cond),
+ 0);
+ }
+
SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
N->getOperand(2), // Basic Block
VCC.getValue(0));
}
-void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
MVT VT = N->getSimpleValueType(0);
- if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
+ bool IsFMA = N->getOpcode() == ISD::FMA;
+ if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
+ !Subtarget->hasFmaMixInsts()) ||
+ ((IsFMA && Subtarget->hasMadMixInsts()) ||
+ (!IsFMA && Subtarget->hasFmaMixInsts()))) {
SelectCode(N);
return;
}
@@ -1692,13 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
SDValue Src2 = N->getOperand(2);
unsigned Src0Mods, Src1Mods, Src2Mods;
- // Avoid using v_mad_mix_f32 unless there is actually an operand using the
- // conversion from f16.
+ // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
+ // using the conversion from f16.
bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert(!Subtarget->hasFP32Denormals() &&
+ assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
@@ -1714,7 +1751,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
Zero, Zero
};
- CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+ CurDAG->SelectNodeTo(N,
+ IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
+ MVT::f32, Ops);
} else {
SelectCode(N);
}
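The compound early-out at the top of SelectFMAD_FMA is easier to read as a predicate. A hedged restatement with a hypothetical helper (not in the patch), equivalent to the condition above:

```cpp
// The mixed-precision path is taken only in these two cases; everything else
// falls back to SelectCode(N).
static bool takesMixPath(bool IsFMA, bool IsF32, bool HasMadMix, bool HasFmaMix) {
  if (!IsF32)
    return false;
  return IsFMA ? (HasFmaMix && !HasMadMix)   // FMA  -> v_fma_mix_f32
               : (HasMadMix && !HasFmaMix);  // FMAD -> v_mad_mix_f32
}
```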
@@ -2100,6 +2139,41 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
} while (IsModified);
}
+bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<R600Subtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+ if (!N->readMem())
+ return false;
+ if (CbId == -1)
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
+
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+ SDValue& IntPtr) {
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+ IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
+ true);
+ return true;
+ }
+ return false;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+ SDValue& BaseReg, SDValue &Offset) {
+ if (!isa<ConstantSDNode>(Addr)) {
+ BaseReg = Addr;
+ Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
+ return true;
+ }
+ return false;
+}
+
void R600DAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -2120,12 +2194,12 @@ void R600DAGToDAGISel::Select(SDNode *N) {
// pass. We want to avoid 128 bits copies as much as possible because they
// can't be bundled by our scheduler.
switch(NumVectorElts) {
- case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 2: RegClassID = R600::R600_Reg64RegClassID; break;
case 4:
if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
- RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ RegClassID = R600::R600_Reg128VerticalRegClassID;
else
- RegClassID = AMDGPU::R600_Reg128RegClassID;
+ RegClassID = R600::R600_Reg128RegClassID;
break;
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
}
@@ -2143,11 +2217,11 @@ bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDLoc DL(Addr);
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
- Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
@@ -2178,7 +2252,7 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
&& isInt<16>(IMMOffset->getZExtValue())) {
Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
SDLoc(CurDAG->getEntryNode()),
- AMDGPU::ZERO, MVT::i32);
+ R600::ZERO, MVT::i32);
Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
MVT::i32);
return true;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 21192a2c1cc8..b201126c593b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This is the parent TargetLowering class for hardware code gen
+/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
@@ -25,9 +25,12 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,18 +41,6 @@
#include "llvm/Support/KnownBits.h"
using namespace llvm;
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- MachineFunction &MF = State.getMachineFunction();
- AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
- uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
- ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return true;
-}
-
static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
// Up to SGPR0-SGPR39
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::SGPR_64RegClass, 20);
@@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_64RegClass, 31);
}
@@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG, MVT::f32, Custom);
setOperationAction(ISD::FLOG10, MVT::f32, Custom);
- if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::FLOG, MVT::f16, Custom);
- setOperationAction(ISD::FLOG10, MVT::f16, Custom);
- }
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
@@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
-
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
- setOperationAction(ISD::FCEIL, MVT::f64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
- setOperationAction(ISD::FRINT, MVT::f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
- }
-
- if (!Subtarget->hasBFI()) {
- // fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- }
-
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
@@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- }
-
- if (!Subtarget->hasBCNT(32))
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- if (!Subtarget->hasBCNT(64))
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ // AMDGPU uses ADDC/SUBC/ADDE/SUBE
+ setOperationAction(ISD::ADDC, VT, Legal);
+ setOperationAction(ISD::SUBC, VT, Legal);
+ setOperationAction(ISD::ADDE, VT, Legal);
+ setOperationAction(ISD::SUBE, VT, Legal);
+ }
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMAX, MVT::i32, Legal);
setOperationAction(ISD::UMAX, MVT::i32, Legal);
- if (Subtarget->hasFFBH())
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
-
- if (Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
-
setOperationAction(ISD::CTTZ, MVT::i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
- // We only really have 32-bit BFE instructions (and 16-bit on VI).
- //
- // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
- // effort to match them now. We want this to be false for i64 cases when the
- // extraction isn't restricted to the upper or lower half. Ideally we would
- // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
- // span the midpoint are probably relatively rare, so don't worry about them
- // for now.
- if (Subtarget->hasBFE())
- setHasExtractBitsInsn(true);
-
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Expand);
- setOperationAction(ISD::ADDC, VT, Expand);
- setOperationAction(ISD::SUBC, VT, Expand);
- setOperationAction(ISD::ADDE, VT, Expand);
- setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
- // SI at least has hardware support for floating point exceptions, but no way
- // of using or handling them is implemented. They are also optional in OpenCL
- // (Section 7.3)
- setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
-
PredictableSelectIsExpensive = false;
// We want to find all load dependencies for long chains of stores to enable
@@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::MULHU);
setTargetDAGCombine(ISD::MULHS);
@@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FNEARBYINT:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW:
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMIN_LEGACY:
@@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
return true;
}
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::EntryToken:
+ case ISD::TokenFactor:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN:
+ {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntrID) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ return true;
+ }
+ }
+ break;
+ case ISD::LOAD:
+ {
+ const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace()
+ == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+ return true;
+ return false;
+ }
+ break;
+ }
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return isZExtFree(Val.getValueType(), VT2);
}
-// v_mad_mix* support a conversion from f16 to f32.
-//
-// There is only one special case when denormals are enabled we don't currently,
-// where this is OK to use.
-bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
- EVT DestVT, EVT SrcVT) const {
- return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
- DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
- SrcVT.getScalarType() == MVT::f16;
-}
-
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- const ISD::InputArg &In = Ins[i];
- EVT MemVT;
-
- unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
-
- if (!Subtarget->isAmdHsaOS() &&
- (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
- // The ABI says the caller will extend these values to 32-bits.
- MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
- } else if (NumRegs == 1) {
- // This argument is not split, so the IR type is the memory type.
- assert(!In.Flags.isSplit());
- if (In.ArgVT.isExtended()) {
- // We have an extended type, like i24, so we should just use the register type
- MemVT = In.VT;
- } else {
- MemVT = In.ArgVT;
- }
- } else if (In.ArgVT.isVector() && In.VT.isVector() &&
- In.ArgVT.getScalarType() == In.VT.getScalarType()) {
- assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
- // We have a vector value which has been split into a vector with
- // the same scalar type, but fewer elements. This should handle
- // all the floating-point vector types.
- MemVT = In.VT;
- } else if (In.ArgVT.isVector() &&
- In.ArgVT.getVectorNumElements() == NumRegs) {
- // This arg has been split so that each element is stored in a separate
- // register.
- MemVT = In.ArgVT.getScalarType();
- } else if (In.ArgVT.isExtended()) {
- // We have an extended type, like i65.
- MemVT = In.VT;
- } else {
- unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
- assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
- if (In.VT.isInteger()) {
- MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
- } else if (In.VT.isVector()) {
- assert(!In.VT.getScalarType().isFloatingPoint());
- unsigned NumElements = In.VT.getVectorNumElements();
- assert(MemoryBits % NumElements == 0);
- // This vector type has been split into another vector type with
- // a different elements size.
- EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
- MemoryBits / NumElements);
- MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
+ CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ const MachineFunction &MF = State.getMachineFunction();
+ const Function &Fn = MF.getFunction();
+ LLVMContext &Ctx = Fn.getParent()->getContext();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+ const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+
+ unsigned MaxAlign = 1;
+ uint64_t ExplicitArgOffset = 0;
+ const DataLayout &DL = Fn.getParent()->getDataLayout();
+
+ unsigned InIndex = 0;
+
+ for (const Argument &Arg : Fn.args()) {
+ Type *BaseArgTy = Arg.getType();
+ unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+ MaxAlign = std::max(Align, MaxAlign);
+ unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+ // We're basically throwing away everything passed into us and starting over
+ // to get accurate in-memory offsets. The "PartOffset" is completely useless
+ // to us as computed in Ins.
+ //
+ // We also need to figure out what type legalization is trying to do to get
+ // the correct memory offsets.
+
+ SmallVector<EVT, 16> ValueVTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ uint64_t BasePartOffset = Offsets[Value];
+
+ EVT ArgVT = ValueVTs[Value];
+ EVT MemVT = ArgVT;
+ MVT RegisterVT =
+ getRegisterTypeForCallingConv(Ctx, ArgVT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(Ctx, ArgVT);
+
+ if (!Subtarget->isAmdHsaOS() &&
+ (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
+ // The ABI says the caller will extend these values to 32-bits.
+ MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+ } else if (NumRegs == 1) {
+ // This argument is not split, so the IR type is the memory type.
+ if (ArgVT.isExtended()) {
+ // We have an extended type, like i24, so we should just use the
+ // register type.
+ MemVT = RegisterVT;
+ } else {
+ MemVT = ArgVT;
+ }
+ } else if (ArgVT.isVector() && RegisterVT.isVector() &&
+ ArgVT.getScalarType() == RegisterVT.getScalarType()) {
+ assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
+ // We have a vector value which has been split into a vector with
+ // the same scalar type, but fewer elements. This should handle
+ // all the floating-point vector types.
+ MemVT = RegisterVT;
+ } else if (ArgVT.isVector() &&
+ ArgVT.getVectorNumElements() == NumRegs) {
+ // This arg has been split so that each element is stored in a separate
+ // register.
+ MemVT = ArgVT.getScalarType();
+ } else if (ArgVT.isExtended()) {
+ // We have an extended type, like i65.
+ MemVT = RegisterVT;
} else {
- llvm_unreachable("cannot deduce memory type.");
+ unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
+ assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
+ if (RegisterVT.isInteger()) {
+ MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+ } else if (RegisterVT.isVector()) {
+ assert(!RegisterVT.getScalarType().isFloatingPoint());
+ unsigned NumElements = RegisterVT.getVectorNumElements();
+ assert(MemoryBits % NumElements == 0);
+ // This vector type has been split into another vector type with
+ // a different element size.
+ EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+ MemoryBits / NumElements);
+ MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+ } else {
+ llvm_unreachable("cannot deduce memory type.");
+ }
}
- }
- // Convert one element vectors to scalar.
- if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
- MemVT = MemVT.getScalarType();
+ // Convert one element vectors to scalar.
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+ MemVT = MemVT.getScalarType();
- if (MemVT.isExtended()) {
- // This should really only happen if we have vec3 arguments
- assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
- MemVT = MemVT.getPow2VectorType(State.getContext());
- }
+ if (MemVT.isExtended()) {
+ // This should really only happen if we have vec3 arguments
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ MemVT = MemVT.getPow2VectorType(State.getContext());
+ }
- assert(MemVT.isSimple());
- allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
- State);
+ unsigned PartOffset = 0;
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
+ BasePartOffset + PartOffset,
+ MemVT.getSimpleVT(),
+ CCValAssign::Full));
+ PartOffset += MemVT.getStoreSize();
+ }
+ }
}
}
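The offset bookkeeping at the top of the rewritten argument loop is a small recurrence. A sketch under the assumption of a flat (size, align) argument list; the helper names are hypothetical and alignTo stands in for LLVM's llvm/Support/MathExtras.h version:

```cpp
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Returns the in-memory offset of the next kernel argument and advances the
// running explicit-argument offset, mirroring the two alignTo lines above.
uint64_t nextKernArgOffset(uint64_t &ExplicitArgOffset, uint64_t ExplicitOffset,
                           uint64_t AllocSize, uint64_t Align) {
  uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
  ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
  return ArgOffset;
}
```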
@@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
- if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+ G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+ if (!MFI->isEntryFunction()) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ DiagnosticInfoUnsupported BadLDSDecl(
+ Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+ DAG.getContext()->diagnose(BadLDSDecl);
+ }
+
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
@@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
+ EVT VT = Op.getValueType();
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SDLoc SL(Op);
+ SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+ SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ }
+
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
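A scalar picture of the new v4i16/v4f16 CONCAT_VECTORS path above (sketch only, hypothetical helper): each v2i16 operand is reinterpreted as one 32-bit lane, packed into a v2i32 build_vector, and bitcast back.

```cpp
#include <cstdint>

// build_vector v2i32 {Lo, Hi}, then bitcast to v4i16/v4f16.
uint64_t concatTwoV2i16(uint32_t LoAsI32, uint32_t HiAsI32) {
  return (uint64_t(HiAsI32) << 32) | LoAsI32;
}
```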
@@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
-/// \brief Generate Min/Max node
+/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
SDValue LHS, SDValue RHS,
SDValue True, SDValue False,
@@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
- // Extend back to to 64-bits.
+ // Extend back to 64-bits.
SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
@@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
-SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
- if (!CSrc)
- return SDValue();
-
- const APFloat &F = CSrc->getValueAPF();
- APFloat Zero = APFloat::getZero(F.getSemantics());
- APFloat::cmpResult Cmp0 = F.compare(Zero);
- if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
- return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
- }
-
- APFloat One(F.getSemantics(), "1.0");
- APFloat::cmpResult Cmp1 = F.compare(One);
- if (Cmp1 == APFloat::cmpGreaterThan)
- return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
-
- return SDValue(CSrc, 0);
-}
-
// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
@@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue X = LHS->getOperand(0);
if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
- isTypeLegal(MVT::v2i16)) {
+ isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
// Prefer build_vector as the canonical form if packed types are legal.
// (shl ([asz]ext i16:x), 16 -> build_vector 0, x
SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
@@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+
+ // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue Vec = Src.getOperand(0);
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Elt0 = Vec.getOperand(0);
+ EVT EltVT = Elt0.getValueType();
+ if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+ if (EltVT.isFloatingPoint()) {
+ Elt0 = DAG.getNode(ISD::BITCAST, SL,
+ EltVT.changeTypeToInteger(), Elt0);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+ }
+ }
+ }
+
+ // Equivalent of above for accessing the high element of a vector as an
+ // integer operation.
+ // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
+ if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
+ if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+ if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+ BV.getValueType().getVectorNumElements() == 2) {
+ SDValue SrcElt = BV.getOperand(1);
+ EVT SrcEltVT = SrcElt.getValueType();
+ if (SrcEltVT.isFloatingPoint()) {
+ SrcElt = DAG.getNode(ISD::BITCAST, SL,
+ SrcEltVT.changeTypeToInteger(), SrcElt);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+ }
+ }
+ }
+ }
+
+ // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+ //
+ // i16 (trunc (srl i64:x, K)), K <= 16 ->
+ // i16 (trunc (srl (i32 (trunc x), K)))
+ if (VT.getScalarSizeInBits() < 32) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.getScalarSizeInBits() > 32 &&
+ (Src.getOpcode() == ISD::SRL ||
+ Src.getOpcode() == ISD::SRA ||
+ Src.getOpcode() == ISD::SHL)) {
+ SDValue Amt = Src.getOperand(1);
+ KnownBits Known;
+ DAG.computeKnownBits(Amt, Known);
+ unsigned Size = VT.getScalarSizeInBits();
+ if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
+ (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
+ EVT MidVT = VT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements()) : MVT::i32;
+
+ EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+ Src.getOperand(0));
+ DCI.AddToWorklist(Trunc.getNode());
+
+ if (Amt.getValueType() != NewShiftVT) {
+ Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
+ DCI.AddToWorklist(Amt.getNode());
+ }
+
+ SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+ Trunc, Amt);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+ }
+ }
+ }
+
+ return SDValue();
+}
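The last combine in performTruncateCombine narrows an over-wide shift before the truncate. A scalar sketch (hypothetical helpers, not patch code) of why that is sound for the i16 case from the comment above, where the guard keeps the shift amount at most 16 so that K + 16 <= 32:

```cpp
#include <cstdint>

// Before: i16 (trunc (srl i64 x, K));  After: i16 (trunc (srl (i32 (trunc x)), K)).
uint16_t truncOfWideShift  (uint64_t X, unsigned K) { return uint16_t(X >> K); }
uint16_t truncOfNarrowShift(uint64_t X, unsigned K) { return uint16_t(uint32_t(X) >> K); }
// The two agree whenever K + 16 <= 32, i.e. K <= 16.
```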
+
// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
@@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+
+ // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
+ // in the source into any_extends if the result of the mul is truncated. Since
+ // we can assume the high bits are whatever we want, use the underlying value
+ // to keep the unknown high bits from interfering.
+ if (N0.getOpcode() == ISD::ANY_EXTEND)
+ N0 = N0.getOperand(0);
+
+ if (N1.getOpcode() == ISD::ANY_EXTEND)
+ N1 = N1.getOperand(0);
+
SDValue Mul;
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
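A one-line argument (sketch, not patch code) for why looking through ANY_EXTEND is sound when only the truncated product is used: the low bits of the product depend only on the low bits of the operands.

```cpp
#include <cstdint>

// Whatever sits in the extended high bits, the low 32 bits of the product are
// unchanged, so using the pre-extension value here is safe.
uint32_t lowBitsOfProduct(uint32_t X, uint32_t Y) {
  return X * Y;  // == (uint32_t)((uint64_t)AnyExtX * AnyExtY) for any high bits
}
```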
@@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FSIN:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW: {
SDValue CvtSrc = N0.getOperand(0);
if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
}
}
+SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CFP)
+ return SDValue();
+
+ // XXX - Should this flush denormals?
+ const APFloat &Val = CFP->getValueAPF();
+ APFloat One(Val.getSemantics(), "1.0");
+ return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// TODO: Generalize and move to DAGCombiner
SDValue Src = N->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
- assert(Src.getValueType() == MVT::i64);
- SDLoc SL(N);
- uint64_t CVal = C->getZExtValue();
- return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
- DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
- DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ if (Src.getValueType() == MVT::i64) {
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ }
}
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSraCombine(N, DCI);
}
+ case ISD::TRUNCATE:
+ return performTruncateCombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI);
case ISD::MULHS:
@@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
- case AMDGPUISD::CLAMP:
- return performClampCombine(N, DCI);
- case AMDGPUISD::RCP: {
- if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
- // XXX - Should this flush denormals?
- const APFloat &Val = CFP->getValueAPF();
- APFloat One(Val.getSemantics(), "1.0");
- return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
- }
-
- break;
- }
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_IFLAG:
+ return performRcpCombine(N, DCI);
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
@@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
- const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
- unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
- uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
+ const MachineFunction &MF, const ImplicitParameter Param) const {
+ const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ const AMDGPUSubtarget &ST =
+ AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
+ unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
+ unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+ uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
+ ExplicitArgOffset;
switch (Param) {
case GRID_DIM:
return ArgOffset;
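The new implicit-parameter base is the explicit kernarg block size, rounded up to the implicit-arg alignment, plus the target's fixed explicit kernarg offset. A hedged sketch with hypothetical helpers:

```cpp
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Mirrors the ArgOffset computation above: implicit parameters start right
// after the (aligned) explicit kernel arguments plus the ABI's fixed offset.
uint64_t implicitParamBase(uint64_t ExplicitKernArgSize, uint64_t Alignment,
                           uint64_t ExplicitArgOffset) {
  return alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
}
```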
@@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
+ NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
@@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
NODE_NAME_CASE(MAD_U64_U32)
+ NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@@ -3980,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
+ NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
@@ -3999,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -4112,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(32 - MaxValBits);
break;
}
+ case AMDGPUISD::PERM: {
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ if (!CMask)
+ return;
+
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+ unsigned Sel = CMask->getZExtValue();
+
+ for (unsigned I = 0; I < 32; I += 8) {
+ unsigned SelBits = Sel & 0xff;
+ if (SelBits < 4) {
+ SelBits *= 8;
+ Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits < 7) {
+ SelBits = (SelBits & 3) * 8;
+ Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits == 0x0c) {
+ Known.Zero |= 0xff << I;
+ } else if (SelBits > 0x0c) {
+ Known.One |= 0xff << I;
+ }
+ Sel >>= 8;
+ }
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
// These return at most the wavefront size - 1.
unsigned Size = Op.getValueType().getSizeInBits();
- Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
+ Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
break;
}
default:
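For reference, the selector cases the PERM known-bits propagation above actually models, restated as a hypothetical helper; this is not a complete description of the hardware byte permute:

```cpp
#include <cstdint>

// Returns the byte value the known-bits code can pin down for one selector
// byte, or -1 where it derives nothing. Operand 0 is Lhs, operand 1 is Rhs.
int permKnownByte(uint32_t Lhs, uint32_t Rhs, unsigned SelBits) {
  if (SelBits < 4)
    return (Rhs >> (SelBits * 8)) & 0xff;        // bytes 0-3 of the second source
  if (SelBits < 7)
    return (Lhs >> ((SelBits & 3) * 8)) & 0xff;  // bytes 0-2 of the first source
  if (SelBits == 0x0c)
    return 0x00;                                 // known all-zero byte
  if (SelBits > 0x0c)
    return 0xff;                                 // known all-one byte
  return -1;                                     // 0x07-0x0b: left unknown
}
```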
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 039ee174e5b7..a4c3b413e103 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition of the TargetLowering class that is common
+/// Interface definition of the TargetLowering class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//
@@ -28,6 +28,8 @@ struct ArgDescriptor;
class AMDGPUTargetLowering : public TargetLowering {
private:
+ const AMDGPUSubtarget *Subtarget;
+
/// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
/// legalized from a smaller type VT. Need to match pre-legalized type because
/// the generic legalization inserts the add/sub between the select and
@@ -39,12 +41,11 @@ public:
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
protected:
- const AMDGPUSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into multiple scalar stores.
+ /// Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
@@ -78,7 +79,6 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
@@ -87,6 +87,7 @@ protected:
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -96,6 +97,7 @@ protected:
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
@@ -108,10 +110,10 @@ protected:
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector load into 2 loads of half the vector.
+ /// Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into 2 stores of half the vector.
+ /// Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
@@ -120,8 +122,11 @@ protected:
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- void analyzeFormalArgumentsCompute(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
+ void analyzeFormalArgumentsCompute(
+ CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@@ -136,6 +141,10 @@ public:
return false;
}
+ static inline SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+ }
+
static bool allUsesHaveSourceMods(const SDNode *N,
unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
@@ -146,7 +155,6 @@ public:
bool isZExtFree(Type *Src, Type *Dest) const override;
bool isZExtFree(EVT Src, EVT Dest) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
- bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
@@ -168,6 +176,7 @@ public:
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool isSDNodeAlwaysUniform(const SDNode *N) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
@@ -224,7 +233,7 @@ public:
virtual SDNode *PostISelFolding(MachineSDNode *N,
SelectionDAG &DAG) const = 0;
- /// \brief Determine which of the bits specified in \p Mask are known to be
+ /// Determine which of the bits specified in \p Mask are known to be
/// either zero or one and return them in the \p KnownZero and \p KnownOne
/// bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
@@ -237,7 +246,7 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+ /// Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
/// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
@@ -285,9 +294,9 @@ public:
GRID_OFFSET,
};
- /// \brief Helper function that returns the byte offset of the given
+ /// Helper function that returns the byte offset of the given
/// type of implicit parameter.
- uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ uint32_t getImplicitParameterOffset(const MachineFunction &MF,
const ImplicitParameter Param) const;
AMDGPUAS getAMDGPUAS() const {
@@ -357,6 +366,7 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ FDOT2,
URECIP,
DIV_SCALE,
DIV_FMAS,
@@ -372,6 +382,7 @@ enum NodeType : unsigned {
RSQ,
RCP_LEGACY,
RSQ_LEGACY,
+ RCP_IFLAG,
FMUL_LEGACY,
RSQ_CLAMP,
LDEXP,
@@ -396,6 +407,7 @@ enum NodeType : unsigned {
MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
+ PERM,
TEXTURE_FETCH,
EXPORT, // exp on SI+
EXPORT_DONE, // exp on SI+ with done bit set
@@ -455,14 +467,21 @@ enum NodeType : unsigned {
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
TBUFFER_STORE_FORMAT_X3,
+ TBUFFER_STORE_FORMAT_D16,
TBUFFER_LOAD_FORMAT,
+ TBUFFER_LOAD_FORMAT_D16,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
+ ATOMIC_LOAD_FADD,
+ ATOMIC_LOAD_FMIN,
+ ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
BUFFER_LOAD_FORMAT,
+ BUFFER_LOAD_FORMAT_D16,
BUFFER_STORE,
BUFFER_STORE_FORMAT,
+ BUFFER_STORE_FORMAT_D16,
BUFFER_ATOMIC_SWAP,
BUFFER_ATOMIC_ADD,
BUFFER_ATOMIC_SUB,
@@ -474,6 +493,7 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_OR,
BUFFER_ATOMIC_XOR,
BUFFER_ATOMIC_CMPSWAP,
+
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
index ff9e7b50ed5c..35dd9eb0a478 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This is AMDGPU specific replacement of the standard inliner.
+/// This is AMDGPU specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only expensive
/// on the AMDGPU, but much more expensive if a private memory pointer is
/// passed to a function as an argument. In this situation, we are unable to
@@ -161,8 +161,8 @@ static bool isWrapperOnlyCall(CallSite CS) {
return false;
}
if (isa<ReturnInst>(*std::next(I->getIterator()))) {
- DEBUG(dbgs() << " Wrapper only call detected: "
- << Callee->getName() << '\n');
+ LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
+ << Callee->getName() << '\n');
return true;
}
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 61892efe39e0..07aa7c2cc8ad 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -16,98 +16,18 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
-#define GET_INSTRINFO_CTOR_DTOR
-#include "AMDGPUGenInstrInfo.inc"
-
// Pin the vtable to this file.
-void AMDGPUInstrInfo::anchor() {}
-
-AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
- ST(ST),
- AMDGPUASI(ST.getAMDGPUAS()) {}
-
-// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
-// the first 16 loads will be interleaved with the stores, and the next 16 will
-// be clustered as expected. It should really split into 2 16 store batches.
-//
-// Loads are clustered until this returns false, rather than trying to schedule
-// groups of stores. This also means we have to deal with saying different
-// address space loads should be clustered, and ones which might cause bank
-// conflicts.
-//
-// This might be deprecated so it might not be worth that much effort to fix.
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
- int64_t Offset0, int64_t Offset1,
- unsigned NumLoads) const {
- assert(Offset1 > Offset0 &&
- "Second offset should be larger than first offset!");
- // If we have less than 16 loads in a row, and the offsets are within 64
- // bytes, then schedule together.
-
- // A cacheline is 64 bytes (for global memory).
- return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
-}
-
-// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
-enum SIEncodingFamily {
- SI = 0,
- VI = 1,
- SDWA = 2,
- SDWA9 = 3,
- GFX9 = 4
-};
-
-static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
- switch (ST.getGeneration()) {
- case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- case AMDGPUSubtarget::SEA_ISLANDS:
- return SIEncodingFamily::SI;
- case AMDGPUSubtarget::VOLCANIC_ISLANDS:
- case AMDGPUSubtarget::GFX9:
- return SIEncodingFamily::VI;
-
- // FIXME: This should never be called for r600 GPUs.
- case AMDGPUSubtarget::R600:
- case AMDGPUSubtarget::R700:
- case AMDGPUSubtarget::EVERGREEN:
- case AMDGPUSubtarget::NORTHERN_ISLANDS:
- return SIEncodingFamily::SI;
- }
+//void AMDGPUInstrInfo::anchor() {}
- llvm_unreachable("Unknown subtarget generation!");
-}
-
-int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- SIEncodingFamily Gen = subtargetEncodingFamily(ST);
-
- if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
- ST.getGeneration() >= AMDGPUSubtarget::GFX9)
- Gen = SIEncodingFamily::GFX9;
-
- if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
- Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
- : SIEncodingFamily::SDWA;
-
- int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { }
- // -1 means that Opcode is already a native instruction.
- if (MCOp == -1)
- return Opcode;
-
- // (uint16_t)-1 means that Opcode is a pseudo instruction that has
- // no encoding in the given subtarget generation.
- if (MCOp == (uint16_t)-1)
- return -1;
-
- return MCOp;
-}
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
@@ -120,6 +40,9 @@ bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
+ if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
+
if (const Argument *Arg = dyn_cast<Argument>(Ptr))
return AMDGPU::isArgPassedInSGPR(Arg);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 74e14ef8fbd8..2f8166da0d33 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Contains the definition of a TargetInstrInfo class that is common
+/// Contains the definition of a TargetInstrInfo class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//
@@ -20,39 +20,43 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
-#define GET_INSTRINFO_HEADER
-#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRINFO_HEADER
-
namespace llvm {
-class AMDGPUSubtarget;
+class GCNSubtarget;
class MachineFunction;
class MachineInstr;
class MachineInstrBuilder;
-class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
-private:
- const AMDGPUSubtarget &ST;
+class AMDGPUInstrInfo {
+public:
+ explicit AMDGPUInstrInfo(const GCNSubtarget &st);
- virtual void anchor();
-protected:
- AMDGPUAS AMDGPUASI;
+ static bool isUniformMMO(const MachineMemOperand *MMO);
+};
-public:
- explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
+namespace AMDGPU {
- bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const override;
+struct RsrcIntrinsic {
+ unsigned Intr;
+ uint8_t RsrcArg;
+ bool IsImage;
+};
+const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr);
- /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
- /// Return -1 if the target-specific opcode for the pseudo instruction does
- /// not exist. If Opcode is not a pseudo instruction, this is identity.
- int pseudoToMCOpcode(int Opcode) const;
+struct D16ImageDimIntrinsic {
+ unsigned Intr;
+ unsigned D16HelperIntr;
+};
+const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
- static bool isUniformMMO(const MachineMemOperand *MMO);
+struct ImageDimIntrinsicInfo {
+ unsigned Intr;
+ unsigned BaseOpcode;
+ MIMGDim Dim;
};
+const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
+
+} // end AMDGPU namespace
} // End llvm namespace
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 65c483d85c5a..96b7568eec1f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -140,6 +140,8 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
+
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
@@ -168,8 +170,6 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
-def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-
// out = min(a, b) a and b are floats, where a nan comparison fails.
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
[]
@@ -341,6 +341,13 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+ SDTCisFP<0>, SDTCisVec<1>]>,
+ []>;
+
+def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 16d240e96196..219d430fbb39 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,12 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -30,10 +36,48 @@
using namespace llvm;
+#define GET_GLOBALISEL_IMPL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+#undef AMDGPUSubtarget
+
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
- const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI)
+ const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
+ const AMDGPUTargetMachine &TM)
: InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {}
+ TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
+ STI(STI),
+ EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+ ,AMDGPUASI(STI.getAMDGPUAS())
+{
+}
+
+const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
+
+bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ for (const MachineOperand &MO : I.operands()) {
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (!RC)
+ continue;
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ }
+ return true;
+}
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
@@ -71,6 +115,10 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
}
}
+static int64_t getConstant(const MachineInstr *MI) {
+ return MI->getOperand(1).getCImm()->getSExtValue();
+}
+
bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
@@ -118,12 +166,144 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
return selectG_ADD(I);
}
+bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MachineOperand &MO = I.getOperand(0);
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (RC)
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+ unsigned IntrinsicID = I.getOperand(1).getIntrinsicID();
+
+ switch (IntrinsicID) {
+ default:
+ break;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ return selectImpl(I, CoverageInfo);
+
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *InputPtrReg;
+ const TargetRegisterClass *RC;
+ const DebugLoc &DL = I.getDebugLoc();
+
+ std::tie(InputPtrReg, RC)
+ = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ if (!InputPtrReg)
+ report_fatal_error("missing kernarg segment ptr");
+
+ BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY))
+ .add(I.getOperand(0))
+ .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister()));
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
+static MachineInstr *
+buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
+ unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
+ unsigned VM, bool Compr, unsigned Enabled, bool Done) {
+ const DebugLoc &DL = Insert->getDebugLoc();
+ MachineBasicBlock &BB = *Insert->getParent();
+ unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
+ return BuildMI(BB, Insert, DL, TII.get(Opcode))
+ .addImm(Tgt)
+ .addReg(Reg0)
+ .addReg(Reg1)
+ .addReg(Reg2)
+ .addReg(Reg3)
+ .addImm(VM)
+ .addImm(Compr)
+ .addImm(Enabled);
+}
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
+ MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_exp: {
+ int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
+ int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
+ int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
+ int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));
+
+ MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
+ I.getOperand(4).getReg(),
+ I.getOperand(5).getReg(),
+ I.getOperand(6).getReg(),
+ VM, false, Enabled, Done);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ }
+ case Intrinsic::amdgcn_exp_compr: {
+ const DebugLoc &DL = I.getDebugLoc();
+ int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
+ int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
+ unsigned Reg0 = I.getOperand(3).getReg();
+ unsigned Reg1 = I.getOperand(4).getReg();
+ unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
+ int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
+ true, Enabled, Done);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ }
+ }
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = I.getDebugLoc();
+ unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
+ unsigned Opcode;
// FIXME: Select store instruction based on address space
- MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD))
+ switch (StoreSize) {
+ default:
+ return false;
+ case 32:
+ Opcode = AMDGPU::FLAT_STORE_DWORD;
+ break;
+ case 64:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX2;
+ break;
+ case 96:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX3;
+ break;
+ case 128:
+ Opcode = AMDGPU::FLAT_STORE_DWORDX4;
+ break;
+ }
+
+ MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
.add(I.getOperand(1))
.add(I.getOperand(0))
.addImm(0) // offset
@@ -143,36 +323,67 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineOperand &ImmOp = I.getOperand(1);
+
+ // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
+ if (ImmOp.isFPImm()) {
+ const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
+ ImmOp.ChangeToImmediate(Imm.getZExtValue());
+ } else if (ImmOp.isCImm()) {
+ ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
+ }
+
unsigned DstReg = I.getOperand(0).getReg();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned Size;
+ bool IsSgpr;
+ const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
+ if (RB) {
+ IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
+ Size = MRI.getType(DstReg).getSizeInBits();
+ } else {
+ const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
+ IsSgpr = TRI.isSGPRClass(RC);
+ Size = TRI.getRegSizeInBits(*RC);
+ }
+ if (Size != 32 && Size != 64)
+ return false;
+
+ unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
if (Size == 32) {
- I.setDesc(TII.get(AMDGPU::S_MOV_B32));
+ I.setDesc(TII.get(Opcode));
+ I.addImplicitDefUseOperands(*MF);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- assert(Size == 64);
-
DebugLoc DL = I.getDebugLoc();
- unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- const APInt &Imm = I.getOperand(1).getCImm()->getValue();
+ const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
+ &AMDGPU::VGPR_32RegClass;
+ unsigned LoReg = MRI.createVirtualRegister(RC);
+ unsigned HiReg = MRI.createVirtualRegister(RC);
+ const APInt &Imm = APInt(Size, I.getOperand(1).getImm());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg)
+ BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
.addImm(Imm.trunc(32).getZExtValue());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
+ BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
.addImm(Imm.ashr(32).getZExtValue());
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(LoReg)
- .addImm(AMDGPU::sub0)
- .addReg(HiReg)
- .addImm(AMDGPU::sub1);
+ const MachineInstr *RS =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+
// We can't call constrainSelectedInstRegOperands here, because it doesn't
// work for target independent opcodes
I.eraseFromParent();
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
+ if (!DstRC)
+ return true;
+ return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
}
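For reference, the 64-bit materialization above splits the immediate with APInt::trunc(32) for the low half and APInt::ashr(32) for the high half, then reassembles the halves with REG_SEQUENCE. A minimal standalone sketch of that arithmetic (plain C++; values and names are illustrative, not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  int64_t Imm = 0x123456789ABCDEF0;

  // Same split selectG_CONSTANT performs with APInt::trunc / APInt::ashr.
  uint32_t Lo = static_cast<uint32_t>(Imm);        // Imm.trunc(32)
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32);  // Imm.ashr(32)

  // REG_SEQUENCE conceptually reassembles sub1:sub0 back into 64 bits.
  uint64_t Rebuilt = (static_cast<uint64_t>(Hi) << 32) | Lo;
  assert(Rebuilt == static_cast<uint64_t>(Imm));
  return 0;
}
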
static bool isConstant(const MachineInstr &MI) {
@@ -228,6 +439,9 @@ static bool isInstrUniform(const MachineInstr &MI) {
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
+ if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
+
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
@@ -292,7 +506,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
if (!I.hasOneMemOperand())
return false;
- if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+ if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
return false;
if (!isInstrUniform(I))
@@ -303,7 +518,7 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
@@ -405,18 +620,30 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
bool AMDGPUInstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
- if (!isPreISelGenericOpcode(I.getOpcode()))
+ if (!isPreISelGenericOpcode(I.getOpcode())) {
+ if (I.isCopy())
+ return selectCOPY(I);
return true;
+ }
switch (I.getOpcode()) {
default:
- break;
+ return selectImpl(I, CoverageInfo);
case TargetOpcode::G_ADD:
return selectG_ADD(I);
+ case TargetOpcode::G_BITCAST:
+ return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
case TargetOpcode::G_GEP:
return selectG_GEP(I);
+ case TargetOpcode::G_IMPLICIT_DEF:
+ return selectG_IMPLICIT_DEF(I);
+ case TargetOpcode::G_INTRINSIC:
+ return selectG_INTRINSIC(I, CoverageInfo);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
case TargetOpcode::G_LOAD:
return selectG_LOAD(I);
case TargetOpcode::G_STORE:
@@ -424,3 +651,47 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
}
return false;
}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
+ }};
+}
+
+///
+/// This will select either an SGPR or VGPR operand and will save us from
+/// having to write an extra tablegen pattern.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+ }};
+}
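The complex-operand renderers added above return small sets of callbacks that the tablegen-erated selectImpl invokes to append operands; this is how VOP3 source modifiers, clamp and omod get zero defaults without extra patterns. A minimal sketch of that callback shape, assuming a simplified stand-in for MachineInstrBuilder (FakeMIB and RendererFn are illustrative names, not LLVM API):

#include <functional>
#include <iostream>
#include <vector>

// Simplified stand-in for MachineInstrBuilder: just records immediates.
struct FakeMIB {
  std::vector<long> Operands;
  void addImm(long Imm) { Operands.push_back(Imm); }
};

using RendererFn = std::function<void(FakeMIB &)>;

int main() {
  // Mirrors selectVOP3Mods0: the source value plus zeroed
  // src0_mods, clamp and omod immediates.
  long SrcValue = 42;
  std::vector<RendererFn> Renderers = {
      [=](FakeMIB &MIB) { MIB.addImm(SrcValue); }, // src0
      [=](FakeMIB &MIB) { MIB.addImm(0); },        // src0_mods
      [=](FakeMIB &MIB) { MIB.addImm(0); },        // clamp
      [=](FakeMIB &MIB) { MIB.addImm(0); },        // omod
  };

  FakeMIB MIB;
  for (const RendererFn &Fn : Renderers)
    Fn(MIB); // selectImpl applies each renderer in order.

  for (long Op : MIB.Operands)
    std::cout << Op << ' ';
  std::cout << '\n'; // prints: 42 0 0 0
  return 0;
}
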
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 715c4882f380..68b40b20aca2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -15,27 +15,39 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
#include "AMDGPU.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+namespace {
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+#undef AMDGPUSubtarget
+}
+
namespace llvm {
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
+class GCNSubtarget;
class MachineInstr;
class MachineOperand;
class MachineRegisterInfo;
class SIInstrInfo;
+class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
class AMDGPUInstructionSelector : public InstructionSelector {
public:
- AMDGPUInstructionSelector(const SISubtarget &STI,
- const AMDGPURegisterBankInfo &RBI);
+ AMDGPUInstructionSelector(const GCNSubtarget &STI,
+ const AMDGPURegisterBankInfo &RBI,
+ const AMDGPUTargetMachine &TM);
bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ static const char *getName();
private:
struct GEPInfo {
@@ -46,10 +58,18 @@ private:
GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
};
+ /// tblgen-erated 'select' implementation.
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+ bool selectCOPY(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_ADD(MachineInstr &I) const;
bool selectG_GEP(MachineInstr &I) const;
+ bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
+ bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+ bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
@@ -57,9 +77,35 @@ private:
bool selectG_LOAD(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
+ InstructionSelector::ComplexRendererFns
+ selectVCSRC(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVSRC0(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVOP3Mods0(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3OMods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3Mods(MachineOperand &Root) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
+ const AMDGPUTargetMachine &TM;
+ const GCNSubtarget &STI;
+ bool EnableLateStructurizeCFG;
+#define GET_GLOBALISEL_PREDICATES_DECL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+#undef AMDGPUSubtarget
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "AMDGPUGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+
protected:
AMDGPUAS AMDGPUASI;
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 31f728b0c22f..9426df399597 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -42,6 +42,47 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
field bits<32> Inst = 0xffffffff;
}
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+ let Namespace = "AMDGPU";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Pattern = pattern;
+ let AsmString = !strconcat(asmstr, "\n");
+ let isPseudo = 1;
+ let Itinerary = NullALU;
+ bit hasIEEEFlag = 0;
+ bit hasZeroOpFlag = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let isCodeGenOnly = 1;
+}
+
+def TruePredicate : Predicate<"true">;
+
+// Exists to help track down where SubtargetPredicate isn't set rather
+// than letting tablegen crash with an unhelpful error.
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
+
+class PredicateControl {
+ Predicate SubtargetPredicate = InvalidPred;
+ list<Predicate> AssemblerPredicates = [];
+ Predicate AssemblerPredicate = TruePredicate;
+ list<Predicate> OtherPredicates = [];
+ list<Predicate> Predicates = !listconcat([SubtargetPredicate,
+ AssemblerPredicate],
+ AssemblerPredicates,
+ OtherPredicates);
+}
+class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
+ PredicateControl;
+
def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
@@ -52,7 +93,6 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def FMA : Predicate<"Subtarget->hasFMA()">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
-def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def u16ImmTarget : AsmOperandClass {
let Name = "U16Imm";
@@ -95,12 +135,6 @@ def brtarget : Operand<OtherVT>;
// Misc. PatFrags
//===----------------------------------------------------------------------===//
-class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
- (ops node:$src0),
- (op $src0),
- [{ return N->hasOneUse(); }]
->;
-
class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
@@ -113,8 +147,6 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
[{ return N->hasOneUse(); }]
>;
-def trunc_oneuse : HasOneUseUnaryOp<trunc>;
-
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
@@ -127,6 +159,7 @@ def or_oneuse : HasOneUseBinOp<or>;
def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def add_oneuse : HasOneUseBinOp<add>;
def sub_oneuse : HasOneUseBinOp<sub>;
def srl_oneuse : HasOneUseBinOp<srl>;
@@ -240,6 +273,37 @@ def COND_NULL : PatLeaf <
[{(void)N; return false;}]
>;
+//===----------------------------------------------------------------------===//
+// PatLeafs for Texture Constants
+//===----------------------------------------------------------------------===//
+
+def TEX_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 9 || TType == 10 || TType == 16;
+ }]
+>;
+
+def TEX_RECT : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 5;
+ }]
+>;
+
+def TEX_SHADOW : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return (TType >= 6 && TType <= 8) || TType == 13;
+ }]
+>;
+
+def TEX_SHADOW_ARRAY : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 11 || TType == 12 || TType == 17;
+ }]
+>;
//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
@@ -249,6 +313,10 @@ class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
}]>;
+class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
class StoreFrag<SDPatternOperator op> : PatFrag <
@@ -361,21 +429,31 @@ def az_extloadi8_local : LocalLoad <az_extloadi8>;
def sextloadi8_local : LocalLoad <sextloadi8>;
def az_extloadi16_local : LocalLoad <az_extloadi16>;
def sextloadi16_local : LocalLoad <sextloadi16>;
+def atomic_load_32_local : LocalLoad<atomic_load_32>;
+def atomic_load_64_local : LocalLoad<atomic_load_64>;
def store_local : LocalStore <store>;
def truncstorei8_local : LocalStore <truncstorei8>;
def truncstorei16_local : LocalStore <truncstorei16>;
def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
+def atomic_store_local : LocalStore <atomic_store>;
def load_align8_local : Aligned8Bytes <
(ops node:$ptr), (load_local node:$ptr)
>;
+def load_align16_local : Aligned16Bytes <
+ (ops node:$ptr), (load_local node:$ptr)
+>;
+
def store_align8_local : Aligned8Bytes <
(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
>;
+def store_align16_local : Aligned16Bytes <
+ (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
+>;
def load_flat : FlatLoad <load>;
def az_extloadi8_flat : FlatLoad <az_extloadi8>;
@@ -571,6 +649,18 @@ multiclass BFIPatterns <Instruction BFI_INT,
(BFI_INT $x, $y, $z)
>;
+ // 64-bit version
+ def : AMDGPUPat <
+ (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0)),
+ (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1)),
+ (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+ >;
+
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPat <
@@ -578,6 +668,18 @@ multiclass BFIPatterns <Instruction BFI_INT,
(BFI_INT $x, $y, $z)
>;
+ // 64-bit version
+ def : AMDGPUPat <
+ (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0)),
+ (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+ (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1)),
+ (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+ >;
+
def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
(BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
@@ -611,10 +713,25 @@ multiclass BFIPatterns <Instruction BFI_INT,
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
-class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat <
- (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
- (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
->;
+multiclass SHA256MaPattern <Instruction BFI_INT, Instruction XOR, RegisterClass RC64> {
+ def : AMDGPUPat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+ >;
+
+ def : AMDGPUPat <
+ (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))),
+ (REG_SEQUENCE RC64,
+ (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0))),
+ (i32 (EXTRACT_SUBREG $z, sub0)),
+ (i32 (EXTRACT_SUBREG $y, sub0))), sub0,
+ (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1))),
+ (i32 (EXTRACT_SUBREG $z, sub1)),
+ (i32 (EXTRACT_SUBREG $y, sub1))), sub1)
+ >;
+}
// Bitfield extract patterns
@@ -633,14 +750,33 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
(UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
>;
+ // x & ((1 << y) - 1)
+ def : AMDGPUPat <
+ (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x & ~(-1 << y)
+ def : AMDGPUPat <
+ (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x & (-1 >> (bitwidth - y))
+ def : AMDGPUPat <
+ (and i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+ (UBFE $src, (MOV (i32 0)), $width)
+ >;
+
+ // x << (bitwidth - y) >> (bitwidth - y)
def : AMDGPUPat <
(srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (UBFE $src, (i32 0), $width)
+ (UBFE $src, (MOV (i32 0)), $width)
>;
def : AMDGPUPat <
(sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (SBFE $src, (i32 0), $width)
+ (SBFE $src, (MOV (i32 0)), $width)
>;
}
@@ -697,11 +833,3 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
-
-include "R600Instructions.td"
-include "R700Instructions.td"
-include "EvergreenInstructions.td"
-include "CaymanInstructions.td"
-
-include "SIInstrInfo.td"
-
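The UBFE patterns added above rely on standard mask identities: x & ((1 << y) - 1), x & ~(-1 << y) and x & (-1 >> (32 - y)) all keep the low y bits of x, as does the shl/srl pair. A small standalone check of those identities (plain C++, not part of the patch; ubfe is an illustrative reference helper):

#include <cassert>
#include <cstdint>

// Reference unsigned bitfield extract: low 'width' bits of 'x' at offset 0.
static uint32_t ubfe(uint32_t x, uint32_t width) {
  return width == 32 ? x : (x & ((1u << width) - 1));
}

int main() {
  for (uint32_t width = 1; width < 32; ++width) {
    uint32_t x = 0xDEADBEEFu;
    assert((x & ((1u << width) - 1)) == ubfe(x, width));              // x & ((1 << y) - 1)
    assert((x & ~(~0u << width)) == ubfe(x, width));                  // x & ~(-1 << y)
    assert((x & (~0u >> (32 - width))) == ubfe(x, width));            // x & (-1 >> (32 - y))
    assert(((x << (32 - width)) >> (32 - width)) == ubfe(x, width));  // shl/srl pair
  }
  return 0;
}
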
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 86dc9bd9ea74..896e2055cf62 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+/// AMDGPU Implementation of the IntrinsicInfo class.
//
//===-----------------------------------------------------------------------===//
@@ -25,13 +25,13 @@ AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
static const char *const IntrinsicNameTable[] = {
#define GET_INTRINSIC_NAME_TABLE
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_NAME_TABLE
};
namespace {
#define GET_INTRINSIC_ATTRIBUTES
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_ATTRIBUTES
}
@@ -80,7 +80,7 @@ unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
// Overload Table
#define GET_INTRINSIC_OVERLOAD_TABLE
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicImpl.inc"
#undef GET_INTRINSIC_OVERLOAD_TABLE
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index 6cb8b9644642..ef42f9a319af 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+/// Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
@@ -24,7 +24,7 @@ namespace AMDGPUIntrinsic {
enum ID {
last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
#define GET_INTRINSIC_ENUM_VALUES
-#include "AMDGPUGenIntrinsics.inc"
+#include "AMDGPUGenIntrinsicEnums.inc"
#undef GET_INTRINSIC_ENUM_VALUES
, num_AMDGPU_intrinsics
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index 18c9bd933af2..230a04628504 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -13,7 +13,4 @@
let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
- def int_AMDGPU_kilp : Intrinsic<[], [], []>;
}
-
-include "SIIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b4704f6feb92..87b072c9ea20 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -12,7 +12,9 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -20,19 +22,46 @@
#include "llvm/Support/Debug.h"
using namespace llvm;
+using namespace LegalizeActions;
-AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
+AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
+ const GCNTargetMachine &TM) {
using namespace TargetOpcode;
- const LLT S1= LLT::scalar(1);
+ auto GetAddrSpacePtr = [&TM](unsigned AS) {
+ return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
+ };
+
+ auto AMDGPUAS = ST.getAMDGPUAS();
+
+ const LLT S1 = LLT::scalar(1);
const LLT V2S16 = LLT::vector(2, 16);
+
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- const LLT P1 = LLT::pointer(1, 64);
- const LLT P2 = LLT::pointer(2, 64);
+ const LLT S512 = LLT::scalar(512);
+
+ const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
+ const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
+ const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
+ const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS);
+ const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS);
+
+ const LLT AddrSpaces[] = {
+ GlobalPtr,
+ ConstantPtr,
+ LocalPtr,
+ FlatPtr,
+ PrivatePtr
+ };
setAction({G_ADD, S32}, Legal);
+ setAction({G_ASHR, S32}, Legal);
+ setAction({G_SUB, S32}, Legal);
+ setAction({G_MUL, S32}, Legal);
setAction({G_AND, S32}, Legal);
+ setAction({G_OR, S32}, Legal);
+ setAction({G_XOR, S32}, Legal);
setAction({G_BITCAST, V2S16}, Legal);
setAction({G_BITCAST, 1, S32}, Legal);
@@ -40,41 +69,88 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_BITCAST, S32}, Legal);
setAction({G_BITCAST, 1, V2S16}, Legal);
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64});
+
+ // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that
+ // can fit in a register.
+ // FIXME: We need to legalize several more operations before we can add
+ // a test case for size > 512.
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ .legalIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() <= 512;
+ })
+ .clampScalar(0, S1, S512);
+
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({S1, S32, S64});
+
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
setAction({G_CONSTANT, S1}, Legal);
- setAction({G_CONSTANT, S32}, Legal);
- setAction({G_CONSTANT, S64}, Legal);
-
- setAction({G_FCONSTANT, S32}, Legal);
setAction({G_FADD, S32}, Legal);
+ setAction({G_FCMP, S1}, Legal);
+ setAction({G_FCMP, 1, S32}, Legal);
+ setAction({G_FCMP, 1, S64}, Legal);
+
setAction({G_FMUL, S32}, Legal);
- setAction({G_GEP, P1}, Legal);
- setAction({G_GEP, P2}, Legal);
- setAction({G_GEP, 1, S64}, Legal);
+ setAction({G_ZEXT, S64}, Legal);
+ setAction({G_ZEXT, 1, S32}, Legal);
+
+ setAction({G_FPTOSI, S32}, Legal);
+ setAction({G_FPTOSI, 1, S32}, Legal);
+
+ setAction({G_SITOFP, S32}, Legal);
+ setAction({G_SITOFP, 1, S32}, Legal);
+
+ setAction({G_FPTOUI, S32}, Legal);
+ setAction({G_FPTOUI, 1, S32}, Legal);
+
+ for (LLT PtrTy : AddrSpaces) {
+ LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
+ setAction({G_GEP, PtrTy}, Legal);
+ setAction({G_GEP, 1, IdxTy}, Legal);
+ }
setAction({G_ICMP, S1}, Legal);
setAction({G_ICMP, 1, S32}, Legal);
- setAction({G_LOAD, P1}, Legal);
- setAction({G_LOAD, P2}, Legal);
- setAction({G_LOAD, S32}, Legal);
- setAction({G_LOAD, 1, P1}, Legal);
- setAction({G_LOAD, 1, P2}, Legal);
- setAction({G_OR, S32}, Legal);
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalIf([=, &ST](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+
+ // TODO: Decompose private loads into 4-byte components.
+ // TODO: Illegal flat loads on SI
+ switch (Ty0.getSizeInBits()) {
+ case 32:
+ case 64:
+ case 128:
+ return true;
+
+ case 96:
+ // XXX hasLoadX3
+ return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);
+
+ case 256:
+ case 512:
+ // TODO: constant loads
+ default:
+ return false;
+ }
+ });
+
setAction({G_SELECT, S32}, Legal);
setAction({G_SELECT, 1, S1}, Legal);
setAction({G_SHL, S32}, Legal);
- setAction({G_STORE, S32}, Legal);
- setAction({G_STORE, 1, P1}, Legal);
// FIXME: When RegBankSelect inserts copies, it will only create new
// registers with scalar types. This means we can end up with
@@ -83,8 +159,54 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
// if it sees a generic instruction which isn't legal, so we need to
// tell it that scalar types are legal for pointer operands
setAction({G_GEP, S64}, Legal);
- setAction({G_LOAD, 1, S64}, Legal);
- setAction({G_STORE, 1, S64}, Legal);
+
+ for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &VecTy = Query.Types[1];
+ const LLT &IdxTy = Query.Types[2];
+ return VecTy.getSizeInBits() % 32 == 0 &&
+ VecTy.getSizeInBits() <= 512 &&
+ IdxTy.getSizeInBits() == 32;
+ });
+ }
+
+ // FIXME: Doesn't handle extract of illegal sizes.
+ getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &Ty0 = Query.Types[0];
+ const LLT &Ty1 = Query.Types[1];
+ return (Ty0.getSizeInBits() % 32 == 0) &&
+ (Ty1.getSizeInBits() % 32 == 0);
+ });
+
+ // Merge/Unmerge
+ for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
+ unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
+ unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
+
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &BigTy = Query.Types[BigTyIdx];
+ const LLT &LitTy = Query.Types[LitTyIdx];
+ return BigTy.getSizeInBits() % 32 == 0 &&
+ LitTy.getSizeInBits() % 32 == 0 &&
+ BigTy.getSizeInBits() <= 512;
+ })
+ // Any vectors left are the wrong size. Scalarize them.
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 0, Query.Types[0].getElementType());
+ })
+ .fewerElementsIf([](const LegalityQuery &Query) { return true; },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(
+ 1, Query.Types[1].getElementType());
+ });
+
+ }
computeTables();
+ verify(*ST.getInstrInfo());
}
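The G_LOAD/G_STORE legality rule above keys off the value size in bits, with 96-bit accesses gated on the subtarget generation. A standalone sketch of that decision table (isLegalFlatAccessSize and the Generation enum are illustrative stand-ins, not the LLVM API):

#include <cassert>

// Hypothetical stand-in for the subtarget generation check used by the rule.
enum Generation { SOUTHERN_ISLANDS = 0, SEA_ISLANDS = 1 };

static bool isLegalFlatAccessSize(unsigned SizeInBits, Generation Gen) {
  switch (SizeInBits) {
  case 32:
  case 64:
  case 128:
    return true;
  case 96:
    // DWORDX3 flat accesses only exist from Sea Islands onwards.
    return Gen >= SEA_ISLANDS;
  default:
    // 256/512-bit (and anything else) is left illegal for now.
    return false;
  }
}

int main() {
  assert(isLegalFlatAccessSize(64, SOUTHERN_ISLANDS));
  assert(!isLegalFlatAccessSize(96, SOUTHERN_ISLANDS));
  assert(isLegalFlatAccessSize(96, SEA_ISLANDS));
  assert(!isLegalFlatAccessSize(256, SEA_ISLANDS));
  return 0;
}
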
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 291e3361f163..1cbd37c42c4b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -19,12 +19,15 @@
namespace llvm {
+class GCNTargetMachine;
class LLVMContext;
+class GCNSubtarget;
/// This class provides the information for the target register banks.
class AMDGPULegalizerInfo : public LegalizerInfo {
public:
- AMDGPULegalizerInfo();
+ AMDGPULegalizerInfo(const GCNSubtarget &ST,
+ const GCNTargetMachine &TM);
};
} // End llvm namespace.
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index f594767c8edb..7a7ed7a4f065 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This file does AMD library function optimizations.
+/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//
@@ -765,8 +765,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
ArrayRef<double> tmp(DVal);
nval = ConstantDataVector::get(context, tmp);
}
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -776,8 +775,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
for (int i = 0; i < sz; ++i) {
if (CF->isExactlyValue(ftbl[i].input)) {
Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -798,11 +796,11 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
AMDGPULibFunc nf = FInfo;
nf.setPrefix(AMDGPULibFunc::NATIVE);
if (Constant *FPExpr = getFunction(M, nf)) {
- DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
+ LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
CI->setCalledFunction(FPExpr);
- DEBUG(dbgs() << *CI << '\n');
+ LLVM_DEBUG(dbgs() << *CI << '\n');
return true;
}
@@ -820,8 +818,7 @@ bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
opr0,
"recip2div");
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *nval << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
}
@@ -899,7 +896,7 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
// pow/powr/pown(x, 0) == 1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
@@ -909,23 +906,21 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
}
if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
// pow/powr/pown(x, 1.0) = x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
replaceCall(opr0);
return true;
}
if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
// pow/powr/pown(x, 2.0) = x*x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << " * " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0
+ << "\n");
Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
replaceCall(nval);
return true;
}
if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
// pow/powr/pown(x, -1.0) = 1.0/x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> 1 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
@@ -942,8 +937,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if (Constant *FPExpr = getFunction(M,
AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
: AMDGPULibFunc::EI_RSQRT, FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
: "__pow2rsqrt");
replaceCall(nval);
@@ -999,8 +994,9 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
}
nval = B.CreateFDiv(cnval, nval, "__1powprod");
}
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
+ << ")\n");
replaceCall(nval);
return true;
}
@@ -1137,8 +1133,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
nval = B.CreateBitCast(nval, opr0->getType());
}
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
replaceCall(nval);
return true;
@@ -1155,8 +1151,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
}
int ci_opr1 = (int)CINT->getSExtValue();
if (ci_opr1 == 1) { // rootn(x, 1) = x
- DEBUG(errs() << "AMDIC: " << *CI
- << " ---> " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
replaceCall(opr0);
return true;
}
@@ -1166,7 +1161,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
replaceCall(nval);
return true;
@@ -1175,13 +1170,13 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
replaceCall(nval);
return true;
}
} else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
- DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
opr0,
"__rootn2div");
@@ -1193,7 +1188,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
Module *M = CI->getModule();
if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
FInfo))) {
- DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
+ << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
replaceCall(nval);
return true;
@@ -1212,22 +1208,22 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
// fma/mad(a, b, c) = c if a=0 || b=0
- DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
replaceCall(opr2);
return true;
}
if (CF0 && CF0->isExactlyValue(1.0f)) {
// fma/mad(a, b, c) = b+c if a=1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr1 << " + " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
+ << "\n");
Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
replaceCall(nval);
return true;
}
if (CF1 && CF1->isExactlyValue(1.0f)) {
// fma/mad(a, b, c) = a+c if b=1
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr0 << " + " << *opr2 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
+ << "\n");
Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
replaceCall(nval);
return true;
@@ -1235,8 +1231,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
if (CF->isZero()) {
// fma/mad(a, b, c) = a*b if c=0
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << *opr0 << " * " << *opr1 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * "
+ << *opr1 << "\n");
Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
replaceCall(nval);
return true;
@@ -1263,8 +1259,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
if (Constant *FPExpr = getNativeFunction(
CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
Value *opr0 = CI->getArgOperand(0);
- DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << "sqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ << "sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
replaceCall(nval);
return true;
@@ -1355,8 +1351,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
P = B.CreateAddrSpaceCast(Alloc, PTy);
CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
- DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI
- << ") with " << *Call << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
+ << *Call << "\n");
if (!isSin) { // CI->cos, UI->sin
B.SetInsertPoint(&*ItOld);
@@ -1719,9 +1715,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
bool Changed = false;
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DEBUG(dbgs() << "AMDIC: process function ";
- F.printAsOperand(dbgs(), false, F.getParent());
- dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << "AMDIC: process function ";
+ F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
if (!EnablePreLink)
Changed |= setFastFlags(F, Options);
@@ -1737,8 +1732,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
Function *Callee = CI->getCalledFunction();
if (Callee == 0) continue;
- DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
- dbgs().flush());
+ LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
+ dbgs().flush());
if(Simplifier.fold(CI, AA))
Changed = true;
}
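The pow/rootn folds above all rest on simple identities: pow(x, 0) = 1, pow(x, 1) = x, pow(x, 2) = x*x, pow(x, -1) = 1/x, and for positive x the general case becomes exp2(y * log2(x)). A quick numeric spot-check (standalone C++, not from the patch; approxEqual is an illustrative helper):

#include <cassert>
#include <cmath>

static bool approxEqual(double A, double B) {
  return std::fabs(A - B) <= 1e-9 * std::fmax(std::fabs(A), std::fabs(B));
}

int main() {
  double x = 3.5, y = 2.25;

  // For x > 0, pow(x, y) == exp2(y * log2(x)); fold_pow rewrites the call
  // into this form when no cheaper special case applies.
  assert(approxEqual(std::pow(x, y), std::exp2(y * std::log2(x))));

  // The cheap special cases folded first.
  assert(approxEqual(std::pow(x, 0.0), 1.0));
  assert(approxEqual(std::pow(x, 1.0), x));
  assert(approxEqual(std::pow(x, 2.0), x * x));
  assert(approxEqual(std::pow(x, -1.0), 1.0 / x));
  return 0;
}
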
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index 5405bc645714..fe062384800a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -1,4 +1,4 @@
-//===-- AMDGPULibFunc.h ---------------------------------------------------===//
+//===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 7e0e9802c0e6..2cec8fe53283 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -117,7 +117,6 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
return false;
const TargetMachine &TM = TPC->getTM<TargetMachine>();
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
bool Changed = false;
for (auto *U : F.users()) {
@@ -125,7 +124,7 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
if (!CI)
continue;
- Changed |= ST.makeLIDRangeMetadata(CI);
+ Changed |= AMDGPUSubtarget::get(TM, F).makeLIDRangeMetadata(CI);
}
return Changed;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
new file mode 100644
index 000000000000..8cc7e38f7b29
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -0,0 +1,264 @@
+//===-- AMDGPULowerKernelArguments.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass replaces accesses to kernel arguments with loads at
+/// offsets from the kernarg base pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerKernelArguments : public FunctionPass{
+public:
+ static char ID;
+
+ AMDGPULowerKernelArguments() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+ return false;
+
+ auto &TPC = getAnalysis<TargetPassConfig>();
+
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ LLVMContext &Ctx = F.getParent()->getContext();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ BasicBlock &EntryBlock = *F.begin();
+ IRBuilder<> Builder(&*EntryBlock.begin());
+
+ const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
+ const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
+
+ unsigned MaxAlign;
+  // FIXME: Alignment is broken with explicit arg offset.
+ const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
+ if (TotalKernArgSize == 0)
+ return false;
+
+ CallInst *KernArgSegment =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
+ F.getName() + ".kernarg.segment");
+
+ KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ KernArgSegment->addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
+
+ unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
+ uint64_t ExplicitArgOffset = 0;
+
+ for (Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned Align = DL.getABITypeAlignment(ArgTy);
+ unsigned Size = DL.getTypeSizeInBits(ArgTy);
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+
+ // Clover seems to always pad i8/i16 to i32, but doesn't properly align
+ // them?
+ // Make sure the struct elements have correct size and alignment for ext
+ // args. These seem to be padded up to 4-bytes but not correctly aligned.
+ bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
+ !ST.isAmdHsaOS();
+ if (IsExtArg)
+ AllocSize = 4;
+
+ uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+ if (Arg.use_empty())
+ continue;
+
+ if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
+ // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
+ // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
+ // can't represent this with range metadata because it's only allowed for
+ // integer types.
+ if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ continue;
+
+ // FIXME: We can replace this with equivalent alias.scope/noalias
+ // metadata, but this appears to be a lot of work.
+ if (Arg.hasNoAliasAttr())
+ continue;
+ }
+
+ VectorType *VT = dyn_cast<VectorType>(ArgTy);
+ bool IsV3 = VT && VT->getNumElements() == 3;
+ VectorType *V4Ty = nullptr;
+
+ int64_t AlignDownOffset = alignDown(EltOffset, 4);
+ int64_t OffsetDiff = EltOffset - AlignDownOffset;
+ unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+
+ Value *ArgPtr;
+ if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+ // Since we don't have sub-dword scalar loads, avoid doing an extload by
+ // loading earlier than the argument address, and extracting the relevant
+ // bits.
+ //
+ // Additionally widen any sub-dword load to i32 even if suitably aligned,
+ // so that CSE between different argument loads works easily.
+
+ ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+ KernArgSegment,
+ AlignDownOffset,
+ Arg.getName() + ".kernarg.offset.align.down");
+ ArgPtr = Builder.CreateBitCast(ArgPtr,
+ Builder.getInt32Ty()->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ } else {
+ ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+ KernArgSegment,
+ AlignDownOffset,
+ Arg.getName() + ".kernarg.offset");
+ ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ }
+
+ assert((!IsExtArg || !IsV3) && "incompatible situation");
+
+ if (IsV3 && Size >= 32) {
+ V4Ty = VectorType::get(VT->getVectorElementType(), 4);
+ // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
+ ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+ }
+
+ LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+ Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
+
+ MDBuilder MDB(Ctx);
+
+ if (isa<PointerType>(ArgTy)) {
+ if (Arg.hasNonNullAttr())
+ Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+
+ uint64_t DerefBytes = Arg.getDereferenceableBytes();
+ if (DerefBytes != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_dereferenceable,
+ MDNode::get(Ctx,
+ MDB.createConstant(
+ ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
+ }
+
+ uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
+ if (DerefOrNullBytes != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_dereferenceable_or_null,
+ MDNode::get(Ctx,
+ MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+ DerefOrNullBytes))));
+ }
+
+ unsigned ParamAlign = Arg.getParamAlignment();
+ if (ParamAlign != 0) {
+ Load->setMetadata(
+ LLVMContext::MD_align,
+ MDNode::get(Ctx,
+ MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+ ParamAlign))));
+ }
+ }
+
+ // TODO: Convert noalias arg to !noalias
+
+ if (Size < 32 && !ArgTy->isAggregateType()) {
+ if (IsExtArg && OffsetDiff == 0) {
+ Type *I32Ty = Builder.getInt32Ty();
+ bool IsSext = Arg.hasSExtAttr();
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(
+ ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
+ ConstantAsMetadata::get(
+ ConstantInt::get(I32Ty,
+ IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
+ };
+
+ Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
+ }
+
+ Value *ExtractBits = OffsetDiff == 0 ?
+ Load : Builder.CreateLShr(Load, OffsetDiff * 8);
+
+ IntegerType *ArgIntTy = Builder.getIntNTy(Size);
+ Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
+ Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
+ Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(NewVal);
+ } else if (IsV3) {
+ Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
+ {0, 1, 2},
+ Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(Shuf);
+ } else {
+ Load->setName(Arg.getName() + ".load");
+ Arg.replaceAllUsesWith(Load);
+ }
+ }
+
+ KernArgSegment->addAttribute(
+ AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+
+ return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
+ "AMDGPU Lower Kernel Arguments", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
+ false, false)
+
+char AMDGPULowerKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
+ return new AMDGPULowerKernelArguments();
+}
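The sub-dword path in the pass above avoids extloads by aligning the argument offset down to a dword, loading a full i32, then shifting and truncating the wanted bits out. A self-contained sketch of that arithmetic for a hypothetical 16-bit argument at byte offset 6 (offsets and values are made up for illustration; a little-endian host is assumed):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Pretend kernarg segment: bytes 4..7 hold a dword whose upper half
  // is the i16 argument we want (EltOffset = 6).
  uint8_t KernArg[8] = {0, 0, 0, 0, 0x78, 0x56, 0x34, 0x12};

  uint64_t EltOffset = 6;
  uint64_t AlignDownOffset = EltOffset & ~uint64_t(3); // alignDown(6, 4) == 4
  uint64_t OffsetDiff = EltOffset - AlignDownOffset;   // == 2

  // Widened i32 load at the aligned offset (little-endian host).
  uint32_t Load;
  std::memcpy(&Load, KernArg + AlignDownOffset, sizeof(Load));

  // Shift the wanted bits down and truncate, as the pass emits with
  // CreateLShr + CreateTrunc.
  uint16_t Arg = static_cast<uint16_t>(Load >> (OffsetDiff * 8));
  assert(Arg == 0x1234);
  return 0;
}
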
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
new file mode 100644
index 000000000000..a43dcef4cf0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -0,0 +1,270 @@
+//===-- AMDGPULowerKernelAttributes.cpp -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass attempts to make use of reqd_work_group_size metadata
+/// to eliminate loads from the dispatch packet and to constant fold OpenCL
+/// get_local_size-like functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
+
+using namespace llvm;
+
+namespace {
+
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+ WORKGROUP_SIZE_X = 4,
+ WORKGROUP_SIZE_Y = 6,
+ WORKGROUP_SIZE_Z = 8,
+
+ GRID_SIZE_X = 12,
+ GRID_SIZE_Y = 16,
+ GRID_SIZE_Z = 20
+};
+
+class AMDGPULowerKernelAttributes : public ModulePass {
+ Module *Mod = nullptr;
+
+public:
+ static char ID;
+
+ AMDGPULowerKernelAttributes() : ModulePass(ID) {}
+
+ bool processUse(CallInst *CI);
+
+ bool doInitialization(Module &M) override;
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Kernel Attributes";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelAttributes::doInitialization(Module &M) {
+ Mod = &M;
+ return false;
+}
+
+bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
+ Function *F = CI->getParent()->getParent();
+
+ auto MD = F->getMetadata("reqd_work_group_size");
+ const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+ const bool HasUniformWorkGroupSize =
+ F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
+
+ if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+ return false;
+
+ Value *WorkGroupSizeX = nullptr;
+ Value *WorkGroupSizeY = nullptr;
+ Value *WorkGroupSizeZ = nullptr;
+
+ Value *GridSizeX = nullptr;
+ Value *GridSizeY = nullptr;
+ Value *GridSizeZ = nullptr;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // We expect to see several GEP users, casted to the appropriate type and
+ // loaded.
+ for (User *U : CI->users()) {
+ if (!U->hasOneUse())
+ continue;
+
+ int64_t Offset = 0;
+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+ continue;
+
+ auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());
+ if (!BCI || !BCI->hasOneUse())
+ continue;
+
+ auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());
+ if (!Load || !Load->isSimple())
+ continue;
+
+ unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+ // TODO: Handle merged loads.
+ switch (Offset) {
+ case WORKGROUP_SIZE_X:
+ if (LoadSize == 2)
+ WorkGroupSizeX = Load;
+ break;
+ case WORKGROUP_SIZE_Y:
+ if (LoadSize == 2)
+ WorkGroupSizeY = Load;
+ break;
+ case WORKGROUP_SIZE_Z:
+ if (LoadSize == 2)
+ WorkGroupSizeZ = Load;
+ break;
+ case GRID_SIZE_X:
+ if (LoadSize == 4)
+ GridSizeX = Load;
+ break;
+ case GRID_SIZE_Y:
+ if (LoadSize == 4)
+ GridSizeY = Load;
+ break;
+ case GRID_SIZE_Z:
+ if (LoadSize == 4)
+ GridSizeZ = Load;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Pattern match the code used to handle partial workgroup dispatches in the
+ // library implementation of get_local_size, so the entire function can be
+ // constant folded with a known group size.
+ //
+ // uint r = grid_size - group_id * group_size;
+ // get_local_size = (r < group_size) ? r : group_size;
+ //
+ // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+  // the grid_size is required to be a multiple of group_size. In this case:
+ //
+ // grid_size - (group_id * group_size) < group_size
+ // ->
+ // grid_size < group_size + (group_id * group_size)
+ //
+ // (grid_size / group_size) < 1 + group_id
+ //
+ // grid_size / group_size is at least 1, so we can conclude the select
+ // condition is false (except for group_id == 0, where the select result is
+ // the same).
+
+ bool MadeChange = false;
+ Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };
+ Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };
+
+ for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {
+ Value *GroupSize = WorkGroupSizes[I];
+ Value *GridSize = GridSizes[I];
+ if (!GroupSize || !GridSize)
+ continue;
+
+ for (User *U : GroupSize->users()) {
+ auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+ if (!ZextGroupSize)
+ continue;
+
+ for (User *ZextUser : ZextGroupSize->users()) {
+ auto *SI = dyn_cast<SelectInst>(ZextUser);
+ if (!SI)
+ continue;
+
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin = I == 0 ?
+ m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() :
+ (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() :
+ m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+ auto SubExpr = m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize)));
+
+ ICmpInst::Predicate Pred;
+ if (match(SI,
+ m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)),
+ SubExpr,
+ m_Specific(ZextGroupSize))) &&
+ Pred == ICmpInst::ICMP_ULT) {
+ if (HasReqdWorkGroupSize) {
+ ConstantInt *KnownSize
+ = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize,
+ SI->getType(),
+ false));
+ } else {
+ SI->replaceAllUsesWith(ZextGroupSize);
+ }
+
+ MadeChange = true;
+ }
+ }
+ }
+ }
+
+ if (!HasReqdWorkGroupSize)
+ return MadeChange;
+
+ // Eliminate any other loads we can from the dispatch packet.
+ for (int I = 0; I < 3; ++I) {
+ Value *GroupSize = WorkGroupSizes[I];
+ if (!GroupSize)
+ continue;
+
+ ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ GroupSize->replaceAllUsesWith(
+ ConstantExpr::getIntegerCast(KnownSize,
+ GroupSize->getType(),
+ false));
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+// TODO: Move makeLIDRangeMetadata usage into here. We do not seem to get
+// TargetPassConfig for the subtarget.
+bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
+ StringRef DispatchPtrName
+ = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+
+ Function *DispatchPtr = Mod->getFunction(DispatchPtrName);
+ if (!DispatchPtr) // Dispatch ptr not used.
+ return false;
+
+ bool MadeChange = false;
+
+ SmallPtrSet<Instruction *, 4> HandledUses;
+ for (auto *U : DispatchPtr->users()) {
+ CallInst *CI = cast<CallInst>(U);
+ if (HandledUses.insert(CI).second) {
+ if (processUse(CI))
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations",
+ false, false)
+
+char AMDGPULowerKernelAttributes::ID = 0;
+
+ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
+ return new AMDGPULowerKernelAttributes();
+}
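
For reference, a minimal self-contained sketch (not the device library's actual source) of the get_local_size pattern this pass matches; the DispatchPackedOffsets above are where the 16-bit workgroup size and 32-bit grid size loads come from:

    #include <cstdint>

    // Illustrative stand-in for the library's get_local_size(0): the pass looks
    // for exactly this sub/mul/icmp-ult/select shape on loads from the dispatch
    // packet (workgroup_size_x at offset 4, grid_size_x at offset 12).
    uint32_t get_local_size_x(uint32_t grid_size_x, uint32_t group_id_x,
                              uint16_t workgroup_size_x) {
      uint32_t r = grid_size_x - group_id_x * workgroup_size_x; // last-group remainder
      return r < workgroup_size_x ? r : workgroup_size_x;       // partial final group
    }

    // With uniform-work-group-size, grid_size_x is a multiple of workgroup_size_x,
    // so r >= workgroup_size_x in every group and the select always yields the
    // group size; with reqd_work_group_size it folds further, to that constant.
    // Example: workgroup_size_x = 256, grid_size_x = 1024, group_id_x = 3 gives
    // r = 1024 - 768 = 256, and 256 < 256 is false, so the result is 256.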
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 23fd8113932c..1876dc3f7122 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -8,16 +8,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
+/// Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
//
//===----------------------------------------------------------------------===//
//
-#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "R600AsmPrinter.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -36,9 +37,43 @@
using namespace llvm;
+namespace {
+
+class AMDGPUMCInstLower {
+ MCContext &Ctx;
+ const TargetSubtargetInfo &ST;
+ const AsmPrinter &AP;
+
+ const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+ const MachineOperand &MO) const;
+
+public:
+ AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST,
+ const AsmPrinter &AP);
+
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+};
+
+class R600MCInstLower : public AMDGPUMCInstLower {
+public:
+ R600MCInstLower(MCContext &ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP);
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+
+
+} // End anonymous namespace
+
#include "AMDGPUGenMCPseudoLowering.inc"
-AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st,
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx,
+ const TargetSubtargetInfo &st,
const AsmPrinter &ap):
Ctx(ctx), ST(st), AP(ap) { }
@@ -129,7 +164,7 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
- const auto *TII = ST.getInstrInfo();
+ const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
// need to select it to the subtarget specific version, and there's no way to
@@ -169,16 +204,18 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOperand &MCOp) const {
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
return MCInstLowering.lowerOperand(MO, MCOp);
}
-const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+static const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM,
+ const Constant *CV,
+ MCContext &OutContext) {
// TargetMachine does not support llvm-style cast. Use C++-style cast.
// This is safe since TM is always of type AMDGPUTargetMachine or its
// derived class.
- auto *AT = static_cast<AMDGPUTargetMachine*>(&TM);
+ auto &AT = static_cast<const AMDGPUTargetMachine&>(TM);
auto *CE = dyn_cast<ConstantExpr>(CV);
// Lower null pointers in private and local address space.
@@ -187,12 +224,18 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) {
auto Op = CE->getOperand(0);
auto SrcAddr = Op->getType()->getPointerAddressSpace();
- if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) {
+ if (Op->isNullValue() && AT.getNullPointerValue(SrcAddr) == 0) {
auto DstAddr = CE->getType()->getPointerAddressSpace();
- return MCConstantExpr::create(AT->getNullPointerValue(DstAddr),
+ return MCConstantExpr::create(AT.getNullPointerValue(DstAddr),
OutContext);
}
}
+ return nullptr;
+}
+
+const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+ if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
+ return E;
return AsmPrinter::lowerConstant(CV);
}
@@ -200,7 +243,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
- const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
StringRef Err;
@@ -292,3 +335,47 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
}
+
+R600MCInstLower::R600MCInstLower(MCContext &Ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP) :
+ AMDGPUMCInstLower(Ctx, ST, AP) { }
+
+void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+ for (const MachineOperand &MO : MI->explicit_operands()) {
+ MCOperand MCOp;
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+}
+
+void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
+ R600MCInstLower MCInstLowering(OutContext, STI, *this);
+
+ StringRef Err;
+ if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ C.emitError("Illegal instruction detected: " + Err);
+ MI->print(errs());
+ }
+
+ if (MI->isBundle()) {
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+ while (I != MBB->instr_end() && I->isInsideBundle()) {
+ EmitInstruction(&*I);
+ ++I;
+ }
+ } else {
+ MCInst TmpInst;
+ MCInstLowering.lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ }
+}
+
+const MCExpr *R600AsmPrinter::lowerConstant(const Constant *CV) {
+ if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
+ return E;
+ return AsmPrinter::lowerConstant(CV);
+}
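
A null pointer does not have the same bit pattern in every AMDGPU address space, which is why both asm printers route constants through the shared lowerAddrSpaceCast helper: a constant addrspacecast of a flat null into private or local memory has to be emitted as that space's null value rather than 0. A minimal sketch of the idea, where the address-space numbers and null values are assumptions for illustration rather than values queried from the target machine:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for AMDGPUTargetMachine::getNullPointerValue():
    // assume private/local use an all-ones null and every other space uses 0.
    int64_t nullPointerValue(unsigned AddrSpace) {
      const unsigned LocalAS = 3, PrivateAS = 5; // assumed numbering
      return (AddrSpace == LocalAS || AddrSpace == PrivateAS) ? -1 : 0;
    }

    int main() {
      // addrspacecast (i8* null to i8 addrspace(5)*) must not be emitted as 0:
      assert(nullPointerValue(0) == 0);  // flat null stays 0
      assert(nullPointerValue(5) == -1); // private null becomes all-ones
      return 0;
    }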
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
deleted file mode 100644
index 57d2d85daecd..000000000000
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
-
-namespace llvm {
-
-class AMDGPUSubtarget;
-class AsmPrinter;
-class MachineBasicBlock;
-class MachineInstr;
-class MachineOperand;
-class MCContext;
-class MCExpr;
-class MCInst;
-class MCOperand;
-
-class AMDGPUMCInstLower {
- MCContext &Ctx;
- const AMDGPUSubtarget &ST;
- const AsmPrinter &AP;
-
- const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
- const MachineOperand &MO) const;
-
-public:
- AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
- const AsmPrinter &AP);
-
- bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
-
- /// \brief Lower a MachineInstr to an MCInst
- void lower(const MachineInstr *MI, MCInst &OutMI) const;
-
-};
-
-} // End namespace llvm
-
-#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 20918233e447..6f44e2dbb2d5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
@@ -658,7 +659,7 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF,
continue;
}
- DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n");
MBBMRT *NewMBB = new MBBMRT(MBB);
MachineRegion *Region = RegionInfo->getRegionFor(MBB);
@@ -695,18 +696,19 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
+ << "\n");
// If this is a source register to a PHI we are chaining, it
// must be live out.
if (PHIInfo.isSource(Reg)) {
- DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If this is live out of the MBB
for (auto &UI : MRI->use_operands(Reg)) {
if (UI.getParent()->getParent() != MBB) {
- DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB)
- << "): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB)
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
} else {
// If the use is in the same MBB we have to make sure
@@ -717,8 +719,8 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
MIE = UseInstr->getParent()->instr_end();
MII != MIE; ++MII) {
if ((&(*MII)) == DefInstr) {
- DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI)
+ << "\n");
addLiveOut(Reg);
}
}
@@ -734,11 +736,12 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
if (TRI->isVirtualRegister(Reg)) {
- DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
+ << "\n");
for (auto &UI : MRI->use_operands(Reg)) {
if (!Region->contains(UI.getParent()->getParent())) {
- DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
- << "): " << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+ << "): " << printReg(Reg, TRI) << "\n");
addLiveOut(Reg);
}
}
@@ -749,8 +752,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB)
- << ")-\n");
+ LLVM_DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB)
+ << ")-\n");
for (auto &II : *MBB) {
for (auto &RI : II.defs()) {
storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
@@ -774,9 +777,10 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
for (int i = 0; i < numPreds; ++i) {
if (getPHIPred(PHI, i) == MBB) {
unsigned PHIReg = getPHISourceReg(PHI, i);
- DEBUG(dbgs() << "Add LiveOut (PhiSource " << printMBBReference(*MBB)
- << " -> " << printMBBReference(*(*SI))
- << "): " << printReg(PHIReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Add LiveOut (PhiSource " << printMBBReference(*MBB)
+ << " -> " << printMBBReference(*(*SI))
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -784,7 +788,7 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
}
}
- DEBUG(dbgs() << "-Store Live Outs Endn-\n");
+ LLVM_DEBUG(dbgs() << "-Store Live Outs Endn-\n");
}
void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
@@ -844,8 +848,8 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
for (int i = 0; i < numPreds; ++i) {
if (Region->contains(getPHIPred(PHI, i))) {
unsigned PHIReg = getPHISourceReg(PHI, i);
- DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
- << "): " << printReg(PHIReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+ << "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
}
@@ -909,20 +913,21 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
bool IncludeLoopPHI) {
assert(Register != NewRegister && "Cannot replace a reg with itself");
- DEBUG(dbgs() << "Pepareing to replace register (region): "
- << printReg(Register, MRI->getTargetRegisterInfo()) << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Pepareing to replace register (region): "
+ << printReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
// If we are replacing outside, we also need to update the LiveOuts
if (ReplaceOutside &&
(isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
LinearizedRegion *Current = this;
while (Current != nullptr && Current->getEntry() != nullptr) {
- DEBUG(dbgs() << "Region before register replace\n");
- DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ LLVM_DEBUG(dbgs() << "Region before register replace\n");
+ LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
Current->replaceLiveOut(Register, NewRegister);
- DEBUG(dbgs() << "Region after register replace\n");
- DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ LLVM_DEBUG(dbgs() << "Region after register replace\n");
+ LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
Current = Current->getParent();
}
}
@@ -946,16 +951,16 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
if (ShouldReplace) {
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
- DEBUG(dbgs() << "Trying to substitute physical register: "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
llvm_unreachable("Cannot substitute physical registers");
} else {
- DEBUG(dbgs() << "Replacing register (region): "
- << printReg(Register, MRI->getTargetRegisterInfo())
- << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing register (region): "
+ << printReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
O.setReg(NewRegister);
}
}
@@ -1022,18 +1027,18 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
if (hasNoDef(Reg, MRI))
continue;
if (!MRI->hasOneDef(Reg)) {
- DEBUG(this->getEntry()->getParent()->dump());
- DEBUG(dbgs() << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(this->getEntry()->getParent()->dump());
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << "\n");
}
if (MRI->def_begin(Reg) == MRI->def_end()) {
- DEBUG(dbgs() << "Register "
- << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has NO defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
- DEBUG(dbgs() << "Register "
- << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has multiple defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
}
assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
@@ -1041,8 +1046,8 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
MachineOperand *UseOperand = &(RI);
bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
if (UseIsOutsideDefMBB && UseOperand->isKill()) {
- DEBUG(dbgs() << "Removing kill flag on register: "
- << printReg(Reg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing kill flag on register: "
+ << printReg(Reg, TRI) << "\n");
UseOperand->setIsKill(false);
}
}
@@ -1415,8 +1420,8 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
MachineInstr &Instr = *I;
if (Instr.isPHI()) {
unsigned PHIDestReg = getPHIDestReg(Instr);
- DEBUG(dbgs() << "Extractking killed phi:\n");
- DEBUG(Instr.dump());
+ LLVM_DEBUG(dbgs() << "Extractking killed phi:\n");
+ LLVM_DEBUG(Instr.dump());
PHIs.insert(&Instr);
PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
storePHILinearizationInfoDest(PHIDestReg, Instr);
@@ -1448,9 +1453,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
MachineBasicBlock *SourceMBB,
SmallVector<unsigned, 2> &PHIIndices,
unsigned *ReplaceReg) {
- DEBUG(dbgs() << "Shrink PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Shrink PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI)
+ << " = PHI(");
bool Replaced = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1480,8 +1486,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
if (SourceMBB) {
MIB.addReg(CombinedSourceReg);
MIB.addMBB(SourceMBB);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*SourceMBB));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*SourceMBB));
}
for (unsigned i = 0; i < NumInputs; ++i) {
@@ -1492,10 +1498,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
}
PHI.eraseFromParent();
return Replaced;
@@ -1504,9 +1510,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
void AMDGPUMachineCFGStructurizer::replacePHI(
MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge,
SmallVector<unsigned, 2> &PHIRegionIndices) {
- DEBUG(dbgs() << "Replace PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Replace PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI)
+ << " = PHI(");
bool HasExternalEdge = false;
unsigned NumInputs = getPHINumInputs(PHI);
@@ -1523,8 +1530,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(LastMerge);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*LastMerge));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*LastMerge));
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
continue;
@@ -1533,10 +1540,10 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
} else {
replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg);
}
@@ -1546,9 +1553,9 @@ void AMDGPUMachineCFGStructurizer::replacePHI(
void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
SmallVector<unsigned, 2> &PHIRegionIndices) {
- DEBUG(dbgs() << "Replace entry PHI: ");
- DEBUG(PHI.dump());
- DEBUG(dbgs() << " with ");
+ LLVM_DEBUG(dbgs() << "Replace entry PHI: ");
+ LLVM_DEBUG(PHI.dump());
+ LLVM_DEBUG(dbgs() << " with ");
unsigned NumInputs = getPHINumInputs(PHI);
unsigned NumNonRegionInputs = NumInputs;
@@ -1561,18 +1568,19 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
if (NumNonRegionInputs == 0) {
auto DestReg = getPHIDestReg(PHI);
replaceRegisterWith(DestReg, CombinedSourceReg);
- DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI)
+ << "\n");
PHI.eraseFromParent();
} else {
- DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI(");
MachineBasicBlock *MBB = PHI.getParent();
MachineInstrBuilder MIB =
BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
getPHIDestReg(PHI));
MIB.addReg(CombinedSourceReg);
MIB.addMBB(IfMBB);
- DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
- << printMBBReference(*IfMBB));
+ LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", "
+ << printMBBReference(*IfMBB));
unsigned NumInputs = getPHINumInputs(PHI);
for (unsigned i = 0; i < NumInputs; ++i) {
if (isPHIRegionIndex(PHIRegionIndices, i)) {
@@ -1582,10 +1590,10 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
MIB.addReg(SourceReg);
MIB.addMBB(SourcePred);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*SourcePred));
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*SourcePred));
}
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
PHI.eraseFromParent();
}
}
@@ -1607,8 +1615,9 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
}
}
- DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
- << (IsDead ? "dead" : "alive") << " after PHI replace\n");
+ LLVM_DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
+ << (IsDead ? "dead" : "alive")
+ << " after PHI replace\n");
if (IsDead) {
LRegion->removeLiveOut(Reg);
}
@@ -1682,8 +1691,8 @@ void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Regi
void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB,
MachineBasicBlock *Dest,
const DebugLoc &DL) {
- DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
- << " -> " << Dest->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
+ << " -> " << Dest->getNumber() << "\n");
MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator();
bool HasTerminator = Terminator != MBB->instr_end();
if (HasTerminator) {
@@ -1732,7 +1741,8 @@ AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
MF->insert(ExitIter, LastMerge);
LastMerge->addSuccessor(Exit);
insertUnconditionalBranch(LastMerge, Exit);
- DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber()
+ << "\n");
}
return LastMerge;
}
@@ -1748,11 +1758,12 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
if (MergeBB->succ_begin() == MergeBB->succ_end()) {
return;
}
- DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
- << "): " << printReg(DestRegister, TRI) << " = PHI("
- << printReg(IfSourceRegister, TRI) << ", "
- << printMBBReference(*IfBB) << printReg(CodeSourceRegister, TRI)
- << ", " << printMBBReference(*CodeBB) << ")\n");
+ LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
+ << "): " << printReg(DestRegister, TRI) << " = PHI("
+ << printReg(IfSourceRegister, TRI) << ", "
+ << printMBBReference(*IfBB)
+ << printReg(CodeSourceRegister, TRI) << ", "
+ << printMBBReference(*CodeBB) << ")\n");
const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestRegister);
@@ -1810,8 +1821,8 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
for (auto SI : Succs) {
std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
- DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first)
- << " -> " << printMBBReference(*Edge.second) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first)
+ << " -> " << printMBBReference(*Edge.second) << "\n");
Edge.first->removeSuccessor(Edge.second);
}
}
@@ -1844,13 +1855,13 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
IfBB->addSuccessor(MergeBB);
IfBB->addSuccessor(CodeBBStart);
- DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
// Ensure that the MergeBB is a successor of the CodeEndBB.
if (!CodeBBEnd->isSuccessor(MergeBB))
CodeBBEnd->addSuccessor(MergeBB);
- DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) << " through "
- << printMBBReference(*CodeBBEnd) << "\n");
+ LLVM_DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart)
+ << " through " << printMBBReference(*CodeBBEnd) << "\n");
// If we have a single predecessor we can find a reasonable debug location
MachineBasicBlock *SinglePred =
@@ -1935,16 +1946,18 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co
MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
if (MRI->def_begin(Reg) == MRI->def_end()) {
- DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has NO defs\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
} else if (!MRI->hasOneDef(Reg)) {
- DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo())
- << " has multiple defs\n");
- DEBUG(dbgs() << "DEFS BEGIN:\n");
+ LLVM_DEBUG(dbgs() << "Register "
+ << printReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ LLVM_DEBUG(dbgs() << "DEFS BEGIN:\n");
for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
- DEBUG(DI->getParent()->dump());
+ LLVM_DEBUG(DI->getParent()->dump());
}
- DEBUG(dbgs() << "DEFS END\n");
+ LLVM_DEBUG(dbgs() << "DEFS END\n");
}
assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
@@ -1986,7 +1999,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
- DEBUG(dbgs() << "Insert Chained PHI\n");
+ LLVM_DEBUG(dbgs() << "Insert Chained PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
SourceReg, IsLastDef);
@@ -2022,16 +2035,16 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
}
for (auto LI : OldLiveOuts) {
- DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
+ LLVM_DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
if (!containsDef(CodeBB, InnerRegion, LI) ||
(!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
// If the register simly lives through the CodeBB, we don't have
// to rewrite anything since the register is not defined in this
// part of the code.
- DEBUG(dbgs() << "- through");
+ LLVM_DEBUG(dbgs() << "- through");
continue;
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
unsigned Reg = LI;
if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
// If the register is live out, we do want to create a phi,
@@ -2048,12 +2061,12 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
// Create initializer, this value is never used, but is needed
// to satisfy SSA.
- DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
+ LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
IfSourceReg, 0);
InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
- DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+ LLVM_DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
IfSourceReg, Reg, true);
}
@@ -2063,22 +2076,22 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
// is a source block for a definition.
SmallVector<unsigned, 4> Sources;
if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
- DEBUG(dbgs() << "Inserting PHI Live Out from " << printMBBReference(*CodeBB)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting PHI Live Out from "
+ << printMBBReference(*CodeBB) << "\n");
for (auto SI : Sources) {
unsigned DestReg;
PHIInfo.findDest(SI, CodeBB, DestReg);
insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
}
- DEBUG(dbgs() << "Insertion done.\n");
+ LLVM_DEBUG(dbgs() << "Insertion done.\n");
}
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Before PHI Prune\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "Before PHI Prune\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
ElimiatedSources;
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
@@ -2118,8 +2131,8 @@ void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
std::get<2>(SourceInfo));
}
- DEBUG(dbgs() << "After PHI Prune\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "After PHI Prune\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
@@ -2127,8 +2140,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
MachineBasicBlock *Entry = CurrentRegion->getEntry();
MachineBasicBlock *Exit = CurrentRegion->getExit();
- DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
- << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() << " Pred: "
+ << (*(Entry->pred_begin()))->getNumber() << "\n");
int NumSources = 0;
auto SE = PHIInfo.sources_end(DestReg);
@@ -2145,7 +2158,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
TII->get(TargetOpcode::PHI), DestReg);
- DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI(");
unsigned CurrentBackedgeReg = 0;
@@ -2169,19 +2182,19 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
BackedgePHI.addMBB((*SRI).second);
CurrentBackedgeReg = NewBackedgeReg;
- DEBUG(dbgs() << "Inserting backedge PHI: "
- << printReg(NewBackedgeReg, TRI) << " = PHI("
- << printReg(CurrentBackedgeReg, TRI) << ", "
- << printMBBReference(*getPHIPred(*PHIDefInstr, 0))
- << ", "
- << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
- << ", " << printMBBReference(*(*SRI).second));
+ LLVM_DEBUG(dbgs()
+ << "Inserting backedge PHI: "
+ << printReg(NewBackedgeReg, TRI) << " = PHI("
+ << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) << ", "
+ << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) << ", "
+ << printMBBReference(*(*SRI).second));
}
} else {
MIB.addReg(SourceReg);
MIB.addMBB((*SRI).second);
- DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
- << printMBBReference(*(*SRI).second) << ", ");
+ LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", "
+ << printMBBReference(*(*SRI).second) << ", ");
}
}
@@ -2189,16 +2202,16 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
if (CurrentBackedgeReg != 0) {
MIB.addReg(CurrentBackedgeReg);
MIB.addMBB(Exit);
- DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", "
- << printMBBReference(*Exit) << ")\n");
+ LLVM_DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", "
+ << printMBBReference(*Exit) << ")\n");
} else {
- DEBUG(dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << ")\n");
}
}
}
void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
++DRI) {
@@ -2219,19 +2232,19 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
MachineOperand &O = *I;
++I;
if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
- DEBUG(dbgs() << "Trying to substitute physical register: "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
llvm_unreachable("Cannot substitute physical registers");
// We don't handle physical registers, but if we need to
// in the future This is how we do it:
// O.substPhysReg(NewRegister, *TRI);
} else {
- DEBUG(dbgs() << "Replacing register: "
- << printReg(Register, MRI->getTargetRegisterInfo())
- << " with "
- << printReg(NewRegister, MRI->getTargetRegisterInfo())
- << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing register: "
+ << printReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << printReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
O.setReg(NewRegister);
}
}
@@ -2239,20 +2252,20 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
- DEBUG(dbgs() << "Resolve PHI Infos\n");
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(dbgs() << "Resolve PHI Infos\n");
+ LLVM_DEBUG(PHIInfo.dump(MRI));
for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
++DRI) {
unsigned DestReg = *DRI;
- DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n");
auto SRI = PHIInfo.sources_begin(DestReg);
unsigned SourceReg = (*SRI).first;
- DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI)
- << " SourceReg: " << printReg(SourceReg, TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI)
+ << " SourceReg: " << printReg(SourceReg, TRI) << "\n");
assert(PHIInfo.sources_end(DestReg) == ++SRI &&
"More than one phi source in entry node");
@@ -2326,9 +2339,9 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
MachineOperand RegOp =
MachineOperand::CreateReg(Reg, false, false, true);
ArrayRef<MachineOperand> Cond(RegOp);
- DEBUG(dbgs() << "RegionExitReg: ");
- DEBUG(Cond[0].print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExitReg: ");
+ LLVM_DEBUG(Cond[0].print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
Cond, DebugLoc());
RegionExit->addSuccessor(CurrentRegion->getEntry());
@@ -2338,12 +2351,12 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo);
InnerRegion.setParent(CurrentRegion);
- DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
+ LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
CodeBBSelectReg);
InnerRegion.addMBB(MergeBB);
- DEBUG(InnerRegion.print(dbgs(), TRI));
+ LLVM_DEBUG(InnerRegion.print(dbgs(), TRI));
rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion);
extractKilledPHIs(CodeBB);
if (IsRegionEntryBB) {
@@ -2384,16 +2397,16 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
CurrentRegion->getRegionMRT()->getEntry()->getNumber());
MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
ArrayRef<MachineOperand> Cond(RegOp);
- DEBUG(dbgs() << "RegionExitReg: ");
- DEBUG(Cond[0].print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "RegionExitReg: ");
+ LLVM_DEBUG(Cond[0].print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
Cond, DebugLoc());
RegionExit->addSuccessor(IfBB);
}
}
CurrentRegion->addMBBs(InnerRegion);
- DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+ LLVM_DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
CodeBBSelectReg);
@@ -2439,15 +2452,16 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
MachineInstrBuilder MIB =
BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
TII->get(TargetOpcode::PHI), NewDestReg);
- DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) << " = PHI(");
+ LLVM_DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI)
+ << " = PHI(");
MIB.addReg(PHISource);
MIB.addMBB(Entry);
- DEBUG(dbgs() << printReg(PHISource, TRI) << ", "
- << printMBBReference(*Entry));
+ LLVM_DEBUG(dbgs() << printReg(PHISource, TRI) << ", "
+ << printMBBReference(*Entry));
MIB.addReg(RegionSourceReg);
MIB.addMBB(RegionSourceMBB);
- DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", "
- << printMBBReference(*RegionSourceMBB) << ")\n");
+ LLVM_DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", "
+ << printMBBReference(*RegionSourceMBB) << ")\n");
}
void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
@@ -2480,7 +2494,8 @@ AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
LRegion->addMBB(NewExit);
LRegion->setExit(NewExit);
- DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber()
+ << "\n");
// Replace any PHI Predecessors in the successor with NewExit
for (auto &II : *Succ) {
@@ -2528,9 +2543,9 @@ AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
MachineBasicBlock *Exit = LRegion->getExit();
- DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to "
- << printMBBReference(*Entry) << " -> "
- << printMBBReference(*EntrySucc) << "\n");
+ LLVM_DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to "
+ << printMBBReference(*Entry) << " -> "
+ << printMBBReference(*EntrySucc) << "\n");
LRegion->addMBB(EntrySucc);
// Make the backedge go to Entry Succ
@@ -2621,21 +2636,21 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
rewriteRegionExitPHIs(Region, LastMerge, LRegion);
removeOldExitPreds(Region);
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
SetVector<MRT *> *Children = Region->getChildren();
- DEBUG(dbgs() << "===========If Region Start===============\n");
+ LLVM_DEBUG(dbgs() << "===========If Region Start===============\n");
if (LRegion->getHasLoop()) {
- DEBUG(dbgs() << "Has Backedge: Yes\n");
+ LLVM_DEBUG(dbgs() << "Has Backedge: Yes\n");
} else {
- DEBUG(dbgs() << "Has Backedge: No\n");
+ LLVM_DEBUG(dbgs() << "Has Backedge: No\n");
}
unsigned BBSelectRegIn;
unsigned BBSelectRegOut;
for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
- DEBUG(dbgs() << "CurrentRegion: \n");
- DEBUG(LRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "CurrentRegion: \n");
+ LLVM_DEBUG(LRegion->print(dbgs(), TRI));
auto CNI = CI;
++CNI;
@@ -2649,9 +2664,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
// We found the block is the exit of an inner region, we need
// to put it in the current linearized region.
- DEBUG(dbgs() << "Linearizing region: ");
- DEBUG(InnerLRegion->print(dbgs(), TRI));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Linearizing region: ");
+ LLVM_DEBUG(InnerLRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(dbgs() << "\n");
MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
@@ -2669,10 +2684,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
- << "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
+ << "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
@@ -2681,7 +2696,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
} else {
MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
- DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
if (MBB == getSingleExitNode(*(MBB->getParent()))) {
// If this is the exit block then we need to skip to the next.
@@ -2693,10 +2708,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
BBSelectRegOut = Child->getBBSelectRegOut();
BBSelectRegIn = Child->getBBSelectRegIn();
- DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
- << "\n");
- DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI)
+ << "\n");
MachineBasicBlock *IfEnd = CurrentMerge;
// This is a basic block that is not part of an inner region, we
@@ -2707,7 +2722,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
}
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
}
}
@@ -2728,7 +2743,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
NewInReg, Region->getEntry()->getNumber());
// Need to be careful about updating the registers inside the region.
LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
- DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
+ LLVM_DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc,
InnerSelectReg, NewInReg,
LRegion->getRegionMRT()->getInnerOutputRegister());
@@ -2740,11 +2755,11 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
TII->insertReturn(*LastMerge);
}
- DEBUG(Region->getEntry()->getParent()->dump());
- DEBUG(LRegion->print(dbgs(), TRI));
- DEBUG(PHIInfo.dump(MRI));
+ LLVM_DEBUG(Region->getEntry()->getParent()->dump());
+ LLVM_DEBUG(LRegion->print(dbgs(), TRI));
+ LLVM_DEBUG(PHIInfo.dump(MRI));
- DEBUG(dbgs() << "===========If Region End===============\n");
+ LLVM_DEBUG(dbgs() << "===========If Region End===============\n");
Region->setLinearizedRegion(LRegion);
return true;
@@ -2784,12 +2799,12 @@ bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region,
}
void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) {
- DEBUG(dbgs() << "Fallthrough Map:\n");
+ LLVM_DEBUG(dbgs() << "Fallthrough Map:\n");
for (auto &MBBI : MF) {
MachineBasicBlock *MBB = MBBI.getFallThrough();
if (MBB != nullptr) {
- DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
- << MBB->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
+ << MBB->getNumber() << "\n");
}
FallthroughMap[&MBBI] = MBB;
}
@@ -2800,8 +2815,8 @@ void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
LinearizedRegion *LRegion = new LinearizedRegion();
if (SelectOut) {
LRegion->addLiveOut(SelectOut);
- DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI)
- << "\n");
+ LLVM_DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI)
+ << "\n");
}
LRegion->setRegionMRT(Region);
Region->setLinearizedRegion(LRegion);
@@ -2856,26 +2871,26 @@ static void checkRegOnlyPHIInputs(MachineFunction &MF) {
}
bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MRI = &(MF.getRegInfo());
initFallthroughMap(MF);
checkRegOnlyPHIInputs(MF);
- DEBUG(dbgs() << "----STRUCTURIZER START----\n");
- DEBUG(MF.dump());
+ LLVM_DEBUG(dbgs() << "----STRUCTURIZER START----\n");
+ LLVM_DEBUG(MF.dump());
Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo());
- DEBUG(Regions->dump());
+ LLVM_DEBUG(Regions->dump());
RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI);
setRegionMRT(RTree);
initializeSelectRegisters(RTree, 0, MRI, TII);
- DEBUG(RTree->dump(TRI));
+ LLVM_DEBUG(RTree->dump(TRI));
bool result = structurizeRegions(RTree, true);
delete RTree;
- DEBUG(dbgs() << "----STRUCTURIZER END----\n");
+ LLVM_DEBUG(dbgs() << "----STRUCTURIZER END----\n");
initFallthroughMap(MF);
return result;
}
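
Nearly all of the changes in this file are the mechanical DEBUG -> LLVM_DEBUG rename. A small usage sketch of the renamed macro, with a hypothetical DEBUG_TYPE chosen only for illustration:

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    #define DEBUG_TYPE "my-example-pass" // hypothetical name, for illustration only

    void visitBlock(int BlockNumber) {
      // Compiles away entirely in NDEBUG builds; in asserts builds it prints only
      // when -debug or -debug-only=my-example-pass is passed to the tool.
      LLVM_DEBUG(dbgs() << "Visiting block " << BlockNumber << '\n');
    }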
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index b7c8c1213537..13b4b50149ce 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,20 +9,38 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
using namespace llvm;
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
- KernArgSize(0),
+ ExplicitKernArgSize(0),
MaxKernArgAlign(0),
LDSSize(0),
- ABIArgOffset(0),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
- NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
+ MemoryBound(false),
+ WaveLimiter(false) {
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
+ const Function &F = MF.getFunction();
+
+ if (auto *Resolver = MF.getMMI().getResolver()) {
+ if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
+ Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
+ MemoryBound = PHA->isMemoryBound(&F);
+ WaveLimiter = PHA->needsWaveLimiter(&F);
+ }
+ }
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
+ ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
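
Instead of the old incremental allocateKernArg bookkeeping (removed from the header below), the constructor now caches ExplicitKernArgSize and MaxKernArgAlign up front for AMDGPU_KERNEL and SPIR_KERNEL functions. A self-contained sketch of the align-then-add accumulation involved, with argument sizes and alignments assumed for illustration rather than taken from a real DataLayout:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    struct ArgInfo { uint64_t Size; unsigned Align; }; // hypothetical per-argument layout

    int main() {
      // Assumed layout for a kernel taking (i32, <4 x float>, global i8*).
      const ArgInfo Args[] = {{4, 4}, {16, 16}, {8, 8}};
      uint64_t ExplicitKernArgSize = 0;
      unsigned MaxKernArgAlign = 0;
      for (const ArgInfo &A : Args) {
        ExplicitKernArgSize = (ExplicitKernArgSize + A.Align - 1) & ~uint64_t(A.Align - 1);
        ExplicitKernArgSize += A.Size;
        MaxKernArgAlign = std::max(MaxKernArgAlign, A.Align);
      }
      // Argument offsets 0, 16 and 32: 40 bytes total, maximum alignment 16.
      std::printf("size=%llu maxalign=%u\n",
                  (unsigned long long)ExplicitKernArgSize, MaxKernArgAlign);
      return 0;
    }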
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 99bb61b21db0..8d6b871bc03e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -15,57 +15,43 @@
namespace llvm {
+class GCNSubtarget;
+
class AMDGPUMachineFunction : public MachineFunctionInfo {
/// A map to keep track of local memory objects and their offsets within the
/// local memory space.
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
- uint64_t KernArgSize;
- unsigned MaxKernArgAlign;
+protected:
+ uint64_t ExplicitKernArgSize; // Cached size of the explicit kernel arguments.
+ unsigned MaxKernArgAlign; // Cached maximum alignment among the explicit kernel arguments.
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
- // FIXME: This should probably be removed.
- /// Start of implicit kernel args
- unsigned ABIArgOffset;
-
- // Kernels + shaders. i.e. functions called by the driver and not not called
+ // Kernels + shaders. i.e. functions called by the driver and not called
// by other functions.
bool IsEntryFunction;
bool NoSignedZerosFPMath;
-public:
- AMDGPUMachineFunction(const MachineFunction &MF);
-
- uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
- assert(isPowerOf2_32(Align));
- KernArgSize = alignTo(KernArgSize, Align);
+ // Function may be memory bound.
+ bool MemoryBound;
- uint64_t Result = KernArgSize;
- KernArgSize += Size;
+ // Kernel may need limited waves per EU for better performance.
+ bool WaveLimiter;
- MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
- return Result;
- }
+public:
+ AMDGPUMachineFunction(const MachineFunction &MF);
- uint64_t getKernArgSize() const {
- return KernArgSize;
+ uint64_t getExplicitKernArgSize() const {
+ return ExplicitKernArgSize;
}
unsigned getMaxKernArgAlign() const {
return MaxKernArgAlign;
}
- void setABIArgOffset(unsigned NewOffset) {
- ABIArgOffset = NewOffset;
- }
-
- unsigned getABIArgOffset() const {
- return ABIArgOffset;
- }
-
unsigned getLDSSize() const {
return LDSSize;
}
@@ -78,6 +64,14 @@ public:
return NoSignedZerosFPMath;
}
+ bool isMemoryBound() const {
+ return MemoryBound;
+ }
+
+ bool needsWaveLimiter() const {
+ return WaveLimiter;
+ }
+
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 3164140abe29..7b9f673c418c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Machine Module Info.
+/// AMDGPU Machine Module Info.
///
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1a728c6bd04a..1219ab26fb69 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU Machine Module Info.
+/// AMDGPU Machine Module Info.
///
//
//===----------------------------------------------------------------------===//
@@ -30,14 +30,14 @@ private:
// All supported memory/synchronization scopes can be found here:
// http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
- /// \brief Agent synchronization scope ID.
+ /// Agent synchronization scope ID.
SyncScope::ID AgentSSID;
- /// \brief Workgroup synchronization scope ID.
+ /// Workgroup synchronization scope ID.
SyncScope::ID WorkgroupSSID;
- /// \brief Wavefront synchronization scope ID.
+ /// Wavefront synchronization scope ID.
SyncScope::ID WavefrontSSID;
- /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a
+ /// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
/// scope.
///
@@ -74,7 +74,7 @@ public:
return WavefrontSSID;
}
- /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a
+ /// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
/// scope.
///
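
The inclusivity note above is easiest to read as an ordering: wavefront is contained in workgroup, workgroup in agent, and agent in system, so a synchronization at a wider scope also covers everything a narrower scope would. A tiny sketch of that idea, with an assumed ordering that is for illustration only and is not the real AMDGPUMachineModuleInfo API:

    // Assumed illustration of synchronization-scope inclusivity, widest last.
    enum class Scope { Wavefront, Workgroup, Agent, System };

    // True if the wider scope covers everything the narrower one does.
    bool isInclusiveOf(Scope Wider, Scope Narrower) {
      return static_cast<int>(Wider) >= static_cast<int>(Narrower);
    }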
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 7263ba73d155..995d9ae3907f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUMacroFusion.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MacroFusion.h"
@@ -22,7 +23,7 @@ using namespace llvm;
namespace {
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index bb65636f15af..7bd8533a0ccf 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief This post-linking pass replaces the function pointer of enqueued
+// This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
@@ -36,7 +36,9 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
@@ -49,7 +51,7 @@ using namespace llvm;
namespace {
-/// \brief Lower enqueued blocks.
+/// Lower enqueued blocks.
class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
public:
static char ID;
@@ -80,49 +82,63 @@ static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
for (auto U : F->users()) {
if (auto *CI = dyn_cast<CallInst>(&*U)) {
auto *Caller = CI->getParent()->getParent();
- if (Callers.count(Caller))
- continue;
- Callers.insert(Caller);
- collectCallers(Caller, Callers);
+ if (Callers.insert(Caller).second)
+ collectCallers(Caller, Callers);
}
}
}
+/// If \p U is instruction or constant, collect functions which directly or
+/// indirectly use it.
+static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ auto *F = I->getParent()->getParent();
+ if (Funcs.insert(F).second)
+ collectCallers(F, Funcs);
+ return;
+ }
+ if (!isa<Constant>(U))
+ return;
+ for (auto UU : U->users())
+ collectFunctionUsers(&*UU, Funcs);
+}
+
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
DenseSet<Function *> Callers;
auto &C = M.getContext();
bool Changed = false;
for (auto &F : M.functions()) {
if (F.hasFnAttribute("enqueued-block")) {
- if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
- !isa<ConstantExpr>(*F.user_begin()) ||
- !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
- continue;
+ if (!F.hasName()) {
+ SmallString<64> Name;
+ Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
+ M.getDataLayout());
+ F.setName(Name);
}
- auto *BitCast = cast<ConstantExpr>(*F.user_begin());
- auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
- auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+ LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
+ auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
+ auto T = ArrayType::get(Type::getInt64Ty(C), 2);
auto *GV = new GlobalVariable(
- M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
- /*IsConstant=*/true, GlobalValue::ExternalLinkage,
- /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
- GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
- /*IsExternallyInitialized=*/true);
- DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
- auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
- AddrCast->replaceAllUsesWith(NewPtr);
- F.addFnAttr("runtime-handle", RuntimeHandle);
- F.setLinkage(GlobalValue::ExternalLinkage);
-
- // Collect direct or indirect callers of enqueue_kernel.
- for (auto U : NewPtr->users()) {
- if (auto *I = dyn_cast<Instruction>(&*U)) {
- auto *F = I->getParent()->getParent();
- Callers.insert(F);
- collectCallers(F, Callers);
- }
+ M, T,
+ /*IsConstant=*/false, GlobalValue::ExternalLinkage,
+ /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
+ /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+ AMDGPUAS::GLOBAL_ADDRESS,
+ /*IsExternallyInitialized=*/false);
+ LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+
+ for (auto U : F.users()) {
+ auto *UU = &*U;
+ if (!isa<ConstantExpr>(UU))
+ continue;
+ collectFunctionUsers(UU, Callers);
+ auto *BitCast = cast<ConstantExpr>(UU);
+ auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
+ BitCast->replaceAllUsesWith(NewPtr);
+ F.addFnAttr("runtime-handle", RuntimeHandle);
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ Changed = true;
}
- Changed = true;
}
}
@@ -130,6 +146,7 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
continue;
F->addFnAttr("calls-enqueue-kernel");
+ LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
}
return Changed;
}
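
The reworked collectCallers above leans on DenseSet::insert returning an (iterator, bool) pair, so the insertion itself doubles as the visited check and the recursion only follows callers that are newly seen. A minimal standalone sketch of the same pattern, with hypothetical names and a plain STL container standing in for llvm::DenseSet:

    #include <set>
    #include <vector>

    struct Node { std::vector<Node *> Callers; };

    // Walk transitive callers, recursing only when the element is newly inserted.
    static void collectCallers(Node *N, std::set<Node *> &Seen) {
      for (Node *C : N->Callers)
        if (Seen.insert(C).second)   // .second is true only for a new element
          collectCallers(C, Seen);
    }
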
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
new file mode 100644
index 000000000000..3cfdccc9fe51
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -0,0 +1,397 @@
+//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes whether a function is potentially memory bound and whether a
+/// kernel may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-perf-hint"
+
+static cl::opt<unsigned>
+ MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Function mem bound threshold in %"));
+
+static cl::opt<unsigned>
+ LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Kernel limit wave threshold in %"));
+
+static cl::opt<unsigned>
+ IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Indirect access memory instruction weight"));
+
+static cl::opt<unsigned>
+ LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Large stride memory access weight"));
+
+static cl::opt<unsigned>
+ LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
+ cl::desc("Large stride memory access threshold"));
+
+STATISTIC(NumMemBound, "Number of functions marked as memory bound");
+STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
+
+char llvm::AMDGPUPerfHintAnalysis::ID = 0;
+char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;
+
+INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
+ "Analysis if a function is memory bound", true, true)
+
+namespace {
+
+struct AMDGPUPerfHint {
+ friend AMDGPUPerfHintAnalysis;
+
+public:
+ AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
+ const TargetLowering *TLI_)
+ : FIM(FIM_), DL(nullptr), TLI(TLI_) {}
+
+ void runOnFunction(Function &F);
+
+private:
+ struct MemAccessInfo {
+ const Value *V;
+ const Value *Base;
+ int64_t Offset;
+ MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
+ bool isLargeStride(MemAccessInfo &Reference) const;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ Printable print() const {
+ return Printable([this](raw_ostream &OS) {
+ OS << "Value: " << *V << '\n'
+ << "Base: " << *Base << " Offset: " << Offset << '\n';
+ });
+ }
+#endif
+ };
+
+ MemAccessInfo makeMemAccessInfo(Instruction *) const;
+
+ MemAccessInfo LastAccess; // Last memory access info
+
+ AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
+
+ const DataLayout *DL;
+
+ AMDGPUAS AS;
+
+ const TargetLowering *TLI;
+
+ void visit(const Function &F);
+ static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+ static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+
+ bool isIndirectAccess(const Instruction *Inst) const;
+
+ /// Check if the instruction is large stride.
+ /// The purpose is to identify memory access pattern like:
+ /// x = a[i];
+ /// y = a[i+1000];
+ /// z = a[i+2000];
+ /// In the above example, the second and third memory access will be marked
+ /// large stride memory access.
+ bool isLargeStride(const Instruction *Inst);
+
+ bool isGlobalAddr(const Value *V) const;
+ bool isLocalAddr(const Value *V) const;
+ bool isConstantAddr(const Value *V) const;
+};
+
+static const Value *getMemoryInstrPtr(const Instruction *Inst) {
+ if (auto LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getPointerOperand();
+ }
+ if (auto SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->getPointerOperand();
+ }
+ if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return AI->getPointerOperand();
+ }
+ if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
+ return AI->getPointerOperand();
+ }
+ if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
+ return MI->getRawDest();
+ }
+
+ return nullptr;
+}
+
+bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
+ LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
+ SmallSet<const Value *, 32> WorkSet;
+ SmallSet<const Value *, 32> Visited;
+ if (const Value *MO = getMemoryInstrPtr(Inst)) {
+ if (isGlobalAddr(MO))
+ WorkSet.insert(MO);
+ }
+
+ while (!WorkSet.empty()) {
+ const Value *V = *WorkSet.begin();
+ WorkSet.erase(*WorkSet.begin());
+ if (!Visited.insert(V).second)
+ continue;
+ LLVM_DEBUG(dbgs() << " check: " << *V << '\n');
+
+ if (auto LD = dyn_cast<LoadInst>(V)) {
+ auto M = LD->getPointerOperand();
+ if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
+ LLVM_DEBUG(dbgs() << " is IA\n");
+ return true;
+ }
+ continue;
+ }
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
+ auto P = GEP->getPointerOperand();
+ WorkSet.insert(P);
+ for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
+ WorkSet.insert(GEP->getOperand(I));
+ continue;
+ }
+
+ if (auto U = dyn_cast<UnaryInstruction>(V)) {
+ WorkSet.insert(U->getOperand(0));
+ continue;
+ }
+
+ if (auto BO = dyn_cast<BinaryOperator>(V)) {
+ WorkSet.insert(BO->getOperand(0));
+ WorkSet.insert(BO->getOperand(1));
+ continue;
+ }
+
+ if (auto S = dyn_cast<SelectInst>(V)) {
+ WorkSet.insert(S->getFalseValue());
+ WorkSet.insert(S->getTrueValue());
+ continue;
+ }
+
+ if (auto E = dyn_cast<ExtractElementInst>(V)) {
+ WorkSet.insert(E->getVectorOperand());
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " dropped\n");
+ }
+
+ LLVM_DEBUG(dbgs() << " is not IA\n");
+ return false;
+}
+
+void AMDGPUPerfHint::visit(const Function &F) {
+ auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
+ if (!FIP.second)
+ return;
+
+ AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;
+
+ LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
+
+ for (auto &B : F) {
+ LastAccess = MemAccessInfo();
+ for (auto &I : B) {
+ if (getMemoryInstrPtr(&I)) {
+ if (isIndirectAccess(&I))
+ ++FI.IAMInstCount;
+ if (isLargeStride(&I))
+ ++FI.LSMInstCount;
+ ++FI.MemInstCount;
+ ++FI.InstCount;
+ continue;
+ }
+ CallSite CS(const_cast<Instruction *>(&I));
+ if (CS) {
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->isDeclaration()) {
+ ++FI.InstCount;
+ continue;
+ }
+ if (&F == Callee) // Handle immediate recursion
+ continue;
+
+ visit(*Callee);
+ auto Loc = FIM.find(Callee);
+
+ assert(Loc != FIM.end() && "No func info");
+ FI.MemInstCount += Loc->second.MemInstCount;
+ FI.InstCount += Loc->second.InstCount;
+ FI.IAMInstCount += Loc->second.IAMInstCount;
+ FI.LSMInstCount += Loc->second.LSMInstCount;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ TargetLoweringBase::AddrMode AM;
+ auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
+ AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
+ AM.HasBaseReg = !AM.BaseGV;
+ if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
+ GEP->getPointerAddressSpace()))
+ // Offset will likely be folded into load or store
+ continue;
+ ++FI.InstCount;
+ } else {
+ ++FI.InstCount;
+ }
+ }
+ }
+}
+
+void AMDGPUPerfHint::runOnFunction(Function &F) {
+ if (FIM.find(&F) != FIM.end())
+ return;
+
+ const Module &M = *F.getParent();
+ DL = &M.getDataLayout();
+ AS = AMDGPU::getAMDGPUAS(M);
+
+ visit(F);
+ auto Loc = FIM.find(&F);
+
+ assert(Loc != FIM.end() && "No func info");
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
+ << '\n'
+ << " IAMInst: " << Loc->second.IAMInstCount << '\n'
+ << " LSMInst: " << Loc->second.LSMInstCount << '\n'
+ << " TotalInst: " << Loc->second.InstCount << '\n');
+
+ auto &FI = Loc->second;
+
+ if (isMemBound(FI)) {
+ LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
+ NumMemBound++;
+ }
+
+ if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
+ LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
+ NumLimitWave++;
+ }
+}
+
+bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+}
+
+bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
+ FI.LSMInstCount * LSWeight) *
+ 100 / FI.InstCount) > LimitWaveThresh;
+}
+
+bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType())) {
+ unsigned As = PT->getAddressSpace();
+ // Flat likely points to global too.
+ return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
+ }
+ return false;
+}
+
+bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType()))
+ return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
+ return false;
+}
+
+bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
+ LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
+
+ MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
+ bool IsLargeStride = MAI.isLargeStride(LastAccess);
+ if (MAI.Base)
+ LastAccess = std::move(MAI);
+
+ return IsLargeStride;
+}
+
+AMDGPUPerfHint::MemAccessInfo
+AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
+ MemAccessInfo MAI;
+ const Value *MO = getMemoryInstrPtr(Inst);
+
+ LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
+ // Do not treat local-addr memory access as large stride.
+ if (isLocalAddr(MO))
+ return MAI;
+
+ MAI.V = MO;
+ MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
+ return MAI;
+}
+
+bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
+ if (auto PT = dyn_cast<PointerType>(V->getType())) {
+ unsigned As = PT->getAddressSpace();
+ return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT;
+ }
+ return false;
+}
+
+bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
+ MemAccessInfo &Reference) const {
+
+ if (!Base || !Reference.Base || Base != Reference.Base)
+ return false;
+
+ uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
+ : Reference.Offset - Offset;
+ bool Result = Diff > LargeStrideThresh;
+ LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
+ << print() << "<=>\n"
+ << Reference.print() << "Result:" << Result << '\n');
+ return Result;
+}
+} // namespace
+
+bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
+
+ AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
+ Analyzer.runOnFunction(F);
+ return false;
+}
+
+bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
+ auto FI = FIM.find(F);
+ if (FI == FIM.end())
+ return false;
+
+ return AMDGPUPerfHint::isMemBound(FI->second);
+}
+
+bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
+ auto FI = FIM.find(F);
+ if (FI == FIM.end())
+ return false;
+
+ return AMDGPUPerfHint::needLimitWave(FI->second);
+}
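
To make the two thresholds above concrete, consider a hypothetical function with 200 instructions, 40 of them memory accesses, 2 indirect and 1 large-stride. Under the default option values (both thresholds 50, both weights 1000) it is not memory bound but does get the wave limiter. A small self-contained check of the arithmetic:

    #include <cstdio>

    int main() {
      unsigned MemInst = 40, Inst = 200, IAMInst = 2, LSMInst = 1;
      unsigned IAWeight = 1000, LSWeight = 1000;     // default option values
      bool MemBound  = MemInst * 100 / Inst > 50;    // 40*100/200 = 20   -> false
      bool LimitWave = (MemInst + IAMInst * IAWeight + LSMInst * LSWeight)
                           * 100 / Inst > 50;        // 3040*100/200 = 1520 -> true
      std::printf("mem bound: %d, limit wave: %d\n", MemBound, LimitWave);
      return 0;
    }
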
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
new file mode 100644
index 000000000000..be7f37cb6815
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -0,0 +1,55 @@
+//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes whether a function is potentially memory bound and whether a
+/// kernel may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+struct AMDGPUPerfHintAnalysis : public FunctionPass {
+ static char ID;
+
+public:
+ AMDGPUPerfHintAnalysis() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool isMemoryBound(const Function *F) const;
+
+ bool needsWaveLimiter(const Function *F) const;
+
+ struct FuncInfo {
+ unsigned MemInstCount;
+ unsigned InstCount;
+ unsigned IAMInstCount; // Indirect access memory instruction count
+ unsigned LSMInstCount; // Large stride memory instruction count
+ FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
+ LSMInstCount(0) {}
+ };
+
+ typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
+
+private:
+
+ FuncInfoMap FIM;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
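
A later pass would consume this analysis through the usual legacy pass-manager plumbing: declare the dependency in getAnalysisUsage and query the two accessors. A minimal sketch, assuming the surrounding FunctionPass boilerplate and namespace; the consumer pass itself is hypothetical and not part of this change:

    // Hypothetical consumer pass; only the query pattern comes from the header above.
    struct UsesPerfHints : public FunctionPass {
      static char ID;
      UsesPerfHints() : FunctionPass(ID) {}

      void getAnalysisUsage(AnalysisUsage &AU) const override {
        AU.addRequired<AMDGPUPerfHintAnalysis>();
        AU.setPreservesAll();
      }

      bool runOnFunction(Function &F) override {
        auto &PHA = getAnalysis<AMDGPUPerfHintAnalysis>();
        if (PHA.isMemoryBound(&F) || PHA.needsWaveLimiter(&F)) {
          // React to the hints, e.g. record an attribute for later codegen.
        }
        return false;
      }
    };
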
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 41876ed45c8c..d341fec6296f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -65,6 +65,11 @@ using namespace llvm;
namespace {
+static cl::opt<bool> DisablePromoteAllocaToVector(
+ "disable-promote-alloca-to-vector",
+ cl::desc("Disable promote alloca to vector"),
+ cl::init(false));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@@ -147,7 +152,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!ST.isPromoteAllocaEnabled())
return false;
@@ -169,8 +174,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
- *Builder.GetInsertBlock()->getParent());
+ const Function &F = *Builder.GetInsertBlock()->getParent();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!IsAMDHSA) {
Function *LocalSizeYFn
@@ -256,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
}
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
- *Builder.GetInsertBlock()->getParent());
+ const AMDGPUSubtarget &ST =
+ AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
switch (N) {
@@ -318,18 +323,19 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
static bool canVectorizeInst(Instruction *Inst, User *User) {
switch (Inst->getOpcode()) {
case Instruction::Load: {
+ // Currently only handle the case where the Pointer Operand is a GEP.
+    // Also, volatile and atomic loads cannot be vectorized.
LoadInst *LI = cast<LoadInst>(Inst);
- // Currently only handle the case where the Pointer Operand is a GEP so check for that case.
- return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile();
+ return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
}
case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
// Must be the stored pointer operand, not a stored value, plus
// since it should be canonical form, the User should be a GEP.
+    // Also, volatile and atomic stores cannot be vectorized.
StoreInst *SI = cast<StoreInst>(Inst);
- return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile();
+ return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
}
default:
return false;
@@ -337,19 +343,25 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
+
+ if (DisablePromoteAllocaToVector) {
+ LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
+ return false;
+ }
+
ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
- DEBUG(dbgs() << "Alloca candidate for vectorization\n");
+ LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
// could also be promoted but we don't currently handle this case
if (!AllocaTy ||
- AllocaTy->getNumElements() > 4 ||
+ AllocaTy->getNumElements() > 16 ||
AllocaTy->getNumElements() < 2 ||
!VectorType::isValidElementType(AllocaTy->getElementType())) {
- DEBUG(dbgs() << " Cannot convert type to vector\n");
+ LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
@@ -370,7 +382,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
if (!Index) {
- DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
+ LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
+ << '\n');
return false;
}
@@ -385,8 +398,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
- DEBUG(dbgs() << " Converting alloca to vector "
- << *AllocaTy << " -> " << *VectorTy << '\n');
+ LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
+ << *VectorTy << '\n');
for (Value *V : WorkList) {
Instruction *Inst = cast<Instruction>(V);
@@ -443,7 +456,8 @@ static bool isCallPromotable(CallInst *CI) {
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
case Intrinsic::objectsize:
return true;
default:
@@ -475,7 +489,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
// important part is both must have the same address space at
// the end.
if (OtherObj != BaseAlloca) {
- DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+ LLVM_DEBUG(
+ dbgs() << "Found a binary instruction with another alloca object\n");
return false;
}
@@ -588,7 +603,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
FunctionType *FTy = F.getFunctionType();
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
@@ -597,8 +612,8 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
LocalMemLimit = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
- "local memory disabled.\n");
+ LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
return false;
}
}
@@ -667,13 +682,12 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
LocalMemLimit = MaxSizeWithWaveCount;
- DEBUG(
- dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
- << " Rounding size to " << MaxSizeWithWaveCount
- << " with a maximum occupancy of " << MaxOccupancy << '\n'
- << " and " << (LocalMemLimit - CurrentLocalMemUsage)
- << " available for promotion\n"
- );
+ LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
+ << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n");
return true;
}
@@ -690,7 +704,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();
- DEBUG(dbgs() << "Trying to promote " << I << '\n');
+ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
if (tryPromoteAllocaToVector(&I, AS))
return true; // Promoted to vector.
@@ -706,7 +720,9 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
case CallingConv::SPIR_KERNEL:
break;
default:
- DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " promote alloca to LDS not supported with calling convention.\n");
return false;
}
@@ -714,8 +730,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (!SufficientLDS)
return false;
- const AMDGPUSubtarget &ST =
- TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();
@@ -735,8 +750,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
NewSize += AllocSize;
if (NewSize > LocalMemLimit) {
- DEBUG(dbgs() << " " << AllocSize
- << " bytes of local memory not available to promote\n");
+ LLVM_DEBUG(dbgs() << " " << AllocSize
+ << " bytes of local memory not available to promote\n");
return false;
}
@@ -745,11 +760,11 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
std::vector<Value*> WorkList;
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
- DEBUG(dbgs() << " Do not know how to convert all uses\n");
+ LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
return false;
}
- DEBUG(dbgs() << "Promoting alloca to local memory\n");
+ LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
Function *F = I.getParent()->getParent();
@@ -843,31 +858,32 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
continue;
case Intrinsic::memcpy: {
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
- Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
- MemCpy->getLength(), MemCpy->getAlignment(),
- MemCpy->isVolatile());
+ Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
+ MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
+ MemCpy->getLength(), MemCpy->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memmove: {
MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
- Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
- MemMove->getLength(), MemMove->getAlignment(),
- MemMove->isVolatile());
+ Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
+ MemMove->getRawSource(), MemMove->getSourceAlignment(),
+ MemMove->getLength(), MemMove->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
- MemSet->getLength(), MemSet->getAlignment(),
+ MemSet->getLength(), MemSet->getDestAlignment(),
MemSet->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
- case Intrinsic::invariant_group_barrier:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
Intr->eraseFromParent();
// FIXME: I think the invariant marker should still theoretically apply,
// but the intrinsics need to be changed to accept pointers with any
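
The memcpy/memmove/memset hunks above track the IRBuilder API change where destination and source alignments are passed separately instead of one shared alignment. A sketch of the new call shapes, assuming an IRBuilder<> and the operand values are already in scope; the alignment values are illustrative only:

    // Assumes IRBuilder<> Builder and Value *Dst, *Src, *Len are in scope.
    Builder.CreateMemCpy(Dst, /*DstAlign=*/4, Src, /*SrcAlign=*/4, Len,
                         /*isVolatile=*/false);
    Builder.CreateMemMove(Dst, /*DstAlign=*/4, Src, /*SrcAlign=*/4, Len,
                          /*isVolatile=*/false);
    // CreateMemSet still takes a single (destination) alignment.
    Builder.CreateMemSet(Dst, Builder.getInt8(0), Len, /*Align=*/4,
                         /*isVolatile=*/false);
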
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index e3df6d9bee88..012e4fe200aa 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -14,7 +14,9 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -50,10 +52,38 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
}
-unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A,
- const RegisterBank &B,
- unsigned Size) const {
- return RegisterBankInfo::copyCost(A, B, Size);
+static bool isConstant(const MachineOperand &MO, int64_t &C) {
+ const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MachineInstr *Def = MRI.getVRegDef(MO.getReg());
+ if (!Def)
+ return false;
+
+ if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
+ C = Def->getOperand(1).getCImm()->getSExtValue();
+ return true;
+ }
+
+ if (Def->getOpcode() == AMDGPU::COPY)
+ return isConstant(Def->getOperand(1), C);
+
+ return false;
+}
+
+unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
+ const RegisterBank &Src,
+ unsigned Size) const {
+ if (Dst.getID() == AMDGPU::SGPRRegBankID &&
+ Src.getID() == AMDGPU::VGPRRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
+  // An SGPRRegBank with size 1 is actually VCC or another 64-bit SGPR written
+  // by the VALU.
+ if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID &&
+ Src.getID() == AMDGPU::SGPRRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
+ return RegisterBankInfo::copyCost(Dst, Src, Size);
}
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
@@ -72,11 +102,11 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
case TargetOpcode::G_LOAD: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// FIXME: Should we be hard coding the size for these mappings?
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
@@ -104,6 +134,42 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
+ case TargetOpcode::G_ICMP: {
+ unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
+ const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&SSMapping);
+
+ const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&SVMapping);
+
+ const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&VSMapping);
+
+ const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ nullptr, // Predicate operand.
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&VVMapping);
+
+ return AltMappings;
+ }
default:
break;
}
@@ -123,6 +189,59 @@ static bool isInstrUniform(const MachineInstr &MI) {
return AMDGPUInstrInfo::isUniformMMO(MMO);
}
+bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
+ unsigned Reg = MI.getOperand(i).getReg();
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+ if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+ }
+ return true;
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ }
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ unsigned OpdIdx = 0;
+
+ unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
+
+ if (MI.getOperand(OpdIdx).isIntrinsicID())
+ OpdsMapping[OpdIdx++] = nullptr;
+
+ unsigned Reg1 = MI.getOperand(OpdIdx).getReg();
+ unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
+ unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI);
+ OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
+
+ for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
+ unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI);
+ OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ }
+
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
+
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
@@ -155,6 +274,22 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
// handle that during instruction selection?
}
+unsigned
+AMDGPURegisterBankInfo::getRegBankID(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Default) const {
+
+ const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
+ return Bank ? Bank->getID() : Default;
+}
+
+///
+/// This function must return a legal mapping, because
+/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
+/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
+/// VGPR-to-SGPR copy to be generated is illegal.
+///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
@@ -166,16 +301,102 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
- bool IsComplete = true;
switch (MI.getOpcode()) {
default:
- IsComplete = false;
+ return getInvalidInstructionMapping();
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_MUL:
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_SHL:
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ // Fall-through
+
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FPTOSI:
+ case AMDGPU::G_FPTOUI:
+ case AMDGPU::G_FMUL:
+ return getDefaultMappingVOP(MI);
+ case AMDGPU::G_IMPLICIT_DEF: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
+ }
+ case AMDGPU::G_FCONSTANT:
case AMDGPU::G_CONSTANT: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_EXTRACT: {
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
+ OpdsMapping[2] = nullptr;
+ break;
+ }
+ case AMDGPU::G_MERGE_VALUES: {
+ unsigned Bank = isSALUMapping(MI) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
+ // Op1 and Dst should use the same register bank.
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
+ OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_BITCAST: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
+ break;
+ }
+ case AMDGPU::G_TRUNC: {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned Bank = getRegBankID(Src, MRI, *TRI);
+ unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_ZEXT: {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
+ unsigned SrcBank = getRegBankID(Src, MRI, *TRI,
+ SrcSize == 1 ? AMDGPU::SGPRRegBankID :
+ AMDGPU::VGPRRegBankID);
+ unsigned DstBank = SrcBank;
+ if (SrcSize == 1) {
+ if (SrcBank == AMDGPU::SGPRRegBankID)
+ DstBank = AMDGPU::VGPRRegBankID;
+ else
+ DstBank = AMDGPU::SGPRRegBankID;
+ }
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize);
+ break;
+ }
+ case AMDGPU::G_FCMP: {
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ break;
+ }
case AMDGPU::G_GEP: {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg())
@@ -204,24 +425,113 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
- case AMDGPU::G_LOAD:
- return getInstrMappingForLoad(MI);
+ case AMDGPU::G_ICMP: {
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID &&
+ Op3Bank == AMDGPU::SGPRRegBankID ?
+ AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID;
+ OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
+ break;
+ }
+
+
+ case AMDGPU::G_EXTRACT_VECTOR_ELT: {
+ unsigned IdxOp = 2;
+ int64_t Imm;
+ // XXX - Do we really need to fully handle these? The constant case should
+ // be legalized away before RegBankSelect?
+
+ unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+ unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
+
+  // The index can be in either bank if the source vector is in a VGPR.
+ OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
+ break;
}
+ case AMDGPU::G_INSERT_VECTOR_ELT: {
+ // XXX - Do we really need to fully handle these? The constant case should
+ // be legalized away before RegBankSelect?
+
+ int64_t Imm;
+
+ unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3;
+ unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+
+
+ // TODO: Can do SGPR indexing, which would obviate the need for the
+ // isConstant check.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
+ }
- if (!IsComplete) {
- unsigned BankID = AMDGPU::SGPRRegBankID;
- unsigned Size = 0;
- for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
- // If the operand is not a register default to the size of the previous
- // operand.
- // FIXME: Can't we pull the types from the MachineInstr rather than the
- // operands.
- if (MI.getOperand(Idx).isReg())
- Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI);
- OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size));
+ break;
+ }
+ case AMDGPU::G_INTRINSIC: {
+ switch (MI.getOperand(1).getIntrinsicID()) {
+ default:
+ return getInvalidInstructionMapping();
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ }
+ break;
+ }
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ switch (MI.getOperand(0).getIntrinsicID()) {
+ default:
+ return getInvalidInstructionMapping();
+ case Intrinsic::amdgcn_exp_compr:
+ OpdsMapping[0] = nullptr; // IntrinsicID
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ // FIXME: Could we support packed types here?
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
+ case Intrinsic::amdgcn_exp:
+ OpdsMapping[0] = nullptr; // IntrinsicID
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ // FIXME: Could we support packed types here?
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ // FIXME: These are immediate values which can't be read from registers.
+ OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
}
+ break;
+ }
+ case AMDGPU::G_LOAD:
+ return getInstrMappingForLoad(MI);
}
+
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}
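
The copyCost override effectively vetoes any mapping that would require a VGPR-to-SGPR copy (or a size-1 SGPR-to-SCC copy): by returning the maximum unsigned value, that alternative can never win the cost comparison RegBankSelect performs in greedy mode. A toy, standalone illustration of why a saturating cost removes an alternative from consideration (this is not RegBankSelect itself):

    #include <limits>
    #include <vector>

    // Each alternative mapping carries an accumulated copy cost.
    struct Alt { const char *Name; unsigned Cost; };

    static const Alt *pickCheapest(const std::vector<Alt> &Alts) {
      const Alt *Best = nullptr;
      for (const Alt &A : Alts)
        if (A.Cost != std::numeric_limits<unsigned>::max() &&
            (!Best || A.Cost < Best->Cost))
          Best = &A;            // "infinite" cost alternatives are never picked
      return Best;
    }
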
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 201fdc1974c6..d48a66589873 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -16,19 +16,15 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "AMDGPUGenRegisterBank.inc"
+#undef GET_REGBANK_DECLARATIONS
+
namespace llvm {
class SIRegisterInfo;
class TargetRegisterInfo;
-namespace AMDGPU {
-enum {
- SGPRRegBankID = 0,
- VGPRRegBankID = 1,
- NumRegisterBanks
-};
-} // End AMDGPU namespace.
-
/// This class provides the information for the target register banks.
class AMDGPUGenRegisterBankInfo : public RegisterBankInfo {
@@ -46,6 +42,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr &MI) const;
+ unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Default = AMDGPU::VGPRRegBankID) const;
+
+ bool isSALUMapping(const MachineInstr &MI) const;
+ const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
+ const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
public:
AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index f4428e56035f..7f7f75f65647 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -14,3 +14,5 @@ def SGPRRegBank : RegisterBank<"SGPR",
def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
>;
+
+def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS ]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 5e4d33aaa691..50f859addc2b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -8,13 +8,15 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
+/// Parent TargetRegisterInfo class common to all hw codegen targets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
@@ -25,7 +27,7 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) {
static const unsigned SubRegs[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
@@ -37,6 +39,13 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
return SubRegs[Channel];
}
+void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
+
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
@@ -75,5 +84,6 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
}
unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ return FuncInfo->getFrameOffsetReg();
}
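
reserveRegisterTuples walks MCRegAliasIterator with IncludeSelf set to true, so reserving one register also reserves every register that overlaps it. A sketch of the typical call pattern in a subclass's getReservedRegs; the real SIRegisterInfo implementation is not part of this diff, so treat the body as illustrative:

    BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
      BitVector Reserved(getNumRegs());
      // Reserving a register tuple also reserves every aliasing sub-register.
      reserveRegisterTuples(Reserved, AMDGPU::EXEC);
      reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
      return Reserved;
    }
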
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index d8604d2590f1..07de5fc549e2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
+/// TargetRegisterInfo interface that is implemented by all hw codegen
/// targets.
//
//===----------------------------------------------------------------------===//
@@ -21,15 +21,19 @@
namespace llvm {
-class AMDGPUSubtarget;
+class GCNSubtarget;
class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
AMDGPURegisterInfo();
+ bool enableMultipleCopyHints() const override { return true; }
+
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
- unsigned getSubRegFromChannel(unsigned Channel) const;
+ static unsigned getSubRegFromChannel(unsigned Channel);
+
+ void reserveRegisterTuples(BitVector &, unsigned Reg) const;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
index 3bbcba826f63..ceabae524414 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -19,5 +19,4 @@ foreach Index = 0-15 in {
}
-include "R600RegisterInfo.td"
include "SIRegisterInfo.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 83e56a9ab495..a861762a8c9e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -249,8 +249,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
SmallVector<Argument *, 4> OutArgs;
for (Argument &Arg : F.args()) {
if (isOutArgumentCandidate(Arg)) {
- DEBUG(dbgs() << "Found possible out argument " << Arg
- << " in function " << F.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg
+ << " in function " << F.getName() << '\n');
OutArgs.push_back(&Arg);
}
}
@@ -310,7 +310,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
SI = dyn_cast<StoreInst>(Q.getInst());
if (SI) {
- DEBUG(dbgs() << "Found out argument store: " << *SI << '\n');
+ LLVM_DEBUG(dbgs() << "Found out argument store: " << *SI << '\n');
ReplaceableStores.emplace_back(RI, SI);
} else {
ThisReplaceable = false;
@@ -328,7 +328,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (llvm::find_if(ValVec,
[OutArg](const std::pair<Argument *, Value *> &Entry) {
return Entry.first == OutArg;}) != ValVec.end()) {
- DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Saw multiple out arg stores" << *OutArg << '\n');
// It is possible to see stores to the same argument multiple times,
// but we expect these would have been optimized out already.
ThisReplaceable = false;
@@ -358,7 +359,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
F.getFunctionType()->params(),
F.isVarArg());
- DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n');
+ LLVM_DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n');
Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage,
F.getName() + ".body");
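
All of the hunks in this file are the mechanical DEBUG to LLVM_DEBUG rename; the macro still compiles to nothing in release builds and is gated at run time by -debug-only=<DEBUG_TYPE>. The usual usage pattern, as a small sketch with a hypothetical pass name:

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    #define DEBUG_TYPE "my-pass"   // hypothetical pass name

    void trace(int Value) {
      // Printed only in asserts builds and only with -debug-only=my-pass.
      LLVM_DEBUG(llvm::dbgs() << "value is " << Value << '\n');
    }
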
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
new file mode 100644
index 000000000000..9dbd7751b4d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -0,0 +1,77 @@
+//===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Resource intrinsics table.
+//===----------------------------------------------------------------------===//
+
+class RsrcIntrinsic<AMDGPURsrcIntrinsic intr> {
+ Intrinsic Intr = !cast<Intrinsic>(intr);
+ bits<8> RsrcArg = intr.RsrcArg;
+ bit IsImage = intr.IsImage;
+}
+
+def RsrcIntrinsics : GenericTable {
+ let FilterClass = "RsrcIntrinsic";
+ let Fields = ["Intr", "RsrcArg", "IsImage"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "lookupRsrcIntrinsic";
+}
+
+foreach intr = !listconcat(AMDGPUBufferIntrinsics,
+ AMDGPUImageDimIntrinsics,
+ AMDGPUImageDimAtomicIntrinsics) in {
+ def : RsrcIntrinsic<!cast<AMDGPURsrcIntrinsic>(intr)>;
+}
+
+class SourceOfDivergence<Intrinsic intr> {
+ Intrinsic Intr = intr;
+}
+
+def SourcesOfDivergence : GenericTable {
+ let FilterClass = "SourceOfDivergence";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "lookupSourceOfDivergence";
+}
+
+def : SourceOfDivergence<int_amdgcn_workitem_id_x>;
+def : SourceOfDivergence<int_amdgcn_workitem_id_y>;
+def : SourceOfDivergence<int_amdgcn_workitem_id_z>;
+def : SourceOfDivergence<int_amdgcn_interp_mov>;
+def : SourceOfDivergence<int_amdgcn_interp_p1>;
+def : SourceOfDivergence<int_amdgcn_interp_p2>;
+def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
+def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
+def : SourceOfDivergence<int_r600_read_tidig_x>;
+def : SourceOfDivergence<int_r600_read_tidig_y>;
+def : SourceOfDivergence<int_r600_read_tidig_z>;
+def : SourceOfDivergence<int_amdgcn_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_ds_fadd>;
+def : SourceOfDivergence<int_amdgcn_ds_fmin>;
+def : SourceOfDivergence<int_amdgcn_ds_fmax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
+
+foreach intr = AMDGPUImageDimAtomicIntrinsics in
+def : SourceOfDivergence<intr>;
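
The GenericTable definitions above drive the SearchableTables TableGen backend, which emits a sorted table plus a lookup function named by PrimaryKeyName. The generated interface can then be consumed roughly as follows; the struct layout and field types here are assumptions rather than a copy of the generated file:

    // Assumed shape of the TableGen-generated interface
    // (AMDGPUGenSearchableTables.inc); exact names and types may differ.
    namespace llvm {
    namespace AMDGPU {
    struct RsrcIntrinsic { unsigned Intr; unsigned RsrcArg; bool IsImage; };
    const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr);
    } // namespace AMDGPU
    } // namespace llvm

    // Client code: decide whether an intrinsic call touches an image resource.
    bool readsImageResource(unsigned IntrinsicID) {
      if (const auto *Info = llvm::AMDGPU::lookupRsrcIntrinsic(IntrinsicID))
        return Info->IsImage;
      return false;
    }
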
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 80feaa44766f..98b49070fa99 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
+/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
@@ -20,8 +20,10 @@
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
@@ -32,12 +34,37 @@ using namespace llvm;
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
+#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#undef AMDGPUSubtarget
+#include "R600GenSubtargetInfo.inc"
-AMDGPUSubtarget::~AMDGPUSubtarget() = default;
+GCNSubtarget::~GCNSubtarget() = default;
+
+R600Subtarget &
+R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS) {
+ SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+ FullFS += FS;
+ ParseSubtargetFeatures(GPU, FullFS);
+
+  // FIXME: I don't think Evergreen has any useful support for
+  // denormals, but this should be checked. Should we issue a warning somewhere
+ // if someone tries to enable these?
+ if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ FP32Denormals = false;
+ }
+
+ HasMulU24 = getGeneration() >= EVERGREEN;
+ HasMulI24 = hasCaymanISA();
+
+ return *this;
+}
-AMDGPUSubtarget &
-AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
+GCNSubtarget &
+GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
@@ -92,26 +119,43 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasMovrel = true;
}
+ HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+
return *this;
}
-AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM)
- : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
+ const FeatureBitset &FeatureBits) :
+ TargetTriple(TT),
+ SubtargetFeatureBits(FeatureBits),
+ Has16BitInsts(false),
+ HasMadMixInsts(false),
+ FP32Denormals(false),
+ FPExceptions(false),
+ HasSDWA(false),
+ HasVOP3PInsts(false),
+ HasMulI24(true),
+ HasMulU24(true),
+ HasFminFmaxLegacy(true),
+ EnablePromoteAlloca(false),
+ LocalMemorySize(0),
+ WavefrontSize(0)
+ { }
+
+GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const GCNTargetMachine &TM) :
+ AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ AMDGPUSubtarget(TT, getFeatureBits()),
TargetTriple(TT),
- Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+ Gen(SOUTHERN_ISLANDS),
IsaVersion(ISAVersion0_0_0),
- WavefrontSize(0),
- LocalMemorySize(0),
LDSBankCount(0),
MaxPrivateElementSize(0),
FastFMAF32(false),
HalfRate64Ops(false),
- FP32Denormals(false),
FP64FP16Denormals(false),
- FPExceptions(false),
DX10Clamp(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
@@ -123,57 +167,56 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
EnableXNACK(false),
TrapHandler(false),
DebuggerInsertNops(false),
- DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),
EnableHugePrivateBuffer(false),
EnableVGPRSpilling(false),
- EnablePromoteAlloca(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
+ EnableDS128(false),
DumpCode(false),
FP64(false),
- FMA(false),
- IsGCN(false),
GCN3Encoding(false),
CIInsts(false),
GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
- Has16BitInsts(false),
HasIntClamp(false),
- HasVOP3PInsts(false),
- HasMadMixInsts(false),
+ HasFmaMixInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
+ HasScalarAtomics(false),
HasInv2PiInlineImm(false),
- HasSDWA(false),
HasSDWAOmod(false),
HasSDWAScalar(false),
HasSDWASdst(false),
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
+ HasDLInsts(false),
+ D16PreservesUnusedBits(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
FlatScratchInsts(false),
AddNoCarryInsts(false),
+ HasUnpackedD16VMem(false),
- R600ALUInst(false),
- CaymanISA(false),
- CFALUBug(false),
- HasVertexCache(false),
- TexVTXClauseSize(0),
ScalarizeGlobal(false),
FeatureDisable(false),
- InstrItins(getInstrItineraryForCPU(GPU)) {
+ InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
+ TLInfo(TM, *this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
AS = AMDGPU::getAMDGPUAS(TT);
- initializeSubtargetDependencies(TT, GPU, FS);
+ CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
+ RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ InstSelector.reset(new AMDGPUInstructionSelector(
+ *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
@@ -198,6 +241,12 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
return NumWaves;
}
+unsigned
+AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
+}
+
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
switch (CC) {
@@ -357,27 +406,64 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
return true;
}
-R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM) :
- AMDGPUSubtarget(TT, GPU, FS, TM),
- InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {}
+uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
+ unsigned &MaxAlign) const {
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);
-SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM)
- : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {
- CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
- Legalizer.reset(new AMDGPULegalizerInfo());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ uint64_t ExplicitArgBytes = 0;
+ MaxAlign = 1;
- RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
- InstSelector.reset(new AMDGPUInstructionSelector(
- *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
+ for (const Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+
+ unsigned Align = DL.getABITypeAlignment(ArgTy);
+ uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+ ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
+ MaxAlign = std::max(MaxAlign, Align);
+ }
+
+ return ExplicitArgBytes;
}
-void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
+ unsigned &MaxAlign) const {
+ uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
+
+ unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
+
+ uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
+ unsigned ImplicitBytes = getImplicitArgNumBytes(F);
+ if (ImplicitBytes != 0) {
+ unsigned Alignment = getAlignmentForImplicitArgPtr();
+ TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+ }
+
+ // Being able to dereference past the end is useful for emitting scalar loads.
+ return alignTo(TotalSize, 4);
+}
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ R600GenSubtargetInfo(TT, GPU, FS),
+ AMDGPUSubtarget(TT, getFeatureBits()),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ FMA(false),
+ CaymanISA(false),
+ CFALUBug(false),
+ DX10Clamp(false),
+ HasVertexCache(false),
+ R600ALUInst(false),
+ FP64(false),
+ TexVTXClauseSize(0),
+ Gen(R600),
+ TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
+ InstrItins(getInstrItineraryForCPU(GPU)),
+ AS (AMDGPU::getAMDGPUAS(TT)) { }
+
+void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
// Track register pressure so the scheduler can try to decrease
// pressure once register usage is above the threshold defined by
@@ -394,22 +480,12 @@ void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
-bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
+bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
-unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplicitArgBytes) const {
- unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
- if (ImplicitBytes == 0)
- return ExplicitArgBytes;
-
- unsigned Alignment = getAlignmentForImplicitArgPtr();
- return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
-}
-
-unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
- if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+ if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
return 10;
if (SGPRs <= 88)
@@ -431,7 +507,7 @@ unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
return 5;
}
-unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
if (VGPRs <= 24)
return 10;
if (VGPRs <= 28)
@@ -453,7 +529,7 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
return 1;
}
-unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
if (MFI.hasFlatScratchInit()) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
@@ -467,7 +543,7 @@ unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
-unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -517,7 +593,7 @@ unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
MaxAddressableNumSGPRs);
}
-unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -532,10 +608,6 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
unsigned Requested = AMDGPU::getIntegerAttribute(
F, "amdgpu-num-vgpr", MaxNumVGPRs);
- // Make sure requested value does not violate subtarget's specifications.
- if (Requested && Requested <= getReservedNumVGPRs(MF))
- Requested = 0;
-
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
@@ -548,7 +620,7 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
MaxNumVGPRs = Requested;
}
- return MaxNumVGPRs - getReservedNumVGPRs(MF);
+ return MaxNumVGPRs;
}
namespace {
@@ -602,7 +674,21 @@ struct MemOpClusterMutation : ScheduleDAGMutation {
};
} // namespace
-void SISubtarget::getPostRAMutations(
+void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
+
+const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
+ if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
+ else
+ return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
+}
+
+const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
+ else
+ return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index cf4a691d4b58..623109733651 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -8,7 +8,7 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// \brief AMDGPU specific subclass of TargetSubtarget.
+/// AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
@@ -23,7 +23,6 @@
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
@@ -40,24 +39,216 @@
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
+#define GET_SUBTARGETINFO_HEADER
+#include "R600GenSubtargetInfo.inc"
namespace llvm {
class StringRef;
-class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+class AMDGPUSubtarget {
public:
enum Generation {
R600 = 0,
- R700,
- EVERGREEN,
- NORTHERN_ISLANDS,
- SOUTHERN_ISLANDS,
- SEA_ISLANDS,
- VOLCANIC_ISLANDS,
- GFX9,
+ R700 = 1,
+ EVERGREEN = 2,
+ NORTHERN_ISLANDS = 3,
+ SOUTHERN_ISLANDS = 4,
+ SEA_ISLANDS = 5,
+ VOLCANIC_ISLANDS = 6,
+ GFX9 = 7
};
+private:
+ Triple TargetTriple;
+
+protected:
+ const FeatureBitset &SubtargetFeatureBits;
+ bool Has16BitInsts;
+ bool HasMadMixInsts;
+ bool FP32Denormals;
+ bool FPExceptions;
+ bool HasSDWA;
+ bool HasVOP3PInsts;
+ bool HasMulI24;
+ bool HasMulU24;
+ bool HasFminFmaxLegacy;
+ bool EnablePromoteAlloca;
+ int LocalMemorySize;
+ unsigned WavefrontSize;
+
+public:
+ AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits);
+
+ static const AMDGPUSubtarget &get(const MachineFunction &MF);
+ static const AMDGPUSubtarget &get(const TargetMachine &TM,
+ const Function &F);
+
+ /// \returns Default range flat work group size for a calling convention.
+ std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
+ /// for function \p F, or minimum/maximum flat work group sizes explicitly
+ /// requested using "amdgpu-flat-work-group-size" attribute attached to
+ /// function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, or violate subtarget's specifications.
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum number of waves per
+ /// execution unit for function \p F, or minimum/maximum number of waves per
+ /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
+ /// attached to function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, violate subtarget's specifications, or are not
+ /// compatible with minimum/maximum number of waves limited by flat work group
+ /// size, register usage, and/or lds usage.
+ std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
+ /// Return the amount of LDS that can be used that will not restrict the
+ /// occupancy lower than WaveCount.
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
+
+ /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount if
+ /// the given LDS memory size is the only constraint.
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+
+ unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
+
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
+ }
+
+ bool isAmdPalOS() const {
+ return TargetTriple.getOS() == Triple::AMDPAL;
+ }
+
+ bool isMesa3DOS() const {
+ return TargetTriple.getOS() == Triple::Mesa3D;
+ }
+
+ bool isMesaKernel(const Function &F) const {
+ return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
+ }
+
+ bool isAmdCodeObjectV2(const Function &F) const {
+ return isAmdHsaOS() || isMesaKernel(F);
+ }
+
+ bool has16BitInsts() const {
+ return Has16BitInsts;
+ }
+
+ bool hasMadMixInsts() const {
+ return HasMadMixInsts;
+ }
+
+ bool hasFP32Denormals() const {
+ return FP32Denormals;
+ }
+
+ bool hasFPExceptions() const {
+ return FPExceptions;
+ }
+
+ bool hasSDWA() const {
+ return HasSDWA;
+ }
+
+ bool hasVOP3PInsts() const {
+ return HasVOP3PInsts;
+ }
+
+ bool hasMulI24() const {
+ return HasMulI24;
+ }
+
+ bool hasMulU24() const {
+ return HasMulU24;
+ }
+
+ bool hasFminFmaxLegacy() const {
+ return HasFminFmaxLegacy;
+ }
+
+ bool isPromoteAllocaEnabled() const {
+ return EnablePromoteAlloca;
+ }
+
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
+ }
+
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
+ }
+
+ unsigned getAlignmentForImplicitArgPtr() const {
+ return isAmdHsaOS() ? 8 : 4;
+ }
+
+ /// Returns the offset in bytes from the start of the input buffer
+ /// of the first explicit kernel argument.
+ unsigned getExplicitKernelArgOffset(const Function &F) const {
+ return isAmdCodeObjectV2(F) ? 0 : 36;
+ }
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits,
+ FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits);
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits);
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits,
+ FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits);
+ }
+
+ unsigned getMaxWavesPerEU() const { return 10; }
+
+ /// Creates value range metadata on a workitemid.* intrinsic call or load.
+ bool makeLIDRangeMetadata(Instruction *I) const;
+
+ /// \returns Number of bytes of arguments that are passed to a shader or
+ /// kernel in addition to the explicit ones declared for the function.
+ unsigned getImplicitArgNumBytes(const Function &F) const {
+ if (isMesaKernel(F))
+ return 16;
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+ }
+ uint64_t getExplicitKernArgSize(const Function &F,
+ unsigned &MaxAlign) const;
+ unsigned getKernArgSegmentSize(const Function &F,
+ unsigned &MaxAlign) const;
+
+ virtual ~AMDGPUSubtarget() {}
+};
+
+class GCNSubtarget : public AMDGPUGenSubtargetInfo,
+ public AMDGPUSubtarget {
+public:
enum {
ISAVersion0_0_0,
ISAVersion6_0_0,
@@ -67,13 +258,14 @@ public:
ISAVersion7_0_2,
ISAVersion7_0_3,
ISAVersion7_0_4,
- ISAVersion8_0_0,
ISAVersion8_0_1,
ISAVersion8_0_2,
ISAVersion8_0_3,
ISAVersion8_1_0,
ISAVersion9_0_0,
- ISAVersion9_0_2
+ ISAVersion9_0_2,
+ ISAVersion9_0_4,
+ ISAVersion9_0_6,
};
enum TrapHandlerAbi {
@@ -96,13 +288,18 @@ public:
LLVMTrapHandlerRegValue = 1
};
+private:
+ /// GlobalISel related APIs.
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
protected:
// Basic subtarget description.
Triple TargetTriple;
- Generation Gen;
+ unsigned Gen;
unsigned IsaVersion;
- unsigned WavefrontSize;
- int LocalMemorySize;
int LDSBankCount;
unsigned MaxPrivateElementSize;
@@ -111,9 +308,7 @@ protected:
bool HalfRate64Ops;
// Dynamically set bits that enable features.
- bool FP32Denormals;
bool FP64FP16Denormals;
- bool FPExceptions;
bool DX10Clamp;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
@@ -124,47 +319,48 @@ protected:
bool EnableXNACK;
bool TrapHandler;
bool DebuggerInsertNops;
- bool DebuggerReserveRegs;
bool DebuggerEmitPrologue;
// Used as options.
bool EnableHugePrivateBuffer;
bool EnableVGPRSpilling;
- bool EnablePromoteAlloca;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
+ bool EnableDS128;
bool DumpCode;
// Subtarget properties statically set by tablegen
bool FP64;
bool FMA;
+ bool MIMG_R128;
bool IsGCN;
bool GCN3Encoding;
bool CIInsts;
bool GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
- bool Has16BitInsts;
bool HasIntClamp;
- bool HasVOP3PInsts;
- bool HasMadMixInsts;
+ bool HasFmaMixInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
+ bool HasScalarAtomics;
bool HasInv2PiInlineImm;
- bool HasSDWA;
bool HasSDWAOmod;
bool HasSDWAScalar;
bool HasSDWASdst;
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
+ bool HasDLInsts;
+ bool D16PreservesUnusedBits;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool AddNoCarryInsts;
+ bool HasUnpackedD16VMem;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -175,67 +371,68 @@ protected:
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
- InstrItineraryData InstrItins;
SelectionDAGTargetInfo TSInfo;
AMDGPUAS AS;
+private:
+ SIInstrInfo InstrInfo;
+ SITargetLowering TLInfo;
+ SIFrameLowering FrameLowering;
public:
- AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM);
- ~AMDGPUSubtarget() override;
+ GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const GCNTargetMachine &TM);
+ ~GCNSubtarget() override;
- AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
+ GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
- const AMDGPUInstrInfo *getInstrInfo() const override = 0;
- const AMDGPUFrameLowering *getFrameLowering() const override = 0;
- const AMDGPUTargetLowering *getTargetLowering() const override = 0;
- const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
+ const SIInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
- const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
+ const SIFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
}
- // Nothing implemented, just prevent crashes on use.
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ const SITargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ const SIRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
- bool isAmdHsaOS() const {
- return TargetTriple.getOS() == Triple::AMDHSA;
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
}
- bool isMesa3DOS() const {
- return TargetTriple.getOS() == Triple::Mesa3D;
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
}
- bool isOpenCLEnv() const {
- return TargetTriple.getEnvironment() == Triple::OpenCL ||
- TargetTriple.getEnvironmentName() == "amdgizcl";
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
}
- bool isAmdPalOS() const {
- return TargetTriple.getOS() == Triple::AMDPAL;
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
}
- Generation getGeneration() const {
- return Gen;
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
}
- unsigned getWavefrontSize() const {
- return WavefrontSize;
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return (Generation)Gen;
}
unsigned getWavefrontSizeLog2() const {
return Log2_32(WavefrontSize);
}
- int getLocalMemorySize() const {
- return LocalMemorySize;
- }
-
int getLDSBankCount() const {
return LDSBankCount;
}
@@ -248,19 +445,19 @@ public:
return AS;
}
- bool has16BitInsts() const {
- return Has16BitInsts;
- }
-
bool hasIntClamp() const {
return HasIntClamp;
}
- bool hasVOP3PInsts() const {
- return HasVOP3PInsts;
+ bool hasFP64() const {
+ return FP64;
}
- bool hasFP64() const {
+ bool hasMIMG_R128() const {
+ return MIMG_R128;
+ }
+
+ bool hasHWFP64() const {
return FP64;
}
@@ -273,15 +470,15 @@ public:
}
bool hasAddr64() const {
- return (getGeneration() < VOLCANIC_ISLANDS);
+ return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
}
bool hasBFE() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasBFI() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasBFM() const {
@@ -289,62 +486,31 @@ public:
}
bool hasBCNT(unsigned Size) const {
- if (Size == 32)
- return (getGeneration() >= EVERGREEN);
-
- if (Size == 64)
- return (getGeneration() >= SOUTHERN_ISLANDS);
-
- return false;
- }
-
- bool hasMulU24() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasMulI24() const {
- return (getGeneration() >= SOUTHERN_ISLANDS ||
- hasCaymanISA());
+ return true;
}
bool hasFFBL() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasFFBH() const {
- return (getGeneration() >= EVERGREEN);
+ return true;
}
bool hasMed3_16() const {
- return getGeneration() >= GFX9;
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
}
bool hasMin3Max3_16() const {
- return getGeneration() >= GFX9;
- }
-
- bool hasMadMixInsts() const {
- return HasMadMixInsts;
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasSBufferLoadStoreAtomicDwordxN() const {
- // Only use the "x1" variants on GFX9 or don't use the buffer variants.
- // For x2 and higher variants, if the accessed region spans 2 VM pages and
- // the second page is unmapped, the hw hangs.
- // TODO: There is one future GFX9 chip that doesn't have this bug.
- return getGeneration() != GFX9;
+ bool hasFmaMixInsts() const {
+ return HasFmaMixInsts;
}
bool hasCARRY() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasBORROW() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasCaymanISA() const {
- return CaymanISA;
+ return true;
}
bool hasFMA() const {
@@ -359,10 +525,6 @@ public:
return EnableHugePrivateBuffer;
}
- bool isPromoteAllocaEnabled() const {
- return EnablePromoteAlloca;
- }
-
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
}
@@ -376,23 +538,10 @@ public:
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
- /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
- /// the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
-
- unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
- const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
- }
-
bool hasFP16Denormals() const {
return FP64FP16Denormals;
}
- bool hasFP32Denormals() const {
- return FP32Denormals;
- }
-
bool hasFP64Denormals() const {
return FP64FP16Denormals;
}
@@ -401,10 +550,6 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasFPExceptions() const {
- return FPExceptions;
- }
-
bool enableDX10Clamp() const {
return DX10Clamp;
}
@@ -417,6 +562,12 @@ public:
return FlatForGlobal;
}
+ /// \returns If target supports ds_read/write_b128 and user enables generation
+ /// of ds_read/write_b128.
+ bool useDS128() const {
+ return CIInsts && EnableDS128;
+ }
+
/// \returns If MUBUF instructions always perform range checking, even for
/// buffer resources used for private memory access.
bool privateMemoryResourceIsRangeChecked() const {
@@ -440,7 +591,7 @@ public:
}
bool hasApertureRegs() const {
- return HasApertureRegs;
+ return HasApertureRegs;
}
bool isTrapHandlerEnabled() const {
@@ -467,6 +618,10 @@ public:
return FlatScratchInsts;
}
+ bool hasFlatLgkmVMemCountInOrder() const {
+ return getGeneration() > GFX9;
+ }
+
bool hasD16LoadStore() const {
return getGeneration() >= GFX9;
}
@@ -481,31 +636,19 @@ public:
return AddNoCarryInsts;
}
- bool isMesaKernel(const MachineFunction &MF) const {
- return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv());
+ bool hasUnpackedD16VMem() const {
+ return HasUnpackedD16VMem;
}
// Covers VS/PS/CS graphics shaders
- bool isMesaGfxShader(const MachineFunction &MF) const {
- return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv());
- }
-
- bool isAmdCodeObjectV2(const MachineFunction &MF) const {
- return isAmdHsaOS() || isMesaKernel(MF);
+ bool isMesaGfxShader(const Function &F) const {
+ return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
}
bool hasMad64_32() const {
return getGeneration() >= SEA_ISLANDS;
}
- bool hasFminFmaxLegacy() const {
- return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- }
-
- bool hasSDWA() const {
- return HasSDWA;
- }
-
bool hasSDWAOmod() const {
return HasSDWAOmod;
}
@@ -526,29 +669,28 @@ public:
return HasSDWAOutModsVOPC;
}
- /// \brief Returns the offset in bytes from the start of the input buffer
- /// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
- return isAmdCodeObjectV2(MF) ? 0 : 36;
+ bool vmemWriteNeedsExpWaitcnt() const {
+ return getGeneration() < SEA_ISLANDS;
}
- unsigned getAlignmentForImplicitArgPtr() const {
- return isAmdHsaOS() ? 8 : 4;
+ bool hasDLInsts() const {
+ return HasDLInsts;
}
- unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
- if (isMesaKernel(MF))
- return 16;
- if (isAmdHsaOS() && isOpenCLEnv())
- return 32;
- return 0;
+ bool d16PreservesUnusedBits() const {
+ return D16PreservesUnusedBits;
}
// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspecive of an arbitrary workitem, this
// is 4-byte aligned.
+ //
+ // Only 4-byte alignment is really needed to access anything. Transformations
+ // on the pointer value itself may rely on the alignment / known low bits of
+ // the pointer. Set this to something above the minimum to avoid needing
+ // dynamic realignment in common cases.
unsigned getStackAlignment() const {
- return 4;
+ return 16;
}
bool enableMachineScheduler() const override {
@@ -559,184 +701,43 @@ public:
return true;
}
- void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
- bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
/// \returns Number of execution units per compute unit supported by the
/// subtarget.
unsigned getEUsPerCU() const {
- return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits());
- }
-
- /// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(),
- FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerCU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits());
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(),
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(),
FlatWorkGroupSize);
}
- /// \returns Minimum number of waves per execution unit supported by the
- /// subtarget.
- unsigned getMinWavesPerEU() const {
- return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits());
- }
-
/// \returns Maximum number of waves per execution unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerEU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits());
- }
-
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(),
- FlatWorkGroupSize);
- }
-
- /// \returns Minimum flat work group size supported by the subtarget.
- unsigned getMinFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits());
- }
-
- /// \returns Maximum flat work group size supported by the subtarget.
- unsigned getMaxFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits());
+ return AMDGPU::IsaInfo::getMaxWavesPerEU();
}
/// \returns Number of waves per work group supported by the subtarget and
/// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(),
- FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getWavesPerWorkGroup(
+ MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize);
}
- /// \returns Default range flat work group size for a calling convention.
- std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
-
- /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
- /// for function \p F, or minimum/maximum flat work group sizes explicitly
- /// requested using "amdgpu-flat-work-group-size" attribute attached to
- /// function \p F.
- ///
- /// \returns Subtarget's default values if explicitly requested values cannot
- /// be converted to integer, or violate subtarget's specifications.
- std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
-
- /// \returns Subtarget's default pair of minimum/maximum number of waves per
- /// execution unit for function \p F, or minimum/maximum number of waves per
- /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
- /// attached to function \p F.
- ///
- /// \returns Subtarget's default values if explicitly requested values cannot
- /// be converted to integer, violate subtarget's specifications, or are not
- /// compatible with minimum/maximum number of waves limited by flat work group
- /// size, register usage, and/or lds usage.
- std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
-
- /// Creates value range metadata on an workitemid.* inrinsic call or load.
- bool makeLIDRangeMetadata(Instruction *I) const;
-};
-
-class R600Subtarget final : public AMDGPUSubtarget {
-private:
- R600InstrInfo InstrInfo;
- R600FrameLowering FrameLowering;
- R600TargetLowering TLInfo;
-
-public:
- R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const TargetMachine &TM);
-
- const R600InstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
-
- const R600FrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const R600TargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const R600RegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
-
- bool hasCFAluBug() const {
- return CFALUBug;
- }
-
- bool hasVertexCache() const {
- return HasVertexCache;
- }
-
- short getTexVTXClauseSize() const {
- return TexVTXClauseSize;
- }
-};
-
-class SISubtarget final : public AMDGPUSubtarget {
-private:
- SIInstrInfo InstrInfo;
- SIFrameLowering FrameLowering;
- SITargetLowering TLInfo;
-
- /// GlobalISel related APIs.
- std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
-
-public:
- SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const TargetMachine &TM);
-
- const SIInstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
-
- const SIFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const SITargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-
- const SIRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
+ // static wrappers
+ static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
// XXX - Why is this here if it isn't in the default pass set?
bool enableEarlyIfConversion() const override {
@@ -746,7 +747,7 @@ public:
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
- bool isVGPRSpillingEnabled(const Function& F) const;
+ bool isVGPRSpillingEnabled(const Function &F) const;
unsigned getMaxNumUserSGPRs() const {
return 16;
@@ -776,6 +777,10 @@ public:
return HasScalarStores;
}
+ bool hasScalarAtomics() const {
+ return HasScalarAtomics;
+ }
+
bool hasInv2PiInlineImm() const {
return HasInv2PiInlineImm;
}
@@ -789,18 +794,13 @@ public:
}
bool debuggerSupported() const {
- return debuggerInsertNops() && debuggerReserveRegs() &&
- debuggerEmitPrologue();
+ return debuggerInsertNops() && debuggerEmitPrologue();
}
bool debuggerInsertNops() const {
return DebuggerInsertNops;
}
- bool debuggerReserveRegs() const {
- return DebuggerReserveRegs;
- }
-
bool debuggerEmitPrologue() const {
return DebuggerEmitPrologue;
}
@@ -829,52 +829,61 @@ public:
return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
}
- unsigned getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplictArgBytes) const;
-
- /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+ /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
+ /// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
- /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+ /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
+ /// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ /// \returns true if the machine has merged shaders in which s0-s7 are
+ /// reserved by the hardware and user SGPRs start at s8
+ bool hasMergedShaders() const {
return getGeneration() >= GFX9;
}
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns SGPR encoding granularity supported by the subtarget.
unsigned getSGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Total number of SGPRs supported by the subtarget.
unsigned getTotalNumSGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits());
}
/// \returns Addressable number of SGPRs supported by the subtarget.
unsigned getAddressableNumSGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Minimum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
- return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU,
- Addressable);
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU, Addressable);
}
/// \returns Reserved number of SGPRs for given function \p MF.
@@ -892,39 +901,39 @@ public:
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns VGPR encoding granularity supported by the subtarget.
unsigned getVGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Total number of VGPRs supported by the subtarget.
unsigned getTotalNumVGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits());
}
/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(
+ MCSubtargetInfo::getFeatureBits());
}
/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU);
- }
-
- /// \returns Reserved number of VGPRs for given function \p MF.
- unsigned getReservedNumVGPRs(const MachineFunction &MF) const {
- return debuggerReserveRegs() ? 4 : 0;
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(),
+ WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets number of waves per execution
@@ -942,6 +951,119 @@ public:
const override;
};
+class R600Subtarget final : public R600GenSubtargetInfo,
+ public AMDGPUSubtarget {
+private:
+ R600InstrInfo InstrInfo;
+ R600FrameLowering FrameLowering;
+ bool FMA;
+ bool CaymanISA;
+ bool CFALUBug;
+ bool DX10Clamp;
+ bool HasVertexCache;
+ bool R600ALUInst;
+ bool FP64;
+ short TexVTXClauseSize;
+ Generation Gen;
+ R600TargetLowering TLInfo;
+ InstrItineraryData InstrItins;
+ SelectionDAGTargetInfo TSInfo;
+ AMDGPUAS AS;
+
+public:
+ R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const R600FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const R600TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const R600RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return Gen;
+ }
+
+ unsigned getStackAlignment() const {
+ return 4;
+ }
+
+ R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS);
+
+ bool hasBFE() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBFI() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBCNT(unsigned Size) const {
+ if (Size == 32)
+ return (getGeneration() >= EVERGREEN);
+
+ return false;
+ }
+
+ bool hasBORROW() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCARRY() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCaymanISA() const {
+ return CaymanISA;
+ }
+
+ bool hasFFBL() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFFBH() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFMA() const { return FMA; }
+
+ bool hasCFAluBug() const { return CFALUBug; }
+
+ bool hasVertexCache() const { return HasVertexCache; }
+
+ short getTexVTXClauseSize() const { return TexVTXClauseSize; }
+
+ AMDGPUAS getAMDGPUAS() const { return AS; }
+
+ bool enableMachineScheduler() const override {
+ return true;
+ }
+
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e09263b6fac9..2205819c444f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The AMDGPU target machine contains all of the hardware specific
+/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -40,6 +39,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -79,7 +79,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
cl::init(true),
cl::Hidden);
-// Option to to control global loads scalarization
+// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
@@ -110,12 +110,6 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));
-// Option to enable new waitcnt insertion pass.
-static cl::opt<bool> EnableSIInsertWaitcntsPass(
- "enable-si-insert-waitcnts",
- cl::desc("Use new waitcnt insertion pass"),
- cl::init(true));
-
// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize",
@@ -123,16 +117,23 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool> EnableAMDGPUFunctionCalls(
+static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
"amdgpu-function-calls",
- cl::Hidden,
cl::desc("Enable AMDGPU function call support"),
- cl::init(false));
+ cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
+ cl::init(false),
+ cl::Hidden);
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
- cl::desc("Enable mdgpu library simplifications"),
+ cl::desc("Enable amdgpu library simplifications"),
+ cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnableLowerKernelArguments(
+ "amdgpu-ir-lower-kernel-arguments",
+ cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),
cl::Hidden);
@@ -161,6 +162,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
+ initializeAMDGPULowerKernelArgumentsPass(*PR);
+ initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
@@ -168,7 +171,6 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
- initializeSIInsertWaitsPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
@@ -177,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIFixWWMLivenessPass(*PR);
+ initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
@@ -261,24 +264,15 @@ GCNILPSchedRegistry("gcn-ilp",
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
- return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
+ return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
- "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
LLVM_READNONE
@@ -318,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
-
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
+bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
@@ -413,6 +408,10 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
+
+ // This should run after inlining to have any chance of doing anything,
+ // and before other cleanup optimizations.
+ PM.add(createAMDGPULowerKernelAttributesPass());
});
}
@@ -450,6 +449,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
return I.get();
}
+TargetTransformInfo
+R600TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(R600TTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
@@ -462,7 +466,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
StringRef FS = getFeatureString(F);
@@ -475,7 +479,7 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+ I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
@@ -483,6 +487,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+TargetTransformInfo
+GCNTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(GCNTTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
@@ -572,11 +581,6 @@ public:
} // end anonymous namespace
-TargetTransformInfo
-AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
- return TargetTransformInfo(AMDGPUTTIImpl(this, F));
-}
-
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
if (getOptLevel() == CodeGenOpt::Aggressive)
addPass(createGVNPass());
@@ -585,6 +589,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createLICMPass());
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
@@ -630,7 +635,8 @@ void AMDGPUPassConfig::addIRPasses() {
}
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
- addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ if (TM.getTargetTriple().getArch() == Triple::r600)
+ addPass(createR600OpenCLImageTypeLoweringPass());
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
@@ -673,6 +679,10 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
+ EnableLowerKernelArguments)
+ addPass(createAMDGPULowerKernelArgumentsPass());
+
TargetPassConfig::addCodeGenPrepare();
if (EnableLoadStoreVectorizer)
@@ -740,7 +750,7 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
MachineSchedContext *C) const {
- const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
@@ -783,7 +793,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SILoadStoreOptimizerID);
if (EnableSDWAPeephole) {
addPass(&SIPeepholeSDWAID);
- addPass(&MachineLICMID);
+ addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
@@ -852,6 +862,8 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
@@ -874,6 +886,10 @@ void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
+ addPass(createSIMemoryLegalizerPass());
+ addPass(createSIInsertWaitcntsPass());
+ addPass(createSIShrinkInstructionsPass());
+
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -882,15 +898,12 @@ void GCNPassConfig::addPreEmitPass() {
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
+ //
+ // FIXME: This stand-alone pass will emit individual S_NOP 0, as needed. It would
+ // be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
- if (EnableSIInsertWaitcntsPass)
- addPass(createSIInsertWaitcntsPass());
- else
- addPass(createSIInsertWaitsPass());
- addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
- addPass(createSIMemoryLegalizerPass());
addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 5f9b2a7fca20..0fe14493fabd 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets.
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
//
//===----------------------------------------------------------------------===//
@@ -34,7 +34,6 @@ namespace llvm {
class AMDGPUTargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- AMDGPUIntrinsicInfo IntrinsicInfo;
AMDGPUAS AS;
StringRef getGPUName(const Function &F) const;
@@ -42,6 +41,7 @@ protected:
public:
static bool EnableLateStructurizeCFG;
+ static bool EnableFunctionCalls;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
@@ -49,13 +49,8 @@ public:
CodeGenOpt::Level OL);
~AMDGPUTargetMachine() override;
- const AMDGPUSubtarget *getSubtargetImpl() const;
- const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0;
-
- const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
- return &IntrinsicInfo;
- }
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ const TargetSubtargetInfo *getSubtargetImpl() const;
+ const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override = 0;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
@@ -91,6 +86,8 @@ public:
const R600Subtarget *getSubtargetImpl(const Function &) const override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
bool isMachineVerifierClean() const override {
return false;
}
@@ -102,7 +99,8 @@ public:
class GCNTargetMachine final : public AMDGPUTargetMachine {
private:
- mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
+ AMDGPUIntrinsicInfo IntrinsicInfo;
+ mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap;
public:
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -112,7 +110,13 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const SISubtarget *getSubtargetImpl(const Function &) const override;
+ const GCNSubtarget *getSubtargetImpl(const Function &) const override;
+
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
+ const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
+ return &IntrinsicInfo;
+ }
bool useIPRA() const override {
return true;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index ca6210f69298..dd9dc1a88fc2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the AMDGPU-specific subclass of
+/// This file declares the AMDGPU-specific subclass of
/// TargetLoweringObjectFile.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 77c2d4b956c6..a68b8d03f06e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -17,12 +17,12 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -43,6 +43,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
@@ -101,7 +102,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
- AMDGPUAS ASST = ST->getAMDGPUAS();
+ const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
unsigned LocalGEPsSeen = 0;
@@ -123,8 +124,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
continue;
if (dependsOnLocalPhi(L, Br->getCondition())) {
UP.Threshold += UnrollThresholdIf;
- DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
- << " for loop:\n" << *L << " due to " << *Br << '\n');
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n"
+ << *L << " due to " << *Br << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
@@ -200,61 +202,76 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Don't use the maximum allowed value here as it will make some
// programs way too big.
UP.Threshold = Threshold;
- DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
- << *L << " due to " << *GEP << '\n');
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
+ << " for loop:\n"
+ << *L << " due to " << *GEP << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
}
}
-unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
// The concept of vector registers doesn't really exist. Some packed vector
// operations operate on the normal 32-bit registers.
-
- // Number of VGPRs on SI.
- if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 256;
-
- return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+ return 256;
}
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
// This is really the number of registers to fill when vectorizing /
// interleaving loops, so we lie to avoid trying to use all registers.
return getHardwareNumberOfRegisters(Vec) >> 3;
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
+unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
return 32;
}
-unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
return 32;
}
-unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
+ unsigned VecRegBitWidth = VF * LoadSize;
+ if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
+ // TODO: Support element-size less than 32bit?
+ return 128 / LoadSize;
+
+ return VF;
+}
+
+unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
+ unsigned VecRegBitWidth = VF * StoreSize;
+ if (VecRegBitWidth > 128)
+ return 128 / StoreSize;
+
+ return VF;
+}
+
+unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
AMDGPUAS AS = ST->getAMDGPUAS();
if (AddrSpace == AS.GLOBAL_ADDRESS ||
AddrSpace == AS.CONSTANT_ADDRESS ||
- AddrSpace == AS.FLAT_ADDRESS)
- return 128;
- if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
+ return 512;
+ }
+
+ if (AddrSpace == AS.FLAT_ADDRESS ||
+ AddrSpace == AS.LOCAL_ADDRESS ||
AddrSpace == AS.REGION_ADDRESS)
- return 64;
+ return 128;
+
if (AddrSpace == AS.PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
- (AddrSpace == AS.PARAM_D_ADDRESS ||
- AddrSpace == AS.PARAM_I_ADDRESS ||
- (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
- AddrSpace <= AS.CONSTANT_BUFFER_15)))
- return 128;
llvm_unreachable("unhandled address space");
}
-bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
@@ -267,19 +284,19 @@ bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
return true;
}
-bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
-bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
-unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.
if (VF == 1)
@@ -288,11 +305,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 8;
}
-bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
if (!Ordering || !Volatile)
@@ -314,7 +334,7 @@ bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
}
-int AMDGPUTTIImpl::getArithmeticInstrCost(
+int GCNTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
@@ -424,7 +444,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost(
Opd1PropInfo, Opd2PropInfo);
}
-unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
case Instruction::Br:
@@ -435,7 +455,38 @@ unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
}
}
-int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) {
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+
+ // Computes cost on targets that have packed math instructions (which support
+ // 16-bit types only).
+ if (IsPairwise ||
+ !ST->hasVOP3PInsts() ||
+ OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ return LT.first * getFullRateInstrCost();
+}
+
+int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
+ bool IsPairwise,
+ bool IsUnsigned) {
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+
+ // Computes cost on targets that have packed math instructions (which support
+ // 16-bit types only).
+ if (IsPairwise ||
+ !ST->hasVOP3PInsts() ||
+ OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ return LT.first * getHalfRateInstrCost();
+}
+
+int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
@@ -460,52 +511,7 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
- switch (I->getIntrinsicID()) {
- case Intrinsic::amdgcn_workitem_id_x:
- case Intrinsic::amdgcn_workitem_id_y:
- case Intrinsic::amdgcn_workitem_id_z:
- case Intrinsic::amdgcn_interp_mov:
- case Intrinsic::amdgcn_interp_p1:
- case Intrinsic::amdgcn_interp_p2:
- case Intrinsic::amdgcn_mbcnt_hi:
- case Intrinsic::amdgcn_mbcnt_lo:
- case Intrinsic::r600_read_tidig_x:
- case Intrinsic::r600_read_tidig_y:
- case Intrinsic::r600_read_tidig_z:
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
- case Intrinsic::amdgcn_image_atomic_swap:
- case Intrinsic::amdgcn_image_atomic_add:
- case Intrinsic::amdgcn_image_atomic_sub:
- case Intrinsic::amdgcn_image_atomic_smin:
- case Intrinsic::amdgcn_image_atomic_umin:
- case Intrinsic::amdgcn_image_atomic_smax:
- case Intrinsic::amdgcn_image_atomic_umax:
- case Intrinsic::amdgcn_image_atomic_and:
- case Intrinsic::amdgcn_image_atomic_or:
- case Intrinsic::amdgcn_image_atomic_xor:
- case Intrinsic::amdgcn_image_atomic_inc:
- case Intrinsic::amdgcn_image_atomic_dec:
- case Intrinsic::amdgcn_image_atomic_cmpswap:
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor:
- case Intrinsic::amdgcn_buffer_atomic_cmpswap:
- case Intrinsic::amdgcn_ps_live:
- case Intrinsic::amdgcn_ds_swizzle:
- return true;
- default:
- return false;
- }
-}
+
static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
@@ -535,7 +541,7 @@ static bool isArgPassedInSGPR(const Argument *A) {
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
-bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
return !isArgPassedInSGPR(A);
@@ -556,7 +562,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return true;
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
- return isIntrinsicSourceOfDivergence(Intrinsic);
+ return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
// Assume all function calls are a source of divergence.
if (isa<CallInst>(V) || isa<InvokeInst>(V))
@@ -565,7 +571,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
-bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
+bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
switch (Intrinsic->getIntrinsicID()) {
default:
@@ -578,7 +584,7 @@ bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
}
-unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (ST->hasVOP3PInsts()) {
VectorType *VT = cast<VectorType>(Tp);
@@ -601,7 +607,7 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
+bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
@@ -613,3 +619,114 @@ bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
}
+
+void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
+
+unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+ return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
+ return getHardwareNumberOfRegisters(Vec);
+}
+
+unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
+ return 32;
+}
+
+unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
+ return 32;
+}
+
+unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+ AMDGPUAS AS = ST->getAMDGPUAS();
+ if (AddrSpace == AS.GLOBAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS)
+ return 128;
+ if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.REGION_ADDRESS)
+ return 64;
+ if (AddrSpace == AS.PRIVATE_ADDRESS)
+ return 32;
+
+ if ((AddrSpace == AS.PARAM_D_ADDRESS ||
+ AddrSpace == AS.PARAM_I_ADDRESS ||
+ (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+ AddrSpace <= AS.CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+}
+
+bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ // We allow vectorization of flat stores, even though we may need to decompose
+ // them later if they may access private memory. We don't have enough context
+ // here, and legalization can handle it.
+ if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
+ return false;
+ return true;
+}
+
+bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // Disable unrolling if the loop is not vectorized.
+ // TODO: Enable this again.
+ if (VF == 1)
+ return 1;
+
+ return 8;
+}
+
+unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
+ // XXX - For some reason this isn't called for switch.
+ switch (Opcode) {
+ case Instruction::Br:
+ case Instruction::Ret:
+ return 10;
+ default:
+ return BaseT::getCFInstrCost(Opcode);
+ }
+}
+
+int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
+ switch (Opcode) {
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize
+ = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
+ // Extracts are just reads of a subregister, so are free. Inserts are
+ // considered free because we don't want to have any cost for scalarizing
+ // operations, and we don't have to copy into a different register class.
+
+ // Dynamic indexing isn't free and is best avoided.
+ return Index == ~0u ? 2 : 0;
+ }
+ default:
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+}
+
+void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
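
To make the new GCNTTIImpl::getLoadVectorFactor/getStoreVectorFactor clamping concrete: for chains of sub-32-bit elements the vectorization factor is capped so the resulting vector never exceeds 128 bits. A small standalone sketch of the same arithmetic, independent of the LLVM classes:

    #include <cassert>

    // Mirrors the clamp in GCNTTIImpl::getLoadVectorFactor above: if the requested
    // chain is wider than 128 bits and the elements are narrower than 32 bits,
    // shrink the factor so the chain fits in 128 bits.
    static unsigned clampLoadVF(unsigned VF, unsigned LoadSizeBits,
                                unsigned ScalarSizeBits) {
      unsigned VecRegBitWidth = VF * LoadSizeBits;
      if (VecRegBitWidth > 128 && ScalarSizeBits < 32)
        return 128 / LoadSizeBits;
      return VF;
    }

    int main() {
      assert(clampLoadVF(8, 16, 16) == 8);   // 128 bits requested: unchanged
      assert(clampLoadVF(16, 16, 16) == 8);  // 256 bits requested: clamped to 8
      assert(clampLoadVF(4, 32, 32) == 4);   // 32-bit elements are never clamped
      return 0;
    }
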
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 8899d2c6da8a..8e63d789e17d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -21,6 +21,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -44,8 +45,26 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
friend BaseT;
- const AMDGPUSubtarget *ST;
+ Triple TargetTriple;
+
+public:
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ TargetTriple(TM->getTargetTriple()) {}
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
+};
+
+class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
+ using BaseT = BasicTTIImplBase<GCNTTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const GCNSubtarget *ST;
const AMDGPUTargetLowering *TLI;
+ AMDGPUTTIImpl CommonTTI;
bool IsGraphicsShader;
const FeatureBitset InlineFeatureIgnoreList = {
@@ -61,7 +80,6 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
AMDGPU::FeatureAutoWaitcntBeforeBarrier,
AMDGPU::FeatureDebuggerEmitPrologue,
AMDGPU::FeatureDebuggerInsertNops,
- AMDGPU::FeatureDebuggerReserveRegs,
// Property of the kernel/environment which can't actually differ.
AMDGPU::FeatureSGPRInitBug,
@@ -73,7 +91,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
AMDGPU::HalfRate64Ops
};
- const AMDGPUSubtarget *getST() const { return ST; }
+ const GCNSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
static inline int getFullRateInstrCost() {
@@ -98,10 +116,11 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
}
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
- ST(TM->getSubtargetImpl(F)),
+ ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()),
+ CommonTTI(TM, F),
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
bool hasBranchDivergence() { return true; }
@@ -118,6 +137,12 @@ public:
unsigned getNumberOfRegisters(bool Vector) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const;
+ unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
@@ -166,6 +191,53 @@ public:
const Function *Callee) const;
unsigned getInliningThresholdMultiplier() { return 9; }
+
+ int getArithmeticReductionCost(unsigned Opcode,
+ Type *Ty,
+ bool IsPairwise);
+ int getMinMaxReductionCost(Type *Ty, Type *CondTy,
+ bool IsPairwiseForm,
+ bool IsUnsigned);
+};
+
+class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
+ using BaseT = BasicTTIImplBase<R600TTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const R600Subtarget *ST;
+ const AMDGPUTargetLowering *TLI;
+ AMDGPUTTIImpl CommonTTI;
+
+public:
+ explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()),
+ CommonTTI(TM, F) {}
+
+ const R600Subtarget *getST() const { return ST; }
+ const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
+ unsigned getHardwareNumberOfRegisters(bool Vec) const;
+ unsigned getNumberOfRegisters(bool Vec) const;
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getCFInstrCost(unsigned Opcode);
+ int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};
} // end namespace llvm
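
The header split above leaves AMDGPUTTIImpl holding only the shared unrolling heuristics, while GCNTTIImpl and R600TTIImpl each embed it as CommonTTI and forward to it. Reduced to its essentials, the composition looks like this (schematic only; the real classes derive from BasicTTIImplBase and are constructed from a TargetMachine and Function):

    // Schematic embed-and-forward split; names echo the patch, everything else
    // is simplified away.
    struct UnrollPrefs { unsigned Threshold = 0; };

    struct CommonTTIImpl {                       // role of AMDGPUTTIImpl
      void getUnrollingPreferences(UnrollPrefs &UP) const {
        UP.Threshold = 100;                      // placeholder; the real heuristic
                                                 // lives in AMDGPUTTIImpl
      }
    };

    struct SubtargetTTIImpl {                    // role of GCNTTIImpl / R600TTIImpl
      CommonTTIImpl CommonTTI;
      void getUnrollingPreferences(UnrollPrefs &UP) const {
        CommonTTI.getUnrollingPreferences(UP);   // delegate the shared logic
      }
    };
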
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 6107f3a7dd18..0d3a1673696a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -39,7 +40,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
@@ -144,7 +145,8 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
if (PN)
PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
- BB->getInstList().pop_back(); // Remove the return insn
+ // Remove and delete the return inst.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(NewRetBlock, BB);
}
@@ -168,6 +170,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
SmallVector<BasicBlock *, 4> ReturningBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
+ // Dummy return block for infinite loop.
+ BasicBlock *DummyReturnBB = nullptr;
+
for (BasicBlock *BB : PDT.getRoots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
@@ -175,6 +180,35 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+
+ ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+ if (DummyReturnBB == nullptr) {
+ DummyReturnBB = BasicBlock::Create(F.getContext(),
+ "DummyReturnBlock", &F);
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+ ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ ReturningBlocks.push_back(DummyReturnBB);
+ }
+
+ if (BI->isUnconditional()) {
+ BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
+ BI->eraseFromParent(); // Delete the unconditional branch.
+ // Add a new conditional branch with a dummy edge to the return block.
+ BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
+ } else { // Conditional branch.
+ // Create a new transition block to hold the conditional branch.
+ BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(),
+ "TransitionBlock", &F);
+
+ // Move BI from BB to the new transition block.
+ BI->removeFromParent();
+ TransitionBB->getInstList().push_back(BI);
+
+ // Create a branch that will always branch to the transition block.
+ BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+ }
}
}
@@ -189,7 +223,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
new UnreachableInst(F.getContext(), UnreachableBlock);
for (BasicBlock *BB : UnreachableBlocks) {
- BB->getInstList().pop_back(); // Remove the unreachable inst.
+ // Remove and delete the unreachable inst.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(UnreachableBlock, BB);
}
}
@@ -200,7 +235,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
- UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+ // Remove and delete the unreachable inst.
+ UnreachableBlock->getTerminator()->eraseFromParent();
Function *UnreachableIntrin =
Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
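
The new branch handling in runOnFunction gives a divergent infinite loop a synthetic, never-taken exit so it participates in return-block unification. For the unconditional-branch case, the rewrite amounts to the helper sketched below (a condensed restatement of the hunk above, pulled into a hypothetical standalone function; it uses only API calls already visible in the patch):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Type.h"
    #include <cassert>
    using namespace llvm;

    // Turn 'br label %header' at the end of Latch into
    // 'br i1 true, label %header, label %DummyReturnBlock', where
    // DummyReturnBlock simply returns (undef for non-void functions).
    static BasicBlock *addDummyLoopExit(Function &F, BasicBlock *Latch,
                                        BranchInst *BI) {
      assert(BI->isUnconditional() && "sketch covers the unconditional case only");
      BasicBlock *DummyReturnBB =
          BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);

      BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
      BI->eraseFromParent();  // drop the unconditional branch
      BranchInst::Create(LoopHeaderBB, DummyReturnBB,
                         ConstantInt::getTrue(F.getContext()), Latch);
      return DummyReturnBB;
    }
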
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index b78568e89cfb..1f6d9234c1ed 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// \file
-// \brief This pass that unifies multiple OpenCL metadata due to linking.
+// This pass unifies multiple OpenCL metadata due to linking.
//
//===----------------------------------------------------------------------===//
@@ -37,7 +37,7 @@ namespace {
} // end namespace kOCLMD
- /// \brief Unify multiple OpenCL metadata due to linking.
+ /// Unify multiple OpenCL metadata due to linking.
class AMDGPUUnifyMetadata : public ModulePass {
public:
static char ID;
@@ -47,7 +47,7 @@ namespace {
private:
bool runOnModule(Module &M) override;
- /// \brief Unify version metadata.
+ /// Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a pair of
/// integer constants, e.g.
@@ -82,7 +82,7 @@ namespace {
return true;
}
- /// \brief Unify version metadata.
+ /// Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a list e.g.
/// !Name = {!n1, !n2}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 0a0e43123ae0..11cd49e5b3dc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -28,12 +29,12 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
@@ -78,23 +79,18 @@ namespace {
//
//===----------------------------------------------------------------------===//
-#define SHOWNEWINSTR(i) \
- DEBUG(dbgs() << "New instr: " << *i << "\n");
+#define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n");
-#define SHOWNEWBLK(b, msg) \
-DEBUG( \
- dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
- dbgs() << "\n"; \
-);
+#define SHOWNEWBLK(b, msg) \
+ LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+ dbgs() << "\n";);
-#define SHOWBLK_DETAIL(b, msg) \
-DEBUG( \
- if (b) { \
- dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
- b->print(dbgs()); \
- dbgs() << "\n"; \
- } \
-);
+#define SHOWBLK_DETAIL(b, msg) \
+ LLVM_DEBUG(if (b) { \
+ dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+ b->print(dbgs()); \
+ dbgs() << "\n"; \
+ });
#define INVALIDSCCNUM -1
@@ -158,19 +154,19 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
TRI = &TII->getRegisterInfo();
- DEBUG(MF.dump(););
+ LLVM_DEBUG(MF.dump(););
OrderedBlks.clear();
Visited.clear();
FuncRep = &MF;
MLI = &getAnalysis<MachineLoopInfo>();
- DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+ LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
MDT = &getAnalysis<MachineDominatorTree>();
- DEBUG(MDT->print(dbgs(), (const Module*)nullptr););
+ LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
PDT = &getAnalysis<MachinePostDominatorTree>();
- DEBUG(PDT->print(dbgs()););
+ LLVM_DEBUG(PDT->print(dbgs()););
prepare();
run();
- DEBUG(MF.dump(););
+ LLVM_DEBUG(MF.dump(););
return true;
}
@@ -436,19 +432,19 @@ void AMDGPUCFGStructurizer::reversePredicateSetter(
for (;; --I) {
if (I == MBB.end())
continue;
- if (I->getOpcode() == AMDGPU::PRED_X) {
+ if (I->getOpcode() == R600::PRED_X) {
switch (I->getOperand(2).getImm()) {
- case AMDGPU::PRED_SETE_INT:
- I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
+ case R600::PRED_SETE_INT:
+ I->getOperand(2).setImm(R600::PRED_SETNE_INT);
return;
- case AMDGPU::PRED_SETNE_INT:
- I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
+ case R600::PRED_SETNE_INT:
+ I->getOperand(2).setImm(R600::PRED_SETE_INT);
return;
- case AMDGPU::PRED_SETE:
- I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
+ case R600::PRED_SETE:
+ I->getOperand(2).setImm(R600::PRED_SETNE);
return;
- case AMDGPU::PRED_SETNE:
- I->getOperand(2).setImm(AMDGPU::PRED_SETE);
+ case R600::PRED_SETNE:
+ I->getOperand(2).setImm(R600::PRED_SETE);
return;
default:
llvm_unreachable("PRED_X Opcode invalid!");
@@ -517,10 +513,10 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(
int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::IF_PREDICATE_SET;
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -528,10 +524,10 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::IF_PREDICATE_SET;
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -539,8 +535,8 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -548,8 +544,8 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+ case R600::JUMP_COND:
+ case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
@@ -577,9 +573,9 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
- case AMDGPU::JUMP_COND:
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32: return true;
+ case R600::JUMP_COND:
+ case R600::BRANCH_COND_i32:
+ case R600::BRANCH_COND_f32: return true;
default:
return false;
}
@@ -588,8 +584,8 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
- case AMDGPU::JUMP:
- case AMDGPU::BRANCH:
+ case R600::JUMP:
+ case R600::BRANCH:
return true;
default:
return false;
@@ -638,7 +634,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
MachineBasicBlock::reverse_iterator It = MBB->rbegin();
if (It != MBB->rend()) {
MachineInstr *instr = &(*It);
- if (instr->getOpcode() == AMDGPU::RETURN)
+ if (instr->getOpcode() == R600::RETURN)
return instr;
}
return nullptr;
@@ -650,9 +646,8 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
if (MI)
assert(IsReturn);
else if (IsReturn)
- DEBUG(
- dbgs() << "BB" << MBB->getNumber()
- <<" is return block without RETURN instr\n";);
+ LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber()
+ << " is return block without RETURN instr\n";);
return IsReturn;
}
@@ -692,8 +687,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator It = Pre;
while (It != E) {
- if (Pre->getOpcode() == AMDGPU::CONTINUE
- && It->getOpcode() == AMDGPU::ENDLOOP)
+ if (Pre->getOpcode() == R600::CONTINUE
+ && It->getOpcode() == R600::ENDLOOP)
ContInstr.push_back(&*Pre);
Pre = It;
++It;
@@ -714,7 +709,7 @@ bool AMDGPUCFGStructurizer::prepare() {
//FIXME: if not reducible flow graph, make it so ???
- DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
+ LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
orderBlocks(FuncRep);
@@ -757,14 +752,14 @@ bool AMDGPUCFGStructurizer::prepare() {
bool AMDGPUCFGStructurizer::run() {
//Assume reducible CFG...
- DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
+ LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
#ifdef STRESSTEST
//Use the worse block ordering to test the algorithm.
ReverseVector(orderedBlks);
#endif
- DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+ LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
int NumIter = 0;
bool Finish = false;
MachineBasicBlock *MBB;
@@ -774,10 +769,8 @@ bool AMDGPUCFGStructurizer::run() {
do {
++NumIter;
- DEBUG(
- dbgs() << "numIter = " << NumIter
- << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
- );
+ LLVM_DEBUG(dbgs() << "numIter = " << NumIter
+ << ", numRemaintedBlk = " << NumRemainedBlk << "\n";);
SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
OrderedBlks.begin();
@@ -799,10 +792,8 @@ bool AMDGPUCFGStructurizer::run() {
SccBeginMBB = MBB;
SccNumIter = 0;
SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
- DEBUG(
- dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+ dbgs() << "\n";);
}
if (!isRetiredBlock(MBB))
@@ -817,20 +808,16 @@ bool AMDGPUCFGStructurizer::run() {
++SccNumIter;
int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
- DEBUG(
- dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
- << ", sccNumIter = " << SccNumIter;
- dbgs() << "doesn't make any progress\n";
- );
+ LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+ << ", sccNumIter = " << SccNumIter;
+ dbgs() << "doesn't make any progress\n";);
ContNextScc = true;
} else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
SccNumBlk = sccRemainedNumBlk;
It = SccBeginIter;
ContNextScc = false;
- DEBUG(
- dbgs() << "repeat processing SCC" << getSCCNum(MBB)
- << "sccNumIter = " << SccNumIter << '\n';
- );
+ LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+ << "sccNumIter = " << SccNumIter << '\n';);
} else {
// Finish the current scc.
ContNextScc = true;
@@ -848,9 +835,7 @@ bool AMDGPUCFGStructurizer::run() {
*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
if (EntryMBB->succ_size() == 0) {
Finish = true;
- DEBUG(
- dbgs() << "Reduce to one block\n";
- );
+ LLVM_DEBUG(dbgs() << "Reduce to one block\n";);
} else {
int NewnumRemainedBlk
= countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
@@ -860,9 +845,7 @@ bool AMDGPUCFGStructurizer::run() {
NumRemainedBlk = NewnumRemainedBlk;
} else {
MakeProgress = false;
- DEBUG(
- dbgs() << "No progress\n";
- );
+ LLVM_DEBUG(dbgs() << "No progress\n";);
}
}
} while (!Finish && MakeProgress);
@@ -875,9 +858,7 @@ bool AMDGPUCFGStructurizer::run() {
It != E; ++It) {
if ((*It).second && (*It).second->IsRetired) {
assert(((*It).first)->getNumber() != -1);
- DEBUG(
- dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";);
(*It).first->eraseFromParent(); //Remove from the parent Function.
}
delete (*It).second;
@@ -886,7 +867,7 @@ bool AMDGPUCFGStructurizer::run() {
LLInfoMap.clear();
if (!Finish) {
- DEBUG(FuncRep->viewCFG());
+ LLVM_DEBUG(FuncRep->viewCFG());
report_fatal_error("IRREDUCIBLE_CFG");
}
@@ -920,17 +901,13 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
int NumMatch = 0;
int CurMatch;
- DEBUG(
- dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";);
while ((CurMatch = patternMatchGroup(MBB)) > 0)
NumMatch += CurMatch;
- DEBUG(
- dbgs() << "End patternMatch BB" << MBB->getNumber()
- << ", numMatch = " << NumMatch << "\n";
- );
+ LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber()
+ << ", numMatch = " << NumMatch << "\n";);
return NumMatch;
}
@@ -1050,7 +1027,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() {
for (MachineLoop *ExaminedLoop : NestedLoops) {
if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
continue;
- DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+ LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
int NumBreak = mergeLoop(ExaminedLoop);
if (NumBreak == -1)
break;
@@ -1064,7 +1041,8 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
MBBVector ExitingMBBs;
LoopRep->getExitingBlocks(ExitingMBBs);
assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
- DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+ LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size()
+ << " exiting blocks\n";);
// We assume a single ExitBlk
MBBVector ExitBlks;
LoopRep->getExitBlocks(ExitBlks);
@@ -1106,11 +1084,9 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
if (TheEntry) {
- DEBUG(
- dbgs() << "isLoopContBreakBlock yes src1 = BB"
- << Src1MBB->getNumber()
- << " src2 = BB" << Src2MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB"
+ << Src1MBB->getNumber() << " src2 = BB"
+ << Src2MBB->getNumber() << "\n";);
return true;
}
}
@@ -1122,9 +1098,8 @@ int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
if (Num == 0) {
- DEBUG(
- dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
- );
+ LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk"
+ << "\n";);
Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
}
return Num;
@@ -1138,22 +1113,16 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
//trueBlk could be the common post dominator
DownBlk = TrueMBB;
- DEBUG(
- dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
- << " true = BB" << TrueMBB->getNumber()
- << ", numSucc=" << TrueMBB->succ_size()
- << " false = BB" << FalseMBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+ << " true = BB" << TrueMBB->getNumber()
+ << ", numSucc=" << TrueMBB->succ_size() << " false = BB"
+ << FalseMBB->getNumber() << "\n";);
while (DownBlk) {
- DEBUG(
- dbgs() << "check down = BB" << DownBlk->getNumber();
- );
+ LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber(););
if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
- DEBUG(
- dbgs() << " working\n";
- );
+ LLVM_DEBUG(dbgs() << " working\n";);
Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
@@ -1166,9 +1135,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
break;
}
- DEBUG(
- dbgs() << " not working\n";
- );
+ LLVM_DEBUG(dbgs() << " not working\n";);
DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
} // walk down the postDomTree
@@ -1247,10 +1214,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
MigrateFalse = true;
- DEBUG(
- dbgs() << "before improveSimpleJumpintoIf: ";
- showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
- );
+ LLVM_DEBUG(
+ dbgs() << "before improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
// org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
//
@@ -1337,15 +1303,15 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
- //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
- MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+ //insert R600::ENDIF to avoid special case "input landBlk == NULL"
+ MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF);
if (LandBlkHasOtherPred) {
report_fatal_error("Extra register needed to handle CFG");
unsigned CmpResReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
report_fatal_error("Extra compare instruction needed to handle CFG");
- insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
}
@@ -1353,7 +1319,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// cause an assertion failure in the PostRA scheduling pass.
unsigned InitReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
- insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+ insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg,
DebugLoc());
if (MigrateTrue) {
@@ -1363,7 +1329,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// (initVal != 1).
report_fatal_error("Extra register needed to handle CFG");
}
- insertInstrBefore(I, AMDGPU::ELSE);
+ insertInstrBefore(I, R600::ELSE);
if (MigrateFalse) {
migrateInstruction(FalseMBB, LandBlk, I);
@@ -1375,7 +1341,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (LandBlkHasOtherPred) {
// add endif
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertInstrBefore(I, R600::ENDIF);
// put initReg = 2 to other predecessors of landBlk
for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
@@ -1385,10 +1351,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
report_fatal_error("Extra register needed to handle CFG");
}
}
- DEBUG(
- dbgs() << "result from improveSimpleJumpintoIf: ";
- showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
- );
+ LLVM_DEBUG(
+ dbgs() << "result from improveSimpleJumpintoIf: ";
+ showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
// update landBlk
*LandMBBPtr = LandBlk;
@@ -1398,10 +1363,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
- DEBUG(
- dbgs() << "serialPattern BB" << DstMBB->getNumber()
- << " <= BB" << SrcMBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB"
+ << SrcMBB->getNumber() << "\n";);
DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
DstMBB->removeSuccessor(SrcMBB, true);
@@ -1416,26 +1379,15 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
assert (TrueMBB);
- DEBUG(
- dbgs() << "ifPattern BB" << MBB->getNumber();
- dbgs() << "{ ";
- if (TrueMBB) {
- dbgs() << "BB" << TrueMBB->getNumber();
- }
- dbgs() << " } else ";
- dbgs() << "{ ";
- if (FalseMBB) {
- dbgs() << "BB" << FalseMBB->getNumber();
- }
- dbgs() << " }\n ";
- dbgs() << "landBlock: ";
- if (!LandMBB) {
- dbgs() << "NULL";
- } else {
- dbgs() << "BB" << LandMBB->getNumber();
- }
- dbgs() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ ";
+ if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs()
+ << " } else ";
+ dbgs() << "{ "; if (FalseMBB) {
+ dbgs() << "BB" << FalseMBB->getNumber();
+ } dbgs() << " }\n ";
+ dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else {
+ dbgs() << "BB" << LandMBB->getNumber();
+ } dbgs() << "\n";);
int OldOpcode = BranchMI->getOpcode();
DebugLoc BranchDL = BranchMI->getDebugLoc();
@@ -1462,7 +1414,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
}
if (FalseMBB) {
- insertInstrBefore(I, AMDGPU::ELSE);
+ insertInstrBefore(I, R600::ELSE);
MBB->splice(I, FalseMBB, FalseMBB->begin(),
FalseMBB->end());
MBB->removeSuccessor(FalseMBB, true);
@@ -1471,7 +1423,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
retireBlock(FalseMBB);
MLI->removeBlock(FalseMBB);
}
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertInstrBefore(I, R600::ENDIF);
BranchMI->eraseFromParent();
@@ -1481,18 +1433,19 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+ << " land = BB" << LandMBB->getNumber() << "\n";);
- insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
- insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+ insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc());
+ insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc());
DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB"
+ << ExitingMBB->getNumber() << " land = BB"
+ << LandMBB->getNumber() << "\n";);
MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
assert(BranchMI && isCondBranch(BranchMI));
DebugLoc DL = BranchMI->getDebugLoc();
@@ -1500,9 +1453,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock::iterator I = BranchMI;
if (TrueBranch != LandMBB)
reversePredicateSetter(I, *I->getParent());
- insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
- insertInstrBefore(I, AMDGPU::BREAK);
- insertInstrBefore(I, AMDGPU::ENDIF);
+ insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL);
+ insertInstrBefore(I, R600::BREAK);
+ insertInstrBefore(I, R600::ENDIF);
//now branchInst can be erased safely
BranchMI->eraseFromParent();
//now take care of successors, retire blocks
@@ -1511,9 +1464,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
MachineBasicBlock *ContMBB) {
- DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
- << ContingMBB->getNumber()
- << ", cont = BB" << ContMBB->getNumber() << "\n";);
+ LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+ << ContingMBB->getNumber() << ", cont = BB"
+ << ContMBB->getNumber() << "\n";);
MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
if (MI) {
@@ -1531,8 +1484,8 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
getBranchZeroOpcode(OldOpcode);
insertCondBranchBefore(I, BranchOpcode, DL);
// insertEnd to ensure phi-moves, if exist, go before the continue-instr.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
- insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
+ insertInstrEnd(ContingMBB, R600::CONTINUE, DL);
+ insertInstrEnd(ContingMBB, R600::ENDIF, DL);
} else {
int BranchOpcode =
TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
@@ -1547,7 +1500,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
// location we've just inserted that reference here so it should be
// representative insertEnd to ensure phi-moves, if exist, go before the
// continue-instr.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+ insertInstrEnd(ContingMBB, R600::CONTINUE,
getLastDebugLocInBB(ContingMBB));
}
}
@@ -1587,10 +1540,9 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
numClonedInstr += MBB->size();
- DEBUG(
- dbgs() << "Cloned block: " << "BB"
- << MBB->getNumber() << "size " << MBB->size() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Cloned block: "
+ << "BB" << MBB->getNumber() << "size " << MBB->size()
+ << "\n";);
SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
@@ -1603,26 +1555,22 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
//look for the input branchinstr, not the AMDGPU branchinstr
MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
if (!BranchMI) {
- DEBUG(
- dbgs() << "migrateInstruction don't see branch instr\n";
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction don't see branch instr\n";);
SpliceEnd = SrcMBB->end();
} else {
- DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
SpliceEnd = BranchMI;
}
- DEBUG(
- dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = "
+ << DstMBB->size() << "srcSize = " << SrcMBB->size()
+ << "\n";);
//splice insert before insertPos
DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
- DEBUG(
- dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = "
+ << DstMBB->size() << "srcSize = " << SrcMBB->size()
+ << '\n';);
}
MachineBasicBlock *
@@ -1640,7 +1588,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
- DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+ LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
Ctx.emitError("Extra register needed to handle CFG");
return nullptr;
@@ -1653,7 +1601,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
// test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
while ((BranchMI = getLoopendBlockBranchInstr(MBB))
&& isUncondBranch(BranchMI)) {
- DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
}
}
@@ -1669,7 +1617,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
assert(BranchMI && isCondBranch(BranchMI));
- DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
+ LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
SHOWNEWBLK(MBB1, "Removing redundant successor");
MBB->removeSuccessor(MBB1, true);
@@ -1679,7 +1627,7 @@ void AMDGPUCFGStructurizer::addDummyExitBlock(
SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
- insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+ insertInstrEnd(DummyExitBlk, R600::RETURN);
for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
E = RetMBB.end(); It != E; ++It) {
@@ -1688,10 +1636,8 @@ void AMDGPUCFGStructurizer::addDummyExitBlock(
if (MI)
MI->eraseFromParent();
MBB->addSuccessor(DummyExitBlk);
- DEBUG(
- dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
- << " successors\n";
- );
+ LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+ << " successors\n";);
}
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
}
@@ -1710,9 +1656,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
}
void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
- DEBUG(
- dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
- );
+ LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";);
BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
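
Most of the churn in this file is the DEBUG(...) to LLVM_DEBUG(...) rename (alongside the AMDGPU:: to R600:: opcode namespacing). LLVM_DEBUG comes from llvm/Support/Debug.h, compiles away in release (NDEBUG) builds, and at runtime only fires under -debug or -debug-only=<DEBUG_TYPE>. A minimal usage sketch (the DEBUG_TYPE string here is illustrative; the real file defines its own):

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    #define DEBUG_TYPE "cfg-structurizer"  // hypothetical; each pass defines its own

    static void reportIteration(int NumIter, int NumRemainedBlk) {
      // Printed only in assert-enabled builds run with -debug or
      // -debug-only=cfg-structurizer.
      LLVM_DEBUG(dbgs() << "numIter = " << NumIter
                        << ", numRemainedBlk = " << NumRemainedBlk << '\n');
    }
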
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
index 5d243e949fd3..289642aaa2d0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -198,7 +198,7 @@ enum amd_code_property_mask_t {
AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
};
-/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// The hsa_ext_control_directives_t specifies the values for the HSAIL
/// control directives. These control how the finalizer generates code. This
/// struct is used both as an argument to hsaFinalizeKernel to specify values for
/// the control directives, and is used in HsaKernelCode to record the values of
@@ -551,14 +551,8 @@ typedef struct amd_kernel_code_s {
int64_t kernel_code_prefetch_byte_offset;
uint64_t kernel_code_prefetch_byte_size;
- /// Number of bytes of scratch backing memory required for full
- /// occupancy of target chip. This takes into account the number of
- /// bytes of scratch per work-item, the wavefront size, the maximum
- /// number of wavefronts per CU, and the number of CUs. This is an
- /// upper limit on scratch. If the grid being dispatched is small it
- /// may only need less than this. If the kernel uses no scratch, or
- /// the Finalizer has not computed this value, it must be 0.
- uint64_t max_scratch_backing_memory_byte_size;
+ /// Reserved. Must be 0.
+ uint64_t reserved0;
/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
/// COMPUTE_PGM_RSRC2 registers.
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2e3a453f9c75..31e2885c833d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -12,6 +12,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
+#include "SIInstrInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
@@ -25,7 +26,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -42,9 +42,11 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
@@ -60,6 +62,7 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+using namespace llvm::amdhsa;
namespace {
@@ -128,6 +131,7 @@ public:
enum ImmTy {
ImmTyNone,
ImmTyGDS,
+ ImmTyLDS,
ImmTyOffen,
ImmTyIdxen,
ImmTyAddr64,
@@ -138,6 +142,7 @@ public:
ImmTyGLC,
ImmTySLC,
ImmTyTFE,
+ ImmTyD16,
ImmTyClampSI,
ImmTyOModSI,
ImmTyDppCtrl,
@@ -267,7 +272,11 @@ public:
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
- bool isSDWARegKind() const;
+ bool isSDWAOperand(MVT type) const;
+ bool isSDWAFP16Operand() const;
+ bool isSDWAFP32Operand() const;
+ bool isSDWAInt16Operand() const;
+ bool isSDWAInt32Operand() const;
bool isImmTy(ImmTy ImmT) const {
return isImm() && Imm.Type == ImmT;
@@ -282,7 +291,7 @@ public:
bool isDMask() const { return isImmTy(ImmTyDMask); }
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
- bool isR128() const { return isImmTy(ImmTyUNorm); }
+ bool isR128() const { return isImmTy(ImmTyR128); }
bool isLWE() const { return isImmTy(ImmTyLWE); }
bool isOff() const { return isImmTy(ImmTyOff); }
bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -298,9 +307,11 @@ public:
bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); }
bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
+ bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isD16() const { return isImmTy(ImmTyD16); }
bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
@@ -434,7 +445,7 @@ public:
}
bool isVSrcB32() const {
- return isVCSrcF32() || isLiteralImm(MVT::i32);
+ return isVCSrcF32() || isLiteralImm(MVT::i32) || isExpr();
}
bool isVSrcB64() const {
@@ -451,7 +462,7 @@ public:
}
bool isVSrcF32() const {
- return isVCSrcF32() || isLiteralImm(MVT::f32);
+ return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr();
}
bool isVSrcF64() const {
@@ -643,6 +654,7 @@ public:
switch (Type) {
case ImmTyNone: OS << "None"; break;
case ImmTyGDS: OS << "GDS"; break;
+ case ImmTyLDS: OS << "LDS"; break;
case ImmTyOffen: OS << "Offen"; break;
case ImmTyIdxen: OS << "Idxen"; break;
case ImmTyAddr64: OS << "Addr64"; break;
@@ -653,6 +665,7 @@ public:
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyD16: OS << "D16"; break;
case ImmTyDFMT: OS << "DFMT"; break;
case ImmTyNFMT: OS << "NFMT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
@@ -817,7 +830,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
// Number of extra operands parsed after the first optional operand.
// This may be necessary to skip hardcoded mandatory operands.
- static const unsigned MAX_OPR_LOOKAHEAD = 1;
+ static const unsigned MAX_OPR_LOOKAHEAD = 8;
unsigned ForcedEncodingSize = 0;
bool ForcedDPP = false;
@@ -834,6 +847,27 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
private:
bool ParseAsAbsoluteExpression(uint32_t &Ret);
+ bool OutOfRangeError(SMRange Range);
+ /// Calculate VGPR/SGPR blocks required for given target, reserved
+ /// registers, and user-specified NextFreeXGPR values.
+ ///
+ /// \param Features [in] Target features, used for bug corrections.
+ /// \param VCCUsed [in] Whether VCC special SGPR is reserved.
+ /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved.
+ /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved.
+ /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one.
+ /// \param VGPRRange [in] Token range, used for VGPR diagnostics.
+ /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one.
+ /// \param SGPRRange [in] Token range, used for SGPR diagnostics.
+ /// \param VGPRBlocks [out] Result VGPR block count.
+ /// \param SGPRBlocks [out] Result SGPR block count.
+ bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed,
+ unsigned NextFreeVGPR, SMRange VGPRRange,
+ unsigned NextFreeSGPR, SMRange SGPRRange,
+ unsigned &VGPRBlocks, unsigned &SGPRBlocks);
+ bool ParseDirectiveAMDGCNTarget();
+ bool ParseDirectiveAMDHSAKernel();
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
bool ParseDirectiveHSACodeObjectVersion();
bool ParseDirectiveHSACodeObjectISA();
@@ -852,8 +886,12 @@ private:
bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
unsigned& RegNum, unsigned& RegWidth,
unsigned *DwordRegIndex);
+ Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind);
+ void initializeGprCountSymbol(RegisterKind RegKind);
+ bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex,
+ unsigned RegWidth);
void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn);
+ bool IsAtomic, bool IsAtomicReturn, bool IsLds = false);
void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded);
@@ -885,15 +923,37 @@ public:
AMDGPU::IsaInfo::IsaVersion ISA =
AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
MCContext &Ctx = getContext();
- MCSymbol *Sym =
- Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
+ if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ } else {
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
+ }
+ if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ initializeGprCountSymbol(IS_VGPR);
+ initializeGprCountSymbol(IS_SGPR);
+ } else
+ KernelScope.initialize(getContext());
}
- KernelScope.initialize(getContext());
+ }
+
+ bool hasXNACK() const {
+ return AMDGPU::hasXNACK(getSTI());
+ }
+
+ bool hasMIMG_R128() const {
+ return AMDGPU::hasMIMG_R128(getSTI());
+ }
+
+ bool hasPackedD16() const {
+ return AMDGPU::hasPackedD16(getSTI());
}
bool isSI() const {
@@ -1029,6 +1089,11 @@ private:
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
bool validateIntClampSupported(const MCInst &Inst);
+ bool validateMIMGAtomicDMask(const MCInst &Inst);
+ bool validateMIMGGatherDMask(const MCInst &Inst);
+ bool validateMIMGDataSize(const MCInst &Inst);
+ bool validateMIMGR128(const MCInst &Inst);
+ bool validateMIMGD16(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1065,17 +1130,12 @@ public:
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); }
void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
- AMDGPUOperand::Ptr defaultTFE() const;
- AMDGPUOperand::Ptr defaultDMask() const;
- AMDGPUOperand::Ptr defaultUNorm() const;
- AMDGPUOperand::Ptr defaultDA() const;
- AMDGPUOperand::Ptr defaultR128() const;
- AMDGPUOperand::Ptr defaultLWE() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
@@ -1281,15 +1341,31 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
-bool AMDGPUOperand::isSDWARegKind() const {
+bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
return isVReg();
else if (AsmParser->isGFX9())
- return isRegKind();
+ return isRegKind() || isInlinableImm(type);
else
return false;
}
+bool AMDGPUOperand::isSDWAFP16Operand() const {
+ return isSDWAOperand(MVT::f16);
+}
+
+bool AMDGPUOperand::isSDWAFP32Operand() const {
+ return isSDWAOperand(MVT::f32);
+}
+
+bool AMDGPUOperand::isSDWAInt16Operand() const {
+ return isSDWAOperand(MVT::i16);
+}
+
+bool AMDGPUOperand::isSDWAInt32Operand() const {
+ return isSDWAOperand(MVT::i32);
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
{
assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -1521,12 +1597,15 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("xnack_mask", AMDGPU::XNACK_MASK)
.Case("m0", AMDGPU::M0)
.Case("scc", AMDGPU::SCC)
.Case("tba", AMDGPU::TBA)
.Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
+ .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
@@ -1564,6 +1643,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
RegWidth = 2;
return true;
}
+ if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) {
+ Reg = AMDGPU::XNACK_MASK;
+ RegWidth = 2;
+ return true;
+ }
if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) {
Reg = AMDGPU::VCC;
RegWidth = 2;
@@ -1722,6 +1806,54 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
return true;
}
+Optional<StringRef>
+AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) {
+ switch (RegKind) {
+ case IS_VGPR:
+ return StringRef(".amdgcn.next_free_vgpr");
+ case IS_SGPR:
+ return StringRef(".amdgcn.next_free_sgpr");
+ default:
+ return None;
+ }
+}
+
+void AMDGPUAsmParser::initializeGprCountSymbol(RegisterKind RegKind) {
+ auto SymbolName = getGprCountSymbolName(RegKind);
+ assert(SymbolName && "initializing invalid register kind");
+ MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName);
+ Sym->setVariableValue(MCConstantExpr::create(0, getContext()));
+}
+
+bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
+ unsigned DwordRegIndex,
+ unsigned RegWidth) {
+ // Symbols are only defined for GCN targets
+ if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6)
+ return true;
+
+ auto SymbolName = getGprCountSymbolName(RegKind);
+ if (!SymbolName)
+ return true;
+ MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName);
+
+ int64_t NewMax = DwordRegIndex + RegWidth - 1;
+ int64_t OldCount;
+
+ if (!Sym->isVariable())
+ return !Error(getParser().getTok().getLoc(),
+ ".amdgcn.next_free_{v,s}gpr symbols must be variable");
+ if (!Sym->getVariableValue(false)->evaluateAsAbsolute(OldCount))
+ return !Error(
+ getParser().getTok().getLoc(),
+ ".amdgcn.next_free_{v,s}gpr symbols must be absolute expressions");
+
+ if (OldCount <= NewMax)
+ Sym->setVariableValue(MCConstantExpr::create(NewMax + 1, getContext()));
+
+ return true;
+}
+
std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
const auto &Tok = Parser.getTok();
SMLoc StartLoc = Tok.getLoc();
@@ -1732,7 +1864,11 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
return nullptr;
}
- KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
+ if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth))
+ return nullptr;
+ } else
+ KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
}
@@ -2239,6 +2375,111 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ int TFEIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe);
+
+ assert(VDataIdx != -1);
+ assert(DMaskIdx != -1);
+ assert(TFEIdx != -1);
+
+ unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
+ unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0;
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+ if (DMask == 0)
+ DMask = 1;
+
+ unsigned DataSize =
+ (Desc.TSFlags & SIInstrFlags::Gather4) ? 4 : countPopulation(DMask);
+ if (hasPackedD16()) {
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
+ if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm())
+ DataSize = (DataSize + 1) / 2;
+ }
+
+ return (VDataSize / 4) == DataSize + TFESize;
+}
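
A worked instance of the sizing rule above, as an illustrative sketch (register choices arbitrary): dmask:0x7 has three bits set, so vdata must cover three dwords, plus one more dword when tfe is set.

    image_load v[0:2], v[0:3], s[0:7] dmask:0x7 unorm       // 3 dmask bits -> 3 VGPRs
    image_load v[0:3], v[0:3], s[0:7] dmask:0x7 unorm tfe   // tfe adds one more VGPR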
+
+bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+ if (!Desc.mayLoad() || !Desc.mayStore())
+ return true; // Not atomic
+
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+
+ // This is an incomplete check because image_atomic_cmpswap
+ // may only use 0x3 and 0xf while other atomic operations
+ // may use 0x1 and 0x3. However these limitations are
+ // verified when we check that dmask matches dst size.
+ return DMask == 0x1 || DMask == 0x3 || DMask == 0xf;
+}
+
+bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::Gather4) == 0)
+ return true;
+
+ int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask);
+ unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
+
+ // GATHER4 instructions use dmask in a different fashion compared to
+ // other MIMG instructions. The only useful DMASK values are
+ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+ // (red,red,red,red) etc.) The ISA document doesn't mention
+ // this.
+ return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
+}
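
As the comment notes, only single-component dmasks are meaningful for gather4. A hypothetical form the check accepts (operands arbitrary), returning the red component of four texels into the four result registers:

    image_gather4 v[4:7], v[1:4], s[8:15], s[16:19] dmask:0x1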
+
+bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
+ assert(Idx != -1);
+
+ bool R128 = (Inst.getOperand(Idx).getImm() != 0);
+
+ return !R128 || hasMIMG_R128();
+}
+
+bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
+ if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) {
+ if (isCI() || isSI())
+ return false;
+ }
+
+ return true;
+}
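
On targets with packed d16 support (gfx9 in this change), the d16 modifier halves the data size computed in validateMIMGDataSize, so a full dmask fits in half as many VGPRs; on SI/CI the modifier is rejected by the check above. A hypothetical gfx9 form:

    image_load v[0:1], v[0:3], s[0:7] dmask:0xf unorm d16   // (4 + 1) / 2 = 2 VGPRs when packed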
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc) {
if (!validateConstantBusLimitations(Inst)) {
@@ -2256,6 +2497,32 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"integer clamping is not supported on this GPU");
return false;
}
+ if (!validateMIMGR128(Inst)) {
+ Error(IDLoc,
+ "r128 modifier is not supported on this GPU");
+ return false;
+ }
+ // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
+ if (!validateMIMGD16(Inst)) {
+ Error(IDLoc,
+ "d16 modifier is not supported on this GPU");
+ return false;
+ }
+ if (!validateMIMGDataSize(Inst)) {
+ Error(IDLoc,
+ "image data size does not match dmask and tfe");
+ return false;
+ }
+ if (!validateMIMGAtomicDMask(Inst)) {
+ Error(IDLoc,
+ "invalid atomic image dmask");
+ return false;
+ }
+ if (!validateMIMGGatherDMask(Inst)) {
+ Error(IDLoc,
+ "invalid image_gather dmask: only one bit must be set");
+ return false;
+ }
return true;
}
@@ -2360,6 +2627,320 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
return false;
}
+bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() {
+ if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
+ return TokError("directive only supported for amdgcn architecture");
+
+ std::string Target;
+
+ SMLoc TargetStart = getTok().getLoc();
+ if (getParser().parseEscapedString(Target))
+ return true;
+ SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());
+
+ std::string ExpectedTarget;
+ raw_string_ostream ExpectedTargetOS(ExpectedTarget);
+ IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS);
+
+ if (Target != ExpectedTargetOS.str())
+ return getParser().Error(TargetRange.Start, "target must match options",
+ TargetRange);
+
+ getTargetStreamer().EmitDirectiveAMDGCNTarget(Target);
+ return false;
+}
+
+bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
+ return getParser().Error(Range.Start, "value out of range", Range);
+}
+
+bool AMDGPUAsmParser::calculateGPRBlocks(
+ const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed,
+ bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange,
+ unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks,
+ unsigned &SGPRBlocks) {
+ // TODO(scott.linder): These calculations are duplicated from
+ // AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
+ IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features);
+
+ unsigned NumVGPRs = NextFreeVGPR;
+ unsigned NumSGPRs = NextFreeSGPR;
+ unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features);
+
+ if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
+
+ NumSGPRs +=
+ IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed);
+
+ if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
+
+ if (Features.test(FeatureSGPRInitBug))
+ NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+
+ VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs);
+ SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs);
+
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
+ if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
+ return TokError("directive only supported for amdgcn architecture");
+
+ if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA)
+ return TokError("directive only supported for amdhsa OS");
+
+ StringRef KernelName;
+ if (getParser().parseIdentifier(KernelName))
+ return true;
+
+ kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor();
+
+ StringSet<> Seen;
+
+ IsaInfo::IsaVersion IVersion =
+ IsaInfo::getIsaVersion(getSTI().getFeatureBits());
+
+ SMRange VGPRRange;
+ uint64_t NextFreeVGPR = 0;
+ SMRange SGPRRange;
+ uint64_t NextFreeSGPR = 0;
+ unsigned UserSGPRCount = 0;
+ bool ReserveVCC = true;
+ bool ReserveFlatScr = true;
+ bool ReserveXNACK = hasXNACK();
+
+ while (true) {
+ while (getLexer().is(AsmToken::EndOfStatement))
+ Lex();
+
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected .amdhsa_ directive or .end_amdhsa_kernel");
+
+ StringRef ID = getTok().getIdentifier();
+ SMRange IDRange = getTok().getLocRange();
+ Lex();
+
+ if (ID == ".end_amdhsa_kernel")
+ break;
+
+ if (Seen.find(ID) != Seen.end())
+ return TokError(".amdhsa_ directives cannot be repeated");
+ Seen.insert(ID);
+
+ SMLoc ValStart = getTok().getLoc();
+ int64_t IVal;
+ if (getParser().parseAbsoluteExpression(IVal))
+ return true;
+ SMLoc ValEnd = getTok().getLoc();
+ SMRange ValRange = SMRange(ValStart, ValEnd);
+
+ if (IVal < 0)
+ return OutOfRangeError(ValRange);
+
+ uint64_t Val = IVal;
+
+#define PARSE_BITS_ENTRY(FIELD, ENTRY, VALUE, RANGE) \
+ if (!isUInt<ENTRY##_WIDTH>(VALUE)) \
+ return OutOfRangeError(RANGE); \
+ AMDHSA_BITS_SET(FIELD, ENTRY, VALUE);
+
+ if (ID == ".amdhsa_group_segment_fixed_size") {
+ if (!isUInt<sizeof(KD.group_segment_fixed_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.group_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_private_segment_fixed_size") {
+ if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.private_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
+ ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
+ Val, ValRange);
+ UserSGPRCount++;
+ } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_y") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_id_z") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_sgpr_workgroup_info") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_system_vgpr_workitem_id") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_next_free_vgpr") {
+ VGPRRange = ValRange;
+ NextFreeVGPR = Val;
+ } else if (ID == ".amdhsa_next_free_sgpr") {
+ SGPRRange = ValRange;
+ NextFreeSGPR = Val;
+ } else if (ID == ".amdhsa_reserve_vcc") {
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveVCC = Val;
+ } else if (ID == ".amdhsa_reserve_flat_scratch") {
+ if (IVersion.Major < 7)
+ return getParser().Error(IDRange.Start, "directive requires gfx7+",
+ IDRange);
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveFlatScr = Val;
+ } else if (ID == ".amdhsa_reserve_xnack_mask") {
+ if (IVersion.Major < 8)
+ return getParser().Error(IDRange.Start, "directive requires gfx8+",
+ IDRange);
+ if (!isUInt<1>(Val))
+ return OutOfRangeError(ValRange);
+ ReserveXNACK = Val;
+ } else if (ID == ".amdhsa_float_round_mode_32") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange);
+ } else if (ID == ".amdhsa_float_round_mode_16_64") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, Val, ValRange);
+ } else if (ID == ".amdhsa_float_denorm_mode_32") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, Val, ValRange);
+ } else if (ID == ".amdhsa_float_denorm_mode_16_64") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_dx10_clamp") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange);
+ } else if (ID == ".amdhsa_ieee_mode") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_fp16_overflow") {
+ if (IVersion.Major < 9)
+ return getParser().Error(IDRange.Start, "directive requires gfx9+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_denorm_src") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_div_zero") {
+ PARSE_BITS_ENTRY(
+ KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_overflow") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_underflow") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_fp_ieee_inexact") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT,
+ Val, ValRange);
+ } else if (ID == ".amdhsa_exception_int_div_zero") {
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
+ Val, ValRange);
+ } else {
+ return getParser().Error(IDRange.Start,
+ "unknown .amdhsa_kernel directive", IDRange);
+ }
+
+#undef PARSE_BITS_ENTRY
+ }
+
+ if (Seen.find(".amdhsa_next_free_vgpr") == Seen.end())
+ return TokError(".amdhsa_next_free_vgpr directive is required");
+
+ if (Seen.find(".amdhsa_next_free_sgpr") == Seen.end())
+ return TokError(".amdhsa_next_free_sgpr directive is required");
+
+ unsigned VGPRBlocks;
+ unsigned SGPRBlocks;
+ if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
+ ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR,
+ SGPRRange, VGPRBlocks, SGPRBlocks))
+ return true;
+
+ if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
+ VGPRBlocks))
+ return OutOfRangeError(VGPRRange);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, VGPRBlocks);
+
+ if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
+ SGPRBlocks))
+ return OutOfRangeError(SGPRRange);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
+ SGPRBlocks);
+
+ if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
+ return TokError("too many user SGPRs enabled");
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
+ UserSGPRCount);
+
+ getTargetStreamer().EmitAmdhsaKernelDescriptor(
+ getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
+ ReserveFlatScr, ReserveXNACK);
+ return false;
+}
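
For reference, a minimal directive block of the kind this new parser accepts, sketched with a hypothetical kernel name, register counts, and target string (the target string must match the ISA version stream for the subtarget being assembled; since any absolute expression is accepted, the .amdgcn.next_free_{v,s}gpr counter symbols maintained elsewhere in this change can also serve as the values):

    .amdgcn_target "amdgcn-amd-amdhsa--gfx900"
    .amdhsa_kernel hypothetical_kernel
      .amdhsa_user_sgpr_kernarg_segment_ptr 1
      .amdhsa_next_free_vgpr 8
      .amdhsa_next_free_sgpr 16
    .end_amdhsa_kernel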
+
bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
uint32_t Major;
uint32_t Minor;
@@ -2426,6 +3007,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
amd_kernel_code_t &Header) {
+ // max_scratch_backing_memory_byte_size is deprecated. Ignore it while parsing
+ // assembly for backwards compatibility.
+ if (ID == "max_scratch_backing_memory_byte_size") {
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
SmallString<40> ErrStr;
raw_svector_ostream Err(ErrStr);
if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
@@ -2472,7 +3060,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
ELF::STT_AMDGPU_HSA_KERNEL);
Lex();
- KernelScope.initialize(getContext());
+ if (!AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI()))
+ KernelScope.initialize(getContext());
return false;
}
@@ -2576,20 +3165,28 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (IDVal == ".hsa_code_object_version")
- return ParseDirectiveHSACodeObjectVersion();
+ if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (IDVal == ".amdgcn_target")
+ return ParseDirectiveAMDGCNTarget();
+
+ if (IDVal == ".amdhsa_kernel")
+ return ParseDirectiveAMDHSAKernel();
+ } else {
+ if (IDVal == ".hsa_code_object_version")
+ return ParseDirectiveHSACodeObjectVersion();
- if (IDVal == ".hsa_code_object_isa")
- return ParseDirectiveHSACodeObjectISA();
+ if (IDVal == ".hsa_code_object_isa")
+ return ParseDirectiveHSACodeObjectISA();
- if (IDVal == ".amd_kernel_code_t")
- return ParseDirectiveAMDKernelCodeT();
+ if (IDVal == ".amd_kernel_code_t")
+ return ParseDirectiveAMDKernelCodeT();
- if (IDVal == ".amdgpu_hsa_kernel")
- return ParseDirectiveAMDGPUHsaKernel();
+ if (IDVal == ".amdgpu_hsa_kernel")
+ return ParseDirectiveAMDGPUHsaKernel();
- if (IDVal == ".amd_amdgpu_isa")
- return ParseDirectiveISAVersion();
+ if (IDVal == ".amd_amdgpu_isa")
+ return ParseDirectiveISAVersion();
+ }
if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
return ParseDirectiveHSAMetadata();
@@ -2617,6 +3214,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
return !isGFX9();
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ return !isCI() && !isSI() && hasXNACK();
default:
break;
}
@@ -3163,7 +3764,10 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
HwReg.IsSymbolic = true;
HwReg.Id = ID_UNKNOWN_;
const StringRef tok = Parser.getTok().getString();
- for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) {
+ int Last = ID_SYMBOLIC_LAST_;
+ if (isSI() || isCI() || isVI())
+ Last = ID_SYMBOLIC_FIRST_GFX9_;
+ for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) {
if (tok == IdSymbolic[i]) {
HwReg.Id = i;
break;
@@ -3912,13 +4516,13 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE);
-}
-
void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn) {
+ bool IsAtomic,
+ bool IsAtomicReturn,
+ bool IsLds) {
+ bool IsLdsOpcode = IsLds;
+ bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
assert(IsAtomicReturn ? IsAtomic : true);
@@ -3937,6 +4541,8 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
continue;
}
+ HasLdsModifier = Op.isLDS();
+
// Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
if (Op.isToken()) {
@@ -3948,6 +4554,21 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
OptionalIdx[Op.getImmTy()] = i;
}
+ // This is a workaround for an llvm quirk which may result in an
+ // incorrect instruction selection. Lds and non-lds versions of
+ // MUBUF instructions are identical except that lds versions
+ // have mandatory 'lds' modifier. However this modifier follows
+ // optional modifiers and llvm asm matcher regards this 'lds'
+ // modifier as an optional one. As a result, an lds version
+ // of opcode may be selected even if it has no 'lds' modifier.
+ if (IsLdsOpcode && !HasLdsModifier) {
+ int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode());
+ if (NoLdsOpcode != -1) { // Got lds version - correct it.
+ Inst.setOpcode(NoLdsOpcode);
+ IsLdsOpcode = false;
+ }
+ }
+
// Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
if (IsAtomicReturn) {
MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
@@ -3959,7 +4580,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+
+ if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ }
}
void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -4014,7 +4638,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
if (IsAtomic) {
// Add src, same as dst
- ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
+ assert(Desc.getNumDefs() == 1);
+ ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1);
}
OptionalImmIndexMap OptionalIdx;
@@ -4023,9 +4648,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isRegOrImm()) {
- Op.addRegOrImmOperands(Inst, 1);
- continue;
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
@@ -4036,37 +4660,18 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
}
void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
cvtMIMG(Inst, Operands, true);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE);
-}
-
//===----------------------------------------------------------------------===//
// smrd
//===----------------------------------------------------------------------===//
@@ -4153,6 +4758,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr},
{"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
{"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
+ {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
{"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
@@ -4160,6 +4766,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
+ {"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"high", AMDGPUOperand::ImmTyHigh, true, nullptr},
{"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr},
{"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
@@ -4167,6 +4774,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"da", AMDGPUOperand::ImmTyDA, true, nullptr},
{"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
+ {"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
{"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
{"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
@@ -4379,12 +4987,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
}
- // special case v_mac_{f16, f32}:
+ // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906):
// it has src2 register operand that is tied to dst operand
// we don't allow modifiers for this operand in assembler so src2_modifiers
- // should be 0
- if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi ||
- Opc == AMDGPU::V_MAC_F16_e64_vi) {
+ // should be 0.
+ if (Opc == AMDGPU::V_MAC_F32_e64_si ||
+ Opc == AMDGPU::V_MAC_F32_e64_vi ||
+ Opc == AMDGPU::V_MAC_F16_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F32_e64_vi) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
@@ -4486,21 +5096,23 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
//===----------------------------------------------------------------------===//
bool AMDGPUOperand::isDPPCtrl() const {
+ using namespace AMDGPU::DPP;
+
bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm());
if (result) {
int64_t Imm = getImm();
- return ((Imm >= 0x000) && (Imm <= 0x0ff)) ||
- ((Imm >= 0x101) && (Imm <= 0x10f)) ||
- ((Imm >= 0x111) && (Imm <= 0x11f)) ||
- ((Imm >= 0x121) && (Imm <= 0x12f)) ||
- (Imm == 0x130) ||
- (Imm == 0x134) ||
- (Imm == 0x138) ||
- (Imm == 0x13c) ||
- (Imm == 0x140) ||
- (Imm == 0x141) ||
- (Imm == 0x142) ||
- (Imm == 0x143);
+ return (Imm >= DppCtrl::QUAD_PERM_FIRST && Imm <= DppCtrl::QUAD_PERM_LAST) ||
+ (Imm >= DppCtrl::ROW_SHL_FIRST && Imm <= DppCtrl::ROW_SHL_LAST) ||
+ (Imm >= DppCtrl::ROW_SHR_FIRST && Imm <= DppCtrl::ROW_SHR_LAST) ||
+ (Imm >= DppCtrl::ROW_ROR_FIRST && Imm <= DppCtrl::ROW_ROR_LAST) ||
+ (Imm == DppCtrl::WAVE_SHL1) ||
+ (Imm == DppCtrl::WAVE_ROL1) ||
+ (Imm == DppCtrl::WAVE_SHR1) ||
+ (Imm == DppCtrl::WAVE_ROR1) ||
+ (Imm == DppCtrl::ROW_MIRROR) ||
+ (Imm == DppCtrl::ROW_HALF_MIRROR) ||
+ (Imm == DppCtrl::BCAST15) ||
+ (Imm == DppCtrl::BCAST31);
}
return false;
}
@@ -4519,6 +5131,8 @@ bool AMDGPUOperand::isU16Imm() const {
OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ using namespace AMDGPU::DPP;
+
SMLoc S = Parser.getTok().getLoc();
StringRef Prefix;
int64_t Int;
@@ -4530,10 +5144,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
}
if (Prefix == "row_mirror") {
- Int = 0x140;
+ Int = DppCtrl::ROW_MIRROR;
Parser.Lex();
} else if (Prefix == "row_half_mirror") {
- Int = 0x141;
+ Int = DppCtrl::ROW_HALF_MIRROR;
Parser.Lex();
} else {
// Check to prevent parseDPPCtrlOps from eating invalid tokens
@@ -4585,24 +5199,24 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
return MatchOperand_ParseFail;
if (Prefix == "row_shl" && 1 <= Int && Int <= 15) {
- Int |= 0x100;
+ Int |= DppCtrl::ROW_SHL0;
} else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) {
- Int |= 0x110;
+ Int |= DppCtrl::ROW_SHR0;
} else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) {
- Int |= 0x120;
+ Int |= DppCtrl::ROW_ROR0;
} else if (Prefix == "wave_shl" && 1 == Int) {
- Int = 0x130;
+ Int = DppCtrl::WAVE_SHL1;
} else if (Prefix == "wave_rol" && 1 == Int) {
- Int = 0x134;
+ Int = DppCtrl::WAVE_ROL1;
} else if (Prefix == "wave_shr" && 1 == Int) {
- Int = 0x138;
+ Int = DppCtrl::WAVE_SHR1;
} else if (Prefix == "wave_ror" && 1 == Int) {
- Int = 0x13C;
+ Int = DppCtrl::WAVE_ROR1;
} else if (Prefix == "row_bcast") {
if (Int == 15) {
- Int = 0x142;
+ Int = DppCtrl::BCAST15;
} else if (Int == 31) {
- Int = 0x143;
+ Int = DppCtrl::BCAST31;
} else {
return MatchOperand_ParseFail;
}
@@ -4780,7 +5394,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
}
}
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegWithInputModsOperands(Inst, 2);
+ Op.addRegOrImmWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = I;
@@ -4862,6 +5476,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isAddr64() ? Match_Success : Match_InvalidOperand;
case MCK_gds:
return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
+ case MCK_lds:
+ return Operand.isLDS() ? Match_Success : Match_InvalidOperand;
case MCK_glc:
return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
case MCK_idxen:
diff --git a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 2230457b3a9b..b87c47a6b9ee 100644
--- a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -52,14 +52,19 @@ class getAddrName<int addrKind> {
"")))));
}
-class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+class MUBUFAddr64Table <bit is_addr64, string Name> {
bit IsAddr64 = is_addr64;
- string OpName = NAME # suffix;
+ string OpName = Name;
}
-class MTBUFAddr64Table <bit is_addr64, string suffix = ""> {
+class MUBUFLdsTable <bit is_lds, string Name> {
+ bit IsLds = is_lds;
+ string OpName = Name;
+}
+
+class MTBUFAddr64Table <bit is_addr64, string Name> {
bit IsAddr64 = is_addr64;
- string OpName = NAME # suffix;
+ string OpName = Name;
}
//===----------------------------------------------------------------------===//
@@ -137,17 +142,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe),
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe)
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
- slc:$slc, tfe:$tfe),
+ SLC:$slc, TFE:$tfe),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
- slc:$slc, tfe:$tfe)
+ SLC:$slc, TFE:$tfe)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -214,13 +219,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MTBUFAddr64Table<0>;
+ MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MTBUFAddr64Table<1>;
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -260,13 +265,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i1:$slc, i1:$tfe))]>,
- MTBUFAddr64Table<0>;
+ MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i1:$slc, i1:$tfe))]>,
- MTBUFAddr64Table<1>;
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -310,6 +315,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> offen = 0;
bits<1> idxen = 0;
bits<1> addr64 = 0;
+ bits<1> lds = 0;
bits<1> has_vdata = 1;
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
@@ -336,7 +342,6 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
- bits<1> lds = 0;
bits<8> vaddr;
bits<8> vdata;
bits<7> srsrc;
@@ -371,31 +376,35 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> :
}
class getMUBUFInsDA<list<RegisterClass> vdataList,
- list<RegisterClass> vaddrList=[]> {
+ list<RegisterClass> vaddrList=[],
+ bit isLds = 0> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ offset:$offset, GLC:$glc, SLC:$slc),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ offset:$offset, GLC:$glc, SLC:$slc)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc)
);
- dag ret = !if(!empty(vdataList), InsNoData, InsData);
+ dag ret = !con(
+ !if(!empty(vdataList), InsNoData, InsData),
+ !if(isLds, (ins), (ins TFE:$tfe))
+ );
}
-class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
dag ret =
- !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret,
(ins))))));
}
@@ -426,20 +435,29 @@ class MUBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
bit HasTiedDest = 0,
+ bit isLds = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs vdataClass:$vdata),
- !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ !con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
+ !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
+ !if(isLds, " lds", "$tfe"),
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
- let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let PseudoInstr = opName # !if(isLds, "_lds", "") #
+ "_" # getAddrName<addrKindCopy>.ret;
+ let AsmMatchConverter = !if(isLds, "cvtMubufLds", "cvtMubuf");
+
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
let mayLoad = 1;
let mayStore = 0;
let maybeAtomic = 1;
+ let Uses = !if(isLds, [EXEC, M0], [EXEC]);
+ let has_tfe = !if(isLds, 0, 1);
+ let lds = isLds;
}
// FIXME: tfe can't be an operand because it requires a separate
@@ -447,32 +465,45 @@ class MUBUF_Load_Pseudo <string opName,
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag,
- bit TiedDest = 0> {
+ bit TiedDest = 0,
+ bit isLds = 0> {
def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
- TiedDest,
- [(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MUBUFAddr64Table<0>;
+ TiedDest, isLds,
+ !if(isLds,
+ [],
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
- TiedDest,
- [(set load_vt:$vdata,
- (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
- MUBUFAddr64Table<1>;
+ TiedDest, isLds,
+ !if(isLds,
+ [],
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
}
}
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld_nolds = null_frag,
+ SDPatternOperator ld_lds = null_frag> {
+ defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>;
+ defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>;
+}
+
class MUBUF_Store_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
@@ -499,12 +530,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
- MUBUFAddr64Table<0>;
+ MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
- MUBUFAddr64Table<1>;
+ MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -518,6 +549,23 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
}
}
+class MUBUF_Pseudo_Store_Lds<string opName>
+ : MUBUF_Pseudo<opName,
+ (outs),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
+ " $srsrc, $soffset$offset lds$glc$slc"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let maybeAtomic = 1;
+
+ let has_vdata = 0;
+ let has_vaddr = 0;
+ let has_tfe = 0;
+ let lds = 1;
+
+ let Uses = [EXEC, M0];
+ let AsmMatchConverter = "cvtMubufLds";
+}
class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
list<RegisterClass> vaddrList=[]> {
@@ -525,15 +573,15 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
dag ret = !if(vdata_in,
!if(!empty(vaddrList),
(ins vdataClass:$vdata_in,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
(ins vdataClass:$vdata_in, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
),
!if(!empty(vaddrList),
(ins vdataClass:$vdata,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
(ins vdataClass:$vdata, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
));
}
@@ -618,9 +666,9 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
SDPatternOperator atomic> {
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
- MUBUFAddr64Table <0>;
+ MUBUFAddr64Table <0, NAME>;
def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
- MUBUFAddr64Table <1>;
+ MUBUFAddr64Table <1, NAME>;
def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
@@ -629,13 +677,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
[(set vdataType:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, "_RTN">;
+ MUBUFAddr64Table <0, NAME # "_RTN">;
def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, "_RTN">;
+ MUBUFAddr64Table <1, NAME # "_RTN">;
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
@@ -647,7 +695,7 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
// MUBUF Instructions
//===----------------------------------------------------------------------===//
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds <
"buffer_load_format_x", VGPR_32
>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
@@ -671,19 +719,74 @@ defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
"buffer_store_format_xyzw", VReg_128
>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads <
+
+let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
+ defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xy", VReg_64
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyz", VReg_96
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyzw", VReg_128
+ >;
+ defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xy", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyz", VReg_96
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyzw", VReg_128
+ >;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
+ defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xy", VGPR_32
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyz", VReg_64
+ >;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_xyzw", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_x", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xy", VGPR_32
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyz", VReg_64
+ >;
+ defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_xyzw", VReg_64
+ >;
+} // End HasPackedD16VMem.
+
+defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
"buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
"buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
>;
-defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
"buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
"buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
>;
-defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads <
+defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
"buffer_load_dword", VGPR_32, i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
@@ -695,6 +798,22 @@ defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
>;
+
+// This is not described in AMD documentation,
+// but 'lds' versions of these opcodes are available
+// in at least GFX8+ chips. See Bug 37653.
+let SubtargetPredicate = isVI in {
+defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
+>;
+defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1
+>;
+defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1
+>;
+}
+
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
"buffer_store_byte", VGPR_32, i32, truncstorei8_global
>;
@@ -792,6 +911,10 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
>;
+let SubtargetPredicate = isVI in {
+def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
+}
+
let SubtargetPredicate = isSI in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
@@ -842,6 +965,13 @@ defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
"buffer_store_short_d16_hi", VGPR_32, i32
>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_d16_hi_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_d16_hi_x", VGPR_32
+>;
+
} // End HasD16LoadStore
def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
@@ -860,6 +990,28 @@ defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy",
defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
+} // End HasPackedD16VMem.
+
let SubtargetPredicate = isCIVI in {
//===----------------------------------------------------------------------===//
@@ -922,6 +1074,19 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -969,6 +1134,19 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1210,7 +1388,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
@@ -1325,7 +1503,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OF
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {
@@ -1382,6 +1560,18 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
@@ -1431,6 +1621,18 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY"
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+} // End HasUnpackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
//===----------------------------------------------------------------------===//
// Target instructions, move to the appropriate target TD file
//===----------------------------------------------------------------------===//
@@ -1451,7 +1653,7 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{15} = ps.addr64;
- let Inst{16} = lds;
+ let Inst{16} = !if(ps.lds, 1, 0);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
@@ -1470,6 +1672,31 @@ multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
+multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> {
+
+ def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_si">;
+ def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
+ MUBUFLdsTable<0, NAME # "_ADDR64_si">;
+ def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_si">;
+ def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_si">;
+ def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_si">;
+
+ def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_si">;
+ def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
+ MUBUFLdsTable<1, NAME # "_ADDR64_si">;
+ def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_si">;
+ def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_si">;
+ def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_si">;
+}
+
multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
@@ -1478,7 +1705,7 @@ multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>;
@@ -1486,11 +1713,11 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>;
@@ -1575,7 +1802,7 @@ multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
-//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
@@ -1610,7 +1837,7 @@ class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{16} = lds;
+ let Inst{16} = !if(ps.lds, 1, 0);
let Inst{17} = !if(ps.has_slc, slc, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
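
The Inst{...} assignments in MUBUF_Real_vi (and in the MUBUF_Real_gfx80 class added further below) spell out how the pseudo's fields are packed into the 64-bit MUBUF word, with the lds flag now taken from the pseudo at bit 16. A rough, self-contained C++ sketch of that packing follows, for illustration only; the function and parameter names are invented, and the layout simply mirrors the field assignments visible in this diff.

#include <cstdint>

// Illustrative only: pack MUBUF (VI/gfx80-style) fields the way the
// Inst{...} assignments above describe. Names are invented for the sketch.
uint64_t encodeMubufVi(uint16_t Offset12, bool Offen, bool Idxen, bool Glc,
                       bool Lds, bool Slc, uint8_t Op7, uint8_t VAddr,
                       uint8_t VData, uint8_t SRsrcSgpr, bool Tfe,
                       uint8_t SOffset) {
  uint64_t Inst = 0;
  Inst |= uint64_t(Offset12 & 0xfff);              // Inst{11-0}  offset
  Inst |= uint64_t(Offen)  << 12;                  // Inst{12}    offen
  Inst |= uint64_t(Idxen)  << 13;                  // Inst{13}    idxen
  Inst |= uint64_t(Glc)    << 14;                  // Inst{14}    glc
  Inst |= uint64_t(Lds)    << 16;                  // Inst{16}    lds (from ps.lds)
  Inst |= uint64_t(Slc)    << 17;                  // Inst{17}    slc
  Inst |= uint64_t(Op7 & 0x7f) << 18;              // Inst{24-18} opcode
  Inst |= uint64_t(0x38)   << 26;                  // Inst{31-26} MUBUF encoding
  Inst |= uint64_t(VAddr)  << 32;                  // Inst{39-32} vaddr
  Inst |= uint64_t(VData)  << 40;                  // Inst{47-40} vdata
  Inst |= uint64_t((SRsrcSgpr >> 2) & 0x1f) << 48; // Inst{52-48} srsrc{6-2}
  Inst |= uint64_t(Tfe)    << 55;                  // Inst{55}    tfe
  Inst |= uint64_t(SOffset) << 56;                 // Inst{63-56} soffset
  return Inst;
}
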
@@ -1628,6 +1855,56 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
+multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
+
+ def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_vi">;
+ def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_vi">;
+ def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_vi">;
+ def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_vi">;
+
+ def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_vi">;
+ def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_vi">;
+ def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_vi">;
+ def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_vi">;
+}
+
+class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
+ MUBUF_Real<op, ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
+ let AssemblerPredicate=HasUnpackedD16VMem;
+ let DecoderNamespace="GFX80_UNPACKED";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{16} = !if(ps.lds, 1, 0);
+ let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> {
+ def _OFFSET_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
MUBUF_Real_AllAddr_vi<op> {
def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
@@ -1636,7 +1913,7 @@ multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>;
@@ -1644,14 +1921,34 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x08>;
+ defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x09>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0a>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0b>;
+ defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0c>;
+ defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0d>;
+ defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0e>;
+ defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0f>;
+} // End HasUnpackedD16VMem.
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>;
+ defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>;
+ defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>;
+ defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>;
+ defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>;
+ defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>;
+ defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>;
+} // End HasPackedD16VMem.
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_vi <0x10>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>;
defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>;
defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
@@ -1668,6 +1965,9 @@ defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x23>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_vi <0x24>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x25>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x26>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x27>;
+
defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>;
defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>;
@@ -1696,6 +1996,8 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>;
+def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
+
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
@@ -1729,11 +2031,61 @@ multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
}
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>;
-//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
+class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real<ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
+ let AssemblerPredicate=HasUnpackedD16VMem;
+ let DecoderNamespace="GFX80_UNPACKED";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{18-15} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MTBUF_Real_AllAddr_gfx80<bits<4> op> {
+ def _OFFSET_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>;
+let SubtargetPredicate = HasUnpackedD16VMem in {
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x08>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x09>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0a>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0b>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0c>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0d>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0e>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0f>;
+} // End HasUnpackedD16VMem.
+let SubtargetPredicate = HasPackedD16VMem in {
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
+} // End HasPackedD16VMem.
diff --git a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
index f898fd7948cc..cdc6ab9412e6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -440,7 +440,7 @@ defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
-defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
@@ -584,6 +584,8 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
}
+def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
+
} // let SubtargetPredicate = isVI
//===----------------------------------------------------------------------===//
@@ -600,8 +602,6 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(inst $ptr, (as_i16imm $offset), (i1 0))
>;
-// FIXME: Passing name of PatFrag in workaround. Why doesn't
-// !cast<PatFrag>(frag.NAME#"_m0") work!?
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
@@ -609,7 +609,7 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
}
@@ -647,14 +647,17 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
+defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
+defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
let AddedComplexity = 100 in {
defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
+defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
let AddedComplexity = 100 in {
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
@@ -678,7 +681,24 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+// Irritatingly, atomic_store reverses the order of operands from a
+// normal store.
+class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+>;
+
+multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
}
@@ -687,8 +707,10 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
}
@@ -720,6 +742,8 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
let AddedComplexity = 100 in {
defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
+
} // End AddedComplexity = 100
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
@@ -732,7 +756,8 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag)>;
}
}
@@ -749,7 +774,8 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag)>;
}
}
@@ -769,6 +795,9 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">;
// 64-bit atomics.
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
@@ -1123,6 +1152,7 @@ def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
+def DS_ADD_SRC2_F32_vi : DS_Real_vi<0x95, DS_ADD_SRC2_F32>;
def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 47a2d3f2fdc5..f3de903f21b2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -20,7 +20,9 @@
#include "Disassembler/AMDGPUDisassembler.h"
#include "AMDGPU.h"
#include "AMDGPURegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/Disassembler.h"
#include "llvm/ADT/APInt.h"
@@ -198,6 +200,21 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
if (Res) { IsSDWA = true; break; }
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
+ Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
+ if (Res)
+ break;
+ }
+
+ // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
+ // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
+ // table first so we print the correct name.
+ if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+ if (Res)
+ break;
+ }
}
// Reinitialize Bytes as DPP64 could have eaten too much
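
The hunk above extends the decode loop so that feature-gated tables are consulted ahead of the generic ones: the GFX80_UNPACKED table when FeatureUnpackedD16VMem is set, and the GFX9 "DL" table when FeatureFmaMixInsts is set, so the repurposed FMA-mix variants print under the correct mnemonic. A minimal sketch of that first-match-wins ordering, with invented names and no dependence on the real tryDecodeInst signature:

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Sketch only: each entry stands in for one decoder table, gated on a
// subtarget feature bit; the first enabled table that matches wins, so
// more specific tables must be listed before generic ones.
struct DecoderTableSketch {
  std::string Name;                       // e.g. "GFX80_UNPACKED", "GFX9_DL"
  bool Enabled;                           // subtarget feature check
  std::function<bool(uint64_t)> Matches;  // stand-in for a table lookup
};

std::string decodeWithPriority(uint64_t Inst,
                               const std::vector<DecoderTableSketch> &Tables) {
  for (const DecoderTableSketch &T : Tables)
    if (T.Enabled && T.Matches(Inst))
      return T.Name;
  return "<unknown>";
}
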
@@ -228,7 +245,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
- MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
+ MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
@@ -241,7 +259,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && IsSDWA)
Res = convertSDWAInst(MI);
- Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
+ // If the opcode was not recognized, we'll assume a Size of 4 bytes
+ // (unless there are fewer bytes left)
+ Size = Res ? (MaxInstBytesNum - Bytes.size())
+ : std::min((size_t)4, Bytes_.size());
return Res;
}
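
The Size change above matters for disassembler robustness: an unrecognized encoding no longer reports a size of 0 (which would stall the caller) but a conservative 4 bytes, clamped to whatever input remains. A hedged stand-alone equivalent, with made-up names:

#include <algorithm>
#include <cstddef>

// Sketch of the fallback: report the bytes actually consumed on success,
// otherwise a conservative 4 bytes so the caller keeps advancing, but never
// more than the bytes still available.
std::size_t reportedSize(bool Recognized, std::size_t Consumed,
                         std::size_t BytesLeft) {
  return Recognized ? Consumed : std::min<std::size_t>(4, BytesLeft);
}
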
@@ -264,26 +285,70 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+// Note that MIMG format provides no information about VADDR size.
+// Consequently, decoded instructions always show the address
+// as if it were a single dword, which may not actually be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+
+ int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst);
+
int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdata);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
+
+ int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::tfe);
+ int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::d16);
+
+ assert(VDataIdx != -1);
+ assert(DMaskIdx != -1);
+ assert(TFEIdx != -1);
+
+ bool IsAtomic = (VDstIdx != -1);
+ bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
+
unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
if (DMask == 0)
return MCDisassembler::Success;
- unsigned ChannelCount = countPopulation(DMask);
- if (ChannelCount == 1)
+ unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask);
+ if (DstSize == 1)
return MCDisassembler::Success;
- int NewOpcode = AMDGPU::getMaskedMIMGOp(*MCII, MI.getOpcode(), ChannelCount);
- assert(NewOpcode != -1 && "could not find matching mimg channel instruction");
+ bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
+ if (D16 && AMDGPU::hasPackedD16(STI)) {
+ DstSize = (DstSize + 1) / 2;
+ }
+
+ // FIXME: Add tfe support
+ if (MI.getOperand(TFEIdx).getImm())
+ return MCDisassembler::Success;
+
+ int NewOpcode = -1;
+
+ if (IsGather4) {
+ if (D16 && AMDGPU::hasPackedD16(STI))
+ NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2);
+ else
+ return MCDisassembler::Success;
+ } else {
+ NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize);
+ if (NewOpcode == -1)
+ return MCDisassembler::Success;
+ }
+
auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
- // Widen the register to the correct number of enabled channels.
+ // Get first subregister of VData
unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
+ unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
+ Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
+
+ // Widen the register to the correct number of enabled channels.
auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
&MRI.getRegClass(RCID));
if (NewVdata == AMDGPU::NoRegister) {
@@ -297,6 +362,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
// how it is usually emitted because the number of register components is not
// in the instruction encoding.
MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
+
+ if (IsAtomic) {
+ // Atomic operations have an additional operand (a copy of data)
+ MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
+ }
+
return MCDisassembler::Success;
}
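
The rewritten convertMIMGInst() above derives the destination register width as follows: gather4 results always occupy four dwords, ordinary ops use the population count of dmask, packed-D16 targets halve the dword count (rounding up), and atomics additionally mirror the widened register into vdst. A small C++ sketch of just the width computation (names invented; TFE remains unhandled, as the FIXME notes):

#include <bitset>

// Sketch: number of result dwords for a MIMG op, per the logic above.
unsigned mimgDstDwords(unsigned DMask, bool IsGather4, bool IsD16,
                       bool HasPackedD16) {
  unsigned DstSize =
      IsGather4 ? 4u : unsigned(std::bitset<4>(DMask & 0xfu).count());
  if (IsD16 && HasPackedD16)
    DstSize = (DstSize + 1) / 2;   // two half-words packed per dword
  return DstSize;
}
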
@@ -690,9 +761,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
switch (Val) {
case 102: return createRegOperand(FLAT_SCR_LO);
case 103: return createRegOperand(FLAT_SCR_HI);
- // ToDo: no support for xnack_mask_lo/_hi register
- case 104:
- case 105: break;
+ case 104: return createRegOperand(XNACK_MASK_LO);
+ case 105: return createRegOperand(XNACK_MASK_HI);
case 106: return createRegOperand(VCC_LO);
case 107: return createRegOperand(VCC_HI);
case 108: assert(!isGFX9()); return createRegOperand(TBA_LO);
@@ -722,6 +792,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
switch (Val) {
case 102: return createRegOperand(FLAT_SCR);
+ case 104: return createRegOperand(XNACK_MASK);
case 106: return createRegOperand(VCC);
case 108: assert(!isGFX9()); return createRegOperand(TBA);
case 110: assert(!isGFX9()); return createRegOperand(TMA);
@@ -732,8 +803,9 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
}
MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
- unsigned Val) const {
+ const unsigned Val) const {
using namespace AMDGPU::SDWA;
+ using namespace AMDGPU::EncValues;
if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
// XXX: static_cast<int> is needed to avoid stupid warning:
@@ -754,7 +826,15 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
Val - SDWA9EncValues::SRC_TTMP_MIN);
}
- return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;
+
+ if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
+ return decodeIntImmed(SVal);
+
+ if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
+ return decodeFPImmed(Width, SVal);
+
+ return decodeSpecialReg32(SVal);
} else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
return createRegOperand(getVgprClassId(Width), Val);
}
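
For GFX9 SDWA sources, the hunk above adds inline-constant handling: once the SGPR bias is subtracted, the value is classified first as an inline integer constant, then as an inline floating-point constant, and only then as a special register. A hedged sketch of that ordering, with the range bounds passed in rather than using the real EncValues constants:

// Sketch only: classification order for a rebased SDWA source value.
enum class SdwaSrcKind { InlineInt, InlineFp, SpecialReg };

SdwaSrcKind classifySdwaSrc(unsigned SVal, unsigned IntMin, unsigned IntMax,
                            unsigned FpMin, unsigned FpMax) {
  if (IntMin <= SVal && SVal <= IntMax)
    return SdwaSrcKind::InlineInt;
  if (FpMin <= SVal && SVal <= FpMax)
    return SdwaSrcKind::InlineFp;
  return SdwaSrcKind::SpecialReg;   // falls back to a special-register decode
}
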
@@ -815,6 +895,9 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
}
auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
+ if (!Symbols)
+ return false;
+
auto Result = std::find_if(Symbols->begin(), Symbols->end(),
[Value](const SymbolInfoTy& Val) {
return std::get<0>(Val) == static_cast<uint64_t>(Value)
diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 5e26f97b0c86..944f4ffe598d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -15,7 +15,6 @@
def isEG : Predicate<
"Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
- "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && "
"!Subtarget->hasCaymanISA()"
>;
@@ -693,7 +692,7 @@ def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
// SHA-256 Patterns
-def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+defm : SHA256MaPattern <BFI_INT_eg, XOR_INT, R600_Reg64>;
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
diff --git a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 693869128081..3ef473b7fd96 100644
--- a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -135,7 +135,7 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
!con((ins VReg_64:$vaddr),
!if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
(ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, slc:$slc)),
+ (ins GLC:$glc, SLC:$slc)),
!if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
" $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let has_data = 0;
@@ -158,7 +158,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
!con((ins VReg_64:$vaddr, vdataClass:$vdata),
!if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
(ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, slc:$slc)),
+ (ins GLC:$glc, SLC:$slc)),
" $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
@@ -188,8 +188,8 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
opName,
(outs regClass:$vdst),
!if(EnableSaddr,
- (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
+ (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
" $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
@@ -204,8 +204,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)),
+ (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
+ (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
" "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
@@ -260,7 +260,7 @@ multiclass FLAT_Atomic_Pseudo<
RegisterClass data_rc = vdst_rc> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vaddr, $vdata$offset$slc">,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
@@ -268,7 +268,7 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
@@ -285,7 +285,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, off$offset$slc">,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
@@ -294,7 +294,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, off$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
@@ -304,7 +304,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
@@ -314,7 +314,7 @@ multiclass FLAT_Global_Atomic_Pseudo<
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
@@ -780,7 +780,7 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
@@ -824,7 +824,7 @@ def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index dd515b0bf2f1..f236f10ba75a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -16,6 +16,7 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -39,7 +40,7 @@ using namespace llvm;
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
CurrCycleInstr(nullptr),
MF(MF),
- ST(MF.getSubtarget<SISubtarget>()),
+ ST(MF.getSubtarget<GCNSubtarget>()),
TII(*ST.getInstrInfo()),
TRI(TII.getRegisterInfo()),
ClauseUses(TRI.getNumRegUnits()),
@@ -355,13 +356,13 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
int WaitStatesNeeded = 0;
WaitStatesNeeded = checkSoftClauseHazards(SMRD);
// This SMRD hazard only affects SI.
- if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+ if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
return WaitStatesNeeded;
// A read of an SGPR by SMRD instruction requires 4 wait states when the
@@ -398,7 +399,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 0;
int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
@@ -634,7 +635,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
}
int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return 0;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index f9a6e395a454..ca17e7cb6018 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -28,7 +28,7 @@ class MachineRegisterInfo;
class ScheduleDAG;
class SIInstrInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
// This variable stores the instruction that has been emitted this cycle. It
@@ -37,7 +37,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
MachineInstr *CurrCycleInstr;
std::list<MachineInstr*> EmittedInstrs;
const MachineFunction &MF;
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNILPSched.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
index ba8211b189cf..651091d44136 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -149,9 +149,9 @@ static int BUCompareLatency(const SUnit *left, const SUnit *right) {
int LDepth = left->getDepth();
int RDepth = right->getDepth();
if (LDepth != RDepth) {
- DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
- << ") depth " << LDepth << " vs SU (" << right->NodeNum
- << ") depth " << RDepth << "\n");
+ LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
return LDepth < RDepth ? 1 : -1;
}
if (left->Latency != right->Latency)
@@ -169,9 +169,9 @@ const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
if (!DisableSchedCriticalPath) {
int spread = (int)left->getDepth() - (int)right->getDepth();
if (std::abs(spread) > MaxReorderWindow) {
- DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
- << left->getDepth() << " != SU(" << right->NodeNum << "): "
- << right->getDepth() << "\n");
+ LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum
+ << "): " << right->getDepth() << "\n");
return left->getDepth() < right->getDepth() ? right : left;
}
}
@@ -324,19 +324,18 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
if (AvailQueue.empty())
break;
- DEBUG(
- dbgs() << "\n=== Picking candidate\n"
- "Ready queue:";
- for (auto &C : AvailQueue)
- dbgs() << ' ' << C.SU->NodeNum;
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "\n=== Picking candidate\n"
+ "Ready queue:";
+ for (auto &C
+ : AvailQueue) dbgs()
+ << ' ' << C.SU->NodeNum;
+ dbgs() << '\n';);
auto C = pickCandidate();
assert(C);
AvailQueue.remove(*C);
auto SU = C->SU;
- DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
advanceToCycle(SU->getHeight());
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index a0e4f7ff24cb..15366d66bd85 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUSubtarget.h"
#include "GCNRegPressure.h"
#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -19,6 +20,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -68,14 +70,14 @@ static void printRegion(raw_ostream &OS,
auto I = Begin;
MaxInstNum = std::max(MaxInstNum, 1u);
for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
- if (!I->isDebugValue() && LIS)
+ if (!I->isDebugInstr() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
if (I != End) {
OS << "\t...\n";
I = std::prev(End);
- if (!I->isDebugValue() && LIS)
+ if (!I->isDebugInstr() && LIS)
OS << LIS->getInstructionIndex(*I);
OS << '\t' << *I;
}
@@ -106,7 +108,7 @@ static void printLivenessInfo(raw_ostream &OS,
LLVM_DUMP_METHOD
void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
for (const auto R : Regions) {
OS << "Region to schedule ";
printRegion(OS, R->Begin, R->End, LIS, 1);
@@ -130,7 +132,7 @@ LLVM_DUMP_METHOD
void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,
const GCNRegPressure &After) const {
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
OS << "RP before: ";
Before.print(OS, &ST);
OS << "RP after: ";
@@ -199,8 +201,8 @@ public:
void schedule() {
assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
- DEBUG(dbgs() << "\nScheduling ";
- printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+ LLVM_DEBUG(dbgs() << "\nScheduling ";
+ printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
Sch.BaseClass::schedule();
// Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
@@ -310,14 +312,13 @@ void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden
void GCNIterativeScheduler::schedule() { // overriden
// do nothing
- DEBUG(
- printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
- if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
- dbgs() << "Max RP: ";
- Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
- }
- dbgs() << '\n';
- );
+ LLVM_DEBUG(printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+ if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
+ dbgs() << "Max RP: ";
+ Regions.back()->MaxPressure.print(
+ dbgs(), &MF.getSubtarget<GCNSubtarget>());
+ } dbgs()
+ << '\n';);
}
void GCNIterativeScheduler::finalizeSchedule() { // overriden
@@ -383,10 +384,10 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
if (MI != &*Top) {
BB->remove(MI);
BB->insert(Top, MI);
- if (!MI->isDebugValue())
+ if (!MI->isDebugInstr())
LIS->handleMove(*MI, true);
}
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
// Reset read - undef flags and update them later.
for (auto &Op : MI->operands())
if (Op.isReg() && Op.isDef())
@@ -417,7 +418,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
#ifndef NDEBUG
const auto RegionMaxRP = getRegionPressure(R);
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
#endif
assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
|| (dbgs() << "Max RP mismatch!!!\n"
@@ -432,8 +433,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
// Sort recorded regions by pressure - highest at the front
void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- std::sort(Regions.begin(), Regions.end(),
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ llvm::sort(Regions.begin(), Regions.end(),
[&ST, TargetOcc](const Region *R1, const Region *R2) {
return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
});
@@ -450,24 +451,24 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
// BestSchedules aren't deleted on fail.
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
- const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
- DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc
- << ", current = " << Occ << '\n');
+ LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
+ << ", current = " << Occ << '\n');
auto NewOcc = TargetOcc;
for (auto R : Regions) {
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
break;
- DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
- printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+ LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+ printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto MaxRP = getSchedulePressure(*R, MinSchedule);
- DEBUG(dbgs() << "Occupancy improvement attempt:\n";
- printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+ LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+ printSchedRP(dbgs(), R->MaxPressure, MaxRP));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
if (NewOcc <= Occ)
@@ -475,15 +476,21 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
setBestSchedule(*R, MinSchedule, MaxRP);
}
- DEBUG(dbgs() << "New occupancy = " << NewOcc
- << ", prev occupancy = " << Occ << '\n');
+ LLVM_DEBUG(dbgs() << "New occupancy = " << NewOcc
+ << ", prev occupancy = " << Occ << '\n');
+ if (NewOcc > Occ) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI->increaseOccupancy(MF, NewOcc);
+ }
+
return std::max(NewOcc, Occ);
}
void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ auto TgtOcc = MFI->getMinAllowedOccupancy();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
@@ -496,9 +503,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const int NumPasses = Occ < TgtOcc ? 2 : 1;
TgtOcc = std::min(Occ, TgtOcc);
- DEBUG(dbgs() << "Scheduling using default scheduler, "
- "target occupancy = " << TgtOcc << '\n');
+ LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = "
+ << TgtOcc << '\n');
GCNMaxOccupancySchedStrategy LStrgy(Context);
+ unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (int I = 0; I < NumPasses; ++I) {
// running first pass with TargetOccupancy = 0 mimics previous scheduling
@@ -509,30 +518,33 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
Ovr.schedule();
const auto RP = getRegionPressure(*R);
- DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+ LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
- DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
- DEBUG(dbgs() << ", scheduling minimal register\n");
+ LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
- DEBUG(dbgs() << ", restoring\n");
+ LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
}
}
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
}
}
+ MFI->limitOccupancy(FinalOccupancy);
}
///////////////////////////////////////////////////////////////////////////////
// Minimal Register Strategy
void GCNIterativeScheduler::scheduleMinReg(bool force) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const auto TgtOcc = MFI->getOccupancy();
sortRegionsByPressure(TgtOcc);
auto MaxPressure = Regions.front()->MaxPressure;
@@ -544,7 +556,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto RP = getSchedulePressure(*R, MinSchedule);
- DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+ LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
dbgs() << "\nWarning: Pressure becomes worse after minreg!";
printSchedRP(dbgs(), R->MaxPressure, RP);
});
@@ -553,7 +565,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
break;
scheduleRegion(*R, MinSchedule, RP);
- DEBUG(printSchedResult(dbgs(), R, RP));
+ LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
MaxPressure = RP;
}
@@ -564,9 +576,9 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
void GCNIterativeScheduler::scheduleILP(
bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<SISubtarget>();
- auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF),
- ST.getWavesPerEU(MF.getFunction()).second);
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ auto TgtOcc = MFI->getMinAllowedOccupancy();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
@@ -575,26 +587,30 @@ void GCNIterativeScheduler::scheduleILP(
Occ = tryMaximizeOccupancy(TgtOcc);
TgtOcc = std::min(Occ, TgtOcc);
- DEBUG(dbgs() << "Scheduling using default scheduler, "
- "target occupancy = " << TgtOcc << '\n');
+ LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = "
+ << TgtOcc << '\n');
+ unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (auto R : Regions) {
BuildDAG DAG(*R, *this);
const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
const auto RP = getSchedulePressure(*R, ILPSchedule);
- DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+ LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
- DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
- DEBUG(dbgs() << ", scheduling minimal register\n");
+ LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
- DEBUG(printSchedResult(dbgs(), R, RP));
+ LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
}
}
+ MFI->limitOccupancy(FinalOccupancy);
}
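
Several of the scheduler changes above drop hand-rolled getMaxWaves() math in favor of GCNRegPressure::getOccupancy() and then clamp against the occupancy recorded in SIMachineFunctionInfo. A standalone sketch of that bound; the two register-file limits below are placeholders, not real GFX numbers, which in the patch come from the subtarget:

#include <algorithm>
#include <cstdio>

static unsigned occupancyWithSGPRs(unsigned SGPRs) {
  return SGPRs <= 80 ? 10 : 8;                        // placeholder granularity
}
static unsigned occupancyWithVGPRs(unsigned VGPRs) {
  return std::max(1u, 256u / std::max(VGPRs, 24u));   // placeholder formula
}

int main() {
  unsigned SGPRs = 64, VGPRs = 48, FunctionOcc = 8;
  // Register-pressure occupancy is the minimum of the per-file limits...
  unsigned RegOcc = std::min(occupancyWithSGPRs(SGPRs),
                             occupancyWithVGPRs(VGPRs));
  // ...further clamped by the function-level limit, mirroring
  // WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST)).
  unsigned Waves = std::min(FunctionOcc, RegOcc);
  std::printf("occupancy = %u waves\n", Waves);
  return 0;
}
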
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 9904b5f0f4ba..192d534bb9cf 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -142,35 +142,38 @@ GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
unsigned Num = RQ.size();
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num
+ << '\n');
Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
- << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+ << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
int Res = getNotReadySuccessors(SU);
- DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
- << Res << " successors, metric = " << -Res << '\n');
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
+ << Res << " successors, metric = " << -Res << '\n');
return -Res;
});
if (Num == 1) break;
- DEBUG(dbgs() << "\nSelecting most producing candidate among "
- << Num << '\n');
+ LLVM_DEBUG(dbgs() << "\nSelecting most producing candidate among " << Num
+ << '\n');
Num = findMax(Num, [=](const Candidate &C) {
auto SU = C.SU;
auto Res = getReadySuccessors(SU);
- DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
- << Res << " successors, metric = " << Res << '\n');
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " << Res
+ << " successors, metric = " << Res << '\n');
return Res;
});
if (Num == 1) break;
Num = Num ? Num : RQ.size();
- DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
- << Num << '\n');
+ LLVM_DEBUG(
+ dbgs()
+ << "\nCan't find best candidate, selecting in program order among "
+ << Num << '\n');
Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
assert(Num == 1);
} while (false);
@@ -202,17 +205,17 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
Worklist.push_back(P.getSUnit());
}
}
- DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
- << ")'s non-ready successors of " << Priority
- << " priority in ready queue: ");
+ LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
+ << ")'s non-ready successors of " << Priority
+ << " priority in ready queue: ");
const auto SetEnd = Set.end();
for (auto &C : RQ) {
if (Set.find(C.SU) != SetEnd) {
C.Priority = Priority;
- DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+ LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
@@ -243,19 +246,19 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
releaseSuccessors(&DAG.EntrySU, StepNo);
while (!RQ.empty()) {
- DEBUG(
- dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
- "Ready queue:";
- for (auto &C : RQ)
- dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "\n=== Picking candidate, Step = " << StepNo
+ << "\n"
+ "Ready queue:";
+ for (auto &C
+ : RQ) dbgs()
+ << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+ dbgs() << '\n';);
auto C = pickCandidate();
assert(C);
RQ.remove(*C);
auto SU = C->SU;
- DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
releaseSuccessors(SU, StepNo);
Schedule.push_back(SU);
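
GCNMinRegScheduler::pickCandidate() above narrows the ready queue one metric at a time (priority, then fewest successors left non-ready, then most successors made ready) and falls back to program order when no single best candidate emerges. A simplified, self-contained sketch of that narrowing loop; the Cand fields and metric values are stand-ins, not the scheduler's real data:

#include <cstdio>
#include <vector>

struct Cand { unsigned Node; int Priority; int Produces; };

// Keep only the elements maximizing Metric among the first Num; return how many remain.
template <typename F>
static size_t keepMax(std::vector<Cand> &RQ, size_t Num, F Metric) {
  auto Best = Metric(RQ[0]);
  size_t Kept = 1;
  for (size_t I = 1; I < Num; ++I) {
    auto V = Metric(RQ[I]);
    if (V > Best) { Best = V; RQ[0] = RQ[I]; Kept = 1; }
    else if (V == Best) RQ[Kept++] = RQ[I];
  }
  return Kept;
}

int main() {
  std::vector<Cand> RQ = {{3, 1, 2}, {7, 2, 1}, {5, 2, 4}};
  size_t Num = RQ.size();
  Num = keepMax(RQ, Num, [](const Cand &C) { return C.Priority; });
  if (Num > 1)
    Num = keepMax(RQ, Num, [](const Cand &C) { return C.Produces; });
  if (Num > 1) // final tie-break: earliest in program order
    Num = keepMax(RQ, Num, [](const Cand &C) { return -(int)C.Node; });
  std::printf("picked SU(%u)\n", RQ[0].Node);
  return 0;
}
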
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNProcessors.td b/contrib/llvm/lib/Target/AMDGPU/GCNProcessors.td
index b2a3f652abd8..d76acfa24f90 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -93,14 +93,6 @@ def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
// GCN GFX8 (Volcanic Islands (VI)).
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
-def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_0]
->;
-
def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
[FeatureISAVersion8_0_1]
>;
@@ -113,6 +105,10 @@ def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
[FeatureISAVersion8_0_2]
>;
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
[FeatureISAVersion8_0_2]
>;
@@ -152,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
[FeatureISAVersion9_0_2]
>;
+
+def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_4]
+>;
+
+def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_6]
+>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 992bb7cceb6f..3d8cacc4f02c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -131,7 +132,7 @@ void GCNRegPressure::inc(unsigned Reg,
}
}
-bool GCNRegPressure::less(const SISubtarget &ST,
+bool GCNRegPressure::less(const GCNSubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
@@ -177,7 +178,7 @@ bool GCNRegPressure::less(const SISubtarget &ST,
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
-void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
OS << "VGPRs: " << getVGPRNum();
if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
OS << ", SGPRs: " << getSGPRNum();
@@ -283,24 +284,33 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-void GCNUpwardRPTracker::reset(const MachineInstr &MI,
- const LiveRegSet *LiveRegsCopy) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
+void GCNRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy,
+ bool After) {
+ const MachineFunction &MF = *MI.getMF();
+ MRI = &MF.getRegInfo();
if (LiveRegsCopy) {
if (&LiveRegs != LiveRegsCopy)
LiveRegs = *LiveRegsCopy;
} else {
- LiveRegs = getLiveRegsAfter(MI, LIS);
+ LiveRegs = After ? getLiveRegsAfter(MI, LIS)
+ : getLiveRegsBefore(MI, LIS);
}
+
MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
+void GCNUpwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ GCNRPTracker::reset(MI, LiveRegsCopy, true);
+}
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
LastTrackedMI = &MI;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return;
auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);
@@ -348,13 +358,7 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
if (NextMI == MBBEnd)
return false;
- if (LiveRegsCopy) {
- if (&LiveRegs != LiveRegsCopy)
- LiveRegs = *LiveRegsCopy;
- } else {
- LiveRegs = getLiveRegsBefore(*NextMI, LIS);
- }
- MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+ GCNRPTracker::reset(*NextMI, LiveRegsCopy, false);
return true;
}
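
The tracker change above hoists the duplicated seeding logic into GCNRPTracker::reset() behind an After flag, so the upward tracker seeds liveness after the instruction and the downward tracker before it. A condensed sketch of that factoring using stand-in types instead of the real LiveIntervals machinery:

#include <set>
#include <string>

// Stand-ins for the real live-register set and instruction types.
using LiveRegSet = std::set<std::string>;
struct MachineInstr { LiveRegSet LiveBefore, LiveAfter; };

class RPTrackerBase {
protected:
  LiveRegSet LiveRegs;
  // Shared seeding: copy an explicit set if given, otherwise derive liveness
  // either after or before the instruction.
  void reset(const MachineInstr &MI, const LiveRegSet *Copy, bool After) {
    LiveRegs = Copy ? *Copy : (After ? MI.LiveAfter : MI.LiveBefore);
  }
};

class UpwardTracker : public RPTrackerBase {
public:
  void reset(const MachineInstr &MI, const LiveRegSet *Copy = nullptr) {
    RPTrackerBase::reset(MI, Copy, /*After=*/true);
  }
};

class DownwardTracker : public RPTrackerBase {
public:
  void reset(const MachineInstr &MI, const LiveRegSet *Copy = nullptr) {
    RPTrackerBase::reset(MI, Copy, /*After=*/false);
  }
};
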
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index e418aa0fe911..357d3b7b2334 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -49,7 +49,7 @@ struct GCNRegPressure {
unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
- unsigned getOccupancy(const SISubtarget &ST) const {
+ unsigned getOccupancy(const GCNSubtarget &ST) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
ST.getOccupancyWithNumVGPRs(getVGPRNum()));
}
@@ -59,11 +59,11 @@ struct GCNRegPressure {
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);
- bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+ bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
return getOccupancy(ST) > O.getOccupancy(ST);
}
- bool less(const SISubtarget &ST, const GCNRegPressure& O,
+ bool less(const GCNSubtarget &ST, const GCNRegPressure& O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
bool operator==(const GCNRegPressure &O) const {
@@ -74,7 +74,7 @@ struct GCNRegPressure {
return !(*this == O);
}
- void print(raw_ostream &OS, const SISubtarget *ST = nullptr) const;
+ void print(raw_ostream &OS, const GCNSubtarget *ST = nullptr) const;
void dump() const { print(dbgs()); }
private:
@@ -106,6 +106,9 @@ protected:
GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
+ bool After);
+
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d414b899050a..f09b7f6cff22 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -28,18 +28,6 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
-static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
- const MachineFunction &MF) {
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
- ST.getOccupancyWithNumVGPRs(VGPRs));
- return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
- MF.getFunction()));
-}
-
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -47,7 +35,7 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
MF = &DAG->MF;
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
@@ -81,7 +69,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
Cand.AtTop = AtTop;
// getDownwardPressure() and getUpwardPressure() make temporary changes to
- // the the tracker, so we need to pass those function a non-const copy.
+ // the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
std::vector<unsigned> Pressure;
@@ -200,34 +188,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
// See if BotCand is still valid (because we previously scheduled from Top).
- DEBUG(dbgs() << "Picking from Bot:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(BotCand));
+ LLVM_DEBUG(traceCandidate(BotCand));
}
// Check if the top Q has a better candidate.
- DEBUG(dbgs() << "Picking from Top:\n");
+ LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
- DEBUG(traceCandidate(TopCand));
+ LLVM_DEBUG(traceCandidate(TopCand));
}
// Pick best from BotCand and TopCand.
- DEBUG(
- dbgs() << "Top Cand: ";
- traceCandidate(TopCand);
- dbgs() << "Bot Cand: ";
- traceCandidate(BotCand);
- );
+ LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
+ dbgs() << "Bot Cand: "; traceCandidate(BotCand););
SchedCandidate Cand;
if (TopCand.Reason == BotCand.Reason) {
Cand = BotCand;
@@ -256,10 +240,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
}
}
}
- DEBUG(
- dbgs() << "Picking: ";
- traceCandidate(Cand);
- );
+ LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
return Cand.SU;
@@ -305,20 +286,20 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
if (SU->isBottomReady())
Bot.removeReady(SU);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
return SU;
}
GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S) :
ScheduleDAGMILive(C, std::move(S)),
- ST(MF.getSubtarget<SISubtarget>()),
+ ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
- StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
- MF.getFunction())),
+ StartingOccupancy(MFI.getOccupancy()),
MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
- DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
+ LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
@@ -338,12 +319,12 @@ void GCNScheduleDAGMILive::schedule() {
if (LIS) {
PressureBefore = Pressure[RegionIdx];
- DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
- GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
- dbgs() << "Region live-in pressure: ";
- llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
- dbgs() << "Region register pressure: ";
- PressureBefore.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: ";
+ PressureBefore.print(dbgs()));
}
ScheduleDAGMILive::schedule();
@@ -356,45 +337,54 @@ void GCNScheduleDAGMILive::schedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
auto PressureAfter = getRealRegPressure();
- DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
+ PressureAfter.print(dbgs()));
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
Pressure[RegionIdx] = PressureAfter;
- DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+ LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
- unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(),
- PressureAfter.getVGPRNum(), MF);
- unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(),
- PressureBefore.getVGPRNum(), MF);
- DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
- ", after " << WavesAfter << ".\n");
+ unsigned Occ = MFI.getOccupancy();
+ unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST));
+ unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST));
+ LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
+ << ", after " << WavesAfter << ".\n");
// We could not keep current target occupancy because of the just scheduled
// region. Record new occupancy for next scheduling cycle.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+ // Allow memory bound functions to drop to 4 waves if not limited by an
+ // attribute.
+ if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
+ WavesAfter >= MFI.getMinAllowedOccupancy()) {
+ LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+ << MFI.getMinAllowedOccupancy() << " waves\n");
+ NewOccupancy = WavesAfter;
+ }
if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;
- DEBUG(dbgs() << "Occupancy lowered for the function to "
- << MinOccupancy << ".\n");
+ MFI.limitOccupancy(MinOccupancy);
+ LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
+ << MinOccupancy << ".\n");
}
- if (WavesAfter >= WavesBefore) {
+ if (WavesAfter >= MinOccupancy) {
Pressure[RegionIdx] = PressureAfter;
return;
}
- DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
continue;
if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
BB->insert(RegionEnd, MI);
- if (!MI->isDebugValue())
+ if (!MI->isDebugInstr())
LIS->handleMove(*MI, true);
}
// Reset read-undef flags and update them later.
@@ -403,7 +393,7 @@ void GCNScheduleDAGMILive::schedule() {
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
@@ -415,7 +405,7 @@ void GCNScheduleDAGMILive::schedule() {
}
RegionEnd = MI->getIterator();
++RegionEnd;
- DEBUG(dbgs() << "Scheduling " << *MI);
+ LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
@@ -490,7 +480,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
void GCNScheduleDAGMILive::finalizeSchedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+ LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
@@ -509,9 +499,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
if (!LIS || StartingOccupancy <= MinOccupancy)
break;
- DEBUG(dbgs()
- << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
S.setTargetOccupancy(MinOccupancy);
}
@@ -537,12 +528,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
continue;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
- << MBB->getName() << "\n From: " << *begin() << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
+ << MBB->getName() << "\n From: " << *begin()
+ << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
schedule();
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 060d2ca72d93..3ac6af89cb9b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -21,7 +21,7 @@ namespace llvm {
class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
/// This is a minimal scheduler strategy. The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
@@ -62,9 +62,9 @@ public:
class GCNScheduleDAGMILive : public ScheduleDAGMILive {
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
- const SIMachineFunctionInfo &MFI;
+ SIMachineFunctionInfo &MFI;
// Occupancy target at the beginning of function scheduling cycle.
unsigned StartingOccupancy;
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index bf57f88bef91..db908368a179 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -217,6 +217,11 @@ void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "lwe");
}
+void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "d16");
+}
+
void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -267,6 +272,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::FLAT_SCR:
O << "flat_scratch";
return;
+ case AMDGPU::XNACK_MASK:
+ O << "xnack_mask";
+ return;
case AMDGPU::VCC_LO:
O << "vcc_lo";
return;
@@ -297,6 +305,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::FLAT_SCR_HI:
O << "flat_scratch_hi";
return;
+ case AMDGPU::XNACK_MASK_LO:
+ O << "xnack_mask_lo";
+ return;
+ case AMDGPU::XNACK_MASK_HI:
+ O << "xnack_mask_hi";
+ return;
case AMDGPU::FP_REG:
case AMDGPU::SP_REG:
case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
@@ -371,6 +385,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI))
+ O << " ";
+ else
+ O << "_e32 ";
+
+ printOperand(MI, OpNo, STI, O);
+}
+
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -486,11 +510,6 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
- static_cast<R600InstPrinter*>(this)->printOperand(MI, OpNo, O);
- return;
- }
-
if (OpNo >= MI->getNumOperands()) {
O << "/*Missing OP" << OpNo << "*/";
return;
@@ -612,40 +631,45 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ using namespace AMDGPU::DPP;
+
unsigned Imm = MI->getOperand(OpNo).getImm();
- if (Imm <= 0x0ff) {
+ if (Imm <= DppCtrl::QUAD_PERM_LAST) {
O << " quad_perm:[";
O << formatDec(Imm & 0x3) << ',';
O << formatDec((Imm & 0xc) >> 2) << ',';
O << formatDec((Imm & 0x30) >> 4) << ',';
O << formatDec((Imm & 0xc0) >> 6) << ']';
- } else if ((Imm >= 0x101) && (Imm <= 0x10f)) {
+ } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHL_LAST)) {
O << " row_shl:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if ((Imm >= 0x111) && (Imm <= 0x11f)) {
+ } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHR_LAST)) {
O << " row_shr:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if ((Imm >= 0x121) && (Imm <= 0x12f)) {
+ } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
+ (Imm <= DppCtrl::ROW_ROR_LAST)) {
O << " row_ror:";
printU4ImmDecOperand(MI, OpNo, O);
- } else if (Imm == 0x130) {
+ } else if (Imm == DppCtrl::WAVE_SHL1) {
O << " wave_shl:1";
- } else if (Imm == 0x134) {
+ } else if (Imm == DppCtrl::WAVE_ROL1) {
O << " wave_rol:1";
- } else if (Imm == 0x138) {
+ } else if (Imm == DppCtrl::WAVE_SHR1) {
O << " wave_shr:1";
- } else if (Imm == 0x13c) {
+ } else if (Imm == DppCtrl::WAVE_ROR1) {
O << " wave_ror:1";
- } else if (Imm == 0x140) {
+ } else if (Imm == DppCtrl::ROW_MIRROR) {
O << " row_mirror";
- } else if (Imm == 0x141) {
+ } else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
O << " row_half_mirror";
- } else if (Imm == 0x142) {
+ } else if (Imm == DppCtrl::BCAST15) {
O << " row_bcast:15";
- } else if (Imm == 0x143) {
+ } else if (Imm == DppCtrl::BCAST31) {
O << " row_bcast:31";
} else {
- llvm_unreachable("Invalid dpp_ctrl value");
+ O << " /* Invalid dpp_ctrl value */";
}
}
@@ -936,11 +960,6 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) {
- static_cast<R600InstPrinter*>(this)->printMemOperand(MI, OpNo, O);
- return;
- }
-
printOperand(MI, OpNo, STI, O);
O << ", ";
printOperand(MI, OpNo + 1, STI, O);
@@ -966,16 +985,6 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
O << Asm;
}
-void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printAbs(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printClamp(MI, OpNo, O);
-}
-
void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1002,70 +1011,6 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
O << " div:2";
}
-void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printLiteral(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printLast(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printNeg(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printOMOD(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printRel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printUpdateExecMask(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printUpdatePred(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printWrite(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printBankSwizzle(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printRSel(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printCT(MI, OpNo, O);
-}
-
-void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- static_cast<R600InstPrinter*>(this)->printKCache(MI, OpNo, O);
-}
-
void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1254,7 +1199,10 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
O << "hwreg(";
- if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) {
+ unsigned Last = ID_SYMBOLIC_LAST_;
+ if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI))
+ Last = ID_SYMBOLIC_FIRST_GFX9_;
+ if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) {
O << IdSymbolic[Id];
} else {
O << Id;
@@ -1267,6 +1215,13 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
#include "AMDGPUGenAsmWriter.inc"
+void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ O.flush();
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+}
+
void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
@@ -1385,7 +1340,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
switch (Op.getReg()) {
// This is the default predicate state, so we don't need to print it.
- case AMDGPU::PRED_SEL_OFF:
+ case R600::PRED_SEL_OFF:
break;
default:
@@ -1461,3 +1416,5 @@ void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
O << " (MASKED)";
}
}
+
+#include "R600GenAsmWriter.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index d97f04689e18..11a496a38b2c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -84,6 +84,8 @@ private:
raw_ostream &O);
void printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printD16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printExpCompr(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
@@ -96,6 +98,8 @@ private:
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
@@ -214,13 +218,16 @@ protected:
raw_ostream &O);
};
-// FIXME: R600 specific parts of AMDGPUInstrPrinter should be moved here, and
-// MCTargetDesc should be using R600InstPrinter for the R600 target.
-class R600InstPrinter : public AMDGPUInstPrinter {
+class R600InstPrinter : public MCInstPrinter {
public:
R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : AMDGPUInstPrinter(MAI, MII, MRI) {}
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index d700acc34bc9..abc88c02adca 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -26,14 +26,14 @@ namespace {
class AMDGPUAsmBackend : public MCAsmBackend {
public:
- AMDGPUAsmBackend(const Target &T)
- : MCAsmBackend() {}
+ AMDGPUAsmBackend(const Target &T) : MCAsmBackend(support::little) {}
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -43,10 +43,13 @@ public:
MCInst &Res) const override {
llvm_unreachable("Not implemented");
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
unsigned getMinimumNopSize() const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
};
@@ -103,7 +106,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
@@ -140,11 +144,11 @@ unsigned AMDGPUAsmBackend::getMinimumNopSize() const {
return 4;
}
-bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AMDGPUAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
- OW->WriteZeros(Count % 4);
+ OS.write_zeros(Count % 4);
// We are properly aligned, so write NOPs as requested.
Count /= 4;
@@ -154,7 +158,7 @@ bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
const uint32_t Encoded_S_NOP_0 = 0xbf800000;
for (uint64_t I = 0; I != Count; ++I)
- OW->write32(Encoded_S_NOP_0);
+ support::endian::write<uint32_t>(OS, Encoded_S_NOP_0, Endian);
return true;
}
@@ -189,9 +193,9 @@ public:
}
}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, OS);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend);
}
};
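
With MCAsmBackend now writing through a raw_ostream, writeNopData() above zero-fills any unaligned residue and then streams whole little-endian s_nop words. A standalone sketch of that padding logic, using std::ostream in place of llvm::raw_ostream so it runs on its own:

#include <cstdint>
#include <iostream>
#include <ostream>
#include <sstream>

static bool writeNopData(std::ostream &OS, uint64_t Count) {
  const uint32_t EncodedSNop0 = 0xbf800000; // s_nop 0
  // Zero-fill any residue that is not a whole 4-byte instruction.
  for (uint64_t I = 0; I != Count % 4; ++I)
    OS.put('\0');
  // Emit little-endian s_nop words for the aligned remainder.
  for (uint64_t I = 0; I != Count / 4; ++I)
    for (int B = 0; B != 4; ++B)
      OS.put(char((EncodedSNop0 >> (8 * B)) & 0xff));
  return true;
}

int main() {
  std::ostringstream Buf;
  writeNopData(Buf, 10); // 2 zero bytes + 2 s_nop instructions
  std::cout << Buf.str().size() << " bytes of padding\n";
  return 0;
}
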
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index e443b0729606..07bef9103c0d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -66,6 +66,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_REL32_LO;
case MCSymbolRefExpr::VK_AMDGPU_REL32_HI:
return ELF::R_AMDGPU_REL32_HI;
+ case MCSymbolRefExpr::VK_AMDGPU_REL64:
+ return ELF::R_AMDGPU_REL64;
}
switch (Fixup.getKind()) {
@@ -82,11 +84,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("unhandled relocation type");
}
-std::unique_ptr<MCObjectWriter>
+std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend,
- raw_pwrite_stream &OS) {
- auto MOTW = llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
- HasRelocationAddend);
- return createELFObjectWriter(std::move(MOTW), OS, true);
+ bool HasRelocationAddend) {
+ return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
+ HasRelocationAddend);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 1497edc7a054..c627a08e7463 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -12,37 +12,28 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
-AMDGPUELFStreamer::AMDGPUELFStreamer(const Triple &T, MCContext &Context,
- std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
- std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) {
- unsigned Arch = ELF::EF_AMDGPU_ARCH_NONE;
- switch (T.getArch()) {
- case Triple::r600:
- Arch = ELF::EF_AMDGPU_ARCH_R600;
- break;
- case Triple::amdgcn:
- Arch = ELF::EF_AMDGPU_ARCH_GCN;
- break;
- default:
- break;
- }
+namespace {
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+ AMDGPUELFStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter)) {}
+};
- MCAssembler &MCA = getAssembler();
- unsigned EFlags = MCA.getELFHeaderEFlags();
- EFlags &= ~ELF::EF_AMDGPU_ARCH;
- EFlags |= Arch;
- MCA.setELFHeaderEFlags(EFlags);
}
MCELFStreamer *llvm::createAMDGPUELFStreamer(
const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- return new AMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ return new AMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter));
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 0cc0a4c5cd5d..41e9063a759e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -23,16 +23,9 @@ class MCCodeEmitter;
class MCContext;
class MCSubtargetInfo;
-class AMDGPUELFStreamer : public MCELFStreamer {
-public:
- AMDGPUELFStreamer(const Triple &T, MCContext &Context,
- std::unique_ptr<MCAsmBackend> MAB, raw_pwrite_stream &OS,
- std::unique_ptr<MCCodeEmitter> Emitter);
-};
-
MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
} // namespace llvm.
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 521b3b39bba2..cae7a7a6c7e7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for R600 and SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 1b062064ace1..dcc10a032afe 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for R600 and SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 2b321c04fb30..c579c7d60e16 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This file provides AMDGPU specific target descriptions.
+/// This file provides AMDGPU specific target descriptions.
//
//===----------------------------------------------------------------------===//
@@ -22,6 +22,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -37,9 +38,17 @@ using namespace llvm;
#define GET_SUBTARGETINFO_MC_DESC
#include "AMDGPUGenSubtargetInfo.inc"
+#define NoSchedModel NoSchedModelR600
+#define GET_SUBTARGETINFO_MC_DESC
+#include "R600GenSubtargetInfo.inc"
+#undef NoSchedModelR600
+
#define GET_REGINFO_MC_DESC
#include "AMDGPUGenRegisterInfo.inc"
+#define GET_REGINFO_MC_DESC
+#include "R600GenRegisterInfo.inc"
+
static MCInstrInfo *createAMDGPUMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitAMDGPUMCInstrInfo(X);
@@ -48,12 +57,17 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {
static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
- InitAMDGPUMCRegisterInfo(X, 0);
+ if (TT.getArch() == Triple::r600)
+ InitR600MCRegisterInfo(X, 0);
+ else
+ InitAMDGPUMCRegisterInfo(X, 0);
return X;
}
static MCSubtargetInfo *
createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ if (TT.getArch() == Triple::r600)
+ return createR600MCSubtargetInfoImpl(TT, CPU, FS);
return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}
@@ -62,8 +76,10 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- return T.getArch() == Triple::r600 ? new R600InstPrinter(MAI, MII, MRI) :
- new AMDGPUInstPrinter(MAI, MII, MRI);
+ if (T.getArch() == Triple::r600)
+ return new R600InstPrinter(MAI, MII, MRI);
+ else
+ return new AMDGPUInstPrinter(MAI, MII, MRI);
}
static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
@@ -76,23 +92,25 @@ static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
MCStreamer &S,
const MCSubtargetInfo &STI) {
- return new AMDGPUTargetELFStreamer(S);
+ return new AMDGPUTargetELFStreamer(S, STI);
}
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createAMDGPUELFStreamer(T, Context, std::move(MAB), OS,
+ return createAMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter), RelaxAll);
}
extern "C" void LLVMInitializeAMDGPUTargetMC() {
+
+ TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo);
for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
- TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
@@ -103,6 +121,8 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
// R600 specific registration
TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(),
createR600MCCodeEmitter);
+ TargetRegistry::RegisterObjectTargetStreamer(
+ getTheAMDGPUTarget(), createAMDGPUObjectTargetStreamer);
// GCN specific registration
TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(),
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 1173dfd437ca..f3628d96d6e9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Provides AMDGPU specific target descriptions.
+/// Provides AMDGPU specific target descriptions.
//
//===----------------------------------------------------------------------===//
//
@@ -25,7 +25,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -40,6 +40,7 @@ Target &getTheGCNTarget();
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
+MCInstrInfo *createR600MCInstrInfo();
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -50,15 +51,19 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
+std::unique_ptr<MCObjectTargetWriter>
createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend, raw_pwrite_stream &OS);
+ bool HasRelocationAddend);
} // End llvm namespace
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
#undef GET_REGINFO_ENUM
+#define GET_REGINFO_ENUM
+#include "R600GenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
@@ -67,9 +72,20 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#undef GET_INSTRINFO_OPERAND_ENUM
#undef GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
+#define GET_INSTRINFO_SCHED_ENUM
+#include "R600GenInstrInfo.inc"
+#undef GET_INSTRINFO_SCHED_ENUM
+#undef GET_INSTRINFO_OPERAND_ENUM
+#undef GET_INSTRINFO_ENUM
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
#undef GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_ENUM
+#include "R600GenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index d897956daccf..6a41e3f650bc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -39,6 +39,84 @@ using namespace llvm::AMDGPU;
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
+static const struct {
+ const char *Name;
+ unsigned Mach;
+} MachTable[] = {
+ // Radeon HD 2000/3000 Series (R600).
+ { "r600", ELF::EF_AMDGPU_MACH_R600_R600 },
+ { "r630", ELF::EF_AMDGPU_MACH_R600_R630 },
+ { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 },
+ { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 },
+ // Radeon HD 4000 Series (R700).
+ { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 },
+ { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 },
+ { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 },
+ // Radeon HD 5000 Series (Evergreen).
+ { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR },
+ { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS },
+ { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER },
+ { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD },
+ { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO },
+ // Radeon HD 6000 Series (Northern Islands).
+ { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS },
+ { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS },
+ { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN },
+ { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS },
+ // AMDGCN GFX6.
+ { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
+ { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
+ { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
+ // AMDGCN GFX7.
+ { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
+ { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
+ { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
+ { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
+ { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 },
+ { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
+ { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
+ { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
+ // AMDGCN GFX8.
+ { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
+ { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
+ { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
+ { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
+ { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
+ { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
+ // AMDGCN GFX9.
+ { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 },
+ { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 },
+ { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 },
+ { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 },
+ // Not specified processor.
+ { nullptr, ELF::EF_AMDGPU_MACH_NONE }
+};
+
+unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const {
+ auto Entry = MachTable;
+ for (; Entry->Name && GPU != Entry->Name; ++Entry)
+ ;
+ return Entry->Mach;
+}
+
+const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) {
+ auto Entry = MachTable;
+ for (; Entry->Name && Mach != Entry->Mach; ++Entry)
+ ;
+ return Entry->Name;
+}
+
bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
@@ -55,9 +133,12 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: AMDGPUTargetStreamer(S), OS(OS) { }
-void
-AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) {
+void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
+ OS << "\t.amdgcn_target \"" << Target << "\"\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
+ uint32_t Major, uint32_t Minor) {
OS << "\t.hsa_code_object_version " <<
Twine(Major) << "," << Twine(Minor) << '\n';
}
@@ -118,12 +199,157 @@ bool AMDGPUTargetAsmStreamer::EmitPALMetadata(
return true;
}
+void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
+ bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
+ amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor();
+
+ IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits());
+
+ OS << "\t.amdhsa_kernel " << KernelName << '\n';
+
+#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC, \
+ DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \
+ if (AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) != \
+ AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME)) \
+ STREAM << "\t\t" << DIRECTIVE << " " \
+ << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
+
+ if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size)
+ OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
+ << '\n';
+ if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size)
+ OS << "\t\t.amdhsa_private_segment_fixed_size "
+ << KD.private_segment_fixed_size << '\n';
+
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
+
+ // These directives are required.
+ OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
+ OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
+
+ if (!ReserveVCC)
+ OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
+ if (IVersion.Major >= 7 && !ReserveFlatScr)
+ OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
+ if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
+ OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';
+
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+ if (IVersion.Major >= 9)
+ PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_overflow", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
+ PRINT_IF_NOT_DEFAULT(
+ OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
+#undef PRINT_IF_NOT_DEFAULT
+
+ OS << "\t.end_amdhsa_kernel\n";
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
-AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S)
- : AMDGPUTargetStreamer(S), Streamer(S) {}
+AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
+ MCStreamer &S, const MCSubtargetInfo &STI)
+ : AMDGPUTargetStreamer(S), Streamer(S) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+
+ EFlags &= ~ELF::EF_AMDGPU_MACH;
+ EFlags |= getMACH(STI.getCPU());
+
+ EFlags &= ~ELF::EF_AMDGPU_XNACK;
+ if (AMDGPU::hasXNACK(STI))
+ EFlags |= ELF::EF_AMDGPU_XNACK;
+
+ MCA.setELFHeaderEFlags(EFlags);
+}
MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
@@ -150,9 +376,10 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
S.PopSection();
}
-void
-AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) {
+void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
+
+void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
+ uint32_t Major, uint32_t Minor) {
EmitAMDGPUNote(
MCConstantExpr::create(8, getContext()),
@@ -207,7 +434,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
unsigned Type) {
MCSymbolELF *Symbol = cast<MCSymbolELF>(
getStreamer().getContext().getOrCreateSymbol(SymbolName));
- Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
+ Symbol->setType(Type);
}
bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
@@ -271,3 +498,46 @@ bool AMDGPUTargetELFStreamer::EmitPALMetadata(
);
return true;
}
+
+void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) {
+ auto &Streamer = getStreamer();
+ auto &Context = Streamer.getContext();
+
+ MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd")));
+ KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL);
+ KernelDescriptorSymbol->setType(ELF::STT_OBJECT);
+ KernelDescriptorSymbol->setSize(
+ MCConstantExpr::create(sizeof(KernelDescriptor), Context));
+
+ MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName)));
+ KernelCodeSymbol->setBinding(ELF::STB_LOCAL);
+
+ Streamer.EmitLabel(KernelDescriptorSymbol);
+ Streamer.EmitBytes(StringRef(
+ (const char*)&(KernelDescriptor),
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset)));
+ // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
+ // expression being created is:
+ // (start of kernel code) - (start of kernel descriptor)
+ // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64.
+ Streamer.EmitValue(MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(
+ KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
+ MCSymbolRefExpr::create(
+ KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
+ Context),
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
+ Streamer.EmitBytes(StringRef(
+ (const char*)&(KernelDescriptor) +
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) +
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset),
+ sizeof(KernelDescriptor) -
+ offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) -
+ sizeof(KernelDescriptor.kernel_code_entry_byte_offset)));
+}
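
For reference, the PRINT_IF_NOT_DEFAULT macro used above expands, for a single directive such as .amdhsa_dx10_clamp, to roughly the following (a sketch obtained by substituting the macro arguments shown in the hunk), so the textual output only lists fields that differ from getDefaultAmdhsaKernelDescriptor():

if (AMDHSA_BITS_GET(KD.compute_pgm_rsrc1,
                    amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP) !=
    AMDHSA_BITS_GET(DefaultKD.compute_pgm_rsrc1,
                    amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP))
  OS << "\t\t" << ".amdhsa_dx10_clamp" << " "
     << AMDHSA_BITS_GET(KD.compute_pgm_rsrc1,
                        amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP) << '\n';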
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 0919b754480d..472da1b73593 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -14,6 +14,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
namespace llvm {
#include "AMDGPUPTNote.h"
@@ -30,9 +31,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
MCContext &getContext() const { return Streamer.getContext(); }
+ /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name.
+ unsigned getMACH(StringRef GPU) const;
+
public:
+ /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value.
+ static const char *getMachName(unsigned Mach);
+
AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+ virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
+
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) = 0;
@@ -56,12 +65,21 @@ public:
/// \returns True on success, false on failure.
virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0;
+
+ virtual void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) = 0;
};
class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
formatted_raw_ostream &OS;
public:
AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+ void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
@@ -81,6 +99,12 @@ public:
/// \returns True on success, false on failure.
bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+ void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -90,10 +114,12 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
function_ref<void(MCELFStreamer &)> EmitDesc);
public:
- AMDGPUTargetELFStreamer(MCStreamer &S);
+ AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
MCELFStreamer &getStreamer();
+ void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
@@ -113,6 +139,12 @@ public:
/// \returns True on success, false on failure.
bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+ void EmitAmdhsaKernelDescriptor(
+ const MCSubtargetInfo &STI, StringRef KernelName,
+ const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ bool ReserveXNACK) override;
};
}
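
Both streamer subclasses implement the same hooks, so a front end only talks to the abstract interface. A hedged sketch of a call site; the kernel name, target string, and reserve flags are made-up example values, not taken from this change:

#include <cstdint>
#include "AMDGPUTargetStreamer.h"                 // interface shown above
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"

// Illustrative only: emit the target directive and one kernel descriptor
// through whichever concrete streamer (asm or ELF) is installed.
void emitKernelSketch(llvm::AMDGPUTargetStreamer &TS,
                      const llvm::MCSubtargetInfo &STI,
                      const llvm::amdhsa::kernel_descriptor_t &KD,
                      uint64_t NextVGPR, uint64_t NextSGPR) {
  TS.EmitDirectiveAMDGCNTarget("amdgcn-amd-amdhsa--gfx900"); // example string
  TS.EmitAmdhsaKernelDescriptor(STI, "example_kernel", KD, NextVGPR, NextSGPR,
                                /*ReserveVCC=*/true, /*ReserveFlatScr=*/true,
                                /*ReserveXNACK=*/false);
}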
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index eab90e1d344c..28d4bc1829e2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -9,13 +9,12 @@
//
/// \file
///
-/// \brief The R600 code emitter produces machine code that can be executed
+/// The R600 code emitter produces machine code that can be executed
/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/AMDGPUFixupKinds.h"
-#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Defines.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -36,30 +35,40 @@ using namespace llvm;
namespace {
-class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
+class R600MCCodeEmitter : public MCCodeEmitter {
const MCRegisterInfo &MRI;
+ const MCInstrInfo &MCII;
public:
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
- : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ : MRI(mri), MCII(mcii) {}
R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete;
- /// \brief Encode the instruction and write it to the OS.
+ /// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ const MCSubtargetInfo &STI) const;
/// \returns the encoding for an MCOperand.
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ const MCSubtargetInfo &STI) const;
private:
+
void Emit(uint32_t value, raw_ostream &OS) const;
void Emit(uint64_t value, raw_ostream &OS) const;
unsigned getHWReg(unsigned regNo) const;
+
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
+
};
} // end anonymous namespace
@@ -94,16 +103,16 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- if (MI.getOpcode() == AMDGPU::RETURN ||
- MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
- MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
- MI.getOpcode() == AMDGPU::BUNDLE ||
- MI.getOpcode() == AMDGPU::KILL) {
+ if (MI.getOpcode() == R600::RETURN ||
+ MI.getOpcode() == R600::FETCH_CLAUSE ||
+ MI.getOpcode() == R600::ALU_CLAUSE ||
+ MI.getOpcode() == R600::BUNDLE ||
+ MI.getOpcode() == R600::KILL) {
return;
} else if (IS_VTX(Desc)) {
uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
- if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) {
+ if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) {
InstWord2 |= 1 << 19; // Mega-Fetch bit
}
@@ -136,7 +145,7 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
Emit((uint32_t) 0, OS);
} else {
uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
- if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) &&
+ if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) &&
((Desc.TSFlags & R600_InstFlag::OP1) ||
Desc.TSFlags & R600_InstFlag::OP2)) {
uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
@@ -148,11 +157,11 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
- support::endian::Writer<support::little>(OS).write(Value);
+ support::endian::write(OS, Value, support::little);
}
void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
- support::endian::Writer<support::little>(OS).write(Value);
+ support::endian::write(OS, Value, support::little);
}
unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
@@ -186,4 +195,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
}
#define ENABLE_INSTR_PREDICATE_VERIFIER
-#include "AMDGPUGenMCCodeEmitter.inc"
+#include "R600GenMCCodeEmitter.inc"
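
The two Emit() overloads above switch from the stateful endian Writer template to the free function from llvm/Support/EndianStream.h; a minimal standalone illustration:

#include <cstdint>
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"

// Write one 64-bit R600 instruction word to the stream in little-endian
// byte order, mirroring R600MCCodeEmitter::Emit(uint64_t, raw_ostream &).
static void emitLittleEndianWord(llvm::raw_ostream &OS, uint64_t Inst) {
  llvm::support::endian::write(OS, Inst, llvm::support::little);
}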
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
new file mode 100644
index 000000000000..1c99a708e5ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -0,0 +1,27 @@
+//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file provides R600 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "R600GenInstrInfo.inc"
+
+MCInstrInfo *llvm::createR600MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitR600MCInstrInfo(X);
+ return X;
+}
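
The new factory is expected to be hooked into the MC layer through TargetRegistry. A hedged sketch follows; the actual registration lives in AMDGPUMCTargetDesc.cpp, which is not part of this hunk, and getTheAMDGPUTarget() is assumed here to be the r600 target:

#include "AMDGPUMCTargetDesc.h"
#include "llvm/Support/TargetRegistry.h"

// Illustrative registration: the r600 triple now gets an MCInstrInfo built
// from R600GenInstrInfo.inc instead of sharing AMDGPUGenInstrInfo.inc.
static void registerR600MCInstrInfoSketch() {
  llvm::TargetRegistry::RegisterMCInstrInfo(llvm::getTheAMDGPUTarget(),
                                            llvm::createR600MCInstrInfo);
}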
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 94c0157edeb5..36913bd04274 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The SI code emitter produces machine code that can be executed
+/// The SI code emitter produces machine code that can be executed
/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
@@ -43,7 +43,7 @@ namespace {
class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCRegisterInfo &MRI;
- /// \brief Encode an fp or int literal
+ /// Encode an fp or int literal
uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
const MCSubtargetInfo &STI) const;
@@ -54,7 +54,7 @@ public:
SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
- /// \brief Encode the instruction and write it to the OS.
+ /// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
@@ -64,7 +64,7 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
- /// \brief Use a fixup to encode the simm16 field for SOPP branch
+ /// Use a fixup to encode the simm16 field for SOPP branch
/// instructions.
unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -335,13 +335,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
- unsigned Reg = MO.getReg();
- RegEnc |= MRI.getEncodingValue(Reg);
- RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
- if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
- RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+ if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+ RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ }
+ return RegEnc;
+ } else {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
+ if (Enc != ~0U && Enc != 255) {
+ return Enc | SDWA9EncValues::SRC_SGPR_MASK;
+ }
}
- return RegEnc;
+
+ llvm_unreachable("Unsupported operand kind");
+ return 0;
}
unsigned
@@ -427,3 +438,6 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
llvm_unreachable("Encoding of this operand type is not supported yet.");
return 0;
}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "AMDGPUGenMCCodeEmitter.inc"
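
The new non-register path in getSDWASrcEncoding above accepts an immediate only when getLitEncoding() yields a usable inline-constant code. A small sketch of that guard, with the meaning of the two rejected values stated as assumptions:

#include <cstdint>

// ~0u is getLitEncoding()'s "no encoding" result; 255 is assumed here to be
// the GCN "literal constant follows" source-operand encoding, which an SDWA
// source cannot carry, so only genuine inline constants pass.
static bool isUsableSdwaInlineConstant(uint32_t Enc) {
  return Enc != ~0u && Enc != 255;
}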
diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 651265fc54d5..1e0bc62c45a6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -7,9 +7,63 @@
//
//===----------------------------------------------------------------------===//
-class MIMG_Mask <string op, int channels> {
- string Op = op;
- int Channels = channels;
+// MIMG-specific encoding families to distinguish between semantically
+// equivalent machine instructions with different encoding.
+//
+// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
+// - MIMGEncGfx8: encoding introduced with gfx8 for atomics
+class MIMGEncoding;
+
+def MIMGEncGfx6 : MIMGEncoding;
+def MIMGEncGfx8 : MIMGEncoding;
+
+def MIMGEncoding : GenericEnum {
+ let FilterClass = "MIMGEncoding";
+}
+
+// Represent an ISA-level opcode, independent of the encoding and the
+// vdata/vaddr size.
+class MIMGBaseOpcode {
+ MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME);
+ bit Store = 0;
+ bit Atomic = 0;
+ bit AtomicX2 = 0; // (f)cmpswap
+ bit Sampler = 0;
+ bits<8> NumExtraArgs = 0;
+ bit Gradients = 0;
+ bit Coordinates = 1;
+ bit LodOrClampOrMip = 0;
+ bit HasD16 = 0;
+}
+
+def MIMGBaseOpcode : GenericEnum {
+ let FilterClass = "MIMGBaseOpcode";
+}
+
+def MIMGBaseOpcodesTable : GenericTable {
+ let FilterClass = "MIMGBaseOpcode";
+ let CppTypeName = "MIMGBaseOpcodeInfo";
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+ "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
+ "HasD16"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+
+ let PrimaryKey = ["BaseOpcode"];
+ let PrimaryKeyName = "getMIMGBaseOpcodeInfo";
+}
+
+def MIMGDim : GenericEnum {
+ let FilterClass = "AMDGPUDimProps";
+}
+
+def MIMGDimInfoTable : GenericTable {
+ let FilterClass = "AMDGPUDimProps";
+ let CppTypeName = "MIMGDimInfo";
+ let Fields = ["Dim", "NumCoords", "NumGradients", "DA"];
+ GenericEnum TypeOf_Dim = MIMGDim;
+
+ let PrimaryKey = ["Dim"];
+ let PrimaryKeyName = "getMIMGDimInfo";
}
class mimg <bits<7> si, bits<7> vi = si> {
@@ -17,93 +71,161 @@ class mimg <bits<7> si, bits<7> vi = si> {
field bits<7> VI = vi;
}
-class MIMG_Helper <dag outs, dag ins, string asm,
- string dns=""> : MIMG<outs, ins, asm,[]> {
+class MIMG <dag outs, string dns = "">
+ : InstSI <outs, (ins), "", []> {
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MIMG = 1;
+ let Uses = [EXEC];
let mayLoad = 1;
let mayStore = 0;
let hasPostISelHook = 1;
+ let SchedRW = [WriteVMEM];
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0; // XXX ????
+
+ let SubtargetPredicate = isGCN;
let DecoderNamespace = dns;
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
let AsmMatchConverter = "cvtMIMG";
let usesCustomInserter = 1;
- let SchedRW = [WriteVMEM];
+
+ Instruction Opcode = !cast<Instruction>(NAME);
+ MIMGBaseOpcode BaseOpcode;
+ MIMGEncoding MIMGEncoding = MIMGEncGfx6;
+ bits<8> VDataDwords;
+ bits<8> VAddrDwords;
+}
+
+def MIMGInfoTable : GenericTable {
+ let FilterClass = "MIMG";
+ let CppTypeName = "MIMGInfo";
+ let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+ GenericEnum TypeOf_MIMGEncoding = MIMGEncoding;
+
+ let PrimaryKey = ["BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
+ let PrimaryKeyName = "getMIMGOpcodeHelper";
+}
+
+def getMIMGInfo : SearchIndex {
+ let Table = MIMGInfoTable;
+ let Key = ["Opcode"];
}
class MIMG_NoSampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
+ string dns="">
+ : MIMG <(outs dst_rc:$vdata), dns>,
+ MIMGe<op> {
let ssamp = 0;
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
}
multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
+ RegisterClass dst_rc,
+ bit enableDisasm> {
+ let VAddrDwords = 1 in
+ def NAME # _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ let VAddrDwords = 2 in
+ def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
+ bit isResInfo = 0> {
+ def "" : MIMGBaseOpcode {
+ let Coordinates = !if(isResInfo, 0, 1);
+ let LodOrClampOrMip = mip;
+ let HasD16 = has_d16;
+ }
-multiclass MIMG_NoSampler <bits<7> op, string asm> {
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
+ mayLoad = !if(isResInfo, 0, 1) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+ }
}
class MIMG_Store_Helper <bits<7> op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
- string dns = ""> : MIMG_Helper <
- (outs),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe<op> {
+ string dns = "">
+ : MIMG <(outs), dns>,
+ MIMGe<op> {
let ssamp = 0;
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
let mayLoad = 0;
let mayStore = 1;
let hasSideEffects = 0;
let hasPostISelHook = 0;
let DisableWQM = 1;
+
+ let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
}
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
RegisterClass data_rc,
- int channels> {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
+ bit enableDisasm> {
+ let VAddrDwords = 1 in
+ def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ let VAddrDwords = 2 in
+ def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ def NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> {
+ def "" : MIMGBaseOpcode {
+ let Store = 1;
+ let LodOrClampOrMip = mip;
+ let HasD16 = has_d16;
+ }
-multiclass MIMG_Store <bits<7> op, string asm> {
- defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>;
+ }
}
class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
- RegisterClass addr_rc> : MIMG_Helper <
- (outs data_rc:$vdst),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"> {
+ RegisterClass addr_rc, string dns="",
+ bit enableDasm = 0>
+ : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> {
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 1; // FIXME: Remove this
@@ -111,160 +233,210 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
let DisableWQM = 1;
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
-}
-class MIMG_Atomic_Real_si<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.SI>,
- MIMGe<op.SI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.VI>,
- MIMGe<op.VI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> {
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.NONE>;
+ let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
+ let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0> {
+ let ssamp = 0, d16 = 0 in {
+ def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>,
+ SIMCInstr<NAME, SIEncodingFamily.SI>,
+ MIMGe<op.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DisableDecoder = DisableSIDecoder;
+ }
+
+ def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>,
+ SIMCInstr<NAME, SIEncodingFamily.VI>,
+ MIMGe<op.VI> {
+ let AssemblerPredicates = [isVI];
+ let DisableDecoder = DisableVIDecoder;
+ let MIMGEncoding = MIMGEncGfx8;
+ }
}
+}
- let ssamp = 0 in {
- def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
+ RegisterClass data_rc,
+ bit enableDasm = 0> {
+ // _V* variants have different address size, but the size is not encoded.
+ // So only one variant can be disassembled. V1 looks the safest to decode.
+ let VAddrDwords = 1 in
+ defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>;
+ let VAddrDwords = 2 in
+ defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>;
+ let VAddrDwords = 3 in
+ defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>;
+ let VAddrDwords = 4 in
+ defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>;
+}
+
+multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics
+ def "" : MIMGBaseOpcode {
+ let Atomic = 1;
+ let AtomicX2 = isCmpSwap;
+ }
- def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+ // _V* variants have different dst size, but the size is encoded implicitly,
+ // using dmask and tfe. Only 32-bit variant is registered with disassembler.
+ // Other variants are reconstructed by disassembler using dmask and tfe.
+ let VDataDwords = !if(isCmpSwap, 2, 1) in
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>;
+ let VDataDwords = !if(isCmpSwap, 4, 2) in
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>;
}
}
-multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
- defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
- defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
- defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>;
+class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
+ RegisterClass src_rc, string dns="">
+ : MIMG <(outs dst_rc:$vdata), dns>,
+ MIMGe<op> {
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+
+ let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMGAddrSize<int dw, bit enable_disasm> {
+ int NumWords = dw;
+
+ RegisterClass RegClass = !if(!le(NumWords, 0), ?,
+ !if(!eq(NumWords, 1), VGPR_32,
+ !if(!eq(NumWords, 2), VReg_64,
+ !if(!eq(NumWords, 3), VReg_96,
+ !if(!eq(NumWords, 4), VReg_128,
+ !if(!le(NumWords, 8), VReg_256,
+ !if(!le(NumWords, 16), VReg_512, ?)))))));
+
+ // Whether the instruction variant with this vaddr size should be enabled for
+ // the auto-generated disassembler.
+ bit Disassemble = enable_disasm;
+}
+
+// Return whether a value inside the range [min, max] (endpoints inclusive)
+// is in the given list.
+class isRangeInList<int min, int max, list<int> lst> {
+ bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max))));
+}
+
+class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> {
+ list<MIMGAddrSize> List = lst;
+ int Min = min;
+}
+
+class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
+ // List of all possible numbers of address words, taking all combinations of
+ // A16 and image dimension into account (note: no MSAA, since this is for
+ // sample/gather ops).
+ list<int> AllNumAddrWords =
+ !foreach(dw, !if(sample.Gradients,
+ !if(!eq(sample.LodOrClamp, ""),
+ [2, 3, 4, 5, 6, 7, 9],
+ [2, 3, 4, 5, 7, 8, 10]),
+ !if(!eq(sample.LodOrClamp, ""),
+ [1, 2, 3],
+ [1, 2, 3, 4])),
+ !add(dw, !size(sample.ExtraAddrArgs)));
+
+ // Generate machine instructions based on possible register classes for the
+ // required numbers of address words. The disassembler defaults to the
+ // smallest register class.
+ list<MIMGAddrSize> MachineInstrs =
+ !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw,
+ !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret,
+ MIMGAddrSizes_tmp<
+ !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]),
+ !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords
+ lhs)).List;
}
-class MIMG_Sampler_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc,
- bit wqm,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
- let WQM = wqm;
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+ AMDGPUSampleVariant sample, RegisterClass dst_rc,
+ bit enableDisasm = 0> {
+ foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
+ let VAddrDwords = addr.NumWords in
+ def _V # addr.NumWords
+ : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
-multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, bit wqm> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
-
-multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
-}
-
-multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
-
-class MIMG_Gather_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc, bit wqm> : MIMG <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- []>, MIMGe<op> {
- let mayLoad = 1;
- let mayStore = 0;
+class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
+ : MIMGBaseOpcode {
+ let Sampler = 1;
+ let NumExtraArgs = !size(sample.ExtraAddrArgs);
+ let Gradients = sample.Gradients;
+ let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
+}
- // DMASK was repurposed for GATHER4. 4 components are always
- // returned and DMASK works like a swizzle - it selects
- // the component to fetch. The only useful DMASK values are
- // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
- // (red,red,red,red) etc.) The ISA document doesn't mention
- // this.
- // Therefore, disable all code which updates DMASK by setting this:
- let Gather4 = 1;
- let hasPostISelHook = 0;
- let WQM = wqm;
+multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+ bit isGetLod = 0,
+ string asm = "image_sample"#sample.LowerCaseMod> {
+ def "" : MIMG_Sampler_BaseOpcode<sample> {
+ let HasD16 = !if(isGetLod, 0, 1);
+ }
- let isAsmParserOnly = 1; // TBD: fix it later
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
+ mayLoad = !if(isGetLod, 0, 1) in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ }
}
-multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, bit wqm> {
- def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
+multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample>
+ : MIMG_Sampler<op, sample, 1>;
-multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
+multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+ string asm = "image_gather4"#sample.LowerCaseMod> {
+ def "" : MIMG_Sampler_BaseOpcode<sample> {
+ let HasD16 = 1;
+ }
+
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
+ Gather4 = 1, hasPostISelHook = 0 in {
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+ }
}
-multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
+multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample>
+ : MIMG_Gather<op, sample, 1>;
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
-//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
-//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
-//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
-//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
-defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
-defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
-//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
-//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
-
-let mayLoad = 0, mayStore = 0 in {
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
-}
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>;
+defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>;
+defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>;
+defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>;
+defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>;
+defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>;
+defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>;
+
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>;
defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>;
defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
@@ -277,397 +449,101 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
-
-let mayLoad = 0, mayStore = 0 in {
-defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
-}
-
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
+
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+
+/********** ========================================= **********/
+/********** Table of dimension-aware image intrinsics **********/
+/********** ========================================= **********/
+
+class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
+ Intrinsic Intr = I;
+ MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
+ AMDGPUDimProps Dim = I.P.Dim;
}
-/********** ======================= **********/
-/********** Image sampling patterns **********/
-/********** ======================= **********/
-
-// Image + sampler
-class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
- i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode $addr, $rsrc, $sampler,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
-}
-
-// Image + sampler for amdgcn
-// TODO:
-// 1. Handle half data type like v4f16, and add D16 bit support;
-// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128).
-// 3. Add A16 support when we pass address of half type.
-multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat<
- (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc,
- i1:$slc, i1:$lwe, i1:$da)),
- (opcode $addr, $rsrc, $sampler,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>;
- defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>;
-}
-
-// TODO: support v3f32.
-multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> {
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-// Image only
-class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
- imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass ImagePatterns<SDPatternOperator name, string opcode> {
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
-}
-
-multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat <
- (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe,
- i1:$da)),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
- defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
-}
-
-// TODO: support v3f32.
-multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
- def : GCNPat <
- (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc,
- i1:$lwe, i1:$da),
- (opcode $data, $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- 0, 0, (as_i1imm $lwe), (as_i1imm $da))
- >;
-}
-
-multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
- defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
-}
-
-// TODO: support v3f32.
-multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>;
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
- defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
-}
-
-class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat <
- (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
- (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
->;
-
-multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
-}
-
-class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : GCNPat <
- (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
- imm:$r128, imm:$da, imm:$slc),
- (EXTRACT_SUBREG
- (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
- $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
- sub0)
->;
-
-// ======= amdgcn Image Intrinsics ==============
-
-// Image load
-defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
-defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
-defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">;
-
-// Image store
-defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
-defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
-
-// Basic sample
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
-
-// Sample with comparison
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
-
-// Sample with offsets
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
-
-// Sample with comparison and offsets
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
-
-// Gather opcodes
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">;
-defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">;
-
-defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">;
-
-// Image atomics
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
-
-/* SIsample for simple 1D texture lookup */
-def : GCNPat <
- (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
- (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
->;
-
-class SampleShadowPattern<SDNode name, MIMG opcode,
- ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleShadowArrayPattern<SDNode name, MIMG opcode,
- ValueType vt> : GCNPat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
->;
-
-/* SIsample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
- MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
-MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
- def : SamplePattern <SIsample, sample, addr_type>;
- def : SampleRectPattern <SIsample, sample, addr_type>;
- def : SampleArrayPattern <SIsample, sample, addr_type>;
- def : SampleShadowPattern <SIsample, sample_c, addr_type>;
- def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
-
- def : SamplePattern <SIsamplel, sample_l, addr_type>;
- def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
- def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
- def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
-
- def : SamplePattern <SIsampleb, sample_b, addr_type>;
- def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
- def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
- def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
-
- def : SamplePattern <SIsampled, sample_d, addr_type>;
- def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
- def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
- def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
-}
-
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
- IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
- IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
- IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
- v2i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
- IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
- IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
- IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
- v4i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
- IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
- IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
- IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
- v8i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
- IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
- IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
- IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
- v16i32>;
+def ImageDimIntrinsicTable : GenericTable {
+ let FilterClass = "ImageDimIntrinsicInfo";
+ let Fields = ["Intr", "BaseOpcode", "Dim"];
+ GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+ GenericEnum TypeOf_Dim = MIMGDim;
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getImageDimIntrinsicInfo";
+ let PrimaryKeyEarlyOut = 1;
+}
+
+foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
+ AMDGPUImageDimAtomicIntrinsics) in {
+ def : ImageDimIntrinsicInfo<intr>;
+}
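The ImageDimIntrinsicInfo records and the GenericTable above ask TableGen to emit a table of (intrinsic, base opcode, dimension) rows plus a lookup helper named by PrimaryKeyName. A minimal C++ sketch of the assumed shape of that generated code (the real version comes out of the searchable-tables backend; the struct layout, the placeholder row, and the sorted-array detail are illustrative only):

  #include <algorithm>
  #include <iterator>

  struct ImageDimIntrinsicInfo {
    unsigned Intr;        // intrinsic ID, the primary key
    unsigned BaseOpcode;  // MIMGBaseOpcode enum value
    unsigned Dim;         // MIMGDim enum value
  };

  // Rows are kept sorted by Intr so the lookup can binary-search.
  static const ImageDimIntrinsicInfo ImageDimIntrinsicTable[] = {
      {1001u, 0u, 0u}, // placeholder row; real IDs and enums come from TableGen
  };

  const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr) {
    const ImageDimIntrinsicInfo *It = std::lower_bound(
        std::begin(ImageDimIntrinsicTable), std::end(ImageDimIntrinsicTable),
        Intr, [](const ImageDimIntrinsicInfo &Row, unsigned Key) {
          return Row.Intr < Key;
        });
    if (It == std::end(ImageDimIntrinsicTable) || It->Intr != Intr)
      return nullptr; // unknown intrinsic: early exit, as PrimaryKeyEarlyOut suggests
    return It;
  }

A caller in instruction selection can then branch on Info->BaseOpcode and Info->Dim for any dimension-aware image intrinsic.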
diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td
deleted file mode 100644
index d50dae78e247..000000000000
--- a/contrib/llvm/lib/Target/AMDGPU/Processors.td
+++ /dev/null
@@ -1,12 +0,0 @@
-//===-- Processors.td - AMDGPU Processor definitions ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
-file is no longer used by the backend, so it can be deleted once all
-the buildbots update there dependencies.
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600.td b/contrib/llvm/lib/Target/AMDGPU/R600.td
new file mode 100644
index 000000000000..5c9c1c1ed504
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600.td
@@ -0,0 +1,54 @@
+//===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+def R600InstrInfo : InstrInfo {
+ let guessInstructionProperties = 1;
+ let noNamedPositionallyEncodedOperands = 1;
+}
+
+def R600 : Target {
+ let InstructionSet = R600InstrInfo;
+ let AllowRegisterRenaming = 1;
+}
+
+let Namespace = "R600" in {
+
+foreach Index = 0-15 in {
+ def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
+}
+
+include "R600RegisterInfo.td"
+
+}
+
+def NullALU : InstrItinClass;
+def ALU_NULL : FuncUnit;
+
+include "AMDGPUFeatures.td"
+include "R600Schedule.td"
+include "R600Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUInstructions.td"
+include "R600Instructions.td"
+include "R700Instructions.td"
+include "EvergreenInstructions.td"
+include "CaymanInstructions.td"
+
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+ T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+ T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+ T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+ T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+ T30_XYZW, T31_XYZW, T32_XYZW
+ ]>>>
+]>;
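Two reading aids that follow directly from the definitions above: each sub#Index is a 32-bit sub-register index placed at bit offset !shl(Index, 5) = Index * 32, so sub0 names bits 0-31 of a wide register and sub3 names bits 96-127; and CC_R600 assigns every in-register v4f32/v4i32 argument to a whole 128-bit T*_XYZW register, T0_XYZW through T32_XYZW in order.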
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
new file mode 100644
index 000000000000..68f8c30775b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -0,0 +1,133 @@
+//===-- R600AsmPrinter.cpp - R600 Assembly printer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The R600AsmPrinter is used to print both the assembly string and binary
+/// code. When passed an MCAsmStreamer it prints assembly, and when passed
+/// an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600AsmPrinter.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+AsmPrinter *
+llvm::createR600AsmPrinterPass(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> &&Streamer) {
+ return new R600AsmPrinter(TM, std::move(Streamer));
+}
+
+R600AsmPrinter::R600AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) { }
+
+StringRef R600AsmPrinter::getPassName() const {
+ return "R600 Assembly Printer";
+}
+
+void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
+ unsigned MaxGPR = 0;
+ bool killPixel = false;
+ const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
+ const R600RegisterInfo *RI = STM.getRegisterInfo();
+ const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == R600::KILLGT)
+ killPixel = true;
+ unsigned numOperands = MI.getNumOperands();
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+ const MachineOperand &MO = MI.getOperand(op_idx);
+ if (!MO.isReg())
+ continue;
+ unsigned HWReg = RI->getHWRegIndex(MO.getReg());
+
+ // Registers with a value > 127 aren't GPRs
+ if (HWReg > 127)
+ continue;
+ MaxGPR = std::max(MaxGPR, HWReg);
+ }
+ }
+ }
+
+ unsigned RsrcReg;
+ if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+ // Evergreen / Northern Islands
+ switch (MF.getFunction().getCallingConv()) {
+ default: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ }
+ } else {
+ // R600 / R700
+ switch (MF.getFunction().getCallingConv()) {
+ default: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ }
+ }
+
+ OutStreamer->EmitIntValue(RsrcReg, 4);
+ OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ S_STACK_SIZE(MFI->CFStackSize), 4);
+ OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+ OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+ if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
+ OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+ OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
+ }
+}
+
+bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+ // Functions need to be cacheline (256B) aligned.
+ MF.ensureAlignment(8);
+
+ SetupMachineFunction(MF);
+
+ MCContext &Context = getObjFileLowering().getContext();
+ MCSectionELF *ConfigSection =
+ Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(ConfigSection);
+
+ EmitProgramInfoR600(MF);
+
+ EmitFunctionBody();
+
+ if (isVerbose()) {
+ MCSectionELF *CommentSection =
+ Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(CommentSection);
+
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ OutStreamer->emitRawComment(
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+ }
+
+ return false;
+}
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
new file mode 100644
index 000000000000..079fc707b03c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
@@ -0,0 +1,46 @@
+//===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// R600 Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
+
+#include "llvm/CodeGen/AsmPrinter.h"
+
+namespace llvm {
+
+class R600AsmPrinter final : public AsmPrinter {
+
+public:
+ explicit R600AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer);
+ StringRef getPassName() const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ /// Implemented in AMDGPUMCInstLower.cpp
+ void EmitInstruction(const MachineInstr *MI) override;
+ /// Lower the specified LLVM Constant to an MCExpr.
+ /// The AsmPrinter::lowerConstant() implementation does not know how to lower
+ /// addrspacecast, so such constants are lowered by this function instead.
+ const MCExpr *lowerConstant(const Constant *CV) override;
+
+private:
+ void EmitProgramInfoR600(const MachineFunction &MF);
+};
+
+AsmPrinter *
+createR600AsmPrinterPass(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> &&Streamer);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H
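A minimal sketch of how the factory declared above is typically wired into the TargetRegistry during target initialization. The accessor name for the r600 target and the init hook used here are assumptions; the real registration lives in the AMDGPU target's AsmPrinter initialization code.

  #include "R600AsmPrinter.h"
  #include "llvm/Support/TargetRegistry.h"

  namespace llvm {
  Target &getTheR600Target(); // assumed accessor for the r600 triple
  }

  // Hypothetical init hook: hands the R600AsmPrinter factory to the registry
  // so tools can construct it when emitting code for the r600 target.
  extern "C" void LLVMInitializeR600AsmPrinterSketch() {
    llvm::TargetRegistry::RegisterAsmPrinter(llvm::getTheR600Target(),
                                             llvm::createR600AsmPrinterPass);
  }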
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 5e1ba6b506da..0c62d6a4b3d9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -33,8 +34,8 @@ namespace {
static bool isCFAlu(const MachineInstr &MI) {
switch (MI.getOpcode()) {
- case AMDGPU::CF_ALU:
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU:
+ case R600::CF_ALU_PUSH_BEFORE:
return true;
default:
return false;
@@ -84,20 +85,20 @@ char &llvm::R600ClauseMergePassID = R600ClauseMergePass::ID;
unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
assert(isCFAlu(MI));
return MI
- .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT))
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::COUNT))
.getImm();
}
bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const {
assert(isCFAlu(MI));
return MI
- .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled))
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::Enabled))
.getImm();
}
void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
MachineInstr &CFAlu) const {
- int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT);
MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end();
I++;
do {
@@ -116,46 +117,46 @@ void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
const MachineInstr &LatrCFAlu) const {
assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
- int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT);
unsigned RootInstCount = getCFAluSize(RootCFAlu),
LaterInstCount = getCFAluSize(LatrCFAlu);
unsigned CumuledInsts = RootInstCount + LaterInstCount;
if (CumuledInsts >= TII->getMaxAlusPerClause()) {
- DEBUG(dbgs() << "Excess inst counts\n");
+ LLVM_DEBUG(dbgs() << "Excess inst counts\n");
return false;
}
- if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (RootCFAlu.getOpcode() == R600::CF_ALU_PUSH_BEFORE)
return false;
// Is KCache Bank 0 compatible ?
int Mode0Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE0);
int KBank0Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK0);
int KBank0LineIdx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR0);
if (LatrCFAlu.getOperand(Mode0Idx).getImm() &&
RootCFAlu.getOperand(Mode0Idx).getImm() &&
(LatrCFAlu.getOperand(KBank0Idx).getImm() !=
RootCFAlu.getOperand(KBank0Idx).getImm() ||
LatrCFAlu.getOperand(KBank0LineIdx).getImm() !=
RootCFAlu.getOperand(KBank0LineIdx).getImm())) {
- DEBUG(dbgs() << "Wrong KC0\n");
+ LLVM_DEBUG(dbgs() << "Wrong KC0\n");
return false;
}
// Is KCache Bank 1 compatible ?
int Mode1Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE1);
int KBank1Idx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK1);
int KBank1LineIdx =
- TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+ TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR1);
if (LatrCFAlu.getOperand(Mode1Idx).getImm() &&
RootCFAlu.getOperand(Mode1Idx).getImm() &&
(LatrCFAlu.getOperand(KBank1Idx).getImm() !=
RootCFAlu.getOperand(KBank1Idx).getImm() ||
LatrCFAlu.getOperand(KBank1LineIdx).getImm() !=
RootCFAlu.getOperand(KBank1LineIdx).getImm())) {
- DEBUG(dbgs() << "Wrong KC0\n");
+ LLVM_DEBUG(dbgs() << "Wrong KC0\n");
return false;
}
if (LatrCFAlu.getOperand(Mode0Idx).getImm()) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 0e788df1c9c0..a19020276f35 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -93,7 +94,7 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {
}
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
- if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
+ if (Opcode == R600::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
getLoopDepth() > 1)
return true;
@@ -102,10 +103,10 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
switch(Opcode) {
default: return false;
- case AMDGPU::CF_ALU_PUSH_BEFORE:
- case AMDGPU::CF_ALU_ELSE_AFTER:
- case AMDGPU::CF_ALU_BREAK:
- case AMDGPU::CF_ALU_CONTINUE:
+ case R600::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU_ELSE_AFTER:
+ case R600::CF_ALU_BREAK:
+ case R600::CF_ALU_CONTINUE:
if (CurrentSubEntries == 0)
return false;
if (ST->getWavefrontSize() == 64) {
@@ -136,7 +137,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 0;
case CFStack::FIRST_NON_WQM_PUSH:
assert(!ST->hasCaymanISA());
- if (ST->getGeneration() <= R600Subtarget::R700) {
+ if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
// +1 For the push operation.
// +2 Extra space required.
return 3;
@@ -149,7 +150,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 2;
}
case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
- assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
+ assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
// +1 For the push operation.
// +1 Extra space required.
return 2;
@@ -167,8 +168,8 @@ void CFStack::updateMaxStackSize() {
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
CFStack::StackItem Item = CFStack::ENTRY;
switch(Opcode) {
- case AMDGPU::CF_PUSH_EG:
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_PUSH_EG:
+ case R600::CF_ALU_PUSH_BEFORE:
if (!isWQM) {
if (!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
@@ -176,7 +177,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
// See comment in
// CFStack::getSubEntrySize()
else if (CurrentEntries > 0 &&
- ST->getGeneration() > R600Subtarget::EVERGREEN &&
+ ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
@@ -239,8 +240,8 @@ private:
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::KILL:
- case AMDGPU::RETURN:
+ case R600::KILL:
+ case R600::RETURN:
return true;
default:
return false;
@@ -249,44 +250,44 @@ private:
const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
unsigned Opcode = 0;
- bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
+ bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
switch (CFI) {
case CF_TC:
- Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
+ Opcode = isEg ? R600::CF_TC_EG : R600::CF_TC_R600;
break;
case CF_VC:
- Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
+ Opcode = isEg ? R600::CF_VC_EG : R600::CF_VC_R600;
break;
case CF_CALL_FS:
- Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
+ Opcode = isEg ? R600::CF_CALL_FS_EG : R600::CF_CALL_FS_R600;
break;
case CF_WHILE_LOOP:
- Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
+ Opcode = isEg ? R600::WHILE_LOOP_EG : R600::WHILE_LOOP_R600;
break;
case CF_END_LOOP:
- Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
+ Opcode = isEg ? R600::END_LOOP_EG : R600::END_LOOP_R600;
break;
case CF_LOOP_BREAK:
- Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
+ Opcode = isEg ? R600::LOOP_BREAK_EG : R600::LOOP_BREAK_R600;
break;
case CF_LOOP_CONTINUE:
- Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
+ Opcode = isEg ? R600::CF_CONTINUE_EG : R600::CF_CONTINUE_R600;
break;
case CF_JUMP:
- Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
+ Opcode = isEg ? R600::CF_JUMP_EG : R600::CF_JUMP_R600;
break;
case CF_ELSE:
- Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
+ Opcode = isEg ? R600::CF_ELSE_EG : R600::CF_ELSE_R600;
break;
case CF_POP:
- Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
+ Opcode = isEg ? R600::POP_EG : R600::POP_R600;
break;
case CF_END:
if (ST->hasCaymanISA()) {
- Opcode = AMDGPU::CF_END_CM;
+ Opcode = R600::CF_END_CM;
break;
}
- Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
+ Opcode = isEg ? R600::CF_END_EG : R600::CF_END_R600;
break;
}
assert (Opcode && "No opcode selected");
@@ -304,21 +305,21 @@ private:
continue;
if (MO.isDef()) {
unsigned Reg = MO.getReg();
- if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ if (R600::R600_Reg128RegClass.contains(Reg))
DstMI = Reg;
else
DstMI = TRI->getMatchingSuperReg(Reg,
- TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
- &AMDGPU::R600_Reg128RegClass);
+ AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ &R600::R600_Reg128RegClass);
}
if (MO.isUse()) {
unsigned Reg = MO.getReg();
- if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ if (R600::R600_Reg128RegClass.contains(Reg))
SrcMI = Reg;
else
SrcMI = TRI->getMatchingSuperReg(Reg,
- TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
- &AMDGPU::R600_Reg128RegClass);
+ AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ &R600::R600_Reg128RegClass);
}
}
if ((DstRegs.find(SrcMI) == DstRegs.end())) {
@@ -358,15 +359,15 @@ private:
void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
static const unsigned LiteralRegs[] = {
- AMDGPU::ALU_LITERAL_X,
- AMDGPU::ALU_LITERAL_Y,
- AMDGPU::ALU_LITERAL_Z,
- AMDGPU::ALU_LITERAL_W
+ R600::ALU_LITERAL_X,
+ R600::ALU_LITERAL_Y,
+ R600::ALU_LITERAL_Z,
+ R600::ALU_LITERAL_W
};
const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
TII->getSrcs(MI);
for (const auto &Src:Srcs) {
- if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
+ if (Src.first->getReg() != R600::ALU_LITERAL_X)
continue;
int64_t Imm = Src.second;
std::vector<MachineOperand *>::iterator It =
@@ -376,7 +377,7 @@ private:
// Get corresponding Operand
MachineOperand &Operand = MI.getOperand(
- TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ TII->getOperandIdx(MI.getOpcode(), R600::OpName::literal));
if (It != Lits.end()) {
// Reuse existing literal reg
@@ -399,7 +400,7 @@ private:
unsigned LiteralPair0 = Literals[i];
unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
- TII->get(AMDGPU::LITERALS))
+ TII->get(R600::LITERALS))
.addImm(LiteralPair0)
.addImm(LiteralPair1);
}
@@ -441,7 +442,7 @@ private:
}
for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
- TII->get(AMDGPU::LITERALS));
+ TII->get(R600::LITERALS));
if (Literals[i]->isImm()) {
MILit.addImm(Literals[i]->getImm());
} else {
@@ -470,7 +471,7 @@ private:
unsigned &CfCount) {
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
+ BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
@@ -482,7 +483,7 @@ private:
Clause.first->getOperand(0).setImm(0);
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
+ BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
@@ -531,7 +532,7 @@ public:
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E;) {
if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
- DEBUG(dbgs() << CfCount << ":"; I->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; I->dump(););
FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
LastAlu.back() = nullptr;
@@ -539,33 +540,34 @@ public:
}
MachineBasicBlock::iterator MI = I;
- if (MI->getOpcode() != AMDGPU::ENDIF)
+ if (MI->getOpcode() != R600::ENDIF)
LastAlu.back() = nullptr;
- if (MI->getOpcode() == AMDGPU::CF_ALU)
+ if (MI->getOpcode() == R600::CF_ALU)
LastAlu.back() = &*MI;
I++;
bool RequiresWorkAround =
CFStack.requiresWorkAroundForInst(MI->getOpcode());
switch (MI->getOpcode()) {
- case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case R600::CF_ALU_PUSH_BEFORE:
if (RequiresWorkAround) {
- DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+ LLVM_DEBUG(dbgs()
+ << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(R600::CF_PUSH_EG))
.addImm(CfCount + 1)
.addImm(1);
- MI->setDesc(TII->get(AMDGPU::CF_ALU));
+ MI->setDesc(TII->get(R600::CF_ALU));
CfCount++;
- CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+ CFStack.pushBranch(R600::CF_PUSH_EG);
} else
- CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+ CFStack.pushBranch(R600::CF_ALU_PUSH_BEFORE);
LLVM_FALLTHROUGH;
- case AMDGPU::CF_ALU:
+ case R600::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
- DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
break;
- case AMDGPU::WHILELOOP: {
+ case R600::WHILELOOP: {
CFStack.pushLoop();
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_WHILE_LOOP))
@@ -578,7 +580,7 @@ public:
CfCount++;
break;
}
- case AMDGPU::ENDLOOP: {
+ case R600::ENDLOOP: {
CFStack.popLoop();
std::pair<unsigned, std::set<MachineInstr *>> Pair =
std::move(LoopStack.back());
@@ -590,19 +592,19 @@ public:
CfCount++;
break;
}
- case AMDGPU::IF_PREDICATE_SET: {
+ case R600::IF_PREDICATE_SET: {
LastAlu.push_back(nullptr);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_JUMP))
.addImm(0)
.addImm(0);
IfThenElseStack.push_back(MIb);
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
MI->eraseFromParent();
CfCount++;
break;
}
- case AMDGPU::ELSE: {
+ case R600::ELSE: {
MachineInstr * JumpInst = IfThenElseStack.back();
IfThenElseStack.pop_back();
CounterPropagateAddr(*JumpInst, CfCount);
@@ -610,13 +612,13 @@ public:
getHWInstrDesc(CF_ELSE))
.addImm(0)
.addImm(0);
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
IfThenElseStack.push_back(MIb);
MI->eraseFromParent();
CfCount++;
break;
}
- case AMDGPU::ENDIF: {
+ case R600::ENDIF: {
CFStack.popBranch();
if (LastAlu.back()) {
ToPopAfter.push_back(LastAlu.back());
@@ -626,7 +628,7 @@ public:
.addImm(CfCount + 1)
.addImm(1);
(void)MIb;
- DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
CfCount++;
}
@@ -638,7 +640,7 @@ public:
MI->eraseFromParent();
break;
}
- case AMDGPU::BREAK: {
+ case R600::BREAK: {
CfCount ++;
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_LOOP_BREAK))
@@ -647,7 +649,7 @@ public:
MI->eraseFromParent();
break;
}
- case AMDGPU::CONTINUE: {
+ case R600::CONTINUE: {
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_LOOP_CONTINUE))
.addImm(0);
@@ -656,12 +658,12 @@ public:
CfCount++;
break;
}
- case AMDGPU::RETURN: {
+ case R600::RETURN: {
DebugLoc DL = MBB.findDebugLoc(MI);
BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
CfCount++;
if (CfCount % 2) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
+ BuildMI(MBB, I, DL, TII->get(R600::PAD));
CfCount++;
}
MI->eraseFromParent();
@@ -673,7 +675,7 @@ public:
}
default:
if (TII->isExport(MI->getOpcode())) {
- DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
}
break;
@@ -682,7 +684,7 @@ public:
for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
MachineInstr *Alu = ToPopAfter[i];
BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
- TII->get(AMDGPU::CF_ALU_POP_AFTER))
+ TII->get(R600::CF_ALU_POP_AFTER))
.addImm(Alu->getOperand(0).getImm())
.addImm(Alu->getOperand(1).getImm())
.addImm(Alu->getOperand(2).getImm())
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Defines.h b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h
index 534461adc59f..0d33d82e8e0f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Defines.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h
@@ -23,7 +23,7 @@
#define MO_FLAG_LAST (1 << 6)
#define NUM_MO_FLAGS 7
-/// \brief Helper for getting the operand index for the instruction flags
+/// Helper for getting the operand index for the instruction flags
/// operand.
#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
@@ -52,7 +52,7 @@ namespace R600_InstFlag {
#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
-/// \brief Defines for extracting register information from register encoding
+/// Defines for extracting register information from register encoding
#define HW_REG_MASK 0x1ff
#define HW_CHAN_SHIFT 9
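As a worked example of these defines: with HW_REG_MASK = 0x1ff and HW_CHAN_SHIFT = 9, an encoding value of 0x20a decodes to hardware register index 0x20a & 0x1ff = 10 and channel 0x20a >> 9 = 1 (the Y channel), which is roughly how the getHWRegIndex/getHWRegChan helpers in R600RegisterInfo consume them.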
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 0d8ccd088ec4..1683fe6c9a57 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -19,6 +19,7 @@
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -51,12 +52,12 @@ private:
unsigned OccupiedDwords(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return 4;
- case AMDGPU::KILL:
+ case R600::KILL:
return 0;
default:
break;
@@ -76,7 +77,7 @@ private:
E = MI.operands_end();
It != E; ++It) {
MachineOperand &MO = *It;
- if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+ if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
++NumLiteral;
}
return 1 + NumLiteral;
@@ -88,12 +89,12 @@ private:
if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()))
return true;
switch (MI.getOpcode()) {
- case AMDGPU::PRED_X:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::COPY:
+ case R600::DOT_4:
return true;
default:
return false;
@@ -102,9 +103,9 @@ private:
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
- case AMDGPU::KILL:
- case AMDGPU::RETURN:
- case AMDGPU::IMPLICIT_DEF:
+ case R600::KILL:
+ case R600::RETURN:
+ case R600::IMPLICIT_DEF:
return true;
default:
return false;
@@ -131,16 +132,16 @@ private:
bool UpdateInstr = true) const {
std::vector<std::pair<unsigned, unsigned>> UsedKCache;
- if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
+ if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != R600::DOT_4)
return true;
const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts =
TII->getSrcs(MI);
assert(
- (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
+ (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == R600::DOT_4) &&
"Can't assign Const");
for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
- if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ if (Consts[i].first->getReg() != R600::ALU_CONST)
continue;
unsigned Sel = Consts[i].second;
unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
@@ -171,16 +172,16 @@ private:
return true;
for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
- if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ if (Consts[i].first->getReg() != R600::ALU_CONST)
continue;
switch(UsedKCache[j].first) {
case 0:
Consts[i].first->setReg(
- AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
+ R600::R600_KC0RegClass.getRegister(UsedKCache[j].second));
break;
case 1:
Consts[i].first->setReg(
- AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
+ R600::R600_KC1RegClass.getRegister(UsedKCache[j].second));
break;
default:
llvm_unreachable("Wrong Cache Line");
@@ -252,7 +253,7 @@ private:
break;
if (AluInstCount > TII->getMaxAlusPerClause())
break;
- if (I->getOpcode() == AMDGPU::PRED_X) {
+ if (I->getOpcode() == R600::PRED_X) {
// We put PRED_X in its own clause to ensure that ifcvt won't create
// clauses with more than 128 insts.
// IfCvt is indeed checking that "then" and "else" branches of an if
@@ -288,7 +289,7 @@ private:
AluInstCount += OccupiedDwords(*I);
}
unsigned Opcode = PushBeforeModifier ?
- AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+ R600::CF_ALU_PUSH_BEFORE : R600::CF_ALU;
BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
// We don't use the ADDR field until R600ControlFlowFinalizer pass, where
// it is safe to assume it is 0. However if we always put 0 here, the ifcvt
@@ -321,7 +322,7 @@ public:
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
MachineBasicBlock::iterator I = MBB.begin();
- if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
+ if (I != MBB.end() && I->getOpcode() == R600::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
if (isALU(*I)) {
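A worked example of the constant-selector decomposition used earlier in this pass (Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31): a selector of 2049 yields Chan = 2049 & 3 = 1 and Index = ((2049 >> 2) - 512) & 31 = (512 - 512) & 31 = 0, i.e. roughly the Y channel of constant line 0 as considered for KCache allocation.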
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index ffea231ee4d0..b924ff019dd1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -21,6 +21,7 @@
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -95,16 +96,16 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// Expand LDS_*_RET instructions
if (TII->isLDSRetInstr(MI.getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
assert(DstIdx != -1);
MachineOperand &DstOp = MI.getOperand(DstIdx);
MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
- DstOp.getReg(), AMDGPU::OQAP);
- DstOp.setReg(AMDGPU::OQAP);
+ DstOp.getReg(), R600::OQAP);
+ DstOp.setReg(R600::OQAP);
int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::pred_sel);
+ R600::OpName::pred_sel);
int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
- AMDGPU::OpName::pred_sel);
+ R600::OpName::pred_sel);
// Copy the pred_sel bit
Mov->getOperand(MovPredSelIdx).setReg(
MI.getOperand(LDSPredSelIdx).getReg());
@@ -113,7 +114,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default: break;
// Expand PRED_X to one of the PRED_SET instructions.
- case AMDGPU::PRED_X: {
+ case R600::PRED_X: {
uint64_t Flags = MI.getOperand(3).getImm();
// The native opcode used by PRED_X is stored as an immediate in the
// third operand.
@@ -121,17 +122,18 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.getOperand(2).getImm(), // opcode
MI.getOperand(0).getReg(), // dst
MI.getOperand(1).getReg(), // src0
- AMDGPU::ZERO); // src1
+ R600::ZERO); // src1
TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
if (Flags & MO_FLAG_PUSH) {
- TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
+ TII->setImmOperand(*PredSet, R600::OpName::update_exec_mask, 1);
} else {
- TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
+ TII->setImmOperand(*PredSet, R600::OpName::update_pred, 1);
}
MI.eraseFromParent();
continue;
}
- case AMDGPU::DOT_4: {
+ case R600::DOT_4: {
+
const R600RegisterInfo &TRI = TII->getRegisterInfo();
unsigned DstReg = MI.getOperand(0).getReg();
@@ -140,7 +142,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
for (unsigned Chan = 0; Chan < 4; ++Chan) {
bool Mask = (Chan != TRI.getHWRegChan(DstReg));
unsigned SubDstReg =
- AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
MachineInstr *BMI =
TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
if (Chan > 0) {
@@ -155,10 +157,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// While not strictly necessary from hw point of view, we force
// all src operands of a dot4 inst to belong to the same slot.
unsigned Src0 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+ TII->getOperandIdx(Opcode, R600::OpName::src0))
.getReg();
unsigned Src1 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+ TII->getOperandIdx(Opcode, R600::OpName::src1))
.getReg();
(void) Src0;
(void) Src1;
@@ -205,26 +207,26 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// T0_W = CUBE T1_Y, T1_Z
for (unsigned Chan = 0; Chan < 4; Chan++) {
unsigned DstReg = MI.getOperand(
- TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
+ TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
unsigned Src0 = MI.getOperand(
- TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
+ TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
unsigned Src1 = 0;
// Determine the correct source registers
if (!IsCube) {
- int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
+ int Src1Idx = TII->getOperandIdx(MI, R600::OpName::src1);
if (Src1Idx != -1) {
Src1 = MI.getOperand(Src1Idx).getReg();
}
}
if (IsReduction) {
- unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
Src0 = TRI.getSubReg(Src0, SubRegIndex);
Src1 = TRI.getSubReg(Src1, SubRegIndex);
} else if (IsCube) {
static const int CubeSrcSwz[] = {2, 2, 0, 1};
- unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
- unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+ unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]);
+ unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
Src1 = TRI.getSubReg(Src0, SubRegIndex1);
Src0 = TRI.getSubReg(Src0, SubRegIndex0);
}
@@ -233,14 +235,14 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
bool Mask = false;
bool NotLast = true;
if (IsCube) {
- unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
DstReg = TRI.getSubReg(DstReg, SubRegIndex);
} else {
// Mask the write if the original instruction does not write to
// the current Channel.
Mask = (Chan != TRI.getHWRegChan(DstReg));
unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
- DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ DstReg = R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
}
// Set the IsLast bit
@@ -249,11 +251,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// Add the new instruction
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::CUBE_r600_pseudo:
- Opcode = AMDGPU::CUBE_r600_real;
+ case R600::CUBE_r600_pseudo:
+ Opcode = R600::CUBE_r600_real;
break;
- case AMDGPU::CUBE_eg_pseudo:
- Opcode = AMDGPU::CUBE_eg_real;
+ case R600::CUBE_eg_pseudo:
+ Opcode = R600::CUBE_eg_real;
break;
default:
break;
@@ -270,12 +272,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
if (NotLast) {
TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
}
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
- SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::clamp);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::literal);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_abs);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_abs);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_neg);
+ SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_neg);
}
MI.eraseFromParent();
}
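Throughout the hunks above, the only functional change is which generated table the operand-name lookups go through: indices that used to come from the shared AMDGPU::OpName enum are now taken from the R600-specific one. A minimal sketch of that lookup pattern, assuming only the R600InstrInfo::getOperandIdx overloads already used in this pass (the helper name getNamedSrcReg and its includes are illustrative, not part of the patch):

#include "R600InstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Illustrative only: fetch a source register by named operand, returning 0
// when the instruction has no such operand (getOperandIdx reports -1).
static unsigned getNamedSrcReg(const R600InstrInfo *TII,
                               const MachineInstr &MI, unsigned OpName) {
  int Idx = TII->getOperandIdx(MI.getOpcode(), OpName);
  return Idx < 0 ? 0 : MI.getOperand(Idx).getReg();
}

// Usage mirrors the code above, e.g.:
//   unsigned Src1 = getNamedSrcReg(TII, MI, R600::OpName::src1);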
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 66291d0be4e6..113d6249fa60 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -8,18 +8,18 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Custom DAG lowering for R600
+/// Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600FrameLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -35,13 +35,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -50,17 +50,19 @@
using namespace llvm;
+#include "R600GenCallingConv.inc"
+
R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
const R600Subtarget &STI)
- : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
- addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
- addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
+ : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
+ addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
+ addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
+ addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
+ addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass);
+ addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass);
+ addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass);
- computeRegisterProperties(STI.getRegisterInfo());
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -147,6 +149,11 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f32, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -216,6 +223,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f64, Expand);
}
+ // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
+ // need it for R600.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
+ // need it for R600.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
@@ -245,14 +280,10 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::LOAD);
}
-const R600Subtarget *R600TargetLowering::getSubtarget() const {
- return static_cast<const R600Subtarget *>(Subtarget);
-}
-
static inline bool isEOP(MachineBasicBlock::iterator I) {
if (std::next(I) == I->getParent()->end())
return false;
- return std::next(I)->getOpcode() == AMDGPU::RETURN;
+ return std::next(I)->getOpcode() == R600::RETURN;
}
MachineBasicBlock *
@@ -261,24 +292,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock::iterator I = MI;
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
switch (MI.getOpcode()) {
default:
    // Replace LDS_*_RET instructions that don't have any uses with the
// equivalent LDS_*_NORET instruction.
if (TII->isLDSRetInstr(MI.getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
assert(DstIdx != -1);
MachineInstrBuilder NewMI;
// FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
// LDS_1A2D support and remove this special case.
if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
- MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
+ MI.getOpcode() == R600::LDS_CMPST_RET)
return BB;
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
+ TII->get(R600::getLDSNoRetOp(MI.getOpcode())));
for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
NewMI.add(MI.getOperand(i));
}
@@ -286,31 +317,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
break;
- case AMDGPU::CLAMP_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
- MI.getOperand(1).getReg());
- TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
- break;
- }
- case AMDGPU::FABS_R600: {
+ case R600::FABS_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ *BB, I, R600::MOV, MI.getOperand(0).getReg(),
MI.getOperand(1).getReg());
TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
break;
}
- case AMDGPU::FNEG_R600: {
+ case R600::FNEG_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ *BB, I, R600::MOV, MI.getOperand(0).getReg(),
MI.getOperand(1).getReg());
TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
break;
}
- case AMDGPU::MASK_WRITE: {
+ case R600::MASK_WRITE: {
unsigned maskedRegister = MI.getOperand(0).getReg();
assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
@@ -318,7 +342,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
break;
}
- case AMDGPU::MOV_IMM_F32:
+ case R600::MOV_IMM_F32:
TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
.getFPImm()
->getValueAPF()
@@ -326,39 +350,39 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.getZExtValue());
break;
- case AMDGPU::MOV_IMM_I32:
+ case R600::MOV_IMM_I32:
TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
MI.getOperand(1).getImm());
break;
- case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
+ case R600::MOV_IMM_GLOBAL_ADDR: {
//TODO: Perhaps combine this instruction with the next if possible
auto MIB = TII->buildDefaultInstruction(
- *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
- int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
+ *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X);
+ int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
//TODO: Ugh this is rather ugly
MIB->getOperand(Idx) = MI.getOperand(1);
break;
}
- case AMDGPU::CONST_COPY: {
+ case R600::CONST_COPY: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
- *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
- TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
+ *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST);
+ TII->setImmOperand(*NewMI, R600::OpName::src0_sel,
MI.getOperand(1).getImm());
break;
}
- case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
+ case R600::RAT_WRITE_CACHELESS_32_eg:
+ case R600::RAT_WRITE_CACHELESS_64_eg:
+ case R600::RAT_WRITE_CACHELESS_128_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.addImm(isEOP(I)); // Set End of program bit
break;
- case AMDGPU::RAT_STORE_TYPED_eg:
+ case R600::RAT_STORE_TYPED_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
@@ -366,49 +390,49 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addImm(isEOP(I)); // Set End of program bit
break;
- case AMDGPU::BRANCH:
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ case R600::BRANCH:
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP))
.add(MI.getOperand(0));
break;
- case AMDGPU::BRANCH_COND_f32: {
+ case R600::BRANCH_COND_f32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
+ R600::PREDICATE_BIT)
.add(MI.getOperand(1))
- .addImm(AMDGPU::PRED_SETNE)
+ .addImm(R600::PRED_SETNE)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
.add(MI.getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
break;
}
- case AMDGPU::BRANCH_COND_i32: {
+ case R600::BRANCH_COND_i32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
+ R600::PREDICATE_BIT)
.add(MI.getOperand(1))
- .addImm(AMDGPU::PRED_SETNE_INT)
+ .addImm(R600::PRED_SETNE_INT)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
.add(MI.getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
break;
}
- case AMDGPU::EG_ExportSwz:
- case AMDGPU::R600_ExportSwz: {
+ case R600::EG_ExportSwz:
+ case R600::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
bool isLastInstructionOfItsType = true;
unsigned InstExportType = MI.getOperand(1).getImm();
for (MachineBasicBlock::iterator NextExportInst = std::next(I),
EndBlock = BB->end(); NextExportInst != EndBlock;
NextExportInst = std::next(NextExportInst)) {
- if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
- NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
+ if (NextExportInst->getOpcode() == R600::EG_ExportSwz ||
+ NextExportInst->getOpcode() == R600::R600_ExportSwz) {
unsigned CurrentInstExportType = NextExportInst->getOperand(1)
.getImm();
if (CurrentInstExportType == InstExportType) {
@@ -420,7 +444,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool EOP = isEOP(I);
if (!EOP && !isLastInstructionOfItsType)
return BB;
- unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+ unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40;
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
@@ -433,7 +457,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addImm(EOP);
break;
}
- case AMDGPU::RETURN: {
+ case R600::RETURN: {
return BB;
}
}
@@ -478,7 +502,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_store_swizzle: {
+ case Intrinsic::r600_store_swizzle: {
SDLoc DL(Op);
const SDValue Args[8] = {
Chain,
@@ -505,14 +529,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
EVT VT = Op.getValueType();
SDLoc DL(Op);
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_tex:
- case AMDGPUIntrinsic::r600_texc: {
+ case Intrinsic::r600_tex:
+ case Intrinsic::r600_texc: {
unsigned TextureOp;
switch (IntrinsicID) {
- case AMDGPUIntrinsic::r600_tex:
+ case Intrinsic::r600_tex:
TextureOp = 0;
break;
- case AMDGPUIntrinsic::r600_texc:
+ case Intrinsic::r600_texc:
TextureOp = 1;
break;
default:
@@ -542,7 +566,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
};
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
}
- case AMDGPUIntrinsic::r600_dot4: {
+ case Intrinsic::r600_dot4: {
SDValue Args[8] = {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
DAG.getConstant(0, DL, MVT::i32)),
@@ -566,7 +590,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_implicitarg_ptr: {
MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
- uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
return DAG.getConstant(ByteOffset, DL, PtrVT);
}
case Intrinsic::r600_read_ngroups_x:
@@ -589,23 +613,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
+ R600::T0_Z, VT);
case Intrinsic::r600_recipsqrt_ieee:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
@@ -755,7 +779,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
DAG.getNode(ISD::FADD, DL, VT, FractPart,
DAG.getConstantFP(-0.5, DL, MVT::f32)));
- if (Gen >= R600Subtarget::R700)
+ if (Gen >= AMDGPUSubtarget::R700)
return TrigVal;
// On R600 hw, COS/SIN input must be between -Pi and Pi.
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
@@ -1527,7 +1551,7 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+ const R600FrameLowering *TFL = Subtarget->getFrameLowering();
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
@@ -1539,6 +1563,28 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
Op.getValueType());
}
+CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) const {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ llvm_unreachable("kernels should not be handled here");
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_LS:
+ return CC_R600;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
@@ -1550,8 +1596,6 @@ SDValue R600TargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
MachineFunction &MF = DAG.getMachineFunction();
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-
SmallVector<ISD::InputArg, 8> LocalIns;
if (AMDGPU::isShader(CallConv)) {
@@ -1571,7 +1615,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
if (AMDGPU::isShader(CallConv)) {
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
continue;
@@ -1602,19 +1646,18 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
- unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(
ISD::UNINDEXED, Ext, VT, DL, Chain,
- DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+ DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
+ PtrInfo,
MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
- MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
}
return Chain;
}
@@ -1989,26 +2032,26 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
SDValue &Src, SDValue &Neg, SDValue &Abs,
SDValue &Sel, SDValue &Imm,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
if (!Src.isMachineOpcode())
return false;
switch (Src.getMachineOpcode()) {
- case AMDGPU::FNEG_R600:
+ case R600::FNEG_R600:
if (!Neg.getNode())
return false;
Src = Src.getOperand(0);
Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
return true;
- case AMDGPU::FABS_R600:
+ case R600::FABS_R600:
if (!Abs.getNode())
return false;
Src = Src.getOperand(0);
Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
return true;
- case AMDGPU::CONST_COPY: {
+ case R600::CONST_COPY: {
unsigned Opcode = ParentNode->getMachineOpcode();
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
if (!Sel.getNode())
return false;
@@ -2019,17 +2062,17 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
// Gather constants values
int SrcIndices[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0),
+ TII->getOperandIdx(Opcode, R600::OpName::src1),
+ TII->getOperandIdx(Opcode, R600::OpName::src2),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_W)
};
std::vector<unsigned> Consts;
for (int OtherSrcIdx : SrcIndices) {
@@ -2042,7 +2085,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
if (RegisterSDNode *Reg =
dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
- if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ if (Reg->getReg() == R600::ALU_CONST) {
ConstantSDNode *Cst
= cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
Consts.push_back(Cst->getZExtValue());
@@ -2057,30 +2100,30 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
Sel = CstOffset;
- Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+ Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
return true;
}
- case AMDGPU::MOV_IMM_GLOBAL_ADDR:
+ case R600::MOV_IMM_GLOBAL_ADDR:
// Check if the Imm slot is used. Taken from below.
if (cast<ConstantSDNode>(Imm)->getZExtValue())
return false;
Imm = Src.getOperand(0);
- Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
+ Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
return true;
- case AMDGPU::MOV_IMM_I32:
- case AMDGPU::MOV_IMM_F32: {
- unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+ case R600::MOV_IMM_I32:
+ case R600::MOV_IMM_F32: {
+ unsigned ImmReg = R600::ALU_LITERAL_X;
uint64_t ImmValue = 0;
- if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+ if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
float FloatValue = FPC->getValueAPF().convertToFloat();
if (FloatValue == 0.0) {
- ImmReg = AMDGPU::ZERO;
+ ImmReg = R600::ZERO;
} else if (FloatValue == 0.5) {
- ImmReg = AMDGPU::HALF;
+ ImmReg = R600::HALF;
} else if (FloatValue == 1.0) {
- ImmReg = AMDGPU::ONE;
+ ImmReg = R600::ONE;
} else {
ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
}
@@ -2088,9 +2131,9 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
uint64_t Value = C->getZExtValue();
if (Value == 0) {
- ImmReg = AMDGPU::ZERO;
+ ImmReg = R600::ZERO;
} else if (Value == 1) {
- ImmReg = AMDGPU::ONE_INT;
+ ImmReg = R600::ONE_INT;
} else {
ImmValue = Value;
}
@@ -2099,7 +2142,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
// Check that we aren't already using an immediate.
// XXX: It's possible for an instruction to have more than one
// immediate operand, but this is not supported yet.
- if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+ if (ImmReg == R600::ALU_LITERAL_X) {
if (!Imm.getNode())
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
@@ -2116,10 +2159,10 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
}
-/// \brief Fold the instructions after selecting them
+/// Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+ const R600InstrInfo *TII = Subtarget->getInstrInfo();
if (!Node->isMachineOpcode())
return Node;
@@ -2128,36 +2171,36 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
- if (Opcode == AMDGPU::DOT_4) {
+ if (Opcode == R600::DOT_4) {
int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_W)
};
int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
};
int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
};
for (unsigned i = 0; i < 8; i++) {
if (OperandIdx[i] < 0)
@@ -2165,7 +2208,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue &Src = Ops[OperandIdx[i] - 1];
SDValue &Neg = Ops[NegIdx[i] - 1];
SDValue &Abs = Ops[AbsIdx[i] - 1];
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
if (HasDst)
SelIdx--;
@@ -2173,42 +2216,28 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+ } else if (Opcode == R600::REG_SEQUENCE) {
for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
SDValue &Src = Ops[i];
if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- } else if (Opcode == AMDGPU::CLAMP_R600) {
- SDValue Src = Node->getOperand(0);
- if (!Src.isMachineOpcode() ||
- !TII->hasInstrModifiers(Src.getMachineOpcode()))
- return Node;
- int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
- AMDGPU::OpName::clamp);
- if (ClampIdx < 0)
- return Node;
- SDLoc DL(Node);
- std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
- Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
- return DAG.getMachineNode(Src.getMachineOpcode(), DL,
- Node->getVTList(), Ops);
} else {
if (!TII->hasInstrModifiers(Opcode))
return Node;
int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+ TII->getOperandIdx(Opcode, R600::OpName::src0),
+ TII->getOperandIdx(Opcode, R600::OpName::src1),
+ TII->getOperandIdx(Opcode, R600::OpName::src2)
};
int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+ TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
+ TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
};
int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+ TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
+ TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
-1
};
for (unsigned i = 0; i < 3; i++) {
@@ -2218,9 +2247,9 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue &Neg = Ops[NegIdx[i] - 1];
SDValue FakeAbs;
SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
- int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+ int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
if (HasDst) {
SelIdx--;
ImmIdx--;
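Before moving on to the header, note that the immediate-folding case kept above (now spelled with R600:: registers) boils down to a small value-to-register mapping: a handful of floats have dedicated inline-constant registers, and everything else must occupy the single ALU_LITERAL_X slot. A compact sketch of that mapping, assuming the generated R600 register enum is available via MCTargetDesc/AMDGPUMCTargetDesc.h as elsewhere in this patch; pickImmReg is an illustrative name, not an API in the patch:

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Illustrative only: mirrors the MOV_IMM_F32 case of FoldOperand above.
// 0.0, 0.5 and 1.0 map to inline-constant registers; anything else is
// returned through the literal slot.
static unsigned pickImmReg(const APFloat &FP, uint64_t &Literal) {
  const float V = FP.convertToFloat();
  if (V == 0.0f) return R600::ZERO;
  if (V == 0.5f) return R600::HALF;
  if (V == 1.0f) return R600::ONE;
  Literal = FP.bitcastToAPInt().getZExtValue();
  return R600::ALU_LITERAL_X;
}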
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 2a774693f02b..907d1f10e151 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 DAG Lowering interface definition
+/// R600 DAG Lowering interface definition
//
//===----------------------------------------------------------------------===//
@@ -23,6 +23,8 @@ class R600InstrInfo;
class R600Subtarget;
class R600TargetLowering final : public AMDGPUTargetLowering {
+
+ const R600Subtarget *Subtarget;
public:
R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI);
@@ -36,6 +38,7 @@ public:
void ReplaceNodeResults(SDNode * N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
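The new CCAssignFnForCall hook declared above hands back the TableGen-generated CC_R600 assignment function for shader calling conventions. One plausible way such a hook is consumed when analyzing incoming arguments (a sketch only; CallConv, isVarArg, Ins and DAG stand for the usual LowerFormalArguments parameters, and this exact wiring is not shown in the hunk):

// Illustrative only: feed the hook's result into a CCState, as
// argument-lowering code typically does.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
               *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));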
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
index 61106ed42e64..687a9affa138 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
+def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">;
def isR600toCayman : Predicate<
- "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
+ "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
class R600Pat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
let SubtargetPredicate = isR600toCayman;
@@ -41,7 +41,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
bit LDS_1A2D = 0;
let SubtargetPredicate = isR600toCayman;
- let Namespace = "AMDGPU";
+ let Namespace = "R600";
let OutOperandList = outs;
let InOperandList = ins;
let AsmString = asm;
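The Namespace = "R600" change above is what drives the rest of this patch: TableGen now emits the R600 opcode and register enumerators into the llvm::R600 namespace instead of llvm::AMDGPU, which is why the .cpp files switch to the R600Gen*.inc headers. A sketch of the consumer side, assuming the conventional guard macro for the generated opcode enum (the exact set of guards each file uses is visible in the R600InstrInfo.cpp hunk that follows):

// Illustrative only: pulling in the opcode enum generated from the .td
// files; with Namespace = "R600" the enumerators land under llvm::R600.
#define GET_INSTRINFO_ENUM
#include "R600GenInstrInfo.inc"   // R600::MOV, R600::CUBE_eg_real, ...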
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 23e646c8147c..5397e779474c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Implementation of TargetInstrInfo.
+/// R600 Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//
@@ -19,6 +19,7 @@
#include "R600Defines.h"
#include "R600FrameLowering.h"
#include "R600RegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
@@ -44,10 +45,15 @@
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
-#include "AMDGPUGenDFAPacketizer.inc"
+#include "R600GenDFAPacketizer.inc"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#define GET_INSTRINFO_NAMED_OPS
+#include "R600GenInstrInfo.inc"
R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
- : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+ : R600GenInstrInfo(-1, -1), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
@@ -58,31 +64,31 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
unsigned VectorComponents = 0;
- if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
- AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
- (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
- AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
+ if ((R600::R600_Reg128RegClass.contains(DestReg) ||
+ R600::R600_Reg128VerticalRegClass.contains(DestReg)) &&
+ (R600::R600_Reg128RegClass.contains(SrcReg) ||
+ R600::R600_Reg128VerticalRegClass.contains(SrcReg))) {
VectorComponents = 4;
- } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
- AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
- (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
- AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
+ } else if((R600::R600_Reg64RegClass.contains(DestReg) ||
+ R600::R600_Reg64VerticalRegClass.contains(DestReg)) &&
+ (R600::R600_Reg64RegClass.contains(SrcReg) ||
+ R600::R600_Reg64VerticalRegClass.contains(SrcReg))) {
VectorComponents = 2;
}
if (VectorComponents > 0) {
for (unsigned I = 0; I < VectorComponents; I++) {
- unsigned SubRegIndex = RI.getSubRegFromChannel(I);
- buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+ unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I);
+ buildDefaultInstruction(MBB, MI, R600::MOV,
RI.getSubReg(DestReg, SubRegIndex),
RI.getSubReg(SrcReg, SubRegIndex))
.addReg(DestReg,
RegState::Define | RegState::Implicit);
}
} else {
- MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+ MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, R600::MOV,
DestReg, SrcReg);
- NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
+ NewMI->getOperand(getOperandIdx(*NewMI, R600::OpName::src0))
.setIsKill(KillSrc);
}
}
@@ -103,9 +109,9 @@ bool R600InstrInfo::isMov(unsigned Opcode) const {
switch(Opcode) {
default:
return false;
- case AMDGPU::MOV:
- case AMDGPU::MOV_IMM_F32:
- case AMDGPU::MOV_IMM_I32:
+ case R600::MOV:
+ case R600::MOV_IMM_F32:
+ case R600::MOV_IMM_I32:
return true;
}
}
@@ -117,10 +123,10 @@ bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
switch(Opcode) {
default: return false;
- case AMDGPU::CUBE_r600_pseudo:
- case AMDGPU::CUBE_r600_real:
- case AMDGPU::CUBE_eg_pseudo:
- case AMDGPU::CUBE_eg_real:
+ case R600::CUBE_r600_pseudo:
+ case R600::CUBE_r600_real:
+ case R600::CUBE_eg_pseudo:
+ case R600::CUBE_eg_real:
return true;
}
}
@@ -148,7 +154,7 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
}
bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
- return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+ return isLDSInstr(Opcode) && getOperandIdx(Opcode, R600::OpName::dst) != -1;
}
bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
@@ -157,12 +163,12 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
if (isVector(MI) || isCubeOp(MI.getOpcode()))
return true;
switch (MI.getOpcode()) {
- case AMDGPU::PRED_X:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::COPY:
+ case R600::DOT_4:
return true;
default:
return false;
@@ -172,7 +178,7 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
if (ST.hasCaymanISA())
return false;
- return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
+ return (get(Opcode).getSchedClass() == R600::Sched::TransALU);
}
bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
@@ -180,7 +186,7 @@ bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
}
bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
- return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+ return (get(Opcode).getSchedClass() == R600::Sched::VecALU);
}
bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const {
@@ -214,8 +220,8 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
switch (Opcode) {
- case AMDGPU::KILLGT:
- case AMDGPU::GROUP_BARRIER:
+ case R600::KILLGT:
+ case R600::GROUP_BARRIER:
return true;
default:
return false;
@@ -223,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
}
bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+ return MI.findRegisterUseOperandIdx(R600::AR_X) != -1;
}
bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+ return MI.findRegisterDefOperandIdx(R600::AR_X) != -1;
}
bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
@@ -241,7 +247,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
TargetRegisterInfo::isVirtualRegister(I->getReg()))
continue;
- if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+ if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
return true;
}
return false;
@@ -249,17 +255,17 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
static const unsigned SrcSelTable[][2] = {
- {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
- {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
- {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
- {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
- {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
- {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
- {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
- {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
- {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
- {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
- {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}
+ {R600::OpName::src0, R600::OpName::src0_sel},
+ {R600::OpName::src1, R600::OpName::src1_sel},
+ {R600::OpName::src2, R600::OpName::src2_sel},
+ {R600::OpName::src0_X, R600::OpName::src0_sel_X},
+ {R600::OpName::src0_Y, R600::OpName::src0_sel_Y},
+ {R600::OpName::src0_Z, R600::OpName::src0_sel_Z},
+ {R600::OpName::src0_W, R600::OpName::src0_sel_W},
+ {R600::OpName::src1_X, R600::OpName::src1_sel_X},
+ {R600::OpName::src1_Y, R600::OpName::src1_sel_Y},
+ {R600::OpName::src1_Z, R600::OpName::src1_sel_Z},
+ {R600::OpName::src1_W, R600::OpName::src1_sel_W}
};
for (const auto &Row : SrcSelTable) {
@@ -274,23 +280,23 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
R600InstrInfo::getSrcs(MachineInstr &MI) const {
SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
- if (MI.getOpcode() == AMDGPU::DOT_4) {
+ if (MI.getOpcode() == R600::DOT_4) {
static const unsigned OpTable[8][2] = {
- {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
- {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
- {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
- {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
- {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
- {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
- {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
- {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W},
+ {R600::OpName::src0_X, R600::OpName::src0_sel_X},
+ {R600::OpName::src0_Y, R600::OpName::src0_sel_Y},
+ {R600::OpName::src0_Z, R600::OpName::src0_sel_Z},
+ {R600::OpName::src0_W, R600::OpName::src0_sel_W},
+ {R600::OpName::src1_X, R600::OpName::src1_sel_X},
+ {R600::OpName::src1_Y, R600::OpName::src1_sel_Y},
+ {R600::OpName::src1_Z, R600::OpName::src1_sel_Z},
+ {R600::OpName::src1_W, R600::OpName::src1_sel_W},
};
for (unsigned j = 0; j < 8; j++) {
MachineOperand &MO =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
+ if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
@@ -302,9 +308,9 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
}
static const unsigned OpTable[3][2] = {
- {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
- {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
- {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+ {R600::OpName::src0, R600::OpName::src0_sel},
+ {R600::OpName::src1, R600::OpName::src1_sel},
+ {R600::OpName::src2, R600::OpName::src2_sel},
};
for (unsigned j = 0; j < 3; j++) {
@@ -313,15 +319,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
break;
MachineOperand &MO = MI.getOperand(SrcIdx);
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
+ if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
- if (Reg == AMDGPU::ALU_LITERAL_X) {
+ if (Reg == R600::ALU_LITERAL_X) {
MachineOperand &Operand =
- MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ MI.getOperand(getOperandIdx(MI.getOpcode(), R600::OpName::literal));
if (Operand.isImm()) {
Result.push_back(std::make_pair(&MO, Operand.getImm()));
continue;
@@ -345,7 +351,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
++i;
unsigned Reg = Src.first->getReg();
int Index = RI.getEncodingValue(Reg) & 0xff;
- if (Reg == AMDGPU::OQAP) {
+ if (Reg == R600::OQAP) {
Result.push_back(std::make_pair(Index, 0U));
}
if (PV.find(Reg) != PV.end()) {
@@ -435,7 +441,7 @@ unsigned R600InstrInfo::isLegalUpTo(
const std::pair<int, unsigned> &Src = Srcs[j];
if (Src.first < 0 || Src.first == 255)
continue;
- if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+ if (Src.first == GET_REG_INDEX(RI.getEncodingValue(R600::OQAP))) {
if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
// The value from output queue A (denoted by register OQAP) can
@@ -541,7 +547,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
unsigned Op = getOperandIdx(IG[i]->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
+ R600::OpName::bank_swizzle);
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
IG[i]->getOperand(Op).getImm());
}
@@ -610,14 +616,14 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
continue;
for (const auto &Src : getSrcs(MI)) {
- if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
+ if (Src.first->getReg() == R600::ALU_LITERAL_X)
Literals.insert(Src.second);
if (Literals.size() > 4)
return false;
- if (Src.first->getReg() == AMDGPU::ALU_CONST)
+ if (Src.first->getReg() == R600::ALU_CONST)
Consts.push_back(Src.second);
- if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
- AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
+ if (R600::R600_KC0RegClass.contains(Src.first->getReg()) ||
+ R600::R600_KC1RegClass.contains(Src.first->getReg())) {
unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
unsigned Chan = RI.getHWRegChan(Src.first->getReg());
Consts.push_back((Index << 2) | Chan);
@@ -636,7 +642,7 @@ R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
static bool
isPredicateSetter(unsigned Opcode) {
switch (Opcode) {
- case AMDGPU::PRED_X:
+ case R600::PRED_X:
return true;
default:
return false;
@@ -658,12 +664,12 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
static
bool isJump(unsigned Opcode) {
- return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+ return Opcode == R600::JUMP || Opcode == R600::JUMP_COND;
}
static bool isBranch(unsigned Opcode) {
- return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
- Opcode == AMDGPU::BRANCH_COND_f32;
+ return Opcode == R600::BRANCH || Opcode == R600::BRANCH_COND_i32 ||
+ Opcode == R600::BRANCH_COND_f32;
}
bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
@@ -678,7 +684,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (I == MBB.end())
return false;
- // AMDGPU::BRANCH* instructions are only available after isel and are not
+ // R600::BRANCH* instructions are only available after isel and are not
// handled
if (isBranch(I->getOpcode()))
return true;
@@ -687,7 +693,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
}
// Remove successive JUMP
- while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
+ while (I != MBB.begin() && std::prev(I)->getOpcode() == R600::JUMP) {
MachineBasicBlock::iterator PriorI = std::prev(I);
if (AllowModify)
I->removeFromParent();
@@ -698,10 +704,10 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst.getOpcode();
if (I == MBB.begin() || !isJump((--I)->getOpcode())) {
- if (LastOpc == AMDGPU::JUMP) {
+ if (LastOpc == R600::JUMP) {
TBB = LastInst.getOperand(0).getMBB();
return false;
- } else if (LastOpc == AMDGPU::JUMP_COND) {
+ } else if (LastOpc == R600::JUMP_COND) {
auto predSet = I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
@@ -709,7 +715,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
TBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
- Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false));
return false;
}
return true; // Can't handle indirect branch.
@@ -720,7 +726,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
unsigned SecondLastOpc = SecondLastInst.getOpcode();
// If the block ends with a B and a Bcc, handle it.
- if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
+ if (SecondLastOpc == R600::JUMP_COND && LastOpc == R600::JUMP) {
auto predSet = --I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
@@ -729,7 +735,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
FBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
- Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false));
return false;
}
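analyzeBranch above encodes a conditional terminator as a three-entry Cond vector, and the insertBranch, removeBranch and reverseBranchCondition hunks further down read it back in the same order. A comment-only sketch of that layout, inferred from the PRED_X operand order used in this file:

// Illustrative only: layout of the Cond vector built by analyzeBranch.
//   Cond[0] - PRED_X operand 1: the value being tested
//   Cond[1] - PRED_X operand 2: the compare opcode immediate
//             (R600::PRED_SETE, R600::PRED_SETNE_INT, ...)
//   Cond[2] - a predicate-select register
//             (R600::PRED_SEL_ONE / R600::PRED_SEL_ZERO)
// reverseBranchCondition flips Cond[1] and Cond[2]; insertBranch writes
// Cond[1] back into the predicate setter before emitting JUMP_COND.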
@@ -741,8 +747,8 @@ static
MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
It != E; ++It) {
- if (It->getOpcode() == AMDGPU::CF_ALU ||
- It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (It->getOpcode() == R600::CF_ALU ||
+ It->getOpcode() == R600::CF_ALU_PUSH_BEFORE)
return It.getReverse();
}
return MBB.end();
@@ -759,7 +765,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
if (!FBB) {
if (Cond.empty()) {
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(TBB);
return 1;
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
@@ -767,14 +773,14 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+ BuildMI(&MBB, DL, get(R600::JUMP_COND))
.addMBB(TBB)
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
return 1;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
- CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+ assert (CfAlu->getOpcode() == R600::CF_ALU);
+ CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE));
return 1;
}
} else {
@@ -782,15 +788,15 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
assert(PredSet && "No previous predicate !");
addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+ BuildMI(&MBB, DL, get(R600::JUMP_COND))
.addMBB(TBB)
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+ .addReg(R600::PREDICATE_BIT, RegState::Kill);
+ BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB);
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
return 2;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
- CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+ assert (CfAlu->getOpcode() == R600::CF_ALU);
+ CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE));
return 2;
}
}
@@ -811,18 +817,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
switch (I->getOpcode()) {
default:
return 0;
- case AMDGPU::JUMP_COND: {
+ case R600::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
break;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
- CfAlu->setDesc(get(AMDGPU::CF_ALU));
+ assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE);
+ CfAlu->setDesc(get(R600::CF_ALU));
break;
}
- case AMDGPU::JUMP:
+ case R600::JUMP:
I->eraseFromParent();
break;
}
@@ -836,18 +842,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
// FIXME: only one case??
default:
return 1;
- case AMDGPU::JUMP_COND: {
+ case R600::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
break;
- assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
- CfAlu->setDesc(get(AMDGPU::CF_ALU));
+ assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE);
+ CfAlu->setDesc(get(R600::CF_ALU));
break;
}
- case AMDGPU::JUMP:
+ case R600::JUMP:
I->eraseFromParent();
break;
}
@@ -862,9 +868,9 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
unsigned Reg = MI.getOperand(idx).getReg();
switch (Reg) {
default: return false;
- case AMDGPU::PRED_SEL_ONE:
- case AMDGPU::PRED_SEL_ZERO:
- case AMDGPU::PREDICATE_BIT:
+ case R600::PRED_SEL_ONE:
+ case R600::PRED_SEL_ZERO:
+ case R600::PREDICATE_BIT:
return true;
}
}
@@ -875,9 +881,9 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
// be predicated. Until we have proper support for instruction clauses in the
// backend, we will mark KILL* instructions as unpredicable.
- if (MI.getOpcode() == AMDGPU::KILLGT) {
+ if (MI.getOpcode() == R600::KILLGT) {
return false;
- } else if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ } else if (MI.getOpcode() == R600::CF_ALU) {
    // If the clause starts in the middle of the MBB then the MBB has more
    // than a single clause, and we cannot predicate several clauses.
if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI))
@@ -887,7 +893,7 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
} else if (isVector(MI)) {
return false;
} else {
- return AMDGPUInstrInfo::isPredicable(MI);
+ return TargetInstrInfo::isPredicable(MI);
}
}
@@ -928,17 +934,17 @@ bool
R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
MachineOperand &MO = Cond[1];
switch (MO.getImm()) {
- case AMDGPU::PRED_SETE_INT:
- MO.setImm(AMDGPU::PRED_SETNE_INT);
+ case R600::PRED_SETE_INT:
+ MO.setImm(R600::PRED_SETNE_INT);
break;
- case AMDGPU::PRED_SETNE_INT:
- MO.setImm(AMDGPU::PRED_SETE_INT);
+ case R600::PRED_SETNE_INT:
+ MO.setImm(R600::PRED_SETE_INT);
break;
- case AMDGPU::PRED_SETE:
- MO.setImm(AMDGPU::PRED_SETNE);
+ case R600::PRED_SETE:
+ MO.setImm(R600::PRED_SETNE);
break;
- case AMDGPU::PRED_SETNE:
- MO.setImm(AMDGPU::PRED_SETE);
+ case R600::PRED_SETNE:
+ MO.setImm(R600::PRED_SETE);
break;
default:
return true;
@@ -946,11 +952,11 @@ R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con
MachineOperand &MO2 = Cond[2];
switch (MO2.getReg()) {
- case AMDGPU::PRED_SEL_ZERO:
- MO2.setReg(AMDGPU::PRED_SEL_ONE);
+ case R600::PRED_SEL_ZERO:
+ MO2.setReg(R600::PRED_SEL_ONE);
break;
- case AMDGPU::PRED_SEL_ONE:
- MO2.setReg(AMDGPU::PRED_SEL_ZERO);
+ case R600::PRED_SEL_ONE:
+ MO2.setReg(R600::PRED_SEL_ZERO);
break;
default:
return true;
@@ -967,22 +973,22 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
int PIdx = MI.findFirstPredOperandIdx();
- if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ if (MI.getOpcode() == R600::CF_ALU) {
MI.getOperand(8).setImm(0);
return true;
}
- if (MI.getOpcode() == AMDGPU::DOT_4) {
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X))
+ if (MI.getOpcode() == R600::DOT_4) {
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_X))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Y))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Z))
.setReg(Pred[2].getReg());
- MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W))
+ MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W))
.setReg(Pred[2].getReg());
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
- MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+ MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -990,7 +996,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
- MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+ MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -1020,20 +1026,20 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
default: {
MachineBasicBlock *MBB = MI.getParent();
int OffsetOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::addr);
// addr is a custom operand with multiple MI operands, and only the
// first MI operand is given a name.
int RegOpIdx = OffsetOpIdx + 1;
int ChanOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::chan);
if (isRegisterLoad(MI)) {
int DstOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::dst);
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
getIndirectAddrRegClass()->getRegister(Address));
} else {
@@ -1042,12 +1048,12 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
} else if (isRegisterStore(MI)) {
int ValOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val);
+ R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::val);
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
MI.getOperand(ValOpIdx).getReg());
} else {
@@ -1062,15 +1068,15 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB->erase(MI);
return true;
}
- case AMDGPU::R600_EXTRACT_ELT_V2:
- case AMDGPU::R600_EXTRACT_ELT_V4:
+ case R600::R600_EXTRACT_ELT_V2:
+ case R600::R600_EXTRACT_ELT_V4:
buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(),
RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
MI.getOperand(2).getReg(),
RI.getHWRegChan(MI.getOperand(1).getReg()));
break;
- case AMDGPU::R600_INSERT_ELT_V2:
- case AMDGPU::R600_INSERT_ELT_V4:
+ case R600::R600_INSERT_ELT_V2:
+ case R600::R600_INSERT_ELT_V4:
buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value
RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
MI.getOperand(3).getReg(), // Offset
@@ -1082,7 +1088,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const {
+ const MachineFunction &MF,
+ const R600RegisterInfo &TRI) const {
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600FrameLowering *TFL = ST.getFrameLowering();
@@ -1093,17 +1100,15 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
return;
for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
- unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
- Reserved.set(SuperReg);
for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
- unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
- Reserved.set(Reg);
+ unsigned Reg = R600::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
+ TRI.reserveRegisterTuples(Reserved, Reg);
}
}
}
const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
- return &AMDGPU::R600_TReg32_XRegClass;
+ return &R600::R600_TReg32_XRegClass;
}
MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
@@ -1121,20 +1126,20 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
unsigned AddrReg;
switch (AddrChan) {
default: llvm_unreachable("Invalid Channel");
- case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
- case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
- case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
- case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break;
}
- MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
- AMDGPU::AR_X, OffsetReg);
- setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg,
+ R600::AR_X, OffsetReg);
+ setImmOperand(*MOVA, R600::OpName::write, 0);
- MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV,
AddrReg, ValueReg)
- .addReg(AMDGPU::AR_X,
+ .addReg(R600::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1);
+ setImmOperand(*Mov, R600::OpName::dst_rel, 1);
return Mov;
}
@@ -1153,21 +1158,21 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
unsigned AddrReg;
switch (AddrChan) {
default: llvm_unreachable("Invalid Channel");
- case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
- case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
- case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
- case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break;
}
- MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
- AMDGPU::AR_X,
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg,
+ R600::AR_X,
OffsetReg);
- setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
- MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+ setImmOperand(*MOVA, R600::OpName::write, 0);
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV,
ValueReg,
AddrReg)
- .addReg(AMDGPU::AR_X,
+ .addReg(R600::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1);
+ setImmOperand(*Mov, R600::OpName::src0_rel, 1);
return Mov;
}
@@ -1265,7 +1270,7 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
//XXX: The r600g finalizer expects this to be 1, once we've moved the
//scheduling to the backend, we can change the default to 0.
MIB.addImm(1) // $last
- .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
+ .addReg(R600::PRED_SEL_OFF) // $pred_sel
.addImm(0) // $literal
.addImm(0); // $bank_swizzle
@@ -1286,23 +1291,23 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
switch (Op) {
- OPERAND_CASE(AMDGPU::OpName::update_exec_mask)
- OPERAND_CASE(AMDGPU::OpName::update_pred)
- OPERAND_CASE(AMDGPU::OpName::write)
- OPERAND_CASE(AMDGPU::OpName::omod)
- OPERAND_CASE(AMDGPU::OpName::dst_rel)
- OPERAND_CASE(AMDGPU::OpName::clamp)
- OPERAND_CASE(AMDGPU::OpName::src0)
- OPERAND_CASE(AMDGPU::OpName::src0_neg)
- OPERAND_CASE(AMDGPU::OpName::src0_rel)
- OPERAND_CASE(AMDGPU::OpName::src0_abs)
- OPERAND_CASE(AMDGPU::OpName::src0_sel)
- OPERAND_CASE(AMDGPU::OpName::src1)
- OPERAND_CASE(AMDGPU::OpName::src1_neg)
- OPERAND_CASE(AMDGPU::OpName::src1_rel)
- OPERAND_CASE(AMDGPU::OpName::src1_abs)
- OPERAND_CASE(AMDGPU::OpName::src1_sel)
- OPERAND_CASE(AMDGPU::OpName::pred_sel)
+ OPERAND_CASE(R600::OpName::update_exec_mask)
+ OPERAND_CASE(R600::OpName::update_pred)
+ OPERAND_CASE(R600::OpName::write)
+ OPERAND_CASE(R600::OpName::omod)
+ OPERAND_CASE(R600::OpName::dst_rel)
+ OPERAND_CASE(R600::OpName::clamp)
+ OPERAND_CASE(R600::OpName::src0)
+ OPERAND_CASE(R600::OpName::src0_neg)
+ OPERAND_CASE(R600::OpName::src0_rel)
+ OPERAND_CASE(R600::OpName::src0_abs)
+ OPERAND_CASE(R600::OpName::src0_sel)
+ OPERAND_CASE(R600::OpName::src1)
+ OPERAND_CASE(R600::OpName::src1_neg)
+ OPERAND_CASE(R600::OpName::src1_rel)
+ OPERAND_CASE(R600::OpName::src1_abs)
+ OPERAND_CASE(R600::OpName::src1_sel)
+ OPERAND_CASE(R600::OpName::pred_sel)
default:
llvm_unreachable("Wrong Operand");
}
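
Editor's note: getSlotedOps relies on an OPERAND_CASE helper macro defined outside the lines shown here; it maps a generic operand name such as src0 to the per-slot variant (src0_X, src0_Y, src0_Z or src0_W) that the four-slot DOT_4 pseudo actually carries. The self-contained sketch below reproduces that token-pasting idea with invented enum values; the exact shape of the real macro is an assumption, only the naming convention comes from the diff.

#include <cassert>

// Hypothetical per-slot operand names, in X, Y, Z, W order, mimicking the
// OpName::src0 / OpName::src0_X ... convention used by DOT_4.
enum OpName { src0, src0_X, src0_Y, src0_Z, src0_W,
              src1, src1_X, src1_Y, src1_Z, src1_W };

// Illustrative equivalent of OPERAND_CASE: expand one case label into a
// lookup of the slot-suffixed variant via token pasting.
#define OPERAND_CASE(Label)                                                    \
  case Label: {                                                                \
    static const OpName Ops[] = {Label##_X, Label##_Y, Label##_Z, Label##_W};  \
    return Ops[Slot];                                                          \
  }

OpName getSlotedOps(OpName Op, unsigned Slot) {
  assert(Slot < 4 && "DOT_4 has exactly four slots");
  switch (Op) {
    OPERAND_CASE(src0)
    OPERAND_CASE(src1)
  default:
    return Op; // Unknown operand: the real code calls llvm_unreachable here.
  }
}

int main() {
  assert(getSlotedOps(src0, 0) == src0_X);
  assert(getSlotedOps(src1, 3) == src1_W);
  return 0;
}
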
@@ -1313,39 +1318,39 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
const {
- assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+ assert (MI->getOpcode() == R600::DOT_4 && "Not Implemented");
unsigned Opcode;
- if (ST.getGeneration() <= R600Subtarget::R700)
- Opcode = AMDGPU::DOT4_r600;
+ if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+ Opcode = R600::DOT4_r600;
else
- Opcode = AMDGPU::DOT4_eg;
+ Opcode = R600::DOT4_eg;
MachineBasicBlock::iterator I = MI;
MachineOperand &Src0 = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot)));
+ getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src0, Slot)));
MachineOperand &Src1 = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot)));
+ getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src1, Slot)));
MachineInstr *MIB = buildDefaultInstruction(
MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
static const unsigned Operands[14] = {
- AMDGPU::OpName::update_exec_mask,
- AMDGPU::OpName::update_pred,
- AMDGPU::OpName::write,
- AMDGPU::OpName::omod,
- AMDGPU::OpName::dst_rel,
- AMDGPU::OpName::clamp,
- AMDGPU::OpName::src0_neg,
- AMDGPU::OpName::src0_rel,
- AMDGPU::OpName::src0_abs,
- AMDGPU::OpName::src0_sel,
- AMDGPU::OpName::src1_neg,
- AMDGPU::OpName::src1_rel,
- AMDGPU::OpName::src1_abs,
- AMDGPU::OpName::src1_sel,
+ R600::OpName::update_exec_mask,
+ R600::OpName::update_pred,
+ R600::OpName::write,
+ R600::OpName::omod,
+ R600::OpName::dst_rel,
+ R600::OpName::clamp,
+ R600::OpName::src0_neg,
+ R600::OpName::src0_rel,
+ R600::OpName::src0_abs,
+ R600::OpName::src0_sel,
+ R600::OpName::src1_neg,
+ R600::OpName::src1_rel,
+ R600::OpName::src1_abs,
+ R600::OpName::src1_sel,
};
MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
- getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
- MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+ getSlotedOps(R600::OpName::pred_sel, Slot)));
+ MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel))
.setReg(MO.getReg());
for (unsigned i = 0; i < 14; i++) {
@@ -1362,16 +1367,16 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
unsigned DstReg,
uint64_t Imm) const {
- MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
- AMDGPU::ALU_LITERAL_X);
- setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm);
+ MachineInstr *MovImm = buildDefaultInstruction(BB, I, R600::MOV, DstReg,
+ R600::ALU_LITERAL_X);
+ setImmOperand(*MovImm, R600::OpName::literal, Imm);
return MovImm;
}
MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const {
- return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
+ return buildDefaultInstruction(*MBB, I, R600::MOV, DstReg, SrcReg);
}
int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
@@ -1379,7 +1384,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
}
int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
- return AMDGPU::getNamedOperandIdx(Opcode, Op);
+ return R600::getNamedOperandIdx(Opcode, Op);
}
void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op,
@@ -1406,25 +1411,25 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
switch (Flag) {
case MO_FLAG_CLAMP:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp);
+ FlagIndex = getOperandIdx(MI, R600::OpName::clamp);
break;
case MO_FLAG_MASK:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write);
+ FlagIndex = getOperandIdx(MI, R600::OpName::write);
break;
case MO_FLAG_NOT_LAST:
case MO_FLAG_LAST:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last);
+ FlagIndex = getOperandIdx(MI, R600::OpName::last);
break;
case MO_FLAG_NEG:
switch (SrcIdx) {
case 0:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src0_neg);
break;
case 1:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src1_neg);
break;
case 2:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src2_neg);
break;
}
break;
@@ -1435,10 +1440,10 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
(void)IsOP3;
switch (SrcIdx) {
case 0:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src0_abs);
break;
case 1:
- FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs);
+ FlagIndex = getOperandIdx(MI, R600::OpName::src1_abs);
break;
}
break;
@@ -1499,15 +1504,15 @@ unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind(
switch (Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return AMDGPUASI.CONSTANT_ADDRESS;
+ return ST.getAMDGPUAS().CONSTANT_ADDRESS;
}
llvm_unreachable("Invalid pseudo source kind");
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
}
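
Editor's note: the final hunk in R600InstrInfo.cpp replaces the file-scope AMDGPUASI object with a query on the subtarget, but the mapping it encodes is unchanged: stack and fixed-stack pseudo sources live in the private address space, while constant pools, GOT, jump tables and call entries are treated as constant-address accesses. A minimal sketch of that classification, using made-up address-space numbers (the real values come from the subtarget's AMDGPUAS record, which is not part of this hunk):

#include <cassert>

// Hypothetical address-space numbers; the real ones are provided by the
// subtarget's AMDGPUAS record and may differ.
enum AddrSpace { PRIVATE_ADDRESS = 5, CONSTANT_ADDRESS = 4 };

enum PSVKind { Stack, FixedStack, ConstantPool, GOT, JumpTable,
               GlobalValueCallEntry, ExternalSymbolCallEntry, TargetCustom };

// Same shape as getAddressSpaceForPseudoSourceKind in the hunk above, with
// the constant-like kinds collapsed into the default case for brevity.
AddrSpace addressSpaceForPseudoSource(PSVKind Kind) {
  switch (Kind) {
  case Stack:
  case FixedStack:
    return PRIVATE_ADDRESS;
  default: // constant pool, GOT, jump table, call entries, target custom
    return CONSTANT_ADDRESS;
  }
}

int main() {
  assert(addressSpaceForPseudoSource(FixedStack) == PRIVATE_ADDRESS);
  assert(addressSpaceForPseudoSource(JumpTable) == CONSTANT_ADDRESS);
  return 0;
}
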
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index abaa37450758..7a3dece31665 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -8,15 +8,18 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for R600InstrInfo
+/// Interface definition for R600InstrInfo
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
-#include "AMDGPUInstrInfo.h"
#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "R600GenInstrInfo.inc"
namespace llvm {
@@ -34,7 +37,7 @@ class MachineInstr;
class MachineInstrBuilder;
class R600Subtarget;
-class R600InstrInfo final : public AMDGPUInstrInfo {
+class R600InstrInfo final : public R600GenInstrInfo {
private:
const R600RegisterInfo RI;
const R600Subtarget &ST;
@@ -150,7 +153,7 @@ public:
/// Same but using const index set instead of MI set.
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
- /// \brief Vector instructions are instructions that must fill all
+ /// Vector instructions are instructions that must fill all
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
@@ -209,9 +212,10 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- /// \brief Reserve the registers that may be accesed using indirect addressing.
+ /// Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const;
+ const MachineFunction &MF,
+ const R600RegisterInfo &TRI) const;
/// Calculate the "Indirect Address" for the given \p RegIndex and
/// \p Channel
@@ -235,7 +239,7 @@ public:
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexEnd(const MachineFunction &MF) const;
- /// \brief Build instruction(s) for an indirect register write.
+ /// Build instruction(s) for an indirect register write.
///
/// \returns The instruction that performs the indirect register write
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
@@ -243,7 +247,7 @@ public:
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const;
- /// \brief Build instruction(s) for an indirect register read.
+ /// Build instruction(s) for an indirect register read.
///
/// \returns The instruction that performs the indirect register read
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
@@ -281,23 +285,23 @@ public:
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const;
- /// \brief Get the index of Op in the MachineInstr.
+ /// Get the index of Op in the MachineInstr.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
- /// \brief Get the index of \p Op for the given Opcode.
+ /// Get the index of \p Op for the given Opcode.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(unsigned Opcode, unsigned Op) const;
- /// \brief Helper function for setting instruction flag values.
+ /// Helper function for setting instruction flag values.
void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const;
- ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
+ /// Add one of the MO_FLAG* flags to the specified \p Operand.
void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
- ///\brief Determine if the specified \p Flag is set on this \p Operand.
+ /// Determine if the specified \p Flag is set on this \p Operand.
bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
/// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
@@ -307,7 +311,7 @@ public:
MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0,
unsigned Flag = 0) const;
- /// \brief Clear the specified flag on the instruction.
+ /// Clear the specified flag on the instruction.
void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
// Helper functions that check the opcode for status information
@@ -323,7 +327,7 @@ public:
PseudoSourceValue::PSVKind Kind) const override;
};
-namespace AMDGPU {
+namespace R600 {
int getLDSNoRetOp(uint16_t Opcode);
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
index 801e4e61fca6..7bf174f4cd86 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -12,20 +12,19 @@
//
//===----------------------------------------------------------------------===//
-include "R600Intrinsics.td"
include "R600InstrFormats.td"
// FIXME: Should not be arbitrarily split from other R600 inst classes.
class R600WrapperInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> :
AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
let SubtargetPredicate = isR600toCayman;
+ let Namespace = "R600";
}
class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
InstR600 <outs, ins, asm, pattern, NullALU> {
- let Namespace = "AMDGPU";
}
def MEMxi : Operand<iPTR> {
@@ -81,11 +80,18 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
(ops PRED_SEL_OFF)>;
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+ usesCustomInserter = 1, Namespace = "R600" in {
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(AMDGPUendpgm)]
+ >;
+}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
@@ -219,34 +225,6 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
-def TEX_SHADOW : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return (TType >= 6 && TType <= 8) || TType == 13;
- }]
->;
-
-def TEX_RECT : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 5;
- }]
->;
-
-def TEX_ARRAY : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 9 || TType == 10 || TType == 16;
- }]
->;
-
-def TEX_SHADOW_ARRAY : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 11 || TType == 12 || TType == 17;
- }]
->;
-
class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
dag outs, dag ins, string asm, list<dag> pattern> :
InstR600ISA <outs, ins, asm, pattern>,
@@ -357,6 +335,8 @@ def vtx_id2_load : LoadVtxId2 <load>;
// R600 SDNodes
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
+
def INTERP_PAIR_XY : AMDGPUShaderInst <
(outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
(ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
@@ -369,6 +349,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst <
"INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
[]>;
+}
+
def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPVariadic]
@@ -416,11 +398,15 @@ def : R600Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
// Interpolation Instructions
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
+
def INTERP_VEC_LOAD : AMDGPUShaderInst <
(outs R600_Reg128:$dst),
(ins i32imm:$src0),
"INTERP_LOAD $src0 : $dst">;
+}
+
def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
let bank_swizzle = 5;
}
@@ -660,14 +646,7 @@ def PAD : R600WrapperInst <(outs), (ins), "PAD", [] > {
let isCodeGenOnly = 1, isPseudo = 1 in {
-let usesCustomInserter = 1 in {
-
-class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
- (outs rc:$dst),
- (ins rc:$src0),
- "CLAMP $dst, $src0",
- [(set f32:$dst, (AMDGPUclamp f32:$src0))]
->;
+let Namespace = "R600", usesCustomInserter = 1 in {
class FABS <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
@@ -799,7 +778,9 @@ class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst <
(ins immType:$imm),
"",
[]
->;
+> {
+ let Namespace = "R600";
+}
} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
@@ -1014,7 +995,7 @@ class CNDGE_Common <bits<5> inst> : R600_3OP <
}
-let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in {
class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
// Slot X
UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
@@ -1193,7 +1174,6 @@ class COS_Common <bits<11> inst> : R600_1OP <
let Itinerary = TransALU;
}
-def CLAMP_R600 : CLAMP <R600_Reg32>;
def FABS_R600 : FABS<R600_Reg32>;
def FNEG_R600 : FNEG<R600_Reg32>;
@@ -1334,7 +1314,9 @@ let Predicates = [isR600] in {
// Regist loads and stores - for indirect addressing
//===----------------------------------------------------------------------===//
+let Namespace = "R600" in {
defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+}
// Hardcode channel to 0
// NOTE: LSHR is not available here. LSHR is per family instruction
@@ -1386,11 +1368,12 @@ let usesCustomInserter = 1 in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
-def MASK_WRITE : AMDGPUShaderInst <
+def MASK_WRITE : InstR600 <
(outs),
(ins R600_Reg32:$src),
"MASK_WRITE $src",
- []
+ [],
+ NullALU
>;
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
@@ -1421,7 +1404,7 @@ def TXD_SHADOW: InstR600 <
// Constant Buffer Addressing Support
//===----------------------------------------------------------------------===//
-let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in {
def CONST_COPY : Instruction {
let OutOperandList = (outs R600_Reg32:$dst);
let InOperandList = (ins i32imm:$src);
@@ -1544,23 +1527,6 @@ let Inst{63-32} = Word1;
//===---------------------------------------------------------------------===//
// Flow and Program control Instructions
//===---------------------------------------------------------------------===//
-class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
-: Instruction {
-
- let Namespace = "AMDGPU";
- dag OutOperandList = outs;
- dag InOperandList = ins;
- let Pattern = pattern;
- let AsmString = !strconcat(asmstr, "\n");
- let isPseudo = 1;
- let Itinerary = NullALU;
- bit hasIEEEFlag = 0;
- bit hasZeroOpFlag = 0;
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let isCodeGenOnly = 1;
-}
multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
def _i32 : ILFormat<(outs),
@@ -1592,23 +1558,14 @@ multiclass BranchInstr2<string name> {
// Custom Inserter for Branches and returns, this eventually will be a
// separate pass
//===---------------------------------------------------------------------===//
-let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1,
+ Namespace = "R600" in {
def BRANCH : ILFormat<(outs), (ins brtarget:$target),
"; Pseudo unconditional branch instruction",
[(br bb:$target)]>;
defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
}
-//===---------------------------------------------------------------------===//
-// Return instruction
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
- usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(AMDGPUendpgm)]
- >;
-}
-
//===----------------------------------------------------------------------===//
// Branch Instructions
//===----------------------------------------------------------------------===//
@@ -1738,13 +1695,8 @@ def : R600Pat <
>;
// KIL Patterns
-def KILP : R600Pat <
- (int_AMDGPU_kilp),
- (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
->;
-
def KIL : R600Pat <
- (int_AMDGPU_kill f32:$src0),
+ (int_r600_kill f32:$src0),
(MASK_WRITE (KILLGT (f32 ZERO), $src0))
>;
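
Editor's note: most of the .td churn above moves instruction definitions from the AMDGPU namespace into a new R600 namespace, which is what lets the C++ sources in this commit refer to R600::MOV, R600::OpName::src0 and so on. Conceptually, TableGen's Namespace field only decides which C++ namespace the generated opcode and operand-name enums are emitted into; a toy illustration with invented opcode values (not the real generated tables):

#include <cstdio>

// Invented opcode enums standing in for what TableGen emits into the
// AMDGPU and R600 generated instruction-info files respectively.
namespace AMDGPU { enum Opcode { S_MOV_B32 = 0, INSTRUCTION_LIST_END }; }
namespace R600   { enum Opcode { MOV = 0, DOT_4, INSTRUCTION_LIST_END }; }

int main() {
  // A machine instruction only carries an integer opcode; which enum that
  // integer indexes into is fixed by the instruction's TableGen Namespace,
  // so the same small integers can mean different instructions per half.
  std::printf("R600::MOV=%d AMDGPU::S_MOV_B32=%d\n",
              static_cast<int>(R600::MOV),
              static_cast<int>(AMDGPU::S_MOV_B32));
  return 0;
}
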
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
deleted file mode 100644
index 4c9e1e8a5434..000000000000
--- a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
+++ /dev/null
@@ -1,67 +0,0 @@
-//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// R600 Intrinsic Definitions
-//
-//===----------------------------------------------------------------------===//
-
-class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
- llvm_v4f32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty], // coord_type_w
- [IntrNoMem]
->;
-
-class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
- llvm_v4i32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty], // coord_type_w
- [IntrNoMem]
->;
-
-let TargetPrefix = "r600", isTarget = 1 in {
-
-def int_r600_store_swizzle :
- Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
->;
-
-def int_r600_store_stream_output : Intrinsic<
- [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
->;
-
-def int_r600_tex : TextureIntrinsicFloatInput;
-def int_r600_texc : TextureIntrinsicFloatInput;
-def int_r600_txl : TextureIntrinsicFloatInput;
-def int_r600_txlc : TextureIntrinsicFloatInput;
-def int_r600_txb : TextureIntrinsicFloatInput;
-def int_r600_txbc : TextureIntrinsicFloatInput;
-def int_r600_txf : TextureIntrinsicInt32Input;
-def int_r600_txq : TextureIntrinsicInt32Input;
-def int_r600_ddx : TextureIntrinsicFloatInput;
-def int_r600_ddy : TextureIntrinsicFloatInput;
-
-def int_r600_dot4 : Intrinsic<[llvm_float_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
->;
-
-} // End TargetPrefix = "r600", isTarget = 1
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
index a7e540f9d14d..a1429a2ac50f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -8,13 +8,14 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Machine Scheduler interface
+/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
@@ -78,7 +79,7 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
AllowSwitchFromAlu = true;
} else {
unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
- DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
// We assume the local GPR requirements to be "dominated" by the requirement
// of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
// after TEX are indeed likely to consume or generate values from/for the
@@ -124,26 +125,24 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
NextInstKind = IDOther;
}
- DEBUG(
- if (SU) {
- dbgs() << " ** Pick node **\n";
- SU->dump(DAG);
- } else {
- dbgs() << "NO NODE \n";
- for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
- const SUnit &S = DAG->SUnits[i];
- if (!S.isScheduled)
- S.dump(DAG);
- }
- }
- );
+ LLVM_DEBUG(if (SU) {
+ dbgs() << " ** Pick node **\n";
+ SU->dump(DAG);
+ } else {
+ dbgs() << "NO NODE \n";
+ for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+ const SUnit &S = DAG->SUnits[i];
+ if (!S.isScheduled)
+ S.dump(DAG);
+ }
+ });
return SU;
}
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (NextInstKind != CurInstKind) {
- DEBUG(dbgs() << "Instruction Type Switch\n");
+ LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
if (NextInstKind != IDAlu)
OccupedSlotsMask |= 31;
CurEmitted = 0;
@@ -163,7 +162,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
E = SU->getInstr()->operands_end(); It != E; ++It) {
MachineOperand &MO = *It;
- if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+ if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
++CurEmitted;
}
}
@@ -172,8 +171,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
++CurEmitted;
}
-
- DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+ LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
@@ -183,18 +181,18 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
static bool
isPhysicalRegCopy(MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::COPY)
+ if (MI->getOpcode() != R600::COPY)
return false;
return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
- DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
- DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
if (isPhysicalRegCopy(SU->getInstr())) {
PhysicalRegCopy.push_back(SU);
return;
@@ -226,14 +224,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
return AluTrans;
switch (MI->getOpcode()) {
- case AMDGPU::PRED_X:
+ case R600::PRED_X:
return AluPredX;
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return AluT_XYZW;
- case AMDGPU::COPY:
+ case R600::COPY:
if (MI->getOperand(1).isUndef()) {
// MI will become a KILL, don't considers it in scheduling
return AluDiscarded;
@@ -248,7 +246,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
if(TII->isVector(*MI) ||
TII->isCubeOp(MI->getOpcode()) ||
TII->isReductionOp(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+ MI->getOpcode() == R600::GROUP_BARRIER) {
return AluT_XYZW;
}
@@ -259,13 +257,13 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
// Is the result already assigned to a channel ?
unsigned DestSubReg = MI->getOperand(0).getSubReg();
switch (DestSubReg) {
- case AMDGPU::sub0:
+ case R600::sub0:
return AluT_X;
- case AMDGPU::sub1:
+ case R600::sub1:
return AluT_Y;
- case AMDGPU::sub2:
+ case R600::sub2:
return AluT_Z;
- case AMDGPU::sub3:
+ case R600::sub3:
return AluT_W;
default:
break;
@@ -273,16 +271,16 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
// Is the result already member of a X/Y/Z/W class ?
unsigned DestReg = MI->getOperand(0).getReg();
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
- regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
+ regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
return AluT_X;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
return AluT_Y;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
return AluT_Z;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
return AluT_W;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+ if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
return AluT_XYZW;
// LDS src registers cannot be used in the Trans slot.
@@ -303,13 +301,13 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
}
switch (Opcode) {
- case AMDGPU::PRED_X:
- case AMDGPU::COPY:
- case AMDGPU::CONST_COPY:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+ case R600::PRED_X:
+ case R600::COPY:
+ case R600::CONST_COPY:
+ case R600::INTERP_PAIR_XY:
+ case R600::INTERP_PAIR_ZW:
+ case R600::INTERP_VEC_LOAD:
+ case R600::DOT_4:
return IDAlu;
default:
return IDOther;
@@ -345,17 +343,17 @@ void R600SchedStrategy::LoadAlu() {
}
void R600SchedStrategy::PrepareNextSlot() {
- DEBUG(dbgs() << "New Slot\n");
+ LLVM_DEBUG(dbgs() << "New Slot\n");
assert (OccupedSlotsMask && "Slot wasn't filled");
OccupedSlotsMask = 0;
-// if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
+// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
// OccupedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
}
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
- int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
if (DstIndex == -1) {
return;
}
@@ -372,16 +370,16 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
// Constrains the regclass of DestReg to assign it to Slot
switch (Slot) {
case 0:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
break;
case 1:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
break;
case 2:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
break;
case 3:
- MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+ MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
break;
}
}
@@ -461,7 +459,7 @@ SUnit* R600SchedStrategy::pickOther(int QID) {
}
if (!AQ.empty()) {
SU = AQ.back();
- AQ.resize(AQ.size() - 1);
+ AQ.pop_back();
}
return SU;
}
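
Editor's note: alongside the namespace rename, the scheduler source switches from the old DEBUG(...) macro to LLVM_DEBUG(...). Both expand their argument only when debug output is enabled, so multi-statement bodies are wrapped in braces exactly as in the pickNode hunk above. A self-contained imitation of that pattern, using a plain bool where the real macro is gated on the -debug / -debug-only options:

#include <cstdio>

// Stand-in for llvm::DebugFlag; in LLVM it is driven by the -debug option.
static bool DebugFlag = true;

// Same usage shape as LLVM_DEBUG: the statement(s) inside the macro run only
// when debugging is enabled, and a braced block is allowed so a whole
// if/else or loop can be passed as the argument.
#define MY_DEBUG(X)                                                            \
  do {                                                                         \
    if (DebugFlag) {                                                           \
      X;                                                                       \
    }                                                                          \
  } while (false)

int main() {
  int Emitted = 3;
  MY_DEBUG(std::printf("%d Instructions Emitted in this clause\n", Emitted));
  MY_DEBUG({
    if (Emitted == 0)
      std::printf("NO NODE\n");
    else
      std::printf(" ** Pick node **\n");
  });
  return 0;
}
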
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
index 9a6770570477..8a9a8d3d1e23 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 Machine Scheduler interface
+/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index cd71f19760b9..7de5e2c9577d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -1,4 +1,4 @@
-//===- AMDGPUOpenCLImageTypeLoweringPass.cpp ------------------------------===//
+//===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -153,7 +153,7 @@ PushArgMD(KernelArgMD &MD, const MDVector &V) {
namespace {
-class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
+class R600OpenCLImageTypeLoweringPass : public ModulePass {
static char ID;
LLVMContext *Context;
@@ -364,7 +364,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
}
public:
- AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
+ R600OpenCLImageTypeLoweringPass() : ModulePass(ID) {}
bool runOnModule(Module &M) override {
Context = &M.getContext();
@@ -376,14 +376,14 @@ public:
}
StringRef getPassName() const override {
- return "AMDGPU OpenCL Image Type Pass";
+ return "R600 OpenCL Image Type Pass";
}
};
} // end anonymous namespace
-char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+char R600OpenCLImageTypeLoweringPass::ID = 0;
-ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
- return new AMDGPUOpenCLImageTypeLoweringPass();
+ModulePass *llvm::createR600OpenCLImageTypeLoweringPass() {
+ return new R600OpenCLImageTypeLoweringPass();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 4a14d95f1cc4..692451cb8fe0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -31,6 +31,7 @@
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -78,7 +79,7 @@ public:
std::vector<unsigned> UndefReg;
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
- assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+ assert(MI->getOpcode() == R600::REG_SEQUENCE);
for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
MachineOperand &MO = Instr->getOperand(i);
unsigned Chan = Instr->getOperand(i + 1).getImm();
@@ -158,8 +159,8 @@ bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
return true;
switch (MI.getOpcode()) {
- case AMDGPU::R600_ExportSwz:
- case AMDGPU::EG_ExportSwz:
+ case R600::R600_ExportSwz:
+ case R600::EG_ExportSwz:
return true;
default:
return false;
@@ -212,12 +213,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
- unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+ unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
unsigned Swizzle = (*It).second;
unsigned Chan = getReassignedChan(RemapChan, Swizzle);
- MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+ MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG),
DstReg)
.addReg(SrcVec)
.addReg(SubReg)
@@ -228,20 +229,20 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
UpdatedUndef.erase(ChanPos);
assert(!is_contained(UpdatedUndef, Chan) &&
"UpdatedUndef shouldn't contain Chan more than once!");
- DEBUG(dbgs() << " ->"; Tmp->dump(););
+ LLVM_DEBUG(dbgs() << " ->"; Tmp->dump(););
(void)Tmp;
SrcVec = DstReg;
}
MachineInstr *NewMI =
- BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec);
- DEBUG(dbgs() << " ->"; NewMI->dump(););
+ BuildMI(MBB, Pos, DL, TII->get(R600::COPY), Reg).addReg(SrcVec);
+ LLVM_DEBUG(dbgs() << " ->"; NewMI->dump(););
- DEBUG(dbgs() << " Updating Swizzle:\n");
+ LLVM_DEBUG(dbgs() << " Updating Swizzle:\n");
for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
E = MRI->use_instr_end(); It != E; ++It) {
- DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
+ LLVM_DEBUG(dbgs() << " "; (*It).dump(); dbgs() << " ->");
SwizzleInput(*It, RemapChan);
- DEBUG((*It).dump());
+ LLVM_DEBUG((*It).dump());
}
RSI->Instr->eraseFromParent();
@@ -353,7 +354,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
MII != MIIE; ++MII) {
MachineInstr &MI = *MII;
- if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) {
+ if (MI.getOpcode() != R600::REG_SEQUENCE) {
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
unsigned Reg = MI.getOperand(1).getReg();
for (MachineRegisterInfo::def_instr_iterator
@@ -372,14 +373,14 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
if (!areAllUsesSwizzeable(Reg))
continue;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Trying to optimize ";
MI.dump();
});
RegSeqInfo CandidateRSI;
std::vector<std::pair<unsigned, unsigned>> RemapChan;
- DEBUG(dbgs() << "Using common slots...\n";);
+ LLVM_DEBUG(dbgs() << "Using common slots...\n";);
if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
// Remove CandidateRSI mapping
RemoveMI(CandidateRSI.Instr);
@@ -387,7 +388,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
trackRSI(RSI);
continue;
}
- DEBUG(dbgs() << "Using free slots...\n";);
+ LLVM_DEBUG(dbgs() << "Using free slots...\n";);
RemapChan.clear();
if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
RemoveMI(CandidateRSI.Instr);
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index 7340318d2d88..612c62b514fd 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -17,6 +17,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -83,39 +84,39 @@ private:
LastDstChan = BISlot;
if (TII->isPredicated(*BI))
continue;
- int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
+ int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
continue;
- int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst);
+ int DstIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::dst);
if (DstIdx == -1) {
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
if (isTrans || TII->isTransOnly(*BI)) {
- Result[Dst] = AMDGPU::PS;
+ Result[Dst] = R600::PS;
continue;
}
- if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
- BI->getOpcode() == AMDGPU::DOT4_eg) {
- Result[Dst] = AMDGPU::PV_X;
+ if (BI->getOpcode() == R600::DOT4_r600 ||
+ BI->getOpcode() == R600::DOT4_eg) {
+ Result[Dst] = R600::PV_X;
continue;
}
- if (Dst == AMDGPU::OQAP) {
+ if (Dst == R600::OQAP) {
continue;
}
unsigned PVReg = 0;
switch (TRI.getHWRegChan(Dst)) {
case 0:
- PVReg = AMDGPU::PV_X;
+ PVReg = R600::PV_X;
break;
case 1:
- PVReg = AMDGPU::PV_Y;
+ PVReg = R600::PV_Y;
break;
case 2:
- PVReg = AMDGPU::PV_Z;
+ PVReg = R600::PV_Z;
break;
case 3:
- PVReg = AMDGPU::PV_W;
+ PVReg = R600::PV_W;
break;
default:
llvm_unreachable("Invalid Chan");
@@ -128,9 +129,9 @@ private:
void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs)
const {
unsigned Ops[] = {
- AMDGPU::OpName::src0,
- AMDGPU::OpName::src1,
- AMDGPU::OpName::src2
+ R600::OpName::src0,
+ R600::OpName::src1,
+ R600::OpName::src2
};
for (unsigned i = 0; i < 3; i++) {
int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
@@ -170,7 +171,7 @@ public:
return true;
if (!TII->isALUInstr(MI.getOpcode()))
return true;
- if (MI.getOpcode() == AMDGPU::GROUP_BARRIER)
+ if (MI.getOpcode() == R600::GROUP_BARRIER)
return true;
// XXX: This can be removed once the packetizer properly handles all the
// LDS instruction group restrictions.
@@ -184,8 +185,8 @@ public:
if (getSlot(*MII) == getSlot(*MIJ))
ConsideredInstUsesAlreadyWrittenVectorElement = true;
// Does MII and MIJ share the same pred_sel ?
- int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
- OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
+ int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel),
+ OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel);
unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
if (PredI != PredJ)
@@ -219,7 +220,7 @@ public:
}
void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
- unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
+ unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600::OpName::last);
MI->getOperand(LastOp).setImm(Bit);
}
@@ -236,7 +237,7 @@ public:
if (ConsideredInstUsesAlreadyWrittenVectorElement &&
!TII->isVectorOnly(MI) && VLIW5) {
isTransSlot = true;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Considering as Trans Inst :";
MI.dump();
});
@@ -249,7 +250,7 @@ public:
// Are the Constants limitations met ?
CurrentPacketMIs.push_back(&MI);
if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Couldn't pack :\n";
MI.dump();
dbgs() << "with the following packets :\n";
@@ -266,7 +267,7 @@ public:
// Is there a BankSwizzle set that meet Read Port limitations ?
if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
PV, BS, isTransSlot)) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Couldn't pack :\n";
MI.dump();
dbgs() << "with the following packets :\n";
@@ -300,11 +301,11 @@ public:
for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
MachineInstr *MI = CurrentPacketMIs[i];
unsigned Op = TII->getOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
+ R600::OpName::bank_swizzle);
MI->getOperand(Op).setImm(BS[i]);
}
unsigned Op =
- TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle);
+ TII->getOperandIdx(MI.getOpcode(), R600::OpName::bank_swizzle);
MI.getOperand(Op).setImm(BS.back());
if (!CurrentPacketMIs.empty())
setIsLastBit(CurrentPacketMIs.back(), 0);
@@ -333,6 +334,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// DFA state table should not be empty.
assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+ assert(Packetizer.getResourceTracker()->getInstrItins());
if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty())
return false;
@@ -352,8 +354,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
MachineBasicBlock::iterator End = MBB->end();
MachineBasicBlock::iterator MI = MBB->begin();
while (MI != End) {
- if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
- (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
+ if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF ||
+ (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) {
MachineBasicBlock::iterator DeleteMI = MI;
++MI;
MBB->erase(DeleteMI);
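
Editor's note: the getPreviousVector hunk in this file records, for each value defined by the previous instruction group, which PV/PS alias can read it in the current group: Trans-unit results are only visible through PS, DOT4 results through PV_X, and everything else through PV_X..PV_W according to the destination register's hardware channel. A standalone sketch of that channel-to-PV mapping with hypothetical register IDs:

#include <cassert>

// Hypothetical IDs for the previous-value aliases; the real registers are
// R600::PV_X..PV_W and R600::PS.
enum PrevValueReg { PV_X, PV_Y, PV_Z, PV_W, PS };

// Mirrors the switch shown above: ALU results become readable in the next
// instruction group through the PV register matching their channel, while
// Trans-slot results are only visible through PS.
PrevValueReg previousValueAlias(unsigned HWChan, bool WrittenByTransSlot) {
  if (WrittenByTransSlot)
    return PS;
  switch (HWChan) {
  case 0: return PV_X;
  case 1: return PV_Y;
  case 2: return PV_Z;
  case 3: return PV_W;
  default: assert(false && "Invalid Chan"); return PV_X;
  }
}

int main() {
  assert(previousValueAlias(2, /*WrittenByTransSlot=*/false) == PV_Z);
  assert(previousValueAlias(0, /*WrittenByTransSlot=*/true) == PS);
  return 0;
}
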
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Processors.td b/contrib/llvm/lib/Target/AMDGPU/R600Processors.td
index 89194dc1bdf6..f39b3dc1bfd4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Processors.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Processors.td
@@ -7,6 +7,62 @@
//
//===----------------------------------------------------------------------===//
+class SubtargetFeatureFetchLimit <string Value> :
+ SubtargetFeature <"fetch"#Value,
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value
+>;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding"
+>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache"
+>;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA"
+>;
+
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug"
+>;
+
+class R600SubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>;
+
+def FeatureR600 : R600SubtargetFeatureGeneration<"R600",
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+>;
+
+def FeatureR700 : R600SubtargetFeatureGeneration<"R700",
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]
+>;
+
+def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN",
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+>;
+
+def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
+>;
+
+
//===----------------------------------------------------------------------===//
// Radeon HD 2000/3000 Series (R600).
//===----------------------------------------------------------------------===//
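
Editor's note: the SubtargetFeature records moved into R600Processors.td each name a subtarget member and the value it is forced to when the feature is enabled (fetch16 sets TexVTXClauseSize to 16, HasVertexCache and cfalubug set the matching booleans). A toy model of that "feature list sets member variables" behaviour; the struct layout and the string handling here are invented for illustration and are not LLVM's feature machinery:

#include <cassert>
#include <string>
#include <vector>

// Invented mirror of the subtarget members named by the .td features above.
struct R600Features {
  unsigned TexVTXClauseSize = 0;
  bool HasVertexCache = false;
  bool CaymanISA = false;
  bool CFALUBug = false;
};

// Apply a list of enabled feature names the way SubtargetFeature records do:
// each feature writes its fixed value into the member it names.
void applyFeatures(R600Features &F, const std::vector<std::string> &Enabled) {
  for (const std::string &Name : Enabled) {
    if (Name == "fetch8")               F.TexVTXClauseSize = 8;
    else if (Name == "fetch16")         F.TexVTXClauseSize = 16;
    else if (Name == "HasVertexCache")  F.HasVertexCache = true;
    else if (Name == "caymanISA")       F.CaymanISA = true;
    else if (Name == "cfalubug")        F.CFALUBug = true;
  }
}

int main() {
  R600Features F;
  // Roughly what an R700-class generation implies: the 16-fetch clause limit.
  applyFeatures(F, {"fetch16", "HasVertexCache"});
  assert(F.TexVTXClauseSize == 16 && F.HasVertexCache && !F.CFALUBug);
  return 0;
}
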
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 7501facb0cba..38933e7616a0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief R600 implementation of the TargetRegisterInfo class.
+/// R600 implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//
@@ -17,47 +17,51 @@
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() {
+R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) {
RCW.RegWeight = 0;
RCW.WeightLimit = 0;
}
+#define GET_REGINFO_TARGET_DESC
+#include "R600GenRegisterInfo.inc"
+
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600InstrInfo *TII = ST.getInstrInfo();
- Reserved.set(AMDGPU::ZERO);
- Reserved.set(AMDGPU::HALF);
- Reserved.set(AMDGPU::ONE);
- Reserved.set(AMDGPU::ONE_INT);
- Reserved.set(AMDGPU::NEG_HALF);
- Reserved.set(AMDGPU::NEG_ONE);
- Reserved.set(AMDGPU::PV_X);
- Reserved.set(AMDGPU::ALU_LITERAL_X);
- Reserved.set(AMDGPU::ALU_CONST);
- Reserved.set(AMDGPU::PREDICATE_BIT);
- Reserved.set(AMDGPU::PRED_SEL_OFF);
- Reserved.set(AMDGPU::PRED_SEL_ZERO);
- Reserved.set(AMDGPU::PRED_SEL_ONE);
- Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
-
- for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
- E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
- Reserved.set(*I);
+ reserveRegisterTuples(Reserved, R600::ZERO);
+ reserveRegisterTuples(Reserved, R600::HALF);
+ reserveRegisterTuples(Reserved, R600::ONE);
+ reserveRegisterTuples(Reserved, R600::ONE_INT);
+ reserveRegisterTuples(Reserved, R600::NEG_HALF);
+ reserveRegisterTuples(Reserved, R600::NEG_ONE);
+ reserveRegisterTuples(Reserved, R600::PV_X);
+ reserveRegisterTuples(Reserved, R600::ALU_LITERAL_X);
+ reserveRegisterTuples(Reserved, R600::ALU_CONST);
+ reserveRegisterTuples(Reserved, R600::PREDICATE_BIT);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_OFF);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_ZERO);
+ reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE);
+ reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR);
+
+ for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(),
+ E = R600::R600_AddrRegClass.end(); I != E; ++I) {
+ reserveRegisterTuples(Reserved, *I);
}
- TII->reserveIndirectRegisters(Reserved, MF);
+ TII->reserveIndirectRegisters(Reserved, MF, *this);
return Reserved;
}
// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+static const MCPhysReg CalleeSavedReg = R600::NoRegister;
const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
const MachineFunction *) const {
@@ -65,7 +69,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
}
unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
+ return R600::NoRegister;
}
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
@@ -80,7 +84,7 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
default:
- case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
+ case MVT::i32: return &R600::R600_TReg32RegClass;
}
}
@@ -93,9 +97,9 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
switch (Reg) {
- case AMDGPU::OQAP:
- case AMDGPU::OQBP:
- case AMDGPU::AR_X:
+ case R600::OQAP:
+ case R600::OQBP:
+ case R600::AR_X:
return false;
default:
return true;
@@ -108,3 +112,10 @@ void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
llvm_unreachable("Subroutines not supported yet");
}
+
+void R600RegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
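
Editor's note: getReservedRegs now funnels every reservation through the new reserveRegisterTuples helper, which walks MCRegAliasIterator so that reserving a register also reserves everything it overlaps (the register itself plus its sub- and super-registers). The standalone sketch below models that with an explicit alias table and a plain std::bitset; the register numbering and alias sets are invented for the example:

#include <bitset>
#include <cassert>
#include <vector>

// Invented numbering: four 32-bit channel registers T0_X..T0_W (0..3) and a
// 128-bit tuple T0_XYZW (4) that overlaps all of them.
constexpr unsigned NumRegs = 5;
constexpr unsigned T0_X = 0, T0_Y = 1, T0_Z = 2, T0_W = 3, T0_XYZW = 4;

// Alias table standing in for what MCRegAliasIterator enumerates: each entry
// lists the registers that overlap the key register, including itself.
const std::vector<std::vector<unsigned>> Aliases = {
    {T0_X, T0_XYZW},                   // T0_X
    {T0_Y, T0_XYZW},                   // T0_Y
    {T0_Z, T0_XYZW},                   // T0_Z
    {T0_W, T0_XYZW},                   // T0_W
    {T0_XYZW, T0_X, T0_Y, T0_Z, T0_W}, // T0_XYZW
};

// Equivalent of reserveRegisterTuples: reserve Reg and every alias of it.
void reserveRegisterTuples(std::bitset<NumRegs> &Reserved, unsigned Reg) {
  for (unsigned A : Aliases[Reg])
    Reserved.set(A);
}

int main() {
  std::bitset<NumRegs> Reserved;
  // Reserving one channel also blocks the 128-bit tuple containing it,
  // which the old plain Reserved.set(Reg) call did not guarantee.
  reserveRegisterTuples(Reserved, T0_Y);
  assert(Reserved.test(T0_Y) && Reserved.test(T0_XYZW) && !Reserved.test(T0_X));
  return 0;
}
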
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
index f0d9644b02f2..c4c77172b299 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -8,20 +8,19 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for R600RegisterInfo
+/// Interface definition for R600RegisterInfo
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
-#include "AMDGPURegisterInfo.h"
+#define GET_REGINFO_HEADER
+#include "R600GenRegisterInfo.inc"
namespace llvm {
-class AMDGPUSubtarget;
-
-struct R600RegisterInfo final : public AMDGPURegisterInfo {
+struct R600RegisterInfo final : public R600GenRegisterInfo {
RegClassWeight RCW;
R600RegisterInfo();
@@ -30,12 +29,12 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
- /// \brief get the HW encoding for a register's channel.
+ /// get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
unsigned getHWRegIndex(unsigned Reg) const;
- /// \brief get the register class of the specified type to use in the
+ /// get the register class of the specified type to use in the
/// CFGStructurizer
const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
@@ -49,6 +48,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
+
+ void reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
index 84ab328bdb2b..02164b74a01b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -245,7 +245,7 @@ def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
(add V0123_W, V0123_Z, V0123_Y, V0123_X)
>;
-def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64,
(add (sequence "T%u_XY", 0, 63))>;
def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 150d8c3dc3d3..74f1bd8fb986 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -37,7 +38,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -133,7 +133,7 @@ INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
char SIAnnotateControlFlow::ID = 0;
-/// \brief Initialize all the types and constants used in the pass
+/// Initialize all the types and constants used in the pass
bool SIAnnotateControlFlow::doInitialization(Module &M) {
LLVMContext &Context = M.getContext();
@@ -157,29 +157,29 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
return false;
}
-/// \brief Is the branch condition uniform or did the StructurizeCFG pass
+/// Is the branch condition uniform or did the StructurizeCFG pass
/// consider it as such?
bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
return DA->isUniform(T->getCondition()) ||
T->getMetadata("structurizecfg.uniform") != nullptr;
}
-/// \brief Is BB the last block saved on the stack ?
+/// Is BB the last block saved on the stack?
bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
return !Stack.empty() && Stack.back().first == BB;
}
-/// \brief Pop the last saved value from the control flow stack
+/// Pop the last saved value from the control flow stack
Value *SIAnnotateControlFlow::popSaved() {
return Stack.pop_back_val().second;
}
-/// \brief Push a BB and saved value to the control flow stack
+/// Push a BB and saved value to the control flow stack
void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
Stack.push_back(std::make_pair(BB, Saved));
}
-/// \brief Can the condition represented by this PHI node treated like
+/// Can the condition represented by this PHI node be treated like
/// an "Else" block?
bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
@@ -198,14 +198,14 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
return true;
}
-// \brief Erase "Phi" if it is not used any more
+// Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
if (RecursivelyDeleteDeadPHINode(Phi)) {
- DEBUG(dbgs() << "Erased unused condition phi\n");
+ LLVM_DEBUG(dbgs() << "Erased unused condition phi\n");
}
}
-/// \brief Open a new "If" block
+/// Open a new "If" block
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return;
@@ -215,7 +215,7 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
}
-/// \brief Close the last "If" block and open a new "Else" block
+/// Close the last "If" block and open a new "Else" block
void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
if (isUniform(Term)) {
return;
@@ -225,7 +225,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
}
-/// \brief Recursively handle the condition leading to a loop
+/// Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(
Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
@@ -322,7 +322,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
llvm_unreachable("Unhandled loop condition!");
}
-/// \brief Handle a back edge (loop)
+/// Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
if (isUniform(Term))
return;
@@ -353,7 +353,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
push(Term->getSuccessor(0), Arg);
}
-/// \brief Close the last opened control flow
+/// Close the last opened control flow
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
@@ -381,7 +381,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
}
-/// \brief Annotate the control flow with intrinsics so the backend can
+/// Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -422,11 +422,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
openIf(Term);
}
- assert(Stack.empty());
+ if (!Stack.empty()) {
+ // CFG was probably not structured.
+ report_fatal_error("failed to annotate CFG");
+ }
+
return true;
}
-/// \brief Create the annotation pass
+/// Create the annotation pass
FunctionPass *llvm::createSIAnnotateControlFlowPass() {
return new SIAnnotateControlFlow();
}
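
For context on the assert-to-error change above: the pass tracks opened control flow on a stack of (block, saved value) pairs, and an empty stack at the end of runOnFunction means every opened region was closed. A small standalone model of that bookkeeping, with placeholder Block/Value types standing in for the LLVM IR classes:

#include <cstdio>
#include <cstdlib>
#include <utility>
#include <vector>

struct Block { int Id; };   // stands in for llvm::BasicBlock
struct Value { int Id; };   // stands in for llvm::Value

static std::vector<std::pair<Block *, Value *>> Stack;

// Push a block and its saved value when an "If", "Else" or loop is opened.
void push(Block *BB, Value *Saved) { Stack.emplace_back(BB, Saved); }

// Is BB the region that has to be closed next?
bool isTopOfStack(Block *BB) { return !Stack.empty() && Stack.back().first == BB; }

// Pop the saved value when the region is closed.
Value *popSaved() {
  Value *V = Stack.back().second;
  Stack.pop_back();
  return V;
}

// Mirrors the new end-of-function check: a non-empty stack means the CFG was
// not properly structured, so report a hard error instead of asserting.
void finishFunction() {
  if (!Stack.empty()) {
    std::fprintf(stderr, "failed to annotate CFG\n");
    std::abort();
  }
}
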
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
index b5c439b21b89..7e884ad93a23 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Inserts one nop instruction for each high level source statement for
+/// Inserts one nop instruction for each high level source statement for
/// debugger usage.
///
/// Tools, such as a debugger, need to pause execution based on user input (i.e.
@@ -21,6 +21,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -62,7 +63,7 @@ FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
// Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
// specified.
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.debuggerInsertNops())
return false;
@@ -78,8 +79,8 @@ bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
- // Skip DBG_VALUE instructions and instructions without location.
- if (MI->isDebugValue() || !MI->getDebugLoc())
+ // Skip debug instructions and instructions without location.
+ if (MI->isDebugInstr() || !MI->getDebugLoc())
continue;
// Insert nop instruction if line number does not have nop inserted.
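
The hunk above widens the skip condition from DBG_VALUE only (isDebugValue) to all debug instructions (isDebugInstr). A runnable sketch of the per-line nop accounting the pass performs, with a simplified instruction record in place of MachineInstr:

#include <set>
#include <vector>

struct Instr {
  bool IsDebug; // models MI->isDebugInstr()
  int Line;     // models the debug location; 0 means "no location"
};

// One nop per distinct source line, skipping debug instructions and
// instructions that carry no location.
int countNopsToInsert(const std::vector<Instr> &Block) {
  std::set<int> LinesWithNop;
  int Nops = 0;
  for (const Instr &MI : Block) {
    if (MI.IsDebug || MI.Line == 0)
      continue;
    if (LinesWithNop.insert(MI.Line).second)
      ++Nops;
  }
  return Nops;
}
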
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
index a9f6069e798a..a6d28d6999e5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -85,7 +85,10 @@ enum : uint64_t {
ClampHi = UINT64_C(1) << 48,
// Is a packed VOP3P instruction.
- IsPacked = UINT64_C(1) << 49
+ IsPacked = UINT64_C(1) << 49,
+
+ // Is a D16 buffer instruction.
+ D16Buf = UINT64_C(1) << 50
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -137,7 +140,6 @@ namespace AMDGPU {
OPERAND_INPUT_MODS,
// Operand for SDWA instructions
- OPERAND_SDWA_SRC,
OPERAND_SDWA_VOPC_DST,
/// Operand with 32-bit immediate that uses the constant bus.
@@ -146,6 +148,13 @@ namespace AMDGPU {
};
}
+namespace SIStackID {
+enum StackTypes : uint8_t {
+ SCRATCH = 0,
+ SGPR_SPILL = 1
+};
+}
+
// Input operand modifiers bit-masks
// NEG and SEXT share same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
@@ -273,8 +282,9 @@ enum Id { // HwRegCode, (6) [5:0]
ID_GPR_ALLOC = 5,
ID_LDS_ALLOC = 6,
ID_IB_STS = 7,
- ID_SYMBOLIC_LAST_ = 8,
ID_MEM_BASES = 15,
+ ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES,
+ ID_SYMBOLIC_LAST_ = 16,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -375,6 +385,44 @@ enum SDWA9EncValues{
};
} // namespace SDWA
+
+namespace DPP {
+
+enum DppCtrl {
+ QUAD_PERM_FIRST = 0,
+ QUAD_PERM_LAST = 0xFF,
+ DPP_UNUSED1 = 0x100,
+ ROW_SHL0 = 0x100,
+ ROW_SHL_FIRST = 0x101,
+ ROW_SHL_LAST = 0x10F,
+ DPP_UNUSED2 = 0x110,
+ ROW_SHR0 = 0x110,
+ ROW_SHR_FIRST = 0x111,
+ ROW_SHR_LAST = 0x11F,
+ DPP_UNUSED3 = 0x120,
+ ROW_ROR0 = 0x120,
+ ROW_ROR_FIRST = 0x121,
+ ROW_ROR_LAST = 0x12F,
+ WAVE_SHL1 = 0x130,
+ DPP_UNUSED4_FIRST = 0x131,
+ DPP_UNUSED4_LAST = 0x133,
+ WAVE_ROL1 = 0x134,
+ DPP_UNUSED5_FIRST = 0x135,
+ DPP_UNUSED5_LAST = 0x137,
+ WAVE_SHR1 = 0x138,
+ DPP_UNUSED6_FIRST = 0x139,
+ DPP_UNUSED6_LAST = 0x13B,
+ WAVE_ROR1 = 0x13C,
+ DPP_UNUSED7_FIRST = 0x13D,
+ DPP_UNUSED7_LAST = 0x13F,
+ ROW_MIRROR = 0x140,
+ ROW_HALF_MIRROR = 0x141,
+ BCAST15 = 0x142,
+ BCAST31 = 0x143,
+ DPP_LAST = BCAST31
+};
+
+} // namespace DPP
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
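
The new DppCtrl enum partitions the dpp_ctrl immediate into quad-permute, row shift/rotate, wave-level, and broadcast ranges. A small classifier over those ranges, reproducing only the constants it needs from the enum above (the helper names are illustrative, not existing AMDGPU utilities):

// Values copied from the DppCtrl enum above.
enum : unsigned {
  QUAD_PERM_FIRST = 0,    QUAD_PERM_LAST = 0xFF,
  ROW_SHL_FIRST   = 0x101, ROW_SHL_LAST  = 0x10F,
  ROW_SHR_FIRST   = 0x111, ROW_SHR_LAST  = 0x11F,
  ROW_ROR_FIRST   = 0x121, ROW_ROR_LAST  = 0x12F,
  ROW_MIRROR = 0x140, ROW_HALF_MIRROR = 0x141,
  BCAST15 = 0x142, BCAST31 = 0x143, DPP_LAST = BCAST31
};

// True if Ctrl encodes a full quad permutation (QUAD_PERM_FIRST is 0).
bool isQuadPerm(unsigned Ctrl) {
  return Ctrl <= QUAD_PERM_LAST;
}

// True if Ctrl encodes a row shift or rotate rather than a permute/broadcast.
bool isRowShiftOrRotate(unsigned Ctrl) {
  return (Ctrl >= ROW_SHL_FIRST && Ctrl <= ROW_SHL_LAST) ||
         (Ctrl >= ROW_SHR_FIRST && Ctrl <= ROW_SHR_LAST) ||
         (Ctrl >= ROW_ROR_FIRST && Ctrl <= ROW_ROR_LAST);
}
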
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 8b155c2d2780..566e0d3febc7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -69,6 +69,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -81,7 +82,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
@@ -110,12 +110,7 @@ namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
- MachinePostDominatorTree *MPDT;
- DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF;
- void computePDF(MachineFunction * MF);
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void printPDF();
-#endif
+
public:
static char ID;
@@ -128,8 +123,6 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -417,6 +410,12 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
}
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI) {
+ return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+ return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
@@ -515,9 +514,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
if (MDT.dominates(MI1, MI2)) {
if (!intereferes(MI2, MI1)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI2->getParent()) << " "
- << *MI2);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI2->getParent()) << " " << *MI2);
MI2->eraseFromParent();
Defs.erase(I2++);
Changed = true;
@@ -525,9 +524,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
} else if (MDT.dominates(MI2, MI1)) {
if (!intereferes(MI1, MI2)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI1->getParent()) << " "
- << *MI1);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " " << *MI1);
MI1->eraseFromParent();
Defs.erase(I1++);
Changed = true;
@@ -543,11 +542,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
- DEBUG(dbgs() << "Erasing from "
- << printMBBReference(*MI1->getParent()) << " " << *MI1
- << "and moving from "
- << printMBBReference(*MI2->getParent()) << " to "
- << printMBBReference(*I->getParent()) << " " << *MI2);
+ LLVM_DEBUG(dbgs()
+ << "Erasing from "
+ << printMBBReference(*MI1->getParent()) << " " << *MI1
+ << "and moving from "
+ << printMBBReference(*MI2->getParent()) << " to "
+ << printMBBReference(*I->getParent()) << " " << *MI2);
I->getParent()->splice(I, MI2->getParent(), MI2);
MI1->eraseFromParent();
Defs.erase(I1++);
@@ -567,47 +567,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
return Changed;
}
-void SIFixSGPRCopies::computePDF(MachineFunction *MF) {
- MachineFunction::iterator B = MF->begin();
- MachineFunction::iterator E = MF->end();
- for (; B != E; ++B) {
- if (B->succ_size() > 1) {
- for (auto S : B->successors()) {
- MachineDomTreeNode *runner = MPDT->getNode(&*S);
- MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom();
- while (runner && runner != sentinel) {
- PDF[runner->getBlock()].insert(&*B);
- runner = runner->getIDom();
- }
- }
- }
- }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SIFixSGPRCopies::printPDF() {
- dbgs() << "\n######## PostDominanceFrontiers set #########\n";
- for (auto &I : PDF) {
- dbgs() << "PDF[ " << I.first->getNumber() << "] : ";
- for (auto &J : I.second) {
- dbgs() << J->getNumber() << ' ';
- }
- dbgs() << '\n';
- }
- dbgs() << "\n##############################################\n";
-}
-#endif
-
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
- MPDT = &getAnalysis<MachinePostDominatorTree>();
- PDF.clear();
- computePDF(&MF);
- DEBUG(printPDF());
SmallVector<MachineInstr *, 16> Worklist;
@@ -661,28 +626,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;
- // We don't need to fix the PHI if all the source blocks
- // have no divergent control dependecies
+ // We don't need to fix the PHI if the common dominator of the
+ // two incoming blocks terminates with a uniform branch.
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
- if (!HasVGPROperand) {
- bool Uniform = true;
- MachineBasicBlock * Join = MI.getParent();
- for (auto &O : MI.explicit_operands()) {
- if (O.isMBB()) {
- MachineBasicBlock * Source = O.getMBB();
- SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source];
- SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join];
- SetVector<MachineBasicBlock*> CDList;
- for (auto &I : SourcePDF) {
- if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) {
- if (hasTerminatorThatModifiesExec(*I, *TRI))
- Uniform = false;
- }
- }
- }
- }
- if (Uniform) {
- DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+ if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
+ MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+ MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+ if (!predsHasDivergentTerminator(MBB0, TRI) &&
+ !predsHasDivergentTerminator(MBB1, TRI)) {
+ LLVM_DEBUG(dbgs()
+ << "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
}
@@ -722,7 +676,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
SmallSet<unsigned, 8> Visited;
if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
- DEBUG(dbgs() << "Fixing PHI: " << MI);
+ LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI);
}
break;
@@ -734,7 +688,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
+ LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
TII->moveToVALU(MI);
break;
@@ -745,7 +699,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
- DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
+ LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
TII->moveToVALU(MI);
}
break;
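
The rewrite above drops the post-dominance-frontier computation: a two-input PHI now stays an SGPR PHI only when neither incoming block has a predecessor whose terminator modifies exec. A self-contained model of that predecessor walk, using a toy Block type instead of MachineBasicBlock:

#include <unordered_set>
#include <vector>

struct Block {
  std::vector<Block *> Preds;
  bool HasExecModifyingTerminator = false; // models hasTerminatorThatModifiesExec
};

// Mirrors searchPredecessors/predsHasDivergentTerminator: walk all transitive
// predecessors and report whether any of them ends in a divergent terminator.
bool predsHaveDivergentTerminator(Block *MBB) {
  std::unordered_set<Block *> Visited;
  std::vector<Block *> Worklist(MBB->Preds.begin(), MBB->Preds.end());
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(B).second)
      continue;
    if (B->HasExecModifyingTerminator)
      return true;
    Worklist.insert(Worklist.end(), B->Preds.begin(), B->Preds.end());
  }
  return false;
}
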
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index 3d3121788b5e..15ba78edf919 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -8,13 +8,14 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Add implicit use of exec to vector register copies.
+/// Add implicit use of exec to vector register copies.
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -46,7 +47,7 @@ char SIFixVGPRCopies::ID = 0;
char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID;
bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
bool Changed = false;
@@ -58,7 +59,7 @@ bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) {
MI.addOperand(MF,
MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- DEBUG(dbgs() << "Add exec use to " << MI);
+ LLVM_DEBUG(dbgs() << "Add exec use to " << MI);
Changed = true;
}
break;
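
Apart from the subtarget and debug-macro renames this file is otherwise unchanged; the pass itself appends an implicit use of exec to every VGPR copy that does not already read it. A standalone sketch of that fix-up with placeholder operand/instruction records (the EXEC encoding here is made up):

#include <algorithm>
#include <vector>

struct Operand { unsigned Reg; bool Implicit; };
struct Instr { std::vector<Operand> Ops; };

constexpr unsigned EXEC = 126; // placeholder encoding for the exec register

// If the copy does not already read exec, add an implicit use so later passes
// see that the copy depends on the active-lane mask.
bool addExecUse(Instr &MI) {
  bool ReadsExec = std::any_of(MI.Ops.begin(), MI.Ops.end(),
                               [](const Operand &O) { return O.Reg == EXEC; });
  if (ReadsExec)
    return false;
  MI.Ops.push_back({EXEC, /*Implicit=*/true});
  return true;
}
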
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
index 3493c7775f0c..5d613d8874fa 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Computations in WWM can overwrite values in inactive channels for
+/// Computations in WWM can overwrite values in inactive channels for
/// variables that the register allocator thinks are dead. This pass adds fake
/// uses of those variables to WWM instructions to make sure that they aren't
/// overwritten.
@@ -55,6 +55,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -184,7 +185,7 @@ bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 783181980342..338cabcb906b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -75,7 +76,7 @@ public:
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
- const SISubtarget *ST;
+ const GCNSubtarget *ST;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
@@ -127,14 +128,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
unsigned Opc = UseMI.getOpcode();
switch (Opc) {
case AMDGPU::V_MAC_F32_e64:
- case AMDGPU::V_MAC_F16_e64: {
+ case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F32_e64: {
// Special case for mac. Since this is replaced with mad when folded into
// src2, we need to check the legality for the final instruction.
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (static_cast<int>(OpNo) == Src2Idx) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
- const MCInstrDesc &MadDesc
- = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+
+ unsigned Opc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ const MCInstrDesc &MadDesc = TII->get(Opc);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
return false;
@@ -155,6 +160,35 @@ static bool updateOperand(FoldCandidate &Fold,
assert(Old.isReg());
if (Fold.isImm()) {
+ if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
+ // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+ // already set.
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+ int ModIdx = -1;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ ModIdx = AMDGPU::OpName::src0_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ ModIdx = AMDGPU::OpName::src1_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ ModIdx = AMDGPU::OpName::src2_modifiers;
+ assert(ModIdx != -1);
+ ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+ MachineOperand &Mod = MI->getOperand(ModIdx);
+ unsigned Val = Mod.getImm();
+ if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ return false;
+ // If upper part is all zero we do not need op_sel_hi.
+ if (!isUInt<16>(Fold.ImmToFold)) {
+ if (!(Fold.ImmToFold & 0xffff)) {
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
+ }
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ }
+ }
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
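
The packed-operand block added above folds a 32-bit immediate into a VOP3P source by adjusting op_sel/op_sel_hi: it bails out if op_sel is already set, and if only the high 16 bits are non-zero it folds the high half and flips op_sel. A sketch of that decision in plain arithmetic; the modifier bit positions are assumptions, not the exact SISrcMods encoding:

#include <cstdint>

// Assumed bit positions for the op_sel source modifiers (placeholders).
enum : unsigned { OP_SEL_0 = 1u << 2, OP_SEL_1 = 1u << 3 };

struct FoldResult { bool Folded; unsigned Mods; uint32_t Imm; };

// Bail out if op_sel is already set; otherwise, when the low half is zero but
// the high half is not, fold the high half and select it via op_sel.
FoldResult foldPackedImm(unsigned Mods, uint32_t Imm) {
  if ((Mods & OP_SEL_0) || !(Mods & OP_SEL_1))
    return {false, Mods, Imm};
  if (Imm > 0xffffu) {            // upper half is used
    if ((Imm & 0xffffu) == 0) {   // ...and the lower half is zero
      Mods = (Mods | OP_SEL_0) & ~OP_SEL_1;
      return {true, Mods, (Imm >> 16) & 0xffffu};
    }
    Mods &= ~OP_SEL_1;            // keep the full immediate; clear op_sel_hi as above
  }
  return {true, Mods, Imm};
}
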
@@ -195,13 +229,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
- if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
+ if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ unsigned NewOpc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
- MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
+ MI->setDesc(TII->get(NewOpc));
bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
if (FoldAsMAD) {
MI->untieRegOperand(OpNo);
@@ -345,6 +383,7 @@ void SIFoldOperands::foldOperand(
// Don't fold into target independent nodes. Target independent opcodes
// don't have defined register classes.
if (UseDesc.isVariadic() ||
+ UseOp.isImplicit() ||
UseDesc.OpInfo[UseOpIdx].RegClass == -1)
return;
}
@@ -470,7 +509,8 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
MachineOperand &Op) {
if (Op.isReg()) {
// If this has a subregister, it obviously is a register source.
- if (Op.getSubReg() != AMDGPU::NoSubRegister)
+ if (Op.getSubReg() != AMDGPU::NoSubRegister ||
+ !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
@@ -598,14 +638,14 @@ static bool tryFoldInst(const SIInstrInfo *TII,
const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
if (Src1->isIdenticalTo(*Src0)) {
- DEBUG(dbgs() << "Folded " << *MI << " into ");
+ LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx != -1)
MI->RemoveOperand(Src2Idx);
MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
: getMovOpc(false)));
- DEBUG(dbgs() << *MI << '\n');
+ LLVM_DEBUG(dbgs() << *MI << '\n');
return true;
}
}
@@ -646,7 +686,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// be folded due to multiple uses or operand constraints.
if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
- DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
+ LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
// Some constant folding cases change the same immediate's use to a new
// instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
@@ -713,8 +753,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// copies.
MRI->clearKillFlags(Fold.OpToFold->getReg());
}
- DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
- static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
+ << static_cast<int>(Fold.UseOpNo) << " of "
+ << *Fold.UseMI << '\n');
tryFoldInst(TII, Fold.UseMI);
} else if (Fold.isCommuted()) {
// Restoring instruction's original operand order if fold has failed.
@@ -794,7 +835,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
if (!DefClamp)
return false;
- DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
+ << '\n');
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
@@ -917,7 +959,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
return false;
- DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
DefOMod->setImm(OMod);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
@@ -930,7 +972,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
return false;
MRI = &MF.getRegInfo();
- ST = &MF.getSubtarget<SISubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
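
Both hunks above extend the MAC-to-MAD rewrite to V_FMAC_F32: when an operand is folded into src2, the instruction must first be rewritten to its three-address form, and the target opcode now depends on whether the source was a MAC or an FMAC. A compact restatement of that opcode choice with placeholder enumerators instead of the AMDGPU:: opcode constants:

// Placeholder opcode enumerators standing in for the AMDGPU:: constants.
enum Opcode {
  V_MAC_F32_e64, V_MAC_F16_e64, V_FMAC_F32_e64,
  V_MAD_F32, V_MAD_F16, V_FMA_F32,
  OTHER
};

// The three-address opcode a MAC/FMAC must become before src2 can be folded.
Opcode getMadEquivalent(Opcode Opc) {
  if (Opc != V_MAC_F32_e64 && Opc != V_MAC_F16_e64 && Opc != V_FMAC_F32_e64)
    return OTHER;
  bool IsFMA = Opc == V_FMAC_F32_e64;
  bool IsF32 = Opc == V_MAC_F32_e64;
  return IsFMA ? V_FMA_F32 : (IsF32 ? V_MAD_F32 : V_MAD_F16);
}
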
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
new file mode 100644
index 000000000000..cd14239de822
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -0,0 +1,398 @@
+//===-- SIFormMemoryClauses.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass creates bundles of SMEM and VMEM instructions forming memory
+/// clauses if XNACK is enabled. Def operands of clauses are marked as early
+/// clobber to make sure we will not override any source within a clause.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-form-memory-clauses"
+
+// Clauses longer than 15 instructions would overflow one of the counters
+// and stall. They can stall even earlier if there are outstanding counters.
+static cl::opt<unsigned>
+MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15),
+ cl::desc("Maximum length of a memory clause, instructions"));
+
+namespace {
+
+class SIFormMemoryClauses : public MachineFunctionPass {
+ typedef DenseMap<unsigned, std::pair<unsigned, LaneBitmask>> RegUse;
+
+public:
+ static char ID;
+
+public:
+ SIFormMemoryClauses() : MachineFunctionPass(ID) {
+ initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Form memory clauses";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ template <typename Callable>
+ void forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const;
+
+ bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
+ void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
+ GCNDownwardRPTracker &RPT);
+
+ const GCNSubtarget *ST;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ SIMachineFunctionInfo *MFI;
+
+ unsigned LastRecordedOccupancy;
+ unsigned MaxVGPRs;
+ unsigned MaxSGPRs;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE,
+ "SI Form memory clauses", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE,
+ "SI Form memory clauses", false, false)
+
+
+char SIFormMemoryClauses::ID = 0;
+
+char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID;
+
+FunctionPass *llvm::createSIFormMemoryClausesPass() {
+ return new SIFormMemoryClauses();
+}
+
+static bool isVMEMClauseInst(const MachineInstr &MI) {
+ return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
+}
+
+static bool isSMEMClauseInst(const MachineInstr &MI) {
+ return SIInstrInfo::isSMRD(MI);
+}
+
+// There is no sense in creating store clauses: they do not define anything,
+// thus there is nothing to set early-clobber.
+static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
+ if (MI.isDebugValue() || MI.isBundled())
+ return false;
+ if (!MI.mayLoad() || MI.mayStore())
+ return false;
+ if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
+ AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
+ return false;
+ if (IsVMEMClause && !isVMEMClauseInst(MI))
+ return false;
+ if (!IsVMEMClause && !isSMEMClauseInst(MI))
+ return false;
+ return true;
+}
+
+static unsigned getMopState(const MachineOperand &MO) {
+ unsigned S = 0;
+ if (MO.isImplicit())
+ S |= RegState::Implicit;
+ if (MO.isDead())
+ S |= RegState::Dead;
+ if (MO.isUndef())
+ S |= RegState::Undef;
+ if (MO.isKill())
+ S |= RegState::Kill;
+ if (MO.isEarlyClobber())
+ S |= RegState::EarlyClobber;
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
+ S |= RegState::Renamable;
+ return S;
+}
+
+template <typename Callable>
+void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
+ Callable Func) const {
+ if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
+ Func(0);
+ return;
+ }
+
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ unsigned E = TRI->getNumSubRegIndices();
+ SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs;
+ for (unsigned Idx = 1; Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LaneMask) {
+ Func(Idx);
+ return;
+ }
+
+ if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
+ continue;
+
+ CoveringSubregs.push_back(Idx);
+ }
+
+ llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(),
+ [this](unsigned A, unsigned B) {
+ LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
+ LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
+ unsigned NA = MaskA.getNumLanes();
+ unsigned NB = MaskB.getNumLanes();
+ if (NA != NB)
+ return NA > NB;
+ return MaskA.getHighestLane() > MaskB.getHighestLane();
+ });
+
+ for (unsigned Idx : CoveringSubregs) {
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
+ continue;
+
+ Func(Idx);
+ LaneMask &= ~SubRegMask;
+ if (LaneMask.none())
+ return;
+ }
+
+ llvm_unreachable("Failed to find all subregs to cover lane mask");
+}
+
+// Returns false if there is a use of a def already in the map.
+// In this case we must break the clause.
+bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses) const {
+ // Check interference with defs.
+ for (const MachineOperand &MO : MI.operands()) {
+ // TODO: Prologue/Epilogue Insertion pass does not process bundled
+ // instructions.
+ if (MO.isFI())
+ return false;
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ // If it is tied we will need to write the same register as we read.
+ if (MO.isTied())
+ return false;
+
+ RegUse &Map = MO.isDef() ? Uses : Defs;
+ auto Conflict = Map.find(Reg);
+ if (Conflict == Map.end())
+ continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+ if ((Conflict->second.second & Mask).any())
+ return false;
+ }
+
+ return true;
+}
+
+// Since all defs in the clause are early clobber we can run out of registers.
+// Function returns false if pressure would hit the limit if instruction is
+// bundled into a memory clause.
+bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
+ GCNDownwardRPTracker &RPT) {
+ // NB: skip advanceBeforeNext() call. Since all defs will be marked
+ // early-clobber they will all stay alive at least to the end of the
+ // clause. Therefore we should not decrease pressure even if the load
+ // pointer becomes dead and could otherwise be reused for the destination.
+ RPT.advanceToNext();
+ GCNRegPressure MaxPressure = RPT.moveMaxPressure();
+ unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+ if (Occupancy >= MFI->getMinAllowedOccupancy() &&
+ MaxPressure.getVGPRNum() <= MaxVGPRs &&
+ MaxPressure.getSGPRNum() <= MaxSGPRs) {
+ LastRecordedOccupancy = Occupancy;
+ return true;
+ }
+ return false;
+}
+
+// Collect register defs and uses along with their lane masks and states.
+void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses) const {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ?
+ TRI->getSubRegIndexLaneMask(MO.getSubReg()) :
+ LaneBitmask::getAll();
+ RegUse &Map = MO.isDef() ? Defs : Uses;
+
+ auto Loc = Map.find(Reg);
+ unsigned State = getMopState(MO);
+ if (Loc == Map.end()) {
+ Map[Reg] = std::make_pair(State, Mask);
+ } else {
+ Loc->second.first |= State;
+ Loc->second.second |= Mask;
+ }
+ }
+}
+
+// Check register def/use conflicts, occupancy limits and collect def/use maps.
+// Return true if the instruction can be bundled with the previous one. If it cannot,
+// def/use maps are not updated.
+bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
+ RegUse &Defs, RegUse &Uses,
+ GCNDownwardRPTracker &RPT) {
+ if (!canBundle(MI, Defs, Uses))
+ return false;
+
+ if (!checkPressure(MI, RPT))
+ return false;
+
+ collectRegUses(MI, Defs, Uses);
+ return true;
+}
+
+bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->isXNACKEnabled())
+ return false;
+
+ const SIInstrInfo *TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
+ LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ bool Changed = false;
+
+ MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
+ MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::instr_iterator Next;
+ for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ bool IsVMEM = isVMEMClauseInst(MI);
+
+ if (!isValidClauseInst(MI, IsVMEM))
+ continue;
+
+ RegUse Defs, Uses;
+ GCNDownwardRPTracker RPT(*LIS);
+ RPT.reset(MI);
+
+ if (!processRegUses(MI, Defs, Uses, RPT))
+ continue;
+
+ unsigned Length = 1;
+ for ( ; Next != E && Length < MaxClause; ++Next) {
+ if (!isValidClauseInst(*Next, IsVMEM))
+ break;
+
+ // A load from a pointer which was loaded inside the same bundle is an
+ // impossible clause because we will need to write and read the same
+ // register inside. In this case processRegUses will return false.
+ if (!processRegUses(*Next, Defs, Uses, RPT))
+ break;
+
+ ++Length;
+ }
+ if (Length < 2)
+ continue;
+
+ Changed = true;
+ MFI->limitOccupancy(LastRecordedOccupancy);
+
+ auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
+ Ind->insertMachineInstrInMaps(*B);
+
+ for (auto BI = I; BI != Next; ++BI) {
+ BI->bundleWithPred();
+ Ind->removeSingleMachineInstrFromMaps(*BI);
+
+ for (MachineOperand &MO : BI->defs())
+ if (MO.readsReg())
+ MO.setIsInternalRead(true);
+ }
+
+ for (auto &&R : Defs) {
+ forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
+ unsigned S = R.second.first | RegState::EarlyClobber;
+ if (!SubReg)
+ S &= ~(RegState::Undef | RegState::Dead);
+ B.addDef(R.first, S, SubReg);
+ });
+ }
+
+ for (auto &&R : Uses) {
+ forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
+ B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg);
+ });
+ }
+
+ for (auto &&R : Defs) {
+ unsigned Reg = R.first;
+ Uses.erase(Reg);
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+
+ for (auto &&R : Uses) {
+ unsigned Reg = R.first;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+ }
+
+ return Changed;
+}
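
At its core the new pass scans each block, starts a clause at the first valid load, and greedily extends it while the next instruction is still a valid clause member, register pressure allows it, and the clause stays under the counter limit (15 by default); clauses shorter than two instructions are dropped. A standalone model of that scan, ignoring the register-pressure and def/use checks:

#include <cstddef>
#include <utility>
#include <vector>

struct Instr { bool IsClauseLoad; }; // models isValidClauseInst(MI, IsVMEM)

// Greedily group adjacent clause-eligible loads into [begin, end) ranges of
// at least two instructions, capped at MaxClause members.
std::vector<std::pair<std::size_t, std::size_t>>
findClauses(const std::vector<Instr> &Block, unsigned MaxClause = 15) {
  std::vector<std::pair<std::size_t, std::size_t>> Clauses;
  for (std::size_t I = 0; I < Block.size();) {
    if (!Block[I].IsClauseLoad) { ++I; continue; }
    std::size_t Next = I + 1;
    unsigned Length = 1;
    while (Next < Block.size() && Length < MaxClause && Block[Next].IsClauseLoad) {
      ++Next;
      ++Length;
    }
    if (Length >= 2)
      Clauses.emplace_back(I, Next);
    I = Next;
  }
  return Clauses;
}
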
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 89bb98dbd028..ac0ef90f25a4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -12,7 +12,9 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -21,19 +23,19 @@
using namespace llvm;
-static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
+static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
ST.getMaxNumSGPRs(MF) / 4);
}
-static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
+static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
ST.getMaxNumSGPRs(MF));
}
-void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
+void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -96,7 +98,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
}
unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
@@ -147,7 +149,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
// SGPRs.
std::pair<unsigned, unsigned>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
@@ -218,7 +220,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
@@ -235,6 +237,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
// We need to do the replacement of the private segment buffer and wave offset
// register even if there are no stack objects. There could be stores to undef
@@ -286,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2(MF)) {
+ if (ST.isAmdCodeObjectV2(F)) {
PreloadedPrivateBufferReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
@@ -305,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
- assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
+ assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F));
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
@@ -330,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
bool CopyBuffer = ResourceRegUsed &&
PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
- ST.isAmdCodeObjectV2(MF) &&
+ ST.isAmdCodeObjectV2(F) &&
ScratchRsrcReg != PreloadedPrivateBufferReg;
// This needs to be careful of the copying order to avoid overwriting one of
@@ -361,13 +364,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
-void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
+void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
unsigned ScratchRsrcReg) const {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const Function &Fn = MF.getFunction();
DebugLoc DL;
if (ST.isAmdPalOS()) {
@@ -387,12 +391,27 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
}
+ auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+ if (ST.hasMergedShaders()) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+ // ES+GS merged shader on gfx9+.
+ GitPtrLo = AMDGPU::SGPR8;
+ break;
+ default:
+ break;
+ }
+ }
+ MF.getRegInfo().addLiveIn(GitPtrLo);
+ MF.front().addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
- .addReg(AMDGPU::SGPR0) // Low address passed in
+ .addReg(GitPtrLo)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
// We now have the GIT ptr - now get the scratch descriptor from the entry
- // at offset 0.
+ // at offset 0 (or offset 16 for a compute shader).
PointerType *PtrTy =
PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
AMDGPUAS::CONSTANT_ADDRESS);
@@ -403,17 +422,18 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
0, 0);
+ unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
- .addImm(0) // offset
+ .addImm(Offset) // offset
.addImm(0) // glc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
return;
}
- if (ST.isMesaGfxShader(MF)
+ if (ST.isMesaGfxShader(Fn)
|| (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
- assert(!ST.isAmdCodeObjectV2(MF));
+ assert(!ST.isAmdCodeObjectV2(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
@@ -474,17 +494,52 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
}
}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+ MachineFunction *MF = MBB.getParent();
+
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
if (FuncInfo->isEntryFunction()) {
emitEntryFunctionPrologue(MF, MBB);
return;
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -492,8 +547,34 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
+ // XXX - Is this the right predicate?
+
bool NeedFP = hasFP(MF);
- if (NeedFP) {
+ uint32_t NumBytes = MFI.getStackSize();
+ uint32_t RoundedSize = NumBytes;
+ const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+ if (NeedsRealignment) {
+ assert(NeedFP);
+ const unsigned Alignment = MFI.getMaxAlignment();
+
+ RoundedSize += Alignment;
+
+ unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+ assert(ScratchSPReg != AMDGPU::NoRegister);
+
+ // s_add_u32 tmp_reg, s32, NumBytes
+ // s_and_b32 s32, tmp_reg, 0b111...0000
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+ .addReg(StackPtrReg)
+ .addImm((Alignment - 1) * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+ .addReg(ScratchSPReg, RegState::Kill)
+ .addImm(-Alignment * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ FuncInfo->setIsStackRealigned(true);
+ } else if (NeedFP) {
// If we need a base pointer, set it up here. It's whatever the value of
// the stack pointer is at this point. Any variable size objects will be
// allocated after this, so we can still use the base pointer to reference
@@ -503,11 +584,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- uint32_t NumBytes = MFI.getStackSize();
- if (NumBytes != 0 && hasSP(MF)) {
+ if (RoundedSize != 0 && hasSP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
+ .addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
@@ -527,7 +607,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
if (FuncInfo->isEntryFunction())
return;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
@@ -553,10 +633,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
// it's really whether we need SP to be accurate or not.
if (NumBytes != 0 && hasSP(MF)) {
+ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+ NumBytes + MFI.getMaxAlignment() : NumBytes;
+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameDestroy);
+ .addImm(RoundedSize * ST.getWavefrontSize());
}
}
@@ -572,7 +654,7 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
- const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
FrameReg = RI->getFrameRegister(MF);
return MF.getFrameInfo().getObjectOffset(FI);
@@ -586,7 +668,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!MFI.hasStackObjects())
return;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -611,6 +693,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (TII->isSGPRSpill(MI)) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+ assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
(void)Spilled;
@@ -667,7 +750,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
if (Amount == 0)
return MBB.erase(I);
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
@@ -696,7 +779,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -746,7 +829,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
}
bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
// All stack operations are relative to the frame offset SGPR.
const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.hasCalls() || MFI.hasVarSizedObjects();
+ return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
}
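
The realignment code added to emitPrologue rounds the stack pointer up with an s_add_u32/s_and_b32 pair, with both the pad and the mask scaled by the wavefront size because SI scratch offsets are per-lane. The same arithmetic in plain C++, assuming a power-of-two Alignment as LLVM frame alignments always are:

#include <cassert>
#include <cstdint>

// Mirrors the prologue sequence:
//   s_add_u32 tmp, sp, (Alignment - 1) * WavefrontSize
//   s_and_b32 fp,  tmp, -(Alignment * WavefrontSize)
uint32_t realignFramePointer(uint32_t StackPtr, uint32_t Alignment,
                             uint32_t WavefrontSize) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 &&
         "alignment must be a power of two");
  uint32_t Tmp = StackPtr + (Alignment - 1) * WavefrontSize;
  return Tmp & ~(Alignment * WavefrontSize - 1);
}
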
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index df6f1632a316..2f35b3631cdc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -17,7 +17,7 @@ namespace llvm {
class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
-class SISubtarget;
+class GCNSubtarget;
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
@@ -48,29 +48,29 @@ public:
MachineBasicBlock::iterator MI) const override;
private:
- void emitFlatScratchInit(const SISubtarget &ST,
+ void emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const;
unsigned getReservedPrivateSegmentBufferReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
- const SISubtarget &ST,
+ const GCNSubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
- /// \brief Emits debugger prologue.
+ /// Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
- void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF,
+ void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
unsigned ScratchRsrcReg) const;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 41ca7fe8bfaa..5b7fc2656a20 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Custom DAG lowering for SI
+/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
@@ -26,6 +26,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -49,7 +50,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -73,6 +73,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
@@ -111,8 +112,9 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) {
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
- const SISubtarget &STI)
- : AMDGPUTargetLowering(TM, STI) {
+ const GCNSubtarget &STI)
+ : AMDGPUTargetLowering(TM, STI),
+ Subtarget(&STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
@@ -138,14 +140,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
- }
- if (Subtarget->hasVOP3PInsts()) {
+ // Unless there are also VOP3P operations, not all operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
- computeRegisterProperties(STI.getRegisterInfo());
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -173,7 +176,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -208,11 +210,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -232,13 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif
- //setOperationAction(ISD::ADDC, MVT::i64, Expand);
- //setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64}) {
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -261,6 +263,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
+
// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
// is expanded to avoid having two separate loops in case the index is a VGPR.
@@ -285,12 +289,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
@@ -302,7 +324,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
- if (getSubtarget()->hasFlatAddressSpace()) {
+ if (Subtarget->hasFlatAddressSpace()) {
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
@@ -315,13 +337,56 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Custom);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::FLOG, MVT::f16, Custom);
+ setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+ }
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
- if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ } else {
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
}
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
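The fcopysign note in the hunk above ("can be done in a single instruction with BFI") is plain sign-bit merging. A minimal host-side sketch of the same bit manipulation for binary32, assuming the usual IEEE layout (an illustration, not the in-tree expansion):

#include <cstdint>
#include <cstring>

// copysign(X, Y): keep the magnitude bits of X, take only the sign bit of Y.
// On subtargets with BFI this whole merge is a single v_bfi_b32.
static float copySignF32(float X, float Y) {
  uint32_t XBits, YBits;
  std::memcpy(&XBits, &X, sizeof(XBits));
  std::memcpy(&YBits, &Y, sizeof(YBits));
  uint32_t Merged = (XBits & 0x7fffffffu) | (YBits & 0x80000000u);
  float Result;
  std::memcpy(&Result, &Merged, sizeof(Result));
  return Result;
}

Without BFI the hunk marks ISD::FCOPYSIGN as Expand instead, since the merge then takes several ALU instructions.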
@@ -408,10 +473,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f16, Legal);
if (!Subtarget->hasFP16Denormals())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- }
- if (Subtarget->hasVOP3PInsts()) {
- for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -438,6 +501,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
+
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
setOperationAction(ISD::STORE, MVT::v2f16, Promote);
@@ -454,11 +520,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+
+ if (!Subtarget->hasVOP3PInsts()) {
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+ // This isn't really legal, but this avoids the legalizer unrolling it (and
+ // allows matching fneg (fabs x) patterns)
+ setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ }
+
+ if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
@@ -471,26 +564,51 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
- setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
-
- // This isn't really legal, but this avoids the legalizer unrolling it (and
- // allows matching fneg (fabs x) patterns)
- setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+ setOperationAction(ISD::SHL, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v4i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
} else {
+ // Legalization hack.
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
@@ -505,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
@@ -542,16 +661,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setSchedulingPreference(Sched::RegPressure);
+
+ // SI at least has hardware support for floating point exceptions, but no way
+ // of using or handling them is implemented. They are also optional in OpenCL
+ // (Section 7.3)
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}
-const SISubtarget *SITargetLowering::getSubtarget() const {
- return static_cast<const SISubtarget *>(Subtarget);
+const GCNSubtarget *SITargetLowering::getSubtarget() const {
+ return Subtarget;
}
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
+// v_mad_mix* support a conversion from f16 to f32.
+//
+// There is only one special case, when denormals are enabled, where this would
+// still be OK to use, but we don't currently handle it.
+bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
+ EVT DestVT, EVT SrcVT) const {
+ return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
+ SrcVT.getScalarType() == MVT::f16;
+}
+
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
// shuffles are legal in order to prefer scalarizing some vector operations.
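To make the isFPExtFoldable change above concrete: it lets the generic combiner keep an f16-to-f32 extension attached to a multiply-add, e.g.

  (fma (fpext f16:a), (fpext f16:b), f32:c)

so instruction selection can later form a single mixed-precision v_mad_mix/v_fma_mix operation instead of separate conversions. The fold is only reported when the destination scalar type is f32, the source is f16, and f32 denormals are disabled, exactly as the predicate in the hunk spells out.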
@@ -562,9 +698,55 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
unsigned IntrID) const {
+ if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
+ (Intrinsic::ID)IntrID);
+ if (Attr.hasFnAttribute(Attribute::ReadNone))
+ return false;
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (RsrcIntr->IsImage) {
+ Info.ptrVal = MFI->getImagePSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.align = 0;
+ } else {
+ Info.ptrVal = MFI->getBufferPSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ }
+
+ Info.flags = MachineMemOperand::MODereferenceable;
+ if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags |= MachineMemOperand::MOLoad;
+ } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
+ Info.flags |= MachineMemOperand::MOStore;
+ } else {
+ // Atomic
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable;
+
+ // XXX - Should this be volatile without known ordering?
+ Info.flags |= MachineMemOperand::MOVolatile;
+ }
+ return true;
+ }
+
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
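The generic path added above classifies a resource intrinsic's memory behaviour from its function attributes instead of a per-intrinsic switch. A compact restatement of that mapping as a standalone helper (hypothetical names, not LLVM API):

enum class RsrcMemKind { None, Load, Store, Atomic };

// ReadNone  -> not a memory intrinsic (rejected before this point).
// ReadOnly  -> chained load  (INTRINSIC_W_CHAIN, MOLoad).
// WriteOnly -> store         (INTRINSIC_VOID, MOStore).
// otherwise -> atomic        (MOLoad | MOStore, conservatively volatile).
static RsrcMemKind classifyRsrcIntrinsic(bool ReadNone, bool ReadOnly,
                                         bool WriteOnly) {
  if (ReadNone)
    return RsrcMemKind::None;
  if (ReadOnly)
    return RsrcMemKind::Load;
  if (WriteOnly)
    return RsrcMemKind::Store;
  return RsrcMemKind::Atomic;
}

Every classified intrinsic also gets MODereferenceable, and image intrinsics additionally set Info.align = 0.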
@@ -578,220 +760,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- // Image load.
- case Intrinsic::amdgcn_image_load:
- case Intrinsic::amdgcn_image_load_mip:
-
- // Sample.
- case Intrinsic::amdgcn_image_sample:
- case Intrinsic::amdgcn_image_sample_cl:
- case Intrinsic::amdgcn_image_sample_d:
- case Intrinsic::amdgcn_image_sample_d_cl:
- case Intrinsic::amdgcn_image_sample_l:
- case Intrinsic::amdgcn_image_sample_b:
- case Intrinsic::amdgcn_image_sample_b_cl:
- case Intrinsic::amdgcn_image_sample_lz:
- case Intrinsic::amdgcn_image_sample_cd:
- case Intrinsic::amdgcn_image_sample_cd_cl:
-
- // Sample with comparison.
- case Intrinsic::amdgcn_image_sample_c:
- case Intrinsic::amdgcn_image_sample_c_cl:
- case Intrinsic::amdgcn_image_sample_c_d:
- case Intrinsic::amdgcn_image_sample_c_d_cl:
- case Intrinsic::amdgcn_image_sample_c_l:
- case Intrinsic::amdgcn_image_sample_c_b:
- case Intrinsic::amdgcn_image_sample_c_b_cl:
- case Intrinsic::amdgcn_image_sample_c_lz:
- case Intrinsic::amdgcn_image_sample_c_cd:
- case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
- // Sample with offsets.
- case Intrinsic::amdgcn_image_sample_o:
- case Intrinsic::amdgcn_image_sample_cl_o:
- case Intrinsic::amdgcn_image_sample_d_o:
- case Intrinsic::amdgcn_image_sample_d_cl_o:
- case Intrinsic::amdgcn_image_sample_l_o:
- case Intrinsic::amdgcn_image_sample_b_o:
- case Intrinsic::amdgcn_image_sample_b_cl_o:
- case Intrinsic::amdgcn_image_sample_lz_o:
- case Intrinsic::amdgcn_image_sample_cd_o:
- case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
- // Sample with comparison and offsets.
- case Intrinsic::amdgcn_image_sample_c_o:
- case Intrinsic::amdgcn_image_sample_c_cl_o:
- case Intrinsic::amdgcn_image_sample_c_d_o:
- case Intrinsic::amdgcn_image_sample_c_d_cl_o:
- case Intrinsic::amdgcn_image_sample_c_l_o:
- case Intrinsic::amdgcn_image_sample_c_b_o:
- case Intrinsic::amdgcn_image_sample_c_b_cl_o:
- case Intrinsic::amdgcn_image_sample_c_lz_o:
- case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
-
- // Basic gather4
- case Intrinsic::amdgcn_image_gather4:
- case Intrinsic::amdgcn_image_gather4_cl:
- case Intrinsic::amdgcn_image_gather4_l:
- case Intrinsic::amdgcn_image_gather4_b:
- case Intrinsic::amdgcn_image_gather4_b_cl:
- case Intrinsic::amdgcn_image_gather4_lz:
-
- // Gather4 with comparison
- case Intrinsic::amdgcn_image_gather4_c:
- case Intrinsic::amdgcn_image_gather4_c_cl:
- case Intrinsic::amdgcn_image_gather4_c_l:
- case Intrinsic::amdgcn_image_gather4_c_b:
- case Intrinsic::amdgcn_image_gather4_c_b_cl:
- case Intrinsic::amdgcn_image_gather4_c_lz:
-
- // Gather4 with offsets
- case Intrinsic::amdgcn_image_gather4_o:
- case Intrinsic::amdgcn_image_gather4_cl_o:
- case Intrinsic::amdgcn_image_gather4_l_o:
- case Intrinsic::amdgcn_image_gather4_b_o:
- case Intrinsic::amdgcn_image_gather4_b_cl_o:
- case Intrinsic::amdgcn_image_gather4_lz_o:
-
- // Gather4 with comparison and offsets
- case Intrinsic::amdgcn_image_gather4_c_o:
- case Intrinsic::amdgcn_image_gather4_c_cl_o:
- case Intrinsic::amdgcn_image_gather4_c_l_o:
- case Intrinsic::amdgcn_image_gather4_c_b_o:
- case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
- case Intrinsic::amdgcn_image_gather4_c_lz_o: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = MFI->getImagePSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(1));
- Info.align = 0;
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable;
- return true;
- }
- case Intrinsic::amdgcn_image_store:
- case Intrinsic::amdgcn_image_store_mip: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
- Info.ptrVal = MFI->getImagePSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(2));
- Info.flags = MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable;
- Info.align = 0;
- return true;
- }
- case Intrinsic::amdgcn_image_atomic_swap:
- case Intrinsic::amdgcn_image_atomic_add:
- case Intrinsic::amdgcn_image_atomic_sub:
- case Intrinsic::amdgcn_image_atomic_smin:
- case Intrinsic::amdgcn_image_atomic_umin:
- case Intrinsic::amdgcn_image_atomic_smax:
- case Intrinsic::amdgcn_image_atomic_umax:
- case Intrinsic::amdgcn_image_atomic_and:
- case Intrinsic::amdgcn_image_atomic_or:
- case Intrinsic::amdgcn_image_atomic_xor:
- case Intrinsic::amdgcn_image_atomic_inc:
- case Intrinsic::amdgcn_image_atomic_dec: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = MFI->getImagePSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(2));
-
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable;
-
- // XXX - Should this be volatile without known ordering?
- Info.flags |= MachineMemOperand::MOVolatile;
- return true;
- }
- case Intrinsic::amdgcn_image_atomic_cmpswap: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = MFI->getImagePSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(3));
-
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable;
-
- // XXX - Should this be volatile without known ordering?
- Info.flags |= MachineMemOperand::MOVolatile;
- return true;
- }
- case Intrinsic::amdgcn_tbuffer_load:
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(0));
- Info.memVT = MVT::getVT(CI.getType());
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable;
-
- // There is a constant offset component, but there are additional register
- // offsets which could break AA if we set the offset to anything non-0.
- return true;
- }
- case Intrinsic::amdgcn_tbuffer_store:
- case Intrinsic::amdgcn_buffer_store:
- case Intrinsic::amdgcn_buffer_store_format: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_VOID;
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(1));
- Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
- Info.flags = MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable;
- return true;
- }
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(1));
- Info.memVT = MVT::getVT(CI.getType());
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile;
- return true;
- }
- case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<SISubtarget>().getInstrInfo(),
- CI.getArgOperand(2));
- Info.memVT = MVT::getVT(CI.getType());
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile;
- return true;
- }
default:
return false;
}
@@ -802,7 +770,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
Type *&AccessTy) const {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
Ops.push_back(Ptr);
@@ -892,7 +863,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
return isLegalGlobalAddressingMode(AM);
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -903,19 +875,19 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// will use a MUBUF load.
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
- if (DL.getTypeStoreSize(Ty) < 4)
+ if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
return isLegalGlobalAddressingMode(AM);
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
if (!isUInt<8>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
// On CI+, this can also be a 32-bit literal constant offset. If it fits
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -1015,7 +987,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
- *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
+ *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
+ AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
(Align % 4 == 0) : true;
}
@@ -1058,7 +1031,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
return AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS;
+ AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -1070,7 +1044,7 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
- const Instruction *I = dyn_cast<Instruction>(Ptr);
+ const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.noclobber");
}
@@ -1149,14 +1123,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Offset, SL, PtrVT));
+ return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
const SDLoc &SL) const {
- auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
- uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
+ FIRST_IMPLICIT);
return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}
@@ -1183,18 +1156,42 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, bool Signed,
+ uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
- const DataLayout &DL = DAG.getDataLayout();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- unsigned Align = DL.getABITypeAlignment(Ty);
+ // Try to avoid using an extload by loading earlier than the argument address,
+ // and extracting the relevant bits. The load should hopefully be merged with
+ // the previous argument.
+ if (MemVT.getStoreSize() < 4 && Align < 4) {
+ // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
+ int64_t AlignDownOffset = alignDown(Offset, 4);
+ int64_t OffsetDiff = Offset - AlignDownOffset;
+
+ EVT IntVT = MemVT.changeTypeToInteger();
+
+ // TODO: If we passed in the base kernel offset we could have a better
+ // alignment than 4, but we don't really need it.
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+
+ SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
+
+ SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
+ ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
+ ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
+
+
+ return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
+ }
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
- MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
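The extload-avoidance path added here is just an aligned dword load plus a shift. A worked host-side model of the arithmetic, assuming a 1-byte argument at byte offset 38 of the kernarg segment (illustrative numbers only, not the DAG nodes themselves):

#include <cstdint>
#include <cstring>

// Extract a narrow (store size < 4) kernel argument from the dword containing it.
// Mirrors alignDown(Offset, 4), OffsetDiff = Offset - AlignDownOffset, then an
// SRL by OffsetDiff * 8 and a truncate.
static uint8_t extractByteArg(const uint8_t *KernArgBase, uint64_t Offset) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // 38 -> 36
  uint64_t OffsetDiff = Offset - AlignDownOffset;   // 2
  uint32_t Dword;
  std::memcpy(&Dword, KernArgBase + AlignDownOffset, sizeof(Dword)); // one aligned i32 load
  return static_cast<uint8_t>(Dword >> (OffsetDiff * 8));            // shift + truncate
}

Because the load is dword-sized and dword-aligned, it can usually be merged with the load of the neighbouring argument, which is what the comment in the hunk is after.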
@@ -1269,36 +1266,51 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
FunctionType *FType,
SIMachineFunctionInfo *Info) {
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
- const ISD::InputArg &Arg = Ins[I];
+ const ISD::InputArg *Arg = &Ins[I];
// First check if it's a PS input addr.
- if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
- !Arg.Flags.isByVal() && PSInputNum <= 15) {
+ if (CallConv == CallingConv::AMDGPU_PS &&
+ !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
+
+ bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
+
+ // Inconveniently only the first part of the split is marked as isSplit,
+ // so skip to the end. We only want to increment PSInputNum once for the
+ // entire split argument.
+ if (Arg->Flags.isSplit()) {
+ while (!Arg->Flags.isSplitEnd()) {
+ assert(!Arg->VT.isVector() &&
+ "unexpected vector split in ps argument type");
+ if (!SkipArg)
+ Splits.push_back(*Arg);
+ Arg = &Ins[++I];
+ }
+ }
- if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
+ if (SkipArg) {
// We can safely skip PS inputs.
- Skipped.set(I);
+ Skipped.set(Arg->getOrigArgIndex());
++PSInputNum;
continue;
}
Info->markPSInputAllocated(PSInputNum);
- if (Arg.Used)
+ if (Arg->Used)
Info->markPSInputEnabled(PSInputNum);
++PSInputNum;
}
// Second split vertices into their elements.
- if (Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
+ if (Arg->VT.isVector()) {
+ ISD::InputArg NewArg = *Arg;
NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
+ NewArg.VT = Arg->VT.getVectorElementType();
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
// NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
for (unsigned J = 0; J != NumElements; ++J) {
@@ -1306,7 +1318,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
NewArg.PartOffset += NewArg.VT.getStoreSize();
}
} else {
- Splits.push_back(Arg);
+ Splits.push_back(*Arg);
}
}
}
@@ -1564,8 +1576,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (ST.isAmdCodeObjectV2(MF)) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.isAmdCodeObjectV2(MF.getFunction())) {
if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -1677,12 +1689,12 @@ SDValue SITargetLowering::LowerFormalArguments(
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
+ const Function &Fn = MF.getFunction();
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
@@ -1779,9 +1791,16 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ // FIXME: This is the minimum kernel argument alignment. We should improve
+ // this to the maximum alignment of the arguments.
+ //
+ // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
+ // kern arg offset.
+ const unsigned KernelArgBaseAlign = 16;
+
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
- if (Skipped[i]) {
+ if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
continue;
}
@@ -1793,19 +1812,16 @@ SDValue SITargetLowering::LowerFormalArguments(
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
- const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
- VA.getLocMemOffset();
- Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+ const uint64_t Offset = VA.getLocMemOffset();
+ unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
- // The first 36 bytes of the input buffer contains information about
- // thread group and global sizes.
SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+ DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
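As a worked instance of the MinAlign(KernelArgBaseAlign, Offset) computation in this hunk: with the assumed 16-byte kernarg base alignment, an argument at byte offset 10 is only guaranteed 2-byte alignment (the lowest set bit of 16 | 10), which is precisely the sub-dword case the extraction path in lowerKernargMemParameter handles.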
@@ -1913,7 +1929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
+ ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
unsigned StackArgSize = CCInfo.getNextStackOffset();
Info->setBytesInStackArgArea(StackArgSize);
@@ -2058,8 +2074,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// FIXME: Does sret work properly?
if (!Info->isEntryFunction()) {
- const SIRegisterInfo *TRI
- = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
@@ -2161,8 +2176,7 @@ void SITargetLowering::passSpecialInputs(
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
- const SISubtarget *ST = getSubtarget();
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
@@ -2355,6 +2369,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported required tail call to function ");
}
+ if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
+ // Note the issue is with the CC of the calling function, not of the call
+ // itself.
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported call from graphics shader of function ");
+ }
+
// The first 4 bytes are reserved for the callee's emergency stack slot.
const unsigned CalleeUsableStackOffset = 4;
@@ -2600,7 +2621,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add a register mask operand representing the call-preserved registers.
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -2660,7 +2681,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
}
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
report_fatal_error(Twine("invalid register \""
+ StringRef(RegName) + "\" for subtarget."));
@@ -2734,7 +2755,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
unsigned PhiReg,
unsigned InitSaveExecReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -2763,6 +2785,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg)
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+ // Update EXEC, save the original EXEC value to VCC.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ MRI.setSimpleHint(NewExec, CondReg);
+
if (UseGPRIdxMode) {
unsigned IdxReg;
if (Offset == 0) {
@@ -2773,11 +2801,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg, RegState::Kill)
.addImm(Offset);
}
-
- MachineInstr *SetIdx =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
- .addReg(IdxReg, RegState::Kill);
- SetIdx->getOperand(2).setIsUndef();
+ unsigned IdxMode = IsIndirectSrc ?
+ VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ MachineInstr *SetOn =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(IdxReg, RegState::Kill)
+ .addImm(IdxMode);
+ SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
@@ -2790,12 +2820,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
}
- // Update EXEC, save the original EXEC value to VCC.
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
- .addReg(CondReg, RegState::Kill);
-
- MRI.setSimpleHint(NewExec, CondReg);
-
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
MachineInstr *InsertPt =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
@@ -2823,7 +2847,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
unsigned InitResultReg,
unsigned PhiReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2862,7 +2887,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, IsIndirectSrc);
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -2947,7 +2972,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -2997,17 +3022,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
- if (UseGPRIdxMode) {
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::SRC0_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
+ Offset, UseGPRIdxMode, true);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -3015,6 +3031,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
.addReg(SrcReg, RegState::Undef, SubReg)
.addReg(SrcReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SrcReg, RegState::Undef, SubReg)
@@ -3046,7 +3063,7 @@ static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -3115,22 +3132,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
- if (UseGPRIdxMode) {
- MachineBasicBlock::iterator I(&MI);
-
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::DST_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -3140,6 +3145,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
.addReg(Dst, RegState::ImplicitDefine)
.addReg(PhiReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
@@ -3350,8 +3356,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
+
+ // Add an implicit use of the frame offset reg to prevent the restore copy
+ // inserted after the call from being reordered after stack operations in
+ // the caller's frame.
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
- .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
+ .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL_ISEL:
@@ -3441,12 +3452,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f32:
+ case MVT::f32: {
// This is as fast on some subtargets. However, we always have full rate f32
// mad available which returns the same result as the separate operations
// which we should prefer over fma. We can't use this if we want to support
// denormals, so only report this in these cases.
- return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+ if (Subtarget->hasFP32Denormals())
+ return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
+
+ // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
+ return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
+ }
case MVT::f64:
return true;
case MVT::f16:
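Restating the new f32 case above: when f32 denormals are enabled (so the full-rate mad path cannot be used anyway), FMA formation is reported as profitable if the subtarget has either fast f32 FMA or the DL instructions; when denormals are disabled, both are required, since otherwise v_mac_f32 (or v_fmac_f32, which the comment notes is just as good) already gives the same result.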
@@ -3462,6 +3478,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4f16);
+
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+
+ SDLoc SL(Op);
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+ SDValue Lo0, Hi0;
+ std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Lo1, Hi1;
+ std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+
+ SDLoc SL(Op);
+
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
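The split helpers above keep LegalizeDAG from scalarizing a v4 operation into four scalar ops. Written as a DAG rewrite, the binary case is:

  (fadd v4f16:a, v4f16:b)
    -> (concat_vectors (fadd v2f16:a_lo, v2f16:b_lo),
                       (fadd v2f16:a_hi, v2f16:b_hi))

so each half stays a legal packed v2f16/v2i16 operation and can still be selected to the subtarget's packed (VOP3P) instructions; the unary helper does the same for FNEG/FABS on v4f16.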
@@ -3494,15 +3553,105 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
case ISD::TRAP:
- case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);
+ case ISD::DEBUGTRAP:
+ return lowerDEBUGTRAP(Op, DAG);
+ case ISD::FABS:
+ case ISD::FNEG:
+ return splitUnaryVectorOp(Op, DAG);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FADD:
+ case ISD::FMUL:
+ return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
}
+static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
+ const SDLoc &DL,
+ SelectionDAG &DAG, bool Unpacked) {
+ if (!LoadVT.isVector())
+ return Result;
+
+ if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
+ // Truncate to v2i16/v4i16.
+ EVT IntLoadVT = LoadVT.changeTypeToInteger();
+
+ // Work around the legalizer not scalarizing truncate after vector op
+ // legalization by not creating an intermediate vector trunc.
+ SmallVector<SDValue, 4> Elts;
+ DAG.ExtractVectorElements(Result, Elts);
+ for (SDValue &Elt : Elts)
+ Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+
+ Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
+
+ // Bitcast to original type (v2f16/v4f16).
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ }
+
+ // Cast back to the original packed type.
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+}
+
+SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
+ MemSDNode *M,
+ SelectionDAG &DAG,
+ bool IsIntrinsic) const {
+ SDLoc DL(M);
+ SmallVector<SDValue, 10> Ops;
+ Ops.reserve(M->getNumOperands());
+
+ Ops.push_back(M->getOperand(0));
+ if (IsIntrinsic)
+ Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
+
+ // Skip 1, as it is the intrinsic ID.
+ for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
+ Ops.push_back(M->getOperand(I));
+
+ bool Unpacked = Subtarget->hasUnpackedD16VMem();
+ EVT LoadVT = M->getValueType(0);
+
+ EVT EquivLoadVT = LoadVT;
+ if (Unpacked && LoadVT.isVector()) {
+ EquivLoadVT = LoadVT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements()) : LoadVT;
+ }
+
+ // Change from v4f16/v2f16 to EquivLoadVT.
+ SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
+
+ SDValue Load
+ = DAG.getMemIntrinsicNode(
+ IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+ VTList, Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ if (!Unpacked) // Just adjusted the opcode.
+ return Load;
+
+ SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
+
+ return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
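On subtargets with unpacked d16 memory instructions, adjustLoadValueType above gets one 32-bit lane back per f16 element and has to repack before the bitcast to the original type. A minimal host-side model of that truncate-and-repack step, assuming each element lives in the low 16 bits of its lane (illustrative only):

#include <array>
#include <cstddef>
#include <cstdint>

// v4i32 (unpacked d16 load result) -> v4i16, one truncate per element, matching
// the per-element TRUNCATE loop in adjustLoadValueTypeImpl.
static std::array<uint16_t, 4>
repackUnpackedD16(const std::array<uint32_t, 4> &Lanes) {
  std::array<uint16_t, 4> Packed{};
  for (std::size_t I = 0; I != Lanes.size(); ++I)
    Packed[I] = static_cast<uint16_t>(Lanes[I]);
  return Packed;
}

On packed-d16 subtargets the loaded bits are already in v2f16/v4f16 shape, so only the final bitcast is needed.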
@@ -3554,6 +3703,15 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
break;
}
+ case ISD::INTRINSIC_W_CHAIN: {
+ if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ break;
+ }
case ISD::SELECT: {
SDLoc SL(N);
EVT VT = N->getValueType(0);
@@ -3576,12 +3734,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
return;
}
+ case ISD::FNEG: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x80008000, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
+ case ISD::FABS: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x7fff7fff, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
default:
break;
}
}
-/// \brief Helper function for LowerBRCOND
+/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDNode *Parent = Value.getNode();
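The v2f16 FNEG/FABS results added in the hunk above exploit the fact that both halves share one 32-bit register, so sign handling is a single scalar bit operation. A host-side sketch of the same bit patterns, treating the packed pair as a raw 32-bit value:

#include <cstdint>

// Negate both f16 halves: flip each half's sign bit (the XOR in the FNEG case).
static uint32_t fnegV2F16Bits(uint32_t PackedPair) {
  return PackedPair ^ 0x80008000u;
}

// Absolute value of both halves: clear each half's sign bit (the AND in FABS).
static uint32_t fabsV2F16Bits(uint32_t PackedPair) {
  return PackedPair & 0x7fff7fffu;
}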
@@ -3646,13 +3830,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+ return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -3789,40 +3975,37 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
- unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
- SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
-
- if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
- Subtarget->isTrapHandlerEnabled()) {
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
-
- SDValue QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
-
- SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
-
- SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
- QueuePtr, SDValue());
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled())
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- SDValue Ops[] = {
- ToReg,
- DAG.getTargetConstant(TrapID, SL, MVT::i16),
- SGPR01,
- ToReg.getValue(1)
- };
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
+ SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
+ QueuePtr, SDValue());
+ SDValue Ops[] = {
+ ToReg,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
+ SGPR01,
+ ToReg.getValue(1)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+}
- return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
- }
+SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
+ MachineFunction &MF = DAG.getMachineFunction();
- switch (TrapID) {
- case SISubtarget::TrapIDLLVMTrap:
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- case SISubtarget::TrapIDLLVMDebugTrap: {
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled()) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
@@ -3831,11 +4014,12 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Ctx.diagnose(NoTrap);
return Chain;
}
- default:
- llvm_unreachable("unsupported trap handler type!");
- }
- return Chain;
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -3948,34 +4132,78 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDValue InsVal = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+
+ assert(VecSize <= 64);
+
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDLoc SL(Op);
+ auto KIdx = dyn_cast<ConstantSDNode>(Idx);
+
+ if (NumElts == 4 && EltSize == 16 && KIdx) {
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
+
+ SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(0, SL, MVT::i32));
+ SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(1, SL, MVT::i32));
+
+ SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
+ SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
+
+ unsigned Idx = KIdx->getZExtValue();
+ bool InsertLo = Idx < 2;
+ SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
+ InsertLo ? LoVec : HiVec,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
+ DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
+
+ InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
+
+ SDValue Concat = InsertLo ?
+ DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
+ DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
+
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
+ }
+
if (isa<ConstantSDNode>(Idx))
return SDValue();
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+
// Avoid stack access for dynamic indexing.
- SDLoc SL(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+ SDValue Val = InsVal;
+ if (InsVal.getValueType() == MVT::f16)
+ Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+ SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
- DAG.getConstant(16, SL, MVT::i32));
+ assert(isPowerOf2_32(EltSize));
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ // Convert vector index to bit-index.
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
- DAG.getConstant(0xffff, SL, MVT::i32),
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
+ DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
- SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
- SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
- DAG.getNOT(SL, BFM, MVT::i32), BCVec);
+ SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+ SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
+ DAG.getNOT(SL, BFM, IntVT), BCVec);
- SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
- return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
+ SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -3985,51 +4213,87 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
+ EVT VecVT = Vec.getValueType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ EVT EltVT = VecVT.getVectorElementType();
+ assert(VecSize <= 64);
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
- // Make sure we we do any optimizations that will make it easier to fold
+ // Make sure we do any optimizations that will make it easier to fold
// source modifiers before obscuring it with bit operations.
// XXX - Why doesn't this get called when vector_shuffle is expanded?
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
- SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ unsigned EltSize = EltVT.getSizeInBits();
+ assert(isPowerOf2_32(EltSize));
- if (CIdx->getZExtValue() == 1) {
- Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
- DAG.getConstant(16, SL, MVT::i32));
- } else {
- assert(CIdx->getZExtValue() == 0);
- }
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
+ // Convert vector index to bit-index (* EltSize)
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
+
+ if (ResultVT == MVT::f16) {
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}
- SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
+ return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
+}
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
+SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
- SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
+ // Turn into pair of packed build_vectors.
+ // TODO: Special case for constants that can be materialized with s_mov_b64.
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(0), Op.getOperand(1) });
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(2), Op.getOperand(3) });
- SDValue Result = Elt;
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
- return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
+
+ assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+
+ Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
+
+ SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
+ DAG.getConstant(16, SL, MVT::i32));
+
+ SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+
+ return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
@@ -4082,6 +4346,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
// FIXME: It isn't correct to rely on the type of the pointer. This should
// be removed when address space 0 is 64-bit.
@@ -4134,7 +4399,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
unsigned Offset) const {
SDLoc SL(Op);
SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, false);
+ DAG.getEntryNode(), Offset, 4, false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
@@ -4158,6 +4423,245 @@ static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getUNDEF(VT);
}
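+// Build the smallest legal vector of f32 dwords that holds Elts: the element
+// count is rounded up to 1, 2, 4, 8 or 16, each element is bitcast to f32, and
+// unused lanes are padded with undef.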
+static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
+ ArrayRef<SDValue> Elts) {
+ assert(!Elts.empty());
+ MVT Type;
+ unsigned NumElts;
+
+ if (Elts.size() == 1) {
+ Type = MVT::f32;
+ NumElts = 1;
+ } else if (Elts.size() == 2) {
+ Type = MVT::v2f32;
+ NumElts = 2;
+ } else if (Elts.size() <= 4) {
+ Type = MVT::v4f32;
+ NumElts = 4;
+ } else if (Elts.size() <= 8) {
+ Type = MVT::v8f32;
+ NumElts = 8;
+ } else {
+ assert(Elts.size() <= 16);
+ Type = MVT::v16f32;
+ NumElts = 16;
+ }
+
+ SmallVector<SDValue, 16> VecElts(NumElts);
+ for (unsigned i = 0; i < Elts.size(); ++i) {
+ SDValue Elt = Elts[i];
+ if (Elt.getValueType() != MVT::f32)
+ Elt = DAG.getBitcast(MVT::f32, Elt);
+ VecElts[i] = Elt;
+ }
+ for (unsigned i = Elts.size(); i < NumElts; ++i)
+ VecElts[i] = DAG.getUNDEF(MVT::f32);
+
+ if (NumElts == 1)
+ return VecElts[0];
+ return DAG.getBuildVector(Type, DL, VecElts);
+}
+
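+// Decode an immediate cache-policy operand into glc (bit 0) and slc (bit 1)
+// target constants. Returns false if the operand is not a constant or has bits
+// set that were not requested.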
+static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
+ SDValue *GLC, SDValue *SLC) {
+ auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
+ if (!CachePolicyConst)
+ return false;
+
+ uint64_t Value = CachePolicyConst->getZExtValue();
+ SDLoc DL(CachePolicy);
+ if (GLC) {
+ *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x1;
+ }
+ if (SLC) {
+ *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x2;
+ }
+
+ return Value == 0;
+}
+
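+// Lower one of the image-dimension intrinsics (loads, stores, samples,
+// atomics) described by the given ImageDimIntrinsicInfo to the matching MIMG
+// machine node, gathering the vdata, address, dmask and control operands and
+// handling D16 variants.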
+SDValue SITargetLowering::lowerImage(SDValue Op,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+
+ SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+ bool IsD16 = false;
+ SDValue VData;
+ int NumVDataDwords;
+ unsigned AddrIdx; // Index of first address argument
+ unsigned DMask;
+
+ if (BaseOpcode->Atomic) {
+ VData = Op.getOperand(2);
+
+ bool Is64Bit = VData.getValueType() == MVT::i64;
+ if (BaseOpcode->AtomicX2) {
+ SDValue VData2 = Op.getOperand(3);
+ VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
+ {VData, VData2});
+ if (Is64Bit)
+ VData = DAG.getBitcast(MVT::v4i32, VData);
+
+ ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+ DMask = Is64Bit ? 0xf : 0x3;
+ NumVDataDwords = Is64Bit ? 4 : 2;
+ AddrIdx = 4;
+ } else {
+ DMask = Is64Bit ? 0x3 : 0x1;
+ NumVDataDwords = Is64Bit ? 2 : 1;
+ AddrIdx = 3;
+ }
+ } else {
+ unsigned DMaskIdx;
+
+ if (BaseOpcode->Store) {
+ VData = Op.getOperand(2);
+
+ MVT StoreVT = VData.getSimpleValueType();
+ if (StoreVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ VData = handleD16VData(VData, DAG);
+ }
+
+ NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
+ DMaskIdx = 3;
+ } else {
+ MVT LoadVT = Op.getSimpleValueType();
+ if (LoadVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
+ ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
+ }
+
+ NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
+ DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
+ }
+
+ auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ if (!DMaskConst)
+ return Op;
+
+ AddrIdx = DMaskIdx + 1;
+ DMask = DMaskConst->getZExtValue();
+ if (!DMask && !BaseOpcode->Store) {
+ // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
+ // store the channels' default values.
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ if (isa<MemSDNode>(Op))
+ return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+ return Undef;
+ }
+ }
+
+ unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ SmallVector<SDValue, 4> VAddrs;
+ for (unsigned i = 0; i < NumVAddrs; ++i)
+ VAddrs.push_back(Op.getOperand(AddrIdx + i));
+ SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
+
+ SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
+ SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
+ unsigned CtrlIdx; // Index of texfailctrl argument
+ SDValue Unorm;
+ if (!BaseOpcode->Sampler) {
+ Unorm = True;
+ CtrlIdx = AddrIdx + NumVAddrs + 1;
+ } else {
+ auto UnormConst =
+ dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
+ if (!UnormConst)
+ return Op;
+
+ Unorm = UnormConst->getZExtValue() ? True : False;
+ CtrlIdx = AddrIdx + NumVAddrs + 3;
+ }
+
+ SDValue TexFail = Op.getOperand(CtrlIdx);
+ auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
+ if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+ return Op;
+
+ SDValue GLC;
+ SDValue SLC;
+ if (BaseOpcode->Atomic) {
+ GLC = True; // TODO no-return optimization
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
+ return Op;
+ } else {
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
+ return Op;
+ }
+
+ SmallVector<SDValue, 14> Ops;
+ if (BaseOpcode->Store || BaseOpcode->Atomic)
+ Ops.push_back(VData); // vdata
+ Ops.push_back(VAddr);
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
+ if (BaseOpcode->Sampler)
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
+ Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
+ Ops.push_back(Unorm);
+ Ops.push_back(GLC);
+ Ops.push_back(SLC);
+ Ops.push_back(False); // r128
+ Ops.push_back(False); // tfe
+ Ops.push_back(False); // lwe
+ Ops.push_back(DimInfo->DA ? True : False);
+ if (BaseOpcode->HasD16)
+ Ops.push_back(IsD16 ? True : False);
+ if (isa<MemSDNode>(Op))
+ Ops.push_back(Op.getOperand(0)); // chain
+
+ int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
+ int Opcode = -1;
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ assert(Opcode != -1);
+
+ MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
+ if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
+ MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+ *MemRefs = MemOp->getMemOperand();
+ NewNode->setMemRefs(MemRefs, MemRefs + 1);
+ }
+
+ if (BaseOpcode->AtomicX2) {
+ SmallVector<SDValue, 1> Elt;
+ DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
+ return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
+ } else if (IsD16 && !BaseOpcode->Store) {
+ MVT LoadVT = Op.getSimpleValueType();
+ SDValue Adjusted = adjustLoadValueTypeImpl(
+ SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
+ return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+ }
+
+ return SDValue(NewNode, 0);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -4171,14 +4675,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- if (getSubtarget()->isAmdCodeObjectV2(MF))
+ if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
return emitNonHSAIntrinsicError(DAG, DL, VT);
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
- if (!Subtarget->isAmdCodeObjectV2(MF)) {
+ if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
@@ -4208,16 +4712,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rsq:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rcp_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Type *Type = VT.getTypeForEVT(*DAG.getContext());
@@ -4235,37 +4739,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, false);
+ SI::KernelInputOffsets::NGROUPS_X, 4, false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, false);
+ SI::KernelInputOffsets::NGROUPS_Y, 4, false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, false);
+ SI::KernelInputOffsets::NGROUPS_Z, 4, false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -4354,7 +4858,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_log_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
DiagnosticInfoUnsupported BadIntrin(
@@ -4439,6 +4943,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_fdot2:
+ return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -4484,17 +4991,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
0);
}
- case Intrinsic::amdgcn_image_getlod:
- case Intrinsic::amdgcn_image_getresinfo: {
- unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
-
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
- if (!DMask || DMask->isNullValue())
- return DAG.getUNDEF(Op.getValueType());
- return SDValue();
- }
+ case Intrinsic::amdgcn_fmad_ftz:
+ return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
+
return Op;
}
}
@@ -4506,10 +5010,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
- AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+ unsigned Opc;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ Opc = AMDGPUISD::ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_atomic_dec:
+ Opc = AMDGPUISD::ATOMIC_DEC;
+ break;
+ case Intrinsic::amdgcn_ds_fadd:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
+ break;
+ case Intrinsic::amdgcn_ds_fmin:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ break;
+ case Intrinsic::amdgcn_ds_fmax:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ break;
+ default:
+ llvm_unreachable("Unknown intrinsic!");
+ }
SDValue Ops[] = {
M->getOperand(0), // Chain
M->getOperand(2), // Ptr
@@ -4534,13 +5059,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
-
auto *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+ if (IsD16)
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
+
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand());
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+ if (IsD16) {
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
+ }
+
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -4554,10 +5089,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(10) // slc
};
- EVT VT = Op.getValueType();
-
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, VT, M->getMemOperand());
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
@@ -4638,65 +5172,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- // Basic sample.
- case Intrinsic::amdgcn_image_sample:
- case Intrinsic::amdgcn_image_sample_cl:
- case Intrinsic::amdgcn_image_sample_d:
- case Intrinsic::amdgcn_image_sample_d_cl:
- case Intrinsic::amdgcn_image_sample_l:
- case Intrinsic::amdgcn_image_sample_b:
- case Intrinsic::amdgcn_image_sample_b_cl:
- case Intrinsic::amdgcn_image_sample_lz:
- case Intrinsic::amdgcn_image_sample_cd:
- case Intrinsic::amdgcn_image_sample_cd_cl:
-
- // Sample with comparison.
- case Intrinsic::amdgcn_image_sample_c:
- case Intrinsic::amdgcn_image_sample_c_cl:
- case Intrinsic::amdgcn_image_sample_c_d:
- case Intrinsic::amdgcn_image_sample_c_d_cl:
- case Intrinsic::amdgcn_image_sample_c_l:
- case Intrinsic::amdgcn_image_sample_c_b:
- case Intrinsic::amdgcn_image_sample_c_b_cl:
- case Intrinsic::amdgcn_image_sample_c_lz:
- case Intrinsic::amdgcn_image_sample_c_cd:
- case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
- // Sample with offsets.
- case Intrinsic::amdgcn_image_sample_o:
- case Intrinsic::amdgcn_image_sample_cl_o:
- case Intrinsic::amdgcn_image_sample_d_o:
- case Intrinsic::amdgcn_image_sample_d_cl_o:
- case Intrinsic::amdgcn_image_sample_l_o:
- case Intrinsic::amdgcn_image_sample_b_o:
- case Intrinsic::amdgcn_image_sample_b_cl_o:
- case Intrinsic::amdgcn_image_sample_lz_o:
- case Intrinsic::amdgcn_image_sample_cd_o:
- case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
- // Sample with comparison and offsets.
- case Intrinsic::amdgcn_image_sample_c_o:
- case Intrinsic::amdgcn_image_sample_c_cl_o:
- case Intrinsic::amdgcn_image_sample_c_d_o:
- case Intrinsic::amdgcn_image_sample_c_d_cl_o:
- case Intrinsic::amdgcn_image_sample_c_l_o:
- case Intrinsic::amdgcn_image_sample_c_b_o:
- case Intrinsic::amdgcn_image_sample_c_b_cl_o:
- case Intrinsic::amdgcn_image_sample_c_lz_o:
- case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
- if (!DMask || DMask->isNullValue()) {
- SDValue Undef = DAG.getUNDEF(Op.getValueType());
- return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
- }
+ default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrID))
+ return lowerImage(Op, ImageDimIntr, DAG);
return SDValue();
}
- default:
- return SDValue();
+}
+
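+// Prepare packed f16 vector data for a D16 memory instruction. Subtargets with
+// unpacked D16 VMEM expect one component per dword, so the packed value is
+// zero-extended element-wise; otherwise the data is returned unchanged.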
+SDValue SITargetLowering::handleD16VData(SDValue VData,
+ SelectionDAG &DAG) const {
+ EVT StoreVT = VData.getValueType();
+
+ // No change for f16 and legal vector D16 types.
+ if (!StoreVT.isVector())
+ return VData;
+
+ SDLoc DL(VData);
+ assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+
+ if (Subtarget->hasUnpackedD16VMem()) {
+ // We need to unpack the packed data to store.
+ EVT IntStoreVT = StoreVT.changeTypeToInteger();
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ StoreVT.getVectorNumElements());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
+ return DAG.UnrollVectorOp(ZExt.getNode());
}
+
+ assert(isTypeLegal(StoreVT));
+ return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -4786,7 +5294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
@@ -4841,9 +5349,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // voffset
@@ -4854,42 +5366,133 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(10), // glc
Op.getOperand(11) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_store:
case Intrinsic::amdgcn_buffer_store_format: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // offset
Op.getOperand(6), // glc
Op.getOperand(7) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable,
- VT.getStoreSize(), 4);
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
+ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
- unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE :
- AMDGPUISD::BUFFER_STORE_FORMAT;
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ return Op;
}
+ }
+}
- default:
+static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
+ ISD::LoadExtType ExtType, SDValue Op,
+ const SDLoc &SL, EVT VT) {
+ if (VT.bitsLT(Op.getValueType()))
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
+
+ switch (ExtType) {
+ case ISD::SEXTLOAD:
+ return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
+ case ISD::ZEXTLOAD:
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
+ case ISD::EXTLOAD:
+ return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
+ case ISD::NON_EXTLOAD:
return Op;
}
+
+ llvm_unreachable("invalid ext type");
+}
+
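+// Widen sub-dword uniform loads from the constant address spaces (and
+// invariant global loads) to 32 bits so they can be selected to scalar memory
+// loads, then truncate or extend the result back to the original type.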
+SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ if (Ld->getAlignment() < 4 || Ld->isDivergent())
+ return SDValue();
+
+ // FIXME: Constant loads should all be marked invariant.
+ unsigned AS = Ld->getAddressSpace();
+ if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
+ AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+ (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
+ return SDValue();
+
+ // Don't do this early, since it may interfere with adjacent load merging for
+ // illegal types. We can avoid losing alignment information for exotic types
+ // pre-legalize.
+ EVT MemVT = Ld->getMemoryVT();
+ if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
+ MemVT.getSizeInBits() >= 32)
+ return SDValue();
+
+ SDLoc SL(Ld);
+
+ assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
+ "unexpected vector extload");
+
+ // TODO: Drop only high part of range.
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+ MVT::i32, SL, Ld->getChain(), Ptr,
+ Ld->getOffset(),
+ Ld->getPointerInfo(), MVT::i32,
+ Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags(),
+ Ld->getAAInfo(),
+ nullptr); // Drop ranges
+
+ EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ if (MemVT.isFloatingPoint()) {
+ assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+ "unexpected fp extload");
+ TruncVT = MemVT.changeTypeToInteger();
+ }
+
+ SDValue Cvt = NewLoad;
+ if (Ld->getExtensionType() == ISD::SEXTLOAD) {
+ Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+ DAG.getValueType(TruncVT));
+ } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+ } else {
+ assert(Ld->getExtensionType() == ISD::EXTLOAD);
+ }
+
+ EVT VT = Ld->getValueType(0);
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // We may need to handle exotic cases, such as i16->i64 extloads, so insert
+ // the appropriate extension from the 32-bit load.
+ Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // Handle conversion back to floating point if necessary.
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
+
+ return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -4928,9 +5531,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
+ unsigned Alignment = Load->getAlignment();
unsigned AS = Load->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- AS, Load->getAlignment())) {
+ AS, Alignment)) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
@@ -4945,24 +5549,32 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
- if (isMemOpUniform(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
+ if (!Op->isDivergent() && Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
- if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
- !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS) {
+ if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
+ !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+ Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
@@ -4989,21 +5601,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
- if (NumElements > 2)
- return SplitVectorLoad(Op, DAG);
-
- if (NumElements == 2)
+ // Use ds_read_b128 if possible.
+ if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
+ MemVT.getStoreSize() == 16)
return SDValue();
- // If properly aligned, if we split we might be able to use ds_read_b64.
- return SplitVectorLoad(Op, DAG);
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
}
return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() != MVT::i64)
- return SDValue();
+ EVT VT = Op.getValueType();
+ assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
@@ -5025,7 +5636,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
- return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
@@ -5037,8 +5648,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
- Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
+ bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
return SDValue();
@@ -5295,7 +5905,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -5393,14 +6003,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ // Use ds_write_b128 if possible.
+ if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
+ VT.getStoreSize() == 16)
+ return SDValue();
+
if (NumElements > 2)
return SplitVectorStore(Op, DAG);
-
- if (NumElements == 2)
- return Op;
-
- // If properly aligned, if we split we might be able to use ds_write_b64.
- return SplitVectorStore(Op, DAG);
+ return SDValue();
} else {
llvm_unreachable("unhandled address space");
}
@@ -5474,7 +6084,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
// easier if i8 vectors weren't promoted to i32 vectors, particularly after
// types are legalized. v4i8 -> v4f32 is probably the only case to worry
// about in practice.
- if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+ if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
DCI.AddToWorklist(Cvt.getNode());
@@ -5617,6 +6227,71 @@ static bool isBoolSGPR(SDValue V) {
return false;
}
+// If a constant has all zeroes or all ones within each byte return it.
+// Otherwise return 0.
+static uint32_t getConstantPermuteMask(uint32_t C) {
+ // 0xff for any zero byte in the mask
+ uint32_t ZeroByteMask = 0;
+ if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
+ if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
+ if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
+ if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
+ uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
+ if ((NonZeroByteMask & C) != NonZeroByteMask)
+ return 0; // Partial bytes selected.
+ return C;
+}
+
+// Check if a node selects whole bytes from its operand 0 starting at a byte
+// boundary while masking the rest. Returns the select mask as used by
+// v_perm_b32, or ~0 if the node does not match.
+// Note byte select encoding:
+// value 0-3 selects corresponding source byte;
+// value 0xc selects zero;
+// value 0xff selects 0xff.
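+// For example, (and x, 0x0000ffff) keeps bytes 0-1 and zeroes bytes 2-3 of x,
+// which corresponds to the select mask 0x0c0c0100.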
+static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+ assert(V.getValueSizeInBits() == 32);
+
+ if (V.getNumOperands() != 2)
+ return ~0;
+
+ ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!N1)
+ return ~0;
+
+ uint32_t C = N1->getZExtValue();
+
+ switch (V.getOpcode()) {
+ default:
+ break;
+ case ISD::AND:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
+ }
+ break;
+
+ case ISD::OR:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ~ConstMask) | ConstMask;
+ }
+ break;
+
+ case ISD::SHL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t((0x030201000c0c0c0cull << C) >> 32);
+
+ case ISD::SRL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t(0x0c0c0c0c03020100ull >> C);
+ }
+
+ return ~0;
+}
+
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@@ -5663,6 +6338,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
}
+
+ // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(Mask);
+ if (!Sel)
+ return SDValue();
+
+ // Select 0xc for all zero bytes
+ Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
}
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -5715,6 +6404,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
}
+ // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+      // Select 0xc for each byte that takes an actual lane from the source
+      // operand: zero bytes have 0xc in the mask, 0xff bytes have 0xff, and
+      // real lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select high and lower word keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+        // Each byte in each mask is either a lane selector 0-3, or has the
+        // higher bits set: 0xff for 0xff and 0x0c for zero. If either mask has
+        // 0x0c in a byte, the result byte must be 0x0c. Otherwise the mask
+        // that is not 0xff wins, so ANDing the two masks gives the correct
+        // result except that 0x0c bytes must be forced back to exactly 0x0c.
+ uint32_t Mask = LHSMask & RHSMask;
+ for (unsigned I = 0; I < 32; I += 8) {
+ uint32_t ByteSel = 0xff << I;
+ if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
+ Mask &= (0x0c << I) & 0xffffffff;
+ }
+
+ // Add 4 to each active LHS lane. It will not affect any existing 0xff
+ // or 0x0c.
+ uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
return SDValue();
}
@@ -5750,6 +6487,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return SDValue();
}
+ // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
+ LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
+ if (!Sel)
+ return SDValue();
+
+ Sel |= LHS.getConstantOperandVal(2);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
+
+ // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+      // Select 0xc for each byte that takes an actual lane from the source
+      // operand: zero bytes have 0xc in the mask, 0xff bytes have 0xff, and
+      // real lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select high and lower word keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+ // Kill zero bytes selected by other mask. Zero value is 0xc.
+ LHSMask &= ~RHSUsedLanes;
+ RHSMask &= ~LHSUsedLanes;
+ // Add 4 to each active LHS lane
+ LHSMask |= LHSUsedLanes & 0x04040404;
+ // Combine masks
+ uint32_t Sel = LHSMask | RHSMask;
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
if (VT != MVT::i64)
return SDValue();
@@ -5856,6 +6647,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) {
case AMDGPUISD::FMAD_FTZ:
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::LDEXP:
return true;
default:
@@ -5908,6 +6700,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+
+ if (N0.isUndef())
+ return N0;
+
+ if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
+ N0.getOpcode() == ISD::SINT_TO_FP)) {
+ return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
+ N->getFlags());
+ }
+
+ return AMDGPUTargetLowering::performRcpCombine(N, DCI);
+}
+
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
return true;
@@ -5916,7 +6725,7 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
}
static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
- const SISubtarget *ST, unsigned MaxDepth=5) {
+ const GCNSubtarget *ST, unsigned MaxDepth=5) {
// If source is a result of another standard FP operation it is already in
// canonical form.
@@ -6174,7 +6983,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- VT != MVT::f64 &&
+ !VT.isVector() && VT != MVT::f64 &&
((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
@@ -6294,15 +7103,87 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
-
SelectionDAG &DAG = DCI.DAG;
- if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+
+ if ((Vec.getOpcode() == ISD::FNEG ||
+ Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
SDLoc SL(N);
EVT EltVT = N->getValueType(0);
SDValue Idx = N->getOperand(1);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Vec.getOperand(0), Idx);
- return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
+ }
+
+ // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
+ // =>
+ // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
+ // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
+ // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
+ if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ unsigned Opc = Vec.getOpcode();
+
+ switch(Opc) {
+ default:
+ return SDValue();
+ // TODO: Support other binary operations.
+ case ISD::FADD:
+ case ISD::ADD:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ return DAG.getNode(Opc, SL, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(1), Idx));
+ }
+ }
+
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+  // Try to turn sub-dword extracts of vectors into 32-bit extracts of the
+  // underlying dwords. This exposes more load reduction opportunities by
+  // replacing multiple small extract_vector_elts with a single 32-bit extract.
+ auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (EltSize <= 16 &&
+ EltVT.isByteSized() &&
+ VecSize > 32 &&
+ VecSize % 32 == 0 &&
+ Idx) {
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
+
+ unsigned BitIndex = Idx->getZExtValue() * EltSize;
+ unsigned EltIdx = BitIndex / 32;
+ unsigned LeftoverBitIdx = BitIndex % 32;
+ SDLoc SL(N);
+
+ SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
+ DCI.AddToWorklist(Cast.getNode());
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Elt.getNode());
+ SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
+ DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Srl.getNode());
+
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
+ DCI.AddToWorklist(Trunc.getNode());
+ return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
}
return SDValue();
@@ -6363,8 +7244,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const TargetOptions &Options = DAG.getTarget().Options;
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
- (N0->getFlags().hasUnsafeAlgebra() &&
- N1->getFlags().hasUnsafeAlgebra())) &&
+ (N0->getFlags().hasAllowContract() &&
+ N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(VT)) {
return ISD::FMA;
}
@@ -6420,7 +7301,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
- if (VT != MVT::i32)
+ if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
// add x, zext (setcc) => addcarry x, 0, setcc
@@ -6596,6 +7477,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFMACombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+ return SDValue();
+
+  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
+  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
+ SDValue Op1 = N->getOperand(0);
+ SDValue Op2 = N->getOperand(1);
+ SDValue FMA = N->getOperand(2);
+
+ if (FMA.getOpcode() != ISD::FMA ||
+ Op1.getOpcode() != ISD::FP_EXTEND ||
+ Op2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+  // fdot2_f32_f16 always flushes fp32 denormal operands and outputs to zero,
+  // regardless of the denorm mode setting. Therefore, unsafe-fp-math or
+  // fp-contract is sufficient to allow generating fdot2.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N->getFlags().hasAllowContract() &&
+ FMA->getFlags().hasAllowContract())) {
+ Op1 = Op1.getOperand(0);
+ Op2 = Op2.getOperand(0);
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec1 = Op1.getOperand(0);
+ SDValue Idx1 = Op1.getOperand(1);
+ SDValue Vec2 = Op2.getOperand(0);
+
+ SDValue FMAOp1 = FMA.getOperand(0);
+ SDValue FMAOp2 = FMA.getOperand(1);
+ SDValue FMAAcc = FMA.getOperand(2);
+
+ if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
+ FMAOp2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ FMAOp1 = FMAOp1.getOperand(0);
+ FMAOp2 = FMAOp2.getOperand(0);
+ if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec3 = FMAOp1.getOperand(0);
+ SDValue Vec4 = FMAOp2.getOperand(0);
+ SDValue Idx2 = FMAOp1.getOperand(1);
+
+ if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
+ // Idx1 and Idx2 cannot be the same.
+ Idx1 == Idx2)
+ return SDValue();
+
+ if (Vec1 == Vec2 || Vec3 == Vec4)
+ return SDValue();
+
+ if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ if ((Vec1 == Vec3 && Vec2 == Vec4) ||
+ (Vec1 == Vec4 && Vec2 == Vec3))
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6615,23 +7569,49 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
- if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
- isBoolSGPR(LHS.getOperand(0))) {
- // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
- // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
- // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
- // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
- return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
- DAG.getConstant(-1, SL, MVT::i1));
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
- return LHS.getOperand(0);
+ if (CRHS) {
+ if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
+ isBoolSGPR(LHS.getOperand(0))) {
+ // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
+ // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
+ return LHS.getOperand(0);
+ }
+
+ uint64_t CRHSVal = CRHS->getZExtValue();
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ LHS.getOpcode() == ISD::SELECT &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isa<ConstantSDNode>(LHS.getOperand(2)) &&
+ LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
+ isBoolSGPR(LHS.getOperand(0))) {
+      // Given CT != CF:
+ // setcc (select cc, CT, CF), CF, eq => xor cc, -1
+ // setcc (select cc, CT, CF), CF, ne => cc
+ // setcc (select cc, CT, CF), CT, ne => xor cc, -1
+ // setcc (select cc, CT, CF), CT, eq => cc
+ uint64_t CT = LHS.getConstantOperandVal(1);
+ uint64_t CF = LHS.getConstantOperandVal(2);
+
+ if ((CF == CRHSVal && CC == ISD::SETEQ) ||
+ (CT == CRHSVal && CC == ISD::SETNE))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CF == CRHSVal && CC == ISD::SETNE) ||
+ (CT == CRHSVal && CC == ISD::SETEQ))
+ return LHS.getOperand(0);
+ }
}
if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
@@ -6700,6 +7680,29 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
return SDValue();
}
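+// Constant-fold the AMDGPU clamp node: negative inputs (and NaN when DX10
+// clamp is enabled) fold to 0.0, inputs greater than 1.0 fold to 1.0, and any
+// other constant is returned unchanged.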
+SDValue SITargetLowering::performClampCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CSrc)
+ return SDValue();
+
+ const APFloat &F = CSrc->getValueAPF();
+ APFloat Zero = APFloat::getZero(F.getSemantics());
+ APFloat::cmpResult Cmp0 = F.compare(Zero);
+ if (Cmp0 == APFloat::cmpLessThan ||
+ (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+ }
+
+ APFloat One(F.getSemantics(), "1.0");
+ APFloat::cmpResult Cmp1 = F.compare(One);
+ if (Cmp1 == APFloat::cmpGreaterThan)
+ return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+ return SDValue(CSrc, 0);
+}
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -6731,7 +7734,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
break;
}
- case ISD::LOAD:
+ case ISD::FMA:
+ return performFMACombine(N, DCI);
+ case ISD::LOAD: {
+    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
+      return Widened;
+ LLVM_FALLTHROUGH;
+ }
case ISD::STORE:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE:
@@ -6749,7 +7758,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_DEC:
+ case AMDGPUISD::ATOMIC_LOAD_FADD:
+ case AMDGPUISD::ATOMIC_LOAD_FMIN:
+ case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
@@ -6765,11 +7777,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
return performFCanonicalizeCombine(N, DCI);
- case AMDGPUISD::FRACT:
case AMDGPUISD::RCP:
+ return performRcpCombine(N, DCI);
+ case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::LDEXP: {
SDValue Src = N->getOperand(0);
@@ -6789,6 +7803,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMed3Combine(N, DCI);
case AMDGPUISD::CVT_PKRTZ_F16_F32:
return performCvtPkRTZCombine(N, DCI);
+ case AMDGPUISD::CLAMP:
+ return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -6815,7 +7831,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-/// \brief Helper function for adjustWritemask
+/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
default: return 0;
@@ -6826,12 +7842,19 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// \brief Adjust the writemask of MIMG instructions
+/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getMachineOpcode();
+
+ // Subtract 1 because the vdata output is not a MachineSDNode operand.
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
+ if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
+ return Node; // not implemented for D16
+
SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
- unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+ unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -6881,9 +7904,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned BitsSet = countPopulation(NewDmask);
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
- Node->getMachineOpcode(), BitsSet);
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
@@ -6948,7 +7969,7 @@ static bool isFrameIndexOp(SDValue Op) {
return isa<FrameIndexSDNode>(Op);
}
-/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs are to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
@@ -6995,7 +8016,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
return DAG.UpdateNodeOperands(Node, Ops);
}
-/// \brief Fold the instructions after selecting them.
+/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
@@ -7069,7 +8090,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
return Node;
}
-/// \brief Assign the register class depending on the number of
+/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
@@ -7156,7 +8177,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
-/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
@@ -7198,11 +8219,11 @@ std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- if (!isTypeLegal(VT))
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-
+ const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
switch (Constraint[0]) {
+ default:
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
case 's':
case 'r':
switch (VT.getSizeInBits()) {
@@ -7210,40 +8231,56 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
+ RC = &AMDGPU::SReg_32_XM0RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ RC = &AMDGPU::SGPR_64RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ RC = &AMDGPU::SReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ RC = &AMDGPU::SReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
+ RC = &AMDGPU::SReg_512RegClass;
+ break;
}
-
+ break;
case 'v':
switch (VT.getSizeInBits()) {
default:
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ RC = &AMDGPU::VGPR_32RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ RC = &AMDGPU::VReg_64RegClass;
+ break;
case 96:
- return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ RC = &AMDGPU::VReg_96RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ RC = &AMDGPU::VReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ RC = &AMDGPU::VReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ RC = &AMDGPU::VReg_512RegClass;
+ break;
}
+ break;
}
+ // We actually support i128, i16 and f16 as inline parameters
+ // even if they are not reported as legal
+ if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
+ VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
+ return std::make_pair(0U, RC);
}
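As a usage illustration (hypothetical, not part of the patch): the single-letter 'v' and 's' constraints handled above request VGPR and SGPR register classes from inline assembly, so a 32-bit value can be routed through a VGPR from C++ roughly as follows. The instruction and identifiers below are illustrative assumptions.

// Hypothetical helper: "=v"/"v" ask for VGPRs; "s" would ask for an SGPR.
unsigned copyThroughVGPR(unsigned In) {
  unsigned Out;
  __asm__ volatile("v_mov_b32 %0, %1" : "=v"(Out) : "v"(In));
  return Out;
}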
if (Constraint.size() > 1) {
- const TargetRegisterClass *RC = nullptr;
if (Constraint[1] == 'v') {
RC = &AMDGPU::VGPR_32RegClass;
} else if (Constraint[1] == 's') {
@@ -7280,8 +8317,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
@@ -7311,6 +8347,8 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
Info->getScratchWaveOffsetReg());
+ Info->limitOccupancy(MF);
+
TargetLoweringBase::finalizeLowering(MF);
}
@@ -7331,3 +8369,69 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
// calculation won't overflow, so assume the sign bit is never set.
Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
+
+bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
+ FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
+{
+ switch (N->getOpcode()) {
+ case ISD::Register:
+ case ISD::CopyFromReg:
+ {
+ const RegisterSDNode *R = nullptr;
+ if (N->getOpcode() == ISD::Register) {
+ R = dyn_cast<RegisterSDNode>(N);
+ }
+ else {
+ R = dyn_cast<RegisterSDNode>(N->getOperand(1));
+ }
+ if (R)
+ {
+ const MachineFunction * MF = FLI->MF;
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+ unsigned Reg = R->getReg();
+ if (TRI.isPhysicalRegister(Reg))
+ return TRI.isVGPR(MRI, Reg);
+
+ if (MRI.isLiveIn(Reg)) {
+ // workitem.id.x workitem.id.y workitem.id.z
+ // Any VGPR formal argument is also considered divergent
+ if (TRI.isVGPR(MRI, Reg))
+ return true;
+ // Formal arguments of non-entry functions
+ // are conservatively considered divergent
+ else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+ return true;
+ }
+ return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+ }
+ }
+ break;
+ case ISD::LOAD: {
+ const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace() ==
+ Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
+ return true;
+ } break;
+ case ISD::CALLSEQ_END:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ case ISD::INTRINSIC_W_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ // In some cases intrinsics that are a source of divergence have been
+ // lowered to AMDGPUISD nodes, so we need to check those as well.
+ case AMDGPUISD::INTERP_MOV:
+ case AMDGPUISD::INTERP_P1:
+ case AMDGPUISD::INTERP_P2:
+ return true;
+ }
+ return false;
+}
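The register-based part of this divergence test reduces to a small decision procedure. The sketch below restates it with plain booleans; IsPhysical, IsVGPRClass, IsLiveIn, IsEntryCC, HaveDA and DASaysDivergent are assumptions standing in for the TRI/MRI/DivergenceAnalysis queries above.

// Sketch: is a Register/CopyFromReg node a source of divergence?
// Mirrors the order of the checks in isSDNodeSourceOfDivergence.
bool regIsDivergent(bool IsPhysical, bool IsVGPRClass, bool IsLiveIn,
                    bool IsEntryCC, bool HaveDA, bool DASaysDivergent) {
  if (IsPhysical)
    return IsVGPRClass;            // physical VGPRs divergent, SGPRs uniform
  if (IsLiveIn) {
    if (IsVGPRClass)
      return true;                 // VGPR formal arguments (workitem ids, ...)
    if (!IsEntryCC)
      return true;                 // non-entry callees: args conservatively divergent
  }
  return !HaveDA || DASaysDivergent; // otherwise defer to DivergenceAnalysis
}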
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
index b48e67f7563a..ad049f2a71c3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI DAG Lowering interface definition
+/// SI DAG Lowering interface definition
//
//===----------------------------------------------------------------------===//
@@ -22,12 +22,15 @@
namespace llvm {
class SITargetLowering final : public AMDGPUTargetLowering {
+private:
+ const GCNSubtarget *Subtarget;
+
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, bool Signed,
+ uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg = nullptr) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
@@ -42,10 +45,14 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;
+ SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
@@ -60,7 +67,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Converts \p Op, which must be of floating point type, to the
+ SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
+ SelectionDAG &DAG,
+ bool IsIntrinsic = false) const;
+
+ SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
+
+ /// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
SDValue Op,
@@ -71,7 +84,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
bool Signed, const ISD::InputArg *Arg = nullptr) const;
- /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
+ /// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -80,7 +93,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
@@ -121,8 +136,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
@@ -145,9 +163,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool shouldEmitPCReloc(const GlobalValue *GV) const;
public:
- SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
+ SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
- const SISubtarget *getSubtarget() const;
+ const GCNSubtarget *getSubtarget() const;
+
+ bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;
bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
@@ -255,7 +275,10 @@ public:
EVT VT) const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
@@ -284,6 +307,9 @@ public:
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+
+ bool isSDNodeSourceOfDivergence(const SDNode *N,
+ FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index eb7277b7a5bb..61c8f359e168 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass inserts branches on the 0 exec mask over divergent branches
+/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will
/// be cheaper than having every workitem no-op through it.
//
@@ -18,6 +18,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -338,7 +339,7 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
}
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
SkipThreshold = SkipThresholdFlag;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6bbe5979316d..d456e3d9b94d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Insert wait instructions for memory reads and writes.
+/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
@@ -40,6 +40,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -50,9 +51,21 @@
#include <utility>
#include <vector>
+using namespace llvm;
+
#define DEBUG_TYPE "si-insert-waitcnts"
-using namespace llvm;
+DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
+ "Force emit s_waitcnt expcnt(0) instrs");
+DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
+ "Force emit s_waitcnt lgkmcnt(0) instrs");
+DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
+ "Force emit s_waitcnt vmcnt(0) instrs");
+
+static cl::opt<unsigned> ForceEmitZeroFlag(
+ "amdgpu-waitcnt-forcezero",
+ cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
+ cl::init(0), cl::Hidden);
namespace {
@@ -115,15 +128,15 @@ enum RegisterMapping {
(w) = (enum WaitEventType)((w) + 1))
// This is a per-basic-block object that maintains current score brackets
-// of each wait-counter, and a per-register scoreboard for each wait-couner.
+// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
-// wait-count may get decreased out of order, therefore we need to put in
+// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
- BlockWaitcntBrackets() {
+ BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
memset(VgprScores[T], 0, sizeof(VgprScores[T]));
@@ -301,6 +314,7 @@ public:
void dump() { print(dbgs()); }
private:
+ const GCNSubtarget *ST = nullptr;
bool WaitAtBeginning = false;
bool RevisitLoop = false;
bool MixedExpTypes = false;
@@ -332,14 +346,12 @@ public:
void incIterCnt() { IterCnt++; }
void resetIterCnt() { IterCnt = 0; }
- int32_t getIterCnt() { return IterCnt; }
+ unsigned getIterCnt() { return IterCnt; }
void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
MachineInstr *getWaitcnt() const { return LfWaitcnt; }
- void print() {
- DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
- }
+ void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }
private:
// s_waitcnt added at the end of the loop footer to stabilize wait scores
@@ -352,7 +364,7 @@ private:
class SIInsertWaitcnts : public MachineFunctionPass {
private:
- const SISubtarget *ST = nullptr;
+ const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -361,22 +373,31 @@ private:
AMDGPUAS AMDGPUASI;
DenseSet<MachineBasicBlock *> BlockVisitedSet;
- DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+ DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseSet<MachineInstr *> VCCZBugHandledSet;
DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
BlockWaitcntBracketsMap;
- DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+ std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+ // ForceEmitZeroWaitcnts: force all waitcnt instrs to be s_waitcnt 0
+ // because of amdgpu-waitcnt-forcezero flag
+ bool ForceEmitZeroWaitcnts;
+ bool ForceEmitWaitcnt[NUM_INST_CNTS];
+
public:
static char ID;
- SIInsertWaitcnts() : MachineFunctionPass(ID) {}
+ SIInsertWaitcnts() : MachineFunctionPass(ID) {
+ (void)ForceExpCounter;
+ (void)ForceLgkmCounter;
+ (void)ForceVMCounter;
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -397,15 +418,53 @@ public:
llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
}
+ bool isForceEmitWaitcnt() const {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1))
+ if (ForceEmitWaitcnt[T])
+ return true;
+ return false;
+ }
+
+ void setForceEmitWaitcnt() {
+// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
+// For debug builds, get the debug counter info and adjust if need be
+#ifndef NDEBUG
+ if (DebugCounter::isCounterSet(ForceExpCounter) &&
+ DebugCounter::shouldExecute(ForceExpCounter)) {
+ ForceEmitWaitcnt[EXP_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[EXP_CNT] = false;
+ }
+
+ if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
+ DebugCounter::shouldExecute(ForceLgkmCounter)) {
+ ForceEmitWaitcnt[LGKM_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[LGKM_CNT] = false;
+ }
+
+ if (DebugCounter::isCounterSet(ForceVMCounter) &&
+ DebugCounter::shouldExecute(ForceVMCounter)) {
+ ForceEmitWaitcnt[VM_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[VM_CNT] = false;
+ }
+#endif // NDEBUG
+ }
+
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
- BlockWaitcntBrackets *ScoreBrackets);
- void updateEventWaitCntAfter(MachineInstr &Inst,
+ void generateWaitcntInstBefore(MachineInstr &MI,
+ BlockWaitcntBrackets *ScoreBrackets);
+ void updateEventWaitcntAfter(MachineInstr &Inst,
BlockWaitcntBrackets *ScoreBrackets);
void mergeInputScoreBrackets(MachineBasicBlock &Block);
- MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+ bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
+ unsigned countNumBottomBlocks(const MachineLoop *Loop);
void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+ bool isWaitcntStronger(unsigned LHS, unsigned RHS);
+ unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};
} // end anonymous namespace
@@ -459,7 +518,7 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
unsigned OpNo, int32_t Val) {
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
- DEBUG({
+ LLVM_DEBUG({
const MachineOperand &Opnd = MI->getOperand(OpNo);
assert(TRI->isVGPR(*MRI, Opnd.getReg()));
});
@@ -681,14 +740,17 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
const int32_t LB = getScoreLB(T);
const int32_t UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
- if (T == VM_CNT && hasPendingFlat()) {
- // If there is a pending FLAT operation, and this is a VM waitcnt,
- // then we need to force a waitcnt 0 for VM.
+ if ((T == VM_CNT || T == LGKM_CNT) &&
+ hasPendingFlat() &&
+ !ST->hasFlatLgkmVMemCountInOrder()) {
+ // If there is a pending FLAT operation, and this is a VMem or LGKM
+ // waitcnt and the target can report early completion, then we need
+ // to force a waitcnt 0.
NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));
} else if (counterOutOfOrder(T)) {
// Counter can get decremented out-of-order when there
- // are multiple types event in the brack. Also emit an s_wait counter
+ // are multiple event types in the bracket. Also emit an s_wait counter
// with a conservative value of 0 for the counter.
NeedWait = CNT_MASK(T);
setScoreLB(T, getScoreUB(T));
@@ -789,7 +851,30 @@ static bool readsVCCZ(const MachineInstr &MI) {
!MI.getOperand(1).isUndef();
}
-/// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
+/// Given wait count encodings checks if LHS is stronger than RHS.
+bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
+ if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
+ return false;
+ if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
+ return false;
+ if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
+ return false;
+ return true;
+}
+
+/// Given wait count encodings create a new encoding which is stronger
+/// or equal to both.
+unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
+ unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
+ AMDGPU::decodeVmcnt(IV, RHS));
+ unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
+ AMDGPU::decodeLgkmcnt(IV, RHS));
+ unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
+ AMDGPU::decodeExpcnt(IV, RHS));
+ return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
+}
+
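Both helpers compare or merge the three fields of a waitcnt encoding component-wise; a smaller count is a stronger (more conservative) wait. The following is a self-contained sketch that uses a struct of plain counters in place of the packed AMDGPU::encodeWaitcnt/decode* form.

#include <algorithm>

// Simplified stand-in for the packed s_waitcnt immediate: three counters,
// where a *smaller* value makes the hardware wait for more completions.
struct Waitcnt { unsigned Vm, Exp, Lgkm; };

// LHS is "stronger" than RHS iff every LHS field waits at least as hard.
bool isStronger(const Waitcnt &LHS, const Waitcnt &RHS) {
  return LHS.Vm <= RHS.Vm && LHS.Exp <= RHS.Exp && LHS.Lgkm <= RHS.Lgkm;
}

// The combined wait is the field-wise minimum: it is stronger than (or equal
// to) both inputs, which is what combineWaitcnt computes on the encoded form.
Waitcnt combine(const Waitcnt &LHS, const Waitcnt &RHS) {
  return {std::min(LHS.Vm, RHS.Vm), std::min(LHS.Exp, RHS.Exp),
          std::min(LHS.Lgkm, RHS.Lgkm)};
}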
+/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
@@ -799,23 +884,29 @@ static bool readsVCCZ(const MachineInstr &MI) {
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+void SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
// To emit, or not to emit - that's the question!
// Start with an assumption that there is no need to emit.
- unsigned int EmitSwaitcnt = 0;
- // s_waitcnt instruction to return; default is NULL.
- MachineInstr *SWaitInst = nullptr;
+ unsigned int EmitWaitcnt = 0;
+
// No need to wait before phi. If a phi-move exists, then the wait should
// have been inserted before the move. If a phi-move does not exist, then the
// wait should be inserted before the real use. The same is true for
// sc-merge. It is not a coincidence that all these cases correspond to the
// instructions that are skipped in the assembling loop.
bool NeedLineMapping = false; // TODO: Check on this.
- if (MI.isDebugValue() &&
+
+ // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
+ bool ForceEmitZeroWaitcnt = false;
+
+ setForceEmitWaitcnt();
+ bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
+
+ if (MI.isDebugInstr() &&
// TODO: any other opcode?
!NeedLineMapping) {
- return SWaitInst;
+ return;
}
// See if an s_waitcnt is forced at block entry, or is needed at
@@ -826,7 +917,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->clearWaitAtBeginning();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
- EmitSwaitcnt |= CNT_MASK(T);
+ EmitWaitcnt |= CNT_MASK(T);
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
}
}
@@ -836,21 +927,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
- EmitSwaitcnt |=
+ EmitWaitcnt |=
ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
}
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
- if (MI.getOpcode() == AMDGPU::RETURN ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
- EmitSwaitcnt |= CNT_MASK(T);
+ EmitWaitcnt |= CNT_MASK(T);
}
}
}
@@ -861,7 +951,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
AMDGPU::SendMsg::ID_GS_DONE)) {
if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitSwaitcnt |= CNT_MASK(VM_CNT);
+ EmitWaitcnt |= CNT_MASK(VM_CNT);
}
}
#if 0 // TODO: the following blocks of logic when we have fence.
@@ -879,11 +969,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_LDS:
if (group_is_multi_wave ||
context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
// LDS may have to wait for VM_CNT after buffer load to LDS
if (target_info->HasBufferLoadToLDS()) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
ScoreBrackets->getScoreUB(VM_CNT));
}
}
@@ -891,9 +981,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_GDS:
if (group_is_multi_wave || fence_is_global) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
}
break;
@@ -903,9 +993,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_RING:
case SCMEM_SCATTER:
if (group_is_multi_wave || fence_is_global) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
ScoreBrackets->getScoreUB(VM_CNT));
}
break;
@@ -926,13 +1016,13 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
// Export and GDS are tracked individually, either may trigger a waitcnt
// for EXEC.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
}
@@ -947,7 +1037,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (ScoreBrackets->getScoreUB(EXP_CNT) >
ScoreBrackets->getScoreLB(EXP_CNT)) {
ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+ EmitWaitcnt |= CNT_MASK(EXP_CNT);
}
}
#endif
@@ -965,7 +1055,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
@@ -977,10 +1067,10 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Op.getReg())) {
// VM_CNT is only relevant to vgpr or LDS.
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
}
}
@@ -999,9 +1089,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (AS != AMDGPUASI.LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
}
}
@@ -1012,38 +1102,35 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Def.getReg())) {
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
}
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
}
} // End of for loop that looks at all dest operands.
}
- // TODO: Tie force zero to a compiler triage option.
- bool ForceZero = false;
-
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
// occurs before the instruction. Doing it here prevents any additional
// S_WAITCNTs from being emitted if the instruction was marked as
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- EmitSwaitcnt |=
+ EmitWaitcnt |=
ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EmitWaitcnt |= ScoreBrackets->updateByWait(
LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
@@ -1052,17 +1139,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// block, so if we only wait on LGKM here, we might end up with
// another s_waitcnt inserted right after this if there are non-LGKM
// instructions still outstanding.
- ForceZero = true;
- EmitSwaitcnt = true;
+ // FIXME: this is too conservative / the comment is wrong.
+ // We don't wait on everything at the end of the block and we combine
+ // waitcnts so we should never have back-to-back waitcnts.
+ ForceEmitZeroWaitcnt = true;
+ EmitWaitcnt = true;
}
}
// Does this operand processing indicate s_wait counter update?
- if (EmitSwaitcnt) {
+ if (EmitWaitcnt || IsForceEmitWaitcnt) {
int CntVal[NUM_INST_CNTS];
bool UseDefaultWaitcntStrategy = true;
- if (ForceZero) {
+ if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
// Force all waitcnts to 0.
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
@@ -1077,7 +1167,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
if (UseDefaultWaitcntStrategy) {
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
- if (EmitSwaitcnt & CNT_MASK(T)) {
+ if (EmitWaitcnt & CNT_MASK(T)) {
int Delta =
ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
int MaxDelta = ScoreBrackets->getWaitCountMax(T);
@@ -1087,7 +1177,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBrackets->setScoreLB(
T, ScoreBrackets->getScoreUB(T) - MaxDelta);
}
- EmitSwaitcnt &= ~CNT_MASK(T);
+ EmitWaitcnt &= ~CNT_MASK(T);
}
CntVal[T] = Delta;
} else {
@@ -1099,10 +1189,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
}
// If we are not waiting on any counter we can skip the wait altogether.
- if (EmitSwaitcnt != 0) {
+ if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
- if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+ if (!OldWaitcnt ||
+ (AMDGPU::decodeVmcnt(IV, Imm) !=
(CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
(AMDGPU::decodeExpcnt(IV, Imm) !=
(CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
@@ -1114,39 +1205,80 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
BlockWaitcntBrackets *ScoreBracket =
BlockWaitcntBracketsMap[TBB].get();
if (!ScoreBracket) {
- assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+ assert(!BlockVisitedSet.count(TBB));
BlockWaitcntBracketsMap[TBB] =
- llvm::make_unique<BlockWaitcntBrackets>();
+ llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}
ScoreBracket->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ LLVM_DEBUG(dbgs()
+ << "set-revisit2: Block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
}
}
// Update an existing waitcount, or make a new one.
- MachineFunction &MF = *MI.getParent()->getParent();
- if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
- SWaitInst = OldWaitcnt;
- } else {
- SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
- MI.getDebugLoc());
- CompilerGeneratedWaitcntSet.insert(SWaitInst);
- }
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+ ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
+ ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
+ ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
+ // We don't remove waitcnts that existed prior to the waitcnt
+ // pass. Check if the waitcnt to-be-inserted can be avoided
+ // or if the prev waitcnt can be updated.
+ bool insertSWaitInst = true;
+ for (MachineBasicBlock::iterator I = MI.getIterator(),
+ B = MI.getParent()->begin();
+ insertSWaitInst && I != B; --I) {
+ if (I == MI.getIterator())
+ continue;
- const MachineOperand &Op =
- MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
- IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
- SWaitInst->addOperand(MF, Op);
+ switch (I->getOpcode()) {
+ case AMDGPU::S_WAITCNT:
+ if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
+ insertSWaitInst = false;
+ else if (!OldWaitcnt) {
+ OldWaitcnt = &*I;
+ Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
+ }
+ break;
+ // TODO: skip over instructions which never require wait.
+ }
+ break;
+ }
+ if (insertSWaitInst) {
+ if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
+ if (ForceEmitZeroWaitcnts)
+ LLVM_DEBUG(
+ dbgs()
+ << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+ if (IsForceEmitWaitcnt)
+ LLVM_DEBUG(dbgs()
+ << "Force emit a s_waitcnt due to debug counter\n");
+
+ OldWaitcnt->getOperand(0).setImm(Enc);
+ if (!OldWaitcnt->getParent())
+ MI.getParent()->insert(MI, OldWaitcnt);
+
+ LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *OldWaitcnt << '\n');
+ } else {
+ auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+ MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(Enc);
+ TrackedWaitcntSet.insert(SWaitInst);
+
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *SWaitInst << '\n');
+ }
+ }
if (CntVal[EXP_CNT] == 0) {
ScoreBrackets->setMixedExpTypes(false);
}
}
}
-
- return SWaitInst;
}
void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
@@ -1180,7 +1312,7 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-void SIInsertWaitcnts::updateEventWaitCntAfter(
+void SIInsertWaitcnts::updateEventWaitcntAfter(
MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
@@ -1214,7 +1346,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+ if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
@@ -1247,27 +1379,37 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
}
}
+// Merge the score brackets of the Block's predecessors;
+// this merged score bracket is used when adding waitcnts to the Block
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
int32_t MaxPending[NUM_INST_CNTS] = {0};
int32_t MaxFlat[NUM_INST_CNTS] = {0};
bool MixedExpTypes = false;
- // Clear the score bracket state.
- ScoreBrackets->clear();
-
- // Compute the number of pending elements on block entry.
+ // For single basic block loops, we need to retain the Block's
+ // score bracket to have accurate Pred info. So, make a copy of Block's
+ // score bracket, clear() it (which retains several important bits of info),
+ // populate, and then replace en masse. For non-single basic block loops,
+ // just clear Block's current score bracket and repopulate in-place.
+ bool IsSelfPred;
+ std::unique_ptr<BlockWaitcntBrackets> S;
+
+ IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
+ != Block.pred_end();
+ if (IsSelfPred) {
+ S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+ ScoreBrackets = S.get();
+ }
- // IMPORTANT NOTE: If iterative handling of loops is added, the code will
- // need to handle single BBs with backedges to themselves. This means that
- // they will need to retain and not clear their initial state.
+ ScoreBrackets->clear();
// See if there are any uninitialized predecessors. If so, emit an
// s_waitcnt 0 at the beginning of the block.
- for (MachineBasicBlock *pred : Block.predecessors()) {
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
- BlockWaitcntBracketsMap[pred].get();
- bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+ BlockWaitcntBracketsMap[Pred].get();
+ bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
@@ -1306,7 +1448,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
BlockWaitcntBracketsMap[Pred].get();
- bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+ bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
@@ -1354,7 +1496,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// Set the register scoreboard.
for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ if (!BlockVisitedSet.count(Pred)) {
continue;
}
@@ -1468,7 +1610,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// sequencing predecessors, because changes to EXEC require waitcnts due to
// the delayed nature of these operations.
for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ if (!BlockVisitedSet.count(Pred)) {
continue;
}
@@ -1496,17 +1638,36 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
}
}
}
+
+ // If this is a single-block loop, update the score brackets. Not needed for
+ // other blocks, as we did this in-place.
+ if (IsSelfPred) {
+ BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+ }
}
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
+bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
+ const MachineBasicBlock *Block) {
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Count the number of "bottom" basic blocks of a loop.
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
+ unsigned Count = 0;
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB->isSuccessor(Loop->getHeader())) {
+ Count++;
+ }
+ }
+ return Count;
}
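A loop "bottom" block here is any block inside the loop with a back-edge to the header, and the number of such blocks bounds how many re-walks the iterative algorithm is allowed. A graph-level sketch, assuming a minimal Block type with a successor list and a loop given as a header plus a member set:

#include <set>
#include <vector>

struct Block { std::vector<const Block *> Succs; };

// A member block is a "bottom" of the loop when it branches back to Header.
bool isBottomBlock(const Block *Header, const std::set<const Block *> &Members,
                   const Block *B) {
  if (!Members.count(B))
    return false;
  for (const Block *S : B->Succs)
    if (S == Header)
      return true;
  return false;
}

// Count the bottom blocks; the pass tolerates up to (count + 1) iterations
// before forcing an s_waitcnt 0 at the loop footer.
unsigned countBottomBlocks(const Block *Header,
                           const std::set<const Block *> &Members) {
  unsigned N = 0;
  for (const Block *B : Members)
    if (isBottomBlock(Header, Members, B))
      ++N;
  return N;
}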
// Generate s_waitcnt instructions where needed.
@@ -1517,8 +1678,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
- DEBUG({
- dbgs() << "Block" << Block.getNumber();
+ LLVM_DEBUG({
+ dbgs() << "*** Block" << Block.getNumber() << " ***";
ScoreBrackets->dump();
});
@@ -1528,16 +1689,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineInstr &Inst = *Iter;
// Remove any previously existing waitcnts.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- // TODO: Register the old waitcnt and optimize the following waitcnts.
- // Leaving the previously existing waitcnts is conservatively correct.
- if (CompilerGeneratedWaitcntSet.find(&Inst) ==
- CompilerGeneratedWaitcntSet.end())
+ // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
+ // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
+ // as needed.
+ if (!TrackedWaitcntSet.count(&Inst))
++Iter;
else {
- ScoreBrackets->setWaitcnt(&Inst);
++Iter;
Inst.removeFromParent();
}
+ ScoreBrackets->setWaitcnt(&Inst);
continue;
}
@@ -1550,29 +1711,20 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
bool VCCZBugWorkAround = false;
if (readsVCCZ(Inst) &&
- (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+ (!VCCZBugHandledSet.count(&Inst))) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
- if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+ if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
VCCZBugWorkAround = true;
}
}
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
- MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
-
- if (SWaitInst) {
- Block.insert(Inst, SWaitInst);
- if (ScoreBrackets->getWaitcnt() != SWaitInst) {
- DEBUG(dbgs() << "insertWaitcntInBlock\n"
- << "Old Instr: " << Inst << '\n'
- << "New Instr: " << *SWaitInst << '\n';);
- }
- }
+ generateWaitcntInstBefore(Inst, ScoreBrackets);
- updateEventWaitCntAfter(Inst, ScoreBrackets);
+ updateEventWaitcntAfter(Inst, ScoreBrackets);
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
// If this instruction generates a S_SETVSKIP because it is an
@@ -1587,10 +1739,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets->clearWaitcnt();
- if (SWaitInst) {
- DEBUG({ SWaitInst->print(dbgs() << '\n'); });
- }
- DEBUG({
+ LLVM_DEBUG({
Inst.print(dbgs());
ScoreBrackets->dump();
});
@@ -1627,21 +1776,22 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Check if we need to force convergence at loop footer.
MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
- if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->print();
- DEBUG(dbgs() << '\n';);
+ LLVM_DEBUG(dbgs() << '\n';);
// The iterative waitcnt insertion algorithm aims for optimal waitcnt
- // placement and doesn't always guarantee convergence for a loop. Each
- // loop should take at most 2 iterations for it to converge naturally.
- // When this max is reached and result doesn't converge, we force
- // convergence by inserting a s_waitcnt at the end of loop footer.
- if (WaitcntData->getIterCnt() > 2) {
+ // placement, but doesn't guarantee convergence for a loop. Each
+ // loop should take at most (n+1) iterations for it to converge naturally,
+ // where n is the number of bottom blocks. If this threshold is reached and
+ // the result hasn't converged, then we force convergence by inserting
+ // a s_waitcnt at the end of loop footer.
+ if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
// To ensure convergence, need to make wait events at loop footer be no
// more than those from the previous iteration.
- // As a simplification, Instead of tracking individual scores and
- // generate the precise wait count, just wait on 0.
+ // As a simplification, instead of tracking individual scores and
+ // generating the precise wait count, just wait on 0.
bool HasPending = false;
MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1649,16 +1799,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
HasPending = true;
+ break;
}
}
if (HasPending) {
if (!SWaitInst) {
- SWaitInst = Block.getParent()->CreateMachineInstr(
- TII->get(AMDGPU::S_WAITCNT), DebugLoc());
- CompilerGeneratedWaitcntSet.insert(SWaitInst);
- const MachineOperand &Op = MachineOperand::CreateImm(0);
- SWaitInst->addOperand(MF, Op);
+ SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+ DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+ TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
OutputTransformAdd(SWaitInst, context);
@@ -1670,7 +1820,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
if (SWaitInst) {
- DEBUG({
+ LLVM_DEBUG({
SWaitInst->print(dbgs());
dbgs() << "\nAdjusted score board:";
ScoreBrackets->dump();
@@ -1678,7 +1828,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Add this waitcnt to the block. It is either newly created or
// created in previous iterations and added back since block traversal
- // always remove waitcnt.
+ // always removes waitcnts.
insertWaitcntBeforeCF(Block, SWaitInst);
WaitcntData->setWaitcnt(SWaitInst);
}
@@ -1687,7 +1837,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
- ST = &MF.getSubtarget<SISubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -1696,6 +1846,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
AMDGPUASI = ST->getAMDGPUAS();
+ ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1))
+ ForceEmitWaitcnt[T] = false;
+
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
@@ -1712,6 +1867,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
RegisterEncoding.SGPRL =
RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+ TrackedWaitcntSet.clear();
+ BlockVisitedSet.clear();
+ VCCZBugHandledSet.clear();
+ LoopWaitcntDataMap.clear();
+ BlockWaitcntProcessedSet.clear();
+
// Walk over the blocks in reverse post-dominator order, inserting
// s_waitcnt where needed.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
@@ -1726,7 +1887,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
if (!ScoreBrackets) {
- BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
+ BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
}
ScoreBrackets->setPostOrder(MBB.getNumber());
@@ -1737,22 +1898,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// If we are walking into the block from before the loop, then guarantee
// at least 1 re-walk over the loop to propagate the information, even if
// no S_WAITCNT instructions were generated.
- if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
- (BlockWaitcntProcessedSet.find(&MBB) ==
- BlockWaitcntProcessedSet.end())) {
- BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
+ unsigned Count = countNumBottomBlocks(ContainingLoop);
+
+ // If the loop has multiple back-edges, and so more than one "bottom"
+ // basic block, we have to guarantee a re-walk over every block.
+ if ((std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
+ BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+ LLVM_DEBUG(dbgs() << "set-revisit1: Block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
+ }
}
// Walk over the instructions.
insertWaitcntInBlock(MF, MBB);
- // Flag that waitcnts have been processed at least once.
- BlockWaitcntProcessedSet.insert(&MBB);
+ // Record that waitcnts have been processed at least once for this block.
+ BlockWaitcntProcessedSet.push_back(&MBB);
- // See if we want to revisit the loop.
- if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+ // See if we want to revisit the loop. If a loop has multiple back-edges,
+ // we shouldn't revisit the same "bottom" basic block.
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
+ std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) == 1) {
MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
if (EntrySB && EntrySB->getRevisitLoop()) {
@@ -1772,7 +1941,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->incIterCnt();
- DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+ LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
continue;
} else {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
@@ -1837,7 +2006,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!MFI->isEntryFunction()) {
// Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to to the wait after the
+ // depend on. We can't track them and it's better to do the wait after the
// costly call sequence.
// TODO: Could insert earlier and schedule more liberally with operations
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
deleted file mode 100644
index b074b95c2d3c..000000000000
--- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ /dev/null
@@ -1,703 +0,0 @@
-//===- SILowerControlFlow.cpp - Use predicates for control flow -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Insert wait instructions for memory reads and writes.
-///
-/// Memory reads and writes are issued asynchronously, so we need to insert
-/// S_WAITCNT instructions when we want to access any of their results or
-/// overwrite any register that's used asynchronously.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <utility>
-
-#define DEBUG_TYPE "si-insert-waits"
-
-using namespace llvm;
-
-namespace {
-
-/// \brief One variable for each of the hardware counters
-using Counters = union {
- struct {
- unsigned VM;
- unsigned EXP;
- unsigned LGKM;
- } Named;
- unsigned Array[3];
-};
-
-using InstType = enum {
- OTHER,
- SMEM,
- VMEM
-};
-
-using RegCounters = Counters[512];
-using RegInterval = std::pair<unsigned, unsigned>;
-
-class SIInsertWaits : public MachineFunctionPass {
-private:
- const SISubtarget *ST = nullptr;
- const SIInstrInfo *TII = nullptr;
- const SIRegisterInfo *TRI = nullptr;
- const MachineRegisterInfo *MRI;
- AMDGPU::IsaInfo::IsaVersion ISA;
-
- /// \brief Constant zero value
- static const Counters ZeroCounts;
-
- /// \brief Hardware limits
- Counters HardwareLimits;
-
- /// \brief Counter values we have already waited on.
- Counters WaitedOn;
-
- /// \brief Counter values that we must wait on before the next counter
- /// increase.
- Counters DelayedWaitOn;
-
- /// \brief Counter values for last instruction issued.
- Counters LastIssued;
-
- /// \brief Registers used by async instructions.
- RegCounters UsedRegs;
-
- /// \brief Registers defined by async instructions.
- RegCounters DefinedRegs;
-
- /// \brief Different export instruction types seen since last wait.
- unsigned ExpInstrTypesSeen = 0;
-
- /// \brief Type of the last opcode.
- InstType LastOpcodeType;
-
- bool LastInstWritesM0;
-
- /// Whether or not we have flat operations outstanding.
- bool IsFlatOutstanding;
-
- /// \brief Whether the machine function returns void
- bool ReturnsVoid;
-
- /// Whether the VCCZ bit is possibly corrupt
- bool VCCZCorrupt = false;
-
- /// \brief Get increment/decrement amount for this instruction.
- Counters getHwCounts(MachineInstr &MI);
-
- /// \brief Is operand relevant for async execution?
- bool isOpRelevant(MachineOperand &Op);
-
- /// \brief Get register interval an operand affects.
- RegInterval getRegInterval(const TargetRegisterClass *RC,
- const MachineOperand &Reg) const;
-
- /// \brief Handle an instruction's async components
- void pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters& Increment);
-
- /// \brief Insert the actual wait instruction
- bool insertWait(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Counts);
-
- /// \brief Handle existing wait instructions (from intrinsics)
- void handleExistingWait(MachineBasicBlock::iterator I);
-
- /// \brief Do we need def2def checks?
- bool unorderedDefines(MachineInstr &MI);
-
- /// \brief Resolve all operand dependencies to counter requirements
- Counters handleOperands(MachineInstr &MI);
-
- /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
- void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
-
- /// Return true if there are LGKM instructions that haven't been waited on
- /// yet.
- bool hasOutstandingLGKM() const;
-
-public:
- static char ID;
-
- SIInsertWaits() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert wait instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
- "SI Insert Waits", false, false)
-INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
- "SI Insert Waits", false, false)
-
-char SIInsertWaits::ID = 0;
-
-char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
-
-FunctionPass *llvm::createSIInsertWaitsPass() {
- return new SIInsertWaits();
-}
-
-const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-
-static bool readsVCCZ(const MachineInstr &MI) {
- unsigned Opc = MI.getOpcode();
- return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
- !MI.getOperand(1).isUndef();
-}
-
-bool SIInsertWaits::hasOutstandingLGKM() const {
- return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
-}
-
-Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
- uint64_t TSFlags = MI.getDesc().TSFlags;
- Counters Result = { { 0, 0, 0 } };
-
- Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
-
- // Only consider stores or EXP for EXP_CNT
- Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
-
- // LGKM may use larger values
- if (TSFlags & SIInstrFlags::LGKM_CNT) {
-
- if (TII->isSMRD(MI)) {
-
- if (MI.getNumOperands() != 0) {
- assert(MI.getOperand(0).isReg() &&
- "First LGKM operand must be a register!");
-
- // XXX - What if this is a write into a super register?
- const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
- unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.Named.LGKM = Size > 32 ? 2 : 1;
- } else {
- // s_dcache_inv etc. do not have a destination register. Assume we
- // want a wait on these.
- // XXX - What is the right value?
- Result.Named.LGKM = 1;
- }
- } else {
- // DS
- Result.Named.LGKM = 1;
- }
-
- } else {
- Result.Named.LGKM = 0;
- }
-
- return Result;
-}
-
-bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
- // Constants are always irrelevant
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- return false;
-
- // Defines are always relevant
- if (Op.isDef())
- return true;
-
- // For exports all registers are relevant.
- // TODO: Skip undef/disabled registers.
- MachineInstr &MI = *Op.getParent();
- if (TII->isEXP(MI))
- return true;
-
- // For stores the stored value is also relevant
- if (!MI.getDesc().mayStore())
- return false;
-
- // Check if this operand is the value being stored.
- // Special case for DS/FLAT instructions, since the address
- // operand comes before the value operand and it may have
- // multiple data operands.
-
- if (TII->isDS(MI)) {
- MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
- if (Data0 && Op.isIdenticalTo(*Data0))
- return true;
-
- MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
- return Data1 && Op.isIdenticalTo(*Data1);
- }
-
- if (TII->isFLAT(MI)) {
- MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
- if (Data && Op.isIdenticalTo(*Data))
- return true;
- }
-
- // NOTE: This assumes that the value operand is before the
- // address operand, and that there is only one value operand.
- for (MachineInstr::mop_iterator I = MI.operands_begin(),
- E = MI.operands_end(); I != E; ++I) {
-
- if (I->isReg() && I->isUse())
- return Op.isIdenticalTo(*I);
- }
-
- return false;
-}
-
-RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
- const MachineOperand &Reg) const {
- unsigned Size = TRI->getRegSizeInBits(*RC);
- assert(Size >= 32);
-
- RegInterval Result;
- Result.first = TRI->getEncodingValue(Reg.getReg());
- Result.second = Result.first + Size / 32;
-
- return Result;
-}
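A minimal standalone sketch (hypothetical encoding value, not taken from the patch) of the arithmetic getRegInterval performs above: a register occupying Size bits covers Size/32 consecutive counter slots starting at its hardware encoding value.

#include <cstdio>
#include <utility>

int main() {
  unsigned Encoding = 10;    // hypothetical encoding value of the register
  unsigned SizeInBits = 64;  // e.g. a 64-bit register pair
  std::pair<unsigned, unsigned> Interval(Encoding, Encoding + SizeInBits / 32);
  // A 64-bit register starting at encoding 10 covers slots [10, 12).
  std::printf("slots [%u, %u)\n", Interval.first, Interval.second);
  return 0;
}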
-
-void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Increment) {
- // Get the hardware counter increments and sum them up
- Counters Limit = ZeroCounts;
- unsigned Sum = 0;
-
- if (TII->mayAccessFlatAddressSpace(*I))
- IsFlatOutstanding = true;
-
- for (unsigned i = 0; i < 3; ++i) {
- LastIssued.Array[i] += Increment.Array[i];
- if (Increment.Array[i])
- Limit.Array[i] = LastIssued.Array[i];
- Sum += Increment.Array[i];
- }
-
- // If we don't increase anything then that's it
- if (Sum == 0) {
- LastOpcodeType = OTHER;
- return;
- }
-
- if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
- // or SMEM clause, respectively.
- //
- // The temporary workaround is to break the clauses with S_NOP.
- //
- // The proper solution would be to allocate registers such that all source
- // and destination registers don't overlap, e.g. this is illegal:
- // r0 = load r2
- // r2 = load r0
- if (LastOpcodeType == VMEM && Increment.Named.VM) {
- // Insert a NOP to break the clause.
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
- .addImm(0);
- LastInstWritesM0 = false;
- }
-
- if (TII->isSMRD(*I))
- LastOpcodeType = SMEM;
- else if (Increment.Named.VM)
- LastOpcodeType = VMEM;
- }
-
- // Remember which export instructions we have seen
- if (Increment.Named.EXP) {
- ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
- }
-
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = I->getOperand(i);
- if (!isOpRelevant(Op))
- continue;
-
- const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
- RegInterval Interval = getRegInterval(RC, Op);
- for (unsigned j = Interval.first; j < Interval.second; ++j) {
-
- // Remember which registers we define
- if (Op.isDef())
- DefinedRegs[j] = Limit;
-
- // and which one we are using
- if (Op.isUse())
- UsedRegs[j] = Limit;
- }
- }
-}
-
-bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const Counters &Required) {
- // End of program? No need to wait on anything
- // A function not returning void needs to wait, because other bytecode will
- // be appended after it and we don't know what it will be.
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
- return false;
-
- // Figure out if the async instructions execute in order
- bool Ordered[3];
-
- // VM_CNT is always ordered except when there are flat instructions, which
- // can return out of order.
- Ordered[0] = !IsFlatOutstanding;
-
- // EXP_CNT is unordered if we have both EXP & VM-writes
- Ordered[1] = ExpInstrTypesSeen == 3;
-
- // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
- Ordered[2] = false;
-
- // The values we are going to put into the S_WAITCNT instruction
- Counters Counts = HardwareLimits;
-
- // Do we really need to wait?
- bool NeedWait = false;
-
- for (unsigned i = 0; i < 3; ++i) {
- if (Required.Array[i] <= WaitedOn.Array[i])
- continue;
-
- NeedWait = true;
-
- if (Ordered[i]) {
- unsigned Value = LastIssued.Array[i] - Required.Array[i];
-
- // Adjust the value to the real hardware possibilities.
- Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
- } else
- Counts.Array[i] = 0;
-
- // Remember what we have waited on.
- WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
- }
-
- if (!NeedWait)
- return false;
-
- // Reset EXP_CNT instruction types
- if (Counts.Named.EXP == 0)
- ExpInstrTypesSeen = 0;
-
- // Build the wait instruction
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(ISA,
- Counts.Named.VM,
- Counts.Named.EXP,
- Counts.Named.LGKM));
-
- LastOpcodeType = OTHER;
- LastInstWritesM0 = false;
- IsFlatOutstanding = false;
- return true;
-}
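A worked example, with made-up counts, of how insertWait above handles an ordered counter: the immediate encodes how many operations may still be outstanding after the wait, clamped to the hardware maximum, and WaitedOn is then advanced accordingly.

#include <algorithm>
#include <cstdio>

int main() {
  unsigned LastIssued = 7;  // seven VMEM operations issued so far
  unsigned Required = 5;    // the result we need came from the fifth one
  unsigned HwLimit = 15;    // hypothetical maximum value of the vmcnt field

  // Encoded value: how many operations may still be in flight after the wait.
  unsigned Count = std::min(LastIssued - Required, HwLimit);  // 2
  unsigned WaitedOn = LastIssued - Count;                     // 5

  std::printf("s_waitcnt vmcnt(%u); WaitedOn = %u\n", Count, WaitedOn);
  return 0;
}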
-
-/// \brief helper function for handleOperands
-static void increaseCounters(Counters &Dst, const Counters &Src) {
- for (unsigned i = 0; i < 3; ++i)
- Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
-}
-
-/// \brief check whether any of the counters is non-zero
-static bool countersNonZero(const Counters &Counter) {
- for (unsigned i = 0; i < 3; ++i)
- if (Counter.Array[i])
- return true;
- return false;
-}
-
-void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
- assert(I->getOpcode() == AMDGPU::S_WAITCNT);
-
- unsigned Imm = I->getOperand(0).getImm();
- Counters Counts, WaitOn;
-
- Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
- Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
- Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
-
- for (unsigned i = 0; i < 3; ++i) {
- if (Counts.Array[i] <= LastIssued.Array[i])
- WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
- else
- WaitOn.Array[i] = 0;
- }
-
- increaseCounters(DelayedWaitOn, WaitOn);
-}
-
-Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
- Counters Result = ZeroCounts;
-
- // For each register affected by this instruction increase the result
- // sequence.
- //
- // TODO: We could probably just look at explicit operands if we removed VCC /
- // EXEC from SMRD dest reg classes.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- continue;
-
- const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
- RegInterval Interval = getRegInterval(RC, Op);
- for (unsigned j = Interval.first; j < Interval.second; ++j) {
- if (Op.isDef()) {
- increaseCounters(Result, UsedRegs[j]);
- increaseCounters(Result, DefinedRegs[j]);
- }
-
- if (Op.isUse())
- increaseCounters(Result, DefinedRegs[j]);
- }
- }
-
- return Result;
-}
-
-void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
- if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
- return;
-
- // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
- if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
- LastInstWritesM0 = false;
- return;
- }
-
- // Set whether this instruction sets M0
- LastInstWritesM0 = false;
-
- unsigned NumOperands = I->getNumOperands();
- for (unsigned i = 0; i < NumOperands; i++) {
- const MachineOperand &Op = I->getOperand(i);
-
- if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
- LastInstWritesM0 = true;
- }
-}
-
-/// Return true if \p MBB has one successor immediately following, and is its
-/// only predecessor
-static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
- if (MBB.succ_size() != 1)
- return false;
-
- const MachineBasicBlock *Succ = *MBB.succ_begin();
- return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
-}
-
-// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
-// around other non-memory instructions.
-bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
- bool Changes = false;
-
- ST = &MF.getSubtarget<SISubtarget>();
- TII = ST->getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
- ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
- HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
- HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
-
- WaitedOn = ZeroCounts;
- DelayedWaitOn = ZeroCounts;
- LastIssued = ZeroCounts;
- LastOpcodeType = OTHER;
- LastInstWritesM0 = false;
- IsFlatOutstanding = false;
- ReturnsVoid = MFI->returnsVoid();
-
- memset(&UsedRegs, 0, sizeof(UsedRegs));
- memset(&DefinedRegs, 0, sizeof(DefinedRegs));
-
- SmallVector<MachineInstr *, 4> RemoveMI;
- SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
-
- bool HaveScalarStores = false;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
- MachineBasicBlock &MBB = *BI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- if (!HaveScalarStores && TII->isScalarStore(*I))
- HaveScalarStores = true;
-
- if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
- // There is a hardware bug on CI/SI where SMRD instruction may corrupt
- // vccz bit, so when we detect that an instruction may read from a
- // corrupt vccz bit, we need to:
- // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
- // complete.
- // 2. Restore the correct value of vccz by writing the current value
- // of vcc back to vcc.
-
- if (TII->isSMRD(I->getOpcode())) {
- VCCZCorrupt = true;
- } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
- // FIXME: We only care about SMRD instructions here, not LDS or GDS.
- // Whenever we store a value in vcc, the correct value of vccz is
- // restored.
- VCCZCorrupt = false;
- }
-
- // Check if we need to apply the bug work-around
- if (VCCZCorrupt && readsVCCZ(*I)) {
- DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
-
- // Wait on everything, not just LGKM. vccz reads usually come from
- // terminators, and we always wait on everything at the end of the
- // block, so if we only wait on LGKM here, we might end up with
- // another s_waitcnt inserted right after this if there are non-LGKM
- // instructions still outstanding.
- insertWait(MBB, I, LastIssued);
-
- // Restore the vccz bit. Any time a value is written to vcc, the vcc
- // bit is updated, so we can restore the bit by reading the value of
- // vcc and then writing it back to the register.
- BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
- }
- }
-
- // Record pre-existing, explicitly requested waits
- if (I->getOpcode() == AMDGPU::S_WAITCNT) {
- handleExistingWait(*I);
- RemoveMI.push_back(&*I);
- continue;
- }
-
- Counters Required;
-
- // Wait for everything before a barrier.
- //
- // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
- // but we also want to wait for any other outstanding transfers before
- // signalling other hardware blocks
- if ((I->getOpcode() == AMDGPU::S_BARRIER &&
- !ST->hasAutoWaitcntBeforeBarrier()) ||
- I->getOpcode() == AMDGPU::S_SENDMSG ||
- I->getOpcode() == AMDGPU::S_SENDMSGHALT)
- Required = LastIssued;
- else
- Required = handleOperands(*I);
-
- Counters Increment = getHwCounts(*I);
-
- if (countersNonZero(Required) || countersNonZero(Increment))
- increaseCounters(Required, DelayedWaitOn);
-
- Changes |= insertWait(MBB, I, Required);
-
- pushInstruction(MBB, I, Increment);
- handleSendMsg(MBB, I);
-
- if (I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
- EndPgmBlocks.push_back(&MBB);
- }
-
- // Wait for everything at the end of the MBB. If there is only one
- // successor, we can defer this until the uses there.
- if (!hasTrivialSuccessor(MBB))
- Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
- }
-
- if (HaveScalarStores) {
- // If scalar writes are used, the cache must be flushed or else the next
- // wave to reuse the same scratch memory can be clobbered.
- //
- // Insert s_dcache_wb at wave termination points if there were any scalar
- // stores, and only if the cache hasn't already been flushed. This could be
- // improved by looking across blocks for flushes in postdominating blocks
- // from the stores but an explicitly requested flush is probably very rare.
- for (MachineBasicBlock *MBB : EndPgmBlocks) {
- bool SeenDCacheWB = false;
-
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I) {
- if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
- SeenDCacheWB = true;
- else if (TII->isScalarStore(*I))
- SeenDCacheWB = false;
-
- // FIXME: It would be better to insert this before a waitcnt if any.
- if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
- Changes = true;
- BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
- }
- }
- }
- }
-
- for (MachineInstr *I : RemoveMI)
- I->eraseFromParent();
-
- if (!MFI->isEntryFunction()) {
- // Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to do the wait after the
- // costly call sequence.
-
- // TODO: Could insert earlier and schedule more liberally with operations
- // that only use caller preserved registers.
- MachineBasicBlock &EntryBB = MF.front();
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
-
- Changes = true;
- }
-
- return Changes;
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 25917cc06e6a..b73d30940fc3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -12,16 +12,16 @@
//===----------------------------------------------------------------------===//
def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= SISubtarget::SOUTHERN_ISLANDS">,
+ ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
- "== SISubtarget::SOUTHERN_ISLANDS">,
+ "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
class InstSI <dag outs, dag ins, string asm = "",
list<dag> pattern = []> :
- AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+ AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl {
let SubtargetPredicate = isGCN;
// Low bits - basic encoding information.
@@ -118,6 +118,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a packed VOP3P instruction
field bit IsPacked = 0;
+ // This bit indicates that this is a D16 buffer instruction.
+ field bit D16Buf = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -173,6 +176,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{49} = IsPacked;
+ let TSFlags{50} = D16Buf;
+
let SchedRW = [Write32Bit];
field bits<1> DisableSIDecoder = 0;
@@ -181,6 +186,9 @@ class InstSI <dag outs, dag ins, string asm = "",
let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
let AsmVariantName = AMDGPUAsmVariants.Default;
+
+ // Avoid changing source registers in a way that violates constant bus read limitations.
+ let hasExtraSrcRegAllocReq = !if(VOP1,1,!if(VOP2,1,!if(VOP3,1,!if(VOPC,1,!if(SDWA,1, !if(VALU,1,0))))));
}
class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
@@ -247,6 +255,7 @@ class MIMGe <bits<7> op> : Enc64 {
bits<1> tfe;
bits<1> lwe;
bits<1> slc;
+ bit d16;
bits<8> vaddr;
bits<7> srsrc;
bits<7> ssamp;
@@ -265,6 +274,7 @@ class MIMGe <bits<7> op> : Enc64 {
let Inst{47-40} = vdata;
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
+ let Inst{63} = d16;
}
class EXPe : Enc64 {
@@ -309,6 +319,7 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let VALU = 1;
}
class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
@@ -323,15 +334,3 @@ class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
}
} // End Uses = [EXEC]
-
-class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
-
- let VM_CNT = 1;
- let EXP_CNT = 1;
- let MIMG = 1;
- let Uses = [EXEC];
-
- let UseNamedOperandTable = 1;
- let hasSideEffects = 0; // XXX ????
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 654b96f792b1..6c85c92454c3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8,17 +8,19 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Implementation of TargetInstrInfo.
+/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//
#include "SIInstrInfo.h"
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -37,7 +39,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -53,6 +54,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -62,6 +64,19 @@
using namespace llvm;
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+namespace AMDGPU {
+#define GET_D16ImageDimIntrinsics_IMPL
+#define GET_ImageDimIntrinsicTable_IMPL
+#define GET_RsrcIntrinsics_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+}
+}
+
+
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
@@ -69,8 +84,9 @@ static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
-SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
- : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
+SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ RI(ST), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -89,7 +105,7 @@ static SDValue findChainOperand(SDNode *Load) {
return LastOp;
}
-/// \brief Returns true if both nodes have the same value for the given
+/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
unsigned Opc0 = N0->getMachineOpcode();
@@ -437,6 +453,28 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really be split into two batches of 16 stores.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+ int64_t Offset0, int64_t Offset1,
+ unsigned NumLoads) const {
+ assert(Offset1 > Offset0 &&
+ "Second offset should be larger than first offset!");
+ // If we have less than 16 loads in a row, and the offsets are within 64
+ // bytes, then schedule together.
+
+ // A cacheline is 64 bytes (for global memory).
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
@@ -827,10 +865,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- assert(SrcReg != MFI->getStackPtrOffsetReg() &&
- SrcReg != MFI->getFrameOffsetReg() &&
- SrcReg != MFI->getScratchWaveOffsetReg());
-
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
@@ -864,7 +898,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// needing them, and need to ensure that the reserved registers are
// correctly handled.
- FrameInfo.setStackID(FrameIndex, 1);
+ FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -960,7 +994,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- FrameInfo.setStackID(FrameIndex, 1);
+ FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -1001,7 +1035,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned FrameOffset, unsigned Size) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -1137,7 +1171,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
- default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+ default: return TargetInstrInfo::expandPostRAPseudo(MI);
case AMDGPU::S_MOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -1269,6 +1303,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
}
+ case TargetOpcode::BUNDLE: {
+ if (!MI.mayLoad())
+ return false;
+
+ // If it is a load it must be a memory clause
+ for (MachineBasicBlock::instr_iterator I = MI.getIterator();
+ I->isBundledWithSucc(); ++I) {
+ I->unbundleFromSucc();
+ for (MachineOperand &MO : I->operands())
+ if (MO.isReg())
+ MO.setIsInternalRead(false);
+ }
+
+ MI.eraseFromParent();
+ break;
+ }
}
return true;
}
@@ -1887,16 +1937,16 @@ unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
switch(Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return AMDGPUASI.CONSTANT_ADDRESS;
+ return ST.getAMDGPUAS().CONSTANT_ADDRESS;
}
- return AMDGPUASI.FLAT_ADDRESS;
+ return ST.getAMDGPUAS().FLAT_ADDRESS;
}
static void removeModOperands(MachineInstr &MI) {
@@ -2165,20 +2215,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) {
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
MachineInstr &MI,
LiveVariables *LV) const {
+ unsigned Opc = MI.getOpcode();
bool IsF16 = false;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
- switch (MI.getOpcode()) {
+ switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
- case AMDGPU::V_MAC_F32_e32: {
+ case AMDGPU::V_MAC_F32_e32:
+ case AMDGPU::V_FMAC_F32_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -2203,7 +2257,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
(!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
@@ -2234,8 +2288,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
+ assert((!IsFMA || !IsF16) && "fmac only expected with f32");
+ unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
+ (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
.add(*Src0)
@@ -2339,6 +2395,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
}
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ if (isUInt<16>(Imm)) {
+ int16_t Trunc = static_cast<int16_t>(Imm);
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+ }
+ if (!(Imm & 0xffff)) {
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
+ }
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
}
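A standalone sketch of the dispatch added above for V2INT16/V2FP16 operands: a 32-bit literal is tested as a 16-bit inline constant when only its low half is populated, or when only its high half is populated, and otherwise falls through to the packed-literal check. The predicate below is a stand-in for the real isInlinableLiteral16, and the fallback is simplified; it is an illustration, not the library code.

#include <cstdint>
#include <cstdio>

// Stand-in for AMDGPU::isInlinableLiteral16 (assumption: small values only).
static bool inlinable16(int16_t V) { return V >= -16 && V <= 64; }

static bool isInlinableV216(int64_t Imm) {
  if (Imm >= 0 && Imm <= UINT16_MAX)   // only the low half is used
    return inlinable16(static_cast<int16_t>(Imm));
  if ((Imm & 0xffff) == 0)             // only the high half is used
    return inlinable16(static_cast<int16_t>(Imm >> 16));
  return false;                        // both halves populated: would need the
                                       // packed-literal check instead
}

int main() {
  std::printf("%d %d %d\n",
              isInlinableV216(1),           // low half = 1  -> inlinable
              isInlinableV216(0x00030000),  // high half = 3 -> inlinable
              isInlinableV216(0x00010002)); // both halves used -> not here
  return 0;
}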
@@ -2711,14 +2776,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- // Verify VOP*
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
+ // Verify VOP*. Ignore multiple sgpr operands on writelane.
+ if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
+ && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
+ unsigned LiteralCount = 0;
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
@@ -2738,6 +2805,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
SGPRUsed = MO.getReg();
} else {
++ConstantBusCount;
+ ++LiteralCount;
}
}
}
@@ -2745,6 +2813,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo = "VOP* instruction uses the constant bus more than once";
return false;
}
+
+ if (isVOP3(MI) && LiteralCount) {
+ ErrInfo = "VOP3 instruction uses literal";
+ return false;
+ }
}
// Verify misc. restrictions on specific instructions.
@@ -2842,7 +2915,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
+ if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
if (Offset->getImm() != 0) {
ErrInfo = "subtarget does not support offsets in flat instructions";
@@ -2850,6 +2923,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
+ if (DppCt) {
+ using namespace AMDGPU::DPP;
+
+ unsigned DC = DppCt->getImm();
+ if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
+ DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
+ (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
+ ErrInfo = "Invalid dpp_ctrl value";
+ return false;
+ }
+ }
+
return true;
}
@@ -3147,6 +3236,29 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
legalizeOpWithMove(MI, Src0Idx);
}
+ // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
+ // both the value to write (src0) and lane select (src1). Fix up non-SGPR
+ // src0/src1 with V_READFIRSTLANE.
+ if (Opc == AMDGPU::V_WRITELANE_B32) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src0);
+ Src0.ChangeToRegister(Reg, false);
+ }
+ if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ return;
+ }
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
@@ -3261,6 +3373,13 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
unsigned DstReg = MRI.createVirtualRegister(SRC);
unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
+ if (SubRegs == 1) {
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(SrcReg);
+ return DstReg;
+ }
+
SmallVector<unsigned, 8> SRegs;
for (unsigned i = 0; i < SubRegs; ++i) {
unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
@@ -3438,6 +3557,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
+ // Legalize SI_INIT_M0
+ if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
+ MachineOperand &Src = MI.getOperand(0);
+ if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return;
+ }
+
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
@@ -3539,8 +3666,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
} else {
// This instructions is the _OFFSET variant, so we need to convert it to
// ADDR64.
- assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
- < SISubtarget::VOLCANIC_ISLANDS &&
+ assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
+ < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
@@ -3676,37 +3803,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
case AMDGPU::S_LSHL_B32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B32:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
@@ -3899,6 +4026,13 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
MRI.clearKillFlags(Inst.getOperand(1).getReg());
Inst.getOperand(0).setReg(DstReg);
+
+ // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+ // these are deleted later, but at -O0 it would leave a suspicious
+ // looking illegal copy of an undef register.
+ for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+ Inst.RemoveOperand(I);
+ Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
continue;
}
@@ -3990,17 +4124,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
- unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
- .add(Src0)
- .add(Src1);
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (ST.hasDLInsts()) {
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
+ .add(Src0)
+ .add(Src1);
+ } else {
+ unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
+ .add(Src0)
+ .add(Src1);
- unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not)
- .addReg(Xor);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
+ .addReg(Xor);
+ }
- MRI.replaceRegWith(Dest.getReg(), Not);
- addUsersToMoveToVALUWorklist(Not, MRI, Worklist);
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
void SIInstrInfo::splitScalar64BitUnaryOp(
@@ -4493,12 +4633,12 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
// Set ATC = 1. GFX9 doesn't have this bit.
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (1ULL << 56);
// Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
// BTW, it disables TC L2 and therefore decreases performance.
- if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (2ULL << 59);
}
@@ -4511,7 +4651,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
0xffffffff; // Size;
// GFX9 doesn't have ELEMENT_SIZE.
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
@@ -4521,7 +4661,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
@@ -4546,7 +4686,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return AMDGPU::NoRegister;
assert(!MI.memoperands_empty() &&
- (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+ (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
FrameIndex = Addr->getIndex();
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4613,12 +4753,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (DescSize != 0 && DescSize != 4)
return DescSize;
+ if (isFixedSize(MI))
+ return DescSize;
+
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that could ever be literals.
if (isVALU(MI) || isSALU(MI)) {
- if (isFixedSize(MI))
- return DescSize;
-
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return 4; // No operands.
@@ -4665,7 +4805,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return true;
for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
+ if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
return true;
}
return false;
@@ -4832,3 +4972,70 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
}
}
+
+bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
+ if (!isSMRD(MI))
+ return false;
+
+ // Check that it is using a buffer resource.
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
+ if (Idx == -1) // e.g. s_memtime
+ return false;
+
+ const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
+ return RCID == AMDGPU::SReg_128RegClassID;
+}
+
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+ SI = 0,
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3,
+ GFX80 = 4,
+ GFX9 = 5
+};
+
+static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
+ switch (ST.getGeneration()) {
+ default:
+ break;
+ case AMDGPUSubtarget::SOUTHERN_ISLANDS:
+ case AMDGPUSubtarget::SEA_ISLANDS:
+ return SIEncodingFamily::SI;
+ case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ case AMDGPUSubtarget::GFX9:
+ return SIEncodingFamily::VI;
+ }
+ llvm_unreachable("Unknown subtarget generation!");
+}
+
+int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+
+ if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX9)
+ Gen = SIEncodingFamily::GFX9;
+
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+ Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
+ : SIEncodingFamily::SDWA;
+ // Adjust the encoding family to GFX80 for D16 buffer instructions when the
+ // subtarget has the UnpackedD16VMem feature.
+ // TODO: remove this when we discard GFX80 encoding.
+ if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
+ Gen = SIEncodingFamily::GFX80;
+
+ int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+
+ // -1 means that Opcode is already a native instruction.
+ if (MCOp == -1)
+ return Opcode;
+
+ // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // no encoding in the given subtarget generation.
+ if (MCOp == (uint16_t)-1)
+ return -1;
+
+ return MCOp;
+}
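A standalone model, with hypothetical flags, of the ordering pseudoToMCOpcode above uses to pick an encoding family before the MC opcode lookup: base family from the generation, then the GFX9 rename, then SDWA/SDWA9, then the GFX80 override for D16 buffer instructions on subtargets with unpacked D16 memory. The flag names are illustrative, not LLVM API.

#include <cstdio>

enum Family { FAM_SI, FAM_VI, FAM_SDWA, FAM_SDWA9, FAM_GFX80, FAM_GFX9 };

static Family pickFamily(bool IsVIOrLater, bool IsGFX9, bool RenamedInGFX9,
                         bool IsSDWA, bool HasUnpackedD16, bool IsD16Buf) {
  Family Gen = IsVIOrLater ? FAM_VI : FAM_SI;   // base family from generation
  if (RenamedInGFX9 && IsGFX9)
    Gen = FAM_GFX9;                             // opcodes renamed in GFX9
  if (IsSDWA)
    Gen = IsGFX9 ? FAM_SDWA9 : FAM_SDWA;        // SDWA encodings
  if (HasUnpackedD16 && IsD16Buf)
    Gen = FAM_GFX80;                            // legacy D16 buffer encoding
  return Gen;
}

int main() {
  // A D16 buffer instruction on an unpacked-D16 subtarget ends up in GFX80.
  std::printf("family = %d\n", pickFamily(true, true, false, false, true, true));
  return 0;
}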
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 24ee843e6ade..0a735257d34e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for SIInstrInfo.
+/// Interface definition for SIInstrInfo.
//
//===----------------------------------------------------------------------===//
@@ -31,20 +31,23 @@
#include <cassert>
#include <cstdint>
+#define GET_INSTRINFO_HEADER
+#include "AMDGPUGenInstrInfo.inc"
+
namespace llvm {
class APInt;
class MachineRegisterInfo;
class RegScavenger;
-class SISubtarget;
+class GCNSubtarget;
class TargetRegisterClass;
-class SIInstrInfo final : public AMDGPUInstrInfo {
+class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
const SIRegisterInfo RI;
- const SISubtarget &ST;
+ const GCNSubtarget &ST;
- // The the inverse predicate should have the negative value.
+ // The inverse predicate should have the negative value.
enum BranchPredicate {
INVALID_BR = 0,
SCC_TRUE = 1,
@@ -144,7 +147,7 @@ public:
MO_REL32_HI = 5
};
- explicit SIInstrInfo(const SISubtarget &ST);
+ explicit SIInstrInfo(const GCNSubtarget &ST);
const SIRegisterInfo &getRegisterInfo() const {
return RI;
@@ -163,7 +166,10 @@ public:
bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
MachineInstr &SecondLdSt, unsigned BaseReg2,
- unsigned NumLoads) const final;
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
+ int64_t Offset1, unsigned NumLoads) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
@@ -203,7 +209,7 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- // \brief Returns an opcode that can be used to move a value to a \p DstRC
+ // Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
@@ -419,18 +425,7 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SMRD;
}
- bool isBufferSMRD(const MachineInstr &MI) const {
- if (!isSMRD(MI))
- return false;
-
- // Check that it is using a buffer resource.
- int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
- if (Idx == -1) // e.g. s_memtime
- return false;
-
- const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
- return RCID == AMDGPU::SReg_128RegClassID;
- }
+ bool isBufferSMRD(const MachineInstr &MI) const;
static bool isDS(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::DS;
@@ -674,16 +669,16 @@ public:
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
- /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
+ /// Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
- /// \brief Returns true if this operand uses the constant bus.
+ /// Returns true if this operand uses the constant bus.
bool usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const;
- /// \brief Return true if this instruction has any modifiers.
+ /// Return true if this instruction has any modifiers.
/// e.g. src[012]_mod, omod, clamp.
bool hasModifiers(unsigned Opcode) const;
@@ -696,7 +691,7 @@ public:
unsigned getVALUOp(const MachineInstr &MI) const;
- /// \brief Return the correct register class for \p OpNo. For target-specific
+ /// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
/// in tablegen. For generic instructions, like REG_SEQUENCE it will return
/// the register class of its machine operand.
@@ -704,7 +699,7 @@ public:
const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const;
- /// \brief Return the size in bytes of the operand OpNo on the given
+ /// Return the size in bytes of the operand OpNo on the given
// instruction opcode.
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
@@ -718,7 +713,7 @@ public:
return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8;
}
- /// \brief This form should usually be preferred since it handles operands
+ /// This form should usually be preferred since it handles operands
/// with unknown register classes.
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
@@ -728,7 +723,7 @@ public:
/// to read a VGPR.
bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
- /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+ /// Legalize the \p OpIndex operand of this instruction by inserting
/// a MOV. For example:
/// ADD_I32_e32 VGPR0, 15
/// to
@@ -739,29 +734,29 @@ public:
/// instead of MOV.
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const;
- /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
+ /// Check if \p MO is a legal operand if it was the \p OpIdx Operand
/// for \p MI.
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO = nullptr) const;
- /// \brief Check if \p MO would be a valid operand for the given operand
+ /// Check if \p MO would be a valid operand for the given operand
/// definition \p OpInfo. Note this does not attempt to validate constant bus
/// restrictions (e.g. literal constant usage).
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;
- /// \brief Check if \p MO (a register operand) is a legal register for the
+ /// Check if \p MO (a register operand) is a legal register for the
/// given operand description.
bool isLegalRegOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;
- /// \brief Legalize operands in \p MI by either commuting it or inserting a
+ /// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
- /// \brief Fix operands in \p MI to satisfy constant bus requirements.
+ /// Fix operands in \p MI to satisfy constant bus requirements.
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
/// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only
@@ -779,11 +774,11 @@ public:
MachineOperand &Op, MachineRegisterInfo &MRI,
const DebugLoc &DL) const;
- /// \brief Legalize all operands in this instruction. This function may
+ /// Legalize all operands in this instruction. This function may
/// create new instruction and insert them before \p MI.
void legalizeOperands(MachineInstr &MI) const;
- /// \brief Replace this instruction's opcode with the equivalent VALU
+ /// Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary.
void moveToVALU(MachineInstr &MI) const;
@@ -795,11 +790,11 @@ public:
MachineBasicBlock::iterator MI) const override;
void insertReturn(MachineBasicBlock &MBB) const;
- /// \brief Return the number of wait states that result from executing this
+ /// Return the number of wait states that result from executing this
/// instruction.
unsigned getNumWaitStates(const MachineInstr &MI) const;
- /// \brief Returns the operand named \p Op. If \p MI does not have an
+ /// Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
LLVM_READONLY
MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
@@ -822,7 +817,7 @@ public:
bool isLowLatencyInstruction(const MachineInstr &MI) const;
bool isHighLatencyInstruction(const MachineInstr &MI) const;
- /// \brief Return the descriptor of the target-specific machine instruction
+ /// Return the descriptor of the target-specific machine instruction
/// that corresponds to the specified pseudo or native opcode.
const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
return get(pseudoToMCOpcode(Opcode));
@@ -867,7 +862,7 @@ public:
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
- /// \brief Return a partially built integer add instruction without carry.
+ /// Return a partially built integer add instruction without carry.
/// Caller must add source operands.
/// For pre-GFX9 it will generate unused carry destination operand.
/// TODO: After GFX9 it should return a no-carry operation.
@@ -882,6 +877,12 @@ public:
static bool isLegalMUBUFImmOffset(unsigned Imm) {
return isUInt<12>(Imm);
}
+
+ /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+ /// Return -1 if the target-specific opcode for the pseudo instruction does
+ /// not exist. If Opcode is not a pseudo instruction, this is identity.
+ int pseudoToMCOpcode(int Opcode) const;
+
};
namespace AMDGPU {
@@ -908,6 +909,9 @@ namespace AMDGPU {
int getAddr64Inst(uint16_t Opcode);
LLVM_READONLY
+ int getMUBUFNoLdsInst(uint16_t Opcode);
+
+ LLVM_READONLY
int getAtomicRetOp(uint16_t Opcode);
LLVM_READONLY
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index fc2d35d873aa..8fa37aa83dae 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,16 +7,21 @@
//
//===----------------------------------------------------------------------===//
def isCI : Predicate<"Subtarget->getGeneration() "
- ">= SISubtarget::SEA_ISLANDS">;
+ ">= AMDGPUSubtarget::SEA_ISLANDS">;
def isCIOnly : Predicate<"Subtarget->getGeneration() =="
- "SISubtarget::SEA_ISLANDS">,
+ "AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate <"FeatureSeaIslands">;
def isVIOnly : Predicate<"Subtarget->getGeneration() =="
- "SISubtarget::VOLCANIC_ISLANDS">,
+ "AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate <"FeatureVolcanicIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+class GCNPredicateControl : PredicateControl {
+ Predicate SIAssemblerPredicate = isSICI;
+ Predicate VIAssemblerPredicate = isVI;
+}
+
// Except for the NONE field, this must be kept in sync with the
// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
def SIEncodingFamily {
@@ -25,13 +30,16 @@ def SIEncodingFamily {
int VI = 1;
int SDWA = 2;
int SDWA9 = 3;
- int GFX9 = 4;
+ int GFX80 = 4;
+ int GFX9 = 5;
}
//===----------------------------------------------------------------------===//
// SI DAG Nodes
//===----------------------------------------------------------------------===//
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
+
def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
[SDNPMayLoad, SDNPMemOperand]
@@ -45,22 +53,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
- SDTypeProfile<1, 9,
- [ // vdata
- SDTCisVT<1, v4i32>, // rsrc
- SDTCisVT<2, i32>, // vindex(VGPR)
- SDTCisVT<3, i32>, // voffset(VGPR)
- SDTCisVT<4, i32>, // soffset(SGPR)
- SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // dfmt(imm)
- SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // glc(imm)
- SDTCisVT<9, i32> // slc(imm)
- ]>,
- [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
+ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
+]>;
+
+def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SDTbuffer_load : SDTypeProfile<1, 9,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
+ ]>;
+
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
+ SDTbuffer_load,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+
def SDTtbuffer_store : SDTypeProfile<0, 10,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -79,6 +106,9 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store
def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
+ SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SDTBufferLoad : SDTypeProfile<1, 5,
[ // vdata
@@ -92,6 +122,9 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
+ SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SDTBufferStore : SDTypeProfile<0, 6,
[ // vdata
@@ -102,9 +135,13 @@ def SDTBufferStore : SDTypeProfile<0, 6,
SDTCisVT<5, i1>]>; // slc
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
- [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
-def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore,
- [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
class SDBufferAtomic<string opcode> : SDNode <opcode,
SDTypeProfile<1, 5,
@@ -140,21 +177,41 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
-class SDSample<string opcode> : SDNode <opcode,
- SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
- SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
->;
-
-def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
-def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
-def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
-def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
-
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
//===----------------------------------------------------------------------===//
+// ValueType helpers
+//===----------------------------------------------------------------------===//
+
+// Returns 1 if the source arguments have modifiers, 0 if they do not.
+// XXX - do f16 instructions?
+class isFloatType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, f16.Value), 1,
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ 0))));
+}
+
+class isIntType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, i16.Value), 1,
+ !if(!eq(SrcVT.Value, i32.Value), 1,
+ !if(!eq(SrcVT.Value, i64.Value), 1,
+ 0)));
+}
+
+class isPackedType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, v2i16.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
+ );
+}
+
+//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
@@ -163,6 +220,9 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
+def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
+def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
+def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
@@ -178,6 +238,10 @@ def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
+def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
}]>;
@@ -186,6 +250,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
}]>;
+def atomic_load_32_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i32;
+}
+
+def atomic_load_64_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i64;
+}
+
def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
}]>;
@@ -219,6 +295,9 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
def load_glue_align8 : Aligned8Bytes <
(ops node:$ptr), (load_glue node:$ptr)
>;
+def load_glue_align16 : Aligned16Bytes <
+ (ops node:$ptr), (load_glue node:$ptr)
+>;
def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
@@ -227,12 +306,23 @@ def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
+def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
+def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
+def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress;
def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
>;
+def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
+>;
+
+def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val),
+ (AMDGPUatomic_st_glue node:$ptr, node:$val)> {
+}
+
def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUst_glue node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
@@ -262,11 +352,17 @@ def store_glue_align8 : Aligned8Bytes <
(ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
>;
+def store_glue_align16 : Aligned16Bytes <
+ (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
+>;
+
def store_local_m0 : StoreFrag<store_glue>, LocalAddress;
def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress;
def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress;
+def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress;
def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress;
+def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress;
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
@@ -297,10 +393,11 @@ def lshl_rev : PatFrag <
(shl $src0, $src1)
>;
-multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
+ SDTypeProfile tc = SDTAtomic2> {
def _glue : SDNode <
- !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
+ !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
@@ -319,6 +416,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
@@ -368,6 +468,12 @@ return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
}]>;
+class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
+ uint64_t Imm = N->getZExtValue();
+ unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
+ return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1);
+}]>;
+
def SIMM16bit : PatLeaf <(imm),
[{return isInt<16>(N->getSExtValue());}]
>;
@@ -381,7 +487,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
}]>;
class VGPRImm <dag frag> : PatLeaf<frag, [{
- if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
@@ -552,19 +658,18 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
-class SDWASrc : RegisterOperand<VS_32> {
+class SDWASrc<ValueType vt> : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_SDWA_SRC";
+ string Type = !if(isFloatType<vt>.ret, "FP", "INT");
+ let OperandType = "OPERAND_REG_INLINE_C_"#Type#vt.Size;
+ let DecoderMethod = "decodeSDWASrc"#vt.Size;
let EncoderMethod = "getSDWASrcEncoding";
}
-def SDWASrc32 : SDWASrc {
- let DecoderMethod = "decodeSDWASrc32";
-}
-
-def SDWASrc16 : SDWASrc {
- let DecoderMethod = "decodeSDWASrc16";
-}
+def SDWASrc_i32 : SDWASrc<i32>;
+def SDWASrc_i16 : SDWASrc<i16>;
+def SDWASrc_f32 : SDWASrc<f32>;
+def SDWASrc_f16 : SDWASrc<f16>;
def SDWAVopcDst : VOPDstOperand<SReg_64> {
let OperandNamespace = "AMDGPU";
@@ -637,19 +742,20 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
-def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
-def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
-def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
-def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
-def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
-def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
+def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
+def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
+def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>;
+def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
-def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
+def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
@@ -747,16 +853,23 @@ class OpSelModsMatchClass : AsmOperandClass {
def IntOpSelModsMatchClass : OpSelModsMatchClass;
def IntOpSelMods : InputMods<IntOpSelModsMatchClass>;
-def FPRegSDWAInputModsMatchClass : AsmOperandClass {
- let Name = "SDWARegWithFPInputMods";
- let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isSDWARegKind";
+class FPSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "SDWAWithFP"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithFPInputMods";
+ let PredicateMethod = "isSDWAFP"#opSize#"Operand";
}
-def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> {
+def FP16SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<16>;
+def FP32SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<32>;
+
+class FPSDWAInputMods <FPSDWAInputModsMatchClass matchClass> :
+ InputMods <matchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
+def FP16SDWAInputMods : FPSDWAInputMods<FP16SDWAInputModsMatchClass>;
+def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
+
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
@@ -767,17 +880,23 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
-
-def IntRegSDWAInputModsMatchClass : AsmOperandClass {
- let Name = "SDWARegWithIntInputMods";
- let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isSDWARegKind";
+class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "SDWAWithInt"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImmWithIntInputMods";
+ let PredicateMethod = "isSDWAInt"#opSize#"Operand";
}
-def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> {
+def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>;
+def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>;
+
+class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
+ InputMods <matchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
+def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>;
+def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
+
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
@@ -1023,7 +1142,12 @@ class getVregSrcForVT<ValueType VT> {
}
class getSDWASrcForVT <ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32);
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ 0));
+ RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
+ RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
+ RegisterOperand ret = !if(isFP, retFlt, retInt);
}
// Returns the register class to use for sources of VOP3 instructions for the
@@ -1064,32 +1188,6 @@ class getVOP3SrcForVT<ValueType VT> {
);
}
-// Returns 1 if the source arguments have modifiers, 0 if they do not.
-// XXX - do f16 instructions?
-class isFloatType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, f16.Value), 1,
- !if(!eq(SrcVT.Value, f32.Value), 1,
- !if(!eq(SrcVT.Value, f64.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1,
- 0))));
-}
-
-class isIntType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, i16.Value), 1,
- !if(!eq(SrcVT.Value, i32.Value), 1,
- !if(!eq(SrcVT.Value, i64.Value), 1,
- 0)));
-}
-
-class isPackedType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, v2i16.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
- );
-}
-
// Float or packed int
class isModifierType<ValueType SrcVT> {
bit ret =
@@ -1134,11 +1232,10 @@ class getSrcModExt <ValueType VT> {
// Return type of input modifiers operand specified input operand for SDWA
class getSrcModSDWA <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0)));
- Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods);
+ Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods,
+ !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods,
+ !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods,
+ Int32SDWAInputMods)));
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1733,6 +1830,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
+def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
+def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -1747,6 +1847,8 @@ class AtomicNoRet <string noRetOp, bit isRet> {
// Interpolation opcodes
//===----------------------------------------------------------------------===//
+class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">;
+
class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
VINTRPCommon <outs, ins, "", pattern>,
SIMCInstr<opName, SIEncodingFamily.NONE> {
@@ -1823,38 +1925,6 @@ def getBasicFromSDWAOp : InstrMapping {
let ValueCols = [["Default"]];
}
-def getMaskedMIMGOp1 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["1"];
- let ValueCols = [["2"], ["3"], ["4"] ];
-}
-
-def getMaskedMIMGOp2 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["2"];
- let ValueCols = [["1"], ["3"], ["4"] ];
-}
-
-def getMaskedMIMGOp3 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["3"];
- let ValueCols = [["1"], ["2"], ["4"] ];
-}
-
-def getMaskedMIMGOp4 : InstrMapping {
- let FilterClass = "MIMG_Mask";
- let RowFields = ["Op"];
- let ColFields = ["Channels"];
- let KeyCol = ["4"];
- let ValueCols = [["1"], ["2"], ["3"] ];
-}
-
// Maps a commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";
@@ -1882,6 +1952,11 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.VI)],
[!cast<string>(SIEncodingFamily.SDWA)],
[!cast<string>(SIEncodingFamily.SDWA9)],
+ // GFX80 encoding is added to work around a multiple matching
+ // issue for buffer instructions with unpacked d16 data. This
+ // does not actually change the encoding, and thus may be
+ // removed later.
+ [!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)]];
}
@@ -1902,6 +1977,14 @@ def getAddr64Inst : InstrMapping {
let ValueCols = [["1"]];
}
+def getMUBUFNoLdsInst : InstrMapping {
+ let FilterClass = "MUBUFLdsTable";
+ let RowFields = ["OpName"];
+ let ColFields = ["IsLds"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
// Maps an atomic opcode to its version with a return value.
def getAtomicRetOp : InstrMapping {
let FilterClass = "AtomicNoRet";
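The new getMUBUFNoLdsInst mapping above follows the convention of the other TableGen-generated InstrMapping accessors: the emitted lookup function returns the mapped opcode, or -1 when the table has no entry for the queried row. A minimal consumer sketch (illustrative only; MI and TII stand for the usual MachineInstr and SIInstrInfo objects and are not part of this patch):

  // Rewrite an LDS-enabled MUBUF opcode to its plain (non-LDS) twin, if the
  // generated table knows one.
  int NoLdsOpc = AMDGPU::getMUBUFNoLdsInst(MI.getOpcode());
  if (NoLdsOpc != -1)
    MI.setDesc(TII->get(NoLdsOpc));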
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8c02e8da8d79..c3f8bfb53ef4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -11,18 +11,10 @@
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//
-def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
-def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
-def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
- AssemblerPredicate<"FeatureVGPRIndexMode">;
-def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
- AssemblerPredicate<"FeatureMovrel">;
-
-class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
let SubtargetPredicate = isGCN;
}
-
include "VOPInstructions.td"
include "SOPInstructions.td"
include "SMInstructions.td"
@@ -40,15 +32,18 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
// VINTRP Instructions
//===----------------------------------------------------------------------===//
+// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
+def VINTRPDst : VINTRPDstOperand <VGPR_32>;
+
let Uses = [M0, EXEC] in {
// FIXME: Specify SchedRW for VINTRP instructions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]
>;
@@ -69,9 +64,9 @@ let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
@@ -79,9 +74,9 @@ defm V_INTERP_P2_F32 : VINTRP_m <
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
- (outs VGPR_32:$vdst),
+ (outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
- "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
+ "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
@@ -186,6 +181,7 @@ def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
+ let Defs = [SCC];
}
def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
@@ -246,7 +242,6 @@ def SI_IF: CFPseudoInstSI <
def SI_ELSE : CFPseudoInstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
- let Constraints = "$src = $dst";
let Size = 12;
let hasSideEffects = 1;
}
@@ -296,14 +291,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI <
let isReMaterializable = 1;
}
-let Uses = [EXEC], Defs = [EXEC,VCC] in {
+let Uses = [EXEC] in {
multiclass PseudoInstKill <dag ins> {
+ // Even though this pseudo can usually be expanded without an SCC def, we
+ // conservatively assume that it has an SCC def, both because it is sometimes
+ // required in degenerate cases (when V_CMPX cannot be used due to constant
+ // bus limitations) and because it allows us to avoid having to track SCC
+ // liveness across basic blocks.
+ let Defs = [EXEC,VCC,SCC] in
def _PSEUDO : PseudoInstSI <(outs), ins> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
+ let Defs = [EXEC,VCC,SCC] in
def _TERMINATOR : SPseudoInstSI <(outs), ins> {
let isTerminator = 1;
}
@@ -312,6 +314,7 @@ multiclass PseudoInstKill <dag ins> {
defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
[], " ; illegal copy $src to $dst">;
@@ -371,6 +374,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
let isReturn = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
+ let FixedSize = 1;
}
// Return for returning function calls.
@@ -449,7 +453,7 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let usesCustomInserter = 1;
}
-let Defs = [M0, EXEC],
+let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
@@ -569,11 +573,6 @@ def : GCNPat<
(SI_ELSE $src, $target, 0)
>;
-def : GCNPat <
- (int_AMDGPU_kilp),
- (SI_KILL_I1_PSEUDO (i1 0), 0)
->;
-
def : Pat <
// -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
(AMDGPUkill (i32 -1082130432)),
@@ -643,6 +642,11 @@ def : GCNPat <
>;
def : GCNPat <
+ (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
(V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
@@ -700,15 +704,19 @@ multiclass FMADPat <ValueType vt, Instruction inst> {
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : GCNPat<
- (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
- (VOP3Mods f32:$src1, i32:$src1_mod),
- (VOP3Mods f32:$src2, i32:$src2_mod))),
+class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+ : GCNPat<
+ (Ty (mad_opr (VOP3Mods Ty:$src0, i32:$src0_mod),
+ (VOP3Mods Ty:$src1, i32:$src1_mod),
+ (VOP3Mods Ty:$src2, i32:$src2_mod))),
(inst $src0_mod, $src0, $src1_mod, $src1,
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
+def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+ let SubtargetPredicate = Has16BitInsts;
+}
multiclass SelectPat <ValueType vt, Instruction inst> {
def : GCNPat <
@@ -799,6 +807,27 @@ foreach Index = 0-15 in {
>;
}
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 0)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 2)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 0)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 2)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
+>;
+
let SubtargetPredicate = isGCN in {
// FIXME: Why do only some of these type combinations for SReg and
@@ -838,6 +867,26 @@ def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
+
+// FIXME: Make SGPR
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2i32, VReg_64>;
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v2i32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2i32, VReg_64>;
+def : BitConvert <v2f32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2f32, VReg_64>;
+def : BitConvert <v2f32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2f32, VReg_64>;
+def : BitConvert <v4i16, f64, VReg_64>;
+def : BitConvert <v4f16, f64, VReg_64>;
+def : BitConvert <f64, v4i16, VReg_64>;
+def : BitConvert <f64, v4f16, VReg_64>;
+def : BitConvert <v4i16, i64, VReg_64>;
+def : BitConvert <v4f16, i64, VReg_64>;
+def : BitConvert <i64, v4i16, VReg_64>;
+def : BitConvert <i64, v4f16, VReg_64>;
+
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
@@ -880,11 +929,13 @@ def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;
+let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
(v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
(V_PK_MAX_F16 $src0_modifiers, $src0,
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
>;
+}
/********** ================================ **********/
/********** Floating point absolute/negative **********/
@@ -910,7 +961,7 @@ def : GCNPat <
def : GCNPat <
(fabs f32:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff)))
>;
def : GCNPat <
@@ -971,12 +1022,12 @@ def : GCNPat <
def : GCNPat <
(fneg f16:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000)))
>;
def : GCNPat <
(fabs f16:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff)))
>;
def : GCNPat <
@@ -986,12 +1037,12 @@ def : GCNPat <
def : GCNPat <
(fneg v2f16:$src),
- (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000)))
>;
def : GCNPat <
(fabs v2f16:$src),
- (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff)))
>;
// This is really (fneg (fabs v2f16:$src))
@@ -1000,7 +1051,12 @@ def : GCNPat <
// VOP3P instructions, so it is turned into the bit op.
def : GCNPat <
(fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
- (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
+>;
+
+def : GCNPat <
+ (fneg (v2f16 (fabs v2f16:$src))),
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
/********** ================== **********/
@@ -1101,6 +1157,7 @@ let SubtargetPredicate = isGCN in {
def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;
+// FIXME: This should only be done for VALU inputs
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -1341,11 +1398,13 @@ def : GCNPat<
(V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
+let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
(V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
>;
}
+}
let OtherPredicates = [NoFP32Denormals] in {
def : GCNPat<
@@ -1375,6 +1434,16 @@ def : GCNPat<
>;
}
+let OtherPredicates = [HasDLInsts] in {
+def : GCNPat <
+ (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+} // End OtherPredicates = [HasDLInsts]
+
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
@@ -1385,11 +1454,6 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa
def : ExpPattern<AMDGPUexport, i32, EXP>;
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-def : GCNPat <
- (v2i16 (build_vector i16:$src0, i16:$src1)),
- (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
->;
-
// COPY_TO_REGCLASS is a workaround for a tablegen bug triggered by
// S_LSHL_B32's multiple outputs (the implicit scc def).
def : GCNPat <
@@ -1397,6 +1461,13 @@ def : GCNPat <
(v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
>;
+
+let SubtargetPredicate = HasVOP3PInsts in {
+def : GCNPat <
+ (v2i16 (build_vector i16:$src0, i16:$src1)),
+ (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
@@ -1404,6 +1475,7 @@ def : GCNPat <
(v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
>;
+
def : GCNPat <
(v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
(i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
@@ -1416,6 +1488,9 @@ def : GCNPat <
(v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
>;
+} // End SubtargetPredicate = HasVOP3PInsts
+
+
// def : GCNPat <
// (v2f16 (scalar_to_vector f16:$src0)),
// (COPY $src0)
@@ -1426,6 +1501,16 @@ def : GCNPat <
// (COPY $src0)
// >;
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
+>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -1490,7 +1575,7 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 84cd47a101a8..4b537540046f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -45,6 +45,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
@@ -102,7 +103,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
};
private:
- const SISubtarget *STM = nullptr;
+ const GCNSubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
@@ -137,7 +138,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+ StringRef getPassName() const override { return "SI Load Store Optimizer"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -150,10 +151,10 @@ public:
} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load / Store Optimizer", false, false)
+ "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load / Store Optimizer", false, false)
+ "SI Load Store Optimizer", false, false)
char SILoadStoreOptimizer::ID = 0;
@@ -173,10 +174,18 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
}
}
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
- // XXX: Should this be looking for implicit defs?
- for (const MachineOperand &Def : MI.defs())
- Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+ DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses) {
+ for (const MachineOperand &Op : MI.operands()) {
+ if (Op.isReg()) {
+ if (Op.isDef())
+ RegDefs.insert(Op.getReg());
+ else if (Op.readsReg() &&
+ TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+ PhysRegUses.insert(Op.getReg());
+ }
+ }
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
@@ -194,16 +203,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
- DenseSet<unsigned> &Defs,
+ DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses,
SmallVectorImpl<MachineInstr*> &Insts) {
for (MachineOperand &Use : MI.operands()) {
// If one of the defs is read, then there is a use of Def between I and the
// instruction that I will potentially be merged with. We will need to move
// this instruction after the merged instructions.
-
- if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+ //
+ // Similarly, if there is a def which is read by an instruction that is to
+ // be moved for merging, then we need to move the def-instruction as well.
+ // This can only happen for physical registers such as M0; virtual
+ // registers are in SSA form.
+ if (Use.isReg() &&
+ ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+ (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+ PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
- addDefsToList(MI, Defs);
+ addDefsUsesToList(MI, RegDefs, PhysRegUses);
return true;
}
}
@@ -332,8 +349,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
++MBBI;
- DenseSet<unsigned> DefsToMove;
- addDefsToList(*CI.I, DefsToMove);
+ DenseSet<unsigned> RegDefsToMove;
+ DenseSet<unsigned> PhysRegUsesToMove;
+ addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
for ( ; MBBI != E; ++MBBI) {
if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -356,14 +374,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
CI.InstsToMove.push_back(&*MBBI);
- addDefsToList(*MBBI, DefsToMove);
+ addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
continue;
}
// When we match I with another DS instruction we will be moving I down
// to the location of the matched instruction, so any uses of I will need to
// be moved down as well.
- addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+ addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+ CI.InstsToMove);
continue;
}
@@ -377,7 +396,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// DS_WRITE_B32 addr, f(w), idx1
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
// merging of the two writes.
- if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+ if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+ CI.InstsToMove))
continue;
bool Match = true;
@@ -436,7 +456,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// down past this instruction.
// check if we can move I across MBBI and if we can move all I's users
if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
break;
}
return false;
@@ -496,13 +516,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
+ unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ .addImm(CI.BaseOff);
+
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- unsigned AddOpc = STM->hasAddNoCarry() ?
- AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
- .addImm(CI.BaseOff)
+ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ .addReg(ImmReg)
.addReg(AddrReg->getReg());
}
@@ -532,7 +554,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+ LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Next;
}
@@ -556,7 +578,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
- const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
const MachineOperand *Data1
= TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
@@ -579,17 +601,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
- unsigned BaseReg = Addr->getReg();
+ unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
+ unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ .addImm(CI.BaseOff);
+
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- unsigned AddOpc = STM->hasAddNoCarry() ?
- AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
- .addImm(CI.BaseOff)
- .addReg(Addr->getReg());
+ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ .addReg(ImmReg)
+ .addReg(AddrReg->getReg());
}
MachineInstrBuilder Write2 =
@@ -608,7 +632,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+ LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Next;
}
@@ -849,9 +873,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
- if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
- (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
- Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+ if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+ Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
// EltSize is in units of the offset encoding.
CI.InstClass = S_BUFFER_LOAD_IMM;
CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
@@ -916,7 +939,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- STM = &MF.getSubtarget<SISubtarget>();
+ STM = &MF.getSubtarget<GCNSubtarget>();
if (!STM->loadStoreOptEnabled())
return false;
@@ -928,7 +951,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
assert(MRI->isSSA() && "Must be run on SSA");
- DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+ LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
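The mergeRead2Pair/mergeWrite2Pair changes above are the first callers of the getAddNoCarry helper whose contract is described in SIInstrInfo.h ("caller must add source operands"). A condensed sketch of that calling convention (InsertPt, OffsetReg and AddrVGPR are placeholder names for illustration, not code from this patch):

  // getAddNoCarry returns a partially built integer add; the caller appends
  // the two source operands.  Per the header comment, pre-GFX9 targets still
  // get an unused carry-out destination operand on the generated add.
  unsigned Base = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  TII->getAddNoCarry(*MBB, InsertPt, DL, Base)
      .addReg(OffsetReg)   // SGPR holding the materialized byte offset
      .addReg(AddrVGPR);   // original VGPR address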
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index a9af83323976..ad30317c344c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass lowers the pseudo control flow instructions to real
+/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
@@ -51,6 +51,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -343,11 +344,49 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) {
}
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
- MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Dst = MI.getOperand(0).getReg();
+
+ // Skip ANDing with exec if the break condition is already masked by exec
+ // because it is a V_CMP in the same basic block. (We know the break
+ // condition operand was an i1 in IR, so if it is a VALU instruction it must
+ // be one with a carry-out.)
+ bool SkipAnding = false;
+ if (MI.getOperand(1).isReg()) {
+ if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
+ SkipAnding = Def->getParent() == MI.getParent()
+ && SIInstrInfo::isVALU(*Def);
+ }
+ }
+
+ // AND the break condition operand with exec, then OR that into the "loop
+ // exit" mask.
+ MachineInstr *And = nullptr, *Or = nullptr;
+ if (!SkipAnding) {
+ And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(1));
+ Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .addReg(Dst)
+ .add(MI.getOperand(2));
+ } else
+ Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2));
+
+ if (LIS) {
+ if (And)
+ LIS->InsertMachineInstrInMaps(*And);
+ LIS->ReplaceMachineInstrInMaps(MI, *Or);
+ }
+
+ MI.eraseFromParent();
}
void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
- MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+ // Lowered in the same way as emitIfBreak above.
+ emitIfBreak(MI);
}
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
@@ -414,8 +453,8 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
return;
for (const auto &SrcOp : Def->explicit_operands())
- if (SrcOp.isUse() && (!SrcOp.isReg() ||
- TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+ if (SrcOp.isReg() && SrcOp.isUse() &&
+ (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
SrcOp.getReg() == AMDGPU::EXEC))
Src.push_back(SrcOp);
}
@@ -447,7 +486,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
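The emitIfBreak rewrite above amounts to a per-lane mask computation. A minimal semantic model, written as plain C++ rather than target code (lane masks modelled as 64-bit integers; this is a sketch, not part of the patch):

  #include <cstdint>

  // Each bit is one lane.  The break condition only counts for lanes that are
  // currently active (exec) and is accumulated into the loop-exit mask.  The
  // AND is skipped when the condition already comes from a V_CMP in the same
  // block, since V_CMP results are already masked by exec.
  uint64_t lowerIfBreak(uint64_t Exec, uint64_t Cond, uint64_t ExitMask,
                        bool CondAlreadyExecMasked) {
    if (!CondAlreadyExecMasked)
      Cond &= Exec;            // S_AND_B64 exec, cond
    return ExitMask | Cond;    // S_OR_B64 into the accumulated exit mask
  }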
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index da57b90dd8c4..ecc6cff407e1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -17,6 +17,8 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPULaneDominator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -64,7 +66,7 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
@@ -141,7 +143,8 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
DefInst->getOperand(3).getReg()) &&
TRI->getCommonSubClass(
MRI.getRegClass(DefInst->getOperand(3).getReg()),
- &AMDGPU::SGPR_64RegClass)) {
+ &AMDGPU::SGPR_64RegClass) &&
+ AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
.add(Dst)
.addReg(AMDGPU::EXEC)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 888d8f978aff..0d5ff75e37ed 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -34,9 +35,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
KernargSegmentPtr(false),
DispatchID(false),
FlatScratchInit(false),
- GridWorkgroupCountX(false),
- GridWorkgroupCountY(false),
- GridWorkgroupCountZ(false),
WorkGroupIDX(false),
WorkGroupIDY(false),
WorkGroupIDZ(false),
@@ -47,12 +45,26 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDZ(false),
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ GITPtrHigh(0xffffffff),
+ HighBitsOf32BitAddress(0) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
+ Occupancy = getMaxWavesPerEU();
+ limitOccupancy(MF);
+ CallingConv::ID CC = F.getCallingConv();
+
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+ if (!F.arg_empty())
+ KernargSegmentPtr = true;
+ WorkGroupIDX = true;
+ WorkItemIDX = true;
+ } else if (CC == CallingConv::AMDGPU_PS) {
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+ }
+
if (!isEntryFunction()) {
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
@@ -69,18 +81,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
} else {
- if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
- KernargSegmentPtr = true;
- }
-
- CallingConv::ID CC = F.getCallingConv();
- if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
- if (!F.arg_empty())
+ if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
KernargSegmentPtr = true;
- WorkGroupIDX = true;
- WorkItemIDX = true;
- } else if (CC == CallingConv::AMDGPU_PS) {
- PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+ MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
+ MaxKernArgAlign);
+ }
}
if (ST.debuggerEmitPrologue()) {
@@ -132,7 +137,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+ bool IsCOV2 = ST.isAmdCodeObjectV2(F);
if (IsCOV2) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
@@ -145,7 +150,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-dispatch-id"))
DispatchID = true;
- } else if (ST.isMesaGfxShader(MF)) {
+ } else if (ST.isMesaGfxShader(F)) {
if (HasStackObjects || MaySpill)
ImplicitBufferPtr = true;
}
@@ -164,6 +169,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
StringRef S = A.getValueAsString();
if (!S.empty())
S.consumeInteger(0, GITPtrHigh);
+
+ A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
+ S = A.getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, HighBitsOf32BitAddress);
+}
+
+void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
+ limitOccupancy(getMaxWavesPerEU());
+ const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
+ limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
+ MF.getFunction()));
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -236,7 +253,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
if (!SpillLanes.empty())
return true;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -267,10 +284,9 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
}
Optional<int> CSRSpillFI;
- if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) {
- // TODO: Should this be a CreateSpillStackObject? This is technically a
- // weird CSR spill.
- CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false);
+ if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
+ isCalleeSavedReg(CSRegs, LaneVGPR)) {
+ CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
}
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
@@ -293,3 +309,29 @@ void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI)
for (auto &R : SGPRToVGPRSpills)
MFI.RemoveStackObject(R.first);
}
+
+
+/// \returns VGPR used for \p Dim's work item ID.
+unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
+ switch (Dim) {
+ case 0:
+ assert(hasWorkItemIDX());
+ return AMDGPU::VGPR0;
+ case 1:
+ assert(hasWorkItemIDY());
+ return AMDGPU::VGPR1;
+ case 2:
+ assert(hasWorkItemIDZ());
+ return AMDGPU::VGPR2;
+ }
+ llvm_unreachable("unexpected dimension");
+}
+
+MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
+ assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+ return AMDGPU::SGPR0 + NumUserSGPRs;
+}
+
+MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
+ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 02e63f0258e6..ef91d1e43075 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -16,7 +16,9 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
@@ -34,7 +36,6 @@ namespace llvm {
class MachineFrameInfo;
class MachineFunction;
-class SIInstrInfo;
class TargetRegisterClass;
class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
@@ -50,15 +51,11 @@ public:
}
bool isAliased(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
bool mayAlias(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
};
@@ -74,15 +71,11 @@ public:
}
bool isAliased(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
bool mayAlias(const MachineFrameInfo *) const override {
- // FIXME: If we ever change image intrinsics to accept fat pointers, then
- // this could be true for some cases.
- return false;
+ return true;
}
};
@@ -150,6 +143,7 @@ private:
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
+ bool IsStackRealigned = false;
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
@@ -161,9 +155,6 @@ private:
bool KernargSegmentPtr : 1;
bool DispatchID : 1;
bool FlatScratchInit : 1;
- bool GridWorkgroupCountX : 1;
- bool GridWorkgroupCountY : 1;
- bool GridWorkgroupCountZ : 1;
// Feature bits required for inputs passed in system SGPRs.
bool WorkGroupIDX : 1; // Always initialized.
@@ -190,25 +181,25 @@ private:
// current hardware only allows a 16 bit value.
unsigned GITPtrHigh;
- MCPhysReg getNextUserSGPR() const {
- assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
- return AMDGPU::SGPR0 + NumUserSGPRs;
- }
+ unsigned HighBitsOf32BitAddress;
- MCPhysReg getNextSystemSGPR() const {
- return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
- }
+ // Current recorded maximum possible occupancy.
+ unsigned Occupancy;
+
+ MCPhysReg getNextUserSGPR() const;
+
+ MCPhysReg getNextSystemSGPR() const;
public:
struct SpilledReg {
- unsigned VGPR = AMDGPU::NoRegister;
+ unsigned VGPR = 0;
int Lane = -1;
SpilledReg() = default;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {}
bool hasLane() { return Lane != -1;}
- bool hasReg() { return VGPR != AMDGPU::NoRegister;}
+ bool hasReg() { return VGPR != 0;}
};
struct SGPRSpillVGPRCSR {
@@ -248,8 +239,8 @@ public:
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
- bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }
- unsigned getTIDReg() const { return TIDReg; }
+ bool hasCalculatedTID() const { return TIDReg != 0; };
+ unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
unsigned getBytesInStackArgArea() const {
@@ -342,18 +333,6 @@ public:
return FlatScratchInit;
}
- bool hasGridWorkgroupCountX() const {
- return GridWorkgroupCountX;
- }
-
- bool hasGridWorkgroupCountY() const {
- return GridWorkgroupCountY;
- }
-
- bool hasGridWorkgroupCountZ() const {
- return GridWorkgroupCountZ;
- }
-
bool hasWorkGroupIDX() const {
return WorkGroupIDX;
}
@@ -415,6 +394,10 @@ public:
return GITPtrHigh;
}
+ unsigned get32BitAddressHighBits() const {
+ return HighBitsOf32BitAddress;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -427,14 +410,14 @@ public:
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
- /// \brief Returns the physical register reserved for use as the resource
+ /// Returns the physical register reserved for use as the resource
/// descriptor for scratch accesses.
unsigned getScratchRSrcReg() const {
return ScratchRSrcReg;
}
void setScratchRSrcReg(unsigned Reg) {
- assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ assert(Reg != 0 && "Should never be unset");
ScratchRSrcReg = Reg;
}
@@ -447,6 +430,7 @@ public:
}
void setStackPtrOffsetReg(unsigned Reg) {
+ assert(Reg != 0 && "Should never be unset");
StackPtrOffsetReg = Reg;
}
@@ -459,7 +443,7 @@ public:
}
void setScratchWaveOffsetReg(unsigned Reg) {
- assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ assert(Reg != 0 && "Should never be unset");
ScratchWaveOffsetReg = Reg;
if (isEntryFunction())
FrameOffsetReg = ScratchWaveOffsetReg;
@@ -497,6 +481,14 @@ public:
HasNonSpillStackObjects = StackObject;
}
+ bool isStackRealigned() const {
+ return IsStackRealigned;
+ }
+
+ void setIsStackRealigned(bool Realigned = true) {
+ IsStackRealigned = Realigned;
+ }
+
unsigned getNumSpilledSGPRs() const {
return NumSpilledSGPRs;
}
@@ -579,7 +571,7 @@ public:
return DebuggerWorkGroupIDStackObjectIndices[Dim];
}
- /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+ /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
@@ -591,7 +583,7 @@ public:
return DebuggerWorkItemIDStackObjectIndices[Dim];
}
- /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+ /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
@@ -614,20 +606,7 @@ public:
}
/// \returns VGPR used for \p Dim' work item ID.
- unsigned getWorkItemIDVGPR(unsigned Dim) const {
- switch (Dim) {
- case 0:
- assert(hasWorkItemIDX());
- return AMDGPU::VGPR0;
- case 1:
- assert(hasWorkItemIDY());
- return AMDGPU::VGPR1;
- case 2:
- assert(hasWorkItemIDZ());
- return AMDGPU::VGPR2;
- }
- llvm_unreachable("unexpected dimension");
- }
+ unsigned getWorkItemIDVGPR(unsigned Dim) const;
unsigned getLDSWaveSpillSize() const {
return LDSWaveSpillSize;
@@ -650,6 +629,29 @@ public:
llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII));
return PSV.first->second.get();
}
+
+ unsigned getOccupancy() const {
+ return Occupancy;
+ }
+
+ unsigned getMinAllowedOccupancy() const {
+ if (!isMemoryBound() && !needsWaveLimiter())
+ return Occupancy;
+ return (Occupancy < 4) ? Occupancy : 4;
+ }
+
+ void limitOccupancy(const MachineFunction &MF);
+
+ void limitOccupancy(unsigned Limit) {
+ if (Occupancy > Limit)
+ Occupancy = Limit;
+ }
+
+ void increaseOccupancy(const MachineFunction &MF, unsigned Limit) {
+ if (Occupancy < Limit)
+ Occupancy = Limit;
+ limitOccupancy(MF);
+ }
};
} // end namespace llvm
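
The occupancy helpers added to SIMachineFunctionInfo above are compact; as a reading aid, the standalone sketch below (illustrative only, not part of the patch) restates the clamp performed by getMinAllowedOccupancy(): memory-bound or wave-limited kernels are held to at most 4 waves, otherwise the recorded occupancy is returned unchanged.

    #include <algorithm>

    // Illustrative sketch, not from the patch: the clamp applied by
    // getMinAllowedOccupancy() on top of the recorded occupancy.
    unsigned minAllowedOccupancy(unsigned Occupancy, bool IsMemoryBound,
                                 bool NeedsWaveLimiter) {
      if (!IsMemoryBound && !NeedsWaveLimiter)
        return Occupancy;             // no extra limit requested
      return std::min(Occupancy, 4u); // memory-bound kernels capped at 4 waves
    }
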
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 6b67b76652ed..18754442898f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Machine Scheduler interface
+/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
@@ -154,6 +155,8 @@ static const char *getReasonStr(SIScheduleCandReason Reason) {
#endif
+namespace llvm {
+namespace SISched {
static bool tryLess(int TryVal, int CandVal,
SISchedulerCandidate &TryCand,
SISchedulerCandidate &Cand,
@@ -187,6 +190,8 @@ static bool tryGreater(int TryVal, int CandVal,
Cand.setRepeat(Reason);
return false;
}
+} // end namespace SISched
+} // end namespace llvm
// SIScheduleBlock //
@@ -212,7 +217,8 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
}
if (Cand.SGPRUsage > 60 &&
- tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+ SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
+ TryCand, Cand, RegUsage))
return;
// Schedule low latency instructions as top as possible.
@@ -230,21 +236,22 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
// could go quite high, thus above the arbitrary limit of 60 will encourage
// use the already loaded constants (in order to release some SGPRs) before
// loading more.
- if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
- Cand.HasLowLatencyNonWaitedParent,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
+ Cand.HasLowLatencyNonWaitedParent,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
- if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
if (TryCand.IsLowLatency &&
- tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
- TryCand, Cand, SIScheduleCandReason::Depth))
+ SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+ TryCand, Cand, SIScheduleCandReason::Depth))
return;
- if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
+ TryCand, Cand, RegUsage))
return;
// Fall through to original instruction order.
@@ -1201,7 +1208,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
NextReservedID = 1;
NextNonReservedID = DAGSize + 1;
- DEBUG(dbgs() << "Coloring the graph\n");
+ LLVM_DEBUG(dbgs() << "Coloring the graph\n");
if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
colorHighLatenciesGroups();
@@ -1258,13 +1265,11 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
SIScheduleBlock *Block = CurrentBlocks[i];
Block->finalizeUnits();
}
- DEBUG(
- dbgs() << "Blocks created:\n\n";
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
- }
- );
+ LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ });
}
// Two functions taken from Codegen/MachineScheduler.cpp
@@ -1274,7 +1279,7 @@ static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
MachineBasicBlock::const_iterator End) {
for (; I != End; ++I) {
- if (!I->isDebugValue())
+ if (!I->isDebugInstr())
break;
}
return I;
@@ -1284,7 +1289,7 @@ void SIScheduleBlockCreator::topologicalSort() {
unsigned DAGSize = CurrentBlocks.size();
std::vector<int> WorkList;
- DEBUG(dbgs() << "Topological Sort\n");
+ LLVM_DEBUG(dbgs() << "Topological Sort\n");
WorkList.reserve(DAGSize);
TopDownIndex2Block.resize(DAGSize);
@@ -1331,11 +1336,11 @@ void SIScheduleBlockCreator::topologicalSort() {
void SIScheduleBlockCreator::scheduleInsideBlocks() {
unsigned DAGSize = CurrentBlocks.size();
- DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+ LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
// We do schedule a valid scheduling such that a Block corresponds
// to a range of instructions.
- DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+ LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
for (unsigned i = 0, e = DAGSize; i != e; ++i) {
SIScheduleBlock *Block = CurrentBlocks[i];
Block->fastSchedule();
@@ -1389,7 +1394,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
}
- DEBUG(dbgs() << "Restoring MI Pos\n");
+ LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
// Restore old ordering (which prevents a LIS->handleMove bug).
for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
MachineBasicBlock::iterator POld = PosOld[i-1];
@@ -1403,12 +1408,10 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
}
}
- DEBUG(
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
- }
- );
+ LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ });
}
void SIScheduleBlockCreator::fillStats() {
@@ -1559,13 +1562,10 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
blockScheduled(Block);
}
- DEBUG(
- dbgs() << "Block Order:";
- for (SIScheduleBlock* Block : BlocksScheduled) {
- dbgs() << ' ' << Block->getID();
- }
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
+ : BlocksScheduled) {
+ dbgs() << ' ' << Block->getID();
+ } dbgs() << '\n';);
}
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
@@ -1576,19 +1576,19 @@ bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
}
// Try to hide high latencies.
- if (tryLess(TryCand.LastPosHighLatParentScheduled,
- Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
+ if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
+ Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
return true;
// Schedule high latencies early so you can hide them better.
- if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
- TryCand, Cand, Latency))
+ if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+ TryCand, Cand, Latency))
return true;
- if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
- TryCand, Cand, Depth))
+ if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
+ TryCand, Cand, Depth))
return true;
- if (tryGreater(TryCand.NumHighLatencySuccessors,
- Cand.NumHighLatencySuccessors,
- TryCand, Cand, Successor))
+ if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
+ Cand.NumHighLatencySuccessors,
+ TryCand, Cand, Successor))
return true;
return false;
}
@@ -1600,17 +1600,17 @@ bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
return true;
}
- if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
- TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+ TryCand, Cand, RegUsage))
return true;
- if (tryGreater(TryCand.NumSuccessors > 0,
- Cand.NumSuccessors > 0,
- TryCand, Cand, Successor))
+ if (SISched::tryGreater(TryCand.NumSuccessors > 0,
+ Cand.NumSuccessors > 0,
+ TryCand, Cand, Successor))
return true;
- if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+ if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
return true;
- if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
- TryCand, Cand, RegUsage))
+ if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+ TryCand, Cand, RegUsage))
return true;
return false;
}
@@ -1628,18 +1628,17 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
maxVregUsage = VregCurrentUsage;
if (SregCurrentUsage > maxSregUsage)
maxSregUsage = SregCurrentUsage;
- DEBUG(
- dbgs() << "Picking New Blocks\n";
- dbgs() << "Available: ";
- for (SIScheduleBlock* Block : ReadyBlocks)
- dbgs() << Block->getID() << ' ';
- dbgs() << "\nCurrent Live:\n";
- for (unsigned Reg : LiveRegs)
- dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
- dbgs() << '\n';
- dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
- dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
+ for (SIScheduleBlock *Block
+ : ReadyBlocks) dbgs()
+ << Block->getID() << ' ';
+ dbgs() << "\nCurrent Live:\n";
+ for (unsigned Reg
+ : LiveRegs) dbgs()
+ << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << '\n';
+ dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+ dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
Cand.Block = nullptr;
for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
@@ -1671,20 +1670,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
if (TryCand.Reason != NoCand) {
Cand.setBest(TryCand);
Best = I;
- DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
- << getReasonStr(Cand.Reason) << '\n');
+ LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+ << getReasonStr(Cand.Reason) << '\n');
}
}
- DEBUG(
- dbgs() << "Picking: " << Cand.Block->getID() << '\n';
- dbgs() << "Is a block with high latency instruction: "
- << (Cand.IsHighLatency ? "yes\n" : "no\n");
- dbgs() << "Position of last high latency dependency: "
- << Cand.LastPosHighLatParentScheduled << '\n';
- dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
- dbgs() << '\n';
- );
+ LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+ dbgs() << "Is a block with high latency instruction: "
+ << (Cand.IsHighLatency ? "yes\n" : "no\n");
+ dbgs() << "Position of last high latency dependency: "
+ << Cand.LastPosHighLatParentScheduled << '\n';
+ dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+ dbgs() << '\n';);
Block = Cand.Block;
ReadyBlocks.erase(Best);
@@ -1933,13 +1930,10 @@ void SIScheduleDAGMI::schedule()
{
SmallVector<SUnit*, 8> TopRoots, BotRoots;
SIScheduleBlockResult Best, Temp;
- DEBUG(dbgs() << "Preparing Scheduling\n");
+ LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
buildDAGWithRegPressure();
- DEBUG(
- for(SUnit& SU : SUnits)
- SU.dumpAll(this)
- );
+ LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this));
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
@@ -2041,15 +2035,15 @@ void SIScheduleDAGMI::schedule()
scheduleMI(SU, true);
- DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
- << *SU->getInstr());
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
placeDebugValues();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) << " ***\n";
dumpSchedule();
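
Throughout SIMachineScheduler.cpp the patch replaces the old DEBUG(...) macro with LLVM_DEBUG(...). Both come from llvm/Support/Debug.h and are compiled out in NDEBUG builds; only the spelling changes. A minimal sketch of the pattern being adopted (the DEBUG_TYPE string and helper below are illustrative, not from the patch):

    #define DEBUG_TYPE "si-sched-demo"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    // Only emitted in asserts builds when run with -debug or
    // -debug-only=si-sched-demo.
    static void reportPick(unsigned BlockID) {
      LLVM_DEBUG(llvm::dbgs() << "Picking block " << BlockID << '\n');
    }
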
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index d824e38504e6..0ce68ac6a897 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI Machine Scheduler interface
+/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index c73fb10b7ea0..938cdaf1ef8f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Memory legalizer - implements memory model. More information can be
+/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
@@ -19,7 +19,9 @@
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -36,6 +38,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>
@@ -47,42 +50,142 @@ using namespace llvm::AMDGPU;
namespace {
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
+/// Memory operation flags. Can be ORed together.
+enum class SIMemOp {
+ NONE = 0u,
+ LOAD = 1u << 0,
+ STORE = 1u << 1,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
+};
+
+/// Position to insert a new instruction relative to an existing
+/// instruction.
+enum class Position {
+ BEFORE,
+ AFTER
+};
+
+/// The atomic synchronization scopes supported by the AMDGPU target.
+enum class SIAtomicScope {
+ NONE,
+ SINGLETHREAD,
+ WAVEFRONT,
+ WORKGROUP,
+ AGENT,
+ SYSTEM
+};
+
+/// The distinct address spaces supported by the AMDGPU target for
+/// atomic memory operations. Can be ORed together.
+enum class SIAtomicAddrSpace {
+ NONE = 0u,
+ GLOBAL = 1u << 0,
+ LDS = 1u << 1,
+ SCRATCH = 1u << 2,
+ GDS = 1u << 3,
+ OTHER = 1u << 4,
+
+ /// The address spaces that can be accessed by a FLAT instruction.
+ FLAT = GLOBAL | LDS | SCRATCH,
+
+ /// The address spaces that support atomic instructions.
+ ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+
+ /// All address spaces.
+ ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
+
+/// Sets named bit \p BitName to "true" if present in instruction \p MI.
+/// \returns Returns true if \p MI is modified, false otherwise.
+template <uint16_t BitName>
+bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
+ int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
+ if (BitIdx == -1)
+ return false;
+
+ MachineOperand &Bit = MI->getOperand(BitIdx);
+ if (Bit.getImm() != 0)
+ return false;
+
+ Bit.setImm(1);
+ return true;
+}
+
class SIMemOpInfo final {
private:
- SyncScope::ID SSID = SyncScope::System;
+
+ friend class SIMemOpAccess;
+
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ SIAtomicScope Scope = SIAtomicScope::SYSTEM;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
bool IsNonTemporal = false;
- SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
- : SSID(SSID), Ordering(Ordering) {}
-
- SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
- AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
- : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
- IsNonTemporal(IsNonTemporal) {}
-
- /// \returns Info constructed from \p MI, which has at least machine memory
- /// operand.
- static Optional<SIMemOpInfo> constructFromMIWithMMO(
- const MachineBasicBlock::iterator &MI);
+ SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
+ SIAtomicScope Scope = SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
+ bool IsCrossAddressSpaceOrdering = true,
+ AtomicOrdering FailureOrdering =
+ AtomicOrdering::SequentiallyConsistent,
+ bool IsNonTemporal = false)
+ : Ordering(Ordering), FailureOrdering(FailureOrdering),
+ Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
+ InstrAddrSpace(InstrAddrSpace),
+ IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+ IsNonTemporal(IsNonTemporal) {
+ // There is also no cross address space ordering if the ordering
+ // address space is the same as the instruction address space and
+ // only contains a single address space.
+ if ((OrderingAddrSpace == InstrAddrSpace) &&
+ isPowerOf2_32(uint32_t(InstrAddrSpace)))
+ IsCrossAddressSpaceOrdering = false;
+ }
public:
- /// \returns Synchronization scope ID of the machine instruction used to
+ /// \returns Atomic synchronization scope of the machine instruction used to
/// create this SIMemOpInfo.
- SyncScope::ID getSSID() const {
- return SSID;
+ SIAtomicScope getScope() const {
+ return Scope;
}
+
/// \returns Ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getOrdering() const {
return Ordering;
}
+
/// \returns Failure ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getFailureOrdering() const {
return FailureOrdering;
}
+
+ /// \returns The address spaces accessed by the machine
+ /// instruction used to create this SIMemOpInfo.
+ SIAtomicAddrSpace getInstrAddrSpace() const {
+ return InstrAddrSpace;
+ }
+
+ /// \returns The address spaces that must be ordered by the machine
+ /// instruction used to create this SIMemOpInfo.
+ SIAtomicAddrSpace getOrderingAddrSpace() const {
+ return OrderingAddrSpace;
+ }
+
+ /// \returns True iff memory ordering of operations on
+ /// different address spaces is required.
+ bool getIsCrossAddressSpaceOrdering() const {
+ return IsCrossAddressSpaceOrdering;
+ }
+
/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is non-temporal, false otherwise.
bool isNonTemporal() const {
@@ -95,109 +198,198 @@ public:
return Ordering != AtomicOrdering::NotAtomic;
}
+};
+
+class SIMemOpAccess final {
+private:
+
+ AMDGPUAS SIAddrSpaceInfo;
+ AMDGPUMachineModuleInfo *MMI = nullptr;
+
+ /// Reports unsupported message \p Msg for \p MI to LLVM context.
+ void reportUnsupported(const MachineBasicBlock::iterator &MI,
+ const char *Msg) const;
+
+ /// Inspects the target synchronization scope \p SSID and determines
+ /// the SI atomic scope it corresponds to, the address spaces it
+ /// covers, and whether the memory ordering applies between address
+ /// spaces.
+ Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+ toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
+
+ /// \returns A bit set of the address spaces accessed by \p AS.
+ SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
+
+ /// \returns Info constructed from \p MI, which has at least machine memory
+ /// operand.
+ Optional<SIMemOpInfo> constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI) const;
+
+public:
+ /// Construct class to support accessing the machine memory operands
+ /// of instructions in the machine function \p MF.
+ SIMemOpAccess(MachineFunction &MF);
+
/// \returns Load info if \p MI is a load operation, "None" otherwise.
- static Optional<SIMemOpInfo> getLoadInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getLoadInfo(
+ const MachineBasicBlock::iterator &MI) const;
+
/// \returns Store info if \p MI is a store operation, "None" otherwise.
- static Optional<SIMemOpInfo> getStoreInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getStoreInfo(
+ const MachineBasicBlock::iterator &MI) const;
+
/// \returns Atomic fence info if \p MI is an atomic fence operation,
/// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicFenceInfo(
- const MachineBasicBlock::iterator &MI);
- /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation,
- /// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicCmpxchgInfo(
- const MachineBasicBlock::iterator &MI);
- /// \returns Atomic rmw info if \p MI is an atomic rmw operation,
- /// "None" otherwise.
- static Optional<SIMemOpInfo> getAtomicRmwInfo(
- const MachineBasicBlock::iterator &MI);
+ Optional<SIMemOpInfo> getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI) const;
- /// \brief Reports unknown synchronization scope used in \p MI to LLVM
- /// context.
- static void reportUnknownSyncScope(
- const MachineBasicBlock::iterator &MI);
+ /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
+ /// rmw operation, "None" otherwise.
+ Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
+ const MachineBasicBlock::iterator &MI) const;
};
-class SIMemoryLegalizer final : public MachineFunctionPass {
-private:
- /// \brief Machine module info.
- const AMDGPUMachineModuleInfo *MMI = nullptr;
+class SICacheControl {
+protected:
- /// \brief Instruction info.
+ /// Instruction info.
const SIInstrInfo *TII = nullptr;
- /// \brief Immediate for "vmcnt(0)".
- unsigned Vmcnt0Immediate = 0;
+ IsaInfo::IsaVersion IV;
- /// \brief Opcode for cache invalidation instruction (L1).
- unsigned Wbinvl1Opcode = 0;
+ SICacheControl(const GCNSubtarget &ST);
- /// \brief List of atomic pseudo instructions.
- std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+public:
- /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
- /// true if \p MI is modified, false otherwise.
- template <uint16_t BitName>
- bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
- int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
- if (BitIdx == -1)
- return false;
+ /// Create a cache control for the subtarget \p ST.
+ static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
+
+ /// Update \p MI memory load instruction to bypass any caches up to
+ /// the \p Scope memory scope for address spaces \p AddrSpace.
+ /// Return true iff the instruction was modified.
+ virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const = 0;
+
+ /// Update \p MI memory instruction to indicate it is
+ /// nontemporal. Return true iff the instruction was modified.
+ virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
+ const = 0;
+
+ /// Inserts any necessary instructions at position \p Pos relative
+ /// to instruction \p MI to ensure any caches associated with
+ /// address spaces \p AddrSpace for memory scopes up to memory scope
+ /// \p Scope are invalidated. Returns true iff any instructions
+ /// were inserted.
+ virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const = 0;
+
+ /// Inserts any necessary instructions at position \p Pos relative
+ /// to instruction \p MI to ensure memory instructions of kind \p Op
+ /// associated with address spaces \p AddrSpace have completed as
+ /// observed by other memory instructions executing in memory scope
+ /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
+ /// ordering is between address spaces. Returns true iff any
+ /// instructions were inserted.
+ virtual bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const = 0;
+
+ /// Virtual destructor to allow derivations to be deleted.
+ virtual ~SICacheControl() = default;
- MachineOperand &Bit = MI->getOperand(BitIdx);
- if (Bit.getImm() != 0)
- return false;
+};
- Bit.setImm(1);
- return true;
- }
+class SIGfx6CacheControl : public SICacheControl {
+protected:
- /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::glc>(MI);
}
- /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::slc>(MI);
}
- /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
- /// Always returns true.
- bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
- bool Before = true) const;
- /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI.
- /// Always returns true.
- bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
- bool Before = true) const;
+public:
+
+ SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
+class SIGfx7CacheControl : public SIGfx6CacheControl {
+public:
- /// \brief Removes all processed atomic pseudo instructions from the current
+ SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+};
+
+class SIMemoryLegalizer final : public MachineFunctionPass {
+private:
+
+ /// Cache Control.
+ std::unique_ptr<SICacheControl> CC = nullptr;
+
+ /// List of atomic pseudo instructions.
+ std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+
+ /// Return true iff instruction \p MI is an atomic instruction that
+ /// returns a result.
+ bool isAtomicRet(const MachineInstr &MI) const {
+ return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+ }
+
+ /// Removes all processed atomic pseudo instructions from the current
/// function. Returns true if current function is modified, false otherwise.
bool removeAtomicPseudoMIs();
- /// \brief Expands load operation \p MI. Returns true if instructions are
+ /// Expands load operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandLoad(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands store operation \p MI. Returns true if instructions are
+ /// Expands store operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandStore(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic fence operation \p MI. Returns true if
+ /// Expands atomic fence operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic cmpxchg operation \p MI. Returns true if
- /// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicCmpxchg(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI);
- /// \brief Expands atomic rmw operation \p MI. Returns true if
+ /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicRmw(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI);
+ bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI);
public:
static char ID;
@@ -218,48 +410,129 @@ public:
} // end namespace anonymous
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
- const MachineBasicBlock::iterator &MI) {
- assert(MI->getNumMemOperands() > 0);
+void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
+ const char *Msg) const {
+ const Function &Func = MI->getParent()->getParent()->getFunction();
+ DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
+ Func.getContext().diagnose(Diag);
+}
- const MachineFunction *MF = MI->getParent()->getParent();
- const AMDGPUMachineModuleInfo *MMI =
- &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
+ SIAtomicAddrSpace InstrScope) const {
+ /// TODO: For now, assume the OpenCL memory model, which treats each
+ /// address space as having a separate happens-before relation, and
+ /// so an instruction only has ordering with respect to the address
+ /// space it accesses, and if it accesses multiple address spaces it
+ /// does not require ordering of operations in different address
+ /// spaces.
+ if (SSID == SyncScope::System)
+ return std::make_tuple(SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getAgentSSID())
+ return std::make_tuple(SIAtomicScope::AGENT,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getWorkgroupSSID())
+ return std::make_tuple(SIAtomicScope::WORKGROUP,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == MMI->getWavefrontSSID())
+ return std::make_tuple(SIAtomicScope::WAVEFRONT,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ if (SSID == SyncScope::SingleThread)
+ return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+ SIAtomicAddrSpace::ATOMIC & InstrScope,
+ false);
+ /// TODO: To support the HSA memory model, additional memory scopes
+ /// that do require cross address space ordering need to be added.
+ return None;
+}
+
+SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
+ if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
+ return SIAtomicAddrSpace::FLAT;
+ if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
+ return SIAtomicAddrSpace::GLOBAL;
+ if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
+ return SIAtomicAddrSpace::LDS;
+ if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
+ return SIAtomicAddrSpace::SCRATCH;
+ if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
+ return SIAtomicAddrSpace::GDS;
+
+ return SIAtomicAddrSpace::OTHER;
+}
+
+SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
+ SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
+ MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+}
+
+Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->getNumMemOperands() > 0);
SyncScope::ID SSID = SyncScope::SingleThread;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+ SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsNonTemporal = true;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
for (const auto &MMO : MI->memoperands()) {
- const auto &IsSyncScopeInclusion =
- MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
- if (!IsSyncScopeInclusion) {
- reportUnknownSyncScope(MI);
- return None;
- }
-
- SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
- Ordering =
- isStrongerThan(Ordering, MMO->getOrdering()) ?
- Ordering : MMO->getOrdering();
- FailureOrdering =
- isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
- FailureOrdering : MMO->getFailureOrdering();
+ IsNonTemporal &= MMO->isNonTemporal();
+ InstrAddrSpace |=
+ toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
+ AtomicOrdering OpOrdering = MMO->getOrdering();
+ if (OpOrdering != AtomicOrdering::NotAtomic) {
+ const auto &IsSyncScopeInclusion =
+ MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
+ if (!IsSyncScopeInclusion) {
+ reportUnsupported(MI,
+ "Unsupported non-inclusive atomic synchronization scope");
+ return None;
+ }
- if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
- IsNonTemporal = false;
+ SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+ Ordering =
+ isStrongerThan(Ordering, OpOrdering) ?
+ Ordering : MMO->getOrdering();
+ assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
+ MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
+ FailureOrdering =
+ isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
+ FailureOrdering : MMO->getFailureOrdering();
+ }
}
- return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
+ SIAtomicScope Scope = SIAtomicScope::NONE;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
+ if (Ordering != AtomicOrdering::NotAtomic) {
+ auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
+ if (!ScopeOrNone) {
+ reportUnsupported(MI, "Unsupported atomic synchronization scope");
+ return None;
+ }
+ std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+ ScopeOrNone.getValue();
+ if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ reportUnsupported(MI, "Unsupported atomic address space");
+ return None;
+ }
+ }
+ return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+ IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && !MI->mayStore()))
@@ -267,15 +540,13 @@ Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(!MI->mayLoad() && MI->mayStore()))
@@ -283,30 +554,46 @@ Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
return None;
- SyncScope::ID SSID =
- static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
- return SIMemOpInfo(SSID, Ordering);
+ static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
+
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+ auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
+ if (!ScopeOrNone) {
+ reportUnsupported(MI, "Unsupported atomic synchronization scope");
+ return None;
+ }
+
+ SIAtomicScope Scope = SIAtomicScope::NONE;
+ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+ bool IsCrossAddressSpaceOrdering = false;
+ std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+ ScopeOrNone.getValue();
+
+ if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ reportUnsupported(MI, "Unsupported atomic address space");
+ return None;
+ }
+
+ return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
+ IsCrossAddressSpaceOrdering);
}
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo(
- const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
+ const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && MI->mayStore()))
@@ -314,68 +601,251 @@ Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent,
- AtomicOrdering::SequentiallyConsistent);
+ return SIMemOpInfo();
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ return constructFromMIWithMMO(MI);
+}
+
+SICacheControl::SICacheControl(const GCNSubtarget &ST) {
+ TII = ST.getInstrInfo();
+ IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
}
/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicRmwInfo(
- const MachineBasicBlock::iterator &MI) {
- assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
+ GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return make_unique<SIGfx6CacheControl>(ST);
+ return make_unique<SIGfx7CacheControl>(ST);
+}
- if (!(MI->mayLoad() && MI->mayStore()))
- return None;
+bool SIGfx6CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
- // Be conservative if there are no memory operands.
- if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo(SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ /// TODO: Do not set glc for rmw atomic operations as they
+ /// implicitly bypass the L1 cache.
- return SIMemOpInfo::constructFromMIWithMMO(MI);
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
}
-/* static */
-void SIMemOpInfo::reportUnknownSyncScope(
- const MachineBasicBlock::iterator &MI) {
- DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(),
- "Unsupported synchronization scope");
- LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext();
- CTX->diagnose(Diag);
+bool SIGfx6CacheControl::enableNonTemporal(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->mayLoad() ^ MI->mayStore());
+ bool Changed = false;
+
+ /// TODO: Do not enableGLCBit if rmw atomic.
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+
+ return Changed;
}
-bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
- bool Before) const {
+bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- if (!Before)
+ if (Pos == Position::AFTER)
++MI;
- BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode));
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
- if (!Before)
+ if (Pos == Position::AFTER)
--MI;
- return true;
+ return Changed;
}
-bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
- bool Before) const {
+bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- if (!Before)
+ if (Pos == Position::AFTER)
++MI;
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate);
+ bool VMCnt = false;
+ bool LGKMCnt = false;
+ bool EXPCnt = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ VMCnt = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The L1 cache keeps all memory operations in order for
+ // wavefronts in the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ // If there is no cross address space ordering, then an LDS waitcnt is
+ // not needed, as LDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. It is required if
+ // also synchronizing with global/GDS memory, as LDS operations
+ // could be reordered with respect to later global/GDS memory
+ // operations of the same wave.
+ LGKMCnt = IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The LDS keeps all memory operations in order for
+ // the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // If there is no cross address space ordering, then a GDS waitcnt is
+ // not needed, as GDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. It is required if
+ // also synchronizing with global/LDS memory, as GDS operations
+ // could be reordered with respect to later global/LDS memory
+ // operations of the same wave.
+ EXPCnt = IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The GDS keeps all memory operations in order for
+ // the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
- if (!Before)
+ if (VMCnt || LGKMCnt || EXPCnt) {
+ unsigned WaitCntImmediate =
+ AMDGPU::encodeWaitcnt(IV,
+ VMCnt ? 0 : getVmcntBitMask(IV),
+ EXPCnt ? 0 : getExpcntBitMask(IV),
+ LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ Changed = true;
+ }
+
+ if (Pos == Position::AFTER)
--MI;
- return true;
+ return Changed;
+}
+
+bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
}
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
@@ -396,37 +866,38 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= enableGLCBit(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+ MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace());
}
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getInstrAddrSpace(),
+ SIMemOp::LOAD,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER);
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= CC->enableNonTemporal(MI);
return Changed;
}
@@ -440,28 +911,20 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
- return Changed;
- }
-
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= CC->enableNonTemporal(MI);
return Changed;
}
@@ -472,111 +935,74 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
+ AtomicPseudoMIs.push_back(MI);
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertBufferWbinvl1Vol(MI);
-
- AtomicPseudoMIs.push_back(MI);
- return Changed;
- }
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ /// TODO: This relies on a barrier always generating a waitcnt
+ /// for LDS to ensure it is not reordered with the completion of
+ /// the preceding LDS operations. If the barrier had a memory
+ /// ordering and memory scope, then the library would not need to
+ /// generate a fence. Support for the barrier could be added in
+ /// this file, and SIInsertWaitcnt.cpp could then stop unconditionally
+ /// adding a waitcnt before an S_BARRIER.
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::BEFORE);
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- AtomicPseudoMIs.push_back(MI);
- return Changed;
- }
-
- SIMemOpInfo::reportUnknownSyncScope(MI);
- }
-
- return Changed;
-}
-
-bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI) {
- assert(MI->mayLoad() && MI->mayStore());
-
- bool Changed = false;
-
- if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
- MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
- MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
- MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
- }
-
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- Changed |= enableGLCBit(MI);
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
return Changed;
}
-bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
- MachineBasicBlock::iterator &MI) {
+bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getSSID() == SyncScope::System ||
- MOI.getSSID() == MMI->getAgentSSID()) {
- if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= insertWaitcntVmcnt0(MI);
-
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= insertWaitcntVmcnt0(MI, false);
- Changed |= insertBufferWbinvl1Vol(MI, false);
- }
-
- return Changed;
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+ MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->insertWait(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ isAtomicRet(*MI) ? SIMemOp::LOAD :
+ SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER);
+ Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
- if (MOI.getSSID() == SyncScope::SingleThread ||
- MOI.getSSID() == MMI->getWorkgroupSSID() ||
- MOI.getSSID() == MMI->getWavefrontSSID()) {
- Changed |= enableGLCBit(MI);
- return Changed;
- }
-
- llvm_unreachable("Unsupported synchronization scope");
+ return Changed;
}
return Changed;
@@ -584,32 +1010,23 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
-
- MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
- TII = ST.getInstrInfo();
- Vmcnt0Immediate =
- AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV));
- Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ?
- AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL;
+ SIMemOpAccess MOA(MF);
+ CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
- if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI))
+ if (const auto &MOI = MOA.getLoadInfo(MI))
Changed |= expandLoad(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI))
+ else if (const auto &MOI = MOA.getStoreInfo(MI))
Changed |= expandStore(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI))
+ else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
Changed |= expandAtomicFence(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgInfo(MI))
- Changed |= expandAtomicCmpxchg(MOI.getValue(), MI);
- else if (const auto &MOI = SIMemOpInfo::getAtomicRmwInfo(MI))
- Changed |= expandAtomicRmw(MOI.getValue(), MI);
+ else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+ Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
}
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 2dc6f2702b3b..ebcad30a1866 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -10,6 +10,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -76,7 +77,7 @@ static unsigned isCopyToExec(const MachineInstr &MI) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
+ if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
@@ -208,7 +209,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -243,11 +244,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
if (CopyToExecInst->getOperand(1).isKill() &&
isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
- DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
+ LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
- DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
CopyToExecInst->eraseFromParent();
}
@@ -257,7 +258,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (isLiveOut(MBB, CopyToExec)) {
// The copied register is live out and has a second use in another block.
- DEBUG(dbgs() << "Exec copy source register is live out\n");
+ LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
continue;
}
@@ -269,7 +270,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
= std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
J != JE; ++J) {
if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
- DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+ LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
// Make sure this is inserted after any VALU ops that may have been
// scheduled in between.
SaveExecInst = nullptr;
@@ -280,8 +281,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (J->modifiesRegister(CopyToExec, TRI)) {
if (SaveExecInst) {
- DEBUG(dbgs() << "Multiple instructions modify "
- << printReg(CopyToExec, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Multiple instructions modify "
+ << printReg(CopyToExec, TRI) << '\n');
SaveExecInst = nullptr;
break;
}
@@ -292,10 +293,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (ReadsCopyFromExec) {
SaveExecInst = &*J;
- DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
continue;
} else {
- DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Instruction does not read exec copy: " << *J << '\n');
break;
}
} else if (ReadsCopyFromExec && !SaveExecInst) {
@@ -307,8 +309,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
// spill %sgpr0_sgpr1
// %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
//
- DEBUG(dbgs() << "Found second use of save inst candidate: "
- << *J << '\n');
+ LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
+ << '\n');
break;
}
@@ -321,7 +323,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (!SaveExecInst)
continue;
- DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
+ LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
MachineOperand &Src0 = SaveExecInst->getOperand(1);
MachineOperand &Src1 = SaveExecInst->getOperand(2);
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 83074773c495..7b678d12ba81 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass removes redundant S_OR_B64 instructions enabling lanes in
+/// This pass removes redundant S_OR_B64 instructions enabling lanes in
/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
/// vector instructions between them we can only keep outer SI_END_CF, given
/// that CFG is structured and exec bits of the outer end statement are always
@@ -23,6 +23,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -106,7 +107,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -134,7 +135,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
}
while (I != E) {
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
I = std::next(I);
continue;
}
@@ -143,7 +144,8 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
break;
- DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n');
+ LLVM_DEBUG(dbgs()
+ << "Removing no effect instruction: " << *I << '\n');
for (auto &Op : I->operands()) {
if (Op.isReg())
@@ -193,7 +195,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
!getOrExecSource(*NextLead, *TII, MRI))
continue;
- DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
+ LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
@@ -224,7 +226,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
break;
}
- DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
+ LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
}
if (SafeToReplace) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 5ed7fdf220bf..0e000b72962e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -25,6 +25,7 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
@@ -39,6 +40,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
@@ -86,11 +88,11 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override;
- void matchSDWAOperands(MachineFunction &MF);
+ void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
+ bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
- void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
+ void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -218,7 +220,7 @@ FunctionPass *llvm::createSIPeepholeSDWAPass() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
switch(Sel) {
case BYTE_0: OS << "BYTE_0"; break;
case BYTE_1: OS << "BYTE_1"; break;
@@ -366,18 +368,53 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// Find operand in instruction that matches source operand and replace it with
// target operand. Set corresponding src_sel
-
+ bool IsPreserveSrc = false;
MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
MachineOperand *SrcMods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
assert(Src && (Src->isReg() || Src->isImm()));
if (!isSameReg(*Src, *getReplacedOperand())) {
- // If this is not src0 then it should be src1
+ // If this is not src0 then it could be src1
Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ if (!Src ||
+ !isSameReg(*Src, *getReplacedOperand())) {
+ // It's possible this Src is a tied operand for
+ // UNUSED_PRESERVE, in which case we can either
+ // abandon the peephole attempt, or if legal we can
+ // copy the target operand into the tied slot
+ // if the preserve operation will effectively cause the same
+ // result by overwriting the rest of the dst.
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ MachineOperand *DstUnused =
+ TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+
+ if (Dst &&
+ DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+ // This will work if the tied src is accessing WORD_0, and the dst is
+ // writing WORD_1. Modifiers don't matter because all the bits that
+ // would be impacted are being overwritten by the dst.
+ // Any other case will not work.
+ SdwaSel DstSel = static_cast<SdwaSel>(
+ TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
+ if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+ getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
+ IsPreserveSrc = true;
+ auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst);
+ auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
+ Src = &MI.getOperand(TiedIdx);
+ SrcSel = nullptr;
+ SrcMods = nullptr;
+ } else {
+ // Not legal to convert this src
+ return false;
+ }
+ }
+ }
assert(Src && Src->isReg());
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
@@ -388,11 +425,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return false;
}
- assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+ assert(isSameReg(*Src, *getReplacedOperand()) &&
+ (IsPreserveSrc || (SrcSel && SrcMods)));
}
copyRegOperand(*Src, *getTargetOperand());
- SrcSel->setImm(getSrcSel());
- SrcMods->setImm(getSrcMods(TII, Src));
+ if (!IsPreserveSrc) {
+ SrcSel->setImm(getSrcSel());
+ SrcMods->setImm(getSrcMods(TII, Src));
+ }
getTargetOperand()->setIsKill(false);
return true;
}
@@ -661,7 +701,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
TRI->isPhysicalRegister(Dst->getReg()))
break;
@@ -739,8 +779,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
// TODO: add support for non-SDWA instructions as OtherInst.
// For now this only works with SDWA instructions. For regular instructions
- // there is no way to determine if instruction write only 8/16/24-bit out of
- // full register size and all registers are at min 32-bit wide.
+ // there is no way to determine if the instruction writes only 8/16/24 bits
+ // of the full register width, and all registers are at least 32 bits wide.
if (!TII->isSDWA(*OtherInst))
break;
@@ -804,20 +844,18 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::unique_ptr<SDWAOperand>(nullptr);
}
-void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (auto Operand = matchSDWAOperand(MI)) {
- DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
- SDWAOperands[&MI] = std::move(Operand);
- ++NumSDWAPatternsFound;
- }
+void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
+ for (MachineInstr &MI : MBB) {
+ if (auto Operand = matchSDWAOperand(MI)) {
+ LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
+ SDWAOperands[&MI] = std::move(Operand);
+ ++NumSDWAPatternsFound;
}
}
}
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
- const SISubtarget &ST) const {
+ const GCNSubtarget &ST) const {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -854,11 +892,18 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
Opc == AMDGPU::V_MAC_F32_e32))
return false;
+ // FIXME: has SDWA but requires handling of implicit VCC use
+ if (Opc == AMDGPU::V_CNDMASK_B32_e32)
+ return false;
+
return true;
}
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
+
+ LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
// Convert to sdwa
int SDWAOpcode;
unsigned Opcode = MI.getOpcode();
@@ -984,9 +1029,29 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}
}
- // Apply all sdwa operand pattenrs
+ // Check for a preserved register that needs to be copied.
+ auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ if (DstUnused &&
+ DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+ // We expect, if we are here, that the instruction was already in its SDWA form,
+ // with a tied operand.
+ assert(Dst && Dst->isTied());
+ assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
+ // We also expect a vdst, since sdst can't preserve.
+ auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+ assert(PreserveDstIdx != -1);
+
+ auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
+ auto Tied = MI.getOperand(TiedIdx);
+
+ SDWAInst.add(Tied);
+ SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+ }
+
+ // Apply all sdwa operand patterns.
bool Converted = false;
for (auto &Operand : SDWAOperands) {
+ LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
// There should be no intersection between SDWA operands and potential MIs
// e.g.:
// v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
@@ -1007,8 +1072,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
return false;
}
- DEBUG(dbgs() << "Convert instruction:" << MI
- << "Into:" << *SDWAInst << '\n');
+ LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
++NumSDWAInstructionsPeepholed;
MI.eraseFromParent();
@@ -1017,7 +1081,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
-void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
+ const GCNSubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
for (MachineOperand &Op : MI.explicit_uses()) {
@@ -1048,7 +1113,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget
}
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
return false;
@@ -1058,35 +1123,36 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
// Find all SDWA operands in MF.
- bool Changed = false;
bool Ret = false;
- do {
- matchSDWAOperands(MF);
-
- for (const auto &OperandPair : SDWAOperands) {
- const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
- PotentialMatches[PotentialMI].push_back(Operand.get());
+ for (MachineBasicBlock &MBB : MF) {
+ bool Changed = false;
+ do {
+ matchSDWAOperands(MBB);
+
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
+ }
}
- }
- for (auto &PotentialPair : PotentialMatches) {
- MachineInstr &PotentialMI = *PotentialPair.first;
- convertToSDWA(PotentialMI, PotentialPair.second);
- }
-
- PotentialMatches.clear();
- SDWAOperands.clear();
+ for (auto &PotentialPair : PotentialMatches) {
+ MachineInstr &PotentialMI = *PotentialPair.first;
+ convertToSDWA(PotentialMI, PotentialPair.second);
+ }
- Changed = !ConvertedInstructions.empty();
+ PotentialMatches.clear();
+ SDWAOperands.clear();
- if (Changed)
- Ret = true;
+ Changed = !ConvertedInstructions.empty();
- while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
- } while (Changed);
+ if (Changed)
+ Ret = true;
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+ } while (Changed);
+ }
return Ret;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIProgramInfo.h
new file mode 100644
index 000000000000..383f6b575808
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -0,0 +1,77 @@
+//===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Defines struct to track resource usage for kernels and entry functions.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
+
+#include <cstdint>
+#include <limits>
+
+namespace llvm {
+
+/// Track resource usage for kernels / entry functions.
+struct SIProgramInfo {
+ // Fields set in PGM_RSRC1 pm4 packet.
+ uint32_t VGPRBlocks = 0;
+ uint32_t SGPRBlocks = 0;
+ uint32_t Priority = 0;
+ uint32_t FloatMode = 0;
+ uint32_t Priv = 0;
+ uint32_t DX10Clamp = 0;
+ uint32_t DebugMode = 0;
+ uint32_t IEEEMode = 0;
+ uint64_t ScratchSize = 0;
+
+ uint64_t ComputePGMRSrc1 = 0;
+
+ // Fields set in PGM_RSRC2 pm4 packet.
+ uint32_t LDSBlocks = 0;
+ uint32_t ScratchBlocks = 0;
+
+ uint64_t ComputePGMRSrc2 = 0;
+
+ uint32_t NumVGPR = 0;
+ uint32_t NumSGPR = 0;
+ uint32_t LDSSize = 0;
+ bool FlatUsed = false;
+
+ // Number of SGPRs that meets the number of waves per execution unit request.
+ uint32_t NumSGPRsForWavesPerEU = 0;
+
+ // Number of VGPRs that meets the number of waves per execution unit request.
+ uint32_t NumVGPRsForWavesPerEU = 0;
+
+ // Fixed SGPR number used to hold wave scratch offset for entire kernel
+ // execution, or std::numeric_limits<uint16_t>::max() if the register is not
+ // used or not known.
+ uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ std::numeric_limits<uint16_t>::max();
+
+ // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+ // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
+ // is not used or not known.
+ uint16_t DebuggerPrivateSegmentBufferSGPR =
+ std::numeric_limits<uint16_t>::max();
+
+ // Whether there is recursion, dynamic allocas, indirect calls or some other
+ // reason there may be statically unknown stack usage.
+ bool DynamicCallStack = false;
+
+ // Bonus information for debugging.
+ bool VCCUsed = false;
+
+ SIProgramInfo() = default;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 65cdc13e03cd..624607f6ea54 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -8,14 +8,16 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief SI implementation of the TargetRegisterInfo class.
+/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//
#include "SIRegisterInfo.h"
+#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -54,7 +56,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
+SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
@@ -101,17 +103,10 @@ SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
VGPRSetID < NumRegPressureSets);
}
-void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
- MCRegAliasIterator R(Reg, this, true);
-
- for (; R.isValid(); ++R)
- Reserved.set(*R);
-}
-
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
@@ -136,7 +131,7 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
@@ -163,6 +158,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+ // Reserve xnack_mask registers - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
+
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -175,7 +173,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -255,7 +253,7 @@ bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
// create a virtual register for it during frame index elimination, so the
// scavenger is directly needed.
return MF.getFrameInfo().hasStackObjects() &&
- MF.getSubtarget<SISubtarget>().hasScalarStores() &&
+ MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}
@@ -310,7 +308,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
DL = Ins->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();
if (Offset == 0) {
@@ -339,7 +337,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
- const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();
#ifndef NDEBUG
@@ -526,7 +524,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -534,22 +532,29 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = Desc.mayStore();
- bool RanOutOfSGPRs = false;
bool Scavenged = false;
unsigned SOffset = ScratchOffsetReg;
+ const unsigned EltSize = 4;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
- unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
- unsigned Size = NumSubRegs * 4;
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
+ unsigned Size = NumSubRegs * EltSize;
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
- const int64_t OriginalImmOffset = Offset;
+ int64_t ScratchOffsetRegDelta = 0;
unsigned Align = MFI.getObjectAlignment(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
- if (!isUInt<12>(Offset + Size)) {
+ assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
+
+ if (!isUInt<12>(Offset + Size - EltSize)) {
SOffset = AMDGPU::NoRegister;
+ // We currently only support spilling VGPRs to EltSize boundaries, meaning
+ // we can simplify the adjustment of Offset here to just scale with
+ // WavefrontSize.
+ Offset *= ST.getWavefrontSize();
+
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs().
if (RS)
@@ -563,8 +568,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
// add the offset directly to the ScratchOffset register, and then
// subtract the offset after the spill to return ScratchOffset to it's
// original value.
- RanOutOfSGPRs = true;
SOffset = ScratchOffsetReg;
+ ScratchOffsetRegDelta = Offset;
} else {
Scavenged = true;
}
@@ -576,8 +581,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
Offset = 0;
}
- const unsigned EltSize = 4;
-
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
unsigned SubReg = NumSubRegs == 1 ?
ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
@@ -609,11 +612,11 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
- if (RanOutOfSGPRs) {
+ if (ScratchOffsetRegDelta != 0) {
// Subtract the offset we added to the ScratchOffset register.
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
- .addReg(ScratchOffsetReg)
- .addImm(OriginalImmOffset);
+ .addReg(ScratchOffsetReg)
+ .addImm(ScratchOffsetRegDelta);
}
}
@@ -640,6 +643,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
@@ -648,7 +652,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
return false;
MachineRegisterInfo &MRI = MF->getRegInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
unsigned SuperReg = MI->getOperand(0).getReg();
@@ -661,6 +665,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && OnlyToVGPR)
return false;
+ assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
+ SuperReg != MFI->getFrameOffsetReg() &&
+ SuperReg != MFI->getScratchWaveOffsetReg()));
+
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
unsigned OffsetReg = AMDGPU::M0;
@@ -736,11 +744,21 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+ // During SGPR spilling to VGPR, determine if the VGPR is defined. The
+ // only circumstance in which we say it is undefined is when it is the
+ // first spill to this VGPR in the first basic block.
+ bool VGPRDefined = true;
+ if (MBB == &MF->front())
+ VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
+
+ // Mark the "old value of vgpr" input undef only if this is the first sgpr
+ // spill to this specific vgpr in the first basic block.
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
.addReg(SubReg, getKillRegState(IsKill))
- .addImm(Spill.Lane);
+ .addImm(Spill.Lane)
+ .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
@@ -812,7 +830,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
return false;
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
@@ -972,7 +990,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -1051,8 +1069,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// Convert to an absolute stack address by finding the offset from the
// scratch wave base and scaling by the wave size.
//
- // In an entry function/kernel the stack address is already the absolute
- // address relative to the the scratch wave offset.
+ // In an entry function/kernel the stack address is already the
+ // absolute address relative to the scratch wave offset.
unsigned DiffReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -1219,6 +1237,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
&AMDGPU::SCC_CLASSRegClass,
+ &AMDGPU::Pseudo_SReg_32RegClass,
+ &AMDGPU::Pseudo_SReg_128RegClass,
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -1355,7 +1375,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-/// \brief Returns a register that is not used at any point in the function.
+/// Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.
unsigned
@@ -1483,7 +1503,9 @@ SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
unsigned Reg) const {
- return hasVGPRs(getRegClassForReg(MRI, Reg));
+ const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
+ assert(RC && "Register class for the reg not found");
+ return hasVGPRs(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
@@ -1510,7 +1532,7 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
@@ -1545,3 +1567,34 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return Empty;
return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
}
+
+unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
+ // Not a callee saved register.
+ return AMDGPU::SGPR30_SGPR31;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const {
+ unsigned Size = getRegSizeInBits(MO.getReg(), MRI);
+ const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
+ if (!RB)
+ return nullptr;
+
+ switch (Size) {
+ case 32:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
+ &AMDGPU::SReg_32_XM0RegClass;
+ case 64:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
+ &AMDGPU::SReg_64_XEXECRegClass;
+ case 96:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
+ nullptr;
+ case 128:
+ return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
+ &AMDGPU::SReg_128RegClass;
+ default:
+ llvm_unreachable("not implemented");
+ }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index bf814b6974a8..5a51b67ca719 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface definition for SIRegisterInfo
+/// Interface definition for SIRegisterInfo
//
//===----------------------------------------------------------------------===//
@@ -16,15 +16,14 @@
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
+class GCNSubtarget;
class LiveIntervals;
class MachineRegisterInfo;
-class SISubtarget;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
@@ -36,11 +35,10 @@ private:
bool SpillSGPRToVGPR;
bool SpillSGPRToSMEM;
- void reserveRegisterTuples(BitVector &, unsigned Reg) const;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
public:
- SIRegisterInfo(const SISubtarget &ST);
+ SIRegisterInfo(const GCNSubtarget &ST);
bool spillSGPRToVGPR() const {
return SpillSGPRToVGPR;
@@ -126,7 +124,7 @@ public:
return getEncodingValue(Reg) & 0xff;
}
- /// \brief Return the 'base' register class for this register.
+ /// Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
@@ -224,10 +222,11 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
- unsigned getReturnAddressReg(const MachineFunction &MF) const {
- // Not a callee saved register.
- return AMDGPU::SGPR30_SGPR31;
- }
+ unsigned getReturnAddressReg(const MachineFunction &MF) const;
+
+ const TargetRegisterClass *
+ getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const override;
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index dd0efef7f91b..f87a0763b353 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -76,6 +76,16 @@ def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
+def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
+
+def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
+ DwarfRegAlias<XNACK_MASK_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 104;
+}
+
// Trap handler registers
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
@@ -394,7 +404,7 @@ def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16],
let CopyCost = -1;
}
-def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
+def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -403,7 +413,7 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+ (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
let AllocationPriority = 7;
@@ -425,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
let AllocationPriority = 7;
}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
let isAllocatable = 0;
}
-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
- (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 8;
@@ -457,7 +467,7 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R
let isAllocatable = 0;
}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32,
(add SGPR_128, TTMP_128)> {
let AllocationPriority = 10;
}
@@ -495,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
let Size = 64;
// Requires 2 v_mov_b32 to copy
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
index 0f02f5825cb0..7af69cb6a46d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -46,7 +46,7 @@ def Write64Bit : SchedWrite;
// instructions)
class SISchedMachineModel : SchedMachineModel {
- let CompleteModel = 1;
+ let CompleteModel = 0;
// MicroOpBufferSize = 1 means that instructions will always be added
// to the ready queue when they become available. This exposes them
// to the register pressure analysis.
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 41f989ad3228..4189bcce52ea 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -10,9 +10,9 @@
//
#include "AMDGPU.h"
-#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -64,17 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() {
return new SIShrinkInstructions();
}
-static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
- const MachineRegisterInfo &MRI) {
- if (!MO->isReg())
- return false;
-
- if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
- return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
-
- return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
-}
-
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) {
@@ -92,14 +81,18 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
- if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm())
+ case AMDGPU::V_SUBBREV_U32_e64: {
+ const MachineOperand *Src1
+ = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()))
return false;
// Additional verification is needed for sdst/src2.
return true;
-
+ }
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
- if (!isVGPR(Src2, TRI, MRI) ||
+ case AMDGPU::V_FMAC_F32_e64:
+ if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
break;
@@ -110,7 +103,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
}
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Src1 && (!isVGPR(Src1, TRI, MRI) ||
+ if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
return false;
@@ -124,7 +117,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
!TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
-/// \brief This function checks \p MI for operands defined by a move immediate
+/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
@@ -290,7 +283,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
return false;
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
@@ -442,7 +435,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
//
// So, instead of forcing the instruction to write to VCC, we provide
- // a hint to the register allocator to use VCC and then we we will run
+ // a hint to the register allocator to use VCC and then we will run
// this pass again after RA and shrink it if it outputs to VCC.
MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
continue;
@@ -493,7 +486,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
// We can shrink this instruction
- DEBUG(dbgs() << "Shrinking " << MI);
+ LLVM_DEBUG(dbgs() << "Shrinking " << MI);
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
@@ -537,9 +530,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
foldImmediates(*Inst32, TII, MRI);
- DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
-
-
+ LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
}
}
return false;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 53aefe829737..879726b1528c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
@@ -60,6 +60,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
@@ -325,9 +326,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
- Flags = StateWQM;
- } else if (TII->isWQM(Opcode)) {
+ if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
@@ -454,6 +453,11 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
if (II.Needs != 0)
markInstructionUses(MI, II.Needs, Worklist);
+
+ // Ensure we process a block containing WWM, even if it does not require any
+ // WQM transitions.
+ if (II.Needs & StateWWM)
+ BI.Needs |= StateWWM;
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -681,7 +685,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;
- DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n");
+ LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
+ << ":\n");
unsigned SavedWQMReg = 0;
unsigned SavedNonWWMReg = 0;
@@ -844,7 +849,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
CallingConv = MF.getFunction().getCallingConv();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
@@ -884,7 +889,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
}
}
- DEBUG(printInfo());
+ LLVM_DEBUG(printInfo());
lowerCopyInstrs();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
index 8f347986eb8a..7485326017b2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -63,6 +63,18 @@ class SM_Real <SM_Pseudo ps>
bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
}
+class SM_Probe_Pseudo <string opName, dag ins, bit isImm>
+ : SM_Pseudo<opName, (outs), ins, " $sdata, $sbase, $offset"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let has_glc = 0;
+ let LGKM_CNT = 0;
+ let ScalarStore = 0;
+ let hasSideEffects = 1;
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+}
+
class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
: SM_Pseudo<opName, outs, ins, asmOps, pattern> {
RegisterClass BaseClass;
@@ -81,6 +93,18 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern
let ScalarStore = 1;
}
+class SM_Discard_Pseudo <string opName, dag ins, bit isImm>
+ : SM_Pseudo<opName, (outs), ins, " $sbase, $offset"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let has_glc = 0;
+ let has_sdst = 0;
+ let ScalarStore = 0;
+ let hasSideEffects = 1;
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR");
+}
+
multiclass SM_Pseudo_Loads<string opName,
RegisterClass baseClass,
RegisterClass dstClass> {
@@ -125,6 +149,11 @@ multiclass SM_Pseudo_Stores<string opName,
}
}
+multiclass SM_Pseudo_Discards<string opName> {
+ def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>;
+ def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
+}
+
class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
opName, (outs SReg_64_XEXEC:$sdst), (ins),
" $sdst", [(set i64:$sdst, (node))]> {
@@ -144,6 +173,60 @@ class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo<
let has_offset = 0;
}
+multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
+ def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>;
+ def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar Atomic Memory Classes
+//===----------------------------------------------------------------------===//
+
+class SM_Atomic_Pseudo <string opName,
+ dag outs, dag ins, string asmOps, bit isRet>
+ : SM_Pseudo<opName, outs, ins, asmOps, []> {
+
+ bit glc = isRet;
+
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_glc = 1;
+
+ // Should these be set?
+ let ScalarStore = 1;
+ let hasSideEffects = 1;
+ let maybeAtomic = 1;
+}
+
+class SM_Pseudo_Atomic<string opName,
+ RegisterClass baseClass,
+ RegisterClass dataClass,
+ bit isImm,
+ bit isRet> :
+ SM_Atomic_Pseudo<opName,
+ !if(isRet, (outs dataClass:$sdst), (outs)),
+ !if(isImm,
+ (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset),
+ (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)),
+ !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""),
+ isRet> {
+ let offset_is_imm = isImm;
+ let PseudoInstr = opName # !if(isImm,
+ !if(isRet, "_IMM_RTN", "_IMM"),
+ !if(isRet, "_SGPR_RTN", "_SGPR"));
+
+ let Constraints = !if(isRet, "$sdst = $sdata", "");
+ let DisableEncoding = !if(isRet, "$sdata", "");
+}
+
+multiclass SM_Pseudo_Atomics<string opName,
+ RegisterClass baseClass,
+ RegisterClass dataClass> {
+ def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 0>;
+ def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 0>;
+ def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 1>;
+ def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 1>;
+}
//===----------------------------------------------------------------------===//
// Scalar Memory Instructions
@@ -211,9 +294,85 @@ let SubtargetPredicate = isVI in {
def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
-} // SubtargetPredicate = isVI
+defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
+defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
+} // SubtargetPredicate = isVI
+let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in {
+defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>;
+
+defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>;
+} // SubtargetPredicate = HasFlatScratchInsts
+
+let SubtargetPredicate = HasScalarAtomics in {
+
+defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <"s_buffer_atomic_sub", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_buffer_atomic_smin", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_buffer_atomic_umin", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_buffer_atomic_smax", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_buffer_atomic_umax", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <"s_buffer_atomic_and", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <"s_buffer_atomic_or", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <"s_buffer_atomic_xor", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <"s_buffer_atomic_inc", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <"s_buffer_atomic_dec", SReg_128, SReg_32_XM0_XEXEC>;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_swap_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap_x2", SReg_128, SReg_128>;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_add_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_sub_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smin_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umin_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smax_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umax_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_and_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>;
+
+defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_ADD : SM_Pseudo_Atomics <"s_atomic_add", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SUB : SM_Pseudo_Atomics <"s_atomic_sub", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_atomic_smin", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_atomic_umin", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_atomic_smax", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_atomic_umax", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_AND : SM_Pseudo_Atomics <"s_atomic_and", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_OR : SM_Pseudo_Atomics <"s_atomic_or", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_XOR : SM_Pseudo_Atomics <"s_atomic_xor", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_INC : SM_Pseudo_Atomics <"s_atomic_inc", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_DEC : SM_Pseudo_Atomics <"s_atomic_dec", SReg_64, SReg_32_XM0_XEXEC>;
+
+defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_atomic_swap_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_atomic_cmpswap_x2", SReg_64, SReg_128>;
+defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_atomic_add_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_atomic_sub_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_atomic_smin_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_atomic_umin_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_atomic_smax_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_atomic_umax_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_atomic_and_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_atomic_or_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_atomic_xor_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_atomic_inc_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_64, SReg_64_XEXEC>;
+
+} // let SubtargetPredicate = HasScalarAtomics
+
+let SubtargetPredicate = isGFX9 in {
+defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">;
+defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
+}
//===----------------------------------------------------------------------===//
// Scalar Memory Patterns
@@ -223,11 +382,9 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
- ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+ ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) ||
(Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
- !Ld->isVolatile() &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+ !Ld->isVolatile() && !N->isDivergent() &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
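
The reworked predicate above keys SMRD selection off divergence analysis rather than isMemOpUniform, and admits the 32-bit constant address space. A minimal C++ sketch of its shape, with plain booleans standing in for the SelectionDAG queries (all inputs here are hypothetical stand-ins, not part of the patch):

    // Sketch only: when does a load qualify for scalar (SMRD/SMEM) selection
    // after this change? Booleans mirror the DAG queries in the PatFrag above.
    static bool isSMRDCandidate(unsigned AlignBytes, bool IsConstantAS,
                                bool IsConstant32AS, bool IsGlobalAS,
                                bool Divergent, bool Volatile,
                                bool ScalarizeGlobal, bool NoClobber) {
      if (AlignBytes < 4)
        return false;
      bool UniformConstant = (IsConstantAS || IsConstant32AS) && !Divergent;
      bool UniformGlobal =
          ScalarizeGlobal && IsGlobalAS && !Volatile && !Divergent && NoClobber;
      return UniformConstant || UniformGlobal;
    }
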
@@ -407,6 +564,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
}
}
+multiclass SM_Real_Probe_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
+}
+
defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">;
defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">;
defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">;
@@ -434,6 +596,103 @@ def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>;
def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>;
def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>;
+defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05, "S_SCRATCH_LOAD_DWORD">;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06, "S_SCRATCH_LOAD_DWORDX2">;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07, "S_SCRATCH_LOAD_DWORDX4">;
+
+defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15, "S_SCRATCH_STORE_DWORD">;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16, "S_SCRATCH_STORE_DWORDX2">;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17, "S_SCRATCH_STORE_DWORDX4">;
+
+defm S_ATC_PROBE : SM_Real_Probe_vi <0x26, "S_ATC_PROBE">;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">;
+
+//===----------------------------------------------------------------------===//
+// GFX9
+//===----------------------------------------------------------------------===//
+
+class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
+ : SMEM_Real_vi <op, ps> {
+
+ bits<7> sdata;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ let glc = ps.glc;
+ let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+}
+
+multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
+ def _IMM_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>;
+ def _SGPR_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>;
+}
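
The !if in SMEM_Atomic_Real_vi routes either the destination (glc = 1, returning form) or the data operand (glc = 0) into bits 12-6 of the encoding word. A small C++ sketch of that packing, assuming nothing beyond the 7-bit field shown above:

    #include <cstdint>

    // Hypothetical encoder for bits 12-6 of the first SMEM dword (sketch only).
    uint32_t packSMEMAtomicSData(uint32_t Inst, bool GLC, uint8_t SDst,
                                 uint8_t SData) {
      uint32_t Field = (GLC ? SDst : SData) & 0x7f; // 7-bit register field
      Inst &= ~(0x7fu << 6);                        // clear bits 12-6
      return Inst | (Field << 6);
    }
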
+
+defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41, "S_BUFFER_ATOMIC_CMPSWAP">;
+defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42, "S_BUFFER_ATOMIC_ADD">;
+defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43, "S_BUFFER_ATOMIC_SUB">;
+defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44, "S_BUFFER_ATOMIC_SMIN">;
+defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45, "S_BUFFER_ATOMIC_UMIN">;
+defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46, "S_BUFFER_ATOMIC_SMAX">;
+defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47, "S_BUFFER_ATOMIC_UMAX">;
+defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48, "S_BUFFER_ATOMIC_AND">;
+defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49, "S_BUFFER_ATOMIC_OR">;
+defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a, "S_BUFFER_ATOMIC_XOR">;
+defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b, "S_BUFFER_ATOMIC_INC">;
+defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c, "S_BUFFER_ATOMIC_DEC">;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60, "S_BUFFER_ATOMIC_SWAP_X2">;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62, "S_BUFFER_ATOMIC_ADD_X2">;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63, "S_BUFFER_ATOMIC_SUB_X2">;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64, "S_BUFFER_ATOMIC_SMIN_X2">;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65, "S_BUFFER_ATOMIC_UMIN_X2">;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66, "S_BUFFER_ATOMIC_SMAX_X2">;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67, "S_BUFFER_ATOMIC_UMAX_X2">;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68, "S_BUFFER_ATOMIC_AND_X2">;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69, "S_BUFFER_ATOMIC_OR_X2">;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a, "S_BUFFER_ATOMIC_XOR_X2">;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b, "S_BUFFER_ATOMIC_INC_X2">;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c, "S_BUFFER_ATOMIC_DEC_X2">;
+
+defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80, "S_ATOMIC_SWAP">;
+defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81, "S_ATOMIC_CMPSWAP">;
+defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82, "S_ATOMIC_ADD">;
+defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83, "S_ATOMIC_SUB">;
+defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84, "S_ATOMIC_SMIN">;
+defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85, "S_ATOMIC_UMIN">;
+defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86, "S_ATOMIC_SMAX">;
+defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87, "S_ATOMIC_UMAX">;
+defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88, "S_ATOMIC_AND">;
+defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89, "S_ATOMIC_OR">;
+defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a, "S_ATOMIC_XOR">;
+defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b, "S_ATOMIC_INC">;
+defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c, "S_ATOMIC_DEC">;
+
+defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0, "S_ATOMIC_SWAP_X2">;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1, "S_ATOMIC_CMPSWAP_X2">;
+defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2, "S_ATOMIC_ADD_X2">;
+defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3, "S_ATOMIC_SUB_X2">;
+defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0xa4, "S_ATOMIC_SMIN_X2">;
+defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5, "S_ATOMIC_UMIN_X2">;
+defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6, "S_ATOMIC_SMAX_X2">;
+defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7, "S_ATOMIC_UMAX_X2">;
+defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8, "S_ATOMIC_AND_X2">;
+defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9, "S_ATOMIC_OR_X2">;
+defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa, "S_ATOMIC_XOR_X2">;
+defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab, "S_ATOMIC_INC_X2">;
+defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">;
+
+multiclass SM_Real_Discard_vi<bits<8> op, string ps> {
+ def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>;
+ def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>;
+}
+
+defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">;
+defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29, "S_DCACHE_DISCARD_X2">;
//===----------------------------------------------------------------------===//
// CI
@@ -502,7 +761,7 @@ let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
let OtherPredicates = [isCIOnly];
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 02a95a4b6f24..6f5db9644c86 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -19,17 +19,28 @@ def GPRIdxMode : Operand<i32> {
let OperandType = "OPERAND_IMMEDIATE";
}
+class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps,
+ list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_sdst = 0;
+}
+
//===----------------------------------------------------------------------===//
// SOP1 Instructions
//===----------------------------------------------------------------------===//
class SOP1_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
- InstSI <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
+ SOP_Pseudo<opName, outs, ins, asmOps, pattern> {
let mayLoad = 0;
let mayStore = 0;
@@ -40,9 +51,6 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
let Size = 4;
let UseNamedOperandTable = 1;
- string Mnemonic = opName;
- string AsmOperands = asmOps;
-
bits<1> has_src0 = 1;
bits<1> has_sdst = 1;
}
@@ -247,17 +255,25 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
}
}
+let SubtargetPredicate = isGFX9 in {
+ let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in {
+ def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">;
+ def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">;
+ def S_ANDN1_WREXEC_B64 : SOP1_64<"s_andn1_wrexec_b64">;
+ def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">;
+ } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
+
+ def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
+} // End SubtargetPredicate = isGFX9
+
//===----------------------------------------------------------------------===//
// SOP2 Instructions
//===----------------------------------------------------------------------===//
class SOP2_Pseudo<string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
- InstSI<outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
+ SOP_Pseudo<opName, outs, ins, asmOps, pattern> {
+
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -266,10 +282,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
- string Mnemonic = opName;
- string AsmOperands = asmOps;
-
- bits<1> has_sdst = 1;
+ let has_sdst = 1;
// Pseudo instructions have no encodings, but adding this field here allows
// us to do:
@@ -279,7 +292,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
// let Size = 4; // Do we need size here?
}
-class SOP2_Real<bits<7> op, SOP2_Pseudo ps> :
+class SOP2_Real<bits<7> op, SOP_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList,
ps.Mnemonic # " " # ps.AsmOperands, []>,
Enc32 {
@@ -482,6 +495,16 @@ let SubtargetPredicate = isGFX9 in {
def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+
+ let Defs = [SCC] in {
+ def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">;
+ def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">;
+ def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">;
+ def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">;
+ } // End Defs = [SCC]
+
+ def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
+ def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
}
//===----------------------------------------------------------------------===//
@@ -659,6 +682,16 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
} // End hasSideEffects = 1
+let SubtargetPredicate = isGFX9 in {
+ def S_CALL_B64 : SOPK_Pseudo<
+ "s_call_b64",
+ (outs SReg_64:$sdst),
+ (ins s16imm:$simm16),
+ "$sdst, $simm16"> {
+ let isCall = 1;
+ }
+}
+
//===----------------------------------------------------------------------===//
// SOPC Instructions
//===----------------------------------------------------------------------===//
@@ -806,6 +839,13 @@ def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
}
}
+let SubtargetPredicate = isGFX9 in {
+ let isBarrier = 1, isReturn = 1, simm16 = 0 in {
+ def S_ENDPGM_ORDERED_PS_DONE :
+ SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">;
+ } // End isBarrier = 1, isReturn = 1, simm16 = 0
+} // End SubtargetPredicate = isGFX9
+
let isBranch = 1, SchedRW = [WriteBranch] in {
def S_BRANCH : SOPP <
0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
@@ -1312,3 +1352,26 @@ def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
+
+def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 - GFX9.
+//===----------------------------------------------------------------------===//
+
+def S_ANDN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x33, S_ANDN1_SAVEEXEC_B64>;
+def S_ORN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x34, S_ORN1_SAVEEXEC_B64>;
+def S_ANDN1_WREXEC_B64_vi : SOP1_Real_vi<0x35, S_ANDN1_WREXEC_B64>;
+def S_ANDN2_WREXEC_B64_vi : SOP1_Real_vi<0x36, S_ANDN2_WREXEC_B64>;
+def S_BITREPLICATE_B64_B32_vi : SOP1_Real_vi<0x37, S_BITREPLICATE_B64_B32>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX9.
+//===----------------------------------------------------------------------===//
+
+def S_LSHL1_ADD_U32_vi : SOP2_Real_vi<0x2e, S_LSHL1_ADD_U32>;
+def S_LSHL2_ADD_U32_vi : SOP2_Real_vi<0x2f, S_LSHL2_ADD_U32>;
+def S_LSHL3_ADD_U32_vi : SOP2_Real_vi<0x30, S_LSHL3_ADD_U32>;
+def S_LSHL4_ADD_U32_vi : SOP2_Real_vi<0x31, S_LSHL4_ADD_U32>;
+def S_MUL_HI_U32_vi : SOP2_Real_vi<0x2c, S_MUL_HI_U32>;
+def S_MUL_HI_I32_vi : SOP2_Real_vi<0x2d, S_MUL_HI_I32>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index f61e2e413ad4..e4c442db3016 100644
--- a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,19 +16,19 @@
using namespace llvm;
-/// \brief The target which supports all AMD GPUs. This will eventually
+/// The target which supports all AMD GPUs. This will eventually
/// be deprecated and there will be a R600 target and a GCN target.
Target &llvm::getTheAMDGPUTarget() {
static Target TheAMDGPUTarget;
return TheAMDGPUTarget;
}
-/// \brief The target for GCN GPUs
+/// The target for GCN GPUs
Target &llvm::getTheGCNTarget() {
static Target TheGCNTarget;
return TheGCNTarget;
}
-/// \brief Extern function to initialize the targets for the AMDGPU backend
+/// Extern function to initialize the targets for the AMDGPU backend
extern "C" void LLVMInitializeAMDGPUTargetInfo() {
RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
"AMD GPUs HD2XXX-HD6XXX", "AMDGPU");
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 03b11ae80500..9eb4c6513cce 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -61,7 +61,15 @@ const char* const IdSymbolic[] = {
"HW_REG_HW_ID",
"HW_REG_GPR_ALLOC",
"HW_REG_LDS_ALLOC",
- "HW_REG_IB_STS"
+ "HW_REG_IB_STS",
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ "HW_REG_SH_MEM_BASES"
};
} // namespace Hwreg
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index bf9d5bc6ebdc..3fd3c75874a3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUBaseInfo.h"
+#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/StringRef.h"
@@ -52,7 +53,7 @@ unsigned getBitMask(unsigned Shift, unsigned Width) {
return ((1 << Width) - 1) << Shift;
}
-/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
+/// Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
///
/// \returns Packed \p Dst.
unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
@@ -61,7 +62,7 @@ unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
return Dst;
}
-/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
+/// Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
///
/// \returns Unpacked bits.
unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
@@ -96,64 +97,34 @@ unsigned getVmcntBitWidthHi() { return 2; }
namespace llvm {
-static cl::opt<bool> EnablePackedInlinableLiterals(
- "enable-packed-inlinable-literals",
- cl::desc("Enable packed inlinable literals (v2f16, v2i16)"),
- cl::init(false));
-
namespace AMDGPU {
-LLVM_READNONE
-static inline Channels indexToChannel(unsigned Channel) {
- switch (Channel) {
- case 1:
- return AMDGPU::Channels_1;
- case 2:
- return AMDGPU::Channels_2;
- case 3:
- return AMDGPU::Channels_3;
- case 4:
- return AMDGPU::Channels_4;
- default:
- llvm_unreachable("invalid MIMG channel");
- }
-}
+struct MIMGInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t MIMGEncoding;
+ uint8_t VDataDwords;
+ uint8_t VAddrDwords;
+};
+#define GET_MIMGBaseOpcodesTable_IMPL
+#define GET_MIMGDimInfoTable_IMPL
+#define GET_MIMGInfoTable_IMPL
+#include "AMDGPUGenSearchableTables.inc"
-// FIXME: Need to handle d16 images correctly.
-static unsigned rcToChannels(unsigned RCID) {
- switch (RCID) {
- case AMDGPU::VGPR_32RegClassID:
- return 1;
- case AMDGPU::VReg_64RegClassID:
- return 2;
- case AMDGPU::VReg_96RegClassID:
- return 3;
- case AMDGPU::VReg_128RegClassID:
- return 4;
- default:
- llvm_unreachable("invalid MIMG register class");
- }
+int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
+ unsigned VDataDwords, unsigned VAddrDwords) {
+ const MIMGInfo *Info = getMIMGOpcodeHelper(BaseOpcode, MIMGEncoding,
+ VDataDwords, VAddrDwords);
+ return Info ? Info->Opcode : -1;
}
-int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) {
- AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels);
- unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass);
- if (NewChannels == OrigChannels)
- return Opc;
-
- switch (OrigChannels) {
- case 1:
- return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
- case 2:
- return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
- case 3:
- return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
- case 4:
- return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
- default:
- llvm_unreachable("invalid MIMG channel");
- }
+int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
+ const MIMGInfo *OrigInfo = getMIMGInfo(Opc);
+ const MIMGInfo *NewInfo =
+ getMIMGOpcodeHelper(OrigInfo->BaseOpcode, OrigInfo->MIMGEncoding,
+ NewChannels, OrigInfo->VAddrDwords);
+ return NewInfo ? NewInfo->Opcode : -1;
}
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
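
Both MIMG helpers now consult the generated MIMGInfo table instead of hand-written switches. A hedged sketch of a caller narrowing an image opcode to fewer data dwords with the new getMaskedMIMGOp signature (the wrapper name is made up, and it assumes AMDGPUBaseInfo.h is included and Opc is a real MIMG opcode):

    // Sketch: remap an MIMG opcode to a narrower dmask, keeping the original
    // opcode if no such table entry exists.
    int remapForDmask(unsigned Opc, unsigned NewChannels) {
      int NewOpc = llvm::AMDGPU::getMaskedMIMGOp(Opc, NewChannels);
      return NewOpc != -1 ? NewOpc : static_cast<int>(Opc);
    }
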
@@ -183,10 +154,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {7, 0, 3};
if (Features.test(FeatureISAVersion7_0_4))
return {7, 0, 4};
+ if (Features.test(FeatureSeaIslands))
+ return {7, 0, 0};
// GCN GFX8 (Volcanic Islands (VI)).
- if (Features.test(FeatureISAVersion8_0_0))
- return {8, 0, 0};
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
if (Features.test(FeatureISAVersion8_0_2))
@@ -195,14 +166,22 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {8, 0, 3};
if (Features.test(FeatureISAVersion8_1_0))
return {8, 1, 0};
+ if (Features.test(FeatureVolcanicIslands))
+ return {8, 0, 0};
// GCN GFX9.
if (Features.test(FeatureISAVersion9_0_0))
return {9, 0, 0};
if (Features.test(FeatureISAVersion9_0_2))
return {9, 0, 2};
+ if (Features.test(FeatureISAVersion9_0_4))
+ return {9, 0, 4};
+ if (Features.test(FeatureISAVersion9_0_6))
+ return {9, 0, 6};
+ if (Features.test(FeatureGFX9))
+ return {9, 0, 0};
- if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
+ if (Features.test(FeatureSouthernIslands))
return {0, 0, 0};
return {7, 0, 0};
}
@@ -219,11 +198,15 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
<< ISAVersion.Major
<< ISAVersion.Minor
<< ISAVersion.Stepping;
+
+ if (hasXNACK(*STI))
+ Stream << "+xnack";
+
Stream.flush();
}
-bool hasCodeObjectV3(const FeatureBitset &Features) {
- return Features.test(FeatureCodeObjectV3);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
+ return STI->getFeatureBits().test(FeatureCodeObjectV3);
}
unsigned getWavefrontSize(const FeatureBitset &Features) {
@@ -260,7 +243,7 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
}
unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
- return getMaxWavesPerEU(Features) * getEUsPerCU(Features);
+ return getMaxWavesPerEU() * getEUsPerCU(Features);
}
unsigned getMaxWavesPerCU(const FeatureBitset &Features,
@@ -272,9 +255,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features) {
return 1;
}
-unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
- if (!Features.test(FeatureGCN))
- return 8;
+unsigned getMaxWavesPerEU() {
// FIXME: Need to take scratch memory into account.
return 10;
}
@@ -330,11 +311,13 @@ unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU(Features))
+ if (WavesPerEU >= getMaxWavesPerEU())
return 0;
- unsigned MinNumSGPRs =
- alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1),
- getSGPRAllocGranule(Features)) + 1;
+
+ unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1);
+ if (Features.test(FeatureTrapHandler))
+ MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
+ MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1;
return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
}
@@ -343,14 +326,49 @@ unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
assert(WavesPerEU != 0);
IsaVersion Version = getIsaVersion(Features);
- unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU,
- getSGPRAllocGranule(Features));
unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
if (Version.Major >= 8 && !Addressable)
AddressableNumSGPRs = 112;
+ unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU;
+ if (Features.test(FeatureTrapHandler))
+ MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
+ MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features));
return std::min(MaxNumSGPRs, AddressableNumSGPRs);
}
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed) {
+ unsigned ExtraSGPRs = 0;
+ if (VCCUsed)
+ ExtraSGPRs = 2;
+
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major < 8) {
+ if (FlatScrUsed)
+ ExtraSGPRs = 4;
+ } else {
+ if (XNACKUsed)
+ ExtraSGPRs = 4;
+
+ if (FlatScrUsed)
+ ExtraSGPRs = 6;
+ }
+
+ return ExtraSGPRs;
+}
+
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed) {
+ return getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed,
+ Features[AMDGPU::FeatureXNACK]);
+}
+
+unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) {
+ NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features));
+ // SGPRBlocks is actual number of SGPR blocks minus 1.
+ return NumSGPRs / getSGPREncodingGranule(Features) - 1;
+}
+
unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
return 4;
}
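
getNumExtraSGPRs above reserves a fixed count for each implicitly used special register, with later clauses overriding earlier ones rather than accumulating. A sketch of the GFX8+ branch of that rule, with the counts copied from the code above:

    // Sketch of the implicit-SGPR rule for a GFX8+ subtarget (Version.Major >= 8).
    unsigned extraSGPRsGfx8Sketch(bool VCCUsed, bool FlatScrUsed, bool XNACKUsed) {
      unsigned Extra = VCCUsed ? 2 : 0;
      if (XNACKUsed)
        Extra = 4;
      if (FlatScrUsed)
        Extra = 6;
      return Extra; // e.g. VCC + flat_scratch used -> 6 reserved SGPRs
    }
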
@@ -370,7 +388,7 @@ unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU(Features))
+ if (WavesPerEU >= getMaxWavesPerEU())
return 0;
unsigned MinNumVGPRs =
alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
@@ -387,6 +405,12 @@ unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) {
+ NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features));
+ // VGPRBlocks is actual number of VGPR blocks minus 1.
+ return NumVGPRs / getVGPREncodingGranule(Features) - 1;
+}
+
} // end namespace IsaInfo
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
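
The getNumSGPRBlocks and getNumVGPRBlocks helpers added above round the used register count up to the encoding granule and then store granule-counts minus one. A worked C++ sketch with an assumed granule of 16 (the real value comes from the per-subtarget granule getters):

    #include <algorithm>

    // Sketch of the blocks-minus-one encoding, hard-coding a granule of 16.
    unsigned numSGPRBlocksSketch(unsigned NumSGPRs, unsigned Granule = 16) {
      NumSGPRs = std::max(1u, NumSGPRs);
      NumSGPRs = (NumSGPRs + Granule - 1) / Granule * Granule; // alignTo
      return NumSGPRs / Granule - 1;
    }
    // e.g. 42 used SGPRs -> aligned to 48 -> 48 / 16 - 1 = 2 stored in the descriptor.
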
@@ -396,7 +420,7 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
memset(&Header, 0, sizeof(Header));
Header.amd_kernel_code_version_major = 1;
- Header.amd_kernel_code_version_minor = 1;
+ Header.amd_kernel_code_version_minor = 2;
Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
Header.amd_machine_version_major = ISA.Major;
Header.amd_machine_version_minor = ISA.Minor;
@@ -416,6 +440,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.private_segment_alignment = 4;
}
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
+ amdhsa::kernel_descriptor_t KD;
+ memset(&KD, 0, sizeof(KD));
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
+ amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
+ return KD;
+}
+
bool isGroupSegment(const GlobalValue *GV) {
return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}
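
getDefaultAmdhsaKernelDescriptor hands back a zeroed descriptor with a few compute_pgm_rsrc bits preset. A hedged sketch of starting from it and setting one more field (the _Y enumerator is assumed to exist alongside the _X one used above; includes for AMDGPUBaseInfo.h and AMDHSAKernelDescriptor.h are assumed):

    // Sketch only: take the default descriptor and enable an extra workgroup-id SGPR.
    amdhsa::kernel_descriptor_t makeDescriptorSketch() {
      amdhsa::kernel_descriptor_t KD =
          llvm::AMDGPU::getDefaultAmdhsaKernelDescriptor();
      AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
                      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, 1);
      return KD;
    }
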
@@ -425,7 +464,8 @@ bool isGlobalSegment(const GlobalValue *GV) {
}
bool isReadOnlySegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -598,6 +638,18 @@ bool isEntryFunctionCC(CallingConv::ID CC) {
}
}
+bool hasXNACK(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
+}
+
+bool hasMIMG_R128(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
+}
+
+bool hasPackedD16(const MCSubtargetInfo &STI) {
+ return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem];
+}
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}
@@ -681,6 +733,8 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+ if (STI.getTargetTriple().getArch() == Triple::r600)
+ return Reg;
MAP_REG2REG
}
@@ -837,9 +891,6 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
- if (!EnablePackedInlinableLiterals)
- return false;
-
int16_t Lo16 = static_cast<int16_t>(Literal);
int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
@@ -891,18 +942,10 @@ namespace llvm {
namespace AMDGPU {
AMDGPUAS getAMDGPUAS(Triple T) {
- auto Env = T.getEnvironmentName();
AMDGPUAS AS;
- if (Env == "amdgiz" || Env == "amdgizcl") {
- AS.FLAT_ADDRESS = 0;
- AS.PRIVATE_ADDRESS = 5;
- AS.REGION_ADDRESS = 4;
- }
- else {
- AS.FLAT_ADDRESS = 4;
- AS.PRIVATE_ADDRESS = 0;
- AS.REGION_ADDRESS = 5;
- }
+ AS.FLAT_ADDRESS = 0;
+ AS.PRIVATE_ADDRESS = 5;
+ AS.REGION_ADDRESS = 2;
return AS;
}
@@ -913,5 +956,21 @@ AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
AMDGPUAS getAMDGPUAS(const Module &M) {
return getAMDGPUAS(Triple(M.getTargetTriple()));
}
+
+namespace {
+
+struct SourceOfDivergence {
+ unsigned Intr;
+};
+const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
+
+#define GET_SourcesOfDivergence_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+} // end anonymous namespace
+
+bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
+ return lookupSourceOfDivergence(IntrID);
+}
} // namespace AMDGPU
} // namespace llvm
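
isIntrinsicSourceOfDivergence above is a thin wrapper over the generated SourcesOfDivergence table keyed by intrinsic ID. A sketch of the kind of query a divergence analysis might issue; the intrinsic chosen and the includes are illustrative assumptions:

    #include "llvm/IR/Intrinsics.h" // assumed; provides the intrinsic enum
    // also assumes AMDGPUBaseInfo.h for the declaration below

    // Sketch: per-lane workitem ids should report as divergent.
    bool exampleDivergenceQuery() {
      return llvm::AMDGPU::isIntrinsicSourceOfDivergence(
          llvm::Intrinsic::amdgcn_workitem_id_x);
    }
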
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9515001b63d2..70681c271697 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>
@@ -28,24 +29,31 @@ class Argument;
class FeatureBitset;
class Function;
class GlobalValue;
-class MachineMemOperand;
class MCContext;
class MCRegisterClass;
class MCRegisterInfo;
class MCSection;
class MCSubtargetInfo;
+class MachineMemOperand;
class Triple;
namespace AMDGPU {
+
+#define GET_MIMGBaseOpcode_DECL
+#define GET_MIMGDim_DECL
+#define GET_MIMGEncoding_DECL
+#include "AMDGPUGenSearchableTables.inc"
+
namespace IsaInfo {
enum {
// The closed Vulkan driver sets 96, which limits the wave count to 8 but
// doesn't spill SGPRs as much as when 80 is set.
- FIXED_NUM_SGPRS_FOR_INIT_BUG = 96
+ FIXED_NUM_SGPRS_FOR_INIT_BUG = 96,
+ TRAP_NUM_SGPRS = 16
};
-/// \brief Instruction set architecture version.
+/// Instruction set architecture version.
struct IsaVersion {
unsigned Major;
unsigned Minor;
@@ -55,12 +63,12 @@ struct IsaVersion {
/// \returns Isa version for given subtarget \p Features.
IsaVersion getIsaVersion(const FeatureBitset &Features);
-/// \brief Streams isa version string for given subtarget \p STI into \p Stream.
+/// Streams isa version string for given subtarget \p STI into \p Stream.
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
-/// \returns True if given subtarget \p Features support code object version 3,
+/// \returns True if given subtarget \p STI supports code object version 3,
/// false otherwise.
-bool hasCodeObjectV3(const FeatureBitset &Features);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI);
/// \returns Wavefront size for given subtarget \p Features.
unsigned getWavefrontSize(const FeatureBitset &Features);
@@ -92,7 +100,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features);
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// Features without any kind of limitation.
-unsigned getMaxWavesPerEU(const FeatureBitset &Features);
+unsigned getMaxWavesPerEU();
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// Features and limited by given \p FlatWorkGroupSize.
@@ -131,6 +139,22 @@ unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
bool Addressable);
+/// \returns Number of extra SGPRs implicitly required by given subtarget \p
+/// Features when the given special registers are used.
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed, bool XNACKUsed);
+
+/// \returns Number of extra SGPRs implicitly required by given subtarget \p
+/// Features when the given special registers are used. XNACK is inferred from
+/// \p Features.
+unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+ bool FlatScrUsed);
+
+/// \returns Number of SGPR blocks needed for given subtarget \p Features when
+/// \p NumSGPRs are used. \p NumSGPRs should already include any special
+/// register counts.
+unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+
/// \returns VGPR allocation granularity for given subtarget \p Features.
unsigned getVGPRAllocGranule(const FeatureBitset &Features);
@@ -151,20 +175,57 @@ unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
/// execution unit requirement for given subtarget \p Features.
unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// \returns Number of VGPR blocks needed for given subtarget \p Features when
+/// \p NumVGPRs are used.
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+
} // end namespace IsaInfo
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+struct MIMGBaseOpcodeInfo {
+ MIMGBaseOpcode BaseOpcode;
+ bool Store;
+ bool Atomic;
+ bool AtomicX2;
+ bool Sampler;
+
+ uint8_t NumExtraArgs;
+ bool Gradients;
+ bool Coordinates;
+ bool LodOrClampOrMip;
+ bool HasD16;
+};
+
+LLVM_READONLY
+const MIMGBaseOpcodeInfo *getMIMGBaseOpcodeInfo(unsigned BaseOpcode);
+
+struct MIMGDimInfo {
+ MIMGDim Dim;
+ uint8_t NumCoords;
+ uint8_t NumGradients;
+ bool DA;
+};
+
LLVM_READONLY
-int getMaskedMIMGOp(const MCInstrInfo &MII,
- unsigned Opc, unsigned NewChannels);
+const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
+
+LLVM_READONLY
+int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
+ unsigned VDataDwords, unsigned VAddrDwords);
+
+LLVM_READONLY
+int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+
LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features);
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
+
bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
@@ -216,7 +277,7 @@ unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
-/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
+/// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
/// \p Lgkmcnt respectively.
///
@@ -240,7 +301,7 @@ unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt);
-/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
+/// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
/// \p Version.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
@@ -278,41 +339,45 @@ inline bool isKernel(CallingConv::ID CC) {
}
}
+bool hasXNACK(const MCSubtargetInfo &STI);
+bool hasMIMG_R128(const MCSubtargetInfo &STI);
+bool hasPackedD16(const MCSubtargetInfo &STI);
+
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
-/// \brief Is Reg - scalar register
+/// Is \p Reg a scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
-/// \brief Is there any intersection between registers
+/// Is there any intersection between registers
bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
-/// \brief Convert hardware register \p Reg to a pseudo register
+/// Convert hardware register \p Reg to a pseudo register
LLVM_READNONE
unsigned mc2PseudoReg(unsigned Reg);
-/// \brief Can this operand also contain immediate values?
+/// Can this operand also contain immediate values?
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Is this floating-point operand?
+/// Is this floating-point operand?
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Does this opearnd support only inlinable literals?
+/// Does this operand support only inlinable literals?
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(unsigned RCID);
-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);
-/// \brief Get size of register operand
+/// Get size of register operand
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo);
@@ -349,7 +414,7 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
return getOperandSize(Desc.OpInfo[OpNo]);
}
-/// \brief Is this literal inlinable
+/// Is this literal inlinable
LLVM_READNONE
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -373,6 +438,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+/// \returns true if the intrinsic is divergent
+bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
new file mode 100644
index 000000000000..1924f71f11c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MBB A lane-dominates MBB B if
+// 1. A dominates B in the usual sense, i.e. every path from the entry to B
+// goes through A, and
+// 2. whenever B executes, every active lane during that execution of B was
+// also active during the most recent execution of A.
+//
+// The simplest example where A dominates B but does not lane-dominate it is
+// where A is a loop:
+//
+// |
+// +--+
+// A |
+// +--+
+// |
+// B
+//
+// Unfortunately, the second condition is not fully captured by the control
+// flow graph when it is unstructured (as may happen when branch conditions are
+// uniform).
+//
+// The following replacement of the second condition is a conservative
+// approximation. It is an equivalent condition when the CFG is fully
+// structured:
+//
+// 2'. every cycle in the CFG that contains A also contains B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneDominator.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Given machine basic blocks A and B where A dominates B, check whether
+// A lane-dominates B.
+//
+// The check is conservative, i.e. there can be false-negatives.
+bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
+ // Check whether A is reachable from itself without going through B.
+ DenseSet<MachineBasicBlock *> Reachable;
+ SmallVector<MachineBasicBlock *, 8> Stack;
+
+ Stack.push_back(A);
+ do {
+ MachineBasicBlock *MBB = Stack.back();
+ Stack.pop_back();
+
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ == A)
+ return false;
+ if (Succ != B && Reachable.insert(Succ).second)
+ Stack.push_back(Succ);
+ }
+ } while (!Stack.empty());
+
+ return true;
+}
+
+} // namespace AMDGPU
+
+} // namespace llvm
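
A hedged sketch of how a machine pass might combine ordinary dominance with laneDominates before moving a lane-sensitive value between blocks (the dominance input is assumed to come from MachineDominatorTree; the helper name is made up):

    #include "AMDGPULaneDominator.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"

    // Sketch only: a def in A may feed a lane-sensitive use in B when A both
    // dominates and lane-dominates B; otherwise EXEC may have changed in between.
    bool canUseAcrossBlocks(llvm::MachineBasicBlock *A, llvm::MachineBasicBlock *B,
                            bool ADominatesB /* from MachineDominatorTree */) {
      return ADominatesB && llvm::AMDGPU::laneDominates(A, B);
    }
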
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
new file mode 100644
index 000000000000..4f33a89a364b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
@@ -0,0 +1,24 @@
+//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+
+namespace llvm {
+
+class MachineBasicBlock;
+
+namespace AMDGPU {
+
+bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 991408c81c92..9f0a4d29b5e4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -73,7 +73,6 @@ FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_ve
FIELD(kernel_code_entry_byte_offset),
FIELD(kernel_code_prefetch_byte_size),
-FIELD(max_scratch_backing_memory_byte_size),
COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS),
COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS),
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index ff2bd2454400..4c7a92219755 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -40,17 +40,9 @@ class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
}
class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
- InstSI <P.Outs32, P.Ins32, "", pattern>,
- VOP <opName>,
- SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>,
- MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> {
+ VOP_Pseudo <opName, !if(VOP1Only, "", "_e32"), P, P.Outs32, P.Ins32, "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.Asm32;
+ let AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
@@ -63,8 +55,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
-
- VOPProfile Pfl = P;
}
class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
@@ -86,6 +76,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -202,13 +193,14 @@ defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
-defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
let SchedRW = [WriteQuarterRate32] in {
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
-defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
+defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
@@ -216,8 +208,6 @@ defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
} // End SchedRW = [WriteDouble];
-defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
-
let SchedRW = [WriteDouble] in {
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
} // End SchedRW = [WriteDouble]
@@ -232,9 +222,9 @@ defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
-defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
let SchedRW = [WriteDoubleAdd] in {
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
} // End SchedRW = [WriteDoubleAdd]
@@ -298,9 +288,7 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
-let SchedRW = [WriteQuarterRate32] in {
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
-}
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -344,11 +332,15 @@ defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+let SchedRW = [WriteQuarterRate32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
+defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+} // End SchedRW = [WriteQuarterRate32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
@@ -356,8 +348,6 @@ defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
-defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
-defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
@@ -392,6 +382,12 @@ let SubtargetPredicate = isGFX9 in {
def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
}
+defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
+
+defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
+defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+
} // End SubtargetPredicate = isGFX9
//===----------------------------------------------------------------------===//
@@ -521,7 +517,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> {
}
}
-multiclass VOP1_Real_vi <bits<10> op> {
+multiclass VOP1_Real_e32e64_vi <bits<10> op> {
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
@@ -530,6 +526,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
+}
+
+multiclass VOP1_Real_vi <bits<10> op> {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
@@ -593,9 +593,9 @@ defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
defm V_CLREXCP : VOP1_Real_vi <0x35>;
-defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
-defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
-defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
+defm V_MOVRELD_B32 : VOP1_Real_e32e64_vi <0x36>;
+defm V_MOVRELS_B32 : VOP1_Real_e32e64_vi <0x37>;
+defm V_MOVRELSD_B32 : VOP1_Real_e32e64_vi <0x38>;
defm V_TRUNC_F64 : VOP1_Real_vi <0x17>;
defm V_CEIL_F64 : VOP1_Real_vi <0x18>;
defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>;
@@ -622,6 +622,10 @@ defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>;
+defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>;
+defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>;
+defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;
+
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
@@ -694,3 +698,23 @@ def : GCNPat <
>;
} // End OtherPredicates = [isVI]
+
+//===----------------------------------------------------------------------===//
+// GFX9
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_gfx9 <bits<10> op> {
+ let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
+ }
+
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ // For now left dpp only for asm/dasm
+ // TODO: add corresponding pseudo
+ def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 56b934f92f61..5ec1a15c5cd2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -61,17 +61,9 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
}
class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
- InstSI <P.Outs32, P.Ins32, "", pattern>,
- VOP <opName>,
- SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
- MnemonicAlias<opName#suffix, opName> {
+ VOP_Pseudo <opName, suffix, P, P.Outs32, P.Ins32, "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.Asm32;
+ let AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
@@ -84,8 +76,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
-
- VOPProfile Pfl = P;
}
class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
@@ -107,6 +97,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -177,6 +168,10 @@ multiclass VOP2eInst <string opName,
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -303,12 +298,30 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
let Asm32 = "$vdst, $src0, $src1, vcc";
let Asm64 = "$vdst, $src0, $src1, $src2";
+ let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ clampmod:$clamp,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+
+ let InsDPP = (ins DstRCDPP:$old,
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let HasExt = 1;
+ let HasSDWA9 = 1;
}
def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
@@ -322,15 +335,17 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let HasSDWA9 = 0;
}
-def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
let Outs32 = (outs VGPR_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
let HasExt = 0;
let HasSDWA9 = 0;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 0;
}
//===----------------------------------------------------------------------===//
@@ -398,7 +413,10 @@ let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
[(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
+ [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
@@ -473,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
+}
+
+} // End SubtargetPredicate = HasDLInsts
+
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -639,7 +670,7 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
defm V_READLANE_B32 : VOP2_Real_si <0x01>;
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
}
@@ -824,7 +855,7 @@ multiclass VOP2_Real_e32e64_vi <bits<6> op> :
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
}
-defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>;
+defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
@@ -926,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
} // End SubtargetPredicate = isVI
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
+defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
+
+} // End SubtargetPredicate = HasDLInsts
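// --- Editorial sketch, not part of the patch above ---
// The V_WRITELANE_B32 change ties a new $vdst_in operand to $vdst so that
// lanes the instruction does not write keep their previous contents, while
// DisableEncoding keeps the extra operand out of the emitted bits.  A minimal
// illustration of that idiom, assuming the usual llvm/Target/Target.td
// infrastructure is included; the class name below is a placeholder, not a
// definition from this patch, and it would be instantiated with dags such as
// (outs VGPR_32:$vdst) and (ins VGPR_32:$vdst_in, SCSrc_b32:$src0, ...).
class DemoRMWInst<dag outs, dag ins, string asm> : Instruction {
  let OutOperandList  = outs;
  let InOperandList   = ins;
  let AsmString       = asm;
  let Constraints     = "$vdst = $vdst_in"; // regalloc must reuse the dest reg
  let DisableEncoding = "$vdst_in";         // operand exists for regalloc only
}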
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index aedbfa015bf6..17ae08dc6267 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -153,19 +153,24 @@ class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
(i1 VCC)))];
}
-class VOP3Features<bit Clamp, bit OpSel> {
+class VOP3Features<bit Clamp, bit OpSel, bit Packed> {
bit HasClamp = Clamp;
bit HasOpSel = OpSel;
+ bit IsPacked = Packed;
}
-def VOP3_REGULAR : VOP3Features<0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1>;
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
+ let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers);
// FIXME: Hack to stop printing _e64
let Outs64 = (outs DstRC.RegClass:$vdst);
@@ -283,10 +288,10 @@ def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
-def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
@@ -355,14 +360,12 @@ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPU
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
let SchedRW = [WriteFloatFMA, WriteSALU];
- let hasExtraSrcRegAllocReq = 1;
let AsmMatchConverter = "";
}
// Double precision division pre-scale.
def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
let SchedRW = [WriteDouble, WriteSALU];
- let hasExtraSrcRegAllocReq = 1;
let AsmMatchConverter = "";
}
@@ -376,6 +379,7 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
let SchedRW = [WriteDouble];
}
+let SchedRW = [Write64Bit] in {
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
@@ -389,17 +393,17 @@ def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
} // End SubtargetPredicate = isVI
-
+} // End SchedRW = [Write64Bit]
let SubtargetPredicate = isCIVI in {
-let Constraints = "@earlyclobber $vdst" in {
+let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>;
-} // End Constraints = "@earlyclobber $vdst"
+} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
let isCommutable = 1 in {
-let SchedRW = [WriteDouble, WriteSALU] in {
+let SchedRW = [WriteQuarterRate32, WriteSALU] in {
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteDouble, WriteSALU]
@@ -408,16 +412,16 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SubtargetPredicate = isCIVI
-let SubtargetPredicate = Has16BitInsts in {
-
-let renamedInGFX9 = 1 in {
-def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
+ let Predicates = [Has16BitInsts, isVIOnly];
}
-let SubtargetPredicate = isGFX9 in {
-def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
+ VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
+ let renamedInGFX9 = 1;
+ let Predicates = [Has16BitInsts, isGFX9];
}
-let isCommutable = 1 in {
+let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
let renamedInGFX9 = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
@@ -438,15 +442,14 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
-} // End isCommutable = 1
-} // End SubtargetPredicate = Has16BitInsts
+} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
let SubtargetPredicate = isVI in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isVI
let Predicates = [Has16BitInsts] in {
@@ -697,7 +700,7 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
let AsmString = AsmName # ps.AsmOperands;
@@ -705,7 +708,7 @@ multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
}
multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
let AsmString = AsmName # ps.AsmOperands;
@@ -713,7 +716,7 @@ multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
}
multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
VOP3Interp_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
let AsmString = AsmName # ps.AsmOperands;
@@ -721,9 +724,9 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName>
}
multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
- VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+ def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+ VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl> {
+ VOP_Pseudo ps = !cast<VOP_Pseudo>(NAME);
let AsmString = AsmName # ps.AsmOperands;
}
}
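// --- Editorial sketch, not part of the patch above ---
// VOP3Features gains an IsPacked bit and VOP3_Profile forwards it with !if,
// overriding the base profile only when the feature record asks for packed
// math.  A self-contained illustration of that pattern; the record names here
// are made up and the snippet can be fed to llvm-tblgen as-is:
class DemoFeatures<bit packed> {
  bit IsPacked = packed;
}
class DemoProfile<bit basePacked, DemoFeatures F> {
  // Take the feature's setting when it requests packed math, otherwise keep
  // whatever the base profile already said.
  bit IsPacked = !if(F.IsPacked, 1, basePacked);
}
def DEMO_PACKED_FEATURES : DemoFeatures<1>;
def DEMO_PROFILE         : DemoProfile<0, DEMO_PACKED_FEATURES>;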
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index eeee8b36c175..5c78ada3211e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+multiclass MadFmaMixPats<SDPatternOperator fma_like,
+ Instruction mix_inst,
+ Instruction mixlo_inst,
+ Instruction mixhi_inst> {
+ def : GCNPat <
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (mixlo_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ (i32 (IMPLICIT_DEF)))
+ >;
+
+ // FIXME: Special case handling for maxhi (especially for clamp)
+ // because dealing with the write to high half of the register is
+ // difficult.
+ def : GCNPat <
+ (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (build_vector
+ f16:$elt0,
+ (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (AMDGPUclamp (build_vector
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+ (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE,
+ (mixlo_inst $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE,
+ (i32 (IMPLICIT_DEF)))))
+ >;
+}
let SubtargetPredicate = HasMadMixInsts in {
// These are VOP3a-like opcodes which accept no omod.
@@ -84,68 +145,41 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16
}
}
-def : GCNPat <
- (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (i32 (IMPLICIT_DEF)))
->;
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts
-// FIXME: Special case handling for maxhi (especially for clamp)
-// because dealing with the write to high half of the register is
-// difficult.
-def : GCNPat <
- (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- $elt0))
->;
-def : GCNPat <
- (build_vector
- f16:$elt0,
- (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.ENABLE,
- $elt0))
->;
+// Essentially the same as the mad_mix versions
+let SubtargetPredicate = HasFmaMixInsts in {
+let isCommutable = 1 in {
+def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
-def : GCNPat <
- (AMDGPUclamp (build_vector
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
- $hi_src1_modifiers, $hi_src1,
- $hi_src2_modifiers, $hi_src2,
- DSTCLAMP.ENABLE,
- (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
- $lo_src1_modifiers, $lo_src1,
- $lo_src2_modifiers, $lo_src2,
- DSTCLAMP.ENABLE,
- (i32 (IMPLICIT_DEF)))))
->;
+// Clamp modifier is applied after conversion to f16.
+def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+}
-} // End SubtargetPredicate = [HasMadMixInsts]
+let SubtargetPredicate = HasDLInsts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+
+} // End SubtargetPredicate = HasDLInsts
multiclass VOP3P_Real_vi<bits<10> op> {
- def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+ def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicates = [HasVOP3PInsts];
let DecoderNamespace = "VI";
}
@@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+
+let SubtargetPredicate = HasMadMixInsts in {
defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+
+let SubtargetPredicate = HasFmaMixInsts in {
+let DecoderNamespace = "GFX9_DL" in {
+// The mad_mix instructions were renamed and their behaviors changed,
+// but the opcode stayed the same so we need to put these in a
+// different DecoderNamespace to avoid the ambiguity.
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+}
+
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+
+} // End SubtargetPredicate = HasDLInsts
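// --- Editorial sketch, not part of the patch above ---
// MadFmaMixPats hoists the old standalone mad_mix GCNPats into a multiclass
// parameterised by the node to match, so the same dags are instantiated once
// with fmad for V_MAD_MIX* and once with fma for the new V_FMA_MIX*
// instructions.  A trimmed-down sketch of that shape; it assumes the usual
// SelectionDAG TableGen infrastructure (SDPatternOperator, Pat), and the
// instruction names in the instantiation comment are only illustrative:
multiclass DemoMixPats<SDPatternOperator fma_like, Instruction inst> {
  def : Pat<(f32 (fma_like f32:$a, f32:$b, f32:$c)),
            (inst $a, $b, $c)>;
}
// Instantiated once per flavour, e.g.:
//   defm : DemoMixPats<fmad, V_MAD_MIX_F32>;
//   defm : DemoMixPats<fma,  V_FMA_MIX_F32>;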
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 146870e21531..cc6b8116afee 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -30,8 +30,8 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{31-25} = 0x3e; // encoding
// VOPC disallows dst_sel and dst_unused as they have no effect on destination
- let Inst{42-40} = SDWA.DWORD;
- let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+ let Inst{42-40} = 0;
+ let Inst{44-43} = 0;
}
class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
@@ -106,6 +106,7 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
}
class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f24ff5ce8dea..f0f7f259f71d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -38,6 +38,23 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let Uses = [EXEC];
}
+class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
+ string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
+ MnemonicAlias<opName#suffix, opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ VOPProfile Pfl = P;
+
+ string AsmOperands;
+}
+
class VOP3Common <dag outs, dag ins, string asm = "",
list<dag> pattern = [], bit HasMods = 0,
bit VOP3Only = 0> :
@@ -66,26 +83,18 @@ class VOP3Common <dag outs, dag ins, string asm = "",
class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
- InstSI <P.Outs64,
- !if(isVop3OpSel,
- P.InsVOP3OpSel,
- !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
- "",
- pattern>,
- VOP <opName>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e64", opName> {
+ VOP_Pseudo <opName, "_e64", P, P.Outs64,
+ !if(isVop3OpSel,
+ P.InsVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
+ "", pattern> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
let VOP3_OPSEL = isVop3OpSel;
let IsPacked = P.IsPacked;
- string Mnemonic = opName;
- string AsmOperands = !if(isVop3OpSel,
- P.AsmVOP3OpSel,
- !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
+ let AsmOperands = !if(isVop3OpSel,
+ P.AsmVOP3OpSel,
+ !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
let Size = 8;
let mayLoad = 0;
@@ -120,8 +129,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
!if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
"cvtVOP3",
""));
-
- VOPProfile Pfl = P;
}
class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
@@ -129,7 +136,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
let VOP3P = 1;
}
-class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
@@ -149,13 +156,14 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
+ let Defs = ps.Defs;
VOPProfile Pfl = ps.Pfl;
}
// XXX - Is there any reason to distinguish this from regular VOP3
// here?
-class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
VOP3_Real<ps, EncodingFamily>;
class VOP3a<VOPProfile P> : Enc64 {
@@ -324,13 +332,13 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
bits<1> clamp;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
- let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
- let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
}
@@ -358,11 +366,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 {
bits<1> src1_sgpr;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
- let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
let Inst{63} = 0; // src1_sgpr - should be specified in subclass
@@ -375,8 +383,8 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
bits<1> clamp;
bits<2> omod;
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
}
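// --- Editorial sketch, not part of the patch above ---
// The new VOP_Pseudo class hoists the fields every VOP pseudo shares
// (Mnemonic, Pfl, the pseudo/codegen-only flags) into one base, which is what
// lets VOP3_Real and VOP3P_Real accept any pseudo flavour via
// !cast<VOP_Pseudo>.  A self-contained illustration with made-up names (runs
// through llvm-tblgen on its own):
class DemoPseudo<string name> {
  string Mnemonic = name;
  bit isPseudo = 1;
}
class DemoPseudo_e64<string name> : DemoPseudo<name>;
class DemoReal<DemoPseudo ps> {
  // Works for any subclass of the shared base.
  string AsmName = ps.Mnemonic;
}
def DEMO_ADD_e64 : DemoPseudo_e64<"demo_add">;
def DEMO_ADD_vi  : DemoReal<DEMO_ADD_e64>;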
diff --git a/contrib/llvm/lib/Target/ARC/ARCAsmPrinter.cpp b/contrib/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
index af9dd968b7a6..8c13da0484fd 100644
--- a/contrib/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -34,6 +33,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <algorithm>
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/ARC/ARCBranchFinalize.cpp b/contrib/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
index 9341e7bdda41..3b410fa383b7 100644
--- a/contrib/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/contrib/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -112,7 +112,7 @@ static unsigned getCmpForPseudo(MachineInstr *MI) {
}
void ARCBranchFinalize::replaceWithBRcc(MachineInstr *MI) const {
- DEBUG(dbgs() << "Replacing pseudo branch with BRcc\n");
+ LLVM_DEBUG(dbgs() << "Replacing pseudo branch with BRcc\n");
unsigned CC = getCCForBRcc(MI->getOperand(3).getImm());
if (CC != -1U) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
@@ -128,8 +128,8 @@ void ARCBranchFinalize::replaceWithBRcc(MachineInstr *MI) const {
}
void ARCBranchFinalize::replaceWithCmpBcc(MachineInstr *MI) const {
- DEBUG(dbgs() << "Branch: " << *MI << "\n");
- DEBUG(dbgs() << "Replacing pseudo branch with Cmp + Bcc\n");
+ LLVM_DEBUG(dbgs() << "Branch: " << *MI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing pseudo branch with Cmp + Bcc\n");
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(getCmpForPseudo(MI)))
.addReg(MI->getOperand(1).getReg())
@@ -141,8 +141,8 @@ void ARCBranchFinalize::replaceWithCmpBcc(MachineInstr *MI) const {
}
bool ARCBranchFinalize::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "Running ARC Branch Finalize on "
- << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Running ARC Branch Finalize on " << MF.getName()
+ << "\n");
std::vector<MachineInstr *> Branches;
bool Changed = false;
unsigned MaxSize = 0;
@@ -156,7 +156,7 @@ bool ARCBranchFinalize::runOnMachineFunction(MachineFunction &MF) {
for (auto &MI : MBB) {
unsigned Size = TII->getInstSizeInBytes(MI);
if (Size > 8 || Size == 0) {
- DEBUG(dbgs() << "Unknown (or size 0) size for: " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Unknown (or size 0) size for: " << MI << "\n");
} else {
MaxSize += Size;
}
@@ -172,8 +172,8 @@ bool ARCBranchFinalize::runOnMachineFunction(MachineFunction &MF) {
isInt<9>(MaxSize) ? replaceWithBRcc(P.first) : replaceWithCmpBcc(P.first);
}
- DEBUG(dbgs() << "Estimated function size for " << MF.getName()
- << ": " << MaxSize << "\n");
+ LLVM_DEBUG(dbgs() << "Estimated function size for " << MF.getName() << ": "
+ << MaxSize << "\n");
return Changed;
}
diff --git a/contrib/llvm/lib/Target/ARC/ARCFrameLowering.cpp b/contrib/llvm/lib/Target/ARC/ARCFrameLowering.cpp
index 195a781950be..ca59cb2baaa7 100644
--- a/contrib/llvm/lib/Target/ARC/ARCFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/ARC/ARCFrameLowering.cpp
@@ -59,8 +59,8 @@ static void generateStackAdjustment(MachineBasicBlock &MBB,
Positive = true;
}
- DEBUG(dbgs() << "Internal: adjust stack by: " << Amount << "," << AbsAmount
- << "\n");
+ LLVM_DEBUG(dbgs() << "Internal: adjust stack by: " << Amount << ","
+ << AbsAmount << "\n");
assert((AbsAmount % 4 == 0) && "Stack adjustments must be 4-byte aligned.");
if (isUInt<6>(AbsAmount))
@@ -88,8 +88,7 @@ determineLastCalleeSave(const std::vector<CalleeSavedInfo> &CSI) {
void ARCFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
- DEBUG(dbgs() << "Determine Callee Saves: " << MF.getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Determine Callee Saves: " << MF.getName() << "\n");
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
SavedRegs.set(ARC::BLINK);
}
@@ -115,7 +114,7 @@ void ARCFrameLowering::adjustStackToMatchRecords(
/// registers onto the stack, when enough callee saved registers are required.
void ARCFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- DEBUG(dbgs() << "Emit Prologue: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Emit Prologue: " << MF.getName() << "\n");
auto *AFI = MF.getInfo<ARCFunctionInfo>();
MachineModuleInfo &MMI = MF.getMMI();
MCContext &Context = MMI.getContext();
@@ -133,7 +132,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
unsigned AlreadyAdjusted = 0;
if (MF.getFunction().isVarArg()) {
// Add in the varargs area here first.
- DEBUG(dbgs() << "Varargs\n");
+ LLVM_DEBUG(dbgs() << "Varargs\n");
unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex());
BuildMI(MBB, MBBI, dl, TII->get(ARC::SUB_rru6))
.addReg(ARC::SP)
@@ -141,7 +140,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(VarArgsBytes);
}
if (hasFP(MF)) {
- DEBUG(dbgs() << "Saving FP\n");
+ LLVM_DEBUG(dbgs() << "Saving FP\n");
BuildMI(MBB, MBBI, dl, TII->get(ARC::ST_AW_rs9))
.addReg(ARC::SP, RegState::Define)
.addReg(ARC::FP)
@@ -150,7 +149,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
AlreadyAdjusted += 4;
}
if (UseSaveRestoreFunclet && Last > ARC::R14) {
- DEBUG(dbgs() << "Creating store funclet.\n");
+ LLVM_DEBUG(dbgs() << "Creating store funclet.\n");
// BL to __save_r13_to_<TRI->getRegAsmName()>
StackSlotsUsedByFunclet = Last - ARC::R12;
BuildMI(MBB, MBBI, dl, TII->get(ARC::PUSH_S_BLINK));
@@ -166,20 +165,20 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
}
// If we haven't saved BLINK, but we need to...do that now.
if (MFI.hasCalls() && !SavedBlink) {
- DEBUG(dbgs() << "Creating save blink.\n");
+ LLVM_DEBUG(dbgs() << "Creating save blink.\n");
BuildMI(MBB, MBBI, dl, TII->get(ARC::PUSH_S_BLINK));
AlreadyAdjusted += 4;
}
if (AFI->MaxCallStackReq > 0)
MFI.setStackSize(MFI.getStackSize() + AFI->MaxCallStackReq);
// We have already saved some of the stack...
- DEBUG(dbgs() << "Adjusting stack by: "
- << (MFI.getStackSize() - AlreadyAdjusted) << "\n");
+ LLVM_DEBUG(dbgs() << "Adjusting stack by: "
+ << (MFI.getStackSize() - AlreadyAdjusted) << "\n");
generateStackAdjustment(MBB, MBBI, *ST.getInstrInfo(), dl,
-(MFI.getStackSize() - AlreadyAdjusted), ARC::SP);
if (hasFP(MF)) {
- DEBUG(dbgs() << "Setting FP from SP.\n");
+ LLVM_DEBUG(dbgs() << "Setting FP from SP.\n");
BuildMI(MBB, MBBI, dl,
TII->get(isUInt<6>(MFI.getStackSize()) ? ARC::ADD_rru6
: ARC::ADD_rrlimm),
@@ -235,7 +234,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
/// registers onto the stack, when enough callee saved registers are required.
void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- DEBUG(dbgs() << "Emit Epilogue: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Emit Epilogue: " << MF.getName() << "\n");
auto *AFI = MF.getInfo<ARCFunctionInfo>();
const ARCInstrInfo *TII = MF.getSubtarget<ARCSubtarget>().getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
@@ -304,7 +303,7 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
// Relieve the varargs area if necessary.
if (MF.getFunction().isVarArg()) {
// Add in the varargs area here first.
- DEBUG(dbgs() << "Varargs\n");
+ LLVM_DEBUG(dbgs() << "Varargs\n");
unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex());
BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6))
.addReg(ARC::SP)
@@ -334,16 +333,16 @@ bool ARCFrameLowering::assignCalleeSavedSpillSlots(
if (hasFP(MF)) {
// Create a fixed slot at for FP
int StackObj = MFI.CreateFixedSpillStackObject(4, CurOffset, true);
- DEBUG(dbgs() << "Creating fixed object (" << StackObj << ") for FP at "
- << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Creating fixed object (" << StackObj << ") for FP at "
+ << CurOffset << "\n");
(void)StackObj;
CurOffset -= 4;
}
if (MFI.hasCalls() || (UseSaveRestoreFunclet && Last > ARC::R14)) {
// Create a fixed slot for BLINK.
int StackObj = MFI.CreateFixedSpillStackObject(4, CurOffset, true);
- DEBUG(dbgs() << "Creating fixed object (" << StackObj << ") for BLINK at "
- << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Creating fixed object (" << StackObj
+ << ") for BLINK at " << CurOffset << "\n");
(void)StackObj;
CurOffset -= 4;
}
@@ -366,12 +365,12 @@ bool ARCFrameLowering::assignCalleeSavedSpillSlots(
continue;
if (I.getFrameIdx() == 0) {
I.setFrameIdx(MFI.CreateFixedSpillStackObject(4, CurOffset, true));
- DEBUG(dbgs() << "Creating fixed object (" << I.getFrameIdx()
- << ") for other register at " << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Creating fixed object (" << I.getFrameIdx()
+ << ") for other register at " << CurOffset << "\n");
} else {
MFI.setObjectOffset(I.getFrameIdx(), CurOffset);
- DEBUG(dbgs() << "Updating fixed object (" << I.getFrameIdx()
- << ") for other register at " << CurOffset << "\n");
+ LLVM_DEBUG(dbgs() << "Updating fixed object (" << I.getFrameIdx()
+ << ") for other register at " << CurOffset << "\n");
}
CurOffset -= 4;
}
@@ -382,8 +381,8 @@ bool ARCFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
- DEBUG(dbgs() << "Spill callee saved registers: "
- << MBB.getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Spill callee saved registers: "
+ << MBB.getParent()->getName() << "\n");
// There are routines for saving at least 3 registers (r13 to r15, etc.)
unsigned Last = determineLastCalleeSave(CSI);
if (UseSaveRestoreFunclet && Last > ARC::R14) {
@@ -399,8 +398,8 @@ bool ARCFrameLowering::spillCalleeSavedRegisters(
bool ARCFrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const {
- DEBUG(dbgs() << "Restore callee saved registers: "
- << MBB.getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Restore callee saved registers: "
+ << MBB.getParent()->getName() << "\n");
// There are routines for saving at least 3 registers (r13 to r15, etc.)
unsigned Last = determineLastCalleeSave(CSI);
if (UseSaveRestoreFunclet && Last > ARC::R14) {
@@ -414,16 +413,17 @@ bool ARCFrameLowering::restoreCalleeSavedRegisters(
void ARCFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
- DEBUG(dbgs() << "Process function before frame finalized: "
- << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Process function before frame finalized: "
+ << MF.getName() << "\n");
MachineFrameInfo &MFI = MF.getFrameInfo();
- DEBUG(dbgs() << "Current stack size: " << MFI.getStackSize() << "\n");
+ LLVM_DEBUG(dbgs() << "Current stack size: " << MFI.getStackSize() << "\n");
const TargetRegisterClass *RC = &ARC::GPR32RegClass;
if (MFI.hasStackObjects()) {
int RegScavFI = MFI.CreateStackObject(
RegInfo->getSpillSize(*RC), RegInfo->getSpillAlignment(*RC), false);
RS->addScavengingFrameIndex(RegScavFI);
- DEBUG(dbgs() << "Created scavenging index RegScavFI=" << RegScavFI << "\n");
+ LLVM_DEBUG(dbgs() << "Created scavenging index RegScavFI=" << RegScavFI
+ << "\n");
}
}
@@ -440,7 +440,7 @@ static void emitRegUpdate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator ARCFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- DEBUG(dbgs() << "EmitCallFramePseudo: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "EmitCallFramePseudo: " << MF.getName() << "\n");
const ARCInstrInfo *TII = MF.getSubtarget<ARCSubtarget>().getInstrInfo();
MachineInstr &Old = *I;
DebugLoc dl = Old.getDebugLoc();
diff --git a/contrib/llvm/lib/Target/ARC/ARCISelLowering.cpp b/contrib/llvm/lib/Target/ARC/ARCISelLowering.cpp
index 5991838a15c4..bf98af801406 100644
--- a/contrib/llvm/lib/Target/ARC/ARCISelLowering.cpp
+++ b/contrib/llvm/lib/Target/ARC/ARCISelLowering.cpp
@@ -72,7 +72,7 @@ static ARCCC::CondCode ISDCCtoARCCC(ISD::CondCode isdCC) {
ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,
const ARCSubtarget &Subtarget)
- : TargetLowering(TM), TM(TM), Subtarget(Subtarget) {
+ : TargetLowering(TM), Subtarget(Subtarget) {
// Set up the register classes.
addRegisterClass(MVT::i32, &ARC::GPR32RegClass);
@@ -486,8 +486,8 @@ SDValue ARCTargetLowering::LowerCallArguments(
EVT RegVT = VA.getLocVT();
switch (RegVT.getSimpleVT().SimpleTy) {
default: {
- DEBUG(errs() << "LowerFormalArguments Unhandled argument type: "
- << (unsigned)RegVT.getSimpleVT().SimpleTy << "\n");
+ LLVM_DEBUG(errs() << "LowerFormalArguments Unhandled argument type: "
+ << (unsigned)RegVT.getSimpleVT().SimpleTy << "\n");
llvm_unreachable("Unhandled LowerFormalArguments type.");
}
case MVT::i32:
diff --git a/contrib/llvm/lib/Target/ARC/ARCISelLowering.h b/contrib/llvm/lib/Target/ARC/ARCISelLowering.h
index cb06e9dcd79f..fec01b13a866 100644
--- a/contrib/llvm/lib/Target/ARC/ARCISelLowering.h
+++ b/contrib/llvm/lib/Target/ARC/ARCISelLowering.h
@@ -76,7 +76,6 @@ public:
Instruction *I = nullptr) const override;
private:
- const TargetMachine &TM;
const ARCSubtarget &Subtarget;
// Lower Operand helpers
diff --git a/contrib/llvm/lib/Target/ARC/ARCInstrFormats.td b/contrib/llvm/lib/Target/ARC/ARCInstrFormats.td
index 50edddd4ea9f..0a49b83ef16a 100644
--- a/contrib/llvm/lib/Target/ARC/ARCInstrFormats.td
+++ b/contrib/llvm/lib/Target/ARC/ARCInstrFormats.td
@@ -62,7 +62,7 @@ class InstARC<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
let Namespace = "ARC";
dag OutOperandList = outs;
dag InOperandList = ins;
- let AsmString = asmstr;
+ let AsmString = asmstr;
let Pattern = pattern;
let Size = sz;
}
diff --git a/contrib/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/contrib/llvm/lib/Target/ARC/ARCInstrInfo.cpp
index a299e32c03a0..a8084f16893b 100644
--- a/contrib/llvm/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARC/ARCInstrInfo.cpp
@@ -103,6 +103,10 @@ static ARCCC::CondCode GetOppositeBranchCondition(ARCCC::CondCode CC) {
return ARCCC::LE;
case ARCCC::GE:
return ARCCC::LT;
+ case ARCCC::VS:
+ return ARCCC::VC;
+ case ARCCC::VC:
+ return ARCCC::VS;
case ARCCC::LT:
return ARCCC::GE;
case ARCCC::LE:
@@ -169,7 +173,7 @@ bool ARCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
bool CantAnalyze = false;
// Skip over DEBUG values and predicated nonterminators.
- while (I->isDebugValue() || !I->isTerminator()) {
+ while (I->isDebugInstr() || !I->isTerminator()) {
if (I == MBB.begin())
return false;
--I;
@@ -294,8 +298,8 @@ void ARCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
"Only support 4-byte stores to stack now.");
assert(ARC::GPR32RegClass.hasSubClassEq(RC) &&
"Only support GPR32 stores to stack now.");
- DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI)
- << " to FrameIndex=" << FrameIndex << "\n");
+ LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI)
+ << " to FrameIndex=" << FrameIndex << "\n");
BuildMI(MBB, I, dl, get(ARC::ST_rs9))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FrameIndex)
@@ -321,8 +325,8 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
"Only support 4-byte loads from stack now.");
assert(ARC::GPR32RegClass.hasSubClassEq(RC) &&
"Only support GPR32 stores to stack now.");
- DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI)
- << " from FrameIndex=" << FrameIndex << "\n");
+ LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI)
+ << " from FrameIndex=" << FrameIndex << "\n");
BuildMI(MBB, I, dl, get(ARC::LD_rs9))
.addReg(DestReg, RegState::Define)
.addFrameIndex(FrameIndex)
diff --git a/contrib/llvm/lib/Target/ARC/ARCInstrInfo.td b/contrib/llvm/lib/Target/ARC/ARCInstrInfo.td
index edd853fe150d..525098c4ff66 100644
--- a/contrib/llvm/lib/Target/ARC/ARCInstrInfo.td
+++ b/contrib/llvm/lib/Target/ARC/ARCInstrInfo.td
@@ -125,18 +125,36 @@ multiclass ArcBinaryInst<bits<5> major, bits<6> mincode,
(ins GPR32:$B, GPR32:$C),
!strconcat(opasm, "\t$A, $B, $C"),
[]>;
+ def _f_rrr : F32_DOP_RR<major, mincode, 1, (outs GPR32:$A),
+ (ins GPR32:$B, GPR32:$C),
+ !strconcat(opasm, ".f\t$A, $B, $C"),
+ []>
+ { let Defs = [STATUS32]; }
// 2 register with unsigned 6-bit immediate variant.
def _rru6 : F32_DOP_RU6<major, mincode, 0, (outs GPR32:$A),
(ins GPR32:$B, immU6:$U6),
!strconcat(opasm, "\t$A, $B, $U6"),
[]>;
+ def _f_rru6 : F32_DOP_RU6<major, mincode, 1, (outs GPR32:$A),
+ (ins GPR32:$B, immU6:$U6),
+ !strconcat(opasm, ".f\t$A, $B, $U6"),
+ []>
+ { let Defs = [STATUS32]; }
+
// 2 register with 32-bit immediate variant.
def _rrlimm : F32_DOP_RLIMM<major, mincode, 0,
- (outs GPR32:$A),
- (ins GPR32:$B, i32imm:$LImm),
- !strconcat(opasm, "\t$A, $B, $LImm"),
- []>;
+ (outs GPR32:$A),
+ (ins GPR32:$B, i32imm:$LImm),
+ !strconcat(opasm, "\t$A, $B, $LImm"),
+ []>;
+ def _f_rrlimm : F32_DOP_RLIMM<major, mincode, 1,
+ (outs GPR32:$A),
+ (ins GPR32:$B, i32imm:$LImm),
+ !strconcat(opasm, ".f\t$A, $B, $LImm"),
+ []>
+ { let Defs = [STATUS32]; }
+
// 2 matched-register with signed 12-bit immediate variant (add r0, r0, -1).
def _rrs12 : F32_DOP_RS12<major, mincode, 0,
(outs GPR32:$B),
@@ -144,6 +162,12 @@ multiclass ArcBinaryInst<bits<5> major, bits<6> mincode,
!strconcat(opasm, "\t$B, $in, $S12"),
[]>
{ let Constraints = "$B = $in"; }
+ def _f_rrs12 : F32_DOP_RS12<major, mincode, 1,
+ (outs GPR32:$B),
+ (ins GPR32:$in, immS<12>:$S12),
+ !strconcat(opasm, ".f\t$B, $in, $S12"),
+ []>
+ { let Constraints = "$B = $in"; let Defs = [STATUS32]; }
}
// Special multivariant GEN4 DOP format instruction that take 2 registers.
@@ -168,6 +192,10 @@ multiclass ArcUnaryInst<bits<5> major, bits<6> subop,
string opasm> {
def _rr : F32_SOP_RR<major, subop, 0, (outs GPR32:$B), (ins GPR32:$C),
!strconcat(opasm, "\t$B, $C"), []>;
+
+ def _f_rr : F32_SOP_RR<major, subop, 1, (outs GPR32:$B), (ins GPR32:$C),
+ !strconcat(opasm, ".f\t$B, $C"), []>
+ { let Defs = [STATUS32]; }
}
@@ -328,11 +356,19 @@ let isBranch = 1, isTerminator = 1 in {
{ let Size = 8; }
} // let isBranch, isTerminator
-// Indirect, unconditional Jump.
-let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
-def J : F32_DOP_RR<0b00100, 0b100000, 0,
- (outs), (ins GPR32:$C),
- "j\t[$C]", [(brind i32:$C)]>;
+// Unconditional Jump.
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+ // Indirect.
+ let isIndirectBranch = 1 in
+ def J : F32_DOP_RR<0b00100, 0b100000, 0,
+ (outs), (ins GPR32:$C),
+ "j\t[$C]", [(brind i32:$C)]>;
+
+ // Direct.
+ def J_LImm : F32_DOP_RLIMM<0b00100, 0b100000, 0,
+ (outs), (ins i32imm:$LImm),
+ "j\t$LImm", []>;
+}
// Call instructions.
let isCall = 1, isBarrier = 1, Defs = [BLINK], Uses = [SP] in {
@@ -344,6 +380,10 @@ let isCall = 1, isBarrier = 1, Defs = [BLINK], Uses = [SP] in {
let isIndirectBranch = 1 in
def JL : F32_DOP_RR<0b00100, 0b100010, 0, (outs), (ins GPR32:$C),
"jl\t[$C]", [(ARCJumpLink i32:$C)]>;
+
+ // Direct unconditional call.
+ def JL_LImm : F32_DOP_RLIMM<0b00100, 0b100010, 0, (outs), (ins i32imm:$LImm),
+ "jl\t$LImm", []>;
} // let isCall, isBarrier, Defs, Uses
// Pattern to generate BL instruction.
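// --- Editorial sketch, not part of the patch above ---
// Each new ".f" ARC variant above is the flag-setting form of the
// instruction, so it lists STATUS32 as an implicit def and liveness analysis
// then knows the flags are clobbered.  Minimal shape of the idiom, assuming
// the usual llvm/Target/Target.td infrastructure and an already-defined
// STATUS32 register; the class name below is a placeholder:
class DemoFlagSetting<dag outs, dag ins, string asm> : Instruction {
  let OutOperandList = outs;
  let InOperandList  = ins;
  let AsmString      = asm;
  let Defs = [STATUS32];  // the ".f" form writes the status/flags register
}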
diff --git a/contrib/llvm/lib/Target/ARC/ARCMCInstLower.cpp b/contrib/llvm/lib/Target/ARC/ARCMCInstLower.cpp
index 4658388924ec..43b087a57204 100644
--- a/contrib/llvm/lib/Target/ARC/ARCMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/ARC/ARCMCInstLower.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains code to lower ARC MachineInstrs to their
+/// This file contains code to lower ARC MachineInstrs to their
/// corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/ARC/ARCMCInstLower.h b/contrib/llvm/lib/Target/ARC/ARCMCInstLower.h
index 22e15cdb351e..9a698f26334a 100644
--- a/contrib/llvm/lib/Target/ARC/ARCMCInstLower.h
+++ b/contrib/llvm/lib/Target/ARC/ARCMCInstLower.h
@@ -23,7 +23,7 @@ class MachineFunction;
class Mangler;
class AsmPrinter;
-/// \brief This class is used to lower an MachineInstr into an MCInst.
+/// This class is used to lower an MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY ARCMCInstLower {
using MachineOperandType = MachineOperand::MachineOperandType;
MCContext *Ctx;
diff --git a/contrib/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h
index bfb3fdef5ebf..95ad294e3668 100644
--- a/contrib/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h
@@ -25,16 +25,15 @@ class ARCFunctionInfo : public MachineFunctionInfo {
virtual void anchor();
bool ReturnStackOffsetSet;
int VarArgsFrameIndex;
- unsigned VarArgFrameBytes;
unsigned ReturnStackOffset;
public:
ARCFunctionInfo()
- : ReturnStackOffsetSet(false), VarArgsFrameIndex(0), VarArgFrameBytes(0),
+ : ReturnStackOffsetSet(false), VarArgsFrameIndex(0),
ReturnStackOffset(-1U), MaxCallStackReq(0) {}
explicit ARCFunctionInfo(MachineFunction &MF)
- : ReturnStackOffsetSet(false), VarArgsFrameIndex(0), VarArgFrameBytes(0),
+ : ReturnStackOffsetSet(false), VarArgsFrameIndex(0),
ReturnStackOffset(-1U), MaxCallStackReq(0) {
// Functions are 4-byte (2**2) aligned.
MF.setAlignment(2);
diff --git a/contrib/llvm/lib/Target/ARC/ARCRegisterInfo.cpp b/contrib/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
index cb9f89d3499b..38ea3c93a2d4 100644
--- a/contrib/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -66,9 +66,9 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
MBB.getParent()->getSubtarget().getRegisterInfo();
BaseReg = RS->scavengeRegister(&ARC::GPR32RegClass, II, SPAdj);
assert(BaseReg && "Register scavenging failed.");
- DEBUG(dbgs() << "Scavenged register " << printReg(BaseReg, TRI)
- << " for FrameReg=" << printReg(FrameReg, TRI)
- << "+Offset=" << Offset << "\n");
+ LLVM_DEBUG(dbgs() << "Scavenged register " << printReg(BaseReg, TRI)
+ << " for FrameReg=" << printReg(FrameReg, TRI)
+ << "+Offset=" << Offset << "\n");
(void)TRI;
RS->setRegUsed(BaseReg);
}
@@ -88,7 +88,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
assert((Offset % 2 == 0) && "LDH needs 2 byte alignment.");
case ARC::LDB_rs9:
case ARC::LDB_X_rs9:
- DEBUG(dbgs() << "Building LDFI\n");
+ LLVM_DEBUG(dbgs() << "Building LDFI\n");
BuildMI(MBB, II, dl, TII.get(MI.getOpcode()), Reg)
.addReg(BaseReg, KillState)
.addImm(Offset)
@@ -99,7 +99,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
case ARC::STH_rs9:
assert((Offset % 2 == 0) && "STH needs 2 byte alignment.");
case ARC::STB_rs9:
- DEBUG(dbgs() << "Building STFI\n");
+ LLVM_DEBUG(dbgs() << "Building STFI\n");
BuildMI(MBB, II, dl, TII.get(MI.getOpcode()))
.addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
.addReg(BaseReg, KillState)
@@ -107,7 +107,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
.addMemOperand(*MI.memoperands_begin());
break;
case ARC::GETFI:
- DEBUG(dbgs() << "Building GETFI\n");
+ LLVM_DEBUG(dbgs() << "Building GETFI\n");
BuildMI(MBB, II, dl,
TII.get(isUInt<6>(Offset) ? ARC::ADD_rru6 : ARC::ADD_rrlimm))
.addReg(Reg, RegState::Define)
@@ -175,14 +175,14 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int StackSize = MF.getFrameInfo().getStackSize();
int LocalFrameSize = MF.getFrameInfo().getLocalFrameSize();
- DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n");
- DEBUG(dbgs() << "<--------->\n");
- DEBUG(dbgs() << MI << "\n");
- DEBUG(dbgs() << "FrameIndex : " << FrameIndex << "\n");
- DEBUG(dbgs() << "ObjSize : " << ObjSize << "\n");
- DEBUG(dbgs() << "FrameOffset : " << Offset << "\n");
- DEBUG(dbgs() << "StackSize : " << StackSize << "\n");
- DEBUG(dbgs() << "LocalFrameSize : " << LocalFrameSize << "\n");
+ LLVM_DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "<--------->\n");
+ LLVM_DEBUG(dbgs() << MI << "\n");
+ LLVM_DEBUG(dbgs() << "FrameIndex : " << FrameIndex << "\n");
+ LLVM_DEBUG(dbgs() << "ObjSize : " << ObjSize << "\n");
+ LLVM_DEBUG(dbgs() << "FrameOffset : " << Offset << "\n");
+ LLVM_DEBUG(dbgs() << "StackSize : " << StackSize << "\n");
+ LLVM_DEBUG(dbgs() << "LocalFrameSize : " << LocalFrameSize << "\n");
(void)LocalFrameSize;
// Special handling of DBG_VALUE instructions.
@@ -200,8 +200,8 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// ldb needs no alignment,
// ldh needs 2 byte alignment
// ld needs 4 byte alignment
- DEBUG(dbgs() << "Offset : " << Offset << "\n"
- << "<--------->\n");
+ LLVM_DEBUG(dbgs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
unsigned Reg = MI.getOperand(0).getReg();
assert(ARC::GPR32RegClass.contains(Reg) && "Unexpected register operand");
diff --git a/contrib/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/contrib/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
index dd181767d81a..3fc5a033dd5d 100644
--- a/contrib/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
+++ b/contrib/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file is part of the ARC Disassembler.
+/// This file is part of the ARC Disassembler.
///
//===----------------------------------------------------------------------===//
@@ -31,7 +31,7 @@ using DecodeStatus = MCDisassembler::DecodeStatus;
namespace {
-/// \brief A disassembler class for ARC.
+/// A disassembler class for ARC.
class ARCDisassembler : public MCDisassembler {
public:
std::unique_ptr<MCInstrInfo const> const MCII;
@@ -122,7 +122,7 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder) {
if (RegNo >= 32) {
- DEBUG(dbgs() << "Not a GPR32 register.");
+ LLVM_DEBUG(dbgs() << "Not a GPR32 register.");
return MCDisassembler::Fail;
}
@@ -222,7 +222,7 @@ static DecodeStatus DecodeStLImmInstruction(MCInst &Inst, uint64_t Insn,
unsigned SrcC, DstB, LImm;
DstB = decodeBField(Insn);
if (DstB != 62) {
- DEBUG(dbgs() << "Decoding StLImm found non-limm register.");
+ LLVM_DEBUG(dbgs() << "Decoding StLImm found non-limm register.");
return MCDisassembler::Fail;
}
SrcC = decodeCField(Insn);
@@ -237,10 +237,10 @@ static DecodeStatus DecodeLdLImmInstruction(MCInst &Inst, uint64_t Insn,
uint64_t Address,
const void *Decoder) {
unsigned DstA, SrcB, LImm;
- DEBUG(dbgs() << "Decoding LdLImm:\n");
+ LLVM_DEBUG(dbgs() << "Decoding LdLImm:\n");
SrcB = decodeBField(Insn);
if (SrcB != 62) {
- DEBUG(dbgs() << "Decoding LdLImm found non-limm register.");
+ LLVM_DEBUG(dbgs() << "Decoding LdLImm found non-limm register.");
return MCDisassembler::Fail;
}
DstA = decodeAField(Insn);
@@ -255,13 +255,13 @@ static DecodeStatus DecodeLdRLImmInstruction(MCInst &Inst, uint64_t Insn,
uint64_t Address,
const void *Decoder) {
unsigned DstA, SrcB;
- DEBUG(dbgs() << "Decoding LdRLimm\n");
+ LLVM_DEBUG(dbgs() << "Decoding LdRLimm\n");
DstA = decodeAField(Insn);
DecodeGPR32RegisterClass(Inst, DstA, Address, Decoder);
SrcB = decodeBField(Insn);
DecodeGPR32RegisterClass(Inst, SrcB, Address, Decoder);
if (decodeCField(Insn) != 62) {
- DEBUG(dbgs() << "Decoding LdRLimm found non-limm register.");
+ LLVM_DEBUG(dbgs() << "Decoding LdRLimm found non-limm register.");
return MCDisassembler::Fail;
}
Inst.addOperand(MCOperand::createImm((uint32_t)(Insn >> 32)));
@@ -271,7 +271,7 @@ static DecodeStatus DecodeLdRLImmInstruction(MCInst &Inst, uint64_t Insn,
static DecodeStatus DecodeMoveHRegInstruction(MCInst &Inst, uint64_t Insn,
uint64_t Address,
const void *Decoder) {
- DEBUG(dbgs() << "Decoding MOV_S h-register\n");
+ LLVM_DEBUG(dbgs() << "Decoding MOV_S h-register\n");
using Field = decltype(Insn);
Field h = fieldFromInstruction(Insn, 5, 3) |
(fieldFromInstruction(Insn, 0, 2) << 3);
@@ -322,10 +322,10 @@ DecodeStatus ARCDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Result =
decodeInstruction(DecoderTable64, Instr, Insn64, Address, this, STI);
if (Success == Result) {
- DEBUG(dbgs() << "Successfully decoded 64-bit instruction.");
+ LLVM_DEBUG(dbgs() << "Successfully decoded 64-bit instruction.");
return Result;
}
- DEBUG(dbgs() << "Not a 64-bit instruction, falling back to 32-bit.");
+ LLVM_DEBUG(dbgs() << "Not a 64-bit instruction, falling back to 32-bit.");
}
uint32_t Insn32;
if (!readInstruction32(Bytes, Address, Size, Insn32)) {
@@ -342,10 +342,12 @@ DecodeStatus ARCDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Result =
decodeInstruction(DecoderTable48, Instr, Insn48, Address, this, STI);
if (Success == Result) {
- DEBUG(dbgs() << "Successfully decoded 16-bit instruction with limm.");
+ LLVM_DEBUG(
+ dbgs() << "Successfully decoded 16-bit instruction with limm.");
return Result;
}
- DEBUG(dbgs() << "Not a 16-bit instruction with limm, try without it.");
+ LLVM_DEBUG(
+ dbgs() << "Not a 16-bit instruction with limm, try without it.");
}
uint32_t Insn16;
diff --git a/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp b/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp
index 4760ac4456d0..0c627d04698b 100644
--- a/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp
@@ -43,9 +43,8 @@ static const char *ARCBRCondCodeToString(ARCCC::BRCondCode BRCC) {
return "lo";
case ARCCC::BRHS:
return "hs";
- default:
- llvm_unreachable("Unhandled ARCCC::BRCondCode");
}
+ llvm_unreachable("Unhandled ARCCC::BRCondCode");
}
static const char *ARCCondCodeToString(ARCCC::CondCode CC) {
@@ -66,6 +65,10 @@ static const char *ARCCondCodeToString(ARCCC::CondCode CC) {
return "gt";
case ARCCC::GE:
return "ge";
+ case ARCCC::VS:
+ return "vs";
+ case ARCCC::VC:
+ return "vc";
case ARCCC::LT:
return "lt";
case ARCCC::LE:
diff --git a/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.h b/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.h
index e26c08104e23..bb3898a67cef 100644
--- a/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.h
+++ b/contrib/llvm/lib/Target/ARC/InstPrinter/ARCInstPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the ARCInstPrinter class,
+/// This file contains the declaration of the ARCInstPrinter class,
/// which is used to print ARC MCInst to a .s file.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/ARC/MCTargetDesc/ARCInfo.h b/contrib/llvm/lib/Target/ARC/MCTargetDesc/ARCInfo.h
index b9ed99885702..401b4c5e6613 100644
--- a/contrib/llvm/lib/Target/ARC/MCTargetDesc/ARCInfo.h
+++ b/contrib/llvm/lib/Target/ARC/MCTargetDesc/ARCInfo.h
@@ -30,6 +30,8 @@ enum CondCode {
N = 0x4,
LO = 0x5,
HS = 0x6,
+ VS = 0x7,
+ VC = 0x8,
GT = 0x9,
GE = 0xa,
LT = 0xb,
diff --git a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
index 16d5f74d19e3..be88fe4ddb14 100644
--- a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -180,7 +180,7 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
SmallVector<MachineInstr *, 8> Front;
DeadInstr.insert(MI);
- DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
Front.push_back(MI);
while (Front.size() != 0) {
@@ -232,7 +232,7 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
if (!IsDead) continue;
- DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
DeadInstr.insert(Def);
}
}
@@ -264,7 +264,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
// Is it a subreg copy of ssub_0?
if (EC && EC->isCopy() &&
EC->getOperand(1).getSubReg() == ARM::ssub_0) {
- DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
+ LLVM_DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
// Find the thing we're subreg copying out of - is it of the same
// regclass as DPRMI? (i.e. a DPR or QPR).
@@ -272,8 +272,8 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
const TargetRegisterClass *TRC =
MRI->getRegClass(MI->getOperand(1).getReg());
if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
- DEBUG(dbgs() << "Subreg copy is compatible - returning ");
- DEBUG(dbgs() << printReg(FullReg) << "\n");
+ LLVM_DEBUG(dbgs() << "Subreg copy is compatible - returning ");
+ LLVM_DEBUG(dbgs() << printReg(FullReg) << "\n");
eraseInstrWithNoUses(MI);
return FullReg;
}
@@ -387,7 +387,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
continue;
Front.push_back(NewMI);
} else {
- DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
+ LLVM_DEBUG(dbgs() << "Found partial copy" << *MI << "\n");
Outs.push_back(MI);
}
}
@@ -642,9 +642,8 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
// to find.
MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
- DEBUG(dbgs() << "Replacing operand "
- << **I << " with "
- << printReg(NewReg) << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing operand " << **I << " with "
+ << printReg(NewReg) << "\n");
(*I)->substVirtReg(NewReg, 0, *TRI);
}
}
@@ -661,14 +660,15 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
// Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
// enabled when NEON is available.
- if (!(STI.isCortexA15() && STI.hasNEON()))
+ if (!(STI.useSplatVFPToNeon() && STI.hasNEON()))
return false;
+
TII = STI.getInstrInfo();
TRI = STI.getRegisterInfo();
MRI = &Fn.getRegInfo();
bool Modified = false;
- DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
+ LLVM_DEBUG(dbgs() << "Running on function " << Fn.getName() << "\n");
DeadInstr.clear();
Replacements.clear();
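The DEBUG -> LLVM_DEBUG replacements that recur throughout these hunks are behaviour-preserving: the newer macro still prints only in asserts builds and only when the pass's DEBUG_TYPE is selected with -debug-only= (or -debug). A minimal sketch of the pattern, assuming the usual LLVM Support headers; the DEBUG_TYPE string below is illustrative:

    #define DEBUG_TYPE "a15-sd-optimizer"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    // Prints only in +Asserts builds when the matching -debug-only= type is
    // enabled; otherwise the statement compiles away to nothing.
    static void logDeletion(unsigned Idx) {
      LLVM_DEBUG(llvm::dbgs() << "Deleting instruction #" << Idx << "\n");
    }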
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h
index 9ffb4c2055f9..b5cc45c5cc94 100644
--- a/contrib/llvm/lib/Target/ARM/ARM.h
+++ b/contrib/llvm/lib/Target/ARM/ARM.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_ARM_ARM_H
#define LLVM_LIB_TARGET_ARM_ARM_H
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/CodeGen.h"
#include <functional>
#include <vector>
@@ -35,11 +36,14 @@ class MachineInstr;
class MCInst;
class PassRegistry;
+
+Pass *createARMParallelDSPPass();
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createA15SDOptimizerPass();
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMCodeGenPreparePass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
@@ -57,8 +61,11 @@ void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
BasicBlockInfo &BBI);
std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
+
+void initializeARMParallelDSPPass(PassRegistry &);
void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+void initializeARMCodeGenPreparePass(PassRegistry &);
void initializeARMConstantIslandsPass(PassRegistry &);
void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
index c9766aa2161a..2e62a0790418 100644
--- a/contrib/llvm/lib/Target/ARM/ARM.td
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -109,10 +109,16 @@ def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
"Enable support for ARMv8-M "
"Security Extensions">;
+def FeatureSHA2 : SubtargetFeature<"sha2", "HasSHA2", "true",
+ "Enable SHA1 and SHA256 support", [FeatureNEON]>;
+
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES support", [FeatureNEON]>;
+
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
"Enable support for "
"Cryptography extensions",
- [FeatureNEON]>;
+ [FeatureNEON, FeatureSHA2, FeatureAES]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable support for CRC instructions">;
@@ -135,6 +141,10 @@ def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
"CPU fuses AES crypto operations">;
+// Fast execution of bottom and top halves of literal generation
+def FeatureFuseLiterals : SubtargetFeature<"fuse-literals", "HasFuseLiterals", "true",
+ "CPU fuses literal generation operations">;
+
// The way of reading thread pointer
def FeatureReadTp : SubtargetFeature<"read-tp-hard", "ReadTPHard", "true",
"Reading thread pointer from register">;
@@ -189,6 +199,13 @@ def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
"DontWidenVMOVS", "true",
"Don't widen VMOVS to VMOVD">;
+// Some targets (e.g. Cortex-A15) prefer to avoid mixing operations on different
+// VFP register widths.
+def FeatureSplatVFPToNeon : SubtargetFeature<"splat-vfp-neon",
+ "SplatVFPToNeon", "true",
+ "Splat register from VFP to NEON",
+ [FeatureDontWidenVMOVS]>;
+
// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx",
"ExpandMLx", "true",
@@ -330,6 +347,10 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
"DisablePostRAScheduler", "true",
"Don't schedule again after register allocation">;
+// Enable use of alias analysis during code generation
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
//===----------------------------------------------------------------------===//
// ARM architecture class
//
@@ -415,6 +436,10 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
"Support ARM v8.3a instructions",
[HasV8_2aOps]>;
+def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
+ "Support ARM v8.4a instructions",
+ [HasV8_3aOps, FeatureDotProd]>;
+
//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
//
@@ -507,7 +532,8 @@ def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>;
def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>;
-def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>;
+def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops,
+ FeatureDSP]>;
def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops,
FeatureDSP]>;
@@ -521,13 +547,15 @@ def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps,
FeatureNoARM,
ModeThumb,
FeatureDB,
- FeatureMClass]>;
+ FeatureMClass,
+ FeatureStrictAlign]>;
def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps,
FeatureNoARM,
ModeThumb,
FeatureDB,
- FeatureMClass]>;
+ FeatureMClass,
+ FeatureStrictAlign]>;
def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
FeatureNEON,
@@ -617,6 +645,20 @@ def ARMv83a : Architecture<"armv8.3-a", "ARMv83a", [HasV8_3aOps,
FeatureCRC,
FeatureRAS]>;
+def ARMv84a : Architecture<"armv8.4-a", "ARMv84a", [HasV8_4aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
+
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
FeatureDB,
@@ -637,7 +679,8 @@ def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
FeatureV7Clrex,
Feature8MSecExt,
FeatureAcquireRelease,
- FeatureMClass]>;
+ FeatureMClass,
+ FeatureStrictAlign]>;
def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
[HasV8MMainlineOps,
@@ -787,6 +830,7 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
FeatureDontWidenVMOVS,
+ FeatureSplatVFPToNeon,
FeatureHasRetAddrStack,
FeatureMuxedUnits,
FeatureTrustZone,
@@ -991,6 +1035,12 @@ def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1,
FeatureCrypto,
FeatureCRC]>;
+def : ProcNoItin<"exynos-m4", [ARMv8a, ProcExynosM1,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
FeatureHWDivThumb,
FeatureHWDivARM,
@@ -998,7 +1048,9 @@ def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
FeatureCRC]>;
def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
- FeatureFPAO]>;
+ FeatureUseMISched,
+ FeatureFPAO,
+ FeatureUseAA]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -1042,4 +1094,5 @@ def ARM : Target {
let AssemblyWriters = [ARMAsmWriter];
let AssemblyParsers = [ARMAsmParser];
let AssemblyParserVariants = [ARMAsmParserVariant];
+ let AllowRegisterRenaming = 1;
}
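Each new SubtargetFeature above names a boolean field in the subtarget (the second template argument), which TableGen-generated code sets when the feature is enabled and which the backend then queries through an accessor, as the useSplatVFPToNeon() call in the A15SDOptimizer hunk above already does. A rough sketch of that mapping; apart from useSplatVFPToNeon(), the accessor spellings here are assumptions:

    // Rough shape of how the .td fields surface in C++. The field names come
    // from the SubtargetFeature definitions above; the accessors are assumed.
    struct ARMSubtargetSketch {
      bool HasSHA2 = false;        // "+sha2", also implied by "+crypto"
      bool HasAES = false;         // "+aes", also implied by "+crypto"
      bool SplatVFPToNeon = false; // set for cortex-a15 via FeatureSplatVFPToNeon
      bool hasSHA2() const { return HasSHA2; }
      bool hasAES() const { return HasAES; }
      bool useSplatVFPToNeon() const { return SplatVFPToNeon; }
    };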
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index d3d79fe975bb..2196f9b47f3b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -235,6 +235,15 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
}
}
+MCSymbol *ARMAsmPrinter::GetCPISymbol(unsigned CPID) const {
+ // The AsmPrinter::GetCPISymbol superclass method treats CPID as an index
+ // into the MachineConstantPool, which is not in sync with the indexes used here.
+ const DataLayout &DL = getDataLayout();
+ return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ "CPI" + Twine(getFunctionNumber()) + "_" +
+ Twine(CPID));
+}
+
//===--------------------------------------------------------------------===//
MCSymbol *ARMAsmPrinter::
@@ -545,29 +554,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
- if (TT.isOSBinFormatCOFF()) {
- const auto &TLOF =
- static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
-
- std::string Flags;
- raw_string_ostream OS(Flags);
-
- for (const auto &Function : M)
- TLOF.emitLinkerFlagsForGlobal(OS, &Function);
- for (const auto &Global : M.globals())
- TLOF.emitLinkerFlagsForGlobal(OS, &Global);
- for (const auto &Alias : M.aliases())
- TLOF.emitLinkerFlagsForGlobal(OS, &Alias);
-
- OS.flush();
-
- // Output collected flags
- if (!Flags.empty()) {
- OutStreamer->SwitchSection(TLOF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
- }
- }
-
// The last attribute to be emitted is ABI_optimization_goals
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
@@ -1086,6 +1072,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
unsigned StartOp = 2 + 2;
// Use all the operands.
unsigned NumOffset = 0;
+ // Amount of SP adjustment folded into a push.
+ unsigned Pad = 0;
switch (Opc) {
default:
@@ -1107,6 +1095,16 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
// temporary to workaround PR11902.
if (MO.isImplicit())
continue;
+ // Registers pushed only to fold an SP update into the push instruction
+ // are marked as undef and should not be restored when unwinding, because
+ // the function can modify the corresponding stack slots.
+ if (MO.isUndef()) {
+ assert(RegList.empty() &&
+ "Pad registers must come before restored ones");
+ Pad += 4;
+ continue;
+ }
RegList.push_back(MO.getReg());
}
break;
@@ -1118,8 +1116,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
RegList.push_back(SrcReg);
break;
}
- if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+ if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+ // Account for the SP adjustment, folded into the push.
+ if (Pad)
+ ATS.emitPad(Pad);
+ }
} else {
// Changes of stack / frame pointer.
if (SrcReg == ARM::SP) {
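The Pad bookkeeping added above records four bytes for every undef register that was pushed only to fold an SP decrement into the push, and emits that amount as an extra .pad directive after the .save register list. A simplified, self-contained sketch of the bookkeeping; the operand type is invented for illustration:

    #include <cassert>
    #include <vector>

    struct OperandSketch { unsigned Reg; bool Undef; };

    // Undef (pad) registers must precede the restored ones; each contributes
    // 4 bytes of .pad instead of an entry in the .save register list.
    static unsigned computePad(const std::vector<OperandSketch> &Ops,
                               std::vector<unsigned> &RegList) {
      unsigned Pad = 0;
      for (const OperandSketch &Op : Ops) {
        if (Op.Undef) {
          assert(RegList.empty() && "Pad registers must come before restored ones");
          Pad += 4;
          continue;
        }
        RegList.push_back(Op.Reg);
      }
      return Pad; // emitted via emitPad(Pad) only when non-zero
    }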
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
index 7b811b18f74a..0ba4bc05d6f7 100644
--- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -101,7 +101,9 @@ public:
void EmitEndOfAsmFile(Module &M) override;
void EmitXXStructor(const DataLayout &DL, const Constant *CV) override;
void EmitGlobalVariable(const GlobalVariable *GV) override;
-
+
+ MCSymbol *GetCPISymbol(unsigned CPID) const override;
+
// lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index cff24a10bb5f..b1c2031c7d7b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -331,7 +331,7 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
bool CantAnalyze = false;
// Skip over DEBUG values and predicated nonterminators.
- while (I->isDebugValue() || !I->isTerminator()) {
+ while (I->isDebugInstr() || !I->isTerminator()) {
if (I == MBB.begin())
return false;
--I;
@@ -935,6 +935,25 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Mov->addRegisterKilled(SrcReg, TRI);
}
+bool ARMBaseInstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ // VMOVRRD is also a copy instruction, but it requires special handling:
+ // it is a more complex copy version, so we do not consider it here. For
+ // recognition of such instructions the isExtractSubregLike MI interface
+ // function could be used.
+ // VORRq is considered a move only if its two inputs are the same register.
+ if (!MI.isMoveReg() ||
+ (MI.getOpcode() == ARM::VORRq &&
+ MI.getOperand(1).getReg() != MI.getOperand(2).getReg()))
+ return false;
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+}
+
const MachineInstrBuilder &
ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
@@ -963,6 +982,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectSize(FI), Align);
switch (TRI->getSpillSize(*RC)) {
+ case 2:
+ if (ARM::HPRRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(ARM::VSTRH))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::STRi12))
@@ -1161,6 +1191,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectSize(FI), Align);
switch (TRI->getSpillSize(*RC)) {
+ case 2:
+ if (ARM::HPRRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(ARM::VLDRH), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
@@ -1168,7 +1208,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
-
} else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
.addFrameIndex(FI)
@@ -1321,7 +1360,13 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
}
break;
case ARM::VLD1q64:
+ case ARM::VLD1d8TPseudo:
+ case ARM::VLD1d16TPseudo:
+ case ARM::VLD1d32TPseudo:
case ARM::VLD1d64TPseudo:
+ case ARM::VLD1d8QPseudo:
+ case ARM::VLD1d16QPseudo:
+ case ARM::VLD1d32QPseudo:
case ARM::VLD1d64QPseudo:
if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
@@ -1345,7 +1390,7 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
}
-/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD
+/// Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD
/// depending on whether the result is used.
void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
bool isThumb1 = Subtarget.isThumb1Only();
@@ -1358,7 +1403,6 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
MachineInstrBuilder LDM, STM;
if (isThumb1 || !MI->getOperand(1).isDead()) {
MachineOperand LDWb(MI->getOperand(1));
- LDWb.setIsRenamable(false);
LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
: isThumb1 ? ARM::tLDMIA_UPD
: ARM::LDMIA_UPD))
@@ -1369,7 +1413,6 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
if (isThumb1 || !MI->getOperand(0).isDead()) {
MachineOperand STWb(MI->getOperand(0));
- STWb.setIsRenamable(false);
STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
: isThumb1 ? ARM::tSTMIA_UPD
: ARM::STMIA_UPD))
@@ -1379,11 +1422,9 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
}
MachineOperand LDBase(MI->getOperand(3));
- LDBase.setIsRenamable(false);
LDM.add(LDBase).add(predOps(ARMCC::AL));
MachineOperand STBase(MI->getOperand(2));
- STBase.setIsRenamable(false);
STM.add(STBase).add(predOps(ARMCC::AL));
// Sort the scratch registers into ascending order.
@@ -1391,12 +1432,12 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
SmallVector<unsigned, 6> ScratchRegs;
for(unsigned I = 5; I < MI->getNumOperands(); ++I)
ScratchRegs.push_back(MI->getOperand(I).getReg());
- std::sort(ScratchRegs.begin(), ScratchRegs.end(),
- [&TRI](const unsigned &Reg1,
- const unsigned &Reg2) -> bool {
- return TRI.getEncodingValue(Reg1) <
- TRI.getEncodingValue(Reg2);
- });
+ llvm::sort(ScratchRegs.begin(), ScratchRegs.end(),
+ [&TRI](const unsigned &Reg1,
+ const unsigned &Reg2) -> bool {
+ return TRI.getEncodingValue(Reg1) <
+ TRI.getEncodingValue(Reg2);
+ });
for (const auto &Reg : ScratchRegs) {
LDM.addReg(Reg, RegState::Define);
@@ -1453,7 +1494,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
// All clear, widen the COPY.
- DEBUG(dbgs() << "widening: " << MI);
+ LLVM_DEBUG(dbgs() << "widening: " << MI);
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
// Get rid of the old implicit-def of DstRegD. Leave it if it defines a Q-reg
@@ -1482,7 +1523,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.addRegisterKilled(SrcRegS, TRI, true);
}
- DEBUG(dbgs() << "replaced by: " << MI);
+ LLVM_DEBUG(dbgs() << "replaced by: " << MI);
return true;
}
@@ -1659,7 +1700,7 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
}
for (unsigned i = 3, e = MI0.getNumOperands(); i != e; ++i) {
- // %12 = PICLDR %11, 0, pred:14, pred:%noreg
+ // %12 = PICLDR %11, 0, 14, %noreg
const MachineOperand &MO0 = MI0.getOperand(i);
const MachineOperand &MO1 = MI1.getOperand(i);
if (!MO0.isIdenticalTo(MO1))
@@ -1799,7 +1840,7 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// considered a scheduling hazard, which is wrong. It should be the actual
// instruction preceding the dbg_value instruction(s), just like it is
// when debug info is not present.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return false;
// Terminators and labels can't be scheduled around.
@@ -1813,8 +1854,8 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// to the t2IT instruction. The added compile time and complexity does not
// seem worth it.
MachineBasicBlock::const_iterator I = MI;
- // Make sure to skip any dbg_value instructions
- while (++I != MBB->end() && I->isDebugValue())
+ // Make sure to skip any debug instructions
+ while (++I != MBB->end() && I->isDebugInstr())
;
if (I != MBB->end() && I->getOpcode() == ARM::t2IT)
return true;
@@ -2277,9 +2318,9 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
--CurRegEnc) {
unsigned CurReg = RegClass->getRegister(CurRegEnc);
if (!IsPop) {
- // Pushing any register is completely harmless, mark the
- // register involved as undef since we don't care about it in
- // the slightest.
+ // Pushing any register is completely harmless; mark the register involved
+ // as undef since we don't care about its value and must not restore it
+ // during stack unwinding.
RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
false, false, true));
--RegsNeeded;
@@ -2409,6 +2450,14 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
NumBits = 8;
Scale = 4;
break;
+ case ARMII::AddrMode5FP16:
+ ImmIdx = FrameRegIdx+1;
+ InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 2;
+ break;
default:
llvm_unreachable("Unsupported addressing mode!");
}
@@ -2534,14 +2583,28 @@ inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
}
}
+/// getCmpToAddCondition - assume the flags are set by CMP(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by ADD(a,b,X).
+inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
+ switch (CC) {
+ default: return ARMCC::AL;
+ case ARMCC::HS: return ARMCC::LO;
+ case ARMCC::LO: return ARMCC::HS;
+ case ARMCC::VS: return ARMCC::VS;
+ case ARMCC::VC: return ARMCC::VC;
+ }
+}
+
/// isRedundantFlagInstr - check whether the first instruction, whose only
/// purpose is to update flags, can be made redundant.
/// CMPrr can be made redundant by SUBrr if the operands are the same.
/// CMPri can be made redundant by SUBri if the operands are the same.
+/// CMPrr(r0, r1) can be made redundant by ADDr[ri](r0, r1, X).
/// This function can be extended later on.
-inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
- unsigned SrcReg2, int ImmValue,
- MachineInstr *OI) {
+inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
+ unsigned SrcReg, unsigned SrcReg2,
+ int ImmValue, const MachineInstr *OI) {
if ((CmpI->getOpcode() == ARM::CMPrr ||
CmpI->getOpcode() == ARM::t2CMPrr) &&
(OI->getOpcode() == ARM::SUBrr ||
@@ -2559,6 +2622,14 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
OI->getOperand(1).getReg() == SrcReg &&
OI->getOperand(2).getImm() == ImmValue)
return true;
+
+ if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
+ (OI->getOpcode() == ARM::ADDrr || OI->getOpcode() == ARM::t2ADDrr ||
+ OI->getOpcode() == ARM::ADDri || OI->getOpcode() == ARM::t2ADDri) &&
+ OI->getOperand(0).isReg() && OI->getOperand(1).isReg() &&
+ OI->getOperand(0).getReg() == SrcReg &&
+ OI->getOperand(1).getReg() == SrcReg2)
+ return true;
return false;
}
@@ -2661,17 +2732,18 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
if (I == B) return false;
// There are two possible candidates which can be changed to set CPSR:
- // One is MI, the other is a SUB instruction.
- // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+ // One is MI, the other is a SUB or ADD instruction.
+ // For CMPrr(r1,r2), we are looking for SUB(r1,r2), SUB(r2,r1), or
+ // ADDr[ri](r1, r2, X).
// For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
- MachineInstr *Sub = nullptr;
+ MachineInstr *SubAdd = nullptr;
if (SrcReg2 != 0)
// MI is not a candidate for CMPrr.
MI = nullptr;
else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
// Conservatively refuse to convert an instruction which isn't in the same
// BB as the comparison.
- // For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
+ // For CMPri w/ CmpValue != 0, a SubAdd may still be a candidate.
// Thus we cannot return here.
if (CmpInstr.getOpcode() == ARM::CMPri ||
CmpInstr.getOpcode() == ARM::t2CMPri)
@@ -2716,11 +2788,20 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
}
// Check that CPSR isn't set between the comparison instruction and the one we
- // want to change. At the same time, search for Sub.
+ // want to change. At the same time, search for SubAdd.
const TargetRegisterInfo *TRI = &getRegisterInfo();
- --I;
- for (; I != E; --I) {
- const MachineInstr &Instr = *I;
+ do {
+ const MachineInstr &Instr = *--I;
+
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &Instr)) {
+ SubAdd = &*I;
+ break;
+ }
+
+ // Allow E (which was initially MI) to be SubAdd but do not search before E.
+ if (I == E)
+ break;
if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
Instr.readsRegister(ARM::CPSR, TRI))
@@ -2728,23 +2809,14 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// change. We can't do this transformation.
return false;
- // Check whether CmpInstr can be made redundant by the current instruction.
- if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
- Sub = &*I;
- break;
- }
-
- if (I == B)
- // The 'and' is below the comparison instruction.
- return false;
- }
+ } while (I != B);
// Return false if no candidates exist.
- if (!MI && !Sub)
+ if (!MI && !SubAdd)
return false;
// The single candidate is called MI.
- if (!MI) MI = Sub;
+ if (!MI) MI = SubAdd;
// We can't use a predicated instruction - it doesn't always write the flags.
if (isPredicated(*MI))
@@ -2802,25 +2874,31 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
break;
}
- if (Sub) {
- ARMCC::CondCodes NewCC = getSwappedCondition(CC);
- if (NewCC == ARMCC::AL)
- return false;
+ if (SubAdd) {
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
// on CMP needs to be updated to be based on SUB.
+ // If we have ADD(r1, r2, X) and CMP(r1, r2), the condition code also
+ // needs to be modified.
// Push the condition code operands to OperandsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// operands will be modified.
- if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg) {
+ unsigned Opc = SubAdd->getOpcode();
+ bool IsSub = Opc == ARM::SUBrr || Opc == ARM::t2SUBrr ||
+ Opc == ARM::SUBri || Opc == ARM::t2SUBri;
+ if (!IsSub || (SrcReg2 != 0 && SubAdd->getOperand(1).getReg() == SrcReg2 &&
+ SubAdd->getOperand(2).getReg() == SrcReg)) {
// VSel doesn't support condition code update.
if (IsInstrVSel)
return false;
+ // Ensure we can swap the condition.
+ ARMCC::CondCodes NewCC = (IsSub ? getSwappedCondition(CC) : getCmpToAddCondition(CC));
+ if (NewCC == ARMCC::AL)
+ return false;
OperandsToUpdate.push_back(
std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
}
} else {
- // No Sub, so this is x = <op> y, z; cmp x, 0.
+ // No SubAdd, so this is x = <op> y, z; cmp x, 0.
switch (CC) {
case ARMCC::EQ: // Z
case ARMCC::NE: // Z
@@ -2874,6 +2952,23 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
return true;
}
+bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
+ // Do not sink MI if it might be used to optimize a redundant compare.
+ // We heuristically only look at the instruction immediately following MI to
+ // avoid potentially searching the entire basic block.
+ if (isPredicated(MI))
+ return true;
+ MachineBasicBlock::const_iterator Next = &MI;
+ ++Next;
+ unsigned SrcReg, SrcReg2;
+ int CmpMask, CmpValue;
+ if (Next != MI.getParent()->end() &&
+ analyzeCompare(*Next, SrcReg, SrcReg2, CmpMask, CmpValue) &&
+ isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI))
+ return false;
+ return true;
+}
+
bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg,
MachineRegisterInfo *MRI) const {
@@ -3467,8 +3562,8 @@ bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
}
unsigned
ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const {
- // ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops
- // (outs GPR:$wb), (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops)
+ // ins GPR:$Rn, $p (2xOp), reglist:$regs, variable_ops
+ // (outs GPR:$wb), (ins GPR:$Rn, $p (2xOp), reglist:$regs, variable_ops)
return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands();
}
@@ -4142,8 +4237,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD3d8Pseudo:
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
+ case ARM::VLD1d8TPseudo:
+ case ARM::VLD1d16TPseudo:
+ case ARM::VLD1d32TPseudo:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64TPseudoWB_fixed:
+ case ARM::VLD1d64TPseudoWB_register:
case ARM::VLD3d8Pseudo_UPD:
case ARM::VLD3d16Pseudo_UPD:
case ARM::VLD3d32Pseudo_UPD:
@@ -4159,8 +4258,28 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD4d8Pseudo:
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
+ case ARM::VLD1d8QPseudo:
+ case ARM::VLD1d16QPseudo:
+ case ARM::VLD1d32QPseudo:
case ARM::VLD1d64QPseudo:
case ARM::VLD1d64QPseudoWB_fixed:
+ case ARM::VLD1d64QPseudoWB_register:
+ case ARM::VLD1q8HighQPseudo:
+ case ARM::VLD1q8LowQPseudo_UPD:
+ case ARM::VLD1q8HighTPseudo:
+ case ARM::VLD1q8LowTPseudo_UPD:
+ case ARM::VLD1q16HighQPseudo:
+ case ARM::VLD1q16LowQPseudo_UPD:
+ case ARM::VLD1q16HighTPseudo:
+ case ARM::VLD1q16LowTPseudo_UPD:
+ case ARM::VLD1q32HighQPseudo:
+ case ARM::VLD1q32LowQPseudo_UPD:
+ case ARM::VLD1q32HighTPseudo:
+ case ARM::VLD1q32LowTPseudo_UPD:
+ case ARM::VLD1q64HighQPseudo:
+ case ARM::VLD1q64LowQPseudo_UPD:
+ case ARM::VLD1q64HighTPseudo:
+ case ARM::VLD1q64LowTPseudo_UPD:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
case ARM::VLD4d32Pseudo_UPD:
@@ -4191,12 +4310,30 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD2DUPd8wb_register:
case ARM::VLD2DUPd16wb_register:
case ARM::VLD2DUPd32wb_register:
+ case ARM::VLD2DUPq8EvenPseudo:
+ case ARM::VLD2DUPq8OddPseudo:
+ case ARM::VLD2DUPq16EvenPseudo:
+ case ARM::VLD2DUPq16OddPseudo:
+ case ARM::VLD2DUPq32EvenPseudo:
+ case ARM::VLD2DUPq32OddPseudo:
+ case ARM::VLD3DUPq8EvenPseudo:
+ case ARM::VLD3DUPq8OddPseudo:
+ case ARM::VLD3DUPq16EvenPseudo:
+ case ARM::VLD3DUPq16OddPseudo:
+ case ARM::VLD3DUPq32EvenPseudo:
+ case ARM::VLD3DUPq32OddPseudo:
case ARM::VLD4DUPd8Pseudo:
case ARM::VLD4DUPd16Pseudo:
case ARM::VLD4DUPd32Pseudo:
case ARM::VLD4DUPd8Pseudo_UPD:
case ARM::VLD4DUPd16Pseudo_UPD:
case ARM::VLD4DUPd32Pseudo_UPD:
+ case ARM::VLD4DUPq8EvenPseudo:
+ case ARM::VLD4DUPq8OddPseudo:
+ case ARM::VLD4DUPq16EvenPseudo:
+ case ARM::VLD4DUPq16OddPseudo:
+ case ARM::VLD4DUPq32EvenPseudo:
+ case ARM::VLD4DUPq32OddPseudo:
case ARM::VLD1LNq8Pseudo:
case ARM::VLD1LNq16Pseudo:
case ARM::VLD1LNq32Pseudo:
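The optimizeCompareInstr changes above let a CMPrr of two registers be folded into a flag-setting ADD of the same operands; when that happens the unsigned conditions HS and LO swap, the overflow conditions VS and VC are kept, and every other condition blocks the fold, which is exactly what getCmpToAddCondition encodes. A standalone sketch of that remapping, with a stand-in enum for ARMCC::CondCodes:

    // Stand-in for ARMCC::CondCodes, reduced to the cases the fold can keep.
    enum CondSketch { HS, LO, VS, VC, AL };

    // Mirror of getCmpToAddCondition above: when the flags come from
    // ADDS(a, b) instead of CMP(a, b), HS and LO swap, VS and VC are kept,
    // and AL signals "this condition cannot be rewritten".
    static CondSketch cmpToAddCondition(CondSketch CC) {
      switch (CC) {
      case HS: return LO;
      case LO: return HS;
      case VS: return VS;
      case VC: return VC;
      default: return AL;
      }
    }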
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index d375f40d6e14..b54be15097b1 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -201,6 +201,9 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
@@ -215,6 +218,8 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ bool shouldSink(const MachineInstr &MI) const override;
+
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 4b9a4376adf8..43e8b7d66c62 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -838,10 +838,10 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
auto AFI = MF->getInfo<ARMFunctionInfo>();
auto It = AFI->getCoalescedWeight(MBB);
- DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
- << It->second << "\n");
- DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
- << NewRCWeight.RegWeight << "\n");
+ LLVM_DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+ << It->second << "\n");
+ LLVM_DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+ << NewRCWeight.RegWeight << "\n");
// This number is the largest round number that which meets the criteria:
// (1) addresses PR18825
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index 5801e6bdbd0e..f755f66a0f3a 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -154,6 +154,7 @@ public:
void updateRegAllocHint(unsigned Reg, unsigned NewReg,
MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
bool hasBasePointer(const MachineFunction &MF) const;
@@ -200,7 +201,7 @@ public:
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
+ /// SrcRC and DstRC will be morphed into NewRC if this returns true
bool shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
index eab4b3b13f31..47f998b696f5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -43,6 +42,7 @@
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -469,7 +469,12 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (!MBB.empty())
MIRBuilder.setInstr(*MBB.begin());
- return handleAssignments(MIRBuilder, ArgInfos, ArgHandler);
+ if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
+ return false;
+
+ // Move back to the end of the basic block.
+ MIRBuilder.setMBB(MBB);
+ return true;
}
namespace {
@@ -521,7 +526,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg))
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
- *MIB.getInstr(), MIB->getDesc(), CalleeReg, 0));
+ *MIB.getInstr(), MIB->getDesc(), Callee, 0));
}
SmallVector<ArgInfo, 8> ArgInfos;
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
index 284b67fd59b6..63bf48abb7ac 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -217,12 +217,15 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
break;
}
+ case MVT::f16:
case MVT::f32:
RegList = SRegList;
break;
+ case MVT::v4f16:
case MVT::f64:
RegList = DRegList;
break;
+ case MVT::v8f16:
case MVT::v2f64:
RegList = QRegList;
break;
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
index dcfd6518a840..f173e423f3e4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -160,8 +160,8 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[R12]>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -176,8 +176,8 @@ def CC_ARM_AAPCS : CallingConv<[
def RetCC_ARM_AAPCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -187,6 +187,7 @@ def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@@ -200,8 +201,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -221,8 +222,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -233,7 +234,7 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
- S9, S10, S11, S12, S13, S14, S15]>>,
+ S9, S10, S11, S12, S13, S14, S15]>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
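The added rows teach the AAPCS calling conventions to treat half-precision vectors like the integer vectors of the same width, bit-converting v4f16 to f64 and v8f16 to v2f64 so they are passed and returned in D and Q registers. A rough illustration, assuming a toolchain where arm_neon.h provides half-precision vector types:

    #include <arm_neon.h>

    // With the rows above, a 4 x half vector is bit-converted to f64, so both
    // arguments arrive in d0/d1 and the result comes back in d0 under
    // AAPCS-VFP, just like the existing v4i16 case.
    float16x4_t pick_first(float16x4_t a, float16x4_t b) {
      (void)b;
      return a;
    }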
diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
new file mode 100644
index 000000000000..24071277427a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -0,0 +1,750 @@
+//===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass inserts intrinsics to handle small types that would otherwise be
+/// promoted during legalization. Here we can manually promote types or insert
+/// intrinsics which can handle narrow types that aren't supported by the
+/// register classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+
+#define DEBUG_TYPE "arm-codegenprepare"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(true),
+ cl::desc("Disable ARM specific CodeGenPrepare pass"));
+
+static cl::opt<bool>
+EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false),
+ cl::desc("Use DSP instructions for scalar operations"));
+
+static cl::opt<bool>
+EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
+ cl::desc("Use DSP instructions for scalar operations\
+ with immediate operands"));
+
+namespace {
+
+class IRPromoter {
+ SmallPtrSet<Value*, 8> NewInsts;
+ SmallVector<Instruction*, 4> InstsToRemove;
+ Module *M = nullptr;
+ LLVMContext &Ctx;
+
+public:
+ IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+
+ void Cleanup() {
+ for (auto *I : InstsToRemove) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
+ I->dropAllReferences();
+ I->eraseFromParent();
+ }
+ InstsToRemove.clear();
+ NewInsts.clear();
+ }
+
+ void Mutate(Type *OrigTy,
+ SmallPtrSetImpl<Value*> &Visited,
+ SmallPtrSetImpl<Value*> &Leaves,
+ SmallPtrSetImpl<Instruction*> &Roots);
+};
+
+class ARMCodeGenPrepare : public FunctionPass {
+ const ARMSubtarget *ST = nullptr;
+ IRPromoter *Promoter = nullptr;
+ std::set<Value*> AllVisited;
+ Type *OrigTy = nullptr;
+ unsigned TypeSize = 0;
+
+ bool isNarrowInstSupported(Instruction *I);
+ bool isSupportedValue(Value *V);
+ bool isLegalToPromote(Value *V);
+ bool TryToPromote(Value *V);
+
+public:
+ static char ID;
+
+ ARMCodeGenPrepare() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
+
+ StringRef getPassName() const override { return "ARM IR optimizations"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ bool doFinalization(Module &M) override;
+};
+
+}
+
+/// Can the given value generate sign bits.
+static bool isSigned(Value *V) {
+ if (!isa<Instruction>(V))
+ return false;
+
+ unsigned Opc = cast<Instruction>(V)->getOpcode();
+ return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
+ Opc == Instruction::SRem;
+}
+
+/// Some instructions can use 8- and 16-bit operands, and we don't need to
+/// promote anything larger. We disallow booleans to make life easier when
+/// dealing with icmps but allow any other integer that is <= 16 bits. Void
+/// types are accepted so we can handle switches.
+static bool isSupportedType(Value *V) {
+ if (V->getType()->isVoidTy())
+ return true;
+
+ const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType());
+ if (!IntTy)
+ return false;
+
+ // Don't try to promote boolean values.
+ if (IntTy->getBitWidth() == 1)
+ return false;
+
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ return isSupportedType(ZExt->getOperand(0));
+
+ return IntTy->getBitWidth() <= 16;
+}
+
+/// Return true if V will require any promoted values to be truncated for the
+/// use to be valid.
+static bool isSink(Value *V) {
+ auto UsesNarrowValue = [](Value *V) {
+ return V->getType()->getScalarSizeInBits() <= 32;
+ };
+
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return UsesNarrowValue(Store->getValueOperand());
+ if (auto *Return = dyn_cast<ReturnInst>(V))
+ return UsesNarrowValue(Return->getReturnValue());
+
+ return isa<CallInst>(V);
+}
+
+/// Return true if the given value is a leaf that will need to be zext'd.
+static bool isSource(Value *V) {
+ if (isa<Argument>(V) && isSupportedType(V))
+ return true;
+ else if (isa<TruncInst>(V))
+ return true;
+ else if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ // ZExt can be a leaf if it's the only user of a load.
+ return isa<LoadInst>(ZExt->getOperand(0)) &&
+ ZExt->getOperand(0)->hasOneUse();
+ else if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Load = dyn_cast<LoadInst>(V)) {
+ if (!isa<IntegerType>(Load->getType()))
+ return false;
+ // A load is a leaf, unless it's already just being zext'd.
+ if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin()))
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
+/// Return whether the instruction can be promoted without any modifications to
+/// its operands or result.
+static bool isSafeOverflow(Instruction *I) {
+ if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
+ return true;
+
+ unsigned Opc = I->getOpcode();
+ if (Opc == Instruction::Add || Opc == Instruction::Sub) {
+ // We don't care if the add or sub could wrap if the value is decreasing
+ // and is only being used by an unsigned compare.
+ if (!I->hasOneUse() ||
+ !isa<ICmpInst>(*I->user_begin()) ||
+ !isa<ConstantInt>(I->getOperand(1)))
+ return false;
+
+ auto *CI = cast<ICmpInst>(*I->user_begin());
+ if (CI->isSigned())
+ return false;
+
+ bool NegImm = cast<ConstantInt>(I->getOperand(1))->isNegative();
+ bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
+ ((Opc == Instruction::Add) && NegImm);
+ if (!IsDecreasing)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
+ return true;
+ }
+
+ // Otherwise, if an instruction is using a negative immediate we will need
+ // to fix it up during the promotion.
+ for (auto &Op : I->operands()) {
+ if (auto *Const = dyn_cast<ConstantInt>(Op))
+ if (Const->isNegative())
+ return false;
+ }
+ return false;
+}
+
+static bool shouldPromote(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<IntegerType>(V->getType()))
+ return false;
+
+ if (isa<StoreInst>(I) || isa<TerminatorInst>(I) || isa<TruncInst>(I) ||
+ isa<ICmpInst>(I))
+ return false;
+
+ if (auto *ZExt = dyn_cast<ZExtInst>(I))
+ return !ZExt->getDestTy()->isIntegerTy(32);
+
+ return true;
+}
+
+/// Return whether we can safely mutate V's type to ExtTy without having to be
+/// concerned with zero extending or truncation.
+static bool isPromotedResultSafe(Value *V) {
+ if (!isa<Instruction>(V))
+ return true;
+
+ if (isSigned(V))
+ return false;
+
+ // If I is only being used by something that will require its value to be
+ // truncated, then we don't care about the promoted result.
+ auto *I = cast<Instruction>(V);
+ if (I->hasOneUse() && isSink(*I->use_begin()))
+ return true;
+
+ if (isa<OverflowingBinaryOperator>(I))
+ return isSafeOverflow(I);
+ return true;
+}
+
+/// Return the intrinsic for the instruction that can perform the same
+/// operation but on a narrow type. This uses the parallel DSP intrinsics
+/// on scalar values.
+static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
+ // Whether we use the signed or unsigned versions of these intrinsics
+ // doesn't matter because we're not using the GE bits that they set in
+ // the APSR.
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Add:
+ return TypeSize == 16 ? Intrinsic::arm_uadd16 :
+ Intrinsic::arm_uadd8;
+ case Instruction::Sub:
+ return TypeSize == 16 ? Intrinsic::arm_usub16 :
+ Intrinsic::arm_usub8;
+ }
+ llvm_unreachable("unhandled opcode for narrow intrinsic");
+}
+
+void IRPromoter::Mutate(Type *OrigTy,
+ SmallPtrSetImpl<Value*> &Visited,
+ SmallPtrSetImpl<Value*> &Leaves,
+ SmallPtrSetImpl<Instruction*> &Roots) {
+ IRBuilder<> Builder{Ctx};
+ Type *ExtTy = Type::getInt32Ty(M->getContext());
+ unsigned TypeSize = OrigTy->getPrimitiveSizeInBits();
+ SmallPtrSet<Value*, 8> Promoted;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize
+ << " to 32-bits\n");
+
+ auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
+ SmallVector<Instruction*, 4> Users;
+ Instruction *InstTo = dyn_cast<Instruction>(To);
+ for (Use &U : From->uses()) {
+ auto *User = cast<Instruction>(U.getUser());
+ if (InstTo && User->isIdenticalTo(InstTo))
+ continue;
+ Users.push_back(User);
+ }
+
+ for (auto &U : Users)
+ U->replaceUsesOfWith(From, To);
+ };
+
+ auto FixConst = [&](ConstantInt *Const, Instruction *I) {
+ Constant *NewConst = nullptr;
+ if (isSafeOverflow(I)) {
+ NewConst = (Const->isNegative()) ?
+ ConstantExpr::getSExt(Const, ExtTy) :
+ ConstantExpr::getZExt(Const, ExtTy);
+ } else {
+ uint64_t NewVal = *Const->getValue().getRawData();
+ if (Const->getType() == Type::getInt16Ty(Ctx))
+ NewVal &= 0xFFFF;
+ else
+ NewVal &= 0xFF;
+ NewConst = ConstantInt::get(ExtTy, NewVal);
+ }
+ I->replaceUsesOfWith(Const, NewConst);
+ };
+
+ auto InsertDSPIntrinsic = [&](Instruction *I) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+ << *I << "\n");
+ Function *DSPInst =
+ Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize));
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+ CallInst *Call = Builder.CreateCall(DSPInst, Args);
+ ReplaceAllUsersOfWith(I, Call);
+ InstsToRemove.push_back(I);
+ NewInsts.insert(Call);
+ };
+
+ auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
+ Builder.SetInsertPoint(InsertPt);
+ if (auto *I = dyn_cast<Instruction>(V))
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
+ if (isa<Argument>(V))
+ ZExt->moveBefore(InsertPt);
+ else
+ ZExt->moveAfter(InsertPt);
+ ReplaceAllUsersOfWith(V, ZExt);
+ NewInsts.insert(ZExt);
+ };
+
+ // First, insert extending instructions between the leaves and their users.
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n");
+ for (auto V : Leaves) {
+ LLVM_DEBUG(dbgs() << " - " << *V << "\n");
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ ZExt->mutateType(ExtTy);
+ else if (auto *I = dyn_cast<Instruction>(V))
+ InsertZExt(I, I);
+ else if (auto *Arg = dyn_cast<Argument>(V)) {
+ BasicBlock &BB = Arg->getParent()->front();
+ InsertZExt(Arg, &*BB.getFirstInsertionPt());
+ } else {
+ llvm_unreachable("unhandled leaf that needs extending");
+ }
+ Promoted.insert(V);
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
+ // Then mutate the types of the instructions within the tree. Here we handle
+ // constant operands.
+ for (auto *V : Visited) {
+ if (Leaves.count(V))
+ continue;
+
+ if (!isa<Instruction>(V))
+ continue;
+
+ auto *I = cast<Instruction>(V);
+ if (Roots.count(I))
+ continue;
+
+ for (auto &U : I->operands()) {
+ if ((U->getType() == ExtTy) || !isSupportedType(&*U))
+ continue;
+
+ if (auto *Const = dyn_cast<ConstantInt>(&*U))
+ FixConst(Const, I);
+ else if (isa<UndefValue>(&*U))
+ U->mutateType(ExtTy);
+ }
+
+ if (shouldPromote(I)) {
+ I->mutateType(ExtTy);
+ Promoted.insert(I);
+ }
+ }
+
+ // Now we need to remove any zexts that have become unnecessary, as well
+ // as insert any intrinsics.
+ for (auto *V : Visited) {
+ if (Leaves.count(V))
+ continue;
+ if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+ if (ZExt->getDestTy() != ExtTy) {
+ ZExt->mutateType(ExtTy);
+ Promoted.insert(ZExt);
+ }
+ else if (ZExt->getSrcTy() == ExtTy) {
+ ReplaceAllUsersOfWith(V, ZExt->getOperand(0));
+ InstsToRemove.push_back(ZExt);
+ }
+ continue;
+ }
+
+ if (!shouldPromote(V) || isPromotedResultSafe(V))
+ continue;
+
+ // Replace unsafe instructions with appropriate intrinsic calls.
+ InsertDSPIntrinsic(cast<Instruction>(V));
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the roots:\n");
+ // Fix up any stores or returns that use the results of the promoted
+ // chain.
+ for (auto I : Roots) {
+ LLVM_DEBUG(dbgs() << " - " << *I << "\n");
+ Type *TruncTy = OrigTy;
+ if (auto *Store = dyn_cast<StoreInst>(I)) {
+ auto *PtrTy = cast<PointerType>(Store->getPointerOperandType());
+ TruncTy = PtrTy->getElementType();
+ } else if (isa<ReturnInst>(I)) {
+ Function *F = I->getParent()->getParent();
+ TruncTy = F->getFunctionType()->getReturnType();
+ }
+
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ Value *V = I->getOperand(i);
+ if (Promoted.count(V) || NewInsts.count(V)) {
+ if (auto *Op = dyn_cast<Instruction>(V)) {
+
+ if (auto *Call = dyn_cast<CallInst>(I))
+ TruncTy = Call->getFunctionType()->getParamType(i);
+
+ if (TruncTy == ExtTy)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy
+ << " Trunc for " << *Op << "\n");
+ Builder.SetInsertPoint(Op);
+ auto *Trunc = cast<Instruction>(Builder.CreateTrunc(Op, TruncTy));
+ Trunc->moveBefore(I);
+ I->setOperand(i, Trunc);
+ NewInsts.insert(Trunc);
+ }
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n");
+}
+
+bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
+ if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
+ return false;
+
+ if (ST->isThumb() && !ST->hasThumb2())
+ return false;
+
+ if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
+ return false;
+
+ // TODO
+ // Would it be profitable? For Thumb code, these parallel DSP instructions
+ // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
+ // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
+ // halved. They also do not take immediates as operands.
+ for (auto &Op : I->operands()) {
+ if (isa<Constant>(Op)) {
+ if (!EnableDSPWithImms)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// We accept most instructions, as well as Arguments and ConstantInts. We
+/// disallow casts other than zext and trunc, and only allow calls if their
+/// return value is zeroext. We don't allow opcodes that can introduce sign
+/// bits.
+bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n");
+
+ // Non-instruction values that we can handle.
+ if (isa<ConstantInt>(V) || isa<Argument>(V))
+ return true;
+
+ // Memory instructions
+ if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<GetElementPtrInst>(V))
+ return true;
+
+ // Branches and targets.
+ if (auto *ICmp = dyn_cast<ICmpInst>(V))
+ return ICmp->isEquality() || !ICmp->isSigned();
+
+ if (isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
+ return true;
+
+ if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V))
+ return true;
+
+ // Special cases for calls as we need to check for zeroext
+ // TODO We should accept calls even if they don't have zeroext, as they can
+ // still be roots.
+ if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Cast = dyn_cast<CastInst>(V)) {
+ if (isa<ZExtInst>(Cast))
+ return Cast->getDestTy()->getScalarSizeInBits() <= 32;
+ else if (auto *Trunc = dyn_cast<TruncInst>(V))
+ return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize;
+ else {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n");
+ return false;
+ }
+ } else if (!isa<BinaryOperator>(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n");
+ return false;
+ }
+
+ bool res = !isSigned(V);
+ if (!res)
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, it's a signed instruction.\n");
+ return res;
+}
+
+/// Check that the type of V would be promoted and that the original type is
+/// smaller than the targeted promoted type. Check that we're not trying to
+/// promote something larger than our base 'TypeSize' type.
+bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
+ if (!isSupportedType(V))
+ return false;
+
+ unsigned VSize = 0;
+ if (auto *Ld = dyn_cast<LoadInst>(V)) {
+ auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType());
+ VSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
+ } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+ VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ } else {
+ VSize = V->getType()->getPrimitiveSizeInBits();
+ }
+
+ if (VSize > TypeSize)
+ return false;
+
+ if (isPromotedResultSafe(V))
+ return true;
+
+ if (auto *I = dyn_cast<Instruction>(V))
+ return isNarrowInstSupported(I);
+
+ return false;
+}
+
+bool ARMCodeGenPrepare::TryToPromote(Value *V) {
+ OrigTy = V->getType();
+ TypeSize = OrigTy->getPrimitiveSizeInBits();
+
+ if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n");
+
+ SetVector<Value*> WorkList;
+ SmallPtrSet<Value*, 8> Leaves;
+ SmallPtrSet<Instruction*, 4> Roots;
+ WorkList.insert(V);
+ SmallPtrSet<Value*, 16> CurrentVisited;
+ CurrentVisited.clear();
+
+ // Return true if the given value can, or has been, visited. Add V to the
+ // worklist if needed.
+ auto AddLegalInst = [&](Value *V) {
+ if (CurrentVisited.count(V))
+ return true;
+
+ if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
+ return false;
+ }
+
+ WorkList.insert(V);
+ return true;
+ };
+
+ // Iterate through, and add to, a tree of operands and users in the use-def.
+ while (!WorkList.empty()) {
+ Value *V = WorkList.back();
+ WorkList.pop_back();
+ if (CurrentVisited.count(V))
+ continue;
+
+ if (!isa<Instruction>(V) && !isSource(V))
+ continue;
+
+ // If we've already visited this value from somewhere, bail now because
+ // the tree has already been explored.
+ // TODO: This could limit the transform, i.e. if we try to promote something
+ // from an i8 and fail first, before trying an i16.
+ if (AllVisited.count(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Already visited this: " << *V << "\n");
+ return false;
+ }
+
+ CurrentVisited.insert(V);
+ AllVisited.insert(V);
+
+ // Calls can be both sources and sinks.
+ if (isSink(V))
+ Roots.insert(cast<Instruction>(V));
+ if (isSource(V))
+ Leaves.insert(V);
+ else if (auto *I = dyn_cast<Instruction>(V)) {
+ // Visit operands of any instruction visited.
+ for (auto &U : I->operands()) {
+ if (!AddLegalInst(U))
+ return false;
+ }
+ }
+
+ // Don't visit users of a node which isn't going to be mutated unless it's a
+ // source.
+ if (isSource(V) || shouldPromote(V)) {
+ for (Use &U : V->uses()) {
+ if (!AddLegalInst(U.getUser()))
+ return false;
+ }
+ }
+ }
+
+ unsigned NumToPromote = 0;
+ unsigned Cost = 0;
+ for (auto *V : CurrentVisited) {
+ // Truncs will cause a uxt, and arguments without zeroext will often
+ // require a uxt somewhere.
+ if (isa<TruncInst>(V))
+ ++Cost;
+ else if (auto *Arg = dyn_cast<Argument>(V)) {
+ if (!Arg->hasZExtAttr())
+ ++Cost;
+ }
+
+ // Mem ops can automatically be extended/truncated and non-instructions
+ // don't need anything done.
+ if (Leaves.count(V) || isa<StoreInst>(V) || !isa<Instruction>(V))
+ continue;
+
+ // Will need to truncate calls args and returns.
+ if (Roots.count(cast<Instruction>(V))) {
+ ++Cost;
+ continue;
+ }
+
+ if (shouldPromote(V))
+ ++NumToPromote;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
+ for (auto *I : CurrentVisited)
+ I->dump();
+ );
+ LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote
+ << " instructions = " << Cost << "\n");
+ if (Cost > NumToPromote || (NumToPromote == 0))
+ return false;
+
+ Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots);
+ return true;
+}
+
+bool ARMCodeGenPrepare::doInitialization(Module &M) {
+ Promoter = new IRPromoter(&M);
+ return false;
+}
+
+bool ARMCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F) || DisableCGP)
+ return false;
+
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ ST = &TM.getSubtarget<ARMSubtarget>(F);
+ bool MadeChange = false;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n");
+
+ // Search up from icmps to try to promote their operands.
+ for (BasicBlock &BB : F) {
+ auto &Insts = BB.getInstList();
+ for (auto &I : Insts) {
+ if (AllVisited.count(&I))
+ continue;
+
+ if (isa<ICmpInst>(I)) {
+ auto &CI = cast<ICmpInst>(I);
+
+ // Skip signed or pointer compares
+ if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
+ for (auto &Op : CI.operands()) {
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ if (isa<ZExtInst>(I))
+ MadeChange |= TryToPromote(I->getOperand(0));
+ else
+ MadeChange |= TryToPromote(I);
+ }
+ }
+ }
+ }
+ Promoter->Cleanup();
+ LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
+ dbgs();
+ report_fatal_error("Broken function after type promotion");
+ });
+ }
+ if (MadeChange)
+ LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n");
+
+ return MadeChange;
+}
+
+bool ARMCodeGenPrepare::doFinalization(Module &M) {
+ delete Promoter;
+ return false;
+}
+
+INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE,
+ "ARM IR optimizations", false, false)
+INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
+ false, false)
+
+char ARMCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createARMCodeGenPreparePass() {
+ return new ARMCodeGenPrepare();
+}
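As a rough illustration of the cost/benefit rule applied in TryToPromote above (one extension charged per trunc or non-zeroext argument, one per call boundary, weighed against the number of narrow instructions that would be widened), the following standalone sketch restates it over a hypothetical Node type. It is a simplification for illustration only; the real pass walks llvm::Value use-def chains, not this toy structure.

    #include <vector>

    // Hypothetical stand-in for one value in the promotion chain.
    struct Node {
      bool IsTrunc = false;        // truncs force an explicit extension (uxt)
      bool IsArgument = false;
      bool HasZExtAttr = false;    // zeroext arguments arrive already extended
      bool IsCallBoundary = false; // call args/returns must be truncated back
      bool WouldBePromoted = false;
    };

    // Roughly mirrors the Cost/NumToPromote bookkeeping above: promotion only
    // goes ahead when it widens at least one instruction and does not
    // introduce more extensions than it removes.
    static bool worthPromoting(const std::vector<Node> &Chain) {
      unsigned Cost = 0, NumToPromote = 0;
      for (const Node &N : Chain) {
        if (N.IsTrunc || (N.IsArgument && !N.HasZExtAttr))
          ++Cost;
        if (N.IsCallBoundary)
          ++Cost;
        else if (N.WouldBePromoted)
          ++NumToPromote;
      }
      return NumToPromote != 0 && Cost <= NumToPromote;
    }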
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 8baee1ce281d..de08eb8c6985 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -301,7 +302,7 @@ void ARMConstantIslands::verify() {
return BBInfo[LHS.getNumber()].postOffset() <
BBInfo[RHS.getNumber()].postOffset();
}));
- DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
+ LLVM_DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
CPUser &U = CPUsers[i];
unsigned UserOffset = getUserOffset(U);
@@ -309,12 +310,12 @@ void ARMConstantIslands::verify() {
// adjustment.
if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp()+2, U.NegOk,
/* DoDump = */ true)) {
- DEBUG(dbgs() << "OK\n");
+ LLVM_DEBUG(dbgs() << "OK\n");
continue;
}
- DEBUG(dbgs() << "Out of range.\n");
+ LLVM_DEBUG(dbgs() << "Out of range.\n");
dumpBBs();
- DEBUG(MF->dump());
+ LLVM_DEBUG(MF->dump());
llvm_unreachable("Constant pool entry out of range!");
}
#endif
@@ -323,7 +324,7 @@ void ARMConstantIslands::verify() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
- DEBUG({
+ LLVM_DEBUG({
for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
const BasicBlockInfo &BBI = BBInfo[J];
dbgs() << format("%08x %bb.%u\t", BBI.Offset, J)
@@ -340,9 +341,9 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MCP = mf.getConstantPool();
- DEBUG(dbgs() << "***** ARMConstantIslands: "
- << MCP->getConstants().size() << " CP entries, aligned to "
- << MCP->getConstantPoolAlignment() << " bytes *****\n");
+ LLVM_DEBUG(dbgs() << "***** ARMConstantIslands: "
+ << MCP->getConstants().size() << " CP entries, aligned to "
+ << MCP->getConstantPoolAlignment() << " bytes *****\n");
STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
TII = STI->getInstrInfo();
@@ -393,7 +394,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// constant pool users.
initializeFunctionInfo(CPEMIs);
CPEMIs.clear();
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
// Functions with jump tables need an alignment of 4 because they use the ADR
// instruction, which aligns the PC to 4 bytes before adding an offset.
@@ -407,7 +408,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// is no change.
unsigned NoCPIters = 0, NoBRIters = 0;
while (true) {
- DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
// For most inputs, it converges in no more than 5 iterations.
@@ -416,19 +417,19 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
CPChange |= handleConstantPoolUser(i, NoCPIters >= CPMaxIteration / 2);
if (CPChange && ++NoCPIters > CPMaxIteration)
report_fatal_error("Constant Island pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
// Clear NewWaterList now. If we split a block for branches, it should
// appear as "new water" for the next iteration of constant pool placement.
NewWaterList.clear();
- DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
bool BRChange = false;
for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
BRChange |= fixupImmediateBr(ImmBranches[i]);
if (BRChange && ++NoBRIters > 30)
report_fatal_error("Branch Fix Up pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
if (!CPChange && !BRChange)
break;
@@ -464,7 +465,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
}
}
- DEBUG(dbgs() << '\n'; dumpBBs());
+ LLVM_DEBUG(dbgs() << '\n'; dumpBBs());
BBInfo.clear();
WaterList.clear();
@@ -479,7 +480,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
return MadeChange;
}
-/// \brief Perform the initial placement of the regular constant pool entries.
+/// Perform the initial placement of the regular constant pool entries.
/// To start with, we put them all at the end of the function.
void
ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) {
@@ -510,7 +511,6 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
- assert(Size >= 4 && "Too small constant pool entry");
unsigned Align = CPs[i].getAlignment();
assert(isPowerOf2_32(Align) && "Invalid alignment");
// Verify that all constant pool entries are a multiple of their alignment.
@@ -534,13 +534,13 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// Add a new CPEntry, but no corresponding CPUser yet.
CPEntries.emplace_back(1, CPEntry(CPEMI, i));
++NumCPEs;
- DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
- << Size << ", align = " << Align <<'\n');
+ LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+ << Size << ", align = " << Align << '\n');
}
- DEBUG(BB->dump());
+ LLVM_DEBUG(BB->dump());
}
-/// \brief Do initial placement of the jump tables. Because Thumb2's TBB and TBH
+/// Do initial placement of the jump tables. Because Thumb2's TBB and TBH
/// instructions can be made more efficient if the jump table immediately
/// follows the instruction, it's best to place them immediately next to their
/// jumps to begin with. In almost all cases they'll never be moved from that
@@ -701,7 +701,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
WaterList.push_back(&MBB);
for (MachineInstr &I : MBB) {
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
unsigned Opc = I.getOpcode();
@@ -820,6 +820,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
Scale = 4; // +-(offset_8*4)
NegOk = true;
break;
+ case ARM::VLDRH:
+ Bits = 8;
+ Scale = 2; // +-(offset_8*2)
+ NegOk = true;
+ break;
case ARM::tLDRHi:
Bits = 5;
@@ -1066,7 +1071,7 @@ bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
unsigned CPEOffset = getOffsetOf(CPEMI);
if (DoDump) {
- DEBUG({
+ LLVM_DEBUG({
unsigned Block = MI->getParent()->getNumber();
const BasicBlockInfo &BBI = BBInfo[Block];
dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
@@ -1159,7 +1164,7 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) {
// Check to see if the CPE is already in-range.
if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
true)) {
- DEBUG(dbgs() << "In range\n");
+ LLVM_DEBUG(dbgs() << "In range\n");
return 1;
}
@@ -1175,8 +1180,8 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) {
continue;
if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
U.NegOk)) {
- DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -1261,8 +1266,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// This is the least amount of required padding seen so far.
BestGrowth = Growth;
WaterIter = IP;
- DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
- << " Growth=" << Growth << '\n');
+ LLVM_DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
+ << " Growth=" << Growth << '\n');
if (CloserWater && WaterBB == U.MI->getParent())
return true;
@@ -1305,8 +1310,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
- DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
- << format(", expected CPE offset %#x\n", CPEOffset));
+ LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
+ << format(", expected CPE offset %#x\n", CPEOffset));
NewMBB = &*++UserMBB->getIterator();
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
@@ -1349,18 +1354,17 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned KnownBits = UserBBI.internalKnownBits();
unsigned UPad = UnknownPadding(LogAlign, KnownBits);
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
- DEBUG(dbgs() << format("Split in middle of big block before %#x",
- BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
// The 4 in the following is for the unconditional branch we'll be inserting
// (allows for long branch on Thumb1). Alignment of the island is handled
// inside isOffsetInRange.
BaseInsertOffset -= 4;
- DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign
- << " kb=" << KnownBits
- << " up=" << UPad << '\n');
+ LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << LogAlign << " kb=" << KnownBits
+ << " up=" << UPad << '\n');
// This could point off the end of the block if we've already got constant
// pool entries following this block; only the last one is in the water list.
@@ -1373,7 +1377,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset =
std::max(UserBBI.postOffset() - UPad - 8,
UserOffset + TII->getInstSizeInBytes(*UserMI) + 1);
- DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
CPEMI->getOperand(2).getImm();
@@ -1417,8 +1421,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
}
// We really must not split an IT block.
- DEBUG(unsigned PredReg;
- assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
+ LLVM_DEBUG(unsigned PredReg; assert(
+ !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
NewMBB = splitBlockBeforeInstr(&*MI);
}
@@ -1452,7 +1456,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
MachineBasicBlock *NewMBB;
water_iterator IP;
if (findAvailableWater(U, UserOffset, IP, CloserWater)) {
- DEBUG(dbgs() << "Found water in range\n");
+ LLVM_DEBUG(dbgs() << "Found water in range\n");
MachineBasicBlock *WaterBB = *IP;
// If the original WaterList entry was "new water" on this iteration,
@@ -1465,7 +1469,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
NewMBB = &*++WaterBB->getIterator();
} else {
// No water found.
- DEBUG(dbgs() << "No water found\n");
+ LLVM_DEBUG(dbgs() << "No water found\n");
createNewWater(CPUserIndex, UserOffset, NewMBB);
// splitBlockBeforeInstr adds to WaterList, which is important when it is
@@ -1481,6 +1485,12 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
// We are adding new water. Update NewWaterList.
NewWaterList.insert(NewIsland);
}
+ // Always align the new block because CP entries can be smaller than 4
+ // bytes. Be careful not to decrease the existing alignment, e.g. NewMBB may
+ // be an already aligned constant pool block.
+ const unsigned Align = isThumb ? 1 : 2;
+ if (NewMBB->getAlignment() < Align)
+ NewMBB->setAlignment(Align);
// Remove the original WaterList entry; we want subsequent insertions in
// this vicinity to go after the one we're about to insert. This
@@ -1522,8 +1532,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
break;
}
- DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
- << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+ LLVM_DEBUG(
+ dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
return true;
}
@@ -1578,11 +1589,11 @@ bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
unsigned BrOffset = getOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
- DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
- << " from " << printMBBReference(*MI->getParent())
- << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
- << " to " << DestOffset << " offset "
- << int(DestOffset - BrOffset) << "\t" << *MI);
+ LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
+ << " from " << printMBBReference(*MI->getParent())
+ << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
if (BrOffset <= DestOffset) {
// Branch before the Dest.
@@ -1629,7 +1640,7 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
HasFarJump = true;
++NumUBrFixed;
- DEBUG(dbgs() << " Changed B to long jump " << *MI);
+ LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI);
return true;
}
@@ -1673,8 +1684,9 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
// b L1
MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
- DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
- << *BMI);
+ LLVM_DEBUG(
+ dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
BMI->getOperand(0).setMBB(DestBB);
MI->getOperand(0).setMBB(NewDest);
MI->getOperand(1).setImm(CC);
@@ -1700,9 +1712,9 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
}
MachineBasicBlock *NextBB = &*++MBB->getIterator();
- DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
- << " also invert condition and change dest. to "
- << printMBBReference(*NextBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
+ << " also invert condition and change dest. to "
+ << printMBBReference(*NextBB) << "\n");
// Insert a new conditional branch and a new unconditional branch.
// Also update the ImmBranch as well as adding a new entry for the new branch.
@@ -1795,7 +1807,7 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
// FIXME: Check if offset is multiple of scale if scale is not 4.
if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
- DEBUG(dbgs() << "Shrink: " << *U.MI);
+ LLVM_DEBUG(dbgs() << "Shrink: " << *U.MI);
U.MI->setDesc(TII->get(NewOpc));
MachineBasicBlock *MBB = U.MI->getParent();
BBInfo[MBB->getNumber()].Size -= 2;
@@ -1839,7 +1851,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
- DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
+ LLVM_DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
Br.MI->setDesc(TII->get(NewOpc));
MachineBasicBlock *MBB = Br.MI->getParent();
BBInfo[MBB->getNumber()].Size -= 2;
@@ -1883,7 +1895,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
CmpMI->getOperand(1).getImm() == 0 &&
isARMLowRegister(Reg)) {
MachineBasicBlock *MBB = Br.MI->getParent();
- DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+ LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
MachineInstr *NewBR =
BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
.addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
@@ -1918,7 +1930,7 @@ static bool isSimpleIndexCalc(MachineInstr &I, unsigned EntryReg,
return true;
}
-/// \brief While trying to form a TBB/TBH instruction, we may (if the table
+/// While trying to form a TBB/TBH instruction, we may (if the table
/// doesn't immediately follow the BR_JT) need access to the start of the
/// jump-table. We know one instruction that produces such a register; this
/// function works out whether that definition can be preserved to the BR_JT,
@@ -2006,7 +2018,7 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
return true;
}
-/// \brief Returns whether CPEMI is the first instruction in the block
+/// Returns whether CPEMI is the first instruction in the block
/// immediately following JTMI (assumed to be a TBB or TBH terminator). If so,
/// we can switch the first register to PC and usually remove the address
/// calculation that preceded it.
@@ -2052,7 +2064,7 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
}
}
- DEBUG(dbgs() << "Removing Dead Add: " << *RemovableAdd);
+ LLVM_DEBUG(dbgs() << "Removing Dead Add: " << *RemovableAdd);
RemovableAdd->eraseFromParent();
DeadSize += 4;
}
@@ -2198,7 +2210,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
DeadSize += 4;
}
- DEBUG(dbgs() << "Shrink JT: " << *MI);
+ LLVM_DEBUG(dbgs() << "Shrink JT: " << *MI);
MachineInstr *CPEMI = User.CPEMI;
unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
if (!isThumb2)
@@ -2212,7 +2224,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
.addReg(IdxReg, getKillRegState(IdxRegKill))
.addJumpTableIndex(JTI, JTOP.getTargetFlags())
.addImm(CPEMI->getOperand(0).getImm());
- DEBUG(dbgs() << printMBBReference(*MBB) << ": " << *NewJTMI);
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ": " << *NewJTMI);
unsigned JTOpc = ByteOk ? ARM::JUMPTABLE_TBB : ARM::JUMPTABLE_TBH;
CPEMI->setDesc(TII->get(JTOpc));
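The new VLDRH entry above (Bits = 8, Scale = 2, NegOk = true) plugs into the same reach computation the pass already uses for its other constant-pool users: the maximum displacement is the largest encodable offset field times its scale, reachable in both directions when NegOk is set. A small standalone check of that arithmetic, assuming the table fields mean what their comments say:

    #include <cassert>
    #include <cstdint>

    // Maximum reach, in bytes, of a constant-pool user whose offset field has
    // 'Bits' bits scaled by 'Scale' bytes.
    static uint64_t maxDisplacement(unsigned Bits, unsigned Scale) {
      return ((1ULL << Bits) - 1) * Scale;
    }

    int main() {
      // VLDRH: 8-bit offset scaled by 2 -> +-510 bytes around the load.
      assert(maxDisplacement(8, 2) == 510);
      // Existing 8-bit, scale-4 entries (e.g. VLDRS/VLDRD) -> +-1020 bytes.
      assert(maxDisplacement(8, 4) == 1020);
      return 0;
    }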
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
index 39ae02af513b..236c4fab2a5c 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -14,6 +14,7 @@
#include "ARMConstantPoolValue.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index b14b2c6a813f..5dac6ec0b799 100644
--- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -110,6 +110,9 @@ namespace {
// OddDblSpc depending on the lane number operand.
enum NEONRegSpacing {
SingleSpc,
+ SingleLowSpc, // Single spacing, low registers, three and four vectors.
+ SingleHighQSpc, // Single spacing, high registers, four vectors.
+ SingleHighTSpc, // Single spacing, high registers, three vectors.
EvenDblSpc,
OddDblSpc
};
@@ -154,10 +157,41 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, false, EvenDblSpc, 1, 8 ,true},
{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true},
+{ ARM::VLD1d16QPseudo, ARM::VLD1d16Q, true, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VLD1d16TPseudo, ARM::VLD1d16T, true, false, false, SingleSpc, 3, 4 ,false},
+{ ARM::VLD1d32QPseudo, ARM::VLD1d32Q, true, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VLD1d32TPseudo, ARM::VLD1d32T, true, false, false, SingleSpc, 3, 2 ,false},
{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_register, ARM::VLD1d64Qwb_register, true, true, true, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_register, ARM::VLD1d64Twb_register, true, true, true, SingleSpc, 3, 1 ,false},
+{ ARM::VLD1d8QPseudo, ARM::VLD1d8Q, true, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VLD1d8TPseudo, ARM::VLD1d8T, true, false, false, SingleSpc, 3, 8 ,false},
+{ ARM::VLD1q16HighQPseudo, ARM::VLD1d16Q, true, false, false, SingleHighQSpc, 4, 4 ,false},
+{ ARM::VLD1q16HighTPseudo, ARM::VLD1d16T, true, false, false, SingleHighTSpc, 3, 4 ,false},
+{ ARM::VLD1q16LowQPseudo_UPD, ARM::VLD1d16Qwb_fixed, true, true, true, SingleLowSpc, 4, 4 ,false},
+{ ARM::VLD1q16LowTPseudo_UPD, ARM::VLD1d16Twb_fixed, true, true, true, SingleLowSpc, 3, 4 ,false},
+{ ARM::VLD1q32HighQPseudo, ARM::VLD1d32Q, true, false, false, SingleHighQSpc, 4, 2 ,false},
+{ ARM::VLD1q32HighTPseudo, ARM::VLD1d32T, true, false, false, SingleHighTSpc, 3, 2 ,false},
+{ ARM::VLD1q32LowQPseudo_UPD, ARM::VLD1d32Qwb_fixed, true, true, true, SingleLowSpc, 4, 2 ,false},
+{ ARM::VLD1q32LowTPseudo_UPD, ARM::VLD1d32Twb_fixed, true, true, true, SingleLowSpc, 3, 2 ,false},
+{ ARM::VLD1q64HighQPseudo, ARM::VLD1d64Q, true, false, false, SingleHighQSpc, 4, 1 ,false},
+{ ARM::VLD1q64HighTPseudo, ARM::VLD1d64T, true, false, false, SingleHighTSpc, 3, 1 ,false},
+{ ARM::VLD1q64LowQPseudo_UPD, ARM::VLD1d64Qwb_fixed, true, true, true, SingleLowSpc, 4, 1 ,false},
+{ ARM::VLD1q64LowTPseudo_UPD, ARM::VLD1d64Twb_fixed, true, true, true, SingleLowSpc, 3, 1 ,false},
+{ ARM::VLD1q8HighQPseudo, ARM::VLD1d8Q, true, false, false, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VLD1q8HighTPseudo, ARM::VLD1d8T, true, false, false, SingleHighTSpc, 3, 8 ,false},
+{ ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false},
+{ ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false},
+
+{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false},
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
@@ -186,6 +220,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true},
{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true},
{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, false, false, OddDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true},
{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
@@ -221,6 +261,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true},
{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true},
{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true},
{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
@@ -257,12 +303,34 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true},
{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true},
+{ ARM::VST1d16QPseudo, ARM::VST1d16Q, false, false, false, SingleSpc, 4, 4 ,false},
+{ ARM::VST1d16TPseudo, ARM::VST1d16T, false, false, false, SingleSpc, 3, 4 ,false},
+{ ARM::VST1d32QPseudo, ARM::VST1d32Q, false, false, false, SingleSpc, 4, 2 ,false},
+{ ARM::VST1d32TPseudo, ARM::VST1d32T, false, false, false, SingleSpc, 3, 2 ,false},
{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false},
{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false},
+{ ARM::VST1d8QPseudo, ARM::VST1d8Q, false, false, false, SingleSpc, 4, 8 ,false},
+{ ARM::VST1d8TPseudo, ARM::VST1d8T, false, false, false, SingleSpc, 3, 8 ,false},
+{ ARM::VST1q16HighQPseudo, ARM::VST1d16Q, false, false, false, SingleHighQSpc, 4, 4 ,false},
+{ ARM::VST1q16HighTPseudo, ARM::VST1d16T, false, false, false, SingleHighTSpc, 3, 4 ,false},
+{ ARM::VST1q16LowQPseudo_UPD, ARM::VST1d16Qwb_fixed, false, true, true, SingleLowSpc, 4, 4 ,false},
+{ ARM::VST1q16LowTPseudo_UPD, ARM::VST1d16Twb_fixed, false, true, true, SingleLowSpc, 3, 4 ,false},
+{ ARM::VST1q32HighQPseudo, ARM::VST1d32Q, false, false, false, SingleHighQSpc, 4, 2 ,false},
+{ ARM::VST1q32HighTPseudo, ARM::VST1d32T, false, false, false, SingleHighTSpc, 3, 2 ,false},
+{ ARM::VST1q32LowQPseudo_UPD, ARM::VST1d32Qwb_fixed, false, true, true, SingleLowSpc, 4, 2 ,false},
+{ ARM::VST1q32LowTPseudo_UPD, ARM::VST1d32Twb_fixed, false, true, true, SingleLowSpc, 3, 2 ,false},
+{ ARM::VST1q64HighQPseudo, ARM::VST1d64Q, false, false, false, SingleHighQSpc, 4, 1 ,false},
+{ ARM::VST1q64HighTPseudo, ARM::VST1d64T, false, false, false, SingleHighTSpc, 3, 1 ,false},
+{ ARM::VST1q64LowQPseudo_UPD, ARM::VST1d64Qwb_fixed, false, true, true, SingleLowSpc, 4, 1 ,false},
+{ ARM::VST1q64LowTPseudo_UPD, ARM::VST1d64Twb_fixed, false, true, true, SingleLowSpc, 3, 1 ,false},
+{ ARM::VST1q8HighQPseudo, ARM::VST1d8Q, false, false, false, SingleHighQSpc, 4, 8 ,false},
+{ ARM::VST1q8HighTPseudo, ARM::VST1d8T, false, false, false, SingleHighTSpc, 3, 8 ,false},
+{ ARM::VST1q8LowQPseudo_UPD, ARM::VST1d8Qwb_fixed, false, true, true, SingleLowSpc, 4, 8 ,false},
+{ ARM::VST1q8LowTPseudo_UPD, ARM::VST1d8Twb_fixed, false, true, true, SingleLowSpc, 3, 8 ,false},
{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true},
@@ -347,11 +415,11 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
#ifndef NDEBUG
// Make sure the table is sorted.
- static bool TableChecked = false;
- if (!TableChecked) {
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) &&
"NEONLdStTable is not sorted!");
- TableChecked = true;
+ TableChecked.store(true, std::memory_order_relaxed);
}
#endif
@@ -368,11 +436,21 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
const TargetRegisterInfo *TRI, unsigned &D0,
unsigned &D1, unsigned &D2, unsigned &D3) {
- if (RegSpc == SingleSpc) {
+ if (RegSpc == SingleSpc || RegSpc == SingleLowSpc) {
D0 = TRI->getSubReg(Reg, ARM::dsub_0);
D1 = TRI->getSubReg(Reg, ARM::dsub_1);
D2 = TRI->getSubReg(Reg, ARM::dsub_2);
D3 = TRI->getSubReg(Reg, ARM::dsub_3);
+ } else if (RegSpc == SingleHighQSpc) {
+ D0 = TRI->getSubReg(Reg, ARM::dsub_4);
+ D1 = TRI->getSubReg(Reg, ARM::dsub_5);
+ D2 = TRI->getSubReg(Reg, ARM::dsub_6);
+ D3 = TRI->getSubReg(Reg, ARM::dsub_7);
+ } else if (RegSpc == SingleHighTSpc) {
+ D0 = TRI->getSubReg(Reg, ARM::dsub_3);
+ D1 = TRI->getSubReg(Reg, ARM::dsub_4);
+ D2 = TRI->getSubReg(Reg, ARM::dsub_5);
+ D3 = TRI->getSubReg(Reg, ARM::dsub_6);
} else if (RegSpc == EvenDblSpc) {
D0 = TRI->getSubReg(Reg, ARM::dsub_0);
D1 = TRI->getSubReg(Reg, ARM::dsub_2);
@@ -404,15 +482,31 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
bool DstIsDead = MI.getOperand(OpIdx).isDead();
unsigned DstReg = MI.getOperand(OpIdx++).getReg();
- unsigned D0, D1, D2, D3;
- GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
- MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
- if (NumRegs > 1 && TableEntry->copyAllListRegs)
- MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
- if (NumRegs > 2 && TableEntry->copyAllListRegs)
- MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
- if (NumRegs > 3 && TableEntry->copyAllListRegs)
- MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+ if (TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
+ unsigned SubRegIndex;
+ if (RegSpc == EvenDblSpc) {
+ SubRegIndex = ARM::dsub_0;
+ } else {
+ assert(RegSpc == OddDblSpc && "Unexpected spacing!");
+ SubRegIndex = ARM::dsub_1;
+ }
+ unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex);
+ unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0,
+ &ARM::DPairSpcRegClass);
+ MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead));
+ } else {
+ unsigned D0, D1, D2, D3;
+ GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+ MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 1 && TableEntry->copyAllListRegs)
+ MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 2 && TableEntry->copyAllListRegs)
+ MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+ if (NumRegs > 3 && TableEntry->copyAllListRegs)
+ MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+ }
if (TableEntry->isUpdating)
MIB.add(MI.getOperand(OpIdx++));
@@ -420,16 +514,45 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
// Copy the addrmode6 operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
+
// Copy the am6offset operand.
- if (TableEntry->hasWritebackOperand)
- MIB.add(MI.getOperand(OpIdx++));
+ if (TableEntry->hasWritebackOperand) {
+ // TODO: The writing-back pseudo instructions we translate here are all
+ // defined to take am6offset nodes that can represent both the fixed and
+ // register forms. Some real instructions, however, do not rely on
+ // am6offset and have separate definitions for those forms. In that case,
+ // the fixed forms do not take any offset node, so we skip it for such
+ // instructions here. Once all real and pseudo writing-back instructions
+ // are rewritten without am6offset nodes, this code will go away.
+ const MachineOperand &AM6Offset = MI.getOperand(OpIdx++);
+ if (TableEntry->RealOpc == ARM::VLD1d8Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d16Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d32Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d64Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d8Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d16Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d32Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) {
+ assert(AM6Offset.getReg() == 0 &&
+ "A fixed writing-back pseudo instruction provides an offset "
+ "register!");
+ } else {
+ MIB.add(AM6Offset);
+ }
+ }
// For an instruction writing double-spaced subregs, the pseudo instruction
// has an extra operand that is a use of the super-register. Record the
// operand index and skip over it.
unsigned SrcOpIdx = 0;
- if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
- SrcOpIdx = OpIdx++;
+ if (TableEntry->RealOpc != ARM::VLD2DUPd8x2 &&
+ TableEntry->RealOpc != ARM::VLD2DUPd16x2 &&
+ TableEntry->RealOpc != ARM::VLD2DUPd32x2) {
+ if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
+ RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
+ RegSpc == SingleHighTSpc)
+ SrcOpIdx = OpIdx++;
+ }
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
@@ -472,9 +595,31 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
// Copy the addrmode6 operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
- // Copy the am6offset operand.
- if (TableEntry->hasWritebackOperand)
- MIB.add(MI.getOperand(OpIdx++));
+
+ if (TableEntry->hasWritebackOperand) {
+ // TODO: The writing-back pseudo instructions we translate here are all
+ // defined to take am6offset nodes that can represent both the fixed and
+ // register forms. Some real instructions, however, do not rely on
+ // am6offset and have separate definitions for those forms. In that case,
+ // the fixed forms do not take any offset node, so we skip it for such
+ // instructions here. Once all real and pseudo writing-back instructions
+ // are rewritten without am6offset nodes, this code will go away.
+ const MachineOperand &AM6Offset = MI.getOperand(OpIdx++);
+ if (TableEntry->RealOpc == ARM::VST1d8Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d16Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d32Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d64Qwb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d8Twb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d16Twb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d32Twb_fixed ||
+ TableEntry->RealOpc == ARM::VST1d64Twb_fixed) {
+ assert(AM6Offset.getReg() == 0 &&
+ "A fixed writing-back pseudo instruction provides an offset "
+ "register!");
+ } else {
+ MIB.add(AM6Offset);
+ }
+ }
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
@@ -608,7 +753,6 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
MIB.add(MI.getOperand(OpIdx++));
if (IsExt) {
MachineOperand VdSrc(MI.getOperand(OpIdx++));
- VdSrc.setIsRenamable(false);
MIB.add(VdSrc);
}
@@ -620,7 +764,6 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
// Copy the other source register operand.
MachineOperand VmSrc(MI.getOperand(OpIdx++));
- VmSrc.setIsRenamable(false);
MIB.add(VmSrc);
// Copy the predicate operands.
@@ -1470,7 +1613,6 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Copy the destination register.
MachineOperand Dst(MI.getOperand(OpIdx++));
- Dst.setIsRenamable(false);
MIB.add(Dst);
// Copy the predicate operands.
@@ -1504,8 +1646,12 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD3d8Pseudo:
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
+ case ARM::VLD1d8TPseudo:
+ case ARM::VLD1d16TPseudo:
+ case ARM::VLD1d32TPseudo:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64TPseudoWB_fixed:
+ case ARM::VLD1d64TPseudoWB_register:
case ARM::VLD3d8Pseudo_UPD:
case ARM::VLD3d16Pseudo_UPD:
case ARM::VLD3d32Pseudo_UPD:
@@ -1521,8 +1667,28 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD4d8Pseudo:
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
+ case ARM::VLD1d8QPseudo:
+ case ARM::VLD1d16QPseudo:
+ case ARM::VLD1d32QPseudo:
case ARM::VLD1d64QPseudo:
case ARM::VLD1d64QPseudoWB_fixed:
+ case ARM::VLD1d64QPseudoWB_register:
+ case ARM::VLD1q8HighQPseudo:
+ case ARM::VLD1q8LowQPseudo_UPD:
+ case ARM::VLD1q8HighTPseudo:
+ case ARM::VLD1q8LowTPseudo_UPD:
+ case ARM::VLD1q16HighQPseudo:
+ case ARM::VLD1q16LowQPseudo_UPD:
+ case ARM::VLD1q16HighTPseudo:
+ case ARM::VLD1q16LowTPseudo_UPD:
+ case ARM::VLD1q32HighQPseudo:
+ case ARM::VLD1q32LowQPseudo_UPD:
+ case ARM::VLD1q32HighTPseudo:
+ case ARM::VLD1q32LowTPseudo_UPD:
+ case ARM::VLD1q64HighQPseudo:
+ case ARM::VLD1q64LowQPseudo_UPD:
+ case ARM::VLD1q64HighTPseudo:
+ case ARM::VLD1q64LowTPseudo_UPD:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
case ARM::VLD4d32Pseudo_UPD:
@@ -1547,6 +1713,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD4DUPd8Pseudo_UPD:
case ARM::VLD4DUPd16Pseudo_UPD:
case ARM::VLD4DUPd32Pseudo_UPD:
+ case ARM::VLD2DUPq8EvenPseudo:
+ case ARM::VLD2DUPq8OddPseudo:
+ case ARM::VLD2DUPq16EvenPseudo:
+ case ARM::VLD2DUPq16OddPseudo:
+ case ARM::VLD2DUPq32EvenPseudo:
+ case ARM::VLD2DUPq32OddPseudo:
+ case ARM::VLD3DUPq8EvenPseudo:
+ case ARM::VLD3DUPq8OddPseudo:
+ case ARM::VLD3DUPq16EvenPseudo:
+ case ARM::VLD3DUPq16OddPseudo:
+ case ARM::VLD3DUPq32EvenPseudo:
+ case ARM::VLD3DUPq32OddPseudo:
+ case ARM::VLD4DUPq8EvenPseudo:
+ case ARM::VLD4DUPq8OddPseudo:
+ case ARM::VLD4DUPq16EvenPseudo:
+ case ARM::VLD4DUPq16OddPseudo:
+ case ARM::VLD4DUPq32EvenPseudo:
+ case ARM::VLD4DUPq32OddPseudo:
ExpandVLD(MBBI);
return true;
@@ -1562,6 +1746,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VST3d8Pseudo:
case ARM::VST3d16Pseudo:
case ARM::VST3d32Pseudo:
+ case ARM::VST1d8TPseudo:
+ case ARM::VST1d16TPseudo:
+ case ARM::VST1d32TPseudo:
case ARM::VST1d64TPseudo:
case ARM::VST3d8Pseudo_UPD:
case ARM::VST3d16Pseudo_UPD:
@@ -1580,12 +1767,31 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VST4d8Pseudo:
case ARM::VST4d16Pseudo:
case ARM::VST4d32Pseudo:
+ case ARM::VST1d8QPseudo:
+ case ARM::VST1d16QPseudo:
+ case ARM::VST1d32QPseudo:
case ARM::VST1d64QPseudo:
case ARM::VST4d8Pseudo_UPD:
case ARM::VST4d16Pseudo_UPD:
case ARM::VST4d32Pseudo_UPD:
case ARM::VST1d64QPseudoWB_fixed:
case ARM::VST1d64QPseudoWB_register:
+ case ARM::VST1q8HighQPseudo:
+ case ARM::VST1q8LowQPseudo_UPD:
+ case ARM::VST1q8HighTPseudo:
+ case ARM::VST1q8LowTPseudo_UPD:
+ case ARM::VST1q16HighQPseudo:
+ case ARM::VST1q16LowQPseudo_UPD:
+ case ARM::VST1q16HighTPseudo:
+ case ARM::VST1q16LowTPseudo_UPD:
+ case ARM::VST1q32HighQPseudo:
+ case ARM::VST1q32LowQPseudo_UPD:
+ case ARM::VST1q32HighTPseudo:
+ case ARM::VST1q32LowTPseudo_UPD:
+ case ARM::VST1q64HighQPseudo:
+ case ARM::VST1q64LowQPseudo_UPD:
+ case ARM::VST1q64HighTPseudo:
+ case ARM::VST1q64LowTPseudo_UPD:
case ARM::VST4q8Pseudo_UPD:
case ARM::VST4q16Pseudo_UPD:
case ARM::VST4q32Pseudo_UPD:
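The new SingleLowSpc, SingleHighQSpc and SingleHighTSpc spacings introduced above only change which window of D subregisters GetDSubRegs picks out of the pseudo's super-register. The following schematic sketch lists the dsub indices implied by each spacing; it uses plain index numbers rather than the real TargetRegisterInfo sub-register API, so treat it as an illustration of the mapping, not as the pass's code.

    #include <array>

    enum RegSpacing { SingleSpc, SingleLowSpc, SingleHighQSpc, SingleHighTSpc,
                      EvenDblSpc, OddDblSpc };

    // Which dsub_N indices a four-register list starts from for each spacing.
    // In the real pass these are looked up with TRI->getSubReg(Reg, ARM::dsub_N).
    static std::array<unsigned, 4> dsubIndices(RegSpacing Spc) {
      switch (Spc) {
      case SingleSpc:
      case SingleLowSpc:   return {0, 1, 2, 3}; // low half of the list
      case SingleHighQSpc: return {4, 5, 6, 7}; // high half, four vectors
      case SingleHighTSpc: return {3, 4, 5, 6}; // high part, three vectors
      case EvenDblSpc:     return {0, 2, 4, 6}; // every other D register
      case OddDblSpc:      return {1, 3, 5, 7};
      }
      return {0, 0, 0, 0};
    }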
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
index 60048d4453d8..26d4aaa12acf 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -41,7 +41,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -75,6 +74,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -2352,8 +2352,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
i != e; ++i) {
// If we're lowering a memory intrinsic instead of a regular call, skip the
- // last two arguments, which shouldn't be passed to the underlying function.
- if (IntrMemName && e-i <= 2)
+ // last argument, which shouldn't be passed to the underlying function.
+ if (IntrMemName && e - i <= 1)
break;
ISD::ArgFlagsTy Flags;
@@ -2546,7 +2546,8 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
!ARMComputeAddress(MTI.getRawSource(), Src))
return false;
- unsigned Alignment = MTI.getAlignment();
+ unsigned Alignment = MinAlign(MTI.getDestAlignment(),
+ MTI.getSourceAlignment());
if (ARMTryEmitSmallMemCpy(Dest, Src, Len, Alignment))
return true;
}
@@ -2912,7 +2913,7 @@ static const struct FoldableLoadExtendsStruct {
{ { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 }
};
-/// \brief The specified machine instr operand is a vreg, and that
+/// The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
/// successful.
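The memcpy/memmove lowering change above takes the minimum of the separate destination and source alignments instead of the old single shared alignment. MinAlign (from llvm/Support/MathExtras.h) returns the largest power of two dividing both inputs; a small standalone check of that behaviour, written as a sketch rather than the library code itself:

    #include <cassert>
    #include <cstdint>

    // For the power-of-two alignments used here, the largest power of two
    // dividing both A and B is the lowest set bit of A|B.
    static uint64_t minAlign(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B));
    }

    int main() {
      assert(minAlign(8, 4) == 4);   // mixed alignments fall back to the smaller
      assert(minAlign(16, 16) == 16);
      assert(minAlign(4, 1) == 1);
      return 0;
    }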
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 4ff864ac6ccd..af983ce2606a 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -87,6 +87,18 @@ bool ARMFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
MF.getSubtarget<ARMSubtarget>().useFastISel();
}
+/// Returns true if the target can safely skip saving callee-saved registers
+/// for noreturn nounwind functions.
+bool ARMFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const {
+ assert(MF.getFunction().hasFnAttribute(Attribute::NoReturn) &&
+ MF.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ !MF.getFunction().hasFnAttribute(Attribute::UWTable));
+
+ // The frame pointer and link register are not treated as normal CSRs, so
+ // we can always skip CSR saves for noreturn functions.
+ return true;
+}
+
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
@@ -209,7 +221,8 @@ static bool WindowsRequiresStackProbe(const MachineFunction &MF,
F.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
- return StackSizeInBytes >= StackProbeSize;
+ return (StackSizeInBytes >= StackProbeSize) &&
+ !F.hasFnAttribute("no-stack-arg-probe");
}
namespace {
@@ -918,15 +931,17 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
return FPOffset;
}
}
- } else if (AFI->isThumb2Function()) {
+ } else if (AFI->isThumbFunction()) {
+ // Prefer SP to the base pointer if the offset is suitably aligned and in
+ // range, as the effective range of the immediate offset is bigger when
+ // basing off SP.
// Use add <rd>, sp, #<imm8>
// ldr <rd>, [sp, #<imm8>]
- // if at all possible to save space.
if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
return Offset;
// In Thumb2 mode, the negative offset is very limited. Try to avoid
// out of range references. ldr <rt>,[<rn>, #-<imm8>]
- if (FPOffset >= -255 && FPOffset < 0) {
+ if (AFI->isThumb2Function() && FPOffset >= -255 && FPOffset < 0) {
FrameReg = RegInfo->getFrameRegister(MF);
return FPOffset;
}
@@ -991,8 +1006,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
if (Regs.empty())
continue;
- std::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS,
- const RegAndKill &RHS) {
+ llvm::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS,
+ const RegAndKill &RHS) {
return TRI.getEncodingValue(LHS.first) < TRI.getEncodingValue(RHS.first);
});
@@ -1065,6 +1080,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
!isTrap && STI.hasV5TOps()) {
if (MBB.succ_empty()) {
Reg = ARM::PC;
+ // Fold the return instruction into the LDM.
DeleteRet = true;
LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
// We 'restore' LR into PC so it is not live out of the return block:
@@ -1072,7 +1088,6 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
Info.setRestored(false);
} else
LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
- // Fold the return instruction into the LDM.
}
// If NoGap is true, pop consecutive registers and then leave the rest
@@ -1088,7 +1103,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
if (Regs.empty())
continue;
- std::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) {
+ llvm::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) {
return TRI.getEncodingValue(LHS) < TRI.getEncodingValue(RHS);
});
@@ -1605,6 +1620,17 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
(MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
SavedRegs.set(ARM::R4);
+ // If a stack probe will be emitted, spill R4 and LR, since they are
+ // clobbered by the stack probe call.
+ // This should be a safe, conservative estimate. The actual
+ // stack probe is enabled based on the size of the local objects;
+ // this estimate also includes the varargs store size.
+ if (STI.isTargetWindows() &&
+ WindowsRequiresStackProbe(MF, MFI.estimateStackSize(MF))) {
+ SavedRegs.set(ARM::R4);
+ SavedRegs.set(ARM::LR);
+ }
+
if (AFI->isThumb1OnlyFunction()) {
// Spill LR if Thumb1 function uses variable length argument lists.
if (AFI->getArgRegsSaveSize() > 0)
@@ -1797,34 +1823,36 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
if (!MF.getRegInfo().isLiveIn(Reg)) {
--EntryRegDeficit;
- DEBUG(dbgs() << printReg(Reg, TRI)
- << " is unused argument register, EntryRegDeficit = "
- << EntryRegDeficit << "\n");
+ LLVM_DEBUG(dbgs()
+ << printReg(Reg, TRI)
+ << " is unused argument register, EntryRegDeficit = "
+ << EntryRegDeficit << "\n");
}
}
// Unused return registers can be clobbered in the epilogue for free.
int ExitRegDeficit = AFI->getReturnRegsCount() - 4;
- DEBUG(dbgs() << AFI->getReturnRegsCount()
- << " return regs used, ExitRegDeficit = " << ExitRegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << AFI->getReturnRegsCount()
+ << " return regs used, ExitRegDeficit = "
+ << ExitRegDeficit << "\n");
int RegDeficit = std::max(EntryRegDeficit, ExitRegDeficit);
- DEBUG(dbgs() << "RegDeficit = " << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "RegDeficit = " << RegDeficit << "\n");
// r4-r6 can be used in the prologue if they are pushed by the first push
// instruction.
for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6}) {
if (SavedRegs.test(Reg)) {
--RegDeficit;
- DEBUG(dbgs() << printReg(Reg, TRI)
- << " is saved low register, RegDeficit = " << RegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is saved low register, RegDeficit = "
+ << RegDeficit << "\n");
} else {
AvailableRegs.push_back(Reg);
- DEBUG(dbgs()
- << printReg(Reg, TRI)
- << " is non-saved low register, adding to AvailableRegs\n");
+ LLVM_DEBUG(
+ dbgs()
+ << printReg(Reg, TRI)
+ << " is non-saved low register, adding to AvailableRegs\n");
}
}
@@ -1832,12 +1860,13 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!HasFP) {
if (SavedRegs.test(ARM::R7)) {
--RegDeficit;
- DEBUG(dbgs() << "%r7 is saved low register, RegDeficit = "
- << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "%r7 is saved low register, RegDeficit = "
+ << RegDeficit << "\n");
} else {
AvailableRegs.push_back(ARM::R7);
- DEBUG(dbgs()
- << "%r7 is non-saved low register, adding to AvailableRegs\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "%r7 is non-saved low register, adding to AvailableRegs\n");
}
}
@@ -1845,9 +1874,9 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (unsigned Reg : {ARM::R8, ARM::R9, ARM::R10, ARM::R11}) {
if (SavedRegs.test(Reg)) {
++RegDeficit;
- DEBUG(dbgs() << printReg(Reg, TRI)
- << " is saved high register, RegDeficit = " << RegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is saved high register, RegDeficit = "
+ << RegDeficit << "\n");
}
}
@@ -1859,11 +1888,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
MF.getFrameInfo().isReturnAddressTaken())) {
if (SavedRegs.test(ARM::LR)) {
--RegDeficit;
- DEBUG(dbgs() << "%lr is saved register, RegDeficit = " << RegDeficit
- << "\n");
+ LLVM_DEBUG(dbgs() << "%lr is saved register, RegDeficit = "
+ << RegDeficit << "\n");
} else {
AvailableRegs.push_back(ARM::LR);
- DEBUG(dbgs() << "%lr is not saved, adding to AvailableRegs\n");
+ LLVM_DEBUG(dbgs() << "%lr is not saved, adding to AvailableRegs\n");
}
}
@@ -1872,11 +1901,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// instructions. This might not reduce RegDeficit all the way to zero,
// because we can only guarantee that r4-r6 are available, but r8-r11 may
// need saving.
- DEBUG(dbgs() << "Final RegDeficit = " << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "Final RegDeficit = " << RegDeficit << "\n");
for (; RegDeficit > 0 && !AvailableRegs.empty(); --RegDeficit) {
unsigned Reg = AvailableRegs.pop_back_val();
- DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
- << " to make up reg deficit\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
+ << " to make up reg deficit\n");
SavedRegs.set(Reg);
NumGPRSpills++;
CS1Spilled = true;
@@ -1887,7 +1916,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (Reg == ARM::LR)
LRSpilled = true;
}
- DEBUG(dbgs() << "After adding spills, RegDeficit = " << RegDeficit << "\n");
+ LLVM_DEBUG(dbgs() << "After adding spills, RegDeficit = " << RegDeficit
+ << "\n");
}
// If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
@@ -1908,7 +1938,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// If stack and double are 8-byte aligned and we are spilling an odd number
// of GPRs, spill one extra callee save GPR so we won't have to pad between
// the integer and double callee save areas.
- DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
+ LLVM_DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
unsigned TargetAlign = getStackAlignment();
if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
@@ -1920,8 +1950,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
(STI.isTargetWindows() && Reg == ARM::R11) ||
isARMLowRegister(Reg) || Reg == ARM::LR) {
SavedRegs.set(Reg);
- DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
- << " to make up alignment\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
+ << " to make up alignment\n");
if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
ExtraCSSpill = true;
break;
@@ -1930,8 +1960,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
} else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
unsigned Reg = UnspilledCS2GPRs.front();
SavedRegs.set(Reg);
- DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
- << " to make up alignment\n");
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
+ << " to make up alignment\n");
if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
ExtraCSSpill = true;
}
@@ -2118,8 +2148,10 @@ void ARMFrameLowering::adjustForSegmentedStacks(
uint64_t StackSize = MFI.getStackSize();
- // Do not generate a prologue for functions with a stack of size zero
- if (StackSize == 0)
+ // Do not generate a prologue for leaf functions with a stack of size zero.
+ // For non-leaf functions we have to allow for the possibility that the
+ // call is to a non-split function, as in PR37807.
+ if (StackSize == 0 && !MFI.hasTailCall())
return;
// Use R4 and R5 as scratch registers.
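Two of the frame-lowering changes above interact: the Windows stack probe is now suppressed by the "no-stack-arg-probe" attribute, and whenever a probe may still be emitted the pass conservatively spills R4 and LR because the probe call clobbers them. A condensed sketch of the gating condition, with the attribute plumbing reduced to a plain struct (the field names and the 4096-byte default are assumptions for illustration; the real pass reads IR function attributes):

    #include <cstdint>

    // Toy stand-in for the relevant function attributes.
    struct ProbeAttrs {
      uint64_t StackProbeSize = 4096; // "stack-probe-size"
      bool NoStackArgProbe = false;   // "no-stack-arg-probe"
    };

    // Mirrors the updated gating logic: probe only when the frame is large
    // enough and probing has not been explicitly disabled.
    static bool requiresStackProbe(const ProbeAttrs &A, uint64_t StackSizeInBytes) {
      return StackSizeInBytes >= A.StackProbeSize && !A.NoStackArgProbe;
    }

    // When this returns true during determineCalleeSaves, the diff also marks
    // R4 and LR as saved, since the probe call clobbers both.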
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
index 1f18e2bf80c4..e994cab28fe7 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -44,6 +44,8 @@ public:
bool noFramePointerElim(const MachineFunction &MF) const override;
+ bool enableCalleeSaveSkip(const MachineFunction &MF) const override;
+
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
index f878bf9937a4..d5dacbe08770 100644
--- a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -37,7 +37,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
// Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
// a VMLA / VMLS will cause 4 cycle stall.
const MCInstrDesc &MCID = MI->getDesc();
@@ -81,7 +81,7 @@ void ARMHazardRecognizer::Reset() {
void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
MachineInstr *MI = SU->getInstr();
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
LastMI = MI;
FpMLxStalls = 0;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8d32510e2004..081d4ff033bd 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -97,6 +97,8 @@ public:
return SelectImmShifterOperand(N, A, B, false);
}
+ bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out);
+
bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
@@ -118,8 +120,10 @@ public:
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
SDValue &Offset, SDValue &Opc);
- bool SelectAddrMode5(SDValue N, SDValue &Base,
- SDValue &Offset);
+ bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
+ int Lwb, int Upb, bool FP16);
+ bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
+ bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
@@ -199,10 +203,11 @@ private:
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
- /// for loading D registers. (Q registers are not supported.)
- void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes = nullptr);
+ /// for loading D registers.
+ void SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating,
+ unsigned NumVecs, const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0 = nullptr,
+ const uint16_t *QOpcodes1 = nullptr);
/// Try to select SBFX/UBFX instructions for ARM.
bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
@@ -281,7 +286,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
isInt32Immediate(N->getOperand(1).getNode(), Imm);
}
-/// \brief Check whether a particular node is a constant value representable as
+/// Check whether a particular node is a constant value representable as
/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
///
/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
@@ -498,7 +503,7 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
- CurDAG->ReplaceAllUsesWith(N, M);
+ ReplaceUses(N, M);
}
bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
@@ -567,6 +572,14 @@ bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
return true;
}
+// Determine whether an ISD::OR's operands are suitable to turn the operation
+// into an addition, which often has more compact encodings.
+bool ARMDAGToDAGISel::SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out) {
+ assert(Parent->getOpcode() == ISD::OR && "unexpected parent");
+ Out = N;
+ return CurDAG->haveNoCommonBitsSet(N, Parent->getOperand(1));
+}
+
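
The SelectAddLikeOr hook above lets later patterns treat an ISD::OR as an addition whenever haveNoCommonBitsSet proves that the two operands can never set the same bit, since without shared bits the addition produces no carries. A standalone sketch of that identity (hypothetical helper, not part of this change):

    #include <cassert>
    #include <cstdint>

    // When a and b share no set bits, a + b generates no carries, so OR and ADD
    // agree. This is the property the DAG-level haveNoCommonBitsSet() check proves.
    uint32_t addLikeOr(uint32_t a, uint32_t b) {
      assert((a & b) == 0 && "operands must have disjoint bits");
      uint32_t viaOr  = a | b;
      uint32_t viaAdd = a + b;
      assert(viaOr == viaAdd);
      return viaAdd;
    }
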
bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
SDValue &Base,
@@ -886,8 +899,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
return true;
}
-bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
- SDValue &Base, SDValue &Offset) {
+bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
+ int Lwb, int Upb, bool FP16) {
if (!CurDAG->isBaseWithConstantOffset(N)) {
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
@@ -907,8 +920,9 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
// If the RHS is +/- imm8, fold into addr mode.
int RHSC;
- if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
- -256 + 1, 256, RHSC)) {
+ const int Scale = FP16 ? 2 : 4;
+
+ if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
@@ -921,17 +935,43 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
AddSub = ARM_AM::sub;
RHSC = -RHSC;
}
- Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
- SDLoc(N), MVT::i32);
+
+ if (FP16)
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC),
+ SDLoc(N), MVT::i32);
+ else
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+ SDLoc(N), MVT::i32);
+
return true;
}
Base = N;
- Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
- SDLoc(N), MVT::i32);
+
+ if (FP16)
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0),
+ SDLoc(N), MVT::i32);
+ else
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+ SDLoc(N), MVT::i32);
+
return true;
}
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+ SDValue &Base, SDValue &Offset) {
+ int Lwb = -256 + 1;
+ int Upb = 256;
+ return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false);
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
+ SDValue &Base, SDValue &Offset) {
+ int Lwb = -512 + 1;
+ int Upb = 512;
+ return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true);
+}
+
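
The two wrappers above differ only in the scale and immediate bounds they hand to IsAddressingMode5: the FP32/FP64 form folds byte offsets that are multiples of 4 with a pre-scaled value in [-255, 255], while the FP16 form uses a scale of 2 with the wider bounds given in this patch. Restated in isolation (illustrative only; the function name is made up):

    // Mirror of the Lwb/Upb/Scale check performed above: the byte offset must be
    // a multiple of the scale, and the pre-scaled value must fall inside
    // [Lwb, Upb) before it can be folded into addressing mode 5.
    bool fitsAddrMode5(int byteOffset, bool fp16) {
      const int scale = fp16 ? 2 : 4;
      const int upb = fp16 ? 512 : 256;   // exclusive bound, as in the code above
      const int lwb = -upb + 1;
      if (byteOffset % scale != 0)
        return false;
      const int imm = byteOffset / scale;
      return imm >= lwb && imm < upb;
    }
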
bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
SDValue &Align) {
Addr = N;
@@ -1467,7 +1507,7 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
return false;
}
-/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
+/// Form a GPRPair pseudo register from a pair of GPR regs.
SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass =
@@ -1478,7 +1518,7 @@ SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form a D register from a pair of S registers.
+/// Form a D register from a pair of S registers.
SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass =
@@ -1489,7 +1529,7 @@ SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form a quad register from a pair of D registers.
+/// Form a quad register from a pair of D registers.
SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, dl,
@@ -1500,7 +1540,7 @@ SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive D registers from a pair of Q registers.
+/// Form 4 consecutive D registers from a pair of Q registers.
SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl,
@@ -1511,7 +1551,7 @@ SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive S registers.
+/// Form 4 consecutive S registers.
SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
SDLoc dl(V0.getNode());
@@ -1526,7 +1566,7 @@ SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive D registers.
+/// Form 4 consecutive D registers.
SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
SDLoc dl(V0.getNode());
@@ -1541,7 +1581,7 @@ SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
}
-/// \brief Form 4 consecutive Q registers.
+/// Form 4 consecutive Q registers.
SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
SDLoc dl(V0.getNode());
@@ -1708,7 +1748,9 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
SDLoc dl(N);
SDValue MemAddr, Align;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+ // nodes are not intrinsics.
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
@@ -1732,9 +1774,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 3;
- assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
- break;
+ case MVT::v2i64: OpcodeIndex = 3; break;
}
EVT ResTy;
@@ -1765,15 +1805,17 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
Ops.push_back(Align);
if (isUpdating) {
SDValue Inc = N->getOperand(AddrOpIdx + 1);
- // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
- // case entirely when the rest are updated to that form, too.
bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
- if ((NumVecs <= 2) && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
- // check for that explicitly too. Horribly hacky, but temporary.
- if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate)
- Ops.push_back(IsImmUpdate ? Reg0 : Inc);
+ if (!IsImmUpdate) {
+ // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+ // check for the opcode rather than the number of vector elements.
+ if (isVLDfixed(Opc))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ // VLD1/VLD2 fixed increment does not need Reg0 so only include it in
+ // the operands if not such an opcode.
+ } else if (!isVLDfixed(Opc))
+ Ops.push_back(Reg0);
}
Ops.push_back(Pred);
Ops.push_back(Reg0);
@@ -1844,7 +1886,9 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDLoc dl(N);
SDValue MemAddr, Align;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+ // nodes are not intrinsics.
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
@@ -1862,19 +1906,19 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
default: llvm_unreachable("unhandled vst type");
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4f16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
case MVT::v1i64: OpcodeIndex = 3; break;
// Quad-register operations:
case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v8f16:
case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 3;
- assert(NumVecs == 1 && "v2i64 type only supported for VST1");
- break;
+ case MVT::v2i64: OpcodeIndex = 3; break;
}
std::vector<EVT> ResTys;
@@ -1919,16 +1963,17 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
Ops.push_back(Align);
if (isUpdating) {
SDValue Inc = N->getOperand(AddrOpIdx + 1);
- // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
- // case entirely when the rest are updated to that form, too.
bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
- if (NumVecs <= 2 && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
- // check for that explicitly too. Horribly hacky, but temporary.
- if (!IsImmUpdate)
+ if (!IsImmUpdate) {
+ // We use a VST1 for v1i64 even if the pseudo says VST2/3/4, so
+ // check for the opcode rather than the number of vector elements.
+ if (isVSTfixed(Opc))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
Ops.push_back(Inc);
- else if (NumVecs > 2 && !isVSTfixed(Opc))
+ }
+ // VST1/VST2 fixed increment does not need Reg0 so only include it in
+ // the operands if not such an opcode.
+ else if (!isVSTfixed(Opc))
Ops.push_back(Reg0);
}
Ops.push_back(SrcReg);
@@ -1993,7 +2038,9 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
SDLoc dl(N);
SDValue MemAddr, Align;
- unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+ // nodes are not intrinsics.
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
@@ -2109,21 +2156,22 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
CurDAG->RemoveDeadNode(N);
}
-void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
+ bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
- const uint16_t *QOpcodes) {
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
SDLoc dl(N);
SDValue MemAddr, Align;
- if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+ unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-
SDValue Chain = N->getOperand(0);
EVT VT = N->getValueType(0);
+ bool is64BitVector = VT.is64BitVector();
unsigned Alignment = 0;
if (NumVecs != 3) {
@@ -2140,49 +2188,84 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
}
Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
- unsigned Opc;
+ unsigned OpcodeIndex;
switch (VT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("unhandled vld-dup type");
- case MVT::v8i8: Opc = DOpcodes[0]; break;
- case MVT::v16i8: Opc = QOpcodes[0]; break;
- case MVT::v4i16: Opc = DOpcodes[1]; break;
- case MVT::v8i16: Opc = QOpcodes[1]; break;
+ case MVT::v8i8:
+ case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v4i16:
+ case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v2f32:
- case MVT::v2i32: Opc = DOpcodes[2]; break;
+ case MVT::v2i32:
case MVT::v4f32:
- case MVT::v4i32: Opc = QOpcodes[2]; break;
- }
-
- SDValue Pred = getAL(CurDAG, dl);
- SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(MemAddr);
- Ops.push_back(Align);
- if (isUpdating) {
- // fixed-stride update instructions don't have an explicit writeback
- // operand. It's implicit in the opcode itself.
- SDValue Inc = N->getOperand(2);
- bool IsImmUpdate =
- isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
- if (NumVecs <= 2 && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- if (!IsImmUpdate)
- Ops.push_back(Inc);
- // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
- else if (NumVecs > 2)
- Ops.push_back(Reg0);
+ case MVT::v4i32: OpcodeIndex = 2; break;
+ case MVT::v1f64:
+ case MVT::v1i64: OpcodeIndex = 3; break;
}
- Ops.push_back(Pred);
- Ops.push_back(Reg0);
- Ops.push_back(Chain);
unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+ if (!is64BitVector)
+ ResTyElts *= 2;
+ EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+
std::vector<EVT> ResTys;
- ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+ ResTys.push_back(ResTy);
if (isUpdating)
ResTys.push_back(MVT::i32);
ResTys.push_back(MVT::Other);
- SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+ SDNode *VLdDup;
+ if (is64BitVector || NumVecs == 1) {
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes0[OpcodeIndex];
+ if (isUpdating) {
+ // fixed-stride update instructions don't have an explicit writeback
+ // operand. It's implicit in the opcode itself.
+ SDValue Inc = N->getOperand(2);
+ bool IsImmUpdate =
+ isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+ if (NumVecs <= 2 && !IsImmUpdate)
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ if (!IsImmUpdate)
+ Ops.push_back(Inc);
+ // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+ else if (NumVecs > 2)
+ Ops.push_back(Reg0);
+ }
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ } else if (NumVecs == 2) {
+ const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain };
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+ dl, ResTys, OpsA);
+
+ Chain = SDValue(VLdA, 1);
+ const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain };
+ VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+ } else {
+ SDValue ImplDef =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+ const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain };
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+ dl, ResTys, OpsA);
+
+ SDValue SuperReg = SDValue(VLdA, 0);
+ Chain = SDValue(VLdA, 1);
+ const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain };
+ VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+ }
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
// Extract the subregisters.
@@ -2191,10 +2274,11 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
} else {
SDValue SuperReg = SDValue(VLdDup, 0);
static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
- unsigned SubIdx = ARM::dsub_0;
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+ for (unsigned Vec = 0; Vec != NumVecs; ++Vec) {
ReplaceUses(SDValue(N, Vec),
CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+ }
}
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
if (isUpdating)
@@ -2253,6 +2337,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
return true;
}
+ assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2277,6 +2362,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
if (LSB < 0)
return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2298,6 +2384,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
// Note: The width operand is encoded as width-1.
unsigned Width = MSB - LSB;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ assert(Srl_imm + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2318,6 +2405,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ assert(LSB + Width <= 32 && "Shouldn't create an invalid ubfx");
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
@@ -2427,7 +2515,7 @@ void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) {
SDValue X = And.getOperand(0);
auto C = dyn_cast<ConstantSDNode>(And.getOperand(1));
- if (!C || !X->hasOneUse())
+ if (!C)
return;
auto Range = getContiguousRangeOfSetBits(C->getAPIntValue());
if (!Range)
@@ -2765,7 +2853,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
}
case ARMISD::SUBE: {
- if (!Subtarget->hasV6Ops())
+ if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
break;
// Look for a pattern to match SMMLS
// (sube a, (smul_loHi a, b), (subc 0, (smul_LOhi(a, b))))
@@ -3026,14 +3114,14 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD1DUPd32 };
static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
ARM::VLD1DUPq32 };
- SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 1, DOpcodes, QOpcodes);
return;
}
case ARMISD::VLD2DUP: {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
ARM::VLD2DUPd32 };
- SelectVLDDup(N, false, 2, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 2, Opcodes);
return;
}
@@ -3041,7 +3129,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
ARM::VLD3DUPd16Pseudo,
ARM::VLD3DUPd32Pseudo };
- SelectVLDDup(N, false, 3, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 3, Opcodes);
return;
}
@@ -3049,7 +3137,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
ARM::VLD4DUPd16Pseudo,
ARM::VLD4DUPd32Pseudo };
- SelectVLDDup(N, false, 4, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, false, 4, Opcodes);
return;
}
@@ -3060,7 +3148,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
ARM::VLD1DUPq16wb_fixed,
ARM::VLD1DUPq32wb_fixed };
- SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 1, DOpcodes, QOpcodes);
return;
}
@@ -3068,7 +3156,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
ARM::VLD2DUPd16wb_fixed,
ARM::VLD2DUPd32wb_fixed };
- SelectVLDDup(N, true, 2, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes);
return;
}
@@ -3076,7 +3164,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
ARM::VLD3DUPd16Pseudo_UPD,
ARM::VLD3DUPd32Pseudo_UPD };
- SelectVLDDup(N, true, 3, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes);
return;
}
@@ -3084,7 +3172,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
ARM::VLD4DUPd16Pseudo_UPD,
ARM::VLD4DUPd32Pseudo_UPD };
- SelectVLDDup(N, true, 4, Opcodes);
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes);
return;
}
@@ -3407,6 +3495,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
+ case Intrinsic::arm_neon_vld1x2: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+ ARM::VLD1q32, ARM::VLD1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VLD1d8QPseudo,
+ ARM::VLD1d16QPseudo,
+ ARM::VLD1d32QPseudo,
+ ARM::VLD1d64QPseudo };
+ SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld1x3: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8TPseudo,
+ ARM::VLD1d16TPseudo,
+ ARM::VLD1d32TPseudo,
+ ARM::VLD1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowTPseudo_UPD,
+ ARM::VLD1q16LowTPseudo_UPD,
+ ARM::VLD1q32LowTPseudo_UPD,
+ ARM::VLD1q64LowTPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighTPseudo,
+ ARM::VLD1q16HighTPseudo,
+ ARM::VLD1q32HighTPseudo,
+ ARM::VLD1q64HighTPseudo };
+ SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld1x4: {
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8QPseudo,
+ ARM::VLD1d16QPseudo,
+ ARM::VLD1d32QPseudo,
+ ARM::VLD1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowQPseudo_UPD,
+ ARM::VLD1q16LowQPseudo_UPD,
+ ARM::VLD1q32LowQPseudo_UPD,
+ ARM::VLD1q64LowQPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighQPseudo,
+ ARM::VLD1q16HighQPseudo,
+ ARM::VLD1q32HighQPseudo,
+ ARM::VLD1q64HighQPseudo };
+ SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
case Intrinsic::arm_neon_vld2: {
static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
ARM::VLD2d32, ARM::VLD1q64 };
@@ -3446,6 +3579,52 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
+ case Intrinsic::arm_neon_vld2dup: {
+ static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+ ARM::VLD2DUPd32, ARM::VLD1q64 };
+ static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo,
+ ARM::VLD2DUPq16EvenPseudo,
+ ARM::VLD2DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudo,
+ ARM::VLD2DUPq16OddPseudo,
+ ARM::VLD2DUPq32OddPseudo };
+ SelectVLDDup(N, /* IsIntrinsic= */ true, false, 2,
+ DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld3dup: {
+ static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo,
+ ARM::VLD3DUPd16Pseudo,
+ ARM::VLD3DUPd32Pseudo,
+ ARM::VLD1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo,
+ ARM::VLD3DUPq16EvenPseudo,
+ ARM::VLD3DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo,
+ ARM::VLD3DUPq16OddPseudo,
+ ARM::VLD3DUPq32OddPseudo };
+ SelectVLDDup(N, /* IsIntrinsic= */ true, false, 3,
+ DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vld4dup: {
+ static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo,
+ ARM::VLD4DUPd16Pseudo,
+ ARM::VLD4DUPd32Pseudo,
+ ARM::VLD1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo,
+ ARM::VLD4DUPq16EvenPseudo,
+ ARM::VLD4DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo,
+ ARM::VLD4DUPq16OddPseudo,
+ ARM::VLD4DUPq32OddPseudo };
+ SelectVLDDup(N, /* IsIntrinsic= */ true, false, 4,
+ DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
case Intrinsic::arm_neon_vld2lane: {
static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
ARM::VLD2LNd16Pseudo,
@@ -3485,6 +3664,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
+ case Intrinsic::arm_neon_vst1x2: {
+ static const uint16_t DOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+ ARM::VST1q32, ARM::VST1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudo,
+ ARM::VST1d16QPseudo,
+ ARM::VST1d32QPseudo,
+ ARM::VST1d64QPseudo };
+ SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst1x3: {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudo,
+ ARM::VST1d16TPseudo,
+ ARM::VST1d32TPseudo,
+ ARM::VST1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD,
+ ARM::VST1q16LowTPseudo_UPD,
+ ARM::VST1q32LowTPseudo_UPD,
+ ARM::VST1q64LowTPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo,
+ ARM::VST1q16HighTPseudo,
+ ARM::VST1q32HighTPseudo,
+ ARM::VST1q64HighTPseudo };
+ SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
+ case Intrinsic::arm_neon_vst1x4: {
+ static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudo,
+ ARM::VST1d16QPseudo,
+ ARM::VST1d32QPseudo,
+ ARM::VST1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD,
+ ARM::VST1q16LowQPseudo_UPD,
+ ARM::VST1q32LowQPseudo_UPD,
+ ARM::VST1q64LowQPseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo,
+ ARM::VST1q16HighQPseudo,
+ ARM::VST1q32HighQPseudo,
+ ARM::VST1q64HighQPseudo };
+ SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+
case Intrinsic::arm_neon_vst2: {
static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
ARM::VST2d32, ARM::VST1q64 };
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
index c4c7ad088c0b..47222a66f798 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -53,7 +53,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -97,6 +96,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -308,13 +308,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setCmpLibcallCC(LC.Op, LC.Cond);
}
}
-
- // Set the correct calling convention for ARMv7k WatchOS. It's just
- // AAPCS_VFP for functions as simple as libcalls.
- if (Subtarget->isTargetWatchABI()) {
- for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
- setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
- }
}
// These libcalls are not available in 32-bit.
@@ -522,6 +515,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
}
+ if (Subtarget->hasFullFP16()) {
+ addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ }
+
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -558,6 +561,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ if (Subtarget->hasFullFP16()) {
+ addQRTypeForNEON(MVT::v8f16);
+ addDRTypeForNEON(MVT::v4f16);
+ }
+
// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
// neither Neon nor VFP support any arithmetic operations on it.
// The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
@@ -820,10 +828,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
+ if (Subtarget->isThumb1Only()) {
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+ }
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -949,7 +959,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
@@ -1036,13 +1046,18 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::SETCC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::f16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
+ }
- // Thumb-1 cannot currently select ARMISD::SUBE.
- if (!Subtarget->isThumb1Only())
- setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+ setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
@@ -1121,6 +1136,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
@@ -1259,6 +1276,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
+ case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
+ case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
+ case ARMISD::VMOVSR: return "ARMISD::VMOVSR";
case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
@@ -1337,6 +1357,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
+ case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
+ case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
@@ -2465,12 +2487,37 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
+ bool ReturnF16 = false;
+
+ if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
+ // Half-precision return values can be returned like this:
+ //
+ // t11 f16 = fadd ...
+ // t12: i16 = bitcast t11
+ // t13: i32 = zero_extend t12
+ // t14: f32 = bitcast t13 <~~~~~~~ Arg
+ //
+ // To avoid code generation for bitcasts, we simply set Arg to the node
+ // that produces the f16 value, t11 in this case.
+ //
+ if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
+ SDValue ZE = Arg.getOperand(0);
+ if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
+ SDValue BC = ZE.getOperand(0);
+ if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
+ Arg = BC.getOperand(0);
+ ReturnF16 = true;
+ }
+ }
+ }
+ }
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ if (!ReturnF16)
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
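
Under the hard-float ABI with +fullfp16, the effect of the ReturnF16 handling above is that a half-precision result stays in an S register and the i16/i32/f32 bitcast chain never reaches machine code. A source-level illustration (assumes a compiler and target with _Float16 support and the FullFP16 extension):

    // With FullFP16 and hard-float calling conventions, the result below is
    // produced and returned directly as an f16 value in s0; the bitcasts shown
    // in the DAG comment above exist only to satisfy type legalization.
    _Float16 half_add(_Float16 a, _Float16 b) {
      return a + b;
    }
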
@@ -2518,7 +2565,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Guarantee that all emitted copies are
// stuck together, avoiding something bad.
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(),
+ ReturnF16 ? MVT::f16 : VA.getLocVT()));
}
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
@@ -2738,7 +2786,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}
-/// \brief Convert a TLS address reference into the correct sequence of loads
+/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
@@ -2959,7 +3007,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
@@ -3675,11 +3723,14 @@ SDValue ARMTargetLowering::LowerFormalArguments(
} else {
const TargetRegisterClass *RC;
- if (RegVT == MVT::f32)
+
+ if (RegVT == MVT::f16)
+ RC = &ARM::HPRRegClass;
+ else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
- else if (RegVT == MVT::f64)
+ else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
RC = &ARM::DPRRegClass;
- else if (RegVT == MVT::v2f64)
+ else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
@@ -3799,8 +3850,8 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
- if (!isLegalICmpImmediate(C)) {
- // Constant does not fit, try adjusting it by one?
+ if (!isLegalICmpImmediate((int32_t)C)) {
+ // Constant does not fit, try adjusting it by one.
switch (CC) {
default: break;
case ISD::SETLT:
@@ -3940,6 +3991,29 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
break;
+ case ISD::UMULO:
+ // We generate a UMUL_LOHI and then check if the high word is 0.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::UMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getConstant(0, dl, MVT::i32));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
+ case ISD::SMULO:
+ // We generate a SMUL_LOHI and then check if all the bits of the high word
+ // are the same as the sign bit of the low word.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::SMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getNode(ISD::SRA, dl, Op.getValueType(),
+ Value.getValue(0),
+ DAG.getConstant(31, dl, MVT::i32)));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
} // switch (...)
return std::make_pair(Value, OverflowCmp);
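
The UMULO/SMULO cases above reduce overflow detection to one widening multiply plus a compare: unsigned multiplication overflows exactly when the high word is non-zero, and signed multiplication overflows exactly when the high word differs from the sign-extension of the low word. The same predicates written as plain C++ (an illustrative restatement, not the lowering itself):

    #include <cstdint>

    // Unsigned 32x32 -> 64 multiply: overflow iff the high 32 bits are non-zero.
    bool umulOverflows(uint32_t a, uint32_t b) {
      uint64_t wide = (uint64_t)a * (uint64_t)b;
      return (uint32_t)(wide >> 32) != 0;
    }

    // Signed 32x32 -> 64 multiply: overflow iff the high word is not just the
    // sign bit of the low word replicated (the SRA-by-31 comparison above).
    bool smulOverflows(int32_t a, int32_t b) {
      int64_t wide = (int64_t)a * (int64_t)b;
      int32_t lo = (int32_t)wide;
      int32_t hi = (int32_t)(wide >> 32);
      return hi != (lo >> 31);
    }
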
@@ -3973,11 +4047,12 @@ static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
SDLoc DL(BoolCarry);
EVT CarryVT = BoolCarry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
// This converts the boolean value carry into the carry flag by doing
- // ARMISD::ADDC Carry, ~0
- return DAG.getNode(ARMISD::ADDC, DL, DAG.getVTList(CarryVT, MVT::i32),
- BoolCarry, DAG.getConstant(NegOne, DL, CarryVT));
+ // ARMISD::SUBC Carry, 1
+ SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
+ DAG.getVTList(CarryVT, MVT::i32),
+ BoolCarry, DAG.getConstant(1, DL, CarryVT));
+ return Carry.getValue(1);
}
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
@@ -4313,6 +4388,48 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
return false;
}
+// Check if a condition of the type x < k ? k : x can be converted into a
+// bit operation instead of conditional moves.
+// Currently this is allowed given:
+// - The conditions and values match up
+// - k is 0 or -1 (all ones)
+// This function will not check the last condition; that's up to the caller.
+// It returns true if the transformation can be made, and in that case
+// returns x in V, and k in SatK.
+static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
+ SDValue &SatK)
+{
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+
+ SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
+ ? &RHS
+ : nullptr;
+
+ // No constant operand in the comparison, early out
+ if (!K)
+ return false;
+
+ SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
+ V = (KTmp == TrueVal) ? FalseVal : TrueVal;
+ SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
+
+ // If the constant from the comparison and the constant from the select,
+ // or the corresponding variables, do not match, early out
+ if (*K != KTmp || V != VTmp)
+ return false;
+
+ if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
+ SatK = *K;
+ return true;
+ }
+
+ return false;
+}
+
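
isLowerSaturatingConditional is consumed by the LowerSELECT_CC change further down, which rewrites selects of the form x < k ? k : x into a shift plus a single bitwise operation when k is 0 or all ones. The scalar identities behind that rewrite, assuming an arithmetic right shift as produced by ISD::SRA (a sketch, not the lowering itself):

    #include <cstdint>

    // x >> 31 is all ones for negative x and zero otherwise (arithmetic shift).
    // Clearing x with the complement of that mask clamps negative inputs to 0.
    int32_t clampBelowAtZero(int32_t x) {
      return x & ~(x >> 31);          // select x < 0 ? 0 : x
    }

    // OR-ing with the sign mask turns every negative input into -1 instead.
    int32_t clampBelowAtMinusOne(int32_t x) {
      return x | (x >> 31);           // select x < -1 ? -1 : x
    }
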
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4331,6 +4448,25 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
}
+ // Try to convert expressions of the form x < k ? k : x (and similar forms)
+ // into more efficient bit operations, which is possible when k is 0 or -1.
+ // On ARM and Thumb-2, which have a flexible second operand, this will result in
+ // single instructions. On Thumb the shift and the bit operation will be two
+ // instructions.
+ // Only allow this transformation on full-width (32-bit) operations
+ SDValue LowerSatConstant;
+ if (VT == MVT::i32 &&
+ isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
+ SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
+ DAG.getConstant(31, dl, VT));
+ if (isNullConstant(LowerSatConstant)) {
+ SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
+ DAG.getAllOnesConstant(dl, VT));
+ return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
+ } else if (isAllOnesConstant(LowerSatConstant))
+ return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
+ }
+
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -4380,9 +4516,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
bool InvalidOnQNaN;
FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
- // Try to generate VMAXNM/VMINNM on ARMv8.
- if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ // Normalize the fp compare. If RHS is zero we keep it there so we match
+ // CMPFPw0 instead of CMPFP.
+ if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
+ (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
@@ -4532,10 +4671,14 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = Cond.getOpcode();
- if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO ||
- Opc == ISD::SSUBO || Opc == ISD::USUBO)) {
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
+ if (Cond.getResNo() == 1 &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || OptimizeMul)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
return SDValue();
@@ -4579,11 +4722,15 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = LHS.getOpcode();
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ Opc == ISD::USUBO || OptimizeMul) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
@@ -4614,8 +4761,6 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Chain, Dest, ARMcc, CCR, Cmp);
}
- assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
-
if (getTargetMachine().Options.UnsafeFPMath &&
(CC == ISD::SETEQ || CC == ISD::SETOEQ ||
CC == ISD::SETNE || CC == ISD::SETUNE)) {
@@ -4979,7 +5124,8 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(N);
SDValue Op = N->getOperand(0);
@@ -4988,8 +5134,78 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
// source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
- assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
- "ExpandBITCAST called for non-i64 type");
+ const bool HasFullFP16 = Subtarget->hasFullFP16();
+
+ if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
+ // FullFP16: half values are passed in S-registers, and we don't
+ // need any of the bitcasts and moves:
+ //
+ // t2: f32,ch = CopyFromReg t0, Register:f32 %0
+ // t5: i32 = bitcast t2
+ // t18: f16 = ARMISD::VMOVhr t5
+ if (Op.getOpcode() != ISD::CopyFromReg ||
+ Op.getValueType() != MVT::f32)
+ return SDValue();
+
+ auto Move = N->use_begin();
+ if (Move->getOpcode() != ARMISD::VMOVhr)
+ return SDValue();
+
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+ SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
+ DAG.ReplaceAllUsesWith(*Move, &Copy);
+ return Copy;
+ }
+
+ if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
+ if (!HasFullFP16)
+ return SDValue();
+ // SoftFP: read half-precision arguments:
+ //
+ // t2: i32,ch = ...
+ // t7: i16 = truncate t2 <~~~~ Op
+ // t8: f16 = bitcast t7 <~~~~ N
+ //
+ if (Op.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
+ MVT::f16, Op.getOperand(0));
+
+ return SDValue();
+ }
+
+ // Half-precision return values
+ if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+ if (!HasFullFP16)
+ return SDValue();
+ //
+ // t11: f16 = fadd t8, t10
+ // t12: i16 = bitcast t11 <~~~ SDNode N
+ // t13: i32 = zero_extend t12
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
+ // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
+ //
+ // transform this into:
+ //
+ // t20: i32 = ARMISD::VMOVrh t11
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
+ //
+ auto ZeroExtend = N->use_begin();
+ if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
+ ZeroExtend->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ auto Copy = ZeroExtend->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg &&
+ Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
+ SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
+ DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
+ return Cvt;
+ }
+ return SDValue();
+ }
+
+ if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+ return SDValue();
// Turn i64->f64 into VMOVDRR.
if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -5566,16 +5782,22 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
return Result;
}
-static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
- assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
+
+ // ARMISD::SUBE expects a carry, not a borrow like ISD::SUBCARRY, so we
+ // have to invert the carry first.
+ Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), Carry);
+ // This converts the boolean value carry into the carry flag.
+ Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
- assert(Carry.getOpcode() != ISD::CARRY_FALSE);
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
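
The inversion above exists because ISD::SUBCARRY passes a borrow bit while ARM's subtract-with-carry consumes a carry flag, where carry = 1 means "no borrow". The same relationship in scalar form (illustrative only):

    #include <cstdint>

    // ARM SBC semantics: Rd = Rn - Op2 - (1 - C), i.e. carry = 1 means no borrow.
    uint32_t sbc(uint32_t lhs, uint32_t rhs, uint32_t carryIn) {
      return lhs - rhs - (1u - carryIn);
    }

    // ISD::SUBCARRY passes a borrow instead, so the lowering first computes
    // carry = 1 - borrow, which is exactly the ISD::SUB node created above.
    uint32_t subcarry(uint32_t lhs, uint32_t rhs, uint32_t borrowIn) {
      return sbc(lhs, rhs, 1u - borrowIn);
    }
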
@@ -5731,23 +5953,34 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
- bool IsDouble = Op.getValueType() == MVT::f64;
+ EVT VT = Op.getValueType();
+ bool IsDouble = (VT == MVT::f64);
ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
const APFloat &FPVal = CFP->getValueAPF();
// Prevent floating-point constants from using literal loads
// when execute-only is enabled.
if (ST->genExecuteOnly()) {
+ // If we can represent the constant as an immediate, don't lower it
+ if (isFPImmLegal(FPVal, VT))
+ return Op;
+ // Otherwise, construct as integer, and move to float register
APInt INTVal = FPVal.bitcastToAPInt();
SDLoc DL(CFP);
- if (IsDouble) {
- SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
- SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
- if (!ST->isLittle())
- std::swap(Lo, Hi);
- return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
- } else {
- return DAG.getConstant(INTVal, DL, MVT::i32);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unknown floating point type!");
+ break;
+ case MVT::f64: {
+ SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
+ SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
+ if (!ST->isLittle())
+ std::swap(Lo, Hi);
+ return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
+ }
+ case MVT::f32:
+ return DAG.getNode(ARMISD::VMOVSR, DL, VT,
+ DAG.getConstant(INTVal, DL, MVT::i32));
}
}
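
For execute-only code the constant can no longer come from a literal pool, so when it has no VMOV-immediate encoding it is rebuilt from its integer bit pattern and moved into the FP register file (VMOVSR for f32, VMOVDRR for f64). The f32 case corresponds to this bit-level reconstruction (an illustrative analogue, not the lowering code):

    #include <cstdint>
    #include <cstring>

    // Materialize a float from its raw IEEE-754 bit pattern; the generated code
    // builds the integer in a core register and moves it to an S register.
    float floatFromBits(uint32_t bits) {
      float f;
      std::memcpy(&f, &bits, sizeof f);
      return f;
    }
    // Example: floatFromBits(0x40490FDBu) yields the closest float to pi.
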
@@ -6598,10 +6831,9 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
}
// Final sanity check before we try to actually produce a shuffle.
- DEBUG(
- for (auto Src : Sources)
- assert(Src.ShuffleVec.getValueType() == ShuffleVT);
- );
+ LLVM_DEBUG(for (auto Src : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
@@ -7490,39 +7722,15 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
return N0;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getNode()->getValueType(0);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
- unsigned Opc;
- bool ExtraOp = false;
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Invalid code");
- case ISD::ADDC: Opc = ARMISD::ADDC; break;
- case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
- case ISD::SUBC: Opc = ARMISD::SUBC; break;
- case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
- }
-
- if (!ExtraOp)
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1), Op.getOperand(2));
-}
-
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
EVT VT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDValue Carry = Op.getOperand(2);
- EVT CarryVT = Carry.getValueType();
SDLoc DL(Op);
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
-
SDValue Result;
if (Op.getOpcode() == ISD::ADDCARRY) {
// This converts the boolean value carry into the carry flag.
@@ -7530,7 +7738,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the addition proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7544,7 +7752,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the subtraction proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7851,7 +8059,7 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- DEBUG(dbgs() << "Lowering node: "; Op.dump());
+ LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
@@ -7879,7 +8087,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
Subtarget);
- case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG);
+ case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
case ISD::SHL:
case ISD::SRL:
case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -7892,7 +8100,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
- case ISD::SETCCE: return LowerSETCCE(Op, DAG);
+ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -7909,10 +8117,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
return LowerUDIV(Op, DAG);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::SADDO:
@@ -7927,7 +8131,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SDIVREM:
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
return LowerDYNAMIC_STACKALLOC(Op, DAG);
llvm_unreachable("Don't know how to custom lower this!");
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
@@ -7981,7 +8185,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
ExpandREAD_REGISTER(N, Results, DAG);
break;
case ISD::BITCAST:
- Res = ExpandBITCAST(N, DAG);
+ Res = ExpandBITCAST(N, DAG, Subtarget);
break;
case ISD::SRL:
case ISD::SRA:
@@ -9055,8 +9259,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {
MachineOperand Def(MI.getOperand(1));
- if (TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
- Def.setIsRenamable(false);
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
.add(Def) // Rn_wb
.add(MI.getOperand(2)) // Rn
@@ -9323,7 +9525,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
}
-/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// Attaches vregs to MEMCPY that it will use as scratch registers
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
@@ -9860,7 +10062,7 @@ static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
return resNode;
}
-static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Look for multiply add opportunities.
@@ -9877,49 +10079,61 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
// V V
// ADDE <- hiAdd
//
- assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
-
- assert(AddeNode->getNumOperands() == 3 &&
- AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+ // In the special case where only the higher part of a signed result is used
+ // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
+ // a constant with the exact value of 0x80000000, we recognize we are dealing
+ // with a "rounded multiply and add" (or subtract) and transform it into
+ // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
+
+ assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
+ AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
+ "Expect an ADDE or SUBE");
+
+ assert(AddeSubeNode->getNumOperands() == 3 &&
+ AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
"ADDE node has the wrong inputs");
- // Check that we are chained to the right ADDC node.
- SDNode* AddcNode = AddeNode->getOperand(2).getNode();
- if (AddcNode->getOpcode() != ARMISD::ADDC)
+ // Check that we are chained to the right ADDC or SUBC node.
+ SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
+ if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
+ (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
+ AddcSubcNode->getOpcode() != ARMISD::SUBC))
return SDValue();
- SDValue AddcOp0 = AddcNode->getOperand(0);
- SDValue AddcOp1 = AddcNode->getOperand(1);
+ SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
+ SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
// Check if the two operands are from the same mul_lohi node.
- if (AddcOp0.getNode() == AddcOp1.getNode())
+ if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
return SDValue();
- assert(AddcNode->getNumValues() == 2 &&
- AddcNode->getValueType(0) == MVT::i32 &&
+ assert(AddcSubcNode->getNumValues() == 2 &&
+ AddcSubcNode->getValueType(0) == MVT::i32 &&
"Expect ADDC with two result values. First: i32");
// Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
// may be an SMLAL, which multiplies two 16-bit values.
- if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::SMUL_LOHI)
- return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
+ if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
+ return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
// Check for the triangle shape.
- SDValue AddeOp0 = AddeNode->getOperand(0);
- SDValue AddeOp1 = AddeNode->getOperand(1);
+ SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
+ SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
- // Make sure that the ADDE operands are not coming from the same node.
- if (AddeOp0.getNode() == AddeOp1.getNode())
+ // Make sure that the ADDE/SUBE operands are not coming from the same node.
+ if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
return SDValue();
- // Find the MUL_LOHI node walking up ADDE's operands.
+ // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
bool IsLeftOperandMUL = false;
- SDValue MULOp = findMUL_LOHI(AddeOp0);
+ SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
if (MULOp == SDValue())
- MULOp = findMUL_LOHI(AddeOp1);
+ MULOp = findMUL_LOHI(AddeSubeOp1);
else
IsLeftOperandMUL = true;
if (MULOp == SDValue())
@@ -9930,63 +10144,88 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
// Figure out the high and low input values to the MLAL node.
- SDValue* HiAdd = nullptr;
- SDValue* LoMul = nullptr;
- SDValue* LowAdd = nullptr;
+ SDValue *HiAddSub = nullptr;
+ SDValue *LoMul = nullptr;
+ SDValue *LowAddSub = nullptr;
- // Ensure that ADDE is from high result of ISD::xMUL_LOHI.
- if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+ // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
+ if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
return SDValue();
if (IsLeftOperandMUL)
- HiAdd = &AddeOp1;
+ HiAddSub = &AddeSubeOp1;
else
- HiAdd = &AddeOp0;
+ HiAddSub = &AddeSubeOp0;
+ // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI
+ // node whose low result is fed to the ADDC/SUBC we are checking.
- // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
- // whose low result is fed to the ADDC we are checking.
-
- if (AddcOp0 == MULOp.getValue(0)) {
- LoMul = &AddcOp0;
- LowAdd = &AddcOp1;
+ if (AddcSubcOp0 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp0;
+ LowAddSub = &AddcSubcOp1;
}
- if (AddcOp1 == MULOp.getValue(0)) {
- LoMul = &AddcOp1;
- LowAdd = &AddcOp0;
+ if (AddcSubcOp1 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp1;
+ LowAddSub = &AddcSubcOp0;
}
if (!LoMul)
return SDValue();
- // If HiAdd is the same node as ADDC or is a predecessor of ADDC the
- // replacement below will create a cycle.
- if (AddcNode == HiAdd->getNode() ||
- AddcNode->isPredecessorOf(HiAdd->getNode()))
+ // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
+ // the replacement below will create a cycle.
+ if (AddcSubcNode == HiAddSub->getNode() ||
+ AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
return SDValue();
// Create the merged node.
SelectionDAG &DAG = DCI.DAG;
- // Build operand list.
+ // Start building the operand list.
SmallVector<SDValue, 8> Ops;
Ops.push_back(LoMul->getOperand(0));
Ops.push_back(LoMul->getOperand(1));
- Ops.push_back(*LowAdd);
- Ops.push_back(*HiAdd);
- SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode),
+ // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
+ // the case, we must be doing signed multiplication and only use the higher
+ // part of the result of the MLAL; furthermore, the LowAddSub must be a
+ // constant addition or subtraction with the exact value 0x80000000.
+ if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
+ FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
+ LowAddSub->getNode()->getOpcode() == ISD::Constant &&
+ static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
+ 0x80000000) {
+ Ops.push_back(*HiAddSub);
+ if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
+ FinalOpc = ARMISD::SMMLSR;
+ } else {
+ FinalOpc = ARMISD::SMMLAR;
+ }
+ SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
+
+ return SDValue(AddeSubeNode, 0);
+ } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
+ // SMMLS is generated during instruction selection and the rest of this
+ // function cannot handle the case where AddcSubcNode is a SUBC.
+ return SDValue();
+
+ // Finish building the operand list for {U/S}MLAL
+ Ops.push_back(*LowAddSub);
+ Ops.push_back(*HiAddSub);
+
+ SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
DAG.getVTList(MVT::i32, MVT::i32), Ops);
// Replace the ADD nodes' uses with the MLAL node's values.
SDValue HiMLALResult(MLALNode.getNode(), 1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
SDValue LoMLALResult(MLALNode.getNode(), 0);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
// Return original node to notify the driver to stop replacing.
- return SDValue(AddeNode, 0);
+ return SDValue(AddeSubeNode, 0);
}
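Editorial sketch (C++, not part of the patch): the "rounded multiply and add" shape that the combine above rewrites into ARMISD::SMMLAR keeps only the high word of a signed 64-bit multiply-accumulate whose low word receives the 0x80000000 rounding bias; with a SUBC of the same constant, the subtract form maps to ARMISD::SMMLSR instead. The helper name below is hypothetical.

#include <cstdint>

// Scalar model of the pattern; unsigned 64-bit arithmetic mirrors the
// modulo-2^64 wrapping of the hardware accumulate.
int32_t rounded_mla(int32_t acc, int32_t a, int32_t b) {
  uint64_t wide = (uint64_t)(uint32_t)acc << 32;   // accumulator in the high word
  wide += (uint64_t)((int64_t)a * (int64_t)b);     // signed 32x32 -> 64 product
  wide += 0x80000000u;                             // rounding bias in the low word
  return (int32_t)(wide >> 32);                    // only the high half is used
}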
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
@@ -10071,13 +10310,13 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG(DCI.DAG);
- if (N->getOpcode() == ARMISD::ADDC) {
- // (ADDC (ADDE 0, 0, C), -1) -> C
+ if (N->getOpcode() == ARMISD::SUBC) {
+ // (SUBC (ADDE 0, 0, C), 1) -> C
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if (LHS->getOpcode() == ARMISD::ADDE &&
isNullConstant(LHS->getOperand(0)) &&
- isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) {
+ isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
}
}
@@ -10095,12 +10334,15 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
}
}
}
+
return SDValue();
}
-static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformAddeSubeCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
if (Subtarget->isThumb1Only()) {
+ SelectionDAG &DAG = DCI.DAG;
SDValue RHS = N->getOperand(1);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
int64_t imm = C->getSExtValue();
@@ -10118,6 +10360,8 @@ static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
N->getOperand(0), RHS, N->getOperand(2));
}
}
+ } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
+ return AddCombineTo64bitMLAL(N, DCI, Subtarget);
}
return SDValue();
}
@@ -10130,7 +10374,7 @@ static SDValue PerformADDECombine(SDNode *N,
const ARMSubtarget *Subtarget) {
// Only ARM and Thumb2 support UMLAL/SMLAL.
if (Subtarget->isThumb1Only())
- return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ return PerformAddeSubeCombine(N, DCI, Subtarget);
// Only perform the checks after legalize when the pattern is available.
if (DCI.isBeforeLegalize()) return SDValue();
@@ -10259,9 +10503,9 @@ static SDValue PerformSHLSimplify(SDNode *N,
// Shift left to compensate for the lshr of C1Int.
SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
- DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); SHL.dump();
- N->dump());
- DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
+ LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
+ SHL.dump(); N->dump());
+ LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
return SDValue(N, 0);
@@ -10432,6 +10676,83 @@ static SDValue PerformMULCombine(SDNode *N,
return SDValue();
}
+static SDValue CombineANDShift(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Allow DAGCombine to pattern-match before we touch the canonical form.
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!N1C)
+ return SDValue();
+
+ uint32_t C1 = (uint32_t)N1C->getZExtValue();
+ // Don't transform uxtb/uxth.
+ if (C1 == 255 || C1 == 65535)
+ return SDValue();
+
+ SDNode *N0 = N->getOperand(0).getNode();
+ if (!N0->hasOneUse())
+ return SDValue();
+
+ if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
+ return SDValue();
+
+ bool LeftShift = N0->getOpcode() == ISD::SHL;
+
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!N01C)
+ return SDValue();
+
+ uint32_t C2 = (uint32_t)N01C->getZExtValue();
+ if (!C2 || C2 >= 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ // We have a pattern of the form "(and (shl x, c2) c1)" or
+ // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
+ // transform to a pair of shifts, to save materializing c1.
+
+ // First pattern: right shift, and c1+1 is a power of two.
+ // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
+ // of two).
+ // FIXME: Use demanded bits?
+ if (!LeftShift && isMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 < C3) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C3 - C2, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
+ // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
+ // is a power of two).
+ // FIXME: Use demanded bits?
+ if (LeftShift && isShiftedMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C2 + C3, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // FIXME: Transform "(and (shl x, c2) c1)" ->
+ // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
+ // c1.
+ return SDValue();
+}
+
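A worked instance of the first pattern above (illustrative values, not taken from the patch): with c1 = 0xFFF and c2 = 3 we get c3 = countLeadingZeros(0xFFF) = 20 and c2 < c3, so "(x >> 3) & 0xFFF" becomes "(x << 17) >> 20", i.e. two shift instructions instead of materializing the 12-bit mask.

#include <cstdint>

// The two forms compute the same value for every x (bits [3..14] of x).
uint32_t masked_srl(uint32_t x) { return (x >> 3) & 0xFFFu; }  // original form
uint32_t two_shifts(uint32_t x) { return (x << 17) >> 20; }    // combined form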
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -10473,6 +10794,10 @@ static SDValue PerformANDCombine(SDNode *N,
return Result;
}
+ if (Subtarget->isThumb1Only())
+ if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
+ return Result;
+
return SDValue();
}
@@ -11021,7 +11346,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
-/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
@@ -11237,6 +11562,12 @@ static SDValue CombineBaseUpdate(SDNode *N,
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
+ // TODO: Support updating VLDxDUP nodes. For now, we just skip
+ // combining base updates for such intrinsics.
+ continue;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
@@ -12315,6 +12646,89 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
}
}
+ if (!VT.isInteger())
+ return SDValue();
+
+ // Materialize a boolean comparison for integers so we can avoid branching.
+ if (isNullConstant(FalseVal)) {
+ if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
+ if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
+ // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting that
+ // right by 5 bits yields 1, otherwise it yields 0.
+ // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
+ DAG.getConstant(5, dl, MVT::i32));
+ } else {
+ // CMOV 0, 1, ==, (CMPZ x, y) ->
+ // (ADDCARRY (SUB x, y), t:0, t:1)
+ // where t = (SUBCARRY 0, (SUB x, y), 0)
+ //
+ // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
+ // x != y. In other words, a carry C == 1 when x == y, C == 0
+ // otherwise.
+ // The final ADDCARRY computes
+ // x - y + (0 - (x - y)) + C == C
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
+ // ISD::SUBCARRY returns a borrow, but what we actually want here is
+ // the carry.
+ SDValue Carry =
+ DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
+ Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
+ }
+ } else if (CC == ARMCC::NE && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
+ // This seems pointless but will allow us to combine it further below.
+ // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
+ N->getOperand(3), Cmp);
+ }
+ } else if (isNullConstant(TrueVal)) {
+ if (CC == ARMCC::EQ && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
+ // This seems pointless but will allow us to combine it further below.
+ // Note that we swap == for !=, as this is the dual of the case above.
+ // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32),
+ N->getOperand(3), Cmp);
+ }
+ }
+
+ // On Thumb1, the DAG above may be further combined if z is a power of 2
+ // (z == 2 ^ K).
+ // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
+ // merge t3, t4
+ // where t1 = (SUBCARRY (SUB x, y), z, 0)
+ // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
+ // t3 = if K != 0 then (SHL t2:0, K) else t2:0
+ // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
+ const APInt *TrueConst;
+ if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
+ (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
+ (FalseVal.getOperand(1) == RHS) &&
+ (TrueConst = isPowerOf2Constant(TrueVal))) {
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ unsigned ShiftAmount = TrueConst->logBase2();
+ if (ShiftAmount)
+ TrueVal = DAG.getConstant(1, dl, VT);
+ SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
+ Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
+ // Make it a carry, not a borrow.
+ SDValue Carry = DAG.getNode(
+ ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
+ Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);
+
+ if (ShiftAmount)
+ Res = DAG.getNode(ISD::SHL, dl, VT, Res,
+ DAG.getConstant(ShiftAmount, dl, MVT::i32));
+ }
+
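To spell out the borrow/carry arithmetic used by the non-CLZ path above (an editorial model, not part of the patch): with Sub = x - y, USUBO(0, Sub) produces a borrow exactly when Sub != 0; converting that borrow into a carry and feeding it into ADDCARRY(Sub, -Sub, carry) leaves just the carry, which is 1 precisely when x == y.

#include <cstdint>

// Scalar model of "CMOV 0, 1, ==, (CMPZ x, y)" after the rewrite.
uint32_t cmpz_eq(uint32_t x, uint32_t y) {
  uint32_t sub    = x - y;
  uint32_t neg    = 0u - sub;                // value of USUBO(0, sub)
  uint32_t borrow = (sub != 0u) ? 1u : 0u;   // borrow of USUBO(0, sub)
  uint32_t carry  = 1u - borrow;             // the code flips the borrow into a carry
  return sub + neg + carry;                  // ADDCARRY: sub + (-sub) + carry == carry
}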
if (Res.getNode()) {
KnownBits Known;
DAG.computeKnownBits(SDValue(N,0), Known);
@@ -12347,7 +12761,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
- case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
@@ -12433,13 +12847,22 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
case Intrinsic::arm_neon_vst4:
@@ -12463,6 +12886,10 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
bool *Fast) const {
+ // Depends on what it gets converted into if the type is weird.
+ if (!VT.isSimple())
+ return false;
+
// The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
@@ -12569,6 +12996,24 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool ARMTargetLowering::isFNegFree(EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS) that negate
+ // values directly (fneg is free), so we don't want to let the DAG combiner
+ // rewrite fneg into xors and other instructions. For f16 and FullFP16
+ // argument passing, some bitcast nodes may be introduced that would trigger
+ // this rewrite; returning true here avoids that.
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::f16:
+ return Subtarget->hasFullFP16();
+ }
+
+ return false;
+}
+
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -12837,9 +13282,11 @@ bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
// Thumb2 and ARM modes can use cmn for negative immediates.
if (!Subtarget->isThumb())
- return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
if (Subtarget->isThumb2())
- return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
// Thumb1 doesn't have cmn and only supports 8-bit immediates.
return Imm >= 0 && Imm <= 255;
}
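The relaxed check above accepts an immediate when either the compare itself or its negation (via cmn) is encodable. A self-contained model of that rule follows (editorial sketch; isModifiedImm re-implements the ARM "8-bit value rotated right by an even amount" encoding and merely stands in for ARM_AM::getSOImmVal succeeding). For example, a compare against -256 is legal because cmn can encode #256.

#include <cstdint>

// An ARM modified immediate is an 8-bit value rotated right by an even amount.
static bool isModifiedImm(uint32_t V) {
  for (unsigned R = 0; R < 32; R += 2) {
    uint32_t Rot = R ? ((V << R) | (V >> (32 - R))) : V;  // rotate left by R
    if (Rot <= 0xFFu)
      return true;
  }
  return false;
}

// Legal if CMP can encode Imm directly, or CMN can encode its negation.
bool legalARMCmpImmediate(uint32_t Imm) {
  return isModifiedImm(Imm) || isModifiedImm(0u - Imm);
}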
@@ -13271,8 +13718,14 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
return RCPair(0U, &ARM::QPR_8RegClass);
break;
case 't':
+ if (VT == MVT::Other)
+ break;
if (VT == MVT::f32 || VT == MVT::i32)
return RCPair(0U, &ARM::SPRRegClass);
+ if (VT.getSizeInBits() == 64)
+ return RCPair(0U, &ARM::DPR_VFP2RegClass);
+ if (VT.getSizeInBits() == 128)
+ return RCPair(0U, &ARM::QPR_VFP2RegClass);
break;
}
}
@@ -13602,6 +14055,20 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+ "no-stack-arg-probe")) {
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
+ SDValue Ops[2] = { SP, Chain };
+ return DAG.getMergeValues(Ops, DL);
+ }
+
SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
DAG.getConstant(2, DL, MVT::i32));
@@ -13665,6 +14132,8 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (!Subtarget->hasVFP3())
return false;
+ if (VT == MVT::f16 && Subtarget->hasFullFP16())
+ return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
@@ -13686,7 +14155,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane: {
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13700,6 +14172,21 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad;
return true;
}
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile loads with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
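For the new vld1x cases above, the conservative memVT is sized from the intrinsic's aggregate result: for instance, a hypothetical vld1x3 returning three <4 x i32> vectors is 384 bits, so NumElts = 384 / 64 = 6 and memVT becomes v6i64, covering the whole 48-byte access.

// Editorial helper showing the sizing arithmetic only (not an LLVM API).
unsigned vld1xMemVTElts(unsigned NumVecs, unsigned VecBits) {
  return NumVecs * VecBits / 64;   // i64 elements in the conservative memVT
}
// vld1xMemVTElts(3, 128) == 6  ->  memVT = v6i64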
case Intrinsic::arm_neon_vst1:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
@@ -13726,6 +14213,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ unsigned NumElts = 0;
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile stores with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13777,7 +14285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -14073,7 +14581,7 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
-/// \brief Lower an interleaved load into a vldN intrinsic.
+/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
@@ -14191,7 +14699,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
return true;
}
-/// \brief Lower an interleaved store into a vstN intrinsic.
+/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
@@ -14389,7 +14897,19 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
return (Members > 0 && Members <= 4);
}
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// Return the correct alignment for the current calling convention.
+unsigned
+ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const {
+ if (!ArgTy->isVectorTy())
+ return DL.getABITypeAlignment(ArgTy);
+
+ // Avoid over-aligning vector parameters. It would require realigning the
+ // stack and waste space for no real benefit.
+ return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
+}
+
+/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
@@ -14401,7 +14921,7 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
HABaseType Base = HA_UNKNOWN;
uint64_t Members = 0;
bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
- DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+ LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
return IsHA || IsIntArray;
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
index bf63dfae4407..50b4c2977fb5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -21,7 +21,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -31,6 +30,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/MachineValueType.h"
#include <utility>
namespace llvm {
@@ -102,6 +102,7 @@ class VectorType;
VMOVRRD, // double to two gprs.
VMOVDRR, // Two gprs to double.
+ VMOVSR, // move gpr to single, used for an f32 literal constructed in a gpr
EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
@@ -171,6 +172,10 @@ class VectorType;
// Vector move f32 immediate:
VMOVFPIMM,
+ // Move H <-> R, clearing top 16 bits
+ VMOVrh,
+ VMOVhr,
+
// Vector duplicate:
VDUP,
VDUPLANE,
@@ -203,6 +208,8 @@ class VectorType;
SMLALDX, // Signed multiply accumulate long dual exchange
SMLSLD, // Signed multiply subtract long dual
SMLSLDX, // Signed multiply subtract long dual exchange
+ SMMLAR, // Signed multiply long, round and add
+ SMMLSR, // Signed multiply long, subtract and round
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
@@ -325,6 +332,7 @@ class VectorType;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ bool isFNegFree(EVT VT) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
@@ -346,7 +354,7 @@ class VectorType;
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
- /// \brief Returns true if the addresing mode representing by AM is legal
+ /// Returns true if the addressing mode represented by AM is legal
/// for the Thumb1 target, for a load/store of the specified type.
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
@@ -474,7 +482,7 @@ class VectorType;
MachineFunction &MF,
unsigned Intrinsic) const override;
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
@@ -484,7 +492,7 @@ class VectorType;
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
- /// \brief Returns true if an argument of type Ty needs to be passed in a
+ /// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
@@ -571,6 +579,10 @@ class VectorType;
void finalizeLowering(MachineFunction &MF) const override;
+ /// Return the correct alignment for the current calling convention.
+ unsigned getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
index f7c6c32eb4dc..70aded247f65 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -108,6 +108,7 @@ def AddrModeT2_so : AddrMode<13>;
def AddrModeT2_pc : AddrMode<14>;
def AddrModeT2_i8s4 : AddrMode<15>;
def AddrMode_i12 : AddrMode<16>;
+def AddrMode5FP16 : AddrMode<17>;
// Load / store index mode.
class IndexMode<bits<2> val> {
@@ -1023,6 +1024,12 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
}
+class FP16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [HasFP16];
+}
+class FullFP16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [HasFullFP16];
+}
//===----------------------------------------------------------------------===//
// Thumb Instruction Format Definitions.
//
@@ -1527,7 +1534,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+ : VFPI<oops, iops, AddrMode5FP16, 4, IndexModeNone,
VFPLdStFrm, itin, opc, asm, "", pattern> {
list<Predicate> Predicates = [HasFullFP16];
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index a0e2ac4cbc6f..397c9dadb4ac 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -135,3 +135,31 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
.add(predOps(ARMCC::AL));
}
+
+std::pair<unsigned, unsigned>
+ARMInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ const unsigned Mask = ARMII::MO_OPTION_MASK;
+ return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+ARMInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace ARMII;
+
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"}};
+ return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+ARMInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+ using namespace ARMII;
+
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT, "arm-got"},
+ {MO_SBREL, "arm-sbrel"},
+ {MO_DLLIMPORT, "arm-dllimport"},
+ {MO_SECREL, "arm-secrel"},
+ {MO_NONLAZY, "arm-nonlazy"}};
+ return makeArrayRef(TargetFlags);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
index c87fb97448c9..c54c987134df 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -38,6 +38,13 @@ public:
///
const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
private:
void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
index eb8526bfeadf..d4c342cee5c0 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -105,6 +105,14 @@ def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
+def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+
+def ARMsmmlar : SDNode<"ARMISD::SMMLAR", SDT_MulHSR>;
+def ARMsmmlsr : SDNode<"ARMISD::SMMLSR", SDT_MulHSR>;
+
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
@@ -245,6 +253,8 @@ def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
+ AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -259,6 +269,10 @@ def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
+ AssemblerPredicate<"FeatureSHA2", "sha2">;
+def HasAES : Predicate<"Subtarget->hasAES()">,
+ AssemblerPredicate<"FeatureAES", "aes">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
@@ -875,6 +889,16 @@ def bf_inv_mask_imm : Operand<i32>,
let PrintMethod = "printBitfieldInvMaskImmOperand";
let DecoderMethod = "DecodeBitfieldMaskOperand";
let ParserMatchClass = BitfieldAsmOperand;
+ let GISelPredicateCode = [{
+ // There are better ways of implementing this check. IntImmLeaf<> would be
+ // equivalent and have less boilerplate, but we need a test for C++
+ // predicates, and this one causes new rules to be imported into GlobalISel
+ // without requiring additional features first.
+ const auto &MO = MI.getOperand(1);
+ if (!MO.isCImm())
+ return false;
+ return ARM::isBitFieldInvertedMask(MO.getCImm()->getZExtValue());
+ }];
}
def imm1_32_XFORM: SDNodeXForm<imm, [{
@@ -1996,6 +2020,7 @@ def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
+def : InstAlias<"csdb$p", (HINT 20, pred:$p)>, Requires<[IsARM, HasV6K]>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
"\t$Rd, $Rn, $Rm",
@@ -3331,7 +3356,7 @@ defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
// Move Instructions.
//
-let hasSideEffects = 0 in
+let hasSideEffects = 0, isMoveReg = 1 in
def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
"mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
@@ -3904,6 +3929,8 @@ def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
let Inst{11-4} = 0b00000000;
let Inst{15-12} = Rd;
let Inst{3-0} = Rm;
+
+ let Unpredictable{19-16} = 0b1111;
}
def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
@@ -3917,10 +3944,12 @@ def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{19-16} = 0b1111;
}
-def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+def MVNsr : AsI1<0b1111, (outs GPRnopc:$Rd), (ins so_reg_reg:$shift),
DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
- [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
+ [(set GPRnopc:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
Sched<[WriteALU]> {
bits<4> Rd;
bits<12> shift;
@@ -3932,6 +3961,8 @@ def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{19-16} = 0b1111;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
@@ -4143,7 +4174,8 @@ def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
}
def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+ IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, (i32 0)))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
@@ -4158,7 +4190,8 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
@@ -4170,7 +4203,8 @@ def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPR:$Rd, (ARMsmmlsr GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
@@ -4785,6 +4819,15 @@ def instsyncb_opt : Operand<i32> {
let DecoderMethod = "DecodeInstSyncBarrierOption";
}
+def TraceSyncBarrierOptOperand : AsmOperandClass {
+ let Name = "TraceSyncBarrierOpt";
+ let ParserMethod = "parseTraceSyncBarrierOptOperand";
+}
+def tsb_opt : Operand<i32> {
+ let PrintMethod = "printTraceSyncBOption";
+ let ParserMatchClass = TraceSyncBarrierOptOperand;
+}
+
// Memory barriers protect the atomic sequences
let hasSideEffects = 1 in {
def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
@@ -4811,6 +4854,13 @@ def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
let Inst{31-4} = 0xf57ff06;
let Inst{3-0} = opt;
}
+
+let hasNoSchedulingInfo = 1 in
+def TSB : AInoP<(outs), (ins tsb_opt:$opt), MiscFrm, NoItinerary,
+ "tsb", "\t$opt", []>, Requires<[IsARM, HasV8_4a]> {
+ let Inst{31-0} = 0xe320f012;
+}
+
}
let usesCustomInserter = 1, Defs = [CPSR] in {
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
index cd67dded5853..4525eec8da03 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -48,46 +48,28 @@ def nImmVMOVI32 : Operand<i32> {
let ParserMatchClass = nImmVMOVI32AsmOperand;
}
-def nImmVMOVI16AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi16vmovByteReplicate";
- let PredicateMethod = "isNEONi16ByteReplicate";
- let RenderMethod = "addNEONvmovByteReplicateOperands";
-}
-def nImmVMOVI32AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi32vmovByteReplicate";
- let PredicateMethod = "isNEONi32ByteReplicate";
- let RenderMethod = "addNEONvmovByteReplicateOperands";
-}
-def nImmVMVNI16AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi16invByteReplicate";
- let PredicateMethod = "isNEONi16ByteReplicate";
- let RenderMethod = "addNEONinvByteReplicateOperands";
-}
-def nImmVMVNI32AsmOperandByteReplicate :
- AsmOperandClass {
- let Name = "NEONi32invByteReplicate";
- let PredicateMethod = "isNEONi32ByteReplicate";
- let RenderMethod = "addNEONinvByteReplicateOperands";
-}
-
-def nImmVMOVI16ByteReplicate : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate;
+class nImmVMOVIAsmOperandReplicate<ValueType From, ValueType To>
+ : AsmOperandClass {
+ let Name = "NEONi" # To.Size # "vmovi" # From.Size # "Replicate";
+ let PredicateMethod = "isNEONmovReplicate<" # From.Size # ", " # To.Size # ">";
+ let RenderMethod = "addNEONvmovi" # From.Size # "ReplicateOperands";
}
-def nImmVMOVI32ByteReplicate : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate;
+
+class nImmVINVIAsmOperandReplicate<ValueType From, ValueType To>
+ : AsmOperandClass {
+ let Name = "NEONi" # To.Size # "invi" # From.Size # "Replicate";
+ let PredicateMethod = "isNEONinvReplicate<" # From.Size # ", " # To.Size # ">";
+ let RenderMethod = "addNEONinvi" # From.Size # "ReplicateOperands";
}
-def nImmVMVNI16ByteReplicate : Operand<i32> {
+
+class nImmVMOVIReplicate<ValueType From, ValueType To> : Operand<i32> {
let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate;
+ let ParserMatchClass = nImmVMOVIAsmOperandReplicate<From, To>;
}
-def nImmVMVNI32ByteReplicate : Operand<i32> {
+
+class nImmVINVIReplicate<ValueType From, ValueType To> : Operand<i32> {
let PrintMethod = "printNEONModImmOperand";
- let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate;
+ let ParserMatchClass = nImmVINVIAsmOperandReplicate<From, To>;
}
def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
@@ -227,7 +209,7 @@ def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass {
let ParserMethod = "parseVectorList";
let RenderMethod = "addVecListOperands";
}
-def VecListDPairSpacedAllLanes : RegisterOperand<DPair,
+def VecListDPairSpacedAllLanes : RegisterOperand<DPairSpc,
"printVectorListTwoSpacedAllLanes"> {
let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
}
@@ -788,10 +770,22 @@ defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+def VLD1d8TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d16TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d32TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q8HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q8LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q16HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q16LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q32HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q32LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q64HighTPseudo : VLDQQQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1q64LowTPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+
// ...with 4 registers
class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
@@ -829,10 +823,22 @@ defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+def VLD1d8QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d16QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d32QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q8LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q8HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q16LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q16HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q32LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q32HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q64LowQPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1q64HighQPseudo : VLDQQQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
InstrItinClass itin, Operand AddrMode>
@@ -1512,6 +1518,13 @@ def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
+def VLD2DUPq8EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+
// ...with address register writeback:
multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
Operand AddrMode> {
@@ -1572,6 +1585,13 @@ def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">;
def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
+def VLD3DUPq8EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq8OddPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16OddPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32OddPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+
// ...with address register writeback:
class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
@@ -1618,6 +1638,13 @@ def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">;
def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+def VLD4DUPq8EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq8OddPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16OddPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32OddPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+
// ...with address register writeback:
class VLD4DUPWB<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1111, op7_4,
@@ -1795,10 +1822,22 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+def VST1d8TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d16TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d32TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
+def VST1q8HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q8LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q16HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q16LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q32HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q32LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q64HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1q64LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+
// ...with 4 registers
class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b00, 0b0010, op7_4, (outs),
@@ -1838,10 +1877,22 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+def VST1d8QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d16QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d32QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
+def VST1q8HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q8LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q16HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q16LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q32HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q32LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q64HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1q64LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+
// VST2 : Vector Store (multiple 2-element structures)
class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
InstrItinClass itin, Operand AddrMode>
@@ -4700,37 +4751,59 @@ def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
// We put them in the VFPV8 decoder namespace because the ARM and Thumb
// encodings are the same and thus no further bit twiddling is necessary
// in the disassembler.
-let Predicates = [HasDotProd], DecoderNamespace = "VFPV8" in {
-
-def VUDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b1,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b0,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
-def VUDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b1,
- (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b0,
- (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
- N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
+class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
+ ValueType AccumTy, ValueType InputTy,
+ SDPatternOperator OpNode> :
+ N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD,
+ Asm, AsmTy,
+ [(set (AccumTy RegTy:$dst),
+ (OpNode (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy RegTy:$Vm)))]> {
+ let Predicates = [HasDotProd];
+ let DecoderNamespace = "VFPV8";
+ let Constraints = "$dst = $Vd";
+}
+
+def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
+def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
+def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
+def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
// Indexed dot product instructions:
-class DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty> :
- N3Vnp<0b11100, 0b10, 0b1101, Q, U,
- (outs Ty:$Vd), (ins Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
- N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
- bit lane;
- let Inst{5} = lane;
- let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
+ ValueType AccumType, ValueType InputType, SDPatternOperator OpNode,
+ dag RHS> {
+ def "" : N3Vnp<0b11100, 0b10, 0b1101, Q, U, (outs Ty:$dst),
+ (ins Ty:$Vd, Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+ N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
+ bit lane;
+ let Inst{5} = lane;
+ let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+ let Constraints = "$dst = $Vd";
+ let Predicates = [HasDotProd];
+ let DecoderNamespace = "VFPV8";
+ }
+
+ def : Pat<
+ (AccumType (OpNode (AccumType Ty:$Vd),
+ (InputType Ty:$Vn),
+ (InputType (bitconvert (AccumType
+ (NEONvduplane (AccumType Ty:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>;
}
-def VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR>;
-def VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR>;
-def VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR>;
-def VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR>;
+defm VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR, v2i32, v8i8,
+ int_arm_neon_udot, (v2i32 DPR_VFP2:$Vm)>;
+defm VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR, v2i32, v8i8,
+ int_arm_neon_sdot, (v2i32 DPR_VFP2:$Vm)>;
+defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8,
+ int_arm_neon_udot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8,
+ int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
-} // HasDotProd
// ARMv8.3 complex operations
class BaseN3VCP8ComplexTied<bit op21, bit op4, bit s, bit q,
@@ -5340,23 +5413,19 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
"vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+def : Pat<(v8i16 (abs (sub (zext (v8i8 DPR:$opA)), (zext (v8i8 DPR:$opB))))),
+ (VABDLuv8i16 DPR:$opA, DPR:$opB)>;
+def : Pat<(v4i32 (abs (sub (zext (v4i16 DPR:$opA)), (zext (v4i16 DPR:$opB))))),
+ (VABDLuv4i32 DPR:$opA, DPR:$opB)>;
+
+// ISD::ABS is not legal for v2i64, so VABDL needs to be matched from the
+// shift/xor pattern for ABS.
+
def abd_shr :
PatFrag<(ops node:$in1, node:$in2, node:$shift),
(NEONvshrs (sub (zext node:$in1),
(zext node:$in2)), (i32 $shift))>;
-def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))),
- (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)),
- (zext (v8i8 DPR:$opB))),
- (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))),
- (VABDLuv8i16 DPR:$opA, DPR:$opB)>;
-
-def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)),
- (v4i32 (add (sub (zext (v4i16 DPR:$opA)),
- (zext (v4i16 DPR:$opB))),
- (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))),
- (VABDLuv4i32 DPR:$opA, DPR:$opB)>;
-
def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
(v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
(zext (v2i32 DPR:$opB))),
@@ -5933,34 +6002,57 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
} // isReMaterializable, isAsCheapAsAMove
// Add support for bytes replication feature, so it could be GAS compatible.
-// E.g. instructions below:
-// "vmov.i32 d0, 0xffffffff"
-// "vmov.i32 d0, 0xabababab"
-// "vmov.i16 d0, 0xabab"
-// are incorrect, but we could deal with such cases.
-// For last two instructions, for example, it should emit:
-// "vmov.i8 d0, 0xab"
-def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
-
-// Also add same support for VMVN instructions. So instruction:
-// "vmvn.i32 d0, 0xabababab"
-// actually means:
-// "vmov.i8 d0, 0x54"
-def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
- (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
- (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+multiclass NEONImmReplicateI8InstAlias<ValueType To> {
+ // E.g. instructions below:
+ // "vmov.i32 d0, #0xffffffff"
+ // "vmov.i32 d0, #0xabababab"
+ // "vmov.i16 d0, #0xabab"
+ // cannot be encoded directly, but we can still handle such cases.
+ // For the last two instructions, for example, the assembler should emit:
+ // "vmov.i8 d0, #0xab"
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVMOVIReplicate<i8, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVMOVIReplicate<i8, To>:$Vm, pred:$p)>;
+ // Also add the same support for VMVN instructions. So the instruction:
+ // "vmvn.i32 d0, #0xabababab"
+ // actually means:
+ // "vmov.i8 d0, #0x54"
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv8i8 DPR:$Vd, nImmVINVIReplicate<i8, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (VMOVv16i8 QPR:$Vd, nImmVINVIReplicate<i8, To>:$Vm, pred:$p)>;
+}
+
+defm : NEONImmReplicateI8InstAlias<i16>;
+defm : NEONImmReplicateI8InstAlias<i32>;
+defm : NEONImmReplicateI8InstAlias<i64>;
+
+// Similar to above for types other than i8, e.g.:
+// "vmov.i32 d0, #0xab00ab00" -> "vmov.i16 d0, #0xab00"
+// "vmvn.i64 q0, #0xab000000ab000000" -> "vmvn.i32 q0, #0xab000000"
+// In this case we do not canonicalize VMVN to VMOV
+multiclass NEONImmReplicateInstAlias<ValueType From, NeonI V8, NeonI V16,
+ NeonI NV8, NeonI NV16, ValueType To> {
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (V8 DPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmov${p}.i" # To.Size # " $Vd, $Vm",
+ (V16 QPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (NV8 DPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+ def : NEONInstAlias<"vmvn${p}.i" # To.Size # " $Vd, $Vm",
+ (NV16 QPR:$Vd, nImmVMOVIReplicate<From, To>:$Vm, pred:$p)>;
+}
+
+defm : NEONImmReplicateInstAlias<i16, VMOVv4i16, VMOVv8i16,
+ VMVNv4i16, VMVNv8i16, i32>;
+defm : NEONImmReplicateInstAlias<i16, VMOVv4i16, VMOVv8i16,
+ VMVNv4i16, VMVNv8i16, i64>;
+defm : NEONImmReplicateInstAlias<i32, VMOVv2i32, VMOVv4i32,
+ VMVNv2i32, VMVNv4i32, i64>;
+// TODO: add "VMOV <-> VMVN" conversion for cases like
+// "vmov.i32 d0, #0xffaaffaa" -> "vmvn.i16 d0, #0x55"
+// "vmvn.i32 d0, #0xaaffaaff" -> "vmov.i16 d0, #0xff00"
// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
// require zero cycles to execute so they should be used wherever possible for
@@ -6865,6 +6957,17 @@ class N3VSPat<SDNode OpNode, NeonI Inst>
(v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+class N3VSPatFP16<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f16 (OpNode HPR:$a, HPR:$b)),
+ (EXTRACT_SUBREG
+ (v4f16 (COPY_TO_REGCLASS (Inst
+ (INSERT_SUBREG
+ (v4f16 (COPY_TO_REGCLASS (v4f16 (IMPLICIT_DEF)), DPR_VFP2)),
+ HPR:$a, ssub_0),
+ (INSERT_SUBREG
+ (v4f16 (COPY_TO_REGCLASS (v4f16 (IMPLICIT_DEF)), DPR_VFP2)),
+ HPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
: NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
(EXTRACT_SUBREG
@@ -6907,6 +7010,8 @@ def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
def : N2VSPat<fabs, VABSfd>;
def : N2VSPat<fneg, VNEGfd>;
+def : N3VSPatFP16<fmaxnan, VMAXhd>, Requires<[HasFullFP16]>;
+def : N3VSPatFP16<fminnan, VMINhd>, Requires<[HasFullFP16]>;
def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
@@ -6930,6 +7035,9 @@ def : VFPPat<(f64 (uint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))),
def : Pat<(f32 (bitconvert GPR:$a)),
(EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
Requires<[HasNEON, DontUseVMOVSR]>;
+def : Pat<(arm_vmovsr GPR:$a),
+ (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+ Requires<[HasNEON, DontUseVMOVSR]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -6966,9 +7074,11 @@ def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
let Predicates = [IsLE] in {
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
}
def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
@@ -6997,6 +7107,7 @@ let Predicates = [IsLE] in {
def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
@@ -7014,6 +7125,7 @@ def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
}
@@ -7039,6 +7151,7 @@ let Predicates = [IsBE] in {
def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
@@ -7060,6 +7173,7 @@ let Predicates = [IsBE] in {
def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
@@ -7068,10 +7182,12 @@ let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
index c2bcc087e077..88aab47a79bf 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -270,6 +270,14 @@ def t_addrmode_sp : MemOperand,
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
+// Inspects parent to determine whether an or instruction can be implemented as
+// an add (i.e. whether we know overflow won't occur in the add).
+def AddLikeOrOp : ComplexPattern<i32, 1, "SelectAddLikeOr", [],
+ [SDNPWantParent]>;
+
+// Pattern to exclude immediates from matching
+def non_imm32 : PatLeaf<(i32 GPR), [{ return !isa<ConstantSDNode>(N); }]>;
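The idea behind SelectAddLikeOr is that an OR can be lowered as an ADD whenever the operands are proven to have no set bits in common, since then no carries can be generated and both operations produce the same result. A minimal C++ sketch of that reasoning over known-zero masks (illustrative; the in-tree code queries SelectionDAG known bits):

    #include <cassert>
    #include <cstdint>

    // If no bit position can be set in both operands, then a | b == a + b,
    // because the addition can never carry out of any bit.
    static bool orIsAddLike(uint32_t knownZeroA, uint32_t knownZeroB) {
      uint32_t mayBeSetA = ~knownZeroA;
      uint32_t mayBeSetB = ~knownZeroB;
      return (mayBeSetA & mayBeSetB) == 0;
    }

    int main() {
      // a has its low 4 bits known clear, b is known to fit in 4 bits.
      assert(orIsAddLike(0x0000000fu, 0xfffffff0u));
      uint32_t a = 0x120, b = 0x7;
      assert((a | b) == (a + b));
      return 0;
    }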
+
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//
@@ -997,6 +1005,15 @@ let isAdd = 1 in {
}
}
+// Thumb has more flexible short encodings for ADD than ORR, so use those where
+// possible.
+def : T1Pat<(or AddLikeOrOp:$Rn, imm0_7:$imm), (tADDi3 $Rn, imm0_7:$imm)>;
+
+def : T1Pat<(or AddLikeOrOp:$Rn, imm8_255:$imm), (tADDi8 $Rn, imm8_255:$imm)>;
+
+def : T1Pat<(or AddLikeOrOp:$Rn, tGPR:$Rm), (tADDrr $Rn, $Rm)>;
+
+
def : tInstAlias <"add${s}${p} $Rdn, $Rm",
(tADDrr tGPR:$Rdn,s_cc_out:$s, tGPR:$Rdn, tGPR:$Rm, pred:$p)>;
@@ -1154,7 +1171,7 @@ def : tInstAlias <"movs $Rdn, $imm",
// A7-73: MOV(2) - mov setting flag.
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, isMoveReg = 1 in {
def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
2, IIC_iMOVr,
"mov", "\t$Rd, $Rm", "", []>,
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 4592249f5795..c7133b6483ef 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -2104,6 +2104,12 @@ def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm",
(t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
def : t2InstSubst<"subw${p} $rd, $rn, $imm",
(t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstSubst<"subw${p} $Rd, $Rn, $imm",
+ (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstSubst<"sub${s}${p} $rd, $rn, $imm",
+ (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"sub${p} $rd, $rn, $imm",
+ (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
// RSB
defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
@@ -2594,6 +2600,18 @@ def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm),
def : T2Pat<(t2_so_imm_not:$src),
(t2MVNi t2_so_imm_not:$src)>;
+// There are shorter Thumb encodings for ADD than ORR, so to increase
+// Thumb2SizeReduction's chances later on we select a t2ADD for an or where
+// possible.
+def : T2Pat<(or AddLikeOrOp:$Rn, t2_so_imm:$imm),
+ (t2ADDri $Rn, t2_so_imm:$imm)>;
+
+def : T2Pat<(or AddLikeOrOp:$Rn, imm0_4095:$Rm),
+ (t2ADDri12 $Rn, imm0_4095:$Rm)>;
+
+def : T2Pat<(or AddLikeOrOp:$Rn, non_imm32:$Rm),
+ (t2ADDrr $Rn, $Rm)>;
+
//===----------------------------------------------------------------------===//
// Multiply Instructions.
//
@@ -2661,7 +2679,9 @@ class T2SMMUL<bits<4> op7_4, string opc, list<dag> pattern>
}
def t2SMMUL : T2SMMUL<0b0000, "smmul", [(set rGPR:$Rd, (mulhs rGPR:$Rn,
rGPR:$Rm))]>;
-def t2SMMULR : T2SMMUL<0b0001, "smmulr", []>;
+def t2SMMULR :
+ T2SMMUL<0b0001, "smmulr",
+ [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, (i32 0)))]>;
class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
list<dag> pattern>
@@ -2677,9 +2697,11 @@ class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
def t2SMMLA : T2FourRegSMMLA<0b101, 0b0000, "smmla",
[(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>;
-def t2SMMLAR: T2FourRegSMMLA<0b101, 0b0001, "smmlar", []>;
+def t2SMMLAR: T2FourRegSMMLA<0b101, 0b0001, "smmlar",
+ [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
def t2SMMLS: T2FourRegSMMLA<0b110, 0b0000, "smmls", []>;
-def t2SMMLSR: T2FourRegSMMLA<0b110, 0b0001, "smmlsr", []>;
+def t2SMMLSR: T2FourRegSMMLA<0b110, 0b0001, "smmlsr",
+ [(set rGPR:$Rd, (ARMsmmlsr rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
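The new patterns let instruction selection form the rounded most-significant-word multiplies directly from the ARMsmmlar/ARMsmmlsr nodes, with SMMULR being SMMLAR with a zero accumulator, as the (i32 0) operand above shows. A hedged C++ sketch of the architectural semantics, paraphrased from the ARM ARM rather than taken from this patch:

    #include <cassert>
    #include <cstdint>

    // SMMLAR: top word of Ra:0 + Rn*Rm, with 0x80000000 added for rounding.
    // SMMLSR: top word of Ra:0 - Rn*Rm, with the same rounding term.
    // Unsigned 64-bit arithmetic models the wrap-around behaviour.
    static int32_t smmlar(int32_t rn, int32_t rm, int32_t ra) {
      uint64_t acc = ((uint64_t)(uint32_t)ra << 32) +
                     (uint64_t)((int64_t)rn * (int64_t)rm) + 0x80000000ULL;
      return (int32_t)(uint32_t)(acc >> 32);
    }

    static int32_t smmlsr(int32_t rn, int32_t rm, int32_t ra) {
      uint64_t acc = ((uint64_t)(uint32_t)ra << 32) -
                     (uint64_t)((int64_t)rn * (int64_t)rm) + 0x80000000ULL;
      return (int32_t)(uint32_t)(acc >> 32);
    }

    int main() {
      assert(smmlar(1 << 16, 1 << 16, 0) == 1); // SMMULR case: high word of 2^32, rounded
      assert(smmlsr(1 << 16, 1 << 16, 1) == 0); // (2^32 - 2^32 + 2^31) >> 32 = 0
      return 0;
    }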
class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
list<dag> pattern>
@@ -3193,6 +3215,12 @@ def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
let Inst{31-4} = 0xf3bf8f6;
let Inst{3-0} = opt;
}
+
+let hasNoSchedulingInfo = 1 in
+def t2TSB : T2I<(outs), (ins tsb_opt:$opt), NoItinerary,
+ "tsb", "\t$opt", []>, Requires<[IsThumb, HasV8_4a]> {
+ let Inst{31-0} = 0xf3af8012;
+}
}
class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
@@ -3696,6 +3724,8 @@ def : t2InstAlias<"esb$p.w", (t2HINT 16, pred:$p), 1> {
def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> {
let Predicates = [IsThumb2, HasRAS];
}
+def : t2InstAlias<"csdb$p.w", (t2HINT 20, pred:$p), 0>;
+def : t2InstAlias<"csdb$p", (t2HINT 20, pred:$p), 1>;
def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt",
[(int_arm_dbg imm0_15:$opt)]> {
@@ -4713,12 +4743,24 @@ def : t2InstSubst<"bic${s}${p} $Rd, $Rn, $imm",
def : t2InstSubst<"bic${s}${p} $Rdn, $imm",
(t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
+def : t2InstSubst<"bic${s}${p}.w $Rd, $Rn, $imm",
+ (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"bic${s}${p}.w $Rdn, $imm",
+ (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
def : t2InstSubst<"and${s}${p} $Rd, $Rn, $imm",
(t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : t2InstSubst<"and${s}${p} $Rdn, $imm",
(t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
+def : t2InstSubst<"and${s}${p}.w $Rd, $Rn, $imm",
+ (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"and${s}${p}.w $Rdn, $imm",
+ (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
// And ORR <--> ORN
def : t2InstSubst<"orn${s}${p} $Rd, $Rn, $imm",
(t2ORRri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
index 22e157a7480b..2f14b78c91fd 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -17,11 +17,19 @@ def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
SDTCisVT<2, f64>]>;
+def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
+
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
+def arm_vmovsr : SDNode<"ARMISD::VMOVSR", SDT_VMOVSR>;
+
+def SDT_VMOVhr : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, i32>] >;
+def SDT_VMOVrh : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisFP<1>] >;
+def arm_vmovhr : SDNode<"ARMISD::VMOVhr", SDT_VMOVhr>;
+def arm_vmovrh : SDNode<"ARMISD::VMOVrh", SDT_VMOVrh>;
//===----------------------------------------------------------------------===//
// Operand Definitions.
@@ -39,7 +47,7 @@ def vfp_f16imm : Operand<f16>,
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = ARM_AM::getFP16Imm(InVal);
- return CurDAG->getTargetConstant(enc, MVT::i32);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let PrintMethod = "printFPImmOperand";
let ParserMatchClass = FPImmOperand;
@@ -69,10 +77,19 @@ def vfp_f64imm : Operand<f64>,
let ParserMatchClass = FPImmOperand;
}
+def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+
def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 4;
}]>;
+def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +130,9 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
let D = VFPNeonDomain;
}
-def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
- []>,
+ [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
Requires<[HasFullFP16]>;
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +149,9 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
let D = VFPNeonDomain;
}
-def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
- []>,
+ [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
Requires<[HasFullFP16]>;
//===----------------------------------------------------------------------===//
@@ -335,9 +352,9 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -360,9 +377,9 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -381,9 +398,9 @@ def VDIVS : ASbI<0b11101, 0b00, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -406,9 +423,9 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
@@ -428,18 +445,18 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
}
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
Uses = [CPSR], AddedComplexity = 4 in {
def H : AHbInp<0b11100, opc, 0,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
- []>,
+ [(set HPR:$Sd, (ARMcmov HPR:$Sm, HPR:$Sn, CC))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11100, opc, 0,
@@ -465,9 +482,9 @@ defm VSELVS : vsel_inst<"vs", 0b01, 6>;
multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
def H : AHbInp<0b11101, 0b00, opc,
- (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
- []>,
+ [(set HPR:$Sd, (SD HPR:$Sn, HPR:$Sm))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11101, 0b00, opc,
@@ -511,9 +528,9 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
}
def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
- (outs), (ins SPR:$Sd, SPR:$Sm),
+ (outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
- []>;
+ [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>;
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@@ -530,9 +547,9 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
}
def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
- (outs), (ins SPR:$Sd, SPR:$Sm),
+ (outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
- []>;
+ [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -580,9 +597,9 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
}
def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
- (outs), (ins SPR:$Sd),
+ (outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
- []> {
+ [(arm_cmpfp0 HPR:$Sd, (i32 1))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -608,9 +625,9 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
}
def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
- (outs), (ins SPR:$Sd),
+ (outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
- []> {
+ [(arm_cmpfp0 HPR:$Sd, (i32 0))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -658,20 +675,29 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
let Predicates = [HasVFP2, HasDPVFP];
}
-// Between half, single and double-precision. For disassembly only.
-
+// Between half, single and double-precision.
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FullFP16Pat<(f32 (fpextend HPR:$Sm)),
+ (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+def : FP16Pat<(f16_to_fp GPR:$a),
+ (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FullFP16Pat<(f16 (fpround SPR:$Sm)),
+ (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>;
+def : FP16Pat<(fp_to_f16 SPR:$a),
+ (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
@@ -687,7 +713,8 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
- []>, Requires<[HasFPARMv8, HasDPVFP]>,
+ [/* Intentionally left blank, see patterns below */]>,
+ Requires<[HasFPARMv8, HasDPVFP]>,
Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Sm;
@@ -697,10 +724,16 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
let Inst{5} = Sm{0};
}
+def : FullFP16Pat<(f64 (fpextend HPR:$Sm)),
+ (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
+ (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
(outs SPR:$Sd), (ins DPR:$Dm),
NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
- []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ [/* Intentionally left blank, see patterns below */]>,
+ Requires<[HasFPARMv8, HasDPVFP]> {
// Instruction operands.
bits<5> Sd;
bits<5> Dm;
@@ -712,6 +745,11 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
let Inst{22} = Sd{0};
}
+def : FullFP16Pat<(f16 (fpround DPR:$Dm)),
+ (COPY_TO_REGCLASS (VCVTBDH DPR:$Dm), HPR)>;
+def : FP16Pat<(fp_to_f16 (f64 DPR:$a)),
+ (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
+
def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
@@ -739,23 +777,11 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
let Inst{5} = Dm{4};
}
-def : Pat<(fp_to_f16 SPR:$a),
- (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
-
-def : Pat<(fp_to_f16 (f64 DPR:$a)),
- (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
-
-def : Pat<(f16_to_fp GPR:$a),
- (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-
-def : Pat<(f64 (f16_to_fp GPR:$a)),
- (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
-
multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@@ -763,7 +789,7 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}
def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@@ -818,6 +844,17 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}
let Predicates = [HasFPARMv8] in {
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (fp_to_sint (node HPR:$a))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"SH") HPR:$a),
+ GPR)>;
+
+ def : Pat<(i32 (fp_to_uint (node HPR:$a))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"UH") HPR:$a),
+ GPR)>;
+ }
def : Pat<(i32 (fp_to_sint (node SPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SS") SPR:$a),
@@ -859,9 +896,9 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
}
def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
- []>;
+ [(set HPR:$Sd, (fneg HPR:$Sm))]>;
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
@@ -940,7 +977,7 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm,
}
defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>;
-defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01, int_arm_neon_vrintn>;
defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
@@ -962,6 +999,7 @@ def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
[]>;
let hasSideEffects = 0 in {
+let isMoveReg = 1 in {
def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
@@ -969,6 +1007,7 @@ def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+} // isMoveReg
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
def VMOVH : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
@@ -987,6 +1026,7 @@ def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
// FP <-> GPR Copies. Int <-> FP Conversions.
//
+let isMoveReg = 1 in {
def VMOVRS : AVConv2I<0b11100001, 0b1010,
(outs GPR:$Rt), (ins SPR:$Sn),
IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
@@ -1032,6 +1072,8 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010,
// pipelines.
let D = VFPNeonDomain;
}
+} // isMoveReg
+def : Pat<(arm_vmovsr GPR:$Rt), (VMOVSR GPR:$Rt)>, Requires<[HasVFP2, UseVMOVSR]>;
let hasSideEffects = 0 in {
def VMOVRRD : AVConv3I<0b11000101, 0b1011,
@@ -1160,9 +1202,9 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
// Move H->R, clearing top 16 bits
def VMOVRH : AVConv2I<0b11100001, 0b1001,
- (outs GPR:$Rt), (ins SPR:$Sn),
+ (outs GPR:$Rt), (ins HPR:$Sn),
IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
- []>,
+ [(set GPR:$Rt, (arm_vmovrh HPR:$Sn))]>,
Requires<[HasFullFP16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1180,9 +1222,9 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001,
// Move R->H, clearing top 16 bits
def VMOVHR : AVConv4I<0b11100000, 0b1001,
- (outs SPR:$Sn), (ins GPR:$Rt),
+ (outs HPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
- []>,
+ [(set HPR:$Sn, (arm_vmovhr GPR:$Rt))]>,
Requires<[HasFullFP16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1297,13 +1339,16 @@ def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}
+def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
+ (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
@@ -1339,13 +1384,16 @@ def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}
+def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)),
+ (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
// FP -> Int:
class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1440,13 +1488,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
+def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)),
+ (COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>;
+
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
@@ -1483,13 +1534,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
+def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)),
+ (COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>;
+
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
@@ -1773,9 +1827,10 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
}
def VMLAH : AHbI<0b11100, 0b00, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1785,6 +1840,10 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+
def VMLSD : ADbI<0b11100, 0b00, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1809,9 +1868,10 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
}
def VMLSH : AHbI<0b11100, 0b00, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1821,6 +1881,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1845,9 +1908,10 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
}
def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1858,6 +1922,9 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
+ (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
// (-dst - (a * b)) -> -(dst + (a * b))
def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
@@ -1866,6 +1933,9 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
+ (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1889,9 +1959,9 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
}
def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@@ -1901,6 +1971,9 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
+ (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
//===----------------------------------------------------------------------===//
// Fused FP Multiply-Accumulate Operations.
@@ -1927,9 +2000,10 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
}
def VFMAH : AHbI<0b11101, 0b10, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -1940,6 +2014,9 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VFMAH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
// (fma x, y, z) -> (vfms z, x, y)
@@ -1972,9 +2049,10 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
}
def VFMSH : AHbI<0b11101, 0b10, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -1985,6 +2063,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
+ (VFMSH HPR:$dstin, HPR:$a, HPR:$b)>,
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
// (fma (fneg x), y, z) -> (vfms z, x, y)
@@ -2024,9 +2105,10 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
}
def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
+ HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2075,9 +2157,9 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
}
def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
- (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
- []>,
+ [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2269,10 +2351,11 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
let Inst{3-0} = imm{3-0};
}
-def FCONSTH : VFPAI<(outs SPR:$Sd), (ins vfp_f16imm:$imm),
+def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm),
VFPMiscFrm, IIC_fpUNA16,
"vmov", ".f16\t$Sd, $imm",
- []>, Requires<[HasFullFP16]> {
+ [(set HPR:$Sd, vfp_f16imm:$imm)]>,
+ Requires<[HasFullFP16]> {
bits<5> Sd;
bits<8> imm;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index b0fd0b476920..6692a4d41420 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -117,39 +117,47 @@ ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM,
{
}
-static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
- MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
- return true;
-
- const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI);
- (void)RegBank;
+static const TargetRegisterClass *guessRegClass(unsigned Reg,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ const RegisterBank *RegBank = RBI.getRegBank(Reg, MRI, TRI);
assert(RegBank && "Can't get reg bank for virtual register");
- const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ const unsigned Size = MRI.getType(Reg).getSizeInBits();
assert((RegBank->getID() == ARM::GPRRegBankID ||
RegBank->getID() == ARM::FPRRegBankID) &&
"Unsupported reg bank");
- const TargetRegisterClass *RC = &ARM::GPRRegClass;
-
if (RegBank->getID() == ARM::FPRRegBankID) {
- if (DstSize == 32)
- RC = &ARM::SPRRegClass;
- else if (DstSize == 64)
- RC = &ARM::DPRRegClass;
+ if (Size == 32)
+ return &ARM::SPRRegClass;
+ else if (Size == 64)
+ return &ARM::DPRRegClass;
+ else if (Size == 128)
+ return &ARM::QPRRegClass;
else
llvm_unreachable("Unsupported destination size");
}
+ return &ARM::GPRRegClass;
+}
+
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ unsigned DstReg = I.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ return true;
+
+ const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
+
// No need to constrain SrcReg. It will get constrained when
// we hit another of its uses or its defs.
// Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
return true;
@@ -393,12 +401,12 @@ bool ARMInstructionSelector::validReg(MachineRegisterInfo &MRI, unsigned Reg,
unsigned ExpectedSize,
unsigned ExpectedRegBankID) const {
if (MRI.getType(Reg).getSizeInBits() != ExpectedSize) {
- DEBUG(dbgs() << "Unexpected size for register");
+ LLVM_DEBUG(dbgs() << "Unexpected size for register");
return false;
}
if (RBI.getRegBank(Reg, MRI, TRI)->getID() != ExpectedRegBankID) {
- DEBUG(dbgs() << "Unexpected register bank for register");
+ LLVM_DEBUG(dbgs() << "Unexpected register bank for register");
return false;
}
@@ -490,13 +498,13 @@ bool ARMInstructionSelector::insertComparison(CmpConstants Helper, InsertInfo I,
bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
MachineRegisterInfo &MRI) const {
if ((STI.isROPI() || STI.isRWPI()) && !STI.isTargetELF()) {
- DEBUG(dbgs() << "ROPI and RWPI only supported for ELF\n");
+ LLVM_DEBUG(dbgs() << "ROPI and RWPI only supported for ELF\n");
return false;
}
auto GV = MIB->getOperand(1).getGlobal();
if (GV->isThreadLocal()) {
- DEBUG(dbgs() << "TLS variables not supported yet\n");
+ LLVM_DEBUG(dbgs() << "TLS variables not supported yet\n");
return false;
}
@@ -505,7 +513,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
bool UseMovt = STI.useMovt(MF);
- unsigned Size = TM.getPointerSize();
+ unsigned Size = TM.getPointerSize(0);
unsigned Alignment = 4;
auto addOpsForConstantPoolLoad = [&MF, Alignment,
@@ -548,7 +556,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
if (Indirect)
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad,
- TM.getPointerSize(), Alignment));
+ TM.getProgramPointerSize(), Alignment));
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
@@ -601,7 +609,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
else
MIB->setDesc(TII.get(ARM::LDRLIT_ga_abs));
} else {
- DEBUG(dbgs() << "Object format not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Object format not supported yet\n");
return false;
}
@@ -670,14 +678,6 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
using namespace TargetOpcode;
- if (I.getOpcode() == G_CONSTANT) {
- // Pointer constants should be treated the same as 32-bit integer constants.
- // Change the type and let TableGen handle it.
- unsigned ResultReg = I.getOperand(0).getReg();
- LLT Ty = MRI.getType(ResultReg);
- if (Ty.isPointer())
- MRI.setType(ResultReg, LLT::scalar(32));
- }
if (selectImpl(I, CoverageInfo))
return true;
@@ -693,7 +693,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
// FIXME: Smaller destination sizes coming soon!
if (DstTy.getSizeInBits() != 32) {
- DEBUG(dbgs() << "Unsupported destination size for extension");
+ LLVM_DEBUG(dbgs() << "Unsupported destination size for extension");
return false;
}
@@ -735,7 +735,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
break;
}
default:
- DEBUG(dbgs() << "Unsupported source size for extension");
+ LLVM_DEBUG(dbgs() << "Unsupported source size for extension");
return false;
}
break;
@@ -776,18 +776,45 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
if (SrcRegBank.getID() != DstRegBank.getID()) {
- DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
+ LLVM_DEBUG(
+ dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
return false;
}
if (SrcRegBank.getID() != ARM::GPRRegBankID) {
- DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
+ LLVM_DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
return false;
}
I.setDesc(TII.get(COPY));
return selectCopy(I, TII, MRI, TRI, RBI);
}
+ case G_CONSTANT: {
+ if (!MRI.getType(I.getOperand(0).getReg()).isPointer()) {
+ // Non-pointer constants should be handled by TableGen.
+ LLVM_DEBUG(dbgs() << "Unsupported constant type\n");
+ return false;
+ }
+
+ auto &Val = I.getOperand(1);
+ if (Val.isCImm()) {
+ if (!Val.getCImm()->isZero()) {
+ LLVM_DEBUG(dbgs() << "Unsupported pointer constant value\n");
+ return false;
+ }
+ Val.ChangeToImmediate(0);
+ } else {
+ assert(Val.isImm() && "Unexpected operand for G_CONSTANT");
+ if (Val.getImm() != 0) {
+ LLVM_DEBUG(dbgs() << "Unsupported pointer constant value\n");
+ return false;
+ }
+ }
+
+ I.setDesc(TII.get(ARM::MOVi));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
+ break;
+ }
case G_INTTOPTR:
case G_PTRTOINT: {
auto SrcReg = I.getOperand(1).getReg();
@@ -797,13 +824,15 @@ bool ARMInstructionSelector::select(MachineInstr &I,
const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
if (SrcRegBank.getID() != DstRegBank.getID()) {
- DEBUG(dbgs()
- << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n");
return false;
}
if (SrcRegBank.getID() != ARM::GPRRegBankID) {
- DEBUG(dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n");
+ LLVM_DEBUG(
+ dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n");
return false;
}
@@ -824,11 +853,11 @@ bool ARMInstructionSelector::select(MachineInstr &I,
unsigned Size = MRI.getType(OpReg).getSizeInBits();
if (Size == 64 && STI.isFPOnlySP()) {
- DEBUG(dbgs() << "Subtarget only supports single precision");
+ LLVM_DEBUG(dbgs() << "Subtarget only supports single precision");
return false;
}
if (Size != 32 && Size != 64) {
- DEBUG(dbgs() << "Unsupported size for G_FCMP operand");
+ LLVM_DEBUG(dbgs() << "Unsupported size for G_FCMP operand");
return false;
}
@@ -859,7 +888,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_LOAD: {
const auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
@@ -896,7 +925,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
case G_BRCOND: {
if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) {
- DEBUG(dbgs() << "Unsupported condition register for G_BRCOND");
+ LLVM_DEBUG(dbgs() << "Unsupported condition register for G_BRCOND");
return false;
}
@@ -917,6 +946,17 @@ bool ARMInstructionSelector::select(MachineInstr &I,
I.eraseFromParent();
return true;
}
+ case G_PHI: {
+ I.setDesc(TII.get(PHI));
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ break;
+ }
+
+ return true;
+ }
default:
return false;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index 8cff1f0869d0..891418306903 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Type.h"
using namespace llvm;
+using namespace LegalizeActions;
/// FIXME: The following static functions are SizeChangeStrategy functions
/// that are meant to temporarily mimic the behaviour of the old legalization
@@ -40,7 +41,7 @@ addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
result.push_back(v[i]);
if (i + 1 < v[i].first && i + 1 < v.size() &&
v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ result.push_back({v[i].first + 1, Unsupported});
}
}
@@ -48,27 +49,14 @@ static LegalizerInfo::SizeAndActionsVec
widen_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
assert(v.size() >= 1);
assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+ LegalizerInfo::SizeAndActionsVec result = {{1, Unsupported},
+ {8, WidenScalar},
+ {9, Unsupported},
+ {16, WidenScalar},
+ {17, Unsupported}};
addAndInterleaveWithUnsupported(result, v);
auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
- return result;
-}
-
-static LegalizerInfo::SizeAndActionsVec
-widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegalizerInfo::SizeAndActionsVec result = {
- {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
- {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
- {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ result.push_back({Largest + 1, Unsupported});
return result;
}
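Each SizeAndActionsVec entry names the action that applies from that bit width up to (but not including) the width of the next entry, so the table built above reads: widths 1-7 unsupported, 8 widened, 9-15 unsupported, 16 widened, then unsupported until the legal sizes appended from v. A hedged C++ sketch of that lookup rule (illustrative only, assuming v were {{32, Legal}}; not the LegalizerInfo implementation):

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    // Each pair is (first bit width it applies to, action); the action stays
    // in effect until the width of the next entry.
    using SizeAndActionsVec = std::vector<std::pair<unsigned, std::string>>;

    static std::string actionFor(const SizeAndActionsVec &v, unsigned size) {
      std::string act = "Unsupported";
      for (const auto &e : v)
        if (size >= e.first)
          act = e.second;
      return act;
    }

    int main() {
      SizeAndActionsVec v = {{1, "Unsupported"}, {8, "WidenScalar"},
                             {9, "Unsupported"}, {16, "WidenScalar"},
                             {17, "Unsupported"}, {32, "Legal"},
                             {33, "Unsupported"}};
      assert(actionFor(v, 7) == "Unsupported");
      assert(actionFor(v, 8) == "WidenScalar");
      assert(actionFor(v, 16) == "WidenScalar");
      assert(actionFor(v, 32) == "Legal");
      return 0;
    }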
@@ -87,30 +75,21 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- setAction({G_GLOBAL_VALUE, p0}, Legal);
- setAction({G_FRAME_INDEX, p0}, Legal);
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
- for (unsigned Op : {G_LOAD, G_STORE}) {
- for (auto Ty : {s1, s8, s16, s32, p0})
- setAction({Op, Ty}, Legal);
- setAction({Op, 1, p0}, Legal);
- }
-
- for (unsigned Op : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
- if (Op != G_ADD)
- setLegalizeScalarToDifferentSizeStrategy(
- Op, 0, widenToLargerTypesUnsupportedOtherwise);
- setAction({Op, s32}, Legal);
- }
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ .legalFor({s32})
+ .minScalar(0, s32);
- for (unsigned Op : {G_SDIV, G_UDIV}) {
- setLegalizeScalarToDifferentSizeStrategy(Op, 0,
- widenToLargerTypesUnsupportedOtherwise);
- if (ST.hasDivideInARMMode())
- setAction({Op, s32}, Legal);
- else
- setAction({Op, s32}, Libcall);
- }
+ if (ST.hasDivideInARMMode())
+ getActionDefinitionsBuilder({G_SDIV, G_UDIV})
+ .legalFor({s32})
+ .clampScalar(0, s32, s32);
+ else
+ getActionDefinitionsBuilder({G_SDIV, G_UDIV})
+ .libcallFor({s32})
+ .clampScalar(0, s32, s32);
for (unsigned Op : {G_SREM, G_UREM}) {
setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
@@ -122,74 +101,96 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, s32}, Libcall);
}
- for (unsigned Op : {G_SEXT, G_ZEXT, G_ANYEXT}) {
- setAction({Op, s32}, Legal);
- }
+ getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
+ .legalForCartesianProduct({s32}, {s1, s8, s16});
+
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
+ getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}});
- setAction({G_INTTOPTR, p0}, Legal);
- setAction({G_INTTOPTR, 1, s32}, Legal);
+ getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32});
- setAction({G_PTRTOINT, s32}, Legal);
- setAction({G_PTRTOINT, 1, p0}, Legal);
+ getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}});
- for (unsigned Op : {G_ASHR, G_LSHR, G_SHL})
- setAction({Op, s32}, Legal);
+ getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0},
+ {s1});
- setAction({G_GEP, p0}, Legal);
- setAction({G_GEP, 1, s32}, Legal);
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({s1});
- setAction({G_SELECT, s32}, Legal);
- setAction({G_SELECT, p0}, Legal);
- setAction({G_SELECT, 1, s1}, Legal);
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({s32, p0})
+ .clampScalar(0, s32, s32);
- setAction({G_BRCOND, s1}, Legal);
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s1}, {s32, p0})
+ .minScalar(1, s32);
- setAction({G_CONSTANT, s32}, Legal);
- setAction({G_CONSTANT, p0}, Legal);
- setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
+ // We're keeping these builders around because we'll want to add support for
+ // floating point to them.
+ auto &LoadStoreBuilder =
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalForCartesianProduct({s1, s8, s16, s32, p0}, {p0});
- setAction({G_ICMP, s1}, Legal);
- setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1,
- widenToLargerTypesUnsupportedOtherwise);
- for (auto Ty : {s32, p0})
- setAction({G_ICMP, 1, Ty}, Legal);
+ auto &PhiBuilder =
+ getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32);
if (!ST.useSoftFloat() && ST.hasVFP2()) {
- for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Legal);
+ getActionDefinitionsBuilder(
+ {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FCONSTANT, G_FNEG})
+ .legalFor({s32, s64});
+
+ LoadStoreBuilder.legalFor({{s64, p0}});
+ PhiBuilder.legalFor({s64});
+
+ getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct({s1},
+ {s32, s64});
- setAction({G_LOAD, s64}, Legal);
- setAction({G_STORE, s64}, Legal);
+ getActionDefinitionsBuilder(G_MERGE_VALUES).legalFor({{s64, s32}});
+ getActionDefinitionsBuilder(G_UNMERGE_VALUES).legalFor({{s32, s64}});
- setAction({G_FCMP, s1}, Legal);
- setAction({G_FCMP, 1, s32}, Legal);
- setAction({G_FCMP, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_FPEXT).legalFor({{s64, s32}});
+ getActionDefinitionsBuilder(G_FPTRUNC).legalFor({{s32, s64}});
- setAction({G_MERGE_VALUES, s64}, Legal);
- setAction({G_MERGE_VALUES, 1, s32}, Legal);
- setAction({G_UNMERGE_VALUES, s32}, Legal);
- setAction({G_UNMERGE_VALUES, 1, s64}, Legal);
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalForCartesianProduct({s32}, {s32, s64});
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalForCartesianProduct({s32, s64}, {s32});
} else {
- for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
- for (auto Ty : {s32, s64})
- setAction({BinOp, Ty}, Libcall);
+ getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ .libcallFor({s32, s64});
+
+ LoadStoreBuilder.maxScalar(0, s32);
+
+ for (auto Ty : {s32, s64})
+ setAction({G_FNEG, Ty}, Lower);
- setAction({G_FCMP, s1}, Legal);
- setAction({G_FCMP, 1, s32}, Custom);
- setAction({G_FCMP, 1, s64}, Custom);
+ getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64});
+
+ getActionDefinitionsBuilder(G_FCMP).customForCartesianProduct({s1},
+ {s32, s64});
if (AEABI(ST))
setFCmpLibcallsAEABI();
else
setFCmpLibcallsGNU();
+
+ getActionDefinitionsBuilder(G_FPEXT).libcallFor({{s64, s32}});
+ getActionDefinitionsBuilder(G_FPTRUNC).libcallFor({{s32, s64}});
+
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .libcallForCartesianProduct({s32}, {s32, s64});
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .libcallForCartesianProduct({s32, s64}, {s32});
}
- for (unsigned Op : {G_FREM, G_FPOW})
- for (auto Ty : {s32, s64})
- setAction({Op, Ty}, Libcall);
+ if (!ST.useSoftFloat() && ST.hasVFP4())
+ getActionDefinitionsBuilder(G_FMA).legalFor({s32, s64});
+ else
+ getActionDefinitionsBuilder(G_FMA).libcallFor({s32, s64});
+
+ getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
computeTables();
+ verify(*ST.getInstrInfo());
}
void ARMLegalizerInfo::setFCmpLibcallsAEABI() {
@@ -305,6 +306,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
using namespace TargetOpcode;
MIRBuilder.setInstr(MI);
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
switch (MI.getOpcode()) {
default:
@@ -321,7 +323,6 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
// Our divmod libcalls return a struct containing the quotient and the
// remainder. We need to create a virtual register for it.
- auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
Type *ArgTy = Type::getInt32Ty(Ctx);
StructType *RetTy = StructType::get(Ctx, {ArgTy, ArgTy}, /* Packed */ true);
auto RetVal = MRI.createGenericVirtualRegister(
@@ -362,7 +363,6 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
return true;
}
- auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
assert((OpSize == 32 || OpSize == 64) && "Unsupported operand size");
auto *ArgTy = OpSize == 32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx);
auto *RetTy = Type::getInt32Ty(Ctx);
@@ -407,6 +407,14 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
}
break;
}
+ case G_FCONSTANT: {
+ // Convert to integer constants, while preserving the binary representation.
+ auto AsInteger =
+ MI.getOperand(1).getFPImm()->getValueAPF().bitcastToAPInt();
+ MIRBuilder.buildConstant(MI.getOperand(0).getReg(),
+ *ConstantInt::get(Ctx, AsInteger));
+ break;
+ }
}
MI.eraseFromParent();
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 8b3a2e223796..901138dbdfd5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1198,7 +1198,7 @@ findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
// Skip debug values.
MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
- while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+ while (PrevMBBI->isDebugInstr() && PrevMBBI != BeginMBBI)
--PrevMBBI;
Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
@@ -1214,7 +1214,7 @@ findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
MachineBasicBlock::iterator EndMBBI = MBB.end();
MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
// Skip debug values.
- while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr())
++NextMBBI;
if (NextMBBI == EndMBBI)
return EndMBBI;
@@ -1807,7 +1807,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
MBBI = I;
--Position;
// Fallthrough to look into existing chain.
- } else if (MBBI->isDebugValue()) {
+ } else if (MBBI->isDebugInstr()) {
continue;
} else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
MBBI->getOpcode() == ARM::t2STRDi8) {
@@ -1834,7 +1834,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
return M0->InsertPos < M1->InsertPos;
};
- std::sort(Candidates.begin(), Candidates.end(), LessThan);
+ llvm::sort(Candidates.begin(), Candidates.end(), LessThan);
// Go through list of candidates and merge.
bool Changed = false;
@@ -1891,8 +1891,8 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
MBBI->getOpcode() == ARM::tBX_RET ||
MBBI->getOpcode() == ARM::MOVPCLR)) {
MachineBasicBlock::iterator PrevI = std::prev(MBBI);
- // Ignore any DBG_VALUE instructions.
- while (PrevI->isDebugValue() && PrevI != MBB.begin())
+ // Ignore any debug instructions.
+ while (PrevI->isDebugInstr() && PrevI != MBB.begin())
--PrevI;
MachineInstr &PrevMI = *PrevI;
unsigned Opcode = PrevMI.getOpcode();
@@ -2063,7 +2063,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
// Are there stores / loads / calls between them?
SmallSet<unsigned, 4> AddedRegPressure;
while (++I != E) {
- if (I->isDebugValue() || MemOps.count(&*I))
+ if (I->isDebugInstr() || MemOps.count(&*I))
continue;
if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
return false;
@@ -2172,13 +2172,13 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
bool RetVal = false;
// Sort by offset (in reverse order).
- std::sort(Ops.begin(), Ops.end(),
- [](const MachineInstr *LHS, const MachineInstr *RHS) {
- int LOffset = getMemoryOpOffset(*LHS);
- int ROffset = getMemoryOpOffset(*RHS);
- assert(LHS == RHS || LOffset != ROffset);
- return LOffset > ROffset;
- });
+ llvm::sort(Ops.begin(), Ops.end(),
+ [](const MachineInstr *LHS, const MachineInstr *RHS) {
+ int LOffset = getMemoryOpOffset(*LHS);
+ int ROffset = getMemoryOpOffset(*RHS);
+ assert(LHS == RHS || LOffset != ROffset);
+ return LOffset > ROffset;
+ });
// The loads / stores of the same base are in order. Scan them from first to
// last and check for the following:
@@ -2253,7 +2253,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
// This is the new location for the loads / stores.
MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
while (InsertPos != MBB->end() &&
- (MemOps.count(&*InsertPos) || InsertPos->isDebugValue()))
+ (MemOps.count(&*InsertPos) || InsertPos->isDebugInstr()))
++InsertPos;
// If we are moving a pair of loads / stores, see if it makes sense
@@ -2291,7 +2291,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
- DEBUG(dbgs() << "Formed " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumLDRDFormed;
} else {
MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
@@ -2305,7 +2305,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
- DEBUG(dbgs() << "Formed " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumSTRDFormed;
}
MBB->erase(Op0);
@@ -2355,7 +2355,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
break;
}
- if (!MI.isDebugValue())
+ if (!MI.isDebugInstr())
MI2LocMap[&MI] = ++Loc;
if (!isMemoryOp(MI))
diff --git a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
index 5c9aad417ceb..d11fe9d5c502 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
@@ -19,7 +19,48 @@
namespace llvm {
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+// Fuse AES crypto encoding or decoding.
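+// For example, AESE (single-round encryption) followed by AESMC (mix
+// columns), or AESD followed by AESIMC, executes faster back to back on
+// cores with the fuse-aes feature, so the scheduler keeps such pairs adjacent.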
+static bool isAESPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ switch(SecondOpcode) {
+ // AES encode.
+ case ARM::AESMC :
+ return FirstOpcode == ARM::AESE ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ // AES decode.
+ case ARM::AESIMC:
+ return FirstOpcode == ARM::AESD ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ }
+
+ return false;
+}
+
+// Fuse literal generation.
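+// A 32-bit immediate is typically materialized as MOVW (MOVi16, low halfword)
+// immediately followed by MOVT (MOVTi16, high halfword); cores with the
+// fuse-literals feature execute that pair faster when kept back to back.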
+static bool isLiteralsPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ // 32 bit immediate.
+ if ((FirstOpcode == ARM::INSTRUCTION_LIST_END ||
+ FirstOpcode == ARM::MOVi16) &&
+ SecondOpcode == ARM::MOVTi16)
+ return true;
+
+ return false;
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
@@ -28,24 +69,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
const MachineInstr &SecondMI) {
const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(TSI);
- // Assume wildcards for unspecified instrs.
- unsigned FirstOpcode =
- FirstMI ? FirstMI->getOpcode()
- : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = SecondMI.getOpcode();
-
- if (ST.hasFuseAES())
- // Fuse AES crypto operations.
- switch(SecondOpcode) {
- // AES encode.
- case ARM::AESMC :
- return FirstOpcode == ARM::AESE ||
- FirstOpcode == ARM::INSTRUCTION_LIST_END;
- // AES decode.
- case ARM::AESIMC:
- return FirstOpcode == ARM::AESD ||
- FirstOpcode == ARM::INSTRUCTION_LIST_END;
- }
+ if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
+ return true;
return false;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp
new file mode 100644
index 000000000000..9d5478b76c18
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -0,0 +1,672 @@
+//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
+/// purpose of this pass is to do some IR pattern matching to create ACLE
+/// DSP intrinsics, which map onto these 32-bit SIMD operations.
+/// This pass runs only when unaligned accesses are supported/enabled.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "ARM.h"
+#include "ARMSubtarget.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "arm-parallel-dsp"
+
+STATISTIC(NumSMLAD, "Number of smlad instructions generated");
+
+namespace {
+ struct OpChain;
+ struct BinOpChain;
+ struct Reduction;
+
+ using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
+ using ReductionList = SmallVector<Reduction, 8>;
+ using ValueList = SmallVector<Value*, 8>;
+ using MemInstList = SmallVector<Instruction*, 8>;
+ using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
+ using PMACPairList = SmallVector<PMACPair, 8>;
+ using Instructions = SmallVector<Instruction*,16>;
+ using MemLocList = SmallVector<MemoryLocation, 4>;
+
+ struct OpChain {
+ Instruction *Root;
+ ValueList AllValues;
+ MemInstList VecLd; // List of all load instructions.
+ MemLocList MemLocs; // All memory locations read by this tree.
+ bool ReadOnly = true;
+
+ OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
+ virtual ~OpChain() = default;
+
+ void SetMemoryLocations() {
+ const auto Size = MemoryLocation::UnknownSize;
+ for (auto *V : AllValues) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I->mayWriteToMemory())
+ ReadOnly = false;
+ if (auto *Ld = dyn_cast<LoadInst>(V))
+ MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+ }
+ }
+ }
+
+ unsigned size() const { return AllValues.size(); }
+ };
+
+ // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
+ // 'Reduction' contains the phi-node and accumulator statement from where we
+ // start pattern matching, and 'BinOpChain' the multiplication
+ // instructions that are candidates for parallel execution.
+ struct BinOpChain : public OpChain {
+ ValueList LHS; // List of all (narrow) left hand operands.
+ ValueList RHS; // List of all (narrow) right hand operands.
+
+ BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
+ OpChain(I, lhs), LHS(lhs), RHS(rhs) {
+ for (auto *V : RHS)
+ AllValues.push_back(V);
+ }
+ };
+
+ struct Reduction {
+ PHINode *Phi; // The Phi-node from where we start
+ // pattern matching.
+ Instruction *AccIntAdd; // The accumulating integer add statement,
+ // i.e, the reduction statement.
+
+ OpChainList MACCandidates; // The MAC candidates associated with
+ // this reduction statement.
+ Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
+ };
+
+ class ARMParallelDSP : public LoopPass {
+ ScalarEvolution *SE;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ Loop *L;
+ const DataLayout *DL;
+ Module *M;
+
+ bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
+ bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+ PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
+ Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+ Instruction *Acc, Instruction *InsertAfter);
+
+ /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
+ /// Dual performs two signed 16x16-bit multiplications. It adds the
+ /// products to a 32-bit accumulate operand. Optionally, the instruction can
+ /// exchange the halfwords of the second operand before performing the
+ /// arithmetic.
+ bool MatchSMLAD(Function &F);
+
+ public:
+ static char ID;
+
+ ARMParallelDSP() : LoopPass(ID) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ LoopPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
+ L = TheLoop;
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &TPC = getAnalysis<TargetPassConfig>();
+
+ BasicBlock *Header = TheLoop->getHeader();
+ if (!Header)
+ return false;
+
+ // TODO: We assume the loop header and latch to be the same block.
+ // This is not a fundamental restriction, but lifting this would just
+ // require more work to do the transformation and then patch up the CFG.
+ if (Header != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
+ "running pass ARMParallelDSP\n");
+ return false;
+ }
+
+ Function &F = *Header->getParent();
+ M = F.getParent();
+ DL = &M->getDataLayout();
+
+ auto &TM = TPC.getTM<TargetMachine>();
+ auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+
+ if (!ST->allowsUnalignedMem()) {
+ LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
+ "running pass ARMParallelDSP\n");
+ return false;
+ }
+
+ if (!ST->hasDSP()) {
+ LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
+ "ARMParallelDSP\n");
+ return false;
+ }
+
+ LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
+ bool Changes = false;
+
+ LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
+ Changes = MatchSMLAD(F);
+ return Changes;
+ }
+ };
+}
+
+// MaxBitWidth: the maximum supported bitwidth of the elements in the DSP
+// instructions, which is set to 16. So here we should collect all i8 and i16
+// narrow operations.
+// TODO: we currently only collect i16, and will support i8 later, so that's
+// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
+template<unsigned MaxBitWidth>
+static bool IsNarrowSequence(Value *V, ValueList &VL) {
+ LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
+ ConstantInt *CInt;
+
+ if (match(V, m_ConstantInt(CInt))) {
+ // TODO: if a constant is used, it needs to fit within the bit width.
+ return false;
+ }
+
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ Value *Val, *LHS, *RHS;
+ if (match(V, m_Trunc(m_Value(Val)))) {
+ if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
+ return IsNarrowSequence<MaxBitWidth>(Val, VL);
+ } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
+    // TODO: we need to implement sadd16/sadd8 for this, which would also
+    // enable the rewrite for smlad8.ll, but it is unsupported for now.
+ LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
+ return false;
+ } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
+ if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
+ LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
+ cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+ return false;
+ }
+
+ if (match(Val, m_Load(m_Value()))) {
+ LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
+ VL.push_back(Val);
+ VL.push_back(I);
+ return true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
+ return false;
+}
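+
+// For illustration (the IR names below are invented), IsNarrowSequence<16>
+// accepts a sign- or zero-extended 16-bit load such as:
+//
+//   %ld  = load i16, i16* %addr
+//   %ext = sext i16 %ld to i32
+//
+// and records both %ld and %ext in VL, so that later stages can try to pair
+// two such loads into a single 32-bit access.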
+
+// Element-by-element comparison of Value lists returning true if they are
+// instructions with the same opcode or constants with the same value.
+static bool AreSymmetrical(const ValueList &VL0,
+ const ValueList &VL1) {
+ if (VL0.size() != VL1.size()) {
+ LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
+ << VL0.size() << " != " << VL1.size() << "\n");
+ return false;
+ }
+
+ const unsigned Pairs = VL0.size();
+ LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
+
+ for (unsigned i = 0; i < Pairs; ++i) {
+ const Value *V0 = VL0[i];
+ const Value *V1 = VL1[i];
+ const auto *Inst0 = dyn_cast<Instruction>(V0);
+ const auto *Inst1 = dyn_cast<Instruction>(V1);
+
+ LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
+ dbgs() << "mul1: "; V0->dump();
+ dbgs() << "mul2: "; V1->dump());
+
+ if (!Inst0 || !Inst1)
+ return false;
+
+ if (Inst0->isSameOperationAs(Inst1)) {
+ LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+ continue;
+ }
+
+ const APInt *C0, *C1;
+ if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
+ return true;
+}
+
+template<typename MemInst>
+static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
+ MemInstList &VecMem, const DataLayout &DL,
+ ScalarEvolution &SE) {
+ if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
+ LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
+ return false;
+ }
+ if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
+ VecMem.push_back(MemOp0);
+ VecMem.push_back(MemOp1);
+ LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
+ return false;
+}
+
+bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
+ MemInstList &VecMem) {
+ if (!Ld0 || !Ld1)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
+ dbgs() << "Ld0:"; Ld0->dump();
+ dbgs() << "Ld1:"; Ld1->dump();
+ );
+
+ if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
+ return false;
+ }
+
+ return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
+}
+
+PMACPairList
+ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
+ const unsigned Elems = Candidates.size();
+ PMACPairList PMACPairs;
+
+ if (Elems < 2)
+ return PMACPairs;
+
+ // TODO: for now we simply try to match consecutive pairs i and i+1.
+ // We can compare all elements, but then we need to compare and evaluate
+ // different solutions.
+ for(unsigned i=0; i<Elems-1; i+=2) {
+ BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
+ BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1].get());
+ const Instruction *Mul0 = PMul0->Root;
+ const Instruction *Mul1 = PMul1->Root;
+
+ if (Mul0 == Mul1)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
+ dbgs() << "- "; Mul0->dump();
+ dbgs() << "- "; Mul1->dump());
+
+ const ValueList &Mul0_LHS = PMul0->LHS;
+ const ValueList &Mul0_RHS = PMul0->RHS;
+ const ValueList &Mul1_LHS = PMul1->LHS;
+ const ValueList &Mul1_RHS = PMul1->RHS;
+
+ if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
+ !AreSymmetrical(Mul0_RHS, Mul1_RHS))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
+ // The first elements of each vector should be loads with sexts. If we find
+    // that these are two pairs of consecutive loads, they can be transformed
+ // into two wider loads and the users can be replaced with DSP
+ // intrinsics.
+ for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
+ auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
+ auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
+ auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
+ auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
+
+ LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
+ dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
+ dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
+ dbgs() << "and operands " << x + 2 << ":\n";
+ dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
+ dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());
+
+ if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) &&
+ AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+ }
+ }
+ }
+ return PMACPairs;
+}
+
+bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
+ PMACPairList &PMACPairs) {
+ Instruction *Acc = Reduction.Phi;
+ Instruction *InsertAfter = Reduction.AccIntAdd;
+
+ for (auto &Pair : PMACPairs) {
+ LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
+ dbgs() << "- "; Pair.first->Root->dump();
+ dbgs() << "- "; Pair.second->Root->dump());
+ auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
+ auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
+ Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
+ InsertAfter = Acc;
+ }
+
+ if (Acc != Reduction.Phi) {
+ LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
+ Reduction.AccIntAdd->replaceAllUsesWith(Acc);
+ return true;
+ }
+ return false;
+}
+
+static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
+ ReductionList &Reductions) {
+ RecurrenceDescriptor RecDesc;
+ const bool HasFnNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+ const BasicBlock *Latch = TheLoop->getLoopLatch();
+
+ // We need a preheader as getIncomingValueForBlock assumes there is one.
+ if (!TheLoop->getLoopPreheader()) {
+ LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
+ return;
+ }
+
+ for (PHINode &Phi : Header->phis()) {
+ const auto *Ty = Phi.getType();
+ if (!Ty->isIntegerTy(32))
+ continue;
+
+ const bool IsReduction =
+ RecurrenceDescriptor::AddReductionVar(&Phi,
+ RecurrenceDescriptor::RK_IntegerAdd,
+ TheLoop, HasFnNoNaNAttr, RecDesc);
+ if (!IsReduction)
+ continue;
+
+ Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+ if (!Acc)
+ continue;
+
+ Reductions.push_back(Reduction(&Phi, Acc));
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "\nAccumulating integer additions (reductions) found:\n";
+ for (auto &R : Reductions) {
+ dbgs() << "- "; R.Phi->dump();
+ dbgs() << "-> "; R.AccIntAdd->dump();
+ }
+ );
+}
+
+static void AddMACCandidate(OpChainList &Candidates,
+ const Instruction *Acc,
+ Value *MulOp0, Value *MulOp1, int MulOpNum) {
+ Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
+ LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
+ ValueList LHS;
+ ValueList RHS;
+ if (IsNarrowSequence<16>(MulOp0, LHS) &&
+ IsNarrowSequence<16>(MulOp1, RHS)) {
+ LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
+ Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
+ }
+}
+
+static void MatchParallelMACSequences(Reduction &R,
+ OpChainList &Candidates) {
+ const Instruction *Acc = R.AccIntAdd;
+ Value *A, *MulOp0, *MulOp1;
+ LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
+
+ // Pattern 1: the accumulator is the RHS of the mul.
+ while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
+ m_Value(A)))){
+ AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+ Acc = dyn_cast<Instruction>(A);
+ }
+ // Pattern 2: the accumulator is the LHS of the mul.
+ while(match(Acc, m_Add(m_Value(A),
+ m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
+ AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1);
+ Acc = dyn_cast<Instruction>(A);
+ }
+
+ // The last mul in the chain has a slightly different pattern:
+ // the mul is the first operand
+ if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
+ AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+
+ // Because we start at the bottom of the chain, and we work our way up,
+ // the muls are added in reverse program order to the list.
+ std::reverse(Candidates.begin(), Candidates.end());
+}
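+
+// Illustrative shape of the chain walked above (value names are invented):
+// starting from the reduction statement
+//
+//   acc1 = add(mul1, add(mul0, acc0))
+//
+// each step peels off one mul operand and follows the other add operand, so
+// the muls are collected bottom-up and then reversed into program order.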
+
+// Collect all instructions in the block that may read or write memory; these
+// are the instructions that can potentially alias with the MAC operands.
+static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
+ Instructions &Writes) {
+ for (auto &I : *Header) {
+ if (I.mayReadFromMemory())
+ Reads.push_back(&I);
+ if (I.mayWriteToMemory())
+ Writes.push_back(&I);
+ }
+}
+
+// Check whether statements in the basic block that write to memory alias with
+// the memory locations accessed by the MAC-chains.
+// TODO: we need the read statements when we accept more complicated chains.
+static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
+ Instructions &Writes, OpChainList &MACCandidates) {
+ LLVM_DEBUG(dbgs() << "Alias checks:\n");
+ for (auto &MAC : MACCandidates) {
+ LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
+
+    // At the moment, we allow only simple chains that consist solely of reads
+    // and accumulate their result with an integer add, i.e. chains that don't
+    // write memory; simply bail if they do.
+ if (!MAC->ReadOnly)
+ return true;
+
+ // Now for all writes in the basic block, check that they don't alias with
+ // the memory locations accessed by our MAC-chain:
+ for (auto *I : Writes) {
+ LLVM_DEBUG(dbgs() << "- "; I->dump());
+ assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
+ for (auto &MemLoc : MAC->MemLocs) {
+ if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
+ ModRefInfo::ModRef))) {
+ LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
+ return true;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
+ return false;
+}
+
+static bool CheckMACMemory(OpChainList &Candidates) {
+ for (auto &C : Candidates) {
+    // A mul has 2 operands, and a narrow op consists of a sext and a load;
+    // thus we expect at least 4 items in this operand value list.
+ if (C->size() < 4) {
+ LLVM_DEBUG(dbgs() << "Operand list too short.\n");
+ return false;
+ }
+ C->SetMemoryLocations();
+ ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
+ ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
+
+ // Use +=2 to skip over the expected extend instructions.
+ for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
+ if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
+ return false;
+ }
+ }
+ return true;
+}
+
+// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
+// multiplications.
+// To use SMLAD:
+// 1) we first need to find integer add reduction PHIs,
+// 2) then from the PHI, look for this pattern:
+//
+// acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
+// ld0 = load i16
+// sext0 = sext i16 %ld0 to i32
+// ld1 = load i16
+// sext1 = sext i16 %ld1 to i32
+// mul0 = mul %sext0, %sext1
+// ld2 = load i16
+// sext2 = sext i16 %ld2 to i32
+// ld3 = load i16
+// sext3 = sext i16 %ld3 to i32
+// mul1 = mul i32 %sext2, %sext3
+// add0 = add i32 %mul0, %acc0
+// acc1 = add i32 %add0, %mul1
+//
+// Which can be selected to:
+//
+// ldr.h r0
+// ldr.h r1
+// smlad r2, r0, r1, r2
+//
+// If constants are used instead of loads, these will need to be hoisted
+// out and into a register.
+//
+// If loop invariants are used instead of loads, these need to be packed
+// before the loop begins.
+//
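+// As a rough source-level illustration (types and array names are only an
+// example), such a pattern typically comes from a dot-product style loop:
+//
+//   int32_t acc = 0;
+//   for (int i = 0; i < N; i += 2)
+//     acc += (int32_t)a[i] * b[i] + (int32_t)a[i + 1] * b[i + 1];
+//
+// with a and b pointing to int16_t data, where each pair of 16-bit loads can
+// be widened into one 32-bit load feeding a single smlad.
+//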
+bool ARMParallelDSP::MatchSMLAD(Function &F) {
+ BasicBlock *Header = L->getHeader();
+ LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
+ dbgs() << "Header block:\n"; Header->dump();
+ dbgs() << "Loop info:\n\n"; L->dump());
+
+ bool Changed = false;
+ ReductionList Reductions;
+ MatchReductions(F, L, Header, Reductions);
+
+ for (auto &R : Reductions) {
+ OpChainList MACCandidates;
+ MatchParallelMACSequences(R, MACCandidates);
+ if (!CheckMACMemory(MACCandidates))
+ continue;
+
+ R.MACCandidates = std::move(MACCandidates);
+
+ LLVM_DEBUG(dbgs() << "MAC candidates:\n";
+ for (auto &M : R.MACCandidates)
+ M->Root->dump();
+ dbgs() << "\n";);
+ }
+
+ // Collect all instructions that may read or write memory. Our alias
+ // analysis checks bail out if any of these instructions aliases with an
+ // instruction from the MAC-chain.
+ Instructions Reads, Writes;
+ AliasCandidates(Header, Reads, Writes);
+
+ for (auto &R : Reductions) {
+ if (AreAliased(AA, Reads, Writes, R.MACCandidates))
+ return false;
+ PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
+ Changed |= InsertParallelMACs(R, PMACPairs);
+ }
+
+  LLVM_DEBUG(if (Changed) { dbgs() << "Header block:\n"; Header->dump(); });
+ return Changed;
+}
+
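+// CreateLoadIns widens the first load of a consecutive pair to the (32-bit)
+// accumulator type; roughly (IR sketch, names are illustrative):
+//
+//   %ld0 = load i16, i16* %a               %p  = bitcast i16* %a to i32*
+//   %ld1 = load i16, i16* %a.next    ==>   %ld = load i32, i32* %p
+//
+// so that one i32 load covers both halfwords fed to llvm.arm.smlad.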
+static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc,
+ LoadInst **VecLd) {
+ const Type *AccTy = Acc->getType();
+ const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace();
+
+ Value *VecPtr = IRB.CreateBitCast((*VecLd)->getPointerOperand(),
+ AccTy->getPointerTo(AddrSpace));
+ *VecLd = IRB.CreateAlignedLoad(VecPtr, (*VecLd)->getAlignment());
+}
+
+Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+ Instruction *Acc,
+ Instruction *InsertAfter) {
+ LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n";
+ dbgs() << "- "; VecLd0->dump();
+ dbgs() << "- "; VecLd1->dump();
+ dbgs() << "- "; Acc->dump());
+
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+
+ // Replace the reduction chain with an intrinsic call
+ CreateLoadIns(Builder, Acc, &VecLd0);
+ CreateLoadIns(Builder, Acc, &VecLd1);
+ Value* Args[] = { VecLd0, VecLd1, Acc };
+ Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad);
+ CallInst *Call = Builder.CreateCall(SMLAD, Args);
+ NumSMLAD++;
+ return Call;
+}
+
+Pass *llvm::createARMParallelDSPPass() {
+ return new ARMParallelDSP();
+}
+
+char ARMParallelDSP::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
+ "Transform loops to use DSP intrinsics", false, false)
+INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
+ "Transform loops to use DSP intrinsics", false, false)
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index fad0e98285e6..0e16d6bcfe2b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -175,15 +175,20 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
switch (RC.getID()) {
case GPRRegClassID:
+ case GPRwithAPSRRegClassID:
case GPRnopcRegClassID:
+ case rGPRRegClassID:
case GPRspRegClassID:
case tGPR_and_tcGPRRegClassID:
+ case tcGPRRegClassID:
case tGPRRegClassID:
return getRegBank(ARM::GPRRegBankID);
+ case HPRRegClassID:
case SPR_8RegClassID:
case SPRRegClassID:
case DPR_8RegClassID:
case DPRRegClassID:
+ case QPRRegClassID:
return getRegBank(ARM::FPRRegBankID);
default:
llvm_unreachable("Unsupported register kind");
@@ -263,13 +268,74 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_FADD:
case G_FSUB:
case G_FMUL:
- case G_FDIV: {
+ case G_FDIV:
+ case G_FNEG: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
OperandsMapping =Ty.getSizeInBits() == 64
? &ARM::ValueMappings[ARM::DPR3OpsIdx]
: &ARM::ValueMappings[ARM::SPR3OpsIdx];
break;
}
+ case G_FMA: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ OperandsMapping =
+ Ty.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]})
+ : getOperandsMapping({&ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx]});
+ break;
+ }
+ case G_FPEXT: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if (ToTy.getSizeInBits() == 64 && FromTy.getSizeInBits() == 32)
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx]});
+ break;
+ }
+ case G_FPTRUNC: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if (ToTy.getSizeInBits() == 32 && FromTy.getSizeInBits() == 64)
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]});
+ break;
+ }
+ case G_FPTOSI:
+ case G_FPTOUI: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if ((FromTy.getSizeInBits() == 32 || FromTy.getSizeInBits() == 64) &&
+ ToTy.getSizeInBits() == 32)
+ OperandsMapping =
+ FromTy.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]})
+ : getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::SPR3OpsIdx]});
+ break;
+ }
+ case G_SITOFP:
+ case G_UITOFP: {
+ LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
+ if (FromTy.getSizeInBits() == 32 &&
+ (ToTy.getSizeInBits() == 32 || ToTy.getSizeInBits() == 64))
+ OperandsMapping =
+ ToTy.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]})
+ : getOperandsMapping({&ARM::ValueMappings[ARM::SPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]});
+ break;
+ }
case G_CONSTANT:
case G_FRAME_INDEX:
case G_GLOBAL_VALUE:
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td b/contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td
index 7cd2d60d36a4..6e3834da3bb5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td
@@ -11,4 +11,4 @@
//===----------------------------------------------------------------------===//
def GPRRegBank : RegisterBank<"GPRB", [GPR, GPRwithAPSR]>;
-def FPRRegBank : RegisterBank<"FPRB", [SPR, DPR]>;
+def FPRRegBank : RegisterBank<"FPRB", [HPR, SPR, DPR, QPR]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 14526b777c70..dc56186cb54a 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -307,6 +307,18 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
+ let AltOrders = [(add (decimate HPR, 2), SPR),
+ (add (decimate HPR, 4),
+ (decimate HPR, 2),
+ (decimate (rotl HPR, 1), 4),
+ (decimate (rotl HPR, 1), 2))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
+ let DiagnosticString = "operand must be a register in range [s0, s31]";
+}
+
// Subset of SPR which can be used as a source of NEON scalars for 16-bit
// operations
def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA57.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA57.td
index 1ed9e14dfcd6..63f975ba6e39 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleA57.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA57.td
@@ -92,6 +92,9 @@ def CortexA57Model : SchedMachineModel {
// Enable partial & runtime unrolling.
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
@@ -125,8 +128,9 @@ def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
"(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
"(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
"(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
- "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG",
- "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>;
+ "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "(t2|t)?UDF$", "t2DCPS", "t2SG",
+ "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier",
+ "t__brkdiv0")>;
def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
@@ -146,7 +150,7 @@ def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
// Pseudos
def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
"(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
- "tLDRpci_pic", "t2SUBS_PC_LR",
+ "tLDRpci_pic", "(t2)?SUBS_PC_LR",
"JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
"VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
"VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
@@ -279,6 +283,9 @@ def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
+def : InstRW<[A57WriteMLA],
+ (instregex "t2SMLAD", "t2SMLADX", "t2SMLSD", "t2SMLSDX")>;
+
def : SchedAlias<WriteMAC16, A57WriteMLA>;
def : SchedAlias<WriteMAC32, A57WriteMLA>;
def : SchedAlias<ReadMAC, A57ReadMLA>;
@@ -587,6 +594,8 @@ def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
def : InstRW<[A57WriteLDM_Upd],
(instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLLDM")>;
+
// --- 3.9 Store Instructions ---
// Store, immed offset
@@ -705,6 +714,8 @@ def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
(instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
+def : InstRW<[A57Write_5cyc_1S], (instregex "VLSTM")>;
+
// --- 3.10 FP Data Processing Instructions ---
def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
@@ -722,9 +733,11 @@ def : InstRW<[A57WriteVcmp],
// fp convert
def : InstRW<[A57Write_5cyc_1V], (instregex
"VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
-
+def : InstRW<[A57Write_5cyc_1V], (instregex "VTOSLS", "VTOUHS", "VTOULS")>;
def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
+def : InstRW<[A57Write_5cyc_1V], (instregex "VJCVT")>;
+
// FP round to integral
def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
@@ -734,6 +747,8 @@ def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
+def : InstRW<[A57Write_17cyc_1W], (instregex "VSQRTH")>;
+
// FP max/min
def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
@@ -767,6 +782,13 @@ def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
+// VMLAH/VMLSH are not bound to scheduling classes by default; map them here:
+def : InstRW<[A57WriteVFMA, A57ReadVFMA5, ReadFPMUL, ReadFPMUL],
+ (instregex "VMLAH", "VMLSH", "VNMLAH", "VNMLSH")>;
+
+def : InstRW<[A57WriteVMUL],
+ (instregex "VUDOTD", "VSDOTD", "VUDOTQ", "VSDOTQ")>;
+
def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
@@ -775,6 +797,8 @@ def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VINSH")>;
+
// 5cyc L for FP transfer, vfp to core reg,
// 5cyc L for FP transfer, core reg to vfp
def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
@@ -1062,6 +1086,11 @@ def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
(instregex "VQDMLAL", "VQDMLSL")>;
+// Vector Saturating Rounding Doubling Multiply Accumulate/Subtract Long
+// Scheduling info from VQDMLAL/VQDMLSL
+def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
+ (instregex "VQRDMLAH", "VQRDMLSH")>;
+
// ASIMD multiply long
// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
def A57WriteVMULL_VecInt : SchedWriteVariant<[
@@ -1126,6 +1155,8 @@ def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
"VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
+def : InstRW<[A57Write_5cyc_1V], (instregex "VCADD", "VCMLA")>;
+
// ASIMD FP compare
def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
"VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
@@ -1184,7 +1215,7 @@ def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
// ASIMD move, immed
def : InstRW<[A57Write_3cyc_1V], (instregex
"VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
- "VMOVQ0")>;
+ "VMOVD0", "VMOVQ0")>;
// ASIMD move, narrowing
def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
index 4e72b13d94cb..fc301c589269 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -1898,6 +1898,9 @@ def CortexA9Model : SchedMachineModel {
// FIXME: Many vector operations were never given an itinerary. We
// haven't mapped these to the new model either.
let CompleteModel = 0;
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
@@ -1993,15 +1996,15 @@ def : WriteRes<WriteVST4, []>;
// Reserve A9UnitFP for 2 consecutive cycles.
def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 4;
- let ResourceCycles = [2];
+ let ResourceCycles = [2, 1];
}
def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 7;
- let ResourceCycles = [2];
+ let ResourceCycles = [2, 1];
}
def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 9;
- let ResourceCycles = [2];
+ let ResourceCycles = [2, 1];
}
// Branches don't have a def operand but still consume resources.
@@ -2534,8 +2537,7 @@ def : SchedAlias<WriteCMPsr, A9WriteALU>;
def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
"MOVCCsr")>;
def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
-def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
- "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm")>;
def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
@@ -2548,12 +2550,12 @@ def : InstRW< [A9WriteM],
"SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
def : InstRW< [A9WriteM, A9WriteMHi],
(instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
- "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+ "UMAAL", "SMLALv5", "UMLALv5", "SMLALBB", "SMLALBT", "SMLALTB",
"SMLALTT")>;
// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
def : InstRW< [A9WriteM, A9WriteMHi],
(instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
- "SMLSLD", "SMLLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+ "SMLSLD", "SMLSLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
def : InstRW<[A9WriteM16, A9WriteM16Hi],
(instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
index ca3172808d36..11bce45161b3 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
@@ -217,12 +217,11 @@ def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS],
"t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH", "t2UXTB16")>;
def : InstRW<[R52WriteALU_EX1, R52Read_ISS],
- (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi",
- "t2MOVi", "t2MOV_ga_dyn")>;
+ (instregex "MOVCCi32imm", "MOVi32imm", "t2MOVCCi", "t2MOVi")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1],
- (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel")>;
+ (instregex "MOV_ga_pcrel$")>;
def : InstRW<[R52WriteLd,R52Read_ISS],
- (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+ (instregex "MOV_ga_pcrel_ldr")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "SEL", "t2SEL")>;
@@ -257,12 +256,12 @@ def : InstRW< [R52WriteALU_EX2, R52Read_EX1, R52Read_EX1],
// Sum of Absolute Difference
def : InstRW< [R52WriteALU_WRI, R52Read_ISS, R52Read_ISS, R52Read_ISS],
- (instregex "USAD8", "t2USAD8", "tUSAD8","USADA8", "t2USADA8", "tUSADA8") >;
+ (instregex "USAD8", "t2USAD8", "USADA8", "t2USADA8") >;
// Integer Multiply
def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS],
- (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
- "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+ (instregex "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+ "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDX", "t2MUL",
"t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
"t2SMULWB", "t2SMULWT", "t2SMUSD")>;
@@ -270,17 +269,17 @@ def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS],
// Even for 64-bit accumulation (or Long), the single MAC is used (not ALUs).
// The store pipeline is used partly for 64-bit operations.
def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS, R52Read_ISS],
- (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
- "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS", "t2SMMLSR",
+ (instregex "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+ "t2MLA", "t2MLS", "t2SMMLA", "t2SMMLAR", "t2SMMLS", "t2SMMLSR",
"SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX",
"SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
"SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
"t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT",
"SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX",
"SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$",
- "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+ "SMLAL", "UMLAL", "SMLALBT",
"SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
- "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB",
+ "UMAAL", "t2SMLAL", "t2UMLAL",
"t2SMLALBT", "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX",
"t2SMLSLD", "t2SMLSLDX", "t2UMAAL")>;
@@ -301,31 +300,31 @@ def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_ISS],
"LDRBT_POST$", "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
"t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T",
"LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
- "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T",
+ "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)?",
"LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "MOVS?sr", "t2MOVS?sr")>;
def : InstRW<[R52WriteALU_WRI, R52Read_EX2], (instregex "MOVT", "t2MOVT")>;
-def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "AD(C|D)S?ri","ANDS?ri",
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "AD(C|D)S?ri", "ANDS?ri",
"BICS?ri", "CLZ", "EORri", "MVNS?r", "ORRri", "RSBS?ri", "RSCri", "SBCri",
"t2AD(C|D)S?ri", "t2ANDS?ri", "t2BICS?ri","t2CLZ", "t2EORri", "t2MVN",
"t2ORRri", "t2RSBS?ri", "t2SBCri")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "AD(C|D)S?rr",
- "ANDS?rr", "BICS?rr", "CRC*", "EORrr", "ORRrr", "RSBrr", "RSCrr", "SBCrr",
+ "ANDS?rr", "BICS?rr", "CRC", "EORrr", "ORRrr", "RSBrr", "RSCrr", "SBCrr",
"t2AD(C|D)S?rr", "t2ANDS?rr", "t2BICS?rr", "t2CRC", "t2EORrr", "t2SBCrr")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS], (instregex "AD(C|D)S?rsi",
"ANDS?rsi", "BICS?rsi", "EORrsi", "ORRrsi", "RSBrsi", "RSCrsi", "SBCrsi",
- "t2AD(|D)S?rsi", "t2ANDS?rsi", "t2BICS?rsi", "t2EORrsi", "t2ORRrsi", "t2RSBrsi", "t2SBCrsi")>;
+ "t2AD(C|D)S?rs", "t2ANDS?rs", "t2BICS?rs", "t2EORrs", "t2ORRrs", "t2RSBrs", "t2SBCrs")>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS, R52Read_ISS],
(instregex "AD(C|D)S?rsr", "ANDS?rsr", "BICS?rsr", "EORrsr", "MVNS?sr",
- "ORRrsrr", "RSBrsr", "RSCrsr", "SBCrsr")>;
+ "ORRrsr", "RSBrsr", "RSCrsr", "SBCrsr")>;
def : InstRW<[R52WriteALU_EX1],
- (instregex "ADR", "MOVSi", "MOVSsi", "MOVST?i16*", "MVNS?s?i", "t2MOVS?si")>;
+ (instregex "ADR", "MOVsi", "MVNS?s?i", "t2MOVS?si")>;
def : InstRW<[R52WriteALU_EX1, R52Read_ISS], (instregex "ASRi", "RORS?i")>;
def : InstRW<[R52WriteALU_EX1, R52Read_ISS, R52Read_ISS],
@@ -484,7 +483,7 @@ def : InstRW<[R52WriteILDM, R52Read_ISS],
def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
(instregex "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
- (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+ (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "tPOP")>;
// Integer Store, Single Element
def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
@@ -500,7 +499,7 @@ def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
// Integer Store, Dual
def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
- (instregex "STRD$", "t2STRDi8", "STL", "t2STRD$", "t2STL")>;
+ (instregex "STRD$", "t2STRDi8", "STL", "t2STL")>;
def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
(instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
@@ -508,11 +507,11 @@ def : InstRW<[R52WriteISTM, R52Read_ISS, R52Read_EX2],
(instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
def : InstRW<[R52WriteISTM, R52WriteAdr, R52Read_ISS, R52Read_EX2],
(instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
- "PUSH", "tPUSH")>;
+ "tPUSH")>;
// LDRLIT pseudo instructions, they expand to LDR + PICADD
def : InstRW<[R52WriteLd],
- (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>;
+ (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel$")>;
// LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR
def : InstRW<[R52WriteLd], (instregex "LDRLIT_ga_pcrel_ldr")>;
@@ -530,7 +529,7 @@ def : InstRW<[R52Write2FPALU_F5, R52Read_F1], (instregex "VABS(fq|hq)")>;
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fd|hd)")>;
def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fq|hq)")>;
-def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)$")>;
def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(fq|hq)")>;
def : InstRW<[R52WriteFPLd_F4, R52Read_ISS, R52Read_F1], (instregex "VLDR")>;
@@ -792,8 +791,6 @@ def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)d")>;
def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)q")>;
-def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
-
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1],
(instregex "(VCEQ|VCGE|VCGT|VCLE|VCLT|VCLZ|VCMP|VCMPE|VCNT)")>;
def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
index b838688c6f04..87984648139b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -44,6 +44,9 @@ def SwiftModel : SchedMachineModel {
let LoadLatency = 3;
let MispredictPenalty = 14; // A branch direction mispredict.
let CompleteModel = 0; // FIXME: Remove if all instructions are covered.
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
// Swift predicates.
@@ -161,12 +164,12 @@ let SchedModel = SwiftModel in {
"t2UXTB16")>;
// Pseudo instructions.
def : InstRW<[SwiftWriteP01OneCycle2x],
- (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm",
- "t2MOVi32imm", "t2MOV_ga_dyn")>;
+ (instregex "MOVCCi32imm", "MOVi32imm", "t2MOVCCi32imm",
+ "t2MOVi32imm")>;
def : InstRW<[SwiftWriteP01OneCycle3x],
(instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>;
def : InstRW<[SwiftWriteP01OneCycle2x_load],
- (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+ (instregex "MOV_ga_pcrel_ldr")>;
def SwiftWriteP0TwoCycleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
@@ -218,8 +221,8 @@ let SchedModel = SwiftModel in {
// 4.2.12 Integer Multiply (32-bit result)
// Two sources.
def : InstRW< [SwiftWriteP0FourCycle],
- (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
- "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+ (instregex "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+ "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDX", "t2MUL",
"t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
"t2SMULWB", "t2SMULWT", "t2SMUSD")>;
@@ -241,8 +244,8 @@ let SchedModel = SwiftModel in {
// Multiply accumulate, three sources
def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
SwiftReadAdvanceFourCyclesPred],
- (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
- "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+ (instregex "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+ "t2MLA", "t2MLS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
"t2SMMLSR")>;
// 4.2.13 Integer Multiply (32-bit result, Q flag)
@@ -302,9 +305,9 @@ let SchedModel = SwiftModel in {
// We are being a bit inaccurate here.
def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
SchedReadAdvance<4>, SchedReadAdvance<3>],
- (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+ (instregex "SMLAL", "UMLAL", "SMLALBT",
"SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
- "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB", "t2SMLALBT",
+ "UMAAL", "t2SMLAL", "t2UMLAL", "t2SMLALBB", "t2SMLALBT",
"t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
"t2UMAAL")>;
@@ -366,7 +369,7 @@ let SchedModel = SwiftModel in {
"t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>;
def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne],
(instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
- "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>;
+ "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)?")>;
// 4.2.21 Integer Dual Load
// Not accurate.
@@ -483,7 +486,7 @@ let SchedModel = SwiftModel in {
(instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
"LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
- (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+ (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "tPOP")>;
// 4.2.23 Integer Store, Single Element
def : InstRW<[SwiftWriteP2],
(instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
@@ -533,7 +536,7 @@ let SchedModel = SwiftModel in {
(instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM],
(instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
- "PUSH", "tPUSH")>;
+ "tPUSH")>;
// LDRLIT pseudo instructions, they expand to LDR + PICADD
def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU],
@@ -549,14 +552,14 @@ let SchedModel = SwiftModel in {
// 4.2.27 Not issued
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
- def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>;
+ def : InstRW<[WriteNoop], (instregex "t2IT", "IT")>;
// 4.2.28 Advanced SIMD, Integer, 2 cycle
def : InstRW<[SwiftWriteP0TwoCycle],
(instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
"VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
"VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
- "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF",
+ "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF",
"VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
def : InstRW<[SwiftWriteP1TwoCycle],
@@ -566,7 +569,7 @@ let SchedModel = SwiftModel in {
// 4.2.30 Advanced SIMD, Integer with Accumulate
def : InstRW<[SwiftWriteP0FourCycle],
(instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT",
- "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
+ "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
"VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD",
"VQSUB")>;
def : InstRW<[SwiftWriteP1FourCycle],
@@ -623,12 +626,12 @@ let SchedModel = SwiftModel in {
// 4.2.37 Advanced SIMD and VFP, Move
def : InstRW<[SwiftWriteP0TwoCycle],
(instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc",
- "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc",
+ "VMVNv", "VMVN(d|q)",
"FCONST(D|S)")>;
def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>;
def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>],
(instregex "VQMOVN")>;
- def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>;
+ def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN")>;
def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>],
(instregex "VDUP(8|16|32)")>;
def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 23027e92481f..f42cbbda1b71 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -302,6 +302,8 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
}
}
+bool ARMSubtarget::isTargetHardFloat() const { return TM.isTargetHardFloat(); }
+
bool ARMSubtarget::isAPCS_ABI() const {
assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
index eedb675a3304..74aee9a8ed38 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -105,6 +105,7 @@ protected:
ARMv81a,
ARMv82a,
ARMv83a,
+ ARMv84a,
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
@@ -151,6 +152,7 @@ protected:
bool HasV8_1aOps = false;
bool HasV8_2aOps = false;
bool HasV8_3aOps = false;
+ bool HasV8_4aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
@@ -198,6 +200,9 @@ protected:
/// register allocation.
bool DisablePostRAScheduler = false;
+ /// UseAA - True if using AA during codegen (DAGCombine, MISched, etc)
+ bool UseAA = false;
+
/// HasThumb2 - True if Thumb2 instructions are supported.
bool HasThumb2 = false;
@@ -296,6 +301,12 @@ protected:
/// Has8MSecExt - if true, processor supports ARMv8-M Security Extensions
bool Has8MSecExt = false;
+ /// HasSHA2 - if true, processor supports SHA1 and SHA256
+ bool HasSHA2 = false;
+
+ /// HasAES - if true, processor supports AES
+ bool HasAES = false;
+
/// HasCrypto - if true, processor supports Cryptography extensions
bool HasCrypto = false;
@@ -316,6 +327,10 @@ protected:
/// pairs faster.
bool HasFuseAES = false;
+ /// HasFuseLiterals - if true, processor executes back to back
+ /// bottom and top halves of literal generation faster.
+ bool HasFuseLiterals = false;
+
/// If true, if conversion may decide to leave some instructions unpredicated.
bool IsProfitableToUnpredicate = false;
@@ -341,9 +356,12 @@ protected:
/// If true, the AGU and NEON/FPU units are multiplexed.
bool HasMuxedUnits = false;
- /// If true, VMOVS will never be widened to VMOVD
+ /// If true, VMOVS will never be widened to VMOVD.
bool DontWidenVMOVS = false;
+ /// If true, splat a register between VFP and NEON instructions.
+ bool SplatVFPToNeon = false;
+
/// If true, run the MLx expansion pass.
bool ExpandMLx = false;
@@ -510,6 +528,7 @@ public:
bool hasV8_1aOps() const { return HasV8_1aOps; }
bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasV8_3aOps() const { return HasV8_3aOps; }
+ bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
@@ -535,6 +554,8 @@ public:
bool hasVFP4() const { return HasVFPv4; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
+ bool hasSHA2() const { return HasSHA2; }
+ bool hasAES() const { return HasAES; }
bool hasCrypto() const { return HasCrypto; }
bool hasDotProd() const { return HasDotProd; }
bool hasCRC() const { return HasCRC; }
@@ -577,6 +598,7 @@ public:
bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
bool hasMuxedUnits() const { return HasMuxedUnits; }
bool dontWidenVMOVS() const { return DontWidenVMOVS; }
+ bool useSplatVFPToNeon() const { return SplatVFPToNeon; }
bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
bool nonpipelinedVFP() const { return NonpipelinedVFP; }
@@ -598,8 +620,9 @@ public:
bool hasFullFP16() const { return HasFullFP16; }
bool hasFuseAES() const { return HasFuseAES; }
- /// \brief Return true if the CPU supports any kind of instruction fusion.
- bool hasFusion() const { return hasFuseAES(); }
+ bool hasFuseLiterals() const { return HasFuseLiterals; }
+ /// Return true if the CPU supports any kind of instruction fusion.
+ bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -652,13 +675,7 @@ public:
!isTargetDarwin() && !isTargetWindows();
}
- bool isTargetHardFloat() const {
- // FIXME: this is invalid for WindowsCE
- return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
- TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
- TargetTriple.getEnvironment() == Triple::EABIHF ||
- isTargetWindows() || isAAPCS16_ABI();
- }
+ bool isTargetHardFloat() const;
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
@@ -723,6 +740,10 @@ public:
/// True for some subtargets at > -O0.
bool enablePostRAScheduler() const override;
+ /// Enable use of alias analysis during code generation (during MI
+ /// scheduling, DAGCombine, etc.).
+ bool useAA() const override { return UseAA; }
+
// enableAtomicExpand- True if we need to expand our atomics.
bool enableAtomicExpand() const override;
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 0f6d1eddc985..519f789fc215 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -22,7 +22,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/ExecutionDomainFix.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -34,7 +34,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -45,6 +44,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include <cassert>
@@ -75,7 +75,7 @@ EnableGlobalMerge("arm-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"));
namespace llvm {
- void initializeARMExecutionDepsFixPass(PassRegistry&);
+ void initializeARMExecutionDomainFixPass(PassRegistry&);
}
extern "C" void LLVMInitializeARMTarget() {
@@ -89,8 +89,10 @@ extern "C" void LLVMInitializeARMTarget() {
initializeGlobalISel(Registry);
initializeARMLoadStoreOptPass(Registry);
initializeARMPreAllocLoadStoreOptPass(Registry);
+ initializeARMParallelDSPPass(Registry);
+ initializeARMCodeGenPreparePass(Registry);
initializeARMConstantIslandsPass(Registry);
- initializeARMExecutionDepsFixPass(Registry);
+ initializeARMExecutionDomainFixPass(Registry);
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
}
@@ -214,11 +216,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
// Default to triple-appropriate float ABI
if (Options.FloatABIType == FloatABI::Default) {
- if (TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
- TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
- TargetTriple.getEnvironment() == Triple::EABIHF ||
- TargetTriple.isOSWindows() ||
- TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+ if (isTargetHardFloat())
this->Options.FloatABIType = FloatABI::Hard;
else
this->Options.FloatABIType = FloatABI::Soft;
@@ -238,6 +236,11 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
this->Options.EABIVersion = EABI::EABI5;
}
+ if (TT.isOSBinFormatMachO()) {
+ this->Options.TrapUnreachable = true;
+ this->Options.NoTrapAfterNoreturn = true;
+ }
+
initAsmInfo();
}
@@ -344,6 +347,7 @@ public:
}
void addIRPasses() override;
+ void addCodeGenPrepare() override;
bool addPreISel() override;
bool addInstSelector() override;
bool addIRTranslator() override;
@@ -355,20 +359,23 @@ public:
void addPreEmitPass() override;
};
-class ARMExecutionDepsFix : public ExecutionDepsFix {
+class ARMExecutionDomainFix : public ExecutionDomainFix {
public:
static char ID;
- ARMExecutionDepsFix() : ExecutionDepsFix(ID, ARM::DPRRegClass) {}
+ ARMExecutionDomainFix() : ExecutionDomainFix(ID, ARM::DPRRegClass) {}
StringRef getPassName() const override {
- return "ARM Execution Dependency Fix";
+ return "ARM Execution Domain Fix";
}
};
-char ARMExecutionDepsFix::ID;
+char ARMExecutionDomainFix::ID;
} // end anonymous namespace
-INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix",
- "ARM Execution Dependency Fix", false, false)
+INITIALIZE_PASS_BEGIN(ARMExecutionDomainFix, "arm-execution-domain-fix",
+ "ARM Execution Domain Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
+INITIALIZE_PASS_END(ARMExecutionDomainFix, "arm-execution-domain-fix",
+ "ARM Execution Domain Fix", false, false)
TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
return new ARMPassConfig(*this, PM);
@@ -397,7 +404,16 @@ void ARMPassConfig::addIRPasses() {
addPass(createInterleavedAccessPass());
}
+void ARMPassConfig::addCodeGenPrepare() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createARMCodeGenPreparePass());
+ TargetPassConfig::addCodeGenPrepare();
+}
+
bool ARMPassConfig::addPreISel() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createARMParallelDSPPass());
+
if ((TM->getOptLevel() != CodeGenOpt::None &&
EnableGlobalMerge == cl::BOU_UNSET) ||
EnableGlobalMerge == cl::BOU_TRUE) {
@@ -462,7 +478,8 @@ void ARMPassConfig::addPreSched2() {
if (EnableARMLoadStoreOpt)
addPass(createARMLoadStoreOptimizationPass());
- addPass(new ARMExecutionDepsFix());
+ addPass(new ARMExecutionDomainFix());
+ addPass(createBreakFalseDeps());
}
// Expand some pseudo instructions into multiple instructions to allow
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
index 2072bb731f0a..2c791998e702 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -61,6 +61,16 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ bool isTargetHardFloat() const {
+ return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ (TargetTriple.isOSBinFormatMachO() &&
+ TargetTriple.getSubArch() == Triple::ARMSubArch_v7em) ||
+ TargetTriple.isOSWindows() ||
+ TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+ }
};
/// ARM/Thumb little endian target machine.
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 88bab64ffaf2..d0620761ea9c 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -40,9 +40,6 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
if (isAAPCS_ABI) {
LSDASection = nullptr;
}
-
- AttributesSection =
- getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
}
const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
index bd7aa1cfe02b..0dc0882809c0 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -16,9 +16,6 @@
namespace llvm {
class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
-protected:
- const MCSection *AttributesSection = nullptr;
-
public:
ARMElfTargetObjectFile()
: TargetLoweringObjectFileELF() {
@@ -33,7 +30,7 @@ public:
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
- /// \brief Describe a TLS variable address within debug info.
+ /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 43d7888075b5..f8cae31641ff 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -15,7 +15,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -26,6 +25,7 @@
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -126,6 +126,10 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
return 0;
}
+ // xor a, -1 can always be folded to MVN
+ if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
+ return 0;
+
return getIntImmCost(Imm, Ty);
}
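
A quick illustration of why the all-ones immediate is treated as free here: XOR with -1 is plain bitwise NOT, which ARM expresses as a single MVN with no constant to materialize. A minimal standalone sketch of that identity (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // "xor a, -1" is bitwise NOT; ARM encodes that directly as MVN rd, rn,
  // so the -1 never has to be built as an immediate.
  uint32_t a = 0x12345678;
  assert((a ^ 0xffffffffu) == ~a);
  return 0;
}
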
@@ -351,7 +355,7 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
- // On NEON a a vector select gets lowered to vbsl.
+ // On NEON a vector select gets lowered to vbsl.
if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
// Lowering of some vector selects is currently far from perfect.
static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
@@ -396,8 +400,8 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- // We only handle costs of reverse and alternate shuffles for now.
- if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
+ // We only handle costs of reverse and select shuffles for now.
+ if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Select)
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
if (Kind == TTI::SK_Reverse) {
@@ -422,9 +426,9 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
- if (Kind == TTI::SK_Alternate) {
- static const CostTblEntry NEONAltShuffleTbl[] = {
- // Alt shuffle cost table for ARM. Cost is the number of instructions
+ if (Kind == TTI::SK_Select) {
+ static const CostTblEntry NEONSelShuffleTbl[] = {
+ // Select shuffle cost table for ARM. Cost is the number of instructions
// required to create the shuffled vector.
{ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
@@ -441,7 +445,7 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl,
+ if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
ISD::VECTOR_SHUFFLE, LT.second))
return LT.first * Entry->Cost;
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -579,9 +583,9 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
SmallVector<BasicBlock*, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- DEBUG(dbgs() << "Loop has:\n"
- << "Blocks: " << L->getNumBlocks() << "\n"
- << "Exit blocks: " << ExitingBlocks.size() << "\n");
+ LLVM_DEBUG(dbgs() << "Loop has:\n"
+ << "Blocks: " << L->getNumBlocks() << "\n"
+ << "Exit blocks: " << ExitingBlocks.size() << "\n");
// Only allow another exit other than the latch. This acts as an early exit
// as it mirrors the profitability calculation of the runtime unroller.
@@ -612,12 +616,14 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
- DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
UP.Partial = true;
UP.Runtime = true;
UP.UnrollRemainder = true;
UP.DefaultUnrollRuntimeCount = 4;
+ UP.UnrollAndJam = true;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
// Force unrolling small loops can be very useful because of the branch
// taken cost of the backedge.
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 97b642c99f80..807d62547337 100644
--- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -527,6 +527,7 @@ class ARMAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseCoprocRegOperand(OperandVector &);
OperandMatchResultTy parseCoprocOptionOperand(OperandVector &);
OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &);
+ OperandMatchResultTy parseTraceSyncBarrierOptOperand(OperandVector &);
OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &);
OperandMatchResultTy parseProcIFlagsOperand(OperandVector &);
OperandMatchResultTy parseMSRMaskOperand(OperandVector &);
@@ -561,6 +562,8 @@ class ARMAsmParser : public MCTargetAsmParser {
bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
bool isITBlockTerminator(MCInst &Inst) const;
void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands);
+ bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands,
+ bool Load, bool ARMMode, bool Writeback);
public:
enum ARMMatchResultTy {
@@ -644,6 +647,7 @@ class ARMOperand : public MCParsedAsmOperand {
k_Immediate,
k_MemBarrierOpt,
k_InstSyncBarrierOpt,
+ k_TraceSyncBarrierOpt,
k_Memory,
k_PostIndexRegister,
k_MSRMask,
@@ -694,6 +698,10 @@ class ARMOperand : public MCParsedAsmOperand {
ARM_ISB::InstSyncBOpt Val;
};
+ struct TSBOptOp {
+ ARM_TSB::TraceSyncBOpt Val;
+ };
+
struct IFlagsOp {
ARM_PROC::IFlags Val;
};
@@ -790,6 +798,7 @@ class ARMOperand : public MCParsedAsmOperand {
struct CoprocOptionOp CoprocOption;
struct MBOptOp MBOpt;
struct ISBOptOp ISBOpt;
+ struct TSBOptOp TSBOpt;
struct ITMaskOp ITMask;
struct IFlagsOp IFlags;
struct MMaskOp MMask;
@@ -879,6 +888,11 @@ public:
return ISBOpt.Val;
}
+ ARM_TSB::TraceSyncBOpt getTraceSyncBarrierOpt() const {
+ assert(Kind == k_TraceSyncBarrierOpt && "Invalid access!");
+ return TSBOpt.Val;
+ }
+
ARM_PROC::IFlags getProcIFlags() const {
assert(Kind == k_ProcIFlags && "Invalid access!");
return IFlags.Val;
@@ -1028,7 +1042,12 @@ public:
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
- int64_t Value = -CE->getValue();
+ // isImm0_4095Neg is used with 32-bit immediates only.
+ // 32-bit immediates are zero extended to 64-bit when parsed,
+ // thus a simple -CE->getValue() results in a large negative number,
+ // not the small positive number intended.
+ if ((CE->getValue() >> 32) > 0) return false;
+ uint32_t Value = -static_cast<uint32_t>(CE->getValue());
return Value > 0 && Value < 4096;
}
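
A standalone sketch of the wraparound this change relies on; the Parsed value is a hypothetical stand-in for what CE->getValue() returns once the 32-bit immediate has been zero-extended to 64 bits (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // The assembler parses "-1" as the 32-bit immediate 0xFFFFFFFF and
  // zero-extends it into the 64-bit constant-expression value.
  int64_t Parsed = 0xFFFFFFFFll;

  // Negating the 64-bit value gives a large negative number, which the
  // Value > 0 && Value < 4096 range check can never accept.
  int64_t Naive = -Parsed;                          // -4294967295
  assert(Naive < 0);

  // Truncating to uint32_t first makes the negation wrap modulo 2^32,
  // producing the small positive value the check expects.
  uint32_t Fixed = -static_cast<uint32_t>(Parsed);  // 1
  assert(Fixed == 1 && Fixed < 4096);
  return 0;
}
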
@@ -1150,10 +1169,31 @@ public:
bool isToken() const override { return Kind == k_Token; }
bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
- bool isMem() const override { return Kind == k_Memory; }
+ bool isTraceSyncBarrierOpt() const { return Kind == k_TraceSyncBarrierOpt; }
+ bool isMem() const override {
+ if (Kind != k_Memory)
+ return false;
+ if (Memory.BaseRegNum &&
+ !ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Memory.BaseRegNum))
+ return false;
+ if (Memory.OffsetRegNum &&
+ !ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Memory.OffsetRegNum))
+ return false;
+ return true;
+ }
bool isShifterImm() const { return Kind == k_ShifterImmediate; }
- bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
- bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+ bool isRegShiftedReg() const {
+ return Kind == k_ShiftedRegister &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(
+ RegShiftedReg.SrcReg) &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(
+ RegShiftedReg.ShiftReg);
+ }
+ bool isRegShiftedImm() const {
+ return Kind == k_ShiftedImmediate &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(
+ RegShiftedImm.SrcReg);
+ }
bool isRotImm() const { return Kind == k_RotateImmediate; }
bool isModImm() const { return Kind == k_ModifiedImmediate; }
@@ -1192,9 +1232,12 @@ public:
bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
- bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+ bool isPostIdxRegShifted() const {
+ return Kind == k_PostIndexRegister &&
+ ARMMCRegisterClasses[ARM::GPRRegClassID].contains(PostIdxReg.RegNum);
+ }
bool isPostIdxReg() const {
- return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
+ return isPostIdxRegShifted() && PostIdxReg.ShiftTy == ARM_AM::no_shift;
}
bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
if (!isMem())
@@ -1331,10 +1374,10 @@ public:
}
bool isAM3Offset() const {
- if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+ if (isPostIdxReg())
+ return true;
+ if (!isImm())
return false;
- if (Kind == k_PostIndexRegister)
- return PostIdxReg.ShiftTy == ARM_AM::no_shift;
// Immediate offset in range [-255, 255].
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
@@ -1834,7 +1877,22 @@ public:
return ARM_AM::isNEONi32splat(~Value);
}
- bool isNEONByteReplicate(unsigned NumBytes) const {
+ static bool isValidNEONi32vmovImm(int64_t Value) {
+ // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+ // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+ return ((Value & 0xffffffffffffff00) == 0) ||
+ ((Value & 0xffffffffffff00ff) == 0) ||
+ ((Value & 0xffffffffff00ffff) == 0) ||
+ ((Value & 0xffffffff00ffffff) == 0) ||
+ ((Value & 0xffffffffffff00ff) == 0xff) ||
+ ((Value & 0xffffffffff00ffff) == 0xffff);
+ }
+
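
A few worked values against a standalone copy of the predicate above (compiled in isolation, not part of the patch) may make the six masks easier to read:

#include <cassert>
#include <cstdint>

static bool isValidNEONi32vmovImm(int64_t Value) {
  // Copy of the predicate above: one non-zero byte, or the 00Xf / 0Xff forms.
  return ((Value & 0xffffffffffffff00) == 0) ||
         ((Value & 0xffffffffffff00ff) == 0) ||
         ((Value & 0xffffffffff00ffff) == 0) ||
         ((Value & 0xffffffff00ffffff) == 0) ||
         ((Value & 0xffffffffffff00ff) == 0xff) ||
         ((Value & 0xffffffffff00ffff) == 0xffff);
}

int main() {
  assert(isValidNEONi32vmovImm(0x000000ab));   // 000X
  assert(isValidNEONi32vmovImm(0x0000ab00));   // 00X0
  assert(isValidNEONi32vmovImm(0x00ab0000));   // 0X00
  assert(isValidNEONi32vmovImm(0xab000000));   // X000
  assert(isValidNEONi32vmovImm(0x0000abff));   // 00Xf (VMOV/VMVN only)
  assert(isValidNEONi32vmovImm(0x00abffff));   // 0Xff (VMOV/VMVN only)
  assert(!isValidNEONi32vmovImm(0x00ab00cd));  // two non-zero bytes: rejected
  return 0;
}
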
+ bool isNEONReplicate(unsigned Width, unsigned NumElems, bool Inv) const {
+ assert((Width == 8 || Width == 16 || Width == 32) &&
+ "Invalid element width");
+ assert(NumElems * Width <= 64 && "Invalid result width");
+
if (!isImm())
return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -1844,18 +1902,49 @@ public:
int64_t Value = CE->getValue();
if (!Value)
return false; // Don't bother with zero.
+ if (Inv)
+ Value = ~Value;
- unsigned char B = Value & 0xff;
- for (unsigned i = 1; i < NumBytes; ++i) {
- Value >>= 8;
- if ((Value & 0xff) != B)
+ uint64_t Mask = (1ull << Width) - 1;
+ uint64_t Elem = Value & Mask;
+ if (Width == 16 && (Elem & 0x00ff) != 0 && (Elem & 0xff00) != 0)
+ return false;
+ if (Width == 32 && !isValidNEONi32vmovImm(Elem))
+ return false;
+
+ for (unsigned i = 1; i < NumElems; ++i) {
+ Value >>= Width;
+ if ((Value & Mask) != Elem)
return false;
}
return true;
}
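
The core of the check is the replication loop; a minimal model of it with the per-width screening omitted (names here are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Does Value consist of NumElems copies of the same Width-bit element?
static bool replicates(uint64_t Value, unsigned Width, unsigned NumElems) {
  uint64_t Mask = (1ull << Width) - 1;   // Width is 8, 16 or 32 here
  uint64_t Elem = Value & Mask;
  for (unsigned i = 1; i < NumElems; ++i) {
    Value >>= Width;
    if ((Value & Mask) != Elem)
      return false;
  }
  return true;
}

int main() {
  assert(replicates(0xabababababababab, 8, 8));   // e.g. vmov.i8 d0, #0xab
  assert(replicates(0x00ab00ab00ab00ab, 16, 4));  // e.g. vmov.i16 d0, #0xab
  assert(!replicates(0x00ab00ab00ab00ac, 16, 4)); // lowest element differs
  return 0;
}
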
- bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); }
- bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); }
+ bool isNEONByteReplicate(unsigned NumBytes) const {
+ return isNEONReplicate(8, NumBytes, false);
+ }
+
+ static void checkNeonReplicateArgs(unsigned FromW, unsigned ToW) {
+ assert((FromW == 8 || FromW == 16 || FromW == 32) &&
+ "Invalid source width");
+ assert((ToW == 16 || ToW == 32 || ToW == 64) &&
+ "Invalid destination width");
+ assert(FromW < ToW && "ToW is not less than FromW");
+ }
+
+ template<unsigned FromW, unsigned ToW>
+ bool isNEONmovReplicate() const {
+ checkNeonReplicateArgs(FromW, ToW);
+ if (ToW == 64 && isNEONi64splat())
+ return false;
+ return isNEONReplicate(FromW, ToW / FromW, false);
+ }
+
+ template<unsigned FromW, unsigned ToW>
+ bool isNEONinvReplicate() const {
+ checkNeonReplicateArgs(FromW, ToW);
+ return isNEONReplicate(FromW, ToW / FromW, true);
+ }
bool isNEONi32vmov() const {
if (isNEONByteReplicate(4))
@@ -1866,16 +1955,7 @@ public:
// Must be a constant.
if (!CE)
return false;
- int64_t Value = CE->getValue();
- // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
- // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
- // FIXME: This is probably wrong and a copy and paste from previous example
- return (Value >= 0 && Value < 256) ||
- (Value >= 0x0100 && Value <= 0xff00) ||
- (Value >= 0x010000 && Value <= 0xff0000) ||
- (Value >= 0x01000000 && Value <= 0xff000000) ||
- (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
- (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+ return isValidNEONi32vmovImm(CE->getValue());
}
bool isNEONi32vmovNeg() const {
@@ -1883,16 +1963,7 @@ public:
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
// Must be a constant.
if (!CE) return false;
- int64_t Value = ~CE->getValue();
- // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
- // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
- // FIXME: This is probably wrong and a copy and paste from previous example
- return (Value >= 0 && Value < 256) ||
- (Value >= 0x0100 && Value <= 0xff00) ||
- (Value >= 0x010000 && Value <= 0xff0000) ||
- (Value >= 0x01000000 && Value <= 0xff000000) ||
- (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
- (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+ return isValidNEONi32vmovImm(~CE->getValue());
}
bool isNEONi64splat() const {
@@ -2189,7 +2260,7 @@ public:
// The operand is actually an imm0_4095, but we have its
// negation in the assembly source, so twiddle it here.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+ Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const {
@@ -2234,6 +2305,11 @@ public:
Inst.addOperand(MCOperand::createImm(unsigned(getInstSyncBarrierOpt())));
}
+ void addTraceSyncBarrierOptOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(unsigned(getTraceSyncBarrierOpt())));
+ }
+
void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
@@ -2710,62 +2786,87 @@ public:
Inst.addOperand(MCOperand::createImm(Value));
}
- void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
+ void addNEONi8ReplicateOperands(MCInst &Inst, bool Inv) const {
// The immediate encodes the type of constant as well as the value.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = CE->getValue();
assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
Inst.getOpcode() == ARM::VMOVv16i8) &&
- "All vmvn instructions that wants to replicate non-zero byte "
- "always must be replaced with VMOVv8i8 or VMOVv16i8.");
- unsigned B = ((~Value) & 0xff);
+ "All instructions that wants to replicate non-zero byte "
+ "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+ unsigned Value = CE->getValue();
+ if (Inv)
+ Value = ~Value;
+ unsigned B = Value & 0xff;
B |= 0xe00; // cmode = 0b1110
Inst.addOperand(MCOperand::createImm(B));
}
- void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
+ void addNEONinvi8ReplicateOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- // The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = CE->getValue();
+ addNEONi8ReplicateOperands(Inst, true);
+ }
+
+ static unsigned encodeNeonVMOVImmediate(unsigned Value) {
if (Value >= 256 && Value <= 0xffff)
Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
else if (Value > 0xffff && Value <= 0xffffff)
Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
else if (Value > 0xffffff)
Value = (Value >> 24) | 0x600;
- Inst.addOperand(MCOperand::createImm(Value));
+ return Value;
}
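
A few worked inputs through a standalone copy of encodeNeonVMOVImmediate (not part of the patch); bits [11:8] of the result carry the cmode selection and the low byte carries the payload:

#include <cassert>

static unsigned encodeNeonVMOVImmediate(unsigned Value) {
  // Copy of the helper above.
  if (Value >= 256 && Value <= 0xffff)
    Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
  else if (Value > 0xffff && Value <= 0xffffff)
    Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
  else if (Value > 0xffffff)
    Value = (Value >> 24) | 0x600;
  return Value;
}

int main() {
  assert(encodeNeonVMOVImmediate(0x000000ab) == 0x0ab); // byte 0
  assert(encodeNeonVMOVImmediate(0x0000ab00) == 0x2ab); // byte 1
  assert(encodeNeonVMOVImmediate(0x00ab0000) == 0x4ab); // byte 2
  assert(encodeNeonVMOVImmediate(0xab000000) == 0x6ab); // byte 3
  assert(encodeNeonVMOVImmediate(0x0000abff) == 0xcab); // 00Xf form
  assert(encodeNeonVMOVImmediate(0x00abffff) == 0xdab); // 0Xff form
  return 0;
}
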
- void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const {
+ void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = CE->getValue();
- assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
- Inst.getOpcode() == ARM::VMOVv16i8) &&
- "All instructions that wants to replicate non-zero byte "
- "always must be replaced with VMOVv8i8 or VMOVv16i8.");
- unsigned B = Value & 0xff;
- B |= 0xe00; // cmode = 0b1110
- Inst.addOperand(MCOperand::createImm(B));
+ unsigned Value = encodeNeonVMOVImmediate(CE->getValue());
+ Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ void addNEONvmovi8ReplicateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addNEONi8ReplicateOperands(Inst, false);
+ }
+
+ void addNEONvmovi16ReplicateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert((Inst.getOpcode() == ARM::VMOVv4i16 ||
+ Inst.getOpcode() == ARM::VMOVv8i16 ||
+ Inst.getOpcode() == ARM::VMVNv4i16 ||
+ Inst.getOpcode() == ARM::VMVNv8i16) &&
+ "All instructions that want to replicate non-zero half-word "
+ "always must be replaced with V{MOV,MVN}v{4,8}i16.");
+ uint64_t Value = CE->getValue();
+ unsigned Elem = Value & 0xffff;
+ if (Elem >= 256)
+ Elem = (Elem >> 8) | 0x200;
+ Inst.addOperand(MCOperand::createImm(Elem));
}
void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- unsigned Value = ~CE->getValue();
- if (Value >= 256 && Value <= 0xffff)
- Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
- else if (Value > 0xffff && Value <= 0xffffff)
- Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
- else if (Value > 0xffffff)
- Value = (Value >> 24) | 0x600;
+ unsigned Value = encodeNeonVMOVImmediate(~CE->getValue());
Inst.addOperand(MCOperand::createImm(Value));
}
+ void addNEONvmovi32ReplicateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert((Inst.getOpcode() == ARM::VMOVv2i32 ||
+ Inst.getOpcode() == ARM::VMOVv4i32 ||
+ Inst.getOpcode() == ARM::VMVNv2i32 ||
+ Inst.getOpcode() == ARM::VMVNv4i32) &&
+ "All instructions that want to replicate non-zero word "
+ "always must be replaced with V{MOV,MVN}v{2,4}i32.");
+ uint64_t Value = CE->getValue();
+ unsigned Elem = encodeNeonVMOVImmediate(Value & 0xffffffff);
+ Inst.addOperand(MCOperand::createImm(Elem));
+ }
+
void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
@@ -3064,6 +3165,15 @@ public:
return Op;
}
+ static std::unique_ptr<ARMOperand>
+ CreateTraceSyncBarrierOpt(ARM_TSB::TraceSyncBOpt Opt, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_TraceSyncBarrierOpt);
+ Op->TSBOpt.Val = Opt;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
SMLoc S) {
auto Op = make_unique<ARMOperand>(k_ProcIFlags);
@@ -3133,6 +3243,9 @@ void ARMOperand::print(raw_ostream &OS) const {
case k_InstSyncBarrierOpt:
OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
break;
+ case k_TraceSyncBarrierOpt:
+ OS << "<ARM_TSB::" << TraceSyncBOptToString(getTraceSyncBarrierOpt()) << ">";
+ break;
case k_Memory:
OS << "<memory "
<< " base:" << Memory.BaseRegNum;
@@ -4122,6 +4235,24 @@ ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy
+ARMAsmParser::parseTraceSyncBarrierOptOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = Parser.getTok().getLoc();
+ const AsmToken &Tok = Parser.getTok();
+
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+
+ if (!Tok.getString().equals_lower("csync"))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat identifier token.
+
+ Operands.push_back(ARMOperand::CreateTraceSyncBarrierOpt(ARM_TSB::CSYNC, S));
+ return MatchOperand_Success;
+}
+
/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
OperandMatchResultTy
ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
@@ -4215,6 +4346,18 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
+
+ if (Tok.is(AsmToken::Integer)) {
+ int64_t Val = Tok.getIntVal();
+ if (Val > 255 || Val < 0) {
+ return MatchOperand_NoMatch;
+ }
+ unsigned SYSmvalue = Val & 0xFF;
+ Parser.Lex();
+ Operands.push_back(ARMOperand::CreateMSRMask(SYSmvalue, S));
+ return MatchOperand_Success;
+ }
+
if (!Tok.is(AsmToken::Identifier))
return MatchOperand_NoMatch;
StringRef Mask = Tok.getString();
@@ -5450,7 +5593,7 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
return false;
}
-/// \brief Given a mnemonic, split out possible predication code and carry
+/// Given a mnemonic, split out possible predication code and carry
/// setting letters to form a canonical mnemonic and flags.
//
// FIXME: Would be nice to autogen this.
@@ -5541,7 +5684,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
return Mnemonic;
}
-/// \brief Given a canonical mnemonic, determine if the instruction ever allows
+/// Given a canonical mnemonic, determine if the instruction ever allows
/// inclusion of carry set or predication code operands.
//
// FIXME: It would be nice to autogen this.
@@ -5585,6 +5728,7 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
Mnemonic != "isb" && Mnemonic != "pld" && Mnemonic != "pli" &&
Mnemonic != "pldw" && Mnemonic != "ldc2" && Mnemonic != "ldc2l" &&
Mnemonic != "stc2" && Mnemonic != "stc2l" &&
+ Mnemonic != "tsb" &&
!Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
} else if (isThumbOne()) {
if (hasV6MOps())
@@ -5595,7 +5739,7 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
CanAcceptPredicationCode = true;
}
-// \brief Some Thumb instructions have two operand forms that are not
+// Some Thumb instructions have two operand forms that are not
// available as three operand, convert to two operand form if possible.
//
// FIXME: We would really like to be able to tablegen'erate this.
@@ -6214,6 +6358,65 @@ bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst,
return false;
}
+bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
+ const OperandVector &Operands,
+ bool Load, bool ARMMode, bool Writeback) {
+ unsigned RtIndex = Load || !Writeback ? 0 : 1;
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(RtIndex).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(RtIndex + 1).getReg());
+
+ if (ARMMode) {
+ // Rt can't be R14.
+ if (Rt == 14)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt can't be R14");
+
+ // Rt must be even-numbered.
+ if ((Rt & 1) == 1)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt must be even-numbered");
+
+ // Rt2 must be Rt + 1.
+ if (Rt2 != Rt + 1) {
+ if (Load)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands must be sequential");
+ else
+ return Error(Operands[3]->getStartLoc(),
+ "source operands must be sequential");
+ }
+
+ // FIXME: Diagnose m == 15
+ // FIXME: Diagnose ldrd with m == t || m == t2.
+ }
+
+ if (!ARMMode && Load) {
+ if (Rt2 == Rt)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands can't be identical");
+ }
+
+ if (Writeback) {
+ unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+
+ if (Rn == Rt || Rn == Rt2) {
+ if (Load)
+ return Error(Operands[3]->getStartLoc(),
+ "base register needs to be different from destination "
+ "registers");
+ else
+ return Error(Operands[3]->getStartLoc(),
+ "source register and base register can't be identical");
+ }
+
+ // FIXME: Diagnose ldrd/strd with writeback and n == 15.
+ // (Except the immediate form of ldrd?)
+ }
+
+ return false;
+}
+
+
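
A toy model of the ARM-mode register rules the new helper enforces (the function name and the plain 0-15 register encodings are illustrative, not part of the patch):

#include <cassert>

// Rt/Rt2/Rn are raw register encodings (0..15), not MCInst operands.
static bool armModeLDRDOk(unsigned Rt, unsigned Rt2, unsigned Rn,
                          bool Writeback) {
  if (Rt == 14)                         // Rt can't be R14 (LR)
    return false;
  if (Rt & 1)                           // Rt must be even-numbered
    return false;
  if (Rt2 != Rt + 1)                    // Rt2 must be Rt + 1
    return false;
  if (Writeback && (Rn == Rt || Rn == Rt2))
    return false;                       // writeback base must differ from Rt/Rt2
  return true;
}

int main() {
  assert(armModeLDRDOk(0, 1, 2, false));   // ldrd r0, r1, [r2]
  assert(!armModeLDRDOk(1, 2, 3, false));  // odd Rt
  assert(!armModeLDRDOk(0, 2, 3, false));  // Rt2 is not Rt + 1
  assert(!armModeLDRDOk(0, 1, 0, true));   // base == Rt with writeback
  return 0;
}
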
// FIXME: We would really like to be able to tablegen'erate this.
bool ARMAsmParser::validateInstruction(MCInst &Inst,
const OperandVector &Operands) {
@@ -6227,7 +6430,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
// The instruction must be predicable.
if (!MCID.isPredicable())
return Error(Loc, "instructions in IT block must be predicable");
- unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
+ ARMCC::CondCodes Cond = ARMCC::CondCodes(
+ Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm());
if (Cond != currentITCond()) {
// Find the condition code Operand to get its SMLoc information.
SMLoc CondLoc;
@@ -6235,9 +6439,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
CondLoc = Operands[I]->getStartLoc();
return Error(CondLoc, "incorrect condition in IT block; got '" +
- StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
- "', but expected '" +
- ARMCondCodeToString(ARMCC::CondCodes(currentITCond())) + "'");
+ StringRef(ARMCondCodeToString(Cond)) +
+ "', but expected '" +
+ ARMCondCodeToString(currentITCond()) + "'");
}
// Check for non-'al' condition codes outside of the IT block.
} else if (isThumbTwo() && MCID.isPredicable() &&
@@ -6259,51 +6463,43 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
const unsigned Opcode = Inst.getOpcode();
switch (Opcode) {
+ case ARM::t2IT: {
+ // Encoding is unpredictable if it ever results in a notional 'NV'
+ // predicate. Since we don't parse 'NV' directly this means an 'AL'
+ // predicate with an "else" mask bit.
+ unsigned Cond = Inst.getOperand(0).getImm();
+ unsigned Mask = Inst.getOperand(1).getImm();
+
+ // Mask hasn't been modified to the IT instruction encoding yet so
+ // conditions only allowing a 't' are a block of 1s starting at bit 3
+ // followed by all 0s. Easiest way is to just list the 4 possibilities.
+ if (Cond == ARMCC::AL && Mask != 8 && Mask != 12 && Mask != 14 &&
+ Mask != 15)
+ return Error(Loc, "unpredictable IT predicate sequence");
+ break;
+ }
case ARM::LDRD:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
+ /*Writeback*/false))
+ return true;
+ break;
case ARM::LDRD_PRE:
- case ARM::LDRD_POST: {
- const unsigned RtReg = Inst.getOperand(0).getReg();
-
- // Rt can't be R14.
- if (RtReg == ARM::LR)
- return Error(Operands[3]->getStartLoc(),
- "Rt can't be R14");
-
- const unsigned Rt = MRI->getEncodingValue(RtReg);
- // Rt must be even-numbered.
- if ((Rt & 1) == 1)
- return Error(Operands[3]->getStartLoc(),
- "Rt must be even-numbered");
-
- // Rt2 must be Rt + 1.
- const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- if (Rt2 != Rt + 1)
- return Error(Operands[3]->getStartLoc(),
- "destination operands must be sequential");
-
- if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
- const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
- // For addressing modes with writeback, the base register needs to be
- // different from the destination registers.
- if (Rn == Rt || Rn == Rt2)
- return Error(Operands[3]->getStartLoc(),
- "base register needs to be different from destination "
- "registers");
- }
-
- return false;
- }
+ case ARM::LDRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
+ /*Writeback*/true))
+ return true;
+ break;
case ARM::t2LDRDi8:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
+ /*Writeback*/false))
+ return true;
+ break;
case ARM::t2LDRD_PRE:
- case ARM::t2LDRD_POST: {
- // Rt2 must be different from Rt.
- unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
- unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- if (Rt2 == Rt)
- return Error(Operands[3]->getStartLoc(),
- "destination operands can't be identical");
- return false;
- }
+ case ARM::t2LDRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
+ /*Writeback*/true))
+ return true;
+ break;
case ARM::t2BXJ: {
const unsigned RmReg = Inst.getOperand(0).getReg();
// Rm = SP is no longer unpredictable in v8-A
@@ -6312,35 +6508,39 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"r13 (SP) is an unpredictable operand to BXJ");
return false;
}
- case ARM::STRD: {
- // Rt2 must be Rt + 1.
- unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
- unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- if (Rt2 != Rt + 1)
- return Error(Operands[3]->getStartLoc(),
- "source operands must be sequential");
- return false;
- }
+ case ARM::STRD:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
+ /*Writeback*/false))
+ return true;
+ break;
case ARM::STRD_PRE:
- case ARM::STRD_POST: {
- // Rt2 must be Rt + 1.
- unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg());
- if (Rt2 != Rt + 1)
- return Error(Operands[3]->getStartLoc(),
- "source operands must be sequential");
- return false;
- }
+ case ARM::STRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
+ /*Writeback*/true))
+ return true;
+ break;
+ case ARM::t2STRD_PRE:
+ case ARM::t2STRD_POST:
+ if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/false,
+ /*Writeback*/true))
+ return true;
+ break;
case ARM::STR_PRE_IMM:
case ARM::STR_PRE_REG:
+ case ARM::t2STR_PRE:
case ARM::STR_POST_IMM:
case ARM::STR_POST_REG:
+ case ARM::t2STR_POST:
case ARM::STRH_PRE:
+ case ARM::t2STRH_PRE:
case ARM::STRH_POST:
+ case ARM::t2STRH_POST:
case ARM::STRB_PRE_IMM:
case ARM::STRB_PRE_REG:
+ case ARM::t2STRB_PRE:
case ARM::STRB_POST_IMM:
- case ARM::STRB_POST_REG: {
+ case ARM::STRB_POST_REG:
+ case ARM::t2STRB_POST: {
// Rt must be different from Rn.
const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
@@ -6352,18 +6552,28 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
}
case ARM::LDR_PRE_IMM:
case ARM::LDR_PRE_REG:
+ case ARM::t2LDR_PRE:
case ARM::LDR_POST_IMM:
case ARM::LDR_POST_REG:
+ case ARM::t2LDR_POST:
case ARM::LDRH_PRE:
+ case ARM::t2LDRH_PRE:
case ARM::LDRH_POST:
+ case ARM::t2LDRH_POST:
case ARM::LDRSH_PRE:
+ case ARM::t2LDRSH_PRE:
case ARM::LDRSH_POST:
+ case ARM::t2LDRSH_POST:
case ARM::LDRB_PRE_IMM:
case ARM::LDRB_PRE_REG:
+ case ARM::t2LDRB_PRE:
case ARM::LDRB_POST_IMM:
case ARM::LDRB_POST_REG:
+ case ARM::t2LDRB_POST:
case ARM::LDRSB_PRE:
- case ARM::LDRSB_POST: {
+ case ARM::t2LDRSB_PRE:
+ case ARM::LDRSB_POST:
+ case ARM::t2LDRSB_POST: {
// Rt must be different from Rn.
const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
@@ -6374,7 +6584,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
return false;
}
case ARM::SBFX:
- case ARM::UBFX: {
+ case ARM::t2SBFX:
+ case ARM::UBFX:
+ case ARM::t2UBFX: {
// Width must be in range [1, 32-lsb].
unsigned LSB = Inst.getOperand(2).getImm();
unsigned Widthm1 = Inst.getOperand(3).getImm();
@@ -6592,19 +6804,40 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
break;
}
case ARM::HINT:
- case ARM::t2HINT:
- if (hasRAS()) {
- // ESB is not predicable (pred must be AL)
- unsigned Imm8 = Inst.getOperand(0).getImm();
- unsigned Pred = Inst.getOperand(1).getImm();
- if (Imm8 == 0x10 && Pred != ARMCC::AL)
- return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
- "predicable, but condition "
- "code specified");
- }
- // Without the RAS extension, this behaves as any other unallocated hint.
+ case ARM::t2HINT: {
+ unsigned Imm8 = Inst.getOperand(0).getImm();
+ unsigned Pred = Inst.getOperand(1).getImm();
+ // ESB is not predicable (pred must be AL). Without the RAS extension, this
+ // behaves as any other unallocated hint.
+ if (Imm8 == 0x10 && Pred != ARMCC::AL && hasRAS())
+ return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
+ "predicable, but condition "
+ "code specified");
+ if (Imm8 == 0x14 && Pred != ARMCC::AL)
+ return Error(Operands[1]->getStartLoc(), "instruction 'csdb' is not "
+ "predicable, but condition "
+ "code specified");
+ break;
+ }
+ case ARM::VMOVRRS: {
+ // Source registers must be sequential.
+ const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+ const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+ if (Sm1 != Sm + 1)
+ return Error(Operands[5]->getStartLoc(),
+ "source operands must be sequential");
break;
}
+ case ARM::VMOVSRR: {
+ // Destination registers must be sequential.
+ const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ if (Sm1 != Sm + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands must be sequential");
+ break;
+ }
+ }
return false;
}
@@ -10173,10 +10406,11 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
Message.Message = "too many operands for instruction";
} else {
Message.Message = "invalid operand for instruction";
- DEBUG(dbgs() << "Missing diagnostic string for operand class " <<
- getMatchClassName((MatchClassKind)I.getOperandClass())
- << I.getOperandClass() << ", error " << I.getOperandError()
- << ", opcode " << MII.getName(I.getOpcode()) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Missing diagnostic string for operand class "
+ << getMatchClassName((MatchClassKind)I.getOperandClass())
+ << I.getOperandClass() << ", error " << I.getOperandError()
+ << ", opcode " << MII.getName(I.getOpcode()) << "\n");
}
NearMissesOut.emplace_back(Message);
break;
@@ -10203,6 +10437,8 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
if (!isThumb() && (MissingFeatures & Feature_IsThumb2) &&
(MissingFeatures & ~(Feature_IsThumb2 | Feature_IsThumb)))
break;
+ if (isMClass() && (MissingFeatures & Feature_HasNEON))
+ break;
NearMissMessage Message;
Message.Loc = IDLoc;
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 53c635877675..4733cf49827e 100644
--- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -158,6 +158,8 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -657,6 +659,8 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
unsigned CC;
CC = ITBlock.getITCC();
+ if (CC == 0xF)
+ CC = ARMCC::AL;
if (ITBlock.instrInITBlock())
ITBlock.advanceITState();
@@ -727,10 +731,13 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// code and mask operands so that we can apply them correctly
// to the subsequent instructions.
if (MI.getOpcode() == ARM::t2IT) {
-
unsigned Firstcond = MI.getOperand(0).getImm();
unsigned Mask = MI.getOperand(1).getImm();
ITBlock.setITState(Firstcond, Mask);
+
+ // An IT instruction that would give a 'NV' predicate is unpredictable.
+ if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask))
+ CS << "unpredictable IT predicate sequence";
}
return Result;
@@ -996,6 +1003,11 @@ static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
static const uint16_t DPRDecoderTable[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
@@ -4142,7 +4154,6 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
case 0x8a: // msplim_ns
case 0x8b: // psplim_ns
case 0x91: // basepri_ns
- case 0x92: // basepri_max_ns
case 0x93: // faultmask_ns
if (!(FeatureBits[ARM::HasV8MMainlineOps]))
return MCDisassembler::Fail;
@@ -4158,7 +4169,9 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
return MCDisassembler::Fail;
break;
default:
- return MCDisassembler::Fail;
+ // Architecturally defined as unpredictable
+ S = MCDisassembler::SoftFail;
+ break;
}
if (Inst.getOpcode() == ARM::t2MSR_M) {
@@ -4198,15 +4211,8 @@ static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val,
// The table of encodings for these banked registers comes from B9.2.3 of the
// ARM ARM. There are patterns, but nothing regular enough to make this logic
// neater. So by fiat, these values are UNPREDICTABLE:
- if (!R) {
- if (SysM == 0x7 || SysM == 0xf || SysM == 0x18 || SysM == 0x19 ||
- SysM == 0x1a || SysM == 0x1b)
- return MCDisassembler::SoftFail;
- } else {
- if (SysM != 0xe && SysM != 0x10 && SysM != 0x12 && SysM != 0x14 &&
- SysM != 0x16 && SysM != 0x1c && SysM != 0x1e)
- return MCDisassembler::SoftFail;
- }
+ if (!ARMBankedReg::lookupBankedRegByEncoding((R << 5) | SysM))
+ return MCDisassembler::Fail;
Inst.addOperand(MCOperand::createImm(Val));
return MCDisassembler::Success;
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 4fc67a4f6eb5..75ed40c18fa2 100644
--- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -13,8 +13,6 @@
#include "ARMInstPrinter.h"
#include "Utils/ARMBaseInfo.h"
-#include "ARMBaseRegisterInfo.h"
-#include "ARMBaseRegisterInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -271,6 +269,10 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
break;
}
+ case ARM::TSB:
+ case ARM::t2TSB:
+ O << "\ttsb\tcsync";
+ return;
}
if (!printAliasInstr(MI, STI, O))
@@ -698,6 +700,13 @@ void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
O << ARM_ISB::InstSyncBOptToString(val);
}
+void ARMInstPrinter::printTraceSyncBOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned val = MI->getOperand(OpNum).getImm();
+ O << ARM_TSB::TraceSyncBOptToString(val);
+}
+
void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -825,7 +834,8 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
return;
}
- llvm_unreachable("Unexpected mask value!");
+ O << SYSm;
+
return;
}
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 7dc311229cca..afc8515136bc 100644
--- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -94,6 +94,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printInstSyncBOption(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printTraceSyncBOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printShiftImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index ff507ab7162f..f524a0081301 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -31,6 +31,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/TargetParser.h"
@@ -155,7 +156,8 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+ return (Endian == support::little ? InfosLE
+ : InfosBE)[Kind - FirstTargetFixupKind];
}
void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
@@ -171,7 +173,8 @@ void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
}
}
-unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
+unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op,
+ const MCSubtargetInfo &STI) const {
bool HasThumb2 = STI.getFeatureBits()[ARM::FeatureThumb2];
bool HasV8MBaselineOps = STI.getFeatureBits()[ARM::HasV8MBaselineOps];
@@ -193,8 +196,9 @@ unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
}
}
-bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
- if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode())
+bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ if (getRelaxedOpcode(Inst.getOpcode(), STI) != Inst.getOpcode())
return true;
return false;
}
@@ -239,7 +243,7 @@ const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
}
case ARM::fixup_arm_thumb_cb: {
// If we have a Thumb CBZ or CBNZ instruction and its target is the next
- // instruction it is is actually out of range for the instruction.
+ // instruction it is actually out of range for the instruction.
// It will be changed to a NOP.
int64_t Offset = (Value & ~1);
if (Offset == 2)
@@ -261,7 +265,7 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI,
MCInst &Res) const {
- unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+ unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode(), STI);
// Sanity check w/ diagnostic if we get here w/ a bogus instruction.
if (RelaxedOp == Inst.getOpcode()) {
@@ -289,7 +293,7 @@ void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
Res.setOpcode(RelaxedOp);
}
-bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
@@ -299,9 +303,9 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
uint64_t NumNops = Count / 2;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write16(nopEncoding);
+ support::endian::write(OS, nopEncoding, Endian);
if (Count & 1)
- OW->write8(0);
+ OS << '\0';
return true;
}
// ARM mode
@@ -309,21 +313,20 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write32(nopEncoding);
+ support::endian::write(OS, nopEncoding, Endian);
// FIXME: should this function return false when unable to write exactly
// 'Count' bytes with NOP encodings?
switch (Count % 4) {
default:
break; // No leftover bytes to write
case 1:
- OW->write8(0);
+ OS << '\0';
break;
case 2:
- OW->write16(0);
+ OS.write("\0\0", 2);
break;
case 3:
- OW->write16(0);
- OW->write8(0xa0);
+ OS.write("\0\0\xa0", 3);
break;
}
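
A small self-contained model of the ARM-mode padding path above, assuming little-endian output (the real code writes through support::endian with the backend's Endian); not part of the patch:

#include <cassert>
#include <cstdint>
#include <string>

static std::string armPad(uint64_t Count) {
  const uint32_t Nop = 0xe1a00000;      // ARMv4 NOP (mov r0, r0), as above
  std::string Out;
  for (uint64_t i = 0; i != Count / 4; ++i)
    for (int b = 0; b != 4; ++b)
      Out.push_back(char((Nop >> (8 * b)) & 0xff));  // little-endian bytes
  switch (Count % 4) {                  // 1-3 leftover bytes, as in the patch
  case 1: Out.append("\0", 1); break;
  case 2: Out.append("\0\0", 2); break;
  case 3: Out.append("\0\0\xa0", 3); break;
  }
  return Out;
}

int main() {
  assert(armPad(8).size() == 8);   // two full NOPs
  assert(armPad(7).size() == 7);   // one NOP plus the 3-byte tail
  return 0;
}
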
@@ -360,7 +363,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target, uint64_t Value,
bool IsResolved, MCContext &Ctx,
- bool IsLittleEndian) const {
+ const MCSubtargetInfo* STI) const {
unsigned Kind = Fixup.getKind();
// MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT
@@ -389,7 +392,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
case FK_SecRel_4:
return Value;
case ARM::fixup_arm_movt_hi16:
- if (IsResolved || !STI.getTargetTriple().isOSBinFormatELF())
+ assert(STI != nullptr);
+ if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF())
Value >>= 16;
LLVM_FALLTHROUGH;
case ARM::fixup_arm_movw_lo16: {
@@ -401,7 +405,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
return Value;
}
case ARM::fixup_t2_movt_hi16:
- if (IsResolved || !STI.getTargetTriple().isOSBinFormatELF())
+ assert(STI != nullptr);
+ if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF())
Value >>= 16;
LLVM_FALLTHROUGH;
case ARM::fixup_t2_movw_lo16: {
@@ -414,7 +419,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// inst{14-12} = Mid3;
// inst{7-0} = Lo8;
Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
}
case ARM::fixup_arm_ldst_pcrel_12:
// ARM PC-relative values are offset by 8.
@@ -437,7 +442,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// Same addressing mode as fixup_arm_pcrel_10,
// but with 16-bit halfwords swapped.
if (Kind == ARM::fixup_t2_ldst_pcrel_12)
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
return Value;
}
@@ -470,7 +475,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
out |= (Value & 0x700) << 4;
out |= (Value & 0x0FF);
- return swapHalfWords(out, IsLittleEndian);
+ return swapHalfWords(out, Endian == support::little);
}
case ARM::fixup_arm_condbranch:
@@ -487,6 +492,11 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
return 0xffffff & ((Value - 8) >> 2);
case ARM::fixup_t2_uncondbranch: {
Value = Value - 4;
+ if (!isInt<25>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
+ return 0;
+ }
+
Value >>= 1; // Low bit is not encoded.
uint32_t out = 0;
@@ -502,10 +512,15 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
out |= (Value & 0x1FF800) << 5; // imm6 field
out |= (Value & 0x0007FF); // imm11 field
- return swapHalfWords(out, IsLittleEndian);
+ return swapHalfWords(out, Endian == support::little);
}
case ARM::fixup_t2_condbranch: {
Value = Value - 4;
+ if (!isInt<21>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
+ return 0;
+ }
+
Value >>= 1; // Low bit is not encoded.
uint64_t out = 0;
@@ -515,12 +530,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
out |= (Value & 0x1F800) << 5; // imm6 field
out |= (Value & 0x007FF); // imm11 field
- return swapHalfWords(out, IsLittleEndian);
+ return swapHalfWords(out, Endian == support::little);
}
case ARM::fixup_arm_thumb_bl: {
- // FIXME: We get both thumb1 and thumb2 in here, so we can only check for
- // the less strict thumb2 value.
- if (!isInt<26>(Value - 4)) {
+ if (!isInt<25>(Value - 4) ||
+ (!STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps] &&
+ !STI->getFeatureBits()[ARM::HasV6MOps] &&
+ !isInt<23>(Value - 4))) {
Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
return 0;
}
@@ -549,7 +566,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
(uint16_t)imm11Bits);
- return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+ return joinHalfWords(FirstHalf, SecondHalf, Endian == support::little);
}
case ARM::fixup_arm_thumb_blx: {
// The value doesn't encode the low two bits (always zero) and is offset by
@@ -585,13 +602,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
((uint16_t)imm10LBits) << 1);
- return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+ return joinHalfWords(FirstHalf, SecondHalf, Endian == support::little);
}
case ARM::fixup_thumb_adr_pcrel_10:
case ARM::fixup_arm_thumb_cp:
// On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
// could have an error on our hands.
- if (!STI.getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
+ assert(STI != nullptr);
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
@@ -615,8 +633,9 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
}
case ARM::fixup_arm_thumb_br:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (!STI.getFeatureBits()[ARM::FeatureThumb2] &&
- !STI.getFeatureBits()[ARM::HasV8MBaselineOps]) {
+ assert(STI != nullptr);
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
@@ -626,7 +645,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
return ((Value - 4) >> 1) & 0x7ff;
case ARM::fixup_arm_thumb_bcc:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (!STI.getFeatureBits()[ARM::FeatureThumb2]) {
+ assert(STI != nullptr);
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
@@ -673,7 +693,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
// swapped.
if (Kind == ARM::fixup_t2_pcrel_10)
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
return Value;
}
@@ -704,7 +724,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// Same addressing mode as fixup_arm_pcrel_9, but with 16-bit halfwords
// swapped.
if (Kind == ARM::fixup_t2_pcrel_9)
- return swapHalfWords(Value, IsLittleEndian);
+ return swapHalfWords(Value, Endian == support::little);
return Value;
}
@@ -730,7 +750,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
EncValue |= (Value & 0x800) << 15;
EncValue |= (Value & 0x700) << 4;
EncValue |= (Value & 0xff);
- return swapHalfWords(EncValue, IsLittleEndian);
+ return swapHalfWords(EncValue, Endian == support::little);
}
}
}
@@ -755,7 +775,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
// Create relocations for unconditional branches to function symbols with
// different execution mode in ELF binaries.
if (Sym && Sym->isELF()) {
- unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType();
+ unsigned Type = cast<MCSymbolELF>(Sym)->getType();
if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) {
if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch))
return true;
@@ -882,11 +902,11 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo* STI) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
MCContext &Ctx = Asm.getContext();
- Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx,
- IsLittleEndian);
+ Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, STI);
if (!Value)
return; // Doesn't change encoding.
@@ -895,7 +915,7 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// Used to point to big endian bytes.
unsigned FullSizeBytes;
- if (!IsLittleEndian) {
+ if (Endian == support::big) {
FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
@@ -905,14 +925,14 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// the fixup value. The Value has been "split up" into the appropriate
// bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittleEndian ? i : (FullSizeBytes - 1 - i);
+ unsigned Idx = Endian == support::little ? i : (FullSizeBytes - 1 - i);
Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
namespace CU {
-/// \brief Compact unwind encoding values.
+/// Compact unwind encoding values.
enum CompactUnwindEncodings {
UNWIND_ARM_MODE_MASK = 0x0F000000,
UNWIND_ARM_MODE_FRAME = 0x01000000,
@@ -1153,11 +1173,11 @@ static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) {
}
}
-MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
- const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI,
- const MCTargetOptions &Options,
- bool isLittle) {
+static MCAsmBackend *createARMAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options,
+ support::endianness Endian) {
const Triple &TheTriple = STI.getTargetTriple();
switch (TheTriple.getObjectFormat()) {
default:
@@ -1172,7 +1192,7 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
case Triple::ELF:
assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- return new ARMAsmBackendELF(T, STI, OSABI, isLittle);
+ return new ARMAsmBackendELF(T, STI, OSABI, Endian);
}
}
@@ -1180,26 +1200,12 @@ MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
- return createARMAsmBackend(T, STI, MRI, Options, true);
+ return createARMAsmBackend(T, STI, MRI, Options, support::little);
}
MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
- return createARMAsmBackend(T, STI, MRI, Options, false);
-}
-
-MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T,
- const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI,
- const MCTargetOptions &Options) {
- return createARMAsmBackend(T, STI, MRI, Options, true);
-}
-
-MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T,
- const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI,
- const MCTargetOptions &Options) {
- return createARMAsmBackend(T, STI, MRI, Options, false);
+ return createARMAsmBackend(T, STI, MRI, Options, support::big);
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index c8527e5cca20..88c476bf65f4 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -19,19 +19,23 @@
namespace llvm {
class ARMAsmBackend : public MCAsmBackend {
+ // The STI from the target triple the MCAsmBackend was instantiated with;
+ // note that MCFragments may have a different local STI that should be
+ // used in preference.
const MCSubtargetInfo &STI;
bool isThumbMode; // Currently emitting Thumb code.
- bool IsLittleEndian; // Big or little endian.
public:
- ARMAsmBackend(const Target &T, const MCSubtargetInfo &STI, bool IsLittle)
- : MCAsmBackend(), STI(STI),
- isThumbMode(STI.getTargetTriple().isThumb()),
- IsLittleEndian(IsLittle) {}
+ ARMAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ support::endianness Endian)
+ : MCAsmBackend(Endian), STI(STI),
+ isThumbMode(STI.getTargetTriple().isThumb()) {}
unsigned getNumFixupKinds() const override {
return ARM::NumTargetFixupKinds;
}
+ // FIXME: this should be calculated per fragment as the STI may be
+ // different.
bool hasNOP() const { return STI.getFeatureBits()[ARM::HasV6T2Ops]; }
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -42,15 +46,17 @@ public:
unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, uint64_t Value,
bool IsResolved, MCContext &Ctx,
- bool IsLittleEndian) const;
+ const MCSubtargetInfo *STI) const;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- unsigned getRelaxedOpcode(unsigned Op) const;
+ unsigned getRelaxedOpcode(unsigned Op, const MCSubtargetInfo &STI) const;
- bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
const char *reasonForFixupRelaxation(const MCFixup &Fixup,
uint64_t Value) const;
@@ -62,14 +68,13 @@ public:
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void handleAssemblerFlag(MCAssemblerFlag Flag) override;
unsigned getPointerSize() const { return 4; }
bool isThumb() const { return isThumbMode; }
void setIsThumb(bool it) { isThumbMode = it; }
- bool isLittle() const { return IsLittleEndian; }
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index 19e3fdb72046..de1bfaf203e4 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -21,12 +21,11 @@ public:
const MachO::CPUSubTypeARM Subtype;
ARMAsmBackendDarwin(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
- : ARMAsmBackend(T, STI, /* IsLittleEndian */ true), MRI(MRI),
- Subtype(st) {}
+ : ARMAsmBackend(T, STI, support::little), MRI(MRI), Subtype(st) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createARMMachObjectWriter(/*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
Subtype);
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
index 361ea3040847..86a583b19cf7 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -21,12 +21,12 @@ class ARMAsmBackendELF : public ARMAsmBackend {
public:
uint8_t OSABI;
ARMAsmBackendELF(const Target &T, const MCSubtargetInfo &STI, uint8_t OSABI,
- bool IsLittle)
- : ARMAsmBackend(T, STI, IsLittle), OSABI(OSABI) {}
+ support::endianness Endian)
+ : ARMAsmBackend(T, STI, Endian), OSABI(OSABI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createARMELFObjectWriter(OS, OSABI, isLittle());
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createARMELFObjectWriter(OSABI);
}
};
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 0ac6d4270aac..553922d20f43 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -18,10 +18,10 @@ namespace {
class ARMAsmBackendWinCOFF : public ARMAsmBackend {
public:
ARMAsmBackendWinCOFF(const Target &T, const MCSubtargetInfo &STI)
- : ARMAsmBackend(T, STI, true) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
+ : ARMAsmBackend(T, STI, support::little) {}
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createARMWinCOFFObjectWriter(/*Is64Bit=*/false);
}
};
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index c4480e3da505..b918006fe9e3 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -98,6 +98,20 @@ namespace ARM_MB {
}
} // namespace ARM_MB
+namespace ARM_TSB {
+ enum TraceSyncBOpt {
+ CSYNC = 0
+ };
+
+ inline static const char *TraceSyncBOptToString(unsigned val) {
+ switch (val) {
+ default:
+ llvm_unreachable("Unknown trace synchronization barrier operation");
+ case CSYNC: return "csync";
+ }
+ }
+} // namespace ARM_TSB
+
namespace ARM_ISB {
enum InstSyncBOpt {
RESERVED_0 = 0,
@@ -186,7 +200,8 @@ namespace ARMII {
AddrModeT2_so = 13,
AddrModeT2_pc = 14, // +/- i12 for pc relative data
AddrModeT2_i8s4 = 15, // i8 * 4
- AddrMode_i12 = 16
+ AddrMode_i12 = 16,
+ AddrMode5FP16 = 17 // i8 * 2
};
inline static const char *AddrModeToString(AddrMode addrmode) {
@@ -197,6 +212,7 @@ namespace ARMII {
case AddrMode3: return "AddrMode3";
case AddrMode4: return "AddrMode4";
case AddrMode5: return "AddrMode5";
+ case AddrMode5FP16: return "AddrMode5FP16";
case AddrMode6: return "AddrMode6";
case AddrModeT1_1: return "AddrModeT1_1";
case AddrModeT1_2: return "AddrModeT1_2";
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 3cd52fe1e7eb..dfa339091a7b 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -236,9 +236,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian) {
- return createELFObjectWriter(llvm::make_unique<ARMELFObjectWriter>(OSABI), OS,
- IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createARMELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<ARMELFObjectWriter>(OSABI);
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index d465da1a7bb1..3373d691db50 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -33,6 +33,7 @@
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
@@ -441,9 +442,9 @@ public:
friend class ARMTargetELFStreamer;
ARMELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool IsThumb)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
IsThumb(IsThumb) {
EHReset();
}
@@ -512,9 +513,11 @@ public:
assert(IsThumb);
EmitThumbMappingSymbol();
+ // Thumb wide instructions are emitted as a pair of 16-bit words of the
+ // appropriate endianness.
for (unsigned II = 0, IE = Size; II != IE; II = II + 2) {
- const unsigned I0 = LittleEndian ? II + 0 : (Size - II - 1);
- const unsigned I1 = LittleEndian ? II + 1 : (Size - II - 2);
+ const unsigned I0 = LittleEndian ? II + 0 : II + 1;
+ const unsigned I1 = LittleEndian ? II + 1 : II + 0;
Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT);
Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT);
}
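For reference, a self-contained sketch of the byte order the patched loop produces; the instruction value 0xF3AF8000 (a Thumb2 NOP.W encoding) is only sample data, everything else restates the loop above:

    #include <climits>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t Inst = 0xF3AF8000; // hypothetical 32-bit Thumb instruction
      const unsigned Size = 4;
      for (bool LittleEndian : {true, false}) {
        uint8_t Buffer[4];
        for (unsigned II = 0; II != Size; II += 2) {
          const unsigned I0 = LittleEndian ? II + 0 : II + 1;
          const unsigned I1 = LittleEndian ? II + 1 : II + 0;
          Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT);
          Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT);
        }
        // Prints "LE: af f3 00 80" and "BE: f3 af 80 00": each 16-bit half
        // stays together, only the bytes within a half are swapped.
        std::printf("%s: %02x %02x %02x %02x\n", LittleEndian ? "LE" : "BE",
                    unsigned(Buffer[0]), unsigned(Buffer[1]),
                    unsigned(Buffer[2]), unsigned(Buffer[3]));
      }
      return 0;
    }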
@@ -856,6 +859,8 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::ArchKind::ARMV8A:
case ARM::ArchKind::ARMV8_1A:
case ARM::ArchKind::ARMV8_2A:
+ case ARM::ArchKind::ARMV8_3A:
+ case ARM::ArchKind::ARMV8_4A:
setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
@@ -1066,7 +1071,7 @@ void ARMTargetELFStreamer::finishAttributeSection() {
if (Contents.empty())
return;
- std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+ llvm::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
ARMELFStreamer &Streamer = getStreamer();
@@ -1492,10 +1497,10 @@ MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
MCELFStreamer *createARMELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll, bool IsThumb) {
- ARMELFStreamer *S = new ARMELFStreamer(Context, std::move(TAB), OS,
+ ARMELFStreamer *S = new ARMELFStreamer(Context, std::move(TAB), std::move(OW),
std::move(Emitter), IsThumb);
// FIXME: This should eventually end up somewhere else where more
// intelligent flag decisions can be made. For now we are just maintaining
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index f1f35f409900..0dab789505d5 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1520,7 +1520,7 @@ unsigned ARMMCCodeEmitter::
getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
+ // 10 bits. lower 5 bits are the lsb of the mask, high five bits are the
// msb of the mask.
const MCOperand &MO = MI.getOperand(Op);
uint32_t v = ~MO.getImm();
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index ae5bc723ee5f..46434007a854 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -21,6 +21,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -140,17 +141,21 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) {
ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str();
if (TT.isThumb()) {
- if (ARMArchFeature.empty())
- ARMArchFeature = "+thumb-mode,+v4t";
- else
- ARMArchFeature += ",+thumb-mode,+v4t";
+ if (!ARMArchFeature.empty())
+ ARMArchFeature += ",";
+ ARMArchFeature += "+thumb-mode,+v4t";
}
if (TT.isOSNaCl()) {
- if (ARMArchFeature.empty())
- ARMArchFeature = "+nacl-trap";
- else
- ARMArchFeature += ",+nacl-trap";
+ if (!ARMArchFeature.empty())
+ ARMArchFeature += ",";
+ ARMArchFeature += "+nacl-trap";
+ }
+
+ if (TT.isOSWindows()) {
+ if (!ARMArchFeature.empty())
+ ARMArchFeature += ",";
+ ARMArchFeature += "+noarm";
}
return ARMArchFeature;
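The three feature blocks above now share one append-with-separator pattern; a small sketch of that pattern (the feature names come from the hunk, the helper itself is an illustrative stand-in):

    #include <iostream>
    #include <string>

    // Appends a feature clause, adding the "," separator only when the string
    // is non-empty, as the refactored ParseARMTriple does for ARMArchFeature.
    static void appendFeature(std::string &Features, const std::string &F) {
      if (!Features.empty())
        Features += ",";
      Features += F;
    }

    int main() {
      std::string Features;                        // arch part assumed empty
      appendFeature(Features, "+thumb-mode,+v4t"); // Thumb triple
      appendFeature(Features, "+noarm");           // Windows triple
      std::cout << Features << "\n";               // "+thumb-mode,+v4t,+noarm"
    }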
@@ -201,21 +206,21 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
return createARMELFStreamer(
- Ctx, std::move(MAB), OS, std::move(Emitter), false,
+ Ctx, std::move(MAB), std::move(OW), std::move(Emitter), false,
(T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb));
}
static MCStreamer *
createARMMachOStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
bool DWARFMustBeAtTheEnd) {
- return createMachOStreamer(Ctx, std::move(MAB), OS, std::move(Emitter), false,
- DWARFMustBeAtTheEnd);
+ return createMachOStreamer(Ctx, std::move(MAB), std::move(OW),
+ std::move(Emitter), false, DWARFMustBeAtTheEnd);
}
static MCInstPrinter *createARMMCInstPrinter(const Triple &T,
@@ -338,19 +343,12 @@ extern "C" void LLVMInitializeARMTargetMC() {
for (Target *T : {&getTheThumbLETarget(), &getTheThumbBETarget()})
TargetRegistry::RegisterMCInstrAnalysis(*T, createThumbMCInstrAnalysis);
- // Register the MC Code Emitter
- for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()})
+ for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()}) {
TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
- for (Target *T : {&getTheARMBETarget(), &getTheThumbBETarget()})
+ TargetRegistry::RegisterMCAsmBackend(*T, createARMLEAsmBackend);
+ }
+ for (Target *T : {&getTheARMBETarget(), &getTheThumbBETarget()}) {
TargetRegistry::RegisterMCCodeEmitter(*T, createARMBEMCCodeEmitter);
-
- // Register the asm backend.
- TargetRegistry::RegisterMCAsmBackend(getTheARMLETarget(),
- createARMLEAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(getTheARMBETarget(),
- createARMBEAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(getTheThumbLETarget(),
- createThumbLEAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(getTheThumbBETarget(),
- createThumbBEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(*T, createARMBEAsmBackend);
+ }
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index df9874c78d07..3ee004592ac6 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -25,6 +25,7 @@ class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCInstPrinter;
+class MCObjectTargetWriter;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
@@ -68,11 +69,6 @@ MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
-MCAsmBackend *createARMAsmBackend(const Target &T, const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI,
- const MCTargetOptions &Options,
- bool IsLittleEndian);
-
MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
@@ -81,39 +77,26 @@ MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-MCAsmBackend *createThumbLEAsmBackend(const Target &T,
- const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI,
- const MCTargetOptions &Options);
-
-MCAsmBackend *createThumbBEAsmBackend(const Target &T,
- const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI,
- const MCTargetOptions &Options);
-
// Construct a PE/COFF machine code streamer which will generate a PE/COFF
// object file.
MCStreamer *createARMWinCOFFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll,
bool IncrementalLinkerCompatible);
/// Construct an ELF Mach-O object writer.
-std::unique_ptr<MCObjectWriter> createARMELFObjectWriter(raw_pwrite_stream &OS,
- uint8_t OSABI,
- bool IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter> createARMELFObjectWriter(uint8_t OSABI);
/// Construct an ARM Mach-O object writer.
-std::unique_ptr<MCObjectWriter> createARMMachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype);
/// Construct an ARM PE/COFF object writer.
-std::unique_ptr<MCObjectWriter>
-createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+createARMWinCOFFObjectWriter(bool Is64Bit);
/// Construct ARM Mach-O relocation info.
MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 521ae5337e7a..4b4956e914f2 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -484,10 +484,8 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createARMMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType, uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 5e09b126f43f..8ae713b7b489 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -91,10 +91,9 @@ bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
namespace llvm {
-std::unique_ptr<MCObjectWriter>
-createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit) {
- auto MOTW = llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
- return createWinCOFFObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter>
+createARMWinCOFFObjectWriter(bool Is64Bit) {
+ return llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
}
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index a2424e1abab3..32cb3dcdcad8 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -10,6 +10,7 @@
#include "ARMMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
using namespace llvm;
@@ -18,8 +19,9 @@ namespace {
class ARMWinCOFFStreamer : public MCWinCOFFStreamer {
public:
ARMWinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
- std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS)
- : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+ std::unique_ptr<MCCodeEmitter> CE,
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitThumbFunc(MCSymbol *Symbol) override;
@@ -48,10 +50,11 @@ void ARMWinCOFFStreamer::FinishImpl() {
MCStreamer *llvm::createARMWinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> &&Emitter,
- bool RelaxAll, bool IncrementalLinkerCompatible) {
- auto *S =
- new ARMWinCOFFStreamer(Context, std::move(MAB), std::move(Emitter), OS);
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ auto *S = new ARMWinCOFFStreamer(Context, std::move(MAB), std::move(Emitter),
+ std::move(OW));
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
}
diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
index 153e7b1e2197..637e4a44c428 100644
--- a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -309,17 +309,17 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
}
MIB.addImm(Pred).addReg(PredReg);
- DEBUG({
- dbgs() << "Expanding: " << *MI;
- dbgs() << " to:\n";
- MachineBasicBlock::iterator MII = MI;
- MII = std::prev(MII);
- MachineInstr &MI2 = *MII;
- MII = std::prev(MII);
- MachineInstr &MI1 = *MII;
- dbgs() << " " << MI1;
- dbgs() << " " << MI2;
- });
+ LLVM_DEBUG({
+ dbgs() << "Expanding: " << *MI;
+ dbgs() << " to:\n";
+ MachineBasicBlock::iterator MII = MI;
+ MII = std::prev(MII);
+ MachineInstr &MI2 = *MII;
+ MII = std::prev(MII);
+ MachineInstr &MI1 = *MII;
+ dbgs() << " " << MI1;
+ dbgs() << " " << MI2;
+ });
MI->eraseFromParent();
++NumExpand;
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index ba00b3d79da9..a65e22fd86e8 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -611,6 +611,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
unsigned TemporaryReg = 0;
BitVector PopFriendly =
TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID));
+ // R7 may be used as a frame pointer and is hence marked as not generally
+ // allocatable; however, there's no reason not to use it as a temporary for
+ // restoring LR.
+ if (STI.useR7AsFramePointer())
+ PopFriendly.set(ARM::R7);
+
assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
// Rebuild the GPRs from the high registers because they are removed
// form the GPR reg class for thumb1.
@@ -622,17 +628,20 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
GPRsNoLRSP.reset(ARM::PC);
findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg, TemporaryReg);
- // If we couldn't find a pop-friendly register, restore LR before popping the
- // other callee-saved registers, so we can use one of them as a temporary.
+ // If we couldn't find a pop-friendly register, try restoring LR before
+ // popping the other callee-saved registers, so we could use one of them as a
+ // temporary.
bool UseLDRSP = false;
if (!PopReg && MBBI != MBB.begin()) {
auto PrevMBBI = MBBI;
PrevMBBI--;
if (PrevMBBI->getOpcode() == ARM::tPOP) {
- MBBI = PrevMBBI;
- UsedRegs.stepBackward(*MBBI);
+ UsedRegs.stepBackward(*PrevMBBI);
findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg, TemporaryReg);
- UseLDRSP = true;
+ if (PopReg) {
+ MBBI = PrevMBBI;
+ UseLDRSP = true;
+ }
}
}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index 05c98aab6f27..11aa285fc939 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -109,11 +109,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert((RC == &ARM::tGPRRegClass ||
+ assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
(TargetRegisterInfo::isPhysicalRegister(DestReg) &&
isARMLowRegister(DestReg))) && "Unknown regclass!");
- if (RC == &ARM::tGPRRegClass ||
+ if (RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
(TargetRegisterInfo::isPhysicalRegister(DestReg) &&
isARMLowRegister(DestReg))) {
DebugLoc DL;
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 04bdd91b53e6..e0a5f7f04fa9 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -183,7 +183,7 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
// If not, then there is nothing to be gained by moving the copy.
MachineBasicBlock::iterator I = MI; ++I;
MachineBasicBlock::iterator E = MI->getParent()->end();
- while (I != E && I->isDebugValue())
+ while (I != E && I->isDebugInstr())
++I;
if (I != E) {
unsigned NPredReg = 0;
@@ -237,7 +237,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
// block so check the instruction we just put in the block.
for (; MBBI != E && Pos &&
(!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
- if (MBBI->isDebugValue())
+ if (MBBI->isDebugInstr())
continue;
MachineInstr *NMI = &*MBBI;
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index c5eb14f3e608..d5f0ba9ee485 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -82,7 +82,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
MachineBasicBlock::iterator E = MBB->begin();
unsigned Count = 4; // At most 4 instructions in an IT block.
while (Count && MBBI != E) {
- if (MBBI->isDebugValue()) {
+ if (MBBI->isDebugInstr()) {
--MBBI;
continue;
}
@@ -109,7 +109,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
bool
Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
- while (MBBI->isDebugValue()) {
+ while (MBBI->isDebugInstr()) {
++MBBI;
if (MBBI == MBB.end())
return false;
@@ -489,7 +489,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
Offset += MI.getOperand(FrameRegIdx+1).getImm();
unsigned PredReg;
- if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+ if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL &&
+ !MI.definesRegister(ARM::CPSR)) {
// Turn it into a move.
MI.setDesc(TII.get(ARM::tMOVr));
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
@@ -600,6 +601,20 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
Offset = -Offset;
isSub = true;
}
+ } else if (AddrMode == ARMII::AddrMode5FP16) {
+ // VFP address mode.
+ const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+ int InstrOffs = ARM_AM::getAM5FP16Offset(OffOp.getImm());
+ if (ARM_AM::getAM5FP16Op(OffOp.getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 2;
+ Offset += InstrOffs * 2;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
} else if (AddrMode == ARMII::AddrModeT2_i8s4) {
Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
NumBits = 10; // 8 bits scaled by 4
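A quick check of the constants in the new AddrMode5FP16 branch (a sketch that only restates NumBits and Scale from the code above):

    // AddrMode5FP16 encodes an 8-bit immediate scaled by 2, so the largest
    // encodable positive byte offset is 255 * 2 = 510.
    constexpr unsigned NumBits = 8, Scale = 2;
    constexpr unsigned MaxOffset = ((1u << NumBits) - 1) * Scale;
    static_assert(MaxOffset == 510, "imm8 * 2 addressing");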
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index 5357e26856ea..abf54ba7e87c 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -610,7 +610,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++NumLdSts;
@@ -657,7 +658,8 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " <<*MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++NumNarrows;
@@ -826,7 +828,8 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++Num2Addrs;
@@ -933,7 +936,8 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(errs() << "Converted 32-bit: " << *MI
+ << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
++NumNarrows;
@@ -1033,7 +1037,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
BundleMI = MI;
continue;
}
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
continue;
LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index d190edf5913c..e4bdd40fb743 100644
--- a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -475,7 +475,7 @@ bool ThumbRegisterInfo::saveScavengerRegister(
// before that instead and adjust the UseMI.
bool done = false;
for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
- if (II->isDebugValue())
+ if (II->isDebugInstr())
continue;
// If this instruction affects R12, adjust our restore point.
for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
@@ -517,25 +517,13 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned VReg = 0;
const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB(*MBB.getParent(), &MI);
- unsigned FrameReg = ARM::SP;
+ unsigned FrameReg;
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
- MF.getFrameInfo().getStackSize() + SPAdj;
-
- if (MF.getFrameInfo().hasVarSizedObjects()) {
- assert(SPAdj == 0 && STI.getFrameLowering()->hasFP(MF) && "Unexpected");
- // There are alloca()'s in this function, must reference off the frame
- // pointer or base pointer instead.
- if (!hasBasePointer(MF)) {
- FrameReg = getFrameRegister(MF);
- Offset -= AFI->getFramePtrSpillOffset();
- } else
- FrameReg = BasePtr;
- }
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
+ int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
// PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
// call frame setup/destroy instructions have already been eliminated. That
@@ -560,7 +548,7 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
// Modify MI as necessary to handle as much of 'Offset' as possible
- assert(AFI->isThumbFunction() &&
+ assert(MF.getInfo<ARMFunctionInfo>()->isThumbFunction() &&
"This eliminateFrameIndex only supports Thumb1!");
if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
return;
diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h
index 2535b63dccdd..48327fd377b2 100644
--- a/contrib/llvm/lib/Target/AVR/AVR.h
+++ b/contrib/llvm/lib/Target/AVR/AVR.h
@@ -37,8 +37,10 @@ void initializeAVRRelaxMemPass(PassRegistry&);
/// Contains the AVR backend.
namespace AVR {
+/// An enumeration of the supported AVR address spaces.
enum AddressSpace { DataMemory, ProgramMemory };
+/// Checks if a given type is a pointer to program memory.
template <typename T> bool isProgramMemoryAddress(T *V) {
return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory;
}
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 462a7d57d2de..b0b23effc6c6 100644
--- a/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -519,12 +519,9 @@ bool AVRDAGToDAGISel::selectMultiplication(llvm::SDNode *N) {
}
void AVRDAGToDAGISel::Select(SDNode *N) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; N->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (N->isMachineOpcode()) {
- DEBUG(errs() << "== "; N->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; N->dump(CurDAG); errs() << "\n");
N->setNodeId(-1);
return;
}
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
index d9e27e91405c..c1515571aae5 100644
--- a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -345,6 +345,9 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
case MVT::i64:
LC = IsSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
break;
+ case MVT::i128:
+ LC = IsSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+ break;
}
SDValue InChain = DAG.getEntryNode();
@@ -867,10 +870,12 @@ bool AVRTargetLowering::isOffsetFoldingLegal(
/// For each argument in a function store the number of pieces it is composed
/// of.
-static void parseFunctionArgs(const Function *F, const DataLayout *TD,
+static void parseFunctionArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<unsigned> &Out) {
- for (Argument const &Arg : F->args()) {
- unsigned Bytes = (TD->getTypeSizeInBits(Arg.getType()) + 7) / 8;
+ for (const ISD::InputArg &Arg : Ins) {
+ if(Arg.PartOffset > 0) continue;
+ unsigned Bytes = ((Arg.ArgVT.getSizeInBits()) + 7) / 8;
+
Out.push_back((Bytes + 1) / 2);
}
}
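A worked example of the piece count the rewritten helper records (the i32 argument is hypothetical):

    // An i32 InputArg: Bytes = (32 + 7) / 8 = 4, recorded as (4 + 1) / 2 = 2
    // pieces, i.e. two 2-byte register pairs on AVR.
    constexpr unsigned Bits = 32;                // Arg.ArgVT.getSizeInBits()
    constexpr unsigned Bytes = (Bits + 7) / 8;   // 4
    constexpr unsigned Pieces = (Bytes + 1) / 2; // 2
    static_assert(Pieces == 2, "i32 spans two 16-bit pieces");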
@@ -938,7 +943,7 @@ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
parseExternFuncCallArgs(*Outs, Args);
} else {
assert(F != nullptr && "function should not be null");
- parseFunctionArgs(F, TD, Args);
+ parseFunctionArgs(*Ins, Args);
}
unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index 1a89a13693e1..0c32334167f0 100644
--- a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -273,7 +273,7 @@ bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
continue;
}
@@ -444,7 +444,7 @@ unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
continue;
}
//:TODO: add here the missing jmp instructions once they are implemented
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
index 7d1bfc8d85e0..a2129cc0e2e9 100644
--- a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -1228,9 +1228,23 @@ isReMaterializable = 1 in
[(set i16:$dst, (load addr:$memri))]>,
Requires<[HasSRAM]>;
+ // An identical pseudo instruction to LDDWRdPtrQ, except restricted to the Y
+ // register and without the @earlyclobber flag.
+ //
+ // Used to work around a bug caused by the register allocator not
+ // being able to handle the expansion of a COPY into a machine instruction
+ // that has an earlyclobber flag. This is because the register allocator will
+ // try to expand a copy from a register slot into an earlyclobber instruction.
+ // Instructions that are earlyclobber need to be in a dedicated earlyclobber slot.
+ //
+ // This pseudo instruction can be used before AVR pseudo expansion in order to
+ // get a frame index load without directly using earlyclobber instructions.
+ //
+ // The pseudo expansion pass trivially expands this into LDDWRdPtrQ.
+ //
+ // This instruction may be removed once PR13375 is fixed.
let mayLoad = 1,
- hasSideEffects = 0,
- Constraints = "@earlyclobber $dst" in
+ hasSideEffects = 0 in
def LDDWRdYQ : Pseudo<(outs DREGS:$dst),
(ins memri:$memri),
"lddw\t$dst, $memri",
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h
index f8fefb859682..104b336b9c48 100644
--- a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h
@@ -51,6 +51,11 @@ public:
/// Splits a 16-bit `DREGS` register into the lo/hi register pair.
/// \param Reg A 16-bit register to split.
void splitReg(unsigned Reg, unsigned &LoReg, unsigned &HiReg) const;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &) const override {
+ return true;
+ }
+
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index f9a738b2182c..74300d9a451c 100644
--- a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -25,7 +25,7 @@
namespace llvm {
-static const char *AVRDataLayout = "e-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8";
+static const char *AVRDataLayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8";
/// Processes a CPU name.
static StringRef getCPU(StringRef CPU) {
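Assuming the usual datalayout syntax, the added "P1" component declares address space 1 as the program address space, matching the AddressSpace enum added to AVR.h earlier in this diff (DataMemory = 0, ProgramMemory = 1):

    // Sketch: the datalayout's "P1" lines up with AVR::ProgramMemory == 1.
    enum AddressSpace { DataMemory, ProgramMemory }; // 0, 1
    static_assert(ProgramMemory == 1, "P1 selects the program address space");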
diff --git a/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index b527ad3e0b14..d57cc098497f 100644
--- a/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -482,7 +482,7 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
}
bool AVRAsmParser::parseOperand(OperandVector &Operands) {
- DEBUG(dbgs() << "parseOperand\n");
+ LLVM_DEBUG(dbgs() << "parseOperand\n");
switch (getLexer().getKind()) {
default:
@@ -527,7 +527,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands) {
OperandMatchResultTy
AVRAsmParser::parseMemriOperand(OperandVector &Operands) {
- DEBUG(dbgs() << "parseMemriOperand()\n");
+ LLVM_DEBUG(dbgs() << "parseMemriOperand()\n");
SMLoc E, S;
MCExpr const *Expression;
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 2f5e9f02e53c..f81a57dd71e3 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -352,15 +352,16 @@ void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
}
}
-std::unique_ptr<MCObjectWriter>
-AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createAVRELFObjectWriter(OS,
- MCELFObjectTargetWriter::getOSABI(OSType));
+std::unique_ptr<MCObjectTargetWriter>
+AVRAsmBackend::createObjectTargetWriter() const {
+ return createAVRELFObjectWriter(MCELFObjectTargetWriter::getOSABI(OSType));
}
void AVRAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const {
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
adjustFixupValue(Fixup, Target, Value, &Asm.getContext());
if (Value == 0)
return; // Doesn't change encoding.
@@ -453,13 +454,13 @@ MCFixupKindInfo const &AVRAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AVRAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// If the count is not 2-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
assert((Count % 2) == 0 && "NOP instructions must be 2 bytes");
- OW->WriteZeros(Count);
+ OS.write_zeros(Count);
return true;
}
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index af615df033aa..d48077c3ab8e 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -31,18 +31,19 @@ struct MCFixupKindInfo;
/// Utilities for manipulating generated AVR machine code.
class AVRAsmBackend : public MCAsmBackend {
public:
-
AVRAsmBackend(Triple::OSType OSType)
- : MCAsmBackend(), OSType(OSType) {}
+ : MCAsmBackend(support::little), OSType(OSType) {}
void adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
uint64_t &Value, MCContext *Ctx = nullptr) const;
- std::unique_ptr<MCObjectWriter> createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -50,7 +51,10 @@ public:
return AVR::NumTargetFixupKinds;
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
@@ -62,7 +66,7 @@ public:
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override;
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index 412f66fbcf22..4a921a1601a9 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -152,10 +152,8 @@ unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx,
}
}
-std::unique_ptr<MCObjectWriter>
-createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- std::unique_ptr<MCELFObjectTargetWriter> MOTW(new AVRELFObjectWriter(OSABI));
- return createELFObjectWriter(std::move(MOTW), OS, true);
+std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI) {
+ return make_unique<AVRELFObjectWriter>(OSABI);
}
} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index c60ea7a92e6f..861acd47347f 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
@@ -43,9 +44,10 @@ void AVRMCELFStreamer::EmitValueForModiferKind(
namespace llvm {
MCStreamer *createAVRELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE) {
- return new AVRMCELFStreamer(Context, std::move(MAB), OS, std::move(CE));
+ return new AVRMCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(CE));
}
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
index 398b409f4586..12e805fc7d13 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
@@ -16,6 +16,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
namespace llvm {
@@ -27,16 +28,18 @@ class AVRMCELFStreamer : public MCELFStreamer {
public:
AVRMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
MCII(createAVRMCInstrInfo()) {}
AVRMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
MCII(createAVRMCInstrInfo()) {}
void EmitValueForModiferKind(
@@ -46,7 +49,7 @@ public:
MCStreamer *createAVRELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE);
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index dd65a4312077..8c39b5f4039e 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -70,11 +70,11 @@ static MCInstPrinter *createAVRMCInstPrinter(const Triple &T,
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createELFStreamer(Context, std::move(MAB), OS,
- std::move(Emitter), RelaxAll);
+ return createELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
}
static MCTargetStreamer *
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
index fcfd8cf82292..a764f15bd065 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -48,8 +48,7 @@ MCAsmBackend *createAVRAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const llvm::MCTargetOptions &TO);
/// Creates an ELF object writer for AVR.
-std::unique_ptr<MCObjectWriter>
-createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI);
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index deaa11325809..496f2befde58 100644
--- a/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -460,7 +460,7 @@ bool BPFAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
} else if (BPFOperand::isValidIdAtStart (Name))
Operands.push_back(BPFOperand::createToken(Name, NameLoc));
else
- return true;
+ return Error(NameLoc, "invalid register/token name");
while (!getLexer().is(AsmToken::EndOfStatement)) {
// Attempt to parse token as operator
@@ -472,8 +472,10 @@ bool BPFAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
continue;
// Attempt to parse token as an immediate
- if (parseImmediate(Operands) != MatchOperand_Success)
- return true;
+ if (parseImmediate(Operands) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
diff --git a/contrib/llvm/lib/Target/BPF/BPF.h b/contrib/llvm/lib/Target/BPF/BPF.h
index 4a0cb20357c8..76d3e1ca5f6f 100644
--- a/contrib/llvm/lib/Target/BPF/BPF.h
+++ b/contrib/llvm/lib/Target/BPF/BPF.h
@@ -17,6 +17,11 @@ namespace llvm {
class BPFTargetMachine;
FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+FunctionPass *createBPFMIPeepholePass();
+FunctionPass *createBPFMIPreEmitPeepholePass();
+
+void initializeBPFMIPeepholePass(PassRegistry&);
+void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
}
#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPF.td b/contrib/llvm/lib/Target/BPF/BPF.td
index 2d0c22a3a516..877bd15f4f2b 100644
--- a/contrib/llvm/lib/Target/BPF/BPF.td
+++ b/contrib/llvm/lib/Target/BPF/BPF.td
@@ -26,6 +26,12 @@ def : Proc<"probe", []>;
def DummyFeature : SubtargetFeature<"dummy", "isDummyMode",
"true", "unused feature">;
+def ALU32 : SubtargetFeature<"alu32", "HasAlu32", "true",
+ "Enable ALU32 instructions">;
+
+def DwarfRIS: SubtargetFeature<"dwarfris", "UseDwarfRIS", "true",
+ "Disable MCAsmInfo DwarfUsesRelocationsAcrossSections">;
+
def BPFInstPrinter : AsmWriter {
string AsmWriterClassName = "InstPrinter";
bit isMCAsmWriter = 1;
diff --git a/contrib/llvm/lib/Target/BPF/BPFCallingConv.td b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
index 8cec6fa54698..637f9752ec42 100644
--- a/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
+++ b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
@@ -26,4 +26,24 @@ def CC_BPF64 : CallingConv<[
CCAssignToStack<8, 8>
]>;
+// Return-value convention when -mattr=+alu32 enabled
+def RetCC_BPF32 : CallingConv<[
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0], [R0]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[R0], [W0]>>
+]>;
+
+// Calling convention when -mattr=+alu32 enabled
+def CC_BPF32 : CallingConv<[
+ // Promote i8/i16/i32 args to i64
+ CCIfType<[i32], CCAssignToRegWithShadow<[W1, W2, W3, W4, W5],
+ [R1, R2, R3, R4, R5]>>,
+
+ // All arguments get passed in integer registers if there is space.
+ CCIfType<[i64], CCAssignToRegWithShadow<[R1, R2, R3, R4, R5],
+ [W1, W2, W3, W4, W5]>>,
+
+ // Could be assigned to the stack in 8-byte aligned units, but unsupported
+ CCAssignToStack<8, 8>
+]>;
+
def CSR : CalleeSavedRegs<(add R6, R7, R8, R9, R10)>;
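The CC_BPF32 and RetCC_BPF32 conventions above pair each 32-bit W register with its 64-bit R alias through CCAssignToRegWithShadow, so an i32 argument and an i64 argument consume the same register slot and the two views are never assigned independently. A minimal C++ sketch of that assignment, illustrative only (the ArgTy enum and assignArgs helper are made up; only the W1-W5/R1-R5 names come from the table above):

// --- illustrative sketch, not part of the patch ---
#include <cstdio>
#include <vector>

enum class ArgTy { I32, I64 };

// Argument registers: slot i holds the pair (WRegs[i], RRegs[i]); using either
// view shadows (consumes) the whole pair.
static const char *WRegs[] = {"W1", "W2", "W3", "W4", "W5"};
static const char *RRegs[] = {"R1", "R2", "R3", "R4", "R5"};

static void assignArgs(const std::vector<ArgTy> &Args) {
  unsigned Next = 0;
  for (unsigned I = 0; I < Args.size(); ++I) {
    if (Next >= 5) {
      // Would need an 8-byte stack slot, which the BPF backend rejects.
      std::printf("arg %u: stack\n", I);
      continue;
    }
    const char *Reg = (Args[I] == ArgTy::I32) ? WRegs[Next] : RRegs[Next];
    std::printf("arg %u: %s\n", I, Reg);
    ++Next;
  }
}

int main() {
  assignArgs({ArgTy::I64, ArgTy::I32, ArgTy::I32}); // R1, W2, W3
  return 0;
}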
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 61b04d1f2a13..8b9bc08e144f 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -39,8 +39,14 @@ using namespace llvm;
namespace {
class BPFDAGToDAGISel : public SelectionDAGISel {
+
+ /// Subtarget - Keep a pointer to the BPFSubtarget around so that we can
+ /// make the right decision when generating code for different subtargets.
+ const BPFSubtarget *Subtarget;
+
public:
- explicit BPFDAGToDAGISel(BPFTargetMachine &TM) : SelectionDAGISel(TM) {
+ explicit BPFDAGToDAGISel(BPFTargetMachine &TM)
+ : SelectionDAGISel(TM), Subtarget(nullptr) {
curr_func_ = nullptr;
}
@@ -48,6 +54,12 @@ public:
return "BPF DAG->DAG Pattern Instruction Selection";
}
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Reset the subtarget each time through.
+ Subtarget = &MF.getSubtarget<BPFSubtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
void PreprocessISelDAG() override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
@@ -65,9 +77,9 @@ private:
bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
// Node preprocessing cases
- void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator I);
+ void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator &I);
void PreprocessCopyToReg(SDNode *Node);
- void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator I);
+ void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator &I);
// Find constants from a constant structure
typedef std::vector<unsigned char> val_vec_type;
@@ -176,12 +188,9 @@ bool BPFDAGToDAGISel::SelectInlineAsmMemoryOperand(
void BPFDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- // Dump information about the Node being selected
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
return;
}
@@ -241,7 +250,7 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
}
void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
- SelectionDAG::allnodes_iterator I) {
+ SelectionDAG::allnodes_iterator &I) {
union {
uint8_t c[8];
uint16_t s;
@@ -268,7 +277,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0)
return;
- DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
const GlobalAddressSDNode *GADN =
dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
@@ -278,7 +287,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
} else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
LDAddrNode->getNumOperands() > 0) {
- DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
SDValue OP1 = LDAddrNode->getOperand(0);
if (const GlobalAddressSDNode *GADN =
@@ -301,8 +310,8 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
val = new_val.d;
}
- DEBUG(dbgs() << "Replacing load of size " << size << " with constant " << val
- << '\n');
+ LLVM_DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
+ << val << '\n');
SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
// After replacement, the current node is dead, we need to
@@ -418,8 +427,8 @@ bool BPFDAGToDAGISel::fillGenericConstant(const DataLayout &DL,
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
uint64_t val = CI->getZExtValue();
- DEBUG(dbgs() << "Byte array at offset " << Offset << " with value " << val
- << '\n');
+ LLVM_DEBUG(dbgs() << "Byte array at offset " << Offset << " with value "
+ << val << '\n');
if (Size > 8 || (Size & (Size - 1)))
return false;
@@ -508,17 +517,49 @@ void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
break;
}
- DEBUG(dbgs() << "Find Load Value to VReg "
- << TargetRegisterInfo::virtReg2Index(RegN->getReg()) << '\n');
+ LLVM_DEBUG(dbgs() << "Find Load Value to VReg "
+ << TargetRegisterInfo::virtReg2Index(RegN->getReg())
+ << '\n');
load_to_vreg_[RegN->getReg()] = mem_load_op;
}
void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
- SelectionDAG::allnodes_iterator I) {
+ SelectionDAG::allnodes_iterator &I) {
ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
if (!MaskN)
return;
+ // The Reg operand should be a virtual register, which is defined
+ // outside the current basic block. DAG combiner has done a pretty
+ // good job in removing truncating inside a single basic block except
+ // when the Reg operand comes from bpf_load_[byte | half | word] for
+ // which the generic optimizer doesn't understand their results are
+ // zero extended.
+ SDValue BaseV = Node->getOperand(0);
+ if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
+ uint64_t MaskV = MaskN->getZExtValue();
+
+ if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
+ (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
+ (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
+ return;
+
+ LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
+ Node->dump(); dbgs() << '\n');
+
+ I--;
+ CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
+ I++;
+ CurDAG->DeleteNode(Node);
+
+ return;
+ }
+
+ // Multiple basic blocks case.
+ if (BaseV.getOpcode() != ISD::CopyFromReg)
+ return;
+
unsigned match_load_op = 0;
switch (MaskN->getZExtValue()) {
default:
@@ -534,19 +575,12 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
break;
}
- // The Reg operand should be a virtual register, which is defined
- // outside the current basic block. DAG combiner has done a pretty
- // good job in removing truncating inside a single basic block.
- SDValue BaseV = Node->getOperand(0);
- if (BaseV.getOpcode() != ISD::CopyFromReg)
- return;
-
const RegisterSDNode *RegN =
dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
return;
unsigned AndOpReg = RegN->getReg();
- DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
+ LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
// Examine the PHI insns in the MachineBasicBlock to found out the
// definitions of this virtual register. At this stage (DAG2DAG
@@ -576,8 +610,8 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
// %2 = PHI %0, <%bb.1>, %1, <%bb.3>
// Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3)
// The AND operation can be removed if both %0 in %bb.1 and %1 in
- // %bb.3 are defined with with a load matching the MaskN.
- DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
+ // %bb.3 are defined with a load matching the MaskN.
+ LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
unsigned PrevReg = -1;
for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
const MachineOperand &MOP = MII->getOperand(i);
@@ -593,8 +627,8 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
}
}
- DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
+ dbgs() << '\n');
I--;
CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
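The new INTRINSIC_W_CHAIN case in PreprocessTrunc above removes an AND whose mask only re-truncates a value that bpf_load_byte/half/word has already zero extended. A standalone C++ sketch of just that legality test (the BpfLoad enum is a stand-in for the real intrinsic IDs):

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstdint>

enum class BpfLoad { Byte, Half, Word, Other };

// The AND is redundant only when the mask exactly matches the width that the
// absolute-load intrinsic already zero extends from.
static bool isRedundantAnd(BpfLoad Kind, uint64_t Mask) {
  switch (Kind) {
  case BpfLoad::Byte: return Mask == 0xFF;
  case BpfLoad::Half: return Mask == 0xFFFF;
  case BpfLoad::Word: return Mask == 0xFFFFFFFF;
  default:            return false;
  }
}

int main() {
  assert(isRedundantAnd(BpfLoad::Half, 0xFFFF));
  assert(!isRedundantAnd(BpfLoad::Half, 0xFF));   // narrower mask still needed
  assert(!isRedundantAnd(BpfLoad::Other, 0xFF));  // unknown producer: keep AND
  return 0;
}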
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 3ea96e3148f2..9272cf692dc9 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -33,6 +33,10 @@ using namespace llvm;
#define DEBUG_TYPE "bpf-lower"
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+ cl::Hidden, cl::init(false),
+ cl::desc("Expand memcpy into load/store pairs in order"));
+
static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
@@ -57,6 +61,8 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
// Set up the register classes.
addRegisterClass(MVT::i64, &BPF::GPRRegClass);
+ if (STI.getHasAlu32())
+ addRegisterClass(MVT::i32, &BPF::GPR32RegClass);
// Compute derived properties from the register classes
computeRegisterProperties(STI.getRegisterInfo());
@@ -67,9 +73,6 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
- setOperationAction(ISD::SETCC, MVT::i64, Expand);
- setOperationAction(ISD::SELECT, MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -77,32 +80,39 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
- setOperationAction(ISD::SREM, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
-
- setOperationAction(ISD::MULHU, MVT::i64, Expand);
- setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i32 && !STI.getHasAlu32())
+ continue;
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::SHL_PARTS, VT, Expand);
+ setOperationAction(ISD::SRL_PARTS, VT, Expand);
+ setOperationAction(ISD::SRA_PARTS, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
+ setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Custom);
+ }
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
- setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+ if (STI.getHasAlu32()) {
+ setOperationAction(ISD::BSWAP, MVT::i32, Promote);
+ setOperationAction(ISD::BR_CC, MVT::i32, Promote);
+ }
setOperationAction(ISD::CTTZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
@@ -126,12 +136,33 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setMinFunctionAlignment(3);
setPrefFunctionAlignment(3);
- // inline memcpy() for kernel to see explicit copy
- MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
- MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
- MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+ if (BPFExpandMemcpyInOrder) {
+ // LLVM generic code will try to expand memcpy into load/store pairs at this
+ // stage, which is before quite a few IR optimization passes. The loads and
+ // stores could therefore be moved apart from each other, which causes
+ // trouble for the memcpy pattern matcher inside kernel eBPF JIT compilers.
+ //
+ // When -bpf-expand-memcpy-in-order is specified, we want to defer the
+ // expansion of memcpy to a later stage in the IR optimization pipeline so
+ // those load/store pairs won't be touched and can be kept in order. Hence,
+ // we set MaxStoresPerMem* to zero to disable the generic
+ // getMemcpyLoadsAndStores code path, and ask LLVM to use the target
+ // expander EmitTargetCodeForMemcpy.
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+ } else {
+ // inline memcpy() for kernel to see explicit copy
+ unsigned CommonMaxStores =
+ STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+ }
// CPU/Feature control
+ HasAlu32 = STI.getHasAlu32();
HasJmpExt = STI.getHasJmpExt();
}
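The MaxStoresPerMem* settings above steer how a constant-size memcpy gets lowered: a non-zero limit lets the generic SelectionDAG expansion emit up to that many load/store pairs, while a zero limit makes it give up immediately so the target hook is consulted instead. A rough standalone model of that decision (names here are illustrative, not the real SelectionDAG API):

// --- illustrative sketch, not part of the patch ---
#include <cstdio>

enum class Lowering { GenericLoadStorePairs, TargetHook };

// Limit == 0 models -bpf-expand-memcpy-in-order: generic inline expansion is
// disabled, so EmitTargetCodeForMemcpy gets to keep the copies in order.
static Lowering chooseMemcpyLowering(unsigned StoresNeeded, unsigned Limit) {
  if (Limit != 0 && StoresNeeded <= Limit)
    return Lowering::GenericLoadStorePairs;
  return Lowering::TargetHook;
}

int main() {
  std::printf("default (limit 128): %d\n",
              chooseMemcpyLowering(16, 128) == Lowering::GenericLoadStorePairs);
  std::printf("in-order (limit 0):  %d\n",
              chooseMemcpyLowering(16, 0) == Lowering::TargetHook);
  return 0;
}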
@@ -189,26 +220,29 @@ SDValue BPFTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeFormalArguments(Ins, CC_BPF64);
+ CCInfo.AnalyzeFormalArguments(Ins, getHasAlu32() ? CC_BPF32 : CC_BPF64);
for (auto &VA : ArgLocs) {
if (VA.isRegLoc()) {
// Arguments passed in registers
EVT RegVT = VA.getLocVT();
- switch (RegVT.getSimpleVT().SimpleTy) {
+ MVT::SimpleValueType SimpleTy = RegVT.getSimpleVT().SimpleTy;
+ switch (SimpleTy) {
default: {
errs() << "LowerFormalArguments Unhandled argument type: "
<< RegVT.getEVTString() << '\n';
llvm_unreachable(0);
}
+ case MVT::i32:
case MVT::i64:
- unsigned VReg = RegInfo.createVirtualRegister(&BPF::GPRRegClass);
+ unsigned VReg = RegInfo.createVirtualRegister(SimpleTy == MVT::i64 ?
+ &BPF::GPRRegClass :
+ &BPF::GPR32RegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
- // If this is an 8/16/32-bit value, it is really passed promoted to 64
- // bits. Insert an assert[sz]ext to capture this, then truncate to the
- // right size.
+ // If this is a value that has been promoted to a wider type, insert an
+ // assert[sz]ext to capture this, then truncate to the right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
@@ -220,6 +254,8 @@ SDValue BPFTargetLowering::LowerFormalArguments(
ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
InVals.push_back(ArgValue);
+
+ break;
}
} else {
fail(DL, DAG, "defined with too many args");
@@ -264,7 +300,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeCallOperands(Outs, CC_BPF64);
+ CCInfo.AnalyzeCallOperands(Outs, getHasAlu32() ? CC_BPF32 : CC_BPF64);
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -388,7 +424,7 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// Analyze return values.
- CCInfo.AnalyzeReturn(Outs, RetCC_BPF64);
+ CCInfo.AnalyzeReturn(Outs, getHasAlu32() ? RetCC_BPF32 : RetCC_BPF64);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -432,7 +468,7 @@ SDValue BPFTargetLowering::LowerCallResult(
return DAG.getCopyFromReg(Chain, DL, 1, Ins[0].VT, InFlag).getValue(1);
}
- CCInfo.AnalyzeCallResult(Ins, RetCC_BPF64);
+ CCInfo.AnalyzeCallResult(Ins, getHasAlu32() ? RetCC_BPF32 : RetCC_BPF64);
// Copy all of the result registers out of their specified physreg.
for (auto &Val : RVLocs) {
@@ -485,8 +521,7 @@ SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
if (!getHasJmpExt())
NegateCC(LHS, RHS, CC);
- SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i64);
-
+ SDValue TargetCC = DAG.getConstant(CC, DL, LHS.getValueType());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
@@ -507,6 +542,8 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "BPFISD::BR_CC";
case BPFISD::Wrapper:
return "BPFISD::Wrapper";
+ case BPFISD::MEMCPY:
+ return "BPFISD::MEMCPY";
}
return nullptr;
}
@@ -523,14 +560,90 @@ SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
}
+unsigned
+BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned Reg, bool isSigned) const {
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+ const TargetRegisterClass *RC = getRegClassFor(MVT::i64);
+ int RShiftOp = isSigned ? BPF::SRA_ri : BPF::SRL_ri;
+ MachineFunction *F = BB->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned PromotedReg0 = RegInfo.createVirtualRegister(RC);
+ unsigned PromotedReg1 = RegInfo.createVirtualRegister(RC);
+ unsigned PromotedReg2 = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
+ BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1)
+ .addReg(PromotedReg0).addImm(32);
+ BuildMI(BB, DL, TII.get(RShiftOp), PromotedReg2)
+ .addReg(PromotedReg1).addImm(32);
+
+ return PromotedReg2;
+}
+
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+ MachineBasicBlock *BB)
+ const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB(*MF, MI);
+ unsigned ScratchReg;
+
+ // This function does custom insertion during lowering BPFISD::MEMCPY which
+ // only has two register operands from memcpy semantics, the copy source
+ // address and the copy destination address.
+ //
+ // Because we will expand BPFISD::MEMCPY into load/store pairs, we will need
+ // a third scratch register to serve as the destination register of load and
+ // source register of store.
+ //
+ // The scratch register here is with the Define | Dead | EarlyClobber flags.
+ // The EarlyClobber flag has the semantic property that the operand it is
+ // attached to is clobbered before the rest of the inputs are read. Hence it
+ // must be unique among the operands to the instruction. The Define flag is
+ // needed to convince the machine verifier that an Undef value isn't a
+ // problem, as we are loading memory into it anyway. The Dead flag is needed
+ // as the
+ // value in scratch isn't supposed to be used by any other instruction.
+ ScratchReg = MRI.createVirtualRegister(&BPF::GPRRegClass);
+ MIB.addReg(ScratchReg,
+ RegState::Define | RegState::Dead | RegState::EarlyClobber);
+
+ return BB;
+}
+
MachineBasicBlock *
BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- bool isSelectOp = MI.getOpcode() == BPF::Select;
+ unsigned Opc = MI.getOpcode();
+ bool isSelectRROp = (Opc == BPF::Select ||
+ Opc == BPF::Select_64_32 ||
+ Opc == BPF::Select_32 ||
+ Opc == BPF::Select_32_64);
+
+ bool isMemcpyOp = Opc == BPF::MEMCPY;
+
+#ifndef NDEBUG
+ bool isSelectRIOp = (Opc == BPF::Select_Ri ||
+ Opc == BPF::Select_Ri_64_32 ||
+ Opc == BPF::Select_Ri_32 ||
+ Opc == BPF::Select_Ri_32_64);
+
+
+ assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+ "Unexpected instr type to insert");
+#endif
+
+ if (isMemcpyOp)
+ return EmitInstrWithCustomInserterMemcpy(MI, BB);
- assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert");
+ bool is32BitCmp = (Opc == BPF::Select_32 ||
+ Opc == BPF::Select_32_64 ||
+ Opc == BPF::Select_Ri_32 ||
+ Opc == BPF::Select_Ri_32_64);
// To "insert" a SELECT instruction, we actually have to insert the diamond
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -561,56 +674,72 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB->addSuccessor(Copy1MBB);
// Insert Branch if Flag
- unsigned LHS = MI.getOperand(1).getReg();
int CC = MI.getOperand(3).getImm();
int NewCC;
switch (CC) {
case ISD::SETGT:
- NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri;
+ NewCC = isSelectRROp ? BPF::JSGT_rr : BPF::JSGT_ri;
break;
case ISD::SETUGT:
- NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri;
+ NewCC = isSelectRROp ? BPF::JUGT_rr : BPF::JUGT_ri;
break;
case ISD::SETGE:
- NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri;
+ NewCC = isSelectRROp ? BPF::JSGE_rr : BPF::JSGE_ri;
break;
case ISD::SETUGE:
- NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri;
+ NewCC = isSelectRROp ? BPF::JUGE_rr : BPF::JUGE_ri;
break;
case ISD::SETEQ:
- NewCC = isSelectOp ? BPF::JEQ_rr : BPF::JEQ_ri;
+ NewCC = isSelectRROp ? BPF::JEQ_rr : BPF::JEQ_ri;
break;
case ISD::SETNE:
- NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri;
+ NewCC = isSelectRROp ? BPF::JNE_rr : BPF::JNE_ri;
break;
case ISD::SETLT:
- NewCC = isSelectOp ? BPF::JSLT_rr : BPF::JSLT_ri;
+ NewCC = isSelectRROp ? BPF::JSLT_rr : BPF::JSLT_ri;
break;
case ISD::SETULT:
- NewCC = isSelectOp ? BPF::JULT_rr : BPF::JULT_ri;
+ NewCC = isSelectRROp ? BPF::JULT_rr : BPF::JULT_ri;
break;
case ISD::SETLE:
- NewCC = isSelectOp ? BPF::JSLE_rr : BPF::JSLE_ri;
+ NewCC = isSelectRROp ? BPF::JSLE_rr : BPF::JSLE_ri;
break;
case ISD::SETULE:
- NewCC = isSelectOp ? BPF::JULE_rr : BPF::JULE_ri;
+ NewCC = isSelectRROp ? BPF::JULE_rr : BPF::JULE_ri;
break;
default:
report_fatal_error("unimplemented select CondCode " + Twine(CC));
}
- if (isSelectOp)
- BuildMI(BB, DL, TII.get(NewCC))
- .addReg(LHS)
- .addReg(MI.getOperand(2).getReg())
- .addMBB(Copy1MBB);
- else {
+
+ unsigned LHS = MI.getOperand(1).getReg();
+ bool isSignedCmp = (CC == ISD::SETGT ||
+ CC == ISD::SETGE ||
+ CC == ISD::SETLT ||
+ CC == ISD::SETLE);
+
+ // eBPF at the moment only has 64-bit comparisons. Any 32-bit comparison
+ // needs to be promoted; however, if the 32-bit comparison operands are
+ // destination registers, they are already implicitly zero-extended and no
+ // explicit zero-extend sequence is needed for them.
+ //
+ // We simply do the extension for all situations in this method, and rely on
+ // the BPFMIPeephole pass to remove the unnecessary ones.
+ if (is32BitCmp)
+ LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp);
+
+ if (isSelectRROp) {
+ unsigned RHS = MI.getOperand(2).getReg();
+
+ if (is32BitCmp)
+ RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp);
+
+ BuildMI(BB, DL, TII.get(NewCC)).addReg(LHS).addReg(RHS).addMBB(Copy1MBB);
+ } else {
int64_t imm32 = MI.getOperand(2).getImm();
// sanity check before we build J*_ri instruction.
assert (isInt<32>(imm32));
BuildMI(BB, DL, TII.get(NewCC))
- .addReg(LHS)
- .addImm(imm32)
- .addMBB(Copy1MBB);
+ .addReg(LHS).addImm(imm32).addMBB(Copy1MBB);
}
// Copy0MBB:
@@ -634,3 +763,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
+
+EVT BPFTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const {
+ return getHasAlu32() ? MVT::i32 : MVT::i64;
+}
+
+MVT BPFTargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT VT) const {
+ return (getHasAlu32() && VT == MVT::i32) ? MVT::i32 : MVT::i64;
+}
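EmitSubregExt above promotes a 32-bit subregister for the 64-bit-only eBPF compares with MOV_32_64 followed by a left shift by 32 and a right shift by 32; an arithmetic right shift (SRA_ri) gives sign extension and a logical one (SRL_ri) gives zero extension. A standalone arithmetic check of that shift pair:

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstdint>

// Model of MOV_32_64 / SLL_ri 32 / SRA_ri 32 (signed compares).
static int64_t signExtend32(uint32_t W) {
  uint64_t R = W;          // MOV_32_64: move the 32-bit value into a 64-bit reg
  R <<= 32;                // SLL_ri rB, rB, 32
  return (int64_t)R >> 32; // SRA_ri rB, rB, 32 (arithmetic shift)
}

// Model of MOV_32_64 / SLL_ri 32 / SRL_ri 32 (unsigned compares).
static uint64_t zeroExtend32(uint32_t W) {
  uint64_t R = W;
  R <<= 32;                // SLL_ri rB, rB, 32
  return R >> 32;          // SRL_ri rB, rB, 32 (logical shift)
}

int main() {
  assert(signExtend32(0xFFFFFFFFu) == -1);
  assert(zeroExtend32(0xFFFFFFFFu) == 0xFFFFFFFFull);
  assert(signExtend32(5) == 5 && zeroExtend32(5) == 5);
  return 0;
}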
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
index 6ca2594a7e88..0aa8b9ac57ac 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -28,7 +28,8 @@ enum NodeType : unsigned {
CALL,
SELECT_CC,
BR_CC,
- Wrapper
+ Wrapper,
+ MEMCPY
};
}
@@ -54,10 +55,17 @@ public:
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
+ bool getHasAlu32() const { return HasAlu32; }
bool getHasJmpExt() const { return HasJmpExt; }
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
+
private:
// Control Instruction Selection Features
+ bool HasAlu32;
bool HasJmpExt;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
@@ -100,6 +108,14 @@ private:
Type *Ty) const override {
return true;
}
+
+ unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
+ bool isSigned) const;
+
+ MachineBasicBlock * EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+ MachineBasicBlock *BB)
+ const;
+
};
}
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 5351cfa95020..4d47debdaa74 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -36,10 +36,92 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (BPF::GPRRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
+ else if (BPF::GPR32RegClass.contains(DestReg, SrcReg))
+ BuildMI(MBB, I, DL, get(BPF::MOV_rr_32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
else
llvm_unreachable("Impossible reg-to-reg copy");
}
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ uint64_t CopyLen = MI->getOperand(2).getImm();
+ uint64_t Alignment = MI->getOperand(3).getImm();
+ unsigned ScratchReg = MI->getOperand(4).getReg();
+ MachineBasicBlock *BB = MI->getParent();
+ DebugLoc dl = MI->getDebugLoc();
+ unsigned LdOpc, StOpc;
+
+ switch (Alignment) {
+ case 1:
+ LdOpc = BPF::LDB;
+ StOpc = BPF::STB;
+ break;
+ case 2:
+ LdOpc = BPF::LDH;
+ StOpc = BPF::STH;
+ break;
+ case 4:
+ LdOpc = BPF::LDW;
+ StOpc = BPF::STW;
+ break;
+ case 8:
+ LdOpc = BPF::LDD;
+ StOpc = BPF::STD;
+ break;
+ default:
+ llvm_unreachable("unsupported memcpy alignment");
+ }
+
+ unsigned IterationNum = CopyLen >> Log2_64(Alignment);
+ for(unsigned I = 0; I < IterationNum; ++I) {
+ BuildMI(*BB, MI, dl, get(LdOpc))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg)
+ .addImm(I * Alignment);
+ BuildMI(*BB, MI, dl, get(StOpc))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg)
+ .addImm(I * Alignment);
+ }
+
+ unsigned BytesLeft = CopyLen & (Alignment - 1);
+ unsigned Offset = IterationNum * Alignment;
+ bool Hanging4Byte = BytesLeft & 0x4;
+ bool Hanging2Byte = BytesLeft & 0x2;
+ bool Hanging1Byte = BytesLeft & 0x1;
+ if (Hanging4Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDW))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STW))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
+ Offset += 4;
+ }
+ if (Hanging2Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDH))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STH))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
+ Offset += 2;
+ }
+ if (Hanging1Byte) {
+ BuildMI(*BB, MI, dl, get(BPF::LDB))
+ .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
+ BuildMI(*BB, MI, dl, get(BPF::STB))
+ .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
+ }
+
+ BB->erase(MI);
+}
+
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == BPF::MEMCPY) {
+ expandMEMCPY(MI);
+ return true;
+ }
+
+ return false;
+}
+
void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned SrcReg, bool IsKill, int FI,
@@ -54,6 +136,11 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(IsKill))
.addFrameIndex(FI)
.addImm(0);
+ else if (RC == &BPF::GPR32RegClass)
+ BuildMI(MBB, I, DL, get(BPF::STW32))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addImm(0);
else
llvm_unreachable("Can't store this register to stack slot");
}
@@ -69,6 +156,8 @@ void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (RC == &BPF::GPRRegClass)
BuildMI(MBB, I, DL, get(BPF::LDD), DestReg).addFrameIndex(FI).addImm(0);
+ else if (RC == &BPF::GPR32RegClass)
+ BuildMI(MBB, I, DL, get(BPF::LDW32), DestReg).addFrameIndex(FI).addImm(0);
else
llvm_unreachable("Can't load this register from stack slot");
}
@@ -83,7 +172,7 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator
@@ -158,7 +247,7 @@ unsigned BPFInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != BPF::JMP)
break;
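expandMEMCPY above lowers the MEMCPY pseudo into CopyLen / Alignment full-width load/store pairs through the scratch register, then copies the remaining bytes with at most one 4-byte, one 2-byte and one 1-byte pair. A standalone sketch of that expansion plan (it only prints the schedule; the real code emits LD*/ST* instructions):

// --- illustrative sketch, not part of the patch ---
#include <cstdint>
#include <cstdio>

static void planMemcpy(uint64_t CopyLen, uint64_t Alignment) {
  // Full Alignment-sized copies: CopyLen >> Log2_64(Alignment) iterations.
  uint64_t Iterations = CopyLen / Alignment;
  for (uint64_t I = 0; I < Iterations; ++I)
    std::printf("copy %llu bytes at offset %llu\n",
                (unsigned long long)Alignment,
                (unsigned long long)(I * Alignment));

  // Tail: the remaining 0-7 bytes, handled as optional 4/2/1-byte copies.
  uint64_t Offset = Iterations * Alignment;
  uint64_t BytesLeft = CopyLen & (Alignment - 1);
  const uint64_t Chunks[] = {4, 2, 1};
  for (uint64_t Chunk : Chunks) {
    if (BytesLeft & Chunk) {
      std::printf("copy %llu bytes at offset %llu\n",
                  (unsigned long long)Chunk, (unsigned long long)Offset);
      Offset += Chunk;
    }
  }
}

int main() {
  planMemcpy(23, 8); // two 8-byte copies, then a 4-, 2- and 1-byte tail
  return 0;
}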
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
index f591f48a89a6..fb65a86a6d18 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -34,6 +34,8 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -55,6 +57,9 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+private:
+ void expandMEMCPY(MachineBasicBlock::iterator) const;
+
};
}
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
index 126d55fc28de..aaef5fb706e0 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -28,6 +28,10 @@ def SDT_BPFBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
SDTCisVT<3, OtherVT>]>;
def SDT_BPFWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
+def SDT_BPFMEMCPY : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>,
+ SDTCisVT<2, i64>,
+ SDTCisVT<3, i64>]>;
def BPFcall : SDNode<"BPFISD::CALL", SDT_BPFCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -43,8 +47,13 @@ def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad]>;
def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
def BPFIsBigEndian : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
+def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
+def BPFNoALU32 : Predicate<"!Subtarget->getHasAlu32()">;
def brtarget : Operand<OtherVT> {
let PrintMethod = "printBrTargetOperand";
@@ -57,6 +66,8 @@ def u64imm : Operand<i64> {
def i64immSExt32 : PatLeaf<(i64 imm),
[{return isInt<32>(N->getSExtValue()); }]>;
+def i32immSExt32 : PatLeaf<(i32 imm),
+ [{return isInt<32>(N->getSExtValue()); }]>;
// Addressing modes.
def ADDRri : ComplexPattern<i64, 2, "SelectAddr", [], []>;
@@ -218,7 +229,7 @@ multiclass ALU<BPFArithOp Opc, string OpcodeStr, SDNode OpNode> {
(outs GPR32:$dst),
(ins GPR32:$src2, i32imm:$imm),
"$dst "#OpcodeStr#" $imm",
- [(set GPR32:$dst, (OpNode GPR32:$src2, i32:$imm))]>;
+ [(set GPR32:$dst, (OpNode GPR32:$src2, i32immSExt32:$imm))]>;
}
let Constraints = "$dst = $src2" in {
@@ -292,7 +303,7 @@ def MOV_ri_32 : ALU_RI<BPF_ALU, BPF_MOV,
(outs GPR32:$dst),
(ins i32imm:$imm),
"$dst = $imm",
- [(set GPR32:$dst, (i32 i32:$imm))]>;
+ [(set GPR32:$dst, (i32 i32immSExt32:$imm))]>;
}
def FI_ri
@@ -347,9 +358,11 @@ class STORE<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
class STOREi64<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
: STORE<Opc, OpcodeStr, [(OpNode i64:$src, ADDRri:$addr)]>;
-def STW : STOREi64<BPF_W, "u32", truncstorei32>;
-def STH : STOREi64<BPF_H, "u16", truncstorei16>;
-def STB : STOREi64<BPF_B, "u8", truncstorei8>;
+let Predicates = [BPFNoALU32] in {
+ def STW : STOREi64<BPF_W, "u32", truncstorei32>;
+ def STH : STOREi64<BPF_H, "u16", truncstorei16>;
+ def STB : STOREi64<BPF_B, "u8", truncstorei8>;
+}
def STD : STOREi64<BPF_DW, "u64", store>;
// LOAD instructions
@@ -371,9 +384,13 @@ class LOAD<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
class LOADi64<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
: LOAD<SizeOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
-def LDW : LOADi64<BPF_W, "u32", zextloadi32>;
-def LDH : LOADi64<BPF_H, "u16", zextloadi16>;
-def LDB : LOADi64<BPF_B, "u8", zextloadi8>;
+
+let Predicates = [BPFNoALU32] in {
+ def LDW : LOADi64<BPF_W, "u32", zextloadi32>;
+ def LDH : LOADi64<BPF_H, "u16", zextloadi16>;
+ def LDB : LOADi64<BPF_B, "u8", zextloadi8>;
+}
+
def LDD : LOADi64<BPF_DW, "u64", load>;
class BRANCH<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
@@ -456,7 +473,7 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
}
// ADJCALLSTACKDOWN/UP pseudo insns
-let Defs = [R11], Uses = [R11] in {
+let Defs = [R11], Uses = [R11], isCodeGenOnly = 1 in {
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
"#ADJCALLSTACKDOWN $amt1 $amt2",
[(BPFcallseq_start timm:$amt1, timm:$amt2)]>;
@@ -465,7 +482,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
[(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
}
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
def Select : Pseudo<(outs GPR:$dst),
(ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
"# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
@@ -476,6 +493,36 @@ let usesCustomInserter = 1 in {
"# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
[(set i64:$dst,
(BPFselectcc i64:$lhs, (i64immSExt32:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>;
+ def Select_64_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_Ri_64_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i64:$lhs, (i64immSExt32:$rhs), (i64 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR32:$lhs, GPR32:$rhs, i32imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i32:$lhs, i32:$rhs, (i32 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_Ri_32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR32:$lhs, i32imm:$rhs, i32imm:$imm, GPR32:$src, GPR32:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i32:$dst,
+ (BPFselectcc i32:$lhs, (i32immSExt32:$rhs), (i32 imm:$imm), i32:$src, i32:$src2))]>;
+ def Select_32_64 : Pseudo<(outs GPR:$dst),
+ (ins GPR32:$lhs, GPR32:$rhs, i32imm:$imm, GPR:$src, GPR:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i64:$dst,
+ (BPFselectcc i32:$lhs, i32:$rhs, (i32 imm:$imm), i64:$src, i64:$src2))]>;
+ def Select_Ri_32_64 : Pseudo<(outs GPR:$dst),
+ (ins GPR32:$lhs, i32imm:$rhs, i32imm:$imm, GPR:$src, GPR:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i64:$dst,
+ (BPFselectcc i32:$lhs, (i32immSExt32:$rhs), (i32 imm:$imm), i64:$src, i64:$src2))]>;
}
// load 64-bit global addr into register
@@ -492,9 +539,11 @@ def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
def : Pat<(BPFcall GPR:$dst), (JALX GPR:$dst)>;
// Loads
-def : Pat<(extloadi8 ADDRri:$src), (i64 (LDB ADDRri:$src))>;
-def : Pat<(extloadi16 ADDRri:$src), (i64 (LDH ADDRri:$src))>;
-def : Pat<(extloadi32 ADDRri:$src), (i64 (LDW ADDRri:$src))>;
+let Predicates = [BPFNoALU32] in {
+ def : Pat<(i64 (extloadi8 ADDRri:$src)), (i64 (LDB ADDRri:$src))>;
+ def : Pat<(i64 (extloadi16 ADDRri:$src)), (i64 (LDH ADDRri:$src))>;
+ def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>;
+}
// Atomics
class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
@@ -581,3 +630,102 @@ def LD_ABS_W : LOAD_ABS<BPF_W, "u32", int_bpf_load_word>;
def LD_IND_B : LOAD_IND<BPF_B, "u8", int_bpf_load_byte>;
def LD_IND_H : LOAD_IND<BPF_H, "u16", int_bpf_load_half>;
def LD_IND_W : LOAD_IND<BPF_W, "u32", int_bpf_load_word>;
+
+let isCodeGenOnly = 1 in {
+ def MOV_32_64 : ALU_RR<BPF_ALU, BPF_MOV,
+ (outs GPR:$dst), (ins GPR32:$src),
+ "$dst = $src", []>;
+}
+
+def : Pat<(i64 (sext GPR32:$src)),
+ (SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+
+def : Pat<(i64 (zext GPR32:$src)),
+ (SRL_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+
+// For i64 -> i32 truncation, use the 32-bit subregister directly.
+def : Pat<(i32 (trunc GPR:$src)),
+ (i32 (EXTRACT_SUBREG GPR:$src, sub_32))>;
+
+// For i32 -> i64 anyext, we don't care about the high bits.
+def : Pat<(i64 (anyext GPR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+class STORE32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
+ : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+ (outs),
+ (ins GPR32:$src, MEMri:$addr),
+ "*("#OpcodeStr#" *)($addr) = $src",
+ Pattern> {
+ bits<4> src;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = src;
+ let Inst{47-32} = addr{15-0}; // offset
+ let BPFClass = BPF_STX;
+}
+
+class STOREi32<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
+ : STORE32<Opc, OpcodeStr, [(OpNode i32:$src, ADDRri:$addr)]>;
+
+let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+ def STW32 : STOREi32<BPF_W, "u32", store>;
+ def STH32 : STOREi32<BPF_H, "u16", truncstorei16>;
+ def STB32 : STOREi32<BPF_B, "u8", truncstorei8>;
+}
+
+class LOAD32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
+ : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+ (outs GPR32:$dst),
+ (ins MEMri:$addr),
+ "$dst = *("#OpcodeStr#" *)($addr)",
+ Pattern> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = dst;
+ let Inst{55-52} = addr{19-16};
+ let Inst{47-32} = addr{15-0};
+ let BPFClass = BPF_LDX;
+}
+
+class LOADi32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+ : LOAD32<SizeOp, OpcodeStr, [(set i32:$dst, (OpNode ADDRri:$addr))]>;
+
+let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+ def LDW32 : LOADi32<BPF_W, "u32", load>;
+ def LDH32 : LOADi32<BPF_H, "u16", zextloadi16>;
+ def LDB32 : LOADi32<BPF_B, "u8", zextloadi8>;
+}
+
+let Predicates = [BPFHasALU32] in {
+ def : Pat<(truncstorei8 GPR:$src, ADDRri:$dst),
+ (STB32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+ def : Pat<(truncstorei16 GPR:$src, ADDRri:$dst),
+ (STH32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+ def : Pat<(truncstorei32 GPR:$src, ADDRri:$dst),
+ (STW32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+ def : Pat<(i32 (extloadi8 ADDRri:$src)), (i32 (LDB32 ADDRri:$src))>;
+ def : Pat<(i32 (extloadi16 ADDRri:$src)), (i32 (LDH32 ADDRri:$src))>;
+ def : Pat<(i64 (zextloadi8 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDB32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (zextloadi16 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDH32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (zextloadi32 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (extloadi8 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDB32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (extloadi16 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDH32 ADDRri:$src), sub_32)>;
+ def : Pat<(i64 (extloadi32 ADDRri:$src)),
+ (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
+}
+
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+ def MEMCPY : Pseudo<
+ (outs),
+ (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+ "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+ [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
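The alu32 load/store patterns above rely on BPF's sub-register semantics, where a 32-bit load zeroes the upper half of the underlying 64-bit register, so an i64 zextload (and extload) can be selected as the 32-bit LDB32/LDH32/LDW32 form placed into the low subregister via SUBREG_TO_REG with zero in the high half. A small arithmetic model of that lowering (little-endian host assumed for brevity):

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstdint>
#include <cstring>

// Model of the SUBREG_TO_REG patterns: a sub-word load lands in a 32-bit
// register and is then viewed as the low half of a 64-bit register whose
// high half is known to be zero.
static uint64_t zextLoad(const void *P, unsigned Bytes) {
  uint32_t W = 0;            // LDB32 / LDH32 / LDW32 destination
  std::memcpy(&W, P, Bytes);
  return (uint64_t)W;        // SUBREG_TO_REG (i64 0), wN, sub_32
}

int main() {
  const uint8_t Buf[8] = {0xEF, 0xBE, 0xAD, 0xDE, 0, 0, 0, 0};
  assert(zextLoad(Buf, 1) == 0xEF);
  assert(zextLoad(Buf, 2) == 0xBEEF);
  assert(zextLoad(Buf, 4) == 0xDEADBEEF);
  return 0;
}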
diff --git a/contrib/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/contrib/llvm/lib/Target/BPF/BPFMIPeephole.cpp
new file mode 100644
index 000000000000..9e984d0facfb
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -0,0 +1,284 @@
+//===-------------- BPFMIPeephole.cpp - MI Peephole Cleanups -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs peephole optimizations to clean up ugly code sequences
+// at the MachineInstruction layer.
+//
+// Currently, there are two optimizations implemented:
+// - One pre-RA MachineSSA pass to eliminate type promotion sequences, i.e.
+//   zero extensions of 32-bit subregisters into 64-bit registers, when the
+//   compiler can prove the subregister is defined by 32-bit operations, in
+//   which case the upper half of the underlying 64-bit register is already
+//   zeroed implicitly.
+//
+// - One post-RA PreEmit pass to do final cleanup on some redundant
+//   instructions generated due to suboptimal register allocation on
+//   subregisters.
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-mi-zext-elim"
+
+STATISTIC(ZExtElemNum, "Number of zero extension shifts eliminated");
+
+namespace {
+
+struct BPFMIPeephole : public MachineFunctionPass {
+
+ static char ID;
+ const BPFInstrInfo *TII;
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+
+ BPFMIPeephole() : MachineFunctionPass(ID) {
+ initializeBPFMIPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize class variables.
+ void initialize(MachineFunction &MFParm);
+
+ bool isMovFrom32Def(MachineInstr *MovMI);
+ bool eliminateZExtSeq(void);
+
+public:
+
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ initialize(MF);
+
+ return eliminateZExtSeq();
+ }
+};
+
+// Initialize class variables.
+void BPFMIPeephole::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+ LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n");
+}
+
+bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
+{
+ MachineInstr *DefInsn = MRI->getVRegDef(MovMI->getOperand(1).getReg());
+
+ // Check DefInsn before dereferencing it in the debug dump.
+ if (!DefInsn)
+ return false;
+
+ LLVM_DEBUG(dbgs() << " Def of Mov Src:");
+ LLVM_DEBUG(DefInsn->dump());
+
+ if (DefInsn->isPHI()) {
+ for (unsigned i = 1, e = DefInsn->getNumOperands(); i < e; i += 2) {
+ MachineOperand &opnd = DefInsn->getOperand(i);
+
+ if (!opnd.isReg())
+ return false;
+
+ MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
+ // quick check on PHI incoming definitions.
+ if (!PhiDef || PhiDef->isPHI() || PhiDef->getOpcode() == BPF::COPY)
+ return false;
+ }
+ }
+
+ if (DefInsn->getOpcode() == BPF::COPY) {
+ MachineOperand &opnd = DefInsn->getOperand(1);
+
+ if (!opnd.isReg())
+ return false;
+
+ unsigned Reg = opnd.getReg();
+ if ((TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MRI->getRegClass(Reg) == &BPF::GPRRegClass))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << " One ZExt elim sequence identified.\n");
+
+ return true;
+}
+
+bool BPFMIPeephole::eliminateZExtSeq(void) {
+ MachineInstr* ToErase = nullptr;
+ bool Eliminated = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // If the previous instruction was marked for elimination, remove it now.
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ // Eliminate the 32-bit to 64-bit zero extension sequence when possible.
+ //
+ // MOV_32_64 rB, wA
+ // SLL_ri rB, rB, 32
+ // SRL_ri rB, rB, 32
+ if (MI.getOpcode() == BPF::SRL_ri &&
+ MI.getOperand(2).getImm() == 32) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned ShfReg = MI.getOperand(1).getReg();
+ MachineInstr *SllMI = MRI->getVRegDef(ShfReg);
+
+ LLVM_DEBUG(dbgs() << "Starting SRL found:");
+ LLVM_DEBUG(MI.dump());
+
+ if (!SllMI ||
+ SllMI->isPHI() ||
+ SllMI->getOpcode() != BPF::SLL_ri ||
+ SllMI->getOperand(2).getImm() != 32)
+ continue;
+
+ LLVM_DEBUG(dbgs() << " SLL found:");
+ LLVM_DEBUG(SllMI->dump());
+
+ MachineInstr *MovMI = MRI->getVRegDef(SllMI->getOperand(1).getReg());
+ if (!MovMI ||
+ MovMI->isPHI() ||
+ MovMI->getOpcode() != BPF::MOV_32_64)
+ continue;
+
+ LLVM_DEBUG(dbgs() << " Type cast Mov found:");
+ LLVM_DEBUG(MovMI->dump());
+
+ unsigned SubReg = MovMI->getOperand(1).getReg();
+ if (!isMovFrom32Def(MovMI)) {
+ LLVM_DEBUG(dbgs()
+ << " One ZExt elim sequence failed qualifying elim.\n");
+ continue;
+ }
+
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::SUBREG_TO_REG), DstReg)
+ .addImm(0).addReg(SubReg).addImm(BPF::sub_32);
+
+ SllMI->eraseFromParent();
+ MovMI->eraseFromParent();
+ // MI is the right shift; we can't erase it in its own iteration.
+ // Mark it in ToErase and erase it in the next iteration.
+ ToErase = &MI;
+ ZExtElemNum++;
+ Eliminated = true;
+ }
+ }
+ }
+
+ return Eliminated;
+}
+
+} // end default namespace
+
+INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
+ "BPF MachineSSA Peephole Optimization", false, false)
+
+char BPFMIPeephole::ID = 0;
+FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); }
+
+STATISTIC(RedundantMovElemNum, "Number of redundant moves eliminated");
+
+namespace {
+
+struct BPFMIPreEmitPeephole : public MachineFunctionPass {
+
+ static char ID;
+ MachineFunction *MF;
+ const TargetRegisterInfo *TRI;
+
+ BPFMIPreEmitPeephole() : MachineFunctionPass(ID) {
+ initializeBPFMIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize class variables.
+ void initialize(MachineFunction &MFParm);
+
+ bool eliminateRedundantMov(void);
+
+public:
+
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ initialize(MF);
+
+ return eliminateRedundantMov();
+ }
+};
+
+// Initialize class variables.
+void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ TRI = MF->getSubtarget<BPFSubtarget>().getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n");
+}
+
+bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
+ MachineInstr* ToErase = nullptr;
+ bool Eliminated = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // If the previous instruction was marked for elimination, remove it now.
+ if (ToErase) {
+ LLVM_DEBUG(dbgs() << " Redundant Mov Eliminated:");
+ LLVM_DEBUG(ToErase->dump());
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ // Eliminate identical move:
+ //
+ // MOV rA, rA
+ //
+ // This is particularly likely to happen when sub-register support is
+ // enabled. The special type cast insn MOV_32_64 involves different
+ // register classes on src (i32) and dst (i64), so RA could generate a
+ // useless instruction because of this.
+ if (MI.getOpcode() == BPF::MOV_32_64) {
+ unsigned dst = MI.getOperand(0).getReg();
+ unsigned dst_sub = TRI->getSubReg(dst, BPF::sub_32);
+ unsigned src = MI.getOperand(1).getReg();
+
+ if (dst_sub != src)
+ continue;
+
+ ToErase = &MI;
+ RedundantMovElemNum++;
+ Eliminated = true;
+ }
+ }
+ }
+
+ return Eliminated;
+}
+
+} // end default namespace
+
+INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",
+ "BPF PreEmit Peephole Optimization", false, false)
+
+char BPFMIPreEmitPeephole::ID = 0;
+FunctionPass* llvm::createBPFMIPreEmitPeepholePass()
+{
+ return new BPFMIPreEmitPeephole();
+}
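The MachineSSA peephole above looks for an SRL_ri by 32 fed by an SLL_ri by 32 fed by a MOV_32_64, and folds the three instructions into a single SUBREG_TO_REG when isMovFrom32Def shows the 32-bit source really comes from a 32-bit definition, so its upper half is already zero. A toy matcher over a flat instruction list, for illustration only (the Insn struct and opcode enum are made up; only the opcode names mirror the pass):

// --- illustrative sketch, not part of the patch ---
#include <cstdio>
#include <vector>

enum Op { MOV_32_64, SLL_ri, SRL_ri, OTHER };

struct Insn {
  Op Opcode;
  int SrcInsn;        // index of the instruction defining the source, -1 if unknown
  int Imm;            // shift amount where applicable
  bool SrcIs32BitDef; // stand-in for isMovFrom32Def()
};

// Returns true if Insns[SrlIdx] is the tail of a foldable zero-extension
// sequence, mirroring the checks in eliminateZExtSeq.
static bool matchesZExtSeq(const std::vector<Insn> &Insns, int SrlIdx) {
  const Insn &Srl = Insns[SrlIdx];
  if (Srl.Opcode != SRL_ri || Srl.Imm != 32 || Srl.SrcInsn < 0)
    return false;
  const Insn &Sll = Insns[Srl.SrcInsn];
  if (Sll.Opcode != SLL_ri || Sll.Imm != 32 || Sll.SrcInsn < 0)
    return false;
  const Insn &Mov = Insns[Sll.SrcInsn];
  return Mov.Opcode == MOV_32_64 && Mov.SrcIs32BitDef;
}

int main() {
  std::vector<Insn> Insns = {
    {MOV_32_64, -1, 0,  true},  // 0: rB = wA
    {SLL_ri,     0, 32, false}, // 1: rB <<= 32
    {SRL_ri,     1, 32, false}, // 2: rB >>= 32 (logical)
  };
  std::printf("fold into SUBREG_TO_REG: %s\n",
              matchesZExtSeq(Insns, 2) ? "yes" : "no");
  return 0;
}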
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
index 6f7067816098..635c11113151 100644
--- a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -37,8 +37,8 @@ BPFRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- Reserved.set(BPF::R10); // R10 is read only frame pointer
- Reserved.set(BPF::R11); // R11 is pseudo stack pointer
+ markSuperRegs(Reserved, BPF::W10); // [W|R]10 is read only frame pointer
+ markSuperRegs(Reserved, BPF::W11); // [W|R]11 is pseudo stack pointer
return Reserved;
}
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
index 4202850e9eb9..bb0d6bcf5450 100644
--- a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
@@ -29,6 +29,8 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
diff --git a/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..24d5f59bbfd7
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -0,0 +1,43 @@
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ // Requires the copy size to be a constant.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+
+ unsigned CopyLen = ConstantSize->getZExtValue();
+ unsigned StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align);
+ // Impose the same copy length limit as MaxStoresPerMemcpy.
+ if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+ DAG.getConstant(CopyLen, dl, MVT::i64),
+ DAG.getConstant(Align, dl, MVT::i64));
+
+ return Dst.getValue(0);
+}
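EmitTargetCodeForMemcpy above only fires for constant sizes and bails out when the estimated number of aligned copies, alignTo(CopyLen, Align) >> Log2_32(Align), exceeds the 128-store budget shared with the generic path. The estimate in isolation, as a standalone check:

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstdint>

// alignTo(CopyLen, Align) >> Log2_32(Align): number of Align-sized copies
// needed to cover CopyLen bytes, with the tail rounded up to a full unit.
static uint64_t estimateStores(uint64_t CopyLen, uint64_t Align) {
  return (CopyLen + Align - 1) / Align;
}

static bool fitsInOrderExpansion(uint64_t CopyLen, uint64_t Align,
                                 uint64_t Budget = 128) {
  return estimateStores(CopyLen, Align) <= Budget;
}

int main() {
  assert(estimateStores(23, 8) == 3);     // two full copies plus a rounded tail
  assert(fitsInOrderExpansion(1024, 8));  // exactly 128 copies, still expanded
  assert(!fitsInOrderExpansion(1032, 8)); // 129 copies, fall back
  return 0;
}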
diff --git a/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
new file mode 100644
index 000000000000..19d3c5769573
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+
+ unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
index 42ca87f9ef67..56780bd9d46f 100644
--- a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -30,11 +30,14 @@ BPFSubtarget &BPFSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
initializeEnvironment();
initSubtargetFeatures(CPU, FS);
+ ParseSubtargetFeatures(CPU, FS);
return *this;
}
void BPFSubtarget::initializeEnvironment() {
HasJmpExt = false;
+ HasAlu32 = false;
+ UseDwarfRIS = false;
}
void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
index fa1f24443bc3..60e56435fe4c 100644
--- a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -17,6 +17,7 @@
#include "BPFFrameLowering.h"
#include "BPFISelLowering.h"
#include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -33,7 +34,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
BPFInstrInfo InstrInfo;
BPFFrameLowering FrameLowering;
BPFTargetLowering TLInfo;
- SelectionDAGTargetInfo TSInfo;
+ BPFSelectionDAGInfo TSInfo;
private:
void initializeEnvironment();
@@ -47,6 +48,12 @@ protected:
// whether the cpu supports jmp ext
bool HasJmpExt;
+ // whether the cpu supports alu32 instructions.
+ bool HasAlu32;
+
+ // whether we should enable MCAsmInfo DwarfUsesRelocationsAcrossSections
+ bool UseDwarfRIS;
+
public:
// This constructor initializes the data members to match that
// of the specified triple.
@@ -59,6 +66,8 @@ public:
// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
bool getHasJmpExt() const { return HasJmpExt; }
+ bool getHasAlu32() const { return HasAlu32; }
+ bool getUseDwarfRIS() const { return UseDwarfRIS; }
const BPFInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const BPFFrameLowering *getFrameLowering() const override {
@@ -67,7 +76,7 @@ public:
const BPFTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const TargetRegisterInfo *getRegisterInfo() const override {
diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 60672fa2684b..84d89bff74fe 100644
--- a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -13,6 +13,7 @@
#include "BPFTargetMachine.h"
#include "BPF.h"
+#include "MCTargetDesc/BPFMCAsmInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -22,11 +23,18 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+static cl::
+opt<bool> DisableMIPeephole("disable-bpf-peephole", cl::Hidden,
+ cl::desc("Disable machine peepholes for BPF"));
+
extern "C" void LLVMInitializeBPFTarget() {
// Register the target.
RegisterTargetMachine<BPFTargetMachine> X(getTheBPFleTarget());
RegisterTargetMachine<BPFTargetMachine> Y(getTheBPFbeTarget());
RegisterTargetMachine<BPFTargetMachine> Z(getTheBPFTarget());
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeBPFMIPeepholePass(PR);
}
// DataLayout: little or big endian
@@ -61,6 +69,9 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
TLOF(make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
+
+ BPFMCAsmInfo *MAI = static_cast<BPFMCAsmInfo *>(const_cast<MCAsmInfo *>(AsmInfo));
+ MAI->setDwarfUsesRelocationsAcrossSections(!Subtarget.getUseDwarfRIS());
}
namespace {
// BPF Code Generator Pass Configuration Options.
@@ -74,6 +85,8 @@ public:
}
bool addInstSelector() override;
+ void addMachineSSAOptimization() override;
+ void addPreEmitPass() override;
};
}
@@ -88,3 +101,21 @@ bool BPFPassConfig::addInstSelector() {
return false;
}
+
+void BPFPassConfig::addMachineSSAOptimization() {
+ // The default implementation must be called first, as we want the eBPF
+ // peephole pass to run last.
+ TargetPassConfig::addMachineSSAOptimization();
+
+ const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
+ if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+ addPass(createBPFMIPeepholePass());
+}
+
+void BPFPassConfig::addPreEmitPass() {
+ const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
+
+ if (getOptLevel() != CodeGenOpt::None)
+ if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+ addPass(createBPFMIPreEmitPeepholePass());
+}
diff --git a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 6fc87d79c439..e7790ddb3d7e 100644
--- a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -35,6 +35,34 @@ namespace {
/// A disassembler class for BPF.
class BPFDisassembler : public MCDisassembler {
public:
+ enum BPF_CLASS {
+ BPF_LD = 0x0,
+ BPF_LDX = 0x1,
+ BPF_ST = 0x2,
+ BPF_STX = 0x3,
+ BPF_ALU = 0x4,
+ BPF_JMP = 0x5,
+ BPF_RES = 0x6,
+ BPF_ALU64 = 0x7
+ };
+
+ enum BPF_SIZE {
+ BPF_W = 0x0,
+ BPF_H = 0x1,
+ BPF_B = 0x2,
+ BPF_DW = 0x3
+ };
+
+ enum BPF_MODE {
+ BPF_IMM = 0x0,
+ BPF_ABS = 0x1,
+ BPF_IND = 0x2,
+ BPF_MEM = 0x3,
+ BPF_LEN = 0x4,
+ BPF_MSH = 0x5,
+ BPF_XADD = 0x6
+ };
+
BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
~BPFDisassembler() override = default;
@@ -43,6 +71,10 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
+
+ uint8_t getInstClass(uint64_t Inst) const { return (Inst >> 56) & 0x7; };
+ uint8_t getInstSize(uint64_t Inst) const { return (Inst >> 59) & 0x3; };
+ uint8_t getInstMode(uint64_t Inst) const { return (Inst >> 61) & 0x7; };
};
} // end anonymous namespace
@@ -141,8 +173,17 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Result = readInstruction64(Bytes, Address, Size, Insn, IsLittleEndian);
if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
- Result = decodeInstruction(DecoderTableBPF64, Instr, Insn,
- Address, this, STI);
+ uint8_t InstClass = getInstClass(Insn);
+ if ((InstClass == BPF_LDX || InstClass == BPF_STX) &&
+ getInstSize(Insn) != BPF_DW &&
+ getInstMode(Insn) == BPF_MEM &&
+ STI.getFeatureBits()[BPF::ALU32])
+ Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address,
+ this, STI);
+ else
+ Result = decodeInstruction(DecoderTableBPF64, Instr, Insn, Address, this,
+ STI);
+
if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
switch (Instr.getOpcode()) {
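
The getInstClass/getInstSize/getInstMode accessors added above read fixed bit fields from the top byte of the 64-bit instruction word. A standalone sketch of the same bit arithmetic, using a hypothetical sample opcode byte rather than a real decoder-table lookup:

#include <cstdint>
#include <iostream>

// The top byte of the 64-bit instruction word holds the BPF opcode:
//   bits 0-2: instruction class, bits 3-4: size, bits 5-7: mode.
static uint8_t instClass(uint64_t Inst) { return (Inst >> 56) & 0x7; }
static uint8_t instSize(uint64_t Inst)  { return (Inst >> 59) & 0x3; }
static uint8_t instMode(uint64_t Inst)  { return (Inst >> 61) & 0x7; }

int main() {
  // Hypothetical sample: opcode byte 0x61 = BPF_MEM<<5 | BPF_W<<3 | BPF_LDX,
  // i.e. a 32-bit memory load, exactly the kind of instruction the new check
  // routes to the ALU32 decoder table when the alu32 feature is enabled.
  uint64_t Inst = uint64_t(0x61) << 56;
  std::cout << "class=" << unsigned(instClass(Inst))   // 1 (BPF_LDX)
            << " size=" << unsigned(instSize(Inst))    // 0 (BPF_W)
            << " mode=" << unsigned(instMode(Inst))    // 3 (BPF_MEM)
            << "\n";
}
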
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
index 1f4ef098403d..20627da38817 100644
--- a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "BPFInstPrinter.h"
-#include "BPF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 6593d9d018fd..6c255e9ef780 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -12,6 +12,7 @@
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/EndianStream.h"
#include <cassert>
#include <cstdint>
@@ -21,18 +22,16 @@ namespace {
class BPFAsmBackend : public MCAsmBackend {
public:
- bool IsLittleEndian;
-
- BPFAsmBackend(bool IsLittleEndian)
- : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
+ BPFAsmBackend(support::endianness Endian) : MCAsmBackend(Endian) {}
~BPFAsmBackend() override = default;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
// No instruction requires relaxation
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -43,22 +42,25 @@ public:
unsigned getNumFixupKinds() const override { return 1; }
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
-bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
if ((Count % 8) != 0)
return false;
for (uint64_t i = 0; i < Count; i += 8)
- OW->write64(0x15000000);
+ support::endian::write<uint64_t>(OS, 0x15000000, Endian);
return true;
}
@@ -66,19 +68,17 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
- } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
- unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
-
- for (unsigned i = 0; i != Size; ++i) {
- unsigned Idx = IsLittleEndian ? i : Size - i - 1;
- Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
- }
+ } else if (Fixup.getKind() == FK_Data_4) {
+ support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian);
+ } else if (Fixup.getKind() == FK_Data_8) {
+ support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian);
} else if (Fixup.getKind() == FK_PCRel_4) {
Value = (uint32_t)((Value - 8) / 8);
- if (IsLittleEndian) {
+ if (Endian == support::little) {
Data[Fixup.getOffset() + 1] = 0x10;
support::endian::write32le(&Data[Fixup.getOffset() + 4], Value);
} else {
@@ -88,31 +88,26 @@ void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
} else {
assert(Fixup.getKind() == FK_PCRel_2);
Value = (uint16_t)((Value - 8) / 8);
- if (IsLittleEndian) {
- Data[Fixup.getOffset() + 2] = Value & 0xFF;
- Data[Fixup.getOffset() + 3] = Value >> 8;
- } else {
- Data[Fixup.getOffset() + 2] = Value >> 8;
- Data[Fixup.getOffset() + 3] = Value & 0xFF;
- }
+ support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value,
+ Endian);
}
}
-std::unique_ptr<MCObjectWriter>
-BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createBPFELFObjectWriter(OS, 0, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+BPFAsmBackend::createObjectTargetWriter() const {
+ return createBPFELFObjectWriter(0);
}
MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &) {
- return new BPFAsmBackend(/*IsLittleEndian=*/true);
+ return new BPFAsmBackend(support::little);
}
MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &) {
- return new BPFAsmBackend(/*IsLittleEndian=*/false);
+ return new BPFAsmBackend(support::big);
}
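
The rewritten applyFixup replaces the hand-rolled byte loops with single endian-parameterized writes. A self-contained sketch of the equivalent behaviour, without the LLVM support headers (writeFixup is a made-up name used only for this illustration):

#include <cstdint>
#include <iostream>

enum class Endianness { Little, Big };

// Standalone equivalent of an endian-parameterized fixup write: emit the
// low Size bytes of Value at Buf in the requested byte order, as the
// single write<uintN_t>(..., Endian) calls in the patch do.
static void writeFixup(uint8_t *Buf, uint64_t Value, unsigned Size,
                       Endianness Endian) {
  for (unsigned i = 0; i != Size; ++i) {
    unsigned Idx = (Endian == Endianness::Little) ? i : Size - i - 1;
    Buf[Idx] = uint8_t(Value >> (i * 8));
  }
}

int main() {
  uint8_t LE[4] = {}, BE[4] = {};
  writeFixup(LE, 0x11223344, 4, Endianness::Little);
  writeFixup(BE, 0x11223344, 4, Endianness::Big);
  // LE: 44 33 22 11   BE: 11 22 33 44
  for (uint8_t B : LE) std::cout << std::hex << unsigned(B) << ' ';
  std::cout << "| ";
  for (uint8_t B : BE) std::cout << std::hex << unsigned(B) << ' ';
  std::cout << "\n";
}
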
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index f7de612dab15..134e890dfe49 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -54,9 +54,7 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createBPFELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool IsLittleEndian) {
- return createELFObjectWriter(llvm::make_unique<BPFELFObjectWriter>(OSABI), OS,
- IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createBPFELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<BPFELFObjectWriter>(OSABI);
}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index fd7c97bf1f0a..171f7f607ff4 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -44,6 +44,10 @@ public:
// line numbers, etc.
CodePointerSize = 8;
}
+
+ void setDwarfUsesRelocationsAcrossSections(bool enable) {
+ DwarfUsesRelocationsAcrossSections = enable;
+ }
};
}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index b4ecfdee7bff..437f658caf6e 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -122,44 +122,35 @@ void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
unsigned Opcode = MI.getOpcode();
- support::endian::Writer<support::little> LE(OS);
- support::endian::Writer<support::big> BE(OS);
+ support::endian::Writer OSE(OS,
+ IsLittleEndian ? support::little : support::big);
if (Opcode == BPF::LD_imm64 || Opcode == BPF::LD_pseudo) {
uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
- LE.write<uint8_t>(Value >> 56);
+ OS << char(Value >> 56);
if (IsLittleEndian)
- LE.write<uint8_t>((Value >> 48) & 0xff);
+ OS << char((Value >> 48) & 0xff);
else
- LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
- LE.write<uint16_t>(0);
- if (IsLittleEndian)
- LE.write<uint32_t>(Value & 0xffffFFFF);
- else
- BE.write<uint32_t>(Value & 0xffffFFFF);
+ OS << char(SwapBits((Value >> 48) & 0xff));
+ OSE.write<uint16_t>(0);
+ OSE.write<uint32_t>(Value & 0xffffFFFF);
const MCOperand &MO = MI.getOperand(1);
uint64_t Imm = MO.isImm() ? MO.getImm() : 0;
- LE.write<uint8_t>(0);
- LE.write<uint8_t>(0);
- LE.write<uint16_t>(0);
- if (IsLittleEndian)
- LE.write<uint32_t>(Imm >> 32);
- else
- BE.write<uint32_t>(Imm >> 32);
+ OSE.write<uint8_t>(0);
+ OSE.write<uint8_t>(0);
+ OSE.write<uint16_t>(0);
+ OSE.write<uint32_t>(Imm >> 32);
} else {
// Get instruction encoding and emit it
uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
- LE.write<uint8_t>(Value >> 56);
- if (IsLittleEndian) {
- LE.write<uint8_t>((Value >> 48) & 0xff);
- LE.write<uint16_t>((Value >> 32) & 0xffff);
- LE.write<uint32_t>(Value & 0xffffFFFF);
- } else {
- LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
- BE.write<uint16_t>((Value >> 32) & 0xffff);
- BE.write<uint32_t>(Value & 0xffffFFFF);
- }
+ OS << char(Value >> 56);
+ if (IsLittleEndian)
+ OS << char((Value >> 48) & 0xff);
+ else
+ OS << char(SwapBits((Value >> 48) & 0xff));
+ OSE.write<uint16_t>((Value >> 32) & 0xffff);
+ OSE.write<uint32_t>(Value & 0xffffFFFF);
}
}
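
The unified Writer above always emits the opcode and register bytes directly and lets the endian-aware writes handle the 16-bit offset and 32-bit immediate. A simplified standalone model of that layout follows; it omits the dst/src nibble swap the real emitter applies to the register byte on big-endian targets, and the sample encoding is hypothetical:

#include <cstdint>
#include <iostream>
#include <vector>

enum class Endianness { Little, Big };

// Write an integral field of Size bytes in the target byte order.
static void writeField(std::vector<uint8_t> &Out, uint64_t V, unsigned Size,
                       Endianness Endian) {
  for (unsigned i = 0; i != Size; ++i) {
    unsigned Shift = (Endian == Endianness::Little) ? i : Size - i - 1;
    Out.push_back(uint8_t(V >> (Shift * 8)));
  }
}

// Sketch of the non-LD_imm64 path: the 64-bit encoding is split into an
// opcode byte, a register byte, a 16-bit offset and a 32-bit immediate,
// with only the multi-byte fields swapped for big-endian output.
static std::vector<uint8_t> emitInsn(uint64_t Value, Endianness Endian) {
  std::vector<uint8_t> Out;
  Out.push_back(uint8_t(Value >> 56));                  // opcode
  Out.push_back(uint8_t((Value >> 48) & 0xff));         // dst/src registers
  writeField(Out, (Value >> 32) & 0xffff, 2, Endian);   // 16-bit offset
  writeField(Out, Value & 0xffffffff, 4, Endian);       // 32-bit immediate
  return Out;
}

int main() {
  // Hypothetical "opcode 0x07, dst r1, imm 8" style encoding.
  for (uint8_t B : emitInsn(0x0701000000000008ULL, Endianness::Little))
    std::cout << std::hex << unsigned(B) << ' ';
  std::cout << "\n"; // 7 1 0 0 8 0 0 0
}
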
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index cbf1ea7d7fb8..834b57527882 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -52,10 +52,10 @@ static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
static MCStreamer *createBPFMCStreamer(const Triple &T, MCContext &Ctx,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createELFStreamer(Ctx, std::move(MAB), OS, std::move(Emitter),
+ return createELFStreamer(Ctx, std::move(MAB), std::move(OW), std::move(Emitter),
RelaxAll);
}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index a6dac3abca02..6d2f0a1601e6 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -52,9 +52,7 @@ MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter> createBPFELFObjectWriter(raw_pwrite_stream &OS,
- uint8_t OSABI,
- bool IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter> createBPFELFObjectWriter(uint8_t OSABI);
}
// Defines symbolic names for BPF registers. This defines a mapping from
diff --git a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 387296c69c39..92bda224f3dc 100644
--- a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -118,7 +118,6 @@ class HexagonAsmParser : public MCTargetAsmParser {
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
bool ParseDirectiveSubsection(SMLoc L);
- bool ParseDirectiveValue(unsigned Size, SMLoc L);
bool ParseDirectiveComm(bool IsLocal, SMLoc L);
bool RegisterMatchesArch(unsigned MatchNum) const;
@@ -165,6 +164,10 @@ public:
MCB.setOpcode(Hexagon::BUNDLE);
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ Parser.addAliasForDirective(".half", ".2byte");
+ Parser.addAliasForDirective(".hword", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+
MCAsmParserExtension::Initialize(_Parser);
}
@@ -462,9 +465,9 @@ void HexagonOperand::print(raw_ostream &OS) const {
}
bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
- DEBUG(dbgs() << "Bundle:");
- DEBUG(MCB.dump_pretty(dbgs()));
- DEBUG(dbgs() << "--\n");
+ LLVM_DEBUG(dbgs() << "Bundle:");
+ LLVM_DEBUG(MCB.dump_pretty(dbgs()));
+ LLVM_DEBUG(dbgs() << "--\n");
MCB.setLoc(IDLoc);
// Check the bundle for errors.
@@ -506,16 +509,19 @@ bool HexagonAsmParser::matchBundleOptions() {
"supported with this architecture";
StringRef Option = Parser.getTok().getString();
auto IDLoc = Parser.getTok().getLoc();
- if (Option.compare_lower("endloop0") == 0)
+ if (Option.compare_lower("endloop01") == 0) {
+ HexagonMCInstrInfo::setInnerLoop(MCB);
+ HexagonMCInstrInfo::setOuterLoop(MCB);
+ } else if (Option.compare_lower("endloop0") == 0) {
HexagonMCInstrInfo::setInnerLoop(MCB);
- else if (Option.compare_lower("endloop1") == 0)
+ } else if (Option.compare_lower("endloop1") == 0) {
HexagonMCInstrInfo::setOuterLoop(MCB);
- else if (Option.compare_lower("mem_noshuf") == 0)
+ } else if (Option.compare_lower("mem_noshuf") == 0) {
if (getSTI().getFeatureBits()[Hexagon::FeatureMemNoShuf])
HexagonMCInstrInfo::setMemReorderDisabled(MCB);
else
return getParser().Error(IDLoc, MemNoShuffMsg);
- else
+ } else
return getParser().Error(IDLoc, llvm::Twine("'") + Option +
"' is not a valid bundle option");
Lex();
@@ -554,9 +560,9 @@ bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
canonicalizeImmediates(MCI);
result = processInstruction(MCI, InstOperands, IDLoc);
- DEBUG(dbgs() << "Insn:");
- DEBUG(MCI.dump_pretty(dbgs()));
- DEBUG(dbgs() << "\n\n");
+ LLVM_DEBUG(dbgs() << "Insn:");
+ LLVM_DEBUG(MCI.dump_pretty(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n\n");
MCI.setLoc(IDLoc);
}
@@ -648,11 +654,6 @@ bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
/// ParseDirective parses the Hexagon specific directives
bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
- if ((IDVal.lower() == ".word") || (IDVal.lower() == ".4byte"))
- return ParseDirectiveValue(4, DirectiveID.getLoc());
- if (IDVal.lower() == ".short" || IDVal.lower() == ".hword" ||
- IDVal.lower() == ".half")
- return ParseDirectiveValue(2, DirectiveID.getLoc());
if (IDVal.lower() == ".falign")
return ParseDirectiveFalign(256, DirectiveID.getLoc());
if ((IDVal.lower() == ".lcomm") || (IDVal.lower() == ".lcommon"))
@@ -720,39 +721,6 @@ bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) {
return false;
}
-/// ::= .word [ expression (, expression)* ]
-bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- while (true) {
- const MCExpr *Value;
- SMLoc ExprLoc = L;
- if (getParser().parseExpression(Value))
- return true;
-
- // Special case constant expressions to match code generator.
- if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
- assert(Size <= 8 && "Invalid size");
- uint64_t IntValue = MCE->getValue();
- if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
- return Error(ExprLoc, "literal value out of range for directive");
- getStreamer().EmitIntValue(IntValue, Size);
- } else
- getStreamer().EmitValue(Value, Size);
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- // FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
- }
- }
-
- Lex();
- return false;
-}
-
// This is largely a copy of AsmParser's ParseDirectiveComm extended to
// accept a 3rd argument, AccessAlignment which indicates the smallest
// memory access made to the symbol, expressed in bytes. If no
@@ -1293,9 +1261,9 @@ unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
return Match_Success;
}
- DEBUG(dbgs() << "Unmatched Operand:");
- DEBUG(Op->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Unmatched Operand:");
+ LLVM_DEBUG(Op->dump());
+ LLVM_DEBUG(dbgs() << "\n");
return Match_InvalidOperand;
}
@@ -1333,6 +1301,17 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
}
break;
+ case Hexagon::J2_trap1:
+ if (!getSTI().getFeatureBits()[Hexagon::ArchV65]) {
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &Ry = Inst.getOperand(1);
+ if (Rx.getReg() != Hexagon::R0 || Ry.getReg() != Hexagon::R0) {
+ Error(IDLoc, "trap1 can only have register r0 as operand");
+ return Match_InvalidOperand;
+ }
+ }
+ break;
+
case Hexagon::A2_iconst: {
Inst.setOpcode(Hexagon::A2_addi);
MCOperand Reg = Inst.getOperand(0);
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
index 15d6a05a0078..69529b0d1162 100644
--- a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
@@ -779,15 +779,18 @@ bool BT::UseQueueType::Cmp::operator()(const MachineInstr *InstA,
return BA->getNumber() > BB->getNumber();
}
- MachineBasicBlock::const_iterator ItA = InstA->getIterator();
- MachineBasicBlock::const_iterator ItB = InstB->getIterator();
- MachineBasicBlock::const_iterator End = BA->end();
- while (ItA != End) {
- if (ItA == ItB)
- return false; // ItA was before ItB.
- ++ItA;
- }
- return true;
+ auto getDist = [this] (const MachineInstr *MI) {
+ auto F = Dist.find(MI);
+ if (F != Dist.end())
+ return F->second;
+ MachineBasicBlock::const_iterator I = MI->getParent()->begin();
+ MachineBasicBlock::const_iterator E = MI->getIterator();
+ unsigned D = std::distance(I, E);
+ Dist.insert(std::make_pair(MI, D));
+ return D;
+ };
+
+ return getDist(InstA) > getDist(InstB);
}
// Main W-Z implementation.
@@ -840,7 +843,7 @@ void BT::visitPHI(const MachineInstr &PI) {
void BT::visitNonBranch(const MachineInstr &MI) {
if (Trace)
dbgs() << "Visit MI(" << printMBBReference(*MI.getParent()) << "): " << MI;
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return;
assert(!MI.isBranch() && "Unexpected branch instruction");
@@ -1138,6 +1141,7 @@ void BT::run() {
runEdgeQueue(BlockScanned);
runUseQueue();
}
+ UseQ.reset();
if (Trace)
print_cells(dbgs() << "Cells after propagation:\n");
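
The new comparator orders the use queue by each instruction's cached distance from the start of its block instead of walking the block on every comparison. A standalone sketch of the same pattern, a priority queue whose comparator reads a shared memoized-position map (names here are illustrative only):

#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

// Each item's position is computed once, cached in a shared map, and the
// queue pops items in ascending position: larger distance sorts later,
// matching the getDist(InstA) > getDist(InstB) ordering in the patch.
struct Cmp {
  explicit Cmp(std::unordered_map<std::string, unsigned> &M) : Dist(M) {}
  bool operator()(const std::string &A, const std::string &B) const {
    return Dist.at(A) > Dist.at(B); // larger distance = lower priority
  }
  std::unordered_map<std::string, unsigned> &Dist;
};

int main() {
  std::unordered_map<std::string, unsigned> Dist;
  Cmp C(Dist);
  std::priority_queue<std::string, std::vector<std::string>, Cmp> Q(C);
  // Positions would normally be filled in lazily; set them up front here.
  Dist = {{"add", 0}, {"mul", 3}, {"store", 7}};
  Q.push("store"); Q.push("add"); Q.push("mul");
  while (!Q.empty()) { std::cout << Q.top() << "\n"; Q.pop(); } // add, mul, store
}
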
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
index 5df6b61710f6..058225c0d812 100644
--- a/contrib/llvm/lib/Target/Hexagon/BitTracker.h
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
@@ -13,6 +13,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include <cassert>
#include <cstdint>
@@ -28,7 +29,6 @@ class ConstantInt;
class MachineRegisterInfo;
class MachineBasicBlock;
class MachineFunction;
-class MachineInstr;
class raw_ostream;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -73,6 +73,8 @@ private:
// Priority queue of instructions using modified registers, ordered by
// their relative position in a basic block.
struct UseQueueType {
+ UseQueueType() : Uses(Dist) {}
+
unsigned size() const {
return Uses.size();
}
@@ -90,12 +92,18 @@ private:
Set.erase(front());
Uses.pop();
}
+ void reset() {
+ Dist.clear();
+ }
private:
struct Cmp {
+ Cmp(DenseMap<const MachineInstr*,unsigned> &Map) : Dist(Map) {}
bool operator()(const MachineInstr *MI, const MachineInstr *MJ) const;
+ DenseMap<const MachineInstr*,unsigned> &Dist;
};
std::priority_queue<MachineInstr*, std::vector<MachineInstr*>, Cmp> Uses;
- DenseSet<MachineInstr*> Set; // Set to avoid adding duplicate entries.
+ DenseSet<const MachineInstr*> Set; // Set to avoid adding duplicate entries.
+ DenseMap<const MachineInstr*,unsigned> Dist;
};
void reset();
diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 481b692ae8bf..1a619ebda84e 100644
--- a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -40,7 +40,7 @@ using DecodeStatus = MCDisassembler::DecodeStatus;
namespace {
-/// \brief Hexagon disassembler for all Hexagon platforms.
+/// Hexagon disassembler for all Hexagon platforms.
class HexagonDisassembler : public MCDisassembler {
public:
std::unique_ptr<MCInstrInfo const> const MCII;
@@ -127,12 +127,18 @@ static DecodeStatus DecodeHvxQRRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
uint64_t Address, const void *Decoder);
@@ -783,3 +789,55 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
return MCDisassembler::Success;
}
+
+static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ using namespace Hexagon;
+
+ static const MCPhysReg GuestRegDecoderTable[] = {
+ /* 0 */ GELR, GSR, GOSP, G3,
+ /* 4 */ G4, G5, G6, G7,
+ /* 8 */ G8, G9, G10, G11,
+ /* 12 */ G12, G13, G14, G15,
+ /* 16 */ GPMUCNT4, GPMUCNT5, GPMUCNT6, GPMUCNT7,
+ /* 20 */ G20, G21, G22, G23,
+ /* 24 */ GPCYCLELO, GPCYCLEHI, GPMUCNT0, GPMUCNT1,
+ /* 28 */ GPMUCNT2, GPMUCNT3, G30, G31
+ };
+
+ if (RegNo >= array_lengthof(GuestRegDecoderTable))
+ return MCDisassembler::Fail;
+ if (GuestRegDecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GuestRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ using namespace Hexagon;
+
+ static const MCPhysReg GuestReg64DecoderTable[] = {
+ /* 0 */ G1_0, 0, G3_2, 0,
+ /* 4 */ G5_4, 0, G7_6, 0,
+ /* 8 */ G9_8, 0, G11_10, 0,
+ /* 12 */ G13_12, 0, G15_14, 0,
+ /* 16 */ G17_16, 0, G19_18, 0,
+ /* 20 */ G21_20, 0, G23_22, 0,
+ /* 24 */ G25_24, 0, G27_26, 0,
+ /* 28 */ G29_28, 0, G31_30, 0
+ };
+
+ if (RegNo >= array_lengthof(GuestReg64DecoderTable))
+ return MCDisassembler::Fail;
+ if (GuestReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GuestReg64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
index 66b387b62c6c..6ec52d18cdc4 100644
--- a/contrib/llvm/lib/Target/Hexagon/Hexagon.h
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
@@ -49,7 +49,7 @@
namespace llvm {
class HexagonTargetMachine;
- /// \brief Creates a Hexagon-specific Target Transformation Info pass.
+ /// Creates a Hexagon-specific Target Transformation Info pass.
ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM);
} // end namespace llvm;
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
index 6292e2a7a4ea..69e263a425f8 100644
--- a/contrib/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
@@ -36,32 +36,36 @@ def ExtensionHVXV62: SubtargetFeature<"hvxv62", "HexagonHVXVersion",
def ExtensionHVXV65: SubtargetFeature<"hvxv65", "HexagonHVXVersion",
"Hexagon::ArchEnum::V65", "Hexagon HVX instructions",
[ExtensionHVX,ExtensionHVXV60, ExtensionHVXV62]>;
-def ExtensionHVX64B
- : SubtargetFeature<"hvx-length64b", "UseHVX64BOps", "true",
- "Hexagon HVX 64B instructions", [ExtensionHVX]>;
-def ExtensionHVX128B
- : SubtargetFeature<"hvx-length128b", "UseHVX128BOps", "true",
- "Hexagon HVX 128B instructions", [ExtensionHVX]>;
-
-// This is an alias to ExtensionHVX128B to accept the hvx-double as
-// an acceptable subtarget feature.
-def ExtensionHVXDbl
- : SubtargetFeature<"hvx-double", "UseHVX128BOps", "true",
- "Hexagon HVX 128B instructions", [ExtensionHVX128B]>;
+def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",
+ "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;
+def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps",
+ "true", "Hexagon HVX 128B instructions", [ExtensionHVX]>;
+
+def FeaturePackets: SubtargetFeature<"packets", "UsePackets", "true",
+ "Support for instruction packets">;
def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
"Use constant-extended calls">;
def FeatureMemNoShuf: SubtargetFeature<"mem_noshuf", "HasMemNoShuf", "false",
"Supports mem_noshuf feature">;
-def FeatureDuplex : SubtargetFeature<"duplex", "EnableDuplex", "true",
+def FeatureMemops: SubtargetFeature<"memops", "UseMemops", "true",
+ "Use memop instructions">;
+def FeatureNVJ: SubtargetFeature<"nvj", "UseNewValueJumps", "true",
+ "Support for new-value jumps", [FeaturePackets]>;
+def FeatureNVS: SubtargetFeature<"nvs", "UseNewValueStores", "true",
+ "Support for new-value stores", [FeaturePackets]>;
+def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true",
+ "Allow GP-relative addressing of global variables">;
+def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true",
"Enable generation of duplex instruction">;
+def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19",
+ "true", "Reserve register R19">;
//===----------------------------------------------------------------------===//
// Hexagon Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
-def UseMEMOP : Predicate<"HST->useMemOps()">;
-def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
+def UseMEMOPS : Predicate<"HST->useMemops()">;
def UseHVX64B : Predicate<"HST->useHVX64BOps()">,
AssemblerPredicate<"ExtensionHVX64B">;
def UseHVX128B : Predicate<"HST->useHVX128BOps()">,
@@ -75,10 +79,8 @@ def UseHVXV62 : Predicate<"HST->useHVXOps()">,
def UseHVXV65 : Predicate<"HST->useHVXOps()">,
AssemblerPredicate<"ExtensionHVXV65">;
-def Hvx64 : HwMode<"+hvx-length64b">;
-def Hvx64old : HwMode<"-hvx-double">;
-def Hvx128 : HwMode<"+hvx-length128b">;
-def Hvx128old : HwMode<"+hvx-double">;
+def Hvx64: HwMode<"+hvx-length64b">;
+def Hvx128: HwMode<"+hvx-length128b">;
//===----------------------------------------------------------------------===//
// Classes used for relation maps.
@@ -300,8 +302,10 @@ include "HexagonDepITypes.td"
include "HexagonInstrFormats.td"
include "HexagonDepInstrFormats.td"
include "HexagonDepInstrInfo.td"
+include "HexagonCallingConv.td"
include "HexagonPseudo.td"
include "HexagonPatterns.td"
+include "HexagonPatternsHVX.td"
include "HexagonPatternsV65.td"
include "HexagonDepMappings.td"
include "HexagonIntrinsics.td"
@@ -318,19 +322,34 @@ class Proc<string Name, SchedMachineModel Model,
list<SubtargetFeature> Features>
: ProcessorModel<Name, Model, Features>;
+def : Proc<"generic", HexagonModelV60,
+ [ArchV4, ArchV5, ArchV55, ArchV60,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv4", HexagonModelV4,
- [ArchV4, FeatureDuplex]>;
+ [ArchV4,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv5", HexagonModelV4,
- [ArchV4, ArchV5, FeatureDuplex]>;
+ [ArchV4, ArchV5,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv55", HexagonModelV55,
- [ArchV4, ArchV5, ArchV55, FeatureDuplex]>;
+ [ArchV4, ArchV5, ArchV55,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv60", HexagonModelV60,
- [ArchV4, ArchV5, ArchV55, ArchV60, FeatureDuplex]>;
+ [ArchV4, ArchV5, ArchV55, ArchV60,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv62", HexagonModelV62,
- [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, FeatureDuplex]>;
+ [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62,
+ FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
+ FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv65", HexagonModelV65,
[ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
- FeatureMemNoShuf, FeatureDuplex]>;
+ FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
+ FeatureNVS, FeaturePackets, FeatureSmallData]>;
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
@@ -357,4 +376,5 @@ def Hexagon : Target {
let AssemblyParsers = [HexagonAsmParser];
let AssemblyParserVariants = [HexagonAsmParserVariant];
let AssemblyWriters = [HexagonAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 68b1fe6bf4b1..0ac83ea7c5fc 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -62,10 +62,6 @@ void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
#define DEBUG_TYPE "asm-printer"
-static cl::opt<bool> AlignCalls(
- "hexagon-align-calls", cl::Hidden, cl::init(true),
- cl::desc("Insert falign after call instruction for Hexagon target"));
-
// Given a scalar register return its pair.
inline static unsigned getHexagonRegisterPair(unsigned Reg,
const MCRegisterInfo *RI) {
@@ -76,16 +72,13 @@ inline static unsigned getHexagonRegisterPair(unsigned Reg,
return Pair;
}
-HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
-
void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
- default: llvm_unreachable ("<unknown operand type>");
+ default:
+ llvm_unreachable ("<unknown operand type>");
case MachineOperand::MO_Register:
O << HexagonInstPrinter::getRegisterName(MO.getReg());
return;
@@ -112,8 +105,8 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
// for the case in which the basic block is reachable by a fall through but
// through an indirect from a jump table. In this case, the jump table
// will contain a label not defined by AsmPrinter.
-bool HexagonAsmPrinter::
-isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+bool HexagonAsmPrinter::isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock *MBB) const {
if (MBB->hasAddressTaken())
return false;
return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
@@ -167,7 +160,8 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+ unsigned OpNo,
+ unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0])
@@ -183,10 +177,10 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
if (Offset.isImm()) {
if (Offset.getImm())
- O << " + #" << Offset.getImm();
- }
- else
+ O << "+#" << Offset.getImm();
+ } else {
llvm_unreachable("Unimplemented");
+ }
return false;
}
@@ -285,7 +279,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
unsigned VectorSize = HRI.getRegSizeInBits(Hexagon::HvxVRRegClass) / 8;
switch (Inst.getOpcode()) {
- default: return;
+ default:
+ return;
case Hexagon::A2_iconst: {
Inst.setOpcode(Hexagon::A2_addi);
@@ -300,30 +295,40 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
break;
}
- case Hexagon::A2_tfrf:
+ case Hexagon::A2_tfrf: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_paddif);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_tfrt:
+ case Hexagon::A2_tfrt: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_paddit);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_tfrfnew:
+ case Hexagon::A2_tfrfnew: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_paddifnew);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_tfrtnew:
+ case Hexagon::A2_tfrtnew: {
+ const MCConstantExpr *Zero = MCConstantExpr::create(0, OutContext);
Inst.setOpcode(Hexagon::A2_padditnew);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(Zero));
break;
+ }
- case Hexagon::A2_zxtb:
+ case Hexagon::A2_zxtb: {
+ const MCConstantExpr *C255 = MCConstantExpr::create(255, OutContext);
Inst.setOpcode(Hexagon::A2_andir);
- Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, OutContext)));
+ Inst.addOperand(MCOperand::createExpr(C255));
break;
+ }
// "$dst = CONST64(#$src1)",
case Hexagon::CONST64:
@@ -525,10 +530,12 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
bool Success = MO.getExpr()->evaluateAsAbsolute(Imm);
if (Success && Imm < 0) {
const MCExpr *MOne = MCConstantExpr::create(-1, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(MOne, OutContext)));
+ const HexagonMCExpr *E = HexagonMCExpr::create(MOne, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(E));
} else {
const MCExpr *Zero = MCConstantExpr::create(0, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(Zero, OutContext)));
+ const HexagonMCExpr *E = HexagonMCExpr::create(Zero, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(E));
}
TmpInst.addOperand(MO);
MappedInst = TmpInst;
@@ -569,9 +576,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MO.setReg(High);
// Add a new operand for the second register in the pair.
MappedInst.addOperand(MCOperand::createReg(Low));
- MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
- ? Hexagon::C2_ccombinewnewt
- : Hexagon::C2_ccombinewnewf);
+ MappedInst.setOpcode(Inst.getOpcode() == Hexagon::A2_tfrptnew
+ ? Hexagon::C2_ccombinewnewt
+ : Hexagon::C2_ccombinewnewf);
return;
}
@@ -615,6 +622,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MappedInst = TmpInst;
return;
}
+
case Hexagon::V6_vdd0: {
MCInst TmpInst;
assert (Inst.getOperand(0).isReg() &&
@@ -627,6 +635,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MappedInst = TmpInst;
return;
}
+
case Hexagon::V6_vL32Ub_pi:
case Hexagon::V6_vL32b_cur_pi:
case Hexagon::V6_vL32b_nt_cur_pi:
@@ -735,12 +744,10 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
case Hexagon::V6_vS32b_srls_pi:
MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
return;
-
}
}
-/// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to
-/// the current output stream.
+/// Print out a single Hexagon MI to the current output stream.
void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst MCB;
MCB.setOpcode(Hexagon::BUNDLE);
@@ -748,21 +755,27 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCInstrInfo &MCII = *Subtarget->getInstrInfo();
if (MI->isBundle()) {
+ assert(Subtarget->usePackets() && "Support for packets is disabled");
const MachineBasicBlock* MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
- if (!MII->isDebugValue() && !MII->isImplicitDef())
+ if (!MII->isDebugInstr() && !MII->isImplicitDef())
HexagonLowerToMC(MCII, &*MII, MCB, *this);
- }
- else
+ } else {
HexagonLowerToMC(MCII, MI, MCB, *this);
+ }
+
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ if (MI->isBundle() && HII.getBundleNoShuf(*MI))
+ HexagonMCInstrInfo::setMemReorderDisabled(MCB);
- bool Ok = HexagonMCInstrInfo::canonicalizePacket(
- MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr);
- assert(Ok);
- (void)Ok;
- if(HexagonMCInstrInfo::bundleSize(MCB) == 0)
+ MCContext &Ctx = OutStreamer->getContext();
+ bool Ok = HexagonMCInstrInfo::canonicalizePacket(MCII, *Subtarget, Ctx,
+ MCB, nullptr);
+ assert(Ok); (void)Ok;
+ if (HexagonMCInstrInfo::bundleSize(MCB) == 0)
return;
OutStreamer->EmitInstruction(MCB, getSubtargetInfo());
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
index 4b8865672cf4..d0629d173a65 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -18,7 +18,8 @@
#include "HexagonSubtarget.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include <memory>
+#include "llvm/MC/MCStreamer.h"
+#include <utility>
namespace llvm {
@@ -32,7 +33,8 @@ class TargetMachine;
public:
explicit HexagonAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer);
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
bool runOnMachineFunction(MachineFunction &Fn) override {
Subtarget = &Fn.getSubtarget<HexagonSubtarget>();
@@ -43,13 +45,11 @@ class TargetMachine;
return "Hexagon Assembly Printer";
}
- bool isBlockOnlyReachableByFallthrough(
- const MachineBasicBlock *MBB) const override;
+ bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
+ const override;
void EmitInstruction(const MachineInstr *MI) override;
-
- void HexagonProcessInstruction(MCInst &Inst,
- const MachineInstr &MBB);
+ void HexagonProcessInstruction(MCInst &Inst, const MachineInstr &MBB);
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -58,8 +58,6 @@ class TargetMachine;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
-
- static const char *getRegisterName(unsigned RegNo);
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 9e73766b6fdc..4791b067aa8d 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -184,9 +184,7 @@ namespace {
public:
static char ID;
- HexagonBitSimplify() : MachineFunctionPass(ID) {
- initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry());
- }
+ HexagonBitSimplify() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon bit simplification";
@@ -257,10 +255,10 @@ namespace {
char HexagonBitSimplify::ID = 0;
-INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit",
+INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexagon-bit-simplify",
"Hexagon bit simplification", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
+INITIALIZE_PASS_END(HexagonBitSimplify, "hexagon-bit-simplify",
"Hexagon bit simplification", false, false)
bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
@@ -622,7 +620,7 @@ bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits,
// operand may be a subregister of a larger register, while Bits would
// correspond to the larger register in its entirety. Because of that,
// the parameter Begin can be used to indicate which bit of Bits should be
-// considered the LSB of of the operand.
+// considered the LSB of the operand.
bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN,
BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) {
using namespace Hexagon;
@@ -2452,7 +2450,7 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI,
if (Len == RW)
return false;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << __func__ << " on reg: " << printReg(RD.Reg, &HRI, RD.Sub)
<< ", MI: " << *MI;
dbgs() << "Cell: " << RC << '\n';
@@ -2646,7 +2644,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
const HexagonEvaluator HE(HRI, MRI, HII, MF);
BitTracker BT(HE, MF);
- DEBUG(BT.trace(true));
+ LLVM_DEBUG(BT.trace(true));
BT.run();
MachineBasicBlock &Entry = MF.front();
@@ -2977,7 +2975,8 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
}
bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
- DEBUG(dbgs() << "Processing loop in " << printMBBReference(*C.LB) << "\n");
+ LLVM_DEBUG(dbgs() << "Processing loop in " << printMBBReference(*C.LB)
+ << "\n");
std::vector<PhiInfo> Phis;
for (auto &I : *C.LB) {
if (!I.isPHI())
@@ -3001,7 +3000,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
Phis.push_back(PhiInfo(I, *C.LB));
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Phis: {";
for (auto &I : Phis) {
dbgs() << ' ' << printReg(I.DefR, HRI) << "=phi("
@@ -3122,7 +3121,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
Groups.push_back(G);
}
- DEBUG({
+ LLVM_DEBUG({
for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
InstrGroup &G = Groups[i];
dbgs() << "Group[" << i << "] inp: "
@@ -3190,7 +3189,7 @@ bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
BitTracker BT(HE, MF);
- DEBUG(BT.trace(true));
+ LLVM_DEBUG(BT.trace(true));
BT.run();
BTP = &BT;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
index b6e220beb0c6..e13cfd3f655a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -325,7 +325,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
int FI = op(1).getIndex();
int Off = op(2).getImm();
unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off);
- unsigned L = Log2_32(A);
+ unsigned L = countTrailingZeros(A);
RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
RC.fill(0, L, BT::BitValue::Zero);
return rr0(RC, Outputs);
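
The switch from Log2_32 to countTrailingZeros above matters when the alignment-plus-offset value is not a power of two: only its trailing zero bits are guaranteed to be zero in the resulting address. A small worked example with portable stand-ins for the two helpers:

#include <cstdint>
#include <iostream>

// Portable stand-ins for the two LLVM helpers being swapped above.
static unsigned log2_32(uint32_t V) {          // index of the highest set bit
  unsigned L = 0;
  while (V >>= 1) ++L;
  return L;
}
static unsigned ctz32(uint32_t V) {            // number of trailing zero bits
  unsigned C = 0;
  while (V && !(V & 1)) { V >>= 1; ++C; }
  return C;
}

int main() {
  // Frame object aligned to 8 bytes, accessed at offset 4:
  // the address is only guaranteed to have 2 zero low bits (...xxx100).
  uint32_t A = 8 + 4;
  std::cout << "Log2_32: " << log2_32(A) << "\n"; // 3, overestimates
  std::cout << "ctz:     " << ctz32(A)   << "\n"; // 2, matches the guarantee
}
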
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
index ff915ca59dae..48a4505458ae 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -85,7 +85,7 @@ void HexagonBlockRanges::RangeList::unionize(bool MergeAdjacent) {
if (empty())
return;
- std::sort(begin(), end());
+ llvm::sort(begin(), end());
iterator Iter = begin();
while (Iter != end()-1) {
@@ -160,7 +160,7 @@ HexagonBlockRanges::InstrIndexMap::InstrIndexMap(MachineBasicBlock &B)
IndexType Idx = IndexType::First;
First = Idx;
for (auto &In : B) {
- if (In.isDebugValue())
+ if (In.isDebugInstr())
continue;
assert(getIndex(&In) == IndexType::None && "Instruction already in map");
Map.insert(std::make_pair(Idx, &In));
@@ -314,7 +314,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
RegisterSet Defs, Clobbers;
for (auto &In : B) {
- if (In.isDebugValue())
+ if (In.isDebugInstr())
continue;
IndexType Index = IndexMap.getIndex(&In);
// Process uses first.
@@ -422,10 +422,10 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeLiveMap(
InstrIndexMap &IndexMap) {
RegToRangeMap LiveMap;
- DEBUG(dbgs() << __func__ << ": index map\n" << IndexMap << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << ": index map\n" << IndexMap << '\n');
computeInitialLiveRanges(IndexMap, LiveMap);
- DEBUG(dbgs() << __func__ << ": live map\n"
- << PrintRangeMap(LiveMap, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << ": live map\n"
+ << PrintRangeMap(LiveMap, TRI) << '\n');
return LiveMap;
}
@@ -486,8 +486,8 @@ HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap(
if (TargetRegisterInfo::isVirtualRegister(P.first.Reg))
addDeadRanges(P.first);
- DEBUG(dbgs() << __func__ << ": dead map\n"
- << PrintRangeMap(DeadMap, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << ": dead map\n"
+ << PrintRangeMap(DeadMap, TRI) << '\n');
return DeadMap;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
index 84af4b14b9f7..2fa7888dd02b 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -90,7 +90,7 @@ FunctionPass *llvm::createHexagonBranchRelaxation() {
}
bool HexagonBranchRelaxation::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "****** Hexagon Branch Relaxation ******\n");
+ LLVM_DEBUG(dbgs() << "****** Hexagon Branch Relaxation ******\n");
auto &HST = MF.getSubtarget<HexagonSubtarget>();
HII = HST.getInstrInfo();
@@ -114,8 +114,12 @@ void HexagonBranchRelaxation::computeOffset(MachineFunction &MF,
InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign);
}
OffsetMap[&B] = InstOffset;
- for (auto &MI : B.instrs())
+ for (auto &MI : B.instrs()) {
InstOffset += HII->getSize(MI);
+ // Assume that all extendable branches will be extended.
+ if (MI.isBranch() && HII->isExtendable(MI))
+ InstOffset += HEXAGON_INSTR_SIZE;
+ }
}
}
@@ -145,6 +149,9 @@ bool HexagonBranchRelaxation::isJumpOutOfRange(MachineInstr &MI,
if (FirstTerm == B.instr_end())
return false;
+ if (HII->isExtended(MI))
+ return false;
+
unsigned InstOffset = BlockToInstOffset[&B];
unsigned Distance = 0;
@@ -193,14 +200,14 @@ bool HexagonBranchRelaxation::reGenerateBranch(MachineFunction &MF,
for (auto &MI : B) {
if (!MI.isBranch() || !isJumpOutOfRange(MI, BlockToInstOffset))
continue;
- DEBUG(dbgs() << "Long distance jump. isExtendable("
- << HII->isExtendable(MI) << ") isConstExtended("
- << HII->isConstExtended(MI) << ") " << MI);
+ LLVM_DEBUG(dbgs() << "Long distance jump. isExtendable("
+ << HII->isExtendable(MI) << ") isConstExtended("
+ << HII->isConstExtended(MI) << ") " << MI);
// Since we have not merged HW loops relaxation into
// this code (yet), soften our approach for the moment.
if (!HII->isExtendable(MI) && !HII->isExtended(MI)) {
- DEBUG(dbgs() << "\tUnderimplemented relax branch instruction.\n");
+ LLVM_DEBUG(dbgs() << "\tUnderimplemented relax branch instruction.\n");
} else {
// Find which operand is expandable.
int ExtOpNum = HII->getCExtOpNum(MI);
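
The computeOffset change above makes block offsets pessimistic by assuming every extendable branch will grow by one extender word, so the distance estimates used by the out-of-range check err on the large side. A minimal sketch of that accounting (types and sizes are illustrative):

#include <iostream>
#include <vector>

// Conservative layout estimate in the spirit of the change above: every
// extendable branch is assumed to need an extra extender word.
struct Insn { unsigned Size; bool ExtendableBranch; };

static unsigned blockSize(const std::vector<Insn> &Block, unsigned InsnSize) {
  unsigned Bytes = 0;
  for (const Insn &I : Block) {
    Bytes += I.Size;
    if (I.ExtendableBranch)
      Bytes += InsnSize;          // assume the branch will be extended
  }
  return Bytes;
}

int main() {
  std::vector<Insn> B = {{4, false}, {4, true}, {4, false}};
  std::cout << blockSize(B, 4) << "\n"; // 16 rather than 12
}
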
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
new file mode 100644
index 000000000000..ed2f87570d6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -0,0 +1,134 @@
+//===- HexagonCallingConv.td ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class CCIfArgIsVarArg<CCAction A>
+ : CCIf<"State.isVarArg() && "
+ "ValNo >= static_cast<HexagonCCState&>(State)"
+ ".getNumNamedVarArgParams()", A>;
+
+def CC_HexagonStack: CallingConv<[
+ CCIfType<[i32,v2i16,v4i8],
+ CCAssignToStack<4,4>>,
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCAssignToStack<8,8>>
+]>;
+
+def CC_Hexagon: CallingConv<[
+ CCIfType<[i1,i8,i16],
+ CCPromoteToType<i32>>,
+ CCIfType<[f32],
+ CCBitConvertToType<i32>>,
+ CCIfType<[f64],
+ CCBitConvertToType<i64>>,
+
+ CCIfByVal<
+ CCPassByVal<8,8>>,
+ CCIfArgIsVarArg<
+ CCDelegateTo<CC_HexagonStack>>,
+
+ // Pass split values in pairs, allocate odd register if necessary.
+ CCIfType<[i32],
+ CCIfSplit<
+ CCCustom<"CC_SkipOdd">>>,
+
+ CCIfType<[i32,v2i16,v4i8],
+ CCAssignToReg<[R0,R1,R2,R3,R4,R5]>>,
+ // Make sure to allocate any skipped 32-bit register, so it does not get
+ // allocated to a subsequent 32-bit value.
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCCustom<"CC_SkipOdd">>,
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCAssignToReg<[D0,D1,D2]>>,
+
+ CCDelegateTo<CC_HexagonStack>
+]>;
+
+def RetCC_Hexagon: CallingConv<[
+ CCIfType<[i1,i8,i16],
+ CCPromoteToType<i32>>,
+ CCIfType<[f32],
+ CCBitConvertToType<i32>>,
+ CCIfType<[f64],
+ CCBitConvertToType<i64>>,
+
+ // Small structures are returned in a pair of registers (which is
+ // always r1:0). In such a case, what is returned are two i32 values
+ // without any additional information (in ArgFlags) stating that
+ // they are parts of a structure. Because of that there is no way
+ // to differentiate that situation from an attempt to return two
+ // values, so always assign R0 and R1.
+ CCIfSplit<
+ CCAssignToReg<[R0,R1]>>,
+ CCIfType<[i32,v2i16,v4i8],
+ CCAssignToReg<[R0,R1]>>,
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCAssignToReg<[D0]>>
+]>;
+
+
+class CCIfHvx64<CCAction A>
+ : CCIf<"State.getMachineFunction().getSubtarget<HexagonSubtarget>()"
+ ".useHVX64BOps()", A>;
+
+class CCIfHvx128<CCAction A>
+ : CCIf<"State.getMachineFunction().getSubtarget<HexagonSubtarget>()"
+ ".useHVX128BOps()", A>;
+
+def CC_Hexagon_HVX: CallingConv<[
+ // HVX 64-byte mode
+ CCIfHvx64<
+ CCIfType<[v16i32,v32i16,v64i8],
+ CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
+ CCIfHvx64<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>,
+ CCIfHvx64<
+ CCIfType<[v16i32,v32i16,v64i8],
+ CCAssignToStack<64,64>>>,
+ CCIfHvx64<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToStack<128,64>>>,
+
+ // HVX 128-byte mode
+ CCIfHvx128<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
+ CCIfHvx128<
+ CCIfType<[v64i32,v128i16,v256i8],
+ CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>,
+ CCIfHvx128<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToStack<128,128>>>,
+ CCIfHvx128<
+ CCIfType<[v64i32,v128i16,v256i8],
+ CCAssignToStack<256,128>>>,
+
+ CCDelegateTo<CC_Hexagon>
+]>;
+
+def RetCC_Hexagon_HVX: CallingConv<[
+ // HVX 64-byte mode
+ CCIfHvx64<
+ CCIfType<[v16i32,v32i16,v64i8],
+ CCAssignToReg<[V0]>>>,
+ CCIfHvx64<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[W0]>>>,
+
+ // HVX 128-byte mode
+ CCIfHvx128<
+ CCIfType<[v32i32,v64i16,v128i8],
+ CCAssignToReg<[V0]>>>,
+ CCIfHvx128<
+ CCIfType<[v64i32,v128i16,v256i8],
+ CCAssignToReg<[W0]>>>,
+
+ CCDelegateTo<RetCC_Hexagon>
+]>;
+
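[Illustration, not part of the patch] As a concrete reading of the small-structure return rule described in the RetCC_Hexagon comment above, the following minimal C++ sketch (hypothetical type and function names) shows the kind of aggregate whose two 32-bit halves come back as plain i32 values in the r1:0 pair.

// Hypothetical example, not from the patch: a small aggregate whose return
// value is split into two i32 parts. The CCIfSplit rule in RetCC_Hexagon
// assigns those parts to R0 and R1, i.e. the r1:0 register pair, exactly as
// if two independent i32 values were being returned.
struct Pair {
  int First;   // expected in R0
  int Second;  // expected in R1
};

Pair makePair(int A, int B) {
  return {A, B};
}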
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 7e3d049d337f..f315e24eba62 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -36,7 +37,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -342,7 +342,7 @@ bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
ValueToNodeMap &NM) {
- DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
+ LLVM_DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
GepNode *N = new (*Mem) GepNode;
Value *PtrOp = GepI->getPointerOperand();
uint32_t InBounds = GepI->isInBounds() ? GepNode::InBounds : 0;
@@ -426,7 +426,7 @@ void HexagonCommonGEP::collect() {
}
}
- DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
+ LLVM_DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
}
static void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM,
@@ -575,7 +575,7 @@ void HexagonCommonGEP::common() {
}
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Gep node equality:\n";
for (NodePairSet::iterator I = Eq.begin(), E = Eq.end(); I != E; ++I)
dbgs() << "{ " << I->first << ", " << I->second << " }\n";
@@ -642,7 +642,7 @@ void HexagonCommonGEP::common() {
N->Parent = Rep;
}
- DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes);
+ LLVM_DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes);
// Finally, erase the nodes that are no longer used.
NodeSet Erase;
@@ -662,35 +662,35 @@ void HexagonCommonGEP::common() {
NodeVect::iterator NewE = remove_if(Nodes, in_set(Erase));
Nodes.resize(std::distance(Nodes.begin(), NewE));
- DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
+ LLVM_DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
}
template <typename T>
static BasicBlock *nearest_common_dominator(DominatorTree *DT, T &Blocks) {
- DEBUG({
- dbgs() << "NCD of {";
- for (typename T::iterator I = Blocks.begin(), E = Blocks.end();
- I != E; ++I) {
- if (!*I)
- continue;
- BasicBlock *B = cast<BasicBlock>(*I);
- dbgs() << ' ' << B->getName();
- }
- dbgs() << " }\n";
- });
+ LLVM_DEBUG({
+ dbgs() << "NCD of {";
+ for (typename T::iterator I = Blocks.begin(), E = Blocks.end(); I != E;
+ ++I) {
+ if (!*I)
+ continue;
+ BasicBlock *B = cast<BasicBlock>(*I);
+ dbgs() << ' ' << B->getName();
+ }
+ dbgs() << " }\n";
+ });
- // Allow null basic blocks in Blocks. In such cases, return nullptr.
- typename T::iterator I = Blocks.begin(), E = Blocks.end();
- if (I == E || !*I)
+ // Allow null basic blocks in Blocks. In such cases, return nullptr.
+ typename T::iterator I = Blocks.begin(), E = Blocks.end();
+ if (I == E || !*I)
+ return nullptr;
+ BasicBlock *Dom = cast<BasicBlock>(*I);
+ while (++I != E) {
+ BasicBlock *B = cast_or_null<BasicBlock>(*I);
+ Dom = B ? DT->findNearestCommonDominator(Dom, B) : nullptr;
+ if (!Dom)
return nullptr;
- BasicBlock *Dom = cast<BasicBlock>(*I);
- while (++I != E) {
- BasicBlock *B = cast_or_null<BasicBlock>(*I);
- Dom = B ? DT->findNearestCommonDominator(Dom, B) : nullptr;
- if (!Dom)
- return nullptr;
}
- DEBUG(dbgs() << "computed:" << Dom->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "computed:" << Dom->getName() << '\n');
return Dom;
}
@@ -753,7 +753,7 @@ static bool is_empty(const BasicBlock *B) {
BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
NodeChildrenMap &NCM, NodeToValueMap &Loc) {
- DEBUG(dbgs() << "Loc for node:" << Node << '\n');
+ LLVM_DEBUG(dbgs() << "Loc for node:" << Node << '\n');
// Recalculate the placement for Node, assuming that the locations of
// its children in Loc are valid.
// Return nullptr if there is no valid placement for Node (for example, it
@@ -820,7 +820,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
NodeChildrenMap &NCM, NodeToValueMap &Loc) {
- DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n');
+ LLVM_DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n');
// Recalculate the placement of Node, after recursively recalculating the
// placements of all its children.
NodeChildrenMap::iterator CF = NCM.find(Node);
@@ -830,7 +830,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
recalculatePlacementRec(*I, NCM, Loc);
}
BasicBlock *LB = recalculatePlacement(Node, NCM, Loc);
- DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
+ LLVM_DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
return LB;
}
@@ -952,8 +952,8 @@ namespace {
void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
NodeToValueMap &Loc) {
User *R = U->getUser();
- DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: "
- << *R << '\n');
+ LLVM_DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: " << *R
+ << '\n');
BasicBlock *PB = cast<Instruction>(R)->getParent();
GepNode *N = Node;
@@ -996,7 +996,7 @@ void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
// Should at least have U in NewUs.
NewNode->Flags |= GepNode::Used;
- DEBUG(dbgs() << "new node: " << NewNode << " " << *NewNode << '\n');
+ LLVM_DEBUG(dbgs() << "new node: " << NewNode << " " << *NewNode << '\n');
assert(!NewUs.empty());
Uses[NewNode] = NewUs;
}
@@ -1007,7 +1007,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node,
NodeSet Ns;
nodes_for_root(Node, NCM, Ns);
- DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
+ LLVM_DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
// Collect all used nodes together with the uses from loads and stores,
// where the GEP node could be folded into the load/store instruction.
NodeToUsesMap FNs; // Foldable nodes.
@@ -1044,7 +1044,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node,
FNs.insert(std::make_pair(N, LSs));
}
- DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs);
+ LLVM_DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs);
for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) {
GepNode *N = I->first;
@@ -1066,32 +1066,33 @@ void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) {
for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
recalculatePlacementRec(*I, NCM, Loc);
- DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
+ LLVM_DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
if (OptEnableInv) {
for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
adjustForInvariance(*I, NCM, Loc);
- DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
- << LocationAsBlock(Loc));
+ LLVM_DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
+ << LocationAsBlock(Loc));
}
if (OptEnableConst) {
for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
separateConstantChains(*I, NCM, Loc);
}
- DEBUG(dbgs() << "Node use information:\n" << Uses);
+ LLVM_DEBUG(dbgs() << "Node use information:\n" << Uses);
// At the moment, there is no further refinement of the initial placement.
// Such a refinement could include splitting the nodes if they are placed
// too far from some of its users.
- DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc));
+ LLVM_DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc));
}
Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
BasicBlock *LocB) {
- DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName()
- << " for nodes:\n" << NA);
+ LLVM_DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName()
+ << " for nodes:\n"
+ << NA);
unsigned Num = NA.size();
GepNode *RN = NA[0];
assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root");
@@ -1128,7 +1129,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At);
NewInst->setIsInBounds(RN->Flags & GepNode::InBounds);
- DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
+ LLVM_DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
Input = NewInst;
} while (nax <= Num);
@@ -1161,7 +1162,7 @@ void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values,
}
void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
- DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n');
+ LLVM_DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n');
NodeChildrenMap NCM;
NodeVect Roots;
// Compute the inversion again, since computing placement could alter
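[Illustration, not part of the patch] Most of the change above is the mechanical DEBUG(...) to LLVM_DEBUG(...) rename. For context, this is the usual usage pattern of the macro (a sketch only; the DEBUG_TYPE string is assumed): the statement compiles away in NDEBUG builds and otherwise prints only when the pass's debug type is selected with -debug or -debug-only.

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "commgep" // assumed tag; matched by -debug-only=commgep

static void reportNodeCount(unsigned Count) {
  // No-op in release (NDEBUG) builds; in asserts builds it prints only when
  // the "commgep" debug type is enabled on the command line.
  LLVM_DEBUG(llvm::dbgs() << "collected " << Count << " GEP nodes\n");
}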
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index 294a6da69f51..cbce61bc63c9 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -39,31 +39,57 @@ namespace llvm {
FunctionPass *createHexagonConstExtenders();
}
+static int32_t adjustUp(int32_t V, uint8_t A, uint8_t O) {
+ assert(isPowerOf2_32(A));
+ int32_t U = (V & -A) + O;
+ return U >= V ? U : U+A;
+}
+
+static int32_t adjustDown(int32_t V, uint8_t A, uint8_t O) {
+ assert(isPowerOf2_32(A));
+ int32_t U = (V & -A) + O;
+ return U <= V ? U : U-A;
+}
+
namespace {
struct OffsetRange {
+ // The range of values between Min and Max that are of form Align*N+Offset,
+ // for some integer N. Min and Max are required to be of that form as well,
+ // except in the case of an empty range.
int32_t Min = INT_MIN, Max = INT_MAX;
uint8_t Align = 1;
+ uint8_t Offset = 0;
OffsetRange() = default;
- OffsetRange(int32_t L, int32_t H, uint8_t A)
- : Min(L), Max(H), Align(A) {}
+ OffsetRange(int32_t L, int32_t H, uint8_t A, uint8_t O = 0)
+ : Min(L), Max(H), Align(A), Offset(O) {}
OffsetRange &intersect(OffsetRange A) {
- Align = std::max(Align, A.Align);
- Min = std::max(Min, A.Min);
- Max = std::min(Max, A.Max);
+ if (Align < A.Align)
+ std::swap(*this, A);
+
+ // Align >= A.Align.
+ if (Offset >= A.Offset && (Offset - A.Offset) % A.Align == 0) {
+ Min = adjustUp(std::max(Min, A.Min), Align, Offset);
+ Max = adjustDown(std::min(Max, A.Max), Align, Offset);
+ } else {
+ // Make an empty range.
+ Min = 0;
+ Max = -1;
+ }
// Canonicalize empty ranges.
if (Min > Max)
std::tie(Min, Max, Align) = std::make_tuple(0, -1, 1);
return *this;
}
OffsetRange &shift(int32_t S) {
- assert(alignTo(std::abs(S), Align) == uint64_t(std::abs(S)));
Min += S;
Max += S;
+ Offset = (Offset+S) % Align;
return *this;
}
OffsetRange &extendBy(int32_t D) {
// If D < 0, extend Min, otherwise extend Max.
+ assert(D % Align == 0);
if (D < 0)
Min = (INT_MIN-D < Min) ? Min+D : INT_MIN;
else
@@ -74,7 +100,7 @@ namespace {
return Min > Max;
}
bool contains(int32_t V) const {
- return Min <= V && V <= Max && (V % Align) == 0;
+ return Min <= V && V <= Max && (V-Offset) % Align == 0;
}
bool operator==(const OffsetRange &R) const {
return Min == R.Min && Max == R.Max && Align == R.Align;
@@ -408,7 +434,8 @@ namespace {
raw_ostream &operator<< (raw_ostream &OS, const OffsetRange &OR) {
if (OR.Min > OR.Max)
OS << '!';
- OS << '[' << OR.Min << ',' << OR.Max << "]a" << unsigned(OR.Align);
+ OS << '[' << OR.Min << ',' << OR.Max << "]a" << unsigned(OR.Align)
+ << '+' << unsigned(OR.Offset);
return OS;
}
@@ -703,9 +730,21 @@ bool HCE::ExtRoot::operator< (const HCE::ExtRoot &ER) const {
}
case MachineOperand::MO_ExternalSymbol:
return StringRef(V.SymbolName) < StringRef(ER.V.SymbolName);
- case MachineOperand::MO_GlobalAddress:
- assert(V.GV->hasName() && ER.V.GV->hasName());
- return V.GV->getName() < ER.V.GV->getName();
+ case MachineOperand::MO_GlobalAddress: {
+ // Global values may not have names, so compare their positions
+ // in the parent module.
+ const Module &M = *V.GV->getParent();
+ auto FindPos = [&M] (const GlobalValue &V) {
+ unsigned P = 0;
+ for (const GlobalValue &T : M.global_values()) {
+ if (&T == &V)
+ return P;
+ P++;
+ }
+ llvm_unreachable("Global value not found in module");
+ };
+ return FindPos(*V.GV) < FindPos(*ER.V.GV);
+ }
case MachineOperand::MO_BlockAddress: {
const BasicBlock *ThisB = V.BA->getBasicBlock();
const BasicBlock *OtherB = ER.V.BA->getBasicBlock();
@@ -999,15 +1038,19 @@ unsigned HCE::getDirectRegReplacement(unsigned ExtOpc) const {
return 0;
}
-// Return the allowable deviation from the current value of Rb which the
+// Return the allowable deviation from the current value of Rb (i.e. the
+// range of values that can be added to the current value) which the
// instruction MI can accommodate.
// The instruction MI is a user of register Rb, which is defined via an
// extender. It may be possible for MI to be tweaked to work for a register
// defined with a slightly different value. For example
-// ... = L2_loadrub_io Rb, 0
+// ... = L2_loadrub_io Rb, 1
// can be modified to be
-// ... = L2_loadrub_io Rb', 1
-// if Rb' = Rb-1.
+// ... = L2_loadrub_io Rb', 0
+// if Rb' = Rb+1.
+// The range for Rb would be [Min+1, Max+1], where [Min, Max] is a range
+// for L2_loadrub with offset 0. That means that Rb could be replaced with
+// Rc, where Rc-Rb belongs to [Min+1, Max+1].
OffsetRange HCE::getOffsetRange(Register Rb, const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
// Instructions that are constant-extended may be replaced with something
@@ -1109,6 +1152,13 @@ void HCE::recordExtender(MachineInstr &MI, unsigned OpNum) {
bool IsLoad = MI.mayLoad();
bool IsStore = MI.mayStore();
+ // Fixed stack slots have negative indexes, and they cannot be used
+ // with TRI::stackSlot2Index and TRI::index2StackSlot. This is somewhat
+ // unfortunate, but should not be a frequent thing.
+ for (MachineOperand &Op : MI.operands())
+ if (Op.isFI() && Op.getIndex() < 0)
+ return;
+
if (IsLoad || IsStore) {
unsigned AM = HII->getAddrMode(MI);
switch (AM) {
@@ -1220,7 +1270,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
if (!ED.IsDef)
continue;
ExtValue EV(ED);
- DEBUG(dbgs() << " =" << I << ". " << EV << " " << ED << '\n');
+ LLVM_DEBUG(dbgs() << " =" << I << ". " << EV << " " << ED << '\n');
assert(ED.Rd.Reg != 0);
Ranges[I-Begin] = getOffsetRange(ED.Rd).shift(EV.Offset);
// A2_tfrsi is a special case: it will be replaced with A2_addi, which
@@ -1240,7 +1290,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
if (ED.IsDef)
continue;
ExtValue EV(ED);
- DEBUG(dbgs() << " " << I << ". " << EV << " " << ED << '\n');
+ LLVM_DEBUG(dbgs() << " " << I << ". " << EV << " " << ED << '\n');
OffsetRange Dev = getOffsetRange(ED);
Ranges[I-Begin].intersect(Dev.shift(EV.Offset));
}
@@ -1252,7 +1302,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
for (unsigned I = Begin; I != End; ++I)
RangeMap[Ranges[I-Begin]].insert(I);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Ranges\n";
for (unsigned I = Begin; I != End; ++I)
dbgs() << " " << I << ". " << Ranges[I-Begin] << '\n';
@@ -1280,11 +1330,17 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
SmallVector<RangeTree::Node*,8> Nodes;
Tree.order(Nodes);
- auto MaxAlign = [](const SmallVectorImpl<RangeTree::Node*> &Nodes) {
- uint8_t Align = 1;
- for (RangeTree::Node *N : Nodes)
- Align = std::max(Align, N->Range.Align);
- return Align;
+ auto MaxAlign = [](const SmallVectorImpl<RangeTree::Node*> &Nodes,
+ uint8_t Align, uint8_t Offset) {
+ for (RangeTree::Node *N : Nodes) {
+ if (N->Range.Align <= Align || N->Range.Offset < Offset)
+ continue;
+ if ((N->Range.Offset - Offset) % Align != 0)
+ continue;
+ Align = N->Range.Align;
+ Offset = N->Range.Offset;
+ }
+ return std::make_pair(Align, Offset);
};
// Construct the set of all potential definition points from the endpoints
@@ -1294,14 +1350,14 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
std::set<int32_t> CandSet;
for (RangeTree::Node *N : Nodes) {
const OffsetRange &R = N->Range;
- uint8_t A0 = MaxAlign(Tree.nodesWith(R.Min, false));
+ auto P0 = MaxAlign(Tree.nodesWith(R.Min, false), R.Align, R.Offset);
CandSet.insert(R.Min);
- if (R.Align < A0)
- CandSet.insert(R.Min < 0 ? -alignDown(-R.Min, A0) : alignTo(R.Min, A0));
- uint8_t A1 = MaxAlign(Tree.nodesWith(R.Max, false));
+ if (R.Align < P0.first)
+ CandSet.insert(adjustUp(R.Min, P0.first, P0.second));
+ auto P1 = MaxAlign(Tree.nodesWith(R.Max, false), R.Align, R.Offset);
CandSet.insert(R.Max);
- if (R.Align < A1)
- CandSet.insert(R.Max < 0 ? -alignTo(-R.Max, A1) : alignDown(R.Max, A1));
+ if (R.Align < P1.first)
+ CandSet.insert(adjustDown(R.Max, P1.first, P1.second));
}
// Build the assignment map: candidate C -> { list of extender indexes }.
@@ -1340,7 +1396,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
}
}
- DEBUG(dbgs() << "IMap (before fixup) = " << PrintIMap(IMap, *HRI));
+ LLVM_DEBUG(dbgs() << "IMap (before fixup) = " << PrintIMap(IMap, *HRI));
// There is some ambiguity in what initializer should be used, if the
// descriptor's subexpression is non-trivial: it can be the entire
@@ -1359,10 +1415,50 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
AssignmentMap::iterator F = IMap.find({EV, ExtExpr()});
if (F == IMap.end())
continue;
+
// Finally, check if all extenders have the same value as the initializer.
- auto SameValue = [&EV,this](unsigned I) {
+ // Make sure that extenders that are a part of a stack address are not
+ // merged with those that aren't. Stack addresses need an offset field
+ // (to be used by frame index elimination), while non-stack expressions
+ // can be replaced with forms (such as rr) that do not have such a field.
+ // Example:
+ //
+ // Collected 3 extenders
+ // =2. imm:0 off:32968 bb#2: %7 = ## + __ << 0, def
+ // 0. imm:0 off:267 bb#0: __ = ## + SS#1 << 0
+ // 1. imm:0 off:267 bb#1: __ = ## + SS#1 << 0
+ // Ranges
+ // 0. [-756,267]a1+0
+ // 1. [-756,267]a1+0
+ // 2. [201,65735]a1+0
+ // RangeMap
+ // [-756,267]a1+0 -> 0 1
+ // [201,65735]a1+0 -> 2
+ // IMap (before fixup) = {
+ // [imm:0 off:267, ## + __ << 0] -> { 2 }
+ // [imm:0 off:267, ## + SS#1 << 0] -> { 0 1 }
+ // }
+ // IMap (after fixup) = {
+ // [imm:0 off:267, ## + __ << 0] -> { 2 0 1 }
+ // [imm:0 off:267, ## + SS#1 << 0] -> { }
+ // }
+ // Inserted def in bb#0 for initializer: [imm:0 off:267, ## + __ << 0]
+ // %12:intregs = A2_tfrsi 267
+ //
+ // The result was
+ // %12:intregs = A2_tfrsi 267
+ // S4_pstorerbt_rr %3, %12, %stack.1, 0, killed %4
+ // Which became
+ // r0 = #267
+ // if (p0.new) memb(r0+r29<<#4) = r2
+
+ bool IsStack = any_of(F->second, [this](unsigned I) {
+ return Extenders[I].Expr.Rs.isSlot();
+ });
+ auto SameValue = [&EV,this,IsStack](unsigned I) {
const ExtDesc &ED = Extenders[I];
- return ExtValue(ED).Offset == EV.Offset;
+ return ED.Expr.Rs.isSlot() == IsStack &&
+ ExtValue(ED).Offset == EV.Offset;
};
if (all_of(P.second, SameValue)) {
F->second.insert(P.second.begin(), P.second.end());
@@ -1370,7 +1466,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
}
}
- DEBUG(dbgs() << "IMap (after fixup) = " << PrintIMap(IMap, *HRI));
+ LLVM_DEBUG(dbgs() << "IMap (after fixup) = " << PrintIMap(IMap, *HRI));
}
void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
@@ -1473,9 +1569,9 @@ HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) {
assert(InitI);
(void)InitI;
- DEBUG(dbgs() << "Inserted def in bb#" << MBB.getNumber()
- << " for initializer: " << PrintInit(ExtI, *HRI)
- << "\n " << *InitI);
+ LLVM_DEBUG(dbgs() << "Inserted def in bb#" << MBB.getNumber()
+ << " for initializer: " << PrintInit(ExtI, *HRI) << "\n "
+ << *InitI);
return { DefR, 0 };
}
@@ -1618,7 +1714,7 @@ bool HCE::replaceInstrExpr(const ExtDesc &ED, const ExtenderInit &ExtI,
assert(IdxOpc == Hexagon::A2_addi);
// Clamp Diff to the 16 bit range.
- int32_t D = isInt<16>(Diff) ? Diff : (Diff > 32767 ? 32767 : -32767);
+ int32_t D = isInt<16>(Diff) ? Diff : (Diff > 0 ? 32767 : -32768);
BuildMI(MBB, At, dl, HII->get(IdxOpc))
.add(MI.getOperand(0))
.add(MachineOperand(ExtR))
@@ -1626,11 +1722,13 @@ bool HCE::replaceInstrExpr(const ExtDesc &ED, const ExtenderInit &ExtI,
Diff -= D;
#ifndef NDEBUG
// Make sure the output is within allowable range for uses.
+ // "Diff" is a difference in the "opposite direction", i.e. Ext - DefV,
+ // not DefV - Ext, as the getOffsetRange would calculate.
OffsetRange Uses = getOffsetRange(MI.getOperand(0));
- if (!Uses.contains(Diff))
- dbgs() << "Diff: " << Diff << " out of range " << Uses
+ if (!Uses.contains(-Diff))
+ dbgs() << "Diff: " << -Diff << " out of range " << Uses
<< " for " << MI;
- assert(Uses.contains(Diff));
+ assert(Uses.contains(-Diff));
#endif
MBB.erase(MI);
return true;
@@ -1726,8 +1824,8 @@ bool HCE::replaceInstr(unsigned Idx, Register ExtR, const ExtenderInit &ExtI) {
ExtValue EV(ED);
int32_t Diff = EV.Offset - DefV.Offset;
const MachineInstr &MI = *ED.UseMI;
- DEBUG(dbgs() << __func__ << " Idx:" << Idx << " ExtR:"
- << PrintRegister(ExtR, *HRI) << " Diff:" << Diff << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << " Idx:" << Idx << " ExtR:"
+ << PrintRegister(ExtR, *HRI) << " Diff:" << Diff << '\n');
// These two addressing modes must be converted into indexed forms
// regardless of what the initializer looks like.
@@ -1833,7 +1931,7 @@ const MachineOperand &HCE::getStoredValueOp(const MachineInstr &MI) const {
bool HCE::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- DEBUG(MF.print(dbgs() << "Before " << getPassName() << '\n', nullptr));
+ LLVM_DEBUG(MF.print(dbgs() << "Before " << getPassName() << '\n', nullptr));
HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
@@ -1842,13 +1940,13 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
AssignmentMap IMap;
collect(MF);
- std::sort(Extenders.begin(), Extenders.end(),
+ llvm::sort(Extenders.begin(), Extenders.end(),
[](const ExtDesc &A, const ExtDesc &B) {
return ExtValue(A) < ExtValue(B);
});
bool Changed = false;
- DEBUG(dbgs() << "Collected " << Extenders.size() << " extenders\n");
+ LLVM_DEBUG(dbgs() << "Collected " << Extenders.size() << " extenders\n");
for (unsigned I = 0, E = Extenders.size(); I != E; ) {
unsigned B = I;
const ExtRoot &T = Extenders[B].getOp();
@@ -1860,7 +1958,7 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
Changed |= replaceExtenders(IMap);
}
- DEBUG({
+ LLVM_DEBUG({
if (Changed)
MF.print(dbgs() << "After " << getPassName() << '\n', nullptr);
else
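[Illustration, not part of the patch] To make the new Align*N+Offset form of OffsetRange concrete, here is a self-contained sketch (sample values chosen arbitrarily) of how the adjustUp/adjustDown helpers introduced above snap a value to the nearest admissible point; intersect() and the candidate-set construction rely on exactly this rounding.

#include <cstdint>
#include <cstdio>

// Same rounding as in the patch, minus the power-of-two assert: snap V to
// the closest value of the form Align*N + Offset, rounding up or down.
static int32_t adjustUp(int32_t V, uint8_t A, uint8_t O) {
  int32_t U = (V & -A) + O;
  return U >= V ? U : U + A;
}

static int32_t adjustDown(int32_t V, uint8_t A, uint8_t O) {
  int32_t U = (V & -A) + O;
  return U <= V ? U : U - A;
}

int main() {
  // With Align=4, Offset=2 the admissible values are ..., -2, 2, 6, 10, ...
  std::printf("%d %d\n", adjustUp(5, 4, 2), adjustDown(5, 4, 2));   // 6 2
  std::printf("%d %d\n", adjustUp(8, 4, 2), adjustDown(8, 4, 2));   // 10 6
}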
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 8ac96f3a4bfa..8f22a71dc1f3 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -617,7 +617,7 @@ void MachineConstPropagator::CellMap::print(raw_ostream &os,
void MachineConstPropagator::visitPHI(const MachineInstr &PN) {
const MachineBasicBlock *MB = PN.getParent();
unsigned MBN = MB->getNumber();
- DEBUG(dbgs() << "Visiting FI(" << printMBBReference(*MB) << "): " << PN);
+ LLVM_DEBUG(dbgs() << "Visiting FI(" << printMBBReference(*MB) << "): " << PN);
const MachineOperand &MD = PN.getOperand(0);
Register DefR(MD);
@@ -642,8 +642,8 @@ Bottomize:
const MachineBasicBlock *PB = PN.getOperand(i+1).getMBB();
unsigned PBN = PB->getNumber();
if (!EdgeExec.count(CFGEdge(PBN, MBN))) {
- DEBUG(dbgs() << " edge " << printMBBReference(*PB) << "->"
- << printMBBReference(*MB) << " not executable\n");
+ LLVM_DEBUG(dbgs() << " edge " << printMBBReference(*PB) << "->"
+ << printMBBReference(*MB) << " not executable\n");
continue;
}
const MachineOperand &SO = PN.getOperand(i);
@@ -658,8 +658,9 @@ Bottomize:
LatticeCell SrcC;
bool Eval = MCE.evaluate(UseR, Cells.get(UseR.Reg), SrcC);
- DEBUG(dbgs() << " edge from " << printMBBReference(*PB) << ": "
- << printReg(UseR.Reg, &MCE.TRI, UseR.SubReg) << SrcC << '\n');
+ LLVM_DEBUG(dbgs() << " edge from " << printMBBReference(*PB) << ": "
+ << printReg(UseR.Reg, &MCE.TRI, UseR.SubReg) << SrcC
+ << '\n');
Changed |= Eval ? DefC.meet(SrcC)
: DefC.setBottom();
Cells.update(DefR.Reg, DefC);
@@ -671,11 +672,11 @@ Bottomize:
}
void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
- DEBUG(dbgs() << "Visiting MI(" << printMBBReference(*MI.getParent())
- << "): " << MI);
+ LLVM_DEBUG(dbgs() << "Visiting MI(" << printMBBReference(*MI.getParent())
+ << "): " << MI);
CellMap Outputs;
bool Eval = MCE.evaluate(MI, Cells, Outputs);
- DEBUG({
+ LLVM_DEBUG({
if (Eval) {
dbgs() << " outputs:";
for (auto &I : Outputs)
@@ -713,7 +714,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
}
}
-// \brief Starting at a given branch, visit remaining branches in the block.
+// Starting at a given branch, visit remaining branches in the block.
// Traverse over the subsequent branches for as long as the preceding one
// can fall through. Add all the possible targets to the flow work queue,
// including the potential fall-through to the layout-successor block.
@@ -728,8 +729,8 @@ void MachineConstPropagator::visitBranchesFrom(const MachineInstr &BrI) {
while (It != End) {
const MachineInstr &MI = *It;
InstrExec.insert(&MI);
- DEBUG(dbgs() << "Visiting " << (EvalOk ? "BR" : "br") << "("
- << printMBBReference(B) << "): " << MI);
+ LLVM_DEBUG(dbgs() << "Visiting " << (EvalOk ? "BR" : "br") << "("
+ << printMBBReference(B) << "): " << MI);
// Do not evaluate subsequent branches if the evaluation of any of the
// previous branches failed. Keep iterating over the branches only
// to mark them as executable.
@@ -763,23 +764,23 @@ void MachineConstPropagator::visitBranchesFrom(const MachineInstr &BrI) {
// last one set "FallsThru", then add an edge to the layout successor
// to the targets.
Targets.clear();
- DEBUG(dbgs() << " failed to evaluate a branch...adding all CFG "
- "successors\n");
+ LLVM_DEBUG(dbgs() << " failed to evaluate a branch...adding all CFG "
+ "successors\n");
for (const MachineBasicBlock *SB : B.successors())
Targets.insert(SB);
}
for (const MachineBasicBlock *TB : Targets) {
unsigned TBN = TB->getNumber();
- DEBUG(dbgs() << " pushing edge " << printMBBReference(B) << " -> "
- << printMBBReference(*TB) << "\n");
+ LLVM_DEBUG(dbgs() << " pushing edge " << printMBBReference(B) << " -> "
+ << printMBBReference(*TB) << "\n");
FlowQ.push(CFGEdge(MBN, TBN));
}
}
void MachineConstPropagator::visitUsesOf(unsigned Reg) {
- DEBUG(dbgs() << "Visiting uses of " << printReg(Reg, &MCE.TRI)
- << Cells.get(Reg) << '\n');
+ LLVM_DEBUG(dbgs() << "Visiting uses of " << printReg(Reg, &MCE.TRI)
+ << Cells.get(Reg) << '\n');
for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
// Do not process non-executable instructions. They can become executable
// later (via a flow-edge in the work queue). In such case, the instruc-
@@ -799,7 +800,7 @@ bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
SetVector<const MachineBasicBlock*> &Targets) {
MachineBasicBlock::const_iterator FirstBr = MB->end();
for (const MachineInstr &MI : *MB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (MI.isBranch()) {
FirstBr = MI.getIterator();
@@ -814,7 +815,7 @@ bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
for (MachineBasicBlock::const_iterator I = FirstBr; I != End; ++I) {
const MachineInstr &MI = *I;
// Can there be debug instructions between branches?
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (!InstrExec.count(&MI))
continue;
@@ -870,10 +871,10 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
CFGEdge Edge = FlowQ.front();
FlowQ.pop();
- DEBUG(dbgs() << "Picked edge "
- << printMBBReference(*MF.getBlockNumbered(Edge.first)) << "->"
- << printMBBReference(*MF.getBlockNumbered(Edge.second))
- << '\n');
+ LLVM_DEBUG(
+ dbgs() << "Picked edge "
+ << printMBBReference(*MF.getBlockNumbered(Edge.first)) << "->"
+ << printMBBReference(*MF.getBlockNumbered(Edge.second)) << '\n');
if (Edge.first != EntryNum)
if (EdgeExec.count(Edge))
continue;
@@ -896,7 +897,7 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
// If the successor block just became executable, visit all instructions.
// To see if this is the first time we're visiting it, check the first
// non-debug instruction to see if it is executable.
- while (It != End && It->isDebugValue())
+ while (It != End && It->isDebugInstr())
++It;
assert(It == End || !It->isPHI());
// If this block has been visited, go on to the next one.
@@ -905,7 +906,7 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
// For now, scan all non-branch instructions. Branches require different
// processing.
while (It != End && !It->isBranch()) {
- if (!It->isDebugValue()) {
+ if (!It->isDebugInstr()) {
InstrExec.insert(&*It);
visitNonBranch(*It);
}
@@ -927,7 +928,7 @@ void MachineConstPropagator::propagate(MachineFunction &MF) {
}
} // while (FlowQ)
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Cells after propagation:\n";
Cells.print(dbgs(), MCE.TRI);
dbgs() << "Dead CFG edges:\n";
@@ -1042,7 +1043,7 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) {
// This is the constant propagation algorithm as described by Wegman-Zadeck.
// Most of the terminology comes from there.
bool MachineConstPropagator::run(MachineFunction &MF) {
- DEBUG(MF.print(dbgs() << "Starting MachineConstPropagator\n", nullptr));
+ LLVM_DEBUG(MF.print(dbgs() << "Starting MachineConstPropagator\n", nullptr));
MRI = &MF.getRegInfo();
@@ -1054,7 +1055,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) {
propagate(MF);
bool Changed = rewrite(MF);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "End of MachineConstPropagator (Changed=" << Changed << ")\n";
if (Changed)
MF.print(dbgs(), nullptr);
@@ -1880,10 +1881,7 @@ namespace {
public:
static char ID;
- HexagonConstPropagation() : MachineFunctionPass(ID) {
- PassRegistry &Registry = *PassRegistry::getPassRegistry();
- initializeHexagonConstPropagationPass(Registry);
- }
+ HexagonConstPropagation() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon Constant Propagation";
@@ -1903,8 +1901,8 @@ namespace {
char HexagonConstPropagation::ID = 0;
-INITIALIZE_PASS(HexagonConstPropagation, "hcp", "Hexagon Constant Propagation",
- false, false)
+INITIALIZE_PASS(HexagonConstPropagation, "hexagon-constp",
+ "Hexagon Constant Propagation", false, false)
HexagonConstEvaluator::HexagonConstEvaluator(MachineFunction &Fn)
: MachineConstEvaluator(Fn),
@@ -2022,6 +2020,8 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
case Hexagon::A2_combineii: // combine(#s8Ext, #s8)
case Hexagon::A4_combineii: // combine(#s8, #u6Ext)
{
+ if (!MI.getOperand(1).isImm() || !MI.getOperand(2).isImm())
+ return false;
uint64_t Hi = MI.getOperand(1).getImm();
uint64_t Lo = MI.getOperand(2).getImm();
uint64_t Res = (Hi << 32) | (Lo & 0xFFFFFFFF);
@@ -2631,6 +2631,8 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
Eval = evaluateANDrr(R1, Register(Src2), Inputs, RC);
break;
case Hexagon::A2_andir: {
+ if (!Src2.isImm())
+ return false;
APInt A(32, Src2.getImm(), true);
Eval = evaluateANDri(R1, A, Inputs, RC);
break;
@@ -2640,6 +2642,8 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
Eval = evaluateORrr(R1, Register(Src2), Inputs, RC);
break;
case Hexagon::A2_orir: {
+ if (!Src2.isImm())
+ return false;
APInt A(32, Src2.getImm(), true);
Eval = evaluateORri(R1, A, Inputs, RC);
break;
@@ -2775,7 +2779,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
AllDefs = false;
// Some diagnostics.
- // DEBUG({...}) gets confused with all this code as an argument.
+ // LLVM_DEBUG({...}) gets confused with all this code as an argument.
#ifndef NDEBUG
bool Debugging = DebugFlag && isCurrentDebugType(DEBUG_TYPE);
if (Debugging) {
@@ -2920,7 +2924,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
ChangedNum++;
}
- DEBUG({
+ LLVM_DEBUG({
if (!NewInstrs.empty()) {
MachineFunction &MF = *MI.getParent()->getParent();
dbgs() << "In function: " << MF.getName() << "\n";
@@ -3087,7 +3091,7 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
MO.setIsKill(false);
}
- DEBUG({
+ LLVM_DEBUG({
if (NewMI) {
dbgs() << "Rewrite: for " << MI;
if (NewMI != &MI)
@@ -3127,7 +3131,7 @@ bool HexagonConstEvaluator::rewriteHexBranch(MachineInstr &BrI,
if (BrI.getOpcode() == Hexagon::J2_jump)
return false;
- DEBUG(dbgs() << "Rewrite(" << printMBBReference(B) << "):" << BrI);
+ LLVM_DEBUG(dbgs() << "Rewrite(" << printMBBReference(B) << "):" << BrI);
bool Rewritten = false;
if (NumTargets > 0) {
assert(!FallsThru && "This should have been checked before");
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 087a77203fcb..fccde96d8a32 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -300,7 +300,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
// * reads I2's def reg
// * or has unmodelled side effects
// we can't move I2 across it.
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (isUnsafeToMoveAcross(*I, I2UseReg, I2DestReg, TRI)) {
@@ -358,7 +358,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
// to remove the implicit killed %d4 operand. For now, we are
// conservative and disallow the move.
// we can't move I1 across it.
- if (MI.isDebugValue()) {
+ if (MI.isDebugInstr()) {
if (MI.readsRegister(I1DestReg, TRI)) // Move this instruction after I2.
DbgMItoMove.push_back(&MI);
continue;
@@ -396,7 +396,7 @@ void
HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
DenseMap<unsigned, MachineInstr *> LastDef;
for (MachineInstr &MI : BB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
// Mark TFRs that feed a potential new value store as such.
@@ -423,7 +423,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
MachineBasicBlock::iterator It(DefInst);
unsigned NumInstsToDef = 0;
while (&*It != &MI) {
- if (!It->isDebugValue())
+ if (!It->isDebugInstr())
++NumInstsToDef;
++It;
}
@@ -489,7 +489,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
MI != End;) {
MachineInstr &I1 = *MI++;
- if (I1.isDebugValue())
+ if (I1.isDebugInstr())
continue;
// Don't combine a TFR whose user could be newified (instructions that
@@ -526,7 +526,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
bool &DoInsertAtI1,
bool AllowC64) {
MachineBasicBlock::iterator I2 = std::next(MachineBasicBlock::iterator(I1));
- while (I2 != I1.getParent()->end() && I2->isDebugValue())
+ while (I2 != I1.getParent()->end() && I2->isDebugInstr())
++I2;
unsigned I1DestReg = I1.getOperand(0).getReg();
@@ -649,7 +649,7 @@ void HexagonCopyToCombine::emitConst64(MachineBasicBlock::iterator &InsertPt,
unsigned DoubleDestReg,
MachineOperand &HiOperand,
MachineOperand &LoOperand) {
- DEBUG(dbgs() << "Found a CONST64\n");
+ LLVM_DEBUG(dbgs() << "Found a CONST64\n");
DebugLoc DL = InsertPt->getDebugLoc();
MachineBasicBlock *BB = InsertPt->getParent();
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td
index 87dcd966f2ed..3594379aa841 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td
@@ -11,14 +11,14 @@
def ArchV65: SubtargetFeature<"v65", "HexagonArchVersion", "Hexagon::ArchEnum::V65", "Enable Hexagon V65 architecture">;
-def HasV65T : Predicate<"HST->hasV65TOps()">, AssemblerPredicate<"ArchV65">;
+def HasV65 : Predicate<"HST->hasV65Ops()">, AssemblerPredicate<"ArchV65">;
def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "Hexagon::ArchEnum::V62", "Enable Hexagon V62 architecture">;
-def HasV62T : Predicate<"HST->hasV62TOps()">, AssemblerPredicate<"ArchV62">;
+def HasV62 : Predicate<"HST->hasV62Ops()">, AssemblerPredicate<"ArchV62">;
def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "Hexagon::ArchEnum::V60", "Enable Hexagon V60 architecture">;
-def HasV60T : Predicate<"HST->hasV60TOps()">, AssemblerPredicate<"ArchV60">;
+def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<"ArchV60">;
def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">;
-def HasV55T : Predicate<"HST->hasV55TOps()">, AssemblerPredicate<"ArchV55">;
+def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<"ArchV55">;
def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "Hexagon::ArchEnum::V4", "Enable Hexagon V4 architecture">;
-def HasV4T : Predicate<"HST->hasV4TOps()">, AssemblerPredicate<"ArchV4">;
+def HasV4 : Predicate<"HST->hasV4Ops()">, AssemblerPredicate<"ArchV4">;
def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "Hexagon::ArchEnum::V5", "Enable Hexagon V5 architecture">;
-def HasV5T : Predicate<"HST->hasV5TOps()">, AssemblerPredicate<"ArchV5">;
+def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<"ArchV5">;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
index 083ec7753e04..931504b56ccb 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -10,21 +10,17 @@
//===----------------------------------------------------------------------===//
-def tc_0077f68c : InstrItinClass;
def tc_00afc57e : InstrItinClass;
def tc_00e7c26e : InstrItinClass;
def tc_03220ffa : InstrItinClass;
def tc_038a1342 : InstrItinClass;
def tc_04c9decc : InstrItinClass;
def tc_05b6c987 : InstrItinClass;
-def tc_0a2b8c7c : InstrItinClass;
def tc_0cd51c76 : InstrItinClass;
def tc_0dc560de : InstrItinClass;
def tc_0fc1ae07 : InstrItinClass;
def tc_10b97e27 : InstrItinClass;
-def tc_128f96e3 : InstrItinClass;
def tc_1372bca1 : InstrItinClass;
-def tc_1432937d : InstrItinClass;
def tc_14cd4cfa : InstrItinClass;
def tc_15411484 : InstrItinClass;
def tc_16d0d8d5 : InstrItinClass;
@@ -32,18 +28,14 @@ def tc_181af5d0 : InstrItinClass;
def tc_1853ea6d : InstrItinClass;
def tc_1b82a277 : InstrItinClass;
def tc_1b9c9ee5 : InstrItinClass;
-def tc_1c0005f9 : InstrItinClass;
def tc_1d5a38a8 : InstrItinClass;
def tc_1e856f58 : InstrItinClass;
-def tc_20280784 : InstrItinClass;
def tc_234a11a5 : InstrItinClass;
def tc_238d91d2 : InstrItinClass;
def tc_29175780 : InstrItinClass;
-def tc_29641329 : InstrItinClass;
def tc_2a160009 : InstrItinClass;
def tc_2b2f4060 : InstrItinClass;
def tc_2b6f77c6 : InstrItinClass;
-def tc_2e00db30 : InstrItinClass;
def tc_2f185f5c : InstrItinClass;
def tc_2fc0c436 : InstrItinClass;
def tc_351fed2d : InstrItinClass;
@@ -71,22 +63,19 @@ def tc_51b866be : InstrItinClass;
def tc_523fcf30 : InstrItinClass;
def tc_5274e61a : InstrItinClass;
def tc_52d7bbea : InstrItinClass;
-def tc_53173427 : InstrItinClass;
def tc_53bc8a6a : InstrItinClass;
def tc_53bdb2f6 : InstrItinClass;
def tc_540fdfbc : InstrItinClass;
def tc_55050d58 : InstrItinClass;
-def tc_56d25411 : InstrItinClass;
def tc_57288781 : InstrItinClass;
def tc_594ab548 : InstrItinClass;
+def tc_59a01ead : InstrItinClass;
def tc_5acef64a : InstrItinClass;
def tc_5ba5997d : InstrItinClass;
def tc_5eb851fc : InstrItinClass;
def tc_5f6847a1 : InstrItinClass;
def tc_60571023 : InstrItinClass;
def tc_609d2efe : InstrItinClass;
-def tc_60d76817 : InstrItinClass;
-def tc_60f5738d : InstrItinClass;
def tc_63fe3df7 : InstrItinClass;
def tc_66888ded : InstrItinClass;
def tc_6792d5ff : InstrItinClass;
@@ -96,6 +85,7 @@ def tc_6aa5711a : InstrItinClass;
def tc_6ac37025 : InstrItinClass;
def tc_6ebb4a12 : InstrItinClass;
def tc_6efc556e : InstrItinClass;
+def tc_6fa4db47 : InstrItinClass;
def tc_73043bf4 : InstrItinClass;
def tc_746baa8e : InstrItinClass;
def tc_74e47fd9 : InstrItinClass;
@@ -103,18 +93,16 @@ def tc_7934b9df : InstrItinClass;
def tc_7a830544 : InstrItinClass;
def tc_7f881c76 : InstrItinClass;
def tc_84df2cd3 : InstrItinClass;
-def tc_85523bcb : InstrItinClass;
def tc_855b0b61 : InstrItinClass;
def tc_87735c3b : InstrItinClass;
-def tc_88fa1a78 : InstrItinClass;
def tc_897d1a9d : InstrItinClass;
def tc_8b15472a : InstrItinClass;
-def tc_8bb285ec : InstrItinClass;
def tc_8fd5f294 : InstrItinClass;
def tc_8fe6b782 : InstrItinClass;
def tc_90f3e30c : InstrItinClass;
def tc_976ddc4f : InstrItinClass;
def tc_97743097 : InstrItinClass;
+def tc_994333cd : InstrItinClass;
def tc_999d32db : InstrItinClass;
def tc_99be14ca : InstrItinClass;
def tc_9c00ce8d : InstrItinClass;
@@ -133,7 +121,6 @@ def tc_adb14c66 : InstrItinClass;
def tc_b13761ae : InstrItinClass;
def tc_b166348b : InstrItinClass;
def tc_b44c6e2a : InstrItinClass;
-def tc_b5a33b22 : InstrItinClass;
def tc_b77c481f : InstrItinClass;
def tc_b7dd427e : InstrItinClass;
def tc_b9488031 : InstrItinClass;
@@ -141,7 +128,6 @@ def tc_b9c0b731 : InstrItinClass;
def tc_b9c4623f : InstrItinClass;
def tc_bad2bcaf : InstrItinClass;
def tc_bcc96cee : InstrItinClass;
-def tc_bd90564c : InstrItinClass;
def tc_bde7aaf4 : InstrItinClass;
def tc_be706f30 : InstrItinClass;
def tc_c2f7d806 : InstrItinClass;
@@ -166,24 +152,20 @@ def tc_d9f95eef : InstrItinClass;
def tc_daa058fa : InstrItinClass;
def tc_dbdffe3d : InstrItinClass;
def tc_e0739b8c : InstrItinClass;
-def tc_e1e0a2dc : InstrItinClass;
def tc_e1e99bfa : InstrItinClass;
def tc_e216a5db : InstrItinClass;
def tc_e421e012 : InstrItinClass;
-def tc_e6b38e01 : InstrItinClass;
def tc_e7624c08 : InstrItinClass;
def tc_e7d02c66 : InstrItinClass;
def tc_e913dc32 : InstrItinClass;
def tc_e9c822f7 : InstrItinClass;
def tc_e9fae2d6 : InstrItinClass;
-def tc_ef20db1c : InstrItinClass;
def tc_ef52ed71 : InstrItinClass;
def tc_ef84f62f : InstrItinClass;
def tc_f2704b9a : InstrItinClass;
def tc_f3eaa14b : InstrItinClass;
def tc_f47d212f : InstrItinClass;
def tc_f49e76f4 : InstrItinClass;
-def tc_f4f43fb5 : InstrItinClass;
def tc_f7dd9c9f : InstrItinClass;
def tc_f86c328a : InstrItinClass;
def tc_f8eeed7a : InstrItinClass;
@@ -192,21 +174,17 @@ def tc_ff9ee76e : InstrItinClass;
class DepScalarItinV4 {
list<InstrItinData> DepScalarItinV4_list = [
- InstrItinData <tc_0077f68c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_00afc57e, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_00e7c26e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_03220ffa, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_038a1342, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_04c9decc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_05b6c987, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_0a2b8c7c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_0cd51c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0dc560de, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0fc1ae07, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_10b97e27, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_128f96e3, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_1372bca1, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_1432937d, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_14cd4cfa, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_15411484, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_16d0d8d5, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -214,18 +192,14 @@ class DepScalarItinV4 {
InstrItinData <tc_1853ea6d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_1b82a277, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1b9c9ee5, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_1c0005f9, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_1d5a38a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1e856f58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_20280784, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_234a11a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_238d91d2, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_29175780, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_29641329, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_2a160009, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2b2f4060, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_2b6f77c6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_2e00db30, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2f185f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_2fc0c436, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_351fed2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -253,22 +227,19 @@ class DepScalarItinV4 {
InstrItinData <tc_523fcf30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_5274e61a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_52d7bbea, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_53173427, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_53bc8a6a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_53bdb2f6, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_540fdfbc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_55050d58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_56d25411, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_57288781, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_594ab548, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_59a01ead, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_5acef64a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_5ba5997d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_5eb851fc, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_5f6847a1, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_60571023, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_609d2efe, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_60d76817, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_60f5738d, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_63fe3df7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_66888ded, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_6792d5ff, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -278,6 +249,7 @@ class DepScalarItinV4 {
InstrItinData <tc_6ac37025, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_6ebb4a12, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_6efc556e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6fa4db47, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_73043bf4, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_746baa8e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_74e47fd9, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -285,18 +257,16 @@ class DepScalarItinV4 {
InstrItinData <tc_7a830544, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_7f881c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_84df2cd3, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_85523bcb, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_855b0b61, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_87735c3b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_88fa1a78, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_897d1a9d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8b15472a, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_8bb285ec, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_8fd5f294, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8fe6b782, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_90f3e30c, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_976ddc4f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_97743097, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_994333cd, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_999d32db, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_99be14ca, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_9c00ce8d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -315,7 +285,6 @@ class DepScalarItinV4 {
InstrItinData <tc_b13761ae, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b166348b, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_b44c6e2a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b5a33b22, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b77c481f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b7dd427e, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b9488031, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
@@ -323,7 +292,6 @@ class DepScalarItinV4 {
InstrItinData <tc_b9c4623f, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_bad2bcaf, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_bcc96cee, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_bd90564c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_bde7aaf4, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_be706f30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_c2f7d806, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -348,24 +316,20 @@ class DepScalarItinV4 {
InstrItinData <tc_daa058fa, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_dbdffe3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e0739b8c, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_e1e0a2dc, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_e1e99bfa, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e216a5db, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e421e012, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_e6b38e01, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_e7624c08, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e7d02c66, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e913dc32, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e9c822f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_e9fae2d6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_ef20db1c, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_ef52ed71, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_ef84f62f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f2704b9a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f3eaa14b, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f47d212f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f49e76f4, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_f4f43fb5, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f7dd9c9f, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f86c328a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f8eeed7a, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -375,21 +339,17 @@ class DepScalarItinV4 {
class DepScalarItinV5 {
list<InstrItinData> DepScalarItinV5_list = [
- InstrItinData <tc_0077f68c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_00afc57e, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_00e7c26e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_03220ffa, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_038a1342, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_04c9decc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_05b6c987, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_0a2b8c7c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_0cd51c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0dc560de, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_0fc1ae07, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_10b97e27, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_128f96e3, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_1372bca1, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_1432937d, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_14cd4cfa, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_15411484, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_16d0d8d5, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -397,18 +357,14 @@ class DepScalarItinV5 {
InstrItinData <tc_1853ea6d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_1b82a277, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1b9c9ee5, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_1c0005f9, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_1d5a38a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_1e856f58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_20280784, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_234a11a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_238d91d2, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_29175780, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_29641329, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_2a160009, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2b2f4060, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_2b6f77c6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_2e00db30, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_2f185f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_2fc0c436, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_351fed2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -436,22 +392,19 @@ class DepScalarItinV5 {
InstrItinData <tc_523fcf30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_5274e61a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_52d7bbea, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_53173427, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_53bc8a6a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_53bdb2f6, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_540fdfbc, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_55050d58, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_56d25411, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_57288781, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_594ab548, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_59a01ead, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_5acef64a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_5ba5997d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_5eb851fc, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_5f6847a1, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_60571023, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_609d2efe, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_60d76817, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_60f5738d, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_63fe3df7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_66888ded, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_6792d5ff, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -461,6 +414,7 @@ class DepScalarItinV5 {
InstrItinData <tc_6ac37025, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_6ebb4a12, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_6efc556e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6fa4db47, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_73043bf4, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_746baa8e, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_74e47fd9, [InstrStage<1, [SLOT0, SLOT1]>]>,
@@ -468,18 +422,16 @@ class DepScalarItinV5 {
InstrItinData <tc_7a830544, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_7f881c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_84df2cd3, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_85523bcb, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_855b0b61, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_87735c3b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_88fa1a78, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_897d1a9d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8b15472a, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_8bb285ec, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_8fd5f294, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_8fe6b782, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData <tc_90f3e30c, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_976ddc4f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_97743097, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_994333cd, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_999d32db, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_99be14ca, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_9c00ce8d, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -498,7 +450,6 @@ class DepScalarItinV5 {
InstrItinData <tc_b13761ae, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b166348b, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_b44c6e2a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b5a33b22, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_b77c481f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b7dd427e, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_b9488031, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
@@ -506,7 +457,6 @@ class DepScalarItinV5 {
InstrItinData <tc_b9c4623f, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_bad2bcaf, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_bcc96cee, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_bd90564c, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_bde7aaf4, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_be706f30, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_c2f7d806, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -531,24 +481,20 @@ class DepScalarItinV5 {
InstrItinData <tc_daa058fa, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_dbdffe3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e0739b8c, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_e1e0a2dc, [InstrStage<1, [SLOT2]>]>,
InstrItinData <tc_e1e99bfa, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e216a5db, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e421e012, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_e6b38e01, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_e7624c08, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e7d02c66, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_e913dc32, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_e9c822f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_e9fae2d6, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_ef20db1c, [InstrStage<1, [SLOT3]>]>,
InstrItinData <tc_ef52ed71, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_ef84f62f, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f2704b9a, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f3eaa14b, [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData <tc_f47d212f, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f49e76f4, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_f4f43fb5, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f7dd9c9f, [InstrStage<1, [SLOT0]>]>,
InstrItinData <tc_f86c328a, [InstrStage<1, [SLOT0, SLOT1]>]>,
InstrItinData <tc_f8eeed7a, [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -558,10 +504,6 @@ class DepScalarItinV5 {
class DepScalarItinV55 {
list<InstrItinData> DepScalarItinV55_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -586,10 +528,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -606,18 +544,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -646,10 +576,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -658,10 +584,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -674,10 +596,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -690,10 +608,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -802,10 +716,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -822,10 +732,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -834,6 +740,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -858,14 +768,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -902,6 +804,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_2early*/
[InstrStage<1, [SLOT3]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -930,10 +836,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -942,10 +844,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -954,10 +852,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -978,6 +872,10 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_999d32db, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -1050,10 +948,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1082,10 +976,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1182,10 +1072,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1198,10 +1084,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -1222,10 +1104,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1250,10 +1128,6 @@ class DepScalarItinV55 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1278,10 +1152,6 @@ class DepScalarItinV55 {
class DepScalarItinV60 {
list<InstrItinData> DepScalarItinV60_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1306,10 +1176,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1326,18 +1192,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -1366,10 +1224,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1378,10 +1232,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1394,10 +1244,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -1410,10 +1256,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1522,10 +1364,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1542,10 +1380,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1554,6 +1388,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1578,14 +1416,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1622,6 +1452,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_2early*/
[InstrStage<1, [SLOT3]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1650,10 +1484,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1662,10 +1492,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1674,10 +1500,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1698,6 +1520,10 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_999d32db, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -1770,10 +1596,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1802,10 +1624,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1902,10 +1720,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -1918,10 +1732,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -1942,10 +1752,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1970,10 +1776,6 @@ class DepScalarItinV60 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -1996,765 +1798,8 @@ class DepScalarItinV60 {
];
}
-class DepScalarItinV60se {
- list<InstrItinData> DepScalarItinV60se_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_00afc57e, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_00e7c26e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_03220ffa, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_038a1342, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_04c9decc, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_05b6c987, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0cd51c76, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0dc560de, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_0fc1ae07, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_10b97e27, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1372bca1, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_14cd4cfa, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_15411484, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_16d0d8d5, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_181af5d0, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [3, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1853ea6d, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1b82a277, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
- [Hex_FWD]>,
-
- InstrItinData <tc_1b9c9ee5, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1d5a38a8, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_1e856f58, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_234a11a5, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_238d91d2, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_29175780, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2a160009, /*tc_2early*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_2b2f4060, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2b6f77c6, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
- InstrItinData <tc_2f185f5c, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_2fc0c436, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_351fed2d, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3669266a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_367f7f3d, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
- InstrItinData <tc_36c68ad1, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [],
- []>,
-
- InstrItinData <tc_395dc00f, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3bc2c5d3, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_3cb8ea06, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3d04548d, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3da80ba5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_3e07fb90, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_41d5298e, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4403ca65, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_44126683, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_452f85af, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_481e5e5c, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_49eb22c8, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4ca572d4, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_4d9914c9, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4d99bca9, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_4f7cd700, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_513bef45, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_51b866be, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_523fcf30, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5274e61a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_52d7bbea, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_53bc8a6a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_53bdb2f6, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_540fdfbc, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_55050d58, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_57288781, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_594ab548, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5acef64a, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5ba5997d, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5eb851fc, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [2, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_5f6847a1, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_60571023, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_609d2efe, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_63fe3df7, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_66888ded, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6792d5ff, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_681a2300, /*tc_3stall*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_68cb12ce, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6aa5711a, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6ac37025, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6ebb4a12, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_6efc556e, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
- []>,
-
- InstrItinData <tc_73043bf4, /*tc_2early*/
- [InstrStage<1, [SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_746baa8e, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_74e47fd9, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7934b9df, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7a830544, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_7f881c76, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_84df2cd3, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_855b0b61, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_87735c3b, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_897d1a9d, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_8b15472a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_8fd5f294, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_8fe6b782, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_90f3e30c, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_976ddc4f, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_97743097, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_999d32db, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_99be14ca, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9c00ce8d, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9c98e8af, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9d5941c7, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9ef61e5c, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_9faf76ae, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_9fdb5406, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a21dc435, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a27582fa, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2],
- [Hex_FWD]>,
-
- InstrItinData <tc_a46f0df5, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a788683e, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a8acdac0, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_a904d137, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_adb14c66, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b13761ae, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
- InstrItinData <tc_b166348b, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b44c6e2a, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b77c481f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b7dd427e, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b9488031, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b9c0b731, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b9c4623f, /*tc_2*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bad2bcaf, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bcc96cee, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_be706f30, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c2f7d806, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c5e2426d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c6aa82f7, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c6ce9b3f, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c6ebf8dd, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c74f796f, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c82dc1ff, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_caaebcba, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cd7374a0, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cde8b071, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cf47a43f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cf59f215, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d088982c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d1090e34, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d24b2d85, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d580173f, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d6bf0472, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d9709180, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d9f95eef, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_daa058fa, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_dbdffe3d, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e0739b8c, /*tc_2early*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [2, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_e1e99bfa, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e216a5db, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e421e012, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e7624c08, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [3],
- [Hex_FWD]>,
-
- InstrItinData <tc_e7d02c66, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e913dc32, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e9c822f7, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
- [Hex_FWD]>,
-
- InstrItinData <tc_e9fae2d6, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_ST]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_ef52ed71, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_ef84f62f, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f2704b9a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f3eaa14b, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f47d212f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f49e76f4, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f7dd9c9f, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f86c328a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_f8eeed7a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_fcab4871, /*tc_newvjump*/
- [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST]>], [],
- []>,
-
- InstrItinData <tc_ff9ee76e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 3],
- [Hex_FWD, Hex_FWD]>
- ];
-}
-
class DepScalarItinV62 {
list<InstrItinData> DepScalarItinV62_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -2779,10 +1824,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2799,18 +1840,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -2839,10 +1872,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -2851,10 +1880,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -2867,10 +1892,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -2883,10 +1904,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_3*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -2995,10 +2012,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3015,10 +2028,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3027,6 +2036,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3051,14 +2064,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3095,6 +2100,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_2early*/
[InstrStage<1, [SLOT3]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3123,10 +2132,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3135,10 +2140,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3147,10 +2148,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3167,6 +2164,10 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_97743097, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
@@ -3243,10 +2244,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3275,10 +2272,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3375,10 +2368,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3391,10 +2380,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -3415,10 +2400,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3443,10 +2424,6 @@ class DepScalarItinV62 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3471,10 +2448,6 @@ class DepScalarItinV62 {
class DepScalarItinV65 {
list<InstrItinData> DepScalarItinV65_list = [
- InstrItinData <tc_0077f68c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
-
InstrItinData <tc_00afc57e, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3499,10 +2472,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a2b8c7c, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_0cd51c76, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3519,18 +2488,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_128f96e3, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1372bca1, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1432937d, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_14cd4cfa, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
@@ -3559,10 +2520,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c0005f9, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_1d5a38a8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3571,10 +2528,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20280784, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_234a11a5, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3587,10 +2540,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_29641329, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_2a160009, /*tc_2early*/
[InstrStage<1, [SLOT0]>], [],
[]>,
@@ -3603,10 +2552,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2e00db30, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
-
InstrItinData <tc_2f185f5c, /*tc_3*/
[InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3715,10 +2660,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_53173427, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_53bc8a6a, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3735,10 +2676,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56d25411, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_57288781, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3747,6 +2684,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_59a01ead, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_5acef64a, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3771,14 +2712,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_60d76817, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
-
- InstrItinData <tc_60f5738d, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_63fe3df7, /*tc_latepredldaia*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3815,6 +2748,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
+ InstrItinData <tc_6fa4db47, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_73043bf4, /*tc_1*/
[InstrStage<1, [SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -3843,10 +2780,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85523bcb, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_855b0b61, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3855,10 +2788,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88fa1a78, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_897d1a9d, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3867,10 +2796,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8bb285ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
-
InstrItinData <tc_8fd5f294, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3891,6 +2816,10 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_994333cd, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
InstrItinData <tc_999d32db, /*tc_3stall*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
@@ -3963,10 +2892,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b5a33b22, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_b77c481f, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -3995,10 +2920,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd90564c, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4095,10 +3016,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e1e0a2dc, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
-
InstrItinData <tc_e1e99bfa, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
@@ -4111,10 +3028,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e6b38e01, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_e7624c08, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
@@ -4135,10 +3048,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ef20db1c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_ef52ed71, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
@@ -4163,10 +3072,6 @@ class DepScalarItinV65 {
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f4f43fb5, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
InstrItinData <tc_f7dd9c9f, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index 6e16762ac0eb..b6824fa33106 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -991,7 +991,7 @@ def A2_roundsat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = round($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_c2f7d806, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000110;
let hasNewValue = 1;
@@ -3301,7 +3301,7 @@ def A5_ACS : HInst<
(outs DoubleRegs:$Rxx32, PredRegs:$Pe4),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)",
-tc_caaebcba, TypeM>, Enc_831a7d, Requires<[HasV55T]> {
+tc_caaebcba, TypeM>, Enc_831a7d, Requires<[HasV55]> {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010101;
@@ -3314,7 +3314,7 @@ def A5_vaddhubs : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vaddhub($Rss32,$Rtt32):sat",
-tc_2b6f77c6, TypeS_3op>, Enc_d2216a, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_3op>, Enc_d2216a, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001010;
@@ -3327,7 +3327,7 @@ def A6_vcmpbeq_notany : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = !any8(vcmpb.eq($Rss32,$Rtt32))",
-tc_55050d58, TypeALU64>, Enc_fcf7a7, Requires<[HasV65T]> {
+tc_55050d58, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -3336,7 +3336,7 @@ def A6_vminub_RdP : HInst<
(outs DoubleRegs:$Rdd32, PredRegs:$Pe4),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)",
-tc_ef84f62f, TypeM>, Enc_d2c7f1, Requires<[HasV62T]> {
+tc_ef84f62f, TypeM>, Enc_d2c7f1, Requires<[HasV62]> {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010111;
@@ -4059,7 +4059,7 @@ def F2_conv_d2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_d2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4069,7 +4069,7 @@ def F2_conv_d2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_d2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000010;
let hasNewValue = 1;
@@ -4081,7 +4081,7 @@ def F2_conv_df2d : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2d($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4091,7 +4091,7 @@ def F2_conv_df2d_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2d($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4101,7 +4101,7 @@ def F2_conv_df2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000000;
let hasNewValue = 1;
@@ -4113,7 +4113,7 @@ def F2_conv_df2ud : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2ud($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4123,7 +4123,7 @@ def F2_conv_df2ud_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2ud($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4133,7 +4133,7 @@ def F2_conv_df2uw : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2uw($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -4145,7 +4145,7 @@ def F2_conv_df2uw_chop : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2uw($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000101;
let hasNewValue = 1;
@@ -4157,7 +4157,7 @@ def F2_conv_df2w : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2w($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000100;
let hasNewValue = 1;
@@ -4169,7 +4169,7 @@ def F2_conv_df2w_chop : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2w($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000111;
let hasNewValue = 1;
@@ -4181,7 +4181,7 @@ def F2_conv_sf2d : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2d($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4191,7 +4191,7 @@ def F2_conv_sf2d_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2d($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4201,7 +4201,7 @@ def F2_conv_sf2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4211,7 +4211,7 @@ def F2_conv_sf2ud : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2ud($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4221,7 +4221,7 @@ def F2_conv_sf2ud_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2ud($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4231,7 +4231,7 @@ def F2_conv_sf2uw : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2uw($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011011;
let hasNewValue = 1;
@@ -4243,7 +4243,7 @@ def F2_conv_sf2uw_chop : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2uw($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001011011;
let hasNewValue = 1;
@@ -4255,7 +4255,7 @@ def F2_conv_sf2w : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2w($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011100;
let hasNewValue = 1;
@@ -4267,7 +4267,7 @@ def F2_conv_sf2w_chop : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2w($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001011100;
let hasNewValue = 1;
@@ -4279,7 +4279,7 @@ def F2_conv_ud2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_ud2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4289,7 +4289,7 @@ def F2_conv_ud2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_ud2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000001;
let hasNewValue = 1;
@@ -4301,7 +4301,7 @@ def F2_conv_uw2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_uw2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4311,7 +4311,7 @@ def F2_conv_uw2sf : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_uw2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011001;
let hasNewValue = 1;
@@ -4323,7 +4323,7 @@ def F2_conv_w2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_w2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4333,7 +4333,7 @@ def F2_conv_w2sf : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_w2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011010;
let hasNewValue = 1;
@@ -4345,7 +4345,7 @@ def F2_dfclass : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
"$Pd4 = dfclass($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_1f19b5, Requires<[HasV5T]> {
+tc_7a830544, TypeALU64>, Enc_1f19b5, Requires<[HasV5]> {
let Inst{4-2} = 0b100;
let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b11011100100;
@@ -4356,7 +4356,7 @@ def F2_dfcmpeq : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4368,7 +4368,7 @@ def F2_dfcmpge : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4380,7 +4380,7 @@ def F2_dfcmpgt : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4392,7 +4392,7 @@ def F2_dfcmpuo : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4404,7 +4404,7 @@ def F2_dfimm_n : HInst<
(outs DoubleRegs:$Rdd32),
(ins u10_0Imm:$Ii),
"$Rdd32 = dfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101100101;
let prefersSlot3 = 1;
@@ -4413,7 +4413,7 @@ def F2_dfimm_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins u10_0Imm:$Ii),
"$Rdd32 = dfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101100100;
let prefersSlot3 = 1;
@@ -4422,7 +4422,7 @@ def F2_sfadd : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfadd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011000;
@@ -4436,7 +4436,7 @@ def F2_sfclass : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Pd4 = sfclass($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_83ee64, Requires<[HasV5T]> {
+tc_7a830544, TypeS_2op>, Enc_83ee64, Requires<[HasV5]> {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000101111;
@@ -4447,7 +4447,7 @@ def F2_sfcmpeq : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.eq($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4459,7 +4459,7 @@ def F2_sfcmpge : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.ge($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4471,7 +4471,7 @@ def F2_sfcmpgt : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.gt($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4483,7 +4483,7 @@ def F2_sfcmpuo : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.uo($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4495,7 +4495,7 @@ def F2_sffixupd : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sffixupd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011110;
@@ -4507,7 +4507,7 @@ def F2_sffixupn : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sffixupn($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011110;
@@ -4519,7 +4519,7 @@ def F2_sffixupr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sffixupr($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011101;
let hasNewValue = 1;
@@ -4530,7 +4530,7 @@ def F2_sffma : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4544,7 +4544,7 @@ def F2_sffma_lib : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4558,7 +4558,7 @@ def F2_sffma_sc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
"$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
-tc_038a1342, TypeM>, Enc_437f33, Requires<[HasV5T]> {
+tc_038a1342, TypeM>, Enc_437f33, Requires<[HasV5]> {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111011;
@@ -4572,7 +4572,7 @@ def F2_sffms : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4586,7 +4586,7 @@ def F2_sffms_lib : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4600,7 +4600,7 @@ def F2_sfimm_n : HInst<
(outs IntRegs:$Rd32),
(ins u10_0Imm:$Ii),
"$Rd32 = sfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101011001;
let hasNewValue = 1;
@@ -4611,7 +4611,7 @@ def F2_sfimm_p : HInst<
(outs IntRegs:$Rd32),
(ins u10_0Imm:$Ii),
"$Rd32 = sfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5T]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101011000;
let hasNewValue = 1;
@@ -4622,7 +4622,7 @@ def F2_sfinvsqrta : HInst<
(outs IntRegs:$Rd32, PredRegs:$Pe4),
(ins IntRegs:$Rs32),
"$Rd32,$Pe4 = sfinvsqrta($Rs32)",
-tc_4d99bca9, TypeS_2op>, Enc_890909, Requires<[HasV5T]> {
+tc_4d99bca9, TypeS_2op>, Enc_890909, Requires<[HasV5]> {
let Inst{13-7} = 0b0000000;
let Inst{31-21} = 0b10001011111;
let hasNewValue = 1;
@@ -4634,7 +4634,7 @@ def F2_sfmax : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmax($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011100;
@@ -4648,7 +4648,7 @@ def F2_sfmin : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmin($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011100;
@@ -4662,7 +4662,7 @@ def F2_sfmpy : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmpy($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011010;
@@ -4676,7 +4676,7 @@ def F2_sfrecipa : HInst<
(outs IntRegs:$Rd32, PredRegs:$Pe4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
-tc_9c00ce8d, TypeM>, Enc_a94f3b, Requires<[HasV5T]> {
+tc_9c00ce8d, TypeM>, Enc_a94f3b, Requires<[HasV5]> {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011111;
@@ -4689,7 +4689,7 @@ def F2_sfsub : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfsub($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011000;
@@ -4698,6 +4698,44 @@ let opNewValue = 0;
let isFP = 1;
let Uses = [USR];
}
+def G4_tfrgcpp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins GuestRegs64:$Gss32),
+"$Rdd32 = $Gss32",
+tc_6fa4db47, TypeCR>, Enc_0aa344 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101000001;
+}
+def G4_tfrgcrr : HInst<
+(outs IntRegs:$Rd32),
+(ins GuestRegs:$Gs32),
+"$Rd32 = $Gs32",
+tc_6fa4db47, TypeCR>, Enc_44271f {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def G4_tfrgpcp : HInst<
+(outs GuestRegs64:$Gdd32),
+(ins DoubleRegs:$Rss32),
+"$Gdd32 = $Rss32",
+tc_994333cd, TypeCR>, Enc_ed5027 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def G4_tfrgrcr : HInst<
+(outs GuestRegs:$Gd32),
+(ins IntRegs:$Rs32),
+"$Gd32 = $Rs32",
+tc_994333cd, TypeCR>, Enc_621fba {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
def J2_call : HInst<
(outs),
(ins a30_2Imm:$Ii),
@@ -4905,7 +4943,7 @@ def J2_jumpf_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, b15_2Imm:$Ii),
"if (!$Pu4) jump $Ii",
-tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -4967,7 +5005,7 @@ def J2_jumpfpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if (!$Pu4) jump:t $Ii",
-tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60T]>, PredNewRel {
+tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b100;
let Inst{21-21} = 0b1;
@@ -5029,7 +5067,7 @@ def J2_jumprf_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) jumpr $Rs32",
-tc_e0739b8c, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5077,7 +5115,7 @@ def J2_jumprfpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) jumpr:t $Rs32",
-tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60T]>, PredNewRel {
+tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0100;
let Inst{31-21} = 0b01010011011;
@@ -5222,7 +5260,7 @@ def J2_jumprt_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) jumpr $Rs32",
-tc_e0739b8c, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5268,7 +5306,7 @@ def J2_jumprtpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) jumpr:t $Rs32",
-tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60T]>, PredNewRel {
+tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0100;
let Inst{31-21} = 0b01010011010;
@@ -5347,7 +5385,7 @@ def J2_jumpt_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, b15_2Imm:$Ii),
"if ($Pu4) jump $Ii",
-tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60T]> {
+tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5407,7 +5445,7 @@ def J2_jumptpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if ($Pu4) jump:t $Ii",
-tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60T]>, PredNewRel {
+tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b100;
let Inst{21-21} = 0b0;
@@ -5631,6 +5669,30 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0101010000000000;
let isSolo = 1;
}
+def J2_trap1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u8_0Imm:$Ii),
+"trap1($Rx32,#$Ii)",
+tc_59a01ead, TypeJ>, Enc_33f8ba {
+let Inst{1-0} = 0b00;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01010100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let Uses = [GOSP];
+let Defs = [GOSP, PC];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def J2_trap1_noregmap : HInst<
+(outs),
+(ins u8_0Imm:$Ii),
+"trap1(#$Ii)",
+tc_59a01ead, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
def J4_cmpeq_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
@@ -13334,7 +13396,7 @@ def L4_return_map_to_raw_f : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4) dealloc_return",
-tc_513bef45, TypeMAPPING>, Requires<[HasV65T]> {
+tc_513bef45, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13342,7 +13404,7 @@ def L4_return_map_to_raw_fnew_pnt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4.new) dealloc_return:nt",
-tc_395dc00f, TypeMAPPING>, Requires<[HasV65T]> {
+tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13350,7 +13412,7 @@ def L4_return_map_to_raw_fnew_pt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4.new) dealloc_return:t",
-tc_395dc00f, TypeMAPPING>, Requires<[HasV65T]> {
+tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13358,7 +13420,7 @@ def L4_return_map_to_raw_t : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4) dealloc_return",
-tc_3bc2c5d3, TypeMAPPING>, Requires<[HasV65T]> {
+tc_3bc2c5d3, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13366,7 +13428,7 @@ def L4_return_map_to_raw_tnew_pnt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4.new) dealloc_return:nt",
-tc_e7624c08, TypeMAPPING>, Requires<[HasV65T]> {
+tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13374,7 +13436,7 @@ def L4_return_map_to_raw_tnew_pt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4.new) dealloc_return:t",
-tc_e7624c08, TypeMAPPING>, Requires<[HasV65T]> {
+tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13528,7 +13590,7 @@ def L6_deallocframe_map_to_raw : HInst<
(outs),
(ins),
"deallocframe",
-tc_d1090e34, TypeMAPPING>, Requires<[HasV65T]> {
+tc_d1090e34, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13536,7 +13598,7 @@ def L6_return_map_to_raw : HInst<
(outs),
(ins),
"dealloc_return",
-tc_3d04548d, TypeMAPPING>, Requires<[HasV65T]> {
+tc_3d04548d, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -16916,7 +16978,7 @@ def M4_cmpyi_whc : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5T]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -16942,7 +17004,7 @@ def M4_cmpyr_whc : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5T]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -17295,7 +17357,7 @@ def M5_vdmacbsu : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c, Requires<[HasV5T]> {
+tc_e913dc32, TypeM>, Enc_88c16c, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -17307,7 +17369,7 @@ def M5_vdmpybsu : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825, Requires<[HasV5T]> {
+tc_8fd5f294, TypeM>, Enc_a56825, Requires<[HasV5]> {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -17402,7 +17464,7 @@ def M6_vabsdiffb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffb($Rtt32,$Rss32)",
-tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62T]> {
+tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000111;
@@ -17412,7 +17474,7 @@ def M6_vabsdiffub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffub($Rtt32,$Rss32)",
-tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62T]> {
+tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -18142,7 +18204,7 @@ def S2_asr_i_p_rnd : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asr($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Enc_5eac98, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_5eac98, Requires<[HasV5]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000000110;
let prefersSlot3 = 1;
@@ -18151,7 +18213,7 @@ def S2_asr_i_p_rnd_goodsyntax : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asrrnd($Rss32,#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
let isPseudo = 1;
}
def S2_asr_i_r : HInst<
@@ -25086,7 +25148,7 @@ def S5_asrhub_rnd_sat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
let Inst{7-5} = 0b100;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10001000011;
@@ -25099,7 +25161,7 @@ def S5_asrhub_rnd_sat_goodsyntax : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -25108,7 +25170,7 @@ def S5_asrhub_sat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):sat",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
let Inst{7-5} = 0b101;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10001000011;
@@ -25121,7 +25183,7 @@ def S5_popcountp : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = popcount($Rss32)",
-tc_00afc57e, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+tc_00afc57e, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -25132,7 +25194,7 @@ def S5_vasrhrnd : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vasrh($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_12b6e9, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Enc_12b6e9, Requires<[HasV5]> {
let Inst{7-5} = 0b000;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10000000001;
@@ -25142,14 +25204,14 @@ def S5_vasrhrnd_goodsyntax : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vasrh($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5T]> {
+tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
let isPseudo = 1;
}
def S6_allocframe_to_raw : HInst<
(outs),
(ins u11_3Imm:$Ii),
"allocframe(#$Ii)",
-tc_e216a5db, TypeMAPPING>, Requires<[HasV65T]> {
+tc_e216a5db, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -25157,7 +25219,7 @@ def S6_rol_i_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = rol($Rss32,#$Ii)",
-tc_55050d58, TypeS_2op>, Enc_5eac98, Requires<[HasV60T]> {
+tc_55050d58, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000000000;
}
@@ -25165,7 +25227,7 @@ def S6_rol_i_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 += rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -25175,7 +25237,7 @@ def S6_rol_i_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 &= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -25185,7 +25247,7 @@ def S6_rol_i_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 -= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -25195,7 +25257,7 @@ def S6_rol_i_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 |= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -25205,7 +25267,7 @@ def S6_rol_i_p_xacc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 ^= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010100;
let prefersSlot3 = 1;
@@ -25215,7 +25277,7 @@ def S6_rol_i_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = rol($Rs32,#$Ii)",
-tc_55050d58, TypeS_2op>, Enc_a05677, Requires<[HasV60T]> {
+tc_55050d58, TypeS_2op>, Enc_a05677, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100000;
@@ -25226,7 +25288,7 @@ def S6_rol_i_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 += rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -25239,7 +25301,7 @@ def S6_rol_i_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 &= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -25252,7 +25314,7 @@ def S6_rol_i_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 -= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -25265,7 +25327,7 @@ def S6_rol_i_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 |= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -25278,7 +25340,7 @@ def S6_rol_i_r_xacc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 ^= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110100;
@@ -25291,7 +25353,7 @@ def S6_vsplatrbp : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vsplatb($Rs32)",
-tc_be706f30, TypeS_2op>, Enc_3a3d62, Requires<[HasV62T]> {
+tc_be706f30, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100010;
}
@@ -25299,7 +25361,7 @@ def S6_vtrunehb_ppp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunehb($Rss32,$Rtt32)",
-tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62T]> {
+tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -25308,7 +25370,7 @@ def S6_vtrunohb_ppp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunohb($Rss32,$Rtt32)",
-tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62T]> {
+tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -26288,7 +26350,7 @@ def V6_ldntnt0 : HInst<
(outs HvxVR:$Vd32),
(ins IntRegs:$Rt32),
"$Vd32 = vmem($Rt32):nt",
-PSEUDO, TypeMAPPING>, Requires<[HasV62T]> {
+PSEUDO, TypeMAPPING>, Requires<[HasV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30301,7 +30363,7 @@ def V6_vasrhbrndsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30335,7 +30397,7 @@ def V6_vasrhubrndsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30357,7 +30419,7 @@ def V6_vasrhubsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30500,7 +30562,7 @@ def V6_vasrwh_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30522,7 +30584,7 @@ def V6_vasrwhrndsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30544,7 +30606,7 @@ def V6_vasrwhsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -30578,7 +30640,7 @@ def V6_vasrwuhsat_alt : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
"$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -36942,7 +37004,7 @@ def Y5_l2fetch : HInst<
(outs),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"l2fetch($Rs32,$Rtt32)",
-tc_daa058fa, TypeST>, Enc_e6abcf, Requires<[HasV5T]> {
+tc_daa058fa, TypeST>, Enc_e6abcf, Requires<[HasV5]> {
let Inst{7-0} = 0b00000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100110100;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td
index 7a156c39da9c..03c504ff0b08 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td
@@ -26,6 +26,7 @@ def J2_jumpf_nopred_mapAlias : InstAlias<"if (!$Pu4) jump $Ii", (J2_jumpf PredRe
def J2_jumprf_nopred_mapAlias : InstAlias<"if (!$Pu4) jumpr $Rs32", (J2_jumprf PredRegs:$Pu4, IntRegs:$Rs32)>;
def J2_jumprt_nopred_mapAlias : InstAlias<"if ($Pu4) jumpr $Rs32", (J2_jumprt PredRegs:$Pu4, IntRegs:$Rs32)>;
def J2_jumpt_nopred_mapAlias : InstAlias<"if ($Pu4) jump $Ii", (J2_jumpt PredRegs:$Pu4, b30_2Imm:$Ii)>;
+def J2_trap1_noregmapAlias : InstAlias<"trap1(#$Ii)", (J2_trap1 R0, u8_0Imm:$Ii)>;
def L2_loadalignb_zomapAlias : InstAlias<"$Ryy32 = memb_fifo($Rs32)", (L2_loadalignb_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
def L2_loadalignh_zomapAlias : InstAlias<"$Ryy32 = memh_fifo($Rs32)", (L2_loadalignh_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
def L2_loadbsw2_zomapAlias : InstAlias<"$Rd32 = membh($Rs32)", (L2_loadbsw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index 0f1b9a4733c5..557e6384be6a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -100,7 +100,7 @@ namespace llvm {
} // end namespace llvm
static cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden,
- cl::init(false), cl::desc("Enable branch probability info"));
+ cl::init(true), cl::desc("Enable branch probability info"));
static cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
cl::desc("Size limit in Hexagon early if-conversion"));
static cl::opt<bool> SkipExitBranches("eif-no-loop-exit", cl::init(false),
@@ -191,6 +191,7 @@ namespace {
bool isProfitable(const FlowPattern &FP) const;
bool isPredicableStore(const MachineInstr *MI) const;
bool isSafeToSpeculate(const MachineInstr *MI) const;
+ bool isPredicate(unsigned R) const;
unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const;
void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At,
@@ -207,7 +208,6 @@ namespace {
void removeBlock(MachineBasicBlock *B);
void eliminatePhis(MachineBasicBlock *B);
- void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB);
void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB);
void simplifyFlowGraph(const FlowPattern &FP);
@@ -238,11 +238,12 @@ bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
MachineLoop *L, FlowPattern &FP) {
- DEBUG(dbgs() << "Checking flow pattern at " << printMBBReference(*B) << "\n");
+ LLVM_DEBUG(dbgs() << "Checking flow pattern at " << printMBBReference(*B)
+ << "\n");
// Interested only in conditional branches, no .new, no new-value, etc.
// Check the terminators directly, it's easier than handling all responses
- // from AnalyzeBranch.
+ // from analyzeBranch.
MachineBasicBlock *TB = nullptr, *FB = nullptr;
MachineBasicBlock::const_iterator T1I = B->getFirstTerminator();
if (T1I == B->end())
@@ -325,17 +326,17 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
}
// Don't try to predicate loop preheaders.
if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
- DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB)
- << " is a loop preheader. Skipping.\n");
+ LLVM_DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB)
+ << " is a loop preheader. Skipping.\n");
return false;
}
FP = FlowPattern(B, PredR, TB, FB, JB);
- DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n");
+ LLVM_DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n");
return true;
}
-// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that
+// KLUDGE: HexagonInstrInfo::analyzeBranch won't work on a block that
// contains EH_LABEL.
bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const {
for (auto &I : *B)
@@ -344,7 +345,7 @@ bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const {
return false;
}
-// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize
+// KLUDGE: HexagonInstrInfo::analyzeBranch may be unable to recognize
// that a block can never fall-through.
bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B)
const {
@@ -367,7 +368,7 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
return false;
for (auto &MI : *B) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (MI.isConditionalBranch())
return false;
@@ -387,13 +388,8 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
unsigned R = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(R))
continue;
- switch (MRI->getRegClass(R)->getID()) {
- case Hexagon::PredRegsRegClassID:
- case Hexagon::HvxQRRegClassID:
- break;
- default:
- continue;
- }
+ if (!isPredicate(R))
+ continue;
for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
if (U->getParent()->isPHI())
return false;
@@ -443,8 +439,7 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
if (usesUndefVReg(&MI))
return false;
unsigned DefR = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI->getRegClass(DefR);
- if (RC == &Hexagon::PredRegsRegClass)
+ if (isPredicate(DefR))
return false;
}
}
@@ -500,7 +495,7 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
unsigned R = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(R))
continue;
- if (MRI->getRegClass(R) == &Hexagon::PredRegsRegClass)
+ if (isPredicate(R))
PredDefs++;
}
}
@@ -508,10 +503,21 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
}
bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
+ BranchProbability JumpProb(1, 10);
+ BranchProbability Prob(9, 10);
+ if (MBPI && FP.TrueB && !FP.FalseB &&
+ (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) < JumpProb ||
+ MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob))
+ return false;
+
+ if (MBPI && !FP.TrueB && FP.FalseB &&
+ (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) < JumpProb ||
+ MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob))
+ return false;
+
if (FP.TrueB && FP.FalseB) {
// Do not IfConvert if the branch is one sided.
if (MBPI) {
- BranchProbability Prob(9, 10);
if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob)
return false;
if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob)
@@ -546,8 +552,9 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
};
unsigned Spare = 0;
unsigned TotalIn = TotalCount(FP.TrueB, Spare) + TotalCount(FP.FalseB, Spare);
- DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
- << TotalIn << ", spare room: " << Spare << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Total number of instructions to be predicated/speculated: "
+ << TotalIn << ", spare room: " << Spare << "\n");
if (TotalIn >= SizeLimit+Spare)
return false;
@@ -574,12 +581,13 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
PredDefs += countPredicateDefs(SB);
}
}
- DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
- << TotalPh << "\n");
+ LLVM_DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
+ << TotalPh << "\n");
if (TotalIn+TotalPh >= SizeLimit+Spare)
return false;
- DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n");
+ LLVM_DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs
+ << "\n");
if (PredDefs > 4)
return false;
@@ -620,11 +628,11 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
return Changed;
if (!isValid(FP)) {
- DEBUG(dbgs() << "Conversion is not valid\n");
+ LLVM_DEBUG(dbgs() << "Conversion is not valid\n");
return Changed;
}
if (!isProfitable(FP)) {
- DEBUG(dbgs() << "Conversion is not profitable\n");
+ LLVM_DEBUG(dbgs() << "Conversion is not profitable\n");
return Changed;
}
@@ -635,8 +643,9 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) {
MachineBasicBlock *HB = L ? L->getHeader() : nullptr;
- DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB)
- : dbgs() << "Visiting function") << "\n");
+ LLVM_DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB)
+ : dbgs() << "Visiting function")
+ << "\n");
bool Changed = false;
if (L) {
for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
@@ -680,10 +689,18 @@ bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI)
return false;
if (MI->hasUnmodeledSideEffects())
return false;
+ if (MI->getOpcode() == TargetOpcode::LIFETIME_END)
+ return false;
return true;
}
+bool HexagonEarlyIfConversion::isPredicate(unsigned R) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(R);
+ return RC == &Hexagon::PredRegsRegClass ||
+ RC == &Hexagon::HvxQRRegClass;
+}
+
unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc,
bool IfTrue) const {
return HII->getCondOpcode(Opc, !IfTrue);
@@ -745,7 +762,7 @@ void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
unsigned PredR, bool IfTrue) {
- DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n");
+ LLVM_DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n");
MachineBasicBlock::iterator End = FromB->getFirstTerminator();
MachineBasicBlock::iterator I, NextI;
@@ -765,9 +782,11 @@ unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B,
unsigned Opc = 0;
switch (DRC->getID()) {
case Hexagon::IntRegsRegClassID:
+ case Hexagon::IntRegsLow8RegClassID:
Opc = Hexagon::C2_mux;
break;
case Hexagon::DoubleRegsRegClassID:
+ case Hexagon::GeneralDoubleLow8RegsRegClassID:
Opc = Hexagon::PS_pselect;
break;
case Hexagon::HvxVRRegClassID:
@@ -935,7 +954,7 @@ void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
}
void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
- DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
// Transfer the immediate dominator information from B to its descendants.
MachineDomTreeNode *N = MDT->getNode(B);
@@ -965,7 +984,7 @@ void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
}
void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
- DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
+ LLVM_DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI();
for (I = B->begin(); I != NonPHI; I = NextI) {
NextI = std::next(I);
@@ -990,34 +1009,16 @@ void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
}
}
-void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB,
- MachineBasicBlock *NewB) {
- for (auto I = OldB->succ_begin(), E = OldB->succ_end(); I != E; ++I) {
- MachineBasicBlock *SB = *I;
- MachineBasicBlock::iterator P, N = SB->getFirstNonPHI();
- for (P = SB->begin(); P != N; ++P) {
- MachineInstr &PN = *P;
- for (MachineOperand &MO : PN.operands())
- if (MO.isMBB() && MO.getMBB() == OldB)
- MO.setMBB(NewB);
- }
- }
-}
-
void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB,
MachineBasicBlock *SuccB) {
- DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and "
- << PrintMB(SuccB) << "\n");
+ LLVM_DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and "
+ << PrintMB(SuccB) << "\n");
bool TermOk = hasUncondBranch(SuccB);
eliminatePhis(SuccB);
HII->removeBranch(*PredB);
PredB->removeSuccessor(SuccB);
PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end());
- MachineBasicBlock::succ_iterator I, E = SuccB->succ_end();
- for (I = SuccB->succ_begin(); I != E; ++I)
- PredB->addSuccessor(*I);
- PredB->normalizeSuccProbs();
- replacePhiEdges(SuccB, PredB);
+ PredB->transferSuccessorsAndUpdatePHIs(SuccB);
removeBlock(SuccB);
if (!TermOk)
PredB->updateTerminator();
@@ -1039,7 +1040,7 @@ void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) {
// By now, the split block has only one successor (SB), and SB has only
// one predecessor. We can try to merge them. We will need to update ter-
- // minators in FP.Split+SB, and that requires working AnalyzeBranch, which
+ // minators in FP.Split+SB, and that requires working analyzeBranch, which
// fails on Hexagon for blocks that have EH_LABELs. However, if SB ends
// with an unconditional branch, we won't need to touch the terminators.
if (!hasEHLabel(SB) || hasUncondBranch(SB))
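Note on the HexagonEarlyIfConv.cpp hunks above: with -enable-hexagon-br-prob now defaulting to true, isProfitable() additionally rejects one-sided flow patterns whose taken-edge probability falls outside the window spanned by BranchProbability(1, 10) and BranchProbability(9, 10). The short standalone C++ sketch below illustrates only that window check; it uses plain doubles and a hypothetical worthIfConverting() helper, not the MachineBranchProbabilityInfo API the pass actually queries.

// Sketch only: thresholds 0.1 and 0.9 mirror BranchProbability(1,10) and
// BranchProbability(9,10) from the hunk above; all names are hypothetical.
#include <cstdio>

static bool worthIfConverting(double takenEdgeProb) {
  const double JumpProb = 0.1; // edge almost never taken: keep the branch
  const double Prob = 0.9;     // edge almost always taken: keep the branch
  return takenEdgeProb >= JumpProb && takenEdgeProb <= Prob;
}

int main() {
  std::printf("p=0.05 -> %d\n", worthIfConverting(0.05)); // 0: skip conversion
  std::printf("p=0.50 -> %d\n", worthIfConverting(0.50)); // 1: worth predicating
  std::printf("p=0.95 -> %d\n", worthIfConverting(0.95)); // 0: skip conversion
  return 0;
}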
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index c2feaf5737b2..7e774674e0c0 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -316,8 +316,10 @@ void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
auto KillAt = [this,Reg] (SlotIndex K, LaneBitmask LM) -> void {
// Set the <kill> flag on a use of Reg whose lane mask is contained in LM.
MachineInstr *MI = LIS->getInstructionFromIndex(K);
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg)
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg ||
+ MI->isRegTiedToDefOperand(i))
continue;
LaneBitmask SLM = getLaneMask(Reg, Op.getSubReg());
if ((SLM & LM) == SLM) {
@@ -497,14 +499,18 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
if (!Op.isReg() || !DefRegs.count(Op))
continue;
if (Op.isDef()) {
- ImpUses.insert({Op, i});
+ // Tied defs will always have corresponding uses, so no extra
+ // implicit uses are needed.
+ if (!Op.isTied())
+ ImpUses.insert({Op, i});
} else {
// This function can be called for the same register with different
// lane masks. If the def in this instruction was for the whole
// register, we can get here more than once. Avoid adding multiple
// implicit uses (or adding an implicit use when an explicit one is
// present).
- ImpUses.erase(Op);
+ if (Op.isTied())
+ ImpUses.erase(Op);
}
}
if (ImpUses.empty())
@@ -545,7 +551,14 @@ void HexagonExpandCondsets::removeInstr(MachineInstr &MI) {
void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet,
bool Recalc, bool UpdateKills, bool UpdateDeads) {
UpdateKills |= UpdateDeads;
- for (auto R : RegSet) {
+ for (unsigned R : RegSet) {
+ if (!TargetRegisterInfo::isVirtualRegister(R)) {
+ assert(TargetRegisterInfo::isPhysicalRegister(R));
+ // There shouldn't be any physical registers as operands, except
+ // possibly reserved registers.
+ assert(MRI->isReserved(R));
+ continue;
+ }
if (Recalc)
recalculateLiveInterval(R);
if (UpdateKills)
@@ -641,7 +654,7 @@ MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
.add(SrcOp);
}
- DEBUG(dbgs() << "created an initial copy: " << *MIB);
+ LLVM_DEBUG(dbgs() << "created an initial copy: " << *MIB);
return &*MIB;
}
@@ -654,8 +667,8 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
return false;
TfrCounter++;
}
- DEBUG(dbgs() << "\nsplitting " << printMBBReference(*MI.getParent()) << ": "
- << MI);
+ LLVM_DEBUG(dbgs() << "\nsplitting " << printMBBReference(*MI.getParent())
+ << ": " << MI);
MachineOperand &MD = MI.getOperand(0); // Definition
MachineOperand &MP = MI.getOperand(1); // Predicate register
assert(MD.isDef());
@@ -932,8 +945,8 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
unsigned Opc = TfrI.getOpcode();
(void)Opc;
assert(Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf);
- DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
- << ": " << TfrI);
+ LLVM_DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
+ << ": " << TfrI);
MachineOperand &MD = TfrI.getOperand(0);
MachineOperand &MP = TfrI.getOperand(1);
@@ -954,7 +967,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
if (!DefI || !isPredicable(DefI))
return false;
- DEBUG(dbgs() << "Source def: " << *DefI);
+ LLVM_DEBUG(dbgs() << "Source def: " << *DefI);
// Collect the information about registers defined and used between the
// DefI and the TfrI.
@@ -1039,8 +1052,8 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
if (!canMoveMemTo(*DefI, TfrI, true))
CanDown = false;
- DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
- << ", can move down: " << (CanDown ? "yes\n" : "no\n"));
+ LLVM_DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
+ << ", can move down: " << (CanDown ? "yes\n" : "no\n"));
MachineBasicBlock::iterator PastDefIt = std::next(DefIt);
if (CanUp)
predicateAt(MD, *DefI, PastDefIt, MP, Cond, UpdRegs);
@@ -1135,10 +1148,10 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
return false;
bool Overlap = L1.overlaps(L2);
- DEBUG(dbgs() << "compatible registers: ("
- << (Overlap ? "overlap" : "disjoint") << ")\n "
- << printReg(R1.Reg, TRI, R1.Sub) << " " << L1 << "\n "
- << printReg(R2.Reg, TRI, R2.Sub) << " " << L2 << "\n");
+ LLVM_DEBUG(dbgs() << "compatible registers: ("
+ << (Overlap ? "overlap" : "disjoint") << ")\n "
+ << printReg(R1.Reg, TRI, R1.Sub) << " " << L1 << "\n "
+ << printReg(R2.Reg, TRI, R2.Sub) << " " << L2 << "\n");
if (R1.Sub || R2.Sub)
return false;
if (Overlap)
@@ -1171,7 +1184,7 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
LIS->removeInterval(R2.Reg);
updateKillFlags(R1.Reg);
- DEBUG(dbgs() << "coalesced: " << L1 << "\n");
+ LLVM_DEBUG(dbgs() << "coalesced: " << L1 << "\n");
L1.verify();
return true;
@@ -1252,8 +1265,8 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
LIS = &getAnalysis<LiveIntervals>();
MRI = &MF.getRegInfo();
- DEBUG(LIS->print(dbgs() << "Before expand-condsets\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(LIS->print(dbgs() << "Before expand-condsets\n",
+ MF.getFunction().getParent()));
bool Changed = false;
std::set<unsigned> CoalUpd, PredUpd;
@@ -1280,8 +1293,8 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
if (!CoalUpd.count(Op.getReg()))
KillUpd.insert(Op.getReg());
updateLiveness(KillUpd, false, true, false);
- DEBUG(LIS->print(dbgs() << "After coalescing\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(
+ LIS->print(dbgs() << "After coalescing\n", MF.getFunction().getParent()));
// First, simply split all muxes into a pair of conditional transfers
// and update the live intervals to reflect the new arrangement. The
@@ -1297,8 +1310,8 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
// predication, and after splitting they are difficult to recalculate
// (because of predicated defs), so make sure they are left untouched.
// Predication does not use live intervals.
- DEBUG(LIS->print(dbgs() << "After splitting\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(
+ LIS->print(dbgs() << "After splitting\n", MF.getFunction().getParent()));
// Traverse all blocks and collapse predicable instructions feeding
// conditional transfers into predicated instructions.
@@ -1306,13 +1319,13 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
// cases that were not created in the previous step.
for (auto &B : MF)
Changed |= predicateInBlock(B, PredUpd);
- DEBUG(LIS->print(dbgs() << "After predicating\n",
- MF.getFunction().getParent()));
+ LLVM_DEBUG(LIS->print(dbgs() << "After predicating\n",
+ MF.getFunction().getParent()));
PredUpd.insert(CoalUpd.begin(), CoalUpd.end());
updateLiveness(PredUpd, true, true, true);
- DEBUG({
+ LLVM_DEBUG({
if (Changed)
LIS->print(dbgs() << "After expand-condsets\n",
MF.getFunction().getParent());
@@ -1324,7 +1337,6 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-
FunctionPass *llvm::createHexagonExpandCondsets() {
return new HexagonExpandCondsets();
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index a842b672736c..e9067e2285a8 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/PassSupport.h"
using namespace llvm;
@@ -59,12 +60,12 @@ namespace {
}
private:
- /// \brief Check the offset between each loop instruction and
+ /// Check the offset between each loop instruction and
/// the loop basic block to determine if we can use the LOOP instruction
/// or if we need to set the LC/SA registers explicitly.
bool fixupLoopInstrs(MachineFunction &MF);
- /// \brief Replace loop instruction with the constant extended
+ /// Replace loop instruction with the constant extended
/// version if the loop label is too far from the loop instruction.
void useExtLoopInstr(MachineFunction &MF,
MachineBasicBlock::iterator &MII);
@@ -80,7 +81,7 @@ FunctionPass *llvm::createHexagonFixupHwLoops() {
return new HexagonFixupHwLoops();
}
-/// \brief Returns true if the instruction is a hardware loop instruction.
+/// Returns true if the instruction is a hardware loop instruction.
static bool isHardwareLoop(const MachineInstr &MI) {
return MI.getOpcode() == Hexagon::J2_loop0r ||
MI.getOpcode() == Hexagon::J2_loop0i ||
@@ -94,7 +95,7 @@ bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) {
return fixupLoopInstrs(MF);
}
-/// \brief For Hexagon, if the loop label is to far from the
+/// For Hexagon, if the loop label is too far from the
/// loop instruction then we need to set the LC0 and SA0 registers
/// explicitly instead of using LOOP(start,count). This function
/// checks the distance, and generates register assignments if needed.
@@ -137,7 +138,7 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
MachineBasicBlock::iterator MII = MBB.begin();
MachineBasicBlock::iterator MIE = MBB.end();
while (MII != MIE) {
- InstOffset += HII->getSize(*MII);
+ unsigned InstSize = HII->getSize(*MII);
if (MII->isMetaInstruction()) {
++MII;
continue;
@@ -145,8 +146,10 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
if (isHardwareLoop(*MII)) {
assert(MII->getOperand(0).isMBB() &&
"Expect a basic block as loop operand");
- int diff = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()];
- if ((unsigned)abs(diff) > MaxLoopRange) {
+ MachineBasicBlock *TargetBB = MII->getOperand(0).getMBB();
+ unsigned Diff = AbsoluteDifference(InstOffset,
+ BlockToInstOffset[TargetBB]);
+ if (Diff > MaxLoopRange) {
useExtLoopInstr(MF, MII);
MII = MBB.erase(MII);
Changed = true;
@@ -156,13 +159,14 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
} else {
++MII;
}
+ InstOffset += InstSize;
}
}
return Changed;
}
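As a reading aid for the hunk above: the old code formed a signed difference and called abs(), which can wrap or misreport once the unsigned offsets grow large, while llvm::AbsoluteDifference (from llvm/Support/MathExtras.h, included at the top of this file) works entirely in unsigned arithmetic. The sketch below re-implements the helper locally and uses made-up offset values, so it is illustrative only.

#include <algorithm>
#include <cassert>

// Local stand-in for llvm::AbsoluteDifference: |X - Y| computed on unsigned
// values without ever forming a signed intermediate difference.
static unsigned absoluteDifference(unsigned X, unsigned Y) {
  return std::max(X, Y) - std::min(X, Y);
}

int main() {
  // Hypothetical byte offsets; with values this far apart, the old
  // "int diff = A - B; abs(diff)" pattern would wrap through a signed int
  // and report a smaller distance than the real one.
  unsigned InstOffset = 0x80000010u;
  unsigned TargetBlockOffset = 0x00000004u;
  unsigned MaxLoopRange = 1u << 23;              // hypothetical range limit
  assert(absoluteDifference(InstOffset, TargetBlockOffset) > MaxLoopRange);
  assert(absoluteDifference(TargetBlockOffset, InstOffset) ==
         absoluteDifference(InstOffset, TargetBlockOffset));
  return 0;
}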
-/// \brief Replace loop instructions with the constant extended version.
+/// Replace loop instructions with the constant extended version.
void HexagonFixupHwLoops::useExtLoopInstr(MachineFunction &MF,
MachineBasicBlock::iterator &MII) {
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 65a2fc35b11b..97b02e2b34cb 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -442,7 +442,7 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
if (needsStackFrame(I, CSR, HRI))
SFBlocks.push_back(&I);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Blocks needing SF: {";
for (auto &B : SFBlocks)
dbgs() << " " << printMBBReference(*B);
@@ -465,7 +465,7 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
if (!PDomB)
break;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Computed dom block: ";
if (DomB)
dbgs() << printMBBReference(*DomB);
@@ -483,11 +483,11 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
// Make sure that DomB dominates PDomB and PDomB post-dominates DomB.
if (!MDT.dominates(DomB, PDomB)) {
- DEBUG(dbgs() << "Dom block does not dominate pdom block\n");
+ LLVM_DEBUG(dbgs() << "Dom block does not dominate pdom block\n");
return;
}
if (!MPT.dominates(PDomB, DomB)) {
- DEBUG(dbgs() << "PDom block does not post-dominate dom block\n");
+ LLVM_DEBUG(dbgs() << "PDom block does not post-dominate dom block\n");
return;
}
@@ -1396,7 +1396,7 @@ static void dump_registers(BitVector &Regs, const TargetRegisterInfo &TRI) {
bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const {
- DEBUG(dbgs() << __func__ << " on " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << __func__ << " on " << MF.getName() << '\n');
MachineFrameInfo &MFI = MF.getFrameInfo();
BitVector SRegs(Hexagon::NUM_TARGET_REGS);
@@ -1406,15 +1406,16 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
// (1) For each callee-saved register, add that register and all of its
// sub-registers to SRegs.
- DEBUG(dbgs() << "Initial CS registers: {");
+ LLVM_DEBUG(dbgs() << "Initial CS registers: {");
for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
unsigned R = CSI[i].getReg();
- DEBUG(dbgs() << ' ' << printReg(R, TRI));
+ LLVM_DEBUG(dbgs() << ' ' << printReg(R, TRI));
for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR)
SRegs[*SR] = true;
}
- DEBUG(dbgs() << " }\n");
- DEBUG(dbgs() << "SRegs.1: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " }\n");
+ LLVM_DEBUG(dbgs() << "SRegs.1: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// (2) For each reserved register, remove that register and all of its
// sub- and super-registers from SRegs.
@@ -1424,8 +1425,10 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
for (MCSuperRegIterator SR(R, TRI, true); SR.isValid(); ++SR)
SRegs[*SR] = false;
}
- DEBUG(dbgs() << "Res: "; dump_registers(Reserved, *TRI); dbgs() << "\n");
- DEBUG(dbgs() << "SRegs.2: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Res: "; dump_registers(Reserved, *TRI);
+ dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "SRegs.2: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// (3) Collect all registers that have at least one sub-register in SRegs,
// and also have no sub-registers that are reserved. These will be the can-
@@ -1446,11 +1449,13 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
break;
}
}
- DEBUG(dbgs() << "TmpSup: "; dump_registers(TmpSup, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "TmpSup: "; dump_registers(TmpSup, *TRI);
+ dbgs() << "\n");
// (4) Include all super-registers found in (3) into SRegs.
SRegs |= TmpSup;
- DEBUG(dbgs() << "SRegs.4: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "SRegs.4: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// (5) For each register R in SRegs, if any super-register of R is in SRegs,
// remove R from SRegs.
@@ -1463,7 +1468,8 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
break;
}
}
- DEBUG(dbgs() << "SRegs.5: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "SRegs.5: "; dump_registers(SRegs, *TRI);
+ dbgs() << "\n");
// Now, for each register that has a fixed stack slot, create the stack
// object for it.
@@ -1501,7 +1507,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
SRegs[R] = false;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "CS information: {";
for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
int FI = CSI[i].getFrameIdx();
@@ -1706,11 +1712,6 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
for (auto R = B.begin(); R != It; ++R) {
Clobbers.clear();
LPR.stepForward(*R, Clobbers);
- // Dead defs are recorded in Clobbers, but are not automatically removed
- // from the live set.
- for (auto &C : Clobbers)
- if (C.second->isReg() && C.second->isDead())
- LPR.removeReg(C.first);
}
DebugLoc DL = MI->getDebugLoc();
@@ -1867,11 +1868,11 @@ bool HexagonFrameLowering::expandSpillMacros(MachineFunction &MF,
Changed |= expandCopy(B, I, MRI, HII, NewRegs);
break;
case Hexagon::STriw_pred:
- case Hexagon::STriw_mod:
+ case Hexagon::STriw_ctr:
Changed |= expandStoreInt(B, I, MRI, HII, NewRegs);
break;
case Hexagon::LDriw_pred:
- case Hexagon::LDriw_mod:
+ case Hexagon::LDriw_ctr:
Changed |= expandLoadInt(B, I, MRI, HII, NewRegs);
break;
case Hexagon::PS_vstorerq_ai:
@@ -1914,7 +1915,7 @@ void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (OptimizeSpillSlots && !isOptNone(MF))
optimizeSpillSlots(MF, NewRegs);
- // We need to reserve a a spill slot if scavenging could potentially require
+ // We need to reserve a spill slot if scavenging could potentially require
// spilling a scavenged register.
if (!NewRegs.empty() || mayOverflowFrameOffset(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -2026,8 +2027,8 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
auto P = BlockIndexes.insert(
std::make_pair(&B, HexagonBlockRanges::InstrIndexMap(B)));
auto &IndexMap = P.first->second;
- DEBUG(dbgs() << "Index map for " << printMBBReference(B) << "\n"
- << IndexMap << '\n');
+ LLVM_DEBUG(dbgs() << "Index map for " << printMBBReference(B) << "\n"
+ << IndexMap << '\n');
for (auto &In : B) {
int LFI, SFI;
@@ -2134,7 +2135,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
}
}
- DEBUG({
+ LLVM_DEBUG({
for (auto &P : FIRangeMap) {
dbgs() << "fi#" << P.first;
if (BadFIs.count(P.first))
@@ -2173,7 +2174,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
}
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Block-to-FI map (* -- live-on-exit):\n";
for (auto &P : BlockFIMap) {
auto &FIs = P.second;
@@ -2200,16 +2201,16 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
HexagonBlockRanges::InstrIndexMap &IM = F->second;
HexagonBlockRanges::RegToRangeMap LM = HBR.computeLiveMap(IM);
HexagonBlockRanges::RegToRangeMap DM = HBR.computeDeadMap(IM, LM);
- DEBUG(dbgs() << printMBBReference(B) << " dead map\n"
- << HexagonBlockRanges::PrintRangeMap(DM, HRI));
+ LLVM_DEBUG(dbgs() << printMBBReference(B) << " dead map\n"
+ << HexagonBlockRanges::PrintRangeMap(DM, HRI));
for (auto FI : BlockFIMap[&B]) {
if (BadFIs.count(FI))
continue;
- DEBUG(dbgs() << "Working on fi#" << FI << '\n');
+ LLVM_DEBUG(dbgs() << "Working on fi#" << FI << '\n');
HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
for (auto &Range : RL) {
- DEBUG(dbgs() << "--Examining range:" << RL << '\n');
+ LLVM_DEBUG(dbgs() << "--Examining range:" << RL << '\n');
if (!IndexType::isInstr(Range.start()) ||
!IndexType::isInstr(Range.end()))
continue;
@@ -2224,7 +2225,8 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI, MF);
// The this-> is needed to unconfuse MSVC.
unsigned FoundR = this->findPhysReg(MF, Range, IM, DM, RC);
- DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI)
+ << '\n');
if (FoundR == 0)
continue;
#ifndef NDEBUG
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGatherPacketize.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGatherPacketize.cpp
index 253f09d12839..63ec9c3d3124 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGatherPacketize.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGatherPacketize.cpp
@@ -62,7 +62,7 @@ bool HexagonGatherPacketize::runOnMachineFunction(MachineFunction &Fn) {
if (!EnableGatherPacketize)
return false;
auto &ST = Fn.getSubtarget<HexagonSubtarget>();
- bool HasV65 = ST.hasV65TOps();
+ bool HasV65 = ST.hasV65Ops();
bool UseHVX = ST.useHVXOps();
if (!(HasV65 & UseHVX))
return false;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index c1841d735b8c..2582a021e956 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -55,6 +55,12 @@ static cl::opt<unsigned> VRegDistCutoff("insert-dist-cutoff", cl::init(30U),
cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert "
"generation."));
+// Limit the container sizes to avoid running out of memory in extreme cases.
+static cl::opt<unsigned> MaxORLSize("insert-max-orl", cl::init(4096),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Maximum size of OrderedRegisterList"));
+static cl::opt<unsigned> MaxIFMSize("insert-max-ifmap", cl::init(1024),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Maximum size of IFMap"));
+
static cl::opt<bool> OptTiming("insert-timing", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable timing of insert generation"));
static cl::opt<bool> OptTimingDetail("insert-timing-detail", cl::init(false),
@@ -86,6 +92,7 @@ namespace {
struct RegisterSet : private BitVector {
RegisterSet() = default;
explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+ RegisterSet(const RegisterSet &RS) : BitVector(RS) {}
using BitVector::clear;
@@ -370,9 +377,11 @@ namespace {
class OrderedRegisterList {
using ListType = std::vector<unsigned>;
+ const unsigned MaxSize;
public:
- OrderedRegisterList(const RegisterOrdering &RO) : Ord(RO) {}
+ OrderedRegisterList(const RegisterOrdering &RO)
+ : MaxSize(MaxORLSize), Ord(RO) {}
void insert(unsigned VR);
void remove(unsigned VR);
@@ -433,12 +442,17 @@ void OrderedRegisterList::insert(unsigned VR) {
Seq.push_back(VR);
else
Seq.insert(L, VR);
+
+ unsigned S = Seq.size();
+ if (S > MaxSize)
+ Seq.resize(MaxSize);
+ assert(Seq.size() <= MaxSize);
}
void OrderedRegisterList::remove(unsigned VR) {
iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
- assert(L != Seq.end());
- Seq.erase(L);
+ if (L != Seq.end())
+ Seq.erase(L);
}
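A self-contained sketch of the clamping behaviour the two hunks above introduce (the cap is exposed as the insert-max-orl option declared earlier in this file; insert-max-ifmap similarly bounds the IFMap later on): inserts keep the list sorted but never let it grow past a fixed cap, and removals tolerate registers that an earlier clamp already dropped. The class, comparator, and cap below are illustrative stand-ins, not the pass's real OrderedRegisterList.

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrative bounded, sorted list of virtual registers.
class BoundedRegList {
  std::vector<unsigned> Seq;
  const unsigned MaxSize;
public:
  explicit BoundedRegList(unsigned Cap) : MaxSize(Cap) {}

  void insert(unsigned VR) {
    auto L = std::lower_bound(Seq.begin(), Seq.end(), VR);
    Seq.insert(L, VR);
    // Clamp to the cap, dropping the largest entries, as the pass now does
    // to bound memory use on pathological inputs.
    if (Seq.size() > MaxSize)
      Seq.resize(MaxSize);
  }

  void remove(unsigned VR) {
    auto L = std::lower_bound(Seq.begin(), Seq.end(), VR);
    // The entry may have been dropped by an earlier clamp, so tolerate misses.
    if (L != Seq.end() && *L == VR)
      Seq.erase(L);
  }

  size_t size() const { return Seq.size(); }
};

int main() {
  BoundedRegList L(2);
  L.insert(30); L.insert(10); L.insert(20); // 30 falls off the clamped tail
  assert(L.size() == 2);
  L.remove(30);                             // no-op: already dropped
  assert(L.size() == 2);
  return 0;
}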
namespace {
@@ -618,7 +632,7 @@ void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
SortableVectorType VRs;
for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I)
VRs.push_back(I->first);
- std::sort(VRs.begin(), VRs.end(), LexCmp);
+ llvm::sort(VRs.begin(), VRs.end(), LexCmp);
// Transfer the results to the outgoing register ordering.
for (unsigned i = 0, n = VRs.size(); i < n; ++i)
RO.insert(std::make_pair(VRs[i], i));
@@ -950,6 +964,9 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
continue;
findRecordInsertForms(VR, AVs);
+ // Stop if the map size is too large.
+ if (IFMap.size() > MaxIFMSize)
+ return;
}
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
index 5a001d6ed9c1..e5af96468af1 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -40,6 +40,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
@@ -56,6 +57,11 @@ namespace llvm {
} // end namespace llvm
+// Initialize this to 0 to always prefer generating mux by default.
+static cl::opt<unsigned> MinPredDist("hexagon-gen-mux-threshold", cl::Hidden,
+ cl::init(0), cl::desc("Minimum distance between predicate definition and "
+ "farther of the two predicated uses"));
+
namespace {
class HexagonGenMux : public MachineFunctionPass {
@@ -269,11 +275,13 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
// There is now a complete definition of DR, i.e. we have the predicate
// register, the definition if-true, and definition if-false.
- // First, check if both definitions are far enough from the definition
+ // First, check if the definitions are far enough from the definition
// of the predicate register.
unsigned MinX = std::min(CI.TrueX, CI.FalseX);
unsigned MaxX = std::max(CI.TrueX, CI.FalseX);
- unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0;
+ // Specifically, check if the predicate definition is within a prescribed
+ // distance from the farther of the two predicated instructions.
+ unsigned SearchX = (MaxX >= MinPredDist) ? MaxX-MinPredDist : 0;
bool NearDef = false;
for (unsigned X = SearchX; X < MaxX; ++X) {
const DefUseInfo &DU = DUM.lookup(X);
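The window arithmetic in the hunk above can be read in isolation: MaxX is the index of the farther of the two predicated uses, MinPredDist is the hexagon-gen-mux-threshold option (default 0), and the scan for a nearby predicate definition starts at MaxX - MinPredDist, saturated at 0. The sketch below models only that arithmetic with hypothetical indices, not the pass's DefUseInfo bookkeeping.

#include <cassert>

// First index to scan for a "too close" predicate definition; the
// subtraction saturates at 0 instead of wrapping.
static unsigned firstIndexToScan(unsigned MaxX, unsigned MinPredDist) {
  return MaxX >= MinPredDist ? MaxX - MinPredDist : 0;
}

int main() {
  assert(firstIndexToScan(10, 0) == 10); // empty window: mux is always preferred
  assert(firstIndexToScan(10, 4) == 6);  // scan the 4 slots just below MaxX
  assert(firstIndexToScan(2, 4) == 0);   // saturates instead of wrapping
  return 0;
}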
@@ -348,7 +356,7 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
return false;
};
for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) {
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// This isn't 100% accurate, but it's safe.
// It won't detect (as a kill) a case like this
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 9288ed03d4d2..c0d2de90467a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -222,13 +222,12 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
}
void HexagonGenPredicate::processPredicateGPR(const Register &Reg) {
- DEBUG(dbgs() << __func__ << ": "
- << printReg(Reg.R, TRI, Reg.S) << "\n");
+ LLVM_DEBUG(dbgs() << __func__ << ": " << printReg(Reg.R, TRI, Reg.S) << "\n");
using use_iterator = MachineRegisterInfo::use_iterator;
use_iterator I = MRI->use_begin(Reg.R), E = MRI->use_end();
if (I == E) {
- DEBUG(dbgs() << "Dead reg: " << printReg(Reg.R, TRI, Reg.S) << '\n');
+ LLVM_DEBUG(dbgs() << "Dead reg: " << printReg(Reg.R, TRI, Reg.S) << '\n');
MachineInstr *DefI = MRI->getVRegDef(Reg.R);
DefI->eraseFromParent();
return;
@@ -250,7 +249,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
if (F != G2P.end())
return F->second;
- DEBUG(dbgs() << __func__ << ": " << PrintRegister(Reg, *TRI));
+ LLVM_DEBUG(dbgs() << __func__ << ": " << PrintRegister(Reg, *TRI));
MachineInstr *DefI = MRI->getVRegDef(Reg.R);
assert(DefI);
unsigned Opc = DefI->getOpcode();
@@ -258,7 +257,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
Register PR = DefI->getOperand(1);
G2P.insert(std::make_pair(Reg, PR));
- DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
+ LLVM_DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
return PR;
}
@@ -274,7 +273,8 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
.addReg(Reg.R, 0, Reg.S);
G2P.insert(std::make_pair(Reg, Register(NewPR)));
- DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) << '\n');
+ LLVM_DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI)
+ << '\n');
return Register(NewPR);
}
@@ -364,7 +364,7 @@ bool HexagonGenPredicate::isScalarPred(Register PredReg) {
}
bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
- DEBUG(dbgs() << __func__ << ": " << MI << " " << *MI);
+ LLVM_DEBUG(dbgs() << __func__ << ": " << MI << " " << *MI);
unsigned Opc = MI->getOpcode();
assert(isConvertibleToPredForm(MI));
@@ -426,7 +426,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
Register Pred = getPredRegFor(GPR);
MIB.addReg(Pred.R, 0, Pred.S);
}
- DEBUG(dbgs() << "generated: " << *MIB);
+ LLVM_DEBUG(dbgs() << "generated: " << *MIB);
// Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR
// with NewGPR.
@@ -449,7 +449,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
}
bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
- DEBUG(dbgs() << __func__ << "\n");
+ LLVM_DEBUG(dbgs() << __func__ << "\n");
const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
bool Changed = false;
VectOfInst Erase;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 715fd52f3acd..0e33976a58ac 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -168,7 +168,7 @@ namespace {
}
};
- /// \brief Find the register that contains the loop controlling
+ /// Find the register that contains the loop controlling
/// induction variable.
/// If successful, it will return true and set the \p Reg, \p IVBump
/// and \p IVOp arguments. Otherwise it will return false.
@@ -183,19 +183,19 @@ namespace {
bool findInductionRegister(MachineLoop *L, unsigned &Reg,
int64_t &IVBump, MachineInstr *&IVOp) const;
- /// \brief Return the comparison kind for the specified opcode.
+ /// Return the comparison kind for the specified opcode.
Comparison::Kind getComparisonKind(unsigned CondOpc,
MachineOperand *InitialValue,
const MachineOperand *Endvalue,
int64_t IVBump) const;
- /// \brief Analyze the statements in a loop to determine if the loop
+ /// Analyze the statements in a loop to determine if the loop
/// has a computable trip count and, if so, return a value that represents
/// the trip count expression.
CountValue *getLoopTripCount(MachineLoop *L,
SmallVectorImpl<MachineInstr *> &OldInsts);
- /// \brief Return the expression that represents the number of times
+ /// Return the expression that represents the number of times
/// a loop iterates. The function takes the operands that represent the
/// loop start value, loop end value, and induction value. Based upon
/// these operands, the function attempts to compute the trip count.
@@ -206,64 +206,64 @@ namespace {
const MachineOperand *End, unsigned IVReg,
int64_t IVBump, Comparison::Kind Cmp) const;
- /// \brief Return true if the instruction is not valid within a hardware
+ /// Return true if the instruction is not valid within a hardware
/// loop.
bool isInvalidLoopOperation(const MachineInstr *MI,
bool IsInnerHWLoop) const;
- /// \brief Return true if the loop contains an instruction that inhibits
+ /// Return true if the loop contains an instruction that inhibits
/// using the hardware loop.
bool containsInvalidInstruction(MachineLoop *L, bool IsInnerHWLoop) const;
- /// \brief Given a loop, check if we can convert it to a hardware loop.
+ /// Given a loop, check if we can convert it to a hardware loop.
/// If so, then perform the conversion and return true.
bool convertToHardwareLoop(MachineLoop *L, bool &L0used, bool &L1used);
- /// \brief Return true if the instruction is now dead.
+ /// Return true if the instruction is now dead.
bool isDead(const MachineInstr *MI,
SmallVectorImpl<MachineInstr *> &DeadPhis) const;
- /// \brief Remove the instruction if it is now dead.
+ /// Remove the instruction if it is now dead.
void removeIfDead(MachineInstr *MI);
- /// \brief Make sure that the "bump" instruction executes before the
+ /// Make sure that the "bump" instruction executes before the
/// compare. We need that for the IV fixup, so that the compare
/// instruction would not use a bumped value that has not yet been
/// defined. If the instructions are out of order, try to reorder them.
bool orderBumpCompare(MachineInstr *BumpI, MachineInstr *CmpI);
- /// \brief Return true if MO and MI pair is visited only once. If visited
+ /// Return true if MO and MI pair is visited only once. If visited
/// more than once, this indicates there is recursion. In such a case,
/// return false.
bool isLoopFeeder(MachineLoop *L, MachineBasicBlock *A, MachineInstr *MI,
const MachineOperand *MO,
LoopFeederMap &LoopFeederPhi) const;
- /// \brief Return true if the Phi may generate a value that may underflow,
+ /// Return true if the Phi may generate a value that may underflow,
/// or may wrap.
bool phiMayWrapOrUnderflow(MachineInstr *Phi, const MachineOperand *EndVal,
MachineBasicBlock *MBB, MachineLoop *L,
LoopFeederMap &LoopFeederPhi) const;
- /// \brief Return true if the induction variable may underflow an unsigned
+ /// Return true if the induction variable may underflow an unsigned
/// value in the first iteration.
bool loopCountMayWrapOrUnderFlow(const MachineOperand *InitVal,
const MachineOperand *EndVal,
MachineBasicBlock *MBB, MachineLoop *L,
LoopFeederMap &LoopFeederPhi) const;
- /// \brief Check if the given operand has a compile-time known constant
+ /// Check if the given operand has a compile-time known constant
/// value. Return true if yes, and false otherwise. When returning true, set
/// Val to the corresponding constant value.
bool checkForImmediate(const MachineOperand &MO, int64_t &Val) const;
- /// \brief Check if the operand has a compile-time known constant value.
+ /// Check if the operand has a compile-time known constant value.
bool isImmediate(const MachineOperand &MO) const {
int64_t V;
return checkForImmediate(MO, V);
}
- /// \brief Return the immediate for the specified operand.
+ /// Return the immediate for the specified operand.
int64_t getImmediate(const MachineOperand &MO) const {
int64_t V;
if (!checkForImmediate(MO, V))
@@ -271,12 +271,12 @@ namespace {
return V;
}
- /// \brief Reset the given machine operand to now refer to a new immediate
+ /// Reset the given machine operand to now refer to a new immediate
/// value. Assumes that the operand was already referencing an immediate
/// value, either directly, or via a register.
void setImmediate(MachineOperand &MO, int64_t Val);
- /// \brief Fix the data flow of the induction variable.
+ /// Fix the data flow of the induction variable.
/// The desired flow is: phi ---> bump -+-> comparison-in-latch.
/// |
/// +-> back to phi
@@ -297,7 +297,7 @@ namespace {
/// cannot be adjusted to reflect the post-bump value.
bool fixupInductionVariable(MachineLoop *L);
- /// \brief Given a loop, if it does not have a preheader, create one.
+ /// Given a loop, if it does not have a preheader, create one.
/// Return the block that is the preheader.
MachineBasicBlock *createPreheaderForLoop(MachineLoop *L);
};
@@ -307,7 +307,7 @@ namespace {
int HexagonHardwareLoops::Counter = 0;
#endif
- /// \brief Abstraction for a trip count of a loop. A smaller version
+ /// Abstraction for a trip count of a loop. A smaller version
/// of the MachineOperand class without the concerns of changing the
/// operand representation.
class CountValue {
@@ -376,7 +376,7 @@ FunctionPass *llvm::createHexagonHardwareLoops() {
}
bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
+ LLVM_DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
if (skipFunction(MF.getFunction()))
return false;
@@ -556,7 +556,7 @@ HexagonHardwareLoops::getComparisonKind(unsigned CondOpc,
return Cmp;
}
-/// \brief Analyze the statements in a loop to determine if the loop has
+/// Analyze the statements in a loop to determine if the loop has
/// a computable trip count and, if so, return a value that represents
/// the trip count expression.
///
@@ -718,7 +718,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp);
}
-/// \brief Helper function that returns the expression that represents the
+/// Helper function that returns the expression that represents the
/// number of times a loop iterates. The function takes the operands that
/// represent the loop start value, loop end value, and induction value.
/// Based upon these operands, the function attempts to compute the trip count.
@@ -928,6 +928,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
// 'Add' instruction.
const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
if (EndValInstr->getOpcode() == Hexagon::A2_addi &&
+ EndValInstr->getOperand(1).getSubReg() == 0 &&
EndValInstr->getOperand(2).getImm() == StartV) {
DistR = EndValInstr->getOperand(1).getReg();
} else {
@@ -984,7 +985,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
return new CountValue(CountValue::CV_Register, CountR, CountSR);
}
-/// \brief Return true if the operation is invalid within hardware loop.
+/// Return true if the operation is invalid within hardware loop.
bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
bool IsInnerHWLoop) const {
// Call is not allowed because the callee may use a hardware loop except for
@@ -1006,19 +1007,20 @@ bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
return false;
}
-/// \brief Return true if the loop contains an instruction that inhibits
+/// Return true if the loop contains an instruction that inhibits
/// the use of the hardware loop instruction.
bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L,
bool IsInnerHWLoop) const {
const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
- DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
+ LLVM_DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *MBB = Blocks[i];
for (MachineBasicBlock::iterator
MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
const MachineInstr *MI = &*MII;
if (isInvalidLoopOperation(MI, IsInnerHWLoop)) {
- DEBUG(dbgs()<< "\nCannot convert to hw_loop due to:"; MI->dump(););
+ LLVM_DEBUG(dbgs() << "\nCannot convert to hw_loop due to:";
+ MI->dump(););
return true;
}
}
@@ -1026,7 +1028,7 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L,
return false;
}
-/// \brief Returns true if the instruction is dead. This was essentially
+/// Returns true if the instruction is dead. This was essentially
/// copied from DeadMachineInstructionElim::isDead, but with special cases
/// for inline asm, physical registers and instructions with side effects
/// removed.
@@ -1083,7 +1085,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
SmallVector<MachineInstr*, 1> DeadPhis;
if (isDead(MI, DeadPhis)) {
- DEBUG(dbgs() << "HW looping will remove: " << *MI);
+ LLVM_DEBUG(dbgs() << "HW looping will remove: " << *MI);
// It is possible that some DBG_VALUE instructions refer to this
// instruction. Examine each def operand for such references;
@@ -1112,7 +1114,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
}
}
-/// \brief Check if the loop is a candidate for converting to a hardware
+/// Check if the loop is a candidate for converting to a hardware
/// loop. If so, then perform the transformation.
///
/// This function works on innermost loops first. A loop can be converted
@@ -1237,7 +1239,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
LoopStart = TopBlock;
// Convert the loop to a hardware loop.
- DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
+ LLVM_DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
DebugLoc DL;
if (InsertPos != Preheader->end())
DL = InsertPos->getDebugLoc();
@@ -1367,7 +1369,7 @@ bool HexagonHardwareLoops::isLoopFeeder(MachineLoop *L, MachineBasicBlock *A,
LoopFeederMap &LoopFeederPhi) const {
if (LoopFeederPhi.find(MO->getReg()) == LoopFeederPhi.end()) {
const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
- DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
+ LLVM_DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(*Blocks[0]));
// Ignore all BBs that form Loop.
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *MBB = Blocks[i];
@@ -1768,16 +1770,16 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
for (unsigned i = 1, n = PredDef->getNumOperands(); i < n; ++i) {
MachineOperand &MO = PredDef->getOperand(i);
if (MO.isReg() && MO.getReg() == RB.first) {
- DEBUG(dbgs() << "\n DefMI(" << i << ") = "
- << *(MRI->getVRegDef(I->first)));
+ LLVM_DEBUG(dbgs() << "\n DefMI(" << i
+ << ") = " << *(MRI->getVRegDef(I->first)));
if (IndI)
return false;
IndI = MRI->getVRegDef(I->first);
IndMO = &MO;
} else if (MO.isReg()) {
- DEBUG(dbgs() << "\n DefMI(" << i << ") = "
- << *(MRI->getVRegDef(MO.getReg())));
+ LLVM_DEBUG(dbgs() << "\n DefMI(" << i
+ << ") = " << *(MRI->getVRegDef(MO.getReg())));
if (nonIndI)
return false;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
index 036b18678709..44f1f554c662 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
@@ -26,11 +26,13 @@ using namespace llvm;
#define DEBUG_TYPE "post-RA-sched"
void HexagonHazardRecognizer::Reset() {
- DEBUG(dbgs() << "Reset hazard recognizer\n");
+ LLVM_DEBUG(dbgs() << "Reset hazard recognizer\n");
Resources->clearResources();
PacketNum = 0;
UsesDotCur = nullptr;
DotCurPNum = -1;
+ UsesLoad = false;
+ PrefVectorStoreNew = nullptr;
RegDefs.clear();
}
@@ -41,7 +43,7 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
return NoHazard;
if (!Resources->canReserveResources(*MI)) {
- DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI);
+ LLVM_DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI);
HazardType RetVal = Hazard;
if (TII->mayBeNewStore(*MI)) {
// Make sure the register to be stored is defined by an instruction in the
@@ -57,14 +59,16 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
MI->getDebugLoc());
if (Resources->canReserveResources(*NewMI))
RetVal = NoHazard;
- DEBUG(dbgs() << "*** Try .new version? " << (RetVal == NoHazard) << "\n");
+ LLVM_DEBUG(dbgs() << "*** Try .new version? " << (RetVal == NoHazard)
+ << "\n");
MF->DeleteMachineInstr(NewMI);
}
return RetVal;
}
if (SU == UsesDotCur && DotCurPNum != (int)PacketNum) {
- DEBUG(dbgs() << "*** .cur Hazard in cycle " << PacketNum << ", " << *MI);
+ LLVM_DEBUG(dbgs() << "*** .cur Hazard in cycle " << PacketNum << ", "
+ << *MI);
return Hazard;
}
@@ -72,21 +76,33 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
}
void HexagonHazardRecognizer::AdvanceCycle() {
- DEBUG(dbgs() << "Advance cycle, clear state\n");
+ LLVM_DEBUG(dbgs() << "Advance cycle, clear state\n");
Resources->clearResources();
if (DotCurPNum != -1 && DotCurPNum != (int)PacketNum) {
UsesDotCur = nullptr;
DotCurPNum = -1;
}
+ UsesLoad = false;
+ PrefVectorStoreNew = nullptr;
PacketNum++;
RegDefs.clear();
}
-/// If a packet contains a dot cur instruction, then we may prefer the
-/// instruction that can use the dot cur result. Or, if the use
-/// isn't scheduled in the same packet, then prefer other instructions
-/// in the subsequent packet.
+/// Handle the cases when we prefer one instruction over another. Case 1 - we
+/// prefer not to generate multiple loads in the packet to avoid a potential
+/// bank conflict. Case 2 - if a packet contains a dot cur instruction, then we
+/// prefer the instruction that can use the dot cur result. However, if the use
+/// is not scheduled in the same packet, then prefer other instructions in the
+/// subsequent packet. Case 3 - we prefer a vector store that can be converted
+/// to a .new store. The packetizer will not generate the .new store if the
+/// store doesn't have resources to fit in the packet (but the .new store may
+/// have resources). We attempt to schedule the store as soon as possible to
+/// help packetize the two instructions together.
bool HexagonHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+ if (PrefVectorStoreNew != nullptr && PrefVectorStoreNew != SU)
+ return true;
+ if (UsesLoad && SU->isInstr() && SU->getInstr()->mayLoad())
+ return true;
return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == (int)PacketNum));
}
@@ -118,17 +134,16 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
}
else
Resources->reserveResources(*MI);
- DEBUG(dbgs() << " Add instruction " << *MI);
+ LLVM_DEBUG(dbgs() << " Add instruction " << *MI);
// When scheduling a dot cur instruction, check if there is an instruction
// that can use the dot cur in the same packet. If so, we'll attempt to
- // schedule it before other instructions. We only do this if the use has
- // the same height as the dot cur. Otherwise, we may miss scheduling an
- // instruction with a greater height, which is more important.
+ // schedule it before other instructions. We only do this if the load has a
+ // single zero-latency use.
if (TII->mayBeCurLoad(*MI))
for (auto &S : SU->Succs)
if (S.isAssignedRegDep() && S.getLatency() == 0 &&
- SU->getHeight() == S.getSUnit()->getHeight()) {
+ S.getSUnit()->NumPredsLeft == 1) {
UsesDotCur = S.getSUnit();
DotCurPNum = PacketNum;
break;
@@ -137,4 +152,15 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
UsesDotCur = nullptr;
DotCurPNum = -1;
}
+
+ UsesLoad = MI->mayLoad();
+
+ if (TII->isHVXVec(*MI) && !MI->mayLoad() && !MI->mayStore())
+ for (auto &S : SU->Succs)
+ if (S.isAssignedRegDep() && S.getLatency() == 0 &&
+ TII->mayBeNewStore(*S.getSUnit()->getInstr()) &&
+ Resources->canReserveResources(*S.getSUnit()->getInstr())) {
+ PrefVectorStoreNew = S.getSUnit();
+ break;
+ }
}
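Taken together, the hazard-recognizer changes above track three pieces of per-packet state (a pending .cur use, whether the packet already holds a load, and a vector store we would like to become a .new store) and fold them into ShouldPreferAnother. The sketch below restates that decision over a stripped-down stand-in for SUnit; it is a reading aid under those assumptions, not the recognizer's real interface.

#include <cassert>

// Minimal stand-in for the scheduler's SUnit: just the one property the
// preference test needs here.
struct FakeSU {
  bool IsLoad = false;
};

struct PreferenceState {
  const FakeSU *UsesDotCur = nullptr;         // use of a .cur load, if any
  int DotCurPNum = -1;                        // packet that emitted the .cur load
  int PacketNum = 0;                          // current packet
  bool UsesLoad = false;                      // packet already has a load
  const FakeSU *PrefVectorStoreNew = nullptr; // store we want to become .new

  // Mirrors the shape of ShouldPreferAnother after this change: prefer
  // "another" unit when SU is not the store we are holding a slot for, when
  // it would be a second load in the packet, or when the .cur pairing rule
  // says so.
  bool shouldPreferAnother(const FakeSU *SU) const {
    if (PrefVectorStoreNew && PrefVectorStoreNew != SU)
      return true;
    if (UsesLoad && SU->IsLoad)
      return true;
    return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == PacketNum));
  }
};

int main() {
  PreferenceState S;
  S.UsesLoad = true;
  FakeSU Load;  Load.IsLoad = true;
  FakeSU Other; // not a load
  assert(S.shouldPreferAnother(&Load));   // avoid a second load in the packet
  assert(!S.shouldPreferAnother(&Other)); // non-loads are still fine
  return 0;
}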
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
index 70efcb7a9f76..2874d73ce819 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
@@ -23,13 +23,21 @@ namespace llvm {
class HexagonHazardRecognizer : public ScheduleHazardRecognizer {
DFAPacketizer *Resources;
const HexagonInstrInfo *TII;
- unsigned PacketNum;
+ unsigned PacketNum = 0;
// If the packet contains a potential dot cur instruction. This is
// used for the scheduling priority function.
- SUnit *UsesDotCur;
+ SUnit *UsesDotCur = nullptr;
// The packet number when a dot cur is emitted. If its use is not generated
// in the same packet, then try to wait another cycle before emitting.
- int DotCurPNum;
+ int DotCurPNum = -1;
+ // Whether the packet contains a load. Used to avoid scheduling a second load, if possible.
+ bool UsesLoad = false;
+ // Check if we should prefer a vector store that will become a .new version.
+ // The .new store uses different resources than a normal store, and the
+ // packetizer will not generate the .new if the regular store does not have
+ // resources available (even if the .new version does). To help, the scheduler
+ // attempts to schedule the .new as soon as possible in the packet.
+ SUnit *PrefVectorStoreNew = nullptr;
// The set of registers defined by instructions in the current packet.
SmallSet<unsigned, 8> RegDefs;
@@ -37,8 +45,7 @@ public:
HexagonHazardRecognizer(const InstrItineraryData *II,
const HexagonInstrInfo *HII,
const HexagonSubtarget &ST)
- : Resources(ST.createDFAPacketizer(II)), TII(HII), PacketNum(0),
- UsesDotCur(nullptr), DotCurPNum(-1) { }
+ : Resources(ST.createDFAPacketizer(II)), TII(HII) { }
~HexagonHazardRecognizer() override {
if (Resources)
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index a6ac4e3df745..efb4c2eb0fc3 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -64,51 +64,6 @@ FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
}
}
-// Intrinsics that return a a predicate.
-static bool doesIntrinsicReturnPredicate(unsigned ID) {
- switch (ID) {
- default:
- return false;
- case Intrinsic::hexagon_C2_cmpeq:
- case Intrinsic::hexagon_C2_cmpgt:
- case Intrinsic::hexagon_C2_cmpgtu:
- case Intrinsic::hexagon_C2_cmpgtup:
- case Intrinsic::hexagon_C2_cmpgtp:
- case Intrinsic::hexagon_C2_cmpeqp:
- case Intrinsic::hexagon_C2_bitsset:
- case Intrinsic::hexagon_C2_bitsclr:
- case Intrinsic::hexagon_C2_cmpeqi:
- case Intrinsic::hexagon_C2_cmpgti:
- case Intrinsic::hexagon_C2_cmpgtui:
- case Intrinsic::hexagon_C2_cmpgei:
- case Intrinsic::hexagon_C2_cmpgeui:
- case Intrinsic::hexagon_C2_cmplt:
- case Intrinsic::hexagon_C2_cmpltu:
- case Intrinsic::hexagon_C2_bitsclri:
- case Intrinsic::hexagon_C2_and:
- case Intrinsic::hexagon_C2_or:
- case Intrinsic::hexagon_C2_xor:
- case Intrinsic::hexagon_C2_andn:
- case Intrinsic::hexagon_C2_not:
- case Intrinsic::hexagon_C2_orn:
- case Intrinsic::hexagon_C2_pxfer_map:
- case Intrinsic::hexagon_C2_any8:
- case Intrinsic::hexagon_C2_all8:
- case Intrinsic::hexagon_A2_vcmpbeq:
- case Intrinsic::hexagon_A2_vcmpbgtu:
- case Intrinsic::hexagon_A2_vcmpheq:
- case Intrinsic::hexagon_A2_vcmphgt:
- case Intrinsic::hexagon_A2_vcmphgtu:
- case Intrinsic::hexagon_A2_vcmpweq:
- case Intrinsic::hexagon_A2_vcmpwgt:
- case Intrinsic::hexagon_A2_vcmpwgtu:
- case Intrinsic::hexagon_C2_tfrrp:
- case Intrinsic::hexagon_S2_tstbit_i:
- case Intrinsic::hexagon_S2_tstbit_r:
- return true;
- }
-}
-
void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
@@ -138,12 +93,18 @@ void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
Opcode = IsValidInc ? Hexagon::L2_loadrh_pi : Hexagon::L2_loadrh_io;
break;
case MVT::i32:
+ case MVT::f32:
+ case MVT::v2i16:
+ case MVT::v4i8:
Opcode = IsValidInc ? Hexagon::L2_loadri_pi : Hexagon::L2_loadri_io;
break;
case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v4i16:
+ case MVT::v8i8:
Opcode = IsValidInc ? Hexagon::L2_loadrd_pi : Hexagon::L2_loadrd_io;
break;
- // 64B
case MVT::v64i8:
case MVT::v32i16:
case MVT::v16i32:
@@ -223,7 +184,6 @@ void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
CurDAG->RemoveDeadNode(LD);
}
-
MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
return nullptr;
@@ -241,35 +201,14 @@ MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
};
auto FLC = LoadPciMap.find(IntNo);
if (FLC != LoadPciMap.end()) {
- SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
- IntN->getOperand(4));
EVT ValTy = (IntNo == Intrinsic::hexagon_circ_ldd) ? MVT::i64 : MVT::i32;
EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
// Operands: { Base, Increment, Modifier, Chain }
auto Inc = cast<ConstantSDNode>(IntN->getOperand(5));
SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), dl, MVT::i32);
MachineSDNode *Res = CurDAG->getMachineNode(FLC->second, dl, RTys,
- { IntN->getOperand(2), I, SDValue(Mod,0), IntN->getOperand(0) });
- return Res;
- }
-
- static std::map<unsigned,unsigned> LoadPbrMap = {
- { Intrinsic::hexagon_brev_ldb, Hexagon::L2_loadrb_pbr },
- { Intrinsic::hexagon_brev_ldub, Hexagon::L2_loadrub_pbr },
- { Intrinsic::hexagon_brev_ldh, Hexagon::L2_loadrh_pbr },
- { Intrinsic::hexagon_brev_lduh, Hexagon::L2_loadruh_pbr },
- { Intrinsic::hexagon_brev_ldw, Hexagon::L2_loadri_pbr },
- { Intrinsic::hexagon_brev_ldd, Hexagon::L2_loadrd_pbr },
- };
- auto FLB = LoadPbrMap.find(IntNo);
- if (FLB != LoadPbrMap.end()) {
- SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
- IntN->getOperand(4));
- EVT ValTy = (IntNo == Intrinsic::hexagon_brev_ldd) ? MVT::i64 : MVT::i32;
- EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
- // Operands: { Base, Modifier, Chain }
- MachineSDNode *Res = CurDAG->getMachineNode(FLB->second, dl, RTys,
- { IntN->getOperand(2), SDValue(Mod,0), IntN->getOperand(0) });
+ { IntN->getOperand(2), I, IntN->getOperand(4),
+ IntN->getOperand(0) });
return Res;
}
@@ -343,14 +282,10 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
// a sign-extending intrinsic into (or the other way around).
ISD::LoadExtType IntExt;
switch (cast<ConstantSDNode>(C->getOperand(1))->getZExtValue()) {
- case Intrinsic::hexagon_brev_ldub:
- case Intrinsic::hexagon_brev_lduh:
case Intrinsic::hexagon_circ_ldub:
case Intrinsic::hexagon_circ_lduh:
IntExt = ISD::ZEXTLOAD;
break;
- case Intrinsic::hexagon_brev_ldw:
- case Intrinsic::hexagon_brev_ldd:
case Intrinsic::hexagon_circ_ldw:
case Intrinsic::hexagon_circ_ldd:
IntExt = ISD::NON_EXTLOAD;
@@ -378,6 +313,134 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
CurDAG->RemoveDeadNode(C);
return true;
}
+ return false;
+}
+
+// Convert the bit-reverse load intrinsic to appropriate target instruction.
+bool HexagonDAGToDAGISel::SelectBrevLdIntrinsic(SDNode *IntN) {
+ if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ const SDLoc &dl(IntN);
+ unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+
+ static const std::map<unsigned, unsigned> LoadBrevMap = {
+ { Intrinsic::hexagon_L2_loadrb_pbr, Hexagon::L2_loadrb_pbr },
+ { Intrinsic::hexagon_L2_loadrub_pbr, Hexagon::L2_loadrub_pbr },
+ { Intrinsic::hexagon_L2_loadrh_pbr, Hexagon::L2_loadrh_pbr },
+ { Intrinsic::hexagon_L2_loadruh_pbr, Hexagon::L2_loadruh_pbr },
+ { Intrinsic::hexagon_L2_loadri_pbr, Hexagon::L2_loadri_pbr },
+ { Intrinsic::hexagon_L2_loadrd_pbr, Hexagon::L2_loadrd_pbr }
+ };
+ auto FLI = LoadBrevMap.find(IntNo);
+ if (FLI != LoadBrevMap.end()) {
+ EVT ValTy =
+ (IntNo == Intrinsic::hexagon_L2_loadrd_pbr) ? MVT::i64 : MVT::i32;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Operands of Intrinsic: {chain, enum ID of intrinsic, baseptr,
+ // modifier}.
+ // Operands of target instruction: { Base, Modifier, Chain }.
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ FLI->second, dl, RTys,
+ {IntN->getOperand(2), IntN->getOperand(3), IntN->getOperand(0)});
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(IntN)->getMemOperand();
+ Res->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceUses(SDValue(IntN, 0), SDValue(Res, 0));
+ ReplaceUses(SDValue(IntN, 1), SDValue(Res, 1));
+ ReplaceUses(SDValue(IntN, 2), SDValue(Res, 2));
+ CurDAG->RemoveDeadNode(IntN);
+ return true;
+ }
+ return false;
+}
+
+/// Generate a machine instruction node for the new circular buffer intrinsics.
+/// The new versions use a CSx register instead of the K field.
+bool HexagonDAGToDAGISel::SelectNewCircIntrinsic(SDNode *IntN) {
+ if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ SDLoc DL(IntN);
+ unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+ SmallVector<SDValue, 7> Ops;
+
+ static std::map<unsigned,unsigned> LoadNPcMap = {
+ { Intrinsic::hexagon_L2_loadrub_pci, Hexagon::PS_loadrub_pci },
+ { Intrinsic::hexagon_L2_loadrb_pci, Hexagon::PS_loadrb_pci },
+ { Intrinsic::hexagon_L2_loadruh_pci, Hexagon::PS_loadruh_pci },
+ { Intrinsic::hexagon_L2_loadrh_pci, Hexagon::PS_loadrh_pci },
+ { Intrinsic::hexagon_L2_loadri_pci, Hexagon::PS_loadri_pci },
+ { Intrinsic::hexagon_L2_loadrd_pci, Hexagon::PS_loadrd_pci },
+ { Intrinsic::hexagon_L2_loadrub_pcr, Hexagon::PS_loadrub_pcr },
+ { Intrinsic::hexagon_L2_loadrb_pcr, Hexagon::PS_loadrb_pcr },
+ { Intrinsic::hexagon_L2_loadruh_pcr, Hexagon::PS_loadruh_pcr },
+ { Intrinsic::hexagon_L2_loadrh_pcr, Hexagon::PS_loadrh_pcr },
+ { Intrinsic::hexagon_L2_loadri_pcr, Hexagon::PS_loadri_pcr },
+ { Intrinsic::hexagon_L2_loadrd_pcr, Hexagon::PS_loadrd_pcr }
+ };
+ auto FLI = LoadNPcMap.find (IntNo);
+ if (FLI != LoadNPcMap.end()) {
+ EVT ValTy = MVT::i32;
+ if (IntNo == Intrinsic::hexagon_L2_loadrd_pci ||
+ IntNo == Intrinsic::hexagon_L2_loadrd_pcr)
+ ValTy = MVT::i64;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Handle load.*_pci case which has 6 operands.
+ if (IntN->getNumOperands() == 6) {
+ auto Inc = cast<ConstantSDNode>(IntN->getOperand(3));
+ SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), DL, MVT::i32);
+ // Operands: { Base, Increment, Modifier, Start, Chain }.
+ Ops = { IntN->getOperand(2), I, IntN->getOperand(4), IntN->getOperand(5),
+ IntN->getOperand(0) };
+ } else
+ // Handle load.*_pcr case which has 5 operands.
+ // Operands: { Base, Modifier, Start, Chain }.
+ Ops = { IntN->getOperand(2), IntN->getOperand(3), IntN->getOperand(4),
+ IntN->getOperand(0) };
+ MachineSDNode *Res = CurDAG->getMachineNode(FLI->second, DL, RTys, Ops);
+ ReplaceUses(SDValue(IntN, 0), SDValue(Res, 0));
+ ReplaceUses(SDValue(IntN, 1), SDValue(Res, 1));
+ ReplaceUses(SDValue(IntN, 2), SDValue(Res, 2));
+ CurDAG->RemoveDeadNode(IntN);
+ return true;
+ }
+
+ static std::map<unsigned,unsigned> StoreNPcMap = {
+ { Intrinsic::hexagon_S2_storerb_pci, Hexagon::PS_storerb_pci },
+ { Intrinsic::hexagon_S2_storerh_pci, Hexagon::PS_storerh_pci },
+ { Intrinsic::hexagon_S2_storerf_pci, Hexagon::PS_storerf_pci },
+ { Intrinsic::hexagon_S2_storeri_pci, Hexagon::PS_storeri_pci },
+ { Intrinsic::hexagon_S2_storerd_pci, Hexagon::PS_storerd_pci },
+ { Intrinsic::hexagon_S2_storerb_pcr, Hexagon::PS_storerb_pcr },
+ { Intrinsic::hexagon_S2_storerh_pcr, Hexagon::PS_storerh_pcr },
+ { Intrinsic::hexagon_S2_storerf_pcr, Hexagon::PS_storerf_pcr },
+ { Intrinsic::hexagon_S2_storeri_pcr, Hexagon::PS_storeri_pcr },
+ { Intrinsic::hexagon_S2_storerd_pcr, Hexagon::PS_storerd_pcr }
+ };
+ auto FSI = StoreNPcMap.find (IntNo);
+ if (FSI != StoreNPcMap.end()) {
+ EVT RTys[] = { MVT::i32, MVT::Other };
+ // Handle store.*_pci case which has 7 operands.
+ if (IntN->getNumOperands() == 7) {
+ auto Inc = cast<ConstantSDNode>(IntN->getOperand(3));
+ SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), DL, MVT::i32);
+ // Operands: { Base, Increment, Modifier, Value, Start, Chain }.
+ Ops = { IntN->getOperand(2), I, IntN->getOperand(4), IntN->getOperand(5),
+ IntN->getOperand(6), IntN->getOperand(0) };
+ } else
+ // Handle store.*_pcr case which has 6 operands.
+ // Operands: { Base, Modifier, Value, Start, Chain }.
+ Ops = { IntN->getOperand(2), IntN->getOperand(3), IntN->getOperand(4),
+ IntN->getOperand(5), IntN->getOperand(0) };
+ MachineSDNode *Res = CurDAG->getMachineNode(FSI->second, DL, RTys, Ops);
+ ReplaceUses(SDValue(IntN, 0), SDValue(Res, 0));
+ ReplaceUses(SDValue(IntN, 1), SDValue(Res, 1));
+ CurDAG->RemoveDeadNode(IntN);
+ return true;
+ }
return false;
}
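Most of SelectNewCircIntrinsic above is operand reshuffling: the intrinsic node's operands are reordered into the layout the PS_*_pci / PS_*_pcr pseudos expect. The sketch below models just that reshuffle for the load flavours, with plain integers standing in for SDValues, and it glosses over the step that converts the increment into a target constant; operand positions come from the comments in the hunk, everything else is illustrative.

#include <cassert>
#include <vector>

// Intrinsic node operand layout (per the comments above):
//   load.*_pci: { Chain, IntrinsicID, Base, Increment, Modifier, Start }
//   load.*_pcr: { Chain, IntrinsicID, Base, Modifier, Start }
// Target pseudo operand layout:
//   load.*_pci: { Base, Increment, Modifier, Start, Chain }
//   load.*_pcr: { Base, Modifier, Start, Chain }
static std::vector<int> buildCircLoadOps(const std::vector<int> &IntrinOps) {
  if (IntrinOps.size() == 6) // load.*_pci
    return {IntrinOps[2], IntrinOps[3], IntrinOps[4], IntrinOps[5],
            IntrinOps[0]};
  // load.*_pcr
  return {IntrinOps[2], IntrinOps[3], IntrinOps[4], IntrinOps[0]};
}

int main() {
  // Tag each slot with its index so the reordering is visible.
  std::vector<int> Pci = {0, 1, 2, 3, 4, 5};
  std::vector<int> Expected = {2, 3, 4, 5, 0};
  assert(buildCircLoadOps(Pci) == Expected);
  return 0;
}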
@@ -385,9 +448,9 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
void HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
- ISD::MemIndexedMode AM = LD->getAddressingMode();
// Handle indexed loads.
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
if (AM != ISD::UNINDEXED) {
SelectIndexedLoad(LD, dl);
return;
@@ -422,9 +485,16 @@ void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
Opcode = IsValidInc ? Hexagon::S2_storerh_pi : Hexagon::S2_storerh_io;
break;
case MVT::i32:
+ case MVT::f32:
+ case MVT::v2i16:
+ case MVT::v4i8:
Opcode = IsValidInc ? Hexagon::S2_storeri_pi : Hexagon::S2_storeri_io;
break;
case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v4i16:
+ case MVT::v8i8:
Opcode = IsValidInc ? Hexagon::S2_storerd_pi : Hexagon::S2_storerd_io;
break;
case MVT::v64i8:
@@ -488,9 +558,9 @@ void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
void HexagonDAGToDAGISel::SelectStore(SDNode *N) {
SDLoc dl(N);
StoreSDNode *ST = cast<StoreSDNode>(N);
- ISD::MemIndexedMode AM = ST->getAddressingMode();
// Handle indexed stores.
+ ISD::MemIndexedMode AM = ST->getAddressingMode();
if (AM != ISD::UNINDEXED) {
SelectIndexedStore(ST, dl);
return;
@@ -553,85 +623,6 @@ void HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
return Default();
}
-
-//
-// If there is an zero_extend followed an intrinsic in DAG (this means - the
-// result of the intrinsic is predicate); convert the zero_extend to
-// transfer instruction.
-//
-// Zero extend -> transfer is lowered here. Otherwise, zero_extend will be
-// converted into a MUX as predicate registers defined as 1 bit in the
-// compiler. Architecture defines them as 8-bit registers.
-// We want to preserve all the lower 8-bits and, not just 1 LSB bit.
-//
-void HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
- SDLoc dl(N);
-
- SDValue Op0 = N->getOperand(0);
- EVT OpVT = Op0.getValueType();
- unsigned OpBW = OpVT.getSizeInBits();
-
- // Special handling for zero-extending a vector of booleans.
- if (OpVT.isVector() && OpVT.getVectorElementType() == MVT::i1 && OpBW <= 64) {
- SDNode *Mask = CurDAG->getMachineNode(Hexagon::C2_mask, dl, MVT::i64, Op0);
- unsigned NE = OpVT.getVectorNumElements();
- EVT ExVT = N->getValueType(0);
- unsigned ES = ExVT.getScalarSizeInBits();
- uint64_t MV = 0, Bit = 1;
- for (unsigned i = 0; i < NE; ++i) {
- MV |= Bit;
- Bit <<= ES;
- }
- SDValue Ones = CurDAG->getTargetConstant(MV, dl, MVT::i64);
- SDNode *OnesReg = CurDAG->getMachineNode(Hexagon::CONST64, dl,
- MVT::i64, Ones);
- if (ExVT.getSizeInBits() == 32) {
- SDNode *And = CurDAG->getMachineNode(Hexagon::A2_andp, dl, MVT::i64,
- SDValue(Mask,0), SDValue(OnesReg,0));
- SDValue SubR = CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32);
- ReplaceNode(N, CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
- SDValue(And, 0), SubR));
- return;
- }
- ReplaceNode(N,
- CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
- SDValue(Mask, 0), SDValue(OnesReg, 0)));
- return;
- }
-
- SDNode *Int = N->getOperand(0).getNode();
- if ((Int->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
- unsigned ID = cast<ConstantSDNode>(Int->getOperand(0))->getZExtValue();
- if (doesIntrinsicReturnPredicate(ID)) {
- // Now we need to differentiate target data types.
- if (N->getValueType(0) == MVT::i64) {
- // Convert the zero_extend to Rs = Pd followed by A2_combinew(0,Rs).
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
- MVT::i32, SDValue(Int, 0));
- SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl,
- MVT::i32, TargetConst0);
- SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl,
- MVT::i64, MVT::Other,
- SDValue(Result_2, 0),
- SDValue(Result_1, 0));
- ReplaceNode(N, Result_3);
- return;
- }
- if (N->getValueType(0) == MVT::i32) {
- // Convert the zero_extend to Rs = Pd
- SDNode* RsPd = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
- MVT::i32, SDValue(Int, 0));
- ReplaceNode(N, RsPd);
- return;
- }
- llvm_unreachable("Unexpected value type");
- }
- }
- SelectCode(N);
-}
-
-
//
// Handling intrinsics for circular load and bitreverse load.
//
@@ -642,6 +633,13 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
return;
}
+ // Handle bit-reverse load intrinsics.
+ if (SelectBrevLdIntrinsic(N))
+ return;
+
+ if (SelectNewCircIntrinsic(N))
+ return;
+
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (IntNo == Intrinsic::hexagon_V6_vgathermw ||
IntNo == Intrinsic::hexagon_V6_vgathermw_128B ||
@@ -735,7 +733,6 @@ void HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
SelectCode(N);
}
-
void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
MachineFrameInfo &MFI = MF->getFrameInfo();
const HexagonFrameLowering *HFI = HST->getFrameLowering();
@@ -765,20 +762,113 @@ void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
ReplaceNode(N, R);
}
+void HexagonDAGToDAGISel::SelectAddSubCarry(SDNode *N) {
+ unsigned OpcCarry = N->getOpcode() == HexagonISD::ADDC ? Hexagon::A4_addp_c
+ : Hexagon::A4_subp_c;
+ SDNode *C = CurDAG->getMachineNode(OpcCarry, SDLoc(N), N->getVTList(),
+ { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2) });
+ ReplaceNode(N, C);
+}
-void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
- EVT SVT = N->getOperand(0).getValueType();
- EVT DVT = N->getValueType(0);
- if (!SVT.isVector() || !DVT.isVector() ||
- SVT.getVectorElementType() == MVT::i1 ||
- DVT.getVectorElementType() == MVT::i1 ||
- SVT.getSizeInBits() != DVT.getSizeInBits()) {
- SelectCode(N);
- return;
+void HexagonDAGToDAGISel::SelectVAlign(SDNode *N) {
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ if (HST->isHVXVectorType(ResTy, true))
+ return SelectHvxVAlign(N);
+
+ const SDLoc &dl(N);
+ unsigned VecLen = ResTy.getSizeInBits();
+ if (VecLen == 32) {
+ SDValue Ops[] = {
+ CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, dl, MVT::i32),
+ N->getOperand(0),
+ CurDAG->getTargetConstant(Hexagon::isub_hi, dl, MVT::i32),
+ N->getOperand(1),
+ CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32)
+ };
+ SDNode *R = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl,
+ MVT::i64, Ops);
+
+ // Shift right by "(Addr & 0x3) * 8" bytes.
+ SDValue M0 = CurDAG->getTargetConstant(0x18, dl, MVT::i32);
+ SDValue M1 = CurDAG->getTargetConstant(0x03, dl, MVT::i32);
+ SDNode *C = CurDAG->getMachineNode(Hexagon::S4_andi_asl_ri, dl, MVT::i32,
+ M0, N->getOperand(2), M1);
+ SDNode *S = CurDAG->getMachineNode(Hexagon::S2_lsr_r_p, dl, MVT::i64,
+ SDValue(R, 0), SDValue(C, 0));
+ SDValue E = CurDAG->getTargetExtractSubreg(Hexagon::isub_lo, dl, ResTy,
+ SDValue(S, 0));
+ ReplaceNode(N, E.getNode());
+ } else {
+ assert(VecLen == 64);
+ SDNode *Pu = CurDAG->getMachineNode(Hexagon::C2_tfrrp, dl, MVT::v8i1,
+ N->getOperand(2));
+ SDNode *VA = CurDAG->getMachineNode(Hexagon::S2_valignrb, dl, ResTy,
+ N->getOperand(0), N->getOperand(1),
+ SDValue(Pu,0));
+ ReplaceNode(N, VA);
}
+}
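As a rough illustration (again outside the patch), the 32-bit path of SelectVAlign above amounts to the following scalar computation; valign32 is a hypothetical helper name.

#include <cstdint>

// Hypothetical scalar model of the 32-bit VALIGN selection: form the 64-bit
// pair {Hi,Lo} (REG_SEQUENCE), shift it right by (Addr & 0x3) * 8 bits
// (S4_andi_asl_ri computing the amount, S2_lsr_r_p doing the shift), and keep
// the low word (isub_lo).
static uint32_t valign32(uint32_t Hi, uint32_t Lo, uint32_t Addr) {
  uint64_t Pair = (uint64_t(Hi) << 32) | Lo;
  unsigned ShiftBits = (Addr & 0x3) * 8;
  return uint32_t(Pair >> ShiftBits);
}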
+
+void HexagonDAGToDAGISel::SelectVAlignAddr(SDNode *N) {
+ const SDLoc &dl(N);
+ SDValue A = N->getOperand(1);
+ int Mask = -cast<ConstantSDNode>(A.getNode())->getSExtValue();
+ assert(isPowerOf2_32(-Mask));
+
+ SDValue M = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
+ SDNode *AA = CurDAG->getMachineNode(Hexagon::A2_andir, dl, MVT::i32,
+ N->getOperand(0), M);
+ ReplaceNode(N, AA);
+}
+
+// Handle these nodes here to avoid having to write patterns for all
+// combinations of input/output types. In all cases, the resulting
+// instruction is the same.
+void HexagonDAGToDAGISel::SelectTypecast(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ MVT OpTy = Op.getValueType().getSimpleVT();
+ SDNode *T = CurDAG->MorphNodeTo(N, N->getOpcode(),
+ CurDAG->getVTList(OpTy), {Op});
+ ReplaceNode(T, Op.getNode());
+}
+
+void HexagonDAGToDAGISel::SelectP2D(SDNode *N) {
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ SDNode *T = CurDAG->getMachineNode(Hexagon::C2_mask, SDLoc(N), ResTy,
+ N->getOperand(0));
+ ReplaceNode(N, T);
+}
+
+void HexagonDAGToDAGISel::SelectD2P(SDNode *N) {
+ const SDLoc &dl(N);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDNode *T = CurDAG->getMachineNode(Hexagon::A4_vcmpbgtui, dl, ResTy,
+ N->getOperand(0), Zero);
+ ReplaceNode(N, T);
+}
+
+void HexagonDAGToDAGISel::SelectV2Q(SDNode *N) {
+ const SDLoc &dl(N);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
- CurDAG->RemoveDeadNode(N);
+ SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32);
+ SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C);
+ SDNode *T = CurDAG->getMachineNode(Hexagon::V6_vandvrt, dl, ResTy,
+ N->getOperand(0), SDValue(R,0));
+ ReplaceNode(N, T);
+}
+
+void HexagonDAGToDAGISel::SelectQ2V(SDNode *N) {
+ const SDLoc &dl(N);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+
+ SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32);
+ SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C);
+ SDNode *T = CurDAG->getMachineNode(Hexagon::V6_vandqrt, dl, ResTy,
+ N->getOperand(0), SDValue(R,0));
+ ReplaceNode(N, T);
}
void HexagonDAGToDAGISel::Select(SDNode *N) {
@@ -789,13 +879,21 @@ void HexagonDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant: return SelectConstant(N);
case ISD::ConstantFP: return SelectConstantFP(N);
case ISD::FrameIndex: return SelectFrameIndex(N);
- case ISD::BITCAST: return SelectBitcast(N);
case ISD::SHL: return SelectSHL(N);
case ISD::LOAD: return SelectLoad(N);
case ISD::STORE: return SelectStore(N);
- case ISD::ZERO_EXTEND: return SelectZeroExtend(N);
case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N);
case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N);
+
+ case HexagonISD::ADDC:
+ case HexagonISD::SUBC: return SelectAddSubCarry(N);
+ case HexagonISD::VALIGN: return SelectVAlign(N);
+ case HexagonISD::VALIGNADDR: return SelectVAlignAddr(N);
+ case HexagonISD::TYPECAST: return SelectTypecast(N);
+ case HexagonISD::P2D: return SelectP2D(N);
+ case HexagonISD::D2P: return SelectD2P(N);
+ case HexagonISD::Q2V: return SelectQ2V(N);
+ case HexagonISD::V2Q: return SelectV2Q(N);
}
if (HST->useHVXOps()) {
@@ -1240,7 +1338,7 @@ bool HexagonDAGToDAGISel::SelectAnyImmediate(SDValue &N, SDValue &R,
}
case HexagonISD::JT:
case HexagonISD::CP:
- // These are assumed to always be aligned at at least 8-byte boundary.
+ // These are assumed to always be aligned at least 8-byte boundary.
if (LogAlign > 3)
return false;
R = N.getOperand(0);
@@ -1252,7 +1350,7 @@ bool HexagonDAGToDAGISel::SelectAnyImmediate(SDValue &N, SDValue &R,
R = N;
return true;
case ISD::BlockAddress:
- // Block address is always aligned at at least 4-byte boundary.
+ // Block address is always aligned at least 4-byte boundary.
if (LogAlign > 2 || !IsAligned(cast<BlockAddressSDNode>(N)->getOffset()))
return false;
R = N;
@@ -1345,9 +1443,13 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
EVT T = Opc == ISD::SIGN_EXTEND
? N.getOperand(0).getValueType()
: cast<VTSDNode>(N.getOperand(1))->getVT();
- if (T.getSizeInBits() != 32)
+ unsigned SW = T.getSizeInBits();
+ if (SW == 32)
+ R = N.getOperand(0);
+ else if (SW < 32)
+ R = N;
+ else
return false;
- R = N.getOperand(0);
break;
}
case ISD::LOAD: {
@@ -1361,6 +1463,13 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
R = N;
break;
}
+ case ISD::SRA: {
+ auto *S = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!S || S->getZExtValue() != 32)
+ return false;
+ R = N;
+ break;
+ }
default:
return false;
}
@@ -1500,7 +1609,7 @@ static bool isOpcodeHandled(const SDNode *N) {
}
}
-/// \brief Return the weight of an SDNode
+/// Return the weight of an SDNode
int HexagonDAGToDAGISel::getWeight(SDNode *N) {
if (!isOpcodeHandled(N))
return 1;
@@ -1799,15 +1908,15 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
RootHeights[N] = std::max(getHeight(N->getOperand(0).getNode()),
getHeight(N->getOperand(1).getNode())) + 1;
- DEBUG(dbgs() << "--> No need to balance root (Weight=" << Weight
- << " Height=" << RootHeights[N] << "): ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "--> No need to balance root (Weight=" << Weight
+ << " Height=" << RootHeights[N] << "): ");
+ LLVM_DEBUG(N->dump(CurDAG));
return SDValue(N, 0);
}
- DEBUG(dbgs() << "** Balancing root node: ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "** Balancing root node: ");
+ LLVM_DEBUG(N->dump(CurDAG));
unsigned NOpcode = N->getOpcode();
@@ -1855,7 +1964,7 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
// Whoops, this node was RAUWd by one of the balanceSubTree calls we
// made. Our worklist isn't up to date anymore.
// Restart the whole process.
- DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+ LLVM_DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
return balanceSubTree(N, TopLevel);
}
@@ -1926,15 +2035,15 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
}
}
- DEBUG(dbgs() << "--> Current height=" << NodeHeights[SDValue(N, 0)]
- << " weight=" << CurrentWeight << " imbalanced="
- << Imbalanced << "\n");
+ LLVM_DEBUG(dbgs() << "--> Current height=" << NodeHeights[SDValue(N, 0)]
+ << " weight=" << CurrentWeight
+ << " imbalanced=" << Imbalanced << "\n");
// Transform MUL(x, C * 2^Y) + SHL(z, Y) -> SHL(ADD(MUL(x, C), z), Y)
// This factors out a shift in order to match memw(a<<Y+b).
if (CanFactorize && (willShiftRightEliminate(Mul1.Value, MaxPowerOf2) ||
willShiftRightEliminate(Mul2.Value, MaxPowerOf2))) {
- DEBUG(dbgs() << "--> Found common factor for two MUL children!\n");
+ LLVM_DEBUG(dbgs() << "--> Found common factor for two MUL children!\n");
int Weight = Mul1.Weight + Mul2.Weight;
int Height = std::max(NodeHeights[Mul1.Value], NodeHeights[Mul2.Value]) + 1;
SDValue Mul1Factored = factorOutPowerOf2(Mul1.Value, MaxPowerOf2);
@@ -1968,9 +2077,9 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
if (getUsesInFunction(GANode->getGlobal()) == 1 && Offset->hasOneUse() &&
getTargetLowering()->isOffsetFoldingLegal(GANode)) {
- DEBUG(dbgs() << "--> Combining GA and offset (" << Offset->getSExtValue()
- << "): ");
- DEBUG(GANode->dump());
+ LLVM_DEBUG(dbgs() << "--> Combining GA and offset ("
+ << Offset->getSExtValue() << "): ");
+ LLVM_DEBUG(GANode->dump(CurDAG));
SDValue NewTGA =
CurDAG->getTargetGlobalAddress(GANode->getGlobal(), SDLoc(GA.Value),
@@ -2014,7 +2123,7 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
// If this is the top level and we haven't factored out a shift, we should try
// to move a constant to the bottom to match addressing modes like memw(rX+C)
if (TopLevel && !CanFactorize && Leaves.hasConst()) {
- DEBUG(dbgs() << "--> Pushing constant to tip of tree.");
+ LLVM_DEBUG(dbgs() << "--> Pushing constant to tip of tree.");
Leaves.pushToBottom(Leaves.pop());
}
@@ -2041,7 +2150,7 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
// Make sure that none of these nodes have been RAUW'd
if ((RootWeights.count(V0.getNode()) && RootWeights[V0.getNode()] == -2) ||
(RootWeights.count(V1.getNode()) && RootWeights[V1.getNode()] == -2)) {
- DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+ LLVM_DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
return balanceSubTree(N, TopLevel);
}
@@ -2075,9 +2184,9 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
int Weight = V0Weight + V1Weight;
Leaves.push(WeightedLeaf(NewNode, Weight, L0.InsertionOrder));
- DEBUG(dbgs() << "--> Built new node (Weight=" << Weight << ",Height="
- << Height << "):\n");
- DEBUG(NewNode.dump());
+ LLVM_DEBUG(dbgs() << "--> Built new node (Weight=" << Weight
+ << ",Height=" << Height << "):\n");
+ LLVM_DEBUG(NewNode.dump());
}
assert(Leaves.size() == 1);
@@ -2101,15 +2210,15 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
}
if (N != NewRoot.getNode()) {
- DEBUG(dbgs() << "--> Root is now: ");
- DEBUG(NewRoot.dump());
+ LLVM_DEBUG(dbgs() << "--> Root is now: ");
+ LLVM_DEBUG(NewRoot.dump());
// Replace all uses of old root by new root
CurDAG->ReplaceAllUsesWith(N, NewRoot.getNode());
// Mark that we have RAUW'd N
RootWeights[N] = -2;
} else {
- DEBUG(dbgs() << "--> Root unchanged.\n");
+ LLVM_DEBUG(dbgs() << "--> Root unchanged.\n");
}
RootWeights[NewRoot.getNode()] = Leaves.top().Weight;
@@ -2132,8 +2241,8 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
if (RootWeights.count(BasePtr.getNode()))
continue;
- DEBUG(dbgs() << "** Rebalancing address calculation in node: ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "** Rebalancing address calculation in node: ");
+ LLVM_DEBUG(N->dump(CurDAG));
// FindRoots
SmallVector<SDNode *, 4> Worklist;
@@ -2173,8 +2282,8 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
NewBasePtr, N->getOperand(3));
- DEBUG(dbgs() << "--> Final node: ");
- DEBUG(N->dump());
+ LLVM_DEBUG(dbgs() << "--> Final node: ");
+ LLVM_DEBUG(N->dump(CurDAG));
}
CurDAG->RemoveDeadNodes();
@@ -2182,4 +2291,3 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
RootHeights.clear();
RootWeights.clear();
}
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
index fc66940ee52d..f4f09dd4e758 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -90,6 +90,8 @@ public:
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
bool tryLoadOfLoadIntrinsic(LoadSDNode *N);
+ bool SelectBrevLdIntrinsic(SDNode *IntN);
+ bool SelectNewCircIntrinsic(SDNode *IntN);
void SelectLoad(SDNode *N);
void SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl);
void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl);
@@ -100,10 +102,17 @@ public:
void SelectIntrinsicWOChain(SDNode *N);
void SelectConstant(SDNode *N);
void SelectConstantFP(SDNode *N);
- void SelectBitcast(SDNode *N);
void SelectV65Gather(SDNode *N);
void SelectV65GatherPred(SDNode *N);
void SelectHVXDualOutput(SDNode *N);
+ void SelectAddSubCarry(SDNode *N);
+ void SelectVAlign(SDNode *N);
+ void SelectVAlignAddr(SDNode *N);
+ void SelectTypecast(SDNode *N);
+ void SelectP2D(SDNode *N);
+ void SelectD2P(SDNode *N);
+ void SelectQ2V(SDNode *N);
+ void SelectV2Q(SDNode *N);
// Include the declarations autogenerated from the selection patterns.
#define GET_DAGISEL_DECL
@@ -122,6 +131,7 @@ private:
void SelectHvxShuffle(SDNode *N);
void SelectHvxRor(SDNode *N);
+ void SelectHvxVAlign(SDNode *N);
bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src);
bool isAlignedMemNode(const MemSDNode *N) const;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index f08c50540656..8aef9b4560d5 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -11,6 +11,7 @@
#include "HexagonISelDAGToDAG.h"
#include "HexagonISelLowering.h"
#include "HexagonTargetMachine.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
@@ -94,18 +95,13 @@ namespace {
// Benes network is a forward delta network immediately followed by
// a reverse delta network.
+enum class ColorKind { None, Red, Black };
// Graph coloring utility used to partition nodes into two groups:
// they will correspond to nodes routed to the upper and lower networks.
struct Coloring {
- enum : uint8_t {
- None = 0,
- Red,
- Black
- };
-
using Node = int;
- using MapType = std::map<Node,uint8_t>;
+ using MapType = std::map<Node, ColorKind>;
static constexpr Node Ignore = Node(-1);
Coloring(ArrayRef<Node> Ord) : Order(Ord) {
@@ -118,10 +114,10 @@ struct Coloring {
return Colors;
}
- uint8_t other(uint8_t Color) {
- if (Color == None)
- return Red;
- return Color == Red ? Black : Red;
+ ColorKind other(ColorKind Color) {
+ if (Color == ColorKind::None)
+ return ColorKind::Red;
+ return Color == ColorKind::Red ? ColorKind::Black : ColorKind::Red;
}
void dump() const;
@@ -139,28 +135,28 @@ private:
return (Pos < Num/2) ? Pos + Num/2 : Pos - Num/2;
}
- uint8_t getColor(Node N) {
+ ColorKind getColor(Node N) {
auto F = Colors.find(N);
- return F != Colors.end() ? F->second : (uint8_t)None;
+ return F != Colors.end() ? F->second : ColorKind::None;
}
- std::pair<bool,uint8_t> getUniqueColor(const NodeSet &Nodes);
+ std::pair<bool, ColorKind> getUniqueColor(const NodeSet &Nodes);
void build();
bool color();
};
} // namespace
-std::pair<bool,uint8_t> Coloring::getUniqueColor(const NodeSet &Nodes) {
- uint8_t Color = None;
+std::pair<bool, ColorKind> Coloring::getUniqueColor(const NodeSet &Nodes) {
+ auto Color = ColorKind::None;
for (Node N : Nodes) {
- uint8_t ColorN = getColor(N);
- if (ColorN == None)
+ ColorKind ColorN = getColor(N);
+ if (ColorN == ColorKind::None)
continue;
- if (Color == None)
+ if (Color == ColorKind::None)
Color = ColorN;
- else if (Color != None && Color != ColorN)
- return { false, None };
+ else if (Color != ColorKind::None && Color != ColorN)
+ return { false, ColorKind::None };
}
return { true, Color };
}
@@ -245,12 +241,12 @@ bool Coloring::color() {
// Coloring failed. Split this node.
Node C = conj(N);
- uint8_t ColorN = other(None);
- uint8_t ColorC = other(ColorN);
+ ColorKind ColorN = other(ColorKind::None);
+ ColorKind ColorC = other(ColorN);
NodeSet &Cs = Edges[C];
NodeSet CopyNs = Ns;
for (Node M : CopyNs) {
- uint8_t ColorM = getColor(M);
+ ColorKind ColorM = getColor(M);
if (ColorM == ColorC) {
// Connect M with C, disconnect M from N.
Cs.insert(M);
@@ -263,10 +259,10 @@ bool Coloring::color() {
Colors[C] = ColorC;
}
- // Explicitly assign "None" all all uncolored nodes.
+ // Explicitly assign "None" to all uncolored nodes.
for (unsigned I = 0; I != Order.size(); ++I)
if (Colors.count(I) == 0)
- Colors[I] = None;
+ Colors[I] = ColorKind::None;
return true;
}
@@ -296,10 +292,21 @@ void Coloring::dump() const {
}
dbgs() << " }\n";
- static const char *const Names[] = { "None", "Red", "Black" };
+ auto ColorKindToName = [](ColorKind C) {
+ switch (C) {
+ case ColorKind::None:
+ return "None";
+ case ColorKind::Red:
+ return "Red";
+ case ColorKind::Black:
+ return "Black";
+ }
+ llvm_unreachable("all ColorKinds should be handled by the switch above");
+ };
+
dbgs() << " Colors: {\n";
for (auto C : Colors)
- dbgs() << " " << C.first << " -> " << Names[C.second] << "\n";
+ dbgs() << " " << C.first << " -> " << ColorKindToName(C.second) << "\n";
dbgs() << " }\n}\n";
}
@@ -471,21 +478,21 @@ bool ReverseDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size,
if (M.empty())
return false;
- uint8_t ColorUp = Coloring::None;
+ ColorKind ColorUp = ColorKind::None;
for (ElemType J = 0; J != Num; ++J) {
ElemType I = P[J];
// I is the position in the input,
// J is the position in the output.
if (I == Ignore)
continue;
- uint8_t C = M.at(I);
- if (C == Coloring::None)
+ ColorKind C = M.at(I);
+ if (C == ColorKind::None)
continue;
// During "Step", inputs cannot switch halves, so if the "up" color
// is still unknown, make sure that it is selected in such a way that
// "I" will stay in the same half.
bool InpUp = I < Num/2;
- if (ColorUp == Coloring::None)
+ if (ColorUp == ColorKind::None)
ColorUp = InpUp ? C : G.other(C);
if ((C == ColorUp) != InpUp) {
// If I should go to a different half than where is it now, give up.
@@ -545,16 +552,16 @@ bool BenesNetwork::route(ElemType *P, RowType *T, unsigned Size,
// Both assignments, i.e. Red->Up and Red->Down are valid, but they will
// result in different controls. Let's pick the one where the first
// control will be "Pass".
- uint8_t ColorUp = Coloring::None;
+ ColorKind ColorUp = ColorKind::None;
for (ElemType J = 0; J != Num; ++J) {
ElemType I = P[J];
if (I == Ignore)
continue;
- uint8_t C = M.at(I);
- if (C == Coloring::None)
+ ColorKind C = M.at(I);
+ if (C == ColorKind::None)
continue;
- if (ColorUp == Coloring::None) {
- ColorUp = (I < Num/2) ? Coloring::Red : Coloring::Black;
+ if (ColorUp == ColorKind::None) {
+ ColorUp = (I < Num / 2) ? ColorKind::Red : ColorKind::Black;
}
unsigned CI = (I < Num/2) ? I+Num/2 : I-Num/2;
if (C == ColorUp) {
@@ -769,6 +776,13 @@ struct ShuffleMask {
size_t H = Mask.size()/2;
return ShuffleMask(Mask.take_back(H));
}
+
+ void print(raw_ostream &OS) const {
+ OS << "MinSrc:" << MinSrc << ", MaxSrc:" << MaxSrc << " {";
+ for (int M : Mask)
+ OS << ' ' << M;
+ OS << " }";
+ }
};
} // namespace
@@ -806,6 +820,7 @@ namespace llvm {
void selectShuffle(SDNode *N);
void selectRor(SDNode *N);
+ void selectVAlign(SDNode *N);
private:
void materialize(const ResultStack &Results);
@@ -904,42 +919,55 @@ static bool isPermutation(ArrayRef<int> Mask) {
}
bool HvxSelector::selectVectorConstants(SDNode *N) {
- // Constant vectors are generated as loads from constant pools.
- // Since they are generated during the selection process, the main
- // selection algorithm is not aware of them. Select them directly
- // here.
- SmallVector<SDNode*,4> Loads;
- SmallVector<SDNode*,16> WorkQ;
+ // Constant vectors are generated as loads from constant pools or as
+ // splats of a constant value. Since they are generated during the
+ // selection process, the main selection algorithm is not aware of them.
+ // Select them directly here.
+ SmallVector<SDNode*,4> Nodes;
+ SetVector<SDNode*> WorkQ;
+
+ // The one-use test for VSPLATW's operand may fail due to dead nodes
+ // left over in the DAG.
+ DAG.RemoveDeadNodes();
// The DAG can change (due to CSE) during selection, so cache all the
// unselected nodes first to avoid traversing a mutating DAG.
- auto IsLoadToSelect = [] (SDNode *N) {
- if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
- SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
- unsigned AddrOpc = Addr.getOpcode();
- if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
- if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
- return true;
+ auto IsNodeToSelect = [] (SDNode *N) {
+ if (N->isMachineOpcode())
+ return false;
+ switch (N->getOpcode()) {
+ case HexagonISD::VZERO:
+ case HexagonISD::VSPLATW:
+ return true;
+ case ISD::LOAD: {
+ SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
+ unsigned AddrOpc = Addr.getOpcode();
+ if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
+ if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+ return true;
+ }
+ break;
}
- return false;
+ // Make sure to select the operand of VSPLATW.
+ bool IsSplatOp = N->hasOneUse() &&
+ N->use_begin()->getOpcode() == HexagonISD::VSPLATW;
+ return IsSplatOp;
};
- WorkQ.push_back(N);
+ WorkQ.insert(N);
for (unsigned i = 0; i != WorkQ.size(); ++i) {
SDNode *W = WorkQ[i];
- if (IsLoadToSelect(W)) {
- Loads.push_back(W);
- continue;
- }
+ if (IsNodeToSelect(W))
+ Nodes.push_back(W);
for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j)
- WorkQ.push_back(W->getOperand(j).getNode());
+ WorkQ.insert(W->getOperand(j).getNode());
}
- for (SDNode *L : Loads)
+ for (SDNode *L : Nodes)
ISel.Select(L);
- return !Loads.empty();
+ return !Nodes.empty();
}
void HvxSelector::materialize(const ResultStack &Results) {
@@ -976,15 +1004,11 @@ void HvxSelector::materialize(const ResultStack &Results) {
MVT OpTy = Op.getValueType().getSimpleVT();
if (Part != OpRef::Whole) {
assert(Part == OpRef::LoHalf || Part == OpRef::HiHalf);
- if (Op.getOpcode() == HexagonISD::VCOMBINE) {
- Op = (Part == OpRef::HiHalf) ? Op.getOperand(0) : Op.getOperand(1);
- } else {
- MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(),
- OpTy.getVectorNumElements()/2);
- unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
- : Hexagon::vsub_hi;
- Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op);
- }
+ MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(),
+ OpTy.getVectorNumElements()/2);
+ unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
+ : Hexagon::vsub_hi;
+ Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op);
}
Ops.push_back(Op);
} // for (Node : Results)
@@ -1030,25 +1054,53 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
int VecLen = SM.Mask.size();
MVT Ty = getSingleVT(MVT::i8);
- if (SM.MaxSrc - SM.MinSrc < int(HwLen)) {
- if (SM.MaxSrc < int(HwLen)) {
- memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen);
- return Va;
+ auto IsExtSubvector = [] (ShuffleMask M) {
+ assert(M.MinSrc >= 0 && M.MaxSrc >= 0);
+ for (int I = 0, E = M.Mask.size(); I != E; ++I) {
+ if (M.Mask[I] >= 0 && M.Mask[I]-I != M.MinSrc)
+ return false;
}
- if (SM.MinSrc >= int(HwLen)) {
- for (int I = 0; I != VecLen; ++I) {
- int M = SM.Mask[I];
- if (M != -1)
- M -= HwLen;
- NewMask[I] = M;
+ return true;
+ };
+
+ if (SM.MaxSrc - SM.MinSrc < int(HwLen)) {
+ if (SM.MinSrc == 0 || SM.MinSrc == int(HwLen) || !IsExtSubvector(SM)) {
+ // If the mask picks elements from only one of the operands, return
+ // that operand, and update the mask to use index 0 to refer to the
+ // first element of that operand.
+ // If the mask extracts a subvector, it will be handled below, so
+ // skip it here.
+ if (SM.MaxSrc < int(HwLen)) {
+ memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen);
+ return Va;
+ }
+ if (SM.MinSrc >= int(HwLen)) {
+ for (int I = 0; I != VecLen; ++I) {
+ int M = SM.Mask[I];
+ if (M != -1)
+ M -= HwLen;
+ NewMask[I] = M;
+ }
+ return Vb;
}
- return Vb;
+ }
+ int MinSrc = SM.MinSrc;
+ if (SM.MaxSrc < int(HwLen)) {
+ Vb = Va;
+ } else if (SM.MinSrc > int(HwLen)) {
+ Va = Vb;
+ MinSrc = SM.MinSrc - HwLen;
}
const SDLoc &dl(Results.InpNode);
- SDValue S = DAG.getTargetConstant(SM.MinSrc, dl, MVT::i32);
- if (isUInt<3>(SM.MinSrc)) {
- Results.push(Hexagon::V6_valignbi, Ty, {Vb, Va, S});
+ if (isUInt<3>(MinSrc) || isUInt<3>(HwLen-MinSrc)) {
+ bool IsRight = isUInt<3>(MinSrc); // Right align.
+ SDValue S = DAG.getTargetConstant(IsRight ? MinSrc : HwLen-MinSrc,
+ dl, MVT::i32);
+ unsigned Opc = IsRight ? Hexagon::V6_valignbi
+ : Hexagon::V6_vlalignbi;
+ Results.push(Opc, Ty, {Vb, Va, S});
} else {
+ SDValue S = DAG.getTargetConstant(MinSrc, dl, MVT::i32);
Results.push(Hexagon::A2_tfrsi, MVT::i32, {S});
unsigned Top = Results.top();
Results.push(Hexagon::V6_valignb, Ty, {Vb, Va, OpRef::res(Top)});
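A rough byte-level model (assumed semantics, not taken from the patch) of the window extraction behind the valignbi/vlalignbi choice above: the result is the HwLen-byte window of the concatenated sources starting at offset MinSrc, and the alignment amount is expressed either as MinSrc (right align) or as HwLen - MinSrc (left align), whichever fits the 3-bit immediate. extractWindow is a hypothetical helper.

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical model: Lo supplies byte indices [0, HwLen) and Hi supplies
// [HwLen, 2*HwLen); the result is the HwLen-byte window starting at MinSrc.
static std::vector<unsigned char>
extractWindow(const std::vector<unsigned char> &Lo,
              const std::vector<unsigned char> &Hi, size_t MinSrc) {
  size_t HwLen = Lo.size();
  assert(Hi.size() == HwLen && MinSrc < HwLen);
  std::vector<unsigned char> Out(HwLen);
  for (size_t I = 0; I != HwLen; ++I) {
    size_t Src = MinSrc + I;
    Out[I] = Src < HwLen ? Lo[Src] : Hi[Src - HwLen];
  }
  return Out;
}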
@@ -1259,6 +1311,8 @@ OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb,
return shuffp1(ShuffleMask(PackedMask), P, Results);
SmallVector<int,256> MaskL(VecLen), MaskR(VecLen);
+ splitMask(SM.Mask, MaskL, MaskR);
+
OpRef L = shuffp1(ShuffleMask(MaskL), Va, Results);
OpRef R = shuffp1(ShuffleMask(MaskR), Vb, Results);
if (!L.isValid() || !R.isValid())
@@ -1914,7 +1968,6 @@ void HvxSelector::selectShuffle(SDNode *N) {
// If the mask is all -1's, generate "undef".
if (!UseLeft && !UseRight) {
ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
- DAG.RemoveDeadNode(N);
return;
}
@@ -1956,8 +2009,8 @@ void HvxSelector::selectRor(SDNode *N) {
SDNode *NewN = nullptr;
if (auto *CN = dyn_cast<ConstantSDNode>(RotV.getNode())) {
- unsigned S = CN->getZExtValue();
- if (S % HST.getVectorLength() == 0) {
+ unsigned S = CN->getZExtValue() % HST.getVectorLength();
+ if (S == 0) {
NewN = VecV.getNode();
} else if (isUInt<3>(S)) {
SDValue C = DAG.getTargetConstant(S, dl, MVT::i32);
@@ -1970,6 +2023,15 @@ void HvxSelector::selectRor(SDNode *N) {
NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV});
ISel.ReplaceNode(N, NewN);
+}
+
+void HvxSelector::selectVAlign(SDNode *N) {
+ SDValue Vv = N->getOperand(0);
+ SDValue Vu = N->getOperand(1);
+ SDValue Rt = N->getOperand(2);
+ SDNode *NewN = DAG.getMachineNode(Hexagon::V6_valignb, SDLoc(N),
+ N->getValueType(0), {Vv, Vu, Rt});
+ ISel.ReplaceNode(N, NewN);
DAG.RemoveDeadNode(N);
}
@@ -1981,7 +2043,15 @@ void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) {
HvxSelector(*this, *CurDAG).selectRor(N);
}
+void HexagonDAGToDAGISel::SelectHvxVAlign(SDNode *N) {
+ HvxSelector(*this, *CurDAG).selectVAlign(N);
+}
+
void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
+ if (!HST->usePackets()) {
+ report_fatal_error("Support for gather requires packets, "
+ "which are disabled");
+ }
const SDLoc &dl(N);
SDValue Chain = N->getOperand(0);
SDValue Address = N->getOperand(2);
@@ -2017,11 +2087,14 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
- ReplaceUses(N, Result);
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, Result);
}
void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
+ if (!HST->usePackets()) {
+ report_fatal_error("Support for gather requires packets, "
+ "which are disabled");
+ }
const SDLoc &dl(N);
SDValue Chain = N->getOperand(0);
SDValue Address = N->getOperand(2);
@@ -2056,8 +2129,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
- ReplaceUses(N, Result);
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, Result);
}
void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
@@ -2100,5 +2172,3 @@ void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
ReplaceUses(SDValue(N, 1), SDValue(Result, 1));
CurDAG->RemoveDeadNode(N);
}
-
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 0e0da2ddc400..604d84994b6c 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -103,427 +104,52 @@ static cl::opt<int> MaxStoresPerMemsetOptSizeCL("max-store-memset-Os",
cl::Hidden, cl::ZeroOrMore, cl::init(4),
cl::desc("Max #stores to inline memset"));
+static cl::opt<bool> AlignLoads("hexagon-align-loads",
+ cl::Hidden, cl::init(false),
+ cl::desc("Rewrite unaligned loads as a pair of aligned loads"));
+
namespace {
class HexagonCCState : public CCState {
- unsigned NumNamedVarArgParams;
+ unsigned NumNamedVarArgParams = 0;
public:
- HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+ HexagonCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
- int NumNamedVarArgParams)
- : CCState(CC, isVarArg, MF, locs, C),
- NumNamedVarArgParams(NumNamedVarArgParams) {}
-
+ unsigned NumNamedArgs)
+ : CCState(CC, IsVarArg, MF, locs, C),
+ NumNamedVarArgParams(NumNamedArgs) {}
unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
};
- enum StridedLoadKind {
- Even = 0,
- Odd,
- NoPattern
- };
-
} // end anonymous namespace
-// Implement calling convention for Hexagon.
-
-static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 };
-static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
-static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
-static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
-
-static bool
-CC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
-
-static bool
-CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- HexagonCCState &HState = static_cast<HexagonCCState &>(State);
-
- if (ValNo < HState.getNumNamedVarArgParams()) {
- // Deal with named arguments.
- return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
- }
-
- // Deal with un-named arguments.
- unsigned Offset;
- if (ArgFlags.isByVal()) {
- // If pass-by-value, the size allocated on stack is decided
- // by ArgFlags.getByValSize(), not by the size of LocVT.
- Offset = State.AllocateStack(ArgFlags.getByValSize(),
- ArgFlags.getByValAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- }
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- Offset = State.AllocateStack(8, 8);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 ||
- LocVT == MVT::v16i8) {
- Offset = State.AllocateStack(16, 16);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 ||
- LocVT == MVT::v32i8) {
- Offset = State.AllocateStack(32, 32);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v64i8 || LocVT == MVT::v512i1) {
- Offset = State.AllocateStack(64, 64);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) {
- Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
- LocVT == MVT::v256i8) {
- Offset = State.AllocateStack(256, 256);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- llvm_unreachable(nullptr);
-}
-
-static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (ArgFlags.isByVal()) {
- // Passed on stack.
- unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(),
- ArgFlags.getByValAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- if (LocVT == MVT::i1) {
- LocVT = MVT::i32;
- } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
- LocVT = MVT::i64;
- LocInfo = CCValAssign::BCvt;
- }
-
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) {
- unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
-
- auto &HST = State.getMachineFunction().getSubtarget<HexagonSubtarget>();
- if (HST.isHVXVectorType(LocVT)) {
- if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- return true; // CC didn't match.
-}
+// Implement calling convention for Hexagon.
-static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static const MCPhysReg RegList[] = {
- Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
- Hexagon::R5
+static bool CC_SkipOdd(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ static const MCPhysReg ArgRegs[] = {
+ Hexagon::R0, Hexagon::R1, Hexagon::R2,
+ Hexagon::R3, Hexagon::R4, Hexagon::R5
};
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
+ const unsigned NumArgRegs = array_lengthof(ArgRegs);
+ unsigned RegNum = State.getFirstUnallocated(ArgRegs);
- unsigned Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
-}
-
-static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
-
- static const MCPhysReg RegList1[] = {
- Hexagon::D1, Hexagon::D2
- };
- static const MCPhysReg RegList2[] = {
- Hexagon::R1, Hexagon::R3
- };
- if (unsigned Reg = State.AllocateReg(RegList1, RegList2)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
+ // RegNum is an index into ArgRegs: skip a register if RegNum is odd.
+ if (RegNum != NumArgRegs && RegNum % 2 == 1)
+ State.AllocateReg(ArgRegs[RegNum]);
- unsigned Offset = State.AllocateStack(8, 8, Hexagon::D2);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ // Always return false here, as this function only makes sure that the first
+ // unallocated register has an even register number and does not actually
+ // allocate a register for the current argument.
return false;
}
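The effect of CC_SkipOdd above on register numbering can be sketched with a small, hypothetical helper (not the TableGen-generated calling-convention code): before a 64-bit value is assigned to an R-register pair, an odd first-free register is skipped so the pair starts on an even register (R1:R0, R3:R2, R5:R4).

// Hypothetical illustration: given the index of the first unallocated argument
// register, burn it when it is odd so the next allocation starts even.
static unsigned nextEvenArgReg(unsigned FirstUnallocated, unsigned NumArgRegs) {
  if (FirstUnallocated != NumArgRegs && FirstUnallocated % 2 == 1)
    return FirstUnallocated + 1; // skip the odd register, as CC_SkipOdd does
  return FirstUnallocated;
}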
-static bool CC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static const MCPhysReg VecLstS[] = {
- Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4,
- Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9,
- Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14,
- Hexagon::V15
- };
- static const MCPhysReg VecLstD[] = {
- Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, Hexagon::W4,
- Hexagon::W5, Hexagon::W6, Hexagon::W7
- };
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (HST.useHVX64BOps() &&
- (LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) {
- if (unsigned Reg = State.AllocateReg(VecLstS)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(64, 64);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (HST.useHVX64BOps() && (LocVT == MVT::v32i32 ||
- LocVT == MVT::v64i16 || LocVT == MVT::v128i8)) {
- if (unsigned Reg = State.AllocateReg(VecLstD)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- // 128B Mode
- if (HST.useHVX128BOps() && (LocVT == MVT::v64i32 ||
- LocVT == MVT::v128i16 || LocVT == MVT::v256i8)) {
- if (unsigned Reg = State.AllocateReg(VecLstD)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(256, 256);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- if (HST.useHVX128BOps() &&
- (LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) {
- if (unsigned Reg = State.AllocateReg(VecLstS)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- unsigned Offset = State.AllocateStack(128, 128);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
- }
- return true;
-}
-
-static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (LocVT == MVT::i1) {
- // Return values of type MVT::i1 still need to be assigned to R0, but
- // the value type needs to remain i1. LowerCallResult will deal with it,
- // but it needs to recognize i1 as the value type.
- LocVT = MVT::i32;
- } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
- LocVT = MVT::i32;
- ValVT = MVT::i32;
- if (ArgFlags.isSExt())
- LocInfo = CCValAssign::SExt;
- else if (ArgFlags.isZExt())
- LocInfo = CCValAssign::ZExt;
- else
- LocInfo = CCValAssign::AExt;
- } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
- LocVT = MVT::i64;
- LocInfo = CCValAssign::BCvt;
- } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 ||
- LocVT == MVT::v16i32 || LocVT == MVT::v512i1) {
- LocVT = MVT::v16i32;
- ValVT = MVT::v16i32;
- LocInfo = CCValAssign::Full;
- } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 ||
- LocVT == MVT::v32i32 ||
- (LocVT == MVT::v1024i1 && HST.useHVX128BOps())) {
- LocVT = MVT::v32i32;
- ValVT = MVT::v32i32;
- LocInfo = CCValAssign::Full;
- } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 ||
- LocVT == MVT::v64i32) {
- LocVT = MVT::v64i32;
- ValVT = MVT::v64i32;
- LocInfo = CCValAssign::Full;
- }
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
-
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
- if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) {
- if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
- return false;
- }
- return true; // CC didn't match.
-}
-
-static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- // Note that use of registers beyond R1 is not ABI compliant. However there
- // are (experimental) IR passes which generate internal functions that
- // return structs using these additional registers.
- static const uint16_t RegList[] = { Hexagon::R0, Hexagon::R1,
- Hexagon::R2, Hexagon::R3,
- Hexagon::R4, Hexagon::R5 };
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
-
-static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
+#include "HexagonGenCallingConv.inc"
-static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- auto &MF = State.getMachineFunction();
- auto &HST = MF.getSubtarget<HexagonSubtarget>();
-
- if (LocVT == MVT::v16i32) {
- if (unsigned Reg = State.AllocateReg(Hexagon::V0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- } else if (LocVT == MVT::v32i32) {
- unsigned Req = HST.useHVX128BOps() ? Hexagon::V0 : Hexagon::W0;
- if (unsigned Reg = State.AllocateReg(Req)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- } else if (LocVT == MVT::v64i32) {
- if (unsigned Reg = State.AllocateReg(Hexagon::W0)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- }
-
- return true;
-}
void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
if (VT != PromotedLdStVT) {
@@ -558,11 +184,14 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
bool
HexagonTargetLowering::CanLowerReturn(
- CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+
+ if (MF.getSubtarget<HexagonSubtarget>().useHVXOps())
+ return CCInfo.CheckReturn(Outs, RetCC_Hexagon_HVX);
return CCInfo.CheckReturn(Outs, RetCC_Hexagon);
}
@@ -571,7 +200,7 @@ HexagonTargetLowering::CanLowerReturn(
// the value is stored in memory pointed by a pointer passed by caller.
SDValue
HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
@@ -579,11 +208,14 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slot.
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze return values of ISD::RET
- CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -624,17 +256,20 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
/// being lowered. Returns a SDNode with the same number of values as the
/// ISD::CALL.
SDValue HexagonTargetLowering::LowerCallResult(
- SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -683,67 +318,57 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
- bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
bool DoesNotReturn = CLI.DoesNotReturn;
- bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool IsStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto PtrVT = getPointerTy(MF.getDataLayout());
- // Check for varargs.
- unsigned NumNamedVarArgParams = -1U;
- if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = GAN->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
- if (const Function* F = dyn_cast<Function>(GV)) {
- // If a function has zero args and is a vararg function, that's
- // disallowed so it must be an undeclared function. Do not assume
- // varargs if the callee is undefined.
- if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0)
- NumNamedVarArgParams = F->getFunctionType()->getNumParams();
- }
- }
+ unsigned NumParams = CLI.CS.getInstruction()
+ ? CLI.CS.getFunctionType()->getNumParams()
+ : 0;
+ if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- HexagonCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), NumNamedVarArgParams);
+ HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ NumParams);
- if (IsVarArg)
- CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_HVX);
else
CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
if (Attr.getValueAsString() == "true")
- IsTailCall = false;
+ CLI.IsTailCall = false;
- if (IsTailCall) {
+ if (CLI.IsTailCall) {
bool StructAttrFlag = MF.getFunction().hasStructRetAttr();
- IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- IsVarArg, IsStructRet,
- StructAttrFlag,
- Outs, OutVals, Ins, DAG);
+ CLI.IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ IsVarArg, IsStructRet, StructAttrFlag, Outs,
+ OutVals, Ins, DAG);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isMemLoc()) {
- IsTailCall = false;
+ CLI.IsTailCall = false;
break;
}
}
- DEBUG(dbgs() << (IsTailCall ? "Eligible for Tail Call\n"
- : "Argument must be passed on stack. "
- "Not eligible for Tail Call\n"));
+ LLVM_DEBUG(dbgs() << (CLI.IsTailCall ? "Eligible for Tail Call\n"
+ : "Argument must be passed on stack. "
+ "Not eligible for Tail Call\n"));
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
- auto &HRI = *Subtarget.getRegisterInfo();
+ const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
@@ -789,7 +414,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VA.getLocVT().getStoreSizeInBits() >> 3);
if (Flags.isByVal()) {
// The argument is a struct passed by value. According to LLVM, "Arg"
- // is is pointer.
+ // is a pointer.
MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain,
Flags, DAG, dl));
} else {
@@ -807,14 +432,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
}
- if (NeedsArgAlign && Subtarget.hasV60TOps()) {
- DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
- // V6 vectors passed by value have 64 or 128 byte alignment depending
- // on whether we are 64 byte vector mode or 128 byte.
- bool UseHVX128B = Subtarget.useHVX128BOps();
- assert(Subtarget.useHVXOps());
- const unsigned ObjAlign = UseHVX128B ? 128 : 64;
- LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign);
+ if (NeedsArgAlign && Subtarget.hasV60Ops()) {
+ LLVM_DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
+ unsigned VecAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ LargestAlignSeen = std::max(LargestAlignSeen, VecAlign);
MFI.ensureMaxAlignment(LargestAlignSeen);
}
// Transform all store nodes into one single node because all store
@@ -823,7 +444,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
SDValue Glue;
- if (!IsTailCall) {
+ if (!CLI.IsTailCall) {
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
Glue = Chain.getValue(1);
}
@@ -832,7 +453,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// chain and flag operands which copy the outgoing args into registers.
// The Glue is necessary since all emitted instructions must be
// stuck together.
- if (!IsTailCall) {
+ if (!CLI.IsTailCall) {
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, Glue);
@@ -891,7 +512,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Glue.getNode())
Ops.push_back(Glue);
- if (IsTailCall) {
+ if (CLI.IsTailCall) {
MFI.setHasTailCall();
return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
}
@@ -916,66 +537,36 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InVals, OutVals, Callee);
}
-static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
- SDValue &Base, SDValue &Offset,
- bool &IsInc, SelectionDAG &DAG) {
- if (Ptr->getOpcode() != ISD::ADD)
- return false;
-
- auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget());
-
- bool ValidHVX128BType =
- HST.useHVX128BOps() && (VT == MVT::v32i32 ||
- VT == MVT::v64i16 || VT == MVT::v128i8);
- bool ValidHVXType =
- HST.useHVX64BOps() && (VT == MVT::v16i32 ||
- VT == MVT::v32i16 || VT == MVT::v64i8);
-
- if (ValidHVX128BType || ValidHVXType || VT == MVT::i64 || VT == MVT::i32 ||
- VT == MVT::i16 || VT == MVT::i8) {
- IsInc = (Ptr->getOpcode() == ISD::ADD);
- Base = Ptr->getOperand(0);
- Offset = Ptr->getOperand(1);
- // Ensure that Offset is a constant.
- return isa<ConstantSDNode>(Offset);
- }
-
- return false;
-}
-
-/// getPostIndexedAddressParts - returns true by value, base pointer and
-/// offset pointer and addressing mode by reference if this node can be
-/// combined with a load / store to form a post-indexed load / store.
+/// Returns true by value, base pointer and offset pointer and addressing
+/// mode by reference if this node can be combined with a load / store to
+/// form a post-indexed load / store.
bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
- SDValue &Base,
- SDValue &Offset,
- ISD::MemIndexedMode &AM,
- SelectionDAG &DAG) const
-{
- EVT VT;
-
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- VT = LD->getMemoryVT();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- VT = ST->getMemoryVT();
- if (ST->getValue().getValueType() == MVT::i64 && ST->isTruncatingStore())
- return false;
- } else {
+ SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(N);
+ if (!LSN)
+ return false;
+ EVT VT = LSN->getMemoryVT();
+ if (!VT.isSimple())
+ return false;
+ bool IsLegalType = VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64 || VT == MVT::f32 || VT == MVT::f64 ||
+ VT == MVT::v2i16 || VT == MVT::v2i32 || VT == MVT::v4i8 ||
+ VT == MVT::v4i16 || VT == MVT::v8i8 ||
+ Subtarget.isHVXVectorType(VT.getSimpleVT());
+ if (!IsLegalType)
return false;
- }
- bool IsInc = false;
- bool isLegal = getIndexedAddressParts(Op, VT, Base, Offset, IsInc, DAG);
- if (isLegal) {
- auto &HII = *Subtarget.getInstrInfo();
- int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
- if (HII.isValidAutoIncImm(VT, OffsetVal)) {
- AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
- return true;
- }
- }
+ if (Op->getOpcode() != ISD::ADD)
+ return false;
+ Base = Op->getOperand(0);
+ Offset = Op->getOperand(1);
+ if (!isa<ConstantSDNode>(Offset.getNode()))
+ return false;
+ AM = ISD::POST_INC;
- return false;
+ int32_t V = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+ return Subtarget.getInstrInfo()->isValidAutoIncImm(VT, V);
}
SDValue
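For illustration only (loadPostInc is a hypothetical helper, not from the patch), a post-indexed load of the kind recognized above performs the memory access at the current base and then advances the base by the constant offset:

#include <cstdint>
#include <cstring>

struct PostIncLoad {
  uint32_t Value;     // value loaded from the old base
  uint8_t *NextBase;  // base pointer after the post-increment
};

// Hypothetical model of an ISD::POST_INC load with a constant byte offset.
static PostIncLoad loadPostInc(uint8_t *Base, int32_t OffsetBytes) {
  PostIncLoad R;
  std::memcpy(&R.Value, Base, sizeof(R.Value)); // access uses the old base
  R.NextBase = Base + OffsetBytes;              // base is updated afterwards
  return R;
}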
@@ -1080,7 +671,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
if (A == 0)
A = HFI.getStackAlignment();
- DEBUG({
+ LLVM_DEBUG({
dbgs () << __func__ << " Align: " << A << " Size: ";
Size.getNode()->dump(&DAG);
dbgs() << "\n";
@@ -1095,20 +686,22 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
}
SDValue HexagonTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- MachineRegisterInfo &RegInfo = MF.getRegInfo();
- auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ MF.getFunction().getFunctionType()->getNumParams());
- CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
+ if (Subtarget.useHVXOps())
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_HVX);
+ else
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
// For LLVM, in the case when returning a struct by value (>8byte),
// the first argument is a pointer that points to the location on caller's
@@ -1117,110 +710,62 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// equal to) 8 bytes. If not, no address will be passed into callee and
 // callee returns the result directly through R0/R1.
- SmallVector<SDValue, 8> MemOps;
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- unsigned ObjSize;
- unsigned StackLocation;
- int FI;
-
- if ( (VA.isRegLoc() && !Flags.isByVal())
- || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) {
- // Arguments passed in registers
- // 1. int, long long, ptr args that get allocated in register.
- // 2. Large struct that gets an register to put its address in.
- EVT RegVT = VA.getLocVT();
- if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
- RegVT == MVT::i32 || RegVT == MVT::f32) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- if (VA.getLocInfo() == CCValAssign::BCvt)
- RegVT = VA.getValVT();
- SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
- // Treat values of type MVT::i1 specially: they are passed in
- // registers of type i32, but they need to remain as values of
- // type i1 for consistency of the argument lowering.
- if (VA.getValVT() == MVT::i1) {
- // Generate a copy into a predicate register and use the value
- // of the register as the "InVal".
- unsigned PReg =
- RegInfo.createVirtualRegister(&Hexagon::PredRegsRegClass);
- SDNode *T = DAG.getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
- Copy.getValue(0));
- Copy = DAG.getCopyToReg(Copy.getValue(1), dl, PReg, SDValue(T, 0));
- Copy = DAG.getCopyFromReg(Copy, dl, PReg, MVT::i1);
- }
- InVals.push_back(Copy);
- Chain = Copy.getValue(1);
- } else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- if (VA.getLocInfo() == CCValAssign::BCvt)
- RegVT = VA.getValVT();
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
-
- // Single Vector
- } else if ((RegVT == MVT::v16i32 ||
- RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxVRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (Subtarget.useHVX128BOps() &&
- ((RegVT == MVT::v32i32 ||
- RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxVRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
-
- // Double Vector
- } else if ((RegVT == MVT::v32i32 ||
- RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxWRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (Subtarget.useHVX128BOps() &&
- ((RegVT == MVT::v64i32 ||
- RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) {
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxWRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
- } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) {
- assert(0 && "need to support VecPred regs");
- unsigned VReg =
- RegInfo.createVirtualRegister(&Hexagon::HvxQRRegClass);
- RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ bool ByVal = Flags.isByVal();
+
+ // Arguments passed in registers:
+ // 1. 32- and 64-bit values and HVX vectors are passed directly,
+ // 2. Large structs are passed via an address, and the address is
+ // passed in a register.
+ if (VA.isRegLoc() && ByVal && Flags.getByValSize() <= 8)
+ llvm_unreachable("ByValSize must be bigger than 8 bytes");
+
+    bool InReg = VA.isRegLoc() &&
+                 (!ByVal || Flags.getByValSize() > 8);
+
+ if (InReg) {
+ MVT RegVT = VA.getLocVT();
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ RegVT = VA.getValVT();
+
+ const TargetRegisterClass *RC = getRegClassFor(RegVT);
+ unsigned VReg = MRI.createVirtualRegister(RC);
+ SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+ // Treat values of type MVT::i1 specially: they are passed in
+ // registers of type i32, but they need to remain as values of
+ // type i1 for consistency of the argument lowering.
+ if (VA.getValVT() == MVT::i1) {
+ assert(RegVT.getSizeInBits() <= 32);
+ SDValue T = DAG.getNode(ISD::AND, dl, RegVT,
+ Copy, DAG.getConstant(1, dl, RegVT));
+ Copy = DAG.getSetCC(dl, MVT::i1, T, DAG.getConstant(0, dl, RegVT),
+ ISD::SETNE);
} else {
- assert (0);
+#ifndef NDEBUG
+ unsigned RegSize = RegVT.getSizeInBits();
+ assert(RegSize == 32 || RegSize == 64 ||
+ Subtarget.isHVXVectorType(RegVT));
+#endif
}
- } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) {
- assert (0 && "ByValSize must be bigger than 8 bytes");
+ InVals.push_back(Copy);
+ MRI.addLiveIn(VA.getLocReg(), VReg);
} else {
- // Sanity check.
- assert(VA.isMemLoc());
-
- if (Flags.isByVal()) {
- // If it's a byval parameter, then we need to compute the
- // "real" size, not the size of the pointer.
- ObjSize = Flags.getByValSize();
- } else {
- ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3;
- }
+ assert(VA.isMemLoc() && "Argument should be passed in memory");
- StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
- // Create the frame index object for this incoming parameter...
- FI = MFI.CreateFixedObject(ObjSize, StackLocation, true);
+ // If it's a byval parameter, then we need to compute the
+ // "real" size, not the size of the pointer.
+ unsigned ObjSize = Flags.isByVal()
+ ? Flags.getByValSize()
+ : VA.getLocVT().getStoreSizeInBits() / 8;
- // Create the SelectionDAG nodes cordl, responding to a load
- // from this parameter.
+ // Create the frame index object for this incoming parameter.
+ int Offset = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
+ int FI = MFI.CreateFixedObject(ObjSize, Offset, true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
if (Flags.isByVal()) {
@@ -1229,22 +774,19 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// location.
InVals.push_back(FIN);
} else {
- InVals.push_back(
- DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
+ SDValue L = DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI, 0));
+ InVals.push_back(L);
}
}
}
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- if (isVarArg) {
+ if (IsVarArg) {
// This will point to the next argument passed via stack.
- int FrameIndex = MFI.CreateFixedObject(Hexagon_PointerSize,
- HEXAGON_LRFP_SIZE +
- CCInfo.getNextStackOffset(),
- true);
- FuncInfo.setVarArgsFrameIndex(FrameIndex);
+ int Offset = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset();
+ int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true);
+ HMFI.setVarArgsFrameIndex(FI);
}
return Chain;
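
The i1 case above normalizes a boolean that arrived in a 32-bit register by masking out everything but bit 0 and comparing against zero. A minimal C++ equivalent of that computation (the function name is made up for illustration):

static bool materializeI1FromReg(unsigned Reg) {
  return (Reg & 1u) != 0;   // AND with 1, then setcc-ne against 0
}
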
@@ -1262,66 +804,62 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-static bool isSExtFree(SDValue N) {
- // A sign-extend of a truncate of a sign-extend is free.
- if (N.getOpcode() == ISD::TRUNCATE &&
- N.getOperand(0).getOpcode() == ISD::AssertSext)
- return true;
- // We have sign-extended loads.
- if (N.getOpcode() == ISD::LOAD)
- return true;
- return false;
-}
-
SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
-
+ const SDLoc &dl(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(ty(LHS)))
- return LowerHvxSetCC(Op, DAG);
-
- SDValue Cmp = Op.getOperand(2);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cmp)->get();
-
- EVT VT = Op.getValueType();
- EVT LHSVT = LHS.getValueType();
- EVT RHSVT = RHS.getValueType();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ MVT ResTy = ty(Op);
+ MVT OpTy = ty(LHS);
- if (LHSVT == MVT::v2i16) {
- assert(ISD::isSignedIntSetCC(CC) || ISD::isUnsignedIntSetCC(CC));
- unsigned ExtOpc = ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
- SDValue LX = DAG.getNode(ExtOpc, dl, MVT::v2i32, LHS);
- SDValue RX = DAG.getNode(ExtOpc, dl, MVT::v2i32, RHS);
- SDValue SC = DAG.getNode(ISD::SETCC, dl, MVT::v2i1, LX, RX, Cmp);
- return SC;
+ if (OpTy == MVT::v2i16 || OpTy == MVT::v4i8) {
+ MVT ElemTy = OpTy.getVectorElementType();
+ assert(ElemTy.isScalarInteger());
+ MVT WideTy = MVT::getVectorVT(MVT::getIntegerVT(2*ElemTy.getSizeInBits()),
+ OpTy.getVectorNumElements());
+ return DAG.getSetCC(dl, ResTy,
+ DAG.getSExtOrTrunc(LHS, SDLoc(LHS), WideTy),
+ DAG.getSExtOrTrunc(RHS, SDLoc(RHS), WideTy), CC);
}
// Treat all other vector types as legal.
- if (VT.isVector())
+ if (ResTy.isVector())
return Op;
- // Equals and not equals should use sign-extend, not zero-extend, since
- // we can represent small negative values in the compare instructions.
+ // Comparisons of short integers should use sign-extend, not zero-extend,
+ // since we can represent small negative values in the compare instructions.
// The LLVM default is to use zero-extend arbitrarily in these cases.
- if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
- (RHSVT == MVT::i8 || RHSVT == MVT::i16) &&
- (LHSVT == MVT::i8 || LHSVT == MVT::i16)) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
- if (C && C->getAPIntValue().isNegative()) {
- LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
- RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
- return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
- LHS, RHS, Op.getOperand(2));
- }
- if (isSExtFree(LHS) || isSExtFree(RHS)) {
- LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
- RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
- return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
- LHS, RHS, Op.getOperand(2));
+ auto isSExtFree = [this](SDValue N) {
+ switch (N.getOpcode()) {
+ case ISD::TRUNCATE: {
+ // A sign-extend of a truncate of a sign-extend is free.
+ SDValue Op = N.getOperand(0);
+ if (Op.getOpcode() != ISD::AssertSext)
+ return false;
+ EVT OrigTy = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ unsigned ThisBW = ty(N).getSizeInBits();
+ unsigned OrigBW = OrigTy.getSizeInBits();
+      // The type that was sign-extended to get the AssertSext must be no
+      // wider than the type of N (so that N still has the same value as
+      // the original).
+ return ThisBW >= OrigBW;
+ }
+ case ISD::LOAD:
+ // We have sign-extended loads.
+ return true;
}
+ return false;
+ };
+
+ if (OpTy == MVT::i8 || OpTy == MVT::i16) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+ bool IsNegative = C && C->getAPIntValue().isNegative();
+ if (IsNegative || isSExtFree(LHS) || isSExtFree(RHS))
+ return DAG.getSetCC(dl, ResTy,
+ DAG.getSExtOrTrunc(LHS, SDLoc(LHS), MVT::i32),
+ DAG.getSExtOrTrunc(RHS, SDLoc(RHS), MVT::i32), CC);
}
+
return SDValue();
}
@@ -1393,8 +931,7 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
else if (isVTi1Type)
T = DAG.getTargetConstantPool(CVal, ValTy, Align, Offset, TF);
else
- T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset,
- TF);
+ T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset, TF);
assert(cast<ConstantPoolSDNode>(T)->getTargetFlags() == TF &&
"Inconsistent target flag encountered");
@@ -1480,7 +1017,7 @@ HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
if (RM == Reloc::Static) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
const GlobalObject *GO = GV->getBaseObject();
- if (GO && HLOF.isGlobalInSmallSection(GO, HTM))
+ if (GO && Subtarget.useSmallData() && HLOF.isGlobalInSmallSection(GO, HTM))
return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA);
return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
}
@@ -1688,13 +1225,15 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
const HexagonSubtarget &ST)
: TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
Subtarget(ST) {
- bool IsV4 = !Subtarget.hasV5TOps();
+ bool IsV4 = !Subtarget.hasV5Ops();
auto &HRI = *Subtarget.getRegisterInfo();
setPrefLoopAlignment(4);
setPrefFunctionAlignment(4);
setMinFunctionAlignment(2);
setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
+ setBooleanContents(TargetLoweringBase::UndefinedBooleanContent);
+ setBooleanVectorContents(TargetLoweringBase::UndefinedBooleanContent);
setMaxAtomicSizeInBitsSupported(64);
setMinCmpXchgSizeInBits(32);
@@ -1728,45 +1267,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV5Ops()) {
addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
}
- if (Subtarget.hasV60TOps()) {
- if (Subtarget.useHVX64BOps()) {
- addRegisterClass(MVT::v64i8, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v32i16, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v16i32, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass);
- // These "short" boolean vector types should be legal because
- // they will appear as results of vector compares. If they were
- // not legal, type legalization would try to make them legal
- // and that would require using operations that do not use or
- // produce such types. That, in turn, would imply using custom
- // nodes, which would be unoptimizable by the DAG combiner.
- // The idea is to rely on target-independent operations as much
- // as possible.
- addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v512i1, &Hexagon::HvxQRRegClass);
- } else if (Subtarget.useHVX128BOps()) {
- addRegisterClass(MVT::v128i8, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v64i16, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v32i32, &Hexagon::HvxVRRegClass);
- addRegisterClass(MVT::v256i8, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v128i16, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v64i32, &Hexagon::HvxWRRegClass);
- addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v1024i1, &Hexagon::HvxQRRegClass);
- }
- }
-
//
// Handling of scalar operations.
//
@@ -1801,13 +1306,16 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
// Hexagon needs to optimize cases with negative constants.
- setOperationAction(ISD::SETCC, MVT::i8, Custom);
- setOperationAction(ISD::SETCC, MVT::i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -1819,35 +1327,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setMinimumJumpTableEntries(std::numeric_limits<int>::max());
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- // Hexagon has instructions for add/sub with carry. The problem with
- // modeling these instructions is that they produce 2 results: Rdd and Px.
- // To model the update of Px, we will have to use Defs[p0..p3] which will
- // cause any predicate live range to spill. So, we pretend we dont't have
- // these instructions.
- setOperationAction(ISD::ADDE, MVT::i8, Expand);
- setOperationAction(ISD::ADDE, MVT::i16, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i8, Expand);
- setOperationAction(ISD::SUBE, MVT::i16, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
- setOperationAction(ISD::ADDC, MVT::i8, Expand);
- setOperationAction(ISD::ADDC, MVT::i16, Expand);
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i8, Expand);
- setOperationAction(ISD::SUBC, MVT::i16, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
- // Only add and sub that detect overflow are the saturating ones.
+ setOperationAction(ISD::ABS, MVT::i32, Legal);
+ setOperationAction(ISD::ABS, MVT::i64, Legal);
+
+ // Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit,
+ // but they only operate on i64.
for (MVT VT : MVT::integer_valuetypes()) {
- setOperationAction(ISD::UADDO, VT, Expand);
- setOperationAction(ISD::SADDO, VT, Expand);
- setOperationAction(ISD::USUBO, VT, Expand);
- setOperationAction(ISD::SSUBO, VT, Expand);
+ setOperationAction(ISD::UADDO, VT, Expand);
+ setOperationAction(ISD::USUBO, VT, Expand);
+ setOperationAction(ISD::SADDO, VT, Expand);
+ setOperationAction(ISD::SSUBO, VT, Expand);
+ setOperationAction(ISD::ADDCARRY, VT, Expand);
+ setOperationAction(ISD::SUBCARRY, VT, Expand);
}
+ setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
+ setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i8, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
@@ -1865,22 +1359,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
setOperationAction(ISD::BSWAP, MVT::i64, Legal);
- setOperationAction(ISD::MUL, MVT::i64, Legal);
for (unsigned IntExpOp :
- { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
- ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
- ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
- ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
- setOperationAction(IntExpOp, MVT::i32, Expand);
- setOperationAction(IntExpOp, MVT::i64, Expand);
+ {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
+ ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
+ ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI}) {
+ for (MVT VT : MVT::integer_valuetypes())
+ setOperationAction(IntExpOp, VT, Expand);
}
for (unsigned FPExpOp :
{ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FSINCOS,
ISD::FPOW, ISD::FCOPYSIGN}) {
- setOperationAction(FPExpOp, MVT::f32, Expand);
- setOperationAction(FPExpOp, MVT::f64, Expand);
+ for (MVT VT : MVT::fp_valuetypes())
+ setOperationAction(FPExpOp, VT, Expand);
}
// No extending loads from i32.
@@ -1920,10 +1413,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// either "custom" or "legal" for specific cases.
static const unsigned VectExpOps[] = {
// Integer arithmetic:
- ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC,
- ISD::SUBC, ISD::SADDO, ISD::UADDO, ISD::SSUBO, ISD::USUBO,
- ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::SADDO,
+ ISD::UADDO, ISD::SSUBO, ISD::USUBO, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
// Logical/bit:
ISD::AND, ISD::OR, ISD::XOR, ISD::ROTL, ISD::ROTR,
ISD::CTPOP, ISD::CTLZ, ISD::CTTZ,
@@ -1970,16 +1462,16 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Extending loads from (native) vectors of i8 into (native) vectors of i16
// are legal.
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, MVT::v2i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
// Types natively supported:
- for (MVT NativeVT : {MVT::v32i1, MVT::v64i1, MVT::v4i8, MVT::v8i8, MVT::v2i16,
- MVT::v4i16, MVT::v1i32, MVT::v2i32, MVT::v1i64}) {
+ for (MVT NativeVT : {MVT::v8i1, MVT::v4i1, MVT::v2i1, MVT::v4i8,
+ MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32}) {
setOperationAction(ISD::BUILD_VECTOR, NativeVT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, NativeVT, Custom);
@@ -1995,19 +1487,34 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::XOR, NativeVT, Legal);
}
+ // Custom lower unaligned loads.
+ for (MVT VecVT : {MVT::i32, MVT::v4i8, MVT::i64, MVT::v8i8,
+ MVT::v2i16, MVT::v4i16, MVT::v2i32}) {
+ setOperationAction(ISD::LOAD, VecVT, Custom);
+ }
+
+  for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v2i32, MVT::v4i16}) {
+ setCondCodeAction(ISD::SETLT, VT, Expand);
+ setCondCodeAction(ISD::SETLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ }
+
+ // Custom-lower bitcasts from i8 to v8i1.
+ setOperationAction(ISD::BITCAST, MVT::i8, Custom);
setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
setOperationAction(ISD::VSELECT, MVT::v2i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
- auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) {
- setOperationAction(Opc, FromTy, Promote);
- AddPromotedToType(Opc, FromTy, ToTy);
- };
-
// Subtarget-specific operation actions.
//
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV60Ops()) {
+ setOperationAction(ISD::ROTL, MVT::i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::i64, Custom);
+ }
+ if (Subtarget.hasV5Ops()) {
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FADD, MVT::f64, Expand);
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -2061,71 +1568,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Handling of indexed loads/stores: default is "expand".
//
- for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f32, MVT::f64,
+ MVT::v2i16, MVT::v2i32, MVT::v4i8, MVT::v4i16, MVT::v8i8}) {
setIndexedLoadAction(ISD::POST_INC, VT, Legal);
setIndexedStoreAction(ISD::POST_INC, VT, Legal);
}
- if (Subtarget.useHVXOps()) {
- bool Use64b = Subtarget.useHVX64BOps();
- ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
- ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
- MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8;
- MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
-
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
- setOperationAction(ISD::AND, ByteV, Legal);
- setOperationAction(ISD::OR, ByteV, Legal);
- setOperationAction(ISD::XOR, ByteV, Legal);
-
- for (MVT T : LegalV) {
- setIndexedLoadAction(ISD::POST_INC, T, Legal);
- setIndexedStoreAction(ISD::POST_INC, T, Legal);
-
- setOperationAction(ISD::ADD, T, Legal);
- setOperationAction(ISD::SUB, T, Legal);
- if (T != ByteV) {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
- }
-
- setOperationAction(ISD::MUL, T, Custom);
- setOperationAction(ISD::SETCC, T, Custom);
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
- if (T != ByteV)
- setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
- }
-
- for (MVT T : LegalV) {
- if (T == ByteV)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
- setPromoteTo(ISD::AND, T, ByteV);
- setPromoteTo(ISD::OR, T, ByteV);
- setPromoteTo(ISD::XOR, T, ByteV);
- }
-
- for (MVT T : LegalW) {
- // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
- // independent) handling of it would convert it to a load, which is
- // not always the optimal choice.
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
-
- if (T == ByteW)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
- }
- }
+ if (Subtarget.useHVXOps())
+ initializeHVXLowering();
computeRegisterProperties(&HRI);
@@ -2195,7 +1645,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
}
- if (Subtarget.hasV5TOps()) {
+ if (Subtarget.hasV5Ops()) {
if (FastMath)
setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
else
@@ -2242,6 +1692,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((HexagonISD::NodeType)Opcode) {
+ case HexagonISD::ADDC: return "HexagonISD::ADDC";
+ case HexagonISD::SUBC: return "HexagonISD::SUBC";
case HexagonISD::ALLOCA: return "HexagonISD::ALLOCA";
case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT";
case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL";
@@ -2255,16 +1707,12 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::CP: return "HexagonISD::CP";
case HexagonISD::DCFETCH: return "HexagonISD::DCFETCH";
case HexagonISD::EH_RETURN: return "HexagonISD::EH_RETURN";
+ case HexagonISD::TSTBIT: return "HexagonISD::TSTBIT";
case HexagonISD::EXTRACTU: return "HexagonISD::EXTRACTU";
- case HexagonISD::EXTRACTURP: return "HexagonISD::EXTRACTURP";
case HexagonISD::INSERT: return "HexagonISD::INSERT";
- case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP";
case HexagonISD::JT: return "HexagonISD::JT";
case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN";
- case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE";
- case HexagonISD::VPACKE: return "HexagonISD::VPACKE";
- case HexagonISD::VPACKO: return "HexagonISD::VPACKO";
case HexagonISD::VASL: return "HexagonISD::VASL";
case HexagonISD::VASR: return "HexagonISD::VASR";
case HexagonISD::VLSR: return "HexagonISD::VLSR";
@@ -2274,11 +1722,97 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VROR: return "HexagonISD::VROR";
case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
case HexagonISD::VZERO: return "HexagonISD::VZERO";
+ case HexagonISD::VSPLATW: return "HexagonISD::VSPLATW";
+ case HexagonISD::D2P: return "HexagonISD::D2P";
+ case HexagonISD::P2D: return "HexagonISD::P2D";
+ case HexagonISD::V2Q: return "HexagonISD::V2Q";
+ case HexagonISD::Q2V: return "HexagonISD::Q2V";
+ case HexagonISD::QCAT: return "HexagonISD::QCAT";
+ case HexagonISD::QTRUE: return "HexagonISD::QTRUE";
+ case HexagonISD::QFALSE: return "HexagonISD::QFALSE";
+ case HexagonISD::TYPECAST: return "HexagonISD::TYPECAST";
+ case HexagonISD::VALIGN: return "HexagonISD::VALIGN";
+ case HexagonISD::VALIGNADDR: return "HexagonISD::VALIGNADDR";
case HexagonISD::OP_END: break;
}
return nullptr;
}
+// Bit-reverse Load Intrinsic: Check if the instruction is a bit reverse load
+// intrinsic.
+static bool isBrevLdIntrinsic(const Value *Inst) {
+ unsigned ID = cast<IntrinsicInst>(Inst)->getIntrinsicID();
+ return (ID == Intrinsic::hexagon_L2_loadrd_pbr ||
+ ID == Intrinsic::hexagon_L2_loadri_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrh_pbr ||
+ ID == Intrinsic::hexagon_L2_loadruh_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrb_pbr ||
+ ID == Intrinsic::hexagon_L2_loadrub_pbr);
+}
+
+// Bit-reverse Load Intrinsic: Crawl up and figure out the object from the
+// previous instruction. So far we only handle bitcast, extractvalue and
+// bit-reverse load intrinsic instructions. Should we handle CGEP?
+static Value *getBrevLdObject(Value *V) {
+ if (Operator::getOpcode(V) == Instruction::ExtractValue ||
+ Operator::getOpcode(V) == Instruction::BitCast)
+ V = cast<Operator>(V)->getOperand(0);
+ else if (isa<IntrinsicInst>(V) && isBrevLdIntrinsic(V))
+ V = cast<Instruction>(V)->getOperand(0);
+ return V;
+}
+
+// Bit-reverse Load Intrinsic: For a PHI Node return either an incoming edge or
+// a back edge. If the back edge comes from the intrinsic itself, the incoming
+// edge is returned.
+static Value *returnEdge(const PHINode *PN, Value *IntrBaseVal) {
+ const BasicBlock *Parent = PN->getParent();
+ int Idx = -1;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) {
+ BasicBlock *Blk = PN->getIncomingBlock(i);
+    // Determine if the back edge originates from the intrinsic.
+ if (Blk == Parent) {
+ Value *BackEdgeVal = PN->getIncomingValue(i);
+ Value *BaseVal;
+      // Loop until getBrevLdObject returns the same Value or we hit IntrBaseVal.
+ do {
+ BaseVal = BackEdgeVal;
+ BackEdgeVal = getBrevLdObject(BackEdgeVal);
+ } while ((BaseVal != BackEdgeVal) && (IntrBaseVal != BackEdgeVal));
+ // If the getBrevLdObject returns IntrBaseVal, we should return the
+ // incoming edge.
+ if (IntrBaseVal == BackEdgeVal)
+ continue;
+ Idx = i;
+ break;
+    } else // Remember this incoming edge.
+ Idx = i;
+ }
+ assert(Idx >= 0 && "Unexpected index to incoming argument in PHI");
+ return PN->getIncomingValue(Idx);
+}
+
+// Bit-reverse Load Intrinsic: Figure out the underlying object the base
+// pointer points to, for the bit-reverse load intrinsic. Setting this to
+// memoperand might help alias analysis to figure out the dependencies.
+static Value *getUnderLyingObjectForBrevLdIntr(Value *V) {
+ Value *IntrBaseVal = V;
+ Value *BaseVal;
+  // Loop until getBrevLdObject returns the same Value; at that point we have
+  // either figured out the object or hit a PHI.
+ do {
+ BaseVal = V;
+ V = getBrevLdObject(V);
+ } while (BaseVal != V);
+
+ // Identify the object from PHINode.
+ if (const PHINode *PN = dyn_cast<PHINode>(V))
+ return returnEdge(PN, IntrBaseVal);
+  // For non-PHI nodes, the object is the last value returned by getBrevLdObject.
+ else
+ return V;
+}
+
/// Given an intrinsic, checks if on the target the intrinsic will need to map
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and store the intrinsic information into the IntrinsicInfo that was
@@ -2288,6 +1822,32 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
+ case Intrinsic::hexagon_L2_loadrd_pbr:
+ case Intrinsic::hexagon_L2_loadri_pbr:
+ case Intrinsic::hexagon_L2_loadrh_pbr:
+ case Intrinsic::hexagon_L2_loadruh_pbr:
+ case Intrinsic::hexagon_L2_loadrb_pbr:
+ case Intrinsic::hexagon_L2_loadrub_pbr: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ auto &Cont = I.getCalledFunction()->getParent()->getContext();
+ // The intrinsic function call is of the form { ElTy, i8* }
+ // @llvm.hexagon.L2.loadXX.pbr(i8*, i32). The pointer and memory access type
+ // should be derived from ElTy.
+ PointerType *PtrTy = I.getCalledFunction()
+ ->getReturnType()
+ ->getContainedType(0)
+ ->getPointerTo();
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ llvm::Value *BasePtrVal = I.getOperand(0);
+ Info.ptrVal = getUnderLyingObjectForBrevLdIntr(BasePtrVal);
+ // The offset value comes through Modifier register. For now, assume the
+ // offset is 0.
+ Info.offset = 0;
+ Info.align = DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont));
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::hexagon_V6_vgathermw:
case Intrinsic::hexagon_V6_vgathermw_128B:
case Intrinsic::hexagon_V6_vgathermh:
@@ -2319,17 +1879,13 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
- EVT MTy1 = EVT::getEVT(Ty1);
- EVT MTy2 = EVT::getEVT(Ty2);
- if (!MTy1.isSimple() || !MTy2.isSimple())
- return false;
- return (MTy1.getSimpleVT() == MVT::i64) && (MTy2.getSimpleVT() == MVT::i32);
+ return isTruncateFree(EVT::getEVT(Ty1), EVT::getEVT(Ty2));
}
bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isSimple() || !VT2.isSimple())
return false;
- return (VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32);
+ return VT1.getSimpleVT() == MVT::i64 && VT2.getSimpleVT() == MVT::i32;
}
bool HexagonTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
@@ -2372,126 +1928,199 @@ HexagonTargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::TypeSplitVector;
}
+std::pair<SDValue, int>
+HexagonTargetLowering::getBaseAndOffset(SDValue Addr) const {
+ if (Addr.getOpcode() == ISD::ADD) {
+ SDValue Op1 = Addr.getOperand(1);
+ if (auto *CN = dyn_cast<const ConstantSDNode>(Op1.getNode()))
+ return { Addr.getOperand(0), CN->getSExtValue() };
+ }
+ return { Addr, 0 };
+}
+
// Lower a vector shuffle (V1, V2, V3). V1 and V2 are the two vectors
// to select data from, V3 is the permutation.
SDValue
HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
const {
- const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ const auto *SVN = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> AM = SVN->getMask();
+ assert(AM.size() <= 8 && "Unexpected shuffle mask");
+ unsigned VecLen = AM.size();
- if (V2.isUndef())
- V2 = V1;
-
- if (SVN->isSplat()) {
- int Lane = SVN->getSplatIndex();
- if (Lane == -1) Lane = 0;
-
- // Test if V1 is a SCALAR_TO_VECTOR.
- if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
-
- // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
- // (and probably will turn into a SCALAR_TO_VECTOR once legalization
- // reaches it).
- if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
- !isa<ConstantSDNode>(V1.getOperand(0))) {
- bool IsScalarToVector = true;
- for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) {
- if (!V1.getOperand(i).isUndef()) {
- IsScalarToVector = false;
- break;
- }
- }
- if (IsScalarToVector)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
+ MVT VecTy = ty(Op);
+ assert(!Subtarget.isHVXVectorType(VecTy, true) &&
+ "HVX shuffles should be legal");
+ assert(VecTy.getSizeInBits() <= 64 && "Unexpected vector length");
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ const SDLoc &dl(Op);
+
+ // If the inputs are not the same as the output, bail. This is not an
+ // error situation, but complicates the handling and the default expansion
+ // (into BUILD_VECTOR) should be adequate.
+ if (ty(Op0) != VecTy || ty(Op1) != VecTy)
+ return SDValue();
+
+ // Normalize the mask so that the first non-negative index comes from
+ // the first operand.
+ SmallVector<int,8> Mask(AM.begin(), AM.end());
+ unsigned F = llvm::find_if(AM, [](int M) { return M >= 0; }) - AM.data();
+ if (F == AM.size())
+ return DAG.getUNDEF(VecTy);
+ if (AM[F] >= int(VecLen)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Op0, Op1);
+ }
+
+ // Express the shuffle mask in terms of bytes.
+ SmallVector<int,8> ByteMask;
+ unsigned ElemBytes = VecTy.getVectorElementType().getSizeInBits() / 8;
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ for (unsigned j = 0; j != ElemBytes; ++j)
+ ByteMask.push_back(-1);
+ } else {
+ for (unsigned j = 0; j != ElemBytes; ++j)
+ ByteMask.push_back(M*ElemBytes + j);
}
- return DAG.getNode(HexagonISD::VSPLAT, dl, VT,
- DAG.getConstant(Lane, dl, MVT::i32));
}
+ assert(ByteMask.size() <= 8);
+
+ // All non-undef (non-negative) indexes are well within [0..127], so they
+ // fit in a single byte. Build two 64-bit words:
+ // - MaskIdx where each byte is the corresponding index (for non-negative
+ // indexes), and 0xFF for negative indexes, and
+ // - MaskUnd that has 0xFF for each negative index.
+ uint64_t MaskIdx = 0;
+ uint64_t MaskUnd = 0;
+ for (unsigned i = 0, e = ByteMask.size(); i != e; ++i) {
+ unsigned S = 8*i;
+ uint64_t M = ByteMask[i] & 0xFF;
+ if (M == 0xFF)
+ MaskUnd |= M << S;
+ MaskIdx |= M << S;
+ }
+
+ if (ByteMask.size() == 4) {
+ // Identity.
+ if (MaskIdx == (0x03020100 | MaskUnd))
+ return Op0;
+ // Byte swap.
+ if (MaskIdx == (0x00010203 | MaskUnd)) {
+ SDValue T0 = DAG.getBitcast(MVT::i32, Op0);
+ SDValue T1 = DAG.getNode(ISD::BSWAP, dl, MVT::i32, T0);
+ return DAG.getBitcast(VecTy, T1);
+ }
- // FIXME: We need to support more general vector shuffles. See
- // below the comment from the ARM backend that deals in the general
- // case with the vector shuffles. For now, let expand handle these.
- return SDValue();
+ // Byte packs.
+ SDValue Concat10 = DAG.getNode(HexagonISD::COMBINE, dl,
+ typeJoin({ty(Op1), ty(Op0)}), {Op1, Op0});
+ if (MaskIdx == (0x06040200 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunehb, dl, VecTy, {Concat10}, DAG);
+ if (MaskIdx == (0x07050301 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunohb, dl, VecTy, {Concat10}, DAG);
+
+ SDValue Concat01 = DAG.getNode(HexagonISD::COMBINE, dl,
+ typeJoin({ty(Op0), ty(Op1)}), {Op0, Op1});
+ if (MaskIdx == (0x02000604 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunehb, dl, VecTy, {Concat01}, DAG);
+ if (MaskIdx == (0x03010705 | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunohb, dl, VecTy, {Concat01}, DAG);
+ }
+
+ if (ByteMask.size() == 8) {
+ // Identity.
+ if (MaskIdx == (0x0706050403020100ull | MaskUnd))
+ return Op0;
+ // Byte swap.
+ if (MaskIdx == (0x0001020304050607ull | MaskUnd)) {
+ SDValue T0 = DAG.getBitcast(MVT::i64, Op0);
+ SDValue T1 = DAG.getNode(ISD::BSWAP, dl, MVT::i64, T0);
+ return DAG.getBitcast(VecTy, T1);
+ }
- // If the shuffle is not directly supported and it has 4 elements, use
- // the PerfectShuffle-generated table to synthesize it from other shuffles.
-}
+ // Halfword picks.
+ if (MaskIdx == (0x0d0c050409080100ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffeh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f0e07060b0a0302ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffoh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0d0c090805040100ull | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunewh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f0e0b0a07060302ull | MaskUnd))
+ return getInstr(Hexagon::S2_vtrunowh, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0706030205040100ull | MaskUnd)) {
+ VectorPair P = opSplit(Op0, dl, DAG);
+ return getInstr(Hexagon::S2_packhl, dl, VecTy, {P.second, P.first}, DAG);
+ }
-// If BUILD_VECTOR has same base element repeated several times,
-// report true.
-static bool isCommonSplatElement(BuildVectorSDNode *BVN) {
- unsigned NElts = BVN->getNumOperands();
- SDValue V0 = BVN->getOperand(0);
+ // Byte packs.
+ if (MaskIdx == (0x0e060c040a020800ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffeb, dl, VecTy, {Op1, Op0}, DAG);
+ if (MaskIdx == (0x0f070d050b030901ull | MaskUnd))
+ return getInstr(Hexagon::S2_shuffob, dl, VecTy, {Op1, Op0}, DAG);
+ }
- for (unsigned i = 1, e = NElts; i != e; ++i) {
- if (BVN->getOperand(i) != V0)
- return false;
+ return SDValue();
+}
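
To make the MaskIdx/MaskUnd encoding above concrete, here is a small self-contained sketch (not part of the patch) that packs a byte-level shuffle mask the same way and prints the two patterns matched by the 4-byte "Identity" and "Byte swap" checks.

#include <cstdint>
#include <cstdio>

static void encodeByteMask(const int *ByteMask, unsigned N,
                           uint64_t &MaskIdx, uint64_t &MaskUnd) {
  MaskIdx = MaskUnd = 0;
  for (unsigned i = 0; i != N; ++i) {
    uint64_t M = uint64_t(ByteMask[i]) & 0xFF;   // -1 (undef) becomes 0xFF
    if (M == 0xFF)
      MaskUnd |= M << (8 * i);
    MaskIdx |= M << (8 * i);
  }
}

int main() {
  int Identity[4] = {0, 1, 2, 3};
  int Swap[4] = {3, 2, 1, 0};
  uint64_t Idx, Und;
  encodeByteMask(Identity, 4, Idx, Und);
  std::printf("identity: 0x%08llx\n", (unsigned long long)Idx); // 0x03020100
  encodeByteMask(Swap, 4, Idx, Und);
  std::printf("byteswap: 0x%08llx\n", (unsigned long long)Idx); // 0x00010203
  return 0;
}
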
+
+// Create a Hexagon-specific node for shifting a vector by an integer.
+SDValue
+HexagonTargetLowering::getVectorShiftByInt(SDValue Op, SelectionDAG &DAG)
+ const {
+ if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) {
+ if (SDValue S = BVN->getSplatValue()) {
+ unsigned NewOpc;
+ switch (Op.getOpcode()) {
+ case ISD::SHL:
+ NewOpc = HexagonISD::VASL;
+ break;
+ case ISD::SRA:
+ NewOpc = HexagonISD::VASR;
+ break;
+ case ISD::SRL:
+ NewOpc = HexagonISD::VLSR;
+ break;
+ default:
+ llvm_unreachable("Unexpected shift opcode");
+ }
+ return DAG.getNode(NewOpc, SDLoc(Op), ty(Op), Op.getOperand(0), S);
+ }
}
- return true;
+
+ return SDValue();
}
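
getVectorShiftByInt only matches a splat shift amount because VASL/VASR/VLSR shift every lane by one common scalar. A tiny sketch of the per-lane semantics it assumes, using 16-bit lanes (illustrative only):

#include <cstdint>
static void shiftLeftAllLanesV4I16(int16_t Lanes[4], unsigned Amt) {
  for (int i = 0; i != 4; ++i)
    Lanes[i] = int16_t(uint16_t(Lanes[i]) << Amt);   // same Amt for every lane
}
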
-// Lower a vector shift. Try to convert
-// <VT> = SHL/SRA/SRL <VT> by <VT> to Hexagon specific
-// <VT> = SHL/SRA/SRL <VT> by <IT/i32>.
SDValue
HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
- BuildVectorSDNode *BVN = nullptr;
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- SDValue V3;
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ return getVectorShiftByInt(Op, DAG);
+}
- if ((BVN = dyn_cast<BuildVectorSDNode>(V1.getNode())) &&
- isCommonSplatElement(BVN))
- V3 = V2;
- else if ((BVN = dyn_cast<BuildVectorSDNode>(V2.getNode())) &&
- isCommonSplatElement(BVN))
- V3 = V1;
- else
- return SDValue();
+SDValue
+HexagonTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+ if (isa<ConstantSDNode>(Op.getOperand(1).getNode()))
+ return Op;
+ return SDValue();
+}
- SDValue CommonSplat = BVN->getOperand(0);
- SDValue Result;
+SDValue
+HexagonTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT InpTy = ty(InpV);
+ assert(ResTy.getSizeInBits() == InpTy.getSizeInBits());
+ const SDLoc &dl(Op);
- if (VT.getSimpleVT() == MVT::v4i16) {
- switch (Op.getOpcode()) {
- case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
- break;
- case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
- break;
- case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
- break;
- default:
- return SDValue();
- }
- } else if (VT.getSimpleVT() == MVT::v2i32) {
- switch (Op.getOpcode()) {
- case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
- break;
- case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
- break;
- case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
- break;
- default:
- return SDValue();
- }
- } else {
- return SDValue();
+ // Handle conversion from i8 to v8i1.
+ if (ResTy == MVT::v8i1) {
+ SDValue Sc = DAG.getBitcast(tyScalar(InpTy), InpV);
+ SDValue Ext = DAG.getZExtOrTrunc(Sc, dl, MVT::i32);
+ return getInstr(Hexagon::C2_tfrrp, dl, ResTy, Ext, DAG);
}
- return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ return SDValue();
}
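
The i8 -> v8i1 path above moves the zero-extended byte into a predicate register with C2_tfrrp, so bit i of the byte becomes lane i of the v8i1 result (the bit-to-lane ordering is an assumption here). Expressed as plain C++:

static void byteToV8I1(unsigned char Byte, bool Lanes[8]) {
  for (int i = 0; i != 8; ++i)
    Lanes[i] = (Byte >> i) & 1;   // lane i taken from bit i
}
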
bool
@@ -2509,9 +2138,10 @@ HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values,
Consts[i] = ConstantInt::get(IntTy, 0);
continue;
}
+ // Make sure to always cast to IntTy.
if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) {
const ConstantInt *CI = CN->getConstantIntValue();
- Consts[i] = const_cast<ConstantInt*>(CI);
+ Consts[i] = ConstantInt::get(IntTy, CI->getValue().getSExtValue());
} else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) {
const ConstantFP *CF = CN->getConstantFPValue();
APInt A = CF->getValueAPF().bitcastToAPInt();
@@ -2550,8 +2180,8 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
Consts[1]->getZExtValue() << 16;
return DAG.getBitcast(MVT::v2i16, DAG.getConstant(V, dl, MVT::i32));
}
- SDValue N = getNode(Hexagon::A2_combine_ll, dl, MVT::i32,
- {Elem[1], Elem[0]}, DAG);
+ SDValue N = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32,
+ {Elem[1], Elem[0]}, DAG);
return DAG.getBitcast(MVT::v2i16, N);
}
@@ -2596,7 +2226,7 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
SDValue B0 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[0], T0});
SDValue B1 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[2], T1});
- SDValue R = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
+ SDValue R = getInstr(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
return DAG.getBitcast(MVT::v4i8, R);
}
@@ -2651,7 +2281,7 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
uint64_t Mask = (ElemTy == MVT::i8) ? 0xFFull
: (ElemTy == MVT::i16) ? 0xFFFFull : 0xFFFFFFFFull;
for (unsigned i = 0; i != Num; ++i)
- Val = (Val << W) | (Consts[i]->getZExtValue() & Mask);
+ Val = (Val << W) | (Consts[Num-1-i]->getZExtValue() & Mask);
SDValue V0 = DAG.getConstant(Val, dl, MVT::i64);
return DAG.getBitcast(VecTy, V0);
}
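
With the reversed index above (Consts[Num-1-i]), element 0 of the BUILD_VECTOR ends up in the least significant lane of the packed 64-bit constant. A worked sketch for v4i16 (not part of the patch):

#include <cstdint>
static uint64_t packV4I16(const uint16_t Elems[4]) {
  uint64_t Val = 0;
  for (unsigned i = 0; i != 4; ++i)
    Val = (Val << 16) | Elems[4 - 1 - i];   // fill from the last element down
  return Val;   // {1, 2, 3, 4} -> 0x0004000300020001
}
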
@@ -2677,8 +2307,56 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
unsigned VecWidth = VecTy.getSizeInBits();
unsigned ValWidth = ValTy.getSizeInBits();
unsigned ElemWidth = VecTy.getVectorElementType().getSizeInBits();
- assert(VecWidth == 32 || VecWidth == 64);
assert((VecWidth % ElemWidth) == 0);
+ auto *IdxN = dyn_cast<ConstantSDNode>(IdxV);
+
+ // Special case for v{8,4,2}i1 (the only boolean vectors legal in Hexagon
+ // without any coprocessors).
+ if (ElemWidth == 1) {
+ assert(VecWidth == VecTy.getVectorNumElements() && "Sanity failure");
+ assert(VecWidth == 8 || VecWidth == 4 || VecWidth == 2);
+ // Check if this is an extract of the lowest bit.
+ if (IdxN) {
+ // Extracting the lowest bit is a no-op, but it changes the type,
+ // so it must be kept as an operation to avoid errors related to
+ // type mismatches.
+ if (IdxN->isNullValue() && ValTy.getSizeInBits() == 1)
+ return DAG.getNode(HexagonISD::TYPECAST, dl, MVT::i1, VecV);
+ }
+
+ // If the value extracted is a single bit, use tstbit.
+ if (ValWidth == 1) {
+ SDValue A0 = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32, {VecV}, DAG);
+ SDValue M0 = DAG.getConstant(8 / VecWidth, dl, MVT::i32);
+ SDValue I0 = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, M0);
+ return DAG.getNode(HexagonISD::TSTBIT, dl, MVT::i1, A0, I0);
+ }
+
+      // Each bool vector (v2i1, v4i1, v8i1) always occupies 8 bits in
+      // a predicate register. The elements of the vector are repeated
+      // in the register (if necessary) so that the total number of bits is 8.
+      // The extracted subvector needs to be expanded in the same way.
+ unsigned Scale = VecWidth / ValWidth;
+
+ // Generate (p2d VecV) >> 8*Idx to move the interesting bytes to
+ // position 0.
+ assert(ty(IdxV) == MVT::i32);
+ SDValue S0 = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(8*Scale, dl, MVT::i32));
+ SDValue T0 = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, VecV);
+ SDValue T1 = DAG.getNode(ISD::SRL, dl, MVT::i64, T0, S0);
+ while (Scale > 1) {
+ // The longest possible subvector is at most 32 bits, so it is always
+ // contained in the low subregister.
+ T1 = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, T1);
+ T1 = expandPredicate(T1, dl, DAG);
+ Scale /= 2;
+ }
+
+ return DAG.getNode(HexagonISD::D2P, dl, ResTy, T1);
+ }
+
+ assert(VecWidth == 32 || VecWidth == 64);
// Cast everything to scalar integer types.
MVT ScalarTy = tyScalar(VecTy);
@@ -2687,8 +2365,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
SDValue WidthV = DAG.getConstant(ValWidth, dl, MVT::i32);
SDValue ExtV;
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(IdxV)) {
- unsigned Off = C->getZExtValue() * ElemWidth;
+ if (IdxN) {
+ unsigned Off = IdxN->getZExtValue() * ElemWidth;
if (VecWidth == 64 && ValWidth == 32) {
assert(Off == 0 || Off == 32);
unsigned SubIdx = Off == 0 ? Hexagon::isub_lo : Hexagon::isub_hi;
@@ -2707,11 +2385,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
IdxV = DAG.getZExtOrTrunc(IdxV, dl, MVT::i32);
SDValue OffV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
DAG.getConstant(ElemWidth, dl, MVT::i32));
- // EXTRACTURP takes width/offset in a 64-bit pair.
- SDValue CombV = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
- {WidthV, OffV});
- ExtV = DAG.getNode(HexagonISD::EXTRACTURP, dl, ScalarTy,
- {VecV, CombV});
+ ExtV = DAG.getNode(HexagonISD::EXTRACTU, dl, ScalarTy,
+ {VecV, WidthV, OffV});
}
// Cast ExtV to the requested result type.
@@ -2725,6 +2400,33 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
const SDLoc &dl, MVT ValTy,
SelectionDAG &DAG) const {
MVT VecTy = ty(VecV);
+ if (VecTy.getVectorElementType() == MVT::i1) {
+ MVT ValTy = ty(ValV);
+ assert(ValTy.getVectorElementType() == MVT::i1);
+ SDValue ValR = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, ValV);
+ unsigned VecLen = VecTy.getVectorNumElements();
+ unsigned Scale = VecLen / ValTy.getVectorNumElements();
+ assert(Scale > 1);
+
+ for (unsigned R = Scale; R > 1; R /= 2) {
+ ValR = contractPredicate(ValR, dl, DAG);
+ ValR = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ DAG.getUNDEF(MVT::i32), ValR);
+ }
+ // The longest possible subvector is at most 32 bits, so it is always
+ // contained in the low subregister.
+ ValR = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, ValR);
+
+ unsigned ValBytes = 64 / Scale;
+ SDValue Width = DAG.getConstant(ValBytes*8, dl, MVT::i32);
+ SDValue Idx = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(8, dl, MVT::i32));
+ SDValue VecR = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, VecV);
+ SDValue Ins = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32,
+ {VecR, ValR, Width, Idx});
+ return DAG.getNode(HexagonISD::D2P, dl, VecTy, Ins);
+ }
+
unsigned VecWidth = VecTy.getSizeInBits();
unsigned ValWidth = ValTy.getSizeInBits();
assert(VecWidth == 32 || VecWidth == 64);
@@ -2752,17 +2454,32 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
if (ty(IdxV) != MVT::i32)
IdxV = DAG.getZExtOrTrunc(IdxV, dl, MVT::i32);
SDValue OffV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, WidthV);
- // INSERTRP takes width/offset in a 64-bit pair.
- SDValue CombV = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
- {WidthV, OffV});
- InsV = DAG.getNode(HexagonISD::INSERTRP, dl, ScalarTy,
- {VecV, ValV, CombV});
+ InsV = DAG.getNode(HexagonISD::INSERT, dl, ScalarTy,
+ {VecV, ValV, WidthV, OffV});
}
return DAG.getNode(ISD::BITCAST, dl, VecTy, InsV);
}
SDValue
+HexagonTargetLowering::expandPredicate(SDValue Vec32, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ assert(ty(Vec32).getSizeInBits() == 32);
+ if (isUndef(Vec32))
+ return DAG.getUNDEF(MVT::i64);
+ return getInstr(Hexagon::S2_vsxtbh, dl, MVT::i64, {Vec32}, DAG);
+}
+
+SDValue
+HexagonTargetLowering::contractPredicate(SDValue Vec64, const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ assert(ty(Vec64).getSizeInBits() == 64);
+ if (isUndef(Vec64))
+ return DAG.getUNDEF(MVT::i32);
+ return getInstr(Hexagon::S2_vtrunehb, dl, MVT::i32, {Vec64}, DAG);
+}
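
A plain-C++ sketch of what the two helpers above compute, assuming the documented behaviour of S2_vsxtbh (sign-extend each byte of a word into a halfword) and S2_vtrunehb (keep the low byte of each halfword). Contracting an expanded word gives the original 32-bit pattern back, which is what insertVector and LowerCONCAT_VECTORS rely on; this is illustrative only.

#include <cstdint>
static uint64_t expandPredicateWord(uint32_t V) {        // S2_vsxtbh
  uint64_t R = 0;
  for (unsigned j = 0; j != 4; ++j) {
    int16_t H = int8_t(V >> (8 * j));                    // sign-extend byte j
    R |= uint64_t(uint16_t(H)) << (16 * j);
  }
  return R;
}
static uint32_t contractPredicateWord(uint64_t V) {      // S2_vtrunehb
  uint32_t R = 0;
  for (unsigned j = 0; j != 4; ++j)
    R |= uint32_t((V >> (16 * j)) & 0xFF) << (8 * j);    // low byte of halfword j
  return R;
}
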
+
+SDValue
HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
const {
if (Ty.isVector()) {
@@ -2784,18 +2501,34 @@ SDValue
HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
unsigned BW = VecTy.getSizeInBits();
-
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy, true))
- return LowerHvxBuildVector(Op, DAG);
-
- if (BW == 32 || BW == 64) {
- const SDLoc &dl(Op);
- SmallVector<SDValue,8> Ops;
- for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
- Ops.push_back(Op.getOperand(i));
- if (BW == 32)
- return buildVector32(Ops, dl, VecTy, DAG);
+ const SDLoc &dl(Op);
+ SmallVector<SDValue,8> Ops;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ if (BW == 32)
+ return buildVector32(Ops, dl, VecTy, DAG);
+ if (BW == 64)
return buildVector64(Ops, dl, VecTy, DAG);
+
+ if (VecTy == MVT::v8i1 || VecTy == MVT::v4i1 || VecTy == MVT::v2i1) {
+ // For each i1 element in the resulting predicate register, put 1
+ // shifted by the index of the element into a general-purpose register,
+ // then or them together and transfer it back into a predicate register.
+ SDValue Rs[8];
+ SDValue Z = getZero(dl, MVT::i32, DAG);
+ // Always produce 8 bits, repeat inputs if necessary.
+ unsigned Rep = 8 / VecTy.getVectorNumElements();
+ for (unsigned i = 0; i != 8; ++i) {
+ SDValue S = DAG.getConstant(1ull << i, dl, MVT::i32);
+ Rs[i] = DAG.getSelect(dl, MVT::i32, Ops[i/Rep], S, Z);
+ }
+ for (ArrayRef<SDValue> A(Rs); A.size() != 1; A = A.drop_back(A.size()/2)) {
+ for (unsigned i = 0, e = A.size()/2; i != e; ++i)
+ Rs[i] = DAG.getNode(ISD::OR, dl, MVT::i32, Rs[2*i], Rs[2*i+1]);
+ }
+ // Move the value directly to a predicate register.
+ return getInstr(Hexagon::C2_tfrrp, dl, VecTy, {Rs[0]}, DAG);
}
return SDValue();
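
What the select/OR tree above produces for the small bool vectors, as a standalone sketch (not part of the patch): each input element is repeated Rep = 8/NumElems times so the predicate always carries 8 bits, and bit i of the byte comes from element i/Rep.

#include <cstdint>
static uint8_t buildPredByte(const bool *Elems, unsigned NumElems) {
  unsigned Rep = 8 / NumElems;          // 1, 2 or 4 for v8i1, v4i1, v2i1
  uint8_t Bits = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (Elems[i / Rep])
      Bits |= uint8_t(1u << i);
  return Bits;                          // v4i1 {1,0,1,1} -> 0b11110011
}
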
@@ -2805,14 +2538,64 @@ SDValue
HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
- assert(!Subtarget.useHVXOps() || !Subtarget.isHVXVectorType(VecTy));
-
+ const SDLoc &dl(Op);
if (VecTy.getSizeInBits() == 64) {
assert(Op.getNumOperands() == 2);
- return DAG.getNode(HexagonISD::COMBINE, SDLoc(Op), VecTy, Op.getOperand(1),
+ return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, Op.getOperand(1),
Op.getOperand(0));
}
+ MVT ElemTy = VecTy.getVectorElementType();
+ if (ElemTy == MVT::i1) {
+ assert(VecTy == MVT::v2i1 || VecTy == MVT::v4i1 || VecTy == MVT::v8i1);
+ MVT OpTy = ty(Op.getOperand(0));
+ // Scale is how many times the operands need to be contracted to match
+ // the representation in the target register.
+ unsigned Scale = VecTy.getVectorNumElements() / OpTy.getVectorNumElements();
+ assert(Scale == Op.getNumOperands() && Scale > 1);
+
+ // First, convert all bool vectors to integers, then generate pairwise
+ // inserts to form values of doubled length. Up until there are only
+ // two values left to concatenate, all of these values will fit in a
+ // 32-bit integer, so keep them as i32 to use 32-bit inserts.
+ SmallVector<SDValue,4> Words[2];
+ unsigned IdxW = 0;
+
+ for (SDValue P : Op.getNode()->op_values()) {
+ SDValue W = DAG.getNode(HexagonISD::P2D, dl, MVT::i64, P);
+ for (unsigned R = Scale; R > 1; R /= 2) {
+ W = contractPredicate(W, dl, DAG);
+ W = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ DAG.getUNDEF(MVT::i32), W);
+ }
+ W = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, W);
+ Words[IdxW].push_back(W);
+ }
+
+ while (Scale > 2) {
+ SDValue WidthV = DAG.getConstant(64 / Scale, dl, MVT::i32);
+ Words[IdxW ^ 1].clear();
+
+ for (unsigned i = 0, e = Words[IdxW].size(); i != e; i += 2) {
+ SDValue W0 = Words[IdxW][i], W1 = Words[IdxW][i+1];
+ // Insert W1 into W0 right next to the significant bits of W0.
+ SDValue T = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32,
+ {W0, W1, WidthV, WidthV});
+ Words[IdxW ^ 1].push_back(T);
+ }
+ IdxW ^= 1;
+ Scale /= 2;
+ }
+
+ // Another sanity check. At this point there should only be two words
+ // left, and Scale should be 2.
+ assert(Scale == 2 && Words[IdxW].size() == 2);
+
+ SDValue WW = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64,
+ Words[IdxW][1], Words[IdxW][0]);
+ return DAG.getNode(HexagonISD::D2P, dl, VecTy, WW);
+ }
+
return SDValue();
}
@@ -2820,10 +2603,6 @@ SDValue
HexagonTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
- MVT VecTy = ty(Vec);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxExtractElement(Op, DAG);
-
MVT ElemTy = ty(Vec).getVectorElementType();
return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ElemTy, ty(Op), DAG);
}
@@ -2831,31 +2610,20 @@ HexagonTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue
HexagonTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- SDValue Vec = Op.getOperand(0);
- MVT VecTy = ty(Vec);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxExtractSubvector(Op, DAG);
-
- return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ty(Op), ty(Op), DAG);
+ return extractVector(Op.getOperand(0), Op.getOperand(1), SDLoc(Op),
+ ty(Op), ty(Op), DAG);
}
SDValue
HexagonTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
- MVT VecTy = ty(Op);
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxInsertElement(Op, DAG);
-
return insertVector(Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
- SDLoc(Op), VecTy.getVectorElementType(), DAG);
+ SDLoc(Op), ty(Op).getVectorElementType(), DAG);
}
SDValue
HexagonTargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(ty(Op)))
- return LowerHvxInsertSubvector(Op, DAG);
-
SDValue ValV = Op.getOperand(1);
return insertVector(Op.getOperand(0), ValV, Op.getOperand(2),
SDLoc(Op), ty(ValV), DAG);
@@ -2875,6 +2643,109 @@ HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
}
SDValue
+HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
+ const {
+ LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+ unsigned HaveAlign = LN->getAlignment();
+ MVT LoadTy = ty(Op);
+ unsigned NeedAlign = Subtarget.getTypeAlignment(LoadTy);
+ if (HaveAlign >= NeedAlign)
+ return Op;
+
+ const SDLoc &dl(Op);
+ const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned AS = LN->getAddressSpace();
+
+ // If the load aligning is disabled or the load can be broken up into two
+ // smaller legal loads, do the default (target-independent) expansion.
+ bool DoDefault = false;
+ // Handle it in the default way if this is an indexed load.
+ if (!LN->isUnindexed())
+ DoDefault = true;
+
+ if (!AlignLoads) {
+ if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), AS, HaveAlign))
+ return Op;
+ DoDefault = true;
+ }
+ if (!DoDefault && 2*HaveAlign == NeedAlign) {
+ // The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)".
+ MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8*HaveAlign)
+ : MVT::getVectorVT(MVT::i8, HaveAlign);
+ DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, AS, HaveAlign);
+ }
+ if (DoDefault) {
+ std::pair<SDValue, SDValue> P = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues({P.first, P.second}, dl);
+ }
+
+ // The code below generates two loads, both aligned to NeedAlign, with a
+ // distance of NeedAlign between them. For that to cover all the bits that
+ // need to be loaded (without overlapping), the size of each load must be
+ // equal to NeedAlign. This holds for all loadable types, but add an
+ // assertion in case something changes in the future.
+ assert(LoadTy.getSizeInBits() == 8*NeedAlign);
+
+ unsigned LoadLen = NeedAlign;
+ SDValue Base = LN->getBasePtr();
+ SDValue Chain = LN->getChain();
+ auto BO = getBaseAndOffset(Base);
+ unsigned BaseOpc = BO.first.getOpcode();
+ if (BaseOpc == HexagonISD::VALIGNADDR && BO.second % LoadLen == 0)
+ return Op;
+
+ if (BO.second % LoadLen != 0) {
+ BO.first = DAG.getNode(ISD::ADD, dl, MVT::i32, BO.first,
+ DAG.getConstant(BO.second % LoadLen, dl, MVT::i32));
+ BO.second -= BO.second % LoadLen;
+ }
+ SDValue BaseNoOff = (BaseOpc != HexagonISD::VALIGNADDR)
+ ? DAG.getNode(HexagonISD::VALIGNADDR, dl, MVT::i32, BO.first,
+ DAG.getConstant(NeedAlign, dl, MVT::i32))
+ : BO.first;
+ SDValue Base0 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second, dl);
+ SDValue Base1 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second+LoadLen, dl);
+
+ MachineMemOperand *WideMMO = nullptr;
+ if (MachineMemOperand *MMO = LN->getMemOperand()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ WideMMO = MF.getMachineMemOperand(MMO->getPointerInfo(), MMO->getFlags(),
+ 2*LoadLen, LoadLen, MMO->getAAInfo(), MMO->getRanges(),
+ MMO->getSyncScopeID(), MMO->getOrdering(),
+ MMO->getFailureOrdering());
+ }
+
+ SDValue Load0 = DAG.getLoad(LoadTy, dl, Chain, Base0, WideMMO);
+ SDValue Load1 = DAG.getLoad(LoadTy, dl, Chain, Base1, WideMMO);
+
+ SDValue Aligned = DAG.getNode(HexagonISD::VALIGN, dl, LoadTy,
+ {Load1, Load0, BaseNoOff.getOperand(0)});
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load0.getValue(1), Load1.getValue(1));
+ SDValue M = DAG.getMergeValues({Aligned, NewChain}, dl);
+ return M;
+}
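
For illustration only (not part of the patch): the two-aligned-loads idea behind the VALIGN expansion above, restated as a standalone, little-endian scalar sketch. The helper name unalignedLoad64 and everything inside it are invented for this example.

    #include <cstdint>
    #include <cstring>

    // Illustrative sketch only -- not part of the patch. Read the aligned
    // chunk containing the start of the data, read the next aligned chunk
    // only if the access straddles a boundary, and stitch the result
    // together from the two halves.
    static uint64_t unalignedLoad64(const uint8_t *P) {
      uintptr_t Addr = reinterpret_cast<uintptr_t>(P);
      uintptr_t Aligned = Addr & ~uintptr_t(7);  // align down to 8 bytes
      unsigned Shift = unsigned(Addr - Aligned); // 0..7 bytes into the chunk
      uint64_t Lo;
      std::memcpy(&Lo, reinterpret_cast<const void *>(Aligned), 8);
      if (Shift == 0)
        return Lo;                               // already aligned: one load
      uint64_t Hi;
      std::memcpy(&Hi, reinterpret_cast<const void *>(Aligned + 8), 8);
      // Little-endian "valign": top bytes of Lo, bottom bytes of Hi.
      return (Lo >> (8 * Shift)) | (Hi << (8 * (8 - Shift)));
    }

The hardware version issues both aligned loads unconditionally and lets the valign operation select the bytes; the sketch guards the second load only to stay well-defined in plain C++.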
+
+SDValue
+HexagonTargetLowering::LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ unsigned Opc = Op.getOpcode();
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1), C = Op.getOperand(2);
+
+ if (Opc == ISD::ADDCARRY)
+ return DAG.getNode(HexagonISD::ADDC, dl, Op.getNode()->getVTList(),
+ { X, Y, C });
+
+ EVT CarryTy = C.getValueType();
+ SDValue SubC = DAG.getNode(HexagonISD::SUBC, dl, Op.getNode()->getVTList(),
+ { X, Y, DAG.getLogicalNOT(dl, C, CarryTy) });
+ SDValue Out[] = { SubC.getValue(0),
+ DAG.getLogicalNOT(dl, SubC.getValue(1), CarryTy) };
+ return DAG.getMergeValues(Out, dl);
+}
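
For illustration only (not part of the patch): a hypothetical scalar model of the borrow/carry mapping used above, based on the identity x - y - b == x + ~y + (1 - b). This is why the incoming borrow is fed to SUBC negated, and why the outgoing carry is negated on the way back out.

    #include <cstdint>

    // Illustrative sketch only -- not part of the patch.
    static void subBorrow32(uint32_t X, uint32_t Y, bool BorrowIn,
                            uint32_t &Diff, bool &BorrowOut) {
      // x - y - b computed as x + ~y + (1 - b).
      uint64_t Sum = uint64_t(X) + uint32_t(~Y) + uint64_t(!BorrowIn);
      Diff = uint32_t(Sum);
      bool CarryOut = (Sum >> 32) != 0;
      BorrowOut = !CarryOut; // borrow = NOT carry, as in the lowering above
    }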
+
+SDValue
HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
@@ -2904,6 +2775,17 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
+
+ // Handle INLINEASM first.
+ if (Opc == ISD::INLINEASM)
+ return LowerINLINEASM(Op, DAG);
+
+ if (isHvxOperation(Op)) {
+ // If HVX lowering returns nothing, try the default lowering.
+ if (SDValue V = LowerHvxOperation(Op, DAG))
+ return V;
+ }
+
switch (Opc) {
default:
#ifndef NDEBUG
@@ -2919,13 +2801,17 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+ case ISD::LOAD: return LowerUnalignedLoad(Op, DAG);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: return LowerAddSubCarry(Op, DAG);
case ISD::SRA:
case ISD::SHL:
case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG);
+ case ISD::ROTL: return LowerROTL(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
- // Frame & Return address. Currently unimplemented.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -2939,17 +2825,35 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
- case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
- case ISD::MUL:
- if (Subtarget.useHVXOps())
- return LowerHvxMul(Op, DAG);
break;
}
+
return SDValue();
}
+void
+HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ const SDLoc &dl(N);
+ switch (N->getOpcode()) {
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::SHL:
+ return;
+ case ISD::BITCAST:
+ // Handle a bitcast from v8i1 to i8.
+ if (N->getValueType(0) == MVT::i8) {
+ SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32,
+ N->getOperand(0), DAG);
+ Results.push_back(P);
+ }
+ break;
+ }
+}
+
/// Returns relocation base for the given PIC jumptable.
SDValue
HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
@@ -3023,7 +2927,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
case 512:
return {0u, &Hexagon::HvxVRRegClass};
case 1024:
- if (Subtarget.hasV60TOps() && Subtarget.useHVX128BOps())
+ if (Subtarget.hasV60Ops() && Subtarget.useHVX128BOps())
return {0u, &Hexagon::HvxVRRegClass};
return {0u, &Hexagon::HvxWRRegClass};
case 2048:
@@ -3042,7 +2946,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- return Subtarget.hasV5TOps();
+ return Subtarget.hasV5Ops();
}
/// isLegalAddressingMode - Return true if the addressing mode represented by
@@ -3104,9 +3008,9 @@ bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee,
CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
+ bool IsVarArg,
+ bool IsCalleeStructRet,
+ bool IsCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -3137,12 +3041,12 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
}
// Do not tail call optimize vararg calls.
- if (isVarArg)
+ if (IsVarArg)
return false;
// Also avoid tail call optimization if either caller or callee uses struct
// return semantics.
- if (isCalleeStructRet || isCallerStructRet)
+ if (IsCalleeStructRet || IsCallerStructRet)
return false;
// In addition to the cases above, we also disable Tail Call Optimization if
@@ -3185,54 +3089,25 @@ bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AS, unsigned Align, bool *Fast) const {
if (Fast)
*Fast = false;
-
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::v64i8:
- case MVT::v128i8:
- case MVT::v256i8:
- case MVT::v32i16:
- case MVT::v64i16:
- case MVT::v128i16:
- case MVT::v16i32:
- case MVT::v32i32:
- case MVT::v64i32:
- return true;
- }
- return false;
+ return Subtarget.isHVXVectorType(VT.getSimpleVT());
}
std::pair<const TargetRegisterClass*, uint8_t>
HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
- const TargetRegisterClass *RRC = nullptr;
+ if (Subtarget.isHVXVectorType(VT, true)) {
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned VecWidth = Subtarget.getVectorLength() * 8;
- uint8_t Cost = 1;
- switch (VT.SimpleTy) {
- default:
- return TargetLowering::findRepresentativeClass(TRI, VT);
- case MVT::v64i8:
- case MVT::v32i16:
- case MVT::v16i32:
- RRC = &Hexagon::HvxVRRegClass;
- break;
- case MVT::v128i8:
- case MVT::v64i16:
- case MVT::v32i32:
- if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() &&
- Subtarget.useHVX128BOps())
- RRC = &Hexagon::HvxVRRegClass;
- else
- RRC = &Hexagon::HvxWRRegClass;
- break;
- case MVT::v256i8:
- case MVT::v128i16:
- case MVT::v64i32:
- RRC = &Hexagon::HvxWRRegClass;
- break;
+ if (VT.getVectorElementType() == MVT::i1)
+ return std::make_pair(&Hexagon::HvxQRRegClass, 1);
+ if (BitWidth == VecWidth)
+ return std::make_pair(&Hexagon::HvxVRRegClass, 1);
+ assert(BitWidth == 2 * VecWidth);
+ return std::make_pair(&Hexagon::HvxWRRegClass, 1);
}
- return std::make_pair(RRC, Cost);
+
+ return TargetLowering::findRepresentativeClass(TRI, VT);
}
Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 732834b464b4..3d94bd1ff6ed 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -18,12 +18,12 @@
#include "Hexagon.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/MachineValueType.h"
#include <cstdint>
#include <utility>
@@ -36,6 +36,8 @@ namespace HexagonISD {
CONST32 = OP_BEGIN,
CONST32_GP, // For marking data present in GP.
+ ADDC, // Add with carry: (X, Y, Cin) -> (X+Y, Cout).
+ SUBC, // Sub with carry: (X, Y, Cin) -> (X+~Y+Cin, Cout).
ALLOCA,
AT_GOT, // Index in GOT.
@@ -51,18 +53,15 @@ namespace HexagonISD {
CP, // Constant pool.
COMBINE,
- VSPLAT,
+ VSPLAT, // Generic splat, selection depends on argument/return
+ // types.
VASL,
VASR,
VLSR,
+ TSTBIT,
INSERT,
- INSERTRP,
EXTRACTU,
- EXTRACTURP,
- VCOMBINE,
- VPACKE,
- VPACKO,
VEXTRACTW,
VINSERTW0,
VROR,
@@ -70,8 +69,24 @@ namespace HexagonISD {
EH_RETURN,
DCFETCH,
READCYCLE,
+ D2P, // Convert 8-byte value to 8-bit predicate register. [*]
+ P2D, // Convert 8-bit predicate register to 8-byte value. [*]
+ V2Q, // Convert HVX vector to a vector predicate reg. [*]
+ Q2V, // Convert vector predicate to an HVX vector. [*]
+ // [*] The equivalence is defined as "Q <=> (V != 0)",
+ // where the != operation compares bytes.
+ // Note: V != 0 is implemented as V >u 0.
+ QCAT,
+ QTRUE,
+ QFALSE,
VZERO,
-
+ VSPLATW, // HVX splat of a 32-bit word with an arbitrary result type.
+ TYPECAST, // No-op that's used to convert between different legal
+ // types in a register.
+ VALIGN, // Align two vectors (in Op0, Op1) to one that would have
+ // been loaded from address in Op2.
+ VALIGNADDR, // Align vector address: Op0 & -Op1, except when it is
+ // an address in a vector load, then it's a no-op.
OP_END
};
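
For illustration only (not part of the patch): the [*] note above states the "Q <=> (V != 0)" equivalence in words; a hypothetical per-byte model of what V2Q computes (modelV2Q is an invented name) would look like this.

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Illustrative model only -- not part of the patch. Each byte of the HVX
    // vector corresponds to one predicate bit, set iff that byte is non-zero
    // (evaluated in hardware as an unsigned "byte > 0" compare).
    template <std::size_t N>
    std::array<bool, N> modelV2Q(const std::array<uint8_t, N> &V) {
      std::array<bool, N> Q{};
      for (std::size_t I = 0; I != N; ++I)
        Q[I] = V[I] != 0; // same result as V[I] >u 0
      return Q;
    }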
@@ -110,6 +125,10 @@ namespace HexagonISD {
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
+ bool isCheapToSpeculateCttz() const override { return true; }
+ bool isCheapToSpeculateCtlz() const override { return true; }
+ bool isCtlzFast() const override { return true; }
+
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
/// Return true if an FMA operation is faster than a pair of mul and add
@@ -127,6 +146,9 @@ namespace HexagonISD {
const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
const char *getTargetNodeName(unsigned Opcode) const override;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -137,6 +159,13 @@ namespace HexagonISD {
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
@@ -284,6 +313,9 @@ namespace HexagonISD {
}
private:
+ void initializeHVXLowering();
+ std::pair<SDValue,int> getBaseAndOffset(SDValue Addr) const;
+
bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy,
SelectionDAG &DAG,
MutableArrayRef<ConstantInt*> Consts) const;
@@ -295,13 +327,19 @@ namespace HexagonISD {
MVT ValTy, MVT ResTy, SelectionDAG &DAG) const;
SDValue insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
const SDLoc &dl, MVT ValTy, SelectionDAG &DAG) const;
+ SDValue expandPredicate(SDValue Vec32, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ SDValue contractPredicate(SDValue Vec64, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ SDValue getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) const;
+
bool isUndef(SDValue Op) const {
if (Op.isMachineOpcode())
return Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
return Op.getOpcode() == ISD::UNDEF;
}
- SDValue getNode(unsigned MachineOpc, const SDLoc &dl, MVT Ty,
- ArrayRef<SDValue> Ops, SelectionDAG &DAG) const {
+ SDValue getInstr(unsigned MachineOpc, const SDLoc &dl, MVT Ty,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG) const {
SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);
return SDValue(N, 0);
}
@@ -328,7 +366,8 @@ namespace HexagonISD {
MVT tyVector(MVT Ty, MVT ElemTy) const {
if (Ty.isVector() && Ty.getVectorElementType() == ElemTy)
return Ty;
- unsigned TyWidth = Ty.getSizeInBits(), ElemWidth = ElemTy.getSizeInBits();
+ unsigned TyWidth = Ty.getSizeInBits();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
assert((TyWidth % ElemWidth) == 0);
return MVT::getVectorVT(ElemTy, TyWidth/ElemWidth);
}
@@ -343,31 +382,66 @@ namespace HexagonISD {
VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
+ bool isHvxSingleTy(MVT Ty) const;
+ bool isHvxPairTy(MVT Ty) const;
SDValue convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
SelectionDAG &DAG) const;
SDValue getIndexInWord32(SDValue Idx, MVT ElemTy, SelectionDAG &DAG) const;
SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1,
ArrayRef<int> Mask, SelectionDAG &DAG) const;
- MVT getVecBoolVT() const;
-
- SDValue buildHvxVectorSingle(ArrayRef<SDValue> Values, const SDLoc &dl,
- MVT VecTy, SelectionDAG &DAG) const;
+ SDValue buildHvxVectorReg(ArrayRef<SDValue> Values, const SDLoc &dl,
+ MVT VecTy, SelectionDAG &DAG) const;
SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl,
MVT VecTy, SelectionDAG &DAG) const;
+ SDValue createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
+ unsigned BitBytes, bool ZeroFill,
+ SelectionDAG &DAG) const;
+ SDValue extractHvxElementReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue extractHvxElementPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue insertHvxElementReg(SDValue VecV, SDValue IdxV, SDValue ValV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue insertHvxElementPred(SDValue VecV, SDValue IdxV, SDValue ValV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue extractHvxSubvectorReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue extractHvxSubvectorPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue insertHvxSubvectorReg(SDValue VecV, SDValue SubV, SDValue IdxV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue insertHvxSubvectorPred(SDValue VecV, SDValue SubV, SDValue IdxV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue extendHvxVectorPred(SDValue VecV, const SDLoc &dl, MVT ResTy,
+ bool ZeroExt, SelectionDAG &DAG) const;
SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const;
std::pair<const TargetRegisterClass*, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
const override;
+
+ bool isHvxOperation(SDValue Op) const;
+ SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 51480d09d734..2566194ca9c6 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -10,9 +10,192 @@
#include "HexagonISelLowering.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 };
+static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
+static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
+static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
+
+
+void
+HexagonTargetLowering::initializeHVXLowering() {
+ if (Subtarget.useHVX64BOps()) {
+ addRegisterClass(MVT::v64i8, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v32i16, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v16i32, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass);
+ // These "short" boolean vector types should be legal because
+ // they will appear as results of vector compares. If they were
+ // not legal, type legalization would try to make them legal
+ // and that would require using operations that do not use or
+ // produce such types. That, in turn, would imply using custom
+ // nodes, which would be unoptimizable by the DAG combiner.
+ // The idea is to rely on target-independent operations as much
+ // as possible.
+ addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v512i1, &Hexagon::HvxQRRegClass);
+ } else if (Subtarget.useHVX128BOps()) {
+ addRegisterClass(MVT::v128i8, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v256i8, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v128i16, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v64i32, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
+ addRegisterClass(MVT::v1024i1, &Hexagon::HvxQRRegClass);
+ }
+
+ // Set up operation actions.
+
+ bool Use64b = Subtarget.useHVX64BOps();
+ ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
+ ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
+ MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8;
+ MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
+
+ auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) {
+ setOperationAction(Opc, FromTy, Promote);
+ AddPromotedToType(Opc, FromTy, ToTy);
+ };
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
+
+ for (MVT T : LegalV) {
+ setIndexedLoadAction(ISD::POST_INC, T, Legal);
+ setIndexedStoreAction(ISD::POST_INC, T, Legal);
+
+ setOperationAction(ISD::AND, T, Legal);
+ setOperationAction(ISD::OR, T, Legal);
+ setOperationAction(ISD::XOR, T, Legal);
+ setOperationAction(ISD::ADD, T, Legal);
+ setOperationAction(ISD::SUB, T, Legal);
+ setOperationAction(ISD::CTPOP, T, Legal);
+ setOperationAction(ISD::CTLZ, T, Legal);
+ if (T != ByteV) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::BSWAP, T, Legal);
+ }
+
+ setOperationAction(ISD::CTTZ, T, Custom);
+ setOperationAction(ISD::LOAD, T, Custom);
+ setOperationAction(ISD::MUL, T, Custom);
+ setOperationAction(ISD::MULHS, T, Custom);
+ setOperationAction(ISD::MULHU, T, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ // Make concat-vectors custom to handle concats of more than 2 vectors.
+ setOperationAction(ISD::CONCAT_VECTORS, T, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+ setOperationAction(ISD::ANY_EXTEND, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, T, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, T, Custom);
+ if (T != ByteV) {
+ setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
+ // HVX only has shifts of words and halfwords.
+ setOperationAction(ISD::SRA, T, Custom);
+ setOperationAction(ISD::SHL, T, Custom);
+ setOperationAction(ISD::SRL, T, Custom);
+
+ // Promote all shuffles to operate on vectors of bytes.
+ setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
+ }
+
+ setCondCodeAction(ISD::SETNE, T, Expand);
+ setCondCodeAction(ISD::SETLE, T, Expand);
+ setCondCodeAction(ISD::SETGE, T, Expand);
+ setCondCodeAction(ISD::SETLT, T, Expand);
+ setCondCodeAction(ISD::SETULE, T, Expand);
+ setCondCodeAction(ISD::SETUGE, T, Expand);
+ setCondCodeAction(ISD::SETULT, T, Expand);
+ }
+
+ for (MVT T : LegalW) {
+ // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
+ // independent) handling of it would convert it to a load, which is
+ // not always the optimal choice.
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ // Make concat-vectors custom to handle concats of more than 2 vectors.
+ setOperationAction(ISD::CONCAT_VECTORS, T, Custom);
+
+ // Custom-lower these operations for pairs. Expand them into a concat
+ // of the corresponding operations on individual vectors.
+ setOperationAction(ISD::ANY_EXTEND, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, T, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, T, Custom);
+ setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
+
+ setOperationAction(ISD::LOAD, T, Custom);
+ setOperationAction(ISD::STORE, T, Custom);
+ setOperationAction(ISD::CTLZ, T, Custom);
+ setOperationAction(ISD::CTTZ, T, Custom);
+ setOperationAction(ISD::CTPOP, T, Custom);
+
+ setOperationAction(ISD::ADD, T, Legal);
+ setOperationAction(ISD::SUB, T, Legal);
+ setOperationAction(ISD::MUL, T, Custom);
+ setOperationAction(ISD::MULHS, T, Custom);
+ setOperationAction(ISD::MULHU, T, Custom);
+ setOperationAction(ISD::AND, T, Custom);
+ setOperationAction(ISD::OR, T, Custom);
+ setOperationAction(ISD::XOR, T, Custom);
+ setOperationAction(ISD::SETCC, T, Custom);
+ setOperationAction(ISD::VSELECT, T, Custom);
+ if (T != ByteW) {
+ setOperationAction(ISD::SRA, T, Custom);
+ setOperationAction(ISD::SHL, T, Custom);
+ setOperationAction(ISD::SRL, T, Custom);
+
+ // Promote all shuffles to operate on vectors of bytes.
+ setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
+ }
+ }
+
+ // Boolean vectors.
+
+ for (MVT T : LegalW) {
+ // Boolean types for vector pairs will overlap with the boolean
+ // types for single vectors, e.g.
+ // v64i8 -> v64i1 (single)
+ // v64i16 -> v64i1 (pair)
+ // Set these actions first, and allow the single actions to overwrite
+ // any duplicates.
+ MVT BoolW = MVT::getVectorVT(MVT::i1, T.getVectorNumElements());
+ setOperationAction(ISD::SETCC, BoolW, Custom);
+ setOperationAction(ISD::AND, BoolW, Custom);
+ setOperationAction(ISD::OR, BoolW, Custom);
+ setOperationAction(ISD::XOR, BoolW, Custom);
+ }
+
+ for (MVT T : LegalV) {
+ MVT BoolV = MVT::getVectorVT(MVT::i1, T.getVectorNumElements());
+ setOperationAction(ISD::BUILD_VECTOR, BoolV, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, BoolV, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, BoolV, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, BoolV, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, BoolV, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, BoolV, Custom);
+ setOperationAction(ISD::AND, BoolV, Legal);
+ setOperationAction(ISD::OR, BoolV, Legal);
+ setOperationAction(ISD::XOR, BoolV, Legal);
+ }
+}
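
For illustration only (not part of the patch): the LegalV*/LegalW* tables above are just one vector register and one register pair carved into i8/i16/i32 lanes. A hypothetical helper (printLegalHvxTypes is an invented name) that prints the same single/pair types for a given HVX register width in bytes:

    #include <cstdio>
    #include <initializer_list>

    // Illustrative helper only -- not part of the patch. For a 64-byte
    // register this prints v64i8/v32i16/v16i32 and v128i8/v64i16/v32i32;
    // for a 128-byte register every lane count doubles.
    static void printLegalHvxTypes(unsigned HwLenBytes) {
      for (unsigned ElemBytes : {1u, 2u, 4u})
        std::printf("single: v%ui%u  pair: v%ui%u\n",
                    HwLenBytes / ElemBytes, 8 * ElemBytes,
                    2 * HwLenBytes / ElemBytes, 8 * ElemBytes);
    }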
+
SDValue
HexagonTargetLowering::getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
const SDLoc &dl, SelectionDAG &DAG) const {
@@ -75,9 +258,23 @@ HexagonTargetLowering::VectorPair
HexagonTargetLowering::opSplit(SDValue Vec, const SDLoc &dl,
SelectionDAG &DAG) const {
TypePair Tys = typeSplit(ty(Vec));
+ if (Vec.getOpcode() == HexagonISD::QCAT)
+ return VectorPair(Vec.getOperand(0), Vec.getOperand(1));
return DAG.SplitVector(Vec, dl, Tys.first, Tys.second);
}
+bool
+HexagonTargetLowering::isHvxSingleTy(MVT Ty) const {
+ return Subtarget.isHVXVectorType(Ty) &&
+ Ty.getSizeInBits() == 8 * Subtarget.getVectorLength();
+}
+
+bool
+HexagonTargetLowering::isHvxPairTy(MVT Ty) const {
+ return Subtarget.isHVXVectorType(Ty) &&
+ Ty.getSizeInBits() == 16 * Subtarget.getVectorLength();
+}
+
SDValue
HexagonTargetLowering::convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
SelectionDAG &DAG) const {
@@ -141,36 +338,16 @@ HexagonTargetLowering::getByteShuffle(const SDLoc &dl, SDValue Op0,
opCastElem(Op1, MVT::i8, DAG), ByteMask);
}
-MVT
-HexagonTargetLowering::getVecBoolVT() const {
- return MVT::getVectorVT(MVT::i1, 8*Subtarget.getVectorLength());
-}
-
SDValue
-HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
- const SDLoc &dl, MVT VecTy,
- SelectionDAG &DAG) const {
+HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
+ const SDLoc &dl, MVT VecTy,
+ SelectionDAG &DAG) const {
unsigned VecLen = Values.size();
MachineFunction &MF = DAG.getMachineFunction();
MVT ElemTy = VecTy.getVectorElementType();
unsigned ElemWidth = ElemTy.getSizeInBits();
unsigned HwLen = Subtarget.getVectorLength();
- SmallVector<ConstantInt*, 128> Consts(VecLen);
- bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts);
- if (AllConst) {
- if (llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
- return getZero(dl, VecTy, DAG);
-
- ArrayRef<Constant*> Tmp((Constant**)Consts.begin(),
- (Constant**)Consts.end());
- Constant *CV = ConstantVector::get(Tmp);
- unsigned Align = HwLen;
- SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
- return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(MF), Align);
- }
-
unsigned ElemSize = ElemWidth / 8;
assert(ElemSize*VecLen == HwLen);
SmallVector<SDValue,32> Words;
@@ -187,12 +364,47 @@ HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
Words.assign(Values.begin(), Values.end());
}
+ unsigned NumWords = Words.size();
+ bool IsSplat = true, IsUndef = true;
+ SDValue SplatV;
+ for (unsigned i = 0; i != NumWords && IsSplat; ++i) {
+ if (isUndef(Words[i]))
+ continue;
+ IsUndef = false;
+ if (!SplatV.getNode())
+ SplatV = Words[i];
+ else if (SplatV != Words[i])
+ IsSplat = false;
+ }
+ if (IsUndef)
+ return DAG.getUNDEF(VecTy);
+ if (IsSplat) {
+ assert(SplatV.getNode());
+ auto *IdxN = dyn_cast<ConstantSDNode>(SplatV.getNode());
+ if (IdxN && IdxN->isNullValue())
+ return getZero(dl, VecTy, DAG);
+ return DAG.getNode(HexagonISD::VSPLATW, dl, VecTy, SplatV);
+ }
+
+ // Delay recognizing constant vectors until here, so that we can generate
+ // a vsplat.
+ SmallVector<ConstantInt*, 128> Consts(VecLen);
+ bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts);
+ if (AllConst) {
+ ArrayRef<Constant*> Tmp((Constant**)Consts.begin(),
+ (Constant**)Consts.end());
+ Constant *CV = ConstantVector::get(Tmp);
+ unsigned Align = HwLen;
+ SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
+ return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(MF), Align);
+ }
+
// Construct two halves in parallel, then or them together.
assert(4*Words.size() == Subtarget.getVectorLength());
- SDValue HalfV0 = getNode(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
- SDValue HalfV1 = getNode(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
+ SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
+ SDValue HalfV1 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
SDValue S = DAG.getConstant(4, dl, MVT::i32);
- unsigned NumWords = Words.size();
for (unsigned i = 0; i != NumWords/2; ++i) {
SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
{HalfV0, Words[i]});
@@ -209,6 +421,95 @@ HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
}
SDValue
+HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
+ unsigned BitBytes, bool ZeroFill, SelectionDAG &DAG) const {
+ MVT PredTy = ty(PredV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+
+ if (Subtarget.isHVXVectorType(PredTy, true)) {
+ // Move the vector predicate SubV to a vector register, and scale it
+ // down to match the representation (bytes per type element) that VecV
+ // uses. The scaling down will pick every 2nd or 4th (every Scale-th
+ // in general) element and put them at the front of the resulting
+ // vector. This subvector will then be inserted into the Q2V of VecV.
+ // To avoid having an operation that generates an illegal type (short
+ // vector), generate a full size vector.
+ //
+ SDValue T = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, PredV);
+ SmallVector<int,128> Mask(HwLen);
+ // Scale = BitBytes(PredV) / Given BitBytes.
+ unsigned Scale = HwLen / (PredTy.getVectorNumElements() * BitBytes);
+ unsigned BlockLen = PredTy.getVectorNumElements() * BitBytes;
+
+ for (unsigned i = 0; i != HwLen; ++i) {
+ unsigned Num = i % Scale;
+ unsigned Off = i / Scale;
+ Mask[BlockLen*Num + Off] = i;
+ }
+ SDValue S = DAG.getVectorShuffle(ByteTy, dl, T, DAG.getUNDEF(ByteTy), Mask);
+ if (!ZeroFill)
+ return S;
+ // Fill the bytes beyond BlockLen with 0s.
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
+ SDValue Q = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
+ {DAG.getConstant(BlockLen, dl, MVT::i32)}, DAG);
+ SDValue M = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, Q);
+ return DAG.getNode(ISD::AND, dl, ByteTy, S, M);
+ }
+
+ // Make sure that this is a valid scalar predicate.
+ assert(PredTy == MVT::v2i1 || PredTy == MVT::v4i1 || PredTy == MVT::v8i1);
+
+ unsigned Bytes = 8 / PredTy.getVectorNumElements();
+ SmallVector<SDValue,4> Words[2];
+ unsigned IdxW = 0;
+
+ auto Lo32 = [&DAG, &dl] (SDValue P) {
+ return DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, P);
+ };
+ auto Hi32 = [&DAG, &dl] (SDValue P) {
+ return DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, P);
+ };
+
+ SDValue W0 = isUndef(PredV)
+ ? DAG.getUNDEF(MVT::i64)
+ : DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV);
+ Words[IdxW].push_back(Hi32(W0));
+ Words[IdxW].push_back(Lo32(W0));
+
+ while (Bytes < BitBytes) {
+ IdxW ^= 1;
+ Words[IdxW].clear();
+
+ if (Bytes < 4) {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ SDValue T = expandPredicate(W, dl, DAG);
+ Words[IdxW].push_back(Hi32(T));
+ Words[IdxW].push_back(Lo32(T));
+ }
+ } else {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ Words[IdxW].push_back(W);
+ Words[IdxW].push_back(W);
+ }
+ }
+ Bytes *= 2;
+ }
+
+ assert(Bytes == BitBytes);
+
+ SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy);
+ SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32);
+ for (const SDValue &W : Words[IdxW]) {
+ Vec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, Vec, S4);
+ Vec = DAG.getNode(HexagonISD::VINSERTW0, dl, ByteTy, Vec, W);
+ }
+
+ return Vec;
+}
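
For illustration only (not part of the patch): the shuffle mask built in the vector-predicate branch of createHvxPrefixPred is the non-obvious step. A small hypothetical reimplementation (prefixPredMask is an invented name) that can be run to see the resulting permutation:

    #include <vector>

    // Illustrative sketch only -- not part of the patch. The mask gathers
    // every Scale-th byte to the front of the vector. With a toy HwLen = 8
    // and Scale = 2 (so BlockLen = 4) it comes out as [0, 2, 4, 6, 1, 3, 5, 7]:
    // the four "interesting" bytes of the scaled-down predicate land in
    // positions 0..3.
    static std::vector<int> prefixPredMask(unsigned HwLen, unsigned Scale) {
      unsigned BlockLen = HwLen / Scale;
      std::vector<int> Mask(HwLen);
      for (unsigned I = 0; I != HwLen; ++I) {
        unsigned Num = I % Scale; // which block the byte belongs to
        unsigned Off = I / Scale; // position within that block
        Mask[BlockLen * Num + Off] = int(I);
      }
      return Mask;
    }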
+
+SDValue
HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
const SDLoc &dl, MVT VecTy,
SelectionDAG &DAG) const {
@@ -218,6 +519,18 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
unsigned HwLen = Subtarget.getVectorLength();
assert(VecLen <= HwLen || VecLen == 8*HwLen);
SmallVector<SDValue,128> Bytes;
+ bool AllT = true, AllF = true;
+
+ auto IsTrue = [] (SDValue V) {
+ if (const auto *N = dyn_cast<ConstantSDNode>(V.getNode()))
+ return !N->isNullValue();
+ return false;
+ };
+ auto IsFalse = [] (SDValue V) {
+ if (const auto *N = dyn_cast<ConstantSDNode>(V.getNode()))
+ return N->isNullValue();
+ return false;
+ };
if (VecLen <= HwLen) {
// In the hardware, each bit of a vector predicate corresponds to a byte
@@ -226,8 +539,11 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
assert(HwLen % VecLen == 0);
unsigned BitBytes = HwLen / VecLen;
for (SDValue V : Values) {
+ AllT &= IsTrue(V);
+ AllF &= IsFalse(V);
+
SDValue Ext = !V.isUndef() ? DAG.getZExtOrTrunc(V, dl, MVT::i8)
- : DAG.getConstant(0, dl, MVT::i8);
+ : DAG.getUNDEF(MVT::i8);
for (unsigned B = 0; B != BitBytes; ++B)
Bytes.push_back(Ext);
}
@@ -243,8 +559,11 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
break;
}
SDValue F = Values[I+B];
+ AllT &= IsTrue(F);
+ AllF &= IsFalse(F);
+
SDValue Ext = (B < 8) ? DAG.getZExtOrTrunc(F, dl, MVT::i8)
- : DAG.getConstant(0, dl, MVT::i8);
+ : DAG.getUNDEF(MVT::i8);
Bytes.push_back(Ext);
// Verify that the rest of values in the group are the same as the
// first.
@@ -253,53 +572,25 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
}
}
- MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
- SDValue ByteVec = buildHvxVectorSingle(Bytes, dl, ByteTy, DAG);
- SDValue Cmp = DAG.getSetCC(dl, VecTy, ByteVec, getZero(dl, ByteTy, DAG),
- ISD::SETUGT);
- return Cmp;
-}
-
-SDValue
-HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
- const {
- const SDLoc &dl(Op);
- MVT VecTy = ty(Op);
-
- unsigned Size = Op.getNumOperands();
- SmallVector<SDValue,128> Ops;
- for (unsigned i = 0; i != Size; ++i)
- Ops.push_back(Op.getOperand(i));
-
- if (VecTy.getVectorElementType() == MVT::i1)
- return buildHvxVectorPred(Ops, dl, VecTy, DAG);
+ if (AllT)
+ return DAG.getNode(HexagonISD::QTRUE, dl, VecTy);
+ if (AllF)
+ return DAG.getNode(HexagonISD::QFALSE, dl, VecTy);
- if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
- ArrayRef<SDValue> A(Ops);
- MVT SingleTy = typeSplit(VecTy).first;
- SDValue V0 = buildHvxVectorSingle(A.take_front(Size/2), dl, SingleTy, DAG);
- SDValue V1 = buildHvxVectorSingle(A.drop_front(Size/2), dl, SingleTy, DAG);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
- }
-
- return buildHvxVectorSingle(Ops, dl, VecTy, DAG);
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = buildHvxVectorReg(Bytes, dl, ByteTy, DAG);
+ return DAG.getNode(HexagonISD::V2Q, dl, VecTy, ByteVec);
}
SDValue
-HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
- const {
- // Change the type of the extracted element to i32.
- SDValue VecV = Op.getOperand(0);
+HexagonTargetLowering::extractHvxElementReg(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
MVT ElemTy = ty(VecV).getVectorElementType();
+
unsigned ElemWidth = ElemTy.getSizeInBits();
assert(ElemWidth >= 8 && ElemWidth <= 32);
(void)ElemWidth;
- const SDLoc &dl(Op);
- SDValue IdxV = Op.getOperand(1);
- if (ty(IdxV) != MVT::i32)
- IdxV = DAG.getBitcast(MVT::i32, IdxV);
-
SDValue ByteIdx = convertToByteIndex(IdxV, ElemTy, DAG);
SDValue ExWord = DAG.getNode(HexagonISD::VEXTRACTW, dl, MVT::i32,
{VecV, ByteIdx});
@@ -316,13 +607,29 @@ HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
}
SDValue
-HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
- const {
- const SDLoc &dl(Op);
- SDValue VecV = Op.getOperand(0);
- SDValue ValV = Op.getOperand(1);
- SDValue IdxV = Op.getOperand(2);
+HexagonTargetLowering::extractHvxElementPred(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
+ // Implement other return types if necessary.
+ assert(ResTy == MVT::i1);
+
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+
+ unsigned Scale = HwLen / ty(VecV).getVectorNumElements();
+ SDValue ScV = DAG.getConstant(Scale, dl, MVT::i32);
+ IdxV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, ScV);
+
+ SDValue ExtB = extractHvxElementReg(ByteVec, IdxV, dl, MVT::i32, DAG);
+ SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32);
+ return getInstr(Hexagon::C2_cmpgtui, dl, MVT::i1, {ExtB, Zero}, DAG);
+}
+
+SDValue
+HexagonTargetLowering::insertHvxElementReg(SDValue VecV, SDValue IdxV,
+ SDValue ValV, const SDLoc &dl, SelectionDAG &DAG) const {
MVT ElemTy = ty(VecV).getVectorElementType();
+
unsigned ElemWidth = ElemTy.getSizeInBits();
assert(ElemWidth >= 8 && ElemWidth <= 32);
(void)ElemWidth;
@@ -336,7 +643,7 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
SDValue RotV = DAG.getNode(HexagonISD::VROR, dl, VecTy, {VecV, MaskV});
SDValue InsV = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, {RotV, ValV});
SDValue SubV = DAG.getNode(ISD::SUB, dl, MVT::i32,
- {DAG.getConstant(HwLen/4, dl, MVT::i32), MaskV});
+ {DAG.getConstant(HwLen, dl, MVT::i32), MaskV});
SDValue TorV = DAG.getNode(HexagonISD::VROR, dl, VecTy, {InsV, SubV});
return TorV;
};
@@ -349,9 +656,8 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
// 1. Extract the existing word from the target vector.
SDValue WordIdx = DAG.getNode(ISD::SRL, dl, MVT::i32,
{ByteIdx, DAG.getConstant(2, dl, MVT::i32)});
- SDValue Ex0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- {opCastElem(VecV, MVT::i32, DAG), WordIdx});
- SDValue Ext = LowerHvxExtractElement(Ex0, DAG);
+ SDValue Ext = extractHvxElementReg(opCastElem(VecV, MVT::i32, DAG), WordIdx,
+ dl, MVT::i32, DAG);
// 2. Treating the extracted word as a 32-bit vector, insert the given
// value into it.
@@ -365,55 +671,531 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
}
SDValue
+HexagonTargetLowering::insertHvxElementPred(SDValue VecV, SDValue IdxV,
+ SDValue ValV, const SDLoc &dl, SelectionDAG &DAG) const {
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+
+ unsigned Scale = HwLen / ty(VecV).getVectorNumElements();
+ SDValue ScV = DAG.getConstant(Scale, dl, MVT::i32);
+ IdxV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV, ScV);
+ ValV = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, ValV);
+
+ SDValue InsV = insertHvxElementReg(ByteVec, IdxV, ValV, dl, DAG);
+ return DAG.getNode(HexagonISD::V2Q, dl, ty(VecV), InsV);
+}
+
+SDValue
+HexagonTargetLowering::extractHvxSubvectorReg(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+ MVT ElemTy = VecTy.getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+
+ // If the source vector is a vector pair, get the single vector containing
+ // the subvector of interest. The subvector will never overlap two single
+ // vectors.
+ if (isHvxPairTy(VecTy)) {
+ unsigned SubIdx;
+ if (Idx * ElemWidth >= 8*HwLen) {
+ SubIdx = Hexagon::vsub_hi;
+ Idx -= VecTy.getVectorNumElements() / 2;
+ } else {
+ SubIdx = Hexagon::vsub_lo;
+ }
+ VecTy = typeSplit(VecTy).first;
+ VecV = DAG.getTargetExtractSubreg(SubIdx, dl, VecTy, VecV);
+ if (VecTy == ResTy)
+ return VecV;
+ }
+
+ // The only meaningful subvectors of a single HVX vector are those that
+ // fit in a scalar register.
+ assert(ResTy.getSizeInBits() == 32 || ResTy.getSizeInBits() == 64);
+
+ MVT WordTy = tyVector(VecTy, MVT::i32);
+ SDValue WordVec = DAG.getBitcast(WordTy, VecV);
+ unsigned WordIdx = (Idx*ElemWidth) / 32;
+
+ SDValue W0Idx = DAG.getConstant(WordIdx, dl, MVT::i32);
+ SDValue W0 = extractHvxElementReg(WordVec, W0Idx, dl, MVT::i32, DAG);
+ if (ResTy.getSizeInBits() == 32)
+ return DAG.getBitcast(ResTy, W0);
+
+ SDValue W1Idx = DAG.getConstant(WordIdx+1, dl, MVT::i32);
+ SDValue W1 = extractHvxElementReg(WordVec, W1Idx, dl, MVT::i32, DAG);
+ SDValue WW = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64, {W1, W0});
+ return DAG.getBitcast(ResTy, WW);
+}
+
+SDValue
+HexagonTargetLowering::extractHvxSubvectorPred(SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+ // IdxV is required to be a constant.
+ unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+
+ unsigned ResLen = ResTy.getVectorNumElements();
+ unsigned BitBytes = HwLen / VecTy.getVectorNumElements();
+ unsigned Offset = Idx * BitBytes;
+ SDValue Undef = DAG.getUNDEF(ByteTy);
+ SmallVector<int,128> Mask;
+
+ if (Subtarget.isHVXVectorType(ResTy, true)) {
+ // Converting between two vector predicates. Since the result is shorter
+ // than the source, it will correspond to a vector predicate with the
+ // relevant bits replicated. The replication count is the ratio of the
+ // source and target vector lengths.
+ unsigned Rep = VecTy.getVectorNumElements() / ResLen;
+ assert(isPowerOf2_32(Rep) && HwLen % Rep == 0);
+ for (unsigned i = 0; i != HwLen/Rep; ++i) {
+ for (unsigned j = 0; j != Rep; ++j)
+ Mask.push_back(i + Offset);
+ }
+ SDValue ShuffV = DAG.getVectorShuffle(ByteTy, dl, ByteVec, Undef, Mask);
+ return DAG.getNode(HexagonISD::V2Q, dl, ResTy, ShuffV);
+ }
+
+ // Converting between a vector predicate and a scalar predicate. In the
+ // vector predicate, a group of BitBytes bits will correspond to a single
+ // i1 element of the source vector type. Those bits will all have the same
+ // value. The same will be true for ByteVec, where each byte corresponds
+ // to a bit in the vector predicate.
+ // The algorithm is to traverse the ByteVec, going over the i1 values from
+ // the source vector, and generate the corresponding representation in an
+ // 8-byte vector. To avoid repeated extracts from ByteVec, shuffle the
+ // elements so that the interesting 8 bytes will be in the low end of the
+ // vector.
+ unsigned Rep = 8 / ResLen;
+ // Make sure the output fills the entire vector register, so repeat the
+ // 8-byte groups as many times as necessary.
+ for (unsigned r = 0; r != HwLen/ResLen; ++r) {
+ // This will generate the indexes of the 8 interesting bytes.
+ for (unsigned i = 0; i != ResLen; ++i) {
+ for (unsigned j = 0; j != Rep; ++j)
+ Mask.push_back(Offset + i*BitBytes);
+ }
+ }
+
+ SDValue Zero = getZero(dl, MVT::i32, DAG);
+ SDValue ShuffV = DAG.getVectorShuffle(ByteTy, dl, ByteVec, Undef, Mask);
+ // Combine the two low words from ShuffV into a v8i8, and byte-compare
+ // them against 0.
+ SDValue W0 = DAG.getNode(HexagonISD::VEXTRACTW, dl, MVT::i32, {ShuffV, Zero});
+ SDValue W1 = DAG.getNode(HexagonISD::VEXTRACTW, dl, MVT::i32,
+ {ShuffV, DAG.getConstant(4, dl, MVT::i32)});
+ SDValue Vec64 = DAG.getNode(HexagonISD::COMBINE, dl, MVT::v8i8, {W1, W0});
+ return getInstr(Hexagon::A4_vcmpbgtui, dl, ResTy,
+ {Vec64, DAG.getTargetConstant(0, dl, MVT::i32)}, DAG);
+}
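
For illustration only (not part of the patch): the predicate-to-predicate branch of extractHvxSubvectorPred builds a replication mask in a similar spirit. A hypothetical standalone version (subvecPredMask is an invented name) for experimenting with small sizes:

    #include <vector>

    // Illustrative sketch only -- not part of the patch. The extracted
    // sub-predicate is shorter, so each of its bits covers Rep times as many
    // bytes; the mask replicates the selected source bytes accordingly.
    // Toy example: HwLen = 8, an 8-element source, a 4-element result and
    // Idx = 4 give Offset = 4, Rep = 2 and the mask [4, 4, 5, 5, 6, 6, 7, 7].
    static std::vector<int> subvecPredMask(unsigned HwLen, unsigned SrcLen,
                                           unsigned ResLen, unsigned Idx) {
      unsigned BitBytes = HwLen / SrcLen; // bytes per source i1 element
      unsigned Offset = Idx * BitBytes;
      unsigned Rep = SrcLen / ResLen;     // replication factor per result bit
      std::vector<int> Mask;
      for (unsigned I = 0; I != HwLen / Rep; ++I)
        for (unsigned J = 0; J != Rep; ++J)
          Mask.push_back(int(I + Offset));
      return Mask;
    }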
+
+SDValue
+HexagonTargetLowering::insertHvxSubvectorReg(SDValue VecV, SDValue SubV,
+ SDValue IdxV, const SDLoc &dl, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ MVT SubTy = ty(SubV);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ElemTy = VecTy.getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+
+ bool IsPair = isHvxPairTy(VecTy);
+ MVT SingleTy = MVT::getVectorVT(ElemTy, (8*HwLen)/ElemWidth);
+ // The two single vectors that VecV consists of, if it's a pair.
+ SDValue V0, V1;
+ SDValue SingleV = VecV;
+ SDValue PickHi;
+
+ if (IsPair) {
+ V0 = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, VecV);
+ V1 = DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, SingleTy, VecV);
+
+ SDValue HalfV = DAG.getConstant(SingleTy.getVectorNumElements(),
+ dl, MVT::i32);
+ PickHi = DAG.getSetCC(dl, MVT::i1, IdxV, HalfV, ISD::SETUGT);
+ if (isHvxSingleTy(SubTy)) {
+ if (const auto *CN = dyn_cast<const ConstantSDNode>(IdxV.getNode())) {
+ unsigned Idx = CN->getZExtValue();
+ assert(Idx == 0 || Idx == VecTy.getVectorNumElements()/2);
+ unsigned SubIdx = (Idx == 0) ? Hexagon::vsub_lo : Hexagon::vsub_hi;
+ return DAG.getTargetInsertSubreg(SubIdx, dl, VecTy, VecV, SubV);
+ }
+ // If IdxV is not a constant, generate the two variants: with the
+ // SubV as the high and as the low subregister, and select the right
+ // pair based on the IdxV.
+ SDValue InLo = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {SubV, V1});
+ SDValue InHi = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {V0, SubV});
+ return DAG.getNode(ISD::SELECT, dl, VecTy, PickHi, InHi, InLo);
+ }
+ // The subvector being inserted must be entirely contained in one of
+ // the vectors V0 or V1. Set SingleV to the correct one, and update
+ // IdxV to be the index relative to the beginning of that vector.
+ SDValue S = DAG.getNode(ISD::SUB, dl, MVT::i32, IdxV, HalfV);
+ IdxV = DAG.getNode(ISD::SELECT, dl, MVT::i32, PickHi, S, IdxV);
+ SingleV = DAG.getNode(ISD::SELECT, dl, SingleTy, PickHi, V1, V0);
+ }
+
+ // The only meaningful subvectors of a single HVX vector are those that
+ // fit in a scalar register.
+ assert(SubTy.getSizeInBits() == 32 || SubTy.getSizeInBits() == 64);
+ // Convert IdxV to be index in bytes.
+ auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
+ if (!IdxN || !IdxN->isNullValue()) {
+ IdxV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(ElemWidth/8, dl, MVT::i32));
+ SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV, IdxV);
+ }
+ // When inserting a single word, the rotation back to the original position
+ // would be by HwLen-Idx, but if two words are inserted, it will need to be
+ // by (HwLen-4)-Idx.
+ unsigned RolBase = HwLen;
+ if (VecTy.getSizeInBits() == 32) {
+ SDValue V = DAG.getBitcast(MVT::i32, SubV);
+ SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, V);
+ } else {
+ SDValue V = DAG.getBitcast(MVT::i64, SubV);
+ SDValue R0 = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, V);
+ SDValue R1 = DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, V);
+ SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, SingleV, R0);
+ SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV,
+ DAG.getConstant(4, dl, MVT::i32));
+ SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, SingleV, R1);
+ RolBase = HwLen-4;
+ }
+ // If the vector wasn't ror'ed, don't ror it back.
+ if (RolBase != 4 || !IdxN || !IdxN->isNullValue()) {
+ SDValue RolV = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(RolBase, dl, MVT::i32), IdxV);
+ SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV, RolV);
+ }
+
+ if (IsPair) {
+ SDValue InLo = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {SingleV, V1});
+ SDValue InHi = DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, {V0, SingleV});
+ return DAG.getNode(ISD::SELECT, dl, VecTy, PickHi, InHi, InLo);
+ }
+ return SingleV;
+}
+
+SDValue
+HexagonTargetLowering::insertHvxSubvectorPred(SDValue VecV, SDValue SubV,
+ SDValue IdxV, const SDLoc &dl, SelectionDAG &DAG) const {
+ MVT VecTy = ty(VecV);
+ MVT SubTy = ty(SubV);
+ assert(Subtarget.isHVXVectorType(VecTy, true));
+ // VecV is an HVX vector predicate. SubV may be either an HVX vector
+ // predicate as well, or it can be a scalar predicate.
+
+ unsigned VecLen = VecTy.getVectorNumElements();
+ unsigned HwLen = Subtarget.getVectorLength();
+ assert(HwLen % VecLen == 0 && "Unexpected vector type");
+
+ unsigned Scale = VecLen / SubTy.getVectorNumElements();
+ unsigned BitBytes = HwLen / VecLen;
+ unsigned BlockLen = HwLen / Scale;
+
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
+ SDValue ByteSub = createHvxPrefixPred(SubV, dl, BitBytes, false, DAG);
+ SDValue ByteIdx;
+
+ auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
+ if (!IdxN || !IdxN->isNullValue()) {
+ ByteIdx = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
+ DAG.getConstant(BitBytes, dl, MVT::i32));
+ ByteVec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, ByteVec, ByteIdx);
+ }
+
+ // ByteVec is the target vector VecV rotated in such a way that the
+ // subvector should be inserted at index 0. Generate a predicate mask
+ // and use vmux to do the insertion.
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
+ SDValue Q = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
+ {DAG.getConstant(BlockLen, dl, MVT::i32)}, DAG);
+ ByteVec = getInstr(Hexagon::V6_vmux, dl, ByteTy, {Q, ByteSub, ByteVec}, DAG);
+ // Rotate ByteVec back, and convert to a vector predicate.
+ if (!IdxN || !IdxN->isNullValue()) {
+ SDValue HwLenV = DAG.getConstant(HwLen, dl, MVT::i32);
+ SDValue ByteXdi = DAG.getNode(ISD::SUB, dl, MVT::i32, HwLenV, ByteIdx);
+ ByteVec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, ByteVec, ByteXdi);
+ }
+ return DAG.getNode(HexagonISD::V2Q, dl, VecTy, ByteVec);
+}
+
+SDValue
+HexagonTargetLowering::extendHvxVectorPred(SDValue VecV, const SDLoc &dl,
+ MVT ResTy, bool ZeroExt, SelectionDAG &DAG) const {
+ // Sign- and any-extending of a vector predicate to a vector register is
+ // equivalent to Q2V. For zero-extensions, generate a vmux between 0 and
+ // a vector of 1s (where the 1s are of type matching the vector type).
+ assert(Subtarget.isHVXVectorType(ResTy));
+ if (!ZeroExt)
+ return DAG.getNode(HexagonISD::Q2V, dl, ResTy, VecV);
+
+ assert(ty(VecV).getVectorNumElements() == ResTy.getVectorNumElements());
+ SDValue True = DAG.getNode(HexagonISD::VSPLAT, dl, ResTy,
+ DAG.getConstant(1, dl, MVT::i32));
+ SDValue False = getZero(dl, ResTy, DAG);
+ return DAG.getSelect(dl, ResTy, VecV, True, False);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
+ const {
+ const SDLoc &dl(Op);
+ MVT VecTy = ty(Op);
+
+ unsigned Size = Op.getNumOperands();
+ SmallVector<SDValue,128> Ops;
+ for (unsigned i = 0; i != Size; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ if (VecTy.getVectorElementType() == MVT::i1)
+ return buildHvxVectorPred(Ops, dl, VecTy, DAG);
+
+ if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
+ ArrayRef<SDValue> A(Ops);
+ MVT SingleTy = typeSplit(VecTy).first;
+ SDValue V0 = buildHvxVectorReg(A.take_front(Size/2), dl, SingleTy, DAG);
+ SDValue V1 = buildHvxVectorReg(A.drop_front(Size/2), dl, SingleTy, DAG);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
+ }
+
+ return buildHvxVectorReg(Ops, dl, VecTy, DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG)
+ const {
+ // Vector concatenation of two integer (non-bool) vectors does not need
+ // special lowering. Custom-lower concats of bool vectors and expand
+ // concats of more than 2 vectors.
+ MVT VecTy = ty(Op);
+ const SDLoc &dl(Op);
+ unsigned NumOp = Op.getNumOperands();
+ if (VecTy.getVectorElementType() != MVT::i1) {
+ if (NumOp == 2)
+ return Op;
+ // Expand the other cases into a build-vector.
+ SmallVector<SDValue,8> Elems;
+ for (SDValue V : Op.getNode()->ops())
+ DAG.ExtractVectorElements(V, Elems);
+ // A vector of i16 will be broken up into a build_vector of i16's.
+ // This is a problem, since at the time of operation legalization,
+ // all operations are expected to be type-legalized, and i16 is not
+ // a legal type. If any of the extracted elements is not of a valid
+ // type, sign-extend it to a valid one.
+ for (unsigned i = 0, e = Elems.size(); i != e; ++i) {
+ SDValue V = Elems[i];
+ MVT Ty = ty(V);
+ if (!isTypeLegal(Ty)) {
+ EVT NTy = getTypeToTransformTo(*DAG.getContext(), Ty);
+ if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ Elems[i] = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NTy,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NTy,
+ V.getOperand(0), V.getOperand(1)),
+ DAG.getValueType(Ty));
+ continue;
+ }
+ // A few less complicated cases.
+ if (V.getOpcode() == ISD::Constant)
+ Elems[i] = DAG.getSExtOrTrunc(V, dl, NTy);
+ else if (V.isUndef())
+ Elems[i] = DAG.getUNDEF(NTy);
+ else
+ llvm_unreachable("Unexpected vector element");
+ }
+ }
+ return DAG.getBuildVector(VecTy, dl, Elems);
+ }
+
+ assert(VecTy.getVectorElementType() == MVT::i1);
+ unsigned HwLen = Subtarget.getVectorLength();
+ assert(isPowerOf2_32(NumOp) && HwLen % NumOp == 0);
+
+ SDValue Op0 = Op.getOperand(0);
+
+ // If the operands are HVX types (i.e. not scalar predicates), then
+ // defer the concatenation, and create QCAT instead.
+ if (Subtarget.isHVXVectorType(ty(Op0), true)) {
+ if (NumOp == 2)
+ return DAG.getNode(HexagonISD::QCAT, dl, VecTy, Op0, Op.getOperand(1));
+
+ ArrayRef<SDUse> U(Op.getNode()->ops());
+ SmallVector<SDValue,4> SV(U.begin(), U.end());
+ ArrayRef<SDValue> Ops(SV);
+
+ MVT HalfTy = typeSplit(VecTy).first;
+ SDValue V0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfTy,
+ Ops.take_front(NumOp/2));
+ SDValue V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfTy,
+ Ops.take_back(NumOp/2));
+ return DAG.getNode(HexagonISD::QCAT, dl, VecTy, V0, V1);
+ }
+
+ // Count how many bytes (in a vector register) each bit in VecTy
+ // corresponds to.
+ unsigned BitBytes = HwLen / VecTy.getVectorNumElements();
+
+ SmallVector<SDValue,8> Prefixes;
+ for (SDValue V : Op.getNode()->op_values()) {
+ SDValue P = createHvxPrefixPred(V, dl, BitBytes, true, DAG);
+ Prefixes.push_back(P);
+ }
+
+ unsigned InpLen = ty(Op.getOperand(0)).getVectorNumElements();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue S = DAG.getConstant(InpLen*BitBytes, dl, MVT::i32);
+ SDValue Res = getZero(dl, ByteTy, DAG);
+ for (unsigned i = 0, e = Prefixes.size(); i != e; ++i) {
+ Res = DAG.getNode(HexagonISD::VROR, dl, ByteTy, Res, S);
+ Res = DAG.getNode(ISD::OR, dl, ByteTy, Res, Prefixes[e-i-1]);
+ }
+ return DAG.getNode(HexagonISD::V2Q, dl, VecTy, Res);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
+ const {
+ // Change the type of the extracted element to i32.
+ SDValue VecV = Op.getOperand(0);
+ MVT ElemTy = ty(VecV).getVectorElementType();
+ const SDLoc &dl(Op);
+ SDValue IdxV = Op.getOperand(1);
+ if (ElemTy == MVT::i1)
+ return extractHvxElementPred(VecV, IdxV, dl, ty(Op), DAG);
+
+ return extractHvxElementReg(VecV, IdxV, dl, ty(Op), DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
+ const {
+ const SDLoc &dl(Op);
+ SDValue VecV = Op.getOperand(0);
+ SDValue ValV = Op.getOperand(1);
+ SDValue IdxV = Op.getOperand(2);
+ MVT ElemTy = ty(VecV).getVectorElementType();
+ if (ElemTy == MVT::i1)
+ return insertHvxElementPred(VecV, IdxV, ValV, dl, DAG);
+
+ return insertHvxElementReg(VecV, IdxV, ValV, dl, DAG);
+}
+
+SDValue
HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG)
const {
SDValue SrcV = Op.getOperand(0);
MVT SrcTy = ty(SrcV);
- unsigned SrcElems = SrcTy.getVectorNumElements();
+ MVT DstTy = ty(Op);
SDValue IdxV = Op.getOperand(1);
unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
- MVT DstTy = ty(Op);
- assert(Idx == 0 || DstTy.getVectorNumElements() % Idx == 0);
+ assert(Idx % DstTy.getVectorNumElements() == 0);
+ (void)Idx;
const SDLoc &dl(Op);
- if (Idx == 0)
- return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, DstTy, SrcV);
- if (Idx == SrcElems/2)
- return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, DstTy, SrcV);
- return SDValue();
+
+ MVT ElemTy = SrcTy.getVectorElementType();
+ if (ElemTy == MVT::i1)
+ return extractHvxSubvectorPred(SrcV, IdxV, dl, DstTy, DAG);
+
+ return extractHvxSubvectorReg(SrcV, IdxV, dl, DstTy, DAG);
}
SDValue
HexagonTargetLowering::LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG)
const {
- // Idx may be variable.
+ // Idx does not need to be a constant.
+ SDValue VecV = Op.getOperand(0);
+ SDValue ValV = Op.getOperand(1);
SDValue IdxV = Op.getOperand(2);
- auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
- if (!IdxN)
- return SDValue();
- unsigned Idx = IdxN->getZExtValue();
- SDValue DstV = Op.getOperand(0);
- SDValue SrcV = Op.getOperand(1);
- MVT DstTy = ty(DstV);
- MVT SrcTy = ty(SrcV);
- unsigned DstElems = DstTy.getVectorNumElements();
- unsigned SrcElems = SrcTy.getVectorNumElements();
- if (2*SrcElems != DstElems)
- return SDValue();
+ const SDLoc &dl(Op);
+ MVT VecTy = ty(VecV);
+ MVT ElemTy = VecTy.getVectorElementType();
+ if (ElemTy == MVT::i1)
+ return insertHvxSubvectorPred(VecV, ValV, IdxV, dl, DAG);
+
+ return insertHvxSubvectorReg(VecV, ValV, IdxV, dl, DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const {
+ // Lower any-extends of boolean vectors to sign-extends, since they
+ // translate directly to Q2V. Zero-extending could also be done equally
+ // fast, but Q2V is used/recognized in more places.
+ // For all other vectors, use zero-extend.
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ if (ElemTy == MVT::i1 && Subtarget.isHVXVectorType(ResTy))
+ return LowerHvxSignExt(Op, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Op), ResTy, InpV);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ if (ElemTy == MVT::i1 && Subtarget.isHVXVectorType(ResTy))
+ return extendHvxVectorPred(InpV, SDLoc(Op), ty(Op), false, DAG);
+ return Op;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ if (ElemTy == MVT::i1 && Subtarget.isHVXVectorType(ResTy))
+ return extendHvxVectorPred(InpV, SDLoc(Op), ty(Op), true, DAG);
+ return Op;
+}
+SDValue
+HexagonTargetLowering::LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const {
+ // Lower vector CTTZ into a computation using CTLZ (Hacker's Delight):
+ // cttz(x) = bitwidth(x) - ctlz(~x & (x-1))
const SDLoc &dl(Op);
- if (Idx == 0)
- return DAG.getTargetInsertSubreg(Hexagon::vsub_lo, dl, DstTy, DstV, SrcV);
- if (Idx == SrcElems)
- return DAG.getTargetInsertSubreg(Hexagon::vsub_hi, dl, DstTy, DstV, SrcV);
- return SDValue();
+ MVT ResTy = ty(Op);
+ SDValue InpV = Op.getOperand(0);
+ assert(ResTy == ty(InpV));
+
+ // Calculate the vectors of 1 and bitwidth(x).
+ MVT ElemTy = ty(InpV).getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+ // Using uint64_t because a shift by 32 can happen.
+ uint64_t Splat1 = 0, SplatW = 0;
+ assert(isPowerOf2_32(ElemWidth) && ElemWidth <= 32);
+ for (unsigned i = 0; i != 32/ElemWidth; ++i) {
+ Splat1 = (Splat1 << ElemWidth) | 1;
+ SplatW = (SplatW << ElemWidth) | ElemWidth;
+ }
+ SDValue Vec1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
+ DAG.getConstant(uint32_t(Splat1), dl, MVT::i32));
+ SDValue VecW = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
+ DAG.getConstant(uint32_t(SplatW), dl, MVT::i32));
+ SDValue VecN1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
+ DAG.getConstant(-1, dl, MVT::i32));
+ // Do not use DAG.getNOT, because that would create BUILD_VECTOR with
+ // a BITCAST. Here we can skip the BITCAST (so we don't have to handle
+ // it separately in custom combine or selection).
+ SDValue A = DAG.getNode(ISD::AND, dl, ResTy,
+ {DAG.getNode(ISD::XOR, dl, ResTy, {InpV, VecN1}),
+ DAG.getNode(ISD::SUB, dl, ResTy, {InpV, Vec1})});
+ return DAG.getNode(ISD::SUB, dl, ResTy,
+ {VecW, DAG.getNode(ISD::CTLZ, dl, ResTy, A)});
}
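
// [Editor's illustration, not part of the patch] The Hacker's Delight identity
// used above, cttz(x) = bitwidth(x) - ctlz(~x & (x - 1)), checked for 32-bit
// values against straightforward bit loops, together with the splat constants
// the loop above builds for 8-bit elements. A standalone sketch only.
#include <cassert>
#include <cstdint>

static unsigned ctlz32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t M = 0x80000000u; M != 0 && (X & M) == 0; M >>= 1)
    ++N;
  return N;                                   // ctlz32(0) == 32
}

static unsigned cttz32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t M = 1; M != 0 && (X & M) == 0; M <<= 1)
    ++N;
  return N;                                   // cttz32(0) == 32
}

int main() {
  const uint32_t Vals[] = {0, 1, 2, 3, 8, 0x00010000u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t X : Vals)
    assert(cttz32(X) == 32 - ctlz32(~X & (X - 1)));

  // The Splat1/SplatW construction above, instantiated for ElemWidth == 8.
  uint64_t Splat1 = 0, SplatW = 0;
  for (unsigned i = 0; i != 32 / 8; ++i) {
    Splat1 = (Splat1 << 8) | 1;
    SplatW = (SplatW << 8) | 8;
  }
  assert(Splat1 == 0x01010101u && SplatW == 0x08080808u);
  return 0;
}
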
SDValue
HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
MVT ResTy = ty(Op);
- if (!ResTy.isVector())
- return SDValue();
+ assert(ResTy.isVector() && isHvxSingleTy(ResTy));
const SDLoc &dl(Op);
SmallVector<int,256> ShuffMask;
@@ -423,18 +1205,14 @@ HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
SDValue Vt = Op.getOperand(1);
switch (ElemTy.SimpleTy) {
- case MVT::i8:
- case MVT::i16: {
+ case MVT::i8: {
// For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
// V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
// where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...).
- // For i16, use V6_vmpyhv, which behaves in an analogous way to
- // V6_vmpybv: results Lo and Hi are products of even/odd elements
- // respectively.
MVT ExtTy = typeExtElem(ResTy, 2);
unsigned MpyOpc = ElemTy == MVT::i8 ? Hexagon::V6_vmpybv
: Hexagon::V6_vmpyhv;
- SDValue M = getNode(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
+ SDValue M = getInstr(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
// Discard high halves of the resulting values, collect the low halves.
for (unsigned I = 0; I < VecLen; I += 2) {
@@ -442,18 +1220,24 @@ HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
ShuffMask.push_back(I+VecLen); // Pick odd element.
}
VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG);
- return getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
+ SDValue BS = getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
+ return DAG.getBitcast(ResTy, BS);
}
+ case MVT::i16:
+ // For i16 there is V6_vmpyih, which acts exactly like the MUL opcode.
+ // (There is also V6_vmpyhv, which behaves in an analogous way to
+ // V6_vmpybv.)
+ return getInstr(Hexagon::V6_vmpyih, dl, ResTy, {Vs, Vt}, DAG);
case MVT::i32: {
// Use the following sequence for signed word multiply:
// T0 = V6_vmpyiowh Vs, Vt
// T1 = V6_vaslw T0, 16
// T2 = V6_vmpyiewuh_acc T1, Vs, Vt
SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
- SDValue T0 = getNode(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG);
- SDValue T1 = getNode(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG);
- SDValue T2 = getNode(Hexagon::V6_vmpyiewuh_acc, dl, ResTy,
- {T1, Vs, Vt}, DAG);
+ SDValue T0 = getInstr(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG);
+ SDValue T1 = getInstr(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG);
+ SDValue T2 = getInstr(Hexagon::V6_vmpyiewuh_acc, dl, ResTy,
+ {T1, Vs, Vt}, DAG);
return T2;
}
default:
@@ -463,78 +1247,109 @@ HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
}
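
// [Editor's illustration, not part of the patch] Scalar check of the word
// multiply decomposition used above for MVT::i32: with all products and sums
// taken modulo 2^32, the low 32 bits of a*b equal ((a * Hi(b)) << 16) +
// a * Lo(b), where Hi(b) is the arithmetic >>16 and Lo(b) the low unsigned
// halfword. The pairing with V6_vmpyiowh/V6_vaslw/V6_vmpyiewuh_acc is what
// the comments above claim; this sketch only verifies the arithmetic itself.
#include <cassert>
#include <cstdint>

static uint32_t mulLo32Split(int32_t A, int32_t B) {
  // Arithmetic right shift of a negative value is assumed here.
  uint32_t T0 = (uint32_t)A * (uint32_t)(B >> 16);          // a * Hi(b)
  uint32_t T1 = T0 << 16;
  uint32_t T2 = T1 + (uint32_t)A * (uint32_t)(B & 0xFFFF);  // + a * Lo(b)
  return T2;
}

int main() {
  const int32_t Vals[] = {0, 1, -1, 7, -12345, 0x7FFFFFFF, (int32_t)0x80000000};
  for (int32_t A : Vals)
    for (int32_t B : Vals)
      assert(mulLo32Split(A, B) == (uint32_t)((int64_t)A * B));
  return 0;
}
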
SDValue
-HexagonTargetLowering::LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const {
- MVT VecTy = ty(Op.getOperand(0));
- assert(VecTy == ty(Op.getOperand(1)));
-
- SDValue Cmp = Op.getOperand(2);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cmp)->get();
- bool Negate = false, Swap = false;
-
- // HVX has instructions for SETEQ, SETGT, SETUGT. The other comparisons
- // can be arranged as operand-swapped/negated versions of these. Since
- // the generated code will have the original CC expressed as
- // (negate (swap-op NewCmp)),
- // the condition code for the NewCmp should be calculated from the original
- // CC by applying these operations in the reverse order.
- //
- // This could also be done through setCondCodeAction, but for negation it
- // uses a xor with a vector of -1s, which it obtains from BUILD_VECTOR.
- // That is far too expensive for what can be done with a single instruction.
-
- switch (CC) {
- case ISD::SETNE: // !eq
- case ISD::SETLE: // !gt
- case ISD::SETGE: // !lt
- case ISD::SETULE: // !ugt
- case ISD::SETUGE: // !ult
- CC = ISD::getSetCCInverse(CC, true);
- Negate = true;
- break;
- default:
- break;
+HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ assert(ResTy.isVector());
+ const SDLoc &dl(Op);
+ SmallVector<int,256> ShuffMask;
+
+ MVT ElemTy = ResTy.getVectorElementType();
+ unsigned VecLen = ResTy.getVectorNumElements();
+ SDValue Vs = Op.getOperand(0);
+ SDValue Vt = Op.getOperand(1);
+ bool IsSigned = Op.getOpcode() == ISD::MULHS;
+
+ if (ElemTy == MVT::i8 || ElemTy == MVT::i16) {
+ // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
+ // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
+ // where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...).
+ // For i16, use V6_vmpyhv, which behaves in an analogous way to
+ // V6_vmpybv: results Lo and Hi are products of even/odd elements
+ // respectively.
+ MVT ExtTy = typeExtElem(ResTy, 2);
+ unsigned MpyOpc = ElemTy == MVT::i8
+ ? (IsSigned ? Hexagon::V6_vmpybv : Hexagon::V6_vmpyubv)
+ : (IsSigned ? Hexagon::V6_vmpyhv : Hexagon::V6_vmpyuhv);
+ SDValue M = getInstr(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
+
+ // Discard low halves of the resulting values, collect the high halves.
+ for (unsigned I = 0; I < VecLen; I += 2) {
+      ShuffMask.push_back(I+1);         // Pick the high half of the even product.
+      ShuffMask.push_back(I+VecLen+1);  // Pick the high half of the odd product.
+ }
+ VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG);
+ SDValue BS = getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
+ return DAG.getBitcast(ResTy, BS);
}
- switch (CC) {
- case ISD::SETLT: // swap gt
- case ISD::SETULT: // swap ugt
- CC = ISD::getSetCCSwappedOperands(CC);
- Swap = true;
- break;
- default:
- break;
+ assert(ElemTy == MVT::i32);
+ SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
+
+ if (IsSigned) {
+ // mulhs(Vs,Vt) =
+ // = [(Hi(Vs)*2^16 + Lo(Vs)) *s (Hi(Vt)*2^16 + Lo(Vt))] >> 32
+ // = [Hi(Vs)*2^16 *s Hi(Vt)*2^16 + Hi(Vs) *su Lo(Vt)*2^16
+ // + Lo(Vs) *us (Hi(Vt)*2^16 + Lo(Vt))] >> 32
+ // = [Hi(Vs) *s Hi(Vt)*2^32 + Hi(Vs) *su Lo(Vt)*2^16
+ // + Lo(Vs) *us Vt] >> 32
+ // The low half of Lo(Vs)*Lo(Vt) will be discarded (it's not added to
+ // anything, so it cannot produce any carry over to higher bits),
+ // so everything in [] can be shifted by 16 without loss of precision.
+ // = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + Lo(Vs)*Vt >> 16] >> 16
+ // = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + V6_vmpyewuh(Vs,Vt)] >> 16
+ // Denote Hi(Vs) = Vs':
+ // = [Vs'*s Hi(Vt)*2^16 + Vs' *su Lo(Vt) + V6_vmpyewuh(Vt,Vs)] >> 16
+ // = Vs'*s Hi(Vt) + (V6_vmpyiewuh(Vs',Vt) + V6_vmpyewuh(Vt,Vs)) >> 16
+ SDValue T0 = getInstr(Hexagon::V6_vmpyewuh, dl, ResTy, {Vt, Vs}, DAG);
+ // Get Vs':
+ SDValue S0 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {Vs, S16}, DAG);
+ SDValue T1 = getInstr(Hexagon::V6_vmpyiewuh_acc, dl, ResTy,
+ {T0, S0, Vt}, DAG);
+ // Shift by 16:
+ SDValue S2 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {T1, S16}, DAG);
+ // Get Vs'*Hi(Vt):
+ SDValue T2 = getInstr(Hexagon::V6_vmpyiowh, dl, ResTy, {S0, Vt}, DAG);
+ // Add:
+ SDValue T3 = DAG.getNode(ISD::ADD, dl, ResTy, {S2, T2});
+ return T3;
}
- assert(CC == ISD::SETEQ || CC == ISD::SETGT || CC == ISD::SETUGT);
+ // Unsigned mulhw. (Would expansion using signed mulhw be better?)
- MVT ElemTy = VecTy.getVectorElementType();
- unsigned ElemWidth = ElemTy.getSizeInBits();
- assert(isPowerOf2_32(ElemWidth));
-
- auto getIdx = [] (unsigned Code) {
- static const unsigned Idx[] = { ISD::SETEQ, ISD::SETGT, ISD::SETUGT };
- for (unsigned I = 0, E = array_lengthof(Idx); I != E; ++I)
- if (Code == Idx[I])
- return I;
- llvm_unreachable("Unhandled CondCode");
+ auto LoVec = [&DAG,ResTy,dl] (SDValue Pair) {
+ return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, ResTy, Pair);
};
-
- static unsigned OpcTable[3][3] = {
- // SETEQ SETGT, SETUGT
- /* Byte */ { Hexagon::V6_veqb, Hexagon::V6_vgtb, Hexagon::V6_vgtub },
- /* Half */ { Hexagon::V6_veqh, Hexagon::V6_vgth, Hexagon::V6_vgtuh },
- /* Word */ { Hexagon::V6_veqw, Hexagon::V6_vgtw, Hexagon::V6_vgtuw }
+ auto HiVec = [&DAG,ResTy,dl] (SDValue Pair) {
+ return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, ResTy, Pair);
};
- unsigned CmpOpc = OpcTable[Log2_32(ElemWidth)-3][getIdx(CC)];
-
- MVT ResTy = ty(Op);
- const SDLoc &dl(Op);
- SDValue OpL = Swap ? Op.getOperand(1) : Op.getOperand(0);
- SDValue OpR = Swap ? Op.getOperand(0) : Op.getOperand(1);
- SDValue CmpV = getNode(CmpOpc, dl, ResTy, {OpL, OpR}, DAG);
- return Negate ? getNode(Hexagon::V6_pred_not, dl, ResTy, {CmpV}, DAG)
- : CmpV;
+ MVT PairTy = typeJoin({ResTy, ResTy});
+ SDValue P = getInstr(Hexagon::V6_lvsplatw, dl, ResTy,
+ {DAG.getConstant(0x02020202, dl, MVT::i32)}, DAG);
+ // Multiply-unsigned halfwords:
+ // LoVec = Vs.uh[2i] * Vt.uh[2i],
+ // HiVec = Vs.uh[2i+1] * Vt.uh[2i+1]
+ SDValue T0 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, Vt}, DAG);
+ // The low halves in the LoVec of the pair can be discarded. They are
+ // not added to anything (in the full-precision product), so they cannot
+ // produce a carry into the higher bits.
+ SDValue T1 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {LoVec(T0), S16}, DAG);
+ // Swap low and high halves in Vt, and do the halfword multiplication
+ // to get products Vs.uh[2i] * Vt.uh[2i+1] and Vs.uh[2i+1] * Vt.uh[2i].
+ SDValue D0 = getInstr(Hexagon::V6_vdelta, dl, ResTy, {Vt, P}, DAG);
+ SDValue T2 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, D0}, DAG);
+ // T2 has mixed products of halfwords: Lo(Vt)*Hi(Vs) and Hi(Vt)*Lo(Vs).
+ // These products are words, but cannot be added directly because the
+ // sums could overflow. Add these products, by halfwords, where each sum
+ // of a pair of halfwords gives a word.
+ SDValue T3 = getInstr(Hexagon::V6_vadduhw, dl, PairTy,
+ {LoVec(T2), HiVec(T2)}, DAG);
+ // Add the high halfwords from the products of the low halfwords.
+ SDValue T4 = DAG.getNode(ISD::ADD, dl, ResTy, {T1, LoVec(T3)});
+ SDValue T5 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {T4, S16}, DAG);
+ SDValue T6 = DAG.getNode(ISD::ADD, dl, ResTy, {HiVec(T0), HiVec(T3)});
+ SDValue T7 = DAG.getNode(ISD::ADD, dl, ResTy, {T5, T6});
+ return T7;
}
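
// [Editor's illustration, not part of the patch] Scalar check of the high-half
// multiply derivations in the comments above, using 64-bit intermediates for
// the signed case. It demonstrates the algebra only; it does not model the
// 32-bit register width or the exact semantics of the HVX instructions that
// the comments name.
#include <cassert>
#include <cstdint>

static int32_t MulHS(int32_t A, int32_t B) {            // reference
  return (int32_t)(((int64_t)A * B) >> 32);
}

static int32_t MulHSSplit(int32_t A, int32_t B) {
  // Arithmetic right shifts of negative values are assumed here.
  int64_t Ah = A >> 16, Al = A & 0xFFFF;                 // Hi(Vs), Lo(Vs)
  int64_t Bh = B >> 16, Bl = B & 0xFFFF;                 // Hi(Vt), Lo(Vt)
  int64_t T0 = (Al * B) >> 16;                           // Lo(Vs)*Vt >> 16
  int64_t T1 = (Ah * Bl + T0) >> 16;                     // + Hi(Vs)*Lo(Vt), >> 16
  return (int32_t)(Ah * Bh + T1);                        // + Hi(Vs)*Hi(Vt)
}

static uint32_t MulHU(uint32_t A, uint32_t B) {          // reference
  return (uint32_t)(((uint64_t)A * B) >> 32);
}

static uint32_t MulHUSplit(uint32_t A, uint32_t B) {
  uint32_t Ah = A >> 16, Al = A & 0xFFFF;
  uint32_t Bh = B >> 16, Bl = B & 0xFFFF;
  uint32_t C  = (Al * Bl) >> 16;                         // high half of low product
  uint32_t X1 = Ah * Bl, X2 = Al * Bh;                   // the two cross products
  uint32_t LoSum = (X1 & 0xFFFF) + (X2 & 0xFFFF);        // sums of their low halves
  uint32_t HiSum = (X1 >> 16) + (X2 >> 16);              // sums of their high halves
  return ((C + LoSum) >> 16) + (Ah * Bh + HiSum);
}

int main() {
  const int32_t SVals[] = {0, 1, -1, 3, -7, 123456789, -987654321,
                           0x7FFFFFFF, (int32_t)0x80000000};
  for (int32_t A : SVals)
    for (int32_t B : SVals)
      assert(MulHS(A, B) == MulHSSplit(A, B));

  const uint32_t UVals[] = {0u, 1u, 0xFFFFu, 0x10000u, 0x89ABCDEFu, 0xFFFFFFFFu};
  for (uint32_t A : UVals)
    for (uint32_t B : UVals)
      assert(MulHU(A, B) == MulHUSplit(A, B));
  return 0;
}
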
SDValue
@@ -543,3 +1358,163 @@ HexagonTargetLowering::LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG);
return DAG.getZeroExtendVectorInReg(Op.getOperand(0), SDLoc(Op), ty(Op));
}
+
+SDValue
+HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const {
+ if (SDValue S = getVectorShiftByInt(Op, DAG))
+ return S;
+ return Op;
+}
+
+SDValue
+HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const {
+ assert(!Op.isMachineOpcode());
+ SmallVector<SDValue,2> OpsL, OpsH;
+ const SDLoc &dl(Op);
+
+ auto SplitVTNode = [&DAG,this] (const VTSDNode *N) {
+ MVT Ty = typeSplit(N->getVT().getSimpleVT()).first;
+ SDValue TV = DAG.getValueType(Ty);
+ return std::make_pair(TV, TV);
+ };
+
+ for (SDValue A : Op.getNode()->ops()) {
+ VectorPair P = Subtarget.isHVXVectorType(ty(A), true)
+ ? opSplit(A, dl, DAG)
+ : std::make_pair(A, A);
+ // Special case for type operand.
+ if (Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ if (const auto *N = dyn_cast<const VTSDNode>(A.getNode()))
+ P = SplitVTNode(N);
+ }
+ OpsL.push_back(P.first);
+ OpsH.push_back(P.second);
+ }
+
+ MVT ResTy = ty(Op);
+ MVT HalfTy = typeSplit(ResTy).first;
+ SDValue L = DAG.getNode(Op.getOpcode(), dl, HalfTy, OpsL);
+ SDValue H = DAG.getNode(Op.getOpcode(), dl, HalfTy, OpsH);
+ SDValue S = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, L, H);
+ return S;
+}
+
+SDValue
+HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const {
+ LSBaseSDNode *BN = cast<LSBaseSDNode>(Op.getNode());
+ assert(BN->isUnindexed());
+ MVT MemTy = BN->getMemoryVT().getSimpleVT();
+ if (!isHvxPairTy(MemTy))
+ return Op;
+
+ const SDLoc &dl(Op);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT SingleTy = typeSplit(MemTy).first;
+ SDValue Chain = BN->getChain();
+ SDValue Base0 = BN->getBasePtr();
+ SDValue Base1 = DAG.getMemBasePlusOffset(Base0, HwLen, dl);
+
+ MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr;
+ if (MachineMemOperand *MMO = BN->getMemOperand()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen);
+ MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen);
+ }
+
+ unsigned MemOpc = BN->getOpcode();
+ SDValue NewOp;
+
+ if (MemOpc == ISD::LOAD) {
+ SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0);
+ SDValue Load1 = DAG.getLoad(SingleTy, dl, Chain, Base1, MOp1);
+ NewOp = DAG.getMergeValues(
+ { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1),
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load0.getValue(1), Load1.getValue(1)) }, dl);
+ } else {
+ assert(MemOpc == ISD::STORE);
+ VectorPair Vals = opSplit(cast<StoreSDNode>(Op)->getValue(), dl, DAG);
+ SDValue Store0 = DAG.getStore(Chain, dl, Vals.first, Base0, MOp0);
+ SDValue Store1 = DAG.getStore(Chain, dl, Vals.second, Base1, MOp1);
+ NewOp = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1);
+ }
+
+ return NewOp;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ bool IsPairOp = isHvxPairTy(ty(Op)) ||
+ llvm::any_of(Op.getNode()->ops(), [this] (SDValue V) {
+ return isHvxPairTy(ty(V));
+ });
+
+ if (IsPairOp) {
+ switch (Opc) {
+ default:
+ break;
+ case ISD::LOAD:
+ case ISD::STORE:
+ return SplitHvxMemOp(Op, DAG);
+ case ISD::CTPOP:
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SRA:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SETCC:
+ case ISD::VSELECT:
+ case ISD::SIGN_EXTEND_INREG:
+ return SplitHvxPairOp(Op, DAG);
+ }
+ }
+
+ switch (Opc) {
+ default:
+ break;
+ case ISD::BUILD_VECTOR: return LowerHvxBuildVector(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerHvxConcatVectors(Op, DAG);
+ case ISD::INSERT_SUBVECTOR: return LowerHvxInsertSubvector(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerHvxInsertElement(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerHvxExtractSubvector(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerHvxExtractElement(Op, DAG);
+
+ case ISD::ANY_EXTEND: return LowerHvxAnyExt(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerHvxSignExt(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerHvxZeroExt(Op, DAG);
+ case ISD::CTTZ: return LowerHvxCttz(Op, DAG);
+ case ISD::SRA:
+ case ISD::SHL:
+ case ISD::SRL: return LowerHvxShift(Op, DAG);
+ case ISD::MUL: return LowerHvxMul(Op, DAG);
+ case ISD::MULHS:
+ case ISD::MULHU: return LowerHvxMulh(Op, DAG);
+ case ISD::ANY_EXTEND_VECTOR_INREG: return LowerHvxExtend(Op, DAG);
+ case ISD::SETCC:
+ case ISD::INTRINSIC_VOID: return Op;
+ // Unaligned loads will be handled by the default lowering.
+ case ISD::LOAD: return SDValue();
+ }
+#ifndef NDEBUG
+ Op.dumpr(&DAG);
+#endif
+ llvm_unreachable("Unhandled HVX operation");
+}
+
+bool
+HexagonTargetLowering::isHvxOperation(SDValue Op) const {
+  // If the result type or any of the operand types is an HVX vector type,
+  // this is an HVX operation.
+ return Subtarget.isHVXVectorType(ty(Op), true) ||
+ llvm::any_of(Op.getNode()->ops(),
+ [this] (SDValue V) {
+ return Subtarget.isHVXVectorType(ty(V), true);
+ });
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
index 14bda0e0107d..1347a655353f 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -19,4 +19,4 @@ class CVI_VA_Resource<dag outs, dag ins, string asmstr,
list<dag> pattern = [], string cstr = "",
InstrItinClass itin = CVI_VA>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+ OpcodeHexagon, Requires<[HasV60, UseHVX]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index b82a0157e81f..6019c7c5d024 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -34,7 +34,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -49,6 +48,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -134,7 +134,7 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
MachineBasicBlock::const_instr_iterator MIE) {
unsigned Count = 0;
for (; MIB != MIE; ++MIB) {
- if (!MIB->isDebugValue())
+ if (!MIB->isDebugInstr())
++Count;
}
return Count;
@@ -144,9 +144,9 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
/// On Hexagon, we have two instructions used to set-up the hardware loop
/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
/// to indicate the end of a loop.
-static MachineInstr *findLoopInstr(MachineBasicBlock *BB, unsigned EndLoopOp,
- MachineBasicBlock *TargetBB,
- SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
+MachineInstr *HexagonInstrInfo::findLoopInstr(MachineBasicBlock *BB,
+ unsigned EndLoopOp, MachineBasicBlock *TargetBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &Visited) const {
unsigned LOOPi;
unsigned LOOPr;
if (EndLoopOp == Hexagon::ENDLOOP0) {
@@ -240,41 +240,41 @@ static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
- default:
- break;
- case Hexagon::L2_loadri_io:
- case Hexagon::L2_loadrd_io:
- case Hexagon::V6_vL32b_ai:
- case Hexagon::V6_vL32b_nt_ai:
- case Hexagon::V6_vL32Ub_ai:
- case Hexagon::LDriw_pred:
- case Hexagon::LDriw_mod:
- case Hexagon::PS_vloadrq_ai:
- case Hexagon::PS_vloadrw_ai:
- case Hexagon::PS_vloadrw_nt_ai: {
- const MachineOperand OpFI = MI.getOperand(1);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand OpOff = MI.getOperand(2);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(0).getReg();
- }
+ default:
+ break;
+ case Hexagon::L2_loadri_io:
+ case Hexagon::L2_loadrd_io:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vL32b_nt_ai:
+ case Hexagon::V6_vL32Ub_ai:
+ case Hexagon::LDriw_pred:
+ case Hexagon::LDriw_ctr:
+ case Hexagon::PS_vloadrq_ai:
+ case Hexagon::PS_vloadrw_ai:
+ case Hexagon::PS_vloadrw_nt_ai: {
+ const MachineOperand OpFI = MI.getOperand(1);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand OpOff = MI.getOperand(2);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(0).getReg();
+ }
- case Hexagon::L2_ploadrit_io:
- case Hexagon::L2_ploadrif_io:
- case Hexagon::L2_ploadrdt_io:
- case Hexagon::L2_ploadrdf_io: {
- const MachineOperand OpFI = MI.getOperand(2);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand OpOff = MI.getOperand(3);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(0).getReg();
- }
+ case Hexagon::L2_ploadrit_io:
+ case Hexagon::L2_ploadrif_io:
+ case Hexagon::L2_ploadrdt_io:
+ case Hexagon::L2_ploadrdf_io: {
+ const MachineOperand OpFI = MI.getOperand(2);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand OpOff = MI.getOperand(3);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(0).getReg();
+ }
}
return 0;
@@ -288,48 +288,84 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
- default:
- break;
- case Hexagon::S2_storerb_io:
- case Hexagon::S2_storerh_io:
- case Hexagon::S2_storeri_io:
- case Hexagon::S2_storerd_io:
- case Hexagon::V6_vS32b_ai:
- case Hexagon::V6_vS32Ub_ai:
- case Hexagon::STriw_pred:
- case Hexagon::STriw_mod:
- case Hexagon::PS_vstorerq_ai:
- case Hexagon::PS_vstorerw_ai: {
- const MachineOperand &OpFI = MI.getOperand(0);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand &OpOff = MI.getOperand(1);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(2).getReg();
+ default:
+ break;
+ case Hexagon::S2_storerb_io:
+ case Hexagon::S2_storerh_io:
+ case Hexagon::S2_storeri_io:
+ case Hexagon::S2_storerd_io:
+ case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vS32Ub_ai:
+ case Hexagon::STriw_pred:
+ case Hexagon::STriw_ctr:
+ case Hexagon::PS_vstorerq_ai:
+ case Hexagon::PS_vstorerw_ai: {
+ const MachineOperand &OpFI = MI.getOperand(0);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(1);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(2).getReg();
+ }
+
+ case Hexagon::S2_pstorerbt_io:
+ case Hexagon::S2_pstorerbf_io:
+ case Hexagon::S2_pstorerht_io:
+ case Hexagon::S2_pstorerhf_io:
+ case Hexagon::S2_pstorerit_io:
+ case Hexagon::S2_pstorerif_io:
+ case Hexagon::S2_pstorerdt_io:
+ case Hexagon::S2_pstorerdf_io: {
+ const MachineOperand &OpFI = MI.getOperand(1);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(2);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(3).getReg();
+ }
}
- case Hexagon::S2_pstorerbt_io:
- case Hexagon::S2_pstorerbf_io:
- case Hexagon::S2_pstorerht_io:
- case Hexagon::S2_pstorerhf_io:
- case Hexagon::S2_pstorerit_io:
- case Hexagon::S2_pstorerif_io:
- case Hexagon::S2_pstorerdt_io:
- case Hexagon::S2_pstorerdf_io: {
- const MachineOperand &OpFI = MI.getOperand(1);
- if (!OpFI.isFI())
- return 0;
- const MachineOperand &OpOff = MI.getOperand(2);
- if (!OpOff.isImm() || OpOff.getImm() != 0)
- return 0;
- FrameIndex = OpFI.getIndex();
- return MI.getOperand(3).getReg();
+ return 0;
+}
+
+/// This function checks if the instruction or bundle of instructions
+/// has load from stack slot and returns frameindex and machine memory
+/// operand of that instruction if true.
+bool HexagonInstrInfo::hasLoadFromStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ if (MI.isBundle()) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::const_instr_iterator MII = MI.getIterator();
+ for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
+ if (TargetInstrInfo::hasLoadFromStackSlot(*MII, MMO, FrameIndex))
+ return true;
+ return false;
}
+
+ return TargetInstrInfo::hasLoadFromStackSlot(MI, MMO, FrameIndex);
+}
+
+/// This function checks if the instruction or bundle of instructions
+/// has store to stack slot and returns frameindex and machine memory
+/// operand of that instruction if true.
+bool HexagonInstrInfo::hasStoreToStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ if (MI.isBundle()) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::const_instr_iterator MII = MI.getIterator();
+ for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
+ if (TargetInstrInfo::hasStoreToStackSlot(*MII, MMO, FrameIndex))
+ return true;
+ return false;
}
- return 0;
+ return TargetInstrInfo::hasStoreToStackSlot(MI, MMO, FrameIndex);
}
/// This function can analyze one/two way branching only and should (mostly) be
@@ -383,7 +419,7 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
I = MBB.instr_end();
--I;
- while (I->isDebugValue()) {
+ while (I->isDebugInstr()) {
if (I == MBB.instr_begin())
return false;
--I;
@@ -394,7 +430,7 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// Delete the J2_jump if it's equivalent to a fall-through.
if (AllowModify && JumpToBlock &&
MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
- DEBUG(dbgs() << "\nErasing the jump to successor block\n";);
+ LLVM_DEBUG(dbgs() << "\nErasing the jump to successor block\n";);
I->eraseFromParent();
I = MBB.instr_end();
if (I == MBB.instr_begin())
@@ -463,8 +499,8 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
Cond.push_back(LastInst->getOperand(1));
return false;
}
- DEBUG(dbgs() << "\nCant analyze " << printMBBReference(MBB)
- << " with one jump\n";);
+    LLVM_DEBUG(dbgs() << "\nCan't analyze " << printMBBReference(MBB)
+                      << " with one jump\n";);
// Otherwise, don't know what this is.
return true;
}
@@ -511,8 +547,8 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
FBB = LastInst->getOperand(0).getMBB();
return false;
}
- DEBUG(dbgs() << "\nCant analyze " << printMBBReference(MBB)
- << " with two jumps";);
+    LLVM_DEBUG(dbgs() << "\nCan't analyze " << printMBBReference(MBB)
+                      << " with two jumps";);
// Otherwise, can't handle this.
return true;
}
@@ -521,12 +557,12 @@ unsigned HexagonInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
assert(!BytesRemoved && "code size not handled");
- DEBUG(dbgs() << "\nRemoving branches out of " << printMBBReference(MBB));
+ LLVM_DEBUG(dbgs() << "\nRemoving branches out of " << printMBBReference(MBB));
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Only removing branches from end of MBB.
if (!I->isBranch())
@@ -593,7 +629,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
// (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset)
// (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset)
unsigned Flags1 = getUndefRegState(Cond[1].isUndef());
- DEBUG(dbgs() << "\nInserting NVJump for " << printMBBReference(MBB););
+ LLVM_DEBUG(dbgs() << "\nInserting NVJump for "
+ << printMBBReference(MBB););
if (Cond[2].isReg()) {
unsigned Flags2 = getUndefRegState(Cond[2].isUndef());
BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
@@ -864,7 +901,7 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(Hexagon::STriw_mod))
+ BuildMI(MBB, I, DL, get(Hexagon::STriw_ctr))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::HvxQRRegClass.hasSubClassEq(RC)) {
@@ -926,7 +963,7 @@ void HexagonInstrInfo::loadRegFromStackSlot(
BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(Hexagon::LDriw_mod), DestReg)
+ BuildMI(MBB, I, DL, get(Hexagon::LDriw_ctr), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::HvxQRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::PS_vloadrq_ai), DestReg)
@@ -980,6 +1017,20 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
DebugLoc DL = MI.getDebugLoc();
unsigned Opc = MI.getOpcode();
+ auto RealCirc = [&](unsigned Opc, bool HasImm, unsigned MxOp) {
+ unsigned Mx = MI.getOperand(MxOp).getReg();
+ unsigned CSx = (Mx == Hexagon::M0 ? Hexagon::CS0 : Hexagon::CS1);
+ BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrrcr), CSx)
+ .add(MI.getOperand((HasImm ? 5 : 4)));
+ auto MIB = BuildMI(MBB, MI, DL, get(Opc)).add(MI.getOperand(0))
+ .add(MI.getOperand(1)).add(MI.getOperand(2)).add(MI.getOperand(3));
+ if (HasImm)
+ MIB.add(MI.getOperand(4));
+ MIB.addReg(CSx, RegState::Implicit);
+ MBB.erase(MI);
+ return true;
+ };
+
switch (Opc) {
case TargetOpcode::COPY: {
MachineOperand &MD = MI.getOperand(0);
@@ -1088,6 +1139,28 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB.erase(MI);
return true;
}
+ case Hexagon::PS_qtrue: {
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_veqw), MI.getOperand(0).getReg())
+ .addReg(Hexagon::V0, RegState::Undef)
+ .addReg(Hexagon::V0, RegState::Undef);
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::PS_qfalse: {
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vgtw), MI.getOperand(0).getReg())
+ .addReg(Hexagon::V0, RegState::Undef)
+ .addReg(Hexagon::V0, RegState::Undef);
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::PS_vdd0: {
+ unsigned Vd = MI.getOperand(0).getReg();
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vsubw_dv), Vd)
+ .addReg(Vd, RegState::Undef)
+ .addReg(Vd, RegState::Undef);
+ MBB.erase(MI);
+ return true;
+ }
case Hexagon::PS_vmulw: {
// Expand a 64-bit vector multiply into 2 32-bit scalar multiplies.
unsigned DstReg = MI.getOperand(0).getReg();
@@ -1344,6 +1417,50 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB.erase(MI);
return true;
+ case Hexagon::PS_loadrub_pci:
+ return RealCirc(Hexagon::L2_loadrub_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrb_pci:
+ return RealCirc(Hexagon::L2_loadrb_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadruh_pci:
+ return RealCirc(Hexagon::L2_loadruh_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrh_pci:
+ return RealCirc(Hexagon::L2_loadrh_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadri_pci:
+ return RealCirc(Hexagon::L2_loadri_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrd_pci:
+ return RealCirc(Hexagon::L2_loadrd_pci, /*HasImm*/true, /*MxOp*/4);
+ case Hexagon::PS_loadrub_pcr:
+ return RealCirc(Hexagon::L2_loadrub_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadrb_pcr:
+ return RealCirc(Hexagon::L2_loadrb_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadruh_pcr:
+ return RealCirc(Hexagon::L2_loadruh_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadrh_pcr:
+ return RealCirc(Hexagon::L2_loadrh_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadri_pcr:
+ return RealCirc(Hexagon::L2_loadri_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_loadrd_pcr:
+ return RealCirc(Hexagon::L2_loadrd_pcr, /*HasImm*/false, /*MxOp*/3);
+ case Hexagon::PS_storerb_pci:
+ return RealCirc(Hexagon::S2_storerb_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerh_pci:
+ return RealCirc(Hexagon::S2_storerh_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerf_pci:
+ return RealCirc(Hexagon::S2_storerf_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storeri_pci:
+ return RealCirc(Hexagon::S2_storeri_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerd_pci:
+ return RealCirc(Hexagon::S2_storerd_pci, /*HasImm*/true, /*MxOp*/3);
+ case Hexagon::PS_storerb_pcr:
+ return RealCirc(Hexagon::S2_storerb_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storerh_pcr:
+ return RealCirc(Hexagon::S2_storerh_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storerf_pcr:
+ return RealCirc(Hexagon::S2_storerf_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storeri_pcr:
+ return RealCirc(Hexagon::S2_storeri_pcr, /*HasImm*/false, /*MxOp*/2);
+ case Hexagon::PS_storerd_pcr:
+ return RealCirc(Hexagon::S2_storerd_pcr, /*HasImm*/false, /*MxOp*/2);
}
return false;
@@ -1393,7 +1510,7 @@ bool HexagonInstrInfo::PredicateInstruction(
MachineInstr &MI, ArrayRef<MachineOperand> Cond) const {
if (Cond.empty() || isNewValueJump(Cond[0].getImm()) ||
isEndLoopN(Cond[0].getImm())) {
- DEBUG(dbgs() << "\nCannot predicate:"; MI.dump(););
+ LLVM_DEBUG(dbgs() << "\nCannot predicate:"; MI.dump(););
return false;
}
int Opc = MI.getOpcode();
@@ -1483,7 +1600,7 @@ bool HexagonInstrInfo::isPredicable(const MachineInstr &MI) const {
}
// HVX loads are not predicable on v60, but are on v62.
- if (!Subtarget.hasV62TOps()) {
+ if (!Subtarget.hasV62Ops()) {
switch (MI.getOpcode()) {
case Hexagon::V6_vL32b_ai:
case Hexagon::V6_vL32b_pi:
@@ -1518,7 +1635,7 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// considered a scheduling hazard, which is wrong. It should be the actual
// instruction preceding the dbg_value instruction(s), just like it is
// when debug info is not present.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return false;
// Throwing call is a boundary.
@@ -1586,7 +1703,7 @@ HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
-/// \brief For a comparison instruction, return the source registers in
+/// For a comparison instruction, return the source registers in
/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
@@ -1836,6 +1953,10 @@ bool HexagonInstrInfo::isAccumulator(const MachineInstr &MI) const {
return((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
}
+bool HexagonInstrInfo::isBaseImmOffset(const MachineInstr &MI) const {
+ return getAddrMode(MI) == HexagonII::BaseImmOffset;
+}
+
bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
return !isTC1(MI) && !isTC2Early(MI) && !MI.getDesc().mayLoad() &&
!MI.getDesc().mayStore() &&
@@ -2139,13 +2260,13 @@ bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
bool isLate = isLateResultInstr(LRMI);
bool isEarly = isEarlySourceInstr(ESMI);
- DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- "));
- DEBUG(LRMI.dump());
- DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- "));
- DEBUG(ESMI.dump());
+ LLVM_DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- "));
+ LLVM_DEBUG(LRMI.dump());
+ LLVM_DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- "));
+ LLVM_DEBUG(ESMI.dump());
if (isLate && isEarly) {
- DEBUG(dbgs() << "++Is Late Result feeding Early Source\n");
+ LLVM_DEBUG(dbgs() << "++Is Late Result feeding Early Source\n");
return true;
}
@@ -2472,6 +2593,13 @@ bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, int Offset) const {
case MVT::i16:
case MVT::i32:
case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::v2i16:
+ case MVT::v2i32:
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v8i8:
return isInt<4>(Count);
// For HVX vectors the auto-inc is s3
case MVT::v64i8:
@@ -2599,8 +2727,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
// any size. Later pass knows how to handle it.
case Hexagon::STriw_pred:
case Hexagon::LDriw_pred:
- case Hexagon::STriw_mod:
- case Hexagon::LDriw_mod:
+ case Hexagon::STriw_ctr:
+ case Hexagon::LDriw_ctr:
return true;
case Hexagon::PS_fi:
@@ -2754,7 +2882,7 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
return false;
}
-/// \brief Get the base register and byte offset of a load/store instr.
+/// Get the base register and byte offset of a load/store instr.
bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
const {
@@ -2765,7 +2893,7 @@ bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
return BaseReg != 0;
}
-/// \brief Can these instructions execute at the same time in a bundle.
+/// Can these instructions execute at the same time in a bundle.
bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr &First,
const MachineInstr &Second) const {
if (Second.mayStore() && First.getOpcode() == Hexagon::S2_allocframe) {
@@ -2860,11 +2988,14 @@ bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B)
bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return ((F >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) &&
- Subtarget.hasV60TOps();
+ Subtarget.hasV60Ops();
}
// Returns true, if a ST insn can be promoted to a new-value store.
bool HexagonInstrInfo::mayBeNewStore(const MachineInstr &MI) const {
+ if (MI.mayStore() && !Subtarget.useNewValueStores())
+ return false;
+
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask;
}
@@ -2917,10 +3048,29 @@ bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
return false;
}
- // Hexagon Programmer's Reference says that decbin, memw_locked, and
- // memd_locked cannot be used as .new as well,
- // but we don't seem to have these instructions defined.
- return MI.getOpcode() != Hexagon::A4_tlbmatch;
+  // Instructions that produce a late predicate cannot be used as sources of
+  // dot-new.
+ switch (MI.getOpcode()) {
+ case Hexagon::A4_addp_c:
+ case Hexagon::A4_subp_c:
+ case Hexagon::A4_tlbmatch:
+ case Hexagon::A5_ACS:
+ case Hexagon::F2_sfinvsqrta:
+ case Hexagon::F2_sfrecipa:
+ case Hexagon::J2_endloop0:
+ case Hexagon::J2_endloop01:
+ case Hexagon::J2_ploop1si:
+ case Hexagon::J2_ploop1sr:
+ case Hexagon::J2_ploop2si:
+ case Hexagon::J2_ploop2sr:
+ case Hexagon::J2_ploop3si:
+ case Hexagon::J2_ploop3sr:
+ case Hexagon::S2_cabacdecbin:
+ case Hexagon::S2_storew_locked:
+ case Hexagon::S4_stored_locked:
+ return false;
+ }
+ return true;
}
bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
@@ -3047,7 +3197,7 @@ SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
I = MBB.instr_end();
--I;
- while (I->isDebugValue()) {
+ while (I->isDebugInstr()) {
if (I == MBB.instr_begin())
return Jumpers;
--I;
@@ -3496,7 +3646,7 @@ int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const {
assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
}
- if (Subtarget.hasV60TOps())
+ if (Subtarget.hasV60Ops())
return NewOp;
// Subtargets prior to V60 didn't support 'taken' forms of predicated jumps.
@@ -3893,9 +4043,9 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
// Get DefIdx and UseIdx for super registers.
- MachineOperand DefMO = DefMI.getOperand(DefIdx);
+ const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
- if (HRI.isPhysicalRegister(DefMO.getReg())) {
+ if (DefMO.isReg() && HRI.isPhysicalRegister(DefMO.getReg())) {
if (DefMO.isImplicit()) {
for (MCSuperRegIterator SR(DefMO.getReg(), &HRI); SR.isValid(); ++SR) {
int Idx = DefMI.findRegisterDefOperandIdx(*SR, false, false, &HRI);
@@ -3906,7 +4056,7 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
}
- MachineOperand UseMO = UseMI.getOperand(UseIdx);
+ const MachineOperand &UseMO = UseMI.getOperand(UseIdx);
if (UseMO.isImplicit()) {
for (MCSuperRegIterator SR(UseMO.getReg(), &HRI); SR.isValid(); ++SR) {
int Idx = UseMI.findRegisterUseOperandIdx(*SR, false, &HRI);
@@ -4057,7 +4207,7 @@ bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
return false;
assert(Cond.size() == 2);
if (isNewValueJump(Cond[0].getImm()) || Cond[1].isMBB()) {
- DEBUG(dbgs() << "No predregs for new-value jumps/endloop");
+ LLVM_DEBUG(dbgs() << "No predregs for new-value jumps/endloop");
return false;
}
PredReg = Cond[1].getReg();
@@ -4084,7 +4234,7 @@ short HexagonInstrInfo::getRegForm(const MachineInstr &MI) const {
// use a constant extender, which requires another 4 bytes.
// For debug instructions and prolog labels, return 0.
unsigned HexagonInstrInfo::getSize(const MachineInstr &MI) const {
- if (MI.isDebugValue() || MI.isPosition())
+ if (MI.isDebugInstr() || MI.isPosition())
return 0;
unsigned Size = MI.getDesc().getSize();
@@ -4159,9 +4309,9 @@ void HexagonInstrInfo::immediateExtend(MachineInstr &MI) const {
bool HexagonInstrInfo::invertAndChangeJumpTarget(
MachineInstr &MI, MachineBasicBlock *NewTarget) const {
- DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to "
- << printMBBReference(*NewTarget);
- MI.dump(););
+ LLVM_DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to "
+ << printMBBReference(*NewTarget);
+ MI.dump(););
assert(MI.isBranch());
unsigned NewOpcode = getInvertedPredicatedOpcode(MI.getOpcode());
int TargetPos = MI.getNumOperands() - 1;
@@ -4189,8 +4339,9 @@ void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
for (unsigned insn = TargetOpcode::GENERIC_OP_END+1;
insn < Hexagon::INSTRUCTION_LIST_END; ++insn) {
NewMI = BuildMI(B, I, DL, get(insn));
- DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) <<
- " Class: " << NewMI->getDesc().getSchedClass());
+ LLVM_DEBUG(dbgs() << "\n"
+ << getName(NewMI->getOpcode())
+ << " Class: " << NewMI->getDesc().getSchedClass());
NewMI->eraseFromParent();
}
/* --- The code above is used to generate complete set of Hexagon Insn --- */
@@ -4200,7 +4351,7 @@ void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
// p -> NotP
// NotP -> P
bool HexagonInstrInfo::reversePredSense(MachineInstr &MI) const {
- DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI.dump());
+ LLVM_DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI.dump());
MI.setDesc(get(getInvertedPredicatedOpcode(MI.getOpcode())));
return true;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 4530d3b999cc..96b4ffaba02f 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -18,9 +18,9 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/MachineValueType.h"
#include <cstdint>
#include <vector>
@@ -66,6 +66,20 @@ public:
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ /// Check if the instruction or the bundle of instructions has
+ /// load from stack slots. Return the frameindex and machine memory operand
+ /// if true.
+ bool hasLoadFromStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const override;
+
+ /// Check if the instruction or the bundle of instructions has
+ /// store to stack slots. Return the frameindex and machine memory operand
+ /// if true.
+ bool hasStoreToStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const override;
+
/// Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target). Upon success, this returns false and returns
@@ -122,8 +136,8 @@ public:
bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
MachineInstr *&CmpInst) const override;
- /// Generate code to reduce the loop iteration by one and check if the loop is
- /// finished. Return the value/register of the the new loop count. We need
+ /// Generate code to reduce the loop iteration by one and check if the loop
+ /// is finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
unsigned reduceLoopCount(MachineBasicBlock &MBB,
@@ -201,7 +215,7 @@ public:
/// anything was changed.
bool expandPostRAPseudo(MachineInstr &MI) const override;
- /// \brief Get the base register and byte offset of a load/store instr.
+ /// Get the base register and byte offset of a load/store instr.
bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
@@ -332,7 +346,11 @@ public:
/// HexagonInstrInfo specifics.
unsigned createVR(MachineFunction *MF, MVT VT) const;
+ MachineInstr *findLoopInstr(MachineBasicBlock *BB, unsigned EndLoopOp,
+ MachineBasicBlock *TargetBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &Visited) const;
+ bool isBaseImmOffset(const MachineInstr &MI) const;
bool isAbsoluteSet(const MachineInstr &MI) const;
bool isAccumulator(const MachineInstr &MI) const;
bool isAddrModeWithOffset(const MachineInstr &MI) const;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
index 1df143de6e80..b25e316709c5 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -807,7 +807,6 @@ def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32_0ImmPred, s8_0ImmPred>;
// Shift halfword
def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>;
def : T_R_pat<A2_asrh, int_hexagon_A2_asrh>;
-def : T_R_pat<A2_asrh, int_hexagon_SI_to_SXTHI_asrh>;
// Sign/zero extend
def : T_R_pat<A2_sxth, int_hexagon_A2_sxth>;
@@ -1353,11 +1352,11 @@ class T_stb_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Val>
: Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru),
(MI I32:$Rs, I32:$Ru, Val:$Rt)>;
-def : T_stb_pat <S2_storerh_pbr, int_hexagon_brev_sth, I32>;
-def : T_stb_pat <S2_storerb_pbr, int_hexagon_brev_stb, I32>;
-def : T_stb_pat <S2_storeri_pbr, int_hexagon_brev_stw, I32>;
-def : T_stb_pat <S2_storerf_pbr, int_hexagon_brev_sthhi, I32>;
-def : T_stb_pat <S2_storerd_pbr, int_hexagon_brev_std, I64>;
+def : T_stb_pat <S2_storerh_pbr, int_hexagon_S2_storerh_pbr, I32>;
+def : T_stb_pat <S2_storerb_pbr, int_hexagon_S2_storerb_pbr, I32>;
+def : T_stb_pat <S2_storeri_pbr, int_hexagon_S2_storeri_pbr, I32>;
+def : T_stb_pat <S2_storerf_pbr, int_hexagon_S2_storerf_pbr, I32>;
+def : T_stb_pat <S2_storerd_pbr, int_hexagon_S2_storerd_pbr, I64>;
class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val>
: Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s),
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
index f27a63e20e61..29f67cffcf89 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -9,7 +9,7 @@
//Rdd[+]=vrmpybsu(Rss,Rtt)
//Rdd[+]=vrmpybuu(Rss,Rtt)
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def : T_PP_pat <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
def : T_PP_pat <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 2646d0bcbf47..f9ed03909233 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -26,6 +26,7 @@
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -56,7 +57,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
#include <algorithm>
#include <array>
#include <cassert>
@@ -243,8 +244,8 @@ namespace {
const Value *V;
};
- raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
- raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
+ LLVM_ATTRIBUTE_USED
+ raw_ostream &operator<<(raw_ostream &OS, const PE &P) {
P.C.print(OS, P.V ? P.V : P.C.Root);
return OS;
}
@@ -608,9 +609,9 @@ namespace {
unsigned getInverseMxN(unsigned QP);
Value *generate(BasicBlock::iterator At, ParsedValues &PV);
- void setupSimplifier();
+ void setupPreSimplifier(Simplifier &S);
+ void setupPostSimplifier(Simplifier &S);
- Simplifier Simp;
Loop *CurLoop;
const DataLayout &DL;
const DominatorTree &DT;
@@ -985,6 +986,7 @@ bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
case Instruction::Xor:
case Instruction::LShr: // Shift right is ok.
case Instruction::Select:
+ case Instruction::Trunc:
return true;
case Instruction::ICmp:
if (CmpInst *CI = cast<CmpInst>(In))
@@ -998,6 +1000,8 @@ bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
IntegerType *DestTy, BasicBlock *LoopB) {
+ Type *OrigTy = In->getType();
+
// Leave boolean values alone.
if (!In->getType()->isIntegerTy(1))
In->mutateType(DestTy);
@@ -1028,6 +1032,14 @@ void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
Z->eraseFromParent();
return;
}
+ if (TruncInst *T = dyn_cast<TruncInst>(In)) {
+ IntegerType *TruncTy = cast<IntegerType>(OrigTy);
+ Value *Mask = ConstantInt::get(DestTy, (1u << TruncTy->getBitWidth()) - 1);
+ Value *And = IRBuilder<>(In).CreateAnd(T->getOperand(0), Mask);
+ T->replaceAllUsesWith(And);
+ T->eraseFromParent();
+ return;
+ }
// Promote immediates.
for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
@@ -1569,8 +1581,8 @@ static bool hasZeroSignBit(const Value *V) {
return false;
}
-void PolynomialMultiplyRecognize::setupSimplifier() {
- Simp.addRule("sink-zext",
+void PolynomialMultiplyRecognize::setupPreSimplifier(Simplifier &S) {
+ S.addRule("sink-zext",
// Sink zext past bitwise operations.
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::ZExt)
@@ -1591,7 +1603,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
B.CreateZExt(T->getOperand(0), I->getType()),
B.CreateZExt(T->getOperand(1), I->getType()));
});
- Simp.addRule("xor/and -> and/xor",
+ S.addRule("xor/and -> and/xor",
// (xor (and x a) (and y a)) -> (and (xor x y) a)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::Xor)
@@ -1609,7 +1621,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
And0->getOperand(1));
});
- Simp.addRule("sink binop into select",
+ S.addRule("sink binop into select",
// (Op (select c x y) z) -> (select c (Op x z) (Op y z))
// (Op x (select c y z)) -> (select c (Op x y) (Op x z))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1635,7 +1647,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
}
return nullptr;
});
- Simp.addRule("fold select-select",
+ S.addRule("fold select-select",
// (select c (select c x y) z) -> (select c x z)
// (select c x (select c y z)) -> (select c x z)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1654,7 +1666,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
}
return nullptr;
});
- Simp.addRule("or-signbit -> xor-signbit",
+ S.addRule("or-signbit -> xor-signbit",
// (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::Or)
@@ -1666,7 +1678,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
return nullptr;
return IRBuilder<>(Ctx).CreateXor(I->getOperand(0), Msb);
});
- Simp.addRule("sink lshr into binop",
+ S.addRule("sink lshr into binop",
// (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::LShr)
@@ -1688,7 +1700,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
B.CreateLShr(BitOp->getOperand(0), S),
B.CreateLShr(BitOp->getOperand(1), S));
});
- Simp.addRule("expose bitop-const",
+ S.addRule("expose bitop-const",
// (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
auto IsBitOp = [](unsigned Op) -> bool {
@@ -1717,16 +1729,44 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
});
}
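
// [Editor's illustration, not part of the patch] Exhaustive 4-bit checks of
// two of the bitwise rewrites registered above; a standalone sketch of the
// algebra, independent of the Simplifier machinery.
#include <cassert>

int main() {
  for (unsigned X = 0; X < 16; ++X)
    for (unsigned Y = 0; Y < 16; ++Y)
      for (unsigned A = 0; A < 16; ++A) {
        // (xor (and x a) (and y a)) == (and (xor x y) a)
        assert(((X & A) ^ (Y & A)) == ((X ^ Y) & A));
        // (lshr (BitOp x y) c) == (BitOp (lshr x c) (lshr y c)), BitOp = xor, c = 1
        assert(((X ^ Y) >> 1) == ((X >> 1) ^ (Y >> 1)));
      }
  return 0;
}
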
+void PolynomialMultiplyRecognize::setupPostSimplifier(Simplifier &S) {
+ S.addRule("(and (xor (and x a) y) b) -> (and (xor x y) b), if b == b&a",
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::And)
+ return nullptr;
+ Instruction *Xor = dyn_cast<Instruction>(I->getOperand(0));
+ ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!Xor || !C0)
+ return nullptr;
+ if (Xor->getOpcode() != Instruction::Xor)
+ return nullptr;
+ Instruction *And0 = dyn_cast<Instruction>(Xor->getOperand(0));
+ Instruction *And1 = dyn_cast<Instruction>(Xor->getOperand(1));
+ // Pick the first non-null and.
+ if (!And0 || And0->getOpcode() != Instruction::And)
+ std::swap(And0, And1);
+ ConstantInt *C1 = dyn_cast<ConstantInt>(And0->getOperand(1));
+ if (!C1)
+ return nullptr;
+ uint32_t V0 = C0->getZExtValue();
+ uint32_t V1 = C1->getZExtValue();
+ if (V0 != (V0 & V1))
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1), C0);
+ });
+}
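
// [Editor's illustration, not part of the patch] Exhaustive 4-bit check of the
// rule registered above: when b == (b & a), the outer mask makes the inner
// "and x a" redundant, i.e. (and (xor (and x a) y) b) == (and (xor x y) b).
#include <cassert>

int main() {
  for (unsigned A = 0; A < 16; ++A)
    for (unsigned B = 0; B < 16; ++B) {
      if (B != (B & A))
        continue;
      for (unsigned X = 0; X < 16; ++X)
        for (unsigned Y = 0; Y < 16; ++Y)
          assert((((X & A) ^ Y) & B) == ((X ^ Y) & B));
    }
  return 0;
}
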
+
bool PolynomialMultiplyRecognize::recognize() {
- DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
- << *CurLoop << '\n');
+ LLVM_DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
+ << *CurLoop << '\n');
// Restrictions:
// - The loop must consist of a single block.
// - The iteration count must be known at compile-time.
// - The loop must have an induction variable starting from 0, and
// incremented in each iteration of the loop.
BasicBlock *LoopB = CurLoop->getHeader();
- DEBUG(dbgs() << "Loop header:\n" << *LoopB);
+ LLVM_DEBUG(dbgs() << "Loop header:\n" << *LoopB);
if (LoopB != CurLoop->getLoopLatch())
return false;
@@ -1746,10 +1786,12 @@ bool PolynomialMultiplyRecognize::recognize() {
Value *CIV = getCountIV(LoopB);
ParsedValues PV;
+ Simplifier PreSimp;
PV.IterCount = IterCount;
- DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
+ LLVM_DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount
+ << '\n');
- setupSimplifier();
+ setupPreSimplifier(PreSimp);
// Perform a preliminary scan of select instructions to see if any of them
// looks like a generator of the polynomial multiply steps. Assume that a
@@ -1772,9 +1814,9 @@ bool PolynomialMultiplyRecognize::recognize() {
continue;
Simplifier::Context C(SI);
- Value *T = Simp.simplify(C);
+ Value *T = PreSimp.simplify(C);
SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
- DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
+ LLVM_DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
FoundPreScan = true;
if (SelI != SI) {
@@ -1787,7 +1829,7 @@ bool PolynomialMultiplyRecognize::recognize() {
}
if (!FoundPreScan) {
- DEBUG(dbgs() << "Have not found candidates for pmpy\n");
+ LLVM_DEBUG(dbgs() << "Have not found candidates for pmpy\n");
return false;
}
@@ -1798,6 +1840,24 @@ bool PolynomialMultiplyRecognize::recognize() {
// wide as the target's pmpy instruction.
if (!promoteTypes(LoopB, ExitB))
return false;
+ // Run post-promotion simplifications.
+ Simplifier PostSimp;
+ setupPostSimplifier(PostSimp);
+ for (Instruction &In : *LoopB) {
+ SelectInst *SI = dyn_cast<SelectInst>(&In);
+ if (!SI || !FeedsPHI(SI))
+ continue;
+ Simplifier::Context C(SI);
+ Value *T = PostSimp.simplify(C);
+ SelectInst *SelI = dyn_cast_or_null<SelectInst>(T);
+ if (SelI != SI) {
+ Value *NewSel = C.materialize(LoopB, SI->getIterator());
+ SI->replaceAllUsesWith(NewSel);
+ RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
+ }
+ break;
+ }
+
if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
return false;
cleanupLoopBody(LoopB);
@@ -1809,14 +1869,14 @@ bool PolynomialMultiplyRecognize::recognize() {
SelectInst *SelI = dyn_cast<SelectInst>(&In);
if (!SelI)
continue;
- DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
+ LLVM_DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
if (FoundScan)
break;
}
assert(FoundScan);
- DEBUG({
+ LLVM_DEBUG({
StringRef PP = (PV.M ? "(P+M)" : "P");
if (!PV.Inv)
dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
@@ -1910,7 +1970,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// Get the location that may be stored across the loop. Since the access
// is strided positively through memory, we say that the modified location
// starts at the pointer and has infinite size.
- uint64_t AccessSize = MemoryLocation::UnknownSize;
+ LocationSize AccessSize = MemoryLocation::UnknownSize;
// If the loop iterates a fixed number of times, we can refine the access
// size to be exactly the size of the memset, which is (BECount+1)*StoreSize
@@ -2080,7 +2140,6 @@ CleanupAndExit:
// pointer size if it isn't already.
LLVMContext &Ctx = SI->getContext();
BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
- unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
DebugLoc DLoc = SI->getDebugLoc();
const SCEV *NumBytesS =
@@ -2214,12 +2273,14 @@ CleanupAndExit:
: CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
} else {
- NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
- NumBytes, Alignment);
+ NewCall = CondBuilder.CreateMemMove(StoreBasePtr, SI->getAlignment(),
+ LoadBasePtr, LI->getAlignment(),
+ NumBytes);
}
} else {
- NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
- NumBytes, Alignment);
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
+ LoadBasePtr, LI->getAlignment(),
+ NumBytes);
// Okay, the memcpy has been formed. Zap the original store and
// anything that feeds into it.
RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
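The hunks above move from a single combined alignment to the IRBuilder overloads that take destination and source alignments separately. A minimal sketch of that overload, assuming the LLVM C++ API used by this import; the module and function names here are illustrative only:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("memcpy-example", Ctx);
  IRBuilder<> B(Ctx);

  // void copy(i8* dst, i8* src, i64 n)
  FunctionType *FT = FunctionType::get(
      B.getVoidTy(), {B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt64Ty()},
      /*isVarArg=*/false);
  Function *F = Function::Create(FT, Function::ExternalLinkage, "copy", &M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  Function::arg_iterator AI = F->arg_begin();
  Value *Dst = &*AI++;
  Value *Src = &*AI++;
  Value *Len = &*AI;

  // Destination and source alignments are passed individually instead of a
  // single min() of the two, as in the replacement code above.
  B.CreateMemCpy(Dst, /*DstAlign=*/8, Src, /*SrcAlign=*/4, Len);
  B.CreateRetVoid();

  M.print(outs(), nullptr);
  return 0;
}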
@@ -2227,15 +2288,16 @@ CleanupAndExit:
NewCall->setDebugLoc(DLoc);
- DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
- << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
+ << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
+ << "\n");
return true;
}
-// \brief Check if the instructions in Insts, together with their dependencies
+// Check if the instructions in Insts, together with their dependencies
// cover the loop in the sense that the loop could be safely eliminated once
// the instructions in Insts are removed.
bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index b1c549aa13fa..74c550ce8226 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
@@ -47,26 +48,46 @@ using namespace llvm;
static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure",
cl::Hidden, cl::ZeroOrMore, cl::init(false));
+static cl::opt<bool> UseNewerCandidate("use-newer-candidate",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level",
cl::Hidden, cl::ZeroOrMore, cl::init(1));
-static cl::opt<bool> TopUseShorterTie("top-use-shorter-tie",
- cl::Hidden, cl::ZeroOrMore, cl::init(false));
-
-static cl::opt<bool> BotUseShorterTie("bot-use-shorter-tie",
- cl::Hidden, cl::ZeroOrMore, cl::init(false));
-
-static cl::opt<bool> DisableTCTie("disable-tc-tie",
- cl::Hidden, cl::ZeroOrMore, cl::init(false));
-
// Check if the scheduler should penalize instructions that are available too
// early due to a zero-latency dependence.
static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden,
cl::ZeroOrMore, cl::init(true));
-/// Save the last formed packet
-void VLIWResourceModel::savePacket() {
- OldPacket = Packet;
+// This value is used to determine if a register class is a high pressure set.
+// We compute the maximum number of registers needed and divide it by the total
+// available. Then, we compare the result to this value.
+static cl::opt<float> RPThreshold("hexagon-reg-pressure", cl::Hidden,
+  cl::init(0.75f), cl::desc("High register pressure threshold."));
+
+/// Return true if there is a dependence between SUd and SUu.
+static bool hasDependence(const SUnit *SUd, const SUnit *SUu,
+ const HexagonInstrInfo &QII) {
+ if (SUd->Succs.size() == 0)
+ return false;
+
+ // Enable .cur formation.
+ if (QII.mayBeCurLoad(*SUd->getInstr()))
+ return false;
+
+ if (QII.canExecuteInBundle(*SUd->getInstr(), *SUu->getInstr()))
+ return false;
+
+ for (const auto &S : SUd->Succs) {
+ // Since we do not add pseudos to packets, might as well
+ // ignore order dependencies.
+ if (S.isCtrl())
+ continue;
+
+ if (S.getSUnit() == SUu && S.getLatency() > 0)
+ return true;
+ }
+ return false;
}
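A simplified standalone restatement of the hasDependence helper above; the .cur-load and bundling queries on HexagonInstrInfo are folded into plain booleans for illustration:

#include <cassert>
#include <vector>

// Illustrative stand-in for successor edges; the real code walks SUd->Succs.
struct Edge { int To; bool IsCtrl; unsigned Latency; };

static bool hasDependence(const std::vector<Edge> &Succs, int SUu,
                          bool MayBeCurLoad, bool CanBundleTogether) {
  if (Succs.empty() || MayBeCurLoad || CanBundleTogether)
    return false;
  for (const Edge &E : Succs) {
    if (E.IsCtrl)                      // order dependencies are ignored
      continue;
    if (E.To == SUu && E.Latency > 0)  // real dependence with latency
      return true;
  }
  return false;
}

int main() {
  std::vector<Edge> Succs = {{1, false, 1}, {2, true, 0}};
  assert(hasDependence(Succs, 1, false, false));   // data dep, latency > 0
  assert(!hasDependence(Succs, 2, false, false));  // only an order dep to 2
  assert(!hasDependence(Succs, 1, true, false));   // .cur load is allowed
  return 0;
}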
/// Check if scheduling of this SU is possible
@@ -74,7 +95,7 @@ void VLIWResourceModel::savePacket() {
/// It is _not_ precise (stateful); it is more like
/// another heuristic. Many corner cases are figured
/// empirically.
-bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) {
if (!SU || !SU->getInstr())
return false;
@@ -94,49 +115,39 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
break;
}
- MachineFunction &MF = *SU->getInstr()->getParent()->getParent();
- auto &QII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ MachineBasicBlock *MBB = SU->getInstr()->getParent();
+ auto &QST = MBB->getParent()->getSubtarget<HexagonSubtarget>();
+ const auto &QII = *QST.getInstrInfo();
// Now see if there are no other dependencies to instructions already
// in the packet.
- for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
- if (Packet[i]->Succs.size() == 0)
- continue;
-
- // Enable .cur formation.
- if (QII.mayBeCurLoad(*Packet[i]->getInstr()))
- continue;
-
- for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
- E = Packet[i]->Succs.end(); I != E; ++I) {
- // Since we do not add pseudos to packets, might as well
- // ignore order dependencies.
- if (I->isCtrl())
- continue;
-
- if (I->getSUnit() == SU)
+ if (IsTop) {
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+ if (hasDependence(Packet[i], SU, QII))
+ return false;
+ } else {
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+ if (hasDependence(SU, Packet[i], QII))
return false;
- }
}
return true;
}
/// Keep track of available resources.
-bool VLIWResourceModel::reserveResources(SUnit *SU) {
+bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) {
bool startNewCycle = false;
// Artificially reset state.
if (!SU) {
ResourcesModel->clearResources();
- savePacket();
Packet.clear();
TotalPackets++;
return false;
}
- // If this SU does not fit in the packet
+ // If this SU does not fit in the packet or the packet is now full
// start a new one.
- if (!isResourceAvailable(SU)) {
+ if (!isResourceAvailable(SU, IsTop) ||
+ Packet.size() >= SchedModel->getIssueWidth()) {
ResourcesModel->clearResources();
- savePacket();
Packet.clear();
TotalPackets++;
startNewCycle = true;
@@ -161,24 +172,14 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
Packet.push_back(SU);
#ifndef NDEBUG
- DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+ LLVM_DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
- DEBUG(dbgs() << "\t[" << i << "] SU(");
- DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
- DEBUG(Packet[i]->getInstr()->dump());
+ LLVM_DEBUG(dbgs() << "\t[" << i << "] SU(");
+ LLVM_DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+ LLVM_DEBUG(Packet[i]->getInstr()->dump());
}
#endif
- // If packet is now full, reset the state so in the next cycle
- // we start fresh.
- if (Packet.size() >= SchedModel->getIssueWidth()) {
- ResourcesModel->clearResources();
- savePacket();
- Packet.clear();
- TotalPackets++;
- startNewCycle = true;
- }
-
return startNewCycle;
}
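With the packet-full check folded into reserveResources above, the control flow reduces to: flush and start a new cycle when the unit does not fit or the packet already holds IssueWidth units, then place the unit. A toy standalone model (not the LLVM class) of that behaviour:

#include <cassert>
#include <vector>

// Illustrative stand-in; the real code tracks llvm::SUnit* and a DFA.
struct ToyPacketModel {
  unsigned IssueWidth = 2;
  std::vector<int> Packet;
  unsigned TotalPackets = 0;

  // Mirrors the updated reserveResources(): flush first if the unit does
  // not fit or the packet is already full, then place the unit.
  bool reserve(int SU, bool Fits) {
    bool StartNewCycle = false;
    if (!Fits || Packet.size() >= IssueWidth) {
      Packet.clear();
      ++TotalPackets;
      StartNewCycle = true;
    }
    Packet.push_back(SU);
    return StartNewCycle;
  }
};

int main() {
  ToyPacketModel M;
  assert(!M.reserve(0, true));  // first slot
  assert(!M.reserve(1, true));  // second slot fills the 2-wide packet
  assert(M.reserve(2, true));   // full packet flushed, new cycle starts
  assert(M.reserve(3, false));  // no resources -> also starts a new cycle
  return 0;
}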
@@ -186,37 +187,43 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
/// only includes instructions that have DAG nodes, not scheduling boundaries.
void VLIWMachineScheduler::schedule() {
- DEBUG(dbgs() << "********** MI Converging Scheduling VLIW "
- << printMBBReference(*BB) << " " << BB->getName() << " in_func "
- << BB->getParent()->getName() << " at loop depth "
- << MLI->getLoopDepth(BB) << " \n");
+ LLVM_DEBUG(dbgs() << "********** MI Converging Scheduling VLIW "
+ << printMBBReference(*BB) << " " << BB->getName()
+ << " in_func " << BB->getParent()->getName()
+ << " at loop depth " << MLI->getLoopDepth(BB) << " \n");
buildDAGWithRegPressure();
+ Topo.InitDAGTopologicalSorting();
+
+ // Postprocess the DAG to add platform-specific artificial dependencies.
+ postprocessDAG();
+
SmallVector<SUnit*, 8> TopRoots, BotRoots;
findRootsAndBiasEdges(TopRoots, BotRoots);
// Initialize the strategy before modifying the DAG.
SchedImpl->initialize(this);
- DEBUG(unsigned maxH = 0;
- for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- if (SUnits[su].getHeight() > maxH)
- maxH = SUnits[su].getHeight();
- dbgs() << "Max Height " << maxH << "\n";);
- DEBUG(unsigned maxD = 0;
- for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- if (SUnits[su].getDepth() > maxD)
- maxD = SUnits[su].getDepth();
- dbgs() << "Max Depth " << maxD << "\n";);
- DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- SUnits[su].dumpAll(this));
+ LLVM_DEBUG(unsigned maxH = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e;
+ ++su) if (SUnits[su].getHeight() > maxH) maxH =
+ SUnits[su].getHeight();
+ dbgs() << "Max Height " << maxH << "\n";);
+ LLVM_DEBUG(unsigned maxD = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e;
+ ++su) if (SUnits[su].getDepth() > maxD) maxD =
+ SUnits[su].getDepth();
+ dbgs() << "Max Depth " << maxD << "\n";);
+ LLVM_DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) SUnits[su]
+ .dumpAll(this));
initQueues(TopRoots, BotRoots);
bool IsTopNode = false;
while (true) {
- DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
+ LLVM_DEBUG(
+ dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
SUnit *SU = SchedImpl->pickNode(IsTopNode);
if (!SU) break;
@@ -225,16 +232,16 @@ void VLIWMachineScheduler::schedule() {
scheduleMI(SU, IsTopNode);
- updateQueues(SU, IsTopNode);
-
// Notify the scheduling strategy after updating the DAG.
SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
placeDebugValues();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) << " ***\n";
dumpSchedule();
@@ -264,6 +271,15 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
+ const std::vector<unsigned> &MaxPressure =
+ DAG->getRegPressure().MaxSetPressure;
+ HighPressureSets.assign(MaxPressure.size(), 0);
+ for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) {
+ unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i);
+ HighPressureSets[i] =
+ ((float) MaxPressure[i] > ((float) Limit * RPThreshold));
+ }
+
assert((!ForceTopDown || !ForceBottomUp) &&
"-misched-topdown incompatible with -misched-bottomup");
}
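Worked example of the HighPressureSets computation above with the default threshold of 0.75; the per-set pressure numbers are made up:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  const float RPThreshold = 0.75f;  // default of the option above
  // Hypothetical per-set data: maximum pressure seen in the region and the
  // target's limit for that pressure set.
  std::vector<unsigned> MaxPressure = {25, 10};
  std::vector<unsigned> Limit       = {32, 16};

  std::vector<bool> HighPressureSets(MaxPressure.size(), false);
  for (size_t i = 0, e = MaxPressure.size(); i < e; ++i)
    HighPressureSets[i] =
        (float)MaxPressure[i] > (float)Limit[i] * RPThreshold;

  assert(HighPressureSets[0]);   // 25 > 32 * 0.75 = 24  -> high pressure
  assert(!HighPressureSets[1]);  // 10 <= 16 * 0.75 = 12 -> not high pressure
  return 0;
}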
@@ -364,8 +380,8 @@ void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
}
CheckPending = true;
- DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
- << CurrCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
}
/// Move the boundary of scheduled code by one SUnit.
@@ -383,18 +399,18 @@ void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) {
}
// Update DFA model.
- startNewCycle = ResourceModel->reserveResources(SU);
+ startNewCycle = ResourceModel->reserveResources(SU, isTop());
// Check the instruction group dispatch limit.
// TODO: Check if this SU must end a dispatch group.
IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
if (startNewCycle) {
- DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
bumpCycle();
}
else
- DEBUG(dbgs() << "*** IssueCount " << IssueCount
- << " at cycle " << CurrCycle << '\n');
+ LLVM_DEBUG(dbgs() << "*** IssueCount " << IssueCount << " at cycle "
+ << CurrCycle << '\n');
}
/// Release pending ready nodes in to the available queue. This makes them
@@ -443,10 +459,18 @@ SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
if (CheckPending)
releasePending();
- for (unsigned i = 0; Available.empty(); ++i) {
+ auto AdvanceCycle = [this]() {
+ if (Available.empty())
+ return true;
+ if (Available.size() == 1 && Pending.size() > 0)
+ return !ResourceModel->isResourceAvailable(*Available.begin(), isTop()) ||
+ getWeakLeft(*Available.begin(), isTop()) != 0;
+ return false;
+ };
+ for (unsigned i = 0; AdvanceCycle(); ++i) {
assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
"permanent hazard"); (void)i;
- ResourceModel->reserveResources(nullptr);
+ ResourceModel->reserveResources(nullptr, isTop());
bumpCycle();
releasePending();
}
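The AdvanceCycle lambda above generalizes the old Available.empty() test. Restated standalone (isResourceAvailable and getWeakLeft reduced to a boolean and a count for illustration):

#include <cassert>

// Decide whether pickOnlyChoice() should burn a cycle instead of issuing:
// either nothing is available, or the single available unit is not really
// issuable yet (no resources, or unscheduled weak predecessors remain)
// while more units are still pending.
static bool advanceCycle(unsigned AvailableSize, unsigned PendingSize,
                         bool ResourceAvailable, unsigned WeakLeft) {
  if (AvailableSize == 0)
    return true;
  if (AvailableSize == 1 && PendingSize > 0)
    return !ResourceAvailable || WeakLeft != 0;
  return false;
}

int main() {
  assert(advanceCycle(0, 3, true, 0));    // empty queue -> advance
  assert(advanceCycle(1, 2, false, 0));   // lone candidate lacks resources
  assert(advanceCycle(1, 2, true, 1));    // lone candidate blocked by weak edge
  assert(!advanceCycle(1, 0, false, 1));  // nothing pending -> must pick it
  assert(!advanceCycle(2, 5, false, 3));  // real choice exists -> don't advance
  return 0;
}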
@@ -520,13 +544,31 @@ static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) {
return true;
}
+/// Check if the instruction changes the register pressure of a register in the
+/// high pressure set. The function returns a negative value if the pressure
+/// decreases and a positive value if the pressure increases. If the instruction
+/// doesn't use a high pressure register or doesn't change the register
+/// pressure, then return 0.
+int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) {
+ PressureDiff &PD = DAG->getPressureDiff(SU);
+ for (auto &P : PD) {
+ if (!P.isValid())
+ continue;
+    // The pressure differences are computed bottom-up, so the comparison for
+ // an increase is positive in the bottom direction, but negative in the
+ // top-down direction.
+ if (HighPressureSets[P.getPSet()])
+ return (isBotUp ? P.getUnitInc() : -P.getUnitInc());
+ }
+ return 0;
+}
+
// Constants used to denote relative importance of
// heuristic components for cost computation.
static const unsigned PriorityOne = 200;
static const unsigned PriorityTwo = 50;
static const unsigned PriorityThree = 75;
static const unsigned ScaleTwo = 10;
-static const unsigned FactorOne = 2;
/// Single point to compute overall scheduling cost.
/// TODO: More heuristics will be used soon.
@@ -541,20 +583,23 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
if (!SU || SU->isScheduled)
return ResCount;
- MachineInstr &Instr = *SU->getInstr();
-
- DEBUG(if (verbose) dbgs() << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
+ LLVM_DEBUG(if (verbose) dbgs()
+ << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
// Forced priority is high.
if (SU->isScheduleHigh) {
ResCount += PriorityOne;
- DEBUG(dbgs() << "H|");
+ LLVM_DEBUG(dbgs() << "H|");
}
+ unsigned IsAvailableAmt = 0;
// Critical path first.
if (Q.getID() == TopQID) {
- ResCount += (SU->getHeight() * ScaleTwo);
+ if (Top.isLatencyBound(SU)) {
+ LLVM_DEBUG(if (verbose) dbgs() << "LB|");
+ ResCount += (SU->getHeight() * ScaleTwo);
+ }
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "h" << std::setw(3) << SU->getHeight() << "|";
dbgs() << dbgstr.str();
@@ -562,16 +607,19 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// If resources are available for it, multiply the
// chance of scheduling.
- if (Top.ResourceModel->isResourceAvailable(SU)) {
- ResCount <<= FactorOne;
- ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "A|");
+ if (Top.ResourceModel->isResourceAvailable(SU, true)) {
+ IsAvailableAmt = (PriorityTwo + PriorityThree);
+ ResCount += IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) dbgs() << "A|");
} else
- DEBUG(if (verbose) dbgs() << " |");
+ LLVM_DEBUG(if (verbose) dbgs() << " |");
} else {
- ResCount += (SU->getDepth() * ScaleTwo);
+ if (Bot.isLatencyBound(SU)) {
+ LLVM_DEBUG(if (verbose) dbgs() << "LB|");
+ ResCount += (SU->getDepth() * ScaleTwo);
+ }
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "d" << std::setw(3) << SU->getDepth() << "|";
dbgs() << dbgstr.str();
@@ -579,12 +627,12 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// If resources are available for it, multiply the
// chance of scheduling.
- if (Bot.ResourceModel->isResourceAvailable(SU)) {
- ResCount <<= FactorOne;
- ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "A|");
+ if (Bot.ResourceModel->isResourceAvailable(SU, false)) {
+ IsAvailableAmt = (PriorityTwo + PriorityThree);
+ ResCount += IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) dbgs() << "A|");
} else
- DEBUG(if (verbose) dbgs() << " |");
+ LLVM_DEBUG(if (verbose) dbgs() << " |");
}
unsigned NumNodesBlocking = 0;
@@ -593,18 +641,20 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// Look at all of the successors of this node.
// Count the number of nodes that
// this node is the sole unscheduled node for.
- for (const SDep &SI : SU->Succs)
- if (isSingleUnscheduledPred(SI.getSUnit(), SU))
- ++NumNodesBlocking;
+ if (Top.isLatencyBound(SU))
+ for (const SDep &SI : SU->Succs)
+ if (isSingleUnscheduledPred(SI.getSUnit(), SU))
+ ++NumNodesBlocking;
} else {
// How many unscheduled predecessors block this node?
- for (const SDep &PI : SU->Preds)
- if (isSingleUnscheduledSucc(PI.getSUnit(), SU))
- ++NumNodesBlocking;
+ if (Bot.isLatencyBound(SU))
+ for (const SDep &PI : SU->Preds)
+ if (isSingleUnscheduledSucc(PI.getSUnit(), SU))
+ ++NumNodesBlocking;
}
ResCount += (NumNodesBlocking * ScaleTwo);
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|";
dbgs() << dbgstr.str();
@@ -619,10 +669,17 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// Decrease priority slightly if register pressure would increase over the
// current maximum.
ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo);
- DEBUG(if (verbose) {
- dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
- << Delta.CriticalMax.getUnitInc() <<"/"
- << Delta.CurrentMax.getUnitInc() << ")|";
+ // If there are register pressure issues, then we remove the value added for
+ // the instruction being available. The rationale is that we really don't
+ // want to schedule an instruction that causes a spill.
+ if (IsAvailableAmt && pressureChange(SU, Q.getID() != TopQID) > 0 &&
+ (Delta.Excess.getUnitInc() || Delta.CriticalMax.getUnitInc() ||
+ Delta.CurrentMax.getUnitInc()))
+ ResCount -= IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) {
+ dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
+ << Delta.CriticalMax.getUnitInc() << "/"
+ << Delta.CurrentMax.getUnitInc() << ")|";
});
}
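For the pressure-based adjustment above, a worked example using the priority constants defined earlier (PriorityTwo = 50, PriorityThree = 75); the starting cost is arbitrary:

#include <cassert>

int main() {
  const unsigned PriorityTwo = 50, PriorityThree = 75;

  // Suppose the candidate was rewarded earlier for fitting the current packet.
  int ResCount = 100;
  unsigned IsAvailableAmt = PriorityTwo + PriorityThree;  // 125
  ResCount += IsAvailableAmt;                             // 225

  // Now assume pressureChange(SU) > 0 and one of the pressure deltas is
  // non-zero: the "is available" reward is taken back so the scheduler does
  // not favour an instruction that is likely to cause a spill.
  bool RaisesHighPressure = true, HasPressureDelta = true;
  if (IsAvailableAmt && RaisesHighPressure && HasPressureDelta)
    ResCount -= IsAvailableAmt;                           // back to 100

  assert(ResCount == 100);
  return 0;
}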
@@ -631,53 +688,39 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
auto &QII = *QST.getInstrInfo();
if (SU->isInstr() && QII.mayBeCurLoad(*SU->getInstr())) {
- if (Q.getID() == TopQID && Top.ResourceModel->isResourceAvailable(SU)) {
+ if (Q.getID() == TopQID &&
+ Top.ResourceModel->isResourceAvailable(SU, true)) {
ResCount += PriorityTwo;
- DEBUG(if (verbose) dbgs() << "C|");
+ LLVM_DEBUG(if (verbose) dbgs() << "C|");
} else if (Q.getID() == BotQID &&
- Bot.ResourceModel->isResourceAvailable(SU)) {
+ Bot.ResourceModel->isResourceAvailable(SU, false)) {
ResCount += PriorityTwo;
- DEBUG(if (verbose) dbgs() << "C|");
+ LLVM_DEBUG(if (verbose) dbgs() << "C|");
}
}
// Give preference to a zero latency instruction if the dependent
// instruction is in the current packet.
- if (Q.getID() == TopQID) {
+ if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) {
for (const SDep &PI : SU->Preds) {
if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
PI.getLatency() == 0 &&
Top.ResourceModel->isInPacket(PI.getSUnit())) {
ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "Z|");
+ LLVM_DEBUG(if (verbose) dbgs() << "Z|");
}
}
- } else {
+ } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) {
for (const SDep &SI : SU->Succs) {
if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
SI.getLatency() == 0 &&
Bot.ResourceModel->isInPacket(SI.getSUnit())) {
ResCount += PriorityThree;
- DEBUG(if (verbose) dbgs() << "Z|");
+ LLVM_DEBUG(if (verbose) dbgs() << "Z|");
}
}
}
- // Give less preference to an instruction that will cause a stall with
- // an instruction in the previous packet.
- if (QII.isHVXVec(Instr)) {
- // Check for stalls in the previous packet.
- if (Q.getID() == TopQID) {
- for (auto J : Top.ResourceModel->OldPacket)
- if (QII.producesStall(*J->getInstr(), Instr))
- ResCount -= PriorityOne;
- } else {
- for (auto J : Bot.ResourceModel->OldPacket)
- if (QII.producesStall(Instr, *J->getInstr()))
- ResCount -= PriorityOne;
- }
- }
-
// If the instruction has a non-zero latency dependence with an instruction in
// the current packet, then it should not be scheduled yet. The case occurs
// when the dependent instruction is scheduled in a new packet, so the
@@ -689,7 +732,7 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
if (PI.getLatency() > 0 &&
Top.ResourceModel->isInPacket(PI.getSUnit())) {
ResCount -= PriorityOne;
- DEBUG(if (verbose) dbgs() << "D|");
+ LLVM_DEBUG(if (verbose) dbgs() << "D|");
}
}
} else {
@@ -697,13 +740,13 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
if (SI.getLatency() > 0 &&
Bot.ResourceModel->isInPacket(SI.getSUnit())) {
ResCount -= PriorityOne;
- DEBUG(if (verbose) dbgs() << "D|");
+ LLVM_DEBUG(if (verbose) dbgs() << "D|");
}
}
}
}
- DEBUG(if (verbose) {
+ LLVM_DEBUG(if (verbose) {
std::stringstream dbgstr;
dbgstr << "Total " << std::setw(4) << ResCount << ")";
dbgs() << dbgstr.str();
@@ -718,11 +761,12 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
-pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+pickNodeFromQueue(VLIWSchedBoundary &Zone, const RegPressureTracker &RPTracker,
SchedCandidate &Candidate) {
- DEBUG(if (SchedDebugVerboseLevel > 1)
- readyQueueVerboseDump(RPTracker, Candidate, Q);
- else Q.dump(););
+ ReadyQueue &Q = Zone.Available;
+ LLVM_DEBUG(if (SchedDebugVerboseLevel > 1)
+ readyQueueVerboseDump(RPTracker, Candidate, Q);
+ else Q.dump(););
// getMaxPressureDelta temporarily modifies the tracker.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
@@ -739,7 +783,7 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
// Initialize the candidate if needed.
if (!Candidate.SU) {
- DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost));
+ LLVM_DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
@@ -747,9 +791,23 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
continue;
}
+ // Choose node order for negative cost candidates. There is no good
+ // candidate in this case.
+ if (CurrentCost < 0 && Candidate.SCost < 0) {
+ if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
+ || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
+ LLVM_DEBUG(traceCandidate("NCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
+ }
+ continue;
+ }
+
// Best cost.
if (CurrentCost > Candidate.SCost) {
- DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost));
+ LLVM_DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
@@ -757,65 +815,53 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
continue;
}
- // Tie breaker using Timing Class.
- if (!DisableTCTie) {
- auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
- auto &QII = *QST.getInstrInfo();
-
- const MachineInstr *MI = (*I)->getInstr();
- const MachineInstr *CandI = Candidate.SU->getInstr();
- const InstrItineraryData *InstrItins = QST.getInstrItineraryData();
-
- unsigned InstrLatency = QII.getInstrTimingClassLatency(InstrItins, *MI);
- unsigned CandLatency = QII.getInstrTimingClassLatency(InstrItins, *CandI);
- DEBUG(dbgs() << "TC Tie Breaker Cand: "
- << CandLatency << " Instr:" << InstrLatency << "\n"
- << *MI << *CandI << "\n");
- if (Q.getID() == TopQID && CurrentCost == Candidate.SCost) {
- if (InstrLatency < CandLatency && TopUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used top shorter tie breaker\n");
- continue;
- } else if (InstrLatency > CandLatency && !TopUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used top longer tie breaker\n");
- continue;
- }
- } else if (Q.getID() == BotQID && CurrentCost == Candidate.SCost) {
- if (InstrLatency < CandLatency && BotUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used Bot shorter tie breaker\n");
- continue;
- } else if (InstrLatency > CandLatency && !BotUseShorterTie) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- Candidate.SCost = CurrentCost;
- FoundCandidate = BestCost;
- DEBUG(dbgs() << "Used Bot longer tie breaker\n");
- continue;
- }
+ // Choose an instruction that does not depend on an artificial edge.
+ unsigned CurrWeak = getWeakLeft(*I, (Q.getID() == TopQID));
+ unsigned CandWeak = getWeakLeft(Candidate.SU, (Q.getID() == TopQID));
+ if (CurrWeak != CandWeak) {
+ if (CurrWeak < CandWeak) {
+ LLVM_DEBUG(traceCandidate("WCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = Weak;
}
+ continue;
}
- if (CurrentCost == Candidate.SCost) {
- if ((Q.getID() == TopQID &&
- (*I)->Succs.size() > Candidate.SU->Succs.size()) ||
- (Q.getID() == BotQID &&
- (*I)->Preds.size() < Candidate.SU->Preds.size())) {
- DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost));
+ if (CurrentCost == Candidate.SCost && Zone.isLatencyBound(*I)) {
+ unsigned CurrSize, CandSize;
+ if (Q.getID() == TopQID) {
+ CurrSize = (*I)->Succs.size();
+ CandSize = Candidate.SU->Succs.size();
+ } else {
+ CurrSize = (*I)->Preds.size();
+ CandSize = Candidate.SU->Preds.size();
+ }
+ if (CurrSize > CandSize) {
+ LLVM_DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
FoundCandidate = BestCost;
+ }
+ // Keep the old candidate if it's a better candidate. That is, don't use
+ // the subsequent tie breaker.
+ if (CurrSize != CandSize)
+ continue;
+ }
+
+ // Tie breaker.
+ // To avoid scheduling indeterminism, we need a tie breaker
+ // for the case when cost is identical for two nodes.
+ if (UseNewerCandidate && CurrentCost == Candidate.SCost) {
+ if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
+ || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
+ LLVM_DEBUG(traceCandidate("TCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
continue;
}
}
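The NodeOrder tie breaker above keys on SUnit::NodeNum so that equal-cost candidates are resolved deterministically. A standalone illustration of which node wins in each direction:

#include <cassert>

// On a cost tie, the top queue prefers the smaller NodeNum (earlier in the
// original instruction order); the bottom queue prefers the larger NodeNum.
static unsigned pickOnTie(unsigned CurrNodeNum, unsigned CandNodeNum,
                          bool IsTopQueue) {
  bool TakeCurrent = IsTopQueue ? CurrNodeNum < CandNodeNum
                                : CurrNodeNum > CandNodeNum;
  return TakeCurrent ? CurrNodeNum : CandNodeNum;
}

int main() {
  assert(pickOnTie(3, 7, /*IsTopQueue=*/true) == 3);   // top: lower NodeNum
  assert(pickOnTie(3, 7, /*IsTopQueue=*/false) == 7);  // bottom: higher NodeNum
  return 0;
}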
@@ -833,18 +879,18 @@ SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
- DEBUG(dbgs() << "Picked only Bottom\n");
+ LLVM_DEBUG(dbgs() << "Picked only Bottom\n");
IsTopNode = false;
return SU;
}
if (SUnit *SU = Top.pickOnlyChoice()) {
- DEBUG(dbgs() << "Picked only Top\n");
+ LLVM_DEBUG(dbgs() << "Picked only Top\n");
IsTopNode = true;
return SU;
}
SchedCandidate BotCand;
// Prefer bottom scheduling when heuristics are silent.
- CandResult BotResult = pickNodeFromQueue(Bot.Available,
+ CandResult BotResult = pickNodeFromQueue(Bot,
DAG->getBotRPTracker(), BotCand);
assert(BotResult != NoCand && "failed to find the first candidate");
@@ -856,40 +902,40 @@ SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
// increase pressure for one of the excess PSets, then schedule in that
// direction first to provide more freedom in the other direction.
if (BotResult == SingleExcess || BotResult == SingleCritical) {
- DEBUG(dbgs() << "Prefered Bottom Node\n");
+ LLVM_DEBUG(dbgs() << "Prefered Bottom Node\n");
IsTopNode = false;
return BotCand.SU;
}
// Check if the top Q has a better candidate.
SchedCandidate TopCand;
- CandResult TopResult = pickNodeFromQueue(Top.Available,
+ CandResult TopResult = pickNodeFromQueue(Top,
DAG->getTopRPTracker(), TopCand);
assert(TopResult != NoCand && "failed to find the first candidate");
if (TopResult == SingleExcess || TopResult == SingleCritical) {
- DEBUG(dbgs() << "Prefered Top Node\n");
+ LLVM_DEBUG(dbgs() << "Prefered Top Node\n");
IsTopNode = true;
return TopCand.SU;
}
// If either Q has a single candidate that minimizes pressure above the
// original region's pressure pick it.
if (BotResult == SingleMax) {
- DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n");
+ LLVM_DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n");
IsTopNode = false;
return BotCand.SU;
}
if (TopResult == SingleMax) {
- DEBUG(dbgs() << "Prefered Top Node SingleMax\n");
+ LLVM_DEBUG(dbgs() << "Prefered Top Node SingleMax\n");
IsTopNode = true;
return TopCand.SU;
}
if (TopCand.SCost > BotCand.SCost) {
- DEBUG(dbgs() << "Prefered Top Node Cost\n");
+ LLVM_DEBUG(dbgs() << "Prefered Top Node Cost\n");
IsTopNode = true;
return TopCand.SU;
}
// Otherwise prefer the bottom candidate in node order.
- DEBUG(dbgs() << "Prefered Bottom in Node order\n");
+ LLVM_DEBUG(dbgs() << "Prefered Bottom in Node order\n");
IsTopNode = false;
return BotCand.SU;
}
@@ -907,7 +953,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
if (!SU) {
SchedCandidate TopCand;
CandResult TopResult =
- pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
assert(TopResult != NoCand && "failed to find the first candidate");
(void)TopResult;
SU = TopCand.SU;
@@ -918,7 +964,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
if (!SU) {
SchedCandidate BotCand;
CandResult BotResult =
- pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
assert(BotResult != NoCand && "failed to find the first candidate");
(void)BotResult;
SU = BotCand.SU;
@@ -932,10 +978,11 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
if (SU->isBottomReady())
Bot.removeReady(SU);
- DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
- << " Scheduling Instruction in cycle "
- << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
- SU->dump(DAG));
+ LLVM_DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+ << " Scheduling instruction in cycle "
+ << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << " ("
+ << reportPackets() << ")\n";
+ SU->dump(DAG));
return SU;
}
@@ -945,10 +992,10 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
/// does.
void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
- SU->TopReadyCycle = Top.CurrCycle;
Top.bumpNode(SU);
+ SU->TopReadyCycle = Top.CurrCycle;
} else {
- SU->BotReadyCycle = Bot.CurrCycle;
Bot.bumpNode(SU);
+ SU->BotReadyCycle = Bot.CurrCycle;
}
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
index bf7fe2d484a2..585a7858ad2b 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -49,9 +49,6 @@ class VLIWResourceModel {
unsigned TotalPackets = 0;
public:
- /// Save the last formed packet.
- std::vector<SUnit *> OldPacket;
-
VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM)
: SchedModel(SM) {
ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI);
@@ -62,8 +59,6 @@ public:
Packet.resize(SchedModel->getIssueWidth());
Packet.clear();
- OldPacket.resize(SchedModel->getIssueWidth());
- OldPacket.clear();
ResourcesModel->clearResources();
}
@@ -84,9 +79,8 @@ public:
ResourcesModel->clearResources();
}
- bool isResourceAvailable(SUnit *SU);
- bool reserveResources(SUnit *SU);
- void savePacket();
+ bool isResourceAvailable(SUnit *SU, bool IsTop);
+ bool reserveResources(SUnit *SU, bool IsTop);
unsigned getTotalPackets() const { return TotalPackets; }
bool isInPacket(SUnit *SU) const { return is_contained(Packet, SU); }
};
@@ -102,6 +96,9 @@ public:
/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
/// time to do some work.
void schedule() override;
+
+ RegisterClassInfo *getRegClassInfo() { return RegClassInfo; }
+ int getBBSize() { return BB->size(); }
};
//===----------------------------------------------------------------------===//
@@ -129,7 +126,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
/// Represent the type of SchedCandidate found within a single queue.
enum CandResult {
NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
- BestCost};
+ BestCost, Weak};
/// Each Scheduling boundary is associated with ready queues. It tracks the
/// current cycle in whichever direction it has moved, and maintains the state
@@ -147,6 +144,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
unsigned CurrCycle = 0;
unsigned IssueCount = 0;
+ unsigned CriticalPathLength = 0;
/// MinReadyCycle - Cycle of the soonest available instruction.
unsigned MinReadyCycle = std::numeric_limits<unsigned>::max();
@@ -168,7 +166,27 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
DAG = dag;
SchedModel = smodel;
+ CurrCycle = 0;
IssueCount = 0;
+    // Initialize the critical path length limit, which is used by the scheduling
+ // cost model to determine the value for scheduling an instruction. We use
+ // a slightly different heuristic for small and large functions. For small
+ // functions, it's important to use the height/depth of the instruction.
+ // For large functions, prioritizing by height or depth increases spills.
+ CriticalPathLength = DAG->getBBSize() / SchedModel->getIssueWidth();
+ if (DAG->getBBSize() < 50)
+ // We divide by two as a cheap and simple heuristic to reduce the
+ // critcal path length, which increases the priority of using the graph
+ // height/depth in the scheduler's cost computation.
+ CriticalPathLength >>= 1;
+ else {
+ // For large basic blocks, we prefer a larger critical path length to
+ // decrease the priority of using the graph height/depth.
+ unsigned MaxPath = 0;
+ for (auto &SU : DAG->SUnits)
+ MaxPath = std::max(MaxPath, isTop() ? SU.getHeight() : SU.getDepth());
+ CriticalPathLength = std::max(CriticalPathLength, MaxPath) + 1;
+ }
}
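Worked example of the CriticalPathLength initialization above; the block sizes, issue width and path lengths are made up:

#include <algorithm>
#include <cassert>

static unsigned criticalPathLength(unsigned BBSize, unsigned IssueWidth,
                                   unsigned MaxPath) {
  unsigned CPL = BBSize / IssueWidth;
  if (BBSize < 50)
    // Small block: halve the limit so graph height/depth matters sooner.
    CPL >>= 1;
  else
    // Large block: stretch the limit past the longest path so height/depth
    // stops dominating the cost and causing extra spills.
    CPL = std::max(CPL, MaxPath) + 1;
  return CPL;
}

int main() {
  assert(criticalPathLength(40, 4, 12) == 5);    // 40/4 = 10, halved
  assert(criticalPathLength(200, 4, 30) == 51);  // max(200/4, 30) + 1
  assert(criticalPathLength(200, 4, 80) == 81);  // longest path dominates
  return 0;
}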
bool isTop() const {
@@ -188,6 +206,13 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
void removeReady(SUnit *SU);
SUnit *pickOnlyChoice();
+
+ bool isLatencyBound(SUnit *SU) {
+ if (CurrCycle >= CriticalPathLength)
+ return true;
+ unsigned PathLength = isTop() ? SU->getHeight() : SU->getDepth();
+ return CriticalPathLength - CurrCycle <= PathLength;
+ }
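And the isLatencyBound check above, restated standalone:

#include <cassert>

// A unit is latency bound once the remaining cycle budget
// (CriticalPathLength - CurrCycle) no longer covers its height (top-down)
// or depth (bottom-up).
static bool isLatencyBound(unsigned CurrCycle, unsigned CriticalPathLength,
                           unsigned PathLength) {
  if (CurrCycle >= CriticalPathLength)
    return true;
  return CriticalPathLength - CurrCycle <= PathLength;
}

int main() {
  assert(isLatencyBound(3, 10, 8));   // 10 - 3 = 7 <= 8
  assert(!isLatencyBound(3, 10, 4));  // 10 - 3 = 7 > 4
  assert(isLatencyBound(12, 10, 0));  // already past the limit
  return 0;
}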
};
VLIWMachineScheduler *DAG = nullptr;
@@ -197,6 +222,9 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
VLIWSchedBoundary Top;
VLIWSchedBoundary Bot;
+ /// List of pressure sets that have a high pressure level in the region.
+ std::vector<bool> HighPressureSets;
+
public:
/// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
enum {
@@ -217,7 +245,7 @@ public:
void releaseBottomNode(SUnit *SU) override;
- unsigned ReportPackets() {
+ unsigned reportPackets() {
return Top.ResourceModel->getTotalPackets() +
Bot.ResourceModel->getTotalPackets();
}
@@ -225,11 +253,13 @@ public:
protected:
SUnit *pickNodeBidrectional(bool &IsTopNode);
+ int pressureChange(const SUnit *SU, bool isBotUp);
+
int SchedulingCost(ReadyQueue &Q,
SUnit *SU, SchedCandidate &Candidate,
RegPressureDelta &Delta, bool verbose);
- CandResult pickNodeFromQueue(ReadyQueue &Q,
+ CandResult pickNodeFromQueue(VLIWSchedBoundary &Zone,
const RegPressureTracker &RPTracker,
SchedCandidate &Candidate);
#ifndef NDEBUG
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td b/contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
index 718d3ac7d45a..c29a75e6fe74 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
@@ -7,80 +7,80 @@
//
//===----------------------------------------------------------------------===//
-def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2), (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65T]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vdd0), (V6_vdd0)>, Requires<[HasV65T, UseHVX]>;
-def: Pat<(int_hexagon_V6_vdd0_128B), (V6_vdd0)>, Requires<[HasV65T, UseHVX]>;
+def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2), (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1), (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1), (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2), (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2), (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2), (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2), (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1), (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1), (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1), (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vdd0), (V6_vdd0)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vdd0_128B), (V6_vdd0)>, Requires<[HasV65, UseHVX]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
index ffa447cc1311..f2a6627c99be 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -16,7 +16,7 @@
// The basic approach looks for sequence of predicated jump, compare instruciton
// that genereates the predicate and, the feeder to the predicate. Once it finds
-// all, it collapses compare and jump instruction into a new valu jump
+// all, it collapses compare and jump instruction into a new value jump
// intstructions.
//
//===----------------------------------------------------------------------===//
@@ -24,6 +24,7 @@
#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -95,7 +96,7 @@ namespace {
const HexagonInstrInfo *QII;
const HexagonRegisterInfo *QRI;
- /// \brief A handle to the branch probability pass.
+ /// A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
bool isNewValueJumpCandidate(const MachineInstr &MI) const;
@@ -142,8 +143,24 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
if (QII->isSolo(*II))
return false;
- // Make sure there there is no 'def' or 'use' of any of the uses of
- // feeder insn between it's definition, this MI and jump, jmpInst
+ if (QII->isFloat(*II))
+ return false;
+
+ // Make sure that the (unique) def operand is a register from IntRegs.
+ bool HadDef = false;
+ for (const MachineOperand &Op : II->operands()) {
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+ if (HadDef)
+ return false;
+ HadDef = true;
+ if (!Hexagon::IntRegsRegClass.contains(Op.getReg()))
+ return false;
+ }
+ assert(HadDef);
+
+ // Make sure there is no 'def' or 'use' of any of the uses of
+ // feeder insn between its definition, this MI and jump, jmpInst
// skipping compare, cmpInst.
// Here's the example.
// r21=memub(r22+r24<<#0)
@@ -270,8 +287,8 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
if (cmpReg1 == cmpOp2)
return false;
- // Make sure that that second register is not from COPY
- // At machine code level, we don't need this, but if we decide
+ // Make sure that the second register is not from COPY
+ // at machine code level, we don't need this, but if we decide
// to move new value jump prior to RA, we would be needing this.
MachineRegisterInfo &MRI = MF.getRegInfo();
if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
@@ -285,7 +302,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
// and satisfy the following conditions.
++II;
for (MachineBasicBlock::iterator localII = II; localII != end; ++localII) {
- if (localII->isDebugValue())
+ if (localII->isDebugInstr())
continue;
// Check 1.
@@ -431,8 +448,8 @@ bool HexagonNewValueJump::isNewValueJumpCandidate(
}
bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
- << "********** Function: " << MF.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+ << "********** Function: " << MF.getName() << "\n");
if (skipFunction(MF.getFunction()))
return false;
@@ -445,9 +462,9 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
MF.getSubtarget().getRegisterInfo());
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
- if (DisableNewValueJumps) {
+ if (DisableNewValueJumps ||
+ !MF.getSubtarget<HexagonSubtarget>().useNewValueJumps())
return false;
- }
int nvjCount = DbgNVJCount;
int nvjGenerated = 0;
@@ -457,9 +474,10 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
MBBb != MBBe; ++MBBb) {
MachineBasicBlock *MBB = &*MBBb;
- DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n");
- DEBUG(MBB->dump());
- DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+ LLVM_DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n");
+ LLVM_DEBUG(MBB->dump());
+ LLVM_DEBUG(dbgs() << "\n"
+ << "********** dumping instr bottom up **********\n");
bool foundJump = false;
bool foundCompare = false;
bool invertPredicate = false;
@@ -477,14 +495,14 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
MII != E;) {
MachineInstr &MI = *--MII;
- if (MI.isDebugValue()) {
+ if (MI.isDebugInstr()) {
continue;
}
if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
break;
- DEBUG(dbgs() << "Instr: "; MI.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Instr: "; MI.dump(); dbgs() << "\n");
if (!foundJump && (MI.getOpcode() == Hexagon::J2_jumpt ||
MI.getOpcode() == Hexagon::J2_jumptpt ||
@@ -505,7 +523,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// operands, the following check on the kill flag would suffice.
// if(!jmpInstr->getOperand(0).isKill()) break;
- // This predicate register is live out out of BB
+ // This predicate register is live out of BB
// this would only work if we can actually use Live
// variable analysis on phy regs - but LLVM does not
// provide LV analysis on phys regs.
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 4738a4d32409..29c044b3b729 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
@@ -78,7 +79,9 @@ private:
using MISetType = DenseSet<MachineInstr *>;
using InstrEvalMap = DenseMap<MachineInstr *, bool>;
+ MachineRegisterInfo *MRI = nullptr;
const HexagonInstrInfo *HII = nullptr;
+ const HexagonRegisterInfo *HRI = nullptr;
MachineDominatorTree *MDT = nullptr;
DataFlowGraph *DFG = nullptr;
DataFlowGraph::DefStackMap DefM;
@@ -88,11 +91,16 @@ private:
bool processBlock(NodeAddr<BlockNode *> BA);
bool xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
NodeAddr<UseNode *> UseN, unsigned UseMOnum);
+ bool processAddUses(NodeAddr<StmtNode *> AddSN, MachineInstr *AddMI,
+ const NodeList &UNodeList);
+ bool updateAddUses(MachineInstr *AddMI, MachineInstr *UseMI);
bool analyzeUses(unsigned DefR, const NodeList &UNodeList,
InstrEvalMap &InstrEvalResult, short &SizeInc);
bool hasRepForm(MachineInstr &MI, unsigned TfrDefR);
bool canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, MachineInstr &MI,
const NodeList &UNodeList);
+ bool isSafeToExtLR(NodeAddr<StmtNode *> SN, MachineInstr *MI,
+ unsigned LRExtReg, const NodeList &UNodeList);
void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList);
bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList);
short getBaseWithLongOffset(const MachineInstr &MI) const;
@@ -101,6 +109,7 @@ private:
bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum);
bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
const MachineOperand &ImmOp, unsigned ImmOpNum);
+ bool isValidOffset(MachineInstr *MI, int Offset);
};
} // end anonymous namespace
@@ -208,7 +217,7 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
NodeSet Visited, Defs;
const auto &P = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
if (!P.second) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Unable to collect all reaching defs for use ***\n"
<< PrintNode<UseNode*>(UN, *DFG) << '\n'
<< "The program's complexity may exceed the limits.\n";
@@ -217,7 +226,7 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
}
const auto &ReachingDefs = P.first;
if (ReachingDefs.size() > 1) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
for (auto DI : ReachingDefs) {
NodeAddr<UseNode *> DA = DFG->addr<UseNode *>(DI);
@@ -235,15 +244,15 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
NodeList &UNodeList) {
for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
- DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DA, *DFG)
- << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t[DefNode]: "
+ << Print<NodeAddr<DefNode *>>(DA, *DFG) << "\n");
RegisterRef DR = DFG->getPRI().normalize(DA.Addr->getRegRef(*DFG));
auto UseSet = LV->getAllReachedUses(DR, DA);
for (auto UI : UseSet) {
NodeAddr<UseNode *> UA = DFG->addr<UseNode *>(UI);
- DEBUG({
+ LLVM_DEBUG({
NodeAddr<StmtNode *> TempIA = UA.Addr->getOwner(*DFG);
dbgs() << "\t\t\t[Reached Use]: "
<< Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
@@ -253,8 +262,8 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
NodeAddr<PhiNode *> PA = UA.Addr->getOwner(*DFG);
NodeId id = PA.Id;
const Liveness::RefMap &phiUse = LV->getRealUses(id);
- DEBUG(dbgs() << "\t\t\t\tphi real Uses"
- << Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tphi real Uses"
+ << Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
if (!phiUse.empty()) {
for (auto I : phiUse) {
if (!DFG->getPRI().alias(RegisterRef(I.first), DR))
@@ -272,6 +281,153 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
}
}
+bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN,
+ MachineInstr *MI, unsigned LRExtReg,
+ const NodeList &UNodeList) {
+ RegisterRef LRExtRR;
+ NodeId LRExtRegRD = 0;
+ // Iterate through all the UseNodes in SN and find the reaching def
+ // for the LRExtReg.
+ for (NodeAddr<UseNode *> UA : SN.Addr->members_if(DFG->IsUse, *DFG)) {
+ RegisterRef RR = UA.Addr->getRegRef(*DFG);
+ if (LRExtReg == RR.Reg) {
+ LRExtRR = RR;
+ LRExtRegRD = UA.Addr->getReachingDef();
+ }
+ }
+
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UA = *I;
+ NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
+ // The reaching def of LRExtRR at load/store node should be same as the
+ // one reaching at the SN.
+ if (UA.Addr->getFlags() & NodeAttrs::PhiRef)
+ return false;
+ NodeAddr<RefNode*> AA = LV->getNearestAliasedRef(LRExtRR, IA);
+ if ((DFG->IsDef(AA) && AA.Id != LRExtRegRD) ||
+ AA.Addr->getReachingDef() != LRExtRegRD) {
+ LLVM_DEBUG(
+ dbgs() << "isSafeToExtLR: Returning false; another reaching def\n");
+ return false;
+ }
+
+ MachineInstr *UseMI = NodeAddr<StmtNode *>(IA).Addr->getCode();
+ NodeAddr<DefNode *> LRExtRegDN = DFG->addr<DefNode *>(LRExtRegRD);
+ // Reaching Def to LRExtReg can't be a phi.
+ if ((LRExtRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ MI->getParent() != UseMI->getParent())
+ return false;
+ }
+ return true;
+}
+
+bool HexagonOptAddrMode::isValidOffset(MachineInstr *MI, int Offset) {
+ unsigned AlignMask = 0;
+ switch (HII->getMemAccessSize(*MI)) {
+ case HexagonII::MemAccessSize::DoubleWordAccess:
+ AlignMask = 0x7;
+ break;
+ case HexagonII::MemAccessSize::WordAccess:
+ AlignMask = 0x3;
+ break;
+ case HexagonII::MemAccessSize::HalfWordAccess:
+ AlignMask = 0x1;
+ break;
+ case HexagonII::MemAccessSize::ByteAccess:
+ AlignMask = 0x0;
+ break;
+ default:
+ return false;
+ }
+
+ if ((AlignMask & Offset) != 0)
+ return false;
+ return HII->isValidOffset(MI->getOpcode(), Offset, HRI, false);
+}
+
+bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
+ MachineInstr *AddMI,
+ const NodeList &UNodeList) {
+
+ unsigned AddDefR = AddMI->getOperand(0).getReg();
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UN = *I;
+ NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
+ MachineInstr *MI = SN.Addr->getCode();
+ const MCInstrDesc &MID = MI->getDesc();
+ if ((!MID.mayLoad() && !MID.mayStore()) ||
+ HII->getAddrMode(*MI) != HexagonII::BaseImmOffset ||
+ HII->isHVXVec(*MI))
+ return false;
+
+ MachineOperand BaseOp = MID.mayLoad() ? MI->getOperand(1)
+ : MI->getOperand(0);
+
+ if (!BaseOp.isReg() || BaseOp.getReg() != AddDefR)
+ return false;
+
+ MachineOperand OffsetOp = MID.mayLoad() ? MI->getOperand(2)
+ : MI->getOperand(1);
+ if (!OffsetOp.isImm())
+ return false;
+
+ int64_t newOffset = OffsetOp.getImm() + AddMI->getOperand(2).getImm();
+ if (!isValidOffset(MI, newOffset))
+ return false;
+
+ // Since we'll be extending the live range of Rt in the following example,
+    // make sure that it is safe: another definition of Rt must not exist
+    // between the 'add' and the load/store instruction.
+ //
+ // Ex: Rx= add(Rt,#10)
+ // memw(Rx+#0) = Rs
+ // will be replaced with => memw(Rt+#10) = Rs
+ unsigned BaseReg = AddMI->getOperand(1).getReg();
+ if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList))
+ return false;
+ }
+
+ // Update all the uses of 'add' with the appropriate base and offset
+ // values.
+ bool Changed = false;
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UseN = *I;
+ assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ "Found a PhiRef node as a real reached use!!");
+
+ NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
+ MachineInstr *UseMI = OwnerN.Addr->getCode();
+ LLVM_DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
+ << ">]: " << *UseMI << "\n");
+ Changed |= updateAddUses(AddMI, UseMI);
+ }
+
+ if (Changed)
+ Deleted.insert(AddMI);
+
+ return Changed;
+}
+
+bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI,
+ MachineInstr *UseMI) {
+ const MachineOperand ImmOp = AddMI->getOperand(2);
+ const MachineOperand AddRegOp = AddMI->getOperand(1);
+ unsigned newReg = AddRegOp.getReg();
+ const MCInstrDesc &MID = UseMI->getDesc();
+
+ MachineOperand &BaseOp = MID.mayLoad() ? UseMI->getOperand(1)
+ : UseMI->getOperand(0);
+ MachineOperand &OffsetOp = MID.mayLoad() ? UseMI->getOperand(2)
+ : UseMI->getOperand(1);
+ BaseOp.setReg(newReg);
+ BaseOp.setIsUndef(AddRegOp.isUndef());
+ BaseOp.setImplicit(AddRegOp.isImplicit());
+ OffsetOp.setImm(ImmOp.getImm() + OffsetOp.getImm());
+ MRI->clearKillFlags(newReg);
+
+ return true;
+}
+
bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
const NodeList &UNodeList,
InstrEvalMap &InstrEvalResult,
@@ -296,7 +452,7 @@ bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
} else if (MI.getOpcode() == Hexagon::S2_addasl_rrri) {
NodeList AddaslUseList;
- DEBUG(dbgs() << "\nGetting ReachedUses for === " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "\nGetting ReachedUses for === " << MI << "\n");
getAllRealUses(SN, AddaslUseList);
// Process phi nodes.
if (allValidCandidates(SN, AddaslUseList) &&
@@ -360,8 +516,8 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
} else
Changed = false;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
} else if (ImmOpNum == 2 && OldMI->getOperand(3).getImm() == 0) {
short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
@@ -371,8 +527,8 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
MIB.add(ImmOp);
OpStart = 4;
Changed = true;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
}
if (Changed)
@@ -413,8 +569,8 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
OpStart = 3;
}
Changed = true;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
} else if (ImmOpNum == 1 && OldMI->getOperand(2).getImm() == 0) {
short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
@@ -423,8 +579,8 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
MIB.add(ImmOp);
OpStart = 3;
Changed = true;
- DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+ LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
}
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
@@ -447,7 +603,7 @@ bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
unsigned ImmOpNum) {
NodeAddr<StmtNode *> SA = AddAslUN.Addr->getOwner(*DFG);
- DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
+ LLVM_DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
@@ -458,11 +614,11 @@ bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
"Can't transform this 'AddAsl' instruction!");
NodeAddr<StmtNode *> UseIA = UseUN.Addr->getOwner(*DFG);
- DEBUG(dbgs() << "[InstrNode]: " << Print<NodeAddr<InstrNode *>>(UseIA, *DFG)
- << "\n");
+ LLVM_DEBUG(dbgs() << "[InstrNode]: "
+ << Print<NodeAddr<InstrNode *>>(UseIA, *DFG) << "\n");
MachineInstr *UseMI = UseIA.Addr->getCode();
- DEBUG(dbgs() << "[MI <" << printMBBReference(*UseMI->getParent())
- << ">]: " << *UseMI << "\n");
+ LLVM_DEBUG(dbgs() << "[MI <" << printMBBReference(*UseMI->getParent())
+ << ">]: " << *UseMI << "\n");
const MCInstrDesc &UseMID = UseMI->getDesc();
assert(HII->getAddrMode(*UseMI) == HexagonII::BaseImmOffset);
@@ -534,13 +690,15 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
NodeAddr<StmtNode *> SA = IA;
MachineInstr *MI = SA.Addr->getCode();
- if (MI->getOpcode() != Hexagon::A2_tfrsi ||
- !MI->getOperand(1).isGlobal())
- continue;
+ if ((MI->getOpcode() != Hexagon::A2_tfrsi ||
+ !MI->getOperand(1).isGlobal()) &&
+ (MI->getOpcode() != Hexagon::A2_addi ||
+ !MI->getOperand(2).isImm() || HII->isConstExtended(*MI)))
+ continue;
- DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode()) << "]: "
- << *MI << "\n\t[InstrNode]: "
- << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n');
+ LLVM_DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode())
+ << "]: " << *MI << "\n\t[InstrNode]: "
+ << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n');
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
@@ -548,6 +706,21 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
if (!allValidCandidates(SA, UNodeList))
continue;
+ // Analyze all uses of 'add'. If the output of 'add' is used as an address
+ // in the base+immediate addressing mode load/store instructions, see if
+  // they can be updated to use the immediate value as an offset, thus
+  // giving us the opportunity to eliminate the 'add'.
+ // Ex: Rx= add(Rt,#12)
+ // memw(Rx+#0) = Rs
+ // This can be replaced with memw(Rt+#12) = Rs
+ //
+ // This transformation is only performed if all uses can be updated and
+ // the offset isn't required to be constant extended.
+ if (MI->getOpcode() == Hexagon::A2_addi) {
+ Changed |= processAddUses(SA, MI, UNodeList);
+ continue;
+ }
+
short SizeInc = 0;
unsigned DefR = MI->getOperand(0).getReg();
InstrEvalMap InstrEvalResult;
@@ -561,8 +734,9 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
bool KeepTfr = false;
- DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size() << "\n");
- DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
+ LLVM_DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UseN = *I;
assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
@@ -570,8 +744,8 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
MachineInstr *UseMI = OwnerN.Addr->getCode();
- DEBUG(dbgs() << "\t\t[MI <" << printMBBReference(*UseMI->getParent())
- << ">]: " << *UseMI << "\n");
+ LLVM_DEBUG(dbgs() << "\t\t[MI <" << printMBBReference(*UseMI->getParent())
+ << ">]: " << *UseMI << "\n");
int UseMOnum = -1;
unsigned NumOperands = UseMI->getNumOperands();
@@ -580,9 +754,11 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
if (op.isReg() && op.isUse() && DefR == op.getReg())
UseMOnum = j;
}
- assert(UseMOnum >= 0 && "Invalid reached use!");
+ // It is possible that the register will not be found in any operand.
+ // This could happen, for example, when DefR = R4, but the used
+ // register is D2.
- if (InstrEvalResult[UseMI])
+ if (UseMOnum >= 0 && InstrEvalResult[UseMI])
// Change UseMI if replacement is possible.
Changed |= xformUseMI(MI, UseMI, UseN, UseMOnum);
else
@@ -600,27 +776,27 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
auto &HST = MF.getSubtarget<HexagonSubtarget>();
- auto &MRI = MF.getRegInfo();
+ MRI = &MF.getRegInfo();
HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
const auto &MDF = getAnalysis<MachineDominanceFrontier>();
MDT = &getAnalysis<MachineDominatorTree>();
- const auto &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetOperandInfo TOI(*HII);
- DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI);
+ DataFlowGraph G(MF, *HII, *HRI, *MDT, MDF, TOI);
// Need to keep dead phis because we can propagate uses of registers into
// nodes dominated by those would-be phis.
G.build(BuildOptions::KeepDeadPhis);
DFG = &G;
- Liveness L(MRI, *DFG);
+ Liveness L(*MRI, *DFG);
L.computePhiInfo();
LV = &L;
Deleted.clear();
NodeAddr<FuncNode *> FA = DFG->getFunc();
- DEBUG(dbgs() << "==== [RefMap#]=====:\n "
- << Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
+ LLVM_DEBUG(dbgs() << "==== [RefMap#]=====:\n "
+ << Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
for (NodeAddr<BlockNode *> BA : FA.Addr->members(*DFG))
Changed |= processBlock(BA);
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 98229f4fa64a..384fda4ce39a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -100,6 +100,17 @@ def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>;
def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>;
def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>;
+def SDTVecVecIntOp:
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>,
+ SDTCisVT<3,i32>]>;
+
+def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>;
+def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>;
+
+def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru),
+ (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>;
+def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>;
+
// Pattern fragments to extract the low and high subregisters from a
// 64-bit value.
def LoReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_lo)>;
@@ -109,16 +120,6 @@ def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{
return isOrEquivalentToAdd(N);
}]>;
-def IsVecOff : PatLeaf<(i32 imm), [{
- int32_t V = N->getSExtValue();
- int32_t VecSize = HRI->getSpillSize(Hexagon::HvxVRRegClass);
- assert(isPowerOf2_32(VecSize));
- if ((uint32_t(V) & (uint32_t(VecSize)-1)) != 0)
- return false;
- int32_t L = Log2_32(VecSize);
- return isInt<4>(V >> L);
-}]>;
-
def IsPow2_32: PatLeaf<(i32 imm), [{
uint32_t V = N->getZExtValue();
return isPowerOf2_32(V);
@@ -214,7 +215,7 @@ def NegImm32: SDNodeXForm<imm, [{
// Helpers for type promotions/contractions.
def I1toI32: OutPatFrag<(ops node:$Rs), (C2_muxii (i1 $Rs), 1, 0)>;
-def I32toI1: OutPatFrag<(ops node:$Rs), (i1 (C2_tfrrp (i32 $Rs)))>;
+def I32toI1: OutPatFrag<(ops node:$Rs), (i1 (C2_cmpgtui (i32 $Rs), (i32 0)))>;
def ToZext64: OutPatFrag<(ops node:$Rs), (i64 (A4_combineir 0, (i32 $Rs)))>;
def ToSext64: OutPatFrag<(ops node:$Rs), (i64 (A2_sxtw (i32 $Rs)))>;
@@ -249,23 +250,6 @@ def: Pat<(IsOrAdd (i32 AddrFI:$Rs), s32_0ImmPred:$off),
(PS_fi (i32 AddrFI:$Rs), imm:$off)>;
-def alignedload: PatFrag<(ops node:$a), (load $a), [{
- return isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-def unalignedload: PatFrag<(ops node:$a), (load $a), [{
- return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-def alignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
- return isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-def unalignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
- return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
-}]>;
-
-
// Converters from unary/binary SDNode to PatFrag.
class pf1<SDNode Op> : PatFrag<(ops node:$a), (Op node:$a)>;
class pf2<SDNode Op> : PatFrag<(ops node:$a, node:$b), (Op node:$a, node:$b)>;
@@ -274,7 +258,7 @@ class Not2<PatFrag P>
: PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>;
class Su<PatFrag Op>
- : PatFrag<Op.Operands, Op.Fragment, [{ return hasOneUse(N); }],
+ : PatFrag<Op.Operands, !head(Op.Fragments), [{ return hasOneUse(N); }],
Op.OperandTransform>;
// Main selection macros.
@@ -298,9 +282,9 @@ class AccRRI_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op,
(MI RegPred:$Rx, RegPred:$Rs, imm:$I)>;
class AccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op,
- PatFrag RsPred, PatFrag RtPred>
- : Pat<(AccOp RsPred:$Rx, (Op RsPred:$Rs, RtPred:$Rt)),
- (MI RsPred:$Rx, RsPred:$Rs, RtPred:$Rt)>;
+ PatFrag RxPred, PatFrag RsPred, PatFrag RtPred>
+ : Pat<(AccOp RxPred:$Rx, (Op RsPred:$Rs, RtPred:$Rt)),
+ (MI RxPred:$Rx, RsPred:$Rs, RtPred:$Rt)>;
multiclass SelMinMax_pats<PatFrag CmpOp, PatFrag Val,
InstHexagon InstA, InstHexagon InstB> {
@@ -316,6 +300,7 @@ def Add: pf2<add>; def And: pf2<and>; def Sra: pf2<sra>;
def Sub: pf2<sub>; def Or: pf2<or>; def Srl: pf2<srl>;
def Mul: pf2<mul>; def Xor: pf2<xor>; def Shl: pf2<shl>;
+def Rol: pf2<rotl>;
// --(1) Immediate -------------------------------------------------------
//
@@ -363,7 +348,7 @@ def ToI32: OutPatFrag<(ops node:$V), (A2_tfrsi $V)>;
// --(2) Type cast -------------------------------------------------------
//
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpR_R_pat<F2_conv_sf2df, pf1<fpextend>, f64, F32>;
def: OpR_R_pat<F2_conv_df2sf, pf1<fpround>, f32, F64>;
@@ -389,7 +374,7 @@ let Predicates = [HasV5T] in {
}
// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
@@ -422,9 +407,14 @@ def: Pat<(i64 (sext I1:$Pu)),
(Combinew (C2_muxii PredRegs:$Pu, -1, 0),
(C2_muxii PredRegs:$Pu, -1, 0))>;
-def: Pat<(i32 (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>;
-def: Pat<(i32 (zext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
-def: Pat<(i64 (zext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def: Pat<(i32 (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>;
+def: Pat<(i32 (zext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
+def: Pat<(i64 (zext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>;
+def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>;
+def: Pat<(v4i8 (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>;
+def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>;
+def: Pat<(v8i8 (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>;
def: Pat<(i64 (sext I32:$Rs)), (A2_sxtw I32:$Rs)>;
def: Pat<(Zext64 I32:$Rs), (ToZext64 $Rs)>;
@@ -441,6 +431,20 @@ let AddedComplexity = 20 in {
def: Pat<(i32 (anyext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
def: Pat<(i64 (anyext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def Vsplatpi: OutPatFrag<(ops node:$V),
+ (Combinew (A2_tfrsi $V), (A2_tfrsi $V))>;
+def: Pat<(v8i8 (zext V8I1:$Pu)),
+ (A2_andp (C2_mask V8I1:$Pu), (Vsplatpi (i32 0x01010101)))>;
+def: Pat<(v4i16 (zext V4I1:$Pu)),
+ (A2_andp (C2_mask V4I1:$Pu), (Vsplatpi (i32 0x00010001)))>;
+def: Pat<(v2i32 (zext V2I1:$Pu)),
+ (A2_andp (C2_mask V2I1:$Pu), (A2_combineii (i32 1), (i32 1)))>;
+
+def: Pat<(v4i8 (zext V4I1:$Pu)),
+ (A2_andir (LoReg (C2_mask V4I1:$Pu)), (i32 0x01010101))>;
+def: Pat<(v2i16 (zext V2I1:$Pu)),
+ (A2_andir (LoReg (C2_mask V2I1:$Pu)), (i32 0x00010001))>;
+
def: Pat<(v4i16 (zext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
def: Pat<(v2i32 (zext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
def: Pat<(v4i16 (anyext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
@@ -475,25 +479,40 @@ def: Pat<(v2i16 (trunc V2I32:$Rs)),
//
def: Pat<(not I1:$Ps), (C2_not I1:$Ps)>;
+def: Pat<(not V8I1:$Ps), (C2_not V8I1:$Ps)>;
def: Pat<(add I1:$Ps, -1), (C2_not I1:$Ps)>;
-def: OpR_RR_pat<C2_and, And, i1, I1>;
-def: OpR_RR_pat<C2_or, Or, i1, I1>;
-def: OpR_RR_pat<C2_xor, Xor, i1, I1>;
-def: OpR_RR_pat<C2_andn, Not2<And>, i1, I1>;
-def: OpR_RR_pat<C2_orn, Not2<Or>, i1, I1>;
+multiclass BoolOpR_RR_pat<InstHexagon MI, PatFrag Op> {
+ def: OpR_RR_pat<MI, Op, i1, I1>;
+ def: OpR_RR_pat<MI, Op, v2i1, V2I1>;
+ def: OpR_RR_pat<MI, Op, v4i1, V4I1>;
+ def: OpR_RR_pat<MI, Op, v8i1, V8I1>;
+}
+
+multiclass BoolAccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op> {
+ def: AccRRR_pat<MI, AccOp, Op, I1, I1, I1>;
+ def: AccRRR_pat<MI, AccOp, Op, V2I1, V2I1, V2I1>;
+ def: AccRRR_pat<MI, AccOp, Op, V4I1, V4I1, V4I1>;
+ def: AccRRR_pat<MI, AccOp, Op, V8I1, V8I1, V8I1>;
+}
+
+defm: BoolOpR_RR_pat<C2_and, And>;
+defm: BoolOpR_RR_pat<C2_or, Or>;
+defm: BoolOpR_RR_pat<C2_xor, Xor>;
+defm: BoolOpR_RR_pat<C2_andn, Not2<And>>;
+defm: BoolOpR_RR_pat<C2_orn, Not2<Or>>;
// op(Ps, op(Pt, Pu))
-def: AccRRR_pat<C4_and_and, And, Su<And>, I1, I1>;
-def: AccRRR_pat<C4_and_or, And, Su<Or>, I1, I1>;
-def: AccRRR_pat<C4_or_and, Or, Su<And>, I1, I1>;
-def: AccRRR_pat<C4_or_or, Or, Su<Or>, I1, I1>;
+defm: BoolAccRRR_pat<C4_and_and, And, Su<And>>;
+defm: BoolAccRRR_pat<C4_and_or, And, Su<Or>>;
+defm: BoolAccRRR_pat<C4_or_and, Or, Su<And>>;
+defm: BoolAccRRR_pat<C4_or_or, Or, Su<Or>>;
// op(Ps, op(Pt, ~Pu))
-def: AccRRR_pat<C4_and_andn, And, Su<Not2<And>>, I1, I1>;
-def: AccRRR_pat<C4_and_orn, And, Su<Not2<Or>>, I1, I1>;
-def: AccRRR_pat<C4_or_andn, Or, Su<Not2<And>>, I1, I1>;
-def: AccRRR_pat<C4_or_orn, Or, Su<Not2<Or>>, I1, I1>;
+defm: BoolAccRRR_pat<C4_and_andn, And, Su<Not2<And>>>;
+defm: BoolAccRRR_pat<C4_and_orn, And, Su<Not2<Or>>>;
+defm: BoolAccRRR_pat<C4_or_andn, Or, Su<Not2<And>>>;
+defm: BoolAccRRR_pat<C4_or_orn, Or, Su<Not2<Or>>>;
// --(5) Compare ---------------------------------------------------------
@@ -519,7 +538,7 @@ def: Pat<(i1 (setult I32:$Rs, u32_0ImmPred:$u9)),
// Patfrag to convert the usual comparison patfrags (e.g. setlt) to ones
// that reverse the order of the operands.
class RevCmp<PatFrag F>
- : PatFrag<(ops node:$rhs, node:$lhs), F.Fragment, F.PredicateCode,
+ : PatFrag<(ops node:$rhs, node:$lhs), !head(F.Fragments), F.PredicateCode,
F.OperandTransform>;
def: OpR_RR_pat<C2_cmpeq, seteq, i1, I32>;
@@ -563,7 +582,7 @@ def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>;
def: OpR_RR_pat<F2_sfcmpgt, setgt, i1, F32>;
def: OpR_RR_pat<F2_sfcmpge, setge, i1, F32>;
@@ -598,27 +617,40 @@ def: Pat<(i1 (setle I32:$Rs, anyimm:$u5)),
def: Pat<(i1 (setule I32:$Rs, anyimm:$u5)),
(C2_not (C2_cmpgtui I32:$Rs, imm:$u5))>;
-def: Pat<(i1 (setne I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpeq I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setle I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgt I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setule I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgtu I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setge I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgt I32:$Rt, I32:$Rs))>;
-def: Pat<(i1 (setuge I32:$Rs, I32:$Rt)),
- (C2_not (C2_cmpgtu I32:$Rt, I32:$Rs))>;
-
-def: Pat<(i1 (setle I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtp I64:$Rs, I64:$Rt))>;
-def: Pat<(i1 (setne I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpeqp I64:$Rs, I64:$Rt))>;
-def: Pat<(i1 (setge I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtp I64:$Rt, I64:$Rs))>;
-def: Pat<(i1 (setuge I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtup I64:$Rt, I64:$Rs))>;
-def: Pat<(i1 (setule I64:$Rs, I64:$Rt)),
- (C2_not (C2_cmpgtup I64:$Rs, I64:$Rt))>;
+class OpmR_RR_pat<PatFrag Output, PatFrag Op, ValueType ResType,
+ PatFrag RsPred, PatFrag RtPred = RsPred>
+ : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
+ (Output RsPred:$Rs, RtPred:$Rt)>;
+
+class Outn<InstHexagon MI>
+ : OutPatFrag<(ops node:$Rs, node:$Rt),
+ (C2_not (MI $Rs, $Rt))>;
+
+def: OpmR_RR_pat<Outn<C2_cmpeq>, setne, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgt>, setle, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgtu>, setule, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgt>, RevCmp<setge>, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgtu>, RevCmp<setuge>, i1, I32>;
+def: OpmR_RR_pat<Outn<C2_cmpeqp>, setne, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtp>, setle, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtup>, setule, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtp>, RevCmp<setge>, i1, I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtup>, RevCmp<setuge>, i1, I64>;
+def: OpmR_RR_pat<Outn<A2_vcmpbeq>, setne, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A4_vcmpbgt>, setle, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpbgtu>, setule, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A4_vcmpbgt>, RevCmp<setge>, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpbgtu>, RevCmp<setuge>, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpheq>, setne, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgt>, setle, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgtu>, setule, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgt>, RevCmp<setge>, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgtu>, RevCmp<setuge>, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmpweq>, setne, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgt>, setle, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgtu>, setule, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgt>, RevCmp<setge>, v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgtu>, RevCmp<setuge>, v2i1, V2I32>;
let AddedComplexity = 100 in {
def: Pat<(i1 (seteq (and (xor I32:$Rs, I32:$Rt), 255), 0)),
@@ -680,25 +712,10 @@ def: Pat<(i32 (zext (i1 (seteq I32:$Rs, anyimm:$s8)))),
def: Pat<(i32 (zext (i1 (setne I32:$Rs, anyimm:$s8)))),
(A4_rcmpneqi I32:$Rs, imm:$s8)>;
-def: Pat<(i1 (setne I1:$Ps, I1:$Pt)),
- (C2_xor I1:$Ps, I1:$Pt)>;
-
-def: Pat<(i1 (seteq V4I8:$Rs, V4I8:$Rt)),
- (A2_vcmpbeq (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setgt V4I8:$Rs, V4I8:$Rt)),
- (A4_vcmpbgt (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setugt V4I8:$Rs, V4I8:$Rt)),
- (A2_vcmpbgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
-
-def: Pat<(i1 (seteq V2I16:$Rs, V2I16:$Rt)),
- (A2_vcmpheq (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setgt V2I16:$Rs, V2I16:$Rt)),
- (A2_vcmphgt (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setugt V2I16:$Rs, V2I16:$Rt)),
- (A2_vcmphgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
-
-def: Pat<(v2i1 (setne V2I32:$Rs, V2I32:$Rt)),
- (C2_not (v2i1 (A2_vcmpbeq V2I32:$Rs, V2I32:$Rt)))>;
+def: Pat<(i1 (seteq I1:$Ps, (i1 -1))), (I1:$Ps)>;
+def: Pat<(i1 (setne I1:$Ps, (i1 -1))), (C2_not I1:$Ps)>;
+def: Pat<(i1 (seteq I1:$Ps, I1:$Pt)), (C2_xor I1:$Ps, (C2_not I1:$Pt))>;
+def: Pat<(i1 (setne I1:$Ps, I1:$Pt)), (C2_xor I1:$Ps, I1:$Pt)>;
// Floating-point comparisons with checks for ordered/unordered status.
@@ -706,18 +723,13 @@ class T3<InstHexagon MI1, InstHexagon MI2, InstHexagon MI3>
: OutPatFrag<(ops node:$Rs, node:$Rt),
(MI1 (MI2 $Rs, $Rt), (MI3 $Rs, $Rt))>;
-class OpmR_RR_pat<PatFrag Output, PatFrag Op, ValueType ResType,
- PatFrag RsPred, PatFrag RtPred = RsPred>
- : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
- (Output RsPred:$Rs, RtPred:$Rt)>;
-
class Cmpuf<InstHexagon MI>: T3<C2_or, F2_sfcmpuo, MI>;
class Cmpud<InstHexagon MI>: T3<C2_or, F2_dfcmpuo, MI>;
class Cmpufn<InstHexagon MI>: T3<C2_orn, F2_sfcmpuo, MI>;
class Cmpudn<InstHexagon MI>: T3<C2_orn, F2_dfcmpuo, MI>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>, setueq, i1, F32>;
def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>, setuge, i1, F32>;
def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>, setugt, i1, F32>;
@@ -733,11 +745,7 @@ let Predicates = [HasV5T] in {
def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune, i1, F64>;
}
-class Outn<InstHexagon MI>
- : OutPatFrag<(ops node:$Rs, node:$Rt),
- (C2_not (MI $Rs, $Rt))>;
-
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne, i1, F32>;
@@ -776,7 +784,7 @@ def: Pat<(select I1:$Pu, I64:$Rs, I64:$Rt),
(Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
(C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
(C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
@@ -813,20 +821,6 @@ def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt),
def: Pat<(vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt),
(C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>;
-
-class HvxSel_pat<InstHexagon MI, PatFrag RegPred>
- : Pat<(select I1:$Pu, RegPred:$Vs, RegPred:$Vt),
- (MI I1:$Pu, RegPred:$Vs, RegPred:$Vt)>;
-
-let Predicates = [HasV60T,UseHVX] in {
- def: HvxSel_pat<PS_vselect, HVI8>;
- def: HvxSel_pat<PS_vselect, HVI16>;
- def: HvxSel_pat<PS_vselect, HVI32>;
- def: HvxSel_pat<PS_wselect, HWI8>;
- def: HvxSel_pat<PS_wselect, HWI16>;
- def: HvxSel_pat<PS_wselect, HWI32>;
-}
-
// From LegalizeDAG.cpp: (Pu ? Pv : Pw) <=> (Pu & Pv) | (!Pu & Pw).
def: Pat<(select I1:$Pu, I1:$Pv, I1:$Pw),
(C2_or (C2_and I1:$Pu, I1:$Pv),
@@ -878,7 +872,7 @@ let AddedComplexity = 200 in {
defm: SelMinMax_pats<setult, I64, A2_minup, A2_maxup>;
}
-let AddedComplexity = 100, Predicates = [HasV5T] in {
+let AddedComplexity = 100, Predicates = [HasV5] in {
defm: SelMinMax_pats<setolt, F32, F2_sfmin, F2_sfmax>;
defm: SelMinMax_pats<setole, F32, F2_sfmin, F2_sfmax>;
defm: SelMinMax_pats<setogt, F32, F2_sfmax, F2_sfmin>;
@@ -892,40 +886,34 @@ let AddedComplexity = 100, Predicates = [HasV5T] in {
def SDTHexagonINSERT:
SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisInt<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
-def SDTHexagonINSERTRP:
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
- SDTCisInt<0>, SDTCisVT<3, i64>]>;
-
def HexagonINSERT: SDNode<"HexagonISD::INSERT", SDTHexagonINSERT>;
-def HexagonINSERTRP: SDNode<"HexagonISD::INSERTRP", SDTHexagonINSERTRP>;
-def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2),
- (S2_insert I32:$Rs, I32:$Rt, imm:$u1, imm:$u2)>;
-def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2),
- (S2_insertp I64:$Rs, I64:$Rt, imm:$u1, imm:$u2)>;
-def: Pat<(HexagonINSERTRP I32:$Rs, I32:$Rt, I64:$Ru),
- (S2_insert_rp I32:$Rs, I32:$Rt, I64:$Ru)>;
-def: Pat<(HexagonINSERTRP I64:$Rs, I64:$Rt, I64:$Ru),
- (S2_insertp_rp I64:$Rs, I64:$Rt, I64:$Ru)>;
+let AddedComplexity = 10 in {
+ def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2),
+ (S2_insert I32:$Rs, I32:$Rt, imm:$u1, imm:$u2)>;
+ def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2),
+ (S2_insertp I64:$Rs, I64:$Rt, imm:$u1, imm:$u2)>;
+}
+def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, I32:$Width, I32:$Off),
+ (S2_insert_rp I32:$Rs, I32:$Rt, (Combinew $Width, $Off))>;
+def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, I32:$Width, I32:$Off),
+ (S2_insertp_rp I64:$Rs, I64:$Rt, (Combinew $Width, $Off))>;
def SDTHexagonEXTRACTU
: SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
-def SDTHexagonEXTRACTURP
- : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
- SDTCisVT<2, i64>]>;
-
def HexagonEXTRACTU: SDNode<"HexagonISD::EXTRACTU", SDTHexagonEXTRACTU>;
-def HexagonEXTRACTURP: SDNode<"HexagonISD::EXTRACTURP", SDTHexagonEXTRACTURP>;
-def: Pat<(HexagonEXTRACTU I32:$Rs, u5_0ImmPred:$u5, u5_0ImmPred:$U5),
- (S2_extractu I32:$Rs, imm:$u5, imm:$U5)>;
-def: Pat<(HexagonEXTRACTU I64:$Rs, u6_0ImmPred:$u6, u6_0ImmPred:$U6),
- (S2_extractup I64:$Rs, imm:$u6, imm:$U6)>;
-def: Pat<(HexagonEXTRACTURP I32:$Rs, I64:$Rt),
- (S2_extractu_rp I32:$Rs, I64:$Rt)>;
-def: Pat<(HexagonEXTRACTURP I64:$Rs, I64:$Rt),
- (S2_extractup_rp I64:$Rs, I64:$Rt)>;
+let AddedComplexity = 10 in {
+ def: Pat<(HexagonEXTRACTU I32:$Rs, u5_0ImmPred:$u5, u5_0ImmPred:$U5),
+ (S2_extractu I32:$Rs, imm:$u5, imm:$U5)>;
+ def: Pat<(HexagonEXTRACTU I64:$Rs, u6_0ImmPred:$u6, u6_0ImmPred:$U6),
+ (S2_extractup I64:$Rs, imm:$u6, imm:$U6)>;
+}
+def: Pat<(HexagonEXTRACTU I32:$Rs, I32:$Width, I32:$Off),
+ (S2_extractu_rp I32:$Rs, (Combinew $Width, $Off))>;
+def: Pat<(HexagonEXTRACTU I64:$Rs, I32:$Width, I32:$Off),
+ (S2_extractup_rp I64:$Rs, (Combinew $Width, $Off))>;
def SDTHexagonVSPLAT:
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
@@ -938,20 +926,20 @@ def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)),
(A2_combineii imm:$s8, imm:$s8)>;
def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (Combinew I32:$Rs, I32:$Rs)>;
+let AddedComplexity = 10 in
+def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)), (S6_vsplatrbp I32:$Rs)>,
+ Requires<[HasV62]>;
+def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)),
+ (Combinew (S2_vsplatrb I32:$Rs), (S2_vsplatrb I32:$Rs))>;
+
// --(8) Shift/permute ---------------------------------------------------
//
def SDTHexagonI64I32I32: SDTypeProfile<1, 2,
[SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
-def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>,
- SDTCisSubVecOfVec<1, 0>]>;
-def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>;
def HexagonCOMBINE: SDNode<"HexagonISD::COMBINE", SDTHexagonI64I32I32>;
-def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>;
-def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>;
-def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>;
def: Pat<(HexagonCOMBINE I32:$Rs, I32:$Rt), (Combinew $Rs, $Rt)>;
@@ -1001,11 +989,15 @@ def: OpR_RR_pat<S2_asr_r_p, Sra, i64, I64, I32>;
def: OpR_RR_pat<S2_lsr_r_p, Srl, i64, I64, I32>;
def: OpR_RR_pat<S2_asl_r_p, Shl, i64, I64, I32>;
+let Predicates = [HasV60] in {
+ def: OpR_RI_pat<S6_rol_i_r, Rol, i32, I32, u5_0ImmPred>;
+ def: OpR_RI_pat<S6_rol_i_p, Rol, i64, I64, u6_0ImmPred>;
+}
def: Pat<(sra (add (sra I32:$Rs, u5_0ImmPred:$u5), 1), (i32 1)),
(S2_asr_i_r_rnd I32:$Rs, imm:$u5)>;
def: Pat<(sra (add (sra I64:$Rs, u6_0ImmPred:$u6), 1), (i32 1)),
- (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>, Requires<[HasV5T]>;
+ (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>, Requires<[HasV5]>;
// Prefer S2_addasl_rrri over S2_asl_i_r_acc.
let AddedComplexity = 120 in
@@ -1046,41 +1038,55 @@ let AddedComplexity = 100 in {
def: AccRRI_pat<S2_asl_i_p_and, And, Su<Shl>, I64, u6_0ImmPred>;
def: AccRRI_pat<S2_asl_i_p_or, Or, Su<Shl>, I64, u6_0ImmPred>;
def: AccRRI_pat<S2_asl_i_p_xacc, Xor, Su<Shl>, I64, u6_0ImmPred>;
+
+ let Predicates = [HasV60] in {
+ def: AccRRI_pat<S6_rol_i_r_acc, Add, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_nac, Sub, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_and, And, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_or, Or, Su<Rol>, I32, u5_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_r_xacc, Xor, Su<Rol>, I32, u5_0ImmPred>;
+
+ def: AccRRI_pat<S6_rol_i_p_acc, Add, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_nac, Sub, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_and, And, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_or, Or, Su<Rol>, I64, u6_0ImmPred>;
+ def: AccRRI_pat<S6_rol_i_p_xacc, Xor, Su<Rol>, I64, u6_0ImmPred>;
+ }
}
let AddedComplexity = 100 in {
- def: AccRRR_pat<S2_asr_r_r_acc, Add, Su<Sra>, I32, I32>;
- def: AccRRR_pat<S2_asr_r_r_nac, Sub, Su<Sra>, I32, I32>;
- def: AccRRR_pat<S2_asr_r_r_and, And, Su<Sra>, I32, I32>;
- def: AccRRR_pat<S2_asr_r_r_or, Or, Su<Sra>, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_acc, Add, Su<Sra>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_nac, Sub, Su<Sra>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_and, And, Su<Sra>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asr_r_r_or, Or, Su<Sra>, I32, I32, I32>;
- def: AccRRR_pat<S2_asr_r_p_acc, Add, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_nac, Sub, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_and, And, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_or, Or, Su<Sra>, I64, I32>;
- def: AccRRR_pat<S2_asr_r_p_xor, Xor, Su<Sra>, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_acc, Add, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_nac, Sub, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_and, And, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_or, Or, Su<Sra>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asr_r_p_xor, Xor, Su<Sra>, I64, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_r_acc, Add, Su<Srl>, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_r_nac, Sub, Su<Srl>, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_r_and, And, Su<Srl>, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_r_or, Or, Su<Srl>, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_acc, Add, Su<Srl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_nac, Sub, Su<Srl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_and, And, Su<Srl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_lsr_r_r_or, Or, Su<Srl>, I32, I32, I32>;
- def: AccRRR_pat<S2_lsr_r_p_acc, Add, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_nac, Sub, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_and, And, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_or, Or, Su<Srl>, I64, I32>;
- def: AccRRR_pat<S2_lsr_r_p_xor, Xor, Su<Srl>, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_acc, Add, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_nac, Sub, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_and, And, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_or, Or, Su<Srl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_lsr_r_p_xor, Xor, Su<Srl>, I64, I64, I32>;
- def: AccRRR_pat<S2_asl_r_r_acc, Add, Su<Shl>, I32, I32>;
- def: AccRRR_pat<S2_asl_r_r_nac, Sub, Su<Shl>, I32, I32>;
- def: AccRRR_pat<S2_asl_r_r_and, And, Su<Shl>, I32, I32>;
- def: AccRRR_pat<S2_asl_r_r_or, Or, Su<Shl>, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_acc, Add, Su<Shl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_nac, Sub, Su<Shl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_and, And, Su<Shl>, I32, I32, I32>;
+ def: AccRRR_pat<S2_asl_r_r_or, Or, Su<Shl>, I32, I32, I32>;
- def: AccRRR_pat<S2_asl_r_p_acc, Add, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_nac, Sub, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_and, And, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_or, Or, Su<Shl>, I64, I32>;
- def: AccRRR_pat<S2_asl_r_p_xor, Xor, Su<Shl>, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_acc, Add, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_nac, Sub, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_and, And, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_or, Or, Su<Shl>, I64, I64, I32>;
+ def: AccRRR_pat<S2_asl_r_p_xor, Xor, Su<Shl>, I64, I64, I32>;
}
@@ -1170,11 +1176,13 @@ def: Pat<(shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
// --(9) Arithmetic/bitwise ----------------------------------------------
//
-def: Pat<(abs I32:$Rs), (A2_abs I32:$Rs)>;
-def: Pat<(not I32:$Rs), (A2_subri -1, I32:$Rs)>;
-def: Pat<(not I64:$Rs), (A2_notp I64:$Rs)>;
+def: Pat<(abs I32:$Rs), (A2_abs I32:$Rs)>;
+def: Pat<(abs I64:$Rs), (A2_absp I64:$Rs)>;
+def: Pat<(not I32:$Rs), (A2_subri -1, I32:$Rs)>;
+def: Pat<(not I64:$Rs), (A2_notp I64:$Rs)>;
+def: Pat<(ineg I64:$Rs), (A2_negp I64:$Rs)>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(fabs F32:$Rs), (S2_clrbit_i F32:$Rs, 31)>;
def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
@@ -1186,13 +1194,6 @@ let Predicates = [HasV5T] in {
(i32 (LoReg $Rs)))>;
}
-let AddedComplexity = 50 in
-def: Pat<(xor (add (sra I32:$Rs, (i32 31)),
- I32:$Rs),
- (sra I32:$Rs, (i32 31))),
- (A2_abs I32:$Rs)>;
-
-
def: Pat<(add I32:$Rs, anyimm:$s16), (A2_addi I32:$Rs, imm:$s16)>;
def: Pat<(or I32:$Rs, anyimm:$s10), (A2_orir I32:$Rs, imm:$s10)>;
def: Pat<(and I32:$Rs, anyimm:$s10), (A2_andir I32:$Rs, imm:$s10)>;
@@ -1221,18 +1222,20 @@ def: OpR_RR_pat<A2_vsubub, Sub, v8i8, V8I8>;
def: OpR_RR_pat<A2_vsubh, Sub, v4i16, V4I16>;
def: OpR_RR_pat<A2_vsubw, Sub, v2i32, V2I32>;
+def: OpR_RR_pat<A2_and, And, v4i8, V4I8>;
+def: OpR_RR_pat<A2_xor, Xor, v4i8, V4I8>;
+def: OpR_RR_pat<A2_or, Or, v4i8, V4I8>;
def: OpR_RR_pat<A2_and, And, v2i16, V2I16>;
def: OpR_RR_pat<A2_xor, Xor, v2i16, V2I16>;
def: OpR_RR_pat<A2_or, Or, v2i16, V2I16>;
-
def: OpR_RR_pat<A2_andp, And, v8i8, V8I8>;
-def: OpR_RR_pat<A2_andp, And, v4i16, V4I16>;
-def: OpR_RR_pat<A2_andp, And, v2i32, V2I32>;
def: OpR_RR_pat<A2_orp, Or, v8i8, V8I8>;
-def: OpR_RR_pat<A2_orp, Or, v4i16, V4I16>;
-def: OpR_RR_pat<A2_orp, Or, v2i32, V2I32>;
def: OpR_RR_pat<A2_xorp, Xor, v8i8, V8I8>;
+def: OpR_RR_pat<A2_andp, And, v4i16, V4I16>;
+def: OpR_RR_pat<A2_orp, Or, v4i16, V4I16>;
def: OpR_RR_pat<A2_xorp, Xor, v4i16, V4I16>;
+def: OpR_RR_pat<A2_andp, And, v2i32, V2I32>;
+def: OpR_RR_pat<A2_orp, Or, v2i32, V2I32>;
def: OpR_RR_pat<A2_xorp, Xor, v2i32, V2I32>;
def: OpR_RR_pat<M2_mpyi, Mul, i32, I32>;
@@ -1255,7 +1258,7 @@ def: OpR_RR_pat<C2_and, Mul, v2i1, V2I1>;
def: OpR_RR_pat<C2_and, Mul, v4i1, V4I1>;
def: OpR_RR_pat<C2_and, Mul, v8i1, V8I1>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: OpR_RR_pat<F2_sfadd, pf2<fadd>, f32, F32>;
def: OpR_RR_pat<F2_sfsub, pf2<fsub>, f32, F32>;
def: OpR_RR_pat<F2_sfmpy, pf2<fmul>, f32, F32>;
@@ -1268,12 +1271,62 @@ let Predicates = [HasV5T] in {
let AddedComplexity = 10 in {
def: AccRRI_pat<M2_macsip, Add, Su<Mul>, I32, u32_0ImmPred>;
def: AccRRI_pat<M2_macsin, Sub, Su<Mul>, I32, u32_0ImmPred>;
- def: AccRRR_pat<M2_maci, Add, Su<Mul>, I32, I32>;
+ def: AccRRR_pat<M2_maci, Add, Su<Mul>, I32, I32, I32>;
}
def: AccRRI_pat<M2_naccii, Sub, Su<Add>, I32, s32_0ImmPred>;
def: AccRRI_pat<M2_accii, Add, Su<Add>, I32, s32_0ImmPred>;
-def: AccRRR_pat<M2_acci, Add, Su<Add>, I32, I32>;
+def: AccRRR_pat<M2_acci, Add, Su<Add>, I32, I32, I32>;
+
+// Mulh for vectors
+//
+def: Pat<(v2i32 (mulhu V2I32:$Rss, V2I32:$Rtt)),
+ (Combinew (M2_mpyu_up (HiReg $Rss), (HiReg $Rtt)),
+ (M2_mpyu_up (LoReg $Rss), (LoReg $Rtt)))>;
+
+def: Pat<(v2i32 (mulhs V2I32:$Rs, V2I32:$Rt)),
+ (Combinew (M2_mpy_up (HiReg $Rs), (HiReg $Rt)),
+                   (M2_mpy_up (LoReg $Rs), (LoReg $Rt)))>;
+
+def Mulhub:
+ OutPatFrag<(ops node:$Rss, node:$Rtt),
+ (Combinew (S2_vtrunohb (M5_vmpybuu (HiReg $Rss), (HiReg $Rtt))),
+ (S2_vtrunohb (M5_vmpybuu (LoReg $Rss), (LoReg $Rtt))))>;
+
+// Equivalent of byte-wise arithmetic shift right by 7 in v8i8.
+def Asr7:
+ OutPatFrag<(ops node:$Rss), (C2_mask (C2_not (A4_vcmpbgti $Rss, 0)))>;
+
+def: Pat<(v8i8 (mulhu V8I8:$Rss, V8I8:$Rtt)),
+ (Mulhub $Rss, $Rtt)>;
+
+def: Pat<(v8i8 (mulhs V8I8:$Rss, V8I8:$Rtt)),
+ (A2_vsubub
+ (Mulhub $Rss, $Rtt),
+ (A2_vaddub (A2_andp V8I8:$Rss, (Asr7 $Rtt)),
+ (A2_andp V8I8:$Rtt, (Asr7 $Rss))))>;
+
+def Mpysh:
+ OutPatFrag<(ops node:$Rs, node:$Rt), (M2_vmpy2s_s0 $Rs, $Rt)>;
+def Mpyshh:
+ OutPatFrag<(ops node:$Rss, node:$Rtt), (Mpysh (HiReg $Rss), (HiReg $Rtt))>;
+def Mpyshl:
+ OutPatFrag<(ops node:$Rss, node:$Rtt), (Mpysh (LoReg $Rss), (LoReg $Rtt))>;
+
+def Mulhsh:
+ OutPatFrag<(ops node:$Rss, node:$Rtt),
+ (Combinew (A2_combine_hh (HiReg (Mpyshh $Rss, $Rtt)),
+ (LoReg (Mpyshh $Rss, $Rtt))),
+ (A2_combine_hh (HiReg (Mpyshl $Rss, $Rtt)),
+ (LoReg (Mpyshl $Rss, $Rtt))))>;
+
+def: Pat<(v4i16 (mulhs V4I16:$Rss, V4I16:$Rtt)), (Mulhsh $Rss, $Rtt)>;
+
+def: Pat<(v4i16 (mulhu V4I16:$Rss, V4I16:$Rtt)),
+ (A2_vaddh
+ (Mulhsh $Rss, $Rtt),
+ (A2_vaddh (A2_andp V4I16:$Rss, (S2_asr_i_vh $Rtt, 15)),
+ (A2_andp V4I16:$Rtt, (S2_asr_i_vh $Rss, 15))))>;
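The vector mulhu/mulhs patterns above rest on a lane-wise identity: reinterpreting an N-bit lane as unsigned adds 2^N exactly when its sign bit is set, so mulhu(a,b) = mulhs(a,b) + (a & (b >>s N-1)) + (b & (a >>s N-1)) modulo 2^N, and the v8i8 mulhs pattern applies the same identity in reverse. A minimal scalar sketch of the 16-bit case, added here as an editorial illustration rather than part of the patch:

#include <cassert>
#include <cstdint>

// mulhs/mulhu on a single 16-bit lane.
static uint16_t mulhs16(int16_t a, int16_t b) {
  return static_cast<uint16_t>((int32_t(a) * int32_t(b)) >> 16);
}
static uint16_t mulhu16(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((uint32_t(a) * uint32_t(b)) >> 16);
}

int main() {
  for (uint32_t a = 0; a < 0x10000; a += 251) {
    for (uint32_t b = 0; b < 0x10000; b += 257) {
      uint16_t sa = (a & 0x8000) ? 0xFFFFu : 0;  // lane-wise (a >>s 15)
      uint16_t sb = (b & 0x8000) ? 0xFFFFu : 0;  // lane-wise (b >>s 15)
      uint16_t fixed =
          uint16_t(mulhs16(int16_t(a), int16_t(b)) + (a & sb) + (b & sa));
      assert(fixed == mulhu16(uint16_t(a), uint16_t(b)));
    }
  }
  return 0;
}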
def: Pat<(ineg (mul I32:$Rs, u8_0ImmPred:$u8)),
@@ -1291,24 +1344,24 @@ def: Pat<(mul I32:$Rs, n8_0ImmPred:$n8),
def: Pat<(add Sext64:$Rs, I64:$Rt),
(A2_addsp (LoReg Sext64:$Rs), I64:$Rt)>;
-def: AccRRR_pat<M4_and_and, And, Su<And>, I32, I32>;
-def: AccRRR_pat<M4_and_or, And, Su<Or>, I32, I32>;
-def: AccRRR_pat<M4_and_xor, And, Su<Xor>, I32, I32>;
-def: AccRRR_pat<M4_or_and, Or, Su<And>, I32, I32>;
-def: AccRRR_pat<M4_or_or, Or, Su<Or>, I32, I32>;
-def: AccRRR_pat<M4_or_xor, Or, Su<Xor>, I32, I32>;
-def: AccRRR_pat<M4_xor_and, Xor, Su<And>, I32, I32>;
-def: AccRRR_pat<M4_xor_or, Xor, Su<Or>, I32, I32>;
-def: AccRRR_pat<M2_xor_xacc, Xor, Su<Xor>, I32, I32>;
-def: AccRRR_pat<M4_xor_xacc, Xor, Su<Xor>, I64, I64>;
+def: AccRRR_pat<M4_and_and, And, Su<And>, I32, I32, I32>;
+def: AccRRR_pat<M4_and_or, And, Su<Or>, I32, I32, I32>;
+def: AccRRR_pat<M4_and_xor, And, Su<Xor>, I32, I32, I32>;
+def: AccRRR_pat<M4_or_and, Or, Su<And>, I32, I32, I32>;
+def: AccRRR_pat<M4_or_or, Or, Su<Or>, I32, I32, I32>;
+def: AccRRR_pat<M4_or_xor, Or, Su<Xor>, I32, I32, I32>;
+def: AccRRR_pat<M4_xor_and, Xor, Su<And>, I32, I32, I32>;
+def: AccRRR_pat<M4_xor_or, Xor, Su<Or>, I32, I32, I32>;
+def: AccRRR_pat<M2_xor_xacc, Xor, Su<Xor>, I32, I32, I32>;
+def: AccRRR_pat<M4_xor_xacc, Xor, Su<Xor>, I64, I64, I64>;
// For dags like (or (and (not _), _), (shl _, _)) where the "or" with
// one argument matches the patterns below, and with the other argument
// matches S2_asl_r_r_or, etc, prefer the patterns below.
let AddedComplexity = 110 in { // greater than S2_asl_r_r_and/or/xor.
- def: AccRRR_pat<M4_and_andn, And, Su<Not2<And>>, I32, I32>;
- def: AccRRR_pat<M4_or_andn, Or, Su<Not2<And>>, I32, I32>;
- def: AccRRR_pat<M4_xor_andn, Xor, Su<Not2<And>>, I32, I32>;
+ def: AccRRR_pat<M4_and_andn, And, Su<Not2<And>>, I32, I32, I32>;
+ def: AccRRR_pat<M4_or_andn, Or, Su<Not2<And>>, I32, I32, I32>;
+ def: AccRRR_pat<M4_xor_andn, Xor, Su<Not2<And>>, I32, I32, I32>;
}
// S4_addaddi and S4_subaddi don't have tied operands, so give them
@@ -1444,7 +1497,7 @@ def: Pat<(add I32:$Ru, (Su<Mul> I32:$Ry, I32:$Rs)),
(M4_mpyrr_addr IntRegs:$Ru, IntRegs:$Ry, IntRegs:$Rs)>;
-let Predicates = [HasV5T] in {
+let Predicates = [HasV5] in {
def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
(F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
@@ -1479,13 +1532,13 @@ def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
// Multiplies two v4i8 vectors.
def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
(S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>,
- Requires<[HasV5T]>;
+ Requires<[HasV5]>;
// Multiplies two v8i8 vectors.
def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
(Combinew (S2_vtrunehb (M5_vmpybuu (HiReg $Rs), (HiReg $Rt))),
(S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>,
- Requires<[HasV5T]>;
+ Requires<[HasV5]>;
// --(10) Bit ------------------------------------------------------------
@@ -1519,7 +1572,6 @@ def: Pat<(i32 (ctpop I32:$Rs)), (S5_popcountp (A4_combineir 0, I32:$Rs))>;
def: Pat<(bitreverse I32:$Rs), (S2_brev I32:$Rs)>;
def: Pat<(bitreverse I64:$Rss), (S2_brevp I64:$Rss)>;
-
let AddedComplexity = 20 in { // Complexity greater than and/or/xor
def: Pat<(and I32:$Rs, IsNPow2_32:$V),
(S2_clrbit_i IntRegs:$Rs, (LogN2_32 $V))>;
@@ -1582,6 +1634,15 @@ let AddedComplexity = 10 in // Complexity greater than compare reg-reg.
def: Pat<(i1 (seteq (and I32:$Rs, I32:$Rt), IntRegs:$Rt)),
(C2_bitsset IntRegs:$Rs, IntRegs:$Rt)>;
+def SDTTestBit:
+ SDTypeProfile<1, 2, [SDTCisVT<0, i1>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
+def HexagonTSTBIT: SDNode<"HexagonISD::TSTBIT", SDTTestBit>;
+
+def: Pat<(HexagonTSTBIT I32:$Rs, u5_0ImmPred:$u5),
+ (S2_tstbit_i I32:$Rs, imm:$u5)>;
+def: Pat<(HexagonTSTBIT I32:$Rs, I32:$Rt),
+ (S2_tstbit_r I32:$Rs, I32:$Rt)>;
+
let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
(S4_ntstbit_i I32:$Rs, imm:$u5)>;
@@ -1790,7 +1851,12 @@ let AddedComplexity = 20 in {
defm: Loadxi_pat<zextloadv2i8, v2i16, anyimm1, L2_loadbzw2_io>;
defm: Loadxi_pat<zextloadv4i8, v4i16, anyimm2, L2_loadbzw4_io>;
defm: Loadxi_pat<load, i32, anyimm2, L2_loadri_io>;
+ defm: Loadxi_pat<load, v2i16, anyimm2, L2_loadri_io>;
+ defm: Loadxi_pat<load, v4i8, anyimm2, L2_loadri_io>;
defm: Loadxi_pat<load, i64, anyimm3, L2_loadrd_io>;
+ defm: Loadxi_pat<load, v2i32, anyimm3, L2_loadrd_io>;
+ defm: Loadxi_pat<load, v4i16, anyimm3, L2_loadrd_io>;
+ defm: Loadxi_pat<load, v8i8, anyimm3, L2_loadrd_io>;
defm: Loadxi_pat<load, f32, anyimm2, L2_loadri_io>;
defm: Loadxi_pat<load, f64, anyimm3, L2_loadrd_io>;
// No sextloadi1.
@@ -1828,10 +1894,15 @@ let AddedComplexity = 60 in {
def: Loadxu_pat<zextloadi16, i32, anyimm1, L4_loadruh_ur>;
def: Loadxu_pat<zextloadv2i8, v2i16, anyimm1, L4_loadbzw2_ur>;
def: Loadxu_pat<zextloadv4i8, v4i16, anyimm2, L4_loadbzw4_ur>;
- def: Loadxu_pat<load, f32, anyimm2, L4_loadri_ur>;
- def: Loadxu_pat<load, f64, anyimm3, L4_loadrd_ur>;
def: Loadxu_pat<load, i32, anyimm2, L4_loadri_ur>;
+ def: Loadxu_pat<load, v2i16, anyimm2, L4_loadri_ur>;
+ def: Loadxu_pat<load, v4i8, anyimm2, L4_loadri_ur>;
def: Loadxu_pat<load, i64, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, v2i32, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, v4i16, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, v8i8, anyimm3, L4_loadrd_ur>;
+ def: Loadxu_pat<load, f32, anyimm2, L4_loadri_ur>;
+ def: Loadxu_pat<load, f64, anyimm3, L4_loadrd_ur>;
def: Loadxum_pat<sextloadi8, i64, anyimm0, ToSext64, L4_loadrb_ur>;
def: Loadxum_pat<zextloadi8, i64, anyimm0, ToZext64, L4_loadrub_ur>;
@@ -1845,29 +1916,39 @@ let AddedComplexity = 60 in {
}
let AddedComplexity = 40 in {
- def: Loadxr_shl_pat<extloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_shl_pat<zextloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_shl_pat<sextloadi8, i32, L4_loadrb_rr>;
- def: Loadxr_shl_pat<extloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_shl_pat<zextloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_shl_pat<sextloadi16, i32, L4_loadrh_rr>;
- def: Loadxr_shl_pat<load, i32, L4_loadri_rr>;
- def: Loadxr_shl_pat<load, i64, L4_loadrd_rr>;
- def: Loadxr_shl_pat<load, f32, L4_loadri_rr>;
- def: Loadxr_shl_pat<load, f64, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_shl_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_shl_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxr_shl_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_shl_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_shl_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxr_shl_pat<load, i32, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, v2i16, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, v4i8, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, i64, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, v2i32, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, v4i16, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, v8i8, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, f32, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, f64, L4_loadrd_rr>;
}
let AddedComplexity = 20 in {
- def: Loadxr_add_pat<extloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_add_pat<zextloadi8, i32, L4_loadrub_rr>;
- def: Loadxr_add_pat<sextloadi8, i32, L4_loadrb_rr>;
- def: Loadxr_add_pat<extloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_add_pat<zextloadi16, i32, L4_loadruh_rr>;
- def: Loadxr_add_pat<sextloadi16, i32, L4_loadrh_rr>;
- def: Loadxr_add_pat<load, i32, L4_loadri_rr>;
- def: Loadxr_add_pat<load, i64, L4_loadrd_rr>;
- def: Loadxr_add_pat<load, f32, L4_loadri_rr>;
- def: Loadxr_add_pat<load, f64, L4_loadrd_rr>;
+ def: Loadxr_add_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_add_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_add_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxr_add_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_add_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_add_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxr_add_pat<load, i32, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, v2i16, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, v4i8, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, i64, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, v2i32, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, v4i16, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, v8i8, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, f32, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, f64, L4_loadrd_rr>;
}
let AddedComplexity = 40 in {
@@ -1897,17 +1978,22 @@ let AddedComplexity = 20 in {
// Absolute address
let AddedComplexity = 60 in {
- def: Loada_pat<zextloadi1, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<sextloadi8, i32, anyimm0, PS_loadrbabs>;
- def: Loada_pat<extloadi8, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<zextloadi8, i32, anyimm0, PS_loadrubabs>;
- def: Loada_pat<sextloadi16, i32, anyimm1, PS_loadrhabs>;
- def: Loada_pat<extloadi16, i32, anyimm1, PS_loadruhabs>;
- def: Loada_pat<zextloadi16, i32, anyimm1, PS_loadruhabs>;
- def: Loada_pat<load, i32, anyimm2, PS_loadriabs>;
- def: Loada_pat<load, i64, anyimm3, PS_loadrdabs>;
- def: Loada_pat<load, f32, anyimm2, PS_loadriabs>;
- def: Loada_pat<load, f64, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<zextloadi1, i32, anyimm0, PS_loadrubabs>;
+ def: Loada_pat<sextloadi8, i32, anyimm0, PS_loadrbabs>;
+ def: Loada_pat<extloadi8, i32, anyimm0, PS_loadrubabs>;
+ def: Loada_pat<zextloadi8, i32, anyimm0, PS_loadrubabs>;
+ def: Loada_pat<sextloadi16, i32, anyimm1, PS_loadrhabs>;
+ def: Loada_pat<extloadi16, i32, anyimm1, PS_loadruhabs>;
+ def: Loada_pat<zextloadi16, i32, anyimm1, PS_loadruhabs>;
+ def: Loada_pat<load, i32, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, v2i16, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, v4i8, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, i64, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, v2i32, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, v4i16, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, v8i8, anyimm3, PS_loadrdabs>;
+ def: Loada_pat<load, f32, anyimm2, PS_loadriabs>;
+ def: Loada_pat<load, f64, anyimm3, PS_loadrdabs>;
def: Loada_pat<atomic_load_8, i32, anyimm0, PS_loadrubabs>;
def: Loada_pat<atomic_load_16, i32, anyimm1, PS_loadruhabs>;
@@ -1933,18 +2019,23 @@ let AddedComplexity = 30 in {
// GP-relative address
let AddedComplexity = 100 in {
- def: Loada_pat<extloadi1, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<zextloadi1, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<extloadi8, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<sextloadi8, i32, addrgp, L2_loadrbgp>;
- def: Loada_pat<zextloadi8, i32, addrgp, L2_loadrubgp>;
- def: Loada_pat<extloadi16, i32, addrgp, L2_loadruhgp>;
- def: Loada_pat<sextloadi16, i32, addrgp, L2_loadrhgp>;
- def: Loada_pat<zextloadi16, i32, addrgp, L2_loadruhgp>;
- def: Loada_pat<load, i32, addrgp, L2_loadrigp>;
- def: Loada_pat<load, i64, addrgp, L2_loadrdgp>;
- def: Loada_pat<load, f32, addrgp, L2_loadrigp>;
- def: Loada_pat<load, f64, addrgp, L2_loadrdgp>;
+ def: Loada_pat<extloadi1, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<zextloadi1, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<extloadi8, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<sextloadi8, i32, addrgp, L2_loadrbgp>;
+ def: Loada_pat<zextloadi8, i32, addrgp, L2_loadrubgp>;
+ def: Loada_pat<extloadi16, i32, addrgp, L2_loadruhgp>;
+ def: Loada_pat<sextloadi16, i32, addrgp, L2_loadrhgp>;
+ def: Loada_pat<zextloadi16, i32, addrgp, L2_loadruhgp>;
+ def: Loada_pat<load, i32, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, v2i16, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, v4i8, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, i64, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, v2i32, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, v4i16, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, v8i8, addrgp, L2_loadrdgp>;
+ def: Loada_pat<load, f32, addrgp, L2_loadrigp>;
+ def: Loada_pat<load, f64, addrgp, L2_loadrdgp>;
def: Loada_pat<atomic_load_8, i32, addrgp, L2_loadrubgp>;
def: Loada_pat<atomic_load_16, i32, addrgp, L2_loadruhgp>;
@@ -1983,46 +2074,10 @@ def: Pat<(i1 (load (add I32:$Rs, anyimm0:$Off))),
def: Pat<(i1 (load I32:$Rs)),
(C2_tfrrp (L2_loadrub_io IntRegs:$Rs, 0))>;
-// HVX loads
-
-multiclass HvxLd_pat<InstHexagon MI, PatFrag Load, ValueType VT,
- PatFrag ImmPred> {
- def: Pat<(VT (Load I32:$Rt)), (MI I32:$Rt, 0)>;
- def: Pat<(VT (Load (add I32:$Rt, ImmPred:$s))), (MI I32:$Rt, imm:$s)>;
- // The HVX selection code for shuffles can generate vector constants.
- // Calling "Select" on the resulting loads from CP fails without these
- // patterns.
- def: Pat<(VT (Load (HexagonCP tconstpool:$A))), (MI (A2_tfrsi imm:$A), 0)>;
- def: Pat<(VT (Load (HexagonAtPcrel tconstpool:$A))),
- (MI (C4_addipc imm:$A), 0)>;
-}
-
-
-let Predicates = [UseHVX] in {
- multiclass HvxLdVs_pat<InstHexagon MI, PatFrag Load> {
- defm: HvxLd_pat<MI, Load, VecI8, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecI16, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecI32, IsVecOff>;
- }
- defm: HvxLdVs_pat<V6_vL32b_nt_ai, alignednontemporalload>;
- defm: HvxLdVs_pat<V6_vL32b_ai, alignedload>;
- defm: HvxLdVs_pat<V6_vL32Ub_ai, unalignedload>;
-
- multiclass HvxLdWs_pat<InstHexagon MI, PatFrag Load> {
- defm: HvxLd_pat<MI, Load, VecPI8, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecPI16, IsVecOff>;
- defm: HvxLd_pat<MI, Load, VecPI32, IsVecOff>;
- }
- defm: HvxLdWs_pat<PS_vloadrw_nt_ai, alignednontemporalload>;
- defm: HvxLdWs_pat<PS_vloadrw_ai, alignedload>;
- defm: HvxLdWs_pat<PS_vloadrwu_ai, unalignedload>;
-}
-
// --(13) Store ----------------------------------------------------------
//
-
class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset, InstHexagon MI>
: Pat<(Store Value:$Rt, I32:$Rx, Offset:$s4),
(MI I32:$Rx, imm:$s4, Value:$Rt)>;
@@ -2135,7 +2190,7 @@ class Stoream_pat<PatFrag Store, PatFrag Value, PatFrag Addr, PatFrag ValueMod,
// swapped. This relies on the knowledge that the F.Fragment uses names
// "ptr" and "val".
class AtomSt<PatFrag F>
- : PatFrag<(ops node:$val, node:$ptr), F.Fragment, F.PredicateCode,
+ : PatFrag<(ops node:$val, node:$ptr), !head(F.Fragments), F.PredicateCode,
F.OperandTransform> {
let IsAtomic = F.IsAtomic;
let MemoryVT = F.MemoryVT;
@@ -2459,36 +2514,6 @@ let AddedComplexity = 10 in {
def: Storexi_base_pat<AtomSt<atomic_store_64>, I64, S2_storerd_io>;
}
-// HVX stores
-
-multiclass HvxSt_pat<InstHexagon MI, PatFrag Store, PatFrag ImmPred,
- PatFrag Value> {
- def: Pat<(Store Value:$Vs, I32:$Rt),
- (MI I32:$Rt, 0, Value:$Vs)>;
- def: Pat<(Store Value:$Vs, (add I32:$Rt, ImmPred:$s)),
- (MI I32:$Rt, imm:$s, Value:$Vs)>;
-}
-
-let Predicates = [UseHVX] in {
- multiclass HvxStVs_pat<InstHexagon MI, PatFrag Store> {
- defm: HvxSt_pat<MI, Store, IsVecOff, HVI8>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HVI16>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HVI32>;
- }
- defm: HvxStVs_pat<V6_vS32b_nt_ai, alignednontemporalstore>;
- defm: HvxStVs_pat<V6_vS32b_ai, alignedstore>;
- defm: HvxStVs_pat<V6_vS32Ub_ai, unalignedstore>;
-
- multiclass HvxStWs_pat<InstHexagon MI, PatFrag Store> {
- defm: HvxSt_pat<MI, Store, IsVecOff, HWI8>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HWI16>;
- defm: HvxSt_pat<MI, Store, IsVecOff, HWI32>;
- }
- defm: HvxStWs_pat<PS_vstorerw_nt_ai, alignednontemporalstore>;
- defm: HvxStWs_pat<PS_vstorerw_ai, alignedstore>;
- defm: HvxStWs_pat<PS_vstorerwu_ai, unalignedstore>;
-}
-
// --(14) Memop ----------------------------------------------------------
//
@@ -2570,8 +2595,10 @@ multiclass Memopxr_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
multiclass Memopxr_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
SDNode Oper, InstHexagon MI> {
- defm: Memopxr_base_pat <Load, Store, Oper, MI>;
- defm: Memopxr_add_pat <Load, Store, ImmPred, Oper, MI>;
+ let Predicates = [UseMEMOPS] in {
+ defm: Memopxr_base_pat <Load, Store, Oper, MI>;
+ defm: Memopxr_add_pat <Load, Store, ImmPred, Oper, MI>;
+ }
}
let AddedComplexity = 200 in {
@@ -2669,8 +2696,10 @@ multiclass Memopxi_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
multiclass Memopxi_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
InstHexagon MI> {
- defm: Memopxi_base_pat <Load, Store, Oper, Arg, ArgMod, MI>;
- defm: Memopxi_add_pat <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+ let Predicates = [UseMEMOPS] in {
+ defm: Memopxi_base_pat <Load, Store, Oper, Arg, ArgMod, MI>;
+ defm: Memopxi_add_pat <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+ }
}
let AddedComplexity = 220 in {
@@ -2829,6 +2858,8 @@ def: Pat<(brcond (not I1:$Pu), bb:$dst),
(J2_jumpf I1:$Pu, bb:$dst)>;
def: Pat<(brcond (i1 (setne I1:$Pu, -1)), bb:$dst),
(J2_jumpf I1:$Pu, bb:$dst)>;
+def: Pat<(brcond (i1 (seteq I1:$Pu, 0)), bb:$dst),
+ (J2_jumpf I1:$Pu, bb:$dst)>;
def: Pat<(brcond (i1 (setne I1:$Pu, 0)), bb:$dst),
(J2_jumpt I1:$Pu, bb:$dst)>;
@@ -2898,107 +2929,17 @@ def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
-
-def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>;
-
-def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,
- [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;
-def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>;
-
-def SDTHexagonVINSERTW0: SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>;
-def HexagonVINSERTW0 : SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>;
-
-def Combinev: OutPatFrag<(ops node:$Rs, node:$Rt),
- (REG_SEQUENCE HvxWR, $Rs, vsub_hi, $Rt, vsub_lo)>;
-
-def LoVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_lo)>;
-def HiVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_hi)>;
-
-let Predicates = [UseHVX] in {
- def: OpR_RR_pat<V6_vpackeb, pf2<HexagonVPACKE>, VecI8, HVI8>;
- def: OpR_RR_pat<V6_vpackob, pf2<HexagonVPACKO>, VecI8, HVI8>;
- def: OpR_RR_pat<V6_vpackeh, pf2<HexagonVPACKE>, VecI16, HVI16>;
- def: OpR_RR_pat<V6_vpackoh, pf2<HexagonVPACKO>, VecI16, HVI16>;
-}
-
-def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>;
-def vzero: PatFrag<(ops), (HexagonVZERO)>;
-
-def VSxtb: OutPatFrag<(ops node:$Vs),
- (V6_vshuffvdd (HiVec (V6_vsb $Vs)),
- (LoVec (V6_vsb $Vs)),
- (A2_tfrsi -2))>;
-def VSxth: OutPatFrag<(ops node:$Vs),
- (V6_vshuffvdd (HiVec (V6_vsh $Vs)),
- (LoVec (V6_vsh $Vs)),
- (A2_tfrsi -4))>;
-def VZxtb: OutPatFrag<(ops node:$Vs),
- (V6_vshuffvdd (HiVec (V6_vzb $Vs)),
- (LoVec (V6_vzb $Vs)),
- (A2_tfrsi -2))>;
-def VZxth: OutPatFrag<(ops node:$Vs),
- (V6_vshuffvdd (HiVec (V6_vzh $Vs)),
- (LoVec (V6_vzh $Vs)),
- (A2_tfrsi -4))>;
-
-let Predicates = [UseHVX] in {
- def: Pat<(VecI8 vzero), (V6_vd0)>;
- def: Pat<(VecI16 vzero), (V6_vd0)>;
- def: Pat<(VecI32 vzero), (V6_vd0)>;
-
- def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),
- (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
- def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)),
- (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
- def: Pat<(VecPI32 (concat_vectors HVI32:$Vs, HVI32:$Vt)),
- (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
-
- def: Pat<(HexagonVEXTRACTW HVI8:$Vu, I32:$Rs),
- (V6_extractw HvxVR:$Vu, I32:$Rs)>;
- def: Pat<(HexagonVEXTRACTW HVI16:$Vu, I32:$Rs),
- (V6_extractw HvxVR:$Vu, I32:$Rs)>;
- def: Pat<(HexagonVEXTRACTW HVI32:$Vu, I32:$Rs),
- (V6_extractw HvxVR:$Vu, I32:$Rs)>;
-
- def: Pat<(HexagonVINSERTW0 HVI8:$Vu, I32:$Rt),
- (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
- def: Pat<(HexagonVINSERTW0 HVI16:$Vu, I32:$Rt),
- (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
- def: Pat<(HexagonVINSERTW0 HVI32:$Vu, I32:$Rt),
- (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
-
- def: Pat<(add HVI8:$Vs, HVI8:$Vt), (V6_vaddb HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(add HVI16:$Vs, HVI16:$Vt), (V6_vaddh HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(add HVI32:$Vs, HVI32:$Vt), (V6_vaddw HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(sub HVI8:$Vs, HVI8:$Vt), (V6_vsubb HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(sub HVI16:$Vs, HVI16:$Vt), (V6_vsubh HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(sub HVI32:$Vs, HVI32:$Vt), (V6_vsubw HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(and HVI8:$Vs, HVI8:$Vt), (V6_vand HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(or HVI8:$Vs, HVI8:$Vt), (V6_vor HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(xor HVI8:$Vs, HVI8:$Vt), (V6_vxor HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(vselect HQ8:$Qu, HVI8:$Vs, HVI8:$Vt),
- (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(vselect HQ16:$Qu, HVI16:$Vs, HVI16:$Vt),
- (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(vselect HQ32:$Qu, HVI32:$Vs, HVI32:$Vt),
- (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
-
- def: Pat<(VecPI16 (sext HVI8:$Vs)), (VSxtb $Vs)>;
- def: Pat<(VecPI32 (sext HVI16:$Vs)), (VSxth $Vs)>;
- def: Pat<(VecPI16 (zext HVI8:$Vs)), (VZxtb $Vs)>;
- def: Pat<(VecPI32 (zext HVI16:$Vs)), (VZxth $Vs)>;
-
- def: Pat<(VecI16 (sext_invec HVI8:$Vs)), (LoVec (VSxtb $Vs))>;
- def: Pat<(VecI32 (sext_invec HVI16:$Vs)), (LoVec (VSxth $Vs))>;
- def: Pat<(VecI32 (sext_invec HVI8:$Vs)),
- (LoVec (VSxth (LoVec (VSxtb $Vs))))>;
-
- def: Pat<(VecI16 (zext_invec HVI8:$Vs)), (LoVec (VZxtb $Vs))>;
- def: Pat<(VecI32 (zext_invec HVI16:$Vs)), (LoVec (VZxth $Vs))>;
- def: Pat<(VecI32 (zext_invec HVI8:$Vs)),
- (LoVec (VZxth (LoVec (VZxtb $Vs))))>;
+// The declared return value of the store-locked intrinsics is i32, but
+// the instructions actually define i1. To avoid register copies from
+// IntRegs to PredRegs and back, fold the entire pattern that checks the
+// result against true/false.
+let AddedComplexity = 100 in {
+ def: Pat<(i1 (setne (int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt), 0)),
+ (S2_storew_locked I32:$Rs, I32:$Rt)>;
+ def: Pat<(i1 (seteq (int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt), 0)),
+ (C2_not (S2_storew_locked I32:$Rs, I32:$Rt))>;
+ def: Pat<(i1 (setne (int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt), 0)),
+ (S4_stored_locked I32:$Rs, I64:$Rt)>;
+ def: Pat<(i1 (seteq (int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt), 0)),
+ (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/contrib/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
new file mode 100644
index 000000000000..a4cfca9ac7d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -0,0 +1,497 @@
+def SDTVecLeaf:
+ SDTypeProfile<1, 0, [SDTCisVec<0>]>;
+def SDTVecBinOp:
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>;
+
+def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;
+def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>;
+
+def SDTHexagonVINSERTW0: SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>;
+def HexagonVINSERTW0: SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>;
+
+def SDTHexagonVSPLATW: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def HexagonVSPLATW: SDNode<"HexagonISD::VSPLATW", SDTHexagonVSPLATW>;
+
+def HwLen2: SDNodeXForm<imm, [{
+ const auto &ST = static_cast<const HexagonSubtarget&>(CurDAG->getSubtarget());
+ return CurDAG->getTargetConstant(ST.getVectorLength()/2, SDLoc(N), MVT::i32);
+}]>;
+
+def Q2V: OutPatFrag<(ops node:$Qs), (V6_vandqrt $Qs, (A2_tfrsi -1))>;
+
+def Combinev: OutPatFrag<(ops node:$Vs, node:$Vt),
+ (REG_SEQUENCE HvxWR, $Vs, vsub_hi, $Vt, vsub_lo)>;
+
+def Combineq: OutPatFrag<(ops node:$Qs, node:$Qt),
+ (V6_vandvrt
+ (V6_vor
+ (V6_vror (V6_vpackeb (V6_vd0), (Q2V $Qs)),
+ (A2_tfrsi (HwLen2 (i32 0)))), // Half the vector length
+ (V6_vpackeb (V6_vd0), (Q2V $Qt))),
+ (A2_tfrsi -1))>;
+
+def LoVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_lo)>;
+def HiVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_hi)>;
+
+def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>;
+def HexagonQCAT: SDNode<"HexagonISD::QCAT", SDTVecBinOp>;
+def HexagonQTRUE: SDNode<"HexagonISD::QTRUE", SDTVecLeaf>;
+def HexagonQFALSE: SDNode<"HexagonISD::QFALSE", SDTVecLeaf>;
+
+def vzero: PatFrag<(ops), (HexagonVZERO)>;
+def qtrue: PatFrag<(ops), (HexagonQTRUE)>;
+def qfalse: PatFrag<(ops), (HexagonQFALSE)>;
+def qcat: PatFrag<(ops node:$Qs, node:$Qt),
+ (HexagonQCAT node:$Qs, node:$Qt)>;
+
+def qnot: PatFrag<(ops node:$Qs), (xor node:$Qs, qtrue)>;
+
+def VSxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackb $Vs)>;
+def VSxth: OutPatFrag<(ops node:$Vs), (V6_vunpackh $Vs)>;
+def VZxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackub $Vs)>;
+def VZxth: OutPatFrag<(ops node:$Vs), (V6_vunpackuh $Vs)>;
+
+def SplatB: SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ assert(isUInt<8>(V));
+ uint32_t S = V << 24 | V << 16 | V << 8 | V;
+ return CurDAG->getTargetConstant(S, SDLoc(N), MVT::i32);
+}]>;
+
+def SplatH: SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ assert(isUInt<16>(V));
+ return CurDAG->getTargetConstant(V << 16 | V, SDLoc(N), MVT::i32);
+}]>;
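SplatB and SplatH simply replicate the byte or halfword immediate across a 32-bit word, which V6_lvsplatw then broadcasts to every word lane. The same replication in plain C++, as an editorial sketch:

#include <cassert>
#include <cstdint>

// Replicate a byte / halfword across a 32-bit word, mirroring SplatB/SplatH.
static uint32_t splatB(uint32_t v) { return v << 24 | v << 16 | v << 8 | v; }
static uint32_t splatH(uint32_t v) { return v << 16 | v; }

int main() {
  assert(splatB(0x5A) == 0x5A5A5A5Au);
  assert(splatH(0xBEEF) == 0xBEEFBEEFu);
  return 0;
}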
+
+def IsVecOff : PatLeaf<(i32 imm), [{
+ int32_t V = N->getSExtValue();
+ int32_t VecSize = HRI->getSpillSize(Hexagon::HvxVRRegClass);
+ assert(isPowerOf2_32(VecSize));
+ if ((uint32_t(V) & (uint32_t(VecSize)-1)) != 0)
+ return false;
+ int32_t L = Log2_32(VecSize);
+ return isInt<4>(V >> L);
+}]>;
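IsVecOff accepts an offset only when it is a whole multiple of the HVX vector register size and the scaled value fits in a signed 4-bit field; for a 64-byte vector that means multiples of 64 in the range -512..448. A small editorial check of that arithmetic, assuming a 64-byte spill size (the real predicate queries the register info for it):

#include <cassert>
#include <cstdint>

// Mirror of the IsVecOff check for a 64-byte vector register.
static bool isVecOff64(int32_t v) {
  if (v & 63)                         // must be a multiple of the vector size
    return false;
  int32_t scaled = v >> 6;            // log2(64) == 6
  return scaled >= -8 && scaled <= 7; // isInt<4>
}

int main() {
  assert(isVecOff64(-512) && isVecOff64(0) && isVecOff64(448));
  assert(!isVecOff64(449) && !isVecOff64(512) && !isVecOff64(-576));
  return 0;
}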
+
+
+def alignedload: PatFrag<(ops node:$a), (load $a), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedload: PatFrag<(ops node:$a), (load $a), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def alignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+
+// HVX loads
+
+multiclass HvxLd_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
+ PatFrag ImmPred> {
+ def: Pat<(ResType (Load I32:$Rt)),
+ (MI I32:$Rt, 0)>;
+ def: Pat<(ResType (Load (add I32:$Rt, ImmPred:$s))),
+ (MI I32:$Rt, imm:$s)>;
+ // The HVX selection code for shuffles can generate vector constants.
+ // Calling "Select" on the resulting loads from CP fails without these
+ // patterns.
+ def: Pat<(ResType (Load (HexagonCP tconstpool:$A))),
+ (MI (A2_tfrsi imm:$A), 0)>;
+ def: Pat<(ResType (Load (HexagonAtPcrel tconstpool:$A))),
+ (MI (C4_addipc imm:$A), 0)>;
+}
+
+multiclass HvxLda_pat<InstHexagon MI, PatFrag Load, ValueType ResType,
+ PatFrag ImmPred> {
+ let AddedComplexity = 50 in {
+ def: Pat<(ResType (Load (valignaddr I32:$Rt))),
+ (MI I32:$Rt, 0)>;
+ def: Pat<(ResType (Load (add (valignaddr I32:$Rt), ImmPred:$Off))),
+ (MI I32:$Rt, imm:$Off)>;
+ }
+ defm: HvxLd_pat<MI, Load, ResType, ImmPred>;
+}
+
+let Predicates = [UseHVX] in {
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI8, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI16, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecI32, IsVecOff>;
+
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI8, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI16, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI32, IsVecOff>;
+
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI8, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI16, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI32, IsVecOff>;
+}
+
+// HVX stores
+
+multiclass HvxSt_pat<InstHexagon MI, PatFrag Store, PatFrag ImmPred,
+ PatFrag Value> {
+ def: Pat<(Store Value:$Vs, I32:$Rt),
+ (MI I32:$Rt, 0, Value:$Vs)>;
+ def: Pat<(Store Value:$Vs, (add I32:$Rt, ImmPred:$s)),
+ (MI I32:$Rt, imm:$s, Value:$Vs)>;
+}
+
+let Predicates = [UseHVX] in {
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI8>;
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI16>;
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, IsVecOff, HVI32>;
+
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI8>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI16>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, IsVecOff, HVI32>;
+
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI8>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI16>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, IsVecOff, HVI32>;
+}
+
+// Bitcasts between same-size vector types are no-ops, except for the
+// actual type change.
+class Bitcast<ValueType ResTy, ValueType InpTy, RegisterClass RC>
+ : Pat<(ResTy (bitconvert (InpTy RC:$Val))), (ResTy RC:$Val)>;
+
+let Predicates = [UseHVX] in {
+ def: Bitcast<VecI8, VecI16, HvxVR>;
+ def: Bitcast<VecI8, VecI32, HvxVR>;
+ def: Bitcast<VecI16, VecI8, HvxVR>;
+ def: Bitcast<VecI16, VecI32, HvxVR>;
+ def: Bitcast<VecI32, VecI8, HvxVR>;
+ def: Bitcast<VecI32, VecI16, HvxVR>;
+
+ def: Bitcast<VecPI8, VecPI16, HvxWR>;
+ def: Bitcast<VecPI8, VecPI32, HvxWR>;
+ def: Bitcast<VecPI16, VecPI8, HvxWR>;
+ def: Bitcast<VecPI16, VecPI32, HvxWR>;
+ def: Bitcast<VecPI32, VecPI8, HvxWR>;
+ def: Bitcast<VecPI32, VecPI16, HvxWR>;
+}
+
+let Predicates = [UseHVX] in {
+ def: Pat<(VecI8 vzero), (V6_vd0)>;
+ def: Pat<(VecI16 vzero), (V6_vd0)>;
+ def: Pat<(VecI32 vzero), (V6_vd0)>;
+ def: Pat<(VecPI8 vzero), (PS_vdd0)>;
+ def: Pat<(VecPI16 vzero), (PS_vdd0)>;
+ def: Pat<(VecPI32 vzero), (PS_vdd0)>;
+
+ def: Pat<(concat_vectors (VecI8 vzero), (VecI8 vzero)), (PS_vdd0)>;
+ def: Pat<(concat_vectors (VecI16 vzero), (VecI16 vzero)), (PS_vdd0)>;
+ def: Pat<(concat_vectors (VecI32 vzero), (VecI32 vzero)), (PS_vdd0)>;
+
+ def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(VecPI32 (concat_vectors HVI32:$Vs, HVI32:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+
+ def: Pat<(VecQ8 (qcat HQ16:$Qs, HQ16:$Qt)), (Combineq $Qt, $Qs)>;
+ def: Pat<(VecQ16 (qcat HQ32:$Qs, HQ32:$Qt)), (Combineq $Qt, $Qs)>;
+
+ def: Pat<(HexagonVEXTRACTW HVI8:$Vu, I32:$Rs),
+ (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+ def: Pat<(HexagonVEXTRACTW HVI16:$Vu, I32:$Rs),
+ (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+ def: Pat<(HexagonVEXTRACTW HVI32:$Vu, I32:$Rs),
+ (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+
+ def: Pat<(HexagonVINSERTW0 HVI8:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+ def: Pat<(HexagonVINSERTW0 HVI16:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+ def: Pat<(HexagonVINSERTW0 HVI32:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+}
+
+def Vsplatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>;
+def Vsplatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>;
+def Vsplatiw: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 $V))>;
+
+def Vsplatrb: OutPatFrag<(ops node:$Rs), (V6_lvsplatw (S2_vsplatrb $Rs))>;
+def Vsplatrh: OutPatFrag<(ops node:$Rs),
+ (V6_lvsplatw (A2_combine_ll $Rs, $Rs))>;
+def Vsplatrw: OutPatFrag<(ops node:$Rs), (V6_lvsplatw $Rs)>;
+
+def Rep: OutPatFrag<(ops node:$N), (Combinev $N, $N)>;
+
+let Predicates = [UseHVX] in {
+ let AddedComplexity = 10 in {
+ def: Pat<(VecI8 (HexagonVSPLAT u8_0ImmPred:$V)), (Vsplatib $V)>;
+ def: Pat<(VecI16 (HexagonVSPLAT u16_0ImmPred:$V)), (Vsplatih $V)>;
+ def: Pat<(VecI32 (HexagonVSPLAT anyimm:$V)), (Vsplatiw $V)>;
+ def: Pat<(VecPI8 (HexagonVSPLAT u8_0ImmPred:$V)), (Rep (Vsplatib $V))>;
+ def: Pat<(VecPI16 (HexagonVSPLAT u16_0ImmPred:$V)), (Rep (Vsplatih $V))>;
+ def: Pat<(VecPI32 (HexagonVSPLAT anyimm:$V)), (Rep (Vsplatiw $V))>;
+ }
+ def: Pat<(VecI8 (HexagonVSPLAT I32:$Rs)), (Vsplatrb $Rs)>;
+ def: Pat<(VecI16 (HexagonVSPLAT I32:$Rs)), (Vsplatrh $Rs)>;
+ def: Pat<(VecI32 (HexagonVSPLAT I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecPI8 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrb $Rs))>;
+ def: Pat<(VecPI16 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrh $Rs))>;
+ def: Pat<(VecPI32 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+
+ def: Pat<(VecI8 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecI16 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecI32 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
+ def: Pat<(VecPI8 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+ def: Pat<(VecPI16 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+ def: Pat<(VecPI32 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
+}
+
+class Vneg1<ValueType VecTy>
+ : PatFrag<(ops), (VecTy (HexagonVSPLATW (i32 -1)))>;
+
+class Vnot<ValueType VecTy>
+ : PatFrag<(ops node:$Vs), (xor $Vs, Vneg1<VecTy>)>;
+
+let Predicates = [UseHVX] in {
+ let AddedComplexity = 200 in {
+ def: Pat<(Vnot<VecI8> HVI8:$Vs), (V6_vnot HvxVR:$Vs)>;
+ def: Pat<(Vnot<VecI16> HVI16:$Vs), (V6_vnot HvxVR:$Vs)>;
+ def: Pat<(Vnot<VecI32> HVI32:$Vs), (V6_vnot HvxVR:$Vs)>;
+ }
+
+ def: OpR_RR_pat<V6_vaddb, Add, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vaddh, Add, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vaddw, Add, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vaddb_dv, Add, VecPI8, HWI8>;
+ def: OpR_RR_pat<V6_vaddh_dv, Add, VecPI16, HWI16>;
+ def: OpR_RR_pat<V6_vaddw_dv, Add, VecPI32, HWI32>;
+ def: OpR_RR_pat<V6_vsubb, Sub, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vsubh, Sub, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vsubw, Sub, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vsubb_dv, Sub, VecPI8, HWI8>;
+ def: OpR_RR_pat<V6_vsubh_dv, Sub, VecPI16, HWI16>;
+ def: OpR_RR_pat<V6_vsubw_dv, Sub, VecPI32, HWI32>;
+ def: OpR_RR_pat<V6_vand, And, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vand, And, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vand, And, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vor, Or, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vor, Or, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vor, Or, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vxor, Xor, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vxor, Xor, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vxor, Xor, VecI32, HVI32>;
+
+ def: Pat<(vselect HQ8:$Qu, HVI8:$Vs, HVI8:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(vselect HQ16:$Qu, HVI16:$Vs, HVI16:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(vselect HQ32:$Qu, HVI32:$Vs, HVI32:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+
+ def: Pat<(vselect (qnot HQ8:$Qu), HVI8:$Vs, HVI8:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(vselect (qnot HQ16:$Qu), HVI16:$Vs, HVI16:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(vselect (qnot HQ32:$Qu), HVI32:$Vs, HVI32:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+}
+
+let Predicates = [UseHVX] in {
+ def: Pat<(VecPI16 (sext HVI8:$Vs)), (VSxtb $Vs)>;
+ def: Pat<(VecPI32 (sext HVI16:$Vs)), (VSxth $Vs)>;
+ def: Pat<(VecPI16 (zext HVI8:$Vs)), (VZxtb $Vs)>;
+ def: Pat<(VecPI32 (zext HVI16:$Vs)), (VZxth $Vs)>;
+
+ def: Pat<(VecI16 (sext_invec HVI8:$Vs)), (LoVec (VSxtb $Vs))>;
+ def: Pat<(VecI32 (sext_invec HVI16:$Vs)), (LoVec (VSxth $Vs))>;
+ def: Pat<(VecI32 (sext_invec HVI8:$Vs)),
+ (LoVec (VSxth (LoVec (VSxtb $Vs))))>;
+ def: Pat<(VecPI16 (sext_invec HWI8:$Vss)), (VSxtb (LoVec $Vss))>;
+ def: Pat<(VecPI32 (sext_invec HWI16:$Vss)), (VSxth (LoVec $Vss))>;
+ def: Pat<(VecPI32 (sext_invec HWI8:$Vss)),
+ (VSxth (LoVec (VSxtb (LoVec $Vss))))>;
+
+ def: Pat<(VecI16 (zext_invec HVI8:$Vs)), (LoVec (VZxtb $Vs))>;
+ def: Pat<(VecI32 (zext_invec HVI16:$Vs)), (LoVec (VZxth $Vs))>;
+ def: Pat<(VecI32 (zext_invec HVI8:$Vs)),
+ (LoVec (VZxth (LoVec (VZxtb $Vs))))>;
+ def: Pat<(VecPI16 (zext_invec HWI8:$Vss)), (VZxtb (LoVec $Vss))>;
+ def: Pat<(VecPI32 (zext_invec HWI16:$Vss)), (VZxth (LoVec $Vss))>;
+ def: Pat<(VecPI32 (zext_invec HWI8:$Vss)),
+ (VZxth (LoVec (VZxtb (LoVec $Vss))))>;
+
+ def: Pat<(VecI8 (trunc HWI16:$Vss)),
+ (V6_vpackeb (HiVec $Vss), (LoVec $Vss))>;
+ def: Pat<(VecI16 (trunc HWI32:$Vss)),
+ (V6_vpackeh (HiVec $Vss), (LoVec $Vss))>;
+
+ def: Pat<(VecQ8 (trunc HVI8:$Vs)),
+ (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>;
+ def: Pat<(VecQ16 (trunc HVI16:$Vs)),
+ (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>;
+ def: Pat<(VecQ32 (trunc HVI32:$Vs)),
+ (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>;
+}
+
+let Predicates = [UseHVX] in {
+ // The "source" types are not legal, and there are no parameterized
+ // definitions for them. Since they are length-specific, the patterns are
+ // given separately for each HVX vector length.
+ let Predicates = [UseHVX,UseHVX64B] in {
+ def: Pat<(VecI16 (sext_inreg HVI16:$Vs, v32i8)),
+ (V6_vasrh (V6_vaslh HVI16:$Vs, (A2_tfrsi 8)), (A2_tfrsi 8))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v16i8)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 24)), (A2_tfrsi 24))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v16i16)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>;
+ }
+ let Predicates = [UseHVX,UseHVX128B] in {
+ def: Pat<(VecI16 (sext_inreg HVI16:$Vs, v64i8)),
+ (V6_vasrh (V6_vaslh HVI16:$Vs, (A2_tfrsi 8)), (A2_tfrsi 8))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v32i8)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 24)), (A2_tfrsi 24))>;
+ def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v32i16)),
+ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>;
+ }
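The sext_inreg patterns above use the usual shift-pair idiom: shift the lane left until the narrow value's sign bit becomes the lane's sign bit, then arithmetic-shift back by the same amount. A one-lane editorial sketch of the 8-in-16 case:

#include <cassert>
#include <cstdint>

// Sign-extend the low 8 bits of a 16-bit lane in place: shl 8, then asr 8.
static int16_t sext8_in_16(int16_t x) {
  return static_cast<int16_t>(static_cast<int16_t>(x << 8) >> 8);
}

int main() {
  assert(sext8_in_16(0x00FF) == -1);    // 0xFF sign-extends to all ones
  assert(sext8_in_16(0x007F) == 0x7F);  // positive values are unchanged
  return 0;
}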
+
+ def: Pat<(HexagonVASL HVI8:$Vs, I32:$Rt),
+ (V6_vpackeb (V6_vaslh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt),
+ (V6_vaslh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>;
+ def: Pat<(HexagonVASR HVI8:$Vs, I32:$Rt),
+ (V6_vpackeb (V6_vasrh (HiVec (VSxtb HvxVR:$Vs)), I32:$Rt),
+ (V6_vasrh (LoVec (VSxtb HvxVR:$Vs)), I32:$Rt))>;
+ def: Pat<(HexagonVLSR HVI8:$Vs, I32:$Rt),
+ (V6_vpackeb (V6_vlsrh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt),
+ (V6_vlsrh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>;
+
+ def: Pat<(HexagonVASL HVI16:$Vs, I32:$Rt), (V6_vaslh HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVASL HVI32:$Vs, I32:$Rt), (V6_vaslw HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVASR HVI16:$Vs, I32:$Rt), (V6_vasrh HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVASR HVI32:$Vs, I32:$Rt), (V6_vasrw HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVLSR HVI16:$Vs, I32:$Rt), (V6_vlsrh HvxVR:$Vs, I32:$Rt)>;
+ def: Pat<(HexagonVLSR HVI32:$Vs, I32:$Rt), (V6_vlsrw HvxVR:$Vs, I32:$Rt)>;
+
+ def: Pat<(add HVI32:$Vx, (HexagonVASL HVI32:$Vu, I32:$Rt)),
+ (V6_vaslw_acc HvxVR:$Vx, HvxVR:$Vu, I32:$Rt)>;
+ def: Pat<(add HVI32:$Vx, (HexagonVASR HVI32:$Vu, I32:$Rt)),
+ (V6_vasrw_acc HvxVR:$Vx, HvxVR:$Vu, I32:$Rt)>;
+
+ def: Pat<(shl HVI16:$Vs, HVI16:$Vt), (V6_vaslhv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(shl HVI32:$Vs, HVI32:$Vt), (V6_vaslwv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(sra HVI16:$Vs, HVI16:$Vt), (V6_vasrhv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(sra HVI32:$Vs, HVI32:$Vt), (V6_vasrwv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(srl HVI16:$Vs, HVI16:$Vt), (V6_vlsrhv HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(srl HVI32:$Vs, HVI32:$Vt), (V6_vlsrwv HvxVR:$Vs, HvxVR:$Vt)>;
+
+ def: Pat<(VecI16 (bswap HVI16:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x01010101)))>;
+ def: Pat<(VecI32 (bswap HVI32:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x03030303)))>;
+
+ def: Pat<(VecI8 (ctpop HVI8:$Vs)),
+ (V6_vpackeb (V6_vpopcounth (HiVec (V6_vunpackub HvxVR:$Vs))),
+ (V6_vpopcounth (LoVec (V6_vunpackub HvxVR:$Vs))))>;
+ def: Pat<(VecI16 (ctpop HVI16:$Vs)), (V6_vpopcounth HvxVR:$Vs)>;
+ def: Pat<(VecI32 (ctpop HVI32:$Vs)),
+ (V6_vaddw (LoVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))),
+ (HiVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))))>;
+
+ def: Pat<(VecI8 (ctlz HVI8:$Vs)),
+ (V6_vsubb (V6_vpackeb (V6_vcl0h (HiVec (V6_vunpackub HvxVR:$Vs))),
+ (V6_vcl0h (LoVec (V6_vunpackub HvxVR:$Vs)))),
+ (V6_lvsplatw (A2_tfrsi 0x08080808)))>;
+ def: Pat<(VecI16 (ctlz HVI16:$Vs)), (V6_vcl0h HvxVR:$Vs)>;
+ def: Pat<(VecI32 (ctlz HVI32:$Vs)), (V6_vcl0w HvxVR:$Vs)>;
+}
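The ctpop and ctlz expansions above lean on two small identities: the population count of a 32-bit lane is the sum of the popcounts of its 16-bit halves, and the leading-zero count of a byte equals the leading-zero count of that byte zero-extended to 16 bits, minus 8 (which also holds for zero). A scalar editorial sketch using C++20 <bit>:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // popcount(32-bit) == popcount(low half) + popcount(high half)
  uint32_t w = 0xDEADBEEF;
  assert(std::popcount(w) ==
         std::popcount(uint16_t(w & 0xFFFF)) + std::popcount(uint16_t(w >> 16)));

  // clz(8-bit) == clz(zero-extended to 16 bits) - 8, including for 0
  for (unsigned b = 0; b < 256; ++b)
    assert(std::countl_zero(uint8_t(b)) == std::countl_zero(uint16_t(b)) - 8);
  return 0;
}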
+
+class HvxSel_pat<InstHexagon MI, PatFrag RegPred>
+ : Pat<(select I1:$Pu, RegPred:$Vs, RegPred:$Vt),
+ (MI I1:$Pu, RegPred:$Vs, RegPred:$Vt)>;
+
+let Predicates = [UseHVX] in {
+ def: HvxSel_pat<PS_vselect, HVI8>;
+ def: HvxSel_pat<PS_vselect, HVI16>;
+ def: HvxSel_pat<PS_vselect, HVI32>;
+ def: HvxSel_pat<PS_wselect, HWI8>;
+ def: HvxSel_pat<PS_wselect, HWI16>;
+ def: HvxSel_pat<PS_wselect, HWI32>;
+}
+
+let Predicates = [UseHVX] in {
+ def: Pat<(VecQ8 (qtrue)), (PS_qtrue)>;
+ def: Pat<(VecQ16 (qtrue)), (PS_qtrue)>;
+ def: Pat<(VecQ32 (qtrue)), (PS_qtrue)>;
+ def: Pat<(VecQ8 (qfalse)), (PS_qfalse)>;
+ def: Pat<(VecQ16 (qfalse)), (PS_qfalse)>;
+ def: Pat<(VecQ32 (qfalse)), (PS_qfalse)>;
+
+ def: Pat<(vnot HQ8:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(vnot HQ16:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(vnot HQ32:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(qnot HQ8:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(qnot HQ16:$Qs), (V6_pred_not HvxQR:$Qs)>;
+ def: Pat<(qnot HQ32:$Qs), (V6_pred_not HvxQR:$Qs)>;
+
+ def: OpR_RR_pat<V6_pred_and, And, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_and, And, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_and, And, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_or, Or, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_xor, Xor, VecQ32, HQ32>;
+
+ def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_and_n, Not2<And>, VecQ32, HQ32>;
+ def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ8, HQ8>;
+ def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ16, HQ16>;
+ def: OpR_RR_pat<V6_pred_or_n, Not2<Or>, VecQ32, HQ32>;
+
+ def: OpR_RR_pat<V6_veqb, seteq, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_veqh, seteq, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_veqw, seteq, VecQ32, HVI32>;
+ def: OpR_RR_pat<V6_vgtb, setgt, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_vgth, setgt, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_vgtw, setgt, VecQ32, HVI32>;
+ def: OpR_RR_pat<V6_vgtub, setugt, VecQ8, HVI8>;
+ def: OpR_RR_pat<V6_vgtuh, setugt, VecQ16, HVI16>;
+ def: OpR_RR_pat<V6_vgtuw, setugt, VecQ32, HVI32>;
+
+ def: AccRRR_pat<V6_veqb_and, And, seteq, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_veqb_or, Or, seteq, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_veqb_xor, Xor, seteq, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_veqh_and, And, seteq, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_veqh_or, Or, seteq, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_veqh_xor, Xor, seteq, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_veqw_and, And, seteq, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_veqw_or, Or, seteq, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_veqw_xor, Xor, seteq, HQ32, HVI32, HVI32>;
+
+ def: AccRRR_pat<V6_vgtb_and, And, setgt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtb_or, Or, setgt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtb_xor, Xor, setgt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgth_and, And, setgt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgth_or, Or, setgt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgth_xor, Xor, setgt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtw_and, And, setgt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtw_or, Or, setgt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtw_xor, Xor, setgt, HQ32, HVI32, HVI32>;
+
+ def: AccRRR_pat<V6_vgtub_and, And, setugt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtub_or, Or, setugt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtub_xor, Xor, setugt, HQ8, HVI8, HVI8>;
+ def: AccRRR_pat<V6_vgtuh_and, And, setugt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtuh_or, Or, setugt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtuh_xor, Xor, setugt, HQ16, HVI16, HVI16>;
+ def: AccRRR_pat<V6_vgtuw_and, And, setugt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtuw_or, Or, setugt, HQ32, HVI32, HVI32>;
+ def: AccRRR_pat<V6_vgtuw_xor, Xor, setugt, HQ32, HVI32, HVI32>;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td b/contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td
index b2d66317b66e..fd7466349ecd 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td
@@ -24,7 +24,7 @@ let PrintMethod = "printGlobalOperand" in {
let isPseudo = 1 in {
let isCodeGenOnly = 0 in
def A2_iconst : Pseudo<(outs IntRegs:$Rd32),
- (ins s27_2Imm:$Ii), "${Rd32}=iconst(#${Ii})">;
+ (ins s27_2Imm:$Ii), "${Rd32} = iconst(#${Ii})">;
def DUPLEX_Pseudo : InstHexagon<(outs),
(ins s32_0Imm:$offset), "DUPLEX", [], "", DUPLEX, TypePSEUDO>;
@@ -34,7 +34,7 @@ let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
isAsmParserOnly = 1 in
def TFRI64_V2_ext : InstHexagon<(outs DoubleRegs:$dst),
(ins s32_0Imm:$src1, s8_0Imm:$src2),
- "$dst=combine(#$src1,#$src2)", [], "",
+ "$dst = combine(#$src1,#$src2)", [], "",
A2_combineii.Itinerary, TypeALU32_2op>, OpcodeHexagon;
// HI/LO Instructions
@@ -44,7 +44,7 @@ class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp,
InstHexagon rootInst>
: InstHexagon<(outs IntRegs:$dst),
(ins u16_0Imm:$imm_value),
- "$dst"#RegHalf#"=#$imm_value", [], "",
+ "$dst"#RegHalf#" = #$imm_value", [], "",
rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
bits<5> dst;
bits<32> imm_value;
@@ -102,6 +102,13 @@ def ENDLOOP1 : Endloop<(outs), (ins b30_2Imm:$offset),
[]>;
}
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC0, LC1], Uses = [SA0, SA1, LC0, LC1] in {
+def ENDLOOP01 : Endloop<(outs), (ins b30_2Imm:$offset),
+ ":endloop01",
+ []>;
+}
+
let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
opExtendable = 0, hasSideEffects = 0 in
class LOOP_iBase<string mnemonic, InstHexagon rootInst>
@@ -316,7 +323,7 @@ def LDriw_pred : LDInst<(outs PredRegs:$dst),
// Load modifier.
let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_mod : LDInst<(outs ModRegs:$dst),
+def LDriw_ctr : LDInst<(outs CtrRegs:$dst),
(ins IntRegs:$addr, s32_0Imm:$off),
".error \"should not emit\"", []>;
@@ -399,42 +406,42 @@ let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
}
// Vector store pseudos
-let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+let Predicates = [HasV60,UseHVX], isPseudo = 1, isCodeGenOnly = 1,
mayStore = 1, accessSize = HVXVectorAccess, hasSideEffects = 0 in
class STrivv_template<RegisterClass RC, InstHexagon rootInst>
: InstHexagon<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src),
"", [], "", rootInst.Itinerary, rootInst.Type>;
def PS_vstorerw_ai: STrivv_template<HvxWR, V6_vS32b_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vstorerw_nt_ai: STrivv_template<HvxWR, V6_vS32b_nt_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vstorerwu_ai: STrivv_template<HvxWR, V6_vS32Ub_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
let isPseudo = 1, isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0 in
def PS_vstorerq_ai: Pseudo<(outs),
(ins IntRegs:$Rs, s32_0Imm:$Off, HvxQR:$Qt), "", []>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
// Vector load pseudos
-let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+let Predicates = [HasV60, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
mayLoad = 1, accessSize = HVXVectorAccess, hasSideEffects = 0 in
class LDrivv_template<RegisterClass RC, InstHexagon rootInst>
: InstHexagon<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off),
"", [], "", rootInst.Itinerary, rootInst.Type>;
def PS_vloadrw_ai: LDrivv_template<HvxWR, V6_vL32b_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vloadrw_nt_ai: LDrivv_template<HvxWR, V6_vL32b_nt_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_vloadrwu_ai: LDrivv_template<HvxWR, V6_vL32Ub_ai>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
let isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in
def PS_vloadrq_ai: Pseudo<(outs HvxQR:$Qd),
(ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
@@ -443,10 +450,20 @@ class VSELInst<dag outs, dag ins, InstHexagon rootInst>
def PS_vselect: VSELInst<(outs HvxVR:$dst),
(ins PredRegs:$src1, HvxVR:$src2, HvxVR:$src3), V6_vcmov>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
def PS_wselect: VSELInst<(outs HvxWR:$dst),
(ins PredRegs:$src1, HvxWR:$src2, HvxWR:$src3), V6_vccombine>,
- Requires<[HasV60T,UseHVX]>;
+ Requires<[HasV60,UseHVX]>;
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in {
+ def PS_qtrue: InstHexagon<(outs HvxQR:$Qd), (ins), "", [], "",
+ V6_veqw.Itinerary, TypeCVI_VA>;
+ def PS_qfalse: InstHexagon<(outs HvxQR:$Qd), (ins), "", [], "",
+ V6_vgtw.Itinerary, TypeCVI_VA>;
+ def PS_vdd0: InstHexagon<(outs HvxWR:$Vd), (ins), "", [], "",
+ V6_vsubw_dv.Itinerary, TypeCVI_VA_DV>;
+}
// Store predicate.
let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
@@ -457,8 +474,8 @@ def STriw_pred : STInst<(outs),
// Store modifier.
let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_mod : STInst<(outs),
- (ins IntRegs:$addr, s32_0Imm:$off, ModRegs:$src1),
+def STriw_ctr : STInst<(outs),
+ (ins IntRegs:$addr, s32_0Imm:$off, CtrRegs:$src1),
".error \"should not emit\"", []>;
let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
@@ -499,3 +516,46 @@ def DuplexIClassC: InstDuplex < 0xC >;
def DuplexIClassD: InstDuplex < 0xD >;
def DuplexIClassE: InstDuplex < 0xE >;
def DuplexIClassF: InstDuplex < 0xF >;
+
+// Pseudos for circular buffer instructions. These are needed in order to
+// allocate the correct pair of CSx and Mx registers.
+multiclass NewCircularLoad<RegisterClass RC, MemAccessSize MS> {
+
+let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
+ addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
+ def NAME#_pci : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_4403ca65>;
+
+ def NAME#_pcr : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_2fc0c436>;
+}
+}
+
+defm PS_loadrub : NewCircularLoad<IntRegs, ByteAccess>;
+defm PS_loadrb : NewCircularLoad<IntRegs, ByteAccess>;
+defm PS_loadruh : NewCircularLoad<IntRegs, HalfWordAccess>;
+defm PS_loadrh : NewCircularLoad<IntRegs, HalfWordAccess>;
+defm PS_loadri : NewCircularLoad<IntRegs, WordAccess>;
+defm PS_loadrd : NewCircularLoad<DoubleRegs, DoubleWordAccess>;
+
+multiclass NewCircularStore<RegisterClass RC, MemAccessSize MS> {
+
+let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
+ addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
+ def NAME#_pci : STInst<(outs IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_9fdb5406>;
+
+ def NAME#_pcr : STInst<(outs IntRegs:$Rx32),
+ (ins IntRegs:$Rx32in, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_f86c328a>;
+}
+}
+
+defm PS_storerb : NewCircularStore<IntRegs, ByteAccess>;
+defm PS_storerh : NewCircularStore<IntRegs, HalfWordAccess>;
+defm PS_storerf : NewCircularStore<IntRegs, HalfWordAccess>;
+defm PS_storeri : NewCircularStore<IntRegs, WordAccess>;
+defm PS_storerd : NewCircularStore<DoubleRegs, WordAccess>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 85d6a6b4089e..2e11f875c0f9 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -19,6 +19,7 @@
#include "HexagonTargetMachine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -145,6 +146,13 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
Reserved.set(Hexagon::R30);
Reserved.set(Hexagon::R31);
Reserved.set(Hexagon::VTMP);
+
+ // Guest registers.
+ Reserved.set(Hexagon::GELR); // G0
+ Reserved.set(Hexagon::GSR); // G1
+ Reserved.set(Hexagon::GOSP); // G2
+ Reserved.set(Hexagon::G3); // G3
+
// Control registers.
Reserved.set(Hexagon::SA0); // C0
Reserved.set(Hexagon::LC0); // C1
@@ -171,6 +179,9 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
Reserved.set(Hexagon::C8);
Reserved.set(Hexagon::USR_OVF);
+ if (MF.getSubtarget<HexagonSubtarget>().hasReservedR19())
+ Reserved.set(Hexagon::R19);
+
for (int x = Reserved.find_first(); x >= 0; x = Reserved.find_next(x))
markSuperRegs(Reserved, x);
@@ -233,6 +244,55 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
+bool HexagonRegisterInfo::shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC, unsigned SubReg,
+ const TargetRegisterClass *DstRC, unsigned DstSubReg,
+ const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+ // Coalescing will extend the live interval of the destination register.
+ // If the destination register is a vector pair, avoid introducing function
+ // calls into the interval, since it could result in spilling a pair
+ // instead of a single vector.
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+ if (!HST.useHVXOps() || NewRC->getID() != Hexagon::HvxWRRegClass.getID())
+ return true;
+ bool SmallSrc = SrcRC->getID() == Hexagon::HvxVRRegClass.getID();
+ bool SmallDst = DstRC->getID() == Hexagon::HvxVRRegClass.getID();
+ if (!SmallSrc && !SmallDst)
+ return true;
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ const SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ auto HasCall = [&Indexes] (const LiveInterval::Segment &S) {
+ for (SlotIndex I = S.start.getBaseIndex(), E = S.end.getBaseIndex();
+ I != E; I = I.getNextIndex()) {
+ if (const MachineInstr *MI = Indexes.getInstructionFromIndex(I))
+ if (MI->isCall())
+ return true;
+ }
+ return false;
+ };
+
+ if (SmallSrc == SmallDst) {
+ // Both must be true, because the case for both being false was
+ // checked earlier. Both registers will be coalesced into a register
+ // of a wider class (HvxWR), and we don't want its live range to
+ // span over calls.
+ return !any_of(LIS.getInterval(DstReg), HasCall) &&
+ !any_of(LIS.getInterval(SrcReg), HasCall);
+ }
+
+ // If one register is large (HvxWR) and the other is small (HvxVR), then
+ // coalescing is ok if the large is already live across a function call,
+ // or if the small one is not.
+ unsigned SmallReg = SmallSrc ? SrcReg : DstReg;
+ unsigned LargeReg = SmallSrc ? DstReg : SrcReg;
+ return any_of(LIS.getInterval(LargeReg), HasCall) ||
+ !any_of(LIS.getInterval(SmallReg), HasCall);
+}
+
+
unsigned HexagonRegisterInfo::getRARegister() const {
return Hexagon::R31;
}
@@ -283,6 +343,11 @@ bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF)
return MF.getSubtarget<HexagonSubtarget>().getFrameLowering()->hasFP(MF);
}
+const TargetRegisterClass *
+HexagonRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ return &Hexagon::IntRegsRegClass;
+}
unsigned HexagonRegisterInfo::getFirstCallerSavedNonParamReg() const {
return Hexagon::R6;
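
Note on the shouldCoalesce hook added above: it keeps the register coalescer from folding HVX single registers (HvxVR) into vector pairs (HvxWR) when the widened live range would newly cross a call. The following is a minimal standalone restatement of that decision for reading the patch, not LLVM code; srcSpansCall/dstSpansCall stand in for any_of(LIS.getInterval(Reg), HasCall).

#include <cassert>

// Coalescing policy for a copy that would produce an HvxWR (vector pair):
// allow it unless a pair live range would newly span a call.
bool shouldCoalesceHvx(bool srcIsSmall, bool dstIsSmall,
                       bool srcSpansCall, bool dstSpansCall) {
  if (!srcIsSmall && !dstIsSmall)            // pair into pair: nothing gets wider
    return true;
  if (srcIsSmall && dstIsSmall)              // two HvxVRs merge into a new pair:
    return !srcSpansCall && !dstSpansCall;   // neither may cross a call
  // HvxVR merged into an existing HvxWR: fine if the pair already crosses a
  // call, or if the single register does not (so the pair range is not
  // stretched over a call).
  bool pairSpansCall  = srcIsSmall ? dstSpansCall : srcSpansCall;
  bool smallSpansCall = srcIsSmall ? srcSpansCall : dstSpansCall;
  return pairSpansCall || !smallSpansCall;
}

int main() {
  assert(shouldCoalesceHvx(true, true, false, false));  // no calls involved: ok
  assert(!shouldCoalesceHvx(true, true, true, false));  // new pair would span a call
  assert(shouldCoalesceHvx(true, false, true, true));   // pair already spans one: ok
}
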
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
index 4ead57da8fa1..497dc45236b1 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -39,6 +39,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned FIOperandNum, RegScavenger *RS = nullptr) const override;
@@ -61,6 +63,10 @@ public:
return true;
}
+ bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC,
+ unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg,
+ const TargetRegisterClass *NewRC, LiveIntervals &LIS) const override;
+
// Debug information queries.
unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const override;
@@ -75,6 +81,10 @@ public:
unsigned getFirstCallerSavedNonParamReg() const;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
bool isEHReturnCalleeSaveReg(unsigned Reg) const;
};
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
index 1d1e85e7ac7e..1fe1ef4ac572 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -67,6 +67,17 @@ let Namespace = "Hexagon" in {
let HWEncoding{0} = num;
}
+ // Rg - Guest/Hypervisor registers
+ class Rg<bits<5> num, string n,
+ list<string> alt = [], list<Register> alias = []> :
+ HexagonReg<num, n, alt, alias>;
+
+ // Rgg - 64-bit Guest/Hypervisor registers
+ class Rgg<bits<5> num, string n, list<Register> subregs> :
+ HexagonDoubleReg<num, n, subregs> {
+ let SubRegs = subregs;
+ }
+
def isub_lo : SubRegIndex<32>;
def isub_hi : SubRegIndex<32, 32>;
def vsub_lo : SubRegIndex<512>;
@@ -200,40 +211,87 @@ let Namespace = "Hexagon" in {
def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>;
def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>;
def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>;
+
+ // Guest Registers
+ def GELR: Rg<0, "gelr", ["g0"]>, DwarfRegNum<[220]>;
+ def GSR: Rg<1, "gsr", ["g1"]>, DwarfRegNum<[221]>;
+ def GOSP: Rg<2, "gosp", ["g2"]>, DwarfRegNum<[222]>;
+ def G3: Rg<3, "gbadva", ["g3"]>, DwarfRegNum<[223]>;
+ def G4: Rg<4, "g4">, DwarfRegNum<[224]>;
+ def G5: Rg<5, "g5">, DwarfRegNum<[225]>;
+ def G6: Rg<6, "g6">, DwarfRegNum<[226]>;
+ def G7: Rg<7, "g7">, DwarfRegNum<[227]>;
+ def G8: Rg<8, "g8">, DwarfRegNum<[228]>;
+ def G9: Rg<9, "g9">, DwarfRegNum<[229]>;
+ def G10: Rg<10, "g10">, DwarfRegNum<[230]>;
+ def G11: Rg<11, "g11">, DwarfRegNum<[231]>;
+ def G12: Rg<12, "g12">, DwarfRegNum<[232]>;
+ def G13: Rg<13, "g13">, DwarfRegNum<[233]>;
+ def G14: Rg<14, "g14">, DwarfRegNum<[234]>;
+ def G15: Rg<15, "g15">, DwarfRegNum<[235]>;
+ def GPMUCNT4: Rg<16, "gpmucnt4", ["g16"]>, DwarfRegNum<[236]>;
+ def GPMUCNT5: Rg<17, "gpmucnt5", ["g17"]>, DwarfRegNum<[237]>;
+ def GPMUCNT6: Rg<18, "gpmucnt6", ["g18"]>, DwarfRegNum<[238]>;
+ def GPMUCNT7: Rg<19, "gpmucnt7", ["g19"]>, DwarfRegNum<[239]>;
+ def G20: Rg<20, "g20">, DwarfRegNum<[240]>;
+ def G21: Rg<21, "g21">, DwarfRegNum<[241]>;
+ def G22: Rg<22, "g22">, DwarfRegNum<[242]>;
+ def G23: Rg<23, "g23">, DwarfRegNum<[243]>;
+ def GPCYCLELO: Rg<24, "gpcyclelo", ["g24"]>, DwarfRegNum<[244]>;
+ def GPCYCLEHI: Rg<25, "gpcyclehi", ["g25"]>, DwarfRegNum<[245]>;
+ def GPMUCNT0: Rg<26, "gpmucnt0", ["g26"]>, DwarfRegNum<[246]>;
+ def GPMUCNT1: Rg<27, "gpmucnt1", ["g27"]>, DwarfRegNum<[247]>;
+ def GPMUCNT2: Rg<28, "gpmucnt2", ["g28"]>, DwarfRegNum<[248]>;
+ def GPMUCNT3: Rg<29, "gpmucnt3", ["g29"]>, DwarfRegNum<[249]>;
+ def G30: Rg<30, "g30">, DwarfRegNum<[250]>;
+ def G31: Rg<31, "g31">, DwarfRegNum<[251]>;
+
+ // Guest Register Pairs
+ let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
+ def G1_0 : Rgg<0, "g1:0", [GELR, GSR]>, DwarfRegNum<[220]>;
+ def G3_2 : Rgg<2, "g3:2", [GOSP, G3]>, DwarfRegNum<[222]>;
+ def G5_4 : Rgg<4, "g5:4", [G4, G5]>, DwarfRegNum<[224]>;
+ def G7_6 : Rgg<6, "g7:6", [G6, G7]>, DwarfRegNum<[226]>;
+ def G9_8 : Rgg<8, "g9:8", [G8, G9]>, DwarfRegNum<[228]>;
+ def G11_10 : Rgg<10, "g11:10", [G10, G11]>, DwarfRegNum<[230]>;
+ def G13_12 : Rgg<12, "g13:12", [G12, G13]>, DwarfRegNum<[232]>;
+ def G15_14 : Rgg<14, "g15:14", [G14, G15]>, DwarfRegNum<[234]>;
+ def G17_16 : Rgg<16, "g17:16", [GPMUCNT4, GPMUCNT5]>, DwarfRegNum<[236]>;
+ def G19_18 : Rgg<18, "g19:18", [GPMUCNT6, GPMUCNT7]>, DwarfRegNum<[238]>;
+ def G21_20 : Rgg<20, "g21:20", [G20, G21]>, DwarfRegNum<[240]>;
+ def G23_22 : Rgg<22, "g23:22", [G22, G23]>, DwarfRegNum<[242]>;
+ def G25_24 : Rgg<24, "g25:24", [GPCYCLELO, GPCYCLEHI]>, DwarfRegNum<[244]>;
+ def G27_26 : Rgg<26, "g27:26", [GPMUCNT0, GPMUCNT1]>, DwarfRegNum<[246]>;
+ def G29_28 : Rgg<28, "g29:28", [GPMUCNT2, GPMUCNT3]>, DwarfRegNum<[248]>;
+ def G31_30 : Rgg<30, "g31:30", [G30, G31]>, DwarfRegNum<[250]>;
+ }
+
}
// HVX types
-def VecI1
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v512i1, v512i1, v1024i1, v1024i1, v512i1]>;
-def VecI8
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v64i8, v64i8, v128i8, v128i8, v64i8]>;
-def VecI16
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v32i16, v32i16, v64i16, v64i16, v32i16]>;
-def VecI32
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v16i32, v16i32, v32i32, v32i32, v16i32]>;
-def VecPI8
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v128i8, v128i8, v256i8, v256i8, v128i8]>;
-def VecPI16
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v64i16, v64i16, v128i16, v128i16, v64i16]>;
-def VecPI32
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v32i32, v32i32, v64i32, v64i32, v32i32]>;
-def VecQ8
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v64i1, v64i1, v128i1, v128i1, v64i1]>;
-def VecQ16
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v32i1, v32i1, v64i1, v64i1, v32i1]>;
-def VecQ32
- : ValueTypeByHwMode<[Hvx64, Hvx64old, Hvx128, Hvx128old, DefaultMode],
- [v16i1, v16i1, v32i1, v32i1, v16i1]>;
+def VecI1: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v512i1, v1024i1, v512i1]>;
+def VecI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v64i8, v128i8, v64i8]>;
+def VecI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32i16, v64i16, v32i16]>;
+def VecI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v16i32, v32i32, v16i32]>;
+
+def VecPI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v128i8, v256i8, v128i8]>;
+def VecPI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v64i16, v128i16, v64i16]>;
+def VecPI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32i32, v64i32, v32i32]>;
+
+def VecQ8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v64i1, v128i1, v64i1]>;
+def VecQ16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32i1, v64i1, v32i1]>;
+def VecQ32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v16i1, v32i1, v16i1]>;
// HVX register classes
@@ -242,7 +300,7 @@ def VecQ32
// FIXME: the register order should be defined in terms of the preferred
// allocation order...
//
-def IntRegs : RegisterClass<"Hexagon", [i32, f32, v32i1, v4i8, v2i16], 32,
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
(add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28),
R10, R11, R29, R30, R31)>;
@@ -254,8 +312,7 @@ def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
(add R7, R6, R5, R4, R3, R2, R1, R0)> ;
-def DoubleRegs : RegisterClass<"Hexagon",
- [i64, f64, v64i1, v8i8, v4i16, v2i32], 64,
+def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
(add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>;
def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
@@ -301,6 +358,25 @@ def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
(add C1_0, C3_2, C5_4, C7_6, C9_8, C11_10, CS, UPCYCLE, C17_16,
PKTCOUNT, UTIMER)>;
+let Size = 32, isAllocatable = 0 in
+def GuestRegs : RegisterClass<"Hexagon", [i32], 32,
+ (add GELR, GSR, GOSP,
+ (sequence "G%u", 3, 15),
+ GPMUCNT4, GPMUCNT5, GPMUCNT6, GPMUCNT7,
+ G20, G21, G22, G23,
+ GPCYCLELO, GPCYCLEHI, GPMUCNT0, GPMUCNT1,
+ GPMUCNT2, GPMUCNT3,
+ G30, G31)>;
+
+let Size = 64, isAllocatable = 0 in
+def GuestRegs64 : RegisterClass<"Hexagon", [i64], 64,
+ (add G1_0, G3_2,
+ G5_4, G7_6, G9_8, G11_10, G13_12, G15_14,
+ G17_16, G19_18,
+ G21_20, G23_22,
+ G25_24, G27_26, G29_28,
+ G31_30)>;
+
// These registers are new for v62 and onward.
// The function RegisterMatchesArch() uses this list for validation.
let isAllocatable = 0 in
@@ -313,7 +389,6 @@ let Size = 32, isAllocatable = 0 in
def V65Regs : RegisterClass<"Hexagon", [i32], 32, (add VTMP)>;
-
def HexagonCSR
: CalleeSavedRegs<(add R16, R17, R18, R19, R20, R21, R22, R23,
R24, R25, R26, R27)>;
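
Reading the DwarfRegNum assignments above: each 32-bit guest register Gn (or its named alias) gets DWARF number 220 + n, and each 64-bit pair reuses the number of its even, low half. A quick arithmetic check of that pattern (a plain illustration, not part of the TableGen):

#include <cstdio>

int main() {
  for (int n = 0; n < 32; n += 2)
    std::printf("g%-2d -> %d   g%-2d -> %d   pair g%d:%d -> %d\n",
                n, 220 + n, n + 1, 221 + n, n + 1, n, 220 + n);
}
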
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 3fe4cc73d2f3..c41f0d3c085c 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -60,14 +60,14 @@ INITIALIZE_PASS(HexagonSplitConst32AndConst64, "split-const-for-sdata",
"Hexagon Split Const32s and Const64s", false, false)
bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
- const HexagonTargetObjectFile &TLOF =
- *static_cast<const HexagonTargetObjectFile *>(
- Fn.getTarget().getObjFileLowering());
- if (TLOF.isSmallDataEnabled())
- return true;
+ auto &HST = Fn.getSubtarget<HexagonSubtarget>();
+ auto &HTM = static_cast<const HexagonTargetMachine&>(Fn.getTarget());
+ auto &TLOF = *HTM.getObjFileLowering();
+ if (HST.useSmallData() && TLOF.isSmallDataEnabled())
+ return false;
- const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ const TargetInstrInfo *TII = HST.getInstrInfo();
+ const TargetRegisterInfo *TRI = HST.getRegisterInfo();
// Loop over all of the basic blocks
for (MachineBasicBlock &B : Fn) {
@@ -109,7 +109,6 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-
FunctionPass *llvm::createHexagonSplitConst32AndConst64() {
return new HexagonSplitConst32AndConst64();
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
index c9f5400018e8..e018785f24d8 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -55,6 +56,8 @@ static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1),
cl::desc("Maximum number of split partitions"));
static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true),
cl::desc("Do not split loads or stores"));
+ static cl::opt<bool> SplitAll("hsdr-split-all", cl::Hidden, cl::init(false),
+ cl::desc("Split all partitions"));
namespace {
@@ -62,9 +65,7 @@ namespace {
public:
static char ID;
- HexagonSplitDoubleRegs() : MachineFunctionPass(ID) {
- initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry());
- }
+ HexagonSplitDoubleRegs() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon Split Double Registers";
@@ -97,6 +98,7 @@ namespace {
bool isFixedInstr(const MachineInstr *MI) const;
void partitionRegisters(UUSetMap &P2Rs);
int32_t profit(const MachineInstr *MI) const;
+ int32_t profit(unsigned Reg) const;
bool isProfitable(const USet &Part, LoopRegMap &IRM) const;
void collectIndRegsForLoop(const MachineLoop *L, USet &Rs);
@@ -161,7 +163,7 @@ bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const {
if (MI->mayLoad() || MI->mayStore())
if (MemRefsFixed || isVolatileInstr(MI))
return true;
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return false;
unsigned Opc = MI->getOpcode();
@@ -244,7 +246,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
if (FixedRegs[x])
continue;
unsigned R = TargetRegisterInfo::index2VirtReg(x);
- DEBUG(dbgs() << printReg(R, TRI) << " ~~");
+ LLVM_DEBUG(dbgs() << printReg(R, TRI) << " ~~");
USet &Asc = AssocMap[R];
for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end();
U != Z; ++U) {
@@ -267,13 +269,13 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
unsigned u = TargetRegisterInfo::virtReg2Index(T);
if (FixedRegs[u])
continue;
- DEBUG(dbgs() << ' ' << printReg(T, TRI));
+ LLVM_DEBUG(dbgs() << ' ' << printReg(T, TRI));
Asc.insert(T);
// Make it symmetric.
AssocMap[T].insert(R);
}
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
}
UUMap R2P;
@@ -304,15 +306,10 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
P2Rs[I.second].insert(I.first);
}
-static inline int32_t profitImm(unsigned Lo, unsigned Hi) {
+static inline int32_t profitImm(unsigned Imm) {
int32_t P = 0;
- bool LoZ1 = false, HiZ1 = false;
- if (Lo == 0 || Lo == 0xFFFFFFFF)
- P += 10, LoZ1 = true;
- if (Hi == 0 || Hi == 0xFFFFFFFF)
- P += 10, HiZ1 = true;
- if (!LoZ1 && !HiZ1 && Lo == Hi)
- P += 3;
+ if (Imm == 0 || Imm == 0xFFFFFFFF)
+ P += 10;
return P;
}
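
As a worked example of the rewritten scoring: each 32-bit half of a 64-bit immediate is now rated on its own, earning 10 points when it is 0 or all-ones, and the caller sums the two halves. A sketch restating just that arithmetic:

#include <cassert>
#include <cstdint>

static int profitImmHalf(uint32_t Imm) {      // per-half score, as in the hunk above
  return (Imm == 0 || Imm == 0xFFFFFFFF) ? 10 : 0;
}

int main() {
  uint64_t D = 0x00000000FFFFFFFFULL;         // e.g. a CONST64 of 0xffffffff
  uint32_t Lo = uint32_t(D), Hi = uint32_t(D >> 32);
  assert(profitImmHalf(Lo) + profitImmHalf(Hi) == 20);  // both halves trivial
  assert(profitImmHalf(0x12345678u) == 0);              // arbitrary half: no bonus
}
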
@@ -342,21 +339,28 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
uint64_t D = MI->getOperand(1).getImm();
unsigned Lo = D & 0xFFFFFFFFULL;
unsigned Hi = D >> 32;
- return profitImm(Lo, Hi);
+ return profitImm(Lo) + profitImm(Hi);
}
case Hexagon::A2_combineii:
- case Hexagon::A4_combineii:
- return profitImm(MI->getOperand(1).getImm(),
- MI->getOperand(2).getImm());
+ case Hexagon::A4_combineii: {
+ const MachineOperand &Op1 = MI->getOperand(1);
+ const MachineOperand &Op2 = MI->getOperand(2);
+ int32_t Prof1 = Op1.isImm() ? profitImm(Op1.getImm()) : 0;
+ int32_t Prof2 = Op2.isImm() ? profitImm(Op2.getImm()) : 0;
+ return Prof1 + Prof2;
+ }
case Hexagon::A4_combineri:
ImmX++;
// Fall through into A4_combineir.
LLVM_FALLTHROUGH;
case Hexagon::A4_combineir: {
ImmX++;
- int64_t V = MI->getOperand(ImmX).getImm();
- if (V == 0 || V == -1)
- return 10;
+ const MachineOperand &OpX = MI->getOperand(ImmX);
+ if (OpX.isImm()) {
+ int64_t V = OpX.getImm();
+ if (V == 0 || V == -1)
+ return 10;
+ }
// Fall through into A2_combinew.
LLVM_FALLTHROUGH;
}
@@ -368,8 +372,11 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
case Hexagon::A2_andp:
case Hexagon::A2_orp:
- case Hexagon::A2_xorp:
- return 1;
+ case Hexagon::A2_xorp: {
+ unsigned Rs = MI->getOperand(1).getReg();
+ unsigned Rt = MI->getOperand(2).getReg();
+ return profit(Rs) + profit(Rt);
+ }
case Hexagon::S2_asl_i_p_or: {
unsigned S = MI->getOperand(3).getImm();
@@ -393,6 +400,25 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
return 0;
}
+int32_t HexagonSplitDoubleRegs::profit(unsigned Reg) const {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+
+ const MachineInstr *DefI = MRI->getVRegDef(Reg);
+ switch (DefI->getOpcode()) {
+ case Hexagon::A2_tfrpi:
+ case Hexagon::CONST64:
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ case Hexagon::A4_combineri:
+ case Hexagon::A4_combineir:
+ case Hexagon::A2_combinew:
+ return profit(DefI);
+ default:
+ break;
+ }
+ return 0;
+}
+
bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
const {
unsigned FixedNum = 0, LoopPhiNum = 0;
@@ -442,7 +468,9 @@ bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
if (FixedNum > 0 && LoopPhiNum > 0)
TotalP -= 20*LoopPhiNum;
- DEBUG(dbgs() << "Partition profit: " << TotalP << '\n');
+ LLVM_DEBUG(dbgs() << "Partition profit: " << TotalP << '\n');
+ if (SplitAll)
+ return true;
return TotalP > 0;
}
@@ -535,7 +563,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
Rs.insert(CmpR1);
Rs.insert(CmpR2);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "For loop at " << printMBBReference(*HB) << " ind regs: ";
dump_partition(dbgs(), Rs, *TRI);
dbgs() << '\n';
@@ -710,23 +738,21 @@ void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI,
assert(F != PairMap.end());
const UUPair &P = F->second;
- if (Op1.isImm()) {
+ if (!Op1.isReg()) {
BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second)
- .addImm(Op1.getImm());
- } else if (Op1.isReg()) {
+ .add(Op1);
+ } else {
BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.second)
.addReg(Op1.getReg(), getRegState(Op1), Op1.getSubReg());
- } else
- llvm_unreachable("Unexpected operand");
+ }
- if (Op2.isImm()) {
+ if (!Op2.isReg()) {
BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first)
- .addImm(Op2.getImm());
- } else if (Op2.isReg()) {
+ .add(Op2);
+ } else {
BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first)
.addReg(Op2.getReg(), getRegState(Op2), Op2.getSubReg());
- } else
- llvm_unreachable("Unexpected operand");
+ }
}
void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI,
@@ -970,7 +996,7 @@ bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI,
const UUPairMap &PairMap) {
using namespace Hexagon;
- DEBUG(dbgs() << "Splitting: " << *MI);
+ LLVM_DEBUG(dbgs() << "Splitting: " << *MI);
bool Split = false;
unsigned Opc = MI->getOpcode();
@@ -1104,8 +1130,8 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
bool Changed = false;
- DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI);
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Splitting partition: ";
+ dump_partition(dbgs(), Part, *TRI); dbgs() << '\n');
UUPairMap PairMap;
@@ -1122,8 +1148,9 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
unsigned LoR = MRI->createVirtualRegister(IntRC);
unsigned HiR = MRI->createVirtualRegister(IntRC);
- DEBUG(dbgs() << "Created mapping: " << printReg(DR, TRI) << " -> "
- << printReg(HiR, TRI) << ':' << printReg(LoR, TRI) << '\n');
+ LLVM_DEBUG(dbgs() << "Created mapping: " << printReg(DR, TRI) << " -> "
+ << printReg(HiR, TRI) << ':' << printReg(LoR, TRI)
+ << '\n');
PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR)));
}
@@ -1160,12 +1187,12 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
}
bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "Splitting double registers in function: "
- << MF.getName() << '\n');
-
if (skipFunction(MF.getFunction()))
return false;
+ LLVM_DEBUG(dbgs() << "Splitting double registers in function: "
+ << MF.getName() << '\n');
+
auto &ST = MF.getSubtarget<HexagonSubtarget>();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
@@ -1178,7 +1205,7 @@ bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
collectIndRegs(IRM);
partitionRegisters(P2Rs);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Register partitioning: (partition #0 is fixed)\n";
for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) {
dbgs() << '#' << I->first << " -> ";
@@ -1196,7 +1223,8 @@ bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
if (Limit >= 0 && Counter >= Limit)
break;
USet &Part = I->second;
- DEBUG(dbgs() << "Calculating profit for partition #" << I->first << '\n');
+ LLVM_DEBUG(dbgs() << "Calculating profit for partition #" << I->first
+ << '\n');
if (!isProfitable(Part, IRM))
continue;
Counter++;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
index 300f6de33552..991af047387e 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -433,10 +433,11 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
const MCInstrDesc &StD = TII->get(WOpc);
MachineOperand &MR = FirstSt->getOperand(0);
int64_t Off = FirstSt->getOperand(1).getImm();
- MachineInstr *StI = BuildMI(*MF, DL, StD)
- .addReg(MR.getReg(), getKillRegState(MR.isKill()))
- .addImm(Off)
- .addImm(Val);
+ MachineInstr *StI =
+ BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()), MR.getSubReg())
+ .addImm(Off)
+ .addImm(Val);
StI->addMemOperand(*MF, NewM);
NG.push_back(StI);
} else {
@@ -455,10 +456,11 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
const MCInstrDesc &StD = TII->get(WOpc);
MachineOperand &MR = FirstSt->getOperand(0);
int64_t Off = FirstSt->getOperand(1).getImm();
- MachineInstr *StI = BuildMI(*MF, DL, StD)
- .addReg(MR.getReg(), getKillRegState(MR.isKill()))
- .addImm(Off)
- .addReg(VReg, RegState::Kill);
+ MachineInstr *StI =
+ BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()), MR.getSubReg())
+ .addImm(Off)
+ .addReg(VReg, RegState::Kill);
StI->addMemOperand(*MF, NewM);
NG.push_back(StI);
}
@@ -472,7 +474,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
// from OG was (in the order in which they appeared in the basic block).
// (The ordering in OG does not have to match the order in the basic block.)
bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Replacing:\n";
for (auto I : OG)
dbgs() << " " << *I;
@@ -576,7 +578,7 @@ bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
};
for (auto &G : SGs) {
assert(G.size() > 1 && "Store group with fewer than 2 elements");
- std::sort(G.begin(), G.end(), Less);
+ llvm::sort(G.begin(), G.end(), Less);
Changed |= processStoreGroup(G);
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 6f1f6c46a107..0686d6eb6118 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -15,13 +15,14 @@
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/CommandLine.h"
@@ -38,17 +39,6 @@ using namespace llvm;
#define GET_SUBTARGETINFO_TARGET_DESC
#include "HexagonGenSubtargetInfo.inc"
-static cl::opt<bool> EnableMemOps("enable-hexagon-memops",
- cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
- cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
-
-static cl::opt<bool> DisableMemOps("disable-hexagon-memops",
- cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
- cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target"));
-
-static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Generate non-chopped conversion from fp to int."));
static cl::opt<bool> EnableBSBSched("enable-bsb-sched",
cl::Hidden, cl::ZeroOrMore, cl::init(true));
@@ -60,9 +50,6 @@ static cl::opt<bool> EnableDotCurSched("enable-cur-sched",
cl::Hidden, cl::ZeroOrMore, cl::init(true),
cl::desc("Enable the scheduler to generate .cur"));
-static cl::opt<bool> EnableVecFrwdSched("enable-evec-frwd-sched",
- cl::Hidden, cl::ZeroOrMore, cl::init(true));
-
static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Hexagon MI Scheduling"));
@@ -105,6 +92,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
HexagonSubtarget &
HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
static std::map<StringRef, Hexagon::ArchEnum> CpuTable{
+ {"generic", Hexagon::ArchEnum::V60},
{"hexagonv4", Hexagon::ArchEnum::V4},
{"hexagonv5", Hexagon::ArchEnum::V5},
{"hexagonv55", Hexagon::ArchEnum::V55},
@@ -123,9 +111,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
UseHVX64BOps = false;
UseLongCalls = false;
- UseMemOps = DisableMemOps ? false : EnableMemOps;
- ModeIEEERndNear = EnableIEEERndNear;
- UseBSBScheduling = hasV60TOps() && EnableBSBSched;
+ UseBSBScheduling = hasV60Ops() && EnableBSBSched;
ParseSubtargetFeatures(CPUString, FS);
@@ -204,11 +190,14 @@ bool HexagonSubtarget::CallMutation::shouldTFRICallBind(
Type == HexagonII::TypeALU64 || Type == HexagonII::TypeM;
}
-void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAG) {
+void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
SUnit* LastSequentialCall = nullptr;
- unsigned VRegHoldingRet = 0;
- unsigned RetRegister;
- SUnit* LastUseOfRet = nullptr;
+ // Map from virtual register to physical register from the copy.
+ DenseMap<unsigned, unsigned> VRegHoldingReg;
+  // Map from the physical register to the instruction that uses the virtual
+ // register. This is used to create the barrier edge.
+ DenseMap<unsigned, SUnit *> LastVRegUse;
auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo();
auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
@@ -220,13 +209,15 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAG) {
LastSequentialCall = &DAG->SUnits[su];
// Look for a compare that defines a predicate.
else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall)
- DAG->SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+ DAG->addEdge(&DAG->SUnits[su], SDep(LastSequentialCall, SDep::Barrier));
// Look for call and tfri* instructions.
else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 &&
shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1]))
- DAG->SUnits[su].addPred(SDep(&DAG->SUnits[su-1], SDep::Barrier));
- // Prevent redundant register copies between two calls, which are caused by
- // both the return value and the argument for the next call being in %r0.
+ DAG->addEdge(&DAG->SUnits[su], SDep(&DAG->SUnits[su-1], SDep::Barrier));
+ // Prevent redundant register copies due to reads and writes of physical
+ // registers. The original motivation for this was the code generated
+  // between two calls, which is caused by both the return value and the
+ // argument for the next call being in %r0.
// Example:
// 1: <call1>
// 2: %vreg = COPY %r0
@@ -235,21 +226,37 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAG) {
// 5: <call2>
// The scheduler would often swap 3 and 4, so an additional register is
// needed. This code inserts a Barrier dependence between 3 & 4 to prevent
- // this. The same applies for %d0 and %v0/%w0, which are also handled.
+ // this.
+ // The code below checks for all the physical registers, not just R0/D0/V0.
else if (SchedRetvalOptimization) {
const MachineInstr *MI = DAG->SUnits[su].getInstr();
- if (MI->isCopy() && (MI->readsRegister(Hexagon::R0, &TRI) ||
- MI->readsRegister(Hexagon::V0, &TRI))) {
- // %vreg = COPY %r0
- VRegHoldingRet = MI->getOperand(0).getReg();
- RetRegister = MI->getOperand(1).getReg();
- LastUseOfRet = nullptr;
- } else if (VRegHoldingRet && MI->readsVirtualRegister(VRegHoldingRet))
- // <use of %X>
- LastUseOfRet = &DAG->SUnits[su];
- else if (LastUseOfRet && MI->definesRegister(RetRegister, &TRI))
- // %r0 = ...
- DAG->SUnits[su].addPred(SDep(LastUseOfRet, SDep::Barrier));
+ if (MI->isCopy() &&
+ TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
+ // %vregX = COPY %r0
+ VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg();
+ LastVRegUse.erase(MI->getOperand(1).getReg());
+ } else {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse() && !MI->isCopy() &&
+ VRegHoldingReg.count(MO.getReg())) {
+ // <use of %vregX>
+ LastVRegUse[VRegHoldingReg[MO.getReg()]] = &DAG->SUnits[su];
+ } else if (MO.isDef() &&
+ TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ for (MCRegAliasIterator AI(MO.getReg(), &TRI, true); AI.isValid();
+ ++AI) {
+ if (LastVRegUse.count(*AI) &&
+ LastVRegUse[*AI] != &DAG->SUnits[su])
+ // %r0 = ...
+ DAG->addEdge(&DAG->SUnits[su], SDep(LastVRegUse[*AI], SDep::Barrier));
+ LastVRegUse.erase(*AI);
+ }
+ }
+ }
+ }
}
}
}
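
The maps introduced in the mutation above generalize the old single-register tracking: VRegHoldingReg remembers which virtual register was copied out of a physical register, and LastVRegUse remembers the last reader of that value, so a later redefinition of the physical register can be ordered after it. A toy model of that bookkeeping (integers stand in for registers and SUnit indices; not LLVM API):

#include <cassert>
#include <map>

int main() {
  std::map<int, int> vregHoldingReg;   // vreg -> physreg it was copied from
  std::map<int, int> lastVRegUse;      // physreg -> index of last reader of that vreg

  // index 2: %v8 = COPY r0
  vregHoldingReg[8] = 0;
  lastVRegUse.erase(0);
  // index 3: <use of %v8>
  lastVRegUse[vregHoldingReg[8]] = 3;
  // index 4: r0 = COPY %v9 -- r0 is redefined while index 3 still reads the old
  // value, so the mutation adds a barrier edge making index 4 depend on index 3.
  assert(lastVRegUse.count(0) && lastVRegUse[0] == 3);
}
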
@@ -300,7 +307,7 @@ void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) {
}
}
-/// \brief Enable use of alias analysis during code generation (during MI
+/// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
bool HexagonSubtarget::useAA() const {
if (OptLevel != CodeGenOpt::None)
@@ -308,7 +315,7 @@ bool HexagonSubtarget::useAA() const {
return false;
}
-/// \brief Perform target specific adjustments to the latency of a schedule
+/// Perform target specific adjustments to the latency of a schedule
/// dependency.
void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
SDep &Dep) const {
@@ -328,25 +335,30 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
return;
}
- if (!hasV60TOps())
+ if (!hasV60Ops())
return;
- // If it's a REG_SEQUENCE, use its destination instruction to determine
+  // Set the latency for a copy to zero since we hope that it will get removed.
+ if (DstInst->isCopy())
+ Dep.setLatency(0);
+
+ // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
// the correct latency.
- if (DstInst->isRegSequence() && Dst->NumSuccs == 1) {
- unsigned RSeqReg = DstInst->getOperand(0).getReg();
- MachineInstr *RSeqDst = Dst->Succs[0].getSUnit()->getInstr();
+ if ((DstInst->isRegSequence() || DstInst->isCopy()) && Dst->NumSuccs == 1) {
+ unsigned DReg = DstInst->getOperand(0).getReg();
+ MachineInstr *DDst = Dst->Succs[0].getSUnit()->getInstr();
unsigned UseIdx = -1;
- for (unsigned OpNum = 0; OpNum < RSeqDst->getNumOperands(); OpNum++) {
- const MachineOperand &MO = RSeqDst->getOperand(OpNum);
- if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == RSeqReg) {
+ for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) {
+ const MachineOperand &MO = DDst->getOperand(OpNum);
+ if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) {
UseIdx = OpNum;
break;
}
}
- unsigned RSeqLatency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst,
- 0, *RSeqDst, UseIdx));
- Dep.setLatency(RSeqLatency);
+ int DLatency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst,
+ 0, *DDst, UseIdx));
+ DLatency = std::max(DLatency, 0);
+ Dep.setLatency((unsigned)DLatency);
}
// Try to schedule uses near definitions to generate .cur.
@@ -394,7 +406,7 @@ void HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
return;
}
- if (!hasV60TOps())
+ if (!hasV60Ops())
return;
auto &QII = static_cast<const HexagonInstrInfo&>(*getInstrInfo());
@@ -418,6 +430,7 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
}
assert(DefIdx >= 0 && "Def Reg not found in Src MI");
MachineInstr *DstI = Dst->getInstr();
+ SDep T = I;
for (unsigned OpNum = 0; OpNum < DstI->getNumOperands(); OpNum++) {
const MachineOperand &MO = DstI->getOperand(OpNum);
if (MO.isReg() && MO.isUse() && MO.getReg() == DepR) {
@@ -426,8 +439,7 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
// For some instructions (ex: COPY), we might end up with < 0 latency
// as they don't have any Itinerary class associated with them.
- if (Latency <= 0)
- Latency = 1;
+ Latency = std::max(Latency, 0);
I.setLatency(Latency);
updateLatency(*SrcI, *DstI, I);
@@ -435,11 +447,10 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
}
// Update the latency of opposite edge too.
- for (auto &J : Dst->Preds) {
- if (J.getSUnit() != Src)
- continue;
- J.setLatency(I.getLatency());
- }
+ T.setSUnit(Src);
+ auto F = std::find(Dst->Preds.begin(), Dst->Preds.end(), T);
+ assert(F != Dst->Preds.end());
+ F->setLatency(I.getLatency());
}
}
@@ -447,7 +458,7 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
const {
for (auto &I : Src->Succs) {
- if (I.getSUnit() != Dst)
+ if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
continue;
SDep T = I;
I.setLatency(Lat);
@@ -456,7 +467,7 @@ void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
T.setSUnit(Src);
auto F = std::find(Dst->Preds.begin(), Dst->Preds.end(), T);
assert(F != Dst->Preds.end());
- F->setLatency(I.getLatency());
+ F->setLatency(Lat);
}
}
@@ -519,13 +530,13 @@ bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
// Reassign the latency for the previous bests, which requires setting
// the dependence edge in both directions.
if (SrcBest != nullptr) {
- if (!hasV60TOps())
+ if (!hasV60Ops())
changeLatency(SrcBest, Dst, 1);
else
restoreLatency(SrcBest, Dst);
}
if (DstBest != nullptr) {
- if (!hasV60TOps())
+ if (!hasV60Ops())
changeLatency(Src, DstBest, 1);
else
restoreLatency(Src, DstBest);
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index af93f20d97fc..dc8d173a5057 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -32,9 +32,6 @@
#define GET_SUBTARGETINFO_HEADER
#include "HexagonGenSubtargetInfo.inc"
-#define Hexagon_SMALL_DATA_THRESHOLD 8
-#define Hexagon_SLOTS 4
-
namespace llvm {
class MachineInstr;
@@ -46,12 +43,20 @@ class Triple;
class HexagonSubtarget : public HexagonGenSubtargetInfo {
virtual void anchor();
- bool UseMemOps, UseHVX64BOps, UseHVX128BOps;
- bool UseLongCalls;
- bool ModeIEEERndNear;
+ bool UseHVX64BOps = false;
+ bool UseHVX128BOps = false;
+
+ bool UseLongCalls = false;
+ bool UseMemops = false;
+ bool UsePackets = false;
+ bool UseNewValueJumps = false;
+ bool UseNewValueStores = false;
+ bool UseSmallData = false;
bool HasMemNoShuf = false;
bool EnableDuplex = false;
+ bool ReservedR19 = false;
+
public:
Hexagon::ArchEnum HexagonArchVersion;
Hexagon::ArchEnum HexagonHVXVersion = Hexagon::ArchEnum::V4;
@@ -115,44 +120,50 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool useMemOps() const { return UseMemOps; }
- bool hasV5TOps() const {
+ bool hasV5Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V5;
}
- bool hasV5TOpsOnly() const {
+ bool hasV5OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V5;
}
- bool hasV55TOps() const {
+ bool hasV55Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V55;
}
- bool hasV55TOpsOnly() const {
+ bool hasV55OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V55;
}
- bool hasV60TOps() const {
+ bool hasV60Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V60;
}
- bool hasV60TOpsOnly() const {
+ bool hasV60OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V60;
}
- bool hasV62TOps() const {
+ bool hasV62Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V62;
}
- bool hasV62TOpsOnly() const {
+ bool hasV62OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V62;
}
- bool hasV65TOps() const {
+ bool hasV65Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V65;
}
- bool hasV65TOpsOnly() const {
+ bool hasV65OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V65;
}
- bool modeIEEERndNear() const { return ModeIEEERndNear; }
+ bool useLongCalls() const { return UseLongCalls; }
+ bool useMemops() const { return UseMemops; }
+ bool usePackets() const { return UsePackets; }
+ bool useNewValueJumps() const { return UseNewValueJumps; }
+ bool useNewValueStores() const { return UseNewValueStores; }
+ bool useSmallData() const { return UseSmallData; }
+
bool useHVXOps() const { return HexagonHVXVersion > Hexagon::ArchEnum::V4; }
bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; }
bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; }
+
bool hasMemNoShuf() const { return HasMemNoShuf; }
- bool useLongCalls() const { return UseLongCalls; }
+ bool hasReservedR19() const { return ReservedR19; }
bool usePredicatedCalls() const;
bool useBSBScheduling() const { return UseBSBScheduling; }
@@ -170,11 +181,6 @@ public:
const std::string &getCPUString () const { return CPUString; }
- // Threshold for small data section
- unsigned getSmallDataThreshold() const {
- return Hexagon_SMALL_DATA_THRESHOLD;
- }
-
const Hexagon::ArchEnum &getHexagonArchVersion() const {
return HexagonArchVersion;
}
@@ -187,11 +193,11 @@ public:
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
const override;
- /// \brief Enable use of alias analysis during code generation (during MI
+ /// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
bool useAA() const override;
- /// \brief Perform target specific adjustments to the latency of a schedule
+ /// Perform target specific adjustments to the latency of a schedule
/// dependency.
void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const override;
@@ -238,6 +244,12 @@ public:
return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; });
}
+ unsigned getTypeAlignment(MVT Ty) const {
+ if (isHVXVectorType(Ty, true))
+ return getVectorLength();
+ return Ty.getSizeInBits() / 8;
+ }
+
unsigned getL1CacheLineSize() const;
unsigned getL1PrefetchDistance() const;
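
In numbers, the getTypeAlignment helper added above returns the full vector length for HVX vector types and the natural byte size for everything else. A small sketch assuming the 64-byte HVX configuration:

#include <cassert>

unsigned typeAlignment(unsigned sizeInBits, bool isHvxVector, unsigned vecLenBytes) {
  return isHvxVector ? vecLenBytes : sizeInBits / 8;  // mirrors the helper above
}

int main() {
  assert(typeAlignment(512, true, 64) == 64);   // v16i32 under HVX64: vector-length aligned
  assert(typeAlignment(64, false, 64) == 8);    // i64: natural 8-byte alignment
}
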
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 363b703fef28..2c75e9139ad7 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -94,9 +94,8 @@ static cl::opt<bool> EnableVectorPrint("enable-hexagon-vector-print",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Enable Hexagon Vector print instr pass"));
-static cl::opt<bool> EnableTrapUnreachable("hexagon-trap-unreachable",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Enable generating trap for unreachable"));
+static cl::opt<bool> EnableVExtractOpt("hexagon-opt-vextract", cl::Hidden,
+ cl::ZeroOrMore, cl::init(true), cl::desc("Enable vextract optimization"));
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
@@ -122,7 +121,9 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
namespace llvm {
extern char &HexagonExpandCondsetsID;
+ void initializeHexagonBitSimplifyPass(PassRegistry&);
void initializeHexagonConstExtendersPass(PassRegistry&);
+ void initializeHexagonConstPropagationPass(PassRegistry&);
void initializeHexagonEarlyIfConversionPass(PassRegistry&);
void initializeHexagonExpandCondsetsPass(PassRegistry&);
void initializeHexagonGenMuxPass(PassRegistry&);
@@ -133,6 +134,8 @@ namespace llvm {
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
void initializeHexagonRDFOptPass(PassRegistry&);
+ void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+ void initializeHexagonVExtractPass(PassRegistry&);
Pass *createHexagonLoopIdiomPass();
Pass *createHexagonVectorLoopCarriedReusePass();
@@ -165,6 +168,7 @@ namespace llvm {
FunctionPass *createHexagonSplitDoubleRegs();
FunctionPass *createHexagonStoreWidening();
FunctionPass *createHexagonVectorPrint();
+ FunctionPass *createHexagonVExtract();
} // end namespace llvm;
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
@@ -184,7 +188,9 @@ extern "C" void LLVMInitializeHexagonTarget() {
RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeHexagonBitSimplifyPass(PR);
initializeHexagonConstExtendersPass(PR);
+ initializeHexagonConstPropagationPass(PR);
initializeHexagonEarlyIfConversionPass(PR);
initializeHexagonGenMuxPass(PR);
initializeHexagonHardwareLoopsPass(PR);
@@ -194,6 +200,8 @@ extern "C" void LLVMInitializeHexagonTarget() {
initializeHexagonOptAddrModePass(PR);
initializeHexagonPacketizerPass(PR);
initializeHexagonRDFOptPass(PR);
+ initializeHexagonSplitDoubleRegsPass(PR);
+ initializeHexagonVExtractPass(PR);
}
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
@@ -213,8 +221,6 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM), (HexagonNoOpt ? CodeGenOpt::None : OL)),
TLOF(make_unique<HexagonTargetObjectFile>()) {
- if (EnableTrapUnreachable)
- this->Options.TrapUnreachable = true;
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
@@ -299,6 +305,11 @@ void HexagonPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+ if (!NoOpt) {
+ addPass(createConstantPropagationPass());
+ addPass(createDeadCodeEliminationPass());
+ }
+
addPass(createAtomicExpandPass());
if (!NoOpt) {
if (EnableLoopPrefetch)
@@ -321,6 +332,8 @@ bool HexagonPassConfig::addInstSelector() {
addPass(createHexagonISelDag(TM, getOptLevel()));
if (!NoOpt) {
+ if (EnableVExtractOpt)
+ addPass(createHexagonVExtract());
// Create logical operations on predicate registers.
if (EnableGenPred)
addPass(createHexagonGenPredicate());
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index ea86c9c42f47..e771f383dffa 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -74,7 +74,7 @@ static cl::opt<bool>
if (TraceGVPlacement) { \
TRACE_TO(errs(), X); \
} else { \
- DEBUG(TRACE_TO(dbgs(), X)); \
+ LLVM_DEBUG(TRACE_TO(dbgs(), X)); \
} \
} while (false)
#endif
@@ -200,11 +200,11 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
const TargetMachine &TM) const {
// Only global variables, not functions.
- DEBUG(dbgs() << "Checking if value is in small-data, -G"
- << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
+ LLVM_DEBUG(dbgs() << "Checking if value is in small-data, -G"
+ << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO);
if (!GVar) {
- DEBUG(dbgs() << "no, not a global variable\n");
+ LLVM_DEBUG(dbgs() << "no, not a global variable\n");
return false;
}
@@ -213,19 +213,19 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
// small data or not. This is how we can support mixing -G0/-G8 in LTO.
if (GVar->hasSection()) {
bool IsSmall = isSmallDataSection(GVar->getSection());
- DEBUG(dbgs() << (IsSmall ? "yes" : "no") << ", has section: "
- << GVar->getSection() << '\n');
+ LLVM_DEBUG(dbgs() << (IsSmall ? "yes" : "no")
+ << ", has section: " << GVar->getSection() << '\n');
return IsSmall;
}
if (GVar->isConstant()) {
- DEBUG(dbgs() << "no, is a constant\n");
+ LLVM_DEBUG(dbgs() << "no, is a constant\n");
return false;
}
bool IsLocal = GVar->hasLocalLinkage();
if (!StaticsInSData && IsLocal) {
- DEBUG(dbgs() << "no, is static\n");
+ LLVM_DEBUG(dbgs() << "no, is static\n");
return false;
}
@@ -234,7 +234,7 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
GType = PT->getElementType();
if (isa<ArrayType>(GType)) {
- DEBUG(dbgs() << "no, is an array\n");
+ LLVM_DEBUG(dbgs() << "no, is an array\n");
return false;
}
@@ -244,22 +244,22 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
// these objects end up in the sdata, the references will still be valid.
if (StructType *ST = dyn_cast<StructType>(GType)) {
if (ST->isOpaque()) {
- DEBUG(dbgs() << "no, has opaque type\n");
+ LLVM_DEBUG(dbgs() << "no, has opaque type\n");
return false;
}
}
unsigned Size = GVar->getParent()->getDataLayout().getTypeAllocSize(GType);
if (Size == 0) {
- DEBUG(dbgs() << "no, has size 0\n");
+ LLVM_DEBUG(dbgs() << "no, has size 0\n");
return false;
}
if (Size > SmallDataThreshold) {
- DEBUG(dbgs() << "no, size exceeds sdata threshold: " << Size << '\n');
+ LLVM_DEBUG(dbgs() << "no, size exceeds sdata threshold: " << Size << '\n');
return false;
}
- DEBUG(dbgs() << "yes\n");
+ LLVM_DEBUG(dbgs() << "yes\n");
return true;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index d638503990ad..a496a17788d5 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -16,23 +16,59 @@
#include "HexagonTargetTransformInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;
#define DEBUG_TYPE "hexagontti"
+static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
+ cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
+
static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
cl::init(true), cl::Hidden,
cl::desc("Control lookup table emission on Hexagon target"));
+// Constant "cost factor" to make floating point operations more expensive
+// in terms of vectorization cost. This isn't the best way, but it should
+// do. Ultimately, the cost should use cycles.
+static const unsigned FloatFactor = 4;
+
+bool HexagonTTIImpl::useHVX() const {
+ return ST.useHVXOps() && HexagonAutoHVX;
+}
+
+bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
+ assert(VecTy->isVectorTy());
+ // Avoid types like <2 x i32*>.
+ if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
+ return false;
+ EVT VecVT = EVT::getEVT(VecTy);
+ if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64)
+ return false;
+ if (ST.isHVXVectorType(VecVT.getSimpleVT()))
+ return true;
+ auto Action = TLI.getPreferredVectorAction(VecVT);
+ return Action == TargetLoweringBase::TypeWidenVector;
+}
+
+unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
+ if (Ty->isVectorTy())
+ return Ty->getVectorNumElements();
+ assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
+ "Expecting scalar type");
+ return 1;
+}
+
TargetTransformInfo::PopcntSupportKind
HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
- // Return Fast Hardware support as every input < 64 bits will be promoted
+ // Return fast hardware support as every input < 64 bits will be promoted
// to 64 bits.
return TargetTransformInfo::PSK_FastHardware;
}
@@ -41,37 +77,223 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Runtime = UP.Partial = true;
+ // Only try to peel innermost loops with small runtime trip counts.
+ if (L && L->empty() && canPeel(L) &&
+ SE.getSmallConstantTripCount(L) == 0 &&
+ SE.getSmallConstantMaxTripCount(L) > 0 &&
+ SE.getSmallConstantMaxTripCount(L) <= 5) {
+ UP.PeelCount = 2;
+ }
+}
+
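
Restated, the peeling addition above fires only for an innermost, peelable loop whose exact trip count is unknown (reported as 0) but whose maximum trip count is a known bound of at most 5; such loops get a peel count of 2. A compact sketch of that condition:

#include <cassert>

unsigned peelCount(bool innermost, bool canPeelLoop, unsigned exactTC, unsigned maxTC) {
  if (innermost && canPeelLoop && exactTC == 0 && maxTC > 0 && maxTC <= 5)
    return 2;
  return 0;
}

int main() {
  assert(peelCount(true, true, 0, 3) == 2);    // short, bounded loop: peel two iterations
  assert(peelCount(true, true, 4, 4) == 0);    // exact count known: leave to the unroller
  assert(peelCount(true, true, 0, 100) == 0);  // bound too large to be worth peeling
}
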
+bool HexagonTTIImpl::shouldFavorPostInc() const {
+ return true;
+}
+
+/// --- Vector TTI begin ---
+
+unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
+ if (Vector)
+ return useHVX() ? 32 : 0;
+ return 32;
+}
+
+unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ return useHVX() ? 2 : 0;
+}
+
+unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
+ return Vector ? getMinVectorRegisterBitWidth() : 32;
+}
+
+unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
+ return useHVX() ? ST.getVectorLength()*8 : 0;
+}
+
+unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
+ return (8 * ST.getVectorLength()) / ElemWidth;
+}
+
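
Putting numbers on the register queries above: with HVX enabled and 64-byte vectors (an assumed configuration), the vector register width is 512 bits, so getMinimumVF asks for at least 32 lanes of i16 or 16 lanes of i32.

#include <cassert>

int main() {
  unsigned vecLenBytes = 64;            // assumed ST.getVectorLength()
  unsigned regBits = vecLenBytes * 8;   // getMinVectorRegisterBitWidth()
  assert(regBits == 512);
  assert(regBits / 16 == 32);           // getMinimumVF(16)
  assert(regBits / 32 == 16);           // getMinimumVF(32)
}
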
+unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
+ bool Extract) {
+ return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
+}
+
+unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
+ ArrayRef<const Value*> Args, unsigned VF) {
+ return BaseT::getOperandsScalarizationOverhead(Args, VF);
+}
+
+unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
+ ArrayRef<Type*> Tys) {
+ return BaseT::getCallInstrCost(F, RetTy, Tys);
+}
+
+unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
+ return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+}
+
+unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Type*> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
+ if (ID == Intrinsic::bswap) {
+ std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy);
+ return LT.first + 2;
+ }
+ return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
+}
+
+unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
+ ScalarEvolution *SE, const SCEV *S) {
+ return 0;
+}
+
+unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment, unsigned AddressSpace, const Instruction *I) {
+ assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
+ if (Opcode == Instruction::Store)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+
+ if (Src->isVectorTy()) {
+ VectorType *VecTy = cast<VectorType>(Src);
+ unsigned VecWidth = VecTy->getBitWidth();
+ if (useHVX() && isTypeForHVX(VecTy)) {
+ unsigned RegWidth = getRegisterBitWidth(true);
+ Alignment = std::min(Alignment, RegWidth/8);
+ // Cost of HVX loads.
+ if (VecWidth % RegWidth == 0)
+ return VecWidth / RegWidth;
+ // Cost of constructing HVX vector from scalar loads.
+ unsigned AlignWidth = 8 * std::max(1u, Alignment);
+ unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
+ return 3*NumLoads;
+ }
+
+ // Non-HVX vectors.
+ // Add extra cost for floating point types.
+ unsigned Cost = VecTy->getElementType()->isFloatingPointTy() ? FloatFactor
+ : 1;
+ Alignment = std::min(Alignment, 8u);
+ unsigned AlignWidth = 8 * std::max(1u, Alignment);
+ unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
+ if (Alignment == 4 || Alignment == 8)
+ return Cost * NumLoads;
+ // Loads of less than 32 bits will need extra inserts to compose a vector.
+ unsigned LogA = Log2_32(Alignment);
+ return (3 - LogA) * Cost * NumLoads;
+ }
+
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+}
+
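
A worked example of the HVX branch of getMemoryOpCost above, assuming 128-byte vectors (RegWidth = 1024 bits) and LLVM's alignTo rounding: a vector that is a whole number of registers costs one unit per register; anything else is priced at three units per alignment-sized piece.

#include <cassert>

unsigned roundUpTo(unsigned v, unsigned a) { return (v + a - 1) / a * a; }  // like alignTo

unsigned hvxLoadCost(unsigned vecWidthBits, unsigned alignBytes, unsigned regWidthBits) {
  if (alignBytes > regWidthBits / 8)
    alignBytes = regWidthBits / 8;                     // clamp to one register
  if (vecWidthBits % regWidthBits == 0)
    return vecWidthBits / regWidthBits;                // whole-register loads
  unsigned alignWidthBits = 8 * (alignBytes ? alignBytes : 1);
  unsigned numLoads = roundUpTo(vecWidthBits, alignWidthBits) / alignWidthBits;
  return 3 * numLoads;                                 // assembled from narrow loads
}

int main() {
  assert(hvxLoadCost(2048, 128, 1024) == 2);       // two full HVX registers
  assert(hvxLoadCost(1536, 4, 1024) == 3 * 48);    // stitched from 48 x 32-bit loads
}
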
+unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode,
+ Type *Src, unsigned Alignment, unsigned AddressSpace) {
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+}
+
+unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) {
+ return 1;
+}
+
+unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ Value *Ptr, bool VariableMask, unsigned Alignment) {
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment);
+}
+
+unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
+ Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace) {
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
+
+unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy, const Instruction *I) {
+ if (ValTy->isVectorTy()) {
+ std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
+ if (Opcode == Instruction::FCmp)
+ return LT.first + FloatFactor * getTypeNumElements(ValTy);
+ }
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
-unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const {
- return vector ? 0 : 32;
+unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value*> Args) {
+ if (Ty->isVectorTy()) {
+ std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
+ if (LT.second.isFloatingPoint())
+ return LT.first + FloatFactor * getTypeNumElements(Ty);
+ }
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo, Args);
}
+unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
+ Type *SrcTy, const Instruction *I) {
+ if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
+ unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
+ unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;
+
+ std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy);
+ std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy);
+ return std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
+ }
+ return 1;
+}
+
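
The cast costs above charge FloatFactor (4) per floating-point element on either side of the conversion, on top of the larger of the two type-legalization costs. A sketch that assumes each type legalizes in a single step:

#include <algorithm>
#include <cassert>

unsigned fpCastCost(unsigned srcFpElems, unsigned dstFpElems,
                    unsigned srcLegalCost = 1, unsigned dstLegalCost = 1) {
  const unsigned FloatFactor = 4;   // as defined earlier in the file
  return std::max(srcLegalCost, dstLegalCost) + FloatFactor * (srcFpElems + dstFpElems);
}

int main() {
  assert(fpCastCost(1, 0) == 5);    // float -> i32: one FP element on the source side
  assert(fpCastCost(4, 4) == 33);   // v4f32 -> v4f64: FP elements on both sides
}
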
+unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
+ Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
+ : Val;
+ if (Opcode == Instruction::InsertElement) {
+ // Need two rotations for non-zero index.
+ unsigned Cost = (Index != 0) ? 2 : 0;
+ if (ElemTy->isIntegerTy(32))
+ return Cost;
+ // If it's not a 32-bit value, there will need to be an extract.
+ return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index);
+ }
+
+ if (Opcode == Instruction::ExtractElement)
+ return 2;
+
+ return 1;
+}
+
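
Likewise for element insert/extract above: a non-zero lane costs two vector rotations, a non-i32 element additionally pays for an extract, and a plain extract costs 2. Restated:

#include <cassert>

unsigned extractCost() { return 2; }

unsigned insertCost(unsigned index, bool elemIsI32) {
  unsigned cost = index != 0 ? 2 : 0;        // rotate the lane into place and back
  return elemIsI32 ? cost : cost + extractCost();
}

int main() {
  assert(insertCost(0, true) == 0);    // lane 0, i32 element: free
  assert(insertCost(5, true) == 2);    // arbitrary lane
  assert(insertCost(5, false) == 4);   // narrow element needs an extra extract
}
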
+/// --- Vector TTI end ---
+
unsigned HexagonTTIImpl::getPrefetchDistance() const {
- return getST()->getL1PrefetchDistance();
+ return ST.getL1PrefetchDistance();
}
unsigned HexagonTTIImpl::getCacheLineSize() const {
- return getST()->getL1CacheLineSize();
+ return ST.getL1CacheLineSize();
}
int HexagonTTIImpl::getUserCost(const User *U,
ArrayRef<const Value *> Operands) {
- auto isCastFoldedIntoLoad = [](const CastInst *CI) -> bool {
+ auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
if (!CI->isIntegerCast())
return false;
+ // Only extensions from an integer type shorter than 32-bit to i32
+ // can be folded into the load.
+ const DataLayout &DL = getDataLayout();
+ unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy());
+ unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy());
+ if (DBW != 32 || SBW >= DBW)
+ return false;
+
const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
// Technically, this code could allow multiple uses of the load, and
// check if all the uses are the same extension operation, but this
// should be sufficient for most cases.
- if (!LI || !LI->hasOneUse())
- return false;
-
- // Only extensions from an integer type shorter than 32-bit to i32
- // can be folded into the load.
- unsigned SBW = CI->getSrcTy()->getIntegerBitWidth();
- unsigned DBW = CI->getDestTy()->getIntegerBitWidth();
- return DBW == 32 && (SBW < DBW);
+ return LI && LI->hasOneUse();
};
if (const CastInst *CI = dyn_cast<const CastInst>(U))
@@ -81,5 +303,5 @@ int HexagonTTIImpl::getUserCost(const User *U,
}
bool HexagonTTIImpl::shouldBuildLookupTables() const {
- return EmitLookupTables;
+ return EmitLookupTables;
}
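
The getUserCost change treats an integer extension as free only when it widens a sub-32-bit type to i32 and its operand is a load with that extension as its sole user. A self-contained sketch of the predicate, using plain bit widths in place of the IR types (illustrative names, not the LLVM API):

#include <cassert>

// SrcBits/DstBits model the widths DataLayout would report; HasOneUse models
// LI->hasOneUse(); FeedsFromLoad models the dyn_cast<LoadInst> succeeding.
static bool castFoldsIntoLoad(unsigned SrcBits, unsigned DstBits,
                              bool FeedsFromLoad, bool HasOneUse) {
  // Only extensions from a type narrower than 32 bits to i32 qualify.
  if (DstBits != 32 || SrcBits >= DstBits)
    return false;
  // The load must exist and have the extension as its only user.
  return FeedsFromLoad && HasOneUse;
}

int main() {
  assert(castFoldsIntoLoad(16, 32, true, true));   // ext i16 -> i32 of a load
  assert(!castFoldsIntoLoad(32, 64, true, true));  // wrong destination width
  assert(!castFoldsIntoLoad(8, 32, true, false));  // load has other users
  return 0;
}
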
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index d2cd05012afa..a232f99fc407 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -37,16 +37,24 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
friend BaseT;
- const HexagonSubtarget *ST;
- const HexagonTargetLowering *TLI;
+ const HexagonSubtarget &ST;
+ const HexagonTargetLowering &TLI;
- const HexagonSubtarget *getST() const { return ST; }
- const HexagonTargetLowering *getTLI() const { return TLI; }
+ const HexagonSubtarget *getST() const { return &ST; }
+ const HexagonTargetLowering *getTLI() const { return &TLI; }
+
+ bool useHVX() const;
+ bool isTypeForHVX(Type *VecTy) const;
+
+ // Returns the number of vector elements of Ty, if Ty is a vector type,
+ // or 1 if Ty is a scalar type. It is incorrect to call this function
+ // with any other type.
+ unsigned getTypeNumElements(Type *Ty) const;
public:
explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
- TLI(ST->getTargetLowering()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(*TM->getSubtargetImpl(F)), TLI(*ST.getTargetLowering()) {}
/// \name Scalar TTI Implementations
/// @{
@@ -57,6 +65,9 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ /// Bias LSR towards creating post-increment opportunities.
+ bool shouldFavorPostInc() const;
+
// L1 cache prefetch.
unsigned getPrefetchDistance() const;
unsigned getCacheLineSize() const;
@@ -67,6 +78,64 @@ public:
/// @{
unsigned getNumberOfRegisters(bool vector) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getMinimumVF(unsigned ElemWidth) const;
+
+ bool shouldMaximizeVectorBandwidth(bool OptSize) const {
+ return true;
+ }
+ bool supportsEfficientVectorElementLoadStore() {
+ return false;
+ }
+ bool hasBranchDivergence() {
+ return false;
+ }
+ bool enableAggressiveInterleaving(bool LoopHasReductions) {
+ return false;
+ }
+ bool prefersVectorizedAddressing() {
+ return false;
+ }
+
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+ unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
+ unsigned VF);
+ unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
+ unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
+ unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Type*> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed = UINT_MAX);
+ unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
+ const SCEV *S);
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace, const Instruction *I = nullptr);
+ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp);
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask, unsigned Alignment);
+ unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
+ unsigned AddressSpace);
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I);
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+ unsigned getCFInstrCost(unsigned Opcode) {
+ return 1;
+ }
/// @}
@@ -77,5 +146,4 @@ public:
};
} // end namespace llvm
-
#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVExtract.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVExtract.cpp
new file mode 100644
index 000000000000..929ac2bd0d93
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVExtract.cpp
@@ -0,0 +1,166 @@
+//===- HexagonVExtract.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass will replace multiple occurrences of V6_extractw from the same
+// vector register with a combination of a vector store and scalar loads.
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/PassSupport.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+#include <map>
+
+using namespace llvm;
+
+static cl::opt<unsigned> VExtractThreshold("hexagon-vextract-threshold",
+ cl::Hidden, cl::ZeroOrMore, cl::init(1),
+ cl::desc("Threshold for triggering vextract replacement"));
+
+namespace llvm {
+ void initializeHexagonVExtractPass(PassRegistry& Registry);
+ FunctionPass *createHexagonVExtract();
+}
+
+namespace {
+ class HexagonVExtract : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonVExtract() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Hexagon optimize vextract";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ const HexagonSubtarget *HST = nullptr;
+ const HexagonInstrInfo *HII = nullptr;
+
+ unsigned genElemLoad(MachineInstr *ExtI, unsigned BaseR,
+ MachineRegisterInfo &MRI);
+ };
+
+ char HexagonVExtract::ID = 0;
+}
+
+INITIALIZE_PASS(HexagonVExtract, "hexagon-vextract",
+ "Hexagon optimize vextract", false, false)
+
+unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR,
+ MachineRegisterInfo &MRI) {
+ MachineBasicBlock &ExtB = *ExtI->getParent();
+ DebugLoc DL = ExtI->getDebugLoc();
+ unsigned ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+
+ unsigned ExtIdxR = ExtI->getOperand(2).getReg();
+ unsigned ExtIdxS = ExtI->getOperand(2).getSubReg();
+
+ // Simplified check for a compile-time constant value of ExtIdxR.
+ if (ExtIdxS == 0) {
+ MachineInstr *DI = MRI.getVRegDef(ExtIdxR);
+ if (DI->getOpcode() == Hexagon::A2_tfrsi) {
+ unsigned V = DI->getOperand(1).getImm();
+ V &= (HST->getVectorLength()-1) & -4u;
+
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::L2_loadri_io), ElemR)
+ .addReg(BaseR)
+ .addImm(V);
+ return ElemR;
+ }
+ }
+
+ unsigned IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::A2_andir), IdxR)
+ .add(ExtI->getOperand(2))
+ .addImm(-4);
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::L4_loadri_rr), ElemR)
+ .addReg(BaseR)
+ .addReg(IdxR)
+ .addImm(0);
+ return ElemR;
+}
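
For a constant extract index, genElemLoad folds the offset directly into the load: the immediate is wrapped to the vector length and rounded down to a 4-byte boundary. A standalone sketch of that masking, assuming a power-of-two vector length such as the 64- or 128-byte HVX modes (names are illustrative):

#include <cassert>
#include <cstdint>

// Mirrors `V &= (VecLen - 1) & -4u`: wrap the byte offset into the vector
// and align it down to a word boundary.
static uint32_t constantExtractOffset(uint32_t Imm, uint32_t VecLenBytes) {
  assert((VecLenBytes & (VecLenBytes - 1)) == 0 && "power of two expected");
  return Imm & ((VecLenBytes - 1) & ~3u);
}

int main() {
  assert(constantExtractOffset(12, 128) == 12);   // already word aligned
  assert(constantExtractOffset(13, 128) == 12);   // rounded down to 4 bytes
  assert(constantExtractOffset(130, 128) == 0);   // wrapped into the vector
  return 0;
}
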
+
+bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) {
+ HST = &MF.getSubtarget<HexagonSubtarget>();
+ HII = HST->getInstrInfo();
+ const auto &HRI = *HST->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ std::map<unsigned, SmallVector<MachineInstr*,4>> VExtractMap;
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ unsigned Opc = MI.getOpcode();
+ if (Opc != Hexagon::V6_extractw)
+ continue;
+ unsigned VecR = MI.getOperand(1).getReg();
+ VExtractMap[VecR].push_back(&MI);
+ }
+ }
+
+ for (auto &P : VExtractMap) {
+ unsigned VecR = P.first;
+ if (P.second.size() <= VExtractThreshold)
+ continue;
+
+ const auto &VecRC = *MRI.getRegClass(VecR);
+ int FI = MFI.CreateSpillStackObject(HRI.getSpillSize(VecRC),
+ HRI.getSpillAlignment(VecRC));
+ MachineInstr *DefI = MRI.getVRegDef(VecR);
+ MachineBasicBlock::iterator At = std::next(DefI->getIterator());
+ MachineBasicBlock &DefB = *DefI->getParent();
+ unsigned StoreOpc = VecRC.getID() == Hexagon::HvxVRRegClassID
+ ? Hexagon::V6_vS32b_ai
+ : Hexagon::PS_vstorerw_ai;
+ BuildMI(DefB, At, DefI->getDebugLoc(), HII->get(StoreOpc))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addReg(VecR);
+
+ unsigned VecSize = HRI.getRegSizeInBits(VecRC) / 8;
+
+ for (MachineInstr *ExtI : P.second) {
+ assert(ExtI->getOpcode() == Hexagon::V6_extractw);
+ unsigned SR = ExtI->getOperand(1).getSubReg();
+ assert(ExtI->getOperand(1).getReg() == VecR);
+
+ MachineBasicBlock &ExtB = *ExtI->getParent();
+ DebugLoc DL = ExtI->getDebugLoc();
+ unsigned BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::PS_fi), BaseR)
+ .addFrameIndex(FI)
+ .addImm(SR == 0 ? 0 : VecSize/2);
+
+ unsigned ElemR = genElemLoad(ExtI, BaseR, MRI);
+ unsigned ExtR = ExtI->getOperand(0).getReg();
+ MRI.replaceRegWith(ExtR, ElemR);
+ ExtB.erase(ExtI);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+FunctionPass *llvm::createHexagonVExtract() {
+ return new HexagonVExtract();
+}
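
The driver loop groups V6_extractw instructions by their source vector register and rewrites a group only when it exceeds the threshold, so a single spill can feed several scalar loads. A hedged sketch of that grouping with illustrative types, not the MachineInstr API:

#include <cassert>
#include <map>
#include <string>
#include <vector>

struct FakeExtract { std::string VecReg; };

static std::map<std::string, std::vector<const FakeExtract *>>
groupByVector(const std::vector<FakeExtract> &Extracts) {
  std::map<std::string, std::vector<const FakeExtract *>> Map;
  for (const FakeExtract &E : Extracts)
    Map[E.VecReg].push_back(&E);
  return Map;
}

int main() {
  const unsigned Threshold = 1; // mirrors -hexagon-vextract-threshold default
  std::vector<FakeExtract> Extracts = {{"v0"}, {"v0"}, {"v1"}};
  auto Groups = groupByVector(Extracts);
  // Only v0 has more extracts than the threshold, so only it would be
  // spilled once and re-read with scalar loads.
  assert(Groups["v0"].size() > Threshold);
  assert(!(Groups["v1"].size() > Threshold));
  return 0;
}
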
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index c2404235091c..56ab69db9bd1 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -199,11 +199,12 @@ static MachineBasicBlock::iterator moveInstrOut(MachineInstr &MI,
}
bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
- if (DisablePacketizer || skipFunction(MF.getFunction()))
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ if (DisablePacketizer || !HST.usePackets() || skipFunction(MF.getFunction()))
return false;
- HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
- HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
auto &MLI = getAnalysis<MachineLoopInfo>();
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
@@ -374,7 +375,7 @@ bool HexagonPacketizerList::promoteToDotCur(MachineInstr &MI,
void HexagonPacketizerList::cleanUpDotCur() {
MachineInstr *MI = nullptr;
for (auto BI : CurrentPacketMIs) {
- DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
+ LLVM_DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
if (HII->isDotCurInst(*BI)) {
MI = BI;
continue;
@@ -389,7 +390,7 @@ void HexagonPacketizerList::cleanUpDotCur() {
return;
// We did not find a use of the CUR, so de-cur it.
MI->setDesc(HII->get(HII->getNonDotCurOp(*MI)));
- DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
+ LLVM_DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
}
// Check to see if an instruction can be dot cur.
@@ -413,11 +414,10 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
return false;
// Make sure candidate instruction uses cur.
- DEBUG(dbgs() << "Can we DOT Cur Vector MI\n";
- MI.dump();
- dbgs() << "in packet\n";);
+ LLVM_DEBUG(dbgs() << "Can we DOT Cur Vector MI\n"; MI.dump();
+ dbgs() << "in packet\n";);
MachineInstr &MJ = *MII;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Checking CUR against ";
MJ.dump();
});
@@ -432,12 +432,12 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
// Check for existing uses of a vector register within the packet which
  // would be affected by converting a vector load into .cur format.
for (auto BI : CurrentPacketMIs) {
- DEBUG(dbgs() << "packet has "; BI->dump(););
+ LLVM_DEBUG(dbgs() << "packet has "; BI->dump(););
if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo()))
return false;
}
- DEBUG(dbgs() << "Can Dot CUR MI\n"; MI.dump(););
+ LLVM_DEBUG(dbgs() << "Can Dot CUR MI\n"; MI.dump(););
// We can convert the opcode into a .cur.
return true;
}
@@ -529,6 +529,9 @@ bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) {
return false;
int64_t Offset = MI.getOperand(OPI).getImm();
+ if (!HII->isValidOffset(MI.getOpcode(), Offset+Incr, HRI))
+ return false;
+
MI.getOperand(OPI).setImm(Offset + Incr);
ChangedOffset = Offset;
return true;
@@ -1033,7 +1036,7 @@ void HexagonPacketizerList::initPacketizerState() {
// Ignore bundling of pseudo instructions.
bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr &MI,
const MachineBasicBlock *) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
return true;
if (MI.isCFIInstruction())
@@ -1095,7 +1098,7 @@ bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
const HexagonInstrInfo &HII) {
const MachineFunction *MF = MI.getParent()->getParent();
- if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() &&
+ if (MF->getSubtarget<HexagonSubtarget>().hasV60OpsOnly() &&
HII.isHVXMemWithAIndirect(MI, MJ))
return true;
@@ -1112,6 +1115,10 @@ static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
case Hexagon::S4_stored_locked:
case Hexagon::L2_loadw_locked:
case Hexagon::L4_loadd_locked:
+ case Hexagon::Y2_dccleana:
+ case Hexagon::Y2_dccleaninva:
+ case Hexagon::Y2_dcinva:
+ case Hexagon::Y2_dczeroa:
case Hexagon::Y4_l2fetch:
case Hexagon::Y5_l2fetch: {
// These instructions can only be grouped with ALU32 or non-floating-point
@@ -1513,7 +1520,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
bool IsVecJ = HII->isHVXVec(J);
bool IsVecI = HII->isHVXVec(I);
- if (Slot1Store && MF.getSubtarget<HexagonSubtarget>().hasV65TOps() &&
+ if (Slot1Store && MF.getSubtarget<HexagonSubtarget>().hasV65Ops() &&
((LoadJ && StoreI && !NVStoreI) ||
(StoreJ && LoadI && !NVStoreJ)) &&
(J.getOpcode() != Hexagon::S2_allocframe &&
@@ -1683,8 +1690,12 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) {
PacketStalls = false;
PacketStalls |= producesStall(MI);
- if (MI.isImplicitDef())
+ if (MI.isImplicitDef()) {
+ // Add to the packet to allow subsequent instructions to be checked
+ // properly.
+ CurrentPacketMIs.push_back(&MI);
return MII;
+ }
assert(ResourceTracker->canReserveResources(MI));
bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI);
@@ -1754,7 +1765,7 @@ void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
bool memShufDisabled = getmemShufDisabled();
if (memShufDisabled && !foundLSInPacket()) {
setmemShufDisabled(false);
- DEBUG(dbgs() << " Not added to NoShufPacket\n");
+ LLVM_DEBUG(dbgs() << " Not added to NoShufPacket\n");
}
memShufDisabled = getmemShufDisabled();
@@ -1773,7 +1784,7 @@ void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
CurrentPacketMIs.clear();
ResourceTracker->clearResources();
- DEBUG(dbgs() << "End packet\n");
+ LLVM_DEBUG(dbgs() << "End packet\n");
}
bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
@@ -1803,17 +1814,18 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)];
- // Check if the latency is 0 between this instruction and any instruction
- // in the current packet. If so, we disregard any potential stalls due to
- // the instructions in the previous packet. Most of the instruction pairs
- // that can go together in the same packet have 0 latency between them.
- // Only exceptions are newValueJumps as they're generated much later and
- // the latencies can't be changed at that point. Another is .cur
- // instructions if its consumer has a 0 latency successor (such as .new).
- // In this case, the latency between .cur and the consumer stays non-zero
- // even though we can have both .cur and .new in the same packet. Changing
- // the latency to 0 is not an option as it causes software pipeliner to
- // not pipeline in some cases.
+ // If the latency is 0 and there is a data dependence between this
+ // instruction and any instruction in the current packet, we disregard any
+ // potential stalls due to the instructions in the previous packet. Most of
+ // the instruction pairs that can go together in the same packet have 0
+ // latency between them. The exceptions are
+ // 1. NewValueJumps as they're generated much later and the latencies can't
+ // be changed at that point.
+ // 2. .cur instructions, if its consumer has a 0 latency successor (such as
+ // .new). In this case, the latency between .cur and the consumer stays
+ // non-zero even though we can have both .cur and .new in the same packet.
+ // Changing the latency to 0 is not an option as it causes the software
+ // pipeliner to not pipeline in some cases.
// For Example:
// {
@@ -1826,19 +1838,10 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
for (auto J : CurrentPacketMIs) {
SUnit *SUJ = MIToSUnit[J];
for (auto &Pred : SUI->Preds)
- if (Pred.getSUnit() == SUJ &&
- (Pred.getLatency() == 0 || HII->isNewValueJump(I) ||
- HII->isToBeScheduledASAP(*J, I)))
- return false;
- }
-
- // Check if the latency is greater than one between this instruction and any
- // instruction in the previous packet.
- for (auto J : OldPacketMIs) {
- SUnit *SUJ = MIToSUnit[J];
- for (auto &Pred : SUI->Preds)
- if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
- return true;
+ if (Pred.getSUnit() == SUJ)
+ if ((Pred.getLatency() == 0 && Pred.isAssignedRegDep()) ||
+ HII->isNewValueJump(I) || HII->isToBeScheduledASAP(*J, I))
+ return false;
}
// Check if the latency is greater than one between this instruction and any
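
A compact model of the rule described in the comment above: a zero-latency register dependence on a member of the current packet (or a new-value jump) means stalls from the previous packet are disregarded. The struct below is an illustrative stand-in for the scheduler's SUnit/SDep data, not the real API:

#include <cassert>
#include <vector>

struct Dep { bool ToCurrentPacket; unsigned Latency; bool IsRegDep; };

static bool ignorePreviousPacketStalls(const std::vector<Dep> &Preds,
                                       bool IsNewValueJump) {
  for (const Dep &P : Preds)
    if (P.ToCurrentPacket &&
        ((P.Latency == 0 && P.IsRegDep) || IsNewValueJump))
      return true; // such pairs co-issue, so earlier stalls do not apply
  return false;
}

int main() {
  std::vector<Dep> ZeroLat = {{true, 0, true}};
  std::vector<Dep> NonZero = {{true, 2, true}};
  assert(ignorePreviousPacketStalls(ZeroLat, false));
  assert(!ignorePreviousPacketStalls(NonZero, false));
  assert(ignorePreviousPacketStalls(NonZero, true)); // new-value jump case
  return 0;
}
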
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 764d9ae9059a..40dcee3441a2 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -59,7 +59,7 @@ class HexagonPacketizerList : public VLIWPacketizerList {
bool PacketStalls = false;
protected:
- /// \brief A handle to the branch probability pass.
+ /// A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
const MachineLoopInfo *MLI;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index 39395dbd3aec..9d1073346c72 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -138,6 +138,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -363,17 +364,18 @@ bool HexagonVectorLoopCarriedReuse::canReplace(Instruction *I) {
if (II &&
(II->getIntrinsicID() == Intrinsic::hexagon_V6_hi ||
II->getIntrinsicID() == Intrinsic::hexagon_V6_lo)) {
- DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n");
+ LLVM_DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n");
return false;
}
return true;
}
void HexagonVectorLoopCarriedReuse::findValueToReuse() {
for (auto *D : Dependences) {
- DEBUG(dbgs() << "Processing dependence " << *(D->front()) << "\n");
+ LLVM_DEBUG(dbgs() << "Processing dependence " << *(D->front()) << "\n");
if (D->iterations() > HexagonVLCRIterationLim) {
- DEBUG(dbgs() <<
- ".. Skipping because number of iterations > than the limit\n");
+ LLVM_DEBUG(
+ dbgs()
+ << ".. Skipping because number of iterations > than the limit\n");
continue;
}
@@ -381,7 +383,8 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
Instruction *BEInst = D->back();
int Iters = D->iterations();
BasicBlock *BB = PN->getParent();
- DEBUG(dbgs() << "Checking if any uses of " << *PN << " can be reused\n");
+ LLVM_DEBUG(dbgs() << "Checking if any uses of " << *PN
+ << " can be reused\n");
SmallVector<Instruction *, 4> PNUsers;
for (auto UI = PN->use_begin(), E = PN->use_end(); UI != E; ++UI) {
@@ -391,7 +394,8 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
if (User->getParent() != BB)
continue;
if (ReplacedInsts.count(User)) {
- DEBUG(dbgs() << *User << " has already been replaced. Skipping...\n");
+ LLVM_DEBUG(dbgs() << *User
+ << " has already been replaced. Skipping...\n");
continue;
}
if (isa<PHINode>(User))
@@ -403,7 +407,7 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
PNUsers.push_back(User);
}
- DEBUG(dbgs() << PNUsers.size() << " use(s) of the PHI in the block\n");
+ LLVM_DEBUG(dbgs() << PNUsers.size() << " use(s) of the PHI in the block\n");
// For each interesting use I of PN, find an Instruction BEUser that
// performs the same operation as I on BEInst and whose other operands,
@@ -439,7 +443,7 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
}
}
if (BEUser) {
- DEBUG(dbgs() << "Found Value for reuse.\n");
+ LLVM_DEBUG(dbgs() << "Found Value for reuse.\n");
ReuseCandidate.Inst2Replace = I;
ReuseCandidate.BackedgeInst = BEUser;
return;
@@ -460,7 +464,7 @@ Value *HexagonVectorLoopCarriedReuse::findValueInBlock(Value *Op,
}
void HexagonVectorLoopCarriedReuse::reuseValue() {
- DEBUG(dbgs() << ReuseCandidate);
+ LLVM_DEBUG(dbgs() << ReuseCandidate);
Instruction *Inst2Replace = ReuseCandidate.Inst2Replace;
Instruction *BEInst = ReuseCandidate.BackedgeInst;
int NumOperands = Inst2Replace->getNumOperands();
@@ -485,7 +489,7 @@ void HexagonVectorLoopCarriedReuse::reuseValue() {
}
}
- DEBUG(dbgs() << "reuseValue is making the following changes\n");
+ LLVM_DEBUG(dbgs() << "reuseValue is making the following changes\n");
SmallVector<Instruction *, 4> InstsInPreheader;
for (int i = 0; i < Iterations; ++i) {
@@ -506,8 +510,8 @@ void HexagonVectorLoopCarriedReuse::reuseValue() {
InstsInPreheader.push_back(InstInPreheader);
InstInPreheader->setName(Inst2Replace->getName() + ".hexagon.vlcr");
InstInPreheader->insertBefore(LoopPH->getTerminator());
- DEBUG(dbgs() << "Added " << *InstInPreheader << " to " << LoopPH->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Added " << *InstInPreheader << " to "
+ << LoopPH->getName() << "\n");
}
BasicBlock *BB = BEInst->getParent();
IRBuilder<> IRB(BB);
@@ -519,7 +523,8 @@ void HexagonVectorLoopCarriedReuse::reuseValue() {
NewPhi = IRB.CreatePHI(InstInPreheader->getType(), 2);
NewPhi->addIncoming(InstInPreheader, LoopPH);
NewPhi->addIncoming(BEVal, BB);
- DEBUG(dbgs() << "Adding " << *NewPhi << " to " << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Adding " << *NewPhi << " to " << BB->getName()
+ << "\n");
BEVal = NewPhi;
}
// We are in LCSSA form. So, a value defined inside the Loop is used only
@@ -538,7 +543,7 @@ bool HexagonVectorLoopCarriedReuse::doVLCR() {
bool Changed = false;
bool Continue;
- DEBUG(dbgs() << "Working on Loop: " << *CurLoop->getHeader() << "\n");
+ LLVM_DEBUG(dbgs() << "Working on Loop: " << *CurLoop->getHeader() << "\n");
do {
// Reset datastructures.
Dependences.clear();
@@ -625,10 +630,9 @@ void HexagonVectorLoopCarriedReuse::findLoopCarriedDeps() {
else
delete D;
}
- DEBUG(dbgs() << "Found " << Dependences.size() << " dependences\n");
- DEBUG(for (size_t i = 0; i < Dependences.size(); ++i) {
- dbgs() << *Dependences[i] << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Found " << Dependences.size() << " dependences\n");
+ LLVM_DEBUG(for (size_t i = 0; i < Dependences.size();
+ ++i) { dbgs() << *Dependences[i] << "\n"; });
}
Pass *llvm::createHexagonVectorLoopCarriedReusePass() {
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
index ddd668b2cb1e..18d2f2f4acde 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
@@ -144,14 +144,15 @@ bool HexagonVectorPrint::runOnMachineFunction(MachineFunction &Fn) {
unsigned Reg = 0;
if (getInstrVecReg(*MII, Reg)) {
VecPrintList.push_back((&*MII));
- DEBUG(dbgs() << "Found vector reg inside bundle \n"; MII->dump());
+ LLVM_DEBUG(dbgs() << "Found vector reg inside bundle \n";
+ MII->dump());
}
}
} else {
unsigned Reg = 0;
if (getInstrVecReg(MI, Reg)) {
VecPrintList.push_back(&MI);
- DEBUG(dbgs() << "Found vector reg \n"; MI.dump());
+ LLVM_DEBUG(dbgs() << "Found vector reg \n"; MI.dump());
}
}
}
@@ -163,33 +164,33 @@ bool HexagonVectorPrint::runOnMachineFunction(MachineFunction &Fn) {
for (auto *I : VecPrintList) {
DebugLoc DL = I->getDebugLoc();
MachineBasicBlock *MBB = I->getParent();
- DEBUG(dbgs() << "Evaluating V MI\n"; I->dump());
+ LLVM_DEBUG(dbgs() << "Evaluating V MI\n"; I->dump());
unsigned Reg = 0;
if (!getInstrVecReg(*I, Reg))
llvm_unreachable("Need a vector reg");
MachineBasicBlock::instr_iterator MII = I->getIterator();
if (I->isInsideBundle()) {
- DEBUG(dbgs() << "add to end of bundle\n"; I->dump());
+ LLVM_DEBUG(dbgs() << "add to end of bundle\n"; I->dump());
while (MBB->instr_end() != MII && MII->isInsideBundle())
MII++;
} else {
- DEBUG(dbgs() << "add after instruction\n"; I->dump());
+ LLVM_DEBUG(dbgs() << "add after instruction\n"; I->dump());
MII++;
}
if (MBB->instr_end() == MII)
continue;
if (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) {
- DEBUG(dbgs() << "adding dump for V" << Reg-Hexagon::V0 << '\n');
+ LLVM_DEBUG(dbgs() << "adding dump for V" << Reg - Hexagon::V0 << '\n');
addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
} else if (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) {
- DEBUG(dbgs() << "adding dump for W" << Reg-Hexagon::W0 << '\n');
+ LLVM_DEBUG(dbgs() << "adding dump for W" << Reg - Hexagon::W0 << '\n');
addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2 + 1,
MII, DL, QII, Fn);
addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2,
MII, DL, QII, Fn);
} else if (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3) {
- DEBUG(dbgs() << "adding dump for Q" << Reg-Hexagon::Q0 << '\n');
+ LLVM_DEBUG(dbgs() << "adding dump for Q" << Reg - Hexagon::Q0 << '\n');
addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
} else
llvm_unreachable("Bad Vector reg");
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index fe54c19370b3..af1e5429d0c2 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -51,7 +51,7 @@ class HexagonAsmBackend : public MCAsmBackend {
SmallVector<MCFixup, 4> Fixups;
SmallString<256> Code;
raw_svector_ostream VecOS(Code);
- E.encodeInstruction(HMB, VecOS, Fixups, RF.getSubtargetInfo());
+ E.encodeInstruction(HMB, VecOS, Fixups, *RF.getSubtargetInfo());
// Update the fragment.
RF.setInst(HMB);
@@ -61,13 +61,14 @@ class HexagonAsmBackend : public MCAsmBackend {
public:
HexagonAsmBackend(const Target &T, const Triple &TT, uint8_t OSABI,
- StringRef CPU) :
- OSABI(OSABI), CPU(CPU), MCII(T.createMCInstrInfo()),
- RelaxTarget(new MCInst *), Extender(nullptr) {}
-
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createHexagonELFObjectWriter(OS, OSABI, CPU);
+ StringRef CPU)
+ : MCAsmBackend(support::little), OSABI(OSABI), CPU(CPU),
+ MCII(T.createMCInstrInfo()), RelaxTarget(new MCInst *),
+ Extender(nullptr) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createHexagonELFObjectWriter(OSABI, CPU);
}
void setExtender(MCContext &Context) const {
@@ -413,7 +414,8 @@ public:
/// fixup kind as appropriate.
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t FixupValue, bool IsResolved) const override {
+ uint64_t FixupValue, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
// When FixupValue is 0 the relocation is external and there
// is nothing for us to do.
@@ -510,17 +512,15 @@ public:
break;
}
- DEBUG(dbgs() << "Name=" << getFixupKindInfo(Kind).Name << "(" <<
- (unsigned)Kind << ")\n");
- DEBUG(uint32_t OldData = 0;
- for (unsigned i = 0; i < NumBytes; i++)
- OldData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
- dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) <<
- ": AValue=0x"; dbgs().write_hex(FixupValue) <<
- ": Offset=" << Offset <<
- ": Size=" << Data.size() <<
- ": OInst=0x"; dbgs().write_hex(OldData) <<
- ": Reloc=0x"; dbgs().write_hex(Reloc););
+ LLVM_DEBUG(dbgs() << "Name=" << getFixupKindInfo(Kind).Name << "("
+ << (unsigned)Kind << ")\n");
+ LLVM_DEBUG(
+ uint32_t OldData = 0; for (unsigned i = 0; i < NumBytes; i++) OldData |=
+ (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+ dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x";
+ dbgs().write_hex(FixupValue)
+ << ": Offset=" << Offset << ": Size=" << Data.size() << ": OInst=0x";
+ dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc););
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value. The Value has been "split up" into the
@@ -530,10 +530,10 @@ public:
InstAddr[i] |= uint8_t(Reloc >> (i * 8)) & 0xff; // Apply new reloc
}
- DEBUG(uint32_t NewData = 0;
- for (unsigned i = 0; i < NumBytes; i++)
- NewData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
- dbgs() << ": NInst=0x"; dbgs().write_hex(NewData) << "\n";);
+ LLVM_DEBUG(uint32_t NewData = 0;
+ for (unsigned i = 0; i < NumBytes; i++) NewData |=
+ (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+ dbgs() << ": NInst=0x"; dbgs().write_hex(NewData) << "\n";);
}
bool isInstRelaxable(MCInst const &HMI) const {
@@ -562,7 +562,8 @@ public:
/// relaxation.
///
/// \param Inst - The instruction to test.
- bool mayNeedRelaxation(MCInst const &Inst) const override {
+ bool mayNeedRelaxation(MCInst const &Inst,
+ const MCSubtargetInfo &STI) const override {
return true;
}
@@ -571,7 +572,8 @@ public:
bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
uint64_t Value,
const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const override {
+ const MCAsmLayout &Layout,
+ const bool WasForced) const override {
MCInst const &MCB = DF->getInst();
assert(HexagonMCInstrInfo::isBundle(MCB));
@@ -682,17 +684,17 @@ public:
assert(Update && "Didn't find relaxation target");
}
- bool writeNopData(uint64_t Count,
- MCObjectWriter * OW) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
static const uint32_t Nopcode = 0x7f000000, // Hard-coded NOP.
ParseIn = 0x00004000, // In packet parse-bits.
ParseEnd = 0x0000c000; // End of packet parse-bits.
while(Count % HEXAGON_INSTR_SIZE) {
- DEBUG(dbgs() << "Alignment not a multiple of the instruction size:" <<
- Count % HEXAGON_INSTR_SIZE << "/" << HEXAGON_INSTR_SIZE << "\n");
+ LLVM_DEBUG(dbgs() << "Alignment not a multiple of the instruction size:"
+ << Count % HEXAGON_INSTR_SIZE << "/"
+ << HEXAGON_INSTR_SIZE << "\n");
--Count;
- OW->write8(0);
+ OS << '\0';
}
while(Count) {
@@ -700,7 +702,7 @@ public:
// Close the packet whenever a multiple of the maximum packet size remains
uint32_t ParseBits = (Count % (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE))?
ParseIn: ParseEnd;
- OW->write32(Nopcode | ParseBits);
+ support::endian::write<uint32_t>(OS, Nopcode | ParseBits, Endian);
}
return true;
}
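
writeNopData pads with zero bytes until the count is a multiple of the instruction size and then emits 4-byte NOP words, closing a packet (ParseEnd) whenever the remaining byte count is a multiple of the maximum packet size. A standalone sketch of that loop, assuming the usual 4-byte instructions and 4-instruction packets:

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> nopWords(uint64_t Count) {
  const uint32_t Nopcode = 0x7f000000, ParseIn = 0x00004000,
                 ParseEnd = 0x0000c000;
  const unsigned InstrSize = 4, PacketSize = 4; // assumed sizes, see above
  std::vector<uint32_t> Words;
  Count -= Count % InstrSize; // leading odd bytes are written as zeros
  while (Count) {
    uint32_t ParseBits =
        (Count % (PacketSize * InstrSize)) ? ParseIn : ParseEnd;
    Words.push_back(Nopcode | ParseBits);
    Count -= InstrSize;
  }
  return Words;
}

int main() {
  auto W = nopWords(8); // neither word falls on a 16-byte boundary -> ParseIn
  assert(W.size() == 2);
  assert(W[0] == (0x7f000000 | 0x00004000));
  assert(W[1] == (0x7f000000 | 0x00004000));
  auto X = nopWords(16); // first word sits on a packet boundary -> ParseEnd
  assert(X[0] == (0x7f000000 | 0x0000c000));
  return 0;
}
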
@@ -736,7 +738,7 @@ public:
Inst.addOperand(MCOperand::createInst(Nop));
Size -= 4;
if (!HexagonMCChecker(
- Context, *MCII, RF.getSubtargetInfo(), Inst,
+ Context, *MCII, *RF.getSubtargetInfo(), Inst,
*Context.getRegisterInfo(), false)
.check()) {
Inst.erase(Inst.end() - 1);
@@ -744,7 +746,7 @@ public:
}
}
bool Error = HexagonMCShuffle(Context, true, *MCII,
- RF.getSubtargetInfo(), Inst);
+ *RF.getSubtargetInfo(), Inst);
//assert(!Error);
(void)Error;
ReplaceInstruction(Asm.getEmitter(), RF, Inst);
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index f5a376033757..cb504b5c3d5d 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -25,7 +25,7 @@ namespace llvm {
/// HexagonII - This namespace holds all of the target specific flags that
/// instruction info tracks.
namespace HexagonII {
- unsigned const TypeCVI_FIRST = TypeCVI_HIST;
+ unsigned const TypeCVI_FIRST = TypeCVI_4SLOT_MPY;
unsigned const TypeCVI_LAST = TypeCVI_VX_LATE;
enum SubTarget {
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index 12aa1bd9b2a0..e82e6b559f62 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -298,9 +298,7 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- StringRef CPU) {
- auto MOTW = llvm::make_unique<HexagonELFObjectWriter>(OSABI, CPU);
- return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian*/ true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU) {
+ return llvm::make_unique<HexagonELFObjectWriter>(OSABI, CPU);
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 1929152129fa..3b3a15b990f1 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -69,19 +69,12 @@ void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
OS << "\n";
}
- auto Separator = "";
- if (HexagonMCInstrInfo::isInnerLoop(*MI)) {
- OS << Separator;
- Separator = " ";
- MCInst ME;
- ME.setOpcode(Hexagon::ENDLOOP0);
- printInstruction(&ME, OS);
- }
- if (HexagonMCInstrInfo::isOuterLoop(*MI)) {
- OS << Separator;
- MCInst ME;
- ME.setOpcode(Hexagon::ENDLOOP1);
- printInstruction(&ME, OS);
+ bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI);
+ bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI);
+ if (IsLoop0) {
+ OS << (IsLoop1 ? " :endloop01" : " :endloop0");
+ } else if (IsLoop1) {
+ OS << " :endloop1";
}
}
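
The rewritten printInst emits the end-of-loop markers as plain suffixes instead of synthesizing ENDLOOP pseudo-instructions. A small sketch of the suffix selection it implements:

#include <cassert>
#include <string>

static std::string endloopSuffix(bool IsLoop0, bool IsLoop1) {
  if (IsLoop0)
    return IsLoop1 ? " :endloop01" : " :endloop0";
  if (IsLoop1)
    return " :endloop1";
  return ""; // not an end-of-loop packet
}

int main() {
  assert(endloopSuffix(true, true) == " :endloop01");
  assert(endloopSuffix(true, false) == " :endloop0");
  assert(endloopSuffix(false, true) == " :endloop1");
  assert(endloopSuffix(false, false).empty());
  return 0;
}
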
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 631c38c2734f..3382684803aa 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -33,7 +33,9 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
+#include <map>
#include <string>
+#include <vector>
#define DEBUG_TYPE "mccodeemitter"
@@ -42,62 +44,350 @@ using namespace Hexagon;
STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
-HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
- MCContext &aMCT)
- : MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
- Extended(new bool(false)), CurrentBundle(new MCInst const *),
- CurrentIndex(new size_t(0)) {}
+static const unsigned fixup_Invalid = ~0u;
+
+#define _ fixup_Invalid
+#define P(x) Hexagon::fixup_Hexagon##x
+static const std::map<unsigned, std::vector<unsigned>> ExtFixups = {
+ { MCSymbolRefExpr::VK_DTPREL,
+ { _, _, _, _,
+ _, _, P(_DTPREL_16_X), P(_DTPREL_11_X),
+ P(_DTPREL_11_X), P(_9_X), _, P(_DTPREL_11_X),
+ P(_DTPREL_16_X), _, _, _,
+ P(_DTPREL_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_DTPREL_32_6_X) }},
+ { MCSymbolRefExpr::VK_GOT,
+ { _, _, _, _,
+ _, _, P(_GOT_11_X), _ /* [1] */,
+ _ /* [1] */, P(_9_X), _, P(_GOT_11_X),
+ P(_GOT_16_X), _, _, _,
+ P(_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_GOTREL,
+ { _, _, _, _,
+ _, _, P(_GOTREL_11_X), P(_GOTREL_11_X),
+ P(_GOTREL_11_X), P(_9_X), _, P(_GOTREL_11_X),
+ P(_GOTREL_16_X), _, _, _,
+ P(_GOTREL_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOTREL_32_6_X) }},
+ { MCSymbolRefExpr::VK_TPREL,
+ { _, _, _, _,
+ _, _, P(_TPREL_16_X), P(_TPREL_11_X),
+ P(_TPREL_11_X), P(_9_X), _, P(_TPREL_11_X),
+ P(_TPREL_16_X), _, _, _,
+ P(_TPREL_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_TPREL_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT,
+ { _, _, _, _,
+ _, _, P(_GD_GOT_16_X), P(_GD_GOT_11_X),
+ P(_GD_GOT_11_X), P(_9_X), _, P(_GD_GOT_11_X),
+ P(_GD_GOT_16_X), _, _, _,
+ P(_GD_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GD_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, P(_9_X), _, P(_GD_PLT_B22_PCREL_X),
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_GD_PLT_B22_PCREL_X), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_IE,
+ { _, _, _, _,
+ _, _, P(_IE_16_X), _,
+ _, P(_9_X), _, _,
+ P(_IE_16_X), _, _, _,
+ P(_IE_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT,
+ { _, _, _, _,
+ _, _, P(_IE_GOT_11_X), P(_IE_GOT_11_X),
+ P(_IE_GOT_11_X), P(_9_X), _, P(_IE_GOT_11_X),
+ P(_IE_GOT_16_X), _, _, _,
+ P(_IE_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT,
+ { _, _, _, _,
+ _, _, P(_LD_GOT_11_X), P(_LD_GOT_11_X),
+ P(_LD_GOT_11_X), P(_9_X), _, P(_LD_GOT_11_X),
+ P(_LD_GOT_16_X), _, _, _,
+ P(_LD_GOT_16_X), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LD_GOT_32_6_X) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, P(_9_X), _, P(_LD_PLT_B22_PCREL_X),
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_LD_PLT_B22_PCREL_X), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_PCREL,
+ { _, _, _, _,
+ _, _, P(_6_PCREL_X), _,
+ _, P(_9_X), _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32_PCREL) }},
+ { MCSymbolRefExpr::VK_None,
+ { _, _, _, _,
+ _, _, P(_6_X), P(_8_X),
+ P(_8_X), P(_9_X), P(_10_X), P(_11_X),
+ P(_12_X), P(_B13_PCREL), _, P(_B15_PCREL_X),
+ P(_16_X), _, _, _,
+ _, _, P(_B22_PCREL_X), _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32_6_X) }},
+};
+// [1] The fixup is GOT_16_X for signed values and GOT_11_X for unsigned.
+
+static const std::map<unsigned, std::vector<unsigned>> StdFixups = {
+ { MCSymbolRefExpr::VK_DTPREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_DTPREL_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_DTPREL_32) }},
+ { MCSymbolRefExpr::VK_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOT_32) }},
+ { MCSymbolRefExpr::VK_GOTREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ /* [2] */, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GOTREL_32) }},
+ { MCSymbolRefExpr::VK_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_PLT_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_TPREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, P(_TPREL_11_X),
+ _, _, _, _,
+ P(_TPREL_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_TPREL_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GD_GOT_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GD_GOT_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_GD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_GD_PLT_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_GPREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_GPREL16_0), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_HI16,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_HI16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_IE,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_GOT_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_IE_GOT_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LD_GOT_16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LD_GOT_32) }},
+ { MCSymbolRefExpr::VK_Hexagon_LD_PLT,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, P(_LD_PLT_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_LO16,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_LO16), _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _ }},
+ { MCSymbolRefExpr::VK_Hexagon_PCREL,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32_PCREL) }},
+ { MCSymbolRefExpr::VK_None,
+ { _, _, _, _,
+ _, _, _, _,
+ _, _, _, _,
+ _, P(_B13_PCREL), _, P(_B15_PCREL),
+ _, _, _, _,
+ _, _, P(_B22_PCREL), _,
+ _, _, _, _,
+ _, _, _, _,
+ P(_32) }},
+};
+//
+// [2] The actual fixup is LO16 or HI16, depending on the instruction.
+#undef P
+#undef _
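
These tables are consumed by selecting the row for the relocation variant kind and then indexing into it, with fixup_Invalid (~0u) marking combinations that have no relocation; the row index appears to correspond to an operand-width bucket. A generic sketch of that lookup pattern, with illustrative kinds and fixup values rather than the real Hexagon enumerators:

#include <cassert>
#include <map>
#include <vector>

static const unsigned kInvalidFixup = ~0u; // same sentinel idea as fixup_Invalid

static unsigned lookupFixup(const std::map<unsigned, std::vector<unsigned>> &Tab,
                            unsigned VarKind, unsigned WidthIndex) {
  auto It = Tab.find(VarKind);
  if (It == Tab.end() || WidthIndex >= It->second.size())
    return kInvalidFixup;
  return It->second[WidthIndex];
}

int main() {
  enum { VK_None = 0, VK_GOT = 1 };  // stand-ins for MCSymbolRefExpr kinds
  enum { FixupA = 10, FixupB = 11 }; // stand-ins for Hexagon fixups
  std::map<unsigned, std::vector<unsigned>> Tab = {
      {VK_None, {kInvalidFixup, FixupA}},
      {VK_GOT, {FixupB, kInvalidFixup}},
  };
  assert(lookupFixup(Tab, VK_None, 1) == FixupA);
  assert(lookupFixup(Tab, VK_GOT, 1) == kInvalidFixup);
  assert(lookupFixup(Tab, 99, 0) == kInvalidFixup);
  return 0;
}
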
-uint32_t HexagonMCCodeEmitter::parseBits(size_t Last,
- MCInst const &MCB,
+uint32_t HexagonMCCodeEmitter::parseBits(size_t Last, MCInst const &MCB,
MCInst const &MCI) const {
bool Duplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
- if (*CurrentIndex == 0) {
+ if (State.Index == 0) {
if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
assert(!Duplex);
- assert(*CurrentIndex != Last);
+ assert(State.Index != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
- if (*CurrentIndex == 1) {
+ if (State.Index == 1) {
if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
assert(!Duplex);
- assert(*CurrentIndex != Last);
+ assert(State.Index != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
if (Duplex) {
- assert(*CurrentIndex == Last);
+ assert(State.Index == Last);
return HexagonII::INST_PARSE_DUPLEX;
}
- if(*CurrentIndex == Last)
+ if (State.Index == Last)
return HexagonII::INST_PARSE_PACKET_END;
return HexagonII::INST_PARSE_NOT_END;
}
-/// EncodeInstruction - Emit the bundle
+/// Emit the bundle.
void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
MCInst &HMB = const_cast<MCInst &>(MI);
assert(HexagonMCInstrInfo::isBundle(HMB));
- DEBUG(dbgs() << "Encoding bundle\n";);
- *Addend = 0;
- *Extended = false;
- *CurrentBundle = &MI;
- *CurrentIndex = 0;
+ LLVM_DEBUG(dbgs() << "Encoding bundle\n";);
+ State.Addend = 0;
+ State.Extended = false;
+ State.Bundle = &MI;
+ State.Index = 0;
size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
+ uint64_t Features = computeAvailableFeatures(STI.getFeatureBits());
+
for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
MCInst &HMI = const_cast<MCInst &>(*I.getInst());
- verifyInstructionPredicates(HMI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
- EncodeSingleInstruction(HMI, OS, Fixups, STI,
- parseBits(Last, HMB, HMI));
- *Extended = HexagonMCInstrInfo::isImmext(HMI);
- *Addend += HEXAGON_INSTR_SIZE;
- ++*CurrentIndex;
+ verifyInstructionPredicates(HMI, Features);
+
+ EncodeSingleInstruction(HMI, OS, Fixups, STI, parseBits(Last, HMB, HMI));
+ State.Extended = HexagonMCInstrInfo::isImmext(HMI);
+ State.Addend += HEXAGON_INSTR_SIZE;
+ ++State.Index;
}
}
@@ -115,9 +405,9 @@ static bool RegisterMatches(unsigned Consumer, unsigned Producer,
}
/// EncodeSingleInstruction - Emit a single instruction.
-void HexagonMCCodeEmitter::EncodeSingleInstruction(
- const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI, uint32_t Parse) const {
+void HexagonMCCodeEmitter::EncodeSingleInstruction(const MCInst &MI,
+ raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI, uint32_t Parse) const {
assert(!HexagonMCInstrInfo::isBundle(MI));
uint64_t Binary;
@@ -125,198 +415,150 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
// in the first place!
assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo() &&
"pseudo-instruction found");
- DEBUG(dbgs() << "Encoding insn"
- " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
- "\n");
+ LLVM_DEBUG(dbgs() << "Encoding insn `"
+ << HexagonMCInstrInfo::getName(MCII, MI) << "'\n");
Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+ unsigned Opc = MI.getOpcode();
+
// Check for unimplemented instructions. Immediate extenders
// are encoded as zero, so they need to be accounted for.
- if (!Binary &&
- MI.getOpcode() != DuplexIClass0 &&
- MI.getOpcode() != A4_ext) {
- DEBUG(dbgs() << "Unimplemented inst: "
- " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
- "\n");
+ if (!Binary && Opc != DuplexIClass0 && Opc != A4_ext) {
+ LLVM_DEBUG(dbgs() << "Unimplemented inst `"
+ << HexagonMCInstrInfo::getName(MCII, MI) << "'\n");
llvm_unreachable("Unimplemented Instruction");
}
Binary |= Parse;
// if we need to emit a duplexed instruction
- if (MI.getOpcode() >= Hexagon::DuplexIClass0 &&
- MI.getOpcode() <= Hexagon::DuplexIClassF) {
+ if (Opc >= Hexagon::DuplexIClass0 && Opc <= Hexagon::DuplexIClassF) {
assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
"Emitting duplex without duplex parse bits");
- unsigned dupIClass = MI.getOpcode() - Hexagon::DuplexIClass0;
+ unsigned DupIClass = MI.getOpcode() - Hexagon::DuplexIClass0;
// 29 is the bit position.
// 0b1110 =0xE bits are masked off and down shifted by 1 bit.
// Last bit is moved to bit position 13
- Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
+ Binary = ((DupIClass & 0xE) << (29 - 1)) | ((DupIClass & 0x1) << 13);
- const MCInst *subInst0 = MI.getOperand(0).getInst();
- const MCInst *subInst1 = MI.getOperand(1).getInst();
+ const MCInst *Sub0 = MI.getOperand(0).getInst();
+ const MCInst *Sub1 = MI.getOperand(1).getInst();
- // get subinstruction slot 0
- unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
- // get subinstruction slot 1
- unsigned subInstSlot1Bits = getBinaryCodeForInstr(*subInst1, Fixups, STI);
+ // Get subinstruction slot 0.
+ unsigned SubBits0 = getBinaryCodeForInstr(*Sub0, Fixups, STI);
+ // Get subinstruction slot 1.
+ State.SubInst1 = true;
+ unsigned SubBits1 = getBinaryCodeForInstr(*Sub1, Fixups, STI);
+ State.SubInst1 = false;
- Binary |= subInstSlot0Bits | (subInstSlot1Bits << 16);
+ Binary |= SubBits0 | (SubBits1 << 16);
}
- support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+ support::endian::write<uint32_t>(OS, Binary, support::little);
++MCNumEmitted;
}
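
The duplex path packs the 4-bit IClass so that its upper three bits land at bits 31..29 and its low bit at bit 13, leaving the two 13-bit sub-instruction encodings in the low and high halves of the word. A standalone sketch of that bit placement:

#include <cassert>
#include <cstdint>

static uint32_t duplexWord(unsigned DupIClass, uint32_t Sub0, uint32_t Sub1) {
  assert(DupIClass < 16 && "IClass is a 4-bit field");
  // Bits 3..1 of the IClass go to bits 31..29; bit 0 goes to bit 13.
  uint32_t Binary = ((DupIClass & 0xE) << (29 - 1)) | ((DupIClass & 0x1) << 13);
  return Binary | Sub0 | (Sub1 << 16);
}

int main() {
  // IClass 0xA = 0b1010: upper bits 0b101 -> bits 31..29, bit 13 stays clear.
  uint32_t W = duplexWord(0xA, 0, 0);
  assert((W >> 29) == 0x5);
  assert(((W >> 13) & 1) == 0);
  // IClass 0x3 = 0b0011: upper bits 0b001 -> bits 31..29, bit 13 is set.
  uint32_t V = duplexWord(0x3, 0, 0);
  assert((V >> 29) == 0x1);
  assert(((V >> 13) & 1) == 1);
  return 0;
}
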
LLVM_ATTRIBUTE_NORETURN
-static void raise_relocation_error(unsigned bits, unsigned kind) {
+static void raise_relocation_error(unsigned Width, unsigned Kind) {
std::string Text;
- {
- raw_string_ostream Stream(Text);
- Stream << "Unrecognized relocation combination bits: " << bits
- << " kind: " << kind;
- }
- report_fatal_error(Text);
+ raw_string_ostream Stream(Text);
+ Stream << "Unrecognized relocation combination: width=" << Width
+ << " kind=" << Kind;
+ report_fatal_error(Stream.str());
}
-/// getFixupNoBits - Some insns are not extended and thus have no
-/// bits. These cases require a more brute force method for determining
-/// the correct relocation.
+/// Some insns are not extended and thus have no bits. These cases require
+/// a more brute force method for determining the correct relocation.
Hexagon::Fixups HexagonMCCodeEmitter::getFixupNoBits(
- MCInstrInfo const &MCII, const MCInst &MI, const MCOperand &MO,
- const MCSymbolRefExpr::VariantKind kind) const {
+ MCInstrInfo const &MCII, const MCInst &MI, const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind VarKind) const {
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
- unsigned insnType = HexagonMCInstrInfo::getType(MCII, MI);
-
- if (insnType == HexagonII::TypeEXTENDER) {
- switch (kind) {
- case MCSymbolRefExpr::VK_GOTREL:
- return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
- case MCSymbolRefExpr::VK_GOT:
- return Hexagon::fixup_Hexagon_GOT_32_6_X;
- case MCSymbolRefExpr::VK_TPREL:
- return Hexagon::fixup_Hexagon_TPREL_32_6_X;
- case MCSymbolRefExpr::VK_DTPREL:
- return Hexagon::fixup_Hexagon_DTPREL_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- return Hexagon::fixup_Hexagon_GD_GOT_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- return Hexagon::fixup_Hexagon_LD_GOT_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- return Hexagon::fixup_Hexagon_IE_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
- case MCSymbolRefExpr::VK_Hexagon_PCREL:
- return Hexagon::fixup_Hexagon_B32_PCREL_X;
- case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
- return Hexagon::fixup_Hexagon_GD_PLT_B32_PCREL_X;
- case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
- return Hexagon::fixup_Hexagon_LD_PLT_B32_PCREL_X;
-
- case MCSymbolRefExpr::VK_None: {
- auto Insts = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
- for (auto I = Insts.begin(), N = Insts.end(); I != N; ++I) {
- if (I->getInst() == &MI) {
- const MCInst &NextI = *(I+1)->getInst();
- const MCInstrDesc &D = HexagonMCInstrInfo::getDesc(MCII, NextI);
- if (D.isBranch() || D.isCall() ||
- HexagonMCInstrInfo::getType(MCII, NextI) == HexagonII::TypeCR)
- return Hexagon::fixup_Hexagon_B32_PCREL_X;
- return Hexagon::fixup_Hexagon_32_6_X;
- }
+ unsigned InsnType = HexagonMCInstrInfo::getType(MCII, MI);
+ using namespace Hexagon;
+
+ if (InsnType == HexagonII::TypeEXTENDER) {
+ if (VarKind == MCSymbolRefExpr::VK_None) {
+ auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle);
+ for (auto I = Instrs.begin(), N = Instrs.end(); I != N; ++I) {
+ if (I->getInst() != &MI)
+ continue;
+ assert(I+1 != N && "Extender cannot be last in packet");
+ const MCInst &NextI = *(I+1)->getInst();
+ const MCInstrDesc &NextD = HexagonMCInstrInfo::getDesc(MCII, NextI);
+ if (NextD.isBranch() || NextD.isCall() ||
+ HexagonMCInstrInfo::getType(MCII, NextI) == HexagonII::TypeCR)
+ return fixup_Hexagon_B32_PCREL_X;
+ return fixup_Hexagon_32_6_X;
}
- raise_relocation_error(0, kind);
- }
- default:
- raise_relocation_error(0, kind);
}
- } else if (MCID.isBranch())
- return Hexagon::fixup_Hexagon_B13_PCREL;
- switch (MCID.getOpcode()) {
- case Hexagon::HI:
- case Hexagon::A2_tfrih:
- switch (kind) {
- case MCSymbolRefExpr::VK_GOT:
- return Hexagon::fixup_Hexagon_GOT_HI16;
- case MCSymbolRefExpr::VK_GOTREL:
- return Hexagon::fixup_Hexagon_GOTREL_HI16;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- return Hexagon::fixup_Hexagon_GD_GOT_HI16;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- return Hexagon::fixup_Hexagon_LD_GOT_HI16;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- return Hexagon::fixup_Hexagon_IE_HI16;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- return Hexagon::fixup_Hexagon_IE_GOT_HI16;
- case MCSymbolRefExpr::VK_TPREL:
- return Hexagon::fixup_Hexagon_TPREL_HI16;
- case MCSymbolRefExpr::VK_DTPREL:
- return Hexagon::fixup_Hexagon_DTPREL_HI16;
- case MCSymbolRefExpr::VK_None:
- return Hexagon::fixup_Hexagon_HI16;
- default:
- raise_relocation_error(0, kind);
- }
+ static const std::map<unsigned,unsigned> Relocs = {
+ { MCSymbolRefExpr::VK_GOTREL, fixup_Hexagon_GOTREL_32_6_X },
+ { MCSymbolRefExpr::VK_GOT, fixup_Hexagon_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_TPREL, fixup_Hexagon_TPREL_32_6_X },
+ { MCSymbolRefExpr::VK_DTPREL, fixup_Hexagon_DTPREL_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT, fixup_Hexagon_GD_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT, fixup_Hexagon_LD_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_IE, fixup_Hexagon_IE_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT, fixup_Hexagon_IE_GOT_32_6_X },
+ { MCSymbolRefExpr::VK_Hexagon_PCREL, fixup_Hexagon_B32_PCREL_X },
+ { MCSymbolRefExpr::VK_Hexagon_GD_PLT, fixup_Hexagon_GD_PLT_B32_PCREL_X },
+ { MCSymbolRefExpr::VK_Hexagon_LD_PLT, fixup_Hexagon_LD_PLT_B32_PCREL_X },
+ };
+
+ auto F = Relocs.find(VarKind);
+ if (F != Relocs.end())
+ return Hexagon::Fixups(F->second);
+ raise_relocation_error(0, VarKind);
+ }
- case Hexagon::LO:
- case Hexagon::A2_tfril:
- switch (kind) {
- case MCSymbolRefExpr::VK_GOT:
- return Hexagon::fixup_Hexagon_GOT_LO16;
- case MCSymbolRefExpr::VK_GOTREL:
- return Hexagon::fixup_Hexagon_GOTREL_LO16;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- return Hexagon::fixup_Hexagon_GD_GOT_LO16;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- return Hexagon::fixup_Hexagon_LD_GOT_LO16;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- return Hexagon::fixup_Hexagon_IE_LO16;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- return Hexagon::fixup_Hexagon_IE_GOT_LO16;
- case MCSymbolRefExpr::VK_TPREL:
- return Hexagon::fixup_Hexagon_TPREL_LO16;
- case MCSymbolRefExpr::VK_DTPREL:
- return Hexagon::fixup_Hexagon_DTPREL_LO16;
- case MCSymbolRefExpr::VK_None:
- return Hexagon::fixup_Hexagon_LO16;
- default:
- raise_relocation_error(0, kind);
- }
+ if (MCID.isBranch())
+ return fixup_Hexagon_B13_PCREL;
+
+ static const std::map<unsigned,unsigned> RelocsLo = {
+ { MCSymbolRefExpr::VK_GOT, fixup_Hexagon_GOT_LO16 },
+ { MCSymbolRefExpr::VK_GOTREL, fixup_Hexagon_GOTREL_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT, fixup_Hexagon_GD_GOT_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT, fixup_Hexagon_LD_GOT_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE, fixup_Hexagon_IE_LO16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT, fixup_Hexagon_IE_GOT_LO16 },
+ { MCSymbolRefExpr::VK_TPREL, fixup_Hexagon_TPREL_LO16 },
+ { MCSymbolRefExpr::VK_DTPREL, fixup_Hexagon_DTPREL_LO16 },
+ { MCSymbolRefExpr::VK_None, fixup_Hexagon_LO16 },
+ };
+
+ static const std::map<unsigned,unsigned> RelocsHi = {
+ { MCSymbolRefExpr::VK_GOT, fixup_Hexagon_GOT_HI16 },
+ { MCSymbolRefExpr::VK_GOTREL, fixup_Hexagon_GOTREL_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_GD_GOT, fixup_Hexagon_GD_GOT_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_LD_GOT, fixup_Hexagon_LD_GOT_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE, fixup_Hexagon_IE_HI16 },
+ { MCSymbolRefExpr::VK_Hexagon_IE_GOT, fixup_Hexagon_IE_GOT_HI16 },
+ { MCSymbolRefExpr::VK_TPREL, fixup_Hexagon_TPREL_HI16 },
+ { MCSymbolRefExpr::VK_DTPREL, fixup_Hexagon_DTPREL_HI16 },
+ { MCSymbolRefExpr::VK_None, fixup_Hexagon_HI16 },
+ };
- // The only relocs left should be GP relative:
- default:
- if (MCID.mayStore() || MCID.mayLoad()) {
- for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
- ++ImpUses) {
- if (*ImpUses != Hexagon::GP)
- continue;
- switch (HexagonMCInstrInfo::getMemAccessSize(MCII, MI)) {
- case 1:
- return fixup_Hexagon_GPREL16_0;
- case 2:
- return fixup_Hexagon_GPREL16_1;
- case 4:
- return fixup_Hexagon_GPREL16_2;
- case 8:
- return fixup_Hexagon_GPREL16_3;
- default:
- raise_relocation_error(0, kind);
- }
- }
+ switch (MCID.getOpcode()) {
+ case Hexagon::LO:
+ case Hexagon::A2_tfril: {
+ auto F = RelocsLo.find(VarKind);
+ if (F != RelocsLo.end())
+ return Hexagon::Fixups(F->second);
+ break;
+ }
+ case Hexagon::HI:
+ case Hexagon::A2_tfrih: {
+ auto F = RelocsHi.find(VarKind);
+ if (F != RelocsHi.end())
+ return Hexagon::Fixups(F->second);
+ break;
}
- raise_relocation_error(0, kind);
}
- llvm_unreachable("Relocation exit not taken");
-}
-
-namespace llvm {
-
-extern const MCInstrDesc HexagonInsts[];
-} // end namespace llvm
+ raise_relocation_error(0, VarKind);
+}
-static bool isPCRel (unsigned Kind) {
- switch(Kind){
+static bool isPCRel(unsigned Kind) {
+ switch (Kind){
case fixup_Hexagon_B22_PCREL:
case fixup_Hexagon_B15_PCREL:
case fixup_Hexagon_B7_PCREL:
@@ -342,16 +584,34 @@ static bool isPCRel (unsigned Kind) {
}
unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
- const MCOperand &MO,
- const MCExpr *ME,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const
-{
+ const MCOperand &MO, const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (isa<HexagonMCExpr>(ME))
ME = &HexagonMCInstrInfo::getExpr(*ME);
int64_t Value;
- if (ME->evaluateAsAbsolute(Value))
+ if (ME->evaluateAsAbsolute(Value)) {
+ bool InstExtendable = HexagonMCInstrInfo::isExtendable(MCII, MI) ||
+ HexagonMCInstrInfo::isExtended(MCII, MI);
+ // Only sub-instruction #1 can be extended in a duplex. If MI is a
+ // sub-instruction #0, it is not extended even if Extended is true
+ // (it can be true for the duplex as a whole).
+ bool IsSub0 = HexagonMCInstrInfo::isSubInstruction(MI) && !State.SubInst1;
+ if (State.Extended && InstExtendable && !IsSub0) {
+ unsigned OpIdx = ~0u;
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ if (&MO != &MI.getOperand(I))
+ continue;
+ OpIdx = I;
+ break;
+ }
+ assert(OpIdx != ~0u);
+ if (OpIdx == HexagonMCInstrInfo::getExtendableOp(MCII, MI)) {
+ unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ Value = (Value & 0x3f) << Shift;
+ }
+ }
return Value;
+ }
assert(ME->getKind() == MCExpr::SymbolRef ||
ME->getKind() == MCExpr::Binary);
if (ME->getKind() == MCExpr::Binary) {
@@ -360,366 +620,99 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
getExprOpValue(MI, MO, Binary->getRHS(), Fixups, STI);
return 0;
}
- Hexagon::Fixups FixupKind =
- Hexagon::Fixups(Hexagon::fixup_Hexagon_TPREL_LO16);
+
+ unsigned FixupKind = fixup_Invalid;
const MCSymbolRefExpr *MCSRE = static_cast<const MCSymbolRefExpr *>(ME);
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
- unsigned bits = HexagonMCInstrInfo::getExtentBits(MCII, MI) -
- HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
- const MCSymbolRefExpr::VariantKind kind = MCSRE->getKind();
-
- DEBUG(dbgs() << "----------------------------------------\n");
- DEBUG(dbgs() << "Opcode Name: " << HexagonMCInstrInfo::getName(MCII, MI)
- << "\n");
- DEBUG(dbgs() << "Opcode: " << MCID.getOpcode() << "\n");
- DEBUG(dbgs() << "Relocation bits: " << bits << "\n");
- DEBUG(dbgs() << "Addend: " << *Addend << "\n");
- DEBUG(dbgs() << "----------------------------------------\n");
-
- switch (bits) {
- default:
- raise_relocation_error(bits, kind);
- case 32:
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_DTPREL_32_6_X
- : Hexagon::fixup_Hexagon_DTPREL_32;
- break;
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOT_32_6_X
- : Hexagon::fixup_Hexagon_GOT_32;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOTREL_32_6_X
- : Hexagon::fixup_Hexagon_GOTREL_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_GOT_32_6_X
- : Hexagon::fixup_Hexagon_GD_GOT_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_32_6_X
- : Hexagon::fixup_Hexagon_IE_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_GOT_32_6_X
- : Hexagon::fixup_Hexagon_IE_GOT_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_GOT_32_6_X
- : Hexagon::fixup_Hexagon_LD_GOT_32;
- break;
- case MCSymbolRefExpr::VK_Hexagon_PCREL:
- FixupKind = Hexagon::fixup_Hexagon_32_PCREL;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind =
- *Extended ? Hexagon::fixup_Hexagon_32_6_X : Hexagon::fixup_Hexagon_32;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_TPREL_32_6_X
- : Hexagon::fixup_Hexagon_TPREL_32;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- break;
-
- case 22:
- switch (kind) {
- case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL_X
- : Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL_X
- : Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B22_PCREL_X
- : Hexagon::fixup_Hexagon_B22_PCREL;
- break;
- case MCSymbolRefExpr::VK_PLT:
- FixupKind = Hexagon::fixup_Hexagon_PLT_B22_PCREL;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- break;
-
- case 16:
- if (*Extended) {
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
- break;
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE:
- FixupKind = Hexagon::fixup_Hexagon_IE_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_16_X;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- } else
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- if (HexagonMCInstrInfo::s27_2_reloc(*MO.getExpr()))
- FixupKind = Hexagon::fixup_Hexagon_27_REG;
- else
- if (MCID.mayStore() || MCID.mayLoad()) {
- for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
- ++ImpUses) {
- if (*ImpUses != Hexagon::GP)
- continue;
- switch (HexagonMCInstrInfo::getMemAccessSize(MCII, MI)) {
- case 1:
- FixupKind = fixup_Hexagon_GPREL16_0;
- break;
- case 2:
- FixupKind = fixup_Hexagon_GPREL16_1;
- break;
- case 4:
- FixupKind = fixup_Hexagon_GPREL16_2;
- break;
- case 8:
- FixupKind = fixup_Hexagon_GPREL16_3;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- }
- } else
- raise_relocation_error(bits, kind);
- break;
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- if (MCID.getOpcode() == Hexagon::HI)
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_HI16;
- else
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_LO16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GPREL:
- FixupKind = Hexagon::fixup_Hexagon_GPREL16_0;
- break;
- case MCSymbolRefExpr::VK_Hexagon_HI16:
- FixupKind = Hexagon::fixup_Hexagon_HI16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LO16:
- FixupKind = Hexagon::fixup_Hexagon_LO16;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_16;
- break;
- default:
- raise_relocation_error(bits, kind);
+ unsigned FixupWidth = HexagonMCInstrInfo::getExtentBits(MCII, MI) -
+ HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ MCSymbolRefExpr::VariantKind VarKind = MCSRE->getKind();
+ unsigned Opc = MCID.getOpcode();
+ unsigned IType = HexagonMCInstrInfo::getType(MCII, MI);
+
+ LLVM_DEBUG(dbgs() << "----------------------------------------\n"
+ << "Opcode Name: " << HexagonMCInstrInfo::getName(MCII, MI)
+ << "\nOpcode: " << Opc << "\nRelocation bits: "
+ << FixupWidth << "\nAddend: " << State.Addend
+ << "\nVariant: " << unsigned(VarKind)
+ << "\n----------------------------------------\n");
+
+ // Pick the applicable fixup kind for the symbol.
+  // Handle special cases first; the rest are looked up in the tables.
+
+ if (FixupWidth == 16 && !State.Extended) {
+ if (VarKind == MCSymbolRefExpr::VK_None) {
+ if (HexagonMCInstrInfo::s27_2_reloc(*MO.getExpr())) {
+ // A2_iconst.
+ FixupKind = Hexagon::fixup_Hexagon_27_REG;
+ } else {
+ // Look for GP-relative fixups.
+ unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ static const Hexagon::Fixups GPRelFixups[] = {
+ Hexagon::fixup_Hexagon_GPREL16_0, Hexagon::fixup_Hexagon_GPREL16_1,
+ Hexagon::fixup_Hexagon_GPREL16_2, Hexagon::fixup_Hexagon_GPREL16_3
+ };
+ assert(Shift < array_lengthof(GPRelFixups));
+ auto UsesGP = [] (const MCInstrDesc &D) {
+ for (const MCPhysReg *U = D.getImplicitUses(); U && *U; ++U)
+ if (*U == Hexagon::GP)
+ return true;
+ return false;
+ };
+ if (UsesGP(MCID))
+ FixupKind = GPRelFixups[Shift];
}
- break;
-
- case 15:
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B15_PCREL_X
- : Hexagon::fixup_Hexagon_B15_PCREL;
- break;
- default:
- raise_relocation_error(bits, kind);
+ } else if (VarKind == MCSymbolRefExpr::VK_GOTREL) {
+ // Select between LO/HI.
+ if (Opc == Hexagon::LO)
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_LO16;
+ else if (Opc == Hexagon::HI)
+ FixupKind = Hexagon::fixup_Hexagon_GOTREL_HI16;
}
- break;
-
- case 13:
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_B13_PCREL;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- break;
-
- case 12:
- if (*Extended)
- switch (kind) {
- // There isn't a GOT_12_X, both 11_X and 16_X resolve to 6/26
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_12_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 11:
- if (*Extended)
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_11_X;
- break;
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GD_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_11_X;
+ } else {
+ bool BranchOrCR = MCID.isBranch() || IType == HexagonII::TypeCR;
+ switch (FixupWidth) {
+ case 9:
+ if (BranchOrCR)
+ FixupKind = State.Extended ? Hexagon::fixup_Hexagon_B9_PCREL_X
+ : Hexagon::fixup_Hexagon_B9_PCREL;
+ break;
+ case 8:
+ case 7:
+ if (State.Extended && VarKind == MCSymbolRefExpr::VK_GOT)
+ FixupKind = HexagonMCInstrInfo::isExtentSigned(MCII, MI)
+ ? Hexagon::fixup_Hexagon_GOT_16_X
+ : Hexagon::fixup_Hexagon_GOT_11_X;
+ else if (FixupWidth == 7 && BranchOrCR)
+ FixupKind = State.Extended ? Hexagon::fixup_Hexagon_B7_PCREL_X
+ : Hexagon::fixup_Hexagon_B7_PCREL;
+ break;
+ case 0:
+ FixupKind = getFixupNoBits(MCII, MI, MO, VarKind);
break;
- case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_LD_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
- FixupKind = Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
- FixupKind = Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_11_X;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- else {
- switch (kind) {
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
}
- break;
+ }
- case 10:
- if (*Extended) {
- switch (kind) {
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_10_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- } else
- raise_relocation_error(bits, kind);
- break;
-
- case 9:
- if (MCID.isBranch() ||
- (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B9_PCREL_X
- : Hexagon::fixup_Hexagon_B9_PCREL;
- else if (*Extended)
- FixupKind = Hexagon::fixup_Hexagon_9_X;
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 8:
- if (*Extended)
- FixupKind = Hexagon::fixup_Hexagon_8_X;
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 7:
- if (MCID.isBranch() ||
- (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B7_PCREL_X
- : Hexagon::fixup_Hexagon_B7_PCREL;
- else if (*Extended)
- FixupKind = Hexagon::fixup_Hexagon_7_X;
- else
- raise_relocation_error(bits, kind);
- break;
-
- case 6:
- if (*Extended) {
- switch (kind) {
- case MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
- break;
- // This is part of an extender, GOT_11 is a
- // Word32_U6 unsigned/truncated reloc.
- case MCSymbolRefExpr::VK_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
- break;
- case MCSymbolRefExpr::VK_GOTREL:
- FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
- break;
- case MCSymbolRefExpr::VK_Hexagon_PCREL:
- FixupKind = Hexagon::fixup_Hexagon_6_PCREL_X;
- break;
- case MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
- break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Hexagon::fixup_Hexagon_6_X;
- break;
- default:
- raise_relocation_error(bits, kind);
- }
- } else
- raise_relocation_error(bits, kind);
- break;
+ if (FixupKind == fixup_Invalid) {
+ const auto &FixupTable = State.Extended ? ExtFixups : StdFixups;
- case 0:
- FixupKind = getFixupNoBits(MCII, MI, MO, kind);
- break;
+ auto FindVK = FixupTable.find(VarKind);
+ if (FindVK != FixupTable.end())
+ FixupKind = FindVK->second[FixupWidth];
}
- MCExpr const *FixupExpression =
- (*Addend > 0 && isPCRel(FixupKind))
- ? MCBinaryExpr::createAdd(MO.getExpr(),
- MCConstantExpr::create(*Addend, MCT), MCT)
- : MO.getExpr();
+ if (FixupKind == fixup_Invalid)
+ raise_relocation_error(FixupWidth, VarKind);
- MCFixup fixup = MCFixup::create(*Addend, FixupExpression,
+ const MCExpr *FixupExpr = MO.getExpr();
+ if (State.Addend != 0 && isPCRel(FixupKind)) {
+ const MCExpr *C = MCConstantExpr::create(State.Addend, MCT);
+ FixupExpr = MCBinaryExpr::createAdd(FixupExpr, C, MCT);
+ }
+
+ MCFixup Fixup = MCFixup::create(State.Addend, FixupExpr,
MCFixupKind(FixupKind), MI.getLoc());
- Fixups.push_back(fixup);
+ Fixups.push_back(Fixup);
// All of the information is in the fixup.
return 0;
}
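
For reference, the rewritten getExprOpValue above replaces the old width-by-width switch with a few special cases followed by a per-variant-kind lookup table indexed by operand width. A minimal standalone sketch of that table-driven selection, with illustrative names only (this is not the backend's actual API):

// Standalone sketch: pick a fixup from a (variant kind x operand width) table.
#include <array>
#include <cstdio>
#include <map>

enum VariantKind { VK_None, VK_GOT, VK_TPREL };
enum Fixup { FixupInvalid = -1, Fixup_32 = 0, Fixup_16, Fixup_GOT_32,
             Fixup_GOT_16, Fixup_TPREL_32, Fixup_TPREL_16 };

// One row per variant kind; the column index is the operand bit-width.
// A real table has one entry per supported width; here only 16 and 32.
using Row = std::array<Fixup, 33>;

static Row makeRow(Fixup F16, Fixup F32) {
  Row R;
  R.fill(FixupInvalid);
  R[16] = F16;
  R[32] = F32;
  return R;
}

int pickFixup(VariantKind VK, unsigned Width) {
  static const std::map<VariantKind, Row> StdFixups = {
      {VK_None,  makeRow(Fixup_16, Fixup_32)},
      {VK_GOT,   makeRow(Fixup_GOT_16, Fixup_GOT_32)},
      {VK_TPREL, makeRow(Fixup_TPREL_16, Fixup_TPREL_32)},
  };
  auto It = StdFixups.find(VK);
  if (It == StdFixups.end() || Width >= It->second.size())
    return FixupInvalid;                 // caller reports a relocation error
  return It->second[Width];
}

int main() {
  std::printf("GOT/32  -> %d\n", pickFixup(VK_GOT, 32));   // Fixup_GOT_32
  std::printf("None/16 -> %d\n", pickFixup(VK_None, 16));  // Fixup_16
  std::printf("TPREL/8 -> %d\n", pickFixup(VK_TPREL, 8));  // FixupInvalid
}
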
@@ -739,55 +732,55 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
#endif
if (HexagonMCInstrInfo::isNewValue(MCII, MI) &&
- &MO == &MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI))) {
+ &MO == &HexagonMCInstrInfo::getNewValueOperand(MCII, MI)) {
// Calculate the new value distance to the associated producer
- MCOperand const &MCO =
- MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI));
unsigned SOffset = 0;
unsigned VOffset = 0;
- unsigned Register = MCO.getReg();
- unsigned Register1;
- unsigned Register2;
- auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
- auto i = Instructions.begin() + *CurrentIndex - 1;
- for (;; --i) {
- assert(i != Instructions.begin() - 1 && "Couldn't find producer");
- MCInst const &Inst = *i->getInst();
+ unsigned UseReg = MO.getReg();
+ unsigned DefReg1, DefReg2;
+
+ auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle);
+ const MCOperand *I = Instrs.begin() + State.Index - 1;
+
+ for (;; --I) {
+ assert(I != Instrs.begin() - 1 && "Couldn't find producer");
+ MCInst const &Inst = *I->getInst();
if (HexagonMCInstrInfo::isImmext(Inst))
continue;
+
+ DefReg1 = DefReg2 = 0;
++SOffset;
- if (HexagonMCInstrInfo::isVector(MCII, Inst))
- // Vector instructions don't count scalars
+ if (HexagonMCInstrInfo::isVector(MCII, Inst)) {
+ // Vector instructions don't count scalars.
++VOffset;
- Register1 =
- HexagonMCInstrInfo::hasNewValue(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- Register2 =
- HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- if (!RegisterMatches(Register, Register1, Register2))
+ }
+ if (HexagonMCInstrInfo::hasNewValue(MCII, Inst))
+ DefReg1 = HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg();
+ if (HexagonMCInstrInfo::hasNewValue2(MCII, Inst))
+ DefReg2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg();
+ if (!RegisterMatches(UseReg, DefReg1, DefReg2)) {
// This isn't the register we're looking for
continue;
- if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+ }
+ if (!HexagonMCInstrInfo::isPredicated(MCII, Inst)) {
// Producer is unpredicated
break;
+ }
assert(HexagonMCInstrInfo::isPredicated(MCII, MI) &&
- "Unpredicated consumer depending on predicated producer");
+ "Unpredicated consumer depending on predicated producer");
if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
- HexagonMCInstrInfo::isPredicatedTrue(MCII, MI))
- // Producer predicate sense matched ours
+ HexagonMCInstrInfo::isPredicatedTrue(MCII, MI))
+ // Producer predicate sense matched ours.
break;
}
// Hexagon PRM 10.11 Construct Nt from distance
- unsigned Offset =
- HexagonMCInstrInfo::isVector(MCII, MI) ? VOffset : SOffset;
+ unsigned Offset = HexagonMCInstrInfo::isVector(MCII, MI) ? VOffset
+ : SOffset;
Offset <<= 1;
- Offset |=
- HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
+ Offset |= HexagonMCInstrInfo::SubregisterBit(UseReg, DefReg1, DefReg2);
return Offset;
}
+
assert(!MO.isImm());
if (MO.isReg()) {
unsigned Reg = MO.getReg();
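
The new-value encoding above walks backward through the bundle from the consumer, skips constant extenders, counts candidate producers, and forms the Nt field as that distance shifted left by one with the sub-register bit ORed in (Hexagon PRM 10.11). A minimal standalone sketch of the computation, illustrative names only and with predication handling omitted:

// Standalone sketch of the new-value distance (Nt) computation.
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

struct Insn {
  bool IsExtender;     // constant-extender words are not counted
  bool IsVector;
  unsigned DefReg;     // 0 means "no new-value definition"
};

unsigned newValueBits(const std::vector<Insn> &Bundle, size_t ConsumerIdx,
                      unsigned UseReg, bool ConsumerIsVector, bool SubRegHigh) {
  unsigned SOffset = 0, VOffset = 0;
  for (size_t I = ConsumerIdx; I-- > 0;) {
    const Insn &P = Bundle[I];
    if (P.IsExtender)
      continue;                    // immext does not count toward the distance
    ++SOffset;
    if (P.IsVector)
      ++VOffset;                   // vector consumers only count vector producers
    if (P.DefReg == UseReg)        // found the producer
      return ((ConsumerIsVector ? VOffset : SOffset) << 1) | (SubRegHigh ? 1 : 0);
  }
  assert(false && "Couldn't find producer");
  return 0;
}

int main() {
  // Producer of r3, then an extender, then the consumer at index 2.
  std::vector<Insn> Bundle = {{false, false, 3}, {true, false, 0}, {false, false, 0}};
  std::printf("Nt bits = %u\n", newValueBits(Bundle, 2, 3, false, false)); // 2
}
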
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 14cabf1534a5..fcea63db23a3 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Definition for classes that emit Hexagon machine code from MCInsts
+/// Definition for classes that emit Hexagon machine code from MCInsts
///
//===----------------------------------------------------------------------===//
@@ -35,25 +35,20 @@ class raw_ostream;
class HexagonMCCodeEmitter : public MCCodeEmitter {
MCContext &MCT;
MCInstrInfo const &MCII;
- std::unique_ptr<unsigned> Addend;
- std::unique_ptr<bool> Extended;
- std::unique_ptr<MCInst const *> CurrentBundle;
- std::unique_ptr<size_t> CurrentIndex;
- // helper routine for getMachineOpValue()
- unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
- const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
- const MCOperand &MO,
- const MCSymbolRefExpr::VariantKind kind) const;
+  // Mutable emitter state, updated while encoding bundles and duplexes.
+ struct EmitterState {
+ unsigned Addend = 0;
+ bool Extended = false;
+ bool SubInst1 = false;
+ const MCInst *Bundle = nullptr;
+ size_t Index = 0;
+ };
+ mutable EmitterState State;
public:
- HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
-
- // Return parse bits for instruction `MCI' inside bundle `MCB'
- uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
+ HexagonMCCodeEmitter(MCInstrInfo const &MII, MCContext &MCT)
+ : MCT(MCT), MCII(MII) {}
void encodeInstruction(MCInst const &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -64,18 +59,30 @@ public:
const MCSubtargetInfo &STI,
uint32_t Parse) const;
- // \brief TableGen'erated function for getting the
+ // TableGen'erated function for getting the
// binary encoding for an instruction.
uint64_t getBinaryCodeForInstr(MCInst const &MI,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const;
- /// \brief Return binary encoding of operand.
+ /// Return binary encoding of operand.
unsigned getMachineOpValue(MCInst const &MI, MCOperand const &MO,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const;
private:
+ // helper routine for getMachineOpValue()
+ unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
+ const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+ const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind Kind) const;
+
+ // Return parse bits for instruction `MCI' inside bundle `MCB'
+ uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
+
uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
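
The header change above folds the emitter's per-bundle bookkeeping (addend, extender flag, bundle pointer, index) into one plain struct held as a mutable member, replacing four heap-allocated unique_ptr fields. A minimal standalone sketch of that pattern, illustrative names only:

// Standalone sketch: per-bundle state in a mutable struct, usable from
// const encode methods, instead of unique_ptr-wrapped scalars.
#include <cstddef>
#include <cstdio>

class Emitter {
  struct State {
    unsigned Addend = 0;
    bool Extended = false;
    std::size_t Index = 0;
  };
  mutable State S;              // reset for every bundle, updated during encoding

public:
  void encode(std::size_t NumInsns) const {
    S = State();                // start each bundle with a clean state
    for (S.Index = 0; S.Index < NumInsns; ++S.Index)
      std::printf("encoding insn %zu (extended=%d)\n", S.Index, S.Extended);
  }
};

int main() {
  Emitter E;
  E.encode(3);
}
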
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 127c97e342dc..3eaef9ac7410 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -205,7 +205,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
switch (L.getOpcode()) {
default:
- DEBUG(dbgs() << "Possible compound ignored\n");
+ LLVM_DEBUG(dbgs() << "Possible compound ignored\n");
return CompoundInsn;
case Hexagon::A2_tfrsi:
@@ -233,7 +233,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpeq:
- DEBUG(dbgs() << "CX: C2_cmpeq\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpeq\n");
Rs = L.getOperand(1);
Rt = L.getOperand(2);
@@ -246,7 +246,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgt:
- DEBUG(dbgs() << "CX: C2_cmpgt\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgt\n");
Rs = L.getOperand(1);
Rt = L.getOperand(2);
@@ -259,7 +259,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgtu:
- DEBUG(dbgs() << "CX: C2_cmpgtu\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgtu\n");
Rs = L.getOperand(1);
Rt = L.getOperand(2);
@@ -272,7 +272,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpeqi:
- DEBUG(dbgs() << "CX: C2_cmpeqi\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpeqi\n");
Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
(void)Success;
assert(Success);
@@ -290,7 +290,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgti:
- DEBUG(dbgs() << "CX: C2_cmpgti\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgti\n");
Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
(void)Success;
assert(Success);
@@ -308,7 +308,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::C2_cmpgtui:
- DEBUG(dbgs() << "CX: C2_cmpgtui\n");
+ LLVM_DEBUG(dbgs() << "CX: C2_cmpgtui\n");
Rs = L.getOperand(1);
compoundOpcode = cmpgtuiBitOpcode[getCompoundOp(R)];
CompoundInsn = new (Context) MCInst;
@@ -319,7 +319,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
break;
case Hexagon::S2_tstbit_i:
- DEBUG(dbgs() << "CX: S2_tstbit_i\n");
+ LLVM_DEBUG(dbgs() << "CX: S2_tstbit_i\n");
Rs = L.getOperand(1);
compoundOpcode = tstBitOpcode[getCompoundOp(R)];
CompoundInsn = new (Context) MCInst;
@@ -372,14 +372,14 @@ static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
BExtended = true;
continue;
}
- DEBUG(dbgs() << "J,B: " << JumpInst->getOpcode() << ","
- << Inst->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "J,B: " << JumpInst->getOpcode() << ","
+ << Inst->getOpcode() << "\n");
if (isOrderedCompoundPair(*Inst, BExtended, *JumpInst, JExtended)) {
MCInst *CompoundInsn = getCompoundInsn(Context, *Inst, *JumpInst);
if (CompoundInsn) {
- DEBUG(dbgs() << "B: " << Inst->getOpcode() << ","
- << JumpInst->getOpcode() << " Compounds to "
- << CompoundInsn->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "B: " << Inst->getOpcode() << ","
+ << JumpInst->getOpcode() << " Compounds to "
+ << CompoundInsn->getOpcode() << "\n");
J->setInst(CompoundInsn);
MCI.erase(B);
return true;
@@ -422,7 +422,7 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo co
if (StartedValid &&
!llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI)) {
- DEBUG(dbgs() << "Found ERROR\n");
+ LLVM_DEBUG(dbgs() << "Found ERROR\n");
MCI = OriginalBundle;
}
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 4c18af60efd1..b208a3668124 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -263,12 +263,10 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
break;
case Hexagon::L4_return:
-
case Hexagon::L2_deallocframe:
-
return HexagonII::HSIG_L2;
- case Hexagon::EH_RETURN_JMPR:
+ case Hexagon::EH_RETURN_JMPR:
case Hexagon::J2_jumpr:
case Hexagon::PS_jmpret:
// jumpr r31
@@ -789,12 +787,12 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 2);
break; // 1,3 SUBInst $Rdd = combine(#2, #$u2)
}
+ break;
case Hexagon::A4_combineir:
Result.setOpcode(Hexagon::SA1_combinezr);
addOps(Result, Inst, 0);
addOps(Result, Inst, 2);
break; // 1,3 SUBInst $Rdd = combine(#0, $Rs)
-
case Hexagon::A4_combineri:
Result.setOpcode(Hexagon::SA1_combinerz);
addOps(Result, Inst, 0);
@@ -901,6 +899,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 1);
break; // 2 1,2 SUBInst memb($Rs + #$u4_0)=#1
}
+ break;
case Hexagon::S2_storerb_io:
Result.setOpcode(Hexagon::SS1_storeb_io);
addOps(Result, Inst, 0);
@@ -937,6 +936,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 2);
break; // 1 2,3 SUBInst memw(r29 + #$u5_2) = $Rt
}
+ break;
case Hexagon::S2_storeri_io:
if (Inst.getOperand(0).getReg() == Hexagon::R29) {
Result.setOpcode(Hexagon::SS2_storew_sp);
@@ -1045,8 +1045,8 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
bool bisReversable = true;
if (isStoreInst(MCB.getOperand(j).getInst()->getOpcode()) &&
isStoreInst(MCB.getOperand(k).getInst()->getOpcode())) {
- DEBUG(dbgs() << "skip out of order write pair: " << k << "," << j
- << "\n");
+ LLVM_DEBUG(dbgs() << "skip out of order write pair: " << k << "," << j
+ << "\n");
bisReversable = false;
}
if (HexagonMCInstrInfo::isMemReorderDisabled(MCB)) // }:mem_noshuf
@@ -1066,14 +1066,14 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
// Save off pairs for duplex checking.
duplexToTry.push_back(DuplexCandidate(j, k, iClass));
- DEBUG(dbgs() << "adding pair: " << j << "," << k << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "adding pair: " << j << "," << k << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
continue;
} else {
- DEBUG(dbgs() << "skipping pair: " << j << "," << k << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs() << "skipping pair: " << j << "," << k << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
}
// Try reverse.
@@ -1091,13 +1091,15 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
// Save off pairs for duplex checking.
duplexToTry.push_back(DuplexCandidate(k, j, iClass));
- DEBUG(dbgs() << "adding pair:" << k << "," << j << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "adding pair:" << k << "," << j << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
} else {
- DEBUG(dbgs() << "skipping pair: " << k << "," << j << ":"
- << MCB.getOperand(j).getInst()->getOpcode() << ","
- << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "skipping pair: " << k << "," << j << ":"
+ << MCB.getOperand(j).getInst()->getOpcode() << ","
+ << MCB.getOperand(k).getInst()->getOpcode() << "\n");
}
}
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 691e269cb91f..f304bc50530f 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -47,15 +48,15 @@ static cl::opt<unsigned> GPSize
HexagonMCELFStreamer::HexagonMCELFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
MCII(createHexagonMCInstrInfo()) {}
HexagonMCELFStreamer::HexagonMCELFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler)
- : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)),
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
MCII(createHexagonMCInstrInfo()) {}
void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
@@ -63,21 +64,6 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
assert(MCB.getOpcode() == Hexagon::BUNDLE);
assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE);
assert(HexagonMCInstrInfo::bundleSize(MCB) > 0);
- bool Extended = false;
- for (auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- MCInst *MCI = const_cast<MCInst *>(I.getInst());
- if (Extended) {
- if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
- MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst());
- HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *SubInst);
- } else {
- HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *MCI);
- }
- Extended = false;
- } else {
- Extended = HexagonMCInstrInfo::isImmext(*MCI);
- }
- }
// At this point, MCB is a bundle
// Iterate through the bundle and assign addends for the instructions
@@ -124,7 +110,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
MCSectionSubPair P = getCurrentSection();
SwitchSection(&Section);
- if (ELFSymbol->isUndefined(false)) {
+ if (ELFSymbol->isUndefined()) {
EmitValueToAlignment(ByteAlignment, 0, 1, 0);
EmitLabel(Symbol);
EmitZeros(Size);
@@ -166,9 +152,10 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
namespace llvm {
MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE) {
- return new HexagonMCELFStreamer(Context, std::move(MAB), OS, std::move(CE));
+ return new HexagonMCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(CE));
}
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
index c6fa0021d86b..c02bef8f06f7 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -23,11 +23,11 @@ class HexagonMCELFStreamer : public MCELFStreamer {
public:
HexagonMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter);
HexagonMCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler);
@@ -43,7 +43,7 @@ public:
MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> CE);
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index 19308cd425e8..a11aa92ccbe1 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -158,23 +158,6 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
return true;
}
-void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII,
- MCContext &Context, MCInst &MCI) {
- assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
- HexagonMCInstrInfo::isExtended(MCII, MCI));
- MCOperand &exOp =
- MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
- // If the extended value is a constant, then use it for the extended and
- // for the extender instructions, masking off the lower 6 bits and
- // including the assumed bits.
- int64_t Value;
- if (exOp.getExpr()->evaluateAsAbsolute(Value)) {
- unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI);
- exOp.setExpr(HexagonMCExpr::create(
- MCConstantExpr::create((Value & 0x3f) << Shift, Context), Context));
- }
-}
-
MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
MCInst const &Inst,
MCOperand const &MO) {
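
The clampExtended() helper removed above masked a constant-extended operand down to its low 6 bits, shifted by the operand's alignment; the same masking now happens directly in the code emitter via the (Value & 0x3f) << Shift path shown earlier. A small standalone sketch with a worked value, illustrative names only:

// Standalone sketch of extended-operand clamping: the extender word carries
// the upper bits, so the extended instruction keeps only the low 6 bits of
// the value, re-aligned by the operand's alignment shift.
#include <cstdint>
#include <cstdio>

int64_t clampExtendedValue(int64_t Value, unsigned AlignShift) {
  return (Value & 0x3f) << AlignShift;   // low 6 bits, shifted back into place
}

int main() {
  // E.g. an immediate of 0x12345 with a 2-bit alignment keeps 0x05 << 2 = 0x14.
  std::printf("0x%llx\n",
              (unsigned long long)clampExtendedValue(0x12345, 2));
}
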
@@ -330,16 +313,19 @@ unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
}
+bool HexagonMCInstrInfo::isExtentSigned(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+}
+
/// Return the maximum value of an extendable operand.
int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
-
assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
HexagonMCInstrInfo::isExtended(MCII, MCI));
- if (S) // if value is signed
+ if (HexagonMCInstrInfo::isExtentSigned(MCII, MCI)) // if value is signed
return (1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1)) - 1;
return (1 << HexagonMCInstrInfo::getExtentBits(MCII, MCI)) - 1;
}
@@ -347,13 +333,10 @@ int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
/// Return the minimum value of an extendable operand.
int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
-
assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
HexagonMCInstrInfo::isExtended(MCII, MCI));
- if (S) // if value is signed
+ if (HexagonMCInstrInfo::isExtentSigned(MCII, MCI)) // if value is signed
return -(1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1));
return 0;
}
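
getMaxValue/getMinValue above now route through isExtentSigned(): a signed N-bit extendable operand spans [-(2^(N-1)), 2^(N-1) - 1], while an unsigned one spans [0, 2^N - 1]. A minimal standalone sketch of that range computation, illustrative names only:

// Standalone sketch of the extendable-operand range computation.
#include <cstdio>

int extentMax(unsigned Bits, bool Signed) {
  return Signed ? (1 << (Bits - 1)) - 1 : (1 << Bits) - 1;
}

int extentMin(unsigned Bits, bool Signed) {
  return Signed ? -(1 << (Bits - 1)) : 0;
}

int main() {
  std::printf("s11: [%d, %d]\n", extentMin(11, true), extentMax(11, true));  // [-1024, 1023]
  std::printf("u6:  [%d, %d]\n", extentMin(6, false), extentMax(6, false));  // [0, 63]
}
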
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index 28d89429266b..d040bea23b6d 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -103,9 +103,6 @@ MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
// Convert this instruction in to a duplex subinst
MCInst deriveSubInst(MCInst const &Inst);
-// Clamp off upper 26 bits of extendable operand for emission
-void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
-
// Return the extender for instruction at Index or nullptr if none
MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
@@ -143,6 +140,9 @@ unsigned getExtentAlignment(MCInstrInfo const &MCII, MCInst const &MCI);
// Return the number of logical bits of the extendable operand
unsigned getExtentBits(MCInstrInfo const &MCII, MCInst const &MCI);
+// Check if the extendable operand is signed.
+bool isExtentSigned(MCInstrInfo const &MCII, MCInst const &MCI);
+
// Return the max value that a constant extendable operand can have
// without being extended.
int getMaxValue(MCInstrInfo const &MCII, MCInst const &MCI);
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 7bd54fdfa3d5..4281144acaee 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -38,7 +38,8 @@ void HexagonMCShuffler::init(MCInst &MCB) {
// Copy the bundle for the shuffling.
for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
MCInst &MI = *const_cast<MCInst *>(I.getInst());
- DEBUG(dbgs() << "Shuffling: " << MCII.getName(MI.getOpcode()) << '\n');
+ LLVM_DEBUG(dbgs() << "Shuffling: " << MCII.getName(MI.getOpcode())
+ << '\n');
assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo());
if (!HexagonMCInstrInfo::isImmext(MI)) {
@@ -98,7 +99,7 @@ bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
copyTo(MCB);
return true;
}
- DEBUG(MCB.dump());
+ LLVM_DEBUG(MCB.dump());
return false;
}
@@ -119,10 +120,10 @@ bool llvm::HexagonMCShuffle(MCContext &Context, bool Fatal,
// * %d7 = IMPLICIT_DEF; flags:
// After the IMPLICIT_DEFs were removed by the asm printer, the bundle
// became empty.
- DEBUG(dbgs() << "Skipping empty bundle");
+ LLVM_DEBUG(dbgs() << "Skipping empty bundle");
return false;
} else if (!HexagonMCInstrInfo::isBundle(MCB)) {
- DEBUG(dbgs() << "Skipping stand-alone insn");
+ LLVM_DEBUG(dbgs() << "Skipping stand-alone insn");
return false;
}
@@ -144,10 +145,10 @@ llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
// * %d7 = IMPLICIT_DEF; flags:
// After the IMPLICIT_DEFs were removed by the asm printer, the bundle
// became empty.
- DEBUG(dbgs() << "Skipping empty bundle");
+ LLVM_DEBUG(dbgs() << "Skipping empty bundle");
return false;
} else if (!HexagonMCInstrInfo::isBundle(MCB)) {
- DEBUG(dbgs() << "Skipping stand-alone insn");
+ LLVM_DEBUG(dbgs() << "Skipping stand-alone insn");
return false;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 3fbe2197f937..b211a81524fb 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -29,6 +29,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -147,7 +148,7 @@ public:
auto PacketBundle = Contents.rsplit('\n');
auto HeadTail = PacketBundle.first.split('\n');
StringRef Separator = "\n";
- StringRef Indent = "\t\t";
+ StringRef Indent = "\t";
OS << "\t{\n";
while (!HeadTail.first.empty()) {
StringRef InstTxt;
@@ -164,7 +165,7 @@ public:
}
if (HexagonMCInstrInfo::isMemReorderDisabled(Inst))
- OS << "\n\t}:mem_noshuf" << PacketBundle.second;
+ OS << "\n\t} :mem_noshuf" << PacketBundle.second;
else
OS << "\t}" << PacketBundle.second;
}
@@ -248,10 +249,10 @@ createMCAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
- return createHexagonELFStreamer(T, Context, std::move(MAB), OS,
+ return createHexagonELFStreamer(T, Context, std::move(MAB), std::move(OW),
std::move(Emitter));
}
@@ -308,6 +309,7 @@ static bool isCPUValid(std::string CPU)
{
std::vector<std::string> table
{
+ "generic",
"hexagonv4",
"hexagonv5",
"hexagonv55",
@@ -342,8 +344,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
break;
}
bool UseHvx = false;
- for (unsigned F : {ExtensionHVX, ExtensionHVX64B, ExtensionHVX128B,
- ExtensionHVXDbl}) {
+ for (unsigned F : {ExtensionHVX, ExtensionHVX64B, ExtensionHVX128B}) {
if (!FB.test(F))
continue;
UseHvx = true;
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 71545a5c02c9..6cd1b3a4691f 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -27,7 +27,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -65,9 +65,8 @@ MCAsmBackend *createHexagonAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- StringRef CPU);
+std::unique_ptr<MCObjectTargetWriter>
+createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU);
unsigned HexagonGetLastSlot();
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 7709a0f61624..59f3caa6af94 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -641,14 +641,14 @@ bool HexagonShuffler::shuffle() {
}
for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
- DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
+ LLVM_DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
dbgs() << '/';
dbgs().write_hex(ISJ->CVI.getUnits()) << '|';
dbgs() << ISJ->CVI.getLanes();
} dbgs() << ':'
<< HexagonMCInstrInfo::getDesc(MCII, ISJ->getDesc()).getOpcode();
- dbgs() << '\n');
- DEBUG(dbgs() << '\n');
+ dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
return Ok;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
index f8c766ac972c..4339fa2089d9 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -103,7 +104,7 @@ bool CopyPropagation::run() {
if (trace()) {
dbgs() << "Copies:\n";
- for (auto I : Copies) {
+ for (NodeId I : Copies) {
dbgs() << "Instr: " << *DFG.addr<StmtNode*>(I).Addr->getCode();
dbgs() << " eq: {";
for (auto J : CopyMap[I])
@@ -130,7 +131,7 @@ bool CopyPropagation::run() {
return 0;
};
- for (auto C : Copies) {
+ for (NodeId C : Copies) {
#ifndef NDEBUG
if (HasLimit && CpCount >= CpLimit)
break;
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
index 240d7c355bc7..da339bfd3ff4 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -214,7 +214,7 @@ bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) {
return false;
return A.Id < B.Id;
};
- std::sort(DRNs.begin(), DRNs.end(), UsesFirst);
+ llvm::sort(DRNs.begin(), DRNs.end(), UsesFirst);
if (trace())
dbgs() << "Removing dead ref nodes:\n";
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
index d1f6e5a4c8ef..3d1ec31dada7 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
@@ -893,7 +893,7 @@ void DataFlowGraph::build(unsigned Options) {
NodeAddr<BlockNode*> BA = newBlock(Func, &B);
BlockNodes.insert(std::make_pair(&B, BA));
for (MachineInstr &I : B) {
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
buildStmt(BA, I);
}
@@ -1471,7 +1471,7 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs,
// and add a def for each S in the closure.
// Sort the refs so that the phis will be created in a deterministic order.
- std::sort(MaxRefs.begin(), MaxRefs.end());
+ llvm::sort(MaxRefs.begin(), MaxRefs.end());
// Remove duplicates.
auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
MaxRefs.erase(NewEnd, MaxRefs.end());
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
index 13d9a1741978..c257d754ddf9 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
@@ -207,7 +207,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
};
std::vector<NodeId> Tmp(Owners.begin(), Owners.end());
- std::sort(Tmp.begin(), Tmp.end(), Less);
+ llvm::sort(Tmp.begin(), Tmp.end(), Less);
// The vector is a list of instructions, so that defs coming from
// the same instruction don't need to be artificially ordered.
@@ -628,7 +628,7 @@ void Liveness::computePhiInfo() {
// Collect the set PropUp of uses that are reached by the current
// phi PA, and are not covered by any intervening def between the
- // currently visited use UA and the the upward phi P.
+ // currently visited use UA and the upward phi P.
if (MidDefs.hasCoverOf(UR))
continue;
@@ -813,7 +813,7 @@ void Liveness::computeLiveIns() {
std::vector<RegisterRef> LV;
for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
LV.push_back(RegisterRef(I->PhysReg, I->LaneMask));
- std::sort(LV.begin(), LV.end());
+ llvm::sort(LV.begin(), LV.end());
dbgs() << printMBBReference(B) << "\t rec = {";
for (auto I : LV)
dbgs() << ' ' << Print<RegisterRef>(I, DFG);
@@ -824,7 +824,7 @@ void Liveness::computeLiveIns() {
const RegisterAggr &LG = LiveMap[&B];
for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I)
LV.push_back(*I);
- std::sort(LV.begin(), LV.end());
+ llvm::sort(LV.begin(), LV.end());
dbgs() << "\tcomp = {";
for (auto I : LV)
dbgs() << ' ' << Print<RegisterRef>(I, DFG);
@@ -880,7 +880,7 @@ void Liveness::resetKills(MachineBasicBlock *B) {
for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
MachineInstr *MI = &*I;
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
continue;
MI->clearKillInfo();
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
index 8cfb6a1e9554..eaeb4ea115b3 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
@@ -53,8 +53,8 @@ namespace rdf {
using RefMap = std::map<RegisterId, NodeRefSet>;
Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
- : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
- MDF(g.getDF()), LiveMap(g.getPRI()), NoRegs(g.getPRI()) {}
+ : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
+ MDF(g.getDF()), LiveMap(g.getPRI()), Empty(), NoRegs(g.getPRI()) {}
NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
diff --git a/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
index a330f27ed300..78e2f2b2ddb3 100644
--- a/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -18,6 +18,6 @@ Target &llvm::getTheHexagonTarget() {
}
extern "C" void LLVMInitializeHexagonTargetInfo() {
- RegisterTarget<Triple::hexagon, /*HasJIT=*/false> X(
+ RegisterTarget<Triple::hexagon, /*HasJIT=*/true> X(
getTheHexagonTarget(), "hexagon", "Hexagon", "Hexagon");
}
diff --git a/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 3f24c3ef3902..a77b2b8f15ca 100644
--- a/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -475,8 +475,8 @@ public:
} else if (isa<MCBinaryExpr>(getImm())) {
#ifndef NDEBUG
const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
- assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
- dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
LanaiMCExpr::VK_Lanai_ABS_LO);
#endif
Inst.addOperand(MCOperand::createExpr(getImm()));
@@ -505,8 +505,8 @@ public:
} else if (isa<MCBinaryExpr>(getImm())) {
#ifndef NDEBUG
const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
- assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
- dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
LanaiMCExpr::VK_Lanai_ABS_HI);
#endif
Inst.addOperand(MCOperand::createExpr(getImm()));
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
index 6b4fa7771783..ea76a1128373 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -156,7 +156,7 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB,
for (MachineBasicBlock::reverse_instr_iterator I = ++Slot.getReverse();
I != MBB.instr_rend(); ++I) {
// skip debug value
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Convert to forward iterator.
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
index ed0c99a76ce4..5081cfbe4922 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
@@ -273,12 +273,9 @@ bool LanaiDAGToDAGISel::SelectInlineAsmMemoryOperand(
void LanaiDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
return;
}
@@ -319,7 +316,7 @@ void LanaiDAGToDAGISel::Select(SDNode *Node) {
void LanaiDAGToDAGISel::selectFrameIndex(SDNode *Node) {
SDLoc DL(Node);
SDValue Imm = CurDAG->getTargetConstant(0, DL, MVT::i32);
- int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
EVT VT = Node->getValueType(0);
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
unsigned Opc = Lanai::ADD_I_LO;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 17567436384e..045a897c4126 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -44,6 +43,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -87,7 +87,6 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
- setOperationAction(ISD::SETCCE, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -193,8 +192,6 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op,
return LowerSELECT_CC(Op, DAG);
case ISD::SETCC:
return LowerSETCC(Op, DAG);
- case ISD::SETCCE:
- return LowerSETCCE(Op, DAG);
case ISD::SHL_PARTS:
return LowerSHL_PARTS(Op, DAG);
case ISD::SRL_PARTS:
@@ -484,8 +481,8 @@ SDValue LanaiTargetLowering::LowerCCCArguments(
break;
}
default:
- DEBUG(dbgs() << "LowerFormalArguments Unhandled argument type: "
- << RegVT.getEVTString() << "\n");
+ LLVM_DEBUG(dbgs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getEVTString() << "\n");
llvm_unreachable("unhandled argument type");
}
} else {
@@ -969,19 +966,6 @@ SDValue LanaiTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
return Res;
}
-SDValue LanaiTargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue Carry = Op.getOperand(2);
- SDValue Cond = Op.getOperand(3);
- SDLoc DL(Op);
-
- LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
- SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
- SDValue Flag = DAG.getNode(LanaiISD::SUBBF, DL, MVT::Glue, LHS, RHS, Carry);
- return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Flag);
-}
-
SDValue LanaiTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
index 46024e6fd508..0cde633cb41a 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
@@ -87,7 +87,6 @@ public:
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td b/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td
index 30289ea4ac0b..1bb6b3d26a49 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td
@@ -482,7 +482,7 @@ class InstSLI<dag outs, dag ins, string asmstr, list<dag> pattern>
// Memory(ea) <- (least significant half-word of Rr)
// If `YS' = 10 (bYte load): Rr <- Memory(ea)
// If `YS' = 00 (halfword load): Rr <- Memory(ea)
-// [Note: here ea is determined as in the the RM instruction. ]
+// [Note: here ea is determined as in the RM instruction. ]
// If `SE' = 01 then the value is zEro extended
// before being loaded into Rd.
// If `SE' = 00 then the value is sign extended
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index a7c9a7a7f280..493d02bef37c 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -573,8 +573,8 @@ bool LanaiInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
while (Instruction != MBB.begin()) {
--Instruction;
- // Skip over debug values.
- if (Instruction->isDebugValue())
+ // Skip over debug instructions.
+ if (Instruction->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator
@@ -699,7 +699,7 @@ unsigned LanaiInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (Instruction != MBB.begin()) {
--Instruction;
- if (Instruction->isDebugValue())
+ if (Instruction->isDebugInstr())
continue;
if (Instruction->getOpcode() != Lanai::BT &&
Instruction->getOpcode() != Lanai::BRCC) {
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h
index f07fede67a41..fe22fde2470b 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
#include "LanaiRegisterInfo.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
index 776fee101dfe..66192b4a4704 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
@@ -842,6 +842,10 @@ def : Pat<(Call texternalsym:$dst), (CALL texternalsym:$dst)>;
// Loads
def : Pat<(extloadi8 ADDRspls:$src), (i32 (LDBz_RI ADDRspls:$src))>;
def : Pat<(extloadi16 ADDRspls:$src), (i32 (LDHz_RI ADDRspls:$src))>;
+// Loads up to 32-bits are already atomic.
+// TODO: This is a workaround for a particular failing case and should be
+// handled more generally.
+def : Pat<(atomic_load_8 ADDRspls:$src), (i32 (LDBz_RI ADDRspls:$src))>;
// GlobalAddress, ExternalSymbol, Jumptable, ConstantPool
def : Pat<(LanaiHi tglobaladdr:$dst), (MOVHI tglobaladdr:$dst)>;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
index c29c933db747..35e2542dfb13 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -343,7 +343,7 @@ MbbIterator LanaiMemAluCombiner::findClosestSuitableAluInstr(
break;
// Skip over debug instructions
- if (First->isDebugValue())
+ if (First->isDebugInstr())
continue;
if (isSuitableAluInstr(IsSpls, First, *Base, *Offset)) {
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
index 38e75108ba16..7d165e9c5f8c 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -50,8 +50,7 @@ static bool isInSmallSection(uint64_t Size) {
// section.
bool LanaiTargetObjectFile::isGlobalInSmallSection(
const GlobalObject *GO, const TargetMachine &TM) const {
- if (GO == nullptr)
- return false;
+ if (GO == nullptr) return TM.getCodeModel() == CodeModel::Small;
// We first check the case where global is a declaration, because finding
// section kind using getKindForGlobal() is only allowed for global
@@ -67,8 +66,7 @@ bool LanaiTargetObjectFile::isGlobalInSmallSection(
bool LanaiTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
const TargetMachine &TM,
SectionKind Kind) const {
- return (isGlobalInSmallSectionImpl(GO, TM) &&
- (Kind.isData() || Kind.isBSS() || Kind.isCommon()));
+ return isGlobalInSmallSectionImpl(GO, TM);
}
// Return true if this global address should be placed into small data/bss
@@ -76,10 +74,10 @@ bool LanaiTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
// kind.
bool LanaiTargetObjectFile::isGlobalInSmallSectionImpl(
const GlobalObject *GO, const TargetMachine &TM) const {
- // Only global variables, not functions.
const auto *GVA = dyn_cast<GlobalVariable>(GO);
- if (!GVA)
- return false;
+
+ // If not a GlobalVariable, only consider the code model.
+ if (!GVA) return TM.getCodeModel() == CodeModel::Small;
// Global values placed in sections starting with .ldata do not fit in
// 21-bits, so always use large memory access for them. FIXME: This is a
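[Editor's note, not part of the patch: the Lanai change above makes the small-section decision fall back to the code model when there is no GlobalVariable to inspect. A minimal stand-alone sketch of the resulting decision logic, using hypothetical placeholder types instead of the real LLVM classes:]

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical stand-ins for the LLVM types used above (illustration only).
enum class CodeModel { Small, Medium, Large };

struct GlobalVariable { uint64_t Size; };

// Mirrors the shape of the revised isGlobalInSmallSectionImpl(): without a
// GlobalVariable, only the code model decides; otherwise a size threshold
// (value here is an assumption) is applied.
bool isInSmallSection(const GlobalVariable *GV, CodeModel CM,
                      uint64_t SmallSectionThreshold = 8) {
  if (GV == nullptr)
    return CM == CodeModel::Small;
  return GV->Size > 0 && GV->Size <= SmallSectionThreshold;
}

int main() {
  GlobalVariable G{4};
  std::cout << isInSmallSection(&G, CodeModel::Large) << "\n";      // 1: fits threshold
  std::cout << isInSmallSection(nullptr, CodeModel::Small) << "\n"; // 1: code model only
  std::cout << isInSmallSection(nullptr, CodeModel::Large) << "\n"; // 0
}
```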
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index e3eaa4d30a90..82fa93ea5e5e 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -47,14 +47,15 @@ class LanaiAsmBackend : public MCAsmBackend {
public:
LanaiAsmBackend(const Target &T, Triple::OSType OST)
- : MCAsmBackend(), OSType(OST) {}
+ : MCAsmBackend(support::big), OSType(OST) {}
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
// No instruction requires relaxation
bool fixupNeedsRelaxation(const MCFixup & /*Fixup*/, uint64_t /*Value*/,
@@ -69,7 +70,8 @@ public:
return Lanai::NumTargetFixupKinds;
}
- bool mayNeedRelaxation(const MCInst & /*Inst*/) const override {
+ bool mayNeedRelaxation(const MCInst & /*Inst*/,
+ const MCSubtargetInfo &STI) const override {
return false;
}
@@ -77,15 +79,15 @@ public:
const MCSubtargetInfo & /*STI*/,
MCInst & /*Res*/) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
-bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
if ((Count % 4) != 0)
return false;
for (uint64_t i = 0; i < Count; i += 4)
- OW->write32(0x15000000);
+ OS.write("\x15\0\0\0", 4);
return true;
}
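[Editor's note, not part of the patch: the backend now declares itself big-endian (support::big) and writes padding straight to a raw_ostream, so the 0x15000000 NOP word is emitted byte-by-byte in big-endian order. A self-contained sketch of that padding loop, assuming the 0x15000000 encoding shown above:]

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>

// Emit Count bytes of NOP padding, four bytes at a time, in big-endian order.
// Mirrors LanaiAsmBackend::writeNopData(); returns false if Count is not a
// multiple of the 4-byte instruction size.
bool writeNopData(std::ostream &OS, uint64_t Count) {
  if ((Count % 4) != 0)
    return false;
  for (uint64_t i = 0; i < Count; i += 4)
    OS.write("\x15\0\0\0", 4); // 0x15000000 laid out as big-endian bytes
  return true;
}

int main() {
  std::ostringstream OS;
  bool OK = writeNopData(OS, 8);
  std::cout << OK << " " << OS.str().size() << "\n"; // 1 8
}
```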
@@ -93,7 +95,8 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool /*IsResolved*/) const {
+ bool /*IsResolved*/,
+ const MCSubtargetInfo * /*STI*/) const {
MCFixupKind Kind = Fixup.getKind();
Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
@@ -127,10 +130,9 @@ void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
}
}
-std::unique_ptr<MCObjectWriter>
-LanaiAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createLanaiELFObjectWriter(OS,
- MCELFObjectTargetWriter::getOSABI(OSType));
+std::unique_ptr<MCObjectTargetWriter>
+LanaiAsmBackend::createObjectTargetWriter() const {
+ return createLanaiELFObjectWriter(MCELFObjectTargetWriter::getOSABI(OSType));
}
const MCFixupKindInfo &
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
index 3c40176d2f60..7676891ef981 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -87,8 +87,7 @@ bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createLanaiELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- return createELFObjectWriter(llvm::make_unique<LanaiELFObjectWriter>(OSABI),
- OS, /*IsLittleEndian=*/false);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createLanaiELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<LanaiELFObjectWriter>(OSABI);
}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index c3727416ecb9..21f4005aaf83 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "Lanai.h"
-#include "LanaiAluCode.h"
#include "MCTargetDesc/LanaiBaseInfo.h"
#include "MCTargetDesc/LanaiFixupKinds.h"
#include "MCTargetDesc/LanaiMCExpr.h"
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index 74d929450ed2..ddb01cdd2d8f 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -61,14 +61,14 @@ createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
if (!T.isOSBinFormatELF())
llvm_unreachable("OS not supported");
- return createELFStreamer(Context, std::move(MAB), OS, std::move(Emitter),
- RelaxAll);
+ return createELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
}
static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/,
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
index ddb4e9b0d728..2d8828ea4fa9 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCInstrAnalysis;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRelocationInfo;
class MCSubtargetInfo;
class Target;
@@ -42,8 +42,7 @@ MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createLanaiELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createLanaiELFObjectWriter(uint8_t OSABI);
} // namespace llvm
// Defines symbolic names for Lanai registers. This defines a mapping from
diff --git a/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
index 5eed0cb28361..ccf47b08fcff 100644
--- a/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
+++ b/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#include "Lanai.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/TargetRegistry.h"
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
index 87c320aa76aa..2b3495405545 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -138,15 +138,15 @@ bool MSP430BSel::expandBranches(OffsetVector &BlockOffsets) {
continue;
}
- DEBUG(dbgs() << " Found a branch that needs expanding, "
- << printMBBReference(*DestBB) << ", Distance "
- << BranchDistance << "\n");
+ LLVM_DEBUG(dbgs() << " Found a branch that needs expanding, "
+ << printMBBReference(*DestBB) << ", Distance "
+ << BranchDistance << "\n");
// If JCC is not the last instruction we need to split the MBB.
if (MI->getOpcode() == MSP430::JCC && std::next(MI) != EE) {
- DEBUG(dbgs() << " Found a basic block that needs to be split, "
- << printMBBReference(*MBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Found a basic block that needs to be split, "
+ << printMBBReference(*MBB) << "\n");
// Create a new basic block.
MachineBasicBlock *NewBB =
@@ -229,7 +229,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &mf) {
if (!BranchSelectEnabled)
return false;
- DEBUG(dbgs() << "\n********** " << getPassName() << " **********\n");
+ LLVM_DEBUG(dbgs() << "\n********** " << getPassName() << " **********\n");
// BlockOffsets - Contains the distance from the beginning of the function to
// the beginning of each basic block.
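[Editor's note, not part of the patch: the DEBUG macro was renamed to LLVM_DEBUG throughout this commit; it still compiles away in release builds and, in assertion builds, is gated on the pass's DEBUG_TYPE and the -debug/-debug-only options. A minimal imitation of the pattern — not the real macro, just its shape, with all names hypothetical:]

```cpp
#include <iostream>

// Hypothetical stand-in for LLVM's LLVM_DEBUG/DEBUG_TYPE machinery: the wrapped
// statement only runs when debug output is enabled, so arbitrary streaming
// statements cost nothing in normal builds.
static bool DebugFlag = true; // LLVM sets this from -debug / -debug-only=<type>

#define MY_DEBUG_TYPE "branch-select"
#define MY_LLVM_DEBUG(X)                                                       \
  do {                                                                         \
    if (DebugFlag) {                                                           \
      X;                                                                       \
    }                                                                          \
  } while (false)

int main() {
  int BranchDistance = 1024;
  MY_LLVM_DEBUG(std::cerr << "[" << MY_DEBUG_TYPE << "] distance "
                          << BranchDistance << "\n");
}
```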
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index bebf7478bccf..005f5f44a635 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -179,7 +180,7 @@ bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM)
}
bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
- DEBUG(errs() << "MatchAddress: "; AM.dump());
+ LLVM_DEBUG(errs() << "MatchAddress: "; AM.dump());
switch (N.getOpcode()) {
default: break;
@@ -381,16 +382,9 @@ bool MSP430DAGToDAGISel::tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
void MSP430DAGToDAGISel::Select(SDNode *Node) {
SDLoc dl(Node);
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: ");
- DEBUG(Node->dump(CurDAG));
- DEBUG(errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== ";
- Node->dump(CurDAG);
- errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
index 6135ce080920..dd1b30a3e470 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -113,7 +113,7 @@ unsigned MSP430InstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != MSP430::JMP &&
I->getOpcode() != MSP430::JCC &&
@@ -183,7 +183,7 @@ bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator
diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index f36a4317b1b9..2acf701b43cb 100644
--- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -146,10 +146,15 @@ class MipsAsmParser : public MCTargetAsmParser {
/// If true, then CpSaveLocation is a register, otherwise it's an offset.
bool CpSaveLocationIsRegister;
+ // Map of register aliases created via the .set directive.
+ StringMap<AsmToken> RegisterSets;
+
// Print a warning along with its fix-it message at the given range.
void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
SMRange Range, bool ShowColors = true);
+ void ConvertXWPOperands(MCInst &Inst, const OperandVector &Operands);
+
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
@@ -182,12 +187,14 @@ class MipsAsmParser : public MCTargetAsmParser {
matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
StringRef Identifier, SMLoc S);
OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
+ const AsmToken &Token,
+ SMLoc S);
+ OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
SMLoc S);
OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
OperandMatchResultTy parseImm(OperandVector &Operands);
OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
OperandMatchResultTy parseInvNum(OperandVector &Operands);
- OperandMatchResultTy parseRegisterPair(OperandVector &Operands);
OperandMatchResultTy parseMovePRegPair(OperandVector &Operands);
OperandMatchResultTy parseRegisterList(OperandVector &Operands);
@@ -235,13 +242,7 @@ class MipsAsmParser : public MCTargetAsmParser {
const MCSubtargetInfo *STI);
void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsLoad, bool IsImmOpnd);
-
- void expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsImmOpnd);
-
- void expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsImmOpnd);
+ const MCSubtargetInfo *STI, bool IsLoad);
bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
@@ -255,9 +256,9 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
- bool expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, const bool IsMips64,
- const bool Signed);
+ bool expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed);
bool expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
@@ -348,10 +349,12 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetHardFloatDirective();
bool parseSetMtDirective();
bool parseSetNoMtDirective();
+ bool parseSetNoCRCDirective();
+ bool parseSetNoVirtDirective();
+ bool parseSetNoGINVDirective();
bool parseSetAssignment();
- bool parseDataDirective(unsigned Size, SMLoc L);
bool parseDirectiveGpWord();
bool parseDirectiveGpDWord();
bool parseDirectiveDtpRelWord();
@@ -466,6 +469,7 @@ public:
Match_RequiresSameSrcAndDst,
Match_NoFCCRegisterForCurrentISA,
Match_NonZeroOperandForSync,
+ Match_NonZeroOperandForMTCX,
Match_RequiresPosSizeRange0_32,
Match_RequiresPosSizeRange33_64,
Match_RequiresPosSizeUImm6,
@@ -482,6 +486,9 @@ public:
MCAsmParserExtension::Initialize(parser);
parser.addAliasForDirective(".asciiz", ".asciz");
+ parser.addAliasForDirective(".hword", ".2byte");
+ parser.addAliasForDirective(".word", ".4byte");
+ parser.addAliasForDirective(".dword", ".8byte");
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
@@ -507,14 +514,13 @@ public:
CpRestoreOffset = -1;
const Triple &TheTriple = sti.getTargetTriple();
- if ((TheTriple.getArch() == Triple::mips) ||
- (TheTriple.getArch() == Triple::mips64))
- IsLittleEndian = false;
- else
- IsLittleEndian = true;
+ IsLittleEndian = TheTriple.isLittleEndian();
if (getSTI().getCPU() == "mips64r6" && inMicroMipsMode())
report_fatal_error("microMIPS64R6 is not supported", false);
+
+ if (!isABI_O32() && inMicroMipsMode())
+ report_fatal_error("microMIPS64 is not supported", false);
}
/// True if all of $fcc0 - $fcc7 exist for the current ISA.
@@ -643,6 +649,18 @@ public:
return getSTI().getFeatureBits()[Mips::FeatureMT];
}
+ bool hasCRC() const {
+ return getSTI().getFeatureBits()[Mips::FeatureCRC];
+ }
+
+ bool hasVirt() const {
+ return getSTI().getFeatureBits()[Mips::FeatureVirt];
+ }
+
+ bool hasGINV() const {
+ return getSTI().getFeatureBits()[Mips::FeatureGINV];
+ }
+
/// Warn if RegIndex is the same as the current AT.
void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc);
@@ -1297,6 +1315,20 @@ public:
return IsReloc && isShiftedInt<Bits, ShiftAmount>(Res.getConstant());
}
+ bool isMemWithPtrSizeOffset() const {
+ if (!isMem())
+ return false;
+ if (!getMemBase()->isGPRAsmReg())
+ return false;
+ const unsigned PtrBits = AsmParser.getABI().ArePtrs64bit() ? 64 : 32;
+ if (isa<MCTargetExpr>(getMemOff()) ||
+ (isConstantMemOff() && isIntN(PtrBits, getConstantMemOff())))
+ return true;
+ MCValue Res;
+ bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
+ return IsReloc && isIntN(PtrBits, Res.getConstant());
+ }
+
bool isMemWithGRPMM16Base() const {
return isMem() && getMemBase()->isMM16AsmReg();
}
@@ -1326,9 +1358,11 @@ public:
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledSImm() const {
- if (isConstantImm() && isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
+ if (isConstantImm() &&
+ isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
return true;
- // Operand can also be a symbol or symbol plus offset in case of relocations.
+ // Operand can also be a symbol or symbol plus
+ // offset in case of relocations.
if (Kind != k_Immediate)
return false;
MCValue Res;
@@ -1405,10 +1439,6 @@ public:
return StringRef(Tok.Data, Tok.Length);
}
- bool isRegPair() const {
- return Kind == k_RegPair && RegIdx.Index <= 30;
- }
-
unsigned getReg() const override {
// As a special case until we sort out the definition of div/divu, accept
// $0/$zero here so that MCK_ZERO works correctly.
@@ -1471,7 +1501,7 @@ public:
static std::unique_ptr<MipsOperand>
createNumericReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
- DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
+ LLVM_DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
return CreateReg(Index, Str, RegKind_Numeric, RegInfo, S, E, Parser);
}
@@ -2034,7 +2064,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
// FIXME: Add support for forward-declared local symbols.
// FIXME: Add expansion for when the LargeGOT option is enabled.
if (JalSym->isInSection() || JalSym->isTemporary() ||
- (JalSym->isELF() && cast<MCSymbolELF>(JalSym)->getBinding() == ELF::STB_LOCAL)) {
+ (JalSym->isELF() &&
+ cast<MCSymbolELF>(JalSym)->getBinding() == ELF::STB_LOCAL)) {
if (isABI_O32()) {
// If it's a local symbol and the O32 ABI is being used, we expand to:
// lw $25, 0($gp)
@@ -2102,10 +2133,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
(OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
MCOperand &Op = Inst.getOperand(i);
if (Op.isImm()) {
- int MemOffset = Op.getImm();
+ int64_t MemOffset = Op.getImm();
if (MemOffset < -32768 || MemOffset > 32767) {
// Offset can't exceed 16bit value.
- expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), true);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
} else if (Op.isExpr()) {
@@ -2115,11 +2146,11 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
static_cast<const MCSymbolRefExpr *>(Expr);
if (SR->getKind() == MCSymbolRefExpr::VK_None) {
// Expand symbol.
- expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
} else if (!isEvaluated(Expr)) {
- expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
}
@@ -2128,7 +2159,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
} // if load/store
if (inMicroMipsMode()) {
- if (MCID.mayLoad()) {
+ if (MCID.mayLoad() && Inst.getOpcode() != Mips::LWP_MM) {
// Try to create 16-bit GP relative load instruction.
for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
const MCOperandInfo &OpInfo = MCID.OpInfo[i];
@@ -2245,13 +2276,18 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::ADDIUPC_MM:
- MCOperand Opnd = Inst.getOperand(1);
+ Opnd = Inst.getOperand(1);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
- int Imm = Opnd.getImm();
+ Imm = Opnd.getImm();
if ((Imm % 4 != 0) || !isInt<25>(Imm))
return Error(IDLoc, "immediate operand value out of range");
break;
+ case Mips::LWP_MM:
+ case Mips::SWP_MM:
+ if (Inst.getOperand(0).getReg() == Mips::RA)
+ return Error(IDLoc, "invalid operand for instruction");
+ break;
}
}
@@ -2392,20 +2428,28 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SDivMacro:
case Mips::SDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
- : MER_Success;
+ case Mips::SRemMacro:
+ case Mips::SRemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
+ : MER_Success;
case Mips::DSDivMacro:
case Mips::DSDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
- : MER_Success;
+ case Mips::DSRemMacro:
+ case Mips::DSRemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
+ : MER_Success;
case Mips::UDivMacro:
case Mips::UDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
- : MER_Success;
+ case Mips::URemMacro:
+ case Mips::URemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
+ : MER_Success;
case Mips::DUDivMacro:
case Mips::DUDivIMacro:
- return expandDiv(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
- : MER_Success;
+ case Mips::DURemMacro:
+ case Mips::DURemIMacro:
+ return expandDivRem(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
+ : MER_Success;
case Mips::PseudoTRUNC_W_S:
return expandTrunc(Inst, false, false, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
@@ -3522,21 +3566,17 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
}
void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsLoad,
- bool IsImmOpnd) {
- if (IsLoad) {
- expandLoadInst(Inst, IDLoc, Out, STI, IsImmOpnd);
- return;
- }
- expandStoreInst(Inst, IDLoc, Out, STI, IsImmOpnd);
-}
+ const MCSubtargetInfo *STI, bool IsLoad) {
+ const MCOperand &DstRegOp = Inst.getOperand(0);
+ assert(DstRegOp.isReg() && "expected register operand kind");
+ const MCOperand &BaseRegOp = Inst.getOperand(1);
+ assert(BaseRegOp.isReg() && "expected register operand kind");
+ const MCOperand &OffsetOp = Inst.getOperand(2);
-void MipsAsmParser::expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, bool IsImmOpnd) {
MipsTargetStreamer &TOut = getTargetStreamer();
-
- unsigned DstReg = Inst.getOperand(0).getReg();
- unsigned BaseReg = Inst.getOperand(1).getReg();
+ unsigned DstReg = DstRegOp.getReg();
+ unsigned BaseReg = BaseRegOp.getReg();
+ unsigned TmpReg = DstReg;
const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode());
int16_t DstRegClass = Desc.OpInfo[0].RegClass;
@@ -3545,75 +3585,51 @@ void MipsAsmParser::expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) ||
(DstRegClassID == Mips::GPR64RegClassID);
- if (IsImmOpnd) {
- // Try to use DstReg as the temporary.
- if (IsGPR && (BaseReg != DstReg)) {
- TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
- Inst.getOperand(2).getImm(), DstReg, IDLoc,
- STI);
- return;
- }
-
- // At this point we need AT to perform the expansions and we exit if it is
- // not available.
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
+ if (!IsLoad || !IsGPR || (BaseReg == DstReg)) {
+ // At this point we need AT to perform the expansions
+ // and we exit if it is not available.
+ TmpReg = getATReg(IDLoc);
+ if (!TmpReg)
return;
-
- TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
- Inst.getOperand(2).getImm(), ATReg, IDLoc, STI);
- return;
}
- const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
- MCOperand LoOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
- MCOperand HiOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+ if (OffsetOp.isImm()) {
+ int64_t LoOffset = OffsetOp.getImm() & 0xffff;
+ int64_t HiOffset = OffsetOp.getImm() & ~0xffff;
- // Try to use DstReg as the temporary.
- if (IsGPR && (BaseReg != DstReg)) {
- TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
- LoOperand, DstReg, IDLoc, STI);
- return;
- }
+ // If the msb of LoOffset is 1 (negative number) we must increment
+ // HiOffset to account for the sign-extension of the low part.
+ if (LoOffset & 0x8000)
+ HiOffset += 0x10000;
- // At this point we need AT to perform the expansions and we exit if it is
- // not available.
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return;
+ bool IsLargeOffset = HiOffset != 0;
- TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
- LoOperand, ATReg, IDLoc, STI);
-}
-
-void MipsAsmParser::expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI,
- bool IsImmOpnd) {
- MipsTargetStreamer &TOut = getTargetStreamer();
-
- unsigned SrcReg = Inst.getOperand(0).getReg();
- unsigned BaseReg = Inst.getOperand(1).getReg();
+ if (IsLargeOffset) {
+ bool Is32BitImm = (HiOffset >> 32) == 0;
+ if (loadImmediate(HiOffset, TmpReg, Mips::NoRegister, Is32BitImm, true,
+ IDLoc, Out, STI))
+ return;
+ }
- if (IsImmOpnd) {
- TOut.emitStoreWithImmOffset(Inst.getOpcode(), SrcReg, BaseReg,
- Inst.getOperand(2).getImm(),
- [&]() { return getATReg(IDLoc); }, IDLoc, STI);
- return;
+ if (BaseReg != Mips::ZERO && BaseReg != Mips::ZERO_64)
+ TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg,
+ BaseReg, IDLoc, STI);
+ TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI);
+ } else {
+ assert(OffsetOp.isExpr() && "expected expression operand kind");
+ const MCExpr *ExprOffset = OffsetOp.getExpr();
+ MCOperand LoOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+ MCOperand HiOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+
+ if (IsLoad)
+ TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+ LoOperand, TmpReg, IDLoc, STI);
+ else
+ TOut.emitStoreWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+ LoOperand, TmpReg, IDLoc, STI);
}
-
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return;
-
- const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
- MCOperand LoOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
- MCOperand HiOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
- TOut.emitStoreWithSymOffset(Inst.getOpcode(), SrcReg, BaseReg, HiOperand,
- LoOperand, ATReg, IDLoc, STI);
}
bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
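[Editor's note, not part of the patch: the immediate-offset path in expandMemInst() above splits a large offset into a high part (materialized into a temporary register) and a 16-bit low part (folded into the load/store). Because the hardware sign-extends the low immediate, a set bit 15 in the low half requires bumping the high half by 0x10000 so the two halves still add back to the original offset. A small arithmetic check of that fixup:]

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

// Split Offset so that Offset == Hi + signext16(Lo), matching the expansion
// in expandMemInst() above.
void splitOffset(int64_t Offset, int64_t &Hi, int64_t &Lo) {
  Lo = Offset & 0xffff;
  Hi = Offset & ~0xffff;
  if (Lo & 0x8000)  // low half becomes negative once sign-extended
    Hi += 0x10000;  // compensate in the high half
}

int main() {
  for (int64_t Offset : {0x12348000LL, -4LL, 0x7fff0000LL, 70000LL}) {
    int64_t Hi, Lo;
    splitOffset(Offset, Hi, Lo);
    int64_t SExtLo = static_cast<int16_t>(Lo); // what the hardware sees
    assert(Hi + SExtLo == Offset);
    std::cout << Offset << " = " << Hi << " + " << SExtLo << "\n";
  }
}
```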
@@ -3734,7 +3750,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BLTUL:
AcceptsEquality = false;
ReverseOrderSLT = false;
- IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL));
ZeroSrcOpcode = Mips::BGTZ;
ZeroTrgOpcode = Mips::BLTZ;
@@ -3745,7 +3762,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BLEUL:
AcceptsEquality = true;
ReverseOrderSLT = true;
- IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL));
ZeroSrcOpcode = Mips::BGEZ;
ZeroTrgOpcode = Mips::BLEZ;
@@ -3756,7 +3774,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BGEUL:
AcceptsEquality = true;
ReverseOrderSLT = false;
- IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL));
ZeroSrcOpcode = Mips::BLEZ;
ZeroTrgOpcode = Mips::BGEZ;
@@ -3767,7 +3786,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
case Mips::BGTUL:
AcceptsEquality = false;
ReverseOrderSLT = true;
- IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
+ IsUnsigned =
+ ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL));
ZeroSrcOpcode = Mips::BLTZ;
ZeroTrgOpcode = Mips::BGTZ;
@@ -3885,7 +3905,7 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// This is accomplished by using a BNEZ with the result of the SLT.
//
// The other 2 pseudo-branches are opposites of the above 2 (BGE with BLT
- // and BLE with BGT), so we change the BNEZ into a a BEQZ.
+ // and BLE with BGT), so we change the BNEZ into a BEQZ.
// Because only BGE and BLE branch on equality, we can use the
// AcceptsEquality variable to decide when to emit the BEQZ.
// Note that the order of the SLT arguments doesn't change between
@@ -3912,9 +3932,9 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// The destination register can only be $zero when expanding (S)DivIMacro or
// D(S)DivMacro.
-bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI, const bool IsMips64,
- const bool Signed) {
+bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed) {
MipsTargetStreamer &TOut = getTargetStreamer();
warnIfNoMacro(IDLoc);
@@ -3954,6 +3974,17 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
bool UseTraps = useTraps();
+ unsigned Opcode = Inst.getOpcode();
+ bool isDiv = Opcode == Mips::SDivMacro || Opcode == Mips::SDivIMacro ||
+ Opcode == Mips::UDivMacro || Opcode == Mips::UDivIMacro ||
+ Opcode == Mips::DSDivMacro || Opcode == Mips::DSDivIMacro ||
+ Opcode == Mips::DUDivMacro || Opcode == Mips::DUDivIMacro;
+
+ bool isRem = Opcode == Mips::SRemMacro || Opcode == Mips::SRemIMacro ||
+ Opcode == Mips::URemMacro || Opcode == Mips::URemIMacro ||
+ Opcode == Mips::DSRemMacro || Opcode == Mips::DSRemIMacro ||
+ Opcode == Mips::DURemMacro || Opcode == Mips::DURemIMacro;
+
if (RtOp.isImm()) {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
@@ -3967,10 +3998,13 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
- if (ImmValue == 1) {
+ if (isRem && (ImmValue == 1 || (Signed && (ImmValue == -1)))) {
+ TOut.emitRRR(Mips::OR, RdReg, ZeroReg, ZeroReg, IDLoc, STI);
+ return false;
+ } else if (isDiv && ImmValue == 1) {
TOut.emitRRR(Mips::OR, RdReg, RsReg, Mips::ZERO, IDLoc, STI);
return false;
- } else if (Signed && ImmValue == -1) {
+ } else if (isDiv && Signed && ImmValue == -1) {
TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI);
return false;
} else {
@@ -3978,16 +4012,16 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
false, Inst.getLoc(), Out, STI))
return true;
TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI);
- TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
return true;
}
- // If the macro expansion of (d)div(u) would always trap or break, insert
- // the trap/break and exit. This gives a different result to GAS. GAS has
- // an inconsistency/missed optimization in that not all cases are handled
- // equivalently. As the observed behaviour is the same, we're ok.
+ // If the macro expansion of (d)div(u) or (d)rem(u) would always trap or
+ // break, insert the trap/break and exit. This gives a different result to
+ // GAS. GAS has an inconsistency/missed optimization in that not all cases
+ // are handled equivalently. As the observed behaviour is the same, we're ok.
if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
if (UseTraps) {
TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
@@ -3997,6 +4031,13 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
+ // (d)rem(u) $0, $X, $Y is a special case. Like div $zero, $X, $Y, it does
+ // not expand to macro sequence.
+ if (isRem && (RdReg == Mips::ZERO || RdReg == Mips::ZERO_64)) {
+ TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+ return false;
+ }
+
// Temporary label for first branch target
MCContext &Context = TOut.getStreamer().getContext();
MCSymbol *BrTarget;
@@ -4020,7 +4061,7 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (!UseTraps)
TOut.getStreamer().EmitLabel(BrTarget);
- TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
@@ -4043,7 +4084,7 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (IsMips64) {
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
- TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
+ TOut.emitDSLL(ATReg, ATReg, 63, IDLoc, STI);
} else {
TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
}
@@ -4053,12 +4094,12 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
else {
// Branch to the mflo instruction.
TOut.emitRRX(Mips::BNE, RsReg, ATReg, LabelOpEnd, IDLoc, STI);
- TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
+ TOut.emitNop(IDLoc, STI);
TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
}
TOut.getStreamer().EmitLabel(BrTargetEnd);
- TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
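[Editor's note, not part of the patch: on MIPS, (d)div(u) leaves the quotient in LO and the remainder in HI, which is why the merged expandDivRem() above differs only in whether it finishes with MFLO or MFHI. A small sketch of that rule, using plain integer arithmetic in place of the HI/LO register pair:]

```cpp
#include <cstdint>
#include <iostream>

// Model of the HI/LO register pair written by a MIPS div instruction.
struct HiLo {
  int64_t Lo; // quotient  -> read back with MFLO
  int64_t Hi; // remainder -> read back with MFHI
};

HiLo mipsDiv(int64_t Rs, int64_t Rt) {
  return {Rs / Rt, Rs % Rt};
}

int main() {
  HiLo R = mipsDiv(47, 5);
  std::cout << "div result (MFLO): " << R.Lo << "\n"; // 9
  std::cout << "rem result (MFHI): " << R.Hi << "\n"; // 2
}
```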
@@ -4287,7 +4328,8 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
DstReg = ATReg;
}
- if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Out, STI)) {
+ if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false,
+ Inst.getLoc(), Out, STI)) {
switch (FinalOpcode) {
default:
llvm_unreachable("unimplemented expansion");
@@ -4675,7 +4717,8 @@ bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (!ATReg)
return true;
- loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out, STI);
+ loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out,
+ STI);
TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT,
SrcReg, ATReg, IDLoc, STI);
@@ -5143,7 +5186,6 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_RequiresDifferentSrcAndDst;
return Match_Success;
case Mips::LWP_MM:
- case Mips::LWP_MMR6:
if (Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg())
return Match_RequiresDifferentSrcAndDst;
return Match_Success;
@@ -5151,6 +5193,13 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
if (Inst.getOperand(0).getImm() != 0 && !hasMips32())
return Match_NonZeroOperandForSync;
return Match_Success;
+ case Mips::MFC0:
+ case Mips::MTC0:
+ case Mips::MTC2:
+ case Mips::MFC2:
+ if (Inst.getOperand(2).getImm() != 0 && !hasMips32())
+ return Match_NonZeroOperandForMTCX;
+ return Match_Success;
// As described the MIPSR6 spec, the compact branches that compare registers
// must:
// a) Not use the zero register.
@@ -5238,6 +5287,13 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_RequiresPosSizeRange33_64;
return Match_Success;
}
+ case Mips::CRC32B: case Mips::CRC32CB:
+ case Mips::CRC32H: case Mips::CRC32CH:
+ case Mips::CRC32W: case Mips::CRC32CW:
+ case Mips::CRC32D: case Mips::CRC32CD:
+ if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg())
+ return Match_RequiresSameSrcAndDst;
+ return Match_Success;
}
uint64_t TSFlags = getInstDesc(Inst.getOpcode()).TSFlags;
@@ -5291,7 +5347,10 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(ErrorLoc, "invalid operand for instruction");
}
case Match_NonZeroOperandForSync:
- return Error(IDLoc, "s-type must be zero or unspecified for pre-MIPS32 ISAs");
+ return Error(IDLoc,
+ "s-type must be zero or unspecified for pre-MIPS32 ISAs");
+ case Match_NonZeroOperandForMTCX:
+ return Error(IDLoc, "selector must be zero for pre-MIPS32 ISAs");
case Match_MnemonicFail:
return Error(IDLoc, "invalid instruction");
case Match_RequiresDifferentSrcAndDst:
@@ -5430,6 +5489,9 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_MemSImm16:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 16-bit signed offset");
+ case Match_MemSImmPtr:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 32-bit signed offset");
case Match_RequiresPosSizeRange0_32: {
SMLoc ErrorStart = Operands[3]->getStartLoc();
SMLoc ErrorEnd = Operands[4]->getEndLoc();
@@ -5464,6 +5526,17 @@ void MipsAsmParser::warnIfNoMacro(SMLoc Loc) {
Warning(Loc, "macro instruction expanded into multiple instructions");
}
+void MipsAsmParser::ConvertXWPOperands(MCInst &Inst,
+ const OperandVector &Operands) {
+ assert(
+ (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM) &&
+ "Unexpected instruction!");
+ ((MipsOperand &)*Operands[1]).addGPR32ZeroAsmRegOperands(Inst, 1);
+ int NextReg = nextReg(((MipsOperand &)*Operands[1]).getGPR32Reg());
+ Inst.addOperand(MCOperand::createReg(NextReg));
+ ((MipsOperand &)*Operands[2]).addMemOperands(Inst, 2);
+}
+
void
MipsAsmParser::printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
SMRange Range, bool ShowColors) {
@@ -5654,7 +5727,7 @@ unsigned MipsAsmParser::getReg(int RC, int RegNo) {
bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseOperand\n");
+ LLVM_DEBUG(dbgs() << "parseOperand\n");
// Check if the current operand has a custom associated parser, if so, try to
// custom parse the operand, or fallback to the general approach.
@@ -5667,7 +5740,7 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
if (ResTy == MatchOperand_ParseFail)
return true;
- DEBUG(dbgs() << ".. Generic Parser\n");
+ LLVM_DEBUG(dbgs() << ".. Generic Parser\n");
switch (getLexer().getKind()) {
case AsmToken::Dollar: {
@@ -5697,7 +5770,7 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
return false;
}
default: {
- DEBUG(dbgs() << ".. generic integer expression\n");
+ LLVM_DEBUG(dbgs() << ".. generic integer expression\n");
const MCExpr *Expr;
SMLoc S = Parser.getTok().getLoc(); // Start location of the operand.
@@ -5770,7 +5843,7 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
OperandMatchResultTy
MipsAsmParser::parseMemOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseMemOperand\n");
+ LLVM_DEBUG(dbgs() << "parseMemOperand\n");
const MCExpr *IdVal = nullptr;
SMLoc S;
bool isParenExpr = false;
@@ -5906,13 +5979,12 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) {
bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
MCSymbol *Sym = getContext().lookupSymbol(Parser.getTok().getIdentifier());
- if (Sym) {
- SMLoc S = Parser.getTok().getLoc();
- const MCExpr *Expr;
- if (Sym->isVariable())
- Expr = Sym->getVariableValue();
- else
- return false;
+ if (!Sym)
+ return false;
+
+ SMLoc S = Parser.getTok().getLoc();
+ if (Sym->isVariable()) {
+ const MCExpr *Expr = Sym->getVariableValue();
if (Expr->getKind() == MCExpr::SymbolRef) {
const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
StringRef DefSymbol = Ref->getSymbol().getName();
@@ -5922,12 +5994,26 @@ bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
if (ResTy == MatchOperand_Success) {
Parser.Lex();
return true;
- } else if (ResTy == MatchOperand_ParseFail)
+ }
+ if (ResTy == MatchOperand_ParseFail)
llvm_unreachable("Should never ParseFail");
- return false;
+ }
+ }
+ } else if (Sym->isUnset()) {
+ // If symbol is unset, it might be created in the `parseSetAssignment`
+ // routine as an alias for a numeric register name.
+ // Lookup in the aliases list.
+ auto Entry = RegisterSets.find(Sym->getName());
+ if (Entry != RegisterSets.end()) {
+ OperandMatchResultTy ResTy =
+ matchAnyRegisterWithoutDollar(Operands, Entry->getValue(), S);
+ if (ResTy == MatchOperand_Success) {
+ Parser.Lex();
+ return true;
}
}
}
+
return false;
}
@@ -5995,48 +6081,59 @@ MipsAsmParser::matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
}
OperandMatchResultTy
-MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
- MCAsmParser &Parser = getParser();
- auto Token = Parser.getLexer().peekTok(false);
-
+MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands,
+ const AsmToken &Token, SMLoc S) {
if (Token.is(AsmToken::Identifier)) {
- DEBUG(dbgs() << ".. identifier\n");
+ LLVM_DEBUG(dbgs() << ".. identifier\n");
StringRef Identifier = Token.getIdentifier();
OperandMatchResultTy ResTy =
matchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
return ResTy;
} else if (Token.is(AsmToken::Integer)) {
- DEBUG(dbgs() << ".. integer\n");
+ LLVM_DEBUG(dbgs() << ".. integer\n");
+ int64_t RegNum = Token.getIntVal();
+ if (RegNum < 0 || RegNum > 31) {
+ // Show the error, but treat invalid register
+ // number as a normal one to continue parsing
+ // and catch other possible errors.
+ Error(getLexer().getLoc(), "invalid register number");
+ }
Operands.push_back(MipsOperand::createNumericReg(
- Token.getIntVal(), Token.getString(), getContext().getRegisterInfo(), S,
+ RegNum, Token.getString(), getContext().getRegisterInfo(), S,
Token.getLoc(), *this));
return MatchOperand_Success;
}
- DEBUG(dbgs() << Parser.getTok().getKind() << "\n");
+ LLVM_DEBUG(dbgs() << Token.getKind() << "\n");
return MatchOperand_NoMatch;
}
OperandMatchResultTy
+MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
+ auto Token = getLexer().peekTok(false);
+ return matchAnyRegisterWithoutDollar(Operands, Token, S);
+}
+
+OperandMatchResultTy
MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseAnyRegister\n");
+ LLVM_DEBUG(dbgs() << "parseAnyRegister\n");
auto Token = Parser.getTok();
SMLoc S = Token.getLoc();
if (Token.isNot(AsmToken::Dollar)) {
- DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
+ LLVM_DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
if (Token.is(AsmToken::Identifier)) {
if (searchSymbolAlias(Operands))
return MatchOperand_Success;
}
- DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
+ LLVM_DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
return MatchOperand_NoMatch;
}
- DEBUG(dbgs() << ".. $\n");
+ LLVM_DEBUG(dbgs() << ".. $\n");
OperandMatchResultTy ResTy = matchAnyRegisterWithoutDollar(Operands, S);
if (ResTy == MatchOperand_Success) {
@@ -6049,7 +6146,7 @@ MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
OperandMatchResultTy
MipsAsmParser::parseJumpTarget(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "parseJumpTarget\n");
+ LLVM_DEBUG(dbgs() << "parseJumpTarget\n");
SMLoc S = getLexer().getLoc();
@@ -6182,22 +6279,6 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) {
}
OperandMatchResultTy
-MipsAsmParser::parseRegisterPair(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
-
- SMLoc S = Parser.getTok().getLoc();
- if (parseAnyRegister(Operands) != MatchOperand_Success)
- return MatchOperand_ParseFail;
-
- SMLoc E = Parser.getTok().getLoc();
- MipsOperand Op = static_cast<MipsOperand &>(*Operands.back());
-
- Operands.pop_back();
- Operands.push_back(MipsOperand::CreateRegPair(Op, S, E, *this));
- return MatchOperand_Success;
-}
-
-OperandMatchResultTy
MipsAsmParser::parseMovePRegPair(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
@@ -6293,7 +6374,7 @@ bool MipsAsmParser::parseBracketSuffix(StringRef Name,
bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- DEBUG(dbgs() << "ParseInstruction\n");
+ LLVM_DEBUG(dbgs() << "ParseInstruction\n");
// We have reached first instruction, module directive are now forbidden.
getTargetStreamer().forbidModuleDirective();
@@ -6655,6 +6736,57 @@ bool MipsAsmParser::parseSetNoMtDirective() {
return false;
}
+bool MipsAsmParser::parseSetNoCRCDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "nocrc".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureCRC, "crc");
+
+ getTargetStreamer().emitDirectiveSetNoCRC();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool MipsAsmParser::parseSetNoVirtDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "novirt".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureVirt, "virt");
+
+ getTargetStreamer().emitDirectiveSetNoVirt();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool MipsAsmParser::parseSetNoGINVDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "noginv".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureGINV, "ginv");
+
+ getTargetStreamer().emitDirectiveSetNoGINV();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
bool MipsAsmParser::parseSetPopDirective() {
MCAsmParser &Parser = getParser();
SMLoc Loc = getLexer().getLoc();
@@ -6720,17 +6852,30 @@ bool MipsAsmParser::parseSetAssignment() {
MCAsmParser &Parser = getParser();
if (Parser.parseIdentifier(Name))
- reportParseError("expected identifier after .set");
+ return reportParseError("expected identifier after .set");
if (getLexer().isNot(AsmToken::Comma))
return reportParseError("unexpected token, expected comma");
Lex(); // Eat comma
- if (Parser.parseExpression(Value))
+ if (getLexer().is(AsmToken::Dollar) &&
+ getLexer().peekTok().is(AsmToken::Integer)) {
+ // Parse assignment of a numeric register:
+ // .set r1,$1
+ Parser.Lex(); // Eat $.
+ RegisterSets[Name] = Parser.getTok();
+ Parser.Lex(); // Eat identifier.
+ getContext().getOrCreateSymbol(Name);
+ } else if (!Parser.parseExpression(Value)) {
+ // Parse assignment of an expression including
+ // symbolic registers:
+ // .set $tmp, $BB0-$BB1
+ // .set r2, $f2
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+ Sym->setVariableValue(Value);
+ } else {
return reportParseError("expected valid expression after comma");
-
- MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- Sym->setVariableValue(Value);
+ }
return false;
}
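[Editor's note, not part of the patch: the new RegisterSets map lets ".set r1, $1" define a plain identifier as an alias for a numeric register, which searchSymbolAlias() then resolves when that identifier is later used as an operand. A toy version of that lookup path, assuming a std::map in place of llvm::StringMap and omitting the MC layer entirely:]

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <string>

// Aliases created by ".set <name>, $<num>" (stand-in for the RegisterSets
// StringMap in MipsAsmParser).
std::map<std::string, unsigned> RegisterSets;

void parseSetAssignment(const std::string &Name, unsigned RegNum) {
  RegisterSets[Name] = RegNum; // .set Name, $RegNum
}

// Mirrors the unset-symbol branch of searchSymbolAlias(): if the identifier
// names a register alias, resolve it to the register number.
std::optional<unsigned> resolveOperand(const std::string &Identifier) {
  auto It = RegisterSets.find(Identifier);
  if (It != RegisterSets.end())
    return It->second;
  return std::nullopt;
}

int main() {
  parseSetAssignment("r1", 1); // .set r1, $1
  if (auto Reg = resolveOperand("r1"))
    std::cout << "lw r1, 0($sp) -> uses $" << *Reg << "\n";
}
```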
@@ -6876,6 +7021,18 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
selectArch("mips64r6");
getTargetStreamer().emitDirectiveSetMips64R6();
break;
+ case Mips::FeatureCRC:
+ setFeatureBits(Mips::FeatureCRC, "crc");
+ getTargetStreamer().emitDirectiveSetCRC();
+ break;
+ case Mips::FeatureVirt:
+ setFeatureBits(Mips::FeatureVirt, "virt");
+ getTargetStreamer().emitDirectiveSetVirt();
+ break;
+ case Mips::FeatureGINV:
+ setFeatureBits(Mips::FeatureGINV, "ginv");
+ getTargetStreamer().emitDirectiveSetGINV();
+ break;
}
return false;
}
@@ -7075,143 +7232,131 @@ bool MipsAsmParser::parseDirectiveNaN() {
}
bool MipsAsmParser::parseDirectiveSet() {
- MCAsmParser &Parser = getParser();
- // Get the next token.
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getParser().getTok();
+ StringRef IdVal = Tok.getString();
+ SMLoc Loc = Tok.getLoc();
- if (Tok.getString() == "noat") {
+ if (IdVal == "noat")
return parseSetNoAtDirective();
- } else if (Tok.getString() == "at") {
+ if (IdVal == "at")
return parseSetAtDirective();
- } else if (Tok.getString() == "arch") {
+ if (IdVal == "arch")
return parseSetArchDirective();
- } else if (Tok.getString() == "bopt") {
- Warning(Tok.getLoc(), "'bopt' feature is unsupported");
+ if (IdVal == "bopt") {
+ Warning(Loc, "'bopt' feature is unsupported");
getParser().Lex();
return false;
- } else if (Tok.getString() == "nobopt") {
+ }
+ if (IdVal == "nobopt") {
// We're already running in nobopt mode, so nothing to do.
getParser().Lex();
return false;
- } else if (Tok.getString() == "fp") {
+ }
+ if (IdVal == "fp")
return parseSetFpDirective();
- } else if (Tok.getString() == "oddspreg") {
+ if (IdVal == "oddspreg")
return parseSetOddSPRegDirective();
- } else if (Tok.getString() == "nooddspreg") {
+ if (IdVal == "nooddspreg")
return parseSetNoOddSPRegDirective();
- } else if (Tok.getString() == "pop") {
+ if (IdVal == "pop")
return parseSetPopDirective();
- } else if (Tok.getString() == "push") {
+ if (IdVal == "push")
return parseSetPushDirective();
- } else if (Tok.getString() == "reorder") {
+ if (IdVal == "reorder")
return parseSetReorderDirective();
- } else if (Tok.getString() == "noreorder") {
+ if (IdVal == "noreorder")
return parseSetNoReorderDirective();
- } else if (Tok.getString() == "macro") {
+ if (IdVal == "macro")
return parseSetMacroDirective();
- } else if (Tok.getString() == "nomacro") {
+ if (IdVal == "nomacro")
return parseSetNoMacroDirective();
- } else if (Tok.getString() == "mips16") {
+ if (IdVal == "mips16")
return parseSetMips16Directive();
- } else if (Tok.getString() == "nomips16") {
+ if (IdVal == "nomips16")
return parseSetNoMips16Directive();
- } else if (Tok.getString() == "nomicromips") {
+ if (IdVal == "nomicromips") {
clearFeatureBits(Mips::FeatureMicroMips, "micromips");
getTargetStreamer().emitDirectiveSetNoMicroMips();
- Parser.eatToEndOfStatement();
+ getParser().eatToEndOfStatement();
return false;
- } else if (Tok.getString() == "micromips") {
+ }
+ if (IdVal == "micromips") {
if (hasMips64r6()) {
- Error(Tok.getLoc(), ".set micromips directive is not supported with MIPS64R6");
+ Error(Loc, ".set micromips directive is not supported with MIPS64R6");
return false;
}
return parseSetFeature(Mips::FeatureMicroMips);
- } else if (Tok.getString() == "mips0") {
+ }
+ if (IdVal == "mips0")
return parseSetMips0Directive();
- } else if (Tok.getString() == "mips1") {
+ if (IdVal == "mips1")
return parseSetFeature(Mips::FeatureMips1);
- } else if (Tok.getString() == "mips2") {
+ if (IdVal == "mips2")
return parseSetFeature(Mips::FeatureMips2);
- } else if (Tok.getString() == "mips3") {
+ if (IdVal == "mips3")
return parseSetFeature(Mips::FeatureMips3);
- } else if (Tok.getString() == "mips4") {
+ if (IdVal == "mips4")
return parseSetFeature(Mips::FeatureMips4);
- } else if (Tok.getString() == "mips5") {
+ if (IdVal == "mips5")
return parseSetFeature(Mips::FeatureMips5);
- } else if (Tok.getString() == "mips32") {
+ if (IdVal == "mips32")
return parseSetFeature(Mips::FeatureMips32);
- } else if (Tok.getString() == "mips32r2") {
+ if (IdVal == "mips32r2")
return parseSetFeature(Mips::FeatureMips32r2);
- } else if (Tok.getString() == "mips32r3") {
+ if (IdVal == "mips32r3")
return parseSetFeature(Mips::FeatureMips32r3);
- } else if (Tok.getString() == "mips32r5") {
+ if (IdVal == "mips32r5")
return parseSetFeature(Mips::FeatureMips32r5);
- } else if (Tok.getString() == "mips32r6") {
+ if (IdVal == "mips32r6")
return parseSetFeature(Mips::FeatureMips32r6);
- } else if (Tok.getString() == "mips64") {
+ if (IdVal == "mips64")
return parseSetFeature(Mips::FeatureMips64);
- } else if (Tok.getString() == "mips64r2") {
+ if (IdVal == "mips64r2")
return parseSetFeature(Mips::FeatureMips64r2);
- } else if (Tok.getString() == "mips64r3") {
+ if (IdVal == "mips64r3")
return parseSetFeature(Mips::FeatureMips64r3);
- } else if (Tok.getString() == "mips64r5") {
+ if (IdVal == "mips64r5")
return parseSetFeature(Mips::FeatureMips64r5);
- } else if (Tok.getString() == "mips64r6") {
+ if (IdVal == "mips64r6") {
if (inMicroMipsMode()) {
- Error(Tok.getLoc(), "MIPS64R6 is not supported with microMIPS");
+ Error(Loc, "MIPS64R6 is not supported with microMIPS");
return false;
}
return parseSetFeature(Mips::FeatureMips64r6);
- } else if (Tok.getString() == "dsp") {
+ }
+ if (IdVal == "dsp")
return parseSetFeature(Mips::FeatureDSP);
- } else if (Tok.getString() == "dspr2") {
+ if (IdVal == "dspr2")
return parseSetFeature(Mips::FeatureDSPR2);
- } else if (Tok.getString() == "nodsp") {
+ if (IdVal == "nodsp")
return parseSetNoDspDirective();
- } else if (Tok.getString() == "msa") {
+ if (IdVal == "msa")
return parseSetMsaDirective();
- } else if (Tok.getString() == "nomsa") {
+ if (IdVal == "nomsa")
return parseSetNoMsaDirective();
- } else if (Tok.getString() == "mt") {
+ if (IdVal == "mt")
return parseSetMtDirective();
- } else if (Tok.getString() == "nomt") {
+ if (IdVal == "nomt")
return parseSetNoMtDirective();
- } else if (Tok.getString() == "softfloat") {
+ if (IdVal == "softfloat")
return parseSetSoftFloatDirective();
- } else if (Tok.getString() == "hardfloat") {
+ if (IdVal == "hardfloat")
return parseSetHardFloatDirective();
- } else {
- // It is just an identifier, look for an assignment.
- parseSetAssignment();
- return false;
- }
-
- return true;
-}
-
-/// parseDataDirective
-/// ::= .word [ expression (, expression)* ]
-bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
- MCAsmParser &Parser = getParser();
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- while (true) {
- const MCExpr *Value;
- if (getParser().parseExpression(Value))
- return true;
-
- getParser().getStreamer().EmitValue(Value, Size);
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token, expected comma");
- Parser.Lex();
- }
- }
-
- Parser.Lex();
- return false;
+ if (IdVal == "crc")
+ return parseSetFeature(Mips::FeatureCRC);
+ if (IdVal == "nocrc")
+ return parseSetNoCRCDirective();
+ if (IdVal == "virt")
+ return parseSetFeature(Mips::FeatureVirt);
+ if (IdVal == "novirt")
+ return parseSetNoVirtDirective();
+ if (IdVal == "ginv")
+ return parseSetFeature(Mips::FeatureGINV);
+ if (IdVal == "noginv")
+ return parseSetNoGINVDirective();
+
+  // It is just an identifier; look for an assignment.
+ return parseSetAssignment();
}
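For reference, the cases above are a plain string-to-feature dispatch; purely as an illustration (the patch itself keeps the explicit if-chain), the ISA and ASE cases could be collapsed into a single llvm::StringSwitch over IdVal, with everything else falling through to the dedicated handlers and the assignment path:

    uint64_t Feature = StringSwitch<uint64_t>(IdVal)
                           .Case("mips32r6", Mips::FeatureMips32r6)
                           .Case("crc", Mips::FeatureCRC)
                           .Case("virt", Mips::FeatureVirt)
                           .Case("ginv", Mips::FeatureGINV)
                           // ... remaining ISA levels elided ...
                           .Default(0);
    if (Feature)
      return parseSetFeature(Feature);

Directives such as "nodsp", "msa", or a plain symbol assignment would still need their own handlers, exactly as in the code above.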
/// parseDirectiveGpWord
@@ -7426,6 +7571,12 @@ bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
/// ::= .module softfloat
/// ::= .module hardfloat
/// ::= .module mt
+/// ::= .module crc
+/// ::= .module nocrc
+/// ::= .module virt
+/// ::= .module novirt
+/// ::= .module ginv
+/// ::= .module noginv
bool MipsAsmParser::parseDirectiveModule() {
MCAsmParser &Parser = getParser();
MCAsmLexer &Lexer = getLexer();
@@ -7544,6 +7695,120 @@ bool MipsAsmParser::parseDirectiveModule() {
}
return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "crc") {
+ setModuleFeatureBits(Mips::FeatureCRC, "crc");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleCRC();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "nocrc") {
+ clearModuleFeatureBits(Mips::FeatureCRC, "crc");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleNoCRC();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "virt") {
+ setModuleFeatureBits(Mips::FeatureVirt, "virt");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleVirt();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "novirt") {
+ clearModuleFeatureBits(Mips::FeatureVirt, "virt");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleNoVirt();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "ginv") {
+ setModuleFeatureBits(Mips::FeatureGINV, "ginv");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleGINV();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "noginv") {
+ clearModuleFeatureBits(Mips::FeatureGINV, "ginv");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleNoGINV();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
} else {
return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
}
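The six new .module branches differ only in the feature bit, its spelling, and the streamer callback; a hypothetical helper capturing that shape could look like the sketch below (parseModuleAseDirective is an invented name, not part of this change; the members it calls are the ones used above):

    bool MipsAsmParser::parseModuleAseDirective(
        uint64_t Feature, StringRef Name, bool Enable,
        void (MipsTargetStreamer::*Emit)()) {
      if (Enable)
        setModuleFeatureBits(Feature, Name);
      else
        clearModuleFeatureBits(Feature, Name);
      // Keep the ABI Flags information in sync with the FeatureBits update.
      getTargetStreamer().updateABIInfo(*this);
      // Print the directive when emitting assembly; ELF output is deferred to
      // the .MIPS.abiflags section.
      (getTargetStreamer().*Emit)();
      if (getLexer().isNot(AsmToken::EndOfStatement)) {
        reportParseError("unexpected token, expected end of statement");
        return false;
      }
      return false; // Directive handled.
    }

Each branch would then reduce to a call such as parseModuleAseDirective(Mips::FeatureCRC, "crc", true, &MipsTargetStreamer::emitDirectiveModuleCRC).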
@@ -7673,10 +7938,6 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
parseDirectiveCpRestore(DirectiveID.getLoc());
return false;
}
- if (IDVal == ".dword") {
- parseDataDirective(8, DirectiveID.getLoc());
- return false;
- }
if (IDVal == ".ent") {
StringRef SymbolName;
@@ -7924,16 +8185,6 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
}
- if (IDVal == ".word") {
- parseDataDirective(4, DirectiveID.getLoc());
- return false;
- }
-
- if (IDVal == ".hword") {
- parseDataDirective(2, DirectiveID.getLoc());
- return false;
- }
-
if (IDVal == ".option") {
parseDirectiveOption();
return false;
diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index ef0f08b49850..b94afb9520e3 100644
--- a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -277,11 +277,6 @@ static DecodeStatus DecodeMemEVA(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeLoadByte9(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
static DecodeStatus DecodeLoadByte15(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -300,11 +295,6 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
static DecodeStatus DecodePrefeOpMM(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -315,6 +305,11 @@ static DecodeStatus DecodeSyncI(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeSyncI_MM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeSynciR6(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -527,6 +522,10 @@ template <typename InsnType>
static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address,
const void *Decoder);
+template <typename InsnType>
+static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -1139,6 +1138,22 @@ static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t Address,
return MCDisassembler::Success;
}
+
+// The auto-generated decoder wouldn't add the third operand for CRC32*.
+template <typename InsnType>
+static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
+ const void *Decoder) {
+ InsnType Rs = fieldFromInstruction(Insn, 21, 5);
+ InsnType Rt = fieldFromInstruction(Insn, 16, 5);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ return MCDisassembler::Success;
+}
+
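DecodeCRC reads the rs and rt register fields with fieldFromInstruction; a minimal stand-alone sketch of the assumed bit-field extraction (extractField is an illustrative stand-in, valid for fields narrower than 32 bits):

    // Extract the bit field [Start, Start + Size) from a 32-bit encoding,
    // e.g. rs = bits 25-21 and rt = bits 20-16 in the CRC32* encodings above.
    static unsigned extractField(unsigned Insn, unsigned Start, unsigned Size) {
      return (Insn >> Start) & ((1u << Size) - 1);
    }

The rt register is added twice because it serves as both a source (presumably the running checksum) and the destination of the CRC32* instructions.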
 /// Read two bytes from the ArrayRef and return a 16-bit halfword assembled
/// according to the given endianness.
static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -1210,7 +1225,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return MCDisassembler::Fail;
if (hasMips32r6()) {
- DEBUG(dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
// Calling the auto-generated decoder function for microMIPS32R6
// 16-bit instructions.
Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
@@ -1221,7 +1237,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
- DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
+ LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
// Calling the auto-generated decoder function for microMIPS 16-bit
// instructions.
Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
@@ -1236,7 +1252,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return MCDisassembler::Fail;
if (hasMips32r6()) {
- DEBUG(dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, Address,
this, STI);
@@ -1246,7 +1263,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
- DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
+ LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
this, STI);
@@ -1256,7 +1273,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (isFP64()) {
- DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -1285,7 +1302,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Size = 4;
if (hasCOP3()) {
- DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
Result =
decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1293,7 +1310,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips32r6() && isGP64()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1301,7 +1319,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips32r6() && isPTR64()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1309,7 +1328,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips32r6()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1317,7 +1336,8 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasMips2() && isPTR64()) {
- DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1325,7 +1345,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (hasCnMips()) {
- DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1333,7 +1353,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (isGP64()) {
- DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
@@ -1341,14 +1361,15 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
if (isFP64()) {
- DEBUG(dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail)
return Result;
}
- DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
+ LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
// Calling the auto-generated decoder function.
Result =
decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
@@ -1538,24 +1559,6 @@ static DecodeStatus DecodeMemEVA(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeLoadByte9(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- int Offset = SignExtend32<9>(Insn & 0x1ff);
- unsigned Base = fieldFromInstruction(Insn, 16, 5);
- unsigned Reg = fieldFromInstruction(Insn, 21, 5);
-
- Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
- Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
-
- Inst.addOperand(MCOperand::createReg(Reg));
- Inst.addOperand(MCOperand::createReg(Base));
- Inst.addOperand(MCOperand::createImm(Offset));
-
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeLoadByte15(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1642,30 +1645,25 @@ static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- int Offset = SignExtend32<9>(Insn & 0x1ff);
- unsigned Reg = fieldFromInstruction(Insn, 21, 5);
- unsigned Base = fieldFromInstruction(Insn, 16, 5);
+static DecodeStatus DecodeSyncI(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<16>(Insn & 0xffff);
+ unsigned Base = fieldFromInstruction(Insn, 21, 5);
- Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
- Inst.addOperand(MCOperand::createReg(Reg));
Inst.addOperand(MCOperand::createReg(Base));
Inst.addOperand(MCOperand::createImm(Offset));
return MCDisassembler::Success;
}
-static DecodeStatus DecodeSyncI(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
int Offset = SignExtend32<16>(Insn & 0xffff);
- unsigned Base = fieldFromInstruction(Insn, 21, 5);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
@@ -1862,7 +1860,7 @@ static DecodeStatus DecodeMemMMImm9(MCInst &Inst,
Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
- if (Inst.getOpcode() == Mips::SCE_MM)
+ if (Inst.getOpcode() == Mips::SCE_MM || Inst.getOpcode() == Mips::SC_MMR6)
Inst.addOperand(MCOperand::createReg(Reg));
Inst.addOperand(MCOperand::createReg(Reg));
@@ -1897,8 +1895,7 @@ static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
LLVM_FALLTHROUGH;
default:
Inst.addOperand(MCOperand::createReg(Reg));
- if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM ||
- Inst.getOpcode() == Mips::LWP_MMR6 || Inst.getOpcode() == Mips::SWP_MMR6)
+ if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM)
Inst.addOperand(MCOperand::createReg(Reg+1));
Inst.addOperand(MCOperand::createReg(Base));
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 1d125d0dbae6..73732a40bb8a 100644
--- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -197,11 +197,6 @@ printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
}
void MipsInstPrinter::
-printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O) {
- printRegName(O, MI->getOperand(opNum).getReg());
-}
-
-void MipsInstPrinter::
printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
llvm_unreachable("TODO");
}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 4a76b5acac79..f02443ee21d3 100644
--- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -98,7 +98,6 @@ private:
void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
- void printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O);
void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index 9abd4f1d6b08..68bf3829aab5 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -161,6 +161,12 @@ public:
ASESet |= Mips::AFL_ASE_MIPS16;
if (P.hasMT())
ASESet |= Mips::AFL_ASE_MT;
+ if (P.hasCRC())
+ ASESet |= Mips::AFL_ASE_CRC;
+ if (P.hasVirt())
+ ASESet |= Mips::AFL_ASE_VIRT;
+ if (P.hasGINV())
+ ASESet |= Mips::AFL_ASE_GINV;
}
template <class PredicateLibrary>
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
index 498ea6fda4b3..bf1390880281 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -57,7 +57,7 @@ MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU,
return MipsABIInfo::N64();
assert(Options.getABIName().empty() && "Unknown ABI option for MIPS");
- if (TT.getArch() == Triple::mips64 || TT.getArch() == Triple::mips64el)
+ if (TT.isMIPS64())
return MipsABIInfo::N64();
return MipsABIInfo::O32();
}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index acbc6d37e24b..4397c971d080 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -53,6 +54,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case Mips::fixup_Mips_GOT_DISP:
case Mips::fixup_Mips_GOT_LO16:
case Mips::fixup_Mips_CALL_LO16:
+ case Mips::fixup_MICROMIPS_GPOFF_HI:
+ case Mips::fixup_MICROMIPS_GPOFF_LO:
case Mips::fixup_MICROMIPS_LO16:
case Mips::fixup_MICROMIPS_GOT_PAGE:
case Mips::fixup_MICROMIPS_GOT_OFST:
@@ -107,10 +110,12 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = ((Value + 0x8000) >> 16) & 0xffff;
break;
case Mips::fixup_Mips_HIGHER:
+ case Mips::fixup_MICROMIPS_HIGHER:
// Get the 3rd 16-bits.
Value = ((Value + 0x80008000LL) >> 32) & 0xffff;
break;
case Mips::fixup_Mips_HIGHEST:
+ case Mips::fixup_MICROMIPS_HIGHEST:
// Get the 4th 16-bits.
Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
break;
@@ -210,9 +215,9 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
}
-std::unique_ptr<MCObjectWriter>
-MipsAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createMipsELFObjectWriter(OS, TheTriple, IsN32);
+std::unique_ptr<MCObjectTargetWriter>
+MipsAsmBackend::createObjectTargetWriter() const {
+ return createMipsELFObjectWriter(TheTriple, IsN32);
}
// Little-endian fixup data byte ordering:
@@ -238,7 +243,8 @@ static unsigned calculateMMLEIndex(unsigned i) {
void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
MCFixupKind Kind = Fixup.getKind();
MCContext &Ctx = Asm.getContext();
Value = adjustFixupValue(Fixup, Value, Ctx);
@@ -275,9 +281,9 @@ void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
bool microMipsLEByteOrder = needsMMLEByteOrder((unsigned) Kind);
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
- : i)
- : (FullSize - 1 - i);
+ unsigned Idx = Endian == support::little
+ ? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
+ : (FullSize - 1 - i);
CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
}
@@ -287,9 +293,9 @@ void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// Write out the fixed up bytes back to the code/data bits.
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
- : i)
- : (FullSize - 1 - i);
+ unsigned Idx = Endian == support::little
+ ? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
+ : (FullSize - 1 - i);
Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
}
}
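The Idx expression selects which byte of the fixup each loop iteration touches; a small stand-alone sketch of the assumed mapping for a 4-byte fixup (indexFor is an illustrative name; the microMIPS branch mirrors what calculateMMLEIndex is expected to compute):

    // Map loop iteration i (0..3, least significant byte first) to the byte
    // position inside a 4-byte fixup region.
    static unsigned indexFor(unsigned i, bool IsLittleEndian, bool MicroMipsLE) {
      if (!IsLittleEndian)
        return 3 - i;                   // big-endian: MSB stored first
      if (!MicroMipsLE)
        return i;                       // plain little-endian
      return (1 - i / 2) * 2 + i % 2;   // microMIPS: each halfword stays
                                        // little-endian, high halfword first
    }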
@@ -298,12 +304,46 @@ Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
return StringSwitch<Optional<MCFixupKind>>(Name)
.Case("R_MIPS_NONE", (MCFixupKind)Mips::fixup_Mips_NONE)
.Case("R_MIPS_32", FK_Data_4)
+ .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE)
+ .Case("R_MIPS_CALL_HI16", (MCFixupKind)Mips::fixup_Mips_CALL_HI16)
+ .Case("R_MIPS_CALL_LO16", (MCFixupKind)Mips::fixup_Mips_CALL_LO16)
+ .Case("R_MIPS_CALL16", (MCFixupKind)Mips::fixup_Mips_CALL16)
+ .Case("R_MIPS_GOT16", (MCFixupKind)Mips::fixup_Mips_GOT)
+ .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE)
+ .Case("R_MIPS_GOT_OFST", (MCFixupKind)Mips::fixup_Mips_GOT_OFST)
+ .Case("R_MIPS_GOT_DISP", (MCFixupKind)Mips::fixup_Mips_GOT_DISP)
+ .Case("R_MIPS_GOT_HI16", (MCFixupKind)Mips::fixup_Mips_GOT_HI16)
+ .Case("R_MIPS_GOT_LO16", (MCFixupKind)Mips::fixup_Mips_GOT_LO16)
+ .Case("R_MIPS_TLS_GOTTPREL", (MCFixupKind)Mips::fixup_Mips_GOTTPREL)
+ .Case("R_MIPS_TLS_DTPREL_HI16", (MCFixupKind)Mips::fixup_Mips_DTPREL_HI)
+ .Case("R_MIPS_TLS_DTPREL_LO16", (MCFixupKind)Mips::fixup_Mips_DTPREL_LO)
+ .Case("R_MIPS_TLS_GD", (MCFixupKind)Mips::fixup_Mips_TLSGD)
+ .Case("R_MIPS_TLS_LDM", (MCFixupKind)Mips::fixup_Mips_TLSLDM)
+ .Case("R_MIPS_TLS_TPREL_HI16", (MCFixupKind)Mips::fixup_Mips_TPREL_HI)
+ .Case("R_MIPS_TLS_TPREL_LO16", (MCFixupKind)Mips::fixup_Mips_TPREL_LO)
+ .Case("R_MICROMIPS_CALL16", (MCFixupKind)Mips::fixup_MICROMIPS_CALL16)
+ .Case("R_MICROMIPS_GOT_DISP", (MCFixupKind)Mips::fixup_MICROMIPS_GOT_DISP)
+ .Case("R_MICROMIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_MICROMIPS_GOT_PAGE)
+ .Case("R_MICROMIPS_GOT_OFST", (MCFixupKind)Mips::fixup_MICROMIPS_GOT_OFST)
+ .Case("R_MICROMIPS_GOT16", (MCFixupKind)Mips::fixup_MICROMIPS_GOT16)
+ .Case("R_MICROMIPS_TLS_GOTTPREL",
+ (MCFixupKind)Mips::fixup_MICROMIPS_GOTTPREL)
+ .Case("R_MICROMIPS_TLS_DTPREL_HI16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_DTPREL_HI16)
+ .Case("R_MICROMIPS_TLS_DTPREL_LO16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_DTPREL_LO16)
+ .Case("R_MICROMIPS_TLS_GD", (MCFixupKind)Mips::fixup_MICROMIPS_TLS_GD)
+ .Case("R_MICROMIPS_TLS_LDM", (MCFixupKind)Mips::fixup_MICROMIPS_TLS_LDM)
+ .Case("R_MICROMIPS_TLS_TPREL_HI16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_HI16)
+ .Case("R_MICROMIPS_TLS_TPREL_LO16",
+ (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_LO16)
.Default(MCAsmBackend::getFixupKind(Name));
}
const MCFixupKindInfo &MipsAsmBackend::
getFixupKindInfo(MCFixupKind Kind) const {
- const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = {
+ const static MCFixupKindInfo LittleEndianInfos[] = {
    // This table *must* be in the same order of fixup_* kinds in
// MipsFixupKinds.h.
//
@@ -333,12 +373,16 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_DTPREL_LO", 0, 16, 0 },
{ "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_Mips_GPOFF_HI", 0, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_HI",0, 16, 0 },
{ "fixup_Mips_GPOFF_LO", 0, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_LO",0, 16, 0 },
{ "fixup_Mips_GOT_PAGE", 0, 16, 0 },
{ "fixup_Mips_GOT_OFST", 0, 16, 0 },
{ "fixup_Mips_GOT_DISP", 0, 16, 0 },
{ "fixup_Mips_HIGHER", 0, 16, 0 },
+ { "fixup_MICROMIPS_HIGHER", 0, 16, 0 },
{ "fixup_Mips_HIGHEST", 0, 16, 0 },
+ { "fixup_MICROMIPS_HIGHEST", 0, 16, 0 },
{ "fixup_Mips_GOT_HI16", 0, 16, 0 },
{ "fixup_Mips_GOT_LO16", 0, 16, 0 },
{ "fixup_Mips_CALL_HI16", 0, 16, 0 },
@@ -374,8 +418,10 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_SUB", 0, 64, 0 },
{ "fixup_MICROMIPS_SUB", 0, 64, 0 }
};
+ static_assert(array_lengthof(LittleEndianInfos) == Mips::NumTargetFixupKinds,
+ "Not all MIPS little endian fixup kinds added!");
- const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = {
+ const static MCFixupKindInfo BigEndianInfos[] = {
    // This table *must* be in the same order of fixup_* kinds in
// MipsFixupKinds.h.
//
@@ -405,12 +451,16 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_DTPREL_LO", 16, 16, 0 },
{ "fixup_Mips_Branch_PCRel",16, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_Mips_GPOFF_HI", 16, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_HI", 16, 16, 0 },
{ "fixup_Mips_GPOFF_LO", 16, 16, 0 },
+ { "fixup_MICROMIPS_GPOFF_LO", 16, 16, 0 },
{ "fixup_Mips_GOT_PAGE", 16, 16, 0 },
{ "fixup_Mips_GOT_OFST", 16, 16, 0 },
{ "fixup_Mips_GOT_DISP", 16, 16, 0 },
{ "fixup_Mips_HIGHER", 16, 16, 0 },
+ { "fixup_MICROMIPS_HIGHER", 16, 16, 0 },
{ "fixup_Mips_HIGHEST", 16, 16, 0 },
+ { "fixup_MICROMIPS_HIGHEST",16, 16, 0 },
{ "fixup_Mips_GOT_HI16", 16, 16, 0 },
{ "fixup_Mips_GOT_LO16", 16, 16, 0 },
{ "fixup_Mips_CALL_HI16", 16, 16, 0 },
@@ -446,6 +496,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_SUB", 0, 64, 0 },
{ "fixup_MICROMIPS_SUB", 0, 64, 0 }
};
+ static_assert(array_lengthof(BigEndianInfos) == Mips::NumTargetFixupKinds,
+ "Not all MIPS big endian fixup kinds added!");
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -453,7 +505,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- if (IsLittle)
+ if (Endian == support::little)
return LittleEndianInfos[Kind - FirstTargetFixupKind];
return BigEndianInfos[Kind - FirstTargetFixupKind];
}
@@ -463,7 +515,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
/// it should return an error.
///
/// \return - True on success.
-bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool MipsAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// Check for a less than instruction size number of bytes
// FIXME: 16 bit instructions are not handled yet here.
// We shouldn't be using a hard coded number for instruction size.
@@ -471,10 +523,51 @@ bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
- OW->WriteZeros(Count);
+ OS.write_zeros(Count);
return true;
}
+bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
+ const unsigned FixupKind = Fixup.getKind();
+ switch (FixupKind) {
+ default:
+ return false;
+ // All these relocations require special processing
+ // at linking time. Delegate this work to a linker.
+ case Mips::fixup_Mips_CALL_HI16:
+ case Mips::fixup_Mips_CALL_LO16:
+ case Mips::fixup_Mips_CALL16:
+ case Mips::fixup_Mips_GOT:
+ case Mips::fixup_Mips_GOT_PAGE:
+ case Mips::fixup_Mips_GOT_OFST:
+ case Mips::fixup_Mips_GOT_DISP:
+ case Mips::fixup_Mips_GOT_HI16:
+ case Mips::fixup_Mips_GOT_LO16:
+ case Mips::fixup_Mips_GOTTPREL:
+ case Mips::fixup_Mips_DTPREL_HI:
+ case Mips::fixup_Mips_DTPREL_LO:
+ case Mips::fixup_Mips_TLSGD:
+ case Mips::fixup_Mips_TLSLDM:
+ case Mips::fixup_Mips_TPREL_HI:
+ case Mips::fixup_Mips_TPREL_LO:
+ case Mips::fixup_MICROMIPS_CALL16:
+ case Mips::fixup_MICROMIPS_GOT_DISP:
+ case Mips::fixup_MICROMIPS_GOT_PAGE:
+ case Mips::fixup_MICROMIPS_GOT_OFST:
+ case Mips::fixup_MICROMIPS_GOT16:
+ case Mips::fixup_MICROMIPS_GOTTPREL:
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+ case Mips::fixup_MICROMIPS_TLS_GD:
+ case Mips::fixup_MICROMIPS_TLS_LDM:
+ case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+ case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+ return true;
+ }
+}
+
MCAsmBackend *llvm::createMipsAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 406b820edae5..3d5e16fcf9b4 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -29,20 +29,21 @@ class Target;
class MipsAsmBackend : public MCAsmBackend {
Triple TheTriple;
- bool IsLittle; // Big or little endian
bool IsN32;
public:
MipsAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT,
StringRef CPU, bool N32)
- : TheTriple(TT), IsLittle(TT.isLittleEndian()), IsN32(N32) {}
+ : MCAsmBackend(TT.isLittleEndian() ? support::little : support::big),
+ TheTriple(TT), IsN32(N32) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -58,7 +59,8 @@ public:
/// relaxation.
///
/// \param Inst - The instruction to test.
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
return false;
}
@@ -83,7 +85,10 @@ public:
/// @}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
}; // class MipsAsmBackend
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 3c67743947cb..3dc753772e5f 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -56,8 +56,7 @@ raw_ostream &operator<<(raw_ostream &OS, const MipsRelocationEntry &RHS) {
class MipsELFObjectWriter : public MCELFObjectTargetWriter {
public:
- MipsELFObjectWriter(uint8_t OSABI, bool HasRelocationAddend, bool Is64,
- bool IsLittleEndian);
+ MipsELFObjectWriter(uint8_t OSABI, bool HasRelocationAddend, bool Is64);
~MipsELFObjectWriter() override = default;
@@ -116,15 +115,15 @@ static InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
for (InputIt I = First; I != Last; ++I) {
unsigned Matched = Predicate(*I);
if (Matched != FindBest_NoMatch) {
- DEBUG(dbgs() << std::distance(First, I) << " is a match (";
- I->print(dbgs()); dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << std::distance(First, I) << " is a match (";
+ I->print(dbgs()); dbgs() << ")\n");
if (Best == Last || BetterThan(*I, *Best)) {
- DEBUG(dbgs() << ".. and it beats the last one\n");
+ LLVM_DEBUG(dbgs() << ".. and it beats the last one\n");
Best = I;
}
}
if (Matched == FindBest_PerfectMatch) {
- DEBUG(dbgs() << ".. and it is unbeatable\n");
+ LLVM_DEBUG(dbgs() << ".. and it is unbeatable\n");
break;
}
}
@@ -148,7 +147,8 @@ static unsigned getMatchingLoType(const ELFRelocationEntry &Reloc) {
if (Type == ELF::R_MIPS16_HI16)
return ELF::R_MIPS16_LO16;
- if (Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL)
+ if (Reloc.OriginalSymbol &&
+ Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL)
return ELF::R_MIPS_NONE;
if (Type == ELF::R_MIPS_GOT16)
@@ -211,8 +211,7 @@ static void dumpRelocs(const char *Prefix, const Container &Relocs) {
#endif
MipsELFObjectWriter::MipsELFObjectWriter(uint8_t OSABI,
- bool HasRelocationAddend, bool Is64,
- bool IsLittleEndian)
+ bool HasRelocationAddend, bool Is64)
: MCELFObjectTargetWriter(Is64, OSABI, ELF::EM_MIPS, HasRelocationAddend) {}
unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
@@ -331,6 +330,13 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
return Type;
}
+ case Mips::fixup_MICROMIPS_GPOFF_HI: {
+ unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+ Type = setRType((unsigned)ELF::R_MICROMIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MICROMIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MICROMIPS_HI16, Type);
+ return Type;
+ }
case Mips::fixup_Mips_GPOFF_LO: {
unsigned Type = (unsigned)ELF::R_MIPS_NONE;
Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
@@ -338,6 +344,13 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
return Type;
}
+ case Mips::fixup_MICROMIPS_GPOFF_LO: {
+ unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+ Type = setRType((unsigned)ELF::R_MICROMIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MICROMIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MICROMIPS_LO16, Type);
+ return Type;
+ }
case Mips::fixup_Mips_HIGHER:
return ELF::R_MIPS_HIGHER;
case Mips::fixup_Mips_HIGHEST:
@@ -384,6 +397,10 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_MICROMIPS_TLS_TPREL_LO16;
case Mips::fixup_MICROMIPS_SUB:
return ELF::R_MICROMIPS_SUB;
+ case Mips::fixup_MICROMIPS_HIGHER:
+ return ELF::R_MICROMIPS_HIGHER;
+ case Mips::fixup_MICROMIPS_HIGHEST:
+ return ELF::R_MICROMIPS_HIGHEST;
}
llvm_unreachable("invalid fixup kind!");
@@ -436,15 +453,15 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
return;
// Sort relocations by the address they are applied to.
- std::sort(Relocs.begin(), Relocs.end(),
- [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
- return A.Offset < B.Offset;
- });
+ llvm::sort(Relocs.begin(), Relocs.end(),
+ [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+ return A.Offset < B.Offset;
+ });
std::list<MipsRelocationEntry> Sorted;
std::list<ELFRelocationEntry> Remainder;
- DEBUG(dumpRelocs("R: ", Relocs));
+ LLVM_DEBUG(dumpRelocs("R: ", Relocs));
// Separate the movable relocations (AHL relocations using the high bits) from
// the immobile relocations (everything else). This does not preserve high/low
@@ -455,7 +472,7 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
});
for (auto &R : Remainder) {
- DEBUG(dbgs() << "Matching: " << R << "\n");
+ LLVM_DEBUG(dbgs() << "Matching: " << R << "\n");
unsigned MatchingType = getMatchingLoType(R);
assert(MatchingType != ELF::R_MIPS_NONE &&
@@ -490,7 +507,7 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
Sorted.insert(InsertionPoint, R)->Matched = true;
}
- DEBUG(dumpRelocs("S: ", Sorted));
+ LLVM_DEBUG(dumpRelocs("S: ", Sorted));
assert(Relocs.size() == Sorted.size() && "Some relocs were not consumed");
@@ -658,13 +675,11 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createMipsELFObjectWriter(raw_pwrite_stream &OS, const Triple &TT,
- bool IsN32) {
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createMipsELFObjectWriter(const Triple &TT, bool IsN32) {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
bool IsN64 = TT.isArch64Bit() && !IsN32;
bool HasRelocationAddend = TT.isArch64Bit();
- auto MOTW = llvm::make_unique<MipsELFObjectWriter>(
- OSABI, HasRelocationAddend, IsN64, TT.isLittleEndian());
- return createELFObjectWriter(std::move(MOTW), OS, TT.isLittleEndian());
+ return llvm::make_unique<MipsELFObjectWriter>(OSABI, HasRelocationAddend,
+ IsN64);
}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 4b8f9c7a680c..7b9a02503ce2 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/Casting.h"
@@ -23,9 +24,10 @@ using namespace llvm;
MipsELFStreamer::MipsELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) {
+ : MCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter)) {
RegInfoRecord = new MipsRegInfoRecord(this, Context);
MipsOptionRecords.push_back(
std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
@@ -84,6 +86,11 @@ void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
Labels.clear();
}
+void MipsELFStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
+ MCELFStreamer::EmitIntValue(Value, Size);
+ Labels.clear();
+}
+
void MipsELFStreamer::EmitMipsOptionRecords() {
for (const auto &I : MipsOptionRecords)
I->EmitMipsOptionRecord();
@@ -91,7 +98,8 @@ void MipsELFStreamer::EmitMipsOptionRecords() {
MCELFStreamer *llvm::createMipsELFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter,
+ std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- return new MipsELFStreamer(Context, std::move(MAB), OS, std::move(Emitter));
+ return new MipsELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter));
}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 2fe9b08b645a..d141f5d77c61 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -34,7 +34,7 @@ class MipsELFStreamer : public MCELFStreamer {
public:
MipsELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter);
/// Overriding this function allows us to add arbitrary behaviour before the
@@ -54,9 +54,11 @@ public:
void SwitchSection(MCSection *Section,
const MCExpr *Subsection = nullptr) override;
- /// Overriding this function allows us to dismiss all labels that are
- /// candidates for marking as microMIPS when .word directive is emitted.
+ /// Overriding these functions allows us to dismiss all labels that are
+  /// candidates for marking as microMIPS when .word/.long/.4byte etc.
+ /// directives are emitted.
void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
+ void EmitIntValue(uint64_t Value, unsigned Size) override;
/// Emits all the option records stored up until the point it's called.
void EmitMipsOptionRecords();
@@ -67,7 +69,7 @@ public:
MCELFStreamer *createMipsELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 6148a1b622c8..fdb560f3c72f 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -96,10 +96,14 @@ namespace Mips {
fixup_Mips_Branch_PCRel,
// resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+ // R_MICROMIPS_GPREL16/R_MICROMIPS_SUB/R_MICROMIPS_HI16
fixup_Mips_GPOFF_HI,
+ fixup_MICROMIPS_GPOFF_HI,
// resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+ // R_MICROMIPS_GPREL16/R_MICROMIPS_SUB/R_MICROMIPS_LO16
fixup_Mips_GPOFF_LO,
+ fixup_MICROMIPS_GPOFF_LO,
// resulting in - R_MIPS_PAGE
fixup_Mips_GOT_PAGE,
@@ -110,11 +114,13 @@ namespace Mips {
// resulting in - R_MIPS_GOT_DISP
fixup_Mips_GOT_DISP,
- // resulting in - R_MIPS_GOT_HIGHER
+ // resulting in - R_MIPS_HIGHER/R_MICROMIPS_HIGHER
fixup_Mips_HIGHER,
+ fixup_MICROMIPS_HIGHER,
- // resulting in - R_MIPS_HIGHEST
+ // resulting in - R_MIPS_HIGHEST/R_MICROMIPS_HIGHEST
fixup_Mips_HIGHEST,
+ fixup_MICROMIPS_HIGHEST,
// resulting in - R_MIPS_GOT_HI16
fixup_Mips_GOT_HI16,
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index e63304220ae5..f498d830c8f0 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -21,16 +21,14 @@ void MipsMCAsmInfo::anchor() { }
MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
IsLittleEndian = TheTriple.isLittleEndian();
- if ((TheTriple.getArch() == Triple::mips64el) ||
- (TheTriple.getArch() == Triple::mips64)) {
+ if (TheTriple.isMIPS64()) {
CodePointerSize = CalleeSaveStackSlotSize = 8;
}
// FIXME: This condition isn't quite right but it's the best we can do until
// this object can identify the ABI. It will misbehave when using O32
// on a mips64*-* triple.
- if ((TheTriple.getArch() == Triple::mipsel) ||
- (TheTriple.getArch() == Triple::mips)) {
+ if (TheTriple.isMIPS32()) {
PrivateGlobalPrefix = "$";
PrivateLabelPrefix = "$";
}
@@ -54,8 +52,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
HasMipsExpressions = true;
// Enable IAS by default for O32.
- if (TheTriple.getArch() == Triple::mips ||
- TheTriple.getArch() == Triple::mipsel)
+ if (TheTriple.isMIPS32())
UseIntegratedAssembler = true;
// Enable IAS by default for Debian mips64/mips64el.
@@ -65,4 +62,9 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
// Enable IAS by default for Android mips64el that uses N64 ABI.
if (TheTriple.getArch() == Triple::mips64el && TheTriple.isAndroid())
UseIntegratedAssembler = true;
+
+ // Enable IAS by default for FreeBSD / OpenBSD mips64/mips64el.
+ if (TheTriple.isOSFreeBSD() ||
+ TheTriple.isOSOpenBSD())
+ UseIntegratedAssembler = true;
}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 2f6dd0035de3..cd34b0ab70b4 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -656,27 +656,29 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
break;
case MipsMCExpr::MEK_LO:
// Check for %lo(%neg(%gp_rel(X)))
- if (MipsExpr->isGpOff()) {
- FixupKind = Mips::fixup_Mips_GPOFF_LO;
- break;
- }
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
- : Mips::fixup_Mips_LO16;
+ if (MipsExpr->isGpOff())
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_LO
+ : Mips::fixup_Mips_GPOFF_LO;
+ else
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+ : Mips::fixup_Mips_LO16;
break;
case MipsMCExpr::MEK_HIGHEST:
- FixupKind = Mips::fixup_Mips_HIGHEST;
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHEST
+ : Mips::fixup_Mips_HIGHEST;
break;
case MipsMCExpr::MEK_HIGHER:
- FixupKind = Mips::fixup_Mips_HIGHER;
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHER
+ : Mips::fixup_Mips_HIGHER;
break;
case MipsMCExpr::MEK_HI:
// Check for %hi(%neg(%gp_rel(X)))
- if (MipsExpr->isGpOff()) {
- FixupKind = Mips::fixup_Mips_GPOFF_HI;
- break;
- }
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
- : Mips::fixup_Mips_HI16;
+ if (MipsExpr->isGpOff())
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_HI
+ : Mips::fixup_Mips_GPOFF_HI;
+ else
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+ : Mips::fixup_Mips_HI16;
break;
case MipsMCExpr::MEK_PCREL_HI16:
FixupKind = Mips::fixup_MIPS_PCHI16;
@@ -1058,13 +1060,6 @@ MipsMCCodeEmitter::getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
}
unsigned
-MipsMCCodeEmitter::getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
-}
-
-unsigned
MipsMCCodeEmitter::getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 1e840114b2b3..09d50d4776ba 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -245,10 +245,6 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- unsigned getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
unsigned getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index dfacf4354516..988629ed1bca 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -24,7 +24,7 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg);
// This function creates an MCELFStreamer for Mips NaCl.
MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll);
}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 8fcd8aa4c19b..ce208b7f98bc 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -23,6 +23,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -46,7 +47,7 @@ using namespace llvm;
/// FIXME: Merge with the copy in MipsSubtarget.cpp
StringRef MIPS_MC::selectMipsCPU(const Triple &TT, StringRef CPU) {
if (CPU.empty() || CPU == "generic") {
- if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
+ if (TT.isMIPS32())
CPU = "mips32";
else
CPU = "mips64";
@@ -93,15 +94,15 @@ static MCInstPrinter *createMipsMCInstPrinter(const Triple &T,
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter,
bool RelaxAll) {
MCStreamer *S;
if (!T.isOSNaCl())
- S = createMipsELFStreamer(Context, std::move(MAB), OS, std::move(Emitter),
- RelaxAll);
+ S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter), RelaxAll);
else
- S = createMipsNaClELFStreamer(Context, std::move(MAB), OS,
+ S = createMipsNaClELFStreamer(Context, std::move(MAB), std::move(OW),
std::move(Emitter), RelaxAll);
return S;
}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 5dab6c3e81d6..4fc174ab5871 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -23,7 +23,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -49,8 +49,8 @@ MCAsmBackend *createMipsAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createMipsELFObjectWriter(raw_pwrite_stream &OS, const Triple &TT, bool IsN32);
+std::unique_ptr<MCObjectTargetWriter>
+createMipsELFObjectWriter(const Triple &TT, bool IsN32);
namespace MIPS_MC {
StringRef selectMipsCPU(const Triple &TT, StringRef CPU);
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index d878cf82e26d..6bf62ea618b4 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
@@ -43,9 +44,10 @@ const unsigned LoadStoreStackMaskReg = Mips::T7;
class MipsNaClELFStreamer : public MipsELFStreamer {
public:
MipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MipsELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)) {}
+ : MipsELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)) {}
~MipsNaClELFStreamer() override = default;
@@ -260,11 +262,11 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg) {
MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
bool RelaxAll) {
- MipsNaClELFStreamer *S =
- new MipsNaClELFStreamer(Context, std::move(TAB), OS, std::move(Emitter));
+ MipsNaClELFStreamer *S = new MipsNaClELFStreamer(
+ Context, std::move(TAB), std::move(OW), std::move(Emitter));
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index fb4e1ba0ded9..1eb21b6cc826 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -52,6 +52,12 @@ void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetMt() {}
void MipsTargetStreamer::emitDirectiveSetNoMt() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetCRC() {}
+void MipsTargetStreamer::emitDirectiveSetNoCRC() {}
+void MipsTargetStreamer::emitDirectiveSetVirt() {}
+void MipsTargetStreamer::emitDirectiveSetNoVirt() {}
+void MipsTargetStreamer::emitDirectiveSetGINV() {}
+void MipsTargetStreamer::emitDirectiveSetNoGINV() {}
void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
forbidModuleDirective();
@@ -122,6 +128,12 @@ void MipsTargetStreamer::emitDirectiveModuleOddSPReg() {
void MipsTargetStreamer::emitDirectiveModuleSoftFloat() {}
void MipsTargetStreamer::emitDirectiveModuleHardFloat() {}
void MipsTargetStreamer::emitDirectiveModuleMT() {}
+void MipsTargetStreamer::emitDirectiveModuleCRC() {}
+void MipsTargetStreamer::emitDirectiveModuleNoCRC() {}
+void MipsTargetStreamer::emitDirectiveModuleVirt() {}
+void MipsTargetStreamer::emitDirectiveModuleNoVirt() {}
+void MipsTargetStreamer::emitDirectiveModuleGINV() {}
+void MipsTargetStreamer::emitDirectiveModuleNoGINV() {}
void MipsTargetStreamer::emitDirectiveSetFp(
MipsABIFlagsSection::FpABIKind Value) {
forbidModuleDirective();
@@ -421,6 +433,36 @@ void MipsTargetAsmStreamer::emitDirectiveSetNoMt() {
MipsTargetStreamer::emitDirectiveSetNoMt();
}
+void MipsTargetAsmStreamer::emitDirectiveSetCRC() {
+ OS << "\t.set\tcrc\n";
+ MipsTargetStreamer::emitDirectiveSetCRC();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoCRC() {
+ OS << "\t.set\tnocrc\n";
+ MipsTargetStreamer::emitDirectiveSetNoCRC();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetVirt() {
+ OS << "\t.set\tvirt\n";
+ MipsTargetStreamer::emitDirectiveSetVirt();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoVirt() {
+ OS << "\t.set\tnovirt\n";
+ MipsTargetStreamer::emitDirectiveSetNoVirt();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetGINV() {
+ OS << "\t.set\tginv\n";
+ MipsTargetStreamer::emitDirectiveSetGINV();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoGINV() {
+ OS << "\t.set\tnoginv\n";
+ MipsTargetStreamer::emitDirectiveSetNoGINV();
+}
+
void MipsTargetAsmStreamer::emitDirectiveSetAt() {
OS << "\t.set\tat\n";
MipsTargetStreamer::emitDirectiveSetAt();
@@ -694,6 +736,30 @@ void MipsTargetAsmStreamer::emitDirectiveModuleMT() {
OS << "\t.module\tmt\n";
}
+void MipsTargetAsmStreamer::emitDirectiveModuleCRC() {
+ OS << "\t.module\tcrc\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleNoCRC() {
+ OS << "\t.module\tnocrc\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleVirt() {
+ OS << "\t.module\tvirt\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleNoVirt() {
+ OS << "\t.module\tnovirt\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleGINV() {
+ OS << "\t.module\tginv\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleNoGINV() {
+ OS << "\t.module\tnoginv\n";
+}
+
// This part is for ELF object output.
MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
index 25048293714d..ed5b8dd71a51 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -17,12 +17,6 @@ class MMR6Arch<string opstr> {
string DecoderNamespace = "MicroMipsR6";
}
-// Class used for microMIPS32r6 instructions.
-class MicroMipsR6Inst16 : PredicateControl {
- string DecoderNamespace = "MicroMipsR6";
- let InsnPredicates = [HasMicroMips32r6];
-}
-
//===----------------------------------------------------------------------===//
//
// Disambiguators
@@ -50,7 +44,7 @@ class BC16_FM_MM16R6 {
let Inst{9-0} = offset;
}
-class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 {
+class BEQZC_BNEZC_FM_MM16R6<bits<6> op> {
bits<3> rs;
bits<7> offset;
@@ -174,22 +168,6 @@ class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
let Inst{15-0} = imm16;
}
-class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst {
- bits<21> addr;
- bits<5> hint;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = op;
- let Inst{25-21} = hint;
- let Inst{20-16} = base;
- let Inst{15-12} = 0b1010;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
class LB32_FM_MMR6 : MipsR6Inst {
bits<21> addr;
bits<5> rt;
@@ -218,34 +196,6 @@ class LBU32_FM_MMR6 : MipsR6Inst {
let Inst{15-0} = offset;
}
-class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst {
- bits<21> addr;
- bits<5> rt;
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b011000;
- let Inst{25-21} = rt;
- let Inst{20-16} = addr{20-16};
- let Inst{15-12} = 0b0110;
- let Inst{11-9} = funct;
- let Inst{8-0} = addr{8-0};
-}
-
-class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct>
- : MMR6Arch<instr_asm> {
- bits<5> rd;
- bits<5> rt;
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b000000;
- let Inst{25-21} = rd;
- let Inst{20-16} = rt;
- let Inst{15-6} = funct;
- let Inst{5-0} = 0b111100;
-}
-
class PCREL19_FM_MMR6<bits<2> funct> : MipsR6Inst {
bits<5> rt;
bits<19> imm;
@@ -436,38 +386,6 @@ class SB32_SH32_STORE_FM_MMR6<bits<6> op> {
let Inst{15-0} = offset;
}
-class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> {
- bits<5> rt;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b011000;
- let Inst{25-21} = rt;
- let Inst{20-16} = base;
- let Inst{15-12} = 0b1010;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
-class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> {
- bits<5> rt;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = 0b011000;
- let Inst{25-21} = rt;
- let Inst{20-16} = base;
- let Inst{15-12} = 0b0110;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
class LOAD_WORD_FM_MMR6 {
bits<5> rt;
bits<21> addr;
@@ -631,23 +549,6 @@ class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
let Inst{15-0} = addr{15-0};
}
-class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt,
- bits<3> funct> : MMR6Arch<instr_asm> {
- bits<5> rt;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<9> offset = addr{8-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = op;
- let Inst{25-21} = rt;
- let Inst{20-16} = base;
- let Inst{15-12} = fmt;
- let Inst{11-9} = funct;
- let Inst{8-0} = offset;
-}
-
class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct>
: MMR6Arch<instr_asm>, MipsR6Inst {
bits<5> ft;
@@ -791,7 +692,7 @@ class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
let Inst{5-0} = 0b111011;
}
-class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16A_ADDU16_FM_MMR6 {
bits<3> rs;
bits<3> rt;
bits<3> rd;
@@ -805,7 +706,7 @@ class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{0} = 0;
}
-class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16C_AND16_FM_MMR6 {
bits<3> rt;
bits<3> rs;
@@ -817,7 +718,7 @@ class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{3-0} = 0b0001;
}
-class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16C_NOT16_FM_MMR6 {
bits<3> rt;
bits<3> rs;
@@ -829,7 +730,7 @@ class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{3-0} = 0b0000;
}
-class POOL16C_MOVEP16_FM_MMR6 : MicroMipsR6Inst16 {
+class POOL16C_MOVEP16_FM_MMR6 {
bits<3> dst_regs;
bits<3> rt;
bits<3> rs;
@@ -844,7 +745,7 @@ class POOL16C_MOVEP16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{1-0} = rs{1-0};
}
-class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> : MicroMipsR6Inst16 {
+class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> {
bits<3> rt;
bits<3> rs;
@@ -879,7 +780,8 @@ class POOL16A_SUBU16_FM_MMR6 {
let Inst{0} = 0b1;
}
-class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst {
+class POOL32A_WRPGPR_WSBH_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
bits<5> rt;
bits<5> rs;
@@ -981,6 +883,23 @@ class POOL32A_MFTC0_FM_MMR6<string instr_asm, bits<5> funct, bits<6> opcode>
let Inst{5-0} = opcode;
}
+class POOL32A_GINV_FM_MMR6<string instr_asm, bits<2> ginv>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rs;
+ bits<2> type;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-21} = 0x0;
+ let Inst{20-16} = rs;
+ let Inst{15-13} = 0b011;
+ let Inst{12-11} = ginv;
+ let Inst{10-9} = type;
+ let Inst{8-6} = 0b101;
+ let Inst{5-0} = 0b111100;
+}
+
class POOL32F_MFTC1_FM_MMR6<string instr_asm, bits<8> funct>
: MMR6Arch<instr_asm> {
bits<5> rt;
@@ -1037,21 +956,6 @@ class POOL32A_DVPEVP_FM_MMR6<string instr_asm, bits<10> funct>
let Inst{5-0} = 0b111100;
}
-class POOL32B_LWP_SWP_FM_MMR6<bits<4> funct> : MipsR6Inst {
- bits<5> rd;
- bits<21> addr;
- bits<5> base = addr{20-16};
- bits<12> offset = addr{11-0};
-
- bits<32> Inst;
-
- let Inst{31-26} = 0x8;
- let Inst{25-21} = rd;
- let Inst{20-16} = base;
- let Inst{15-12} = funct;
- let Inst{11-0} = offset;
-}
-
class CMP_BRANCH_OFF21_FM_MMR6<string opstr, bits<6> funct> : MipsR6Inst {
bits<5> rs;
bits<21> offset;
@@ -1107,3 +1011,21 @@ class POOL32B_LDWC2_SDWC2_FM_MMR6<string instr_asm, bits<4> funct>
let Inst{11} = 0;
let Inst{10-0} = offset;
}
+
+class POOL32C_LL_E_SC_E_FM_MMR6<string instr_asm, bits<4> majorFunc,
+ bits<3> minorFunc>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = majorFunc;
+ let Inst{11-9} = minorFunc;
+ let Inst{8-0} = offset;
+}
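(As a worked illustration of the POOL32A_GINV_FM_MMR6 format added above, derived directly from its field assignments; 32-bit word, bit 31 on the left:

	bits:   31-26   25-21   20-16  15-13  12-11  10-9   8-6   5-0
	value:  000000  00000   rs     011    ginv   type   101   111100

so the two microMIPS GINV encodings differ only in the 2-bit ginv field, bits 12-11.)
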
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 326897dc5c63..f795112ae2b7 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -106,20 +106,20 @@ class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>;
class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>;
class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0b1110001101>;
class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">;
+class GINVI_MMR6_ENC : POOL32A_GINV_FM_MMR6<"ginvi", 0b00>;
+class GINVT_MMR6_ENC : POOL32A_GINV_FM_MMR6<"ginvt", 0b10>;
class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>;
class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>;
class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>;
class JRC16_MMR6_ENC: POOL16C_JALRC_FM_MM16R6<0x3>;
class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>;
class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>;
-class LWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x1>;
class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>;
class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>;
class MFC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfc0", 0b00011, 0b111100>;
class MFC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfc1", 0b10000000>;
class MFC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfc2", 0b0100110100>;
class MFHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfhc0", 0b00011, 0b110100>;
-class MFHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfhc1", 0b11000000>;
class MFHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfhc2", 0b1000110100>;
class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>;
class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>;
@@ -131,15 +131,12 @@ class MTC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mtc0", 0b01011, 0b111100>;
class MTC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mtc1", 0b10100000>;
class MTC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mtc2", 0b0101110100>;
class MTHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mthc0", 0b01011, 0b110100>;
-class MTHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mthc1", 0b11100000>;
class MTHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mthc2", 0b1001110100>;
class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>;
class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>;
class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>;
class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>;
class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>;
-class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>;
-class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>;
class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>;
class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>;
class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>;
@@ -147,19 +144,13 @@ class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>;
class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>;
class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>;
class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>;
-class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>;
class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>;
class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>;
class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>;
-class SWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x9>;
-class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>;
-class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>;
-class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>;
-class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>;
+class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<"wrpgpr", 0x3c5>;
+class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<"wsbh", 0x1ec>;
class LB_MMR6_ENC : LB32_FM_MMR6;
class LBU_MMR6_ENC : LBU32_FM_MMR6;
-class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>;
-class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>;
class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>;
class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6;
class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">;
@@ -184,15 +175,8 @@ class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>;
class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>;
class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>;
class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>;
-class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>;
-class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>;
class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>;
-class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>;
-class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>;
class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>;
-class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>;
-class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>;
-class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>;
class LW_MMR6_ENC : LOAD_WORD_FM_MMR6;
class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6;
class JALRC_HB_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc.hb", 0b0001111100>;
@@ -221,11 +205,11 @@ class BOVC_MMR6_ENC : POP35_BOVC_FM_MMR6<"bovc">;
class BNVC_MMR6_ENC : POP37_BNVC_FM_MMR6<"bnvc">;
class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6;
class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6;
-class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16;
+class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>;
class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6;
class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>;
-class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16;
-class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16;
+class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>;
+class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>;
class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>;
class LI16_MMR6_ENC : LI_FM_MM16;
class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
@@ -248,23 +232,20 @@ class SDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"sdc2", 0b1010>;
class LWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"lwc2", 0b0000>;
class SWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"swc2", 0b1000>;
+class LL_MMR6_ENC : POOL32C_LL_E_SC_E_FM_MMR6<"ll", 0b0011, 0b000>;
+class SC_MMR6_ENC : POOL32C_LL_E_SC_E_FM_MMR6<"sc", 0b1011, 0b000>;
+
/// Floating Point Instructions
class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>;
-class FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>;
class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>;
-class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>;
class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>;
-class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>;
class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>;
-class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>;
class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>;
class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>;
class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>;
class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>;
class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>;
-class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>;
class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>;
-class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>;
class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>;
class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>;
class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>;
@@ -277,11 +258,7 @@ class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>;
class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>;
class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>;
class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>;
-class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>;
-class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>;
-class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>;
class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>;
-class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>;
class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>;
class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>;
@@ -390,7 +367,7 @@ class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm, II_BC> {
class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
!strconcat("bc16", "\t$offset"), [],
II_BC, FrmI>,
- MMR6Arch<"bc16">, MicroMipsR6Inst16 {
+ MMR6Arch<"bc16"> {
let isBranch = 1;
let isTerminator = 1;
let isBarrier = 1;
@@ -400,7 +377,8 @@ class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
}
class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm>
- : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> {
+ : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>,
+ MMR6Arch<instr_asm> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 0;
@@ -441,17 +419,6 @@ class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd,
class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd,
II_PREF>;
-class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
- RegisterOperand GPROpnd, InstrItinClass Itin>
- : CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd, GPROpnd, Itin> {
- string DecoderMethod = "DecodePrefeOpMM";
-}
-
-class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9,
- GPR32Opnd, II_PREFE>;
-class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9,
- GPR32Opnd, II_CACHEE>;
-
class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
RegisterOperand GPROpnd, InstrItinClass Itin>
: MMR6Arch<instr_asm> {
@@ -466,16 +433,6 @@ class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd, II_LB>;
class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd,
II_LBU>;
-class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
- RegisterOperand GPROpnd, InstrItinClass Itin>
- : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd, Itin> {
- let DecoderMethod = "DecodeLoadByte9";
-}
-class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd,
- II_LBE>;
-class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd,
- II_LBUE>;
-
class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass Itin> : MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rt);
@@ -498,7 +455,7 @@ class ERETNC_MMR6_DESC : ER_FT<"eretnc", II_ERETNC>;
class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
: MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
[(MipsJmpLink RO:$rs)], II_JALR, FrmR>,
- MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ MMR6Arch<opstr> {
let isCall = 1;
let hasDelaySlot = 0;
let Defs = [RA];
@@ -532,7 +489,7 @@ class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16,
class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
: MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
[], II_JR, FrmR>,
- MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ MMR6Arch<opstr> {
let hasDelaySlot = 0;
let isBranch = 1;
let isIndirectBranch = 1;
@@ -542,7 +499,7 @@ class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>;
class JRCADDIUSP_MMR6_DESC
: MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm",
[], II_JRADDIUSP, FrmR>,
- MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 {
+ MMR6Arch<"jrcaddiusp"> {
let hasDelaySlot = 0;
let isTerminator = 1;
let isBarrier = 1;
@@ -574,8 +531,6 @@ class AUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
class AUI_MMR6_DESC : AUI_MMR6_DESC_BASE<"aui", GPR32Opnd, II_AUI>;
-class SEB_MMR6_DESC : SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>;
-class SEH_MMR6_DESC : SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>;
class ALUIPC_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass Itin> : MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rt);
@@ -615,32 +570,6 @@ class ADDIUPC_MMR6_DESC : PCREL_MMR6_DESC_BASE<"addiupc", GPR32Opnd,
class LWPC_MMR6_DESC: PCREL_MMR6_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2,
II_LWPC>;
-class LWP_MMR6_DESC : MMR6Arch<"lwp"> {
- dag OutOperandList = (outs regpair:$rd);
- dag InOperandList = (ins mem_simm12:$addr);
- string AsmString = !strconcat("lwp", "\t$rd, $addr");
- list<dag> Pattern = [];
- InstrItinClass Itinerary = II_LWP;
- ComplexPattern Addr = addr;
- Format f = FrmI;
- string BaseOpcode = "lwp";
- string DecoderMethod = "DecodeMemMMImm12";
- bit mayLoad = 1;
-}
-
-class SWP_MMR6_DESC : MMR6Arch<"swp"> {
- dag OutOperandList = (outs);
- dag InOperandList = (ins regpair:$rd, mem_simm12:$addr);
- string AsmString = !strconcat("swp", "\t$rd, $addr");
- list<dag> Pattern = [];
- InstrItinClass Itinerary = II_SWP;
- ComplexPattern Addr = addr;
- Format f = FrmI;
- string BaseOpcode = "swp";
- string DecoderMethod = "DecodeMemMMImm12";
- bit mayStore = 1;
-}
-
class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass Itin> : MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
@@ -704,23 +633,11 @@ class ORI_MMR6_DESC : ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>;
class XORI_MMR6_DESC : ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
immZExt16, xor>;
-
-class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO,
- InstrItinClass Itin = NoItinerary,
- SDPatternOperator OpNode = null_frag,
- ComplexPattern Addr = addr> :
- InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
- let DecoderMethod = "DecodeMem";
- let mayStore = 1;
-}
class SW_MMR6_DESC : Store<"sw", GPR32Opnd> {
InstrItinClass Itinerary = II_SW;
}
-class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9, II_SWE>;
-
class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
- InstrItinClass Itin> : MMR6Arch<instr_asm> {
+ InstrItinClass Itin> {
dag InOperandList = (ins RO:$rs);
dag OutOperandList = (outs RO:$rt);
string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
@@ -789,12 +706,6 @@ class MTC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mtc2", COP2Opnd, GPR32Opnd,
II_MTC2>;
class MTHC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mthc0", COP0Opnd, GPR32Opnd,
II_MTHC0>;
-class MTHC1_D32_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", AFGR64Opnd,
- GPR32Opnd, II_MTC1>,
- HARDFLOAT, FGR_32;
-class MTHC1_D64_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", FGR64Opnd,
- GPR32Opnd, II_MTC1>,
- HARDFLOAT, FGR_64;
class MTHC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mthc2", COP2Opnd, GPR32Opnd,
II_MTC2>;
@@ -838,10 +749,6 @@ class MFC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfc2", GPR32Opnd, COP2Opnd,
II_MFC2>;
class MFHC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfhc0", GPR32Opnd, COP0Opnd,
II_MFHC0>;
-class MFHC1_D32_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, AFGR64Opnd,
- II_MFHC1>, HARDFLOAT, FGR_32;
-class MFHC1_D64_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, FGR64Opnd,
- II_MFHC1>, HARDFLOAT, FGR_64;
class MFHC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfhc2", GPR32Opnd, COP2Opnd,
II_MFC2>;
@@ -897,6 +804,49 @@ class SDC2_SWC2_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
class SDC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"sdc2", II_SDC2>;
class SWC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"swc2", II_SWC2>;
+class GINV_MMR6_DESC_BASE<string opstr,
+ RegisterOperand SrcRC, InstrItinClass Itin> {
+ dag InOperandList = (ins SrcRC:$rs, uimm2:$type);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(opstr, "\t$rs, $type");
+ list<dag> Pattern = [];
+ Format f = FrmFR;
+ string BaseOpcode = opstr;
+ InstrItinClass Itinerary = Itin;
+}
+
+class GINVI_MMR6_DESC : GINV_MMR6_DESC_BASE<"ginvi", GPR32Opnd,
+ II_GINVI> {
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = "ginvi\t$rs";
+}
+class GINVT_MMR6_DESC : GINV_MMR6_DESC_BASE<"ginvt", GPR32Opnd,
+ II_GINVT>;
+
+class SC_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$dst);
+ dag InOperandList = (ins GPR32Opnd:$rt, mem_mm_9:$addr);
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = opstr;
+ bit mayStore = 1;
+ string Constraints = "$rt = $dst";
+ string DecoderMethod = "DecodeMemMMImm9";
+}
+
+class LL_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins mem_mm_9:$addr);
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ InstrItinClass Itinerary = itin;
+ string BaseOpcode = opstr;
+ bit mayLoad = 1;
+ string DecoderMethod = "DecodeMemMMImm9";
+}
+
+class SC_MMR6_DESC : SC_MMR6_DESC_BASE<"sc", II_SC>;
+class LL_MMR6_DESC : LL_MMR6_DESC_BASE<"ll", II_LL>;
+
/// Floating Point Instructions
class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
InstrItinClass Itin, bit isComm,
@@ -910,20 +860,12 @@ class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
}
class FADD_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>;
-class FADD_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>;
class FSUB_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>;
-class FSUB_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>;
class FMUL_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>;
-class FMUL_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>;
class FDIV_S_MMR6_DESC
: FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>;
-class FDIV_D_MMR6_DESC
- : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>;
class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd,
II_MADDF_S>, HARDFLOAT;
class MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd,
@@ -946,12 +888,8 @@ class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
}
class FMOV_S_MMR6_DESC
: FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>;
-class FMOV_D_MMR6_DESC
- : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>;
class FNEG_S_MMR6_DESC
: FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>;
-class FNEG_D_MMR6_DESC
- : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>;
class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd, II_MAX_S>,
HARDFLOAT;
@@ -989,16 +927,8 @@ class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd,
II_CVT>;
class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd,
II_CVT>;
-class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd,
- II_CVT>;
-class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", FGR32Opnd, AFGR64Opnd,
- II_CVT>;
-class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", FGR32Opnd, AFGR64Opnd,
- II_CVT>;
class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd,
II_CVT>, FGR_64;
-class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", AFGR64Opnd, FGR32Opnd,
- II_CVT>;
class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd,
II_CVT>;
class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd,
@@ -1085,10 +1015,6 @@ class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd,
- II_ABS, fabs>;
-class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd,
- II_ABS, fabs>;
class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd,
FGR32Opnd, II_FLOOR>;
class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", FGR64Opnd,
@@ -1154,70 +1080,35 @@ class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO,
}
class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd, II_SB>;
-class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
- InstrItinClass Itin>
- : MMR6Arch<instr_asm>, MipsR6Inst {
- dag OutOperandList = (outs);
- dag InOperandList = (ins RO:$rt, mem_simm9:$addr);
- string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
- string DecoderMethod = "DecodeStoreEvaOpMM";
- bit mayStore = 1;
- InstrItinClass Itinerary = Itin;
-}
-class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd, II_SBE>;
-class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd, II_SCE>;
class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd, II_SH>;
-class SHE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd, II_SHE>;
-class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
- InstrItinClass Itin>
- : MMR6Arch<instr_asm>, MipsR6Inst {
- dag OutOperandList = (outs RO:$rt);
- dag InOperandList = (ins mem_simm9:$addr);
- string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
- string DecoderMethod = "DecodeMemMMImm9";
- bit mayLoad = 1;
- InstrItinClass Itinerary = Itin;
-}
-class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd, II_LLE>;
-class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd, II_LWE>;
class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
MMR6Arch<"addu16"> {
int AddedComplexity = 1;
}
-class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
- MMR6Arch<"and16"> {
- int AddedComplexity = 1;
-}
+class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND>,
+ MMR6Arch<"and16">;
class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>,
MMR6Arch<"andi16">;
class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16"> {
int AddedComplexity = 1;
}
-class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
- MMR6Arch<"or16"> {
- int AddedComplexity = 1;
-}
+class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR>, MMR6Arch<"or16">;
class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
MMR6Arch<"sll16">;
class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
MMR6Arch<"srl16">;
-class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16", II_BREAK>, MMR6Arch<"break16">,
- MicroMipsR6Inst16;
+class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16", II_BREAK>, MMR6Arch<"break16">;
class LI16_MMR6_DESC : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>,
- MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
-class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
- MicroMipsR6Inst16;
+ MMR6Arch<"li16">, IsAsCheapAsAMove;
+class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">;
class MOVEP_MMR6_DESC : MovePMM16<"movep", GPRMM16OpndMoveP>, MMR6Arch<"movep">;
-class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">,
- MicroMipsR6Inst16;
+class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">;
class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
- MMR6Arch<"subu16">, MicroMipsR6Inst16 {
- int AddedComplexity = 1;
-}
-class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
- MMR6Arch<"xor16"> {
+ MMR6Arch<"subu16"> {
int AddedComplexity = 1;
}
+class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR>,
+ MMR6Arch<"xor16">;
class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst {
dag OutOperandList = (outs GPR32Opnd:$rt);
@@ -1250,7 +1141,7 @@ class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst {
bit HasSideEffects = 1;
}
-class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> {
+class SYNCI_MMR6_DESC : SYNCI_FT<"synci", mem_mm_16> {
let DecoderMethod = "DecodeSynciR6";
}
@@ -1273,7 +1164,7 @@ class LWM16_MMR6_DESC
: MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
!strconcat("lwm16", "\t$rt, $addr"), [],
II_LWM, FrmI>,
- MMR6Arch<"lwm16">, MicroMipsR6Inst16 {
+ MMR6Arch<"lwm16"> {
let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
let mayLoad = 1;
ComplexPattern Addr = addr;
@@ -1283,7 +1174,7 @@ class SWM16_MMR6_DESC
: MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
!strconcat("swm16", "\t$rt, $addr"), [],
II_SWM, FrmI>,
- MMR6Arch<"swm16">, MicroMipsR6Inst16 {
+ MMR6Arch<"swm16"> {
let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
let mayStore = 1;
ComplexPattern Addr = addr;
@@ -1294,7 +1185,7 @@ class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO,
Operand MemOpnd>
: MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>,
- MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ MMR6Arch<opstr> {
let DecoderMethod = "DecodeMemMMImm4";
let mayStore = 1;
}
@@ -1308,7 +1199,7 @@ class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd,
class SWSP_MMR6_DESC
: MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset),
!strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>,
- MMR6Arch<"sw">, MicroMipsR6Inst16 {
+ MMR6Arch<"sw"> {
let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
let mayStore = 1;
}
@@ -1473,6 +1364,11 @@ def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6;
def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6;
def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC,
ISA_MICROMIPS32R6;
+def GINVI_MMR6 : R6MMR6Rel, GINVI_MMR6_ENC, GINVI_MMR6_DESC,
+ ISA_MICROMIPS32R6, ASE_GINV;
+def GINVT_MMR6 : R6MMR6Rel, GINVT_MMR6_ENC, GINVT_MMR6_DESC,
+ ISA_MICROMIPS32R6, ASE_GINV;
+let FastISelShouldIgnore = 1 in
def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC,
ISA_MICROMIPS32R6;
def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, JIALC_MMR6_DESC, ISA_MICROMIPS32R6;
@@ -1481,29 +1377,17 @@ def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6;
def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC,
ISA_MICROMIPS32R6;
def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6;
-def LWP_MMR6 : StdMMR6Rel, LWP_MMR6_ENC, LWP_MMR6_DESC, ISA_MICROMIPS32R6;
def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6;
def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6;
def MTC0_MMR6 : StdMMR6Rel, MTC0_MMR6_ENC, MTC0_MMR6_DESC, ISA_MICROMIPS32R6;
def MTC1_MMR6 : StdMMR6Rel, MTC1_MMR6_DESC, MTC1_MMR6_ENC, ISA_MICROMIPS32R6;
def MTC2_MMR6 : StdMMR6Rel, MTC2_MMR6_ENC, MTC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MTHC0_MMR6 : R6MMR6Rel, MTHC0_MMR6_ENC, MTHC0_MMR6_DESC, ISA_MICROMIPS32R6;
-def MTHC1_D32_MMR6 : StdMMR6Rel, MTHC1_D32_MMR6_DESC, MTHC1_MMR6_ENC, ISA_MICROMIPS32R6;
-let DecoderNamespace = "MicroMipsFP64" in {
- def MTHC1_D64_MMR6 : R6MMR6Rel, MTHC1_D64_MMR6_DESC, MTHC1_MMR6_ENC,
- ISA_MICROMIPS32R6;
-}
def MTHC2_MMR6 : StdMMR6Rel, MTHC2_MMR6_ENC, MTHC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MFC0_MMR6 : StdMMR6Rel, MFC0_MMR6_ENC, MFC0_MMR6_DESC, ISA_MICROMIPS32R6;
def MFC1_MMR6 : StdMMR6Rel, MFC1_MMR6_DESC, MFC1_MMR6_ENC, ISA_MICROMIPS32R6;
def MFC2_MMR6 : StdMMR6Rel, MFC2_MMR6_ENC, MFC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MFHC0_MMR6 : R6MMR6Rel, MFHC0_MMR6_ENC, MFHC0_MMR6_DESC, ISA_MICROMIPS32R6;
-def MFHC1_D32_MMR6 : StdMMR6Rel, MFHC1_D32_MMR6_DESC, MFHC1_MMR6_ENC,
- ISA_MICROMIPS32R6;
-let DecoderNamespace = "MicroMipsFP64" in {
- def MFHC1_D64_MMR6 : StdMMR6Rel, MFHC1_D64_MMR6_DESC, MFHC1_MMR6_ENC,
- ISA_MICROMIPS32R6;
-}
def MFHC2_MMR6 : StdMMR6Rel, MFHC2_MMR6_ENC, MFHC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6;
def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -1516,8 +1400,6 @@ def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6;
def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6;
def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6;
def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6;
-def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6;
-def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6;
def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC,
ISA_MICROMIPS32R6;
def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC,
@@ -1529,17 +1411,11 @@ def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6;
def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6;
def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6;
def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6;
-def SWP_MMR6 : StdMMR6Rel, SWP_MMR6_ENC, SWP_MMR6_DESC, ISA_MICROMIPS32R6;
-def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6;
-def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC,
- ISA_MICROMIPS32R6;
def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC,
ISA_MICROMIPS32R6;
def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6;
def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6;
def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6;
-def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6;
-def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6;
def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6;
def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6;
def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -1554,26 +1430,15 @@ def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
let DecoderMethod = "DecodeMemMMImm16" in {
def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, ISA_MICROMIPS32R6;
}
-let DecoderMethod = "DecodeMemMMImm9" in {
- def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6;
-}
/// Floating Point Instructions
def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC,
@@ -1584,12 +1449,8 @@ def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC,
ISA_MICROMIPS32R6;
def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6;
def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6;
def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6;
@@ -1608,24 +1469,14 @@ def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC,
ISA_MICROMIPS32R6;
def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC,
- ISA_MICROMIPS32R6;
def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC,
ISA_MICROMIPS32R6;
-def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC,
ISA_MICROMIPS32R6;
def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC,
ISA_MICROMIPS32R6;
defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd, II_CMP_CC_S>;
defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd, II_CMP_CC_D>;
-def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6;
-def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6;
def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def FLOOR_L_D_MMR6 : StdMMR6Rel, FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC,
@@ -1650,17 +1501,8 @@ def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC,
ISA_MICROMIPS32R6;
-def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC,
- ISA_MICROMIPS32R6;
def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6;
-def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6;
-def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6;
def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6;
-def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6;
-def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6;
-def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6;
def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6;
def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6;
def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC,
@@ -1747,6 +1589,8 @@ def LDC2_MMR6 : StdMMR6Rel, LDC2_MMR6_ENC, LDC2_MMR6_DESC, ISA_MICROMIPS32R6;
def SDC2_MMR6 : StdMMR6Rel, SDC2_MMR6_ENC, SDC2_MMR6_DESC, ISA_MICROMIPS32R6;
def LWC2_MMR6 : StdMMR6Rel, LWC2_MMR6_ENC, LWC2_MMR6_DESC, ISA_MICROMIPS32R6;
def SWC2_MMR6 : StdMMR6Rel, SWC2_MMR6_ENC, SWC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def LL_MMR6 : R6MMR6Rel, LL_MMR6_ENC, LL_MMR6_DESC, ISA_MICROMIPS32R6;
+def SC_MMR6 : R6MMR6Rel, SC_MMR6_ENC, SC_MMR6_DESC, ISA_MICROMIPS32R6;
}
def BOVC_MMR6 : R6MMR6Rel, BOVC_MMR6_ENC, BOVC_MMR6_DESC, ISA_MICROMIPS32R6,
@@ -1806,6 +1650,8 @@ def : MipsInstAlias<"mfhc0 $rt, $rs",
ISA_MICROMIPS32R6;
def : MipsInstAlias<"jalrc.hb $rs", (JALRC_HB_MMR6 RA, GPR32Opnd:$rs), 1>,
ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jal $offset", (BALC_MMR6 brtarget26_mm:$offset), 0>,
+ ISA_MICROMIPS32R6;
def : MipsInstAlias<"dvp", (DVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
def : MipsInstAlias<"evp", (EVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
def : MipsInstAlias<"jalrc $rs", (JALRC_MMR6 RA, GPR32Opnd:$rs), 1>,
@@ -1831,13 +1677,32 @@ def : MipsInstAlias<"xor $rs, $imm",
def : MipsInstAlias<"not $rt, $rs",
(NOR_MMR6 GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>,
ISA_MICROMIPS32R6;
-def : MipsInstAlias<"seh $rd", (SEH_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
- ISA_MICROMIPS32R6;
-def : MipsInstAlias<"seb $rd", (SEB_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+def : MipsInstAlias<"not $rt",
+ (NOR_MMR6 GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>,
ISA_MICROMIPS32R6;
def : MipsInstAlias<"lapc $rd, $imm",
(ADDIUPC_MMR6 GPR32Opnd:$rd, simm19_lsl2:$imm)>,
ISA_MICROMIPS32R6;
+def : MipsInstAlias<"neg $rt, $rs",
+ (SUB_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"neg $rt",
+ (SUB_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"negu $rt, $rs",
+ (SUBU_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"negu $rt",
+ (SUBU_MMR6 GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"beqz16 $rs, $offset", (BEQZC16_MMR6 GPRMM16Opnd:$rs,
+ brtarget7_mm:$offset),
+ 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"bnez16 $rs, $offset", (BNEZC16_MMR6 GPRMM16Opnd:$rs,
+ brtarget7_mm:$offset),
+ 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"b16 $offset", (BC16_MMR6 brtarget10_mm:$offset), 0>,
+ ISA_MICROMIPS32R6;
//===----------------------------------------------------------------------===//
//
@@ -1867,6 +1732,11 @@ defm : SelectInt_Pats<i32, OR_MM, XORI_MMR6, SLTi_MM, SLTiu_MM, SELEQZ_MMR6,
defm S_MMR6 : Cmp_Pats<f32, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
defm D_MMR6 : Cmp_Pats<f64, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
+def : MipsPat<(f32 fpimm0), (MTC1_MMR6 ZERO)>, ISA_MICROMIPS32R6;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S_MMR6 (MTC1 ZERO))>, ISA_MICROMIPS32R6;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+ (TRUNC_W_D_MMR6 FGR64Opnd:$src)>, ISA_MICROMIPS32R6;
+
def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
(ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>,
ISA_MICROMIPS32R6;
@@ -1898,3 +1768,37 @@ def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
(TAILCALL_MMR6 texternalsym:$dst)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond (i32 (setne GPR32:$lhs, 0)), bb:$dst),
+ (BNEZC_MMR6 GPR32:$lhs, bb:$dst)>, ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (seteq GPR32:$lhs, 0)), bb:$dst),
+ (BEQZC_MMR6 GPR32:$lhs, bb:$dst)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond (i32 (setge GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLT_MM GPR32:$lhs, GPR32:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setuge GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTu_MM GPR32:$lhs, GPR32:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setge GPR32:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTi_MM GPR32:$lhs, immSExt16:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setuge GPR32:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTiu_MM GPR32:$lhs, immSExt16:$rhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setgt GPR32:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTi_MM GPR32:$lhs, (Plus1 imm:$rhs)), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setugt GPR32:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTiu_MM GPR32:$lhs, (Plus1 imm:$rhs)), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond (i32 (setle GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLT_MM GPR32:$rhs, GPR32:$lhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(brcond (i32 (setule GPR32:$lhs, GPR32:$rhs)), bb:$dst),
+ (BEQZC_MMR6 (SLTu_MM GPR32:$rhs, GPR32:$lhs), bb:$dst)>,
+ ISA_MICROMIPS32R6;
+
+def : MipsPat<(brcond GPR32:$cond, bb:$dst),
+ (BNEZC_MMR6 GPR32:$cond, bb:$dst)>, ISA_MICROMIPS32R6;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td
index af6473c468d9..0d444dfc9fad 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
class MMDSPInst<string opstr = "">
- : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
- let InsnPredicates = [HasDSP];
- let AdditionalPredicates = [InMicroMips];
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+ let ASEPredicate = [HasDSP];
+ let EncodingPredicates = [InMicroMips];
string BaseOpcode = opstr;
string Arch = "mmdsp";
let DecoderNamespace = "MicroMips";
@@ -18,7 +18,7 @@ class MMDSPInst<string opstr = "">
class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit>, PredicateControl {
- let InsnPredicates = [HasDSP];
+ let ASEPredicate = [HasDSP];
let AdditionalPredicates = [InMicroMips];
}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index 20c1ab5a9998..132de6be750d 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -386,6 +386,7 @@ class WRDSP_MM_DESC {
string AsmString = !strconcat("wrdsp", "\t$rt, $mask");
list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)];
InstrItinClass Itinerary = NoItinerary;
+ bit isMoveReg = 1;
}
class BPOSGE32C_MMR3_DESC {
@@ -416,11 +417,11 @@ class BPOSGE32_MM_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget_mm,
NoItinerary>;
let DecoderNamespace = "MicroMipsDSP", Arch = "mmdsp",
- AdditionalPredicates = [HasDSP, InMicroMips] in {
- def LWDSP_MM : Load<"lw", DSPROpnd, null_frag, II_LW>, DspMMRel,
- LW_FM_MM<0x3f>;
- def SWDSP_MM : Store<"sw", DSPROpnd, null_frag, II_SW>, DspMMRel,
- LW_FM_MM<0x3e>;
+ EncodingPredicates = [InMicroMips], ASEPredicate = [HasDSP] in {
+ def LWDSP_MM : Load<"lw", DSPROpnd, null_frag, II_LW>, DspMMRel,
+ LW_FM_MM<0x3f>;
+ def SWDSP_MM : Store<"sw", DSPROpnd, null_frag, II_SW>, DspMMRel,
+ LW_FM_MM<0x3e>;
}
// Instruction defs.
// microMIPS DSP Rev 1
@@ -530,7 +531,7 @@ def MODSUB_MM : DspMMRel, MODSUB_MM_ENC, MODSUB_DESC;
def MULSAQ_S_W_PH_MM : DspMMRel, MULSAQ_S_W_PH_MM_ENC, MULSAQ_S_W_PH_DESC;
def BITREV_MM : DspMMRel, BITREV_MM_ENC, BITREV_MM_DESC;
def BPOSGE32_MM : DspMMRel, BPOSGE32_MM_ENC, BPOSGE32_MM_DESC,
- ISA_MIPS1_NOT_32R6_64R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def CMP_EQ_PH_MM : DspMMRel, CMP_EQ_PH_MM_ENC, CMP_EQ_PH_DESC;
def CMP_LT_PH_MM : DspMMRel, CMP_LT_PH_MM_ENC, CMP_LT_PH_DESC;
def CMP_LE_PH_MM : DspMMRel, CMP_LE_PH_MM_ENC, CMP_LE_PH_DESC;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
index 49025cc1570a..84ae0eddf980 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -11,7 +11,18 @@
//
//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1 in {
+multiclass ADDS_MMM<string opstr, InstrItinClass Itin, bit IsComm,
+ SDPatternOperator OpNode = null_frag> {
+ def _D32_MM : MMRel, ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
+ FGR_32 {
+ string DecoderNamespace = "MicroMips";
+ }
+ // FIXME: This needs to be part of the instruction mapping tables.
+ def _D64_MM : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>, FGR_64 {
+ string DecoderNamespace = "MicroMipsFP64";
+ }
+}
+
def FADD_S_MM : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
ADDS_FM_MM<0, 0x30>, ISA_MICROMIPS;
def FDIV_S_MM : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
@@ -21,27 +32,27 @@ def FMUL_S_MM : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
def FSUB_S_MM : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
ADDS_FM_MM<0, 0x70>, ISA_MICROMIPS;
-def FADD_MM : MMRel, ADDS_FT<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>,
- ADDS_FM_MM<1, 0x30>, ISA_MICROMIPS;
-def FDIV_MM : MMRel, ADDS_FT<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>,
- ADDS_FM_MM<1, 0xf0>, ISA_MICROMIPS;
-def FMUL_MM : MMRel, ADDS_FT<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>,
- ADDS_FM_MM<1, 0xb0>, ISA_MICROMIPS;
-def FSUB_MM : MMRel, ADDS_FT<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>,
- ADDS_FM_MM<1, 0x70>, ISA_MICROMIPS;
-
-def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
- LWXC1_FM_MM<0x48>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
- SWXC1_FM_MM<0x88>, ISA_MICROMIPS32_NOT_MIPS32R6;
-
-// FIXME: These instruction definitions are incorrect. They should be 64-bit
-// FPU only.
-def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
- LWXC1_FM_MM<0x148>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
- SWXC1_FM_MM<0x188>, ISA_MICROMIPS32_NOT_MIPS32R6;
+defm FADD : ADDS_MMM<"add.d", II_ADD_D, 1, fadd>,
+ ADDS_FM_MM<1, 0x30>, ISA_MICROMIPS;
+defm FDIV : ADDS_MMM<"div.d", II_DIV_D, 0, fdiv>,
+ ADDS_FM_MM<1, 0xf0>, ISA_MICROMIPS;
+defm FMUL : ADDS_MMM<"mul.d", II_MUL_D, 1, fmul>,
+ ADDS_FM_MM<1, 0xb0>, ISA_MICROMIPS;
+defm FSUB : ADDS_MMM<"sub.d", II_SUB_D, 0, fsub>,
+ ADDS_FM_MM<1, 0x70>, ISA_MICROMIPS;
+let DecoderNamespace = "MicroMips" in {
+ def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
+ LWXC1_FM_MM<0x48>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
+ SWXC1_FM_MM<0x88>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
+ def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>,
+ LWXC1_FM_MM<0x148>, FGR_64, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>,
+ SWXC1_FM_MM<0x188>, FGR_64, ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+let isCodeGenOnly = 1 in {
def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
CEQS_FM_MM<0>, ISA_MICROMIPS32_NOT_MIPS32R6 {
// FIXME: This is a required to work around the fact that these instructions
@@ -65,130 +76,174 @@ let DecoderNamespace = "MicroMips" in {
BC1F_FM_MM<0x1c>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>,
BC1F_FM_MM<0x1d>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x24>, ISA_MICROMIPS;
}
-let isCodeGenOnly = 1 in {
-def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
- ROUND_W_FM_MM<0, 0x24>, ISA_MICROMIPS;
-def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd,
- II_ROUND>, ROUND_W_FM_MM<0, 0xec>,
- ISA_MICROMIPS;
-
-def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
- ROUND_W_FM_MM<1, 0x6c>, ISA_MICROMIPS, FGR_32;
-def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
- ROUND_W_FM_MM<1, 0x24>, ISA_MICROMIPS, FGR_32;
-def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
- ROUND_W_FM_MM<1, 0x2c>, ISA_MICROMIPS, FGR_32;
-def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd,
- II_ROUND>, ROUND_W_FM_MM<1, 0xec>,
- ISA_MICROMIPS, FGR_32;
-def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
- ROUND_W_FM_MM<1, 0xac>, ISA_MICROMIPS, FGR_32;
-
-def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D,
- fsqrt>, ROUND_W_FM_MM<1, 0x28>,
- ISA_MICROMIPS, FGR_32;
-
-def CVT_L_S_MM : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
- ROUND_W_FM_MM<0, 0x4>, ISA_MICROMIPS, FGR_64;
-def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
- ROUND_W_FM_MM<1, 0x4>, ISA_MICROMIPS, FGR_64;
+let DecoderNamespace = "MicroMips" in {
+ def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd,
+ FGR32Opnd, II_ROUND>,
+ ROUND_W_FM_MM<0, 0xec>, ISA_MICROMIPS;
+
+ def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
+ ROUND_W_FM_MM<1, 0x6c>, ISA_MICROMIPS, FGR_32;
+ def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
+ ROUND_W_FM_MM<1, 0x2c>, ISA_MICROMIPS, FGR_32;
+ def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd,
+ AFGR64Opnd, II_ROUND>,
+ ROUND_W_FM_MM<1, 0xec>, ISA_MICROMIPS, FGR_32;
+ def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
+ ROUND_W_FM_MM<1, 0xac>, ISA_MICROMIPS, FGR_32;
+
+ def CVT_L_S_MM : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x4>, ISA_MICROMIPS, FGR_64;
+ def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x4>, ISA_MICROMIPS, FGR_64;
+
+ def CVT_W_D32_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x24>, ISA_MICROMIPS, FGR_32;
+}
+let DecoderNamespace = "MicroMipsFP64" in {
+ def CVT_W_D64_MM : ABSS_FT<"cvt.w.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x24>, ISA_MICROMIPS, FGR_64;
+}
+multiclass ABSS_MMM<string opstr, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag> {
+ def _D32_MM : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
+ ISA_MICROMIPS, FGR_32 {
+ string DecoderNamespace = "MicroMips";
+ }
+ // FIXME: This needs to be part of the instruction mapping tables.
+ def _D64_MM : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
+ ISA_MICROMIPS, FGR_64 {
+ string DecoderNamespace = "MicroMipsFP64";
+ }
}
+defm FSQRT : ABSS_MMM<"sqrt.d", II_SQRT_D, fsqrt>, ROUND_W_FM_MM<1, 0x28>;
+defm FABS : ABSS_MMM<"abs.d", II_SQRT_D, fabs>, ABS_FM_MM<1, 0xd>;
+
let DecoderNamespace = "MicroMips" in {
def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
ABS_FM_MM<0, 0xd>, ISA_MICROMIPS;
- def FABS_MM : MMRel, ABSS_FT<"abs.d", AFGR64Opnd, AFGR64Opnd, II_ABS, fabs>,
- ABS_FM_MM<1, 0xd>, ISA_MICROMIPS, FGR_32;
}
-let isCodeGenOnly = 1 in {
def FMOV_S_MM : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
- ABS_FM_MM<0, 0x1>, ISA_MICROMIPS;
+ ABS_FM_MM<0, 0x1>, ISA_MICROMIPS {
+ let isMoveReg = 1;
+}
def FNEG_S_MM : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
ABS_FM_MM<0, 0x2d>, ISA_MICROMIPS;
-def CVT_D_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABS_FM_MM<0, 0x4d>, ISA_MICROMIPS, FGR_32;
-def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABS_FM_MM<1, 0x4d>, ISA_MICROMIPS, FGR_32;
-def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
- ABS_FM_MM<0, 0x6d>, ISA_MICROMIPS, FGR_32;
-def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
- ABS_FM_MM<1, 0x6d>, ISA_MICROMIPS;
-
-def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>,
- ABS_FM_MM<1, 0x2d>, ISA_MICROMIPS, FGR_32;
-
-def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
- ABS_FM_MM<1, 0x1>, ISA_MICROMIPS, FGR_32;
-
-def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
- II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
+
+let DecoderNamespace = "MicroMips" in {
+ def CVT_D32_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x4d>, ISA_MICROMIPS, FGR_32;
+ def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x4d>, ISA_MICROMIPS, FGR_32;
+}
+
+let DecoderNamespace = "MicroMipsFP64" in {
+ def CVT_D64_S_MM : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x4d>, ISA_MICROMIPS, FGR_64;
+ def CVT_D64_W_MM : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x4d>, ISA_MICROMIPS, FGR_64;
+ def CVT_S_D64_MM : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x6d>, ISA_MICROMIPS, FGR_64;
+}
+
+let DecoderNamespace = "MicroMips" in {
+ def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x6d>, ISA_MICROMIPS, FGR_32;
+ def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x6d>, ISA_MICROMIPS;
+}
+
+
+defm FNEG : ABSS_MMM<"neg.d", II_NEG, fneg>, ABS_FM_MM<1, 0x2d>;
+defm FMOV : ABSS_MMM<"mov.d", II_MOV_D>, ABS_FM_MM<1, 0x1>;
+
+let DecoderNamespace = "MicroMips" in {
+ def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
+ II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
II_MOVN_S>, CMov_I_F_FM_MM<0x38, 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+
+ def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
+ MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>,
ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
- II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
- II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-
-def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
- MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
- MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 1>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
- MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>,
ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
- II_MFC1, bitconvert>, MFC1_FM_MM<0x80>,
- ISA_MICROMIPS;
-def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
- II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>,
- ISA_MICROMIPS;
-
-def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
- MADDS_FM_MM<0x1>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
- MADDS_FM_MM<0x21>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
- MADDS_FM_MM<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
- MADDS_FM_MM<0x22>, ISA_MICROMIPS32_NOT_MIPS32R6;
-
-def MADD_D32_MM : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM_MM<0x9>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def MSUB_D32_MM : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM_MM<0x29>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
- MADDS_FM_MM<0xa>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
-def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
- MADDS_FM_MM<0x2a>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+
+ def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
+ II_MFC1, bitconvert>, MFC1_FM_MM<0x80>,
+ ISA_MICROMIPS;
+ def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
+ II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>,
+ ISA_MICROMIPS;
+
+ def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S>,
+ MADDS_FM_MM<0x1>, ISA_MICROMIPS32_NOT_MIPS32R6, MADD4;
+ def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S>,
+ MADDS_FM_MM<0x21>, ISA_MICROMIPS32_NOT_MIPS32R6, MADD4;
+ let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
+ def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S>,
+ MADDS_FM_MM<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S>,
+ MADDS_FM_MM<0x22>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ }
+ def MADD_D32_MM : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D>,
+ MADDS_FM_MM<0x9>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32,
+ MADD4;
+ def MSUB_D32_MM : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D>,
+ MADDS_FM_MM<0x29>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32,
+ MADD4;
+ let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
+ def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D>,
+ MADDS_FM_MM<0xa>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D>,
+ MADDS_FM_MM<0x2a>, ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+ }
+
+ def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd,
+ II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>,
+ ISA_MICROMIPS;
+ def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd,
+ FGR32Opnd, II_TRUNC>,
+ ROUND_W_FM_MM<0, 0xac>, ISA_MICROMIPS;
+ def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ROUND_W_FM_MM<0, 0x6c>, ISA_MICROMIPS;
+
+ def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
+ fsqrt>, ROUND_W_FM_MM<0, 0x28>, ISA_MICROMIPS;
+
+ def MTHC1_D32_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM_MM<0xe0>, ISA_MICROMIPS, FGR_32;
+ def MFHC1_D32_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+ MFC1_FM_MM<0xc0>, ISA_MICROMIPS, FGR_32;
}
-def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd,
- II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>,
- ISA_MICROMIPS;
-def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd,
- FGR32Opnd, II_TRUNC>,
- ROUND_W_FM_MM<0, 0xac>, ISA_MICROMIPS;
-def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
- ROUND_W_FM_MM<0, 0x6c>, ISA_MICROMIPS;
-def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
- fsqrt>, ROUND_W_FM_MM<0, 0x28>, ISA_MICROMIPS;
-def MTHC1_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
- MFC1_FM_MM<0xe0>, ISA_MICROMIPS, FGR_32;
-def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
- MFC1_FM_MM<0xc0>, ISA_MICROMIPS, FGR_32;
+let DecoderNamespace = "MicroMipsFP64" in {
+ def MTHC1_D64_MM : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM_MM<0xe0>, ISA_MICROMIPS, FGR_64;
+ def MFHC1_D64_MM : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
+ MFC1_FM_MM<0xc0>, ISA_MICROMIPS, FGR_64;
+}
let DecoderNamespace = "MicroMips" in {
def CFC1_MM : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>,
@@ -307,11 +362,13 @@ multiclass C_COND_MM<string TypeStr, RegisterOperand RC, bits<2> fmt,
let BaseOpcode = "c.ngt."#NAME;
}
}
+let DecoderNamespace = "MicroMips" in {
+ defm S : C_COND_MM<"s", FGR32Opnd, 0b00, II_C_CC_S>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ defm D32 : C_COND_MM<"d", AFGR64Opnd, 0b01, II_C_CC_D>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+}
-defm S : C_COND_MM<"s", FGR32Opnd, 0b00, II_C_CC_S>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-defm D32 : C_COND_MM<"d", AFGR64Opnd, 0b01, II_C_CC_D>,
- ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
let DecoderNamespace = "Mips64" in
defm D64 : C_COND_MM<"d", FGR64Opnd, 0b01, II_C_CC_D>,
ISA_MICROMIPS32_NOT_MIPS32R6, FGR_64;
@@ -347,3 +404,36 @@ let AddedComplexity = 40 in {
def : LoadRegImmPat<LWC1_MM, f32, load>, ISA_MICROMIPS;
def : StoreRegImmPat<SWC1_MM, f32>, ISA_MICROMIPS;
}
+
+def : MipsPat<(f32 fpimm0), (MTC1_MM ZERO)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S_MM (MTC1_MM ZERO))>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
+ (CVT_S_D64_MM FGR64Opnd:$src)>, ISA_MICROMIPS, FGR_64;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D64_S_MM FGR32Opnd:$src)>, ISA_MICROMIPS, FGR_64;
+def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
+ (CVT_S_D32_MM AFGR64Opnd:$src)>, ISA_MICROMIPS, FGR_32;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D32_S_MM FGR32Opnd:$src)>, ISA_MICROMIPS, FGR_32;
+def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+ (TRUNC_W_MM AFGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6,
+ FGR_32;
+
+// Selects
+defm : MovzPats0<GPR32, FGR32, MOVZ_I_S_MM, SLT_MM, SLTu_MM, SLTi_MM, SLTiu_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats1<GPR32, FGR32, MOVZ_I_S_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+defm : MovnPats<GPR32, FGR32, MOVN_I_S_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32_MM, SLT_MM, SLTu_MM, SLTi_MM,
+ SLTiu_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
+defm : MovnPats<GPR32, AFGR64, MOVN_I_D32_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, FGR_32;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
index bc0045dad21e..a9c53e08b810 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -1,3 +1,16 @@
+//===-- MicroMipsInstrFormats.td - microMIPS Inst Formats -*- tablegen -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the formats of the microMIPS instruction set.
+//
+//===----------------------------------------------------------------------===//
+
//===----------------------------------------------------------------------===//
// MicroMIPS Base Classes
//===----------------------------------------------------------------------===//
@@ -7,8 +20,8 @@
// This class does not depend on the instruction size.
//
class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin, Format f> : Instruction
-{
+ InstrItinClass itin, Format f> : Instruction,
+ PredicateControl {
let Namespace = "Mips";
let DecoderNamespace = "MicroMips";
@@ -19,7 +32,7 @@ class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
let Pattern = pattern;
let Itinerary = itin;
- let Predicates = [InMicroMips];
+ let EncodingPredicates = [InMicroMips];
Format Form = f;
}
@@ -406,7 +419,7 @@ class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch {
let Inst{8-0} = offset;
}
-class LWL_FM_MM<bits<4> funct> {
+class LWL_FM_MM<bits<4> funct> : MMArch {
bits<5> rt;
bits<21> addr;
@@ -419,7 +432,7 @@ class LWL_FM_MM<bits<4> funct> {
let Inst{11-0} = addr{11-0};
}
-class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> {
+class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> : MMArch {
bits<5> rt;
bits<21> addr;
bits<5> base = addr{20-16};
@@ -600,8 +613,9 @@ class SYNC_FM_MM : MMArch {
}
class SYNCI_FM_MM : MMArch {
- bits<5> rs;
- bits<16> offset;
+ bits<21> addr;
+ bits<5> rs = addr{20-16};
+ bits<16> offset = addr{15-0};
bits<32> Inst;
let Inst{31-26} = 0b010000;
@@ -629,7 +643,7 @@ class SYS_FM_MM : MMArch {
let Inst{5-0} = 0x3c;
}
-class WAIT_FM_MM {
+class WAIT_FM_MM : MMArch {
bits<10> code_;
bits<32> Inst;
@@ -699,7 +713,7 @@ class LL_FM_MM<bits<4> funct> : MMArch {
let Inst{11-0} = addr{11-0};
}
-class LLE_FM_MM<bits<4> funct> {
+class LLE_FM_MM<bits<4> funct> : MMArch {
bits<5> rt;
bits<21> addr;
bits<5> base = addr{20-16};
@@ -730,7 +744,6 @@ class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch {
let Inst{9-8} = fmt;
let Inst{7-0} = funct;
- list<dag> Pattern = [];
}
class LWXC1_FM_MM<bits<9> funct> : MMArch {
@@ -831,13 +844,13 @@ class ABS_FM_MM<bits<2> fmt, bits<7> funct> : MMArch {
class CMov_F_F_FM_MM<bits<9> func, bits<2> fmt> : MMArch {
bits<5> fd;
bits<5> fs;
-
+ bits<3> fcc;
bits<32> Inst;
let Inst{31-26} = 0x15;
let Inst{25-21} = fd;
let Inst{20-16} = fs;
- let Inst{15-13} = 0x0; //cc
+ let Inst{15-13} = fcc; //cc
let Inst{12-11} = 0x0;
let Inst{10-9} = fmt;
let Inst{8-0} = func;
@@ -961,7 +974,7 @@ class LWM_FM_MM<bits<4> funct> : MMArch {
let Inst{11-0} = addr{11-0};
}
-class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl {
+class LWM_FM_MM16<bits<4> funct> : MMArch {
bits<2> rt;
bits<4> addr;
@@ -1053,3 +1066,39 @@ class POOL32A_CFTC2_FM_MM<bits<10> funct> : MMArch {
let Inst{15-6} = funct;
let Inst{5-0} = 0b111100;
}
+
+class POOL32A_TLBINV_FM_MM<bits<10> funct> : MMArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = 0x0;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_MFTC0_FM_MM<bits<5> funct, bits<6> opcode> : MMArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sel;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0;
+ let Inst{13-11} = sel;
+ let Inst{10-6} = funct;
+ let Inst{5-0} = opcode;
+}
+
+class POOL32A_HYPCALL_FM_MM : MMArch {
+ bits<32> Inst;
+
+ bits<10> code_;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_;
+ let Inst{15-6} = 0b1100001101;
+ let Inst{5-0} = 0b111100;
+}
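
The new format classes above (POOL32A_TLBINV_FM_MM, POOL32A_MFTC0_FM_MM,
POOL32A_HYPCALL_FM_MM) only fix the bit layout; the matching operand and
assembly-string information lives in separate description classes added to
MicroMipsInstrInfo.td below, and a single def mixes both halves together.
The stand-alone TableGen sketch below illustrates that split with toy class
names; it is an illustration only, not part of the patch, and none of its
names exist in the real backend.

// Encoding side: bit layout only (mirrors POOL32A_HYPCALL_FM_MM).
class ToyHypcallFormat {
  bits<32> Inst;
  bits<10> code_;
  let Inst{31-26} = 0x0;
  let Inst{25-16} = code_;
  let Inst{15-6}  = 0b1100001101;
  let Inst{5-0}   = 0b111100;
}
// Description side: assembly string built from the mnemonic.
class ToyHypcallDesc<string opstr> {
  string AsmString = !strconcat(opstr, "\t$code_");
}
// One record combines both halves.
def TOY_HYPCALL : ToyHypcallDesc<"hypcall">, ToyHypcallFormat;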
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
index 1fef51fd69d0..ebadb59a0432 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,3 +1,16 @@
+//===--- MicroMipsInstrInfo.td - microMIPS Inst Defs -*- tablegen -*-------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the definitions of the microMIPSr3 instructions.
+//
+//===----------------------------------------------------------------------===//
+
def addrimm11 : ComplexPattern<iPTR, 2, "selectIntAddr11MM", [frameindex]>;
def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddr12MM", [frameindex]>;
def addrimm16 : ComplexPattern<iPTR, 2, "selectIntAddr16MM", [frameindex]>;
@@ -128,6 +141,7 @@ def mem_mm_16 : Operand<i32> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops ptr_rc, simm16);
let EncoderMethod = "getMemEncodingMMImm16";
+ let DecoderMethod = "DecodeMemMMImm16";
let ParserMatchClass = MipsMemSimm16AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -201,6 +215,9 @@ class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
Itin, FrmI> {
let DecoderMethod = "DecodeMemMMImm12";
string Constraints = "$src = $rt";
+ let BaseOpcode = opstr;
+ bit mayLoad = 1;
+ bit mayStore = 0;
}
class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
@@ -209,6 +226,9 @@ class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
!strconcat(opstr, "\t$rt, $addr"),
[(OpNode RO:$rt, addrimm12:$addr)], Itin, FrmI> {
let DecoderMethod = "DecodeMemMMImm12";
+ let BaseOpcode = opstr;
+ bit mayLoad = 0;
+ bit mayStore = 1;
}
/// A register pair used by movep instruction.
@@ -231,35 +251,23 @@ MicroMipsInst16<(outs movep_regpair:$dst_regs), (ins RO:$rs, RO:$rt),
!strconcat(opstr, "\t$dst_regs, $rs, $rt"), [],
NoItinerary, FrmR> {
let isReMaterializable = 1;
-}
-
-/// A register pair used by load/store pair instructions.
-def RegPairAsmOperand : AsmOperandClass {
- let Name = "RegPair";
- let ParserMethod = "parseRegisterPair";
- let PredicateMethod = "isRegPair";
-}
-
-def regpair : Operand<i32> {
- let EncoderMethod = "getRegisterPairOpValue";
- let ParserMatchClass = RegPairAsmOperand;
- let PrintMethod = "printRegisterPair";
- let DecoderMethod = "DecodeRegPairOperand";
- let MIOperandInfo = (ops ptr_rc, ptr_rc);
+ let isMoveReg = 1;
}
class StorePairMM<string opstr, ComplexPattern Addr = addr>
- : InstSE<(outs), (ins regpair:$rt, mem_simm12:$addr),
+ : InstSE<(outs), (ins GPR32Opnd:$rt, GPR32Opnd:$rt2, mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_SWP, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayStore = 1;
+ let AsmMatchConverter = "ConvertXWPOperands";
}
class LoadPairMM<string opstr, ComplexPattern Addr = addr>
- : InstSE<(outs regpair:$rt), (ins mem_simm12:$addr),
+ : InstSE<(outs GPR32Opnd:$rt, GPR32Opnd:$rt2), (ins mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_LWP, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayLoad = 1;
+ let AsmMatchConverter = "ConvertXWPOperands";
}
class LLBaseMM<string opstr, RegisterOperand RO> :
@@ -273,6 +281,7 @@ class LLEBaseMM<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$rt), (ins mem_simm9:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_LLE, FrmI> {
let DecoderMethod = "DecodeMemMMImm9";
+ string BaseOpcode = opstr;
let mayLoad = 1;
}
@@ -288,6 +297,7 @@ class SCEBaseMM<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$dst), (ins RO:$rt, mem_simm9:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_SCE, FrmI> {
let DecoderMethod = "DecodeMemMMImm9";
+ string BaseOpcode = opstr;
let mayStore = 1;
let Constraints = "$rt = $dst";
}
@@ -406,12 +416,14 @@ class MoveFromHILOMM<string opstr, RegisterOperand RO, Register UseReg> :
[], II_MFHI_MFLO, FrmR> {
let Uses = [UseReg];
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
class MoveMM16<string opstr, RegisterOperand RO>
: MicroMipsInst16<(outs RO:$rd), (ins RO:$rs),
!strconcat(opstr, "\t$rd, $rs"), [], II_MOVE, FrmR> {
let isReMaterializable = 1;
+ let isMoveReg = 1;
}
class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> :
@@ -423,7 +435,7 @@ class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> :
// 16-bit Jump and Link (Call)
class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
- [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl {
+ [(MipsJmpLink RO:$rs)], II_JALR, FrmR> {
let isCall = 1;
let hasDelaySlot = 1;
let Defs = [RA];
@@ -586,70 +598,113 @@ class UncondBranchMM16<string opstr> :
let Defs = [AT];
}
-def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
- ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6;
-def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
- LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6;
+class HypcallMM<string opstr> :
+ InstSE<(outs), (ins uimm10:$code_),
+ !strconcat(opstr, "\t$code_"), [], II_HYPCALL, FrmOther> {
+ let BaseOpcode = opstr;
+}
+
+class TLBINVMM<string opstr, InstrItinClass Itin> :
+ InstSE<(outs), (ins), opstr, [], Itin, FrmOther> {
+ let BaseOpcode = opstr;
+}
+
+class MfCop0MM<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> :
+ InstSE<(outs DstRC:$rt), (ins SrcRC:$rs, uimm3:$sel),
+ !strconcat(opstr, "\t$rt, $rs, $sel"), [], Itin, FrmR> {
+ let BaseOpcode = opstr;
+}
+
+class MtCop0MM<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin> :
+ InstSE<(outs DstRC:$rs), (ins SrcRC:$rt, uimm3:$sel),
+ !strconcat(opstr, "\t$rt, $rs, $sel"), [], Itin, FrmR> {
+ let BaseOpcode = opstr;
+}
+
+let FastISelShouldIgnore = 1 in {
+ def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
+ ARITH_FM_MM16<0>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+ LOGIC_FM_MM16<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+
def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>,
- ISA_MICROMIPS_NOT_32R6;
-def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+let FastISelShouldIgnore = 1 in
+ def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
- SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6;
+ SHIFT_FM_MM16<0>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
- SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6;
+ SHIFT_FM_MM16<1>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
- ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6;
-def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
- LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6;
+let FastISelShouldIgnore = 1 in {
+ def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+ ARITH_FM_MM16<1>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+ LOGIC_FM_MM16<0x1>, ISA_MICROMIPS32_NOT_MIPS32R6;
+}
def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU,
- mem_mm_4>, LOAD_STORE_FM_MM16<0x02>;
+ mem_mm_4>, LOAD_STORE_FM_MM16<0x02>, ISA_MICROMIPS;
def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU,
- mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>;
+ mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>, ISA_MICROMIPS;
def LW16_MM : LoadMM16<"lw16", GPRMM16Opnd, load, II_LW, mem_mm_4_lsl2>,
- LOAD_STORE_FM_MM16<0x1a>;
+ LOAD_STORE_FM_MM16<0x1a>, ISA_MICROMIPS;
def SB16_MM : StoreMM16<"sb16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei8,
- II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>;
+ II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei16,
II_SH, mem_mm_4_lsl1>,
- LOAD_STORE_FM_MM16<0x2a>;
+ LOAD_STORE_FM_MM16<0x2a>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, GPRMM16Opnd, store, II_SW,
- mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>;
+ mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>,
- LOAD_GP_FM_MM16<0x19>;
+ LOAD_GP_FM_MM16<0x19>, ISA_MICROMIPS;
def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
- LOAD_STORE_SP_FM_MM16<0x12>;
+ LOAD_STORE_SP_FM_MM16<0x12>, ISA_MICROMIPS;
def SWSP_MM : StoreSPMM16<"sw", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>,
- LOAD_STORE_SP_FM_MM16<0x32>;
-def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16;
-def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16;
-def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16;
-def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
-def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
-def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
-def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
+ LOAD_STORE_SP_FM_MM16<0x32>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16,
+ ISA_MICROMIPS;
+def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16,
+ ISA_MICROMIPS;
+def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16,
+ ISA_MICROMIPS;
+def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16, ISA_MICROMIPS;
+def MFHI16_MM : MoveFromHILOMM<"mfhi16", GPR32Opnd, AC0>,
+ MFHILO_FM_MM16<0x10>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def MFLO16_MM : MoveFromHILOMM<"mflo16", GPR32Opnd, AC0>,
+ MFHILO_FM_MM16<0x12>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def LI16_MM : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>, LI_FM_MM16,
- IsAsCheapAsAMove;
+ IsAsCheapAsAMove, ISA_MICROMIPS32_NOT_MIPS32R6;
def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>,
ISA_MICROMIPS32_NOT_MIPS32R6;
-def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>;
-def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>;
-def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>;
-def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>;
+def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>,
- BEQNEZ_FM_MM16<0x23>;
+ BEQNEZ_FM_MM16<0x23>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>,
- BEQNEZ_FM_MM16<0x2b>;
-def B16_MM : UncondBranchMM16<"b16">, B16_FM;
+ BEQNEZ_FM_MM16<0x2b>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def B16_MM : UncondBranchMM16<"b16">, B16_FM, ISA_MICROMIPS32_NOT_MIPS32R6;
def BREAK16_MM : BrkSdbbp16MM<"break16", II_BREAK>, BRKSDBBP16_FM_MM<0x28>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, BRKSDBBP16_FM_MM<0x2C>,
- ISA_MICROMIPS_NOT_32R6;
+ ISA_MICROMIPS32_NOT_MIPS32R6;
let DecoderNamespace = "MicroMips" in {
/// Load and Store Instructions - multiple
@@ -657,175 +712,196 @@ let DecoderNamespace = "MicroMips" in {
ISA_MICROMIPS32_NOT_MIPS32R6;
def LWM16_MM : LoadMultMM16<"lwm16", II_LWM>, LWM_FM_MM16<0x4>,
ISA_MICROMIPS32_NOT_MIPS32R6;
- let AdditionalPredicates = [InMicroMips] in {
- def CFC2_MM : InstSE<(outs GPR32Opnd:$rt), (ins COP2Opnd:$impl),
- "cfc2\t$rt, $impl", [], II_CFC2, FrmFR, "cfc2">,
- POOL32A_CFTC2_FM_MM<0b1100110100>;
- def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt),
- "ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">,
- POOL32A_CFTC2_FM_MM<0b1101110100>;
- }
+ def CFC2_MM : InstSE<(outs GPR32Opnd:$rt), (ins COP2Opnd:$impl),
+ "cfc2\t$rt, $impl", [], II_CFC2, FrmFR, "cfc2">,
+ POOL32A_CFTC2_FM_MM<0b1100110100>, ISA_MICROMIPS;
+ def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt),
+ "ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">,
+ POOL32A_CFTC2_FM_MM<0b1101110100>, ISA_MICROMIPS;
}
class WaitMM<string opstr> :
InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
II_WAIT, FrmOther, opstr>;
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips, NotMips32r6,
- NotMips64r6] in {
+let DecoderNamespace = "MicroMips" in {
/// Compact Branch Instructions
def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>,
- COMPACT_BRANCH_FM_MM<0x7>;
+ COMPACT_BRANCH_FM_MM<0x7>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BNEZC_MM : CompactBranchMM<"bnezc", brtarget_mm, setne, GPR32Opnd>,
- COMPACT_BRANCH_FM_MM<0x5>;
-}
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+ COMPACT_BRANCH_FM_MM<0x5>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
/// Arithmetic Instructions (ALU Immediate)
def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU>,
- ADDI_FM_MM<0xc>;
+ ADDI_FM_MM<0xc>, ISA_MICROMIPS32_NOT_MIPS32R6;
def ADDi_MM : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd, II_ADDI>,
- ADDI_FM_MM<0x4>;
+ ADDI_FM_MM<0x4>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SLTi_MM : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM_MM<0x24>;
+ SLTI_FM_MM<0x24>, ISA_MICROMIPS;
def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM_MM<0x2c>;
+ SLTI_FM_MM<0x2c>, ISA_MICROMIPS;
def ANDi_MM : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI>,
- ADDI_FM_MM<0x34>;
+ ADDI_FM_MM<0x34>, ISA_MICROMIPS32_NOT_MIPS32R6;
def ORi_MM : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
- or>, ADDI_FM_MM<0x14>;
+ or>, ADDI_FM_MM<0x14>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def XORi_MM : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
- immZExt16, xor>, ADDI_FM_MM<0x1c>;
- def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM_MM;
+ immZExt16, xor>, ADDI_FM_MM<0x1c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM_MM,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def LEA_ADDiu_MM : MMRel, EffectiveAddress<"addiu", GPR32Opnd>,
- LW_FM_MM<0xc>;
+ LW_FM_MM<0xc>, ISA_MICROMIPS;
/// Arithmetic Instructions (3-Operand, R-Type)
def ADDu_MM : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
- ADD_FM_MM<0, 0x150>;
+ ADD_FM_MM<0, 0x150>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SUBu_MM : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
- ADD_FM_MM<0, 0x1d0>;
- def MUL_MM : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL>,
- ADD_FM_MM<0, 0x210>;
+ ADD_FM_MM<0, 0x1d0>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ let Defs = [HI0, LO0] in
+ def MUL_MM : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
+ ADD_FM_MM<0, 0x210>, ISA_MICROMIPS32_NOT_MIPS32R6;
def ADD_MM : MMRel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>,
- ADD_FM_MM<0, 0x110>;
+ ADD_FM_MM<0, 0x110>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SUB_MM : MMRel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>,
- ADD_FM_MM<0, 0x190>;
- def SLT_MM : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>;
+ ADD_FM_MM<0, 0x190>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SLT_MM : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>,
+ ISA_MICROMIPS;
def SLTu_MM : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>,
- ADD_FM_MM<0, 0x390>;
+ ADD_FM_MM<0, 0x390>, ISA_MICROMIPS;
def AND_MM : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
- ADD_FM_MM<0, 0x250>;
+ ADD_FM_MM<0, 0x250>, ISA_MICROMIPS32_NOT_MIPS32R6;
def OR_MM : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
- ADD_FM_MM<0, 0x290>;
+ ADD_FM_MM<0, 0x290>, ISA_MICROMIPS32_NOT_MIPS32R6;
def XOR_MM : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
- ADD_FM_MM<0, 0x310>;
- def NOR_MM : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
+ ADD_FM_MM<0, 0x310>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def NOR_MM : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def MULT_MM : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x22c>;
+ MULT_FM_MM<0x22c>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MULTu_MM : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x26c>;
+ MULT_FM_MM<0x26c>, ISA_MICROMIPS32_NOT_MIPS32R6;
def SDIV_MM : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x2ac>, ISA_MIPS1_NOT_32R6_64R6;
+ MULT_FM_MM<0x2ac>, ISA_MICROMIPS32_NOT_MIPS32R6;
def UDIV_MM : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x2ec>, ISA_MIPS1_NOT_32R6_64R6;
+ MULT_FM_MM<0x2ec>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Arithmetic Instructions with PC and Immediate
- def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM;
+ def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Shift Instructions
def SLL_MM : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>,
- SRA_FM_MM<0, 0>;
+ SRA_FM_MM<0, 0>, ISA_MICROMIPS;
def SRL_MM : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL>,
- SRA_FM_MM<0x40, 0>;
+ SRA_FM_MM<0x40, 0>, ISA_MICROMIPS;
def SRA_MM : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA>,
- SRA_FM_MM<0x80, 0>;
+ SRA_FM_MM<0x80, 0>, ISA_MICROMIPS;
def SLLV_MM : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV>,
- SRLV_FM_MM<0x10, 0>;
+ SRLV_FM_MM<0x10, 0>, ISA_MICROMIPS;
def SRLV_MM : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV>,
- SRLV_FM_MM<0x50, 0>;
+ SRLV_FM_MM<0x50, 0>, ISA_MICROMIPS;
def SRAV_MM : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV>,
- SRLV_FM_MM<0x90, 0>;
+ SRLV_FM_MM<0x90, 0>, ISA_MICROMIPS;
def ROTR_MM : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR>,
- SRA_FM_MM<0xc0, 0> {
+ SRA_FM_MM<0xc0, 0>, ISA_MICROMIPS {
list<dag> Pattern = [(set GPR32Opnd:$rd,
(rotr GPR32Opnd:$rt, immZExt5:$shamt))];
}
def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV>,
- SRLV_FM_MM<0xd0, 0> {
+ SRLV_FM_MM<0xd0, 0>, ISA_MICROMIPS {
list<dag> Pattern = [(set GPR32Opnd:$rd,
(rotr GPR32Opnd:$rt, GPR32Opnd:$rs))];
}
/// Load and Store Instructions - aligned
let DecoderMethod = "DecodeMemMMImm16" in {
- def LB_MM : LoadMemory<"lb", GPR32Opnd, mem_mm_16, null_frag, II_LB>,
- MMRel, LW_FM_MM<0x7>;
- def LBu_MM : LoadMemory<"lbu", GPR32Opnd, mem_mm_16, null_frag, II_LBU>,
- MMRel, LW_FM_MM<0x5>;
- def LH_MM : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
- addrDefault>, MMRel, LW_FM_MM<0xf>;
- def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
- MMRel, LW_FM_MM<0xd>;
- def LW_MM : Load<"lw", GPR32Opnd, null_frag, II_LW>, MMRel, LW_FM_MM<0x3f>;
- def SB_MM : Store<"sb", GPR32Opnd, null_frag, II_SB>, MMRel,
- LW_FM_MM<0x6>;
- def SH_MM : Store<"sh", GPR32Opnd, null_frag, II_SH>, MMRel,
- LW_FM_MM<0xe>;
+ def LB_MM : LoadMemory<"lb", GPR32Opnd, mem_mm_16, sextloadi8, II_LB>,
+ MMRel, LW_FM_MM<0x7>, ISA_MICROMIPS;
+ def LBu_MM : LoadMemory<"lbu", GPR32Opnd, mem_mm_16, zextloadi8, II_LBU>,
+ MMRel, LW_FM_MM<0x5>, ISA_MICROMIPS;
+ def LH_MM : LoadMemory<"lh", GPR32Opnd, mem_simmptr, sextloadi16, II_LH,
+ addrDefault>, MMRel, LW_FM_MM<0xf>, ISA_MICROMIPS;
+ def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simmptr, zextloadi16, II_LHU>,
+ MMRel, LW_FM_MM<0xd>, ISA_MICROMIPS;
+ def LW_MM : Load<"lw", GPR32Opnd, null_frag, II_LW>, MMRel, LW_FM_MM<0x3f>,
+ ISA_MICROMIPS;
+ def SB_MM : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
+ LW_FM_MM<0x6>, ISA_MICROMIPS;
+ def SH_MM : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel,
+ LW_FM_MM<0xe>, ISA_MICROMIPS;
def SW_MM : Store<"sw", GPR32Opnd, null_frag, II_SW>, MMRel,
- LW_FM_MM<0x3e>;
+ LW_FM_MM<0x3e>, ISA_MICROMIPS;
}
-
+}
+let DecoderNamespace = "MicroMips" in {
let DecoderMethod = "DecodeMemMMImm9" in {
- def LBE_MM : Load<"lbe", GPR32Opnd, null_frag, II_LBE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>;
- def LBuE_MM : Load<"lbue", GPR32Opnd, null_frag, II_LBUE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>;
- def LHE_MM : LoadMemory<"lhe", GPR32Opnd, mem_simm9, null_frag, II_LHE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>;
- def LHuE_MM : LoadMemory<"lhue", GPR32Opnd, mem_simm9, null_frag, II_LHUE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>;
- def LWE_MM : LoadMemory<"lwe", GPR32Opnd, mem_simm9, null_frag, II_LWE>,
- POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>;
- def SBE_MM : StoreMemory<"sbe", GPR32Opnd, mem_simm9, null_frag, II_SBE>,
- POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>;
- def SHE_MM : StoreMemory<"she", GPR32Opnd, mem_simm9, null_frag, II_SHE>,
- POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>;
- def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9, null_frag, II_SWE>,
- POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>;
+ def LBE_MM : MMRel, Load<"lbe", GPR32Opnd, null_frag, II_LBE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>, ISA_MICROMIPS, ASE_EVA;
+ def LBuE_MM : MMRel, Load<"lbue", GPR32Opnd, null_frag, II_LBUE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>, ISA_MICROMIPS, ASE_EVA;
+ def LHE_MM : MMRel, LoadMemory<"lhe", GPR32Opnd, mem_simm9, null_frag,
+ II_LHE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>, ISA_MICROMIPS, ASE_EVA;
+ def LHuE_MM : MMRel, LoadMemory<"lhue", GPR32Opnd, mem_simm9, null_frag,
+ II_LHUE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>, ISA_MICROMIPS, ASE_EVA;
+ def LWE_MM : MMRel, LoadMemory<"lwe", GPR32Opnd, mem_simm9, null_frag,
+ II_LWE>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>, ISA_MICROMIPS, ASE_EVA;
+ def SBE_MM : MMRel, StoreMemory<"sbe", GPR32Opnd, mem_simm9, null_frag,
+ II_SBE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>, ISA_MICROMIPS, ASE_EVA;
+ def SHE_MM : MMRel, StoreMemory<"she", GPR32Opnd, mem_simm9, null_frag,
+ II_SHE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>, ISA_MICROMIPS, ASE_EVA;
+ def SWE_MM : MMRel, StoreMemory<"swe", GPR32Opnd, mem_simm9, null_frag,
+ II_SWE>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>, ISA_MICROMIPS, ASE_EVA;
+ def LWLE_MM : MMRel, LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_9,
+ II_LWLE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
+ def LWRE_MM : MMRel, LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_9,
+ II_LWRE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
+ def SWLE_MM : MMRel, StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_9,
+ II_SWLE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
+ def SWRE_MM : MMRel, StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_9,
+ II_SWRE>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6, ASE_EVA;
}
- def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>;
+ def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>,
+ ISA_MICROMIPS;
/// Load and Store Instructions - unaligned
- def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12, II_LWL>,
- LWL_FM_MM<0x0>;
- def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12, II_LWR>,
- LWL_FM_MM<0x1>;
- def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12, II_SWL>,
- LWL_FM_MM<0x8>;
- def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12, II_SWR>,
- LWL_FM_MM<0x9>;
- let DecoderMethod = "DecodeMemMMImm9" in {
- def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_9,
- II_LWLE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>;
- def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_9,
- II_LWRE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>;
- def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_9,
- II_SWLE>,
- POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>;
- def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_9,
- II_SWRE>,
- POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6;
- }
-
+ def LWL_MM : MMRel, LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12,
+ II_LWL>, LWL_FM_MM<0x0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LWR_MM : MMRel, LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12,
+ II_LWR>, LWL_FM_MM<0x1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SWL_MM : MMRel, StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12,
+ II_SWL>, LWL_FM_MM<0x8>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SWR_MM : MMRel, StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12,
+ II_SWR>, LWL_FM_MM<0x9>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+let DecoderNamespace = "MicroMips" in {
/// Load and Store Instructions - multiple
- def SWM32_MM : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>;
- def LWM32_MM : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>;
+ def SWM32_MM : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>, ISA_MICROMIPS;
+ def LWM32_MM : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>, ISA_MICROMIPS;
/// Load and Store Pair Instructions
- def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>;
- def LWP_MM : LoadPairMM<"lwp">, LWM_FM_MM<0x1>;
+ def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>, ISA_MICROMIPS;
+ def LWP_MM : LoadPairMM<"lwp">, LWM_FM_MM<0x1>, ISA_MICROMIPS;
/// Load and Store multiple pseudo Instructions
class LoadWordMultMM<string instr_asm > :
@@ -837,168 +913,207 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
!strconcat(instr_asm, "\t$rt, $addr")> ;
- def SWM_MM : StoreWordMultMM<"swm">;
- def LWM_MM : LoadWordMultMM<"lwm">;
+ def SWM_MM : StoreWordMultMM<"swm">, ISA_MICROMIPS;
+ def LWM_MM : LoadWordMultMM<"lwm">, ISA_MICROMIPS;
/// Move Conditional
def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
- NoItinerary>, ADD_FM_MM<0, 0x58>;
+ II_MOVZ>, ADD_FM_MM<0, 0x58>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
- NoItinerary>, ADD_FM_MM<0, 0x18>;
- def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT>,
- CMov_F_I_FM_MM<0x25>;
- def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF>,
- CMov_F_I_FM_MM<0x5>;
-
+ II_MOVN>, ADD_FM_MM<0, 0x18>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM_MM<0x25>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM_MM<0x5>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Move to/from HI/LO
def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>,
- MTLO_FM_MM<0x0b5>;
+ MTLO_FM_MM<0x0b5>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
- MTLO_FM_MM<0x0f5>;
+ MTLO_FM_MM<0x0f5>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>,
- MFLO_FM_MM<0x035>;
+ MFLO_FM_MM<0x035>, ISA_MICROMIPS32_NOT_MIPS32R6;
def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
- MFLO_FM_MM<0x075>;
+ MFLO_FM_MM<0x075>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Multiply Add/Sub Instructions
- def MADD_MM : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>;
- def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>;
- def MSUB_MM : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>;
- def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>;
+ def MADD_MM : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MSUB_MM : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Count Leading
def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM_MM<0x16c>,
- ISA_MIPS32;
+ ISA_MICROMIPS;
def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM_MM<0x12c>,
- ISA_MIPS32;
+ ISA_MICROMIPS;
/// Sign Ext In Register Instructions.
def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
- SEB_FM_MM<0x0ac>, ISA_MIPS32R2;
+ SEB_FM_MM<0x0ac>, ISA_MICROMIPS;
def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
- SEB_FM_MM<0x0ec>, ISA_MIPS32R2;
+ SEB_FM_MM<0x0ec>, ISA_MICROMIPS;
/// Word Swap Bytes Within Halfwords
def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>,
- SEB_FM_MM<0x1ec>, ISA_MIPS32R2;
+ SEB_FM_MM<0x1ec>, ISA_MICROMIPS;
// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, immZExt5,
- immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
+ immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
immZExt5, immZExt5Plus1>,
- EXT_FM_MM<0x0c>;
+ EXT_FM_MM<0x0c>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Jump Instructions
-}
-let DecoderNamespace = "MicroMips", DecoderMethod = "DecodeJumpTargetMM" in
- def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
- J_FM_MM<0x35>, AdditionalRequires<[RelocNotPIC]>,
- IsBranch, ISA_MICROMIPS32_NOT_MIPS32R6;
+ let DecoderMethod = "DecodeJumpTargetMM" in
+ def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+ J_FM_MM<0x35>, AdditionalRequires<[RelocNotPIC]>,
+ IsBranch, ISA_MICROMIPS32_NOT_MIPS32R6;
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
let DecoderMethod = "DecodeJumpTargetMM" in {
- def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
- def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>;
+ def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
}
def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>,
ISA_MICROMIPS32_NOT_MIPS32R6;
- def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+ def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Jump Instructions - Short Delay Slot
- def JALS_MM : JumpLinkMM<"jals", calltarget_mm>, J_FM_MM<0x1d>;
- def JALRS_MM : JumpLinkRegMM<"jalrs", GPR32Opnd>, JALR_FM_MM<0x13c>;
+ def JALS_MM : JumpLinkMM<"jals", calltarget_mm>, J_FM_MM<0x1d>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def JALRS_MM : JumpLinkRegMM<"jalrs", GPR32Opnd>, JALR_FM_MM<0x13c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Branch Instructions
def BEQ_MM : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
- BEQ_FM_MM<0x25>;
+ BEQ_FM_MM<0x25>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BNE_MM : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
- BEQ_FM_MM<0x2d>;
+ BEQ_FM_MM<0x2d>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
- BGEZ_FM_MM<0x2>;
+ BGEZ_FM_MM<0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
- BGEZ_FM_MM<0x6>;
+ BGEZ_FM_MM<0x6>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
- BGEZ_FM_MM<0x4>;
+ BGEZ_FM_MM<0x4>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
- BGEZ_FM_MM<0x0>;
+ BGEZ_FM_MM<0x0>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
- BGEZAL_FM_MM<0x03>;
+ BGEZAL_FM_MM<0x03>, ISA_MICROMIPS32_NOT_MIPS32R6;
def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
- BGEZAL_FM_MM<0x01>;
+ BGEZAL_FM_MM<0x01>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def BAL_BR_MM : BAL_BR_Pseudo<BGEZAL_MM, brtarget_mm>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Branch Instructions - Short Delay Slot
def BGEZALS_MM : BranchCompareToZeroLinkMM<"bgezals", brtarget_mm,
- GPR32Opnd>, BGEZAL_FM_MM<0x13>;
+ GPR32Opnd>, BGEZAL_FM_MM<0x13>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def BLTZALS_MM : BranchCompareToZeroLinkMM<"bltzals", brtarget_mm,
- GPR32Opnd>, BGEZAL_FM_MM<0x11>;
-}
-def B_MM : UncondBranch<BEQ_MM, brtarget_mm>, IsBranch, ISA_MICROMIPS;
-let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+ GPR32Opnd>, BGEZAL_FM_MM<0x11>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def B_MM : UncondBranch<BEQ_MM, brtarget_mm>, IsBranch,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Control Instructions
- def SYNC_MM : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
- def SYNCI_MM : MMRel, SYNCI_FT<"synci">, SYNCI_FM_MM;
- def BREAK_MM : MMRel, BRK_FT<"break">, BRK_FM_MM;
- def SYSCALL_MM : MMRel, SYS_FT<"syscall", uimm10, II_SYSCALL>, SYS_FM_MM;
- def WAIT_MM : WaitMM<"wait">, WAIT_FM_MM;
- def ERET_MM : MMRel, ER_FT<"eret", II_ERET>, ER_FM_MM<0x3cd>;
- def DERET_MM : MMRel, ER_FT<"deret", II_DERET>, ER_FM_MM<0x38d>;
+ def SYNC_MM : MMRel, SYNC_FT<"sync">, SYNC_FM_MM, ISA_MICROMIPS;
+ let DecoderMethod = "DecodeSyncI_MM" in
+ def SYNCI_MM : MMRel, SYNCI_FT<"synci", mem_mm_16>, SYNCI_FM_MM,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def BREAK_MM : MMRel, BRK_FT<"break">, BRK_FM_MM, ISA_MICROMIPS;
+ def SYSCALL_MM : MMRel, SYS_FT<"syscall", uimm10, II_SYSCALL>, SYS_FM_MM,
+ ISA_MICROMIPS;
+ def WAIT_MM : MMRel, WaitMM<"wait">, WAIT_FM_MM, ISA_MICROMIPS;
+ def ERET_MM : MMRel, ER_FT<"eret", II_ERET>, ER_FM_MM<0x3cd>,
+ ISA_MICROMIPS;
+ def DERET_MM : MMRel, ER_FT<"deret", II_DERET>, ER_FM_MM<0x38d>,
+ ISA_MICROMIPS;
def EI_MM : MMRel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM_MM<0x15d>,
- ISA_MIPS32R2;
+ ISA_MICROMIPS;
def DI_MM : MMRel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM_MM<0x11d>,
- ISA_MIPS32R2;
+ ISA_MICROMIPS;
+ def TRAP_MM : TrapBase<BREAK_MM>, ISA_MICROMIPS;
/// Trap Instructions
- def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm4, II_TEQ>, TEQ_FM_MM<0x0>;
- def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm4, II_TGE>, TEQ_FM_MM<0x08>;
+ def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm4, II_TEQ>, TEQ_FM_MM<0x0>,
+ ISA_MICROMIPS;
+ def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm4, II_TGE>, TEQ_FM_MM<0x08>,
+ ISA_MICROMIPS;
def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm4, II_TGEU>,
- TEQ_FM_MM<0x10>;
- def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm4, II_TLT>, TEQ_FM_MM<0x20>;
+ TEQ_FM_MM<0x10>, ISA_MICROMIPS;
+ def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm4, II_TLT>, TEQ_FM_MM<0x20>,
+ ISA_MICROMIPS;
def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm4, II_TLTU>,
- TEQ_FM_MM<0x28>;
- def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm4, II_TNE>, TEQ_FM_MM<0x30>;
+ TEQ_FM_MM<0x28>, ISA_MICROMIPS;
+ def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm4, II_TNE>, TEQ_FM_MM<0x30>,
+ ISA_MICROMIPS;
- def TEQI_MM : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM_MM<0x0e>;
- def TGEI_MM : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM_MM<0x09>;
+ def TEQI_MM : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM_MM<0x0e>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def TGEI_MM : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM_MM<0x09>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>,
- TEQI_FM_MM<0x0b>;
- def TLTI_MM : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM_MM<0x08>;
+ TEQI_FM_MM<0x0b>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def TLTI_MM : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM_MM<0x08>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>,
- TEQI_FM_MM<0x0a>;
- def TNEI_MM : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM_MM<0x0c>;
+ TEQI_FM_MM<0x0a>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def TNEI_MM : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM_MM<0x0c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
/// Load-linked, Store-conditional
- def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
- def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+ def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
- def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>;
- def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>;
+ def LLE_MM : MMRel, LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>,
+ ISA_MICROMIPS, ASE_EVA;
+ def SCE_MM : MMRel, SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>,
+ ISA_MICROMIPS, ASE_EVA;
let DecoderMethod = "DecodeCacheOpMM" in {
- def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12, II_CACHE>,
- CACHE_PREF_FM_MM<0x08, 0x6>;
- def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12, II_PREF>,
- CACHE_PREF_FM_MM<0x18, 0x2>;
+ def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12, II_CACHE>,
+ CACHE_PREF_FM_MM<0x08, 0x6>, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12, II_PREF>,
+ CACHE_PREF_FM_MM<0x18, 0x2>, ISA_MICROMIPS32_NOT_MIPS32R6;
}
let DecoderMethod = "DecodePrefeOpMM" in {
def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9, II_PREFE>,
- CACHE_PREFE_FM_MM<0x18, 0x2>;
+ CACHE_PREFE_FM_MM<0x18, 0x2>, ISA_MICROMIPS, ASE_EVA;
def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9, II_CACHEE>,
- CACHE_PREFE_FM_MM<0x18, 0x3>;
+ CACHE_PREFE_FM_MM<0x18, 0x3>, ISA_MICROMIPS, ASE_EVA;
}
- def SSNOP_MM : MMRel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM_MM<0x1>;
- def EHB_MM : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM_MM<0x3>;
- def PAUSE_MM : MMRel, Barrier<"pause", II_PAUSE>, BARRIER_FM_MM<0x5>;
-
- def TLBP_MM : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM_MM<0x0d>;
- def TLBR_MM : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM_MM<0x4d>;
- def TLBWI_MM : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM_MM<0x8d>;
- def TLBWR_MM : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM_MM<0xcd>;
-
- def SDBBP_MM : MMRel, SYS_FT<"sdbbp", uimm10, II_SDBBP>, SDBBP_FM_MM;
-
- def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>;
+ def SSNOP_MM : MMRel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM_MM<0x1>,
+ ISA_MICROMIPS;
+ def EHB_MM : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM_MM<0x3>,
+ ISA_MICROMIPS;
+ def PAUSE_MM : MMRel, Barrier<"pause", II_PAUSE>, BARRIER_FM_MM<0x5>,
+ ISA_MICROMIPS;
+
+ def TLBP_MM : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM_MM<0x0d>,
+ ISA_MICROMIPS;
+ def TLBR_MM : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM_MM<0x4d>,
+ ISA_MICROMIPS;
+ def TLBWI_MM : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM_MM<0x8d>,
+ ISA_MICROMIPS;
+ def TLBWR_MM : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM_MM<0xcd>,
+ ISA_MICROMIPS;
+
+ def SDBBP_MM : MMRel, SYS_FT<"sdbbp", uimm10, II_SDBBP>, SDBBP_FM_MM,
+ ISA_MICROMIPS;
+
+ def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
}
def TAILCALL_MM : TailCall<J_MM, jmptarget_mm>, ISA_MIPS1_NOT_32R6_64R6;
@@ -1017,89 +1132,177 @@ let DecoderNamespace = "MicroMips" in {
ISA_MICROMIPS32_NOT_MIPS32R6;
}
+let DecoderNamespace = "MicroMips" in {
+ def MFGC0_MM : MMRel, MfCop0MM<"mfgc0", GPR32Opnd, COP0Opnd, II_MFGC0>,
+ POOL32A_MFTC0_FM_MM<0b10011, 0b111100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def MFHGC0_MM : MMRel, MfCop0MM<"mfhgc0", GPR32Opnd, COP0Opnd, II_MFHGC0>,
+ POOL32A_MFTC0_FM_MM<0b10011, 0b110100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def MTGC0_MM : MMRel, MtCop0MM<"mtgc0", COP0Opnd, GPR32Opnd, II_MTGC0>,
+ POOL32A_MFTC0_FM_MM<0b11011, 0b111100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def MTHGC0_MM : MMRel, MtCop0MM<"mthgc0", COP0Opnd, GPR32Opnd, II_MTHGC0>,
+ POOL32A_MFTC0_FM_MM<0b11011, 0b110100>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def HYPCALL_MM : MMRel, HypcallMM<"hypcall">, POOL32A_HYPCALL_FM_MM,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGINV_MM : MMRel, TLBINVMM<"tlbginv", II_TLBGINV>,
+ POOL32A_TLBINV_FM_MM<0x105>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGINVF_MM : MMRel, TLBINVMM<"tlbginvf", II_TLBGINVF>,
+ POOL32A_TLBINV_FM_MM<0x145>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGP_MM : MMRel, TLBINVMM<"tlbgp", II_TLBGP>,
+ POOL32A_TLBINV_FM_MM<0x5>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGR_MM : MMRel, TLBINVMM<"tlbgr", II_TLBGR>,
+ POOL32A_TLBINV_FM_MM<0x45>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGWI_MM : MMRel, TLBINVMM<"tlbgwi", II_TLBGWI>,
+ POOL32A_TLBINV_FM_MM<0x85>, ISA_MICROMIPS32R5, ASE_VIRT;
+ def TLBGWR_MM : MMRel, TLBINVMM<"tlbgwr", II_TLBGWR>,
+ POOL32A_TLBINV_FM_MM<0xc5>, ISA_MICROMIPS32R5, ASE_VIRT;
+}
+
//===----------------------------------------------------------------------===//
// MicroMips arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
-let AdditionalPredicates = [InMicroMips] in {
- def : MipsPat<(i32 immLi16:$imm),
- (LI16_MM immLi16:$imm)>;
-
- defm : MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>;
-}
-
-let Predicates = [InMicroMips] in {
- def : MipsPat<(not GPRMM16:$in),
- (NOT16_MM GPRMM16:$in)>;
- def : MipsPat<(not GPR32:$in),
- (NOR_MM GPR32Opnd:$in, ZERO)>;
-
- def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
- (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
- def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
- (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>;
- def : MipsPat<(add GPR32:$src, immSExt16:$imm),
- (ADDiu_MM GPR32:$src, immSExt16:$imm)>;
-
- def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
- (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>;
- def : MipsPat<(and GPR32:$src, immZExt16:$imm),
- (ANDi_MM GPR32:$src, immZExt16:$imm)>;
-
- def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
- (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
- def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
- (SLL_MM GPR32:$src, immZExt5:$imm)>;
- def : MipsPat<(shl GPR32:$lhs, GPR32:$rhs),
- (SLLV_MM GPR32:$lhs, GPR32:$rhs)>;
-
- def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
- (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
- def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
- (SRL_MM GPR32:$src, immZExt5:$imm)>;
- def : MipsPat<(srl GPR32:$lhs, GPR32:$rhs),
- (SRLV_MM GPR32:$lhs, GPR32:$rhs)>;
-
- def : MipsPat<(sra GPR32:$src, immZExt5:$imm),
- (SRA_MM GPR32:$src, immZExt5:$imm)>;
- def : MipsPat<(sra GPR32:$lhs, GPR32:$rhs),
- (SRAV_MM GPR32:$lhs, GPR32:$rhs)>;
-
- def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
- (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>;
- def : MipsPat<(store GPR32:$src, addr:$addr),
- (SW_MM GPR32:$src, addr:$addr)>;
-
- def : MipsPat<(load addrimm4lsl2:$addr),
- (LW16_MM addrimm4lsl2:$addr)>;
- def : MipsPat<(load addr:$addr),
- (LW_MM addr:$addr)>;
- def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
- (SUBu_MM GPR32:$lhs, GPR32:$rhs)>;
-}
+defm : MipsHiLoRelocs<LUi_MM, ADDiu_MM, ZERO, GPR32Opnd>, ISA_MICROMIPS;
+
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi_MM tglobaladdr:$in)>,
+ ISA_MICROMIPS;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi_MM texternalsym:$in)>,
+ ISA_MICROMIPS;
+
+def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi_MM tglobaltlsaddr:$in)>,
+ ISA_MICROMIPS;
+
+// gp_rel relocs
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+ (ADDiu_MM GPR32:$gp, tglobaladdr:$in)>, ISA_MICROMIPS;
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+ (ADDiu_MM GPR32:$gp, tconstpool:$in)>, ISA_MICROMIPS;
+
+def : WrapperPat<tglobaladdr, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tconstpool, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<texternalsym, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tblockaddress, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tjumptable, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+def : WrapperPat<tglobaltlsaddr, ADDiu_MM, GPR32>, ISA_MICROMIPS;
+
+def : MipsPat<(atomic_load_8 addr:$a), (LB_MM addr:$a)>, ISA_MICROMIPS;
+def : MipsPat<(atomic_load_16 addr:$a), (LH_MM addr:$a)>, ISA_MICROMIPS;
+def : MipsPat<(atomic_load_32 addr:$a), (LW_MM addr:$a)>, ISA_MICROMIPS;
+
+def : MipsPat<(i32 immLi16:$imm),
+ (LI16_MM immLi16:$imm)>, ISA_MICROMIPS;
+
+defm : MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>, ISA_MICROMIPS;
+
+def : MipsPat<(not GPRMM16:$in),
+ (NOT16_MM GPRMM16:$in)>, ISA_MICROMIPS;
+def : MipsPat<(not GPR32:$in),
+ (NOR_MM GPR32Opnd:$in, ZERO)>, ISA_MICROMIPS;
+
+def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
+ (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
+ (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(add GPR32:$src, immSExt16:$imm),
+ (ADDiu_MM GPR32:$src, immSExt16:$imm)>, ISA_MICROMIPS;
+
+def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
+ (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(and GPR32:$src, immZExt16:$imm),
+ (ANDi_MM GPR32:$src, immZExt16:$imm)>, ISA_MICROMIPS;
+
+def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
+ (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
+ (SLL_MM GPR32:$src, immZExt5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(shl GPR32:$lhs, GPR32:$rhs),
+ (SLLV_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
+ (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
+ (SRL_MM GPR32:$src, immZExt5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(srl GPR32:$lhs, GPR32:$rhs),
+ (SRLV_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(sra GPR32:$src, immZExt5:$imm),
+ (SRA_MM GPR32:$src, immZExt5:$imm)>, ISA_MICROMIPS;
+def : MipsPat<(sra GPR32:$lhs, GPR32:$rhs),
+ (SRAV_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+ (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS;
+def : MipsPat<(store GPR32:$src, addr:$addr),
+ (SW_MM GPR32:$src, addr:$addr)>, ISA_MICROMIPS;
+
+def : MipsPat<(load addrimm4lsl2:$addr),
+ (LW16_MM addrimm4lsl2:$addr)>, ISA_MICROMIPS;
+def : MipsPat<(load addr:$addr),
+ (LW_MM addr:$addr)>, ISA_MICROMIPS;
+def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+ (SUBu_MM GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS;
+
+def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_MM addr:$src)>,
+ ISA_MICROMIPS;
+
+def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_MM addr:$src)>,
+ ISA_MICROMIPS;
+
+def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_MM addr:$src)>,
+ ISA_MICROMIPS;
+
+let AddedComplexity = 40 in
+ def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
+ (LH_MM addrRegImm:$a)>, ISA_MICROMIPS;
+
+
+def : MipsPat<(bswap GPR32:$rt), (ROTR_MM (WSBH_MM GPR32:$rt), 16)>,
+ ISA_MICROMIPS;
def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
(TAILCALL_MM tglobaladdr:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
(TAILCALL_MM texternalsym:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
-let AddedComplexity = 40 in {
- def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
- (LH_MM addrRegImm:$a)>;
-}
-def : MipsPat<(atomic_load_16 addr:$a),
- (LH_MM addr:$a)>;
-def : MipsPat<(i32 (extloadi16 addr:$src)),
- (LHu_MM addr:$src)>;
-
defm : BrcondPats<GPR32, BEQ_MM, BEQ_MM, BNE_MM, SLT_MM, SLTu_MM, SLTi_MM,
- SLTiu_MM, ZERO>;
+ SLTiu_MM, ZERO>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
+def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+ (BLEZ_MM i32:$lhs, bb:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+ (BGEZ_MM i32:$lhs, bb:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
+defm : SeteqPats<GPR32, SLTiu_MM, XOR_MM, SLTu_MM, ZERO>, ISA_MICROMIPS;
+defm : SetlePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>, ISA_MICROMIPS;
+defm : SetgtPats<GPR32, SLT_MM, SLTu_MM>, ISA_MICROMIPS;
+defm : SetgePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>, ISA_MICROMIPS;
+defm : SetgeImmPats<GPR32, XORi_MM, SLTi_MM, SLTiu_MM>, ISA_MICROMIPS;
+
+// Select patterns
+
+// Instantiation of conditional move patterns.
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_MM, SLT_MM, SLTu_MM, SLTi_MM, SLTiu_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_MM, XORi_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
-defm : SeteqPats<GPR32, SLTiu_MM, XOR_MM, SLTu_MM, ZERO>;
-defm : SetlePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
-defm : SetgtPats<GPR32, SLT_MM, SLTu_MM>;
-defm : SetgePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
-defm : SetgeImmPats<GPR32, XORi_MM, SLTi_MM, SLTiu_MM>;
+
+defm : MovnPats<GPR32, GPR32, MOVN_I_MM, XOR_MM>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+// Instantiation of conditional move patterns.
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_MM, SLT_MM, SLTu_MM, SLTi_MM, SLTiu_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_MM, XOR_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_MM, XORi_MM>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+defm : MovnPats<GPR32, GPR32, MOVN_I_MM, XOR_MM>, ISA_MICROMIPS32_NOT_MIPS32R6;
//===----------------------------------------------------------------------===//
// MicroMips instruction aliases
@@ -1111,17 +1314,29 @@ class UncondBranchMMPseudo<string opstr> :
def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS;
-let Predicates = [InMicroMips] in {
+let EncodingPredicates = [InMicroMips] in {
def SDIV_MM_Pseudo : MultDivPseudo<SDIV_MM, ACC64, GPR32Opnd, MipsDivRem,
II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
def UDIV_MM_Pseudo : MultDivPseudo<UDIV_MM, ACC64, GPR32Opnd, MipsDivRemU,
II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
- def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
- def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>;
- def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>;
- def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2;
- def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"neg $rt, $rs",
+ (SUB_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def : MipsInstAlias<"neg $rt",
+ (SUB_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def : MipsInstAlias<"negu $rt, $rs",
+ (SUBu_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def : MipsInstAlias<"negu $rt",
+ (SUBu_MM GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsInstAlias<"teq $rs, $rt",
(TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
def : MipsInstAlias<"tge $rs, $rt",
@@ -1166,32 +1381,64 @@ let Predicates = [InMicroMips] in {
(SRL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
def : MipsInstAlias<"rotr $rt, $imm",
(ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
- def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
+ def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi_MM>;
+ def : MipsInstAlias<"sync", (SYNC_MM 0), 1>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi_MM>, ISA_MICROMIPS;
- defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu_MM>;
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi_MM>, ISA_MICROMIPS;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu_MM>, ISA_MICROMIPS;
def : MipsInstAlias<"not $rt, $rs",
- (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+ (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsInstAlias<"not $rt",
- (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
+ (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def : MipsInstAlias<"bnez $rs,$offset",
- (BNE_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+ (BNE_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MICROMIPS;
def : MipsInstAlias<"beqz $rs,$offset",
- (BEQ_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+ (BEQ_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MICROMIPS;
def : MipsInstAlias<"seh $rd", (SEH_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
- ISA_MIPS32R2_NOT_32R6_64R6;
+ ISA_MICROMIPS;
def : MipsInstAlias<"seb $rd", (SEB_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
- ISA_MIPS32R2_NOT_32R6_64R6;
-}
+ ISA_MICROMIPS;
+ def : MipsInstAlias<"break", (BREAK_MM 0, 0), 1>, ISA_MICROMIPS;
+ def : MipsInstAlias<"break $imm", (BREAK_MM uimm10:$imm, 0), 1>,
+ ISA_MICROMIPS;
+ def : MipsInstAlias<"bal $offset", (BGEZAL_MM ZERO, brtarget_mm:$offset), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+ def : MipsInstAlias<"j $rs", (JR_MM GPR32Opnd:$rs), 0>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+def : MipsInstAlias<"rdhwr $rt, $rs",
+ (RDHWR_MM GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+
+def : MipsInstAlias<"hypcall", (HYPCALL_MM 0), 1>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mfgc0 $rt, $rs",
+ (MFGC0_MM GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mfhgc0 $rt, $rs",
+ (MFHGC0_MM GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mtgc0 $rt, $rs",
+ (MTGC0_MM COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"mthgc0 $rt, $rs",
+ (MTHGC0_MM COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+ ISA_MICROMIPS32R5, ASE_VIRT;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp b/contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
index f2e014084e46..568cdfb5b110 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -10,7 +10,6 @@
/// This pass is used to reduce the size of instructions where applicable.
///
/// TODO: Implement microMIPS64 support.
-/// TODO: Implement support for reducing into lwp/swp instruction.
//===----------------------------------------------------------------------===//
#include "Mips.h"
#include "MipsInstrInfo.h"
@@ -22,8 +21,10 @@
using namespace llvm;
#define DEBUG_TYPE "micromips-reduce-size"
+#define MICROMIPS_SIZE_REDUCE_NAME "MicroMips instruction size reduce pass"
-STATISTIC(NumReduced, "Number of 32-bit instructions reduced to 16-bit ones");
+STATISTIC(NumReduced, "Number of instructions reduced (32-bit to 16-bit ones, "
+                      "or two instructions into one)");
namespace {
@@ -35,12 +36,15 @@ enum OperandTransfer {
OT_Operands02, ///< Transfer operands 0 and 2
OT_Operand2, ///< Transfer just operand 2
OT_OperandsXOR, ///< Transfer operands for XOR16
+ OT_OperandsLwp, ///< Transfer operands for LWP
+ OT_OperandsSwp, ///< Transfer operands for SWP
};
/// Reduction type
// TODO: Will be extended when additional optimizations are added
enum ReduceType {
- RT_OneInstr ///< Reduce one instruction into a smaller instruction
+ RT_TwoInstr, ///< Reduce two instructions into one instruction
+ RT_OneInstr ///< Reduce one instruction into a smaller instruction
};
// Information about immediate field restrictions
@@ -76,21 +80,22 @@ struct OpCodes {
unsigned NarrowOpc; ///< Narrow opcode
};
+typedef struct ReduceEntryFunArgs ReduceEntryFunArgs;
+
/// ReduceTable - A static table with information on mapping from wide
/// opcodes to narrow
struct ReduceEntry {
enum ReduceType eRType; ///< Reduction type
bool (*ReduceFunction)(
- MachineInstr *MI,
- const ReduceEntry &Entry); ///< Pointer to reduce function
- struct OpCodes Ops; ///< All relevant OpCodes
- struct OpInfo OpInf; ///< Characteristics of operands
- struct ImmField Imm; ///< Characteristics of immediate field
+ ReduceEntryFunArgs *Arguments); ///< Pointer to reduce function
+ struct OpCodes Ops; ///< All relevant OpCodes
+ struct OpInfo OpInf; ///< Characteristics of operands
+ struct ImmField Imm; ///< Characteristics of immediate field
ReduceEntry(enum ReduceType RType, struct OpCodes Op,
- bool (*F)(MachineInstr *MI, const ReduceEntry &Entry),
- struct OpInfo OpInf, struct ImmField Imm)
+ bool (*F)(ReduceEntryFunArgs *Arguments), struct OpInfo OpInf,
+ struct ImmField Imm)
: eRType(RType), ReduceFunction(F), Ops(Op), OpInf(OpInf), Imm(Imm) {}
unsigned NarrowOpc() const { return Ops.NarrowOpc; }
@@ -113,6 +118,20 @@ struct ReduceEntry {
}
};
+// Function arguments for ReduceFunction
+struct ReduceEntryFunArgs {
+ MachineInstr *MI; // Instruction
+ const ReduceEntry &Entry; // Entry field
+ MachineBasicBlock::instr_iterator
+ &NextMII; // Iterator to next instruction in block
+
+ ReduceEntryFunArgs(MachineInstr *argMI, const ReduceEntry &argEntry,
+ MachineBasicBlock::instr_iterator &argNextMII)
+ : MI(argMI), Entry(argEntry), NextMII(argNextMII) {}
+};
+
+typedef llvm::SmallVector<ReduceEntry, 32> ReduceEntryVector;
+
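
A side note on the structure of this change: bundling the arguments into ReduceEntryFunArgs lets every reduce routine share a single signature, so each ReduceEntry can carry a plain function pointer next to the data its rule needs. A minimal standalone sketch of that dispatch scheme, in plain C++ with hypothetical stand-in types rather than the LLVM ones:

    #include <vector>

    struct Args { int Value; };                 // stand-in for ReduceEntryFunArgs

    struct Rule {
      int WideKind;                             // stand-in for the wide opcode
      bool (*Reduce)(Args *);                   // uniform reduce callback
    };

    // One example rule body: succeeds only when it can actually shrink the value.
    static bool halveIfEven(Args *A) {
      if (A->Value % 2 != 0)
        return false;
      A->Value /= 2;
      return true;
    }

    // Try every rule registered for Kind until one of them succeeds.
    static bool applyRules(const std::vector<Rule> &Rules, int Kind, Args &A) {
      for (const Rule &R : Rules)
        if (R.WideKind == Kind && R.Reduce(&A))
          return true;
      return false;
    }
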
class MicroMipsSizeReduce : public MachineFunctionPass {
public:
static char ID;
@@ -132,42 +151,50 @@ private:
bool ReduceMBB(MachineBasicBlock &MBB);
/// Attempts to reduce MI, returns true on success.
- bool ReduceMI(const MachineBasicBlock::instr_iterator &MII);
+ bool ReduceMI(const MachineBasicBlock::instr_iterator &MII,
+ MachineBasicBlock::instr_iterator &NextMII);
// Attempts to reduce LW/SW instruction into LWSP/SWSP,
// returns true on success.
- static bool ReduceXWtoXWSP(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceXWtoXWSP(ReduceEntryFunArgs *Arguments);
+
+ // Attempts to reduce two LW/SW instructions into an LWP/SWP instruction,
+ // returns true on success.
+ static bool ReduceXWtoXWP(ReduceEntryFunArgs *Arguments);
// Attempts to reduce LBU/LHU instruction into LBU16/LHU16,
// returns true on success.
- static bool ReduceLXUtoLXU16(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceLXUtoLXU16(ReduceEntryFunArgs *Arguments);
// Attempts to reduce SB/SH instruction into SB16/SH16,
// returns true on success.
- static bool ReduceSXtoSX16(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceSXtoSX16(ReduceEntryFunArgs *Arguments);
// Attempts to reduce arithmetic instructions, returns true on success.
- static bool ReduceArithmeticInstructions(MachineInstr *MI,
- const ReduceEntry &Entry);
+ static bool ReduceArithmeticInstructions(ReduceEntryFunArgs *Arguments);
// Attempts to reduce ADDIU into ADDIUSP instruction,
// returns true on success.
- static bool ReduceADDIUToADDIUSP(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceADDIUToADDIUSP(ReduceEntryFunArgs *Arguments);
// Attempts to reduce ADDIU into ADDIUR1SP instruction,
// returns true on success.
- static bool ReduceADDIUToADDIUR1SP(MachineInstr *MI,
- const ReduceEntry &Entry);
+ static bool ReduceADDIUToADDIUR1SP(ReduceEntryFunArgs *Arguments);
// Attempts to reduce XOR into XOR16 instruction,
// returns true on success.
- static bool ReduceXORtoXOR16(MachineInstr *MI, const ReduceEntry &Entry);
+ static bool ReduceXORtoXOR16(ReduceEntryFunArgs *Arguments);
- // Changes opcode of an instruction.
- static bool ReplaceInstruction(MachineInstr *MI, const ReduceEntry &Entry);
+ // Changes the opcode of an instruction, replaces an instruction with a
+ // new one, or replaces two instructions with a new instruction,
+ // depending on their order, i.e. whether they are consecutive forward
+ // or consecutive backward.
+ static bool ReplaceInstruction(MachineInstr *MI, const ReduceEntry &Entry,
+ MachineInstr *MI2 = nullptr,
+ bool ConsecutiveForward = true);
// Table with transformation rules for each instruction.
- static llvm::SmallVector<ReduceEntry, 16> ReduceTable;
+ static ReduceEntryVector ReduceTable;
};
char MicroMipsSizeReduce::ID = 0;
@@ -175,7 +202,7 @@ const MipsInstrInfo *MicroMipsSizeReduce::MipsII;
// This table must be sorted by WideOpc as a main criterion and
// ReduceType as a sub-criterion (when wide opcodes are the same).
-llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
+ReduceEntryVector MicroMipsSizeReduce::ReduceTable = {
// ReduceType, OpCodes, ReduceFunction,
// OpInfo(TransferOperands),
@@ -200,12 +227,20 @@ llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)},
{RT_OneInstr, OpCodes(Mips::LEA_ADDiu, Mips::ADDIUR1SP_MM),
ReduceADDIUToADDIUR1SP, OpInfo(OT_Operands02), ImmField(2, 0, 64, 2)},
+ {RT_OneInstr, OpCodes(Mips::LEA_ADDiu_MM, Mips::ADDIUR1SP_MM),
+ ReduceADDIUToADDIUR1SP, OpInfo(OT_Operands02), ImmField(2, 0, 64, 2)},
{RT_OneInstr, OpCodes(Mips::LHu, Mips::LHU16_MM), ReduceLXUtoLXU16,
OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
{RT_OneInstr, OpCodes(Mips::LHu_MM, Mips::LHU16_MM), ReduceLXUtoLXU16,
OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_TwoInstr, OpCodes(Mips::LW, Mips::LWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsLwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::LW, Mips::LWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_TwoInstr, OpCodes(Mips::LW16_MM, Mips::LWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsLwp), ImmField(0, -2048, 2048, 2)},
+ {RT_TwoInstr, OpCodes(Mips::LW_MM, Mips::LWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsLwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::LW_MM, Mips::LWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
{RT_OneInstr, OpCodes(Mips::SB, Mips::SB16_MM), ReduceSXtoSX16,
@@ -222,15 +257,24 @@ llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
{RT_OneInstr, OpCodes(Mips::SUBu_MM, Mips::SUBU16_MM),
ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
ImmField(0, 0, 0, -1)},
+ {RT_TwoInstr, OpCodes(Mips::SW, Mips::SWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsSwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::SW, Mips::SWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_TwoInstr, OpCodes(Mips::SW16_MM, Mips::SWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsSwp), ImmField(0, -2048, 2048, 2)},
+ {RT_TwoInstr, OpCodes(Mips::SW_MM, Mips::SWP_MM), ReduceXWtoXWP,
+ OpInfo(OT_OperandsSwp), ImmField(0, -2048, 2048, 2)},
{RT_OneInstr, OpCodes(Mips::SW_MM, Mips::SWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
{RT_OneInstr, OpCodes(Mips::XOR, Mips::XOR16_MM), ReduceXORtoXOR16,
OpInfo(OT_OperandsXOR), ImmField(0, 0, 0, -1)},
{RT_OneInstr, OpCodes(Mips::XOR_MM, Mips::XOR16_MM), ReduceXORtoXOR16,
OpInfo(OT_OperandsXOR), ImmField(0, 0, 0, -1)}};
-} // namespace
+} // end anonymous namespace
+
+INITIALIZE_PASS(MicroMipsSizeReduce, DEBUG_TYPE, MICROMIPS_SIZE_REDUCE_NAME,
+ false, false)
// Returns true if the machine operand MO is register SP.
static bool IsSP(const MachineOperand &MO) {
@@ -297,37 +341,100 @@ static bool ImmInRange(MachineInstr *MI, const ReduceEntry &Entry) {
return true;
}
+// Returns true if MI can be reduced to an lwp/swp instruction
+static bool CheckXWPInstr(MachineInstr *MI, bool ReduceToLwp,
+ const ReduceEntry &Entry) {
+
+ if (ReduceToLwp &&
+ !(MI->getOpcode() == Mips::LW || MI->getOpcode() == Mips::LW_MM ||
+ MI->getOpcode() == Mips::LW16_MM))
+ return false;
+
+ if (!ReduceToLwp &&
+ !(MI->getOpcode() == Mips::SW || MI->getOpcode() == Mips::SW_MM ||
+ MI->getOpcode() == Mips::SW16_MM))
+ return false;
+
+ unsigned reg = MI->getOperand(0).getReg();
+ if (reg == Mips::RA)
+ return false;
+
+ if (!ImmInRange(MI, Entry))
+ return false;
+
+ if (ReduceToLwp && (MI->getOperand(0).getReg() == MI->getOperand(1).getReg()))
+ return false;
+
+ return true;
+}
+
+// Returns true if the registers Reg1 and Reg2 are consecutive
+static bool ConsecutiveRegisters(unsigned Reg1, unsigned Reg2) {
+ static SmallVector<unsigned, 31> Registers = {
+ Mips::AT, Mips::V0, Mips::V1, Mips::A0, Mips::A1, Mips::A2, Mips::A3,
+ Mips::T0, Mips::T1, Mips::T2, Mips::T3, Mips::T4, Mips::T5, Mips::T6,
+ Mips::T7, Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
+ Mips::S6, Mips::S7, Mips::T8, Mips::T9, Mips::K0, Mips::K1, Mips::GP,
+ Mips::SP, Mips::FP, Mips::RA};
+
+ for (uint8_t i = 0; i < Registers.size() - 1; i++) {
+ if (Registers[i] == Reg1) {
+ if (Registers[i + 1] == Reg2)
+ return true;
+ else
+ return false;
+ }
+ }
+ return false;
+}
+
+// Returns true if the value registers and memory offsets of MI1 and MI2
+// are consecutive
+static bool ConsecutiveInstr(MachineInstr *MI1, MachineInstr *MI2) {
+
+ int64_t Offset1, Offset2;
+ if (!GetImm(MI1, 2, Offset1))
+ return false;
+ if (!GetImm(MI2, 2, Offset2))
+ return false;
+
+ unsigned Reg1 = MI1->getOperand(0).getReg();
+ unsigned Reg2 = MI2->getOperand(0).getReg();
+
+ return ((Offset1 == (Offset2 - 4)) && (ConsecutiveRegisters(Reg1, Reg2)));
+}
+
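The two helpers above carry the whole pairing condition; the explicit register table is needed because LLVM's register enumerators are not numbered in GPR order. A minimal standalone sketch of the same test with plain $-register numbers and hypothetical names, not the MachineInstr API:

    #include <cstdint>

    // With plain GPR numbers ($1..$31), "consecutive" is simple +1 adjacency.
    static bool consecutiveRegs(unsigned Reg1, unsigned Reg2) {
      return Reg2 == Reg1 + 1;
    }

    // Two word accesses can pair when the second one sits exactly one word
    // (4 bytes) above the first and their value registers are consecutive,
    // mirroring the Offset1 == Offset2 - 4 check above.
    static bool canPair(unsigned Reg1, int64_t Offset1,
                        unsigned Reg2, int64_t Offset2) {
      return Offset1 == Offset2 - 4 && consecutiveRegs(Reg1, Reg2);
    }
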
MicroMipsSizeReduce::MicroMipsSizeReduce() : MachineFunctionPass(ID) {}
-bool MicroMipsSizeReduce::ReduceMI(
- const MachineBasicBlock::instr_iterator &MII) {
+bool MicroMipsSizeReduce::ReduceMI(const MachineBasicBlock::instr_iterator &MII,
+ MachineBasicBlock::instr_iterator &NextMII) {
MachineInstr *MI = &*MII;
unsigned Opcode = MI->getOpcode();
// Search the table.
- llvm::SmallVector<ReduceEntry, 16>::const_iterator Start =
- std::begin(ReduceTable);
- llvm::SmallVector<ReduceEntry, 16>::const_iterator End =
- std::end(ReduceTable);
+ ReduceEntryVector::const_iterator Start = std::begin(ReduceTable);
+ ReduceEntryVector::const_iterator End = std::end(ReduceTable);
- std::pair<llvm::SmallVector<ReduceEntry, 16>::const_iterator,
- llvm::SmallVector<ReduceEntry, 16>::const_iterator>
+ std::pair<ReduceEntryVector::const_iterator,
+ ReduceEntryVector::const_iterator>
Range = std::equal_range(Start, End, Opcode);
if (Range.first == Range.second)
return false;
- for (llvm::SmallVector<ReduceEntry, 16>::const_iterator Entry = Range.first;
- Entry != Range.second; ++Entry)
- if (((*Entry).ReduceFunction)(&(*MII), *Entry))
+ for (ReduceEntryVector::const_iterator Entry = Range.first;
+ Entry != Range.second; ++Entry) {
+ ReduceEntryFunArgs Arguments(&(*MII), *Entry, NextMII);
+ if (((*Entry).ReduceFunction)(&Arguments))
return true;
-
+ }
return false;
}
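
ReduceMI depends on the table staying sorted by wide opcode: std::equal_range then returns every rule registered for the current opcode (for Mips::LW, for instance, both the LWP and the LWSP entries), and they are tried in table order until one reduce function succeeds. A small sketch of that lookup pattern with hypothetical stand-in types:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct Entry {
      unsigned WideOpc;    // sort key
      unsigned NarrowOpc;
    };

    // Heterogeneous comparisons so equal_range can search by opcode directly.
    static bool operator<(const Entry &E, unsigned Opc) { return E.WideOpc < Opc; }
    static bool operator<(unsigned Opc, const Entry &E) { return Opc < E.WideOpc; }

    // Returns how many rules would be tried for Opcode before giving up.
    static std::size_t candidatesFor(const std::vector<Entry> &Table,
                                     unsigned Opcode) {
      auto Range = std::equal_range(Table.begin(), Table.end(), Opcode);
      return static_cast<std::size_t>(Range.second - Range.first);
    }
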
-bool MicroMipsSizeReduce::ReduceXWtoXWSP(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceXWtoXWSP(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -338,8 +445,51 @@ bool MicroMipsSizeReduce::ReduceXWtoXWSP(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
+bool MicroMipsSizeReduce::ReduceXWtoXWP(ReduceEntryFunArgs *Arguments) {
+
+ const ReduceEntry &Entry = Arguments->Entry;
+ MachineBasicBlock::instr_iterator &NextMII = Arguments->NextMII;
+ const MachineBasicBlock::instr_iterator &E =
+ Arguments->MI->getParent()->instr_end();
+
+ if (NextMII == E)
+ return false;
+
+ MachineInstr *MI1 = Arguments->MI;
+ MachineInstr *MI2 = &*NextMII;
+
+ // ReduceToLwp is true when reducing to LWP and false when reducing to SWP.
+ bool ReduceToLwp = (MI1->getOpcode() == Mips::LW) ||
+ (MI1->getOpcode() == Mips::LW_MM) ||
+ (MI1->getOpcode() == Mips::LW16_MM);
+
+ if (!CheckXWPInstr(MI1, ReduceToLwp, Entry))
+ return false;
+
+ if (!CheckXWPInstr(MI2, ReduceToLwp, Entry))
+ return false;
+
+ unsigned Reg1 = MI1->getOperand(1).getReg();
+ unsigned Reg2 = MI2->getOperand(1).getReg();
+
+ if (Reg1 != Reg2)
+ return false;
+
+ bool ConsecutiveForward = ConsecutiveInstr(MI1, MI2);
+ bool ConsecutiveBackward = ConsecutiveInstr(MI2, MI1);
+
+ if (!(ConsecutiveForward || ConsecutiveBackward))
+ return false;
+
+ NextMII = std::next(NextMII);
+ return ReplaceInstruction(MI1, Entry, MI2, ConsecutiveForward);
+}
+
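Also worth noting: once a pair has been formed, NextMII is advanced past the partner instruction so it is not visited (and reduced) a second time by ReduceMBB. A generic sketch of that scan-and-skip pattern, with a hypothetical predicate rather than the pass's real machinery:

    #include <cstddef>
    #include <vector>

    // Walks adjacent pairs; when tryPair merges one, the partner is skipped.
    template <typename T, typename TryPair>
    static unsigned scanAndPair(const std::vector<T> &Items, TryPair tryPair) {
      unsigned Merged = 0;
      for (std::size_t I = 0; I < Items.size(); ++I) {
        if (I + 1 < Items.size() && tryPair(Items[I], Items[I + 1])) {
          ++Merged;
          ++I; // consume the partner as well
        }
      }
      return Merged;
    }
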
bool MicroMipsSizeReduce::ReduceArithmeticInstructions(
- MachineInstr *MI, const ReduceEntry &Entry) {
+ ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!isMMThreeBitGPRegister(MI->getOperand(0)) ||
!isMMThreeBitGPRegister(MI->getOperand(1)) ||
@@ -349,8 +499,11 @@ bool MicroMipsSizeReduce::ReduceArithmeticInstructions(
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceADDIUToADDIUR1SP(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceADDIUToADDIUR1SP(
+ ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -361,8 +514,10 @@ bool MicroMipsSizeReduce::ReduceADDIUToADDIUR1SP(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceADDIUToADDIUSP(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceADDIUToADDIUSP(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
int64_t ImmValue;
if (!GetImm(MI, Entry.ImmField(), ImmValue))
@@ -377,8 +532,10 @@ bool MicroMipsSizeReduce::ReduceADDIUToADDIUSP(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceLXUtoLXU16(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -390,8 +547,10 @@ bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceSXtoSX16(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
if (!ImmInRange(MI, Entry))
return false;
@@ -403,8 +562,11 @@ bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI,
return ReplaceInstruction(MI, Entry);
}
-bool MicroMipsSizeReduce::ReduceXORtoXOR16(MachineInstr *MI,
- const ReduceEntry &Entry) {
+bool MicroMipsSizeReduce::ReduceXORtoXOR16(ReduceEntryFunArgs *Arguments) {
+
+ MachineInstr *MI = Arguments->MI;
+ const ReduceEntry &Entry = Arguments->Entry;
+
if (!isMMThreeBitGPRegister(MI->getOperand(0)) ||
!isMMThreeBitGPRegister(MI->getOperand(1)) ||
!isMMThreeBitGPRegister(MI->getOperand(2)))
@@ -433,23 +595,25 @@ bool MicroMipsSizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
continue;
// Try to reduce 32-bit instruction into 16-bit instruction
- Modified |= ReduceMI(MII);
+ Modified |= ReduceMI(MII, NextMII);
}
return Modified;
}
bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI,
- const ReduceEntry &Entry) {
+ const ReduceEntry &Entry,
+ MachineInstr *MI2,
+ bool ConsecutiveForward) {
enum OperandTransfer OpTransfer = Entry.TransferOperands();
- DEBUG(dbgs() << "Converting 32-bit: " << *MI);
+ LLVM_DEBUG(dbgs() << "Converting 32-bit: " << *MI);
++NumReduced;
if (OpTransfer == OT_OperandsAll) {
MI->setDesc(MipsII->get(Entry.NarrowOpc()));
- DEBUG(dbgs() << " to 16-bit: " << *MI);
+ LLVM_DEBUG(dbgs() << " to 16-bit: " << *MI);
return true;
} else {
MachineBasicBlock &MBB = *MI->getParent();
@@ -477,6 +641,27 @@ bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI,
}
break;
}
+ case OT_OperandsLwp:
+ case OT_OperandsSwp: {
+ if (ConsecutiveForward) {
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI2->getOperand(0));
+ MIB.add(MI->getOperand(1));
+ MIB.add(MI->getOperand(2));
+ } else { // consecutive backward
+ MIB.add(MI2->getOperand(0));
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI2->getOperand(1));
+ MIB.add(MI2->getOperand(2));
+ }
+
+ LLVM_DEBUG(dbgs() << "and converting 32-bit: " << *MI2
+ << " to: " << *MIB);
+
+ MBB.erase_instr(MI);
+ MBB.erase_instr(MI2);
+ return true;
+ }
default:
llvm_unreachable("Unknown operand transfer!");
}
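
In the OT_OperandsLwp/OT_OperandsSwp case above, the access with the lower offset always contributes the first value register and the address operands, whichever of the two instructions appeared first in the block. A standalone sketch of that selection with hypothetical stand-in structs instead of MachineInstrBuilder:

    #include <cstdint>

    struct WordAccess {   // stand-in for one LW/SW operand set
      unsigned Dst;       // value register
      unsigned Base;      // base address register
      int64_t Offset;     // immediate offset
    };

    struct PairedAccess { // merged LWP/SWP-style operand list
      unsigned Dst0, Dst1;
      unsigned Base;
      int64_t Offset;
    };

    // "Forward" means the first instruction holds the lower offset; otherwise
    // the roles are swapped, exactly as in the two branches above.
    static PairedAccess mergePair(const WordAccess &First,
                                  const WordAccess &Second,
                                  bool ConsecutiveForward) {
      const WordAccess &Lo = ConsecutiveForward ? First : Second;
      const WordAccess &Hi = ConsecutiveForward ? Second : First;
      return {Lo.Dst, Hi.Dst, Lo.Base, Lo.Offset};
    }
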
@@ -484,7 +669,7 @@ bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI,
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
- DEBUG(dbgs() << " to 16-bit: " << *MIB);
+ LLVM_DEBUG(dbgs() << " to 16-bit: " << *MIB);
MBB.erase_instr(MI);
return true;
}
@@ -511,6 +696,6 @@ bool MicroMipsSizeReduce::runOnMachineFunction(MachineFunction &MF) {
}
/// Returns an instance of the MicroMips size reduction pass.
-FunctionPass *llvm::createMicroMipsSizeReductionPass() {
+FunctionPass *llvm::createMicroMipsSizeReducePass() {
return new MicroMipsSizeReduce();
}
diff --git a/contrib/llvm/lib/Target/Mips/Mips.h b/contrib/llvm/lib/Target/Mips/Mips.h
index 008b9505ee26..ef3a807c7648 100644
--- a/contrib/llvm/lib/Target/Mips/Mips.h
+++ b/contrib/llvm/lib/Target/Mips/Mips.h
@@ -22,6 +22,11 @@ namespace llvm {
class MipsTargetMachine;
class ModulePass;
class FunctionPass;
+ class MipsRegisterBankInfo;
+ class MipsSubtarget;
+ class MipsTargetMachine;
+ class InstructionSelector;
+ class PassRegistry;
ModulePass *createMipsOs16Pass();
ModulePass *createMips16HardFloatPass();
@@ -29,10 +34,18 @@ namespace llvm {
FunctionPass *createMipsModuleISelDagPass();
FunctionPass *createMipsOptimizePICCallPass();
FunctionPass *createMipsDelaySlotFillerPass();
- FunctionPass *createMipsHazardSchedule();
- FunctionPass *createMipsLongBranchPass();
+ FunctionPass *createMipsBranchExpansion();
FunctionPass *createMipsConstantIslandPass();
- FunctionPass *createMicroMipsSizeReductionPass();
+ FunctionPass *createMicroMipsSizeReducePass();
+ FunctionPass *createMipsExpandPseudoPass();
+
+ InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &,
+ MipsSubtarget &,
+ MipsRegisterBankInfo &);
+
+ void initializeMipsDelaySlotFillerPass(PassRegistry &);
+ void initializeMipsBranchExpansionPass(PassRegistry &);
+ void initializeMicroMipsSizeReducePass(PassRegistry &);
} // end namespace llvm;
#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td
index f8e739497f4c..2f3a1c399d3e 100644
--- a/contrib/llvm/lib/Target/Mips/Mips.td
+++ b/contrib/llvm/lib/Target/Mips/Mips.td
@@ -28,8 +28,10 @@ class PredicateControl {
list<Predicate> PTRPredicates = [];
// Predicates for the FGR size and layout such as IsFP64bit
list<Predicate> FGRPredicates = [];
- // Predicates for the instruction group membership such as ISA's and ASE's
+ // Predicates for the instruction group membership such as ISA's.
list<Predicate> InsnPredicates = [];
+ // Predicate for the ASE that an instruction belongs to.
+ list<Predicate> ASEPredicate = [];
// Predicate for marking the instruction as usable in hard-float mode only.
list<Predicate> HardFloatPredicate = [];
// Predicates for anything else
@@ -40,6 +42,7 @@ class PredicateControl {
FGRPredicates,
InsnPredicates,
HardFloatPredicate,
+ ASEPredicate,
AdditionalPredicates);
}
@@ -56,6 +59,7 @@ include "MipsRegisterInfo.td"
include "MipsSchedule.td"
include "MipsInstrInfo.td"
include "MipsCallingConv.td"
+include "MipsRegisterBanks.td"
// Avoid forward declaration issues.
include "MipsScheduleP5600.td"
@@ -173,6 +177,14 @@ def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Mips R6 CRC ASE">;
+
+def FeatureVirt : SubtargetFeature<"virt", "HasVirt", "true",
+ "Mips Virtualization ASE">;
+
+def FeatureGINV : SubtargetFeature<"ginv", "HasGINV", "true",
+ "Mips Global Invalidate ASE">;
+
def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
"microMips mode">;
@@ -242,4 +254,5 @@ def Mips : Target {
let InstructionSet = MipsInstrInfo;
let AssemblyParsers = [MipsAsmParser];
let AssemblyParserVariants = [MipsAsmParserVariant];
+ let AllowRegisterRenaming = 1;
}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
index cb59e2ddb1c6..122c1f5377b6 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -42,7 +42,6 @@ Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI)
void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
const Mips16InstrInfo &TII =
*static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
@@ -92,11 +91,11 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
MachineFrameInfo &MFI = MF.getFrameInfo();
const Mips16InstrInfo &TII =
*static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
- DebugLoc dl = MBBI->getDebugLoc();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
uint64_t StackSize = MFI.getStackSize();
if (!StackSize)
@@ -117,7 +116,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *EntryBlock = &MF->front();
//
// Registers RA, S0,S1 are the callee saved registers and they
@@ -134,7 +132,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
&& MF->getFrameInfo().isReturnAddressTaken();
if (!IsRAAndRetAddrIsTaken)
- EntryBlock->addLiveIn(Reg);
+ MBB.addLiveIn(Reg);
}
return true;
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
index 682ea5c4ed7f..c310d9491af8 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -482,11 +482,11 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
// remove the use-soft-float attribute
static void removeUseSoftFloat(Function &F) {
AttrBuilder B;
- DEBUG(errs() << "removing -use-soft-float\n");
+ LLVM_DEBUG(errs() << "removing -use-soft-float\n");
B.addAttribute("use-soft-float", "false");
F.removeAttributes(AttributeList::FunctionIndex, B);
if (F.hasFnAttribute("use-soft-float")) {
- DEBUG(errs() << "still has -use-soft-float\n");
+ LLVM_DEBUG(errs() << "still has -use-soft-float\n");
}
F.addAttributes(AttributeList::FunctionIndex, B);
}
@@ -510,7 +510,7 @@ static void removeUseSoftFloat(Function &F) {
bool Mips16HardFloat::runOnModule(Module &M) {
auto &TM = static_cast<const MipsTargetMachine &>(
getAnalysis<TargetPassConfig>().getTM<TargetMachine>());
- DEBUG(errs() << "Run on Module Mips16HardFloat\n");
+ LLVM_DEBUG(errs() << "Run on Module Mips16HardFloat\n");
bool Modified = false;
for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
if (F->hasFnAttribute("nomips16") &&
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index ce193b1734f3..a0d5bd9ef305 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -192,41 +192,6 @@ bool Mips16DAGToDAGISel::trySelect(SDNode *Node) {
default:
break;
- case ISD::SUBE:
- case ISD::ADDE: {
- SDValue InFlag = Node->getOperand(2), CmpLHS;
- unsigned Opc = InFlag.getOpcode();
- (void)Opc;
- assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
- (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
- "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
-
- unsigned MOp;
- if (Opcode == ISD::ADDE) {
- CmpLHS = InFlag.getValue(0);
- MOp = Mips::AdduRxRyRz16;
- } else {
- CmpLHS = InFlag.getOperand(0);
- MOp = Mips::SubuRxRyRz16;
- }
-
- SDValue Ops[] = {CmpLHS, InFlag.getOperand(1)};
-
- SDValue LHS = Node->getOperand(0);
- SDValue RHS = Node->getOperand(1);
-
- EVT VT = LHS.getValueType();
-
- unsigned Sltu_op = Mips::SltuRxRyRz16;
- SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops);
- unsigned Addu_op = Mips::AdduRxRyRz16;
- SDNode *AddCarry =
- CurDAG->getMachineNode(Addu_op, DL, VT, SDValue(Carry, 0), RHS);
-
- CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
- return true;
- }
-
/// Mul with two results
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
index e11023b4d272..219f1ad33586 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -97,6 +97,17 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MIB.addReg(SrcReg, getKillRegState(KillSrc));
}
+bool Mips16InstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ if (MI.isMoveReg()) {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+ }
+ return false;
+}
+
void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
index ffdd4728c8cb..8190be6187ea 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -53,6 +53,9 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
+
void storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
index b91c94288582..b7a1b9ce41bf 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -869,7 +869,9 @@ def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>;
//Purpose: Move
// To move the contents of a GPR to a GPR.
//
-def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>;
+def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu> {
+ let isMoveReg = 1;
+}
//
// Format: MFHI rx MIPS16e
@@ -879,6 +881,7 @@ def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>;
def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> {
let Uses = [HI0];
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
//
@@ -889,6 +892,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> {
def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> {
let Uses = [LO0];
let hasSideEffects = 0;
+ let isMoveReg = 0;
}
//
@@ -1403,14 +1407,6 @@ def: Mips16Pat<(i32 addr16sp:$addr), (AddiuRxRyOffMemX16 addr16sp:$addr)>;
// Large (>16 bit) immediate loads
def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
-// Carry MipsPatterns
-def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
- (SubuRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
-def : Mips16Pat<(addc CPU16Regs:$lhs, CPU16Regs:$rhs),
- (AdduRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
-def : Mips16Pat<(addc CPU16Regs:$src, immSExt16:$imm),
- (AddiuRxRxImmX16 CPU16Regs:$src, imm:$imm)>;
-
//
// Some branch conditional patterns are not generated by llvm at this time.
// Some are for seemingly arbitrary reasons not used: i.e. with signed number
@@ -1424,7 +1420,7 @@ def : Mips16Pat<(addc CPU16Regs:$src, immSExt16:$imm),
// setcc instead and earlier I had implemented setcc first so may have masked
// the problem. The setcc variants are suboptimal for mips16 so I may want to
// figure out how to enable the brcond patterns or else possibly new
-// combinations of of brcond and setcc.
+// combinations of brcond and setcc.
//
//
// bcond-seteq
@@ -1862,11 +1858,12 @@ def : Mips16Pat<(MipsHi tglobaladdr:$in),
(SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
def : Mips16Pat<(MipsHi tjumptable:$in),
(SllX16 (LiRxImmX16 tjumptable:$in), 16)>;
-def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
- (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+def : Mips16Pat<(MipsTlsHi tglobaltlsaddr:$in),
+ (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+
// wrapper_pic
class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
Mips16Pat<(MipsWrapper RC:$gp, node:$in),
@@ -1910,3 +1907,7 @@ def CONSTPOOL_ENTRY :
MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), "foo", []>;
+// Instruction Aliases
+
+let EncodingPredicates = [InMips16Mode] in
+def : MipsInstAlias<"nop", (Move32R16 ZERO, S0)>;
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
index ff95f3c72282..751afd5ed369 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -127,8 +127,8 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
Offset = SPOffset + (int64_t)StackSize;
Offset += MI.getOperand(OpNo + 1).getImm();
-
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ LLVM_DEBUG(errs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
if (!MI.isDebugValue() &&
!Mips16InstrInfo::validImmediate(MI.getOpcode(), FrameReg, Offset)) {
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
index 516caa34fbf2..e1d08cad88b7 100644
--- a/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -30,8 +30,7 @@ class MipsR6Arch<string opstr> {
string BaseOpcode = opstr;
}
-class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl {
+class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
let DecoderNamespace = "Mips32r6_64r6";
let EncodingPredicates = [HasStdEnc];
}
@@ -576,3 +575,30 @@ class COP2LDST_FM<OPCODE5 Operation> : MipsR6Inst {
let Inst{15-11} = base;
let Inst{10-0} = offset;
}
+
+class SPECIAL3_2R_SZ_CRC<bits<2> sz, bits<3> direction> : MipsR6Inst {
+ bits<5> rs;
+ bits<5> rt;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = 0b00000;
+ let Inst{10-8} = direction;
+ let Inst{7-6} = sz;
+ let Inst{5-0} = 0b001111;
+
+ string DecoderMethod = "DecodeCRC";
+}
+
+class SPECIAL3_GINV<bits<2> ginv> : MipsR6Inst {
+ bits<5> rs;
+ bits<2> type_;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-10} = 0x0;
+ let Inst{9-8} = type_;
+ let Inst{7-6} = ginv;
+ let Inst{5-0} = 0b111101;
+}
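
For reference, the SPECIAL3_2R_SZ_CRC class above fixes the CRC field layout within the 32-bit instruction word. A plain C++ sketch of the same packing; the field positions come from the class itself, while the SPECIAL3 major-opcode value 0x1f is an assumption, not taken from this diff:

    #include <cstdint>

    static uint32_t encodeCrc(uint32_t Rs, uint32_t Rt,
                              uint32_t Sz, uint32_t Direction) {
      uint32_t Inst = 0;
      Inst |= 0x1fu << 26;            // bits 31-26: SPECIAL3 group (assumed 0x1f)
      Inst |= (Rs & 0x1f) << 21;      // bits 25-21: rs
      Inst |= (Rt & 0x1f) << 16;      // bits 20-16: rt
                                      // bits 15-11: zero
      Inst |= (Direction & 0x7) << 8; // bits 10-8: direction (crc32 vs. crc32c)
      Inst |= (Sz & 0x3) << 6;        // bits 7-6: size (byte/half/word)
      Inst |= 0x0fu;                  // bits 5-0: 0b001111, the CRC function field
      return Inst;
    }
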
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 9e9e074875d0..d86fc3f658ae 100644
--- a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -190,6 +190,16 @@ class CLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLZ>;
class SDBBP_R6_ENC : SPECIAL_SDBBP_FM;
+class CRC32B_ENC : SPECIAL3_2R_SZ_CRC<0,0>;
+class CRC32H_ENC : SPECIAL3_2R_SZ_CRC<1,0>;
+class CRC32W_ENC : SPECIAL3_2R_SZ_CRC<2,0>;
+class CRC32CB_ENC : SPECIAL3_2R_SZ_CRC<0,1>;
+class CRC32CH_ENC : SPECIAL3_2R_SZ_CRC<1,1>;
+class CRC32CW_ENC : SPECIAL3_2R_SZ_CRC<2,1>;
+
+class GINVI_ENC : SPECIAL3_GINV<0>;
+class GINVT_ENC : SPECIAL3_GINV<2>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Multiclasses
@@ -804,6 +814,38 @@ class SDBBP_R6_DESC {
InstrItinClass Itinerary = II_SDBBP;
}
+class CRC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+}
+
+class CRC32B_DESC : CRC_DESC_BASE<"crc32b", GPR32Opnd, II_CRC32B>;
+class CRC32H_DESC : CRC_DESC_BASE<"crc32h", GPR32Opnd, II_CRC32H>;
+class CRC32W_DESC : CRC_DESC_BASE<"crc32w", GPR32Opnd, II_CRC32W>;
+class CRC32CB_DESC : CRC_DESC_BASE<"crc32cb", GPR32Opnd, II_CRC32CB>;
+class CRC32CH_DESC : CRC_DESC_BASE<"crc32ch", GPR32Opnd, II_CRC32CH>;
+class CRC32CW_DESC : CRC_DESC_BASE<"crc32cw", GPR32Opnd, II_CRC32CW>;
+
+class GINV_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> : MipsR6Arch<instr_asm> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rs, uimm2:$type_);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $type_");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
+ bit hasSideEffects = 1;
+}
+
+class GINVI_DESC : GINV_DESC_BASE<"ginvi", GPR32Opnd, II_GINVI> {
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = "ginvi\t$rs";
+}
+class GINVT_DESC : GINV_DESC_BASE<"ginvt", GPR32Opnd, II_GINVT>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
@@ -846,9 +888,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
def BNEZC : R6MMR6Rel, BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
def BNVC : R6MMR6Rel, BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
def BOVC : R6MMR6Rel, BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
-}
-def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
-let AdditionalPredicates = [NotInMicroMips] in {
+ def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
}
@@ -901,8 +941,8 @@ let AdditionalPredicates = [NotInMicroMips] in {
def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
}
def NAL; // BAL with rd=0
-def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
let AdditionalPredicates = [NotInMicroMips] in {
+ def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
def SC_R6 : SC_R6_ENC, SC_R6_DESC, PTR_32, ISA_MIPS32R6;
@@ -923,6 +963,20 @@ let AdditionalPredicates = [NotInMicroMips] in {
def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CRC32B : R6MMR6Rel, CRC32B_ENC, CRC32B_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32H : R6MMR6Rel, CRC32H_ENC, CRC32H_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32W : R6MMR6Rel, CRC32W_ENC, CRC32W_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32CB : R6MMR6Rel, CRC32CB_ENC, CRC32CB_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32CH : R6MMR6Rel, CRC32CH_ENC, CRC32CH_DESC, ISA_MIPS32R6, ASE_CRC;
+ def CRC32CW : R6MMR6Rel, CRC32CW_ENC, CRC32CW_DESC, ISA_MIPS32R6, ASE_CRC;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def GINVI : R6MMR6Rel, GINVI_ENC, GINVI_DESC, ISA_MIPS32R6, ASE_GINV;
+ def GINVT : R6MMR6Rel, GINVT_ENC, GINVT_DESC, ISA_MIPS32R6, ASE_GINV;
+}
+
//===----------------------------------------------------------------------===//
//
// Instruction Aliases
diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
index 828dd4f54223..878ec29b188d 100644
--- a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -85,6 +85,17 @@ let usesCustomInserter = 1 in {
def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
}
+def ATOMIC_LOAD_ADD_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_SUB_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_AND_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_OR_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_XOR_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+def ATOMIC_LOAD_NAND_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+
+def ATOMIC_SWAP_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
+
+def ATOMIC_CMP_SWAP_I64_POSTRA : AtomicCmpSwapPostRA<GPR64>;
+
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC128 : Load<"", ACC128>;
@@ -106,16 +117,16 @@ let AdditionalPredicates = [NotInMicroMips] in {
let isCodeGenOnly = 1 in {
def SLTi64 : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
- SLTI_FM<0xa>;
+ SLTI_FM<0xa>, GPR_64;
def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, GPR64Opnd>,
- SLTI_FM<0xb>;
+ SLTI_FM<0xb>, GPR_64;
def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, II_AND, immZExt16, and>,
- ADDI_FM<0xc>;
+ ADDI_FM<0xc>, GPR_64;
def ORi64 : ArithLogicI<"ori", uimm16_64, GPR64Opnd, II_OR, immZExt16, or>,
- ADDI_FM<0xd>;
+ ADDI_FM<0xd>, GPR_64;
def XORi64 : ArithLogicI<"xori", uimm16_64, GPR64Opnd, II_XOR, immZExt16, xor>,
- ADDI_FM<0xe>;
-def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM;
+ ADDI_FM<0xe>, GPR_64;
+def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM, GPR_64;
}
/// Arithmetic Instructions (3-Operand, R-Type)
@@ -131,12 +142,15 @@ let AdditionalPredicates = [NotInMicroMips] in {
}
let isCodeGenOnly = 1 in {
-def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
-def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>;
-def AND64 : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>;
-def OR64 : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>;
-def XOR64 : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>;
-def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
+def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>, GPR_64;
+def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>, GPR_64;
+def AND64 : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>,
+ GPR_64;
+def OR64 : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>,
+ GPR_64;
+def XOR64 : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>,
+ GPR_64;
+def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>, GPR_64;
}
/// Shift Instructions
@@ -176,22 +190,24 @@ let AdditionalPredicates = [NotInMicroMips] in {
/// Load and Store Instructions
/// aligned
let isCodeGenOnly = 1 in {
-def LB64 : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>;
-def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>;
-def LH64 : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>;
-def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>;
-def LW64 : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>;
-def SB64 : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>;
-def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
-def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
+def LB64 : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>, GPR_64;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>, GPR_64;
+def LH64 : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>, GPR_64;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>, GPR_64;
+def LW64 : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>, GPR_64;
+def SB64 : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>, GPR_64;
+def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>,
+ GPR_64;
+def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>,
+ GPR_64;
}
let AdditionalPredicates = [NotInMicroMips] in {
def LWu : MMRel, Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>,
LW_FM<0x27>, ISA_MIPS3;
- def LD : LoadMemory<"ld", GPR64Opnd, mem_simm16, load, II_LD>,
+ def LD : LoadMemory<"ld", GPR64Opnd, mem_simmptr, load, II_LD>,
LW_FM<0x37>, ISA_MIPS3;
- def SD : StoreMemory<"sd", GPR64Opnd, mem_simm16, store, II_SD>,
+ def SD : StoreMemory<"sd", GPR64Opnd, mem_simmptr, store, II_SD>,
LW_FM<0x3f>, ISA_MIPS3;
}
@@ -199,10 +215,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
/// load/store left/right
let isCodeGenOnly = 1 in {
-def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>;
-def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>;
-def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>;
-def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>;
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>,
+ GPR_64;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>,
+ GPR_64;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>,
+ GPR_64;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>,
+ GPR_64;
}
def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>,
@@ -216,7 +236,7 @@ def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
/// Load-linked, Store-conditional
let AdditionalPredicates = [NotInMicroMips] in {
- def LLD : LLBase<"lld", GPR64Opnd, mem_simm16>, LW_FM<0x34>,
+ def LLD : LLBase<"lld", GPR64Opnd, mem_simmptr>, LW_FM<0x34>,
ISA_MIPS3_NOT_32R6_64R6;
}
def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
@@ -234,12 +254,18 @@ def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
/// Jump and Branch Instructions
let isCodeGenOnly = 1 in {
- def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
- def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
- def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
- def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
- def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
- def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+ def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>,
+ GPR_64;
+ def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>,
+ GPR_64;
+ def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>,
+ GPR_64;
+ def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>,
+ GPR_64;
+ def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>,
+ GPR_64;
+ def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>,
+ GPR_64;
let AdditionalPredicates = [NoIndirectJumpGuards] in
def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
}
@@ -323,12 +349,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
ISA_MIPS64R2;
def DSHD : SubwordSwap<"dshd", GPR64Opnd, II_DSHD>, SEB_FM<5, 0x24>,
ISA_MIPS64R2;
-}
-def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
+ def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>,
+ GPR_64;
+}
let isCodeGenOnly = 1 in
-def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM, GPR_64;
let AdditionalPredicates = [NotInMicroMips] in {
// The 'pos + size' constraints for code generation are enforced by the
@@ -376,11 +403,13 @@ let isCodeGenOnly = 1, AdditionalPredicates = [NotInMicroMips] in {
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
- "dsll\t$rd, $rt, 32", [], II_DSLL>;
- def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
- "sll\t$rd, $rt, 0", [], II_SLL>;
- def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
- "sll\t$rd, $rt, 0", [], II_SLL>;
+ "dsll\t$rd, $rt, 32", [], II_DSLL>, GPR_64;
+ let isMoveReg = 1 in {
+ def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
+ "sll\t$rd, $rt, 0", [], II_SLL>, GPR_64;
+ def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
+ "sll\t$rd, $rt, 0", [], II_SLL>, GPR_64;
+ }
}
// We need the following pseudo instruction to avoid offset calculation for
@@ -391,7 +420,7 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
// where %PART may be %hi or %lo, depending on the relocation kind
// that $tgt is annotated with.
def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
- (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+ (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>, GPR_64;
// Cavium Octeon cnMIPS instructions
let DecoderNamespace = "CnMips",
@@ -545,16 +574,23 @@ def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd, II_DMTC2>, MFC2OP_FM<0x12, 5>,
/// Move between CPU and coprocessor registers
let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
-def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>, MFC3OP_FM<0x10, 1>,
- ISA_MIPS3;
-def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>, MFC3OP_FM<0x10, 5>,
- ISA_MIPS3;
-def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>, MFC3OP_FM<0x12, 1>,
- ISA_MIPS3;
-def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>, MFC3OP_FM<0x12, 5>,
- ISA_MIPS3;
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>,
+ MFC3OP_FM<0x10, 1, 0>, ISA_MIPS3;
+def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>,
+ MFC3OP_FM<0x10, 5, 0>, ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>,
+ MFC3OP_FM<0x12, 1, 0>, ISA_MIPS3;
+def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>,
+ MFC3OP_FM<0x12, 5, 0>, ISA_MIPS3;
}
+/// Move between CPU and guest coprocessor registers (Virtualization ASE)
+let DecoderNamespace = "Mips64" in {
+ def DMFGC0 : MFC3OP<"dmfgc0", GPR64Opnd, COP0Opnd, II_DMFGC0>,
+ MFC3OP_FM<0x10, 3, 1>, ISA_MIPS64R5, ASE_VIRT;
+ def DMTGC0 : MTC3OP<"dmtgc0", COP0Opnd, GPR64Opnd, II_DMTGC0>,
+ MFC3OP_FM<0x10, 3, 3>, ISA_MIPS64R5, ASE_VIRT;
+}
let AdditionalPredicates = [UseIndirectJumpsHazard] in
def JALRHB64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR_HB64, RA_64>;
@@ -564,124 +600,130 @@ let AdditionalPredicates = [UseIndirectJumpsHazard] in
//===----------------------------------------------------------------------===//
// Materialize i64 constants.
-defm : MaterializeImms<i64, ZERO_64, DADDiu, LUi64, ORi64>;
+defm : MaterializeImms<i64, ZERO_64, DADDiu, LUi64, ORi64>, ISA_MIPS3, GPR_64;
def : MipsPat<(i64 immZExt32Low16Zero:$imm),
- (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16)>;
+ (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16)>, ISA_MIPS3, GPR_64;
def : MipsPat<(i64 immZExt32:$imm),
(ORi64 (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16),
- (LO16 imm:$imm))>;
+ (LO16 imm:$imm))>, ISA_MIPS3, GPR_64;
// extended loads
-def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
-def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
-def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
-def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
+def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>, ISA_MIPS3,
+ GPR_64;
// hi/lo relocs
let AdditionalPredicates = [NotInMicroMips] in
-defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, SYM_32;
+defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, ISA_MIPS3, GPR_64,
+ SYM_32;
+
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>, ISA_MIPS3,
+ GPR_64;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi64 texternalsym:$in)>,
+ ISA_MIPS3, GPR_64;
-def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
-def : MipsPat<(MipsGotHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>,
+ ISA_MIPS3, GPR_64;
// highest/higher/hi/lo relocs
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(MipsJmpLink (i64 texternalsym:$dst)),
- (JAL texternalsym:$dst)>, SYM_64;
+ (JAL texternalsym:$dst)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)),
- (LUi64 tglobaladdr:$in)>, SYM_64;
+ (LUi64 tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tblockaddress:$in)),
- (LUi64 tblockaddress:$in)>, SYM_64;
+ (LUi64 tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tjumptable:$in)),
- (LUi64 tjumptable:$in)>, SYM_64;
+ (LUi64 tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tconstpool:$in)),
- (LUi64 tconstpool:$in)>, SYM_64;
- def : MipsPat<(MipsHighest (i64 tglobaltlsaddr:$in)),
- (LUi64 tglobaltlsaddr:$in)>, SYM_64;
+ (LUi64 tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 texternalsym:$in)),
- (LUi64 texternalsym:$in)>, SYM_64;
+ (LUi64 texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)),
- (DADDiu ZERO_64, tglobaladdr:$in)>, SYM_64;
+ (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tblockaddress:$in)),
- (DADDiu ZERO_64, tblockaddress:$in)>, SYM_64;
+ (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tjumptable:$in)),
- (DADDiu ZERO_64, tjumptable:$in)>, SYM_64;
+ (DADDiu ZERO_64, tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 tconstpool:$in)),
- (DADDiu ZERO_64, tconstpool:$in)>, SYM_64;
- def : MipsPat<(MipsHigher (i64 tglobaltlsaddr:$in)),
- (DADDiu ZERO_64, tglobaltlsaddr:$in)>, SYM_64;
+ (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHigher (i64 texternalsym:$in)),
- (DADDiu ZERO_64, texternalsym:$in)>, SYM_64;
+ (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tblockaddress:$lo))),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tjumptable:$lo))),
- (DADDiu GPR64:$hi, tjumptable:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tconstpool:$lo))),
- (DADDiu GPR64:$hi, tconstpool:$lo)>, SYM_64;
- def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaltlsaddr:$lo))),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tblockaddress:$lo))),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tjumptable:$lo))),
- (DADDiu GPR64:$hi, tjumptable:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tconstpool:$lo))),
- (DADDiu GPR64:$hi, tconstpool:$lo)>, SYM_64;
- def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaltlsaddr:$lo))),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tblockaddress:$lo))),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tjumptable:$lo))),
- (DADDiu GPR64:$hi, tjumptable:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tconstpool:$lo))),
- (DADDiu GPR64:$hi, tconstpool:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaltlsaddr:$lo))),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, SYM_64;
+ (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MIPS3, GPR_64,
+ SYM_64;
}
// gp_rel relocs
def : MipsPat<(add GPR64:$gp, (MipsGPRel tglobaladdr:$in)),
- (DADDiu GPR64:$gp, tglobaladdr:$in)>, ABI_N64;
+ (DADDiu GPR64:$gp, tglobaladdr:$in)>, ISA_MIPS3, ABI_N64;
def : MipsPat<(add GPR64:$gp, (MipsGPRel tconstpool:$in)),
- (DADDiu GPR64:$gp, tconstpool:$in)>, ABI_N64;
+ (DADDiu GPR64:$gp, tconstpool:$in)>, ISA_MIPS3, ABI_N64;
-def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
-def : WrapperPat<tconstpool, DADDiu, GPR64>;
-def : WrapperPat<texternalsym, DADDiu, GPR64>;
-def : WrapperPat<tblockaddress, DADDiu, GPR64>;
-def : WrapperPat<tjumptable, DADDiu, GPR64>;
-def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+def : WrapperPat<tglobaladdr, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tconstpool, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<texternalsym, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tblockaddress, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tjumptable, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
+def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>, ISA_MIPS3, GPR_64;
defm : BrcondPats<GPR64, BEQ64, BEQ, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
- ZERO_64>;
+ ZERO_64>, ISA_MIPS3, GPR_64;
def : MipsPat<(brcond (i32 (setlt i64:$lhs, 1)), bb:$dst),
- (BLEZ64 i64:$lhs, bb:$dst)>;
+ (BLEZ64 i64:$lhs, bb:$dst)>, ISA_MIPS3, GPR_64;
def : MipsPat<(brcond (i32 (setgt i64:$lhs, -1)), bb:$dst),
- (BGEZ64 i64:$lhs, bb:$dst)>;
+ (BGEZ64 i64:$lhs, bb:$dst)>, ISA_MIPS3, GPR_64;
// setcc patterns
let AdditionalPredicates = [NotInMicroMips] in {
- defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>;
- defm : SetlePats<GPR64, XORi, SLT64, SLTu64>;
- defm : SetgtPats<GPR64, SLT64, SLTu64>;
- defm : SetgePats<GPR64, XORi, SLT64, SLTu64>;
- defm : SetgeImmPats<GPR64, XORi, SLTi64, SLTiu64>;
+ defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>, ISA_MIPS3, GPR_64;
+ defm : SetlePats<GPR64, XORi, SLT64, SLTu64>, ISA_MIPS3, GPR_64;
+ defm : SetgtPats<GPR64, SLT64, SLTu64>, ISA_MIPS3, GPR_64;
+ defm : SetgePats<GPR64, XORi, SLT64, SLTu64>, ISA_MIPS3, GPR_64;
+ defm : SetgeImmPats<GPR64, XORi, SLTi64, SLTiu64>, ISA_MIPS3, GPR_64;
}
// truncate
def : MipsPat<(trunc (assertsext GPR64:$src)),
- (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+ (EXTRACT_SUBREG GPR64:$src, sub_32)>, ISA_MIPS3, GPR_64;
// The forward compatibility strategy employed by MIPS requires us to treat
// values as being sign extended to an infinite number of bits. This allows
// existing software to run without modification on any future MIPS
@@ -693,80 +735,134 @@ def : MipsPat<(trunc (assertsext GPR64:$src)),
// such as (trunc:i32 (assertzext:i64 X, i32)), because the sign-bit of the
// lower subreg would not be replicated into the upper half.
def : MipsPat<(trunc (assertzext_lt_i32 GPR64:$src)),
- (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+ (EXTRACT_SUBREG GPR64:$src, sub_32)>, ISA_MIPS3, GPR_64;
def : MipsPat<(i32 (trunc GPR64:$src)),
- (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
+ (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>, ISA_MIPS3, GPR_64;
// variable shift instructions patterns
def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+ (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
def : MipsPat<(srl GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DSRLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+ (DSRLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
def : MipsPat<(sra GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DSRAV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
-}
+ (DSRAV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MIPS3, GPR_64;
// 32-to-64-bit extension
def : MipsPat<(i64 (anyext GPR32:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
-def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
-def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>, ISA_MIPS3,
+ GPR_64;
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(i64 (zext GPR32:$src)), (DEXT64_32 GPR32:$src, 0, 32)>,
- ISA_MIPS64R2;
+ ISA_MIPS64R2, GPR_64;
def : MipsPat<(i64 (zext (i32 (shl GPR32:$rt, immZExt5:$imm)))),
(CINS64_32 GPR32:$rt, imm:$imm, (immZExt5To31 imm:$imm))>,
- ASE_MIPS64_CNMIPS;
+ ISA_MIPS64R2, GPR_64, ASE_MIPS64_CNMIPS;
}
// Sign extend in register
def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
- (SLL64_64 GPR64:$src)>;
+ (SLL64_64 GPR64:$src)>, ISA_MIPS3, GPR_64;
// bswap MipsPattern
-def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
+def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>, ISA_MIPS64R2;
// Carry pattern
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
- (DSUBu GPR64:$lhs, GPR64:$rhs)>;
+ (DSUBu GPR64:$lhs, GPR64:$rhs)>, ISA_MIPS3, GPR_64;
def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
- (DADDu GPR64:$lhs, GPR64:$rhs)>, ASE_NOT_DSP;
+ (DADDu GPR64:$lhs, GPR64:$rhs)>, ISA_MIPS3, ASE_NOT_DSP, GPR_64;
def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
- (DADDiu GPR64:$lhs, imm:$imm)>, ASE_NOT_DSP;
+ (DADDiu GPR64:$lhs, imm:$imm)>, ISA_MIPS3, ASE_NOT_DSP, GPR_64;
}
// Octeon bbit0/bbit1 MipsPattern
def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
- (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
- (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
- (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
- (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>,
+ ISA_MIPS64R2, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (seteq (and i32:$lhs, PowerOf2LO_i32:$mask), 0)), bb:$dst),
(BBIT0 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), i32:$lhs, sub_32),
- (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ISA_MIPS64R2,
+ ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i32:$lhs, PowerOf2LO_i32:$mask), 0)), bb:$dst),
(BBIT1 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), i32:$lhs, sub_32),
- (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+ (Log2LO PowerOf2LO_i32:$mask), bb:$dst)>, ISA_MIPS64R2,
+ ASE_MIPS64_CNMIPS;
// Atomic load patterns.
-def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>;
-def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>;
-def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>;
-def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>;
+def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>, ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>, ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>, ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>, ISA_MIPS3, GPR_64;
// Atomic store patterns.
-def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>;
-def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>;
-def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>;
-def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>,
+ ISA_MIPS3, GPR_64;
+
+// Patterns used for matching away redundant sign extensions.
+// MIPS32 arithmetic instructions sign extend their result implicitly.
+def : MipsPat<(i64 (sext (i32 (add GPR32:$src, immSExt16:$imm16)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (ADDiu GPR32:$src, immSExt16:$imm16), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (add GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (ADDu GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (sub GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SUBu GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (mul GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MUL GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (MipsMFHI ACC64:$src)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (PseudoMFHI ACC64:$src), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (MipsMFLO ACC64:$src)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (PseudoMFLO ACC64:$src), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (shl GPR32:$src, immZExt5:$imm5)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SLL GPR32:$src, immZExt5:$imm5), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (shl GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SLLV GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (srl GPR32:$src, immZExt5:$imm5)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRL GPR32:$src, immZExt5:$imm5), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (srl GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRLV GPR32:$src, GPR32:$src2), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (sra GPR32:$src, immZExt5:$imm5)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRA GPR32:$src, immZExt5:$imm5), sub_32)>;
+def : MipsPat<(i64 (sext (i32 (sra GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (SRAV GPR32:$src, GPR32:$src2), sub_32)>;
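The patterns above rely on the MIPS64 convention that 32-bit arithmetic instructions leave the sign-extended 32-bit result in the full 64-bit register, which is why an explicit (sext ...) node adds nothing. A small numeric illustration in plain C++ of the value relationship being assumed (illustrative only, not target code):

#include <cassert>
#include <cstdint>

int main() {
  int32_t a = -5, b = 2;
  int32_t sum32 = a + b;                      // 32-bit add, result is -3
  int64_t sext = static_cast<int64_t>(sum32); // the value the IR's sext asks for
  // On MIPS64, ADDu already leaves 0xfffffffffffffffd (-3 sign-extended)
  // in the 64-bit destination register, so the patterns above fold the
  // sext away instead of emitting a separate extension instruction.
  assert(sext == INT64_C(-3));
  return 0;
}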
//===----------------------------------------------------------------------===//
// Instruction aliases
@@ -792,13 +888,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
0>, ISA_MIPS3;
defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi64, GPR64Opnd, imm64>,
- GPR_64;
+ ISA_MIPS3, GPR_64;
defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi64, GPR64Opnd, imm64>,
- GPR_64;
+ ISA_MIPS3, GPR_64;
defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>,
- GPR_64;
+ ISA_MIPS3, GPR_64;
}
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"dneg $rt, $rs",
@@ -873,6 +969,12 @@ let AdditionalPredicates = [NotInMicroMips] in {
(DMTC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
def : MipsInstAlias<"dmfc0 $rt, $rd",
(DMFC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>;
+ def : MipsInstAlias<"dmfgc0 $rt, $rd",
+ (DMFGC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS64R5, ASE_VIRT;
+ def : MipsInstAlias<"dmtgc0 $rt, $rd",
+ (DMTGC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>,
+ ISA_MIPS64R5, ASE_VIRT;
}
def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, COP2Opnd:$rd, 0), 0>;
def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 COP2Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
@@ -990,6 +1092,38 @@ let AdditionalPredicates = [NotInMicroMips] in {
GPR64Opnd:$rd,
imm64:$imm), 0>,
ISA_MIPS3_NOT_32R6_64R6;
+ def DSRemMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "drem\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DSRemIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, simm32_relaxed:$imm),
+ "drem\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DURemMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "dremu\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DURemIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, simm32_relaxed:$imm),
+ "dremu\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"drem $rt, $rs", (DSRemMacro GPR64Opnd:$rt,
+ GPR64Opnd:$rt,
+ GPR64Opnd:$rs), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"drem $rd, $imm", (DSRemIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"dremu $rt, $rs", (DURemMacro GPR64Opnd:$rt,
+ GPR64Opnd:$rt,
+ GPR64Opnd:$rs), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"dremu $rd, $imm", (DURemIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
}
def NORImm64 : NORIMM_DESC_BASE<GPR64Opnd, imm64>, GPR_64;
diff --git a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
index da743fbdee45..9df802cc30b9 100644
--- a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -39,6 +39,8 @@ class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>;
class LDPC_ENC : PCREL18_FM<OPCODE3_LDPC>;
class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>;
class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
+class CRC32D_ENC : SPECIAL3_2R_SZ_CRC<3,0>;
+class CRC32CD_ENC : SPECIAL3_2R_SZ_CRC<3,1>;
//===----------------------------------------------------------------------===//
//
@@ -71,7 +73,7 @@ class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU, mulhu>;
class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>;
class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>;
-class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simm16, II_LLD>;
+class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simmptr, II_LLD>;
class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>;
class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
@@ -114,6 +116,10 @@ class JR_HB64_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR64Opnd> {
bit isCTI = 1;
InstrItinClass Itinerary = II_JR_HB;
}
+
+class CRC32D_DESC : CRC_DESC_BASE<"crc32d", GPR32Opnd, II_CRC32D>;
+class CRC32CD_DESC : CRC_DESC_BASE<"crc32cd", GPR32Opnd, II_CRC32CD>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
@@ -174,6 +180,10 @@ let DecoderNamespace = "Mips32r6_64r6_BranchZero" in {
def BLTZC64 : BLTZC_ENC, BLTZC64_DESC, ISA_MIPS64R6, GPR_64;
def BGEZC64 : BGEZC_ENC, BGEZC64_DESC, ISA_MIPS64R6, GPR_64;
}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CRC32D : R6MMR6Rel, CRC32D_ENC, CRC32D_DESC, ISA_MIPS64R6, ASE_CRC;
+ def CRC32CD : R6MMR6Rel, CRC32CD_ENC, CRC32CD_DESC, ISA_MIPS64R6, ASE_CRC;
+}
//===----------------------------------------------------------------------===//
//
@@ -289,6 +299,21 @@ def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f),
(SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>,
ISA_MIPS64R6;
+// Patterns used for matching away redundant sign extensions.
+// MIPS32 arithmetic instructions sign extend their result implicitly.
+def : MipsPat<(i64 (sext (i32 (sdiv GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (DIV GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+def : MipsPat<(i64 (sext (i32 (udiv GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (DIVU GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+def : MipsPat<(i64 (sext (i32 (srem GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOD GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+def : MipsPat<(i64 (sext (i32 (urem GPR32:$src, GPR32:$src2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MODU GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6;
+
// Pseudo instructions
let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index f9de78dc281f..8ffc0731abcb 100644
--- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -160,6 +160,8 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
PrintDebugValueComment(MI, OS);
return;
}
+ if (MI->isDebugLabel())
+ return;
// If we just ended a constant pool, mark it as such.
if (InConstantPool && Opc != Mips::CONSTPOOL_ENTRY) {
@@ -499,6 +501,13 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return true;
O << MO.getImm() - 1;
return false;
+ case 'y': // exact log2
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ if (!isPowerOf2_64(MO.getImm()))
+ return true;
+ O << Log2_64(MO.getImm());
+ return false;
case 'z':
// $0 if zero, regular printing otherwise
if (MO.getType() == MachineOperand::MO_Immediate && MO.getImm() == 0) {
@@ -576,17 +585,27 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
assert(OffsetMO.isImm() && "Unexpected offset for inline asm memory operand.");
int Offset = OffsetMO.getImm();
- // Currently we are expecting either no ExtraCode or 'D'
+ // Currently we are expecting either no ExtraCode or 'D', 'M', or 'L'.
if (ExtraCode) {
- if (ExtraCode[0] == 'D')
+ switch (ExtraCode[0]) {
+ case 'D':
Offset += 4;
- else
+ break;
+ case 'M':
+ if (Subtarget->isLittle())
+ Offset += 4;
+ break;
+ case 'L':
+ if (!Subtarget->isLittle())
+ Offset += 4;
+ break;
+ default:
return true; // Unknown modifier.
- // FIXME: M = high order bits
- // FIXME: L = low order bits
+ }
}
- O << Offset << "($" << MipsInstPrinter::getRegisterName(BaseMO.getReg()) << ")";
+ O << Offset << "($" << MipsInstPrinter::getRegisterName(BaseMO.getReg())
+ << ")";
return false;
}
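The 'M' and 'L' cases added above select the high- and low-order word of a doubleword memory operand by adjusting the printed offset for endianness. A hedged sketch of how such modifiers are typically exercised from GNU-style inline assembly on a 32-bit MIPS configuration; the function, variable names, and surrounding C++ are assumptions for the example, not part of this change:

unsigned long long load_words(unsigned long long v) {
  unsigned hi, lo;
  // %M2 prints the address of the high-order 32-bit word of 'v' and
  // %L2 the address of the low-order word, on either endianness.
  asm("lw %0, %M2\n\t"
      "lw %1, %L2"
      : "=r"(hi), "=r"(lo)
      : "m"(v));
  return ((unsigned long long)hi << 32) | lo;
}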
diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
index e6ecbe9b5f66..af936e6fc96b 100644
--- a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -1,4 +1,4 @@
-//===- MipsLongBranch.cpp - Emit long branches ----------------------------===//
+//===----------------------- MipsBranchExpansion.cpp ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,11 +6,70 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This pass expands a branch or jump instruction into a long branch if its
-// offset is too large to fit into its immediate field.
-//
-// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
+/// \file
+///
+/// This pass does two things:
+/// - it expands a branch or jump instruction into a long branch if its offset
+/// is too large to fit into its immediate field,
+/// - it inserts nops to prevent forbidden slot hazards.
+///
+/// The reason this pass combines these two tasks is that either one of them
+/// can invalidate the result of the other.
+///
+/// An example is a situation where, at first, no branch needs to be expanded,
+/// but after adding at least one nop somewhere in the code to prevent a
+/// forbidden slot hazard, the offsets of some branches may go out of range. In
+/// that case it is necessary to check again whether there is some branch that
+/// needs expansion. On the other hand, expanding some branch may cause a
+/// control transfer instruction to appear in the forbidden slot, which is a
+/// hazard that should be fixed. This pass alternates between these two tasks
+/// until no changes are made. Only then can we be sure that all branches are
+/// expanded properly and no hazard situations exist.
+///
+/// Regarding branch expanding:
+///
+/// When a branch instruction like beqzc or bnezc has an offset that is too
+/// large to fit into its immediate field, it has to be expanded into another
+/// instruction or a series of instructions.
+///
+/// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
+/// TODO: Handle out of range bc, b (pseudo) instructions.
+///
+/// Regarding compact branch hazard prevention:
+///
+/// Hazards handled: forbidden slots for MIPSR6.
+///
+/// A forbidden slot hazard occurs when a compact branch instruction is executed
+/// and the adjacent instruction in memory is a control transfer instruction
+/// such as a branch or jump, ERET, ERETNC, DERET, WAIT or PAUSE.
+///
+/// For example:
+///
+/// 0x8004 bnec a1,v0,<P+0x18>
+/// 0x8008 beqc a1,a2,<P+0x54>
+///
+/// In such cases, the processor is required to signal a Reserved Instruction
+/// exception.
+///
+/// Here, if the instruction at 0x8004 is executed, the processor will raise an
+/// exception as there is a control transfer instruction at 0x8008.
+///
+/// There are two sources of forbidden slot hazards:
+///
+/// A) A previous pass has created a compact branch directly.
+/// B) Transforming a delay slot branch into a compact branch. This case can be
+/// difficult to process because lookahead for hazards is insufficient:
+/// backwards delay slot filling can also produce hazards in previously
+/// processed instructions.
+///
+/// In the future this pass can be extended (or a new pass can be created) to
+/// handle other pipeline hazards, such as various MIPS1 hazards, processor
+/// errata that require instruction reorganization, etc.
+///
+/// This pass has to run after the delay slot filler, as that pass can
+/// introduce pipeline hazards such as the compact branch hazard; hence the
+/// existing hazard recognizer is not suitable.
+///
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/MipsABIInfo.h"
@@ -30,6 +89,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
@@ -37,76 +97,126 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
+#include <utility>
using namespace llvm;
-#define DEBUG_TYPE "mips-long-branch"
+#define DEBUG_TYPE "mips-branch-expansion"
+STATISTIC(NumInsertedNops, "Number of nops inserted");
STATISTIC(LongBranches, "Number of long branches.");
-static cl::opt<bool> SkipLongBranch(
- "skip-mips-long-branch",
- cl::init(false),
- cl::desc("MIPS: Skip long branch pass."),
- cl::Hidden);
+static cl::opt<bool>
+ SkipLongBranch("skip-mips-long-branch", cl::init(false),
+ cl::desc("MIPS: Skip branch expansion pass."), cl::Hidden);
-static cl::opt<bool> ForceLongBranch(
- "force-mips-long-branch",
- cl::init(false),
- cl::desc("MIPS: Expand all branches to long format."),
- cl::Hidden);
+static cl::opt<bool>
+ ForceLongBranch("force-mips-long-branch", cl::init(false),
+ cl::desc("MIPS: Expand all branches to long format."),
+ cl::Hidden);
namespace {
- using Iter = MachineBasicBlock::iterator;
- using ReverseIter = MachineBasicBlock::reverse_iterator;
+using Iter = MachineBasicBlock::iterator;
+using ReverseIter = MachineBasicBlock::reverse_iterator;
- struct MBBInfo {
- uint64_t Size = 0;
- uint64_t Address;
- bool HasLongBranch = false;
- MachineInstr *Br = nullptr;
+struct MBBInfo {
+ uint64_t Size = 0;
+ bool HasLongBranch = false;
+ MachineInstr *Br = nullptr;
+ MBBInfo() = default;
+};
- MBBInfo() = default;
- };
+class MipsBranchExpansion : public MachineFunctionPass {
+public:
+ static char ID;
- class MipsLongBranch : public MachineFunctionPass {
- public:
- static char ID;
+ MipsBranchExpansion() : MachineFunctionPass(ID), ABI(MipsABIInfo::Unknown()) {
+ initializeMipsBranchExpansionPass(*PassRegistry::getPassRegistry());
+ }
- MipsLongBranch()
- : MachineFunctionPass(ID), ABI(MipsABIInfo::Unknown()) {}
+ StringRef getPassName() const override {
+ return "Mips Branch Expansion Pass";
+ }
- StringRef getPassName() const override { return "Mips Long Branch"; }
+ bool runOnMachineFunction(MachineFunction &F) override;
- bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
+private:
+ void splitMBB(MachineBasicBlock *MBB);
+ void initMBBInfo();
+ int64_t computeOffset(const MachineInstr *Br);
+ void replaceBranch(MachineBasicBlock &MBB, Iter Br, const DebugLoc &DL,
+ MachineBasicBlock *MBBOpnd);
+ void expandToLongBranch(MBBInfo &Info);
+ bool handleForbiddenSlot();
+ bool handlePossibleLongBranch();
+
+ const MipsSubtarget *STI;
+ const MipsInstrInfo *TII;
+
+ MachineFunction *MFp;
+ SmallVector<MBBInfo, 16> MBBInfos;
+ bool IsPIC;
+ MipsABIInfo ABI;
+ unsigned LongBranchSeqSize;
+ bool ForceLongBranchFirstPass = false;
+};
+
+} // end of anonymous namespace
+
+char MipsBranchExpansion::ID = 0;
+
+INITIALIZE_PASS(MipsBranchExpansion, DEBUG_TYPE,
+ "Expand out of range branch instructions and prevent forbidden"
+ " slot hazards",
+ false, false)
+
+/// Returns a pass that expands out-of-range branches and clears pipeline
+/// hazards.
+FunctionPass *llvm::createMipsBranchExpansion() {
+ return new MipsBranchExpansion();
+}
- private:
- void splitMBB(MachineBasicBlock *MBB);
- void initMBBInfo();
- int64_t computeOffset(const MachineInstr *Br);
- void replaceBranch(MachineBasicBlock &MBB, Iter Br, const DebugLoc &DL,
- MachineBasicBlock *MBBOpnd);
- void expandToLongBranch(MBBInfo &Info);
+// Find the next real instruction from the current position in the current
+// basic block.
+static Iter getNextMachineInstrInBB(Iter Position) {
+ Iter I = Position, E = Position->getParent()->end();
+ I = std::find_if_not(I, E,
+ [](const Iter &Insn) { return Insn->isTransient(); });
- MachineFunction *MF;
- SmallVector<MBBInfo, 16> MBBInfos;
- bool IsPIC;
- MipsABIInfo ABI;
- unsigned LongBranchSeqSize;
- };
+ return I;
+}
-} // end anonymous namespace
+// Find the next real instruction from the current position, looking through
+// basic block boundaries.
+static std::pair<Iter, bool> getNextMachineInstr(Iter Position,
+ MachineBasicBlock *Parent) {
+ if (Position == Parent->end()) {
+ do {
+ MachineBasicBlock *Succ = Parent->getNextNode();
+ if (Succ != nullptr && Parent->isSuccessor(Succ)) {
+ Position = Succ->begin();
+ Parent = Succ;
+ } else {
+ return std::make_pair(Position, true);
+ }
+ } while (Parent->empty());
+ }
-char MipsLongBranch::ID = 0;
+ Iter Instr = getNextMachineInstrInBB(Position);
+ if (Instr == Parent->end()) {
+ return getNextMachineInstr(Instr, Parent);
+ }
+ return std::make_pair(Instr, false);
+}
/// Iterate over list of Br's operands and search for a MachineBasicBlock
/// operand.
@@ -125,14 +235,14 @@ static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
// found or it reaches E.
static ReverseIter getNonDebugInstr(ReverseIter B, const ReverseIter &E) {
for (; B != E; ++B)
- if (!B->isDebugValue())
+ if (!B->isDebugInstr())
return B;
return E;
}
// Split MBB if it has two direct jumps/branches.
-void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
+void MipsBranchExpansion::splitMBB(MachineBasicBlock *MBB) {
ReverseIter End = MBB->rend();
ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End);
@@ -153,7 +263,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
// Create a new MBB. Move instructions in MBB to the newly created MBB.
MachineBasicBlock *NewMBB =
- MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+ MFp->CreateMachineBasicBlock(MBB->getBasicBlock());
// Insert NewMBB and fix control flow.
MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
@@ -161,26 +271,24 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
NewMBB->removeSuccessor(Tgt, true);
MBB->addSuccessor(NewMBB);
MBB->addSuccessor(Tgt);
- MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+ MFp->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
NewMBB->splice(NewMBB->end(), MBB, LastBr.getReverse(), MBB->end());
}
// Fill MBBInfos.
-void MipsLongBranch::initMBBInfo() {
+void MipsBranchExpansion::initMBBInfo() {
// Split the MBBs if they have two branches. Each basic block should have at
// most one branch after this loop is executed.
- for (auto &MBB : *MF)
+ for (auto &MBB : *MFp)
splitMBB(&MBB);
- MF->RenumberBlocks();
+ MFp->RenumberBlocks();
MBBInfos.clear();
- MBBInfos.resize(MF->size());
+ MBBInfos.resize(MFp->size());
- const MipsInstrInfo *TII =
- static_cast<const MipsInstrInfo *>(MF->getSubtarget().getInstrInfo());
for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
- MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+ MachineBasicBlock *MBB = MFp->getBlockNumbered(I);
// Compute size of MBB.
for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
@@ -198,7 +306,7 @@ void MipsLongBranch::initMBBInfo() {
}
// Compute offset of branch in number of bytes.
-int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
+int64_t MipsBranchExpansion::computeOffset(const MachineInstr *Br) {
int64_t Offset = 0;
int ThisMBB = Br->getParent()->getNumber();
int TargetMBB = getTargetMBB(*Br)->getNumber();
@@ -220,11 +328,9 @@ int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
// Replace Br with a branch which has the opposite condition code and a
// MachineBasicBlock operand MBBOpnd.
-void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
- const DebugLoc &DL,
- MachineBasicBlock *MBBOpnd) {
- const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
- MBB.getParent()->getSubtarget().getInstrInfo());
+void MipsBranchExpansion::replaceBranch(MachineBasicBlock &MBB, Iter Br,
+ const DebugLoc &DL,
+ MachineBasicBlock *MBBOpnd) {
unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
const MCInstrDesc &NewDesc = TII->get(NewOpc);
@@ -258,24 +364,20 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
// currently assumes that all branches have 16-bit offsets, and will produce
// wrong code if branches whose allowed offsets are [-128, -126, ..., 126]
// are present.
-void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
+void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) {
MachineBasicBlock::iterator Pos;
MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
DebugLoc DL = I.Br->getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
- MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
- const MipsSubtarget &Subtarget =
- static_cast<const MipsSubtarget &>(MF->getSubtarget());
- const MipsInstrInfo *TII =
- static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
+ MachineBasicBlock *LongBrMBB = MFp->CreateMachineBasicBlock(BB);
- MF->insert(FallThroughMBB, LongBrMBB);
+ MFp->insert(FallThroughMBB, LongBrMBB);
MBB->replaceSuccessor(TgtMBB, LongBrMBB);
if (IsPIC) {
- MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
- MF->insert(FallThroughMBB, BalTgtMBB);
+ MachineBasicBlock *BalTgtMBB = MFp->CreateMachineBasicBlock(BB);
+ MFp->insert(FallThroughMBB, BalTgtMBB);
LongBrMBB->addSuccessor(BalTgtMBB);
BalTgtMBB->addSuccessor(TgtMBB);
@@ -283,9 +385,9 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
// instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an
// pseudo-instruction wrapping BGEZAL).
const unsigned BalOp =
- Subtarget.hasMips32r6()
- ? Subtarget.inMicroMipsMode() ? Mips::BALC_MMR6 : Mips::BALC
- : Mips::BAL_BR;
+ STI->hasMips32r6()
+ ? STI->inMicroMipsMode() ? Mips::BALC_MMR6 : Mips::BALC
+ : STI->inMicroMipsMode() ? Mips::BAL_BR_MM : Mips::BAL_BR;
if (!ABI.IsN64()) {
// Pre R6:
@@ -320,9 +422,12 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = LongBrMBB->begin();
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
- .addReg(Mips::SP).addImm(-8);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
- .addReg(Mips::SP).addImm(0);
+ .addReg(Mips::SP)
+ .addImm(-8);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW))
+ .addReg(Mips::RA)
+ .addReg(Mips::SP)
+ .addImm(0);
// LUi and ADDiu instructions create 32-bit offset of the target basic
// block from the target of BAL(C) instruction. We cannot use immediate
@@ -341,16 +446,17 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
// operands to lowered instructions.
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
- .addMBB(TgtMBB).addMBB(BalTgtMBB);
+ .addMBB(TgtMBB, MipsII::MO_ABS_HI)
+ .addMBB(BalTgtMBB);
MachineInstrBuilder BalInstr =
- BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
+ BuildMI(*MFp, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
MachineInstrBuilder ADDiuInstr =
- BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+ BuildMI(*MFp, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
.addReg(Mips::AT)
- .addMBB(TgtMBB)
+ .addMBB(TgtMBB, MipsII::MO_ABS_LO)
.addMBB(BalTgtMBB);
- if (Subtarget.hasMips32r6()) {
+ if (STI->hasMips32r6()) {
LongBrMBB->insert(Pos, ADDiuInstr);
LongBrMBB->insert(Pos, BalInstr);
} else {
@@ -362,35 +468,38 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = BalTgtMBB->begin();
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
- .addReg(Mips::RA).addReg(Mips::AT);
+ .addReg(Mips::RA)
+ .addReg(Mips::AT);
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
- .addReg(Mips::SP).addImm(0);
- if (Subtarget.isTargetNaCl())
+ .addReg(Mips::SP)
+ .addImm(0);
+ if (STI->isTargetNaCl())
// Bundle-align the target of indirect branch JR.
TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
// In NaCl, modifying the sp is not allowed in branch delay slot.
// For MIPS32R6, we can skip using a delay slot branch.
- if (Subtarget.isTargetNaCl() ||
- (Subtarget.hasMips32r6() && !Subtarget.useIndirectJumpsHazard()))
+ if (STI->isTargetNaCl() ||
+ (STI->hasMips32r6() && !STI->useIndirectJumpsHazard()))
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
- .addReg(Mips::SP).addImm(8);
+ .addReg(Mips::SP)
+ .addImm(8);
- if (Subtarget.hasMips32r6() && !Subtarget.useIndirectJumpsHazard()) {
+ if (STI->hasMips32r6() && !STI->useIndirectJumpsHazard()) {
const unsigned JICOp =
- Subtarget.inMicroMipsMode() ? Mips::JIC_MMR6 : Mips::JIC;
+ STI->inMicroMipsMode() ? Mips::JIC_MMR6 : Mips::JIC;
BuildMI(*BalTgtMBB, Pos, DL, TII->get(JICOp))
.addReg(Mips::AT)
.addImm(0);
} else {
unsigned JROp =
- Subtarget.useIndirectJumpsHazard()
- ? (Subtarget.hasMips32r6() ? Mips::JR_HB_R6 : Mips::JR_HB)
+ STI->useIndirectJumpsHazard()
+ ? (STI->hasMips32r6() ? Mips::JR_HB_R6 : Mips::JR_HB)
: Mips::JR;
BuildMI(*BalTgtMBB, Pos, DL, TII->get(JROp)).addReg(Mips::AT);
- if (Subtarget.isTargetNaCl()) {
+ if (STI->isTargetNaCl()) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::NOP));
} else
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
@@ -448,23 +557,29 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = LongBrMBB->begin();
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
- .addReg(Mips::SP_64).addImm(-16);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
- .addReg(Mips::SP_64).addImm(0);
+ .addReg(Mips::SP_64)
+ .addImm(-16);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD))
+ .addReg(Mips::RA_64)
+ .addReg(Mips::SP_64)
+ .addImm(0);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
- Mips::AT_64).addReg(Mips::ZERO_64)
- .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB);
+ Mips::AT_64)
+ .addReg(Mips::ZERO_64)
+ .addMBB(TgtMBB, MipsII::MO_ABS_HI)
+ .addMBB(BalTgtMBB);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
- .addReg(Mips::AT_64).addImm(16);
+ .addReg(Mips::AT_64)
+ .addImm(16);
MachineInstrBuilder BalInstr =
- BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
+ BuildMI(*MFp, DL, TII->get(BalOp)).addMBB(BalTgtMBB);
MachineInstrBuilder DADDiuInstr =
- BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
+ BuildMI(*MFp, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
.addReg(Mips::AT_64)
.addMBB(TgtMBB, MipsII::MO_ABS_LO)
.addMBB(BalTgtMBB);
- if (Subtarget.hasMips32r6()) {
+ if (STI->hasMips32r6()) {
LongBrMBB->insert(Pos, DADDiuInstr);
LongBrMBB->insert(Pos, BalInstr);
} else {
@@ -476,11 +591,13 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
Pos = BalTgtMBB->begin();
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
- .addReg(Mips::RA_64).addReg(Mips::AT_64);
+ .addReg(Mips::RA_64)
+ .addReg(Mips::AT_64);
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
- .addReg(Mips::SP_64).addImm(0);
+ .addReg(Mips::SP_64)
+ .addImm(0);
- if (Subtarget.hasMips64r6() && !Subtarget.useIndirectJumpsHazard()) {
+ if (STI->hasMips64r6() && !STI->useIndirectJumpsHazard()) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
.addReg(Mips::SP_64)
.addImm(16);
@@ -489,8 +606,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
.addImm(0);
} else {
unsigned JROp =
- Subtarget.useIndirectJumpsHazard()
- ? (Subtarget.hasMips32r6() ? Mips::JR_HB64_R6 : Mips::JR_HB64)
+ STI->useIndirectJumpsHazard()
+ ? (STI->hasMips32r6() ? Mips::JR_HB64_R6 : Mips::JR_HB64)
: Mips::JR64;
BuildMI(*BalTgtMBB, Pos, DL, TII->get(JROp)).addReg(Mips::AT_64);
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
@@ -510,14 +627,14 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
//
Pos = LongBrMBB->begin();
LongBrMBB->addSuccessor(TgtMBB);
- if (Subtarget.hasMips32r6())
+ if (STI->hasMips32r6())
BuildMI(*LongBrMBB, Pos, DL,
- TII->get(Subtarget.inMicroMipsMode() ? Mips::BC_MMR6 : Mips::BC))
+ TII->get(STI->inMicroMipsMode() ? Mips::BC_MMR6 : Mips::BC))
.addMBB(TgtMBB);
else
MIBundleBuilder(*LongBrMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::J)).addMBB(TgtMBB))
- .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+ .append(BuildMI(*MFp, DL, TII->get(Mips::J)).addMBB(TgtMBB))
+ .append(BuildMI(*MFp, DL, TII->get(Mips::NOP)));
assert(LongBrMBB->size() == LongBranchSeqSize);
}
@@ -537,35 +654,66 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL = MBB.findDebugLoc(MBB.begin());
BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0)
- .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0)
- .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+ .addReg(Mips::V0)
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
MBB.removeLiveIn(Mips::V0);
}
-bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
- const MipsSubtarget &STI =
- static_cast<const MipsSubtarget &>(F.getSubtarget());
- const MipsInstrInfo *TII =
- static_cast<const MipsInstrInfo *>(STI.getInstrInfo());
+bool MipsBranchExpansion::handleForbiddenSlot() {
+ // Forbidden slot hazards are defined for MIPSR6, but not for microMIPSR6.
+ if (!STI->hasMips32r6() || STI->inMicroMipsMode())
+ return false;
- const TargetMachine& TM = F.getTarget();
- IsPIC = TM.isPositionIndependent();
- ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+ const MipsInstrInfo *TII = STI->getInstrInfo();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator FI = MFp->begin(); FI != MFp->end(); ++FI) {
+ for (Iter I = FI->begin(); I != FI->end(); ++I) {
+
+ // Forbidden slot hazard handling. Use lookahead over state.
+ if (!TII->HasForbiddenSlot(*I))
+ continue;
+
+ Iter Inst;
+ bool LastInstInFunction =
+ std::next(I) == FI->end() && std::next(FI) == MFp->end();
+ if (!LastInstInFunction) {
+ std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI);
+ LastInstInFunction |= Res.second;
+ Inst = Res.first;
+ }
+
+ if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
+
+ MachineBasicBlock::instr_iterator Iit = I->getIterator();
+ if (std::next(Iit) == FI->end() ||
+ std::next(Iit)->getOpcode() != Mips::NOP) {
+ Changed = true;
+ MIBundleBuilder(&*I).append(
+ BuildMI(*MFp, I->getDebugLoc(), TII->get(Mips::NOP)));
+ NumInsertedNops++;
+ }
+ }
+ }
+ }
- LongBranchSeqSize = IsPIC ? ((ABI.IsN64() || STI.isTargetNaCl()) ? 10 : 9)
- : (STI.hasMips32r6() ? 1 : 2);
+ return Changed;
+}
- if (STI.inMips16Mode() || !STI.enableLongBranchPass())
+bool MipsBranchExpansion::handlePossibleLongBranch() {
+
+ LongBranchSeqSize = IsPIC ? ((ABI.IsN64() || STI->isTargetNaCl()) ? 10 : 9)
+ : (STI->hasMips32r6() ? 1 : 2);
+
+ if (STI->inMips16Mode() || !STI->enableLongBranchPass())
return false;
- if (IsPIC && static_cast<const MipsTargetMachine &>(TM).getABI().IsO32() &&
- F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
- emitGPDisp(F, TII);
if (SkipLongBranch)
- return true;
+ return false;
- MF = &F;
initMBBInfo();
SmallVectorImpl<MBBInfo>::iterator I, E = MBBInfos.end();
@@ -580,10 +728,9 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
if (!I->Br || I->HasLongBranch)
continue;
- int ShVal = STI.inMicroMipsMode() ? 2 : 4;
- int64_t Offset = computeOffset(I->Br) / ShVal;
+ int64_t Offset = computeOffset(I->Br);
- if (STI.isTargetNaCl()) {
+ if (STI->isTargetNaCl()) {
// The offset calculation does not include sandboxing instructions
// that will be added later in the MC layer. Since at this point we
// don't know the exact amount of code that "sandboxing" will add, we
@@ -591,8 +738,9 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
Offset *= 2;
}
- // Check if offset fits into 16-bit immediate field of branches.
- if (!ForceLongBranch && isInt<16>(Offset))
+ // Check if offset fits into the immediate field of the branch.
+ if (!ForceLongBranchFirstPass &&
+ TII->isBranchOffsetInRange(I->Br->getOpcode(), Offset))
continue;
I->HasLongBranch = true;
@@ -602,27 +750,49 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
}
}
- if (!EverMadeChange)
- return true;
-
- // Compute basic block addresses.
- if (IsPIC) {
- uint64_t Address = 0;
+ ForceLongBranchFirstPass = false;
- for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
- I->Address = Address;
- }
+ if (!EverMadeChange)
+ return false;
// Do the expansion.
for (I = MBBInfos.begin(); I != E; ++I)
- if (I->HasLongBranch)
+ if (I->HasLongBranch) {
expandToLongBranch(*I);
+ }
- MF->RenumberBlocks();
+ MFp->RenumberBlocks();
return true;
}
-/// createMipsLongBranchPass - Returns a pass that converts branches to long
-/// branches.
-FunctionPass *llvm::createMipsLongBranchPass() { return new MipsLongBranch(); }
+bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ IsPIC = TM.isPositionIndependent();
+ ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+ STI = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+ TII = static_cast<const MipsInstrInfo *>(STI->getInstrInfo());
+
+ if (IsPIC && ABI.IsO32() &&
+ MF.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
+ emitGPDisp(MF, TII);
+
+ MFp = &MF;
+
+ ForceLongBranchFirstPass = ForceLongBranch;
+ // Run these two at least once
+ bool longBranchChanged = handlePossibleLongBranch();
+ bool forbiddenSlotChanged = handleForbiddenSlot();
+
+ bool Changed = longBranchChanged || forbiddenSlotChanged;
+
+ // Then run them alternately while there are changes.
+ while (forbiddenSlotChanged) {
+ longBranchChanged = handlePossibleLongBranch();
+ if (!longBranchChanged)
+ break;
+ forbiddenSlotChanged = handleForbiddenSlot();
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp
new file mode 100644
index 000000000000..e82f62260b3f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -0,0 +1,441 @@
+//===- MipsCallLowering.cpp -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsCallLowering.h"
+#include "MipsCCState.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+using namespace llvm;
+
+MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+bool MipsCallLowering::MipsHandler::assign(const CCValAssign &VA,
+ unsigned vreg) {
+ if (VA.isRegLoc()) {
+ assignValueToReg(vreg, VA.getLocReg());
+ } else if (VA.isMemLoc()) {
+ unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
+ unsigned Offset = VA.getLocMemOffset();
+ MachinePointerInfo MPO;
+ unsigned StackAddr = getStackAddress(Size, Offset, MPO);
+ assignValueToAddress(vreg, StackAddr, Size, MPO);
+ } else {
+ return false;
+ }
+ return true;
+}
+
+namespace {
+class IncomingValueHandler : public MipsCallLowering::MipsHandler {
+public:
+ IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : MipsHandler(MIRBuilder, MRI) {}
+
+ bool handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args);
+
+private:
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg) override;
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override;
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO) override;
+
+ virtual void markPhysRegUsed(unsigned PhysReg) {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ }
+
+ void buildLoad(unsigned Val, unsigned Addr, uint64_t Size, unsigned Alignment,
+ MachinePointerInfo &MPO) {
+ MachineMemOperand *MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad, Size, Alignment);
+ MIRBuilder.buildLoad(Val, Addr, *MMO);
+ }
+};
+
+class CallReturnHandler : public IncomingValueHandler {
+public:
+ CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB)
+ : IncomingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+private:
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIB.addDef(PhysReg, RegState::Implicit);
+ }
+
+ MachineInstrBuilder &MIB;
+};
+
+} // end anonymous namespace
+
+void IncomingValueHandler::assignValueToReg(unsigned ValVReg,
+ unsigned PhysReg) {
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ markPhysRegUsed(PhysReg);
+}
+
+unsigned IncomingValueHandler::getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) {
+ MachineFrameInfo &MFI = MIRBuilder.getMF().getFrameInfo();
+
+ int FI = MFI.CreateFixedObject(Size, Offset, true);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 32));
+ MIRBuilder.buildFrameIndex(AddrReg, FI);
+
+ return AddrReg;
+}
+
+void IncomingValueHandler::assignValueToAddress(unsigned ValVReg, unsigned Addr,
+ uint64_t Size,
+ MachinePointerInfo &MPO) {
+ // If the value is not extended, a simple load will suffice.
+ buildLoad(ValVReg, Addr, Size, /* Alignment */ 0, MPO);
+}
+
+bool IncomingValueHandler::handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args) {
+ for (unsigned i = 0, ArgsSize = Args.size(); i < ArgsSize; ++i) {
+ if (!assign(ArgLocs[i], Args[i].Reg))
+ return false;
+ }
+ return true;
+}
+
+namespace {
+class OutgoingValueHandler : public MipsCallLowering::MipsHandler {
+public:
+ OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB)
+ : MipsHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+ bool handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args);
+
+private:
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg) override;
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override;
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO) override;
+
+ MachineInstrBuilder &MIB;
+};
+} // end anonymous namespace
+
+void OutgoingValueHandler::assignValueToReg(unsigned ValVReg,
+ unsigned PhysReg) {
+ MIRBuilder.buildCopy(PhysReg, ValVReg);
+ MIB.addUse(PhysReg, RegState::Implicit);
+}
+
+unsigned OutgoingValueHandler::getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) {
+ LLT p0 = LLT::pointer(0, 32);
+ LLT s32 = LLT::scalar(32);
+ unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, Mips::SP);
+
+ unsigned OffsetReg = MRI.createGenericVirtualRegister(s32);
+ MIRBuilder.buildConstant(OffsetReg, Offset);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+ MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ return AddrReg;
+}
+
+void OutgoingValueHandler::assignValueToAddress(unsigned ValVReg, unsigned Addr,
+ uint64_t Size,
+ MachinePointerInfo &MPO) {
+ MachineMemOperand *MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, Size, /* Alignment */ 0);
+ MIRBuilder.buildStore(ValVReg, Addr, *MMO);
+}
+
+bool OutgoingValueHandler::handle(ArrayRef<CCValAssign> ArgLocs,
+ ArrayRef<CallLowering::ArgInfo> Args) {
+ for (unsigned i = 0; i < Args.size(); ++i) {
+ if (!assign(ArgLocs[i], Args[i].Reg))
+ return false;
+ }
+ return true;
+}
+
+static bool isSupportedType(Type *T) {
+ if (T->isIntegerTy() && T->getScalarSizeInBits() == 32)
+ return true;
+ if (T->isPointerTy())
+ return true;
+ return false;
+}
+
+bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+
+ MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA);
+
+ if (Val != nullptr) {
+ if (!isSupportedType(Val->getType()))
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = MF.getDataLayout();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+
+ SmallVector<ArgInfo, 8> RetInfos;
+ SmallVector<unsigned, 8> OrigArgIndices;
+
+ ArgInfo ArgRetInfo(VReg, Val->getType());
+ setArgFlags(ArgRetInfo, AttributeList::ReturnIndex, DL, F);
+ splitToValueTypes(ArgRetInfo, 0, RetInfos, OrigArgIndices);
+
+ SmallVector<ISD::OutputArg, 8> Outs;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, RetInfos, OrigArgIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used,
+ unsigned origIdx, unsigned partOffs) {
+ Outs.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+ CCInfo.AnalyzeReturn(Outs, TLI.CCAssignFnForReturn());
+
+ OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
+ if (!RetHandler.handle(ArgLocs, RetInfos)) {
+ return false;
+ }
+ }
+ MIRBuilder.insertInstr(Ret);
+ return true;
+}
+
+bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<unsigned> VRegs) const {
+
+ // Quick exit if there aren't any args.
+ if (F.arg_empty())
+ return true;
+
+ if (F.isVarArg()) {
+ return false;
+ }
+
+ for (auto &Arg : F.args()) {
+ if (!isSupportedType(Arg.getType()))
+ return false;
+ }
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const DataLayout &DL = MF.getDataLayout();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+
+ SmallVector<ArgInfo, 8> ArgInfos;
+ SmallVector<unsigned, 8> OrigArgIndices;
+ unsigned i = 0;
+ for (auto &Arg : F.args()) {
+ ArgInfo AInfo(VRegs[i], Arg.getType());
+ setArgFlags(AInfo, i + AttributeList::FirstArgIndex, DL, F);
+ splitToValueTypes(AInfo, i, ArgInfos, OrigArgIndices);
+ ++i;
+ }
+
+ SmallVector<ISD::InputArg, 8> Ins;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, ArgInfos, OrigArgIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used, unsigned origIdx,
+ unsigned partOffs) {
+ Ins.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+
+ const MipsTargetMachine &TM =
+ static_cast<const MipsTargetMachine &>(MF.getTarget());
+ const MipsABIInfo &ABI = TM.getABI();
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()),
+ 1);
+ CCInfo.AnalyzeFormalArguments(Ins, TLI.CCAssignFnForCall());
+
+ IncomingValueHandler Handler(MIRBuilder, MF.getRegInfo());
+ if (!Handler.handle(ArgLocs, ArgInfos))
+ return false;
+
+ return true;
+}
+
+bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
+ const MachineOperand &Callee,
+ const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const {
+
+ if (CallConv != CallingConv::C)
+ return false;
+
+ for (auto &Arg : OrigArgs) {
+ if (!isSupportedType(Arg.Ty))
+ return false;
+ if (Arg.Flags.isByVal() || Arg.Flags.isSRet())
+ return false;
+ }
+ if (OrigRet.Reg && !isSupportedType(OrigRet.Ty))
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+ const MipsTargetMachine &TM =
+ static_cast<const MipsTargetMachine &>(MF.getTarget());
+ const MipsABIInfo &ABI = TM.getABI();
+
+ MachineInstrBuilder CallSeqStart =
+ MIRBuilder.buildInstr(Mips::ADJCALLSTACKDOWN);
+
+ // FIXME: Add support for PIC calling sequences and long call sequences for
+ // O32, N32 and N64. First handle the case when Callee.isReg().
+ if (Callee.isReg())
+ return false;
+
+ MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert(Mips::JAL);
+ MIB.addDef(Mips::SP, RegState::Implicit);
+ MIB.add(Callee);
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv()));
+
+ TargetLowering::ArgListTy FuncOrigArgs;
+ FuncOrigArgs.reserve(OrigArgs.size());
+
+ SmallVector<ArgInfo, 8> ArgInfos;
+ SmallVector<unsigned, 8> OrigArgIndices;
+ unsigned i = 0;
+ for (auto &Arg : OrigArgs) {
+
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = Arg.Ty;
+ FuncOrigArgs.push_back(Entry);
+
+ splitToValueTypes(Arg, i, ArgInfos, OrigArgIndices);
+ ++i;
+ }
+
+ SmallVector<ISD::OutputArg, 8> Outs;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, ArgInfos, OrigArgIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used, unsigned origIdx,
+ unsigned partOffs) {
+ Outs.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 8> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+ const char *Call = Callee.isSymbol() ? Callee.getSymbolName() : nullptr;
+ CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call);
+
+ OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB);
+ if (!RetHandler.handle(ArgLocs, ArgInfos)) {
+ return false;
+ }
+
+ unsigned NextStackOffset = CCInfo.getNextStackOffset();
+ const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+ unsigned StackAlignment = TFL->getStackAlignment();
+ NextStackOffset = alignTo(NextStackOffset, StackAlignment);
+ CallSeqStart.addImm(NextStackOffset).addImm(0);
+
+ MIRBuilder.insertInstr(MIB);
+
+ if (OrigRet.Reg) {
+
+ ArgInfos.clear();
+ SmallVector<unsigned, 8> OrigRetIndices;
+
+ splitToValueTypes(OrigRet, 0, ArgInfos, OrigRetIndices);
+
+ SmallVector<ISD::InputArg, 8> Ins;
+ subTargetRegTypeForCallingConv(
+ MIRBuilder, ArgInfos, OrigRetIndices,
+ [&](ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used,
+ unsigned origIdx, unsigned partOffs) {
+ Ins.emplace_back(flags, vt, argvt, used, origIdx, partOffs);
+ });
+
+ SmallVector<CCValAssign, 8> ArgLocs;
+ MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ F.getContext());
+
+ CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), OrigRet.Ty, Call);
+
+ CallReturnHandler Handler(MIRBuilder, MF.getRegInfo(), MIB);
+ if (!Handler.handle(ArgLocs, ArgInfos))
+ return false;
+ }
+
+ MIRBuilder.buildInstr(Mips::ADJCALLSTACKUP).addImm(NextStackOffset).addImm(0);
+
+ return true;
+}
+
+void MipsCallLowering::subTargetRegTypeForCallingConv(
+ MachineIRBuilder &MIRBuilder, ArrayRef<ArgInfo> Args,
+ ArrayRef<unsigned> OrigArgIndices, const FunTy &PushBack) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+
+ unsigned ArgNo = 0;
+ for (auto &Arg : Args) {
+
+ EVT VT = TLI.getValueType(DL, Arg.Ty);
+ MVT RegisterVT = TLI.getRegisterTypeForCallingConv(F.getContext(), VT);
+
+ ISD::ArgFlagsTy Flags = Arg.Flags;
+ Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL));
+
+ PushBack(Flags, RegisterVT, VT, true, OrigArgIndices[ArgNo], 0);
+
+ ++ArgNo;
+ }
+}
+
+void MipsCallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, unsigned OriginalIndex,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const {
+
+ // TODO: perform structure and array splitting. For now we only deal with
+ // types that pass the isSupportedType check.
+ SplitArgs.push_back(OrigArg);
+ SplitArgsOrigIndices.push_back(OriginalIndex);
+}
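
Both value handlers above route every CCValAssign through the MipsHandler::assign helper that the header below declares; its definition sits earlier in this file, outside this hunk. As a rough sketch of what such a dispatcher does, assuming only the standard CCValAssign accessors (illustrative only, not the code added by this patch):

    // Illustrative only: register-assigned values go through assignValueToReg,
    // stack-assigned values through getStackAddress + assignValueToAddress.
    bool MipsCallLowering::MipsHandler::assign(const CCValAssign &VA,
                                               unsigned VReg) {
      if (VA.isRegLoc()) {
        assignValueToReg(VReg, VA.getLocReg());
        return true;
      }
      if (VA.isMemLoc()) {
        uint64_t Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
        MachinePointerInfo MPO;
        unsigned Addr = getStackAddress(Size, VA.getLocMemOffset(), MPO);
        assignValueToAddress(VReg, Addr, Size, MPO);
        return true;
      }
      return false;
    }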
diff --git a/contrib/llvm/lib/Target/Mips/MipsCallLowering.h b/contrib/llvm/lib/Target/Mips/MipsCallLowering.h
new file mode 100644
index 000000000000..e23c10cec563
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCallLowering.h
@@ -0,0 +1,86 @@
+//===- MipsCallLowering.h ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSCALLLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSCALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class MipsTargetLowering;
+
+class MipsCallLowering : public CallLowering {
+
+public:
+ class MipsHandler {
+ public:
+ MipsHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ : MIRBuilder(MIRBuilder), MRI(MRI) {}
+
+ virtual ~MipsHandler() = default;
+
+ protected:
+ bool assign(const CCValAssign &VA, unsigned vreg);
+
+ MachineIRBuilder &MIRBuilder;
+ MachineRegisterInfo &MRI;
+
+ private:
+ virtual unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) = 0;
+
+ virtual void assignValueToReg(unsigned ValVReg, unsigned PhysReg) = 0;
+
+ virtual void assignValueToAddress(unsigned ValVReg, unsigned Addr,
+ uint64_t Size,
+ MachinePointerInfo &MPO) = 0;
+ };
+
+ MipsCallLowering(const MipsTargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ unsigned VReg) const override;
+
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<unsigned> VRegs) const override;
+
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const override;
+
+private:
+ using FunTy =
+ std::function<void(ISD::ArgFlagsTy flags, EVT vt, EVT argvt, bool used,
+ unsigned origIdx, unsigned partOffs)>;
+
+ /// Based on the registers available on the target machine, split or extend
+ /// the type if needed; also change pointer types to the appropriate integer
+ /// type. The lambda fills in the info needed to tell MipsCCState how to
+ /// assign physical registers.
+ void subTargetRegTypeForCallingConv(MachineIRBuilder &MIRBuilder,
+ ArrayRef<ArgInfo> Args,
+ ArrayRef<unsigned> OrigArgIndices,
+ const FunTy &PushBack) const;
+
+ /// Split structures and arrays; save the original argument indices since
+ /// the Mips calling convention needs info about the original argument types.
+ void splitToValueTypes(const ArgInfo &OrigArg, unsigned OriginalIndex,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSCALLLOWERING_H
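
The class above only becomes reachable once the subtarget hands an instance of it to GlobalISel; that wiring is not part of this hunk. A sketch of the usual pattern follows, with the member and accessor names assumed rather than quoted from the patch:

    // Assumed wiring, for illustration only.
    #include "MipsCallLowering.h"
    #include <memory>

    namespace {
    struct GISelWiringSketch {
      std::unique_ptr<llvm::CallLowering> CallLoweringInfo;

      explicit GISelWiringSketch(const llvm::MipsTargetLowering &TLI)
          : CallLoweringInfo(new llvm::MipsCallLowering(TLI)) {}

      // The IRTranslator asks the subtarget for this hook when it lowers
      // calls, returns, and formal arguments.
      const llvm::CallLowering *getCallLowering() const {
        return CallLoweringInfo.get();
      }
    };
    } // end anonymous namespace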
diff --git a/contrib/llvm/lib/Target/Mips/MipsCondMov.td b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
index a0039d159248..39dc2654aa6a 100644
--- a/contrib/llvm/lib/Target/Mips/MipsCondMov.td
+++ b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
@@ -104,163 +104,162 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
}
// Instantiation of instructions.
-def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-let isCodeGenOnly = 1 in {
- def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
-}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ let isCodeGenOnly = 1 in {
+ def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ }
-def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
-let isCodeGenOnly = 1 in {
- def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
- def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
-}
+ let isCodeGenOnly = 1 in {
+ def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ }
+ def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ let isCodeGenOnly = 1 in
+ def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+ def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ let isCodeGenOnly = 1 in
+ def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+ def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+ def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM<19, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+ let DecoderNamespace = "MipsFP64" in {
+ def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ let isCodeGenOnly = 1 in {
+ def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ }
+ }
-def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
- CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
-let isCodeGenOnly = 1 in
-def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
- CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ let isCodeGenOnly = 1 in
+ def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
- CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
-let isCodeGenOnly = 1 in
-def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
- CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ let isCodeGenOnly = 1 in
+ def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
+ CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
+ def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
+ CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
-def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
- II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+ def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
- II_MOVN_D>, CMov_I_F_FM<19, 17>,
+ def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-let DecoderNamespace = "MipsFP64" in {
- def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
- CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
- CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- let isCodeGenOnly = 1 in {
- def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
- CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
- CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ let DecoderNamespace = "MipsFP64" in {
+ def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
+ CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
+ CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
}
-}
-def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
- CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-let isCodeGenOnly = 1 in
-def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
- CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-
-def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
- CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-let isCodeGenOnly = 1 in
-def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
- CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-
-def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
- CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
-def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
- CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
- MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
- MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-
-let DecoderNamespace = "MipsFP64" in {
- def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
- CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
- def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
- CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ // Instantiation of conditional move patterns.
+ defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+ defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+ defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+ defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+
+ defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+ defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+ defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+ defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+ defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+
+ defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+ defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
+ defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
+
+ defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+ defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+ defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
}
-
-// Instantiation of conditional move patterns.
-defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-
-defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-
-defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
-
-defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
-defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- GPR_64;
-
-defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
-defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_32;
-defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_32;
-
-defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
-defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
-defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_64;
-defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
- INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
-defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_64;
-defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
- FGR_64;
-
// For targets that don't have conditional-move instructions
// we have to match SELECT nodes with pseudo instructions.
let usesCustomInserter = 1 in {
diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
index a9abc171b423..9eb13a68e561 100644
--- a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
@@ -442,13 +443,15 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MCP = mf.getConstantPool();
STI = &static_cast<const MipsSubtarget &>(mf.getSubtarget());
- DEBUG(dbgs() << "constant island machine function " << "\n");
+ LLVM_DEBUG(dbgs() << "constant island machine function "
+ << "\n");
if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) {
return false;
}
TII = (const Mips16InstrInfo *)STI->getInstrInfo();
MFI = MF->getInfo<MipsFunctionInfo>();
- DEBUG(dbgs() << "constant island processing " << "\n");
+ LLVM_DEBUG(dbgs() << "constant island processing "
+ << "\n");
//
  // will need to make a predetermination if there are any constants we need to
  // put in constant islands. TBD.
@@ -479,7 +482,7 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// constant pool users.
initializeFunctionInfo(CPEMIs);
CPEMIs.clear();
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
/// Remove dead constant pool entries.
MadeChange |= removeUnusedCPEntries();
@@ -489,31 +492,31 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
unsigned NoCPIters = 0, NoBRIters = 0;
(void)NoBRIters;
while (true) {
- DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
CPChange |= handleConstantPoolUser(i);
if (CPChange && ++NoCPIters > 30)
report_fatal_error("Constant Island pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
// Clear NewWaterList now. If we split a block for branches, it should
// appear as "new water" for the next iteration of constant pool placement.
NewWaterList.clear();
- DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+ LLVM_DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
bool BRChange = false;
for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
BRChange |= fixupImmediateBr(ImmBranches[i]);
if (BRChange && ++NoBRIters > 30)
report_fatal_error("Branch Fix Up pass failed to converge!");
- DEBUG(dumpBBs());
+ LLVM_DEBUG(dumpBBs());
if (!CPChange && !BRChange)
break;
MadeChange = true;
}
- DEBUG(dbgs() << '\n'; dumpBBs());
+ LLVM_DEBUG(dbgs() << '\n'; dumpBBs());
BBInfo.clear();
WaterList.clear();
@@ -580,10 +583,10 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
// Add a new CPEntry, but no corresponding CPUser yet.
CPEntries.emplace_back(1, CPEntry(CPEMI, i));
++NumCPEs;
- DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
- << Size << ", align = " << Align <<'\n');
+ LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+ << Size << ", align = " << Align << '\n');
}
- DEBUG(BB->dump());
+ LLVM_DEBUG(BB->dump());
}
/// BBHasFallthrough - Return true if the specified basic block can fallthrough
@@ -660,7 +663,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
if (!BBHasFallthrough(&MBB))
WaterList.push_back(&MBB);
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
int Opc = MI.getOpcode();
@@ -986,7 +989,7 @@ bool MipsConstantIslands::isCPEntryInRange
unsigned CPEOffset = getOffsetOf(CPEMI);
if (DoDump) {
- DEBUG({
+ LLVM_DEBUG({
unsigned Block = MI->getParent()->getNumber();
const BasicBlockInfo &BBI = BBInfo[Block];
dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
@@ -1059,7 +1062,7 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
// Check to see if the CPE is already in-range.
if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
true)) {
- DEBUG(dbgs() << "In range\n");
+ LLVM_DEBUG(dbgs() << "In range\n");
return 1;
}
@@ -1075,8 +1078,8 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
continue;
if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
U.NegOk)) {
- DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -1113,7 +1116,7 @@ int MipsConstantIslands::findLongFormInRangeCPEntry
if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
U.getLongFormMaxDisp(), U.NegOk,
true)) {
- DEBUG(dbgs() << "In range\n");
+ LLVM_DEBUG(dbgs() << "In range\n");
UserMI->setDesc(TII->get(U.getLongFormOpcode()));
U.setMaxDisp(U.getLongFormMaxDisp());
return 2; // instruction is longer length now
@@ -1131,8 +1134,8 @@ int MipsConstantIslands::findLongFormInRangeCPEntry
continue;
if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
U.getLongFormMaxDisp(), U.NegOk)) {
- DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -1197,8 +1200,8 @@ bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// This is the least amount of required padding seen so far.
BestGrowth = Growth;
WaterIter = IP;
- DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
- << " Growth=" << Growth << '\n');
+ LLVM_DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
+ << " Growth=" << Growth << '\n');
// Keep looking unless it is perfect.
if (BestGrowth == 0)
@@ -1236,8 +1239,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
- DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
- << format(", expected CPE offset %#x\n", CPEOffset));
+ LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
+ << format(", expected CPE offset %#x\n", CPEOffset));
NewMBB = &*++UserMBB->getIterator();
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
@@ -1263,16 +1266,16 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned LogAlign = MF->getAlignment();
assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
- DEBUG(dbgs() << format("Split in middle of big block before %#x",
- BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
// The 4 in the following is for the unconditional branch we'll be inserting
// Alignment of the island is handled
// inside isOffsetInRange.
BaseInsertOffset -= 4;
- DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign << '\n');
+ LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << LogAlign << '\n');
// This could point off the end of the block if we've already got constant
// pool entries following this block; only the last one is in the water list.
@@ -1280,7 +1283,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
// long unconditional).
if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
BaseInsertOffset = UserBBI.postOffset() - 8;
- DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 +
CPEMI->getOperand(2).getImm();
@@ -1336,7 +1339,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
MachineBasicBlock *NewMBB;
water_iterator IP;
if (findAvailableWater(U, UserOffset, IP)) {
- DEBUG(dbgs() << "Found water in range\n");
+ LLVM_DEBUG(dbgs() << "Found water in range\n");
MachineBasicBlock *WaterBB = *IP;
// If the original WaterList entry was "new water" on this iteration,
@@ -1355,7 +1358,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
result = findLongFormInRangeCPEntry(U, UserOffset);
if (result != 0) return true;
}
- DEBUG(dbgs() << "No water found\n");
+ LLVM_DEBUG(dbgs() << "No water found\n");
createNewWater(CPUserIndex, UserOffset, NewMBB);
// splitBlockBeforeInstr adds to WaterList, which is important when it is
@@ -1414,8 +1417,9 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
break;
}
- DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
- << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+ LLVM_DEBUG(
+ dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
return true;
}
@@ -1470,11 +1474,11 @@ bool MipsConstantIslands::isBBInRange
unsigned BrOffset = getOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
- DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
- << " from " << printMBBReference(*MI->getParent())
- << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
- << " to " << DestOffset << " offset "
- << int(DestOffset - BrOffset) << "\t" << *MI);
+ LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
+ << " from " << printMBBReference(*MI->getParent())
+ << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
if (BrOffset <= DestOffset) {
// Branch before the Dest.
@@ -1539,7 +1543,7 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
HasFarJump = true;
++NumUBrFixed;
- DEBUG(dbgs() << " Changed B to long jump " << *MI);
+ LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI);
return true;
}
@@ -1594,8 +1598,9 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MachineBasicBlock *NewDest =
BMI->getOperand(BMITargetOperand).getMBB();
if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
- DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
- << *BMI);
+ LLVM_DEBUG(
+ dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
MI->setDesc(TII->get(OppositeBranchOpcode));
BMI->getOperand(BMITargetOperand).setMBB(DestBB);
MI->getOperand(TargetOperand).setMBB(NewDest);
@@ -1615,9 +1620,9 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
}
MachineBasicBlock *NextBB = &*++MBB->getIterator();
- DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
- << " also invert condition and change dest. to "
- << printMBBReference(*NextBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
+ << " also invert condition and change dest. to "
+ << printMBBReference(*NextBB) << "\n");
// Insert a new conditional branch and a new unconditional branch.
// Also update the ImmBranch as well as adding a new entry for the new branch.
@@ -1653,19 +1658,19 @@ void MipsConstantIslands::prescanForConstants() {
switch(I->getDesc().getOpcode()) {
case Mips::LwConstant32: {
PrescannedForConstants = true;
- DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n");
J = I->getNumOperands();
- DEBUG(dbgs() << "num operands " << J << "\n");
+ LLVM_DEBUG(dbgs() << "num operands " << J << "\n");
MachineOperand& Literal = I->getOperand(1);
if (Literal.isImm()) {
int64_t V = Literal.getImm();
- DEBUG(dbgs() << "literal " << V << "\n");
+ LLVM_DEBUG(dbgs() << "literal " << V << "\n");
Type *Int32Ty =
Type::getInt32Ty(MF->getFunction().getContext());
const Constant *C = ConstantInt::get(Int32Ty, V);
unsigned index = MCP->getConstantPoolIndex(C, 4);
I->getOperand(2).ChangeToImmediate(index);
- DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n");
I->setDesc(TII->get(Mips::LwRxPcTcp16));
I->RemoveOperand(1);
I->RemoveOperand(1);
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
index 2dcefdc789a5..5f0763f5ea46 100644
--- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -29,11 +29,11 @@ def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">,
AssemblerPredicate<"FeatureDSPR3">;
class ISA_DSPR2 {
- list<Predicate> InsnPredicates = [HasDSPR2];
+ list<Predicate> ASEPredicate = [HasDSPR2];
}
class ISA_DSPR3 {
- list<Predicate> InsnPredicates = [HasDSPR3];
+ list<Predicate> ASEPredicate = [HasDSPR3];
}
// Fields.
@@ -45,8 +45,8 @@ def SPECIAL3_OPCODE : Field6<0b011111>;
def REGIMM_OPCODE : Field6<0b000001>;
class DSPInst<string opstr = "">
- : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
- let InsnPredicates = [HasDSP];
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+ let ASEPredicate = [HasDSP];
string BaseOpcode = opstr;
string Arch = "dsp";
}
@@ -54,12 +54,12 @@ class DSPInst<string opstr = "">
class PseudoDSP<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo>
: MipsPseudo<outs, ins, pattern, itin> {
- let InsnPredicates = [HasDSP];
+ let ASEPredicate = [HasDSP];
}
class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit>, PredicateControl {
- let InsnPredicates = [HasDSP];
+ let ASEPredicate = [HasDSP];
}
// ADDU.QB sub-class format.
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
index 871135e3a22b..b9824220b558 100644
--- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -447,6 +447,7 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -457,6 +458,7 @@ class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
@@ -500,6 +502,7 @@ class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
@@ -508,6 +511,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin>
string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
+ bit isMoveReg = 1;
}
class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
@@ -1285,7 +1289,7 @@ let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
}
let DecoderNamespace = "MipsDSP", Arch = "dsp",
- AdditionalPredicates = [HasDSP] in {
+ ASEPredicate = [HasDSP] in {
def LWDSP : Load<"lw", DSPROpnd, null_frag, II_LW>, DspMMRel, LW_FM<0x23>;
def SWDSP : Store<"sw", DSPROpnd, null_frag, II_SW>, DspMMRel, LW_FM<0x2b>;
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index e06b57e41834..33f03b954a8c 100644
--- a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -51,7 +51,7 @@
using namespace llvm;
-#define DEBUG_TYPE "delay-slot-filler"
+#define DEBUG_TYPE "mips-delay-slot-filler"
STATISTIC(FilledSlots, "Number of delay slots filled");
STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
@@ -210,9 +210,11 @@ namespace {
bool SeenNoObjStore = false;
};
- class Filler : public MachineFunctionPass {
+ class MipsDelaySlotFiller : public MachineFunctionPass {
public:
- Filler() : MachineFunctionPass(ID) {}
+ MipsDelaySlotFiller() : MachineFunctionPass(ID) {
+ initializeMipsDelaySlotFillerPass(*PassRegistry::getPassRegistry());
+ }
StringRef getPassName() const override { return "Mips Delay Slot Filler"; }
@@ -242,6 +244,8 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
+ static char ID;
+
private:
bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
@@ -292,18 +296,19 @@ namespace {
bool terminateSearch(const MachineInstr &Candidate) const;
const TargetMachine *TM = nullptr;
-
- static char ID;
};
} // end anonymous namespace
-char Filler::ID = 0;
+char MipsDelaySlotFiller::ID = 0;
static bool hasUnoccupiedSlot(const MachineInstr *MI) {
return MI->hasDelaySlot() && !MI->isBundledWithSucc();
}
+INITIALIZE_PASS(MipsDelaySlotFiller, DEBUG_TYPE,
+ "Fill delay slot for MIPS", false, false)
+
/// This function inserts clones of Filler into predecessor blocks.
static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
MachineFunction *MF = Filler->getParent()->getParent();
@@ -551,8 +556,9 @@ getUnderlyingObjects(const MachineInstr &MI,
}
// Replace Branch with the compact branch instruction.
-Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
- const DebugLoc &DL) {
+Iter MipsDelaySlotFiller::replaceWithCompactBranch(MachineBasicBlock &MBB,
+ Iter Branch,
+ const DebugLoc &DL) {
const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
const MipsInstrInfo *TII = STI.getInstrInfo();
@@ -575,6 +581,7 @@ static int getEquivalentCallShort(int Opcode) {
case Mips::BLTZAL:
return Mips::BLTZALS_MM;
case Mips::JAL:
+ case Mips::JAL_MM:
return Mips::JALS_MM;
case Mips::JALR:
return Mips::JALRS_MM;
@@ -591,7 +598,7 @@ static int getEquivalentCallShort(int Opcode) {
/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
/// We assume there is only one delay slot per delayed instruction.
-bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+bool MipsDelaySlotFiller::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
bool InMicroMipsMode = STI.inMicroMipsMode();
@@ -632,7 +639,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
// TODO: Implement an instruction mapping table of 16bit opcodes to
// 32bit opcodes so that an instruction can be expanded. This would
// save 16 bits as a TAILCALL_MM pseudo requires a fullsized nop.
- // TODO: Permit b16 when branching backwards to the the same function
+ // TODO: Permit b16 when branching backwards to the same function
// if it is in range.
DSI->setDesc(TII->get(getEquivalentCallShort(DSI->getOpcode())));
}
@@ -669,16 +676,17 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
return Changed;
}
-template<typename IterTy>
-bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
- RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot,
- IterTy &Filler) const {
+template <typename IterTy>
+bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin,
+ IterTy End, RegDefsUses &RegDU,
+ InspectMemInstr &IM, Iter Slot,
+ IterTy &Filler) const {
for (IterTy I = Begin; I != End;) {
IterTy CurrI = I;
++I;
// skip debug value
- if (CurrI->isDebugValue())
+ if (CurrI->isDebugInstr())
continue;
if (terminateSearch(*CurrI))
@@ -720,6 +728,10 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
(Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
Opcode == Mips::PseudoReturn || Opcode == Mips::TAILCALL))
continue;
+ // Instructions LWP/SWP should not be in a delay slot as that
+ // results in unpredictable behaviour.
+ if (InMicroMipsMode && (Opcode == Mips::LWP_MM || Opcode == Mips::SWP_MM))
+ continue;
Filler = CurrI;
return true;
@@ -728,7 +740,8 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
return false;
}
-bool Filler::searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const {
+bool MipsDelaySlotFiller::searchBackward(MachineBasicBlock &MBB,
+ MachineInstr &Slot) const {
if (DisableBackwardSearch)
return false;
@@ -750,7 +763,8 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const {
return true;
}
-bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
+bool MipsDelaySlotFiller::searchForward(MachineBasicBlock &MBB,
+ Iter Slot) const {
// Can handle only calls.
if (DisableForwardSearch || !Slot->isCall())
return false;
@@ -770,7 +784,8 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
return true;
}
-bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
+bool MipsDelaySlotFiller::searchSuccBBs(MachineBasicBlock &MBB,
+ Iter Slot) const {
if (DisableSuccBBSearch)
return false;
@@ -816,7 +831,8 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
return true;
}
-MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
+MachineBasicBlock *
+MipsDelaySlotFiller::selectSuccBB(MachineBasicBlock &B) const {
if (B.succ_empty())
return nullptr;
@@ -832,7 +848,8 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
}
std::pair<MipsInstrInfo::BranchType, MachineInstr *>
-Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
+MipsDelaySlotFiller::getBranch(MachineBasicBlock &MBB,
+ const MachineBasicBlock &Dst) const {
const MipsInstrInfo *TII =
MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
@@ -867,11 +884,13 @@ Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
return std::make_pair(MipsInstrInfo::BT_None, nullptr);
}
-bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
- RegDefsUses &RegDU, bool &HasMultipleSuccs,
- BB2BrMap &BrMap) const {
+bool MipsDelaySlotFiller::examinePred(MachineBasicBlock &Pred,
+ const MachineBasicBlock &Succ,
+ RegDefsUses &RegDU,
+ bool &HasMultipleSuccs,
+ BB2BrMap &BrMap) const {
std::pair<MipsInstrInfo::BranchType, MachineInstr *> P =
- getBranch(Pred, Succ);
+ getBranch(Pred, Succ);
// Return if either getBranch wasn't able to analyze the branches or there
// were no branches with unoccupied slots.
@@ -888,8 +907,9 @@ bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
return true;
}
-bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
- InspectMemInstr &IM) const {
+bool MipsDelaySlotFiller::delayHasHazard(const MachineInstr &Candidate,
+ RegDefsUses &RegDU,
+ InspectMemInstr &IM) const {
assert(!Candidate.isKill() &&
"KILL instructions should have been eliminated at this point.");
@@ -901,7 +921,7 @@ bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
return HasHazard;
}
-bool Filler::terminateSearch(const MachineInstr &Candidate) const {
+bool MipsDelaySlotFiller::terminateSearch(const MachineInstr &Candidate) const {
return (Candidate.isTerminator() || Candidate.isCall() ||
Candidate.isPosition() || Candidate.isInlineAsm() ||
Candidate.hasUnmodeledSideEffects());
@@ -909,4 +929,4 @@ bool Filler::terminateSearch(const MachineInstr &Candidate) const {
/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
/// slots in Mips MachineFunctions
-FunctionPass *llvm::createMipsDelaySlotFillerPass() { return new Filler(); }
+FunctionPass *llvm::createMipsDelaySlotFillerPass() {
+ return new MipsDelaySlotFiller();
+}
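
Renaming the anonymous Filler class to MipsDelaySlotFiller and adding INITIALIZE_PASS gives the pass a stable identity in the PassRegistry, so generic machinery can refer to it by its "mips-delay-slot-filler" name. A sketch of the same registration pattern, using illustrative names rather than the ones in this patch:

    #include "llvm/CodeGen/MachineFunctionPass.h"
    #include "llvm/Pass.h"

    namespace llvm {
    class PassRegistry;
    void initializeExampleDelayFillerPass(PassRegistry &);
    } // end namespace llvm

    using namespace llvm;

    namespace {
    class ExampleDelayFiller : public MachineFunctionPass {
    public:
      static char ID;

      ExampleDelayFiller() : MachineFunctionPass(ID) {
        initializeExampleDelayFillerPass(*PassRegistry::getPassRegistry());
      }

      StringRef getPassName() const override { return "Example delay filler"; }

      bool runOnMachineFunction(MachineFunction &MF) override { return false; }
    };
    } // end anonymous namespace

    char ExampleDelayFiller::ID = 0;

    INITIALIZE_PASS(ExampleDelayFiller, "example-delay-filler",
                    "Example delay filler", false, false)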
diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td
index 8c3024810d27..61785d0e891a 100644
--- a/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl, StdArch {
+ StdArch {
let DecoderNamespace = "Mips";
let EncodingPredicates = [HasStdEnc];
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
index 26df263d228b..ff54b1f17877 100644
--- a/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -59,6 +59,7 @@ class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
bit canFoldAsLoad = 1;
+ string BaseOpcode = instr_asm;
bit mayLoad = 1;
InstrItinClass Itinerary = itin;
}
@@ -77,6 +78,7 @@ class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
+ string BaseOpcode = instr_asm;
bit mayStore = 1;
InstrItinClass Itinerary = itin;
}
@@ -93,13 +95,16 @@ class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
+ string BaseOpcode = instr_asm;
string Constraints = "$src = $rt";
bit canFoldAsLoad = 1;
InstrItinClass Itinerary = itin;
+ bit mayLoad = 1;
+ bit mayStore = 0;
}
-class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd, II_LWLE>;
-class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd, II_LWRE>;
+class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd, II_LWLE>;
+class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd, II_LWRE>;
class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass itin = NoItinerary> {
@@ -108,11 +113,14 @@ class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
+ string BaseOpcode = instr_asm;
InstrItinClass Itinerary = itin;
+ bit mayLoad = 0;
+ bit mayStore = 1;
}
-class SWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd, II_SWLE>;
-class SWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd, II_SWRE>;
+class SWLE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd, II_SWLE>;
+class SWRE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd, II_SWRE>;
// Load-linked EVA, Store-conditional EVA descriptions
class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
@@ -121,6 +129,7 @@ class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
dag InOperandList = (ins mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
+ string BaseOpcode = instr_asm;
bit mayLoad = 1;
string DecoderMethod = "DecodeMemEVA";
InstrItinClass Itinerary = itin;
@@ -134,6 +143,7 @@ class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
+ string BaseOpcode = instr_asm;
bit mayStore = 1;
string Constraints = "$rt = $dst";
string DecoderMethod = "DecodeMemEVA";
@@ -159,6 +169,7 @@ class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd,
dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
list<dag> Pattern = [];
+ string BaseOpcode = instr_asm;
string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
InstrItinClass Itinerary = itin;
}
@@ -172,38 +183,32 @@ class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem_simm9, II_PREFE>;
//
//===----------------------------------------------------------------------===//
-/// Load and Store EVA Instructions
-def LBE : LBE_ENC, LBE_DESC, INSN_EVA;
-def LBuE : LBuE_ENC, LBuE_DESC, INSN_EVA;
-def LHE : LHE_ENC, LHE_DESC, INSN_EVA;
-def LHuE : LHuE_ENC, LHuE_DESC, INSN_EVA;
-let AdditionalPredicates = [NotInMicroMips] in {
-def LWE : LWE_ENC, LWE_DESC, INSN_EVA;
-}
-def SBE : SBE_ENC, SBE_DESC, INSN_EVA;
-def SHE : SHE_ENC, SHE_DESC, INSN_EVA;
-let AdditionalPredicates = [NotInMicroMips] in {
-def SWE : SWE_ENC, SWE_DESC, INSN_EVA;
-}
-
-/// load/store left/right EVA
let AdditionalPredicates = [NotInMicroMips] in {
-def LWLE : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6;
-def LWRE : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6;
-def SWLE : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6;
-def SWRE : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+ /// Load and Store EVA Instructions
+ def LBE : MMRel, LBE_ENC, LBE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LBuE : MMRel, LBuE_ENC, LBuE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LHE : MMRel, LHE_ENC, LHE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LHuE : MMRel, LHuE_ENC, LHuE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def LWE : MMRel, LWE_ENC, LWE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SBE : MMRel, SBE_ENC, SBE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SHE : MMRel, SHE_ENC, SHE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SWE : MMRel, SWE_ENC, SWE_DESC, ISA_MIPS32R2, ASE_EVA;
+
+ /// load/store left/right EVA
+ def LWLE : MMRel, LWLE_ENC, LWLE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+ def LWRE : MMRel, LWRE_ENC, LWRE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+ def SWLE : MMRel, SWLE_ENC, SWLE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+ def SWRE : MMRel, SWRE_ENC, SWRE_DESC, ISA_MIPS32R2_NOT_32R6_64R6, ASE_EVA;
+
+ /// Load-linked EVA, Store-conditional EVA
+ def LLE : MMRel, LLE_ENC, LLE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def SCE : MMRel, SCE_ENC, SCE_DESC, ISA_MIPS32R2, ASE_EVA;
+
+ /// TLB invalidate instructions
+ def TLBINV : TLBINV_ENC, TLBINV_DESC, ISA_MIPS32R2, ASE_EVA;
+ def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, ISA_MIPS32R2, ASE_EVA;
+
+ /// EVA versions of cache and pref
+ def CACHEE : MMRel, CACHEE_ENC, CACHEE_DESC, ISA_MIPS32R2, ASE_EVA;
+ def PREFE : MMRel, PREFE_ENC, PREFE_DESC, ISA_MIPS32R2, ASE_EVA;
}
-
-/// Load-linked EVA, Store-conditional EVA
-let AdditionalPredicates = [NotInMicroMips] in {
-def LLE : LLE_ENC, LLE_DESC, INSN_EVA;
-def SCE : SCE_ENC, SCE_DESC, INSN_EVA;
-}
-
-let AdditionalPredicates = [NotInMicroMips] in {
- def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA;
- def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA;
-}
-
-def CACHEE : CACHEE_ENC, CACHEE_DESC, INSN_EVA;
-def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA;
diff --git a/contrib/llvm/lib/Target/Mips/MipsExpandPseudo.cpp b/contrib/llvm/lib/Target/Mips/MipsExpandPseudo.cpp
new file mode 100644
index 000000000000..acf66d1fb1b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsExpandPseudo.cpp
@@ -0,0 +1,702 @@
+//===-- MipsExpandPseudo.cpp - Expand pseudo instructions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, and other late
+// optimizations. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+// This is currently only used for expanding atomic pseudos after register
+// allocation. We do this to avoid the fast register allocator introducing
+// spills between ll and sc. These stores cause some MIPS implementations to
+// abort the atomic RMW sequence.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-pseudo"
+
+namespace {
+ class MipsExpandPseudo : public MachineFunctionPass {
+ public:
+ static char ID;
+ MipsExpandPseudo() : MachineFunctionPass(ID) {}
+
+ const MipsInstrInfo *TII;
+ const MipsSubtarget *STI;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "Mips pseudo instruction expansion pass";
+ }
+
+ private:
+ bool expandAtomicCmpSwap(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandAtomicCmpSwapSubword(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+
+ bool expandAtomicBinOp(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI, unsigned Size);
+ bool expandAtomicBinOpSubword(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI);
+
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NMBB);
+ bool expandMBB(MachineBasicBlock &MBB);
+ };
+ char MipsExpandPseudo::ID = 0;
+}
+
+bool MipsExpandPseudo::expandAtomicCmpSwapSubword(
+ MachineBasicBlock &BB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI) {
+
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+ unsigned LL, SC;
+
+ unsigned ZERO = Mips::ZERO;
+ unsigned BNE = Mips::BNE;
+ unsigned BEQ = Mips::BEQ;
+ unsigned SEOp =
+ I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I8_POSTRA ? Mips::SEB : Mips::SEH;
+
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BNE = STI->hasMips32r6() ? Mips::BNEC_MMR6 : Mips::BNE_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ unsigned Dest = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned Mask = I->getOperand(2).getReg();
+ unsigned ShiftCmpVal = I->getOperand(3).getReg();
+ unsigned Mask2 = I->getOperand(4).getReg();
+ unsigned ShiftNewVal = I->getOperand(5).getReg();
+ unsigned ShiftAmnt = I->getOperand(6).getReg();
+ unsigned Scratch = I->getOperand(7).getReg();
+ unsigned Scratch2 = I->getOperand(8).getReg();
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, sinkMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), &BB,
+ std::next(MachineBasicBlock::iterator(I)), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loop1MBB
+ BB.addSuccessor(loop1MBB, BranchProbability::getOne());
+ loop1MBB->addSuccessor(sinkMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop1MBB->normalizeSuccProbs();
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(sinkMBB);
+ loop2MBB->normalizeSuccProbs();
+ sinkMBB->addSuccessor(exitMBB, BranchProbability::getOne());
+
+  // loop1MBB:
+  //   ll scratch, 0(ptr)
+  //   and scratch2, scratch, mask
+  //   bne scratch2, ShiftCmpVal, sinkMBB
+ BuildMI(loop1MBB, DL, TII->get(LL), Scratch).addReg(Ptr).addImm(0);
+ BuildMI(loop1MBB, DL, TII->get(Mips::AND), Scratch2)
+ .addReg(Scratch)
+ .addReg(Mask);
+ BuildMI(loop1MBB, DL, TII->get(BNE))
+ .addReg(Scratch2).addReg(ShiftCmpVal).addMBB(sinkMBB);
+
+  // loop2MBB:
+  //   and scratch, scratch, mask2
+  //   or scratch, scratch, ShiftNewVal
+  //   sc scratch, 0(ptr)
+  //   beq scratch, $0, loop1MBB
+ BuildMI(loop2MBB, DL, TII->get(Mips::AND), Scratch)
+ .addReg(Scratch, RegState::Kill)
+ .addReg(Mask2);
+ BuildMI(loop2MBB, DL, TII->get(Mips::OR), Scratch)
+ .addReg(Scratch, RegState::Kill)
+ .addReg(ShiftNewVal);
+ BuildMI(loop2MBB, DL, TII->get(SC), Scratch)
+ .addReg(Scratch, RegState::Kill)
+ .addReg(Ptr)
+ .addImm(0);
+ BuildMI(loop2MBB, DL, TII->get(BEQ))
+ .addReg(Scratch, RegState::Kill)
+ .addReg(ZERO)
+ .addMBB(loop1MBB);
+
+  // sinkMBB:
+  //   srlv dest, scratch2, shiftamt
+  //   sign_extend dest, dest
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRLV), Dest)
+ .addReg(Scratch2)
+ .addReg(ShiftAmnt);
+ if (STI->hasMips32r2()) {
+ BuildMI(sinkMBB, DL, TII->get(SEOp), Dest).addReg(Dest);
+ } else {
+ const unsigned ShiftImm =
+ I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I16_POSTRA ? 16 : 24;
+ BuildMI(sinkMBB, DL, TII->get(Mips::SLL), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRA), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ }
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loop1MBB);
+ computeAndAddLiveIns(LiveRegs, *loop2MBB);
+ computeAndAddLiveIns(LiveRegs, *sinkMBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+ return true;
+}
+
+bool MipsExpandPseudo::expandAtomicCmpSwap(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI) {
+
+ const unsigned Size =
+ I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I32_POSTRA ? 4 : 8;
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+
+ unsigned LL, SC, ZERO, BNE, BEQ, MOVE;
+
+ if (Size == 4) {
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BNE = STI->hasMips32r6() ? Mips::BNEC_MMR6 : Mips::BNE_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ BNE = Mips::BNE;
+ BEQ = Mips::BEQ;
+ }
+
+ ZERO = Mips::ZERO;
+ MOVE = Mips::OR;
+ } else {
+ LL = STI->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = STI->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+ ZERO = Mips::ZERO_64;
+ BNE = Mips::BNE64;
+ BEQ = Mips::BEQ64;
+ MOVE = Mips::OR64;
+ }
+
+ unsigned Dest = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned OldVal = I->getOperand(2).getReg();
+ unsigned NewVal = I->getOperand(3).getReg();
+ unsigned Scratch = I->getOperand(4).getReg();
+
+ // insert new blocks after the current block
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), &BB,
+ std::next(MachineBasicBlock::iterator(I)), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loop1MBB
+ BB.addSuccessor(loop1MBB, BranchProbability::getOne());
+ loop1MBB->addSuccessor(exitMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop1MBB->normalizeSuccProbs();
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(exitMBB);
+ loop2MBB->normalizeSuccProbs();
+
+ // loop1MBB:
+ // ll dest, 0(ptr)
+ // bne dest, oldval, exitMBB
+ BuildMI(loop1MBB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
+ BuildMI(loop1MBB, DL, TII->get(BNE))
+ .addReg(Dest, RegState::Kill).addReg(OldVal).addMBB(exitMBB);
+
+ // loop2MBB:
+ // move scratch, NewVal
+ // sc Scratch, Scratch, 0(ptr)
+ // beq Scratch, $0, loop1MBB
+ BuildMI(loop2MBB, DL, TII->get(MOVE), Scratch).addReg(NewVal).addReg(ZERO);
+ BuildMI(loop2MBB, DL, TII->get(SC), Scratch)
+ .addReg(Scratch).addReg(Ptr).addImm(0);
+ BuildMI(loop2MBB, DL, TII->get(BEQ))
+ .addReg(Scratch, RegState::Kill).addReg(ZERO).addMBB(loop1MBB);
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loop1MBB);
+ computeAndAddLiveIns(LiveRegs, *loop2MBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+ return true;
+}
+
+bool MipsExpandPseudo::expandAtomicBinOpSubword(
+ MachineBasicBlock &BB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI) {
+
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+
+ unsigned LL, SC;
+ unsigned BEQ = Mips::BEQ;
+ unsigned SEOp = Mips::SEH;
+
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
+ bool IsSwap = false;
+ bool IsNand = false;
+
+ unsigned Opcode = 0;
+ switch (I->getOpcode()) {
+ case Mips::ATOMIC_LOAD_NAND_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_NAND_I16_POSTRA:
+ IsNand = true;
+ break;
+ case Mips::ATOMIC_SWAP_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_SWAP_I16_POSTRA:
+ IsSwap = true;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_ADD_I16_POSTRA:
+ Opcode = Mips::ADDu;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_SUB_I16_POSTRA:
+ Opcode = Mips::SUBu;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_AND_I16_POSTRA:
+ Opcode = Mips::AND;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_OR_I16_POSTRA:
+ Opcode = Mips::OR;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I8_POSTRA:
+ SEOp = Mips::SEB;
+ LLVM_FALLTHROUGH;
+ case Mips::ATOMIC_LOAD_XOR_I16_POSTRA:
+ Opcode = Mips::XOR;
+ break;
+ default:
+ llvm_unreachable("Unknown subword atomic pseudo for expansion!");
+ }
+
+ unsigned Dest = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned Incr = I->getOperand(2).getReg();
+ unsigned Mask = I->getOperand(3).getReg();
+ unsigned Mask2 = I->getOperand(4).getReg();
+ unsigned ShiftAmnt = I->getOperand(5).getReg();
+ unsigned OldVal = I->getOperand(6).getReg();
+ unsigned BinOpRes = I->getOperand(7).getReg();
+ unsigned StoreVal = I->getOperand(8).getReg();
+
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loopMBB);
+ MF->insert(It, sinkMBB);
+ MF->insert(It, exitMBB);
+
+ exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ BB.addSuccessor(loopMBB, BranchProbability::getOne());
+ loopMBB->addSuccessor(sinkMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->normalizeSuccProbs();
+
+ BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+ if (IsNand) {
+ // and andres, oldval, incr2
+ // nor binopres, $0, andres
+ // and newval, binopres, mask
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(OldVal)
+ .addReg(Incr);
+ BuildMI(loopMBB, DL, TII->get(Mips::NOR), BinOpRes)
+ .addReg(Mips::ZERO)
+ .addReg(BinOpRes);
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(BinOpRes)
+ .addReg(Mask);
+ } else if (!IsSwap) {
+ // <binop> binopres, oldval, incr2
+ // and newval, binopres, mask
+ BuildMI(loopMBB, DL, TII->get(Opcode), BinOpRes)
+ .addReg(OldVal)
+ .addReg(Incr);
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(BinOpRes)
+ .addReg(Mask);
+ } else { // atomic.swap
+ // and newval, incr2, mask
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
+ .addReg(Incr)
+ .addReg(Mask);
+ }
+
+  // and StoreVal, OldVal, Mask2
+ // or StoreVal, StoreVal, BinOpRes
+ // StoreVal<tied1> = sc StoreVal, 0(Ptr)
+ // beq StoreVal, zero, loopMBB
+ BuildMI(loopMBB, DL, TII->get(Mips::AND), StoreVal)
+ .addReg(OldVal).addReg(Mask2);
+ BuildMI(loopMBB, DL, TII->get(Mips::OR), StoreVal)
+ .addReg(StoreVal).addReg(BinOpRes);
+ BuildMI(loopMBB, DL, TII->get(SC), StoreVal)
+ .addReg(StoreVal).addReg(Ptr).addImm(0);
+ BuildMI(loopMBB, DL, TII->get(BEQ))
+ .addReg(StoreVal).addReg(Mips::ZERO).addMBB(loopMBB);
+
+ // sinkMBB:
+ // and maskedoldval1,oldval,mask
+ // srl srlres,maskedoldval1,shiftamt
+ // sign_extend dest,srlres
+
+ sinkMBB->addSuccessor(exitMBB, BranchProbability::getOne());
+
+ BuildMI(sinkMBB, DL, TII->get(Mips::AND), Dest)
+ .addReg(OldVal).addReg(Mask);
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRLV), Dest)
+ .addReg(Dest).addReg(ShiftAmnt);
+
+ if (STI->hasMips32r2()) {
+ BuildMI(sinkMBB, DL, TII->get(SEOp), Dest).addReg(Dest);
+ } else {
+ const unsigned ShiftImm = SEOp == Mips::SEH ? 16 : 24;
+ BuildMI(sinkMBB, DL, TII->get(Mips::SLL), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ BuildMI(sinkMBB, DL, TII->get(Mips::SRA), Dest)
+ .addReg(Dest, RegState::Kill)
+ .addImm(ShiftImm);
+ }
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loopMBB);
+ computeAndAddLiveIns(LiveRegs, *sinkMBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+
+ return true;
+}
+
+bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &NMBBI,
+ unsigned Size) {
+ MachineFunction *MF = BB.getParent();
+
+ const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
+ DebugLoc DL = I->getDebugLoc();
+
+ unsigned LL, SC, ZERO, BEQ;
+
+ if (Size == 4) {
+ if (STI->inMicroMipsMode()) {
+ LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
+ SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
+ BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
+ } else {
+ LL = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = STI->hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ BEQ = Mips::BEQ;
+ }
+
+ ZERO = Mips::ZERO;
+ } else {
+ LL = STI->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = STI->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+ ZERO = Mips::ZERO_64;
+ BEQ = Mips::BEQ64;
+ }
+
+ unsigned OldVal = I->getOperand(0).getReg();
+ unsigned Ptr = I->getOperand(1).getReg();
+ unsigned Incr = I->getOperand(2).getReg();
+ unsigned Scratch = I->getOperand(3).getReg();
+
+ unsigned Opcode = 0;
+ unsigned OR = 0;
+ unsigned AND = 0;
+ unsigned NOR = 0;
+ bool IsNand = false;
+ switch (I->getOpcode()) {
+ case Mips::ATOMIC_LOAD_ADD_I32_POSTRA:
+ Opcode = Mips::ADDu;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I32_POSTRA:
+ Opcode = Mips::SUBu;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I32_POSTRA:
+ Opcode = Mips::AND;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I32_POSTRA:
+ Opcode = Mips::OR;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I32_POSTRA:
+ Opcode = Mips::XOR;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I32_POSTRA:
+ IsNand = true;
+ AND = Mips::AND;
+ NOR = Mips::NOR;
+ break;
+ case Mips::ATOMIC_SWAP_I32_POSTRA:
+ OR = Mips::OR;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I64_POSTRA:
+ Opcode = Mips::DADDu;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I64_POSTRA:
+ Opcode = Mips::DSUBu;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I64_POSTRA:
+ Opcode = Mips::AND64;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I64_POSTRA:
+ Opcode = Mips::OR64;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I64_POSTRA:
+ Opcode = Mips::XOR64;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I64_POSTRA:
+ IsNand = true;
+ AND = Mips::AND64;
+ NOR = Mips::NOR64;
+ break;
+ case Mips::ATOMIC_SWAP_I64_POSTRA:
+ OR = Mips::OR64;
+ break;
+ default:
+ llvm_unreachable("Unknown pseudo atomic!");
+ }
+
+ const BasicBlock *LLVM_BB = BB.getBasicBlock();
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineFunction::iterator It = ++BB.getIterator();
+ MF->insert(It, loopMBB);
+ MF->insert(It, exitMBB);
+
+ exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
+
+ BB.addSuccessor(loopMBB, BranchProbability::getOne());
+ loopMBB->addSuccessor(exitMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->normalizeSuccProbs();
+
+ BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+ assert((OldVal != Ptr) && "Clobbered the wrong ptr reg!");
+ assert((OldVal != Incr) && "Clobbered the wrong reg!");
+ if (Opcode) {
+ BuildMI(loopMBB, DL, TII->get(Opcode), Scratch).addReg(OldVal).addReg(Incr);
+ } else if (IsNand) {
+ assert(AND && NOR &&
+ "Unknown nand instruction for atomic pseudo expansion");
+ BuildMI(loopMBB, DL, TII->get(AND), Scratch).addReg(OldVal).addReg(Incr);
+ BuildMI(loopMBB, DL, TII->get(NOR), Scratch).addReg(ZERO).addReg(Scratch);
+ } else {
+ assert(OR && "Unknown instruction for atomic pseudo expansion!");
+ BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO);
+ }
+
+ BuildMI(loopMBB, DL, TII->get(SC), Scratch).addReg(Scratch).addReg(Ptr).addImm(0);
+ BuildMI(loopMBB, DL, TII->get(BEQ)).addReg(Scratch).addReg(ZERO).addMBB(loopMBB);
+
+ NMBBI = BB.end();
+ I->eraseFromParent();
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *loopMBB);
+ computeAndAddLiveIns(LiveRegs, *exitMBB);
+
+ return true;
+}
+
+bool MipsExpandPseudo::expandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NMBB) {
+
+ bool Modified = false;
+
+ switch (MBBI->getOpcode()) {
+ case Mips::ATOMIC_CMP_SWAP_I32_POSTRA:
+ case Mips::ATOMIC_CMP_SWAP_I64_POSTRA:
+ return expandAtomicCmpSwap(MBB, MBBI, NMBB);
+ case Mips::ATOMIC_CMP_SWAP_I8_POSTRA:
+ case Mips::ATOMIC_CMP_SWAP_I16_POSTRA:
+ return expandAtomicCmpSwapSubword(MBB, MBBI, NMBB);
+ case Mips::ATOMIC_SWAP_I8_POSTRA:
+ case Mips::ATOMIC_SWAP_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_ADD_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_ADD_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I16_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I8_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I16_POSTRA:
+ return expandAtomicBinOpSubword(MBB, MBBI, NMBB);
+ case Mips::ATOMIC_LOAD_ADD_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I32_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I32_POSTRA:
+ case Mips::ATOMIC_SWAP_I32_POSTRA:
+ return expandAtomicBinOp(MBB, MBBI, NMBB, 4);
+ case Mips::ATOMIC_LOAD_ADD_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_SUB_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_AND_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_OR_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_XOR_I64_POSTRA:
+ case Mips::ATOMIC_LOAD_NAND_I64_POSTRA:
+ case Mips::ATOMIC_SWAP_I64_POSTRA:
+ return expandAtomicBinOp(MBB, MBBI, NMBB, 8);
+ default:
+ return Modified;
+ }
+}
+
+bool MipsExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= expandMI(MBB, MBBI, NMBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool MipsExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ STI = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+ TII = STI->getInstrInfo();
+
+ bool Modified = false;
+ for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+ ++MFI)
+ Modified |= expandMBB(*MFI);
+
+ if (Modified)
+ MF.RenumberBlocks();
+
+ return Modified;
+}
+
+/// createMipsExpandPseudoPass - returns an instance of the pseudo instruction
+/// expansion pass.
+FunctionPass *llvm::createMipsExpandPseudoPass() {
+ return new MipsExpandPseudo();
+}
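The pass above is only constructed through createMipsExpandPseudoPass(); wiring it into the Mips codegen pipeline is assumed to happen in the target's pass configuration, which is not part of this section. A minimal sketch of that registration, assuming the conventional MipsPassConfig hook (the hook name and placement are assumptions, not taken from this patch):

  // Assumed addition in MipsTargetMachine.cpp: run the expansion after
  // register allocation so the emitted ll/sc loops stay contiguous.
  void MipsPassConfig::addPreEmitPass() {
    addPass(createMipsExpandPseudoPass());
  }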
diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
index d3048c7390e1..7b39507812ed 100644
--- a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the MIPS-specific support for the FastISel class.
+/// This file defines the MIPS-specific support for the FastISel class.
/// Some of the target-specific code is generated by tablegen in the file
/// MipsGenFastISel.inc, which is #included here.
///
@@ -36,7 +36,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -64,6 +63,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -951,12 +951,9 @@ bool MipsFastISel::selectBranch(const Instruction *I) {
//
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
- BI->getCondition();
// For now, just try the simplest case where it's fed by a compare.
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
- unsigned CondReg = createResultReg(&Mips::GPR32RegClass);
- if (!emitCmp(CondReg, CI))
- return false;
+ unsigned CondReg = getRegForValue(CI);
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
.addReg(CondReg)
.addMBB(TBB);
@@ -1001,11 +998,12 @@ bool MipsFastISel::selectFPExt(const Instruction *I) {
bool MipsFastISel::selectSelect(const Instruction *I) {
assert(isa<SelectInst>(I) && "Expected a select instruction.");
- DEBUG(dbgs() << "selectSelect\n");
+ LLVM_DEBUG(dbgs() << "selectSelect\n");
MVT VT;
if (!isTypeSupported(I->getType(), VT) || UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (!isTypeSupported || UnsupportedFPMode)\n");
+ LLVM_DEBUG(
+ dbgs() << ".. .. gave up (!isTypeSupported || UnsupportedFPMode)\n");
return false;
}
@@ -1288,22 +1286,22 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
}
bool MipsFastISel::fastLowerArguments() {
- DEBUG(dbgs() << "fastLowerArguments\n");
+ LLVM_DEBUG(dbgs() << "fastLowerArguments\n");
if (!FuncInfo.CanLowerReturn) {
- DEBUG(dbgs() << ".. gave up (!CanLowerReturn)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (!CanLowerReturn)\n");
return false;
}
const Function *F = FuncInfo.Fn;
if (F->isVarArg()) {
- DEBUG(dbgs() << ".. gave up (varargs)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (varargs)\n");
return false;
}
CallingConv::ID CC = F->getCallingConv();
if (CC != CallingConv::C) {
- DEBUG(dbgs() << ".. gave up (calling convention is not C)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (calling convention is not C)\n");
return false;
}
@@ -1329,21 +1327,21 @@ bool MipsFastISel::fastLowerArguments() {
if (FormalArg.hasAttribute(Attribute::InReg) ||
FormalArg.hasAttribute(Attribute::StructRet) ||
FormalArg.hasAttribute(Attribute::ByVal)) {
- DEBUG(dbgs() << ".. gave up (inreg, structret, byval)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (inreg, structret, byval)\n");
return false;
}
Type *ArgTy = FormalArg.getType();
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) {
- DEBUG(dbgs() << ".. gave up (struct, array, or vector)\n");
+ LLVM_DEBUG(dbgs() << ".. gave up (struct, array, or vector)\n");
return false;
}
EVT ArgVT = TLI.getValueType(DL, ArgTy);
- DEBUG(dbgs() << ".. " << FormalArg.getArgNo() << ": "
- << ArgVT.getEVTString() << "\n");
+ LLVM_DEBUG(dbgs() << ".. " << FormalArg.getArgNo() << ": "
+ << ArgVT.getEVTString() << "\n");
if (!ArgVT.isSimple()) {
- DEBUG(dbgs() << ".. .. gave up (not a simple type)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (not a simple type)\n");
return false;
}
@@ -1355,16 +1353,16 @@ bool MipsFastISel::fastLowerArguments() {
!FormalArg.hasAttribute(Attribute::ZExt)) {
      // It must be an any-extend; this shouldn't happen for clang-generated IR,
      // so just fall back on SelectionDAG.
- DEBUG(dbgs() << ".. .. gave up (i8/i16 arg is not extended)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (i8/i16 arg is not extended)\n");
return false;
}
if (NextGPR32 == GPR32ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
// Allocating any GPR32 prohibits further use of floating point arguments.
@@ -1375,16 +1373,16 @@ bool MipsFastISel::fastLowerArguments() {
case MVT::i32:
if (FormalArg.hasAttribute(Attribute::ZExt)) {
// The O32 ABI does not permit a zero-extended i32.
- DEBUG(dbgs() << ".. .. gave up (i32 arg is zero extended)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (i32 arg is zero extended)\n");
return false;
}
if (NextGPR32 == GPR32ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
// Allocating any GPR32 prohibits further use of floating point arguments.
@@ -1394,14 +1392,14 @@ bool MipsFastISel::fastLowerArguments() {
case MVT::f32:
if (UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
return false;
}
if (NextFGR32 == FGR32ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of FGR32 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of FGR32 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. FGR32(" << *NextFGR32 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. FGR32(" << *NextFGR32 << ")\n");
Allocation.emplace_back(&Mips::FGR32RegClass, *NextFGR32++);
// Allocating an FGR32 also allocates the super-register AFGR64, and
// ABI rules require us to skip the corresponding GPR32.
@@ -1413,14 +1411,14 @@ bool MipsFastISel::fastLowerArguments() {
case MVT::f64:
if (UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
return false;
}
if (NextAFGR64 == AFGR64ArgRegs.end()) {
- DEBUG(dbgs() << ".. .. gave up (ran out of AFGR64 arguments)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (ran out of AFGR64 arguments)\n");
return false;
}
- DEBUG(dbgs() << ".. .. AFGR64(" << *NextAFGR64 << ")\n");
+ LLVM_DEBUG(dbgs() << ".. .. AFGR64(" << *NextAFGR64 << ")\n");
Allocation.emplace_back(&Mips::AFGR64RegClass, *NextAFGR64++);
// Allocating an FGR32 also allocates the super-register AFGR64, and
// ABI rules require us to skip the corresponding GPR32 pair.
@@ -1433,7 +1431,7 @@ bool MipsFastISel::fastLowerArguments() {
break;
default:
- DEBUG(dbgs() << ".. .. gave up (unknown type)\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (unknown type)\n");
return false;
}
}
@@ -1629,7 +1627,7 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!MTI->getLength()->getType()->isIntegerTy(32))
return false;
const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
- return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+ return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -1638,7 +1636,7 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
if (!MSI->getLength()->getType()->isIntegerTy(32))
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
}
}
return false;
@@ -1648,7 +1646,7 @@ bool MipsFastISel::selectRet(const Instruction *I) {
const Function &F = *I->getParent()->getParent();
const ReturnInst *Ret = cast<ReturnInst>(I);
- DEBUG(dbgs() << "selectRet\n");
+ LLVM_DEBUG(dbgs() << "selectRet\n");
if (!FuncInfo.CanLowerReturn)
return false;
@@ -1712,7 +1710,7 @@ bool MipsFastISel::selectRet(const Instruction *I) {
// Do not handle FGR64 returns for now.
if (RVVT == MVT::f64 && UnsupportedFPMode) {
- DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode\n");
+ LLVM_DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode\n");
return false;
}
@@ -2064,6 +2062,10 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
if (VReg == 0)
return 0;
MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT();
+
+ if (VMVT == MVT::i1)
+ return 0;
+
if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) {
unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned))
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
index 883c3267d51a..0ead56eddd2f 100644
--- a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
@@ -36,6 +36,10 @@ public:
bool isFPCloseToIncomingSP() const override { return false; }
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return true;
+ }
+
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp b/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
deleted file mode 100644
index da67c1bcea99..000000000000
--- a/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//===- MipsHazardSchedule.cpp - Workaround pipeline hazards ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This pass is used to work around certain pipeline hazards. For now, this
-/// covers compact branch hazards. In the future, this pass can be extended to other
-/// pipeline hazards, such as various MIPS1 hazards, processor errata that
-/// require instruction reorganization, etc.
-///
-/// This pass has to run after the delay slot filler as that pass can introduce
-/// pipeline hazards, hence the existing hazard recognizer is not suitable.
-///
-/// Hazards handled: forbidden slots for MIPSR6.
-///
-/// A forbidden slot hazard occurs when a compact branch instruction is executed
-/// and the adjacent instruction in memory is a control transfer instruction
-/// such as a branch or jump, ERET, ERETNC, DERET, WAIT and PAUSE.
-///
-/// For example:
-///
-/// 0x8004 bnec a1,v0,<P+0x18>
-/// 0x8008 beqc a1,a2,<P+0x54>
-///
-/// In such cases, the processor is required to signal a Reserved Instruction
-/// exception.
-///
-/// Here, if the instruction at 0x8004 is executed, the processor will raise an
-/// exception as there is a control transfer instruction at 0x8008.
-///
-/// There are two sources of forbidden slot hazards:
-///
-/// A) A previous pass has created a compact branch directly.
-/// B) Transforming a delay slot branch into a compact branch. This case can
-///    be difficult to process, as lookahead for hazards is insufficient:
-///    backwards delay slot filling can also produce hazards in previously
-///    processed instructions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "Mips.h"
-#include "MipsInstrInfo.h"
-#include "MipsSubtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include <algorithm>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mips-hazard-schedule"
-
-STATISTIC(NumInsertedNops, "Number of nops inserted");
-
-namespace {
-
-using Iter = MachineBasicBlock::iterator;
-using ReverseIter = MachineBasicBlock::reverse_iterator;
-
-class MipsHazardSchedule : public MachineFunctionPass {
-public:
- MipsHazardSchedule() : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override { return "Mips Hazard Schedule"; }
-
- bool runOnMachineFunction(MachineFunction &F) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-
-private:
- static char ID;
-};
-
-} // end of anonymous namespace
-
-char MipsHazardSchedule::ID = 0;
-
-/// Returns a pass that clears pipeline hazards.
-FunctionPass *llvm::createMipsHazardSchedule() {
- return new MipsHazardSchedule();
-}
-
-// Find the next real instruction from the current position in current basic
-// block.
-static Iter getNextMachineInstrInBB(Iter Position) {
- Iter I = Position, E = Position->getParent()->end();
- I = std::find_if_not(I, E,
- [](const Iter &Insn) { return Insn->isTransient(); });
-
- return I;
-}
-
-// Find the next real instruction from the current position, looking through
-// basic block boundaries.
-static std::pair<Iter, bool> getNextMachineInstr(Iter Position, MachineBasicBlock * Parent) {
- if (Position == Parent->end()) {
- do {
- MachineBasicBlock *Succ = Parent->getNextNode();
- if (Succ != nullptr && Parent->isSuccessor(Succ)) {
- Position = Succ->begin();
- Parent = Succ;
- } else {
- return std::make_pair(Position, true);
- }
- } while (Parent->empty());
- }
-
- Iter Instr = getNextMachineInstrInBB(Position);
- if (Instr == Parent->end()) {
- return getNextMachineInstr(Instr, Parent);
- }
- return std::make_pair(Instr, false);
-}
-
-bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
-
- const MipsSubtarget *STI =
- &static_cast<const MipsSubtarget &>(MF.getSubtarget());
-
- // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6.
- if (!STI->hasMips32r6() || STI->inMicroMipsMode())
- return false;
-
- bool Changed = false;
- const MipsInstrInfo *TII = STI->getInstrInfo();
-
- for (MachineFunction::iterator FI = MF.begin(); FI != MF.end(); ++FI) {
- for (Iter I = FI->begin(); I != FI->end(); ++I) {
-
- // Forbidden slot hazard handling. Use lookahead over state.
- if (!TII->HasForbiddenSlot(*I))
- continue;
-
- Iter Inst;
- bool LastInstInFunction =
- std::next(I) == FI->end() && std::next(FI) == MF.end();
- if (!LastInstInFunction) {
- std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI);
- LastInstInFunction |= Res.second;
- Inst = Res.first;
- }
-
- if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
- Changed = true;
- MIBundleBuilder(&*I)
- .append(BuildMI(MF, I->getDebugLoc(), TII->get(Mips::NOP)));
- NumInsertedNops++;
- }
- }
- }
- return Changed;
-}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 0e1173f1c617..f99f3a1b3e0a 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
@@ -46,6 +47,13 @@ using namespace llvm;
// instructions for SelectionDAG operations.
//===----------------------------------------------------------------------===//
+void MipsDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+ // There are multiple MipsDAGToDAGISel instances added to the pass pipeline.
+ // We need to preserve StackProtector for the next one.
+ AU.addPreserved<StackProtector>();
+ SelectionDAGISel::getAnalysisUsage(AU);
+}
+
bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
@@ -215,12 +223,9 @@ bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
void MipsDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
index 20bdd4aa8f5f..09003459d180 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -41,6 +41,8 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
protected:
SDNode *getGlobalBaseReg();
@@ -93,34 +95,34 @@ private:
virtual bool selectAddr16(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool selectAddr16SP(SDValue Addr, SDValue &Base, SDValue &Offset);
- /// \brief Select constant vector splats.
+ /// Select constant vector splats.
virtual bool selectVSplat(SDNode *N, APInt &Imm,
unsigned MinSizeInBits) const;
- /// \brief Select constant vector splats whose value fits in a uimm1.
+ /// Select constant vector splats whose value fits in a uimm1.
virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm2.
+ /// Select constant vector splats whose value fits in a uimm2.
virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm3.
+ /// Select constant vector splats whose value fits in a uimm3.
virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm4.
+ /// Select constant vector splats whose value fits in a uimm4.
virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm5.
+ /// Select constant vector splats whose value fits in a uimm5.
virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm6.
+ /// Select constant vector splats whose value fits in a uimm6.
virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a uimm8.
+ /// Select constant vector splats whose value fits in a uimm8.
virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value fits in a simm5.
+ /// Select constant vector splats whose value fits in a simm5.
virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is a power of 2.
+ /// Select constant vector splats whose value is a power of 2.
virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is the inverse of a
+ /// Select constant vector splats whose value is the inverse of a
/// power of 2.
virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// ending at the most significant bit
virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// starting at bit zero.
virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 3d383b3dfe3e..9ffc38356b76 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -41,7 +41,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -64,6 +63,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -110,12 +110,6 @@ static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
// The MIPS MSA ABI passes vector arguments in the integer register set.
// The number of integer registers used is dependent on the ABI used.
-MVT MipsTargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
- if (VT.isVector() && Subtarget.hasMSA())
- return Subtarget.isABI_O32() ? MVT::i32 : MVT::i64;
- return MipsTargetLowering::getRegisterType(VT);
-}
-
MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
EVT VT) const {
if (VT.isVector()) {
@@ -195,11 +189,13 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::Hi: return "MipsISD::Hi";
case MipsISD::Lo: return "MipsISD::Lo";
case MipsISD::GotHi: return "MipsISD::GotHi";
+ case MipsISD::TlsHi: return "MipsISD::TlsHi";
case MipsISD::GPRel: return "MipsISD::GPRel";
case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer";
case MipsISD::Ret: return "MipsISD::Ret";
case MipsISD::ERet: return "MipsISD::ERet";
case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN";
+ case MipsISD::FMS: return "MipsISD::FMS";
case MipsISD::FPBrcond: return "MipsISD::FPBrcond";
case MipsISD::FPCmp: return "MipsISD::FPCmp";
case MipsISD::FSELECT: return "MipsISD::FSELECT";
@@ -286,10 +282,6 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::VCLE_U: return "MipsISD::VCLE_U";
case MipsISD::VCLT_S: return "MipsISD::VCLT_S";
case MipsISD::VCLT_U: return "MipsISD::VCLT_U";
- case MipsISD::VSMAX: return "MipsISD::VSMAX";
- case MipsISD::VSMIN: return "MipsISD::VSMIN";
- case MipsISD::VUMAX: return "MipsISD::VUMAX";
- case MipsISD::VUMIN: return "MipsISD::VUMIN";
case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
case MipsISD::VNOR: return "MipsISD::VNOR";
@@ -402,18 +394,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
- if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) {
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- }
-
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
-
// Operations not directly supported by Mips.
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
@@ -761,7 +741,7 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, True);
}
- // Couldn't optimize.
+ // Could not optimize.
return SDValue();
}
@@ -1301,76 +1281,76 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
default:
llvm_unreachable("Unexpected instr type to insert");
case Mips::ATOMIC_LOAD_ADD_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_ADD_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_ADD_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_ADD_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_AND_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_AND_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_AND_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::AND);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_AND_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::AND64);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_OR_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_OR_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_OR_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::OR);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_OR_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::OR64);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_XOR_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_XOR_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_XOR_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::XOR);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_XOR_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_NAND_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_NAND_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_NAND_I32:
- return emitAtomicBinary(MI, BB, 4, 0, true);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_NAND_I64:
- return emitAtomicBinary(MI, BB, 8, 0, true);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_SUB_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_SUB_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_SUB_I32:
- return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_SUB_I64:
- return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_SWAP_I8:
- return emitAtomicBinaryPartword(MI, BB, 1, 0);
+ return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_SWAP_I16:
- return emitAtomicBinaryPartword(MI, BB, 2, 0);
+ return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_SWAP_I32:
- return emitAtomicBinary(MI, BB, 4, 0);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_SWAP_I64:
- return emitAtomicBinary(MI, BB, 8, 0);
+ return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_CMP_SWAP_I8:
return emitAtomicCmpSwapPartword(MI, BB, 1);
case Mips::ATOMIC_CMP_SWAP_I16:
return emitAtomicCmpSwapPartword(MI, BB, 2);
case Mips::ATOMIC_CMP_SWAP_I32:
- return emitAtomicCmpSwap(MI, BB, 4);
+ return emitAtomicCmpSwap(MI, BB);
case Mips::ATOMIC_CMP_SWAP_I64:
- return emitAtomicCmpSwap(MI, BB, 8);
+ return emitAtomicCmpSwap(MI, BB);
case Mips::PseudoSDIV:
case Mips::PseudoUDIV:
case Mips::DIV:
@@ -1419,99 +1399,121 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
-MachineBasicBlock *MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned Size,
- unsigned BinOpcode,
- bool Nand) const {
- assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const bool ArePtrs64bit = ABI.ArePtrs64bit();
DebugLoc DL = MI.getDebugLoc();
- unsigned LL, SC, AND, NOR, ZERO, BEQ;
- if (Size == 4) {
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
- }
-
- AND = Mips::AND;
- NOR = Mips::NOR;
- ZERO = Mips::ZERO;
- BEQ = Mips::BEQ;
- } else {
- LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
- SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
- AND = Mips::AND64;
- NOR = Mips::NOR64;
- ZERO = Mips::ZERO_64;
- BEQ = Mips::BEQ64;
+ unsigned AtomicOp;
+ switch (MI.getOpcode()) {
+ case Mips::ATOMIC_LOAD_ADD_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I32:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I32:
+ AtomicOp = Mips::ATOMIC_SWAP_I32_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I64:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I64_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I64:
+ AtomicOp = Mips::ATOMIC_SWAP_I64_POSTRA;
+ break;
+ default:
+ llvm_unreachable("Unknown pseudo atomic for replacement!");
}
unsigned OldVal = MI.getOperand(0).getReg();
unsigned Ptr = MI.getOperand(1).getReg();
unsigned Incr = MI.getOperand(2).getReg();
+ unsigned Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal));
+
+ MachineBasicBlock::iterator II(MI);
+
+  // The scratch register here, with the EarlyClobber | Define | Implicit
+  // flags, is used to persuade the register allocator and the machine
+  // verifier to accept the usage of this register. This has to be a real
+  // register which holds an UNDEF value, is dead after the instruction, and
+  // is unique among the registers chosen for the instruction.
+
+ // The EarlyClobber flag has the semantic properties that the operand it is
+ // attached to is clobbered before the rest of the inputs are read. Hence it
+ // must be unique among the operands to the instruction.
+  // The Define flag is needed to convince the machine verifier that an Undef
+  // value isn't a problem.
+ // The Dead flag is needed as the value in scratch isn't used by any other
+ // instruction. Kill isn't used as Dead is more precise.
+ // The implicit flag is here due to the interaction between the other flags
+ // and the machine verifier.
+
+  // For correctness purposes, a new pseudo is introduced here. We need this
+  // new pseudo so that the fast register allocator does not see an ll/sc
+  // sequence that is spread over more than one basic block. A register
+  // allocator (or any codegen pass, in fact) which introduces a store can
+  // violate the expectations of the hardware.
+ //
+ // An atomic read-modify-write sequence starts with a linked load
+ // instruction and ends with a store conditional instruction. The atomic
+ // read-modify-write sequence fails if any of the following conditions
+ // occur between the execution of ll and sc:
+ // * A coherent store is completed by another process or coherent I/O
+ // module into the block of synchronizable physical memory containing
+ // the word. The size and alignment of the block is
+ // implementation-dependent.
+ // * A coherent store is executed between an LL and SC sequence on the
+  //     same processor to the block of synchronizable physical memory
+ // containing the word.
+ //
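+  //
+  // Illustrative summary of the resulting two-stage lowering (the shapes
+  // below are a sketch, not verbatim MIR from this patch):
+  //
+  //   Custom inserter (this function):
+  //     ATOMIC_LOAD_ADD_I32 $dst, $ptr, $incr
+  //       --> ATOMIC_LOAD_ADD_I32_POSTRA $dst, $ptrcopy, $incrcopy, $scratch
+  //
+  //   MipsExpandPseudo (after register allocation):
+  //     ATOMIC_LOAD_ADD_I32_POSTRA
+  //       --> ll / addu / sc / beq retry loop with no spills in between.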
- unsigned StoreVal = RegInfo.createVirtualRegister(RC);
- unsigned AndRes = RegInfo.createVirtualRegister(RC);
- unsigned Success = RegInfo.createVirtualRegister(RC);
+ unsigned PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr));
+ unsigned IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr));
- // insert new blocks after the current block
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loopMBB);
- MF->insert(It, exitMBB);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), IncrCopy).addReg(Incr);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BuildMI(*BB, II, DL, TII->get(AtomicOp))
+ .addReg(OldVal, RegState::Define | RegState::EarlyClobber)
+ .addReg(PtrCopy)
+ .addReg(IncrCopy)
+ .addReg(Scratch, RegState::Define | RegState::EarlyClobber |
+ RegState::Implicit | RegState::Dead);
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(exitMBB);
-
- // loopMBB:
- // ll oldval, 0(ptr)
- // <binop> storeval, oldval, incr
- // sc success, storeval, 0(ptr)
- // beq success, $0, loopMBB
- BB = loopMBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
- if (Nand) {
- // and andres, oldval, incr
- // nor storeval, $0, andres
- BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
- BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
- } else if (BinOpcode) {
- // <binop> storeval, oldval, incr
- BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
- } else {
- StoreVal = Incr;
- }
- BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
- BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
-
- MI.eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent();
- return exitMBB;
+ return BB;
}
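// Illustrative sketch only, not part of this patch: the shape of the ll/sc
// retry loop that the post-register-allocation expansion of an
// ATOMIC_LOAD_<op>_I32_POSTRA pseudo is expected to produce, written with the
// plain 32-bit LL/SC/BEQ opcodes for brevity. The helper name and the single
// pre-built loop block are assumptions made for this example.
static void buildAtomicBinOpLoopSketch(MachineBasicBlock *LoopMBB,
                                       const TargetInstrInfo *TII,
                                       const DebugLoc &DL, unsigned BinOpcode,
                                       unsigned Dest, unsigned Ptr,
                                       unsigned Incr, unsigned Scratch) {
  // loop:
  //   ll      dest, 0(ptr)
  //   <binop> scratch, dest, incr
  //   sc      scratch, 0(ptr)
  //   beq     scratch, $zero, loop
  BuildMI(LoopMBB, DL, TII->get(Mips::LL), Dest).addReg(Ptr).addImm(0);
  BuildMI(LoopMBB, DL, TII->get(BinOpcode), Scratch).addReg(Dest).addReg(Incr);
  BuildMI(LoopMBB, DL, TII->get(Mips::SC), Scratch)
      .addReg(Scratch).addReg(Ptr).addImm(0);
  BuildMI(LoopMBB, DL, TII->get(Mips::BEQ))
      .addReg(Scratch).addReg(Mips::ZERO).addMBB(LoopMBB);
}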
MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
@@ -1545,8 +1547,7 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
}
MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
- MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
- bool Nand) const {
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned Size) const {
assert((Size == 1 || Size == 2) &&
"Unsupported size for EmitAtomicBinaryPartial.");
@@ -1567,39 +1568,66 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
- unsigned NewVal = RegInfo.createVirtualRegister(RC);
- unsigned OldVal = RegInfo.createVirtualRegister(RC);
unsigned Incr2 = RegInfo.createVirtualRegister(RC);
unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
- unsigned AndRes = RegInfo.createVirtualRegister(RC);
- unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
- unsigned StoreVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
- unsigned SrlRes = RegInfo.createVirtualRegister(RC);
- unsigned Success = RegInfo.createVirtualRegister(RC);
-
- unsigned LL, SC;
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ unsigned Scratch = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch3 = RegInfo.createVirtualRegister(RC);
+
+ unsigned AtomicOp = 0;
+ switch (MI.getOpcode()) {
+ case Mips::ATOMIC_LOAD_NAND_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_NAND_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_NAND_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I8:
+ AtomicOp = Mips::ATOMIC_SWAP_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_SWAP_I16:
+ AtomicOp = Mips::ATOMIC_SWAP_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_ADD_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_ADD_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_SUB_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_SUB_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_AND_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_AND_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_OR_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_OR_I16_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I8:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I8_POSTRA;
+ break;
+ case Mips::ATOMIC_LOAD_XOR_I16:
+ AtomicOp = Mips::ATOMIC_LOAD_XOR_I16_POSTRA;
+ break;
+ default:
+ llvm_unreachable("Unknown subword atomic pseudo for expansion!");
}
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loopMBB);
- MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1607,10 +1635,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- BB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(loopMBB);
- loopMBB->addSuccessor(sinkMBB);
- sinkMBB->addSuccessor(exitMBB);
+ BB->addSuccessor(exitMBB, BranchProbability::getOne());
// thisMBB:
// addiu masklsb2,$0,-4 # 0xfffffffc
@@ -1644,159 +1669,92 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);
- // atomic.load.binop
- // loopMBB:
- // ll oldval,0(alignedaddr)
- // binop binopres,oldval,incr2
- // and newval,binopres,mask
- // and maskedoldval0,oldval,mask2
- // or storeval,maskedoldval0,newval
- // sc success,storeval,0(alignedaddr)
- // beq success,$0,loopMBB
-
- // atomic.swap
- // loopMBB:
- // ll oldval,0(alignedaddr)
- // and newval,incr2,mask
- // and maskedoldval0,oldval,mask2
- // or storeval,maskedoldval0,newval
- // sc success,storeval,0(alignedaddr)
- // beq success,$0,loopMBB
-
- BB = loopMBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
- if (Nand) {
- // and andres, oldval, incr2
- // nor binopres, $0, andres
- // and newval, binopres, mask
- BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
- BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
- .addReg(Mips::ZERO).addReg(AndRes);
- BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
- } else if (BinOpcode) {
- // <binop> binopres, oldval, incr2
- // and newval, binopres, mask
- BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
- BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
- } else { // atomic.swap
- // and newval, incr2, mask
- BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
- }
-
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
- .addReg(OldVal).addReg(Mask2);
- BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
- .addReg(MaskedOldVal0).addReg(NewVal);
- BuildMI(BB, DL, TII->get(SC), Success)
- .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
- BuildMI(BB, DL, TII->get(Mips::BEQ))
- .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
- // sinkMBB:
- // and maskedoldval1,oldval,mask
- // srl srlres,maskedoldval1,shiftamt
- // sign_extend dest,srlres
- BB = sinkMBB;
-
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
- .addReg(OldVal).addReg(Mask);
- BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
- .addReg(MaskedOldVal1).addReg(ShiftAmt);
- BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+ // The purpose of the flags on the scratch registers is explained in
+ // emitAtomicBinary. In summary, we need a scratch register which is going to
+ // be undef and that is unique among the registers chosen for the instruction.
+
+ BuildMI(BB, DL, TII->get(AtomicOp))
+ .addReg(Dest, RegState::Define | RegState::EarlyClobber)
+ .addReg(AlignedAddr)
+ .addReg(Incr2)
+ .addReg(Mask)
+ .addReg(Mask2)
+ .addReg(ShiftAmt)
+ .addReg(Scratch, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit)
+ .addReg(Scratch2, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit)
+ .addReg(Scratch3, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
-MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
- assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
+// Lower atomic compare and swap to a pseudo instruction, taking care to
+// define a scratch register for the pseudo instruction's expansion. The
+// instruction is expanded after the register allocator as to prevent
+// the insertion of stores between the linked load and the store conditional.
+
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+
+ assert((MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ||
+ MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I64) &&
+ "Unsupported atomic psseudo for EmitAtomicCmpSwap.");
+
+ const unsigned Size = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ? 4 : 8;
MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const bool ArePtrs64bit = ABI.ArePtrs64bit();
DebugLoc DL = MI.getDebugLoc();
- unsigned LL, SC, ZERO, BNE, BEQ;
-
- if (Size == 4) {
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6()
- ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
- }
-
- ZERO = Mips::ZERO;
- BNE = Mips::BNE;
- BEQ = Mips::BEQ;
- } else {
- LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
- SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
- ZERO = Mips::ZERO_64;
- BNE = Mips::BNE64;
- BEQ = Mips::BEQ64;
- }
+ unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32
+ ? Mips::ATOMIC_CMP_SWAP_I32_POSTRA
+ : Mips::ATOMIC_CMP_SWAP_I64_POSTRA;
unsigned Dest = MI.getOperand(0).getReg();
unsigned Ptr = MI.getOperand(1).getReg();
unsigned OldVal = MI.getOperand(2).getReg();
unsigned NewVal = MI.getOperand(3).getReg();
- unsigned Success = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch = MRI.createVirtualRegister(RC);
+ MachineBasicBlock::iterator II(MI);
- // insert new blocks after the current block
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loop1MBB);
- MF->insert(It, loop2MBB);
- MF->insert(It, exitMBB);
+ // We need to create copies of the various registers and kill them at the
+ // atomic pseudo. If the copies are not made, when the atomic is expanded
+ // after fast register allocation, the spills will end up outside of the
+ // blocks that their values are defined in, causing livein errors.
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+ unsigned DestCopy = MRI.createVirtualRegister(MRI.getRegClass(Dest));
+ unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr));
+ unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal));
+ unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal));
- // thisMBB:
- // ...
- // fallthrough --> loop1MBB
- BB->addSuccessor(loop1MBB);
- loop1MBB->addSuccessor(exitMBB);
- loop1MBB->addSuccessor(loop2MBB);
- loop2MBB->addSuccessor(loop1MBB);
- loop2MBB->addSuccessor(exitMBB);
-
- // loop1MBB:
- // ll dest, 0(ptr)
- // bne dest, oldval, exitMBB
- BB = loop1MBB;
- BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
- BuildMI(BB, DL, TII->get(BNE))
- .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
-
- // loop2MBB:
- // sc success, newval, 0(ptr)
- // beq success, $0, loop1MBB
- BB = loop2MBB;
- BuildMI(BB, DL, TII->get(SC), Success)
- .addReg(NewVal).addReg(Ptr).addImm(0);
- BuildMI(BB, DL, TII->get(BEQ))
- .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), DestCopy).addReg(Dest);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal);
+ BuildMI(*BB, II, DL, TII->get(Mips::COPY), NewValCopy).addReg(NewVal);
+
+ // The purpose of the flags on the scratch registers is explained in
+ // emitAtomicBinary. In summary, we need a scratch register which is going to
+ // be undef and that is unique among the registers chosen for the instruction.
+
+ BuildMI(*BB, II, DL, TII->get(AtomicOp))
+ .addReg(Dest, RegState::Define | RegState::EarlyClobber)
+ .addReg(PtrCopy, RegState::Kill)
+ .addReg(OldValCopy, RegState::Kill)
+ .addReg(NewValCopy, RegState::Kill)
+ .addReg(Scratch, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
- return exitMBB;
+ return BB;
}
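// Illustrative sketch only, not part of this patch: unlike the plain
// read-modify-write loop, the expected post-RA expansion of the compare and
// swap pseudo branches out of the loop as soon as the loaded value differs
// from the expected one, and the scratch register carries the sc result,
// which is why the pseudo above only needs a dead scratch def. The helper
// name, the pre-built blocks and the 32-bit opcodes are assumptions made for
// this example.
static void buildCmpSwapLoopSketch(MachineBasicBlock *Loop1MBB,
                                   MachineBasicBlock *Loop2MBB,
                                   MachineBasicBlock *DoneMBB,
                                   const TargetInstrInfo *TII,
                                   const DebugLoc &DL, unsigned Dest,
                                   unsigned Ptr, unsigned OldVal,
                                   unsigned NewVal, unsigned Scratch) {
  // loop1:
  //   ll   dest, 0(ptr)
  //   bne  dest, oldval, done
  // loop2:
  //   or   scratch, newval, $zero   # move newval into the scratch register
  //   sc   scratch, 0(ptr)
  //   beq  scratch, $zero, loop1
  // done:
  BuildMI(Loop1MBB, DL, TII->get(Mips::LL), Dest).addReg(Ptr).addImm(0);
  BuildMI(Loop1MBB, DL, TII->get(Mips::BNE))
      .addReg(Dest).addReg(OldVal).addMBB(DoneMBB);
  BuildMI(Loop2MBB, DL, TII->get(Mips::OR), Scratch)
      .addReg(NewVal).addReg(Mips::ZERO);
  BuildMI(Loop2MBB, DL, TII->get(Mips::SC), Scratch)
      .addReg(Scratch).addReg(Ptr).addImm(0);
  BuildMI(Loop2MBB, DL, TII->get(Mips::BEQ))
      .addReg(Scratch).addReg(Mips::ZERO).addMBB(Loop1MBB);
}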
MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
@@ -1823,40 +1781,33 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
- unsigned OldVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
- unsigned StoreVal = RegInfo.createVirtualRegister(RC);
- unsigned SrlRes = RegInfo.createVirtualRegister(RC);
- unsigned Success = RegInfo.createVirtualRegister(RC);
- unsigned LL, SC;
-
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
- : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
- SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
- : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
- }
+ unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I8
+ ? Mips::ATOMIC_CMP_SWAP_I8_POSTRA
+ : Mips::ATOMIC_CMP_SWAP_I16_POSTRA;
+
+ // The scratch registers here with the EarlyClobber | Define | Dead | Implicit
+ // flags are used to coerce the register allocator and the machine verifier to
+ // accept the usage of these registers.
+ // The EarlyClobber flag has the semantic property that the operand it is
+ // attached to is clobbered before the rest of the inputs are read. Hence it
+ // must be unique among the operands to the instruction.
+ // The Define flag is needed to convince the machine verifier that an Undef
+ // value isn't a problem.
+ // The Dead flag is needed as the value in scratch isn't used by any other
+ // instruction. Kill isn't used as Dead is more precise.
+ unsigned Scratch = RegInfo.createVirtualRegister(RC);
+ unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
- MF->insert(It, loop1MBB);
- MF->insert(It, loop2MBB);
- MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1864,14 +1815,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- BB->addSuccessor(loop1MBB);
- loop1MBB->addSuccessor(sinkMBB);
- loop1MBB->addSuccessor(loop2MBB);
- loop2MBB->addSuccessor(loop1MBB);
- loop2MBB->addSuccessor(sinkMBB);
- sinkMBB->addSuccessor(exitMBB);
+ BB->addSuccessor(exitMBB, BranchProbability::getOne());
- // FIXME: computation of newval2 can be moved to loop2MBB.
// thisMBB:
// addiu masklsb2,$0,-4 # 0xfffffffc
// and alignedaddr,ptr,masklsb2
@@ -1914,40 +1859,22 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
.addReg(MaskedNewVal).addReg(ShiftAmt);
- // loop1MBB:
- // ll oldval,0(alginedaddr)
- // and maskedoldval0,oldval,mask
- // bne maskedoldval0,shiftedcmpval,sinkMBB
- BB = loop1MBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
- .addReg(OldVal).addReg(Mask);
- BuildMI(BB, DL, TII->get(Mips::BNE))
- .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
-
- // loop2MBB:
- // and maskedoldval1,oldval,mask2
- // or storeval,maskedoldval1,shiftednewval
- // sc success,storeval,0(alignedaddr)
- // beq success,$0,loop1MBB
- BB = loop2MBB;
- BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
- .addReg(OldVal).addReg(Mask2);
- BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
- .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
- BuildMI(BB, DL, TII->get(SC), Success)
- .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
- BuildMI(BB, DL, TII->get(Mips::BEQ))
- .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
-
- // sinkMBB:
- // srl srlres,maskedoldval0,shiftamt
- // sign_extend dest,srlres
- BB = sinkMBB;
-
- BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
- .addReg(MaskedOldVal0).addReg(ShiftAmt);
- BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+ // The purpose of the flags on the scratch registers is explained in
+ // emitAtomicBinary. In summary, we need a scratch register which is going to
+ // be undef and that is unique among the registers chosen for the instruction.
+
+ BuildMI(BB, DL, TII->get(AtomicOp))
+ .addReg(Dest, RegState::Define | RegState::EarlyClobber)
+ .addReg(AlignedAddr)
+ .addReg(Mask)
+ .addReg(ShiftedCmpVal)
+ .addReg(Mask2)
+ .addReg(ShiftedNewVal)
+ .addReg(ShiftAmt)
+ .addReg(Scratch, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit)
+ .addReg(Scratch2, RegState::EarlyClobber | RegState::Define |
+ RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
@@ -2073,7 +2000,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
// Local Exec TLS Model.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc DL(GA);
@@ -2114,7 +2041,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
MipsII::MO_DTPREL_HI);
- SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+ SDValue Hi = DAG.getNode(MipsISD::TlsHi, DL, PtrVT, TGAHi);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
MipsII::MO_DTPREL_LO);
SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
@@ -2138,7 +2065,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
MipsII::MO_TPREL_HI);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
MipsII::MO_TPREL_LO);
- SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+ SDValue Hi = DAG.getNode(MipsISD::TlsHi, DL, PtrVT, TGAHi);
SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}
@@ -2837,6 +2764,13 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
#include "MipsGenCallingConv.inc"
+CCAssignFn *MipsTargetLowering::CCAssignFnForCall() const {
+  return CC_Mips;
+}
+
+CCAssignFn *MipsTargetLowering::CCAssignFnForReturn() const {
+  return RetCC_Mips;
+}
//===----------------------------------------------------------------------===//
// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -2953,12 +2887,44 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(),
MipsCCState::getSpecialCallingConvForCallee(Callee.getNode(), Subtarget));
+ const ExternalSymbolSDNode *ES =
+ dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode());
+
+ // There is one case where CALLSEQ_START..CALLSEQ_END can be nested, which
+ // is during the lowering of a call with a byval argument that produces
+ // a call to memcpy. For the O32 case, this causes the caller to allocate
+ // stack space for the reserved argument area for the callee, and then to do
+ // so again, recursively, for the memcpy call. In the NEWABI case, this
+ // doesn't occur as those ABIs mandate that the callee allocates the reserved
+ // argument area. We do still produce nested CALLSEQ_START..CALLSEQ_END with
+ // zero space, though.
+ //
+ // If the callee has a byval argument and memcpy is used, we are mandated
+ // to already have produced a reserved argument area for the callee for O32.
+ // Therefore, the reserved argument area can be reused for both calls.
+ //
+ // Other cases of calling memcpy cannot have a chain with a CALLSEQ_START
+ // present, as we have yet to hook that node onto the chain.
+ //
+ // Hence, the CALLSEQ_START and CALLSEQ_END nodes can be eliminated in this
+ // case. GCC does a similar trick, in that wherever possible, it calculates
+ // the maximum outgoing argument area (including the reserved area), and
+ // preallocates the stack space on entry to the caller.
+ //
+ // FIXME: We should do the same for efficiency and space.
+
+ // Note: The check on the calling convention below must match
+ // MipsABIInfo::GetCalleeAllocdArgSizeInBytes().
+ bool MemcpyInByVal = ES &&
+ StringRef(ES->getSymbol()) == StringRef("memcpy") &&
+ CallConv != CallingConv::Fast &&
+ Chain.getOpcode() == ISD::CALLSEQ_START;
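// Illustrative example only, not part of this patch: the kind of source that
// exercises the nested CALLSEQ case described above. Passing a large
// aggregate by value under O32 gives the callee a byval argument, and copying
// it into the outgoing argument area may be lowered to a call to memcpy. The
// struct and function names are made up for this example.
//
//   struct BigArg { int Data[64]; };
//   void TakesBigArg(BigArg A);
//   void Caller(BigArg &A) {
//     TakesBigArg(A); // the by-value copy of A may become a memcpy call
//   }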
+
// Allocate the reserved argument area. It seems strange to do this from the
// caller side but removing it breaks the frame size calculation.
- CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+ unsigned ReservedArgArea =
+ MemcpyInByVal ? 0 : ABI.GetCalleeAllocdArgSizeInBytes(CallConv);
+ CCInfo.AllocateStack(ReservedArgArea, 1);
- const ExternalSymbolSDNode *ES =
- dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode());
CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(),
ES ? ES->getSymbol() : nullptr);
@@ -2993,7 +2959,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NextStackOffset = alignTo(NextStackOffset, StackAlignment);
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
- if (!IsTailCall)
+ if (!(IsTailCall || MemcpyInByVal))
Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL);
SDValue StackPtr =
@@ -3201,10 +3167,13 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
SDValue InFlag = Chain.getValue(1);
- // Create the CALLSEQ_END node.
- Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
- DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
- InFlag = Chain.getValue(1);
+ // Create the CALLSEQ_END node in the case where it is not a call to
+ // memcpy.
+ if (!MemcpyInByVal) {
+ Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
+ DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
// Handle result values, copying them out of physregs into vregs that we
// return.
@@ -3745,6 +3714,13 @@ static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix,
true);
}
+EVT MipsTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType) const {
+ bool Cond = !Subtarget.isABI_O32() && VT.getSizeInBits() == 32;
+ EVT MinVT = getRegisterType(Context, Cond ? MVT::i64 : MVT::i32);
+ return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
const TargetRegisterInfo *TRI =
@@ -4067,7 +4043,12 @@ void MipsTargetLowering::copyByValRegs(
// Create frame object.
EVT PtrTy = getPointerTy(DAG.getDataLayout());
- int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+ // Make the fixed object that is stored to mutable so that the load
+ // instructions referencing it have their memory dependencies added.
+ // Set the frame object as isAliased, which clears the underlying-objects
+ // vector in ScheduleDAGInstrs::buildSchedGraph(), resulting in all stores
+ // being added as dependencies for loads referencing this fixed object.
+ int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, false, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
InVals.push_back(FIN);
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
index ce4f0376ca9b..b58d92c370d8 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -19,9 +19,9 @@
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "Mips.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -29,6 +29,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -83,12 +84,18 @@ class TargetRegisterClass;
// Get the High 16 bits from a 32 bit immediate for accessing the GOT.
GotHi,
+ // Get the High 16 bits from a 32-bit immediate for accessing TLS.
+ TlsHi,
+
// Handle gp_rel (small data/bss sections) relocation.
GPRel,
// Thread Pointer
ThreadPointer,
+ // Vector Floating Point Multiply and Subtract
+ FMS,
+
// Floating Point Branch Conditional
FPBrcond,
@@ -217,12 +224,6 @@ class TargetRegisterClass;
VCLT_S,
VCLT_U,
- // Element-wise vector max/min.
- VSMAX,
- VSMIN,
- VUMAX,
- VUMIN,
-
// Vector Shuffle with mask as an operand
VSHF, // Generic shuffle
SHF, // 4-element set shuffle.
@@ -279,15 +280,14 @@ class TargetRegisterClass;
return MVT::i32;
}
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType) const override;
+
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
/// Return the register type for a given MVT, ensuring vectors are treated
/// as a series of gpr sized integers.
- MVT getRegisterTypeForCallingConv(MVT VT) const override;
-
- /// Return the register type for a given MVT, ensuring vectors are treated
- /// as a series of gpr sized integers.
MVT getRegisterTypeForCallingConv(LLVMContext &Context,
EVT VT) const override;
@@ -371,6 +371,10 @@ class TargetRegisterClass;
return getTargetMachine().isPositionIndependent();
}
+ CCAssignFn *CCAssignFnForCall() const;
+
+ CCAssignFn *CCAssignFnForReturn() const;
+
protected:
SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
@@ -681,17 +685,13 @@ class TargetRegisterClass;
unsigned Size, unsigned DstReg,
unsigned SrcRec) const;
- MachineBasicBlock *emitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
- unsigned Size, unsigned BinOpcode,
- bool Nand = false) const;
+ MachineBasicBlock *emitAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr &MI,
MachineBasicBlock *BB,
- unsigned Size,
- unsigned BinOpcode,
- bool Nand = false) const;
+ unsigned Size) const;
MachineBasicBlock *emitAtomicCmpSwap(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned Size) const;
+ MachineBasicBlock *BB) const;
MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size) const;
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
index c81739115373..dd30e20a743c 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -149,12 +149,16 @@ multiclass ROUND_M<string opstr, InstrItinClass Itin> {
class MFC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
- [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>, HARDFLOAT;
+ [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>, HARDFLOAT {
+ let isMoveReg = 1;
+}
class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
- [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>, HARDFLOAT;
+ [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>, HARDFLOAT {
+ let isMoveReg = 1;
+}
class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin> :
@@ -349,22 +353,24 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
//===----------------------------------------------------------------------===//
// Floating Point Instructions
//===----------------------------------------------------------------------===//
-def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
- ABSS_FM<0xc, 16>, ISA_MIPS2;
-defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
-def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
- ABSS_FM<0xd, 16>, ISA_MIPS2;
-def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
- ABSS_FM<0xe, 16>, ISA_MIPS2;
-def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
- ABSS_FM<0xf, 16>, ISA_MIPS2;
-def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x24, 16>;
-
-defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
-defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
-defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
-defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+ ABSS_FM<0xc, 16>, ISA_MIPS2;
+ defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
+ def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+ ABSS_FM<0xd, 16>, ISA_MIPS2;
+ def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ABSS_FM<0xe, 16>, ISA_MIPS2;
+ def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+ ABSS_FM<0xf, 16>, ISA_MIPS2;
+ def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x24, 16>, ISA_MIPS1;
+
+ defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
+ defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
+ defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
+ defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>, ISA_MIPS1;
+}
let AdditionalPredicates = [NotInMicroMips] in {
def RECIP_S : MMRel, ABSS_FT<"recip.s", FGR32Opnd, FGR32Opnd, II_RECIP_S>,
@@ -391,53 +397,54 @@ let AdditionalPredicates = [NotInMicroMips] in {
let DecoderNamespace = "MipsFP64" in {
let AdditionalPredicates = [NotInMicroMips] in {
def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
- ABSS_FM<0x8, 16>, FGR_64;
+ ABSS_FM<0x8, 16>, ISA_MIPS2, FGR_64;
def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
- ABSS_FM<0x8, 17>, FGR_64;
+ ABSS_FM<0x8, 17>, INSN_MIPS3_32, FGR_64;
def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
- ABSS_FM<0x9, 16>, FGR_64;
+ ABSS_FM<0x9, 16>, ISA_MIPS2, FGR_64;
def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
- ABSS_FM<0x9, 17>, FGR_64;
+ ABSS_FM<0x9, 17>, INSN_MIPS3_32, FGR_64;
def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>,
- ABSS_FM<0xa, 16>, FGR_64;
+ ABSS_FM<0xa, 16>, ISA_MIPS2, FGR_64;
def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>,
- ABSS_FM<0xa, 17>, FGR_64;
+ ABSS_FM<0xa, 17>, INSN_MIPS3_32, FGR_64;
def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>,
- ABSS_FM<0xb, 16>, FGR_64;
+ ABSS_FM<0xb, 16>, ISA_MIPS2, FGR_64;
def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
- ABSS_FM<0xb, 17>, FGR_64;
+ ABSS_FM<0xb, 17>, INSN_MIPS3_32, FGR_64;
}
}
-def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x20, 20>;
let AdditionalPredicates = [NotInMicroMips] in{
+ def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x20, 20>, ISA_MIPS1;
def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
}
-def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
- ABSS_FM<0x20, 17>, FGR_32;
-def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 20>, FGR_32;
-def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 16>, FGR_32;
-
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 17>, ISA_MIPS1, FGR_32;
+ def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 16>, ISA_MIPS1, FGR_32;
+ def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 20>, ISA_MIPS1, FGR_32;
+}
let DecoderNamespace = "MipsFP64" in {
- def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x20, 17>, FGR_64;
- let AdditionalPredicates = [NotInMicroMips] in{
+ let AdditionalPredicates = [NotInMicroMips] in {
def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x20, 21>, FGR_64;
+ ABSS_FM<0x20, 21>, INSN_MIPS3_32R2, FGR_64;
+ def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 17>, ISA_MIPS1, FGR_64;
+ def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 20>, ISA_MIPS1, FGR_64;
+ def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x21, 16>, ISA_MIPS1, FGR_64;
+ def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x21, 21>, INSN_MIPS3_32R2, FGR_64;
}
- def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 20>, FGR_64;
- def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x21, 16>, FGR_64;
- def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x21, 21>, FGR_64;
}
let isPseudo = 1, isCodeGenOnly = 1 in {
@@ -450,17 +457,21 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
let AdditionalPredicates = [NotInMicroMips] in {
def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
- ABSS_FM<0x5, 16>;
- defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
+ ABSS_FM<0x5, 16>, ISA_MIPS1;
+ defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>, ISA_MIPS1;
}
def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
- ABSS_FM<0x7, 16>;
-defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
+ ABSS_FM<0x7, 16>, ISA_MIPS1;
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>, ISA_MIPS1;
+}
-def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd,
- II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2;
-defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd,
+ II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2;
+ defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
+}
// The odd-numbered registers are only referenced when doing loads,
// stores, and moves between floating-point and integer registers.
@@ -469,60 +480,60 @@ defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
/// Move Control Registers From/To CPU Registers
let AdditionalPredicates = [NotInMicroMips] in {
- def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>;
- def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>;
-}
-def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
- bitconvert>, MFC1_FM<0>;
-def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>,
- FGR_64 {
- let DecoderNamespace = "MipsFP64";
-}
-def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
- bitconvert>, MFC1_FM<4>;
-def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>,
- FGR_64 {
- let DecoderNamespace = "MipsFP64";
-}
+ def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>,
+ ISA_MIPS1;
+ def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>,
+ ISA_MIPS1;
+
+ def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
+ bitconvert>, MFC1_FM<0>, ISA_MIPS1;
+ def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>,
+ ISA_MIPS1, FGR_64 {
+ let DecoderNamespace = "MipsFP64";
+ }
+ def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
+ bitconvert>, MFC1_FM<4>, ISA_MIPS1;
+ def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>,
+ ISA_MIPS1, FGR_64 {
+ let DecoderNamespace = "MipsFP64";
+ }
-let AdditionalPredicates = [NotInMicroMips] in {
def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
MFC1_FM<3>, ISA_MIPS32R2, FGR_32;
def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
MFC1_FM<3>, ISA_MIPS32R2, FGR_64 {
let DecoderNamespace = "MipsFP64";
}
-}
-let AdditionalPredicates = [NotInMicroMips] in {
+
def MTHC1_D32 : MMRel, StdMMR6Rel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
MFC1_FM<7>, ISA_MIPS32R2, FGR_32;
def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
MFC1_FM<7>, ISA_MIPS32R2, FGR_64 {
let DecoderNamespace = "MipsFP64";
}
-}
-let AdditionalPredicates = [NotInMicroMips] in {
+
def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
bitconvert>, MFC1_FM<5>, ISA_MIPS3;
def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
bitconvert>, MFC1_FM<1>, ISA_MIPS3;
-}
-
-def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
- ABSS_FM<0x6, 16>;
-def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
- ABSS_FM<0x6, 17>, FGR_32;
-def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
- ABSS_FM<0x6, 17>, FGR_64 {
- let DecoderNamespace = "MipsFP64";
+ let isMoveReg = 1 in {
+ def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+ ABSS_FM<0x6, 16>, ISA_MIPS1;
+ def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+ ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_32;
+ def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
+ ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_64 {
+ let DecoderNamespace = "MipsFP64";
+ }
+ } // isMoveReg
}
/// Floating Point Memory Instructions
let AdditionalPredicates = [NotInMicroMips] in {
def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_simm16, II_LWC1, load>,
- LW_FM<0x31>;
+ LW_FM<0x31>, ISA_MIPS1;
def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, mem_simm16, II_SWC1, store>,
- LW_FM<0x39>;
+ LW_FM<0x39>, ISA_MIPS1;
}
let DecoderNamespace = "MipsFP64", AdditionalPredicates = [NotInMicroMips] in {
@@ -569,14 +580,15 @@ let DecoderNamespace="MipsFP64" in {
// Load/store doubleword indexed unaligned.
// FIXME: This instruction should not be defined for FGR_32.
-let AdditionalPredicates = [IsNotNaCl] in {
+let AdditionalPredicates = [IsNotNaCl, NotInMicroMips] in {
def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
}
-let DecoderNamespace="MipsFP64" in {
+let AdditionalPredicates = [IsNotNaCl, NotInMicroMips],
+ DecoderNamespace="MipsFP64" in {
def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
@@ -584,58 +596,62 @@ let DecoderNamespace="MipsFP64" in {
}
/// Floating-point Arithmetic
-def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
- ADDS_FM<0x00, 16>;
-defm FADD : ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>;
-def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
- ADDS_FM<0x03, 16>;
-defm FDIV : ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>;
-def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
- ADDS_FM<0x02, 16>;
-defm FMUL : ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>;
-def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
- ADDS_FM<0x01, 16>;
-defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
-
-def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
- MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6, MADD4;
-def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
- MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6, MADD4;
-
-let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
+let AdditionalPredicates = [NotInMicroMips] in {
+ def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+ ADDS_FM<0x00, 16>, ISA_MIPS1;
+ defm FADD : ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>,
+ ISA_MIPS1;
+ def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+ ADDS_FM<0x03, 16>, ISA_MIPS1;
+ defm FDIV : ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>,
+ ISA_MIPS1;
+ def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+ ADDS_FM<0x02, 16>, ISA_MIPS1;
+ defm FMUL : ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>,
+ ISA_MIPS1;
+ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+ ADDS_FM<0x01, 16>, ISA_MIPS1;
+ defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>,
+ ISA_MIPS1;
+}
+
+let AdditionalPredicates = [NotInMicroMips, HasMadd4] in {
+ def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+ MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+ def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+ MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+
+ def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+ def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+
+ let DecoderNamespace = "MipsFP64" in {
+ def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ }
+}
+
+let AdditionalPredicates = [NoNaNsFPMath, HasMadd4, NotInMicroMips] in {
def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
MADDS_FM<7, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
-}
-def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32, MADD4;
-def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32, MADD4;
-
-let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
-}
-
-let DecoderNamespace = "MipsFP64" in {
- def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64, MADD4;
- def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64, MADD4;
-}
-let AdditionalPredicates = [NoNaNsFPMath, HasMadd4],
- DecoderNamespace = "MipsFP64" in {
- def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
- MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
- def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
- MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ let DecoderNamespace = "MipsFP64" in {
+ def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
+ MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
+ MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ }
}
-
//===----------------------------------------------------------------------===//
// Floating Point Branch Codes
//===----------------------------------------------------------------------===//
@@ -844,28 +860,31 @@ let AdditionalPredicates = [NotInMicroMips] in {
//===----------------------------------------------------------------------===//
// Floating Point Patterns
//===----------------------------------------------------------------------===//
-def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
-def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
+def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>, ISA_MIPS1;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>, ISA_MIPS1;
def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_S_W GPR32Opnd:$src)>;
def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
- (TRUNC_W_S FGR32Opnd:$src)>;
+ (TRUNC_W_S FGR32Opnd:$src)>, ISA_MIPS1;
def : MipsPat<(MipsMTC1_D64 GPR32Opnd:$src),
- (MTC1_D64 GPR32Opnd:$src)>, FGR_64;
+ (MTC1_D64 GPR32Opnd:$src)>, ISA_MIPS1, FGR_64;
def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32;
-def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
- (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32;
-def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
- (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32;
-def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
- (CVT_D32_S FGR32Opnd:$src)>, FGR_32;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+ (TRUNC_W_D32 AFGR64Opnd:$src)>, ISA_MIPS2, FGR_32;
+ def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
+ (CVT_S_D32 AFGR64Opnd:$src)>, ISA_MIPS1, FGR_32;
+ def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D32_S FGR32Opnd:$src)>, ISA_MIPS1, FGR_32;
+}
-def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64;
-def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, FGR_64;
+def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, ISA_MIPS3, GPR_64, FGR_64;
+def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, ISA_MIPS3, GPR_64,
+ FGR_64;
def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_D64_W GPR32Opnd:$src)>, FGR_64;
@@ -875,16 +894,18 @@ def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
(PseudoCVT_D64_L GPR64Opnd:$src)>, FGR_64;
def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
- (TRUNC_W_D64 FGR64Opnd:$src)>, FGR_64;
+ (TRUNC_W_D64 FGR64Opnd:$src)>, ISA_MIPS2, FGR_64;
def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
- (TRUNC_L_S FGR32Opnd:$src)>, FGR_64;
+ (TRUNC_L_S FGR32Opnd:$src)>, ISA_MIPS2, FGR_64;
def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
- (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64;
+ (TRUNC_L_D64 FGR64Opnd:$src)>, ISA_MIPS2, FGR_64;
-def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
- (CVT_S_D64 FGR64Opnd:$src)>, FGR_64;
-def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
- (CVT_D64_S FGR32Opnd:$src)>, FGR_64;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
+ (CVT_S_D64 FGR64Opnd:$src)>, ISA_MIPS1, FGR_64;
+ def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+ (CVT_D64_S FGR32Opnd:$src)>, ISA_MIPS1, FGR_64;
+}
// To generate NMADD and NMSUB instructions when fneg node is present
multiclass NMADD_NMSUB<Instruction Nmadd, Instruction Nmsub, RegisterOperand RC> {
@@ -903,13 +924,13 @@ let AdditionalPredicates = [NoNaNsFPMath, HasMadd4, NotInMicroMips] in {
// Patterns for loads/stores with a reg+imm operand.
let AdditionalPredicates = [NotInMicroMips] in {
let AddedComplexity = 40 in {
- def : LoadRegImmPat<LWC1, f32, load>;
- def : StoreRegImmPat<SWC1, f32>;
+ def : LoadRegImmPat<LWC1, f32, load>, ISA_MIPS1;
+ def : StoreRegImmPat<SWC1, f32>, ISA_MIPS1;
- def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
- def : StoreRegImmPat<SDC164, f64>, FGR_64;
+ def : LoadRegImmPat<LDC164, f64, load>, ISA_MIPS1, FGR_64;
+ def : StoreRegImmPat<SDC164, f64>, ISA_MIPS1, FGR_64;
- def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
- def : StoreRegImmPat<SDC1, f64>, FGR_32;
+ def : LoadRegImmPat<LDC1, f64, load>, ISA_MIPS1, FGR_32;
+ def : StoreRegImmPat<SDC1, f64>, ISA_MIPS1, FGR_32;
}
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
index 516edef0556c..ebbdcdf0df89 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
@@ -70,7 +70,7 @@ class StdArch {
// Generic Mips Format
class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin, Format f>: Instruction
+ InstrItinClass itin, Format f>: Instruction, PredicateControl
{
field bits<32> Inst;
Format Form = f;
@@ -119,8 +119,8 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
// Mips32/64 Instruction Format
class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
InstrItinClass itin, Format f, string opstr = ""> :
- MipsInst<outs, ins, asmstr, pattern, itin, f>, PredicateControl {
- let EncodingPredicates = [HasStdEnc];
+ MipsInst<outs, ins, asmstr, pattern, itin, f> {
+ let EncodingPredicates = [NotInMips16Mode];
string BaseOpcode = opstr;
string Arch;
}
@@ -128,7 +128,7 @@ class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
// Mips Pseudo Instructions Format
class MipsPseudo<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo> :
- MipsInst<outs, ins, "", pattern, itin, Pseudo>, PredicateControl {
+ MipsInst<outs, ins, "", pattern, itin, Pseudo> {
let isCodeGenOnly = 1;
let isPseudo = 1;
}
@@ -137,14 +137,14 @@ class MipsPseudo<dag outs, dag ins, list<dag> pattern,
class PseudoSE<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo> :
MipsPseudo<outs, ins, pattern, itin> {
- let EncodingPredicates = [HasStdEnc];
+ let EncodingPredicates = [NotInMips16Mode];
}
// Pseudo-instructions for alternate assembly syntax (never used by codegen).
// These are aliases that require C++ handling to convert to the target
// instruction, while InstAliases can be handled directly by tblgen.
class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
- MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl {
+ MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
let isPseudo = 1;
let Pattern = [];
}
@@ -220,10 +220,9 @@ class FJ<bits<6> op> : StdArch
}
//===----------------------------------------------------------------------===//
-// MFC instruction class in Mips : <|op|mf|rt|rd|0000000|sel|>
+// MFC instruction class in Mips : <|op|mf|rt|rd|gst|00000|sel|>
//===----------------------------------------------------------------------===//
-class MFC3OP_FM<bits<6> op, bits<5> mfmt>
-{
+class MFC3OP_FM<bits<6> op, bits<5> mfmt, bits<3> guest> : StdArch {
bits<5> rt;
bits<5> rd;
bits<3> sel;
@@ -234,7 +233,8 @@ class MFC3OP_FM<bits<6> op, bits<5> mfmt>
let Inst{25-21} = mfmt;
let Inst{20-16} = rt;
let Inst{15-11} = rd;
- let Inst{10-3} = 0;
+ let Inst{10-8} = guest;
+ let Inst{7-3} = 0;
let Inst{2-0} = sel;
}
@@ -508,6 +508,7 @@ class EXT_FM<bits<6> funct> : StdArch {
class RDHWR_FM : StdArch {
bits<5> rt;
bits<5> rd;
+ bits<3> sel;
bits<32> Inst;
@@ -515,7 +516,8 @@ class RDHWR_FM : StdArch {
let Inst{25-21} = 0;
let Inst{20-16} = rt;
let Inst{15-11} = rd;
- let Inst{10-6} = 0;
+ let Inst{10-9} = 0b00;
+ let Inst{8-6} = sel;
let Inst{5-0} = 0x3b;
}
@@ -970,3 +972,14 @@ class CACHEOP_FM<bits<6> op> : StdArch {
let Inst{20-16} = hint;
let Inst{15-0} = offset;
}
+
+class HYPCALL_FM<bits<6> op> : StdArch {
+ bits<10> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25} = 1;
+ let Inst{20-11} = code_;
+ let Inst{5-0} = op;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
index 2e30d271e130..0e0e712dba19 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -163,7 +163,7 @@ unsigned MipsInstrInfo::removeBranch(MachineBasicBlock &MBB,
// Note that indirect branches are not removed.
while (I != REnd && removed < 2) {
// Skip past debug instructions.
- if (I->isDebugValue()) {
+ if (I->isDebugInstr()) {
++I;
continue;
}
@@ -195,7 +195,7 @@ MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
// Skip all the debug instructions.
- while (I != REnd && I->isDebugValue())
+ while (I != REnd && I->isDebugInstr())
++I;
if (I == REnd || !isUnpredicatedTerminator(*I)) {
@@ -220,7 +220,7 @@ MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
// Skip past any debug instruction to see if the second last actual
// is a branch.
++I;
- while (I != REnd && I->isDebugValue())
+ while (I != REnd && I->isDebugInstr())
++I;
if (I != REnd) {
@@ -276,6 +276,163 @@ MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
return BT_CondUncond;
}
+bool MipsInstrInfo::isBranchOffsetInRange(unsigned BranchOpc,
+                                          int64_t BrOffset) const {
+ switch (BranchOpc) {
+ case Mips::B:
+ case Mips::BAL:
+ case Mips::BC1F:
+ case Mips::BC1FL:
+ case Mips::BC1T:
+ case Mips::BC1TL:
+ case Mips::BEQ: case Mips::BEQ64:
+ case Mips::BEQL:
+ case Mips::BGEZ: case Mips::BGEZ64:
+ case Mips::BGEZL:
+ case Mips::BGEZAL:
+ case Mips::BGEZALL:
+ case Mips::BGTZ: case Mips::BGTZ64:
+ case Mips::BGTZL:
+ case Mips::BLEZ: case Mips::BLEZ64:
+ case Mips::BLEZL:
+ case Mips::BLTZ: case Mips::BLTZ64:
+ case Mips::BLTZL:
+ case Mips::BLTZAL:
+ case Mips::BLTZALL:
+ case Mips::BNE: case Mips::BNE64:
+ case Mips::BNEL:
+ return isInt<18>(BrOffset);
+
+ // microMIPSr3 branches
+ case Mips::B_MM:
+ case Mips::BC1F_MM:
+ case Mips::BC1T_MM:
+ case Mips::BEQ_MM:
+ case Mips::BGEZ_MM:
+ case Mips::BGEZAL_MM:
+ case Mips::BGTZ_MM:
+ case Mips::BLEZ_MM:
+ case Mips::BLTZ_MM:
+ case Mips::BLTZAL_MM:
+ case Mips::BNE_MM:
+ case Mips::BEQZC_MM:
+ case Mips::BNEZC_MM:
+ return isInt<17>(BrOffset);
+
+ // microMIPSr3 short branches.
+ case Mips::B16_MM:
+ return isInt<11>(BrOffset);
+
+ case Mips::BEQZ16_MM:
+ case Mips::BNEZ16_MM:
+ return isInt<8>(BrOffset);
+
+ // MIPSR6 branches.
+ case Mips::BALC:
+ case Mips::BC:
+ return isInt<28>(BrOffset);
+
+ case Mips::BC1EQZ:
+ case Mips::BC1NEZ:
+ case Mips::BC2EQZ:
+ case Mips::BC2NEZ:
+ case Mips::BEQC: case Mips::BEQC64:
+ case Mips::BNEC: case Mips::BNEC64:
+ case Mips::BGEC: case Mips::BGEC64:
+ case Mips::BGEUC: case Mips::BGEUC64:
+ case Mips::BGEZC: case Mips::BGEZC64:
+ case Mips::BGTZC: case Mips::BGTZC64:
+ case Mips::BLEZC: case Mips::BLEZC64:
+ case Mips::BLTC: case Mips::BLTC64:
+ case Mips::BLTUC: case Mips::BLTUC64:
+ case Mips::BLTZC: case Mips::BLTZC64:
+ case Mips::BNVC:
+ case Mips::BOVC:
+ case Mips::BGEZALC:
+ case Mips::BEQZALC:
+ case Mips::BGTZALC:
+ case Mips::BLEZALC:
+ case Mips::BLTZALC:
+ case Mips::BNEZALC:
+ return isInt<18>(BrOffset);
+
+ case Mips::BEQZC: case Mips::BEQZC64:
+ case Mips::BNEZC: case Mips::BNEZC64:
+ return isInt<23>(BrOffset);
+
+ // microMIPSR6 branches
+ case Mips::BC16_MMR6:
+ return isInt<11>(BrOffset);
+
+ case Mips::BEQZC16_MMR6:
+ case Mips::BNEZC16_MMR6:
+ return isInt<8>(BrOffset);
+
+ case Mips::BALC_MMR6:
+ case Mips::BC_MMR6:
+ return isInt<27>(BrOffset);
+
+ case Mips::BC1EQZC_MMR6:
+ case Mips::BC1NEZC_MMR6:
+ case Mips::BC2EQZC_MMR6:
+ case Mips::BC2NEZC_MMR6:
+ case Mips::BGEZALC_MMR6:
+ case Mips::BEQZALC_MMR6:
+ case Mips::BGTZALC_MMR6:
+ case Mips::BLEZALC_MMR6:
+ case Mips::BLTZALC_MMR6:
+ case Mips::BNEZALC_MMR6:
+ case Mips::BNVC_MMR6:
+ case Mips::BOVC_MMR6:
+ return isInt<17>(BrOffset);
+
+ case Mips::BEQC_MMR6:
+ case Mips::BNEC_MMR6:
+ case Mips::BGEC_MMR6:
+ case Mips::BGEUC_MMR6:
+ case Mips::BGEZC_MMR6:
+ case Mips::BGTZC_MMR6:
+ case Mips::BLEZC_MMR6:
+ case Mips::BLTC_MMR6:
+ case Mips::BLTUC_MMR6:
+ case Mips::BLTZC_MMR6:
+ return isInt<18>(BrOffset);
+
+ case Mips::BEQZC_MMR6:
+ case Mips::BNEZC_MMR6:
+ return isInt<23>(BrOffset);
+
+ // DSP branches.
+ case Mips::BPOSGE32:
+ return isInt<18>(BrOffset);
+ case Mips::BPOSGE32_MM:
+ case Mips::BPOSGE32C_MMR3:
+ return isInt<17>(BrOffset);
+
+ // cnMIPS branches.
+ case Mips::BBIT0:
+ case Mips::BBIT032:
+ case Mips::BBIT1:
+ case Mips::BBIT132:
+ return isInt<18>(BrOffset);
+
+ // MSA branches.
+ case Mips::BZ_B:
+ case Mips::BZ_H:
+ case Mips::BZ_W:
+ case Mips::BZ_D:
+ case Mips::BZ_V:
+ case Mips::BNZ_B:
+ case Mips::BNZ_H:
+ case Mips::BNZ_W:
+ case Mips::BNZ_D:
+ case Mips::BNZ_V:
+ return isInt<18>(BrOffset);
+ }
+
+ llvm_unreachable("Unknown branch instruction!");
+}
+
+
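Each range above is the width of the encoded offset field plus the instruction-alignment shift: classic MIPS branches hold a signed 16-bit offset counted in 4-byte words (16 + 2 bits, hence isInt<18>), MIPSR6 BC/BALC hold 26 bits (isInt<28>) and BEQZC/BNEZC 21 bits (isInt<23>), while microMIPS offsets are counted in 2-byte halfwords and so reach one bit less. A self-contained C++ sketch of that arithmetic (names are illustrative):

#include <cstdint>

// A FieldBits-wide signed offset scaled by 2^AlignShift bytes reaches
// exactly the byte offsets that fit in FieldBits + AlignShift signed bits.
constexpr bool fitsSignedBits(int64_t V, unsigned Bits) {
  return V >= -(int64_t(1) << (Bits - 1)) && V < (int64_t(1) << (Bits - 1));
}

constexpr bool branchReaches(int64_t ByteOffset, unsigned FieldBits,
                             unsigned AlignShift) {
  return fitsSignedBits(ByteOffset, FieldBits + AlignShift);
}

// branchReaches(Off, 16, 2) mirrors isInt<18>(Off) for a classic BEQ;
// branchReaches(Off, 26, 2) mirrors isInt<28>(Off) for an R6 BALC.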
/// Return the corresponding compact (no delay slot) form of a branch.
unsigned MipsInstrInfo::getEquivalentCompactForm(
const MachineBasicBlock::iterator I) const {
@@ -598,7 +755,7 @@ bool MipsInstrInfo::verifyInstruction(const MachineInstr &MI,
case Mips::DINS:
return verifyInsExtInstruction(MI, ErrInfo, 0, 32, 0, 32, 0, 32);
case Mips::DINSM:
- // The ISA spec has a subtle difference difference between dinsm and dextm
+ // The ISA spec has a subtle difference between dinsm and dextm
// in that it says:
// 2 <= size <= 64 for 'dinsm' but 'dextm' has 32 < size <= 64.
// To make the bounds checks similar, the range 1 < size <= 64 is checked
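Spelled out, the widened bound accepts any size in (1, 64], i.e. 2 through 64 inclusive, which covers dinsm's documented 2 <= size <= 64 while keeping the check shaped like the other ins/ext variants. A one-line restatement of just that bound (not the verifier's actual helper):

#include <cstdint>

// Widened DINSM size bound described above: 1 < size <= 64.
static bool dinsmSizeOk(int64_t Size) { return Size > 1 && Size <= 64; }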
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
index c18e395f9013..9d27b8f66211 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -86,6 +86,10 @@ public:
/// Determine the opcode of a non-delay slot form for a branch if one exists.
unsigned getEquivalentCompactForm(const MachineBasicBlock::iterator I) const;
+ /// Determine if the branch target is in range.
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
+
/// Predicate to determine if an instruction can go in a forbidden slot.
bool SafeInForbiddenSlot(const MachineInstr &MI) const;
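The override gives branch-relaxation style code a way to ask whether a given branch opcode can still encode the distance to its target. A hedged usage sketch with no LLVM dependencies (the function pointer stands in for the hook; this is not code from this change):

#include <cstdint>

// A branch needs rewriting (e.g. inverted around an unconditional jump)
// when its opcode cannot encode the signed byte distance to the target.
static bool needsRelaxation(unsigned Opcode, int64_t BranchAddr,
                            int64_t TargetAddr,
                            bool (*isOffsetInRange)(unsigned, int64_t)) {
  return !isOffsetInRange(Opcode, TargetAddr - BranchAddr);
}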
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
index 33a061e12a3f..0faa13d4d63f 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -73,12 +73,8 @@ def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
// Hi node for accessing the GOT.
def MipsGotHi : SDNode<"MipsISD::GotHi", SDTIntUnaryOp>;
-// TlsGd node is used to handle General Dynamic TLS
-def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
-
-// TprelHi and TprelLo nodes are used to handle Local Exec TLS
-def MipsTprelHi : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
-def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+// Hi node for handling TLS offsets
+def MipsTlsHi : SDNode<"MipsISD::TlsHi", SDTIntUnaryOp>;
// Thread pointer
def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
@@ -202,12 +198,12 @@ def NotMips64 : Predicate<"!Subtarget->hasMips64()">,
AssemblerPredicate<"!FeatureMips64">;
def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">,
AssemblerPredicate<"FeatureMips64r2">;
+def HasMips64r5 : Predicate<"Subtarget->hasMips64r5()">,
+ AssemblerPredicate<"FeatureMips64r5">;
def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">,
AssemblerPredicate<"FeatureMips64r6">;
def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">,
AssemblerPredicate<"!FeatureMips64r6">;
-def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">,
- AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">;
def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
AssemblerPredicate<"FeatureMips16">;
def NotInMips16Mode : Predicate<"!Subtarget->inMips16Mode()">,
@@ -237,7 +233,7 @@ def IsBE : Predicate<"!Subtarget->isLittle()">;
def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">;
def HasEVA : Predicate<"Subtarget->hasEVA()">,
- AssemblerPredicate<"FeatureEVA,FeatureMips32r2">;
+ AssemblerPredicate<"FeatureEVA">;
def HasMSA : Predicate<"Subtarget->hasMSA()">,
AssemblerPredicate<"FeatureMSA">;
def HasMadd4 : Predicate<"!Subtarget->disableMadd4()">,
@@ -248,6 +244,15 @@ def UseIndirectJumpsHazard : Predicate<"Subtarget->useIndirectJumpsHazard()">,
AssemblerPredicate<"FeatureUseIndirectJumpsHazard">;
def NoIndirectJumpGuards : Predicate<"!Subtarget->useIndirectJumpsHazard()">,
AssemblerPredicate<"!FeatureUseIndirectJumpsHazard">;
+def HasCRC : Predicate<"Subtarget->hasCRC()">,
+ AssemblerPredicate<"FeatureCRC">;
+def HasVirt : Predicate<"Subtarget->hasVirt()">,
+ AssemblerPredicate<"FeatureVirt">;
+def HasGINV : Predicate<"Subtarget->hasGINV()">,
+ AssemblerPredicate<"FeatureGINV">;
+// TODO: Add support for FPOpFusion::Standard
+def AllowFPOpFusion : Predicate<"TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast">;
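AllowFPOpFusion reads LLVM's TargetOptions fusion setting, which can be Fast, Standard, or Strict; the predicate only fires for Fast, hence the TODO above. A small sketch of the same test, assuming the definitions in llvm/Target/TargetOptions.h:

#include "llvm/Target/TargetOptions.h"

// Only allow forming fused multiply-add style operations when fast fusion
// was requested (roughly the -ffp-contract=fast case).
static bool allowFastFPFusion(const llvm::TargetOptions &Opts) {
  return Opts.AllowFPOpFusion == llvm::FPOpFusion::Fast;
}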
//===----------------------------------------------------------------------===//
// Mips GPR size adjectives.
// They are mutually exclusive.
@@ -277,126 +282,203 @@ class SYM_64 { list<Predicate> SYMPredicates = [IsSym64]; }
// subtractive predicate will hopefully keep us under the 32 predicate
// limit long enough to develop an alternative way to handle P1||P2
// predicates.
+class ISA_MIPS1 {
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
class ISA_MIPS1_NOT_MIPS3 {
list<Predicate> InsnPredicates = [NotMips3];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
class ISA_MIPS1_NOT_4_32 {
list<Predicate> InsnPredicates = [NotMips4_32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
class ISA_MIPS1_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS2 {
+ list<Predicate> InsnPredicates = [HasMips2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS2 { list<Predicate> InsnPredicates = [HasMips2]; }
class ISA_MIPS2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS3 {
+ list<Predicate> InsnPredicates = [HasMips3];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS3 { list<Predicate> InsnPredicates = [HasMips3]; }
class ISA_MIPS3_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32 {
+ list<Predicate> InsnPredicates = [HasMips32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS32 { list<Predicate> InsnPredicates = [HasMips32]; }
class ISA_MIPS32_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32R2 {
+ list<Predicate> InsnPredicates = [HasMips32r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
class ISA_MIPS32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32R5 {
+ list<Predicate> InsnPredicates = [HasMips32r5];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64 {
+ list<Predicate> InsnPredicates = [HasMips64];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
-class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; }
-class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; }
class ISA_MIPS64_NOT_64R6 {
list<Predicate> InsnPredicates = [HasMips64, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64R2 {
+ list<Predicate> InsnPredicates = [HasMips64r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64R5 {
+ list<Predicate> InsnPredicates = [HasMips64r5];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS32R6 {
+ list<Predicate> InsnPredicates = [HasMips32r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MIPS64R6 {
+ list<Predicate> InsnPredicates = [HasMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+class ISA_MICROMIPS {
+ list<Predicate> EncodingPredicates = [InMicroMips];
+}
+class ISA_MICROMIPS32R5 {
+ list<Predicate> InsnPredicates = [HasMips32r5];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
-class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
-class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
-class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
-class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; }
class ISA_MICROMIPS32R6 {
- list<Predicate> InsnPredicates = [HasMicroMips32r6];
+ list<Predicate> InsnPredicates = [HasMips32r6];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
-class ISA_MICROMIPS32_NOT_MIPS32R6 {
- list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6];
+class ISA_MICROMIPS64R6 {
+ list<Predicate> InsnPredicates = [HasMips64r6];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
-
-class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; }
-class INSN_EVA_NOT_32R6_64R6 {
- list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA];
+class ISA_MICROMIPS32_NOT_MIPS32R6 {
+ list<Predicate> InsnPredicates = [NotMips32r6];
+ list<Predicate> EncodingPredicates = [InMicroMips];
}
+class ASE_EVA { list<Predicate> ASEPredicate = [HasEVA]; }
// The portions of MIPS-III that were also added to MIPS32
-class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
+class INSN_MIPS3_32 {
+ list<Predicate> InsnPredicates = [HasMips3_32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
// The portions of MIPS-III that were also added to MIPS32 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS3_32_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-III that were also added to MIPS32r2
-class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; }
+class INSN_MIPS3_32R2 {
+ list<Predicate> InsnPredicates = [HasMips3_32r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
// The portions of MIPS-IV that were also added to MIPS32.
-class INSN_MIPS4_32 { list <Predicate> InsnPredicates = [HasMips4_32]; }
+class INSN_MIPS4_32 {
+ list <Predicate> InsnPredicates = [HasMips4_32];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
// The portions of MIPS-IV that were also added to MIPS32 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS4_32_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS4_32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-IV that were also added to MIPS32r2.
class INSN_MIPS4_32R2 {
list<Predicate> InsnPredicates = [HasMips4_32r2];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
// The portions of MIPS-V that were also added to MIPS32r2 but were removed in
// MIPS32r6 and MIPS64r6.
class INSN_MIPS5_32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6];
+ list<Predicate> EncodingPredicates = [HasStdEnc];
}
class ASE_CNMIPS {
- list<Predicate> InsnPredicates = [HasCnMips];
+ list<Predicate> ASEPredicate = [HasCnMips];
}
class NOT_ASE_CNMIPS {
- list<Predicate> InsnPredicates = [NotCnMips];
+ list<Predicate> ASEPredicate = [NotCnMips];
}
class ASE_MIPS64_CNMIPS {
- list<Predicate> InsnPredicates = [HasMips64, HasCnMips];
+ list<Predicate> ASEPredicate = [HasMips64, HasCnMips];
}
class ASE_MSA {
- list<Predicate> InsnPredicates = [HasMSA];
+ list<Predicate> ASEPredicate = [HasMSA];
}
class ASE_MSA_NOT_MSA64 {
- list<Predicate> InsnPredicates = [HasMSA, NotMips64];
+ list<Predicate> ASEPredicate = [HasMSA, NotMips64];
}
class ASE_MSA64 {
- list<Predicate> InsnPredicates = [HasMSA, HasMips64];
+ list<Predicate> ASEPredicate = [HasMSA, HasMips64];
}
class ASE_MT {
- list <Predicate> InsnPredicates = [HasMT];
+ list <Predicate> ASEPredicate = [HasMT];
+}
+
+class ASE_CRC {
+ list <Predicate> ASEPredicate = [HasCRC];
+}
+
+class ASE_VIRT {
+ list <Predicate> ASEPredicate = [HasVirt];
+}
+
+class ASE_GINV {
+ list <Predicate> ASEPredicate = [HasGINV];
}
// Class used for separating microMIPSr6 and microMIPS (r3) instructions.
// It can be used only on instructions that don't inherit PredicateControl.
class ISA_MICROMIPS_NOT_32R6 : PredicateControl {
- let InsnPredicates = [InMicroMips, NotMips32r6];
+ let InsnPredicates = [NotMips32r6];
+ let EncodingPredicates = [InMicroMips];
}
class ASE_NOT_DSP {
- list<Predicate> InsnPredicates = [NotDSP];
+ list<Predicate> ASEPredicate = [NotDSP];
}
class MADD4 {
@@ -413,11 +495,13 @@ class ABI_NOT_N64 {
list<Predicate> AdditionalPredicates = [IsNotN64];
}
+class FPOP_FUSION_FAST {
+ list <Predicate> AdditionalPredicates = [AllowFPOpFusion];
+}
+
//===----------------------------------------------------------------------===//
-class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
- let EncodingPredicates = [HasStdEnc];
-}
+class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl;
class MipsInstAlias<string Asm, dag Result, bit Emit = 0b1> :
InstAlias<Asm, Result, Emit>, PredicateControl;
@@ -1047,6 +1131,15 @@ def MipsMemSimm16AsmOperand : AsmOperandClass {
let DiagnosticType = "MemSImm16";
}
+def MipsMemSimmPtrAsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimmPtr";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithPtrSizeOffset";
+ let DiagnosticType = "MemSImmPtr";
+}
+
def MipsInvertedImmoperand : AsmOperandClass {
let Name = "InvNum";
let RenderMethod = "addImmOperands";
@@ -1120,6 +1213,10 @@ def mem_simm16 : mem_generic {
let ParserMatchClass = MipsMemSimm16AsmOperand;
}
+def mem_simmptr : mem_generic {
+ let ParserMatchClass = MipsMemSimmPtrAsmOperand;
+}
+
def mem_ea : Operand<iPTR> {
let PrintMethod = "printMemOperandEA";
let MIOperandInfo = (ops ptr_rc, simm16);
@@ -1336,6 +1433,7 @@ class LoadMemory<string opstr, DAGOperand RO, DAGOperand MO,
[(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
let canFoldAsLoad = 1;
+ string BaseOpcode = opstr;
let mayLoad = 1;
}
@@ -1349,6 +1447,7 @@ class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO,
InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
+ string BaseOpcode = opstr;
let mayStore = 1;
}
@@ -1366,6 +1465,7 @@ class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
[(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
let DecoderMethod = "DecodeMem";
string Constraints = "$src = $rt";
+ let BaseOpcode = opstr;
}
class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
@@ -1373,6 +1473,7 @@ class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
let DecoderMethod = "DecodeMem";
+ let BaseOpcode = opstr;
}
// COP2 Load/Store
@@ -1548,9 +1649,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
PseudoInstExpansion<(JumpInst RO:$rs)>;
}
-class BAL_BR_Pseudo<Instruction RealInst> :
- PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>,
- PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
+class BAL_BR_Pseudo<Instruction RealInst, DAGOperand opnd> :
+ PseudoSE<(outs), (ins opnd:$offset), [], II_BCCZAL>,
+ PseudoInstExpansion<(RealInst ZERO, opnd:$offset)> {
let isBranch = 1;
let isTerminator = 1;
let isBarrier = 1;
@@ -1592,8 +1693,8 @@ class SYNC_FT<string opstr> :
InstSE<(outs), (ins uimm5:$stype), "sync $stype",
[(MipsSync immZExt5:$stype)], II_SYNC, FrmOther, opstr>;
-class SYNCI_FT<string opstr> :
- InstSE<(outs), (ins mem_simm16:$addr), !strconcat(opstr, "\t$addr"), [],
+class SYNCI_FT<string opstr, DAGOperand MO> :
+ InstSE<(outs), (ins MO:$addr), !strconcat(opstr, "\t$addr"), [],
II_SYNCI, FrmOther, opstr> {
let hasSideEffects = 1;
let DecoderMethod = "DecodeSyncI";
@@ -1665,6 +1766,7 @@ class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
FrmR, opstr> {
let Uses = [UseReg];
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
@@ -1677,6 +1779,7 @@ class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
FrmR, opstr> {
let Defs = DefRegs;
let hasSideEffects = 0;
+ let isMoveReg = 1;
}
class EffectiveAddress<string opstr, RegisterOperand RO> :
@@ -1715,8 +1818,8 @@ class SubwordSwap<string opstr, RegisterOperand RO,
// Read Hardware
class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
- InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
- II_RDHWR, FrmR, "rdhwr">;
+ InstSE<(outs CPURegOperand:$rt), (ins RO:$rd, uimm8:$sel),
+ "rdhwr\t$rt, $rd, $sel", [], II_RDHWR, FrmR, "rdhwr">;
// Ext and Ins
class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
@@ -1725,7 +1828,7 @@ class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size),
!strconcat(opstr, "\t$rt, $rs, $pos, $size"),
[(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size))], II_EXT,
- FrmR, opstr>, ISA_MIPS32R2;
+ FrmR, opstr>;
// 'ins' and its 64-bit variants are matched by C++ code.
class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
@@ -1734,7 +1837,7 @@ class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
!strconcat(opstr, "\t$rt, $rs, $pos, $size"),
[(set RO:$rt, (null_frag RO:$rs, PosImm:$pos, SizeImm:$size,
RO:$src))],
- II_INS, FrmR, opstr>, ISA_MIPS32R2 {
+ II_INS, FrmR, opstr> {
let Constraints = "$src = $rt";
}
@@ -1743,11 +1846,37 @@ class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
[(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
+class Atomic2OpsPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr), []> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class Atomic2OpsSubwordPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr, RC:$mask, RC:$mask2,
+ RC:$shiftamnt), []>;
+
// Atomic Compare & Swap.
+// Atomic compare and swap is lowered into two stages. The first stage happens
+// during ISelLowering, which produces the PostRA version of this instruction.
class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
[(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
+class AtomicCmpSwapPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$cmp, RC:$swap), []> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class AtomicCmpSwapSubwordPostRA<RegisterClass RC> :
+ PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$mask, RC:$ShiftCmpVal,
+ RC:$mask2, RC:$ShiftNewVal, RC:$ShiftAmt), []> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+
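The *PostRA pseudos keep the whole ll/sc sequence opaque until after register allocation, so the allocator cannot insert spill code between the load-linked and the store-conditional. A behavioral model of the 32-bit compare-and-swap in plain C++ (std::atomic stands in for the eventual ll/sc loop; this is not the expansion pass itself):

#include <atomic>

// dst = *ptr; if (dst == cmp) *ptr = swap; retried while the conditional
// store fails spuriously, matching the ATOMIC_CMP_SWAP_I32 semantics.
static int cmpSwapModel(std::atomic<int> &Mem, int Cmp, int Swap) {
  int Observed = Cmp;
  while (!Mem.compare_exchange_weak(Observed, Swap) && Observed == Cmp) {
    // spurious failure: memory still holds Cmp, so retry the ll/sc loop
  }
  return Observed;
}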
class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[], II_LL, FrmI, opstr> {
@@ -1766,12 +1895,16 @@ class SCBase<string opstr, RegisterOperand RO> :
class MFC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
InstrItinClass itin> :
InstSE<(outs RO:$rt), (ins RD:$rd, uimm3:$sel),
- !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR> {
+ let BaseOpcode = asmstr;
+}
class MTC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
InstrItinClass itin> :
InstSE<(outs RO:$rd), (ins RD:$rt, uimm3:$sel),
- !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR> {
+ let BaseOpcode = asmstr;
+}
class TrapBase<Instruction RealInst>
: PseudoSE<(outs), (ins), [(trap)], II_TRAP>,
@@ -1829,8 +1962,36 @@ let usesCustomInserter = 1 in {
def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
+
}
+def ATOMIC_LOAD_ADD_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_ADD_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_ADD_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_SUB_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_SUB_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_SUB_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_AND_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_AND_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_AND_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_OR_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_OR_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_OR_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_XOR_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_XOR_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_XOR_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+def ATOMIC_LOAD_NAND_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_NAND_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_LOAD_NAND_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+
+def ATOMIC_SWAP_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_SWAP_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
+def ATOMIC_SWAP_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
+
+def ATOMIC_CMP_SWAP_I8_POSTRA : AtomicCmpSwapSubwordPostRA<GPR32>;
+def ATOMIC_CMP_SWAP_I16_POSTRA : AtomicCmpSwapSubwordPostRA<GPR32>;
+def ATOMIC_CMP_SWAP_I32_POSTRA : AtomicCmpSwapPostRA<GPR32>;
+
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC64 : Load<"", ACC64>;
@@ -1860,69 +2021,72 @@ def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
let AdditionalPredicates = [NotInMicroMips] in {
def ADDiu : MMRel, StdMMR6Rel, ArithLogicI<"addiu", simm16_relaxed, GPR32Opnd,
II_ADDIU, immSExt16, add>,
- ADDI_FM<0x9>, IsAsCheapAsAMove;
+ ADDI_FM<0x9>, IsAsCheapAsAMove, ISA_MIPS1;
def ANDi : MMRel, StdMMR6Rel,
ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16, and>,
- ADDI_FM<0xc>;
+ ADDI_FM<0xc>, ISA_MIPS1;
def ORi : MMRel, StdMMR6Rel,
ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16, or>,
- ADDI_FM<0xd>;
+ ADDI_FM<0xd>, ISA_MIPS1;
def XORi : MMRel, StdMMR6Rel,
ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16, xor>,
- ADDI_FM<0xe>;
-}
-def ADDi : MMRel, ArithLogicI<"addi", simm16_relaxed, GPR32Opnd, II_ADDI>, ADDI_FM<0x8>,
- ISA_MIPS1_NOT_32R6_64R6;
-let AdditionalPredicates = [NotInMicroMips] in {
+ ADDI_FM<0xe>, ISA_MIPS1;
+ def ADDi : MMRel, ArithLogicI<"addi", simm16_relaxed, GPR32Opnd, II_ADDI>,
+ ADDI_FM<0x8>, ISA_MIPS1_NOT_32R6_64R6;
def SLTi : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM<0xa>;
+ SLTI_FM<0xa>, ISA_MIPS1;
def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
- SLTI_FM<0xb>;
-}
-def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM;
-let AdditionalPredicates = [NotInMicroMips] in {
+ SLTI_FM<0xb>, ISA_MIPS1;
+
+ def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM,
+ ISA_MIPS1;
+
/// Arithmetic Instructions (3-Operand, R-Type)
def ADDu : MMRel, StdMMR6Rel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
- ADD_FM<0, 0x21>;
+ ADD_FM<0, 0x21>, ISA_MIPS1;
def SUBu : MMRel, StdMMR6Rel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
- ADD_FM<0, 0x23>;
-}
-let Defs = [HI0, LO0] in
-def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
- ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
-def ADD : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>, ADD_FM<0, 0x20>;
-def SUB : MMRel, StdMMR6Rel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>, ADD_FM<0, 0x22>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
- def SLTu : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
+ ADD_FM<0, 0x23>, ISA_MIPS1;
+
+ let Defs = [HI0, LO0] in
+ def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
+ ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
+
+ def ADD : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>,
+ ADD_FM<0, 0x20>, ISA_MIPS1;
+ def SUB : MMRel, StdMMR6Rel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>,
+ ADD_FM<0, 0x22>, ISA_MIPS1;
+
+ def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>,
+ ISA_MIPS1;
+ def SLTu : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>,
+ ISA_MIPS1;
def AND : MMRel, StdMMR6Rel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
- ADD_FM<0, 0x24>;
+ ADD_FM<0, 0x24>, ISA_MIPS1;
def OR : MMRel, StdMMR6Rel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
- ADD_FM<0, 0x25>;
+ ADD_FM<0, 0x25>, ISA_MIPS1;
def XOR : MMRel, StdMMR6Rel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
- ADD_FM<0, 0x26>;
- def NOR : MMRel, StdMMR6Rel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
+ ADD_FM<0, 0x26>, ISA_MIPS1;
+ def NOR : MMRel, StdMMR6Rel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>,
+ ISA_MIPS1;
}
-/// Shift Instructions
-let AdditionalPredicates = [NotInMicroMips] in {
-def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
- immZExt5>, SRA_FM<0, 0>;
-def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
- immZExt5>, SRA_FM<2, 0>;
-def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
- immZExt5>, SRA_FM<3, 0>;
-def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
- SRLV_FM<4, 0>;
-def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
- SRLV_FM<6, 0>;
-def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
- SRLV_FM<7, 0>;
-}
-
-// Rotate Instructions
let AdditionalPredicates = [NotInMicroMips] in {
+ /// Shift Instructions
+ def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
+ immZExt5>, SRA_FM<0, 0>, ISA_MIPS1;
+ def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
+ immZExt5>, SRA_FM<2, 0>, ISA_MIPS1;
+ def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
+ immZExt5>, SRA_FM<3, 0>, ISA_MIPS1;
+ def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
+ SRLV_FM<4, 0>, ISA_MIPS1;
+ def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
+ SRLV_FM<6, 0>, ISA_MIPS1;
+ def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
+ SRLV_FM<7, 0>, ISA_MIPS1;
+
+ // Rotate Instructions
def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
immZExt5>,
SRA_FM<2, 1>, ISA_MIPS32R2;
@@ -1932,39 +2096,35 @@ let AdditionalPredicates = [NotInMicroMips] in {
/// Load and Store Instructions
/// aligned
-def LB : LoadMemory<"lb", GPR32Opnd, mem_simm16, sextloadi8, II_LB>, MMRel,
- LW_FM<0x20>;
-def LBu : LoadMemory<"lbu", GPR32Opnd, mem_simm16, zextloadi8, II_LBU,
- addrDefault>, MMRel, LW_FM<0x24>;
let AdditionalPredicates = [NotInMicroMips] in {
- def LH : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
- addrDefault>, MMRel, LW_FM<0x21>;
- def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
- MMRel, LW_FM<0x25>;
+ def LB : LoadMemory<"lb", GPR32Opnd, mem_simmptr, sextloadi8, II_LB>, MMRel,
+ LW_FM<0x20>, ISA_MIPS1;
+ def LBu : LoadMemory<"lbu", GPR32Opnd, mem_simmptr, zextloadi8, II_LBU,
+ addrDefault>, MMRel, LW_FM<0x24>, ISA_MIPS1;
+ def LH : LoadMemory<"lh", GPR32Opnd, mem_simmptr, sextloadi16, II_LH,
+ addrDefault>, MMRel, LW_FM<0x21>, ISA_MIPS1;
+ def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simmptr, zextloadi16, II_LHU>,
+ MMRel, LW_FM<0x25>, ISA_MIPS1;
def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
- LW_FM<0x23>;
-}
-def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
- LW_FM<0x28>;
-def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
-let AdditionalPredicates = [NotInMicroMips] in {
-def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
+ LW_FM<0x23>, ISA_MIPS1;
+ def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
+ LW_FM<0x28>, ISA_MIPS1;
+ def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>,
+ ISA_MIPS1;
+ def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>, ISA_MIPS1;
}
/// load/store left/right
-let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
- AdditionalPredicates = [NotInMicroMips] in {
-def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWL : MMRel, LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
ISA_MIPS1_NOT_32R6_64R6;
-def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
+def LWR : MMRel, LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
ISA_MIPS1_NOT_32R6_64R6;
-def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
+def SWL : MMRel, StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
ISA_MIPS1_NOT_32R6_64R6;
-def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
+def SWR : MMRel, StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
ISA_MIPS1_NOT_32R6_64R6;
-}
-let AdditionalPredicates = [NotInMicroMips] in {
// COP2 Memory Instructions
def LWC2 : StdMMR6Rel, LW_FT2<"lwc2", COP2Opnd, II_LWC2, load>, LW_FM<0x32>,
ISA_MIPS1_NOT_32R6_64R6;
@@ -1977,63 +2137,68 @@ def SDC2 : StdMMR6Rel, SW_FT2<"sdc2", COP2Opnd, II_SDC2, store>,
// COP3 Memory Instructions
let DecoderNamespace = "COP3_" in {
- def LWC3 : LW_FT3<"lwc3", COP3Opnd, II_LWC3, load>, LW_FM<0x33>;
- def SWC3 : SW_FT3<"swc3", COP3Opnd, II_SWC3, store>, LW_FM<0x3b>;
+ def LWC3 : LW_FT3<"lwc3", COP3Opnd, II_LWC3, load>, LW_FM<0x33>,
+ ISA_MIPS1_NOT_32R6_64R6, NOT_ASE_CNMIPS;
+ def SWC3 : SW_FT3<"swc3", COP3Opnd, II_SWC3, store>, LW_FM<0x3b>,
+ ISA_MIPS1_NOT_32R6_64R6, NOT_ASE_CNMIPS;
def LDC3 : LW_FT3<"ldc3", COP3Opnd, II_LDC3, load>, LW_FM<0x37>,
- ISA_MIPS2;
+ ISA_MIPS2, NOT_ASE_CNMIPS;
def SDC3 : SW_FT3<"sdc3", COP3Opnd, II_SDC3, store>, LW_FM<0x3f>,
- ISA_MIPS2;
+ ISA_MIPS2, NOT_ASE_CNMIPS;
}
def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS2;
- def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
+ def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci", mem_simm16>, SYNCI_FM,
+ ISA_MIPS32R2;
}
let AdditionalPredicates = [NotInMicroMips] in {
- def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm10, II_TEQ>, TEQ_FM<0x34>, ISA_MIPS2;
- def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm10, II_TGE>, TEQ_FM<0x30>, ISA_MIPS2;
- def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm10, II_TGEU>, TEQ_FM<0x31>, ISA_MIPS2;
- def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm10, II_TLT>, TEQ_FM<0x32>, ISA_MIPS2;
- def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm10, II_TLTU>, TEQ_FM<0x33>, ISA_MIPS2;
- def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm10, II_TNE>, TEQ_FM<0x36>, ISA_MIPS2;
+ def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm10, II_TEQ>, TEQ_FM<0x34>,
+ ISA_MIPS2;
+ def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm10, II_TGE>, TEQ_FM<0x30>,
+ ISA_MIPS2;
+ def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm10, II_TGEU>, TEQ_FM<0x31>,
+ ISA_MIPS2;
+ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm10, II_TLT>, TEQ_FM<0x32>,
+ ISA_MIPS2;
+ def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm10, II_TLTU>, TEQ_FM<0x33>,
+ ISA_MIPS2;
+ def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm10, II_TNE>, TEQ_FM<0x36>,
+ ISA_MIPS2;
+
+ def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM<0xc>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM<0x8>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>, TEQI_FM<0x9>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM<0xa>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>, TEQI_FM<0xb>,
+ ISA_MIPS2_NOT_32R6_64R6;
+ def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM<0xe>,
+ ISA_MIPS2_NOT_32R6_64R6;
}
-def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM<0xc>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM<0x8>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>, TEQI_FM<0x9>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM<0xa>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>, TEQI_FM<0xb>,
- ISA_MIPS2_NOT_32R6_64R6;
-def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM<0xe>,
- ISA_MIPS2_NOT_32R6_64R6;
-
-let AdditionalPredicates = [NotInMicroMips] in {
-def BREAK : MMRel, StdMMR6Rel, BRK_FT<"break">, BRK_FM<0xd>;
-def SYSCALL : MMRel, SYS_FT<"syscall", uimm20, II_SYSCALL>, SYS_FM<0xc>;
-}
-def TRAP : TrapBase<BREAK>;
let AdditionalPredicates = [NotInMicroMips] in {
-def SDBBP : MMRel, SYS_FT<"sdbbp", uimm20, II_SDBBP>, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
-}
+ def BREAK : MMRel, StdMMR6Rel, BRK_FT<"break">, BRK_FM<0xd>, ISA_MIPS1;
+ def SYSCALL : MMRel, SYS_FT<"syscall", uimm20, II_SYSCALL>, SYS_FM<0xc>,
+ ISA_MIPS1;
+ def TRAP : TrapBase<BREAK>, ISA_MIPS1;
+ def SDBBP : MMRel, SYS_FT<"sdbbp", uimm20, II_SDBBP>, SDBBP_FM,
+ ISA_MIPS32_NOT_32R6_64R6;
-let AdditionalPredicates = [NotInMicroMips] in {
def ERET : MMRel, ER_FT<"eret", II_ERET>, ER_FM<0x18, 0x0>, INSN_MIPS3_32;
- def ERETNC : MMRel, ER_FT<"eretnc", II_ERETNC>, ER_FM<0x18, 0x1>, ISA_MIPS32R5;
+ def ERETNC : MMRel, ER_FT<"eretnc", II_ERETNC>, ER_FM<0x18, 0x1>,
+ ISA_MIPS32R5;
def DERET : MMRel, ER_FT<"deret", II_DERET>, ER_FM<0x1f, 0x0>, ISA_MIPS32;
-}
-let AdditionalPredicates = [NotInMicroMips] in {
- def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM<1>, ISA_MIPS32R2;
- def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM<0>, ISA_MIPS32R2;
-}
+ def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM<1>,
+ ISA_MIPS32R2;
+ def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM<0>,
+ ISA_MIPS32R2;
-let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
- AdditionalPredicates = [NotInMicroMips] in {
-def WAIT : WAIT_FT<"wait">, WAIT_FM;
+ def WAIT : MMRel, StdMMR6Rel, WAIT_FT<"wait">, WAIT_FM, INSN_MIPS3_32;
}
let AdditionalPredicates = [NotInMicroMips] in {
@@ -2041,56 +2206,63 @@ let AdditionalPredicates = [NotInMicroMips] in {
def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
}
-
/// Jump and Branch Instructions
+let AdditionalPredicates = [NotInMicroMips, RelocNotPIC] in
def J : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
- AdditionalRequires<[RelocNotPIC, NotInMicroMips]>, IsBranch;
-def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>, ISA_MIPS1_NOT_32R6_64R6;
-def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+ IsBranch, ISA_MIPS1;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>, ISA_MIPS1_NOT_32R6_64R6;
+def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>,
+ ISA_MIPS1;
def BEQL : MMRel, CBranchLikely<"beql", brtarget, GPR32Opnd>,
BEQ_FM<20>, ISA_MIPS2_NOT_32R6_64R6;
-def BNE : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BNE : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>,
+ ISA_MIPS1;
def BNEL : MMRel, CBranchLikely<"bnel", brtarget, GPR32Opnd>,
BEQ_FM<21>, ISA_MIPS2_NOT_32R6_64R6;
def BGEZ : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
- BGEZ_FM<1, 1>;
+ BGEZ_FM<1, 1>, ISA_MIPS1;
def BGEZL : MMRel, CBranchZeroLikely<"bgezl", brtarget, GPR32Opnd>,
BGEZ_FM<1, 3>, ISA_MIPS2_NOT_32R6_64R6;
def BGTZ : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
- BGEZ_FM<7, 0>;
+ BGEZ_FM<7, 0>, ISA_MIPS1;
def BGTZL : MMRel, CBranchZeroLikely<"bgtzl", brtarget, GPR32Opnd>,
BGEZ_FM<23, 0>, ISA_MIPS2_NOT_32R6_64R6;
def BLEZ : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
- BGEZ_FM<6, 0>;
+ BGEZ_FM<6, 0>, ISA_MIPS1;
def BLEZL : MMRel, CBranchZeroLikely<"blezl", brtarget, GPR32Opnd>,
BGEZ_FM<22, 0>, ISA_MIPS2_NOT_32R6_64R6;
def BLTZ : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
- BGEZ_FM<1, 0>;
+ BGEZ_FM<1, 0>, ISA_MIPS1;
def BLTZL : MMRel, CBranchZeroLikely<"bltzl", brtarget, GPR32Opnd>,
BGEZ_FM<1, 2>, ISA_MIPS2_NOT_32R6_64R6;
-def B : UncondBranch<BEQ, brtarget>,
- AdditionalRequires<[NotInMicroMips]>;
+def B : UncondBranch<BEQ, brtarget>, ISA_MIPS1;
+
+def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>, ISA_MIPS1;
-def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
-let AdditionalPredicates = [NotInMicroMips, NoIndirectJumpGuards] in {
- def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
- def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
}
-def JALX : MMRel, JumpLink<"jalx", calltarget>, FJ<0x1D>,
- ISA_MIPS32_NOT_32R6_64R6;
-def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
- ISA_MIPS1_NOT_32R6_64R6;
-def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd>,
- BGEZAL_FM<0x13>, ISA_MIPS2_NOT_32R6_64R6;
-def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
- ISA_MIPS1_NOT_32R6_64R6;
-def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd>,
- BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
-def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
+let AdditionalPredicates = [NotInMicroMips, NoIndirectJumpGuards] in {
+ def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM, ISA_MIPS1;
+ def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>, ISA_MIPS1;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def JALX : MMRel, JumpLink<"jalx", calltarget>, FJ<0x1D>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd>,
+ BGEZAL_FM<0x13>, ISA_MIPS2_NOT_32R6_64R6;
+ def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd>,
+ BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
+ def BAL_BR : BAL_BR_Pseudo<BGEZAL, brtarget>, ISA_MIPS1;
+}
let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips] in {
- def TAILCALL : TailCall<J, jmptarget>;
+ def TAILCALL : TailCall<J, jmptarget>, ISA_MIPS1;
}
let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
NoIndirectJumpGuards] in
@@ -2155,64 +2327,61 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1, isCTI = 1 in
}
/// Multiply and Divide Instructions.
-def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
-def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
let AdditionalPredicates = [NotInMicroMips] in {
+ def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
+ def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
-}
-def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
- ISA_MIPS1_NOT_32R6_64R6;
-def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
- ISA_MIPS1_NOT_32R6_64R6;
-let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
- AdditionalPredicates = [NotInMicroMips] in {
-def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
- ISA_MIPS1_NOT_32R6_64R6;
-def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
- ISA_MIPS1_NOT_32R6_64R6;
-}
+ def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
+ ISA_MIPS1_NOT_32R6_64R6;
-/// Sign Ext In Register Instructions.
-def SEB : MMRel, StdMMR6Rel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
- SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
-def SEH : MMRel, StdMMR6Rel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
- SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
+ /// Sign Ext In Register Instructions.
+ def SEB : MMRel, StdMMR6Rel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+ SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
+ def SEH : MMRel, StdMMR6Rel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+ SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
-/// Count Leading
-def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM<0x20>,
- ISA_MIPS32_NOT_32R6_64R6;
-def CLO : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM<0x21>,
- ISA_MIPS32_NOT_32R6_64R6;
+ /// Count Leading
+ def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM<0x20>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def CLO : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM<0x21>,
+ ISA_MIPS32_NOT_32R6_64R6;
-let AdditionalPredicates = [NotInMicroMips] in {
/// Word Swap Bytes Within Halfwords
def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>,
ISA_MIPS32R2;
-}
-/// No operation.
-def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
+ /// No operation.
+ def NOP : PseudoSE<(outs), (ins), []>,
+ PseudoInstExpansion<(SLL ZERO, ZERO, 0)>, ISA_MIPS1;
-// FrameIndexes are legalized when they are operands from load/store
-// instructions. The same not happens for stack address copies, so an
-// add op with mem ComplexPattern is used and the stack address copy
-// can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
+ // FrameIndexes are legalized as operands of load/store instructions, but
+ // the same does not happen for stack address copies, so an add op with a
+ // mem ComplexPattern is used so that the stack address copy can be
+ // matched. It's similar to Sparc's LEA_ADDRi.
+ let AdditionalPredicates = [NotInMicroMips] in
+ def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>, ISA_MIPS1;
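Concretely, the case described above is taking the address of a stack object rather than loading through it: the copy is selected via LEA_ADDiu and, once frame indexes are eliminated, becomes a plain addiu off the stack (or frame) pointer. A tiny illustration under that assumption:

// Conceptual result of LEA_ADDiu after frame-index elimination:
//   addiu $dst, $sp, <offset of the slot>
static void *stackSlotAddress(char *StackPointer, int ByteOffset) {
  return StackPointer + ByteOffset;
}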
-// MADD*/MSUB*
-def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
- ISA_MIPS32_NOT_32R6_64R6;
-def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
- ISA_MIPS32_NOT_32R6_64R6;
-def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
- ISA_MIPS32_NOT_32R6_64R6;
-def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
- ISA_MIPS32_NOT_32R6_64R6;
+ // MADD*/MSUB*
+ def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
+ ISA_MIPS32_NOT_32R6_64R6;
+ def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
+ ISA_MIPS32_NOT_32R6_64R6;
+}
let AdditionalPredicates = [NotDSP] in {
def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>,
@@ -2237,35 +2406,39 @@ let AdditionalPredicates = [NotInMicroMips] in {
0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
- def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+ def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM, ISA_MIPS1;
// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
def EXT : MMRel, StdMMR6Rel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
immZExt5, immZExt5Plus1, MipsExt>,
- EXT_FM<0>;
+ EXT_FM<0>, ISA_MIPS32R2;
def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5,
uimm5_inssize_plus1, immZExt5,
immZExt5Plus1>,
- EXT_FM<4>;
+ EXT_FM<4>, ISA_MIPS32R2;
}
/// Move Control Registers From/To CPU Registers
let AdditionalPredicates = [NotInMicroMips] in {
- def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd, II_MTC0>, MFC3OP_FM<0x10, 4>,
- ISA_MIPS32;
- def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd, II_MFC0>, MFC3OP_FM<0x10, 0>,
- ISA_MIPS32;
+ def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd, II_MTC0>,
+ MFC3OP_FM<0x10, 4, 0>, ISA_MIPS1;
+ def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd, II_MFC0>,
+ MFC3OP_FM<0x10, 0, 0>, ISA_MIPS1;
+ def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd, II_MFC2>,
+ MFC3OP_FM<0x12, 0, 0>, ISA_MIPS1;
+ def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd, II_MTC2>,
+ MFC3OP_FM<0x12, 4, 0>, ISA_MIPS1;
}
-def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd, II_MFC2>, MFC3OP_FM<0x12, 0>;
-def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd, II_MTC2>, MFC3OP_FM<0x12, 4>;
class Barrier<string asmstr, InstrItinClass itin = NoItinerary> :
InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM<1>,
+ ISA_MIPS1;
+ def EHB : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM<3>, ISA_MIPS1;
-def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM<1>;
-def EHB : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM<3>;
-
-let isCTI = 1 in
-def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause", II_PAUSE>, BARRIER_FM<5>,
- ISA_MIPS32R2;
+ let isCTI = 1 in
+ def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause", II_PAUSE>, BARRIER_FM<5>,
+ ISA_MIPS32R2;
+}
// JR_HB and JALR_HB are defined here using the new style naming
// scheme because some of this code is shared with Mips32r6InstrInfo.td
@@ -2324,10 +2497,10 @@ let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
class TLB<string asmstr, InstrItinClass itin = NoItinerary> :
InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
let AdditionalPredicates = [NotInMicroMips] in {
-def TLBP : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM<0x08>;
-def TLBR : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM<0x01>;
-def TLBWI : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM<0x02>;
-def TLBWR : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM<0x06>;
+ def TLBP : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM<0x08>, ISA_MIPS1;
+ def TLBR : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM<0x01>, ISA_MIPS1;
+ def TLBWI : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM<0x02>, ISA_MIPS1;
+ def TLBWR : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM<0x06>, ISA_MIPS1;
}
class CacheOp<string instr_asm, Operand MemOpnd,
InstrItinClass itin = NoItinerary> :
@@ -2337,11 +2510,13 @@ class CacheOp<string instr_asm, Operand MemOpnd,
let DecoderMethod = "DecodeCacheOp";
}
-def CACHE : MMRel, CacheOp<"cache", mem, II_CACHE>, CACHEOP_FM<0b101111>,
- INSN_MIPS3_32_NOT_32R6_64R6;
-def PREF : MMRel, CacheOp<"pref", mem, II_PREF>, CACHEOP_FM<0b110011>,
- INSN_MIPS3_32_NOT_32R6_64R6;
-
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CACHE : MMRel, CacheOp<"cache", mem, II_CACHE>, CACHEOP_FM<0b101111>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+ def PREF : MMRel, CacheOp<"pref", mem, II_PREF>, CACHEOP_FM<0b110011>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+}
+// FIXME: We are missing the prefx instruction.
def ROL : MipsAsmPseudoInst<(outs),
(ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
"rol\t$rs, $rt, $rd">;
@@ -2418,6 +2593,38 @@ def MULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
"mulou\t$rd, $rs, $rt">,
ISA_MIPS1_NOT_32R6_64R6;
+// Virtualization ASE
+class HYPCALL_FT<string opstr> :
+ InstSE<(outs), (ins uimm10:$code_),
+ !strconcat(opstr, "\t$code_"), [], II_HYPCALL, FrmOther, opstr> {
+ let BaseOpcode = opstr;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MFGC0 : MMRel, MFC3OP<"mfgc0", GPR32Opnd, COP0Opnd, II_MFGC0>,
+ MFC3OP_FM<0x10, 3, 0>, ISA_MIPS32R5, ASE_VIRT;
+ def MTGC0 : MMRel, MTC3OP<"mtgc0", COP0Opnd, GPR32Opnd, II_MTGC0>,
+ MFC3OP_FM<0x10, 3, 2>, ISA_MIPS32R5, ASE_VIRT;
+ def MFHGC0 : MMRel, MFC3OP<"mfhgc0", GPR32Opnd, COP0Opnd, II_MFHGC0>,
+ MFC3OP_FM<0x10, 3, 4>, ISA_MIPS32R5, ASE_VIRT;
+ def MTHGC0 : MMRel, MTC3OP<"mthgc0", COP0Opnd, GPR32Opnd, II_MTHGC0>,
+ MFC3OP_FM<0x10, 3, 6>, ISA_MIPS32R5, ASE_VIRT;
+ def TLBGINV : MMRel, TLB<"tlbginv", II_TLBGINV>, COP0_TLB_FM<0b001011>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGINVF : MMRel, TLB<"tlbginvf", II_TLBGINVF>, COP0_TLB_FM<0b001100>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGP : MMRel, TLB<"tlbgp", II_TLBGP>, COP0_TLB_FM<0b010000>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGR : MMRel, TLB<"tlbgr", II_TLBGR>, COP0_TLB_FM<0b001001>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGWI : MMRel, TLB<"tlbgwi", II_TLBGWI>, COP0_TLB_FM<0b001010>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def TLBGWR : MMRel, TLB<"tlbgwr", II_TLBGWR>, COP0_TLB_FM<0b001110>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def HYPCALL : MMRel, HYPCALL_FT<"hypcall">,
+ HYPCALL_FM<0b101000>, ISA_MIPS32R5, ASE_VIRT;
+}
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
@@ -2436,94 +2643,111 @@ multiclass OneOrTwoOperandMacroImmediateAlias<string Memnomic,
Imm:$imm), 0>;
}
-def : MipsInstAlias<"move $dst, $src",
- (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
- GPR_32 {
- let AdditionalPredicates = [NotInMicroMips];
-}
-def : MipsInstAlias<"move $dst, $src",
- (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
- GPR_32 {
- let AdditionalPredicates = [NotInMicroMips];
-}
-def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-
-def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
-let Predicates = [NotInMicroMips] in {
-def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
-}
-def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>,
- ISA_MIPS32;
-def : MipsInstAlias<"neg $rt, $rs",
- (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
-def : MipsInstAlias<"neg $rt",
- (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
-def : MipsInstAlias<"negu $rt, $rs",
- (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
-def : MipsInstAlias<"negu $rt",
- (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"move $dst, $src",
+ (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+ GPR_32, ISA_MIPS1;
+ def : MipsInstAlias<"move $dst, $src",
+ (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+ GPR_32, ISA_MIPS1;
+
+ def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 1>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
+ def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>, ISA_MIPS1;
+
+ def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+
+ def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>,
+ ISA_MIPS32;
+
+ def : MipsInstAlias<"neg $rt, $rs",
+ (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"neg $rt",
+ (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"negu $rt, $rs",
+ (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"negu $rt",
+ (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>, ISA_MIPS1;
def : MipsInstAlias<
"sgt $rd, $rs, $rt",
- (SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgt $rs, $rt",
- (SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgtu $rd, $rs, $rt",
- (SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgtu $$rs, $rt",
- (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"not $rt, $rs",
- (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+ (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, ISA_MIPS1;
def : MipsInstAlias<
"not $rt",
- (NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
- def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi>, ISA_MIPS1_NOT_32R6_64R6;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu>;
+ (NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>, ISA_MIPS1;
- defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi>, GPR_32;
+ def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>, ISA_MIPS1;
- defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi>, GPR_32;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi>, GPR_32;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi>, GPR_32;
-
- defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu>, GPR_32;
-}
-def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
-def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
-def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, COP2Opnd:$rd, 0), 0>;
-def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 COP2Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
-let AdditionalPredicates = [NotInMicroMips] in {
-def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
-}
-def : MipsInstAlias<"bnez $rs,$offset",
- (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : MipsInstAlias<"bnezl $rs,$offset",
- (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : MipsInstAlias<"beqz $rs,$offset",
- (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : MipsInstAlias<"beqzl $rs,$offset",
- (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
-}
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi>, ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
-def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
-let AdditionalPredicates = [NotInMicroMips] in {
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu>, ISA_MIPS1;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi>, ISA_MIPS1, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu>, ISA_MIPS1, GPR_32;
+
+ def : MipsInstAlias<"mfgc0 $rt, $rd",
+ (MFGC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mtgc0 $rt, $rd",
+ (MTGC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mfhgc0 $rt, $rd",
+ (MFHGC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mthgc0 $rt, $rd",
+ (MTHGC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS32R5, ASE_VIRT;
+ def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, COP2Opnd:$rd, 0), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 COP2Opnd:$rd, GPR32Opnd:$rt, 0), 0>,
+ ISA_MIPS1;
+
+ def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1;
+
+ def : MipsInstAlias<"bnez $rs,$offset",
+ (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"bnezl $rs,$offset",
+ (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS2;
+ def : MipsInstAlias<"beqz $rs,$offset",
+ (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1;
+ def : MipsInstAlias<"beqzl $rs,$offset",
+ (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ ISA_MIPS2;
+
+ def : MipsInstAlias<"syscall", (SYSCALL 0), 1>, ISA_MIPS1;
+
+ def : MipsInstAlias<"break", (BREAK 0, 0), 1>, ISA_MIPS1;
+ def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>, ISA_MIPS1;
def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2;
def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2;
-}
-let AdditionalPredicates = [NotInMicroMips] in {
+
def : MipsInstAlias<"teq $rs, $rt",
(TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
def : MipsInstAlias<"tge $rs, $rt",
@@ -2536,6 +2760,9 @@ let AdditionalPredicates = [NotInMicroMips] in {
(TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
def : MipsInstAlias<"tne $rs, $rt",
(TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"rdhwr $rt, $rs",
+ (RDHWR GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, ISA_MIPS1;
+
}
def : MipsInstAlias<"sub, $rd, $rs, $imm",
(ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
@@ -2567,8 +2794,8 @@ let AdditionalPredicates = [NotInMicroMips] in {
ISA_MIPS32R2;
}
def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
-def : MipsInstAlias<"sync",
- (SYNC 0), 1>, ISA_MIPS2;
+let AdditionalPredicates = [NotInMicroMips] in
+ def : MipsInstAlias<"sync", (SYNC 0), 1>, ISA_MIPS2;
def : MipsInstAlias<"mulo $rs, $rt",
(MULOMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
@@ -2577,6 +2804,9 @@ def : MipsInstAlias<"mulou $rs, $rt",
(MULOUMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
ISA_MIPS1_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in
+ def : MipsInstAlias<"hypcall", (HYPCALL 0), 1>, ISA_MIPS32R5, ASE_VIRT;
+
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -2714,6 +2944,36 @@ def : MipsInstAlias<"divu $rd, $imm", (UDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
simm32:$imm), 0>,
ISA_MIPS1_NOT_32R6_64R6;
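+// Pseudo instructions for the "rem" and "remu" assembler macros (signed and
+// unsigned remainder); they are only available prior to MIPS32r6/MIPS64r6.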
+def SRemMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "rem\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def SRemIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32_relaxed:$imm),
+ "rem\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def URemMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "remu\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def URemIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32_relaxed:$imm),
+ "remu\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+
+def : MipsInstAlias<"rem $rt, $rs", (SRemMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"rem $rd, $imm", (SRemIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"remu $rt, $rs", (URemMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"remu $rd, $imm", (URemIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
"ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
@@ -2768,17 +3028,17 @@ def : MipsPat<(VT immSExt16:$imm), (ADDiuOp ZEROReg, imm:$imm)>;
}
let AdditionalPredicates = [NotInMicroMips] in
- defm : MaterializeImms<i32, ZERO, ADDiu, LUi, ORi>;
+ defm : MaterializeImms<i32, ZERO, ADDiu, LUi, ORi>, ISA_MIPS1;
// Carry MipsPatterns
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
- (SUBu GPR32:$lhs, GPR32:$rhs)>;
+ (SUBu GPR32:$lhs, GPR32:$rhs)>, ISA_MIPS1;
}
def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
- (ADDu GPR32:$lhs, GPR32:$rhs)>, ASE_NOT_DSP;
+ (ADDu GPR32:$lhs, GPR32:$rhs)>, ISA_MIPS1, ASE_NOT_DSP;
def : MipsPat<(addc GPR32:$src, immSExt16:$imm),
- (ADDiu GPR32:$src, imm:$imm)>, ASE_NOT_DSP;
+ (ADDiu GPR32:$src, imm:$imm)>, ISA_MIPS1, ASE_NOT_DSP;
// Support multiplication for pre-Mips32 targets that don't have
// the MUL instruction.
@@ -2792,16 +3052,16 @@ def : MipsPat<(MipsSync (i32 immz)),
// Call
def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
- (JAL texternalsym:$dst)>;
+ (JAL texternalsym:$dst)>, ISA_MIPS1;
//def : MipsPat<(MipsJmpLink GPR32:$dst),
// (JALR GPR32:$dst)>;
// Tail call
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
- (TAILCALL tglobaladdr:$dst)>;
+ (TAILCALL tglobaladdr:$dst)>, ISA_MIPS1;
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
- (TAILCALL texternalsym:$dst)>;
+ (TAILCALL texternalsym:$dst)>, ISA_MIPS1;
}
// hi/lo relocs
multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
@@ -2810,7 +3070,6 @@ multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
def : MipsPat<(MipsHi tblockaddress:$in), (Lui tblockaddress:$in)>;
def : MipsPat<(MipsHi tjumptable:$in), (Lui tjumptable:$in)>;
def : MipsPat<(MipsHi tconstpool:$in), (Lui tconstpool:$in)>;
- def : MipsPat<(MipsHi tglobaltlsaddr:$in), (Lui tglobaltlsaddr:$in)>;
def : MipsPat<(MipsHi texternalsym:$in), (Lui texternalsym:$in)>;
def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>;
@@ -2834,44 +3093,47 @@ multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
(Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>;
}
-defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>;
+// wrapper_pic
+class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+ MipsPat<(MipsWrapper RC:$gp, node:$in), (ADDiuOp RC:$gp, node:$in)>;
-def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
-def : MipsPat<(MipsGotHi texternalsym:$in), (LUi texternalsym:$in)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>, ISA_MIPS1;
-// gp_rel relocs
-def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
- (ADDiu GPR32:$gp, tglobaladdr:$in)>, ABI_NOT_N64;
-def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
- (ADDiu GPR32:$gp, tconstpool:$in)>, ABI_NOT_N64;
+ def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi tglobaladdr:$in)>, ISA_MIPS1;
+ def : MipsPat<(MipsGotHi texternalsym:$in), (LUi texternalsym:$in)>,
+ ISA_MIPS1;
-// wrapper_pic
-class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
- MipsPat<(MipsWrapper RC:$gp, node:$in),
- (ADDiuOp RC:$gp, node:$in)>;
+ def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>,
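+// A PartialMapping is {StartIdx, Length, RegBank}: the single GPR entry maps
+// bits [0, 32) onto the GPRB register bank.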
+ ISA_MIPS1;
-def : WrapperPat<tglobaladdr, ADDiu, GPR32>;
-def : WrapperPat<tconstpool, ADDiu, GPR32>;
-def : WrapperPat<texternalsym, ADDiu, GPR32>;
-def : WrapperPat<tblockaddress, ADDiu, GPR32>;
-def : WrapperPat<tjumptable, ADDiu, GPR32>;
-def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>;
+ // gp_rel relocs
+ def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+ (ADDiu GPR32:$gp, tglobaladdr:$in)>, ISA_MIPS1, ABI_NOT_N64;
+ def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+ (ADDiu GPR32:$gp, tconstpool:$in)>, ISA_MIPS1, ABI_NOT_N64;
-let AdditionalPredicates = [NotInMicroMips] in {
-// Mips does not have "not", so we expand our way
-def : MipsPat<(not GPR32:$in),
- (NOR GPR32Opnd:$in, ZERO)>;
+ def : WrapperPat<tglobaladdr, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tconstpool, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<texternalsym, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tblockaddress, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tjumptable, ADDiu, GPR32>, ISA_MIPS1;
+ def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>, ISA_MIPS1;
+
+ // Mips does not have a "not" instruction; it is expanded as a NOR with $zero.
+ def : MipsPat<(not GPR32:$in),
+ (NOR GPR32Opnd:$in, ZERO)>, ISA_MIPS1;
}
// extended loads
-def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
-def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
-}
+ def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>, ISA_MIPS1;
+ def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>, ISA_MIPS1;
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>, ISA_MIPS1;
-// peepholes
-def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+ // peepholes
+ def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>, ISA_MIPS1;
+}
// brcond patterns
multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BEQOp1,
@@ -2905,12 +3167,13 @@ def : MipsPat<(brcond RC:$cond, bb:$dst),
(BNEOp RC:$cond, ZEROReg, bb:$dst)>;
}
let AdditionalPredicates = [NotInMicroMips] in {
- defm : BrcondPats<GPR32, BEQ, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+ defm : BrcondPats<GPR32, BEQ, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>,
+ ISA_MIPS1;
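+ // For signed i32, x < 1 is x <= 0 and x > -1 is x >= 0, so these branches
+ // select directly to BLEZ/BGEZ.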
+ def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+ (BLEZ i32:$lhs, bb:$dst)>, ISA_MIPS1;
+ def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+ (BGEZ i32:$lhs, bb:$dst)>, ISA_MIPS1;
}
-def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
- (BLEZ i32:$lhs, bb:$dst)>;
-def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
- (BGEZ i32:$lhs, bb:$dst)>;
// setcc patterns
multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
@@ -2957,36 +3220,39 @@ multiclass SetgeImmPats<RegisterClass RC, Instruction XORiOp,
}
let AdditionalPredicates = [NotInMicroMips] in {
- defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>;
- defm : SetlePats<GPR32, XORi, SLT, SLTu>;
- defm : SetgtPats<GPR32, SLT, SLTu>;
- defm : SetgePats<GPR32, XORi, SLT, SLTu>;
- defm : SetgeImmPats<GPR32, XORi, SLTi, SLTiu>;
-}
+ defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>, ISA_MIPS1;
+ defm : SetlePats<GPR32, XORi, SLT, SLTu>, ISA_MIPS1;
+ defm : SetgtPats<GPR32, SLT, SLTu>, ISA_MIPS1;
+ defm : SetgePats<GPR32, XORi, SLT, SLTu>, ISA_MIPS1;
+ defm : SetgeImmPats<GPR32, XORi, SLTi, SLTiu>, ISA_MIPS1;
-// bswap pattern
-def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
+ // bswap pattern
+ def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>, ISA_MIPS32R2;
+}
// Load halfword/word patterns.
-let AddedComplexity = 40 in {
- def : LoadRegImmPat<LBu, i32, zextloadi8>;
- let AdditionalPredicates = [NotInMicroMips] in {
- def : LoadRegImmPat<LH, i32, sextloadi16>;
- def : LoadRegImmPat<LW, i32, load>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ let AddedComplexity = 40 in {
+ def : LoadRegImmPat<LBu, i32, zextloadi8>, ISA_MIPS1;
+ def : LoadRegImmPat<LHu, i32, zextloadi16>, ISA_MIPS1;
+ def : LoadRegImmPat<LB, i32, sextloadi8>, ISA_MIPS1;
+ def : LoadRegImmPat<LH, i32, sextloadi16>, ISA_MIPS1;
+ def : LoadRegImmPat<LW, i32, load>, ISA_MIPS1;
}
-}
-// Atomic load patterns.
-def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>;
-}
-def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>;
+ // Atomic load patterns.
+ def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>, ISA_MIPS1;
+ def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>, ISA_MIPS1;
+ def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>, ISA_MIPS1;
-// Atomic store patterns.
-def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>;
-def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>;
-def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>;
+ // Atomic store patterns.
+ def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>,
+ ISA_MIPS1;
+ def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>,
+ ISA_MIPS1;
+ def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>,
+ ISA_MIPS1;
+}
//===----------------------------------------------------------------------===//
// Floating Point Support
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
new file mode 100644
index 000000000000..af0ac006bc9e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -0,0 +1,184 @@
+//===- MipsInstructionSelector.cpp ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "MipsRegisterBankInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+
+#define DEBUG_TYPE "mips-isel"
+
+using namespace llvm;
+
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class MipsInstructionSelector : public InstructionSelector {
+public:
+ MipsInstructionSelector(const MipsTargetMachine &TM, const MipsSubtarget &STI,
+ const MipsRegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ static const char *getName() { return DEBUG_TYPE; }
+
+private:
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
+ const MipsTargetMachine &TM;
+ const MipsSubtarget &STI;
+ const MipsInstrInfo &TII;
+ const MipsRegisterInfo &TRI;
+ const MipsRegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+MipsInstructionSelector::MipsInstructionSelector(
+ const MipsTargetMachine &TM, const MipsSubtarget &STI,
+ const MipsRegisterBankInfo &RBI)
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "MipsGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ unsigned DstReg = I.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ return true;
+
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ return true;
+}
+
+bool MipsInstructionSelector::select(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (!isPreISelGenericOpcode(I.getOpcode())) {
+ if (I.isCopy())
+ return selectCopy(I, TII, MRI, TRI, RBI);
+
+ return true;
+ }
+
+ if (selectImpl(I, CoverageInfo)) {
+ return true;
+ }
+
+ MachineInstr *MI = nullptr;
+ using namespace TargetOpcode;
+
+ switch (I.getOpcode()) {
+ case G_GEP: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1))
+ .add(I.getOperand(2));
+ break;
+ }
+ case G_FRAME_INDEX: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1))
+ .addImm(0);
+ break;
+ }
+ case G_STORE:
+ case G_LOAD: {
+ const unsigned DestReg = I.getOperand(0).getReg();
+ const unsigned DestRegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID();
+ const unsigned OpSize = MRI.getType(DestReg).getSizeInBits();
+
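+ // Only 32-bit accesses whose value lives in the GPR bank are selected here;
+ // anything else is rejected.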
+ if (DestRegBank != Mips::GPRBRegBankID || OpSize != 32)
+ return false;
+
+ const unsigned NewOpc = I.getOpcode() == G_STORE ? Mips::SW : Mips::LW;
+
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1))
+ .addImm(0)
+ .addMemOperand(*I.memoperands_begin());
+ break;
+ }
+ case G_CONSTANT: {
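+ // Materialize the 32-bit immediate as LUi of the upper halfword followed by
+ // ORi of the lower halfword.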
+ int Imm = I.getOperand(1).getCImm()->getValue().getLimitedValue();
+ unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineInstr *LUi, *ORi;
+
+ LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
+ .addDef(LUiReg)
+ .addImm(Imm >> 16);
+
+ ORi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ORi))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(LUiReg)
+ .addImm(Imm & 0xFFFF);
+
+ if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
+ return false;
+ if (!constrainSelectedInstRegOperands(*ORi, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ default:
+ return false;
+ }
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
+}
+
+namespace llvm {
+InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &TM,
+ MipsSubtarget &Subtarget,
+ MipsRegisterBankInfo &RBI) {
+ return new MipsInstructionSelector(TM, Subtarget, RBI);
+}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
new file mode 100644
index 000000000000..da6f9dabdaaf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -0,0 +1,41 @@
+//===- MipsLegalizerInfo.cpp ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "MipsLegalizerInfo.h"
+#include "MipsTargetMachine.h"
+
+using namespace llvm;
+
+MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
+ using namespace TargetOpcode;
+
+ const LLT s32 = LLT::scalar(32);
+ const LLT p0 = LLT::pointer(0, 32);
+
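+ // Only 32-bit scalars and 32-bit pointers are given legality rules at this
+ // stage.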
+ getActionDefinitionsBuilder(G_ADD).legalFor({s32});
+
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalForCartesianProduct({p0, s32}, {p0});
+
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({s32});
+
+ getActionDefinitionsBuilder(G_GEP)
+ .legalFor({{p0, s32}});
+
+ getActionDefinitionsBuilder(G_FRAME_INDEX)
+ .legalFor({p0});
+
+ computeTables();
+ verify(*ST.getInstrInfo());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.h b/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.h
new file mode 100644
index 000000000000..36dd39c8c1c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.h
@@ -0,0 +1,29 @@
+//===- MipsLegalizerInfo ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class MipsSubtarget;
+
+/// This class provides legalization strategies.
+class MipsLegalizerInfo : public LegalizerInfo {
+public:
+ MipsLegalizerInfo(const MipsSubtarget &ST);
+};
+} // end namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
index a4ab7d3a5780..2b7f64099923 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -219,26 +219,77 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
// Lower register operand.
OutMI.addOperand(LowerOperand(MI->getOperand(0)));
- // Create %hi($tgt-$baltgt).
- OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
- MI->getOperand(2).getMBB(),
- MipsMCExpr::MEK_HI));
+ MipsMCExpr::MipsExprKind Kind;
+ unsigned TargetFlags = MI->getOperand(1).getTargetFlags();
+ switch (TargetFlags) {
+ case MipsII::MO_HIGHEST:
+ Kind = MipsMCExpr::MEK_HIGHEST;
+ break;
+ case MipsII::MO_HIGHER:
+ Kind = MipsMCExpr::MEK_HIGHER;
+ break;
+ case MipsII::MO_ABS_HI:
+ Kind = MipsMCExpr::MEK_HI;
+ break;
+ case MipsII::MO_ABS_LO:
+ Kind = MipsMCExpr::MEK_LO;
+ break;
+ default:
+ report_fatal_error("Unexpected flags for lowerLongBranchLUi");
+ }
+
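+ // With two operands, apply the relocation to the target block's symbol;
+ // with three, apply it to the ($tgt - $baltgt) difference.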
+ if (MI->getNumOperands() == 2) {
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(MI->getOperand(1).getMBB()->getSymbol(), *Ctx);
+ const MipsMCExpr *MipsExpr = MipsMCExpr::create(Kind, Expr, *Ctx);
+ OutMI.addOperand(MCOperand::createExpr(MipsExpr));
+ } else if (MI->getNumOperands() == 3) {
+ // Create %hi($tgt-$baltgt).
+ OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
+ MI->getOperand(2).getMBB(), Kind));
+ }
}
-void MipsMCInstLower::lowerLongBranchADDiu(
- const MachineInstr *MI, MCInst &OutMI, int Opcode,
- MipsMCExpr::MipsExprKind Kind) const {
+void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
+ MCInst &OutMI, int Opcode) const {
OutMI.setOpcode(Opcode);
+ MipsMCExpr::MipsExprKind Kind;
+ unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
+ switch (TargetFlags) {
+ case MipsII::MO_HIGHEST:
+ Kind = MipsMCExpr::MEK_HIGHEST;
+ break;
+ case MipsII::MO_HIGHER:
+ Kind = MipsMCExpr::MEK_HIGHER;
+ break;
+ case MipsII::MO_ABS_HI:
+ Kind = MipsMCExpr::MEK_HI;
+ break;
+ case MipsII::MO_ABS_LO:
+ Kind = MipsMCExpr::MEK_LO;
+ break;
+ default:
+ report_fatal_error("Unexpected flags for lowerLongBranchADDiu");
+ }
+
// Lower two register operands.
for (unsigned I = 0, E = 2; I != E; ++I) {
const MachineOperand &MO = MI->getOperand(I);
OutMI.addOperand(LowerOperand(MO));
}
- // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
- OutMI.addOperand(createSub(MI->getOperand(2).getMBB(),
- MI->getOperand(3).getMBB(), Kind));
+ if (MI->getNumOperands() == 3) {
+ // Lower the target basic block operand.
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(MI->getOperand(2).getMBB()->getSymbol(), *Ctx);
+ const MipsMCExpr *MipsExpr = MipsMCExpr::create(Kind, Expr, *Ctx);
+ OutMI.addOperand(MCOperand::createExpr(MipsExpr));
+ } else if (MI->getNumOperands() == 4) {
+ // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
+ OutMI.addOperand(createSub(MI->getOperand(2).getMBB(),
+ MI->getOperand(3).getMBB(), Kind));
+ }
}
bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
@@ -250,16 +301,10 @@ bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
lowerLongBranchLUi(MI, OutMI);
return true;
case Mips::LONG_BRANCH_ADDiu:
- lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu, MipsMCExpr::MEK_LO);
+ lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu);
return true;
case Mips::LONG_BRANCH_DADDiu:
- unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
- if (TargetFlags == MipsII::MO_ABS_HI)
- lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_HI);
- else if (TargetFlags == MipsII::MO_ABS_LO)
- lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_LO);
- else
- report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu");
+ lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu);
return true;
}
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
index fb5079643827..e19f21c98839 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
@@ -44,8 +44,8 @@ private:
MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
MipsMCExpr::MipsExprKind Kind) const;
void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
- void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
- MipsMCExpr::MipsExprKind Kind) const;
+ void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI,
+ int Opcode) const;
bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const;
};
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
index 7d25ea56e3d5..d4e225678184 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl, ASE_MSA {
+ ASE_MSA {
let EncodingPredicates = [HasStdEnc];
let Inst{31-26} = 0b011110;
}
@@ -24,7 +24,8 @@ class MSASpecial : MSAInst {
class MSAPseudo<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo>:
MipsPseudo<outs, ins, pattern, itin> {
- let Predicates = [HasMSA];
+ let EncodingPredicates = [HasStdEnc];
+ let ASEPredicate = [HasMSA];
}
class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst {
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
index bf79f0f2ff82..d83f75ffa1c1 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -35,14 +35,6 @@ def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
-def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
@@ -54,6 +46,7 @@ def MipsILVR : SDNode<"MipsISD::ILVR", SDT_ILV>;
def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
def MipsINSVE : SDNode<"MipsISD::INSVE", SDT_INSVE>;
+def MipsFMS : SDNode<"MipsISD::FMS", SDTFPTernaryOp>;
def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
@@ -188,8 +181,28 @@ def vsplati16 : PatFrag<(ops node:$e0),
def vsplati32 : PatFrag<(ops node:$e0),
(v4i32 (build_vector node:$e0, node:$e0,
node:$e0, node:$e0))>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+ APInt Imm;
+ SDNode *BV = N->getOperand(0).getNode();
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ return selectVSplat(BV, Imm, EltTy.getSizeInBits()) &&
+ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
def vsplati64 : PatFrag<(ops node:$e0),
(v2i64 (build_vector node:$e0, node:$e0))>;
+
+def vsplati64_splat_d : PatFrag<(ops node:$e0),
+ (v2i64 (bitconvert
+ (v4i32 (and
+ (v4i32 (build_vector node:$e0,
+ node:$e0,
+ node:$e0,
+ node:$e0)),
+ vsplati64_imm_eq_1))))>;
+
def vsplatf32 : PatFrag<(ops node:$e0),
(v4f32 (build_vector node:$e0, node:$e0,
node:$e0, node:$e0))>;
@@ -203,7 +216,8 @@ def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
(MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
- (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+ (MipsVSHF (vsplati64_splat_d node:$i),
+ node:$v, node:$v)>;
class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}],
SDNodeXForm xform = NOOP_SDNodeXForm>
@@ -334,15 +348,6 @@ def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
}]>;
-def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
- APInt Imm;
- SDNode *BV = N->getOperand(0).getNode();
- EVT EltTy = N->getValueType(0).getVectorElementType();
-
- return selectVSplat(BV, Imm, EltTy.getSizeInBits()) &&
- Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
-}]>;
-
def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
(and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
immAllOnesV))>;
@@ -377,9 +382,6 @@ def vbset_d : PatFrag<(ops node:$ws, node:$wt),
(or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
node:$wt))>;
-def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt),
- (fsub node:$wd, (fmul node:$ws, node:$wt))>;
-
def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt),
(add node:$wd, (mul node:$ws, node:$wt))>;
@@ -1788,6 +1790,7 @@ class CFCMSA_DESC {
string AsmString = "cfcmsa\t$rd, $cs";
InstrItinClass Itinerary = NoItinerary;
bit hasSideEffects = 1;
+ bit isMoveReg = 1;
}
class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
@@ -1882,6 +1885,7 @@ class CTCMSA_DESC {
string AsmString = "ctcmsa\t$cd, $rs";
InstrItinClass Itinerary = NoItinerary;
bit hasSideEffects = 1;
+ bit isMoveReg = 1;
}
class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
@@ -2099,8 +2103,8 @@ class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
MSA128DOpnd>;
-class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
-class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", MipsFMS, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", MipsFMS, MSA128DOpnd>;
class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
@@ -2350,32 +2354,32 @@ class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
-class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
-class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
-class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
-class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", smax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", smax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", smax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", smax, MSA128DOpnd>;
-class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
-class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
-class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
-class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", umax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", umax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", umax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", umax, MSA128DOpnd>;
-class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", smax, vsplati8_simm5,
MSA128BOpnd>;
-class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", smax, vsplati16_simm5,
MSA128HOpnd>;
-class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", smax, vsplati32_simm5,
MSA128WOpnd>;
-class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", smax, vsplati64_simm5,
MSA128DOpnd>;
-class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", umax, vsplati8_uimm5,
MSA128BOpnd>;
-class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", umax, vsplati16_uimm5,
MSA128HOpnd>;
-class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", umax, vsplati32_uimm5,
MSA128WOpnd>;
-class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", umax, vsplati64_uimm5,
MSA128DOpnd>;
class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
@@ -2383,32 +2387,32 @@ class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
-class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
-class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
-class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
-class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", smin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", smin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", smin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", smin, MSA128DOpnd>;
-class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
-class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
-class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
-class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", umin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", umin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", umin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", umin, MSA128DOpnd>;
-class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", smin, vsplati8_simm5,
MSA128BOpnd>;
-class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", smin, vsplati16_simm5,
MSA128HOpnd>;
-class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", smin, vsplati32_simm5,
MSA128WOpnd>;
-class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", smin, vsplati64_simm5,
MSA128DOpnd>;
-class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", umin, vsplati8_uimm5,
MSA128BOpnd>;
-class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", umin, vsplati16_uimm5,
MSA128HOpnd>;
-class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", umin, vsplati32_uimm5,
MSA128WOpnd>;
-class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", umin, vsplati64_uimm5,
MSA128DOpnd>;
class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
@@ -2427,6 +2431,7 @@ class MOVE_V_DESC {
string AsmString = "move.v\t$wd, $ws";
list<dag> Pattern = [];
InstrItinClass Itinerary = NoItinerary;
+ bit isMoveReg = 1;
}
class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
@@ -3143,6 +3148,20 @@ def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
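+// Fold (fsub x, (fmul y, z)) and (fadd x, (fmul y, z)) into fmsub/fmadd, but
+// only under the FPOP_FUSION_FAST predicate.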
+def : MipsPat<(fsub MSA128WOpnd:$wd, (fmul MSA128WOpnd:$ws, MSA128WOpnd:$wt)),
+ (FMSUB_W MSA128WOpnd:$wd, MSA128WOpnd:$ws, MSA128WOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+def : MipsPat<(fsub MSA128DOpnd:$wd, (fmul MSA128DOpnd:$ws, MSA128DOpnd:$wt)),
+ (FMSUB_D MSA128DOpnd:$wd, MSA128DOpnd:$ws, MSA128DOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+
+def : MipsPat<(fadd MSA128WOpnd:$wd, (fmul MSA128WOpnd:$ws, MSA128WOpnd:$wt)),
+ (FMADD_W MSA128WOpnd:$wd, MSA128WOpnd:$ws, MSA128WOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+def : MipsPat<(fadd MSA128DOpnd:$wd, (fmul MSA128DOpnd:$ws, MSA128DOpnd:$wt)),
+ (FMADD_D MSA128DOpnd:$wd, MSA128DOpnd:$ws, MSA128DOpnd:$wt)>,
+ ISA_MIPS1, ASE_MSA, FPOP_FUSION_FAST;
+
def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
@@ -3733,7 +3752,7 @@ def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
// Pseudos used to implement transparent fp16 support.
-let Predicates = [HasMSA] in {
+let ASEPredicate = [HasMSA] in {
def ST_F16 : MipsPseudo<(outs), (ins MSA128F16:$ws, mem_simm10:$addr),
[(store (f16 MSA128F16:$ws), (addrimm10:$addr))]> {
let usesCustomInserter = 1;
@@ -3773,12 +3792,13 @@ let Predicates = [HasMSA] in {
}
def : MipsPat<(MipsTruncIntFP MSA128F16:$ws),
- (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>;
+ (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>, ISA_MIPS1,
+ ASE_MSA;
def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond),
(FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws),
(MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>,
- ISA_MIPS1_NOT_32R6_64R6;
+ ISA_MIPS1_NOT_32R6_64R6, ASE_MSA;
}
def vsplati64_imm_eq_63 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
diff --git a/contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td
index edc0981e6278..c2c22e2ad61c 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td
+++ b/contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td
@@ -15,8 +15,7 @@
//
//===----------------------------------------------------------------------===//
-class MipsMTInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
- PredicateControl {
+class MipsMTInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
let DecoderNamespace = "Mips";
let EncodingPredicates = [HasStdEnc];
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
index 1ee56d830090..81b4352670c0 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -29,25 +29,27 @@ bool MipsFunctionInfo::globalBaseRegSet() const {
return GlobalBaseReg;
}
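+// Choose the register class for the global base register from the subtarget
+// mode (MIPS16, microMIPS) and the ABI (N64 uses 64-bit GPRs).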
+static const TargetRegisterClass &getGlobalBaseRegClass(MachineFunction &MF) {
+ auto &STI = static_cast<const MipsSubtarget &>(MF.getSubtarget());
+ auto &TM = static_cast<const MipsTargetMachine &>(MF.getTarget());
+
+ if (STI.inMips16Mode())
+ return Mips::CPU16RegsRegClass;
+
+ if (STI.inMicroMipsMode())
+ return Mips::GPRMM16RegClass;
+
+ if (TM.getABI().IsN64())
+ return Mips::GPR64RegClass;
+
+ return Mips::GPR32RegClass;
+}
+
unsigned MipsFunctionInfo::getGlobalBaseReg() {
- // Return if it has already been initialized.
- if (GlobalBaseReg)
- return GlobalBaseReg;
-
- MipsSubtarget const &STI =
- static_cast<const MipsSubtarget &>(MF.getSubtarget());
-
- const TargetRegisterClass *RC =
- STI.inMips16Mode()
- ? &Mips::CPU16RegsRegClass
- : STI.inMicroMipsMode()
- ? &Mips::GPRMM16RegClass
- : static_cast<const MipsTargetMachine &>(MF.getTarget())
- .getABI()
- .IsN64()
- ? &Mips::GPR64RegClass
- : &Mips::GPR32RegClass;
- return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
+ if (!GlobalBaseReg)
+ GlobalBaseReg =
+ MF.getRegInfo().createVirtualRegister(&getGlobalBaseRegClass(MF));
+ return GlobalBaseReg;
}
void MipsFunctionInfo::createEhDataRegsFI() {
diff --git a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
index ceacaa498389..a2b55e8bddcd 100644
--- a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -11,6 +11,7 @@
#include "Mips.h"
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,6 +33,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<StackProtector>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -42,7 +44,7 @@ namespace {
}
bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
+ LLVM_DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
auto &TPC = getAnalysis<TargetPassConfig>();
auto &TM = TPC.getTM<MipsTargetMachine>();
TM.resetSubtarget(&MF);
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
index a9ca31a6d09f..27bc4843f410 100644
--- a/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -35,6 +34,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/RecyclingAllocator.h"
#include <cassert>
#include <utility>
@@ -90,10 +90,10 @@ public:
}
private:
- /// \brief Visit MBB.
+ /// Visit MBB.
bool visitNode(MBBInfo &MBBI);
- /// \brief Test if MI jumps to a function via a register.
+ /// Test if MI jumps to a function via a register.
///
/// Also, return the virtual register containing the target function's address
/// and the underlying object in Reg and Val respectively, if the function's
@@ -101,15 +101,15 @@ private:
bool isCallViaRegister(MachineInstr &MI, unsigned &Reg,
ValueType &Val) const;
- /// \brief Return the number of instructions that dominate the current
+ /// Return the number of instructions that dominate the current
/// instruction and load the function address from object Entry.
unsigned getCount(ValueType Entry);
- /// \brief Return the destination virtual register of the last instruction
+ /// Return the destination virtual register of the last instruction
/// that loads from object Entry.
unsigned getReg(ValueType Entry);
- /// \brief Update ScopedHT.
+ /// Update ScopedHT.
void incCntAndSetReg(ValueType Entry, unsigned Reg);
ScopedHTType ScopedHT;
diff --git a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
index 7ee45c28a7d0..4edcb3132ada 100644
--- a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
@@ -96,7 +96,8 @@ static bool needsFP(Function &F) {
;
}
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- DEBUG(dbgs() << "Working on call" << "\n");
+ LLVM_DEBUG(dbgs() << "Working on call"
+ << "\n");
Function &F_ = *CI->getCalledFunction();
if (needsFPFromSig(F_))
return true;
@@ -110,9 +111,10 @@ bool MipsOs16::runOnModule(Module &M) {
bool usingMask = Mips32FunctionMask.length() > 0;
bool doneUsingMask = false; // this will make it stop repeating
- DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+ LLVM_DEBUG(dbgs() << "Run on Module MipsOs16 \n"
+ << Mips32FunctionMask << "\n");
if (usingMask)
- DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+ LLVM_DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
unsigned int functionIndex = 0;
bool modified = false;
@@ -121,14 +123,14 @@ bool MipsOs16::runOnModule(Module &M) {
if (F.isDeclaration())
continue;
- DEBUG(dbgs() << "Working on " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Working on " << F.getName() << "\n");
if (usingMask) {
if (!doneUsingMask) {
if (functionIndex == Mips32FunctionMask.length())
functionIndex = 0;
switch (Mips32FunctionMask[functionIndex]) {
case '1':
- DEBUG(dbgs() << "mask forced mips32: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "mask forced mips32: " << F.getName() << "\n");
F.addFnAttr("nomips16");
break;
case '.':
@@ -142,11 +144,11 @@ bool MipsOs16::runOnModule(Module &M) {
}
else {
if (needsFP(F)) {
- DEBUG(dbgs() << "os16 forced mips32: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "os16 forced mips32: " << F.getName() << "\n");
F.addFnAttr("nomips16");
}
else {
- DEBUG(dbgs() << "os16 forced mips16: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "os16 forced mips16: " << F.getName() << "\n");
F.addFnAttr("mips16");
}
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
new file mode 100644
index 000000000000..cef21f447205
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -0,0 +1,100 @@
+//===- MipsRegisterBankInfo.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "MipsRegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+
+#define DEBUG_TYPE "registerbankinfo"
+
+#include "MipsGenRegisterBank.inc"
+
+namespace llvm {
+namespace Mips {
+enum PartialMappingIdx {
+ PMI_GPR,
+ PMI_Min = PMI_GPR,
+};
+
+RegisterBankInfo::PartialMapping PartMappings[]{
+ {0, 32, GPRBRegBank}
+};
+
+enum ValueMappingIdx { InvalidIdx = 0, GPRIdx = 1 };
+
+RegisterBankInfo::ValueMapping ValueMappings[] = {
+ // invalid
+ {nullptr, 0},
+ // 3 operands in GPRs
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1}};
+
+} // end namespace Mips
+} // end namespace llvm
+
+using namespace llvm;
+
+MipsRegisterBankInfo::MipsRegisterBankInfo(const TargetRegisterInfo &TRI)
+ : MipsGenRegisterBankInfo() {}
+
+const RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+ using namespace Mips;
+
+ switch (RC.getID()) {
+ case Mips::GPR32RegClassID:
+ case Mips::CPU16Regs_and_GPRMM16ZeroRegClassID:
+ case Mips::GPRMM16MoveP_and_CPU16Regs_and_GPRMM16ZeroRegClassID:
+ case Mips::SP32RegClassID:
+ return getRegBank(Mips::GPRBRegBankID);
+ default:
+ llvm_unreachable("Register class not supported");
+ }
+}
+
+const RegisterBankInfo::InstructionMapping &
+MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+
+ unsigned Opc = MI.getOpcode();
+
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+
+ using namespace TargetOpcode;
+
+ unsigned NumOperands = MI.getNumOperands();
+ const ValueMapping *OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
+
+ switch (Opc) {
+ case G_ADD:
+ case G_LOAD:
+ case G_STORE:
+ case G_GEP:
+ OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
+ break;
+ case G_CONSTANT:
+ case G_FRAME_INDEX:
+ OperandsMapping =
+ getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr});
+ break;
+ default:
+ return getInvalidInstructionMapping();
+ }
+
+ return getInstructionMapping(DefaultMappingID, /*Cost=*/1, OperandsMapping,
+ NumOperands);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
new file mode 100644
index 000000000000..64a79abaa74d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -0,0 +1,43 @@
+//===- MipsRegisterBankInfo.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for Mips.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "MipsGenRegisterBank.inc"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+class MipsGenRegisterBankInfo : public RegisterBankInfo {
+#define GET_TARGET_REGBANK_CLASS
+#include "MipsGenRegisterBank.inc"
+};
+
+/// This class provides the information for the target register banks.
+class MipsRegisterBankInfo final : public MipsGenRegisterBankInfo {
+public:
+ MipsRegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
+};
+} // end namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepDecoders.h b/contrib/llvm/lib/Target/Mips/MipsRegisterBanks.td
index 020362a95909..5f1687048fac 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonDepDecoders.h
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterBanks.td
@@ -1,4 +1,4 @@
-//===- HexagonDepDecoders.h -----------------------------------------------===//
+//===- MipsRegisterBanks.td --------------------------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,8 +6,8 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+//
+//
//===----------------------------------------------------------------------===//
-
-
+def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
index 0e0d82270c89..3c108c2ba9b7 100644
--- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -275,18 +275,20 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
- DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
- errs() << "<--------->\n" << MI);
+ LLVM_DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
+ errs() << "<--------->\n"
+ << MI);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
uint64_t stackSize = MF.getFrameInfo().getStackSize();
int64_t spOffset = MF.getFrameInfo().getObjectOffset(FrameIndex);
- DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
- << "spOffset : " << spOffset << "\n"
- << "stackSize : " << stackSize << "\n"
- << "alignment : "
- << MF.getFrameInfo().getObjectAlignment(FrameIndex) << "\n");
+ LLVM_DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+ << "spOffset : " << spOffset << "\n"
+ << "stackSize : " << stackSize << "\n"
+ << "alignment : "
+ << MF.getFrameInfo().getObjectAlignment(FrameIndex)
+ << "\n");
eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
index fe8d7953ec8f..4cc50fb981ba 100644
--- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
@@ -57,6 +57,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
@@ -72,7 +74,7 @@ public:
/// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const override;
- /// \brief Return GPR register class.
+ /// Return GPR register class.
virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
private:
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index eb1eea7925c0..687c9f676b34 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -394,7 +394,6 @@ MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -682,7 +681,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub(
void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
MachineFrameInfo &MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -691,7 +690,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
const MipsRegisterInfo &RegInfo =
*static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
- DebugLoc DL = MBBI->getDebugLoc();
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
MipsABIInfo ABI = STI.getABI();
unsigned SP = ABI.GetStackPtr();
unsigned FP = ABI.GetFramePtr();
@@ -790,7 +789,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *EntryBlock = &MF->front();
const TargetInstrInfo &TII = *STI.getInstrInfo();
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
@@ -803,7 +801,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
&& MF->getFrameInfo().isReturnAddressTaken();
if (!IsRAAndRetAddrIsTaken)
- EntryBlock->addLiveIn(Reg);
+ MBB.addLiveIn(Reg);
// ISRs require HI/LO to be spilled into kernel registers to be then
// spilled to the stack frame.
@@ -828,7 +826,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
// Insert the spill to the stack frame.
bool IsKill = !IsRAAndRetAddrIsTaken;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill,
+ TII.storeRegToStackSlot(MBB, MI, Reg, IsKill,
CSI[i].getFrameIdx(), RC, TRI);
}
@@ -882,9 +880,10 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Expand pseudo instructions which load, store or copy accumulators.
// Add an emergency spill slot if a pseudo was expanded.
if (ExpandPseudo(MF).expand()) {
- // The spill slot should be half the size of the accumulator. If target is
- // mips64, it should be 64-bit, otherwise it should be 32-bt.
- const TargetRegisterClass &RC = STI.hasMips64() ?
+ // The spill slot should be half the size of the accumulator. If the target
+ // has 64-bit general-purpose registers, it should be 64-bit, otherwise it
+ // should be 32-bit.
+ const TargetRegisterClass &RC = STI.isGP64bit() ?
Mips::GPR64RegClass : Mips::GPR32RegClass;
int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC),
TRI->getSpillAlignment(RC),
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
index de8e6eed31d7..cb2119d6880b 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -40,7 +40,6 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
- unsigned ehDataReg(unsigned I) const;
private:
void emitInterruptEpilogueStub(MachineFunction &MF,
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 893cae93e58f..599c1e913acf 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -288,7 +288,7 @@ void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const {
SDValue(Carry, 0)};
SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops);
- // My reading of the the MIPS DSP 3.01 specification isn't as clear as I
+ // My reading of the MIPS DSP 3.01 specification isn't as clear as I
// would like about whether bit 20 always gets overwritten by addwc.
// Hence take an extremely conservative view and presume it's sticky. We
// therefore need to clear it.
@@ -976,9 +976,9 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
SDNode *Rdhwr =
- CurDAG->getMachineNode(RdhwrOpc, DL,
- Node->getValueType(0),
- CurDAG->getRegister(Mips::HWR29, MVT::i32));
+ CurDAG->getMachineNode(RdhwrOpc, DL, Node->getValueType(0),
+ CurDAG->getRegister(Mips::HWR29, MVT::i32),
+ CurDAG->getTargetConstant(0, DL, MVT::i32));
SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
SDValue(Rdhwr, 0));
SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 6f38289c5a45..eb3657aae050 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -93,37 +93,37 @@ private:
bool selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
SDValue &Offset) const override;
- /// \brief Select constant vector splats.
+ /// Select constant vector splats.
bool selectVSplat(SDNode *N, APInt &Imm,
unsigned MinSizeInBits) const override;
- /// \brief Select constant vector splats whose value fits in a given integer.
+ /// Select constant vector splats whose value fits in a given integer.
bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
unsigned ImmBitSize) const;
- /// \brief Select constant vector splats whose value fits in a uimm1.
+ /// Select constant vector splats whose value fits in a uimm1.
bool selectVSplatUimm1(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm2.
+ /// Select constant vector splats whose value fits in a uimm2.
bool selectVSplatUimm2(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm3.
+ /// Select constant vector splats whose value fits in a uimm3.
bool selectVSplatUimm3(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm4.
+ /// Select constant vector splats whose value fits in a uimm4.
bool selectVSplatUimm4(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm5.
+ /// Select constant vector splats whose value fits in a uimm5.
bool selectVSplatUimm5(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm6.
+ /// Select constant vector splats whose value fits in a uimm6.
bool selectVSplatUimm6(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a uimm8.
+ /// Select constant vector splats whose value fits in a uimm8.
bool selectVSplatUimm8(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value fits in a simm5.
+ /// Select constant vector splats whose value fits in a simm5.
bool selectVSplatSimm5(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is a power of 2.
+ /// Select constant vector splats whose value is a power of 2.
bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is the inverse of a
+ /// Select constant vector splats whose value is the inverse of a
/// power of 2.
bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// ending at the most significant bit
bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override;
- /// \brief Select constant vector splats whose value is a run of set bits
+ /// Select constant vector splats whose value is a run of set bits
/// starting at bit zero.
bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override;
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index eee5b23117f6..f625a2903bd7 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -40,6 +39,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -104,6 +104,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::VSELECT);
+
+ if (Subtarget.hasMips32r2()) {
+ setOperationAction(ISD::ADDC, MVT::i32, Legal);
+ setOperationAction(ISD::ADDE, MVT::i32, Legal);
+ }
}
if (Subtarget.hasDSPR2())
@@ -331,8 +336,12 @@ addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
setOperationAction(ISD::SRA, Ty, Legal);
setOperationAction(ISD::SRL, Ty, Legal);
setOperationAction(ISD::SUB, Ty, Legal);
+ setOperationAction(ISD::SMAX, Ty, Legal);
+ setOperationAction(ISD::SMIN, Ty, Legal);
setOperationAction(ISD::UDIV, Ty, Legal);
setOperationAction(ISD::UREM, Ty, Legal);
+ setOperationAction(ISD::UMAX, Ty, Legal);
+ setOperationAction(ISD::UMIN, Ty, Legal);
setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom);
setOperationAction(ISD::VSELECT, Ty, Legal);
setOperationAction(ISD::XOR, Ty, Legal);
@@ -963,46 +972,7 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
EVT Ty = N->getValueType(0);
- if (Ty.is128BitVector() && Ty.isInteger()) {
- // Try the following combines:
- // (vselect (setcc $a, $b, SETLT), $b, $a)) -> (vsmax $a, $b)
- // (vselect (setcc $a, $b, SETLE), $b, $a)) -> (vsmax $a, $b)
- // (vselect (setcc $a, $b, SETLT), $a, $b)) -> (vsmin $a, $b)
- // (vselect (setcc $a, $b, SETLE), $a, $b)) -> (vsmin $a, $b)
- // (vselect (setcc $a, $b, SETULT), $b, $a)) -> (vumax $a, $b)
- // (vselect (setcc $a, $b, SETULE), $b, $a)) -> (vumax $a, $b)
- // (vselect (setcc $a, $b, SETULT), $a, $b)) -> (vumin $a, $b)
- // (vselect (setcc $a, $b, SETULE), $a, $b)) -> (vumin $a, $b)
- // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
- // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
- // legalizer.
- SDValue Op0 = N->getOperand(0);
-
- if (Op0->getOpcode() != ISD::SETCC)
- return SDValue();
-
- ISD::CondCode CondCode = cast<CondCodeSDNode>(Op0->getOperand(2))->get();
- bool Signed;
-
- if (CondCode == ISD::SETLT || CondCode == ISD::SETLE)
- Signed = true;
- else if (CondCode == ISD::SETULT || CondCode == ISD::SETULE)
- Signed = false;
- else
- return SDValue();
-
- SDValue Op1 = N->getOperand(1);
- SDValue Op2 = N->getOperand(2);
- SDValue Op0Op0 = Op0->getOperand(0);
- SDValue Op0Op1 = Op0->getOperand(1);
-
- if (Op1 == Op0Op0 && Op2 == Op0Op1)
- return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
- Ty, Op1, Op2);
- else if (Op1 == Op0Op1 && Op2 == Op0Op0)
- return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
- Ty, Op1, Op2);
- } else if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+ if (Ty == MVT::v2i16 || Ty == MVT::v4i8) {
SDValue SetCC = N->getOperand(0);
if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
@@ -1075,11 +1045,9 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
}
if (Val.getNode()) {
- DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
- N->printrWithDepth(dbgs(), &DAG);
- dbgs() << "\n=> \n";
- Val.getNode()->printrWithDepth(dbgs(), &DAG);
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+ N->printrWithDepth(dbgs(), &DAG); dbgs() << "\n=> \n";
+ Val.getNode()->printrWithDepth(dbgs(), &DAG); dbgs() << "\n");
return Val;
}
@@ -1378,7 +1346,16 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
SDValue LaneB;
if (ResVecTy == MVT::v2i64) {
- LaneB = DAG.getConstant(0, DL, MVT::i32);
+ // If the index is passed as an immediate value, set the upper lane to 0 so
+ // that the splati.d instruction can be matched.
+ if (isa<ConstantSDNode>(LaneA))
+ LaneB = DAG.getConstant(0, DL, MVT::i32);
+ // If the index is passed in a register, set the upper lane to the same
+ // value as the lower one - this keeps the BUILD_VECTOR node from being
+ // expanded through the stack, so we are able to pattern match the set of
+ // nodes created here to splat.d.
+ else
+ LaneB = LaneA;
ViaVecTy = MVT::v4i32;
if(BigEndian)
std::swap(LaneA, LaneB);
@@ -1893,10 +1870,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_fmsub_w:
case Intrinsic::mips_fmsub_d: {
// TODO: If intrinsics have fast-math-flags, propagate them.
- EVT ResTy = Op->getValueType(0);
- return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
- DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
- Op->getOperand(2), Op->getOperand(3)));
+ return DAG.getNode(MipsISD::FMS, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
}
case Intrinsic::mips_frint_w:
case Intrinsic::mips_frint_d:
@@ -1992,49 +1967,49 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_max_s_h:
case Intrinsic::mips_max_s_w:
case Intrinsic::mips_max_s_d:
- return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMAX, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_max_u_b:
case Intrinsic::mips_max_u_h:
case Intrinsic::mips_max_u_w:
case Intrinsic::mips_max_u_d:
- return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMAX, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_maxi_s_b:
case Intrinsic::mips_maxi_s_h:
case Intrinsic::mips_maxi_s_w:
case Intrinsic::mips_maxi_s_d:
- return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMAX, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true));
case Intrinsic::mips_maxi_u_b:
case Intrinsic::mips_maxi_u_h:
case Intrinsic::mips_maxi_u_w:
case Intrinsic::mips_maxi_u_d:
- return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMAX, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
case Intrinsic::mips_min_s_b:
case Intrinsic::mips_min_s_h:
case Intrinsic::mips_min_s_w:
case Intrinsic::mips_min_s_d:
- return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMIN, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_min_u_b:
case Intrinsic::mips_min_u_h:
case Intrinsic::mips_min_u_w:
case Intrinsic::mips_min_u_d:
- return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMIN, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2));
case Intrinsic::mips_mini_s_b:
case Intrinsic::mips_mini_s_h:
case Intrinsic::mips_mini_s_w:
case Intrinsic::mips_mini_s_d:
- return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::SMIN, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true));
case Intrinsic::mips_mini_u_b:
case Intrinsic::mips_mini_u_h:
case Intrinsic::mips_mini_u_w:
case Intrinsic::mips_mini_u_d:
- return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ return DAG.getNode(ISD::UMIN, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
case Intrinsic::mips_mod_s_b:
case Intrinsic::mips_mod_s_h:
@@ -2385,7 +2360,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
}
}
-/// \brief Check if the given BuildVectorSDNode is a splat.
+/// Check if the given BuildVectorSDNode is a splat.
/// This method currently relies on DAG nodes being reused when equivalent,
/// so it's possible for this to return false even when isConstantSplat returns
/// true.
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
index 5976ecbcfc61..761ff3b1fa4d 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -15,8 +15,8 @@
#define LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
#include "MipsISelLowering.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -32,11 +32,11 @@ class TargetRegisterClass;
explicit MipsSETargetLowering(const MipsTargetMachine &TM,
const MipsSubtarget &STI);
- /// \brief Enable MSA support for the given integer type and Register
+ /// Enable MSA support for the given integer type and Register
/// class.
void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
- /// \brief Enable MSA support for the given floating-point type and
+ /// Enable MSA support for the given floating-point type and
/// Register class.
void addMSAFloatType(MVT::SimpleValueType Ty,
const TargetRegisterClass *RC);
@@ -82,7 +82,7 @@ class TargetRegisterClass;
SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+ /// Lower VECTOR_SHUFFLE into one of a number of instructions
/// depending on the indices in the shuffle.
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -92,46 +92,46 @@ class TargetRegisterClass;
MachineBasicBlock *emitMSACBranchPseudo(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned BranchOp) const;
- /// \brief Emit the COPY_FW pseudo instruction
+ /// Emit the COPY_FW pseudo instruction
MachineBasicBlock *emitCOPY_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the COPY_FD pseudo instruction
+ /// Emit the COPY_FD pseudo instruction
MachineBasicBlock *emitCOPY_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the INSERT_FW pseudo instruction
+ /// Emit the INSERT_FW pseudo instruction
MachineBasicBlock *emitINSERT_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the INSERT_FD pseudo instruction
+ /// Emit the INSERT_FD pseudo instruction
MachineBasicBlock *emitINSERT_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
+ /// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned EltSizeInBytes,
bool IsFP) const;
- /// \brief Emit the FILL_FW pseudo instruction
+ /// Emit the FILL_FW pseudo instruction
MachineBasicBlock *emitFILL_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FILL_FD pseudo instruction
+ /// Emit the FILL_FD pseudo instruction
MachineBasicBlock *emitFILL_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FEXP2_W_1 pseudo instructions.
+ /// Emit the FEXP2_W_1 pseudo instructions.
MachineBasicBlock *emitFEXP2_W_1(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FEXP2_D_1 pseudo instructions.
+ /// Emit the FEXP2_D_1 pseudo instructions.
MachineBasicBlock *emitFEXP2_D_1(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FILL_FW pseudo instruction
+ /// Emit the FILL_FW pseudo instruction
MachineBasicBlock *emitLD_F16_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FILL_FD pseudo instruction
+ /// Emit the FILL_FD pseudo instruction
MachineBasicBlock *emitST_F16_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BB) const;
- /// \brief Emit the FEXP2_W_1 pseudo instructions.
+ /// Emit the FEXP2_W_1 pseudo instructions.
MachineBasicBlock *emitFPEXTEND_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BB,
bool IsFGR64) const;
- /// \brief Emit the FEXP2_D_1 pseudo instructions.
+ /// Emit the FEXP2_D_1 pseudo instructions.
MachineBasicBlock *emitFPROUND_PSEUDO(MachineInstr &MI,
MachineBasicBlock *BBi,
bool IsFGR64) const;
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index 59b7679971cd..7ffe4aff474d 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -179,6 +179,69 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MIB.addReg(ZeroReg);
}
+static bool isORCopyInst(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case Mips::OR_MM:
+ case Mips::OR:
+ if (MI.getOperand(2).getReg() == Mips::ZERO)
+ return true;
+ break;
+ case Mips::OR64:
+ if (MI.getOperand(2).getReg() == Mips::ZERO_64)
+ return true;
+ break;
+ }
+ return false;
+}
+
+/// If @MI is a WRDSP/RDDSP instruction, return true with @isWrite set to true
+/// if it is a WRDSP instruction.
+static bool isReadOrWriteToDSPReg(const MachineInstr &MI, bool &isWrite) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case Mips::WRDSP:
+ case Mips::WRDSP_MM:
+ isWrite = true;
+ break;
+ case Mips::RDDSP:
+ case Mips::RDDSP_MM:
+ isWrite = false;
+ break;
+ }
+ return true;
+}
+
+/// We check for the common case of 'or', as it's MIPS' preferred instruction
+/// for GPRs, but we have to check the operands to ensure that is the case.
+/// Other move instructions for MIPS are directly identifiable.
+bool MipsSEInstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ bool isDSPControlWrite = false;
+ // The condition is made to match the creation of the WRDSP/RDDSP copy
+ // instruction in the copyPhysReg function.
+ if (isReadOrWriteToDSPReg(MI, isDSPControlWrite)) {
+ if (!MI.getOperand(1).isImm() || MI.getOperand(1).getImm() != (1<<4))
+ return false;
+ else if (isDSPControlWrite) {
+ Src = &MI.getOperand(0);
+ Dest = &MI.getOperand(2);
+ } else {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(2);
+ }
+ return true;
+ } else if (MI.isMoveReg() || isORCopyInst(MI)) {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+ }
+ return false;
+}
+
void MipsSEInstrInfo::
storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
@@ -379,28 +442,30 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
break;
case Mips::PseudoCVT_D32_W:
- expandCvtFPInt(MBB, MI, Mips::CVT_D32_W, Mips::MTC1, false);
+ Opc = isMicroMips ? Mips::CVT_D32_W_MM : Mips::CVT_D32_W;
+ expandCvtFPInt(MBB, MI, Opc, Mips::MTC1, false);
break;
case Mips::PseudoCVT_S_L:
expandCvtFPInt(MBB, MI, Mips::CVT_S_L, Mips::DMTC1, true);
break;
case Mips::PseudoCVT_D64_W:
- expandCvtFPInt(MBB, MI, Mips::CVT_D64_W, Mips::MTC1, true);
+ Opc = isMicroMips ? Mips::CVT_D64_W_MM : Mips::CVT_D64_W;
+ expandCvtFPInt(MBB, MI, Opc, Mips::MTC1, true);
break;
case Mips::PseudoCVT_D64_L:
expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
break;
case Mips::BuildPairF64:
- expandBuildPairF64(MBB, MI, false);
+ expandBuildPairF64(MBB, MI, isMicroMips, false);
break;
case Mips::BuildPairF64_64:
- expandBuildPairF64(MBB, MI, true);
+ expandBuildPairF64(MBB, MI, isMicroMips, true);
break;
case Mips::ExtractElementF64:
- expandExtractElementF64(MBB, MI, false);
+ expandExtractElementF64(MBB, MI, isMicroMips, false);
break;
case Mips::ExtractElementF64_64:
- expandExtractElementF64(MBB, MI, true);
+ expandExtractElementF64(MBB, MI, isMicroMips, true);
break;
case Mips::MIPSeh_return32:
case Mips::MIPSeh_return64:
@@ -425,6 +490,10 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
case Mips::BGEZ: return Mips::BLTZ;
case Mips::BLTZ: return Mips::BGEZ;
case Mips::BLEZ: return Mips::BGTZ;
+ case Mips::BGTZ_MM: return Mips::BLEZ_MM;
+ case Mips::BGEZ_MM: return Mips::BLTZ_MM;
+ case Mips::BLTZ_MM: return Mips::BGEZ_MM;
+ case Mips::BLEZ_MM: return Mips::BGTZ_MM;
case Mips::BEQ64: return Mips::BNE64;
case Mips::BNE64: return Mips::BEQ64;
case Mips::BGTZ64: return Mips::BLEZ64;
@@ -433,16 +502,40 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
case Mips::BLEZ64: return Mips::BGTZ64;
case Mips::BC1T: return Mips::BC1F;
case Mips::BC1F: return Mips::BC1T;
- case Mips::BEQZC_MM: return Mips::BNEZC_MM;
- case Mips::BNEZC_MM: return Mips::BEQZC_MM;
+ case Mips::BC1T_MM: return Mips::BC1F_MM;
+ case Mips::BC1F_MM: return Mips::BC1T_MM;
+ case Mips::BEQZ16_MM: return Mips::BNEZ16_MM;
+ case Mips::BNEZ16_MM: return Mips::BEQZ16_MM;
+ case Mips::BEQZC_MM: return Mips::BNEZC_MM;
+ case Mips::BNEZC_MM: return Mips::BEQZC_MM;
case Mips::BEQZC: return Mips::BNEZC;
case Mips::BNEZC: return Mips::BEQZC;
- case Mips::BEQC: return Mips::BNEC;
- case Mips::BNEC: return Mips::BEQC;
- case Mips::BGTZC: return Mips::BLEZC;
+ case Mips::BLEZC: return Mips::BGTZC;
case Mips::BGEZC: return Mips::BLTZC;
+ case Mips::BGEC: return Mips::BLTC;
+ case Mips::BGTZC: return Mips::BLEZC;
case Mips::BLTZC: return Mips::BGEZC;
- case Mips::BLEZC: return Mips::BGTZC;
+ case Mips::BLTC: return Mips::BGEC;
+ case Mips::BGEUC: return Mips::BLTUC;
+ case Mips::BLTUC: return Mips::BGEUC;
+ case Mips::BEQC: return Mips::BNEC;
+ case Mips::BNEC: return Mips::BEQC;
+ case Mips::BC1EQZ: return Mips::BC1NEZ;
+ case Mips::BC1NEZ: return Mips::BC1EQZ;
+ case Mips::BEQZC_MMR6: return Mips::BNEZC_MMR6;
+ case Mips::BNEZC_MMR6: return Mips::BEQZC_MMR6;
+ case Mips::BLEZC_MMR6: return Mips::BGTZC_MMR6;
+ case Mips::BGEZC_MMR6: return Mips::BLTZC_MMR6;
+ case Mips::BGEC_MMR6: return Mips::BLTC_MMR6;
+ case Mips::BGTZC_MMR6: return Mips::BLEZC_MMR6;
+ case Mips::BLTZC_MMR6: return Mips::BGEZC_MMR6;
+ case Mips::BLTC_MMR6: return Mips::BGEC_MMR6;
+ case Mips::BGEUC_MMR6: return Mips::BLTUC_MMR6;
+ case Mips::BLTUC_MMR6: return Mips::BGEUC_MMR6;
+ case Mips::BEQC_MMR6: return Mips::BNEC_MMR6;
+ case Mips::BNEC_MMR6: return Mips::BEQC_MMR6;
+ case Mips::BC1EQZC_MMR6: return Mips::BC1NEZC_MMR6;
+ case Mips::BC1NEZC_MMR6: return Mips::BC1EQZC_MMR6;
case Mips::BEQZC64: return Mips::BNEZC64;
case Mips::BNEZC64: return Mips::BEQZC64;
case Mips::BEQC64: return Mips::BNEC64;
@@ -459,6 +552,16 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
case Mips::BBIT1: return Mips::BBIT0;
case Mips::BBIT032: return Mips::BBIT132;
case Mips::BBIT132: return Mips::BBIT032;
+ case Mips::BZ_B: return Mips::BNZ_B;
+ case Mips::BZ_H: return Mips::BNZ_H;
+ case Mips::BZ_W: return Mips::BNZ_W;
+ case Mips::BZ_D: return Mips::BNZ_D;
+ case Mips::BZ_V: return Mips::BNZ_V;
+ case Mips::BNZ_B: return Mips::BZ_B;
+ case Mips::BNZ_H: return Mips::BZ_H;
+ case Mips::BNZ_W: return Mips::BZ_W;
+ case Mips::BNZ_D: return Mips::BZ_D;
+ case Mips::BNZ_V: return Mips::BZ_V;
}
}
@@ -551,7 +654,13 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
Opc == Mips::BGTZC64 || Opc == Mips::BGEZC64 ||
Opc == Mips::BLTZC64 || Opc == Mips::BLEZC64 || Opc == Mips::BC ||
Opc == Mips::BBIT0 || Opc == Mips::BBIT1 || Opc == Mips::BBIT032 ||
- Opc == Mips::BBIT132) ? Opc : 0;
+ Opc == Mips::BBIT132 || Opc == Mips::BC_MMR6 ||
+ Opc == Mips::BEQC_MMR6 || Opc == Mips::BNEC_MMR6 ||
+ Opc == Mips::BLTC_MMR6 || Opc == Mips::BGEC_MMR6 ||
+ Opc == Mips::BLTUC_MMR6 || Opc == Mips::BGEUC_MMR6 ||
+ Opc == Mips::BGTZC_MMR6 || Opc == Mips::BLEZC_MMR6 ||
+ Opc == Mips::BGEZC_MMR6 || Opc == Mips::BLTZC_MMR6 ||
+ Opc == Mips::BEQZC_MMR6 || Opc == Mips::BNEZC_MMR6) ? Opc : 0;
}
void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
@@ -651,6 +760,7 @@ void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
+ bool isMicroMips,
bool FP64) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned SrcReg = I->getOperand(1).getReg();
@@ -682,7 +792,10 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
// We therefore pretend that it reads the bottom 32-bits to
// artificially create a dependency and prevent the scheduler
// changing the behaviour of the code.
- BuildMI(MBB, I, dl, get(FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32), DstReg)
+ BuildMI(MBB, I, dl,
+ get(isMicroMips ? (FP64 ? Mips::MFHC1_D64_MM : Mips::MFHC1_D32_MM)
+ : (FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32)),
+ DstReg)
.addReg(SrcReg);
} else
BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
@@ -690,7 +803,7 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- bool FP64) const {
+ bool isMicroMips, bool FP64) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
@@ -735,7 +848,10 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
// We therefore pretend that it reads the bottom 32-bits to
// artificially create a dependency and prevent the scheduler
// changing the behaviour of the code.
- BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg)
+ BuildMI(MBB, I, dl,
+ get(isMicroMips ? (FP64 ? Mips::MTHC1_D64_MM : Mips::MTHC1_D32_MM)
+ : (FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32)),
+ DstReg)
.addReg(DstReg)
.addReg(HiReg);
} else if (Subtarget.isABI_FPXX())
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
index b356909bf1cf..fc55716d598a 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -47,6 +47,9 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
+
void storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIndex,
@@ -107,9 +110,11 @@ private:
unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
void expandExtractElementF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, bool FP64) const;
+ MachineBasicBlock::iterator I, bool isMicroMips,
+ bool FP64) const;
void expandBuildPairF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, bool FP64) const;
+ MachineBasicBlock::iterator I, bool isMicroMips,
+ bool FP64) const;
void expandEhReturn(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
};
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
index 2ff6b99e78ff..e7d720a4b769 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -88,10 +88,8 @@ static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode,
case Mips::SCE:
return 16;
case Mips::LLE_MM:
- case Mips::LLE_MMR6:
case Mips::LL_MM:
case Mips::SCE_MM:
- case Mips::SCE_MMR6:
case Mips::SC_MM:
return 12;
case Mips::LL64_R6:
@@ -100,6 +98,8 @@ static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode,
case Mips::SC64_R6:
case Mips::SCD_R6:
case Mips::SC_R6:
+ case Mips::LL_MMR6:
+ case Mips::SC_MMR6:
return 9;
case Mips::INLINEASM: {
unsigned ConstraintID = InlineAsm::getMemoryConstraintID(MO.getImm());
@@ -204,7 +204,8 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
Offset = SPOffset + (int64_t)StackSize;
Offset += MI.getOperand(OpNo + 1).getImm();
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ LLVM_DEBUG(errs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
if (!MI.isDebugValue()) {
// Make sure Offset fits within the field available.
diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
index 8ec55ab6284d..64db815a0f4c 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSchedule.td
+++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
@@ -57,6 +57,14 @@ def II_CFC1 : InstrItinClass;
def II_CFC2 : InstrItinClass;
def II_CLO : InstrItinClass;
def II_CLZ : InstrItinClass;
+def II_CRC32B : InstrItinClass;
+def II_CRC32CB : InstrItinClass;
+def II_CRC32CD : InstrItinClass;
+def II_CRC32CH : InstrItinClass;
+def II_CRC32CW : InstrItinClass;
+def II_CRC32D : InstrItinClass;
+def II_CRC32H : InstrItinClass;
+def II_CRC32W : InstrItinClass;
def II_CTC1 : InstrItinClass;
def II_CTC2 : InstrItinClass;
def II_CVT : InstrItinClass;
@@ -84,8 +92,10 @@ def II_DIVU : InstrItinClass;
def II_DIV_D : InstrItinClass;
def II_DIV_S : InstrItinClass;
def II_DMFC0 : InstrItinClass;
+def II_DMFGC0 : InstrItinClass;
def II_DMT : InstrItinClass;
def II_DMTC0 : InstrItinClass;
+def II_DMTGC0 : InstrItinClass;
def II_DMFC1 : InstrItinClass;
def II_DMTC1 : InstrItinClass;
def II_DMOD : InstrItinClass;
@@ -120,6 +130,9 @@ def II_EVPE : InstrItinClass;
def II_EXT : InstrItinClass; // Any EXT instruction
def II_FLOOR : InstrItinClass;
def II_FORK : InstrItinClass;
+def II_GINVI : InstrItinClass;
+def II_GINVT : InstrItinClass;
+def II_HYPCALL : InstrItinClass;
def II_INS : InstrItinClass; // Any INS instruction
def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo.
def II_J : InstrItinClass;
@@ -225,6 +238,8 @@ def II_MFHC0 : InstrItinClass;
def II_MFC1 : InstrItinClass;
def II_MFHC1 : InstrItinClass;
def II_MFC2 : InstrItinClass;
+def II_MFGC0 : InstrItinClass;
+def II_MFHGC0 : InstrItinClass;
def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo
def II_MFTR : InstrItinClass;
def II_MOD : InstrItinClass;
@@ -255,6 +270,8 @@ def II_MTHC0 : InstrItinClass;
def II_MTC1 : InstrItinClass;
def II_MTHC1 : InstrItinClass;
def II_MTC2 : InstrItinClass;
+def II_MTGC0 : InstrItinClass;
+def II_MTHGC0 : InstrItinClass;
def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo
def II_MTTR : InstrItinClass;
def II_MUL : InstrItinClass;
@@ -346,6 +363,12 @@ def II_CACHEE : InstrItinClass;
def II_PREFE : InstrItinClass;
def II_LLE : InstrItinClass;
def II_SCE : InstrItinClass;
+def II_TLBGINV : InstrItinClass;
+def II_TLBGINVF : InstrItinClass;
+def II_TLBGP : InstrItinClass;
+def II_TLBGR : InstrItinClass;
+def II_TLBGWI : InstrItinClass;
+def II_TLBGWR : InstrItinClass;
def II_TLBINV : InstrItinClass;
def II_TLBINVF : InstrItinClass;
def II_WRPGPR : InstrItinClass;
@@ -686,5 +709,28 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_RDPGPR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DVP , [InstrStage<1, [ALU]>]>,
InstrItinData<II_EVP , [InstrStage<1, [ALU]>]>,
- InstrItinData<II_YIELD , [InstrStage<5, [ALU]>]>
+ InstrItinData<II_YIELD , [InstrStage<5, [ALU]>]>,
+ InstrItinData<II_CRC32B , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32H , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32W , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CW , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CRC32CD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MFGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFHGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTHGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_HYPCALL , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGINV , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGINVF , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGP , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGR , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGWI , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_TLBGWR , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMFGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTGC0 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_GINVI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_GINVT , [InstrStage<1, [ALU]>]>
]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
index 744392c320ef..79c55dbb9e03 100644
--- a/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
@@ -25,8 +25,11 @@ def MipsGenericModel : SchedMachineModel {
int HighLatency = 37;
list<Predicate> UnsupportedFeatures = [];
- let CompleteModel = 1;
+ let CompleteModel = 0;
let PostRAScheduler = 1;
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
}
let SchedModel = MipsGenericModel in {
@@ -71,12 +74,12 @@ def : ItinRW<[GenericWriteMDUtoGPR], [II_MUL]>;
def GenericWriteDIV : SchedWriteRes<[GenericIssueDIV]> {
// Estimated worst case
let Latency = 33;
- let ResourceCycles = [1, 33];
+ let ResourceCycles = [33];
}
def GenericWriteDIVU : SchedWriteRes<[GenericIssueDIV]> {
// Estimated worst case
let Latency = 31;
- let ResourceCycles = [1, 31];
+ let ResourceCycles = [31];
}
def : ItinRW<[GenericWriteDIV], [II_DIV]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
index 440f93d5b7eb..846fa11494c7 100644
--- a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
@@ -13,14 +13,13 @@ def MipsP5600Model : SchedMachineModel {
int LoadLatency = 4;
int MispredictPenalty = 8; // TODO: Estimated
- let CompleteModel = 1;
+ let CompleteModel = 0;
+ let FullInstRWOverlapCheck = 1;
list<Predicate> UnsupportedFeatures = [HasMips32r6, HasMips64r6,
- HasMips64, HasMips64r2, HasCnMips,
+ HasMips3, HasMips64r2, HasCnMips,
InMicroMips, InMips16Mode,
- HasMicroMips32r6, HasDSP,
- HasDSPR2, HasMT];
-
+ HasDSP, HasDSPR2, HasMT, HasCRC];
}
let SchedModel = MipsP5600Model in {
@@ -37,9 +36,8 @@ def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; }
def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>;
// and, lui, nor, or, slti, sltiu, sub, subu, xor
-def : ItinRW<[P5600WriteALU],
- [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUB, II_SUBU,
- II_XOR]>;
+def : InstRW<[P5600WriteALU], (instrs AND, LUi, NOR, OR, SLTi, SLTiu, SUB,
+ SUBu, XOR)>;
// AGQ Pipelines
// =============
@@ -63,20 +61,35 @@ def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> {
// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal,
// jalr, jr.hb, jr
-def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR,
- II_JR_HB, II_DERET, II_ERET, II_ERETNC,
- II_SYSCALL, II_BREAK, II_SDBBP, II_SSNOP,
- II_TEQ, II_TEQI, II_TGE, II_TGEI, II_TGEIU,
- II_TGEU, II_TLT, II_TLTI, II_TLTU, II_TNE,
- II_TNEI, II_TRAP, II_TTLTIU, II_WAIT,
- II_PAUSE]>;
-
-def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB]>;
+def : InstRW<[P5600WriteJump], (instrs B, BAL, BAL_BR, BEQ, BEQL, BGEZ, BGEZAL,
+ BGEZALL, BGEZL, BGTZ, BGTZL, BLEZ, BLEZL, BLTZ,
+ BLTZAL, BLTZALL, BLTZL, BNE, BNEL, BREAK,
+ DERET, ERET, ERETNC, J, JR, JR_HB,
+ PseudoIndirectBranch,
+ PseudoIndirectHazardBranch, PseudoReturn,
+ SDBBP, SSNOP, SYSCALL, TAILCALL, TAILCALLREG,
+ TAILCALLREGHB, TEQ, TEQI, TGE, TGEI, TGEIU,
+ TGEU, TLT, TLTI, TLTU, TNE, TNEI, TRAP,
+ TTLTIU, WAIT, PAUSE)>;
+
+def : InstRW<[P5600WriteJumpAndLink], (instrs JAL, JALR, JALRHBPseudo,
+ JALRPseudo, JALR_HB)>;
+
+def : InstRW<[P5600WriteJumpAndLink], (instrs JALX)> {
+ let Unsupported = 1;
+}
def P5600COP0 : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
-def : ItinRW<[P5600COP0], [II_TLBINV, II_TLBINVF, II_TLBP, II_TLBR, II_TLBWI,
- II_TLBWR, II_MFC0, II_MTC0]>;
+def : InstRW<[P5600COP0], (instrs TLBINV, TLBINVF, TLBP, TLBR, TLBWI, TLBWR,
+ MFC0, MTC0)>;
+
+def P5600COP2 : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
+
+def : InstRW<[P5600COP2], (instrs MFC2, MTC2)> {
+ let Unsupported = 1;
+}
+
// LDST Pipeline
// -------------
@@ -106,21 +119,20 @@ def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
}
// l[bhw], l[bh]u, ll
-def : ItinRW<[P5600WriteLoad], [II_LB, II_LBE, II_LBU, II_LBUE, II_LH, II_LHE,
- II_LHU, II_LHUE, II_LW, II_LWE, II_LL, II_LLE,
- II_LWPC]>;
+def : InstRW<[P5600WriteLoad], (instrs LB, LBu, LH, LHu, LW, LL, LWC2, LWC3,
+ LDC2, LDC3, LBE, LBuE, LHE, LHuE, LWE, LLE,
+ LWPC)>;
// lw[lr]
-def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWLE, II_LWR, II_LWRE]>;
+def : InstRW<[P5600WriteLoadShifted], (instrs LWL, LWR, LWLE, LWRE)>;
// s[bhw], sw[lr]
-def : ItinRW<[P5600WriteStore], [II_SB, II_SBE, II_SH, II_SHE, II_SW, II_SWE,
- II_SWL, II_SWLE, II_SWR, II_SWRE, II_SC,
- II_SCE]>;
+def : InstRW<[P5600WriteStore], (instrs SB, SH, SW, SWC2, SWC3, SDC2, SDC3, SC,
+ SBE, SHE, SWE, SCE, SWL, SWR, SWLE, SWRE)>;
// pref, cache, sync, synci
-def : ItinRW<[P5600WriteCache], [II_PREF, II_PREFE, II_CACHE, II_CACHEE,
- II_SYNC, II_SYNCI]>;
+def : InstRW<[P5600WriteCache], (instrs PREF, PREFE, CACHE, CACHEE, SYNC,
+ SYNCI)>;
// LDST is also used in moves from general purpose registers to floating point
// and MSA.
@@ -154,28 +166,31 @@ def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
}
// clo, clz, di, ei, mfhi, mflo
-def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_DI, II_EI, II_MFHI_MFLO]>;
+def : InstRW<[P5600WriteAL2], (instrs CLO, CLZ, DI, EI, MFHI, MFLO,
+ PseudoMFHI, PseudoMFLO)>;
// ehb, rdhwr, rdpgpr, wrpgpr, wsbh
-def : ItinRW<[P5600WriteAL2ShadowMov], [II_EHB, II_RDHWR, II_WSBH]>;
+def : InstRW<[P5600WriteAL2ShadowMov], (instrs EHB, RDHWR, WSBH)>;
// mov[nz]
-def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>;
+def : InstRW<[P5600WriteAL2CondMov], (instrs MOVN_I_I, MOVZ_I_I)>;
// divu?
-def : ItinRW<[P5600WriteAL2Div], [II_DIV]>;
-def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>;
+def : InstRW<[P5600WriteAL2Div], (instrs DIV, PseudoSDIV, SDIV)>;
+def : InstRW<[P5600WriteAL2DivU], (instrs DIVU, PseudoUDIV, UDIV)>;
// mul
-def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>;
+def : InstRW<[P5600WriteAL2Mul], (instrs MUL)>;
// multu?, multu?
-def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>;
+def : InstRW<[P5600WriteAL2Mult], (instrs MULT, MULTu, PseudoMULT,
+ PseudoMULTu)>;
// maddu?, msubu?, mthi, mtlo
-def : ItinRW<[P5600WriteAL2MAdd],
- [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>;
+def : InstRW<[P5600WriteAL2MAdd], (instrs MADD, MADDU, MSUB, MSUBU,
+ MTHI, MTLO, PseudoMADD, PseudoMADDU,
+ PseudoMSUB, PseudoMSUBU, PseudoMTLOHI)>;
// ext, ins
-def : ItinRW<[P5600WriteAL2BitExt], [II_EXT, II_INS]>;
+def : InstRW<[P5600WriteAL2BitExt], (instrs EXT, INS)>;
// Either ALU or AL2 Pipelines
// ---------------------------
@@ -193,11 +208,9 @@ def P5600WriteEitherALU : SchedWriteVariant<
// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu,
// xori
-def : ItinRW<[P5600WriteEitherALU],
- [II_ADD, II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH,
- II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV,
- II_SRAV, II_SRLV, II_LSA]>;
-def : InstRW<[], (instrs COPY)>;
+def : InstRW<[P5600WriteEitherALU], (instrs ADD, ADDi, ADDiu, ANDi, ORi, ROTR,
+ SEB, SEH, SLT, SLTu, SLL, SRA, SRL, XORi,
+ ADDu, SLLV, SRAV, SRLV, LSA, COPY)>;
// FPU Pipelines
// =============
@@ -300,8 +313,10 @@ def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>;
// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1,
// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b,
// sdxc1, sdc1, st.[bhwd], swc1, swxc1
-def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D,
- II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>;
+def : InstRW<[P5600WriteFPUS], (instrs FABS_S, FABS_D32, FABS_D64, MOVF_D32,
+ MOVF_D64, MOVF_S, MOVT_D32, MOVT_D64,
+ MOVT_S, FMOV_D32, FMOV_D64, FMOV_S, FNEG_S,
+ FNEG_D32, FNEG_D64)>;
// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
// aver?_[us].[bhwd], shf.[bhw], fill[bhwd], splat?.[bhwd]
@@ -321,23 +336,6 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
-// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
-// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
-def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>;
-
-// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
-def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
-
// fexp2_w, fexp2_d
def : InstRW<[P5600WriteFPUS], (instregex "^FEXP2_(W|D)$")>;
@@ -424,7 +422,6 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRLR|SRLRI)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
-def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
// Long Pipe
// ----------
@@ -432,24 +429,31 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
// trunc.w.[ds], trunc.w.ps
-def : ItinRW<[P5600WriteFPUL],
- [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D,
- II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>;
+def : InstRW<[P5600WriteFPUL],
+ (instrs FADD_D32, FADD_D64, FADD_S, FMUL_D32, FMUL_D64, FMUL_S,
+ FSUB_D32, FSUB_D64, FSUB_S)>;
+def : InstRW<[P5600WriteFPUL], (instregex "^TRUNC_(L|W)_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL],
+ (instregex "^CVT_(S|D32|D64|L|W)_(S|D32|D64|L|W)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^C_[A-Z]+_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FCMP_(S32|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^PseudoCVT_(S|D32|D64)_(L|W)$")>;
// div.[ds], div.ps
-def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>;
-def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>;
+def : InstRW<[P5600WriteFPUDivS], (instrs FDIV_S)>;
+def : InstRW<[P5600WriteFPUDivD], (instrs FDIV_D32, FDIV_D64)>;
// sqrt.[ds], sqrt.ps
-def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>;
-def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>;
+def : InstRW<[P5600WriteFPUSqrtS], (instrs FSQRT_S)>;
+def : InstRW<[P5600WriteFPUSqrtD], (instrs FSQRT_D32, FSQRT_D64)>;
// frcp.[wd], frsqrt.[wd]
def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRCP_(W|D)$")>;
def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRSQRT_(W|D)$")>;
-def : ItinRW<[P5600WriteFPURsqrtD], [II_RECIP_D, II_RSQRT_D]>;
-def : ItinRW<[P5600WriteFPURsqrtS], [II_RECIP_S, II_RSQRT_S]>;
+def : InstRW<[P5600WriteFPURsqrtD], (instrs RECIP_D32, RECIP_D64, RSQRT_D32,
+ RSQRT_D64)>;
+def : InstRW<[P5600WriteFPURsqrtS], (instrs RECIP_S, RSQRT_S)>;
// fmadd.[wd], fmsubb.[wd], fdiv.[wd], fsqrt.[wd], fmul.[wd], fadd.[wd],
// fsub.[wd]
@@ -481,9 +485,9 @@ def : InstRW<[P5600WriteMSALongInt], (instregex "^MUL_Q_[HW]$")>;
// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds],
// Operand 0 is read on cycle 5. All other operands are read on operand 0.
-def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
- [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D,
- II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>;
+def : InstRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
+ (instrs MADD_D32, MADD_D64, MADD_S, MSUB_D32, MSUB_D64, MSUB_S,
+ NMADD_D32, NMADD_D64, NMADD_S, NMSUB_D32, NMSUB_D64, NMSUB_S)>;
// madd.ps, msub.ps, nmadd.ps, nmsub.ps
// Operand 0 and 1 are read on cycle 5. All others are read on operand 0.
@@ -536,26 +540,30 @@ def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits,
P5600WriteLoadOtherUnitsToFPU]>;
// ctc1, mtc1, mthc1
-def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>;
+def : InstRW<[P5600WriteMoveGPRToFPU], (instrs CTC1, MTC1, MTC1_D64, MTHC1_D32,
+ MTHC1_D64, BuildPairF64,
+ BuildPairF64_64)>;
// copy.[su]_[bhwd]
def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_U_[BHW]$")>;
def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_S_[BHWD]$")>;
// bc1[ft], cfc1, mfc1, mfhc1, movf, movt
-def : ItinRW<[P5600WriteMoveFPUToGPR],
- [II_BC1F, II_BC1FL, II_BC1T, II_BC1TL, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>;
+def : InstRW<[P5600WriteMoveFPUToGPR], (instrs BC1F, BC1FL, BC1T, BC1TL, CFC1,
+ MFC1, MFC1_D64, MFHC1_D32, MFHC1_D64,
+ MOVF_I, MOVT_I, ExtractElementF64,
+ ExtractElementF64_64)>;
// swc1, swxc1, st.[bhwd]
-def : ItinRW<[P5600WriteStoreFPUS], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1,
- II_SWXC1]>;
+def : InstRW<[P5600WriteStoreFPUS], (instrs SDC1, SDXC1, SUXC1, SWC1, SWXC1)>;
def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
// movn.[ds], movz.[ds]
-def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>;
+def : InstRW<[P5600WriteStoreFPUL], (instrs MOVN_I_D32, MOVN_I_D64, MOVN_I_S,
+ MOVZ_I_D32, MOVZ_I_D64, MOVZ_I_S)>;
// l[dw]x?c1, ld.[bhwd]
-def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1, II_LUXC1]>;
+def : InstRW<[P5600WriteLoadFPU], (instrs LDC1, LDXC1, LWC1, LWXC1, LUXC1)>;
def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
// Unsupported Instructions
@@ -577,10 +585,12 @@ def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
// ceil.[lw].[ds], floor.[lw].[ds]
// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL
-def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>;
+def : InstRW<[P5600WriteFPUL], (instregex "^CEIL_(L|W)_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FLOOR_(L|W)_(S|D32|D64)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^ROUND_(L|W)_(S|D32|D64)$")>;
// rotrv
// Reason behind guess: rotr is in the same category and the two register forms
// generally follow the immediate forms in this category
-def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>;
+def : InstRW<[P5600WriteEitherALU], (instrs ROTRV)>;
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
index ddaa07ea9bc1..0c39a45467c4 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -16,6 +16,9 @@
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
#include "MipsTargetMachine.h"
+#include "MipsCallLowering.h"
+#include "MipsLegalizerInfo.h"
+#include "MipsRegisterBankInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
@@ -57,6 +60,12 @@ static cl::opt<bool>
GPOpt("mgpopt", cl::Hidden,
cl::desc("Enable gp-relative addressing of mips small data items"));
+bool MipsSubtarget::DspWarningPrinted = false;
+bool MipsSubtarget::MSAWarningPrinted = false;
+bool MipsSubtarget::VirtWarningPrinted = false;
+bool MipsSubtarget::CRCWarningPrinted = false;
+bool MipsSubtarget::GINVWarningPrinted = false;
+
void MipsSubtarget::anchor() {}
MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
@@ -71,8 +80,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
- HasEVA(false), DisableMadd4(false), HasMT(false),
- UseIndirectJumpsHazard(false), StackAlignOverride(StackAlignOverride),
+ HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
+ HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
+ StackAlignOverride(StackAlignOverride),
TM(TM), TargetTriple(TT), TSInfo(),
InstrInfo(
MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
@@ -108,6 +118,8 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
if (hasMips64r6() && InMicroMipsMode)
report_fatal_error("microMIPS64R6 is not supported", false);
+ if (!isABI_O32() && InMicroMipsMode)
+ report_fatal_error("microMIPS64 is not supported.", false);
if (UseIndirectJumpsHazard) {
if (InMicroMipsMode)
@@ -139,6 +151,59 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
<< "\n";
UseSmallSection = false;
}
+
+ if (hasDSPR2() && !DspWarningPrinted) {
+ if (hasMips64() && !hasMips64r2()) {
+ errs() << "warning: the 'dspr2' ASE requires MIPS64 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ } else if (hasMips32() && !hasMips32r2()) {
+ errs() << "warning: the 'dspr2' ASE requires MIPS32 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ }
+ } else if (hasDSP() && !DspWarningPrinted) {
+ if (hasMips64() && !hasMips64r2()) {
+ errs() << "warning: the 'dsp' ASE requires MIPS64 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ } else if (hasMips32() && !hasMips32r2()) {
+ errs() << "warning: the 'dsp' ASE requires MIPS32 revision 2 or "
+ << "greater\n";
+ DspWarningPrinted = true;
+ }
+ }
+
+ StringRef ArchName = hasMips64() ? "MIPS64" : "MIPS32";
+
+ if (!hasMips32r5() && hasMSA() && !MSAWarningPrinted) {
+ errs() << "warning: the 'msa' ASE requires " << ArchName
+ << " revision 5 or greater\n";
+ MSAWarningPrinted = true;
+ }
+ if (!hasMips32r5() && hasVirt() && !VirtWarningPrinted) {
+ errs() << "warning: the 'virt' ASE requires " << ArchName
+ << " revision 5 or greater\n";
+ VirtWarningPrinted = true;
+ }
+ if (!hasMips32r6() && hasCRC() && !CRCWarningPrinted) {
+ errs() << "warning: the 'crc' ASE requires " << ArchName
+ << " revision 6 or greater\n";
+ CRCWarningPrinted = true;
+ }
+ if (!hasMips32r6() && hasGINV() && !GINVWarningPrinted) {
+ errs() << "warning: the 'ginv' ASE requires " << ArchName
+ << " revision 6 or greater\n";
+ GINVWarningPrinted = true;
+ }
+
+ CallLoweringInfo.reset(new MipsCallLowering(*getTargetLowering()));
+ Legalizer.reset(new MipsLegalizerInfo(*this));
+
+ auto *RBI = new MipsRegisterBankInfo(*getRegisterInfo());
+ RegBankInfo.reset(RBI);
+ InstSelector.reset(createMipsInstructionSelector(
+ *static_cast<const MipsTargetMachine *>(&TM), *this, *RBI));
}
bool MipsSubtarget::isPositionIndependent() const {
@@ -184,7 +249,8 @@ MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
}
bool MipsSubtarget::useConstantIslands() {
- DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+ LLVM_DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands
+ << "\n");
return Mips16ConstantIslands;
}
@@ -196,3 +262,19 @@ bool MipsSubtarget::isABI_N64() const { return getABI().IsN64(); }
bool MipsSubtarget::isABI_N32() const { return getABI().IsN32(); }
bool MipsSubtarget::isABI_O32() const { return getABI().IsO32(); }
const MipsABIInfo &MipsSubtarget::getABI() const { return TM.getABI(); }
+
+const CallLowering *MipsSubtarget::getCallLowering() const {
+ return CallLoweringInfo.get();
+}
+
+const LegalizerInfo *MipsSubtarget::getLegalizerInfo() const {
+ return Legalizer.get();
+}
+
+const RegisterBankInfo *MipsSubtarget::getRegBankInfo() const {
+ return RegBankInfo.get();
+}
+
+const InstructionSelector *MipsSubtarget::getInstructionSelector() const {
+ return InstSelector.get();
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
index ad2905c51601..676d702ba63e 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -20,6 +20,10 @@
#include "MipsInstrInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/ErrorHandling.h"
@@ -44,6 +48,21 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
enum class CPU { P5600 };
+ // Used to avoid printing dsp warnings multiple times.
+ static bool DspWarningPrinted;
+
+ // Used to avoid printing msa warnings multiple times.
+ static bool MSAWarningPrinted;
+
+ // Used to avoid printing crc warnings multiple times.
+ static bool CRCWarningPrinted;
+
+ // Used to avoid printing ginv warnings multiple times.
+ static bool GINVWarningPrinted;
+
+ // Used to avoid printing virt warnings multiple times.
+ static bool VirtWarningPrinted;
+
// Mips architecture version
MipsArchEnum MipsArchVersion;
@@ -152,6 +171,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// HasMT -- support MT ASE.
bool HasMT;
+ // HasCRC -- supports R6 CRC ASE
+ bool HasCRC;
+
+ // HasVirt -- supports Virtualization ASE
+ bool HasVirt;
+
+ // HasGINV -- supports R6 Global INValidate ASE
+ bool HasGINV;
+
// Use hazard variants of the jump register instructions for indirect
// function calls and jump tables.
bool UseIndirectJumpsHazard;
@@ -276,6 +304,9 @@ public:
bool disableMadd4() const { return DisableMadd4; }
bool hasEVA() const { return HasEVA; }
bool hasMT() const { return HasMT; }
+ bool hasCRC() const { return HasCRC; }
+ bool hasVirt() const { return HasVirt; }
+ bool hasGINV() const { return HasGINV; }
bool useIndirectJumpsHazard() const {
return UseIndirectJumpsHazard && hasMips32r2();
}
@@ -343,6 +374,19 @@ public:
const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
+
+protected:
+ // GlobalISel related APIs.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+
+public:
+ const CallLowering *getCallLowering() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+ const InstructionSelector *getInstructionSelector() const override;
};
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index fb79a4bf40c5..1e6fe2b9f7e7 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -23,6 +23,10 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
@@ -46,6 +50,12 @@ extern "C" void LLVMInitializeMipsTarget() {
RegisterTargetMachine<MipselTargetMachine> Y(getTheMipselTarget());
RegisterTargetMachine<MipsebTargetMachine> A(getTheMips64Target());
RegisterTargetMachine<MipselTargetMachine> B(getTheMips64elTarget());
+
+ PassRegistry *PR = PassRegistry::getPassRegistry();
+ initializeGlobalISel(*PR);
+ initializeMipsDelaySlotFillerPass(*PR);
+ initializeMipsBranchExpansionPass(*PR);
+ initializeMicroMipsSizeReducePass(*PR);
}
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
@@ -198,7 +208,7 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
}
void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
- DEBUG(dbgs() << "resetSubtarget\n");
+ LLVM_DEBUG(dbgs() << "resetSubtarget\n");
Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(MF->getFunction()));
MF->setSubtarget(Subtarget);
@@ -230,6 +240,11 @@ public:
bool addInstSelector() override;
void addPreEmitPass() override;
void addPreRegAlloc() override;
+ void addPreEmit2();
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
};
} // end anonymous namespace
@@ -262,26 +277,62 @@ void MipsPassConfig::addPreRegAlloc() {
TargetTransformInfo
MipsTargetMachine::getTargetTransformInfo(const Function &F) {
if (Subtarget->allowMixed16_32()) {
- DEBUG(errs() << "No Target Transform Info Pass Added\n");
+ LLVM_DEBUG(errs() << "No Target Transform Info Pass Added\n");
// FIXME: This is no longer necessary as the TTI returned is per-function.
return TargetTransformInfo(F.getParent()->getDataLayout());
}
- DEBUG(errs() << "Target Transform Info Pass Added\n");
+ LLVM_DEBUG(errs() << "Target Transform Info Pass Added\n");
return TargetTransformInfo(BasicTTIImpl(this, F));
}
+void MipsPassConfig::addPreEmit2() {
+}
+
// Implemented by targets that want to run passes immediately before
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
void MipsPassConfig::addPreEmitPass() {
- addPass(createMicroMipsSizeReductionPass());
+ // Expand pseudo instructions that are sensitive to register allocation.
+ addPass(createMipsExpandPseudoPass());
- // The delay slot filler and the long branch passes can potientially create
- // forbidden slot/ hazards for MIPSR6 which the hazard schedule pass will
- // fix. Any new pass must come before the hazard schedule pass.
+ // The microMIPS size reduction pass performs instruction reselection for
+ // instructions that can be remapped to a 16-bit instruction.
+ addPass(createMicroMipsSizeReducePass());
+
+ // The delay slot filler pass can potentially create forbidden slot hazards
+ // for MIPSR6 and therefore it should go before the MipsBranchExpansion pass.
addPass(createMipsDelaySlotFillerPass());
- addPass(createMipsLongBranchPass());
- addPass(createMipsHazardSchedule());
+
+ // This pass expands branches and takes care of the forbidden slot hazards.
+ // Expanding branches may potentially create forbidden slot hazards for
+ // MIPSR6, and fixing such a hazard may potentially break a branch by extending
+ // its offset out of range. That's why this pass combines these two tasks and
+ // runs them alternately until one of them finishes without any changes. Only
+ // then can we be sure that all branches are expanded properly and no hazards
+ // exist.
+ // Any new pass should go before this pass.
+ addPass(createMipsBranchExpansion());
+
addPass(createMipsConstantIslandPass());
}
+
+bool MipsPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool MipsPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+
+bool MipsPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+
+bool MipsPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
index 56e6e5d8daa2..d9b73d151119 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
@@ -54,7 +54,7 @@ public:
const MipsSubtarget *getSubtargetImpl(const Function &F) const override;
- /// \brief Reset the subtarget for the Mips target.
+ /// Reset the subtarget for the Mips target.
void resetSubtarget(MachineFunction *MF);
// Pass Pipeline Configuration
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
index 42473aac7288..a282366f6d40 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -42,6 +42,12 @@ public:
virtual void emitDirectiveSetNoMsa();
virtual void emitDirectiveSetMt();
virtual void emitDirectiveSetNoMt();
+ virtual void emitDirectiveSetCRC();
+ virtual void emitDirectiveSetNoCRC();
+ virtual void emitDirectiveSetVirt();
+ virtual void emitDirectiveSetNoVirt();
+ virtual void emitDirectiveSetGINV();
+ virtual void emitDirectiveSetNoGINV();
virtual void emitDirectiveSetAt();
virtual void emitDirectiveSetAtWithArg(unsigned RegNo);
virtual void emitDirectiveSetNoAt();
@@ -103,6 +109,12 @@ public:
virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value);
virtual void emitDirectiveSetOddSPReg();
virtual void emitDirectiveSetNoOddSPReg();
+ virtual void emitDirectiveModuleCRC();
+ virtual void emitDirectiveModuleNoCRC();
+ virtual void emitDirectiveModuleVirt();
+ virtual void emitDirectiveModuleNoVirt();
+ virtual void emitDirectiveModuleGINV();
+ virtual void emitDirectiveModuleNoGINV();
void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
const MCSubtargetInfo *STI);
@@ -213,6 +225,12 @@ public:
void emitDirectiveSetNoMsa() override;
void emitDirectiveSetMt() override;
void emitDirectiveSetNoMt() override;
+ void emitDirectiveSetCRC() override;
+ void emitDirectiveSetNoCRC() override;
+ void emitDirectiveSetVirt() override;
+ void emitDirectiveSetNoVirt() override;
+ void emitDirectiveSetGINV() override;
+ void emitDirectiveSetNoGINV() override;
void emitDirectiveSetAt() override;
void emitDirectiveSetAtWithArg(unsigned RegNo) override;
void emitDirectiveSetNoAt() override;
@@ -278,6 +296,12 @@ public:
void emitDirectiveModuleSoftFloat() override;
void emitDirectiveModuleHardFloat() override;
void emitDirectiveModuleMT() override;
+ void emitDirectiveModuleCRC() override;
+ void emitDirectiveModuleNoCRC() override;
+ void emitDirectiveModuleVirt() override;
+ void emitDirectiveModuleNoVirt() override;
+ void emitDirectiveModuleGINV() override;
+ void emitDirectiveModuleNoGINV() override;
void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override;
void emitDirectiveSetOddSPReg() override;
void emitDirectiveSetNoOddSPReg() override;
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index bdd0f156c8af..f6cbd23f01c4 100644
--- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -13,16 +13,9 @@
#include "NVPTXMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
-// -debug-compile - Command line option to inform opt and llc passes to
-// compile for debugging
-static cl::opt<bool> CompileForDebugging("debug-compile",
- cl::desc("Compile for debugging"),
- cl::Hidden, cl::init(false));
-
void NVPTXMCAsmInfo::anchor() {}
NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
@@ -37,7 +30,7 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
InlineAsmStart = " begin inline asm";
InlineAsmEnd = " end inline asm";
- SupportsDebugInformation = CompileForDebugging;
+ SupportsDebugInformation = true;
// PTX does not allow .align on functions.
HasFunctionAlignment = false;
HasDotTypeDotSizeDirective = false;
@@ -45,13 +38,16 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid;
ProtectedVisibilityAttr = MCSA_Invalid;
- Data8bitsDirective = " .b8 ";
- Data16bitsDirective = " .b16 ";
- Data32bitsDirective = " .b32 ";
- Data64bitsDirective = " .b64 ";
- ZeroDirective = " .b8";
- AsciiDirective = " .b8";
- AscizDirective = " .b8";
+ // FIXME: remove comment once debug info is properly supported.
+ Data8bitsDirective = "// .b8 ";
+ Data16bitsDirective = nullptr; // not supported
+ Data32bitsDirective = "// .b32 ";
+ Data64bitsDirective = "// .b64 ";
+ ZeroDirective = "// .b8";
+ AsciiDirective = nullptr; // not supported
+ AscizDirective = nullptr; // not supported
+ SupportsQuotedNames = false;
+ SupportsExtendedDwarfLocDirective = false;
// @TODO: Can we just disable this?
WeakDirective = "\t// .weak\t";
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 9ac3c8850f75..9fd7600cf67f 100644
--- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -25,6 +25,17 @@ class NVPTXMCAsmInfo : public MCAsmInfo {
public:
explicit NVPTXMCAsmInfo(const Triple &TheTriple);
+
+ /// Return true if the .section directive should be omitted when
+ /// emitting \p SectionName. For example:
+ ///
+ /// shouldOmitSectionDirective(".text")
+ ///
+ /// returns false => .section .text,#alloc,#execinstr
+ /// returns true => .text
+ bool shouldOmitSectionDirective(StringRef SectionName) const override {
+ return true;
+ }
};
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 12f992749366..b1a77a17ec15 100644
--- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -11,9 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "NVPTXMCTargetDesc.h"
#include "InstPrinter/NVPTXInstPrinter.h"
#include "NVPTXMCAsmInfo.h"
+#include "NVPTXMCTargetDesc.h"
+#include "NVPTXTargetStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -58,6 +59,12 @@ static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T,
return nullptr;
}
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &,
+ MCInstPrinter *, bool) {
+ return new NVPTXTargetStreamer(S);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeNVPTXTargetMC() {
for (Target *T : {&getTheNVPTXTarget32(), &getTheNVPTXTarget64()}) {
@@ -75,5 +82,8 @@ extern "C" void LLVMInitializeNVPTXTargetMC() {
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(*T, createNVPTXMCInstPrinter);
+
+ // Register the MCTargetStreamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createTargetAsmStreamer);
}
}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
new file mode 100644
index 000000000000..aeb90eca3a05
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -0,0 +1,94 @@
+//=====- NVPTXTargetStreamer.cpp - NVPTXTargetStreamer class ------------=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTXTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+
+using namespace llvm;
+
+//
+// NVPTXTargetStreamer Implementation
+//
+NVPTXTargetStreamer::NVPTXTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+NVPTXTargetStreamer::~NVPTXTargetStreamer() = default;
+
+void NVPTXTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
+ DwarfFiles.emplace_back(Directive);
+}
+
+static bool isDwarfSection(const MCObjectFileInfo *FI,
+ const MCSection *Section) {
+ // FIXME: the checks for the DWARF sections are very fragile and should be
+ // fixed up in a followup patch.
+ if (!Section || Section->getKind().isText() ||
+ Section->getKind().isWriteable())
+ return false;
+ return Section == FI->getDwarfAbbrevSection() ||
+ Section == FI->getDwarfInfoSection() ||
+ Section == FI->getDwarfMacinfoSection() ||
+ Section == FI->getDwarfFrameSection() ||
+ Section == FI->getDwarfAddrSection() ||
+ Section == FI->getDwarfRangesSection() ||
+ Section == FI->getDwarfARangesSection() ||
+ Section == FI->getDwarfLocSection() ||
+ Section == FI->getDwarfStrSection() ||
+ Section == FI->getDwarfLineSection() ||
+ Section == FI->getDwarfStrOffSection() ||
+ Section == FI->getDwarfLineStrSection() ||
+ Section == FI->getDwarfPubNamesSection() ||
+ Section == FI->getDwarfPubTypesSection() ||
+ Section == FI->getDwarfSwiftASTSection() ||
+ Section == FI->getDwarfTypesDWOSection() ||
+ Section == FI->getDwarfAbbrevDWOSection() ||
+ Section == FI->getDwarfAccelObjCSection() ||
+ Section == FI->getDwarfAccelNamesSection() ||
+ Section == FI->getDwarfAccelTypesSection() ||
+ Section == FI->getDwarfAccelNamespaceSection() ||
+ Section == FI->getDwarfLocDWOSection() ||
+ Section == FI->getDwarfStrDWOSection() ||
+ Section == FI->getDwarfCUIndexSection() ||
+ Section == FI->getDwarfInfoDWOSection() ||
+ Section == FI->getDwarfLineDWOSection() ||
+ Section == FI->getDwarfTUIndexSection() ||
+ Section == FI->getDwarfStrOffDWOSection() ||
+ Section == FI->getDwarfDebugNamesSection() ||
+ Section == FI->getDwarfDebugInlineSection() ||
+ Section == FI->getDwarfGnuPubNamesSection() ||
+ Section == FI->getDwarfGnuPubTypesSection();
+}
+
+void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
+ MCSection *Section,
+ const MCExpr *SubSection,
+ raw_ostream &OS) {
+ assert(!SubSection && "SubSection is not null!");
+ const MCObjectFileInfo *FI = getStreamer().getContext().getObjectFileInfo();
+ // FIXME: remove comment once debug info is properly supported.
+ // Emit closing brace for DWARF sections only.
+ if (isDwarfSection(FI, CurSection))
+ OS << "//\t}\n";
+ if (isDwarfSection(FI, Section)) {
+ // Emit DWARF .file directives in the outermost scope.
+ for (const std::string &S : DwarfFiles)
+ getStreamer().EmitRawText(S.data());
+ DwarfFiles.clear();
+ OS << "//\t.section";
+ Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(),
+ FI->getTargetTriple(), OS, SubSection);
+ // DWARF sections are enclosed into braces - emit the open one.
+ OS << "//\t{\n";
+ }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
new file mode 100644
index 000000000000..30831ab8bbeb
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
@@ -0,0 +1,46 @@
+//=====-- NVPTXTargetStreamer.h - NVPTX Target Streamer ------*- C++ -*--=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class MCSection;
+
+/// Implements the NVPTX-specific streamer.
+class NVPTXTargetStreamer : public MCTargetStreamer {
+private:
+ SmallVector<std::string, 4> DwarfFiles;
+
+public:
+ NVPTXTargetStreamer(MCStreamer &S);
+ ~NVPTXTargetStreamer() override;
+
+ /// Record DWARF file directives for later output.
+ /// According to PTX ISA, CUDA Toolkit documentation, 11.5.3. Debugging
+ /// Directives: .file
+ /// (http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file),
+ /// The .file directive is allowed only in the outermost scope, i.e., at the
+/// same level as kernel and device function declarations. Also, the order of
+/// the .loc and .file directives does not matter; .file directives may follow
+/// the .loc directives that reference the file.
+/// LLVM emits .file directives as soon as the location debug info is emitted,
+/// i.e. they may be emitted inside functions, so we gather all these
+/// directives and emit them outside of the sections and, thus, outside of the
+/// functions.
+ void emitDwarfFileDirective(StringRef Directive) override;
+ void changeSection(const MCSection *CurSection, MCSection *Section,
+ const MCExpr *SubSection, raw_ostream &OS) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
index aba37d363591..6494c46f54ab 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
@@ -52,9 +52,8 @@ def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
"Target SM 6.2">;
def SM70 : SubtargetFeature<"sm_70", "SmVersion", "70",
"Target SM 7.0">;
-
-def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true",
- "Atomic operations with scope">;
+def SM72 : SubtargetFeature<"sm_72", "SmVersion", "72",
+ "Target SM 7.2">;
// PTX Versions
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
@@ -71,6 +70,8 @@ def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50",
"Use PTX version 5.0">;
def PTX60 : SubtargetFeature<"ptx60", "PTXVersion", "60",
"Use PTX version 6.0">;
+def PTX61 : SubtargetFeature<"ptx61", "PTXVersion", "61",
+ "Use PTX version 6.1">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -88,10 +89,11 @@ def : Proc<"sm_37", [SM37, PTX41]>;
def : Proc<"sm_50", [SM50, PTX40]>;
def : Proc<"sm_52", [SM52, PTX41]>;
def : Proc<"sm_53", [SM53, PTX42]>;
-def : Proc<"sm_60", [SM60, PTX50, SATOM]>;
-def : Proc<"sm_61", [SM61, PTX50, SATOM]>;
-def : Proc<"sm_62", [SM62, PTX50, SATOM]>;
-def : Proc<"sm_70", [SM70, PTX60, SATOM]>;
+def : Proc<"sm_60", [SM60, PTX50]>;
+def : Proc<"sm_61", [SM61, PTX50]>;
+def : Proc<"sm_62", [SM62, PTX50]>;
+def : Proc<"sm_70", [SM70, PTX60]>;
+def : Proc<"sm_72", [SM72, PTX61]>;
def NVPTXInstrInfo : InstrInfo {
}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 753cfff4cdae..a966b9928400 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -44,9 +44,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
@@ -75,16 +73,17 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <cassert>
#include <cstdint>
#include <cstring>
#include <new>
-#include <sstream>
#include <string>
#include <utility>
#include <vector>
@@ -93,16 +92,6 @@ using namespace llvm;
#define DEPOTNAME "__local_depot"
-static cl::opt<bool>
-EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
- cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
- cl::init(true));
-
-static cl::opt<bool>
-InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: Emit source line in ptx file"),
- cl::init(false));
-
/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
/// depends.
static void
@@ -151,56 +140,7 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV,
Visiting.erase(GV);
}
-void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
- if (!EmitLineNumbers)
- return;
- if (ignoreLoc(MI))
- return;
-
- const DebugLoc &curLoc = MI.getDebugLoc();
-
- if (!prevDebugLoc && !curLoc)
- return;
-
- if (prevDebugLoc == curLoc)
- return;
-
- prevDebugLoc = curLoc;
-
- if (!curLoc)
- return;
-
- auto *Scope = cast_or_null<DIScope>(curLoc.getScope());
- if (!Scope)
- return;
-
- StringRef fileName(Scope->getFilename());
- StringRef dirName(Scope->getDirectory());
- SmallString<128> FullPathName = dirName;
- if (!dirName.empty() && !sys::path::is_absolute(fileName)) {
- sys::path::append(FullPathName, fileName);
- fileName = FullPathName;
- }
-
- if (filenameMap.find(fileName) == filenameMap.end())
- return;
-
- // Emit the line from the source file.
- if (InterleaveSrc)
- this->emitSrcInText(fileName, curLoc.getLine());
-
- std::stringstream temp;
- temp << "\t.loc " << filenameMap[fileName] << " " << curLoc.getLine()
- << " " << curLoc.getCol();
- OutStreamer->EmitRawText(temp.str());
-}
-
void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- SmallString<128> Str;
- raw_svector_ostream OS(Str);
- if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == NVPTX::CUDA)
- emitLineNumberAsDotLoc(*MI);
-
MCInst Inst;
lowerToMCInst(MI, Inst);
EmitToStreamer(*OutStreamer, Inst);
@@ -505,7 +445,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
emitGlobals(*MF->getFunction().getParent());
GlobalsEmitted = true;
}
-
+
// Set up
MRI = &MF->getRegInfo();
F = &MF->getFunction();
@@ -526,14 +466,25 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
OutStreamer->EmitRawText(O.str());
- prevDebugLoc = DebugLoc();
-}
-
-void NVPTXAsmPrinter::EmitFunctionBodyStart() {
VRegMapping.clear();
+ // Emit open brace for function body.
OutStreamer->EmitRawText(StringRef("{\n"));
setAndEmitFunctionVirtualRegisters(*MF);
+}
+bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) {
+ nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
+ bool Result = AsmPrinter::runOnMachineFunction(F);
+ // Emit closing brace for the body of function F.
+ // The closing brace must be emitted here because we need to emit additional
+ // debug labels/data after the last basic block, and there is no later
+ // callback in which we could emit it once emission of the function body
+ // has finished.
+ OutStreamer->EmitRawText(StringRef("}\n"));
+ return Result;
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
SmallString<128> Str;
raw_svector_ostream O(Str);
emitDemotedVars(&MF->getFunction(), O);
@@ -541,10 +492,15 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() {
}
void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
- OutStreamer->EmitRawText(StringRef("}\n"));
VRegMapping.clear();
}
+const MCSymbol *NVPTXAsmPrinter::getFunctionFrameSymbol() const {
+ SmallString<128> Str;
+ raw_svector_ostream(Str) << DEPOTNAME << getFunctionNumber();
+ return OutContext.getOrCreateSymbol(Str);
+}
+
void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
unsigned RegNo = MI->getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(RegNo)) {
@@ -818,42 +774,6 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
}
}
-void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
- DebugInfoFinder DbgFinder;
- DbgFinder.processModule(M);
-
- unsigned i = 1;
- for (const DICompileUnit *DIUnit : DbgFinder.compile_units()) {
- StringRef Filename = DIUnit->getFilename();
- StringRef Dirname = DIUnit->getDirectory();
- SmallString<128> FullPathName = Dirname;
- if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
- sys::path::append(FullPathName, Filename);
- Filename = FullPathName;
- }
- if (filenameMap.find(Filename) != filenameMap.end())
- continue;
- filenameMap[Filename] = i;
- OutStreamer->EmitDwarfFileDirective(i, "", Filename);
- ++i;
- }
-
- for (DISubprogram *SP : DbgFinder.subprograms()) {
- StringRef Filename = SP->getFilename();
- StringRef Dirname = SP->getDirectory();
- SmallString<128> FullPathName = Dirname;
- if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
- sys::path::append(FullPathName, Filename);
- Filename = FullPathName;
- }
- if (filenameMap.find(Filename) != filenameMap.end())
- continue;
- filenameMap[Filename] = i;
- OutStreamer->EmitDwarfFileDirective(i, "", Filename);
- ++i;
- }
-}
-
static bool isEmptyXXStructor(GlobalVariable *GV) {
if (!GV) return true;
const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
@@ -889,24 +809,13 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
SmallString<128> Str1;
raw_svector_ostream OS1(Str1);
- MMI = getAnalysisIfAvailable<MachineModuleInfo>();
-
// We need to call the parent's one explicitly.
- //bool Result = AsmPrinter::doInitialization(M);
-
- // Initialize TargetLoweringObjectFile since we didn't do in
- // AsmPrinter::doInitialization either right above or where it's commented out
- // below.
- const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
- .Initialize(OutContext, TM);
+ bool Result = AsmPrinter::doInitialization(M);
// Emit header before any dwarf directives are emitted below.
emitHeader(M, OS1, STI);
OutStreamer->EmitRawText(OS1.str());
- // Already commented out
- //bool Result = AsmPrinter::doInitialization(M);
-
// Emit module-level inline asm if it exists.
if (!M.getModuleInlineAsm().empty()) {
OutStreamer->AddComment("Start of file scope inline assembly");
@@ -917,13 +826,9 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
OutStreamer->AddBlankLine();
}
- // If we're not NVCL we're CUDA, go ahead and emit filenames.
- if (TM.getTargetTriple().getOS() != Triple::NVCL)
- recordAndEmitFilenames(M);
-
GlobalsEmitted = false;
-
- return false; // success
+
+ return Result;
}
void NVPTXAsmPrinter::emitGlobals(const Module &M) {
@@ -974,13 +879,10 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
if (NTM.getDrvInterface() == NVPTX::NVCL)
O << ", texmode_independent";
- else {
- if (!STI.hasDouble())
- O << ", map_f64_to_f32";
- }
- if (MAI->doesSupportDebugInformation())
- O << ", debug";
+ // FIXME: remove comment once debug info is properly supported.
+ if (MMI && MMI->hasDebugInfo())
+ O << "//, debug";
O << "\n";
@@ -995,6 +897,8 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
}
bool NVPTXAsmPrinter::doFinalization(Module &M) {
+ bool HasDebugInfo = MMI && MMI->hasDebugInfo();
+
// If we did not emit any functions, then the global declarations have not
// yet been emitted.
if (!GlobalsEmitted) {
@@ -1029,6 +933,11 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
clearAnnotationCache(&M);
delete[] gv_array;
+ // FIXME: remove comment once debug info is properly supported.
+ // Close the last emitted section
+ if (HasDebugInfo)
+ OutStreamer->EmitRawText("//\t}");
+
return ret;
//bool Result = AsmPrinter::doFinalization(M);
@@ -1365,7 +1274,8 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
O << "shared";
break;
default:
- report_fatal_error("Bad address space found while emitting PTX");
+ report_fatal_error("Bad address space found while emitting PTX: " +
+ llvm::Twine(AddressSpace));
break;
}
}
@@ -1433,7 +1343,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
return;
}
- if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
+ if (ETy->isFloatingPointTy() || ETy->isIntOrPtrTy()) {
O << " .";
O << getPTXFundamentalTypeStr(ETy);
O << " ";
@@ -1948,11 +1858,17 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
llvm_unreachable("unsupported integer const type");
break;
}
+ case Type::HalfTyID:
case Type::FloatTyID:
case Type::DoubleTyID: {
const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
Type *Ty = CFP->getType();
- if (Ty == Type::getFloatTy(CPV->getContext())) {
+ if (Ty == Type::getHalfTy(CPV->getContext())) {
+ APInt API = CFP->getValueAPF().bitcastToAPInt();
+ uint16_t float16 = API.getLoBits(16).getZExtValue();
+ ConvertIntToBytes<>(ptr, float16);
+ aggBuffer->addBytes(ptr, 2, Bytes);
+ } else if (Ty == Type::getFloatTy(CPV->getContext())) {
float float32 = (float) CFP->getValueAPF().convertToFloat();
ConvertFloatToBytes(ptr, float32);
aggBuffer->addBytes(ptr, 4, Bytes);
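As a standalone illustration of the byte pattern the new HalfTyID case buffers, here is a small self-contained sketch (plain C++, not LLVM code; the little-endian host byte order is an assumption called out in the comments): the fp16 constant 1.0 has the binary16 bit pattern 0x3C00, and its two raw bytes are what addBytes receives.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // 1.0 in IEEE-754 binary16: sign 0, exponent 01111, mantissa 0 -> 0x3C00.
  uint16_t HalfOne = 0x3C00;
  uint8_t Bytes[2];
  std::memcpy(Bytes, &HalfOne, sizeof(Bytes)); // raw bytes, host byte order
  // On a little-endian host (the common case) this prints "00 3c", i.e. the
  // two bytes the aggregate buffer would receive for the constant 1.0.
  std::printf("%02x %02x\n", Bytes[0], Bytes[1]);
  return 0;
}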
@@ -2049,65 +1965,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
llvm_unreachable("unsupported constant type in printAggregateConstant()");
}
-// buildTypeNameMap - Run through symbol table looking for type names.
-//
-
-bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- return false;
- case NVPTX::CallArgBeginInst:
- case NVPTX::CallArgEndInst0:
- case NVPTX::CallArgEndInst1:
- case NVPTX::CallArgF32:
- case NVPTX::CallArgF64:
- case NVPTX::CallArgI16:
- case NVPTX::CallArgI32:
- case NVPTX::CallArgI32imm:
- case NVPTX::CallArgI64:
- case NVPTX::CallArgParam:
- case NVPTX::CallVoidInst:
- case NVPTX::CallVoidInstReg:
- case NVPTX::Callseq_End:
- case NVPTX::CallVoidInstReg64:
- case NVPTX::DeclareParamInst:
- case NVPTX::DeclareRetMemInst:
- case NVPTX::DeclareRetRegInst:
- case NVPTX::DeclareRetScalarInst:
- case NVPTX::DeclareScalarParamInst:
- case NVPTX::DeclareScalarRegInst:
- case NVPTX::StoreParamF32:
- case NVPTX::StoreParamF64:
- case NVPTX::StoreParamI16:
- case NVPTX::StoreParamI32:
- case NVPTX::StoreParamI64:
- case NVPTX::StoreParamI8:
- case NVPTX::StoreRetvalF32:
- case NVPTX::StoreRetvalF64:
- case NVPTX::StoreRetvalI16:
- case NVPTX::StoreRetvalI32:
- case NVPTX::StoreRetvalI64:
- case NVPTX::StoreRetvalI8:
- case NVPTX::LastCallArgF32:
- case NVPTX::LastCallArgF64:
- case NVPTX::LastCallArgI16:
- case NVPTX::LastCallArgI32:
- case NVPTX::LastCallArgI32imm:
- case NVPTX::LastCallArgI64:
- case NVPTX::LastCallArgParam:
- case NVPTX::LoadParamMemF32:
- case NVPTX::LoadParamMemF64:
- case NVPTX::LoadParamMemI16:
- case NVPTX::LoadParamMemI32:
- case NVPTX::LoadParamMemI64:
- case NVPTX::LoadParamMemI8:
- case NVPTX::PrototypeInst:
- case NVPTX::DBG_VALUE:
- return true;
- }
- return false;
-}
-
/// lowerConstantForGV - Return an MCExpr for the given Constant. This is mostly
/// a copy from AsmPrinter::lowerConstant, except customized to only handle
/// expressions that are representable in PTX and create
@@ -2408,44 +2265,6 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
}
}
-void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
- std::stringstream temp;
- LineReader *reader = this->getReader(filename);
- temp << "\n//";
- temp << filename.str();
- temp << ":";
- temp << line;
- temp << " ";
- temp << reader->readLine(line);
- temp << "\n";
- this->OutStreamer->EmitRawText(temp.str());
-}
-
-LineReader *NVPTXAsmPrinter::getReader(const std::string &filename) {
- if (!reader) {
- reader = new LineReader(filename);
- }
-
- if (reader->fileName() != filename) {
- delete reader;
- reader = new LineReader(filename);
- }
-
- return reader;
-}
-
-std::string LineReader::readLine(unsigned lineNum) {
- if (lineNum < theCurLine) {
- theCurLine = 0;
- fstr.seekg(0, std::ios::beg);
- }
- while (theCurLine < lineNum) {
- fstr.getline(buff, 500);
- theCurLine++;
- }
- return buff;
-}
-
// Force static initialization.
extern "C" void LLVMInitializeNVPTXAsmPrinter() {
RegisterAsmPrinter<NVPTXAsmPrinter> X(getTheNVPTXTarget32());
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 8ec3476b8719..3b042c74b26c 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -41,7 +41,6 @@
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
-#include <fstream>
#include <map>
#include <memory>
#include <string>
@@ -60,27 +59,6 @@ namespace llvm {
class MCOperand;
-class LineReader {
-private:
- unsigned theCurLine;
- std::ifstream fstr;
- char buff[512];
- std::string theFileName;
- SmallVector<unsigned, 32> lineOffset;
-
-public:
- LineReader(std::string filename) {
- theCurLine = 0;
- fstr.open(filename.c_str());
- theFileName = filename;
- }
-
- ~LineReader() { fstr.close(); }
-
- std::string fileName() { return theFileName; }
- std::string readLine(unsigned line);
-};
-
class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
class AggBuffer {
@@ -217,8 +195,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
friend class AggBuffer;
- void emitSrcInText(StringRef filename, unsigned line);
-
private:
StringRef getPassName() const override { return "NVPTX Assembly Printer"; }
@@ -271,8 +247,6 @@ protected:
bool doFinalization(Module &M) override;
private:
- std::string CurrentBankselLabelInBasicBlock;
-
bool GlobalsEmitted;
// This is specific per MachineFunction.
@@ -287,17 +261,9 @@ private:
// Cache the subtarget here.
const NVPTXSubtarget *nvptxSubtarget;
- // Build the map between type name and ID based on module's type
- // symbol table.
- std::map<Type *, std::string> TypeNameMap;
-
// List of variables demoted to a function scope.
std::map<const Function *, std::vector<const GlobalVariable *>> localDecls;
- // To record filename to ID mapping
- std::map<std::string, unsigned> filenameMap;
- void recordAndEmitFilenames(Module &);
-
void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const;
@@ -317,10 +283,6 @@ private:
bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
- LineReader *reader = nullptr;
-
- LineReader *getReader(const std::string &);
-
// Used to control the need to emit .generic() in the initializer of
// module scope variables.
// Although ptx supports the hybrid mode like the following,
@@ -340,26 +302,16 @@ public:
EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
NVPTX::CUDA) {}
- ~NVPTXAsmPrinter() override {
- delete reader;
- }
-
- bool runOnMachineFunction(MachineFunction &F) override {
- nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
- return AsmPrinter::runOnMachineFunction(F);
- }
+ bool runOnMachineFunction(MachineFunction &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineLoopInfo>();
AsmPrinter::getAnalysisUsage(AU);
}
- bool ignoreLoc(const MachineInstr &);
-
std::string getVirtualRegisterName(unsigned) const;
- DebugLoc prevDebugLoc;
- void emitLineNumberAsDotLoc(const MachineInstr &);
+ const MCSymbol *getFunctionFrameSymbol() const override;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
index f02c33f9249a..41e9ae827180 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -28,7 +28,7 @@
using namespace llvm;
namespace {
-/// \brief NVPTXAssignValidGlobalNames
+/// NVPTXAssignValidGlobalNames
class NVPTXAssignValidGlobalNames : public ModulePass {
public:
static char ID;
@@ -36,7 +36,7 @@ public:
bool runOnModule(Module &M) override;
- /// \brief Clean up the name to remove symbols invalid in PTX.
+ /// Clean up the name to remove symbols invalid in PTX.
std::string cleanUpName(StringRef Name);
};
}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 729f3ed7b79e..e5e6637967b2 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -64,6 +64,14 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+int NVPTXFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ FrameReg = NVPTX::VRDepot;
+ return MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+}
+
void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
index a802cf85d2e0..0a7856b9d5de 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -25,6 +25,8 @@ public:
bool hasFP(const MachineFunction &MF) const override;
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 916b0e115664..fd63fdbaced6 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -45,8 +45,6 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {}
private:
- Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
- IRBuilder<> &Builder);
Value *remapConstant(Module *M, Function *F, Constant *C,
IRBuilder<> &Builder);
Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
@@ -156,46 +154,6 @@ bool GenericToNVVM::runOnModule(Module &M) {
return true;
}
-Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
- GlobalVariable *GV,
- IRBuilder<> &Builder) {
- PointerType *GVType = GV->getType();
- Value *CVTA = nullptr;
-
- // See if the address space conversion requires the operand to be bitcast
- // to i8 addrspace(n)* first.
- EVT ExtendedGVType = EVT::getEVT(GV->getValueType(), true);
- if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
- // A bitcast to i8 addrspace(n)* on the operand is needed.
- LLVMContext &Context = M->getContext();
- unsigned int AddrSpace = GVType->getAddressSpace();
- Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
- CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
- // Insert the address space conversion.
- Type *ResultType =
- PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
- Function *CVTAFunction = Intrinsic::getDeclaration(
- M, Intrinsic::nvvm_ptr_global_to_gen, {ResultType, DestTy});
- CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
- // Another bitcast from i8 * to <the element type of GVType> * is
- // required.
- DestTy =
- PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC);
- CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
- } else {
- // A simple CVTA is enough.
- SmallVector<Type *, 2> ParamTypes;
- ParamTypes.push_back(PointerType::get(GV->getValueType(),
- llvm::ADDRESS_SPACE_GENERIC));
- ParamTypes.push_back(GVType);
- Function *CVTAFunction = Intrinsic::getDeclaration(
- M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
- CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
- }
-
- return CVTA;
-}
-
Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
IRBuilder<> &Builder) {
// If the constant C has been converted already in the given function F, just
@@ -207,17 +165,17 @@ Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
Value *NewValue = C;
if (isa<GlobalVariable>(C)) {
- // If the constant C is a global variable and is found in GVMap, generate a
- // set set of instructions that convert the clone of C with the global
- // address space specifier to a generic pointer.
- // The constant C cannot be used here, as it will be erased from the
- // module eventually. And the clone of C with the global address space
- // specifier cannot be used here either, as it will affect the types of
- // other instructions in the function. Hence, this address space conversion
- // is required.
+ // If the constant C is a global variable and is found in GVMap, substitute
+ //
+ // addrspacecast GVMap[C] to addrspace(0)
+ //
+ // for our use of C.
GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
if (I != GVMap.end()) {
- NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+ GlobalVariable *GV = I->second;
+ NewValue = Builder.CreateAddrSpaceCast(
+ GV,
+ PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC));
}
} else if (isa<ConstantAggregate>(C)) {
// If any element in the constant vector or aggregate C is or uses a global
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 57e2acc0d7e0..4dfa8477a362 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -66,6 +66,10 @@ bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
return TL->allowUnsafeFPMath(*MF);
}
+bool NVPTXDAGToDAGISel::useShortPointers() const {
+ return TM.useShortPointers();
+}
+
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {
@@ -496,325 +500,11 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
SelectCode(N);
}
-// Each instruction has four addressing variants. WMMA_VARIANTS() macro below
-// constructs an array indexed by WmmaVariant which getWmmaLdVariant() uses to
-// look up the intrinsic ID of particular variant.
-enum WmmaVariant {
- WMMA_VARIANT_ARI64,
- WMMA_VARIANT_ARI64_STRIDE,
- WMMA_VARIANT_AVAR,
- WMMA_VARIANT_AVAR_STRIDE,
-};
-
-// clang-format off
-#define WMMA_VARIANTS(base) \
- {{ base##_ari64, base##_ari64_stride, base##_avar, base##_avar_stride }}
-// clang-format on
-
-static unsigned getWmmaLdVariant(WmmaVariant Variant, bool Stride,
- const std::array<unsigned, 4> Variants) {
- if (Stride) {
- if (Variant == WMMA_VARIANT_ARI64)
- Variant = WMMA_VARIANT_ARI64_STRIDE;
- else if (Variant == WMMA_VARIANT_AVAR)
- Variant = WMMA_VARIANT_AVAR_STRIDE;
- }
- return Variants[Variant];
-}
-
-static Optional<unsigned>
-getWmmaLdStOpcode(unsigned IntrinsicID,
- WmmaVariant Variant = WMMA_VARIANT_ARI64) {
- switch (IntrinsicID) {
- default:
- return None;
- //
- // WMMA_LOAD_A f16
- //
- case Intrinsic::nvvm_wmma_load_a_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col));
- case Intrinsic::nvvm_wmma_load_a_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row));
- case Intrinsic::nvvm_wmma_load_a_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col));
- case Intrinsic::nvvm_wmma_load_a_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row));
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared));
- case Intrinsic::nvvm_wmma_load_a_f16_col_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global));
- case Intrinsic::nvvm_wmma_load_a_f16_row_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global));
- case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global));
- case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global));
-
- //
- // WMMA_LOAD_B f16
- //
- case Intrinsic::nvvm_wmma_load_b_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col));
- case Intrinsic::nvvm_wmma_load_b_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row));
- case Intrinsic::nvvm_wmma_load_b_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col));
- case Intrinsic::nvvm_wmma_load_b_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row));
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared));
- case Intrinsic::nvvm_wmma_load_b_f16_col_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global));
- case Intrinsic::nvvm_wmma_load_b_f16_row_global:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global));
- case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global));
- case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global));
-
- //
- // WMMA_LOAD_C f16
- //
- case Intrinsic::nvvm_wmma_load_c_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col));
- case Intrinsic::nvvm_wmma_load_c_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row));
- case Intrinsic::nvvm_wmma_load_c_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col));
- case Intrinsic::nvvm_wmma_load_c_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row));
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f16_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global));
- case Intrinsic::nvvm_wmma_load_c_f16_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global));
- case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global));
- case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global));
-
- //
- // WMMA_LOAD_C f32
- //
- case Intrinsic::nvvm_wmma_load_c_f32_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col));
- case Intrinsic::nvvm_wmma_load_c_f32_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row));
- case Intrinsic::nvvm_wmma_load_c_f32_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col));
- case Intrinsic::nvvm_wmma_load_c_f32_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row));
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared));
- case Intrinsic::nvvm_wmma_load_c_f32_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global));
- case Intrinsic::nvvm_wmma_load_c_f32_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global));
- case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global));
- case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global));
-
- //
- // WMMA_STORE_D f16
- //
- case Intrinsic::nvvm_wmma_store_d_f16_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col));
- case Intrinsic::nvvm_wmma_store_d_f16_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row));
- case Intrinsic::nvvm_wmma_store_d_f16_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col));
- case Intrinsic::nvvm_wmma_store_d_f16_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row));
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f16_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global));
- case Intrinsic::nvvm_wmma_store_d_f16_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global));
- case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global));
- case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global));
-
- //
- // WMMA_STORE_D f32
- //
- case Intrinsic::nvvm_wmma_store_d_f32_col:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col));
- case Intrinsic::nvvm_wmma_store_d_f32_row:
- return getWmmaLdVariant(Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row));
- case Intrinsic::nvvm_wmma_store_d_f32_col_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col));
- case Intrinsic::nvvm_wmma_store_d_f32_row_stride:
- return getWmmaLdVariant(Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row));
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared));
- case Intrinsic::nvvm_wmma_store_d_f32_col_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global));
- case Intrinsic::nvvm_wmma_store_d_f32_row_global:
- return getWmmaLdVariant(
- Variant, /*Stride=*/false,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global));
- case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global));
- case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride:
- return getWmmaLdVariant(
- Variant, /*Stride=*/true,
- WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global));
- }
-}
-#undef WMMA_VARIANTS
-
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- if (getWmmaLdStOpcode(IID))
- return tryWMMA_LDST(N);
-
switch (IID) {
default:
return false;
- case Intrinsic::nvvm_match_all_sync_i32p:
- case Intrinsic::nvvm_match_all_sync_i64p:
- SelectMatchAll(N);
- return true;
case Intrinsic::nvvm_ldg_global_f:
case Intrinsic::nvvm_ldg_global_i:
case Intrinsic::nvvm_ldg_global_p:
@@ -987,8 +677,10 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
// We have two ways of identifying invariant loads: Loads may be explicitly
// marked as invariant, or we may infer them to be invariant.
//
- // We currently infer invariance only for kernel function pointer params that
- // are noalias (i.e. __restrict) and never written to.
+ // We currently infer invariance for loads from
+ // - constant global variables, and
+ // - kernel function pointer params that are noalias (i.e. __restrict) and
+ // never written to.
//
// TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
// not during the SelectionDAG phase).
@@ -1002,23 +694,22 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
if (N->isInvariant())
return true;
- // Load wasn't explicitly invariant. Attempt to infer invariance.
- if (!isKernelFunction(F->getFunction()))
- return false;
+ bool IsKernelFn = isKernelFunction(F->getFunction());
- // We use GetUnderlyingObjects() here instead of
- // GetUnderlyingObject() mainly because the former looks through phi
- // nodes while the latter does not. We need to look through phi
- // nodes to handle pointer induction variables.
+ // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
+ // because the former looks through phi nodes while the latter does not. We
+ // need to look through phi nodes to handle pointer induction variables.
SmallVector<Value *, 8> Objs;
GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
Objs, F->getDataLayout());
- for (Value *Obj : Objs) {
- auto *A = dyn_cast<const Argument>(Obj);
- if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false;
- }
- return true;
+ return all_of(Objs, [&](Value *V) {
+ if (auto *A = dyn_cast<const Argument>(V))
+ return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
+ if (auto *GV = dyn_cast<const GlobalVariable>(V))
+ return GV->isConstant();
+ return false;
+ });
}
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
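A rough source-level picture of the two cases the rewritten check now accepts (illustrative C++ only; the names are made up, and in real CUDA the function would be a __global__ kernel rather than a plain function):

// When compiled for NVPTX, loads through 'in' (a noalias/__restrict__,
// read-only kernel parameter) and loads from 'Table' (a constant global)
// can both be inferred invariant and lowered to ld.global.nc.
constexpr float Table[4] = {0.0f, 1.0f, 2.0f, 3.0f};

void kernel(const float *__restrict__ in, float *__restrict__ out, int i) {
  out[i] = in[i] + Table[i & 3];
}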
@@ -1029,39 +720,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
case Intrinsic::nvvm_texsurf_handle_internal:
SelectTexSurfHandle(N);
return true;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32:
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite:
- return tryWMMA_MMA(N);
}
}
@@ -1073,42 +731,11 @@ void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
MVT::i64, GlobalVal));
}
-void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) {
- SDLoc DL(N);
- enum { IS_I64 = 4, HAS_CONST_VALUE = 2, HAS_CONST_MASK = 1 };
- unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- unsigned OpcodeIndex =
- (IID == Intrinsic::nvvm_match_all_sync_i64p) ? IS_I64 : 0;
- SDValue MaskOp = N->getOperand(2);
- SDValue ValueOp = N->getOperand(3);
- if (ConstantSDNode *ValueConst = dyn_cast<ConstantSDNode>(ValueOp)) {
- OpcodeIndex |= HAS_CONST_VALUE;
- ValueOp = CurDAG->getTargetConstant(ValueConst->getZExtValue(), DL,
- ValueConst->getValueType(0));
- }
- if (ConstantSDNode *MaskConst = dyn_cast<ConstantSDNode>(MaskOp)) {
- OpcodeIndex |= HAS_CONST_MASK;
- MaskOp = CurDAG->getTargetConstant(MaskConst->getZExtValue(), DL,
- MaskConst->getValueType(0));
- }
- // Maps {IS_I64, HAS_CONST_VALUE, HAS_CONST_MASK} -> opcode
- unsigned Opcodes[8] = {
- NVPTX::MATCH_ALLP_SYNC_32rr, NVPTX::MATCH_ALLP_SYNC_32ri,
- NVPTX::MATCH_ALLP_SYNC_32ir, NVPTX::MATCH_ALLP_SYNC_32ii,
- NVPTX::MATCH_ALLP_SYNC_64rr, NVPTX::MATCH_ALLP_SYNC_64ri,
- NVPTX::MATCH_ALLP_SYNC_64ir, NVPTX::MATCH_ALLP_SYNC_64ii};
- SDNode *NewNode = CurDAG->getMachineNode(
- Opcodes[OpcodeIndex], DL, {ValueOp->getValueType(0), MVT::i1, MVT::Other},
- {MaskOp, ValueOp});
- ReplaceNode(N, NewNode);
-}
-
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
SDValue Src = N->getOperand(0);
AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
unsigned DstAddrSpace = CastN->getDestAddressSpace();
-
assert(SrcAddrSpace != DstAddrSpace &&
"addrspacecast must be between different address spaces");
@@ -1121,13 +748,19 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
break;
case ADDRESS_SPACE_SHARED:
- Opc = TM.is64Bit() ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
+ : NVPTX::cvta_shared_yes_64)
+ : NVPTX::cvta_shared_yes;
break;
case ADDRESS_SPACE_CONST:
- Opc = TM.is64Bit() ? NVPTX::cvta_const_yes_64 : NVPTX::cvta_const_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
+ : NVPTX::cvta_const_yes_64)
+ : NVPTX::cvta_const_yes;
break;
case ADDRESS_SPACE_LOCAL:
- Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
+ : NVPTX::cvta_local_yes_64)
+ : NVPTX::cvta_local_yes;
break;
}
ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
@@ -1145,16 +778,19 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
: NVPTX::cvta_to_global_yes;
break;
case ADDRESS_SPACE_SHARED:
- Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_yes_64
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
+ : NVPTX::cvta_to_shared_yes_64)
: NVPTX::cvta_to_shared_yes;
break;
case ADDRESS_SPACE_CONST:
- Opc =
- TM.is64Bit() ? NVPTX::cvta_to_const_yes_64 : NVPTX::cvta_to_const_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
+ : NVPTX::cvta_to_const_yes_64)
+ : NVPTX::cvta_to_const_yes;
break;
case ADDRESS_SPACE_LOCAL:
- Opc =
- TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
+ Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
+ : NVPTX::cvta_to_local_yes_64)
+ : NVPTX::cvta_to_local_yes;
break;
case ADDRESS_SPACE_PARAM:
Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
@@ -1210,18 +846,20 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
return false;
// Address Space Setting
- unsigned int codeAddrSpace = getCodeAddrSpace(LD);
-
- if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
+ unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
+ if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
+
// Volatile Setting
// - .volatile is only available for .global and .shared
bool isVolatile = LD->isVolatile();
- if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
isVolatile = false;
// Type Setting: fromType + fromTypeWidth
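
PointerSize above is what makes the addressing-mode choice correct under NVPTX "short pointers", where a 64-bit target still uses 32-bit pointers for the shared, const and local address spaces. A minimal sketch of that width selection, assuming the usual NVPTX address-space numbering (in the patch the value actually comes from the module's DataLayout):

// Assumed NVPTX address-space numbers (generic = 0, global = 1, shared = 3,
// const = 4, local = 5).
enum AddressSpace : unsigned {
  ADDRESS_SPACE_GENERIC = 0,
  ADDRESS_SPACE_GLOBAL = 1,
  ADDRESS_SPACE_SHARED = 3,
  ADDRESS_SPACE_CONST = 4,
  ADDRESS_SPACE_LOCAL = 5
};

// With short pointers enabled on a 64-bit target, shared/const/local
// pointers stay 32 bits wide, so addressing-mode selection must key off
// this per-address-space width rather than off TM.is64Bit().
unsigned pointerSizeInBits(unsigned AS, bool Is64Bit, bool UseShortPointers) {
  if (!Is64Bit)
    return 32;
  if (UseShortPointers && (AS == ADDRESS_SPACE_SHARED ||
                           AS == ADDRESS_SPACE_CONST ||
                           AS == ADDRESS_SPACE_LOCAL))
    return 32;
  return 64;
}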
@@ -1268,27 +906,27 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Addr, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
- : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+ } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+ : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
- : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
- if (TM.is64Bit())
+ } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+ : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+ if (PointerSize == 64)
Opcode = pickOpcodeForVT(
TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
@@ -1300,13 +938,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
MVT::Other, Ops);
} else {
- if (TM.is64Bit())
+ if (PointerSize == 64)
Opcode = pickOpcodeForVT(
TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
@@ -1319,7 +957,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), N1, Chain };
NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
@@ -1353,11 +991,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
-
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
+
// Volatile Setting
// - .volatile is only available for .global and .shared
bool IsVolatile = MemSD->isVolatile();
@@ -1440,8 +1080,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Addr, Chain };
LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
- } else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
- : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+ } else if (PointerSize == 64
+ ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
@@ -1466,9 +1107,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
- } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
- : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
- if (TM.is64Bit()) {
+ } else if (PointerSize == 64
+ ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -1516,7 +1158,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else {
- if (TM.is64Bit()) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -1615,6 +1257,12 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
if (EltVT.isVector()) {
NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
+ // vectors of f16 are loaded/stored as multiples of v2f16 elements.
+ if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
+ assert(NumElts % 2 == 0 && "Vector must have even number of elements");
+ EltVT = MVT::v2f16;
+ NumElts /= 2;
+ }
}
// Build the "promoted" result VTList for the load. If we are really loading
@@ -1632,6 +1280,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
switch (N->getOpcode()) {
default:
return false;
+ case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG)
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
@@ -1654,6 +1303,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
break;
+ case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
@@ -1676,6 +1326,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
break;
+ case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
@@ -2052,14 +1703,16 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
// Address Space Setting
- unsigned int codeAddrSpace = getCodeAddrSpace(ST);
+ unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
// Volatile Setting
// - .volatile is only available for .global and .shared
bool isVolatile = ST->isVolatile();
- if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
isVolatile = false;
// Vector Setting
@@ -2102,12 +1755,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
- : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
@@ -2115,13 +1768,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
- } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
- : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
- if (TM.is64Bit())
+ } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (PointerSize == 64)
Opcode = pickOpcodeForVT(
SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
@@ -2135,12 +1788,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else {
- if (TM.is64Bit())
+ if (PointerSize == 64)
Opcode =
pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
@@ -2154,7 +1807,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+ getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
Chain };
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
@@ -2183,11 +1836,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
// Address Space Setting
unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
-
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
report_fatal_error("Cannot store to pointer that points to constant "
"memory space");
}
+ unsigned int PointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Volatile Setting
// - .volatile is only available for .global and .shared
@@ -2268,8 +1922,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
StOps.push_back(Addr);
- } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
- : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
@@ -2290,9 +1944,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
}
StOps.push_back(Base);
StOps.push_back(Offset);
- } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
- : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
- if (TM.is64Bit()) {
+ } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -2335,7 +1989,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
StOps.push_back(Base);
StOps.push_back(Offset);
} else {
- if (TM.is64Bit()) {
+ if (PointerSize == 64) {
switch (N->getOpcode()) {
default:
return false;
@@ -4068,172 +3722,3 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
}
}
}
-
-bool NVPTXDAGToDAGISel::tryWMMA_LDST(SDNode *N) {
- SDValue Chain = N->getOperand(0);
- unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- SDValue Op1 = N->getOperand(2);
- SDValue Addr, Offset, Base;
- Optional<unsigned> Opcode;
- SDLoc DL(N);
- MemSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
- WmmaVariant Variant;
- SmallVector<SDValue, 12> Ops;
- bool isStore = N->getNumValues() == 1; // Store ops only return a chain.
-
- if (SelectDirectAddr(Op1, Addr)) {
- Variant = WMMA_VARIANT_AVAR;
- Ops.push_back(Addr);
- } else if (SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) ||
- SelectADDRri64(Op1.getNode(), Op1, Base, Offset)) {
- Variant = WMMA_VARIANT_ARI64;
- Ops.push_back(Base);
- Ops.push_back(Offset);
- } else {
- Variant = WMMA_VARIANT_AVAR;
- Ops.push_back(Op1);
- }
- unsigned NumOps = N->getNumOperands();
- // Pass through the rest of the operands to the machine node.
- for (unsigned i = 3; i < NumOps; ++i)
- Ops.push_back(N->getOperand(i));
- Ops.push_back(Chain);
-
- Opcode = getWmmaLdStOpcode(IID, Variant);
- if (!Opcode) {
- llvm::errs() << "tryWMMALD - no Opcode.\n";
- return false;
- }
-
- EVT MemVT = MemSD->getMemoryVT();
- assert(MemVT.isVector() && "Expected vector return type.");
-
- SDNode *MN;
- if (isStore) {
- MN = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
- } else {
- SmallVector<EVT, 9> InstVTs(MemVT.getVectorNumElements(),
- MemSD->getValueType(0));
- InstVTs.push_back(MVT::Other);
- MN = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTs, Ops);
- }
-
- ReplaceNode(N, MN);
- return true;
-}
-
-bool NVPTXDAGToDAGISel::tryWMMA_MMA(SDNode *N) {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- SDLoc DL(N);
- unsigned Opc;
-
- switch (IID) {
- default:
- return false;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16_satfinite;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32;
- break;
- case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite:
- Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32_satfinite;
- break;
- }
-
- SmallVector<SDValue, 24> Ops;
- // Pass through operands and return value types to the machine node.
- for (unsigned i = 1; i < N->getNumOperands(); ++i)
- Ops.push_back(N->getOperand(i));
- SmallVector<EVT, 8> InstVTs(N->getNumValues(), N->getValueType(0));
- SDNode *MN = CurDAG->getMachineNode(Opc, DL, InstVTs, Ops);
- ReplaceNode(N, MN);
- return true;
-}
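
The removed SelectMatchAll shows a common opcode-selection idiom: each operand property contributes one bit, and the OR of the bits indexes a table laid out in the same order (match.all.sync selection is now handled by the TableGen patterns changed in NVPTXIntrinsics.td further down). A standalone sketch of the idiom, with hypothetical opcode names standing in for the MATCH_ALLP_SYNC_* instructions:

// Hypothetical opcode identifiers; suffix order mirrors the removed table.
enum MatchAllOpcode {
  MATCH_32rr, MATCH_32ri, MATCH_32ir, MATCH_32ii,
  MATCH_64rr, MATCH_64ri, MATCH_64ir, MATCH_64ii
};

// One bit per property; OR-ing them yields an index into a table whose
// entries are arranged in exactly that bit order.
enum { HAS_CONST_MASK = 1, HAS_CONST_VALUE = 2, IS_I64 = 4 };

static MatchAllOpcode pickMatchAllOpcode(bool IsI64, bool ConstValue,
                                         bool ConstMask) {
  static const MatchAllOpcode Table[8] = {
      MATCH_32rr, MATCH_32ri, MATCH_32ir, MATCH_32ii,
      MATCH_64rr, MATCH_64ri, MATCH_64ir, MATCH_64ii};
  unsigned Index = (IsI64 ? IS_I64 : 0) |
                   (ConstValue ? HAS_CONST_VALUE : 0) |
                   (ConstMask ? HAS_CONST_MASK : 0);
  return Table[Index];
}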
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index b23c27581a17..e911ba0c167d 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -35,6 +35,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool useF32FTZ() const;
bool allowFMA() const;
bool allowUnsafeFPMath() const;
+ bool useShortPointers() const;
public:
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
@@ -58,7 +59,6 @@ private:
bool tryIntrinsicNoChain(SDNode *N);
bool tryIntrinsicChain(SDNode *N);
void SelectTexSurfHandle(SDNode *N);
- void SelectMatchAll(SDNode *N);
bool tryLoad(SDNode *N);
bool tryLoadVector(SDNode *N);
bool tryLDGLDU(SDNode *N);
@@ -74,8 +74,6 @@ private:
bool tryConstantFP16(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
- bool tryWMMA_LDST(SDNode *N);
- bool tryWMMA_MMA(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
@@ -90,7 +88,6 @@ private:
SDValue &Offset);
bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
-
bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset, MVT mvt);
bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f1e4251a44b5..2536623fb853 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -15,7 +15,6 @@
#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
-#include "NVPTXSection.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
@@ -26,7 +25,6 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -49,6 +47,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -376,29 +375,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
// Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::BR_CC, MVT::f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f64, Expand);
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::BR_CC, MVT::i8, Expand);
- setOperationAction(ISD::BR_CC, MVT::i16, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
+ MVT::i16, MVT::i32, MVT::i64}) {
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ }
+
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
// For others we will expand to a SHL/SRA pair.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
@@ -417,20 +406,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
- if (STI.hasROT64()) {
- setOperationAction(ISD::ROTL, MVT::i64, Legal);
- setOperationAction(ISD::ROTR, MVT::i64, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
- }
- if (STI.hasROT32()) {
- setOperationAction(ISD::ROTL, MVT::i32, Legal);
- setOperationAction(ISD::ROTR, MVT::i32, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
- setOperationAction(ISD::ROTR, MVT::i32, Expand);
- }
+ // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
+ // that don't have h/w rotation we lower them to multi-instruction assembly.
+ // See ROT*_sw in NVPTXIntrInfo.td
+ setOperationAction(ISD::ROTL, MVT::i64, Legal);
+ setOperationAction(ISD::ROTR, MVT::i64, Legal);
+ setOperationAction(ISD::ROTL, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, MVT::i32, Legal);
setOperationAction(ISD::ROTL, MVT::i16, Expand);
setOperationAction(ISD::ROTR, MVT::i16, Expand);
@@ -486,9 +468,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// TRAP can be lowered to PTX trap
setOperationAction(ISD::TRAP, MVT::Other, Legal);
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
-
// Register custom handling for vector loads/stores
for (MVT VT : MVT::vector_valuetypes()) {
if (IsPTXVectorType(VT)) {
@@ -1251,9 +1230,9 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+ const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
+ Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}
@@ -3330,30 +3309,30 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
// Our result depends on both our and other thread's arguments.
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
- case Intrinsic::nvvm_wmma_load_a_f16_col:
- case Intrinsic::nvvm_wmma_load_a_f16_row:
- case Intrinsic::nvvm_wmma_load_a_f16_col_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_row_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared:
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared:
- case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_col_global:
- case Intrinsic::nvvm_wmma_load_a_f16_row_global:
- case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_col:
- case Intrinsic::nvvm_wmma_load_b_f16_row:
- case Intrinsic::nvvm_wmma_load_b_f16_col_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_row_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared:
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared:
- case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_col_global:
- case Intrinsic::nvvm_wmma_load_b_f16_row_global:
- case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8f16;
Info.ptrVal = I.getArgOperand(0);
@@ -3363,18 +3342,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_load_c_f16_col:
- case Intrinsic::nvvm_wmma_load_c_f16_row:
- case Intrinsic::nvvm_wmma_load_c_f16_col_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_row_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared:
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared:
- case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_col_global:
- case Intrinsic::nvvm_wmma_load_c_f16_row_global:
- case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v4f16;
Info.ptrVal = I.getArgOperand(0);
@@ -3384,18 +3363,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_load_c_f32_col:
- case Intrinsic::nvvm_wmma_load_c_f32_row:
- case Intrinsic::nvvm_wmma_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_row_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared:
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared:
- case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_col_global:
- case Intrinsic::nvvm_wmma_load_c_f32_row_global:
- case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride:
- case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
@@ -3405,19 +3384,19 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_store_d_f16_col:
- case Intrinsic::nvvm_wmma_store_d_f16_row:
- case Intrinsic::nvvm_wmma_store_d_f16_col_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_row_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared:
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared:
- case Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_col_global:
- case Intrinsic::nvvm_wmma_store_d_f16_row_global:
- case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride:
- case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v4f16;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
@@ -3426,19 +3405,19 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_wmma_store_d_f32_col:
- case Intrinsic::nvvm_wmma_store_d_f32_row:
- case Intrinsic::nvvm_wmma_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_row_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared:
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared:
- case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_col_global:
- case Intrinsic::nvvm_wmma_store_d_f32_row_global:
- case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride:
- case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
@@ -4756,31 +4735,8 @@ void NVPTXTargetLowering::ReplaceNodeResults(
}
}
-// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
-void NVPTXSection::anchor() {}
-
-NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
- delete static_cast<NVPTXSection *>(TextSection);
- delete static_cast<NVPTXSection *>(DataSection);
- delete static_cast<NVPTXSection *>(BSSSection);
- delete static_cast<NVPTXSection *>(ReadOnlySection);
-
- delete static_cast<NVPTXSection *>(StaticCtorSection);
- delete static_cast<NVPTXSection *>(StaticDtorSection);
- delete static_cast<NVPTXSection *>(LSDASection);
- delete static_cast<NVPTXSection *>(EHFrameSection);
- delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
- delete static_cast<NVPTXSection *>(DwarfInfoSection);
- delete static_cast<NVPTXSection *>(DwarfLineSection);
- delete static_cast<NVPTXSection *>(DwarfFrameSection);
- delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
- delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
- delete static_cast<NVPTXSection *>(DwarfStrSection);
- delete static_cast<NVPTXSection *>(DwarfLocSection);
- delete static_cast<NVPTXSection *>(DwarfARangesSection);
- delete static_cast<NVPTXSection *>(DwarfRangesSection);
- delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
-}
+// Pin NVPTXTargetObjectFile's vtables to this file.
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
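
In the getTgtMemIntrinsic hunks above, the renamed WMMA load intrinsics stay INTRINSIC_W_CHAIN (they produce a fragment plus a chain), while the store_d intrinsics switch to INTRINSIC_VOID, since a store produces nothing but a chain. A small sketch of that distinction with simplified stand-ins for the opcode kinds and memory-operand flags; the fragment types come from the patch (v8f16 for the a/b fragments, v4f16 or v8f32 for c/d):

// Simplified stand-ins for the SelectionDAG intrinsic kinds and
// MachineMemOperand flags used when describing a memory intrinsic.
enum class IntrinsicKind { W_CHAIN, VOID };
enum MemFlags : unsigned { MOLoad = 1u << 0, MOStore = 1u << 1 };

struct MemIntrinsicInfo {
  IntrinsicKind Opc;
  const char *MemVT; // "v8f16" for a/b fragments, "v4f16"/"v8f32" for c/d
  unsigned Flags;
};

// A WMMA load returns the fragment plus a chain, so it is a chained
// intrinsic that reads memory; a WMMA store returns only a chain, so it is
// a void intrinsic that writes memory.
static MemIntrinsicInfo describeWmmaFragment(bool IsStore, const char *MemVT) {
  if (IsStore)
    return {IntrinsicKind::VOID, MemVT, MOStore};
  return {IntrinsicKind::W_CHAIN, MemVT, MOLoad};
}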
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index da563f0531d4..50815bff6c67 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -70,51 +70,6 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
}
-bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DestReg) const {
- // Look for the appropriate part of TSFlags
- bool isMove = false;
-
- unsigned TSFlags =
- (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> NVPTX::SimpleMoveShift;
- isMove = (TSFlags == 1);
-
- if (isMove) {
- MachineOperand dest = MI.getOperand(0);
- MachineOperand src = MI.getOperand(1);
- assert(dest.isReg() && "dest of a movrr is not a reg");
- assert(src.isReg() && "src of a movrr is not a reg");
-
- SrcReg = src.getReg();
- DestReg = dest.getReg();
- return true;
- }
-
- return false;
-}
-
-bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
- unsigned &AddrSpace) const {
- bool isLoad = false;
- unsigned TSFlags =
- (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> NVPTX::isLoadShift;
- isLoad = (TSFlags == 1);
- if (isLoad)
- AddrSpace = getLdStCodeAddrSpace(MI);
- return isLoad;
-}
-
-bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
- unsigned &AddrSpace) const {
- bool isStore = false;
- unsigned TSFlags =
- (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> NVPTX::isStoreShift;
- isStore = (TSFlags == 1);
- if (isStore)
- AddrSpace = getLdStCodeAddrSpace(MI);
- return isStore;
-}
-
/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target). Upon success, this returns false and returns
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index 18ba7684ae51..4ab1bb481958 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -52,10 +52,6 @@ public:
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
- virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DestReg) const;
- bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
- bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
// Branch analysis.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -68,10 +64,6 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
- unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
- return MI.getOperand(2).getImm();
- }
-
};
} // namespace llvm
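
The isMoveInstr/isLoadInstr/isStoreInstr helpers removed from NVPTXInstrInfo decoded per-instruction properties packed into TSFlags with a mask and a shift. A generic sketch of that decoding, using a hypothetical field layout (the real masks and shifts live in the NVPTX target headers and differ from these values):

#include <cstdint>

// Hypothetical single-bit fields packed into a TSFlags-style word.
enum : uint64_t {
  SimpleMoveShift = 0, SimpleMoveMask = 0x1ULL << SimpleMoveShift,
  IsLoadShift     = 1, IsLoadMask     = 0x1ULL << IsLoadShift,
  IsStoreShift    = 2, IsStoreMask    = 0x1ULL << IsStoreShift
};

// Extract the field with the mask, shift it down, and compare against the
// value that means "yes".
static bool testTSFlag(uint64_t TSFlags, uint64_t Mask, unsigned Shift) {
  return ((TSFlags & Mask) >> Shift) == 1;
}

static bool isSimpleMove(uint64_t TSFlags) {
  return testTSFlag(TSFlags, SimpleMoveMask, SimpleMoveShift);
}
static bool isLoad(uint64_t TSFlags) {
  return testTSFlag(TSFlags, IsLoadMask, IsLoadShift);
}
static bool isStore(uint64_t TSFlags) {
  return testTSFlag(TSFlags, IsStoreMask, IsStoreShift);
}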
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 92152a64e525..443b077184c7 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -111,28 +111,14 @@ def VecElement : Operand<i32> {
//===----------------------------------------------------------------------===//
-def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
-def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
-def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
-def useAtomRedG32forGen32 :
- Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
-def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
-def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
-def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
-def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
-def useAtomRedG64forGen64 :
- Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
-def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
-def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
-def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
@@ -156,10 +142,12 @@ def true : Predicate<"true">;
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
+def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
+def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
//===----------------------------------------------------------------------===//
@@ -961,13 +949,12 @@ def FDIV321r_prec_ftz :
(ins f32imm:$a, Float32Regs:$b),
"rcp.rn.ftz.f32 \t$dst, $b;",
[(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20, doF32FTZ]>;
+ Requires<[doF32FTZ]>;
def FDIV321r_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.rn.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Accurate division
//
@@ -976,25 +963,23 @@ def FDIV32rr_prec_ftz :
(ins Float32Regs:$a, Float32Regs:$b),
"div.rn.ftz.f32 \t$dst, $a, $b;",
[(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
+ Requires<[doF32FTZ]>;
def FDIV32ri_prec_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.rn.ftz.f32 \t$dst, $a, $b;",
[(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
+ Requires<[doF32FTZ]>;
def FDIV32rr_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[reqPTX20]>;
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
//
// FMA
@@ -1544,6 +1529,7 @@ def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
[SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
[SDNPWantRoot]>;
+def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
def MEMri : Operand<i32> {
let PrintMethod = "printMemOperand";
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c932758bd0ae..47dcdcf6e0bd 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -277,26 +277,22 @@ multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic Int
def ii : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins i32imm:$mask, ImmOp:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- // If would be nice if tablegen could match multiple return values,
- // but it does not seem to be the case. Thus we have an empty pattern and
- // lower intrinsic to instruction manually.
- // [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$value, imm:$mask))]>,
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, imm:$value))]>,
Requires<[hasPTX60, hasSM70]>;
def ir : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins Int32Regs:$mask, ImmOp:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, imm:$value))]>,
Requires<[hasPTX60, hasSM70]>;
def ri : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins i32imm:$mask, regclass:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, regclass:$value))]>,
Requires<[hasPTX60, hasSM70]>;
def rr : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred),
(ins Int32Regs:$mask, regclass:$value),
"match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
- []>,
+ [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, regclass:$value))]>,
Requires<[hasPTX60, hasSM70]>;
}
defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_sync_i32p,
@@ -1025,18 +1021,19 @@ class ATOMIC_GENERIC_CHK <dag ops, dag frag>
multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, SDNode IMM, Predicate Pred> {
+ Operand IMMType, SDNode IMM, list<Predicate> Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;", ""),
[(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
}
multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
- string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> {
+ string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM,
+ list<Predicate> Pred = []> {
defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
IntOp, IMMType, IMM, Pred>;
defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
@@ -1046,7 +1043,7 @@ multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
// has 2 operands, neg the second one
multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, Predicate Pred> {
+ Operand IMMType, list<Predicate> Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
!strconcat(
"{{ \n\t",
@@ -1055,11 +1052,11 @@ multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
"atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t",
"}}"),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
}
multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
- Predicate Pred> {
+ list<Predicate> Pred = []> {
defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
IntOp, IMMType, Pred> ;
defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
@@ -1069,33 +1066,33 @@ multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
// has 3 operands
multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, Predicate Pred> {
+ Operand IMMType, list<Predicate> Pred> {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
[(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
!strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
- Requires<[Pred]>;
+ Requires<Pred>;
}
multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
- string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> {
+ string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
IntOp, IMMType, Pred>;
defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
@@ -1130,36 +1127,36 @@ def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
- atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_add_32_g, i32imm, imm>;
defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add",
- atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_add_32_s, i32imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add",
- atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_add_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".add", atomic_load_add_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add",
- atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_add_64_g, i64imm, imm>;
defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add",
- atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_add_64_s, i64imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add",
- atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_add_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
- ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".add", atomic_load_add_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
- atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>;
+ atomic_load_add_f32_g, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
- atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>;
+ atomic_load_add_f32_s, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
- atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+ atomic_load_add_f32_gen, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add",
- atomic_load_add_f64_g, f64imm, fpimm, hasAtomAddF64>;
+ atomic_load_add_f64_g, f64imm, fpimm, [hasAtomAddF64]>;
defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add",
- atomic_load_add_f64_s, f64imm, fpimm, hasAtomAddF64>;
+ atomic_load_add_f64_s, f64imm, fpimm, [hasAtomAddF64]>;
defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add",
- atomic_load_add_f64_gen, f64imm, fpimm, hasAtomAddF64>;
+ atomic_load_add_f64_gen, f64imm, fpimm, [hasAtomAddF64]>;
// atom_sub
@@ -1177,21 +1174,21 @@ def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_sub_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
- atomic_load_sub_32_g, i32imm, hasAtomRedG32>;
+ atomic_load_sub_32_g, i32imm>;
defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
- atomic_load_sub_64_g, i64imm, hasAtomRedG64>;
+ atomic_load_sub_64_g, i64imm>;
defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
- atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>;
+ atomic_load_sub_32_gen, i32imm>;
defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
- ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>;
+ ".add", atomic_load_sub_32_gen, i32imm>;
defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
- atomic_load_sub_32_s, i32imm, hasAtomRedS32>;
+ atomic_load_sub_32_s, i32imm>;
defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
- atomic_load_sub_64_s, i64imm, hasAtomRedS64>;
+ atomic_load_sub_64_s, i64imm>;
defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
- atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>;
+ atomic_load_sub_64_gen, i64imm>;
defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
- ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>;
+ ".add", atomic_load_sub_64_gen, i64imm>;
// atom_swap
@@ -1209,21 +1206,21 @@ def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_swap_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch",
- atomic_swap_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_swap_32_g, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch",
- atomic_swap_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_swap_32_s, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch",
- atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_swap_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".exch", atomic_swap_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch",
- atomic_swap_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_swap_64_g, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch",
- atomic_swap_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_swap_64_s, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch",
- atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_swap_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".exch", atomic_swap_64_gen, i64imm, imm>;
// atom_max
@@ -1253,37 +1250,37 @@ def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umax_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
- ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
+ ".max", atomic_load_max_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
- ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>;
+ ".max", atomic_load_max_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
- atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_max_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".s32", ".max", atomic_load_max_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
- ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>;
+ ".max", atomic_load_max_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
- ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>;
+ ".max", atomic_load_max_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max",
- atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_max_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".s64", ".max", atomic_load_max_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
+ ".max", atomic_load_umax_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
- ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>;
+ ".max", atomic_load_umax_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
- atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_umax_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
- ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>;
+ ".max", atomic_load_umax_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
- ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>;
+ ".max", atomic_load_umax_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max",
- atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_umax_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm>;
// atom_min
@@ -1313,37 +1310,37 @@ def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umin_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
- ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
+ ".min", atomic_load_min_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
- ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>;
+ ".min", atomic_load_min_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
- atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_min_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".s32", ".min", atomic_load_min_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
- ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>;
+ ".min", atomic_load_min_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
- ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>;
+ ".min", atomic_load_min_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min",
- atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_min_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".s64", ".min", atomic_load_min_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
+ ".min", atomic_load_umin_32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
- ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>;
+ ".min", atomic_load_umin_32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
- atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_umin_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
- ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
- ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>;
+ ".min", atomic_load_umin_64_g, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
- ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>;
+ ".min", atomic_load_umin_64_s, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min",
- atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_umin_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
- ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm>;
// atom_inc atom_dec
@@ -1361,21 +1358,21 @@ def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc",
- atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_inc_32_g, i32imm, imm>;
defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc",
- atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_inc_32_s, i32imm, imm>;
defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc",
- atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_inc_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".inc", atomic_load_inc_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec",
- atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_dec_32_g, i32imm, imm>;
defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec",
- atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_dec_32_s, i32imm, imm>;
defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec",
- atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_dec_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
- ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".dec", atomic_load_dec_32_gen, i32imm, imm>;
// atom_and
@@ -1393,21 +1390,21 @@ def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_and_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
- atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_and_32_g, i32imm, imm>;
defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and",
- atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_and_32_s, i32imm, imm>;
defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
- atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_and_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".and", atomic_load_and_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and",
- atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_and_64_g, i64imm, imm>;
defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and",
- atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_and_64_s, i64imm, imm>;
defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and",
- atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_and_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".and", atomic_load_and_64_gen, i64imm, imm>;
// atom_or
@@ -1425,21 +1422,21 @@ def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_or_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
- atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_or_32_g, i32imm, imm>;
defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or",
- atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_or_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".or", atomic_load_or_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
- atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_or_32_s, i32imm, imm>;
defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or",
- atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_or_64_g, i64imm, imm>;
defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or",
- atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_or_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".or", atomic_load_or_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or",
- atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_or_64_s, i64imm, imm>;
// atom_xor
@@ -1457,21 +1454,21 @@ def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_xor_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
- atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
+ atomic_load_xor_32_g, i32imm, imm>;
defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor",
- atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>;
+ atomic_load_xor_32_s, i32imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
- atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
+ atomic_load_xor_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
- ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+ ".xor", atomic_load_xor_32_gen, i32imm, imm>;
defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor",
- atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>;
+ atomic_load_xor_64_g, i64imm, imm>;
defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor",
- atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>;
+ atomic_load_xor_64_s, i64imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor",
- atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>;
+ atomic_load_xor_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
- ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+ ".xor", atomic_load_xor_64_gen, i64imm, imm>;
// atom_cas
@@ -1489,21 +1486,21 @@ def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas",
- atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>;
+ atomic_cmp_swap_32_g, i32imm>;
defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas",
- atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>;
+ atomic_cmp_swap_32_s, i32imm>;
defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas",
- atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>;
+ atomic_cmp_swap_32_gen, i32imm>;
defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32",
- ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>;
+ ".cas", atomic_cmp_swap_32_gen, i32imm>;
defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas",
- atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>;
+ atomic_cmp_swap_64_g, i64imm>;
defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas",
- atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>;
+ atomic_cmp_swap_64_s, i64imm>;
defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
- atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>;
+ atomic_cmp_swap_64_gen, i64imm>;
defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
- ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
+ ".cas", atomic_cmp_swap_64_gen, i64imm>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -1654,7 +1651,7 @@ multiclass ATOM2_add_impl<string OpStr> {
defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, []>;
defm _f32 : ATOM2S_impl<OpStr, "f", "f32", Float32Regs, f32imm, fpimm, f32,
- [hasAtomAddF32]>;
+ []>;
defm _f64 : ATOM2S_impl<OpStr, "f", "f64", Float64Regs, f64imm, fpimm, f64,
[hasAtomAddF64]>;
}
@@ -1936,56 +1933,31 @@ defm INT_PTX_LDG_G_v4f32_ELE
multiclass NG_TO_G<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
!strconcat("cvta.", Str, ".u32 \t$result, $src;"),
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
!strconcat("cvta.", Str, ".u64 \t$result, $src;"),
- [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
-
-// @TODO: Are these actually needed? I believe global addresses will be copied
-// to register values anyway.
- /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
- !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
- [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasGenericLdSt]>;
- def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
- !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
- [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasGenericLdSt]>;*/
-
- def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
- def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
-
-// @TODO: Are these actually needed? I believe global addresses will be copied
-// to register values anyway.
- /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
- def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+ def _yes_6432 : NVPTXInst<(outs Int64Regs:$result), (ins Int32Regs:$src),
+ "{{ .reg .b64 %tmp;\n\t"
+ #" cvt.u64.u32 \t%tmp, $src;\n\t"
+ #" cvta." # Str # ".u64 \t$result, %tmp; }}",
+ [(set Int64Regs:$result, (Intrin Int32Regs:$src))]>,
+ Requires<[useShortPtr]>;
}
multiclass G_TO_NG<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
!strconcat("cvta.to.", Str, ".u32 \t$result, $src;"),
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
!strconcat("cvta.to.", Str, ".u64 \t$result, $src;"),
- [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
- Requires<[hasGenericLdSt]>;
- def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
- def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+ def _yes_3264 : NVPTXInst<(outs Int32Regs:$result), (ins Int64Regs:$src),
+ "{{ .reg .b64 %tmp;\n\t"
+ #" cvta.to." # Str # ".u64 \t%tmp, $src;\n\t"
+ #" cvt.u32.u64 \t$result, %tmp; }}",
+ [(set Int32Regs:$result, (Intrin Int64Regs:$src))]>,
+ Requires<[useShortPtr]>;
}
defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
@@ -7412,204 +7384,380 @@ def INT_PTX_SREG_WARPSIZE :
//
// wmma.load.[a|b|c].sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
-class WMMA_LOAD_ALSTOS<string Abc, string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand SrcOp, int WithOffset, int WithStride>
- : NVPTXInst<!if(!eq(Abc#Type,"cf16"),
- (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3),
- (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7)),
- !if(WithStride,
- !if(WithOffset,
- (ins SrcOp:$src, i32imm:$offset, Int32Regs:$ldm),
- (ins SrcOp:$src, Int32Regs:$ldm)),
- !if(WithOffset,
- (ins SrcOp:$src, i32imm:$offset),
- (ins SrcOp:$src))),
- "wmma.load."#Abc#".sync."#Layout#".m16n16k16"#Space#"." #Type# " \t"
- #!if(!eq(Abc#Type,"cf16"),
- "{{$r0, $r1, $r2, $r3}}",
- "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
- #", "
- #!if(WithOffset,"[$src+$offset]", "[$src]")
- #!if(WithStride, ", $ldm", "")
- #";",
- []>,
- Requires<[hasPTX60, hasSM70]>;
-
-multiclass WMMA_LOAD_ALSTO<string Abc, string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand SrcOp, int WithOffset = 0> {
- def _stride: WMMA_LOAD_ALSTOS<Abc, Layout, Space, Type, regclass, SrcOp,
- WithOffset, 1>;
- def NAME: WMMA_LOAD_ALSTOS<Abc, Layout, Space, Type, regclass, SrcOp,
- WithOffset, 0>;
+
+class EmptyNVPTXInst : NVPTXInst<(outs), (ins), "?", []>;
+
+class WMMA_LOAD_GALSTOS<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass,
+ DAGOperand SrcOp, bit WithStride>
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
+ // Pattern (created by WMMA_LOAD_INTR_HELPER below) that matches the intrinsic
+ // for this function.
+ PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA_"
+ # Geometry # "_load_"
+ # !subst("c", "c_" # Type, Abc)
+ # "_" # Layout
+ # !subst(".", "_", Space)
+ # !if(WithStride,"_stride", "")
+ # "_Intr");
+ dag OutsR03 = (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3);
+ dag OutsR47 = (outs regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7);
+ dag Outs = !if(!eq(Abc#Type,"cf16"), OutsR03, !con(OutsR03, OutsR47));
+
+ dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins));
+ dag Ins = !con((ins SrcOp:$src), StrideArg);
+
+ // Build a dag pattern that matches the intrinsic call.
+ // We want a dag that looks like this:
+ // (set <output args>, (intrinsic <input arguments>)) where input and
+ // output arguments are named patterns that would match corresponding
+ // input/output arguments of the instruction.
+ //
+ // First we construct (set <output arguments>) from instruction's outs dag by
+ // replacing dag operator 'outs' with 'set'.
+ dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp));
+ // Similarly, construct (intrinsic <input arguments>) sub-dag from
+ // instruction's input arguments, only now we also need to replace operands
+ // with patterns that would match them and the operator 'ins' with the
+ // intrinsic.
+ dag PatArgs = !foreach(tmp, Ins,
+ !subst(imem, ADDRvar,
+ !subst(MEMri64, ADDRri64,
+ !subst(MEMri, ADDRri,
+ !subst(ins, IntrMatcher, tmp)))));
+ // Finally, concatenate both parts together. !con() requires both dags to have
+ // the same operator, so we wrap PatArgs in a (set ...) dag.
+ let Pattern = [!con(PatOuts, (set PatArgs))];
+ let OutOperandList = Outs;
+ let InOperandList = Ins;
+ let AsmString = "wmma.load."
+ # Abc
+ # ".sync"
+ # "." # Layout
+ # "." # Geometry
+ # Space
+ # "." # Type # " \t"
+ # !if(!eq(Abc#Type, "cf16"),
+ "{{$r0, $r1, $r2, $r3}}",
+ "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
+ # ", [$src]"
+ # !if(WithStride, ", $ldm", "")
+ # ";";
}
-multiclass WMMA_LOAD_ALST<string Abc, string Layout, string Space,
+class WMMA_LOAD_INTR_HELPER<string Geometry, string Abc, string Layout,
+ string Space, string Type, bit WithStride>
+ : PatFrag <(ops),(ops)> {
+ // Intrinsic that matches this instruction.
+ Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma"
+ # "_" # Geometry # "_load_"
+ # Abc # "_" # Type # "_" # Layout
+ # !if(WithStride,"_stride", ""));
+ code match_generic = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+ }];
+ code match_shared = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+ }];
+ code match_global = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+ }];
+
+ let Operands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src));
+ let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))];
+ let PredicateCode = !if(!eq(Space, ".shared"), match_shared,
+ !if(!eq(Space, ".global"), match_global, match_generic));
+}
+
+multiclass WMMA_LOAD_GALSTS<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+ def _avar: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ imem, WithStride>;
+ def _areg: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ Int32Regs, WithStride>;
+ def _areg64: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ Int64Regs, WithStride>;
+ def _ari: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ MEMri, WithStride>;
+ def _ari64: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
+ MEMri64, WithStride>;
+}
+
+multiclass WMMA_LOAD_GALSTSh<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+ // Define a PatFrag that matches the appropriate intrinsic that loads from the
+ // given address space.
+ def _Intr: WMMA_LOAD_INTR_HELPER<Geometry, Abc, Layout, Space, Type,
+ WithStride>;
+ defm NAME: WMMA_LOAD_GALSTS<Geometry, Abc, Layout, Space, Type, regclass,
+ WithStride>;
+}
+
+multiclass WMMA_LOAD_GALST<string Geometry, string Abc, string Layout,
+ string Space, string Type, NVPTXRegClass regclass> {
+ defm _stride: WMMA_LOAD_GALSTSh<Geometry, Abc, Layout, Space, Type, regclass, 1>;
+ defm NAME: WMMA_LOAD_GALSTSh<Geometry, Abc, Layout, Space, Type, regclass, 0>;
+}
+
+multiclass WMMA_LOAD_GALT<string Geometry, string Abc, string Layout,
string Type, NVPTXRegClass regclass> {
- defm _avar: WMMA_LOAD_ALSTO<Abc, Layout, Space, Type, regclass, imemAny, 0>;
- defm _ari64: WMMA_LOAD_ALSTO<Abc, Layout, Space, Type, regclass, imemAny, 1>;
+ defm _global: WMMA_LOAD_GALST<Geometry, Abc, Layout, ".global",
+ Type, regclass>;
+ defm _shared: WMMA_LOAD_GALST<Geometry, Abc, Layout, ".shared",
+ Type, regclass>;
+ defm NAME: WMMA_LOAD_GALST<Geometry, Abc, Layout, "",
+ Type, regclass>;
}
-multiclass WMMA_LOAD_ALT<string Abc, string Layout,
- string Type, NVPTXRegClass regclass> {
- defm _global: WMMA_LOAD_ALST<Abc, Layout, ".global", Type, regclass>;
- defm _shared: WMMA_LOAD_ALST<Abc, Layout, ".shared", Type, regclass>;
- defm NAME: WMMA_LOAD_ALST<Abc, Layout, "", Type, regclass>;
+multiclass WMMA_LOAD_GAT<string Geometry, string Abc,
+ string Type, NVPTXRegClass regclass> {
+ defm _row: WMMA_LOAD_GALT<Geometry, Abc, "row", Type, regclass>;
+ defm _col: WMMA_LOAD_GALT<Geometry, Abc, "col", Type, regclass>;
}
-multiclass WMMA_LOAD_AT<string Abc, string Type, NVPTXRegClass regclass> {
- defm _row: WMMA_LOAD_ALT<Abc, "row", Type, regclass>;
- defm _col: WMMA_LOAD_ALT<Abc, "col", Type, regclass>;
+multiclass WMMA_LOAD_G<string Geometry> {
+ defm _load_a: WMMA_LOAD_GAT<Geometry, "a", "f16", Float16x2Regs>;
+ defm _load_b: WMMA_LOAD_GAT<Geometry, "b", "f16", Float16x2Regs>;
+ defm _load_c_f16: WMMA_LOAD_GAT<Geometry, "c", "f16", Float16x2Regs>;
+ defm _load_c_f32: WMMA_LOAD_GAT<Geometry, "c", "f32", Float32Regs>;
}
-defm INT_WMMA_LOAD_A: WMMA_LOAD_AT<"a", "f16", Float16x2Regs>;
-defm INT_WMMA_LOAD_B: WMMA_LOAD_AT<"b", "f16", Float16x2Regs>;
-defm INT_WMMA_LOAD_C_f16: WMMA_LOAD_AT<"c", "f16", Float16x2Regs>;
-defm INT_WMMA_LOAD_C_f32: WMMA_LOAD_AT<"c", "f32", Float32Regs>;
+defm INT_WMMA_m32n8k16: WMMA_LOAD_G<"m32n8k16">;
+defm INT_WMMA_m16n16k16: WMMA_LOAD_G<"m16n16k16">;
+defm INT_WMMA_m8n32k16: WMMA_LOAD_G<"m8n32k16">;
//
// wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
-class WMMA_STORE_D_LSTOS<string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand DstOp, int WithOffset, int WithStride>
- : NVPTXInst<(outs),
- !if(!eq(Type,"f16"),
- !if(WithStride,
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- Int32Regs:$ldm),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- Int32Regs:$ldm)),
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3))),
- !if(WithStride,
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7,
- Int32Regs:$ldm),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7,
- Int32Regs:$ldm)),
- !if(WithOffset,
- (ins DstOp:$src, i32imm:$offset,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7),
- (ins DstOp:$src,
- regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3,
- regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7)))),
- "wmma.store.d.sync."#Layout#".m16n16k16"#Space#"." #Type# " \t"
- #!if(WithOffset,"[$src+$offset], ", "[$src], ")
- #!if(!eq(Type,"f16"),
- "{{$r0, $r1, $r2, $r3}}",
- "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
- #!if(WithStride, ", $ldm", "")
- #";",
- []>,
- Requires<[hasPTX60, hasSM70]>;
-
-multiclass WMMA_STORE_D_LSTO<string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- Operand DstOp, int WithOffset = 0> {
- def _stride: WMMA_STORE_D_LSTOS<Layout, Space, Type, regclass, DstOp,
- WithOffset, 1>;
- def NAME: WMMA_STORE_D_LSTOS<Layout, Space, Type, regclass, DstOp,
- WithOffset, 0>;
+class WMMA_STORE_D_GLSTSO<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass,
+ bit WithStride, DAGOperand DstOp>
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
+ PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA"
+ # "_" # Geometry # "_store_d"
+ # "_" # Type
+ # "_" # Layout
+ # !subst(".", "_", Space)
+ # !if(WithStride,"_stride", "")
+ # "_Intr");
+ dag InsR03 = (ins DstOp:$src, regclass:$r0, regclass:$r1,
+ regclass:$r2, regclass:$r3);
+ dag InsR47 = (ins regclass:$r4, regclass:$r5,
+ regclass:$r6, regclass:$r7);
+ dag InsR = !if(!eq(Type,"f16"), InsR03, !con(InsR03, InsR47));
+ dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins));
+ dag Ins = !con(InsR, StrideArg);
+
+ // Construct the pattern to match corresponding intrinsic call. See the
+ // details in the comments in WMMA_LOAD_GALSTOS.
+ dag PatArgs = !foreach(tmp, Ins,
+ !subst(imem, ADDRvar,
+ !subst(MEMri64, ADDRri64,
+ !subst(MEMri, ADDRri,
+ !subst(ins, IntrMatcher, tmp)))));
+ let Pattern = [PatArgs];
+ let OutOperandList = (outs);
+ let InOperandList = Ins;
+ let AsmString = "wmma.store.d.sync."
+ # Layout
+ # "." # Geometry
+ # Space
+ # "." # Type
+ # " \t[$src],"
+ # !if(!eq(Type,"f16"),
+ "{{$r0, $r1, $r2, $r3}}",
+ "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
+ # !if(WithStride, ", $ldm", "")
+ # ";";
+
+}
+
+class WMMA_STORE_INTR_HELPER<string Geometry, string Layout, string Space,
+ string Type, bit WithStride>
+ : PatFrag <(ops),(ops)> {
+ // Intrinsic that matches this instruction.
+ Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma_"
+ # Geometry
+ # "_store_d"
+ # "_" # Type
+ # "_" # Layout
+ # !if(WithStride, "_stride", ""));
+ code match_generic = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+ }];
+ code match_shared = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+ }];
+ code match_global = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+ }];
+
+ dag Args = !if(!eq(Type,"f16"),
+ (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3),
+ (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3,
+ node:$r4, node:$r5, node:$r6, node:$r7));
+ dag StrideArg = !if(WithStride, (ops node:$ldm), (ops));
+ let Operands = !con(Args, StrideArg);
+ let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))];
+ let PredicateCode = !if(!eq(Space, ".shared"), match_shared,
+ !if(!eq(Space, ".global"), match_global, match_generic));
+}
+
+multiclass WMMA_STORE_D_GLSTS<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+ def _avar: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, imem>;
+ def _areg: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, Int32Regs>;
+ def _areg64: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, Int64Regs>;
+ def _ari: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, MEMri>;
+ def _ari64: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
+ WithStride, MEMri64>;
+}
+
+multiclass WMMA_STORE_D_GLSTSh<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass,
+ bit WithStride> {
+ // Define a PatFrag that matches the appropriate intrinsic that stores to the
+ // given address space.
+ def _Intr: WMMA_STORE_INTR_HELPER<Geometry, Layout, Space, Type,
+ WithStride>;
+ defm NAME: WMMA_STORE_D_GLSTS<Geometry, Layout, Space, Type, regclass,
+ WithStride>;
}
-multiclass WMMA_STORE_D_LST<string Layout, string Space,
- string Type, NVPTXRegClass regclass> {
- defm _avar: WMMA_STORE_D_LSTO<Layout, Space, Type, regclass, imemAny, 0>;
- defm _ari64: WMMA_STORE_D_LSTO<Layout, Space, Type, regclass, imemAny, 1>;
+multiclass WMMA_STORE_D_GLST<string Geometry, string Layout, string Space,
+ string Type, NVPTXRegClass regclass > {
+ defm _stride: WMMA_STORE_D_GLSTSh<Geometry, Layout, Space, Type, regclass, 1>;
+ defm NAME: WMMA_STORE_D_GLSTSh<Geometry, Layout, Space, Type, regclass, 0>;
}
-multiclass WMMA_STORE_D_LT<string Layout,
+multiclass WMMA_STORE_D_GLT<string Geometry, string Layout,
string Type, NVPTXRegClass regclass> {
- defm _global: WMMA_STORE_D_LST<Layout, ".global", Type, regclass>;
- defm _shared: WMMA_STORE_D_LST<Layout, ".shared", Type, regclass>;
- defm NAME: WMMA_STORE_D_LST<Layout, "", Type, regclass>;
+ defm _global: WMMA_STORE_D_GLST<Geometry, Layout, ".global", Type, regclass>;
+ defm _shared: WMMA_STORE_D_GLST<Geometry, Layout, ".shared", Type, regclass>;
+ defm NAME: WMMA_STORE_D_GLST<Geometry, Layout, "", Type, regclass>;
+}
+
+multiclass WMMA_STORE_D_GT<string Geometry, string Type,
+ NVPTXRegClass regclass> {
+ defm _row: WMMA_STORE_D_GLT<Geometry, "row", Type, regclass>;
+ defm _col: WMMA_STORE_D_GLT<Geometry, "col", Type, regclass>;
}
-multiclass WMMA_STORE_D_T<string Type, NVPTXRegClass regclass> {
- defm _row: WMMA_STORE_D_LT<"row", Type, regclass>;
- defm _col: WMMA_STORE_D_LT<"col", Type, regclass>;
+multiclass WMMA_STORE_D_G<string Geometry> {
+ defm _store_d_f16: WMMA_STORE_D_GT<Geometry, "f16", Float16x2Regs>;
+ defm _store_d_f32: WMMA_STORE_D_GT<Geometry, "f32", Float32Regs>;
}
-defm INT_WMMA_STORE_D_f16: WMMA_STORE_D_T<"f16", Float16x2Regs>;
-defm INT_WMMA_STORE_D_f32: WMMA_STORE_D_T<"f32", Float32Regs>;
+defm INT_WMMA_m32n8k16: WMMA_STORE_D_G<"m32n8k16">;
+defm INT_WMMA_m16n16k16: WMMA_STORE_D_G<"m16n16k16">;
+defm INT_WMMA_m8n32k16: WMMA_STORE_D_G<"m8n32k16">;
// WMMA.MMA
-class WMMA_MMA_ABDCS<string ALayout, string BLayout,
+class WMMA_MMA_GABDCS<string Geometry, string ALayout, string BLayout,
string DType, NVPTXRegClass d_reg,
string CType, NVPTXRegClass c_reg,
NVPTXRegClass ab_reg,
string Satfinite = "">
- : NVPTXInst<!if(!eq(DType,"f16"),
- (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3),
- (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3,
- d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7)),
- !if(!eq(CType,"f16"),
- (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
- ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
- ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
- ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
- c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3),
- (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
- ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
- ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
- ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
- c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3,
- c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7)),
- "wmma.mma.sync."#ALayout#"."#BLayout#".m16n16k16."#
- #DType#"."#CType#Satfinite
- #"\n\t\t"
- #!if(!eq(DType,"f16"),
- "{{$d0, $d1, $d2, $d3}}, \n\t\t",
- "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t")
- #"{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t"
- #"{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t"
- #!if(!eq(CType,"f16"),
- "{{$c0, $c1, $c2, $c3}};",
- "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};"),
- []>,
- Requires<[hasPTX60, hasSM70]>;
-
-multiclass WMMA_MMA_ABDC<string ALayout, string BLayout,
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
+ Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma_"
+ # Geometry
+ # "_mma"
+ # "_" # ALayout
+ # "_" # BLayout
+ # "_" # DType
+ # "_" # CType
+ # !subst(".", "_", Satfinite));
+ dag Outs = !if(!eq(DType,"f16"),
+ (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3),
+ (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3,
+ d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7));
+ dag InsExtraCArgs = !if(!eq(CType,"f16"),
+ (ins),
+ (ins c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7));
+ dag Ins = !con((ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
+ ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
+ ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
+ ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
+ c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3),
+ InsExtraCArgs);
+
+ // Construct the pattern to match corresponding intrinsic call. See the
+ // details in the comments in WMMA_LOAD_GALSTOS.
+ dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp));
+ dag PatArgs = !foreach(tmp, Ins, !subst(ins, Intr, tmp));
+ let Pattern = [!con(PatOuts, (set PatArgs))];
+ let OutOperandList = Outs;
+ let InOperandList = Ins;
+ let AsmString = "wmma.mma.sync."
+ # ALayout
+ # "." # BLayout
+ # "." # Geometry
+ # "." # DType
+ # "." # CType
+ # Satfinite # "\n\t\t"
+ # !if(!eq(DType,"f16"),
+ "{{$d0, $d1, $d2, $d3}}, \n\t\t",
+ "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t")
+ # "{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t"
+ # "{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t"
+ # !if(!eq(CType,"f16"),
+ "{{$c0, $c1, $c2, $c3}};",
+ "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};");
+}
+
+multiclass WMMA_MMA_GABDC<string Geometry, string ALayout, string BLayout,
string DType, NVPTXRegClass d_reg,
string CType, NVPTXRegClass c_reg> {
- def _satfinite: WMMA_MMA_ABDCS<ALayout, BLayout,
+ def _satfinite: WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
DType, d_reg, CType, c_reg,
Float16x2Regs, ".satfinite">;
- def NAME: WMMA_MMA_ABDCS<ALayout, BLayout,
+ def NAME: WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
DType, d_reg, CType, c_reg,
Float16x2Regs>;
}
-multiclass WMMA_MMA_ABD<string ALayout, string BLayout,
+multiclass WMMA_MMA_GABD<string Geometry, string ALayout, string BLayout,
string DType, NVPTXRegClass d_reg> {
- defm _f16: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f16", Float16x2Regs>;
- defm _f32: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f32", Float32Regs>;
+ defm _f16: WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_reg,
+ "f16", Float16x2Regs>;
+ defm _f32: WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_reg,
+ "f32", Float32Regs>;
}
-multiclass WMMA_MMA_AB<string ALayout, string BLayout> {
- defm _f16: WMMA_MMA_ABD<ALayout, BLayout, "f16", Float16x2Regs>;
- defm _f32: WMMA_MMA_ABD<ALayout, BLayout, "f32", Float32Regs>;
+multiclass WMMA_MMA_GAB<string Geometry, string ALayout, string BLayout> {
+ defm _f16: WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f16", Float16x2Regs>;
+ defm _f32: WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f32", Float32Regs>;
}
-multiclass WMMA_MMA_A<string ALayout> {
- defm _col: WMMA_MMA_AB<ALayout, "col">;
- defm _row: WMMA_MMA_AB<ALayout, "row">;
+multiclass WMMA_MMA_GA<string Geometry, string ALayout> {
+ defm _col: WMMA_MMA_GAB<Geometry, ALayout, "col">;
+ defm _row: WMMA_MMA_GAB<Geometry, ALayout, "row">;
}
-defm INT_WMMA_MMA_col: WMMA_MMA_A<"col">;
-defm INT_WMMA_MMA_row: WMMA_MMA_A<"row">;
+multiclass WMMA_MMA_G<string Geometry> {
+ defm _col: WMMA_MMA_GA<Geometry, "col">;
+ defm _row: WMMA_MMA_GA<Geometry, "row">;
+}
+defm INT_WMMA_MMA_m32n8k16 : WMMA_MMA_G<"m32n8k16">;
+defm INT_WMMA_MMA_m16n16k16 : WMMA_MMA_G<"m16n16k16">;
+defm INT_WMMA_MMA_m8n32k16 : WMMA_MMA_G<"m8n32k16">;
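The _Intr PatFrag helpers above pick their PredicateCode based on the Space string, and each predicate simply calls ChkMemSDNodeAddressSpace() with the matching NVPTX address space. As a rough illustration of what such a check has to do, here is a minimal C++ sketch; the function body is an assumption for illustration, not the actual NVPTX implementation:

    // Sketch: decide whether the memory intrinsic node N touches the wanted
    // address space. Assumes N is (or wraps) a MemSDNode.
    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    static bool ChkMemSDNodeAddressSpaceSketch(SDNode *N, unsigned Wanted) {
      const MemSDNode *Mem = dyn_cast<MemSDNode>(N);
      if (!Mem)
        return false;
      return Mem->getAddressSpace() == Wanted;
    }

With a predicate of that shape, the .shared and .global instruction variants only match intrinsics whose pointer is already known to live in those spaces, while the generic variant matches pointers in the generic address space.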
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 1402033b9e60..5bb4fc3edd09 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -97,10 +97,12 @@ AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
Offset = (Offset + Align - 1) / Align * Align;
if (StackGrowsDown) {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset
+ << "]\n");
MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
} else {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset
+ << "]\n");
MFI.setObjectOffset(FrameIdx, Offset);
Offset += MFI.getObjectSize(FrameIdx);
}
@@ -163,14 +165,14 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Adjust to alignment boundary.
Offset = (Offset + Align - 1) / Align * Align;
- DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+ LLVM_DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
// Resolve offsets for objects in the local block.
for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
- DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
- FIOffset << "]\n");
+ LLVM_DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << FIOffset
+ << "]\n");
MFI.setObjectOffset(Entry.first, FIOffset);
}
// Allocate the local block
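The only change in this file is the mechanical rename of the DEBUG macro to LLVM_DEBUG. Both guard the statement behind NDEBUG and the pass's DEBUG_TYPE, so the output appears only in assertion-enabled builds when the matching -debug-only tag is given. A minimal usage sketch (the tag and message here are made up for illustration):

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    #define DEBUG_TYPE "nvptx-prolog-epilog"  // illustrative tag, not taken from the file

    static void noteFrameOffset(int FrameIdx, int64_t Offset) {
      // Printed only with -debug-only=nvptx-prolog-epilog in an asserts build.
      LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
    }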
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
deleted file mode 100644
index d736eaa41301..000000000000
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===- NVPTXSection.h - NVPTX-specific section representation ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the NVPTXSection class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
-#define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
-
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/SectionKind.h"
-
-namespace llvm {
-
-/// Represents a section in PTX PTX does not have sections. We create this class
-/// in order to use the ASMPrint interface.
-///
-class NVPTXSection final : public MCSection {
- virtual void anchor();
-
-public:
- NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {}
- ~NVPTXSection() = default;
-
- /// Override this as NVPTX has its own way of printing switching
- /// to a section.
- void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- const MCExpr *Subsection) const override {}
-
- /// Base address of PTX sections is zero.
- bool UseCodeAlign() const override { return false; }
- bool isVirtualSection() const override { return false; }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 3a0bfd221b0b..b02822a099d9 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -48,10 +48,6 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// FrameLowering class because TargetFrameLowering is abstract.
NVPTXFrameLowering FrameLowering;
-protected:
- // Processor supports scoped atomic operations.
- bool HasAtomScope;
-
public:
/// This constructor initializes the data members to match that
/// of the specified module.
@@ -73,37 +69,15 @@ public:
return &TSInfo;
}
- bool hasBrkPt() const { return SmVersion >= 11; }
- bool hasAtomRedG32() const { return SmVersion >= 11; }
- bool hasAtomRedS32() const { return SmVersion >= 12; }
- bool hasAtomRedG64() const { return SmVersion >= 12; }
- bool hasAtomRedS64() const { return SmVersion >= 20; }
- bool hasAtomRedGen32() const { return SmVersion >= 20; }
- bool hasAtomRedGen64() const { return SmVersion >= 20; }
- bool hasAtomAddF32() const { return SmVersion >= 20; }
bool hasAtomAddF64() const { return SmVersion >= 60; }
- bool hasAtomScope() const { return HasAtomScope; }
+ bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
- bool hasVote() const { return SmVersion >= 12; }
- bool hasDouble() const { return SmVersion >= 13; }
- bool reqPTX20() const { return SmVersion >= 20; }
- bool hasF32FTZ() const { return SmVersion >= 20; }
- bool hasFMAF32() const { return SmVersion >= 20; }
- bool hasFMAF64() const { return SmVersion >= 13; }
bool hasLDG() const { return SmVersion >= 32; }
- bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
- bool hasGenericLdSt() const { return SmVersion >= 20; }
inline bool hasHWROT32() const { return SmVersion >= 32; }
- inline bool hasSWROT32() const {
- return ((SmVersion >= 20) && (SmVersion < 32));
- }
- inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
- inline bool hasROT64() const { return SmVersion >= 20; }
bool hasImageHandles() const;
bool hasFP16Math() const { return SmVersion >= 53; }
bool allowFP16Math() const;
-
unsigned int getSmVersion() const { return SmVersion; }
std::string getTargetName() const { return TargetName; }
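After this cleanup the subtarget only keeps feature queries that still vary across supported GPUs, and each one reduces to a comparison against SmVersion (hasAtomScope(), for instance, is now simply sm_60 or newer rather than a separately tracked flag). A minimal sketch of how lowering code typically consults these predicates; the helper function itself is hypothetical:

    // Hypothetical caller; NVPTXSubtarget is the class patched above.
    static bool canUseScopedF64AtomicAdd(const NVPTXSubtarget &ST) {
      // Both atom.add.f64 and scoped atomics require sm_60 or newer.
      return ST.hasAtomAddF64() && ST.hasAtomScope();
    }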
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index cb8cc7bb347a..a1b160441df3 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -52,6 +52,12 @@ static cl::opt<bool> DisableRequireStructuredCFG(
"unexpected regressions happen."),
cl::init(false), cl::Hidden);
+static cl::opt<bool> UseShortPointersOpt(
+ "nvptx-short-ptr",
+ cl::desc(
+ "Use 32-bit pointers for accessing const/local/shared address spaces."),
+ cl::init(false), cl::Hidden);
+
namespace llvm {
void initializeNVVMIntrRangePass(PassRegistry&);
@@ -83,11 +89,13 @@ extern "C" void LLVMInitializeNVPTXTarget() {
initializeNVPTXLowerAggrCopiesPass(PR);
}
-static std::string computeDataLayout(bool is64Bit) {
+static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
std::string Ret = "e";
if (!is64Bit)
Ret += "-p:32:32";
+ else if (UseShortPointers)
+ Ret += "-p3:32:32-p4:32:32-p5:32:32";
Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
@@ -108,9 +116,11 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool is64bit)
// The pic relocation model is used regardless of what the client has
// specified, as it is the only relocation model currently supported.
- : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
- Reloc::PIC_, getEffectiveCodeModel(CM), OL),
- is64bit(is64bit), TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+ : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
+ CPU, FS, Options, Reloc::PIC_,
+ getEffectiveCodeModel(CM), OL),
+ is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
+ TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
@@ -238,9 +248,11 @@ void NVPTXPassConfig::addIRPasses() {
disablePass(&TailDuplicateID);
disablePass(&StackMapLivenessID);
disablePass(&LiveDebugValuesID);
+ disablePass(&PostRAMachineSinkingID);
disablePass(&PostRASchedulerID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ disablePass(&ShrinkWrapID);
// NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
// it here does nothing. But since we need it for correctness when lowering
@@ -333,7 +345,7 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
addPass(&StackSlotColoringID);
// FIXME: Needs physical registers
- //addPass(&PostRAMachineLICMID);
+ //addPass(&MachineLICMID);
printAndVerify("After StackSlotColoring");
}
@@ -368,7 +380,7 @@ void NVPTXPassConfig::addMachineSSAOptimization() {
if (addILPOpts())
printAndVerify("After ILP optimizations");
- addPass(&MachineLICMID);
+ addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&MachineSinkingID);
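The new -nvptx-short-ptr option only affects the module data layout string built by computeDataLayout(): in 64-bit mode it marks the shared (3), const (4) and local (5) address spaces as 32-bit while leaving generic pointers at 64 bits. A small standalone sketch that mirrors the patched function and prints the two 64-bit variants:

    #include <iostream>
    #include <string>

    // Mirrors the patched computeDataLayout() above (illustration only).
    static std::string computeDataLayout(bool Is64Bit, bool UseShortPointers) {
      std::string Ret = "e";
      if (!Is64Bit)
        Ret += "-p:32:32";
      else if (UseShortPointers)
        Ret += "-p3:32:32-p4:32:32-p5:32:32"; // shared, const, local -> 32-bit
      Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
      return Ret;
    }

    int main() {
      std::cout << computeDataLayout(true, false) << "\n";
      // e-i64:64-i128:128-v16:16-v32:32-n16:32:64
      std::cout << computeDataLayout(true, true) << "\n";
      // e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64
    }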
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index eeebf64d39c3..ca540b8e0389 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -26,6 +26,8 @@ namespace llvm {
///
class NVPTXTargetMachine : public LLVMTargetMachine {
bool is64bit;
+ // Use 32-bit pointers for accessing const/local/shared AS.
+ bool UseShortPointers;
std::unique_ptr<TargetLoweringObjectFile> TLOF;
NVPTX::DrvInterface drvInterface;
NVPTXSubtarget Subtarget;
@@ -45,6 +47,7 @@ public:
}
const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
bool is64Bit() const { return is64bit; }
+ bool useShortPointers() const { return UseShortPointers; }
NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
ManagedStringPool *getManagedStrPool() const {
return const_cast<ManagedStringPool *>(&ManagedStrPool);
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index d16269f6ebea..c706b053ab8f 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -10,77 +10,20 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
-#include "NVPTXSection.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
public:
- NVPTXTargetObjectFile() {
- TextSection = nullptr;
- DataSection = nullptr;
- BSSSection = nullptr;
- ReadOnlySection = nullptr;
-
- StaticCtorSection = nullptr;
- StaticDtorSection = nullptr;
- LSDASection = nullptr;
- EHFrameSection = nullptr;
- DwarfAbbrevSection = nullptr;
- DwarfInfoSection = nullptr;
- DwarfLineSection = nullptr;
- DwarfFrameSection = nullptr;
- DwarfPubTypesSection = nullptr;
- DwarfDebugInlineSection = nullptr;
- DwarfStrSection = nullptr;
- DwarfLocSection = nullptr;
- DwarfARangesSection = nullptr;
- DwarfRangesSection = nullptr;
- DwarfMacinfoSection = nullptr;
- }
+ NVPTXTargetObjectFile() : TargetLoweringObjectFile() {}
~NVPTXTargetObjectFile() override;
void Initialize(MCContext &ctx, const TargetMachine &TM) override {
TargetLoweringObjectFile::Initialize(ctx, TM);
- TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
- DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData());
- BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
- ReadOnlySection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
- StaticCtorSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- StaticDtorSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- LSDASection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- EHFrameSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfAbbrevSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfInfoSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfLineSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfFrameSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfPubTypesSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfDebugInlineSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfStrSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfLocSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfARangesSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfRangesSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
- DwarfMacinfoSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
}
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d2414b72a009..a631055d36a0 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -49,6 +49,26 @@ public:
return AddressSpace::ADDRESS_SPACE_GENERIC;
}
+ // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
+ // We conservatively return 1 here which is just enough to enable the
+ // vectorizers but disables heuristics based on the number of registers.
+ // FIXME: Return a more reasonable number, while keeping an eye on
+ // LoopVectorizer's unrolling heuristics.
+ unsigned getNumberOfRegisters(bool Vector) const { return 1; }
+
+ // Only <2 x half> should be vectorized, so always return 32 for the vector
+ // register size.
+ unsigned getRegisterBitWidth(bool Vector) const { return 32; }
+ unsigned getMinVectorRegisterBitWidth() const { return 32; }
+
+ // We don't want to prevent inlining because of target-cpu and -features
+ // attributes that were added to newer versions of LLVM/Clang: There are
+ // no incompatible functions in PTX, ptxas will throw errors in such cases.
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ return true;
+ }
+
// Increase the inlining cost threshold by a factor of 5, reflecting that
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() { return 5; }
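The new TTI hooks mainly steer the vectorizers: a 32-bit vector register width means only <2 x half> is worth forming, and reporting a single register keeps register-pressure heuristics from influencing interleave and unroll decisions. A rough sketch of the kind of query a vectorizer makes against these hooks (simplified; the real callers live in the vectorizer passes):

    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    static unsigned maxProfitableVectorBits(const TargetTransformInfo &TTI) {
      // NVPTX reports 32 here, so only a 2 x f16 vector fits.
      unsigned RegBits = TTI.getRegisterBitWidth(/*Vector=*/true);
      // NVPTX reports 1 here: enough to enable vectorization, but no
      // unrolling based on a large register file.
      unsigned NumRegs = TTI.getNumberOfRegisters(/*Vector=*/true);
      return NumRegs ? RegBits : 0;
    }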
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 152b665d0fdc..60971b48adfc 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -153,7 +153,7 @@ bool NVVMReflect::runOnFunction(Function &F) {
StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
- DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
+ LLVM_DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
int ReflectVal = 0; // The default value is 0
if (ReflectArg == "__CUDA_FTZ") {
diff --git a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp
index 3971630c6beb..8ac08c6837d9 100644
--- a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;
@@ -111,21 +112,19 @@ Nios2AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-std::unique_ptr<MCObjectWriter>
-Nios2AsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createNios2ELFObjectWriter(OS,
- MCELFObjectTargetWriter::getOSABI(OSType));
+std::unique_ptr<MCObjectTargetWriter>
+Nios2AsmBackend::createObjectTargetWriter() const {
+ return createNios2ELFObjectWriter(MCELFObjectTargetWriter::getOSABI(OSType));
}
-bool Nios2AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool Nios2AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
return true;
}
// MCAsmBackend
MCAsmBackend *llvm::createNios2AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
-
- return new Nios2AsmBackend(T, TT.getOS());
+ return new Nios2AsmBackend(T, STI.getTargetTriple().getOS());
}
diff --git a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h
index 0aa42043ee2a..1f114bd869b1 100644
--- a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h
+++ b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2AsmBackend.h
@@ -31,12 +31,12 @@ class Nios2AsmBackend : public MCAsmBackend {
public:
Nios2AsmBackend(const Target &T, Triple::OSType OSType)
- : MCAsmBackend(), OSType(OSType) {}
+ : MCAsmBackend(support::little), OSType(OSType) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
diff --git a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp
index 04f727ad390c..db432d15120d 100644
--- a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2ELFObjectWriter.cpp
@@ -37,8 +37,7 @@ unsigned Nios2ELFObjectWriter::getRelocType(MCContext &Ctx,
return 0;
}
-std::unique_ptr<MCObjectWriter>
-llvm::createNios2ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- auto MOTW = llvm::make_unique<Nios2ELFObjectWriter>(OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createNios2ELFObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<Nios2ELFObjectWriter>(OSABI);
}
diff --git a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
index d918a066acae..a7c4b16c6a3b 100644
--- a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
+++ b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
@@ -18,8 +18,9 @@
namespace llvm {
class MCAsmBackend;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
+class MCSubtargetInfo;
class MCTargetOptions;
class Target;
class Triple;
@@ -28,12 +29,11 @@ class raw_pwrite_stream;
Target &getTheNios2Target();
-MCAsmBackend *createNios2AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU,
+MCAsmBackend *createNios2AsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createNios2ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createNios2ELFObjectWriter(uint8_t OSABI);
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp
index b7e1bc36a6d3..795fd0084aa3 100644
--- a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2TargetStreamer.cpp
@@ -19,4 +19,4 @@ Nios2TargetStreamer::Nios2TargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
Nios2TargetAsmStreamer::Nios2TargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
- : Nios2TargetStreamer(S), OS(OS) {}
+ : Nios2TargetStreamer(S) {}
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp
index 31d04ebe447e..5f9679466115 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Nios2/Nios2ISelDAGToDAG.cpp
@@ -59,12 +59,9 @@ public:
// expanded, promoted and normal instructions
void Nios2DAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2ISelLowering.cpp b/contrib/llvm/lib/Target/Nios2/Nios2ISelLowering.cpp
index 99aa43f960c1..008ce1570722 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Nios2/Nios2ISelLowering.cpp
@@ -32,9 +32,38 @@ Nios2TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
+ // CCValAssign - represent the assignment of
+ // the return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+ // Analyze return values.
+ CCInfo.CheckReturn(Outs, RetCC_Nios2EABI);
+ SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ SDValue Val = OutVals[i];
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+ Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getLocVT(), Val);
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
return DAG.getNode(Nios2ISD::Ret, DL, MVT::Other, RetOps);
}
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td b/contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td
index 58578501d804..f57bf03bba3c 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td
+++ b/contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td
@@ -20,14 +20,44 @@ class Format<bits<6> val> {
bits<6> Value = val;
}
-def Pseudo : Format<0>;
-def FrmI : Format<1>;
-def FrmR : Format<2>;
-def FrmJ : Format<3>;
-def FrmOther : Format<4>; // Instruction w/ a custom format
+def Pseudo : Format<0>;
+// Nios2 R1 instr formats:
+def FrmI : Format<1>;
+def FrmR : Format<2>;
+def FrmJ : Format<3>;
+def FrmOther : Format<4>; // Instruction w/ a custom format
+// Nios2 R2 instr 32-bit formats:
+def FrmL26 : Format<5>; // corresponds to J format in R1
+def FrmF2I16 : Format<6>; // corresponds to I format in R1
+def FrmF2X4I12 : Format<7>;
+def FrmF1X4I12 : Format<8>;
+def FrmF1X4L17 : Format<9>;
+def FrmF3X6L5 : Format<10>; // corresponds to R format in R1
+def FrmF2X6L10 : Format<11>;
+def FrmF3X6 : Format<12>; // corresponds to R format in R1
+def FrmF3X8 : Format<13>; // corresponds to custom format in R1
+// Nios2 R2 instr 16-bit formats:
+def FrmI10 : Format<14>;
+def FrmT1I7 : Format<15>;
+def FrmT2I4 : Format<16>;
+def FrmT1X1I6 : Format<17>;
+def FrmX1I7 : Format<18>;
+def FrmL5I4X1 : Format<19>;
+def FrmT2X1L3 : Format<20>;
+def FrmT2X1I3 : Format<21>;
+def FrmT3X1 : Format<22>;
+def FrmT2X3 : Format<23>;
+def FrmF1X1 : Format<24>;
+def FrmX2L5 : Format<25>;
+def FrmF1I5 : Format<26>;
+def FrmF2 : Format<27>;
-def isNios2r1 : Predicate<"Subtarget->isNios2r1()">;
-def isNios2r2 : Predicate<"Subtarget->isNios2r2()">;
+//===----------------------------------------------------------------------===//
+// Instruction Predicates:
+//===----------------------------------------------------------------------===//
+
+def isNios2r1 : Predicate<"Subtarget->isNios2r1()">;
+def isNios2r2 : Predicate<"Subtarget->isNios2r2()">;
class PredicateControl {
// Predicates related to specific target CPU features
@@ -151,6 +181,27 @@ class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
}
//===----------------------------------------------------------------------===//
+// Format F3X6 (R2) instruction : <|opx|RSV|C|B|A|opcode|>
+//===----------------------------------------------------------------------===//
+
+class F3X6<bits<6> opx, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>:
+ Nios2R2Inst32<outs, ins, asmstr, pattern, itin, FrmF3X6> {
+ bits<5> rC;
+ bits<5> rB;
+ bits<5> rA;
+ bits<5> rsv = 0;
+
+ let Opcode = 0x20; /* opcode is always 0x20 (OPX group) for F3X6 instr. */
+
+ let Inst{31-26} = opx; /* opx stands for opcode extension */
+ let Inst{25-21} = rsv;
+ let Inst{20-16} = rC;
+ let Inst{15-11} = rB;
+ let Inst{10-6} = rA;
+}
+
+//===----------------------------------------------------------------------===//
// Multiclasses for common instructions of both R1 and R2:
//===----------------------------------------------------------------------===//
@@ -160,6 +211,7 @@ multiclass CommonInstr_R_F3X6_opx<bits<6> opxR1, bits<6> opxR2, dag outs,
dag ins, string asmstr, list<dag> pattern,
InstrItinClass itin> {
def NAME#_R1 : FR<opxR1, outs, ins, asmstr, pattern, itin>;
+ def NAME#_R2 : F3X6<opxR2, outs, ins, asmstr, pattern, itin>;
}
// Multiclass for instructions that have R format in R1 and F3X6 format in R2
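A standalone sketch of the bit packing the F3X6 record above describes, assuming the parent Nios2R2Inst32 record places the fixed Opcode value 0x20 in bits 5-0; the helper is hypothetical and only restates the field layout.

#include <cstdint>

// opx in bits 31-26, reserved zeroes in 25-21, rC/rB/rA in 20-16/15-11/10-6,
// and the OPX-group opcode 0x20 in bits 5-0.
static uint32_t encodeF3X6(unsigned Opx, unsigned RC, unsigned RB, unsigned RA) {
  uint32_t Inst = 0;
  Inst |= (Opx & 0x3Fu) << 26;
  Inst |= (RC & 0x1Fu) << 16;
  Inst |= (RB & 0x1Fu) << 11;
  Inst |= (RA & 0x1Fu) << 6;
  Inst |= 0x20u;
  return Inst;
}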
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.cpp b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.cpp
index df435d2715d7..9700cba3595b 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.cpp
@@ -41,3 +41,14 @@ bool Nios2InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB.erase(MI);
return true;
}
+
+void Nios2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ unsigned opc = Subtarget.hasNios2r2() ? Nios2::ADD_R2 : Nios2::ADD_R1;
+ BuildMI(MBB, I, DL, get(opc))
+ .addReg(DestReg, RegState::Define)
+ .addReg(Nios2::ZERO)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+}
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.h b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.h
index a994d3662db2..52f6e7e9c7c8 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.h
+++ b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.h
@@ -39,6 +39,10 @@ public:
const Nios2RegisterInfo &getRegisterInfo() const { return RI; };
bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
};
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td
index 7a39b31a25a8..dee84f74bcbe 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td
+++ b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td
@@ -30,6 +30,10 @@ def simm16 : Operand<i32> {
// e.g. addi, andi
def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+// Custom return SDNode
+def Nios2Ret : SDNode<"Nios2ISD::Ret", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
@@ -45,6 +49,16 @@ multiclass ArithLogicRegImm16<bits<6> op, string mnemonic, SDNode opNode,
(opNode CPURegs:$rA, immType:$imm))],
IIAlu>;
+// Arithmetic and logical instructions with 3 register operands.
+// Defines the R1 and R2 instructions at the same time.
+multiclass ArithLogicReg<bits<6> opx, string mnemonic,
+ SDNode opNode>:
+ CommonInstr_R_F3X6<opx, (outs CPURegs:$rC),
+ (ins CPURegs:$rA, CPURegs:$rB),
+ !strconcat(mnemonic, "\t$rC, $rA, $rB"),
+ [(set CPURegs:$rC, (opNode CPURegs:$rA, CPURegs:$rB))],
+ IIAlu>;
+
multiclass Return<bits<6> opx, dag outs, dag ins, string mnemonic> {
let rB = 0, rC = 0,
isReturn = 1,
@@ -55,14 +69,31 @@ multiclass Return<bits<6> opx, dag outs, dag ins, string mnemonic> {
}
}
-// Custom return SDNode
-def Nios2Ret : SDNode<"Nios2ISD::Ret", SDTNone,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
//===----------------------------------------------------------------------===//
// Nios2 Instructions
//===----------------------------------------------------------------------===//
+/// Arithmetic instructions operating on registers.
+let isCommutable = 1 ,
+ isReMaterializable = 1 in {
+ defm ADD : ArithLogicReg<0x31, "add", add>;
+ defm AND : ArithLogicReg<0x0e, "and", and>;
+ defm OR : ArithLogicReg<0x16, "or", or>;
+ defm XOR : ArithLogicReg<0x1e, "xor", xor>;
+ defm MUL : ArithLogicReg<0x27, "mul", mul>;
+}
+
+let isReMaterializable = 1 in {
+ defm SUB : ArithLogicReg<0x39, "sub", sub>;
+}
+
+defm DIVU : ArithLogicReg<0x24, "divu", udiv>;
+defm DIV : ArithLogicReg<0x25, "div", sdiv>;
+
+defm SLL : ArithLogicReg<0x13, "sll", shl>;
+defm SRL : ArithLogicReg<0x1b, "srl", srl>;
+defm SRA : ArithLogicReg<0x3b, "sra", sra>;
+
/// Arithmetic Instructions (ALU Immediate)
defm ADDI : ArithLogicRegImm16<0x04, "addi", add, simm16, immSExt16>;
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2TargetObjectFile.h b/contrib/llvm/lib/Target/Nios2/Nios2TargetObjectFile.h
index 28d7ff0ec668..e9ed6e31d937 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2TargetObjectFile.h
+++ b/contrib/llvm/lib/Target/Nios2/Nios2TargetObjectFile.h
@@ -16,8 +16,6 @@
namespace llvm {
class Nios2TargetObjectFile : public TargetLoweringObjectFileELF {
- const Nios2TargetMachine *TM;
-
public:
Nios2TargetObjectFile() : TargetLoweringObjectFileELF() {}
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2TargetStreamer.h b/contrib/llvm/lib/Target/Nios2/Nios2TargetStreamer.h
index 63e4e3ccdc64..1520ac27e94f 100644
--- a/contrib/llvm/lib/Target/Nios2/Nios2TargetStreamer.h
+++ b/contrib/llvm/lib/Target/Nios2/Nios2TargetStreamer.h
@@ -22,8 +22,6 @@ public:
// This part is for ascii assembly output
class Nios2TargetAsmStreamer : public Nios2TargetStreamer {
- formatted_raw_ostream &OS;
-
public:
Nios2TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
};
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index d6db354e0215..56307a84f2e5 100644
--- a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -83,6 +83,16 @@ static const MCPhysReg FRegs[32] = {
PPC::F24, PPC::F25, PPC::F26, PPC::F27,
PPC::F28, PPC::F29, PPC::F30, PPC::F31
};
+static const MCPhysReg SPERegs[32] = {
+ PPC::S0, PPC::S1, PPC::S2, PPC::S3,
+ PPC::S4, PPC::S5, PPC::S6, PPC::S7,
+ PPC::S8, PPC::S9, PPC::S10, PPC::S11,
+ PPC::S12, PPC::S13, PPC::S14, PPC::S15,
+ PPC::S16, PPC::S17, PPC::S18, PPC::S19,
+ PPC::S20, PPC::S21, PPC::S22, PPC::S23,
+ PPC::S24, PPC::S25, PPC::S26, PPC::S27,
+ PPC::S28, PPC::S29, PPC::S30, PPC::S31
+};
static const MCPhysReg VFRegs[32] = {
PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
@@ -648,6 +658,16 @@ public:
Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
}
+ void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(RRegs[getReg()]));
+ }
+
+ void addRegSPERCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(SPERegs[getReg()]));
+ }
+
void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(CRBITRegs[getCRBit()]));
@@ -1394,6 +1414,12 @@ ExtractModifierFromExpr(const MCExpr *E,
case MCSymbolRefExpr::VK_PPC_HA:
Variant = PPCMCExpr::VK_PPC_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_HIGH:
+ Variant = PPCMCExpr::VK_PPC_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHA:
+ Variant = PPCMCExpr::VK_PPC_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_HIGHER:
Variant = PPCMCExpr::VK_PPC_HIGHER;
break;
@@ -1973,6 +1999,10 @@ PPCAsmParser::applyModifierToExpr(const MCExpr *E,
return PPCMCExpr::create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
case MCSymbolRefExpr::VK_PPC_HA:
return PPCMCExpr::create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGH:
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGH, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGHA:
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHA, E, false, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHER:
return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHERA:
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 11d22377611b..db01271b87e1 100644
--- a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -226,6 +226,17 @@ static const unsigned QFRegs[] = {
PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
};
+static const unsigned SPERegs[] = {
+ PPC::S0, PPC::S1, PPC::S2, PPC::S3,
+ PPC::S4, PPC::S5, PPC::S6, PPC::S7,
+ PPC::S8, PPC::S9, PPC::S10, PPC::S11,
+ PPC::S12, PPC::S13, PPC::S14, PPC::S15,
+ PPC::S16, PPC::S17, PPC::S18, PPC::S19,
+ PPC::S20, PPC::S21, PPC::S22, PPC::S23,
+ PPC::S24, PPC::S25, PPC::S26, PPC::S27,
+ PPC::S28, PPC::S29, PPC::S30, PPC::S31
+};
+
template <std::size_t N>
static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
const unsigned (&Regs)[N]) {
@@ -327,6 +338,18 @@ static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, QFRegs);
}
+static DecodeStatus DecodeSPE4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, GPRegs);
+}
+
+static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SPERegs);
+}
+
#define DecodeQSRCRegisterClass DecodeQFRCRegisterClass
#define DecodeQBRCRegisterClass DecodeQFRCRegisterClass
@@ -417,6 +440,51 @@ static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeSPE8Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the spe8disp field (imm, reg), which has the low 5 bits as the
+ // 8-byte-aligned displacement and the next 5 bits as the base register number.
+
+ uint64_t Base = Imm >> 5;
+ uint64_t Disp = Imm & 0x1F;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(Disp << 3));
+ Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeSPE4Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the spe4disp field (imm, reg), which has the low 5 bits as the
+ // 4-byte-aligned displacement and the next 5 bits as the base register number.
+
+ uint64_t Base = Imm >> 5;
+ uint64_t Disp = Imm & 0x1F;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(Disp << 2));
+ Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeSPE2Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the spe2disp field (imm, reg), which has the low 5 bits as the
+ // 2-byte-aligned displacement and the next 5 bits as the base register number.
+
+ uint64_t Base = Imm >> 5;
+ uint64_t Disp = Imm & 0x1F;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(Disp << 1));
+ Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
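The three decoders added above differ only in the scale applied to the displacement field. A self-contained sketch of that split, with a hypothetical helper name; the shift is 3, 2 or 1 for the 8-, 4- and 2-byte aligned forms.

#include <cstdint>
#include <utility>

// Low 5 bits: scaled displacement; next 5 bits: base GPR number.
static std::pair<uint64_t, uint64_t> splitSPEDisp(uint64_t Imm, unsigned Shift) {
  uint64_t Base = Imm >> 5;
  uint64_t Disp = (Imm & 0x1F) << Shift;
  return {Base, Disp};
}
// Example: Imm = 0b00111'01010, Shift = 3 -> base register 7, displacement 80.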
static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
int64_t Address, const void *Decoder) {
// The cr bit encoding is 0x80 >> cr_reg_num.
@@ -450,6 +518,11 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
if (result != MCDisassembler::Fail)
return result;
+ } else if (STI.getFeatureBits()[PPC::FeatureSPE]) {
+ DecodeStatus result =
+ decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI);
+ if (result != MCDisassembler::Fail)
+ return result;
}
return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 728e7757fd28..a405dd70c307 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -75,10 +75,9 @@ namespace {
class PPCAsmBackend : public MCAsmBackend {
const Target &TheTarget;
- bool IsLittleEndian;
public:
- PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T),
- IsLittleEndian(isLittle) {}
+ PPCAsmBackend(const Target &T, support::endianness Endian)
+ : MCAsmBackend(Endian), TheTarget(T) {}
unsigned getNumFixupKinds() const override {
return PPC::NumTargetFixupKinds;
@@ -111,12 +110,15 @@ public:
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+ return (Endian == support::little
+ ? InfosLE
+ : InfosBE)[Kind - FirstTargetFixupKind];
}
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -127,7 +129,7 @@ public:
// from the fixup value. The Value has been "split up" into the appropriate
// bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
+ unsigned Idx = Endian == support::little ? i : (NumBytes - 1 - i);
Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
}
}
@@ -156,7 +158,8 @@ public:
}
}
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
// FIXME.
return false;
}
@@ -175,12 +178,12 @@ public:
llvm_unreachable("relaxInstruction() unimplemented");
}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write32(0x60000000);
+ support::endian::write<uint32_t>(OS, 0x60000000, Endian);
- OW->WriteZeros(Count % 4);
+ OS.write_zeros(Count % 4);
return true;
}
@@ -191,10 +194,6 @@ public:
assert(Name == "ppc32" && "Unknown target name!");
return 4;
}
-
- bool isLittleEndian() const {
- return IsLittleEndian;
- }
};
} // end anonymous namespace
@@ -203,13 +202,12 @@ public:
namespace {
class DarwinPPCAsmBackend : public PPCAsmBackend {
public:
- DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
+ DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, support::big) { }
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
bool is64 = getPointerSize() == 8;
return createPPCMachObjectWriter(
- OS,
/*Is64Bit=*/is64,
(is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
MachO::CPU_SUBTYPE_POWERPC_ALL);
@@ -219,13 +217,14 @@ namespace {
class ELFPPCAsmBackend : public PPCAsmBackend {
uint8_t OSABI;
public:
- ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) :
- PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
+ ELFPPCAsmBackend(const Target &T, support::endianness Endian,
+ uint8_t OSABI)
+ : PPCAsmBackend(T, Endian), OSABI(OSABI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
bool is64 = getPointerSize() == 8;
- return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
+ return createPPCELFObjectWriter(is64, OSABI);
}
};
@@ -241,5 +240,6 @@ MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
bool IsLittleEndian = TT.getArch() == Triple::ppc64le;
- return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI);
+ return new ELFPPCAsmBackend(
+ T, IsLittleEndian ? support::little : support::big, OSABI);
}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 44ee9733b16e..a3caf9a7a5ee 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -55,6 +55,10 @@ static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
return MCSymbolRefExpr::VK_PPC_HI;
case PPCMCExpr::VK_PPC_HA:
return MCSymbolRefExpr::VK_PPC_HA;
+ case PPCMCExpr::VK_PPC_HIGH:
+ return MCSymbolRefExpr::VK_PPC_HIGH;
+ case PPCMCExpr::VK_PPC_HIGHA:
+ return MCSymbolRefExpr::VK_PPC_HIGHA;
case PPCMCExpr::VK_PPC_HIGHERA:
return MCSymbolRefExpr::VK_PPC_HIGHERA;
case PPCMCExpr::VK_PPC_HIGHER:
@@ -151,6 +155,12 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_HA:
Type = ELF::R_PPC_ADDR16_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_HIGH:
+ Type = ELF::R_PPC64_ADDR16_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HIGHA:
+ Type = ELF::R_PPC64_ADDR16_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_HIGHER:
Type = ELF::R_PPC64_ADDR16_HIGHER;
break;
@@ -199,6 +209,12 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_TPREL_HA:
Type = ELF::R_PPC_TPREL16_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGH:
+ Type = ELF::R_PPC64_TPREL16_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_TPREL_HIGHA:
+ Type = ELF::R_PPC64_TPREL16_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
Type = ELF::R_PPC64_TPREL16_HIGHER;
break;
@@ -223,6 +239,12 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
Type = ELF::R_PPC64_DTPREL16_HA;
break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGH:
+ Type = ELF::R_PPC64_DTPREL16_HIGH;
+ break;
+ case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHA:
+ Type = ELF::R_PPC64_DTPREL16_HIGHA;
+ break;
case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
Type = ELF::R_PPC64_DTPREL16_HIGHER;
break;
@@ -417,9 +439,7 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- bool IsLittleEndian, uint8_t OSABI) {
- auto MOTW = llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createPPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) {
+ return llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI);
}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 92c8c224b71b..2b948ca60028 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -122,25 +122,18 @@ public:
// Output the constant in big/little endian byte order.
unsigned Size = Desc.getSize();
+ support::endianness E = IsLittleEndian ? support::little : support::big;
switch (Size) {
case 0:
break;
case 4:
- if (IsLittleEndian) {
- support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
- } else {
- support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
- }
+ support::endian::write<uint32_t>(OS, Bits, E);
break;
case 8:
// If we emit a pair of instructions, the first one is
// always in the top 32 bits, even on little-endian.
- if (IsLittleEndian) {
- uint64_t Swapped = (Bits << 32) | (Bits >> 32);
- support::endian::Writer<support::little>(OS).write<uint64_t>(Swapped);
- } else {
- support::endian::Writer<support::big>(OS).write<uint64_t>(Bits);
- }
+ support::endian::write<uint32_t>(OS, Bits >> 32, E);
+ support::endian::write<uint32_t>(OS, Bits, E);
break;
default:
llvm_unreachable("Invalid instruction size");
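A buffer-based sketch of the byte order produced by the 8-byte case above (hypothetical helper, not the MC streamer API): the first instruction of the pair always lands first in the output, and only the bytes within each 32-bit word follow the target endianness.

#include <cstdint>

static void emitPair(uint64_t Bits, bool LittleEndian, uint8_t Out[8]) {
  uint32_t Words[2] = {uint32_t(Bits >> 32), uint32_t(Bits)};
  for (int W = 0; W < 2; ++W)
    for (int B = 0; B < 4; ++B)
      Out[W * 4 + B] = uint8_t(Words[W] >> (LittleEndian ? 8 * B : 8 * (3 - B)));
}
// Example: Bits = 0x0102030405060708 emits 01 02 03 04 05 06 07 08 (big-endian)
// and 04 03 02 01 08 07 06 05 (little-endian); the 0x01020304 word stays first.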
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 54f664314578..32e6a0bdd65f 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -44,6 +44,8 @@ void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
case VK_PPC_LO: OS << "@l"; break;
case VK_PPC_HI: OS << "@h"; break;
case VK_PPC_HA: OS << "@ha"; break;
+ case VK_PPC_HIGH: OS << "@high"; break;
+ case VK_PPC_HIGHA: OS << "@higha"; break;
case VK_PPC_HIGHER: OS << "@higher"; break;
case VK_PPC_HIGHERA: OS << "@highera"; break;
case VK_PPC_HIGHEST: OS << "@highest"; break;
@@ -75,6 +77,10 @@ PPCMCExpr::evaluateAsInt64(int64_t Value) const {
return (Value >> 16) & 0xffff;
case VK_PPC_HA:
return ((Value + 0x8000) >> 16) & 0xffff;
+ case VK_PPC_HIGH:
+ return (Value >> 16) & 0xffff;
+ case VK_PPC_HIGHA:
+ return ((Value + 0x8000) >> 16) & 0xffff;
case VK_PPC_HIGHER:
return (Value >> 32) & 0xffff;
case VK_PPC_HIGHERA:
@@ -125,6 +131,12 @@ PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
case VK_PPC_HA:
Modifier = MCSymbolRefExpr::VK_PPC_HA;
break;
+ case VK_PPC_HIGH:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGH;
+ break;
+ case VK_PPC_HIGHA:
+ Modifier = MCSymbolRefExpr::VK_PPC_HIGHA;
+ break;
case VK_PPC_HIGHERA:
Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA;
break;
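The @high/@higha cases above reuse the 16-bit slice formulas already used for @h/@ha; they differ only in the relocations they eventually select. A worked sketch of that arithmetic with hypothetical helper names:

#include <cstdint>

static int64_t hi16(int64_t Value) { return (Value >> 16) & 0xffff; }
static int64_t ha16(int64_t Value) { return ((Value + 0x8000) >> 16) & 0xffff; }
// Example: Value = 0x12348000 -> hi16 = 0x1234, ha16 = 0x1235; the extra one in
// ha16 compensates for the sign of the low 16-bit half when it is added back.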
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index d42a111cc43e..8bb4791d13dd 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -23,6 +23,8 @@ public:
VK_PPC_LO,
VK_PPC_HI,
VK_PPC_HA,
+ VK_PPC_HIGH,
+ VK_PPC_HIGHA,
VK_PPC_HIGHER,
VK_PPC_HIGHERA,
VK_PPC_HIGHEST,
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index d47b9a6e452c..316fd2ccf358 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -27,7 +27,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -49,15 +49,11 @@ MCAsmBackend *createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCTargetOptions &Options);
/// Construct an PPC ELF object writer.
-std::unique_ptr<MCObjectWriter> createPPCELFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- bool IsLittleEndian,
- uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createPPCELFObjectWriter(bool Is64Bit,
+ uint8_t OSABI);
/// Construct a PPC Mach-O object writer.
-std::unique_ptr<MCObjectWriter> createPPCMachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype);
/// Returns true iff Val consists of one contiguous run of 1s with any number of
/// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index 4b9055ec7041..ff6cf584da23 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -374,10 +374,8 @@ void PPCMachObjectWriter::RecordPPCRelocation(
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType, uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/false);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 603ac960133f..fe7e7aeeb182 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -50,6 +50,9 @@ namespace PPC {
PRED_UN_PLUS = (3 << 5) | 15,
PRED_NU_PLUS = (3 << 5) | 7,
+ // SPE scalar compare instructions always set the GT bit.
+ PRED_SPE = PRED_GT,
+
// When dealing with individual condition-register bits, we have simple set
// and unset predicates.
PRED_BIT_SET = 1024,
diff --git a/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td b/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td
index dc6ed16e53ce..34df8452fe16 100644
--- a/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -7,10 +7,11 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines resources required by some of P9 instruction. This is part
-// P9 processor model used for instruction scheduling. Not every instruction
-// is listed here. Instructions in this file belong to itinerary classes that
-// have instructions with different resource requirements.
+// This file defines the resources required by P9 instructions. This is part of
+// the P9 processor model used for instruction scheduling. This file should contain
+// all of the instructions that may be used on Power 9. This is not just
+// instructions that are new on Power 9 but also instructions that were
+// available on earlier architectures and are still used in Power 9.
//
// The makeup of the P9 CPU is modeled as follows:
// - Each CPU is made up of two superslices.
@@ -31,85 +32,37 @@
//===----------------------------------------------------------------------===//
// Two cycle ALU vector operation that uses an entire superslice.
-// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C, DISP_1C, DISP_1C],
(instrs
- VADDCUW,
- VADDUBM,
- VADDUDM,
- VADDUHM,
- VADDUWM,
- VAND,
- VANDC,
- VCMPEQUB,
- VCMPEQUD,
- VCMPEQUH,
- VCMPEQUW,
- VCMPNEB,
- VCMPNEH,
- VCMPNEW,
- VCMPNEZB,
- VCMPNEZH,
- VCMPNEZW,
+ (instregex "VADDU(B|H|W|D)M$"),
+ (instregex "VAND(C)?$"),
+ (instregex "VEXTS(B|H|W)2(D|W)(s)?$"),
+ (instregex "V_SET0(B|H)?$"),
+ (instregex "VS(R|L)(B|H|W|D)$"),
+ (instregex "VSUBU(B|H|W|D)M$"),
+ (instregex "VPOPCNT(B|H)$"),
+ (instregex "VRL(B|H|W|D)$"),
+ (instregex "VSRA(B|H|W|D)$"),
+ (instregex "XV(N)?ABS(D|S)P$"),
+ (instregex "XVCPSGN(D|S)P$"),
+ (instregex "XV(I|X)EXP(D|S)P$"),
+ (instregex "VRL(D|W)(MI|NM)$"),
+ (instregex "VMRG(E|O)W$"),
+ MTVSRDD,
VEQV,
- VEXTSB2D,
- VEXTSB2W,
- VEXTSH2D,
- VEXTSH2W,
- VEXTSW2D,
- VRLB,
- VRLD,
- VRLDMI,
- VRLDNM,
- VRLH,
- VRLW,
- VRLWMI,
- VRLWNM,
- VSRAB,
- VSRAD,
- VSRAH,
- VSRAW,
- VSRB,
- VSRD,
- VSRH,
- VSRW,
- VSLB,
- VSLD,
- VSLH,
- VSLW,
- VMRGEW,
- VMRGOW,
VNAND,
VNEGD,
VNEGW,
VNOR,
VOR,
VORC,
- VPOPCNTB,
- VPOPCNTH,
VSEL,
- VSUBUBM,
- VSUBUDM,
- VSUBUHM,
- VSUBUWM,
VXOR,
- V_SET0B,
- V_SET0H,
- V_SET0,
- XVABSDP,
- XVABSSP,
- XVCPSGNDP,
- XVCPSGNSP,
- XVIEXPDP,
- XVNABSDP,
- XVNABSSP,
XVNEGDP,
XVNEGSP,
- XVXEXPDP,
- XVIEXPSP,
- XVXEXPSP,
XXLAND,
XXLANDC,
XXLEQV,
@@ -119,6 +72,9 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XXLORf,
XXLORC,
XXLXOR,
+ XXLXORdpz,
+ XXLXORspz,
+ XXLXORz,
XXSEL,
XSABSQP,
XSCPSGNQP,
@@ -129,54 +85,89 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
)>;
// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
-// slingle slice. However, since it is Restricted it requires all 3 dispatches
-// (DISP) for that superslice.
+// single slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FCMPUS,
- FCMPUD,
- XSTSTDCDP,
- XSTSTDCSP
+ (instregex "TABORT(D|W)C(I)?$"),
+ (instregex "MTFSB(0|1)$"),
+ (instregex "MFFSC(D)?RN(I)?$"),
+ (instregex "CMPRB(8)?$"),
+ (instregex "TD(I)?$"),
+ (instregex "TW(I)?$"),
+ (instregex "FCMPU(S|D)$"),
+ (instregex "XSTSTDC(S|D)P$"),
+ FTDIV,
+ FTSQRT,
+ CMPEQB
)>;
// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- XSMAXCDP,
- XSMAXDP,
- XSMAXJDP,
- XSMINCDP,
- XSMINDP,
- XSMINJDP,
+ (instregex "XSMAX(C|J)?DP$"),
+ (instregex "XSMIN(C|J)?DP$"),
+ (instregex "XSCMP(EQ|EXP|GE|GT|O|U)DP$"),
+ (instregex "CNT(L|T)Z(D|W)(8)?(o)?$"),
+ (instregex "POPCNT(D|W)$"),
+ (instregex "CMPB(8)?$"),
XSTDIVDP,
XSTSQRTDP,
- XSCMPEQDP,
- XSCMPEXPDP,
- XSCMPGEDP,
- XSCMPGTDP,
- XSCMPODP,
- XSCMPUDP,
XSXSIGDP,
- XSCVSPDPN
+ XSCVSPDPN,
+ SETB,
+ BPERMD
)>;
// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- ADDIStocHA,
- ADDItocL,
+ (instregex "S(L|R)D$"),
+ (instregex "SRAD(I)?$"),
+ (instregex "EXTSWSLI$"),
+ (instregex "MFV(S)?RD$"),
+ (instregex "MTVSRD$"),
+ (instregex "MTVSRW(A|Z)$"),
+ (instregex "CMP(WI|LWI|W|LW)(8)?$"),
+ (instregex "CMP(L)?D(I)?$"),
+ (instregex "SUBF(I)?C(8)?$"),
+ (instregex "ANDI(S)?o(8)?$"),
+ (instregex "ADDC(8)?$"),
+ (instregex "ADDIC(8)?(o)?$"),
+ (instregex "ADD(8|4)(o)?$"),
+ (instregex "ADD(E|ME|ZE)(8)?(o)?$"),
+ (instregex "SUBF(E|ME|ZE)?(8)?(o)?$"),
+ (instregex "NEG(8)?(o)?$"),
+ (instregex "POPCNTB$"),
+ (instregex "ADD(I|IS)?(8)?$"),
+ (instregex "LI(S)?(8)?$"),
+ (instregex "(X)?OR(I|IS)?(8)?(o)?$"),
+ (instregex "NAND(8)?(o)?$"),
+ (instregex "AND(C)?(8)?(o)?$"),
+ (instregex "NOR(8)?(o)?$"),
+ (instregex "OR(C)?(8)?(o)?$"),
+ (instregex "EQV(8)?(o)?$"),
+ (instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(o)?$"),
+ (instregex "ADD(4|8)(TLS)?(_)?$"),
+ (instregex "NEG(8)?$"),
+ (instregex "ADDI(S)?toc(HA|L)$"),
+ COPY,
MCRF,
MCRXRX,
- SLD,
- SRD,
- SRAD,
- SRADI,
- RLDIC,
XSNABSDP,
XSXEXPDP,
XSABSDP,
XSNEGDP,
- XSCPSGNDP
+ XSCPSGNDP,
+ MFVSRWZ,
+ SRADI_32,
+ RLDIC,
+ RFEBB,
+ LA,
+ TBEGIN,
+ TRECHKPT,
+ NOP,
+ WAIT
)>;
// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
@@ -184,80 +175,50 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
// (DISP) for that superslice.
def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- RLDCL,
- RLDCR,
+ (instregex "RLDC(L|R)$"),
+ (instregex "RLWIMI(8)?$"),
+ (instregex "RLDIC(L|R)(_32)?(_64)?$"),
+ (instregex "M(F|T)OCRF(8)?$"),
+ (instregex "CR(6)?(UN)?SET$"),
+ (instregex "CR(N)?(OR|AND)(C)?$"),
+ (instregex "S(L|R)W(8)?$"),
+ (instregex "RLW(INM|NM)(8)?$"),
+ (instregex "F(N)?ABS(D|S)$"),
+ (instregex "FNEG(D|S)$"),
+ (instregex "FCPSGN(D|S)$"),
+ (instregex "SRAW(I)?$"),
+ (instregex "ISEL(8)?$"),
RLDIMI,
- RLDICL,
- RLDICR,
- RLDICL_32_64,
XSIEXPDP,
FMR,
- FABSD,
- FABSS,
- FNABSD,
- FNABSS,
- FNEGD,
- FNEGS,
- FCPSGND,
- FCPSGNS
+ CREQV,
+ CRXOR,
+ TRECLAIM,
+ TSR,
+ TABORT
)>;
// Three cycle ALU vector operation that uses an entire superslice.
-// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C, DISP_1C, DISP_1C],
(instrs
+ (instregex "M(T|F)VSCR$"),
+ (instregex "VCMPNEZ(B|H|W)$"),
+ (instregex "VCMPEQU(B|H|W|D)$"),
+ (instregex "VCMPNE(B|H|W)$"),
+ (instregex "VABSDU(B|H|W)$"),
+ (instregex "VADDU(B|H|W)S$"),
+ (instregex "VAVG(S|U)(B|H|W)$"),
+ (instregex "VCMP(EQ|GE|GT)FP(o)?$"),
+ (instregex "VCMPBFP(o)?$"),
+ (instregex "VC(L|T)Z(B|H|W|D)$"),
+ (instregex "VADDS(B|H|W)S$"),
+ (instregex "V(MIN|MAX)FP$"),
+ (instregex "V(MIN|MAX)(S|U)(B|H|W|D)$"),
VBPERMD,
- VABSDUB,
- VABSDUH,
- VABSDUW,
- VADDUBS,
- VADDUHS,
- VADDUWS,
- VAVGSB,
- VAVGSH,
- VAVGSW,
- VAVGUB,
- VAVGUH,
- VAVGUW,
- VCMPEQFP,
- VCMPEQFPo,
- VCMPGEFP,
- VCMPGEFPo,
- VCMPBFP,
- VCMPBFPo,
- VCMPGTFP,
- VCMPGTFPo,
- VCLZB,
- VCLZD,
- VCLZH,
- VCLZW,
- VCTZB,
- VCTZD,
- VCTZH,
- VCTZW,
- VADDSBS,
- VADDSHS,
- VADDSWS,
- VMINFP,
- VMINSB,
- VMINSD,
- VMINSH,
- VMINSW,
- VMINUB,
- VMINUD,
- VMINUH,
- VMINUW,
- VMAXFP,
- VMAXSB,
- VMAXSD,
- VMAXSH,
- VMAXSW,
- VMAXUB,
- VMAXUD,
- VMAXUH,
- VMAXUW,
+ VADDCUW,
VPOPCNTW,
VPOPCNTD,
VPRTYBD,
@@ -434,47 +395,38 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
VSUMSWS
)>;
+
+// 5 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
+// dispatch units for the superslice.
+def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MADD(HD|HDU|LD)$"),
+ (instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?$")
+)>;
+
// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
// dispatch units for the superslice.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
FRSP,
- FRIND,
- FRINS,
- FRIPD,
- FRIPS,
- FRIZD,
- FRIZS,
- FRIMD,
- FRIMS,
- FRE,
- FRES,
- FRSQRTE,
- FRSQRTES,
- FMADDS,
- FMADD,
- FMSUBS,
- FMSUB,
+ (instregex "FRI(N|P|Z|M)(D|S)$"),
+ (instregex "FRE(S)?$"),
+ (instregex "FADD(S)?$"),
+ (instregex "FMSUB(S)?$"),
+ (instregex "FMADD(S)?$"),
+ (instregex "FSUB(S)?$"),
+ (instregex "FCFID(U)?(S)?$"),
+ (instregex "FCTID(U)?(Z)?$"),
+ (instregex "FCTIW(U)?(Z)?$"),
+ (instregex "FRSQRTE(S)?$"),
FNMADDS,
FNMADD,
FNMSUBS,
FNMSUB,
FSELD,
FSELS,
- FADDS,
FMULS,
FMUL,
- FSUBS,
- FCFID,
- FCTID,
- FCTIDZ,
- FCFIDU,
- FCFIDS,
- FCFIDUS,
- FCTIDUZ,
- FCTIWUZ,
- FCTIW,
- FCTIWZ,
XSMADDADP,
XSMADDASP,
XSMADDMDP,
@@ -495,16 +447,40 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
XSNMSUBMSP
)>;
-// 7 cycle Restricted DP operation and one 2 cycle ALU operation.
+// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
+// These operations can be done in parallel.
// The DP is restricted so we need a full 5 dispatches.
-def : InstRW<[P9_DPOpAndALUOp_9C, IP_EXEC_1C, IP_EXEC_1C,
+def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FMULo,
- FMADDo,
- FMSUBo,
- FNMADDo,
- FNMSUBo
+ (instregex "FSEL(D|S)o$")
+)>;
+
+// 5 Cycle Restricted DP operation and one 2 cycle ALU operation.
+def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MUL(H|L)(D|W)(U)?o$")
+)>;
+
+// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
+// These operations must be done sequentially.
+// The DP is restricted so we need a full 5 dispatches.
+def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "FRI(N|P|Z|M)(D|S)o$"),
+ (instregex "FRE(S)?o$"),
+ (instregex "FADD(S)?o$"),
+ (instregex "FSUB(S)?o$"),
+ (instregex "F(N)?MSUB(S)?o$"),
+ (instregex "F(N)?MADD(S)?o$"),
+ (instregex "FCFID(U)?(S)?o$"),
+ (instregex "FCTID(U)?(Z)?o$"),
+ (instregex "FCTIW(U)?(Z)?o$"),
+ (instregex "FMUL(S)?o$"),
+ (instregex "FRSQRTE(S)?o$"),
+ FRSPo
)>;
// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units.
@@ -520,6 +496,8 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPUXDS,
XSCVDPUXDSs,
XSCVDPUXWS,
+ XSCVDPSXWSs,
+ XSCVDPUXWSs,
XSCVHPDP,
XSCVSPDP,
XSCVSXDDP,
@@ -533,12 +511,12 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSRDPIZ,
XSREDP,
XSRESP,
- //XSRSP,
XSRSQRTEDP,
XSRSQRTESP,
XSSUBDP,
XSSUBSP,
- XSCVDPSPN
+ XSCVDPSPN,
+ XSRSP
)>;
// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
@@ -546,13 +524,18 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
// dispatches.
def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ (instregex "LVS(L|R)$"),
+ (instregex "VSPLTIS(W|H|B)$"),
+ (instregex "VSPLT(W|H|B)(s)?$"),
+ (instregex "V_SETALLONES(B|H)?$"),
+ (instregex "VEXTRACTU(B|H|W)$"),
+ (instregex "VINSERT(B|H|W|D)$"),
+ MFVSRLD,
+ MTVSRWS,
VBPERMQ,
VCLZLSBB,
VCTZLSBB,
VEXTRACTD,
- VEXTRACTUB,
- VEXTRACTUH,
- VEXTRACTUW,
VEXTUBLX,
VEXTUBRX,
VEXTUHLX,
@@ -560,10 +543,6 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
VEXTUWLX,
VEXTUWRX,
VGBBD,
- VINSERTB,
- VINSERTD,
- VINSERTH,
- VINSERTW,
VMRGHB,
VMRGHH,
VMRGHW,
@@ -591,14 +570,6 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
VSLDOI,
VSLO,
VSLV,
- VSPLTB,
- VSPLTBs,
- VSPLTH,
- VSPLTHs,
- VSPLTISB,
- VSPLTISH,
- VSPLTISW,
- VSPLTW,
VSR,
VSRO,
VSRV,
@@ -642,7 +613,17 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
XSCMPOQP,
XSCMPUQP,
XSTSTDCQP,
- XSXSIGQP
+ XSXSIGQP,
+ BCDCFNo,
+ BCDCFZo,
+ BCDCPSGNo,
+ BCDCTNo,
+ BCDCTZo,
+ BCDSETSGNo,
+ BCDSo,
+ BCDTRUNCo,
+ BCDUSo,
+ BCDUTRUNCo
)>;
// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
@@ -650,6 +631,7 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
// dispatches.
def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ BCDSRo,
XSADDQP,
XSADDQPO,
XSCVDPQP,
@@ -662,11 +644,20 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
XSCVSDQP,
XSCVUDQP,
XSRQPI,
+ XSRQPIX,
XSRQPXP,
XSSUBQP,
XSSUBQPO
)>;
+// 23 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ BCDCTSQo
+)>;
+
// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
// dispatches.
@@ -684,6 +675,14 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
XSNMSUBQPO
)>;
+// 37 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ BCDCFSQo
+)>;
+
// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
// dispatches.
@@ -702,23 +701,58 @@ def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
XSSQRTQPO
)>;
-// 5 Cycle load uses a single slice.
+// 6 Cycle Load uses a single slice.
+def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LXVL(L)?")
+)>;
+
+// 5 Cycle Load uses a single slice.
def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
+ (instregex "LVE(B|H|W)X$"),
+ (instregex "LVX(L)?"),
+ (instregex "LXSI(B|H)ZX$"),
LXSDX,
+ LXVB16X,
LXVD2X,
+ LXVWSX,
LXSIWZX,
LXV,
LXVX,
LXSD,
DFLOADf64,
- XFLOADf64
+ XFLOADf64,
+ LIWZX
)>;
-// 4 Cycle load uses a single slice.
+// 4 Cycle Load uses a single slice.
def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
- COPY
+ (instregex "DCB(F|T|ST)(EP)?$"),
+ (instregex "DCBZ(L)?(EP)?$"),
+ (instregex "DCBTST(EP)?$"),
+ (instregex "CP_COPY(8)?$"),
+ (instregex "CP_PASTE(8)?$"),
+ (instregex "ICBI(EP)?$"),
+ (instregex "ICBT(LS)?$"),
+ (instregex "LBARX(L)?$"),
+ (instregex "LBZ(CIX|8|X|X8|XTLS|XTLS_32)?(_)?$"),
+ (instregex "LD(ARX|ARXL|BRX|CIX|X|XTLS)?(_)?$"),
+ (instregex "LH(A|B)RX(L)?(8)?$"),
+ (instregex "LHZ(8|CIX|X|X8|XTLS|XTLS_32)?(_)?$"),
+ (instregex "LWARX(L)?$"),
+ (instregex "LWBRX(8)?$"),
+ (instregex "LWZ(8|CIX|X|X8|XTLS|XTLS_32)?(_)?$"),
+ CP_ABORT,
+ DARN,
+ EnforceIEIO,
+ ISYNC,
+ MSGSYNC,
+ TLBSYNC,
+ SYNC,
+ LMW,
+ LSWI
)>;
// 4 Cycle Restricted load uses a single slice but the dispatch for the whole
@@ -730,6 +764,58 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
LFD
)>;
+// Cracked Load Instructions.
+// Load instructions that can be done in parallel.
+def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ SLBIA,
+ SLBIE,
+ SLBMFEE,
+ SLBMFEV,
+ SLBMTE,
+ TLBIEL
+)>;
+
+// Cracked Load Instruction.
+// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations can be run in parallel.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "L(W|H)ZU(X)?(8)?$"),
+ TEND
+)>;
+
+// Cracked Store Instruction
+// Consecutive Store and ALU instructions. The store is restricted and requires
+// three dispatches.
+def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "ST(B|H|W|D)CX$")
+)>;
+
+// Cracked Load Instruction.
+// Two consecutive load operations for a total of 8 cycles.
+def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LDMX
+)>;
+
+// Cracked Load instruction.
+// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LHA(X)?(8)?$"),
+ (instregex "CP_PASTE(8)?o$"),
+ (instregex "LWA(X)?(_32)?$"),
+ TCHECK
+)>;
+
// Cracked Restricted Load instruction.
// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
@@ -737,9 +823,7 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LFIWAX,
- LFSX,
- LFS
+ LFIWAX
)>;
// Cracked Load instruction.
@@ -749,13 +833,42 @@ def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LXSSPX,
LXSIWAX,
+ LIWAX
+)>;
+
+// Cracked Load instruction.
+// Requires consecutive Load (4 cycles) and ALU (3 cycles) pieces totaling 7
+// cycles. The Load and ALU operations cannot be done at the same time and so
+// their latencies are added.
+// Full 6 dispatches are required as this is a restricted instruction.
+def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LFSX,
+ LFS
+)>;
+
+// Cracked Load instruction.
+// Requires consecutive Load and ALU pieces totaling 8 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 4 dispatches are required as this is a cracked instruction.
+def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
LXSSP,
- DFLOADf32,
+ LXSSPX,
XFLOADf32,
- LIWAX,
- LIWZX
+ DFLOADf32
+)>;
+
+// Cracked 3-Way Load Instruction
+// Load with two ALU operations that depend on each other
+def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LHAU(X)?(8)?$"),
+ LWAUX
)>;
// Cracked Load that requires the PM resource.
@@ -767,8 +880,8 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ LXVH8X,
LXVDSX,
- LXVWSX,
LXVW4X
)>;
@@ -776,29 +889,52 @@ def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
// all three dispatches for the superslice.
def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- STFS,
- STFD,
- STFIWX,
- STFSX,
- STFDX,
- STXSDX,
- STXSSPX,
- STXSIWX,
- DFSTOREf32,
- DFSTOREf64,
- XFSTOREf32,
- XFSTOREf64,
- STIWX
-)>;
-
-// Store operation that requires the whole superslice.
+ (instregex "STF(S|D|IWX|SX|DX)$"),
+ (instregex "STXS(D|DX|SPX|IWX|IBX|IHX|SP)(v)?$"),
+ (instregex "STW(8)?$"),
+ (instregex "(D|X)FSTORE(f32|f64)$"),
+ (instregex "ST(W|H|D)BRX$"),
+ (instregex "ST(B|H|D)(8)?$"),
+ (instregex "ST(B|W|H|D)(CI)?X(TLS|TLS_32)?(8)?(_)?$"),
+ STIWX,
+ SLBIEG,
+ STMW,
+ STSWI,
+ TLBIE
+)>;
+
+// Vector Store Instruction
+// Requires the whole superslice and therefore requires all three dispatches
+// as well as both the Even and Odd exec pipelines.
def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C],
(instrs
- STXVD2X,
- STXVW4X
+ (instregex "STVE(B|H|W)X$"),
+ (instregex "STVX(L)?$"),
+ (instregex "STXV(B16X|H8X|W4X|D2X|L|LL|X)?$")
+)>;
+
+// 5 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MTCTR(8)?(loop)?$"),
+ (instregex "MTLR(8)?$")
)>;
+// 12 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "M(T|F)VRSAVE(v)?$"),
+ (instregex "M(T|F)PMR$"),
+ (instregex "M(T|F)TB(8)?$"),
+ (instregex "MF(SPR|CTR|LR)(8)?$"),
+ (instregex "M(T|F)MSR(D)?$"),
+ (instregex "MTSPR(8)?$")
+)>;
// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
@@ -839,6 +975,15 @@ def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C,
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
// and one full superslice for the DIV operation since there is only one DIV
// per superslice. Latency of DIV plus ALU is 26.
+def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "DIVW(U)?(O)?o$")
+)>;
+
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 26.
def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
@@ -868,16 +1013,40 @@ def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- MTOCRF,
- MTOCRF8,
MTCRF,
MTCRF8
)>;
-// Cracked, restricted, ALU operations.
+// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 6 dispatches.
+// instructions running together on two pipelines and 4 dispatches.
+// ALU ops are 2 cycles each.
+def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "ADDC(8)?o$"),
+ (instregex "SUBFC(8)?o$")
+)>;
+
+// Cracked ALU operations.
+// Two ALU ops can be done in parallel.
+// One is a three cycle ALU, the other is a two cycle ALU.
+// One of the ALU ops is restricted and the other is not, so we have a total of
+// 5 dispatches.
+def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "F(N)?ABS(D|S)o$"),
+ (instregex "FCPSGN(D|S)o$"),
+ (instregex "FNEG(D|S)o$"),
+ FMRo
+)>;
+
+// Cracked ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 4 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -885,7 +1054,63 @@ def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
MCRFS
)>;
-// FP Div instructions in IIC_FPDivD and IIC_FPDivS.
+// Cracked Restricted ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches.
+// ALU ops are 3 cycles each.
+def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MTFSF(b|o)?$"),
+ (instregex "MTFSFI(o)?$")
+)>;
+
+// Cracked instruction made of two ALU ops.
+// The two ops cannot be done in parallel.
+// One of the ALU ops is restricted and takes 3 dispatches.
+def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "RLD(I)?C(R|L)o$"),
+ (instregex "RLW(IMI|INM|NM)(8)?o$"),
+ (instregex "SLW(8)?o$"),
+ (instregex "SRAW(I)?o$"),
+ (instregex "SRW(8)?o$"),
+ RLDICL_32o,
+ RLDIMIo
+)>;
+
+// Cracked instruction made of two ALU ops.
+// The two ops cannot be done in parallel.
+// Both of the ALU ops are restricted and take 3 dispatches.
+def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MFFS(L|CE|o)?$")
+)>;
+
+// Cracked ALU instruction composed of three consecutive 2 cycle ALU ops for a
+// total of 6 cycles. All of the ALU operations are also restricted so each
+// takes 3 dispatches for a total of 9.
+def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C],
+ (instrs
+ (instregex "MFCR(8)?$")
+)>;
+
+// Cracked instruction made of two ALU ops.
+// The two ops cannot be done in parallel.
+def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "EXTSWSLIo$"),
+ (instregex "SRAD(I)?o$"),
+ SLDo,
+ SRDo,
+ RLDICo
+)>;
// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -893,13 +1118,66 @@ def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
FDIV
)>;
-// 33 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
-def : InstRW<[P9_DPOpAndALUOp_35C_8, IP_EXEC_1C, IP_EXEC_1C,
+// 33 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
+def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
FDIVo
)>;
+// 36 Cycle DP Instruction.
+// Instruction can be done on a single slice.
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSSQRTDP
+)>;
+
+// 36 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRT
+)>;
+
+// 36 Cycle DP Vector Instruction.
+def : InstRW<[P9_DPE_36C_10, P9_DPO_36C_10, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ XVSQRTDP
+)>;
+
+// 27 Cycle DP Vector Instruction.
+def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ XVSQRTSP
+)>;
+
+// 36 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
+def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRTo
+)>;
+
+// 26 Cycle DP Instruction.
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSSQRTSP
+)>;
+
+// 26 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRTS
+)>;
+
+// 26 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
+def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FSQRTSo
+)>;
+
// 33 Cycle DP Instruction. Takes one slice and 2 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
@@ -913,7 +1191,7 @@ def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// 22 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
-def : InstRW<[P9_DPOpAndALUOp_24C_5, IP_EXEC_1C, IP_EXEC_1C,
+def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
FDIVSo
@@ -943,23 +1221,40 @@ def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
XVDIVDP
)>;
-// Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX.
-
// Instruction cracked into three pieces. One Load and two ALU operations.
// The Load and one of the ALU ops cannot be run at the same time and so the
// latencies are added together for 7 cycles. The remaining ALU is 2 cycles.
// Both the load and the ALU that depends on it are restricted and so they take
// a total of 6 dispatches. The final 2 dispatches come from the second ALU op.
// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
-def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
+def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LFSU,
- LFSUX
+ (instregex "LF(SU|SUX)$")
+)>;
+
+// Cracked instruction made up of a Store and an ALU. The ALU does not depend on
+// the store and so it can be run at the same time as the store. The store is
+// also restricted.
+def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "STF(S|D)U(X)?$"),
+ (instregex "ST(B|H|W|D)U(X)?(8)?$")
+)>;
+
+// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
+// the load and so it can be run at the same time as the load.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "LBZU(X)?(8)?$"),
+ (instregex "LDU(X)?$")
)>;
+
// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
// the load and so it can be run at the same time as the load. The load is also
// restricted. 3 dispatches are from the restricted load while the other two
@@ -968,8 +1263,7 @@ def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LFDU,
- LFDUX
+ (instregex "LF(DU|DUX)$")
)>;
// Crypto Instructions
@@ -979,13 +1273,147 @@ def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
// dispatches.
def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- VPMSUMB,
- VPMSUMD,
- VPMSUMH,
- VPMSUMW,
- VCIPHER,
- VCIPHERLAST,
- VNCIPHER,
- VNCIPHERLAST,
- VSBOX
+ (instregex "VPMSUM(B|H|W|D)$"),
+ (instregex "V(N)?CIPHER(LAST)?$"),
+ VSBOX
+)>;
+
+// Branch Instructions
+
+// Two Cycle Branch
+def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C],
+ (instrs
+ (instregex "BCCCTR(L)?(8)?$"),
+ (instregex "BCCL(A|R|RL)?$"),
+ (instregex "BCCTR(L)?(8)?(n)?$"),
+ (instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"),
+ (instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"),
+ (instregex "BL(_TLS)?$"),
+ (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"),
+ (instregex "BLA(8|8_NOP)?$"),
+ (instregex "BLR(8|L)?$"),
+ (instregex "TAILB(A)?(8)?$"),
+ (instregex "TAILBCTR(8)?$"),
+ (instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"),
+ (instregex "BCLR(L)?(n)?$"),
+ (instregex "BCTR(L)?(8)?$"),
+ B,
+ BA,
+ BC,
+ BCC,
+ BCCA,
+ BCL,
+ BCLalways,
+ BCLn,
+ BCTRL8_LDinto_toc,
+ BCn,
+ CTRL_DEP
+)>;
+
+// Five Cycle Branch with a 2 Cycle ALU Op
+// Operations must be done consecutively and not in parallel.
+def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ ADDPCIS
+)>;
+
+// Special Extracted Instructions For Atomics
+
+// Atomic Load
+def : InstRW<[P9_LS_1C, P9_LS_1C, P9_LS_4C, P9_LS_4C, P9_LS_4C,
+ IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, IP_AGEN_1C,
+ IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C],
+ (instrs
+ (instregex "L(D|W)AT$")
+)>;
+
+// Atomic Store
+def : InstRW<[P9_LS_1C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C,
+ IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C],
+ (instrs
+ (instregex "ST(D|W)AT$")
)>;
+
+// Signal Processing Engine (SPE) Instructions
+// These instructions are not supported on Power 9
+def : InstRW<[],
+ (instrs
+ BRINC,
+ EVABS,
+ EVEQV,
+ EVMRA,
+ EVNAND,
+ EVNEG,
+ (instregex "EVADD(I)?W$"),
+ (instregex "EVADD(SM|SS|UM|US)IAAW$"),
+ (instregex "EVAND(C)?$"),
+ (instregex "EVCMP(EQ|GTS|GTU|LTS|LTU)$"),
+ (instregex "EVCNTL(S|Z)W$"),
+ (instregex "EVDIVW(S|U)$"),
+ (instregex "EVEXTS(B|H)$"),
+ (instregex "EVLD(H|W|D)(X)?$"),
+ (instregex "EVLHH(E|OS|OU)SPLAT(X)?$"),
+ (instregex "EVLWHE(X)?$"),
+ (instregex "EVLWHO(S|U)(X)?$"),
+ (instregex "EVLW(H|W)SPLAT(X)?$"),
+ (instregex "EVMERGE(HI|LO|HILO|LOHI)$"),
+ (instregex "EVMHEG(S|U)M(F|I)A(A|N)$"),
+ (instregex "EVMHES(M|S)(F|I)(A|AA|AAW|ANW)?$"),
+ (instregex "EVMHEU(M|S)I(A|AA|AAW|ANW)?$"),
+ (instregex "EVMHOG(U|S)M(F|I)A(A|N)$"),
+ (instregex "EVMHOS(M|S)(F|I)(A|AA|AAW|ANW)?$"),
+ (instregex "EVMHOU(M|S)I(A|AA|ANW|AAW)?$"),
+ (instregex "EVMWHS(M|S)(F|FA|I|IA)$"),
+ (instregex "EVMWHUMI(A)?$"),
+ (instregex "EVMWLS(M|S)IA(A|N)W$"),
+ (instregex "EVMWLU(M|S)I(A|AA|AAW|ANW)?$"),
+ (instregex "EVMWSM(F|I)(A|AA|AN)?$"),
+ (instregex "EVMWSSF(A|AA|AN)?$"),
+ (instregex "EVMWUMI(A|AA|AN)?$"),
+ (instregex "EV(N|X)?OR(C)?$"),
+ (instregex "EVR(LW|LWI|NDW)$"),
+ (instregex "EVSLW(I)?$"),
+ (instregex "EVSPLAT(F)?I$"),
+ (instregex "EVSRW(I)?(S|U)$"),
+ (instregex "EVST(DD|DH|DW|WHE|WHO|WWE|WWO)(X)?$"),
+ (instregex "EVSUBF(S|U)(M|S)IAAW$"),
+ (instregex "EVSUB(I)?FW$")
+)> { let Unsupported = 1; }
+
+// General Instructions without scheduling support.
+def : InstRW<[],
+ (instrs
+ (instregex "(H)?RFI(D)?$"),
+ (instregex "DSS(ALL)?$"),
+ (instregex "DST(ST)?(T)?(64)?$"),
+ (instregex "ICBL(C|Q)$"),
+ (instregex "L(W|H|B)EPX$"),
+ (instregex "ST(W|H|B)EPX$"),
+ (instregex "(L|ST)FDEPX$"),
+ (instregex "M(T|F)SR(IN)?$"),
+ (instregex "M(T|F)DCR$"),
+ (instregex "NOP_GT_PWR(6|7)$"),
+ (instregex "TLB(IA|IVAX|SX|SX2|SX2D|LD|LI|RE|RE2|WE|WE2)$"),
+ (instregex "WRTEE(I)?$"),
+ ATTN,
+ CLRBHRB,
+ MFBHRBE,
+ MBAR,
+ MSYNC,
+ SLBSYNC,
+ NAP,
+ STOP,
+ TRAP,
+ RFCI,
+ RFDI,
+ RFMCI,
+ SC,
+ DCBA,
+ DCBI,
+ DCCCI,
+ ICCCI
+)> { let Unsupported = 1; }
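The InstRW records above attach POWER9 execution resources and latencies to instructions matched either by name or by instregex. As a rough illustration of how that data is consumed (a sketch, not part of the change itself), a MachineFunction pass can query the resulting model through TargetSchedModel; the helper name dumpP9Latencies is invented, and the exact init/computeInstrLatency signatures should be checked against this LLVM revision.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper: print the latency the active scheduling model (P9Model
// when compiling for -mcpu=pwr9) assigns to every MachineInstr in a function.
static void dumpP9Latencies(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TargetSchedModel SchedModel;
  SchedModel.init(&STI); // bind the model that the InstRW entries feed into
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      dbgs() << STI.getInstrInfo()->getName(MI.getOpcode()) << " latency="
             << SchedModel.computeInstrLatency(&MI) << '\n';
}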
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td
index 46502208b175..80ad4962a20f 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPC.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.td
@@ -35,6 +35,8 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500 : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E500", "">;
def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
"PPC::DIR_E500mc", "">;
def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective",
@@ -59,9 +61,12 @@ def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
"Enable 64-bit registers usage for ppc32 [beta]">;
def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true",
"Use condition-register bits individually">;
+def FeatureFPU : SubtargetFeature<"fpu","HasFPU","true",
+ "Enable classic FPU instructions",
+ [FeatureHardFloat]>;
def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
"Enable Altivec instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true",
"Enable SPE instructions",
[FeatureHardFloat]>;
@@ -69,36 +74,36 @@ def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
"Enable the MFOCRF instruction">;
def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
"Enable the fsqrt instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFCPSGN : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
"Enable the fcpsgn instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true",
"Enable the fre instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true",
"Enable the fres instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
"Enable the frsqrte instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
"Enable the frsqrtes instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
"Assume higher precision reciprocal estimates">;
def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
"Enable the stfiwx instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
"Enable the lfiwax instruction",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFPRND : SubtargetFeature<"fprnd", "HasFPRND", "true",
"Enable the fri[mnpz] instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureFPCVT : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
"Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true",
"Enable the isel instruction">;
def FeatureBPERMD : SubtargetFeature<"bpermd", "HasBPERMD", "true",
@@ -119,13 +124,15 @@ def FeatureMSYNC : SubtargetFeature<"msync", "HasOnlyMSYNC", "true",
[FeatureBookE]>;
def FeatureE500 : SubtargetFeature<"e500", "IsE500", "true",
"Enable E500/E500mc instructions">;
+def FeatureSecurePlt : SubtargetFeature<"secure-plt","SecurePlt", "true",
+ "Enable secure plt mode">;
def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true",
"Enable PPC 4xx instructions">;
def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
"Enable PPC 6xx instructions">;
def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
"Enable QPX instructions",
- [FeatureHardFloat]>;
+ [FeatureFPU]>;
def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
"Enable VSX instructions",
[FeatureAltivec]>;
@@ -304,8 +311,8 @@ def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
FeatureFRES, FeatureFRSQRTE,
FeatureICBT, FeatureBookE,
FeatureMSYNC, FeatureMFTB]>;
-def : Processor<"601", G3Itineraries, [Directive601, FeatureHardFloat]>;
-def : Processor<"602", G3Itineraries, [Directive602, FeatureHardFloat,
+def : Processor<"601", G3Itineraries, [Directive601, FeatureFPU]>;
+def : Processor<"602", G3Itineraries, [Directive602, FeatureFPU,
FeatureMFTB]>;
def : Processor<"603", G3Itineraries, [Directive603,
FeatureFRES, FeatureFRSQRTE,
@@ -356,6 +363,10 @@ def : ProcessorModel<"g5", G5Model,
FeatureFRES, FeatureFRSQRTE,
Feature64Bit /*, Feature64BitRegs */,
FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"e500", PPCE500Model,
+ [DirectiveE500,
+ FeatureICBT, FeatureBookE,
+ FeatureISEL, FeatureMFTB]>;
def : ProcessorModel<"e500mc", PPCE500mcModel,
[DirectiveE500mc,
FeatureSTFIWX, FeatureICBT, FeatureBookE,
@@ -465,4 +476,5 @@ def PPC : Target {
let AssemblyParsers = [PPCAsmParser];
let AssemblyParserVariants = [PPCAsmParserVariant];
+ let AllowRegisterRenaming = 1;
}
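The feature reshuffle above inserts FeatureFPU between FeatureHardFloat and the classic floating-point features, so Altivec, fsqrt, fre and related features now imply a real FPU while SPE does not. Below is a minimal sketch of how backend code is expected to consume that split, assuming the hasFPU()/hasSPE() accessors that correspond to these features; the helper name is made up and this is not code from the change.

#include "PPCSubtarget.h" // PowerPC backend header; assumes in-tree include paths

// Hypothetical predicate: classic FPR-based lowering is only valid when the
// subtarget has the classic FPU and is not an SPE target, since the two
// register files are mutually exclusive (see the asm-printer assert below).
static bool useClassicFPRs(const llvm::PPCSubtarget &ST) {
  return ST.hasFPU() && !ST.hasSPE();
}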
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 17451900840a..a9da64cc216f 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -510,6 +510,32 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const Module *M = MF->getFunction().getParent();
PICLevel::Level PL = M->getPICLevel();
+#ifndef NDEBUG
+ // Validate that SPE and FPU are mutually exclusive in codegen
+ if (!MI->isInlineAsm()) {
+ for (const MachineOperand &MO: MI->operands()) {
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (Subtarget->hasSPE()) {
+ if (PPC::F4RCRegClass.contains(Reg) ||
+ PPC::F8RCRegClass.contains(Reg) ||
+ PPC::QBRCRegClass.contains(Reg) ||
+ PPC::QFRCRegClass.contains(Reg) ||
+ PPC::QSRCRegClass.contains(Reg) ||
+ PPC::VFRCRegClass.contains(Reg) ||
+ PPC::VRRCRegClass.contains(Reg) ||
+ PPC::VSFRCRegClass.contains(Reg) ||
+ PPC::VSSRCRegClass.contains(Reg)
+ )
+ llvm_unreachable("SPE targets cannot have FPRegs!");
+ } else {
+ if (PPC::SPERCRegClass.contains(Reg))
+ llvm_unreachable("SPE register found in FPU-targeted code!");
+ }
+ }
+ }
+ }
+#endif
// Lower multi-instruction pseudo operations.
switch (MI->getOpcode()) {
default: break;
@@ -563,33 +589,63 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Transform %rd = UpdateGBR(%rt, %ri)
// Into: lwz %rt, .L0$poff - .L0$pb(%ri)
// add %rd, %rt, %ri
+ // or into (if secure plt mode is on):
+ // addis r30, r30, .LTOC - .L0$pb@ha
+ // addi r30, r30, .LTOC - .L0$pb@l
// Get the offset from the GOT Base Register to the GOT
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
- MCSymbol *PICOffset =
- MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
- TmpInst.setOpcode(PPC::LWZ);
- const MCExpr *Exp =
- MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
- const MCExpr *PB =
- MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
- MCSymbolRefExpr::VK_None,
- OutContext);
- const MCOperand TR = TmpInst.getOperand(1);
- const MCOperand PICR = TmpInst.getOperand(0);
-
- // Step 1: lwz %rt, .L$poff - .L$pb(%ri)
- TmpInst.getOperand(1) =
- MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
- TmpInst.getOperand(0) = TR;
- TmpInst.getOperand(2) = PICR;
- EmitToStreamer(*OutStreamer, TmpInst);
+ if (Subtarget->isSecurePlt() && isPositionIndependent()) {
+ unsigned PICR = TmpInst.getOperand(0).getReg();
+ MCSymbol *LTOCSymbol = OutContext.getOrCreateSymbol(StringRef(".LTOC"));
+ const MCExpr *PB =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
+ OutContext);
- TmpInst.setOpcode(PPC::ADD4);
- TmpInst.getOperand(0) = PICR;
- TmpInst.getOperand(1) = TR;
- TmpInst.getOperand(2) = PICR;
- EmitToStreamer(*OutStreamer, TmpInst);
- return;
+ const MCExpr *LTOCDeltaExpr =
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(LTOCSymbol, OutContext),
+ PB, OutContext);
+
+ const MCExpr *LTOCDeltaHi =
+ PPCMCExpr::createHa(LTOCDeltaExpr, false, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
+ .addReg(PICR)
+ .addReg(PICR)
+ .addExpr(LTOCDeltaHi));
+
+ const MCExpr *LTOCDeltaLo =
+ PPCMCExpr::createLo(LTOCDeltaExpr, false, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
+ .addReg(PICR)
+ .addReg(PICR)
+ .addExpr(LTOCDeltaLo));
+ return;
+ } else {
+ MCSymbol *PICOffset =
+ MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
+ TmpInst.setOpcode(PPC::LWZ);
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
+ const MCExpr *PB =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
+ MCSymbolRefExpr::VK_None,
+ OutContext);
+ const MCOperand TR = TmpInst.getOperand(1);
+ const MCOperand PICR = TmpInst.getOperand(0);
+
+ // Step 1: lwz %rt, .L$poff - .L$pb(%ri)
+ TmpInst.getOperand(1) =
+ MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
+ TmpInst.getOperand(0) = TR;
+ TmpInst.getOperand(2) = PICR;
+ EmitToStreamer(*OutStreamer, TmpInst);
+
+ TmpInst.setOpcode(PPC::ADD4);
+ TmpInst.getOperand(0) = PICR;
+ TmpInst.getOperand(1) = TR;
+ TmpInst.getOperand(2) = PICR;
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
}
case PPC::LWZtoc: {
// Transform %r3 = LWZtoc @min1, %r2
@@ -741,11 +797,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
MOSymbol = getSymbol(GV);
- DEBUG(
- unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
- assert((GVFlags & PPCII::MO_NLP_FLAG) &&
- "LDtocL used on symbol that could be accessed directly is "
- "invalid. Must match ADDIStocHA."));
+ LLVM_DEBUG(
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert((GVFlags & PPCII::MO_NLP_FLAG) &&
+ "LDtocL used on symbol that could be accessed directly is "
+ "invalid. Must match ADDIStocHA."));
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
}
@@ -770,11 +826,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
- DEBUG(
- unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
- assert (
- !(GVFlags & PPCII::MO_NLP_FLAG) &&
- "Interposable definitions must use indirect access."));
+ LLVM_DEBUG(unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert(!(GVFlags & PPCII::MO_NLP_FLAG) &&
+ "Interposable definitions must use indirect access."));
MOSymbol = getSymbol(GV);
} else if (MO.isCPI()) {
MOSymbol = GetCPISymbol(MO.getIndex());
@@ -1233,7 +1287,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
if (!Subtarget->isPPC64()) {
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
- if (PPCFI->usesPICBase()) {
+ if (PPCFI->usesPICBase() && !Subtarget->isSecurePlt()) {
MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
MCSymbol *PICBase = MF->getPICBaseSymbol();
OutStreamer->EmitLabel(RelocSymbol);
@@ -1255,7 +1309,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
if (Subtarget->isELFv2ABI()) {
// In the Large code model, we allow arbitrary displacements between
// the text section and its associated TOC section. We place the
- // full 8-byte offset to the TOC in memory immediatedly preceding
+ // full 8-byte offset to the TOC in memory immediately preceding
// the function global entry point.
if (TM.getCodeModel() == CodeModel::Large
&& !MF->getRegInfo().use_empty(PPC::X2)) {
@@ -1458,6 +1512,7 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
"ppc750",
"ppc970",
"ppcA2",
+ "ppce500",
"ppce500mc",
"ppce5500",
"power3",
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 32d801b13ded..bbb977f090c5 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -60,7 +60,7 @@ namespace llvm {
/// expands to the following machine code:
///
/// %bb.0: derived from LLVM BB %entry
-/// Live Ins: %f1 %f3 %x6
+/// liveins: %f1 %f3 %x6
/// <SNIP1>
/// %0 = COPY %f1; F8RC:%0
/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4
@@ -98,7 +98,7 @@ namespace llvm {
/// If all conditions are meet, IR should collapse to:
///
/// %bb.0: derived from LLVM BB %entry
-/// Live Ins: %f1 %f3 %x6
+/// liveins: %f1 %f3 %x6
/// <SNIP1>
/// %0 = COPY %f1; F8RC:%0
/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4
@@ -236,18 +236,18 @@ void PPCBranchCoalescing::initialize(MachineFunction &MF) {
///\return true if and only if the branch can be coalesced, false otherwise
///
bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
- DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber()
- << " can be coalesced:");
+ LLVM_DEBUG(dbgs() << "Determine if branch block "
+ << Cand.BranchBlock->getNumber() << " can be coalesced:");
MachineBasicBlock *FalseMBB = nullptr;
if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB,
Cand.Cond)) {
- DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
+ LLVM_DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
return false;
}
for (auto &I : Cand.BranchBlock->terminators()) {
- DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
if (!I.isBranch())
continue;
@@ -265,14 +265,14 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
// must then be extended to prove that none of the implicit operands are
// changed in the blocks that are combined during coalescing.
if (I.getNumOperands() != I.getNumExplicitOperands()) {
- DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << "Terminator contains implicit operands - skip : "
+ << I << "\n");
return false;
}
}
if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) {
- DEBUG(dbgs() << "EH Pad - skip\n");
+ LLVM_DEBUG(dbgs() << "EH Pad - skip\n");
return false;
}
@@ -280,13 +280,13 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
// FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock)
if (!Cand.BranchTargetBlock || FalseMBB ||
!Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) {
- DEBUG(dbgs() << "Does not form a triangle - skip\n");
+ LLVM_DEBUG(dbgs() << "Does not form a triangle - skip\n");
return false;
}
// Ensure there are only two successors
if (Cand.BranchBlock->succ_size() != 2) {
- DEBUG(dbgs() << "Does not have 2 successors - skip\n");
+ LLVM_DEBUG(dbgs() << "Does not have 2 successors - skip\n");
return false;
}
@@ -305,18 +305,19 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
assert(Succ && "Expecting a valid fall-through block\n");
if (!Succ->empty()) {
- DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
- return false;
+ LLVM_DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
+ return false;
}
if (!Succ->isSuccessor(Cand.BranchTargetBlock)) {
- DEBUG(dbgs()
- << "Successor of fall through block is not branch taken block\n");
- return false;
+ LLVM_DEBUG(
+ dbgs()
+ << "Successor of fall through block is not branch taken block\n");
+ return false;
}
Cand.FallThroughBlock = Succ;
- DEBUG(dbgs() << "Valid Candidate\n");
+ LLVM_DEBUG(dbgs() << "Valid Candidate\n");
return true;
}
@@ -331,7 +332,7 @@ bool PPCBranchCoalescing::identicalOperands(
ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const {
if (OpList1.size() != OpList2.size()) {
- DEBUG(dbgs() << "Operand list is different size\n");
+ LLVM_DEBUG(dbgs() << "Operand list is different size\n");
return false;
}
@@ -339,8 +340,8 @@ bool PPCBranchCoalescing::identicalOperands(
const MachineOperand &Op1 = OpList1[i];
const MachineOperand &Op2 = OpList2[i];
- DEBUG(dbgs() << "Op1: " << Op1 << "\n"
- << "Op2: " << Op2 << "\n");
+ LLVM_DEBUG(dbgs() << "Op1: " << Op1 << "\n"
+ << "Op2: " << Op2 << "\n");
if (Op1.isIdenticalTo(Op2)) {
// filter out instructions with physical-register uses
@@ -348,10 +349,10 @@ bool PPCBranchCoalescing::identicalOperands(
// If the physical register is constant then we can assume the value
// has not changed between uses.
&& !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) {
- DEBUG(dbgs() << "The operands are not provably identical.\n");
+ LLVM_DEBUG(dbgs() << "The operands are not provably identical.\n");
return false;
}
- DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
+ LLVM_DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
continue;
}
@@ -364,14 +365,14 @@ bool PPCBranchCoalescing::identicalOperands(
MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg());
MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg());
if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) {
- DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
- << " produce the same value!\n");
+ LLVM_DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
+ << " produce the same value!\n");
} else {
- DEBUG(dbgs() << "Operands produce different values\n");
+ LLVM_DEBUG(dbgs() << "Operands produce different values\n");
return false;
}
} else {
- DEBUG(dbgs() << "The operands are not provably identical.\n");
+ LLVM_DEBUG(dbgs() << "The operands are not provably identical.\n");
return false;
}
}
@@ -395,7 +396,7 @@ void PPCBranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB,
MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI();
if (MI == ME) {
- DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
+ LLVM_DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
return;
}
@@ -425,19 +426,19 @@ bool PPCBranchCoalescing::canMoveToBeginning(const MachineInstr &MI,
const MachineBasicBlock &TargetMBB
) const {
- DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
- << TargetMBB.getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+ << TargetMBB.getNumber() << "\n");
for (auto &Def : MI.defs()) { // Looking at Def
for (auto &Use : MRI->use_instructions(Def.getReg())) {
if (Use.isPHI() && Use.getParent() == &TargetMBB) {
- DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
- return false;
+ LLVM_DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
+ return false;
}
}
}
- DEBUG(dbgs() << " Safe to move to the beginning.\n");
+ LLVM_DEBUG(dbgs() << " Safe to move to the beginning.\n");
return true;
}
@@ -456,22 +457,23 @@ bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI,
const MachineBasicBlock &TargetMBB
) const {
- DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
- << TargetMBB.getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+ << TargetMBB.getNumber() << "\n");
for (auto &Use : MI.uses()) {
if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
- DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
+ LLVM_DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
return false;
} else {
- DEBUG(dbgs() << " *** def is in another block -- safe to move!\n");
+ LLVM_DEBUG(
+ dbgs() << " *** def is in another block -- safe to move!\n");
}
}
}
- DEBUG(dbgs() << " Safe to move to the end.\n");
+ LLVM_DEBUG(dbgs() << " Safe to move to the end.\n");
return true;
}
@@ -541,15 +543,17 @@ bool PPCBranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
for (auto &Def : I->defs())
for (auto &Use : MRI->use_instructions(Def.getReg())) {
if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) {
- DEBUG(dbgs() << "PHI " << *I << " defines register used in another "
- "PHI within branch target block -- can't merge\n");
+ LLVM_DEBUG(dbgs()
+ << "PHI " << *I
+ << " defines register used in another "
+ "PHI within branch target block -- can't merge\n");
NumPHINotMoved++;
return false;
}
if (Use.getParent() == SourceRegion.BranchBlock) {
- DEBUG(dbgs() << "PHI " << *I
- << " defines register used in this "
- "block -- all must move down\n");
+ LLVM_DEBUG(dbgs() << "PHI " << *I
+ << " defines register used in this "
+ "block -- all must move down\n");
SourceRegion.MustMoveDown = true;
}
}
@@ -562,13 +566,13 @@ bool PPCBranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
E = SourceRegion.BranchBlock->end();
I != E; ++I) {
if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) {
- DEBUG(dbgs() << "Instruction " << *I
- << " cannot move down - must move up!\n");
+ LLVM_DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move down - must move up!\n");
SourceRegion.MustMoveUp = true;
}
if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) {
- DEBUG(dbgs() << "Instruction " << *I
- << " cannot move up - must move down!\n");
+ LLVM_DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move up - must move down!\n");
SourceRegion.MustMoveDown = true;
}
}
@@ -719,10 +723,10 @@ bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
bool didSomething = false;
- DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+ LLVM_DEBUG(dbgs() << "******** Branch Coalescing ********\n");
initialize(MF);
- DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
CoalescingCandidateInfo Cand1, Cand2;
// Walk over blocks and find candidates to merge
@@ -752,24 +756,27 @@ bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
"Branch-taken block should post-dominate first candidate");
if (!identicalOperands(Cand1.Cond, Cand2.Cond)) {
- DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and "
- << Cand2.BranchBlock->getNumber()
- << " have different branches\n");
+ LLVM_DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand2.BranchBlock->getNumber()
+ << " have different branches\n");
break;
}
if (!canMerge(Cand2, Cand1)) {
- DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber()
- << " and " << Cand2.BranchBlock->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot merge blocks "
+ << Cand1.BranchBlock->getNumber() << " and "
+ << Cand2.BranchBlock->getNumber() << "\n");
NumBlocksNotCoalesced++;
continue;
}
- DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
- << " and " << Cand1.BranchTargetBlock->getNumber() << "\n");
+ LLVM_DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand1.BranchTargetBlock->getNumber()
+ << "\n");
MergedCandidates = mergeCandidates(Cand2, Cand1);
if (MergedCandidates)
didSomething = true;
- DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Function after merging: "; MF.dump();
+ dbgs() << "\n");
} while (MergedCandidates);
}
@@ -779,6 +786,6 @@ bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
MF.verify(nullptr, "Error in code produced by branch coalescing");
#endif // NDEBUG
- DEBUG(dbgs() << "Finished Branch Coalescing\n");
+ LLVM_DEBUG(dbgs() << "Finished Branch Coalescing\n");
return didSomething;
}
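This file and several of the C++ files that follow replace the old DEBUG() macro with LLVM_DEBUG() and reflow the messages, with no change in behaviour. A minimal sketch of the new spelling, using an illustrative DEBUG_TYPE and message rather than lines taken from the change:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "ppc-branch-coalescing"

// Compiled away in release builds; printed in assert-enabled builds under
// llc -debug or llc -debug-only=ppc-branch-coalescing.
static void reportCandidate(int BlockNumber) {
  LLVM_DEBUG(llvm::dbgs() << "Candidate block " << BlockNumber << "\n");
}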
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
index 1d10ef9acfba..6b9e2383e36f 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -30,11 +30,14 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Constants.h"
@@ -50,8 +53,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#ifndef NDEBUG
@@ -403,15 +406,16 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
}
if (Opcode) {
- MVT VTy = TLI->getSimpleValueType(
- *DL, CI->getArgOperand(0)->getType(), true);
- if (VTy == MVT::Other)
+ EVT EVTy =
+ TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
+
+ if (EVTy == MVT::Other)
return true;
- if (TLI->isOperationLegalOrCustom(Opcode, VTy))
+ if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
continue;
- else if (VTy.isVector() &&
- TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
+ else if (EVTy.isVector() &&
+ TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
continue;
return true;
@@ -503,13 +507,19 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);
- DEBUG(dbgs() << "Nested loop converted\n");
+ LLVM_DEBUG(dbgs() << "Nested loop converted\n");
}
// If a nested loop has been converted, then we can't convert this loop.
if (MadeChange)
return MadeChange;
+ // Bail out if the loop has irreducible control flow.
+ LoopBlocksRPO RPOT(L);
+ RPOT.perform(LI);
+ if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
+ return false;
+
#ifndef NDEBUG
// Stop trying after reaching the limit (if any).
int Limit = CTRLoopLimit;
@@ -530,14 +540,35 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
SmallVector<BasicBlock*, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
+ // If there is an exit edge known to be frequently taken,
+ // we should not transform this loop.
+ for (auto &BB : ExitingBlocks) {
+ Instruction *TI = BB->getTerminator();
+ if (!TI) continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ uint64_t TrueWeight = 0, FalseWeight = 0;
+ if (!BI->isConditional() ||
+ !BI->extractProfMetadata(TrueWeight, FalseWeight))
+ continue;
+
+ // If the exit path is more frequent than the loop path,
+ // we return here without further analysis for this loop.
+ bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+ if (( TrueIsExit && FalseWeight < TrueWeight) ||
+ (!TrueIsExit && FalseWeight > TrueWeight))
+ return MadeChange;
+ }
+ }
+
BasicBlock *CountedExitBlock = nullptr;
const SCEV *ExitCount = nullptr;
BranchInst *CountedExitBranch = nullptr;
for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
IE = ExitingBlocks.end(); I != IE; ++I) {
const SCEV *EC = SE->getExitCount(L, *I);
- DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
- (*I)->getName() << ": " << *EC << "\n");
+ LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
+ << (*I)->getName() << ": " << *EC << "\n");
if (isa<SCEVCouldNotCompute>(EC))
continue;
if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
@@ -549,9 +580,15 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
continue;
+ // If this exiting block is contained in a nested loop, it is not eligible
+ // for insertion of the branch-and-decrement since the inner loop would
+ // end up messing up the value in the CTR.
+ if (LI->getLoopFor(*I) != L)
+ continue;
+
// We now have a loop-invariant count of loop iterations (which is not the
// constant zero) for which we know that this loop will not exit via this
- // exisiting block.
+ // exiting block.
// We need to make sure that this block will run on every loop iteration.
// For this to be true, we must dominate all blocks with backedges. Such
@@ -605,7 +642,8 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
if (!Preheader)
return MadeChange;
- DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
+ << "\n");
// Insert the count into the preheader and replace the condition used by the
// selected branch.
@@ -693,11 +731,12 @@ check_block:
}
if (I != BI && clobbersCTR(*I)) {
- DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName()
- << ") instruction " << *I << " clobbers CTR, invalidating "
- << printMBBReference(*BI->getParent()) << " ("
- << BI->getParent()->getFullName() << ") instruction " << *BI
- << "\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName()
+ << ") instruction " << *I
+ << " clobbers CTR, invalidating "
+ << printMBBReference(*BI->getParent()) << " ("
+ << BI->getParent()->getFullName() << ") instruction "
+ << *BI << "\n");
return false;
}
@@ -711,10 +750,10 @@ check_block:
if (CheckPreds) {
queue_preds:
if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
- DEBUG(dbgs() << "Unable to find a MTCTR instruction for "
- << printMBBReference(*BI->getParent()) << " ("
- << BI->getParent()->getFullName() << ") instruction " << *BI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Unable to find a MTCTR instruction for "
+ << printMBBReference(*BI->getParent()) << " ("
+ << BI->getParent()->getFullName() << ") instruction "
+ << *BI << "\n");
return false;
}
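The new early-exit check in convertToCTRLoop gives up when branch-weight metadata shows an exit edge being taken more often than the path that stays in the loop. The test is restated below as a stand-alone helper for clarity (a sketch, not code from the change; the function name is invented, and the strict comparison mirrors the hunk above).

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
#include <cstdint>

using namespace llvm;

static bool exitDominatesBackedge(const Loop *L, const BranchInst *BI) {
  uint64_t TrueWeight = 0, FalseWeight = 0;
  if (!BI->isConditional() || !BI->extractProfMetadata(TrueWeight, FalseWeight))
    return false; // no profile data: keep trying to form a CTR loop
  bool TrueIsExit = !L->contains(BI->getSuccessor(0));
  // When the exit edge outweighs the stay-in-loop edge, a mtctr/bdnz loop is
  // unlikely to pay for itself, so the pass returns without converting.
  return TrueIsExit ? TrueWeight > FalseWeight : FalseWeight > TrueWeight;
}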
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
index a4f4c8688cc1..12c581023234 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -45,6 +45,30 @@ def RetCC_PPC64_AnyReg : CallingConv<[
CCCustom<"CC_PPC_AnyReg_Error">
]>;
+// Return-value convention for PowerPC coldcc.
+def RetCC_PPC_Cold : CallingConv<[
+ // Use the same return registers as RetCC_PPC, but limited to only
+ // one return value. The remaining return values will be saved to
+ // the stack.
+ CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+ CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
+
+ CCIfType<[i32], CCAssignToReg<[R3]>>,
+ CCIfType<[i64], CCAssignToReg<[X3]>>,
+ CCIfType<[i128], CCAssignToReg<[X3]>>,
+
+ CCIfType<[f32], CCAssignToReg<[F1]>>,
+ CCIfType<[f64], CCAssignToReg<[F1]>>,
+ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>,
+
+ CCIfType<[v4f64, v4f32, v4i1],
+ CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>,
+
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+ CCIfSubtarget<"hasAltivec()",
+ CCAssignToReg<[V2]>>>
+]>;
+
// Return-value convention for PowerPC
def RetCC_PPC : CallingConv<[
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
@@ -59,8 +83,19 @@ def RetCC_PPC : CallingConv<[
// Floating point types returned as "direct" go into F1 .. F8; note that
// only the ELFv2 ABI fully utilizes all these registers.
- CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
- CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfNotSubtarget<"hasSPE()",
+ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+ CCIfNotSubtarget<"hasSPE()",
+ CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+ CCIfSubtarget<"hasSPE()",
+ CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
+ CCIfSubtarget<"hasSPE()",
+ CCIfType<[f64], CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
+
+ // For P9, f128 are passed in vector registers.
+ CCIfType<[f128],
+ CCIfSubtarget<"hasP9Vector()",
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
// QPX vectors are returned in QF1 and QF2.
CCIfType<[v4f64, v4f32, v4i1],
@@ -117,6 +152,9 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f128],
+ CCIfSubtarget<"hasP9Vector()",
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
CCIfType<[v4f64, v4f32, v4i1],
CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
@@ -156,7 +194,15 @@ def CC_PPC32_SVR4_Common : CallingConv<[
CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
// FP values are passed in F1 - F8.
- CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f32, f64],
+ CCIfNotSubtarget<"hasSPE()",
+ CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+ CCIfType<[f64],
+ CCIfSubtarget<"hasSPE()",
+ CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
+ CCIfType<[f32],
+ CCIfSubtarget<"hasSPE()",
+ CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
// Split arguments have an alignment of 8 bytes on the stack.
CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
@@ -165,13 +211,18 @@ def CC_PPC32_SVR4_Common : CallingConv<[
// Floats are stored in double precision format, thus they have the same
// alignment and size as doubles.
- CCIfType<[f32,f64], CCAssignToStack<8, 8>>,
+ // With SPE, floats are stored as single precision, so they have the
+ // alignment and size of an int.
+ CCIfType<[f32,f64], CCIfNotSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>,
+ CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>,
// QPX vectors that are stored in double precision need 32-byte alignment.
CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
- // Vectors get 16-byte stack slots that are 16-byte aligned.
- CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
+ // Vectors and float128 get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>,
+ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>>
]>;
// This calling convention puts vector arguments always on the stack. It is used
@@ -192,6 +243,11 @@ def CC_PPC32_SVR4 : CallingConv<[
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
V8, V9, V10, V11, V12, V13]>>>,
+
+ // Float128 types treated as vector arguments.
+ CCIfType<[f128],
+ CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
+ V8, V9, V10, V11, V12, V13]>>>,
CCDelegateTo<CC_PPC32_SVR4_Common>
]>;
@@ -227,15 +283,23 @@ def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
-def CSR_SVR432 : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
- R21, R22, R23, R24, R25, R26, R27, R28,
- R29, R30, R31, F14, F15, F16, F17, F18,
+// SPE does not use FPRs, so break out the common register set as base.
+def CSR_SVR432_COMM : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27,
+ R28, R29, R30, R31, CR2, CR3, CR4
+ )>;
+def CSR_SVR432 : CalleeSavedRegs<(add CSR_SVR432_COMM, F14, F15, F16, F17, F18,
F19, F20, F21, F22, F23, F24, F25, F26,
- F27, F28, F29, F30, F31, CR2, CR3, CR4
+ F27, F28, F29, F30, F31
)>;
+def CSR_SPE : CalleeSavedRegs<(add S14, S15, S16, S17, S18, S19, S20, S21, S22,
+ S23, S24, S25, S26, S27, S28, S29, S30, S31
+ )>;
def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
+def CSR_SVR432_SPE : CalleeSavedRegs<(add CSR_SVR432_COMM, CSR_SPE)>;
+
def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
X21, X22, X23, X24, X25, X26, X27, X28,
X29, X30, X31, F14, F15, F16, F17, F18,
@@ -271,6 +335,36 @@ def CSR_SVR464_R2_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2_Altivec)>
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+// coldcc calling convention marks most registers as non-volatile.
+// Do not include r1 since the stack pointer is never considered a CSR.
+// Do not include r2, since it is the TOC register and is added depending
+// on whether or not the function uses the TOC and is a non-leaf.
+// Do not include r0,r11,r13 as they are optional in functional linkage
+// and their values may be altered by inter-library calls.
+// Do not include r12 as it is used as a scratch register.
+// Do not include return registers r3, f1, v2.
+def CSR_SVR32_ColdCC : CalleeSavedRegs<(add (sequence "R%u", 4, 10),
+ (sequence "R%u", 14, 31),
+ F0, (sequence "F%u", 2, 31),
+ (sequence "CR%u", 0, 7))>;
+
+def CSR_SVR32_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR32_ColdCC,
+ (sequence "V%u", 0, 1),
+ (sequence "V%u", 3, 31))>;
+
+def CSR_SVR64_ColdCC : CalleeSavedRegs<(add (sequence "X%u", 4, 10),
+ (sequence "X%u", 14, 31),
+ F0, (sequence "F%u", 2, 31),
+ (sequence "CR%u", 0, 7))>;
+
+def CSR_SVR64_ColdCC_R2: CalleeSavedRegs<(add CSR_SVR64_ColdCC, X2)>;
+
+def CSR_SVR64_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC,
+ (sequence "V%u", 0, 1),
+ (sequence "V%u", 3, 31))>;
+
+def CSR_SVR64_ColdCC_R2_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC_Altivec, X2)>;
+
def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
(sequence "X%u", 14, 31),
(sequence "F%u", 0, 31),
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
index 1699463c0a4b..ed5e496b32fd 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -128,7 +128,7 @@ protected:
if (J->getOperand(i).isMBB() &&
J->getOperand(i).getMBB() == &ReturnMBB)
OtherReference = true;
- } else if (!J->isTerminator() && !J->isDebugValue())
+ } else if (!J->isTerminator() && !J->isDebugInstr())
break;
if (J == (*PI)->begin())
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp b/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
index b00e98b63e34..fe41e1b36a5d 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -117,7 +117,7 @@ public:
/// instruction is still generated by default on targets that support them.
///
/// \return true if ISEL should be expanded into if-then-else code sequence;
- /// false if ISEL instruction should be generated, i.e. not expaned.
+ /// false if ISEL instruction should be generated, i.e. not expanded.
///
static bool isExpandISELEnabled(const MachineFunction &MF);
@@ -126,11 +126,11 @@ public:
#endif
bool runOnMachineFunction(MachineFunction &MF) override {
- DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
initialize(MF);
if (!collectISELInstructions()) {
- DEBUG(dbgs() << "No ISEL instructions in this function\n");
+ LLVM_DEBUG(dbgs() << "No ISEL instructions in this function\n");
return false;
}
@@ -170,9 +170,10 @@ bool PPCExpandISEL::collectISELInstructions() {
#ifndef NDEBUG
void PPCExpandISEL::DumpISELInstructions() const {
for (const auto &I : ISELInstructions) {
- DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first)) << ":\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first))
+ << ":\n");
for (const auto &VI : I.second)
- DEBUG(dbgs() << " "; VI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " "; VI->print(dbgs()));
}
}
#endif
@@ -192,9 +193,10 @@ void PPCExpandISEL::expandAndMergeISELs() {
bool ExpandISELEnabled = isExpandISELEnabled(*MF);
for (auto &BlockList : ISELInstructions) {
- DEBUG(dbgs() << "Expanding ISEL instructions in "
- << printMBBReference(*MF->getBlockNumbered(BlockList.first))
- << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Expanding ISEL instructions in "
+ << printMBBReference(*MF->getBlockNumbered(BlockList.first))
+ << "\n");
BlockISELList &CurrentISELList = BlockList.second;
auto I = CurrentISELList.begin();
auto E = CurrentISELList.end();
@@ -210,7 +212,8 @@ void PPCExpandISEL::expandAndMergeISELs() {
// as it would be ISEL %R0, %ZERO, %R0, %CRN.
if (useSameRegister(Dest, TrueValue) &&
useSameRegister(Dest, FalseValue)) {
- DEBUG(dbgs() << "Remove redudant ISEL instruction: " << **I << "\n");
+ LLVM_DEBUG(dbgs() << "Remove redundant ISEL instruction: " << **I
+ << "\n");
// FIXME: if the CR field used has no other uses, we could eliminate the
// instruction that defines it. This would have to be done manually
// since this pass runs too late to run DCE after it.
@@ -223,8 +226,9 @@ void PPCExpandISEL::expandAndMergeISELs() {
// condition as it would be ISEL %RX, %ZERO, %R0, %CRN, which makes it
// safe to fold ISEL to MR(OR) instead of ADDI.
MachineBasicBlock *MBB = (*I)->getParent();
- DEBUG(dbgs() << "Fold the ISEL instruction to an unconditonal copy:\n");
- DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Fold the ISEL instruction to an unconditional copy:\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n");
NumFolded++;
// Note: we're using both the TrueValue and FalseValue operands so as
// not to lose the kill flag if it is set on either of them.
@@ -235,8 +239,8 @@ void PPCExpandISEL::expandAndMergeISELs() {
(*I)->eraseFromParent();
I++;
} else if (ExpandISELEnabled) { // Normal cases expansion enabled
- DEBUG(dbgs() << "Expand ISEL instructions:\n");
- DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ LLVM_DEBUG(dbgs() << "Expand ISEL instructions:\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n");
BlockISELList SubISELList;
SubISELList.push_back(*I++);
// Collect the ISELs that can be merged together.
@@ -244,7 +248,7 @@ void PPCExpandISEL::expandAndMergeISELs() {
// may be redundant or foldable to a register copy. So we still keep
// the handleSpecialCases() downstream to handle them.
while (I != E && canMerge(SubISELList.back(), *I)) {
- DEBUG(dbgs() << "ISEL: " << **I << "\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n");
SubISELList.push_back(*I++);
}
@@ -264,7 +268,7 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
auto MI = BIL.begin();
while (MI != BIL.end()) {
assert(isISEL(**MI) && "Expecting an ISEL instruction");
- DEBUG(dbgs() << "ISEL: " << **MI << "\n");
+ LLVM_DEBUG(dbgs() << "ISEL: " << **MI << "\n");
MachineOperand &Dest = (*MI)->getOperand(0);
MachineOperand &TrueValue = (*MI)->getOperand(1);
@@ -281,7 +285,7 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
// Special case 1, all registers used by ISEL are the same one.
if (!IsADDIInstRequired && !IsORIInstRequired) {
- DEBUG(dbgs() << "Remove redudant ISEL instruction.");
+ LLVM_DEBUG(dbgs() << "Remove redundant ISEL instruction.");
// FIXME: if the CR field used has no other uses, we could eliminate the
// instruction that defines it. This would have to be done manually
// since this pass runs too late to run DCE after it.
@@ -300,7 +304,8 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
// be zero. In this case, the useSameRegister method will return false,
// thereby preventing this ISEL from being folded.
if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) {
- DEBUG(dbgs() << "Fold the ISEL instruction to an unconditonal copy.");
+ LLVM_DEBUG(
+ dbgs() << "Fold the ISEL instruction to an unconditional copy.");
NumFolded++;
// Note: we're using both the TrueValue and FalseValue operands so as
// not to lose the kill flag if it is set on either of them.
@@ -439,11 +444,10 @@ void PPCExpandISEL::populateBlocks(BlockISELList &BIL) {
// condition is false
MachineOperand &ConditionRegister = MI->getOperand(3); // Condition
- DEBUG(dbgs() << "Dest: " << Dest << "\n");
- DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n");
- DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n");
- DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n");
-
+ LLVM_DEBUG(dbgs() << "Dest: " << Dest << "\n");
+ LLVM_DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n");
+ LLVM_DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n");
+ LLVM_DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n");
// If the Dest Register and True Value Register are not the same one, we
// need the True Block.
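The expand-isel pass rewrites the PPC ISEL (integer select) instruction into a conditional branch plus register copies. In plain C++ terms an ISEL computes the value sketched below (illustration only, not code from the change), which is also why the special cases above fold away: when the true and false operands are the same register the select degenerates to a copy, and when all three registers match it is a no-op.

// ISEL %dest, %trueReg, %falseReg, %crN  ==>  dest = crN ? trueReg : falseReg
static unsigned iselSemantics(bool CRBit, unsigned TrueVal, unsigned FalseVal) {
  return CRBit ? TrueVal : FalseVal;
}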
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 402e29cdff72..b00655b50229 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -153,7 +153,8 @@ class PPCFastISel final : public FastISel {
return RC->getID() == PPC::VSSRCRegClassID;
}
bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt, unsigned DestReg);
+ bool isZExt, unsigned DestReg,
+ const PPC::Predicate Pred);
bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
const TargetRegisterClass *RC, bool IsZExt = true,
unsigned FP64LoadOpc = PPC::LFD);
@@ -206,6 +207,8 @@ CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
return CC_PPC32_SVR4_ByVal;
else if (Flag == 3)
return CC_PPC32_SVR4_VarArg;
+ else if (Flag == 4)
+ return RetCC_PPC_Cold;
else
return RetCC_PPC;
}
@@ -219,7 +222,7 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
// result consists of 4 bits, indicating lt, eq, gt and un (unordered),
// only one of which will be set. The result is generated by fcmpu
// instruction. However, bc instruction only inspects one of the first 3
- // bits, so when un is set, bc instruction may jump to to an undesired
+ // bits, so when un is set, bc instruction may jump to an undesired
// place.
//
// More specifically, if we expect an unordered comparison and un is set, we
@@ -464,6 +467,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
bool IsZExt, unsigned FP64LoadOpc) {
unsigned Opc;
bool UseOffset = true;
+ bool HasSPE = PPCSubTarget->hasSPE();
// If ResultReg is given, it determines the register class of the load.
// Otherwise, RC is the register class to use. If the result of the
@@ -475,8 +479,8 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
const TargetRegisterClass *UseRC =
(ResultReg ? MRI.getRegClass(ResultReg) :
(RC ? RC :
- (VT == MVT::f64 ? &PPC::F8RCRegClass :
- (VT == MVT::f32 ? &PPC::F4RCRegClass :
+ (VT == MVT::f64 ? (HasSPE ? &PPC::SPERCRegClass : &PPC::F8RCRegClass) :
+ (VT == MVT::f32 ? (HasSPE ? &PPC::SPE4RCRegClass : &PPC::F4RCRegClass) :
(VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
&PPC::GPRC_and_GPRC_NOR0RegClass)))));
@@ -505,7 +509,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
- Opc = PPC::LFS;
+ Opc = PPCSubTarget->hasSPE() ? PPC::SPELWZ : PPC::LFS;
break;
case MVT::f64:
Opc = FP64LoadOpc;
@@ -576,6 +580,8 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
case PPC::LD: Opc = PPC::LDX; break;
case PPC::LFS: Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break;
case PPC::LFD: Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
+ case PPC::EVLDD: Opc = PPC::EVLDDX; break;
+ case PPC::SPELWZ: Opc = PPC::SPELWZX; break;
}
auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
@@ -618,7 +624,8 @@ bool PPCFastISel::SelectLoad(const Instruction *I) {
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
unsigned ResultReg = 0;
- if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+ if (!PPCEmitLoad(VT, ResultReg, Addr, RC, true,
+ PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
return false;
updateValueMap(I, ResultReg);
return true;
@@ -651,10 +658,10 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
- Opc = PPC::STFS;
+ Opc = PPCSubTarget->hasSPE() ? PPC::SPESTW : PPC::STFS;
break;
case MVT::f64:
- Opc = PPC::STFD;
+ Opc = PPCSubTarget->hasSPE() ? PPC::EVSTDD : PPC::STFD;
break;
}
@@ -719,6 +726,8 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
case PPC::STD: Opc = PPC::STDX; break;
case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break;
case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
+ case PPC::EVSTDD: Opc = PPC::EVSTDDX; break;
+ case PPC::SPESTW: Opc = PPC::SPESTWX; break;
}
auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
@@ -792,11 +801,12 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
- CondReg))
+ CondReg, PPCPred))
return false;
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
- .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+ .addImm(PPCSubTarget->hasSPE() ? PPC::PRED_SPE : PPCPred)
+ .addReg(CondReg).addMBB(TBB);
finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -820,7 +830,8 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
// Attempt to emit a compare of the two source values. Signed and unsigned
// comparisons are supported. Return false if we can't handle it.
bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
- bool IsZExt, unsigned DestReg) {
+ bool IsZExt, unsigned DestReg,
+ const PPC::Predicate Pred) {
Type *Ty = SrcValue1->getType();
EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple())
@@ -836,6 +847,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
// similar to ARM in this regard.
long Imm = 0;
bool UseImm = false;
+ const bool HasSPE = PPCSubTarget->hasSPE();
// Only 16-bit integer constants can be represented in compares for
// PowerPC. Others will be materialized into a register.
@@ -854,10 +866,38 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
switch (SrcVT.SimpleTy) {
default: return false;
case MVT::f32:
- CmpOpc = PPC::FCMPUS;
+ if (HasSPE) {
+ switch (Pred) {
+ default: return false;
+ case PPC::PRED_EQ:
+ CmpOpc = PPC::EFSCMPEQ;
+ break;
+ case PPC::PRED_LT:
+ CmpOpc = PPC::EFSCMPLT;
+ break;
+ case PPC::PRED_GT:
+ CmpOpc = PPC::EFSCMPGT;
+ break;
+ }
+ } else
+ CmpOpc = PPC::FCMPUS;
break;
case MVT::f64:
- CmpOpc = PPC::FCMPUD;
+ if (HasSPE) {
+ switch (Pred) {
+ default: return false;
+ case PPC::PRED_EQ:
+ CmpOpc = PPC::EFDCMPEQ;
+ break;
+ case PPC::PRED_LT:
+ CmpOpc = PPC::EFDCMPLT;
+ break;
+ case PPC::PRED_GT:
+ CmpOpc = PPC::EFDCMPGT;
+ break;
+ }
+ } else
+ CmpOpc = PPC::FCMPUD;
break;
case MVT::i1:
case MVT::i8:
@@ -945,9 +985,19 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
return false;
// Round the result to single precision.
- unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg)
- .addReg(SrcReg);
+ unsigned DestReg;
+
+ if (PPCSubTarget->hasSPE()) {
+ DestReg = createResultReg(&PPC::SPE4RCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(PPC::EFSCFD), DestReg)
+ .addReg(SrcReg);
+ } else {
+ DestReg = createResultReg(&PPC::F4RCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(PPC::FRSP), DestReg)
+ .addReg(SrcReg);
+ }
updateValueMap(I, DestReg);
return true;
@@ -1029,6 +1079,22 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
if (SrcReg == 0)
return false;
+ // Shortcut for SPE. There is no need to store/load, since it's all in the GPRs
+ if (PPCSubTarget->hasSPE()) {
+ unsigned Opc;
+ if (DstVT == MVT::f32)
+ Opc = IsSigned ? PPC::EFSCFSI : PPC::EFSCFUI;
+ else
+ Opc = IsSigned ? PPC::EFDCFSI : PPC::EFDCFUI;
+
+ unsigned DestReg = createResultReg(&PPC::SPERCRegClass);
+ // Generate the convert.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+ .addReg(SrcReg);
+ updateValueMap(I, DestReg);
+ return true;
+ }
+
// We can only lower an unsigned convert if we have the newer
// floating-point conversion operations.
if (!IsSigned && !PPCSubTarget->hasFPCVT())
@@ -1123,8 +1189,9 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
if (DstVT != MVT::i32 && DstVT != MVT::i64)
return false;
- // If we don't have FCTIDUZ and we need it, punt to SelectionDAG.
- if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
+ // If we don't have FCTIDUZ, or SPE, and we need it, punt to SelectionDAG.
+ if (DstVT == MVT::i64 && !IsSigned &&
+ !PPCSubTarget->hasFPCVT() && !PPCSubTarget->hasSPE())
return false;
Value *Src = I->getOperand(0);
@@ -1152,23 +1219,34 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
// Determine the opcode for the conversion, which takes place
// entirely within FPRs.
- unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+ unsigned DestReg;
unsigned Opc;
- if (DstVT == MVT::i32)
+ if (PPCSubTarget->hasSPE()) {
+ DestReg = createResultReg(&PPC::GPRCRegClass);
if (IsSigned)
- Opc = PPC::FCTIWZ;
+ Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ;
else
- Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
- else
- Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+ Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ;
+ } else {
+ DestReg = createResultReg(&PPC::F8RCRegClass);
+ if (DstVT == MVT::i32)
+ if (IsSigned)
+ Opc = PPC::FCTIWZ;
+ else
+ Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+ else
+ Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+ }
// Generate the convert.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(SrcReg);
// Now move the integer value from a float register to an integer register.
- unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+ unsigned IntReg = PPCSubTarget->hasSPE() ? DestReg :
+ PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+
if (IntReg == 0)
return false;
@@ -1916,8 +1994,13 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
assert(Align > 0 && "Unexpectedly missing alignment information!");
unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
- const TargetRegisterClass *RC =
- (VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass;
+ const bool HasSPE = PPCSubTarget->hasSPE();
+ const TargetRegisterClass *RC;
+ if (HasSPE)
+ RC = ((VT == MVT::f32) ? &PPC::SPE4RCRegClass : &PPC::SPERCRegClass);
+ else
+ RC = ((VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass);
+
unsigned DestReg = createResultReg(RC);
CodeModel::Model CModel = TM.getCodeModel();
@@ -1925,7 +2008,13 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align);
- unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
+ unsigned Opc;
+
+ if (HasSPE)
+ Opc = ((VT == MVT::f32) ? PPC::SPELWZ : PPC::EVLDD);
+ else
+ Opc = ((VT == MVT::f32) ? PPC::LFS : PPC::LFD);
+
unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
PPCFuncInfo->setUsesTOCBasePtr();
@@ -2261,7 +2350,8 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned ResultReg = MI->getOperand(0).getReg();
- if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt))
+ if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt,
+ PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
return false;
MI->eraseFromParent();
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 7902da20a010..f0000c5bafd7 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -173,7 +173,27 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
{PPC::V23, -144},
{PPC::V22, -160},
{PPC::V21, -176},
- {PPC::V20, -192}};
+ {PPC::V20, -192},
+
+ // SPE register save area (overlaps Vector save area).
+ {PPC::S31, -8},
+ {PPC::S30, -16},
+ {PPC::S29, -24},
+ {PPC::S28, -32},
+ {PPC::S27, -40},
+ {PPC::S26, -48},
+ {PPC::S25, -56},
+ {PPC::S24, -64},
+ {PPC::S23, -72},
+ {PPC::S22, -80},
+ {PPC::S21, -88},
+ {PPC::S20, -96},
+ {PPC::S19, -104},
+ {PPC::S18, -112},
+ {PPC::S17, -120},
+ {PPC::S16, -128},
+ {PPC::S15, -136},
+ {PPC::S14, -144}};
static const SpillSlot Offsets64[] = {
// Floating-point register save area offsets.
@@ -1615,7 +1635,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
// Make sure we don't explicitly spill r31, because, for example, we have
- // some inline asm which explicity clobbers it, when we otherwise have a
+ // some inline asm which explicitly clobbers it, when we otherwise have a
// frame pointer and are using r31's spill slot for the prologue/epilogue
// code. Same goes for the base pointer and the PIC base register.
if (needsFP(MF))
@@ -1676,7 +1696,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
unsigned MinGPR = PPC::R31;
unsigned MinG8R = PPC::X31;
unsigned MinFPR = PPC::F31;
- unsigned MinVR = PPC::V31;
+ unsigned MinVR = Subtarget.hasSPE() ? PPC::S31 : PPC::V31;
bool HasGPSaveArea = false;
bool HasG8SaveArea = false;
@@ -1691,7 +1711,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (PPC::GPRCRegClass.contains(Reg)) {
+ if (PPC::GPRCRegClass.contains(Reg) ||
+ PPC::SPE4RCRegClass.contains(Reg)) {
HasGPSaveArea = true;
GPRegs.push_back(CSI[i]);
@@ -1720,7 +1741,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
; // do nothing, as we already know whether CRs are spilled
} else if (PPC::VRSAVERCRegClass.contains(Reg)) {
HasVRSAVESaveArea = true;
- } else if (PPC::VRRCRegClass.contains(Reg)) {
+ } else if (PPC::VRRCRegClass.contains(Reg) ||
+ PPC::SPERCRegClass.contains(Reg)) {
+ // Altivec and SPE are mutually exclusive, but have the same stack
+ // alignment requirements, so overload the save area for both cases.
HasVRSaveArea = true;
VRegs.push_back(CSI[i]);
@@ -1863,8 +1887,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
}
+ // Both Altivec and SPE have the same alignment and padding requirements
+ // within the stack frame.
if (HasVRSaveArea) {
- // Insert alignment padding, we need 16-byte alignment. Note: for postive
+ // Insert alignment padding, we need 16-byte alignment. Note: for positive
// number the alignment formula is : y = (x + (n-1)) & (~(n-1)). But since
// we are using negative number here (the stack grows downward). We should
// use formula : y = x & (~(n-1)). Where x is the size before aligning, n
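A minimal standalone sketch of the alignment rule the comment above describes, assuming two's-complement arithmetic; the helper name is illustrative and not part of the patch. Because the stack grows downward, the offset being aligned is negative, and masking with ~(n-1) rounds it toward more negative values (i.e. inserts padding below the current bound):

#include <cassert>
#include <cstdint>

// For a downward-growing stack the running offset x is negative, so instead
// of the usual round-up y = (x + (n-1)) & ~(n-1) we round toward more
// negative values with y = x & ~(n-1).
static int64_t alignDownward(int64_t Offset, int64_t Align) {
  return Offset & ~(Align - 1);
}

int main() {
  assert(alignDownward(-20, 16) == -32); // 12 bytes of padding inserted
  assert(alignDownward(-32, 16) == -32); // already 16-byte aligned
  return 0;
}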
@@ -1950,7 +1976,14 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
// Add the callee-saved register as live-in; it's killed at the spill.
- MBB.addLiveIn(Reg);
+ // Do not do this for callee-saved registers that are live-in to the
+ // function because they will already be marked live-in and this will be
+ // adding it for a second time. It is an error to add the same register
+ // to the set more than once.
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ bool IsLiveIn = MRI.isLiveIn(Reg);
+ if (!IsLiveIn)
+ MBB.addLiveIn(Reg);
if (CRSpilled && IsCRField) {
CRMIB.addReg(Reg, RegState::ImplicitKill);
@@ -1980,7 +2013,10 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
}
} else {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(MBB, MI, Reg, true,
+ // Use !IsLiveIn for the kill flag.
+ // We do not want to kill registers that are live in this function
+ // before their use because they will become undefined registers.
+ TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
CSI[i].getFrameIdx(), RC, TRI);
}
}
@@ -2149,6 +2185,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
}
bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ if (MF.getInfo<PPCFunctionInfo>()->shrinkWrapDisabled())
+ return false;
return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
MF.getSubtarget<PPCSubtarget>().isPPC64());
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
index f845d5a9ac64..01c155594c44 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -30,7 +30,7 @@ class PPCFrameLowering: public TargetFrameLowering {
const unsigned BasePointerSaveOffset;
/**
- * \brief Find register[s] that can be used in function prologue and epilogue
+ * Find register[s] that can be used in function prologue and epilogue
*
* Find register[s] that can be use as scratch register[s] in function
* prologue and epilogue to save various registers (Link Register, Base
@@ -67,7 +67,7 @@ class PPCFrameLowering: public TargetFrameLowering {
bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const;
/**
- * \brief Create branch instruction for PPC::TCRETURN* (tail call return)
+ * Create branch instruction for PPC::TCRETURN* (tail call return)
*
* \param[in] MBB that is terminated by PPC::TCRETURN*
*/
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index f327396370f6..551220466901 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -180,9 +180,9 @@ void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) {
CurGroup.clear();
CurSlots = CurBranches = 0;
} else {
- DEBUG(dbgs() << "**** Adding to dispatch group: SU(" <<
- SU->NodeNum << "): ");
- DEBUG(DAG->dumpNode(SU));
+ LLVM_DEBUG(dbgs() << "**** Adding to dispatch group: SU(" << SU->NodeNum
+ << "): ");
+ LLVM_DEBUG(DAG->dumpNode(SU));
unsigned NSlots;
bool MustBeFirst = mustComeFirst(MCID, NSlots);
@@ -268,7 +268,7 @@ PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG)
}
void PPCHazardRecognizer970::EndDispatchGroup() {
- DEBUG(errs() << "=== Start of dispatch group\n");
+ LLVM_DEBUG(errs() << "=== Start of dispatch group\n");
NumIssued = 0;
// Structural hazard info.
@@ -330,7 +330,7 @@ getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return NoHazard;
unsigned Opcode = MI->getOpcode();
@@ -388,7 +388,7 @@ getHazardType(SUnit *SU, int Stalls) {
void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
MachineInstr *MI = SU->getInstr();
- if (MI->isDebugValue())
+ if (MI->isDebugInstr())
return;
unsigned Opcode = MI->getOpcode();
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index d3a223fe03e0..6cec664d1e66 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -32,7 +32,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -53,6 +52,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -101,6 +101,11 @@ static cl::opt<bool> EnableBranchHint(
cl::desc("Enable static hinting of branches on ppc"),
cl::Hidden);
+static cl::opt<bool> EnableTLSOpt(
+ "ppc-tls-opt", cl::init(true),
+ cl::desc("Enable tls optimization peephole"),
+ cl::Hidden);
+
enum ICmpInGPRType { ICGPR_All, ICGPR_None, ICGPR_I32, ICGPR_I64,
ICGPR_NonExtIn, ICGPR_Zext, ICGPR_Sext, ICGPR_ZextI32,
ICGPR_SextI32, ICGPR_ZextI64, ICGPR_SextI64 };
@@ -199,6 +204,14 @@ namespace {
bool tryBitPermutation(SDNode *N);
bool tryIntCompareInGPR(SDNode *N);
+ // tryTLSXFormLoad - Convert an ISD::LOAD fed by a PPCISD::ADD_TLS into
+ // an X-Form load instruction with the offset being a relocation coming from
+ // the PPCISD::ADD_TLS.
+ bool tryTLSXFormLoad(LoadSDNode *N);
+ // tryTLSXFormStore - Convert an ISD::STORE fed by a PPCISD::ADD_TLS into
+ // an X-Form store instruction with the offset being a relocation coming from
+ // the PPCISD::ADD_TLS.
+ bool tryTLSXFormStore(StoreSDNode *N);
/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -314,6 +327,7 @@ private:
bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
void transferMemOperands(SDNode *N, SDNode *Result);
+ MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
};
} // end anonymous namespace
@@ -417,6 +431,16 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
}
} else {
+ // We must ensure that this sequence is dominated by the prologue.
+ // FIXME: This is a bit of a big hammer since we don't get the benefits
+ // of shrink-wrapping whenever we emit this instruction. Considering
+ // this is used in any function where we emit a jump table, this may be
+ // a significant limitation. We should consider inserting this in the
+ // block where it is used and then commoning this sequence up if it
+ // appears in multiple places.
+ // Note: on ISA 3.0 cores, we can use lnia (addpcis) instead of
+ // MovePCtoLR8.
+ MF->getInfo<PPCFunctionInfo>()->setShrinkWrapDisabled(true);
GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
@@ -494,10 +518,10 @@ static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb))
return PPC::BR_NO_HINT;
- DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::"
- << BB->getName() << "'\n"
- << " -> " << TBB->getName() << ": " << TProb << "\n"
- << " -> " << FBB->getName() << ": " << FProb << "\n");
+ LLVM_DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName()
+ << "::" << BB->getName() << "'\n"
+ << " -> " << TBB->getName() << ": " << TProb << "\n"
+ << " -> " << FBB->getName() << ": " << FProb << "\n");
const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB);
@@ -572,6 +596,90 @@ bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
return false;
}
+bool PPCDAGToDAGISel::tryTLSXFormStore(StoreSDNode *ST) {
+ SDValue Base = ST->getBasePtr();
+ if (Base.getOpcode() != PPCISD::ADD_TLS)
+ return false;
+ SDValue Offset = ST->getOffset();
+ if (!Offset.isUndef())
+ return false;
+
+ SDLoc dl(ST);
+ EVT MemVT = ST->getMemoryVT();
+ EVT RegVT = ST->getValue().getValueType();
+
+ unsigned Opcode;
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8: {
+ Opcode = (RegVT == MVT::i32) ? PPC::STBXTLS_32 : PPC::STBXTLS;
+ break;
+ }
+ case MVT::i16: {
+ Opcode = (RegVT == MVT::i32) ? PPC::STHXTLS_32 : PPC::STHXTLS;
+ break;
+ }
+ case MVT::i32: {
+ Opcode = (RegVT == MVT::i32) ? PPC::STWXTLS_32 : PPC::STWXTLS;
+ break;
+ }
+ case MVT::i64: {
+ Opcode = PPC::STDXTLS;
+ break;
+ }
+ }
+ SDValue Chain = ST->getChain();
+ SDVTList VTs = ST->getVTList();
+ SDValue Ops[] = {ST->getValue(), Base.getOperand(0), Base.getOperand(1),
+ Chain};
+ SDNode *MN = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
+ transferMemOperands(ST, MN);
+ ReplaceNode(ST, MN);
+ return true;
+}
+
+bool PPCDAGToDAGISel::tryTLSXFormLoad(LoadSDNode *LD) {
+ SDValue Base = LD->getBasePtr();
+ if (Base.getOpcode() != PPCISD::ADD_TLS)
+ return false;
+ SDValue Offset = LD->getOffset();
+ if (!Offset.isUndef())
+ return false;
+
+ SDLoc dl(LD);
+ EVT MemVT = LD->getMemoryVT();
+ EVT RegVT = LD->getValueType(0);
+ unsigned Opcode;
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8: {
+ Opcode = (RegVT == MVT::i32) ? PPC::LBZXTLS_32 : PPC::LBZXTLS;
+ break;
+ }
+ case MVT::i16: {
+ Opcode = (RegVT == MVT::i32) ? PPC::LHZXTLS_32 : PPC::LHZXTLS;
+ break;
+ }
+ case MVT::i32: {
+ Opcode = (RegVT == MVT::i32) ? PPC::LWZXTLS_32 : PPC::LWZXTLS;
+ break;
+ }
+ case MVT::i64: {
+ Opcode = PPC::LDXTLS;
+ break;
+ }
+ }
+ SDValue Chain = LD->getChain();
+ SDVTList VTs = LD->getVTList();
+ SDValue Ops[] = {Base.getOperand(0), Base.getOperand(1), Chain};
+ SDNode *MN = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
+ transferMemOperands(LD, MN);
+ ReplaceNode(LD, MN);
+ return true;
+}
+
/// Turn an or of two masked values into the rotate left word immediate then
/// mask insert (rlwimi) instruction.
bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
@@ -1023,8 +1131,8 @@ class BitPermutationSelector {
BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
: V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false),
Repl32Coalesced(false) {
- DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
- " [" << S << ", " << E << "]\n");
+ LLVM_DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R
+ << " [" << S << ", " << E << "]\n");
}
};
@@ -1053,6 +1161,10 @@ class BitPermutationSelector {
return true;
else if (NumGroups < Other.NumGroups)
return false;
+ else if (RLAmt == 0 && Other.RLAmt != 0)
+ return true;
+ else if (RLAmt != 0 && Other.RLAmt == 0)
+ return false;
else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
return true;
return false;
@@ -1180,7 +1292,7 @@ class BitPermutationSelector {
Bits[i] = ValueBit(ValueBit::ConstZero);
return std::make_pair(Interesting, &Bits);
- }
+ }
}
for (unsigned i = 0; i < NumBits; ++i)
@@ -1258,7 +1370,7 @@ class BitPermutationSelector {
BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
- DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
+ LLVM_DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
BitGroups.erase(BitGroups.begin());
}
@@ -1266,7 +1378,9 @@ class BitPermutationSelector {
}
// Take all (SDValue, RLAmt) pairs and sort them by the number of groups
- // associated with each. If there is a degeneracy, pick the one that occurs
+ // associated with each. If the number of groups is the same, we prefer a group
+ // which does not require a rotate, i.e. RLAmt is 0, to avoid the first rotate
+ // instruction. If there is a degeneracy, pick the one that occurs
// first (in the final value).
void collectValueRotInfo() {
ValueRots.clear();
@@ -1287,7 +1401,7 @@ class BitPermutationSelector {
for (auto &I : ValueRots) {
ValueRotsVec.push_back(I.second);
}
- std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
+ llvm::sort(ValueRotsVec.begin(), ValueRotsVec.end());
}
// In 64-bit mode, rlwinm and friends have a rotation operator that
@@ -1336,6 +1450,20 @@ class BitPermutationSelector {
};
for (auto &BG : BitGroups) {
+ // If this bit group has RLAmt of 0 and will not be merged with
+ // another bit group, we don't benefit from Repl32. We don't mark
+ // such a group, to give more freedom to later instruction selection.
+ if (BG.RLAmt == 0) {
+ auto PotentiallyMerged = [this](BitGroup & BG) {
+ for (auto &BG2 : BitGroups)
+ if (&BG != &BG2 && BG.V == BG2.V &&
+ (BG2.RLAmt == 0 || BG2.RLAmt == 32))
+ return true;
+ return false;
+ };
+ if (!PotentiallyMerged(BG))
+ continue;
+ }
if (BG.StartIdx < 32 && BG.EndIdx < 32) {
if (IsAllLow32(BG)) {
if (BG.RLAmt >= 32) {
@@ -1345,9 +1473,9 @@ class BitPermutationSelector {
BG.Repl32 = true;
- DEBUG(dbgs() << "\t32-bit replicated bit group for " <<
- BG.V.getNode() << " RLAmt = " << BG.RLAmt <<
- " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n");
+ LLVM_DEBUG(dbgs() << "\t32-bit replicated bit group for "
+ << BG.V.getNode() << " RLAmt = " << BG.RLAmt << " ["
+ << BG.StartIdx << ", " << BG.EndIdx << "]\n");
}
}
}
@@ -1361,11 +1489,11 @@ class BitPermutationSelector {
if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt &&
I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) {
- DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " <<
- I->V.getNode() << " RLAmt = " << I->RLAmt <<
- " [" << I->StartIdx << ", " << I->EndIdx <<
- "] with group with range [" <<
- IP->StartIdx << ", " << IP->EndIdx << "]\n");
+ LLVM_DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for "
+ << I->V.getNode() << " RLAmt = " << I->RLAmt << " ["
+ << I->StartIdx << ", " << I->EndIdx
+ << "] with group with range [" << IP->StartIdx << ", "
+ << IP->EndIdx << "]\n");
IP->EndIdx = I->EndIdx;
IP->Repl32CR = IP->Repl32CR || I->Repl32CR;
@@ -1389,12 +1517,12 @@ class BitPermutationSelector {
IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
IsAllLow32(*I)) {
- DEBUG(dbgs() << "\tcombining bit group for " <<
- I->V.getNode() << " RLAmt = " << I->RLAmt <<
- " [" << I->StartIdx << ", " << I->EndIdx <<
- "] with 32-bit replicated groups with ranges [" <<
- IP->StartIdx << ", " << IP->EndIdx << "] and [" <<
- IN->StartIdx << ", " << IN->EndIdx << "]\n");
+ LLVM_DEBUG(dbgs() << "\tcombining bit group for " << I->V.getNode()
+ << " RLAmt = " << I->RLAmt << " [" << I->StartIdx
+ << ", " << I->EndIdx
+ << "] with 32-bit replicated groups with ranges ["
+ << IP->StartIdx << ", " << IP->EndIdx << "] and ["
+ << IN->StartIdx << ", " << IN->EndIdx << "]\n");
if (IP == IN) {
// There is only one other group; change it to cover the whole
@@ -1503,15 +1631,15 @@ class BitPermutationSelector {
(unsigned) (ANDIMask != 0 && ANDISMask != 0) +
(unsigned) (bool) Res;
- DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
- " RL: " << VRI.RLAmt << ":" <<
- "\n\t\t\tisel using masking: " << NumAndInsts <<
- " using rotates: " << VRI.NumGroups << "\n");
+ LLVM_DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode()
+ << " RL: " << VRI.RLAmt << ":"
+ << "\n\t\t\tisel using masking: " << NumAndInsts
+ << " using rotates: " << VRI.NumGroups << "\n");
if (NumAndInsts >= VRI.NumGroups)
continue;
- DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tusing masking\n");
if (InstCnt) *InstCnt += NumAndInsts;
@@ -1859,10 +1987,10 @@ class BitPermutationSelector {
FirstBG = false;
}
- DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
- " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") <<
- "\n\t\t\tisel using masking: " << NumAndInsts <<
- " using rotates: " << NumRLInsts << "\n");
+ LLVM_DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode()
+ << " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":")
+ << "\n\t\t\tisel using masking: " << NumAndInsts
+ << " using rotates: " << NumRLInsts << "\n");
// When we'd use andi/andis, we bias toward using the rotates (andi only
// has a record form, and is cracked on POWER cores). However, when using
@@ -1876,7 +2004,7 @@ class BitPermutationSelector {
if ((Use32BitInsts || MoreBG) && NumAndInsts == NumRLInsts)
continue;
- DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+ LLVM_DEBUG(dbgs() << "\t\t\t\tusing masking\n");
if (InstCnt) *InstCnt += NumAndInsts;
@@ -2127,9 +2255,9 @@ public:
return nullptr;
Bits = std::move(*Result.second);
- DEBUG(dbgs() << "Considering bit-permutation-based instruction"
- " selection for: ");
- DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "Considering bit-permutation-based instruction"
+ " selection for: ");
+ LLVM_DEBUG(N->dump(CurDAG));
// Fill it RLAmt and set HasZeros.
computeRotationAmounts();
@@ -2145,22 +2273,22 @@ public:
// set of bit groups, and then mask in the zeros at the end. With early
// masking, we only insert the non-zero parts of the result at every step.
- unsigned InstCnt, InstCntLateMask;
- DEBUG(dbgs() << "\tEarly masking:\n");
+ unsigned InstCnt = 0, InstCntLateMask = 0;
+ LLVM_DEBUG(dbgs() << "\tEarly masking:\n");
SDNode *RN = Select(N, false, &InstCnt);
- DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
+ LLVM_DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
- DEBUG(dbgs() << "\tLate masking:\n");
+ LLVM_DEBUG(dbgs() << "\tLate masking:\n");
SDNode *RNLM = Select(N, true, &InstCntLateMask);
- DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask <<
- " instructions\n");
+ LLVM_DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask
+ << " instructions\n");
if (InstCnt <= InstCntLateMask) {
- DEBUG(dbgs() << "\tUsing early-masking for isel\n");
+ LLVM_DEBUG(dbgs() << "\tUsing early-masking for isel\n");
return RN;
}
- DEBUG(dbgs() << "\tUsing late-masking for isel\n");
+ LLVM_DEBUG(dbgs() << "\tUsing late-masking for isel\n");
return RNLM;
}
};
@@ -3288,7 +3416,7 @@ static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) {
}
/// Returns an equivalent of a SETCC node but with the result the same width as
-/// the inputs. This can nalso be used for SELECT_CC if either the true or false
+/// the inputs. This can also be used for SELECT_CC if either the true or false
/// values is a power of two while the other is zero.
SDValue IntegerCompareEliminator::getSETCCInGPR(SDValue Compare,
SetccInGPROpts ConvOpts) {
@@ -3488,10 +3616,63 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
Opc = PPC::CMPD;
}
} else if (LHS.getValueType() == MVT::f32) {
- Opc = PPC::FCMPUS;
+ if (PPCSubTarget->hasSPE()) {
+ switch (CC) {
+ default:
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ Opc = PPC::EFSCMPEQ;
+ break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ case ISD::SETOLT:
+ case ISD::SETOGE:
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ Opc = PPC::EFSCMPLT;
+ break;
+ case ISD::SETGT:
+ case ISD::SETLE:
+ case ISD::SETOGT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETULE:
+ Opc = PPC::EFSCMPGT;
+ break;
+ }
+ } else
+ Opc = PPC::FCMPUS;
+ } else if (LHS.getValueType() == MVT::f64) {
+ if (PPCSubTarget->hasSPE()) {
+ switch (CC) {
+ default:
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ Opc = PPC::EFDCMPEQ;
+ break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ case ISD::SETOLT:
+ case ISD::SETOGE:
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ Opc = PPC::EFDCMPLT;
+ break;
+ case ISD::SETGT:
+ case ISD::SETLE:
+ case ISD::SETOGT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETULE:
+ Opc = PPC::EFDCMPGT;
+ break;
+ }
+ } else
+ Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
} else {
- assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
- Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
+ assert(LHS.getValueType() == MVT::f128 && "Unknown vt!");
+ assert(PPCSubTarget->hasVSX() && "__float128 requires VSX");
+ Opc = PPC::XSCMPUQP;
}
return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
}
@@ -3765,7 +3946,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
// Altivec Vector compare instructions do not set any CR register by default and
// vector compare operations return the same type as the operands.
if (LHS.getValueType().isVector()) {
- if (PPCSubTarget->hasQPX())
+ if (PPCSubTarget->hasQPX() || PPCSubTarget->hasSPE())
return false;
EVT VecVT = LHS.getValueType();
@@ -3795,6 +3976,12 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
SDValue IntCR;
+ // SPE e*cmp* instructions only set the 'gt' bit, so hard-code that.
+ // The correct compare instruction is already set by SelectCC().
+ if (PPCSubTarget->hasSPE() && LHS.getValueType().isFloatingPoint()) {
+ Idx = 1;
+ }
+
// Force the ccreg into CR7.
SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
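A tiny sketch of why Idx is hard-coded to 1 in the hunk above, assuming the conventional PowerPC CR-field bit order (LT, GT, EQ, SO); the enum is illustrative only and not part of the patch:

#include <cassert>

// A CR field orders its four bits LT, GT, EQ, SO, so index 1 is the 'gt'
// position that the SPE efscmp*/efdcmp* instructions raise on success.
enum CRBit { CR_LT = 0, CR_GT = 1, CR_EQ = 2, CR_SO = 3 };

int main() {
  assert(CR_GT == 1); // matches the hard-coded Idx above
  return 0;
}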
@@ -3830,20 +4017,28 @@ bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
else if (STN)
AddrOp = STN->getOperand(2);
+ // If the address points to a frame object or a frame object with an offset,
+ // we need to check the object alignment.
short Imm = 0;
- if (AddrOp.getOpcode() == ISD::ADD) {
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(
+ AddrOp.getOpcode() == ISD::ADD ? AddrOp.getOperand(0) :
+ AddrOp)) {
// If op0 is a frame index that is under aligned, we can't do it either,
// because it is translated to r31 or r1 + slot + offset. We won't know the
// slot number until the stack frame is finalized.
- if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(AddrOp.getOperand(0))) {
- const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
- unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex());
- if ((SlotAlign % Val) != 0)
- return false;
- }
- return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val);
+ const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
+ unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex());
+ if ((SlotAlign % Val) != 0)
+ return false;
+
+ // If we have an offset, we need a further check on the offset.
+ if (AddrOp.getOpcode() != ISD::ADD)
+ return true;
}
+ if (AddrOp.getOpcode() == ISD::ADD)
+ return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val);
+
// If the address comes from the outside, the offset will be zero.
return AddrOp.getOpcode() == ISD::CopyFromReg;
}
@@ -3855,6 +4050,51 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
}
+/// This method returns a node after flipping the MSB of each element
+/// of vector integer type. Additionally, if SignBitVec is non-null,
+/// this method sets a node with one at MSB of all elements
+/// and zero at other bits in SignBitVec.
+MachineSDNode *
+PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
+ SDLoc dl(N);
+ EVT VecVT = N.getValueType();
+ if (VecVT == MVT::v4i32) {
+ if (SignBitVec) {
+ SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
+ *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
+ SDValue(ZV, 0));
+ }
+ return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
+ }
+ else if (VecVT == MVT::v8i16) {
+ SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
+ getI32Imm(0x8000, dl));
+ SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
+ SDValue(Hi, 0),
+ getI32Imm(0x8000, dl));
+ SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
+ SDValue(ScaImm, 0));
+ /*
+ Alternatively, we can do this as follows to use VRF instead of GPR.
+ vspltish 5, 1
+ vspltish 6, 15
+ vslh 5, 6, 5
+ */
+ if (SignBitVec) *SignBitVec = VecImm;
+ return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
+ SDValue(VecImm, 0));
+ }
+ else if (VecVT == MVT::v16i8) {
+ SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32,
+ getI32Imm(0x80, dl));
+ if (SignBitVec) *SignBitVec = VecImm;
+ return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
+ SDValue(VecImm, 0));
+ }
+ else
+ llvm_unreachable("Unsupported vector data type for flipSignBit");
+}
+
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -3894,6 +4134,27 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
break;
+ case PPCISD::CALL: {
+ const Module *M = MF->getFunction().getParent();
+
+ if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
+ !PPCSubTarget->isSecurePlt() || !PPCSubTarget->isTargetELF() ||
+ M->getPICLevel() == PICLevel::SmallPIC)
+ break;
+
+ SDValue Op = N->getOperand(1);
+
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ if (GA->getTargetFlags() == PPCII::MO_PLT)
+ getGlobalBaseReg();
+ }
+ else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
+ if (ES->getTargetFlags() == PPCII::MO_PLT)
+ getGlobalBaseReg();
+ }
+ }
+ break;
+
case PPCISD::GlobalBaseReg:
ReplaceNode(N, getGlobalBaseReg());
return;
@@ -3939,14 +4200,28 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
}
+ case ISD::STORE: {
+ // Change TLS initial-exec D-form stores to X-form stores.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (EnableTLSOpt && PPCSubTarget->isELFv2ABI() &&
+ ST->getAddressingMode() != ISD::PRE_INC)
+ if (tryTLSXFormStore(ST))
+ return;
+ break;
+ }
case ISD::LOAD: {
// Handle preincrement loads.
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT LoadedVT = LD->getMemoryVT();
// Normal loads are handled by code generated from the .td file.
- if (LD->getAddressingMode() != ISD::PRE_INC)
+ if (LD->getAddressingMode() != ISD::PRE_INC) {
+ // Change TLS initial-exec D-form loads to X-form loads.
+ if (EnableTLSOpt && PPCSubTarget->isELFv2ABI())
+ if (tryTLSXFormLoad(LD))
+ return;
break;
+ }
SDValue Offset = LD->getOffset();
if (Offset.getOpcode() == ISD::TargetConstant ||
@@ -4338,16 +4613,24 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SelectCCOp = PPC::SELECT_CC_I4;
else if (N->getValueType(0) == MVT::i64)
SelectCCOp = PPC::SELECT_CC_I8;
- else if (N->getValueType(0) == MVT::f32)
+ else if (N->getValueType(0) == MVT::f32) {
if (PPCSubTarget->hasP8Vector())
SelectCCOp = PPC::SELECT_CC_VSSRC;
+ else if (PPCSubTarget->hasSPE())
+ SelectCCOp = PPC::SELECT_CC_SPE4;
else
SelectCCOp = PPC::SELECT_CC_F4;
- else if (N->getValueType(0) == MVT::f64)
+ } else if (N->getValueType(0) == MVT::f64) {
if (PPCSubTarget->hasVSX())
SelectCCOp = PPC::SELECT_CC_VSFRC;
+ else if (PPCSubTarget->hasSPE())
+ SelectCCOp = PPC::SELECT_CC_SPE;
else
SelectCCOp = PPC::SELECT_CC_F8;
+ } else if (N->getValueType(0) == MVT::f128)
+ SelectCCOp = PPC::SELECT_CC_F16;
+ else if (PPCSubTarget->hasSPE())
+ SelectCCOp = PPC::SELECT_CC_SPE;
else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
SelectCCOp = PPC::SELECT_CC_QFRC;
else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
@@ -4633,6 +4916,55 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
}
+ case ISD::ABS: {
+ assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");
+
+ // For vector absolute difference, we use VABSDUW instruction of POWER9.
+ // Since VABSDU instructions are for unsigned integers, we need adjustment
+ // for signed integers.
+ // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000).
+ // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1.
+ // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
+ EVT VecVT = N->getOperand(0).getValueType();
+ SDNode *AbsOp = nullptr;
+ unsigned AbsOpcode;
+
+ if (VecVT == MVT::v4i32)
+ AbsOpcode = PPC::VABSDUW;
+ else if (VecVT == MVT::v8i16)
+ AbsOpcode = PPC::VABSDUH;
+ else if (VecVT == MVT::v16i8)
+ AbsOpcode = PPC::VABSDUB;
+ else
+ llvm_unreachable("Unsupported vector data type for ISD::ABS");
+
+ // Even for signed integers, we can skip adjustment if all values are
+ // known to be positive (as signed integer) due to zero-extended inputs.
+ if (N->getOperand(0).getOpcode() == ISD::SUB &&
+ N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+ N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
+ AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
+ SDValue(N->getOperand(0)->getOperand(0)),
+ SDValue(N->getOperand(0)->getOperand(1)));
+ ReplaceNode(N, AbsOp);
+ return;
+ }
+ if (N->getOperand(0).getOpcode() == ISD::SUB) {
+ SDValue SubVal = N->getOperand(0);
+ SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
+ SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
+ AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
+ SDValue(Op0, 0), SDValue(Op1, 0));
+ }
+ else {
+ SDNode *Op1 = nullptr;
+ SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
+ AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
+ SDValue(Op1, 0));
+ }
+ ReplaceNode(N, AbsOp);
+ return;
+ }
}
SelectCode(N);
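The ISD::ABS hunk above leans on a bias trick: adding 0x8000... (mod 2^n) to each element only flips its most significant bit, which maps the signed ordering onto the unsigned ordering, so the unsigned VABSDU* instructions then compute a signed absolute difference. A small standalone sketch of that reasoning, with hypothetical helper names that are not part of the patch:

#include <cassert>
#include <cstdint>

// Flipping the MSB of a 32-bit element (equivalently, adding 0x80000000
// mod 2^32) maps signed ordering onto unsigned ordering, so an unsigned
// absolute difference of the biased values equals the signed absolute
// difference of the originals.
static uint32_t flipSign(int32_t V) {
  return static_cast<uint32_t>(V) ^ 0x80000000u;
}

static uint32_t absDiffUnsigned(uint32_t A, uint32_t B) {
  return A > B ? A - B : B - A;
}

int main() {
  // Without the bias, abs(sub(-1, 0)) on the raw bit patterns would come out
  // as 0xFFFFFFFF; with the bias it is the expected 1.
  assert(absDiffUnsigned(flipSign(-1), flipSign(0)) == 1);
  assert(absDiffUnsigned(flipSign(100), flipSign(-25)) == 125);
  return 0;
}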
@@ -4924,8 +5256,7 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
}
void PPCDAGToDAGISel::PreprocessISelDAG() {
- SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
- ++Position;
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
@@ -4945,11 +5276,11 @@ void PPCDAGToDAGISel::PreprocessISelDAG() {
foldBoolExts(Res, N);
if (Res) {
- DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: ");
- DEBUG(N->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(Res.getNode()->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(Res.getNode()->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
MadeChange = true;
@@ -5026,13 +5357,13 @@ void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
User->getOperand(2),
User->getOperand(1));
- DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
- DEBUG(User->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+ LLVM_DEBUG(User->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(ResNode->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
- ReplaceUses(User, ResNode);
+ ReplaceUses(User, ResNode);
}
}
@@ -5083,6 +5414,8 @@ void PPCDAGToDAGISel::PeepholeCROps() {
case PPC::SELECT_QFRC:
case PPC::SELECT_QSRC:
case PPC::SELECT_QBRC:
+ case PPC::SELECT_SPE:
+ case PPC::SELECT_SPE4:
case PPC::SELECT_VRRC:
case PPC::SELECT_VSFRC:
case PPC::SELECT_VSSRC:
@@ -5402,6 +5735,8 @@ void PPCDAGToDAGISel::PeepholeCROps() {
case PPC::SELECT_QFRC:
case PPC::SELECT_QSRC:
case PPC::SELECT_QBRC:
+ case PPC::SELECT_SPE:
+ case PPC::SELECT_SPE4:
case PPC::SELECT_VRRC:
case PPC::SELECT_VSFRC:
case PPC::SELECT_VSSRC:
@@ -5440,11 +5775,11 @@ void PPCDAGToDAGISel::PeepholeCROps() {
SwapAllSelectUsers(MachineNode);
if (ResNode != MachineNode) {
- DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
- DEBUG(MachineNode->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+ LLVM_DEBUG(MachineNode->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(ResNode->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
ReplaceUses(MachineNode, ResNode);
IsModified = true;
@@ -5613,8 +5948,7 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
// unnecessary. When that happens, we remove it here, and redefine the
// relevant 32-bit operation to be a 64-bit operation.
- SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
- ++Position;
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
@@ -5739,25 +6073,25 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
else
NewVTs.push_back(VTs.VTs[i]);
- DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
- DEBUG(PN->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
+ LLVM_DEBUG(PN->dump(CurDAG));
CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(PN->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(PN->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
}
// Now we replace the original zero extend and its associated INSERT_SUBREG
// with the value feeding the INSERT_SUBREG (which has now been promoted to
// return an i64).
- DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
- DEBUG(N->dump(CurDAG));
- DEBUG(dbgs() << "\nNew: ");
- DEBUG(Op32.getNode()->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(Op32.getNode()->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
ReplaceUses(N, Op32.getNode());
}
@@ -5771,8 +6105,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
return;
- SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
- ++Position;
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
@@ -5782,28 +6115,37 @@ void PPCDAGToDAGISel::PeepholePPC64() {
unsigned FirstOp;
unsigned StorageOpcode = N->getMachineOpcode();
+ bool RequiresMod4Offset = false;
switch (StorageOpcode) {
default: continue;
+ case PPC::LWA:
+ case PPC::LD:
+ case PPC::DFLOADf64:
+ case PPC::DFLOADf32:
+ RequiresMod4Offset = true;
+ LLVM_FALLTHROUGH;
case PPC::LBZ:
case PPC::LBZ8:
- case PPC::LD:
case PPC::LFD:
case PPC::LFS:
case PPC::LHA:
case PPC::LHA8:
case PPC::LHZ:
case PPC::LHZ8:
- case PPC::LWA:
case PPC::LWZ:
case PPC::LWZ8:
FirstOp = 0;
break;
+ case PPC::STD:
+ case PPC::DFSTOREf64:
+ case PPC::DFSTOREf32:
+ RequiresMod4Offset = true;
+ LLVM_FALLTHROUGH;
case PPC::STB:
case PPC::STB8:
- case PPC::STD:
case PPC::STFD:
case PPC::STFS:
case PPC::STH:
@@ -5850,9 +6192,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// For these cases, the immediate may not be divisible by 4, in
// which case the fold is illegal for DS-form instructions. (The
// other cases provide aligned addresses and are always safe.)
- if ((StorageOpcode == PPC::LWA ||
- StorageOpcode == PPC::LD ||
- StorageOpcode == PPC::STD) &&
+ if (RequiresMod4Offset &&
(!isa<ConstantSDNode>(Base.getOperand(1)) ||
Base.getConstantOperandVal(1) % 4 != 0))
continue;
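A rough standalone check of the constraint this hunk generalizes via RequiresMod4Offset, assuming the standard DS-form encoding (a 14-bit signed displacement implicitly scaled by 4); the helper is illustrative and not part of the patch:

#include <cassert>
#include <cstdint>

// DS-form memory instructions can only encode displacements that are
// multiples of 4 within the scaled 16-bit range, so an add-immediate may be
// folded into them only when the combined offset satisfies both conditions.
static bool isLegalDSFormOffset(int64_t Offset) {
  return (Offset % 4) == 0 && Offset >= -32768 && Offset <= 32764;
}

int main() {
  assert(isLegalDSFormOffset(32764));
  assert(!isLegalDSFormOffset(6));     // not a multiple of 4 -> keep the add
  assert(!isLegalDSFormOffset(40000)); // out of range
  return 0;
}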
@@ -5914,8 +6254,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
if (auto *C = dyn_cast<ConstantSDNode>(ImmOpnd)) {
Offset += C->getSExtValue();
- if ((StorageOpcode == PPC::LWA || StorageOpcode == PPC::LD ||
- StorageOpcode == PPC::STD) && (Offset % 4) != 0)
+ if (RequiresMod4Offset && (Offset % 4) != 0)
continue;
if (!isInt<16>(Offset))
@@ -5932,11 +6271,11 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// immediate and substitute them into the load or store. If
// needed, update the target flags for the immediate operand to
// reflect the necessary relocation information.
- DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
- DEBUG(Base->dump(CurDAG));
- DEBUG(dbgs() << "\nN: ");
- DEBUG(N->dump(CurDAG));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+ LLVM_DEBUG(Base->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nN: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
// If the relocation information isn't already present on the
// immediate operand, add it now.
@@ -5947,9 +6286,8 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// We can't perform this optimization for data whose alignment
// is insufficient for the instruction encoding.
if (GV->getAlignment() < 4 &&
- (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
- StorageOpcode == PPC::LWA || (Offset % 4) != 0)) {
- DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+ (RequiresMod4Offset || (Offset % 4) != 0)) {
+ LLVM_DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
continue;
}
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags);
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 26e9f13f9ff4..1e3e14c71144 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -47,7 +47,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -83,6 +82,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -111,6 +111,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
+cl::desc("enable quad precision float support on ppc"), cl::Hidden);
+
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
@@ -134,8 +137,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
if (!useSoftFloat()) {
- addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
- addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+ if (hasSPE()) {
+ addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass);
+ addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
+ } else {
+ addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+ addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+ }
}
// Match BITREVERSE to customized fast code sequence in the td file.
@@ -159,15 +167,26 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
- setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
- setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
- setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
- setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
+ if (!Subtarget.hasSPE()) {
+ setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
+ }
+
+ // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ setOperationAction(ISD::ADDC, VT, Legal);
+ setOperationAction(ISD::ADDE, VT, Legal);
+ setOperationAction(ISD::SUBC, VT, Legal);
+ setOperationAction(ISD::SUBE, VT, Legal);
+ }
if (Subtarget.useCRBits()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -201,9 +220,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
}
- // This is used in the ppcf128->int sequence. Note it has different semantics
- // from FP_ROUND: that rounds to nearest, this rounds to zero.
- setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available).
+ setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
// We do not currently implement these libm ops for PowerPC.
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
@@ -253,13 +273,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
- setOperationAction(ISD::FMA , MVT::f64, Legal);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
- setOperationAction(ISD::FMA , MVT::f32, Legal);
+ if (Subtarget.hasSPE()) {
+ setOperationAction(ISD::FMA , MVT::f64, Expand);
+ setOperationAction(ISD::FMA , MVT::f32, Expand);
+ } else {
+ setOperationAction(ISD::FMA , MVT::f64, Legal);
+ setOperationAction(ISD::FMA , MVT::f32, Legal);
+ }
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
@@ -296,7 +321,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
// to speed up scalar BSWAP64.
- // CTPOP or CTTZ were introduced in P8/P9 respectivelly
+ // CTPOP or CTTZ were introduced in P8/P9 respectively
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
@@ -342,12 +367,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ if (Subtarget.hasSPE()) {
+ // SPE has built-in conversions
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+ } else {
+ // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
- // PowerPC does not have [U|S]INT_TO_FP
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ // PowerPC does not have [U|S]INT_TO_FP
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ }
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
@@ -445,6 +477,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
// Comparisons that require checking two conditions.
+ if (Subtarget.hasSPE()) {
+ setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETO, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
+ }
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
@@ -472,7 +510,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ if (Subtarget.hasSPE())
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+ else
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
}
// With the instructions enabled under FPCVT, we can do everything.
@@ -785,6 +826,46 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SHL, MVT::v1i128, Legal);
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
+
+ if (EnableQuadPrecision) {
+ addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
+ setOperationAction(ISD::FADD, MVT::f128, Legal);
+ setOperationAction(ISD::FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+ // No extending loads to f128 on PPC.
+ for (MVT FPT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+ setOperationAction(ISD::FMA, MVT::f128, Legal);
+ setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
+
+ setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
+ setOperationAction(ISD::FRINT, MVT::f128, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f128, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
+ setOperationAction(ISD::FROUND, MVT::f128, Legal);
+
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+ // No implementation for these ops for PowerPC.
+ setOperationAction(ISD::FSIN , MVT::f128, Expand);
+ setOperationAction(ISD::FCOS , MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+ }
+
}
if (Subtarget.hasP9Altivec()) {
@@ -1021,6 +1102,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
}
+ if (EnableQuadPrecision) {
+ setLibcallName(RTLIB::LOG_F128, "logf128");
+ setLibcallName(RTLIB::LOG2_F128, "log2f128");
+ setLibcallName(RTLIB::LOG10_F128, "log10f128");
+ setLibcallName(RTLIB::EXP_F128, "expf128");
+ setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+ setLibcallName(RTLIB::SIN_F128, "sinf128");
+ setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::POW_F128, "powf128");
+ setLibcallName(RTLIB::FMIN_F128, "fminf128");
+ setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+ setLibcallName(RTLIB::POWI_F128, "__powikf2");
+ setLibcallName(RTLIB::REM_F128, "fmodf128");
+ }
+
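// Illustrative sketch (not part of this patch): these RTLIB entries name the
// runtime symbols the backend calls when an f128 operation is not legal; for
// example, a quad-precision sine becomes a call to sinf128. The snippet below
// assumes a toolchain with __float128 support and a libm that actually
// exports the *f128 entry points.
extern "C" __float128 sinf128(__float128);
__float128 QuadSin(__float128 X) { return sinf128(X); }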
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
@@ -1036,6 +1132,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
default: break;
case PPC::DIR_970:
case PPC::DIR_A2:
+ case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
@@ -1126,10 +1223,28 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
return Align;
}
+unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (Subtarget.hasSPE() && VT == MVT::f64)
+ return 2;
+ return PPCTargetLowering::getNumRegisters(Context, VT);
+}
+
+MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (Subtarget.hasSPE() && VT == MVT::f64)
+ return MVT::i32;
+ return PPCTargetLowering::getRegisterType(Context, VT);
+}
+
bool PPCTargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
+bool PPCTargetLowering::hasSPE() const {
+ return Subtarget.hasSPE();
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -1142,6 +1257,10 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
+ case PPCISD::FP_TO_UINT_IN_VSR:
+ return "PPCISD::FP_TO_UINT_IN_VSR,";
+ case PPCISD::FP_TO_SINT_IN_VSR:
+ return "PPCISD::FP_TO_SINT_IN_VSR";
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
@@ -1195,6 +1314,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
+ case PPCISD::ST_VSR_SCAL_INT:
+ return "PPCISD::ST_VSR_SCAL_INT";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
case PPCISD::BDNZ: return "PPCISD::BDNZ";
case PPCISD::BDZ: return "PPCISD::BDZ";
@@ -1231,6 +1352,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
case PPCISD::QBFLT: return "PPCISD::QBFLT";
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
+ case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
}
return nullptr;
}
@@ -1461,7 +1583,7 @@ bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
}
/**
- * \brief Common function used to match vmrgew and vmrgow shuffles
+ * Common function used to match vmrgew and vmrgow shuffles
*
* The indexOffset determines whether to look for even or odd words in
* the shuffle mask. This is based on the endianness of the target
@@ -1518,7 +1640,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
}
/**
- * \brief Determine if the specified shuffle mask is suitable for the vmrgew or
+ * Determine if the specified shuffle mask is suitable for the vmrgew or
* vmrgow instructions.
*
* \param[in] N The shuffle vector SD Node to analyze
@@ -2550,10 +2672,11 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
// 64-bit SVR4 ABI code is always position-independent.
// The actual BlockAddress is stored in the TOC.
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
- setUsesTOCBasePtr(DAG);
+ if (Subtarget.isSVR4ABI() && isPositionIndependent()) {
+ if (Subtarget.isPPC64())
+ setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
- return getTOCEntry(DAG, SDLoc(BASDN), true, GA);
+ return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA);
}
unsigned MOHiFlag, MOLoFlag;
@@ -2571,7 +2694,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
// large models could be added if users need it, at the cost of
// additional complexity.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc dl(GA);
@@ -3116,7 +3239,7 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
- ArgVT == MVT::v1i128)
+ ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
Align = 16;
// QPX vector types stored in double-precision are padded to a 32 byte
// boundary.
@@ -3196,7 +3319,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
- ArgVT == MVT::v1i128)
+ ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
if (AvailableVRs > 0) {
--AvailableVRs;
return false;
@@ -3285,7 +3408,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrByteSize);
- if (useSoftFloat())
+ if (useSoftFloat() || hasSPE())
CCInfo.PreAnalyzeFormalArguments(Ins);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
@@ -3309,12 +3432,16 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
case MVT::f32:
if (Subtarget.hasP8Vector())
RC = &PPC::VSSRCRegClass;
+ else if (Subtarget.hasSPE())
+ RC = &PPC::SPE4RCRegClass;
else
RC = &PPC::F4RCRegClass;
break;
case MVT::f64:
if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
+ else if (Subtarget.hasSPE())
+ RC = &PPC::SPERCRegClass;
else
RC = &PPC::F8RCRegClass;
break;
@@ -3403,7 +3530,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
};
unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
- if (useSoftFloat())
+ if (useSoftFloat() || hasSPE())
NumFPArgRegs = 0;
FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
@@ -3785,23 +3912,23 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
+ case MVT::f128:
if (!Subtarget.hasQPX()) {
- // These can be scalar arguments or elements of a vector array type
- // passed directly. The latter are used to implement ELFv2 homogenous
- // vector aggregates.
- if (VR_idx != Num_VR_Regs) {
- unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
- ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
- ++VR_idx;
- } else {
- if (CallConv == CallingConv::Fast)
- ComputeArgOffset();
-
- needsLoad = true;
- }
- if (CallConv != CallingConv::Fast || needsLoad)
- ArgOffset += 16;
- break;
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // vector aggregates.
+ if (VR_idx != Num_VR_Regs) {
+ unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ ++VR_idx;
+ } else {
+ if (CallConv == CallingConv::Fast)
+ ComputeArgOffset();
+ needsLoad = true;
+ }
+ if (CallConv != CallingConv::Fast || needsLoad)
+ ArgOffset += 16;
+ break;
} // not QPX
assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
@@ -4263,7 +4390,7 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
unsigned CallerMinReservedArea = FI->getMinReservedArea();
int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
- // Remember only if the new adjustement is bigger.
+ // Remember only if the new adjustment is bigger.
if (SPDiff < FI->getTailCallSPDelta())
FI->setTailCallSPDelta(SPDiff);
@@ -4939,7 +5066,11 @@ SDValue PPCTargetLowering::LowerCallResult(
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+
+ CCRetInfo.AnalyzeCallResult(
+ Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+ ? RetCC_PPC_Cold
+ : RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
@@ -5108,15 +5239,15 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(isa<GlobalAddressSDNode>(Callee) &&
"Callee should be an llvm::Function object.");
- DEBUG(
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- const unsigned Width = 80 - strlen("TCO caller: ")
- - strlen(", callee linkage: 0, 0");
- dbgs() << "TCO caller: "
- << left_justify(DAG.getMachineFunction().getName(), Width)
- << ", callee linkage: "
- << GV->getVisibility() << ", " << GV->getLinkage() << "\n"
- );
+ LLVM_DEBUG(
+ const GlobalValue *GV =
+ cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ const unsigned Width =
+ 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0");
+ dbgs() << "TCO caller: "
+ << left_justify(DAG.getMachineFunction().getName(), Width)
+ << ", callee linkage: " << GV->getVisibility() << ", "
+ << GV->getLinkage() << "\n");
}
}
@@ -5159,6 +5290,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// of the 32-bit SVR4 ABI stack frame layout.
assert((CallConv == CallingConv::C ||
+ CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
unsigned PtrByteSize = 4;
@@ -5462,6 +5594,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// arguments that will be in registers.
unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
+ // Avoid allocating parameter area for fastcc functions if all the arguments
+ // can be passed in the registers.
+ if (CallConv == CallingConv::Fast)
+ HasParameterArea = false;
+
// Add up all the space actually used.
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -5472,9 +5609,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
continue;
if (CallConv == CallingConv::Fast) {
- if (Flags.isByVal())
+ if (Flags.isByVal()) {
NumGPRsUsed += (Flags.getByValSize()+7)/8;
- else
+ if (NumGPRsUsed > NumGPRs)
+ HasParameterArea = true;
+ } else {
switch (ArgVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
@@ -5489,6 +5628,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
+ case MVT::f128:
if (++NumVRsUsed <= NumVRs)
continue;
break;
@@ -5511,6 +5651,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
continue;
break;
}
+ HasParameterArea = true;
+ }
}
/* Respect alignment of argument on the stack. */
@@ -5867,6 +6009,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
+ case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogenous
@@ -6420,7 +6563,10 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
- return CCInfo.CheckReturn(Outs, RetCC_PPC);
+ return CCInfo.CheckReturn(
+ Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+ ? RetCC_PPC_Cold
+ : RetCC_PPC);
}
SDValue
@@ -6432,7 +6578,10 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+ CCInfo.AnalyzeReturn(Outs,
+ (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+ ? RetCC_PPC_Cold
+ : RetCC_PPC);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -6852,7 +7001,7 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
RLI.MPI = MPI;
}
-/// \brief Custom lowers floating point to integer conversions to use
+/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
@@ -6889,6 +7038,51 @@ SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
+
+ // FP to INT conversions are legal for f128.
+ if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128))
+ return Op;
+
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available).
+ if (Op.getOperand(0).getValueType() == MVT::ppcf128) {
+ if (Op.getValueType() == MVT::i32) {
+ if (Op.getOpcode() == ISD::FP_TO_SINT) {
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, Op.getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, Op.getOperand(0),
+ DAG.getIntPtrConstant(1, dl));
+
+ // Add the two halves of the long double in round-to-zero mode.
+ SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
+
+ // Now use a smaller FP_TO_SINT.
+ return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
+ }
+ if (Op.getOpcode() == ISD::FP_TO_UINT) {
+ const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
+ APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
+ SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
+ // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
+ // FIXME: generated code sucks.
+ // TODO: Are there fast-math-flags to propagate to this FSUB?
+ SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128,
+ Op.getOperand(0), Tmp);
+ True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
+ True = DAG.getNode(ISD::ADD, dl, MVT::i32, True,
+ DAG.getConstant(0x80000000, dl, MVT::i32));
+ SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
+ Op.getOperand(0));
+ return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False,
+ ISD::SETGE);
+ }
+ }
+
+ return SDValue();
+ }
+
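// Illustrative sketch (not part of this patch): the select-based expansion
// above, restated as self-contained C++ with a plain double standing in for
// the ppc_fp128 value. The constant 0x41e0000000000000 used above is the
// IEEE-754 double encoding of 2^31.
#include <cstdint> // uint32_t/int32_t for this standalone sketch
static uint32_t FPToUint32ViaSigned(double X) {
  const double Two31 = 2147483648.0; // 2^31
  // X >= 2^31 ? (int)(X - 2^31) + 0x80000000 : (int)X
  if (X >= Two31)
    return (uint32_t)(int32_t)(X - Two31) + 0x80000000u;
  return (uint32_t)(int32_t)X;
}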
if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
return LowerFP_TO_INTDirectMove(Op, DAG, dl);
@@ -6970,7 +7164,7 @@ void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}
-/// \brief Analyze profitability of direct move
+/// Analyze profitability of direct move
/// prefer float load to int load plus direct move
/// when there is no integer use of int load
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
@@ -7000,7 +7194,7 @@ bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
return false;
}
-/// \brief Custom lowers integer to floating point conversions to use
+/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
@@ -7036,6 +7230,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
+ // Conversions to f128 are legal.
+ if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
+ return Op;
+
if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
return SDValue();
@@ -7552,6 +7750,23 @@ static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
return !(IsSplat && IsLoad);
}
+// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
+SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+
+ SDLoc dl(Op);
+ SDValue Op0 = Op->getOperand(0);
+
+ if (!EnableQuadPrecision ||
+ (Op.getValueType() != MVT::f128) ||
+ (Op0.getOpcode() != ISD::BUILD_PAIR) ||
+ (Op0.getOperand(0).getValueType() != MVT::i64) ||
+ (Op0.getOperand(1).getValueType() != MVT::i64))
+ return SDValue();
+
+ return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
+ Op0.getOperand(1));
+}
+
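// Illustrative sketch (not part of this patch): one source-level pattern that
// can produce BITCAST(f128, BUILD_PAIR(i64, i64)) and thus reach this
// lowering, assuming quad precision is enabled and the compiler supports
// __float128 and unsigned __int128.
__float128 BuildQuadFromHalves(unsigned long long Lo, unsigned long long Hi) {
  unsigned __int128 Bits = ((unsigned __int128)Hi << 64) | Lo;
  __float128 F;
  __builtin_memcpy(&F, &Bits, sizeof(F));
  return F;
}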
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -9302,27 +9517,19 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
- case ISD::VASTART:
- return LowerVASTART(Op, DAG);
-
- case ISD::VAARG:
- return LowerVAARG(Op, DAG);
-
- case ISD::VACOPY:
- return LowerVACOPY(Op, DAG);
- case ISD::STACKRESTORE:
- return LowerSTACKRESTORE(Op, DAG);
-
- case ISD::DYNAMIC_STACKALLOC:
- return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ // Variable argument lowering.
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::GET_DYNAMIC_AREA_OFFSET:
return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
- case ISD::EH_DWARF_CFA:
- return LowerEH_DWARF_CFA(Op, DAG);
-
+ // Exception handling lowering.
+ case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
@@ -9331,8 +9538,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_UINT:
- case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
- SDLoc(Op));
+ case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
@@ -9355,6 +9561,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
+ case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+
// Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
@@ -9400,7 +9608,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
N->getOperand(1));
- Results.push_back(NewInt);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
Results.push_back(NewInt.getValue(1));
break;
}
@@ -9418,25 +9626,6 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
- case ISD::FP_ROUND_INREG: {
- assert(N->getValueType(0) == MVT::ppcf128);
- assert(N->getOperand(0).getValueType() == MVT::ppcf128);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
- MVT::f64, N->getOperand(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
- MVT::f64, N->getOperand(0),
- DAG.getIntPtrConstant(1, dl));
-
- // Add the two halves of the long double in round-to-zero mode.
- SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
-
- // We know the low half is about to be thrown away, so just use something
- // convenient.
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
- FPreg, FPreg));
- return;
- }
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
// LowerFP_TO_INT() can only handle f32 and f64.
@@ -10083,6 +10272,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == PPC::SELECT_CC_I8 ||
MI.getOpcode() == PPC::SELECT_CC_F4 ||
MI.getOpcode() == PPC::SELECT_CC_F8 ||
+ MI.getOpcode() == PPC::SELECT_CC_F16 ||
MI.getOpcode() == PPC::SELECT_CC_QFRC ||
MI.getOpcode() == PPC::SELECT_CC_QSRC ||
MI.getOpcode() == PPC::SELECT_CC_QBRC ||
@@ -10090,13 +10280,18 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
MI.getOpcode() == PPC::SELECT_CC_VSRC ||
+ MI.getOpcode() == PPC::SELECT_CC_SPE4 ||
+ MI.getOpcode() == PPC::SELECT_CC_SPE ||
MI.getOpcode() == PPC::SELECT_I4 ||
MI.getOpcode() == PPC::SELECT_I8 ||
MI.getOpcode() == PPC::SELECT_F4 ||
MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
+ MI.getOpcode() == PPC::SELECT_SPE ||
+ MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
@@ -10129,6 +10324,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_F16 ||
+ MI.getOpcode() == PPC::SELECT_SPE4 ||
+ MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
@@ -10681,6 +10879,7 @@ unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
return 3;
case PPC::DIR_440:
case PPC::DIR_A2:
+ case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
return 2;
@@ -10962,7 +11161,7 @@ SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
// Size of integers being compared has a critical role in the following
// analysis, so we prefer to do this when all types are legal.
- if (!DCI.isAfterLegalizeVectorOps())
+ if (!DCI.isAfterLegalizeDAG())
return SDValue();
// If all users of SETCC extend its value to a legal integer type
@@ -11560,7 +11759,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
ShiftCst);
}
-/// \brief Reduces the number of fp-to-int conversion when building a vector.
+/// Reduces the number of fp-to-int conversion when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
@@ -11640,7 +11839,7 @@ combineElementTruncationToVectorTruncation(SDNode *N,
return SDValue();
}
-/// \brief Reduce the number of loads when building a vector.
+/// Reduce the number of loads when building a vector.
///
/// Building a vector out of multiple loads can be converted to a load
/// of the vector type if the loads are consecutive. If the loads are
@@ -11948,10 +12147,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
SDLoc dl(N);
SDValue Op(N, 0);
- // Don't handle ppc_fp128 here or i1 conversions.
+ // Don't handle ppc_fp128 here, or integer source types outside the range
+ // the hardware conversions can handle.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
- if (Op.getOperand(0).getValueType() == MVT::i1)
+ if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
+ Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))
return SDValue();
SDValue FirstOperand(Op.getOperand(0));
@@ -12171,6 +12372,64 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
return Store;
}
+// Handle DAG combine for STORE (FP_TO_INT F).
+SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ unsigned Opcode = N->getOperand(1).getOpcode();
+
+ assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT)
+ && "Not a FP_TO_INT Instruction!");
+
+ SDValue Val = N->getOperand(1).getOperand(0);
+ EVT Op1VT = N->getOperand(1).getValueType();
+ EVT ResVT = Val.getValueType();
+
+ // Floating point types smaller than 32 bits are not legal on Power.
+ if (ResVT.getScalarSizeInBits() < 32)
+ return SDValue();
+
+ // Only perform combine for conversion to i64/i32 or power9 i16/i8.
+ bool ValidTypeForStoreFltAsInt =
+ (Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
+ (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
+
+ if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() ||
+ cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
+ return SDValue();
+
+ // Extend f32 values to f64
+ if (ResVT.getScalarSizeInBits() == 32) {
+ Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+ }
+
+ // Set signed or unsigned conversion opcode.
+ unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ?
+ PPCISD::FP_TO_SINT_IN_VSR :
+ PPCISD::FP_TO_UINT_IN_VSR;
+
+ Val = DAG.getNode(ConvOpcode,
+ dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+
+ // Set number of bytes being converted.
+ unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
+ SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2),
+ DAG.getIntPtrConstant(ByteSize, dl, false),
+ DAG.getValueType(Op1VT) };
+
+ Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ cast<StoreSDNode>(N)->getMemoryVT(),
+ cast<StoreSDNode>(N)->getMemOperand());
+
+ DCI.AddToWorklist(Val.getNode());
+ return Val;
+}
+
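// Illustrative sketch (not part of this patch): a source-level pattern that
// this combine targets. Storing a float-to-int conversion directly lets the
// converted value stay in a VSR and be stored via ST_VSR_SCAL_INT (assumes a
// subtarget with P8 Altivec; i16/i8 widths additionally require P9 vector).
void StoreConverted(double D, int *P) { *P = (int)D; }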
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12210,60 +12469,22 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UINT_TO_FP:
return combineFPToIntToFP(N, DCI);
case ISD::STORE: {
+
EVT Op1VT = N->getOperand(1).getValueType();
- bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) ||
- (Subtarget.hasP9Vector() && (Op1VT == MVT::i8 || Op1VT == MVT::i16));
-
- // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
- if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
- N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
- ValidTypeForStoreFltAsInt &&
- N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
- SDValue Val = N->getOperand(1).getOperand(0);
- if (Val.getValueType() == MVT::f32) {
- Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
- DCI.AddToWorklist(Val.getNode());
- }
- Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
- DCI.AddToWorklist(Val.getNode());
-
- if (Op1VT == MVT::i32) {
- SDValue Ops[] = {
- N->getOperand(0), Val, N->getOperand(2),
- DAG.getValueType(N->getOperand(1).getValueType())
- };
-
- Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
- DAG.getVTList(MVT::Other), Ops,
- cast<StoreSDNode>(N)->getMemoryVT(),
- cast<StoreSDNode>(N)->getMemOperand());
- } else {
- unsigned WidthInBytes =
- N->getOperand(1).getValueType() == MVT::i8 ? 1 : 2;
- SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false);
-
- SDValue Ops[] = {
- N->getOperand(0), Val, N->getOperand(2), WidthConst,
- DAG.getValueType(N->getOperand(1).getValueType())
- };
- Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl,
- DAG.getVTList(MVT::Other), Ops,
- cast<StoreSDNode>(N)->getMemoryVT(),
- cast<StoreSDNode>(N)->getMemOperand());
- }
+ unsigned Opcode = N->getOperand(1).getOpcode();
- DCI.AddToWorklist(Val.getNode());
- return Val;
+ if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) {
+ SDValue Val = combineStoreFPToInt(N, DCI);
+ if (Val)
+ return Val;
}
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
- if (cast<StoreSDNode>(N)->isUnindexed() &&
- N->getOperand(1).getOpcode() == ISD::BSWAP &&
+ if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
N->getOperand(1).getNode()->hasOneUse() &&
- (N->getOperand(1).getValueType() == MVT::i32 ||
- N->getOperand(1).getValueType() == MVT::i16 ||
- (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
- N->getOperand(1).getValueType() == MVT::i64))) {
+ (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
+ (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
+
// STBRX can only handle simple types.
EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
if (mVT.isExtended())
@@ -12296,9 +12517,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
// So it can increase the chance of CSE constant construction.
- EVT VT = N->getOperand(1).getValueType();
if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
- isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) {
+ isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
// Need to sign-extended to 64-bits to handle negative values.
EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
@@ -12316,8 +12536,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// For little endian, VSX stores require generating xxswapd/lxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
- if (VT.isSimple()) {
- MVT StoreVT = VT.getSimpleVT();
+ if (Op1VT.isSimple()) {
+ MVT StoreVT = Op1VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
@@ -13100,14 +13320,21 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// really care overly much here so just give them all the same reg classes.
case 'd':
case 'f':
- if (VT == MVT::f32 || VT == MVT::i32)
- return std::make_pair(0U, &PPC::F4RCRegClass);
- if (VT == MVT::f64 || VT == MVT::i64)
- return std::make_pair(0U, &PPC::F8RCRegClass);
- if (VT == MVT::v4f64 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QFRCRegClass);
- if (VT == MVT::v4f32 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QSRCRegClass);
+ if (Subtarget.hasSPE()) {
+ if (VT == MVT::f32 || VT == MVT::i32)
+ return std::make_pair(0U, &PPC::SPE4RCRegClass);
+ if (VT == MVT::f64 || VT == MVT::i64)
+ return std::make_pair(0U, &PPC::SPERCRegClass);
+ } else {
+ if (VT == MVT::f32 || VT == MVT::i32)
+ return std::make_pair(0U, &PPC::F4RCRegClass);
+ if (VT == MVT::f64 || VT == MVT::i64)
+ return std::make_pair(0U, &PPC::F8RCRegClass);
+ if (VT == MVT::v4f64 && Subtarget.hasQPX())
+ return std::make_pair(0U, &PPC::QFRCRegClass);
+ if (VT == MVT::v4f32 && Subtarget.hasQPX())
+ return std::make_pair(0U, &PPC::QSRCRegClass);
+ }
break;
case 'v':
if (VT == MVT::v4f64 && Subtarget.hasQPX())
@@ -13590,7 +13817,7 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
return MVT::i32;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -13639,6 +13866,9 @@ bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"invalid fpext types");
+ // Extending to float128 is not free.
+ if (DestVT == MVT::f128)
+ return false;
return true;
}
@@ -13695,6 +13925,8 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
case MVT::f32:
case MVT::f64:
return true;
+ case MVT::f128:
+ return (EnableQuadPrecision && Subtarget.hasP9Vector());
default:
break;
}
@@ -13923,3 +14155,20 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// If the function is local then we have a good chance at tail-calling it
return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
}
+
+bool PPCTargetLowering::
+isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
+ const Value *Mask = AndI.getOperand(1);
+ // If the mask is suitable for andi. or andis. we should sink the and.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
+ // Can't handle constants wider than 64-bits.
+ if (CI->getBitWidth() > 64)
+ return false;
+ int64_t ConstVal = CI->getZExtValue();
+ return isUInt<16>(ConstVal) ||
+ (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
+ }
+
+ // For non-constant masks, we can always use the record-form and.
+ return true;
+}
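// Illustrative sketch (not part of this patch): the same immediate test in
// isolation. A mask is worth sinking when it fits the 16-bit unsigned field
// of andi., or when only its next 16 bits are set so andis. can be used.
#include <cstdint> // uint64_t for this standalone sketch
static bool FitsRecordFormAndImmediate(uint64_t Mask) {
  return Mask <= 0xFFFFu ||
         ((Mask & 0xFFFFu) == 0 && (Mask >> 16) <= 0xFFFFu);
}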
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
index b3215a84829e..9b8d6435515b 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -31,6 +30,7 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/MachineValueType.h"
#include <utility>
namespace llvm {
@@ -71,6 +71,9 @@ namespace llvm {
/// unsigned integers with round toward zero.
FCTIDUZ, FCTIWUZ,
+ /// Floating-point-to-integer conversion instructions
+ FP_TO_UINT_IN_VSR, FP_TO_SINT_IN_VSR,
+
/// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
/// VSFRC that is sign-extended from ByteWidth to a 64-byte integer.
VEXTS,
@@ -186,6 +189,9 @@ namespace llvm {
/// Direct move from a GPR to a VSX register (zero)
MTVSRZ,
+ /// Direct move of 2 consecutive GPRs to a VSX register.
+ BUILD_FP128,
+
/// Extract a subvector from signed integer vector and convert to FP.
/// It is primarily used to convert a (widened) illegal integer vector
/// type to a legal floating point vector type.
@@ -426,6 +432,9 @@ namespace llvm {
/// an xxswapd.
STXVD2X,
+ /// Store scalar integers from VSR.
+ ST_VSR_SCAL_INT,
+
/// QBRC, CHAIN = QVLFSb CHAIN, Ptr
/// The 4xf32 load used for v4i1 constants.
QVLFSb,
@@ -565,6 +574,8 @@ namespace llvm {
bool useSoftFloat() const override;
+ bool hasSPE() const;
+
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i32;
}
@@ -765,7 +776,7 @@ namespace llvm {
bool isFPExtFree(EVT DestVT, EVT SrcVT) const override;
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
@@ -822,7 +833,7 @@ namespace llvm {
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const override;
- /// \brief Returns true if an argument of type Ty needs to be passed in a
+ /// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
@@ -860,6 +871,12 @@ namespace llvm {
unsigned JTI,
MCContext &Ctx) const override;
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
private:
struct ReuseLoadInfo {
SDValue Ptr;
@@ -884,6 +901,11 @@ namespace llvm {
}
};
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
@@ -1054,10 +1076,12 @@ namespace llvm {
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -1096,6 +1120,7 @@ namespace llvm {
// tail call. This will cause the optimizers to attempt to move, or
// duplicate return instructions to help enable tail call optimizations.
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
}; // end class PPCTargetLowering
namespace PPC {
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index fdd28c2ff03f..cdd57c6a1118 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -244,8 +244,8 @@ let usesCustomInserter = 1 in {
// Instructions to support atomic operations
let mayLoad = 1, hasSideEffects = 0 in {
-def LDARX : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
- "ldarx $rD, $ptr", IIC_LdStLDARX, []>;
+def LDARX : XForm_1_memOp<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
+ "ldarx $rD, $ptr", IIC_LdStLDARX, []>;
// Instruction to support lock versions of atomics
// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
@@ -259,8 +259,8 @@ def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
}
let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
-def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
+def STDCX : XForm_1_memOp<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
@@ -499,7 +499,49 @@ defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc_nox0:$rA, tlsreg:$rB),
"add $rT, $rA, $rB", IIC_IntSimple,
[(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
-
+let mayLoad = 1 in {
+def LBZXTLS : XForm_1<31, 87, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lbzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LHZXTLS : XForm_1<31, 279, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lhzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LWZXTLS : XForm_1<31, 23, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lwzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LDXTLS : XForm_1<31, 21, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "ldx $rD, $rA, $rB", IIC_LdStLD, []>, isPPC64;
+def LBZXTLS_32 : XForm_1<31, 87, (outs gprc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lbzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LHZXTLS_32 : XForm_1<31, 279, (outs gprc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lhzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LWZXTLS_32 : XForm_1<31, 23, (outs gprc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lwzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+
+}
+
+let mayStore = 1 in {
+def STBXTLS : XForm_8<31, 215, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stbx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STHXTLS : XForm_8<31, 407, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "sthx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STWXTLS : XForm_8<31, 151, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stwx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STDXTLS : XForm_8<31, 149, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stdx $rS, $rA, $rB", IIC_LdStSTD, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+def STBXTLS_32 : XForm_8<31, 215, (outs), (ins gprc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stbx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STHXTLS_32 : XForm_8<31, 407, (outs), (ins gprc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "sthx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STWXTLS_32 : XForm_8<31, 151, (outs), (ins gprc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stwx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+
+}
+
let isCommutable = 1 in
defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"addc", "$rT, $rA, $rB", IIC_IntGeneral,
@@ -558,10 +600,37 @@ defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA),
// FIXME: Duplicating this for the asm parser should be unnecessary, but the
// previous definition must be marked as CodeGen only to prevent decoding
// conflicts.
-let isAsmParserOnly = 1 in
+let isAsmParserOnly = 1 in {
def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
"add $rT, $rA, $rB", IIC_IntSimple, []>;
+let mayLoad = 1 in {
+def LBZXTLS_ : XForm_1<31, 87, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lbzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LHZXTLS_ : XForm_1<31, 279, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lhzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LWZXTLS_ : XForm_1<31, 23, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "lwzx $rD, $rA, $rB", IIC_LdStLoad, []>;
+def LDXTLS_ : XForm_1<31, 21, (outs g8rc:$rD), (ins ptr_rc_nor0:$rA, tlsreg:$rB),
+ "ldx $rD, $rA, $rB", IIC_LdStLD, []>, isPPC64;
+}
+
+let mayStore = 1 in {
+def STBXTLS_ : XForm_8<31, 215, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stbx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STHXTLS_ : XForm_8<31, 407, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "sthx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STWXTLS_ : XForm_8<31, 151, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stwx $rS, $rA, $rB", IIC_LdStStore, []>,
+ PPC970_DGroup_Cracked;
+def STDXTLS_ : XForm_8<31, 149, (outs), (ins g8rc:$rS, ptr_rc_nor0:$rA, tlsreg:$rB),
+ "stdx $rS, $rA, $rB", IIC_LdStSTD, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+}
+}
+
let isCommutable = 1 in {
defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"mulhd", "$rT, $rA, $rB", IIC_IntMulHW,
@@ -837,22 +906,22 @@ def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
(aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
-def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src),
- "lhax $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
- PPC970_DGroup_Cracked;
-def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
- "lwax $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
- PPC970_DGroup_Cracked;
+def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src),
+ "lhax $rD, $src", IIC_LdStLHA,
+ [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LWAX : XForm_1_memOp<31, 341, (outs g8rc:$rD), (ins memrr:$src),
+ "lwax $rD, $src", IIC_LdStLHA,
+ [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+ PPC970_DGroup_Cracked;
// For fast-isel:
let isCodeGenOnly = 1, mayLoad = 1 in {
def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
"lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
PPC970_DGroup_Cracked;
-def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
- "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
- PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1_memOp<31, 341, (outs gprc:$rD), (ins memrr:$src),
+ "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
+ PPC970_DGroup_Cracked;
} // end fast-isel isCodeGenOnly
// Update forms.
@@ -866,16 +935,16 @@ def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
// NO LWAU!
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
-def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lhaux $rD, $addr", IIC_LdStLHAUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
-def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lwaux $rD, $addr", IIC_LdStLHAUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">, isPPC64;
+def LHAUX8 : XForm_1_memOp<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhaux $rD, $addr", IIC_LdStLHAUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWAUX : XForm_1_memOp<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lwaux $rD, $addr", IIC_LdStLHAUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
}
}
@@ -892,47 +961,50 @@ def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
"lwz $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
-def LBZX8 : XForm_1<31, 87, (outs g8rc:$rD), (ins memrr:$src),
- "lbzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
-def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src),
- "lhzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
-def LWZX8 : XForm_1<31, 23, (outs g8rc:$rD), (ins memrr:$src),
- "lwzx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
+def LBZX8 : XForm_1_memOp<31, 87, (outs g8rc:$rD), (ins memrr:$src),
+ "lbzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1_memOp<31, 279, (outs g8rc:$rD), (ins memrr:$src),
+ "lhzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1_memOp<31, 23, (outs g8rc:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
// Update forms.
let mayLoad = 1, hasSideEffects = 0 in {
-def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memri:$addr),
"lbzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
-def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memri:$addr),
"lhzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
-def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memri:$addr),
"lwzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
-def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lbzux $rD, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
-def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lhzux $rD, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
-def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "lwzux $rD, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+def LBZUX8 : XForm_1_memOp<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LHZUX8 : XForm_1_memOp<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">;
}
}
} // Interpretation64Bit
@@ -963,35 +1035,36 @@ def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
[(set i64:$rD,
(PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
-def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src),
- "ldx $rD, $src", IIC_LdStLD,
- [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
-def LDBRX : XForm_1<31, 532, (outs g8rc:$rD), (ins memrr:$src),
- "ldbrx $rD, $src", IIC_LdStLoad,
- [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+def LDX : XForm_1_memOp<31, 21, (outs g8rc:$rD), (ins memrr:$src),
+ "ldx $rD, $src", IIC_LdStLD,
+ [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+def LDBRX : XForm_1_memOp<31, 532, (outs g8rc:$rD), (ins memrr:$src),
+ "ldbrx $rD, $src", IIC_LdStLoad,
+ [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
-def LHBRX8 : XForm_1<31, 790, (outs g8rc:$rD), (ins memrr:$src),
- "lhbrx $rD, $src", IIC_LdStLoad, []>;
-def LWBRX8 : XForm_1<31, 534, (outs g8rc:$rD), (ins memrr:$src),
- "lwbrx $rD, $src", IIC_LdStLoad, []>;
+def LHBRX8 : XForm_1_memOp<31, 790, (outs g8rc:$rD), (ins memrr:$src),
+ "lhbrx $rD, $src", IIC_LdStLoad, []>;
+def LWBRX8 : XForm_1_memOp<31, 534, (outs g8rc:$rD), (ins memrr:$src),
+ "lwbrx $rD, $src", IIC_LdStLoad, []>;
}
let mayLoad = 1, hasSideEffects = 0 in {
-def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
+def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrix:$addr),
"ldu $rD, $addr", IIC_LdStLDU,
[]>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
NoEncode<"$ea_result">;
-def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
- (ins memrr:$addr),
- "ldux $rD, $addr", IIC_LdStLDUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">, isPPC64;
+def LDUX : XForm_1_memOp<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+ (ins memrr:$addr),
+ "ldux $rD, $addr", IIC_LdStLDUX,
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
"ldmx $rD, $src", IIC_LdStLD, []>, isPPC64,
- Requires<[IsISA3_0]>;
+ Requires<[IsISA3_0]>;
}
}
@@ -1116,32 +1189,32 @@ def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
"stw $rS, $src", IIC_LdStStore,
[(truncstorei32 i64:$rS, iaddr:$src)]>;
-def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
- "stbx $rS, $dst", IIC_LdStStore,
- [(truncstorei8 i64:$rS, xaddr:$dst)]>,
- PPC970_DGroup_Cracked;
-def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
- "sthx $rS, $dst", IIC_LdStStore,
- [(truncstorei16 i64:$rS, xaddr:$dst)]>,
- PPC970_DGroup_Cracked;
-def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
- "stwx $rS, $dst", IIC_LdStStore,
- [(truncstorei32 i64:$rS, xaddr:$dst)]>,
- PPC970_DGroup_Cracked;
+def STBX8 : XForm_8_memOp<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stbx $rS, $dst", IIC_LdStStore,
+ [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STHX8 : XForm_8_memOp<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
+ "sthx $rS, $dst", IIC_LdStStore,
+ [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STWX8 : XForm_8_memOp<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stwx $rS, $dst", IIC_LdStStore,
+ [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
} // Interpretation64Bit
// Normal 8-byte stores.
def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
"std $rS, $dst", IIC_LdStSTD,
[(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
-def STDX : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdx $rS, $dst", IIC_LdStSTD,
- [(store i64:$rS, xaddr:$dst)]>, isPPC64,
- PPC970_DGroup_Cracked;
-def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdbrx $rS, $dst", IIC_LdStStore,
- [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
- PPC970_DGroup_Cracked;
+def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdx $rS, $dst", IIC_LdStSTD,
+ [(store i64:$rS, xaddr:$dst)]>, isPPC64,
+ PPC970_DGroup_Cracked;
+def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
+ "stdbrx $rS, $dst", IIC_LdStStore,
+ [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
+ PPC970_DGroup_Cracked;
}
// Stores with Update (pre-inc).
@@ -1157,29 +1230,38 @@ def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
"stwu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
-def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
+def STBUX8: XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STHUX8: XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
} // Interpretation64Bit
-def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
+def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrix:$dst),
"stdu $rS, $dst", IIC_LdStSTDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
isPPC64;
-def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stdux $rS, $dst", IIC_LdStSTDUX, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked, isPPC64;
+def STDUX : XForm_8_memOp<31, 181, (outs ptr_rc_nor0:$ea_res),
+ (ins g8rc:$rS, memrr:$dst),
+ "stdux $rS, $dst", IIC_LdStSTDUX, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked, isPPC64;
}
// Patterns to match the pre-inc stores. We can't put the patterns on
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index e751c149b0b3..24969d7ef853 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -408,46 +408,46 @@ def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
[(int_ppc_altivec_mtvscr v4i32:$vB)]>;
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
-def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src),
+def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src),
"lvebx $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
-def LVEHX: XForm_1<31, 39, (outs vrrc:$vD), (ins memrr:$src),
+def LVEHX: XForm_1_memOp<31, 39, (outs vrrc:$vD), (ins memrr:$src),
"lvehx $vD, $src", IIC_LdStLoad,
[(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
-def LVEWX: XForm_1<31, 71, (outs vrrc:$vD), (ins memrr:$src),
+def LVEWX: XForm_1_memOp<31, 71, (outs vrrc:$vD), (ins memrr:$src),
"lvewx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
-def LVX : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src),
+def LVX : XForm_1_memOp<31, 103, (outs vrrc:$vD), (ins memrr:$src),
"lvx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
-def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src),
+def LVXL : XForm_1_memOp<31, 359, (outs vrrc:$vD), (ins memrr:$src),
"lvxl $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
}
-def LVSL : XForm_1<31, 6, (outs vrrc:$vD), (ins memrr:$src),
+def LVSL : XForm_1_memOp<31, 6, (outs vrrc:$vD), (ins memrr:$src),
"lvsl $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
PPC970_Unit_LSU;
-def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src),
+def LVSR : XForm_1_memOp<31, 38, (outs vrrc:$vD), (ins memrr:$src),
"lvsr $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
PPC970_Unit_LSU;
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { // Stores.
-def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVEBX: XForm_8_memOp<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
"stvebx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
-def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVEHX: XForm_8_memOp<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
"stvehx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
-def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVEWX: XForm_8_memOp<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
"stvewx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
-def STVX : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVX : XForm_8_memOp<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
"stvx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
-def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
+def STVXL : XForm_8_memOp<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
"stvxl $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
}
@@ -705,7 +705,7 @@ def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
(vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vspltw $vD, $vB, $UIMM", IIC_VecPerm,
- [(set v16i8:$vD,
+ [(set v16i8:$vD,
(vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
let isCodeGenOnly = 1 in {
def VSPLTBs : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
@@ -962,7 +962,7 @@ def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
def : Pat<(fmul v4f32:$vA, v4f32:$vB),
(VMADDFP $vA, $vB,
- (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>;
+ (v4i32 (VSLW (v4i32 (V_SETALLONES)), (v4i32 (V_SETALLONES)))))>;
// Fused multiply add and multiply sub for packed float. These are represented
// separately from the real instructions above, for operations that must have
@@ -991,7 +991,7 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
def : Pat<(v1i128 (shl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSL (v16i8 (VSLO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSLB $vA, $vB))>;
def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
@@ -999,7 +999,7 @@ def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
def : Pat<(v1i128 (PPCshl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSL (v16i8 (VSLO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
@@ -1008,7 +1008,7 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
def : Pat<(v1i128 (srl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSR (v16i8 (VSRO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
@@ -1016,7 +1016,7 @@ def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
def : Pat<(v1i128 (PPCsrl v1i128:$vA, v1i128:$vB)),
- (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
+ (v1i128 (VSR (v16i8 (VSRO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
@@ -1078,10 +1078,12 @@ def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>;
// Vector merge
def VMRGEW : VXForm_1<1932, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrgew $vD, $vA, $vB", IIC_VecFP,
- [(set v16i8:$vD, (vmrgew_shuffle v16i8:$vA, v16i8:$vB))]>;
+ [(set v16i8:$vD,
+ (v16i8 (vmrgew_shuffle v16i8:$vA, v16i8:$vB)))]>;
def VMRGOW : VXForm_1<1676, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrgow $vD, $vA, $vB", IIC_VecFP,
- [(set v16i8:$vD, (vmrgow_shuffle v16i8:$vA, v16i8:$vB))]>;
+ [(set v16i8:$vD,
+ (v16i8 (vmrgow_shuffle v16i8:$vA, v16i8:$vB)))]>;
// Match vmrgew(x,x) and vmrgow(x,x)
def:Pat<(vmrgew_unary_shuffle v16i8:$vA, undef),
@@ -1502,18 +1504,4 @@ def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vabsduw $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
-def : Pat<(v16i8:$vD (abs v16i8:$vA)),
- (v16i8 (VABSDUB $vA, (V_SET0B)))>;
-def : Pat<(v8i16:$vD (abs v8i16:$vA)),
- (v8i16 (VABSDUH $vA, (V_SET0H)))>;
-def : Pat<(v4i32:$vD (abs v4i32:$vA)),
- (v4i32 (VABSDUW $vA, (V_SET0)))>;
-
-def : Pat<(v16i8:$vD (abs (sub v16i8:$vA, v16i8:$vB))),
- (v16i8 (VABSDUB $vA, $vB))>;
-def : Pat<(v8i16:$vD (abs (sub v8i16:$vA, v8i16:$vB))),
- (v8i16 (VABSDUH $vA, $vB))>;
-def : Pat<(v4i32:$vD (abs (sub v4i32:$vA, v4i32:$vB))),
- (v4i32 (VABSDUW $vA, $vB))>;
-
} // end HasP9Altivec
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index f2845415ecb5..f5f4b46344cf 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -46,6 +46,10 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
bits<1> UseVSXReg = 0;
let TSFlags{6} = UseVSXReg;
+ // Indicate that this instruction is of type X-Form Load or Store
+ bits<1> XFormMemOp = 0;
+ let TSFlags{7} = XFormMemOp;
+
// Fields used for relation models.
string BaseName = "";
@@ -71,6 +75,7 @@ class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; }
class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; }
class UseVSXReg { bits<1> UseVSXReg = 1; }
+class XFormMemOp { bits<1> XFormMemOp = 1; }
// Two joined instructions; used to emit two adjacent instructions as one.
// The itinerary from the first instruction is used for scheduling and
@@ -109,6 +114,11 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
bit Interpretation64Bit = 0;
}
+// Base class for all X-Form memory instructions
+class IXFormMemOp<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ :I<opcode, OOL, IOL, asmstr, itin>, XFormMemOp;
+
// 1.7.1 I-Form
class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -437,6 +447,11 @@ class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms
let Inst{31} = RC;
}
+class XForm_base_r3xo_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>, XFormMemOp;
+
class XForm_tlb<bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin> : XForm_base_r3xo<31, xo, OOL, IOL, asmstr, itin, []> {
let RST = 0;
@@ -469,9 +484,13 @@ class XForm_base_r3xo_swapped
class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+class XForm_1_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
@@ -511,6 +530,10 @@ class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+class XForm_8_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -692,24 +715,34 @@ class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
}
class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
- string asmstr, InstrItinClass itin, list<dag> pattern>
+ string asmstr, InstrItinClass itin, list<dag> pattern>
: XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
let L = 0;
}
class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
}
+class XForm_25_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
let A = 0;
}
+class XForm_28_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
}
@@ -980,7 +1013,7 @@ class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
// to specify an SDAG pattern for matching.
class X_RD5_RS5_IM5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin>
- : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, []> {
+ : XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, []> {
}
class X_BF3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -1018,6 +1051,10 @@ class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = XT{5};
}
+class XX1Form_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XX1Form<opcode, xo, OOL, IOL, asmstr, itin, pattern>, XFormMemOp;
+
class XX1_RS6_RD5_XO<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin, list<dag> pattern>
: XX1Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
@@ -2094,6 +2131,27 @@ class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = RC;
}
+class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> VRT;
+ bit R;
+ bits<5> VRB;
+ bits<2> idx;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = VRT;
+ let Inst{11-14} = 0;
+ let Inst{15} = R;
+ let Inst{16-20} = VRB;
+ let Inst{21-22} = idx;
+ let Inst{23-30} = xo;
+ let Inst{31} = RC;
+}
+
//===----------------------------------------------------------------------===//
class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
: I<0, OOL, IOL, asmstr, NoItinerary> {
@@ -2103,3 +2161,7 @@ class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
let Inst{31-0} = 0;
let hasNoSchedulingInfo = 1;
}
+
+class PseudoXFormMemOp<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+ : Pseudo<OOL, IOL, asmstr, pattern>, XFormMemOp;
+
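
Aside (not part of the patch): the new XFormMemOp bit above is carried through TSFlags so the C++ side can test it per opcode (see isXFormMemOp further down in PPCInstrInfo.h). A minimal standalone sketch of that encoding, assuming NewDef_Shift is 6 to match the "let TSFlags{6}"/"let TSFlags{7}" assignments above; the real constants live in the PPCII namespace generated alongside the target.

  #include <cassert>
  #include <cstdint>

  namespace PPCII {
  enum {
    NewDef_Shift = 6,                          // assumed; matches TSFlags{6}/{7} above
    UseVSXReg    = 0x1u << NewDef_Shift,       // TSFlags{6}
    XFormMemOp   = 0x1u << (NewDef_Shift + 1)  // TSFlags{7}
  };
  } // namespace PPCII

  // Stand-in for the TSFlags field of one instruction's MCInstrDesc.
  static bool isXFormMemOp(uint64_t TSFlags) {
    return (TSFlags & PPCII::XFormMemOp) != 0;
  }

  int main() {
    uint64_t FlagsForSTDX = PPCII::XFormMemOp; // e.g. STDX after this patch
    uint64_t FlagsForSTD  = 0;                 // D/DS-form stores leave the bit clear
    assert(isXFormMemOp(FlagsForSTDX) && !isXFormMemOp(FlagsForSTD));
    return 0;
  }
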
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 4ef71effd49b..4669719744bc 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -55,6 +55,8 @@ STATISTIC(CmpIselsConverted,
"Number of ISELs that depend on comparison of constants converted");
STATISTIC(MissedConvertibleImmediateInstrs,
"Number of compare-immediate instructions fed by constants");
+STATISTIC(NumRcRotatesConvertedToRcAnd,
+ "Number of record-form rotates converted to record-form andi");
static cl::
opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
@@ -71,6 +73,28 @@ static cl::opt<bool>
UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
cl::desc("Use the old (incorrect) instruction latency calculation"));
+// Index into the OpcodesForSpill array.
+enum SpillOpcodeKey {
+ SOK_Int4Spill,
+ SOK_Int8Spill,
+ SOK_Float8Spill,
+ SOK_Float4Spill,
+ SOK_CRSpill,
+ SOK_CRBitSpill,
+ SOK_VRVectorSpill,
+ SOK_VSXVectorSpill,
+ SOK_VectorFloat8Spill,
+ SOK_VectorFloat4Spill,
+ SOK_VRSaveSpill,
+ SOK_QuadFloat8Spill,
+ SOK_QuadFloat4Spill,
+ SOK_QuadBitSpill,
+ SOK_SpillToVSR,
+ SOK_SPESpill,
+ SOK_SPE4Spill,
+ SOK_LastOpcodeSpill // This must be last on the enum.
+};
+
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
@@ -275,23 +299,11 @@ bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- // Note: This list must be kept consistent with LoadRegFromStackSlot.
- switch (MI.getOpcode()) {
- default: break;
- case PPC::LD:
- case PPC::LWZ:
- case PPC::LFS:
- case PPC::LFD:
- case PPC::RESTORE_CR:
- case PPC::RESTORE_CRBIT:
- case PPC::LVX:
- case PPC::LXVD2X:
- case PPC::LXV:
- case PPC::QVLFDX:
- case PPC::QVLFSXs:
- case PPC::QVLFDXb:
- case PPC::RESTORE_VRSAVE:
- case PPC::SPILLTOVSR_LD:
+ unsigned Opcode = MI.getOpcode();
+ const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
+ const unsigned *End = OpcodesForSpill + SOK_LastOpcodeSpill;
+
+ if (End != std::find(OpcodesForSpill, End, Opcode)) {
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
@@ -299,7 +311,6 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
FrameIndex = MI.getOperand(2).getIndex();
return MI.getOperand(0).getReg();
}
- break;
}
return 0;
}
@@ -329,31 +340,16 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- // Note: This list must be kept consistent with StoreRegToStackSlot.
- switch (MI.getOpcode()) {
- default: break;
- case PPC::STD:
- case PPC::STW:
- case PPC::STFS:
- case PPC::STFD:
- case PPC::SPILL_CR:
- case PPC::SPILL_CRBIT:
- case PPC::STVX:
- case PPC::STXVD2X:
- case PPC::STXV:
- case PPC::QVSTFDX:
- case PPC::QVSTFSXs:
- case PPC::QVSTFDXb:
- case PPC::SPILL_VRSAVE:
- case PPC::SPILLTOVSR_ST:
- // Check for the operands added by addFrameReference (the immediate is the
- // offset which defaults to 0).
+ unsigned Opcode = MI.getOpcode();
+ const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
+ const unsigned *End = OpcodesForSpill + SOK_LastOpcodeSpill;
+
+ if (End != std::find(OpcodesForSpill, End, Opcode)) {
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
MI.getOperand(2).isFI()) {
FrameIndex = MI.getOperand(2).getIndex();
return MI.getOperand(0).getReg();
}
- break;
}
return 0;
}
@@ -955,8 +951,19 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
+ } else if (PPC::SPERCRegClass.contains(SrcReg) &&
+ PPC::SPE4RCRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(PPC::EFSCFD), DestReg).addReg(SrcReg);
+ getKillRegState(KillSrc);
+ return;
+ } else if (PPC::SPE4RCRegClass.contains(SrcReg) &&
+ PPC::SPERCRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg);
+ getKillRegState(KillSrc);
+ return;
}
+
unsigned Opc;
if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::OR;
@@ -989,6 +996,8 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = PPC::QVFMRb;
else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::CROR;
+ else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
+ Opc = PPC::EVOR;
else
llvm_unreachable("Impossible reg-to-reg copy");
@@ -1000,129 +1009,212 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
}
-// This function returns true if a CR spill is necessary and false otherwise.
-bool
-PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
- unsigned SrcReg, bool isKill,
- int FrameIdx,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const{
- // Note: If additional store instructions are added here,
- // update isStoreToStackSlot.
-
- DebugLoc DL;
- if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
- PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
- PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- return true;
- } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CRBIT))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- return true;
- } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- unsigned Op = Subtarget.hasP9Vector() ? PPC::STXV : PPC::STXVD2X;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFSTOREf64 : PPC::STXSDX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFSTOREf32 : PPC::STXSSPX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.isDarwin() &&
- "VRSAVE only needs spill/restore on Darwin");
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- SpillsVRS = true;
- } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDX))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFSXs))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDXb))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
- NonRI = true;
- } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_ST))
- .addReg(SrcReg,
- getKillRegState(isKill)),
- FrameIdx));
+unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC)
+ const {
+ const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
+ int OpcodeIndex = 0;
+
+ if (RC != nullptr) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPESpill;
+ } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPE4Spill;
+ } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
} else {
- llvm_unreachable("Unknown regclass!");
+ if (PPC::GPRCRegClass.contains(Reg) ||
+ PPC::GPRC_NOR0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.contains(Reg) ||
+ PPC::G8RC_NOX0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::CRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
}
+ return OpcodesForSpill[OpcodeIndex];
+}
- return false;
+unsigned
+PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC) const {
+ const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
+ int OpcodeIndex = 0;
+
+ if (RC != nullptr) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPESpill;
+ } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPE4Spill;
+ } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
+ } else {
+ if (PPC::GPRCRegClass.contains(Reg) ||
+ PPC::GPRC_NOR0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.contains(Reg) ||
+ PPC::G8RC_NOX0RegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::CRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SpillToVSR;
+ } else {
+ llvm_unreachable("Unknown regclass!");
+ }
+ }
+ return OpcodesForSpill[OpcodeIndex];
}
-void
-PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- MachineFunction &MF = *MBB.getParent();
- SmallVector<MachineInstr*, 4> NewMIs;
+void PPCInstrInfo::StoreRegToStackSlot(
+ MachineFunction &MF, unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const {
+ unsigned Opcode = getStoreOpcodeForSpill(PPC::NoRegister, RC);
+ DebugLoc DL;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasSpills();
+ NewMIs.push_back(addFrameReference(
+ BuildMI(MF, DL, get(Opcode)).addReg(SrcReg, getKillRegState(isKill)),
+ FrameIdx));
+
+ if (PPC::CRRCRegClass.hasSubClassEq(RC) ||
+ PPC::CRBITRCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsCR();
+
+ if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsVRSAVE();
+
+ if (isXFormMemOp(Opcode))
+ FuncInfo->setHasNonRISpills();
+}
+
+void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ SmallVector<MachineInstr *, 4> NewMIs;
+
// We need to avoid a situation in which the value from a VRRC register is
// spilled using an Altivec instruction and reloaded into a VSRC register
// using a VSX instruction. The issue with this is that the VSX
@@ -1132,16 +1224,7 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// VSX instruction.
RC = updatedRC(RC);
- bool NonRI = false, SpillsVRS = false;
- if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs,
- NonRI, SpillsVRS))
- FuncInfo->setSpillsCR();
-
- if (SpillsVRS)
- FuncInfo->setSpillsVRSAVE();
-
- if (NonRI)
- FuncInfo->setHasNonRISpills();
+ StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
MBB.insert(MI, NewMIs[i]);
@@ -1154,85 +1237,25 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
NewMIs.back()->addMemOperand(MF, MMO);
}
-bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr *> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const {
- // Note: If additional load instructions are added here,
- // update isLoadFromStackSlot.
-
- if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
- PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
- DestReg), FrameIdx));
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
- PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
- FrameIdx));
- } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
- FrameIdx));
- } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
- FrameIdx));
- } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
- get(PPC::RESTORE_CR), DestReg),
- FrameIdx));
- return true;
- } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
- get(PPC::RESTORE_CRBIT), DestReg),
- FrameIdx));
- return true;
- } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- unsigned Op = Subtarget.hasP9Vector() ? PPC::LXV : PPC::LXVD2X;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFLOADf64 : PPC::LXSDX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc),
- DestReg), FrameIdx));
- NonRI = true;
- } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
- unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFLOADf32 : PPC::LXSSPX;
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc),
- DestReg), FrameIdx));
- NonRI = true;
- } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.isDarwin() &&
- "VRSAVE only needs spill/restore on Darwin");
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
- get(PPC::RESTORE_VRSAVE),
- DestReg),
- FrameIdx));
- SpillsVRS = true;
- } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDX), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFSXs), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg),
- FrameIdx));
- NonRI = true;
- } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_LD),
- DestReg), FrameIdx));
- } else {
- llvm_unreachable("Unknown regclass!");
- }
+ SmallVectorImpl<MachineInstr *> &NewMIs)
+ const {
+ unsigned Opcode = getLoadOpcodeForSpill(PPC::NoRegister, RC);
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opcode), DestReg),
+ FrameIdx));
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- return false;
+ if (PPC::CRRCRegClass.hasSubClassEq(RC) ||
+ PPC::CRBITRCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsCR();
+
+ if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
+ FuncInfo->setSpillsVRSAVE();
+
+ if (isXFormMemOp(Opcode))
+ FuncInfo->setHasNonRISpills();
}
void
@@ -1259,16 +1282,7 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
RC = &PPC::VSRCRegClass;
- bool NonRI = false, SpillsVRS = false;
- if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs,
- NonRI, SpillsVRS))
- FuncInfo->setSpillsCR();
-
- if (SpillsVRS)
- FuncInfo->setSpillsVRSAVE();
-
- if (NonRI)
- FuncInfo->setHasNonRISpills();
+ LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
MBB.insert(MI, NewMIs[i]);
@@ -1617,7 +1631,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
int OpC = CmpInstr.getOpcode();
unsigned CRReg = CmpInstr.getOperand(0).getReg();
- // FP record forms set CR1 based on the execption status bits, not a
+ // FP record forms set CR1 based on the exception status bits, not a
// comparison with zero.
if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
return false;
@@ -1740,7 +1754,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned PredHint = PPC::getPredicateHint(Pred);
int16_t Immed = (int16_t)Value;
- // When modyfing the condition in the predicate, we propagate hint bits
+ // When modifying the condition in the predicate, we propagate hint bits
// from the original predicate to the new one.
if (Immed == -1 && PredCond == PPC::PRED_GT)
// We convert "greater than -1" into "greater than or equal to 0",
@@ -1897,6 +1911,31 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// specifically the case if this is the instruction directly after the
// compare).
+ // Rotates are expensive instructions. If we're emitting a record-form
+ // rotate that can just be an andi, we should just emit the andi.
+ if ((MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINM8) &&
+ MI->getOperand(2).getImm() == 0) {
+ int64_t MB = MI->getOperand(3).getImm();
+ int64_t ME = MI->getOperand(4).getImm();
+ if (MB < ME && MB >= 16) {
+ uint64_t Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
+ NewOpC = MIOpC == PPC::RLWINM ? PPC::ANDIo : PPC::ANDIo8;
+ MI->RemoveOperand(4);
+ MI->RemoveOperand(3);
+ MI->getOperand(2).setImm(Mask);
+ NumRcRotatesConvertedToRcAnd++;
+ }
+ } else if (MIOpC == PPC::RLDICL && MI->getOperand(2).getImm() == 0) {
+ int64_t MB = MI->getOperand(3).getImm();
+ if (MB >= 48) {
+ uint64_t Mask = (1LLU << (63 - MB + 1)) - 1;
+ NewOpC = PPC::ANDIo8;
+ MI->RemoveOperand(3);
+ MI->getOperand(2).setImm(Mask);
+ NumRcRotatesConvertedToRcAnd++;
+ }
+ }
+
const MCInstrDesc &NewDesc = get(NewOpC);
MI->setDesc(NewDesc);
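
  // Worked sketch (not part of the patch) of the mask computed in the hunk above
  // for the RLWINM-with-zero-shift case: bits MB..ME (IBM numbering, bit 0 = MSB)
  // are kept, and the MB >= 16 guard ensures the result fits the 16-bit unsigned
  // immediate that ANDIo/ANDIo8 accept. The MB/ME values below are illustrative.
  #include <cassert>
  #include <cstdint>

  static uint64_t rlwinmMask(int64_t MB, int64_t ME) {
    return ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
  }

  int main() {
    uint64_t Mask = rlwinmMask(24, 27);        // keep bits 24..27 of the low word
    assert(Mask == 0xF0u && Mask <= 0xFFFFu);  // representable as an andi. immediate
    return 0;
  }
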
@@ -2049,6 +2088,12 @@ bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const {
return true;
}
+#ifndef NDEBUG
+static bool isAnImmediateOperand(const MachineOperand &MO) {
+ return MO.isCPI() || MO.isGlobal() || MO.isImm();
+}
+#endif
+
bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
auto &MBB = *MI.getParent();
auto DL = MI.getDebugLoc();
@@ -2071,7 +2116,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case PPC::DFSTOREf64: {
assert(Subtarget.hasP9Vector() &&
"Invalid D-Form Pseudo-ops on Pre-P9 target.");
- assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() &&
+ assert(MI.getOperand(2).isReg() &&
+ isAnImmediateOperand(MI.getOperand(1)) &&
"D-form op must have register and immediate operands");
return expandVSXMemPseudo(MI);
}
@@ -2151,28 +2197,6 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
}
-unsigned PPCInstrInfo::lookThruCopyLike(unsigned SrcReg,
- const MachineRegisterInfo *MRI) {
- while (true) {
- MachineInstr *MI = MRI->getVRegDef(SrcReg);
- if (!MI->isCopyLike())
- return SrcReg;
-
- unsigned CopySrcReg;
- if (MI->isCopy())
- CopySrcReg = MI->getOperand(1).getReg();
- else {
- assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike");
- CopySrcReg = MI->getOperand(2).getReg();
- }
-
- if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
- return CopySrcReg;
-
- SrcReg = CopySrcReg;
- }
-}
-
// Essentially a compile-time implementation of a compare->isel sequence.
// It takes two constants to compare, along with the true/false registers
// and the comparison type (as a subreg to a CR field) and returns one
@@ -2238,7 +2262,8 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
ConstOp = ~0U;
MachineInstr *DefMI = nullptr;
MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
- // If we'ere in SSA, get the defs through the MRI. Otherwise, only look
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ // If we're in SSA, get the defs through the MRI. Otherwise, only look
// within the basic block to see if the register is defined using an LI/LI8.
if (MRI->isSSA()) {
for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
@@ -2247,7 +2272,7 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
unsigned Reg = MI.getOperand(i).getReg();
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
- unsigned TrueReg = lookThruCopyLike(Reg, MRI);
+ unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI);
if (TargetRegisterInfo::isVirtualRegister(TrueReg)) {
DefMI = MRI->getVRegDef(TrueReg);
if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
@@ -2313,6 +2338,38 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
return ConstOp == ~0U ? nullptr : DefMI;
}
+const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
+ static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
+ // Power 8
+ {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
+ PPC::SPILL_CRBIT, PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX,
+ PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
+ PPC::SPILLTOVSR_ST, PPC::EVSTDD, PPC::SPESTW},
+ // Power 9
+ {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
+ PPC::SPILL_CRBIT, PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32,
+ PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
+ PPC::SPILLTOVSR_ST}};
+
+ return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
+}
+
+const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const {
+ static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
+ // Power 8
+ {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
+ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX,
+ PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
+ PPC::SPILLTOVSR_LD, PPC::EVLDD, PPC::SPELWZ},
+ // Power 9
+ {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
+ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, PPC::DFLOADf32,
+ PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
+ PPC::SPILLTOVSR_LD}};
+
+ return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
+}
+
// If this instruction has an immediate form and one of its operands is a
// result of a load-immediate, convert it to the immediate form if the constant
// is in range.
@@ -2391,16 +2448,17 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
CompareUseMI.RemoveOperand(2);
continue;
}
- DEBUG(dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
- DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
- DEBUG(dbgs() << "Is converted to:\n");
+ LLVM_DEBUG(
+ dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
+ LLVM_DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
+ LLVM_DEBUG(dbgs() << "Is converted to:\n");
// Convert to copy and remove unneeded operands.
CompareUseMI.setDesc(get(PPC::COPY));
CompareUseMI.RemoveOperand(3);
CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
CmpIselsConverted++;
Changed = true;
- DEBUG(CompareUseMI.dump());
+ LLVM_DEBUG(CompareUseMI.dump());
}
if (Changed)
return true;
@@ -2445,8 +2503,6 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
Is64BitLI = Opc != PPC::RLDICL_32;
NewImm = InVal.getSExtValue();
SetCR = Opc == PPC::RLDICLo;
- if (SetCR && (SExtImm & NewImm) != NewImm)
- return false;
break;
}
return false;
@@ -2460,7 +2516,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
int64_t ME = MI.getOperand(4).getImm();
APInt InVal(32, SExtImm, true);
InVal = InVal.rotl(SH);
- // Set the bits ( MB + 32 ) to ( ME + 32 ).
+ // Set the bits ( MB + 32 ) to ( ME + 32 ).
uint64_t Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
InVal &= Mask;
// Can't replace negative values with an LI as that will sign-extend
@@ -2474,8 +2530,6 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
NewImm = InVal.getSExtValue();
SetCR = Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o;
- if (SetCR && (SExtImm & NewImm) != NewImm)
- return false;
break;
}
return false;
@@ -2501,10 +2555,37 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
}
if (ReplaceWithLI) {
- DEBUG(dbgs() << "Replacing instruction:\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "Fed by:\n");
- DEBUG(DefMI->dump());
+ // We need to be careful with CR-setting instructions we're replacing.
+ if (SetCR) {
+ // We don't know anything about uses when we're out of SSA, so only
+ // replace if the new immediate will be reproduced.
+ bool ImmChanged = (SExtImm & NewImm) != NewImm;
+ if (PostRA && ImmChanged)
+ return false;
+
+ if (!PostRA) {
+ // If the defining load-immediate has no other uses, we can just replace
+ // the immediate with the new immediate.
+ if (MRI->hasOneUse(DefMI->getOperand(0).getReg()))
+ DefMI->getOperand(1).setImm(NewImm);
+
+ // If we're not using the GPR result of the CR-setting instruction, we
+ // just need to and with zero/non-zero depending on the new immediate.
+ else if (MRI->use_empty(MI.getOperand(0).getReg())) {
+ if (NewImm) {
+ assert(Immediate && "Transformation converted zero to non-zero?");
+ NewImm = Immediate;
+ }
+ }
+ else if (ImmChanged)
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Fed by:\n");
+ LLVM_DEBUG(DefMI->dump());
LoadImmediateInfo LII;
LII.Imm = NewImm;
LII.Is64Bit = Is64BitLI;
@@ -2514,8 +2595,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
if (KilledDef && SetCR)
*KilledDef = nullptr;
replaceInstrWithLI(MI, LII);
- DEBUG(dbgs() << "With:\n");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "With:\n");
+ LLVM_DEBUG(MI.dump());
return true;
}
return false;
@@ -3157,7 +3238,7 @@ bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const {
}
// We limit the max depth to track incoming values of PHIs or binary ops
-// (e.g. AND) to avoid exsessive cost.
+// (e.g. AND) to avoid excessive cost.
const unsigned MAX_DEPTH = 1;
bool
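
Aside (not part of the patch): the isLoadFromStackSlot/isStoreToStackSlot rewrite and the new getStoreOpcodesForSpillArray/getLoadOpcodesForSpillArray tables above share one idea: a SpillOpcodeKey-indexed opcode row per subtarget, searched linearly. A self-contained sketch under assumed opcode numbers (the real PPC:: enumerators are TableGen-generated and not reproduced here):

  #include <algorithm>
  #include <cassert>

  enum SpillOpcodeKey {
    SOK_Int4Spill,
    SOK_Int8Spill,
    SOK_Float8Spill,
    SOK_LastOpcodeSpill  // must stay last; it sizes each row
  };

  // Illustrative opcode numbers only; the in-tree arrays hold PPC::STW, PPC::STD, ...
  static const unsigned StoreOpcodesForSpill[2][SOK_LastOpcodeSpill] = {
      {100, 101, 102},   // row 0: pre-Power9 selection
      {100, 101, 202}};  // row 1: Power9 selection (e.g. a different FP store)

  static const unsigned *getStoreOpcodesForSpillArray(bool HasP9Vector) {
    return StoreOpcodesForSpill[HasP9Vector ? 1 : 0];
  }

  // Mirrors the new isStoreToStackSlot check: membership in the active row.
  static bool isSpillStoreOpcode(unsigned Opcode, bool HasP9Vector) {
    const unsigned *Row = getStoreOpcodesForSpillArray(HasP9Vector);
    const unsigned *End = Row + SOK_LastOpcodeSpill;
    return std::find(Row, End, Opcode) != End;
  }

  int main() {
    assert(isSpillStoreOpcode(202, /*HasP9Vector=*/true));
    assert(!isSpillStoreOpcode(202, /*HasP9Vector=*/false));
    return 0;
  }
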
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 8bfb8bc88097..ba82f56a2464 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -68,7 +68,9 @@ enum {
/// The VSX instruction that uses VSX register (vs0-vs63), instead of VMX
/// register (v0-v31).
- UseVSXReg = 0x1 << NewDef_Shift
+ UseVSXReg = 0x1 << NewDef_Shift,
+ /// This instruction is an X-Form memory operation.
+ XFormMemOp = 0x1 << (NewDef_Shift+1)
};
} // end namespace PPCII
@@ -114,20 +116,19 @@ class PPCInstrInfo : public PPCGenInstrInfo {
PPCSubtarget &Subtarget;
const PPCRegisterInfo RI;
- bool StoreRegToStackSlot(MachineFunction &MF,
- unsigned SrcReg, bool isKill, int FrameIdx,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const;
- bool LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+ void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ int FrameIdx, const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const;
+ void LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr *> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const;
+ SmallVectorImpl<MachineInstr *> &NewMIs) const;
bool transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
unsigned ConstantOpNo, int64_t Imm) const;
MachineInstr *getConstantDefMI(MachineInstr &MI, unsigned &ConstOp,
bool &SeenIntermediateUse) const;
+ const unsigned *getStoreOpcodesForSpillArray() const;
+ const unsigned *getLoadOpcodesForSpillArray() const;
virtual void anchor();
protected:
@@ -154,6 +155,10 @@ public:
///
const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+ bool isXFormMemOp(unsigned Opcode) const {
+ return get(Opcode).TSFlags & PPCII::XFormMemOp;
+ }
+
ScheduleHazardRecognizer *
CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const override;
@@ -251,6 +256,12 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ unsigned getStoreOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC = nullptr) const;
+
+ unsigned getLoadOpcodeForSpill(unsigned Reg,
+ const TargetRegisterClass *RC = nullptr) const;
+
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
@@ -359,13 +370,6 @@ public:
MachineInstr **KilledDef = nullptr) const;
void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const;
- // This is used to find the "true" source register for n
- // Machine instruction. Returns the original SrcReg unless it is the target
- // of a copy-like operation, in which case we chain backwards through all
- // such operations to the ultimate source register. If a
- // physical register is encountered, we stop the search.
- static unsigned lookThruCopyLike(unsigned SrcReg,
- const MachineRegisterInfo *MRI);
bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III) const;
};
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 43dcc4479cf0..1a43037e4a4b 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -29,6 +29,12 @@ def SDT_PPCLxsizx : SDTypeProfile<1, 2, [
def SDT_PPCstxsix : SDTypeProfile<0, 3, [
SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
]>;
+def SDT_PPCcv_fp_to_int : SDTypeProfile<1, 1, [
+ SDTCisFP<0>, SDTCisFP<1>
+ ]>;
+def SDT_PPCstore_scal_int_from_vsr : SDTypeProfile<0, 3, [
+ SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
@@ -123,6 +129,14 @@ def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+
+def PPCcv_fp_to_uint_in_vsr:
+ SDNode<"PPCISD::FP_TO_UINT_IN_VSR", SDT_PPCcv_fp_to_int, []>;
+def PPCcv_fp_to_sint_in_vsr:
+ SDNode<"PPCISD::FP_TO_SINT_IN_VSR", SDT_PPCcv_fp_to_int, []>;
+def PPCstore_scal_int_from_vsr:
+ SDNode<"PPCISD::ST_VSR_SCAL_INT", SDT_PPCstore_scal_int_from_vsr,
+ [SDNPHasChain, SDNPMayStore]>;
def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
[SDNPHasChain, SDNPMayStore]>;
def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
@@ -204,6 +218,13 @@ def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>;
def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;
+// Move 2 i64 values into a VSX register
+def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128",
+ SDTypeProfile<1, 2,
+ [SDTCisFP<0>, SDTCisSameSizeAs<1,2>,
+ SDTCisSameAs<1,2>]>,
+ []>;
+
// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -308,7 +329,7 @@ def HI16 : SDNodeXForm<imm, [{
def HA16 : SDNodeXForm<imm, [{
// Transformation function: shift the immediate value down into the low bits.
- int Val = N->getZExtValue();
+ long Val = N->getZExtValue();
return getI32Imm((Val - (signed short)Val) >> 16, SDLoc(N));
}]>;
def MB : SDNodeXForm<imm, [{
@@ -523,6 +544,19 @@ def crrc0 : RegisterOperand<CRRC0> {
let ParserMatchClass = PPCRegCRRCAsmOperand;
}
+def PPCRegSPERCAsmOperand : AsmOperandClass {
+ let Name = "RegSPERC"; let PredicateMethod = "isRegNumber";
+}
+def sperc : RegisterOperand<SPERC> {
+ let ParserMatchClass = PPCRegSPERCAsmOperand;
+}
+def PPCRegSPE4RCAsmOperand : AsmOperandClass {
+ let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber";
+}
+def spe4rc : RegisterOperand<SPE4RC> {
+ let ParserMatchClass = PPCRegSPE4RCAsmOperand;
+}
+
def PPCU1ImmAsmOperand : AsmOperandClass {
let Name = "U1Imm"; let PredicateMethod = "isU1Imm";
let RenderMethod = "addImmOperands";
@@ -798,16 +832,19 @@ def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE8DisEncoding";
+ let DecoderMethod = "decodeSPE8Operands";
}
def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE4DisEncoding";
+ let DecoderMethod = "decodeSPE4Operands";
}
def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE2DisEncoding";
+ let DecoderMethod = "decodeSPE2Operands";
}
// A single-register address. This is used with the SjLj
@@ -862,7 +899,7 @@ def HasSYNC : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
def IsPPC4xx : Predicate<"PPCSubTarget->isPPC4xx()">;
def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">;
def IsE500 : Predicate<"PPCSubTarget->isE500()">;
-def HasSPE : Predicate<"PPCSubTarget->HasSPE()">;
+def HasSPE : Predicate<"PPCSubTarget->hasSPE()">;
def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
@@ -870,6 +907,7 @@ def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">;
def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;
+def HasFPU : Predicate<"PPCSubTarget->hasFPU()">;
//===----------------------------------------------------------------------===//
// PowerPC Multiclass Definitions.
@@ -1195,6 +1233,9 @@ let usesCustomInserter = 1, // Expanded after instruction selection.
def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
i32imm:$BROPC), "#SELECT_CC_F8",
[]>;
+ def SELECT_CC_F16 : Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
+ i32imm:$BROPC), "#SELECT_CC_F16",
+ []>;
def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
i32imm:$BROPC), "#SELECT_CC_VRRC",
[]>;
@@ -1207,12 +1248,17 @@ let usesCustomInserter = 1, // Expanded after instruction selection.
def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
[(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
+let Predicates = [HasFPU] in {
def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
f4rc:$T, f4rc:$F), "#SELECT_F4",
[(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
f8rc:$T, f8rc:$F), "#SELECT_F8",
[(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+ def SELECT_F16 : Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+ vrrc:$T, vrrc:$F), "#SELECT_F16",
+ [(set f128:$dst, (select i1:$cond, f128:$T, f128:$F))]>;
+}
def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
vrrc:$T, vrrc:$F), "#SELECT_VRRC",
[(set v4i32:$dst,
@@ -1724,28 +1770,28 @@ def : Pat<(PPCatomicCmpSwap_16 xoaddr:$ptr, i32:$old, i32:$new),
// Instructions to support atomic operations
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
-def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
+def LBARX : XForm_1_memOp<31, 52, (outs gprc:$rD), (ins memrr:$src),
"lbarx $rD, $src", IIC_LdStLWARX, []>,
Requires<[HasPartwordAtomics]>;
-def LHARX : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
+def LHARX : XForm_1_memOp<31, 116, (outs gprc:$rD), (ins memrr:$src),
"lharx $rD, $src", IIC_LdStLWARX, []>,
Requires<[HasPartwordAtomics]>;
-def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
+def LWARX : XForm_1_memOp<31, 20, (outs gprc:$rD), (ins memrr:$src),
"lwarx $rD, $src", IIC_LdStLWARX, []>;
// Instructions to support lock versions of atomics
// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
-def LBARXL : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
+def LBARXL : XForm_1_memOp<31, 52, (outs gprc:$rD), (ins memrr:$src),
"lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
Requires<[HasPartwordAtomics]>;
-def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
+def LHARXL : XForm_1_memOp<31, 116, (outs gprc:$rD), (ins memrr:$src),
"lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
Requires<[HasPartwordAtomics]>;
-def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
+def LWARXL : XForm_1_memOp<31, 20, (outs gprc:$rD), (ins memrr:$src),
"lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT;
// The atomic instructions use the destination register as well as the next one
@@ -1757,15 +1803,15 @@ def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
}
let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
-def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
+def STBCX : XForm_1_memOp<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
"stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
isDOT, Requires<[HasPartwordAtomics]>;
-def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
+def STHCX : XForm_1_memOp<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
"sthcx. $rS, $dst", IIC_LdStSTWCX, []>,
isDOT, Requires<[HasPartwordAtomics]>;
-def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
+def STWCX : XForm_1_memOp<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
"stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
}
@@ -1806,12 +1852,14 @@ def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
"lwz $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load iaddr:$src))]>;
+let Predicates = [HasFPU] in {
def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
"lfs $rD, $src", IIC_LdStLFD,
[(set f32:$rD, (load iaddr:$src))]>;
def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
"lfd $rD, $src", IIC_LdStLFD,
[(set f64:$rD, (load iaddr:$src))]>;
+}
// Unindexed (r+i) Loads with Update (preinc).
@@ -1836,6 +1884,7 @@ def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
+let Predicates = [HasFPU] in {
def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
"lfsu $rD, $addr", IIC_LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
@@ -1845,84 +1894,89 @@ def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr
"lfdu $rD, $addr", IIC_LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
+}
// Indexed (r+r) Loads with Update (preinc).
-def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LBZUX : XForm_1_memOp<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lbzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LHAUX : XForm_1_memOp<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lhaux $rD, $addr", IIC_LdStLHAUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LHZUX : XForm_1_memOp<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lhzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+def LWZUX : XForm_1_memOp<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lwzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
+let Predicates = [HasFPU] in {
+def LFSUX : XForm_1_memOp<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lfsux $rD, $addr", IIC_LdStLFDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
-def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
+def LFDUX : XForm_1_memOp<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
"lfdux $rD, $addr", IIC_LdStLFDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
}
}
+}
// Indexed (r+r) Loads.
//
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in {
-def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
+def LBZX : XForm_1_memOp<31, 87, (outs gprc:$rD), (ins memrr:$src),
"lbzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi8 xaddr:$src))]>;
-def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
+def LHAX : XForm_1_memOp<31, 343, (outs gprc:$rD), (ins memrr:$src),
"lhax $rD, $src", IIC_LdStLHA,
[(set i32:$rD, (sextloadi16 xaddr:$src))]>,
PPC970_DGroup_Cracked;
-def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
+def LHZX : XForm_1_memOp<31, 279, (outs gprc:$rD), (ins memrr:$src),
"lhzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi16 xaddr:$src))]>;
-def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
+def LWZX : XForm_1_memOp<31, 23, (outs gprc:$rD), (ins memrr:$src),
"lwzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load xaddr:$src))]>;
-def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
+def LHBRX : XForm_1_memOp<31, 790, (outs gprc:$rD), (ins memrr:$src),
"lhbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
-def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src),
+def LWBRX : XForm_1_memOp<31, 534, (outs gprc:$rD), (ins memrr:$src),
"lwbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
-def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
+let Predicates = [HasFPU] in {
+def LFSX : XForm_25_memOp<31, 535, (outs f4rc:$frD), (ins memrr:$src),
"lfsx $frD, $src", IIC_LdStLFD,
[(set f32:$frD, (load xaddr:$src))]>;
-def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
+def LFDX : XForm_25_memOp<31, 599, (outs f8rc:$frD), (ins memrr:$src),
"lfdx $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (load xaddr:$src))]>;
-def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
+def LFIWAX : XForm_25_memOp<31, 855, (outs f8rc:$frD), (ins memrr:$src),
"lfiwax $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
-def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
+def LFIWZX : XForm_25_memOp<31, 887, (outs f8rc:$frD), (ins memrr:$src),
"lfiwzx $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
}
+}
// Load Multiple
def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
@@ -1943,6 +1997,7 @@ def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
"stw $rS, $src", IIC_LdStStore,
[(store i32:$rS, iaddr:$src)]>;
+let Predicates = [HasFPU] in {
def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
"stfs $rS, $dst", IIC_LdStSTFD,
[(store f32:$rS, iaddr:$dst)]>;
@@ -1950,6 +2005,7 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
"stfd $rS, $dst", IIC_LdStSTFD,
[(store f64:$rS, iaddr:$dst)]>;
}
+}
// Unindexed (r+i) Stores with Update (preinc).
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
@@ -1962,6 +2018,7 @@ def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
"stwu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+let Predicates = [HasFPU] in {
def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
"stfsu $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
@@ -1969,6 +2026,7 @@ def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
"stfdu $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
}
+}
// Patterns to match the pre-inc stores. We can't put the patterns on
// the instruction definitions directly as ISel wants the address base
@@ -1986,62 +2044,76 @@ def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
// Indexed (r+r) Stores.
let PPC970_Unit = 2 in {
-def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
+def STBX : XForm_8_memOp<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
"stbx $rS, $dst", IIC_LdStStore,
[(truncstorei8 i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
-def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
+def STHX : XForm_8_memOp<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
"sthx $rS, $dst", IIC_LdStStore,
[(truncstorei16 i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
-def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
+def STWX : XForm_8_memOp<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
"stwx $rS, $dst", IIC_LdStStore,
[(store i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
-
-def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
+
+def STHBRX: XForm_8_memOp<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
"sthbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
PPC970_DGroup_Cracked;
-def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
+def STWBRX: XForm_8_memOp<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
"stwbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
PPC970_DGroup_Cracked;
-def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
+let Predicates = [HasFPU] in {
+def STFIWX: XForm_28_memOp<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
"stfiwx $frS, $dst", IIC_LdStSTFD,
[(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
-
-def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
+
+def STFSX : XForm_28_memOp<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
"stfsx $frS, $dst", IIC_LdStSTFD,
[(store f32:$frS, xaddr:$dst)]>;
-def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
+def STFDX : XForm_28_memOp<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
"stfdx $frS, $dst", IIC_LdStSTFD,
[(store f64:$frS, xaddr:$dst)]>;
}
+}
// Indexed (r+r) Stores with Update (preinc).
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
-def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
- "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
-def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
- "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
- PPC970_DGroup_Cracked;
+def STBUX : XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
+ (ins gprc:$rS, memrr:$dst),
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STHUX : XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
+ (ins gprc:$rS, memrr:$dst),
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STWUX : XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
+ (ins gprc:$rS, memrr:$dst),
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+let Predicates = [HasFPU] in {
+def STFSUX: XForm_8_memOp<31, 695, (outs ptr_rc_nor0:$ea_res),
+ (ins f4rc:$rS, memrr:$dst),
+ "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+def STFDUX: XForm_8_memOp<31, 759, (outs ptr_rc_nor0:$ea_res),
+ (ins f8rc:$rS, memrr:$dst),
+ "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
+ RegConstraint<"$dst.ptrreg = $ea_res">,
+ NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+}
}
// Patterns to match the pre-inc stores. We can't put the patterns on
@@ -2053,10 +2125,12 @@ def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STHUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STWUX $rS, $ptrreg, $ptroff)>;
+let Predicates = [HasFPU] in {
def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STFSUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
(STFDUX $rS, $ptrreg, $ptroff)>;
+}
// Store Multiple
def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
@@ -2240,7 +2314,7 @@ let isCompare = 1, hasSideEffects = 0 in {
"cmplw $crD, $rA, $rB", IIC_IntCompare>;
}
}
-let PPC970_Unit = 3 in { // FPU Operations.
+let PPC970_Unit = 3, Predicates = [HasFPU] in { // FPU Operations.
//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
// "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
let isCompare = 1, hasSideEffects = 0 in {
@@ -2318,13 +2392,13 @@ let Uses = [RM] in {
/// often coalesced away and we don't want the dispatch group builder to think
/// that they will fill slots (which could cause the load of a LSU reject to
/// sneak into a d-group with a store).
-let hasSideEffects = 0 in
+let hasSideEffects = 0, Predicates = [HasFPU] in
defm FMR : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
"fmr", "$frD, $frB", IIC_FPGeneral,
[]>, // (set f32:$frD, f32:$frB)
PPC970_Unit_Pseudo;
-let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
+let PPC970_Unit = 3, hasSideEffects = 0, Predicates = [HasFPU] in { // FPU Operations.
// These are artificially split into two different forms, for 4/8 byte FP.
defm FABSS : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
"fabs", "$frD, $frB", IIC_FPGeneral,
@@ -2573,6 +2647,7 @@ def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
"mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
} // hasSideEffects = 0
+let Predicates = [HasFPU] in {
// Pseudo instruction to perform FADD in round-to-zero mode.
let usesCustomInserter = 1, Uses = [RM] in {
def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
@@ -2632,6 +2707,7 @@ let Uses = [RM] in {
"mffsl $rT", IIC_IntMFFS, []>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
+}
let Predicates = [IsISA3_0] in {
def MODSW : XForm_8<31, 779, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
@@ -2729,7 +2805,7 @@ defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
// A-Form instructions. Most of the instructions executed in the FPU are of
// this type.
//
-let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
+let PPC970_Unit = 3, hasSideEffects = 0, Predicates = [HasFPU] in { // FPU Operations.
let Uses = [RM] in {
let isCommutable = 1 in {
defm FMADD : AForm_1r<63, 29,
@@ -3055,6 +3131,7 @@ def : Pat<(extloadi16 iaddr:$src),
(LHZ iaddr:$src)>;
def : Pat<(extloadi16 xaddr:$src),
(LHZX xaddr:$src)>;
+let Predicates = [HasFPU] in {
def : Pat<(f64 (extloadf32 iaddr:$src)),
(COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
def : Pat<(f64 (extloadf32 xaddr:$src)),
@@ -3062,6 +3139,7 @@ def : Pat<(f64 (extloadf32 xaddr:$src)),
def : Pat<(f64 (fpextend f32:$src)),
(COPY_TO_REGCLASS $src, F8RC)>;
+}
// Only seq_cst fences require the heavyweight sync (SYNC 0).
// All others can use the lightweight sync (SYNC 1).
@@ -3073,6 +3151,7 @@ def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
def : Pat<(atomic_fence (imm), (imm)), (SYNC 1)>, Requires<[HasSYNC]>;
def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+let Predicates = [HasFPU] in {
// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
(FNMSUB $A, $C, $B)>;
@@ -3088,6 +3167,7 @@ def : Pat<(fcopysign f64:$frB, f32:$frA),
(FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
def : Pat<(fcopysign f32:$frB, f64:$frA),
(FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+}
include "PPCInstrAltivec.td"
include "PPCInstrSPE.td"
@@ -3530,6 +3610,7 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
// SETCC for f32.
+let Predicates = [HasFPU] in {
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
@@ -3591,6 +3672,96 @@ defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+// SETCC for f128.
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETLT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOGT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETGT)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
+
+}
+
+// This must be in this file because it relies on patterns defined in this file
+// after the inclusion of the instruction sets.
+let Predicates = [HasSPE] in {
+// SETCC for f32.
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+ (EXTRACT_SUBREG (EFSCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+ (EXTRACT_SUBREG (EFSCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+ (EXTRACT_SUBREG (EFSCMPEQ $s1, $s2), sub_gt)>;
+
+// SETCC for f64.
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+ (EXTRACT_SUBREG (EFDCMPLT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+ (EXTRACT_SUBREG (EFDCMPGT $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+ (EXTRACT_SUBREG (EFDCMPEQ $s1, $s2), sub_gt)>;
+}
// match select on i1 variables:
def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
(CROR (CRAND $cond , $tval),
@@ -3673,6 +3844,7 @@ def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGT)),
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)),
(SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+let Predicates = [HasFPU] in {
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
(SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
@@ -3714,6 +3886,28 @@ def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
(SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
(SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+}
+
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETLT)),
+ (SELECT_F16 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETULT)),
+ (SELECT_F16 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETLE)),
+ (SELECT_F16 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETULE)),
+ (SELECT_F16 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETEQ)),
+ (SELECT_F16 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETGE)),
+ (SELECT_F16 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETUGE)),
+ (SELECT_F16 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETGT)),
+ (SELECT_F16 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETUGT)),
+ (SELECT_F16 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f128 (selectcc i1:$lhs, i1:$rhs, f128:$tval, f128:$fval, SETNE)),
+ (SELECT_F16 (CRXOR $lhs, $rhs), $tval, $fval)>;
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
(SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
@@ -3763,13 +3957,15 @@ def : Pat<(i1 (not (trunc i64:$in))),
// FIXME: For B=0 or B > 8, the registers following RT are used.
// WARNING: Do not add patterns for this instruction without fixing this.
-def LSWI : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B),
- "lswi $RT, $A, $B", IIC_LdStLoad, []>;
+def LSWI : XForm_base_r3xo_memOp<31, 597, (outs gprc:$RT),
+ (ins gprc:$A, u5imm:$B),
+ "lswi $RT, $A, $B", IIC_LdStLoad, []>;
// FIXME: For B=0 or B > 8, the registers following RT are used.
// WARNING: Do not add patterns for this instruction without fixing this.
-def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B),
- "stswi $RT, $A, $B", IIC_LdStLoad, []>;
+def STSWI : XForm_base_r3xo_memOp<31, 725, (outs),
+ (ins gprc:$RT, gprc:$A, u5imm:$B),
+ "stswi $RT, $A, $B", IIC_LdStLoad, []>;
def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
"isync", IIC_SprISYNC, []>;
@@ -3781,7 +3977,7 @@ def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
"eieio", IIC_LdStLoad, []>;
-def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
+def WAIT : XForm_24_sync<31, 30, (outs), (ins i32imm:$L),
"wait $L", IIC_LdStLoad, []>;
def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
@@ -3843,6 +4039,7 @@ def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>;
+let Predicates = [HasFPU] in {
def MTFSF : XFLForm_1<63, 711, (outs),
(ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
"mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
@@ -3852,6 +4049,7 @@ def MTFSFo : XFLForm_1<63, 711, (outs),
def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+}
def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
"slbie $RB", IIC_SprSLBIE, []>;
@@ -3932,23 +4130,31 @@ def NAP : XLForm_1_np<19, 434, (outs), (ins), "nap", IIC_BrB, []>;
def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>;
-def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
-def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
-def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
-def LDCIX : XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
- "ldcix $RST, $A, $B", IIC_LdStLoad, []>;
-
-def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "stbcix $RST, $A, $B", IIC_LdStLoad, []>;
-def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "sthcix $RST, $A, $B", IIC_LdStLoad, []>;
-def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "stwcix $RST, $A, $B", IIC_LdStLoad, []>;
-def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
- "stdcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LBZCIX : XForm_base_r3xo_memOp<31, 853, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LHZCIX : XForm_base_r3xo_memOp<31, 821, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LWZCIX : XForm_base_r3xo_memOp<31, 789, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LDCIX : XForm_base_r3xo_memOp<31, 885, (outs gprc:$RST),
+ (ins gprc:$A, gprc:$B),
+ "ldcix $RST, $A, $B", IIC_LdStLoad, []>;
+
+def STBCIX : XForm_base_r3xo_memOp<31, 981, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stbcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STHCIX : XForm_base_r3xo_memOp<31, 949, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "sthcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STWCIX : XForm_base_r3xo_memOp<31, 917, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stwcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STDCIX : XForm_base_r3xo_memOp<31, 1013, (outs),
+ (ins gprc:$RST, gprc:$A, gprc:$B),
+ "stdcix $RST, $A, $B", IIC_LdStLoad, []>;
// External PID Load Store Instructions
@@ -3972,7 +4178,7 @@ def STBEPX : XForm_8<31, 223, (outs), (ins gprc:$rS, memrr:$dst),
"stbepx $rS, $dst", IIC_LdStStore, []>,
Requires<[IsE500]>;
-def STFDEPX : XForm_28<31, 735, (outs), (ins f8rc:$frS, memrr:$dst),
+def STFDEPX : XForm_28_memOp<31, 735, (outs), (ins f8rc:$frS, memrr:$dst),
"stfdepx $frS, $dst", IIC_LdStSTFD, []>,
Requires<[IsE500]>;
@@ -4695,10 +4901,10 @@ def DWMaskValues {
def DWSwapInByte {
dag Swap1 = (OR8 (AND8 (RLDICL $A, 63, 1), DWMaskValues.Lo1),
(AND8 (RLDICR $A, 1, 62), DWMaskValues.Hi1));
- dag Swap2 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap1, 62, 2), DWMaskValues.Lo2),
- (AND8 (RLDICR DWSwapInByte.Swap1, 2, 61), DWMaskValues.Hi2));
- dag Swap4 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap2, 60, 4), DWMaskValues.Lo4),
- (AND8 (RLDICR DWSwapInByte.Swap2, 4, 59), DWMaskValues.Hi4));
+ dag Swap2 = (OR8 (AND8 (RLDICL Swap1, 62, 2), DWMaskValues.Lo2),
+ (AND8 (RLDICR Swap1, 2, 61), DWMaskValues.Hi2));
+ dag Swap4 = (OR8 (AND8 (RLDICL Swap2, 60, 4), DWMaskValues.Lo4),
+ (AND8 (RLDICR Swap2, 4, 59), DWMaskValues.Hi4));
}
// Intra-byte swap is done, now start inter-byte swap.
@@ -4718,7 +4924,7 @@ def DWBytes7656 {
def DWBytes7654 {
dag Word = (RLWIMI DWBytes7656.Word, DWBytes4567.Word, 8, 24, 31);
dag DWord =
- (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes7654.Word, sub_32));
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), Word, sub_32));
}
def DWBytes0123 {
@@ -4737,7 +4943,7 @@ def DWBytes3212 {
def DWBytes3210 {
dag Word = (RLWIMI DWBytes3212.Word, DWBytes0123.Word, 8, 24, 31);
dag DWord =
- (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes3210.Word, sub_32));
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), Word, sub_32));
}
// Now both high word and low word are reversed, next
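Editorial note (not part of the patch): the DWSwapInByte/DWBytes hunks above only replace self-qualified references such as DWSwapInByte.Swap1 with the local field names; the computation the dags describe is unchanged. What those dags appear to express is the usual divide-and-conquer bit reversal of a doubleword: first reverse the bits within each byte by swapping 1-, 2- and 4-bit groups under masks (the RLDICL/RLDICR steps), then reverse the byte order (the RLWIMI word shuffles in the DWBytes* records). A purely illustrative C++ sketch of that computation follows; the function name is made up here, and the mask constants assume the conventional 0x55/0x33/0x0F values, since DWMaskValues itself is defined outside this hunk.

#include <cstdint>

// Illustrative only: the mask-and-shift bit reversal the dags above encode.
static uint64_t ReverseBits64(uint64_t A) {
  // Intra-byte step: swap adjacent bits, then 2-bit pairs, then nibbles.
  uint64_t Swap1 = ((A >> 1) & 0x5555555555555555ULL) |
                   ((A << 1) & 0xAAAAAAAAAAAAAAAAULL);
  uint64_t Swap2 = ((Swap1 >> 2) & 0x3333333333333333ULL) |
                   ((Swap1 << 2) & 0xCCCCCCCCCCCCCCCCULL);
  uint64_t Swap4 = ((Swap2 >> 4) & 0x0F0F0F0F0F0F0F0FULL) |
                   ((Swap2 << 4) & 0xF0F0F0F0F0F0F0F0ULL);
  // Inter-byte step: reverse the byte order of the intermediate result.
  uint64_t Result = 0;
  for (int i = 0; i < 8; ++i)
    Result = (Result << 8) | ((Swap4 >> (8 * i)) & 0xFF);
  return Result;
}
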
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td
index 4940c77c7ae5..c4bb02695b36 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td
@@ -502,14 +502,14 @@ let Uses = [RM] in {
// Load indexed instructions
let mayLoad = 1 in {
- def QVLFDX : XForm_1<31, 583,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfdx $FRT, $src", IIC_LdStLFD,
- [(set v4f64:$FRT, (load xoaddr:$src))]>;
+ def QVLFDX : XForm_1_memOp<31, 583,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfdx $FRT, $src", IIC_LdStLFD,
+ [(set v4f64:$FRT, (load xoaddr:$src))]>;
let isCodeGenOnly = 1 in
- def QVLFDXb : XForm_1<31, 583,
- (outs qbrc:$FRT), (ins memrr:$src),
- "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
+ def QVLFDXb : XForm_1_memOp<31, 583,
+ (outs qbrc:$FRT), (ins memrr:$src),
+ "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
let RC = 1 in
def QVLFDXA : XForm_1<31, 583,
@@ -527,10 +527,10 @@ let Uses = [RM] in {
(outs qfrc:$FRT), (ins memrr:$src),
"qvlfduxa $FRT, $src", IIC_LdStLFD, []>;
- def QVLFSX : XForm_1<31, 519,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfsx $FRT, $src", IIC_LdStLFD,
- [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
+ def QVLFSX : XForm_1_memOp<31, 519,
+ (outs qfrc:$FRT), (ins memrr:$src),
+ "qvlfsx $FRT, $src", IIC_LdStLFD,
+ [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
let isCodeGenOnly = 1 in
def QVLFSXb : XForm_1<31, 519,
@@ -538,10 +538,10 @@ let Uses = [RM] in {
"qvlfsx $FRT, $src", IIC_LdStLFD,
[(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>;
let isCodeGenOnly = 1 in
- def QVLFSXs : XForm_1<31, 519,
- (outs qsrc:$FRT), (ins memrr:$src),
- "qvlfsx $FRT, $src", IIC_LdStLFD,
- [(set v4f32:$FRT, (load xoaddr:$src))]>;
+ def QVLFSXs : XForm_1_memOp<31, 519,
+ (outs qsrc:$FRT), (ins memrr:$src),
+ "qvlfsx $FRT, $src", IIC_LdStLFD,
+ [(set v4f32:$FRT, (load xoaddr:$src))]>;
let RC = 1 in
def QVLFSXA : XForm_1<31, 519,
@@ -634,12 +634,12 @@ let Uses = [RM] in {
// Store indexed instructions
let mayStore = 1 in {
- def QVSTFDX : XForm_8<31, 711,
+ def QVSTFDX : XForm_8_memOp<31, 711,
(outs), (ins qfrc:$FRT, memrr:$dst),
"qvstfdx $FRT, $dst", IIC_LdStSTFD,
[(store qfrc:$FRT, xoaddr:$dst)]>;
let isCodeGenOnly = 1 in
- def QVSTFDXb : XForm_8<31, 711,
+ def QVSTFDXb : XForm_8_memOp<31, 711,
(outs), (ins qbrc:$FRT, memrr:$dst),
"qvstfdx $FRT, $dst", IIC_LdStSTFD, []>;
@@ -675,12 +675,12 @@ let Uses = [RM] in {
(outs), (ins qfrc:$FRT, memrr:$dst),
"qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>;
- def QVSTFSX : XForm_8<31, 647,
+ def QVSTFSX : XForm_8_memOp<31, 647,
(outs), (ins qfrc:$FRT, memrr:$dst),
"qvstfsx $FRT, $dst", IIC_LdStSTFD,
[(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>;
let isCodeGenOnly = 1 in
- def QVSTFSXs : XForm_8<31, 647,
+ def QVSTFSXs : XForm_8_memOp<31, 647,
(outs), (ins qsrc:$FRT, memrr:$dst),
"qvstfsx $FRT, $dst", IIC_LdStSTFD,
[(store qsrc:$FRT, xoaddr:$dst)]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
index cc3a4d20a9b2..96649efdc1bc 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -12,14 +12,56 @@
//
//===----------------------------------------------------------------------===//
+class EFXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-31} = xo;
+}
+
+class EFXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ EFXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+ let RB = 0;
+}
+
+class EFXForm_2a<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ EFXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+ let RA = 0;
+}
+
+class EFXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> :
+ I<4, OOL, IOL, asmstr, itin> {
+ bits<3> crD;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Inst{6-8} = crD;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-31} = xo;
+}
+
class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
bits<5> RT;
bits<5> RA;
bits<5> RB;
- let Pattern = [];
-
+ let Pattern = pattern;
+
let Inst{6-10} = RT;
let Inst{11-15} = RA;
let Inst{16-20} = RB;
@@ -27,18 +69,26 @@ class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
}
class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : EVXForm_1<xo, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ EVXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
let RB = 0;
}
+class EVXForm_2a<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ EVXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+ let RA = 0;
+}
+
class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
bits<3> crD;
bits<5> RA;
bits<5> RB;
- let Pattern = [];
-
+ let Pattern = pattern;
+
let Inst{6-8} = crD;
let Inst{9-10} = 0;
let Inst{11-15} = RA;
@@ -46,12 +96,30 @@ class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Inst{21-31} = xo;
}
+class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
+ bits<3> crD;
+ bits<5> RA;
+ bits<5> RB;
+ bits<5> RT;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-28} = xo;
+ let Inst{29-31} = crD;
+}
+
class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+ InstrItinClass itin, list<dag> pattern> :
+ I<4, OOL, IOL, asmstr, itin> {
bits<5> RT;
bits<21> D;
- let Pattern = [];
+ let Pattern = pattern;
let Inst{6-10} = RT;
let Inst{20} = D{0};
@@ -68,380 +136,757 @@ class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Inst{21-31} = xo;
}
-let Predicates = [HasSPE], isAsmParserOnly = 1 in {
-
-def EVLDD : EVXForm_D<769, (outs gprc:$RT), (ins spe8dis:$dst),
- "evldd $RT, $dst", IIC_VecFP>;
-def EVLDW : EVXForm_D<771, (outs gprc:$RT), (ins spe8dis:$dst),
- "evldw $RT, $dst", IIC_VecFP>;
-def EVLDH : EVXForm_D<773, (outs gprc:$RT), (ins spe8dis:$dst),
- "evldh $RT, $dst", IIC_VecFP>;
-def EVLHHESPLAT : EVXForm_D<777, (outs gprc:$RT), (ins spe2dis:$dst),
- "evlhhesplat $RT, $dst", IIC_VecFP>;
-def EVLHHOUSPLAT : EVXForm_D<781, (outs gprc:$RT), (ins spe2dis:$dst),
- "evlhhousplat $RT, $dst", IIC_VecFP>;
-def EVLHHOSSPLAT : EVXForm_D<783, (outs gprc:$RT), (ins spe2dis:$dst),
- "evlhhossplat $RT, $dst", IIC_VecFP>;
-def EVLWHE : EVXForm_D<785, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhe $RT, $dst", IIC_VecFP>;
-def EVLWHOU : EVXForm_D<789, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhou $RT, $dst", IIC_VecFP>;
-def EVLWHOS : EVXForm_D<791, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhos $RT, $dst", IIC_VecFP>;
-def EVLWWSPLAT : EVXForm_D<793, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwwsplat $RT, $dst", IIC_VecFP>;
-def EVLWHSPLAT : EVXForm_D<797, (outs gprc:$RT), (ins spe4dis:$dst),
- "evlwhsplat $RT, $dst", IIC_VecFP>;
-
-def EVSTDD : EVXForm_D<801, (outs), (ins gprc:$RT, spe8dis:$dst),
- "evstdd $RT, $dst", IIC_VecFP>;
-def EVSTDH : EVXForm_D<805, (outs), (ins gprc:$RT, spe8dis:$dst),
- "evstdh $RT, $dst", IIC_VecFP>;
-def EVSTDW : EVXForm_D<803, (outs), (ins gprc:$RT, spe8dis:$dst),
- "evstdw $RT, $dst", IIC_VecFP>;
-def EVSTWHE : EVXForm_D<817, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwhe $RT, $dst", IIC_VecFP>;
-def EVSTWHO : EVXForm_D<821, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwho $RT, $dst", IIC_VecFP>;
-def EVSTWWE : EVXForm_D<825, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwwe $RT, $dst", IIC_VecFP>;
-def EVSTWWO : EVXForm_D<829, (outs), (ins gprc:$RT, spe4dis:$dst),
- "evstwwo $RT, $dst", IIC_VecFP>;
-
-def EVMRA : EVXForm_1<1220, (outs gprc:$RT), (ins gprc:$RA),
- "evmra $RT, $RA", IIC_VecFP> {
- let RB = 0;
-}
+let DecoderNamespace = "SPE", Predicates = [HasSPE] in {
def BRINC : EVXForm_1<527, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "brinc $RT, $RA, $RB", IIC_VecFP>;
-def EVABS : EVXForm_2<520, (outs gprc:$RT), (ins gprc:$RA),
- "evabs $RT, $RA", IIC_VecFP>;
-
-def EVADDIW : EVXForm_1<514, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evaddiw $RT, $RB, $RA", IIC_VecFP>;
-def EVADDSMIAAW : EVXForm_2<1225, (outs gprc:$RT), (ins gprc:$RA),
- "evaddsmiaaw $RT, $RA", IIC_VecFP>;
-def EVADDSSIAAW : EVXForm_2<1217, (outs gprc:$RT), (ins gprc:$RA),
- "evaddssiaaw $RT, $RA", IIC_VecFP>;
-def EVADDUSIAAW : EVXForm_2<1216, (outs gprc:$RT), (ins gprc:$RA),
- "evaddusiaaw $RT, $RA", IIC_VecFP>;
-def EVADDUMIAAW : EVXForm_2<1224, (outs gprc:$RT), (ins gprc:$RA),
- "evaddumiaaw $RT, $RA", IIC_VecFP>;
-def EVADDW : EVXForm_1<512, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evaddw $RT, $RA, $RB", IIC_VecFP>;
-
-def EVAND : EVXForm_1<529, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evand $RT, $RA, $RB", IIC_VecFP>;
-def EVANDC : EVXForm_1<530, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evandc $RT, $RA, $RB", IIC_VecFP>;
-
-def EVCMPEQ : EVXForm_3<564, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpeq $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPGTS : EVXForm_3<561, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpgts $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPGTU : EVXForm_3<560, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpgtu $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPLTS : EVXForm_3<563, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmplts $crD, $RA, $RB", IIC_VecFP>;
-def EVCMPLTU : EVXForm_3<562, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
- "evcmpltu $crD, $RA, $RB", IIC_VecFP>;
-
-def EVCNTLSW : EVXForm_2<526, (outs gprc:$RT), (ins gprc:$RA),
- "evcntlsw $RT, $RA", IIC_VecFP>;
-def EVCNTLZW : EVXForm_2<525, (outs gprc:$RT), (ins gprc:$RA),
- "evcntlzw $RT, $RA", IIC_VecFP>;
-
-def EVDIVWS : EVXForm_1<1222, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evdivws $RT, $RA, $RB", IIC_VecFP>;
-def EVDIVWU : EVXForm_1<1223, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evdivwu $RT, $RA, $RB", IIC_VecFP>;
-
-def EVEQV : EVXForm_1<537, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "eveqv $RT, $RA, $RB", IIC_VecFP>;
-
-def EVEXTSB : EVXForm_2<522, (outs gprc:$RT), (ins gprc:$RA),
- "evextsb $RT, $RA", IIC_VecFP>;
-def EVEXTSH : EVXForm_2<523, (outs gprc:$RT), (ins gprc:$RA),
- "evextsh $RT, $RA", IIC_VecFP>;
-
-def EVLDDX : EVXForm_1<768, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlddx $RT, $RA, $RB", IIC_VecFP>;
-def EVLDWX : EVXForm_1<770, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evldwx $RT, $RA, $RB", IIC_VecFP>;
-def EVLDHX : EVXForm_1<772, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evldhx $RT, $RA, $RB", IIC_VecFP>;
-def EVLHHESPLATX : EVXForm_1<776, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlhhesplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLHHOUSPLATX : EVXForm_1<780, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlhhousplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLHHOSSPLATX : EVXForm_1<782, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlhhossplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHEX : EVXForm_1<784, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhex $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHOUX : EVXForm_1<788, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhoux $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHOSX : EVXForm_1<790, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhosx $RT, $RA, $RB", IIC_VecFP>;
-def EVLWWSPLATX : EVXForm_1<792, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwwsplatx $RT, $RA, $RB", IIC_VecFP>;
-def EVLWHSPLATX : EVXForm_1<796, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evlwhsplatx $RT, $RA, $RB", IIC_VecFP>;
-
-def EVMERGEHI : EVXForm_1<556, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergehi $RT, $RA, $RB", IIC_VecFP>;
-def EVMERGELO : EVXForm_1<557, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergelo $RT, $RA, $RB", IIC_VecFP>;
-def EVMERGEHILO : EVXForm_1<558, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergehilo $RT, $RA, $RB", IIC_VecFP>;
-def EVMERGELOHI : EVXForm_1<559, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmergelohi $RT, $RA, $RB", IIC_VecFP>;
-
-def EVMHEGSMFAA : EVXForm_1<1323, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGSMFAN : EVXForm_1<1451, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGSMIAA : EVXForm_1<1321, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGSMIAN : EVXForm_1<1449, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegsmian $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGUMIAA : EVXForm_1<1320, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegumiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEGUMIAN : EVXForm_1<1448, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhegumian $RT, $RA, $RB", IIC_VecFP>;
-
-def EVMHESMF : EVXForm_1<1035, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMFA : EVXForm_1<1067, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMFAAW : EVXForm_1<1291, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMFANW : EVXForm_1<1419, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMI : EVXForm_1<1033, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMIA : EVXForm_1<1065, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMIAAW : EVXForm_1<1289, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESMIANW : EVXForm_1<1417, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhesmianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSF : EVXForm_1<1027, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSFA : EVXForm_1<1059, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSFAAW : EVXForm_1<1283, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSFANW : EVXForm_1<1411, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSIAAW : EVXForm_1<1281, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHESSIANW : EVXForm_1<1409, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhessianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMI : EVXForm_1<1032, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMIA : EVXForm_1<1064, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMIAAW : EVXForm_1<1288, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUMIANW : EVXForm_1<1416, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheumianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUSIAAW : EVXForm_1<1280, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheusiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHEUSIANW : EVXForm_1<1408, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmheusianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMFAA : EVXForm_1<1327, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMFAN : EVXForm_1<1455, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMIAA : EVXForm_1<1325, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGSMIAN : EVXForm_1<1453, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogsmian $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGUMIAA : EVXForm_1<1324, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogumiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOGUMIAN : EVXForm_1<1452, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhogumian $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMF : EVXForm_1<1039, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMFA : EVXForm_1<1071, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMFAAW : EVXForm_1<1295, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMFANW : EVXForm_1<1423, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMI : EVXForm_1<1037, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMIA : EVXForm_1<1069, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMIAAW : EVXForm_1<1293, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSMIANW : EVXForm_1<1421, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhosmianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSF : EVXForm_1<1031, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossf $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSFA : EVXForm_1<1063, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSFAAW : EVXForm_1<1287, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossfaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSFANW : EVXForm_1<1415, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossfanw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSIAAW : EVXForm_1<1285, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOSSIANW : EVXForm_1<1413, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhossianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMI : EVXForm_1<1036, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMIA : EVXForm_1<1068, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMIAAW : EVXForm_1<1292, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUMIANW : EVXForm_1<1420, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhoumianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUSIAAW : EVXForm_1<1284, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhousiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMHOUSIANW : EVXForm_1<1412, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmhousianw $RT, $RA, $RB", IIC_VecFP>;
-
-
-def EVMWHSMF : EVXForm_1<1103, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSMFA : EVXForm_1<1135, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSMI : EVXForm_1<1101, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSMIA : EVXForm_1<1133, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhsmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSSF : EVXForm_1<1095, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhssf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHSSFA : EVXForm_1<1127, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhssfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHUMI : EVXForm_1<1100, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWHUMIA : EVXForm_1<1132, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwhumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSMIAAW : EVXForm_1<1353, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlsmiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSMIANW : EVXForm_1<1481, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlsmianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSSIAAW : EVXForm_1<1345, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlssiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLSSIANW : EVXForm_1<1473, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlssianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMI : EVXForm_1<1096, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMIA : EVXForm_1<1128, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMIAAW : EVXForm_1<1352, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUMIANW : EVXForm_1<1480, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlumianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUSIAAW : EVXForm_1<1344, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlusiaaw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWLUSIANW : EVXForm_1<1472, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwlusianw $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMF : EVXForm_1<1115, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMFA : EVXForm_1<1147, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMFAA : EVXForm_1<1371, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMFAN : EVXForm_1<1499, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMI : EVXForm_1<1113, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMIA : EVXForm_1<1145, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMIAA : EVXForm_1<1369, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSMIAN : EVXForm_1<1497, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwsmian $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSF : EVXForm_1<1107, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssf $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSFA : EVXForm_1<1139, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssfa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSFAA : EVXForm_1<1363, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssfaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWSSFAN : EVXForm_1<1491, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwssfan $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMI : EVXForm_1<1112, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumi $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMIA : EVXForm_1<1144, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumia $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMIAA : EVXForm_1<1368, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumiaa $RT, $RA, $RB", IIC_VecFP>;
-def EVMWUMIAN : EVXForm_1<1496, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evmwumian $RT, $RA, $RB", IIC_VecFP>;
-
-
-def EVNAND : EVXForm_1<542, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evnand $RT, $RA, $RB", IIC_VecFP>;
-
-def EVNEG : EVXForm_2<521, (outs gprc:$RT), (ins gprc:$RA),
- "evneg $RT, $RA", IIC_VecFP>;
-
-def EVNOR : EVXForm_1<536, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evnor $RT, $RA, $RB", IIC_VecFP>;
-def EVOR : EVXForm_1<535, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evor $RT, $RA, $RB", IIC_VecFP>;
-def EVORC : EVXForm_1<539, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evorc $RT, $RA, $RB", IIC_VecFP>;
-
-def EVRLWI : EVXForm_1<554, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evrlwi $RT, $RA, $RB", IIC_VecFP>;
-def EVRLW : EVXForm_1<552, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evrlw $RT, $RA, $RB", IIC_VecFP>;
-
-def EVRNDW : EVXForm_2<524, (outs gprc:$RT), (ins gprc:$RA),
- "evrndw $RT, $RA", IIC_VecFP>;
-
-def EVSLWI : EVXForm_1<550, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evslwi $RT, $RA, $RB", IIC_VecFP>;
-def EVSLW : EVXForm_1<548, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evslw $RT, $RA, $RB", IIC_VecFP>;
-
-def EVSPLATFI : EVXForm_2<555, (outs gprc:$RT), (ins i32imm:$RA),
- "evsplatfi $RT, $RA", IIC_VecFP>;
-def EVSPLATI : EVXForm_2<553, (outs gprc:$RT), (ins i32imm:$RA),
- "evsplati $RT, $RA", IIC_VecFP>;
-
-def EVSRWIS : EVXForm_1<547, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evsrwis $RT, $RA, $RB", IIC_VecFP>;
-def EVSRWIU : EVXForm_1<546, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
- "evsrwiu $RT, $RA, $RB", IIC_VecFP>;
-def EVSRWS : EVXForm_1<545, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evsrws $RT, $RA, $RB", IIC_VecFP>;
-def EVSRWU : EVXForm_1<544, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evsrwu $RT, $RA, $RB", IIC_VecFP>;
-
-def EVSTDDX : EVXForm_1<800, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstddx $RT, $RA, $RB", IIC_VecFP>;
-def EVSTDHX : EVXForm_1<804, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstdhx $RT, $RA, $RB", IIC_VecFP>;
-def EVSTDWX : EVXForm_1<802, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstdwx $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWHEX : EVXForm_1<816, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwhex $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWHOX : EVXForm_1<820, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwhox $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWWEX : EVXForm_1<824, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwwex $RT, $RA, $RB", IIC_VecFP>;
-def EVSTWWOX : EVXForm_1<828, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
- "evstwwox $RT, $RA, $RB", IIC_VecFP>;
-
-def EVSUBFSSIAAW : EVXForm_2<1219, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfssiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFSMIAAW : EVXForm_2<1227, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfsmiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFUMIAAW : EVXForm_2<1226, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfumiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFUSIAAW : EVXForm_2<1218, (outs gprc:$RT), (ins gprc:$RA),
- "evsubfusiaaw $RT, $RA", IIC_VecFP>;
-def EVSUBFW : EVXForm_1<516, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evsubfw $RT, $RA, $RB", IIC_VecFP>;
-def EVSUBIFW : EVXForm_1<518, (outs gprc:$RT), (ins u5imm:$RA, gprc:$RB),
- "evsubifw $RT, $RA, $RB", IIC_VecFP>;
-def EVXOR : EVXForm_1<534, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "evxor $RT, $RA, $RB", IIC_VecFP>;
+ "brinc $RT, $RA, $RB", IIC_IntSimple, []>;
+
+// Double-precision floating point
+def EFDABS : EFXForm_2<740, (outs sperc:$RT), (ins sperc:$RA),
+ "efdabs $RT, $RA", IIC_FPDGeneral,
+ [(set f64:$RT, (fabs f64:$RA))]>;
+
+def EFDADD : EFXForm_1<736, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efdadd $RT, $RA, $RB", IIC_FPAddSub,
+ [(set f64:$RT, (fadd f64:$RA, f64:$RB))]>;
+
+def EFDCFS : EFXForm_2a<751, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdcfs $RT, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (fpextend f32:$RB))]>;
+
+def EFDCFSF : EFXForm_2a<755, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdcfsf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCFSI : EFXForm_2a<753, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfsi $RT, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (sint_to_fp i32:$RB))]>;
+
+def EFDCFSID : EFXForm_2a<739, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfsid $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCFUF : EFXForm_2a<754, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdcfuf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCFUI : EFXForm_2a<752, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfui $RT, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (uint_to_fp i32:$RB))]>;
+
+def EFDCFUID : EFXForm_2a<738, (outs sperc:$RT), (ins gprc:$RB),
+ "efdcfuid $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+let isCompare = 1 in {
+def EFDCMPEQ : EFXForm_3<750, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdcmpeq $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDCMPGT : EFXForm_3<748, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdcmpgt $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDCMPLT : EFXForm_3<749, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdcmplt $crD, $RA, $RB", IIC_FPDGeneral>;
+}
+
+def EFDCTSF : EFXForm_2a<759, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdctsf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCTSI : EFXForm_2a<757, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctsi $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTSIDZ : EFXForm_2a<747, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctsidz $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTSIZ : EFXForm_2a<762, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctsiz $RT, $RB", IIC_FPDGeneral,
+ [(set i32:$RT, (fp_to_sint f64:$RB))]>;
+
+def EFDCTUF : EFXForm_2a<758, (outs sperc:$RT), (ins spe4rc:$RB),
+ "efdctuf $RT, $RB", IIC_FPDGeneral, []>;
+
+def EFDCTUI : EFXForm_2a<756, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctui $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTUIDZ : EFXForm_2a<746, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctuidz $RT, $RB", IIC_FPDGeneral,
+ []>;
+
+def EFDCTUIZ : EFXForm_2a<760, (outs gprc:$RT), (ins sperc:$RB),
+ "efdctuiz $RT, $RB", IIC_FPDGeneral,
+ [(set i32:$RT, (fp_to_uint f64:$RB))]>;
+
+def EFDDIV : EFXForm_1<745, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efddiv $RT, $RA, $RB", IIC_FPDivD,
+ [(set f64:$RT, (fdiv f64:$RA, f64:$RB))]>;
+
+def EFDMUL : EFXForm_1<744, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efdmul $RT, $RA, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (fmul f64:$RA, f64:$RB))]>;
+
+def EFDNABS : EFXForm_2<741, (outs sperc:$RT), (ins sperc:$RA),
+ "efdnabs $RT, $RA", IIC_FPDGeneral,
+ [(set f64:$RT, (fneg (fabs f64:$RA)))]>;
+
+def EFDNEG : EFXForm_2<742, (outs sperc:$RT), (ins sperc:$RA),
+ "efdneg $RT, $RA", IIC_FPDGeneral,
+ [(set f64:$RT, (fneg f64:$RA))]>;
+
+def EFDSUB : EFXForm_1<737, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "efdsub $RT, $RA, $RB", IIC_FPDGeneral,
+ [(set f64:$RT, (fsub f64:$RA, f64:$RB))]>;
+
+let isCompare = 1 in {
+def EFDTSTEQ : EFXForm_3<766, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdtsteq $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDTSTGT : EFXForm_3<764, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdtstgt $crD, $RA, $RB", IIC_FPDGeneral>;
+def EFDTSTLT : EFXForm_3<765, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efdtstlt $crD, $RA, $RB", IIC_FPDGeneral>;
+}
+
+// Single-precision floating point
+def EFSABS : EFXForm_2<708, (outs spe4rc:$RT), (ins spe4rc:$RA),
+ "efsabs $RT, $RA", IIC_FPSGeneral,
+ [(set f32:$RT, (fabs f32:$RA))]>;
+
+def EFSADD : EFXForm_1<704, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efsadd $RT, $RA, $RB", IIC_FPAddSub,
+ [(set f32:$RT, (fadd f32:$RA, f32:$RB))]>;
+
+def EFSCFD : EFXForm_2a<719, (outs spe4rc:$RT), (ins sperc:$RB),
+ "efscfd $RT, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (fpround f64:$RB))]>;
+
+def EFSCFSF : EFXForm_2a<723, (outs spe4rc:$RT), (ins spe4rc:$RB),
+ "efscfsf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCFSI : EFXForm_2a<721, (outs spe4rc:$RT), (ins gprc:$RB),
+ "efscfsi $RT, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (sint_to_fp i32:$RB))]>;
+
+def EFSCFUF : EFXForm_2a<722, (outs spe4rc:$RT), (ins spe4rc:$RB),
+ "efscfuf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCFUI : EFXForm_2a<720, (outs spe4rc:$RT), (ins gprc:$RB),
+ "efscfui $RT, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (uint_to_fp i32:$RB))]>;
+
+let isCompare = 1 in {
+def EFSCMPEQ : EFXForm_3<718, (outs crrc:$crD), (ins spe4rc:$RA, spe4rc:$RB),
+ "efscmpeq $crD, $RA, $RB", IIC_FPCompare>;
+def EFSCMPGT : EFXForm_3<716, (outs crrc:$crD), (ins spe4rc:$RA, spe4rc:$RB),
+ "efscmpgt $crD, $RA, $RB", IIC_FPCompare>;
+def EFSCMPLT : EFXForm_3<717, (outs crrc:$crD), (ins spe4rc:$RA, spe4rc:$RB),
+ "efscmplt $crD, $RA, $RB", IIC_FPCompare>;
+}
+
+def EFSCTSF : EFXForm_2a<727, (outs spe4rc:$RT), (ins spe4rc:$RB),
+ "efsctsf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCTSI : EFXForm_2a<725, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctsi $RT, $RB", IIC_FPSGeneral,
+ []>;
+
+def EFSCTSIZ : EFXForm_2a<730, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctsiz $RT, $RB", IIC_FPSGeneral,
+ [(set i32:$RT, (fp_to_sint f32:$RB))]>;
+
+def EFSCTUF : EFXForm_2a<726, (outs spe4rc:$RT), (ins spe4rc:$RB),
+ "efsctuf $RT, $RB", IIC_FPSGeneral, []>;
+
+def EFSCTUI : EFXForm_2a<724, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctui $RT, $RB", IIC_FPSGeneral,
+ []>;
+
+def EFSCTUIZ : EFXForm_2a<728, (outs gprc:$RT), (ins spe4rc:$RB),
+ "efsctuiz $RT, $RB", IIC_FPSGeneral,
+ [(set i32:$RT, (fp_to_uint f32:$RB))]>;
+
+def EFSDIV : EFXForm_1<713, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efsdiv $RT, $RA, $RB", IIC_FPDivD,
+ [(set f32:$RT, (fdiv f32:$RA, f32:$RB))]>;
+
+def EFSMUL : EFXForm_1<712, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efsmul $RT, $RA, $RB", IIC_FPGeneral,
+ [(set f32:$RT, (fmul f32:$RA, f32:$RB))]>;
+
+def EFSNABS : EFXForm_2<709, (outs spe4rc:$RT), (ins spe4rc:$RA),
+ "efsnabs $RT, $RA", IIC_FPGeneral,
+ [(set f32:$RT, (fneg (fabs f32:$RA)))]>;
+
+def EFSNEG : EFXForm_2<710, (outs spe4rc:$RT), (ins spe4rc:$RA),
+ "efsneg $RT, $RA", IIC_FPGeneral,
+ [(set f32:$RT, (fneg f32:$RA))]>;
+
+def EFSSUB : EFXForm_1<705, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
+ "efssub $RT, $RA, $RB", IIC_FPSGeneral,
+ [(set f32:$RT, (fsub f32:$RA, f32:$RB))]>;
+
+let isCompare = 1 in {
+def EFSTSTEQ : EFXForm_3<734, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efststeq $crD, $RA, $RB", IIC_FPCompare>;
+def EFSTSTGT : EFXForm_3<732, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efststgt $crD, $RA, $RB", IIC_FPCompare>;
+def EFSTSTLT : EFXForm_3<733, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "efststlt $crD, $RA, $RB", IIC_FPCompare>;
+}
+
+// SPE Vector operations
+
+def EVABS : EVXForm_2<520, (outs sperc:$RT), (ins sperc:$RA),
+ "evabs $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVADDIW : EVXForm_1<514, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evaddiw $RT, $RB, $RA", IIC_VecGeneral, []>;
+def EVADDSMIAAW : EVXForm_2<1225, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddsmiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDSSIAAW : EVXForm_2<1217, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddssiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDUSIAAW : EVXForm_2<1216, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddusiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDUMIAAW : EVXForm_2<1224, (outs sperc:$RT), (ins sperc:$RA),
+ "evaddumiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVADDW : EVXForm_1<512, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evaddw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVAND : EVXForm_1<529, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evand $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVANDC : EVXForm_1<530, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evandc $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+let isCompare = 1 in {
+def EVCMPEQ : EVXForm_3<564, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpeq $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPGTS : EVXForm_3<561, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpgts $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPGTU : EVXForm_3<560, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpgtu $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPLTS : EVXForm_3<563, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmplts $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVCMPLTU : EVXForm_3<562, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evcmpltu $crD, $RA, $RB", IIC_VecGeneral, []>;
+}
+
+def EVCNTLSW : EVXForm_2<526, (outs sperc:$RT), (ins sperc:$RA),
+ "evcntlsw $RT, $RA", IIC_VecGeneral, []>;
+def EVCNTLZW : EVXForm_2<525, (outs sperc:$RT), (ins sperc:$RA),
+ "evcntlzw $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVDIVWS : EVXForm_1<1222, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evdivws $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVDIVWU : EVXForm_1<1223, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evdivwu $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+
+def EVEQV : EVXForm_1<537, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "eveqv $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVEXTSB : EVXForm_2<522, (outs sperc:$RT), (ins sperc:$RA),
+ "evextsb $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVEXTSH : EVXForm_2<523, (outs sperc:$RT), (ins sperc:$RA),
+ "evextsh $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVFSABS : EVXForm_2<644, (outs sperc:$RT), (ins sperc:$RA),
+ "evfsabs $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVFSADD : EVXForm_1<640, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfsadd $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVFSCFSF : EVXForm_2a<659, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfsf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCFSI : EVXForm_2a<657, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfsi $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCFUF : EVXForm_2a<658, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfuf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCFUI : EVXForm_2a<656, (outs sperc:$RT), (ins sperc:$RB),
+ "evfscfui $RT, $RB", IIC_VecComplex,
+ []>;
+let isCompare = 1 in {
+def EVFSCMPEQ : EVXForm_3<654, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfscmpeq $crD, $RA, $RB", IIC_FPSGeneral, []>;
+def EVFSCMPGT : EVXForm_3<652, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfscmpgt $crD, $RA, $RB", IIC_FPSGeneral, []>;
+def EVFSCMPLT : EVXForm_3<653, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfscmplt $crD, $RA, $RB", IIC_FPSGeneral, []>;
+}
+
+def EVFSCTSF : EVXForm_2a<663, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCTSI : EVXForm_2a<661, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsi $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCTSIZ : EVXForm_2a<666, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsiz $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCTUF : EVXForm_2a<662, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsf $RT, $RB", IIC_VecComplex, []>;
+def EVFSCTUI : EVXForm_2a<660, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctui $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSCTUIZ : EVXForm_2a<664, (outs sperc:$RT), (ins sperc:$RB),
+ "evfsctsiz $RT, $RB", IIC_VecComplex,
+ []>;
+def EVFSDIV : EVXForm_1<649, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfsdiv $RT, $RA, $RB", IIC_FPDivD,
+ []>;
+def EVFSMUL : EVXForm_1<648, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfsmul $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVFSNABS : EVXForm_2<645, (outs sperc:$RT), (ins sperc:$RA),
+ "evfsnabs $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVFSNEG : EVXForm_2<646, (outs sperc:$RT), (ins sperc:$RA),
+ "evfsneg $RT, $RA", IIC_VecGeneral,
+ []>;
+def EVFSSUB : EVXForm_1<641, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evfssub $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+
+let isCompare = 1 in {
+def EVFSTSTEQ : EVXForm_3<670, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfststeq $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVFSTSTGT : EVXForm_3<668, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfststgt $crD, $RA, $RB", IIC_VecGeneral, []>;
+def EVFSTSTLT : EVXForm_3<669, (outs crrc:$crD), (ins sperc:$RA, sperc:$RB),
+ "evfststlt $crD, $RA, $RB", IIC_VecGeneral, []>;
+}
+
+def EVLDD : EVXForm_D<769, (outs sperc:$RT), (ins spe8dis:$dst),
+ "evldd $RT, $dst", IIC_LdStLoad,
+ [(set f64:$RT, (load iaddr:$dst))]>;
+def EVLDDX : EVXForm_1<768, (outs sperc:$RT), (ins memrr:$src),
+ "evlddx $RT, $src", IIC_LdStLoad,
+ [(set f64:$RT, (load xaddr:$src))]>;
+def EVLDH : EVXForm_D<773, (outs sperc:$RT), (ins spe8dis:$dst),
+ "evldh $RT, $dst", IIC_LdStLoad, []>;
+def EVLDHX : EVXForm_1<772, (outs sperc:$RT), (ins memrr:$src),
+ "evldhx $RT, $src", IIC_LdStLoad, []>;
+def EVLDW : EVXForm_D<771, (outs sperc:$RT), (ins spe8dis:$dst),
+ "evldw $RT, $dst", IIC_LdStLoad,
+ []>;
+def EVLDWX : EVXForm_1<770, (outs sperc:$RT), (ins memrr:$src),
+ "evldwx $RT, $src", IIC_LdStLoad,
+ []>;
+def EVLHHESPLAT : EVXForm_D<777, (outs sperc:$RT), (ins spe2dis:$dst),
+ "evlhhesplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLHHESPLATX : EVXForm_1<776, (outs sperc:$RT), (ins memrr:$src),
+ "evlhhesplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLHHOUSPLAT : EVXForm_D<781, (outs sperc:$RT), (ins spe2dis:$dst),
+ "evlhhousplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLHHOUSPLATX : EVXForm_1<780, (outs sperc:$RT), (ins memrr:$src),
+ "evlhhousplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLHHOSSPLAT : EVXForm_D<783, (outs sperc:$RT), (ins spe2dis:$dst),
+ "evlhhossplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLHHOSSPLATX : EVXForm_1<782, (outs sperc:$RT), (ins memrr:$src),
+ "evlhhossplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLWHE : EVXForm_D<785, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhe $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHEX : EVXForm_1<784, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhex $RT, $src", IIC_LdStLoad, []>;
+def EVLWHOS : EVXForm_D<791, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhos $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHOSX : EVXForm_1<790, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhosx $RT, $src", IIC_LdStLoad, []>;
+def EVLWHOU : EVXForm_D<789, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhou $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHOUX : EVXForm_1<788, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhoux $RT, $src", IIC_LdStLoad, []>;
+def EVLWHSPLAT : EVXForm_D<797, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwhsplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLWHSPLATX : EVXForm_1<796, (outs sperc:$RT), (ins memrr:$src),
+ "evlwhsplatx $RT, $src", IIC_LdStLoad, []>;
+def EVLWWSPLAT : EVXForm_D<793, (outs sperc:$RT), (ins spe4dis:$dst),
+ "evlwwsplat $RT, $dst", IIC_LdStLoad, []>;
+def EVLWWSPLATX : EVXForm_1<792, (outs sperc:$RT), (ins memrr:$src),
+ "evlwwsplatx $RT, $src", IIC_LdStLoad, []>;
+
+def EVMERGEHI : EVXForm_1<556, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergehi $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergelo $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVMERGEHILO : EVXForm_1<558, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergehilo $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVMERGELOHI : EVXForm_1<559, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmergelohi $RT, $RA, $RB", IIC_VecGeneral, []>;
+
+def EVMHEGSMFAA : EVXForm_1<1323, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGSMFAN : EVXForm_1<1451, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGSMIAA : EVXForm_1<1321, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGSMIAN : EVXForm_1<1449, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegsmian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGUMIAA : EVXForm_1<1320, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegumiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEGUMIAN : EVXForm_1<1448, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhegumian $RT, $RA, $RB", IIC_VecComplex, []>;
+
+def EVMHESMF : EVXForm_1<1035, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMFA : EVXForm_1<1067, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMFAAW : EVXForm_1<1291, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMFANW : EVXForm_1<1419, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMI : EVXForm_1<1033, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMIA : EVXForm_1<1065, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMIAAW : EVXForm_1<1289, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESMIANW : EVXForm_1<1417, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhesmianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSF : EVXForm_1<1027, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSFA : EVXForm_1<1059, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSFAAW : EVXForm_1<1283, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSFANW : EVXForm_1<1411, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSIAAW : EVXForm_1<1281, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHESSIANW : EVXForm_1<1409, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhessianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMI : EVXForm_1<1032, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMIA : EVXForm_1<1064, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMIAAW : EVXForm_1<1288, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUMIANW : EVXForm_1<1416, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheumianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUSIAAW : EVXForm_1<1280, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheusiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHEUSIANW : EVXForm_1<1408, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmheusianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMFAA : EVXForm_1<1327, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMFAN : EVXForm_1<1455, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMIAA : EVXForm_1<1325, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGSMIAN : EVXForm_1<1453, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogsmian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGUMIAA : EVXForm_1<1324, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogumiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOGUMIAN : EVXForm_1<1452, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhogumian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMF : EVXForm_1<1039, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMFA : EVXForm_1<1071, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMFAAW : EVXForm_1<1295, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMFANW : EVXForm_1<1423, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMI : EVXForm_1<1037, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMIA : EVXForm_1<1069, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMIAAW : EVXForm_1<1293, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSMIANW : EVXForm_1<1421, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhosmianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSF : EVXForm_1<1031, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSFA : EVXForm_1<1063, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSFAAW : EVXForm_1<1287, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossfaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSFANW : EVXForm_1<1415, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossfanw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSIAAW : EVXForm_1<1285, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOSSIANW : EVXForm_1<1413, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhossianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMI : EVXForm_1<1036, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMIA : EVXForm_1<1068, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMIAAW : EVXForm_1<1292, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUMIANW : EVXForm_1<1420, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhoumianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUSIAAW : EVXForm_1<1284, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhousiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMHOUSIANW : EVXForm_1<1412, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmhousianw $RT, $RA, $RB", IIC_VecComplex, []>;
+
+def EVMRA : EVXForm_2<1220, (outs sperc:$RT), (ins sperc:$RA),
+ "evmra $RT, $RA", IIC_VecComplex, []>;
+
+def EVMWHSMF : EVXForm_1<1103, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSMFA : EVXForm_1<1135, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSMI : EVXForm_1<1101, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSMIA : EVXForm_1<1133, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhsmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSSF : EVXForm_1<1095, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhssf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHSSFA : EVXForm_1<1127, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhssfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHUMI : EVXForm_1<1100, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWHUMIA : EVXForm_1<1132, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwhumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSMIAAW : EVXForm_1<1353, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlsmiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSMIANW : EVXForm_1<1481, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlsmianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSSIAAW : EVXForm_1<1345, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlssiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLSSIANW : EVXForm_1<1473, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlssianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUMI : EVXForm_1<1096, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumi $RT, $RA, $RB", IIC_VecComplex,
+ []>;
+def EVMWLUMIA : EVXForm_1<1128, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUMIAAW : EVXForm_1<1352, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUMIANW : EVXForm_1<1480, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlumianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUSIAAW : EVXForm_1<1344, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlusiaaw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWLUSIANW : EVXForm_1<1472, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwlusianw $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMF : EVXForm_1<1115, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMFA : EVXForm_1<1147, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMFAA : EVXForm_1<1371, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMFAN : EVXForm_1<1499, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMI : EVXForm_1<1113, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMIA : EVXForm_1<1145, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMIAA : EVXForm_1<1369, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSMIAN : EVXForm_1<1497, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwsmian $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSF : EVXForm_1<1107, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssf $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSFA : EVXForm_1<1139, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssfa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSFAA : EVXForm_1<1363, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssfaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWSSFAN : EVXForm_1<1491, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwssfan $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMI : EVXForm_1<1112, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumi $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMIA : EVXForm_1<1144, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumia $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMIAA : EVXForm_1<1368, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumiaa $RT, $RA, $RB", IIC_VecComplex, []>;
+def EVMWUMIAN : EVXForm_1<1496, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evmwumian $RT, $RA, $RB", IIC_VecComplex, []>;
+
+
+def EVNAND : EVXForm_1<542, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evnand $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVNEG : EVXForm_2<521, (outs sperc:$RT), (ins sperc:$RA),
+ "evneg $RT, $RA", IIC_VecGeneral,
+ []>;
+
+def EVNOR : EVXForm_1<536, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evnor $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVOR : EVXForm_1<535, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evor $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVORC : EVXForm_1<539, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evorc $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVRLWI : EVXForm_1<554, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evrlwi $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVRLW : EVXForm_1<552, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evrlw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVRNDW : EVXForm_2<524, (outs sperc:$RT), (ins sperc:$RA),
+ "evrndw $RT, $RA", IIC_VecGeneral, []>;
+
+def EVSEL : EVXForm_4<79, (outs sperc:$RT),
+ (ins sperc:$RA, sperc:$RB, crrc:$crD),
+ "evsel crD,$RT,$RA,$RB", IIC_VecGeneral, []>;
+
+def EVSLWI : EVXForm_1<550, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evslwi $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVSLW : EVXForm_1<548, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evslw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVSPLATFI : EVXForm_2<555, (outs sperc:$RT), (ins s5imm:$RA),
+ "evsplatfi $RT, $RA", IIC_VecGeneral, []>;
+def EVSPLATI : EVXForm_2<553, (outs sperc:$RT), (ins s5imm:$RA),
+ "evsplati $RT, $RA", IIC_VecGeneral, []>;
+
+def EVSRWIS : EVXForm_1<547, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evsrwis $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVSRWIU : EVXForm_1<546, (outs sperc:$RT), (ins sperc:$RA, u5imm:$RB),
+ "evsrwiu $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVSRWS : EVXForm_1<545, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evsrws $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVSRWU : EVXForm_1<544, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evsrwu $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+def EVSTDD : EVXForm_D<801, (outs), (ins sperc:$RT, spe8dis:$dst),
+ "evstdd $RT, $dst", IIC_LdStStore,
+ [(store f64:$RT, iaddr:$dst)]>;
+def EVSTDDX : EVXForm_1<800, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstddx $RT, $dst", IIC_LdStStore,
+ [(store f64:$RT, xaddr:$dst)]>;
+def EVSTDH : EVXForm_D<805, (outs), (ins sperc:$RT, spe8dis:$dst),
+ "evstdh $RT, $dst", IIC_LdStStore, []>;
+def EVSTDHX : EVXForm_1<804, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstdhx $RT, $dst", IIC_LdStStore, []>;
+def EVSTDW : EVXForm_D<803, (outs), (ins sperc:$RT, spe8dis:$dst),
+ "evstdw $RT, $dst", IIC_LdStStore,
+ []>;
+def EVSTDWX : EVXForm_1<802, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstdwx $RT, $dst", IIC_LdStStore,
+ []>;
+def EVSTWHE : EVXForm_D<817, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwhe $RT, $dst", IIC_LdStStore, []>;
+def EVSTWHEX : EVXForm_1<816, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwhex $RT, $dst", IIC_LdStStore, []>;
+def EVSTWHO : EVXForm_D<821, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwho $RT, $dst", IIC_LdStStore, []>;
+def EVSTWHOX : EVXForm_1<820, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwhox $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWE : EVXForm_D<825, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwwe $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWEX : EVXForm_1<824, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwwex $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWO : EVXForm_D<829, (outs), (ins sperc:$RT, spe4dis:$dst),
+ "evstwwo $RT, $dst", IIC_LdStStore, []>;
+def EVSTWWOX : EVXForm_1<828, (outs), (ins sperc:$RT, memrr:$dst),
+ "evstwwox $RT, $dst", IIC_LdStStore, []>;
+
+def EVSUBFSSIAAW : EVXForm_2<1219, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfssiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFSMIAAW : EVXForm_2<1227, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfsmiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFUMIAAW : EVXForm_2<1226, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfumiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFUSIAAW : EVXForm_2<1218, (outs sperc:$RT), (ins sperc:$RA),
+ "evsubfusiaaw $RT, $RA", IIC_VecComplex, []>;
+def EVSUBFW : EVXForm_1<516, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evsubfw $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+def EVSUBIFW : EVXForm_1<518, (outs sperc:$RT), (ins u5imm:$RA, sperc:$RB),
+ "evsubifw $RT, $RA, $RB", IIC_VecGeneral, []>;
+def EVXOR : EVXForm_1<534, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+ "evxor $RT, $RA, $RB", IIC_VecGeneral,
+ []>;
+
+let isAsmParserOnly = 1 in {
+// Identical to the integer Load/Stores, but to handle floats
+def SPELWZ : DForm_1<32, (outs spe4rc:$rD), (ins memri:$src),
+ "lwz $rD, $src", IIC_LdStLoad,
+ [(set f32:$rD, (load iaddr:$src))]>;
+def SPELWZX : XForm_1<31, 23, (outs spe4rc:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", IIC_LdStLoad,
+ [(set f32:$rD, (load xaddr:$src))]>;
+def SPESTW : DForm_1<36, (outs), (ins spe4rc:$rS, memri:$src),
+ "stw $rS, $src", IIC_LdStStore,
+ [(store f32:$rS, iaddr:$src)]>;
+def SPESTWX : XForm_8<31, 151, (outs), (ins spe4rc:$rS, memrr:$dst),
+ "stwx $rS, $dst", IIC_LdStStore,
+ [(store f32:$rS, xaddr:$dst)]>;
+}
} // HasSPE
+
+let Predicates = [HasSPE] in {
+def : Pat<(f64 (extloadf32 iaddr:$src)),
+ (COPY_TO_REGCLASS (SPELWZ iaddr:$src), SPERC)>;
+def : Pat<(f64 (extloadf32 xaddr:$src)),
+ (COPY_TO_REGCLASS (SPELWZX xaddr:$src), SPERC)>;
+
+def : Pat<(f64 (fpextend f32:$src)),
+ (COPY_TO_REGCLASS $src, SPERC)>;
+}
+
+let Predicates = [HasSPE] in {
+ let usesCustomInserter = 1 in {
+def SELECT_CC_SPE4 : Pseudo<(outs spe4rc:$dst),
+ (ins crrc:$cond, spe4rc:$T, spe4rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_SPE4",
+ []>;
+def SELECT_CC_SPE : Pseudo<(outs sperc:$dst),
+ (ins crrc:$cond, sperc:$T, sperc:$F, i32imm:$BROPC),
+ "#SELECT_CC_SPE",
+ []>;
+def SELECT_SPE4 : Pseudo<(outs spe4rc:$dst), (ins crbitrc:$cond,
+ spe4rc:$T, spe4rc:$F), "#SELECT_SPE4",
+ [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
+def SELECT_SPE : Pseudo<(outs sperc:$dst), (ins crbitrc:$cond,
+ sperc:$T, sperc:$F), "#SELECT_SPE",
+ [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+ }
+
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+ (SELECT_SPE4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
+ (SELECT_SPE4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+ (SELECT_SPE4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
+ (SELECT_SPE4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+ (SELECT_SPE4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+ (SELECT_SPE4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
+ (SELECT_SPE4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+ (SELECT_SPE4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
+ (SELECT_SPE4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+ (SELECT_SPE4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+ (SELECT_SPE (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
+ (SELECT_SPE (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+ (SELECT_SPE (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
+ (SELECT_SPE (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+ (SELECT_SPE (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+ (SELECT_SPE (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
+ (SELECT_SPE (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+ (SELECT_SPE (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
+ (SELECT_SPE (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+ (SELECT_SPE (CRXOR $lhs, $rhs), $tval, $fval)>;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 6f719784eb7c..ffba0e5aadb5 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -126,29 +126,29 @@ let Uses = [RM] in {
// Load indexed instructions
let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
- def LXSDX : XX1Form<31, 588,
+ def LXSDX : XX1Form_memOp<31, 588,
(outs vsfrc:$XT), (ins memrr:$src),
"lxsdx $XT, $src", IIC_LdStLFD,
- [(set f64:$XT, (load xoaddr:$src))]>;
+ []>;
// Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later
let isPseudo = 1, CodeSize = 3 in
- def XFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ def XFLOADf64 : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#XFLOADf64",
[(set f64:$XT, (load xoaddr:$src))]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
- def LXVD2X : XX1Form<31, 844,
+ def LXVD2X : XX1Form_memOp<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
[(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
- def LXVDSX : XX1Form<31, 332,
+ def LXVDSX : XX1Form_memOp<31, 332,
(outs vsrc:$XT), (ins memrr:$src),
"lxvdsx $XT, $src", IIC_LdStLFD, []>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
- def LXVW4X : XX1Form<31, 780,
+ def LXVW4X : XX1Form_memOp<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
[]>;
@@ -157,26 +157,26 @@ let Uses = [RM] in {
// Store indexed instructions
let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
- def STXSDX : XX1Form<31, 716,
+ def STXSDX : XX1Form_memOp<31, 716,
(outs), (ins vsfrc:$XT, memrr:$dst),
"stxsdx $XT, $dst", IIC_LdStSTFD,
- [(store f64:$XT, xoaddr:$dst)]>;
+ []>;
// Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later
let isPseudo = 1, CodeSize = 3 in
- def XFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst),
+ def XFSTOREf64 : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#XFSTOREf64",
[(store f64:$XT, xoaddr:$dst)]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
// The behaviour of this instruction is endianness-specific so we provide no
// pattern to match it without considering endianness.
- def STXVD2X : XX1Form<31, 972,
+ def STXVD2X : XX1Form_memOp<31, 972,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvd2x $XT, $dst", IIC_LdStSTFD,
[]>;
- def STXVW4X : XX1Form<31, 908,
+ def STXVW4X : XX1Form_memOp<31, 908,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
[]>;
@@ -1200,6 +1200,7 @@ def ScalarLoads {
*/
def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
+def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
let Predicates = [HasP8Vector] in {
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let isCommutable = 1, UseVSXReg = 1 in {
@@ -1226,11 +1227,11 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
// VSX scalar loads introduced in ISA 2.07
let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
- def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
+ def LXSSPX : XX1Form_memOp<31, 524, (outs vssrc:$XT), (ins memrr:$src),
"lxsspx $XT, $src", IIC_LdStLFD, []>;
- def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
+ def LXSIWAX : XX1Form_memOp<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
"lxsiwax $XT, $src", IIC_LdStLFD, []>;
- def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
+ def LXSIWZX : XX1Form_memOp<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
"lxsiwzx $XT, $src", IIC_LdStLFD, []>;
// Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it
@@ -1238,15 +1239,15 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let isPseudo = 1 in {
// Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later
let CodeSize = 3 in
- def XFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrr:$src),
+ def XFLOADf32 : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src),
"#XFLOADf32",
[(set f32:$XT, (load xoaddr:$src))]>;
// Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later
- def LIWAX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWAX",
[(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
// Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later
- def LIWZX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src),
+ def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWZX",
[(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
}
@@ -1255,9 +1256,9 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
// VSX scalar stores introduced in ISA 2.07
let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
- def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
+ def STXSSPX : XX1Form_memOp<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
"stxsspx $XT, $dst", IIC_LdStSTFD, []>;
- def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
+ def STXSIWX : XX1Form_memOp<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
"stxsiwx $XT, $dst", IIC_LdStSTFD, []>;
// Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it
@@ -1265,11 +1266,11 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let isPseudo = 1 in {
// Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later
let CodeSize = 3 in
- def XFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrr:$dst),
+ def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst),
"#XFSTOREf32",
[(store f32:$XT, xoaddr:$dst)]>;
// Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later
- def STIWX : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst),
+ def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#STIWX",
[(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
}
@@ -1278,7 +1279,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
def : Pat<(f64 (extloadf32 xoaddr:$src)),
(COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
- def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))),
+ def : Pat<(f32 (fpround (f64 (extloadf32 xoaddr:$src)))),
(f32 (XFLOADf32 xoaddr:$src))>;
def : Pat<(f64 (fpextend f32:$src)),
(COPY_TO_REGCLASS $src, VSFRC)>;
@@ -1325,6 +1326,9 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(outs vssrc:$XT), (ins vssrc:$XB),
"xsresp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfre f32:$XB))]>;
+ def XSRSP : XX2Form<60, 281,
+ (outs vssrc:$XT), (ins vsfrc:$XB),
+ "xsrsp $XT, $XB", IIC_VecFP, []>;
def XSSQRTSP : XX2Form<60, 11,
(outs vssrc:$XT), (ins vssrc:$XB),
"xssqrtsp $XT, $XB", IIC_FPSqrtS,
@@ -1432,28 +1436,57 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
} // UseVSXReg = 1
let Predicates = [IsLittleEndian] in {
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f32 (XSCVSXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS
+ (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f32 (XSCVUXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS
+ (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
}
let Predicates = [IsBigEndian] in {
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVSXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
- def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ def : Pat<(f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
(f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
(f32 (XSCVUXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ def : Pat<(f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
}
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
(v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>;
+
+ // Instructions for converting float to i64 feeding a store.
+ let Predicates = [NoP9Vector] in {
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), xoaddr:$dst)>;
+ }
+
+ // Instructions for converting float to i32 feeding a store.
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 4),
+ (STIWX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4),
+ (STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+
} // AddedComplexity = 400
} // HasP8Vector
@@ -1614,11 +1647,11 @@ def VectorExtractions {
This is accomplished by inverting the bits of the index and AND-ing
with 0x8 (i.e. clearing all bits of the index and inverting bit 60).
*/
- dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx));
+ dag LE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDC8 (LI8 8), $Idx)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC);
+ dag LE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, LE_VBYTE_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1646,11 +1679,12 @@ def VectorExtractions {
AND with 0x4 (i.e. clear all bits of the index and invert bit 61).
Of course, the shift is still by 8 bytes, so we must multiply by 2.
*/
- dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62));
+ dag LE_VHALF_PERM_VEC =
+ (v16i8 (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC);
+ dag LE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, LE_VHALF_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1675,11 +1709,12 @@ def VectorExtractions {
- For elements 0-1, we shift left by 8 since they're on the right
- For elements 2-3, we need not shift
*/
- dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61));
+ dag LE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC);
+ dag LE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VWORD_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1704,11 +1739,12 @@ def VectorExtractions {
- For element 0, we shift left by 8 since it's on the right
- For element 1, we need not shift
*/
- dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60));
+ dag LE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
- dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC);
+ dag LE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VDWORD_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
@@ -1722,16 +1758,17 @@ def VectorExtractions {
- Shift the vector to line up the desired element to BE Word 0
- Convert 32-bit float to a 64-bit single precision float
*/
- dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61));
+ dag LE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (XOR8 (LI8 3), $Idx), 2, 61)));
dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC);
dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE);
/* LE variable double
Same as the LE doubleword except there is no move.
*/
- dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
- (COPY_TO_REGCLASS $S, VRRC),
- LE_VDWORD_PERM_VEC);
+ dag LE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ LE_VDWORD_PERM_VEC));
dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC);
/* BE variable byte
@@ -1741,8 +1778,8 @@ def VectorExtractions {
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-7
*/
- dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8));
- dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC);
+ dag BE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDIo8 $Idx, 8)));
+ dag BE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, BE_VBYTE_PERM_VEC));
dag BE_MV_VBYTE = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)),
@@ -1759,8 +1796,9 @@ def VectorExtractions {
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-3
*/
- dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62));
- dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC);
+ dag BE_VHALF_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDIo8 $Idx, 4), 1, 62)));
+ dag BE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, BE_VHALF_PERM_VEC));
dag BE_MV_VHALF = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)),
@@ -1776,8 +1814,9 @@ def VectorExtractions {
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-1
*/
- dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61));
- dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC);
+ dag BE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDIo8 $Idx, 2), 2, 61)));
+ dag BE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VWORD_PERM_VEC));
dag BE_MV_VWORD = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)),
@@ -1791,8 +1830,9 @@ def VectorExtractions {
Same as the LE doubleword except we shift in the VMX register for opposite
element indices.
*/
- dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60));
- dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC);
+ dag BE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
+ (RLDICR (ANDIo8 $Idx, 1), 3, 60)));
+ dag BE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VDWORD_PERM_VEC));
dag BE_VARIABLE_DWORD =
(MFVSRD (EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)),
@@ -1802,16 +1842,16 @@ def VectorExtractions {
- Shift the vector to line up the desired element to BE Word 0
- Convert 32-bit float to a 64-bit single precision float
*/
- dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61));
+ dag BE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR $Idx, 2, 61)));
dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC);
dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE);
/* BE variable double
Same as the BE doubleword except there is no move.
*/
- dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
- (COPY_TO_REGCLASS $S, VRRC),
- BE_VDWORD_PERM_VEC);
+ dag BE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
+ BE_VDWORD_PERM_VEC));
dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
}
@@ -2282,7 +2322,7 @@ let Predicates = [HasDirectMove, HasVSX] in {
// (convert to 32-bit fp single, shift right 1 word, move to GPR)
def : Pat<(i32 (bitconvert f32:$S)),
(i32 (MFVSRWZ (EXTRACT_SUBREG
- (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3),
+ (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3),
sub_64)))>;
// bitconvert i32 -> f32
// (move to FPR, shift left 1 word, convert to 64-bit fp single)
@@ -2333,6 +2373,17 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
: X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
!strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+ // [PO VRT XO VRB XO /]
+ class X_VT5_XO5_VB5_VSFR<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vfrc:$vT), (ins vrrc:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+ // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
+ class X_VT5_XO5_VB5_VSFR_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_XO5_VB5_VSFR<opcode, xo2, xo, opc, pattern>, isDOT;
+
let UseVSXReg = 1 in {
// [PO T XO B XO BX /]
class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
@@ -2365,43 +2416,112 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
list<dag> pattern>
: X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isDOT;
+ // [PO VRT VRA VRB XO /]
+ class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vTi, vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">;
+
+ // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
+ class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_VA5_VB5_FMA<opcode, xo, opc, pattern>, isDOT;
+
//===--------------------------------------------------------------------===//
// Quad-Precision Scalar Move Instructions:
// Copy Sign
- def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp", []>;
+ def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp",
+ [(set f128:$vT,
+ (fcopysign f128:$vB, f128:$vA))]>;
// Absolute/Negative-Absolute/Negate
- def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp" , []>;
- def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp", []>;
- def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp" , []>;
+ def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp",
+ [(set f128:$vT, (fabs f128:$vB))]>;
+ def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp",
+ [(set f128:$vT, (fneg (fabs f128:$vB)))]>;
+ def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp",
+ [(set f128:$vT, (fneg f128:$vB))]>;
//===--------------------------------------------------------------------===//
// Quad-Precision Scalar Floating-Point Arithmetic Instructions:
// Add/Divide/Multiply/Subtract
- def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp" , []>;
- def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo", []>;
- def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp" , []>;
- def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo", []>;
- def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp" , []>;
- def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo", []>;
- def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" , []>;
- def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo", []>;
+ let isCommutable = 1 in {
+ def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp",
+ [(set f128:$vT, (fadd f128:$vA, f128:$vB))]>;
+ def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo",
+ [(set f128:$vT,
+ (int_ppc_addf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp",
+ [(set f128:$vT, (fmul f128:$vA, f128:$vB))]>;
+ def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo",
+ [(set f128:$vT,
+ (int_ppc_mulf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ }
+
+ def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" ,
+ [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>;
+ def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo",
+ [(set f128:$vT,
+ (int_ppc_subf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp",
+ [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>;
+ def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo",
+ [(set f128:$vT,
+ (int_ppc_divf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
// Square-Root
- def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp" , []>;
- def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo", []>;
+ def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp",
+ [(set f128:$vT, (fsqrt f128:$vB))]>;
+ def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo",
+ [(set f128:$vT,
+ (int_ppc_sqrtf128_round_to_odd f128:$vB))]>;
// (Negative) Multiply-{Add/Subtract}
- def XSMADDQP : X_VT5_VA5_VB5 <63, 388, "xsmaddqp" , []>;
- def XSMADDQPO : X_VT5_VA5_VB5_Ro<63, 388, "xsmaddqpo" , []>;
- def XSMSUBQP : X_VT5_VA5_VB5 <63, 420, "xsmsubqp" , []>;
- def XSMSUBQPO : X_VT5_VA5_VB5_Ro<63, 420, "xsmsubqpo" , []>;
- def XSNMADDQP : X_VT5_VA5_VB5 <63, 452, "xsnmaddqp" , []>;
- def XSNMADDQPO: X_VT5_VA5_VB5_Ro<63, 452, "xsnmaddqpo", []>;
- def XSNMSUBQP : X_VT5_VA5_VB5 <63, 484, "xsnmsubqp" , []>;
- def XSNMSUBQPO: X_VT5_VA5_VB5_Ro<63, 484, "xsnmsubqpo", []>;
+ def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp",
+ [(set f128:$vT,
+ (fma f128:$vA, f128:$vB,
+ f128:$vTi))]>;
+
+ def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo",
+ [(set f128:$vT,
+ (int_ppc_fmaf128_round_to_odd
+ f128:$vA,f128:$vB,f128:$vTi))]>;
+
+ def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" ,
+ [(set f128:$vT,
+ (fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi)))]>;
+ def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo" ,
+ [(set f128:$vT,
+ (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, (fneg f128:$vTi)))]>;
+ def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp",
+ [(set f128:$vT,
+ (fneg (fma f128:$vA, f128:$vB,
+ f128:$vTi)))]>;
+ def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo",
+ [(set f128:$vT,
+ (fneg (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, f128:$vTi)))]>;
+ def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp",
+ [(set f128:$vT,
+ (fneg (fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi))))]>;
+ def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo",
+ [(set f128:$vT,
+ (fneg (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, (fneg f128:$vTi))))]>;
+
+ // Additional fnmsub patterns: -a*c + b == -(a*c - b)
+ def : Pat<(fma (fneg f128:$A), f128:$C, f128:$B), (XSNMSUBQP $B, $C, $A)>;
+ def : Pat<(fma f128:$A, (fneg f128:$C), f128:$B), (XSNMSUBQP $B, $C, $A)>;
//===--------------------------------------------------------------------===//
// Quad/Double-Precision Compare Instructions:
@@ -2434,37 +2554,20 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
IIC_FPCompare, []>;
def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
IIC_FPCompare, []>;
- def XSCMPNEDP : XX3_XT5_XA5_XB5<60, 27, "xscmpnedp", vsrc, vsfrc, vsfrc,
- IIC_FPCompare, []>;
- let UseVSXReg = 1 in {
- // Vector Compare Not Equal
- def XVCMPNEDP : XX3Form_Rc<60, 123,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnedp $XT, $XA, $XB", IIC_VecFPCompare, []>;
- let Defs = [CR6] in
- def XVCMPNEDPo : XX3Form_Rc<60, 123,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnedp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
- isDOT;
- def XVCMPNESP : XX3Form_Rc<60, 91,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnesp $XT, $XA, $XB", IIC_VecFPCompare, []>;
- let Defs = [CR6] in
- def XVCMPNESPo : XX3Form_Rc<60, 91,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvcmpnesp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
- isDOT;
- } // UseVSXReg = 1
//===--------------------------------------------------------------------===//
// Quad-Precision Floating-Point Conversion Instructions:
// Convert DP -> QP
- def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc, []>;
+ def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc,
+ [(set f128:$vT, (fpextend f64:$vB))]>;
// Round & Convert QP -> DP (dword[1] is set to zero)
- def XSCVQPDP : X_VT5_XO5_VB5 <63, 20, 836, "xscvqpdp" , []>;
- def XSCVQPDPO : X_VT5_XO5_VB5_Ro<63, 20, 836, "xscvqpdpo", []>;
+ def XSCVQPDP : X_VT5_XO5_VB5_VSFR<63, 20, 836, "xscvqpdp" , []>;
+ def XSCVQPDPO : X_VT5_XO5_VB5_VSFR_Ro<63, 20, 836, "xscvqpdpo",
+ [(set f64:$vT,
+ (int_ppc_truncf128_round_to_odd
+ f128:$vB))]>;
// Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
@@ -2472,9 +2575,30 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>;
- // Convert (Un)Signed DWord -> QP
+ // Convert (Un)Signed DWord -> QP.
def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
+ def : Pat<(f128 (sint_to_fp i64:$src)),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVSDQP $src))>;
+ def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVSDQP (VEXTSW2Ds $src)))>;
+
def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vfrc, []>;
+ def : Pat<(f128 (uint_to_fp i64:$src)),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP $src))>;
+
+ // Convert (Un)Signed Word -> QP.
+ def : Pat<(f128 (sint_to_fp i32:$src)),
+ (f128 (XSCVSDQP (MTVSRWA $src)))>;
+ def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))),
+ (f128 (XSCVSDQP (LIWAX xoaddr:$src)))>;
+ def : Pat<(f128 (uint_to_fp i32:$src)),
+ (f128 (XSCVUDQP (MTVSRWZ $src)))>;
+ def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))),
+ (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
let UseVSXReg = 1 in {
//===--------------------------------------------------------------------===//
@@ -2503,7 +2627,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc,
list<dag> pattern>
- : Z23Form_1<opcode, xo,
+ : Z23Form_8<opcode, xo,
(outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
!strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
let RC = ex;
@@ -2513,6 +2637,20 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
+ // Use current rounding mode
+ def : Pat<(f128 (fnearbyint f128:$vB)), (f128 (XSRQPI 0, $vB, 3))>;
+ // Round to nearest, ties away from zero
+ def : Pat<(f128 (fround f128:$vB)), (f128 (XSRQPI 0, $vB, 0))>;
+ // Round towards Zero
+ def : Pat<(f128 (ftrunc f128:$vB)), (f128 (XSRQPI 1, $vB, 1))>;
+ // Round towards +Inf
+ def : Pat<(f128 (fceil f128:$vB)), (f128 (XSRQPI 1, $vB, 2))>;
+ // Round towards -Inf
+ def : Pat<(f128 (ffloor f128:$vB)), (f128 (XSRQPI 1, $vB, 3))>;
+
+ // Use current rounding mode, [with Inexact]
+ def : Pat<(f128 (frint f128:$vB)), (f128 (XSRQPIX 0, $vB, 3))>;
+
// Round Quad-Precision to Double-Extended Precision (fp80)
def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
@@ -2670,7 +2808,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// "out" and "in" dag
class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
- : XX1Form<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
+ : XX1Form_memOp<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
!strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>, UseVSXReg;
// Load as Integer Byte/Halfword & Zero Indexed
@@ -2687,11 +2825,11 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
[(set v2f64:$XT, (load xaddr:$src))]>;
// Load Vector (Left-justified) with Length
- def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvl $XT, $src, $rB", IIC_LdStLoad,
[(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>,
UseVSXReg;
- def LXVLL : XX1Form<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvll $XT, $src, $rB", IIC_LdStLoad,
[(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>,
UseVSXReg;
@@ -2716,7 +2854,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// [PO S RA RB XO SX]
class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
- : XX1Form<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
+ : XX1Form_memOp<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
!strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>, UseVSXReg;
// Store as Integer Byte/Halfword Indexed
@@ -2738,51 +2876,55 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
[(store v2f64:$XT, xaddr:$dst)]>;
// Store Vector (Left-justified) with Length
- def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
- "stxvl $XT, $dst, $rB", IIC_LdStLoad,
- [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst, i64:$rB)]>,
- UseVSXReg;
- def STXVLL : XX1Form<31, 429, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
- "stxvll $XT, $dst, $rB", IIC_LdStLoad,
- [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst, i64:$rB)]>,
- UseVSXReg;
+ def STXVL : XX1Form_memOp<31, 397, (outs),
+ (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvl $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst,
+ i64:$rB)]>,
+ UseVSXReg;
+ def STXVLL : XX1Form_memOp<31, 429, (outs),
+ (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvll $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst,
+ i64:$rB)]>,
+ UseVSXReg;
} // mayStore
let Predicates = [IsLittleEndian] in {
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
}
let Predicates = [IsBigEndian] in {
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
- def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))),
+ def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
}
@@ -2795,21 +2937,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Patterns for which instructions from ISA 3.0 are a better match
let Predicates = [IsLittleEndian, HasP9Vector] in {
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
@@ -2830,21 +2972,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
} // IsLittleEndian, HasP9Vector
let Predicates = [IsBigEndian, HasP9Vector] in {
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
@@ -2869,12 +3011,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(f128 (quadwOffsetLoad iqaddr:$src)),
+ (COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore f128:$rS, iqaddr:$dst),
+ (STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst),
(STXV $rS, memrix16:$dst)>;
@@ -2888,6 +3034,10 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(f128 (nonQuadwOffsetLoad xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXVX xoaddr:$src), VRRC)>;
+ def : Pat<(nonQuadwOffsetStore f128:$rS, xoaddr:$dst),
+ (STXVX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst),
@@ -2904,7 +3054,8 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(v4i32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
(v4f32 (LXVWSX xoaddr:$src))>;
- def : Pat<(v4f32 (scalar_to_vector (f32 (fpround (extloadf32 xoaddr:$src))))),
+ def : Pat<(v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extloadf32 xoaddr:$src)))))),
(v4f32 (LXVWSX xoaddr:$src))>;
// Build vectors from i8 loads
@@ -2936,109 +3087,109 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
let Predicates = [IsBigEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
(STXSIBXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
(STXSIHXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
} // IsBigEndian, HasP9Vector
let Predicates = [IsLittleEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
(STXSIBXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+ (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
(STXSIHXv $S, xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+ (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
} // IsLittleEndian, HasP9Vector
@@ -3064,21 +3215,264 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
}
def : Pat<(f64 (extloadf32 ixaddr:$src)),
(COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>;
- def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))),
+ def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
(f32 (DFLOADf32 ixaddr:$src))>;
+
+ let Predicates = [IsBigEndian, HasP9Vector] in {
+
+ // (Un)Signed DWord vector extract -> QP
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+
+ // (Un)Signed Word vector extract -> QP
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 1)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
+ foreach Idx = [0,2,3] in {
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSW2D (VSPLTW Idx, $src)), sub_64)))>;
+ }
+ foreach Idx = 0-3 in {
+ def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
+ (f128 (XSCVUDQP (XXEXTRACTUW $src, !shl(Idx, 2))))>;
+ }
+
+ // (Un)Signed HWord vector extract -> QP
+ foreach Idx = 0-7 in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v8i16:$src, Idx), i16)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)),
+ sub_64)))>;
+ // The SDAG adds the `and` since an `i16` is being extracted as an `i32`.
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v8i16:$src, Idx)), 65535))),
+ (f128 (XSCVUDQP (EXTRACT_SUBREG
+ (VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
+ }
+
+ // (Un)Signed Byte vector extract -> QP
+ foreach Idx = 0-15 in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg (vector_extract v16i8:$src, Idx),
+ i8)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSB2D (VEXTRACTUB Idx, $src)), sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v16i8:$src, Idx)), 255))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (VEXTRACTUB Idx, $src), sub_64)))>;
+ }
+
+ // Unsigned int in VSX register -> QP
+ def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP
+ (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>;
+ } // IsBigEndian, HasP9Vector
+
+ let Predicates = [IsLittleEndian, HasP9Vector] in {
+
+ // (Un)Signed DWord vector extract -> QP
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+
+ // (Un)Signed Word vector extract -> QP
+ foreach Idx = [[0,3],[1,2],[3,0]] in {
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSW2D (VSPLTW !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ }
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 2)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
+
+ foreach Idx = [[0,12],[1,8],[2,4],[3,0]] in {
+ def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
+ (f128 (XSCVUDQP (XXEXTRACTUW $src, !head(!tail(Idx)))))>;
+ }
+
+ // (Un)Signed HWord vector extract -> QP
+ // The nested foreach lists identify the vector element and the corresponding
+ // register byte location.
+ foreach Idx = [[0,14],[1,12],[2,10],[3,8],[4,6],[5,4],[6,2],[7,0]] in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v8i16:$src, !head(Idx)), i16)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (VEXTSH2D
+ (VEXTRACTUH !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v8i16:$src, !head(Idx))),
+ 65535))),
+ (f128 (XSCVUDQP (EXTRACT_SUBREG
+ (VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
+ }
+
+ // (Un)Signed Byte vector extract -> QP
+ foreach Idx = [[0,15],[1,14],[2,13],[3,12],[4,11],[5,10],[6,9],[7,8],[8,7],
+ [9,6],[10,5],[11,4],[12,3],[13,2],[14,1],[15,0]] in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v16i8:$src, !head(Idx)), i8)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG
+ (VEXTSB2D (VEXTRACTUB !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v16i8:$src, !head(Idx))),
+ 255))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG
+ (VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
+ }
+
+ // Unsigned int in VSX register -> QP
+ def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP
+ (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 8)))>;
+ } // IsLittleEndian, HasP9Vector
+
+ // Convert (Un)Signed DWord in memory -> QP
+ def : Pat<(f128 (sint_to_fp (i64 (load xaddr:$src)))),
+ (f128 (XSCVSDQP (LXSDX xaddr:$src)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (load ixaddr:$src)))),
+ (f128 (XSCVSDQP (LXSD ixaddr:$src)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (load xaddr:$src)))),
+ (f128 (XSCVUDQP (LXSDX xaddr:$src)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (load ixaddr:$src)))),
+ (f128 (XSCVUDQP (LXSD ixaddr:$src)))>;
+
+ // Convert Unsigned HWord in memory -> QP
+ def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)),
+ (f128 (XSCVUDQP (LXSIHZX xaddr:$src)))>;
+
+ // Convert Unsigned Byte in memory -> QP
+ def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)),
+ (f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
+
+ // Truncate & Convert QP -> (Un)Signed (D)Word.
+ def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
+ def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
+ def : Pat<(i32 (fp_to_sint f128:$src)),
+ (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>;
+ def : Pat<(i32 (fp_to_uint f128:$src)),
+ (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>;
+
+ // Instructions for store(fptosi).
+ // The 8-byte version is repeated here due to availability of D-Form STXSD.
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddr:$dst, 8),
+ (STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
+ xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ixaddr:$dst, 8),
+ (STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
+ ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddr:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ixaddr:$dst, 8),
+ (STXSD (XSCVDPSXDS f64:$src), ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2),
+ (STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 1),
+ (STXSIBX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+
+ // Instructions for store(fptoui).
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddr:$dst, 8),
+ (STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
+ xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ixaddr:$dst, 8),
+ (STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
+ ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddr:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), xaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ixaddr:$dst, 8),
+ (STXSD (XSCVDPUXDS f64:$src), ixaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2),
+ (STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 1),
+ (STXSIBX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+
+ // Round & Convert QP -> DP/SP
+ def : Pat<(f64 (fpround f128:$src)), (f64 (XSCVQPDP $src))>;
+ def : Pat<(f32 (fpround f128:$src)), (f32 (XSRSP (XSCVQPDPO $src)))>;
+
+ // Convert SP -> QP
+ def : Pat<(f128 (fpextend f32:$src)),
+ (f128 (XSCVDPQP (COPY_TO_REGCLASS $src, VFRC)))>;
+
} // end HasP9Vector, AddedComplexity
+let AddedComplexity = 400 in {
+ let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsBigEndian] in {
+ def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)),
+ (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
+ }
+ let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsLittleEndian] in {
+ def : Pat<(f128 (PPCbuild_fp128 i64:$rA, i64:$rB)),
+ (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
+ }
+}
+
let Predicates = [HasP9Vector] in {
let isPseudo = 1 in {
let mayStore = 1 in {
- def SPILLTOVSR_STX : Pseudo<(outs), (ins spilltovsrrc:$XT, memrr:$dst),
- "#SPILLTOVSR_STX", []>;
+ def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
+ (ins spilltovsrrc:$XT, memrr:$dst),
+ "#SPILLTOVSR_STX", []>;
def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
"#SPILLTOVSR_ST", []>;
}
let mayLoad = 1 in {
- def SPILLTOVSR_LDX : Pseudo<(outs spilltovsrrc:$XT), (ins memrr:$src),
- "#SPILLTOVSR_LDX", []>;
+ def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT),
+ (ins memrr:$src),
+ "#SPILLTOVSR_LDX", []>;
def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
"#SPILLTOVSR_LD", []>;
@@ -3170,10 +3564,10 @@ def FltToULongLoadP9 {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A)))));
}
def FltToLong {
- dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A))));
+ dag A = (i64 (PPCmfvsr (f64 (PPCfctidz (fpextend f32:$A)))));
}
def FltToULong {
- dag A = (i64 (PPCmfvsr (PPCfctiduz (fpextend f32:$A))));
+ dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz (fpextend f32:$A)))));
}
def DblToInt {
dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A))));
@@ -3219,7 +3613,6 @@ def MrgFP {
}
// Patterns for BUILD_VECTOR nodes.
-def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
let AddedComplexity = 400 in {
let Predicates = [HasVSX] in {
@@ -3389,8 +3782,10 @@ let AddedComplexity = 400 in {
def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
(v2i64 (MTVSRDD $rB, $rA))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC),
- (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC))>;
+ (VMRGOW
+ (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC)),
+ (v4i32
+ (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC)))>;
}
let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
@@ -3400,8 +3795,10 @@ let AddedComplexity = 400 in {
def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
(v2i64 (MTVSRDD $rB, $rA))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC),
- (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>;
+ (VMRGOW
+ (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC)),
+ (v4i32
+ (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC)))>;
}
// P9 Altivec instructions that can be used to build vectors.
// Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete
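For reference, a minimal C++ sketch of the source-level operations the new f128 patterns above are meant to cover; it assumes a powerpc64le target built with -mcpu=power9 and a compiler that maps __float128 to the f128 type (the function names are illustrative only, not taken from this patch):

    // Conversions that the new Power9 quad-precision patterns select single
    // instructions for (instruction names per the patterns above).
    #include <cstdint>

    __float128 from_i64(int64_t x)  { return (__float128)x; }  // sint_to_fp -> xscvsdqp
    __float128 from_u32(uint32_t x) { return (__float128)x; }  // uint_to_fp -> xscvudqp
    double     narrow(__float128 x) { return (double)x; }      // fpround    -> xscvqpdp
    __float128 widen(float x)       { return (__float128)x; }  // fpextend   -> xscvdpqp
    int64_t    to_i64(__float128 x) { return (int64_t)x; }     // fp_to_sint -> xscvqpsdz + mfvrd
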
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index cdf544bdfac3..2217fa4693ce 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
@@ -47,8 +48,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
#include <iterator>
@@ -246,15 +247,14 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!L->empty())
return MadeChange;
- DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
BasicBlock *Header = L->getHeader();
const PPCSubtarget *ST =
TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr;
- unsigned HeaderLoopPredCount =
- std::distance(pred_begin(Header), pred_end(Header));
+ unsigned HeaderLoopPredCount = pred_size(Header);
// Collect buckets of comparable addresses used by loads and stores.
SmallVector<Bucket, 16> Buckets;
@@ -294,6 +294,19 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) {
if (LARSCEV->getLoop() != L)
continue;
+ // See getPreIndexedAddressParts: the displacement for LDU/STDU must be a
+ // multiple of 4 (DS-form). For i64 loads/stores, when the displacement
+ // fits in a 16-bit signed field but is not a multiple of 4, this pre-inc
+ // prep is useless and may break an otherwise well-formed addressing mode.
+ if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) {
+ if (const SCEVConstant *StepConst =
+ dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) {
+ const APInt &ConstInt = StepConst->getValue()->getValue();
+ if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0)
+ continue;
+ }
+ }
} else {
continue;
}
@@ -332,7 +345,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!LoopPredecessor)
return MadeChange;
- DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n");
+ LLVM_DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n");
SmallSet<BasicBlock *, 16> BBChanged;
for (unsigned i = 0, e = Buckets.size(); i != e; ++i) {
@@ -381,7 +394,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!BasePtrSCEV->isAffine())
continue;
- DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
assert(BasePtrSCEV->getLoop() == L &&
"AddRec for the wrong loop?");
@@ -407,7 +420,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (!isSafeToExpand(BasePtrStartSCEV, *SE))
continue;
- DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV))
continue;
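As a rough standalone illustration (simplified types and an invented helper name, not the pass's actual interface), the displacement filter added to PPCLoopPreIncPrep above boils down to the following check on the constant stride of a 64-bit load/store:

    #include <cstdint>

    // LDU/STDU use a DS-form displacement, which must be a multiple of 4.
    // A stride that fits in a signed 16-bit field but is not 4-aligned is
    // therefore not worth preparing for pre-increment addressing.
    bool worthPreIncPrepFor64BitAccess(int64_t Stride) {
      bool FitsInSigned16 = Stride >= INT16_MIN && Stride <= INT16_MAX;
      if (FitsInSigned16 && (Stride % 4) != 0)
        return false;   // e.g. a stride of 6 is skipped, 8 is accepted
      return true;
    }
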
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index 1e40711328ec..62a612feb55c 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -21,13 +21,13 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
@@ -107,10 +107,20 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
break;
}
- if (MO.getTargetFlags() == PPCII::MO_PLT)
+ if (MO.getTargetFlags() == PPCII::MO_PLT)
RefKind = MCSymbolRefExpr::VK_PLT;
+ const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const PPCSubtarget *Subtarget = &(MF->getSubtarget<PPCSubtarget>());
+ const TargetMachine &TM = Printer.TM;
const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
+ // The -msecure-plt option works only in PIC mode. If secure PLT mode
+ // is on, add 32768 to the symbol.
+ if (Subtarget->isSecurePlt() && TM.isPositionIndependent() &&
+ MO.getTargetFlags() == PPCII::MO_PLT)
+ Expr = MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(32768, Ctx),
+ Ctx);
if (!MO.isJTI() && MO.getOffset())
Expr = MCBinaryExpr::createAdd(Expr,
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index a4c7a030389b..dbe1fe37ddf8 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -119,8 +119,8 @@ void PPCMIPeephole::initialize(MachineFunction &MFParm) {
MRI = &MF->getRegInfo();
MDT = &getAnalysis<MachineDominatorTree>();
TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
- DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
- DEBUG(MF->dump());
+ LLVM_DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
+ LLVM_DEBUG(MF->dump());
}
static MachineInstr *getVRegDefOrNull(MachineOperand *Op,
@@ -190,18 +190,18 @@ getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) {
}
// This function maintains a map for the pairs <TOC Save Instr, Keep>
-// Each time a new TOC save is encountered, it checks if any of the exisiting
-// ones are dominated by the new one. If so, it marks the exisiting one as
+// Each time a new TOC save is encountered, it checks if any of the existing
+// ones are dominated by the new one. If so, it marks the existing one as
// redundant by setting its entry in the map to false. It then adds the new
// instruction to the map with either true or false depending on whether any
-// exisiting instructions dominated the new one.
+// existing instructions dominated the new one.
void PPCMIPeephole::UpdateTOCSaves(
std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI) {
assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here");
bool Keep = true;
for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) {
MachineInstr *CurrInst = It->first;
- // If new instruction dominates an exisiting one, mark exisiting one as
+ // If new instruction dominates an existing one, mark existing one as
// redundant.
if (It->second && MDT->dominates(MI, CurrInst))
It->second = false;
@@ -220,7 +220,7 @@ bool PPCMIPeephole::simplifyCode(void) {
bool Simplified = false;
MachineInstr* ToErase = nullptr;
std::map<MachineInstr *, bool> TOCSaves;
-
+ const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
NumFunctionsEnteredInMIPeephole++;
if (ConvertRegReg) {
// Fixed-point conversion of reg/reg instructions fed by load-immediate
@@ -232,14 +232,14 @@ bool PPCMIPeephole::simplifyCode(void) {
SomethingChanged = false;
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
if (TII->convertToImmediateForm(MI)) {
// We don't erase anything in case the def has other uses. Let DCE
// remove it if it can be removed.
- DEBUG(dbgs() << "Converted instruction to imm form: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Converted instruction to imm form: ");
+ LLVM_DEBUG(MI.dump());
NumConvertedToImmediateForm++;
SomethingChanged = true;
Simplified = true;
@@ -261,7 +261,7 @@ bool PPCMIPeephole::simplifyCode(void) {
}
// Ignore debug instructions.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
// Per-opcode peepholes.
@@ -276,7 +276,7 @@ bool PPCMIPeephole::simplifyCode(void) {
!MF->getSubtarget<PPCSubtarget>().isELFv2ABI())
break;
// When encountering a TOC save instruction, call UpdateTOCSaves
- // to add it to the TOCSaves map and mark any exisiting TOC saves
+ // to add it to the TOCSaves map and mark any existing TOC saves
// it dominates as redundant.
if (TII->isTOCSaveMI(MI))
UpdateTOCSaves(TOCSaves, &MI);
@@ -297,9 +297,9 @@ bool PPCMIPeephole::simplifyCode(void) {
// We have to look through chains of COPY and SUBREG_TO_REG
// to find the real source values for comparison.
unsigned TrueReg1 =
- TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
unsigned TrueReg2 =
- TII->lookThruCopyLike(MI.getOperand(2).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI);
if (TrueReg1 == TrueReg2
&& TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
@@ -314,7 +314,7 @@ bool PPCMIPeephole::simplifyCode(void) {
if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS)
return false;
unsigned DefReg =
- TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
MachineInstr *LoadMI = MRI->getVRegDef(DefReg);
if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX)
@@ -324,10 +324,9 @@ bool PPCMIPeephole::simplifyCode(void) {
};
if (DefMI && (Immed == 0 || Immed == 3)) {
if (DefOpc == PPC::LXVDSX || isConversionOfLoadAndSplat()) {
- DEBUG(dbgs()
- << "Optimizing load-and-splat/splat "
- "to load-and-splat/copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing load-and-splat/splat "
+ "to load-and-splat/copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(MI.getOperand(1));
@@ -341,15 +340,14 @@ bool PPCMIPeephole::simplifyCode(void) {
if (DefOpc == PPC::XXPERMDI) {
unsigned FeedImmed = DefMI->getOperand(3).getImm();
unsigned FeedReg1 =
- TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
unsigned FeedReg2 =
- TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
- DEBUG(dbgs()
- << "Optimizing splat/swap or splat/splat "
- "to splat/copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing splat/swap or splat/splat "
+ "to splat/copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(MI.getOperand(1));
@@ -362,8 +360,8 @@ bool PPCMIPeephole::simplifyCode(void) {
// parameter.
else if ((Immed == 0 || Immed == 3)
&& FeedImmed == 2 && FeedReg1 == FeedReg2) {
- DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
+ LLVM_DEBUG(MI.dump());
MI.getOperand(1).setReg(DefMI->getOperand(1).getReg());
MI.getOperand(2).setReg(DefMI->getOperand(2).getReg());
MI.getOperand(3).setImm(3 - Immed);
@@ -373,8 +371,8 @@ bool PPCMIPeephole::simplifyCode(void) {
// If this is a swap fed by a swap, we can replace it
// with a copy from the first swap's input.
else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
- DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(DefMI->getOperand(1));
@@ -389,8 +387,8 @@ bool PPCMIPeephole::simplifyCode(void) {
DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
ToErase = &MI;
Simplified = true;
- DEBUG(dbgs() << "Removing redundant splat: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Removing redundant splat: ");
+ LLVM_DEBUG(MI.dump());
}
}
}
@@ -402,7 +400,7 @@ bool PPCMIPeephole::simplifyCode(void) {
unsigned MyOpcode = MI.getOpcode();
unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
unsigned TrueReg =
- TII->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -429,8 +427,8 @@ bool PPCMIPeephole::simplifyCode(void) {
// If the instruction[s] that feed this splat have already splat
// the value, this splat is redundant.
if (AlreadySplat) {
- DEBUG(dbgs() << "Changing redundant splat to a copy: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Changing redundant splat to a copy: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(MI.getOperand(OpNo));
@@ -448,14 +446,14 @@ bool PPCMIPeephole::simplifyCode(void) {
if (ShiftOp1 == ShiftOp2) {
unsigned NewElem = (SplatImm + ShiftImm) & 0x3;
if (MRI->hasOneNonDBGUse(ShiftRes)) {
- DEBUG(dbgs() << "Removing redundant shift: ");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << "Removing redundant shift: ");
+ LLVM_DEBUG(DefMI->dump());
ToErase = DefMI;
}
Simplified = true;
- DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
- " to " << NewElem << " in instruction: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Changing splat immediate from " << SplatImm
+ << " to " << NewElem << " in instruction: ");
+ LLVM_DEBUG(MI.dump());
MI.getOperand(1).setReg(ShiftOp1);
MI.getOperand(2).setImm(NewElem);
}
@@ -465,7 +463,7 @@ bool PPCMIPeephole::simplifyCode(void) {
case PPC::XVCVDPSP: {
// If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
unsigned TrueReg =
- TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -474,9 +472,9 @@ bool PPCMIPeephole::simplifyCode(void) {
// values.
if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
unsigned DefsReg1 =
- TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
unsigned DefsReg2 =
- TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
+ TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) ||
!TargetRegisterInfo::isVirtualRegister(DefsReg2))
break;
@@ -499,12 +497,12 @@ bool PPCMIPeephole::simplifyCode(void) {
if (Use.getOperand(i).isReg() &&
Use.getOperand(i).getReg() == FRSPDefines)
Use.getOperand(i).setReg(ConvReg1);
- DEBUG(dbgs() << "Removing redundant FRSP:\n");
- DEBUG(RoundInstr->dump());
- DEBUG(dbgs() << "As it feeds instruction:\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "Through instruction:\n");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << "Removing redundant FRSP:\n");
+ LLVM_DEBUG(RoundInstr->dump());
+ LLVM_DEBUG(dbgs() << "As it feeds instruction:\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Through instruction:\n");
+ LLVM_DEBUG(DefMI->dump());
RoundInstr->eraseFromParent();
}
};
@@ -552,11 +550,11 @@ bool PPCMIPeephole::simplifyCode(void) {
};
unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()),
isXForm(SrcMI->getOpcode()));
- DEBUG(dbgs() << "Zero-extending load\n");
- DEBUG(SrcMI->dump());
- DEBUG(dbgs() << "and sign-extension\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "are merged into sign-extending load\n");
+ LLVM_DEBUG(dbgs() << "Zero-extending load\n");
+ LLVM_DEBUG(SrcMI->dump());
+ LLVM_DEBUG(dbgs() << "and sign-extension\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "are merged into sign-extending load\n");
SrcMI->setDesc(TII->get(Opc));
SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg());
ToErase = &MI;
@@ -596,11 +594,11 @@ bool PPCMIPeephole::simplifyCode(void) {
};
unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()),
isXForm(SrcMI->getOpcode()));
- DEBUG(dbgs() << "Zero-extending load\n");
- DEBUG(SrcMI->dump());
- DEBUG(dbgs() << "and sign-extension\n");
- DEBUG(MI.dump());
- DEBUG(dbgs() << "are merged into sign-extending load\n");
+ LLVM_DEBUG(dbgs() << "Zero-extending load\n");
+ LLVM_DEBUG(SrcMI->dump());
+ LLVM_DEBUG(dbgs() << "and sign-extension\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "are merged into sign-extending load\n");
SrcMI->setDesc(TII->get(Opc));
SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg());
ToErase = &MI;
@@ -610,7 +608,7 @@ bool PPCMIPeephole::simplifyCode(void) {
TII->isSignExtended(*SrcMI)) {
// We can eliminate EXTSW if the input is known to be already
// sign-extended.
- DEBUG(dbgs() << "Removing redundant sign-extension\n");
+ LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
unsigned TmpReg =
MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF),
@@ -661,7 +659,7 @@ bool PPCMIPeephole::simplifyCode(void) {
unsigned KnownZeroCount = getKnownLeadingZeroCount(SrcMI, TII);
if (MI.getOperand(3).getImm() <= KnownZeroCount) {
- DEBUG(dbgs() << "Removing redundant zero-extension\n");
+ LLVM_DEBUG(dbgs() << "Removing redundant zero-extension\n");
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.addReg(SrcReg);
@@ -727,8 +725,8 @@ bool PPCMIPeephole::simplifyCode(void) {
MachineInstr *DefPhiMI = getVRegDefOrNull(&Op1, MRI);
for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) {
MachineInstr *LiMI = getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI);
- DEBUG(dbgs() << "Optimizing LI to ADDI: ");
- DEBUG(LiMI->dump());
+ LLVM_DEBUG(dbgs() << "Optimizing LI to ADDI: ");
+ LLVM_DEBUG(LiMI->dump());
// There could be repeated registers in the PHI, e.g: %1 =
// PHI %6, <%bb.2>, %8, <%bb.3>, %8, <%bb.6>; So if we've
@@ -746,12 +744,12 @@ bool PPCMIPeephole::simplifyCode(void) {
MachineInstrBuilder(*LiMI->getParent()->getParent(), *LiMI)
.addReg(DominatorReg)
.addImm(LiImm); // restore the imm of LI
- DEBUG(LiMI->dump());
+ LLVM_DEBUG(LiMI->dump());
}
// Replace ADD with COPY
- DEBUG(dbgs() << "Optimizing ADD to COPY: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Optimizing ADD to COPY: ");
+ LLVM_DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
.add(Op1);
@@ -849,7 +847,7 @@ static unsigned getPredicateToIncImm(MachineInstr *BI, MachineInstr *CMPI) {
return 0;
}
-// This takes a Phi node and returns a register value for the spefied BB.
+// This takes a Phi node and returns a register value for the specified BB.
static unsigned getIncomingRegForBlock(MachineInstr *Phi,
MachineBasicBlock *MBB) {
for (unsigned I = 2, E = Phi->getNumOperands() + 1; I != E; I += 2) {
@@ -979,9 +977,9 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB,
}
// This function will iterate over the input map containing a pair of TOC save
-// instruction and a flag. The flag will be set to false if the TOC save is proven
-// redundant. This function will erase from the basic block all the TOC saves
-// marked as redundant.
+// instruction and a flag. The flag will be set to false if the TOC save is
+// proven redundant. This function will erase from the basic block all the TOC
+// saves marked as redundant.
bool PPCMIPeephole::eliminateRedundantTOCSaves(
std::map<MachineInstr *, bool> &TOCSaves) {
bool Simplified = false;
@@ -1192,16 +1190,16 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
}
}
- // We cannnot merge two compares if the immediates are not same.
+ // We cannot merge two compares if the immediates are not same.
if (NewImm2 != NewImm1)
continue;
}
- DEBUG(dbgs() << "Optimize two pairs of compare and branch:\n");
- DEBUG(CMPI1->dump());
- DEBUG(BI1->dump());
- DEBUG(CMPI2->dump());
- DEBUG(BI2->dump());
+ LLVM_DEBUG(dbgs() << "Optimize two pairs of compare and branch:\n");
+ LLVM_DEBUG(CMPI1->dump());
+ LLVM_DEBUG(BI1->dump());
+ LLVM_DEBUG(CMPI2->dump());
+ LLVM_DEBUG(BI2->dump());
// We adjust opcode, predicates and immediate as we determined above.
if (NewOpCode != 0 && NewOpCode != CMPI1->getOpcode()) {
@@ -1260,15 +1258,15 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
BI2->getOperand(1).setIsKill(true);
BI1->getOperand(1).setIsKill(false);
- DEBUG(dbgs() << "into a compare and two branches:\n");
- DEBUG(CMPI1->dump());
- DEBUG(BI1->dump());
- DEBUG(BI2->dump());
+ LLVM_DEBUG(dbgs() << "into a compare and two branches:\n");
+ LLVM_DEBUG(CMPI1->dump());
+ LLVM_DEBUG(BI1->dump());
+ LLVM_DEBUG(BI2->dump());
if (IsPartiallyRedundant) {
- DEBUG(dbgs() << "The following compare is moved into "
- << printMBBReference(*MBBtoMoveCmp)
- << " to handle partial redundancy.\n");
- DEBUG(CMPI2->dump());
+ LLVM_DEBUG(dbgs() << "The following compare is moved into "
+ << printMBBReference(*MBBtoMoveCmp)
+ << " to handle partial redundancy.\n");
+ LLVM_DEBUG(CMPI2->dump());
}
Simplified = true;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
deleted file mode 100644
index 628ea2ab9fe6..000000000000
--- a/contrib/llvm/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
+++ /dev/null
@@ -1,198 +0,0 @@
-//==-- PPCMachineBasicBlockUtils.h - Functions for common MBB operations ---==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines utility functions for commonly used operations on
-// MachineBasicBlock's.
-// NOTE: Include this file after defining DEBUG_TYPE so that the debug messages
-// can be emitted for the pass that is using this.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
-#define LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
-
-#include "PPCInstrInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-#ifndef DEBUG_TYPE
-#define DEBUG_TYPE "ppc-generic-mbb-utilities"
-#endif
-
-using namespace llvm;
-
-/// Given a basic block \p Successor that potentially contains PHIs, this
-/// function will look for any incoming values in the PHIs that are supposed to
-/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
-/// Any such PHIs will be updated to reflect reality.
-static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
- MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
- for (auto &MI : Successor->instrs()) {
- if (!MI.isPHI())
- continue;
- // This is a really ugly-looking loop, but it was pillaged directly from
- // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
- for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.getMBB() == OrigMBB) {
- // Check if the instruction is actualy defined in NewMBB.
- if (MI.getOperand(i-1).isReg()) {
- MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i-1).getReg());
- if (DefMI->getParent() == NewMBB || !OrigMBB->isSuccessor(Successor)) {
- MO.setMBB(NewMBB);
- break;
- }
- }
- }
- }
- }
-}
-
-/// Given a basic block \p Successor that potentially contains PHIs, this
-/// function will look for PHIs that have an incoming value from \p OrigMBB
-/// and will add the same incoming value from \p NewMBB.
-/// NOTE: This should only be used if \p NewMBB is an immediate dominator of
-/// \p OrigMBB.
-static void addIncomingValuesToPHIs(MachineBasicBlock *Successor,
- MachineBasicBlock *OrigMBB,
- MachineBasicBlock *NewMBB,
- MachineRegisterInfo *MRI) {
- assert(OrigMBB->isSuccessor(NewMBB) && "NewMBB must be a sucessor of OrigMBB");
- for (auto &MI : Successor->instrs()) {
- if (!MI.isPHI())
- continue;
- // This is a really ugly-looking loop, but it was pillaged directly from
- // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
- for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.getMBB() == OrigMBB) {
- MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
- MIB.addReg(MI.getOperand(i-1).getReg()).addMBB(NewMBB);
- break;
- }
- }
- }
-}
-
-struct BlockSplitInfo {
- MachineInstr *OrigBranch;
- MachineInstr *SplitBefore;
- MachineInstr *SplitCond;
- bool InvertNewBranch;
- bool InvertOrigBranch;
- bool BranchToFallThrough;
- const MachineBranchProbabilityInfo *MBPI;
- MachineInstr *MIToDelete;
- MachineInstr *NewCond;
- bool allInstrsInSameMBB() {
- if (!OrigBranch || !SplitBefore || !SplitCond)
- return false;
- MachineBasicBlock *MBB = OrigBranch->getParent();
- if (SplitBefore->getParent() != MBB ||
- SplitCond->getParent() != MBB)
- return false;
- if (MIToDelete && MIToDelete->getParent() != MBB)
- return false;
- if (NewCond && NewCond->getParent() != MBB)
- return false;
- return true;
- }
-};
-
-/// Splits a MachineBasicBlock to branch before \p SplitBefore. The original
-/// branch is \p OrigBranch. The target of the new branch can either be the same
-/// as the target of the original branch or the fallthrough successor of the
-/// original block as determined by \p BranchToFallThrough. The branch
-/// conditions will be inverted according to \p InvertNewBranch and
-/// \p InvertOrigBranch. If an instruction that previously fed the branch is to
-/// be deleted, it is provided in \p MIToDelete and \p NewCond will be used as
-/// the branch condition. The branch probabilities will be set if the
-/// MachineBranchProbabilityInfo isn't null.
-static bool splitMBB(BlockSplitInfo &BSI) {
- assert(BSI.allInstrsInSameMBB() &&
- "All instructions must be in the same block.");
-
- MachineBasicBlock *ThisMBB = BSI.OrigBranch->getParent();
- MachineFunction *MF = ThisMBB->getParent();
- MachineRegisterInfo *MRI = &MF->getRegInfo();
- assert(MRI->isSSA() && "Can only do this while the function is in SSA form.");
- if (ThisMBB->succ_size() != 2) {
- DEBUG(dbgs() << "Don't know how to handle blocks that don't have exactly"
- << " two succesors.\n");
- return false;
- }
-
- const PPCInstrInfo *TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
- unsigned OrigBROpcode = BSI.OrigBranch->getOpcode();
- unsigned InvertedOpcode =
- OrigBROpcode == PPC::BC ? PPC::BCn :
- OrigBROpcode == PPC::BCn ? PPC::BC :
- OrigBROpcode == PPC::BCLR ? PPC::BCLRn : PPC::BCLR;
- unsigned NewBROpcode = BSI.InvertNewBranch ? InvertedOpcode : OrigBROpcode;
- MachineBasicBlock *OrigTarget = BSI.OrigBranch->getOperand(1).getMBB();
- MachineBasicBlock *OrigFallThrough =
- OrigTarget == *ThisMBB->succ_begin() ? *ThisMBB->succ_rbegin() :
- *ThisMBB->succ_begin();
- MachineBasicBlock *NewBRTarget =
- BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
- BranchProbability ProbToNewTarget =
- !BSI.MBPI ? BranchProbability::getUnknown() :
- BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
-
- // Create a new basic block.
- MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
- const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
- MachineFunction::iterator It = ThisMBB->getIterator();
- MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(++It, NewMBB);
-
- // Move everything after SplitBefore into the new block.
- NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
- NewMBB->transferSuccessors(ThisMBB);
-
- // Add the two successors to ThisMBB. The probabilities come from the
- // existing blocks if available.
- ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
- ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
-
- // Add the branches to ThisMBB.
- BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
- TII->get(NewBROpcode)).addReg(BSI.SplitCond->getOperand(0).getReg())
- .addMBB(NewBRTarget);
- BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
- TII->get(PPC::B)).addMBB(NewMBB);
- if (BSI.MIToDelete)
- BSI.MIToDelete->eraseFromParent();
-
- // Change the condition on the original branch and invert it if requested.
- auto FirstTerminator = NewMBB->getFirstTerminator();
- if (BSI.NewCond) {
- assert(FirstTerminator->getOperand(0).isReg() &&
- "Can't update condition of unconditional branch.");
- FirstTerminator->getOperand(0).setReg(BSI.NewCond->getOperand(0).getReg());
- }
- if (BSI.InvertOrigBranch)
- FirstTerminator->setDesc(TII->get(InvertedOpcode));
-
- // If any of the PHIs in the successors of NewMBB reference values that
- // now come from NewMBB, they need to be updated.
- for (auto *Succ : NewMBB->successors()) {
- updatePHIs(Succ, ThisMBB, NewMBB, MRI);
- }
- addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
-
- DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
- DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
- DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
- return true;
-}
-
-
-#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index a9b6073106ea..b14bbad2039a 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -45,6 +45,11 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// PEI.
bool MustSaveLR;
+ /// Do we have to disable shrink-wrapping? This has to be set if we emit any
+ /// instructions that clobber LR in the entry block because discovering this
+ /// in PEI is too late (happens after shrink-wrapping).
+ bool ShrinkWrapDisabled = false;
+
/// Does this function have any stack spills.
bool HasSpills = false;
@@ -147,6 +152,12 @@ public:
void setMustSaveLR(bool U) { MustSaveLR = U; }
bool mustSaveLR() const { return MustSaveLR; }
+ /// We certainly don't want to shrink wrap functions if we've emitted a
+ /// MovePCtoLR8, since that has to go into the entry block, which means the
+ /// prologue also has to go into the entry block.
+ void setShrinkWrapDisabled(bool U) { ShrinkWrapDisabled = U; }
+ bool shrinkWrapDisabled() const { return ShrinkWrapDisabled; }
+
void setHasSpills() { HasSpills = true; }
bool hasSpills() const { return HasSpills; }
@@ -185,11 +196,11 @@ public:
LiveInAttrs.push_back(std::make_pair(VReg, Flags));
}
- /// This function returns true if the spesified vreg is
+ /// This function returns true if the specified vreg is
/// a live-in register and sign-extended.
bool isLiveInSExt(unsigned VReg) const;
- /// This function returns true if the spesified vreg is
+ /// This function returns true if the specified vreg is
/// a live-in register and zero-extended.
bool isLiveInZExt(unsigned VReg) const;
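The new flag follows the usual per-function-info pattern: the code that emits the LR-clobbering instruction sets the flag, and whatever later decides on shrink-wrapping just queries it. A compressed sketch of that handshake, with the hypothetical names FunctionInfo and canShrinkWrap standing in for the real classes:

struct FunctionInfo {
  bool ShrinkWrapDisabled = false;   // set when an LR clobber is emitted
  void setShrinkWrapDisabled(bool U) { ShrinkWrapDisabled = U; }
  bool shrinkWrapDisabled() const { return ShrinkWrapDisabled; }
};

// If the entry block already had to contain an LR-clobbering instruction,
// the prologue must stay in the entry block, so shrink-wrapping is off.
static bool canShrinkWrap(const FunctionInfo &FI) {
  return !FI.shrinkWrapDisabled();
}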
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index d524c354ed35..1892d1e3dc26 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -67,8 +67,8 @@ namespace {
if (TII->convertToImmediateForm(MI, &DefMIToErase)) {
Changed = true;
NumRRConvertedInPreEmit++;
- DEBUG(dbgs() << "Converted instruction to imm form: ");
- DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Converted instruction to imm form: ");
+ LLVM_DEBUG(MI.dump());
if (DefMIToErase) {
InstrsToErase.push_back(DefMIToErase);
}
@@ -76,8 +76,8 @@ namespace {
}
}
for (MachineInstr *MI : InstrsToErase) {
- DEBUG(dbgs() << "PPC pre-emit peephole: erasing instruction: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "PPC pre-emit peephole: erasing instruction: ");
+ LLVM_DEBUG(MI->dump());
MI->eraseFromParent();
NumRemovedInPreEmit++;
}
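Note how the peephole above collects the defining instructions it wants to delete into InstrsToErase and only erases them once the scan of the block is finished; erasing mid-scan would invalidate the traversal. The same deferred-erase pattern in a self-contained form (toy Instr list instead of a MachineBasicBlock):

#include <cstddef>
#include <list>
#include <vector>

struct Instr { bool ConvertibleToImm = false; };

// Phase 1 scans and records; phase 2 erases after the scan, so the loop
// never walks over elements it has just removed.
static std::size_t deferredErase(std::list<Instr> &Block) {
  std::vector<std::list<Instr>::iterator> ToErase;
  for (auto It = Block.begin(); It != Block.end(); ++It)
    if (It->ConvertibleToImm)
      ToErase.push_back(It);
  for (auto It : ToErase)
    Block.erase(It);
  return ToErase.size();
}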
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/contrib/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 5b2d7191683c..173fc18b9ebf 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -15,18 +15,21 @@
//
//===---------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
#include "PPC.h"
+#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-reduce-cr-ops"
-#include "PPCMachineBasicBlockUtils.h"
STATISTIC(NumContainedSingleUseBinOps,
"Number of single-use binary CR logical ops contained in a block");
@@ -50,7 +53,177 @@ namespace llvm {
void initializePPCReduceCRLogicalsPass(PassRegistry&);
}
-namespace {
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for any incoming values in the PHIs that are supposed to
+/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
+/// Any such PHIs will be updated to reflect reality.
+static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
+ MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
+ for (auto &MI : Successor->instrs()) {
+ if (!MI.isPHI())
+ continue;
+ // This is a really ugly-looking loop, but it was pillaged directly from
+ // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+ for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == OrigMBB) {
+ // Check if the instruction is actually defined in NewMBB.
+ if (MI.getOperand(i - 1).isReg()) {
+ MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i - 1).getReg());
+ if (DefMI->getParent() == NewMBB ||
+ !OrigMBB->isSuccessor(Successor)) {
+ MO.setMBB(NewMBB);
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for PHIs that have an incoming value from \p OrigMBB
+/// and will add the same incoming value from \p NewMBB.
+/// NOTE: This should only be used if \p NewMBB is an immediate dominator of
+/// \p OrigMBB.
+static void addIncomingValuesToPHIs(MachineBasicBlock *Successor,
+ MachineBasicBlock *OrigMBB,
+ MachineBasicBlock *NewMBB,
+ MachineRegisterInfo *MRI) {
+ assert(OrigMBB->isSuccessor(NewMBB) &&
+ "NewMBB must be a successor of OrigMBB");
+ for (auto &MI : Successor->instrs()) {
+ if (!MI.isPHI())
+ continue;
+ // This is a really ugly-looking loop, but it was pillaged directly from
+ // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+ for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == OrigMBB) {
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+ MIB.addReg(MI.getOperand(i - 1).getReg()).addMBB(NewMBB);
+ break;
+ }
+ }
+ }
+}
+
+struct BlockSplitInfo {
+ MachineInstr *OrigBranch;
+ MachineInstr *SplitBefore;
+ MachineInstr *SplitCond;
+ bool InvertNewBranch;
+ bool InvertOrigBranch;
+ bool BranchToFallThrough;
+ const MachineBranchProbabilityInfo *MBPI;
+ MachineInstr *MIToDelete;
+ MachineInstr *NewCond;
+ bool allInstrsInSameMBB() {
+ if (!OrigBranch || !SplitBefore || !SplitCond)
+ return false;
+ MachineBasicBlock *MBB = OrigBranch->getParent();
+ if (SplitBefore->getParent() != MBB || SplitCond->getParent() != MBB)
+ return false;
+ if (MIToDelete && MIToDelete->getParent() != MBB)
+ return false;
+ if (NewCond && NewCond->getParent() != MBB)
+ return false;
+ return true;
+ }
+};
+
+/// Splits a MachineBasicBlock to branch before \p SplitBefore. The original
+/// branch is \p OrigBranch. The target of the new branch can either be the same
+/// as the target of the original branch or the fallthrough successor of the
+/// original block as determined by \p BranchToFallThrough. The branch
+/// conditions will be inverted according to \p InvertNewBranch and
+/// \p InvertOrigBranch. If an instruction that previously fed the branch is to
+/// be deleted, it is provided in \p MIToDelete and \p NewCond will be used as
+/// the branch condition. The branch probabilities will be set if the
+/// MachineBranchProbabilityInfo isn't null.
+static bool splitMBB(BlockSplitInfo &BSI) {
+ assert(BSI.allInstrsInSameMBB() &&
+ "All instructions must be in the same block.");
+
+ MachineBasicBlock *ThisMBB = BSI.OrigBranch->getParent();
+ MachineFunction *MF = ThisMBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ assert(MRI->isSSA() && "Can only do this while the function is in SSA form.");
+ if (ThisMBB->succ_size() != 2) {
+ LLVM_DEBUG(
+ dbgs() << "Don't know how to handle blocks that don't have exactly"
+ << " two successors.\n");
+ return false;
+ }
+
+ const PPCInstrInfo *TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+ unsigned OrigBROpcode = BSI.OrigBranch->getOpcode();
+ unsigned InvertedOpcode =
+ OrigBROpcode == PPC::BC
+ ? PPC::BCn
+ : OrigBROpcode == PPC::BCn
+ ? PPC::BC
+ : OrigBROpcode == PPC::BCLR ? PPC::BCLRn : PPC::BCLR;
+ unsigned NewBROpcode = BSI.InvertNewBranch ? InvertedOpcode : OrigBROpcode;
+ MachineBasicBlock *OrigTarget = BSI.OrigBranch->getOperand(1).getMBB();
+ MachineBasicBlock *OrigFallThrough = OrigTarget == *ThisMBB->succ_begin()
+ ? *ThisMBB->succ_rbegin()
+ : *ThisMBB->succ_begin();
+ MachineBasicBlock *NewBRTarget =
+ BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
+ BranchProbability ProbToNewTarget =
+ !BSI.MBPI ? BranchProbability::getUnknown()
+ : BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
+
+ // Create a new basic block.
+ MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
+ const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+ MachineFunction::iterator It = ThisMBB->getIterator();
+ MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(++It, NewMBB);
+
+ // Move everything after SplitBefore into the new block.
+ NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
+ NewMBB->transferSuccessors(ThisMBB);
+
+ // Add the two successors to ThisMBB. The probabilities come from the
+ // existing blocks if available.
+ ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
+ ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
+
+ // Add the branches to ThisMBB.
+ BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+ TII->get(NewBROpcode))
+ .addReg(BSI.SplitCond->getOperand(0).getReg())
+ .addMBB(NewBRTarget);
+ BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+ TII->get(PPC::B))
+ .addMBB(NewMBB);
+ if (BSI.MIToDelete)
+ BSI.MIToDelete->eraseFromParent();
+
+ // Change the condition on the original branch and invert it if requested.
+ auto FirstTerminator = NewMBB->getFirstTerminator();
+ if (BSI.NewCond) {
+ assert(FirstTerminator->getOperand(0).isReg() &&
+ "Can't update condition of unconditional branch.");
+ FirstTerminator->getOperand(0).setReg(BSI.NewCond->getOperand(0).getReg());
+ }
+ if (BSI.InvertOrigBranch)
+ FirstTerminator->setDesc(TII->get(InvertedOpcode));
+
+ // If any of the PHIs in the successors of NewMBB reference values that
+ // now come from NewMBB, they need to be updated.
+ for (auto *Succ : NewMBB->successors()) {
+ updatePHIs(Succ, ThisMBB, NewMBB, MRI);
+ }
+ addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
+
+ LLVM_DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
+ LLVM_DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
+ LLVM_DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
+ return true;
+}
static bool isBinary(MachineInstr &MI) {
return MI.getNumOperands() == 3;
@@ -149,6 +322,8 @@ computeBranchTargetAndInversion(unsigned CROp, unsigned BROp, bool UsingDef1,
llvm_unreachable("Don't know how to handle this branch.");
}
+namespace {
+
class PPCReduceCRLogicals : public MachineFunctionPass {
public:
@@ -317,7 +492,7 @@ PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) {
Ret.ContainedInBlock &=
(MIParam.getParent() == Ret.TrueDefs.second->getParent());
}
- DEBUG(Ret.dump());
+ LLVM_DEBUG(Ret.dump());
if (Ret.IsBinary && Ret.ContainedInBlock && Ret.SingleUse) {
NumContainedSingleUseBinOps++;
if (Ret.FeedsBR && Ret.DefsSingleUse)
@@ -326,7 +501,7 @@ PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) {
return Ret;
}
-/// Looks trhough a COPY instruction to the actual definition of the CR-bit
+/// Looks through a COPY instruction to the actual definition of the CR-bit
/// register and returns the instruction that defines it.
/// FIXME: This currently handles what is by-far the most common case:
/// an instruction that defines a CR field followed by a single copy of a bit
@@ -411,14 +586,15 @@ bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) {
/// BC %vr9<kill>, <BB#2>; CRBITRC:%vr9
bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
if (CRI.CopyDefs.first == CRI.CopyDefs.second) {
- DEBUG(dbgs() << "Unable to split as the two operands are the same\n");
+ LLVM_DEBUG(dbgs() << "Unable to split as the two operands are the same\n");
NumNotSplitIdenticalOperands++;
return false;
}
if (CRI.TrueDefs.first->isCopy() || CRI.TrueDefs.second->isCopy() ||
CRI.TrueDefs.first->isPHI() || CRI.TrueDefs.second->isPHI()) {
- DEBUG(dbgs() << "Unable to split because one of the operands is a PHI or "
- "chain of copies.\n");
+ LLVM_DEBUG(
+ dbgs() << "Unable to split because one of the operands is a PHI or "
+ "chain of copies.\n");
NumNotSplitChainCopies++;
return false;
}
@@ -429,11 +605,11 @@ bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
CRI.MI->getOpcode() != PPC::CRNAND &&
CRI.MI->getOpcode() != PPC::CRORC &&
CRI.MI->getOpcode() != PPC::CRANDC) {
- DEBUG(dbgs() << "Unable to split blocks on this opcode.\n");
+ LLVM_DEBUG(dbgs() << "Unable to split blocks on this opcode.\n");
NumNotSplitWrongOpcode++;
return false;
}
- DEBUG(dbgs() << "Splitting the following CR op:\n"; CRI.dump());
+ LLVM_DEBUG(dbgs() << "Splitting the following CR op:\n"; CRI.dump());
MachineBasicBlock::iterator Def1It = CRI.TrueDefs.first;
MachineBasicBlock::iterator Def2It = CRI.TrueDefs.second;
@@ -447,9 +623,9 @@ bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
}
}
- DEBUG(dbgs() << "We will split the following block:\n";);
- DEBUG(CRI.MI->getParent()->dump());
- DEBUG(dbgs() << "Before instruction:\n"; SplitBefore->dump());
+ LLVM_DEBUG(dbgs() << "We will split the following block:\n";);
+ LLVM_DEBUG(CRI.MI->getParent()->dump());
+ LLVM_DEBUG(dbgs() << "Before instruction:\n"; SplitBefore->dump());
// Get the branch instruction.
MachineInstr *Branch =
@@ -482,10 +658,11 @@ bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
TargetIsFallThrough);
MachineInstr *SplitCond =
UsingDef1 ? CRI.CopyDefs.second : CRI.CopyDefs.first;
- DEBUG(dbgs() << "We will " << (InvertNewBranch ? "invert" : "copy"));
- DEBUG(dbgs() << " the original branch and the target is the " <<
- (TargetIsFallThrough ? "fallthrough block\n" : "orig. target block\n"));
- DEBUG(dbgs() << "Original branch instruction: "; Branch->dump());
+ LLVM_DEBUG(dbgs() << "We will " << (InvertNewBranch ? "invert" : "copy"));
+ LLVM_DEBUG(dbgs() << " the original branch and the target is the "
+ << (TargetIsFallThrough ? "fallthrough block\n"
+ : "orig. target block\n"));
+ LLVM_DEBUG(dbgs() << "Original branch instruction: "; Branch->dump());
BlockSplitInfo BSI { Branch, SplitBefore, SplitCond, InvertNewBranch,
InvertOrigBranch, TargetIsFallThrough, MBPI, CRI.MI,
UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second };
@@ -522,7 +699,7 @@ void PPCReduceCRLogicals::collectCRLogicals() {
}
}
-} // end annonymous namespace
+} // end anonymous namespace
INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE,
"PowerPC Reduce CR logical Operation", false, false)
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 6b62a82ef7bf..6647ceace5eb 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -65,6 +65,12 @@ static cl::opt<bool>
EnableGPRToVecSpills("ppc-enable-gpr-to-vsr-spills", cl::Hidden, cl::init(false),
cl::desc("Enable spills from gpr to vsr rather than stack"));
+static cl::opt<bool>
+StackPtrConst("ppc-stack-ptr-caller-preserved",
+ cl::desc("Consider R1 caller preserved so stack saves of "
+ "caller preserved registers can be LICM candidates"),
+ cl::init(true), cl::Hidden);
+
PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
: PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
TM.isPPC64() ? 0 : 1,
@@ -100,6 +106,12 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
ImmToIdxMap[PPC::STXV] = PPC::STXVX;
ImmToIdxMap[PPC::STXSD] = PPC::STXSDX;
ImmToIdxMap[PPC::STXSSP] = PPC::STXSSPX;
+
+ // SPE
+ ImmToIdxMap[PPC::EVLDD] = PPC::EVLDDX;
+ ImmToIdxMap[PPC::EVSTDD] = PPC::EVSTDDX;
+ ImmToIdxMap[PPC::SPESTW] = PPC::SPESTWX;
+ ImmToIdxMap[PPC::SPELWZ] = PPC::SPELWZX;
}
/// getPointerRegClass - Return the register class to use to hold pointers.
@@ -141,9 +153,23 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
return CSR_SRV464_TLS_PE_SaveList;
+ if (Subtarget.hasSPE())
+ return CSR_SVR432_SPE_SaveList;
+
// On PPC64, we might need to save r2 (but only if it is not reserved).
bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+ if (MF->getFunction().getCallingConv() == CallingConv::Cold) {
+ return TM.isPPC64()
+ ? (Subtarget.hasAltivec()
+ ? (SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList
+ : CSR_SVR64_ColdCC_Altivec_SaveList)
+ : (SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList
+ : CSR_SVR64_ColdCC_SaveList))
+ : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_SaveList
+ : CSR_SVR32_ColdCC_SaveList);
+ }
+
return TM.isPPC64()
? (Subtarget.hasAltivec()
? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
@@ -196,6 +222,13 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
: (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
: CSR_Darwin32_RegMask);
+ if (CC == CallingConv::Cold) {
+ return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask
+ : CSR_SVR64_ColdCC_RegMask)
+ : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_RegMask
+ : CSR_SVR32_ColdCC_RegMask);
+ }
+
return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
: CSR_SVR464_RegMask)
: (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
@@ -286,15 +319,26 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg,
const MachineFunction &MF) const {
assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
- if (TM.isELFv2ABI() && PhysReg == PPC::X2) {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!TM.isPPC64())
+ return false;
+
+ if (!Subtarget.isSVR4ABI())
+ return false;
+ if (PhysReg == PPC::X2)
// X2 is guaranteed to be preserved within a function if it is reserved.
// The reason it's reserved is that it's the TOC pointer (and the function
// uses the TOC). In functions where it isn't reserved (i.e. leaf functions
// with no TOC access), we can't claim that it is preserved.
return (getReservedRegs(MF).test(PPC::X2));
- } else {
- return false;
- }
+ if (StackPtrConst && (PhysReg == PPC::X1) && !MFI.hasVarSizedObjects()
+ && !MFI.hasOpaqueSPAdjustment())
+ // The value of the stack pointer does not change within a function after
+ // the prologue and before the epilogue if there are no dynamic allocations
+ // and no inline asm which clobbers X1.
+ return true;
+ return false;
}
unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
@@ -307,6 +351,8 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 0;
case PPC::G8RC_NOX0RegClassID:
case PPC::GPRC_NOR0RegClassID:
+ case PPC::SPERCRegClassID:
+ case PPC::SPE4RCRegClassID:
case PPC::G8RCRegClassID:
case PPC::GPRCRegClassID: {
unsigned FP = TFI->hasFP(MF) ? 1 : 0;
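The rewritten isCallerPreservedPhysReg is essentially a chain of cheap predicates: only 64-bit SVR4 qualifies, X2 is preserved exactly when it is reserved (i.e. it is the live TOC pointer), and X1 is treated as preserved when the option is on and the frame can neither grow dynamically nor be adjusted behind the compiler's back. The decision logic, distilled into a hypothetical standalone predicate over plain booleans for illustration:

// Each parameter mirrors one of the queries made in the real function.
static bool isCallerPreserved(bool IsPPC64, bool IsSVR4, bool IsX2,
                              bool X2Reserved, bool IsX1, bool OptEnabled,
                              bool HasVarSizedObjects, bool HasOpaqueSPAdj) {
  if (!IsPPC64 || !IsSVR4)
    return false;
  if (IsX2)
    return X2Reserved;               // reserved X2 == live TOC pointer
  if (IsX1)
    return OptEnabled && !HasVarSizedObjects && !HasOpaqueSPAdj;
  return false;
}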
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 0bbb71fdf9fb..91a98ee4efc7 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -85,6 +85,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
/// We require the register scavenger.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
return true;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index f7807907bd64..0e641cf9e00a 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -38,6 +38,13 @@ class GP8<GPR SubReg, string n> : PPCReg<n> {
let SubRegIndices = [sub_32];
}
+// SPE - One of the 32 64-bit general-purpose registers (SPE)
+class SPE<GPR SubReg, string n> : PPCReg<n> {
+ let HWEncoding = SubReg.HWEncoding;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_32];
+}
+
// SPR - One of the 32-bit special-purpose registers
class SPR<bits<10> num, string n> : PPCReg<n> {
let HWEncoding{9-0} = num;
@@ -100,6 +107,12 @@ foreach Index = 0-31 in {
DwarfRegNum<[Index, -2]>;
}
+// SPE registers
+foreach Index = 0-31 in {
+ def S#Index : SPE<!cast<GPR>("R"#Index), "r"#Index>,
+ DwarfRegNum<[!add(Index, 1200), !add(Index, 1200)]>;
+}
+
// Floating-point registers
foreach Index = 0-31 in {
def F#Index : FPR<Index, "f"#Index>,
@@ -208,10 +221,20 @@ def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
// VRsave register
def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
+// SPE extra registers
+// SPE Accumulator for multiply-accumulate SPE operations. Never directly
+// accessed, so there's no real encoding for it.
+def SPEACC: DwarfRegNum<[99, 111]>;
+def SPEFSCR: SPR<512, "spefscr">, DwarfRegNum<[612, 112]>;
+
+def XER: SPR<1, "xer">, DwarfRegNum<[76]>;
+
// Carry bit. In the architecture this is really bit 0 of the XER register
// (which really is SPR register 1); this is the only bit interesting to a
// compiler.
-def CARRY: SPR<1, "ca">, DwarfRegNum<[76]>;
+def CARRY: SPR<1, "xer">, DwarfRegNum<[76]> {
+ let Aliases = [XER];
+}
// FP rounding mode: bits 30 and 31 of the FP status and control register
// This is not allocated as a normal register; it appears only in
@@ -272,6 +295,12 @@ def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
}];
}
+def SPERC : RegisterClass<"PPC", [f64], 64, (add (sequence "S%u", 2, 12),
+ (sequence "S%u", 30, 13),
+ S31, S0, S1)>;
+
+def SPE4RC : RegisterClass<"PPC", [f32], 32, (add GPRC)>;
+
// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
// ABI the size of the Floating-point register save area is determined by the
// allocated non-volatile register with the lowest register number, as FP
@@ -283,7 +312,9 @@ def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
(sequence "F%u", 31, 14))>;
def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
-def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64], 128,
+def VRRC : RegisterClass<"PPC",
+ [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64, f128],
+ 128,
(add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
@@ -351,7 +382,7 @@ def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
}
def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
-def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> {
let CopyCost = -1;
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
index d240529bc731..5ad0a517c117 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -87,6 +87,8 @@ def IIC_SprMTSRIN : InstrItinClass;
def IIC_SprRFI : InstrItinClass;
def IIC_SprSC : InstrItinClass;
def IIC_FPGeneral : InstrItinClass;
+def IIC_FPDGeneral : InstrItinClass;
+def IIC_FPSGeneral : InstrItinClass;
def IIC_FPAddSub : InstrItinClass;
def IIC_FPCompare : InstrItinClass;
def IIC_FPDivD : InstrItinClass;
@@ -133,5 +135,6 @@ include "PPCScheduleP7.td"
include "PPCScheduleP8.td"
include "PPCScheduleP9.td"
include "PPCScheduleA2.td"
+include "PPCScheduleE500.td"
include "PPCScheduleE500mc.td"
include "PPCScheduleE5500.td"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500.td
new file mode 100644
index 000000000000..d7c2bd15a258
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500.td
@@ -0,0 +1,274 @@
+//===-- PPCScheduleE500.td - e500 Scheduling Defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500 32-bit
+// Power processor.
+//
+// All information is derived from the "e500 Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500 core:
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+// queues (GIQx) or Branch issue queue (BIQ).
+def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1
+def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+// * Execute
+// 5 pipelined execution units: SU0, SU1, BU, LSU, MU.
+// Some instructions can only execute in SU0 but not SU1.
+def E500_SU0 : FuncUnit; // Simple unit 0
+def E500_SU1 : FuncUnit; // Simple unit 1
+def E500_BU : FuncUnit; // Branch unit
+def E500_MU : FuncUnit; // MU pipeline
+def E500_LSU_0 : FuncUnit; // LSU pipeline
+
+def E500_GPR_Bypass : Bypass;
+def E500_CR_Bypass : Bypass;
+def E500_DivBypass : Bypass;
+
+def PPCE500Itineraries : ProcessorItineraries<
+ [E500_DIS0, E500_DIS1, E500_SU0, E500_SU1, E500_BU,
+ E500_MU, E500_LSU_0],
+ [E500_CR_Bypass, E500_GPR_Bypass, E500_DivBypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass,
+ E500_CR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [5, 1, 1], // Latency = 1 or 2
+ [E500_CR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_MU], 0>,
+ InstrStage<14, [E500_MU]>],
+ [17, 1, 1], // Latency=4..35, Repeat= 4..35
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SU0]>],
+ [5, 1], // Latency = 2, Repeat rate = 2
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass,
+ E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3, Repeat rate = 1
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1], // Latency = r+3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<3, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3, Repeat rate = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1],
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SU0, E500_SU1]>],
+ [5, 1], // Latency = 2, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [5, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SU0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SU0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMTPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SU0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0, E500_SU1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [4, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_FPDGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<6, [E500_MU]>],
+ [9, 1, 1], // Latency = 6, Repeat rate = 1
+ [NoBypass]>,
+ InstrItinData<IIC_FPSGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [NoBypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<32, [E500_MU]>],
+ [35, 1, 1], // Latency = 32, Repeat rate = 32
+ [E500_DivBypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<29, [E500_MU]>],
+ [32, 1, 1], // Latency = 29, Repeat rate = 29
+ [E500_DivBypass]>,
+ InstrItinData<IIC_VecGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SU0]>],
+ [4, 1, 1], // Latency = 1, Repeat rate = 1
+ [NoBypass]>,
+ InstrItinData<IIC_VecComplex, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_MU]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [NoBypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let CompleteModel = 0;
+
+ let Itineraries = PPCE500Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
index 15d5991b938c..5f95f2a79f66 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -19,299 +19,299 @@
// * Decode & Dispatch
// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
-def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1
-def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
+def E500mc_DIS0 : FuncUnit; // Dispatch stage - insn 1
+def E500mc_DIS1 : FuncUnit; // Dispatch stage - insn 2
// * Execute
// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
// Some instructions can only execute in SFX0 but not SFX1.
// The CFX has a bypass path, allowing non-divide instructions to execute
// while a divide instruction is executed.
-def E500_SFX0 : FuncUnit; // Simple unit 0
-def E500_SFX1 : FuncUnit; // Simple unit 1
-def E500_BU : FuncUnit; // Branch unit
-def E500_CFX_DivBypass
+def E500mc_SFX0 : FuncUnit; // Simple unit 0
+def E500mc_SFX1 : FuncUnit; // Simple unit 1
+def E500mc_BU : FuncUnit; // Branch unit
+def E500mc_CFX_DivBypass
: FuncUnit; // CFX divide bypass path
-def E500_CFX_0 : FuncUnit; // CFX pipeline
-def E500_LSU_0 : FuncUnit; // LSU pipeline
-def E500_FPU_0 : FuncUnit; // FPU pipeline
+def E500mc_CFX_0 : FuncUnit; // CFX pipeline
+def E500mc_LSU_0 : FuncUnit; // LSU pipeline
+def E500mc_FPU_0 : FuncUnit; // FPU pipeline
-def E500_GPR_Bypass : Bypass;
-def E500_FPR_Bypass : Bypass;
-def E500_CR_Bypass : Bypass;
+def E500mc_GPR_Bypass : Bypass;
+def E500mc_FPR_Bypass : Bypass;
+def E500mc_CR_Bypass : Bypass;
def PPCE500mcItineraries : ProcessorItineraries<
- [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass,
- E500_CFX_0, E500_LSU_0, E500_FPU_0],
- [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [
- InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_DIS0, E500mc_DIS1, E500mc_SFX0, E500mc_SFX1, E500mc_BU, E500mc_CFX_DivBypass,
+ E500mc_CFX_0, E500mc_LSU_0, E500mc_FPU_0],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass, E500mc_FPR_Bypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntISEL, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntISEL, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass,
- E500_CR_Bypass]>,
- InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass,
+ E500mc_CR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[5, 1, 1], // Latency = 1 or 2
- [E500_CR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0], 0>,
- InstrStage<14, [E500_CFX_DivBypass]>],
+ [E500mc_CR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0], 0>,
+ InstrStage<14, [E500mc_CFX_DivBypass]>],
[17, 1, 1], // Latency=4..35, Repeat= 4..35
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<8, [E500_FPU_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<8, [E500mc_FPU_0]>],
[11], // Latency = 8
- [E500_FPR_Bypass]>,
- InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<8, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<8, [E500mc_FPU_0]>],
[11, 1, 1], // Latency = 8
[NoBypass, NoBypass, NoBypass]>,
- InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0]>],
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0]>],
[7, 1, 1], // Latency = 4, Repeat rate = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0]>],
[7, 1, 1], // Latency = 4, Repeat rate = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_CFX_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_CFX_0]>],
[7, 1, 1], // Latency = 4, Repeat rate = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_SFX0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_SFX0]>],
[5, 1], // Latency = 2, Repeat rate = 2
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_BU]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_BU]>],
[4, 1], // Latency = 1
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_BU]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_BU]>],
[4, 1, 1], // Latency = 1
- [E500_CR_Bypass,
- E500_CR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_BU]>],
+ [E500mc_CR_Bypass,
+ E500mc_CR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_BU]>],
[4, 1], // Latency = 1
- [E500_CR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_CR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1, 1], // Latency = 1
- [E500_CR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3, Repeat rate = 1
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass],
+ [NoBypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1, 1], // Latency = 3
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1, 1], // Latency = 3
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1, 1], // Latency = 4
- [E500_FPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1, 1], // Latency = 4
- [E500_FPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_FPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1, 1], // Latency = 4
- [E500_FPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass],
+ [E500mc_FPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass],
2>, // 2 micro-ops
- InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[7, 1], // Latency = r+3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<3, [E500_LSU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<3, [E500mc_LSU_0]>],
[6, 1, 1], // Latency = 3, Repeat rate = 3
- [E500_GPR_Bypass,
- E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>],
+ [E500mc_GPR_Bypass,
+ E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>],
[6, 1], // Latency = 3
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0]>]>,
- InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1],
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_SFX0, E500mc_SFX1]>],
[5, 1], // Latency = 2, Repeat rate = 4
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0]>],
[5, 1],
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_LSU_0], 0>]>,
- InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<5, [E500_SFX0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<5, [E500mc_SFX0]>],
[8, 1],
- [E500_GPR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<5, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<5, [E500mc_SFX0]>],
[8, 1],
- [E500_GPR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_SprMFPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_SprMFPMR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1], // Latency = 4, Repeat rate = 4
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1], // Latency = 4, Repeat rate = 4
- [E500_GPR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [E500mc_GPR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1], // Latency = 1, Repeat rate = 1
- [E500_GPR_Bypass, E500_CR_Bypass]>,
- InstrItinData<IIC_SprMTPMR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0]>],
+ [E500mc_GPR_Bypass, E500mc_CR_Bypass]>,
+ InstrItinData<IIC_SprMTPMR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0]>],
[4, 1], // Latency = 1, Repeat rate = 1
- [E500_CR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_SFX0]>],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_SFX0]>],
[7, 1], // Latency = 4, Repeat rate = 4
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0, E500mc_SFX1]>],
[4, 1], // Latency = 1, Repeat rate = 1
- [E500_CR_Bypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<1, [E500_SFX0]>],
+ [E500mc_CR_Bypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<1, [E500mc_SFX0]>],
[4, 1],
- [NoBypass, E500_GPR_Bypass]>,
- InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_FPU_0]>],
+ [NoBypass, E500mc_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_FPU_0]>],
[11, 1, 1], // Latency = 8, Repeat rate = 2
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_FPU_0]>],
[13, 1, 1], // Latency = 10, Repeat rate = 4
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<2, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<2, [E500mc_FPU_0]>],
[11, 1, 1], // Latency = 8, Repeat rate = 2
- [E500_CR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPDivD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<68, [E500_FPU_0]>],
+ [E500mc_CR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<68, [E500mc_FPU_0]>],
[71, 1, 1], // Latency = 68, Repeat rate = 68
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPDivS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<38, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<38, [E500mc_FPU_0]>],
[41, 1, 1], // Latency = 38, Repeat rate = 38
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass]>,
- InstrItinData<IIC_FPFused, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<4, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<4, [E500mc_FPU_0]>],
[13, 1, 1, 1], // Latency = 10, Repeat rate = 4
- [E500_FPR_Bypass,
- E500_FPR_Bypass, E500_FPR_Bypass,
- E500_FPR_Bypass]>,
- InstrItinData<IIC_FPRes, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
- InstrStage<38, [E500_FPU_0]>],
+ [E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass, E500mc_FPR_Bypass,
+ E500mc_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+ InstrStage<38, [E500mc_FPU_0]>],
[41, 1], // Latency = 38, Repeat rate = 38
- [E500_FPR_Bypass, E500_FPR_Bypass]>
+ [E500mc_FPR_Bypass, E500mc_FPR_Bypass]>
]>;
// ===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
index b24f4fc603a1..e1a480117315 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -13,18 +13,31 @@
include "PPCInstrInfo.td"
def P9Model : SchedMachineModel {
+ // The maximum number of instructions to be issued at the same time.
+ // A value of 8 is technically correct since 8 instructions can be fetched
+ // from the instruction cache; however, only 6 instructions may actually be
+ // dispatched at a time.
let IssueWidth = 8;
+ // Load latency is 4 or 5 cycles depending on the load. This latency assumes
+ // that we have a cache hit. For a cache miss, the load latency will be higher.
+ // There are two instructions (lxvl, lxvll) that have a latency of 6 cycles.
+ // However, it is not worth bumping this value up to 6 when the vast majority
+ // of instructions take 4 or 5 cycles.
let LoadLatency = 5;
+ // A total of 16 cycles to recover from a branch mispredict.
let MispredictPenalty = 16;
// Try to make sure we have at least 10 dispatch groups in a loop.
+ // A dispatch group is 6 instructions.
let LoopMicroOpBufferSize = 60;
let CompleteModel = 1;
- let UnsupportedFeatures = [HasQPX];
+ // Do not support QPX (Quad Processing eXtension) or SPE (Signal Processing
+ // Engine) on Power 9.
+ let UnsupportedFeatures = [HasQPX, HasSPE];
}
@@ -36,6 +49,12 @@ let SchedModel = P9Model in {
def DISPATCHER : ProcResource<12>;
// Issue Ports
+ // An instruction can go down one of two issue queues.
+ // Address Generation (AGEN) mainly for loads and stores.
+ // Execution (EXEC) for most other instructions.
+ // Some instructions cannot be run on just any issue queue and may require an
+ // Even or an Odd queue. The EXECE represents the even queues and the EXECO
+ // represents the odd queues.
def IP_AGEN : ProcResource<4>;
def IP_EXEC : ProcResource<4>;
def IP_EXECE : ProcResource<2> {
@@ -48,6 +67,7 @@ let SchedModel = P9Model in {
}
// Pipeline Groups
+ // Four ALU (Fixed Point Arithmetic) units in total: two even, two odd.
def ALU : ProcResource<4>;
def ALUE : ProcResource<2> {
//Even ALU pipelines
@@ -57,7 +77,11 @@ let SchedModel = P9Model in {
//Odd ALU pipelines
let Super = ALU;
}
+
+ // Two DIV (Fixed Point Divide) units.
def DIV : ProcResource<2>;
+
+ // Four DP (Floating Point) units in total: two even, two odd.
def DP : ProcResource<4>;
def DPE : ProcResource<2> {
//Even DP pipelines
@@ -67,15 +91,23 @@ let SchedModel = P9Model in {
//Odd DP pipelines
let Super = DP;
}
+
+ // Four LS (Load or Store) units.
def LS : ProcResource<4>;
+
+ // Two PM (Permute) units.
def PM : ProcResource<2>;
+
+ // Only one DFU (Decimal Floating Point and Quad Precision) unit.
def DFU : ProcResource<1>;
+
+ // Only one Branch unit.
def BR : ProcResource<1> {
let BufferSize = 16;
}
- def CY : ProcResource<1>;
- def TestGroup : ProcResGroup<[ALU, DP]>;
+ // Only one CY (Crypto) unit.
+ def CY : ProcResource<1>;
// ***************** SchedWriteRes Definitions *****************
@@ -107,6 +139,11 @@ let SchedModel = P9Model in {
}
//Pipeline Groups
+
+ // ALU Units
+ // An ALU may take either 2 or 3 cycles to complete the operation.
+ // However, the ALU unit is only ever busy for 1 cycle at a time and may
+ // receive new instructions each cycle.
def P9_ALU_2C : SchedWriteRes<[ALU]> {
let Latency = 2;
}
@@ -131,26 +168,13 @@ let SchedModel = P9Model in {
let Latency = 3;
}
- def P9_ALU_4C : SchedWriteRes<[ALU]> {
- let Latency = 4;
- }
-
- def P9_ALUE_4C : SchedWriteRes<[ALUE]> {
- let Latency = 4;
- }
-
- def P9_ALUO_4C : SchedWriteRes<[ALUO]> {
- let Latency = 4;
- }
-
- def P9_ALU_5C : SchedWriteRes<[ALU]> {
+ // DIV Unit
+ // A DIV unit may take from 5 to 40 cycles to complete.
+ // Some DIV operations may keep the unit busy for up to 8 cycles.
+ def P9_DIV_5C : SchedWriteRes<[DIV]> {
let Latency = 5;
}
- def P9_ALU_6C : SchedWriteRes<[ALU]> {
- let Latency = 6;
- }
-
def P9_DIV_12C : SchedWriteRes<[DIV]> {
let Latency = 12;
}
@@ -170,6 +194,9 @@ let SchedModel = P9Model in {
let Latency = 40;
}
+ // DP Unit
+ // A DP unit may take from 2 to 36 cycles to complete.
+ // Some DP operations keep the unit busy for up to 10 cycles.
def P9_DP_2C : SchedWriteRes<[DP]> {
let Latency = 2;
}
@@ -220,6 +247,16 @@ let SchedModel = P9Model in {
let Latency = 27;
}
+ def P9_DPE_27C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 27;
+ }
+
+ def P9_DPO_27C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 27;
+ }
+
def P9_DP_33C_8 : SchedWriteRes<[DP]> {
let ResourceCycles = [8];
let Latency = 33;
@@ -240,14 +277,28 @@ let SchedModel = P9Model in {
let Latency = 36;
}
- def P9_PM_3C : SchedWriteRes<[PM]> {
- let Latency = 3;
+ def P9_DPE_36C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 36;
}
- def P9_PM_7C : SchedWriteRes<[PM]> {
+ def P9_DPO_36C_10 : SchedWriteRes<[DP]> {
+ let ResourceCycles = [10];
+ let Latency = 36;
+ }
+
+ // PM Unit
+ // Three cycle permute operations.
+ def P9_PM_3C : SchedWriteRes<[PM]> {
let Latency = 3;
}
+ // Load and Store Units
+ // Loads can have 4, 5 or 6 cycles of latency.
+ // Stores are listed as having a single cycle of latency. This is not
+ // completely accurate since it takes more than 1 cycle to actually store
+ // the value. However, since the store does not produce a result it can be
+ // considered complete after one cycle.
def P9_LS_1C : SchedWriteRes<[LS]> {
let Latency = 1;
}
@@ -260,25 +311,44 @@ let SchedModel = P9Model in {
let Latency = 5;
}
+ def P9_LS_6C : SchedWriteRes<[LS]> {
+ let Latency = 6;
+ }
+
+ // DFU Unit
+ // Some of the most expensive ops use the DFU.
+ // It can take from 12 to 76 cycles to obtain a result.
+ // The unit may be busy for up to 62 cycles.
def P9_DFU_12C : SchedWriteRes<[DFU]> {
let Latency = 12;
}
+ def P9_DFU_23C : SchedWriteRes<[DFU]> {
+ let Latency = 23;
+ let ResourceCycles = [11];
+ }
+
def P9_DFU_24C : SchedWriteRes<[DFU]> {
let Latency = 24;
let ResourceCycles = [12];
}
+ def P9_DFU_37C : SchedWriteRes<[DFU]> {
+ let Latency = 37;
+ let ResourceCycles = [25];
+ }
+
def P9_DFU_58C : SchedWriteRes<[DFU]> {
let Latency = 58;
let ResourceCycles = [44];
}
- def P9_DFU_76C : SchedWriteRes<[TestGroup, DFU]> {
+ def P9_DFU_76C : SchedWriteRes<[DFU]> {
let Latency = 76;
let ResourceCycles = [62];
}
+ // 2 or 5 cycle latencies for the branch unit.
def P9_BR_2C : SchedWriteRes<[BR]> {
let Latency = 2;
}
@@ -287,138 +357,43 @@ let SchedModel = P9Model in {
let Latency = 5;
}
+ // 6 cycle latency for the crypto unit
def P9_CY_6C : SchedWriteRes<[CY]> {
let Latency = 6;
}
// ***************** WriteSeq Definitions *****************
+ // These are combinations of the resources listed above.
+ // The idea is that some cracked instructions cannot be done in parallel and
+ // so the latencies for their resources must be added.
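// For example (illustrative, using the existing definitions below):
// P9_LoadAndALUOp_7C chains P9_LS_5C (5 cycles) with P9_ALU_2C (2 cycles),
// so the cracked load-then-ALU operation gets a combined latency of
// 5 + 2 = 7 cycles.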
def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
+ def P9_LoadAndALU2Op_7C : WriteSequence<[P9_LS_4C, P9_ALU_3C]>;
+ def P9_LoadAndALU2Op_8C : WriteSequence<[P9_LS_5C, P9_ALU_3C]>;
def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>;
+ def P9_IntDivAndALUOp_18C_8 : WriteSequence<[P9_DIV_16C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
+ def P9_StoreAndALUOp_3C : WriteSequence<[P9_LS_1C, P9_ALU_2C]>;
def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
+ def P9_ALU2OpAndALU2Op_6C : WriteSequence<[P9_ALU_3C, P9_ALU_3C]>;
+ def P9_ALUOpAndALUOpAndALUOp_6C :
+ WriteSequence<[P9_ALU_2C, P9_ALU_2C, P9_ALU_2C]>;
+ def P9_DPOpAndALUOp_7C : WriteSequence<[P9_DP_5C, P9_ALU_2C]>;
def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>;
+ def P9_DPOpAndALU2Op_10C : WriteSequence<[P9_DP_7C, P9_ALU_3C]>;
def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>;
def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>;
+ def P9_DPOpAndALU2Op_25C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_3C]>;
+ def P9_DPOpAndALU2Op_29C_5 : WriteSequence<[P9_DP_26C_5, P9_ALU_3C]>;
+ def P9_DPOpAndALU2Op_36C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_3C]>;
+ def P9_DPOpAndALU2Op_39C_10 : WriteSequence<[P9_DP_36C_10, P9_ALU_3C]>;
+ def P9_BROpAndALUOp_7C : WriteSequence<[P9_BR_5C, P9_ALU_2C]>;
- // ***************** Defining Itinerary Class Resources *****************
-
- // The following itineraries are fully covered by the InstRW definitions in
- // P9InstrResources.td so aren't listed here.
- // IIC_FPDivD, IIC_FPDivS, IIC_FPFused, IIC_IntDivD, IIC_LdStLFDU,
- // IIC_LdStLFDUX
-
- def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_IntSimple, IIC_IntGeneral, IIC_IntRFID,
- IIC_IntRotateD, IIC_IntRotateDI, IIC_IntTrapD,
- IIC_SprRFI]>;
-
- def : ItinRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_IntTrapW]>;
-
- def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
-
- def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>;
-
- def : ItinRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C], [IIC_VecGeneral, IIC_FPCompare]>;
-
- def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI, IIC_IntMulHD]>;
-
- def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoad, IIC_LdStLD, IIC_LdStLFD]>;
-
- def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoadUpd, IIC_LdStLDU]>;
-
- def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoadUpdX, IIC_LdStLDUX]>;
-
- def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTFDU]>;
-
- def : ItinRW<[P9_LoadAndALUOp_6C,
- IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLHA, IIC_LdStLWA]>;
-
- def : ItinRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
- IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStLHAU, IIC_LdStLHAUX]>;
-
- // IIC_LdStLMW contains two microcoded insns. This is not accurate, but
- // those insns are not used that much, if at all.
- def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>;
-
- def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStCOPY, IIC_SprABORT, IIC_LdStPASTE, IIC_LdStDCBF,
- IIC_LdStICBI, IIC_LdStSync, IIC_SprISYNC, IIC_SprMSGSYNC,
- IIC_SprSLBIA, IIC_SprSLBSYNC, IIC_SprTLBSYNC]>;
-
- def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>;
-
- def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTDU, IIC_LdStSTDUX, IIC_LdStStoreUpd, IIC_SprSLBIEG,
- IIC_SprTLBIA, IIC_SprTLBIE]>;
-
- def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTDCX, IIC_LdStSTWCX]>;
-
- def : ItinRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_BrCR, IIC_IntMTFSB0]>;
-
- def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_SprMFCR, IIC_SprMFCRF, IIC_BrMCR, IIC_BrMCRX, IIC_IntMFFS]>;
-
- def : ItinRW<[P9_BR_2C, DISP_1C], [IIC_BrB]>;
- def : ItinRW<[P9_BR_5C, DISP_1C], [IIC_SprMFSPR]>;
-
- // This class should be broken down to instruction level, once some missing
- // info is obtained.
- def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>;
-
- def : ItinRW<[P9_LoadAndLoadOp_8C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_SprSLBIE, IIC_SprSLBMFEE, IIC_SprSLBMFEV, IIC_SprSLBMTE,
- IIC_SprTLBIEL]>;
-
- // IIC_VecFP is added here although many instructions with that itinerary
- // use very different resources. It would appear that instructions were
- // given that itinerary rather carelessly over time. Specific instructions
- // that use different resources are listed in various InstrRW classes.
- def : ItinRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_FPGeneral, IIC_FPAddSub, IIC_VecFP]>;
-
- def : ItinRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C], [IIC_VecFPCompare]>;
-
- def : ItinRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
- [IIC_VecPerm]>;
-
- def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
- def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C], [IIC_FPSqrtS]>;
-
- def : ItinRW<[P9_DIV_12C, IP_EXECE_1C, DISP_1C, DISP_1C],
- [IIC_SprMFMSR, IIC_SprMFPMR, IIC_SprMFSR, IIC_SprMFTB,
- IIC_SprMTMSR, IIC_SprMTMSRD, IIC_SprMTPMR, IIC_SprMTSR]>;
-
- def : ItinRW<[], [IIC_SprSTOP]>;
-
+ // Include the resource requirements of individual instructions.
include "P9InstrResources.td"
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index ccf0f80c336b..c0cbfd779cb9 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -65,6 +65,7 @@ void PPCSubtarget::initializeEnvironment() {
HasHardFloat = false;
HasAltivec = false;
HasSPE = false;
+ HasFPU = false;
HasQPX = false;
HasVSX = false;
HasP8Vector = false;
@@ -106,6 +107,7 @@ void PPCSubtarget::initializeEnvironment() {
HasFloat128 = false;
IsISA3_0 = false;
UseLongCalls = false;
+ SecurePlt = false;
HasPOPCNTD = POPCNTD_Unavailable;
}
@@ -136,6 +138,16 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isDarwin())
HasLazyResolverStubs = true;
+ if (HasSPE && IsPPC64)
+ report_fatal_error( "SPE is only supported for 32-bit targets.\n", false);
+ if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU))
+ report_fatal_error(
+ "SPE and traditional floating point cannot both be enabled.\n", false);
+
+ // If not SPE, set standard FPU
+ if (!HasSPE)
+ HasFPU = true;
+
// QPX requires a 32-byte aligned stack. Note that we need to do this if
// we're compiling for a BG/Q system regardless of whether or not QPX
// is enabled because external functions will assume this alignment.
@@ -163,27 +175,8 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
return false;
}
-// Embedded cores need aggressive scheduling (and some others also benefit).
-static bool needsAggressiveScheduling(unsigned Directive) {
- switch (Directive) {
- default: return false;
- case PPC::DIR_440:
- case PPC::DIR_A2:
- case PPC::DIR_E500mc:
- case PPC::DIR_E5500:
- case PPC::DIR_PWR7:
- case PPC::DIR_PWR8:
- // FIXME: Same as P8 until POWER9 scheduling info is available
- case PPC::DIR_PWR9:
- return true;
- }
-}
-
bool PPCSubtarget::enableMachineScheduler() const {
- // Enable MI scheduling for the embedded cores.
- // FIXME: Enable this for all cores (some additional modeling
- // may be necessary).
- return needsAggressiveScheduling(DarwinDirective);
+ return true;
}
// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
@@ -201,19 +194,19 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
- if (needsAggressiveScheduling(DarwinDirective)) {
- Policy.OnlyTopDown = false;
- Policy.OnlyBottomUp = false;
- }
-
+ // The GenericScheduler that we use defaults to scheduling bottom up only.
+ // We want bi-directional scheduling, since it produces a more balanced
+ // schedule and better performance, so we set OnlyBottomUp to false.
+ Policy.OnlyBottomUp = false;
// Spilling is generally expensive on all PPC cores, so always enable
// register-pressure tracking.
Policy.ShouldTrackPressure = true;
}
bool PPCSubtarget::useAA() const {
- // Use AA during code generation for the embedded cores.
- return needsAggressiveScheduling(DarwinDirective);
+ return true;
}
bool PPCSubtarget::enableSubRegLiveness() const {
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
index c351b5c04a05..c56f254d6bec 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -46,6 +46,7 @@ namespace PPC {
DIR_750,
DIR_970,
DIR_A2,
+ DIR_E500,
DIR_E500mc,
DIR_E5500,
DIR_PWR3,
@@ -94,6 +95,7 @@ protected:
bool HasHardFloat;
bool IsPPC64;
bool HasAltivec;
+ bool HasFPU;
bool HasSPE;
bool HasQPX;
bool HasVSX;
@@ -133,6 +135,7 @@ protected:
bool HasFloat128;
bool IsISA3_0;
bool UseLongCalls;
+ bool SecurePlt;
POPCNTDKind HasPOPCNTD;
@@ -238,6 +241,7 @@ public:
bool hasFPCVT() const { return HasFPCVT; }
bool hasAltivec() const { return HasAltivec; }
bool hasSPE() const { return HasSPE; }
+ bool hasFPU() const { return HasFPU; }
bool hasQPX() const { return HasQPX; }
bool hasVSX() const { return HasVSX; }
bool hasP8Vector() const { return HasP8Vector; }
@@ -255,6 +259,7 @@ public:
bool hasOnlyMSYNC() const { return HasOnlyMSYNC; }
bool isPPC4xx() const { return IsPPC4xx; }
bool isPPC6xx() const { return IsPPC6xx; }
+ bool isSecurePlt() const { return SecurePlt; }
bool isE500() const { return IsE500; }
bool isFeatureMFTB() const { return FeatureMFTB; }
bool isDeprecatedDST() const { return DeprecatedDST; }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 49f2699ab082..ac36abbe8439 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -77,7 +77,7 @@ protected:
continue;
}
- DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI);
+ LLVM_DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI);
unsigned OutReg = MI.getOperand(0).getReg();
unsigned InReg = MI.getOperand(1).getReg();
@@ -108,7 +108,7 @@ protected:
}
// We create ADJCALLSTACKUP and ADJCALLSTACKDOWN around _tls_get_addr
- // as schduling fence to avoid it is scheduled before
+ // as a scheduling fence to avoid it being scheduled before
// mflr in the prologue and the address in LR is clobbered (PR25839).
// We don't really need to save data to the stack - the clobbered
// registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 20a83c973026..a8d7955ef548 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -23,8 +23,8 @@
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -32,6 +32,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include <cassert>
@@ -303,7 +304,12 @@ namespace {
class PPCPassConfig : public TargetPassConfig {
public:
PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ // At any optimization level above -O0 we use the Machine Scheduler and not
+ // the default Post RA List Scheduler.
+ if (TM.getOptLevel() != CodeGenOpt::None)
+ substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+ }
PPCTargetMachine &getPPCTargetMachine() const {
return getTM<PPCTargetMachine>();
@@ -343,7 +349,7 @@ void PPCPassConfig::addIRPasses() {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or
// multiple GEPs with single index.
- addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+ addPass(createSeparateConstOffsetFromGEPPass(true));
// Call EarlyCSE pass to find and remove subexpressions in the lowered
// result.
addPass(createEarlyCSEPass());
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
index 8343a90696d9..417b8ed0d612 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -25,7 +25,7 @@ namespace llvm {
MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
const TargetMachine &TM) const override;
- /// \brief Describe a TLS variable address within debug info.
+ /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index aa4073f7ea02..226c75f704f4 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -27,6 +27,11 @@ static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
cl::desc("The loop prefetch cache line size"));
+static cl::opt<bool>
+EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
+ cl::desc("Enable using coldcc calling conv for cold "
+ "internal functions"));
+
//===----------------------------------------------------------------------===//
//
// PPC cost model.
@@ -215,6 +220,14 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getUnrollingPreferences(L, SE, UP);
}
+// This function returns true to allow using the coldcc calling convention.
+// Returning true results in coldcc being used for functions that are cold at
+// all call sites and whose callers do not call any other non-coldcc functions.
+bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
+ return EnablePPCColdCC;
+}
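// Usage note (illustrative): EnablePPCColdCC defaults to false, so the coldcc
// promotion is opt-in. It would typically be enabled through the cl::opt flag
// defined above, e.g. "llc -ppc-enable-coldcc foo.ll" (or via -mllvm when the
// backend is driven through clang).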
+
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
// On the A2, always unroll aggressively. For QPX unaligned loads, we depend
// on combining the loads generated for consecutive accesses, and failure to
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index b42dae4a0254..2ee2b3eb8084 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -61,7 +61,7 @@ public:
/// \name Vector TTI Implementations
/// @{
-
+ bool useColdCCForColdCall(Function &F);
bool enableAggressiveInterleaving(bool LoopHasReductions);
const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index f15af790de8f..6586f503a7b8 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -241,7 +241,7 @@ protected:
assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
"Addend copy not tied to old FMA output!");
- DEBUG(dbgs() << "VSX FMA Mutation:\n " << MI);
+ LLVM_DEBUG(dbgs() << "VSX FMA Mutation:\n " << MI);
MI.getOperand(0).setReg(KilledProdReg);
MI.getOperand(1).setReg(KilledProdReg);
@@ -273,7 +273,7 @@ protected:
MI.getOperand(2).setIsUndef(OtherProdRegUndef);
}
- DEBUG(dbgs() << " -> " << MI);
+ LLVM_DEBUG(dbgs() << " -> " << MI);
// The killed product operand was killed here, so we can reuse it now
// for the result of the fma.
@@ -310,7 +310,7 @@ protected:
NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
NewFMAValNo));
}
- DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
+ LLVM_DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
// Extend the live interval of the addend source (it might end at the
// copy to be removed, or somewhere in between there and here). This
@@ -323,15 +323,15 @@ protected:
LiveRange &AddendSrcRange = LIS->getRegUnit(Unit);
AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB),
FMAIdx.getRegSlot());
- DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n');
+ LLVM_DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n');
}
FMAInt.removeValNo(FMAValNo);
- DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
+ LLVM_DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
// Remove the (now unused) copy.
- DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
+ LLVM_DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
LIS->RemoveMachineInstrFromMaps(*AddendMI);
AddendMI->eraseFromParent();
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 8a5fb9fdaef1..1e8a1750ec3b 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -51,6 +51,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -248,7 +249,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
bool RelevantInstr = false;
@@ -519,14 +520,16 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
// permute control vectors (for shift values 1, 2, 3). However,
// VPERM has a more restrictive register class.
case PPC::XXSLDWI:
+ case PPC::XSCVDPSPN:
+ case PPC::XSCVSPDPN:
break;
}
}
}
if (RelevantFunction) {
- DEBUG(dbgs() << "Swap vector when first built\n\n");
- DEBUG(dumpSwapVector());
+ LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
+ LLVM_DEBUG(dumpSwapVector());
}
return RelevantFunction;
@@ -585,14 +588,14 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
// as such so their containing webs will not be optimized.
void PPCVSXSwapRemoval::formWebs() {
- DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
+ LLVM_DEBUG(MI->dump());
// It's sufficient to walk vector uses and join them to their unique
// definitions. In addition, check full vector register operands
@@ -622,10 +625,11 @@ void PPCVSXSwapRemoval::formWebs() {
(void)EC->unionSets(SwapVector[DefIdx].VSEId,
SwapVector[EntryIdx].VSEId);
- DEBUG(dbgs() << format("Unioning %d with %d\n", SwapVector[DefIdx].VSEId,
- SwapVector[EntryIdx].VSEId));
- DEBUG(dbgs() << " Def: ");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
+ SwapVector[DefIdx].VSEId,
+ SwapVector[EntryIdx].VSEId));
+ LLVM_DEBUG(dbgs() << " Def: ");
+ LLVM_DEBUG(DefMI->dump());
}
}
}
@@ -636,7 +640,7 @@ void PPCVSXSwapRemoval::formWebs() {
// as rejected.
void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
- DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
@@ -654,12 +658,13 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for physreg, partial reg, or not "
- "swap[pable]\n", Repr));
- DEBUG(dbgs() << " in " << EntryIdx << ": ");
- DEBUG(SwapVector[EntryIdx].VSEMI->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(
+ dbgs() << format("Web %d rejected for physreg, partial reg, or not "
+ "swap[pable]\n",
+ Repr));
+ LLVM_DEBUG(dbgs() << " in " << EntryIdx << ": ");
+ LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
// Reject webs than contain swapping loads that feed something other
@@ -680,13 +685,13 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for load not feeding swap\n", Repr));
- DEBUG(dbgs() << " def " << EntryIdx << ": ");
- DEBUG(MI->dump());
- DEBUG(dbgs() << " use " << UseIdx << ": ");
- DEBUG(UseMI.dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << format(
+ "Web %d rejected for load not feeding swap\n", Repr));
+ LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
+ LLVM_DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
+ LLVM_DEBUG(UseMI.dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
}
@@ -704,13 +709,13 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for store not fed by swap\n", Repr));
- DEBUG(dbgs() << " def " << DefIdx << ": ");
- DEBUG(DefMI->dump());
- DEBUG(dbgs() << " use " << EntryIdx << ": ");
- DEBUG(MI->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << format(
+ "Web %d rejected for store not fed by swap\n", Repr));
+ LLVM_DEBUG(dbgs() << " def " << DefIdx << ": ");
+ LLVM_DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << " use " << EntryIdx << ": ");
+ LLVM_DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
// Ensure all uses of the register defined by DefMI feed store
@@ -721,21 +726,22 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
SwapVector[Repr].WebRejected = 1;
- DEBUG(dbgs() <<
- format("Web %d rejected for swap not feeding only stores\n",
- Repr));
- DEBUG(dbgs() << " def " << " : ");
- DEBUG(DefMI->dump());
- DEBUG(dbgs() << " use " << UseIdx << ": ");
- DEBUG(SwapVector[UseIdx].VSEMI->dump());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(
+ dbgs() << format(
+ "Web %d rejected for swap not feeding only stores\n", Repr));
+ LLVM_DEBUG(dbgs() << " def "
+ << " : ");
+ LLVM_DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
+ LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
+ LLVM_DEBUG(dbgs() << "\n");
}
}
}
}
- DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
- DEBUG(dumpSwapVector());
+ LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
+ LLVM_DEBUG(dumpSwapVector());
}
// Walk the swap vector entries looking for swaps fed by permuting loads
@@ -745,7 +751,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
// such that multiple loads feed the same swap, etc.)
void PPCVSXSwapRemoval::markSwapsForRemoval() {
- DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
@@ -760,8 +766,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
int UseIdx = SwapMap[&UseMI];
SwapVector[UseIdx].WillRemove = 1;
- DEBUG(dbgs() << "Marking swap fed by load for removal: ");
- DEBUG(UseMI.dump());
+ LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
+ LLVM_DEBUG(UseMI.dump());
}
}
@@ -775,8 +781,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
int DefIdx = SwapMap[DefMI];
SwapVector[DefIdx].WillRemove = 1;
- DEBUG(dbgs() << "Marking swap feeding store for removal: ");
- DEBUG(DefMI->dump());
+ LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
+ LLVM_DEBUG(DefMI->dump());
}
} else if (SwapVector[EntryIdx].IsSwappable &&
@@ -821,8 +827,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
unsigned NElts;
- DEBUG(dbgs() << "Changing splat: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "Changing splat: ");
+ LLVM_DEBUG(MI->dump());
switch (MI->getOpcode()) {
default:
@@ -845,8 +851,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
else
MI->getOperand(1).setImm(EltNo);
- DEBUG(dbgs() << " Into: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << " Into: ");
+ LLVM_DEBUG(MI->dump());
break;
}
@@ -859,8 +865,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
case SHValues::SH_XXPERMDI: {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- DEBUG(dbgs() << "Changing XXPERMDI: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
+ LLVM_DEBUG(MI->dump());
unsigned Selector = MI->getOperand(3).getImm();
if (Selector == 0 || Selector == 3)
@@ -872,8 +878,14 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
MI->getOperand(1).setReg(Reg2);
MI->getOperand(2).setReg(Reg1);
- DEBUG(dbgs() << " Into: ");
- DEBUG(MI->dump());
+ // We also need to swap kill flag associated with the register.
+ bool IsKill1 = MI->getOperand(1).isKill();
+ bool IsKill2 = MI->getOperand(2).isKill();
+ MI->getOperand(1).setIsKill(IsKill2);
+ MI->getOperand(2).setIsKill(IsKill1);
+
+ LLVM_DEBUG(dbgs() << " Into: ");
+ LLVM_DEBUG(MI->dump());
break;
}
@@ -883,16 +895,16 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
case SHValues::SH_COPYWIDEN: {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
+ LLVM_DEBUG(MI->dump());
unsigned DstReg = MI->getOperand(0).getReg();
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
MI->getOperand(0).setReg(NewVReg);
- DEBUG(dbgs() << " Into: ");
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << " Into: ");
+ LLVM_DEBUG(MI->dump());
auto InsertPoint = ++MachineBasicBlock::iterator(MI);
@@ -908,19 +920,19 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::COPY), VSRCTmp1)
.addReg(NewVReg);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::COPY), DstReg)
.addReg(VSRCTmp2);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
} else {
insertSwap(MI, InsertPoint, DstReg, NewVReg);
- DEBUG(std::prev(InsertPoint)->dump());
+ LLVM_DEBUG(std::prev(InsertPoint)->dump());
}
break;
}
@@ -931,7 +943,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
// a copy operation.
bool PPCVSXSwapRemoval::removeSwaps() {
- DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
+ LLVM_DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
bool Changed = false;
@@ -944,9 +956,9 @@ bool PPCVSXSwapRemoval::removeSwaps() {
MI->getOperand(0).getReg())
.add(MI->getOperand(1));
- DEBUG(dbgs() << format("Replaced %d with copy: ",
- SwapVector[EntryIdx].VSEId));
- DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
+ SwapVector[EntryIdx].VSEId));
+ LLVM_DEBUG(MI->dump());
MI->eraseFromParent();
}
diff --git a/contrib/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/contrib/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 3299a53ff5ba..9a455c105482 100644
--- a/contrib/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/contrib/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -10,11 +10,13 @@
#include "MCTargetDesc/RISCVBaseInfo.h"
#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "MCTargetDesc/RISCVTargetStreamer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -22,10 +24,17 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include <limits>
+
using namespace llvm;
+// Include the auto-generated portion of the compress emitter.
+#define GEN_COMPRESS_INSTR
+#include "RISCVGenCompressInstEmitter.inc"
+
namespace {
struct RISCVOperand;
@@ -33,11 +42,16 @@ class RISCVAsmParser : public MCTargetAsmParser {
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
+ RISCVTargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<RISCVTargetStreamer &>(TS);
+ }
+
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo,
- int Lower, int Upper, Twine Msg);
+ int64_t Lower, int64_t Upper, Twine Msg);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
@@ -51,6 +65,20 @@ class RISCVAsmParser : public MCTargetAsmParser {
bool ParseDirective(AsmToken DirectiveID) override;
+ // Helper to actually emit an instruction to the MCStreamer. Also, when
+ // possible, compression of the instruction is performed.
+ void emitToStreamer(MCStreamer &S, const MCInst &Inst);
+
+ // Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that
+ // synthesizes the desired immediate value into the destination register.
+ void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out);
+
+ /// Helper for processing MC instructions that have been successfully matched
+ /// by MatchAndEmitInstruction. Modifications to the emitted instructions,
+ /// like the expansion of pseudo instructions (e.g., "li"), can be performed
+ /// in this method.
+ bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+
// Auto-generated instruction matching functions
#define GET_ASSEMBLER_HEADER
#include "RISCVGenAsmMatcher.inc"
@@ -61,8 +89,25 @@ class RISCVAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands);
OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands);
- bool parseOperand(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands, bool ForceImmediate);
+
+ bool parseDirectiveOption();
+
+ void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ if (!(getSTI().getFeatureBits()[Feature])) {
+ MCSubtargetInfo &STI = copySTI();
+ setAvailableFeatures(
+ ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+ }
+ }
+ void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
+ if (getSTI().getFeatureBits()[Feature]) {
+ MCSubtargetInfo &STI = copySTI();
+ setAvailableFeatures(
+ ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+ }
+ }
public:
enum RISCVMatchResultTy {
Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
@@ -78,6 +123,10 @@ public:
RISCVAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI, MII) {
+ Parser.addAliasForDirective(".half", ".2byte");
+ Parser.addAliasForDirective(".hword", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+ Parser.addAliasForDirective(".dword", ".8byte");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
};
@@ -167,6 +216,16 @@ public:
// Predicate methods for AsmOperands defined in RISCVInstrInfo.td
+ bool isBareSymbol() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK;
+ // Must be of 'immediate' type but not a constant.
+ if (!isImm() || evaluateConstantImm(Imm, VK))
+ return false;
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
/// Return true if the operand is valid for the fence instruction e.g.
/// ('iorw').
bool isFenceArg() const {
@@ -206,6 +265,18 @@ public:
return RISCVFPRndMode::stringToRoundingMode(Str) != RISCVFPRndMode::Invalid;
}
+ bool isImmXLen() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK;
+ if (!isImm())
+ return false;
+ bool IsConstantImm = evaluateConstantImm(Imm, VK);
+ // Given only Imm, ensuring that the actually specified constant is either
+ // a signed or unsigned 64-bit number is unfortunately impossible.
+ bool IsInRange = isRV64() ? true : isInt<32>(Imm) || isUInt<32>(Imm);
+ return IsConstantImm && IsInRange && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
bool isUImmLog2XLen() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK;
@@ -260,12 +331,26 @@ public:
(VK == RISCVMCExpr::VK_RISCV_None || VK == RISCVMCExpr::VK_RISCV_LO);
}
- bool isUImm6NonZero() const {
+ bool isSImm6NonZero() const {
+ RISCVMCExpr::VariantKind VK;
+ int64_t Imm;
+ bool IsValid;
+ bool IsConstantImm = evaluateConstantImm(Imm, VK);
+ if (!IsConstantImm)
+ IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
+ else
+ IsValid = ((Imm != 0) && isInt<6>(Imm));
+ return IsValid &&
+ (VK == RISCVMCExpr::VK_RISCV_None || VK == RISCVMCExpr::VK_RISCV_LO);
+ }
+
+ bool isCLUIImm() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK;
bool IsConstantImm = evaluateConstantImm(Imm, VK);
- return IsConstantImm && isUInt<6>(Imm) && (Imm != 0) &&
- VK == RISCVMCExpr::VK_RISCV_None;
+ return IsConstantImm && (Imm != 0) &&
+ (isUInt<5>(Imm) || (Imm >= 0xfffe0 && Imm <= 0xfffff)) &&
+ VK == RISCVMCExpr::VK_RISCV_None;
}
bool isUImm7Lsb00() const {
@@ -321,8 +406,9 @@ public:
IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
else
IsValid = isInt<12>(Imm);
- return IsValid &&
- (VK == RISCVMCExpr::VK_RISCV_None || VK == RISCVMCExpr::VK_RISCV_LO);
+ return IsValid && (VK == RISCVMCExpr::VK_RISCV_None ||
+ VK == RISCVMCExpr::VK_RISCV_LO ||
+ VK == RISCVMCExpr::VK_RISCV_PCREL_LO);
}
bool isSImm12Lsb0() const { return isBareSimmNLsb0<12>(); }
@@ -338,11 +424,11 @@ public:
bool isSImm13Lsb0() const { return isBareSimmNLsb0<13>(); }
- bool isSImm10Lsb0000() const {
+ bool isSImm10Lsb0000NonZero() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK;
bool IsConstantImm = evaluateConstantImm(Imm, VK);
- return IsConstantImm && isShiftedInt<6, 4>(Imm) &&
+ return IsConstantImm && (Imm != 0) && isShiftedInt<6, 4>(Imm) &&
VK == RISCVMCExpr::VK_RISCV_None;
}
@@ -564,7 +650,7 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
}
bool RISCVAsmParser::generateImmOutOfRangeError(
- OperandVector &Operands, uint64_t ErrorInfo, int Lower, int Upper,
+ OperandVector &Operands, uint64_t ErrorInfo, int64_t Lower, int64_t Upper,
Twine Msg = "immediate must be an integer in the range") {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]");
@@ -581,9 +667,7 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default:
break;
case Match_Success:
- Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
- return false;
+ return processInstruction(Inst, IDLoc, Out);
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
case Match_MnemonicFail:
@@ -600,6 +684,14 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
return Error(ErrorLoc, "invalid operand for instruction");
}
+ case Match_InvalidImmXLen:
+ if (isRV64()) {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a constant 64-bit integer");
+ }
+ return generateImmOutOfRangeError(Operands, ErrorInfo,
+ std::numeric_limits<int32_t>::min(),
+ std::numeric_limits<uint32_t>::max());
case Match_InvalidUImmLog2XLen:
if (isRV64())
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
@@ -613,8 +705,14 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidSImm6:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 5),
(1 << 5) - 1);
- case Match_InvalidUImm6NonZero:
- return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 6) - 1);
+ case Match_InvalidSImm6NonZero:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 5),
+ (1 << 5) - 1,
+ "immediate must be non-zero in the range");
+ case Match_InvalidCLUIImm:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 1, (1 << 5) - 1,
+ "immediate must be in [0xfffe0, 0xfffff] or");
case Match_InvalidUImm7Lsb00:
return generateImmOutOfRangeError(
Operands, ErrorInfo, 0, (1 << 7) - 4,
@@ -639,10 +737,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(
Operands, ErrorInfo, 4, (1 << 10) - 4,
"immediate must be a multiple of 4 bytes in the range");
- case Match_InvalidSImm10Lsb0000:
+ case Match_InvalidSImm10Lsb0000NonZero:
return generateImmOutOfRangeError(
Operands, ErrorInfo, -(1 << 9), (1 << 9) - 16,
- "immediate must be a multiple of 16 bytes in the range");
+ "immediate must be a multiple of 16 bytes and non-zero in the range");
case Match_InvalidSImm12:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 11),
(1 << 11) - 1);
@@ -674,6 +772,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
ErrorLoc,
"operand must be a valid floating point rounding mode mnemonic");
}
+ case Match_InvalidBareSymbol: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a bare symbol name");
+ }
}
llvm_unreachable("Unknown match type detected!");
@@ -838,12 +940,15 @@ RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) {
return MatchOperand_Success;
}
-/// Looks at a token type and creates the relevant operand
-/// from this information, adding to Operands.
-/// If operand was parsed, returns false, else true.
-bool RISCVAsmParser::parseOperand(OperandVector &Operands) {
- // Attempt to parse token as register
- if (parseRegister(Operands, true) == MatchOperand_Success)
+/// Looks at a token type and creates the relevant operand from this
+/// information, adding to Operands. If operand was parsed, returns false, else
+/// true. If ForceImmediate is true, no attempt will be made to parse the
+/// operand as a register, which is needed for pseudoinstructions such as
+/// call.
+bool RISCVAsmParser::parseOperand(OperandVector &Operands,
+ bool ForceImmediate) {
+ // Attempt to parse token as register, unless ForceImmediate.
+ if (!ForceImmediate && parseRegister(Operands, true) == MatchOperand_Success)
return false;
// Attempt to parse token as an immediate
@@ -870,7 +975,8 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
return false;
// Parse first operand
- if (parseOperand(Operands))
+ bool ForceImmediate = (Name == "call" || Name == "tail");
+ if (parseOperand(Operands, ForceImmediate))
return true;
// Parse until end of statement, consuming commas between operands
@@ -879,7 +985,7 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
getLexer().Lex();
// Parse next operand
- if (parseOperand(Operands))
+ if (parseOperand(Operands, false))
return true;
}
@@ -924,7 +1030,7 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
isa<MCSymbolRefExpr>(BE->getRHS()))
return true;
- // See if the addend is is a constant, otherwise there's more going
+ // See if the addend is a constant, otherwise there's more going
// on here than we can deal with.
auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
if (!AddendExpr)
@@ -938,7 +1044,165 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
return Kind != RISCVMCExpr::VK_RISCV_Invalid;
}
-bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) { return true; }
+bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) {
+ // This returns false if this function recognizes the directive
+  // regardless of whether it is successfully handled or an error is
+  // reported. Otherwise it returns true to give the generic parser a
+ // chance at recognizing it.
+ StringRef IDVal = DirectiveID.getString();
+
+ if (IDVal == ".option")
+ return parseDirectiveOption();
+
+ return true;
+}
+
+bool RISCVAsmParser::parseDirectiveOption() {
+ MCAsmParser &Parser = getParser();
+ // Get the option token.
+ AsmToken Tok = Parser.getTok();
+ // At the moment only identifiers are supported.
+ if (Tok.isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected identifier");
+
+ StringRef Option = Tok.getIdentifier();
+
+ if (Option == "rvc") {
+ getTargetStreamer().emitDirectiveOptionRVC();
+
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected end of statement");
+
+ setFeatureBits(RISCV::FeatureStdExtC, "c");
+ return false;
+ }
+
+ if (Option == "norvc") {
+ getTargetStreamer().emitDirectiveOptionNoRVC();
+
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected end of statement");
+
+ clearFeatureBits(RISCV::FeatureStdExtC, "c");
+ return false;
+ }
+
+ // Unknown option.
+ Warning(Parser.getTok().getLoc(),
+ "unknown option, expected 'rvc' or 'norvc'");
+ Parser.eatToEndOfStatement();
+ return false;
+}
+
+void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
+ MCInst CInst;
+ bool Res = compressInst(CInst, Inst, getSTI(), S.getContext());
+ CInst.setLoc(Inst.getLoc());
+ S.EmitInstruction((Res ? CInst : Inst), getSTI());
+}
+
+void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value,
+ MCStreamer &Out) {
+ if (isInt<32>(Value)) {
+ // Emits the MC instructions for loading a 32-bit constant into a register.
+ //
+ // Depending on the active bits in the immediate Value v, the following
+ // instruction sequences are emitted:
+ //
+ // v == 0 : ADDI(W)
+ // v[0,12) != 0 && v[12,32) == 0 : ADDI(W)
+ // v[0,12) == 0 && v[12,32) != 0 : LUI
+ // v[0,32) != 0 : LUI+ADDI(W)
+ //
+ int64_t Hi20 = ((Value + 0x800) >> 12) & 0xFFFFF;
+ int64_t Lo12 = SignExtend64<12>(Value);
+ unsigned SrcReg = RISCV::X0;
+
+ if (Hi20) {
+ emitToStreamer(Out,
+ MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Hi20));
+ SrcReg = DestReg;
+ }
+
+ if (Lo12 || Hi20 == 0) {
+ unsigned AddiOpcode =
+ STI->hasFeature(RISCV::Feature64Bit) ? RISCV::ADDIW : RISCV::ADDI;
+ emitToStreamer(Out, MCInstBuilder(AddiOpcode)
+ .addReg(DestReg)
+ .addReg(SrcReg)
+ .addImm(Lo12));
+ }
+ return;
+ }
+ assert(STI->hasFeature(RISCV::Feature64Bit) &&
+ "Target must be 64-bit to support a >32-bit constant");
+
+ // In the worst case, for a full 64-bit constant, a sequence of 8 instructions
+  // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note
+ // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits
+ // while the following ADDI instructions contribute up to 12 bits each.
+ //
+  // At first glance, implementing this seems possible by simply
+ // emitting the most significant 32 bits (LUI+ADDIW) followed by as many left
+  // shift (SLLI) and immediate additions (ADDI) as needed. However, because
+  // ADDI performs a sign-extended addition, doing it like that would only be
+  // possible when at most 11 bits of the ADDI instructions are used. Using
+  // all 12 bits of the ADDI instructions, as GAS does, actually
+ // requires that the constant is processed starting with the least significant
+ // bit.
+ //
+ // In the following, constants are processed from LSB to MSB but instruction
+ // emission is performed from MSB to LSB by recursively calling
+ // emitLoadImm. In each recursion, first the lowest 12 bits are removed
+ // from the constant and the optimal shift amount, which can be greater than
+ // 12 bits if the constant is sparse, is determined. Then, the shifted
+ // remaining constant is processed recursively and gets emitted as soon as it
+ // fits into 32 bits. The emission of the shifts and additions is subsequently
+ // performed when the recursion returns.
+ //
+ int64_t Lo12 = SignExtend64<12>(Value);
+ int64_t Hi52 = (Value + 0x800) >> 12;
+ int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52);
+ Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount);
+
+ emitLoadImm(DestReg, Hi52, Out);
+
+ emitToStreamer(Out, MCInstBuilder(RISCV::SLLI)
+ .addReg(DestReg)
+ .addReg(DestReg)
+ .addImm(ShiftAmount));
+
+ if (Lo12)
+ emitToStreamer(Out, MCInstBuilder(RISCV::ADDI)
+ .addReg(DestReg)
+ .addReg(DestReg)
+ .addImm(Lo12));
+}
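As an illustrative aside (not part of the patch), here is how the recursion above would decompose an assumed 34-bit constant, 0x123456789; the small check below only verifies the arithmetic of the resulting sequence:

    // Sketch of the sequence emitLoadImm would produce for 0x123456789:
    //   lui   rd, 0x92        ; via the recursive 32-bit case
    //   addiw rd, rd, -1493   ; rd = 0x91a2b
    //   slli  rd, rd, 13      ; rd = 0x123456000
    //   addi  rd, rd, 0x789   ; rd = 0x123456789
    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t rd = 0x92 << 12;   // lui
      rd += -1493;               // addiw (no 32-bit truncation needed here)
      rd <<= 13;                 // slli
      rd += 0x789;               // addi
      assert(rd == 0x123456789);
      return 0;
    }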
+
+bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out) {
+ Inst.setLoc(IDLoc);
+
+ if (Inst.getOpcode() == RISCV::PseudoLI) {
+ auto Reg = Inst.getOperand(0).getReg();
+ int64_t Imm = Inst.getOperand(1).getImm();
+ // On RV32 the immediate here can either be a signed or an unsigned
+ // 32-bit number. Sign extension has to be performed to ensure that Imm
+ // represents the expected signed 64-bit number.
+ if (!isRV64())
+ Imm = SignExtend64<32>(Imm);
+ emitLoadImm(Reg, Imm, Out);
+ return false;
+ }
+
+ emitToStreamer(Out, Inst);
+ return false;
+}
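As a small illustration (not part of the patch), on RV32 the parser may hand PseudoLI the unsigned form of a negative constant; sign-extending it first lets emitLoadImm pick the short form. For an assumed "li a0, 0xffffffff":

    // Sketch: 0xffffffff sign-extends to -1, so a single addi suffices.
    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t Imm = 0xffffffff;              // hypothetical 32-bit operand on RV32
      Imm = (int64_t)(int32_t)(uint32_t)Imm; // what SignExtend64<32> computes
      assert(Imm == -1);                     // emitted as: addi a0, x0, -1
      return 0;
    }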
extern "C" void LLVMInitializeRISCVAsmParser() {
RegisterMCAsmParser<RISCVAsmParser> X(getTheRISCV32Target());
diff --git a/contrib/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/contrib/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 563edc9e29d8..7bbb371a757f 100644
--- a/contrib/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/contrib/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -232,6 +232,17 @@ static DecodeStatus decodeSImmOperandAndLsl1(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeCLUIImmOperand(MCInst &Inst, uint64_t Imm,
+ int64_t Address,
+ const void *Decoder) {
+ assert(isUInt<6>(Imm) && "Invalid immediate");
+ if (Imm > 31) {
+ Imm = (SignExtend64<6>(Imm) & 0xfffff);
+ }
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
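A brief illustrative aside (not part of the patch): encoded c.lui fields 1-31 decode to themselves, while fields 32-63 wrap to the top of the 20-bit immediate space, matching the [0xfffe0, 0xfffff] range quoted by the assembler diagnostic. A minimal check for an assumed field value of 32:

    // Sketch of the c.lui immediate decode for the 6-bit field value 32.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Imm = 32;            // hypothetical encoded field, > 31
      // Sign-extend the 6-bit field (as SignExtend64<6> would) and keep 20 bits.
      Imm = (Imm - 64) & 0xfffff;   // 32 - 64 = -32 -> 0xfffe0
      assert(Imm == 0xfffe0);
      return 0;
    }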
+
#include "RISCVGenDisassemblerTables.inc"
DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
@@ -247,14 +258,15 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// It's a 32 bit instruction if bit 0 and 1 are 1.
if ((Bytes[0] & 0x3) == 0x3) {
Insn = support::endian::read32le(Bytes.data());
- DEBUG(dbgs() << "Trying RISCV32 table :\n");
+ LLVM_DEBUG(dbgs() << "Trying RISCV32 table :\n");
Result = decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
Size = 4;
} else {
Insn = support::endian::read16le(Bytes.data());
if (!STI.getFeatureBits()[RISCV::Feature64Bit]) {
- DEBUG(dbgs() << "Trying RISCV32Only_16 table (16-bit Instruction):\n");
+ LLVM_DEBUG(
+ dbgs() << "Trying RISCV32Only_16 table (16-bit Instruction):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableRISCV32Only_16, MI, Insn, Address,
this, STI);
@@ -264,7 +276,7 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- DEBUG(dbgs() << "Trying RISCV_C table (16-bit Instruction):\n");
+ LLVM_DEBUG(dbgs() << "Trying RISCV_C table (16-bit Instruction):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTable16, MI, Insn, Address, this, STI);
Size = 2;
diff --git a/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp b/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
index ff56fc5d90ff..300e6fd9750a 100644
--- a/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
@@ -13,10 +13,12 @@
#include "RISCVInstPrinter.h"
#include "MCTargetDesc/RISCVBaseInfo.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -29,6 +31,10 @@ using namespace llvm;
#define PRINT_ALIAS_INSTR
#include "RISCVGenAsmWriter.inc"
+// Include the auto-generated portion of the compress emitter.
+#define GEN_UNCOMPRESS_INSTR
+#include "RISCVGenCompressInstEmitter.inc"
+
static cl::opt<bool>
NoAliases("riscv-no-aliases",
cl::desc("Disable the emission of assembler pseudo instructions"),
@@ -37,8 +43,15 @@ NoAliases("riscv-no-aliases",
void RISCVInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot, const MCSubtargetInfo &STI) {
- if (NoAliases || !printAliasInstr(MI, O))
- printInstruction(MI, O);
+ bool Res = false;
+ const MCInst *NewMI = MI;
+ MCInst UncompressedMI;
+ if (!NoAliases)
+ Res = uncompressInst(UncompressedMI, *MI, MRI, STI);
+ if (Res)
+ NewMI = const_cast<MCInst*>(&UncompressedMI);
+ if (NoAliases || !printAliasInstr(NewMI, STI, O))
+ printInstruction(NewMI, STI, O);
printAnnotation(O, Annot);
}
@@ -47,6 +60,7 @@ void RISCVInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
}
void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O, const char *Modifier) {
assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
const MCOperand &MO = MI->getOperand(OpNo);
@@ -66,6 +80,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned FenceArg = MI->getOperand(OpNo).getImm();
if ((FenceArg & RISCVFenceField::I) != 0)
@@ -79,6 +94,7 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
}
void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
auto FRMArg =
static_cast<RISCVFPRndMode::RoundingMode>(MI->getOperand(OpNo).getImm());
diff --git a/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h b/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
index 58f3f8410159..241be8daf113 100644
--- a/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
+++ b/contrib/llvm/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
@@ -30,16 +30,21 @@ public:
const MCSubtargetInfo &STI) override;
void printRegName(raw_ostream &O, unsigned RegNo) const override;
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
- const char *Modifier = nullptr);
- void printFenceArg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printFRMArg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier = nullptr);
+ void printFenceArg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
// Autogenerated by tblgen.
- void printInstruction(const MCInst *MI, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = RISCV::ABIRegAltName);
};
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 6e06a4975e2a..9ba7ebd0eb0f 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -27,46 +27,74 @@ using namespace llvm;
namespace {
class RISCVAsmBackend : public MCAsmBackend {
+ const MCSubtargetInfo &STI;
uint8_t OSABI;
bool Is64Bit;
public:
- RISCVAsmBackend(uint8_t OSABI, bool Is64Bit)
- : MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {}
+ RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit)
+ : MCAsmBackend(support::little), STI(STI), OSABI(OSABI),
+ Is64Bit(Is64Bit) {}
~RISCVAsmBackend() override {}
+  // Generate diff expression relocations if the relax feature is enabled;
+  // otherwise it is safe for the assembler to calculate these internally.
+ bool requiresDiffExpressionRelocations() const override {
+ return STI.getFeatureBits()[RISCV::FeatureRelax];
+ }
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ // If linker relaxation is enabled, always emit relocations even if the fixup
+ // can be resolved. This is necessary for correctness as offsets may change
+ // during relaxation.
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
+ return STI.getFeatureBits()[RISCV::FeatureRelax];
+ }
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
- return false;
+ llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
}
+ bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout,
+ const bool WasForced) const override;
+
unsigned getNumFixupKinds() const override {
return RISCV::NumTargetFixupKinds;
}
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
- const static MCFixupKindInfo Infos[RISCV::NumTargetFixupKinds] = {
+ const static MCFixupKindInfo Infos[] = {
// This table *must* be in the order that the fixup_* kinds are defined in
// RISCVFixupKinds.h.
//
- // name offset bits flags
- { "fixup_riscv_hi20", 12, 20, 0 },
- { "fixup_riscv_lo12_i", 20, 12, 0 },
- { "fixup_riscv_lo12_s", 0, 32, 0 },
- { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel }
+ // name offset bits flags
+ { "fixup_riscv_hi20", 12, 20, 0 },
+ { "fixup_riscv_lo12_i", 20, 12, 0 },
+ { "fixup_riscv_lo12_s", 0, 32, 0 },
+ { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_pcrel_lo12_i", 20, 12, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_pcrel_lo12_s", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_relax", 0, 0, 0 }
};
+ static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
+ "Not all fixup kinds added to Infos array");
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -76,26 +104,121 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+ unsigned getRelaxedOpcode(unsigned Op) const;
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
+ MCInst &Res) const override;
- report_fatal_error("RISCVAsmBackend::relaxInstruction() unimplemented");
- }
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
-bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- // Once support for the compressed instruction set is added, we will be able
- // to conditionally support 16-bit NOPs
- if ((Count % 4) != 0)
+
+bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+ bool Resolved,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout,
+ const bool WasForced) const {
+ // Return true if the symbol is actually unresolved.
+  // Resolved can always be false when shouldForceRelocation returns true.
+ // We use !WasForced to indicate that the symbol is unresolved and not forced
+ // by shouldForceRelocation.
+ if (!Resolved && !WasForced)
+ return true;
+
+ int64_t Offset = int64_t(Value);
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ return false;
+ case RISCV::fixup_riscv_rvc_branch:
+ // For compressed branch instructions the immediate must be
+ // in the range [-256, 254].
+ return Offset > 254 || Offset < -256;
+ case RISCV::fixup_riscv_rvc_jump:
+ // For compressed jump instructions the immediate must be
+ // in the range [-2048, 2046].
+ return Offset > 2046 || Offset < -2048;
+ }
+}
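As a small illustration (not part of the patch), the range checks above are what trigger relaxation of the compressed branches; for an assumed resolved offset of 300 bytes a c.beqz no longer fits and must become a beq:

    // Sketch of the CB-format range check for an assumed offset of 300.
    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t Offset = 300;                            // hypothetical fixup value
      bool NeedsRelax = Offset > 254 || Offset < -256; // CB range is [-256, 254]
      assert(NeedsRelax);                              // c.beqz -> beq
      return 0;
    }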
+
+void RISCVAsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const {
+  // TODO: Replace this with a call to the auto-generated uncompressInst() function.
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("Opcode not expected!");
+ case RISCV::C_BEQZ:
+ // c.beqz $rs1, $imm -> beq $rs1, X0, $imm.
+ Res.setOpcode(RISCV::BEQ);
+ Res.addOperand(Inst.getOperand(0));
+ Res.addOperand(MCOperand::createReg(RISCV::X0));
+ Res.addOperand(Inst.getOperand(1));
+ break;
+ case RISCV::C_BNEZ:
+ // c.bnez $rs1, $imm -> bne $rs1, X0, $imm.
+ Res.setOpcode(RISCV::BNE);
+ Res.addOperand(Inst.getOperand(0));
+ Res.addOperand(MCOperand::createReg(RISCV::X0));
+ Res.addOperand(Inst.getOperand(1));
+ break;
+ case RISCV::C_J:
+ // c.j $imm -> jal X0, $imm.
+ Res.setOpcode(RISCV::JAL);
+ Res.addOperand(MCOperand::createReg(RISCV::X0));
+ Res.addOperand(Inst.getOperand(0));
+ break;
+ case RISCV::C_JAL:
+ // c.jal $imm -> jal X1, $imm.
+ Res.setOpcode(RISCV::JAL);
+ Res.addOperand(MCOperand::createReg(RISCV::X1));
+ Res.addOperand(Inst.getOperand(0));
+ break;
+ }
+}
+
+// Given a compressed control-flow instruction, this function returns the
+// expanded instruction.
+unsigned RISCVAsmBackend::getRelaxedOpcode(unsigned Op) const {
+ switch (Op) {
+ default:
+ return Op;
+ case RISCV::C_BEQZ:
+ return RISCV::BEQ;
+ case RISCV::C_BNEZ:
+ return RISCV::BNE;
+ case RISCV::C_J:
+ case RISCV::C_JAL: // fall through.
+ return RISCV::JAL;
+ }
+}
+
+bool RISCVAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ return getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode();
+}
+
+bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+ bool HasStdExtC = STI.getFeatureBits()[RISCV::FeatureStdExtC];
+ unsigned MinNopLen = HasStdExtC ? 2 : 4;
+
+ if ((Count % MinNopLen) != 0)
return false;
- // The canonical nop on RISC-V is addi x0, x0, 0
- for (uint64_t i = 0; i < Count; i += 4)
- OW->write32(0x13);
+ // The canonical nop on RISC-V is addi x0, x0, 0.
+ uint64_t Nop32Count = Count / 4;
+ for (uint64_t i = Nop32Count; i != 0; --i)
+ OS.write("\x13\0\0\0", 4);
+
+ // The canonical nop on RVC is c.nop.
+ if (HasStdExtC) {
+ uint64_t Nop16Count = (Count - Nop32Count * 4) / 2;
+ for (uint64_t i = Nop16Count; i != 0; --i)
+ OS.write("\x01\0", 2);
+ }
return true;
}
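As an illustrative aside (not part of the patch), a padding request of 10 bytes with the C extension available is filled with two 4-byte nops followed by one c.nop; a minimal sketch of that split:

    // Sketch of the nop padding split for Count = 10 when RVC is enabled.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Count = 10;                                // hypothetical padding
      uint64_t Nop32Count = Count / 4;                    // 2 x addi x0, x0, 0
      uint64_t Nop16Count = (Count - Nop32Count * 4) / 2; // 1 x c.nop
      assert(Nop32Count == 2 && Nop16Count == 1);
      return 0;
    }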
@@ -112,8 +235,10 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case FK_Data_8:
return Value;
case RISCV::fixup_riscv_lo12_i:
+ case RISCV::fixup_riscv_pcrel_lo12_i:
return Value & 0xfff;
case RISCV::fixup_riscv_lo12_s:
+ case RISCV::fixup_riscv_pcrel_lo12_s:
return (((Value >> 5) & 0x7f) << 25) | ((Value & 0x1f) << 7);
case RISCV::fixup_riscv_hi20:
case RISCV::fixup_riscv_pcrel_hi20:
@@ -154,6 +279,14 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = (Sbit << 31) | (Mid6 << 25) | (Lo4 << 8) | (Hi1 << 7);
return Value;
}
+ case RISCV::fixup_riscv_call: {
+    // JALR adds the sign-extended 12-bit LowerImm to UpperImm, so we need to
+    // add 0x800ULL before extracting the upper bits to reflect the effect of
+    // the sign extension.
+ uint64_t UpperImm = (Value + 0x800ULL) & 0xfffff000ULL;
+ uint64_t LowerImm = Value & 0xfffULL;
+ return UpperImm | ((LowerImm << 20) << 32);
+ }
case RISCV::fixup_riscv_rvc_jump: {
// Need to produce offset[11|4|9:8|10|6|7|3:1|5] from the 11-bit Value.
unsigned Bit11 = (Value >> 11) & 0x1;
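As a brief illustration (not part of the patch), the new fixup_riscv_call case above spreads one value across both words of the auipc+jalr pair: the biased upper immediate lands in the auipc U-type field in the low word, and the raw low 12 bits land in the jalr I-type field in the high word. A worked check for an assumed pc-relative distance of 0x1234:

    // Sketch of the call fixup packing for an assumed Value of 0x1234.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Value = 0x1234;                                // hypothetical offset
      uint64_t UpperImm = (Value + 0x800ULL) & 0xfffff000ULL; // 0x1000 for auipc
      uint64_t LowerImm = Value & 0xfffULL;                   // 0x234 for jalr
      uint64_t Packed = UpperImm | ((LowerImm << 20) << 32);
      // auipc adds 0x1000; jalr adds sign-extended 0x234: 0x1000 + 0x234 = 0x1234.
      assert(Packed == (0x1000ULL | (0x234ULL << 52)));
      return 0;
    }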
@@ -183,20 +316,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-static unsigned getSize(unsigned Kind) {
- switch (Kind) {
- default:
- return 4;
- case RISCV::fixup_riscv_rvc_jump:
- case RISCV::fixup_riscv_rvc_branch:
- return 2;
- }
-}
-
void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
MCContext &Ctx = Asm.getContext();
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
if (!Value)
@@ -208,23 +332,20 @@ void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- unsigned FullSize = getSize(Fixup.getKind());
+ unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
-#ifndef NDEBUG
- unsigned NumBytes = (Info.TargetSize + 7) / 8;
assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
-#endif
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
- for (unsigned i = 0; i != FullSize; ++i) {
+ for (unsigned i = 0; i != NumBytes; ++i) {
Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
-std::unique_ptr<MCObjectWriter>
-RISCVAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createRISCVELFObjectWriter(OS, OSABI, Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+RISCVAsmBackend::createObjectTargetWriter() const {
+ return createRISCVELFObjectWriter(OSABI, Is64Bit);
}
} // end anonymous namespace
@@ -235,5 +356,5 @@ MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T,
const MCTargetOptions &Options) {
const Triple &TT = STI.getTargetTriple();
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
- return new RISCVAsmBackend(OSABI, TT.isArch64Bit());
+ return new RISCVAsmBackend(STI, OSABI, TT.isArch64Bit());
}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index ad53228c104a..9b88614aa693 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -23,6 +23,15 @@ public:
~RISCVELFObjectWriter() override;
+  // Return true if the given relocation must be emitted with a symbol rather
+  // than section plus offset.
+ bool needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const override {
+ // TODO: this is very conservative, update once RISC-V psABI requirements
+ // are clarified.
+ return true;
+ }
+
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
@@ -47,6 +56,22 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_32;
case FK_Data_8:
return ELF::R_RISCV_64;
+ case FK_Data_Add_1:
+ return ELF::R_RISCV_ADD8;
+ case FK_Data_Add_2:
+ return ELF::R_RISCV_ADD16;
+ case FK_Data_Add_4:
+ return ELF::R_RISCV_ADD32;
+ case FK_Data_Add_8:
+ return ELF::R_RISCV_ADD64;
+ case FK_Data_Sub_1:
+ return ELF::R_RISCV_SUB8;
+ case FK_Data_Sub_2:
+ return ELF::R_RISCV_SUB16;
+ case FK_Data_Sub_4:
+ return ELF::R_RISCV_SUB32;
+ case FK_Data_Sub_8:
+ return ELF::R_RISCV_SUB64;
case RISCV::fixup_riscv_hi20:
return ELF::R_RISCV_HI20;
case RISCV::fixup_riscv_lo12_i:
@@ -55,6 +80,10 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_LO12_S;
case RISCV::fixup_riscv_pcrel_hi20:
return ELF::R_RISCV_PCREL_HI20;
+ case RISCV::fixup_riscv_pcrel_lo12_i:
+ return ELF::R_RISCV_PCREL_LO12_I;
+ case RISCV::fixup_riscv_pcrel_lo12_s:
+ return ELF::R_RISCV_PCREL_LO12_S;
case RISCV::fixup_riscv_jal:
return ELF::R_RISCV_JAL;
case RISCV::fixup_riscv_branch:
@@ -63,13 +92,14 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_RVC_JUMP;
case RISCV::fixup_riscv_rvc_branch:
return ELF::R_RISCV_RVC_BRANCH;
+ case RISCV::fixup_riscv_call:
+ return ELF::R_RISCV_CALL;
+ case RISCV::fixup_riscv_relax:
+ return ELF::R_RISCV_RELAX;
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createRISCVELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
- bool Is64Bit) {
- return createELFObjectWriter(
- llvm::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createRISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit) {
+ return llvm::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit);
}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
new file mode 100644
index 000000000000..6428b11cfe9c
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -0,0 +1,42 @@
+//===-- RISCVELFStreamer.cpp - RISCV ELF Target Streamer Methods ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides RISCV specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVELFStreamer.h"
+#include "RISCVMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+// This part is for ELF object output.
+RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI)
+ : RISCVTargetStreamer(S) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+
+ const FeatureBitset &Features = STI.getFeatureBits();
+
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+
+ if (Features[RISCV::FeatureStdExtC])
+ EFlags |= ELF::EF_RISCV_RVC;
+
+ MCA.setELFHeaderEFlags(EFlags);
+}
+
+MCELFStreamer &RISCVTargetELFStreamer::getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+}
+
+void RISCVTargetELFStreamer::emitDirectiveOptionRVC() {}
+void RISCVTargetELFStreamer::emitDirectiveOptionNoRVC() {}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
new file mode 100644
index 000000000000..daa7abfe1336
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -0,0 +1,27 @@
+//===-- RISCVELFStreamer.h - RISCV ELF Target Streamer ---------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H
+#define LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H
+
+#include "RISCVTargetStreamer.h"
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+class RISCVTargetELFStreamer : public RISCVTargetStreamer {
+public:
+ MCELFStreamer &getStreamer();
+ RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+ virtual void emitDirectiveOptionRVC();
+ virtual void emitDirectiveOptionNoRVC();
+};
+}
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index cfb5d99e79f5..6a1224be774e 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -29,6 +29,12 @@ enum Fixups {
// fixup_riscv_pcrel_hi20 - 20-bit fixup corresponding to pcrel_hi(foo) for
// instructions like auipc
fixup_riscv_pcrel_hi20,
+ // fixup_riscv_pcrel_lo12_i - 12-bit fixup corresponding to pcrel_lo(foo) for
+ // instructions like addi
+ fixup_riscv_pcrel_lo12_i,
+ // fixup_riscv_pcrel_lo12_s - 12-bit fixup corresponding to pcrel_lo(foo) for
+ // the S-type store instructions
+ fixup_riscv_pcrel_lo12_s,
// fixup_riscv_jal - 20-bit fixup for symbol references in the jal
// instruction
fixup_riscv_jal,
@@ -41,6 +47,12 @@ enum Fixups {
// fixup_riscv_rvc_branch - 8-bit fixup for symbol references in the
// compressed branch instruction
fixup_riscv_rvc_branch,
+ // fixup_riscv_call - A fixup representing a call attached to the auipc
+ // instruction in a pair composed of adjacent auipc+jalr instructions.
+ fixup_riscv_call,
+ // fixup_riscv_relax - Used to generate an R_RISCV_RELAX relocation type,
+ // which indicates the linker may relax the instruction pair.
+ fixup_riscv_relax,
// fixup_riscv_invalid - used as a sentinel and a marker, must be last fixup
fixup_riscv_invalid,
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index d622911e92c4..780dae410cd0 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -22,4 +22,6 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) {
CommentString = "#";
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 641997e67e06..8a796a014b33 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -21,6 +21,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -52,6 +53,10 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+ void expandFunctionCall(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// TableGen'erated function for getting the binary encoding for an
/// instruction.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -80,6 +85,46 @@ MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
return new RISCVMCCodeEmitter(Ctx, MCII);
}
+// Expand PseudoCALL and PseudoTAIL to AUIPC and JALR with relocation types.
+// We expand PseudoCALL and PseudoTAIL while encoding, meaning AUIPC and JALR
+// won't go through RISCV MC to MC compressed instruction transformation. This
+// is acceptable because AUIPC has no 16-bit form and C_JALR has no immediate
+// operand field. We let linker relaxation deal with it. When linker relaxation
+// is enabled, AUIPC and JALR have a chance to relax to JAL. If the C extension
+// is enabled, JAL has a chance to relax to C_JAL.
+void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ MCInst TmpInst;
+ MCOperand Func = MI.getOperand(0);
+ unsigned Ra = (MI.getOpcode() == RISCV::PseudoTAIL) ? RISCV::X6 : RISCV::X1;
+ uint32_t Binary;
+
+ assert(Func.isExpr() && "Expected expression");
+
+ const MCExpr *Expr = Func.getExpr();
+
+ // Create function call expression CallExpr for AUIPC.
+ const MCExpr *CallExpr =
+ RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_CALL, Ctx);
+
+ // Emit AUIPC Ra, Func with R_RISCV_CALL relocation type.
+ TmpInst = MCInstBuilder(RISCV::AUIPC)
+ .addReg(Ra)
+ .addOperand(MCOperand::createExpr(CallExpr));
+ Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+ support::endian::write(OS, Binary, support::little);
+
+ if (MI.getOpcode() == RISCV::PseudoTAIL)
+ // Emit JALR X0, X6, 0
+ TmpInst = MCInstBuilder(RISCV::JALR).addReg(RISCV::X0).addReg(Ra).addImm(0);
+ else
+ // Emit JALR X1, X1, 0
+ TmpInst = MCInstBuilder(RISCV::JALR).addReg(Ra).addReg(Ra).addImm(0);
+ Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+ support::endian::write(OS, Binary, support::little);
+}
+
void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -87,17 +132,24 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Get byte count of instruction.
unsigned Size = Desc.getSize();
+ if (MI.getOpcode() == RISCV::PseudoCALL ||
+ MI.getOpcode() == RISCV::PseudoTAIL) {
+ expandFunctionCall(MI, OS, Fixups, STI);
+ MCNumEmitted += 2;
+ return;
+ }
+
switch (Size) {
default:
llvm_unreachable("Unhandled encodeInstruction length!");
case 2: {
uint16_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
- support::endian::Writer<support::little>(OS).write<uint16_t>(Bits);
+ support::endian::write<uint16_t>(OS, Bits, support::little);
break;
}
case 4: {
uint32_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
- support::endian::Writer<support::little>(OS).write(Bits);
+ support::endian::write(OS, Bits, support::little);
break;
}
}
@@ -138,7 +190,7 @@ RISCVMCCodeEmitter::getImmOpValueAsr1(const MCInst &MI, unsigned OpNo,
unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
+ bool EnableRelax = STI.getFeatureBits()[RISCV::FeatureRelax];
const MCOperand &MO = MI.getOperand(OpNo);
MCInstrDesc const &Desc = MCII.get(MI.getOpcode());
@@ -161,15 +213,31 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
case RISCVMCExpr::VK_RISCV_Invalid:
llvm_unreachable("Unhandled fixup kind!");
case RISCVMCExpr::VK_RISCV_LO:
- FixupKind = MIFrm == RISCVII::InstFormatI ? RISCV::fixup_riscv_lo12_i
- : RISCV::fixup_riscv_lo12_s;
+ if (MIFrm == RISCVII::InstFormatI)
+ FixupKind = RISCV::fixup_riscv_lo12_i;
+ else if (MIFrm == RISCVII::InstFormatS)
+ FixupKind = RISCV::fixup_riscv_lo12_s;
+ else
+ llvm_unreachable("VK_RISCV_LO used with unexpected instruction format");
break;
case RISCVMCExpr::VK_RISCV_HI:
FixupKind = RISCV::fixup_riscv_hi20;
break;
+ case RISCVMCExpr::VK_RISCV_PCREL_LO:
+ if (MIFrm == RISCVII::InstFormatI)
+ FixupKind = RISCV::fixup_riscv_pcrel_lo12_i;
+ else if (MIFrm == RISCVII::InstFormatS)
+ FixupKind = RISCV::fixup_riscv_pcrel_lo12_s;
+ else
+ llvm_unreachable(
+ "VK_RISCV_PCREL_LO used with unexpected instruction format");
+ break;
case RISCVMCExpr::VK_RISCV_PCREL_HI:
FixupKind = RISCV::fixup_riscv_pcrel_hi20;
break;
+ case RISCVMCExpr::VK_RISCV_CALL:
+ FixupKind = RISCV::fixup_riscv_call;
+ break;
}
} else if (Kind == MCExpr::SymbolRef &&
cast<MCSymbolRefExpr>(Expr)->getKind() == MCSymbolRefExpr::VK_None) {
@@ -190,6 +258,15 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
MCFixup::create(0, Expr, MCFixupKind(FixupKind), MI.getLoc()));
++MCNumFixups;
+ if (EnableRelax) {
+ if (FixupKind == RISCV::fixup_riscv_call) {
+ Fixups.push_back(
+ MCFixup::create(0, Expr, MCFixupKind(RISCV::fixup_riscv_relax),
+ MI.getLoc()));
+ ++MCNumFixups;
+ }
+ }
+
return 0;
}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index b36236ea155f..085dcd4e5f66 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "RISCV.h"
#include "RISCVMCExpr.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -31,7 +32,8 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, VariantKind Kind,
}
void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
- bool HasVariant = getKind() != VK_RISCV_None;
+ bool HasVariant =
+ ((getKind() != VK_RISCV_None) && (getKind() != VK_RISCV_CALL));
if (HasVariant)
OS << '%' << getVariantKindName(getKind()) << '(';
Expr->print(OS, MAI);
@@ -42,7 +44,23 @@ void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
bool RISCVMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const {
- return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
+ if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ // Some custom fixup types are not valid with symbol difference expressions
+ if (Res.getSymA() && Res.getSymB()) {
+ switch (getKind()) {
+ default:
+ return true;
+ case VK_RISCV_LO:
+ case VK_RISCV_HI:
+ case VK_RISCV_PCREL_LO:
+ case VK_RISCV_PCREL_HI:
+ return false;
+ }
+ }
+
+ return true;
}
void RISCVMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
@@ -53,6 +71,7 @@ RISCVMCExpr::VariantKind RISCVMCExpr::getVariantKindForName(StringRef name) {
return StringSwitch<RISCVMCExpr::VariantKind>(name)
.Case("lo", VK_RISCV_LO)
.Case("hi", VK_RISCV_HI)
+ .Case("pcrel_lo", VK_RISCV_PCREL_LO)
.Case("pcrel_hi", VK_RISCV_PCREL_HI)
.Default(VK_RISCV_Invalid);
}
@@ -65,6 +84,8 @@ StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) {
return "lo";
case VK_RISCV_HI:
return "hi";
+ case VK_RISCV_PCREL_LO:
+ return "pcrel_lo";
case VK_RISCV_PCREL_HI:
return "pcrel_hi";
}
@@ -73,7 +94,8 @@ StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) {
bool RISCVMCExpr::evaluateAsConstant(int64_t &Res) const {
MCValue Value;
- if (Kind == VK_RISCV_PCREL_HI)
+ if (Kind == VK_RISCV_PCREL_HI || Kind == VK_RISCV_PCREL_LO ||
+ Kind == VK_RISCV_CALL)
return false;
if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr))
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 69b55ca6f7cd..d2e0f6b6cdae 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -20,13 +20,16 @@
namespace llvm {
class StringRef;
+class MCOperand;
class RISCVMCExpr : public MCTargetExpr {
public:
enum VariantKind {
VK_RISCV_None,
VK_RISCV_LO,
VK_RISCV_HI,
+ VK_RISCV_PCREL_LO,
VK_RISCV_PCREL_HI,
+ VK_RISCV_CALL,
VK_RISCV_Invalid
};
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 45de976ec6c2..133f3cd3d39a 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -13,7 +13,9 @@
#include "RISCVMCTargetDesc.h"
#include "InstPrinter/RISCVInstPrinter.h"
+#include "RISCVELFStreamer.h"
#include "RISCVMCAsmInfo.h"
+#include "RISCVTargetStreamer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -67,6 +69,21 @@ static MCInstPrinter *createRISCVMCInstPrinter(const Triple &T,
return new RISCVInstPrinter(MAI, MII, MRI);
}
+static MCTargetStreamer *
+createRISCVObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ const Triple &TT = STI.getTargetTriple();
+ if (TT.isOSBinFormatELF())
+ return new RISCVTargetELFStreamer(S, STI);
+ return nullptr;
+}
+
+static MCTargetStreamer *createRISCVAsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new RISCVTargetAsmStreamer(S, OS);
+}
+
extern "C" void LLVMInitializeRISCVTargetMC() {
for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
@@ -76,5 +93,10 @@ extern "C" void LLVMInitializeRISCVTargetMC() {
TargetRegistry::RegisterMCCodeEmitter(*T, createRISCVMCCodeEmitter);
TargetRegistry::RegisterMCInstPrinter(*T, createRISCVMCInstPrinter);
TargetRegistry::RegisterMCSubtargetInfo(*T, createRISCVMCSubtargetInfo);
+ TargetRegistry::RegisterObjectTargetStreamer(
+ *T, createRISCVObjectTargetStreamer);
+
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createRISCVAsmTargetStreamer);
}
}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index ef58a6b8cbca..0228253c08cb 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class StringRef;
@@ -44,8 +44,8 @@ MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createRISCVELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter> createRISCVELFObjectWriter(uint8_t OSABI,
+ bool Is64Bit);
}
// Defines symbolic names for RISC-V registers.
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
new file mode 100644
index 000000000000..2d5205aa7ef7
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -0,0 +1,32 @@
+//===-- RISCVTargetStreamer.cpp - RISCV Target Streamer Methods -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides RISCV specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVTargetStreamer.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+RISCVTargetStreamer::RISCVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+// This part is for ascii assembly output
+RISCVTargetAsmStreamer::RISCVTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : RISCVTargetStreamer(S), OS(OS) {}
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionRVC() {
+ OS << "\t.option\trvc\n";
+}
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionNoRVC() {
+ OS << "\t.option\tnorvc\n";
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
new file mode 100644
index 000000000000..525c20810f24
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -0,0 +1,37 @@
+//===-- RISCVTargetStreamer.h - RISCV Target Streamer ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class RISCVTargetStreamer : public MCTargetStreamer {
+public:
+ RISCVTargetStreamer(MCStreamer &S);
+
+ virtual void emitDirectiveOptionRVC() = 0;
+ virtual void emitDirectiveOptionNoRVC() = 0;
+};
+
+// This part is for ascii assembly output
+class RISCVTargetAsmStreamer : public RISCVTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ RISCVTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+ void emitDirectiveOptionRVC() override;
+ void emitDirectiveOptionNoRVC() override;
+};
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/RISCV.h b/contrib/llvm/lib/Target/RISCV/RISCV.h
index 884cb2e5014d..2e4f536aca35 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCV.h
+++ b/contrib/llvm/lib/Target/RISCV/RISCV.h
@@ -25,6 +25,7 @@ class MCInst;
class MCOperand;
class MachineInstr;
class MachineOperand;
+class PassRegistry;
void LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
const AsmPrinter &AP);
@@ -32,6 +33,9 @@ bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &MCOp, const AsmPrinter &AP);
FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
+
+FunctionPass *createRISCVMergeBaseOffsetOptPass();
+void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
}
#endif
diff --git a/contrib/llvm/lib/Target/RISCV/RISCV.td b/contrib/llvm/lib/Target/RISCV/RISCV.td
index c74d560b2e03..281378cb2eee 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCV.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCV.td
@@ -55,6 +55,10 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">,
def RV64 : HwMode<"+64bit">;
def RV32 : HwMode<"-64bit">;
+def FeatureRelax
+ : SubtargetFeature<"relax", "EnableLinkerRelax", "true",
+ "Enable Linker relaxation.">;
+
//===----------------------------------------------------------------------===//
// Registers, calling conventions, instruction descriptions.
//===----------------------------------------------------------------------===//
@@ -84,7 +88,13 @@ def RISCVAsmParser : AsmParser {
let AllowDuplicateRegisterNames = 1;
}
+def RISCVAsmWriter : AsmWriter {
+ int PassSubtarget = 1;
+}
+
def RISCV : Target {
let InstructionSet = RISCVInstrInfo;
let AssemblyParsers = [RISCVAsmParser];
+ let AssemblyWriters = [RISCVAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/contrib/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 4808e6c73c50..bdf8e5d840b3 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -14,6 +14,7 @@
#include "RISCV.h"
#include "InstPrinter/RISCVInstPrinter.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -41,6 +42,14 @@ public:
void EmitInstruction(const MachineInstr *MI) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+
+ void EmitToStreamer(MCStreamer &S, const MCInst &Inst);
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -51,6 +60,15 @@ public:
};
}
+#define GEN_COMPRESS_INSTR
+#include "RISCVGenCompressInstEmitter.inc"
+void RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
+ MCInst CInst;
+ bool Res = compressInst(CInst, Inst, *TM.getMCSubtargetInfo(),
+ OutStreamer->getContext());
+ AsmPrinter::EmitToStreamer(*OutStreamer, Res ? CInst : Inst);
+}
+
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "RISCVGenMCPseudoLowering.inc"
@@ -65,6 +83,54 @@ void RISCVAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}
+bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ // First try the generic code, which knows about modifiers like 'c' and 'n'.
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS))
+ return false;
+
+ if (!ExtraCode) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate:
+ OS << MO.getImm();
+ return false;
+ case MachineOperand::MO_Register:
+ OS << RISCVInstPrinter::getRegisterName(MO.getReg());
+ return false;
+ default:
+ break;
+ }
+ }
+
+ return true;
+}
+
+bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ if (!ExtraCode) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+    // For now, we only support memory operands that are plain registers and
+    // assume there is no addend.
+ if (!MO.isReg())
+ return true;
+
+ OS << "0(" << RISCVInstPrinter::getRegisterName(MO.getReg()) << ")";
+ return false;
+ }
+
+ return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeRISCVAsmPrinter() {
RegisterAsmPrinter<RISCVAsmPrinter> X(getTheRISCV32Target());
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVCallingConv.td b/contrib/llvm/lib/Target/RISCV/RISCVCallingConv.td
index d2b17c64c9c2..ef146258c383 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -18,3 +18,40 @@ def CSR : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>;
// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+// An interrupt handler needs to save/restore all registers that are used,
+// both caller- and callee-saved registers.
+def CSR_Interrupt : CalleeSavedRegs<(add X1,
+ (sequence "X%u", 3, 9),
+ (sequence "X%u", 10, 11),
+ (sequence "X%u", 12, 17),
+ (sequence "X%u", 18, 27),
+ (sequence "X%u", 28, 31))>;
+
+// Same as CSR_Interrupt, but including all 32-bit FP registers.
+def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add X1,
+ (sequence "X%u", 3, 9),
+ (sequence "X%u", 10, 11),
+ (sequence "X%u", 12, 17),
+ (sequence "X%u", 18, 27),
+ (sequence "X%u", 28, 31),
+ (sequence "F%u_32", 0, 7),
+ (sequence "F%u_32", 10, 11),
+ (sequence "F%u_32", 12, 17),
+ (sequence "F%u_32", 28, 31),
+ (sequence "F%u_32", 8, 9),
+ (sequence "F%u_32", 18, 27))>;
+
+// Same as CSR_Interrupt, but including all 64-bit FP registers.
+def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1,
+ (sequence "X%u", 3, 9),
+ (sequence "X%u", 10, 11),
+ (sequence "X%u", 12, 17),
+ (sequence "X%u", 18, 27),
+ (sequence "X%u", 28, 31),
+ (sequence "F%u_64", 0, 7),
+ (sequence "F%u_64", 10, 11),
+ (sequence "F%u_64", 12, 17),
+ (sequence "F%u_64", 28, 31),
+ (sequence "F%u_64", 8, 9),
+ (sequence "F%u_64", 18, 27))>;
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index e9e003e63d59..a816028f9d8b 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -12,15 +12,24 @@
//===----------------------------------------------------------------------===//
#include "RISCVFrameLowering.h"
+#include "RISCVMachineFunctionInfo.h"
#include "RISCVSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;
-bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const {
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ RegInfo->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ MFI.isFrameAddressTaken();
+}
// Determines the size of the frame and maximum call frame size.
void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
@@ -34,21 +43,6 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
: getStackAlignment();
- // Get the maximum call frame size of all the calls.
- uint64_t MaxCallFrameSize = MFI.getMaxCallFrameSize();
-
- // If we have dynamic alloca then MaxCallFrameSize needs to be aligned so
- // that allocations will be aligned.
- if (MFI.hasVarSizedObjects())
- MaxCallFrameSize = alignTo(MaxCallFrameSize, StackAlign);
-
- // Update maximum call frame size.
- MFI.setMaxCallFrameSize(MaxCallFrameSize);
-
- // Include call frame size in total.
- if (!(hasReservedCallFrame(MF) && MFI.adjustsStack()))
- FrameSize += MaxCallFrameSize;
-
// Make sure the frame is aligned.
FrameSize = alignTo(FrameSize, StackAlign);
@@ -61,18 +55,34 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, int64_t Val,
MachineInstr::MIFlag Flag) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
if (DestReg == SrcReg && Val == 0)
return;
- if (!isInt<12>(Val))
- report_fatal_error("adjustReg cannot yet handle adjustments >12 bits");
-
- BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), DestReg)
- .addReg(SrcReg)
- .addImm(Val)
- .setMIFlag(Flag);
+ if (isInt<12>(Val)) {
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), DestReg)
+ .addReg(SrcReg)
+ .addImm(Val)
+ .setMIFlag(Flag);
+ } else if (isInt<32>(Val)) {
+ unsigned Opc = RISCV::ADD;
+ bool isSub = Val < 0;
+ if (isSub) {
+ Val = -Val;
+ Opc = RISCV::SUB;
+ }
+
+ unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ TII->movImm32(MBB, MBBI, DL, ScratchReg, Val, Flag);
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .setMIFlag(Flag);
+ } else {
+ report_fatal_error("adjustReg cannot yet handle adjustments >32 bits");
+ }
}
// Returns the register used to hold the frame pointer.
@@ -85,12 +95,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
- if (!hasFP(MF)) {
- report_fatal_error(
- "emitPrologue doesn't support framepointer-less functions");
- }
-
MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
unsigned FPReg = getFPReg(STI);
@@ -124,19 +130,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
std::advance(MBBI, CSI.size());
// Generate new FP.
- adjustReg(MBB, MBBI, DL, FPReg, SPReg, StackSize, MachineInstr::FrameSetup);
+ if (hasFP(MF))
+ adjustReg(MBB, MBBI, DL, FPReg, SPReg,
+ StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup);
}
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- if (!hasFP(MF)) {
- report_fatal_error(
- "emitEpilogue doesn't support framepointer-less functions");
- }
-
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
DebugLoc DL = MBBI->getDebugLoc();
unsigned FPReg = getFPReg(STI);
unsigned SPReg = getSPReg(STI);
@@ -153,7 +157,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// necessary if the stack pointer was modified, meaning the stack size is
// unknown.
if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) {
- adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -StackSize,
+ assert(hasFP(MF) && "frame pointer should not have been eliminated");
+ adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg,
+ -StackSize + RVFI->getVarArgsSaveSize(),
MachineInstr::FrameDestroy);
}
@@ -166,6 +172,7 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
unsigned &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
@@ -182,10 +189,15 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
}
- FrameReg = RI->getFrameRegister(MF);
if (FI >= MinCSFI && FI <= MaxCSFI) {
FrameReg = RISCV::X2;
Offset += MF.getFrameInfo().getStackSize();
+ } else {
+ FrameReg = RI->getFrameRegister(MF);
+ if (hasFP(MF))
+ Offset += RVFI->getVarArgsSaveSize();
+ else
+ Offset += MF.getFrameInfo().getStackSize();
}
return Offset;
}
@@ -194,8 +206,94 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- // TODO: Once frame pointer elimination is implemented, don't
- // unconditionally spill the frame pointer and return address.
- SavedRegs.set(RISCV::X1);
- SavedRegs.set(RISCV::X8);
+ // Unconditionally spill RA and FP only if the function uses a frame
+ // pointer.
+ if (hasFP(MF)) {
+ SavedRegs.set(RISCV::X1);
+ SavedRegs.set(RISCV::X8);
+ }
+
+ // If the interrupt attribute is set and the handler contains calls,
+ // unconditionally save all caller-saved registers and all FP registers,
+ // regardless of whether they are used.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (MF.getFunction().hasFnAttribute("interrupt") && MFI.hasCalls()) {
+
+ static const MCPhysReg CSRegs[] = { RISCV::X1, /* ra */
+ RISCV::X5, RISCV::X6, RISCV::X7, /* t0-t2 */
+ RISCV::X10, RISCV::X11, /* a0-a1, a2-a7 */
+ RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17,
+ RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31, 0 /* t3-t6 */
+ };
+
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ SavedRegs.set(CSRegs[i]);
+
+ if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD() ||
+ MF.getSubtarget<RISCVSubtarget>().hasStdExtF()) {
+
+ // For interrupt handlers, the callee-saved register list includes all FP registers.
+ const MCPhysReg * Regs = MF.getRegInfo().getCalleeSavedRegs();
+
+ for (unsigned i = 0; Regs[i]; ++i)
+ if (RISCV::FPR32RegClass.contains(Regs[i]) ||
+ RISCV::FPR64RegClass.contains(Regs[i]))
+ SavedRegs.set(Regs[i]);
+ }
+ }
+}
+
+void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF, RegScavenger *RS) const {
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterClass *RC = &RISCV::GPRRegClass;
+ // estimateStackSize has been observed to under-estimate the final stack
+ // size, so give ourselves wiggle-room by checking for a stack size
+ // representable in an 11-bit signed field rather than 12 bits.
+ // FIXME: It may be possible to craft a function with a small stack that
+ // still needs an emergency spill slot for branch relaxation. This case
+ // would currently be missed.
+ if (!isInt<11>(MFI.estimateStackSize(MF))) {
+ int RegScavFI = MFI.CreateStackObject(
+ RegInfo->getSpillSize(*RC), RegInfo->getSpillAlignment(*RC), false);
+ RS->addScavengingFrameIndex(RegScavFI);
+ }
+}
+
+// Do not reserve stack space within the prologue for outgoing arguments when
+// the function contains variable-sized objects; instead, let
+// eliminateCallFramePseudoInstr reserve the space as each call is lowered.
+bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions.
+MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ unsigned SPReg = RISCV::X2;
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (!hasReservedCallFrame(MF)) {
+ // If space has not been reserved for a call frame, ADJCALLSTACKDOWN and
+ // ADJCALLSTACKUP must be converted to instructions manipulating the stack
+ // pointer. This is necessary when there is a variable length stack
+ // allocation (e.g. alloca), which means it's not possible to allocate
+ // space for outgoing arguments from within the function prologue.
+ int64_t Amount = MI->getOperand(0).getImm();
+
+ if (Amount != 0) {
+ // Ensure the stack remains aligned after adjustment.
+ Amount = alignSPAdjust(Amount);
+
+ if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
+ Amount = -Amount;
+
+ adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags);
+ }
+ }
+
+ return MBB.erase(MI);
}
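
A rough sketch of the adjustment sizing done here, assuming the standard 16-byte RISC-V stack alignment for the rounding that alignSPAdjust performs; the helper name and the worked numbers are illustrative, not taken from the patch:

    #include <cstdint>

    // Round an outgoing-argument area up to an assumed 16-byte stack alignment.
    int64_t alignCallFrame(int64_t Amount) {
      return (Amount + 15) & ~int64_t(15);
    }

    // For a call needing 20 bytes of outgoing arguments in a function with a
    // variable-sized alloca: ADJCALLSTACKDOWN becomes sp -= alignCallFrame(20) == 32,
    // and the matching ADJCALLSTACKUP restores sp += 32.
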
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index d92bb70c76da..ca653c2b9f17 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/contrib/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -36,13 +36,15 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const override {
- return MBB.erase(MI);
- }
+ MachineBasicBlock::iterator MI) const override;
protected:
const RISCVSubtarget &STI;
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 113a45ac7cc0..04441b9a9b15 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -42,25 +42,36 @@ public:
return SelectionDAGISel::runOnMachineFunction(MF);
}
+ void PostprocessISelDAG() override;
+
void Select(SDNode *Node) override;
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
bool SelectAddrFI(SDValue Addr, SDValue &Base);
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
+
+private:
+ void doPeepholeLoadStoreADDI();
+ void doPeepholeBuildPairF64SplitF64();
};
}
+void RISCVDAGToDAGISel::PostprocessISelDAG() {
+ doPeepholeLoadStoreADDI();
+ doPeepholeBuildPairF64SplitF64();
+}
+
void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
MVT XLenVT = Subtarget->getXLenVT();
- // Dump information about the Node being selected.
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
-
// If we have a custom node, we have already selected
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
Node->setNodeId(-1);
return;
}
@@ -82,7 +93,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (Opcode == ISD::FrameIndex) {
SDLoc DL(Node);
SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT);
- int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
EVT VT = Node->getValueType(0);
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
@@ -93,6 +104,22 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SelectCode(Node);
}
+bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+ switch (ConstraintID) {
+ case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_m:
+ // We just support simple memory operands that have a single address
+ // operand and need no special handling.
+ OutOps.push_back(Op);
+ return false;
+ default:
+ break;
+ }
+
+ return true;
+}
+
bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
@@ -101,6 +128,131 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
return false;
}
+// Merge an ADDI into the offset of a load/store instruction where possible.
+// (load (add base, off), 0) -> (load base, off)
+// (store val, (add base, off)) -> (store val, base, off)
+void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ int OffsetOpIdx;
+ int BaseOpIdx;
+
+ // Only attempt this optimisation for I-type loads and S-type stores.
+ switch (N->getMachineOpcode()) {
+ default:
+ continue;
+ case RISCV::LB:
+ case RISCV::LH:
+ case RISCV::LW:
+ case RISCV::LBU:
+ case RISCV::LHU:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLW:
+ case RISCV::FLD:
+ BaseOpIdx = 0;
+ OffsetOpIdx = 1;
+ break;
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::SD:
+ case RISCV::FSW:
+ case RISCV::FSD:
+ BaseOpIdx = 1;
+ OffsetOpIdx = 2;
+ break;
+ }
+
+ // Currently, the load/store offset must be 0 to be considered for this
+ // peephole optimisation.
+ if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)) ||
+ N->getConstantOperandVal(OffsetOpIdx) != 0)
+ continue;
+
+ SDValue Base = N->getOperand(BaseOpIdx);
+
+ // If the base is an ADDI, we can merge it in to the load/store.
+ if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
+ continue;
+
+ SDValue ImmOperand = Base.getOperand(1);
+
+ if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
+ ImmOperand = CurDAG->getTargetConstant(
+ Const->getSExtValue(), SDLoc(ImmOperand), ImmOperand.getValueType());
+ } else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
+ ImmOperand = CurDAG->getTargetGlobalAddress(
+ GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
+ GA->getOffset(), GA->getTargetFlags());
+ } else {
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+ LLVM_DEBUG(Base->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nN: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Modify the offset operand of the load/store.
+ if (BaseOpIdx == 0) // Load
+ CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+ N->getOperand(2));
+ else // Store
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+ ImmOperand, N->getOperand(3));
+
+ // The add-immediate may now be dead, in which case remove it.
+ if (Base.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(Base.getNode());
+ }
+}
+
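
For a concrete (illustrative) instance of the pattern handled above, a non-PIC global load selected through the LUI/ADDI sequence initially looks like

    t1 = LUI  %hi(g)
    t2 = ADDI t1, %lo(g)
    t3 = LW   t2, 0

and the peephole rewrites the LW to use t1 directly with the %lo offset, after which the now-dead ADDI is removed:

    t1 = LUI %hi(g)
    t3 = LW  t1, %lo(g)
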
+// Remove redundant BuildPairF64+SplitF64 pairs, i.e. cases where an f64 is
+// built of two i32 values, only to be split apart again. This must be done
+// here as a peephole optimisation as the DAG has not been fully legalized at
+// the point BuildPairF64/SplitF64 nodes are created in RISCVISelLowering, so
+// some nodes would not yet have been replaced with libcalls.
+void RISCVDAGToDAGISel::doPeepholeBuildPairF64SplitF64() {
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any nodes other than SplitF64Pseudo.
+ if (N->use_empty() || !N->isMachineOpcode() ||
+ !(N->getMachineOpcode() == RISCV::SplitF64Pseudo))
+ continue;
+
+ // If the operand to SplitF64 is a BuildPairF64, the split operation is
+ // redundant. Just use the operands to BuildPairF64 as the result.
+ SDValue F64Val = N->getOperand(0);
+ if (F64Val.isMachineOpcode() &&
+ F64Val.getMachineOpcode() == RISCV::BuildPairF64Pseudo) {
+ LLVM_DEBUG(
+ dbgs() << "Removing redundant SplitF64Pseudo and replacing uses "
+ "with BuildPairF64Pseudo operands:\n");
+ LLVM_DEBUG(dbgs() << "N: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "F64Val: ");
+ LLVM_DEBUG(F64Val->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
+ SDValue From[] = {SDValue(N, 0), SDValue(N, 1)};
+ SDValue To[] = {F64Val.getOperand(0), F64Val.getOperand(1)};
+ CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2);
+ }
+ }
+ CurDAG->RemoveDeadNodes();
+}
+
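
At the value level the two pseudo instructions model nothing more than moving an f64 bit pattern to and from a pair of i32 halves. A standalone sketch of that correspondence (plain C++, using the lo/hi split as on little-endian RV32):

    #include <cstdint>
    #include <cstring>

    // What SplitF64 yields: the low and high 32 bits of the f64 bit pattern.
    void splitF64(double D, uint32_t &Lo, uint32_t &Hi) {
      uint64_t Bits;
      std::memcpy(&Bits, &D, sizeof(Bits));
      Lo = uint32_t(Bits);
      Hi = uint32_t(Bits >> 32);
    }

    // What BuildPairF64 yields: the f64 reassembled from the two halves.
    double buildPairF64(uint32_t Lo, uint32_t Hi) {
      uint64_t Bits = (uint64_t(Hi) << 32) | Lo;
      double D;
      std::memcpy(&D, &Bits, sizeof(D));
      return D;
    }
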
// This pass converts a legalized DAG into a RISCV-specific DAG, ready
// for instruction scheduling.
FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) {
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 805ca7dd956e..87796e5b1097 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14,9 +14,11 @@
#include "RISCVISelLowering.h"
#include "RISCV.h"
+#include "RISCVMachineFunctionInfo.h"
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -35,6 +37,8 @@ using namespace llvm;
#define DEBUG_TYPE "riscv-lower"
+STATISTIC(NumTailCalls, "Number of tail calls");
+
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -44,6 +48,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Set up the register classes.
addRegisterClass(XLenVT, &RISCV::GPRRegClass);
+ if (Subtarget.hasStdExtF())
+ addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
+ if (Subtarget.hasStdExtD())
+ addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
+
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
@@ -63,26 +72,28 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
- setOperationAction(ISD::ADDC, XLenVT, Expand);
- setOperationAction(ISD::ADDE, XLenVT, Expand);
- setOperationAction(ISD::SUBC, XLenVT, Expand);
- setOperationAction(ISD::SUBE, XLenVT, Expand);
+ if (!Subtarget.hasStdExtM()) {
+ setOperationAction(ISD::MUL, XLenVT, Expand);
+ setOperationAction(ISD::MULHS, XLenVT, Expand);
+ setOperationAction(ISD::MULHU, XLenVT, Expand);
+ setOperationAction(ISD::SDIV, XLenVT, Expand);
+ setOperationAction(ISD::UDIV, XLenVT, Expand);
+ setOperationAction(ISD::SREM, XLenVT, Expand);
+ setOperationAction(ISD::UREM, XLenVT, Expand);
+ }
- setOperationAction(ISD::SREM, XLenVT, Expand);
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
- setOperationAction(ISD::SDIV, XLenVT, Expand);
- setOperationAction(ISD::UREM, XLenVT, Expand);
setOperationAction(ISD::UDIVREM, XLenVT, Expand);
- setOperationAction(ISD::UDIV, XLenVT, Expand);
-
- setOperationAction(ISD::MUL, XLenVT, Expand);
setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);
- setOperationAction(ISD::MULHS, XLenVT, Expand);
- setOperationAction(ISD::MULHU, XLenVT, Expand);
setOperationAction(ISD::SHL_PARTS, XLenVT, Expand);
setOperationAction(ISD::SRL_PARTS, XLenVT, Expand);
@@ -95,19 +106,128 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ, XLenVT, Expand);
setOperationAction(ISD::CTPOP, XLenVT, Expand);
+ ISD::CondCode FPCCToExtend[] = {
+ ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETO, ISD::SETUEQ,
+ ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE,
+ ISD::SETGT, ISD::SETGE, ISD::SETNE};
+
+ if (Subtarget.hasStdExtF()) {
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ for (auto CC : FPCCToExtend)
+ setCondCodeAction(CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ }
+
+ if (Subtarget.hasStdExtD()) {
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ for (auto CC : FPCCToExtend)
+ setCondCodeAction(CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ }
+
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
+ setOperationAction(ISD::ConstantPool, XLenVT, Custom);
+
+ if (Subtarget.hasStdExtA())
+ setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
+ else
+ setMaxAtomicSizeInBitsSupported(0);
setBooleanContents(ZeroOrOneBooleanContent);
// Function alignments (log2).
- setMinFunctionAlignment(3);
- setPrefFunctionAlignment(3);
+ unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2;
+ setMinFunctionAlignment(FunctionAlignment);
+ setPrefFunctionAlignment(FunctionAlignment);
// Effectively disable jump table generation.
setMinimumJumpTableEntries(INT_MAX);
}
+EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
+ if (!VT.isVector())
+ return getPointerTy(DL);
+ return VT.changeVectorElementTypeToInteger();
+}
+
+bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS,
+ Instruction *I) const {
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // Require a 12-bit signed offset.
+ if (!isInt<12>(AM.BaseOffs))
+ return false;
+
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (!AM.HasBaseReg) // allow "r+i".
+ break;
+ return false; // disallow "r+r" or "r+r+i".
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return isInt<12>(Imm);
+}
+
+bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ return isInt<12>(Imm);
+}
+
+// On RV32, 64-bit integers are split into their high and low parts and held
+// in two different registers, so the trunc is free since the low register can
+// just be used.
+bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
+ if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+ return false;
+ unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBits = DstTy->getPrimitiveSizeInBits();
+ return (SrcBits == 64 && DestBits == 32);
+}
+
+bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
+ if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() ||
+ !SrcVT.isInteger() || !DstVT.isInteger())
+ return false;
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ unsigned DestBits = DstVT.getSizeInBits();
+ return (SrcBits == 64 && DestBits == 32);
+}
+
+bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ // Zexts are free if they can be combined with a load.
+ if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
+ EVT MemVT = LD->getMemoryVT();
+ if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
+ (Subtarget.is64Bit() && MemVT == MVT::i32)) &&
+ (LD->getExtensionType() == ISD::NON_EXTLOAD ||
+ LD->getExtensionType() == ISD::ZEXTLOAD))
+ return true;
+ }
+
+ return TargetLowering::isZExtFree(Val, VT2);
+}
+
// Changes the condition code and swaps operands if necessary, so the SetCC
// operation matches one of the comparisons supported directly in the RISC-V
// ISA.
@@ -156,8 +276,16 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerGlobalAddress(Op, DAG);
case ISD::BlockAddress:
return lowerBlockAddress(Op, DAG);
+ case ISD::ConstantPool:
+ return lowerConstantPool(Op, DAG);
case ISD::SELECT:
return lowerSELECT(Op, DAG);
+ case ISD::VASTART:
+ return lowerVASTART(Op, DAG);
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
}
}
@@ -168,17 +296,22 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = N->getGlobal();
int64_t Offset = N->getOffset();
+ MVT XLenVT = Subtarget.getXLenVT();
if (isPositionIndependent() || Subtarget.is64Bit())
report_fatal_error("Unable to lowerGlobalAddress");
-
- SDValue GAHi =
- DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_HI);
- SDValue GALo =
- DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_LO);
+ // In order to maximise the opportunity for common subexpression elimination,
+ // emit a separate ADD node for the global address offset instead of folding
+ // it into the global address node. Later peephole optimisations may choose to
+ // fold it back in when profitable.
+ SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_HI);
+ SDValue GALo = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, GAHi), 0);
SDValue MNLo =
SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, GALo), 0);
+ if (Offset != 0)
+ return DAG.getNode(ISD::ADD, DL, Ty, MNLo,
+ DAG.getConstant(Offset, DL, XLenVT));
return MNLo;
}
@@ -201,6 +334,29 @@ SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
return MNLo;
}
+SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+ const Constant *CPA = N->getConstVal();
+ int64_t Offset = N->getOffset();
+ unsigned Alignment = N->getAlignment();
+
+ if (!isPositionIndependent()) {
+ SDValue CPAHi =
+ DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_HI);
+ SDValue CPALo =
+ DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_LO);
+ SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, CPAHi), 0);
+ SDValue MNLo =
+ SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, CPALo), 0);
+ return MNLo;
+ } else {
+ report_fatal_error("Unable to lowerConstantPool");
+ }
+}
+
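
Both lowerGlobalAddress and lowerConstantPool rely on the LUI+ADDI idiom: a 20-bit upper immediate plus a 12-bit signed lower immediate. Because the low part is sign-extended, the upper part must absorb a carry whenever the low 12 bits come out negative. The MO_HI/MO_LO fixups are resolved later in the MC layer, so the following is only a sketch of the arithmetic, not code from this patch:

    #include <cstdint>

    // Split a 32-bit address into the %hi/%lo pair consumed by LUI+ADDI.
    // Adding 0x800 before shifting makes Hi20 + sign_extend(Lo12) == Addr
    // even when Lo12 ends up negative.
    void splitHiLo(uint32_t Addr, uint32_t &Hi20, int32_t &Lo12) {
      Hi20 = (Addr + 0x800) >> 12;
      Lo12 = int32_t(Addr - (Hi20 << 12));
    }
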
SDValue RISCVTargetLowering::lowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -261,14 +417,153 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
}
+SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy(MF.getDataLayout()));
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
+SDValue RISCVTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+ unsigned FrameReg = RI.getFrameRegister(MF);
+ int XLenInBytes = Subtarget.getXLen() / 8;
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ while (Depth--) {
+ int Offset = -(XLenInBytes * 2);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
+ DAG.getIntPtrConstant(Offset, DL));
+ FrameAddr =
+ DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+ }
+ return FrameAddr;
+}
+
+SDValue RISCVTargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
+ MVT XLenVT = Subtarget.getXLenVT();
+ int XLenInBytes = Subtarget.getXLen() / 8;
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ if (Depth) {
+ int Off = -XLenInBytes;
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset = DAG.getConstant(Off, DL, VT);
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+ MachinePointerInfo());
+ }
+
+ // Return the value of the return address register, marking it an implicit
+ // live-in.
+ unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
+}
+
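
The constant offsets in the two lowerings above encode the frame layout these hooks assume: the return address spilled at fp - XLenInBytes and the caller's frame pointer at fp - 2*XLenInBytes. On RV32 that gives

    fp - 4 : saved ra of this frame   (loaded by RETURNADDR when Depth > 0)
    fp - 8 : caller's saved fp        (the link FRAMEADDR follows once per level of Depth)
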
+static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");
+
+ MachineFunction &MF = *BB->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ unsigned LoReg = MI.getOperand(0).getReg();
+ unsigned HiReg = MI.getOperand(1).getReg();
+ unsigned SrcReg = MI.getOperand(2).getReg();
+ const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
+ int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
+
+ TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
+ RI);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ MachineMemOperand::MOLoad, 8, 8);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(MMO);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo &&
+ "Unexpected instruction");
+
+ MachineFunction &MF = *BB->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned LoReg = MI.getOperand(1).getReg();
+ unsigned HiReg = MI.getOperand(2).getReg();
+ const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
+ int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ MachineMemOperand::MOStore, 8, 8);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
+ .addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
+ .addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(MMO);
+ TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
-
- assert(MI.getOpcode() == RISCV::Select_GPR_Using_CC_GPR &&
- "Unexpected instr type to insert");
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case RISCV::Select_GPR_Using_CC_GPR:
+ case RISCV::Select_FPR32_Using_CC_GPR:
+ case RISCV::Select_FPR64_Using_CC_GPR:
+ break;
+ case RISCV::BuildPairF64Pseudo:
+ return emitBuildPairF64Pseudo(MI, BB);
+ case RISCV::SplitF64Pseudo:
+ return emitSplitF64Pseudo(MI, BB);
+ }
// To "insert" a SELECT instruction, we actually have to insert the triangle
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -281,7 +576,9 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// | IfFalseMBB
// | /
// TailMBB
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator I = ++BB->getIterator();
MachineBasicBlock *HeadMBB = BB;
@@ -398,19 +695,36 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
// Implements the RISC-V calling convention. Returns true upon failure.
static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet) {
+ CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) {
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
- assert(ValVT == XLenVT && "Unexpected ValVT");
- assert(LocVT == XLenVT && "Unexpected LocVT");
- assert(IsFixed && "Vararg support not yet implemented");
+ if (ValVT == MVT::f32) {
+ LocVT = MVT::i32;
+ LocInfo = CCValAssign::BCvt;
+ }
// Any return value split in to more than two values can't be returned
// directly.
if (IsRet && ValNo > 1)
return true;
+ // If this is a variadic argument, the RISC-V calling convention requires
+ // that it is assigned an 'even' or 'aligned' register if it has 8-byte
+ // alignment (RV32) or 16-byte alignment (RV64). An aligned register should
+ // be used regardless of whether the original argument was split during
+ // legalisation or not. The argument will not be passed by registers if the
+ // original type is larger than 2*XLEN, so the register alignment rule does
+ // not apply.
+ unsigned TwoXLenInBytes = (2 * XLen) / 8;
+ if (!IsFixed && ArgFlags.getOrigAlign() == TwoXLenInBytes &&
+ DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
+ unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
+ // Skip 'odd' register if necessary.
+ if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1)
+ State.AllocateReg(ArgGPRs);
+ }
+
SmallVectorImpl<CCValAssign> &PendingLocs = State.getPendingLocs();
SmallVectorImpl<ISD::ArgFlagsTy> &PendingArgFlags =
State.getPendingArgFlags();
@@ -418,6 +732,28 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
assert(PendingLocs.size() == PendingArgFlags.size() &&
"PendingLocs and PendingArgFlags out of sync");
+ // Handle passing f64 on RV32D with a soft float ABI.
+ if (XLen == 32 && ValVT == MVT::f64) {
+ assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
+ "Can't lower f64 if it is split");
+ // Depending on the available argument GPRs, an f64 may be passed in a pair of
+ // GPRs, split between a GPR and the stack, or passed completely on the
+ // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
+ // cases.
+ unsigned Reg = State.AllocateReg(ArgGPRs);
+ LocVT = MVT::i32;
+ if (!Reg) {
+ unsigned StackOffset = State.AllocateStack(8, 8);
+ State.addLoc(
+ CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+ return false;
+ }
+ if (!State.AllocateReg(ArgGPRs))
+ State.AllocateStack(4, 4);
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+
// Split arguments might be passed indirectly, so keep track of the pending
// values.
if (ArgFlags.isSplit() || !PendingLocs.empty()) {
@@ -482,15 +818,22 @@ void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
unsigned NumArgs = Ins.size();
+ FunctionType *FType = MF.getFunction().getFunctionType();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+ Type *ArgTy = nullptr;
+ if (IsRet)
+ ArgTy = FType->getReturnType();
+ else if (Ins[i].isOrigArg())
+ ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());
+
if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, /*IsRet=*/true, IsRet)) {
- DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
- << EVT(ArgVT).getEVTString() << '\n');
+ ArgFlags, CCInfo, /*IsRet=*/true, IsRet, ArgTy)) {
+ LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << '\n');
llvm_unreachable(nullptr);
}
}
@@ -498,17 +841,19 @@ void RISCVTargetLowering::analyzeInputArgs(
void RISCVTargetLowering::analyzeOutputArgs(
MachineFunction &MF, CCState &CCInfo,
- const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet) const {
+ const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
+ CallLoweringInfo *CLI) const {
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; i++) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;
if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, Outs[i].IsFixed, IsRet)) {
- DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
- << EVT(ArgVT).getEVTString() << "\n");
+ ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
+ LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << "\n");
llvm_unreachable(nullptr);
}
}
@@ -521,6 +866,7 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
EVT LocVT = VA.getLocVT();
+ EVT ValVT = VA.getValVT();
SDValue Val;
unsigned VReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
@@ -532,8 +878,12 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::Indirect:
- return Val;
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
+ break;
}
+ return Val;
}
// The caller is responsible for loading the full value if the argument is
@@ -565,6 +915,43 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
return Val;
}
+static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
+ const CCValAssign &VA, const SDLoc &DL) {
+ assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 &&
+ "Unexpected VA");
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ if (VA.isMemLoc()) {
+ // f64 is passed on the stack.
+ int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ return DAG.getLoad(MVT::f64, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ }
+
+ assert(VA.isRegLoc() && "Expected register VA assignment");
+
+ unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
+ SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
+ SDValue Hi;
+ if (VA.getLocReg() == RISCV::X17) {
+ // Second half of f64 is passed on the stack.
+ int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ } else {
+ // Second half of f64 is passed in another GPR.
+ unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
+ Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
+ }
+ return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+}
+
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
@@ -580,10 +967,26 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
}
MachineFunction &MF = DAG.getMachineFunction();
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- if (IsVarArg)
- report_fatal_error("VarArg not supported");
+ const Function &Func = MF.getFunction();
+ if (Func.hasFnAttribute("interrupt")) {
+ if (!Func.arg_empty())
+ report_fatal_error(
+ "Functions with the interrupt attribute cannot have arguments!");
+
+ StringRef Kind =
+ MF.getFunction().getFnAttribute("interrupt").getValueAsString();
+
+ if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine"))
+ report_fatal_error(
+ "Function interrupt attribute argument not supported!");
+ }
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT XLenVT = Subtarget.getXLenVT();
+ unsigned XLenInBytes = Subtarget.getXLen() / 8;
+ // Used with varargs to accumulate store chains.
+ std::vector<SDValue> OutChains;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -592,9 +995,13 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- assert(VA.getLocVT() == Subtarget.getXLenVT() && "Unhandled argument type");
+ assert(VA.getLocVT() == XLenVT && "Unhandled argument type");
SDValue ArgValue;
- if (VA.isRegLoc())
+ // Passing f64 on RV32D with a soft float ABI must be handled as a special
+ // case.
+ if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
+ ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
+ else if (VA.isRegLoc())
ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
else
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
@@ -620,9 +1027,155 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
}
InVals.push_back(ArgValue);
}
+
+ if (IsVarArg) {
+ ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(ArgGPRs);
+ unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
+ const TargetRegisterClass *RC = &RISCV::GPRRegClass;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+
+ // Offset of the first variable argument from stack pointer, and size of
+ // the vararg save area. For now, the varargs save area is either zero or
+ // large enough to hold a0-a7.
+ int VaArgOffset, VarArgsSaveSize;
+
+ // If all registers are allocated, then all varargs must be passed on the
+ // stack and we don't need to save any argregs.
+ if (ArgRegs.size() == Idx) {
+ VaArgOffset = CCInfo.getNextStackOffset();
+ VarArgsSaveSize = 0;
+ } else {
+ VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
+ VaArgOffset = -VarArgsSaveSize;
+ }
+
+ // Record the frame index of the first variable argument,
+ // which is needed by VASTART.
+ int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
+ RVFI->setVarArgsFrameIndex(FI);
+
+ // If saving an odd number of registers, create an extra stack slot to
+ // ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
+ // that offsets to even-numbered registers remain 2*XLEN-aligned.
+ if (Idx % 2) {
+ FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes,
+ true);
+ VarArgsSaveSize += XLenInBytes;
+ }
+
+ // Copy the integer registers that may have been used for passing varargs
+ // to the vararg save area.
+ for (unsigned I = Idx; I < ArgRegs.size();
+ ++I, VaArgOffset += XLenInBytes) {
+ const unsigned Reg = RegInfo.createVirtualRegister(RC);
+ RegInfo.addLiveIn(ArgRegs[I], Reg);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
+ FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
+ SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ cast<StoreSDNode>(Store.getNode())
+ ->getMemOperand()
+ ->setValue((Value *)nullptr);
+ OutChains.push_back(Store);
+ }
+ RVFI->setVarArgsSaveSize(VarArgsSaveSize);
+ }
+
+ // All stores are grouped in one node to allow the matching between
+ // the size of Ins and InVals. This only happens for vararg functions.
+ if (!OutChains.empty()) {
+ OutChains.push_back(Chain);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+ }
+
return Chain;
}
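
A worked instance of the save-area sizing in the IsVarArg block above, for RV32 (XLenInBytes == 4, argument registers a0-a7); the helper is hypothetical and simply re-derives the numbers computed there:

    #include <cstdio>

    int varargSaveSize(unsigned NumFixedArgRegsUsed) {
      const unsigned NumArgRegs = 8, XLenInBytes = 4;
      unsigned Idx = NumFixedArgRegsUsed;
      if (Idx == NumArgRegs)
        return 0;                        // all varargs are already on the stack
      int Size = XLenInBytes * (NumArgRegs - Idx);
      if (Idx % 2)                       // extra slot keeps the area 2*XLEN aligned
        Size += XLenInBytes;
      return Size;
    }

    int main() { std::printf("%d\n", varargSaveSize(3)); } // prints 24: a3-a7 plus one pad slot
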
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization.
+/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
+bool RISCVTargetLowering::IsEligibleForTailCallOptimization(
+ CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
+ const SmallVector<CCValAssign, 16> &ArgLocs) const {
+
+ auto &Callee = CLI.Callee;
+ auto CalleeCC = CLI.CallConv;
+ auto IsVarArg = CLI.IsVarArg;
+ auto &Outs = CLI.Outs;
+ auto &Caller = MF.getFunction();
+ auto CallerCC = Caller.getCallingConv();
+
+ // Do not tail call opt functions with "disable-tail-calls" attribute.
+ if (Caller.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+ return false;
+
+ // Exception-handling functions need a special set of instructions to
+ // indicate a return to the hardware. Tail-calling another function would
+ // probably break this.
+ // TODO: The "interrupt" attribute isn't currently defined by RISC-V. This
+ // should be expanded as new function attributes are introduced.
+ if (Caller.hasFnAttribute("interrupt"))
+ return false;
+
+ // Do not tail call opt functions with varargs.
+ if (IsVarArg)
+ return false;
+
+ // Do not tail call opt if the stack is used to pass parameters.
+ if (CCInfo.getNextStackOffset() != 0)
+ return false;
+
+ // Do not tail call opt if any parameters need to be passed indirectly.
+ // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are
+ // passed indirectly. So the address of the value will be passed in a
+ // register, or if not available, then the address is put on the stack. In
+ // order to pass indirectly, space on the stack often needs to be allocated
+ // in order to store the value. In this case the CCInfo.getNextStackOffset()
+ // != 0 check is not enough and we also need to check whether any of the
+ // CCValAssign ArgLocs are marked CCValAssign::Indirect.
+ for (auto &VA : ArgLocs)
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+
+ // Do not tail call opt if either caller or callee uses struct return
+ // semantics.
+ auto IsCallerStructRet = Caller.hasStructRetAttr();
+ auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
+ if (IsCallerStructRet || IsCalleeStructRet)
+ return false;
+
+ // Externally-defined functions with weak linkage should not be
+ // tail-called. The behaviour of branch instructions in this situation (as
+ // used for tail calls) is implementation-defined, so we cannot rely on the
+ // linker replacing the tail call with a return.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ if (GV->hasExternalWeakLinkage())
+ return false;
+ }
+
+ // The callee has to preserve all registers the caller needs to preserve.
+ const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (CalleeCC != CallerCC) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ // Byval parameters hand the function a pointer directly into the stack area
+ // we want to reuse during a tail call. Working around this *is* possible
+ // but less efficient and uglier in LowerCall.
+ for (auto &Arg : Outs)
+ if (Arg.Flags.isByVal())
+ return false;
+
+ return true;
+}
+
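
As a source-level illustration of these checks (hypothetical functions, compiled under the C calling convention): a call whose arguments all fit in registers can be lowered through the new RISCVISD::TAIL node, while a varargs call site cannot:

    #include <cstdio>

    extern int callee(int);

    // No varargs, no stack-passed or indirect arguments, no struct return and no
    // byval parameters, so LowerCall may emit this as a tail call.
    int caller(int x) { return callee(x + 1); }

    // The callee is variadic, so the IsVarArg check above rejects the tail call
    // and a normal call is emitted instead.
    int caller2(const char *fmt) { return std::printf(fmt, 1); }
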
// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
// and output parameter nodes.
SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -634,22 +1187,29 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
- CLI.IsTailCall = false;
+ bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
- if (IsVarArg) {
- report_fatal_error("LowerCall with varargs not implemented");
- }
-
MachineFunction &MF = DAG.getMachineFunction();
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false);
+ analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
+
+ // Check if it's really possible to do a tail call.
+ if (IsTailCall)
+ IsTailCall = IsEligibleForTailCallOptimization(ArgCCInfo, CLI, MF,
+ ArgLocs);
+
+ if (IsTailCall)
+ ++NumTailCalls;
+ else if (CLI.CS && CLI.CS.isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
@@ -672,12 +1232,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
/*IsVolatile=*/false,
/*AlwaysInline=*/false,
- /*isTailCall=*/false, MachinePointerInfo(),
+ IsTailCall, MachinePointerInfo(),
MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
- Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
+ if (!IsTailCall)
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -688,11 +1249,45 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue ArgValue = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Handle passing f64 on RV32D with a soft float ABI as a special case.
+ bool IsF64OnRV32DSoftABI =
+ VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
+ if (IsF64OnRV32DSoftABI && VA.isRegLoc()) {
+ SDValue SplitF64 = DAG.getNode(
+ RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
+ SDValue Lo = SplitF64.getValue(0);
+ SDValue Hi = SplitF64.getValue(1);
+
+ unsigned RegLo = VA.getLocReg();
+ RegsToPass.push_back(std::make_pair(RegLo, Lo));
+
+ if (RegLo == RISCV::X17) {
+ // Second half of f64 is passed on the stack.
+ // Work out the address of the stack slot.
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
+ // Emit the store.
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
+ } else {
+ // Second half of f64 is passed in another GPR.
+ unsigned RegHigh = RegLo + 1;
+ RegsToPass.push_back(std::make_pair(RegHigh, Hi));
+ }
+ continue;
+ }
+
+ // IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way
+ // as any other MemLoc.
+
// Promote the value if needed.
// For now, only handle fully promoted and indirect arguments.
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
+ case CCValAssign::BCvt:
+ ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), ArgValue);
+ break;
case CCValAssign::Indirect: {
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
@@ -730,6 +1325,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
+ assert(!IsTailCall && "Tail call not allowed if stack is used "
+ "for passing parameters");
// Work out the address of the stack slot.
if (!StackPtr.getNode())
@@ -756,10 +1353,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
Glue = Chain.getValue(1);
}
- if (isa<GlobalAddressSDNode>(Callee)) {
- Callee = lowerGlobalAddress(Callee, DAG);
- } else if (isa<ExternalSymbolSDNode>(Callee)) {
- Callee = lowerExternalSymbol(Callee, DAG);
+ // If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
+ // TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
+ // split it and then direct call can be matched by PseudoCALL.
+ if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT, 0, 0);
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, 0);
}
// The first call operand is the chain and the second is the target address.
@@ -772,11 +1372,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
for (auto &Reg : RegsToPass)
Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
- // Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
- assert(Mask && "Missing call preserved mask for calling convention");
- Ops.push_back(DAG.getRegisterMask(Mask));
+ if (!IsTailCall) {
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
// Glue the call to the argument copies, if any.
if (Glue.getNode())
@@ -784,6 +1386,12 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops);
+ }
+
Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
@@ -801,13 +1409,32 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) {
- // Copy the value out, gluing the copy to the end of the call sequence.
- SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
- VA.getLocVT(), Glue);
+ // Copy the value out
+ SDValue RetValue =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
+ // Glue the RetValue to the end of the call sequence
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
+ if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
+ assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment");
+ SDValue RetValue2 =
+ DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue);
+ Chain = RetValue2.getValue(1);
+ Glue = RetValue2.getValue(2);
+ RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue,
+ RetValue2);
+ }
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ RetValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), RetValue);
+ break;
+ }
- assert(VA.getLocInfo() == CCValAssign::Full && "Unknown loc info!");
InVals.push_back(RetValue);
}
@@ -823,22 +1450,34 @@ bool RISCVTargetLowering::CanLowerReturn(
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
if (CC_RISCV(MF.getDataLayout(), i, VT, VT, CCValAssign::Full, ArgFlags,
- CCInfo, /*IsFixed=*/true, /*IsRet=*/true))
+ CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
return false;
}
return true;
}
+static SDValue packIntoRegLoc(SelectionDAG &DAG, SDValue Val,
+ const CCValAssign &VA, const SDLoc &DL) {
+ EVT LocVT = VA.getLocVT();
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unexpected CCValAssign::LocInfo");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
+ break;
+ }
+ return Val;
+}
+
SDValue
RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
- if (IsVarArg) {
- report_fatal_error("VarArg not supported");
- }
-
// Stores the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
@@ -846,9 +1485,10 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true);
+ analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true,
+ nullptr);
- SDValue Flag;
+ SDValue Glue;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
@@ -856,21 +1496,60 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue Val = OutVals[i];
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- assert(VA.getLocInfo() == CCValAssign::Full &&
- "Unexpected CCValAssign::LocInfo");
- Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+ if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
+ // Handle returning f64 on RV32D with a soft float ABI.
+ assert(VA.isRegLoc() && "Expected return via registers");
+ SDValue SplitF64 = DAG.getNode(RISCVISD::SplitF64, DL,
+ DAG.getVTList(MVT::i32, MVT::i32), Val);
+ SDValue Lo = SplitF64.getValue(0);
+ SDValue Hi = SplitF64.getValue(1);
+ unsigned RegLo = VA.getLocReg();
+ unsigned RegHi = RegLo + 1;
+ Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
+ Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
+ } else {
+ // Handle a 'normal' return.
+ Val = packIntoRegLoc(DAG, Val, VA, DL);
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
- // Guarantee that all emitted copies are stuck together.
- Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ // Guarantee that all emitted copies are stuck together.
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
}
RetOps[0] = Chain; // Update chain.
- // Add the flag if we have it.
- if (Flag.getNode()) {
- RetOps.push_back(Flag);
+ // Add the glue node if we have it.
+ if (Glue.getNode()) {
+ RetOps.push_back(Glue);
+ }
+
+ // Interrupt service routines use different return instructions.
+ const Function &Func = DAG.getMachineFunction().getFunction();
+ if (Func.hasFnAttribute("interrupt")) {
+ if (!Func.getReturnType()->isVoidTy())
+ report_fatal_error(
+ "Functions with the interrupt attribute must have void return type!");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ StringRef Kind =
+ MF.getFunction().getFnAttribute("interrupt").getValueAsString();
+
+ unsigned RetOpc;
+ if (Kind == "user")
+ RetOpc = RISCVISD::URET_FLAG;
+ else if (Kind == "supervisor")
+ RetOpc = RISCVISD::SRET_FLAG;
+ else
+ RetOpc = RISCVISD::MRET_FLAG;
+
+ return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
}
return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps);
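
The interrupt handling here pairs with a function attribute at the source level; the spelling below is clang's and is given as an assumed example rather than something defined in this patch:

    // "machine" selects MRET_FLAG on return; "user" and "supervisor" select
    // URET_FLAG and SRET_FLAG respectively.
    __attribute__((interrupt("machine")))
    void trap_handler(void) {
      // Must take no arguments and return void, per the checks in
      // LowerFormalArguments and LowerReturn.
    }
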
@@ -882,10 +1561,58 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
break;
case RISCVISD::RET_FLAG:
return "RISCVISD::RET_FLAG";
+ case RISCVISD::URET_FLAG:
+ return "RISCVISD::URET_FLAG";
+ case RISCVISD::SRET_FLAG:
+ return "RISCVISD::SRET_FLAG";
+ case RISCVISD::MRET_FLAG:
+ return "RISCVISD::MRET_FLAG";
case RISCVISD::CALL:
return "RISCVISD::CALL";
case RISCVISD::SELECT_CC:
return "RISCVISD::SELECT_CC";
+ case RISCVISD::BuildPairF64:
+ return "RISCVISD::BuildPairF64";
+ case RISCVISD::SplitF64:
+ return "RISCVISD::SplitF64";
+ case RISCVISD::TAIL:
+ return "RISCVISD::TAIL";
}
return nullptr;
}
+
+std::pair<unsigned, const TargetRegisterClass *>
+RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ // First, see if this is a constraint that directly corresponds to a
+ // RISCV register class.
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ return std::make_pair(0U, &RISCV::GPRRegClass);
+ default:
+ break;
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
+ return Builder.CreateFence(Ord);
+ if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Release);
+ return nullptr;
+}
+
+Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (isa<LoadInst>(Inst) && isAcquireOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Acquire);
+ return nullptr;
+}
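
Taken together with shouldInsertFencesForAtomic (see the header change below), the two hooks above place IR fences around atomic loads and stores as follows; weaker orderings (monotonic, unordered) get no fences:

    store release  ->  fence release ; store
    store seq_cst  ->  fence release ; store
    load  acquire  ->  load ; fence acquire
    load  seq_cst  ->  fence seq_cst ; load ; fence acquire
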
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h b/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9c5c7ca008c0..280adb29fd02 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -25,8 +25,14 @@ namespace RISCVISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
RET_FLAG,
+ URET_FLAG,
+ SRET_FLAG,
+ MRET_FLAG,
CALL,
- SELECT_CC
+ SELECT_CC,
+ BuildPairF64,
+ SplitF64,
+ TAIL
};
}
@@ -37,23 +43,47 @@ public:
explicit RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI);
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS,
+ Instruction *I = nullptr) const override;
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+ bool isLegalAddImmediate(int64_t Imm) const override;
+ bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
+ bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return isa<LoadInst>(I) || isa<StoreInst>(I);
+ }
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins,
bool IsRet) const;
void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- bool IsRet) const;
+ bool IsRet, CallLoweringInfo *CLI) const;
// Lower incoming arguments, copy physregs into vregs
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
@@ -76,8 +106,16 @@ private:
}
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+
+ bool IsEligibleForTailCallOptimization(CCState &CCInfo,
+ CallLoweringInfo &CLI, MachineFunction &MF,
+ const SmallVector<CCValAssign, 16> &ArgLocs) const;
};
}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 7479ffbc9532..529e048045c6 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -102,8 +102,8 @@ class RVInst<dag outs, dag ins, string opcodestr, string argstr,
}
// Pseudo instructions
-class Pseudo<dag outs, dag ins, list<dag> pattern>
- : RVInst<outs, ins, "", "", pattern, InstFormatPseudo> {
+class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string argstr = "">
+ : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 186fe363edd9..327e4a7d615f 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
@@ -31,16 +32,78 @@ using namespace llvm;
RISCVInstrInfo::RISCVInstrInfo()
: RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP) {}
+unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
+ default:
+ return 0;
+ case RISCV::LB:
+ case RISCV::LBU:
+ case RISCV::LH:
+ case RISCV::LHU:
+ case RISCV::LW:
+ case RISCV::FLW:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLD:
+ break;
+ }
+
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+
+ return 0;
+}
+
+unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
+ default:
+ return 0;
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::FSW:
+ case RISCV::SD:
+ case RISCV::FSD:
+ break;
+ }
+
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(2).getReg();
+ }
+
+ return 0;
+}
+
void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DstReg,
unsigned SrcReg, bool KillSrc) const {
- assert(RISCV::GPRRegClass.contains(DstReg, SrcReg) &&
- "Impossible reg-to-reg copy");
+ if (RISCV::GPRRegClass.contains(DstReg, SrcReg)) {
+ BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+ return;
+ }
- BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+ // FPR->FPR copies
+ unsigned Opc;
+ if (RISCV::FPR32RegClass.contains(DstReg, SrcReg))
+ Opc = RISCV::FSGNJ_S;
+ else if (RISCV::FPR64RegClass.contains(DstReg, SrcReg))
+ Opc = RISCV::FSGNJ_D;
+ else
+ llvm_unreachable("Impossible reg-to-reg copy");
+
+ BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
.addReg(SrcReg, getKillRegState(KillSrc))
- .addImm(0);
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -52,13 +115,22 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (I != MBB.end())
DL = I->getDebugLoc();
+ unsigned Opcode;
+
if (RISCV::GPRRegClass.hasSubClassEq(RC))
- BuildMI(MBB, I, DL, get(RISCV::SW))
- .addReg(SrcReg, getKillRegState(IsKill))
- .addFrameIndex(FI)
- .addImm(0);
+ Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
+ RISCV::SW : RISCV::SD;
+ else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FSW;
+ else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FSD;
else
llvm_unreachable("Can't store this register to stack slot");
+
+ BuildMI(MBB, I, DL, get(Opcode))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addImm(0);
}
void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -70,8 +142,310 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (I != MBB.end())
DL = I->getDebugLoc();
+ unsigned Opcode;
+
if (RISCV::GPRRegClass.hasSubClassEq(RC))
- BuildMI(MBB, I, DL, get(RISCV::LW), DstReg).addFrameIndex(FI).addImm(0);
+ Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
+ RISCV::LW : RISCV::LD;
+ else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FLW;
+ else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FLD;
else
llvm_unreachable("Can't load this register from stack slot");
+
+ BuildMI(MBB, I, DL, get(Opcode), DstReg).addFrameIndex(FI).addImm(0);
+}
+
+void RISCVInstrInfo::movImm32(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DstReg, uint64_t Val,
+ MachineInstr::MIFlag Flag) const {
+ assert(isInt<32>(Val) && "Can only materialize 32-bit constants");
+
+ // TODO: If the value can be materialized using only one instruction, only
+ // insert a single instruction.
+
+ uint64_t Hi20 = ((Val + 0x800) >> 12) & 0xfffff;
+ uint64_t Lo12 = SignExtend64<12>(Val);
+ BuildMI(MBB, MBBI, DL, get(RISCV::LUI), DstReg)
+ .addImm(Hi20)
+ .setMIFlag(Flag);
+ BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+ .addReg(DstReg, RegState::Kill)
+ .addImm(Lo12)
+ .setMIFlag(Flag);
+}
+
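
The Hi20/Lo12 split above works because the upper part is rounded so that the sign-extended 12-bit remainder corrects it. A minimal standalone check, with a hypothetical splitImm32 helper standing in for the LUI+ADDI expansion (not the in-tree function):

    #include <cassert>
    #include <cstdint>

    // Mirrors movImm32's immediate split: LUI loads Hi20 << 12, ADDI then adds
    // the sign-extended Lo12.
    static void splitImm32(int32_t Val, uint32_t &Hi20, int32_t &Lo12) {
      Hi20 = ((static_cast<uint32_t>(Val) + 0x800) >> 12) & 0xfffff;
      // Sign-extend the low 12 bits of Val (SignExtend64<12> in LLVM).
      Lo12 = static_cast<int32_t>(static_cast<uint32_t>(Val) << 20) >> 20;
      // lui rd, Hi20 ; addi rd, rd, Lo12  ==>  rd == Val (mod 2^32).
      assert(static_cast<int32_t>((Hi20 << 12) + static_cast<uint32_t>(Lo12)) == Val);
    }

    int main() {
      uint32_t Hi20;
      int32_t Lo12;
      splitImm32(0x12345678, Hi20, Lo12); // Hi20 = 0x12345, Lo12 = 0x678
      splitImm32(0x12345FFF, Hi20, Lo12); // Hi20 = 0x12346, Lo12 = -1
      splitImm32(-1,         Hi20, Lo12); // Hi20 = 0x0,     Lo12 = -1
      return 0;
    }
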
+// The contents of values added to Cond are not examined outside of
+// RISCVInstrInfo, giving us flexibility in what to push to it. For RISCV, we
+// push BranchOpcode, Reg1, Reg2.
+static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ // Block ends with fall-through condbranch.
+ assert(LastInst.getDesc().isConditionalBranch() &&
+ "Unknown conditional branch");
+ Target = LastInst.getOperand(2).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(LastInst.getOpcode()));
+ Cond.push_back(LastInst.getOperand(0));
+ Cond.push_back(LastInst.getOperand(1));
+}
+
+static unsigned getOppositeBranchOpcode(int Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unrecognized conditional branch");
+ case RISCV::BEQ:
+ return RISCV::BNE;
+ case RISCV::BNE:
+ return RISCV::BEQ;
+ case RISCV::BLT:
+ return RISCV::BGE;
+ case RISCV::BGE:
+ return RISCV::BLT;
+ case RISCV::BLTU:
+ return RISCV::BGEU;
+ case RISCV::BGEU:
+ return RISCV::BLTU;
+ }
+}
+
+bool RISCVInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ TBB = FBB = nullptr;
+ Cond.clear();
+
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end() || !isUnpredicatedTerminator(*I))
+ return false;
+
+ // Count the number of terminators and find the first unconditional or
+ // indirect branch.
+ MachineBasicBlock::iterator FirstUncondOrIndirectBr = MBB.end();
+ int NumTerminators = 0;
+ for (auto J = I.getReverse(); J != MBB.rend() && isUnpredicatedTerminator(*J);
+ J++) {
+ NumTerminators++;
+ if (J->getDesc().isUnconditionalBranch() ||
+ J->getDesc().isIndirectBranch()) {
+ FirstUncondOrIndirectBr = J.getReverse();
+ }
+ }
+
+ // If AllowModify is true, we can erase any terminators after
+ // FirstUncondOrIndirectBR.
+ if (AllowModify && FirstUncondOrIndirectBr != MBB.end()) {
+ while (std::next(FirstUncondOrIndirectBr) != MBB.end()) {
+ std::next(FirstUncondOrIndirectBr)->eraseFromParent();
+ NumTerminators--;
+ }
+ I = FirstUncondOrIndirectBr;
+ }
+
+ // We can't handle blocks that end in an indirect branch.
+ if (I->getDesc().isIndirectBranch())
+ return true;
+
+ // We can't handle blocks with more than 2 terminators.
+ if (NumTerminators > 2)
+ return true;
+
+ // Handle a single unconditional branch.
+ if (NumTerminators == 1 && I->getDesc().isUnconditionalBranch()) {
+ TBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ // Handle a single conditional branch.
+ if (NumTerminators == 1 && I->getDesc().isConditionalBranch()) {
+ parseCondBranch(*I, TBB, Cond);
+ return false;
+ }
+
+ // Handle a conditional branch followed by an unconditional branch.
+ if (NumTerminators == 2 && std::prev(I)->getDesc().isConditionalBranch() &&
+ I->getDesc().isUnconditionalBranch()) {
+ parseCondBranch(*std::prev(I), TBB, Cond);
+ FBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ // Otherwise, we can't handle this.
+ return true;
+}
+
+unsigned RISCVInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ if (BytesRemoved)
+ *BytesRemoved = 0;
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return 0;
+
+ if (!I->getDesc().isUnconditionalBranch() &&
+ !I->getDesc().isConditionalBranch())
+ return 0;
+
+  // Remove the branch.
+  if (BytesRemoved)
+    *BytesRemoved += getInstSizeInBytes(*I);
+  I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin())
+ return 1;
+ --I;
+ if (!I->getDesc().isConditionalBranch())
+ return 1;
+
+  // Remove the branch.
+  if (BytesRemoved)
+    *BytesRemoved += getInstSizeInBytes(*I);
+  I->eraseFromParent();
+ return 2;
+}
+
+// Inserts a branch into the end of the specified MachineBasicBlock, returning
+// the number of instructions inserted.
+unsigned RISCVInstrInfo::insertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
+ if (BytesAdded)
+ *BytesAdded = 0;
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 3 || Cond.size() == 0) &&
+ "RISCV branch conditions have two components!");
+
+ // Unconditional branch.
+ if (Cond.empty()) {
+ MachineInstr &MI = *BuildMI(&MBB, DL, get(RISCV::PseudoBR)).addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
+ return 1;
+ }
+
+ // Either a one or two-way conditional branch.
+ unsigned Opc = Cond[0].getImm();
+ MachineInstr &CondMI =
+ *BuildMI(&MBB, DL, get(Opc)).add(Cond[1]).add(Cond[2]).addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(CondMI);
+
+ // One-way conditional branch.
+ if (!FBB)
+ return 1;
+
+ // Two-way conditional branch.
+ MachineInstr &MI = *BuildMI(&MBB, DL, get(RISCV::PseudoBR)).addMBB(FBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
+ return 2;
+}
+
+unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &DestBB,
+ const DebugLoc &DL,
+ int64_t BrOffset,
+ RegScavenger *RS) const {
+ assert(RS && "RegScavenger required for long branching");
+ assert(MBB.empty() &&
+ "new block should be inserted for expanding unconditional branch");
+ assert(MBB.pred_size() == 1);
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget());
+ const auto &STI = MF->getSubtarget<RISCVSubtarget>();
+
+ if (TM.isPositionIndependent() || STI.is64Bit())
+ report_fatal_error("Unable to insert indirect branch");
+
+ if (!isInt<32>(BrOffset))
+ report_fatal_error(
+ "Branch offsets outside of the signed 32-bit range not supported");
+
+ // FIXME: A virtual register must be used initially, as the register
+ // scavenger won't work with empty blocks (SIInstrInfo::insertIndirectBranch
+ // uses the same workaround).
+ unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ auto II = MBB.end();
+
+ MachineInstr &LuiMI = *BuildMI(MBB, II, DL, get(RISCV::LUI), ScratchReg)
+ .addMBB(&DestBB, RISCVII::MO_HI);
+ BuildMI(MBB, II, DL, get(RISCV::PseudoBRIND))
+ .addReg(ScratchReg, RegState::Kill)
+ .addMBB(&DestBB, RISCVII::MO_LO);
+
+ RS->enterBasicBlockEnd(MBB);
+ unsigned Scav = RS->scavengeRegisterBackwards(
+ RISCV::GPRRegClass, MachineBasicBlock::iterator(LuiMI), false, 0);
+ MRI.replaceRegWith(ScratchReg, Scav);
+ MRI.clearVirtRegs();
+ RS->setRegUsed(Scav);
+ return 8;
+}
+
+bool RISCVInstrInfo::reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert((Cond.size() == 3) && "Invalid branch condition!");
+ Cond[0].setImm(getOppositeBranchOpcode(Cond[0].getImm()));
+ return false;
+}
+
+MachineBasicBlock *
+RISCVInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
+ assert(MI.getDesc().isBranch() && "Unexpected opcode!");
+ // The branch target is always the last operand.
+ int NumOp = MI.getNumExplicitOperands();
+ return MI.getOperand(NumOp - 1).getMBB();
+}
+
+bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+ int64_t BrOffset) const {
+ // Ideally we could determine the supported branch offset from the
+ // RISCVII::FormMask, but this can't be used for Pseudo instructions like
+ // PseudoBR.
+ switch (BranchOp) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case RISCV::BEQ:
+ case RISCV::BNE:
+ case RISCV::BLT:
+ case RISCV::BGE:
+ case RISCV::BLTU:
+ case RISCV::BGEU:
+ return isIntN(13, BrOffset);
+ case RISCV::JAL:
+ case RISCV::PseudoBR:
+ return isIntN(21, BrOffset);
+ }
+}
+
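
isBranchOffsetInRange above distinguishes the 13-bit reach of conditional branches from the 21-bit reach of JAL/PseudoBR; branch relaxation uses this together with insertIndirectBranch when a target is out of reach. A quick standalone check of the two ranges, with fitsSignedBits as an illustrative stand-in for llvm::isIntN:

    #include <cassert>
    #include <cstdint>

    // Does X fit in an N-bit signed immediate?
    static bool fitsSignedBits(unsigned N, int64_t X) {
      return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
    }

    int main() {
      // BEQ/BNE/BLT/BGE/BLTU/BGEU: 13-bit signed offset, roughly +/-4 KiB.
      assert(fitsSignedBits(13, 4094) && fitsSignedBits(13, -4096));
      assert(!fitsSignedBits(13, 4096));
      // JAL and PseudoBR: 21-bit signed offset, roughly +/-1 MiB.
      assert(fitsSignedBits(21, (1 << 20) - 2) && !fitsSignedBits(21, 1 << 20));
      return 0;
    }
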
+unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+
+ switch (Opcode) {
+ default: { return get(Opcode).getSize(); }
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::DBG_VALUE:
+ return 0;
+ case RISCV::PseudoCALL:
+ case RISCV::PseudoTAIL:
+ return 8;
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const auto &TM = static_cast<const RISCVTargetMachine &>(MF.getTarget());
+ return getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+ *TM.getMCAsmInfo());
+ }
+ }
}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 05c8378445cf..1d3279c3d31e 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -27,6 +27,11 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
public:
RISCVInstrInfo();
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DstReg, unsigned SrcReg,
bool KillSrc) const override;
@@ -41,6 +46,39 @@ public:
MachineBasicBlock::iterator MBBI, unsigned DstReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+
+ // Materializes the given int32 Val into DstReg.
+ void movImm32(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DstReg, uint64_t Val,
+ MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const;
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &dl,
+ int *BytesAdded = nullptr) const override;
+
+ unsigned insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ const DebugLoc &DL, int64_t BrOffset,
+ RegScavenger *RS = nullptr) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
};
}
#endif
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 1aae2f39dbdd..b51e4e70330d 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -36,13 +36,28 @@ def CallSeqEnd : SDNode<"ISD::CALLSEQ_END", SDT_RISCVCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def RetFlag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def URetFlag : SDNode<"RISCVISD::URET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def SRetFlag : SDNode<"RISCVISD::SRET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def MRetFlag : SDNode<"RISCVISD::MRET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
def SelectCC : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC,
[SDNPInGlue]>;
+def Tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
+class ImmXLenAsmOperand<string prefix, string suffix = ""> : AsmOperandClass {
+ let Name = prefix # "ImmXLen" # suffix;
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = !strconcat("Invalid", Name);
+}
+
class ImmAsmOperand<string prefix, int width, string suffix> : AsmOperandClass {
let Name = prefix # "Imm" # width # suffix;
let RenderMethod = "addImmOperands";
@@ -83,6 +98,14 @@ def uimmlog2xlen : Operand<XLenVT>, ImmLeaf<XLenVT, [{
let ParserMatchClass = UImmLog2XLenAsmOperand;
// TODO: should ensure invalid shamt is rejected when decoding.
let DecoderMethod = "decodeUImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ if (STI.getTargetTriple().isArch64Bit())
+ return isUInt<6>(Imm);
+ return isUInt<5>(Imm);
+ }];
}
def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> {
@@ -94,6 +117,12 @@ def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<12>;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeSImmOperand<12>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isInt<12>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
def uimm12 : Operand<XLenVT> {
@@ -106,12 +135,24 @@ def simm13_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<13>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<12, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
def uimm20 : Operand<XLenVT> {
let ParserMatchClass = UImmAsmOperand<20>;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<20>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isUInt<20>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
// A 21-bit signed immediate where the least significant bit is zero.
@@ -119,13 +160,36 @@ def simm21_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<21, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<21>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<20, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
+}
+
+def BareSymbol : AsmOperandClass {
+ let Name = "BareSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidBareSymbol";
+}
+
+// A bare symbol.
+def bare_symbol : Operand<XLenVT> {
+ let ParserMatchClass = BareSymbol;
+ let MCOperandPredicate = [{
+ return MCOp.isBareSymbolRef();
+ }];
}
// A parameterized register class alternative to i32imm/i64imm from Target.td.
-def ixlenimm : Operand<XLenVT>;
+def ixlenimm : Operand<XLenVT> {
+ let ParserMatchClass = ImmXLenAsmOperand<"">;
+}
// Standalone (codegen-only) immleaf patterns.
-def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
+def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
+def simm32hi20 : ImmLeaf<XLenVT, [{return isShiftedInt<20, 12>(Imm);}]>;
// Addressing modes.
// Necessary because a frameindex can't be matched directly in a pattern.
@@ -220,7 +284,7 @@ class Priv<string opcodestr, bits<7> funct7>
// Instructions
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let hasSideEffects = 0, isReMaterializable = 1, mayLoad = 0, mayStore = 0 in {
def LUI : RVInstU<OPC_LUI, (outs GPR:$rd), (ins uimm20:$imm20),
"lui", "$rd, $imm20">;
@@ -254,7 +318,11 @@ def SB : Store_rri<0b000, "sb">;
def SH : Store_rri<0b001, "sh">;
def SW : Store_rri<0b010, "sw">;
+// ADDI isn't always rematerializable, but isReMaterializable will be used as
+// a hint which is verified in isReallyTriviallyReMaterializable.
+let isReMaterializable = 1 in
def ADDI : ALU_ri<0b000, "addi">;
+
def SLTI : ALU_ri<0b010, "slti">;
def SLTIU : ALU_ri<0b011, "sltiu">;
def XORI : ALU_ri<0b100, "xori">;
@@ -288,6 +356,12 @@ def FENCE : RVInstI<0b000, OPC_MISC_MEM, (outs),
let imm12 = {0b0000,pred,succ};
}
+def FENCE_TSO : RVInstI<0b000, OPC_MISC_MEM, (outs), (ins), "fence.tso", ""> {
+ let rs1 = 0;
+ let rd = 0;
+ let imm12 = {0b1000,0b0011,0b0011};
+}
+
def FENCE_I : RVInstI<0b001, OPC_MISC_MEM, (outs), (ins), "fence.i", ""> {
let rs1 = 0;
let rd = 0;
@@ -386,7 +460,16 @@ def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs),
// TODO RV64I: sd
def : InstAlias<"nop", (ADDI X0, X0, 0)>;
-// TODO li
+
+// Note that the size is 32 because up to 8 32-bit instructions are needed to
+// generate an arbitrary 64-bit immediate. However, the size does not really
+// matter since PseudoLI is currently only used in the AsmParser where it gets
+// expanded to real instructions immediately.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 32,
+ isCodeGenOnly = 0, isAsmParserOnly = 1 in
+def PseudoLI : Pseudo<(outs GPR:$rd), (ins ixlenimm:$imm), [],
+ "li", "$rd, $imm">;
+
def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>;
def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>;
def : InstAlias<"neg $rd, $rs", (SUB GPR:$rd, X0, GPR:$rs)>;
@@ -401,6 +484,11 @@ def : InstAlias<"snez $rd, $rs", (SLTU GPR:$rd, X0, GPR:$rs)>;
def : InstAlias<"sltz $rd, $rs", (SLT GPR:$rd, GPR:$rs, X0)>;
def : InstAlias<"sgtz $rd, $rs", (SLT GPR:$rd, X0, GPR:$rs)>;
+// sgt/sgtu are recognised by the GNU assembler but the canonical slt/sltu
+// form will always be printed. Therefore, set a zero weight.
+def : InstAlias<"sgt $rd, $rs, $rt", (SLT GPR:$rd, GPR:$rt, GPR:$rs), 0>;
+def : InstAlias<"sgtu $rd, $rs, $rt", (SLTU GPR:$rd, GPR:$rt, GPR:$rs), 0>;
+
def : InstAlias<"beqz $rs, $offset",
(BEQ GPR:$rs, X0, simm13_lsb0:$offset)>;
def : InstAlias<"bnez $rs, $offset",
@@ -489,7 +577,7 @@ def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{
/// Immediates
def : Pat<(simm12:$imm), (ADDI X0, simm12:$imm)>;
-// TODO: Add a pattern for immediates with all zeroes in the lower 12 bits.
+def : Pat<(simm32hi20:$imm), (LUI (HI20 imm:$imm))>;
def : Pat<(simm32:$imm), (ADDI (LUI (HI20 imm:$imm)), (LO12Sext imm:$imm))>;
/// Simple arithmetic operations
@@ -536,11 +624,14 @@ def : Pat<(setge GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs1, GPR:$rs2), 1)>;
def : Pat<(setle GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs2, GPR:$rs1), 1)>;
let usesCustomInserter = 1 in
-def Select_GPR_Using_CC_GPR
- : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, ixlenimm:$imm, GPR:$src, GPR:$src2),
- [(set XLenVT:$dst, (SelectCC GPR:$lhs, GPR:$rhs,
- (XLenVT imm:$imm), GPR:$src, GPR:$src2))]>;
+class SelectCC_rrirr<RegisterClass valty, RegisterClass cmpty>
+ : Pseudo<(outs valty:$dst),
+ (ins cmpty:$lhs, cmpty:$rhs, ixlenimm:$imm,
+ valty:$truev, valty:$falsev),
+ [(set valty:$dst, (SelectCC cmpty:$lhs, cmpty:$rhs,
+ (XLenVT imm:$imm), valty:$truev, valty:$falsev))]>;
+
+def Select_GPR_Using_CC_GPR : SelectCC_rrirr<GPR, GPR>;
/// Branches and jumps
@@ -585,14 +676,50 @@ def : Pat<(brind GPR:$rs1), (PseudoBRIND GPR:$rs1, 0)>;
def : Pat<(brind (add GPR:$rs1, simm12:$imm12)),
(PseudoBRIND GPR:$rs1, simm12:$imm12)>;
+// PseudoCALL is a pseudo instruction which will eventually expand to auipc
+// and jalr while encoding. This is desirable, as an auipc+jalr pair with
+// R_RISCV_CALL and R_RISCV_RELAX relocations can be relaxed by the linker
+// if the offset fits in a signed 21-bit immediate.
+// Define AsmString to print "call" when compiling with the -S flag.
+// Define isCodeGenOnly = 0 to support parsing the assembly "call" instruction.
+let isCall = 1, Defs = [X1], isCodeGenOnly = 0 in
+def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func),
+ [(Call tglobaladdr:$func)]> {
+ let AsmString = "call\t$func";
+}
+
+def : Pat<(Call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
+
+def : Pat<(URetFlag), (URET X0, X0)>;
+def : Pat<(SRetFlag), (SRET X0, X0)>;
+def : Pat<(MRetFlag), (MRET X0, X0)>;
+
let isCall = 1, Defs = [X1] in
-def PseudoCALL : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>,
- PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
+def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>,
+ PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
let isBarrier = 1, isReturn = 1, isTerminator = 1 in
def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>,
PseudoInstExpansion<(JALR X0, X1, 0)>;
+// PseudoTAIL is a pseudo instruction similar to PseudoCALL and will eventually
+// expand to auipc and jalr while encoding.
+// Define AsmString to print "tail" when compiling with the -S flag.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2],
+ isCodeGenOnly = 0 in
+def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst), []> {
+ let AsmString = "tail\t$dst";
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2] in
+def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1), [(Tail GPRTC:$rs1)]>,
+ PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>;
+
+def : Pat<(Tail (iPTR tglobaladdr:$dst)),
+ (PseudoTAIL texternalsym:$dst)>;
+def : Pat<(Tail (iPTR texternalsym:$dst)),
+ (PseudoTAIL texternalsym:$dst)>;
+
/// Loads
multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
@@ -616,20 +743,40 @@ defm : LdPat<zextloadi16, LHU>;
/// Stores
-multiclass StPat<PatFrag StoreOp, RVInst Inst> {
- def : Pat<(StoreOp GPR:$rs2, GPR:$rs1), (Inst GPR:$rs2, GPR:$rs1, 0)>;
- def : Pat<(StoreOp GPR:$rs2, AddrFI:$rs1), (Inst GPR:$rs2, AddrFI:$rs1, 0)>;
- def : Pat<(StoreOp GPR:$rs2, (add GPR:$rs1, simm12:$imm12)),
- (Inst GPR:$rs2, GPR:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp GPR:$rs2, (add AddrFI:$rs1, simm12:$imm12)),
- (Inst GPR:$rs2, AddrFI:$rs1, simm12:$imm12)>;
- def : Pat<(StoreOp GPR:$rs2, (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
- (Inst GPR:$rs2, AddrFI:$rs1, simm12:$imm12)>;
+multiclass StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy> {
+ def : Pat<(StoreOp StTy:$rs2, GPR:$rs1), (Inst StTy:$rs2, GPR:$rs1, 0)>;
+ def : Pat<(StoreOp StTy:$rs2, AddrFI:$rs1), (Inst StTy:$rs2, AddrFI:$rs1, 0)>;
+ def : Pat<(StoreOp StTy:$rs2, (add GPR:$rs1, simm12:$imm12)),
+ (Inst StTy:$rs2, GPR:$rs1, simm12:$imm12)>;
+ def : Pat<(StoreOp StTy:$rs2, (add AddrFI:$rs1, simm12:$imm12)),
+ (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
+ def : Pat<(StoreOp StTy:$rs2, (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
+ (Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
}
-defm : StPat<truncstorei8, SB>;
-defm : StPat<truncstorei16, SH>;
-defm : StPat<store, SW>;
+defm : StPat<truncstorei8, SB, GPR>;
+defm : StPat<truncstorei16, SH, GPR>;
+defm : StPat<store, SW, GPR>;
+
+/// Fences
+
+// Refer to Table A.6 in the version 2.3 draft of the RISC-V Instruction Set
+// Manual: Volume I.
+
+// fence acquire -> fence r, rw
+def : Pat<(atomic_fence (i32 4), (imm)), (FENCE 0b10, 0b11)>;
+// fence release -> fence rw, w
+def : Pat<(atomic_fence (i32 5), (imm)), (FENCE 0b11, 0b1)>;
+// fence acq_rel -> fence.tso
+def : Pat<(atomic_fence (i32 6), (imm)), (FENCE_TSO)>;
+// fence seq_cst -> fence rw, rw
+def : Pat<(atomic_fence (i32 7), (imm)), (FENCE 0b11, 0b11)>;
+
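
To make the pred/succ operands in the patterns above easier to read, here is a minimal sketch of the 4-bit fence field encoding (one bit each for i, o, r, w), assuming the usual bit assignment from the ISA manual:

    #include <cassert>

    // FENCE pred/succ fields are 4 bits: { device input, device output,
    // memory reads, memory writes } = { I, O, R, W }.
    enum FenceBits { W = 1, R = 2, O = 4, I = 8 };

    int main() {
      assert((R | W) == 0b0011); // "rw" -> 0b11 in the patterns above
      assert(R == 0b0010);       // "r"  -> 0b10 (acquire predecessor set)
      assert(W == 0b0001);       // "w"  -> 0b1  (release successor set)
      return 0;
    }
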
+// Lowering for atomic load and store is defined in RISCVInstrInfoA.td.
+// Although these are lowered to fence+load/store instructions defined in the
+// base RV32I/RV64I ISA, this lowering is only used when the A extension is
+// present. This is necessary as it isn't valid to mix __atomic_* libcalls
+// with inline atomic operations for the same object.
/// Other pseudo-instructions
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 33e863ba6a10..379322060438 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -75,3 +75,23 @@ defm AMOMAX_D : AMO_rr_aq_rl<0b10100, 0b011, "amomax.d">;
defm AMOMINU_D : AMO_rr_aq_rl<0b11000, 0b011, "amominu.d">;
defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">;
} // Predicates = [HasStdExtA, IsRV64]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtA] in {
+
+/// Atomic loads and stores
+
+// Fences will be inserted for atomic load/stores according to the logic in
+// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}.
+
+defm : LdPat<atomic_load_8, LB>;
+defm : LdPat<atomic_load_16, LH>;
+defm : LdPat<atomic_load_32, LW>;
+
+defm : StPat<atomic_store_8, SB, GPR>;
+defm : StPat<atomic_store_16, SH, GPR>;
+defm : StPat<atomic_store_32, SW, GPR>;
+} // Predicates = [HasStdExtA]
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 661d2a78eeef..5d1c62c0b653 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -27,18 +27,67 @@ def uimmlog2xlennonzero : Operand<XLenVT>, ImmLeaf<XLenVT, [{
let ParserMatchClass = UImmLog2XLenNonZeroAsmOperand;
// TODO: should ensure invalid shamt is rejected when decoding.
let DecoderMethod = "decodeUImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ if (STI.getTargetTriple().isArch64Bit())
+ return isUInt<6>(Imm) && (Imm != 0);
+ return isUInt<5>(Imm) && (Imm != 0);
+ }];
}
def simm6 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<6>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<6>;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeSImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isInt<6>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
-def uimm6nonzero : Operand<XLenVT>,
- ImmLeaf<XLenVT, [{return isUInt<6>(Imm) && (Imm != 0);}]> {
- let ParserMatchClass = UImmAsmOperand<6, "NonZero">;
- let DecoderMethod = "decodeUImmOperand<6>";
+def simm6nonzero : Operand<XLenVT>,
+ ImmLeaf<XLenVT, [{return (Imm != 0) && isInt<6>(Imm);}]> {
+ let ParserMatchClass = SImmAsmOperand<6, "NonZero">;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeSImmOperand<6>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return (Imm != 0) && isInt<6>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
+}
+
+def CLUIImmAsmOperand : AsmOperandClass {
+ let Name = "CLUIImm";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = !strconcat("Invalid", Name);
+}
+
+// c_lui_imm checks that the immediate is in the range [1, 31] or
+// [0xfffe0, 0xfffff]. The RISC-V ISA describes the constraint as [1, 63],
+// with that value being loaded into bits 17-12 of the destination register
+// and sign-extended from bit 17. Therefore, this 6-bit immediate can
+// represent values in the ranges [1, 31] and [0xfffe0, 0xfffff].
+def c_lui_imm : Operand<XLenVT>,
+ ImmLeaf<XLenVT, [{return (Imm != 0) &&
+ (isUInt<5>(Imm) ||
+ (Imm >= 0xfffe0 && Imm <= 0xfffff));}]> {
+ let ParserMatchClass = CLUIImmAsmOperand;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeCLUIImmOperand";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return (Imm != 0) && (isUInt<5>(Imm) ||
+ (Imm >= 0xfffe0 && Imm <= 0xfffff));
+ return MCOp.isBareSymbolRef();
+ }];
}
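
The c_lui_imm constraint above is easy to misread, since the valid 20-bit LUI-style operands form two disjoint ranges. A small standalone predicate (hypothetical name, mirroring the ImmLeaf/MCOperandPredicate above):

    #include <cassert>
    #include <cstdint>

    // Nonzero, and either in [1, 31] or in [0xfffe0, 0xfffff] once the 6-bit
    // field is sign-extended from bit 17.
    static bool isValidCLuiImm(uint64_t Imm) {
      return Imm != 0 && (Imm < 32 || (Imm >= 0xfffe0 && Imm <= 0xfffff));
    }

    int main() {
      assert(isValidCLuiImm(1) && isValidCLuiImm(31) && isValidCLuiImm(0xfffe0));
      assert(!isValidCLuiImm(0) && !isValidCLuiImm(32) && !isValidCLuiImm(0xfffdf));
      return 0;
    }
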
// A 7-bit unsigned immediate where the least significant two bits are zero.
@@ -47,6 +96,12 @@ def uimm7_lsb00 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<7, "Lsb00">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<7>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<5, 2>(Imm);
+ }];
}
// A 8-bit unsigned immediate where the least significant two bits are zero.
@@ -55,6 +110,12 @@ def uimm8_lsb00 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<8, "Lsb00">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<8>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<6, 2>(Imm);
+ }];
}
// A 8-bit unsigned immediate where the least significant three bits are zero.
@@ -63,6 +124,12 @@ def uimm8_lsb000 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<8, "Lsb000">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<8>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<5, 3>(Imm);
+ }];
}
// A 9-bit signed immediate where the least significant bit is zero.
@@ -70,6 +137,13 @@ def simm9_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<9, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<9>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<8, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+  }];
}
// A 9-bit unsigned immediate where the least significant three bits are zero.
@@ -78,6 +152,12 @@ def uimm9_lsb000 : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<9, "Lsb000">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<9>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<6, 3>(Imm);
+ }];
}
// A 10-bit unsigned immediate where the least significant two bits are zero
@@ -88,21 +168,40 @@ def uimm10_lsb00nonzero : Operand<XLenVT>,
let ParserMatchClass = UImmAsmOperand<10, "Lsb00NonZero">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeUImmOperand<10>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedUInt<8, 2>(Imm) && (Imm != 0);
+ }];
}
// A 10-bit signed immediate where the least significant four bits are zero.
-def simm10_lsb0000 : Operand<XLenVT>,
- ImmLeaf<XLenVT, [{return isShiftedInt<6, 4>(Imm);}]> {
- let ParserMatchClass = SImmAsmOperand<10, "Lsb0000">;
+def simm10_lsb0000nonzero : Operand<XLenVT>,
+ ImmLeaf<XLenVT,
+ [{return (Imm != 0) && isShiftedInt<6, 4>(Imm);}]> {
+ let ParserMatchClass = SImmAsmOperand<10, "Lsb0000NonZero">;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeSImmOperand<10>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isShiftedInt<6, 4>(Imm);
+ }];
}
// A 12-bit signed immediate where the least significant bit is zero.
-def simm12_lsb0 : Operand<OtherVT> {
+def simm12_lsb0 : Operand<XLenVT> {
let ParserMatchClass = SImmAsmOperand<12, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<12>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isShiftedInt<11, 1>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
}
//===----------------------------------------------------------------------===//
@@ -188,8 +287,8 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd),
let Inst{5} = imm{3};
}
-def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
@@ -202,24 +301,24 @@ def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00> {
let Inst{5} = imm{6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
let Inst{5} = imm{6};
}
-def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
}
-def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
@@ -232,17 +331,17 @@ def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00> {
let Inst{5} = imm{6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
let Inst{5} = imm{6};
}
-def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
@@ -253,23 +352,23 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, simm6:$imm),
+ (ins GPRNoX0:$rd, simm6nonzero:$imm),
"c.addi", "$rd, $imm"> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
- DecoderNamespace = "RISCV32Only_", Defs = [X1] in
+ DecoderNamespace = "RISCV32Only_", Defs = [X1],
+ Predicates = [HasStdExtC, IsRV32] in
def C_JAL : RVInst16CJ<0b001, 0b01, (outs), (ins simm12_lsb0:$offset),
- "c.jal", "$offset">,
- Requires<[IsRV32]>;
+ "c.jal", "$offset">;
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
+ Predicates = [HasStdExtC, IsRV64] in
def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, simm6:$imm),
- "c.addiw", "$rd, $imm">,
- Requires<[IsRV64]> {
+ "c.addiw", "$rd, $imm"> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
@@ -282,7 +381,7 @@ def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
- (ins SP:$rd, simm10_lsb0000:$imm),
+ (ins SP:$rd, simm10_lsb0000nonzero:$imm),
"c.addi16sp", "$rd, $imm"> {
let Constraints = "$rd = $rd_wb";
let Inst{12} = imm{9};
@@ -295,7 +394,7 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd),
- (ins uimm6nonzero:$imm),
+ (ins c_lui_imm:$imm),
"c.lui", "$rd, $imm"> {
let Inst{6-2} = imm{4-0};
}
@@ -317,8 +416,10 @@ def C_XOR : CS_ALU<0b01, "c.xor", GPRC, 0>;
def C_OR : CS_ALU<0b10, "c.or" , GPRC, 0>;
def C_AND : CS_ALU<0b11, "c.and", GPRC, 0>;
-def C_SUBW : CS_ALU<0b00, "c.subw", GPRC, 1>, Requires<[IsRV64]>;
-def C_ADDW : CS_ALU<0b01, "c.addw", GPRC, 1>, Requires<[IsRV64]>;
+let Predicates = [HasStdExtC, IsRV64] in {
+def C_SUBW : CS_ALU<0b00, "c.subw", GPRC, 1>;
+def C_ADDW : CS_ALU<0b01, "c.addw", GPRC, 1>;
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_J : RVInst16CJ<0b101, 0b01, (outs), (ins simm12_lsb0:$offset),
@@ -339,8 +440,8 @@ def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
let Inst{6-2} = imm{4-0};
}
-def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000> {
let Inst{6-5} = imm{4-3};
let Inst{4-2} = imm{8-6};
}
@@ -350,15 +451,15 @@ def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00> {
let Inst{3-2} = imm{7-6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00> {
let Inst{6-4} = imm{4-2};
let Inst{3-2} = imm{7-6};
}
-def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000> {
let Inst{6-5} = imm{4-3};
let Inst{4-2} = imm{8-6};
}
@@ -392,8 +493,8 @@ def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rs1_wb),
let Constraints = "$rs1 = $rs1_wb";
}
-def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>,
- Requires<[HasStdExtD]> {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000> {
let Inst{12-10} = imm{5-3};
let Inst{9-7} = imm{8-6};
}
@@ -403,17 +504,204 @@ def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00> {
let Inst{8-7} = imm{7-6};
}
-let DecoderNamespace = "RISCV32Only_" in
-def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>,
- Requires<[HasStdExtF, IsRV32]> {
+let DecoderNamespace = "RISCV32Only_",
+ Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00> {
let Inst{12-9} = imm{5-2};
let Inst{8-7} = imm{7-6};
}
-def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000>,
- Requires<[IsRV64]> {
+let Predicates = [HasStdExtC, IsRV64] in
+def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000> {
let Inst{12-10} = imm{5-3};
let Inst{9-7} = imm{8-6};
}
} // Predicates = [HasStdExtC]
+
+//===----------------------------------------------------------------------===//
+// Compress Instruction tablegen backend.
+//===----------------------------------------------------------------------===//
+
+class CompressPat<dag input, dag output> {
+ dag Input = input;
+ dag Output = output;
+ list<Predicate> Predicates = [];
+}
+
+// Patterns are defined in the same order the compressed instructions appear
+// on page 82 of the ISA manual.
+
+// Quadrant 0
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(ADDI GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm),
+ (C_ADDI4SPN GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FLD FPR64C:$rd, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_FLD FPR64C:$rd, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(LW GPRC:$rd, GPRC:$rs1, uimm7_lsb00:$imm),
+ (C_LW GPRC:$rd, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FLW FPR32C:$rd, GPRC:$rs1, uimm7_lsb00:$imm),
+ (C_FLW FPR32C:$rd, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(LD GPRC:$rd, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_LD GPRC:$rd, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FSD FPR64C:$rs2, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_FSD FPR64C:$rs2, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(SW GPRC:$rs2, GPRC:$rs1, uimm7_lsb00:$imm),
+ (C_SW GPRC:$rs2, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FSW FPR32C:$rs2, GPRC:$rs1, uimm7_lsb00:$imm),
+ (C_FSW FPR32C:$rs2, GPRC:$rs1, uimm7_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(SD GPRC:$rs2, GPRC:$rs1, uimm8_lsb000:$imm),
+ (C_SD GPRC:$rs2, GPRC:$rs1, uimm8_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+// Quadrant 1
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(ADDI X0, X0, 0), (C_NOP)>;
+def : CompressPat<(ADDI GPRNoX0:$rs1, GPRNoX0:$rs1, simm6nonzero:$imm),
+ (C_ADDI GPRNoX0:$rs1, simm6nonzero:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, IsRV32] in {
+def : CompressPat<(JAL X1, simm12_lsb0:$offset),
+ (C_JAL simm12_lsb0:$offset)>;
+} // Predicates = [HasStdExtC, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(ADDIW GPRNoX0:$rs1, GPRNoX0:$rs1, simm6:$imm),
+ (C_ADDIW GPRNoX0:$rs1, simm6:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(ADDI GPRNoX0:$rd, X0, simm6:$imm),
+ (C_LI GPRNoX0:$rd, simm6:$imm)>;
+def : CompressPat<(ADDI X2, X2, simm10_lsb0000nonzero:$imm),
+ (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>;
+def : CompressPat<(LUI GPRNoX0X2:$rd, c_lui_imm:$imm),
+ (C_LUI GPRNoX0X2:$rd, c_lui_imm:$imm)>;
+def : CompressPat<(SRLI GPRC:$rs1, GPRC:$rs1, uimmlog2xlennonzero:$imm),
+ (C_SRLI GPRC:$rs1, uimmlog2xlennonzero:$imm)>;
+def : CompressPat<(SRAI GPRC:$rs1, GPRC:$rs1, uimmlog2xlennonzero:$imm),
+ (C_SRAI GPRC:$rs1, uimmlog2xlennonzero:$imm)>;
+def : CompressPat<(ANDI GPRC:$rs1, GPRC:$rs1, simm6:$imm),
+ (C_ANDI GPRC:$rs1, simm6:$imm)>;
+def : CompressPat<(SUB GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_SUB GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(XOR GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_XOR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(XOR GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_XOR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(OR GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_OR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(OR GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_OR GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(AND GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_AND GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(AND GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_AND GPRC:$rs1, GPRC:$rs2)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(SUBW GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_SUBW GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(ADDW GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
+ (C_ADDW GPRC:$rs1, GPRC:$rs2)>;
+def : CompressPat<(ADDW GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
+ (C_ADDW GPRC:$rs1, GPRC:$rs2)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(JAL X0, simm12_lsb0:$offset),
+ (C_J simm12_lsb0:$offset)>;
+def : CompressPat<(BEQ GPRC:$rs1, X0, simm9_lsb0:$imm),
+ (C_BEQZ GPRC:$rs1, simm9_lsb0:$imm)>;
+def : CompressPat<(BNE GPRC:$rs1, X0, simm9_lsb0:$imm),
+ (C_BNEZ GPRC:$rs1, simm9_lsb0:$imm)>;
+} // Predicates = [HasStdExtC]
+
+// Quadrant 2
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(SLLI GPRNoX0:$rs1, GPRNoX0:$rs1, uimmlog2xlennonzero:$imm),
+ (C_SLLI GPRNoX0:$rs1, uimmlog2xlennonzero:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FLD FPR64:$rd, SP:$rs1, uimm9_lsb000:$imm),
+ (C_FLDSP FPR64:$rd, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(LW GPRNoX0:$rd, SP:$rs1, uimm8_lsb00:$imm),
+ (C_LWSP GPRNoX0:$rd, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FLW FPR32:$rd, SP:$rs1, uimm8_lsb00:$imm),
+ (C_FLWSP FPR32:$rd, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(LD GPRNoX0:$rd, SP:$rs1, uimm9_lsb000:$imm),
+ (C_LDSP GPRNoX0:$rd, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(JALR X0, GPRNoX0:$rs1, 0),
+ (C_JR GPRNoX0:$rs1)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, X0, GPRNoX0:$rs2),
+ (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, X0),
+ (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, 0),
+ (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(EBREAK), (C_EBREAK)>;
+def : CompressPat<(JALR X1, GPRNoX0:$rs1, 0),
+ (C_JALR GPRNoX0:$rs1)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs1, GPRNoX0:$rs2),
+ (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs1),
+ (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtD] in {
+def : CompressPat<(FSD FPR64:$rs2, SP:$rs1, uimm9_lsb000:$imm),
+ (C_FSDSP FPR64:$rs2, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtD]
+
+let Predicates = [HasStdExtC] in {
+def : CompressPat<(SW GPR:$rs2, SP:$rs1, uimm8_lsb00:$imm),
+ (C_SWSP GPR:$rs2, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC]
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in {
+def : CompressPat<(FSW FPR32:$rs2, SP:$rs1, uimm8_lsb00:$imm),
+ (C_FSWSP FPR32:$rs2, SP:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtC, IsRV64] in {
+def : CompressPat<(SD GPR:$rs2, SP:$rs1, uimm9_lsb000:$imm),
+ (C_SDSP GPR:$rs2, SP:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtC, IsRV64]
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 48d91c0054d3..06b834d55ade 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -13,6 +13,20 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_RISCVBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_RISCVSplitF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, f64>]>;
+
+def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>;
+def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>;
+
+//===----------------------------------------------------------------------===//
// Instruction Class Templates
//===----------------------------------------------------------------------===//
@@ -171,4 +185,105 @@ let Predicates = [HasStdExtD] in {
def : InstAlias<"fmv.d $rd, $rs", (FSGNJ_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
def : InstAlias<"fneg.d $rd, $rs", (FSGNJN_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
+
+// fgt.d/fge.d are recognised by the GNU assembler but the canonical
+// flt.d/fle.d forms will always be printed. Therefore, set a zero weight.
+def : InstAlias<"fgt.d $rd, $rs, $rt",
+ (FLT_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>;
+def : InstAlias<"fge.d $rd, $rs, $rt",
+ (FLE_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>;
+} // Predicates = [HasStdExtD]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+class PatFpr64Fpr64<SDPatternOperator OpNode, RVInstR Inst>
+ : Pat<(OpNode FPR64:$rs1, FPR64:$rs2), (Inst $rs1, $rs2)>;
+
+class PatFpr64Fpr64DynFrm<SDPatternOperator OpNode, RVInstRFrm Inst>
+ : Pat<(OpNode FPR64:$rs1, FPR64:$rs2), (Inst $rs1, $rs2, 0b111)>;
+
+let Predicates = [HasStdExtD] in {
+
+/// Float conversion operations
+
+// f64 -> f32, f32 -> f64
+def : Pat<(fpround FPR64:$rs1), (FCVT_S_D FPR64:$rs1, 0b111)>;
+def : Pat<(fpextend FPR32:$rs1), (FCVT_D_S FPR32:$rs1)>;
+
+// FP->[u]int. Round-to-zero must be used
+def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_W_D FPR64:$rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_WU_D FPR64:$rs1, 0b001)>;
+
+// [u]int->fp
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_W GPR:$rs1)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_WU GPR:$rs1)>;
+
+/// Float arithmetic operations
+
+def : PatFpr64Fpr64DynFrm<fadd, FADD_D>;
+def : PatFpr64Fpr64DynFrm<fsub, FSUB_D>;
+def : PatFpr64Fpr64DynFrm<fmul, FMUL_D>;
+def : PatFpr64Fpr64DynFrm<fdiv, FDIV_D>;
+
+def : Pat<(fsqrt FPR64:$rs1), (FSQRT_D FPR64:$rs1, 0b111)>;
+
+def : Pat<(fneg FPR64:$rs1), (FSGNJN_D $rs1, $rs1)>;
+def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>;
+
+def : PatFpr64Fpr64<fcopysign, FSGNJ_D>;
+def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>;
+
+// The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
+// canonical NaN when given a signaling NaN. This doesn't match the LLVM
+// behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
+// draft 2.3 ISA spec changes the definition of fmin and fmax in a way that
+// matches LLVM's fminnum and fmaxnum
+// <https://github.com/riscv/riscv-isa-manual/commit/cd20cee7efd9bac7c5aa127ec3b451749d2b3cce>.
+def : PatFpr64Fpr64<fminnum, FMIN_D>;
+def : PatFpr64Fpr64<fmaxnum, FMAX_D>;
+
+/// Setcc
+
+def : PatFpr64Fpr64<seteq, FEQ_D>;
+def : PatFpr64Fpr64<setoeq, FEQ_D>;
+def : PatFpr64Fpr64<setlt, FLT_D>;
+def : PatFpr64Fpr64<setolt, FLT_D>;
+def : PatFpr64Fpr64<setle, FLE_D>;
+def : PatFpr64Fpr64<setole, FLE_D>;
+
+// Define pattern expansions for setcc operations which aren't directly
+// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
+// Legalizer.
+
+def : Pat<(setuo FPR64:$rs1, FPR64:$rs2),
+ (SLTIU (AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
+ (FEQ_D FPR64:$rs2, FPR64:$rs2)),
+ 1)>;
+
+def Select_FPR64_Using_CC_GPR : SelectCC_rrirr<FPR64, GPR>;
+
+/// Loads
+
+defm : LdPat<load, FLD>;
+
+/// Stores
+
+defm : StPat<store, FSD, FPR64>;
+
+/// Pseudo-instructions needed for the soft-float ABI with RV32D
+
+// Moves two GPRs to an FPR.
+let usesCustomInserter = 1 in
+def BuildPairF64Pseudo
+ : Pseudo<(outs FPR64:$dst), (ins GPR:$src1, GPR:$src2),
+ [(set FPR64:$dst, (RISCVBuildPairF64 GPR:$src1, GPR:$src2))]>;
+
+// Moves an FPR to two GPRs.
+let usesCustomInserter = 1 in
+def SplitF64Pseudo
+ : Pseudo<(outs GPR:$dst1, GPR:$dst2), (ins FPR64:$src),
+ [(set GPR:$dst1, GPR:$dst2, (RISCVSplitF64 FPR64:$src))]>;
+
} // Predicates = [HasStdExtD]
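
The BuildPairF64 and SplitF64 pseudos above let the soft-float RV32D ABI shuttle an f64 between one 64-bit FPR and a pair of 32-bit GPRs. Below is a standalone C++ sketch (not part of the patch) of the value semantics these nodes model; the (lo, hi) operand order is an illustrative assumption.

#include <cstdint>
#include <cstring>

// Models the bit-level effect of BuildPairF64 / SplitF64: two i32 halves
// packed into one f64 and split back. Operand order (lo, hi) is assumed.
static double buildPairF64(uint32_t Lo, uint32_t Hi) {
  uint64_t Bits = (uint64_t(Hi) << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof(D)); // bit copy, avoids type-punning UB
  return D;
}

static void splitF64(double D, uint32_t &Lo, uint32_t &Hi) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  Lo = uint32_t(Bits);
  Hi = uint32_t(Bits >> 32);
}

int main() {
  uint32_t Lo, Hi;
  splitF64(buildPairF64(0x00000000u, 0x3ff00000u), Lo, Hi); // 1.0 on IEEE-754 hosts
  return (Lo == 0 && Hi == 0x3ff00000u) ? 0 : 1;
}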
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 07722d2cbf34..6d7c59becf24 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -200,6 +200,13 @@ def : InstAlias<"fmv.s $rd, $rs", (FSGNJ_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
def : InstAlias<"fneg.s $rd, $rs", (FSGNJN_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
+// fgt.s/fge.s are recognised by the GNU assembler but the canonical
+// flt.s/fle.s forms will always be printed. Therefore, set a zero weight.
+def : InstAlias<"fgt.s $rd, $rs, $rt",
+ (FLT_S GPR:$rd, FPR32:$rt, FPR32:$rs), 0>;
+def : InstAlias<"fge.s $rd, $rs, $rt",
+ (FLE_S GPR:$rd, FPR32:$rt, FPR32:$rs), 0>;
+
// The following csr instructions actually alias instructions from the base ISA.
// However, it only makes sense to support them when the F extension is enabled.
// CSR Addresses: 0x003 == fcsr, 0x002 == frm, 0x001 == fflags
@@ -219,4 +226,90 @@ def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, 0x001, GPR:$rs)>;
def : InstAlias<"fsflags $rs", (CSRRW X0, 0x001, GPR:$rs), 2>;
def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, 0x001, uimm5:$imm)>;
def : InstAlias<"fsflagsi $imm", (CSRRWI X0, 0x001, uimm5:$imm), 2>;
+
+// fmv.w.x and fmv.x.w were previously known as fmv.s.x and fmv.x.s. Both
+// spellings should be supported by standard tools.
+def : MnemonicAlias<"fmv.s.x", "fmv.w.x">;
+def : MnemonicAlias<"fmv.x.s", "fmv.x.w">;
+} // Predicates = [HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+/// Generic pattern classes
+class PatFpr32Fpr32<SDPatternOperator OpNode, RVInstR Inst>
+ : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2)>;
+
+class PatFpr32Fpr32DynFrm<SDPatternOperator OpNode, RVInstRFrm Inst>
+ : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2, 0b111)>;
+
+let Predicates = [HasStdExtF] in {
+
+/// Float conversion operations
+
+// Moves (no conversion)
+def : Pat<(bitconvert GPR:$rs1), (FMV_W_X GPR:$rs1)>;
+def : Pat<(bitconvert FPR32:$rs1), (FMV_X_W FPR32:$rs1)>;
+
+// FP->[u]int. Round-to-zero must be used
+def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
+
+// [u]int->fp. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
+
+/// Float arithmetic operations
+
+def : PatFpr32Fpr32DynFrm<fadd, FADD_S>;
+def : PatFpr32Fpr32DynFrm<fsub, FSUB_S>;
+def : PatFpr32Fpr32DynFrm<fmul, FMUL_S>;
+def : PatFpr32Fpr32DynFrm<fdiv, FDIV_S>;
+
+def : Pat<(fsqrt FPR32:$rs1), (FSQRT_S FPR32:$rs1, 0b111)>;
+
+def : Pat<(fneg FPR32:$rs1), (FSGNJN_S $rs1, $rs1)>;
+def : Pat<(fabs FPR32:$rs1), (FSGNJX_S $rs1, $rs1)>;
+
+def : PatFpr32Fpr32<fcopysign, FSGNJ_S>;
+def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>;
+
+// The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
+// canonical NaN when given a signaling NaN. This doesn't match the LLVM
+// behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
+// draft 2.3 ISA spec changes the definition of fmin and fmax in a way that
+// matches LLVM's fminnum and fmaxnum
+// <https://github.com/riscv/riscv-isa-manual/commit/cd20cee7efd9bac7c5aa127ec3b451749d2b3cce>.
+def : PatFpr32Fpr32<fminnum, FMIN_S>;
+def : PatFpr32Fpr32<fmaxnum, FMAX_S>;
+
+/// Setcc
+
+def : PatFpr32Fpr32<seteq, FEQ_S>;
+def : PatFpr32Fpr32<setoeq, FEQ_S>;
+def : PatFpr32Fpr32<setlt, FLT_S>;
+def : PatFpr32Fpr32<setolt, FLT_S>;
+def : PatFpr32Fpr32<setle, FLE_S>;
+def : PatFpr32Fpr32<setole, FLE_S>;
+
+// Define pattern expansions for setcc operations which aren't directly
+// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
+// Legalizer.
+
+def : Pat<(setuo FPR32:$rs1, FPR32:$rs2),
+ (SLTIU (AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
+ (FEQ_S FPR32:$rs2, FPR32:$rs2)),
+ 1)>;
+
+def Select_FPR32_Using_CC_GPR : SelectCC_rrirr<FPR32, GPR>;
+
+/// Loads
+
+defm : LdPat<load, FLW>;
+
+/// Stores
+
+defm : StPat<store, FSW, FPR32>;
+
} // Predicates = [HasStdExtF]
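
The setuo expansion above relies on feq returning 0 when either operand is NaN: ANDing the two self-comparisons yields "both ordered", and the trailing SLTIU ..., 1 inverts that into "unordered". A standalone C++ sketch of that logic (an assumed illustration, not part of the patch):

#include <cmath>

// feq(x, x) is 1 unless x is NaN, so the AND computes "both ordered" and the
// unsigned "less than 1" comparison inverts it into "unordered".
static int feq(float a, float b) { return a == b; }
static int setuo(float x, float y) {
  int BothOrdered = feq(x, x) & feq(y, y);
  return BothOrdered < 1; // mirrors SLTIU ..., 1
}

int main() { return setuo(std::nanf(""), 1.0f) == 1 ? 0 : 1; }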
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index fec9c1f93997..2dd10ada4003 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -34,3 +34,18 @@ def DIVUW : ALUW_rr<0b0000001, 0b101, "divuw">;
def REMW : ALUW_rr<0b0000001, 0b110, "remw">;
def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">;
} // Predicates = [HasStdExtM, IsRV64]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtM] in {
+def : PatGprGpr<mul, MUL>;
+def : PatGprGpr<mulhs, MULH>;
+def : PatGprGpr<mulhu, MULHU>;
+// No ISDOpcode for mulhsu
+def : PatGprGpr<sdiv, DIV>;
+def : PatGprGpr<udiv, DIVU>;
+def : PatGprGpr<srem, REM>;
+def : PatGprGpr<urem, REMU>;
+} // Predicates = [HasStdExtM]
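
The new patterns map LLVM's mulhs/mulhu nodes, i.e. the upper XLEN bits of a widening multiply, onto MULH/MULHU. A standalone C++ sketch of those semantics for the RV32 case (illustrative only):

#include <cstdint>

// Upper 32 bits of a signed / unsigned 32x32 -> 64 multiply, the operation
// the mulhs / mulhu patterns select MULH / MULHU for on RV32.
static int32_t mulhs(int32_t a, int32_t b) {
  return int32_t((int64_t(a) * int64_t(b)) >> 32);
}
static uint32_t mulhu(uint32_t a, uint32_t b) {
  return uint32_t((uint64_t(a) * uint64_t(b)) >> 32);
}

int main() { return (mulhu(0x80000000u, 2u) == 1u && mulhs(-1, -1) == 0) ? 0 : 1; }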
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/contrib/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
index d8ae11f2bd90..e0100b1679be 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -48,11 +48,12 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
const MCExpr *ME =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
- if (!MO.isJTI() && MO.getOffset())
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
ME = MCBinaryExpr::createAdd(
ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
- ME = RISCVMCExpr::create(ME, Kind, Ctx);
+ if (Kind != RISCVMCExpr::VK_RISCV_None)
+ ME = RISCVMCExpr::create(ME, Kind, Ctx);
return MCOperand::createExpr(ME);
}
@@ -75,8 +76,7 @@ bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOp = MCOperand::createImm(MO.getImm());
break;
case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::createExpr(
- MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), AP.OutContext));
+ MCOp = lowerSymbolOperand(MO, MO.getMBB()->getSymbol(), AP);
break;
case MachineOperand::MO_GlobalAddress:
MCOp = lowerSymbolOperand(MO, AP.getSymbol(MO.getGlobal()), AP);
@@ -89,6 +89,9 @@ bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOp = lowerSymbolOperand(
MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP);
break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = lowerSymbolOperand(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+ break;
}
return true;
}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/contrib/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
new file mode 100644
index 000000000000..2fea3a1bdd2f
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -0,0 +1,55 @@
+//=- RISCVMachineFunctionInfo.h - RISCV machine function info -----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares RISCV-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// RISCVMachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private RISCV-specific information for each MachineFunction.
+class RISCVMachineFunctionInfo : public MachineFunctionInfo {
+private:
+ MachineFunction &MF;
+ /// FrameIndex for start of varargs area
+ int VarArgsFrameIndex = 0;
+ /// Size of the save area used for varargs
+ int VarArgsSaveSize = 0;
+ /// FrameIndex used for transferring values between 64-bit FPRs and a pair
+ /// of 32-bit GPRs via the stack.
+ int MoveF64FrameIndex = -1;
+
+public:
+ // RISCVMachineFunctionInfo() = default;
+
+ RISCVMachineFunctionInfo(MachineFunction &MF) : MF(MF) {}
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+ unsigned getVarArgsSaveSize() const { return VarArgsSaveSize; }
+ void setVarArgsSaveSize(int Size) { VarArgsSaveSize = Size; }
+
+ int getMoveF64FrameIndex() {
+ if (MoveF64FrameIndex == -1)
+ MoveF64FrameIndex = MF.getFrameInfo().CreateStackObject(8, 8, false);
+ return MoveF64FrameIndex;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
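
A minimal usage sketch for the class above; the helper name and calling context are hypothetical, but the accessor is the standard MachineFunction::getInfo<> path, and repeated calls return the same lazily created 8-byte slot:

#include "RISCVMachineFunctionInfo.h"
using namespace llvm;

// Hypothetical helper (not part of the patch): fetch, creating on first use,
// the stack slot used to move values between an FPR64 and a GPR pair.
static int getF64MoveSlot(MachineFunction &MF) {
  RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
  return RVFI->getMoveF64FrameIndex(); // same frame index on every call
}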
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/contrib/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
new file mode 100644
index 000000000000..b8fa8a97d41a
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -0,0 +1,286 @@
+//===----- RISCVMergeBaseOffset.cpp - Optimise address calculations ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Merge the offset of an address calculation into the offset field
+// of instructions in a global address lowering sequence. This pass transforms:
+// lui vreg1, %hi(s)
+// addi vreg2, vreg1, %lo(s)
+// addi vreg3, vreg2, Offset
+//
+// Into:
+// lui vreg1, %hi(s+Offset)
+// addi vreg2, vreg1, %lo(s+Offset)
+//
+// The transformation is carried out under certain conditions:
+// 1) The offset field in the base of the global address lowering sequence is zero.
+// 2) The lowered global address has only one use.
+//
+// The offset field can appear in several different forms; this pass handles all of them.
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-merge-base-offset"
+#define RISCV_MERGE_BASE_OFFSET_NAME "RISCV Merge Base Offset"
+namespace {
+
+struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass {
+ static char ID;
+ const MachineFunction *MF;
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+ bool detectLuiAddiGlobal(MachineInstr &LUI, MachineInstr *&ADDI);
+
+ bool detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI);
+ void foldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI, MachineInstr &Tail,
+ int64_t Offset);
+ bool matchLargeOffset(MachineInstr &TailAdd, unsigned GSReg, int64_t &Offset);
+ RISCVMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+ StringRef getPassName() const override {
+ return RISCV_MERGE_BASE_OFFSET_NAME;
+ }
+
+private:
+ MachineRegisterInfo *MRI;
+ std::set<MachineInstr *> DeadInstrs;
+};
+}; // end anonymous namespace
+
+char RISCVMergeBaseOffsetOpt::ID = 0;
+INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, "riscv-merge-base-offset",
+ RISCV_MERGE_BASE_OFFSET_NAME, false, false)
+
+// Detect the pattern:
+// lui vreg1, %hi(s)
+// addi vreg2, vreg1, %lo(s)
+//
+// Pattern only accepted if:
+// 1) ADDI has only one use.
+// 2) LUI has only one use; which is the ADDI.
+// 3) Both ADDI and LUI have GlobalAddress type which indicates that these
+// are generated from global address lowering.
+// 4) Offset value in the Global Address is 0.
+bool RISCVMergeBaseOffsetOpt::detectLuiAddiGlobal(MachineInstr &HiLUI,
+ MachineInstr *&LoADDI) {
+ if (HiLUI.getOpcode() != RISCV::LUI ||
+ HiLUI.getOperand(1).getTargetFlags() != RISCVII::MO_HI ||
+ HiLUI.getOperand(1).getType() != MachineOperand::MO_GlobalAddress ||
+ HiLUI.getOperand(1).getOffset() != 0 ||
+ !MRI->hasOneUse(HiLUI.getOperand(0).getReg()))
+ return false;
+ unsigned HiLuiDestReg = HiLUI.getOperand(0).getReg();
+ LoADDI = MRI->use_begin(HiLuiDestReg)->getParent();
+ if (LoADDI->getOpcode() != RISCV::ADDI ||
+ LoADDI->getOperand(2).getTargetFlags() != RISCVII::MO_LO ||
+ LoADDI->getOperand(2).getType() != MachineOperand::MO_GlobalAddress ||
+ LoADDI->getOperand(2).getOffset() != 0 ||
+ !MRI->hasOneUse(LoADDI->getOperand(0).getReg()))
+ return false;
+ return true;
+}
+
+// Update the offset in HiLUI and LoADDI instructions.
+// Delete the tail instruction and update all the uses to use the
+// output from LoADDI.
+void RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &HiLUI,
+ MachineInstr &LoADDI,
+ MachineInstr &Tail, int64_t Offset) {
+ // Put the offset back in HiLUI and the LoADDI
+ HiLUI.getOperand(1).setOffset(Offset);
+ LoADDI.getOperand(2).setOffset(Offset);
+ // Delete the tail instruction.
+ DeadInstrs.insert(&Tail);
+ MRI->replaceRegWith(Tail.getOperand(0).getReg(),
+ LoADDI.getOperand(0).getReg());
+ LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
+ << " " << HiLUI << " " << LoADDI;);
+}
+
+// Detect patterns for large offsets that are passed into an ADD instruction.
+//
+// Base address lowering is of the form:
+//   HiLUI:  lui  vreg1, %hi(s)
+//   LoADDI: addi vreg2, vreg1, %lo(s)
+//                          /                                  \
+//                         /                                    \
+//                        /                                      \
+//                       /  The large offset can be of two forms: \
+//  1) Offset that has non-zero bits in the    2) Offset that has non-zero bits
+//     lower 12 bits and the upper 20 bits        in the upper 20 bits only
+//   OffsetLUI:  lui  vreg3, 4
+//   OffsetTail: addi voff, vreg3, 188          OffsetTail: lui voff, 128
+//                       \                                      /
+//                        \                                    /
+//                         \                                  /
+//                          \                                /
+//                           TailAdd: add vreg4, vreg2, voff
+bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd,
+ unsigned GAReg,
+ int64_t &Offset) {
+ assert((TailAdd.getOpcode() == RISCV::ADD) && "Expected ADD instruction!");
+ unsigned Rs = TailAdd.getOperand(1).getReg();
+ unsigned Rt = TailAdd.getOperand(2).getReg();
+ unsigned Reg = Rs == GAReg ? Rt : Rs;
+
+ // Can't fold if the register has more than one use.
+ if (!MRI->hasOneUse(Reg))
+ return false;
+ // This can point to an ADDI or a LUI:
+ MachineInstr &OffsetTail = *MRI->getVRegDef(Reg);
+ if (OffsetTail.getOpcode() == RISCV::ADDI) {
+ // The offset value has non zero bits in both %hi and %lo parts.
+ // Detect an ADDI that feeds from a LUI instruction.
+ MachineOperand &AddiImmOp = OffsetTail.getOperand(2);
+ if (AddiImmOp.getTargetFlags() != RISCVII::MO_None)
+ return false;
+ int64_t OffLo = AddiImmOp.getImm();
+ MachineInstr &OffsetLui =
+ *MRI->getVRegDef(OffsetTail.getOperand(1).getReg());
+ MachineOperand &LuiImmOp = OffsetLui.getOperand(1);
+ if (OffsetLui.getOpcode() != RISCV::LUI ||
+ LuiImmOp.getTargetFlags() != RISCVII::MO_None ||
+ !MRI->hasOneUse(OffsetLui.getOperand(0).getReg()))
+ return false;
+ int64_t OffHi = OffsetLui.getOperand(1).getImm();
+ Offset = (OffHi << 12) + OffLo;
+ LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail
+ << " " << OffsetLui);
+ DeadInstrs.insert(&OffsetTail);
+ DeadInstrs.insert(&OffsetLui);
+ return true;
+ } else if (OffsetTail.getOpcode() == RISCV::LUI) {
+ // The offset value has all zero bits in the lower 12 bits. Only LUI
+ // exists.
+ LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail);
+ Offset = OffsetTail.getOperand(1).getImm() << 12;
+ DeadInstrs.insert(&OffsetTail);
+ return true;
+ }
+ return false;
+}
+
+bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
+ MachineInstr &LoADDI) {
+ unsigned DestReg = LoADDI.getOperand(0).getReg();
+ assert(MRI->hasOneUse(DestReg) && "expected one use for LoADDI");
+ // LoADDI has only one use.
+ MachineInstr &Tail = *MRI->use_begin(DestReg)->getParent();
+ switch (Tail.getOpcode()) {
+ default:
+ LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
+ << Tail);
+ return false;
+ case RISCV::ADDI: {
+ // Offset is simply an immediate operand.
+ int64_t Offset = Tail.getOperand(2).getImm();
+ LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail);
+ foldOffset(HiLUI, LoADDI, Tail, Offset);
+ return true;
+ } break;
+ case RISCV::ADD: {
+ // The offset is too large to fit in the immediate field of ADDI.
+ // This can be in two forms:
+ // 1) LUI hi_Offset followed by:
+ // ADDI lo_offset
+ // This happens in case the offset has non zero bits in
+ // both hi 20 and lo 12 bits.
+ // 2) LUI (offset20)
+ // This happens in case the lower 12 bits of the offset are zeros.
+ int64_t Offset;
+ if (!matchLargeOffset(Tail, DestReg, Offset))
+ return false;
+ foldOffset(HiLUI, LoADDI, Tail, Offset);
+ return true;
+ } break;
+ case RISCV::LB:
+ case RISCV::LH:
+ case RISCV::LW:
+ case RISCV::LBU:
+ case RISCV::LHU:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLW:
+ case RISCV::FLD:
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::SD:
+ case RISCV::FSW:
+ case RISCV::FSD: {
+ // Transforms the sequence:              Into:
+ //   HiLUI:  lui  vreg1, %hi(foo)          --->  lui  vreg1, %hi(foo+8)
+ //   LoADDI: addi vreg2, vreg1, %lo(foo)   --->  lw   vreg3, %lo(foo+8)(vreg1)
+ //   Tail:   lw   vreg3, 8(vreg2)
+ if (Tail.getOperand(1).isFI())
+ return false;
+ // Register defined by LoADDI should be used in the base part of the
+ // load/store instruction. Otherwise, no folding is possible.
+ unsigned BaseAddrReg = Tail.getOperand(1).getReg();
+ if (DestReg != BaseAddrReg)
+ return false;
+ MachineOperand &TailImmOp = Tail.getOperand(2);
+ int64_t Offset = TailImmOp.getImm();
+ // Update the offsets in global address lowering.
+ HiLUI.getOperand(1).setOffset(Offset);
+ // Update the immediate in the Tail instruction to add the offset.
+ Tail.RemoveOperand(2);
+ MachineOperand &ImmOp = LoADDI.getOperand(2);
+ ImmOp.setOffset(Offset);
+ Tail.addOperand(ImmOp);
+ // Update the base reg in the Tail instruction to feed from LUI.
+ // Output of HiLUI is only used in LoADDI, no need to use
+ // MRI->replaceRegWith().
+ Tail.getOperand(1).setReg(HiLUI.getOperand(0).getReg());
+ DeadInstrs.insert(&LoADDI);
+ return true;
+ } break;
+ }
+ return false;
+}
+
+bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(Fn.getFunction()))
+ return false;
+
+ DeadInstrs.clear();
+ MRI = &Fn.getRegInfo();
+ for (MachineBasicBlock &MBB : Fn) {
+ LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
+ for (MachineInstr &HiLUI : MBB) {
+ MachineInstr *LoADDI = nullptr;
+ if (!detectLuiAddiGlobal(HiLUI, LoADDI))
+ continue;
+ LLVM_DEBUG(dbgs() << " Found lowered global address with one use: "
+ << *LoADDI->getOperand(2).getGlobal() << "\n");
+ // If the use count is only one, merge the offset
+ detectAndFoldOffset(HiLUI, *LoADDI);
+ }
+ }
+ // Delete dead instructions.
+ for (auto *MI : DeadInstrs)
+ MI->eraseFromParent();
+ return true;
+}
+
+/// Returns an instance of the Merge Base Offset Optimization pass.
+FunctionPass *llvm::createRISCVMergeBaseOffsetOptPass() {
+ return new RISCVMergeBaseOffsetOpt();
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 5776a92cab91..3ed1dec434ce 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -33,6 +33,13 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
const MCPhysReg *
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ if (MF->getFunction().hasFnAttribute("interrupt")) {
+ if (MF->getSubtarget<RISCVSubtarget>().hasStdExtD())
+ return CSR_XLEN_F64_Interrupt_SaveList;
+ if (MF->getSubtarget<RISCVSubtarget>().hasStdExtF())
+ return CSR_XLEN_F32_Interrupt_SaveList;
+ return CSR_Interrupt_SaveList;
+ }
return CSR_SaveList;
}
@@ -50,6 +57,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool RISCVRegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+ return PhysReg == RISCV::X0;
+}
+
const uint32_t *RISCVRegisterInfo::getNoPreservedMask() const {
return CSR_NoRegs_RegMask;
}
@@ -61,6 +72,8 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const RISCVInstrInfo *TII = MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
@@ -69,25 +82,47 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg) +
MI.getOperand(FIOperandNum + 1).getImm();
- assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
- "eliminateFrameIndex currently requires hasFP");
+ if (!isInt<32>(Offset)) {
+ report_fatal_error(
+ "Frame offsets outside of the signed 32-bit range not supported");
+ }
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ bool FrameRegIsKill = false;
- // Offsets must be directly encoded in a 12-bit immediate field
if (!isInt<12>(Offset)) {
- report_fatal_error(
- "Frame offsets outside of the signed 12-bit range not supported");
+ assert(isInt<32>(Offset) && "Int32 expected");
+ // The offset won't fit in an immediate, so use a scratch register instead
+ // Modify Offset and FrameReg appropriately
+ unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ TII->movImm32(MBB, II, DL, ScratchReg, Offset);
+ BuildMI(MBB, II, DL, TII->get(RISCV::ADD), ScratchReg)
+ .addReg(FrameReg)
+ .addReg(ScratchReg, RegState::Kill);
+ Offset = 0;
+ FrameReg = ScratchReg;
+ FrameRegIsKill = true;
}
- MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(FrameReg, false, false, FrameRegIsKill);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return RISCV::X8;
+ const TargetFrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) ? RISCV::X8 : RISCV::X2;
}
const uint32_t *
-RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
+RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
CallingConv::ID /*CC*/) const {
+ if (MF.getFunction().hasFnAttribute("interrupt")) {
+ if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD())
+ return CSR_XLEN_F64_Interrupt_RegMask;
+ if (MF.getSubtarget<RISCVSubtarget>().hasStdExtF())
+ return CSR_XLEN_F32_Interrupt_RegMask;
+ return CSR_Interrupt_RegMask;
+ }
return CSR_RegMask;
}
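
The reworked eliminateFrameIndex() now distinguishes offsets that fit a 12-bit immediate from those that need movImm32 into a scratch register plus an ADD, and rejects anything outside the signed 32-bit range. A standalone C++ sketch of those range checks, with isIntN re-implemented here only for illustration (LLVM's own isInt<N>() is the real helper):

#include <cstdint>
#include <cstdio>

// Illustration-only stand-in for LLVM's isInt<N>(): does x fit in an N-bit
// signed immediate?
template <unsigned N> static bool isIntN(int64_t x) {
  return x >= -(int64_t(1) << (N - 1)) && x < (int64_t(1) << (N - 1));
}

int main() {
  int64_t Offset = 5000; // assumed frame offset, too large for ADDI's field
  if (!isIntN<32>(Offset))
    std::puts("unsupported: report_fatal_error");
  else if (!isIntN<12>(Offset))
    std::puts("needs a scratch register: movImm32 + ADD"); // the new path
  else
    std::puts("encoded directly in the 12-bit immediate");
  return 0;
}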
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 0b2bc3776fc6..cbbb70079dd1 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -32,6 +32,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isConstantPhysReg(unsigned PhysReg) const override;
+
const uint32_t *getNoPreservedMask() const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
@@ -39,6 +41,18 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
RegScavenger *RS = nullptr) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &) const override {
+ return true;
+ }
};
}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 21be2e332e59..4be8ff9200e9 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -38,8 +38,16 @@ def ABIRegAltName : RegAltNameIndex;
} // Namespace = "RISCV"
// Integer registers
+// CostPerUse is set higher for registers that may not be compressible as they
+// are not part of GPRC, the most restrictive register class used by the
+// compressed instruction set. This will influence the greedy register
+// allocator to reduce the use of registers that can't be encoded in 16-bit
+// instructions. This affects register allocation even when the compressed
+// instruction set isn't targeted; we see no major negative codegen impact.
+
let RegAltNameIndices = [ABIRegAltName] in {
def X0 : RISCVReg<0, "x0", ["zero"]>, DwarfRegNum<[0]>;
+ let CostPerUse = 1 in {
def X1 : RISCVReg<1, "x1", ["ra"]>, DwarfRegNum<[1]>;
def X2 : RISCVReg<2, "x2", ["sp"]>, DwarfRegNum<[2]>;
def X3 : RISCVReg<3, "x3", ["gp"]>, DwarfRegNum<[3]>;
@@ -47,6 +55,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X5 : RISCVReg<5, "x5", ["t0"]>, DwarfRegNum<[5]>;
def X6 : RISCVReg<6, "x6", ["t1"]>, DwarfRegNum<[6]>;
def X7 : RISCVReg<7, "x7", ["t2"]>, DwarfRegNum<[7]>;
+ }
def X8 : RISCVReg<8, "x8", ["s0"]>, DwarfRegNum<[8]>;
def X9 : RISCVReg<9, "x9", ["s1"]>, DwarfRegNum<[9]>;
def X10 : RISCVReg<10,"x10", ["a0"]>, DwarfRegNum<[10]>;
@@ -55,6 +64,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X13 : RISCVReg<13,"x13", ["a3"]>, DwarfRegNum<[13]>;
def X14 : RISCVReg<14,"x14", ["a4"]>, DwarfRegNum<[14]>;
def X15 : RISCVReg<15,"x15", ["a5"]>, DwarfRegNum<[15]>;
+ let CostPerUse = 1 in {
def X16 : RISCVReg<16,"x16", ["a6"]>, DwarfRegNum<[16]>;
def X17 : RISCVReg<17,"x17", ["a7"]>, DwarfRegNum<[17]>;
def X18 : RISCVReg<18,"x18", ["s2"]>, DwarfRegNum<[18]>;
@@ -71,6 +81,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X29 : RISCVReg<29,"x29", ["t4"]>, DwarfRegNum<[29]>;
def X30 : RISCVReg<30,"x30", ["t5"]>, DwarfRegNum<[30]>;
def X31 : RISCVReg<31,"x31", ["t6"]>, DwarfRegNum<[31]>;
+ }
}
def XLenVT : ValueTypeByHwMode<[RV32, RV64, DefaultMode],
@@ -128,6 +139,19 @@ def GPRC : RegisterClass<"RISCV", [XLenVT], 32, (add
[RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
}
+// For indirect tail calls, we can't use callee-saved registers, as they are
+// restored to the saved value before the tail call, which would clobber a call
+// address.
+def GPRTC : RegisterClass<"RISCV", [XLenVT], 32, (add
+ (sequence "X%u", 5, 7),
+ (sequence "X%u", 10, 17),
+ (sequence "X%u", 28, 31)
+ )> {
+ let RegInfos = RegInfoByHwMode<
+ [RV32, RV64, DefaultMode],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+}
+
def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> {
let RegInfos = RegInfoByHwMode<
[RV32, RV64, DefaultMode],
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVSubtarget.h b/contrib/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 928ba5815a22..0e09391e7829 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/contrib/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -36,6 +36,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtD = false;
bool HasStdExtC = false;
bool HasRV64 = false;
+ bool EnableLinkerRelax = false;
unsigned XLen = 32;
MVT XLenVT = MVT::i32;
RISCVFrameLowering FrameLowering;
@@ -77,6 +78,7 @@ public:
bool hasStdExtD() const { return HasStdExtD; }
bool hasStdExtC() const { return HasStdExtC; }
bool is64Bit() const { return HasRV64; }
+ bool enableLinkerRelax() const { return EnableLinkerRelax; }
MVT getXLenVT() const { return XLenVT; }
unsigned getXLen() const { return XLen; }
};
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index e12168b73999..a2ebf5bf3e6b 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -13,6 +13,7 @@
#include "RISCV.h"
#include "RISCVTargetMachine.h"
+#include "RISCVTargetObjectFile.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -59,7 +60,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, RM),
getEffectiveCodeModel(CM), OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(make_unique<RISCVELFTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
@@ -74,7 +75,10 @@ public:
return getTM<RISCVTargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
+ void addPreEmitPass() override;
+ void addPreRegAlloc() override;
};
}
@@ -82,8 +86,19 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
return new RISCVPassConfig(*this, PM);
}
+void RISCVPassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
+ TargetPassConfig::addIRPasses();
+}
+
bool RISCVPassConfig::addInstSelector() {
addPass(createRISCVISelDag(getRISCVTargetMachine()));
return false;
}
+
+void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); }
+
+void RISCVPassConfig::addPreRegAlloc() {
+ addPass(createRISCVMergeBaseOffsetOptPass());
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
new file mode 100644
index 000000000000..46e81b628b65
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -0,0 +1,19 @@
+//===-- RISCVTargetObjectFile.cpp - RISCV Object Info -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVTargetObjectFile.h"
+#include "RISCVTargetMachine.h"
+
+using namespace llvm;
+
+void RISCVELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h b/contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
new file mode 100644
index 000000000000..5467220301c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -0,0 +1,25 @@
+//===-- RISCVTargetObjectFile.h - RISCV Object Info -*- C++ ---------*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class RISCVTargetMachine;
+
+/// This implementation is used for RISCV ELF targets.
+class RISCVELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 05f78a48badf..c7a5a1e8e6ee 100644
--- a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -95,7 +95,6 @@ class SparcAsmParser : public MCTargetAsmParser {
unsigned &RegKind);
bool matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc);
- bool parseDirectiveWord(unsigned Size, SMLoc L);
bool is64Bit() const {
return getSTI().getTargetTriple().getArch() == Triple::sparcv9;
@@ -109,6 +108,14 @@ public:
const MCInstrInfo &MII,
const MCTargetOptions &Options)
: MCTargetAsmParser(Options, sti, MII), Parser(parser) {
+ Parser.addAliasForDirective(".half", ".2byte");
+ Parser.addAliasForDirective(".uahalf", ".2byte");
+ Parser.addAliasForDirective(".word", ".4byte");
+ Parser.addAliasForDirective(".uaword", ".4byte");
+ Parser.addAliasForDirective(".nword", is64Bit() ? ".8byte" : ".4byte");
+ if (is64Bit())
+ Parser.addAliasForDirective(".xword", ".8byte");
+
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
@@ -682,21 +689,6 @@ ParseDirective(AsmToken DirectiveID)
{
StringRef IDVal = DirectiveID.getString();
- if (IDVal == ".byte")
- return parseDirectiveWord(1, DirectiveID.getLoc());
-
- if (IDVal == ".half")
- return parseDirectiveWord(2, DirectiveID.getLoc());
-
- if (IDVal == ".word")
- return parseDirectiveWord(4, DirectiveID.getLoc());
-
- if (IDVal == ".nword")
- return parseDirectiveWord(is64Bit() ? 8 : 4, DirectiveID.getLoc());
-
- if (is64Bit() && IDVal == ".xword")
- return parseDirectiveWord(8, DirectiveID.getLoc());
-
if (IDVal == ".register") {
// For now, ignore .register directive.
Parser.eatToEndOfStatement();
@@ -713,28 +705,6 @@ ParseDirective(AsmToken DirectiveID)
return true;
}
-bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- while (true) {
- const MCExpr *Value;
- if (getParser().parseExpression(Value))
- return true;
-
- getParser().getStreamer().EmitValue(Value, Size);
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- // FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token in directive");
- Parser.Lex();
- }
- }
- Parser.Lex();
- return false;
-}
-
OperandMatchResultTy
SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
SMLoc S, E;
@@ -915,9 +885,17 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
getContext());
- if (isCall && getContext().getObjectFileInfo()->isPositionIndependent())
- Res = SparcMCExpr::create(SparcMCExpr::VK_Sparc_WPLT30, Res,
- getContext());
+ SparcMCExpr::VariantKind Kind = SparcMCExpr::VK_Sparc_13;
+
+ if (getContext().getObjectFileInfo()->isPositionIndependent()) {
+ if (isCall)
+ Kind = SparcMCExpr::VK_Sparc_WPLT30;
+ else
+ Kind = SparcMCExpr::VK_Sparc_GOT13;
+ }
+
+ Res = SparcMCExpr::create(Kind, Res, getContext());
+
Op = SparcOperand::CreateImm(Res, S, E);
}
break;
diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
index 9b1d0f5bf3c9..6290e5a15a8b 100644
--- a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -207,8 +207,8 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
if (!done)
--I;
- // skip debug value
- if (I->isDebugValue())
+ // skip debug instruction
+ if (I->isDebugInstr())
continue;
if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isPosition() ||
diff --git a/contrib/llvm/lib/Target/Sparc/LeonFeatures.td b/contrib/llvm/lib/Target/Sparc/LeonFeatures.td
index d9efe094d078..a7dea068cb11 100755
--- a/contrib/llvm/lib/Target/Sparc/LeonFeatures.td
+++ b/contrib/llvm/lib/Target/Sparc/LeonFeatures.td
@@ -37,14 +37,6 @@ def LeonCASA : SubtargetFeature<
"Enable CASA instruction for LEON3 and LEON4 processors"
>;
-
-def ReplaceSDIV : SubtargetFeature<
- "replacesdiv",
- "PerformSDIVReplace",
- "true",
- "AT697E erratum fix: Do not emit SDIV, emit SDIVCC instead"
->;
-
def InsertNOPLoad: SubtargetFeature<
"insertnopload",
"InsertNOPLoad",
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index f2438ee43075..5f5e2ef7d45a 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -54,6 +54,10 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case Sparc::fixup_sparc_hi22:
return (Value >> 10) & 0x3fffff;
+ case Sparc::fixup_sparc_got13:
+ case Sparc::fixup_sparc_13:
+ return Value & 0x1fff;
+
case Sparc::fixup_sparc_pc10:
case Sparc::fixup_sparc_got10:
case Sparc::fixup_sparc_tls_gd_lo10:
@@ -100,14 +104,13 @@ namespace {
class SparcAsmBackend : public MCAsmBackend {
protected:
const Target &TheTarget;
- bool IsLittleEndian;
bool Is64Bit;
public:
SparcAsmBackend(const Target &T)
- : MCAsmBackend(), TheTarget(T),
- IsLittleEndian(StringRef(TheTarget.getName()) == "sparcel"),
- Is64Bit(StringRef(TheTarget.getName()) == "sparcv9") {}
+ : MCAsmBackend(StringRef(T.getName()) == "sparcel" ? support::little
+ : support::big),
+ TheTarget(T), Is64Bit(StringRef(TheTarget.getName()) == "sparcv9") {}
unsigned getNumFixupKinds() const override {
return Sparc::NumTargetFixupKinds;
@@ -121,6 +124,7 @@ namespace {
{ "fixup_sparc_br19", 13, 19, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_2", 10, 2, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_14", 18, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_13", 19, 13, 0 },
{ "fixup_sparc_hi22", 10, 22, 0 },
{ "fixup_sparc_lo10", 22, 10, 0 },
{ "fixup_sparc_h44", 10, 22, 0 },
@@ -132,6 +136,7 @@ namespace {
{ "fixup_sparc_pc10", 22, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_got22", 10, 22, 0 },
{ "fixup_sparc_got10", 22, 10, 0 },
+ { "fixup_sparc_got13", 19, 13, 0 },
{ "fixup_sparc_wplt30", 2, 30, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_tls_gd_hi22", 10, 22, 0 },
{ "fixup_sparc_tls_gd_lo10", 22, 10, 0 },
@@ -160,6 +165,7 @@ namespace {
{ "fixup_sparc_br19", 0, 19, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_2", 20, 2, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_br16_14", 0, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_13", 0, 13, 0 },
{ "fixup_sparc_hi22", 0, 22, 0 },
{ "fixup_sparc_lo10", 0, 10, 0 },
{ "fixup_sparc_h44", 0, 22, 0 },
@@ -171,6 +177,7 @@ namespace {
{ "fixup_sparc_pc10", 0, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_got22", 0, 22, 0 },
{ "fixup_sparc_got10", 0, 10, 0 },
+ { "fixup_sparc_got13", 0, 13, 0 },
{ "fixup_sparc_wplt30", 0, 30, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_sparc_tls_gd_hi22", 0, 22, 0 },
{ "fixup_sparc_tls_gd_lo10", 0, 10, 0 },
@@ -197,7 +204,7 @@ namespace {
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- if (IsLittleEndian)
+ if (Endian == support::little)
return InfosLE[Kind - FirstTargetFixupKind];
return InfosBE[Kind - FirstTargetFixupKind];
@@ -234,7 +241,8 @@ namespace {
}
}
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
// FIXME.
return false;
}
@@ -255,14 +263,14 @@ namespace {
llvm_unreachable("relaxInstruction() unimplemented");
}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
// Cannot emit NOP with size not multiple of 32 bits.
if (Count % 4 != 0)
return false;
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->write32(0x01000000);
+ support::endian::write<uint32_t>(OS, 0x01000000, Endian);
return true;
}
@@ -276,7 +284,8 @@ namespace {
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -287,15 +296,15 @@ namespace {
// from the fixup value. The Value has been "split up" into the
// appropriate bitfields above.
for (unsigned i = 0; i != 4; ++i) {
- unsigned Idx = IsLittleEndian ? i : 3 - i;
+ unsigned Idx = Endian == support::little ? i : 3 - i;
Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
- return createSparcELFObjectWriter(OS, Is64Bit, IsLittleEndian, OSABI);
+ return createSparcELFObjectWriter(Is64Bit, OSABI);
}
};
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index a204036a0975..5a730947796e 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -79,6 +79,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
case FK_Data_8: return ((Fixup.getOffset() % 8)
? ELF::R_SPARC_UA64
: ELF::R_SPARC_64);
+ case Sparc::fixup_sparc_13: return ELF::R_SPARC_13;
case Sparc::fixup_sparc_hi22: return ELF::R_SPARC_HI22;
case Sparc::fixup_sparc_lo10: return ELF::R_SPARC_LO10;
case Sparc::fixup_sparc_h44: return ELF::R_SPARC_H44;
@@ -88,6 +89,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
case Sparc::fixup_sparc_hm: return ELF::R_SPARC_HM10;
case Sparc::fixup_sparc_got22: return ELF::R_SPARC_GOT22;
case Sparc::fixup_sparc_got10: return ELF::R_SPARC_GOT10;
+ case Sparc::fixup_sparc_got13: return ELF::R_SPARC_GOT13;
case Sparc::fixup_sparc_tls_gd_hi22: return ELF::R_SPARC_TLS_GD_HI22;
case Sparc::fixup_sparc_tls_gd_lo10: return ELF::R_SPARC_TLS_GD_LO10;
case Sparc::fixup_sparc_tls_gd_add: return ELF::R_SPARC_TLS_GD_ADD;
@@ -132,9 +134,7 @@ bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- bool IsLittleEndian, uint8_t OSABI) {
- auto MOTW = llvm::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createSparcELFObjectWriter(bool Is64Bit, uint8_t OSABI) {
+ return llvm::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI);
}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
index 8d79396d936e..99aa63fe2290 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -30,6 +30,9 @@ namespace llvm {
fixup_sparc_br16_2,
fixup_sparc_br16_14,
+ /// fixup_sparc_13 - 13-bit fixup
+ fixup_sparc_13,
+
/// fixup_sparc_hi22 - 22-bit fixup corresponding to %hi(foo)
/// for sethi
fixup_sparc_hi22,
@@ -64,6 +67,9 @@ namespace llvm {
/// fixup_sparc_got10 - 10-bit fixup corresponding to %got10(foo)
fixup_sparc_got10,
+ /// fixup_sparc_got13 - 13-bit fixup corresponding to %got13(foo)
+ fixup_sparc_got13,
+
/// fixup_sparc_wplt30
fixup_sparc_wplt30,
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 684f66970dbe..647be159a151 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -98,14 +98,9 @@ void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI);
-
- if (Ctx.getAsmInfo()->isLittleEndian()) {
- // Output the bits in little-endian byte order.
- support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
- } else {
- // Output the bits in big-endian byte order.
- support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
- }
+ support::endian::write(OS, Bits,
+ Ctx.getAsmInfo()->isLittleEndian() ? support::little
+ : support::big);
unsigned tlsOpNo = 0;
switch (MI.getOpcode()) {
default: break;
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index a77f760d9eff..f736a37a266c 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -58,6 +58,8 @@ bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind)
// FIXME: use %got22/%got10, if system assembler supports them.
case VK_Sparc_GOT22: OS << "%hi("; break;
case VK_Sparc_GOT10: OS << "%lo("; break;
+ case VK_Sparc_GOT13: closeParen = false; break;
+ case VK_Sparc_13: closeParen = false; break;
case VK_Sparc_WPLT30: closeParen = false; break;
case VK_Sparc_R_DISP32: OS << "%r_disp32("; break;
case VK_Sparc_TLS_GD_HI22: OS << "%tgd_hi22("; break;
@@ -96,6 +98,7 @@ SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
.Case("pc10", VK_Sparc_PC10)
.Case("got22", VK_Sparc_GOT22)
.Case("got10", VK_Sparc_GOT10)
+ .Case("got13", VK_Sparc_GOT13)
.Case("r_disp32", VK_Sparc_R_DISP32)
.Case("tgd_hi22", VK_Sparc_TLS_GD_HI22)
.Case("tgd_lo10", VK_Sparc_TLS_GD_LO10)
@@ -132,6 +135,8 @@ Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
case VK_Sparc_PC10: return Sparc::fixup_sparc_pc10;
case VK_Sparc_GOT22: return Sparc::fixup_sparc_got22;
case VK_Sparc_GOT10: return Sparc::fixup_sparc_got10;
+ case VK_Sparc_GOT13: return Sparc::fixup_sparc_got13;
+ case VK_Sparc_13: return Sparc::fixup_sparc_13;
case VK_Sparc_WPLT30: return Sparc::fixup_sparc_wplt30;
case VK_Sparc_TLS_GD_HI22: return Sparc::fixup_sparc_tls_gd_hi22;
case VK_Sparc_TLS_GD_LO10: return Sparc::fixup_sparc_tls_gd_lo10;
@@ -193,14 +198,26 @@ static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
switch(getKind()) {
default: return;
+ case VK_Sparc_TLS_GD_CALL:
+ case VK_Sparc_TLS_LDM_CALL: {
+ // The corresponding relocations reference __tls_get_addr, as they call it,
+ // but this is only implicit; we must explicitly add it to our symbol table
+ // to bind it for these uses.
+ MCSymbol *Symbol = Asm.getContext().getOrCreateSymbol("__tls_get_addr");
+ Asm.registerSymbol(*Symbol);
+ auto ELFSymbol = cast<MCSymbolELF>(Symbol);
+ if (!ELFSymbol->isBindingSet()) {
+ ELFSymbol->setBinding(ELF::STB_GLOBAL);
+ ELFSymbol->setExternal(true);
+ }
+ LLVM_FALLTHROUGH;
+ }
case VK_Sparc_TLS_GD_HI22:
case VK_Sparc_TLS_GD_LO10:
case VK_Sparc_TLS_GD_ADD:
- case VK_Sparc_TLS_GD_CALL:
case VK_Sparc_TLS_LDM_HI22:
case VK_Sparc_TLS_LDM_LO10:
case VK_Sparc_TLS_LDM_ADD:
- case VK_Sparc_TLS_LDM_CALL:
case VK_Sparc_TLS_LDO_HIX22:
case VK_Sparc_TLS_LDO_LOX10:
case VK_Sparc_TLS_LDO_ADD:
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 13f08195c764..cf2db067749c 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -36,6 +36,8 @@ public:
VK_Sparc_PC10,
VK_Sparc_GOT22,
VK_Sparc_GOT10,
+ VK_Sparc_GOT13,
+ VK_Sparc_13,
VK_Sparc_WPLT30,
VK_Sparc_R_DISP32,
VK_Sparc_TLS_GD_HI22,
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index 8390198479ba..3cd24104c443 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -23,7 +23,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -43,9 +43,8 @@ MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
MCAsmBackend *createSparcAsmBackend(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter>
-createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- bool IsLIttleEndian, uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createSparcELFObjectWriter(bool Is64Bit,
+ uint8_t OSABI);
} // End llvm namespace
// Defines symbolic names for Sparc registers. This defines a mapping from
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.td b/contrib/llvm/lib/Target/Sparc/Sparc.td
index 9e0a297c8812..2f9b57f76041 100644
--- a/contrib/llvm/lib/Target/Sparc/Sparc.td
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.td
@@ -130,7 +130,7 @@ def : Processor<"leon2", LEON2Itineraries,
// LEON 2 FT (AT697E)
// TO DO: Place-holder: Processor specific features will be added *very* soon here.
def : Processor<"at697e", LEON2Itineraries,
- [FeatureLeon, ReplaceSDIV, InsertNOPLoad]>;
+ [FeatureLeon, InsertNOPLoad]>;
// LEON 2 FT (AT697F)
// TO DO: Place-holder: Processor specific features will be added *very* soon here.
@@ -176,4 +176,5 @@ def Sparc : Target {
let InstructionSet = SparcInstrInfo;
let AssemblyParsers = [SparcAsmParser];
let AssemblyWriters = [SparcAsmWriter];
+ let AllowRegisterRenaming = 1;
}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index c36e75d1b076..f845c41ede45 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -311,6 +311,8 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
if (!Changed)
return false;
+ SelectInlineAsmMemoryOperands(AsmNodeOperands, SDLoc(N));
+
SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
@@ -360,12 +362,6 @@ void SparcDAGToDAGISel::Select(SDNode *N) {
// FIXME: Handle div by immediate.
unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
- // SDIV is a hardware erratum on some LEON2 processors. Replace it with SDIVcc here.
- if (((SparcTargetMachine&)TM).getSubtargetImpl()->performSDIVReplace()
- &&
- Opcode == SP::SDIVrr) {
- Opcode = SP::SDIVCCrr;
- }
CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS, TopPart);
return;
}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index d9548ff90d7f..b04c6b112682 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1450,7 +1450,7 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
const SparcSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
- MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
// Instructions which use registers as conditionals examine all the
// bits (as does the pseudo SELECT_CC expansion). I don't think it
@@ -1590,6 +1590,11 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+
if (Subtarget->is64Bit()) {
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
@@ -1700,6 +1705,9 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setLibcallName(RTLIB::UDIV_I32, ".udiv");
+
+ setLibcallName(RTLIB::SREM_I32, ".rem");
+ setLibcallName(RTLIB::UREM_I32, ".urem");
}
if (Subtarget->is64Bit()) {
@@ -1722,6 +1730,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::TRAP , MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP , MVT::Other, Legal);
// Use the default implementation.
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
@@ -1975,11 +1984,22 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
// Handle PIC mode first. SPARC needs a got load for every variable!
if (isPositionIndependent()) {
- // This is the pic32 code model, the GOT is known to be smaller than 4GB.
- SDValue HiLo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
- SparcMCExpr::VK_Sparc_GOT10, DAG);
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ PICLevel::Level picLevel = M->getPICLevel();
+ SDValue Idx;
+
+ if (picLevel == PICLevel::SmallPIC) {
+ // This is the pic13 code model, the GOT is known to be smaller than 8KiB.
+ Idx = DAG.getNode(SPISD::Lo, DL, Op.getValueType(),
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_GOT13, DAG));
+ } else {
+ // This is the pic32 code model, the GOT is known to be smaller than 4GB.
+ Idx = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
+ SparcMCExpr::VK_Sparc_GOT10, DAG);
+ }
+
SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
- SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
+ SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Idx);
// GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
// function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
@@ -2036,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc DL(GA);
@@ -3513,6 +3533,22 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
VT);
}
+ if (name.substr(0, 1).equals("f") &&
+ !name.substr(1).getAsInteger(10, intVal) && intVal <= 63) {
+ std::string newConstraint;
+
+ if (VT == MVT::f32 || VT == MVT::Other) {
+ newConstraint = "{f" + utostr(intVal) + "}";
+ } else if (VT == MVT::f64 && (intVal % 2 == 0)) {
+ newConstraint = "{d" + utostr(intVal / 2) + "}";
+ } else if (VT == MVT::f128 && (intVal % 4 == 0)) {
+ newConstraint = "{q" + utostr(intVal / 4) + "}";
+ } else {
+ return std::make_pair(0U, nullptr);
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
+ VT);
+ }
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
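
The new block above rewrites an explicit %fN constraint into whichever float, double, or quad register overlaps it (doubles occupy even f-pairs, quads multiples of four). A standalone C++ sketch of that mapping (function name and interface are illustrative assumptions):

#include <string>

// Maps an explicit SPARC FP register index and value width to the overlapping
// register constraint, mirroring the f/d/q cases handled above.
static std::string mapSparcFpConstraint(unsigned N, unsigned Bits) {
  if (Bits == 32 && N <= 63)                return "{f" + std::to_string(N) + "}";
  if (Bits == 64 && N % 2 == 0 && N <= 63)  return "{d" + std::to_string(N / 2) + "}";
  if (Bits == 128 && N % 4 == 0 && N <= 63) return "{q" + std::to_string(N / 4) + "}";
  return {}; // invalid combination: no register
}

int main() { return mapSparcFpConstraint(20, 64) == "{d10}" ? 0 : 1; }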
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
index df570cea8da8..352090ed92c1 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
@@ -474,6 +474,19 @@ def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
// flush -> flush %g0
def : InstAlias<"flush", (FLUSH), 0>;
+def : MnemonicAlias<"iflush", "flush">;
+
+def : MnemonicAlias<"stub", "stb">;
+def : MnemonicAlias<"stsb", "stb">;
+
+def : MnemonicAlias<"stuba", "stba">;
+def : MnemonicAlias<"stsba", "stba">;
+
+def : MnemonicAlias<"stuh", "sth">;
+def : MnemonicAlias<"stsh", "sth">;
+
+def : MnemonicAlias<"stuha", "stha">;
+def : MnemonicAlias<"stsha", "stha">;
def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>;
def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index ea8ed830bafc..6750763d8ee5 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -280,7 +280,7 @@ unsigned SparcInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != SP::BA
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 08bccbde0bd6..5b7fb3c485e8 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -421,7 +421,7 @@ let hasSideEffects = 1, mayStore = 1 in {
def FLUSHW : F3_1<0b10, 0b101011, (outs), (ins),
"flushw",
[(flushw)]>, Requires<[HasV9]>;
- let rd = 0, rs1 = 1, simm13 = 3 in
+ let rd = 8, rs1 = 0, simm13 = 3 in
def TA3 : F3_2<0b10, 0b111010, (outs), (ins),
"ta 3",
[(flushw)]>;
@@ -1009,6 +1009,9 @@ let DecoderNamespace = "SparcV9", DecoderMethod = "DecodeTRAP", Predicates = [Ha
let isBarrier = 1, isTerminator = 1, rd = 0b01000, rs1 = 0, simm13 = 5 in
def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
+let hasSideEffects = 1, rd = 0b01000, rs1 = 0, simm13 = 1 in
+ def TA1 : F3_2<0b10, 0b111010, (outs), (ins), "ta 1", [(debugtrap)]>;
+
// Section B.28 - Read State Register Instructions
let rs2 = 0 in
def RDASR : F3_1<2, 0b101000,
@@ -1599,6 +1602,9 @@ let Predicates = [HasV9] in {
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
+// Zero immediate.
+def : Pat<(i32 0),
+ (ORrr (i32 G0), (i32 G0))>;
// Small immediates.
def : Pat<(i32 simm13:$val),
(ORri (i32 G0), imm:$val)>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
index 8dd2569d10de..2a279dad5ae2 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
@@ -35,6 +35,8 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index 01545b8d20a0..40c5683f8495 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -44,7 +44,6 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
// Leon features
HasLeonCasa = false;
HasUmacSmac = false;
- PerformSDIVReplace = false;
InsertNOPLoad = false;
FixAllFDIVSQRT = false;
DetectRoundChange = false;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
index bcdc96e68103..588a6765bcdf 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -50,7 +50,6 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
bool InsertNOPLoad;
bool FixAllFDIVSQRT;
bool DetectRoundChange;
- bool PerformSDIVReplace;
SparcInstrInfo InstrInfo;
SparcTargetLowering TLInfo;
@@ -92,7 +91,6 @@ public:
// Leon options
bool hasUmacSmac() const { return HasUmacSmac; }
- bool performSDIVReplace() const { return PerformSDIVReplace; }
bool hasLeonCasa() const { return HasLeonCasa; }
bool insertNOPLoad() const { return InsertNOPLoad; }
bool fixAllFDIVSQRT() const { return FixAllFDIVSQRT; }
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 5cd4a7daf0fa..2146832f7794 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -44,7 +44,7 @@ class SystemZMCAsmBackend : public MCAsmBackend {
uint8_t OSABI;
public:
SystemZMCAsmBackend(uint8_t osABI)
- : OSABI(osABI) {}
+ : MCAsmBackend(support::big), OSABI(osABI) {}
// Override MCAsmBackend
unsigned getNumFixupKinds() const override {
@@ -53,8 +53,10 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override;
- bool mayNeedRelaxation(const MCInst &Inst) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
return false;
}
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -66,10 +68,10 @@ public:
MCInst &Res) const override {
llvm_unreachable("SystemZ does do not have assembler relaxation");
}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createSystemZObjectWriter(OS, OSABI);
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createSystemZObjectWriter(OSABI);
}
};
} // end anonymous namespace
@@ -96,7 +98,8 @@ void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) const {
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
MCFixupKind Kind = Fixup.getKind();
unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
@@ -115,10 +118,9 @@ void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
}
}
-bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
- MCObjectWriter *OW) const {
+bool SystemZMCAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
for (uint64_t I = 0; I != Count; ++I)
- OW->write8(7);
+ OS << '\x7';
return true;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 238926d6c8e0..888be519fb16 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -161,8 +161,7 @@ unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createSystemZObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
- return createELFObjectWriter(llvm::make_unique<SystemZObjectWriter>(OSABI),
- OS, /*IsLittleEndian=*/false);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createSystemZObjectWriter(uint8_t OSABI) {
+ return llvm::make_unique<SystemZObjectWriter>(OSABI);
}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index ed1b1b95b8f3..1617a807e65a 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -20,7 +20,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -93,8 +93,7 @@ MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options);
-std::unique_ptr<MCObjectWriter> createSystemZObjectWriter(raw_pwrite_stream &OS,
- uint8_t OSABI);
+std::unique_ptr<MCObjectTargetWriter> createSystemZObjectWriter(uint8_t OSABI);
} // end namespace llvm
// Defines symbolic names for SystemZ registers.
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.h b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
index 9a8e508e4119..fdbde3d8dbc3 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
@@ -47,6 +47,22 @@ const unsigned CCMASK_CMP_O = CCMASK_ANY ^ CCMASK_CMP_UO;
const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+// Condition-code mask assignments for arithmetical operations.
+const unsigned CCMASK_ARITH_EQ = CCMASK_0;
+const unsigned CCMASK_ARITH_LT = CCMASK_1;
+const unsigned CCMASK_ARITH_GT = CCMASK_2;
+const unsigned CCMASK_ARITH_OVERFLOW = CCMASK_3;
+const unsigned CCMASK_ARITH = CCMASK_ANY;
+
+// Condition-code mask assignments for logical operations.
+const unsigned CCMASK_LOGICAL_ZERO = CCMASK_0 | CCMASK_2;
+const unsigned CCMASK_LOGICAL_NONZERO = CCMASK_1 | CCMASK_2;
+const unsigned CCMASK_LOGICAL_CARRY = CCMASK_2 | CCMASK_3;
+const unsigned CCMASK_LOGICAL_NOCARRY = CCMASK_0 | CCMASK_1;
+const unsigned CCMASK_LOGICAL_BORROW = CCMASK_LOGICAL_NOCARRY;
+const unsigned CCMASK_LOGICAL_NOBORROW = CCMASK_LOGICAL_CARRY;
+const unsigned CCMASK_LOGICAL = CCMASK_ANY;
+
// Condition-code mask assignments for CS.
const unsigned CCMASK_CS_EQ = CCMASK_0;
const unsigned CCMASK_CS_NE = CCMASK_1;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.td b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
index 06905fb41e44..3800f7a26b79 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZ.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
@@ -75,4 +75,5 @@ def SystemZAsmParser : AsmParser {
def SystemZ : Target {
let InstructionSet = SystemZInstrInfo;
let AssemblyParsers = [SystemZAsmParser];
+ let AllowRegisterRenaming = 1;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index b39245b20b3c..bd99fabb48c9 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -460,6 +460,14 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
break;
+ case TargetOpcode::STACKMAP:
+ LowerSTACKMAP(*MI);
+ return;
+
+ case TargetOpcode::PATCHPOINT:
+ LowerPATCHPOINT(*MI, Lower);
+ return;
+
default:
Lower.lower(MI, LoweredMI);
break;
@@ -467,6 +475,123 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, LoweredMI);
}
+
+// Emit the largest nop instruction smaller than or equal to NumBytes
+// bytes. Return the size of nop emitted.
+static unsigned EmitNop(MCContext &OutContext, MCStreamer &OutStreamer,
+ unsigned NumBytes, const MCSubtargetInfo &STI) {
+ if (NumBytes < 2) {
+ llvm_unreachable("Zero nops?");
+ return 0;
+ }
+ else if (NumBytes < 4) {
+ OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BCRAsm)
+ .addImm(0).addReg(SystemZ::R0D), STI);
+ return 2;
+ }
+ else if (NumBytes < 6) {
+ OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BCAsm)
+ .addImm(0).addReg(0).addImm(0).addReg(0),
+ STI);
+ return 4;
+ }
+ else {
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext);
+ OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BRCLAsm)
+ .addImm(0).addExpr(Dot), STI);
+ OutStreamer.EmitLabel(DotSym);
+ return 6;
+ }
+}
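EmitNop above always emits the largest SystemZ no-op encoding that still fits the remaining gap: a 2-byte BCR for gaps of 2 or 3 bytes, a 4-byte BC for 4 or 5 bytes, and a 6-byte BRCL targeting the next instruction otherwise; gaps below 2 bytes are a programming error. A sketch of just the size selection (illustrative, not part of the patch):

// Returns the nop size EmitNop would pick for a remaining gap of numBytes.
unsigned nopSizeFor(unsigned numBytes) {
  if (numBytes < 2) return 0;  // never requested; EmitNop asserts instead
  if (numBytes < 4) return 2;  // BCR 0,%r0
  if (numBytes < 6) return 4;  // BC 0,0(%r0)
  return 6;                    // BRCL 0,<label of next instruction>
}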
+
+void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF->getSubtarget().getInstrInfo());
+
+ unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+ SM.recordStackMap(MI);
+ assert(NumNOPBytes % 2 == 0 && "Invalid number of NOP bytes requested!");
+
+ // Scan ahead to trim the shadow.
+ unsigned ShadowBytes = 0;
+ const MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::const_iterator MII(MI);
+ ++MII;
+ while (ShadowBytes < NumNOPBytes) {
+ if (MII == MBB.end() ||
+ MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MII->getOpcode() == TargetOpcode::STACKMAP)
+ break;
+ ShadowBytes += TII->getInstSizeInBytes(*MII);
+ if (MII->isCall())
+ break;
+ ++MII;
+ }
+
+ // Emit nops.
+ while (ShadowBytes < NumNOPBytes)
+ ShadowBytes += EmitNop(OutContext, *OutStreamer, NumNOPBytes - ShadowBytes,
+ getSubtargetInfo());
+}
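LowerSTACKMAP treats the requested byte count as a shadow rather than a fixed pad: instructions already following the STACKMAP count toward it, the scan stops at a call or at another STACKMAP/PATCHPOINT, and only the remaining bytes are filled with no-ops. A rough sketch of the accounting (hypothetical helper, not part of the patch):

// How many nop bytes are still needed after crediting the instructions that
// already follow the stackmap within its shadow.
unsigned nopsStillNeeded(unsigned requested, const unsigned *followingSizes,
                         unsigned count) {
  unsigned covered = 0;
  for (unsigned i = 0; i < count && covered < requested; ++i)
    covered += followingSizes[i];
  return covered >= requested ? 0 : requested - covered;
}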
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
+ SystemZMCInstLower &Lower) {
+ SM.recordPatchPoint(MI);
+ PatchPointOpers Opers(&MI);
+
+ unsigned EncodedBytes = 0;
+ const MachineOperand &CalleeMO = Opers.getCallTarget();
+
+ if (CalleeMO.isImm()) {
+ uint64_t CallTarget = CalleeMO.getImm();
+ if (CallTarget) {
+ unsigned ScratchIdx = -1;
+ unsigned ScratchReg = 0;
+ do {
+ ScratchIdx = Opers.getNextScratchIdx(ScratchIdx + 1);
+ ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ } while (ScratchReg == SystemZ::R0D);
+
+ // Materialize the call target address
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::LLILF)
+ .addReg(ScratchReg)
+ .addImm(CallTarget & 0xFFFFFFFF));
+ EncodedBytes += 6;
+ if (CallTarget >> 32) {
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::IIHF)
+ .addReg(ScratchReg)
+ .addImm(CallTarget >> 32));
+ EncodedBytes += 6;
+ }
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BASR)
+ .addReg(SystemZ::R14D)
+ .addReg(ScratchReg));
+ EncodedBytes += 2;
+ }
+ } else if (CalleeMO.isGlobal()) {
+ const MCExpr *Expr = Lower.getExpr(CalleeMO, MCSymbolRefExpr::VK_PLT);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R14D)
+ .addExpr(Expr));
+ EncodedBytes += 6;
+ }
+
+ // Emit padding.
+ unsigned NumBytes = Opers.getNumPatchBytes();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+ assert((NumBytes - EncodedBytes) % 2 == 0 &&
+ "Invalid number of NOP bytes requested!");
+ while (EncodedBytes < NumBytes)
+ EncodedBytes += EmitNop(OutContext, *OutStreamer, NumBytes - EncodedBytes,
+ getSubtargetInfo());
+}
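For an immediate callee the sequence above costs 6 bytes for LLILF, another 6 for IIHF when the upper half is nonzero, and 2 for BASR; a global callee is a single 6-byte BRASL. Whatever remains of the requested patchpoint size is padded with no-ops. A worked example of the byte accounting, assuming a 64-bit immediate target and a 16-byte patchpoint (the numbers are an illustration, not from the patch):

unsigned encoded = 6 /*LLILF*/ + 6 /*IIHF*/ + 2 /*BASR*/;  // 14 bytes of code
unsigned padding = 16 - encoded;                           // 2 bytes left over
// The padding is even, as asserted above, and becomes one 2-byte BCR nop.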
+
// Convert a SystemZ-specific constant pool modifier into the associated
// MCSymbolRefExpr variant kind.
static MCSymbolRefExpr::VariantKind
@@ -521,6 +646,10 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
+void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ SM.serializeToStackMapSection();
+}
+
// Force static initialization.
extern "C" void LLVMInitializeSystemZAsmPrinter() {
RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
index fe8c88fe23e3..cb88ec32f83a 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -11,7 +11,9 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
#include "SystemZTargetMachine.h"
+#include "SystemZMCInstLower.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -22,20 +24,33 @@ class Module;
class raw_ostream;
class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
+private:
+ StackMaps SM;
+
public:
SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
+ : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
// Override AsmPrinter.
StringRef getPassName() const override { return "SystemZ Assembly Printer"; }
void EmitInstruction(const MachineInstr *MI) override;
void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+ void EmitEndOfAsmFile(Module &M) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
+
+ bool doInitialization(Module &M) override {
+ SM.reset();
+ return AsmPrinter::doInitialization(M);
+ }
+
+private:
+ void LowerSTACKMAP(const MachineInstr &MI);
+ void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 2bf5ac29865f..deba27fee7fe 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -120,3 +120,12 @@ def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
// R9 is used to return SwiftError; remove it from CSR.
def CSR_SystemZ_SwiftError : CalleeSavedRegs<(sub CSR_SystemZ, R9D)>;
+
+// "All registers" as used by the AnyReg calling convention.
+// Note that registers 0 and 1 are still defined as intra-call scratch
+// registers that may be clobbered e.g. by PLT stubs.
+def CSR_SystemZ_AllRegs : CalleeSavedRegs<(add (sequence "R%dD", 2, 15),
+ (sequence "F%dD", 0, 15))>;
+def CSR_SystemZ_AllRegs_Vector : CalleeSavedRegs<(add (sequence "R%dD", 2, 15),
+ (sequence "V%d", 0, 31))>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
index 55f7a7b8d0d1..9edd1fc36406 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -86,9 +86,11 @@ private:
SmallVectorImpl<MachineInstr *> &CCUsers);
bool convertToLoadAndTrap(MachineInstr &MI, MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- bool convertToLoadAndTest(MachineInstr &MI);
+ bool convertToLoadAndTest(MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers);
bool adjustCCMasksForInstr(MachineInstr &MI, MachineInstr &Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers);
+ SmallVectorImpl<MachineInstr *> &CCUsers,
+ unsigned ConvOpc = 0);
bool optimizeCompareZero(MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
bool fuseCompareOperations(MachineInstr &Compare,
@@ -282,26 +284,37 @@ bool SystemZElimCompare::convertToLoadAndTrap(
// If MI is a load instruction, try to convert it into a LOAD AND TEST.
// Return true on success.
-bool SystemZElimCompare::convertToLoadAndTest(MachineInstr &MI) {
+bool SystemZElimCompare::convertToLoadAndTest(
+ MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
+
+ // Try to adjust CC masks for the LOAD AND TEST opcode that could replace MI.
unsigned Opcode = TII->getLoadAndTest(MI.getOpcode());
- if (!Opcode)
+ if (!Opcode || !adjustCCMasksForInstr(MI, Compare, CCUsers, Opcode))
return false;
- MI.setDesc(TII->get(Opcode));
- MachineInstrBuilder(*MI.getParent()->getParent(), MI)
- .addReg(SystemZ::CC, RegState::ImplicitDefine);
+ // Rebuild to get the CC operand in the right place.
+ MachineInstr *BuiltMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode));
+ for (const auto &MO : MI.operands())
+ BuiltMI->addOperand(MO);
+ BuiltMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MI.eraseFromParent();
+
return true;
}
// The CC users in CCUsers are testing the result of a comparison of some
-// value X against zero and we know that any CC value produced by MI
-// would also reflect the value of X. Try to adjust CCUsers so that
-// they test the result of MI directly, returning true on success.
-// Leave everything unchanged on failure.
+// value X against zero and we know that any CC value produced by MI would
+// also reflect the value of X. ConvOpc may be used to pass the transformed
+// opcode MI will have if this succeeds. Try to adjust CCUsers so that they
+// test the result of MI directly, returning true on success. Leave
+// everything unchanged on failure.
bool SystemZElimCompare::adjustCCMasksForInstr(
MachineInstr &MI, MachineInstr &Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers) {
- int Opcode = MI.getOpcode();
+ SmallVectorImpl<MachineInstr *> &CCUsers,
+ unsigned ConvOpc) {
+ int Opcode = (ConvOpc ? ConvOpc : MI.getOpcode());
const MCInstrDesc &Desc = TII->get(Opcode);
unsigned MIFlags = Desc.TSFlags;
@@ -319,53 +332,72 @@ bool SystemZElimCompare::adjustCCMasksForInstr(
unsigned CCValues = SystemZII::getCCValues(MIFlags);
assert((ReusableCCMask & ~CCValues) == 0 && "Invalid CCValues");
- // Now check whether these flags are enough for all users.
- SmallVector<MachineOperand *, 4> AlterMasks;
- for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
- MachineInstr *MI = CCUsers[I];
-
- // Fail if this isn't a use of CC that we understand.
- unsigned Flags = MI->getDesc().TSFlags;
- unsigned FirstOpNum;
- if (Flags & SystemZII::CCMaskFirst)
- FirstOpNum = 0;
- else if (Flags & SystemZII::CCMaskLast)
- FirstOpNum = MI->getNumExplicitOperands() - 2;
- else
- return false;
-
- // Check whether the instruction predicate treats all CC values
- // outside of ReusableCCMask in the same way. In that case it
- // doesn't matter what those CC values mean.
- unsigned CCValid = MI->getOperand(FirstOpNum).getImm();
- unsigned CCMask = MI->getOperand(FirstOpNum + 1).getImm();
- unsigned OutValid = ~ReusableCCMask & CCValid;
- unsigned OutMask = ~ReusableCCMask & CCMask;
- if (OutMask != 0 && OutMask != OutValid)
- return false;
+ bool MIEquivalentToCmp =
+ (ReusableCCMask == CCValues &&
+ CCValues == SystemZII::getCCValues(CompareFlags));
+
+ if (!MIEquivalentToCmp) {
+ // Now check whether these flags are enough for all users.
+ SmallVector<MachineOperand *, 4> AlterMasks;
+ for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
+ MachineInstr *MI = CCUsers[I];
+
+ // Fail if this isn't a use of CC that we understand.
+ unsigned Flags = MI->getDesc().TSFlags;
+ unsigned FirstOpNum;
+ if (Flags & SystemZII::CCMaskFirst)
+ FirstOpNum = 0;
+ else if (Flags & SystemZII::CCMaskLast)
+ FirstOpNum = MI->getNumExplicitOperands() - 2;
+ else
+ return false;
+
+ // Check whether the instruction predicate treats all CC values
+ // outside of ReusableCCMask in the same way. In that case it
+ // doesn't matter what those CC values mean.
+ unsigned CCValid = MI->getOperand(FirstOpNum).getImm();
+ unsigned CCMask = MI->getOperand(FirstOpNum + 1).getImm();
+ unsigned OutValid = ~ReusableCCMask & CCValid;
+ unsigned OutMask = ~ReusableCCMask & CCMask;
+ if (OutMask != 0 && OutMask != OutValid)
+ return false;
+
+ AlterMasks.push_back(&MI->getOperand(FirstOpNum));
+ AlterMasks.push_back(&MI->getOperand(FirstOpNum + 1));
+ }
- AlterMasks.push_back(&MI->getOperand(FirstOpNum));
- AlterMasks.push_back(&MI->getOperand(FirstOpNum + 1));
+ // All users are OK. Adjust the masks for MI.
+ for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
+ AlterMasks[I]->setImm(CCValues);
+ unsigned CCMask = AlterMasks[I + 1]->getImm();
+ if (CCMask & ~ReusableCCMask)
+ AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
+ (CCValues & ~ReusableCCMask));
+ }
}
- // All users are OK. Adjust the masks for MI.
- for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
- AlterMasks[I]->setImm(CCValues);
- unsigned CCMask = AlterMasks[I + 1]->getImm();
- if (CCMask & ~ReusableCCMask)
- AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
- (CCValues & ~ReusableCCMask));
+ // CC is now live after MI.
+ if (!ConvOpc) {
+ int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
+ assert(CCDef >= 0 && "Couldn't find CC set");
+ MI.getOperand(CCDef).setIsDead(false);
}
- // CC is now live after MI.
- int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
- assert(CCDef >= 0 && "Couldn't find CC set");
- MI.getOperand(CCDef).setIsDead(false);
+ // Check if MI lies before Compare.
+ bool BeforeCmp = false;
+ MachineBasicBlock::iterator MBBI = MI, MBBE = MI.getParent()->end();
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ if (MBBI == Compare) {
+ BeforeCmp = true;
+ break;
+ }
// Clear any intervening kills of CC.
- MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
- for (++MBBI; MBBI != MBBE; ++MBBI)
- MBBI->clearRegisterKills(SystemZ::CC, TRI);
+ if (BeforeCmp) {
+ MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ MBBI->clearRegisterKills(SystemZ::CC, TRI);
+ }
return true;
}
@@ -398,12 +430,12 @@ bool SystemZElimCompare::optimizeCompareZero(
// Search back for CC results that are based on the first operand.
unsigned SrcReg = getCompareSourceReg(Compare);
MachineBasicBlock &MBB = *Compare.getParent();
- MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
Reference CCRefs;
Reference SrcRefs;
- while (MBBI != MBBE) {
- --MBBI;
- MachineInstr &MI = *MBBI;
+ for (MachineBasicBlock::reverse_iterator MBBI =
+ std::next(MachineBasicBlock::reverse_iterator(&Compare)),
+ MBBE = MBB.rend(); MBBI != MBBE;) {
+ MachineInstr &MI = *MBBI++;
if (resultTests(MI, SrcReg)) {
// Try to remove both MI and Compare by converting a branch to BRCT(G).
// or a load-and-trap instruction. We don't care in this case whether
@@ -419,7 +451,7 @@ bool SystemZElimCompare::optimizeCompareZero(
}
}
// Try to eliminate Compare by reusing a CC result from MI.
- if ((!CCRefs && convertToLoadAndTest(MI)) ||
+ if ((!CCRefs && convertToLoadAndTest(MI, Compare, CCUsers)) ||
(!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) {
EliminatedComparisons += 1;
return true;
@@ -434,17 +466,15 @@ bool SystemZElimCompare::optimizeCompareZero(
}
// Also do a forward search to handle cases where an instruction after the
- // compare can be converted like
- //
- // LTEBRCompare %f0s, %f0s, implicit-def %cc LTEBRCompare %f0s, %f0s,
- // implicit-def %cc %f2s = LER %f0s
- //
- MBBI = Compare, MBBE = MBB.end();
- while (++MBBI != MBBE) {
- MachineInstr &MI = *MBBI;
+ // compare can be converted, like
+ // LTEBRCompare %f0s, %f0s; %f2s = LER %f0s => LTEBRCompare %f2s, %f0s
+ for (MachineBasicBlock::iterator MBBI =
+ std::next(MachineBasicBlock::iterator(&Compare)), MBBE = MBB.end();
+ MBBI != MBBE;) {
+ MachineInstr &MI = *MBBI++;
if (preservesValueOf(MI, SrcReg)) {
// Try to eliminate Compare by reusing a CC result from MI.
- if (convertToLoadAndTest(MI)) {
+ if (convertToLoadAndTest(MI, Compare, CCUsers)) {
EliminatedComparisons += 1;
return true;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
index d02db9a617a3..67c80899d491 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
@@ -55,7 +55,7 @@ char SystemZExpandPseudo::ID = 0;
INITIALIZE_PASS(SystemZExpandPseudo, "systemz-expand-pseudo",
SYSTEMZ_EXPAND_PSEUDO_NAME, false, false)
-/// \brief Returns an instance of the pseudo instruction expansion pass.
+/// Returns an instance of the pseudo instruction expansion pass.
FunctionPass *llvm::createSystemZExpandPseudoPass(SystemZTargetMachine &TM) {
return new SystemZExpandPseudo();
}
@@ -112,7 +112,7 @@ bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB,
return true;
}
-/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -127,7 +127,7 @@ bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
return false;
}
-/// \brief Iterate over the instructions in basic block MBB and expand any
+/// Iterate over the instructions in basic block MBB and expand any
/// pseudo instructions. Return true if anything was modified.
bool SystemZExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
index fda9c30fe3fc..beff45dba81d 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -62,6 +62,7 @@ def FeatureLoadStoreOnCond : SystemZFeature<
"load-store-on-cond", "LoadStoreOnCond",
"Assume that the load/store-on-condition facility is installed"
>;
+def FeatureNoLoadStoreOnCond : SystemZMissingFeature<"LoadStoreOnCond">;
def FeaturePopulationCount : SystemZFeature<
"population-count", "PopulationCount",
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index b600aa61cd0b..565299c90139 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -204,7 +204,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true);
}
- // Save FPRs in the normal TargetInstrInfo way.
+ // Save FPRs/VRs in the normal TargetInstrInfo way.
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
unsigned Reg = CSI[I].getReg();
if (SystemZ::FP64BitRegClass.contains(Reg)) {
@@ -212,6 +212,11 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
&SystemZ::FP64BitRegClass, TRI);
}
+ if (SystemZ::VR128BitRegClass.contains(Reg)) {
+ MBB.addLiveIn(Reg);
+ TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+ &SystemZ::VR128BitRegClass, TRI);
+ }
}
return true;
@@ -231,12 +236,15 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
bool HasFP = hasFP(MF);
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
- // Restore FPRs in the normal TargetInstrInfo way.
+ // Restore FPRs/VRs in the normal TargetInstrInfo way.
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
unsigned Reg = CSI[I].getReg();
if (SystemZ::FP64BitRegClass.contains(Reg))
TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
&SystemZ::FP64BitRegClass, TRI);
+ if (SystemZ::VR128BitRegClass.contains(Reg))
+ TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+ &SystemZ::VR128BitRegClass, TRI);
}
// Restore call-saved GPRs (but not call-clobbered varargs, which at
@@ -371,7 +379,15 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
}
}
- uint64_t StackSize = getAllocatedStackSize(MF);
+ uint64_t StackSize = MFFrame.getStackSize();
+ // We need to allocate the ABI-defined 160-byte base area whenever
+ // we allocate stack space for our own use and whenever we call another
+ // function.
+ if (StackSize || MFFrame.hasVarSizedObjects() || MFFrame.hasCalls()) {
+ StackSize += SystemZMC::CallFrameSize;
+ MFFrame.setStackSize(StackSize);
+ }
+
if (StackSize) {
// Determine if we want to store a backchain.
bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
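With getAllocatedStackSize() gone (it is removed later in this diff), the prologue now folds the ABI-mandated 160-byte register save area directly into the frame size recorded in MachineFrameInfo, so the epilogue can simply read it back. A sketch of the computation (illustrative only; 160 corresponds to SystemZMC::CallFrameSize):

// Total frame size: locals plus the caller-allocated base area whenever the
// function uses stack space or makes calls.
uint64_t totalFrameSize(uint64_t localSize, bool hasVarSizedObjects,
                        bool hasCalls) {
  if (localSize || hasVarSizedObjects || hasCalls)
    localSize += 160;
  return localSize;
}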
@@ -417,7 +433,7 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
I->addLiveIn(SystemZ::R11D);
}
- // Skip over the FPR saves.
+ // Skip over the FPR/VR saves.
SmallVector<unsigned, 8> CFIIndexes;
for (auto &Save : CSI) {
unsigned Reg = Save.getReg();
@@ -428,19 +444,26 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
++MBBI;
else
llvm_unreachable("Couldn't skip over FPR save");
+ } else if (SystemZ::VR128BitRegClass.contains(Reg)) {
+ if (MBBI != MBB.end() &&
+ MBBI->getOpcode() == SystemZ::VST)
+ ++MBBI;
+ else
+ llvm_unreachable("Couldn't skip over VR save");
+ } else
+ continue;
- // Add CFI for this save.
- unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- unsigned IgnoredFrameReg;
- int64_t Offset =
- getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
+ // Add CFI for this save.
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned IgnoredFrameReg;
+ int64_t Offset =
+ getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, DwarfReg, SPOffsetFromCFA + Offset));
- CFIIndexes.push_back(CFIIndex);
- }
+ CFIIndexes.push_back(CFIIndex);
}
- // Complete the CFI for the FPR saves, modelling them as taking effect
+ // Complete the CFI for the FPR/VR saves, modelling them as taking effect
// after the last save.
for (auto CFIIndex : CFIIndexes) {
BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
@@ -454,11 +477,12 @@ void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
auto *ZII =
static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
// Skip the return instruction.
assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
- uint64_t StackSize = getAllocatedStackSize(MF);
+ uint64_t StackSize = MFFrame.getStackSize();
if (ZFI->getLowSavedGPR()) {
--MBBI;
unsigned Opcode = MBBI->getOpcode();
@@ -495,46 +519,6 @@ bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
}
-int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
- const MachineFrameInfo &MFFrame = MF.getFrameInfo();
- const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
-
- // Fill in FrameReg output argument.
- FrameReg = RI->getFrameRegister(MF);
-
- // Start with the offset of FI from the top of the caller-allocated frame
- // (i.e. the top of the 160 bytes allocated by the caller). This initial
- // offset is therefore negative.
- int64_t Offset = (MFFrame.getObjectOffset(FI) +
- MFFrame.getOffsetAdjustment());
-
- // Make the offset relative to the incoming stack pointer.
- Offset -= getOffsetOfLocalArea();
-
- // Make the offset relative to the bottom of the frame.
- Offset += getAllocatedStackSize(MF);
-
- return Offset;
-}
-
-uint64_t SystemZFrameLowering::
-getAllocatedStackSize(const MachineFunction &MF) const {
- const MachineFrameInfo &MFFrame = MF.getFrameInfo();
-
- // Start with the size of the local variables and spill slots.
- uint64_t StackSize = MFFrame.getStackSize();
-
- // We need to allocate the ABI-defined 160-byte base area whenever
- // we allocate stack space for our own use and whenever we call another
- // function.
- if (StackSize || MFFrame.hasVarSizedObjects() || MFFrame.hasCalls())
- StackSize += SystemZMC::CallFrameSize;
-
- return StackSize;
-}
-
bool
SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
// The ABI requires us to allocate 160 bytes of stack space for the callee,
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index a75d111b0294..08c84c785cc0 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -43,16 +43,11 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool hasFP(const MachineFunction &MF) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
- // Return the number of bytes in the callee-allocated part of the frame.
- uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
-
// Return the byte offset from the incoming stack pointer of Reg's
// ABI-defined save slot. Return 0 if no slot is defined for Reg.
unsigned getRegSpillOffset(unsigned Reg) const {
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index f37216022762..d300d1d88abc 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -59,10 +59,18 @@ getNumDecoderSlots(SUnit *SU) const {
return 1; // Normal instruction
}
-unsigned SystemZHazardRecognizer::getCurrCycleIdx() {
+unsigned SystemZHazardRecognizer::getCurrCycleIdx(SUnit *SU) const {
unsigned Idx = CurrGroupSize;
if (GrpCount % 2)
Idx += 3;
+
+ if (SU != nullptr && !fitsIntoCurrentGroup(SU)) {
+ if (Idx == 1 || Idx == 2)
+ Idx = 3;
+ else if (Idx == 4 || Idx == 5)
+ Idx = 0;
+ }
+
return Idx;
}
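getCurrCycleIdx() maps the decoder state to a slot index in the range 0 to 5: the current group's fill count, plus three when an odd number of groups has completed (two three-slot groups per cycle). The new SU argument rounds the index up to the start of the next group when SU cannot join the current one, which is what the FPd heuristic below relies on: stalling FPd ops are preferred exactly three slots apart (modulo 6) so they alternate processor sides. A compact sketch of both computations (illustrative, not part of the patch):

// Decoder slot index for the next instruction, optionally rounded up to the
// next group boundary.
unsigned cycleIdx(unsigned currGroupSize, unsigned grpCount, bool startsNewGroup) {
  unsigned idx = currGroupSize + (grpCount % 2 ? 3 : 0);
  if (startsNewGroup) {
    if (idx == 1 || idx == 2) idx = 3;
    else if (idx == 4 || idx == 5) idx = 0;
  }
  return idx;
}
// An FPd op is preferred when it lands three slots away from the previous one.
bool fpdPreferred(unsigned lastFPdIdx, unsigned idx) {
  return (lastFPdIdx > idx ? lastFPdIdx - idx : idx - lastFPdIdx) == 3;
}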
@@ -77,7 +85,7 @@ void SystemZHazardRecognizer::Reset() {
GrpCount = 0;
LastFPdOpCycleIdx = UINT_MAX;
LastEmittedMI = nullptr;
- DEBUG(CurGroupDbg = "";);
+ LLVM_DEBUG(CurGroupDbg = "";);
}
bool
@@ -100,30 +108,30 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
return true;
}
-void SystemZHazardRecognizer::nextGroup(bool DbgOutput) {
- if (CurrGroupSize > 0) {
- DEBUG(dumpCurrGroup("Completed decode group"));
- DEBUG(CurGroupDbg = "";);
+void SystemZHazardRecognizer::nextGroup() {
+ if (CurrGroupSize == 0)
+ return;
+
+ LLVM_DEBUG(dumpCurrGroup("Completed decode group"));
+ LLVM_DEBUG(CurGroupDbg = "";);
- GrpCount++;
+ GrpCount++;
- // Reset counter for next group.
- CurrGroupSize = 0;
+ // Reset counter for next group.
+ CurrGroupSize = 0;
- // Decrease counters for execution units by one.
- for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
- if (ProcResourceCounters[i] > 0)
- ProcResourceCounters[i]--;
+ // Decrease counters for execution units by one.
+ for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+ if (ProcResourceCounters[i] > 0)
+ ProcResourceCounters[i]--;
- // Clear CriticalResourceIdx if it is now below the threshold.
- if (CriticalResourceIdx != UINT_MAX &&
- (ProcResourceCounters[CriticalResourceIdx] <=
- ProcResCostLim))
- CriticalResourceIdx = UINT_MAX;
- }
+ // Clear CriticalResourceIdx if it is now below the threshold.
+ if (CriticalResourceIdx != UINT_MAX &&
+ (ProcResourceCounters[CriticalResourceIdx] <=
+ ProcResCostLim))
+ CriticalResourceIdx = UINT_MAX;
- DEBUG(if (DbgOutput)
- dumpProcResourceCounters(););
+ LLVM_DEBUG(dumpState(););
}
#ifndef NDEBUG // Debug output
@@ -143,7 +151,11 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
std::string FU(PRD.Name);
// trim e.g. Z13_FXaUnit -> FXa
FU = FU.substr(FU.find("_") + 1);
- FU.resize(FU.find("Unit"));
+ size_t Pos = FU.find("Unit");
+ if (Pos != std::string::npos)
+ FU.resize(Pos);
+ if (FU == "LS") // LSUnit -> LSU
+ FU = "LSU";
OS << "/" << FU;
if (PI->Cycles > 1)
@@ -163,7 +175,7 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
}
void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
- dbgs() << "+++ " << Msg;
+ dbgs() << "++ " << Msg;
dbgs() << ": ";
if (CurGroupDbg.empty())
@@ -188,15 +200,28 @@ void SystemZHazardRecognizer::dumpProcResourceCounters() const {
if (!any)
return;
- dbgs() << "+++ Resource counters:\n";
+ dbgs() << "++ | Resource counters: ";
for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
- if (ProcResourceCounters[i] > 0) {
- dbgs() << "+++ Extra schedule for execution unit "
- << SchedModel->getProcResource(i)->Name
- << ": " << ProcResourceCounters[i] << "\n";
- any = true;
- }
+ if (ProcResourceCounters[i] > 0)
+ dbgs() << SchedModel->getProcResource(i)->Name
+ << ":" << ProcResourceCounters[i] << " ";
+ dbgs() << "\n";
+
+ if (CriticalResourceIdx != UINT_MAX)
+ dbgs() << "++ | Critical resource: "
+ << SchedModel->getProcResource(CriticalResourceIdx)->Name
+ << "\n";
+}
+
+void SystemZHazardRecognizer::dumpState() const {
+ dumpCurrGroup("| Current decoder group");
+ dbgs() << "++ | Current cycle index: "
+ << getCurrCycleIdx() << "\n";
+ dumpProcResourceCounters();
+ if (LastFPdOpCycleIdx != UINT_MAX)
+ dbgs() << "++ | Last FPd cycle index: " << LastFPdOpCycleIdx << "\n";
}
+
#endif //NDEBUG
void SystemZHazardRecognizer::clearProcResCounters() {
@@ -213,30 +238,25 @@ static inline bool isBranchRetTrap(MachineInstr *MI) {
void SystemZHazardRecognizer::
EmitInstruction(SUnit *SU) {
const MCSchedClassDesc *SC = getSchedClass(SU);
- DEBUG( dumpCurrGroup("Decode group before emission"););
+ LLVM_DEBUG(dbgs() << "++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
+ dbgs() << "\n";);
+ LLVM_DEBUG(dumpCurrGroup("Decode group before emission"););
// If scheduling an SU that must begin a new decoder group, move on
// to next group.
if (!fitsIntoCurrentGroup(SU))
nextGroup();
- DEBUG( dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
- dbgs() << "\n";
- raw_string_ostream cgd(CurGroupDbg);
- if (CurGroupDbg.length())
- cgd << ", ";
- dumpSU(SU, cgd););
+ LLVM_DEBUG(raw_string_ostream cgd(CurGroupDbg);
+ if (CurGroupDbg.length()) cgd << ", "; dumpSU(SU, cgd););
LastEmittedMI = SU->getInstr();
// After returning from a call, we don't know much about the state.
if (SU->isCall) {
- DEBUG (dbgs() << "+++ Clearing state after call.\n";);
- clearProcResCounters();
- LastFPdOpCycleIdx = UINT_MAX;
- CurrGroupSize += getNumDecoderSlots(SU);
- assert (CurrGroupSize <= 3);
- nextGroup();
+ LLVM_DEBUG(dbgs() << "++ Clearing state after call.\n";);
+ Reset();
+ LastEmittedMI = SU->getInstr();
return;
}
@@ -256,23 +276,21 @@ EmitInstruction(SUnit *SU) {
(PI->ProcResourceIdx != CriticalResourceIdx &&
CurrCounter >
ProcResourceCounters[CriticalResourceIdx]))) {
- DEBUG( dbgs() << "+++ New critical resource: "
- << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
- << "\n";);
+ LLVM_DEBUG(
+ dbgs() << "++ New critical resource: "
+ << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
+ << "\n";);
CriticalResourceIdx = PI->ProcResourceIdx;
}
}
// Make note of an instruction that uses a blocking resource (FPd).
if (SU->isUnbuffered) {
- LastFPdOpCycleIdx = getCurrCycleIdx();
- DEBUG (dbgs() << "+++ Last FPd cycle index: "
- << LastFPdOpCycleIdx << "\n";);
+ LastFPdOpCycleIdx = getCurrCycleIdx(SU);
+ LLVM_DEBUG(dbgs() << "++ Last FPd cycle index: " << LastFPdOpCycleIdx
+ << "\n";);
}
- bool GroupEndingBranch =
- (CurrGroupSize >= 1 && isBranchRetTrap(SU->getInstr()));
-
// Insert SU into current group by increasing number of slots used
// in current group.
CurrGroupSize += getNumDecoderSlots(SU);
@@ -280,7 +298,7 @@ EmitInstruction(SUnit *SU) {
// Check if current group is now full/ended. If so, move on to next
// group to be ready to evaluate more candidates.
- if (CurrGroupSize == 3 || SC->EndGroup || GroupEndingBranch)
+ if (CurrGroupSize == 3 || SC->EndGroup)
nextGroup();
}
@@ -311,7 +329,7 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
return 0;
}
-bool SystemZHazardRecognizer::isFPdOpPreferred_distance(const SUnit *SU) {
+bool SystemZHazardRecognizer::isFPdOpPreferred_distance(SUnit *SU) const {
assert (SU->isUnbuffered);
// If this is the first FPd op, it should be scheduled high.
if (LastFPdOpCycleIdx == UINT_MAX)
@@ -320,9 +338,10 @@ bool SystemZHazardRecognizer::isFPdOpPreferred_distance(const SUnit *SU) {
// of the processor to use the other FPd unit there. This should
// generally happen if two FPd ops are placed with 2 other
// instructions between them (modulo 6).
- if (LastFPdOpCycleIdx > getCurrCycleIdx())
- return ((LastFPdOpCycleIdx - getCurrCycleIdx()) == 3);
- return ((getCurrCycleIdx() - LastFPdOpCycleIdx) == 3);
+ unsigned SUCycleIdx = getCurrCycleIdx(SU);
+ if (LastFPdOpCycleIdx > SUCycleIdx)
+ return ((LastFPdOpCycleIdx - SUCycleIdx) == 3);
+ return ((SUCycleIdx - LastFPdOpCycleIdx) == 3);
}
int SystemZHazardRecognizer::
@@ -373,10 +392,17 @@ void SystemZHazardRecognizer::emitInstruction(MachineInstr *MI,
}
}
+ unsigned GroupSizeBeforeEmit = CurrGroupSize;
EmitInstruction(&SU);
+ if (!TakenBranch && isBranchRetTrap(MI)) {
+ // NT Branch on second slot ends group.
+ if (GroupSizeBeforeEmit == 1)
+ nextGroup();
+ }
+
if (TakenBranch && CurrGroupSize > 0)
- nextGroup(false /*DbgOutput*/);
+ nextGroup();
assert ((!MI->isTerminator() || isBranchRetTrap(MI)) &&
"Scheduler: unhandled terminator!");
@@ -386,7 +412,7 @@ void SystemZHazardRecognizer::
copyState(SystemZHazardRecognizer *Incoming) {
// Current decoder group
CurrGroupSize = Incoming->CurrGroupSize;
- DEBUG (CurGroupDbg = Incoming->CurGroupDbg;);
+ LLVM_DEBUG(CurGroupDbg = Incoming->CurGroupDbg;);
// Processor resources
ProcResourceCounters = Incoming->ProcResourceCounters;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
index 7e1b5fb2e4fe..40cb3acc7009 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -75,9 +75,11 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
/// Two decoder groups per cycle are formed (for z13), meaning 2x3
/// instructions. This function returns a number between 0 and 5,
- /// representing the current decoder slot of the current cycle.
- unsigned getCurrCycleIdx();
-
+ /// representing the current decoder slot of the current cycle. If an SU
+ /// is passed which will begin a new decoder group, the returned value is
+ /// the cycle index of the next group.
+ unsigned getCurrCycleIdx(SUnit *SU = nullptr) const;
+
/// LastFPdOpCycleIdx stores the number returned by getCurrCycleIdx()
/// when a stalling operation is scheduled (which uses the FPd resource).
unsigned LastFPdOpCycleIdx;
@@ -88,14 +90,14 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
unsigned getCurrGroupSize() {return CurrGroupSize;};
/// Start next decoder group.
- void nextGroup(bool DbgOutput = true);
+ void nextGroup();
/// Clear all counters for processor resources.
void clearProcResCounters();
/// With the goal of alternating processor sides for stalling (FPd)
/// ops, return true if it seems good to schedule an FPd op next.
- bool isFPdOpPreferred_distance(const SUnit *SU);
+ bool isFPdOpPreferred_distance(SUnit *SU) const;
/// Last emitted instruction or nullptr.
MachineInstr *LastEmittedMI;
@@ -145,6 +147,7 @@ public:
void dumpSU(SUnit *SU, raw_ostream &OS) const;
void dumpCurrGroup(std::string Msg = "") const;
void dumpProcResourceCounters() const;
+ void dumpState() const;
#endif
MachineBasicBlock::iterator getLastEmittedMI() { return LastEmittedMI; }
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index ce6f3d37f5c9..5425f1d16e5e 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -310,6 +310,11 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Try to use scatter instruction Opcode to implement store Store.
bool tryScatter(StoreSDNode *Store, unsigned Opcode);
+ // Change a chain of {load; op; store} of the same value into a simple op
+ // through memory of that value, if the uses of the modified value and its
+ // address are suitable.
+ bool tryFoldLoadStoreIntoMemOperand(SDNode *Node);
+
// Return true if Load and Store are loads and stores of the same size
// and are guaranteed not to overlap. Such operations can be implemented
// using block (SS-format) instructions.
@@ -330,6 +335,9 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// to X.
bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+ // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
+ SDValue expandSelectBoolean(SDNode *Node);
+
public:
SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(TM, OptLevel) {}
@@ -348,6 +356,8 @@ public:
void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
+ bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
+ void PreprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "SystemZGenDAGISel.inc"
@@ -579,7 +589,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
return false;
- DEBUG(AM.dump());
+ LLVM_DEBUG(AM.dump());
return true;
}
@@ -589,10 +599,16 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
// The selection DAG must no longer depend on their uniqueness when this
// function is used.
static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
- if (N.getNode()->getNodeId() == -1 ||
- N.getNode()->getNodeId() > Pos->getNodeId()) {
+ if (N->getNodeId() == -1 ||
+ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
+ SelectionDAGISel::getUninvalidatedNodeId(Pos))) {
DAG->RepositionNode(Pos->getIterator(), N.getNode());
- N.getNode()->setNodeId(Pos->getNodeId());
+ // Mark Node as invalid for pruning as after this it may be a successor to a
+ // selected node but otherwise be in the same position as Pos.
+ // Conservatively mark it with the same -abs(Id) to assure node id
+ // invariant is preserved.
+ N->setNodeId(Pos->getNodeId());
+ SelectionDAGISel::InvalidateNodeId(N.getNode());
}
}
@@ -989,7 +1005,8 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
N = New.getNode();
}
// Now, select the machine opcode to implement this operation.
- SelectCode(N);
+ if (!N->isMachineOpcode())
+ SelectCode(N);
return true;
}
}
@@ -1022,8 +1039,7 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
};
SDValue New = convertTo(
DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
- ReplaceUses(N, New.getNode());
- CurDAG->RemoveDeadNode(N);
+ ReplaceNode(N, New.getNode());
return true;
}
@@ -1114,8 +1130,7 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
- ReplaceUses(Node, Or.getNode());
- CurDAG->RemoveDeadNode(Node);
+ ReplaceNode(Node, Or.getNode());
SelectCode(Or.getNode());
}
@@ -1186,6 +1201,171 @@ bool SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) {
return true;
}
+// Check whether or not the chain ending in StoreNode is suitable for doing
+// the {load; op; store} to modify transformation.
+static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
+ SDValue StoredVal, SelectionDAG *CurDAG,
+ LoadSDNode *&LoadNode,
+ SDValue &InputChain) {
+ // Is the stored value result 0 of the operation?
+ if (StoredVal.getResNo() != 0)
+ return false;
+
+ // Are there other uses of the loaded value than the operation?
+ if (!StoredVal.getNode()->hasNUsesOfValue(1, 0))
+ return false;
+
+ // Is the store non-extending and non-indexed?
+ if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+ return false;
+
+ SDValue Load = StoredVal->getOperand(0);
+ // Is the stored value a non-extending and non-indexed load?
+ if (!ISD::isNormalLoad(Load.getNode()))
+ return false;
+
+ // Return LoadNode by reference.
+ LoadNode = cast<LoadSDNode>(Load);
+
+ // Is store the only read of the loaded value?
+ if (!Load.hasOneUse())
+ return false;
+
+ // Is the address of the store the same as the load?
+ if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+ LoadNode->getOffset() != StoreNode->getOffset())
+ return false;
+
+ // Check if the chain is produced by the load or is a TokenFactor with
+ // the load output chain as an operand. Return InputChain by reference.
+ SDValue Chain = StoreNode->getChain();
+
+ bool ChainCheck = false;
+ if (Chain == Load.getValue(1)) {
+ ChainCheck = true;
+ InputChain = LoadNode->getChain();
+ } else if (Chain.getOpcode() == ISD::TokenFactor) {
+ SmallVector<SDValue, 4> ChainOps;
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+ SDValue Op = Chain.getOperand(i);
+ if (Op == Load.getValue(1)) {
+ ChainCheck = true;
+ // Drop Load, but keep its chain. No cycle check necessary.
+ ChainOps.push_back(Load.getOperand(0));
+ continue;
+ }
+
+ // Make sure using Op as part of the chain would not cause a cycle here.
+ // In theory, we could check whether the chain node is a predecessor of
+ // the load. But that can be very expensive. Instead visit the uses and
+ // make sure they all have smaller node id than the load.
+ int LoadId = LoadNode->getNodeId();
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = UI->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != 0)
+ continue;
+ if (UI->getNodeId() > LoadId)
+ return false;
+ }
+
+ ChainOps.push_back(Op);
+ }
+
+ if (ChainCheck)
+ // Make a new TokenFactor with all the other input chains except
+ // for the load.
+ InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+ MVT::Other, ChainOps);
+ }
+ if (!ChainCheck)
+ return false;
+
+ return true;
+}
+
+// Change a chain of {load; op; store} of the same value into a simple op
+// through memory of that value, if the uses of the modified value and its
+// address are suitable.
+//
+// The tablegen memory operand pattern is currently not able to match
+// the case where the CC on the original operation are used.
+//
+// See the equivalent routine in X86ISelDAGToDAG for further comments.
+bool SystemZDAGToDAGISel::tryFoldLoadStoreIntoMemOperand(SDNode *Node) {
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+ SDValue StoredVal = StoreNode->getOperand(1);
+ unsigned Opc = StoredVal->getOpcode();
+ SDLoc DL(StoreNode);
+
+ // Before we try to select anything, make sure this is memory operand size
+ // and opcode we can handle. Note that this must match the code below that
+ // actually lowers the opcodes.
+ EVT MemVT = StoreNode->getMemoryVT();
+ unsigned NewOpc = 0;
+ bool NegateOperand = false;
+ switch (Opc) {
+ default:
+ return false;
+ case SystemZISD::SSUBO:
+ NegateOperand = true;
+ /* fall through */
+ case SystemZISD::SADDO:
+ if (MemVT == MVT::i32)
+ NewOpc = SystemZ::ASI;
+ else if (MemVT == MVT::i64)
+ NewOpc = SystemZ::AGSI;
+ else
+ return false;
+ break;
+ case SystemZISD::USUBO:
+ NegateOperand = true;
+ /* fall through */
+ case SystemZISD::UADDO:
+ if (MemVT == MVT::i32)
+ NewOpc = SystemZ::ALSI;
+ else if (MemVT == MVT::i64)
+ NewOpc = SystemZ::ALGSI;
+ else
+ return false;
+ break;
+ }
+
+ LoadSDNode *LoadNode = nullptr;
+ SDValue InputChain;
+ if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode,
+ InputChain))
+ return false;
+
+ SDValue Operand = StoredVal.getOperand(1);
+ auto *OperandC = dyn_cast<ConstantSDNode>(Operand);
+ if (!OperandC)
+ return false;
+ auto OperandV = OperandC->getAPIntValue();
+ if (NegateOperand)
+ OperandV = -OperandV;
+ if (OperandV.getMinSignedBits() > 8)
+ return false;
+ Operand = CurDAG->getTargetConstant(OperandV, DL, MemVT);
+
+ SDValue Base, Disp;
+ if (!selectBDAddr20Only(StoreNode->getBasePtr(), Base, Disp))
+ return false;
+
+ SDValue Ops[] = { Base, Disp, Operand, InputChain };
+ MachineSDNode *Result =
+ CurDAG->getMachineNode(NewOpc, DL, MVT::i32, MVT::Other, Ops);
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+ MemOp[0] = StoreNode->getMemOperand();
+ MemOp[1] = LoadNode->getMemOperand();
+ Result->setMemRefs(MemOp, MemOp + 2);
+
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return true;
+}
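The fold above rewrites a read-modify-write of one memory word with a small constant into a single memory-immediate add (ASI/AGSI for the signed overflow ops, ALSI/ALGSI for the unsigned ones), provided the load feeds only the op, the op result feeds only the store, both use the same address, and the constant (negated for the subtraction forms) fits in a signed byte. An illustration of the shape being matched and of the immediate check (not part of the patch):

#include <cstdint>
// Shape, at the DAG level:  store (op (load p), C), p   ==>   asi disp(base), C
// The (possibly negated) constant must fit the instruction's 8-bit signed
// immediate field, matching the OperandV.getMinSignedBits() <= 8 test above.
bool fitsMemImmediate(int64_t imm, bool isSubtract) {
  if (isSubtract)
    imm = -imm;
  return imm >= -128 && imm <= 127;
}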
+
bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
LoadSDNode *Load) const {
// Check that the two memory operands have the same size.
@@ -1245,12 +1425,9 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
}
void SystemZDAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected
- DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
@@ -1332,7 +1509,13 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node),
CCMask.getValueType());
SDValue Op4 = Node->getOperand(4);
- Node = CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
+ SDNode *UpdatedNode =
+ CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
+ if (UpdatedNode != Node) {
+ // In case this node already exists then replace Node with it.
+ ReplaceNode(Node, UpdatedNode);
+ Node = UpdatedNode;
+ }
}
break;
}
@@ -1351,6 +1534,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
}
case ISD::STORE: {
+ if (tryFoldLoadStoreIntoMemOperand(Node))
+ return;
auto *Store = cast<StoreSDNode>(Node);
unsigned ElemBitSize = Store->getValue().getValueSizeInBits();
if (ElemBitSize == 32) {
@@ -1438,3 +1623,227 @@ SelectInlineAsmMemoryOperand(const SDValue &Op,
return true;
}
+
+// IsProfitableToFold - Returns true if it is profitable to fold the specific
+// operand node N of U during instruction selection that starts at Root.
+bool
+SystemZDAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U,
+ SDNode *Root) const {
+ // We want to avoid folding a LOAD into an ICMP node if as a result
+ // we would be forced to spill the condition code into a GPR.
+ if (N.getOpcode() == ISD::LOAD && U->getOpcode() == SystemZISD::ICMP) {
+ if (!N.hasOneUse() || !U->hasOneUse())
+ return false;
+
+ // The user of the CC value will usually be a CopyToReg into the
+ // physical CC register, which in turn is glued and chained to the
+ // actual instruction that uses the CC value. Bail out if we have
+ // anything other than that.
+ SDNode *CCUser = *U->use_begin();
+ SDNode *CCRegUser = nullptr;
+ if (CCUser->getOpcode() == ISD::CopyToReg &&
+ cast<RegisterSDNode>(CCUser->getOperand(1))->getReg() == SystemZ::CC) {
+ for (auto *U : CCUser->uses()) {
+ if (CCRegUser == nullptr)
+ CCRegUser = U;
+ else if (CCRegUser != U)
+ return false;
+ }
+ }
+ if (CCRegUser == nullptr)
+ return false;
+
+ // If the actual instruction is a branch, the only thing that remains to be
+ // checked is whether the CCUser chain is a predecessor of the load.
+ if (CCRegUser->isMachineOpcode() &&
+ CCRegUser->getMachineOpcode() == SystemZ::BRC)
+ return !N->isPredecessorOf(CCUser->getOperand(0).getNode());
+
+ // Otherwise, the instruction may have multiple operands, and we need to
+ // verify that none of them are a predecessor of the load. This is exactly
+ // the same check that would be done by common code if the CC setter were
+ // glued to the CC user, so simply invoke that check here.
+ if (!IsLegalToFold(N, U, CCRegUser, OptLevel, false))
+ return false;
+ }
+
+ return true;
+}
+
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+ IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+ : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+ int64_t XORValue;
+ int64_t AddValue;
+ unsigned Bit;
+};
+} // end anonymous namespace
+
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
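+// (Editor's example, not in the original comment: IPM places CC in bits
+// 29:28 of the result and clears bits 31:30, so testing "CC == 0" uses
+// AddValue == -(1 << SystemZ::IPM_CC); the addition borrows into bit 31
+// exactly when CC is 0, and Bit == 31 then extracts the desired 0/1 value.)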
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+ // Deal with cases where the result can be taken directly from a bit
+ // of the IPM result.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+ // Deal with cases where we can add a value to force the sign bit
+ // to contain the right value. Putting the bit in 31 means we can
+ // use SRL rather than RISBG(L), and also makes it easier to get a
+ // 0/-1 value, so it has priority over the other tests below.
+ //
+ // These sequences rely on the fact that the upper two bits of the
+ // IPM result are zero.
+ uint64_t TopBit = uint64_t(1) << 31;
+ if (CCMask == (CCValid & SystemZ::CCMASK_0))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+ return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2)))
+ return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_3))
+ return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ // Next try inverting the value and testing a bit. 0/1 could be
+ // handled this way too, but we dealt with that case above.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+ return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+ // Handle cases where adding a value forces a non-sign bit to contain
+ // the right value.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+ return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+ // The remaining cases are 1, 2, 0/1/3 and 0/2/3. All of these
+ // can be done by inverting the low CC bit and applying one of the
+ // sign-based extractions above.
+ if (CCMask == (CCValid & SystemZ::CCMASK_1))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_2))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ llvm_unreachable("Unexpected CC combination");
+}
+
+SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) {
+ auto *TrueOp = dyn_cast<ConstantSDNode>(Node->getOperand(0));
+ auto *FalseOp = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (!TrueOp || !FalseOp)
+ return SDValue();
+ if (FalseOp->getZExtValue() != 0)
+ return SDValue();
+ if (TrueOp->getSExtValue() != 1 && TrueOp->getSExtValue() != -1)
+ return SDValue();
+
+ auto *CCValidOp = dyn_cast<ConstantSDNode>(Node->getOperand(2));
+ auto *CCMaskOp = dyn_cast<ConstantSDNode>(Node->getOperand(3));
+ if (!CCValidOp || !CCMaskOp)
+ return SDValue();
+ int CCValid = CCValidOp->getZExtValue();
+ int CCMask = CCMaskOp->getZExtValue();
+
+ SDLoc DL(Node);
+ SDValue CCReg = Node->getOperand(4);
+ IPMConversion IPM = getIPMConversion(CCValid, CCMask);
+ SDValue Result = CurDAG->getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
+
+ if (IPM.XORValue)
+ Result = CurDAG->getNode(ISD::XOR, DL, MVT::i32, Result,
+ CurDAG->getConstant(IPM.XORValue, DL, MVT::i32));
+
+ if (IPM.AddValue)
+ Result = CurDAG->getNode(ISD::ADD, DL, MVT::i32, Result,
+ CurDAG->getConstant(IPM.AddValue, DL, MVT::i32));
+
+ EVT VT = Node->getValueType(0);
+ if (VT == MVT::i32 && IPM.Bit == 31) {
+ unsigned ShiftOp = TrueOp->getSExtValue() == 1 ? ISD::SRL : ISD::SRA;
+ Result = CurDAG->getNode(ShiftOp, DL, MVT::i32, Result,
+ CurDAG->getConstant(IPM.Bit, DL, MVT::i32));
+ } else {
+ if (VT != MVT::i32)
+ Result = CurDAG->getNode(ISD::ANY_EXTEND, DL, VT, Result);
+
+ if (TrueOp->getSExtValue() == 1) {
+ // The SHR/AND sequence should get optimized to an RISBG.
+ Result = CurDAG->getNode(ISD::SRL, DL, VT, Result,
+ CurDAG->getConstant(IPM.Bit, DL, MVT::i32));
+ Result = CurDAG->getNode(ISD::AND, DL, VT, Result,
+ CurDAG->getConstant(1, DL, VT));
+ } else {
+ // Sign-extend from IPM.Bit using a pair of shifts.
+ int ShlAmt = VT.getSizeInBits() - 1 - IPM.Bit;
+ int SraAmt = VT.getSizeInBits() - 1;
+ Result = CurDAG->getNode(ISD::SHL, DL, VT, Result,
+ CurDAG->getConstant(ShlAmt, DL, MVT::i32));
+ Result = CurDAG->getNode(ISD::SRA, DL, VT, Result,
+ CurDAG->getConstant(SraAmt, DL, MVT::i32));
+ }
+ }
+
+ return Result;
+}
+
+void SystemZDAGToDAGISel::PreprocessISelDAG() {
+ // If we have conditional immediate loads, we always prefer
+ // using those over an IPM sequence.
+ if (Subtarget->hasLoadStoreOnCond2())
+ return;
+
+ bool MadeChange = false;
+
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end();
+ I != E;) {
+ SDNode *N = &*I++;
+ if (N->use_empty())
+ continue;
+
+ SDValue Res;
+ switch (N->getOpcode()) {
+ default: break;
+ case SystemZISD::SELECT_CCMASK:
+ Res = expandSelectBoolean(N);
+ break;
+ }
+
+ if (Res) {
+ LLVM_DEBUG(dbgs() << "SystemZ DAG preprocessing replacing:\nOld: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(Res.getNode()->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ MadeChange = true;
+ }
+ }
+
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index adf368319dc3..302c7883f97b 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -31,17 +31,6 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-lower"
namespace {
-// Represents a sequence for extracting a 0/1 value from an IPM result:
-// (((X ^ XORValue) + AddValue) >> Bit)
-struct IPMConversion {
- IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
- : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
-
- int64_t XORValue;
- int64_t AddValue;
- unsigned Bit;
-};
-
// Represents information about a comparison.
struct Comparison {
Comparison(SDValue Op0In, SDValue Op1In)
@@ -87,7 +76,7 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
const SystemZSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
- MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
// Set up the register classes.
if (Subtarget.hasHighWord())
@@ -133,6 +122,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Instructions are strings of 2-byte aligned 2-byte values.
setMinFunctionAlignment(2);
+ // For performance reasons we prefer 16-byte alignment.
+ setPrefFunctionAlignment(4);
// Handle operations that are handled in a similar way for all types.
for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
@@ -173,6 +164,18 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Custom);
+ // Support addition/subtraction with overflow.
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+
+ // Support addition/subtraction with carry.
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+
+ // Support carry in as value rather than glue.
+ setOperationAction(ISD::ADDCARRY, VT, Custom);
+ setOperationAction(ISD::SUBCARRY, VT, Custom);
+
// Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
// stores, putting a serialization instruction after the stores.
setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
@@ -517,7 +520,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Codes for which we want to perform some z-specific combinations.
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::FP_ROUND);
@@ -643,7 +648,8 @@ supportedAddressingMode(Instruction *I, bool HasVector) {
if (SingleUser->getParent() == I->getParent()) {
if (isa<ICmpInst>(SingleUser)) {
if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
- if (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue()))
+ if (C->getBitWidth() <= 64 &&
+ (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue())))
// Comparison of memory with 16 bit signed / unsigned immediate
return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
} else if (isa<StoreInst>(SingleUser))
@@ -748,6 +754,7 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
case 'f': // Floating-point register
case 'h': // High-part register
case 'r': // General-purpose register
+ case 'v': // Vector register
return C_RegisterClass;
case 'Q': // Memory with base and unsigned 12-bit displacement
@@ -800,6 +807,12 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
weight = CW_Register;
break;
+ case 'v': // Vector register
+ if ((type->isVectorTy() || type->isFloatingPointTy()) &&
+ Subtarget.hasVector())
+ weight = CW_Register;
+ break;
+
case 'I': // Unsigned 8-bit constant
if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
if (isUInt<8>(C->getZExtValue()))
@@ -838,13 +851,13 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
// Map maps 0-based register numbers to LLVM register numbers.
static std::pair<unsigned, const TargetRegisterClass *>
parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
- const unsigned *Map) {
+ const unsigned *Map, unsigned Size) {
assert(*(Constraint.end()-1) == '}' && "Missing '}'");
if (isdigit(Constraint[2])) {
unsigned Index;
bool Failed =
Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
- if (!Failed && Index < 16 && Map[Index])
+ if (!Failed && Index < Size && Map[Index])
return std::make_pair(Map[Index], RC);
}
return std::make_pair(0U, nullptr);
@@ -881,6 +894,16 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
else if (VT == MVT::f128)
return std::make_pair(0U, &SystemZ::FP128BitRegClass);
return std::make_pair(0U, &SystemZ::FP32BitRegClass);
+
+ case 'v': // Vector register
+ if (Subtarget.hasVector()) {
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SystemZ::VR32BitRegClass);
+ if (VT == MVT::f64)
+ return std::make_pair(0U, &SystemZ::VR64BitRegClass);
+ return std::make_pair(0U, &SystemZ::VR128BitRegClass);
+ }
+ break;
}
}
if (Constraint.size() > 0 && Constraint[0] == '{') {
@@ -891,22 +914,32 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
if (Constraint[1] == 'r') {
if (VT == MVT::i32)
return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
- SystemZMC::GR32Regs);
+ SystemZMC::GR32Regs, 16);
if (VT == MVT::i128)
return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
- SystemZMC::GR128Regs);
+ SystemZMC::GR128Regs, 16);
return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
- SystemZMC::GR64Regs);
+ SystemZMC::GR64Regs, 16);
}
if (Constraint[1] == 'f') {
if (VT == MVT::f32)
return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
- SystemZMC::FP32Regs);
+ SystemZMC::FP32Regs, 16);
if (VT == MVT::f128)
return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
- SystemZMC::FP128Regs);
+ SystemZMC::FP128Regs, 16);
return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
- SystemZMC::FP64Regs);
+ SystemZMC::FP64Regs, 16);
+ }
+ if (Constraint[1] == 'v') {
+ if (VT == MVT::f32)
+ return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
+ SystemZMC::VR32Regs, 32);
+ if (VT == MVT::f64)
+ return parseRegisterNumber(Constraint, &SystemZ::VR64BitRegClass,
+ SystemZMC::VR64Regs, 32);
+ return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
+ SystemZMC::VR128Regs, 32);
}
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
@@ -964,6 +997,13 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
#include "SystemZGenCallingConv.inc"
+const MCPhysReg *SystemZTargetLowering::getScratchRegisters(
+ CallingConv::ID) const {
+ static const MCPhysReg ScratchRegs[] = { SystemZ::R0D, SystemZ::R1D,
+ SystemZ::R14D, 0 };
+ return ScratchRegs;
+}
+
bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
Type *ToType) const {
return isTruncateFree(FromType, ToType);
@@ -1634,9 +1674,9 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
}
}
-// Emit an intrinsic with chain with a glued value instead of its CC result.
-static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
- unsigned Opcode) {
+// Emit an intrinsic with chain and an explicit CC register result.
+static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op,
+ unsigned Opcode) {
// Copy all operands except the intrinsic ID.
unsigned NumOps = Op.getNumOperands();
SmallVector<SDValue, 6> Ops;
@@ -1646,17 +1686,17 @@ static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
Ops.push_back(Op.getOperand(I));
assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
- SDVTList RawVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDVTList RawVTs = DAG.getVTList(MVT::i32, MVT::Other);
SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
SDValue OldChain = SDValue(Op.getNode(), 1);
- SDValue NewChain = SDValue(Intr.getNode(), 0);
+ SDValue NewChain = SDValue(Intr.getNode(), 1);
DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
- return Intr;
+ return Intr.getNode();
}
-// Emit an intrinsic with a glued value instead of its CC result.
-static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op,
- unsigned Opcode) {
+// Emit an intrinsic with an explicit CC register result.
+static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op,
+ unsigned Opcode) {
// Copy all operands except the intrinsic ID.
unsigned NumOps = Op.getNumOperands();
SmallVector<SDValue, 6> Ops;
@@ -1664,11 +1704,8 @@ static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op,
for (unsigned I = 1; I < NumOps; ++I)
Ops.push_back(Op.getOperand(I));
- if (Op->getNumValues() == 1)
- return DAG.getNode(Opcode, SDLoc(Op), MVT::Glue, Ops);
- assert(Op->getNumValues() == 2 && "Expected exactly one non-CC result");
- SDVTList RawVTs = DAG.getVTList(Op->getValueType(0), MVT::Glue);
- return DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
+ SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), Ops);
+ return Intr.getNode();
}
// CC is a comparison that will be implemented using an integer or
@@ -1699,73 +1736,6 @@ static unsigned CCMaskForCondCode(ISD::CondCode CC) {
#undef CONV
}
-// Return a sequence for getting a 1 from an IPM result when CC has a
-// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
-// The handling of CC values outside CCValid doesn't matter.
-static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
- // Deal with cases where the result can be taken directly from a bit
- // of the IPM result.
- if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
- return IPMConversion(0, 0, SystemZ::IPM_CC);
- if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
- return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
-
- // Deal with cases where we can add a value to force the sign bit
- // to contain the right value. Putting the bit in 31 means we can
- // use SRL rather than RISBG(L), and also makes it easier to get a
- // 0/-1 value, so it has priority over the other tests below.
- //
- // These sequences rely on the fact that the upper two bits of the
- // IPM result are zero.
- uint64_t TopBit = uint64_t(1) << 31;
- if (CCMask == (CCValid & SystemZ::CCMASK_0))
- return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
- return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0
- | SystemZ::CCMASK_1
- | SystemZ::CCMASK_2)))
- return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & SystemZ::CCMASK_3))
- return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_1
- | SystemZ::CCMASK_2
- | SystemZ::CCMASK_3)))
- return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
-
- // Next try inverting the value and testing a bit. 0/1 could be
- // handled this way too, but we dealt with that case above.
- if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
- return IPMConversion(-1, 0, SystemZ::IPM_CC);
-
- // Handle cases where adding a value forces a non-sign bit to contain
- // the right value.
- if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
- return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
- return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
-
- // The remaining cases are 1, 2, 0/1/3 and 0/2/3. All these are
- // can be done by inverting the low CC bit and applying one of the
- // sign-based extractions above.
- if (CCMask == (CCValid & SystemZ::CCMASK_1))
- return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & SystemZ::CCMASK_2))
- return IPMConversion(1 << SystemZ::IPM_CC,
- TopBit - (3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0
- | SystemZ::CCMASK_1
- | SystemZ::CCMASK_3)))
- return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
- if (CCMask == (CCValid & (SystemZ::CCMASK_0
- | SystemZ::CCMASK_2
- | SystemZ::CCMASK_3)))
- return IPMConversion(1 << SystemZ::IPM_CC,
- TopBit - (1 << SystemZ::IPM_CC), 31);
-
- llvm_unreachable("Unexpected CC combination");
-}
-
// If C can be converted to a comparison against zero, adjust the operands
// as necessary.
static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
@@ -2237,6 +2207,24 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
C.CCMask = NewCCMask;
}
+// See whether the comparison argument contains a redundant AND
+// and remove it if so. This sometimes happens due to the generic
+// BRCOND expansion.
+static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
+ Comparison &C) {
+ if (C.Op0.getOpcode() != ISD::AND)
+ return;
+ auto *Mask = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
+ if (!Mask)
+ return;
+ KnownBits Known;
+ DAG.computeKnownBits(C.Op0.getOperand(0), Known);
+ if ((~Known.Zero).getZExtValue() & ~Mask->getZExtValue())
+ return;
+
+ C.Op0 = C.Op0.getOperand(0);
+}
+
// Return a Comparison that tests the condition-code result of intrinsic
// node Call against constant integer CC using comparison code Cond.
// Opcode is the opcode of the SystemZISD operation for the intrinsic
@@ -2311,6 +2299,7 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
else
C.ICmpType = SystemZICMP::SignedOnly;
C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
+ adjustForRedundantAnd(DAG, DL, C);
adjustZeroCmp(DAG, DL, C);
adjustSubwordCmp(DAG, DL, C);
adjustForSubtraction(DAG, DL, C);
@@ -2330,29 +2319,28 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
// Emit the comparison instruction described by C.
static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
if (!C.Op1.getNode()) {
- SDValue Op;
+ SDNode *Node;
switch (C.Op0.getOpcode()) {
case ISD::INTRINSIC_W_CHAIN:
- Op = emitIntrinsicWithChainAndGlue(DAG, C.Op0, C.Opcode);
- break;
+ Node = emitIntrinsicWithCCAndChain(DAG, C.Op0, C.Opcode);
+ return SDValue(Node, 0);
case ISD::INTRINSIC_WO_CHAIN:
- Op = emitIntrinsicWithGlue(DAG, C.Op0, C.Opcode);
- break;
+ Node = emitIntrinsicWithCC(DAG, C.Op0, C.Opcode);
+ return SDValue(Node, Node->getNumValues() - 1);
default:
llvm_unreachable("Invalid comparison operands");
}
- return SDValue(Op.getNode(), Op->getNumValues() - 1);
}
if (C.Opcode == SystemZISD::ICMP)
- return DAG.getNode(SystemZISD::ICMP, DL, MVT::Glue, C.Op0, C.Op1,
+ return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
DAG.getConstant(C.ICmpType, DL, MVT::i32));
if (C.Opcode == SystemZISD::TM) {
bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
- return DAG.getNode(SystemZISD::TM, DL, MVT::Glue, C.Op0, C.Op1,
+ return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
DAG.getConstant(RegisterOnly, DL, MVT::i32));
}
- return DAG.getNode(C.Opcode, DL, MVT::Glue, C.Op0, C.Op1);
+ return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
}
// Implement a 32-bit *MUL_LOHI operation by extending both operands to
@@ -2383,29 +2371,16 @@ static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
}
-// Return an i32 value that is 1 if the CC value produced by Glue is
+// Return an i32 value that is 1 if the CC value produced by CCReg is
// in the mask CCMask and 0 otherwise. CC is known to have a value
// in CCValid, so other values can be ignored.
-static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue Glue,
+static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
unsigned CCValid, unsigned CCMask) {
- IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
- SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
-
- if (Conversion.XORValue)
- Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
- DAG.getConstant(Conversion.XORValue, DL, MVT::i32));
-
- if (Conversion.AddValue)
- Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
- DAG.getConstant(Conversion.AddValue, DL, MVT::i32));
-
- // The SHR/AND sequence should get optimized to an RISBG.
- Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
- DAG.getConstant(Conversion.Bit, DL, MVT::i32));
- if (Conversion.Bit != 31)
- Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
- DAG.getConstant(1, DL, MVT::i32));
- return Result;
+ SDValue Ops[] = { DAG.getConstant(1, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(CCValid, DL, MVT::i32),
+ DAG.getConstant(CCMask, DL, MVT::i32), CCReg };
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
}
// Return the SystemISD vector comparison operation for CC, or 0 if it cannot
@@ -2554,8 +2529,8 @@ SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1);
Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
- SDValue Glue = emitCmp(DAG, DL, C);
- return emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+ SDValue CCReg = emitCmp(DAG, DL, C);
+ return emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
}
SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -2566,10 +2541,10 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
- SDValue Glue = emitCmp(DAG, DL, C);
+ SDValue CCReg = emitCmp(DAG, DL, C);
return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
Op.getOperand(0), DAG.getConstant(C.CCValid, DL, MVT::i32),
- DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, Glue);
+ DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, CCReg);
}
// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
@@ -2619,36 +2594,11 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
}
- SDValue Glue = emitCmp(DAG, DL, C);
-
- // Special case for handling -1/0 results. The shifts we use here
- // should get optimized with the IPM conversion sequence.
- auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
- auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
- if (TrueC && FalseC) {
- int64_t TrueVal = TrueC->getSExtValue();
- int64_t FalseVal = FalseC->getSExtValue();
- if ((TrueVal == -1 && FalseVal == 0) || (TrueVal == 0 && FalseVal == -1)) {
- // Invert the condition if we want -1 on false.
- if (TrueVal == 0)
- C.CCMask ^= C.CCValid;
- SDValue Result = emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
- EVT VT = Op.getValueType();
- // Extend the result to VT. Upper bits are ignored.
- if (!is32Bit(VT))
- Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
- // Sign-extend from the low bit.
- SDValue ShAmt = DAG.getConstant(VT.getSizeInBits() - 1, DL, MVT::i32);
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Result, ShAmt);
- return DAG.getNode(ISD::SRA, DL, VT, Shl, ShAmt);
- }
- }
-
+ SDValue CCReg = emitCmp(DAG, DL, C);
SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32),
- DAG.getConstant(C.CCMask, DL, MVT::i32), Glue};
+ DAG.getConstant(C.CCMask, DL, MVT::i32), CCReg};
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
- return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops);
}
SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
@@ -2757,7 +2707,7 @@ SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SelectionDAG &DAG) const {
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(Node, DAG);
SDLoc DL(Node);
const GlobalValue *GV = Node->getGlobal();
@@ -3266,6 +3216,99 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
MVT::i64, HighOp, Low32);
}
+// Lower SADDO/SSUBO/UADDO/USUBO nodes.
+SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDLoc DL(N);
+ unsigned BaseOp = 0;
+ unsigned CCValid = 0;
+ unsigned CCMask = 0;
+
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown instruction!");
+ case ISD::SADDO:
+ BaseOp = SystemZISD::SADDO;
+ CCValid = SystemZ::CCMASK_ARITH;
+ CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
+ break;
+ case ISD::SSUBO:
+ BaseOp = SystemZISD::SSUBO;
+ CCValid = SystemZ::CCMASK_ARITH;
+ CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
+ break;
+ case ISD::UADDO:
+ BaseOp = SystemZISD::UADDO;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
+ break;
+ case ISD::USUBO:
+ BaseOp = SystemZISD::USUBO;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
+ break;
+ }
+
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+
+ SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
+ if (N->getValueType(1) == MVT::i1)
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
+}
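+// For illustration (editor's note, not from the patch): an (i32, i1)
+// ISD::SADDO now lowers to a SystemZISD::SADDO producing the sum plus a CC
+// value, and the i1 overflow flag is rebuilt via emitSETCC as
+//   SELECT_CCMASK 1, 0, CCMASK_ARITH, CCMASK_ARITH_OVERFLOW, CC
+// followed by a truncate to i1.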
+
+// Lower ADDCARRY/SUBCARRY nodes.
+SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDNode *N = Op.getNode();
+ MVT VT = N->getSimpleValueType(0);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDLoc DL(N);
+ unsigned BaseOp = 0;
+ unsigned CCValid = 0;
+ unsigned CCMask = 0;
+
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown instruction!");
+ case ISD::ADDCARRY:
+ BaseOp = SystemZISD::ADDCARRY;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
+ break;
+ case ISD::SUBCARRY:
+ BaseOp = SystemZISD::SUBCARRY;
+ CCValid = SystemZ::CCMASK_LOGICAL;
+ CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
+ break;
+ }
+
+ // Set the condition code from the carry flag.
+ Carry = DAG.getNode(SystemZISD::GET_CCMASK, DL, MVT::i32, Carry,
+ DAG.getConstant(CCValid, DL, MVT::i32),
+ DAG.getConstant(CCMask, DL, MVT::i32));
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS, Carry);
+
+ SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
+ if (N->getValueType(1) == MVT::i1)
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
+}
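+// (Editor's note: the incoming carry is typically itself a SELECT_CCMASK
+// result, so GET_CCMASK re-materializes a CC value from it; the
+// combineGET_CCMASK routine added further below elides that round trip when
+// the CC masks are compatible.)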
+
SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -3512,16 +3555,16 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
EVT NarrowVT = Node->getMemoryVT();
EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
if (NarrowVT == WideVT) {
- SDVTList Tys = DAG.getVTList(WideVT, MVT::Other, MVT::Glue);
+ SDVTList Tys = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
DL, Tys, Ops, NarrowVT, MMO);
- SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(2),
+ SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(1));
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
return SDValue();
}
@@ -3546,17 +3589,17 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
DAG.getConstant(0, DL, WideVT), BitShift);
// Construct the ATOMIC_CMP_SWAPW node.
- SDVTList VTList = DAG.getVTList(WideVT, MVT::Other, MVT::Glue);
+ SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
VTList, Ops, NarrowVT, MMO);
- SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(2),
+ SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(1));
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
return SDValue();
}
@@ -3613,12 +3656,10 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
Node->getMemoryVT(), Node->getMemOperand());
}
-// Return an i32 that contains the value of CC immediately after After,
-// whose final operand must be MVT::Glue.
-static SDValue getCCResult(SelectionDAG &DAG, SDNode *After) {
- SDLoc DL(After);
- SDValue Glue = SDValue(After, After->getNumValues() - 1);
- SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+// Convert condition code in CCReg to an i32 value.
+static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
+ SDLoc DL(CCReg);
+ SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
}
@@ -3629,8 +3670,8 @@ SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Opcode, CCValid;
if (isIntrinsicWithCCAndChain(Op, Opcode, CCValid)) {
assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
- SDValue Glued = emitIntrinsicWithChainAndGlue(DAG, Op, Opcode);
- SDValue CC = getCCResult(DAG, Glued.getNode());
+ SDNode *Node = emitIntrinsicWithCCAndChain(DAG, Op, Opcode);
+ SDValue CC = getCCResult(DAG, SDValue(Node, 0));
DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CC);
return SDValue();
}
@@ -3643,13 +3684,12 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opcode, CCValid;
if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
- SDValue Glued = emitIntrinsicWithGlue(DAG, Op, Opcode);
- SDValue CC = getCCResult(DAG, Glued.getNode());
+ SDNode *Node = emitIntrinsicWithCC(DAG, Op, Opcode);
if (Op->getNumValues() == 1)
- return CC;
+ return getCCResult(DAG, SDValue(Node, 0));
assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
- return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued,
- CC);
+ return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
+ SDValue(Node, 0), getCCResult(DAG, SDValue(Node, 1)));
}
unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -3853,20 +3893,34 @@ static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
return nullptr;
}
-// Convert the mask of the given VECTOR_SHUFFLE into a byte-level mask,
+// Convert the mask of the given shuffle op into a byte-level mask,
// as if it had type vNi8.
-static void getVPermMask(ShuffleVectorSDNode *VSN,
+static bool getVPermMask(SDValue ShuffleOp,
SmallVectorImpl<int> &Bytes) {
- EVT VT = VSN->getValueType(0);
+ EVT VT = ShuffleOp.getValueType();
unsigned NumElements = VT.getVectorNumElements();
unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
- Bytes.resize(NumElements * BytesPerElement, -1);
- for (unsigned I = 0; I < NumElements; ++I) {
- int Index = VSN->getMaskElt(I);
- if (Index >= 0)
+
+ if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(ShuffleOp)) {
+ Bytes.resize(NumElements * BytesPerElement, -1);
+ for (unsigned I = 0; I < NumElements; ++I) {
+ int Index = VSN->getMaskElt(I);
+ if (Index >= 0)
+ for (unsigned J = 0; J < BytesPerElement; ++J)
+ Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
+ }
+ return true;
+ }
+ if (SystemZISD::SPLAT == ShuffleOp.getOpcode() &&
+ isa<ConstantSDNode>(ShuffleOp.getOperand(1))) {
+ unsigned Index = ShuffleOp.getConstantOperandVal(1);
+ Bytes.resize(NumElements * BytesPerElement, -1);
+ for (unsigned I = 0; I < NumElements; ++I)
for (unsigned J = 0; J < BytesPerElement; ++J)
Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
+ return true;
}
+ return false;
}
// Bytes is a VPERM-like permute vector, except that -1 is used for
@@ -4035,7 +4089,8 @@ bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
// See whether the bytes we need come from a contiguous part of one
// operand.
SmallVector<int, SystemZ::VectorBytes> OpBytes;
- getVPermMask(cast<ShuffleVectorSDNode>(Op), OpBytes);
+ if (!getVPermMask(Op, OpBytes))
+ break;
int NewByte;
if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
break;
@@ -4217,9 +4272,9 @@ static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
if (!Op.isUndef()) {
uint64_t Value;
if (Op.getOpcode() == ISD::Constant)
- Value = dyn_cast<ConstantSDNode>(Op)->getZExtValue();
+ Value = cast<ConstantSDNode>(Op)->getZExtValue();
else if (Op.getOpcode() == ISD::ConstantFP)
- Value = (dyn_cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
+ Value = (cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
.getZExtValue());
else
return false;
@@ -4245,12 +4300,15 @@ static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
const SDLoc &DL, EVT VT, uint64_t Value,
unsigned BitsPerElement) {
// Signed 16-bit values can be replicated using VREPI.
+ // Mark the constants as opaque or DAGCombiner will convert back to
+ // BUILD_VECTOR.
int64_t SignedValue = SignExtend64(Value, BitsPerElement);
if (isInt<16>(SignedValue)) {
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT,
- DAG.getConstant(SignedValue, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::REPLICATE, DL, VecVT,
+ DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
// See whether rotating the constant left some N places gives a value that
@@ -4266,9 +4324,10 @@ static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
End -= 64 - BitsPerElement;
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT,
- DAG.getConstant(Start, DL, MVT::i32),
- DAG.getConstant(End, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::ROTATE_MASK, DL, VecVT,
+ DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
+ DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
return SDValue();
@@ -4481,8 +4540,9 @@ SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
// priority over other methods below.
uint64_t Mask = 0;
if (tryBuildVectorByteMask(BVN, Mask)) {
- SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(Mask, DL, MVT::i32));
+ SDValue Op = DAG.getNode(
+ SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+ DAG.getConstant(Mask, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
@@ -4597,7 +4657,7 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
Op1.getOpcode() != ISD::BITCAST &&
Op1.getOpcode() != ISD::ConstantFP &&
Op2.getOpcode() == ISD::Constant) {
- uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
+ uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue();
unsigned Mask = VT.getVectorNumElements() - 1;
if (Index <= Mask)
return Op;
@@ -4753,6 +4813,14 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerSDIVREM(Op, DAG);
case ISD::UDIVREM:
return lowerUDIVREM(Op, DAG);
+ case ISD::SADDO:
+ case ISD::SSUBO:
+ case ISD::UADDO:
+ case ISD::USUBO:
+ return lowerXALUO(Op, DAG);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY:
+ return lowerADDSUBCARRY(Op, DAG);
case ISD::OR:
return lowerOR(Op, DAG);
case ISD::CTPOP:
@@ -4881,19 +4949,19 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
SDLoc DL(N);
- SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other, MVT::Glue);
+ SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
lowerI128ToGR128(DAG, N->getOperand(2)),
lowerI128ToGR128(DAG, N->getOperand(3)) };
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP_128,
DL, Tys, Ops, MVT::i128, MMO);
- SDValue Success = emitSETCC(DAG, DL, Res.getValue(2),
+ SDValue Success = emitSETCC(DAG, DL, Res.getValue(1),
SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
Success = DAG.getZExtOrTrunc(Success, DL, N->getValueType(1));
Results.push_back(lowerGR128ToI128(DAG, Res));
Results.push_back(Success);
- Results.push_back(Res.getValue(1));
+ Results.push_back(Res.getValue(2));
break;
}
default:
@@ -4931,6 +4999,13 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(UMUL_LOHI);
OPCODE(SDIVREM);
OPCODE(UDIVREM);
+ OPCODE(SADDO);
+ OPCODE(SSUBO);
+ OPCODE(UADDO);
+ OPCODE(USUBO);
+ OPCODE(ADDCARRY);
+ OPCODE(SUBCARRY);
+ OPCODE(GET_CCMASK);
OPCODE(MVC);
OPCODE(MVC_LOOP);
OPCODE(NC);
@@ -5049,13 +5124,14 @@ SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
if (Opcode == ISD::BITCAST)
// Look through bitcasts.
Op = Op.getOperand(0);
- else if (Opcode == ISD::VECTOR_SHUFFLE &&
+ else if ((Opcode == ISD::VECTOR_SHUFFLE || Opcode == SystemZISD::SPLAT) &&
canTreatAsByteVector(Op.getValueType())) {
// Get a VPERM-like permute mask and see whether the bytes covered
// by the extracted element are a contiguous sequence from one
// source operand.
SmallVector<int, SystemZ::VectorBytes> Bytes;
- getVPermMask(cast<ShuffleVectorSDNode>(Op), Bytes);
+ if (!getVPermMask(Op, Bytes))
+ break;
int First;
if (!getShuffleInput(Bytes, Index * BytesPerElement,
BytesPerElement, First))
@@ -5174,6 +5250,54 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
return SDValue();
}
+SDValue SystemZTargetLowering::combineZERO_EXTEND(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ // Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2')
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (N0.getOpcode() == SystemZISD::SELECT_CCMASK) {
+ auto *TrueOp = dyn_cast<ConstantSDNode>(N0.getOperand(0));
+ auto *FalseOp = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (TrueOp && FalseOp) {
+ SDLoc DL(N0);
+ SDValue Ops[] = { DAG.getConstant(TrueOp->getZExtValue(), DL, VT),
+ DAG.getConstant(FalseOp->getZExtValue(), DL, VT),
+ N0.getOperand(2), N0.getOperand(3), N0.getOperand(4) };
+ SDValue NewSelect = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VT, Ops);
+ // If N0 has multiple uses, change other uses as well.
+ if (!N0.hasOneUse()) {
+ SDValue TruncSelect =
+ DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), NewSelect);
+ DCI.CombineTo(N0.getNode(), TruncSelect);
+ }
+ return NewSelect;
+ }
+ }
+ return SDValue();
+}
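+// (Illustrative, editor's note: e.g. (zext i64 (select_ccmask i32 1, 0,
+// Valid, Mask, CC)) becomes (select_ccmask i64 1, 0, Valid, Mask, CC),
+// avoiding a separate extension of the 0/1 result.)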
+
+SDValue SystemZTargetLowering::combineSIGN_EXTEND_INREG(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ // Convert (sext_in_reg (setcc LHS, RHS, COND), i1)
+ // and (sext_in_reg (any_extend (setcc LHS, RHS, COND)), i1)
+ // into (select_cc LHS, RHS, -1, 0, COND)
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::ANY_EXTEND)
+ N0 = N0.getOperand(0);
+ if (EVT == MVT::i1 && N0.hasOneUse() && N0.getOpcode() == ISD::SETCC) {
+ SDLoc DL(N0);
+ SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstant(-1, DL, VT), DAG.getConstant(0, DL, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
+ }
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::combineSIGN_EXTEND(
SDNode *N, DAGCombinerInfo &DCI) const {
// Convert (sext (ashr (shl X, C1), C2)) to
@@ -5249,7 +5373,7 @@ SDValue SystemZTargetLowering::combineSTORE(
// for the extraction to be done on a vMiN value, so that we can use VSTE.
// If X has wider elements then convert it to:
// (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
- if (MemVT.isInteger()) {
+ if (MemVT.isInteger() && SN->isTruncatingStore()) {
if (SDValue Value =
combineTruncateExtract(SDLoc(N), MemVT, SN->getValue(), DCI)) {
DCI.AddToWorklist(Value.getNode());
@@ -5261,9 +5385,7 @@ SDValue SystemZTargetLowering::combineSTORE(
}
}
// Combine STORE (BSWAP) into STRVH/STRV/STRVG
- // See comment in combineBSWAP about volatile accesses.
if (!SN->isTruncatingStore() &&
- !SN->isVolatile() &&
Op1.getOpcode() == ISD::BSWAP &&
Op1.getNode()->hasOneUse() &&
(Op1.getValueType() == MVT::i16 ||
@@ -5364,13 +5486,10 @@ SDValue SystemZTargetLowering::combineBSWAP(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
// Combine BSWAP (LOAD) into LRVH/LRV/LRVG
- // These loads are allowed to access memory multiple times, and so we must check
- // that the loads are not volatile before performing the combine.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
(N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i32 ||
- N->getValueType(0) == MVT::i64) &&
- !cast<LoadSDNode>(N->getOperand(0))->isVolatile()) {
+ N->getValueType(0) == MVT::i64)) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
@@ -5475,11 +5594,157 @@ SDValue SystemZTargetLowering::combineSHIFTROT(
return SDValue();
}
+static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
+ // We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
+ // set by the CCReg instruction using the CCValid / CCMask masks,
+ // If the CCReg instruction is itself a (ICMP (SELECT_CCMASK)) testing
+ // the condition code set by some other instruction, see whether we
+ // can directly use that condition code.
+ bool Invert = false;
+
+ // Verify that we have an appropriate mask for an EQ or NE comparison.
+ if (CCValid != SystemZ::CCMASK_ICMP)
+ return false;
+ if (CCMask == SystemZ::CCMASK_CMP_NE)
+ Invert = !Invert;
+ else if (CCMask != SystemZ::CCMASK_CMP_EQ)
+ return false;
+
+ // Verify that we have an ICMP that is the user of a SELECT_CCMASK.
+ SDNode *ICmp = CCReg.getNode();
+ if (ICmp->getOpcode() != SystemZISD::ICMP)
+ return false;
+ SDNode *Select = ICmp->getOperand(0).getNode();
+ if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
+ return false;
+
+ // Verify that the ICMP compares against one of the select values.
+ auto *CompareVal = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
+ if (!CompareVal)
+ return false;
+ auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
+ if (!TrueVal)
+ return false;
+ auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
+ if (!FalseVal)
+ return false;
+ if (CompareVal->getZExtValue() == FalseVal->getZExtValue())
+ Invert = !Invert;
+ else if (CompareVal->getZExtValue() != TrueVal->getZExtValue())
+ return false;
+
+ // Compute the effective CC mask for the new branch or select.
+ auto *NewCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
+ auto *NewCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
+ if (!NewCCValid || !NewCCMask)
+ return false;
+ CCValid = NewCCValid->getZExtValue();
+ CCMask = NewCCMask->getZExtValue();
+ if (Invert)
+ CCMask ^= CCValid;
+
+ // Return the updated CCReg link.
+ CCReg = Select->getOperand(4);
+ return true;
+}
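+// For example (editor's note, hypothetical DAG): a branch on
+//   (icmp ne (select_ccmask 1, 0, Valid, Mask, CC), 0)
+// can test CC against Valid/Mask directly, since comparing the 0/1 select
+// result with its false value and the inversion for NE cancel out.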
+
+SDValue SystemZTargetLowering::combineBR_CCMASK(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
+ auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!CCValid || !CCMask)
+ return SDValue();
+
+ int CCValidVal = CCValid->getZExtValue();
+ int CCMaskVal = CCMask->getZExtValue();
+ SDValue Chain = N->getOperand(0);
+ SDValue CCReg = N->getOperand(4);
+
+ if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
+ return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
+ Chain,
+ DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
+ DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
+ N->getOperand(3), CCReg);
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineSELECT_CCMASK(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Combine SELECT_CCMASK (ICMP (SELECT_CCMASK)) into a single SELECT_CCMASK.
+ auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(3));
+ if (!CCValid || !CCMask)
+ return SDValue();
+
+ int CCValidVal = CCValid->getZExtValue();
+ int CCMaskVal = CCMask->getZExtValue();
+ SDValue CCReg = N->getOperand(4);
+
+ if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
+ N->getOperand(0),
+ N->getOperand(1),
+ DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
+ DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
+ CCReg);
+ return SDValue();
+}
+
+
+SDValue SystemZTargetLowering::combineGET_CCMASK(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+
+ // Optimize away GET_CCMASK (SELECT_CCMASK) if the CC masks are compatible
+ auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!CCValid || !CCMask)
+ return SDValue();
+ int CCValidVal = CCValid->getZExtValue();
+ int CCMaskVal = CCMask->getZExtValue();
+
+ SDValue Select = N->getOperand(0);
+ if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
+ return SDValue();
+
+ auto *SelectCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
+ auto *SelectCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
+ if (!SelectCCValid || !SelectCCMask)
+ return SDValue();
+ int SelectCCValidVal = SelectCCValid->getZExtValue();
+ int SelectCCMaskVal = SelectCCMask->getZExtValue();
+
+ auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
+ auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
+ if (!TrueVal || !FalseVal)
+ return SDValue();
+ if (TrueVal->getZExtValue() != 0 && FalseVal->getZExtValue() == 0)
+ ;
+ else if (TrueVal->getZExtValue() == 0 && FalseVal->getZExtValue() != 0)
+ SelectCCMaskVal ^= SelectCCValidVal;
+ else
+ return SDValue();
+
+ if (SelectCCValidVal & ~CCValidVal)
+ return SDValue();
+ if (SelectCCMaskVal != (CCMaskVal & SelectCCValidVal))
+ return SDValue();
+
+ return Select->getOperand(4);
+}
+
SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch(N->getOpcode()) {
default: break;
+ case ISD::ZERO_EXTEND: return combineZERO_EXTEND(N, DCI);
case ISD::SIGN_EXTEND: return combineSIGN_EXTEND(N, DCI);
+ case ISD::SIGN_EXTEND_INREG: return combineSIGN_EXTEND_INREG(N, DCI);
case SystemZISD::MERGE_HIGH:
case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI);
case ISD::STORE: return combineSTORE(N, DCI);
@@ -5491,11 +5756,303 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRA:
case ISD::SRL:
case ISD::ROTL: return combineSHIFTROT(N, DCI);
+ case SystemZISD::BR_CCMASK: return combineBR_CCMASK(N, DCI);
+ case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
+ case SystemZISD::GET_CCMASK: return combineGET_CCMASK(N, DCI);
}
return SDValue();
}
+// Return the demanded elements for the OpNo source operand of Op. DemandedElts
+// are for Op.
+static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
+ unsigned OpNo) {
+ EVT VT = Op.getValueType();
+ unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
+ APInt SrcDemE;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ // VECTOR PACK truncates the elements of two source vectors into one.
+ SrcDemE = DemandedElts;
+ if (OpNo == 2)
+ SrcDemE.lshrInPlace(NumElts / 2);
+ SrcDemE = SrcDemE.trunc(NumElts / 2);
+ break;
+ // VECTOR UNPACK extends half the elements of the source vector.
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, 0);
+ break;
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ SrcDemE = APInt(NumElts * 2, 0);
+ SrcDemE.insertBits(DemandedElts, NumElts);
+ break;
+ case Intrinsic::s390_vpdi: {
+ // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
+ SrcDemE = APInt(NumElts, 0);
+ if (!DemandedElts[OpNo - 1])
+ break;
+ unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
+ // Demand input element 0 or 1, given by the mask bit value.
+ SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
+ break;
+ }
+ case Intrinsic::s390_vsldb: {
+ // VECTOR SHIFT LEFT DOUBLE BY BYTE
+ assert(VT == MVT::v16i8 && "Unexpected type.");
+ unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
+ unsigned NumSrc0Els = 16 - FirstIdx;
+ SrcDemE = APInt(NumElts, 0);
+ if (OpNo == 1) {
+ APInt DemEls = DemandedElts.trunc(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, FirstIdx);
+ } else {
+ APInt DemEls = DemandedElts.lshr(NumSrc0Els);
+ SrcDemE.insertBits(DemEls, 0);
+ }
+ break;
+ }
+ case Intrinsic::s390_vperm:
+ SrcDemE = APInt(NumElts, 1);
+ break;
+ default:
+ llvm_unreachable("Unhandled intrinsic.");
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ // Scalar operand.
+ SrcDemE = APInt(1, 1);
+ break;
+ case SystemZISD::SELECT_CCMASK:
+ SrcDemE = DemandedElts;
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode.");
+ break;
+ }
+ }
+ return SrcDemE;
+}
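+// (Editor's example, not in the original: for a VECTOR PACK producing v8i16
+// from two v4i32 sources, demanding result elements 0-3 demands elements
+// 0-3 of the first source, while demanded elements 4-7 map to elements 0-3
+// of the second source after the shift and truncation above.)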
+
+static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ KnownBits LHSKnown(SrcBitWidth), RHSKnown(SrcBitWidth);
+ DAG.computeKnownBits(Op.getOperand(OpNo), LHSKnown, Src0DemE, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(OpNo + 1), RHSKnown, Src1DemE, Depth + 1);
+ Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
+ Known.One = LHSKnown.One & RHSKnown.One;
+}
+
+void
+SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ Known.resetAll();
+
+ // Intrinsic CC result is returned in the two low bits.
+ unsigned tmp0, tmp1; // not used
+ if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
+ Known.Zero.setBitsFrom(2);
+ return;
+ }
+ EVT VT = Op.getValueType();
+ if (Op.getResNo() != 0 || VT == MVT::Untyped)
+ return;
+ assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
+ "KnownBits does not match VT in bitwidth");
+ assert ((!VT.isVector() ||
+ (DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
+ "DemandedElts does not match VT number of elements");
+ unsigned BitWidth = Known.getBitWidth();
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ bool IsLogical = false;
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
+ break;
+ case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
+ case Intrinsic::s390_vuplhh:
+ case Intrinsic::s390_vuplhf:
+ case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
+ case Intrinsic::s390_vupllh:
+ case Intrinsic::s390_vupllf:
+ IsLogical = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue SrcOp = Op.getOperand(1);
+ unsigned SrcBitWidth = SrcOp.getScalarValueSizeInBits();
+ Known = KnownBits(SrcBitWidth);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
+ DAG.computeKnownBits(SrcOp, Known, SrcDemE, Depth + 1);
+ if (IsLogical) {
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ } else
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::JOIN_DWORDS:
+ case SystemZISD::SELECT_CCMASK:
+ computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
+ break;
+ case SystemZISD::REPLICATE: {
+ SDValue SrcOp = Op.getOperand(0);
+ DAG.computeKnownBits(SrcOp, Known, Depth + 1);
+ if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
+ Known = Known.sext(BitWidth); // VREPI sign-extends the immediate.
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ // Known has the width of the source operand(s). Adjust if needed to match
+ // the passed bitwidth.
+ if (Known.getBitWidth() != BitWidth)
+ Known = Known.zextOrTrunc(BitWidth);
+}
+
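+// Compute the minimum number of sign bits across the demanded elements of
+// both source operands of a binary node.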
+static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth,
+ unsigned OpNo) {
+ APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
+ unsigned LHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
+ if (LHS == 1) return 1; // Early out.
+ APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
+ unsigned RHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
+ if (RHS == 1) return 1; // Early out.
+ unsigned Common = std::min(LHS, RHS);
+ unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ if (SrcBitWidth > VTBits) { // PACK
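+ // The packed result elements are truncations of the source elements, so
+ // the extra source bits no longer count as sign bits.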
+ unsigned SrcExtraBits = SrcBitWidth - VTBits;
+ if (Common > SrcExtraBits)
+ return (Common - SrcExtraBits);
+ return 1;
+ }
+ assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
+ return Common;
+}
+
+unsigned
+SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ if (Op.getResNo() != 0)
+ return 1;
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (Id) {
+ case Intrinsic::s390_vpksh: // PACKS
+ case Intrinsic::s390_vpksf:
+ case Intrinsic::s390_vpksg:
+ case Intrinsic::s390_vpkshs: // PACKS_CC
+ case Intrinsic::s390_vpksfs:
+ case Intrinsic::s390_vpksgs:
+ case Intrinsic::s390_vpklsh: // PACKLS
+ case Intrinsic::s390_vpklsf:
+ case Intrinsic::s390_vpklsg:
+ case Intrinsic::s390_vpklshs: // PACKLS_CC
+ case Intrinsic::s390_vpklsfs:
+ case Intrinsic::s390_vpklsgs:
+ case Intrinsic::s390_vpdi:
+ case Intrinsic::s390_vsldb:
+ case Intrinsic::s390_vperm:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
+ case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
+ case Intrinsic::s390_vuphh:
+ case Intrinsic::s390_vuphf:
+ case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
+ case Intrinsic::s390_vuplhw:
+ case Intrinsic::s390_vuplf: {
+ SDValue PackedOp = Op.getOperand(1);
+ APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
+ unsigned Tmp = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ Tmp += VTBits - PackedOp.getScalarValueSizeInBits();
+ return Tmp;
+ }
+ default:
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case SystemZISD::SELECT_CCMASK:
+ return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
+ default:
+ break;
+ }
+ }
+
+ return 1;
+}
+
//===----------------------------------------------------------------------===//
// Custom insertion
//===----------------------------------------------------------------------===//
@@ -5546,34 +6103,141 @@ static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
return Reg;
}
+// The CC operand of MI might be missing a kill marker because there
+// were multiple uses of CC, and ISel didn't know which to mark.
+// Figure out whether MI should have had a kill marker.
+static bool checkCCKill(MachineInstr &MI, MachineBasicBlock *MBB) {
+ // Scan forward through BB for a use/def of CC.
+ MachineBasicBlock::iterator miI(std::next(MachineBasicBlock::iterator(MI)));
+ for (MachineBasicBlock::iterator miE = MBB->end(); miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(SystemZ::CC))
+ return false;
+ if (mi.definesRegister(SystemZ::CC))
+ break; // Should have kill-flag - update below.
+ }
+
+ // If we hit the end of the block, check whether CC is live into a
+ // successor.
+ if (miI == MBB->end()) {
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(SystemZ::CC))
+ return false;
+ }
+
+ return true;
+}
+
+// Return true if it is OK for this Select pseudo-opcode to be cascaded
+// together with other Select pseudo-opcodes into a single basic-block with
+// a conditional jump around it.
+static bool isSelectPseudo(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case SystemZ::Select32:
+ case SystemZ::Select64:
+ case SystemZ::SelectF32:
+ case SystemZ::SelectF64:
+ case SystemZ::SelectF128:
+ case SystemZ::SelectVR32:
+ case SystemZ::SelectVR64:
+ case SystemZ::SelectVR128:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+// Helper function that inserts PHI functions into SinkMBB:
+// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
+// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive Selects
+// in the [MIItBegin, MIItEnd) range.
+static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
+ MachineBasicBlock::iterator MIItEnd,
+ MachineBasicBlock *TrueMBB,
+ MachineBasicBlock *FalseMBB,
+ MachineBasicBlock *SinkMBB) {
+ MachineFunction *MF = TrueMBB->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ unsigned CCValid = MIItBegin->getOperand(3).getImm();
+ unsigned CCMask = MIItBegin->getOperand(4).getImm();
+ DebugLoc DL = MIItBegin->getDebugLoc();
+
+ MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later Selects may reference the results of earlier Selects, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from each earlier PHI's
+ // destination register to the registers that went into that PHI.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned TrueReg = MIIt->getOperand(1).getReg();
+ unsigned FalseReg = MIIt->getOperand(2).getReg();
+
+ // If this Select we are generating is the opposite condition from
+ // the jump we generated, then we have to swap the operands for the
+ // PHI that is going to be generated.
+ if (MIIt->getOperand(4).getImm() == (CCValid ^ CCMask))
+ std::swap(TrueReg, FalseReg);
+
+ if (RegRewriteTable.find(TrueReg) != RegRewriteTable.end())
+ TrueReg = RegRewriteTable[TrueReg].first;
+
+ if (RegRewriteTable.find(FalseReg) != RegRewriteTable.end())
+ FalseReg = RegRewriteTable[FalseReg].second;
+
+ BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(SystemZ::PHI), DestReg)
+ .addReg(TrueReg).addMBB(TrueMBB)
+ .addReg(FalseReg).addMBB(FalseMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(TrueReg, FalseReg);
+ }
+}
+
// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
MachineBasicBlock *
SystemZTargetLowering::emitSelect(MachineInstr &MI,
- MachineBasicBlock *MBB,
- unsigned LOCROpcode) const {
+ MachineBasicBlock *MBB) const {
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned TrueReg = MI.getOperand(1).getReg();
- unsigned FalseReg = MI.getOperand(2).getReg();
unsigned CCValid = MI.getOperand(3).getImm();
unsigned CCMask = MI.getOperand(4).getImm();
DebugLoc DL = MI.getDebugLoc();
- // Use LOCROpcode if possible.
- if (LOCROpcode && Subtarget.hasLoadStoreOnCond()) {
- BuildMI(*MBB, MI, DL, TII->get(LOCROpcode), DestReg)
- .addReg(FalseReg).addReg(TrueReg)
- .addImm(CCValid).addImm(CCMask);
- MI.eraseFromParent();
- return MBB;
- }
+ // If we have a sequence of Select* pseudo instructions using the
+ // same condition code value, we want to expand all of them into
+ // a single pair of basic blocks using the same condition.
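+ // (Selects whose CC mask is the inverse of CCMask are absorbed as well;
+ // createPHIsForSelects swaps the PHI operands for them.)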
+ MachineInstr *LastMI = &MI;
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+
+ if (isSelectPseudo(MI))
+ while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) &&
+ NextMIIt->getOperand(3).getImm() == CCValid &&
+ (NextMIIt->getOperand(4).getImm() == CCMask ||
+ NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) {
+ LastMI = &*NextMIIt;
+ ++NextMIIt;
+ }
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+ // Unless CC was killed in the last Select instruction, mark it as
+ // live-in to both FalseMBB and JoinMBB.
+ if (!LastMI->killsRegister(SystemZ::CC) && !checkCCKill(*LastMI, JoinMBB)) {
+ FalseMBB->addLiveIn(SystemZ::CC);
+ JoinMBB->addLiveIn(SystemZ::CC);
+ }
+
// StartMBB:
// BRC CCMask, JoinMBB
// # fallthrough to FalseMBB
@@ -5592,11 +6256,12 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
// %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
// ...
MBB = JoinMBB;
- BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
- .addReg(TrueReg).addMBB(StartMBB)
- .addReg(FalseReg).addMBB(FalseMBB);
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastMI));
+ createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB);
- MI.eraseFromParent();
+ StartMBB->erase(MIItBegin, MIItEnd);
return JoinMBB;
}
@@ -5658,6 +6323,13 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+ // Unless CC was killed in the CondStore instruction, mark it as
+ // live-in to both FalseMBB and JoinMBB.
+ if (!MI.killsRegister(SystemZ::CC) && !checkCCKill(MI, JoinMBB)) {
+ FalseMBB->addLiveIn(SystemZ::CC);
+ JoinMBB->addLiveIn(SystemZ::CC);
+ }
+
// StartMBB:
// BRC CCMask, JoinMBB
// # fallthrough to FalseMBB
@@ -6223,6 +6895,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
DestBase = MachineOperand::CreateReg(NextDestReg, false);
SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
Length &= 255;
+ if (EndMBB && !Length)
+ // If the loop handled the whole CLC range, DoneMBB will be empty with
+ // CC live-through into EndMBB, so add it as live-in.
+ DoneMBB->addLiveIn(SystemZ::CC);
MBB = DoneMBB;
}
// Handle any remaining bytes with straight-line code.
@@ -6415,18 +7091,15 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
switch (MI.getOpcode()) {
- case SystemZ::Select32Mux:
- return emitSelect(MI, MBB,
- Subtarget.hasLoadStoreOnCond2()? SystemZ::LOCRMux : 0);
case SystemZ::Select32:
- return emitSelect(MI, MBB, SystemZ::LOCR);
case SystemZ::Select64:
- return emitSelect(MI, MBB, SystemZ::LOCGR);
case SystemZ::SelectF32:
case SystemZ::SelectF64:
case SystemZ::SelectF128:
+ case SystemZ::SelectVR32:
+ case SystemZ::SelectVR64:
case SystemZ::SelectVR128:
- return emitSelect(MI, MBB, 0);
+ return emitSelect(MI, MBB);
case SystemZ::CondStore8Mux:
return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
@@ -6675,6 +7348,10 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::LTXBRCompare_VecPseudo:
return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, MBB);
+
default:
llvm_unreachable("Unexpected instr type to insert");
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 2cdc88db5a4d..0ca93a38a016 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -93,6 +93,19 @@ enum NodeType : unsigned {
SDIVREM,
UDIVREM,
+ // Add/subtract with overflow/carry. These have the same operands as
+ // the corresponding standard operations, except with the carry flag
+ // replaced by a condition code value.
+ SADDO, SSUBO, UADDO, USUBO, ADDCARRY, SUBCARRY,
+
+ // Set the condition code from a boolean value in operand 0.
+ // Operand 1 is a mask of all condition-code values that may result from this
+ // operation, and operand 2 is a mask of condition-code values that may result
+ // if the boolean is true.
+ // Note that this operation is always optimized away; we will never
+ // generate any code for it.
+ GET_CCMASK,
+
// Use a series of MVCs to copy bytes from one memory location to another.
// The operands are:
// - the target address
@@ -142,11 +155,11 @@ enum NodeType : unsigned {
// Transaction begin. The first operand is the chain, the second
// the TDB pointer, and the third the immediate control field.
- // Returns chain and glue.
+ // Returns CC value and chain.
TBEGIN,
TBEGIN_NOFLOAT,
- // Transaction end. Just the chain operand. Returns chain and glue.
+ // Transaction end. Just the chain operand. Returns CC value and chain.
TEND,
// Create a vector constant by filling byte N of the result with bit
@@ -308,8 +321,8 @@ enum NodeType : unsigned {
// Operand 5: the width of the field in bits (8 or 16)
ATOMIC_CMP_SWAPW,
- // Atomic compare-and-swap returning glue (condition code).
- // Val, OUTCHAIN, glue = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
+ // Atomic compare-and-swap returning CC value.
+ // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
ATOMIC_CMP_SWAP,
// 128-bit atomic load.
@@ -321,7 +334,7 @@ enum NodeType : unsigned {
ATOMIC_STORE_128,
// 128-bit atomic compare-and-swap.
- // Val, OUTCHAIN, glue = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
+ // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
ATOMIC_CMP_SWAP_128,
// Byte swapping load.
@@ -470,6 +483,7 @@ public:
SelectionDAG &DAG) const override;
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const override;
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
bool allowTruncateForTailCall(Type *, Type *) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
@@ -490,6 +504,20 @@ public:
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ /// Determine which of the bits specified in Mask are known to be either
+ /// zero or one and return them in the KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::ANY_EXTEND;
}
@@ -533,6 +561,8 @@ private:
SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXALUO(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
@@ -563,7 +593,9 @@ private:
bool Force) const;
SDValue combineTruncateExtract(const SDLoc &DL, EVT TruncVT, SDValue Op,
DAGCombinerInfo &DCI) const;
+ SDValue combineZERO_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -571,6 +603,9 @@ private:
SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
// If the last instruction before MBBI in MBB was some form of COMPARE,
// try to replace it with a COMPARE AND BRANCH just before MBBI.
@@ -582,8 +617,7 @@ private:
MachineBasicBlock *Target) const;
// Implement EmitInstrWithCustomInserter for individual operation types.
- MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB,
- unsigned LOCROpcode) const;
+ MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB,
unsigned StoreOpcode, unsigned STOCOpcode,
bool Invert) const;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 16edbea87cda..4e47752ed122 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -15,6 +15,10 @@
//===----------------------------------------------------------------------===//
// C's ?: operator for floating-point operands.
+let Predicates = [FeatureVector] in {
+ def SelectVR32 : SelectWrapper<f32, VR32>;
+ def SelectVR64 : SelectWrapper<f64, VR64>;
+}
def SelectF32 : SelectWrapper<f32, FP32>;
def SelectF64 : SelectWrapper<f64, FP64>;
let Predicates = [FeatureNoVectorEnhancements1] in
@@ -65,7 +69,7 @@ let Predicates = [FeatureNoVector] in {
// Use a normal load-and-test for compare against zero in case of
// vector support (via a pseudo to simplify instruction selection).
-let Defs = [CC], usesCustomInserter = 1 in {
+let Defs = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>;
def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>;
def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 06da66ad8764..e3f9a9645d13 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2469,7 +2469,7 @@ class StoreVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes, bits<4> type = 0>
: InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2),
mnemonic#"\t$V1, $XBD2",
- [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2)))]> {
+ [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2))]> {
let M3 = type;
let mayStore = 1;
let AccessBytes = bytes;
@@ -2844,7 +2844,7 @@ class UnaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, Immediate imm, bits<4> type = 0>
: InstVRIa<opcode, (outs tr.op:$V1), (ins imm:$I2),
mnemonic#"\t$V1, $I2",
- [(set tr.op:$V1, (tr.vt (operator imm:$I2)))]> {
+ [(set (tr.vt tr.op:$V1), (operator imm:$I2))]> {
let M3 = type;
}
@@ -2857,7 +2857,7 @@ class UnaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
bits<4> m5 = 0>
: InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2),
mnemonic#"\t$V1, $V2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2)))]> {
let M3 = type;
let M4 = m4;
let M5 = m5;
@@ -2913,7 +2913,7 @@ class UnaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes, bits<4> type = 0>
: InstVRX<opcode, (outs tr.op:$V1), (ins bdxaddr12only:$XBD2),
mnemonic#"\t$V1, $XBD2",
- [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2)))]> {
+ [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2))]> {
let M3 = type;
let mayLoad = 1;
let AccessBytes = bytes;
@@ -3132,7 +3132,9 @@ class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFc<opcode, (outs cls1:$R1),
(ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3),
- mnemonic#"$M3\t$R1, $R2", []> {
+ mnemonic#"$M3\t$R1, $R2",
+ [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
+ cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
let CCMaskLast = 1;
@@ -3385,7 +3387,7 @@ class BinaryVRIb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<4> type>
: InstVRIb<opcode, (outs tr.op:$V1), (ins imm32zx8:$I2, imm32zx8:$I3),
mnemonic#"\t$V1, $I2, $I3",
- [(set tr.op:$V1, (tr.vt (operator imm32zx8:$I2, imm32zx8:$I3)))]> {
+ [(set (tr.vt tr.op:$V1), (operator imm32zx8:$I2, imm32zx8:$I3))]> {
let M4 = type;
}
@@ -3398,8 +3400,8 @@ class BinaryVRIc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type>
: InstVRIc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, imm32zx16:$I2),
mnemonic#"\t$V1, $V3, $I2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V3),
- imm32zx16:$I2)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V3),
+ imm32zx16:$I2))]> {
let M4 = type;
}
@@ -3412,8 +3414,8 @@ class BinaryVRIe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m5>
: InstVRIe<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx12:$I3),
mnemonic#"\t$V1, $V2, $I3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- imm32zx12:$I3)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ imm32zx12:$I3))]> {
let M4 = type;
let M5 = m5;
}
@@ -3432,8 +3434,8 @@ class BinaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0>
: InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx4:$M5),
mnemonic#"\t$V1, $V2, $M5",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- imm32zx12:$M5)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ imm32zx12:$M5))]> {
let M3 = type;
let M4 = m4;
}
@@ -3448,8 +3450,8 @@ class BinaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
bits<4> modifier = 0>
: InstVRRb<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
mnemonic#"\t$V1, $V2, $V3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3)))]> {
let M4 = type;
let M5 = modifier;
}
@@ -3507,8 +3509,8 @@ class BinaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
bits<4> m6 = 0>
: InstVRRc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
mnemonic#"\t$V1, $V2, $V3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3)))]> {
let M4 = type;
let M5 = m5;
let M6 = m6;
@@ -3554,7 +3556,7 @@ class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr>
: InstVRRf<opcode, (outs tr.op:$V1), (ins GR64:$R2, GR64:$R3),
mnemonic#"\t$V1, $R2, $R3",
- [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>;
+ [(set (tr.vt tr.op:$V1), (operator GR64:$R2, GR64:$R3))]>;
class BinaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3),
@@ -3564,8 +3566,8 @@ class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type>
: InstVRSa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, shift12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V3),
- shift12only:$BD2)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V3),
+ shift12only:$BD2))]> {
let M4 = type;
}
@@ -3610,8 +3612,8 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes>
: InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
mnemonic#"\t$V1, $XBD2, $M3",
- [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2,
- imm32zx4:$M3)))]> {
+ [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2,
+ imm32zx4:$M3))]> {
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -3688,7 +3690,7 @@ class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2),
mnemonic#"\t$R1, $R2",
- [(operator cls1:$R1, cls2:$R2)]> {
+ [(set CC, (operator cls1:$R1, cls2:$R2))]> {
let OpKey = mnemonic#cls1;
let OpType = "reg";
let isCompare = 1;
@@ -3698,7 +3700,7 @@ class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
mnemonic#"\t$R1, $R2",
- [(operator cls1:$R1, cls2:$R2)]> {
+ [(set CC, (operator cls1:$R1, cls2:$R2))]> {
let OpKey = mnemonic#cls1;
let OpType = "reg";
let isCompare = 1;
@@ -3708,7 +3710,7 @@ class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRIa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#"\t$R1, $I2",
- [(operator cls:$R1, imm:$I2)]> {
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
@@ -3716,7 +3718,7 @@ class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRILa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#"\t$R1, $I2",
- [(operator cls:$R1, imm:$I2)]> {
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
@@ -3724,7 +3726,7 @@ class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load>
: InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
mnemonic#"\t$R1, $RI2",
- [(operator cls:$R1, (load pcrel32:$RI2))]> {
+ [(set CC, (operator cls:$R1, (load pcrel32:$RI2)))]> {
let isCompare = 1;
let mayLoad = 1;
// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
@@ -3738,7 +3740,7 @@ class CompareRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
AddressingMode mode = bdxaddr12only>
: InstRXa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, (load mode:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load mode:$XBD2)))]> {
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let isCompare = 1;
@@ -3750,7 +3752,7 @@ class CompareRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
: InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, (load bdxaddr12only:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load bdxaddr12only:$XBD2)))]> {
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let isCompare = 1;
@@ -3764,7 +3766,7 @@ class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
AddressingMode mode = bdxaddr20only>
: InstRXYa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, (load mode:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load mode:$XBD2)))]> {
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let isCompare = 1;
@@ -3824,7 +3826,7 @@ class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
AddressingMode mode = bdaddr12only>
: InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator (load mode:$BD1), imm:$I2)]> {
+ [(set CC, (operator (load mode:$BD1), imm:$I2))]> {
let isCompare = 1;
let mayLoad = 1;
}
@@ -3833,7 +3835,7 @@ class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
SDPatternOperator load, Immediate imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator (load bdaddr12only:$BD1), imm:$I2)]> {
+ [(set CC, (operator (load bdaddr12only:$BD1), imm:$I2))]> {
let isCompare = 1;
let mayLoad = 1;
}
@@ -3843,7 +3845,7 @@ class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
AddressingMode mode = bdaddr20only>
: InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator (load mode:$BD1), imm:$I2)]> {
+ [(set CC, (operator (load mode:$BD1), imm:$I2))]> {
let isCompare = 1;
let mayLoad = 1;
}
@@ -3864,7 +3866,7 @@ class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<4> type>
: InstVRRa<opcode, (outs), (ins tr.op:$V1, tr.op:$V2),
mnemonic#"\t$V1, $V2",
- [(operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2))]> {
+ [(set CC, (operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2)))]> {
let isCompare = 1;
let M3 = type;
let M4 = 0;
@@ -3893,14 +3895,26 @@ class CompareVRRh<string mnemonic, bits<16> opcode>
let isCompare = 1;
}
+class TestInherentS<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator>
+ : InstS<opcode, (outs), (ins), mnemonic, [(set CC, (operator))]> {
+ let BD2 = 0;
+}
+
class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls>
: InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
mnemonic#"\t$R1, $XBD2",
- [(operator cls:$R1, bdxaddr12only:$XBD2)]> {
+ [(set CC, (operator cls:$R1, bdxaddr12only:$XBD2))]> {
let M3 = 0;
}
+class TestBinarySIL<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, Immediate imm>
+ : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>;
+
class TestRSL<string mnemonic, bits<16> opcode>
: InstRSLa<opcode, (outs), (ins bdladdr12onlylen4:$BDL1),
mnemonic#"\t$BDL1", []> {
@@ -4097,8 +4111,8 @@ class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index>
: InstVRIa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V1src, imm:$I2, index:$M3),
mnemonic#"\t$V1, $I2, $M3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- imm:$I2, index:$M3)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ imm:$I2, index:$M3))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
}
@@ -4108,9 +4122,9 @@ class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRId<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
mnemonic#"\t$V1, $V2, $V3, $I4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx8:$I4)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx8:$I4))]> {
let M5 = type;
}
@@ -4124,9 +4138,9 @@ class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRa<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, imm32zx4:$M4, imm32zx4:$M5),
mnemonic#"\t$V1, $V2, $M4, $M5",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- imm32zx4:$M4,
- imm32zx4:$M5)))],
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ imm32zx4:$M4,
+ imm32zx4:$M5))],
m4or> {
let M3 = type;
}
@@ -4142,9 +4156,9 @@ class TernaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRb<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, m5mask:$M5),
mnemonic#"\t$V1, $V2, $V3, $M5",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- m5mask:$M5)))],
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ m5mask:$M5))],
m5or> {
let M4 = type;
}
@@ -4184,9 +4198,9 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRc<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M4),
mnemonic#"\t$V1, $V2, $V3, $M4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx4:$M4)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx4:$M4))]> {
let M5 = 0;
let M6 = 0;
}
@@ -4197,9 +4211,9 @@ class TernaryVRRcFloat<string mnemonic, bits<16> opcode,
: InstVRRc<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M6),
mnemonic#"\t$V1, $V2, $V3, $M6",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx4:$M6)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx4:$M6))]> {
let M4 = type;
let M5 = m5;
}
@@ -4215,9 +4229,9 @@ class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRd<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
mnemonic#"\t$V1, $V2, $V3, $V4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- (tr1.vt tr1.op:$V4))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ (tr1.vt tr1.op:$V4)))]> {
let M5 = type;
let M6 = 0;
}
@@ -4234,9 +4248,9 @@ class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRRe<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
mnemonic#"\t$V1, $V2, $V3, $V4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- (tr1.vt tr1.op:$V4))))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ (tr1.vt tr1.op:$V4)))]> {
let M5 = m5;
let M6 = type;
}
@@ -4251,9 +4265,9 @@ class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRSb<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V1src, cls:$R3, shift12only:$BD2),
mnemonic#"\t$V1, $R3, $BD2",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- cls:$R3,
- shift12only:$BD2)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ cls:$R3,
+ shift12only:$BD2))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
let M4 = type;
@@ -4283,9 +4297,9 @@ class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRX<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V1src, bdxaddr12only:$XBD2, index:$M3),
mnemonic#"\t$V1, $XBD2, $M3",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- bdxaddr12only:$XBD2,
- index:$M3)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ bdxaddr12only:$XBD2,
+ index:$M3))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
let mayLoad = 1;
@@ -4297,10 +4311,10 @@ class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operato
: InstVRId<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V1src, tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
mnemonic#"\t$V1, $V2, $V3, $I4",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
- (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- imm32zx8:$I4)))]> {
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
+ (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx8:$I4))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
let M5 = type;
@@ -4334,10 +4348,10 @@ class QuaternaryVRRd<string mnemonic, bits<16> opcode,
: InstVRRd<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr3.op:$V3, tr4.op:$V4, m6mask:$M6),
mnemonic#"\t$V1, $V2, $V3, $V4, $M6",
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr3.vt tr3.op:$V3),
- (tr4.vt tr4.op:$V4),
- m6mask:$M6)))],
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
+ (tr3.vt tr3.op:$V3),
+ (tr4.vt tr4.op:$V4),
+ m6mask:$M6))],
m6or> {
let M5 = type;
}
@@ -4527,11 +4541,6 @@ class Pseudo<dag outs, dag ins, list<dag> pattern>
let isCodeGenOnly = 1;
}
-// Like SideEffectBinarySIL, but expanded later.
-class SideEffectBinarySILPseudo<SDPatternOperator operator, Immediate imm>
- : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2),
- [(operator bdaddr12only:$BD1, imm:$I2)]>;
-
// Like UnaryRI, but expanded after RA depending on the choice of register.
class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
@@ -4591,7 +4600,8 @@ multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
// Like CompareRI, but expanded after RA depending on the choice of register.
class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
- : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+ : Pseudo<(outs), (ins cls:$R1, imm:$I2),
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
@@ -4600,18 +4610,25 @@ class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
SDPatternOperator load, bits<5> bytes,
AddressingMode mode = bdxaddr20only>
: Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
- [(operator cls:$R1, (load mode:$XBD2))]> {
+ [(set CC, (operator cls:$R1, (load mode:$XBD2)))]> {
let mayLoad = 1;
let Has20BitOffset = 1;
let HasIndex = 1;
let AccessBytes = bytes;
}
+// Like TestBinarySIL, but expanded later.
+class TestBinarySILPseudo<SDPatternOperator operator, Immediate imm>
+ : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2),
+ [(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>;
+
// Like CondBinaryRRF, but expanded after RA depending on the choice of
// register.
class CondBinaryRRFPseudo<RegisterOperand cls1, RegisterOperand cls2>
: Pseudo<(outs cls1:$R1),
- (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3), []> {
+ (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3),
+ [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
+ cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
let CCMaskLast = 1;
@@ -4685,17 +4702,14 @@ class SelectWrapper<ValueType vt, RegisterOperand cls>
[(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2,
imm32zx4:$valid, imm32zx4:$cc))]> {
let usesCustomInserter = 1;
- // Although the instructions used by these nodes do not in themselves
- // change CC, the insertion requires new blocks, and CC cannot be live
- // across them.
- let Defs = [CC];
+ let hasNoSchedulingInfo = 1;
let Uses = [CC];
}
// Stores $new to $addr if $cc is true ("" case) or false (Inv case).
multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
SDPatternOperator load, AddressingMode mode> {
- let Defs = [CC], Uses = [CC], usesCustomInserter = 1,
+ let Uses = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1,
mayLoad = 1, mayStore = 1 in {
def "" : Pseudo<(outs),
(ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
@@ -4765,7 +4779,7 @@ class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
multiclass MemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
def "" : SideEffectBinarySSa<mnemonic, opcode>;
- let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in {
def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
imm64:$length),
[(sequence bdaddr12only:$dest, bdaddr12only:$src,
@@ -4777,6 +4791,22 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
}
}
+// The same, but setting a CC result as a comparison operator.
+multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
+ SDPatternOperator sequence, SDPatternOperator loop> {
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length))]>;
+ def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256),
+ [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256))]>;
+ }
+}
+
// Define an instruction that operates on two strings, both terminated
// by the character in R0. The instruction processes a CPU-determined
// number of bytes at a time and sets CC to 3 if the instruction needs
@@ -4809,13 +4839,13 @@ class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
// An alias of a UnaryVRR*, but with different register sizes.
class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
: Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
- [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
+ [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2)))]>;
// An alias of a UnaryVRX, but with different register sizes.
class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
AddressingMode mode = bdxaddr12only>
: Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
- [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
+ [(set (tr.vt tr.op:$V1), (operator mode:$XBD2))]>;
// An alias of a StoreVRX, but with different register sizes.
class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
@@ -4846,7 +4876,8 @@ class BinaryAliasVRRf<RegisterOperand cls>
// An alias of a CompareRI, but with different register sizes.
class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
- : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+ : Alias<4, (outs), (ins cls:$R1, imm:$I2),
+ [(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 572446c1aa12..f0f9211efd5d 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -389,7 +389,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator instruction, we're
@@ -479,7 +479,7 @@ unsigned SystemZInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (!I->isBranch())
break;
@@ -906,6 +906,23 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ // Move CC value from/to a GR32.
+ if (SrcReg == SystemZ::CC) {
+ auto MIB = BuildMI(MBB, MBBI, DL, get(SystemZ::IPM), DestReg);
+ if (KillSrc) {
+ const MachineFunction *MF = MBB.getParent();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ MIB->addRegisterKilled(SrcReg, TRI);
+ }
+ return;
+ }
+ if (DestReg == SystemZ::CC) {
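+ // A TMLH of the two bits written by IPM sets CC to exactly the value held
+ // in those bits, restoring the condition code from the GR32.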
+ BuildMI(MBB, MBBI, DL, get(SystemZ::TMLH))
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(3 << (SystemZ::IPM_CC - 16));
+ return;
+ }
+
// Everything else needs only one instruction.
unsigned Opcode;
if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
@@ -1174,6 +1191,36 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
return BuiltMI;
}
+ if ((Opcode == SystemZ::ALFI && OpNum == 0 &&
+ isInt<8>((int32_t)MI.getOperand(2).getImm())) ||
+ (Opcode == SystemZ::ALGFI && OpNum == 0 &&
+ isInt<8>((int64_t)MI.getOperand(2).getImm()))) {
+ // AL(G)FI %reg, CONST -> AL(G)SI %mem, CONST
+ Opcode = (Opcode == SystemZ::ALFI ? SystemZ::ALSI : SystemZ::ALGSI);
+ MachineInstr *BuiltMI =
+ BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(Opcode))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm((int8_t)MI.getOperand(2).getImm());
+ transferDeadCC(&MI, BuiltMI);
+ return BuiltMI;
+ }
+
+ if ((Opcode == SystemZ::SLFI && OpNum == 0 &&
+ isInt<8>((int32_t)-MI.getOperand(2).getImm())) ||
+ (Opcode == SystemZ::SLGFI && OpNum == 0 &&
+ isInt<8>((int64_t)-MI.getOperand(2).getImm()))) {
+ // SL(G)FI %reg, CONST -> AL(G)SI %mem, -CONST
+ Opcode = (Opcode == SystemZ::SLFI ? SystemZ::ALSI : SystemZ::ALGSI);
+ MachineInstr *BuiltMI =
+ BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(Opcode))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm((int8_t)-MI.getOperand(2).getImm());
+ transferDeadCC(&MI, BuiltMI);
+ return BuiltMI;
+ }
+
if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
bool Op0IsGPR = (Opcode == SystemZ::LGDR);
bool Op1IsGPR = (Opcode == SystemZ::LDGR);
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index abb804597f4e..9d7312269957 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -325,9 +325,10 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
// Select instructions
//===----------------------------------------------------------------------===//
-def Select32Mux : SelectWrapper<i32, GRX32>, Requires<[FeatureHighWord]>;
-def Select32 : SelectWrapper<i32, GR32>;
-def Select64 : SelectWrapper<i64, GR64>;
+def Select32 : SelectWrapper<i32, GR32>,
+ Requires<[FeatureNoLoadStoreOnCond]>;
+def Select64 : SelectWrapper<i64, GR64>,
+ Requires<[FeatureNoLoadStoreOnCond]>;
// We don't define 32-bit Mux stores if we don't have STOCFH, because the
// low-only STOC should then always be used if possible.
@@ -495,7 +496,7 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
defm LOCHI : CondBinaryRIEPair<"lochi", 0xEC42, GR32, imm32sx16>;
defm LOCGHI : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>;
- // Move register on condition. Expanded from Select* pseudos and
+ // Move register on condition. Matched via DAG pattern and
// created by early if-conversion.
let isCommutable = 1 in {
// Expands to LOCR or LOCFHR or a branch-and-move sequence,
@@ -530,7 +531,7 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
}
let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
- // Move register on condition. Expanded from Select* pseudos and
+ // Move register on condition. Matched via DAG pattern and
// created by early if-conversion.
let isCommutable = 1 in {
defm LOCR : CondBinaryRRFPair<"locr", 0xB9F2, GR32, GR32>;
@@ -681,7 +682,7 @@ let Predicates = [FeatureLoadAndTrap], hasSideEffects = 1 in {
}
// Extend GR64s to GR128s.
-let usesCustomInserter = 1 in
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def ZEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
//===----------------------------------------------------------------------===//
@@ -693,7 +694,7 @@ def : Pat<(i64 (anyext GR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
// Extend GR64s to GR128s.
-let usesCustomInserter = 1 in
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def AEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
//===----------------------------------------------------------------------===//
@@ -890,12 +891,12 @@ def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
// Addition
//===----------------------------------------------------------------------===//
-// Plain addition.
+// Addition producing a signed overflow flag.
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Addition of a register.
let isCommutable = 1 in {
- defm AR : BinaryRRAndK<"ar", 0x1A, 0xB9F8, add, GR32, GR32>;
- defm AGR : BinaryRREAndK<"agr", 0xB908, 0xB9E8, add, GR64, GR64>;
+ defm AR : BinaryRRAndK<"ar", 0x1A, 0xB9F8, z_sadd, GR32, GR32>;
+ defm AGR : BinaryRREAndK<"agr", 0xB908, 0xB9E8, z_sadd, GR64, GR64>;
}
def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
@@ -906,38 +907,38 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
Requires<[FeatureHighWord]>;
// Addition of signed 16-bit immediates.
- defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
- defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>;
- defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
+ defm AHIMux : BinaryRIAndKPseudo<"ahimux", z_sadd, GRX32, imm32sx16>;
+ defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, z_sadd, GR32, imm32sx16>;
+ defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, z_sadd, GR64, imm64sx16>;
// Addition of signed 32-bit immediates.
- def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+ def AFIMux : BinaryRIPseudo<z_sadd, GRX32, simm32>,
Requires<[FeatureHighWord]>;
- def AFI : BinaryRIL<"afi", 0xC29, add, GR32, simm32>;
- def AIH : BinaryRIL<"aih", 0xCC8, add, GRH32, simm32>,
+ def AFI : BinaryRIL<"afi", 0xC29, z_sadd, GR32, simm32>;
+ def AIH : BinaryRIL<"aih", 0xCC8, z_sadd, GRH32, simm32>,
Requires<[FeatureHighWord]>;
- def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
+ def AGFI : BinaryRIL<"agfi", 0xC28, z_sadd, GR64, imm64sx32>;
// Addition of memory.
- defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
- defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>;
- def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>,
+ defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>;
+ defm A : BinaryRXPair<"a", 0x5A, 0xE35A, z_sadd, GR32, load, 4>;
+ def AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
- def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
- def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>;
+ def AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>;
+ def AG : BinaryRXY<"ag", 0xE308, z_sadd, GR64, load, 8>;
// Addition to memory.
def ASI : BinarySIY<"asi", 0xEB6A, add, imm32sx8>;
def AGSI : BinarySIY<"agsi", 0xEB7A, add, imm64sx8>;
}
-defm : SXB<add, GR64, AGFR>;
+defm : SXB<z_sadd, GR64, AGFR>;
// Addition producing a carry.
let Defs = [CC] in {
// Addition of a register.
let isCommutable = 1 in {
- defm ALR : BinaryRRAndK<"alr", 0x1E, 0xB9FA, addc, GR32, GR32>;
- defm ALGR : BinaryRREAndK<"algr", 0xB90A, 0xB9EA, addc, GR64, GR64>;
+ defm ALR : BinaryRRAndK<"alr", 0x1E, 0xB9FA, z_uadd, GR32, GR32>;
+ defm ALGR : BinaryRREAndK<"algr", 0xB90A, 0xB9EA, z_uadd, GR64, GR64>;
}
def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
@@ -948,56 +949,56 @@ let Defs = [CC] in {
Requires<[FeatureHighWord]>;
// Addition of signed 16-bit immediates.
- def ALHSIK : BinaryRIE<"alhsik", 0xECDA, addc, GR32, imm32sx16>,
+ def ALHSIK : BinaryRIE<"alhsik", 0xECDA, z_uadd, GR32, imm32sx16>,
Requires<[FeatureDistinctOps]>;
- def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, addc, GR64, imm64sx16>,
+ def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, z_uadd, GR64, imm64sx16>,
Requires<[FeatureDistinctOps]>;
// Addition of unsigned 32-bit immediates.
- def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>;
- def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+ def ALFI : BinaryRIL<"alfi", 0xC2B, z_uadd, GR32, uimm32>;
+ def ALGFI : BinaryRIL<"algfi", 0xC2A, z_uadd, GR64, imm64zx32>;
// Addition of signed 32-bit immediates.
def ALSIH : BinaryRIL<"alsih", 0xCCA, null_frag, GRH32, simm32>,
Requires<[FeatureHighWord]>;
// Addition of memory.
- defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
- def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
- def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
+ defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
+ def ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>;
+ def ALG : BinaryRXY<"alg", 0xE30A, z_uadd, GR64, load, 8>;
// Addition to memory.
def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>;
}
-defm : ZXB<addc, GR64, ALGFR>;
+defm : ZXB<z_uadd, GR64, ALGFR>;
// Addition producing and using a carry.
let Defs = [CC], Uses = [CC] in {
// Addition of a register.
- def ALCR : BinaryRRE<"alcr", 0xB998, adde, GR32, GR32>;
- def ALCGR : BinaryRRE<"alcgr", 0xB988, adde, GR64, GR64>;
+ def ALCR : BinaryRRE<"alcr", 0xB998, z_addcarry, GR32, GR32>;
+ def ALCGR : BinaryRRE<"alcgr", 0xB988, z_addcarry, GR64, GR64>;
// Addition of memory.
- def ALC : BinaryRXY<"alc", 0xE398, adde, GR32, load, 4>;
- def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
+ def ALC : BinaryRXY<"alc", 0xE398, z_addcarry, GR32, load, 4>;
+ def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, load, 8>;
}
// Addition that does not modify the condition code.
def ALSIHN : BinaryRIL<"alsihn", 0xCCB, null_frag, GRH32, simm32>,
Requires<[FeatureHighWord]>;
+
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-// Plain subtraction. Although immediate forms exist, we use the
-// add-immediate instruction instead.
+// Subtraction producing a signed overflow flag.
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Subtraction of a register.
- defm SR : BinaryRRAndK<"sr", 0x1B, 0xB9F9, sub, GR32, GR32>;
+ defm SR : BinaryRRAndK<"sr", 0x1B, 0xB9F9, z_ssub, GR32, GR32>;
def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
- defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>;
+ defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, z_ssub, GR64, GR64>;
// Subtraction from a high register.
def SHHHR : BinaryRRFa<"shhhr", 0xB9C9, null_frag, GRH32, GRH32, GRH32>,
@@ -1006,21 +1007,39 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
Requires<[FeatureHighWord]>;
// Subtraction of memory.
- defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
- defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
- def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>,
+ defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>;
+ defm S : BinaryRXPair<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
+ def SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
- def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
- def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>;
+ def SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>;
+ def SG : BinaryRXY<"sg", 0xE309, z_ssub, GR64, load, 8>;
+}
+defm : SXB<z_ssub, GR64, SGFR>;
+
+// Subtracting an immediate is the same as adding the negated immediate.
+let AddedComplexity = 1 in {
+ def : Pat<(z_ssub GR32:$src1, imm32sx16n:$src2),
+ (AHIMux GR32:$src1, imm32sx16n:$src2)>,
+ Requires<[FeatureHighWord]>;
+ def : Pat<(z_ssub GR32:$src1, simm32n:$src2),
+ (AFIMux GR32:$src1, simm32n:$src2)>,
+ Requires<[FeatureHighWord]>;
+ def : Pat<(z_ssub GR32:$src1, imm32sx16n:$src2),
+ (AHI GR32:$src1, imm32sx16n:$src2)>;
+ def : Pat<(z_ssub GR32:$src1, simm32n:$src2),
+ (AFI GR32:$src1, simm32n:$src2)>;
+ def : Pat<(z_ssub GR64:$src1, imm64sx16n:$src2),
+ (AGHI GR64:$src1, imm64sx16n:$src2)>;
+ def : Pat<(z_ssub GR64:$src1, imm64sx32n:$src2),
+ (AGFI GR64:$src1, imm64sx32n:$src2)>;
}
-defm : SXB<sub, GR64, SGFR>;
// Subtraction producing a carry.
let Defs = [CC] in {
// Subtraction of a register.
- defm SLR : BinaryRRAndK<"slr", 0x1F, 0xB9FB, subc, GR32, GR32>;
+ defm SLR : BinaryRRAndK<"slr", 0x1F, 0xB9FB, z_usub, GR32, GR32>;
def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
- defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>;
+ defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, z_usub, GR64, GR64>;
// Subtraction from a high register.
def SLHHHR : BinaryRRFa<"slhhhr", 0xB9CB, null_frag, GRH32, GRH32, GRH32>,
@@ -1028,29 +1047,43 @@ let Defs = [CC] in {
def SLHHLR : BinaryRRFa<"slhhlr", 0xB9DB, null_frag, GRH32, GRH32, GR32>,
Requires<[FeatureHighWord]>;
- // Subtraction of unsigned 32-bit immediates. These don't match
- // subc because we prefer addc for constants.
- def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>;
- def SLGFI : BinaryRIL<"slgfi", 0xC24, null_frag, GR64, imm64zx32>;
+ // Subtraction of unsigned 32-bit immediates.
+ def SLFI : BinaryRIL<"slfi", 0xC25, z_usub, GR32, uimm32>;
+ def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>;
// Subtraction of memory.
- defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
- def SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
- def SLG : BinaryRXY<"slg", 0xE30B, subc, GR64, load, 8>;
+ defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
+ def SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>;
+ def SLG : BinaryRXY<"slg", 0xE30B, z_usub, GR64, load, 8>;
+}
+defm : ZXB<z_usub, GR64, SLGFR>;
+
+// Subtracting an immediate is the same as adding the negated immediate.
+let AddedComplexity = 1 in {
+ def : Pat<(z_usub GR32:$src1, imm32sx16n:$src2),
+ (ALHSIK GR32:$src1, imm32sx16n:$src2)>,
+ Requires<[FeatureDistinctOps]>;
+ def : Pat<(z_usub GR64:$src1, imm64sx16n:$src2),
+ (ALGHSIK GR64:$src1, imm64sx16n:$src2)>,
+ Requires<[FeatureDistinctOps]>;
}
-defm : ZXB<subc, GR64, SLGFR>;
+
+// And vice versa in one special case (but we prefer addition).
+def : Pat<(add GR64:$src1, imm64zx32n:$src2),
+ (SLGFI GR64:$src1, imm64zx32n:$src2)>;
// Subtraction producing and using a carry.
let Defs = [CC], Uses = [CC] in {
// Subtraction of a register.
- def SLBR : BinaryRRE<"slbr", 0xB999, sube, GR32, GR32>;
- def SLBGR : BinaryRRE<"slbgr", 0xB989, sube, GR64, GR64>;
+ def SLBR : BinaryRRE<"slbr", 0xB999, z_subcarry, GR32, GR32>;
+ def SLBGR : BinaryRRE<"slbgr", 0xB989, z_subcarry, GR64, GR64>;
// Subtraction of memory.
- def SLB : BinaryRXY<"slb", 0xE399, sube, GR32, load, 4>;
- def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load, 8>;
+ def SLB : BinaryRXY<"slb", 0xE399, z_subcarry, GR32, load, 4>;
+ def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, load, 8>;
}
+
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
@@ -1492,7 +1525,7 @@ defm : ZXB<z_ucmp, GR64, CLGFR>;
// Memory-to-memory comparison.
let mayLoad = 1, Defs = [CC] in {
- defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+ defm CLC : CompareMemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
@@ -1933,15 +1966,16 @@ let isCall = 1, Defs = [CC] in
let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in {
// Transaction Begin
let mayStore = 1, usesCustomInserter = 1, Defs = [CC] in {
- def TBEGIN : SideEffectBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>;
- def TBEGIN_nofloat : SideEffectBinarySILPseudo<z_tbegin_nofloat, imm32zx16>;
+ def TBEGIN : TestBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>;
+ let hasNoSchedulingInfo = 1 in
+ def TBEGIN_nofloat : TestBinarySILPseudo<z_tbegin_nofloat, imm32zx16>;
def TBEGINC : SideEffectBinarySIL<"tbeginc", 0xE561,
int_s390_tbeginc, imm32zx16>;
}
// Transaction End
let Defs = [CC] in
- def TEND : SideEffectInherentS<"tend", 0xB2F8, z_tend>;
+ def TEND : TestInherentS<"tend", 0xB2F8, z_tend>;
// Transaction Abort
let isTerminator = 1, isBarrier = 1, mayStore = 1,
@@ -2117,32 +2151,6 @@ let isCodeGenOnly = 1, hasSideEffects = 1 in {
// Peepholes.
//===----------------------------------------------------------------------===//
-// Use AL* for GR64 additions of unsigned 32-bit values.
-defm : ZXB<add, GR64, ALGFR>;
-def : Pat<(add GR64:$src1, imm64zx32:$src2),
- (ALGFI GR64:$src1, imm64zx32:$src2)>;
-def : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
- (ALGF GR64:$src1, bdxaddr20only:$addr)>;
-
-// Use SL* for GR64 subtractions of unsigned 32-bit values.
-defm : ZXB<sub, GR64, SLGFR>;
-def : Pat<(add GR64:$src1, imm64zx32n:$src2),
- (SLGFI GR64:$src1, imm64zx32n:$src2)>;
-def : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
- (SLGF GR64:$src1, bdxaddr20only:$addr)>;
-
-// Optimize sign-extended 1/0 selects to -1/0 selects. This is important
-// for vector legalization.
-def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)),
- (i32 31)),
- (i32 31)),
- (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
-def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid,
- imm32zx4:$cc)))),
- (i32 63)),
- (i32 63)),
- (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
-
// Avoid generating 2 XOR instructions. (xor (and x, y), y) is
// equivalent to (and (xor x, -1), y)
def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y),
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
index 791f0334e0f1..802962bd4db0 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -295,7 +295,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
// Add the terminators.
while (MI != End) {
- if (!MI->isDebugValue()) {
+ if (!MI->isDebugInstr()) {
assert(MI->isTerminator() && "Terminator followed by non-terminator");
Terminators.push_back(describeTerminator(*MI));
skipTerminator(Position, Terminators.back(), false);
@@ -312,7 +312,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
// relaxed if it were placed at address Address.
bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
uint64_t Address) {
- if (!Terminator.Branch)
+ if (!Terminator.Branch || Terminator.ExtraRelaxSize == 0)
return false;
const MBBInfo &Target = MBBs[Terminator.TargetBlock];
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 08eb73fc362e..fcbf4c4b5fe4 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -65,25 +65,29 @@ advanceTo(MachineBasicBlock::iterator NextBegin) {
std::next(LastEmittedMI) : MBB->begin());
for (; I != NextBegin; ++I) {
- if (I->isPosition() || I->isDebugValue())
+ if (I->isPosition() || I->isDebugInstr())
continue;
HazardRec->emitInstruction(&*I);
}
}
+void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
+ LLVM_DEBUG(HazardRec->dumpState(););
+}
+
void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
assert ((SchedStates.find(NextMBB) == SchedStates.end()) &&
"Entering MBB twice?");
- DEBUG(dbgs() << "+++ Entering " << printMBBReference(*NextMBB));
+ LLVM_DEBUG(dbgs() << "** Entering " << printMBBReference(*NextMBB));
MBB = NextMBB;
+
/// Create a HazardRec for MBB, save it in SchedStates and set HazardRec to
/// point to it.
HazardRec = SchedStates[MBB] = new SystemZHazardRecognizer(TII, &SchedModel);
- DEBUG (const MachineLoop *Loop = MLI->getLoopFor(MBB);
- if(Loop && Loop->getHeader() == MBB)
- dbgs() << " (Loop header)";
- dbgs() << ":\n";);
+ LLVM_DEBUG(const MachineLoop *Loop = MLI->getLoopFor(MBB);
+ if (Loop && Loop->getHeader() == MBB) dbgs() << " (Loop header)";
+ dbgs() << ":\n";);
// Try to take over the state from a single predecessor, if it has been
// scheduled. If this is not possible, we are done.
@@ -93,16 +97,17 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
SchedStates.find(SinglePredMBB) == SchedStates.end())
return;
- DEBUG(dbgs() << "+++ Continued scheduling from "
- << printMBBReference(*SinglePredMBB) << "\n";);
+ LLVM_DEBUG(dbgs() << "** Continued scheduling from "
+ << printMBBReference(*SinglePredMBB) << "\n";);
HazardRec->copyState(SchedStates[SinglePredMBB]);
+ LLVM_DEBUG(HazardRec->dumpState(););
// Emit incoming terminator(s). Be optimistic and assume that branch
// prediction will generally do "the right thing".
for (MachineBasicBlock::iterator I = SinglePredMBB->getFirstTerminator();
I != SinglePredMBB->end(); I++) {
- DEBUG (dbgs() << "+++ Emitting incoming branch: "; I->dump(););
+ LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; I->dump(););
bool TakenBranch = (I->isBranch() &&
(TII->getBranchInfo(*I).Target->isReg() || // Relative branch
TII->getBranchInfo(*I).Target->getMBB() == MBB));
@@ -113,7 +118,7 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
}
void SystemZPostRASchedStrategy::leaveMBB() {
- DEBUG(dbgs() << "+++ Leaving " << printMBBReference(*MBB) << "\n";);
+ LLVM_DEBUG(dbgs() << "** Leaving " << printMBBReference(*MBB) << "\n";);
// Advance to first terminator. The successor block will handle terminators
// dependent on CFG layout (T/NT branch etc).
@@ -127,7 +132,7 @@ SystemZPostRASchedStrategy(const MachineSchedContext *C)
(C->MF->getSubtarget().getInstrInfo())),
MBB(nullptr), HazardRec(nullptr) {
const TargetSubtargetInfo *ST = &C->MF->getSubtarget();
- SchedModel.init(ST->getSchedModel(), ST, TII);
+ SchedModel.init(ST);
}
SystemZPostRASchedStrategy::~SystemZPostRASchedStrategy() {
@@ -159,14 +164,14 @@ SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
// If only one choice, return it.
if (Available.size() == 1) {
- DEBUG (dbgs() << "+++ Only one: ";
- HazardRec->dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "** Only one: ";
+ HazardRec->dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
return *Available.begin();
}
// All nodes that are possible to schedule are stored in the
// Available set.
- DEBUG(dbgs() << "+++ Available: "; Available.dump(*HazardRec););
+ LLVM_DEBUG(dbgs() << "** Available: "; Available.dump(*HazardRec););
Candidate Best;
for (auto *SU : Available) {
@@ -177,15 +182,11 @@ SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
// Remember which SU is the best candidate.
if (Best.SU == nullptr || c < Best) {
Best = c;
- DEBUG(dbgs() << "+++ Best sofar: ";
- HazardRec->dumpSU(Best.SU, dbgs());
- if (Best.GroupingCost != 0)
- dbgs() << "\tGrouping cost:" << Best.GroupingCost;
- if (Best.ResourcesCost != 0)
- dbgs() << " Resource cost:" << Best.ResourcesCost;
- dbgs() << " Height:" << Best.SU->getHeight();
- dbgs() << "\n";);
- }
+ LLVM_DEBUG(dbgs() << "** Best so far: ";);
+ } else
+ LLVM_DEBUG(dbgs() << "** Tried : ";);
+ LLVM_DEBUG(HazardRec->dumpSU(c.SU, dbgs()); c.dumpCosts();
+ dbgs() << " Height:" << c.SU->getHeight(); dbgs() << "\n";);
// Once we know we have seen all SUs that affect grouping or use unbuffered
// resources, we can stop iterating if Best looks good.
@@ -206,7 +207,7 @@ Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec) : Candidate() {
// if it would fit naturally into the schedule.
GroupingCost = HazardRec.groupingCost(SU);
- // Check the resources cost for this SU.
+ // Check the resources cost for this SU.
ResourcesCost = HazardRec.resourcesCost(SU);
}
@@ -239,7 +240,9 @@ operator<(const Candidate &other) {
}
void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
- DEBUG(dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";);
+ LLVM_DEBUG(dbgs() << "** Scheduling SU(" << SU->NodeNum << ") ";
+ if (Available.size() == 1) dbgs() << "(only one) ";
+ Candidate c(SU, *HazardRec); c.dumpCosts(); dbgs() << "\n";);
// Remove SU from Available set and update HazardRec.
Available.erase(SU);
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
index de1bf4655c54..cb0304825966 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -58,6 +58,15 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
bool noCost() const {
return (GroupingCost <= 0 && !ResourcesCost);
}
+
+#ifndef NDEBUG
+ void dumpCosts() {
+ if (GroupingCost != 0)
+ dbgs() << " Grouping cost:" << GroupingCost;
+ if (ResourcesCost != 0)
+ dbgs() << " Resource cost:" << ResourcesCost;
+ }
+#endif
};
// A sorter for the Available set that makes sure that SUs are considered
@@ -119,7 +128,7 @@ public:
// transferred over scheduling boundaries.
bool doMBBSchedRegionsTopDown() const override { return true; }
- void initialize(ScheduleDAGMI *dag) override {}
+ void initialize(ScheduleDAGMI *dag) override;
/// Tell the strategy that MBB is about to be processed.
void enterMBB(MachineBasicBlock *NextMBB) override;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
index 713612129d90..da682cb4e5ab 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
@@ -115,13 +115,13 @@ class AddressingMode<string seltype, string bitsize, string dispsize,
class BDMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 2, "BDAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize))>;
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize))>;
// An addressing mode with a base, displacement and index.
class BDXMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDXAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
!cast<RegisterOperand>("ADDR"##bitsize))>;
// A BDMode paired with an immediate length operand of LENSIZE bits.
@@ -130,21 +130,21 @@ class BDLMode<string type, string bitsize, string dispsize, string suffix,
: AddressingMode<type, bitsize, dispsize, suffix, "Len"##lensize, 3,
"BDLAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
- !cast<Immediate>("imm"##bitsize))>;
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("imm"##bitsize))>;
// A BDMode paired with a register length operand.
class BDRMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDRAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
!cast<RegisterOperand>("GR"##bitsize))>;
// An addressing mode with a base, displacement and a vector index.
class BDVMode<string bitsize, string dispsize>
: AddressOperand<bitsize, dispsize, "", "BDVAddr",
(ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<Operand>("disp"##dispsize##"imm"##bitsize),
!cast<RegisterOperand>("VR128"))>;
//===----------------------------------------------------------------------===//
@@ -219,6 +219,12 @@ def SIMM16 : SDNodeXForm<imm, [{
MVT::i64);
}]>;
+// Negate and then truncate an immediate to a 16-bit signed quantity.
+def NEGSIMM16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int16_t(-N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
// Truncate an immediate to a 16-bit unsigned quantity.
def UIMM16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(uint16_t(N->getZExtValue()), SDLoc(N),
@@ -231,24 +237,30 @@ def SIMM32 : SDNodeXForm<imm, [{
MVT::i64);
}]>;
+// Negate and then truncate an immediate to a 32-bit signed quantity.
+def NEGSIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int32_t(-N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
// Truncate an immediate to a 32-bit unsigned quantity.
def UIMM32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(uint32_t(N->getZExtValue()), SDLoc(N),
MVT::i64);
}]>;
+// Negate and then truncate an immediate to a 32-bit unsigned quantity.
+def NEGUIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), SDLoc(N),
+ MVT::i64);
+}]>;
+
// Truncate an immediate to a 48-bit unsigned quantity.
def UIMM48 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(uint64_t(N->getZExtValue()) & 0xffffffffffff,
SDLoc(N), MVT::i64);
}]>;
-// Negate and then truncate an immediate to a 32-bit unsigned quantity.
-def NEGIMM32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), SDLoc(N),
- MVT::i64);
-}]>;
-
//===----------------------------------------------------------------------===//
// Immediate asm operands.
//===----------------------------------------------------------------------===//
@@ -336,6 +348,10 @@ def imm32sx16 : Immediate<i32, [{
return isInt<16>(N->getSExtValue());
}], SIMM16, "S16Imm">;
+def imm32sx16n : Immediate<i32, [{
+ return isInt<16>(-N->getSExtValue());
+}], NEGSIMM16, "S16Imm">;
+
def imm32zx16 : Immediate<i32, [{
return isUInt<16>(N->getZExtValue());
}], UIMM16, "U16Imm">;
@@ -348,6 +364,10 @@ def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
+def simm32n : Immediate<i32, [{
+ return isInt<32>(-N->getSExtValue());
+}], NEGSIMM32, "S32Imm">;
+
def imm32 : ImmLeaf<i32, [{}]>;
//===----------------------------------------------------------------------===//
@@ -423,6 +443,10 @@ def imm64sx16 : Immediate<i64, [{
return isInt<16>(N->getSExtValue());
}], SIMM16, "S16Imm">;
+def imm64sx16n : Immediate<i64, [{
+ return isInt<16>(-N->getSExtValue());
+}], NEGSIMM16, "S16Imm">;
+
def imm64zx16 : Immediate<i64, [{
return isUInt<16>(N->getZExtValue());
}], UIMM16, "U16Imm">;
@@ -431,13 +455,17 @@ def imm64sx32 : Immediate<i64, [{
return isInt<32>(N->getSExtValue());
}], SIMM32, "S32Imm">;
+def imm64sx32n : Immediate<i64, [{
+ return isInt<32>(-N->getSExtValue());
+}], NEGSIMM32, "S32Imm">;
+
def imm64zx32 : Immediate<i64, [{
return isUInt<32>(N->getZExtValue());
}], UIMM32, "U32Imm">;
def imm64zx32n : Immediate<i64, [{
return isUInt<32>(-N->getSExtValue());
-}], NEGIMM32, "U32Imm">;
+}], NEGUIMM32, "U32Imm">;
def imm64zx48 : Immediate<i64, [{
return isUInt<64>(N->getZExtValue());
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
index d067f331f677..3cfe23aec417 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -15,19 +15,24 @@ def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>,
def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
SDTCisVT<1, i64>]>;
def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
-def SDT_ZCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
-def SDT_ZICmp : SDTypeProfile<0, 3,
- [SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
-def SDT_ZBRCCMask : SDTypeProfile<0, 3,
+def SDT_ZCmp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_ZICmp : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i32>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZBRCCMask : SDTypeProfile<0, 4,
[SDTCisVT<0, i32>,
SDTCisVT<1, i32>,
- SDTCisVT<2, OtherVT>]>;
-def SDT_ZSelectCCMask : SDTypeProfile<1, 4,
+ SDTCisVT<2, OtherVT>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZSelectCCMask : SDTypeProfile<1, 5,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
SDTCisVT<3, i32>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>]>;
def SDT_ZWrapPtr : SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -40,6 +45,17 @@ def SDT_ZGR128Binary : SDTypeProfile<1, 2,
[SDTCisVT<0, untyped>,
SDTCisInt<1>,
SDTCisInt<2>]>;
+def SDT_ZBinaryWithFlags : SDTypeProfile<2, 2,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+def SDT_ZBinaryWithCarry : SDTypeProfile<2, 3,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisVT<1, i32>]>;
def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5,
[SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
@@ -47,45 +63,67 @@ def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5,
SDTCisVT<3, i32>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>]>;
-def SDT_ZAtomicCmpSwapW : SDTypeProfile<1, 6,
+def SDT_ZAtomicCmpSwapW : SDTypeProfile<2, 6,
[SDTCisVT<0, i32>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, i32>,
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
SDTCisVT<3, i32>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>,
- SDTCisVT<6, i32>]>;
-def SDT_ZAtomicCmpSwap : SDTypeProfile<1, 3,
+ SDTCisVT<6, i32>,
+ SDTCisVT<7, i32>]>;
+def SDT_ZAtomicCmpSwap : SDTypeProfile<2, 3,
[SDTCisInt<0>,
- SDTCisPtrTy<1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<0, 3>]>;
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>]>;
def SDT_ZAtomicLoad128 : SDTypeProfile<1, 1,
[SDTCisVT<0, untyped>,
SDTCisPtrTy<1>]>;
def SDT_ZAtomicStore128 : SDTypeProfile<0, 2,
[SDTCisVT<0, untyped>,
SDTCisPtrTy<1>]>;
-def SDT_ZAtomicCmpSwap128 : SDTypeProfile<1, 3,
+def SDT_ZAtomicCmpSwap128 : SDTypeProfile<2, 3,
[SDTCisVT<0, untyped>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, untyped>,
- SDTCisVT<3, untyped>]>;
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, untyped>,
+ SDTCisVT<4, untyped>]>;
def SDT_ZMemMemLength : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, i64>]>;
def SDT_ZMemMemLoop : SDTypeProfile<0, 4,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, i64>,
SDTCisVT<3, i64>]>;
+def SDT_ZMemMemLoopCC : SDTypeProfile<1, 4,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, i64>,
+ SDTCisVT<4, i64>]>;
def SDT_ZString : SDTypeProfile<1, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisPtrTy<2>,
SDTCisVT<3, i32>]>;
-def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def SDT_ZStringCC : SDTypeProfile<2, 3,
+ [SDTCisPtrTy<0>,
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<2>,
+ SDTCisPtrTy<3>,
+ SDTCisVT<4, i32>]>;
+def SDT_ZIPM : SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_ZPrefetch : SDTypeProfile<0, 2,
[SDTCisVT<0, i32>,
SDTCisPtrTy<1>]>;
@@ -97,9 +135,12 @@ def SDT_ZStoreBSwap : SDTypeProfile<0, 3,
[SDTCisInt<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, OtherVT>]>;
-def SDT_ZTBegin : SDTypeProfile<0, 2,
- [SDTCisPtrTy<0>,
- SDTCisVT<1, i32>]>;
+def SDT_ZTBegin : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZTEnd : SDTypeProfile<1, 0,
+ [SDTCisVT<0, i32>]>;
def SDT_ZInsertVectorElt : SDTypeProfile<1, 3,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
@@ -115,10 +156,19 @@ def SDT_ZVecUnaryConv : SDTypeProfile<1, 1,
def SDT_ZVecUnary : SDTypeProfile<1, 1,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>]>;
+def SDT_ZVecUnaryCC : SDTypeProfile<2, 1,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>]>;
def SDT_ZVecBinary : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>]>;
+def SDT_ZVecBinaryCC : SDTypeProfile<2, 2,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 2>]>;
def SDT_ZVecBinaryInt : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
@@ -127,10 +177,16 @@ def SDT_ZVecBinaryConv : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisVec<1>,
SDTCisSameAs<1, 2>]>;
-def SDT_ZVecBinaryConvInt : SDTypeProfile<1, 2,
+def SDT_ZVecBinaryConvCC : SDTypeProfile<2, 2,
[SDTCisVec<0>,
- SDTCisVec<1>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<1, i32>,
+ SDTCisVec<2>,
+ SDTCisSameAs<2, 3>]>;
+def SDT_ZVecBinaryConvIntCC : SDTypeProfile<2, 2,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVec<2>,
+ SDTCisVT<3, i32>]>;
def SDT_ZRotateMask : SDTypeProfile<1, 2,
[SDTCisVec<0>,
SDTCisVT<1, i32>,
@@ -149,13 +205,28 @@ def SDT_ZVecTernaryInt : SDTypeProfile<1, 3,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisVT<3, i32>]>;
+def SDT_ZVecTernaryIntCC : SDTypeProfile<2, 3,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisVT<4, i32>]>;
def SDT_ZVecQuaternaryInt : SDTypeProfile<1, 4,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisVT<4, i32>]>;
-def SDT_ZTest : SDTypeProfile<0, 2, [SDTCisVT<1, i64>]>;
+def SDT_ZVecQuaternaryIntCC : SDTypeProfile<2, 4,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>,
+ SDTCisVT<5, i32>]>;
+def SDT_ZTest : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>,
+ SDTCisVT<2, i64>]>;
//===----------------------------------------------------------------------===//
// Node definitions
@@ -188,19 +259,26 @@ def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET",
SDT_ZWrapOffset, []>;
def z_iabs : SDNode<"SystemZISD::IABS", SDTIntUnaryOp, []>;
-def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
-def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
-def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
-def z_br_ccmask : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
- [SDNPHasChain, SDNPInGlue]>;
-def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
- [SDNPInGlue]>;
+def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp>;
+def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp>;
+def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp>;
+def z_br_ccmask_1 : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
+ [SDNPHasChain]>;
+def z_select_ccmask_1 : SDNode<"SystemZISD::SELECT_CCMASK",
+ SDT_ZSelectCCMask>;
+def z_ipm_1 : SDNode<"SystemZISD::IPM", SDT_ZIPM>;
def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>;
def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>;
def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>;
def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>;
+def z_saddo : SDNode<"SystemZISD::SADDO", SDT_ZBinaryWithFlags>;
+def z_ssubo : SDNode<"SystemZISD::SSUBO", SDT_ZBinaryWithFlags>;
+def z_uaddo : SDNode<"SystemZISD::UADDO", SDT_ZBinaryWithFlags>;
+def z_usubo : SDNode<"SystemZISD::USUBO", SDT_ZBinaryWithFlags>;
+def z_addcarry_1 : SDNode<"SystemZISD::ADDCARRY", SDT_ZBinaryWithCarry>;
+def z_subcarry_1 : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>;
def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
@@ -210,7 +288,7 @@ def z_loadbswap : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap,
def z_storebswap : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest, [SDNPOutGlue]>;
+def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest>;
// Defined because the index is an i32 rather than a pointer.
def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
@@ -229,10 +307,8 @@ def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
SDT_ZVecTernaryInt>;
def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
-def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
-def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
+def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConvCC>;
+def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConvCC>;
def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
@@ -247,44 +323,30 @@ def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>;
def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>;
def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>;
def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>;
-def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinary,
- [SDNPOutGlue]>;
+def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinaryCC>;
+def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinaryCC>;
+def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinaryCC>;
def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>;
def z_vfcmph : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>;
def z_vfcmphe : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>;
-def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
-def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
-def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConv,
- [SDNPOutGlue]>;
+def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConvCC>;
+def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConvCC>;
+def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConvCC>;
def z_vextend : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>;
def z_vround : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>;
-def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp, [SDNPOutGlue]>;
-def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryInt,
- [SDNPOutGlue]>;
-def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryInt,
- [SDNPOutGlue]>;
-def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vfeez_cc : SDNode<"SystemZISD::VFEEZ_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vfene_cc : SDNode<"SystemZISD::VFENE_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vfenez_cc : SDNode<"SystemZISD::VFENEZ_CC", SDT_ZVecBinary,
- [SDNPOutGlue]>;
-def z_vistr_cc : SDNode<"SystemZISD::VISTR_CC", SDT_ZVecUnary,
- [SDNPOutGlue]>;
-def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC", SDT_ZVecQuaternaryInt,
- [SDNPOutGlue]>;
+def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp>;
+def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryIntCC>;
+def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryIntCC>;
+def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinaryCC>;
+def z_vfeez_cc : SDNode<"SystemZISD::VFEEZ_CC", SDT_ZVecBinaryCC>;
+def z_vfene_cc : SDNode<"SystemZISD::VFENE_CC", SDT_ZVecBinaryCC>;
+def z_vfenez_cc : SDNode<"SystemZISD::VFENEZ_CC", SDT_ZVecBinaryCC>;
+def z_vistr_cc : SDNode<"SystemZISD::VISTR_CC", SDT_ZVecUnaryCC>;
+def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC",
+ SDT_ZVecQuaternaryIntCC>;
def z_vstrcz_cc : SDNode<"SystemZISD::VSTRCZ_CC",
- SDT_ZVecQuaternaryInt, [SDNPOutGlue]>;
-def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvInt,
- [SDNPOutGlue]>;
+ SDT_ZVecQuaternaryIntCC>;
+def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>;
class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
: SDNode<"SystemZISD::"##name, profile,
@@ -305,11 +367,11 @@ def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
def z_atomic_cmp_swap : SDNode<"SystemZISD::ATOMIC_CMP_SWAP",
SDT_ZAtomicCmpSwap,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
- SDNPOutGlue, SDNPMemOperand]>;
+ SDNPMemOperand]>;
def z_atomic_cmp_swapw : SDNode<"SystemZISD::ATOMIC_CMP_SWAPW",
SDT_ZAtomicCmpSwapW,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
- SDNPOutGlue, SDNPMemOperand]>;
+ SDNPMemOperand]>;
def z_atomic_load_128 : SDNode<"SystemZISD::ATOMIC_LOAD_128",
SDT_ZAtomicLoad128,
@@ -320,7 +382,7 @@ def z_atomic_store_128 : SDNode<"SystemZISD::ATOMIC_STORE_128",
def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128",
SDT_ZAtomicCmpSwap128,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
- SDNPOutGlue, SDNPMemOperand]>;
+ SDNPMemOperand]>;
def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
@@ -338,30 +400,26 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_xc_loop : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
-def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
-def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZString,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
+ [SDNPHasChain, SDNPMayLoad]>;
+def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoopCC,
+ [SDNPHasChain, SDNPMayLoad]>;
+def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
+ [SDNPHasChain, SDNPMayLoad]>;
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
-def z_ipm : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
- [SDNPInGlue]>;
+def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZStringCC,
+ [SDNPHasChain, SDNPMayLoad]>;
def z_prefetch : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
SDNPMemOperand]>;
def z_tbegin : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin,
- [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
- SDNPSideEffect]>;
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin,
- [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
- SDNPSideEffect]>;
-def z_tend : SDNode<"SystemZISD::TEND", SDTNone,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
+def z_tend : SDNode<"SystemZISD::TEND", SDT_ZTEnd,
+ [SDNPHasChain, SDNPSideEffect]>;
def z_vshl : SDNode<"ISD::SHL", SDT_ZVecBinary>;
def z_vsra : SDNode<"ISD::SRA", SDT_ZVecBinary>;
@@ -382,6 +440,20 @@ def z_strv : PatFrag<(ops node:$src, node:$addr),
def z_strvg : PatFrag<(ops node:$src, node:$addr),
(z_storebswap node:$src, node:$addr, i64)>;
+// Fragments including CC as an implicit source.
+def z_br_ccmask
+ : PatFrag<(ops node:$valid, node:$mask, node:$bb),
+ (z_br_ccmask_1 node:$valid, node:$mask, node:$bb, CC)>;
+def z_select_ccmask
+ : PatFrag<(ops node:$true, node:$false, node:$valid, node:$mask),
+ (z_select_ccmask_1 node:$true, node:$false,
+ node:$valid, node:$mask, CC)>;
+def z_ipm : PatFrag<(ops), (z_ipm_1 CC)>;
+def z_addcarry : PatFrag<(ops node:$lhs, node:$rhs),
+ (z_addcarry_1 node:$lhs, node:$rhs, CC)>;
+def z_subcarry : PatFrag<(ops node:$lhs, node:$rhs),
+ (z_subcarry_1 node:$lhs, node:$rhs, CC)>;
+
// Signed and unsigned comparisons.
def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
@@ -574,6 +646,20 @@ def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(add (mul node:$src1, node:$src2), node:$src3)>;
+// Alternatives to match operations with or without an overflow CC result.
+def z_sadd : PatFrags<(ops node:$src1, node:$src2),
+ [(z_saddo node:$src1, node:$src2),
+ (add node:$src1, node:$src2)]>;
+def z_uadd : PatFrags<(ops node:$src1, node:$src2),
+ [(z_uaddo node:$src1, node:$src2),
+ (add node:$src1, node:$src2)]>;
+def z_ssub : PatFrags<(ops node:$src1, node:$src2),
+ [(z_ssubo node:$src1, node:$src2),
+ (sub node:$src1, node:$src2)]>;
+def z_usub : PatFrags<(ops node:$src1, node:$src2),
+ [(z_usubo node:$src1, node:$src2),
+ (sub node:$src1, node:$src2)]>;
+
// Fused multiply-subtract, using the natural operand order.
def fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(fma node:$src1, node:$src2, (fneg node:$src3))>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 856505e00a10..76ed6f80ba55 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -108,6 +108,10 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
const MCPhysReg *
SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ const SystemZSubtarget &Subtarget = MF->getSubtarget<SystemZSubtarget>();
+ if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
+ return Subtarget.hasVector()? CSR_SystemZ_AllRegs_Vector_SaveList
+ : CSR_SystemZ_AllRegs_SaveList;
if (MF->getSubtarget().getTargetLowering()->supportSwiftError() &&
MF->getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
@@ -118,6 +122,10 @@ SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t *
SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ if (CC == CallingConv::AnyReg)
+ return Subtarget.hasVector()? CSR_SystemZ_AllRegs_Vector_RegMask
+ : CSR_SystemZ_AllRegs_RegMask;
if (MF.getSubtarget().getTargetLowering()->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(
Attribute::SwiftError))
@@ -307,3 +315,11 @@ SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SystemZFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
}
+
+const TargetRegisterClass *
+SystemZRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &SystemZ::CCRRegClass)
+ return &SystemZ::GR32BitRegClass;
+ return RC;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index 8787a90b1e25..94781659a50a 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -44,6 +44,12 @@ public:
return &SystemZ::ADDR64BitRegClass;
}
+ /// getCrossCopyRegClass - Returns a legal register class to copy a register
+ /// in the specified class to or from. Returns NULL if it is possible to copy
+ /// between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
bool getRegAllocationHints(unsigned VirtReg,
ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
@@ -71,7 +77,7 @@ public:
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const override;
- /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
+ /// SrcRC and DstRC will be morphed into NewRC if this returns true.
bool shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index a1cfaf699401..79ba7534f92c 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -130,7 +130,7 @@ defm AnyReg : SystemZRegClass<"AnyReg",
[i64, f64, v8i8, v4i16, v2i32, v2f32], 64,
(add (sequence "R%uD", 0, 15),
(sequence "F%uD", 0, 15),
- (sequence "V%u", 0, 15))>;
+ (sequence "V%u", 0, 15)), 0/*allocatable*/>;
//===----------------------------------------------------------------------===//
// Floating-point registers
@@ -263,7 +263,7 @@ defm VF128 : SystemZRegClass<"VF128",
// All vector registers.
defm VR128 : SystemZRegClass<"VR128",
- [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
128, (add (sequence "V%u", 0, 7),
(sequence "V%u", 16, 31),
(sequence "V%u", 8, 15))>;
@@ -296,8 +296,8 @@ def v128any : TypedReg<untyped, VR128>;
// The 2-bit condition code field of the PSW. Every register named in an
// inline asm needs a class associated with it.
def CC : SystemZReg<"cc">;
-let isAllocatable = 0 in
- def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
+let isAllocatable = 0, CopyCost = -1 in
+ def CCR : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
// Access registers.
class ACR32<bits<16> num, string n> : SystemZReg<n> {
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
index 8dba89f70a42..385a94b5d6a9 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
@@ -8,75 +8,57 @@
//===----------------------------------------------------------------------===//
// Scheduler resources
-// Resources ending with a '2' use that resource for 2 cycles. An instruction
-// using two such resources use the mapped unit for 4 cycles, and 2 is added
-// to the total number of uops of the sched class.
-// These three resources are used to express decoder grouping rules.
-// The number of decoder slots needed by an instructions is normally
-// one. For a cracked instruction (BeginGroup && !EndGroup) it is
-// two. Expanded instructions (BeginGroup && EndGroup) group alone.
+// These resources are used to express decoder grouping rules. The number of
+// decoder slots needed by an instruction is normally one, but there are
+// exceptions.
+def NormalGr : SchedWrite;
+def Cracked : SchedWrite;
def GroupAlone : SchedWrite;
def BeginGroup : SchedWrite;
def EndGroup : SchedWrite;
-// Latencies, to make code a bit neater. If more than one resource is
-// used for an instruction, the greatest latency (not the sum) will be
-// output by Tablegen. Therefore, in such cases one of these resources
-// is needed.
-def Lat2 : SchedWrite;
-def Lat3 : SchedWrite;
-def Lat4 : SchedWrite;
-def Lat5 : SchedWrite;
-def Lat6 : SchedWrite;
-def Lat7 : SchedWrite;
-def Lat8 : SchedWrite;
-def Lat9 : SchedWrite;
-def Lat10 : SchedWrite;
-def Lat11 : SchedWrite;
-def Lat12 : SchedWrite;
-def Lat15 : SchedWrite;
-def Lat20 : SchedWrite;
-def Lat30 : SchedWrite;
+// A SchedWrite added to other SchedWrites to make LSU latency parameterizable.
+def LSULatency : SchedWrite;
-// Fixed-point
-def FXa : SchedWrite;
-def FXa2 : SchedWrite;
-def FXb : SchedWrite;
-def FXU : SchedWrite;
+// Operand WriteLatencies.
+foreach L = 1-30 in def "WLat"#L : SchedWrite;
-// Load/store unit
-def LSU : SchedWrite;
+foreach L = 1-16 in
+ def "WLat"#L#"LSU" : WriteSequence<[!cast<SchedWrite>("WLat"#L),
+ LSULatency]>;
-// Model a return without latency, otherwise if-converter will model
-// extra cost and abort (currently there is an assert that checks that
-// all instructions have at least one uop).
-def LSU_lat1 : SchedWrite;
+// ReadAdvances, used for the register operand next to a memory operand,
+// modelling that the register operand is needed later than the address
+// operands.
+def RegReadAdv : SchedRead;
-// Floating point unit (zEC12 and earlier)
-def FPU : SchedWrite;
-def FPU2 : SchedWrite;
-def DFU : SchedWrite;
-def DFU2 : SchedWrite;
+foreach Num = ["", "2", "3", "4", "5", "6"] in {
+ // Fixed-point units
+ def "FXa"#Num : SchedWrite;
+ def "FXb"#Num : SchedWrite;
+ def "FXU"#Num : SchedWrite;
+ // Load/store unit
+ def "LSU"#Num : SchedWrite;
+ // Vector sub units (z13 and later)
+ def "VecBF"#Num : SchedWrite;
+ def "VecDF"#Num : SchedWrite;
+ def "VecDFX"#Num : SchedWrite;
+ def "VecMul"#Num : SchedWrite;
+ def "VecStr"#Num : SchedWrite;
+ def "VecXsPm"#Num : SchedWrite;
+ // Floating point unit (zEC12 and earlier)
+ def "FPU"#Num : SchedWrite;
+ def "DFU"#Num : SchedWrite;
+}
-// Vector sub units (z13 and later)
-def VecBF : SchedWrite;
-def VecBF2 : SchedWrite;
-def VecDF : SchedWrite;
-def VecDF2 : SchedWrite;
-def VecDFX : SchedWrite;
-def VecDFX2 : SchedWrite;
-def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
-def VecMul : SchedWrite;
-def VecStr : SchedWrite;
-def VecXsPm : SchedWrite;
+def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
-// Virtual branching unit
-def VBU : SchedWrite;
+def VBU : SchedWrite; // Virtual branching unit
+def MCD : SchedWrite; // Millicode
include "SystemZScheduleZ14.td"
include "SystemZScheduleZ13.td"
include "SystemZScheduleZEC12.td"
include "SystemZScheduleZ196.td"
-
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index 72543c1eaee2..5d32232107af 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -10,13 +10,15 @@
// This file defines the machine model for Z13 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def Z13Model : SchedMachineModel {
let UnsupportedFeatures = Arch11UnsupportedFeatures.List;
- let IssueWidth = 8;
+ let IssueWidth = 6; // Number of instructions decoded per cycle.
let MicroOpBufferSize = 60; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -27,37 +29,39 @@ def Z13Model : SchedMachineModel {
}
let SchedModel = Z13Model in {
-
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
-def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
- let BeginGroup = 1;
- let EndGroup = 1;
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of the same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
}
-def : WriteRes<BeginGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
let BeginGroup = 1;
}
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<GroupAlone, []> {
+ let NumMicroOps = 3;
+ let BeginGroup = 1;
let EndGroup = 1;
}
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+}
// Execution units.
def Z13_FXaUnit : ProcResource<2>;
@@ -66,33 +70,39 @@ def Z13_LSUnit : ProcResource<2>;
def Z13_VecUnit : ProcResource<2>;
def Z13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
def Z13_VBUnit : ProcResource<2>;
+def Z13_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXa, [Z13_FXaUnit]> { let Latency = 1; }
-def : WriteRes<FXa2, [Z13_FXaUnit, Z13_FXaUnit]> { let Latency = 2; }
-def : WriteRes<FXb, [Z13_FXbUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [Z13_LSUnit]> { let Latency = 4; }
-def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDFX, [Z13_VecUnit]> { let Latency = 1; }
-def : WriteRes<VecDFX2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 2; }
-def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
- Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit]>
- { let Latency = 30; }
-def : WriteRes<VecMul, [Z13_VecUnit]> { let Latency = 5; }
-def : WriteRes<VecStr, [Z13_VecUnit]> { let Latency = 4; }
-def : WriteRes<VecXsPm, [Z13_VecUnit]> { let Latency = 3; }
-def : WriteRes<VBU, [Z13_VBUnit]>; // Virtual Branching Unit
+let NumMicroOps = 0 in {
+ def : WriteRes<FXa, [Z13_FXaUnit]>;
+ def : WriteRes<FXb, [Z13_FXbUnit]>;
+ def : WriteRes<LSU, [Z13_LSUnit]>;
+ def : WriteRes<VecBF, [Z13_VecUnit]>;
+ def : WriteRes<VecDF, [Z13_VecUnit]>;
+ def : WriteRes<VecDFX, [Z13_VecUnit]>;
+ def : WriteRes<VecMul, [Z13_VecUnit]>;
+ def : WriteRes<VecStr, [Z13_VecUnit]>;
+ def : WriteRes<VecXsPm, [Z13_VecUnit]>;
+ foreach Num = 2-5 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z13_FXaUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z13_FXbUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z13_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z13_VecUnit]>;
+ }}
+
+ def : WriteRes<VecFPd, [Z13_VecFPdUnit]> { let ResourceCycles = [30]; }
+
+ def : WriteRes<VBU, [Z13_VBUnit]>; // Virtual Branching Unit
+}
+
+def : WriteRes<MCD, [Z13_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -106,26 +116,27 @@ def : WriteRes<VBU, [Z13_VBUnit]>; // Virtual Branching Unit
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>;
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb2, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb2, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -133,593 +144,609 @@ def : InstRW<[FXb, FXb, Lat2, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
-def : InstRW<[FXb], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXa], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LZR(F|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
// Load and test
-def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXa], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
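// MVCL(E|U) and MVST above are mapped to the MCD class with WLat30 on every
// defined value: MCD appears to cover the millicoded (microcoded)
// instructions, which the model only describes coarsely as very long
// latency operations rather than cycle-accurately.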
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr],
+ (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXa], (instregex "LTGFR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LLZRGF$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
-// Store multiple (estimated average of ceil(5/2) FXb ops)
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
- GroupAlone], (instregex "STM(G|H|Y)?$")>;
+// Store multiple
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>;
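// LSU2, LSU3, LSU5, FXa2, FXb3 and similar names above are shorthands for
// several uses of the same unit, replacing the old repeated resource lists;
// the real cost of LM/STM still depends on how many registers are moved, as
// the remaining "estimated average" comment notes.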
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address ( -> larl )
-def : InstRW<[FXa], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat2], (instregex "LP(G)?R$")>;
-def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXa, Lat2], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat3, WLat3, FXa2, Cracked], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "IILF(64)?$")>;
-def : InstRW<[FXa], (instregex "IILH(64)?$")>;
-def : InstRW<[FXa], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>;
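// The IC(Y) entry above, [WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], can be
// read roughly as: the result is ready one cycle after the load completes
// (the LSU suffix adds the load/store unit latency to WLat1), the register
// operand is read later than the memory operand (RegReadAdv), one FXa and
// one LSU slot are consumed, and the instruction decodes as a normal
// single-slot group. Most register-memory forms below follow this pattern.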
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
-def : InstRW<[FXa], (instregex "AIH$")>;
-def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
-def : InstRW<[FXa], (instregex "AGFI$")>;
-def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AGR(K)?$")>;
-def : InstRW<[FXa], (instregex "AHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
-def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
-def : InstRW<[FXa], (instregex "ALGHSIK$")>;
-def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXa], (instregex "ALR(K)?$")>;
-def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "A(Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AL(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "ALG(F)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "AGF$")>;
-def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AGF$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>;
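// Entries with two leading latency operands, such as A(Y), AG and AGF above,
// describe instructions with two defined values (the result register and
// CC), each of which gets its own WLat operand; the halfword and
// sign-extending forms use WLat2LSU, one cycle more than the plain adds.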
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
-def : InstRW<[FXa], (instregex "SGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLFI$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLR(K)?$")>;
-def : InstRW<[FXa], (instregex "SR(K)?$")>;
-def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "SGF$")>;
-def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SGF$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "NGR(K)?$")>;
-def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "NILF(64)?$")>;
-def : InstRW<[FXa], (instregex "NILH(64)?$")>;
-def : InstRW<[FXa], (instregex "NILL(64)?$")>;
-def : InstRW<[FXa], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "OGR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "OILF(64)?$")>;
-def : InstRW<[FXa], (instregex "OILH(64)?$")>;
-def : InstRW<[FXa], (instregex "OILL(64)?$")>;
-def : InstRW<[FXa], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXa], (instregex "XIFMux$")>;
-def : InstRW<[FXa], (instregex "XGR(K)?$")>;
-def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "XILF(64)?$")>;
-def : InstRW<[FXa], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXa, Lat6], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXa, LSU, Lat12], (instregex "MSG$")>;
-def : InstRW<[FXa, Lat8], (instregex "MSGR$")>;
-def : InstRW<[FXa, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXa2, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXa2, Lat9, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
-def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
-def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
-def : InstRW<[FXa2, Lat7, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXa2, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat6, FXa, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat8, FXa, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat6, FXa, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "MLG$")>;
+def : InstRW<[WLat9, FXa2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat7, FXa2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
-def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
-def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
-def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "D$")>;
+def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
-def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat3, WLat3, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXb], (instregex "C(G)?R$")>;
-def : InstRW<[FXb], (instregex "CIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXb], (instregex "CLGR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXb], (instregex "CLIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXb], (instregex "CLR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "C(G|Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>;
def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
-def : InstRW<[FXb], (instregex "NIAI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
+def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
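// The grouping operands differ in how many decode slots they claim:
// NormalGr is the common single-slot case, Cracked marks instructions that
// are cracked into two parts at decode, GroupAlone takes a whole decode
// group for itself, and BeginGroup/EndGroup force the instruction to start
// or end a group. The compare-and-swap family above keeps GroupAlone, as in
// the old model.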
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone],
+ (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone],
(instregex "CVBG$")>;
-def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;
-
-def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
+
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
-def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXb], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//
// Transaction begin
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
- (instregex "TBEGIN(C|_nofloat)?$")>;
+def : InstRW<[WLat9, LSU2, FXb5, GroupAlone], (instregex "TBEGIN(C)?$")>;
// Transaction end
-def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>;
// Transaction abort
-def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
// Extract Transaction Nesting Depth
-def : InstRW<[FXa], (instregex "ETND$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>;
// Nontransactional store
-def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>;
//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "PPA$")>;
+def : InstRW<[WLat30, MCD], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXa, FXa, Lat6, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat7, WLat7, FXa2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXa], (instregex "AEXT128$")>;
-def : InstRW<[FXa], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
-def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>;
//===----------------------------------------------------------------------===//
// .insn directive instructions
@@ -733,168 +760,158 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXa], (instregex "SelectF(32|64|128)$")>;
-def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[VecXsPm], (instregex "LER$")>;
-def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone],
+ (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[VecBF], (instregex "LDEBR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked],
+ (instregex "C(F|G)(E|D)BR(A)?$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLFDBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
-def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr],
+ (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
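// The FP entries above go to the vector/FP pipelines: VecXsPm for the
// short-latency simple operations, VecBF and VecDF(X) for the FP execution
// units (with the *2/*4 suffixes again meaning multiple uses, as for the
// 128-bit forms), and VecFPd for divide and square root, whose long,
// data-dependent timing is simply given the generic WLat30 latency.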
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -904,108 +921,113 @@ def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[VecBF], (instregex "LEXR$")>;
-def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXb], (instregex "LDER$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
-def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[VecBF], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIER$")>;
-def : InstRW<[VecBF], (instregex "FIDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXD$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr],
+ (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "C(E|D)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -1015,121 +1037,123 @@ def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecDF], (instregex "LTDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;
+def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[VecDF], (instregex "LDETR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)TR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked],
+ (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Convert from / to zoned
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>;
// Convert from / to packed
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXPT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>;
// Perform floating-point operation
-def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[VecDF], (instregex "FIDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>;
+def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[VecDF], (instregex "QADTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>;
+def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecDF], (instregex "(K|C)DTR$")>;
-def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[VecDF], (instregex "CEDTR$")>;
-def : InstRW<[VecDF], (instregex "CEXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>;
// --------------------------------- Vector --------------------------------- //
@@ -1138,234 +1162,236 @@ def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
// Vector: Move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "VLR(32|64)?$")>;
-def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>;
-def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>;
+def : InstRW<[WLat4, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Immediate instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VZERO$")>;
-def : InstRW<[VecXsPm], (instregex "VONE$")>;
-def : InstRW<[VecXsPm], (instregex "VGBM$")>;
-def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
//===----------------------------------------------------------------------===//
// Vector: Loads
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "VL(L|BB)?$")>;
-def : InstRW<[LSU], (instregex "VL(32|64)$")>;
-def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H)?$")>;
-def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "VLM$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
+ (instregex "VGE(F|G)$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>;
//===----------------------------------------------------------------------===//
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>;
-def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>;
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone],
- (instregex "VSTM$")>;
-def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>;
+def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
//===----------------------------------------------------------------------===//
// Vector: Selects and permutes
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPERM$")>;
-def : InstRW<[VecXsPm], (instregex "VPDI$")>;
-def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSEL$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>;
//===----------------------------------------------------------------------===//
// Vector: Widening and narrowing
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VN(C|O)?$")>;
-def : InstRW<[VecXsPm], (instregex "VO$")>;
-def : InstRW<[VecMul], (instregex "VCKSM$")>;
-def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VX$")>;
-def : InstRW<[VecMul], (instregex "VGFM?$")>;
-def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>;
-def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VML(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VPOPCT$")>;
-
-def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>;
-def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>;
-def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>;
-def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>;
-def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>;
-
-def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>;
+def : InstRW<[WLat3, VecXsPm2, NormalGr], (instregex "VSLB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>;
+def : InstRW<[WLat3, VecXsPm2, NormalGr], (instregex "VSR(A|L)B$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point arithmetic
//===----------------------------------------------------------------------===//
// Conversion and rounding
-def : InstRW<[VecBF2], (instregex "VCD(L)?G$")>;
-def : InstRW<[VecBF2], (instregex "VCD(L)?GB$")>;
-def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>;
-def : InstRW<[VecBF2], (instregex "VC(L)?GD$")>;
-def : InstRW<[VecBF2], (instregex "VC(L)?GDB$")>;
-def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>;
-def : InstRW<[VecBF2], (instregex "VL(DE|ED)$")>;
-def : InstRW<[VecBF2], (instregex "VL(DE|ED)B$")>;
-def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
-def : InstRW<[VecBF2], (instregex "VFI$")>;
-def : InstRW<[VecBF2], (instregex "VFIDB$")>;
-def : InstRW<[VecBF], (instregex "WFIDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VCD(L)?G$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VCD(L)?GB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WCD(L)?GB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VC(L)?GD$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VC(L)?GDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WC(L)?GDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VL(DE|ED)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFI$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFIDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFIDB$")>;
// Sign operations
-def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>;
// Test data class
-def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
// Add / subtract
-def : InstRW<[VecBF2], (instregex "VF(A|S)$")>;
-def : InstRW<[VecBF2], (instregex "VF(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
// Multiply / multiply-and-add/subtract
-def : InstRW<[VecBF2], (instregex "VFM$")>;
-def : InstRW<[VecBF2], (instregex "VFMDB$")>;
-def : InstRW<[VecBF], (instregex "WFMDB$")>;
-def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>;
-def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB$")>;
// Divide / square root
-def : InstRW<[VecFPd], (instregex "VFD$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
-def : InstRW<[VecFPd], (instregex "VFSQ$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)$")>;
-def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm], (instregex "WFC(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VFC(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WFC(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
+def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)$")>;
+def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LEFR$")>;
-def : InstRW<[FXb, Lat4], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
+def : InstRW<[WLat4, FXb, NormalGr], (instregex "LFER$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecStr], (instregex "VFAE(B)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>;
-def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>;
// -------------------------------- System ---------------------------------- //
@@ -1374,156 +1400,150 @@ def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXa, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat30, MCD], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXb, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXb, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXb, Lat30], (instregex "TB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXb, Lat30], (instregex "PR$")>;
-def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXb, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>;
-def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
- (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXb, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXb, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXb], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXb, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LPP$")>;
-def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXb, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXb, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index 698eb5627d19..515f968e5091 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -10,13 +10,15 @@
// This file defines the machine model for Z14 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def Z14Model : SchedMachineModel {
let UnsupportedFeatures = Arch12UnsupportedFeatures.List;
- let IssueWidth = 8;
+ let IssueWidth = 6; // Number of instructions decoded per cycle.
let MicroOpBufferSize = 60; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -27,37 +29,39 @@ def Z14Model : SchedMachineModel {
}
let SchedModel = Z14Model in {
-
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
-def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
- let BeginGroup = 1;
- let EndGroup = 1;
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
}
-def : WriteRes<BeginGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
let BeginGroup = 1;
}
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
+def : WriteRes<GroupAlone, []> {
+ let NumMicroOps = 3;
+ let BeginGroup = 1;
let EndGroup = 1;
}
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+}
// Execution units.
def Z14_FXaUnit : ProcResource<2>;
@@ -66,33 +70,39 @@ def Z14_LSUnit : ProcResource<2>;
def Z14_VecUnit : ProcResource<2>;
def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
def Z14_VBUnit : ProcResource<2>;
+def Z14_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXa, [Z14_FXaUnit]> { let Latency = 1; }
-def : WriteRes<FXa2, [Z14_FXaUnit, Z14_FXaUnit]> { let Latency = 2; }
-def : WriteRes<FXb, [Z14_FXbUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [Z14_LSUnit]> { let Latency = 4; }
-def : WriteRes<VecBF, [Z14_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecBF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDF, [Z14_VecUnit]> { let Latency = 8; }
-def : WriteRes<VecDF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
-def : WriteRes<VecDFX, [Z14_VecUnit]> { let Latency = 1; }
-def : WriteRes<VecDFX2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 2; }
-def : WriteRes<VecFPd, [Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
- Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit]>
- { let Latency = 30; }
-def : WriteRes<VecMul, [Z14_VecUnit]> { let Latency = 5; }
-def : WriteRes<VecStr, [Z14_VecUnit]> { let Latency = 4; }
-def : WriteRes<VecXsPm, [Z14_VecUnit]> { let Latency = 3; }
-def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
+let NumMicroOps = 0 in {
+ def : WriteRes<FXa, [Z14_FXaUnit]>;
+ def : WriteRes<FXb, [Z14_FXbUnit]>;
+ def : WriteRes<LSU, [Z14_LSUnit]>;
+ def : WriteRes<VecBF, [Z14_VecUnit]>;
+ def : WriteRes<VecDF, [Z14_VecUnit]>;
+ def : WriteRes<VecDFX, [Z14_VecUnit]>;
+ def : WriteRes<VecMul, [Z14_VecUnit]>;
+ def : WriteRes<VecStr, [Z14_VecUnit]>;
+ def : WriteRes<VecXsPm, [Z14_VecUnit]>;
+ foreach Num = 2-5 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z14_FXaUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z14_FXbUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z14_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z14_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z14_VecUnit]>;
+ }}
+
+ def : WriteRes<VecFPd, [Z14_VecFPdUnit]> { let ResourceCycles = [30]; }
+
+ def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
+}
+
+def : WriteRes<MCD, [Z14_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
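// [Illustrative note, not part of the patch] MCD is used below for the very
// expensive (presumably millicoded) instructions such as MVCL, CLST and PLO:
// their operands are given WLat30 latencies, and with BeginGroup and EndGroup
// both set, the 3-uop MCD write keeps such an instruction in a decoder group
// of its own.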
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -106,27 +116,28 @@ def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>;
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>;
-def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
-def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb2, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb2, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -134,609 +145,627 @@ def : InstRW<[FXb, FXb, Lat2, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
-def : InstRW<[FXb], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXa], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXa], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LZR(F|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
// Load and test
-def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXa], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr],
+ (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXa], (instregex "LTGFR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
// Load and zero rightmost byte
-def : InstRW<[LSU], (instregex "LLZRGF$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>;
// Load and trap
-def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
-// Store multiple (estimated average of ceil(5/2) FXb ops)
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
- GroupAlone], (instregex "STM(G|H|Y)?$")>;
+// Store multiple
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address ( -> larl )
-def : InstRW<[FXa], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "LP(G)?R$")>;
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXa], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "IILF(64)?$")>;
-def : InstRW<[FXa], (instregex "IILH(64)?$")>;
-def : InstRW<[FXa], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>;
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
-def : InstRW<[FXa], (instregex "AIH$")>;
-def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
-def : InstRW<[FXa], (instregex "AGFI$")>;
-def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AGR(K)?$")>;
-def : InstRW<[FXa], (instregex "AHI(K)?$")>;
-def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
-def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
-def : InstRW<[FXa], (instregex "ALGHSIK$")>;
-def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXa], (instregex "ALR(K)?$")>;
-def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "A(Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AL(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "ALG(F)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (16/32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>;
-def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG(F|H)$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>;
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
-def : InstRW<[FXa], (instregex "SGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLFI$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXa], (instregex "SLR(K)?$")>;
-def : InstRW<[FXa], (instregex "SR(K)?$")>;
-def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (16/32 -> 64)
-def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>;
-def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SG(F|H)$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "NGR(K)?$")>;
-def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "NILF(64)?$")>;
-def : InstRW<[FXa], (instregex "NILH(64)?$")>;
-def : InstRW<[FXa], (instregex "NILL(64)?$")>;
-def : InstRW<[FXa], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXa], (instregex "OGR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXa], (instregex "OILF(64)?$")>;
-def : InstRW<[FXa], (instregex "OILH(64)?$")>;
-def : InstRW<[FXa], (instregex "OILL(64)?$")>;
-def : InstRW<[FXa], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXa], (instregex "XIFMux$")>;
-def : InstRW<[FXa], (instregex "XGR(K)?$")>;
-def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXa], (instregex "XILF(64)?$")>;
-def : InstRW<[FXa], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>;
-def : InstRW<[FXa, Lat7], (instregex "MSGR$")>;
-def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXa, Lat4], (instregex "MGHI$")>;
-def : InstRW<[FXa, Lat4], (instregex "MHI$")>;
-def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>;
-def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>;
-def : InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>;
-def : InstRW<[FXa, FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>;
-def : InstRW<[FXa, FXa, Lat8, GroupAlone], (instregex "MGRK$")>;
-def : InstRW<[FXa, LSU, Lat9], (instregex "MSC$")>;
-def : InstRW<[FXa, LSU, Lat11], (instregex "MSGC$")>;
-def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>;
-def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat7, FXa, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat6, FXa2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>;
+def : InstRW<[WLat12, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>;
+def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MGRK$")>;
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MSC$")>;
+def : InstRW<[WLat8LSU, WLat8LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MSGC$")>;
+def : InstRW<[WLat6, WLat6, FXa, NormalGr], (instregex "MSRKC$")>;
+def : InstRW<[WLat8, WLat8, FXa, NormalGr], (instregex "MSGRKC$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
-def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
-def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
-def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
-def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
-def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "D$")>;
+def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone], (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
-def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXa], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXb], (instregex "C(G)?R$")>;
-def : InstRW<[FXb], (instregex "CIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXb], (instregex "CLGR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXb], (instregex "CLIH$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXb], (instregex "CLR$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "C(G|Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>;
def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
-def : InstRW<[FXb], (instregex "NIAI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
- (instregex "CDSG$")>;
+def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3,
+ GroupAlone], (instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone],
+ (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>;
-def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>;
-def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR|A)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(KIMD|KLMD|KMAC)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(PCC|PPNO|PRNO)$")>;
//===----------------------------------------------------------------------===//
// Guarded storage
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "LGG$")>;
-def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>;
-def : InstRW<[LSU, Lat30], (instregex "(L|ST)GSC$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>;
+def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone],
(instregex "CVBG$")>;
-def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
-def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;
-
-def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
+def : InstRW<[WLat30, RegReadAdv, FXb, VecDF, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
+
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
-def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXb], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//
// Transaction begin
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
- (instregex "TBEGIN(C|_nofloat)?$")>;
+def : InstRW<[WLat9, LSU2, FXb5, GroupAlone], (instregex "TBEGIN(C)?$")>;
// Transaction end
-def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>;
// Transaction abort
-def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
// Extract Transaction Nesting Depth
-def : InstRW<[FXa], (instregex "ETND$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>;
// Nontransactional store
-def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>;
//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, GroupAlone], (instregex "PPA$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXa], (instregex "AEXT128$")>;
-def : InstRW<[FXa], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
-def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>;
//===----------------------------------------------------------------------===//
// .insn directive instructions
@@ -750,168 +779,158 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>;
-def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[VecXsPm], (instregex "LER$")>;
-def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone],
+ (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[VecBF], (instregex "LDEBR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
-def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked],
+ (instregex "C(F|G)(E|D)BR(A)?$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLFDBR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
-def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr],
+ (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -921,108 +940,111 @@ def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[VecBF], (instregex "LEXR$")>;
-def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXb], (instregex "LDER$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat8LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat10, WLat10, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
-def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[VecBF], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[VecBF], (instregex "FIER$")>;
-def : InstRW<[VecBF], (instregex "FIDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXD$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
-def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat8, VecBF4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
-def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
-def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "C(E|D)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -1032,121 +1054,123 @@ def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[VecDF], (instregex "LTDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
-def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;
+def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[VecDF], (instregex "LDETR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone], (instregex "CXL(F|G)TR$")>;
// Convert to fixed / logical
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
-def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked],
+ (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Convert from / to zoned
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>;
// Convert from / to packed
-def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
-def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
-def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
-def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone], (instregex "CXPT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>;
// Perform floating-point operation
-def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[VecDF], (instregex "FIDTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
-def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>;
+def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[VecDF], (instregex "QADTR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>;
+def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>;
-def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecDF], (instregex "(K|C)DTR$")>;
-def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[VecDF], (instregex "CEDTR$")>;
-def : InstRW<[VecDF], (instregex "CEXTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>;
// --------------------------------- Vector --------------------------------- //
@@ -1155,298 +1179,307 @@ def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
// Vector: Move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "VLR(32|64)?$")>;
-def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>;
-def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>;
-def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Immediate instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VZERO$")>;
-def : InstRW<[VecXsPm], (instregex "VONE$")>;
-def : InstRW<[VecXsPm], (instregex "VGBM$")>;
-def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
//===----------------------------------------------------------------------===//
// Vector: Loads
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "VL(L|BB)?$")>;
-def : InstRW<[LSU], (instregex "VL(32|64)$")>;
-def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
-def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "VLM$")>;
-def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
+ (instregex "VGE(F|G)$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>;
-def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>;
-def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>;
-def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone],
- (instregex "VSTM$")>;
-def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>;
-def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>;
+def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Selects and permutes
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPERM$")>;
-def : InstRW<[VecXsPm], (instregex "VPDI$")>;
-def : InstRW<[VecXsPm], (instregex "VBPERM$")>;
-def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSEL$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>;
//===----------------------------------------------------------------------===//
// Vector: Widening and narrowing
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>;
-def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>;
-def : InstRW<[VecXsPm], (instregex "VO(C)?$")>;
-def : InstRW<[VecMul], (instregex "VCKSM$")>;
-def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VX$")>;
-def : InstRW<[VecMul], (instregex "VGFM?$")>;
-def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>;
-def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VML(B|F)?$")>;
-def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>;
-def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>;
-def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSLB$")>;
-def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>;
-def : InstRW<[VecXsPm], (instregex "VSR(A|L)B$")>;
-
-def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
-def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>;
-def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>;
-
-def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>;
-def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VMSL(G)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Integer comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>;
-def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point arithmetic
//===----------------------------------------------------------------------===//
// Conversion and rounding
-def : InstRW<[VecBF], (instregex "VCD(L)?G$")>;
-def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>;
-def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>;
-def : InstRW<[VecBF], (instregex "VC(L)?GD$")>;
-def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>;
-def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>;
-def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>;
-def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>;
-def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
-def : InstRW<[VecBF], (instregex "VFL(L|R)$")>;
-def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>;
-def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>;
-def : InstRW<[VecBF2], (instregex "WFLLD$")>;
-def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>;
-def : InstRW<[VecBF2], (instregex "VFI$")>;
-def : InstRW<[VecBF], (instregex "VFIDB$")>;
-def : InstRW<[VecBF], (instregex "WFIDB$")>;
-def : InstRW<[VecBF2], (instregex "VFISB$")>;
-def : InstRW<[VecBF], (instregex "WFISB$")>;
-def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VCD(L)?G$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VCD(L)?GB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WCD(L)?GB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VC(L)?GD$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VC(L)?GDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WC(L)?GDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VL(DE|ED)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFL(L|R)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "WFLLD$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFLRX$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFI$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFIDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFIDB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFISB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFISB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFIXB$")>;
// Sign operations
-def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>;
-def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>;
-def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>;
// Minimum / maximum
-def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>;
-def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>;
-def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>;
-def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>;
// Test data class
-def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>;
-def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
-def : InstRW<[VecBF2], (instregex "VF(A|S)$")>;
-def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>;
-def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>;
-def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>;
-def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
-def : InstRW<[VecBF2], (instregex "VFM$")>;
-def : InstRW<[VecBF], (instregex "VFMDB$")>;
-def : InstRW<[VecBF], (instregex "WFMDB$")>;
-def : InstRW<[VecBF2], (instregex "VFMSB$")>;
-def : InstRW<[VecBF], (instregex "WFMSB$")>;
-def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>;
-def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>;
-def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>;
-def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>;
-def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFMDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMSB$")>;
+def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
-def : InstRW<[VecFPd], (instregex "VFD$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>;
-def : InstRW<[VecFPd], (instregex "WFDXB$")>;
-def : InstRW<[VecFPd], (instregex "VFSQ$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
-def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>;
-def : InstRW<[VecFPd], (instregex "WFSQXB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>;
-def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>;
-def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>;
-def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>;
-def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>;
-def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+ (instregex "WF(C|K)(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+ (instregex "VF(C|K)(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>;
+def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>;
//===----------------------------------------------------------------------===//
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LEFR$")>;
-def : InstRW<[FXb, Lat4], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecStr], (instregex "VFAE(B)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>;
-def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
-def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
-def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
-def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>;
//===----------------------------------------------------------------------===//
// Vector: Packed-decimal instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[VecDF, VecDF, Lat10], (instregex "VLIP$")>;
-def : InstRW<[VecDFX, LSU, GroupAlone], (instregex "VPKZ$")>;
-def : InstRW<[VecDFX, FXb, LSU, Lat12, BeginGroup], (instregex "VUPKZ$")>;
-def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>;
-def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>;
-def : InstRW<[VecDFX], (instregex "V(A|S)P$")>;
-def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>;
-def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>;
-def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>;
-def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>;
-def : InstRW<[VecDFX], (instregex "VPSOP$")>;
-def : InstRW<[VecDFX], (instregex "V(T|C)P$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "VLIP$")>;
+def : InstRW<[WLat6, VecDFX, LSU, GroupAlone], (instregex "VPKZ$")>;
+def : InstRW<[WLat1, VecDFX, FXb, LSU, Cracked], (instregex "VUPKZ$")>;
+def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone], (instregex "VCVB(G)?$")>;
+def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone], (instregex "VCVD(G)?$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VM(S)?P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "V(D|R)P$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "VSDP$")>;
+def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRP$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>;
// -------------------------------- System ---------------------------------- //
@@ -1455,157 +1488,151 @@ def : InstRW<[VecDFX], (instregex "V(T|C)P$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXa, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat20, GroupAlone], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXb, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXb, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXb, Lat30], (instregex "IRBM$")>;
-def : InstRW<[FXb, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXb, Lat30], (instregex "TB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "IRBM$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXb, Lat30], (instregex "PR$")>;
-def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXb, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>;
-def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
- (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXb, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXb, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXb, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXb], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXb, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb], (instregex "LPP$")>;
-def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXb, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXb, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXb, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
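(Editorial reading aid, not part of the patch.) In the rewritten entries above, the leading WLat<N> writes carry the latency assigned to each result operand, the unit names that follow (FXa, FXb, LSU, VecDF and so on) book execution resources, and the final entry (NormalGr, Cracked, GroupAlone or MCD) describes how the instruction fills a decoder group, with MCD additionally booking the dedicated microcode-style resource defined further below. The operand-to-write matching follows the generic InstRW rules and is an inference here, not something the patch itself states. Two entries taken from the hunk above, with that reading spelled out:

  def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
  // One result with 1-cycle latency; one FXb slot and one LSU slot;
  // decodes as a group of its own.

  def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
  // Two results, both modelled with a pessimistic 30-cycle latency;
  // handled via the MCD resource, which begins and ends its own group.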
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index 4d986e8391cf..3012b565d5ef 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -10,13 +10,15 @@
// This file defines the machine model for Z196 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def Z196Model : SchedMachineModel {
let UnsupportedFeatures = Arch9UnsupportedFeatures.List;
- let IssueWidth = 5;
+ let IssueWidth = 3;
let MicroOpBufferSize = 40; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -27,48 +29,65 @@ def Z196Model : SchedMachineModel {
}
let SchedModel = Z196Model in {
-
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
+}
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
+ let BeginGroup = 1;
+}
def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
+ let NumMicroOps = 3;
let BeginGroup = 1;
let EndGroup = 1;
}
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
- let EndGroup = 1;
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in {
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+ }
}
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
// Execution units.
def Z196_FXUnit : ProcResource<2>;
def Z196_LSUnit : ProcResource<2>;
def Z196_FPUnit : ProcResource<1>;
def Z196_DFUnit : ProcResource<1>;
+def Z196_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; }
-def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; }
-def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; }
-def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
-def : WriteRes<DFU, [Z196_DFUnit]> { let Latency = 2; }
-def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
+let NumMicroOps = 0 in {
+ def : WriteRes<FXU, [Z196_FXUnit]>;
+ def : WriteRes<LSU, [Z196_LSUnit]>;
+ def : WriteRes<FPU, [Z196_FPUnit]>;
+ def : WriteRes<DFU, [Z196_DFUnit]>;
+ foreach Num = 2-6 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXU"#Num), [Z196_FXUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z196_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FPU"#Num), [Z196_FPUnit]>;
+ def : WriteRes<!cast<SchedWrite>("DFU"#Num), [Z196_DFUnit]>;
+ }}
+}
+
+def : WriteRes<MCD, [Z196_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
// -------------------------- INSTRUCTIONS ---------------------------------- //
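(Editorial sketch, not part of the patch.) For a typical register-memory instruction the pieces defined above combine as follows: the WLat<N>LSU writes used below are presumably composite writes (WLat<N> followed by the 4-cycle LSULatency) declared in the shared SystemZSchedule.td, RegReadAdv is the ReadAdvance that lets the register operand paired with the memory operand arrive up to 4 cycles late without stalling, the unit writes (FXU, LSU, FPU, DFU and their FXU2..FXU6 variants) reserve the units for one or more cycles, and the trailing grouping write supplies the micro-op count. One of the multiply entries further down then reads:

  def : InstRW<[WLat6LSU, RegReadAdv, FXU, LSU, NormalGr],
               (instregex "MS(GF|Y)?$")>;
  // Result available after roughly 6 + 4 cycles (latency write plus the
  // LSU load latency); the register source tolerates a late producer;
  // one FXU slot and one LSU slot; a single micro-op in a normal group.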
@@ -82,26 +101,26 @@ def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[LSU, EndGroup], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCT(G|H)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BRCT(G|H)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXU3, LSU, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+def : InstRW<[WLat1, FXU, LSU, GroupAlone],
(instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+def : InstRW<[WLat1, FXU, LSU, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -109,546 +128,558 @@ def : InstRW<[FXU, LSU, Lat5, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[LSU, EndGroup], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, LSU, FXU2, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
-def : InstRW<[LSU_lat1, EndGroup], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "CondReturn$")>;
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXU, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR(Mux)?$")>;
// Load and test
-def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXU, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2, EndGroup], (instregex "LOC(G)?R(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat6, EndGroup], (instregex "LOC(G)?(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "STOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXU, EndGroup], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, EndGroup],
+ (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, EndGroup], (instregex "STOC(G)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXU], (instregex "LTGFR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLG(C|F|H|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
// Store multiple (estimated average of 3 ops)
-def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
- (instregex "STM(H|Y|G)?$")>;
+def : InstRW<[WLat1, LSU2, FXU5, GroupAlone], (instregex "STM(H|Y|G)?$")>;
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address
-def : InstRW<[FXU], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "IILF(64)?$")>;
-def : InstRW<[FXU], (instregex "IILH(64)?$")>;
-def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILL(64)?$")>;
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
-def : InstRW<[FXU], (instregex "AIH$")>;
-def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "AGFI$")>;
-def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AGR(K)?$")>;
-def : InstRW<[FXU], (instregex "AHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
-def : InstRW<[FXU], (instregex "ALGHSIK$")>;
-def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXU], (instregex "ALR(K)?$")>;
-def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?SI$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ALGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?G$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?GSI$")>;
// Logical addition with carry
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (32 -> 64)
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AGF$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "AGF$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "AGFR$")>;
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SH(Y)?$")>;
-def : InstRW<[FXU], (instregex "SGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLFI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLR(K)?$")>;
-def : InstRW<[FXU], (instregex "SR(K)?$")>;
-def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (32 -> 64)
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SGF$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "SGF$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "NGR(K)?$")>;
-def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "NILF(64)?$")>;
-def : InstRW<[FXU], (instregex "NILH(64)?$")>;
-def : InstRW<[FXU], (instregex "NILL(64)?$")>;
-def : InstRW<[FXU], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "OGR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "OILF(64)?$")>;
-def : InstRW<[FXU], (instregex "OILH(64)?$")>;
-def : InstRW<[FXU], (instregex "OILL(64)?$")>;
-def : InstRW<[FXU], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXU], (instregex "XIFMux$")>;
-def : InstRW<[FXU], (instregex "XGR(K)?$")>;
-def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "XILF(64)?$")>;
-def : InstRW<[FXU], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
-def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
-def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
-def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat8, FXU, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "MLG$")>;
+def : InstRW<[WLat9, FXU2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat7, FXU2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "D$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "D$")>;
+def : InstRW<[WLat30, FPU4, FXU4, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU3, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DL(G)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXU4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXU], (instregex "RISBG(32)?$")>;
-def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(32)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXU], (instregex "C(G)?R$")>;
-def : InstRW<[FXU], (instregex "CIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "CLGR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXU], (instregex "CLIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXU], (instregex "CLR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXU, FXU, LSU, Lat6, Lat2, GroupAlone], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat9, FXU, LSU2, GroupAlone], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, GroupAlone], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PFD(RL)?$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXU2, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat5LSU, WLat5LSU, FXU5, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+def : InstRW<[WLat12, WLat12, FXU6, LSU2, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXU2, LSU2, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>;
-def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat30, RegReadAdv, FXU, DFU2, LSU2, GroupAlone],
+ (instregex "CVBG$")>;
+def : InstRW<[WLat20, RegReadAdv, FXU, DFU, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXU3, DFU4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXU2, DFU, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat10, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone],
+def : InstRW<[WLat11LSU, FXU, DFU4, LSU2, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
-def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXU, DFU4, LSU2, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXU2, DFU4, LSU3, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat11, DFU4, LSU2, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat5LSU, DFU2, LSU2, GroupAlone], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXU, GroupAlone], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXU, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXU], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat7, WLat7, FXU2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXU], (instregex "AEXT128$")>;
-def : InstRW<[FXU], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXU, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -665,167 +696,155 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
-def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>;
// Load
-def : InstRW<[FXU], (instregex "LER$")>;
-def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
-def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, FPU2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[FPU], (instregex "LDEBR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, FPU, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A)?$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A)?$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A)?$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)BR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone],
+ (instregex "C(F|G)XBR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "CL(F|G)(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
-def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat30, FPU2, NormalGr], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat15, FPU, LSU, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat15, FPU4, LSU, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[FXU, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXU, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat2, FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -835,108 +854,111 @@ def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[FPU], (instregex "LEXR$")>;
-def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXU], (instregex "LDER$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat9, FPU4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat10, FXU, FPU4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat30, WLat30, FXU, FPU2, GroupAlone], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[FPU], (instregex "THD(E)?R$")>;
-def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIER$")>;
-def : InstRW<[FPU], (instregex "FIDR$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>;
-def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(D|EE)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(DE|E)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|EE)R$")>;
+def : InstRW<[WLat8, FPU, NormalGr], (instregex "M(DE|E)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[FPU], (instregex "C(E|D)R$")>;
-def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "C(E|D)$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat15, FPU2, NormalGr], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -946,114 +968,115 @@ def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat4, WLat4, DFU, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat6, WLat6, DFU4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
-def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat30, DFU2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat7, DFU, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat6, DFU4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>;
-def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDFTR$")>;
+def : InstRW<[WLat30, FXU, DFU, GroupAlone], (instregex "CDGTR(A)?$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXFTR(A)?$")>;
+def : InstRW<[WLat30, FXU, DFU4, GroupAlone], (instregex "CXGTR(A)?$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat9, FXU, DFU4, GroupAlone], (instregex "CXLFTR$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXLGTR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CFDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU, GroupAlone], (instregex "CGDTR(A)?$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CFXTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU2, GroupAlone], (instregex "CGXTR(A)?$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat8, FXU2, DFU4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat12, FXU2, DFU4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Perform floating-point operation
-def : InstRW<[FXU, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, DFU, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, DFU4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
-def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, DFU, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, DFU4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "RRDTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat7LSU, LSU, DFU, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, DFU4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "IEDTR$")>;
+def : InstRW<[WLat7, FXU, DFU4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
-def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat9, DFU, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat10, DFU2, NormalGr], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
-def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>;
+def : InstRW<[WLat4, DFU, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat5, DFU2, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat9, LSU, DFU, NormalGr], (instregex "TD(C|G)DT$")>;
+def : InstRW<[WLat10, LSU, DFU, NormalGr], (instregex "TD(C|G)ET$")>;
+def : InstRW<[WLat10, LSU, DFU2, NormalGr], (instregex "TD(C|G)XT$")>;
// -------------------------------- System ---------------------------------- //
@@ -1062,156 +1085,151 @@ def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat30, MCD], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXU, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat10, WLat10, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXU, Lat30], (instregex "TB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>;
-def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
-def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKPF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXU, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXU, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LPP$")>;
-def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index a0f2115eb9d7..892f493570d1 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -10,13 +10,15 @@
// This file defines the machine model for ZEC12 to support instruction
// scheduling and other instruction cost heuristics.
//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
//===----------------------------------------------------------------------===//
def ZEC12Model : SchedMachineModel {
let UnsupportedFeatures = Arch10UnsupportedFeatures.List;
- let IssueWidth = 5;
+ let IssueWidth = 3;
let MicroOpBufferSize = 40; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -26,34 +28,41 @@ def ZEC12Model : SchedMachineModel {
let MispredictPenalty = 16;
}
-let SchedModel = ZEC12Model in {
+let SchedModel = ZEC12Model in {
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of the same file.
-// These definitions could be put in a subtarget common include file,
-// but it seems the include system in Tablegen currently rejects
-// multiple includes of same file.
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
+}
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
+ let BeginGroup = 1;
+}
def : WriteRes<GroupAlone, []> {
- let NumMicroOps = 0;
+ let NumMicroOps = 3;
let BeginGroup = 1;
let EndGroup = 1;
}
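These grouping writes model the zEC12 decoder, which (as the IssueWidth of 3 above suggests) forms groups of up to three instructions: NormalGr is an ordinary one-micro-op slot, BeginGroup/EndGroup force a group boundary before or after the instruction, Cracked stands for two micro-ops that start a new group, and GroupAlone takes a full group to itself. For instance, the MVC entry later in this file,

  def : InstRW<[WLat1, FXU, LSU3, GroupAlone], (instregex "MVC$")>;

reads as one FXU cycle plus three LSU cycles, decoded alone in its group (three micro-ops), with the leading WLat1 supplying the minimal write latency.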
-def : WriteRes<EndGroup, []> {
- let NumMicroOps = 0;
- let EndGroup = 1;
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
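Under the usual MachineScheduler semantics for ReadAdvance, a register use tagged with RegReadAdv sees its producer's result up to four cycles early, i.e. the dependence latency into that operand shrinks by up to 4 cycles, covering the address-generation and memory-access phase during which the register value is not yet needed. For example, the conditional-load entry further down,

  def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
               (instregex "LOC(G)?(Asm.*)?$")>;

gives the register operand of LOC/LOCG the read advance while the folded memory access runs on the LSU.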
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
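Plain loads therefore get their result latency through LSULatency rather than through the model's LoadLatency field, e.g. in the load entries below:

  def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;

The WLat*LSU writes used for instructions with folded memory operands presumably stack a fixed execution latency on top of this 4-cycle memory latency; their WriteRes definitions are not part of the hunks shown here.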
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in {
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+ }
}
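The foreach should expand to one zero-micro-op write per latency value, replacing the sparse Lat2..Lat30 classes removed just below; roughly:

  def : WriteRes<WLat1, []>  { let Latency = 1;  let NumMicroOps = 0; }
  def : WriteRes<WLat2, []>  { let Latency = 2;  let NumMicroOps = 0; }
  // ... continuing in steps of one, up to:
  def : WriteRes<WLat30, []> { let Latency = 30; let NumMicroOps = 0; }

(The WLat1..WLat30 SchedWrite records themselves are presumably declared in the shared SystemZ schedule definitions, outside this diff.)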
-def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
-def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
-def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
-def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
-def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
-def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
-def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
-def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
-def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
-def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
-def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
-def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
-def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
-def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
// Execution units.
def ZEC12_FXUnit : ProcResource<2>;
@@ -61,16 +70,27 @@ def ZEC12_LSUnit : ProcResource<2>;
def ZEC12_FPUnit : ProcResource<1>;
def ZEC12_DFUnit : ProcResource<1>;
def ZEC12_VBUnit : ProcResource<1>;
+def ZEC12_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
-def : WriteRes<FXU, [ZEC12_FXUnit]> { let Latency = 1; }
-def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; }
-def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; }
-def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; }
-def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; }
-def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; }
-def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; }
-def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
+let NumMicroOps = 0 in {
+ def : WriteRes<FXU, [ZEC12_FXUnit]>;
+ def : WriteRes<LSU, [ZEC12_LSUnit]>;
+ def : WriteRes<FPU, [ZEC12_FPUnit]>;
+ def : WriteRes<DFU, [ZEC12_DFUnit]>;
+ foreach Num = 2-6 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXU"#Num), [ZEC12_FXUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [ZEC12_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FPU"#Num), [ZEC12_FPUnit]>;
+ def : WriteRes<!cast<SchedWrite>("DFU"#Num), [ZEC12_DFUnit]>;
+ }}
+
+ def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
+}
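With the outer let applied, the foreach defines multi-cycle variants of each unit; for the FXU, for example, it is equivalent to:

  def : WriteRes<FXU2, [ZEC12_FXUnit]> { let ResourceCycles = [2]; let NumMicroOps = 0; }
  // ... likewise FXU3, FXU4, FXU5 ...
  def : WriteRes<FXU6, [ZEC12_FXUnit]> { let ResourceCycles = [6]; let NumMicroOps = 0; }

So an entry such as FXU3 or LSU5 keeps the corresponding unit busy for that many cycles without contributing micro-ops; the micro-op count now comes only from the grouping write (NormalGr, Cracked, GroupAlone or MCD).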
+
+def : WriteRes<MCD, [ZEC12_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
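MCD (presumably "micro-coded dispatch") acts as a catch-all for instructions the model does not break down: three micro-ops that begin and end their own decoder group on the dedicated ZEC12_MCD resource. In this file it is paired with WLat30 writes, one per result, e.g.:

  def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;

i.e. MVST is treated as a long-latency, group-alone, micro-coded operation rather than being modelled cycle by cycle.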
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -84,26 +104,27 @@ def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
// Stack allocation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ADJDYNALLOC$")>;
//===----------------------------------------------------------------------===//
// Branch instructions
//===----------------------------------------------------------------------===//
// Branch
-def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
-def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
-def : InstRW<[LSU, Lat4], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
-def : InstRW<[LSU, Lat4], (instregex "(Call)?B(R)?(Asm.*)?$")>;
-def : InstRW<[FXU, EndGroup], (instregex "BRCT(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCTH$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
-def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXU3, LSU, GroupAlone],
(instregex "B(R)?X(H|L).*$")>;
// Compare and branch
-def : InstRW<[FXU], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone],
(instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
//===----------------------------------------------------------------------===//
@@ -111,582 +132,592 @@ def : InstRW<[FXU, LSU, Lat5, GroupAlone],
//===----------------------------------------------------------------------===//
// Trap
-def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
// Compare and trap
-def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
-def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Call and return instructions
//===----------------------------------------------------------------------===//
// Call
-def : InstRW<[VBU, FXU, FXU, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+def : InstRW<[WLat1, FXU2, VBU, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
// Return
-def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
-def : InstRW<[LSU_lat1], (instregex "CondReturn$")>;
-
-//===----------------------------------------------------------------------===//
-// Select instructions
-//===----------------------------------------------------------------------===//
-
-// Select pseudo
-def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
-
-// CondStore pseudos
-def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "CondReturn$")>;
//===----------------------------------------------------------------------===//
// Move instructions
//===----------------------------------------------------------------------===//
// Moves
-def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "MVI(Y)?$")>;
// Move character
-def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, FXU, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
-def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
-def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
-def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
-def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "REG_SEQUENCE$")>;
// Loads
-def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
-def : InstRW<[LSU], (instregex "LG(RL)?$")>;
-def : InstRW<[LSU], (instregex "L128$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
-def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIL(F|H|L)$")>;
-def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
-def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR(Mux)?$")>;
// Load and trap
-def : InstRW<[FXU, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
// Load and test
-def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
-def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXU, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LT(G)?R$")>;
// Stores
-def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
// String moves.
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
//===----------------------------------------------------------------------===//
// Conditional move instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2], (instregex "LOC(G)?R(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "LOC(G)?(Asm.*)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat2, FXU, NormalGr], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STOC(G)?(Asm.*)?$")>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
-def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(B|H|F)R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
-def : InstRW<[FXU], (instregex "LTGFR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LTGFR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
-def : InstRW<[FXU], (instregex "LLG(C|H|F|T)R$")>;
-def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
-def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
-def : InstRW<[LSU], (instregex "LLHRL$")>;
-def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
// Load and trap
-def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
// Load multiple (estimated average of 5 ops)
-def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
- (instregex "LM(H|Y|G)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
// Load multiple disjoint
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
// Store multiple (estimated average of 3 ops)
-def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
- (instregex "STM(H|Y|G)?$")>;
+def : InstRW<[WLat1, LSU2, FXU5, GroupAlone], (instregex "STM(H|Y|G)?$")>;
//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LA(Y|RL)?$")>;
// Load the Global Offset Table address
-def : InstRW<[FXU], (instregex "GOT$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "GOT$")>;
//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
-def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
-def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXU2, GroupAlone], (instregex "LCGFR$")>;
//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
-def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "IILF(64)?$")>;
-def : InstRW<[FXU], (instregex "IILH(64)?$")>;
-def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "IILL(64)?$")>;
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
-def : InstRW<[FXU], (instregex "AIH$")>;
-def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
-def : InstRW<[FXU], (instregex "AGFI$")>;
-def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AGR(K)?$")>;
-def : InstRW<[FXU], (instregex "AHI(K)?$")>;
-def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
-def : InstRW<[FXU], (instregex "ALGHSIK$")>;
-def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
-def : InstRW<[FXU], (instregex "ALR(K)?$")>;
-def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
-def : InstRW<[FXU, Lat2], (instregex "A(L)?HHLR$")>;
-def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?(Y)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?SI$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "ALGF$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "A(L)?G$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "A(L)?GSI$")>;
// Logical addition with carry
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "ALC(G)?R$")>;
// Add with sign extension (32 -> 64)
-def : InstRW<[FXU, LSU, Lat6], (instregex "AGF$")>;
-def : InstRW<[FXU, Lat2], (instregex "AGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "AGF$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "AGFR$")>;
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "SH(Y)?$")>;
-def : InstRW<[FXU], (instregex "SGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLFI$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
-def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
-def : InstRW<[FXU], (instregex "SLR(K)?$")>;
-def : InstRW<[FXU], (instregex "SR(K)?$")>;
-def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
-def : InstRW<[FXU, Lat2], (instregex "S(L)?HHLR$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
-def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXU, GroupAlone], (instregex "SLB(G)?R$")>;
// Subtraction with sign extension (32 -> 64)
-def : InstRW<[FXU, LSU, Lat6], (instregex "SGF$")>;
-def : InstRW<[FXU, Lat2], (instregex "SGFR$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "SGF$")>;
+def : InstRW<[WLat2, WLat2, FXU, NormalGr], (instregex "SGFR$")>;
//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "NGR(K)?$")>;
-def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
-def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "NILF(64)?$")>;
-def : InstRW<[FXU], (instregex "NILH(64)?$")>;
-def : InstRW<[FXU], (instregex "NILL(64)?$")>;
-def : InstRW<[FXU], (instregex "NR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "NC$")>;
//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
-def : InstRW<[FXU], (instregex "OGR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
-def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
-def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
-def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
-def : InstRW<[FXU], (instregex "OILF(64)?$")>;
-def : InstRW<[FXU], (instregex "OILH(64)?$")>;
-def : InstRW<[FXU], (instregex "OILL(64)?$")>;
-def : InstRW<[FXU], (instregex "OR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "OC$")>;
//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
-def : InstRW<[FXU], (instregex "XIFMux$")>;
-def : InstRW<[FXU], (instregex "XGR(K)?$")>;
-def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
-def : InstRW<[FXU], (instregex "XILF(64)?$")>;
-def : InstRW<[FXU], (instregex "XR(K)?$")>;
-def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat5LSU, LSU2, FXU, GroupAlone], (instregex "XC$")>;
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
-def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
-def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
-def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
-def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
-def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
-def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat8, FXU, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat6, FXU, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "MLG$")>;
+def : InstRW<[WLat9, FXU2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat5, FXU, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat7, FXU2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXU2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "D$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DSG(F)?$")>;
-def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?R$")>;
-def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
- (instregex "DL(G)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "D$")>;
+def : InstRW<[WLat30, FPU4, FXU4, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU3, GroupAlone],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat30, FPU4, FXU5, GroupAlone], (instregex "DL(G)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU4, LSU, FXU4, GroupAlone],
+ (instregex "DL(G)?$")>;
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SLA(G|K)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone],
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXU4, LSU, GroupAlone],
(instregex "S(L|R)D(A|L)$")>;
// Rotate
-def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "RLL(G)?$")>;
// Rotate and insert
-def : InstRW<[FXU], (instregex "RISBG(N|32)?$")>;
-def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
-def : InstRW<[FXU], (instregex "RISBMux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBMux$")>;
// Rotate and Select
-def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+def : InstRW<[WLat3, WLat3, FXU2, GroupAlone], (instregex "R(N|O|X)SBG$")>;
//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
-def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
-def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
-def : InstRW<[FXU], (instregex "C(G)?R$")>;
-def : InstRW<[FXU], (instregex "CIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
-def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
-def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
-def : InstRW<[FXU], (instregex "CLGR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
-def : InstRW<[FXU], (instregex "CLIH$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
-def : InstRW<[FXU], (instregex "CLR$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
-def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
-def : InstRW<[FXU, Lat2], (instregex "C(L)?HLR$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXU, NormalGr], (instregex "C(L)?HLR$")>;
// Compare halfword
-def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
-def : InstRW<[FXU, LSU, Lat6], (instregex "CGH(RL)?$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXU2, LSU, GroupAlone], (instregex "CHHSI$")>;
// Compare with sign extension (32 -> 64)
-def : InstRW<[FXU, LSU, Lat6], (instregex "CGF(RL)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXU, NormalGr], (instregex "CGFR$")>;
// Compare logical character
-def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+def : InstRW<[WLat9, FXU, LSU2, GroupAlone], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
// Test under mask
-def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
-def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
-def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
-def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TMLL(64)?$")>;
// Compare logical characters under mask
-def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXU, LSU, NormalGr],
+ (instregex "CLM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
-def : InstRW<[LSU], (instregex "BP(R)?P$")>;
-def : InstRW<[FXU], (instregex "NIAI$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "BP(R)?P$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "NIAI$")>;
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "Serialize$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, FXU, LSU, NormalGr], (instregex "LAX(G)?$")>;
// Test and set
-def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+def : InstRW<[WLat1LSU, FXU, LSU, EndGroup], (instregex "TS$")>;
// Compare and swap
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXU2, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
// Compare double and swap
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+def : InstRW<[WLat5LSU, WLat5LSU, FXU5, LSU, GroupAlone],
(instregex "CDS(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+def : InstRW<[WLat12, WLat12, FXU6, LSU2, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>;
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
// Perform locked operation
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
// Load/store pair from/to quadword
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
-def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXU2, LSU2, GroupAlone], (instregex "STPQ$")>;
// Load pair disjoint
-def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
-def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXU3, LSU2, GroupAlone],
+ (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>;
-def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
-def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone],
- (instregex "CVDG$")>;
-def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
-def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat30, RegReadAdv, FXU, DFU2, LSU2, GroupAlone],
+ (instregex "CVBG$")>;
+def : InstRW<[WLat20, RegReadAdv, FXU, DFU, LSU, GroupAlone],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXU3, DFU4, LSU, GroupAlone], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXU2, DFU, LSU, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat10, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXU, LSU2, GroupAlone], (instregex "UNPK$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone],
+def : InstRW<[WLat11LSU, FXU, DFU4, LSU2, GroupAlone],
(instregex "(A|S|ZA)P$")>;
-def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone],
- (instregex "(M|D)P$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone],
- (instregex "SRP$")>;
-def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
-def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+def : InstRW<[WLat1, FXU, DFU4, LSU2, GroupAlone], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXU2, DFU4, LSU3, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat11, DFU4, LSU2, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat5LSU, DFU2, LSU2, GroupAlone], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
// Extract/set/copy access register
-def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
// Load address extended
-def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+def : InstRW<[WLat5, LSU, FXU, GroupAlone], (instregex "LAE(Y)?$")>;
// Load/store access multiple (not modeled precisely)
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//
// Insert Program Mask
-def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+def : InstRW<[WLat3, FXU, EndGroup], (instregex "IPM$")>;
// Set Program Mask
-def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
// Branch and link
-def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BAL(R)?$")>;
// Test addressing mode
-def : InstRW<[FXU], (instregex "TAM$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TAM$")>;
// Set addressing mode
-def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
// Branch (and save) and set mode.
-def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
-def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXU2, LSU, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//
// Transaction begin
-def : InstRW<[LSU, LSU, FXU, FXU, FXU, FXU, FXU, Lat15, GroupAlone],
- (instregex "TBEGIN(C|_nofloat)?$")>;
+def : InstRW<[WLat9, LSU2, FXU5, GroupAlone], (instregex "TBEGIN(C)?$")>;
// Transaction end
-def : InstRW<[LSU, GroupAlone], (instregex "TEND$")>;
+def : InstRW<[WLat4, LSU, GroupAlone], (instregex "TEND$")>;
// Transaction abort
-def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
// Extract Transaction Nesting Depth
-def : InstRW<[FXU], (instregex "ETND$")>;
+def : InstRW<[WLat30, MCD], (instregex "ETND$")>;
// Nontransactional store
-def : InstRW<[FXU, LSU, Lat5], (instregex "NTSTG$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "NTSTG$")>;
//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "PPA$")>;
+def : InstRW<[WLat30, MCD], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[WLat7, WLat7, FXU2, GroupAlone], (instregex "FLOGR$")>;
// Population count
-def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
-
-// Extend
-def : InstRW<[FXU], (instregex "AEXT128$")>;
-def : InstRW<[FXU], (instregex "ZEXT128$")>;
+def : InstRW<[WLat3, WLat3, FXU, NormalGr], (instregex "POPCNT$")>;
// String instructions
-def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
-def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
// Various complex instructions
-def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>;
-def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
-def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -703,167 +734,155 @@ def : InstRW<[], (instregex "Insn.*")>;
// ----------------------------- Floating point ----------------------------- //
//===----------------------------------------------------------------------===//
-// FP: Select instructions
-//===----------------------------------------------------------------------===//
-
-def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
-def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
-def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
-
-//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>;
// Load
-def : InstRW<[FXU], (instregex "LER$")>;
-def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
-def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
-def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>;
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
-def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
- (instregex "LTXBRCompare(_VecPseudo)?$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "LTXBR(Compare)?$")>;
// Copy sign
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
-def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>;
//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU], (instregex "LE(Y)?$")>;
-def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
-def : InstRW<[LSU], (instregex "LX$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
-def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, FPU2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
// Load lengthened
-def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
-def : InstRW<[FPU], (instregex "LDEBR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+def : InstRW<[WLat7LSU, FPU, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "LX(E|D)BR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A?)$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[WLat11, FXU, FPU4, GroupAlone], (instregex "CXL(F|G)BR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A?)$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A?)$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
-def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)BR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone],
+ (instregex "C(F|G)XBR(A?)$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "CL(F|G)(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")>;
//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
-def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
-def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXBR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
-def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXBR(A)?$")>;
//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
-def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "AXBR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
-def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat20, WLat20, FPU4, GroupAlone], (instregex "SXBR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXBR$")>;
// Divide to integer
-def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
-def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat30, FPU2, NormalGr], (instregex "(K|C)XBR$")>;
// Test Data Class
-def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat15, FPU, LSU, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat15, FPU4, LSU, GroupAlone], (instregex "TCXB$")>;
//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
-def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[FXU, Lat30], (instregex "SFASR$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>;
-def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[WLat4, FXU, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat2, FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
// --------------------- Hexadecimal floating point ------------------------- //
@@ -873,108 +892,111 @@ def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "LTXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
-def : InstRW<[FPU], (instregex "LEXR$")>;
-def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "(LDXR|LRDR)$")>;
// Load lengthened
-def : InstRW<[LSU], (instregex "LDE$")>;
-def : InstRW<[FXU], (instregex "LDER$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat11LSU, FPU4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat9, FPU4, GroupAlone], (instregex "LX(E|D)R$")>;
// Convert from fixed
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
-def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
-def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+def : InstRW<[WLat8, FXU, FPU, GroupAlone], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat10, FXU, FPU4, GroupAlone], (instregex "CX(F|G)R$")>;
// Convert to fixed
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
-def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
-def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+def : InstRW<[WLat12, WLat12, FXU, FPU, GroupAlone],
+ (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat30, WLat30, FXU, FPU2, GroupAlone], (instregex "C(F|G)XR$")>;
// Convert BFP to HFP / HFP to BFP.
-def : InstRW<[FPU], (instregex "THD(E)?R$")>;
-def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "TB(E)?DR$")>;
//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load Complement / Negative / Positive
-def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
-def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
-def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat9, WLat9, FPU4, GroupAlone], (instregex "L(C|N|P)XR$")>;
// Halve
-def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "H(E|D)R$")>;
// Square root
-def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+def : InstRW<[WLat30, FPU, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "SQXR$")>;
// Load FP integer
-def : InstRW<[FPU], (instregex "FIER$")>;
-def : InstRW<[FPU], (instregex "FIDR$")>;
-def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat15, FPU4, GroupAlone], (instregex "FIXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "AXR$")>;
// Subtraction
-def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
-def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
-def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+def : InstRW<[WLat7LSU, WLat7LSU, RegReadAdv, FPU, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat15, WLat15, FPU4, GroupAlone], (instregex "SXR$")>;
// Multiply
-def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
-def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
-def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>;
-def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>;
-def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>;
-def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(D|EE)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "M(DE|E)$")>;
+def : InstRW<[WLat7, FPU, NormalGr], (instregex "M(D|EE)R$")>;
+def : InstRW<[WLat8, FPU, NormalGr], (instregex "M(DE|E)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MY(H|L)R$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
-def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>;
-def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>;
-def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
-def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, RegReadAdv, FPU4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, FPU2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat10, FPU4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat7, FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
// Division
-def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
-def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
-def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+def : InstRW<[WLat30, RegReadAdv, FPU, LSU, NormalGr], (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, FPU, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, FPU4, GroupAlone], (instregex "DXR$")>;
//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
-def : InstRW<[FPU], (instregex "C(E|D)R$")>;
-def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+def : InstRW<[WLat11LSU, RegReadAdv, FPU, LSU, NormalGr], (instregex "C(E|D)$")>;
+def : InstRW<[WLat9, FPU, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat15, FPU2, NormalGr], (instregex "CXR$")>;
// ------------------------ Decimal floating point -------------------------- //
@@ -984,120 +1006,121 @@ def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
//===----------------------------------------------------------------------===//
// Load and Test
-def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+def : InstRW<[WLat4, WLat4, DFU, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat6, WLat6, DFU4, GroupAlone], (instregex "LTXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//
// Load rounded
-def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
-def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat30, DFU2, NormalGr], (instregex "LDXTR$")>;
// Load lengthened
-def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+def : InstRW<[WLat7, DFU, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat6, DFU4, GroupAlone], (instregex "LXDTR$")>;
// Convert from fixed / logical
-def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>;
-def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDFTR$")>;
+def : InstRW<[WLat30, FXU, DFU, GroupAlone], (instregex "CDGTR(A)?$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXFTR(A)?$")>;
+def : InstRW<[WLat30, FXU, DFU4, GroupAlone], (instregex "CXGTR(A)?$")>;
+def : InstRW<[WLat9, FXU, DFU, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[WLat9, FXU, DFU4, GroupAlone], (instregex "CXLFTR$")>;
+def : InstRW<[WLat5, FXU, DFU4, GroupAlone], (instregex "CXLGTR$")>;
// Convert to fixed / logical
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>;
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CFDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU, GroupAlone], (instregex "CGDTR(A)?$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CFXTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU2, GroupAlone], (instregex "CGXTR(A)?$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat7, WLat7, FXU, DFU2, GroupAlone], (instregex "CL(F|G)XTR$")>;
// Convert from / to signed / unsigned packed
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
-def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
-def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat8, FXU2, DFU4, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat12, FXU2, DFU4, GroupAlone], (instregex "C(S|U)XTR$")>;
// Convert from / to zoned
-def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>;
-def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>;
-def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>;
-def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>;
+def : InstRW<[WLat4LSU, LSU, DFU2, GroupAlone], (instregex "CDZT$")>;
+def : InstRW<[WLat11LSU, LSU2, DFU4, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXU, LSU, DFU2, GroupAlone], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXU, LSU, DFU2, GroupAlone], (instregex "CZXT$")>;
// Perform floating-point operation
-def : InstRW<[FXU, Lat30], (instregex "PFPO$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//
// Load FP integer
-def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
-def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+def : InstRW<[WLat8, DFU, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, DFU4, GroupAlone], (instregex "FIXTR$")>;
// Extract biased exponent
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
-def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "EEXTR$")>;
// Extract significance
-def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
-def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+def : InstRW<[WLat7, FXU, DFU, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[WLat8, FXU, DFU2, GroupAlone], (instregex "ESXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//
// Addition
-def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "AXTR(A)?$")>;
// Subtraction
-def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+def : InstRW<[WLat9, WLat9, DFU, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, DFU4, GroupAlone], (instregex "SXTR(A)?$")>;
// Multiply
-def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "MXTR(A)?$")>;
// Division
-def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+def : InstRW<[WLat30, DFU, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, DFU4, GroupAlone], (instregex "DXTR(A)?$")>;
// Quantize
-def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
-def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+def : InstRW<[WLat8, WLat8, DFU, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, DFU4, GroupAlone], (instregex "QAXTR$")>;
// Reround
-def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+def : InstRW<[WLat11, WLat11, FXU, DFU, GroupAlone], (instregex "RRDTR$")>;
+def : InstRW<[WLat30, WLat30, FXU, DFU4, GroupAlone], (instregex "RRXTR$")>;
// Shift significand left/right
-def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
-def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+def : InstRW<[WLat7LSU, LSU, DFU, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, DFU4, GroupAlone], (instregex "S(L|R)XT$")>;
// Insert biased exponent
-def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>;
-def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+def : InstRW<[WLat5, FXU, DFU, GroupAlone], (instregex "IEDTR$")>;
+def : InstRW<[WLat7, FXU, DFU4, GroupAlone], (instregex "IEXTR$")>;
//===----------------------------------------------------------------------===//
// DFP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
-def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>;
+def : InstRW<[WLat9, DFU, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat10, DFU2, NormalGr], (instregex "(K|C)XTR$")>;
// Compare biased exponent
-def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
-def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>;
+def : InstRW<[WLat4, DFU, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat5, DFU2, NormalGr], (instregex "CEXTR$")>;
// Test Data Class/Group
-def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
-def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
+def : InstRW<[WLat9, LSU, DFU, NormalGr], (instregex "TD(C|G)DT$")>;
+def : InstRW<[WLat10, LSU, DFU, NormalGr], (instregex "TD(C|G)ET$")>;
+def : InstRW<[WLat10, LSU, DFU2, NormalGr], (instregex "TD(C|G)XT$")>;
// -------------------------------- System ---------------------------------- //
@@ -1106,157 +1129,152 @@ def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
// System: Program-Status Word Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
-def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
-def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
-def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat30, MCD], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXU, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXU, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
- (instregex "STCT(L|G)$")>;
-def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
-def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+def : InstRW<[WLat10, WLat10, FXU, LSU, NormalGr], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, FXU5, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
//===----------------------------------------------------------------------===//
// System: Prefix-Register Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
//===----------------------------------------------------------------------===//
// System: Storage-Key and Real Memory Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
-def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
-def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
-def : InstRW<[FXU, Lat30], (instregex "TB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
//===----------------------------------------------------------------------===//
// System: Dynamic-Address-Translation Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "CRDTE(Opt)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
// System: Memory-move Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[LSU, Lat6, Lat30, GroupAlone], (instregex "MVCSK$")>;
-def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
-def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
-def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
//===----------------------------------------------------------------------===//
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>;
-def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
-def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
//===----------------------------------------------------------------------===//
// System: Time-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
-def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
-def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
-def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKPF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SPT$")>;
+def : InstRW<[WLat9, FXU, LSU2, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXU2, GroupAlone], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat30, MCD], (instregex "STPT$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Related Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
-def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
-def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
// System: Miscellaneous Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXU, GroupAlone], (instregex "MC$")>;
-def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
-def : InstRW<[FXU], (instregex "TRAC(E|G)$")>;
-def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
-def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXU, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
//===----------------------------------------------------------------------===//
// System: CPU-Measurement Facility Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU], (instregex "LPP$")>;
-def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
-def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
-def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
//===----------------------------------------------------------------------===//
// System: I/O Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
-def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
-def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
-def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 657482504045..e0d7bca9a94b 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -145,7 +145,7 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
// deciding whether to use a loop or straight-line code.
static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src1, SDValue Src2, uint64_t Size) {
- SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
EVT PtrVT = Src1.getValueType();
// A two-CLC sequence is a clear win over a loop, not least because it
// needs only one branch. A three-CLC sequence needs the same number
@@ -167,9 +167,9 @@ static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
// less than zero if CC == 1 and greater than zero if CC >= 2.
// The sequence starts with IPM, which puts CC into bits 29 and 28
// of an integer and clears bits 30 and 31.
-static SDValue addIPMSequence(const SDLoc &DL, SDValue Glue,
+static SDValue addIPMSequence(const SDLoc &DL, SDValue CCReg,
SelectionDAG &DAG) {
- SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+ SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
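A minimal worked sketch of the CC mapping described above, assuming SystemZ::IPM_CC is 28 and the rotate amount (truncated by this hunk) is 31; values are for illustration only:

  // cc is the CLC condition code (0, 1 or 2)
  uint32_t ipm  = cc << 28;                 // IPM: CC lands in bits 29:28
  uint32_t srl  = ipm >> 28;                // CC now in bits 1:0
  uint32_t rotl = (srl << 31) | (srl >> 1); // rotate left by 31
  // cc == 0 -> 0, cc == 1 -> 0x80000000 (< 0), cc == 2 -> 1 (> 0)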
@@ -184,9 +184,9 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
assert(Bytes > 0 && "Caller should have handled 0-size case");
- Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
- SDValue Glue = Chain.getValue(1);
- return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+ SDValue CCReg = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+ Chain = CCReg.getValue(1);
+ return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
}
return std::make_pair(SDValue(), SDValue());
}
@@ -196,7 +196,7 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
SDValue Char, SDValue Length, MachinePointerInfo SrcPtrInfo) const {
// Use SRST to find the character. End is its address on success.
EVT PtrVT = Src.getValueType();
- SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::i32, MVT::Other);
Length = DAG.getZExtOrTrunc(Length, DL, PtrVT);
Char = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
Char = DAG.getNode(ISD::AND, DL, MVT::i32, Char,
@@ -204,17 +204,16 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Length);
SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
Limit, Src, Char);
- Chain = End.getValue(1);
- SDValue Glue = End.getValue(2);
+ SDValue CCReg = End.getValue(1);
+ Chain = End.getValue(2);
// Now select between End and null, depending on whether the character
// was found.
SDValue Ops[] = {End, DAG.getConstant(0, DL, PtrVT),
DAG.getConstant(SystemZ::CCMASK_SRST, DL, MVT::i32),
DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32),
- Glue};
- VTs = DAG.getVTList(PtrVT, MVT::Glue);
- End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+ CCReg};
+ End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, PtrVT, Ops);
return std::make_pair(End, Chain);
}
@@ -232,12 +231,12 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcmp(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
SDValue Src2, MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const {
- SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::i32, MVT::Other);
SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
DAG.getConstant(0, DL, MVT::i32));
- Chain = Unused.getValue(1);
- SDValue Glue = Chain.getValue(2);
- return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+ SDValue CCReg = Unused.getValue(1);
+ Chain = Unused.getValue(2);
+ return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
}
// Search from Src for a null character, stopping once Src reaches Limit.
@@ -250,10 +249,10 @@ static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG,
SDValue Chain, SDValue Src,
SDValue Limit) {
EVT PtrVT = Src.getValueType();
- SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::i32, MVT::Other);
SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
Limit, Src, DAG.getConstant(0, DL, MVT::i32));
- Chain = End.getValue(1);
+ Chain = End.getValue(2);
SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
return std::make_pair(Len, Chain);
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 3a167a6d452a..f3620dcf3b92 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -18,12 +18,12 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Scalar.h"
#include <string>
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 37c55c4e3889..e2a3efda5c5e 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -737,7 +737,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
unsigned PredicateExtraCost = 0;
if (I != nullptr) {
// Some predicates cost one or two extra instructions.
- switch (dyn_cast<CmpInst>(I)->getPredicate()) {
+ switch (cast<CmpInst>(I)->getPredicate()) {
case CmpInst::Predicate::ICMP_NE:
case CmpInst::Predicate::ICMP_UGE:
case CmpInst::Predicate::ICMP_ULE:
diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 72baf5985eac..907ecf46e8ff 100644
--- a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -12,9 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -52,11 +51,24 @@ TargetLoweringObjectFile::~TargetLoweringObjectFile() {
delete Mang;
}
-static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
+static bool isNullOrUndef(const Constant *C) {
+ // Return true if the constant is all zeros or undefs (recursing into aggregates).
+ if (C->isNullValue() || isa<UndefValue>(C))
+ return true;
+ if (!isa<ConstantAggregate>(C))
+ return false;
+ for (auto Operand : C->operand_values()) {
+ if (!isNullOrUndef(cast<Constant>(Operand)))
+ return false;
+ }
+ return true;
+}
+
+static bool isSuitableForBSS(const GlobalVariable *GV) {
const Constant *C = GV->getInitializer();
// Must have a zero (or undef) initializer.
- if (!C->isNullValue())
+ if (!isNullOrUndef(C))
return false;
// Leave constant zeros in readonly constant sections, so they can be shared.
@@ -67,10 +79,6 @@ static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
if (GV->hasSection())
return false;
- // If -nozero-initialized-in-bss is specified, don't ever use BSS.
- if (NoZerosInBSS)
- return false;
-
// Otherwise, put it in BSS!
return true;
}
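For illustration (a hypothetical translation unit, not part of this change): with the relaxed isNullOrUndef() check, a global whose initializer consists only of zeros and undefs stays BSS-eligible, so a plain zero-initialized definition such as

  // Internal linkage, an all-zero initializer and no explicit section:
  // getKindForGlobal() below classifies this as SectionKind::getBSSLocal()
  // (unless -nozero-initialized-in-bss is given).
  static int Counters[1024];

ends up in a BSS section rather than .data.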
@@ -126,25 +134,24 @@ void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
/// getKindForGlobal - This is a top-level target-independent classifier for
-/// a global variable. Given an global variable and information from TM, it
-/// classifies the global in a variety of ways that make various target
-/// implementations simpler. The target implementation is free to ignore this
-/// extra info of course.
+/// a global object. Given a global variable and information from the TM, this
+/// function classifies the global in a target-independent manner. This function
+/// may be overridden by the target implementation.
SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
const TargetMachine &TM){
assert(!GO->isDeclaration() && !GO->hasAvailableExternallyLinkage() &&
"Can only be used for global definitions");
- Reloc::Model ReloModel = TM.getRelocationModel();
-
- // Early exit - functions should be always in text sections.
- const auto *GVar = dyn_cast<GlobalVariable>(GO);
- if (!GVar)
+ // Functions are classified as text sections.
+ if (isa<Function>(GO))
return SectionKind::getText();
+ // Global variables require more detailed analysis.
+ const auto *GVar = cast<GlobalVariable>(GO);
+
// Handle thread-local data first.
if (GVar->isThreadLocal()) {
- if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS))
+ if (isSuitableForBSS(GVar) && !TM.Options.NoZerosInBSS)
return SectionKind::getThreadBSS();
return SectionKind::getThreadData();
}
@@ -153,8 +160,9 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
if (GVar->hasCommonLinkage())
return SectionKind::getCommon();
- // Variable can be easily put to BSS section.
- if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) {
+ // Most non-mergeable zero data can be put in the BSS section unless otherwise
+ // specified.
+ if (isSuitableForBSS(GVar) && !TM.Options.NoZerosInBSS) {
if (GVar->hasLocalLinkage())
return SectionKind::getBSSLocal();
else if (GVar->hasExternalLinkage())
@@ -162,14 +170,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
return SectionKind::getBSS();
}
- const Constant *C = GVar->getInitializer();
-
// If the global is marked constant, we can put it into a mergable section,
// a mergable string section, or general .data if it contains relocations.
if (GVar->isConstant()) {
// If the initializer for the global contains something that requires a
// relocation, then we may have to drop this into a writable data section
// even though it is marked const.
+ const Constant *C = GVar->getInitializer();
if (!C->needsRelocation()) {
// If the global is required to have a unique address, it can't be put
// into a mergable section: just drop it into the general read-only
@@ -215,6 +222,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
// the time the app starts up. However, we can't put this into a
// mergable section, because the linker doesn't take relocations into
// consideration when it tries to merge entries in the section.
+ Reloc::Model ReloModel = TM.getRelocationModel();
if (ReloModel == Reloc::Static || ReloModel == Reloc::ROPI ||
ReloModel == Reloc::RWPI || ReloModel == Reloc::ROPI_RWPI)
return SectionKind::getReadOnly();
diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp
index c4c0dd22ee0c..092f5ea4104b 100644
--- a/contrib/llvm/lib/Target/TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/TargetMachine.cpp
@@ -13,8 +13,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
@@ -27,6 +25,7 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
//---------------------------------------------------------------------------
@@ -52,7 +51,7 @@ bool TargetMachine::isPositionIndependent() const {
return getRelocationModel() == Reloc::PIC_;
}
-/// \brief Reset the target options based on the function's attributes.
+/// Reset the target options based on the function's attributes.
// FIXME: This function needs to go away for a number of reasons:
// a) global state on the TargetMachine is terrible in general,
// b) these target options should be passed only on the function
@@ -116,12 +115,24 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
if (GV && GV->isDSOLocal())
return true;
- // According to the llvm language reference, we should be able to just return
- // false in here if we have a GV, as we know it is dso_preemptable.
- // At this point in time, the various IR producers have not been transitioned
- // to always produce a dso_local when it is possible to do so. As a result we
- // still have some pre-dso_local logic in here to improve the quality of the
- // generated code:
+ // If we are not supposed to use a PLT, we cannot assume that intrinsics are
+ // local, since the linker can convert some direct accesses to accesses via the PLT.
+ if (M.getRtLibUseGOT() && !GV)
+ return false;
+
+ // According to the llvm language reference, we should be able to
+ // just return false in here if we have a GV, as we know it is
+ // dso_preemptable. At this point in time, the various IR producers
+ // have not been transitioned to always produce a dso_local when it
+ // is possible to do so.
+ // In the case of intrinsics, GV is null and there is nowhere to put
+ // dso_local. Returning false for those will produce worse code on some
+ // architectures. For example, on x86 the caller has to set ebx before calling
+ // a function through the PLT.
+ // As a result we still have some logic in here to improve the quality of the
+ // generated code.
+ // FIXME: Add a module level metadata for whether intrinsics should be assumed
+ // local.
Reloc::Model RM = getRelocationModel();
const Triple &TT = getTargetTriple();
@@ -131,7 +142,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
return false;
// Every other GV is local on COFF.
- // Make an exception for windows OS in the triple: Some firmwares builds use
+ // Make an exception for windows OS in the triple: Some firmware builds use
// *-win32-macho triples. This (accidentally?) produced windows relocations
// without GOT tables in older clang versions; Keep this behaviour.
if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO()))
@@ -141,12 +152,10 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
// produce a 0 if it turns out the symbol is undefined. While this
// is ABI- and relocation-dependent, it seems worth it to handle it
// here.
- // FIXME: this is probably not ELF specific.
- if (GV && isPositionIndependent() && TT.isOSBinFormatELF() &&
- GV->hasExternalWeakLinkage())
+ if (GV && isPositionIndependent() && GV->hasExternalWeakLinkage())
return false;
- if (GV && (GV->hasLocalLinkage() || !GV->hasDefaultVisibility()))
+ if (GV && !GV->hasDefaultVisibility())
return true;
if (TT.isOSBinFormatMachO()) {
@@ -174,7 +183,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
bool IsTLS = GV && GV->isThreadLocal();
bool IsAccessViaCopyRelocs =
- Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
+ GV && Options.MCOptions.MCPIECopyRelocations && isa<GlobalVariable>(GV);
Triple::ArchType Arch = TT.getArch();
bool IsPPC =
Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::ppc64le;
@@ -187,6 +196,14 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
return false;
}
+bool TargetMachine::useEmulatedTLS() const {
+ // Returns Options.EmulatedTLS if -emulated-tls or -no-emulated-tls was
+ // specified explicitly; otherwise the target triple decides the default.
+ if (Options.ExplicitEmulatedTLS)
+ return Options.EmulatedTLS;
+ return getTargetTriple().hasDefaultEmulatedTLS();
+}
+
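A brief caller-side sketch (hypothetical; TM stands for any configured TargetMachine): with neither flag given, the result simply follows the triple's default, which hasDefaultEmulatedTLS() reports as true for Android triples, for instance.

  // Honors an explicit -emulated-tls / -no-emulated-tls choice first,
  // then falls back to the target triple's default.
  bool EmuTLS = TM.useEmulatedTLS();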
TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default;
Reloc::Model RM = getRelocationModel();
diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp
index 74fe7c5d3cde..37d398d580f8 100644
--- a/contrib/llvm/lib/Target/TargetMachineC.cpp
+++ b/contrib/llvm/lib/Target/TargetMachineC.cpp
@@ -18,12 +18,13 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/CodeGenCWrappers.h"
+#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/CodeGenCWrappers.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdlib>
@@ -195,7 +196,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
ft = TargetMachine::CGFT_ObjectFile;
break;
}
- if (TM->addPassesToEmitFile(pass, OS, ft)) {
+ if (TM->addPassesToEmitFile(pass, OS, nullptr, ft)) {
error = "TargetMachine can't emit a file of this type";
*ErrorMessage = strdup(error.c_str());
return true;
@@ -237,6 +238,25 @@ char *LLVMGetDefaultTargetTriple(void) {
return strdup(sys::getDefaultTargetTriple().c_str());
}
+char *LLVMNormalizeTargetTriple(const char* triple) {
+ return strdup(Triple::normalize(StringRef(triple)).c_str());
+}
+
+char *LLVMGetHostCPUName(void) {
+ return strdup(sys::getHostCPUName().data());
+}
+
+char *LLVMGetHostCPUFeatures(void) {
+ SubtargetFeatures Features;
+ StringMap<bool> HostFeatures;
+
+ if (sys::getHostCPUFeatures(HostFeatures))
+ for (auto &F : HostFeatures)
+ Features.AddFeature(F.first(), F.second);
+
+ return strdup(Features.getString().c_str());
+}
+
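A small usage sketch for the three new C API entry points (hypothetical host program; the header is assumed to be the usual llvm-c/TargetMachine.h, and since the strings are strdup'd above, plain free() releases them):

  #include <stdio.h>
  #include <stdlib.h>
  #include <llvm-c/TargetMachine.h>

  int main(void) {
    char *Triple   = LLVMNormalizeTargetTriple("x86_64-linux-gnu");
    char *CPU      = LLVMGetHostCPUName();
    char *Features = LLVMGetHostCPUFeatures();
    printf("%s %s %s\n", Triple, CPU, Features);
    free(Triple);   /* all three strings come from strdup() */
    free(CPU);
    free(Features);
    return 0;
  }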
void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM) {
unwrap(PM)->add(
createTargetTransformInfoWrapperPass(unwrap(T)->getTargetIRAnalysis()));
diff --git a/contrib/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/contrib/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
new file mode 100644
index 000000000000..2d92b93ca704
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -0,0 +1,561 @@
+//==- WebAssemblyAsmParser.cpp - Assembler for WebAssembly -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file is part of the WebAssembly Assembler.
+///
+/// It contains code to translate a parsed .s file into MCInsts.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "WebAssembly.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-asm-parser"
+
+namespace {
+
+// We store register types as SimpleValueType to retain SIMD layout
+// information, but must also be able to supply them as the (unnamed)
+// register enum from WebAssemblyRegisterInfo.td/.inc.
+static unsigned MVTToWasmReg(MVT::SimpleValueType Type) {
+ switch(Type) {
+ case MVT::i32: return WebAssembly::I32_0;
+ case MVT::i64: return WebAssembly::I64_0;
+ case MVT::f32: return WebAssembly::F32_0;
+ case MVT::f64: return WebAssembly::F64_0;
+ case MVT::v16i8: return WebAssembly::V128_0;
+ case MVT::v8i16: return WebAssembly::V128_0;
+ case MVT::v4i32: return WebAssembly::V128_0;
+ case MVT::v4f32: return WebAssembly::V128_0;
+ default: return MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+}
+
+/// WebAssemblyOperand - Instances of this class represent the operands in a
+/// parsed WASM machine instruction.
+struct WebAssemblyOperand : public MCParsedAsmOperand {
+ enum KindTy { Token, Local, Stack, Integer, Float, Symbol } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ struct TokOp {
+ StringRef Tok;
+ };
+
+ struct RegOp {
+ // This is a (virtual) local or stack register, numbered upwards from 0.
+ unsigned RegNo;
+ // In most targets, the register number also encodes the type, but for
+ // wasm we have to track that separately since we have an unbounded
+ // number of registers.
+ // This has the unfortunate side effect that we supply a different value
+ // to the table-gen matcher at different times in the process (when it
+ // calls getReg() or addRegOperands()).
+ // TODO: While this works, it feels brittle, and it would be nice to clean it up.
+ MVT::SimpleValueType Type;
+ };
+
+ struct IntOp {
+ int64_t Val;
+ };
+
+ struct FltOp {
+ double Val;
+ };
+
+ struct SymOp {
+ const MCExpr *Exp;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct IntOp Int;
+ struct FltOp Flt;
+ struct SymOp Sym;
+ };
+
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, TokOp T)
+ : Kind(K), StartLoc(Start), EndLoc(End), Tok(T) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, RegOp R)
+ : Kind(K), StartLoc(Start), EndLoc(End), Reg(R) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, IntOp I)
+ : Kind(K), StartLoc(Start), EndLoc(End), Int(I) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, FltOp F)
+ : Kind(K), StartLoc(Start), EndLoc(End), Flt(F) {}
+ WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, SymOp S)
+ : Kind(K), StartLoc(Start), EndLoc(End), Sym(S) {}
+
+ bool isToken() const override { return Kind == Token; }
+ bool isImm() const override { return Kind == Integer ||
+ Kind == Float ||
+ Kind == Symbol; }
+ bool isReg() const override { return Kind == Local || Kind == Stack; }
+ bool isMem() const override { return false; }
+
+ unsigned getReg() const override {
+ assert(isReg());
+ // This is called from the tablegen matcher (MatchInstructionImpl)
+ // where it expects to match the type of register, see RegOp above.
+ return MVTToWasmReg(Reg.Type);
+ }
+
+ StringRef getToken() const {
+ assert(isToken());
+ return Tok.Tok;
+ }
+
+ SMLoc getStartLoc() const override { return StartLoc; }
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(isReg() && "Not a register operand!");
+ // This is called from the tablegen matcher (MatchInstructionImpl)
+ // where it expects to output the actual register index, see RegOp above.
+ unsigned R = Reg.RegNo;
+ if (Kind == Stack) {
+ // A stack register is represented as a large negative number.
+ // See WebAssemblyRegNumbering::runOnMachineFunction and
+ // getWARegStackId for why this | is needed.
+ R |= INT32_MIN;
+ }
+ Inst.addOperand(MCOperand::createReg(R));
+ }
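A quick worked example of the stack-register encoding above (hypothetical register number):

  unsigned R = 5;        // stack operand with RegNo == 5
  R |= INT32_MIN;        // R == 0x80000005, the "large negative number" form
  // A Local operand keeps its plain index (here it would stay 5).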
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == Integer)
+ Inst.addOperand(MCOperand::createImm(Int.Val));
+ else if (Kind == Float)
+ Inst.addOperand(MCOperand::createFPImm(Flt.Val));
+ else if (Kind == Symbol)
+ Inst.addOperand(MCOperand::createExpr(Sym.Exp));
+ else
+ llvm_unreachable("Should be immediate or symbol!");
+ }
+
+ void print(raw_ostream &OS) const override {
+ switch (Kind) {
+ case Token:
+ OS << "Tok:" << Tok.Tok;
+ break;
+ case Local:
+ OS << "Loc:" << Reg.RegNo << ":" << static_cast<int>(Reg.Type);
+ break;
+ case Stack:
+ OS << "Stk:" << Reg.RegNo << ":" << static_cast<int>(Reg.Type);
+ break;
+ case Integer:
+ OS << "Int:" << Int.Val;
+ break;
+ case Float:
+ OS << "Flt:" << Flt.Val;
+ break;
+ case Symbol:
+ OS << "Sym:" << Sym.Exp;
+ break;
+ }
+ }
+};
+
+class WebAssemblyAsmParser final : public MCTargetAsmParser {
+ MCAsmParser &Parser;
+ MCAsmLexer &Lexer;
+ // These are for the current function being parsed:
+ // These are vectors since register assignments are so far non-sparse.
+ // Replace by map if necessary.
+ std::vector<MVT::SimpleValueType> LocalTypes;
+ std::vector<MVT::SimpleValueType> StackTypes;
+ MCSymbol *LastLabel;
+
+public:
+ WebAssemblyAsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
+ const MCInstrInfo &mii, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti, mii), Parser(Parser),
+ Lexer(Parser.getLexer()), LastLabel(nullptr) {
+ }
+
+#define GET_ASSEMBLER_HEADER
+#include "WebAssemblyGenAsmMatcher.inc"
+
+ // TODO: This is required to be implemented, but appears unused.
+ bool ParseRegister(unsigned &/*RegNo*/, SMLoc &/*StartLoc*/,
+ SMLoc &/*EndLoc*/) override {
+ llvm_unreachable("ParseRegister is not implemented.");
+ }
+
+ bool Error(const StringRef &msg, const AsmToken &tok) {
+ return Parser.Error(tok.getLoc(), msg + tok.getString());
+ }
+
+ bool IsNext(AsmToken::TokenKind Kind) {
+ auto ok = Lexer.is(Kind);
+ if (ok) Parser.Lex();
+ return ok;
+ }
+
+ bool Expect(AsmToken::TokenKind Kind, const char *KindName) {
+ if (!IsNext(Kind))
+ return Error(std::string("Expected ") + KindName + ", instead got: ",
+ Lexer.getTok());
+ return false;
+ }
+
+ MVT::SimpleValueType ParseRegType(const StringRef &RegType) {
+ // Derive type from .param .local decls, or the instruction itself.
+ return StringSwitch<MVT::SimpleValueType>(RegType)
+ .Case("i32", MVT::i32)
+ .Case("i64", MVT::i64)
+ .Case("f32", MVT::f32)
+ .Case("f64", MVT::f64)
+ .Case("i8x16", MVT::v16i8)
+ .Case("i16x8", MVT::v8i16)
+ .Case("i32x4", MVT::v4i32)
+ .Case("f32x4", MVT::v4f32)
+ .Default(MVT::INVALID_SIMPLE_VALUE_TYPE);
+ }
+
+ MVT::SimpleValueType &GetType(
+ std::vector<MVT::SimpleValueType> &Types, size_t i) {
+ Types.resize(std::max(i + 1, Types.size()), MVT::INVALID_SIMPLE_VALUE_TYPE);
+ return Types[i];
+ }
+
+ bool ParseReg(OperandVector &Operands, StringRef TypePrefix) {
+ if (Lexer.is(AsmToken::Integer)) {
+ auto &Local = Lexer.getTok();
+ // This is a reference to a local, turn it into a virtual register.
+ auto LocalNo = static_cast<unsigned>(Local.getIntVal());
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Local, Local.getLoc(),
+ Local.getEndLoc(),
+ WebAssemblyOperand::RegOp{LocalNo,
+ GetType(LocalTypes, LocalNo)}));
+ Parser.Lex();
+ } else if (Lexer.is(AsmToken::Identifier)) {
+ auto &StackRegTok = Lexer.getTok();
+ // These are push/pop/drop pseudo stack registers, which we turn
+ // into virtual registers also. The stackify pass will later turn them
+ // back into implicit stack references if possible.
+ auto StackReg = StackRegTok.getString();
+ auto StackOp = StackReg.take_while([](char c) { return isalpha(c); });
+ auto Reg = StackReg.drop_front(StackOp.size());
+ unsigned long long ParsedRegNo = 0;
+ if (!Reg.empty() && getAsUnsignedInteger(Reg, 10, ParsedRegNo))
+ return Error("Cannot parse stack register index: ", StackRegTok);
+ unsigned RegNo = static_cast<unsigned>(ParsedRegNo);
+ if (StackOp == "push") {
+ // This defines a result, record register type.
+ auto RegType = ParseRegType(TypePrefix);
+ GetType(StackTypes, RegNo) = RegType;
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Stack,
+ StackRegTok.getLoc(),
+ StackRegTok.getEndLoc(),
+ WebAssemblyOperand::RegOp{RegNo, RegType}));
+ } else if (StackOp == "pop") {
+ // This uses a previously defined stack value.
+ auto RegType = GetType(StackTypes, RegNo);
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Stack,
+ StackRegTok.getLoc(),
+ StackRegTok.getEndLoc(),
+ WebAssemblyOperand::RegOp{RegNo, RegType}));
+ } else if (StackOp == "drop") {
+ // This operand will be dropped, since it is part of an instruction
+ // whose result is void.
+ } else {
+ return Error("Unknown stack register prefix: ", StackRegTok);
+ }
+ Parser.Lex();
+ } else {
+ return Error(
+ "Expected identifier/integer following $, instead got: ",
+ Lexer.getTok());
+ }
+ IsNext(AsmToken::Equal);
+ return false;
+ }
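To make the accepted spellings concrete, here is an illustrative operand list this routine handles (hypothetical instruction, in the $push/$pop style produced with -disable-wasm-explicit-locals):

  i32.add $push0=, $1, $pop2

$push0 records a Stack result register 0 whose type comes from the "i32" prefix, $1 is a Local reference, $pop2 reuses a previously recorded stack type, and the trailing '=' after a defined register is swallowed by the IsNext(AsmToken::Equal) call above.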
+
+ void ParseSingleInteger(bool IsNegative, OperandVector &Operands) {
+ auto &Int = Lexer.getTok();
+ int64_t Val = Int.getIntVal();
+ if (IsNegative) Val = -Val;
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Integer, Int.getLoc(),
+ Int.getEndLoc(), WebAssemblyOperand::IntOp{Val}));
+ Parser.Lex();
+ }
+
+ bool ParseOperandStartingWithInteger(bool IsNegative,
+ OperandVector &Operands,
+ StringRef InstType) {
+ ParseSingleInteger(IsNegative, Operands);
+ if (Lexer.is(AsmToken::LParen)) {
+ // Parse load/store operands of the form: offset($reg)align
+ auto &LParen = Lexer.getTok();
+ Operands.push_back(
+ make_unique<WebAssemblyOperand>(WebAssemblyOperand::Token,
+ LParen.getLoc(),
+ LParen.getEndLoc(),
+ WebAssemblyOperand::TokOp{
+ LParen.getString()}));
+ Parser.Lex();
+ if (Expect(AsmToken::Dollar, "register")) return true;
+ if (ParseReg(Operands, InstType)) return true;
+ auto &RParen = Lexer.getTok();
+ Operands.push_back(
+ make_unique<WebAssemblyOperand>(WebAssemblyOperand::Token,
+ RParen.getLoc(),
+ RParen.getEndLoc(),
+ WebAssemblyOperand::TokOp{
+ RParen.getString()}));
+ if (Expect(AsmToken::RParen, ")")) return true;
+ if (Lexer.is(AsmToken::Integer)) {
+ ParseSingleInteger(false, Operands);
+ } else {
+ // Alignment not specified.
+ // FIXME: correctly derive a default from the instruction.
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Integer, RParen.getLoc(),
+ RParen.getEndLoc(), WebAssemblyOperand::IntOp{0}));
+ }
+ }
+ return false;
+ }
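Following the offset($reg)align comment above, an illustrative memory operand (hypothetical; the alignment spelling is an assumption) would be

  i64.load $push0=, 16($2)3

which yields the integer 16, the '(' and ')' tokens, local register 2 and alignment 3; when the trailing integer is missing, the 0 placeholder noted in the FIXME above is pushed instead.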
+
+ bool ParseInstruction(ParseInstructionInfo &/*Info*/, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override {
+ Operands.push_back(
+ make_unique<WebAssemblyOperand>(WebAssemblyOperand::Token, NameLoc,
+ SMLoc::getFromPointer(
+ NameLoc.getPointer() + Name.size()),
+ WebAssemblyOperand::TokOp{
+ StringRef(NameLoc.getPointer(),
+ Name.size())}));
+ auto NamePair = Name.split('.');
+ // If no '.', there is no type prefix.
+ if (NamePair.second.empty()) std::swap(NamePair.first, NamePair.second);
+ while (Lexer.isNot(AsmToken::EndOfStatement)) {
+ auto &Tok = Lexer.getTok();
+ switch (Tok.getKind()) {
+ case AsmToken::Dollar: {
+ Parser.Lex();
+ if (ParseReg(Operands, NamePair.first)) return true;
+ break;
+ }
+ case AsmToken::Identifier: {
+ auto &Id = Lexer.getTok();
+ const MCExpr *Val;
+ SMLoc End;
+ if (Parser.parsePrimaryExpr(Val, End))
+ return Error("Cannot parse symbol: ", Lexer.getTok());
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Symbol, Id.getLoc(),
+ Id.getEndLoc(), WebAssemblyOperand::SymOp{Val}));
+ break;
+ }
+ case AsmToken::Minus:
+ Parser.Lex();
+ if (Lexer.isNot(AsmToken::Integer))
+ return Error("Expected integer instead got: ", Lexer.getTok());
+ if (ParseOperandStartingWithInteger(true, Operands, NamePair.first))
+ return true;
+ break;
+ case AsmToken::Integer:
+ if (ParseOperandStartingWithInteger(false, Operands, NamePair.first))
+ return true;
+ break;
+ case AsmToken::Real: {
+ double Val;
+ if (Tok.getString().getAsDouble(Val, false))
+ return Error("Cannot parse real: ", Tok);
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Float, Tok.getLoc(),
+ Tok.getEndLoc(), WebAssemblyOperand::FltOp{Val}));
+ Parser.Lex();
+ break;
+ }
+ default:
+ return Error("Unexpected token in operand: ", Tok);
+ }
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ if (Expect(AsmToken::Comma, ",")) return true;
+ }
+ }
+ Parser.Lex();
+ // Call instructions are vararg, but the tablegen matcher doesn't seem to
+ // support that, so for now we strip these extra operands.
+ // This is problematic if these arguments are not simple $pop stack
+ // registers, since e.g. a local register would get lost, so we check for
+ // this. This can be the case when using -disable-wasm-explicit-locals
+ // which currently s2wasm requires.
+ // TODO: Instead, we can move this code to MatchAndEmitInstruction below and
+ // actually generate get_local instructions on the fly.
+ // Or even better, improve the matcher to support vararg?
+ auto IsIndirect = NamePair.second == "call_indirect";
+ if (IsIndirect || NamePair.second == "call") {
+ // Figure out number of fixed operands from the instruction.
+ size_t CallOperands = 1; // The name token.
+ if (!IsIndirect) CallOperands++; // The function index.
+ if (!NamePair.first.empty()) CallOperands++; // The result register.
+ if (Operands.size() > CallOperands) {
+ // Ensure operands we drop are all $pop.
+ for (size_t I = CallOperands; I < Operands.size(); I++) {
+ auto Operand =
+ reinterpret_cast<WebAssemblyOperand *>(Operands[I].get());
+ if (Operand->Kind != WebAssemblyOperand::Stack)
+ Parser.Error(NameLoc,
+ "Call instruction has non-stack arguments, if this code was "
+ "generated with -disable-wasm-explicit-locals please remove it");
+ }
+ // Drop unneeded operands.
+ Operands.resize(CallOperands);
+ }
+ }
+ // Block instructions require a signature index, but these are missing in
+ // assembly, so we add a dummy one explicitly (since we have no control
+ // over signature tables here, we assume these will be regenerated when
+ // the wasm module is generated).
+ if (NamePair.second == "block" || NamePair.second == "loop") {
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Integer, NameLoc,
+ NameLoc, WebAssemblyOperand::IntOp{-1}));
+ }
+ // These don't specify the type, which has to be derived from the local index.
+ if (NamePair.second == "get_local" || NamePair.second == "tee_local") {
+ if (Operands.size() >= 3 && Operands[1]->isReg() &&
+ Operands[2]->isImm()) {
+ auto Op1 = reinterpret_cast<WebAssemblyOperand *>(Operands[1].get());
+ auto Op2 = reinterpret_cast<WebAssemblyOperand *>(Operands[2].get());
+ auto Type = GetType(LocalTypes, static_cast<size_t>(Op2->Int.Val));
+ Op1->Reg.Type = Type;
+ GetType(StackTypes, Op1->Reg.RegNo) = Type;
+ }
+ }
+ return false;
+ }
+
+ void onLabelParsed(MCSymbol *Symbol) override {
+ LastLabel = Symbol;
+ }
+
+ bool ParseDirective(AsmToken DirectiveID) override {
+ assert(DirectiveID.getKind() == AsmToken::Identifier);
+ auto &Out = getStreamer();
+ auto &TOut = reinterpret_cast<WebAssemblyTargetStreamer &>(
+ *Out.getTargetStreamer());
+ // TODO: we're just parsing the subset of directives we're interested in,
+ // and ignoring ones we don't recognise. We should ideally verify
+ // all directives here.
+ if (DirectiveID.getString() == ".type") {
+ // This could be the start of a function, check if followed by
+ // "label,@function"
+ if (!(IsNext(AsmToken::Identifier) &&
+ IsNext(AsmToken::Comma) &&
+ IsNext(AsmToken::At) &&
+ Lexer.is(AsmToken::Identifier)))
+ return Error("Expected label,@type declaration, got: ", Lexer.getTok());
+ if (Lexer.getTok().getString() == "function") {
+ // Track locals from start of function.
+ LocalTypes.clear();
+ StackTypes.clear();
+ }
+ Parser.Lex();
+ //Out.EmitSymbolAttribute(??, MCSA_ELF_TypeFunction);
+ } else if (DirectiveID.getString() == ".param" ||
+ DirectiveID.getString() == ".local") {
+ // Track the number of locals, needed for correct virtual register
+ // assignment elsewhere.
+ // Also output a directive to the streamer.
+ std::vector<MVT> Params;
+ std::vector<MVT> Locals;
+ while (Lexer.is(AsmToken::Identifier)) {
+ auto RegType = ParseRegType(Lexer.getTok().getString());
+ if (RegType == MVT::INVALID_SIMPLE_VALUE_TYPE) return true;
+ LocalTypes.push_back(RegType);
+ if (DirectiveID.getString() == ".param") {
+ Params.push_back(RegType);
+ } else {
+ Locals.push_back(RegType);
+ }
+ Parser.Lex();
+ if (!IsNext(AsmToken::Comma)) break;
+ }
+ assert(LastLabel);
+ TOut.emitParam(LastLabel, Params);
+ TOut.emitLocal(Locals);
+ } else {
+ // For now, ignore any directive we don't recognize:
+ while (Lexer.isNot(AsmToken::EndOfStatement)) Parser.Lex();
+ }
+ return Expect(AsmToken::EndOfStatement, "EOL");
+ }
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &/*Opcode*/,
+ OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override {
+ MCInst Inst;
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+ switch (MatchResult) {
+ case Match_Success: {
+ Out.EmitInstruction(Inst, getSTI());
+ return false;
+ }
+ case Match_MissingFeature:
+ return Parser.Error(IDLoc,
+ "instruction requires a WASM feature not currently enabled");
+ case Match_MnemonicFail:
+ return Parser.Error(IDLoc, "invalid instruction");
+ case Match_NearMisses:
+ return Parser.Error(IDLoc, "ambiguous instruction");
+ case Match_InvalidTiedOperand:
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Parser.Error(IDLoc, "too few operands for instruction");
+ ErrorLoc = Operands[ErrorInfo]->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ return Parser.Error(ErrorLoc, "invalid operand for instruction");
+ }
+ }
+ llvm_unreachable("Implement any new match types added!");
+ }
+};
+} // end anonymous namespace
+
+// Force static initialization.
+extern "C" void LLVMInitializeWebAssemblyAsmParser() {
+ RegisterMCAsmParser<WebAssemblyAsmParser> X(getTheWebAssemblyTarget32());
+ RegisterMCAsmParser<WebAssemblyAsmParser> Y(getTheWebAssemblyTarget64());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "WebAssemblyGenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 9be11da9afac..2f0960271e30 100644
--- a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file is part of the WebAssembly Disassembler.
+/// This file is part of the WebAssembly Disassembler.
///
/// It contains code to translate the data produced by the decoder into
/// MCInsts.
@@ -19,16 +19,23 @@
#include "WebAssembly.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/TargetRegistry.h"
+
using namespace llvm;
#define DEBUG_TYPE "wasm-disassembler"
+using DecodeStatus = MCDisassembler::DecodeStatus;
+
+#include "WebAssemblyGenDisassemblerTables.inc"
+
namespace {
class WebAssemblyDisassembler final : public MCDisassembler {
std::unique_ptr<const MCInstrInfo> MCII;
@@ -60,11 +67,120 @@ extern "C" void LLVMInitializeWebAssemblyDisassembler() {
createWebAssemblyDisassembler);
}
-MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
- MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
- raw_ostream &OS, raw_ostream &CS) const {
+static int nextByte(ArrayRef<uint8_t> Bytes, uint64_t &Size) {
+ if (Size >= Bytes.size())
+ return -1;
+ auto V = Bytes[Size];
+ Size++;
+ return V;
+}
- // TODO: Implement disassembly.
+static bool parseLEBImmediate(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, bool Signed) {
+ unsigned N = 0;
+ const char *Error = nullptr;
+ auto Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N,
+ Bytes.data() + Bytes.size(), &Error)
+ : static_cast<int64_t>(
+ decodeULEB128(Bytes.data() + Size, &N,
+ Bytes.data() + Bytes.size(), &Error));
+ if (Error)
+ return false;
+ Size += N;
+ MI.addOperand(MCOperand::createImm(Val));
+ return true;
+}
+
+template <typename T>
+bool parseFPImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
+ if (Size + sizeof(T) > Bytes.size())
+ return false;
+ T Val;
+ memcpy(&Val, Bytes.data() + Size, sizeof(T));
+ support::endian::byte_swap<T, support::endianness::little>(Val);
+ Size += sizeof(T);
+ MI.addOperand(MCOperand::createFPImm(static_cast<double>(Val)));
+ return true;
+}
- return MCDisassembler::Fail;
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
+ MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
+ raw_ostream & /*OS*/, raw_ostream &CS) const {
+ CommentStream = &CS;
+ Size = 0;
+ auto Opc = nextByte(Bytes, Size);
+ if (Opc < 0)
+ return MCDisassembler::Fail;
+ const auto *WasmInst = &InstructionTable0[Opc];
+ // If this is a prefix byte, indirect to another table.
+ if (WasmInst->ET == ET_Prefix) {
+ WasmInst = nullptr;
+ // Linear search, so far only 2 entries.
+ for (auto PT = PrefixTable; PT->Table; PT++) {
+ if (PT->Prefix == Opc) {
+ WasmInst = PT->Table;
+ break;
+ }
+ }
+ if (!WasmInst)
+ return MCDisassembler::Fail;
+ Opc = nextByte(Bytes, Size);
+ if (Opc < 0)
+ return MCDisassembler::Fail;
+ WasmInst += Opc;
+ }
+ if (WasmInst->ET == ET_Unused)
+ return MCDisassembler::Fail;
+ // At this point we must have a valid instruction to decode.
+ assert(WasmInst->ET == ET_Instruction);
+ MI.setOpcode(WasmInst->Opcode);
+ // Parse any operands.
+ for (uint8_t OPI = 0; OPI < WasmInst->NumOperands; OPI++) {
+ switch (WasmInst->Operands[OPI]) {
+ // ULEB operands:
+ case WebAssembly::OPERAND_BASIC_BLOCK:
+ case WebAssembly::OPERAND_LOCAL:
+ case WebAssembly::OPERAND_GLOBAL:
+ case WebAssembly::OPERAND_FUNCTION32:
+ case WebAssembly::OPERAND_OFFSET32:
+ case WebAssembly::OPERAND_P2ALIGN:
+ case WebAssembly::OPERAND_TYPEINDEX:
+ case MCOI::OPERAND_IMMEDIATE: {
+ if (!parseLEBImmediate(MI, Size, Bytes, false))
+ return MCDisassembler::Fail;
+ break;
+ }
+ // SLEB operands:
+ case WebAssembly::OPERAND_I32IMM:
+ case WebAssembly::OPERAND_I64IMM:
+ case WebAssembly::OPERAND_SIGNATURE: {
+ if (!parseLEBImmediate(MI, Size, Bytes, true))
+ return MCDisassembler::Fail;
+ break;
+ }
+ // FP operands.
+ case WebAssembly::OPERAND_F32IMM: {
+ if (!parseFPImmediate<float>(MI, Size, Bytes))
+ return MCDisassembler::Fail;
+ break;
+ }
+ case WebAssembly::OPERAND_F64IMM: {
+ if (!parseFPImmediate<double>(MI, Size, Bytes))
+ return MCDisassembler::Fail;
+ break;
+ }
+ case MCOI::OPERAND_REGISTER: {
+ // These are NOT actually in the instruction stream, but MC is going to
+ // expect operands to be present for them!
+ // FIXME: can MC re-generate register assignments or do we have to
+ // do this? Since this function decodes a single instruction, we don't
+ // have the proper context for tracking an operand stack here.
+ MI.addOperand(MCOperand::createReg(0));
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown operand type in WebAssemblyDisassembler");
+ }
+ }
+ return MCDisassembler::Success;
}
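
As an aside (not from the patch): the immediates read by getInstruction above are LEB128-encoded and decoded via decodeULEB128/decodeSLEB128 from llvm/Support/LEB128.h. A freestanding sketch of the unsigned decoding, assuming well-formed input (LLVM's helpers additionally honor an end pointer and report errors):

#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t decodeULEB(const std::vector<uint8_t> &Bytes, size_t &Pos) {
  uint64_t Result = 0;
  unsigned Shift = 0;
  for (;;) {
    uint8_t Byte = Bytes[Pos++];
    Result |= uint64_t(Byte & 0x7f) << Shift;
    if ((Byte & 0x80) == 0)  // continuation bit clear: last byte
      break;
    Shift += 7;
  }
  return Result;
}

int main() {
  std::vector<uint8_t> Enc = {0xE5, 0x8E, 0x26};  // 624485 encoded as ULEB128
  size_t Pos = 0;
  std::printf("%llu\n", (unsigned long long)decodeULEB(Enc, Pos));
  return 0;
}
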
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index c3f0f2787146..10fa798ac8d7 100644
--- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Print MCInst instructions to wasm format.
+/// Print MCInst instructions to wasm format.
///
//===----------------------------------------------------------------------===//
@@ -46,7 +46,7 @@ void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot,
- const MCSubtargetInfo & /*STI*/) {
+ const MCSubtargetInfo &STI) {
// Print the instruction (this uses the AsmStrings from the .td files).
printInstruction(MI, OS);
@@ -82,10 +82,12 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
break;
case WebAssembly::END_LOOP:
- ControlFlowStack.pop_back();
+ // Have to guard against an empty stack, in case of mismatched pairs
+ // in assembly parsing.
+ if (!ControlFlowStack.empty()) ControlFlowStack.pop_back();
break;
case WebAssembly::END_BLOCK:
- printAnnotation(
+ if (!ControlFlowStack.empty()) printAnnotation(
OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
break;
}
@@ -176,10 +178,10 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
- O << toString(APFloat(float(Op.getFPImm())));
+ O << ::toString(APFloat(float(Op.getFPImm())));
} else {
assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
- O << toString(APFloat(Op.getFPImm()));
+ O << ::toString(APFloat(Op.getFPImm()));
}
} else {
assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
@@ -192,20 +194,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
}
-void
-WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
- unsigned OpNo,
- raw_ostream &O) {
+void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(
+ const MCInst *MI, unsigned OpNo, raw_ostream &O) {
int64_t Imm = MI->getOperand(OpNo).getImm();
if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode()))
return;
O << ":p2align=" << Imm;
}
-void
-WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
- unsigned OpNo,
- raw_ostream &O) {
+void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(
+ const MCInst *MI, unsigned OpNo, raw_ostream &O) {
int64_t Imm = MI->getOperand(OpNo).getImm();
switch (WebAssembly::ExprType(Imm)) {
case WebAssembly::ExprType::Void: break;
@@ -220,6 +218,7 @@ WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
case WebAssembly::ExprType::B8x16: O << "b8x16"; break;
case WebAssembly::ExprType::B16x8: O << "b16x8"; break;
case WebAssembly::ExprType::B32x4: O << "b32x4"; break;
+ case WebAssembly::ExprType::ExceptRef: O << "except_ref"; break;
}
}
@@ -238,6 +237,8 @@ const char *llvm::WebAssembly::TypeToString(MVT Ty) {
case MVT::v4i32:
case MVT::v4f32:
return "v128";
+ case MVT::ExceptRef:
+ return "except_ref";
default:
llvm_unreachable("unsupported type");
}
@@ -253,6 +254,8 @@ const char *llvm::WebAssembly::TypeToString(wasm::ValType Type) {
return "f32";
case wasm::ValType::F64:
return "f64";
+ case wasm::ValType::EXCEPT_REF:
+ return "except_ref";
}
llvm_unreachable("unsupported type");
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index b1de84d7e8e6..f5b890a7615e 100644
--- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This class prints an WebAssembly MCInst to wasm file syntax.
+/// This class prints an WebAssembly MCInst to wasm file syntax.
///
//===----------------------------------------------------------------------===//
@@ -17,8 +17,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/BinaryFormat/Wasm.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 226a3b35f2cf..244c2189b455 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblyAsmBackend class.
+/// This file implements the WebAssemblyAsmBackend class.
///
//===----------------------------------------------------------------------===//
@@ -17,7 +17,6 @@
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
@@ -26,51 +25,17 @@
#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
namespace {
-class WebAssemblyAsmBackendELF final : public MCAsmBackend {
- bool Is64Bit;
-
-public:
- explicit WebAssemblyAsmBackendELF(bool Is64Bit)
- : MCAsmBackend(), Is64Bit(Is64Bit) {}
- ~WebAssemblyAsmBackendELF() override {}
-
- void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override;
-
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
-
- // No instruction requires relaxation
- bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const override {
- return false;
- }
-
- unsigned getNumFixupKinds() const override {
- // We currently just use the generic fixups in MCFixup.h and don't have any
- // target-specific fixups.
- return 0;
- }
-
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
-
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {}
-
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
-};
class WebAssemblyAsmBackend final : public MCAsmBackend {
bool Is64Bit;
public:
explicit WebAssemblyAsmBackend(bool Is64Bit)
- : MCAsmBackend(), Is64Bit(Is64Bit) {}
+ : MCAsmBackend(support::little), Is64Bit(Is64Bit) {}
~WebAssemblyAsmBackend() override {}
unsigned getNumFixupKinds() const override {
@@ -81,10 +46,11 @@ public:
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override;
+ uint64_t Value, bool IsPCRel,
+ const MCSubtargetInfo *STI) const override;
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
// No instruction requires relaxation
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -93,51 +59,17 @@ public:
return false;
}
- bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ return false;
+ }
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override {}
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
-bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count,
- MCObjectWriter *OW) const {
- for (uint64_t i = 0; i < Count; ++i)
- OW->write8(WebAssembly::Nop);
-
- return true;
-}
-
-void WebAssemblyAsmBackendELF::applyFixup(const MCAssembler &Asm,
- const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const {
- const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
- assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
-
- unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
- if (Value == 0)
- return; // Doesn't change encoding.
-
- // Shift the value into position.
- Value <<= Info.TargetOffset;
-
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
-
- // For each byte of the fragment that the fixup touches, mask in the
- // bits from the fixup value.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
-}
-
-std::unique_ptr<MCObjectWriter>
-WebAssemblyAsmBackendELF::createObjectWriter(raw_pwrite_stream &OS) const {
- return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
-}
-
const MCFixupKindInfo &
WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo Infos[WebAssembly::NumTargetFixupKinds] = {
@@ -158,13 +90,10 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
- MCObjectWriter *OW) const {
- if (Count == 0)
- return true;
-
+bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS,
+ uint64_t Count) const {
for (uint64_t i = 0; i < Count; ++i)
- OW->write8(WebAssembly::Nop);
+ OS << char(WebAssembly::Nop);
return true;
}
@@ -173,7 +102,8 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const {
+ uint64_t Value, bool IsPCRel,
+ const MCSubtargetInfo *STI) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
@@ -193,14 +123,13 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
-std::unique_ptr<MCObjectWriter>
-WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createWebAssemblyWasmObjectWriter(OS, Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+WebAssemblyAsmBackend::createObjectTargetWriter() const {
+ return createWebAssemblyWasmObjectWriter(Is64Bit);
}
+
} // end anonymous namespace
MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
- if (TT.isOSBinFormatELF())
- return new WebAssemblyAsmBackendELF(TT.isArch64Bit());
return new WebAssemblyAsmBackend(TT.isArch64Bit());
}
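
An illustrative aside, not part of the patch: both the removed ELF applyFixup and the surviving Wasm one reduce to OR-ing a little-endian value into the fragment bytes at the fixup offset. A freestanding sketch of that loop with made-up sizes:

#include <cstdint>
#include <cstdio>
#include <vector>

static void applyFixup(std::vector<uint8_t> &Data, unsigned Offset,
                       unsigned NumBytes, uint64_t Value) {
  // Mask each byte of the value into the existing data, least significant first.
  for (unsigned i = 0; i != NumBytes; ++i)
    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
}

int main() {
  std::vector<uint8_t> Frag(8, 0);
  applyFixup(Frag, 2, 4, 0x12345678);  // patch a 4-byte LE field at offset 2
  for (uint8_t B : Frag)
    std::printf("%02x ", (unsigned)B);  // 00 00 78 56 34 12 00 00
  std::printf("\n");
  return 0;
}
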
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
deleted file mode 100644
index b67ecfa455b3..000000000000
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file handles ELF-specific object emission, converting LLVM's
-/// internal fixups into the appropriate relocations.
-///
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/Support/ErrorHandling.h"
-using namespace llvm;
-
-namespace {
-class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter {
-public:
- WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI);
-
-protected:
- unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
- const MCFixup &Fixup, bool IsPCRel) const override;
-};
-} // end anonymous namespace
-
-WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit,
- uint8_t OSABI)
- : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY,
- /*HasRelocationAddend=*/false) {}
-
-unsigned WebAssemblyELFObjectWriter::getRelocType(MCContext &Ctx,
- const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- // WebAssembly functions are not allocated in the address space. To resolve a
- // pointer to a function, we must use a special relocation type.
- if (const MCSymbolRefExpr *SyExp =
- dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
- if (SyExp->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION)
- return ELF::R_WEBASSEMBLY_FUNCTION;
-
- switch (Fixup.getKind()) {
- case FK_Data_4:
- assert(!is64Bit() && "4-byte relocations only supported on wasm32");
- return ELF::R_WEBASSEMBLY_DATA;
- case FK_Data_8:
- assert(is64Bit() && "8-byte relocations only supported on wasm64");
- return ELF::R_WEBASSEMBLY_DATA;
- default:
- llvm_unreachable("unimplemented fixup kind");
- }
-}
-
-std::unique_ptr<MCObjectWriter>
-llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint8_t OSABI) {
- auto MOTW = llvm::make_unique<WebAssemblyELFObjectWriter>(Is64Bit, OSABI);
- return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian=*/true);
-}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index 5f8c78ed1683..44fcc129c39e 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -8,50 +8,18 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declarations of the WebAssemblyMCAsmInfo
+/// This file contains the declarations of the WebAssemblyMCAsmInfo
/// properties.
///
//===----------------------------------------------------------------------===//
#include "WebAssemblyMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+
using namespace llvm;
#define DEBUG_TYPE "wasm-mc-asm-info"
-WebAssemblyMCAsmInfoELF::~WebAssemblyMCAsmInfoELF() {}
-
-WebAssemblyMCAsmInfoELF::WebAssemblyMCAsmInfoELF(const Triple &T) {
- CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
-
- // TODO: What should MaxInstLength be?
-
- UseDataRegionDirectives = true;
-
- // Use .skip instead of .zero because .zero is confusing when used with two
- // arguments (it doesn't actually zero things out).
- ZeroDirective = "\t.skip\t";
-
- Data8bitsDirective = "\t.int8\t";
- Data16bitsDirective = "\t.int16\t";
- Data32bitsDirective = "\t.int32\t";
- Data64bitsDirective = "\t.int64\t";
-
- AlignmentIsInBytes = false;
- COMMDirectiveAlignmentIsInBytes = false;
- LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
-
- SupportsDebugInformation = true;
-
- // For now, WebAssembly does not support exceptions.
- ExceptionsType = ExceptionHandling::None;
-
- // TODO: UseIntegratedAssembler?
-
- // WebAssembly's stack is never executable.
- UsesNonexecutableStackSection = false;
-}
-
WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
@@ -76,8 +44,5 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
SupportsDebugInformation = true;
- // For now, WebAssembly does not support exceptions.
- ExceptionsType = ExceptionHandling::None;
-
// TODO: UseIntegratedAssembler?
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
index d9547096190e..8627a6e40c6a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -8,26 +8,19 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the WebAssemblyMCAsmInfo class.
+/// This file contains the declaration of the WebAssemblyMCAsmInfo class.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
-#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/MC/MCAsmInfoWasm.h"
namespace llvm {
class Triple;
-class WebAssemblyMCAsmInfoELF final : public MCAsmInfoELF {
-public:
- explicit WebAssemblyMCAsmInfoELF(const Triple &T);
- ~WebAssemblyMCAsmInfoELF() override;
-};
-
class WebAssemblyMCAsmInfo final : public MCAsmInfoWasm {
public:
explicit WebAssemblyMCAsmInfo(const Triple &T);
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index 77744e53d62f..94ca94e1e18c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblyMCCodeEmitter class.
+/// This file implements the WebAssemblyMCCodeEmitter class.
///
//===----------------------------------------------------------------------===//
@@ -23,9 +23,11 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
@@ -86,14 +88,18 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
assert(Desc.TSFlags == 0 &&
"WebAssembly non-variable_ops don't use TSFlags");
const MCOperandInfo &Info = Desc.OpInfo[i];
+ LLVM_DEBUG(dbgs() << "Encoding immediate: type="
+ << int(Info.OperandType) << "\n");
if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
encodeSLEB128(int32_t(MO.getImm()), OS);
+ } else if (Info.OperandType == WebAssembly::OPERAND_OFFSET32) {
+ encodeULEB128(uint32_t(MO.getImm()), OS);
} else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
encodeSLEB128(int64_t(MO.getImm()), OS);
} else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
llvm_unreachable("wasm globals should only be accessed symbolicly");
} else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
- encodeSLEB128(int64_t(MO.getImm()), OS);
+ OS << uint8_t(MO.getImm());
} else {
encodeULEB128(uint64_t(MO.getImm()), OS);
}
@@ -112,11 +118,11 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
float f = float(MO.getFPImm());
- support::endian::Writer<support::little>(OS).write<float>(f);
+ support::endian::write<float>(OS, f, support::little);
} else {
assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
double d = MO.getFPImm();
- support::endian::Writer<support::little>(OS).write<double>(d);
+ support::endian::write<double>(OS, d, support::little);
}
} else if (MO.isExpr()) {
const MCOperandInfo &Info = Desc.OpInfo[i];
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index e7c8809de70e..baf8a0c96c0a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file provides WebAssembly-specific target descriptions.
+/// This file provides WebAssembly-specific target descriptions.
///
//===----------------------------------------------------------------------===//
@@ -36,8 +36,6 @@ using namespace llvm;
static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
const Triple &TT) {
- if (TT.isOSBinFormatELF())
- return new WebAssemblyMCAsmInfoELF(TT);
return new WebAssemblyMCAsmInfo(TT);
}
@@ -82,10 +80,6 @@ static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
static MCTargetStreamer *
createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
- const Triple &TT = STI.getTargetTriple();
- if (TT.isOSBinFormatELF())
- return new WebAssemblyTargetELFStreamer(S);
-
return new WebAssemblyTargetWasmStreamer(S);
}
@@ -135,6 +129,7 @@ wasm::ValType WebAssembly::toValType(const MVT &Ty) {
case MVT::i64: return wasm::ValType::I64;
case MVT::f32: return wasm::ValType::F32;
case MVT::f64: return wasm::ValType::F64;
+ case MVT::ExceptRef: return wasm::ValType::EXCEPT_REF;
default: llvm_unreachable("unexpected type");
}
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 7dca89ab822d..c1c8d243e920 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file provides WebAssembly-specific target descriptions.
+/// This file provides WebAssembly-specific target descriptions.
///
//===----------------------------------------------------------------------===//
@@ -26,7 +26,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCSubtargetInfo;
class MVT;
class Target;
@@ -40,13 +40,8 @@ MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
-std::unique_ptr<MCObjectWriter>
-createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit, uint8_t OSABI);
-
-std::unique_ptr<MCObjectWriter>
-createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+createWebAssemblyWasmObjectWriter(bool Is64Bit);
namespace WebAssembly {
enum OperandType {
@@ -111,38 +106,166 @@ namespace WebAssembly {
inline unsigned GetDefaultP2Align(unsigned Opcode) {
switch (Opcode) {
case WebAssembly::LOAD8_S_I32:
+ case WebAssembly::LOAD8_S_I32_S:
case WebAssembly::LOAD8_U_I32:
+ case WebAssembly::LOAD8_U_I32_S:
case WebAssembly::LOAD8_S_I64:
+ case WebAssembly::LOAD8_S_I64_S:
case WebAssembly::LOAD8_U_I64:
+ case WebAssembly::LOAD8_U_I64_S:
case WebAssembly::ATOMIC_LOAD8_U_I32:
+ case WebAssembly::ATOMIC_LOAD8_U_I32_S:
case WebAssembly::ATOMIC_LOAD8_U_I64:
+ case WebAssembly::ATOMIC_LOAD8_U_I64_S:
case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE8_I32_S:
case WebAssembly::STORE8_I64:
+ case WebAssembly::STORE8_I64_S:
+ case WebAssembly::ATOMIC_STORE8_I32:
+ case WebAssembly::ATOMIC_STORE8_I32_S:
+ case WebAssembly::ATOMIC_STORE8_I64:
+ case WebAssembly::ATOMIC_STORE8_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I32_S:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I64_S:
return 0;
case WebAssembly::LOAD16_S_I32:
+ case WebAssembly::LOAD16_S_I32_S:
case WebAssembly::LOAD16_U_I32:
+ case WebAssembly::LOAD16_U_I32_S:
case WebAssembly::LOAD16_S_I64:
+ case WebAssembly::LOAD16_S_I64_S:
case WebAssembly::LOAD16_U_I64:
+ case WebAssembly::LOAD16_U_I64_S:
case WebAssembly::ATOMIC_LOAD16_U_I32:
+ case WebAssembly::ATOMIC_LOAD16_U_I32_S:
case WebAssembly::ATOMIC_LOAD16_U_I64:
+ case WebAssembly::ATOMIC_LOAD16_U_I64_S:
case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE16_I32_S:
case WebAssembly::STORE16_I64:
+ case WebAssembly::STORE16_I64_S:
+ case WebAssembly::ATOMIC_STORE16_I32:
+ case WebAssembly::ATOMIC_STORE16_I32_S:
+ case WebAssembly::ATOMIC_STORE16_I64:
+ case WebAssembly::ATOMIC_STORE16_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I32_S:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I64_S:
return 1;
case WebAssembly::LOAD_I32:
+ case WebAssembly::LOAD_I32_S:
case WebAssembly::LOAD_F32:
+ case WebAssembly::LOAD_F32_S:
case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_I32_S:
case WebAssembly::STORE_F32:
+ case WebAssembly::STORE_F32_S:
case WebAssembly::LOAD32_S_I64:
+ case WebAssembly::LOAD32_S_I64_S:
case WebAssembly::LOAD32_U_I64:
+ case WebAssembly::LOAD32_U_I64_S:
case WebAssembly::STORE32_I64:
+ case WebAssembly::STORE32_I64_S:
case WebAssembly::ATOMIC_LOAD_I32:
+ case WebAssembly::ATOMIC_LOAD_I32_S:
case WebAssembly::ATOMIC_LOAD32_U_I64:
+ case WebAssembly::ATOMIC_LOAD32_U_I64_S:
+ case WebAssembly::ATOMIC_STORE_I32:
+ case WebAssembly::ATOMIC_STORE_I32_S:
+ case WebAssembly::ATOMIC_STORE32_I64:
+ case WebAssembly::ATOMIC_STORE32_I64_S:
+ case WebAssembly::ATOMIC_RMW_ADD_I32:
+ case WebAssembly::ATOMIC_RMW_ADD_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW32_U_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW_SUB_I32:
+ case WebAssembly::ATOMIC_RMW_SUB_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW32_U_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW_AND_I32:
+ case WebAssembly::ATOMIC_RMW_AND_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW32_U_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW_OR_I32:
+ case WebAssembly::ATOMIC_RMW_OR_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW32_U_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XOR_I32:
+ case WebAssembly::ATOMIC_RMW_XOR_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW32_U_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW_XCHG_I32_S:
+ case WebAssembly::ATOMIC_RMW32_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW32_U_XCHG_I64_S:
return 2;
case WebAssembly::LOAD_I64:
+ case WebAssembly::LOAD_I64_S:
case WebAssembly::LOAD_F64:
+ case WebAssembly::LOAD_F64_S:
case WebAssembly::STORE_I64:
+ case WebAssembly::STORE_I64_S:
case WebAssembly::STORE_F64:
+ case WebAssembly::STORE_F64_S:
case WebAssembly::ATOMIC_LOAD_I64:
+ case WebAssembly::ATOMIC_LOAD_I64_S:
+ case WebAssembly::ATOMIC_STORE_I64:
+ case WebAssembly::ATOMIC_STORE_I64_S:
+ case WebAssembly::ATOMIC_RMW_ADD_I64:
+ case WebAssembly::ATOMIC_RMW_ADD_I64_S:
+ case WebAssembly::ATOMIC_RMW_SUB_I64:
+ case WebAssembly::ATOMIC_RMW_SUB_I64_S:
+ case WebAssembly::ATOMIC_RMW_AND_I64:
+ case WebAssembly::ATOMIC_RMW_AND_I64_S:
+ case WebAssembly::ATOMIC_RMW_OR_I64:
+ case WebAssembly::ATOMIC_RMW_OR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XOR_I64:
+ case WebAssembly::ATOMIC_RMW_XOR_I64_S:
+ case WebAssembly::ATOMIC_RMW_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW_XCHG_I64_S:
return 3;
default:
llvm_unreachable("Only loads and stores have p2align values");
@@ -158,19 +281,20 @@ static const unsigned LoadP2AlignOperandNo = 1;
static const unsigned StoreP2AlignOperandNo = 0;
/// This is used to indicate block signatures.
-enum class ExprType {
- Void = -0x40,
- I32 = -0x01,
- I64 = -0x02,
- F32 = -0x03,
- F64 = -0x04,
- I8x16 = -0x05,
- I16x8 = -0x06,
- I32x4 = -0x07,
- F32x4 = -0x08,
- B8x16 = -0x09,
- B16x8 = -0x0a,
- B32x4 = -0x0b
+enum class ExprType : unsigned {
+ Void = 0x40,
+ I32 = 0x7F,
+ I64 = 0x7E,
+ F32 = 0x7D,
+ F64 = 0x7C,
+ I8x16 = 0x7B,
+ I16x8 = 0x7A,
+ I32x4 = 0x79,
+ F32x4 = 0x78,
+ B8x16 = 0x77,
+ B16x8 = 0x76,
+ B32x4 = 0x75,
+ ExceptRef = 0x68
};
/// Instruction opcodes emitted via means other than CodeGen.
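
Illustrative aside, not part of the patch: the ExprType values switch here from the old negative constants (previously emitted as SLEB128) to the raw block-type bytes, matching the MCCodeEmitter change that now writes OPERAND_SIGNATURE as a single byte. For values that fit a single LEB group the two encodings produce the same byte, as this small check illustrates (single-byte SLEB128 is only valid for values in [-64, 63]):

#include <cstdint>
#include <cstdio>

// Single-byte SLEB128: the low 7 bits of the value with the continuation bit
// clear. Only meaningful for values in [-64, 63].
static unsigned slebOneByte(int64_t V) { return unsigned(V) & 0x7f; }

int main() {
  std::printf("void: sleb(-0x40)=0x%02x raw byte=0x40\n", slebOneByte(-0x40));
  std::printf("i32:  sleb(-0x01)=0x%02x raw byte=0x7f\n", slebOneByte(-0x01));
  return 0;
}
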
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 0ca52ad651b5..5272e188e1d0 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines WebAssembly-specific target streamer classes.
+/// This file defines WebAssembly-specific target streamer classes.
/// These are for implementing support for target-specific assembly directives.
///
//===----------------------------------------------------------------------===//
@@ -17,10 +17,8 @@
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
@@ -31,16 +29,13 @@ WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S) {}
void WebAssemblyTargetStreamer::emitValueType(wasm::ValType Type) {
- Streamer.EmitSLEB128IntValue(int32_t(Type));
+ Streamer.EmitIntValue(uint8_t(Type), 1);
}
WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
MCStreamer &S, formatted_raw_ostream &OS)
: WebAssemblyTargetStreamer(S), OS(OS) {}
-WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S)
- : WebAssemblyTargetStreamer(S) {}
-
WebAssemblyTargetWasmStreamer::WebAssemblyTargetWasmStreamer(MCStreamer &S)
: WebAssemblyTargetStreamer(S) {}
@@ -87,27 +82,6 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
}
}
-void WebAssemblyTargetAsmStreamer::emitGlobal(
- ArrayRef<wasm::Global> Globals) {
- if (!Globals.empty()) {
- OS << "\t.globalvar \t";
-
- bool First = true;
- for (const wasm::Global &G : Globals) {
- if (First)
- First = false;
- else
- OS << ", ";
- OS << WebAssembly::TypeToString(G.Type);
- if (!G.InitialModule.empty())
- OS << '=' << G.InitialModule << ':' << G.InitialName;
- else
- OS << '=' << G.InitialValue;
- }
- OS << '\n';
- }
-}
-
void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
@@ -128,46 +102,13 @@ void WebAssemblyTargetAsmStreamer::emitGlobalImport(StringRef name) {
OS << "\t.import_global\t" << name << '\n';
}
-void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
- OS << "\t.indidx \t" << *Value << '\n';
-}
-
-void WebAssemblyTargetELFStreamer::emitParam(MCSymbol *Symbol,
- ArrayRef<MVT> Types) {
- // Nothing to emit; params are declared as part of the function signature.
-}
-
-void WebAssemblyTargetELFStreamer::emitResult(MCSymbol *Symbol,
- ArrayRef<MVT> Types) {
- // Nothing to emit; results are declared as part of the function signature.
-}
-
-void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) {
- Streamer.EmitULEB128IntValue(Types.size());
- for (MVT Type : Types)
- emitValueType(WebAssembly::toValType(Type));
-}
-
-void WebAssemblyTargetELFStreamer::emitGlobal(
- ArrayRef<wasm::Global> Globals) {
- llvm_unreachable(".globalvar encoding not yet implemented");
-}
-
-void WebAssemblyTargetELFStreamer::emitEndFunc() {
- Streamer.EmitIntValue(WebAssembly::End, 1);
-}
-
-void WebAssemblyTargetELFStreamer::emitIndIdx(const MCExpr *Value) {
- llvm_unreachable(".indidx encoding not yet implemented");
-}
-
-void WebAssemblyTargetELFStreamer::emitIndirectFunctionType(
- MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
- // Nothing to emit here. TODO: Re-design how linking works and re-evaluate
- // whether it's necessary for .o files to declare indirect function types.
+void WebAssemblyTargetAsmStreamer::emitImportModule(MCSymbolWasm *Sym,
+ StringRef ModuleName) {
+ OS << "\t.import_module\t" << Sym->getName() << ", " << ModuleName << '\n';
}
-void WebAssemblyTargetELFStreamer::emitGlobalImport(StringRef name) {
+void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
+ OS << "\t.indidx \t" << *Value << '\n';
}
void WebAssemblyTargetWasmStreamer::emitParam(MCSymbol *Symbol,
@@ -204,31 +145,6 @@ void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<MVT> Types) {
}
}
-void WebAssemblyTargetWasmStreamer::emitGlobal(
- ArrayRef<wasm::Global> Globals) {
- // Encode the globals use by the funciton into the special .global_variables
- // section. This will later be decoded and turned into contents for the
- // Globals Section.
- Streamer.PushSection();
- Streamer.SwitchSection(Streamer.getContext().getWasmSection(
- ".global_variables", SectionKind::getMetadata()));
- for (const wasm::Global &G : Globals) {
- Streamer.EmitIntValue(int32_t(G.Type), 1);
- Streamer.EmitIntValue(G.Mutable, 1);
- if (G.InitialModule.empty()) {
- Streamer.EmitIntValue(0, 1); // indicate that we have an int value
- Streamer.EmitSLEB128IntValue(0);
- } else {
- Streamer.EmitIntValue(1, 1); // indicate that we have a module import
- Streamer.EmitBytes(G.InitialModule);
- Streamer.EmitIntValue(0, 1); // nul-terminate
- Streamer.EmitBytes(G.InitialName);
- Streamer.EmitIntValue(0, 1); // nul-terminate
- }
- }
- Streamer.PopSection();
-}
-
void WebAssemblyTargetWasmStreamer::emitEndFunc() {
llvm_unreachable(".end_func is not needed for direct wasm output");
}
@@ -256,9 +172,14 @@ void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
WasmSym->setParams(std::move(ValParams));
WasmSym->setReturns(std::move(ValResults));
- WasmSym->setIsFunction(true);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
}
void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
llvm_unreachable(".global_import is not needed for direct wasm output");
}
+
+void WebAssemblyTargetWasmStreamer::emitImportModule(MCSymbolWasm *Sym,
+ StringRef ModuleName) {
+ Sym->setModuleName(ModuleName);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 2cb21a20580b..cafcb04ccd11 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares WebAssembly-specific target streamer classes.
+/// This file declares WebAssembly-specific target streamer classes.
/// These are for implementing support for target-specific assembly directives.
///
//===----------------------------------------------------------------------===//
@@ -17,13 +17,13 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
#include "llvm/BinaryFormat/Wasm.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
-class MCELFStreamer;
class MCWasmStreamer;
+class MCSymbolWasm;
/// WebAssembly-specific streamer interface, to implement support
/// WebAssembly-specific assembly directives.
@@ -37,8 +37,6 @@ public:
virtual void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
/// .local
virtual void emitLocal(ArrayRef<MVT> Types) = 0;
- /// .globalvar
- virtual void emitGlobal(ArrayRef<wasm::Global> Globals) = 0;
/// .endfunc
virtual void emitEndFunc() = 0;
/// .functype
@@ -49,6 +47,8 @@ public:
virtual void emitIndIdx(const MCExpr *Value) = 0;
/// .import_global
virtual void emitGlobalImport(StringRef name) = 0;
+ /// .import_module
+ virtual void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) = 0;
protected:
void emitValueType(wasm::ValType Type);
@@ -64,30 +64,13 @@ public:
void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
- void emitGlobal(ArrayRef<wasm::Global> Globals) override;
- void emitEndFunc() override;
- void emitIndirectFunctionType(MCSymbol *Symbol,
- SmallVectorImpl<MVT> &Params,
- SmallVectorImpl<MVT> &Results) override;
- void emitIndIdx(const MCExpr *Value) override;
- void emitGlobalImport(StringRef name) override;
-};
-
-/// This part is for ELF object output
-class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
-public:
- explicit WebAssemblyTargetELFStreamer(MCStreamer &S);
-
- void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
- void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
- void emitLocal(ArrayRef<MVT> Types) override;
- void emitGlobal(ArrayRef<wasm::Global> Globals) override;
void emitEndFunc() override;
void emitIndirectFunctionType(MCSymbol *Symbol,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) override;
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalImport(StringRef name) override;
+ void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
};
/// This part is for Wasm object output
@@ -98,13 +81,13 @@ public:
void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
- void emitGlobal(ArrayRef<wasm::Global> Globals) override;
void emitEndFunc() override;
void emitIndirectFunctionType(MCSymbol *Symbol,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) override;
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalImport(StringRef name) override;
+ void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 39abde26df7f..4fb12d40b01b 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file handles Wasm-specific object emission, converting LLVM's
+/// This file handles Wasm-specific object emission, converting LLVM's
/// internal fixups into the appropriate relocations.
///
//===----------------------------------------------------------------------===//
@@ -20,9 +20,10 @@
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
@@ -61,6 +62,25 @@ static bool IsFunctionType(const MCValue &Target) {
return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX;
}
+static const MCSection *GetFixupSection(const MCExpr *Expr) {
+ if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr)) {
+ if (SyExp->getSymbol().isInSection())
+ return &SyExp->getSymbol().getSection();
+ return nullptr;
+ }
+
+ if (auto BinOp = dyn_cast<MCBinaryExpr>(Expr)) {
+ auto SectionLHS = GetFixupSection(BinOp->getLHS());
+ auto SectionRHS = GetFixupSection(BinOp->getRHS());
+ return SectionLHS == SectionRHS ? nullptr : SectionLHS;
+ }
+
+ if (auto UnOp = dyn_cast<MCUnaryExpr>(Expr))
+ return GetFixupSection(UnOp->getSubExpr());
+
+ return nullptr;
+}
+
unsigned
WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
const MCFixup &Fixup) const {
@@ -86,6 +106,13 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
case FK_Data_4:
if (IsFunction)
return wasm::R_WEBASSEMBLY_TABLE_INDEX_I32;
+ if (auto Section = static_cast<const MCSectionWasm *>(
+ GetFixupSection(Fixup.getValue()))) {
+ if (Section->getKind().isText())
+ return wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32;
+ else if (!Section->isWasmData())
+ return wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32;
+ }
return wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32;
case FK_Data_8:
llvm_unreachable("FK_Data_8 not implemented yet");
@@ -94,9 +121,7 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
}
}
-std::unique_ptr<MCObjectWriter>
-llvm::createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit) {
- auto MOTW = llvm::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit);
- return createWasmObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit) {
+ return llvm::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit);
}
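
As an aside (not part of the patch): GetFixupSection above recursively walks an MCExpr tree (symbol references, unary and binary nodes) to find the section a data fixup refers to, which lets getRelocType choose R_WEBASSEMBLY_FUNCTION_OFFSET_I32 or R_WEBASSEMBLY_SECTION_OFFSET_I32 for references into code and metadata sections. A stripped-down sketch of that recursion over a stand-in expression type (illustrative names only, not the MCExpr API):

#include <cstdio>
#include <string>

struct Expr {
  enum Kind { Sym, Unary, Binary } K;
  std::string Section;  // only meaningful for Sym nodes; empty if unknown
  const Expr *LHS;      // sub-expression for Unary, left operand for Binary
  const Expr *RHS;      // right operand for Binary
};

static const std::string *getFixupSection(const Expr *E) {
  switch (E->K) {
  case Expr::Sym:
    return E->Section.empty() ? nullptr : &E->Section;
  case Expr::Unary:
    return getFixupSection(E->LHS);
  case Expr::Binary: {
    const std::string *L = getFixupSection(E->LHS);
    const std::string *R = getFixupSection(E->RHS);
    bool Same = (!L && !R) || (L && R && *L == *R);
    return Same ? nullptr : L;  // e.g. A - B within one section cancels out
  }
  }
  return nullptr;
}

int main() {
  Expr A{Expr::Sym, ".text", nullptr, nullptr};
  Expr B{Expr::Sym, ".text", nullptr, nullptr};
  Expr Diff{Expr::Binary, "", &A, &B};
  std::printf("%s\n", getFixupSection(&Diff) ? "cross-section" : "same section");
  return 0;
}
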
diff --git a/contrib/llvm/lib/Target/WebAssembly/README.txt b/contrib/llvm/lib/Target/WebAssembly/README.txt
index 3433b1553e8c..ef0099f07efb 100644
--- a/contrib/llvm/lib/Target/WebAssembly/README.txt
+++ b/contrib/llvm/lib/Target/WebAssembly/README.txt
@@ -2,15 +2,42 @@
This WebAssembly backend is presently under development.
-Currently the easiest way to use it is through Emscripten, which provides a
-compilation environment that includes standard libraries, tools, and packaging
-for producing WebAssembly applications that can run in browsers and other
-environments. For more information, see the Emscripten documentation in
-general, and this page in particular:
+The most notable feature which is not yet stable is the ".o" file format.
+".o" file support is needed for many common ways of using LLVM, such as
+using it through "clang -c", so this backend is not yet considered widely
+usable. However, this backend is usable within some language toolchain
+packages:
+
+Emscripten provides a C/C++ compilation environment that includes standard
+libraries, tools, and packaging for producing WebAssembly applications that
+can run in browsers and other environments. For more information, see the
+Emscripten documentation in general, and this page in particular:
+
* https://github.com/kripken/emscripten/wiki/New-WebAssembly-Backend
+
+Rust provides WebAssembly support integrated into Cargo. There are two
+main options:
+ - wasm32-unknown-unknown, which provides a relatively minimal environment
+ that has an emphasis on being "native"
+ - wasm32-unknown-emscripten, which uses Emscripten internally and
+ provides standard C/C++ libraries, filesystem emulation, GL and SDL
+ bindings
+For more information, see:
+ * https://www.hellorust.com/
+
+
+This backend does not yet support debug info. Full DWARF support needs a
+design for how DWARF should be represented in WebAssembly. Sourcemap support
+has an existing design and some corresponding browser implementations, so it
+just needs implementing in LLVM.
-Other ways of using this backend, such as via a standalone "clang", are also
-under development, though they are not generally usable yet.
+Work-in-progress documentation for the ".o" file format is here:
+
+ * https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+
+A corresponding linker implementation is also under development:
+
+ * https://lld.llvm.org/WebAssembly.html
For more information on WebAssembly itself, see the home page:
* https://webassembly.github.io/
@@ -30,6 +57,8 @@ turn red if not. Once most of these pass, further testing will use LLVM's own
test suite. The tests can be run locally using:
https://github.com/WebAssembly/waterfall/blob/master/src/compile_torture_tests.py
+Some notes on ways that the generated code could be improved follow:
+
//===---------------------------------------------------------------------===//
Br, br_if, and br_table instructions can support having a value on the value
@@ -127,7 +156,7 @@ However, if moving the binary operator to its user moves it to a place where
its operands can't be moved to, it would be better to leave it in place, or
perhaps move it up, so that it can stackify its operands. A binary operator
has two operands and one result, so in such cases there could be a net win by
-prefering the operands.
+preferring the operands.
//===---------------------------------------------------------------------===//
@@ -138,11 +167,10 @@ instructions advantageously for this purpose.
//===---------------------------------------------------------------------===//
-WebAssembly is now officially a stack machine, rather than an AST, and this
-comes with additional opportunities for WebAssemblyRegStackify. Specifically,
-the stack doesn't need to be empty after an instruction with no return values.
-WebAssemblyRegStackify could be extended, or possibly rewritten, to take
-advantage of the new opportunities.
+WebAssemblyRegStackify currently assumes that the stack must be empty after
+an instruction with no return values; however, wasm doesn't actually require
+this. WebAssemblyRegStackify could be extended, or possibly rewritten, to take
+full advantage of what WebAssembly permits.
//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index a2c03b1a0400..f7a417c0ed49 100644
--- a/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file registers the WebAssembly target.
+/// This file registers the WebAssembly target.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
index 7ac6c3991531..05b7b21fb597 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the entry points for global functions defined in
+/// This file contains the entry points for global functions defined in
/// the LLVM WebAssembly back-end.
///
//===----------------------------------------------------------------------===//
@@ -27,8 +27,8 @@ class FunctionPass;
// LLVM IR passes.
ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj);
-void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
ModulePass *createWebAssemblyLowerGlobalDtors();
+ModulePass *createWebAssemblyAddMissingPrototypes();
ModulePass *createWebAssemblyFixFunctionBitcasts();
FunctionPass *createWebAssemblyOptimizeReturned();
@@ -47,6 +47,7 @@ FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
FunctionPass *createWebAssemblyExplicitLocals();
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
+FunctionPass *createWebAssemblyLateEHPrepare();
FunctionPass *createWebAssemblyCFGSort();
FunctionPass *createWebAssemblyCFGStackify();
FunctionPass *createWebAssemblyLowerBrUnless();
@@ -54,6 +55,31 @@ FunctionPass *createWebAssemblyRegNumbering();
FunctionPass *createWebAssemblyPeephole();
FunctionPass *createWebAssemblyCallIndirectFixup();
+// PassRegistry initialization declarations.
+void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
+void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
+void initializeLowerGlobalDtorsPass(PassRegistry &);
+void initializeFixFunctionBitcastsPass(PassRegistry &);
+void initializeOptimizeReturnedPass(PassRegistry &);
+void initializeWebAssemblyArgumentMovePass(PassRegistry &);
+void initializeWebAssemblySetP2AlignOperandsPass(PassRegistry &);
+void initializeWebAssemblyReplacePhysRegsPass(PassRegistry &);
+void initializeWebAssemblyPrepareForLiveIntervalsPass(PassRegistry &);
+void initializeWebAssemblyOptimizeLiveIntervalsPass(PassRegistry &);
+void initializeWebAssemblyStoreResultsPass(PassRegistry &);
+void initializeWebAssemblyRegStackifyPass(PassRegistry &);
+void initializeWebAssemblyRegColoringPass(PassRegistry &);
+void initializeWebAssemblyExplicitLocalsPass(PassRegistry &);
+void initializeWebAssemblyFixIrreducibleControlFlowPass(PassRegistry &);
+void initializeWebAssemblyLateEHPreparePass(PassRegistry &);
+void initializeWebAssemblyExceptionInfoPass(PassRegistry &);
+void initializeWebAssemblyCFGSortPass(PassRegistry &);
+void initializeWebAssemblyCFGStackifyPass(PassRegistry &);
+void initializeWebAssemblyLowerBrUnlessPass(PassRegistry &);
+void initializeWebAssemblyRegNumberingPass(PassRegistry &);
+void initializeWebAssemblyPeepholePass(PassRegistry &);
+void initializeWebAssemblyCallIndirectFixupPass(PassRegistry &);
+
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td
index 99cf1f119a20..2f301da8e422 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This is a target description file for the WebAssembly architecture,
+/// This is a target description file for the WebAssembly architecture,
/// which is also known as "wasm".
///
//===----------------------------------------------------------------------===//
@@ -32,6 +32,15 @@ def FeatureNontrappingFPToInt :
"HasNontrappingFPToInt", "true",
"Enable non-trapping float-to-int conversion operators">;
+def FeatureSignExt :
+ SubtargetFeature<"sign-ext",
+ "HasSignExt", "true",
+ "Enable sign extension operators">;
+
+def FeatureExceptionHandling :
+ SubtargetFeature<"exception-handling", "HasExceptionHandling", "true",
+ "Enable Wasm exception handling">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//===----------------------------------------------------------------------===//
@@ -68,6 +77,20 @@ def : ProcessorModel<"bleeding-edge", NoSchedModel,
// Target Declaration
//===----------------------------------------------------------------------===//
+def WebAssemblyAsmParser : AsmParser {
+ // The physical register names are not in the binary format or asm text
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+def WebAssemblyAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int PassSubtarget = 0;
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+
def WebAssembly : Target {
let InstructionSet = WebAssemblyInstrInfo;
+ let AssemblyParsers = [WebAssemblyAsmParser];
+ let AssemblyWriters = [WebAssemblyAsmWriter];
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
new file mode 100644
index 000000000000..4af9cd150bf7
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -0,0 +1,144 @@
+//===-- WebAssemblyAddMissingPrototypes.cpp - Fix prototypeless functions -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Add prototypes to prototype-less functions.
+///
+/// WebAssembly has strict function prototype checking, so we need function
+/// declarations to match the call sites. Clang treats prototype-less functions
+/// as varargs (foo(...)), which happens to work on existing platforms but
+/// doesn't under WebAssembly. This pass will find all the call sites of each
+/// prototype-less function, ensure they agree, and then set the signature
+/// on the function declaration accordingly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-add-missing-prototypes"
+
+namespace {
+class WebAssemblyAddMissingPrototypes final : public ModulePass {
+ StringRef getPassName() const override {
+ return "Add prototypes to prototypes-less functions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ bool runOnModule(Module &M) override;
+
+public:
+ static char ID;
+ WebAssemblyAddMissingPrototypes() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char WebAssemblyAddMissingPrototypes::ID = 0;
+INITIALIZE_PASS(WebAssemblyAddMissingPrototypes, DEBUG_TYPE,
+ "Add prototypes to prototypes-less functions", false, false)
+
+ModulePass *llvm::createWebAssemblyAddMissingPrototypes() {
+ return new WebAssemblyAddMissingPrototypes();
+}
+
+bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
+ LLVM_DEBUG(dbgs() << "running AddMissingPrototypes\n");
+
+ std::vector<std::pair<Function*, Function*>> Replacements;
+
+ // Find all the prototype-less function declarations
+ for (Function &F : M) {
+ if (!F.isDeclaration() || !F.hasFnAttribute("no-prototype"))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Found no-prototype function: " << F.getName() << "\n");
+
+ // When clang emits prototype-less C functions it uses (...), i.e. a varargs
+ // function that takes no arguments (and has no sentinel). When we see a
+ // no-prototype attribute we expect the function to have these properties.
+ if (!F.isVarArg())
+ report_fatal_error(
+ "Functions with 'no-prototype' attribute must take varargs: " +
+ F.getName());
+ if (F.getFunctionType()->getNumParams() != 0)
+ report_fatal_error(
+ "Functions with 'no-prototype' attribute should not have params: " +
+ F.getName());
+
+
+ // Create a function prototype based on the first call site (first bitcast)
+ // that we find.
+ FunctionType *NewType = nullptr;
+ Function* NewF = nullptr;
+ for (Use &U : F.uses()) {
+ LLVM_DEBUG(dbgs() << "prototype-less use: " << F.getName() << "\n");
+ if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) {
+ FunctionType *DestType =
+ cast<FunctionType>(BC->getDestTy()->getPointerElementType());
+
+ // Create a new function with the correct type
+ NewType = DestType;
+ NewF = Function::Create(NewType, F.getLinkage(), F.getName());
+ NewF->setAttributes(F.getAttributes());
+ NewF->removeFnAttr("no-prototype");
+ break;
+ }
+ }
+
+ if (!NewType) {
+ LLVM_DEBUG(
+ dbgs() << "could not derive a function prototype from usage: " +
+ F.getName() + "\n");
+ continue;
+ }
+
+ for (Use &U : F.uses()) {
+ if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) {
+ FunctionType *DestType =
+ cast<FunctionType>(BC->getDestTy()->getPointerElementType());
+ if (NewType != DestType) {
+ report_fatal_error(
+ "Prototypeless function used with conflicting signatures: " +
+ F.getName());
+ }
+ BC->replaceAllUsesWith(NewF);
+ Replacements.emplace_back(&F, NewF);
+ } else {
+ dbgs() << *U.getUser()->getType() << "\n";
+#ifndef NDEBUG
+ U.getUser()->dump();
+#endif
+ report_fatal_error(
+ "unexpected use of prototypeless function: " + F.getName() + "\n");
+ }
+ }
+ }
+
+ // Finally replace the old function declarations with the new ones
+ for (auto &Pair : Replacements) {
+ Function* Old = Pair.first;
+ Function* New = Pair.second;
+ Old->eraseFromParent();
+ M.getFunctionList().push_back(New);
+ }
+
+ return !Replacements.empty();
+}
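
For illustration (not part of this patch), this is the kind of C input the pass above fixes up; the names `foo` and `bar` are made up:

    /* Illustrative C input: a prototype-less declaration. Clang marks the
     * declaration "no-prototype" and lowers it as a varargs function. */
    int foo();

    int bar(void) {
      /* The call site implies the signature int(int, int); clang emits the call
       * through a bitcast of foo to that type, and the pass rewrites foo's
       * declaration to match, so wasm's strict signature check passes. */
      return foo(1, 2);
    }
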
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 5fadca38b820..7c8a631cde8a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling.
+/// This file moves ARGUMENT instructions after ScheduleDAG scheduling.
///
/// Arguments are really live-in registers, however, since we use virtual
/// registers and LLVM doesn't support live-in virtual registers, we're
@@ -60,12 +60,15 @@ public:
} // end anonymous namespace
char WebAssemblyArgumentMove::ID = 0;
+INITIALIZE_PASS(WebAssemblyArgumentMove, DEBUG_TYPE,
+ "Move ARGUMENT instructions for WebAssembly", false, false)
+
FunctionPass *llvm::createWebAssemblyArgumentMove() {
return new WebAssemblyArgumentMove();
}
bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Argument Move **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 204d97cbdd44..1f280e1d13fc 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains a printer that converts from our internal
+/// This file contains a printer that converts from our internal
/// representation of machine-dependent LLVM code to the WebAssembly assembly
/// language.
///
@@ -31,10 +31,10 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -53,7 +53,7 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
MVT::v4i32, MVT::v4f32})
if (TRI->isTypeLegalForClass(*TRC, T))
return T;
- DEBUG(errs() << "Unknown type for register number: " << RegNo);
+ LLVM_DEBUG(errs() << "Unknown type for register number: " << RegNo);
llvm_unreachable("Unknown register type");
return MVT::Other;
}
@@ -84,21 +84,47 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
SmallVector<MVT, 4> Results;
SmallVector<MVT, 4> Params;
ComputeSignatureVTs(F, TM, Params, Results);
- getTargetStreamer()->emitIndirectFunctionType(getSymbol(&F), Params,
- Results);
+ MCSymbol *Sym = getSymbol(&F);
+ getTargetStreamer()->emitIndirectFunctionType(Sym, Params, Results);
+
+ if (TM.getTargetTriple().isOSBinFormatWasm() &&
+ F.hasFnAttribute("wasm-import-module")) {
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ StringRef Name = F.getFnAttribute("wasm-import-module")
+ .getValueAsString();
+ getTargetStreamer()->emitImportModule(WasmSym, Name);
+ }
}
}
for (const auto &G : M.globals()) {
if (!G.hasInitializer() && G.hasExternalLinkage()) {
if (G.getValueType()->isSized()) {
uint16_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
- if (TM.getTargetTriple().isOSBinFormatELF())
- getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
OutStreamer->emitELFSize(getSymbol(&G),
MCConstantExpr::create(Size, OutContext));
}
}
}
+
+ if (const NamedMDNode *Named = M.getNamedMetadata("wasm.custom_sections")) {
+ for (const Metadata *MD : Named->operands()) {
+ const MDTuple *Tuple = dyn_cast<MDTuple>(MD);
+ if (!Tuple || Tuple->getNumOperands() != 2)
+ continue;
+ const MDString *Name = dyn_cast<MDString>(Tuple->getOperand(0));
+ const MDString *Contents = dyn_cast<MDString>(Tuple->getOperand(1));
+ if (!Name || !Contents)
+ continue;
+
+ OutStreamer->PushSection();
+ std::string SectionName = (".custom_section." + Name->getString()).str();
+ MCSectionWasm *mySection =
+ OutContext.getWasmSection(SectionName, SectionKind::getMetadata());
+ OutStreamer->SwitchSection(mySection);
+ OutStreamer->EmitBytes(Contents->getString());
+ OutStreamer->PopSection();
+ }
+ }
}
void WebAssemblyAsmPrinter::EmitConstantPool() {
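
The two additions in the hunk above are driven from the IR: a "wasm-import-module" string attribute on the function, and a "wasm.custom_sections" named-metadata node of (name, contents) string pairs. A minimal sketch of producing both through the IR APIs (illustrative only; the "env" and "producers" strings are made-up examples):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"

    // Sketch: annotate a module so the asm printer above emits an import_module
    // entry for F and a ".custom_section.producers" custom section.
    static void annotateForWasm(llvm::Module &M, llvm::Function &F) {
      llvm::LLVMContext &Ctx = M.getContext();
      F.addFnAttr("wasm-import-module", "env");        // import module name
      auto *Named = M.getOrInsertNamedMetadata("wasm.custom_sections");
      Named->addOperand(llvm::MDTuple::get(
          Ctx, {llvm::MDString::get(Ctx, "producers"),           // section name
                llvm::MDString::get(Ctx, "example-contents")}));  // raw bytes
    }
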
@@ -133,36 +159,13 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
else
getTargetStreamer()->emitResult(CurrentFnSym, ArrayRef<MVT>());
- if (TM.getTargetTriple().isOSBinFormatELF()) {
- assert(MFI->getLocals().empty());
- for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
- unsigned WAReg = MFI->getWAReg(VReg);
- // Don't declare unused registers.
- if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
- continue;
- // Don't redeclare parameters.
- if (WAReg < MFI->getParams().size())
- continue;
- // Don't declare stackified registers.
- if (int(WAReg) < 0)
- continue;
- MFI->addLocal(getRegType(VReg));
- }
- }
-
getTargetStreamer()->emitLocal(MFI->getLocals());
AsmPrinter::EmitFunctionBodyStart();
}
-void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() {
- if (TM.getTargetTriple().isOSBinFormatELF())
- getTargetStreamer()->emitEndFunc();
-}
-
void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
+ LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
switch (MI->getOpcode()) {
case WebAssembly::ARGUMENT_I32:
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
index a37f8bcf6ba5..23817b4e5126 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -57,7 +57,6 @@ public:
void EmitJumpTableInfo() override;
void EmitConstantPool() override;
void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
void EmitInstruction(const MachineInstr *MI) override;
const MCExpr *lowerConstant(const Constant *CV) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index 700111743ee8..267a51433cd1 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a CFG sorting pass.
+/// This file implements a CFG sorting pass.
///
/// This pass reorders the blocks in a function to put them into topological
/// order, ignoring loop backedges, and without any loop being interrupted
@@ -56,6 +56,9 @@ public:
} // end anonymous namespace
char WebAssemblyCFGSort::ID = 0;
+INITIALIZE_PASS(WebAssemblyCFGSort, DEBUG_TYPE,
+ "Reorders blocks in topological order", false, false)
+
FunctionPass *llvm::createWebAssemblyCFGSort() {
return new WebAssemblyCFGSort();
}
@@ -250,7 +253,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
"Blocks must be nested in their loops");
}
- while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+ while (OnStack.size() > 1 && &MBB == WebAssembly::getBottom(OnStack.back()))
OnStack.pop_back();
}
assert(OnStack.pop_back_val() == nullptr &&
@@ -261,9 +264,9 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
}
bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** CFG Sorting **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** CFG Sorting **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
const auto &MLI = getAnalysis<MachineLoopInfo>();
auto &MDT = getAnalysis<MachineDominatorTree>();
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 21e0f6b23777..70ce40cefed7 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a CFG stacking pass.
+/// This file implements a CFG stacking pass.
///
/// This pass inserts BLOCK and LOOP markers to mark the start of scopes, since
/// scope boundaries serve as the labels for WebAssembly's control transfers.
@@ -57,6 +57,10 @@ public:
} // end anonymous namespace
char WebAssemblyCFGStackify::ID = 0;
+INITIALIZE_PASS(WebAssemblyCFGStackify, DEBUG_TYPE,
+ "Insert BLOCK and LOOP markers for WebAssembly scopes",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyCFGStackify() {
return new WebAssemblyCFGStackify();
}
@@ -123,7 +127,8 @@ static void PlaceBlockMarker(
// Decide where in Header to put the BLOCK.
MachineBasicBlock::iterator InsertPos;
MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
- if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) {
+ if (HeaderLoop &&
+ MBB.getNumber() > WebAssembly::getBottom(HeaderLoop)->getNumber()) {
// Header is the header of a loop that does not lexically contain MBB, so
// the BLOCK needs to be above the LOOP, after any END constructs.
InsertPos = Header->begin();
@@ -143,9 +148,10 @@ static void PlaceBlockMarker(
}
// Add the BLOCK.
- MachineInstr *Begin = BuildMI(*Header, InsertPos, DebugLoc(),
- TII.get(WebAssembly::BLOCK))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ MachineInstr *Begin =
+ BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+ TII.get(WebAssembly::BLOCK))
+ .addImm(int64_t(WebAssembly::ExprType::Void));
// Mark the end of the block.
InsertPos = MBB.begin();
@@ -153,7 +159,7 @@ static void PlaceBlockMarker(
InsertPos->getOpcode() == WebAssembly::END_LOOP &&
LoopTops[&*InsertPos]->getParent()->getNumber() >= Header->getNumber())
++InsertPos;
- MachineInstr *End = BuildMI(MBB, InsertPos, DebugLoc(),
+ MachineInstr *End = BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos),
TII.get(WebAssembly::END_BLOCK));
BlockTops[End] = Begin;
@@ -176,7 +182,7 @@ static void PlaceLoopMarker(
// The operand of a LOOP is the first block after the loop. If the loop is the
// bottom of the function, insert a dummy block at the end.
- MachineBasicBlock *Bottom = LoopBottom(Loop);
+ MachineBasicBlock *Bottom = WebAssembly::getBottom(Loop);
auto Iter = std::next(MachineFunction::iterator(Bottom));
if (Iter == MF.end()) {
MachineBasicBlock *Label = MF.CreateMachineBasicBlock();
@@ -193,12 +199,14 @@ static void PlaceLoopMarker(
while (InsertPos != MBB.end() &&
InsertPos->getOpcode() == WebAssembly::END_LOOP)
++InsertPos;
- MachineInstr *Begin = BuildMI(MBB, InsertPos, DebugLoc(),
+ MachineInstr *Begin = BuildMI(MBB, InsertPos, MBB.findDebugLoc(InsertPos),
TII.get(WebAssembly::LOOP))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ .addImm(int64_t(WebAssembly::ExprType::Void));
- // Mark the end of the loop.
- MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(),
+ // Mark the end of the loop (using the debug location of an arbitrary
+ // branch to the loop end as its location).
+ DebugLoc EndDL = (*AfterLoop->pred_rbegin())->findBranchDebugLoc();
+ MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), EndDL,
TII.get(WebAssembly::END_LOOP));
LoopTops[End] = Begin;
@@ -249,12 +257,13 @@ static void FixEndsAtEndOfFunction(
case MVT::v8i16: retType = WebAssembly::ExprType::I16x8; break;
case MVT::v4i32: retType = WebAssembly::ExprType::I32x4; break;
case MVT::v4f32: retType = WebAssembly::ExprType::F32x4; break;
+ case MVT::ExceptRef: retType = WebAssembly::ExprType::ExceptRef; break;
default: llvm_unreachable("unexpected return type");
}
for (MachineBasicBlock &MBB : reverse(MF)) {
for (MachineInstr &MI : reverse(MBB)) {
- if (MI.isPosition() || MI.isDebugValue())
+ if (MI.isPosition() || MI.isDebugInstr())
continue;
if (MI.getOpcode() == WebAssembly::END_BLOCK) {
BlockTops[&MI]->getOperand(0).setImm(int32_t(retType));
@@ -275,7 +284,8 @@ static void FixEndsAtEndOfFunction(
static void AppendEndToFunction(
MachineFunction &MF,
const WebAssemblyInstrInfo &TII) {
- BuildMI(MF.back(), MF.back().end(), DebugLoc(),
+ BuildMI(MF.back(), MF.back().end(),
+ MF.back().findPrevDebugLoc(MF.back().end()),
TII.get(WebAssembly::END_FUNCTION));
}
@@ -348,15 +358,13 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
FixEndsAtEndOfFunction(MF, MFI, BlockTops, LoopTops);
// Add an end instruction at the end of the function body.
- if (!MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF())
- AppendEndToFunction(MF, TII);
+ AppendEndToFunction(MF, TII);
}
bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** CFG Stackifying **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** CFG Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
const auto &MLI = getAnalysis<MachineLoopInfo>();
auto &MDT = getAnalysis<MachineDominatorTree>();
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
index 1af92f02d8e0..c1820bf66bc0 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file converts pseudo call_indirect instructions into real
+/// This file converts pseudo call_indirect instructions into real
/// call_indirects.
///
/// The order of arguments for a call_indirect is the arguments to the function
@@ -54,6 +54,9 @@ public:
} // end anonymous namespace
char WebAssemblyCallIndirectFixup::ID = 0;
+INITIALIZE_PASS(WebAssemblyCallIndirectFixup, DEBUG_TYPE,
+ "Rewrite call_indirect argument orderings", false, false)
+
FunctionPass *llvm::createWebAssemblyCallIndirectFixup() {
return new WebAssemblyCallIndirectFixup();
}
@@ -80,8 +83,8 @@ static bool IsPseudoCallIndirect(const MachineInstr &MI) {
}
bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Fixing up CALL_INDIRECTs **********\n"
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Fixing up CALL_INDIRECTs **********\n"
+ << MF.getName() << '\n');
bool Changed = false;
const WebAssemblyInstrInfo *TII =
@@ -90,7 +93,7 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (IsPseudoCallIndirect(MI)) {
- DEBUG(dbgs() << "Found call_indirect: " << MI << '\n');
+ LLVM_DEBUG(dbgs() << "Found call_indirect: " << MI << '\n');
// Rewrite pseudo to non-pseudo
const MCInstrDesc &Desc = TII->get(GetNonPseudoCallIndirectOpcode(MI));
@@ -120,13 +123,13 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
for (const MachineOperand &MO : Ops)
MI.addOperand(MO);
- DEBUG(dbgs() << " After transform: " << MI);
+ LLVM_DEBUG(dbgs() << " After transform: " << MI);
Changed = true;
}
}
}
- DEBUG(dbgs() << "\nDone fixing up CALL_INDIRECTs\n\n");
+ LLVM_DEBUG(dbgs() << "\nDone fixing up CALL_INDIRECTs\n\n");
return Changed;
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
new file mode 100644
index 000000000000..84683d48a90a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
@@ -0,0 +1,197 @@
+//===--- WebAssemblyExceptionInfo.cpp - Exception Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements WebAssemblyException information analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyExceptionInfo.h"
+#include "WebAssemblyUtilities.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-exception-info"
+
+char WebAssemblyExceptionInfo::ID = 0;
+
+INITIALIZE_PASS_BEGIN(WebAssemblyExceptionInfo, DEBUG_TYPE,
+ "WebAssembly Exception Information", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(WebAssemblyExceptionInfo, DEBUG_TYPE,
+ "WebAssembly Exception Information", true, true)
+
+bool WebAssemblyExceptionInfo::runOnMachineFunction(MachineFunction &F) {
+ releaseMemory();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ recalculate(MDT, MDF);
+ return false;
+}
+
+void WebAssemblyExceptionInfo::recalculate(
+ MachineDominatorTree &MDT, const MachineDominanceFrontier &MDF) {
+ // Postorder traversal of the dominator tree.
+ SmallVector<WebAssemblyException *, 8> Exceptions;
+ for (auto DomNode : post_order(&MDT)) {
+ MachineBasicBlock *EHPad = DomNode->getBlock();
+ if (!EHPad->isEHPad())
+ continue;
+ // We group catch & catch-all terminate pads together, so skip the second
+ // one
+ if (WebAssembly::isCatchAllTerminatePad(*EHPad))
+ continue;
+ auto *WE = new WebAssemblyException(EHPad);
+ discoverAndMapException(WE, MDT, MDF);
+ Exceptions.push_back(WE);
+ }
+
+ // Add BBs to exceptions
+ for (auto DomNode : post_order(&MDT)) {
+ MachineBasicBlock *MBB = DomNode->getBlock();
+ WebAssemblyException *WE = getExceptionFor(MBB);
+ for (; WE; WE = WE->getParentException())
+ WE->addBlock(MBB);
+ }
+
+ // Add subexceptions to exceptions
+ for (auto *WE : Exceptions) {
+ if (WE->getParentException())
+ WE->getParentException()->getSubExceptions().push_back(WE);
+ else
+ addTopLevelException(WE);
+ }
+
+ // For convenience, Blocks and SubExceptions are inserted in postorder.
+ // Reverse the lists.
+ for (auto *WE : Exceptions) {
+ WE->reverseBlock();
+ std::reverse(WE->getSubExceptions().begin(), WE->getSubExceptions().end());
+ }
+}
+
+void WebAssemblyExceptionInfo::releaseMemory() {
+ BBMap.clear();
+ DeleteContainerPointers(TopLevelExceptions);
+ TopLevelExceptions.clear();
+}
+
+void WebAssemblyExceptionInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void WebAssemblyExceptionInfo::discoverAndMapException(
+ WebAssemblyException *WE, const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) {
+ unsigned NumBlocks = 0;
+ unsigned NumSubExceptions = 0;
+
+ // Map blocks that belong to a catchpad / cleanuppad
+ MachineBasicBlock *EHPad = WE->getEHPad();
+
+ // We group catch & catch-all terminate pads together within an exception
+ if (WebAssembly::isCatchTerminatePad(*EHPad)) {
+ assert(EHPad->succ_size() == 1 &&
+ "Catch terminate pad has more than one successors");
+ changeExceptionFor(EHPad, WE);
+ changeExceptionFor(*(EHPad->succ_begin()), WE);
+ return;
+ }
+
+ SmallVector<MachineBasicBlock *, 8> WL;
+ WL.push_back(EHPad);
+ while (!WL.empty()) {
+ MachineBasicBlock *MBB = WL.pop_back_val();
+
+ // Find its outermost discovered exception. If this is a discovered block,
+ // check if it is already discovered to be a subexception of this exception.
+ WebAssemblyException *SubE = getOutermostException(MBB);
+ if (SubE) {
+ if (SubE != WE) {
+ // Discover a subexception of this exception.
+ SubE->setParentException(WE);
+ ++NumSubExceptions;
+ NumBlocks += SubE->getBlocksVector().capacity();
+ // All blocks that belong to this subexception have been already
+ // discovered. Skip all of them. Add the subexception's landing pad's
+ // dominance frontier to the worklist.
+ for (auto &Frontier : MDF.find(SubE->getEHPad())->second)
+ if (MDT.dominates(EHPad, Frontier))
+ WL.push_back(Frontier);
+ }
+ continue;
+ }
+
+ // This is an undiscovered block. Map it to the current exception.
+ changeExceptionFor(MBB, WE);
+ ++NumBlocks;
+
+ // Add successors dominated by the current BB to the worklist.
+ for (auto *Succ : MBB->successors())
+ if (MDT.dominates(EHPad, Succ))
+ WL.push_back(Succ);
+ }
+
+ WE->getSubExceptions().reserve(NumSubExceptions);
+ WE->reserveBlocks(NumBlocks);
+}
+
+WebAssemblyException *
+WebAssemblyExceptionInfo::getOutermostException(MachineBasicBlock *MBB) const {
+ WebAssemblyException *WE = getExceptionFor(MBB);
+ if (WE) {
+ while (WebAssemblyException *Parent = WE->getParentException())
+ WE = Parent;
+ }
+ return WE;
+}
+
+void WebAssemblyException::print(raw_ostream &OS, unsigned Depth) const {
+ OS.indent(Depth * 2) << "Exception at depth " << getExceptionDepth()
+ << " containing: ";
+
+ for (unsigned I = 0; I < getBlocks().size(); ++I) {
+ MachineBasicBlock *MBB = getBlocks()[I];
+ if (I)
+ OS << ", ";
+ OS << "%bb." << MBB->getNumber();
+ if (const auto *BB = MBB->getBasicBlock())
+ if (BB->hasName())
+ OS << "." << BB->getName();
+
+ if (getEHPad() == MBB)
+ OS << " (landing-pad)";
+ }
+ OS << "\n";
+
+ for (auto &SubE : SubExceptions)
+ SubE->print(OS, Depth + 2);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WebAssemblyException::dump() const { print(dbgs()); }
+#endif
+
+raw_ostream &operator<<(raw_ostream &OS, const WebAssemblyException &WE) {
+ WE.print(OS);
+ return OS;
+}
+
+void WebAssemblyExceptionInfo::print(raw_ostream &OS, const Module *) const {
+ for (auto *WE : TopLevelExceptions)
+ WE->print(OS);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
new file mode 100644
index 000000000000..fcd7e2366e03
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
@@ -0,0 +1,170 @@
+//===-- WebAssemblyExceptionInfo.h - WebAssembly Exception Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements WebAssemblyException information analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYEXCEPTIONINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYEXCEPTIONINFO_H
+
+#include "WebAssembly.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class MachineDominatorTree;
+class MachineDominanceFrontier;
+
+// WebAssembly instructions for exception handling are structured as follows:
+// try
+// instructions*
+// catch ----|
+// instructions* | -> A WebAssemblyException consists of this region
+// end ----|
+//
+// A WebAssemblyException object contains BBs that belong to a 'catch' part of
+// the try-catch-end structure to be created later. 'try' and 'end' markers
+// are not present at this stage and will be generated in the CFGStackify pass.
+// Because CFGSort requires all the BBs within a catch part to be sorted
+// together as it does for loops, this pass calculates the nesting structure of
+// the catch parts of exceptions in a function.
+//
+// An exception catch part is defined as a BB with a catch instruction and all
+// other BBs dominated by this BB.
+class WebAssemblyException {
+ MachineBasicBlock *EHPad = nullptr;
+
+ WebAssemblyException *ParentException = nullptr;
+ std::vector<WebAssemblyException *> SubExceptions;
+ std::vector<MachineBasicBlock *> Blocks;
+ SmallPtrSet<const MachineBasicBlock *, 8> BlockSet;
+
+public:
+ WebAssemblyException(MachineBasicBlock *EHPad) : EHPad(EHPad) {}
+ ~WebAssemblyException() { DeleteContainerPointers(SubExceptions); }
+ WebAssemblyException(const WebAssemblyException &) = delete;
+ const WebAssemblyException &operator=(const WebAssemblyException &) = delete;
+
+ MachineBasicBlock *getEHPad() const { return EHPad; }
+ MachineBasicBlock *getHeader() const { return EHPad; }
+ WebAssemblyException *getParentException() const { return ParentException; }
+ void setParentException(WebAssemblyException *WE) { ParentException = WE; }
+
+ bool contains(const WebAssemblyException *WE) const {
+ if (WE == this)
+ return true;
+ if (!WE)
+ return false;
+ return contains(WE->getParentException());
+ }
+ bool contains(const MachineBasicBlock *MBB) const {
+ return BlockSet.count(MBB);
+ }
+
+ void addBlock(MachineBasicBlock *MBB) {
+ Blocks.push_back(MBB);
+ BlockSet.insert(MBB);
+ }
+ ArrayRef<MachineBasicBlock *> getBlocks() const { return Blocks; }
+ using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator;
+ block_iterator block_begin() const { return getBlocks().begin(); }
+ block_iterator block_end() const { return getBlocks().end(); }
+ inline iterator_range<block_iterator> blocks() const {
+ return make_range(block_begin(), block_end());
+ }
+ unsigned getNumBlocks() const { return Blocks.size(); }
+ std::vector<MachineBasicBlock *> &getBlocksVector() { return Blocks; }
+
+ const std::vector<WebAssemblyException *> &getSubExceptions() const {
+ return SubExceptions;
+ }
+ std::vector<WebAssemblyException *> &getSubExceptions() {
+ return SubExceptions;
+ }
+ void addSubException(WebAssemblyException *E) { SubExceptions.push_back(E); }
+ using iterator = typename std::vector<WebAssemblyException *>::const_iterator;
+ iterator begin() const { return SubExceptions.begin(); }
+ iterator end() const { return SubExceptions.end(); }
+
+ void reserveBlocks(unsigned Size) { Blocks.reserve(Size); }
+ void reverseBlock(unsigned From = 0) {
+ std::reverse(Blocks.begin() + From, Blocks.end());
+ }
+
+ // Return the nesting level. An outermost one has depth 1.
+ unsigned getExceptionDepth() const {
+ unsigned D = 1;
+ for (const WebAssemblyException *CurException = ParentException;
+ CurException; CurException = CurException->ParentException)
+ ++D;
+ return D;
+ }
+
+ void print(raw_ostream &OS, unsigned Depth = 0) const;
+ void dump() const;
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const WebAssemblyException &WE);
+
+class WebAssemblyExceptionInfo final : public MachineFunctionPass {
+ // Mapping of basic blocks to the innermost exception they occur in
+ DenseMap<const MachineBasicBlock *, WebAssemblyException *> BBMap;
+ std::vector<WebAssemblyException *> TopLevelExceptions;
+
+ void discoverAndMapException(WebAssemblyException *WE,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF);
+ WebAssemblyException *getOutermostException(MachineBasicBlock *MBB) const;
+
+public:
+ static char ID;
+ WebAssemblyExceptionInfo() : MachineFunctionPass(ID) {
+ initializeWebAssemblyExceptionInfoPass(*PassRegistry::getPassRegistry());
+ }
+ ~WebAssemblyExceptionInfo() override { releaseMemory(); }
+ WebAssemblyExceptionInfo(const WebAssemblyExceptionInfo &) = delete;
+ WebAssemblyExceptionInfo &
+ operator=(const WebAssemblyExceptionInfo &) = delete;
+
+ bool runOnMachineFunction(MachineFunction &) override;
+ void releaseMemory() override;
+ void recalculate(MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF);
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool empty() const { return TopLevelExceptions.empty(); }
+
+ // Return the innermost exception that MBB lives in. If the block is not in an
+ // exception, null is returned.
+ WebAssemblyException *getExceptionFor(const MachineBasicBlock *MBB) const {
+ return BBMap.lookup(MBB);
+ }
+
+ void changeExceptionFor(MachineBasicBlock *MBB, WebAssemblyException *WE) {
+ if (!WE) {
+ BBMap.erase(MBB);
+ return;
+ }
+ BBMap[MBB] = WE;
+ }
+
+ void addTopLevelException(WebAssemblyException *WE) {
+ assert(!WE->getParentException() && "Not a top level exception!");
+ TopLevelExceptions.push_back(WE);
+ }
+
+ void print(raw_ostream &OS, const Module *M = nullptr) const override;
+};
+
+} // end namespace llvm
+
+#endif
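
A minimal sketch of how a later machine pass might consume this analysis once it has required it via getAnalysisUsage (illustrative; the helper name `dumpCatchRegions` is hypothetical):

    #include "WebAssemblyExceptionInfo.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/Support/Debug.h"

    // Sketch: report which catch region, if any, each block of MF belongs to.
    static void dumpCatchRegions(llvm::MachineFunction &MF,
                                 const llvm::WebAssemblyExceptionInfo &WEI) {
      for (llvm::MachineBasicBlock &MBB : MF)
        if (llvm::WebAssemblyException *WE = WEI.getExceptionFor(&MBB))
          llvm::dbgs() << "bb." << MBB.getNumber() << " in catch region of depth "
                       << WE->getExceptionDepth() << '\n';
    }
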
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index e2edb924d4d2..8619cbdcb5ee 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file converts any remaining registers into WebAssembly locals.
+/// This file converts any remaining registers into WebAssembly locals.
///
/// After register stackification and register coloring, convert non-stackified
/// registers into locals, inserting explicit get_local and set_local
@@ -60,6 +60,9 @@ public:
} // end anonymous namespace
char WebAssemblyExplicitLocals::ID = 0;
+INITIALIZE_PASS(WebAssemblyExplicitLocals, DEBUG_TYPE,
+ "Convert registers to WebAssembly locals", false, false)
+
FunctionPass *llvm::createWebAssemblyExplicitLocals() {
return new WebAssemblyExplicitLocals();
}
@@ -86,6 +89,8 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) {
return WebAssembly::DROP_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::DROP_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::DROP_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -101,6 +106,8 @@ static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::GET_LOCAL_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::GET_LOCAL_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::GET_LOCAL_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -116,6 +123,8 @@ static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::SET_LOCAL_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::SET_LOCAL_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::SET_LOCAL_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -131,6 +140,8 @@ static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::TEE_LOCAL_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::TEE_LOCAL_V128;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return WebAssembly::TEE_LOCAL_EXCEPT_REF;
llvm_unreachable("Unexpected register class");
}
@@ -144,6 +155,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
return MVT::f32;
if (RC == &WebAssembly::F64RegClass)
return MVT::f64;
+ if (RC == &WebAssembly::EXCEPT_REFRegClass)
+ return MVT::ExceptRef;
llvm_unreachable("unrecognized register class");
}
@@ -168,19 +181,14 @@ static MachineInstr *FindStartOfTree(MachineOperand &MO,
}
bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Make Locals Explicit **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Make Locals Explicit **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
// Disable this pass if directed to do so.
if (DisableWebAssemblyExplicitLocals)
return false;
- // Disable this pass if we aren't doing direct wasm object emission.
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF())
- return false;
-
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
@@ -218,7 +226,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I++;
assert(!WebAssembly::isArgument(MI));
- if (MI.isDebugValue() || MI.isLabel())
+ if (MI.isDebugInstr() || MI.isLabel())
continue;
// Replace tee instructions with tee_local. The difference is that tee
@@ -271,8 +279,11 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
}
if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) {
unsigned Opc = getDropOpcode(RC);
- BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
- .addReg(NewReg);
+ MachineInstr *Drop =
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(NewReg);
+ // After the drop instruction, this reg operand will not be used
+ Drop->getOperand(0).setIsKill();
} else {
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
unsigned Opc = getSetLocalOpcode(RC);
@@ -281,6 +292,9 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
.addReg(NewReg);
}
MI.getOperand(0).setReg(NewReg);
+ // This register operand is now being used by the inserted drop
+ // instruction, so make it undead.
+ MI.getOperand(0).setIsDead(false);
MFI.stackifyVReg(NewReg);
Changed = true;
}
@@ -362,7 +376,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Assert that all registers have been stackified at this point.
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
- if (MI.isDebugValue() || MI.isLabel())
+ if (MI.isDebugInstr() || MI.isLabel())
continue;
for (const MachineOperand &MO : MI.explicit_operands()) {
assert(
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 7e284ea950fd..566ef68c027d 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly-specific support for the FastISel
+/// This file defines the WebAssembly-specific support for the FastISel
/// class. Some of the target-specific code is generated by tablegen in the file
/// WebAssemblyGenFastISel.inc, which is #included here.
///
@@ -127,6 +127,7 @@ private:
case MVT::i64:
case MVT::f32:
case MVT::f64:
+ case MVT::ExceptRef:
return VT;
case MVT::f16:
return MVT::f32;
@@ -418,7 +419,7 @@ unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
return getRegForValue(ICmp->getOperand(0));
}
- if (BinaryOperator::isNot(V)) {
+ if (BinaryOperator::isNot(V) && V->getType()->isIntegerTy(32)) {
Not = true;
return getRegForValue(BinaryOperator::getNotArgument(V));
}
@@ -681,6 +682,10 @@ bool WebAssemblyFastISel::fastLowerArguments() {
Opc = WebAssembly::ARGUMENT_v4f32;
RC = &WebAssembly::V128RegClass;
break;
+ case MVT::ExceptRef:
+ Opc = WebAssembly::ARGUMENT_EXCEPT_REF;
+ RC = &WebAssembly::EXCEPT_REFRegClass;
+ break;
default:
return false;
}
@@ -695,11 +700,23 @@ bool WebAssemblyFastISel::fastLowerArguments() {
MRI.addLiveIn(WebAssembly::ARGUMENTS);
auto *MFI = MF->getInfo<WebAssemblyFunctionInfo>();
- for (auto const &Arg : F->args())
- MFI->addParam(getLegalType(getSimpleType(Arg.getType())));
+ for (auto const &Arg : F->args()) {
+ MVT::SimpleValueType ArgTy = getLegalType(getSimpleType(Arg.getType()));
+ if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE) {
+ MFI->clearParamsAndResults();
+ return false;
+ }
+ MFI->addParam(ArgTy);
+ }
- if (!F->getReturnType()->isVoidTy())
- MFI->addResult(getLegalType(getSimpleType(F->getReturnType())));
+ if (!F->getReturnType()->isVoidTy()) {
+ MVT::SimpleValueType RetTy = getLegalType(getSimpleType(F->getReturnType()));
+ if (RetTy == MVT::INVALID_SIMPLE_VALUE_TYPE) {
+ MFI->clearParamsAndResults();
+ return false;
+ }
+ MFI->addResult(RetTy);
+ }
return true;
}
@@ -770,6 +787,11 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
IsDirect ? WebAssembly::CALL_v4f32 : WebAssembly::PCALL_INDIRECT_v4f32;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
+ case MVT::ExceptRef:
+ Opc = IsDirect ? WebAssembly::CALL_EXCEPT_REF
+ : WebAssembly::PCALL_INDIRECT_EXCEPT_REF;
+ ResultReg = createResultReg(&WebAssembly::EXCEPT_REFRegClass);
+ break;
default:
return false;
}
@@ -868,6 +890,10 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
Opc = WebAssembly::SELECT_F64;
RC = &WebAssembly::F64RegClass;
break;
+ case MVT::ExceptRef:
+ Opc = WebAssembly::SELECT_EXCEPT_REF;
+ RC = &WebAssembly::EXCEPT_REFRegClass;
+ break;
default:
return false;
}
@@ -1165,6 +1191,7 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) {
switch (getSimpleType(Store->getValueOperand()->getType())) {
case MVT::i1:
VTIsi1 = true;
+ LLVM_FALLTHROUGH;
case MVT::i8:
Opc = WebAssembly::STORE8_I32;
break;
@@ -1273,6 +1300,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
case MVT::v4f32:
Opc = WebAssembly::RETURN_v4f32;
break;
+ case MVT::ExceptRef:
+ Opc = WebAssembly::RETURN_EXCEPT_REF;
+ break;
default: return false;
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index 666337acccce..d5e47ee82513 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Fix bitcasted functions.
+/// Fix bitcasted functions.
///
/// WebAssembly requires caller and callee signatures to match, however in LLVM,
/// some amount of slop is vaguely permitted. Detect mismatch by looking for
@@ -61,6 +61,9 @@ public:
} // End anonymous namespace
char FixFunctionBitcasts::ID = 0;
+INITIALIZE_PASS(FixFunctionBitcasts, DEBUG_TYPE,
+ "Fix mismatching bitcasts for WebAssembly", false, false)
+
ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
return new FixFunctionBitcasts();
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
index 88daea7e3681..bea027be7711 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a pass that transforms irreducible control flow
+/// This file implements a pass that transforms irreducible control flow
/// into reducible control flow. Irreducible control flow means multiple-entry
/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo
/// due to being unnatural.
@@ -71,6 +71,9 @@ public:
} // end anonymous namespace
char WebAssemblyFixIrreducibleControlFlow::ID = 0;
+INITIALIZE_PASS(WebAssemblyFixIrreducibleControlFlow, DEBUG_TYPE,
+ "Removes irreducible control flow", false, false)
+
FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
return new WebAssemblyFixIrreducibleControlFlow();
}
@@ -136,7 +139,7 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
MachineBasicBlock *Header = Loop ? Loop->getHeader() : &*MF.begin();
SetVector<MachineBasicBlock *> RewriteSuccs;
- // DFS through Loop's body, looking for for irreducible control flow. Loop is
+ // DFS through Loop's body, looking for irreducible control flow. Loop is
// natural, and we stay in its body, and we treat any nested loops
// monolithically, so any cycles we encounter indicate irreducibility.
SmallPtrSet<MachineBasicBlock *, 8> OnStack;
@@ -174,7 +177,7 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
if (LLVM_LIKELY(RewriteSuccs.empty()))
return false;
- DEBUG(dbgs() << "Irreducible control flow detected!\n");
+ LLVM_DEBUG(dbgs() << "Irreducible control flow detected!\n");
// Ok. We have irreducible control flow! Create a dispatch block which will
// contains a jump table to any block in the problematic set of blocks.
@@ -205,7 +208,8 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
continue;
unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
- DEBUG(dbgs() << printMBBReference(*MBB) << " has index " << Index << "\n");
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has index " << Index
+ << "\n");
Pair.first->second = Index;
for (auto Pred : MBB->predecessors())
@@ -264,9 +268,9 @@ bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
MachineFunction &MF) {
- DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
bool Changed = false;
auto &MLI = getAnalysis<MachineLoopInfo>();
@@ -284,7 +288,7 @@ bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
// If we made any changes, completely recompute everything.
if (LLVM_UNLIKELY(Changed)) {
- DEBUG(dbgs() << "Recomputing dominators and loops.\n");
+ LLVM_DEBUG(dbgs() << "Recomputing dominators and loops.\n");
MF.getRegInfo().invalidateLiveness();
MF.RenumberBlocks();
getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 84246052f601..052c94e9d6a9 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of
+/// This file contains the WebAssembly implementation of
/// TargetFrameLowering class.
///
/// On WebAssembly, there aren't a lot of things to do here. There are no
@@ -106,29 +106,9 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF()) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterClass *PtrRC =
- MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
-
- BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero)
- .addReg(SrcReg)
- .addMemOperand(MMO);
- } else {
- BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
- .addExternalSymbol(SPSymbol)
- .addReg(SrcReg);
- }
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
+ .addExternalSymbol(SPSymbol)
+ .addReg(SrcReg);
}
MachineBasicBlock::iterator
@@ -172,25 +152,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF()) {
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
-
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOLoad, 4, 4);
- // Load the SP value.
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero) // addr
- .addMemOperand(LoadMMO);
- } else {
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
- .addExternalSymbol(SPSymbol);
- }
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
+ .addExternalSymbol(SPSymbol);
bool HasBP = hasBP(MF);
if (HasBP) {
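The frame-lowering hunks above drop the ELF-specific path that wrote __stack_pointer through memory and unconditionally use the wasm global (GET_GLOBAL_I32 / SET_GLOBAL_I32). Any function whose locals must live on the shadow stack exercises this prologue/epilogue code; a small illustrative C++ source example (assumed, not part of the patch):

// The address of buf escapes, so it must be placed on the linear-memory
// shadow stack; the prologue then reads __stack_pointer and the epilogue
// writes it back via the simplified code above.
extern void consume(int *p);

void useStack() {
  int buf[16];
  consume(buf);
}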
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index 4cc7f5ae058a..fe23e418a3f1 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This class implements WebAssembly-specific bits of
+/// This class implements WebAssembly-specific bits of
/// TargetFrameLowering class.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index 2f0f106ef5b7..c12550feabbb 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file describes the various WebAssembly ISD node types.
+/// This file describes the various WebAssembly ISD node types.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 9f40d35689a5..fdf3a30a5c0e 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines an instruction selector for the WebAssembly target.
+/// This file defines an instruction selector for the WebAssembly target.
///
//===----------------------------------------------------------------------===//
@@ -68,27 +68,21 @@ private:
} // end anonymous namespace
void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
- // Dump information about the Node being selected.
- DEBUG(errs() << "Selecting: ");
- DEBUG(Node->dump(CurDAG));
- DEBUG(errs() << "\n");
-
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
return;
}
- // Few custom selection stuff.
- EVT VT = Node->getValueType(0);
-
+ // Few custom selection stuff. If we need WebAssembly-specific selection,
+ // uncomment this block and add the corresponding case statements.
+ /*
switch (Node->getOpcode()) {
default:
break;
- // If we need WebAssembly-specific selection, it would go here.
- (void)VT;
}
+ */
// Select the default instruction.
SelectCode(Node);
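The hunk above removes the per-node debug dump and comments out the empty custom-selection switch. If a WebAssembly-specific node ever needs custom selection again, the re-enabled switch would look roughly like this sketch (the node name and selection body are illustrative, not part of the patch):

void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
  if (Node->isMachineOpcode()) {
    Node->setNodeId(-1);
    return; // Already selected.
  }
  switch (Node->getOpcode()) {
  default:
    break;
  // case WebAssemblyISD::SOME_NODE:   // hypothetical node
  //   ReplaceNode(Node, CurDAG->getMachineNode(...));
  //   return;
  }
  SelectCode(Node); // Fall back to the generated matcher.
}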
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 299009fa6674..283e703e1f6c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblyTargetLowering class.
+/// This file implements the WebAssemblyTargetLowering class.
///
//===----------------------------------------------------------------------===//
@@ -117,8 +117,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// As a special case, these operators use the type to mean the type to
// sign-extend from.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- if (!Subtarget->hasAtomics()) {
- // The Atomics feature includes signext intructions.
+ if (!Subtarget->hasSignExt()) {
for (auto T : {MVT::i8, MVT::i16, MVT::i32})
setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
}
@@ -152,6 +151,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Trap lowers to wasm unreachable
setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ // Exception handling intrinsics
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
setMaxAtomicSizeInBitsSupported(64);
}
@@ -427,6 +429,15 @@ bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
return true;
}
+EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext &C,
+ EVT VT) const {
+ if (VT.isVector())
+ return VT.changeVectorElementTypeToInteger();
+
+ return TargetLowering::getSetCCResultType(DL, C, VT);
+}
+
//===----------------------------------------------------------------------===//
// WebAssembly Lowering private implementation.
//===----------------------------------------------------------------------===//
@@ -485,6 +496,7 @@ SDValue WebAssemblyTargetLowering::LowerCall(
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ unsigned NumFixedArgs = 0;
for (unsigned i = 0; i < Outs.size(); ++i) {
const ISD::OutputArg &Out = Outs[i];
SDValue &OutVal = OutVals[i];
@@ -510,11 +522,11 @@ SDValue WebAssemblyTargetLowering::LowerCall(
/*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
OutVal = FINode;
}
+ // Count the number of fixed args *after* legalization.
+ NumFixedArgs += Out.IsFixed;
}
bool IsVarArg = CLI.IsVarArg;
- unsigned NumFixedArgs = CLI.NumFixedArgs;
-
auto PtrVT = getPointerTy(Layout);
// Analyze operands of the call, assigning locations to each operand.
@@ -738,6 +750,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
return LowerFRAMEADDR(Op, DAG);
case ISD::CopyToReg:
return LowerCopyToReg(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
@@ -870,6 +884,21 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
MachinePointerInfo(SV), 0);
}
+SDValue
+WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc DL(Op);
+ switch (IntNo) {
+ default:
+ return {}; // Don't custom lower most intrinsics.
+
+ case Intrinsic::wasm_lsda:
+ // TODO: For now, just return 0 so that we don't crash
+ return DAG.getConstant(0, DL, Op.getValueType());
+ }
+}
+
//===----------------------------------------------------------------------===//
// WebAssembly Optimization Hooks
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 7bb8e71ab974..79819493ac6a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the interfaces that WebAssembly uses to lower LLVM
+/// This file defines the interfaces that WebAssembly uses to lower LLVM
/// code into a selection DAG.
///
//===----------------------------------------------------------------------===//
@@ -64,6 +64,9 @@ class WebAssemblyTargetLowering final : public TargetLowering {
bool *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
@@ -90,6 +93,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
};
namespace WebAssembly {
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index a49172df158f..d879932b3232 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Atomic operand code-gen constructs.
+/// WebAssembly Atomic operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -17,8 +17,8 @@
//===----------------------------------------------------------------------===//
let Defs = [ARGUMENTS] in {
-def ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
-def ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
+defm ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
+defm ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
} // Defs = [ARGUMENTS]
// Select loads with no constant offset.
@@ -40,7 +40,6 @@ def : LoadPatGlobalAddr<i64, atomic_load_64, ATOMIC_LOAD_I64>;
def : LoadPatExternalSym<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatExternalSym<i64, atomic_load_64, ATOMIC_LOAD_I64>;
-
// Select loads with just a constant offset.
def : LoadPatOffsetOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatOffsetOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
@@ -56,14 +55,14 @@ def : LoadPatExternSymOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
// Extending loads. Note that there are only zero-extending atomic loads, no
// sign-extending loads.
let Defs = [ARGUMENTS] in {
-def ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
-def ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
-def ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
-def ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
-def ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
+defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
+defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
+defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
+defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
+defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
} // Defs = [ARGUMENTS]
-// Fragments for exending loads. These are different from regular loads because
+// Fragments for extending loads. These are different from regular loads because
// the SDNodes are derived from AtomicSDNode rather than LoadSDNode and
// therefore don't have the extension type field. So instead of matching that,
// we match the patterns that the type legalizer expands them to.
@@ -72,10 +71,10 @@ def ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
// i32 (zext (i8 (atomic_load_8))) gets legalized to
// i32 (and (i32 (atomic_load_8)), 255)
// These can be selected to a single zero-extending atomic load instruction.
-def zext_aload_8 : PatFrag<(ops node:$addr),
- (and (i32 (atomic_load_8 node:$addr)), 255)>;
-def zext_aload_16 : PatFrag<(ops node:$addr),
- (and (i32 (atomic_load_16 node:$addr)), 65535)>;
+def zext_aload_8_32 :
+ PatFrag<(ops node:$addr), (and (i32 (atomic_load_8 node:$addr)), 255)>;
+def zext_aload_16_32 :
+ PatFrag<(ops node:$addr), (and (i32 (atomic_load_16 node:$addr)), 65535)>;
// Unlike regular loads, extension to i64 is handled differently than i32.
// i64 (zext (i8 (atomic_load_8))) gets legalized to
// i64 (and (i64 (anyext (i32 (atomic_load_8)))), 255)
@@ -93,15 +92,15 @@ def zext_aload_32_64 :
// match bare subword loads (for 32-bit results) and anyext loads (for 64-bit
// results) and select a zext load; the next instruction will be sext_inreg
// which is selected by itself.
-def anyext_aload_8_64 :
+def sext_aload_8_64 :
PatFrag<(ops node:$addr), (anyext (i32 (atomic_load_8 node:$addr)))>;
-def anyext_aload_16_64 :
+def sext_aload_16_64 :
PatFrag<(ops node:$addr), (anyext (i32 (atomic_load_16 node:$addr)))>;
let Predicates = [HasAtomics] in {
// Select zero-extending loads with no constant offset.
-def : LoadPatNoOffset<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatNoOffset<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatNoOffset<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatNoOffset<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatNoOffset<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatNoOffset<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatNoOffset<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
@@ -109,16 +108,15 @@ def : LoadPatNoOffset<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
// Select sign-extending loads with no constant offset
def : LoadPatNoOffset<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatNoOffset<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatNoOffset<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatNoOffset<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-// 32->64 sext load gets selected as i32.atomic.load, i64.extend_s/i64
-
+def : LoadPatNoOffset<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatNoOffset<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+// 32->64 sext load gets selected as i32.atomic.load, i64.extend_s/i32
// Zero-extending loads with constant offset
-def : LoadPatImmOff<i32, zext_aload_8, regPlusImm, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_16, regPlusImm, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_8, or_is_add, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_16, or_is_add, ATOMIC_LOAD16_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_8_32, regPlusImm, ATOMIC_LOAD8_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_16_32, regPlusImm, ATOMIC_LOAD16_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_8_32, or_is_add, ATOMIC_LOAD8_U_I32>;
+def : LoadPatImmOff<i32, zext_aload_16_32, or_is_add, ATOMIC_LOAD16_U_I32>;
def : LoadPatImmOff<i64, zext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
def : LoadPatImmOff<i64, zext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
def : LoadPatImmOff<i64, zext_aload_32_64, regPlusImm, ATOMIC_LOAD32_U_I64>;
@@ -131,64 +129,62 @@ def : LoadPatImmOff<i32, atomic_load_8, regPlusImm, ATOMIC_LOAD8_U_I32>;
def : LoadPatImmOff<i32, atomic_load_16, regPlusImm, ATOMIC_LOAD16_U_I32>;
def : LoadPatImmOff<i32, atomic_load_8, or_is_add, ATOMIC_LOAD8_U_I32>;
def : LoadPatImmOff<i32, atomic_load_16, or_is_add, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i64, anyext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, anyext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
-def : LoadPatImmOff<i64, anyext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, anyext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>;
+def : LoadPatImmOff<i64, sext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>;
// No 32->64 patterns, just use i32.atomic.load and i64.extend_s/i64
-def : LoadPatGlobalAddr<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddr<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatGlobalAddr<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatGlobalAddr<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatGlobalAddr<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatGlobalAddr<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatGlobalAddr<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatGlobalAddr<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatGlobalAddr<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddr<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddr<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+def : LoadPatGlobalAddr<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatGlobalAddr<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternalSym<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternalSym<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatExternalSym<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatExternalSym<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatExternalSym<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatExternalSym<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatExternalSym<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatExternalSym<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatExternalSym<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternalSym<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternalSym<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
+def : LoadPatExternalSym<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatExternalSym<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
// Extending loads with just a constant offset
-def : LoadPatOffsetOnly<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatOffsetOnly<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatOffsetOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatOffsetOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatOffsetOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatOffsetOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatOffsetOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatOffsetOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatOffsetOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatOffsetOnly<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatOffsetOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+def : LoadPatOffsetOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatOffsetOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatGlobalAddrOffOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatGlobalAddrOffOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatGlobalAddrOffOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatGlobalAddrOffOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatGlobalAddrOffOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+def : LoadPatGlobalAddrOffOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatGlobalAddrOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternSymOffOnly<i32, zext_aload_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternSymOffOnly<i32, zext_aload_16, ATOMIC_LOAD16_U_I32>;
+def : LoadPatExternSymOffOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
+def : LoadPatExternSymOffOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
def : LoadPatExternSymOffOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatExternSymOffOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
def : LoadPatExternSymOffOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
def : LoadPatExternSymOffOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
def : LoadPatExternSymOffOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternSymOffOnly<i64, anyext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternSymOffOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
+def : LoadPatExternSymOffOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
+def : LoadPatExternSymOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
} // Predicates = [HasAtomics]
@@ -196,19 +192,466 @@ def : LoadPatExternSymOffOnly<i64, anyext_aload_16_64, ATOMIC_LOAD16_U_I64>;
// Atomic stores
//===----------------------------------------------------------------------===//
-// TODO: add atomic stores here...
+let Defs = [ARGUMENTS] in {
+defm ATOMIC_STORE_I32 : WebAssemblyStore<I32, "i32.atomic.store", 0xfe17>;
+defm ATOMIC_STORE_I64 : WebAssemblyStore<I64, "i64.atomic.store", 0xfe18>;
+} // Defs = [ARGUMENTS]
+
+// We need an 'atomic' version of store patterns because store and atomic_store
+// nodes have different operand orders:
+// store: (store $val, $ptr)
+// atomic_store: (store $ptr, $val)
+
+let Predicates = [HasAtomics] in {
+
+// Select stores with no constant offset.
+class AStorePatNoOffset<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind I32:$addr, ty:$val), (inst 0, 0, I32:$addr, ty:$val)>;
+def : AStorePatNoOffset<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatNoOffset<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+// Select stores with a constant offset.
+
+// Pattern with address + immediate offset
+class AStorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(kind (operand I32:$addr, imm:$off), ty:$val),
+ (inst 0, imm:$off, I32:$addr, ty:$val)>;
+def : AStorePatImmOff<i32, atomic_store_32, regPlusImm, ATOMIC_STORE_I32>;
+def : AStorePatImmOff<i64, atomic_store_64, regPlusImm, ATOMIC_STORE_I64>;
+def : AStorePatImmOff<i32, atomic_store_32, or_is_add, ATOMIC_STORE_I32>;
+def : AStorePatImmOff<i64, atomic_store_64, or_is_add, ATOMIC_STORE_I64>;
+
+class AStorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
+ ty:$val),
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
+def : AStorePatGlobalAddr<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatGlobalAddr<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+class AStorePatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), ty:$val),
+ (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
+def : AStorePatExternalSym<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatExternalSym<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+// Select stores with just a constant offset.
+class AStorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
+def : AStorePatOffsetOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatOffsetOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+class AStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
+def : AStorePatGlobalAddrOffOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatGlobalAddrOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+class AStorePatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind (WebAssemblywrapper texternalsym:$off), ty:$val),
+ (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
+def : AStorePatExternSymOffOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
+def : AStorePatExternSymOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
+
+} // Predicates = [HasAtomics]
+
+// Truncating stores.
+let Defs = [ARGUMENTS] in {
+defm ATOMIC_STORE8_I32 : WebAssemblyStore<I32, "i32.atomic.store8", 0xfe19>;
+defm ATOMIC_STORE16_I32 : WebAssemblyStore<I32, "i32.atomic.store16", 0xfe1a>;
+defm ATOMIC_STORE8_I64 : WebAssemblyStore<I64, "i64.atomic.store8", 0xfe1b>;
+defm ATOMIC_STORE16_I64 : WebAssemblyStore<I64, "i64.atomic.store16", 0xfe1c>;
+defm ATOMIC_STORE32_I64 : WebAssemblyStore<I64, "i64.atomic.store32", 0xfe1d>;
+} // Defs = [ARGUMENTS]
+
+// Fragments for truncating stores.
+
+// We don't have single truncating atomic store instructions. For 32-bit
+// instructions, we just need to match bare atomic stores. On the other hand,
+// truncating stores from i64 values are first truncated to i32, then stored.
+class trunc_astore_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (kind node:$addr, (i32 (trunc (i64 node:$val))))>;
+def trunc_astore_8_64 : trunc_astore_64<atomic_store_8>;
+def trunc_astore_16_64 : trunc_astore_64<atomic_store_16>;
+def trunc_astore_32_64 : trunc_astore_64<atomic_store_32>;
+
+let Predicates = [HasAtomics] in {
+
+// Truncating stores with no constant offset
+def : AStorePatNoOffset<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatNoOffset<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatNoOffset<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatNoOffset<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatNoOffset<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+// Truncating stores with a constant offset
+def : AStorePatImmOff<i32, atomic_store_8, regPlusImm, ATOMIC_STORE8_I32>;
+def : AStorePatImmOff<i32, atomic_store_16, regPlusImm, ATOMIC_STORE16_I32>;
+def : AStorePatImmOff<i64, trunc_astore_8_64, regPlusImm, ATOMIC_STORE8_I64>;
+def : AStorePatImmOff<i64, trunc_astore_16_64, regPlusImm, ATOMIC_STORE16_I64>;
+def : AStorePatImmOff<i64, trunc_astore_32_64, regPlusImm, ATOMIC_STORE32_I64>;
+def : AStorePatImmOff<i32, atomic_store_8, or_is_add, ATOMIC_STORE8_I32>;
+def : AStorePatImmOff<i32, atomic_store_16, or_is_add, ATOMIC_STORE16_I32>;
+def : AStorePatImmOff<i64, trunc_astore_8_64, or_is_add, ATOMIC_STORE8_I64>;
+def : AStorePatImmOff<i64, trunc_astore_16_64, or_is_add, ATOMIC_STORE16_I64>;
+def : AStorePatImmOff<i64, trunc_astore_32_64, or_is_add, ATOMIC_STORE32_I64>;
+
+def : AStorePatGlobalAddr<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatGlobalAddr<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatGlobalAddr<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatGlobalAddr<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatGlobalAddr<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+def : AStorePatExternalSym<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatExternalSym<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatExternalSym<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatExternalSym<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatExternalSym<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+// Truncating stores with just a constant offset
+def : AStorePatOffsetOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatOffsetOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatOffsetOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatOffsetOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatOffsetOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+def : AStorePatGlobalAddrOffOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatGlobalAddrOffOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+def : AStorePatExternSymOffOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
+def : AStorePatExternSymOffOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
+def : AStorePatExternSymOffOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
+def : AStorePatExternSymOffOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
+def : AStorePatExternSymOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+
+} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
-// Low-level exclusive operations
+// Atomic binary read-modify-writes
//===----------------------------------------------------------------------===//
-// TODO: add exclusive operations here...
+let Defs = [ARGUMENTS] in {
+
+multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
+ defm "" : I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}, $val"),
+ !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+}
+
+defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.add", 0xfe1e>;
+defm ATOMIC_RMW_ADD_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.add", 0xfe1f>;
+defm ATOMIC_RMW8_U_ADD_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.add", 0xfe20>;
+defm ATOMIC_RMW16_U_ADD_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.add", 0xfe21>;
+defm ATOMIC_RMW8_U_ADD_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.add", 0xfe22>;
+defm ATOMIC_RMW16_U_ADD_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.add", 0xfe23>;
+defm ATOMIC_RMW32_U_ADD_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.add", 0xfe24>;
+
+defm ATOMIC_RMW_SUB_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.sub", 0xfe25>;
+defm ATOMIC_RMW_SUB_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.sub", 0xfe26>;
+defm ATOMIC_RMW8_U_SUB_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.sub", 0xfe27>;
+defm ATOMIC_RMW16_U_SUB_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.sub", 0xfe28>;
+defm ATOMIC_RMW8_U_SUB_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.sub", 0xfe29>;
+defm ATOMIC_RMW16_U_SUB_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.sub", 0xfe2a>;
+defm ATOMIC_RMW32_U_SUB_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.sub", 0xfe2b>;
+
+defm ATOMIC_RMW_AND_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.and", 0xfe2c>;
+defm ATOMIC_RMW_AND_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.and", 0xfe2d>;
+defm ATOMIC_RMW8_U_AND_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.and", 0xfe2e>;
+defm ATOMIC_RMW16_U_AND_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.and", 0xfe2f>;
+defm ATOMIC_RMW8_U_AND_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.and", 0xfe30>;
+defm ATOMIC_RMW16_U_AND_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.and", 0xfe31>;
+defm ATOMIC_RMW32_U_AND_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.and", 0xfe32>;
+
+defm ATOMIC_RMW_OR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.or", 0xfe33>;
+defm ATOMIC_RMW_OR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.or", 0xfe34>;
+defm ATOMIC_RMW8_U_OR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.or", 0xfe35>;
+defm ATOMIC_RMW16_U_OR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.or", 0xfe36>;
+defm ATOMIC_RMW8_U_OR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.or", 0xfe37>;
+defm ATOMIC_RMW16_U_OR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.or", 0xfe38>;
+defm ATOMIC_RMW32_U_OR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.or", 0xfe39>;
+
+defm ATOMIC_RMW_XOR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.xor", 0xfe3a>;
+defm ATOMIC_RMW_XOR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.xor", 0xfe3b>;
+defm ATOMIC_RMW8_U_XOR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.xor", 0xfe3c>;
+defm ATOMIC_RMW16_U_XOR_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.xor", 0xfe3d>;
+defm ATOMIC_RMW8_U_XOR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.xor", 0xfe3e>;
+defm ATOMIC_RMW16_U_XOR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xor", 0xfe3f>;
+defm ATOMIC_RMW32_U_XOR_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xor", 0xfe40>;
+
+defm ATOMIC_RMW_XCHG_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw.xchg", 0xfe41>;
+defm ATOMIC_RMW_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw.xchg", 0xfe42>;
+defm ATOMIC_RMW8_U_XCHG_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8_u.xchg", 0xfe43>;
+defm ATOMIC_RMW16_U_XCHG_I32 :
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16_u.xchg", 0xfe44>;
+defm ATOMIC_RMW8_U_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8_u.xchg", 0xfe45>;
+defm ATOMIC_RMW16_U_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xchg", 0xfe46>;
+defm ATOMIC_RMW32_U_XCHG_I64 :
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xchg", 0xfe47>;
+}
+
+// Select binary RMWs with no constant offset.
+class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind I32:$addr, ty:$val)), (inst 0, 0, I32:$addr, ty:$val)>;
+
+// Select binary RMWs with a constant offset.
+
+// Pattern with address + immediate offset
+class BinRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)),
+ (inst 0, imm:$off, I32:$addr, ty:$val)>;
+
+class BinRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
+ ty:$val)),
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
+
+class BinRMWPatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)),
+ ty:$val)),
+ (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
+
+// Select binary RMWs with just a constant offset.
+class BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind imm:$off, ty:$val)),
+ (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
+
+class BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
-// Load-exclusives.
+class BinRMWPatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper texternalsym:$off), ty:$val)),
+ (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
-// Store-exclusives.
+// Patterns for various addressing modes.
+multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
+ NI inst_64> {
+ def : BinRMWPatNoOffset<i32, rmw_32, inst_32>;
+ def : BinRMWPatNoOffset<i64, rmw_64, inst_64>;
-// Store-release-exclusives.
+ def : BinRMWPatImmOff<i32, rmw_32, regPlusImm, inst_32>;
+ def : BinRMWPatImmOff<i64, rmw_64, regPlusImm, inst_64>;
+ def : BinRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
+ def : BinRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
-// And clear exclusive.
+ def : BinRMWPatGlobalAddr<i32, rmw_32, inst_32>;
+ def : BinRMWPatGlobalAddr<i64, rmw_64, inst_64>;
+ def : BinRMWPatExternalSym<i32, rmw_32, inst_32>;
+ def : BinRMWPatExternalSym<i64, rmw_64, inst_64>;
+
+ def : BinRMWPatOffsetOnly<i32, rmw_32, inst_32>;
+ def : BinRMWPatOffsetOnly<i64, rmw_64, inst_64>;
+
+ def : BinRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
+
+ def : BinRMWPatExternSymOffOnly<i32, rmw_32, inst_32>;
+ def : BinRMWPatExternSymOffOnly<i64, rmw_64, inst_64>;
+}
+
+let Predicates = [HasAtomics] in {
+defm : BinRMWPattern<atomic_load_add_32, atomic_load_add_64, ATOMIC_RMW_ADD_I32,
+ ATOMIC_RMW_ADD_I64>;
+defm : BinRMWPattern<atomic_load_sub_32, atomic_load_sub_64, ATOMIC_RMW_SUB_I32,
+ ATOMIC_RMW_SUB_I64>;
+defm : BinRMWPattern<atomic_load_and_32, atomic_load_and_64, ATOMIC_RMW_AND_I32,
+ ATOMIC_RMW_AND_I64>;
+defm : BinRMWPattern<atomic_load_or_32, atomic_load_or_64, ATOMIC_RMW_OR_I32,
+ ATOMIC_RMW_OR_I64>;
+defm : BinRMWPattern<atomic_load_xor_32, atomic_load_xor_64, ATOMIC_RMW_XOR_I32,
+ ATOMIC_RMW_XOR_I64>;
+defm : BinRMWPattern<atomic_swap_32, atomic_swap_64, ATOMIC_RMW_XCHG_I32,
+ ATOMIC_RMW_XCHG_I64>;
+} // Predicates = [HasAtomics]
+
+// Truncating & zero-extending binary RMW patterns.
+// These are combined patterns of truncating store patterns and zero-extending
+// load patterns above.
+class zext_bin_rmw_8_32<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i32 (kind node:$addr, node:$val)), 255)>;
+class zext_bin_rmw_16_32<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i32 (kind node:$addr, node:$val)), 65535)>;
+class zext_bin_rmw_8_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i64 (anyext (i32 (kind node:$addr,
+ (i32 (trunc (i64 node:$val))))))), 255)>;
+class zext_bin_rmw_16_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (and (i64 (anyext (i32 (kind node:$addr,
+ (i32 (trunc (i64 node:$val))))))), 65535)>;
+class zext_bin_rmw_32_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (zext (i32 (kind node:$addr, (i32 (trunc (i64 node:$val))))))>;
+
+// Truncating & sign-extending binary RMW patterns.
+// These are combined patterns of truncating store patterns and sign-extending
+// load patterns above. We match subword RMWs (for 32-bit) and anyext RMWs (for
+// 64-bit) and select a zext RMW; the next instruction will be sext_inreg which
+// is selected by itself.
+class sext_bin_rmw_8_32<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val), (kind node:$addr, node:$val)>;
+class sext_bin_rmw_16_32<PatFrag kind> : sext_bin_rmw_8_32<kind>;
+class sext_bin_rmw_8_64<PatFrag kind> :
+ PatFrag<(ops node:$addr, node:$val),
+ (anyext (i32 (kind node:$addr, (i32 (trunc (i64 node:$val))))))>;
+class sext_bin_rmw_16_64<PatFrag kind> : sext_bin_rmw_8_64<kind>;
+// 32->64 sext RMW gets selected as i32.atomic.rmw.***, i64.extend_s/i32
+
+// Patterns for various addressing modes for truncating-extending binary RMWs.
+multiclass BinRMWTruncExtPattern<
+ PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32, PatFrag rmw_64,
+ NI inst8_32, NI inst16_32, NI inst8_64, NI inst16_64, NI inst32_64> {
+ // Truncating-extending binary RMWs with no constant offset
+ def : BinRMWPatNoOffset<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatNoOffset<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatNoOffset<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatNoOffset<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatNoOffset<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatNoOffset<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatNoOffset<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatNoOffset<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatNoOffset<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ // Truncating-extending binary RMWs with a constant offset
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, regPlusImm, inst32_64>;
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ def : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+ def : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, or_is_add, inst32_64>;
+
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ def : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+
+ def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ def : BinRMWPatExternalSym<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternalSym<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternalSym<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternalSym<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatExternalSym<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatExternalSym<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternalSym<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternalSym<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternalSym<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ // Truncating-extending binary RMWs with just a constant offset
+ def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatOffsetOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatOffsetOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatOffsetOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatOffsetOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ def : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ def : BinRMWPatExternSymOffOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternSymOffOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ def : BinRMWPatExternSymOffOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ def : BinRMWPatExternSymOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ def : BinRMWPatExternSymOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ def : BinRMWPatExternSymOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+}
+
+let Predicates = [HasAtomics] in {
+defm : BinRMWTruncExtPattern<
+ atomic_load_add_8, atomic_load_add_16, atomic_load_add_32, atomic_load_add_64,
+ ATOMIC_RMW8_U_ADD_I32, ATOMIC_RMW16_U_ADD_I32,
+ ATOMIC_RMW8_U_ADD_I64, ATOMIC_RMW16_U_ADD_I64, ATOMIC_RMW32_U_ADD_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_sub_8, atomic_load_sub_16, atomic_load_sub_32, atomic_load_sub_64,
+ ATOMIC_RMW8_U_SUB_I32, ATOMIC_RMW16_U_SUB_I32,
+ ATOMIC_RMW8_U_SUB_I64, ATOMIC_RMW16_U_SUB_I64, ATOMIC_RMW32_U_SUB_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_and_8, atomic_load_and_16, atomic_load_and_32, atomic_load_and_64,
+ ATOMIC_RMW8_U_AND_I32, ATOMIC_RMW16_U_AND_I32,
+ ATOMIC_RMW8_U_AND_I64, ATOMIC_RMW16_U_AND_I64, ATOMIC_RMW32_U_AND_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_or_8, atomic_load_or_16, atomic_load_or_32, atomic_load_or_64,
+ ATOMIC_RMW8_U_OR_I32, ATOMIC_RMW16_U_OR_I32,
+ ATOMIC_RMW8_U_OR_I64, ATOMIC_RMW16_U_OR_I64, ATOMIC_RMW32_U_OR_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_load_xor_8, atomic_load_xor_16, atomic_load_xor_32, atomic_load_xor_64,
+ ATOMIC_RMW8_U_XOR_I32, ATOMIC_RMW16_U_XOR_I32,
+ ATOMIC_RMW8_U_XOR_I64, ATOMIC_RMW16_U_XOR_I64, ATOMIC_RMW32_U_XOR_I64>;
+defm : BinRMWTruncExtPattern<
+ atomic_swap_8, atomic_swap_16, atomic_swap_32, atomic_swap_64,
+ ATOMIC_RMW8_U_XCHG_I32, ATOMIC_RMW16_U_XCHG_I32,
+ ATOMIC_RMW8_U_XCHG_I64, ATOMIC_RMW16_U_XCHG_I64, ATOMIC_RMW32_U_XCHG_I64>;
+} // Predicates = [HasAtomics]
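For orientation, a small C++ example of the kind of source whose lowering exercises the new atomic RMW patterns above; the example and its mapping notes are assumed illustrations, not part of the patch:

#include <atomic>
#include <cstdint>

uint8_t add8(std::atomic<uint8_t> &A) {
  // Narrow RMW: matched by the zext/sext_bin_rmw_8_32 fragments and
  // selected as i32.atomic.rmw8_u.add.
  return A.fetch_add(1);
}

uint64_t xchg64(std::atomic<uint64_t> &A, uint64_t V) {
  // Full-width RMW: selected via BinRMWPattern as i64.atomic.rmw.xchg.
  return A.exchange(V);
}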
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 6b45839c14b0..34262752430c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -8,89 +8,111 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Call operand code-gen constructs.
+/// WebAssembly Call operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
// TODO: addr64: These currently assume the callee address is 32-bit.
+// FIXME: add $type to first call_indirect asmstr (and maybe $flags)
let Defs = [ARGUMENTS] in {
// Call sequence markers. These have an immediate which represents the amount of
// stack space to allocate or free, which is used for varargs lowering.
let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
- [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
-def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
- [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
+defm ADJCALLSTACKDOWN : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
+defm ADJCALLSTACKUP : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
} // isCodeGenOnly = 1
multiclass CALL<WebAssemblyRegClass vt, string prefix> {
- def CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops),
- [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
- !strconcat(prefix, "call\t$dst, $callee"),
- 0x10>;
+ defm CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee),
+ [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
+ !strconcat(prefix, "call\t$dst, $callee"),
+ !strconcat(prefix, "call\t$callee"),
+ 0x10>;
let isCodeGenOnly = 1 in {
- def PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
- [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
- "PSEUDO CALL INDIRECT\t$callee">;
+ defm PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_#vt : I<(outs vt:$dst),
- (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- [],
- !strconcat(prefix, "call_indirect\t$dst"),
- 0x11>;
+ defm CALL_INDIRECT_#vt : I<(outs vt:$dst),
+ (ins TypeIndex:$type, i32imm:$flags, variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
+ [],
+ !strconcat(prefix, "call_indirect\t$dst"),
+ !strconcat(prefix, "call_indirect\t$type"),
+ 0x11>;
}
multiclass SIMD_CALL<ValueType vt, string prefix> {
- def CALL_#vt : SIMD_I<(outs V128:$dst), (ins function32_op:$callee, variable_ops),
+ defm CALL_#vt : SIMD_I<(outs V128:$dst), (ins function32_op:$callee,
+ variable_ops),
+ (outs), (ins function32_op:$callee),
[(set (vt V128:$dst),
- (WebAssemblycall1 (i32 imm:$callee)))],
+ (WebAssemblycall1 (i32 imm:$callee)))],
!strconcat(prefix, "call\t$dst, $callee"),
+ !strconcat(prefix, "call\t$callee"),
0x10>;
let isCodeGenOnly = 1 in {
- def PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
- (ins I32:$callee, variable_ops),
- [(set (vt V128:$dst),
- (WebAssemblycall1 I32:$callee))],
- "PSEUDO CALL INDIRECT\t$callee">;
+ defm PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+ (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(set (vt V128:$dst),
+ (WebAssemblycall1 I32:$callee))],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+ defm CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
(ins TypeIndex:$type, i32imm:$flags,
- variable_ops),
+ variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
[],
- !strconcat(prefix, "call_indirect\t$dst"),
+ !strconcat(prefix,
+ "call_indirect\t$dst"),
+ !strconcat(prefix, "call_indirect\t$type"),
0x11>;
}
let Uses = [SP32, SP64], isCall = 1 in {
- defm : CALL<I32, "i32.">;
- defm : CALL<I64, "i64.">;
- defm : CALL<F32, "f32.">;
- defm : CALL<F64, "f64.">;
- defm : SIMD_CALL<v16i8, "i8x16.">;
- defm : SIMD_CALL<v8i16, "i16x8.">;
- defm : SIMD_CALL<v4i32, "i32x4.">;
- defm : SIMD_CALL<v4f32, "f32x4.">;
-
- def CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
- [(WebAssemblycall0 (i32 imm:$callee))],
- "call \t$callee", 0x10>;
+ defm "" : CALL<I32, "i32.">;
+ defm "" : CALL<I64, "i64.">;
+ defm "" : CALL<F32, "f32.">;
+ defm "" : CALL<F64, "f64.">;
+ defm "" : CALL<EXCEPT_REF, "except_ref.">;
+ defm "" : SIMD_CALL<v16i8, "i8x16.">;
+ defm "" : SIMD_CALL<v8i16, "i16x8.">;
+ defm "" : SIMD_CALL<v4i32, "i32x4.">;
+ defm "" : SIMD_CALL<v4f32, "f32x4.">;
+
+ defm CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee),
+ [(WebAssemblycall0 (i32 imm:$callee))],
+ "call \t$callee", "call\t$callee", 0x10>;
let isCodeGenOnly = 1 in {
- def PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
- [(WebAssemblycall0 I32:$callee)],
- "PSEUDO CALL INDIRECT\t$callee">;
+ defm PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(WebAssemblycall0 I32:$callee)],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_VOID : I<(outs),
- (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- [],
- "call_indirect\t", 0x11>;
+ defm CALL_INDIRECT_VOID : I<(outs),
+ (ins TypeIndex:$type, i32imm:$flags,
+ variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
+ [],
+ "call_indirect\t", "call_indirect\t$type",
+ 0x11>;
} // Uses = [SP32,SP64], isCall = 1
} // Defs = [ARGUMENTS]
@@ -112,6 +134,9 @@ def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
(CALL_v4i32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
(CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(ExceptRef
+ (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_EXCEPT_REF tglobaladdr:$callee)>;
def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
(CALL_VOID tglobaladdr:$callee)>;
@@ -132,5 +157,8 @@ def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
(CALL_v4i32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
(CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(ExceptRef
+ (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_EXCEPT_REF texternalsym:$callee)>;
def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
(CALL_VOID texternalsym:$callee)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 129794171464..d90244b90662 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly control-flow code-gen constructs.
+/// WebAssembly control-flow code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -16,15 +16,17 @@ let Defs = [ARGUMENTS] in {
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
// The condition operand is a boolean value which WebAssembly represents as i32.
-def BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
- [(brcond I32:$cond, bb:$dst)],
- "br_if \t$dst, $cond", 0x0d>;
+defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
+ (outs), (ins bb_op:$dst),
+ [(brcond I32:$cond, bb:$dst)],
+ "br_if \t$dst, $cond", "br_if \t$dst", 0x0d>;
let isCodeGenOnly = 1 in
-def BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond), []>;
+defm BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond),
+ (outs), (ins bb_op:$dst), []>;
let isBarrier = 1 in {
-def BR : I<(outs), (ins bb_op:$dst),
- [(br bb:$dst)],
- "br \t$dst", 0x0c>;
+defm BR : NRI<(outs), (ins bb_op:$dst),
+ [(br bb:$dst)],
+ "br \t$dst", 0x0c>;
} // isBarrier = 1
} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
@@ -42,92 +44,151 @@ let Defs = [ARGUMENTS] in {
// currently.
// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates.
// Set TSFlags{1} to 1 to indicate that the immediates represent labels.
+// FIXME: this can't inherit from I<> since there is no way to inherit from a
+// multiclass and still have the let statements.
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-def BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
- [(WebAssemblybr_table I32:$index)],
- "br_table \t$index", 0x0e> {
+def BR_TABLE_I32 : NI<(outs), (ins I32:$index, variable_ops),
+ [(WebAssemblybr_table I32:$index)], 0,
+ "br_table \t$index", 0x0e> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
-def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
- [(WebAssemblybr_table I64:$index)],
- "br_table \t$index"> {
+def BR_TABLE_I32_S : NI<(outs), (ins I32:$index),
+ [], 1,
+ "br_table \t$index", 0x0e> {
+ let TSFlags{0} = 1;
+ let TSFlags{1} = 1;
+}
+def BR_TABLE_I64 : NI<(outs), (ins I64:$index, variable_ops),
+ [(WebAssemblybr_table I64:$index)], 0,
+ "br_table \t$index"> {
+ let TSFlags{0} = 1;
+ let TSFlags{1} = 1;
+}
+def BR_TABLE_I64_S : NI<(outs), (ins I64:$index),
+ [], 1,
+ "br_table \t$index"> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
-// Placemarkers to indicate the start or end of a block, loop, or try scope.
+// This is technically a control-flow instruction, since all it affects is the
+// IP.
+defm NOP : NRI<(outs), (ins), [], "nop", 0x01>;
+
+// Placemarkers to indicate the start or end of a block or loop scope.
// These use/clobber VALUE_STACK to prevent them from being moved into the
// middle of an expression tree.
let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
-def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>;
-def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
-def TRY : I<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
-
-// END_BLOCK, END_LOOP, END_TRY, and END_FUNCTION are represented with the same
-// opcode in wasm.
-def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>;
-def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>;
-def END_TRY : I<(outs), (ins), [], "end_try", 0x0b>;
+defm BLOCK : NRI<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>;
+defm LOOP : NRI<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
+
+// END_BLOCK, END_LOOP, and END_FUNCTION are represented with the same opcode in
+// wasm.
+defm END_BLOCK : NRI<(outs), (ins), [], "end_block", 0x0b>;
+defm END_LOOP : NRI<(outs), (ins), [], "end_loop", 0x0b>;
let isTerminator = 1, isBarrier = 1 in
-def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>;
+defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
multiclass RETURN<WebAssemblyRegClass vt> {
- def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)],
- "return \t$val", 0x0f>;
+ defm RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins),
+ [(WebAssemblyreturn vt:$val)],
+ "return \t$val", "return", 0x0f>;
// Equivalent to RETURN_#vt, for use at the end of a function when wasm
// semantics return by falling off the end of the block.
let isCodeGenOnly = 1 in
- def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>;
+ defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins), []>;
}
multiclass SIMD_RETURN<ValueType vt> {
- def RETURN_#vt : SIMD_I<(outs), (ins V128:$val),
- [(WebAssemblyreturn (vt V128:$val))],
- "return \t$val", 0x0f>;
+ defm RETURN_#vt : SIMD_I<(outs), (ins V128:$val), (outs), (ins),
+ [(WebAssemblyreturn (vt V128:$val))],
+ "return \t$val", "return", 0x0f>;
// Equivalent to RETURN_#vt, for use at the end of a function when wasm
// semantics return by falling off the end of the block.
let isCodeGenOnly = 1 in
- def FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), []>;
+ defm FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), (outs), (ins),
+ []>;
}
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
let isReturn = 1 in {
- defm : RETURN<I32>;
- defm : RETURN<I64>;
- defm : RETURN<F32>;
- defm : RETURN<F64>;
- defm : SIMD_RETURN<v16i8>;
- defm : SIMD_RETURN<v8i16>;
- defm : SIMD_RETURN<v4i32>;
- defm : SIMD_RETURN<v4f32>;
-
- def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>;
+ defm "": RETURN<I32>;
+ defm "": RETURN<I64>;
+ defm "": RETURN<F32>;
+ defm "": RETURN<F64>;
+ defm "": RETURN<EXCEPT_REF>;
+ defm "": SIMD_RETURN<v16i8>;
+ defm "": SIMD_RETURN<v8i16>;
+ defm "": SIMD_RETURN<v4i32>;
+ defm "": SIMD_RETURN<v4f32>;
+
+ defm RETURN_VOID : NRI<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>;
// This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
let isCodeGenOnly = 1 in
- def FALLTHROUGH_RETURN_VOID : I<(outs), (ins), []>;
+ defm FALLTHROUGH_RETURN_VOID : NRI<(outs), (ins), []>;
} // isReturn = 1
-def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>;
+defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>;
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+//===----------------------------------------------------------------------===//
+// Exception handling instructions
+//===----------------------------------------------------------------------===//
-def THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$obj),
- [(int_wasm_throw imm:$tag, I32:$obj)], "throw \t$tag, $obj",
- 0x08>;
-def THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$obj),
- [(int_wasm_throw imm:$tag, I64:$obj)], "throw \t$tag, $obj",
- 0x08>;
-def RETHROW : I<(outs), (ins i32imm:$rel_depth), [], "rethrow \t$rel_depth",
- 0x09>;
+let Predicates = [HasExceptionHandling] in {
+// Throwing an exception: throw / rethrow
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+defm THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$val),
+ (outs), (ins i32imm:$tag),
+ [(int_wasm_throw imm:$tag, I32:$val)],
+ "throw \t$tag, $val", "throw \t$tag",
+ 0x08>;
+defm THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$val),
+ (outs), (ins i32imm:$tag),
+ [(int_wasm_throw imm:$tag, I64:$val)],
+ "throw \t$tag, $val", "throw \t$tag",
+ 0x08>;
+defm RETHROW : NRI<(outs), (ins bb_op:$dst), [], "rethrow \t$dst", 0x09>;
+let isCodeGenOnly = 1 in
+// This is used when the destination for rethrow is the caller function. This
+// will be converted to a rethrow in CFGStackify.
+defm RETHROW_TO_CALLER : NRI<(outs), (ins), [], "rethrow">;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
-} // Defs = [ARGUMENTS]
+// Region within which an exception is caught: try / end_try
+let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
+defm TRY : NRI<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
+defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>;
+} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
-// rethrow takes a relative depth as an argument, for which currently only 0 is
-// possible for C++. Once other languages need depths other than 0, depths will
-// be computed in CFGStackify.
-def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
+// Catching an exception: catch / catch_all
+let hasCtrlDep = 1 in {
+defm CATCH_I32 : I<(outs I32:$dst), (ins i32imm:$tag),
+ (outs), (ins i32imm:$tag),
+ [(set I32:$dst, (int_wasm_catch imm:$tag))],
+ "i32.catch \t$dst, $tag", "i32.catch \t$tag", 0x07>;
+defm CATCH_I64 : I<(outs I64:$dst), (ins i32imm:$tag),
+ (outs), (ins i32imm:$tag),
+ [(set I64:$dst, (int_wasm_catch imm:$tag))],
+ "i64.catch \t$dst, $tag", "i64.catch \t$tag", 0x07>;
+defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x05>;
+}
+
+// Pseudo instructions: cleanupret / catchret
+// They are not return instructions in wasm, but setting 'isReturn' to true as
+// in X86 is necessary for computing EH scope membership.
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+ isCodeGenOnly = 1, isReturn = 1 in {
+ defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "", 0>;
+ defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from),
+ [(catchret bb:$dst, bb:$from)], "", 0>;
+}
+}
+
+} // Defs = [ARGUMENTS]
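Each defm above is instantiated through the I<> multiclass added in WebAssemblyInstrFormats.td below, yielding a register-based def and a stack-based _S def. As a rough sketch of the expansion (ignoring the surrounding let blocks), CATCH_I32 becomes:

  def CATCH_I32   : NI<(outs I32:$dst), (ins i32imm:$tag),
                       [(set I32:$dst, (int_wasm_catch imm:$tag))], 0,
                       "i32.catch \t$dst, $tag", 0x07>;
  def CATCH_I32_S : NI<(outs), (ins i32imm:$tag), [], 1,
                       "i32.catch \t$tag", 0x07>;

Note how the stack form drops the register result operand and carries no selection pattern.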
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index 426c2c802172..c89c1b549816 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -8,41 +8,48 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly datatype conversions, truncations, reinterpretations,
+/// WebAssembly datatype conversions, truncations, reinterpretations,
/// promotions, and demotions operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
let Defs = [ARGUMENTS] in {
-def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src),
+defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
[(set I32:$dst, (trunc I64:$src))],
- "i32.wrap/i64\t$dst, $src", 0xa7>;
+ "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>;
-def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src),
+defm I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src), (outs), (ins),
[(set I64:$dst, (sext I32:$src))],
- "i64.extend_s/i32\t$dst, $src", 0xac>;
-def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src),
- [(set I64:$dst, (zext I32:$src))],
- "i64.extend_u/i32\t$dst, $src", 0xad>;
+ "i64.extend_s/i32\t$dst, $src", "i64.extend_s/i32",
+ 0xac>;
+defm I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src), (outs), (ins),
+ [(set I64:$dst, (zext I32:$src))],
+ "i64.extend_u/i32\t$dst, $src", "i64.extend_u/i32",
+ 0xad>;
-let Predicates = [HasAtomics] in {
-def I32_EXTEND8_S_I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (sext_inreg I32:$src, i8))],
- "i32.extend8_s\t$dst, $src", 0xc0>;
-def I32_EXTEND16_S_I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (sext_inreg I32:$src, i16))],
- "i32.extend16_s\t$dst, $src", 0xc1>;
-def I64_EXTEND8_S_I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (sext_inreg I64:$src, i8))],
- "i64.extend8_s\t$dst, $src", 0xc2>;
-def I64_EXTEND16_S_I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (sext_inreg I64:$src, i16))],
- "i64.extend16_s\t$dst, $src", 0xc3>;
-def I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (sext_inreg I64:$src, i32))],
- "i64.extend32_s\t$dst, $src", 0xc4>;
-} // Predicates = [HasAtomics]
+let Predicates = [HasSignExt] in {
+defm I32_EXTEND8_S_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (sext_inreg I32:$src, i8))],
+ "i32.extend8_s\t$dst, $src", "i32.extend8_s",
+ 0xc0>;
+defm I32_EXTEND16_S_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (sext_inreg I32:$src, i16))],
+ "i32.extend16_s\t$dst, $src", "i32.extend16_s",
+ 0xc1>;
+defm I64_EXTEND8_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (sext_inreg I64:$src, i8))],
+ "i64.extend8_s\t$dst, $src", "i64.extend8_s",
+ 0xc2>;
+defm I64_EXTEND16_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (sext_inreg I64:$src, i16))],
+ "i64.extend16_s\t$dst, $src", "i64.extend16_s",
+ 0xc3>;
+defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (sext_inreg I64:$src, i32))],
+ "i64.extend32_s\t$dst, $src", "i64.extend32_s",
+ 0xc4>;
+} // Predicates = [HasSignExt]
} // defs = [ARGUMENTS]
@@ -55,131 +62,161 @@ let Defs = [ARGUMENTS] in {
// Conversion from floating point to integer instructions which don't trap on
// overflow or invalid.
-def I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_sint F32:$src))],
- "i32.trunc_s:sat/f32\t$dst, $src", 0xfc00>,
- Requires<[HasNontrappingFPToInt]>;
-def I32_TRUNC_U_SAT_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_uint F32:$src))],
- "i32.trunc_u:sat/f32\t$dst, $src", 0xfc01>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_S_SAT_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_sint F32:$src))],
- "i64.trunc_s:sat/f32\t$dst, $src", 0xfc04>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_U_SAT_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_uint F32:$src))],
- "i64.trunc_u:sat/f32\t$dst, $src", 0xfc05>,
- Requires<[HasNontrappingFPToInt]>;
-def I32_TRUNC_S_SAT_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_sint F64:$src))],
- "i32.trunc_s:sat/f64\t$dst, $src", 0xfc02>,
- Requires<[HasNontrappingFPToInt]>;
-def I32_TRUNC_U_SAT_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_uint F64:$src))],
- "i32.trunc_u:sat/f64\t$dst, $src", 0xfc03>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_S_SAT_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_sint F64:$src))],
- "i64.trunc_s:sat/f64\t$dst, $src", 0xfc06>,
- Requires<[HasNontrappingFPToInt]>;
-def I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_uint F64:$src))],
- "i64.trunc_u:sat/f64\t$dst, $src", 0xfc07>,
- Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F32:$src))],
+ "i32.trunc_s:sat/f32\t$dst, $src",
+ "i32.trunc_s:sat/f32", 0xfc00>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_U_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F32:$src))],
+ "i32.trunc_u:sat/f32\t$dst, $src",
+ "i32.trunc_u:sat/f32", 0xfc01>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_S_SAT_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F32:$src))],
+ "i64.trunc_s:sat/f32\t$dst, $src",
+ "i64.trunc_s:sat/f32", 0xfc04>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_U_SAT_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F32:$src))],
+ "i64.trunc_u:sat/f32\t$dst, $src",
+ "i64.trunc_u:sat/f32", 0xfc05>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_S_SAT_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F64:$src))],
+ "i32.trunc_s:sat/f64\t$dst, $src",
+ "i32.trunc_s:sat/f64", 0xfc02>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I32_TRUNC_U_SAT_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F64:$src))],
+ "i32.trunc_u:sat/f64\t$dst, $src",
+ "i32.trunc_u:sat/f64", 0xfc03>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_S_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F64:$src))],
+ "i64.trunc_s:sat/f64\t$dst, $src",
+ "i64.trunc_s:sat/f64", 0xfc06>,
+ Requires<[HasNontrappingFPToInt]>;
+defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F64:$src))],
+ "i64.trunc_u:sat/f64\t$dst, $src",
+ "i64.trunc_u:sat/f64", 0xfc07>,
+ Requires<[HasNontrappingFPToInt]>;
// Conversion from floating point to integer pseudo-instructions which don't
// trap on overflow or invalid.
let usesCustomInserter = 1, isCodeGenOnly = 1 in {
-def FP_TO_SINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_sint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (fp_to_uint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_SINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_sint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src),
- [(set I64:$dst, (fp_to_uint F32:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_SINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_sint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src),
- [(set I32:$dst, (fp_to_uint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_SINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_sint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
-def FP_TO_UINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (fp_to_uint F64:$src))], "", 0>,
- Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I32_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I64_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F32:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_sint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I32_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [(set I32:$dst, (fp_to_uint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_SINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_sint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
+defm FP_TO_UINT_I64_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (fp_to_uint F64:$src))], "", "", 0>,
+ Requires<[NotHasNontrappingFPToInt]>;
} // usesCustomInserter, isCodeGenOnly = 1
// Conversion from floating point to integer traps on overflow and invalid.
let hasSideEffects = 1 in {
-def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src),
- [], "i32.trunc_s/f32\t$dst, $src", 0xa8>;
-def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src),
- [], "i32.trunc_u/f32\t$dst, $src", 0xa9>;
-def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src),
- [], "i64.trunc_s/f32\t$dst, $src", 0xae>;
-def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src),
- [], "i64.trunc_u/f32\t$dst, $src", 0xaf>;
-def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src),
- [], "i32.trunc_s/f64\t$dst, $src", 0xaa>;
-def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src),
- [], "i32.trunc_u/f64\t$dst, $src", 0xab>;
-def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src),
- [], "i64.trunc_s/f64\t$dst, $src", 0xb0>;
-def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src),
- [], "i64.trunc_u/f64\t$dst, $src", 0xb1>;
+defm I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [], "i32.trunc_s/f32\t$dst, $src", "i32.trunc_s/f32",
+ 0xa8>;
+defm I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [], "i32.trunc_u/f32\t$dst, $src", "i32.trunc_u/f32",
+ 0xa9>;
+defm I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [], "i64.trunc_s/f32\t$dst, $src", "i64.trunc_s/f32",
+ 0xae>;
+defm I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src), (outs), (ins),
+ [], "i64.trunc_u/f32\t$dst, $src", "i64.trunc_u/f32",
+ 0xaf>;
+defm I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [], "i32.trunc_s/f64\t$dst, $src", "i32.trunc_s/f64",
+ 0xaa>;
+defm I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src), (outs), (ins),
+ [], "i32.trunc_u/f64\t$dst, $src", "i32.trunc_u/f64",
+ 0xab>;
+defm I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [], "i64.trunc_s/f64\t$dst, $src", "i64.trunc_s/f64",
+ 0xb0>;
+defm I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [], "i64.trunc_u/f64\t$dst, $src", "i64.trunc_u/f64",
+ 0xb1>;
} // hasSideEffects = 1
-def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src),
- [(set F32:$dst, (sint_to_fp I32:$src))],
- "f32.convert_s/i32\t$dst, $src", 0xb2>;
-def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src),
- [(set F32:$dst, (uint_to_fp I32:$src))],
- "f32.convert_u/i32\t$dst, $src", 0xb3>;
-def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src),
- [(set F64:$dst, (sint_to_fp I32:$src))],
- "f64.convert_s/i32\t$dst, $src", 0xb7>;
-def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src),
- [(set F64:$dst, (uint_to_fp I32:$src))],
- "f64.convert_u/i32\t$dst, $src", 0xb8>;
-def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src),
- [(set F32:$dst, (sint_to_fp I64:$src))],
- "f32.convert_s/i64\t$dst, $src", 0xb4>;
-def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src),
- [(set F32:$dst, (uint_to_fp I64:$src))],
- "f32.convert_u/i64\t$dst, $src", 0xb5>;
-def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src),
- [(set F64:$dst, (sint_to_fp I64:$src))],
- "f64.convert_s/i64\t$dst, $src", 0xb9>;
-def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src),
- [(set F64:$dst, (uint_to_fp I64:$src))],
- "f64.convert_u/i64\t$dst, $src", 0xba>;
+defm F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins),
+ [(set F32:$dst, (sint_to_fp I32:$src))],
+ "f32.convert_s/i32\t$dst, $src", "f32.convert_s/i32",
+ 0xb2>;
+defm F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins),
+ [(set F32:$dst, (uint_to_fp I32:$src))],
+ "f32.convert_u/i32\t$dst, $src", "f32.convert_u/i32",
+ 0xb3>;
+defm F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src), (outs), (ins),
+ [(set F64:$dst, (sint_to_fp I32:$src))],
+ "f64.convert_s/i32\t$dst, $src", "f64.convert_s/i32",
+ 0xb7>;
+defm F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src), (outs), (ins),
+ [(set F64:$dst, (uint_to_fp I32:$src))],
+ "f64.convert_u/i32\t$dst, $src", "f64.convert_u/i32",
+ 0xb8>;
+defm F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src), (outs), (ins),
+ [(set F32:$dst, (sint_to_fp I64:$src))],
+ "f32.convert_s/i64\t$dst, $src", "f32.convert_s/i64",
+ 0xb4>;
+defm F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src), (outs), (ins),
+ [(set F32:$dst, (uint_to_fp I64:$src))],
+ "f32.convert_u/i64\t$dst, $src", "f32.convert_u/i64",
+ 0xb5>;
+defm F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
+ [(set F64:$dst, (sint_to_fp I64:$src))],
+ "f64.convert_s/i64\t$dst, $src", "f64.convert_s/i64",
+ 0xb9>;
+defm F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
+ [(set F64:$dst, (uint_to_fp I64:$src))],
+ "f64.convert_u/i64\t$dst, $src", "f64.convert_u/i64",
+ 0xba>;
-def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src),
- [(set F64:$dst, (fpextend F32:$src))],
- "f64.promote/f32\t$dst, $src", 0xbb>;
-def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src),
- [(set F32:$dst, (fpround F64:$src))],
- "f32.demote/f64\t$dst, $src", 0xb6>;
+defm F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src), (outs), (ins),
+ [(set F64:$dst, (fpextend F32:$src))],
+ "f64.promote/f32\t$dst, $src", "f64.promote/f32",
+ 0xbb>;
+defm F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src), (outs), (ins),
+ [(set F32:$dst, (fpround F64:$src))],
+ "f32.demote/f64\t$dst, $src", "f32.demote/f64",
+ 0xb6>;
-def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src),
- [(set I32:$dst, (bitconvert F32:$src))],
- "i32.reinterpret/f32\t$dst, $src", 0xbc>;
-def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src),
- [(set F32:$dst, (bitconvert I32:$src))],
- "f32.reinterpret/i32\t$dst, $src", 0xbe>;
-def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src),
- [(set I64:$dst, (bitconvert F64:$src))],
- "i64.reinterpret/f64\t$dst, $src", 0xbd>;
-def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src),
- [(set F64:$dst, (bitconvert I64:$src))],
- "f64.reinterpret/i64\t$dst, $src", 0xbf>;
+defm I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
+ [(set I32:$dst, (bitconvert F32:$src))],
+ "i32.reinterpret/f32\t$dst, $src",
+ "i32.reinterpret/f32", 0xbc>;
+defm F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins),
+ [(set F32:$dst, (bitconvert I32:$src))],
+ "f32.reinterpret/i32\t$dst, $src",
+ "f32.reinterpret/i32", 0xbe>;
+defm I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
+ [(set I64:$dst, (bitconvert F64:$src))],
+ "i64.reinterpret/f64\t$dst, $src",
+ "i64.reinterpret/f64", 0xbd>;
+defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
+ [(set F64:$dst, (bitconvert I64:$src))],
+ "f64.reinterpret/i64\t$dst, $src",
+ "f64.reinterpret/i64", 0xbf>;
} // Defs = [ARGUMENTS]
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
new file mode 100644
index 000000000000..41b39f69e51c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
@@ -0,0 +1,31 @@
+// WebAssemblyInstrExceptRef.td-WebAssembly except_ref codegen --*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// WebAssembly except_ref operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
+ (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set EXCEPT_REF:$dst,
+ (select I32:$cond, EXCEPT_REF:$lhs,
+ EXCEPT_REF:$rhs))],
+ "except_ref.select\t$dst, $lhs, $rhs, $cond",
+ "except_ref.select", 0x1b>;
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
+ (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
+ (SELECT_EXCEPT_REF EXCEPT_REF:$rhs, EXCEPT_REF:$lhs, I32:$cond)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 03c9c1f8d5c0..8db75d38942b 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Floating-point operand code-gen constructs.
+/// WebAssembly Floating-point operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -77,12 +77,14 @@ def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
let Defs = [ARGUMENTS] in {
-def SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
- [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
- "f32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
-def SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
- [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
- "f64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
+ "f32.select\t$dst, $lhs, $rhs, $cond", "f32.select", 0x1b>;
+defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
+ "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>;
} // Defs = [ARGUMENTS]
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 4f41fcc232e9..403152c80660 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -8,99 +8,160 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly instruction format definitions.
+/// WebAssembly instruction format definitions.
///
//===----------------------------------------------------------------------===//
// WebAssembly Instruction Format.
-class WebAssemblyInst<bits<32> inst, string asmstr> : Instruction {
+// We instantiate 2 of these for every actual instruction (register based
+// and stack based), see below.
+class WebAssemblyInst<bits<32> inst, string asmstr, bit stack> : Instruction {
field bits<32> Inst = inst; // Instruction encoding.
+ field bit StackBased = stack;
let Namespace = "WebAssembly";
let Pattern = [];
let AsmString = asmstr;
}
-// Normal instructions.
-class I<dag oops, dag iops, list<dag> pattern, string asmstr = "", bits<32> inst = -1>
- : WebAssemblyInst<inst, asmstr> {
+// Normal instructions. Default instantiation of a WebAssemblyInst.
+class NI<dag oops, dag iops, list<dag> pattern, bit stack, string asmstr = "",
+ bits<32> inst = -1>
+ : WebAssemblyInst<inst, asmstr, stack> {
dag OutOperandList = oops;
dag InOperandList = iops;
let Pattern = pattern;
}
-class SIMD_I<dag oops, dag iops, list<dag> pattern,
- string asmstr = "", bits<32> inst = -1>
- : I<oops, iops, pattern, asmstr, inst>, Requires<[HasSIMD128]>;
+// Generates both register and stack based versions of one actual instruction.
+// We have 2 sets of operands (oops & iops) for the register and stack
+// based version of this instruction, as well as the corresponding asmstr.
+// The register versions have virtual-register operands which correspond to wasm
+// locals or stack locations. Each use and def of the register corresponds to an
+// implicit get_local / set_local or access of stack operands in wasm. These
+// instructions are used for ISel and all MI passes. The stack versions of the
+// instructions do not have register operands (they implicitly operate on the
+// stack), and get_locals and set_locals are explicit. The register instructions
+// are converted to their corresponding stack instructions before lowering to
+// MC.
+// Every instruction should want to be based on this multi-class to guarantee
+// there is always an equivalent pair of instructions.
+multiclass I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "", string asmstr_s = "",
+ bits<32> inst = -1> {
+ def "" : NI<oops_r, iops_r, pattern_r, 0, asmstr_r, inst>;
+ def _S : NI<oops_s, iops_s, [], 1, asmstr_s, inst>;
+}
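As a rough sketch of the expansion, the BR_IF defm from WebAssemblyInstrControl.td above produces one register-based def and one stack-based _S def:

  def BR_IF   : NI<(outs), (ins bb_op:$dst, I32:$cond),
                   [(brcond I32:$cond, bb:$dst)], 0,
                   "br_if \t$dst, $cond", 0x0d>;
  def BR_IF_S : NI<(outs), (ins bb_op:$dst), [], 1, "br_if \t$dst", 0x0d>;

Only the register form carries a selection pattern; the _S form is what the register instruction is converted to before lowering to MC.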
+
+// For instructions that have no register ops, so both sets are the same.
+multiclass NRI<dag oops, dag iops, list<dag> pattern, string asmstr = "",
+ bits<32> inst = -1> {
+ defm "": I<oops, iops, oops, iops, pattern, asmstr, asmstr, inst>;
+}
-class ATOMIC_I<dag oops, dag iops, list<dag> pattern,
- string asmstr = "", bits<32> inst = -1>
- : I<oops, iops, pattern, asmstr, inst>, Requires<[HasAtomics]>;
+multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> inst = -1> {
+ defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
+ inst>,
+ Requires<[HasSIMD128]>;
+}
+
+multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> inst = -1> {
+ defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
+ inst>,
+ Requires<[HasAtomics]>;
+}
// Unary and binary instructions, for the local types that WebAssembly supports.
-multiclass UnaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
- def _I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (node I32:$src))],
- !strconcat("i32.", !strconcat(name, "\t$dst, $src")), i32Inst>;
- def _I64 : I<(outs I64:$dst), (ins I64:$src),
- [(set I64:$dst, (node I64:$src))],
- !strconcat("i64.", !strconcat(name, "\t$dst, $src")), i64Inst>;
+multiclass UnaryInt<SDNode node, string name, bits<32> i32Inst,
+ bits<32> i64Inst> {
+ defm _I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (node I32:$src))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("i32.", name), i32Inst>;
+ defm _I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
+ [(set I64:$dst, (node I64:$src))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("i64.", name), i64Inst>;
}
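For instance, defm CLZ : UnaryInt<ctlz, "clz ", 0x67, 0x79> in WebAssemblyInstrInteger.td now yields four defs; sketched roughly (the I64 pair is analogous with opcode 0x79):

  def CLZ_I32   : NI<(outs I32:$dst), (ins I32:$src),
                     [(set I32:$dst, (ctlz I32:$src))], 0,
                     "i32.clz \t$dst, $src", 0x67>;
  def CLZ_I32_S : NI<(outs), (ins), [], 1, "i32.clz ", 0x67>;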
-multiclass BinaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
- def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
- [(set I32:$dst, (node I32:$lhs, I32:$rhs))],
- !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")), i32Inst>;
- def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs),
- [(set I64:$dst, (node I64:$lhs, I64:$rhs))],
- !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")), i64Inst>;
+multiclass BinaryInt<SDNode node, string name, bits<32> i32Inst,
+ bits<32> i64Inst> {
+ defm _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), (outs), (ins),
+ [(set I32:$dst, (node I32:$lhs, I32:$rhs))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i32.", name), i32Inst>;
+ defm _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs), (outs), (ins),
+ [(set I64:$dst, (node I64:$lhs, I64:$rhs))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i64.", name), i64Inst>;
}
-multiclass UnaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
- def _F32 : I<(outs F32:$dst), (ins F32:$src),
- [(set F32:$dst, (node F32:$src))],
- !strconcat("f32.", !strconcat(name, "\t$dst, $src")), f32Inst>;
- def _F64 : I<(outs F64:$dst), (ins F64:$src),
- [(set F64:$dst, (node F64:$src))],
- !strconcat("f64.", !strconcat(name, "\t$dst, $src")), f64Inst>;
+multiclass UnaryFP<SDNode node, string name, bits<32> f32Inst,
+ bits<32> f64Inst> {
+ defm _F32 : I<(outs F32:$dst), (ins F32:$src), (outs), (ins),
+ [(set F32:$dst, (node F32:$src))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("f32.", name), f32Inst>;
+ defm _F64 : I<(outs F64:$dst), (ins F64:$src), (outs), (ins),
+ [(set F64:$dst, (node F64:$src))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $src")),
+ !strconcat("f64.", name), f64Inst>;
}
-multiclass BinaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
- def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs),
- [(set F32:$dst, (node F32:$lhs, F32:$rhs))],
- !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")), f32Inst>;
- def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs),
- [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
- !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")), f64Inst>;
+multiclass BinaryFP<SDNode node, string name, bits<32> f32Inst,
+ bits<32> f64Inst> {
+ defm _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs), (outs), (ins),
+ [(set F32:$dst, (node F32:$lhs, F32:$rhs))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f32.", name), f32Inst>;
+ defm _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs), (outs), (ins),
+ [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f64.", name), f64Inst>;
}
multiclass SIMDBinary<SDNode node, SDNode fnode, string name> {
- def _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
- !strconcat("i8x16.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
- def _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
- !strconcat("i16x8.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
- def _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
- !strconcat("i32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
- def _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
- !strconcat("f32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
-
+ defm _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i8x16.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i8x16.", name)>;
+ defm _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i16x8.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i16x8.", name)>;
+ defm _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
+ !strconcat("i32x4.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i32x4.", name)>;
+ defm _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
+ !strconcat("f32x4.",
+ !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f32x4.", name)>;
}
multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32> i64Inst> {
- def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
- [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
- !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- i32Inst>;
- def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs),
- [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))],
- !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- i64Inst>;
+ defm _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i32.", name), i32Inst>;
+ defm _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("i64.", name), i64Inst>;
}
multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f64Inst> {
- def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs),
- [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))],
- !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- f32Inst>;
- def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs),
- [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))],
- !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
- f64Inst>;
+ defm _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f32.", name), f32Inst>;
+ defm _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs), (outs), (ins),
+ [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+ !strconcat("f64.", name), f64Inst>;
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 8846952e5af4..cd49bd1682ad 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// TargetInstrInfo class.
///
//===----------------------------------------------------------------------===//
@@ -30,7 +30,8 @@ using namespace llvm;
WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
: WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
- WebAssembly::ADJCALLSTACKUP),
+ WebAssembly::ADJCALLSTACKUP,
+ WebAssembly::CATCHRET),
RI(STI.getTargetTriple()) {}
bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
@@ -151,7 +152,7 @@ unsigned WebAssemblyInstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.instr_begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (!I->isTerminator())
break;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index eb74106336ed..4a3763c345b0 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// TargetInstrInfo class.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index f8d311ac3b00..aeb282a7febb 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Instruction definitions.
+/// WebAssembly Instruction definitions.
///
//===----------------------------------------------------------------------===//
@@ -30,6 +30,24 @@ def NotHasNontrappingFPToInt :
Predicate<"!Subtarget->hasNontrappingFPToInt()">,
AssemblerPredicate<"!FeatureNontrappingFPToInt",
"nontrapping-fptoint">;
+def HasSignExt :
+ Predicate<"Subtarget->hasSignExt()">,
+ AssemblerPredicate<"FeatureSignExt",
+ "sign-ext">;
+def NotHasSignExt :
+ Predicate<"!Subtarget->hasSignExt()">,
+ AssemblerPredicate<"!FeatureSignExt",
+ "sign-ext">;
+
+def HasExceptionHandling :
+ Predicate<"Subtarget->hasExceptionHandling()">,
+ AssemblerPredicate<"FeatureExceptionHandling",
+ "exception-handling">;
+
+def NotHasExceptionHandling :
+ Predicate<"!Subtarget->hasExceptionHandling()">,
+ AssemblerPredicate<"!FeatureExceptionHandling",
+ "exception-handling">;
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Node Types.
@@ -135,23 +153,26 @@ include "WebAssemblyInstrFormats.td"
multiclass ARGUMENT<WebAssemblyRegClass vt> {
let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
- def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
- [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
+ defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
+ (outs), (ins i32imm:$argno),
+ [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
}
multiclass SIMD_ARGUMENT<ValueType vt> {
let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
- def ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
- [(set (vt V128:$res),
+ defm ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
+ (outs), (ins i32imm:$argno),
+ [(set (vt V128:$res),
(WebAssemblyargument timm:$argno))]>;
}
-defm : ARGUMENT<I32>;
-defm : ARGUMENT<I64>;
-defm : ARGUMENT<F32>;
-defm : ARGUMENT<F64>;
-defm : SIMD_ARGUMENT<v16i8>;
-defm : SIMD_ARGUMENT<v8i16>;
-defm : SIMD_ARGUMENT<v4i32>;
-defm : SIMD_ARGUMENT<v4f32>;
+defm "": ARGUMENT<I32>;
+defm "": ARGUMENT<I64>;
+defm "": ARGUMENT<F32>;
+defm "": ARGUMENT<F64>;
+defm "": ARGUMENT<EXCEPT_REF>;
+defm "": SIMD_ARGUMENT<v16i8>;
+defm "": SIMD_ARGUMENT<v8i16>;
+defm "": SIMD_ARGUMENT<v4i32>;
+defm "": SIMD_ARGUMENT<v4f32>;
let Defs = [ARGUMENTS] in {
@@ -165,69 +186,83 @@ let hasSideEffects = 0 in {
// and set_local. COPYs are eliminated (and replaced with
// get_local/set_local) in the ExplicitLocals pass.
let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
- def COPY_#vt : I<(outs vt:$res), (ins vt:$src), [], "copy_local\t$res, $src">;
+ defm COPY_#vt : I<(outs vt:$res), (ins vt:$src), (outs), (ins), [],
+ "copy_local\t$res, $src", "copy_local">;
// TEE is similar to COPY, but writes two copies of its result. Typically
// this would be used to stackify one result and write the other result to a
// local.
let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
- def TEE_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
- "tee_local\t$res, $also, $src">;
+ defm TEE_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), (outs), (ins), [],
+ "tee_local\t$res, $also, $src", "tee_local">;
// This is the actual get_local instruction in wasm. These are made explicit
// by the ExplicitLocals pass. It has mayLoad because it reads from a wasm
// local, which is a side effect not otherwise modeled in LLVM.
let mayLoad = 1, isAsCheapAsAMove = 1 in
- def GET_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local), [],
- "get_local\t$res, $local", 0x20>;
+ defm GET_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local),
+ (outs), (ins local_op:$local), [],
+ "get_local\t$res, $local", "get_local\t$local", 0x20>;
// This is the actual set_local instruction in wasm. These are made explicit
// by the ExplicitLocals pass. It has mayStore because it writes to a wasm
// local, which is a side effect not otherwise modeled in LLVM.
let mayStore = 1, isAsCheapAsAMove = 1 in
- def SET_LOCAL_#vt : I<(outs), (ins local_op:$local, vt:$src), [],
- "set_local\t$local, $src", 0x21>;
+ defm SET_LOCAL_#vt : I<(outs), (ins local_op:$local, vt:$src),
+ (outs), (ins local_op:$local), [],
+ "set_local\t$local, $src", "set_local\t$local", 0x21>;
// This is the actual tee_local instruction in wasm. TEEs are turned into
// TEE_LOCALs by the ExplicitLocals pass. It has mayStore for the same reason
// as SET_LOCAL.
let mayStore = 1, isAsCheapAsAMove = 1 in
- def TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src), [],
- "tee_local\t$res, $local, $src", 0x22>;
+ defm TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src),
+ (outs), (ins local_op:$local), [],
+ "tee_local\t$res, $local, $src", "tee_local\t$local",
+ 0x22>;
// Unused values must be dropped in some contexts.
- def DROP_#vt : I<(outs), (ins vt:$src), [],
- "drop\t$src", 0x1a>;
+ defm DROP_#vt : I<(outs), (ins vt:$src), (outs), (ins), [],
+ "drop\t$src", "drop", 0x1a>;
let mayLoad = 1 in
- def GET_GLOBAL_#vt : I<(outs vt:$res), (ins global_op:$local), [],
- "get_global\t$res, $local", 0x23>;
+ defm GET_GLOBAL_#vt : I<(outs vt:$res), (ins global_op:$local),
+ (outs), (ins global_op:$local), [],
+ "get_global\t$res, $local", "get_global\t$local",
+ 0x23>;
let mayStore = 1 in
- def SET_GLOBAL_#vt : I<(outs), (ins global_op:$local, vt:$src), [],
- "set_global\t$local, $src", 0x24>;
+ defm SET_GLOBAL_#vt : I<(outs), (ins global_op:$local, vt:$src),
+ (outs), (ins global_op:$local), [],
+ "set_global\t$local, $src", "set_global\t$local",
+ 0x24>;
} // hasSideEffects = 0
}
-defm : LOCAL<I32>;
-defm : LOCAL<I64>;
-defm : LOCAL<F32>;
-defm : LOCAL<F64>;
-defm : LOCAL<V128>, Requires<[HasSIMD128]>;
+defm "" : LOCAL<I32>;
+defm "" : LOCAL<I64>;
+defm "" : LOCAL<F32>;
+defm "" : LOCAL<F64>;
+defm "" : LOCAL<V128>, Requires<[HasSIMD128]>;
+defm "" : LOCAL<EXCEPT_REF>, Requires<[HasExceptionHandling]>;
let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
-def CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
- [(set I32:$res, imm:$imm)],
- "i32.const\t$res, $imm", 0x41>;
-def CONST_I64 : I<(outs I64:$res), (ins i64imm_op:$imm),
- [(set I64:$res, imm:$imm)],
- "i64.const\t$res, $imm", 0x42>;
-def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
- [(set F32:$res, fpimm:$imm)],
- "f32.const\t$res, $imm", 0x43>;
-def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
- [(set F64:$res, fpimm:$imm)],
- "f64.const\t$res, $imm", 0x44>;
+defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
+ (outs), (ins i32imm_op:$imm),
+ [(set I32:$res, imm:$imm)],
+ "i32.const\t$res, $imm", "i32.const\t$imm", 0x41>;
+defm CONST_I64 : I<(outs I64:$res), (ins i64imm_op:$imm),
+ (outs), (ins i64imm_op:$imm),
+ [(set I64:$res, imm:$imm)],
+ "i64.const\t$res, $imm", "i64.const\t$imm", 0x42>;
+defm CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
+ (outs), (ins f32imm_op:$imm),
+ [(set F32:$res, fpimm:$imm)],
+ "f32.const\t$res, $imm", "f32.const\t$imm", 0x43>;
+defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
+ (outs), (ins f64imm_op:$imm),
+ [(set F64:$res, fpimm:$imm)],
+ "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>;
} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
} // Defs = [ARGUMENTS]
@@ -249,3 +284,4 @@ include "WebAssemblyInstrConv.td"
include "WebAssemblyInstrFloat.td"
include "WebAssemblyInstrAtomics.td"
include "WebAssemblyInstrSIMD.td"
+include "WebAssemblyInstrExceptRef.td"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index e872dc219846..f9f21fd1d754 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Integer operand code-gen constructs.
+/// WebAssembly Integer operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -56,12 +56,12 @@ defm CLZ : UnaryInt<ctlz, "clz ", 0x67, 0x79>;
defm CTZ : UnaryInt<cttz, "ctz ", 0x68, 0x7a>;
defm POPCNT : UnaryInt<ctpop, "popcnt", 0x69, 0x7b>;
-def EQZ_I32 : I<(outs I32:$dst), (ins I32:$src),
- [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
- "i32.eqz \t$dst, $src", 0x45>;
-def EQZ_I64 : I<(outs I32:$dst), (ins I64:$src),
- [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
- "i64.eqz \t$dst, $src", 0x50>;
+defm EQZ_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins),
+ [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
+ "i32.eqz \t$dst, $src", "i32.eqz", 0x45>;
+defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
+ [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
+ "i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
} // Defs = [ARGUMENTS]
@@ -73,12 +73,14 @@ def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
let Defs = [ARGUMENTS] in {
-def SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
- [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
- "i32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
-def SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
- [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
- "i64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
+ "i32.select\t$dst, $lhs, $rhs, $cond", "i32.select", 0x1b>;
+defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
+ "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>;
} // Defs = [ARGUMENTS]
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 9d58895ca5a6..8a49325af2bd 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly Memory operand code-gen constructs.
+/// WebAssembly Memory operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
@@ -56,24 +56,27 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
let Defs = [ARGUMENTS] in {
// Defines atomic and non-atomic loads, regular and extending.
-class WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> :
- I<(outs rc:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
- [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"), Opcode>;
+multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
+ defm "": I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off),
+ [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"),
+ !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+}
// Basic load.
// FIXME: When we can break syntax compatibility, reorder the fields in the
// asmstrings to match the binary encoding.
-def LOAD_I32 : WebAssemblyLoad<I32, "i32.load", 0x28>;
-def LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
-def LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
-def LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
+defm LOAD_I32 : WebAssemblyLoad<I32, "i32.load", 0x28>;
+defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
+defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
+defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
} // Defs = [ARGUMENTS]
// Select loads with no constant offset.
-class LoadPatNoOffset<ValueType ty, PatFrag node, I inst> :
- Pat<(ty (node I32:$addr)), (inst 0, 0, $addr)>;
+class LoadPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>;
def : LoadPatNoOffset<i32, load, LOAD_I32>;
def : LoadPatNoOffset<i64, load, LOAD_I64>;
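These anonymous defs are shorthand for plain Pat records; for instance, the i32 case above resolves to roughly:

  def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, 0, I32:$addr)>;

matching LOAD_I32's (p2align, offset, addr) operand order with zero for the $p2align and $off fields.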
@@ -84,9 +87,8 @@ def : LoadPatNoOffset<f64, load, LOAD_F64>;
// Select loads with a constant offset.
// Pattern with address + immediate offset
-class LoadPatImmOff<ValueType ty, PatFrag loadkind, PatFrag operand, I inst> :
- Pat<(ty (loadkind (operand I32:$addr, imm:$off))),
- (inst 0, imm:$off, $addr)>;
+class LoadPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(ty (kind (operand I32:$addr, imm:$off))), (inst 0, imm:$off, I32:$addr)>;
def : LoadPatImmOff<i32, load, regPlusImm, LOAD_I32>;
def : LoadPatImmOff<i64, load, regPlusImm, LOAD_I64>;
@@ -97,18 +99,18 @@ def : LoadPatImmOff<i64, load, or_is_add, LOAD_I64>;
def : LoadPatImmOff<f32, load, or_is_add, LOAD_F32>;
def : LoadPatImmOff<f64, load, or_is_add, LOAD_F64>;
-class LoadPatGlobalAddr<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))),
- (inst 0, tglobaladdr:$off, $addr)>;
+class LoadPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))),
+ (inst 0, tglobaladdr:$off, I32:$addr)>;
def : LoadPatGlobalAddr<i32, load, LOAD_I32>;
def : LoadPatGlobalAddr<i64, load, LOAD_I64>;
def : LoadPatGlobalAddr<f32, load, LOAD_F32>;
def : LoadPatGlobalAddr<f64, load, LOAD_F64>;
-class LoadPatExternalSym<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (inst 0, texternalsym:$off, $addr)>;
+class LoadPatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+ (inst 0, texternalsym:$off, I32:$addr)>;
def : LoadPatExternalSym<i32, load, LOAD_I32>;
def : LoadPatExternalSym<i64, load, LOAD_I64>;
def : LoadPatExternalSym<f32, load, LOAD_F32>;
@@ -116,16 +118,16 @@ def : LoadPatExternalSym<f64, load, LOAD_F64>;
// Select loads with just a constant offset.
-class LoadPatOffsetOnly<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>;
+class LoadPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>;
def : LoadPatOffsetOnly<i32, load, LOAD_I32>;
def : LoadPatOffsetOnly<i64, load, LOAD_I64>;
def : LoadPatOffsetOnly<f32, load, LOAD_F32>;
def : LoadPatOffsetOnly<f64, load, LOAD_F64>;
-class LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (WebAssemblywrapper tglobaladdr:$off))),
+class LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
(inst 0, tglobaladdr:$off, (CONST_I32 0))>;
def : LoadPatGlobalAddrOffOnly<i32, load, LOAD_I32>;
@@ -133,8 +135,8 @@ def : LoadPatGlobalAddrOffOnly<i64, load, LOAD_I64>;
def : LoadPatGlobalAddrOffOnly<f32, load, LOAD_F32>;
def : LoadPatGlobalAddrOffOnly<f64, load, LOAD_F64>;
-class LoadPatExternSymOffOnly<ValueType ty, PatFrag loadkind, I inst> :
- Pat<(ty (loadkind (WebAssemblywrapper texternalsym:$off))),
+class LoadPatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(ty (kind (WebAssemblywrapper texternalsym:$off))),
(inst 0, texternalsym:$off, (CONST_I32 0))>;
def : LoadPatExternSymOffOnly<i32, load, LOAD_I32>;
def : LoadPatExternSymOffOnly<i64, load, LOAD_I64>;
@@ -144,16 +146,16 @@ def : LoadPatExternSymOffOnly<f64, load, LOAD_F64>;
let Defs = [ARGUMENTS] in {
// Extending load.
-def LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
-def LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
-def LOAD16_S_I32 : WebAssemblyLoad<I32, "i32.load16_s", 0x2e>;
-def LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.load16_u", 0x2f>;
-def LOAD8_S_I64 : WebAssemblyLoad<I64, "i64.load8_s", 0x30>;
-def LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.load8_u", 0x31>;
-def LOAD16_S_I64 : WebAssemblyLoad<I64, "i64.load16_s", 0x32>;
-def LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x32>;
-def LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
-def LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
+defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
+defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
+defm LOAD16_S_I32 : WebAssemblyLoad<I32, "i32.load16_s", 0x2e>;
+defm LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.load16_u", 0x2f>;
+defm LOAD8_S_I64 : WebAssemblyLoad<I64, "i64.load8_s", 0x30>;
+defm LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.load8_u", 0x31>;
+defm LOAD16_S_I64 : WebAssemblyLoad<I64, "i64.load16_s", 0x32>;
+defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33>;
+defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
+defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
} // Defs = [ARGUMENTS]
@@ -303,236 +305,191 @@ def : LoadPatExternSymOffOnly<i64, extloadi32, LOAD32_U_I64>;
let Defs = [ARGUMENTS] in {
+// Defines atomic and non-atomic stores, regular and truncating
+multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
+ defm "" : I<(outs),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
+ (outs),
+ (ins P2Align:$p2align, offset32_op:$off), [],
+ !strconcat(Name, "\t${off}(${addr})${p2align}, $val"),
+ !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+}
// Basic store.
// Note: WebAssembly inverts SelectionDAG's usual operand order.
-def STORE_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I32:$val), [],
- "i32.store\t${off}(${addr})${p2align}, $val", 0x36>;
-def STORE_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store\t${off}(${addr})${p2align}, $val", 0x37>;
-def STORE_F32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- F32:$val), [],
- "f32.store\t${off}(${addr})${p2align}, $val", 0x38>;
-def STORE_F64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- F64:$val), [],
- "f64.store\t${off}(${addr})${p2align}, $val", 0x39>;
+defm STORE_I32 : WebAssemblyStore<I32, "i32.store", 0x36>;
+defm STORE_I64 : WebAssemblyStore<I64, "i64.store", 0x37>;
+defm STORE_F32 : WebAssemblyStore<F32, "f32.store", 0x38>;
+defm STORE_F64 : WebAssemblyStore<F64, "f64.store", 0x39>;
} // Defs = [ARGUMENTS]
// Select stores with no constant offset.
-def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, 0, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, 0, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, 0, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, 0, I32:$addr, F64:$val)>;
+class StorePatNoOffset<ValueType ty, PatFrag node, NI inst> :
+ Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>;
+
+def : StorePatNoOffset<i32, store, STORE_I32>;
+def : StorePatNoOffset<i64, store, STORE_I64>;
+def : StorePatNoOffset<f32, store, STORE_F32>;
+def : StorePatNoOffset<f64, store, STORE_F64>;
// Select stores with a constant offset.
-def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
-def : Pat<(store I32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
-def : Pat<(store I32:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F32 0, tglobaladdr:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F64 0, tglobaladdr:$off, I32:$addr, F64:$val)>;
-def : Pat<(store I32:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_F32 0, texternalsym:$off, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE_F64 0, texternalsym:$off, I32:$addr, F64:$val)>;
+class StorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
+ Pat<(kind ty:$val, (operand I32:$addr, imm:$off)),
+ (inst 0, imm:$off, I32:$addr, ty:$val)>;
+
+def : StorePatImmOff<i32, store, regPlusImm, STORE_I32>;
+def : StorePatImmOff<i64, store, regPlusImm, STORE_I64>;
+def : StorePatImmOff<f32, store, regPlusImm, STORE_F32>;
+def : StorePatImmOff<f64, store, regPlusImm, STORE_F64>;
+def : StorePatImmOff<i32, store, or_is_add, STORE_I32>;
+def : StorePatImmOff<i64, store, or_is_add, STORE_I64>;
+def : StorePatImmOff<f32, store, or_is_add, STORE_F32>;
+def : StorePatImmOff<f64, store, or_is_add, STORE_F64>;
+
+class StorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val,
+ (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off))),
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
+def : StorePatGlobalAddr<i32, store, STORE_I32>;
+def : StorePatGlobalAddr<i64, store, STORE_I64>;
+def : StorePatGlobalAddr<f32, store, STORE_F32>;
+def : StorePatGlobalAddr<f64, store, STORE_F64>;
+
+class StorePatExternalSym<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, (add I32:$addr, (WebAssemblywrapper texternalsym:$off))),
+ (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
+def : StorePatExternalSym<i32, store, STORE_I32>;
+def : StorePatExternalSym<i64, store, STORE_I64>;
+def : StorePatExternalSym<f32, store, STORE_F32>;
+def : StorePatExternalSym<f64, store, STORE_F64>;
// Select stores with just a constant offset.
-def : Pat<(store I32:$val, imm:$off),
- (STORE_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(store I64:$val, imm:$off),
- (STORE_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(store F32:$val, imm:$off),
- (STORE_F32 0, imm:$off, (CONST_I32 0), F32:$val)>;
-def : Pat<(store F64:$val, imm:$off),
- (STORE_F64 0, imm:$off, (CONST_I32 0), F64:$val)>;
-def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F32 0, tglobaladdr:$off, (CONST_I32 0), F32:$val)>;
-def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F64 0, tglobaladdr:$off, (CONST_I32 0), F64:$val)>;
-def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F32 0, texternalsym:$off, (CONST_I32 0), F32:$val)>;
-def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F64 0, texternalsym:$off, (CONST_I32 0), F64:$val)>;
+class StorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
+def : StorePatOffsetOnly<i32, store, STORE_I32>;
+def : StorePatOffsetOnly<i64, store, STORE_I64>;
+def : StorePatOffsetOnly<f32, store, STORE_F32>;
+def : StorePatOffsetOnly<f64, store, STORE_F64>;
+
+class StorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
+def : StorePatGlobalAddrOffOnly<i32, store, STORE_I32>;
+def : StorePatGlobalAddrOffOnly<i64, store, STORE_I64>;
+def : StorePatGlobalAddrOffOnly<f32, store, STORE_F32>;
+def : StorePatGlobalAddrOffOnly<f64, store, STORE_F64>;
+
+class StorePatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
+ Pat<(kind ty:$val, (WebAssemblywrapper texternalsym:$off)),
+ (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
+def : StorePatExternSymOffOnly<i32, store, STORE_I32>;
+def : StorePatExternSymOffOnly<i64, store, STORE_I64>;
+def : StorePatExternSymOffOnly<f32, store, STORE_F32>;
+def : StorePatExternSymOffOnly<f64, store, STORE_F64>;
+
let Defs = [ARGUMENTS] in {
// Truncating store.
-def STORE8_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I32:$val), [],
- "i32.store8\t${off}(${addr})${p2align}, $val", 0x3a>;
-def STORE16_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I32:$val), [],
- "i32.store16\t${off}(${addr})${p2align}, $val", 0x3b>;
-def STORE8_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store8\t${off}(${addr})${p2align}, $val", 0x3c>;
-def STORE16_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store16\t${off}(${addr})${p2align}, $val", 0x3d>;
-def STORE32_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
- I64:$val), [],
- "i64.store32\t${off}(${addr})${p2align}, $val", 0x3e>;
+defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
+defm STORE16_I32 : WebAssemblyStore<I32, "i32.store16", 0x3b>;
+defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
+defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
+defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
} // Defs = [ARGUMENTS]
// Select truncating stores with no constant offset.
-def : Pat<(truncstorei8 I32:$val, I32:$addr),
- (STORE8_I32 0, 0, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, I32:$addr),
- (STORE16_I32 0, 0, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, I32:$addr),
- (STORE8_I64 0, 0, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, I32:$addr),
- (STORE16_I64 0, 0, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, I32:$addr),
- (STORE32_I64 0, 0, I32:$addr, I64:$val)>;
+def : StorePatNoOffset<i32, truncstorei8, STORE8_I32>;
+def : StorePatNoOffset<i32, truncstorei16, STORE16_I32>;
+def : StorePatNoOffset<i64, truncstorei8, STORE8_I64>;
+def : StorePatNoOffset<i64, truncstorei16, STORE16_I64>;
+def : StorePatNoOffset<i64, truncstorei32, STORE32_I64>;
// Select truncating stores with a constant offset.
-def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (or_is_add I32:$addr, imm:$off)),
- (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei8 I32:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val,
- (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off))),
- (STORE32_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE8_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei16 I32:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE16_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
-def : Pat<(truncstorei8 I64:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE8_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei16 I64:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE16_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
-def : Pat<(truncstorei32 I64:$val,
- (add I32:$addr,
- (WebAssemblywrapper texternalsym:$off))),
- (STORE32_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : StorePatImmOff<i32, truncstorei8, regPlusImm, STORE8_I32>;
+def : StorePatImmOff<i32, truncstorei16, regPlusImm, STORE16_I32>;
+def : StorePatImmOff<i64, truncstorei8, regPlusImm, STORE8_I64>;
+def : StorePatImmOff<i64, truncstorei16, regPlusImm, STORE16_I64>;
+def : StorePatImmOff<i64, truncstorei32, regPlusImm, STORE32_I64>;
+def : StorePatImmOff<i32, truncstorei8, or_is_add, STORE8_I32>;
+def : StorePatImmOff<i32, truncstorei16, or_is_add, STORE16_I32>;
+def : StorePatImmOff<i64, truncstorei8, or_is_add, STORE8_I64>;
+def : StorePatImmOff<i64, truncstorei16, or_is_add, STORE16_I64>;
+def : StorePatImmOff<i64, truncstorei32, or_is_add, STORE32_I64>;
+
+def : StorePatGlobalAddr<i32, truncstorei8, STORE8_I32>;
+def : StorePatGlobalAddr<i32, truncstorei16, STORE16_I32>;
+def : StorePatGlobalAddr<i64, truncstorei8, STORE8_I64>;
+def : StorePatGlobalAddr<i64, truncstorei16, STORE16_I64>;
+def : StorePatGlobalAddr<i64, truncstorei32, STORE32_I64>;
+def : StorePatExternalSym<i32, truncstorei8, STORE8_I32>;
+def : StorePatExternalSym<i32, truncstorei16, STORE16_I32>;
+def : StorePatExternalSym<i64, truncstorei8, STORE8_I64>;
+def : StorePatExternalSym<i64, truncstorei16, STORE16_I64>;
+def : StorePatExternalSym<i64, truncstorei32, STORE32_I64>;
// Select truncating stores with just a constant offset.
-def : Pat<(truncstorei8 I32:$val, imm:$off),
- (STORE8_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, imm:$off),
- (STORE16_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, imm:$off),
- (STORE8_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, imm:$off),
- (STORE16_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, imm:$off),
- (STORE32_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE32_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
-def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
-def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE32_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : StorePatOffsetOnly<i32, truncstorei8, STORE8_I32>;
+def : StorePatOffsetOnly<i32, truncstorei16, STORE16_I32>;
+def : StorePatOffsetOnly<i64, truncstorei8, STORE8_I64>;
+def : StorePatOffsetOnly<i64, truncstorei16, STORE16_I64>;
+def : StorePatOffsetOnly<i64, truncstorei32, STORE32_I64>;
+def : StorePatGlobalAddrOffOnly<i32, truncstorei8, STORE8_I32>;
+def : StorePatGlobalAddrOffOnly<i32, truncstorei16, STORE16_I32>;
+def : StorePatGlobalAddrOffOnly<i64, truncstorei8, STORE8_I64>;
+def : StorePatGlobalAddrOffOnly<i64, truncstorei16, STORE16_I64>;
+def : StorePatGlobalAddrOffOnly<i64, truncstorei32, STORE32_I64>;
+def : StorePatExternSymOffOnly<i32, truncstorei8, STORE8_I32>;
+def : StorePatExternSymOffOnly<i32, truncstorei16, STORE16_I32>;
+def : StorePatExternSymOffOnly<i64, truncstorei8, STORE8_I64>;
+def : StorePatExternSymOffOnly<i64, truncstorei16, STORE16_I64>;
+def : StorePatExternSymOffOnly<i64, truncstorei32, STORE32_I64>;
let Defs = [ARGUMENTS] in {
// Current memory size.
-def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
- [],
- "current_memory\t$dst", 0x3f>,
- Requires<[HasAddr32]>;
+defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+ (outs), (ins i32imm:$flags),
+ [(set I32:$dst,
+ (int_wasm_memory_size (i32 imm:$flags)))],
+ "memory.size\t$dst, $flags", "memory.size\t$flags",
+ 0x3f>,
+ Requires<[HasAddr32]>;
+defm MEM_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+ (outs), (ins i32imm:$flags),
+ [(set I32:$dst, (int_wasm_mem_size (i32 imm:$flags)))],
+ "mem.size\t$dst, $flags", "mem.size\t$flags", 0x3f>,
+ Requires<[HasAddr32]>;
+defm CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+ (outs), (ins i32imm:$flags),
+ [],
+ "current_memory\t$dst",
+ "current_memory\t$flags", 0x3f>,
+ Requires<[HasAddr32]>;
// Grow memory.
-def GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
- [],
- "grow_memory\t$dst, $delta", 0x40>,
- Requires<[HasAddr32]>;
+defm MEMORY_GROW_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
+ (outs), (ins i32imm:$flags, I32:$delta),
+ [(set I32:$dst,
+ (int_wasm_memory_grow (i32 imm:$flags),
+ I32:$delta))],
+ "memory.grow\t$dst, $flags, $delta",
+ "memory.grow\t$flags, $delta", 0x3f>,
+ Requires<[HasAddr32]>;
+defm MEM_GROW_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
+ (outs), (ins i32imm:$flags),
+ [(set I32:$dst,
+ (int_wasm_mem_grow (i32 imm:$flags), I32:$delta))],
+ "mem.grow\t$dst, $flags, $delta", "mem.grow\t$flags",
+ 0x3f>,
+ Requires<[HasAddr32]>;
+defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
+ (outs), (ins i32imm:$flags),
+ [],
+ "grow_memory\t$dst, $delta", "grow_memory\t$flags",
+ 0x40>,
+ Requires<[HasAddr32]>;
} // Defs = [ARGUMENTS]
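As a side note, the new memory.size and memory.grow definitions above carry selection patterns for the int_wasm_memory_size and int_wasm_memory_grow intrinsics, so user code can reach them directly. A minimal hand-written sketch, not part of this commit, assuming the matching clang builtins (__builtin_wasm_memory_size / __builtin_wasm_memory_grow; the names track the intrinsic renames above and may differ by clang version):

    // Query and grow the default linear memory (index 0); sizes are in 64 KiB pages.
    // Builtin names are an assumption tracking the intrinsic renames above.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t pages = __builtin_wasm_memory_size(0);
      // Grow by one page; returns the previous size, or -1 (all ones) on failure.
      uint32_t old = __builtin_wasm_memory_grow(0, 1);
      std::printf("pages=%u old=%u\n", pages, old);
      return 0;
    }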
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index e403534d580a..7d1edccdeb3c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief WebAssembly SIMD operand code-gen constructs.
+/// WebAssembly SIMD operand code-gen constructs.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
new file mode 100644
index 000000000000..e42dcbc0a8ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -0,0 +1,383 @@
+//=== WebAssemblyLateEHPrepare.cpp - WebAssembly Exception Preparation -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Does various transformations for exception handling.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-exception-prepare"
+
+namespace {
+class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly Prepare Exception";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool replaceFuncletReturns(MachineFunction &MF);
+ bool hoistCatches(MachineFunction &MF);
+ bool addCatchAlls(MachineFunction &MF);
+ bool addRethrows(MachineFunction &MF);
+ bool ensureSingleBBTermPads(MachineFunction &MF);
+ bool mergeTerminatePads(MachineFunction &MF);
+ bool addCatchAllTerminatePads(MachineFunction &MF);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyLateEHPrepare() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyLateEHPrepare::ID = 0;
+INITIALIZE_PASS(WebAssemblyLateEHPrepare, DEBUG_TYPE,
+ "WebAssembly Exception Preparation", false, false)
+
+FunctionPass *llvm::createWebAssemblyLateEHPrepare() {
+ return new WebAssemblyLateEHPrepare();
+}
+
+// Returns the nearest EH pad that dominates this instruction. This does not use
+// dominator analysis; it just does BFS on its predecessors until arriving at an
+// EH pad. This assumes valid EH scopes, so the first EH pad it arrives at
+// along all possible search paths should be the same.
+// Returns nullptr in case it does not find any EH pad in the search, or finds
+// multiple different EH pads.
+MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
+ MachineFunction *MF = MI->getParent()->getParent();
+ SmallVector<MachineBasicBlock *, 2> WL;
+ SmallPtrSet<MachineBasicBlock *, 2> Visited;
+ WL.push_back(MI->getParent());
+ MachineBasicBlock *EHPad = nullptr;
+ while (!WL.empty()) {
+ MachineBasicBlock *MBB = WL.pop_back_val();
+ if (Visited.count(MBB))
+ continue;
+ Visited.insert(MBB);
+ if (MBB->isEHPad()) {
+ if (EHPad && EHPad != MBB)
+ return nullptr;
+ EHPad = MBB;
+ continue;
+ }
+ if (MBB == &MF->front())
+ return nullptr;
+ WL.append(MBB->pred_begin(), MBB->pred_end());
+ }
+ return EHPad;
+}
+
+// Erases the given BB and all its children from the function. If other BBs have
+// this BB as a successor, the successor relationships will be deleted as well.
+static void EraseBBAndChildren(MachineBasicBlock *MBB) {
+ SmallVector<MachineBasicBlock *, 8> WL;
+ WL.push_back(MBB);
+ while (!WL.empty()) {
+ MachineBasicBlock *MBB = WL.pop_back_val();
+ for (auto *Pred : MBB->predecessors())
+ Pred->removeSuccessor(MBB);
+ for (auto *Succ : MBB->successors()) {
+ WL.push_back(Succ);
+ MBB->removeSuccessor(Succ);
+ }
+ MBB->eraseFromParent();
+ }
+}
+
+bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() !=
+ ExceptionHandling::Wasm)
+ return false;
+
+ bool Changed = false;
+ Changed |= addRethrows(MF);
+ if (!MF.getFunction().hasPersonalityFn())
+ return Changed;
+ Changed |= replaceFuncletReturns(MF);
+ Changed |= hoistCatches(MF);
+ Changed |= addCatchAlls(MF);
+ Changed |= ensureSingleBBTermPads(MF);
+ Changed |= mergeTerminatePads(MF);
+ Changed |= addCatchAllTerminatePads(MF);
+ return Changed;
+}
+
+bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
+ bool Changed = false;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto *EHInfo = MF.getWasmEHFuncInfo();
+
+ for (auto &MBB : MF) {
+ auto Pos = MBB.getFirstTerminator();
+ if (Pos == MBB.end())
+ continue;
+ MachineInstr *TI = &*Pos;
+
+ switch (TI->getOpcode()) {
+ case WebAssembly::CATCHRET: {
+ // Replace a catchret with a branch
+ MachineBasicBlock *TBB = TI->getOperand(0).getMBB();
+ if (!MBB.isLayoutSuccessor(TBB))
+ BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::BR))
+ .addMBB(TBB);
+ TI->eraseFromParent();
+ Changed = true;
+ break;
+ }
+ case WebAssembly::CLEANUPRET: {
+ // Replace a cleanupret with a rethrow
+ if (EHInfo->hasThrowUnwindDest(&MBB))
+ BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
+ .addMBB(EHInfo->getThrowUnwindDest(&MBB));
+ else
+ BuildMI(MBB, TI, TI->getDebugLoc(),
+ TII.get(WebAssembly::RETHROW_TO_CALLER));
+
+ TI->eraseFromParent();
+ Changed = true;
+ break;
+ }
+ }
+ }
+ return Changed;
+}
+
+// Hoist catch instructions to the beginning of their matching EH pad BBs in
+// case,
+// (1) the catch instruction is not the first instruction in its EH pad.
+// ehpad:
+// some_other_instruction
+// ...
+// %exn = catch 0
+// (2) the catch instruction is in a non-EH pad BB. For example,
+// ehpad:
+// br bb0
+// bb0:
+// %exn = catch 0
+bool WebAssemblyLateEHPrepare::hoistCatches(MachineFunction &MF) {
+ bool Changed = false;
+ SmallVector<MachineInstr *, 16> Catches;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (WebAssembly::isCatch(MI))
+ Catches.push_back(&MI);
+
+ for (auto *Catch : Catches) {
+ MachineBasicBlock *EHPad = GetMatchingEHPad(Catch);
+ assert(EHPad && "No matching EH pad for catch");
+ if (EHPad->begin() == Catch)
+ continue;
+ Changed = true;
+ EHPad->insert(EHPad->begin(), Catch->removeFromParent());
+ }
+ return Changed;
+}
+
+// Add catch_all to beginning of cleanup pads.
+bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) {
+ bool Changed = false;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ for (auto &MBB : MF) {
+ if (!MBB.isEHPad())
+ continue;
+ // This runs after hoistCatches(), so we assume that if an EH pad contains a
+ // catch, that catch is its first instruction.
+ if (!WebAssembly::isCatch(*MBB.begin())) {
+ Changed = true;
+ BuildMI(MBB, MBB.begin(), MBB.begin()->getDebugLoc(),
+ TII.get(WebAssembly::CATCH_ALL));
+ }
+ }
+ return Changed;
+}
+
+// Add a 'rethrow' instruction after each __cxa_rethrow() call
+bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) {
+ bool Changed = false;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto *EHInfo = MF.getWasmEHFuncInfo();
+
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ // Check if it is a call to __cxa_rethrow()
+ if (!MI.isCall())
+ continue;
+ MachineOperand &CalleeOp = MI.getOperand(0);
+ if (!CalleeOp.isGlobal() ||
+ CalleeOp.getGlobal()->getName() != WebAssembly::CxaRethrowFn)
+ continue;
+
+ // Now we have a __cxa_rethrow() call
+ Changed = true;
+ auto InsertPt = std::next(MachineBasicBlock::iterator(MI));
+ while (InsertPt != MBB.end() && InsertPt->isLabel()) // Skip EH_LABELs
+ ++InsertPt;
+ MachineInstr *Rethrow = nullptr;
+ if (EHInfo->hasThrowUnwindDest(&MBB))
+ Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(),
+ TII.get(WebAssembly::RETHROW))
+ .addMBB(EHInfo->getThrowUnwindDest(&MBB));
+ else
+ Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(),
+ TII.get(WebAssembly::RETHROW_TO_CALLER));
+
+ // Because __cxa_rethrow does not return, the instruction after the
+ // rethrow should be an unreachable or a branch to another BB that should
+ // eventually lead to an unreachable. Delete it because rethrow itself is
+ // a terminator, and also delete non-EH pad successors if any.
+ MBB.erase(std::next(MachineBasicBlock::iterator(Rethrow)), MBB.end());
+ for (auto *Succ : MBB.successors())
+ if (!Succ->isEHPad())
+ EraseBBAndChildren(Succ);
+ }
+ return Changed;
+}
+
+// Terminate pads are single-BB EH pads of the form
+// termpad:
+// %exn = catch 0
+// call @__clang_call_terminate(%exn)
+// unreachable
+// (There can be set_local and get_locals before the call if we didn't run
+// RegStackify)
+// But code transformations can change or add more control flow, so the call to
+// __clang_call_terminate() may not be in the original EH pad anymore.
+// This ensures every terminate pad is a single BB in the form illustrated
+// above.
+bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) {
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ // Find calls to __clang_call_terminate()
+ SmallVector<MachineInstr *, 8> ClangCallTerminateCalls;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.isCall()) {
+ const MachineOperand &CalleeOp = MI.getOperand(0);
+ if (CalleeOp.isGlobal() && CalleeOp.getGlobal()->getName() ==
+ WebAssembly::ClangCallTerminateFn)
+ ClangCallTerminateCalls.push_back(&MI);
+ }
+
+ bool Changed = false;
+ for (auto *Call : ClangCallTerminateCalls) {
+ MachineBasicBlock *EHPad = GetMatchingEHPad(Call);
+ assert(EHPad && "No matching EH pad for catch");
+
+ // If it is already in the form we want, skip it
+ if (Call->getParent() == EHPad &&
+ Call->getNextNode()->getOpcode() == WebAssembly::UNREACHABLE)
+ continue;
+
+ // In case the __clang_call_terminate() call is not in its matching EH pad,
+ // move the call to the end of the EH pad and add an unreachable instruction
+ // after it. Delete all successors and their children, if any, because the
+ // program terminates here.
+ Changed = true;
+ MachineInstr *Catch = &*EHPad->begin();
+ // This runs after hoistCatches(), so the catch instruction should be at the top
+ assert(WebAssembly::isCatch(*Catch));
+ // The call takes the catch instruction's result register as its argument.
+ // There may have been other set_locals/get_locals in between, but at this
+ // point we don't care.
+ Call->getOperand(1).setReg(Catch->getOperand(0).getReg());
+ auto InsertPos = std::next(MachineBasicBlock::iterator(Catch));
+ EHPad->insert(InsertPos, Call->removeFromParent());
+ BuildMI(*EHPad, InsertPos, Call->getDebugLoc(),
+ TII.get(WebAssembly::UNREACHABLE));
+ EHPad->erase(InsertPos, EHPad->end());
+ for (auto *Succ : EHPad->successors())
+ EraseBBAndChildren(Succ);
+ }
+ return Changed;
+}
+
+// In case there are multiple terminate pads, merge them into one for code size.
+// This runs after ensureSingleBBTermPads() and assumes every terminate pad is a
+// single BB.
+// In principle this violates the EH scope relationship, because it can merge
+// multiple inner EH scopes, each of which is in a different outer EH scope.
+// But getEHScopeMembership() will not be called after this, so it is fine.
+bool WebAssemblyLateEHPrepare::mergeTerminatePads(MachineFunction &MF) {
+ SmallVector<MachineBasicBlock *, 8> TermPads;
+ for (auto &MBB : MF)
+ if (WebAssembly::isCatchTerminatePad(MBB))
+ TermPads.push_back(&MBB);
+ if (TermPads.empty())
+ return false;
+
+ MachineBasicBlock *UniqueTermPad = TermPads.front();
+ for (auto *TermPad :
+ llvm::make_range(std::next(TermPads.begin()), TermPads.end())) {
+ SmallVector<MachineBasicBlock *, 2> Preds(TermPad->pred_begin(),
+ TermPad->pred_end());
+ for (auto *Pred : Preds)
+ Pred->replaceSuccessor(TermPad, UniqueTermPad);
+ TermPad->eraseFromParent();
+ }
+ return true;
+}
+
+// Terminate pads are cleanup pads, so they should start with a 'catch_all'
+// instruction. But in the Itanium model, when we have a C++ exception object,
+// we pass it to the __clang_call_terminate function, which calls
+// __cxa_end_catch with the exception pointer and then std::terminate. This is
+// why terminate pads are generated with a catch instruction rather than a
+// catch_all in clang and earlier llvm passes. Here we append a terminate pad
+// with a catch_all after each existing terminate pad so we can also catch
+// foreign exceptions. For every terminate pad:
+// %exn = catch 0
+// call @__clang_call_terminate(%exn)
+// unreachable
+// We append this BB right after that:
+// catch_all
+// call @std::terminate()
+// unreachable
+bool WebAssemblyLateEHPrepare::addCatchAllTerminatePads(MachineFunction &MF) {
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ SmallVector<MachineBasicBlock *, 8> TermPads;
+ for (auto &MBB : MF)
+ if (WebAssembly::isCatchTerminatePad(MBB))
+ TermPads.push_back(&MBB);
+ if (TermPads.empty())
+ return false;
+
+ Function *StdTerminateFn =
+ MF.getFunction().getParent()->getFunction(WebAssembly::StdTerminateFn);
+ assert(StdTerminateFn && "There is no std::terminate() function");
+ for (auto *CatchTermPad : TermPads) {
+ DebugLoc DL = CatchTermPad->findDebugLoc(CatchTermPad->begin());
+ auto *CatchAllTermPad = MF.CreateMachineBasicBlock();
+ MF.insert(std::next(MachineFunction::iterator(CatchTermPad)),
+ CatchAllTermPad);
+ CatchAllTermPad->setIsEHPad();
+ BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CATCH_ALL));
+ BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CALL_VOID))
+ .addGlobalAddress(StdTerminateFn);
+ BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::UNREACHABLE));
+
+ // Actually this CatchAllTermPad (new terminate pad with a catch_all) is not
+ // a successor of an existing terminate pad. CatchAllTermPad should have all
+ // predecessors CatchTermPad has instead. This is a hack to force
+ // CatchAllTermPad to always be sorted right after CatchTermPad; the correct
+ // predecessor-successor relationships will be restored in the CFGStackify pass.
+ CatchTermPad->addSuccessor(CatchAllTermPad);
+ }
+ return true;
+}
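GetMatchingEHPad and EraseBBAndChildren in the new file above are both instances of the same worklist traversal over the CFG. A minimal, self-contained sketch of that pattern with toy types (not LLVM's MachineBasicBlock, and omitting the bail-out at the function entry block):

    #include <cstdio>
    #include <set>
    #include <vector>

    struct Block {
      int id;
      bool isEHPad;
      std::vector<Block *> preds;
    };

    // Walk predecessors from a start block; return the unique EH pad reached,
    // or nullptr if none is found or two different EH pads are reachable.
    Block *findMatchingEHPad(Block *start) {
      std::vector<Block *> worklist{start};
      std::set<Block *> visited;
      Block *ehPad = nullptr;
      while (!worklist.empty()) {
        Block *b = worklist.back();
        worklist.pop_back();
        if (!visited.insert(b).second)
          continue; // already seen
        if (b->isEHPad) {
          if (ehPad && ehPad != b)
            return nullptr; // ambiguous: two different EH pads reached
          ehPad = b;
          continue;
        }
        worklist.insert(worklist.end(), b->preds.begin(), b->preds.end());
      }
      return ehPad;
    }

    int main() {
      Block pad{0, true, {}};
      Block a{1, false, {&pad}};
      Block b{2, false, {&a}};
      std::printf("matching pad id: %d\n", findMatchingEHPad(&b)->id);
      return 0;
    }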
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index 5b867aa763a1..5fb97e38939a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file lowers br_unless into br_if with an inverted condition.
+/// This file lowers br_unless into br_if with an inverted condition.
///
/// br_unless is not currently in the spec, but it's very convenient for LLVM
/// to use. This pass allows LLVM to use it, for now.
@@ -47,14 +47,17 @@ public:
} // end anonymous namespace
char WebAssemblyLowerBrUnless::ID = 0;
+INITIALIZE_PASS(WebAssemblyLowerBrUnless, DEBUG_TYPE,
+ "Lowers br_unless into inverted br_if", false, false)
+
FunctionPass *llvm::createWebAssemblyLowerBrUnless() {
return new WebAssemblyLowerBrUnless();
}
bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Lowering br_unless **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Lowering br_unless **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index f0b6a3e35dba..e9cb7c10113b 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file lowers exception-related instructions and setjmp/longjmp
+/// This file lowers exception-related instructions and setjmp/longjmp
/// function calls in order to use Emscripten's JavaScript try and catch
/// mechanism.
///
@@ -225,13 +225,8 @@ static cl::list<std::string>
namespace {
class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
- static const char *ThrewGVName;
- static const char *ThrewValueGVName;
- static const char *TempRet0GVName;
static const char *ResumeFName;
static const char *EHTypeIDFName;
- static const char *SetThrewFName;
- static const char *SetTempRet0FName;
static const char *EmLongjmpFName;
static const char *EmLongjmpJmpbufFName;
static const char *SaveSetjmpFName;
@@ -300,14 +295,9 @@ public:
};
} // End anonymous namespace
-const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewGVName = "__THREW__";
-const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewValueGVName = "__threwValue";
-const char *WebAssemblyLowerEmscriptenEHSjLj::TempRet0GVName = "__tempRet0";
const char *WebAssemblyLowerEmscriptenEHSjLj::ResumeFName = "__resumeException";
const char *WebAssemblyLowerEmscriptenEHSjLj::EHTypeIDFName =
"llvm_eh_typeid_for";
-const char *WebAssemblyLowerEmscriptenEHSjLj::SetThrewFName = "setThrew";
-const char *WebAssemblyLowerEmscriptenEHSjLj::SetTempRet0FName = "setTempRet0";
const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpFName =
"emscripten_longjmp";
const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpJmpbufFName =
@@ -343,15 +333,13 @@ static bool canThrow(const Value *V) {
return true;
}
-// Returns an available name for a global value.
-// If the proposed name already exists in the module, adds '_' at the end of
-// the name until the name is available.
-static inline std::string createGlobalValueName(const Module &M,
- const std::string &Propose) {
- std::string Name = Propose;
- while (M.getNamedGlobal(Name))
- Name += "_";
- return Name;
+static GlobalVariable *createGlobalVariableI32(Module &M, IRBuilder<> &IRB,
+ const char *Name) {
+ if (M.getNamedGlobal(Name))
+ report_fatal_error(Twine("variable name is reserved: ") + Name);
+
+ return new GlobalVariable(M, IRB.getInt32Ty(), false,
+ GlobalValue::WeakODRLinkage, IRB.getInt32(0), Name);
}
// Simple function name mangler.
@@ -613,11 +601,13 @@ void WebAssemblyLowerEmscriptenEHSjLj::createSetThrewFunction(Module &M) {
LLVMContext &C = M.getContext();
IRBuilder<> IRB(C);
- assert(!M.getNamedGlobal(SetThrewFName) && "setThrew already exists");
+ if (M.getNamedGlobal("setThrew"))
+ report_fatal_error("setThrew already exists");
+
Type *Params[] = {IRB.getInt32Ty(), IRB.getInt32Ty()};
FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
Function *F =
- Function::Create(FTy, GlobalValue::ExternalLinkage, SetThrewFName, &M);
+ Function::Create(FTy, GlobalValue::WeakODRLinkage, "setThrew", &M);
Argument *Arg1 = &*(F->arg_begin());
Argument *Arg2 = &*std::next(F->arg_begin());
Arg1->setName("threw");
@@ -648,11 +638,12 @@ void WebAssemblyLowerEmscriptenEHSjLj::createSetTempRet0Function(Module &M) {
LLVMContext &C = M.getContext();
IRBuilder<> IRB(C);
- assert(!M.getNamedGlobal(SetTempRet0FName) && "setTempRet0 already exists");
+ if (M.getNamedGlobal("setTempRet0"))
+ report_fatal_error("setTempRet0 already exists");
Type *Params[] = {IRB.getInt32Ty()};
FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
Function *F =
- Function::Create(FTy, GlobalValue::ExternalLinkage, SetTempRet0FName, &M);
+ Function::Create(FTy, GlobalValue::WeakODRLinkage, "setTempRet0", &M);
F->arg_begin()->setName("value");
BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
IRB.SetInsertPoint(EntryBB);
@@ -699,15 +690,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
// Create global variables __THREW__, threwValue, and __tempRet0, which are
// used in common for both exception handling and setjmp/longjmp handling
- ThrewGV = new GlobalVariable(M, IRB.getInt32Ty(), false,
- GlobalValue::ExternalLinkage, IRB.getInt32(0),
- createGlobalValueName(M, ThrewGVName));
- ThrewValueGV = new GlobalVariable(
- M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, IRB.getInt32(0),
- createGlobalValueName(M, ThrewValueGVName));
- TempRet0GV = new GlobalVariable(M, IRB.getInt32Ty(), false,
- GlobalValue::ExternalLinkage, IRB.getInt32(0),
- createGlobalValueName(M, TempRet0GVName));
+ ThrewGV = createGlobalVariableI32(M, IRB, "__THREW__");
+ ThrewValueGV = createGlobalVariableI32(M, IRB, "__threwValue");
+ TempRet0GV = createGlobalVariableI32(M, IRB, "__tempRet0");
bool Changed = false;
@@ -736,12 +721,6 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
if (DoSjLj) {
Changed = true; // We have setjmp or longjmp somewhere
- Function *MallocF = M.getFunction("malloc");
- Function *FreeF = M.getFunction("free");
- if (!MallocF || !FreeF)
- report_fatal_error(
- "malloc and free must be linked into the module if setjmp is used");
-
// Register saveSetjmp function
FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
SmallVector<Type *, 4> Params = {SetjmpFTy->getParamType(0),
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
index 0020817aee41..ee708d637b25 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Lower @llvm.global_dtors.
+/// Lower @llvm.global_dtors.
///
/// WebAssembly doesn't have a builtin way to invoke static destructors.
/// Implement @llvm.global_dtors by creating wrapper functions that are
@@ -51,6 +51,9 @@ public:
} // End anonymous namespace
char LowerGlobalDtors::ID = 0;
+INITIALIZE_PASS(LowerGlobalDtors, DEBUG_TYPE,
+ "Lower @llvm.global_dtors for WebAssembly", false, false)
+
ModulePass *llvm::createWebAssemblyLowerGlobalDtors() {
return new LowerGlobalDtors();
}
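The truncated doc comment above outlines the approach: since WebAssembly has no builtin static-destructor mechanism, each destructor is wrapped and registered at startup via __cxa_atexit. A rough hand-written C++ equivalent of that shape (illustration only, not the pass's generated IR):

    // Itanium ABI entry point; real generated code would normally pass
    // &__dso_handle rather than nullptr as the last argument.
    extern "C" int __cxa_atexit(void (*func)(void *), void *arg, void *dso_handle);

    static void my_global_dtor(void *) {
      // body of a destructor that used to be listed in @llvm.global_dtors
    }

    // Acts like an entry in @llvm.global_ctors: runs at startup and arranges
    // for my_global_dtor to run at program exit.
    __attribute__((constructor)) static void register_my_global_dtor() {
      __cxa_atexit(my_global_dtor, nullptr, nullptr);
    }

    int main() { return 0; }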
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 4a93d4810c7d..d85db14fc679 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains code to lower WebAssembly MachineInstrs to their
+/// This file contains code to lower WebAssembly MachineInstrs to their
/// corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
@@ -25,7 +25,6 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -34,11 +33,7 @@ using namespace llvm;
MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
const GlobalValue *Global = MO.getGlobal();
- MCSymbol *Sym = Printer.getSymbol(Global);
- if (isa<MCSymbolELF>(Sym))
- return Sym;
-
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global));
if (const auto *FuncTy = dyn_cast<FunctionType>(Global->getValueType())) {
const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
@@ -74,7 +69,7 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
WasmSym->setReturns(std::move(Returns));
WasmSym->setParams(std::move(Params));
- WasmSym->setIsFunction(true);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
}
return WasmSym;
@@ -83,17 +78,22 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
const MachineOperand &MO) const {
const char *Name = MO.getSymbolName();
- MCSymbol *Sym = Printer.GetExternalSymbolSymbol(Name);
- if (isa<MCSymbolELF>(Sym))
- return Sym;
-
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ MCSymbolWasm *WasmSym =
+ cast<MCSymbolWasm>(Printer.GetExternalSymbolSymbol(Name));
const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
// __stack_pointer is a global variable; all other external symbols used by
- // CodeGen are functions.
- if (strcmp(Name, "__stack_pointer") == 0)
+ // CodeGen are functions. It's OK to hardcode knowledge of specific symbols
+ // here; this method is precisely there for fetching the signatures of known
+ // Clang-provided symbols.
+ if (strcmp(Name, "__stack_pointer") == 0) {
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ WasmSym->setGlobalType(wasm::WasmGlobalType{
+ uint8_t(Subtarget.hasAddr64() ? wasm::WASM_TYPE_I64
+ : wasm::WASM_TYPE_I32),
+ true});
return WasmSym;
+ }
SmallVector<wasm::ValType, 4> Returns;
SmallVector<wasm::ValType, 4> Params;
@@ -101,7 +101,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
WasmSym->setReturns(std::move(Returns));
WasmSym->setParams(std::move(Params));
- WasmSym->setIsFunction(true);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
return WasmSym;
}
@@ -169,35 +169,32 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
const MCOperandInfo &Info = Desc.OpInfo[i];
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
MCSymbol *Sym = Printer.createTempSymbol("typeindex");
- if (!isa<MCSymbolELF>(Sym)) {
- SmallVector<wasm::ValType, 4> Returns;
- SmallVector<wasm::ValType, 4> Params;
-
- const MachineRegisterInfo &MRI =
- MI->getParent()->getParent()->getRegInfo();
- for (const MachineOperand &MO : MI->defs())
- Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
- for (const MachineOperand &MO : MI->explicit_uses())
- if (MO.isReg())
- Params.push_back(getType(MRI.getRegClass(MO.getReg())));
-
- // call_indirect instructions have a callee operand at the end which
- // doesn't count as a param.
- if (WebAssembly::isCallIndirect(*MI))
- Params.pop_back();
-
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
- WasmSym->setReturns(std::move(Returns));
- WasmSym->setParams(std::move(Params));
- WasmSym->setIsFunction(true);
-
- const MCExpr *Expr =
- MCSymbolRefExpr::create(WasmSym,
- MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX,
- Ctx);
- MCOp = MCOperand::createExpr(Expr);
- break;
- }
+
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+
+ const MachineRegisterInfo &MRI =
+ MI->getParent()->getParent()->getRegInfo();
+ for (const MachineOperand &MO : MI->defs())
+ Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
+ for (const MachineOperand &MO : MI->explicit_uses())
+ if (MO.isReg())
+ Params.push_back(getType(MRI.getRegClass(MO.getReg())));
+
+ // call_indirect instructions have a callee operand at the end which
+ // doesn't count as a param.
+ if (WebAssembly::isCallIndirect(*MI))
+ Params.pop_back();
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+
+ const MCExpr *Expr = MCSymbolRefExpr::create(
+ WasmSym, MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX, Ctx);
+ MCOp = MCOperand::createExpr(Expr);
+ break;
}
}
MCOp = MCOperand::createImm(MO.getImm());
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index d1d2794c3b8f..41b4313bb38c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the class to lower WebAssembly MachineInstrs to
+/// This file declares the class to lower WebAssembly MachineInstrs to
/// their corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index ccf6a18b32ea..e511e574050f 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements WebAssembly-specific per-machine-function
+/// This file implements WebAssembly-specific per-machine-function
/// information.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 1fcbb7791d4e..a60b10fc5309 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares WebAssembly-specific per-machine-function
+/// This file declares WebAssembly-specific per-machine-function
/// information.
///
//===----------------------------------------------------------------------===//
@@ -60,6 +60,8 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
void addResult(MVT VT) { Results.push_back(VT); }
const std::vector<MVT> &getResults() const { return Results; }
+ void clearParamsAndResults() { Params.clear(); Results.clear(); }
+
void setNumLocals(size_t NumLocals) { Locals.resize(NumLocals, MVT::i32); }
void setLocal(size_t i, MVT VT) { Locals[i] = VT; }
void addLocal(MVT VT) { Locals.push_back(VT); }
@@ -81,25 +83,29 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
void stackifyVReg(unsigned VReg) {
assert(MF.getRegInfo().getUniqueVRegDef(VReg));
- if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
- VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1);
- VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg));
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ if (I >= VRegStackified.size())
+ VRegStackified.resize(I + 1);
+ VRegStackified.set(I);
}
bool isVRegStackified(unsigned VReg) const {
- if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ if (I >= VRegStackified.size())
return false;
- return VRegStackified.test(TargetRegisterInfo::virtReg2Index(VReg));
+ return VRegStackified.test(I);
}
void initWARegs();
void setWAReg(unsigned VReg, unsigned WAReg) {
assert(WAReg != UnusedReg);
- assert(TargetRegisterInfo::virtReg2Index(VReg) < WARegs.size());
- WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ assert(I < WARegs.size());
+ WARegs[I] = WAReg;
}
- unsigned getWAReg(unsigned Reg) const {
- assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
- return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
+ unsigned getWAReg(unsigned VReg) const {
+ auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ assert(I < WARegs.size());
+ return WARegs[I];
}
// For a given stackified WAReg, return the id number to print with push/pop.
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index ebe97848d461..04ac22a589ea 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -8,11 +8,11 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Optimize LiveIntervals for use in a post-RA context.
+/// Optimize LiveIntervals for use in a post-RA context.
//
/// LiveIntervals normally runs before register allocation when the code is
/// only recently lowered out of SSA form, so it's uncommon for registers to
-/// have multiple defs, and then they do, the defs are usually closely related.
+/// have multiple defs, and when they do, the defs are usually closely related.
/// Later, after coalescing, tail duplication, and other optimizations, it's
/// more common to see registers with multiple unrelated defs. This pass
/// updates LiveIntervals to distribute the value numbers across separate
@@ -58,14 +58,17 @@ public:
} // end anonymous namespace
char WebAssemblyOptimizeLiveIntervals::ID = 0;
+INITIALIZE_PASS(WebAssemblyOptimizeLiveIntervals, DEBUG_TYPE,
+ "Optimize LiveIntervals for WebAssembly", false, false)
+
FunctionPass *llvm::createWebAssemblyOptimizeLiveIntervals() {
return new WebAssemblyOptimizeLiveIntervals();
}
bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
MachineRegisterInfo &MRI = MF.getRegInfo();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 559165e4c86b..113ee2532bce 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Optimize calls with "returned" attributes for WebAssembly.
+/// Optimize calls with "returned" attributes for WebAssembly.
///
//===----------------------------------------------------------------------===//
@@ -48,6 +48,10 @@ public:
} // End anonymous namespace
char OptimizeReturned::ID = 0;
+INITIALIZE_PASS(OptimizeReturned, DEBUG_TYPE,
+ "Optimize calls with \"returned\" attributes for WebAssembly",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
return new OptimizeReturned();
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index d2fbc5a22308..a54484407805 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Late peephole optimizations for WebAssembly.
+/// Late peephole optimizations for WebAssembly.
///
//===----------------------------------------------------------------------===//
@@ -50,6 +50,9 @@ public:
} // end anonymous namespace
char WebAssemblyPeephole::ID = 0;
+INITIALIZE_PASS(WebAssemblyPeephole, DEBUG_TYPE,
+ "WebAssembly peephole optimizations", false, false)
+
FunctionPass *llvm::createWebAssemblyPeephole() {
return new WebAssemblyPeephole();
}
@@ -80,18 +83,13 @@ static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
return false;
if (&MBB != &MF.back())
return false;
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF()) {
- if (&MI != &MBB.back())
- return false;
- } else {
- MachineBasicBlock::iterator End = MBB.end();
- --End;
- assert(End->getOpcode() == WebAssembly::END_FUNCTION);
- --End;
- if (&MI != &*End)
- return false;
- }
+
+ MachineBasicBlock::iterator End = MBB.end();
+ --End;
+ assert(End->getOpcode() == WebAssembly::END_FUNCTION);
+ --End;
+ if (&MI != &*End)
+ return false;
if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) {
// If the operand isn't stackified, insert a COPY to read the operand and
@@ -113,7 +111,7 @@ static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
}
bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Peephole **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 3a2876bfcde2..e44e7057e233 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Fix up code to meet LiveInterval's requirements.
+/// Fix up code to meet LiveInterval's requirements.
///
/// Some CodeGen passes don't preserve LiveInterval's requirements, because
/// they run after register allocation and it isn't important. However,
@@ -55,6 +55,9 @@ private:
} // end anonymous namespace
char WebAssemblyPrepareForLiveIntervals::ID = 0;
+INITIALIZE_PASS(WebAssemblyPrepareForLiveIntervals, DEBUG_TYPE,
+ "Fix up code for LiveIntervals", false, false)
+
FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() {
return new WebAssemblyPrepareForLiveIntervals();
}
@@ -68,7 +71,7 @@ static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
}
bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Prepare For LiveIntervals **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 2ac3a839c3c8..d69a27937105 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a virtual register coloring pass.
+/// This file implements a virtual register coloring pass.
///
/// WebAssembly doesn't have a fixed number of registers, but it is still
/// desirable to minimize the total number of registers used in each function.
@@ -55,6 +55,9 @@ private:
} // end anonymous namespace
char WebAssemblyRegColoring::ID = 0;
+INITIALIZE_PASS(WebAssemblyRegColoring, DEBUG_TYPE,
+ "Minimize number of registers used", false, false)
+
FunctionPass *llvm::createWebAssemblyRegColoring() {
return new WebAssemblyRegColoring();
}
@@ -71,7 +74,7 @@ static float computeWeight(const MachineRegisterInfo *MRI,
}
bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Register Coloring **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
@@ -94,7 +97,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
SmallVector<LiveInterval *, 0> SortedIntervals;
SortedIntervals.reserve(NumVRegs);
- DEBUG(dbgs() << "Interesting register intervals:\n");
+ LLVM_DEBUG(dbgs() << "Interesting register intervals:\n");
for (unsigned i = 0; i < NumVRegs; ++i) {
unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
if (MFI.isVRegStackified(VReg))
@@ -106,27 +109,27 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
LiveInterval *LI = &Liveness->getInterval(VReg);
assert(LI->weight == 0.0f);
LI->weight = computeWeight(MRI, MBFI, VReg);
- DEBUG(LI->dump());
+ LLVM_DEBUG(LI->dump());
SortedIntervals.push_back(LI);
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// Sort them to put arguments first (since we don't want to rename live-in
// registers), by weight next, and then by position.
// TODO: Investigate more intelligent sorting heuristics. For starters, we
// should try to coalesce adjacent live intervals before non-adjacent ones.
- std::sort(SortedIntervals.begin(), SortedIntervals.end(),
- [MRI](LiveInterval *LHS, LiveInterval *RHS) {
- if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
- return MRI->isLiveIn(LHS->reg);
- if (LHS->weight != RHS->weight)
- return LHS->weight > RHS->weight;
- if (LHS->empty() || RHS->empty())
- return !LHS->empty() && RHS->empty();
- return *LHS < *RHS;
- });
-
- DEBUG(dbgs() << "Coloring register intervals:\n");
+ llvm::sort(SortedIntervals.begin(), SortedIntervals.end(),
+ [MRI](LiveInterval *LHS, LiveInterval *RHS) {
+ if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
+ return MRI->isLiveIn(LHS->reg);
+ if (LHS->weight != RHS->weight)
+ return LHS->weight > RHS->weight;
+ if (LHS->empty() || RHS->empty())
+ return !LHS->empty() && RHS->empty();
+ return *LHS < *RHS;
+ });
+
+ LLVM_DEBUG(dbgs() << "Coloring register intervals:\n");
SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u);
SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments(
SortedIntervals.size());
@@ -156,9 +159,9 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
Changed |= Old != New;
UsedColors.set(Color);
Assignments[Color].push_back(LI);
- DEBUG(dbgs() << "Assigning vreg"
- << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg"
- << TargetRegisterInfo::virtReg2Index(New) << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Assigning vreg" << TargetRegisterInfo::virtReg2Index(LI->reg)
+ << " to vreg" << TargetRegisterInfo::virtReg2Index(New) << "\n");
}
if (!Changed)
return false;
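
The std::sort to llvm::sort change above keeps the same tiered ordering described in the comment: live-in (argument) registers first, then heavier weights, then earlier position, with empty intervals ordered after non-empty ones; llvm::sort additionally shuffles its input in expensive-checks builds to flush out comparators that are not strict weak orderings. A standalone sketch of that tie-breaking comparator, with a simplified Interval type standing in for LiveInterval, might look like:

    #include <algorithm>
    #include <vector>

    // Simplified stand-in for LiveInterval: just the fields the comparator
    // consults in this pass.
    struct Interval {
      bool IsLiveIn;   // argument/live-in registers sort first
      float Weight;    // heavier (hotter) intervals sort earlier
      int Start;       // earlier position breaks the remaining ties
      bool Empty;
    };

    static bool Before(const Interval &L, const Interval &R) {
      if (L.IsLiveIn != R.IsLiveIn)
        return L.IsLiveIn;                 // live-ins first
      if (L.Weight != R.Weight)
        return L.Weight > R.Weight;        // then by descending weight
      if (L.Empty || R.Empty)
        return !L.Empty && R.Empty;        // non-empty before empty
      return L.Start < R.Start;            // finally by position
    }

    int main() {
      std::vector<Interval> Ivs = {{false, 1.0f, 4, false},
                                   {true, 0.5f, 9, false},
                                   {false, 1.0f, 2, false}};
      std::sort(Ivs.begin(), Ivs.end(), Before);
      return 0;
    }
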
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index 766ab456a8e6..1e2a248f097e 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a pass which assigns WebAssembly register
+/// This file implements a pass which assigns WebAssembly register
/// numbers for CodeGen virtual registers.
///
//===----------------------------------------------------------------------===//
@@ -51,14 +51,18 @@ public:
} // end anonymous namespace
char WebAssemblyRegNumbering::ID = 0;
+INITIALIZE_PASS(WebAssemblyRegNumbering, DEBUG_TYPE,
+ "Assigns WebAssembly register numbers for virtual registers",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyRegNumbering() {
return new WebAssemblyRegNumbering();
}
bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Register Numbering **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Register Numbering **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -73,8 +77,8 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
break;
int64_t Imm = MI.getOperand(1).getImm();
- DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
- << Imm << "\n");
+ LLVM_DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg()
+ << " -> WAReg " << Imm << "\n");
MFI.setWAReg(MI.getOperand(0).getReg(), Imm);
}
@@ -92,13 +96,13 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
continue;
// Handle stackified registers.
if (MFI.isVRegStackified(VReg)) {
- DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
- << (INT32_MIN | NumStackRegs) << "\n");
+ LLVM_DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
+ << (INT32_MIN | NumStackRegs) << "\n");
MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++);
continue;
}
if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) {
- DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
+ LLVM_DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
MFI.setWAReg(VReg, CurReg++);
}
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index a4bb967f36f6..9f5d5bd87831 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a register stacking pass.
+/// This file implements a register stacking pass.
///
/// This pass reorders instructions to put register uses and defs in an order
/// such that they form single-use expression trees. Registers fitting this form
@@ -67,6 +67,10 @@ public:
} // end anonymous namespace
char WebAssemblyRegStackify::ID = 0;
+INITIALIZE_PASS(WebAssemblyRegStackify, DEBUG_TYPE,
+ "Reorder instructions to use the WebAssembly value stack",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyRegStackify() {
return new WebAssemblyRegStackify();
}
@@ -156,10 +160,9 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
// and/or uses the stack pointer value.
static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
bool &Write, bool &Effects, bool &StackPointer) {
- assert(!MI.isPosition());
assert(!MI.isTerminator());
- if (MI.isDebugValue())
+ if (MI.isDebugInstr() || MI.isPosition())
return;
// Check for loads.
@@ -469,7 +472,7 @@ static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
MachineInstr *Insert, LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI) {
- DEBUG(dbgs() << "Move for single use: "; Def->dump());
+ LLVM_DEBUG(dbgs() << "Move for single use: "; Def->dump());
MBB.splice(Insert, &MBB, Def);
LIS.handleMove(*Def);
@@ -496,7 +499,7 @@ static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
MFI.stackifyVReg(NewReg);
- DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump());
}
ImposeStackOrdering(Def);
@@ -510,8 +513,8 @@ static MachineInstr *RematerializeCheapDef(
MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
- DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
- DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
+ LLVM_DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
+ LLVM_DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
@@ -522,7 +525,7 @@ static MachineInstr *RematerializeCheapDef(
MFI.stackifyVReg(NewReg);
ImposeStackOrdering(Clone);
- DEBUG(dbgs() << " - Cloned to "; Clone->dump());
+ LLVM_DEBUG(dbgs() << " - Cloned to "; Clone->dump());
// Shrink the interval.
bool IsDead = MRI.use_empty(Reg);
@@ -534,7 +537,7 @@ static MachineInstr *RematerializeCheapDef(
// If that was the last use of the original, delete the original.
if (IsDead) {
- DEBUG(dbgs() << " - Deleting original\n");
+ LLVM_DEBUG(dbgs() << " - Deleting original\n");
SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
LIS.removeInterval(Reg);
@@ -569,7 +572,7 @@ static MachineInstr *MoveAndTeeForMultiUse(
unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
- DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
+ LLVM_DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
// Move Def into place.
MBB.splice(Insert, &MBB, Def);
@@ -605,8 +608,8 @@ static MachineInstr *MoveAndTeeForMultiUse(
ImposeStackOrdering(Def);
ImposeStackOrdering(Tee);
- DEBUG(dbgs() << " - Replaced register: "; Def->dump());
- DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
+ LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ LLVM_DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
return Def;
}
@@ -733,9 +736,9 @@ public:
} // end anonymous namespace
bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** Register Stackifying **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Register Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -746,14 +749,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
- // Disable the TEE optimization if we aren't doing direct wasm object
- // emission, because lowering TEE to TEE_LOCAL is done in the ExplicitLocals
- // pass, which is also disabled.
- bool UseTee = true;
- if (MF.getSubtarget<WebAssemblySubtarget>()
- .getTargetTriple().isOSBinFormatELF())
- UseTee = false;
-
// Walk the instructions from the bottom up. Currently we don't look past
// block boundaries, and the blocks aren't ordered so the block visitation
// order isn't significant, but we may want to change this in the future.
@@ -819,7 +814,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Insert =
RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
- } else if (UseTee && CanMove &&
+ } else if (CanMove &&
OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
MRI, TII);
@@ -867,7 +862,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
SmallVector<unsigned, 0> Stack;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
for (MachineOperand &MO : reverse(MI.explicit_operands())) {
if (!MO.isReg())
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 5e7ebd19fac7..b6481ac2d4ae 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// TargetRegisterInfo class.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
index ad1d71eebf22..2a73dfd4b065 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the WebAssembly implementation of the
+/// This file contains the WebAssembly implementation of the
/// WebAssemblyRegisterInfo class.
///
//===----------------------------------------------------------------------===//
@@ -45,6 +45,8 @@ public:
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
unsigned Kind = 0) const override;
+ // This does not apply to wasm.
+ const uint32_t *getNoPreservedMask() const override { return nullptr; }
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 90888100be17..29f42b96b249 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file describes the WebAssembly register classes and some nominal
+/// This file describes the WebAssembly register classes and some nominal
/// physical registers.
///
//===----------------------------------------------------------------------===//
@@ -34,13 +34,18 @@ def SP32 : WebAssemblyReg<"%SP32">;
def SP64 : WebAssemblyReg<"%SP64">;
// The register allocation framework requires register classes have at least
-// one register, so we define a few for the floating point register classes
-// since we otherwise don't need a physical register in those classes.
+// one register, so we define a few for the integer / floating point register
+// classes since we otherwise don't need a physical register in those classes.
+// These are also used as "types" in the generated assembly matcher.
+def I32_0 : WebAssemblyReg<"%i32.0">;
+def I64_0 : WebAssemblyReg<"%i64.0">;
def F32_0 : WebAssemblyReg<"%f32.0">;
def F64_0 : WebAssemblyReg<"%f64.0">;
def V128_0: WebAssemblyReg<"%v128">;
+def EXCEPT_REF_0 : WebAssemblyReg<"%except_ref.0">;
+
// The value stack "register". This is an opaque entity which serves to order
// uses and defs that must remain in LIFO order.
def VALUE_STACK : WebAssemblyReg<"STACK">;
@@ -54,9 +59,10 @@ def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">;
// Register classes
//===----------------------------------------------------------------------===//
-def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>;
-def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>;
+def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>;
+def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>;
def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
def V128 : WebAssemblyRegClass<[v4f32, v4i32, v16i8, v8i16], 128, (add V128_0)>;
+def EXCEPT_REF : WebAssemblyRegClass<[ExceptRef], 0, (add EXCEPT_REF_0)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
index 878ffd08d228..f432b367d156 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements a pass that replaces physical registers with
+/// This file implements a pass that replaces physical registers with
/// virtual registers.
///
/// LLVM expects certain physical registers, such as a stack pointer. However,
@@ -53,12 +53,16 @@ private:
} // end anonymous namespace
char WebAssemblyReplacePhysRegs::ID = 0;
+INITIALIZE_PASS(WebAssemblyReplacePhysRegs, DEBUG_TYPE,
+ "Replace physical registers with virtual registers",
+ false, false)
+
FunctionPass *llvm::createWebAssemblyReplacePhysRegs() {
return new WebAssemblyReplacePhysRegs();
}
bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Replace Physical Registers **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index f808c063d7e4..fe8a5e4c06f1 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains signature information for runtime libcalls.
+/// This file contains signature information for runtime libcalls.
///
/// CodeGen uses external symbols, which it refers to by name. The WebAssembly
/// target needs type information for all functions. This file contains a big
@@ -22,6 +22,7 @@
#include "WebAssemblyRuntimeLibcallSignatures.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/Support/ManagedStatic.h"
using namespace llvm;
@@ -58,13 +59,16 @@ enum RuntimeLibcallSignature {
i32_func_f32_f32,
i32_func_f64_f64,
i32_func_i32_i32,
+ i32_func_i32_i32_iPTR,
i64_func_i64_i64,
+ i64_func_i64_i64_iPTR,
i64_i64_func_f32,
i64_i64_func_f64,
i16_i16_func_i16_i16,
i32_i32_func_i32_i32,
i64_i64_func_i64_i64,
i64_i64_func_i64_i64_i64_i64,
+ i64_i64_func_i64_i64_i64_i64_iPTR,
i64_i64_i64_i64_func_i64_i64_i64_i64,
i64_i64_func_i64_i64_i32,
iPTR_func_iPTR_i32_iPTR,
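
The three new signatures ending in _iPTR added above cover the overflow-checking multiply libcalls (__mulosi4, __mulodi4, __muloti4), which, per compiler-rt's documented convention, return the product and write the overflow flag through a trailing pointer argument; the MULO_* table entries further down are updated to match. As an illustrative (not compiler-rt's actual) implementation of that calling convention for the 32-bit case:

    #include <cstdint>
    #include <iostream>

    // Illustrative overflow-checking multiply in the shape of __mulosi4:
    // the product is returned and *Overflow is set to nonzero on overflow.
    static std::int32_t MulOverflow(std::int32_t A, std::int32_t B,
                                    int *Overflow) {
      std::int64_t Wide = static_cast<std::int64_t>(A) * B;
      *Overflow = (Wide > INT32_MAX || Wide < INT32_MIN);
      return static_cast<std::int32_t>(Wide);
    }

    int main() {
      int Ovf = 0;
      std::int32_t P = MulOverflow(1 << 20, 1 << 15, &Ovf);
      std::cout << P << " overflow=" << Ovf << "\n";
      return 0;
    }
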
@@ -84,918 +88,405 @@ enum RuntimeLibcallSignature {
unsupported
};
-} // end anonymous namespace
-
-static const RuntimeLibcallSignature
-RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
-// Integer
-/* SHL_I16 */ i16_func_i16_i16,
-/* SHL_I32 */ i32_func_i32_i32,
-/* SHL_I64 */ i64_func_i64_i64,
-/* SHL_I128 */ i64_i64_func_i64_i64_i32,
-/* SRL_I16 */ i16_func_i16_i16,
-/* SRL_I32 */ i32_func_i32_i32,
-/* SRL_I64 */ i64_func_i64_i64,
-/* SRL_I128 */ i64_i64_func_i64_i64_i32,
-/* SRA_I16 */ i16_func_i16_i16,
-/* SRA_I32 */ i32_func_i32_i32,
-/* SRA_I64 */ i64_func_i64_i64,
-/* SRA_I128 */ i64_i64_func_i64_i64_i32,
-/* MUL_I8 */ i8_func_i8_i8,
-/* MUL_I16 */ i16_func_i16_i16,
-/* MUL_I32 */ i32_func_i32_i32,
-/* MUL_I64 */ i64_func_i64_i64,
-/* MUL_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* MULO_I32 */ i32_func_i32_i32,
-/* MULO_I64 */ i64_func_i64_i64,
-/* MULO_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* SDIV_I8 */ i8_func_i8_i8,
-/* SDIV_I16 */ i16_func_i16_i16,
-/* SDIV_I32 */ i32_func_i32_i32,
-/* SDIV_I64 */ i64_func_i64_i64,
-/* SDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* UDIV_I8 */ i8_func_i8_i8,
-/* UDIV_I16 */ i16_func_i16_i16,
-/* UDIV_I32 */ i32_func_i32_i32,
-/* UDIV_I64 */ i64_func_i64_i64,
-/* UDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* SREM_I8 */ i8_func_i8_i8,
-/* SREM_I16 */ i16_func_i16_i16,
-/* SREM_I32 */ i32_func_i32_i32,
-/* SREM_I64 */ i64_func_i64_i64,
-/* SREM_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* UREM_I8 */ i8_func_i8_i8,
-/* UREM_I16 */ i16_func_i16_i16,
-/* UREM_I32 */ i32_func_i32_i32,
-/* UREM_I64 */ i64_func_i64_i64,
-/* UREM_I128 */ i64_i64_func_i64_i64_i64_i64,
-/* SDIVREM_I8 */ i8_func_i8_i8,
-/* SDIVREM_I16 */ i16_i16_func_i16_i16,
-/* SDIVREM_I32 */ i32_i32_func_i32_i32,
-/* SDIVREM_I64 */ i64_func_i64_i64,
-/* SDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
-/* UDIVREM_I8 */ i8_func_i8_i8,
-/* UDIVREM_I16 */ i16_i16_func_i16_i16,
-/* UDIVREM_I32 */ i32_i32_func_i32_i32,
-/* UDIVREM_I64 */ i64_i64_func_i64_i64,
-/* UDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
-/* NEG_I32 */ i32_func_i32,
-/* NEG_I64 */ i64_func_i64,
-
-// FLOATING POINT
-/* ADD_F32 */ f32_func_f32_f32,
-/* ADD_F64 */ f64_func_f64_f64,
-/* ADD_F80 */ unsupported,
-/* ADD_F128 */ func_iPTR_i64_i64_i64_i64,
-/* ADD_PPCF128 */ unsupported,
-/* SUB_F32 */ f32_func_f32_f32,
-/* SUB_F64 */ f64_func_f64_f64,
-/* SUB_F80 */ unsupported,
-/* SUB_F128 */ func_iPTR_i64_i64_i64_i64,
-/* SUB_PPCF128 */ unsupported,
-/* MUL_F32 */ f32_func_f32_f32,
-/* MUL_F64 */ f64_func_f64_f64,
-/* MUL_F80 */ unsupported,
-/* MUL_F128 */ func_iPTR_i64_i64_i64_i64,
-/* MUL_PPCF128 */ unsupported,
-/* DIV_F32 */ f32_func_f32_f32,
-/* DIV_F64 */ f64_func_f64_f64,
-/* DIV_F80 */ unsupported,
-/* DIV_F128 */ func_iPTR_i64_i64_i64_i64,
-/* DIV_PPCF128 */ unsupported,
-/* REM_F32 */ f32_func_f32_f32,
-/* REM_F64 */ f64_func_f64_f64,
-/* REM_F80 */ unsupported,
-/* REM_F128 */ func_iPTR_i64_i64_i64_i64,
-/* REM_PPCF128 */ unsupported,
-/* FMA_F32 */ f32_func_f32_f32_f32,
-/* FMA_F64 */ f64_func_f64_f64_f64,
-/* FMA_F80 */ unsupported,
-/* FMA_F128 */ func_iPTR_i64_i64_i64_i64_i64_i64,
-/* FMA_PPCF128 */ unsupported,
-/* POWI_F32 */ f32_func_f32_i32,
-/* POWI_F64 */ f64_func_f64_i32,
-/* POWI_F80 */ unsupported,
-/* POWI_F128 */ func_iPTR_i64_i64_i64_i64,
-/* POWI_PPCF128 */ unsupported,
-/* SQRT_F32 */ f32_func_f32,
-/* SQRT_F64 */ f64_func_f64,
-/* SQRT_F80 */ unsupported,
-/* SQRT_F128 */ func_iPTR_i64_i64,
-/* SQRT_PPCF128 */ unsupported,
-/* LOG_F32 */ f32_func_f32,
-/* LOG_F64 */ f64_func_f64,
-/* LOG_F80 */ unsupported,
-/* LOG_F128 */ func_iPTR_i64_i64,
-/* LOG_PPCF128 */ unsupported,
-/* LOG2_F32 */ f32_func_f32,
-/* LOG2_F64 */ f64_func_f64,
-/* LOG2_F80 */ unsupported,
-/* LOG2_F128 */ func_iPTR_i64_i64,
-/* LOG2_PPCF128 */ unsupported,
-/* LOG10_F32 */ f32_func_f32,
-/* LOG10_F64 */ f64_func_f64,
-/* LOG10_F80 */ unsupported,
-/* LOG10_F128 */ func_iPTR_i64_i64,
-/* LOG10_PPCF128 */ unsupported,
-/* EXP_F32 */ f32_func_f32,
-/* EXP_F64 */ f64_func_f64,
-/* EXP_F80 */ unsupported,
-/* EXP_F128 */ func_iPTR_i64_i64,
-/* EXP_PPCF128 */ unsupported,
-/* EXP2_F32 */ f32_func_f32,
-/* EXP2_F64 */ f64_func_f64,
-/* EXP2_F80 */ unsupported,
-/* EXP2_F128 */ func_iPTR_i64_i64,
-/* EXP2_PPCF128 */ unsupported,
-/* SIN_F32 */ f32_func_f32,
-/* SIN_F64 */ f64_func_f64,
-/* SIN_F80 */ unsupported,
-/* SIN_F128 */ func_iPTR_i64_i64,
-/* SIN_PPCF128 */ unsupported,
-/* COS_F32 */ f32_func_f32,
-/* COS_F64 */ f64_func_f64,
-/* COS_F80 */ unsupported,
-/* COS_F128 */ func_iPTR_i64_i64,
-/* COS_PPCF128 */ unsupported,
-/* SINCOS_F32 */ func_f32_iPTR_iPTR,
-/* SINCOS_F64 */ func_f64_iPTR_iPTR,
-/* SINCOS_F80 */ unsupported,
-/* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,
-/* SINCOS_PPCF128 */ unsupported,
-/* SINCOS_STRET_F32 */ unsupported,
-/* SINCOS_STRET_F64 */ unsupported,
-/* POW_F32 */ f32_func_f32_f32,
-/* POW_F64 */ f64_func_f64_f64,
-/* POW_F80 */ unsupported,
-/* POW_F128 */ func_iPTR_i64_i64_i64_i64,
-/* POW_PPCF128 */ unsupported,
-/* CEIL_F32 */ f32_func_f32,
-/* CEIL_F64 */ f64_func_f64,
-/* CEIL_F80 */ unsupported,
-/* CEIL_F128 */ func_iPTR_i64_i64,
-/* CEIL_PPCF128 */ unsupported,
-/* TRUNC_F32 */ f32_func_f32,
-/* TRUNC_F64 */ f64_func_f64,
-/* TRUNC_F80 */ unsupported,
-/* TRUNC_F128 */ func_iPTR_i64_i64,
-/* TRUNC_PPCF128 */ unsupported,
-/* RINT_F32 */ f32_func_f32,
-/* RINT_F64 */ f64_func_f64,
-/* RINT_F80 */ unsupported,
-/* RINT_F128 */ func_iPTR_i64_i64,
-/* RINT_PPCF128 */ unsupported,
-/* NEARBYINT_F32 */ f32_func_f32,
-/* NEARBYINT_F64 */ f64_func_f64,
-/* NEARBYINT_F80 */ unsupported,
-/* NEARBYINT_F128 */ func_iPTR_i64_i64,
-/* NEARBYINT_PPCF128 */ unsupported,
-/* ROUND_F32 */ f32_func_f32,
-/* ROUND_F64 */ f64_func_f64,
-/* ROUND_F80 */ unsupported,
-/* ROUND_F128 */ func_iPTR_i64_i64,
-/* ROUND_PPCF128 */ unsupported,
-/* FLOOR_F32 */ f32_func_f32,
-/* FLOOR_F64 */ f64_func_f64,
-/* FLOOR_F80 */ unsupported,
-/* FLOOR_F128 */ func_iPTR_i64_i64,
-/* FLOOR_PPCF128 */ unsupported,
-/* COPYSIGN_F32 */ f32_func_f32_f32,
-/* COPYSIGN_F64 */ f64_func_f64_f64,
-/* COPYSIGN_F80 */ unsupported,
-/* COPYSIGN_F128 */ func_iPTR_i64_i64_i64_i64,
-/* COPYSIGN_PPCF128 */ unsupported,
-/* FMIN_F32 */ f32_func_f32_f32,
-/* FMIN_F64 */ f64_func_f64_f64,
-/* FMIN_F80 */ unsupported,
-/* FMIN_F128 */ func_iPTR_i64_i64_i64_i64,
-/* FMIN_PPCF128 */ unsupported,
-/* FMAX_F32 */ f32_func_f32_f32,
-/* FMAX_F64 */ f64_func_f64_f64,
-/* FMAX_F80 */ unsupported,
-/* FMAX_F128 */ func_iPTR_i64_i64_i64_i64,
-/* FMAX_PPCF128 */ unsupported,
-
-// CONVERSION
-/* FPEXT_F32_PPCF128 */ unsupported,
-/* FPEXT_F64_PPCF128 */ unsupported,
-/* FPEXT_F64_F128 */ func_iPTR_f64,
-/* FPEXT_F32_F128 */ func_iPTR_f32,
-/* FPEXT_F32_F64 */ f64_func_f32,
-/* FPEXT_F16_F32 */ f32_func_i16,
-/* FPROUND_F32_F16 */ i16_func_f32,
-/* FPROUND_F64_F16 */ unsupported,
-/* FPROUND_F80_F16 */ unsupported,
-/* FPROUND_F128_F16 */ unsupported,
-/* FPROUND_PPCF128_F16 */ unsupported,
-/* FPROUND_F64_F32 */ f32_func_f64,
-/* FPROUND_F80_F32 */ unsupported,
-/* FPROUND_F128_F32 */ f32_func_i64_i64,
-/* FPROUND_PPCF128_F32 */ unsupported,
-/* FPROUND_F80_F64 */ unsupported,
-/* FPROUND_F128_F64 */ f64_func_i64_i64,
-/* FPROUND_PPCF128_F64 */ unsupported,
-/* FPTOSINT_F32_I32 */ i32_func_f32,
-/* FPTOSINT_F32_I64 */ i64_func_f32,
-/* FPTOSINT_F32_I128 */ i64_i64_func_f32,
-/* FPTOSINT_F64_I32 */ i32_func_f64,
-/* FPTOSINT_F64_I64 */ i64_func_f64,
-/* FPTOSINT_F64_I128 */ i64_i64_func_f64,
-/* FPTOSINT_F80_I32 */ unsupported,
-/* FPTOSINT_F80_I64 */ unsupported,
-/* FPTOSINT_F80_I128 */ unsupported,
-/* FPTOSINT_F128_I32 */ i32_func_i64_i64,
-/* FPTOSINT_F128_I64 */ i64_func_i64_i64,
-/* FPTOSINT_F128_I128 */ i64_i64_func_i64_i64,
-/* FPTOSINT_PPCF128_I32 */ unsupported,
-/* FPTOSINT_PPCF128_I64 */ unsupported,
-/* FPTOSINT_PPCF128_I128 */ unsupported,
-/* FPTOUINT_F32_I32 */ i32_func_f32,
-/* FPTOUINT_F32_I64 */ i64_func_f32,
-/* FPTOUINT_F32_I128 */ i64_i64_func_f32,
-/* FPTOUINT_F64_I32 */ i32_func_f64,
-/* FPTOUINT_F64_I64 */ i64_func_f64,
-/* FPTOUINT_F64_I128 */ i64_i64_func_f64,
-/* FPTOUINT_F80_I32 */ unsupported,
-/* FPTOUINT_F80_I64 */ unsupported,
-/* FPTOUINT_F80_I128 */ unsupported,
-/* FPTOUINT_F128_I32 */ i32_func_i64_i64,
-/* FPTOUINT_F128_I64 */ i64_func_i64_i64,
-/* FPTOUINT_F128_I128 */ i64_i64_func_i64_i64,
-/* FPTOUINT_PPCF128_I32 */ unsupported,
-/* FPTOUINT_PPCF128_I64 */ unsupported,
-/* FPTOUINT_PPCF128_I128 */ unsupported,
-/* SINTTOFP_I32_F32 */ f32_func_i32,
-/* SINTTOFP_I32_F64 */ f64_func_i32,
-/* SINTTOFP_I32_F80 */ unsupported,
-/* SINTTOFP_I32_F128 */ func_iPTR_i32,
-/* SINTTOFP_I32_PPCF128 */ unsupported,
-/* SINTTOFP_I64_F32 */ f32_func_i64,
-/* SINTTOFP_I64_F64 */ f64_func_i64,
-/* SINTTOFP_I64_F80 */ unsupported,
-/* SINTTOFP_I64_F128 */ func_iPTR_i64,
-/* SINTTOFP_I64_PPCF128 */ unsupported,
-/* SINTTOFP_I128_F32 */ f32_func_i64_i64,
-/* SINTTOFP_I128_F64 */ f64_func_i64_i64,
-/* SINTTOFP_I128_F80 */ unsupported,
-/* SINTTOFP_I128_F128 */ func_iPTR_i64_i64,
-/* SINTTOFP_I128_PPCF128 */ unsupported,
-/* UINTTOFP_I32_F32 */ f32_func_i32,
-/* UINTTOFP_I32_F64 */ f64_func_i64,
-/* UINTTOFP_I32_F80 */ unsupported,
-/* UINTTOFP_I32_F128 */ func_iPTR_i32,
-/* UINTTOFP_I32_PPCF128 */ unsupported,
-/* UINTTOFP_I64_F32 */ f32_func_i64,
-/* UINTTOFP_I64_F64 */ f64_func_i64,
-/* UINTTOFP_I64_F80 */ unsupported,
-/* UINTTOFP_I64_F128 */ func_iPTR_i64,
-/* UINTTOFP_I64_PPCF128 */ unsupported,
-/* UINTTOFP_I128_F32 */ f32_func_i64_i64,
-/* UINTTOFP_I128_F64 */ f64_func_i64_i64,
-/* UINTTOFP_I128_F80 */ unsupported,
-/* UINTTOFP_I128_F128 */ func_iPTR_i64_i64,
-/* UINTTOFP_I128_PPCF128 */ unsupported,
-
-// COMPARISON
-/* OEQ_F32 */ i32_func_f32_f32,
-/* OEQ_F64 */ i32_func_f64_f64,
-/* OEQ_F128 */ i32_func_i64_i64_i64_i64,
-/* OEQ_PPCF128 */ unsupported,
-/* UNE_F32 */ i32_func_f32_f32,
-/* UNE_F64 */ i32_func_f64_f64,
-/* UNE_F128 */ i32_func_i64_i64_i64_i64,
-/* UNE_PPCF128 */ unsupported,
-/* OGE_F32 */ i32_func_f32_f32,
-/* OGE_F64 */ i32_func_f64_f64,
-/* OGE_F128 */ i32_func_i64_i64_i64_i64,
-/* OGE_PPCF128 */ unsupported,
-/* OLT_F32 */ i32_func_f32_f32,
-/* OLT_F64 */ i32_func_f64_f64,
-/* OLT_F128 */ i32_func_i64_i64_i64_i64,
-/* OLT_PPCF128 */ unsupported,
-/* OLE_F32 */ i32_func_f32_f32,
-/* OLE_F64 */ i32_func_f64_f64,
-/* OLE_F128 */ i32_func_i64_i64_i64_i64,
-/* OLE_PPCF128 */ unsupported,
-/* OGT_F32 */ i32_func_f32_f32,
-/* OGT_F64 */ i32_func_f64_f64,
-/* OGT_F128 */ i32_func_i64_i64_i64_i64,
-/* OGT_PPCF128 */ unsupported,
-/* UO_F32 */ i32_func_f32_f32,
-/* UO_F64 */ i32_func_f64_f64,
-/* UO_F128 */ i32_func_i64_i64_i64_i64,
-/* UO_PPCF128 */ unsupported,
-/* O_F32 */ i32_func_f32_f32,
-/* O_F64 */ i32_func_f64_f64,
-/* O_F128 */ i32_func_i64_i64_i64_i64,
-/* O_PPCF128 */ unsupported,
-
-// MEMORY
-/* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR,
-/* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR,
-/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
-/* BZERO */ unsupported,
-
-// ELEMENT-WISE ATOMIC MEMORY
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
-
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
-
-// EXCEPTION HANDLING
-/* UNWIND_RESUME */ unsupported,
-
-// Note: there's two sets of atomics libcalls; see
-// <http://llvm.org/docs/Atomics.html> for more info on the
-// difference between them.
-
-// Atomic '__sync_*' libcalls.
-/* SYNC_VAL_COMPARE_AND_SWAP_1 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_2 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_4 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_8 */ unsupported,
-/* SYNC_VAL_COMPARE_AND_SWAP_16 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_1 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_2 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_4 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_8 */ unsupported,
-/* SYNC_LOCK_TEST_AND_SET_16 */ unsupported,
-/* SYNC_FETCH_AND_ADD_1 */ unsupported,
-/* SYNC_FETCH_AND_ADD_2 */ unsupported,
-/* SYNC_FETCH_AND_ADD_4 */ unsupported,
-/* SYNC_FETCH_AND_ADD_8 */ unsupported,
-/* SYNC_FETCH_AND_ADD_16 */ unsupported,
-/* SYNC_FETCH_AND_SUB_1 */ unsupported,
-/* SYNC_FETCH_AND_SUB_2 */ unsupported,
-/* SYNC_FETCH_AND_SUB_4 */ unsupported,
-/* SYNC_FETCH_AND_SUB_8 */ unsupported,
-/* SYNC_FETCH_AND_SUB_16 */ unsupported,
-/* SYNC_FETCH_AND_AND_1 */ unsupported,
-/* SYNC_FETCH_AND_AND_2 */ unsupported,
-/* SYNC_FETCH_AND_AND_4 */ unsupported,
-/* SYNC_FETCH_AND_AND_8 */ unsupported,
-/* SYNC_FETCH_AND_AND_16 */ unsupported,
-/* SYNC_FETCH_AND_OR_1 */ unsupported,
-/* SYNC_FETCH_AND_OR_2 */ unsupported,
-/* SYNC_FETCH_AND_OR_4 */ unsupported,
-/* SYNC_FETCH_AND_OR_8 */ unsupported,
-/* SYNC_FETCH_AND_OR_16 */ unsupported,
-/* SYNC_FETCH_AND_XOR_1 */ unsupported,
-/* SYNC_FETCH_AND_XOR_2 */ unsupported,
-/* SYNC_FETCH_AND_XOR_4 */ unsupported,
-/* SYNC_FETCH_AND_XOR_8 */ unsupported,
-/* SYNC_FETCH_AND_XOR_16 */ unsupported,
-/* SYNC_FETCH_AND_NAND_1 */ unsupported,
-/* SYNC_FETCH_AND_NAND_2 */ unsupported,
-/* SYNC_FETCH_AND_NAND_4 */ unsupported,
-/* SYNC_FETCH_AND_NAND_8 */ unsupported,
-/* SYNC_FETCH_AND_NAND_16 */ unsupported,
-/* SYNC_FETCH_AND_MAX_1 */ unsupported,
-/* SYNC_FETCH_AND_MAX_2 */ unsupported,
-/* SYNC_FETCH_AND_MAX_4 */ unsupported,
-/* SYNC_FETCH_AND_MAX_8 */ unsupported,
-/* SYNC_FETCH_AND_MAX_16 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_1 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_2 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_4 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_8 */ unsupported,
-/* SYNC_FETCH_AND_UMAX_16 */ unsupported,
-/* SYNC_FETCH_AND_MIN_1 */ unsupported,
-/* SYNC_FETCH_AND_MIN_2 */ unsupported,
-/* SYNC_FETCH_AND_MIN_4 */ unsupported,
-/* SYNC_FETCH_AND_MIN_8 */ unsupported,
-/* SYNC_FETCH_AND_MIN_16 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_1 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_2 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_4 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_8 */ unsupported,
-/* SYNC_FETCH_AND_UMIN_16 */ unsupported,
-
-// Atomic '__atomic_*' libcalls.
-/* ATOMIC_LOAD */ unsupported,
-/* ATOMIC_LOAD_1 */ unsupported,
-/* ATOMIC_LOAD_2 */ unsupported,
-/* ATOMIC_LOAD_4 */ unsupported,
-/* ATOMIC_LOAD_8 */ unsupported,
-/* ATOMIC_LOAD_16 */ unsupported,
-
-/* ATOMIC_STORE */ unsupported,
-/* ATOMIC_STORE_1 */ unsupported,
-/* ATOMIC_STORE_2 */ unsupported,
-/* ATOMIC_STORE_4 */ unsupported,
-/* ATOMIC_STORE_8 */ unsupported,
-/* ATOMIC_STORE_16 */ unsupported,
-
-/* ATOMIC_EXCHANGE */ unsupported,
-/* ATOMIC_EXCHANGE_1 */ unsupported,
-/* ATOMIC_EXCHANGE_2 */ unsupported,
-/* ATOMIC_EXCHANGE_4 */ unsupported,
-/* ATOMIC_EXCHANGE_8 */ unsupported,
-/* ATOMIC_EXCHANGE_16 */ unsupported,
-
-/* ATOMIC_COMPARE_EXCHANGE */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_1 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_2 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_4 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_8 */ unsupported,
-/* ATOMIC_COMPARE_EXCHANGE_16 */ unsupported,
-
-/* ATOMIC_FETCH_ADD_1 */ unsupported,
-/* ATOMIC_FETCH_ADD_2 */ unsupported,
-/* ATOMIC_FETCH_ADD_4 */ unsupported,
-/* ATOMIC_FETCH_ADD_8 */ unsupported,
-/* ATOMIC_FETCH_ADD_16 */ unsupported,
-
-/* ATOMIC_FETCH_SUB_1 */ unsupported,
-/* ATOMIC_FETCH_SUB_2 */ unsupported,
-/* ATOMIC_FETCH_SUB_4 */ unsupported,
-/* ATOMIC_FETCH_SUB_8 */ unsupported,
-/* ATOMIC_FETCH_SUB_16 */ unsupported,
-
-/* ATOMIC_FETCH_AND_1 */ unsupported,
-/* ATOMIC_FETCH_AND_2 */ unsupported,
-/* ATOMIC_FETCH_AND_4 */ unsupported,
-/* ATOMIC_FETCH_AND_8 */ unsupported,
-/* ATOMIC_FETCH_AND_16 */ unsupported,
-
-/* ATOMIC_FETCH_OR_1 */ unsupported,
-/* ATOMIC_FETCH_OR_2 */ unsupported,
-/* ATOMIC_FETCH_OR_4 */ unsupported,
-/* ATOMIC_FETCH_OR_8 */ unsupported,
-/* ATOMIC_FETCH_OR_16 */ unsupported,
-
-/* ATOMIC_FETCH_XOR_1 */ unsupported,
-/* ATOMIC_FETCH_XOR_2 */ unsupported,
-/* ATOMIC_FETCH_XOR_4 */ unsupported,
-/* ATOMIC_FETCH_XOR_8 */ unsupported,
-/* ATOMIC_FETCH_XOR_16 */ unsupported,
-
-/* ATOMIC_FETCH_NAND_1 */ unsupported,
-/* ATOMIC_FETCH_NAND_2 */ unsupported,
-/* ATOMIC_FETCH_NAND_4 */ unsupported,
-/* ATOMIC_FETCH_NAND_8 */ unsupported,
-/* ATOMIC_FETCH_NAND_16 */ unsupported,
-
-// Stack Protector Fail.
-/* STACKPROTECTOR_CHECK_FAIL */ func,
-
-// Deoptimization.
-/* DEOPTIMIZE */ unsupported,
+struct RuntimeLibcallSignatureTable {
+ std::vector<RuntimeLibcallSignature> Table;
+
+ // Any newly-added libcalls will be unsupported by default.
+ RuntimeLibcallSignatureTable() : Table(RTLIB::UNKNOWN_LIBCALL, unsupported) {
+ // Integer
+ Table[RTLIB::SHL_I16] = i16_func_i16_i16;
+ Table[RTLIB::SHL_I32] = i32_func_i32_i32;
+ Table[RTLIB::SHL_I64] = i64_func_i64_i64;
+ Table[RTLIB::SHL_I128] = i64_i64_func_i64_i64_i32;
+ Table[RTLIB::SRL_I16] = i16_func_i16_i16;
+ Table[RTLIB::SRL_I32] = i32_func_i32_i32;
+ Table[RTLIB::SRL_I64] = i64_func_i64_i64;
+ Table[RTLIB::SRL_I128] = i64_i64_func_i64_i64_i32;
+ Table[RTLIB::SRA_I16] = i16_func_i16_i16;
+ Table[RTLIB::SRA_I32] = i32_func_i32_i32;
+ Table[RTLIB::SRA_I64] = i64_func_i64_i64;
+ Table[RTLIB::SRA_I128] = i64_i64_func_i64_i64_i32;
+ Table[RTLIB::MUL_I8] = i8_func_i8_i8;
+ Table[RTLIB::MUL_I16] = i16_func_i16_i16;
+ Table[RTLIB::MUL_I32] = i32_func_i32_i32;
+ Table[RTLIB::MUL_I64] = i64_func_i64_i64;
+ Table[RTLIB::MUL_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::MULO_I32] = i32_func_i32_i32_iPTR;
+ Table[RTLIB::MULO_I64] = i64_func_i64_i64_iPTR;
+ Table[RTLIB::MULO_I128] = i64_i64_func_i64_i64_i64_i64_iPTR;
+ Table[RTLIB::SDIV_I8] = i8_func_i8_i8;
+ Table[RTLIB::SDIV_I16] = i16_func_i16_i16;
+ Table[RTLIB::SDIV_I32] = i32_func_i32_i32;
+ Table[RTLIB::SDIV_I64] = i64_func_i64_i64;
+ Table[RTLIB::SDIV_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::UDIV_I8] = i8_func_i8_i8;
+ Table[RTLIB::UDIV_I16] = i16_func_i16_i16;
+ Table[RTLIB::UDIV_I32] = i32_func_i32_i32;
+ Table[RTLIB::UDIV_I64] = i64_func_i64_i64;
+ Table[RTLIB::UDIV_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::SREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::SREM_I16] = i16_func_i16_i16;
+ Table[RTLIB::SREM_I32] = i32_func_i32_i32;
+ Table[RTLIB::SREM_I64] = i64_func_i64_i64;
+ Table[RTLIB::SREM_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::UREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::UREM_I16] = i16_func_i16_i16;
+ Table[RTLIB::UREM_I32] = i32_func_i32_i32;
+ Table[RTLIB::UREM_I64] = i64_func_i64_i64;
+ Table[RTLIB::UREM_I128] = i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::SDIVREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::SDIVREM_I16] = i16_i16_func_i16_i16;
+ Table[RTLIB::SDIVREM_I32] = i32_i32_func_i32_i32;
+ Table[RTLIB::SDIVREM_I64] = i64_func_i64_i64;
+ Table[RTLIB::SDIVREM_I128] = i64_i64_i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::UDIVREM_I8] = i8_func_i8_i8;
+ Table[RTLIB::UDIVREM_I16] = i16_i16_func_i16_i16;
+ Table[RTLIB::UDIVREM_I32] = i32_i32_func_i32_i32;
+ Table[RTLIB::UDIVREM_I64] = i64_i64_func_i64_i64;
+ Table[RTLIB::UDIVREM_I128] = i64_i64_i64_i64_func_i64_i64_i64_i64;
+ Table[RTLIB::NEG_I32] = i32_func_i32;
+ Table[RTLIB::NEG_I64] = i64_func_i64;
+
+ // Floating-point.
+ // All F80 and PPCF128 routines are unsupported.
+ Table[RTLIB::ADD_F32] = f32_func_f32_f32;
+ Table[RTLIB::ADD_F64] = f64_func_f64_f64;
+ Table[RTLIB::ADD_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::SUB_F32] = f32_func_f32_f32;
+ Table[RTLIB::SUB_F64] = f64_func_f64_f64;
+ Table[RTLIB::SUB_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::MUL_F32] = f32_func_f32_f32;
+ Table[RTLIB::MUL_F64] = f64_func_f64_f64;
+ Table[RTLIB::MUL_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::DIV_F32] = f32_func_f32_f32;
+ Table[RTLIB::DIV_F64] = f64_func_f64_f64;
+ Table[RTLIB::DIV_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::REM_F32] = f32_func_f32_f32;
+ Table[RTLIB::REM_F64] = f64_func_f64_f64;
+ Table[RTLIB::REM_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::FMA_F32] = f32_func_f32_f32_f32;
+ Table[RTLIB::FMA_F64] = f64_func_f64_f64_f64;
+ Table[RTLIB::FMA_F128] = func_iPTR_i64_i64_i64_i64_i64_i64;
+ Table[RTLIB::POWI_F32] = f32_func_f32_i32;
+ Table[RTLIB::POWI_F64] = f64_func_f64_i32;
+ Table[RTLIB::POWI_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::SQRT_F32] = f32_func_f32;
+ Table[RTLIB::SQRT_F64] = f64_func_f64;
+ Table[RTLIB::SQRT_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::LOG_F32] = f32_func_f32;
+ Table[RTLIB::LOG_F64] = f64_func_f64;
+ Table[RTLIB::LOG_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::LOG2_F32] = f32_func_f32;
+ Table[RTLIB::LOG2_F64] = f64_func_f64;
+ Table[RTLIB::LOG2_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::LOG10_F32] = f32_func_f32;
+ Table[RTLIB::LOG10_F64] = f64_func_f64;
+ Table[RTLIB::LOG10_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::EXP_F32] = f32_func_f32;
+ Table[RTLIB::EXP_F64] = f64_func_f64;
+ Table[RTLIB::EXP_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::EXP2_F32] = f32_func_f32;
+ Table[RTLIB::EXP2_F64] = f64_func_f64;
+ Table[RTLIB::EXP2_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::SIN_F32] = f32_func_f32;
+ Table[RTLIB::SIN_F64] = f64_func_f64;
+ Table[RTLIB::SIN_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::COS_F32] = f32_func_f32;
+ Table[RTLIB::COS_F64] = f64_func_f64;
+ Table[RTLIB::COS_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR;
+ Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR;
+ Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR;
+ Table[RTLIB::POW_F32] = f32_func_f32_f32;
+ Table[RTLIB::POW_F64] = f64_func_f64_f64;
+ Table[RTLIB::POW_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::CEIL_F32] = f32_func_f32;
+ Table[RTLIB::CEIL_F64] = f64_func_f64;
+ Table[RTLIB::CEIL_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::TRUNC_F32] = f32_func_f32;
+ Table[RTLIB::TRUNC_F64] = f64_func_f64;
+ Table[RTLIB::TRUNC_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::RINT_F32] = f32_func_f32;
+ Table[RTLIB::RINT_F64] = f64_func_f64;
+ Table[RTLIB::RINT_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::NEARBYINT_F32] = f32_func_f32;
+ Table[RTLIB::NEARBYINT_F64] = f64_func_f64;
+ Table[RTLIB::NEARBYINT_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::ROUND_F32] = f32_func_f32;
+ Table[RTLIB::ROUND_F64] = f64_func_f64;
+ Table[RTLIB::ROUND_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::FLOOR_F32] = f32_func_f32;
+ Table[RTLIB::FLOOR_F64] = f64_func_f64;
+ Table[RTLIB::FLOOR_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::COPYSIGN_F32] = f32_func_f32_f32;
+ Table[RTLIB::COPYSIGN_F64] = f64_func_f64_f64;
+ Table[RTLIB::COPYSIGN_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::FMIN_F32] = f32_func_f32_f32;
+ Table[RTLIB::FMIN_F64] = f64_func_f64_f64;
+ Table[RTLIB::FMIN_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::FMAX_F32] = f32_func_f32_f32;
+ Table[RTLIB::FMAX_F64] = f64_func_f64_f64;
+ Table[RTLIB::FMAX_F128] = func_iPTR_i64_i64_i64_i64;
+
+ // Conversion
+ // All F80 and PPCF128 routines are unsupported.
+ Table[RTLIB::FPEXT_F64_F128] = func_iPTR_f64;
+ Table[RTLIB::FPEXT_F32_F128] = func_iPTR_f32;
+ Table[RTLIB::FPEXT_F32_F64] = f64_func_f32;
+ Table[RTLIB::FPEXT_F16_F32] = f32_func_i16;
+ Table[RTLIB::FPROUND_F32_F16] = i16_func_f32;
+ Table[RTLIB::FPROUND_F64_F32] = f32_func_f64;
+ Table[RTLIB::FPROUND_F128_F32] = f32_func_i64_i64;
+ Table[RTLIB::FPROUND_F128_F64] = f64_func_i64_i64;
+ Table[RTLIB::FPTOSINT_F32_I32] = i32_func_f32;
+ Table[RTLIB::FPTOSINT_F32_I64] = i64_func_f32;
+ Table[RTLIB::FPTOSINT_F32_I128] = i64_i64_func_f32;
+ Table[RTLIB::FPTOSINT_F64_I32] = i32_func_f64;
+ Table[RTLIB::FPTOSINT_F64_I64] = i64_func_f64;
+ Table[RTLIB::FPTOSINT_F64_I128] = i64_i64_func_f64;
+ Table[RTLIB::FPTOSINT_F128_I32] = i32_func_i64_i64;
+ Table[RTLIB::FPTOSINT_F128_I64] = i64_func_i64_i64;
+ Table[RTLIB::FPTOSINT_F128_I128] = i64_i64_func_i64_i64;
+ Table[RTLIB::FPTOUINT_F32_I32] = i32_func_f32;
+ Table[RTLIB::FPTOUINT_F32_I64] = i64_func_f32;
+ Table[RTLIB::FPTOUINT_F32_I128] = i64_i64_func_f32;
+ Table[RTLIB::FPTOUINT_F64_I32] = i32_func_f64;
+ Table[RTLIB::FPTOUINT_F64_I64] = i64_func_f64;
+ Table[RTLIB::FPTOUINT_F64_I128] = i64_i64_func_f64;
+ Table[RTLIB::FPTOUINT_F128_I32] = i32_func_i64_i64;
+ Table[RTLIB::FPTOUINT_F128_I64] = i64_func_i64_i64;
+ Table[RTLIB::FPTOUINT_F128_I128] = i64_i64_func_i64_i64;
+ Table[RTLIB::SINTTOFP_I32_F32] = f32_func_i32;
+ Table[RTLIB::SINTTOFP_I32_F64] = f64_func_i32;
+ Table[RTLIB::SINTTOFP_I32_F128] = func_iPTR_i32;
+ Table[RTLIB::SINTTOFP_I64_F32] = f32_func_i64;
+ Table[RTLIB::SINTTOFP_I64_F64] = f64_func_i64;
+ Table[RTLIB::SINTTOFP_I64_F128] = func_iPTR_i64;
+ Table[RTLIB::SINTTOFP_I128_F32] = f32_func_i64_i64;
+ Table[RTLIB::SINTTOFP_I128_F64] = f64_func_i64_i64;
+ Table[RTLIB::SINTTOFP_I128_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::UINTTOFP_I32_F32] = f32_func_i32;
+ Table[RTLIB::UINTTOFP_I32_F64] = f64_func_i64;
+ Table[RTLIB::UINTTOFP_I32_F128] = func_iPTR_i32;
+ Table[RTLIB::UINTTOFP_I64_F32] = f32_func_i64;
+ Table[RTLIB::UINTTOFP_I64_F64] = f64_func_i64;
+ Table[RTLIB::UINTTOFP_I64_F128] = func_iPTR_i64;
+ Table[RTLIB::UINTTOFP_I128_F32] = f32_func_i64_i64;
+ Table[RTLIB::UINTTOFP_I128_F64] = f64_func_i64_i64;
+ Table[RTLIB::UINTTOFP_I128_F128] = func_iPTR_i64_i64;
+
+ // Comparison
+ // All F80 and PPCF128 routines are unsupported.
+ Table[RTLIB::OEQ_F32] = i32_func_f32_f32;
+ Table[RTLIB::OEQ_F64] = i32_func_f64_f64;
+ Table[RTLIB::OEQ_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::UNE_F32] = i32_func_f32_f32;
+ Table[RTLIB::UNE_F64] = i32_func_f64_f64;
+ Table[RTLIB::UNE_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OGE_F32] = i32_func_f32_f32;
+ Table[RTLIB::OGE_F64] = i32_func_f64_f64;
+ Table[RTLIB::OGE_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OLT_F32] = i32_func_f32_f32;
+ Table[RTLIB::OLT_F64] = i32_func_f64_f64;
+ Table[RTLIB::OLT_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OLE_F32] = i32_func_f32_f32;
+ Table[RTLIB::OLE_F64] = i32_func_f64_f64;
+ Table[RTLIB::OLE_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::OGT_F32] = i32_func_f32_f32;
+ Table[RTLIB::OGT_F64] = i32_func_f64_f64;
+ Table[RTLIB::OGT_F128] = i32_func_i64_i64_i64_i64;
+ Table[RTLIB::UO_F32] = i32_func_f32_f32;
+ Table[RTLIB::UO_F64] = i32_func_f64_f64;
+ Table[RTLIB::UO_F128] = i32_func_i64_i64_i64_i64;
+ // O_FXX has the weird property that it uses the same libcall name as UO_FXX.
+ // This breaks our name-based lookup. Fortunately, only the UO family of
+ // libcalls appears to actually be used.
+ Table[RTLIB::O_F32] = unsupported;
+ Table[RTLIB::O_F64] = unsupported;
+ Table[RTLIB::O_F128] = unsupported;
+
+ // Memory
+ Table[RTLIB::MEMCPY] = iPTR_func_iPTR_iPTR_iPTR;
+ Table[RTLIB::MEMSET] = iPTR_func_iPTR_i32_iPTR;
+ Table[RTLIB::MEMMOVE] = iPTR_func_iPTR_iPTR_iPTR;
+
+ // Element-wise Atomic memory
+ // TODO: Fix these when we implement atomic support
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_2] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_4] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_8] = unsupported;
+ Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_16] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8] = unsupported;
+ Table[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16] = unsupported;
+
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_2] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_4] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_8] = unsupported;
+ Table[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_16] = unsupported;
+
+ // Atomic '__sync_*' libcalls.
+ // TODO: Fix these when we implement atomic support
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = unsupported;
+ Table[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = unsupported;
+ Table[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_ADD_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_SUB_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_AND_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_OR_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_XOR_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_NAND_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MAX_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMAX_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_MIN_16] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_1] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_2] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_4] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_8] = unsupported;
+ Table[RTLIB::SYNC_FETCH_AND_UMIN_16] = unsupported;
+
+ // Atomic '__atomic_*' libcalls.
+ // TODO: Fix these when we implement atomic support
+ Table[RTLIB::ATOMIC_LOAD] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_1] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_2] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_4] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_8] = unsupported;
+ Table[RTLIB::ATOMIC_LOAD_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_STORE] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_1] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_2] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_4] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_8] = unsupported;
+ Table[RTLIB::ATOMIC_STORE_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_EXCHANGE] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_1] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_2] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_4] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_8] = unsupported;
+ Table[RTLIB::ATOMIC_EXCHANGE_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_1] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_2] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_4] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_8] = unsupported;
+ Table[RTLIB::ATOMIC_COMPARE_EXCHANGE_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_ADD_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_ADD_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_SUB_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_SUB_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_AND_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_AND_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_OR_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_OR_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_XOR_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_XOR_16] = unsupported;
+
+ Table[RTLIB::ATOMIC_FETCH_NAND_1] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_2] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_4] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_8] = unsupported;
+ Table[RTLIB::ATOMIC_FETCH_NAND_16] = unsupported;
+ }
};
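
The rewrite above replaces the positional RuntimeLibcallSignatures array with a table indexed directly by the RTLIB enum and defaulted to `unsupported`, so newly added libcalls no longer need to be enumerated by hand; the new ManagedStatic.h include suggests the table is meant to be constructed lazily on first use. As a hedged sketch of that general pattern (a plain function-local static standing in for ManagedStatic):

    #include <vector>

    enum Libcall { SHL_I32, MUL_I32, MEMCPY, UNKNOWN_LIBCALL };
    enum Signature { unsupported, i32_func_i32_i32, iPTR_func_iPTR_iPTR_iPTR };

    // Hypothetical signature table: every entry defaults to `unsupported`,
    // and only the libcalls we know about are filled in explicitly.
    struct SignatureTable {
      std::vector<Signature> Table;
      SignatureTable() : Table(UNKNOWN_LIBCALL, unsupported) {
        Table[SHL_I32] = i32_func_i32_i32;
        Table[MUL_I32] = i32_func_i32_i32;
        Table[MEMCPY] = iPTR_func_iPTR_iPTR_iPTR;
      }
    };

    static Signature lookup(Libcall LC) {
      // Constructed once, on first use. ManagedStatic additionally gives
      // LLVM control over destruction order; a local static does not.
      static SignatureTable T;
      return T.Table[LC];
    }

    int main() { return lookup(MEMCPY) == unsupported ? 1 : 0; }
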
-static const char *
-RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
-/* SHL_I16 */ "__ashlhi3",
-/* SHL_I32 */ "__ashlsi3",
-/* SHL_I64 */ "__ashldi3",
-/* SHL_I128 */ "__ashlti3",
-/* SRL_I16 */ "__lshrhi3",
-/* SRL_I32 */ "__lshrsi3",
-/* SRL_I64 */ "__lshrdi3",
-/* SRL_I128 */ "__lshrti3",
-/* SRA_I16 */ "__ashrhi3",
-/* SRA_I32 */ "__ashrsi3",
-/* SRA_I64 */ "__ashrdi3",
-/* SRA_I128 */ "__ashrti3",
-/* MUL_I8 */ "__mulqi3",
-/* MUL_I16 */ "__mulhi3",
-/* MUL_I32 */ "__mulsi3",
-/* MUL_I64 */ "__muldi3",
-/* MUL_I128 */ "__multi3",
-/* MULO_I32 */ "__mulosi4",
-/* MULO_I64 */ "__mulodi4",
-/* MULO_I128 */ "__muloti4",
-/* SDIV_I8 */ "__divqi3",
-/* SDIV_I16 */ "__divhi3",
-/* SDIV_I32 */ "__divsi3",
-/* SDIV_I64 */ "__divdi3",
-/* SDIV_I128 */ "__divti3",
-/* UDIV_I8 */ "__udivqi3",
-/* UDIV_I16 */ "__udivhi3",
-/* UDIV_I32 */ "__udivsi3",
-/* UDIV_I64 */ "__udivdi3",
-/* UDIV_I128 */ "__udivti3",
-/* SREM_I8 */ "__modqi3",
-/* SREM_I16 */ "__modhi3",
-/* SREM_I32 */ "__modsi3",
-/* SREM_I64 */ "__moddi3",
-/* SREM_I128 */ "__modti3",
-/* UREM_I8 */ "__umodqi3",
-/* UREM_I16 */ "__umodhi3",
-/* UREM_I32 */ "__umodsi3",
-/* UREM_I64 */ "__umoddi3",
-/* UREM_I128 */ "__umodti3",
-/* SDIVREM_I8 */ nullptr,
-/* SDIVREM_I16 */ nullptr,
-/* SDIVREM_I32 */ nullptr,
-/* SDIVREM_I64 */ nullptr,
-/* SDIVREM_I128 */ nullptr,
-/* UDIVREM_I8 */ nullptr,
-/* UDIVREM_I16 */ nullptr,
-/* UDIVREM_I32 */ nullptr,
-/* UDIVREM_I64 */ nullptr,
-/* UDIVREM_I128 */ nullptr,
-/* NEG_I32 */ "__negsi2",
-/* NEG_I64 */ "__negdi2",
-/* ADD_F32 */ "__addsf3",
-/* ADD_F64 */ "__adddf3",
-/* ADD_F80 */ nullptr,
-/* ADD_F128 */ "__addtf3",
-/* ADD_PPCF128 */ nullptr,
-/* SUB_F32 */ "__subsf3",
-/* SUB_F64 */ "__subdf3",
-/* SUB_F80 */ nullptr,
-/* SUB_F128 */ "__subtf3",
-/* SUB_PPCF128 */ nullptr,
-/* MUL_F32 */ "__mulsf3",
-/* MUL_F64 */ "__muldf3",
-/* MUL_F80 */ nullptr,
-/* MUL_F128 */ "__multf3",
-/* MUL_PPCF128 */ nullptr,
-/* DIV_F32 */ "__divsf3",
-/* DIV_F64 */ "__divdf3",
-/* DIV_F80 */ nullptr,
-/* DIV_F128 */ "__divtf3",
-/* DIV_PPCF128 */ nullptr,
-/* REM_F32 */ "fmodf",
-/* REM_F64 */ "fmod",
-/* REM_F80 */ nullptr,
-/* REM_F128 */ "fmodl",
-/* REM_PPCF128 */ nullptr,
-/* FMA_F32 */ "fmaf",
-/* FMA_F64 */ "fma",
-/* FMA_F80 */ nullptr,
-/* FMA_F128 */ "fmal",
-/* FMA_PPCF128 */ nullptr,
-/* POWI_F32 */ "__powisf2",
-/* POWI_F64 */ "__powidf2",
-/* POWI_F80 */ nullptr,
-/* POWI_F128 */ "__powitf2",
-/* POWI_PPCF128 */ nullptr,
-/* SQRT_F32 */ "sqrtf",
-/* SQRT_F64 */ "sqrt",
-/* SQRT_F80 */ nullptr,
-/* SQRT_F128 */ "sqrtl",
-/* SQRT_PPCF128 */ nullptr,
-/* LOG_F32 */ "logf",
-/* LOG_F64 */ "log",
-/* LOG_F80 */ nullptr,
-/* LOG_F128 */ "logl",
-/* LOG_PPCF128 */ nullptr,
-/* LOG2_F32 */ "log2f",
-/* LOG2_F64 */ "log2",
-/* LOG2_F80 */ nullptr,
-/* LOG2_F128 */ "log2l",
-/* LOG2_PPCF128 */ nullptr,
-/* LOG10_F32 */ "log10f",
-/* LOG10_F64 */ "log10",
-/* LOG10_F80 */ nullptr,
-/* LOG10_F128 */ "log10l",
-/* LOG10_PPCF128 */ nullptr,
-/* EXP_F32 */ "expf",
-/* EXP_F64 */ "exp",
-/* EXP_F80 */ nullptr,
-/* EXP_F128 */ "expl",
-/* EXP_PPCF128 */ nullptr,
-/* EXP2_F32 */ "exp2f",
-/* EXP2_F64 */ "exp2",
-/* EXP2_F80 */ nullptr,
-/* EXP2_F128 */ "exp2l",
-/* EXP2_PPCF128 */ nullptr,
-/* SIN_F32 */ "sinf",
-/* SIN_F64 */ "sin",
-/* SIN_F80 */ nullptr,
-/* SIN_F128 */ "sinl",
-/* SIN_PPCF128 */ nullptr,
-/* COS_F32 */ "cosf",
-/* COS_F64 */ "cos",
-/* COS_F80 */ nullptr,
-/* COS_F128 */ "cosl",
-/* COS_PPCF128 */ nullptr,
-/* SINCOS_F32 */ "sincosf",
-/* SINCOS_F64 */ "sincos",
-/* SINCOS_F80 */ nullptr,
-/* SINCOS_F128 */ "sincosl",
-/* SINCOS_PPCF128 */ nullptr,
-/* SINCOS_STRET_F32 */ nullptr,
-/* SINCOS_STRET_F64 */ nullptr,
-/* POW_F32 */ "powf",
-/* POW_F64 */ "pow",
-/* POW_F80 */ nullptr,
-/* POW_F128 */ "powl",
-/* POW_PPCF128 */ nullptr,
-/* CEIL_F32 */ "ceilf",
-/* CEIL_F64 */ "ceil",
-/* CEIL_F80 */ nullptr,
-/* CEIL_F128 */ "ceill",
-/* CEIL_PPCF128 */ nullptr,
-/* TRUNC_F32 */ "truncf",
-/* TRUNC_F64 */ "trunc",
-/* TRUNC_F80 */ nullptr,
-/* TRUNC_F128 */ "truncl",
-/* TRUNC_PPCF128 */ nullptr,
-/* RINT_F32 */ "rintf",
-/* RINT_F64 */ "rint",
-/* RINT_F80 */ nullptr,
-/* RINT_F128 */ "rintl",
-/* RINT_PPCF128 */ nullptr,
-/* NEARBYINT_F32 */ "nearbyintf",
-/* NEARBYINT_F64 */ "nearbyint",
-/* NEARBYINT_F80 */ nullptr,
-/* NEARBYINT_F128 */ "nearbyintl",
-/* NEARBYINT_PPCF128 */ nullptr,
-/* ROUND_F32 */ "roundf",
-/* ROUND_F64 */ "round",
-/* ROUND_F80 */ nullptr,
-/* ROUND_F128 */ "roundl",
-/* ROUND_PPCF128 */ nullptr,
-/* FLOOR_F32 */ "floorf",
-/* FLOOR_F64 */ "floor",
-/* FLOOR_F80 */ nullptr,
-/* FLOOR_F128 */ "floorl",
-/* FLOOR_PPCF128 */ nullptr,
-/* COPYSIGN_F32 */ "copysignf",
-/* COPYSIGN_F64 */ "copysign",
-/* COPYSIGN_F80 */ nullptr,
-/* COPYSIGN_F128 */ "copysignl",
-/* COPYSIGN_PPCF128 */ nullptr,
-/* FMIN_F32 */ "fminf",
-/* FMIN_F64 */ "fmin",
-/* FMIN_F80 */ nullptr,
-/* FMIN_F128 */ "fminl",
-/* FMIN_PPCF128 */ nullptr,
-/* FMAX_F32 */ "fmaxf",
-/* FMAX_F64 */ "fmax",
-/* FMAX_F80 */ nullptr,
-/* FMAX_F128 */ "fmaxl",
-/* FMAX_PPCF128 */ nullptr,
-/* FPEXT_F32_PPCF128 */ nullptr,
-/* FPEXT_F64_PPCF128 */ nullptr,
-/* FPEXT_F64_F128 */ "__extenddftf2",
-/* FPEXT_F32_F128 */ "__extendsftf2",
-/* FPEXT_F32_F64 */ "__extendsfdf2",
-/* FPEXT_F16_F32 */ "__gnu_h2f_ieee",
-/* FPROUND_F32_F16 */ "__gnu_f2h_ieee",
-/* FPROUND_F64_F16 */ nullptr,
-/* FPROUND_F80_F16 */ nullptr,
-/* FPROUND_F128_F16 */ nullptr,
-/* FPROUND_PPCF128_F16 */ nullptr,
-/* FPROUND_F64_F32 */ "__truncdfsf2",
-/* FPROUND_F80_F32 */ "__truncxfsf2",
-/* FPROUND_F128_F32 */ "__trunctfsf2",
-/* FPROUND_PPCF128_F32 */ nullptr,
-/* FPROUND_F80_F64 */ "__truncxfdf2",
-/* FPROUND_F128_F64 */ "__trunctfdf2",
-/* FPROUND_PPCF128_F64 */ nullptr,
-/* FPTOSINT_F32_I32 */ "__fixsfsi",
-/* FPTOSINT_F32_I64 */ "__fixsfdi",
-/* FPTOSINT_F32_I128 */ "__fixsfti",
-/* FPTOSINT_F64_I32 */ "__fixdfsi",
-/* FPTOSINT_F64_I64 */ "__fixdfdi",
-/* FPTOSINT_F64_I128 */ "__fixdfti",
-/* FPTOSINT_F80_I32 */ "__fixxfsi",
-/* FPTOSINT_F80_I64 */ "__fixxfdi",
-/* FPTOSINT_F80_I128 */ "__fixxfti",
-/* FPTOSINT_F128_I32 */ "__fixtfsi",
-/* FPTOSINT_F128_I64 */ "__fixtfdi",
-/* FPTOSINT_F128_I128 */ "__fixtfti",
-/* FPTOSINT_PPCF128_I32 */ nullptr,
-/* FPTOSINT_PPCF128_I64 */ nullptr,
-/* FPTOSINT_PPCF128_I128 */ nullptr,
-/* FPTOUINT_F32_I32 */ "__fixunssfsi",
-/* FPTOUINT_F32_I64 */ "__fixunssfdi",
-/* FPTOUINT_F32_I128 */ "__fixunssfti",
-/* FPTOUINT_F64_I32 */ "__fixunsdfsi",
-/* FPTOUINT_F64_I64 */ "__fixunsdfdi",
-/* FPTOUINT_F64_I128 */ "__fixunsdfti",
-/* FPTOUINT_F80_I32 */ "__fixunsxfsi",
-/* FPTOUINT_F80_I64 */ "__fixunsxfdi",
-/* FPTOUINT_F80_I128 */ "__fixunsxfti",
-/* FPTOUINT_F128_I32 */ "__fixunstfsi",
-/* FPTOUINT_F128_I64 */ "__fixunstfdi",
-/* FPTOUINT_F128_I128 */ "__fixunstfti",
-/* FPTOUINT_PPCF128_I32 */ nullptr,
-/* FPTOUINT_PPCF128_I64 */ nullptr,
-/* FPTOUINT_PPCF128_I128 */ nullptr,
-/* SINTTOFP_I32_F32 */ "__floatsisf",
-/* SINTTOFP_I32_F64 */ "__floatsidf",
-/* SINTTOFP_I32_F80 */ nullptr,
-/* SINTTOFP_I32_F128 */ "__floatsitf",
-/* SINTTOFP_I32_PPCF128 */ nullptr,
-/* SINTTOFP_I64_F32 */ "__floatdisf",
-/* SINTTOFP_I64_F64 */ "__floatdidf",
-/* SINTTOFP_I64_F80 */ nullptr,
-/* SINTTOFP_I64_F128 */ "__floatditf",
-/* SINTTOFP_I64_PPCF128 */ nullptr,
-/* SINTTOFP_I128_F32 */ "__floattisf",
-/* SINTTOFP_I128_F64 */ "__floattidf",
-/* SINTTOFP_I128_F80 */ nullptr,
-/* SINTTOFP_I128_F128 */ "__floattitf",
-/* SINTTOFP_I128_PPCF128 */ nullptr,
-/* UINTTOFP_I32_F32 */ "__floatunsisf",
-/* UINTTOFP_I32_F64 */ "__floatunsidf",
-/* UINTTOFP_I32_F80 */ nullptr,
-/* UINTTOFP_I32_F128 */ "__floatunsitf",
-/* UINTTOFP_I32_PPCF128 */ nullptr,
-/* UINTTOFP_I64_F32 */ "__floatundisf",
-/* UINTTOFP_I64_F64 */ "__floatundidf",
-/* UINTTOFP_I64_F80 */ nullptr,
-/* UINTTOFP_I64_F128 */ "__floatunditf",
-/* UINTTOFP_I64_PPCF128 */ nullptr,
-/* UINTTOFP_I128_F32 */ "__floatuntisf",
-/* UINTTOFP_I128_F64 */ "__floatuntidf",
-/* UINTTOFP_I128_F80 */ nullptr,
-/* UINTTOFP_I128_F128 */ "__floatuntitf",
-/* UINTTOFP_I128_PPCF128 */ nullptr,
-/* OEQ_F32 */ "__eqsf2",
-/* OEQ_F64 */ "__eqdf2",
-/* OEQ_F128 */ "__eqtf2",
-/* OEQ_PPCF128 */ nullptr,
-/* UNE_F32 */ "__nesf2",
-/* UNE_F64 */ "__nedf2",
-/* UNE_F128 */ "__netf2",
-/* UNE_PPCF128 */ nullptr,
-/* OGE_F32 */ "__gesf2",
-/* OGE_F64 */ "__gedf2",
-/* OGE_F128 */ "__getf2",
-/* OGE_PPCF128 */ nullptr,
-/* OLT_F32 */ "__ltsf2",
-/* OLT_F64 */ "__ltdf2",
-/* OLT_F128 */ "__lttf2",
-/* OLT_PPCF128 */ nullptr,
-/* OLE_F32 */ "__lesf2",
-/* OLE_F64 */ "__ledf2",
-/* OLE_F128 */ "__letf2",
-/* OLE_PPCF128 */ nullptr,
-/* OGT_F32 */ "__gtsf2",
-/* OGT_F64 */ "__gtdf2",
-/* OGT_F128 */ "__gttf2",
-/* OGT_PPCF128 */ nullptr,
-/* UO_F32 */ "__unordsf2",
-/* UO_F64 */ "__unorddf2",
-/* UO_F128 */ "__unordtf2",
-/* UO_PPCF128 */ nullptr,
-/* O_F32 */ "__unordsf2",
-/* O_F64 */ "__unorddf2",
-/* O_F128 */ "__unordtf2",
-/* O_PPCF128 */ nullptr,
-/* MEMCPY */ "memcpy",
-/* MEMMOVE */ "memset",
-/* MEMSET */ "memmove",
-/* BZERO */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
-/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
-/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
-/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
-/* UNWIND_RESUME */ "_Unwind_Resume",
-/* SYNC_VAL_COMPARE_AND_SWAP_1 */ "__sync_val_compare_and_swap_1",
-/* SYNC_VAL_COMPARE_AND_SWAP_2 */ "__sync_val_compare_and_swap_2",
-/* SYNC_VAL_COMPARE_AND_SWAP_4 */ "__sync_val_compare_and_swap_4",
-/* SYNC_VAL_COMPARE_AND_SWAP_8 */ "__sync_val_compare_and_swap_8",
-/* SYNC_VAL_COMPARE_AND_SWAP_16 */ "__sync_val_compare_and_swap_16",
-/* SYNC_LOCK_TEST_AND_SET_1 */ "__sync_lock_test_and_set_1",
-/* SYNC_LOCK_TEST_AND_SET_2 */ "__sync_lock_test_and_set_2",
-/* SYNC_LOCK_TEST_AND_SET_4 */ "__sync_lock_test_and_set_4",
-/* SYNC_LOCK_TEST_AND_SET_8 */ "__sync_lock_test_and_set_8",
-/* SYNC_LOCK_TEST_AND_SET_16 */ "__sync_lock_test_and_set_16",
-/* SYNC_FETCH_AND_ADD_1 */ "__sync_fetch_and_add_1",
-/* SYNC_FETCH_AND_ADD_2 */ "__sync_fetch_and_add_2",
-/* SYNC_FETCH_AND_ADD_4 */ "__sync_fetch_and_add_4",
-/* SYNC_FETCH_AND_ADD_8 */ "__sync_fetch_and_add_8",
-/* SYNC_FETCH_AND_ADD_16 */ "__sync_fetch_and_add_16",
-/* SYNC_FETCH_AND_SUB_1 */ "__sync_fetch_and_sub_1",
-/* SYNC_FETCH_AND_SUB_2 */ "__sync_fetch_and_sub_2",
-/* SYNC_FETCH_AND_SUB_4 */ "__sync_fetch_and_sub_4",
-/* SYNC_FETCH_AND_SUB_8 */ "__sync_fetch_and_sub_8",
-/* SYNC_FETCH_AND_SUB_16 */ "__sync_fetch_and_sub_16",
-/* SYNC_FETCH_AND_AND_1 */ "__sync_fetch_and_and_1",
-/* SYNC_FETCH_AND_AND_2 */ "__sync_fetch_and_and_2",
-/* SYNC_FETCH_AND_AND_4 */ "__sync_fetch_and_and_4",
-/* SYNC_FETCH_AND_AND_8 */ "__sync_fetch_and_and_8",
-/* SYNC_FETCH_AND_AND_16 */ "__sync_fetch_and_and_16",
-/* SYNC_FETCH_AND_OR_1 */ "__sync_fetch_and_or_1",
-/* SYNC_FETCH_AND_OR_2 */ "__sync_fetch_and_or_2",
-/* SYNC_FETCH_AND_OR_4 */ "__sync_fetch_and_or_4",
-/* SYNC_FETCH_AND_OR_8 */ "__sync_fetch_and_or_8",
-/* SYNC_FETCH_AND_OR_16 */ "__sync_fetch_and_or_16",
-/* SYNC_FETCH_AND_XOR_1 */ "__sync_fetch_and_xor_1",
-/* SYNC_FETCH_AND_XOR_2 */ "__sync_fetch_and_xor_2",
-/* SYNC_FETCH_AND_XOR_4 */ "__sync_fetch_and_xor_4",
-/* SYNC_FETCH_AND_XOR_8 */ "__sync_fetch_and_xor_8",
-/* SYNC_FETCH_AND_XOR_16 */ "__sync_fetch_and_xor_16",
-/* SYNC_FETCH_AND_NAND_1 */ "__sync_fetch_and_nand_1",
-/* SYNC_FETCH_AND_NAND_2 */ "__sync_fetch_and_nand_2",
-/* SYNC_FETCH_AND_NAND_4 */ "__sync_fetch_and_nand_4",
-/* SYNC_FETCH_AND_NAND_8 */ "__sync_fetch_and_nand_8",
-/* SYNC_FETCH_AND_NAND_16 */ "__sync_fetch_and_nand_16",
-/* SYNC_FETCH_AND_MAX_1 */ "__sync_fetch_and_max_1",
-/* SYNC_FETCH_AND_MAX_2 */ "__sync_fetch_and_max_2",
-/* SYNC_FETCH_AND_MAX_4 */ "__sync_fetch_and_max_4",
-/* SYNC_FETCH_AND_MAX_8 */ "__sync_fetch_and_max_8",
-/* SYNC_FETCH_AND_MAX_16 */ "__sync_fetch_and_max_16",
-/* SYNC_FETCH_AND_UMAX_1 */ "__sync_fetch_and_umax_1",
-/* SYNC_FETCH_AND_UMAX_2 */ "__sync_fetch_and_umax_2",
-/* SYNC_FETCH_AND_UMAX_4 */ "__sync_fetch_and_umax_4",
-/* SYNC_FETCH_AND_UMAX_8 */ "__sync_fetch_and_umax_8",
-/* SYNC_FETCH_AND_UMAX_16 */ "__sync_fetch_and_umax_16",
-/* SYNC_FETCH_AND_MIN_1 */ "__sync_fetch_and_min_1",
-/* SYNC_FETCH_AND_MIN_2 */ "__sync_fetch_and_min_2",
-/* SYNC_FETCH_AND_MIN_4 */ "__sync_fetch_and_min_4",
-/* SYNC_FETCH_AND_MIN_8 */ "__sync_fetch_and_min_8",
-/* SYNC_FETCH_AND_MIN_16 */ "__sync_fetch_and_min_16",
-/* SYNC_FETCH_AND_UMIN_1 */ "__sync_fetch_and_umin_1",
-/* SYNC_FETCH_AND_UMIN_2 */ "__sync_fetch_and_umin_2",
-/* SYNC_FETCH_AND_UMIN_4 */ "__sync_fetch_and_umin_4",
-/* SYNC_FETCH_AND_UMIN_8 */ "__sync_fetch_and_umin_8",
-/* SYNC_FETCH_AND_UMIN_16 */ "__sync_fetch_and_umin_16",
-
-/* ATOMIC_LOAD */ "__atomic_load",
-/* ATOMIC_LOAD_1 */ "__atomic_load_1",
-/* ATOMIC_LOAD_2 */ "__atomic_load_2",
-/* ATOMIC_LOAD_4 */ "__atomic_load_4",
-/* ATOMIC_LOAD_8 */ "__atomic_load_8",
-/* ATOMIC_LOAD_16 */ "__atomic_load_16",
-
-/* ATOMIC_STORE */ "__atomic_store",
-/* ATOMIC_STORE_1 */ "__atomic_store_1",
-/* ATOMIC_STORE_2 */ "__atomic_store_2",
-/* ATOMIC_STORE_4 */ "__atomic_store_4",
-/* ATOMIC_STORE_8 */ "__atomic_store_8",
-/* ATOMIC_STORE_16 */ "__atomic_store_16",
-
-/* ATOMIC_EXCHANGE */ "__atomic_exchange",
-/* ATOMIC_EXCHANGE_1 */ "__atomic_exchange_1",
-/* ATOMIC_EXCHANGE_2 */ "__atomic_exchange_2",
-/* ATOMIC_EXCHANGE_4 */ "__atomic_exchange_4",
-/* ATOMIC_EXCHANGE_8 */ "__atomic_exchange_8",
-/* ATOMIC_EXCHANGE_16 */ "__atomic_exchange_16",
-
-/* ATOMIC_COMPARE_EXCHANGE */ "__atomic_compare_exchange",
-/* ATOMIC_COMPARE_EXCHANGE_1 */ "__atomic_compare_exchange_1",
-/* ATOMIC_COMPARE_EXCHANGE_2 */ "__atomic_compare_exchange_2",
-/* ATOMIC_COMPARE_EXCHANGE_4 */ "__atomic_compare_exchange_4",
-/* ATOMIC_COMPARE_EXCHANGE_8 */ "__atomic_compare_exchange_8",
-/* ATOMIC_COMPARE_EXCHANGE_16 */ "__atomic_compare_exchange_16",
+ManagedStatic<RuntimeLibcallSignatureTable> RuntimeLibcallSignatures;
+
+// Maps libcall names to their RTLIB::Libcall number. Builds the map in a
+// constructor for use with ManagedStatic
+struct StaticLibcallNameMap {
+ StringMap<RTLIB::Libcall> Map;
+ StaticLibcallNameMap() {
+ static const std::pair<const char *, RTLIB::Libcall> NameLibcalls[] = {
+#define HANDLE_LIBCALL(code, name) {(const char *)name, RTLIB::code},
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+ };
+ for (const auto &NameLibcall : NameLibcalls) {
+ if (NameLibcall.first != nullptr &&
+ RuntimeLibcallSignatures->Table[NameLibcall.second] != unsupported) {
+ assert(Map.find(NameLibcall.first) == Map.end() &&
+ "duplicate libcall names in name map");
+ Map[NameLibcall.first] = NameLibcall.second;
+ }
+ }
+ }
+};
-/* ATOMIC_FETCH_ADD_1 */ "__atomic_fetch_add_1",
-/* ATOMIC_FETCH_ADD_2 */ "__atomic_fetch_add_2",
-/* ATOMIC_FETCH_ADD_4 */ "__atomic_fetch_add_4",
-/* ATOMIC_FETCH_ADD_8 */ "__atomic_fetch_add_8",
-/* ATOMIC_FETCH_ADD_16 */ "__atomic_fetch_add_16",
-/* ATOMIC_FETCH_SUB_1 */ "__atomic_fetch_sub_1",
-/* ATOMIC_FETCH_SUB_2 */ "__atomic_fetch_sub_2",
-/* ATOMIC_FETCH_SUB_4 */ "__atomic_fetch_sub_4",
-/* ATOMIC_FETCH_SUB_8 */ "__atomic_fetch_sub_8",
-/* ATOMIC_FETCH_SUB_16 */ "__atomic_fetch_sub_16",
-/* ATOMIC_FETCH_AND_1 */ "__atomic_fetch_and_1",
-/* ATOMIC_FETCH_AND_2 */ "__atomic_fetch_and_2",
-/* ATOMIC_FETCH_AND_4 */ "__atomic_fetch_and_4",
-/* ATOMIC_FETCH_AND_8 */ "__atomic_fetch_and_8",
-/* ATOMIC_FETCH_AND_16 */ "__atomic_fetch_and_16",
-/* ATOMIC_FETCH_OR_1 */ "__atomic_fetch_or_1",
-/* ATOMIC_FETCH_OR_2 */ "__atomic_fetch_or_2",
-/* ATOMIC_FETCH_OR_4 */ "__atomic_fetch_or_4",
-/* ATOMIC_FETCH_OR_8 */ "__atomic_fetch_or_8",
-/* ATOMIC_FETCH_OR_16 */ "__atomic_fetch_or_16",
-/* ATOMIC_FETCH_XOR_1 */ "__atomic_fetch_xor_1",
-/* ATOMIC_FETCH_XOR_2 */ "__atomic_fetch_xor_2",
-/* ATOMIC_FETCH_XOR_4 */ "__atomic_fetch_xor_4",
-/* ATOMIC_FETCH_XOR_8 */ "__atomic_fetch_xor_8",
-/* ATOMIC_FETCH_XOR_16 */ "__atomic_fetch_xor_16",
-/* ATOMIC_FETCH_NAND_1 */ "__atomic_fetch_nand_1",
-/* ATOMIC_FETCH_NAND_2 */ "__atomic_fetch_nand_2",
-/* ATOMIC_FETCH_NAND_4 */ "__atomic_fetch_nand_4",
-/* ATOMIC_FETCH_NAND_8 */ "__atomic_fetch_nand_8",
-/* ATOMIC_FETCH_NAND_16 */ "__atomic_fetch_nand_16",
+} // end anonymous namespace
-/* STACKPROTECTOR_CHECK_FAIL */ "__stack_chk_fail",
-/* DEOPTIMIZE */ "__llvm_deoptimize",
-};
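The HANDLE_LIBCALL lines in the replacement code above rely on LLVM's .def X-macro pattern: a single .def file lists every (enum, name) pair, and each includer defines HANDLE_LIBCALL to expand those pairs into whatever table it needs, which is what lets the hand-written name array above be deleted. A minimal standalone sketch of the pattern, with the .def file simulated by a list macro (entries and names here are illustrative, not the real RuntimeLibcalls.def):

    #include <cstdio>

    // Stand-in for a .def file: one master list of (enum, name) pairs.
    #define LIBCALL_LIST(HANDLE)   \
      HANDLE(MEMCPY, "memcpy")     \
      HANDLE(MEMMOVE, "memmove")   \
      HANDLE(MEMSET, "memset")

    // Expansion 1: build the enum.
    enum Libcall {
    #define HANDLE_LIBCALL(code, name) code,
      LIBCALL_LIST(HANDLE_LIBCALL)
    #undef HANDLE_LIBCALL
      UNKNOWN_LIBCALL
    };

    // Expansion 2: build the parallel name table from the same list.
    static const char *const LibcallNames[] = {
    #define HANDLE_LIBCALL(code, name) name,
      LIBCALL_LIST(HANDLE_LIBCALL)
    #undef HANDLE_LIBCALL
    };

    int main() {
      std::printf("%s\n", LibcallNames[MEMMOVE]); // prints "memmove"
      return 0;
    }

Because both expansions come from one list, the enum and the name table can no longer drift out of sync the way two hand-maintained tables can.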
void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
RTLIB::Libcall LC, SmallVectorImpl<wasm::ValType> &Rets,
@@ -1003,11 +494,11 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
assert(Rets.empty());
assert(Params.empty());
- WebAssembly::ExprType iPTR = Subtarget.hasAddr64() ?
- WebAssembly::ExprType::I64 :
- WebAssembly::ExprType::I32;
+ wasm::ValType iPTR =
+ Subtarget.hasAddr64() ? wasm::ValType::I64 : wasm::ValType::I32;
- switch (RuntimeLibcallSignatures[LC]) {
+ auto& Table = RuntimeLibcallSignatures->Table;
+ switch (Table[LC]) {
case func:
break;
case f32_func_f32:
@@ -1111,13 +602,13 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
break;
case func_f32_iPTR_iPTR:
Params.push_back(wasm::ValType::F32);
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case func_f64_iPTR_iPTR:
Params.push_back(wasm::ValType::F64);
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case i16_func_i16_i16:
Rets.push_back(wasm::ValType::I32);
@@ -1139,17 +630,29 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
break;
+ case i32_func_i32_i32_iPTR:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(iPTR);
+ break;
case i64_func_i64_i64:
Rets.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
+ case i64_func_i64_i64_iPTR:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(iPTR);
+ break;
case i64_i64_func_f32:
#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::F32);
break;
@@ -1158,7 +661,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::F64);
break;
@@ -1167,7 +670,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
@@ -1177,7 +680,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
@@ -1187,7 +690,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -1197,13 +700,26 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
+ case i64_i64_func_i64_i64_i64_i64_iPTR:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(iPTR);
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(iPTR);
+ break;
case i64_i64_i64_i64_func_i64_i64_i64_i64:
#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
Rets.push_back(wasm::ValType::I64);
@@ -1211,7 +727,7 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -1225,23 +741,23 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I32);
break;
case iPTR_func_iPTR_i32_iPTR:
- Rets.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Rets.push_back(iPTR);
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I32);
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
break;
case iPTR_func_iPTR_iPTR_iPTR:
- Rets.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Rets.push_back(iPTR);
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case f32_func_f32_f32_f32:
Rets.push_back(wasm::ValType::F32);
@@ -1258,39 +774,39 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
case func_i64_i64_iPTR_iPTR:
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
- Params.push_back(wasm::ValType(iPTR));
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
+ Params.push_back(iPTR);
break;
case func_iPTR_f32:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::F32);
break;
case func_iPTR_f64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::F64);
break;
case func_iPTR_i32:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I32);
break;
case func_iPTR_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64_i64_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64_i64_i64_i64_i64:
- Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(iPTR);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -1315,15 +831,14 @@ void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
}
}
+static ManagedStatic<StaticLibcallNameMap> LibcallNameMap;
+// TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
+// other than here, just roll its logic into this version.
void llvm::GetSignature(const WebAssemblySubtarget &Subtarget, const char *Name,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params) {
- assert(strcmp(RuntimeLibcallNames[RTLIB::DEOPTIMIZE], "__llvm_deoptimize") ==
- 0);
-
- for (size_t i = 0, e = RTLIB::UNKNOWN_LIBCALL; i < e; ++i)
- if (RuntimeLibcallNames[i] && strcmp(RuntimeLibcallNames[i], Name) == 0)
- return GetSignature(Subtarget, RTLIB::Libcall(i), Rets, Params);
-
- llvm_unreachable("unexpected runtime library name");
+ auto& Map = LibcallNameMap->Map;
+ auto val = Map.find(Name);
+ assert(val != Map.end() && "unexpected runtime library name");
+ return GetSignature(Subtarget, val->second, Rets, Params);
}
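The rewritten GetSignature above replaces a linear scan over the old name table with a lookup in a lazily constructed name-to-enum map (StaticLibcallNameMap behind a ManagedStatic). A minimal standalone sketch of the same pattern using the standard library instead of LLVM's StringMap/ManagedStatic (names here are illustrative, not LLVM's):

    #include <cassert>
    #include <string>
    #include <unordered_map>

    enum class Libcall { MemCpy, MemMove, MemSet };

    // Built once, on first use; function-local statics are initialized
    // in a thread-safe way since C++11.
    static const std::unordered_map<std::string, Libcall> &libcallNameMap() {
      static const std::unordered_map<std::string, Libcall> Map = {
          {"memcpy", Libcall::MemCpy},
          {"memmove", Libcall::MemMove},
          {"memset", Libcall::MemSet},
      };
      return Map;
    }

    static Libcall lookupLibcall(const std::string &Name) {
      auto It = libcallNameMap().find(Name);
      assert(It != libcallNameMap().end() && "unexpected runtime library name");
      return It->second;
    }

    int main() {
      return lookupLibcall("memcpy") == Libcall::MemCpy ? 0 : 1;
    }

The map is paid for once per process instead of once per query, which is the same trade the patch makes by caching the table behind a ManagedStatic.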
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index 129067604784..2ba65ff5b716 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file provides signature information for runtime libcalls.
+/// This file provides signature information for runtime libcalls.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index fae9c6100510..bec72049258a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssemblySelectionDAGInfo class.
+/// This file implements the WebAssemblySelectionDAGInfo class.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 533c66b7a22f..31d150eded67 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly subclass for
+/// This file defines the WebAssembly subclass for
/// SelectionDAGTargetInfo.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
index c4b9e915b41e..14221993603a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file sets the p2align operands on load and store instructions.
+/// This file sets the p2align operands on load and store instructions.
///
//===----------------------------------------------------------------------===//
@@ -46,6 +46,10 @@ public:
} // end anonymous namespace
char WebAssemblySetP2AlignOperands::ID = 0;
+INITIALIZE_PASS(WebAssemblySetP2AlignOperands, DEBUG_TYPE,
+ "Set the p2align operands for WebAssembly loads and stores",
+ false, false)
+
FunctionPass *llvm::createWebAssemblySetP2AlignOperands() {
return new WebAssemblySetP2AlignOperands();
}
@@ -72,7 +76,7 @@ static void RewriteP2Align(MachineInstr &MI, unsigned OperandNo) {
}
bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Set p2align Operands **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
@@ -103,6 +107,48 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::ATOMIC_LOAD8_U_I64:
case WebAssembly::ATOMIC_LOAD16_U_I64:
case WebAssembly::ATOMIC_LOAD32_U_I64:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW8_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW8_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW8_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW8_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I32:
+ case WebAssembly::ATOMIC_RMW16_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I32:
+ case WebAssembly::ATOMIC_RMW16_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I32:
+ case WebAssembly::ATOMIC_RMW16_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW16_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW_ADD_I32:
+ case WebAssembly::ATOMIC_RMW32_U_ADD_I64:
+ case WebAssembly::ATOMIC_RMW_SUB_I32:
+ case WebAssembly::ATOMIC_RMW32_U_SUB_I64:
+ case WebAssembly::ATOMIC_RMW_AND_I32:
+ case WebAssembly::ATOMIC_RMW32_U_AND_I64:
+ case WebAssembly::ATOMIC_RMW_OR_I32:
+ case WebAssembly::ATOMIC_RMW32_U_OR_I64:
+ case WebAssembly::ATOMIC_RMW_XOR_I32:
+ case WebAssembly::ATOMIC_RMW32_U_XOR_I64:
+ case WebAssembly::ATOMIC_RMW_XCHG_I32:
+ case WebAssembly::ATOMIC_RMW32_U_XCHG_I64:
+ case WebAssembly::ATOMIC_RMW_ADD_I64:
+ case WebAssembly::ATOMIC_RMW_SUB_I64:
+ case WebAssembly::ATOMIC_RMW_AND_I64:
+ case WebAssembly::ATOMIC_RMW_OR_I64:
+ case WebAssembly::ATOMIC_RMW_XOR_I64:
+ case WebAssembly::ATOMIC_RMW_XCHG_I64:
RewriteP2Align(MI, WebAssembly::LoadP2AlignOperandNo);
break;
case WebAssembly::STORE_I32:
@@ -114,6 +160,13 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::STORE8_I64:
case WebAssembly::STORE16_I64:
case WebAssembly::STORE32_I64:
+ case WebAssembly::ATOMIC_STORE_I32:
+ case WebAssembly::ATOMIC_STORE8_I32:
+ case WebAssembly::ATOMIC_STORE16_I32:
+ case WebAssembly::ATOMIC_STORE_I64:
+ case WebAssembly::ATOMIC_STORE8_I64:
+ case WebAssembly::ATOMIC_STORE16_I64:
+ case WebAssembly::ATOMIC_STORE32_I64:
RewriteP2Align(MI, WebAssembly::StoreP2AlignOperandNo);
break;
default:
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index 22a5a9099e72..893e8484c4c6 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements an optimization pass using store result values.
+/// This file implements an optimization pass using store result values.
///
/// WebAssembly's store instructions return the stored value. This is to enable
/// an optimization wherein uses of the stored value can be replaced by uses of
@@ -68,6 +68,9 @@ private:
} // end anonymous namespace
char WebAssemblyStoreResults::ID = 0;
+INITIALIZE_PASS(WebAssemblyStoreResults, DEBUG_TYPE,
+ "Optimize store result values for WebAssembly", false, false)
+
FunctionPass *llvm::createWebAssemblyStoreResults() {
return new WebAssemblyStoreResults();
}
@@ -108,8 +111,8 @@ static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
continue;
Changed = true;
- DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
- << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
+ << MI << "\n");
O.setReg(ToReg);
// If the store's def was previously dead, it is no longer.
@@ -167,7 +170,7 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
}
bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "********** Store Results **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
@@ -186,7 +189,7 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
assert(MRI.tracksLiveness() && "StoreResults expects liveness tracking");
for (auto &MBB : MF) {
- DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
for (auto &MI : MBB)
switch (MI.getOpcode()) {
default:
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 9e122a5f1574..d6af0fb219d7 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the WebAssembly-specific subclass of
+/// This file implements the WebAssembly-specific subclass of
/// TargetSubtarget.
///
//===----------------------------------------------------------------------===//
@@ -41,9 +41,9 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
const std::string &FS,
const TargetMachine &TM)
: WebAssemblyGenSubtargetInfo(TT, CPU, FS), HasSIMD128(false),
- HasAtomics(false), HasNontrappingFPToInt(false), CPUString(CPU),
- TargetTriple(TT), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
+ HasAtomics(false), HasNontrappingFPToInt(false), HasSignExt(false),
+ HasExceptionHandling(false), CPUString(CPU), TargetTriple(TT),
+ FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
TLInfo(TM, *this) {}
bool WebAssemblySubtarget::enableMachineScheduler() const {
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index a6bf0b6d54f6..b170dbff3b32 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the WebAssembly-specific subclass of
+/// This file declares the WebAssembly-specific subclass of
/// TargetSubtarget.
///
//===----------------------------------------------------------------------===//
@@ -32,6 +32,8 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasSIMD128;
bool HasAtomics;
bool HasNontrappingFPToInt;
+ bool HasSignExt;
+ bool HasExceptionHandling;
/// String name of used CPU.
std::string CPUString;
@@ -78,6 +80,8 @@ public:
bool hasSIMD128() const { return HasSIMD128; }
bool hasAtomics() const { return HasAtomics; }
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
+ bool hasSignExt() const { return HasSignExt; }
+ bool hasExceptionHandling() const { return HasExceptionHandling; }
/// Parses features string setting specified subtarget options. Definition of
/// function is auto generated by tblgen.
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index d38cde74d2ec..7c10f022cbbc 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly-specific subclass of TargetMachine.
+/// This file defines the WebAssembly-specific subclass of TargetMachine.
///
//===----------------------------------------------------------------------===//
@@ -25,6 +25,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
#define DEBUG_TYPE "wasm"
@@ -48,9 +49,31 @@ extern "C" void LLVMInitializeWebAssemblyTarget() {
RegisterTargetMachine<WebAssemblyTargetMachine> Y(
getTheWebAssemblyTarget64());
- // Register exception handling pass to opt
- initializeWebAssemblyLowerEmscriptenEHSjLjPass(
- *PassRegistry::getPassRegistry());
+ // Register backend passes
+ auto &PR = *PassRegistry::getPassRegistry();
+ initializeWebAssemblyAddMissingPrototypesPass(PR);
+ initializeWebAssemblyLowerEmscriptenEHSjLjPass(PR);
+ initializeLowerGlobalDtorsPass(PR);
+ initializeFixFunctionBitcastsPass(PR);
+ initializeOptimizeReturnedPass(PR);
+ initializeWebAssemblyArgumentMovePass(PR);
+ initializeWebAssemblySetP2AlignOperandsPass(PR);
+ initializeWebAssemblyReplacePhysRegsPass(PR);
+ initializeWebAssemblyPrepareForLiveIntervalsPass(PR);
+ initializeWebAssemblyOptimizeLiveIntervalsPass(PR);
+ initializeWebAssemblyStoreResultsPass(PR);
+ initializeWebAssemblyRegStackifyPass(PR);
+ initializeWebAssemblyRegColoringPass(PR);
+ initializeWebAssemblyExplicitLocalsPass(PR);
+ initializeWebAssemblyFixIrreducibleControlFlowPass(PR);
+ initializeWebAssemblyLateEHPreparePass(PR);
+ initializeWebAssemblyExceptionInfoPass(PR);
+ initializeWebAssemblyCFGSortPass(PR);
+ initializeWebAssemblyCFGStackifyPass(PR);
+ initializeWebAssemblyLowerBrUnlessPass(PR);
+ initializeWebAssemblyRegNumberingPass(PR);
+ initializeWebAssemblyPeepholePass(PR);
+ initializeWebAssemblyCallIndirectFixupPass(PR);
}
//===----------------------------------------------------------------------===//
@@ -74,11 +97,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
: "e-m:e-p:32:32-i64:64-n32:64-S128",
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
CM ? *CM : CodeModel::Large, OL),
- TLOF(TT.isOSBinFormatELF() ?
- static_cast<TargetLoweringObjectFile*>(
- new WebAssemblyTargetObjectFileELF()) :
- static_cast<TargetLoweringObjectFile*>(
- new WebAssemblyTargetObjectFile())) {
+ TLOF(new WebAssemblyTargetObjectFile()) {
// WebAssembly type-checks instructions, but a noreturn function with a return
// type that doesn't match the context will cause a check failure. So we lower
// LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
@@ -87,11 +106,9 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
// WebAssembly treats each function as an independent unit. Force
// -ffunction-sections, effectively, so that we can emit them independently.
- if (!TT.isOSBinFormatELF()) {
- this->Options.FunctionSections = true;
- this->Options.DataSections = true;
- this->Options.UniqueSectionNames = true;
- }
+ this->Options.FunctionSections = true;
+ this->Options.DataSections = true;
+ this->Options.UniqueSectionNames = true;
initAsmInfo();
@@ -126,6 +143,22 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
}
namespace {
+class StripThreadLocal final : public ModulePass {
+ // The default thread model for wasm is single, where thread-local variables
+ // are identical to regular globals and should be treated the same. So this
+ // pass just converts all GlobalVariables to NotThreadLocal
+ static char ID;
+
+ public:
+ StripThreadLocal() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override {
+ for (auto &GV : M.globals())
+ GV.setThreadLocalMode(GlobalValue::ThreadLocalMode::NotThreadLocal);
+ return true;
+ }
+};
+char StripThreadLocal::ID = 0;
+
/// WebAssembly Code Generator Pass Configuration Options.
class WebAssemblyPassConfig final : public TargetPassConfig {
public:
@@ -166,13 +199,18 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
//===----------------------------------------------------------------------===//
void WebAssemblyPassConfig::addIRPasses() {
- if (TM->Options.ThreadModel == ThreadModel::Single)
+ if (TM->Options.ThreadModel == ThreadModel::Single) {
// In "single" mode, atomics get lowered to non-atomics.
addPass(createLowerAtomicPass());
- else
+ addPass(new StripThreadLocal());
+ } else {
// Expand some atomic operations. WebAssemblyTargetLowering has hooks which
// control specifically what gets lowered.
addPass(createAtomicExpandPass());
+ }
+
+ // Add signatures to prototype-less function declarations
+ addPass(createWebAssemblyAddMissingPrototypes());
// Lower .llvm.global_dtors into .llvm_global_ctors with __cxa_atexit calls.
addPass(createWebAssemblyLowerGlobalDtors());
@@ -190,7 +228,8 @@ void WebAssemblyPassConfig::addIRPasses() {
// blocks. Lowering invokes when there is no EH support is done in
// TargetPassConfig::addPassesToHandleExceptions, but this runs after this
// function and SjLj handling expects all invokes to be lowered before.
- if (!EnableEmException) {
+ if (!EnableEmException &&
+ TM->Options.ExceptionModel == ExceptionHandling::None) {
addPass(createLowerInvokePass());
// The lower invoke pass may create unreachable code. Remove it in order not
// to process dead blocks in setjmp/longjmp handling.
@@ -225,16 +264,15 @@ void WebAssemblyPassConfig::addPostRegAlloc() {
// virtual registers. Consider removing their restrictions and re-enabling
// them.
- // Has no asserts of its own, but was not written to handle virtual regs.
- disablePass(&ShrinkWrapID);
-
// These functions all require the NoVRegs property.
disablePass(&MachineCopyPropagationID);
+ disablePass(&PostRAMachineSinkingID);
disablePass(&PostRASchedulerID);
disablePass(&FuncletLayoutID);
disablePass(&StackMapLivenessID);
disablePass(&LiveDebugValuesID);
disablePass(&PatchableFunctionID);
+ disablePass(&ShrinkWrapID);
TargetPassConfig::addPostRegAlloc();
}
@@ -282,6 +320,9 @@ void WebAssemblyPassConfig::addPreEmitPass() {
// Insert explicit get_local and set_local operators.
addPass(createWebAssemblyExplicitLocals());
+ // Do various transformations for exception handling
+ addPass(createWebAssemblyLateEHPrepare());
+
// Sort the blocks of the CFG into topological order, a prerequisite for
// BLOCK and LOOP markers.
addPass(createWebAssemblyCFGSort());
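The StripThreadLocal helper added above is a legacy ModulePass that clears the thread-local mode from every global when the thread model is "single", since TLS variables then behave like ordinary globals. For comparison only, a rough sketch of the same transform written against LLVM's newer pass manager (the class name is illustrative and the pass is not part of this change):

    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"

    using namespace llvm;

    // Single-threaded wasm: thread-local variables are equivalent to
    // regular globals, so drop the TLS mode from every global variable.
    struct StripThreadLocalPass : PassInfoMixin<StripThreadLocalPass> {
      PreservedAnalyses run(Module &M, ModuleAnalysisManager &) {
        for (GlobalVariable &GV : M.globals())
          GV.setThreadLocalMode(GlobalValue::NotThreadLocal);
        return PreservedAnalyses::none();
      }
    };

Either way, the pass runs before atomic lowering in addIRPasses so that later passes never see TLS globals in single-threaded mode.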
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index dd826befd117..41001e7a0cc7 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the WebAssembly-specific subclass of
+/// This file declares the WebAssembly-specific subclass of
/// TargetMachine.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
index b1fd108bc249..0459bfca418d 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -8,20 +8,15 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the functions of the WebAssembly-specific subclass
+/// This file defines the functions of the WebAssembly-specific subclass
/// of TargetLoweringObjectFile.
///
//===----------------------------------------------------------------------===//
#include "WebAssemblyTargetObjectFile.h"
#include "WebAssemblyTargetMachine.h"
-using namespace llvm;
-void WebAssemblyTargetObjectFileELF::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
+using namespace llvm;
void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
index ace87c9e442f..ce744ba8b8e8 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file declares the WebAssembly-specific subclass of
+/// This file declares the WebAssembly-specific subclass of
/// TargetLoweringObjectFile.
///
//===----------------------------------------------------------------------===//
@@ -20,12 +20,6 @@
namespace llvm {
-class WebAssemblyTargetObjectFileELF final
- : public TargetLoweringObjectFileELF {
-public:
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
-};
-
class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileWasm {
public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 2e002781f43d..4a2777cc3a9f 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file defines the WebAssembly-specific TargetTransformInfo
+/// This file defines the WebAssembly-specific TargetTransformInfo
/// implementation.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 7b35fc916133..4300ca3defbf 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file a TargetTransformInfo::Concept conforming object specific
+/// This file provides a TargetTransformInfo::Concept conforming object specific
/// to the WebAssembly target machine.
///
/// It uses the target's detailed information to provide more precise answers to
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index e32772d491cf..5944cea5abd1 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements several utility functions for WebAssembly.
+/// This file implements several utility functions for WebAssembly.
///
//===----------------------------------------------------------------------===//
@@ -18,6 +18,13 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
+const char *const WebAssembly::ClangCallTerminateFn = "__clang_call_terminate";
+const char *const WebAssembly::CxaBeginCatchFn = "__cxa_begin_catch";
+const char *const WebAssembly::CxaRethrowFn = "__cxa_rethrow";
+const char *const WebAssembly::StdTerminateFn = "_ZSt9terminatev";
+const char *const WebAssembly::PersonalityWrapperFn =
+ "_Unwind_Wasm_CallPersonality";
+
bool WebAssembly::isArgument(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case WebAssembly::ARGUMENT_I32:
@@ -71,6 +78,24 @@ bool WebAssembly::isChild(const MachineInstr &MI,
MFI.isVRegStackified(Reg);
}
+bool WebAssembly::isCallDirect(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64:
+ case WebAssembly::CALL_F32:
+ case WebAssembly::CALL_F64:
+ case WebAssembly::CALL_v16i8:
+ case WebAssembly::CALL_v8i16:
+ case WebAssembly::CALL_v4i32:
+ case WebAssembly::CALL_v4f32:
+ case WebAssembly::CALL_EXCEPT_REF:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case WebAssembly::CALL_INDIRECT_VOID:
@@ -82,16 +107,136 @@ bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
case WebAssembly::CALL_INDIRECT_v8i16:
case WebAssembly::CALL_INDIRECT_v4i32:
case WebAssembly::CALL_INDIRECT_v4f32:
+ case WebAssembly::CALL_INDIRECT_EXCEPT_REF:
+ return true;
+ default:
+ return false;
+ }
+}
+
+unsigned WebAssembly::getCalleeOpNo(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_INDIRECT_VOID:
+ return 0;
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64:
+ case WebAssembly::CALL_F32:
+ case WebAssembly::CALL_F64:
+ case WebAssembly::CALL_EXCEPT_REF:
+ case WebAssembly::CALL_INDIRECT_I32:
+ case WebAssembly::CALL_INDIRECT_I64:
+ case WebAssembly::CALL_INDIRECT_F32:
+ case WebAssembly::CALL_INDIRECT_F64:
+ case WebAssembly::CALL_INDIRECT_EXCEPT_REF:
+ return 1;
+ default:
+ llvm_unreachable("Not a call instruction");
+ }
+}
+
+bool WebAssembly::isMarker(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::BLOCK:
+ case WebAssembly::END_BLOCK:
+ case WebAssembly::LOOP:
+ case WebAssembly::END_LOOP:
+ case WebAssembly::TRY:
+ case WebAssembly::END_TRY:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WebAssembly::isThrow(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::THROW_I32:
+ case WebAssembly::THROW_I64:
return true;
default:
return false;
}
}
-MachineBasicBlock *llvm::LoopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
+bool WebAssembly::isRethrow(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::RETHROW:
+ case WebAssembly::RETHROW_TO_CALLER:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WebAssembly::isCatch(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CATCH_I32:
+ case WebAssembly::CATCH_I64:
+ case WebAssembly::CATCH_ALL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WebAssembly::mayThrow(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::THROW_I32:
+ case WebAssembly::THROW_I64:
+ case WebAssembly::RETHROW:
+ return true;
+ }
+ if (isCallIndirect(MI))
+ return true;
+ if (!MI.isCall())
+ return false;
+
+ const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI));
+ assert(MO.isGlobal());
+ const auto *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ return true;
+ if (F->doesNotThrow())
+ return false;
+ // These functions never throw
+ if (F->getName() == CxaBeginCatchFn || F->getName() == PersonalityWrapperFn ||
+ F->getName() == ClangCallTerminateFn || F->getName() == StdTerminateFn)
+ return false;
+ return true;
+}
+
+bool WebAssembly::isCatchTerminatePad(const MachineBasicBlock &MBB) {
+ if (!MBB.isEHPad())
+ return false;
+ bool SeenCatch = false;
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == WebAssembly::CATCH_I32 ||
+ MI.getOpcode() == WebAssembly::CATCH_I64)
+ SeenCatch = true;
+ if (SeenCatch && MI.isCall()) {
+ const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI));
+ if (CalleeOp.isGlobal() &&
+ CalleeOp.getGlobal()->getName() == ClangCallTerminateFn)
+ return true;
+ }
+ }
+ return false;
+}
+
+bool WebAssembly::isCatchAllTerminatePad(const MachineBasicBlock &MBB) {
+ if (!MBB.isEHPad())
+ return false;
+ bool SeenCatchAll = false;
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == WebAssembly::CATCH_ALL)
+ SeenCatchAll = true;
+ if (SeenCatchAll && MI.isCall()) {
+ const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI));
+ if (CalleeOp.isGlobal() &&
+ CalleeOp.getGlobal()->getName() == StdTerminateFn)
+ return true;
+ }
+ }
+ return false;
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
index 595491f1bf5b..cdb7873e9013 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the WebAssembly-specific
+/// This file contains the declaration of the WebAssembly-specific
/// utility functions.
///
//===----------------------------------------------------------------------===//
@@ -16,11 +16,10 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
namespace llvm {
-class MachineBasicBlock;
-class MachineInstr;
-class MachineLoop;
class WebAssemblyFunctionInfo;
namespace WebAssembly {
@@ -29,14 +28,44 @@ bool isArgument(const MachineInstr &MI);
bool isCopy(const MachineInstr &MI);
bool isTee(const MachineInstr &MI);
bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+bool isCallDirect(const MachineInstr &MI);
bool isCallIndirect(const MachineInstr &MI);
+bool isMarker(const MachineInstr &MI);
+bool isThrow(const MachineInstr &MI);
+bool isRethrow(const MachineInstr &MI);
+bool isCatch(const MachineInstr &MI);
+bool mayThrow(const MachineInstr &MI);
-} // end namespace WebAssembly
+/// Returns the operand number of a callee, assuming the argument is a call
+/// instruction.
+unsigned getCalleeOpNo(const MachineInstr &MI);
+
+/// Returns if the given BB is a single BB terminate pad which starts with a
+/// 'catch' instruction.
+bool isCatchTerminatePad(const MachineBasicBlock &MBB);
+/// Returns if the given BB is a single BB terminate pad which starts with a
+/// 'catch_all' instruction.
+bool isCatchAllTerminatePad(const MachineBasicBlock &MBB);
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *LoopBottom(const MachineLoop *Loop);
+// Exception-related function names
+extern const char *const ClangCallTerminateFn;
+extern const char *const CxaBeginCatchFn;
+extern const char *const CxaRethrowFn;
+extern const char *const StdTerminateFn;
+extern const char *const PersonalityWrapperFn;
+
+/// Return the "bottom" block of an entity, which can be either a MachineLoop or
+/// WebAssemblyException. This differs from MachineLoop::getBottomBlock in that
+/// it works even if the entity is discontiguous.
+template <typename T> MachineBasicBlock *getBottom(const T *Unit) {
+ MachineBasicBlock *Bottom = Unit->getHeader();
+ for (MachineBasicBlock *MBB : Unit->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
+
+} // end namespace WebAssembly
} // end namespace llvm
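The getBottom template added above generalizes the old LoopBottom helper: any region type that exposes getHeader() and blocks() over numbered basic blocks qualifies, which is what lets the same helper serve both MachineLoop and the new WebAssemblyException. A small standalone sketch of the idea with mock types (these are illustrative stand-ins, not the LLVM classes):

    #include <vector>

    struct Block { int Number; };

    // Any "region" with a header block and a block list qualifies.
    struct Region {
      Block *Header;
      std::vector<Block *> Blocks;
      Block *getHeader() const { return Header; }
      const std::vector<Block *> &blocks() const { return Blocks; }
    };

    // The block with the highest number, which need not be the last one
    // listed if the region is discontiguous.
    template <typename T> Block *getBottom(const T *Unit) {
      Block *Bottom = Unit->getHeader();
      for (Block *B : Unit->blocks())
        if (B->Number > Bottom->Number)
          Bottom = B;
      return Bottom;
    }

    int main() {
      Block A{1}, B{3}, C{2};
      Region R{&A, {&A, &B, &C}};
      return getBottom(&R) == &B ? 0 : 1;
    }

Making it a template rather than adding a common base class keeps the two region types (loops and exceptions) independent while sharing the traversal.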
diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 2eb73befc50b..364c871f61b0 100644
--- a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -5,22 +5,22 @@
# they pass. (Known failures that do not run at all will not cause an
# error). The format is
# <name> <attributes> # comment
-#
-# The attributes in this case represent the different arguments used to
-# compiler: 'wasm-s' is for compiling to .s files, and 'wasm-o' for compiling
-# to wasm object files (.o).
# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
-20071220-1.c wasm-o,O0
+20071220-1.c
+20071220-2.c
20040302-1.c
20041214-1.c O0
20071210-1.c
-20071220-1.c wasm-s,O0
920501-4.c
920501-5.c
comp-goto-1.c
980526-1.c
990208-1.c
+label13.C O0
+label13a.C O0
+label3.C
+pr42462.C O0
# WebAssembly hasn't implemented (will never?) __builtin_return_address
20010122-1.c
@@ -76,6 +76,44 @@ pr41935.c
920728-1.c
pr28865.c
widechar-2.c
+attr-alias-1.C
+attr-alias-2.C
+attr-ifunc-1.C
+attr-ifunc-2.C
+attr-ifunc-3.C
+attr-ifunc-4.C
+complit12.C
+va-arg-pack-1.C
+va-arg-pack-len-1.C
+builtin-line1.C
+builtin-location.C
+devirt-6.C # bad main signature
+devirt-13.C # bad main signature
+devirt-14.C # bad main signature
+devirt-21.C # bad main signature
+devirt-23.C # bad main signature
+lifetime2.C # violates C++ DR1696
-# Untriaged: Assertion failure in WasmObjectWriter::applyRelocations
-20071220-2.c wasm-o,O0
+# Untriaged C++ failures
+spec5.C
+addr1.C
+ef_test.C
+friend18.C
+member2.C
+new39.C
+new40.C
+nrv8.C
+offsetof9.C
+opaque-1.C
+pr19650.C
+pr37146-1.C
+pr46149.C
+pr59470.C
+rtti2.C
+self1.C
+type-generic-1.C
+vbase8-10.C
+vbase8-21.C
+vbase8-22.C
+vbase8-4.C
+vector1.C
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index f2ffba7d5418..b84c2d31a63e 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -9,6 +9,7 @@
#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCExpr.h"
#include "MCTargetDesc/X86TargetStreamer.h"
#include "X86AsmInstrumentation.h"
#include "X86AsmParserCommon.h"
@@ -345,7 +346,7 @@ private:
public:
IntelExprStateMachine()
: State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0),
- TmpReg(0), Scale(1), Imm(0), Sym(nullptr), BracCount(0),
+ TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0),
MemExpr(false) {}
void addImm(int64_t imm) { Imm += imm; }
@@ -451,7 +452,7 @@ private:
IC.pushOperator(IC_PLUS);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
- // a scale of 1.
+ // no explicit scale.
if (!BaseReg) {
BaseReg = TmpReg;
} else {
@@ -460,7 +461,7 @@ private:
return true;
}
IndexReg = TmpReg;
- Scale = 1;
+ Scale = 0;
}
}
break;
@@ -504,7 +505,7 @@ private:
IC.pushOperator(IC_NEG);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
- // a scale of 1.
+ // no explicit scale.
if (!BaseReg) {
BaseReg = TmpReg;
} else {
@@ -513,7 +514,7 @@ private:
return true;
}
IndexReg = TmpReg;
- Scale = 1;
+ Scale = 0;
}
}
break;
@@ -736,13 +737,13 @@ private:
State = IES_RBRAC;
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
- // a scale of 1.
+ // no explicit scale.
if (!BaseReg) {
BaseReg = TmpReg;
} else {
assert (!IndexReg && "BaseReg/IndexReg already set!");
IndexReg = TmpReg;
- Scale = 1;
+ Scale = 0;
}
}
break;
@@ -825,7 +826,7 @@ private:
bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
- std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+ std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start);
bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
SMLoc End);
@@ -834,7 +835,7 @@ private:
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End);
- std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+ std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc MemStart);
bool ParseIntelMemoryOperandSize(unsigned &Size);
std::unique_ptr<X86Operand>
@@ -844,7 +845,6 @@ private:
const InlineAsmIdentifierInfo &Info);
bool parseDirectiveEven(SMLoc L);
- bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
/// CodeView FPO data directives.
@@ -943,6 +943,8 @@ public:
: MCTargetAsmParser(Options, sti, mii), InstInfo(nullptr),
Code16GCC(false) {
+ Parser.addAliasForDirective(".word", ".2byte");
+
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
Instrumentation.reset(
@@ -953,6 +955,8 @@ public:
void SetFrameRegister(unsigned RegNo) override;
+ bool parseAssignmentExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
+
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -968,27 +972,68 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, StringRef &ErrMsg) {
+ unsigned Scale, bool Is64BitMode,
+ StringRef &ErrMsg) {
// If we have both a base register and an index register make sure they are
// both 64-bit or 32-bit registers.
// To support VSIB, IndexReg can be 128-bit or 256-bit registers.
- if ((BaseReg == X86::RIP && IndexReg != 0) || (IndexReg == X86::RIP)) {
+ if (BaseReg != 0 &&
+ !(BaseReg == X86::RIP || BaseReg == X86::EIP ||
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg))) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
+
+ if (IndexReg != 0 &&
+ !(IndexReg == X86::EIZ || IndexReg == X86::RIZ ||
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR128XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(IndexReg))) {
ErrMsg = "invalid base+index expression";
return true;
}
+
+ if (((BaseReg == X86::RIP || BaseReg == X86::EIP) && IndexReg != 0) ||
+ IndexReg == X86::EIP || IndexReg == X86::RIP ||
+ IndexReg == X86::ESP || IndexReg == X86::RSP) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
+
+ // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+ // and then only in non-64-bit modes.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ (Is64BitMode || (BaseReg != X86::BX && BaseReg != X86::BP &&
+ BaseReg != X86::SI && BaseReg != X86::DI)) &&
+ BaseReg != X86::DX) {
+ ErrMsg = "invalid 16-bit base register";
+ return true;
+ }
+
+ if (BaseReg == 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+ ErrMsg = "16-bit memory operand may not include only index register";
+ return true;
+ }
+
if (BaseReg != 0 && IndexReg != 0) {
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
(X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
- X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
- IndexReg != X86::RIZ) {
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ IndexReg == X86::EIZ)) {
ErrMsg = "base register is 64-bit, but index register is not";
return true;
}
if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
(X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
- X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
- IndexReg != X86::EIZ){
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+ IndexReg == X86::RIZ)) {
ErrMsg = "base register is 32-bit, but index register is not";
return true;
}
@@ -998,15 +1043,21 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
ErrMsg = "base register is 16-bit, but index register is not";
return true;
}
- if (((BaseReg == X86::BX || BaseReg == X86::BP) &&
- IndexReg != X86::SI && IndexReg != X86::DI) ||
- ((BaseReg == X86::SI || BaseReg == X86::DI) &&
- IndexReg != X86::BX && IndexReg != X86::BP)) {
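+ // Only the canonical BX/BP base with SI/DI index order is accepted here;
+ // the Intel parser swaps a reversed pair before calling this check.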
+ if ((BaseReg != X86::BX && BaseReg != X86::BP) ||
+ (IndexReg != X86::SI && IndexReg != X86::DI)) {
ErrMsg = "invalid 16-bit base/index register combination";
return true;
}
}
}
+
+ // RIP/EIP-relative addressing is only supported in 64-bit mode.
+ if (!Is64BitMode && BaseReg != 0 &&
+ (BaseReg == X86::RIP || BaseReg == X86::EIP)) {
+ ErrMsg = "RIP-relative addressing requires 64-bit mode";
+ return true;
+ }
+
return checkScale(Scale, ErrMsg);
}
@@ -1048,18 +1099,13 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
// checked.
// FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
// REX prefix.
- if (RegNo == X86::RIZ ||
+ if (RegNo == X86::RIZ || RegNo == X86::RIP || RegNo == X86::EIP ||
X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
X86II::isX86_64NonExtLowByteReg(RegNo) ||
X86II::isX86_64ExtendedReg(RegNo))
return Error(StartLoc, "register %"
+ Tok.getString() + " is only available in 64-bit mode",
SMRange(StartLoc, EndLoc));
- } else if (!getSTI().getFeatureBits()[X86::FeatureAVX512]) {
- if (X86II::is32ExtendedReg(RegNo))
- return Error(StartLoc, "register %"
- + Tok.getString() + " is only available with AVX512",
- SMRange(StartLoc, EndLoc));
}
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
@@ -1388,6 +1434,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (ParseIntelDotOperator(SM, End))
return true;
break;
+ case AsmToken::At:
case AsmToken::String:
case AsmToken::Identifier: {
SMLoc IdentLoc = Tok.getLoc();
@@ -1395,7 +1442,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
UpdateLocLex = false;
// Register
unsigned Reg;
- if (Tok.isNot(AsmToken::String) && !ParseRegister(Reg, IdentLoc, End)) {
+ if (Tok.is(AsmToken::Identifier) && !ParseRegister(Reg, IdentLoc, End)) {
if (SM.onRegister(Reg, ErrMsg))
return Error(Tok.getLoc(), ErrMsg);
break;
@@ -1433,6 +1480,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
break;
}
// MS InlineAsm identifier
+ // Call parseIdentifier() to combine '@' with the identifier that follows it.
+ if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
+ return Error(IdentLoc, "expected identifier");
if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
return true;
else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
@@ -1595,7 +1645,7 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val,
//ParseRoundingModeOp - Parse AVX-512 rounding mode operand
std::unique_ptr<X86Operand>
-X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
+X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
// Eat "{" and mark the current place.
@@ -1616,6 +1666,7 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
Parser.Lex(); // Eat the sae
if (!getLexer().is(AsmToken::RCurly))
return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+ SMLoc End = Tok.getEndLoc();
Parser.Lex(); // Eat "}"
const MCExpr *RndModeOp =
MCConstantExpr::create(rndMode, Parser.getContext());
@@ -1760,7 +1811,6 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
.Cases("XMMWORD", "xmmword", 128)
.Cases("YMMWORD", "ymmword", 256)
.Cases("ZMMWORD", "zmmword", 512)
- .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
.Default(0);
if (Size) {
const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
@@ -1792,9 +1842,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
Start = Tok.getLoc();
// Rounding mode operand.
- if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
- getLexer().is(AsmToken::LCurly))
- return ParseRoundingModeOp(Start, End);
+ if (getLexer().is(AsmToken::LCurly))
+ return ParseRoundingModeOp(Start);
// Register operand.
unsigned RegNo = 0;
@@ -1839,8 +1888,39 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
unsigned IndexReg = SM.getIndexReg();
unsigned Scale = SM.getScale();
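+ // ESP/RSP can never be encoded as an index register; if one was parsed as
+ // the index and no scale was given, treat it as the base instead.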
+ if (Scale == 0 && BaseReg != X86::ESP && BaseReg != X86::RSP &&
+ (IndexReg == X86::ESP || IndexReg == X86::RSP))
+ std::swap(BaseReg, IndexReg);
+
+ // If BaseReg is a vector register and IndexReg is not, swap them unless a
+ // scale was specified, in which case swapping would be an error.
+ if (Scale == 0 &&
+ !(X86MCRegisterClasses[X86::VR128XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(IndexReg)) &&
+ (X86MCRegisterClasses[X86::VR128XRegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(BaseReg)))
+ std::swap(BaseReg, IndexReg);
+
+ if (Scale != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))
+ return ErrorOperand(Start, "16-bit addresses cannot have a scale");
+
+ // If there was no explicit scale specified, change it to 1.
+ if (Scale == 0)
+ Scale = 1;
+
+ // If this is a 16-bit addressing mode with the base and index in the wrong
+ // order, swap them so CheckBaseRegAndIndexRegAndScale doesn't fail. The
+ // check is shared with AT&T syntax, where operand order matters.
+ if ((BaseReg == X86::SI || BaseReg == X86::DI) &&
+ (IndexReg == X86::BX || IndexReg == X86::BP))
+ std::swap(BaseReg, IndexReg);
+
if ((BaseReg || IndexReg) &&
- CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg))
+ CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
+ ErrMsg))
return ErrorOperand(Start, ErrMsg);
if (isParsingInlineAsm())
return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg,
@@ -1895,10 +1975,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
return X86Operand::CreateImm(Val, Start, End);
}
case AsmToken::LCurly:{
- SMLoc Start = Parser.getTok().getLoc(), End;
- if (getSTI().getFeatureBits()[X86::FeatureAVX512])
- return ParseRoundingModeOp(Start, End);
- return ErrorOperand(Start, "Unexpected '{' in expression");
+ SMLoc Start = Parser.getTok().getLoc();
+ return ParseRoundingModeOp(Start);
}
}
}
@@ -1928,82 +2006,80 @@ bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
const MCParsedAsmOperand &Op) {
MCAsmParser &Parser = getParser();
- if(getSTI().getFeatureBits()[X86::FeatureAVX512]) {
- if (getLexer().is(AsmToken::LCurly)) {
- // Eat "{" and mark the current place.
- const SMLoc consumedToken = consumeToken();
- // Distinguish {1to<NUM>} from {%k<NUM>}.
- if(getLexer().is(AsmToken::Integer)) {
- // Parse memory broadcasting ({1to<NUM>}).
- if (getLexer().getTok().getIntVal() != 1)
- return TokError("Expected 1to<NUM> at this point");
- Parser.Lex(); // Eat "1" of 1to8
- if (!getLexer().is(AsmToken::Identifier) ||
- !getLexer().getTok().getIdentifier().startswith("to"))
- return TokError("Expected 1to<NUM> at this point");
- // Recognize only reasonable suffixes.
- const char *BroadcastPrimitive =
- StringSwitch<const char*>(getLexer().getTok().getIdentifier())
- .Case("to2", "{1to2}")
- .Case("to4", "{1to4}")
- .Case("to8", "{1to8}")
- .Case("to16", "{1to16}")
- .Default(nullptr);
- if (!BroadcastPrimitive)
- return TokError("Invalid memory broadcast primitive.");
- Parser.Lex(); // Eat "toN" of 1toN
- if (!getLexer().is(AsmToken::RCurly))
- return TokError("Expected } at this point");
- Parser.Lex(); // Eat "}"
- Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
- consumedToken));
- // No AVX512 specific primitives can pass
- // after memory broadcasting, so return.
- return false;
- } else {
- // Parse either {k}{z}, {z}{k}, {k} or {z}
- // last one have no meaning, but GCC accepts it
- // Currently, we're just pass a '{' mark
- std::unique_ptr<X86Operand> Z;
- if (ParseZ(Z, consumedToken))
- return true;
- // Reaching here means that parsing of the allegadly '{z}' mark yielded
- // no errors.
- // Query for the need of further parsing for a {%k<NUM>} mark
- if (!Z || getLexer().is(AsmToken::LCurly)) {
- SMLoc StartLoc = Z ? consumeToken() : consumedToken;
- // Parse an op-mask register mark ({%k<NUM>}), which is now to be
- // expected
- unsigned RegNo;
- SMLoc RegLoc;
- if (!ParseRegister(RegNo, RegLoc, StartLoc) &&
- X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) {
- if (RegNo == X86::K0)
- return Error(RegLoc, "Register k0 can't be used as write mask");
- if (!getLexer().is(AsmToken::RCurly))
- return Error(getLexer().getLoc(), "Expected } at this point");
- Operands.push_back(X86Operand::CreateToken("{", StartLoc));
- Operands.push_back(
- X86Operand::CreateReg(RegNo, StartLoc, StartLoc));
- Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
- } else
+ if (getLexer().is(AsmToken::LCurly)) {
+ // Eat "{" and mark the current place.
+ const SMLoc consumedToken = consumeToken();
+ // Distinguish {1to<NUM>} from {%k<NUM>}.
+ if(getLexer().is(AsmToken::Integer)) {
+ // Parse memory broadcasting ({1to<NUM>}).
+ if (getLexer().getTok().getIntVal() != 1)
+ return TokError("Expected 1to<NUM> at this point");
+ Parser.Lex(); // Eat "1" of 1to8
+ if (!getLexer().is(AsmToken::Identifier) ||
+ !getLexer().getTok().getIdentifier().startswith("to"))
+ return TokError("Expected 1to<NUM> at this point");
+ // Recognize only reasonable suffixes.
+ const char *BroadcastPrimitive =
+ StringSwitch<const char*>(getLexer().getTok().getIdentifier())
+ .Case("to2", "{1to2}")
+ .Case("to4", "{1to4}")
+ .Case("to8", "{1to8}")
+ .Case("to16", "{1to16}")
+ .Default(nullptr);
+ if (!BroadcastPrimitive)
+ return TokError("Invalid memory broadcast primitive.");
+ Parser.Lex(); // Eat "toN" of 1toN
+ if (!getLexer().is(AsmToken::RCurly))
+ return TokError("Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
+ consumedToken));
+ // No AVX512 specific primitives can pass
+ // after memory broadcasting, so return.
+ return false;
+ } else {
+ // Parse either {k}{z}, {z}{k}, {k} or {z}
+ // the last one has no meaning, but GCC accepts it.
+ // Currently, we just pass the '{' mark along.
+ std::unique_ptr<X86Operand> Z;
+ if (ParseZ(Z, consumedToken))
+ return true;
+ // Reaching here means that parsing of the alleged '{z}' mark yielded
+ // no errors.
+ // Query for the need of further parsing for a {%k<NUM>} mark
+ if (!Z || getLexer().is(AsmToken::LCurly)) {
+ SMLoc StartLoc = Z ? consumeToken() : consumedToken;
+ // Parse an op-mask register mark ({%k<NUM>}), which is now to be
+ // expected
+ unsigned RegNo;
+ SMLoc RegLoc;
+ if (!ParseRegister(RegNo, RegLoc, StartLoc) &&
+ X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) {
+ if (RegNo == X86::K0)
+ return Error(RegLoc, "Register k0 can't be used as write mask");
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(getLexer().getLoc(), "Expected } at this point");
+ Operands.push_back(X86Operand::CreateToken("{", StartLoc));
+ Operands.push_back(
+ X86Operand::CreateReg(RegNo, StartLoc, StartLoc));
+ Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+ } else
+ return Error(getLexer().getLoc(),
+ "Expected an op-mask register at this point");
+ // {%k<NUM>} mark is found, inquire for {z}
+ if (getLexer().is(AsmToken::LCurly) && !Z) {
+ // If we've found a parsing error, or found no (expected) {z} mark, then
+ // report an error.
+ if (ParseZ(Z, consumeToken()) || !Z)
return Error(getLexer().getLoc(),
- "Expected an op-mask register at this point");
- // {%k<NUM>} mark is found, inquire for {z}
- if (getLexer().is(AsmToken::LCurly) && !Z) {
- // Have we've found a parsing error, or found no (expected) {z} mark
- // - report an error
- if (ParseZ(Z, consumeToken()) || !Z)
- return Error(getLexer().getLoc(),
- "Expected a {z} mark at this point");
+ "Expected a {z} mark at this point");
- }
- // '{z}' on its own is meaningless, hence should be ignored.
- // on the contrary - have it been accompanied by a K register,
- // allow it.
- if (Z)
- Operands.push_back(std::move(Z));
}
+ // '{z}' on its own is meaningless, hence should be ignored.
+ // On the contrary, if it is accompanied by a K register, allow it.
+ if (Z)
+ Operands.push_back(std::move(Z));
}
}
}
@@ -2024,6 +2100,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
if (getLexer().isNot(AsmToken::LParen)) {
SMLoc ExprEnd;
if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
+ // Disp may be a variable that was assigned a register value; handle that here.
+ if (auto *RE = dyn_cast<X86MCExpr>(Disp))
+ return X86Operand::CreateReg(RE->getRegNo(), MemStart, ExprEnd);
// After parsing the base expression we could either have a parenthesized
// memory address or not. If not, return now. If so, eat the (.
@@ -2114,12 +2193,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
if (getLexer().isNot(AsmToken::RParen)) {
// Parse the scale amount:
// ::= ',' [scale-expression]
- if (getLexer().isNot(AsmToken::Comma)) {
- Error(Parser.getTok().getLoc(),
- "expected comma in scale expression");
+ if (parseToken(AsmToken::Comma, "expected comma in scale expression"))
return nullptr;
- }
- Parser.Lex(); // Eat the comma.
if (getLexer().isNot(AsmToken::RParen)) {
SMLoc Loc = Parser.getTok().getLoc();
@@ -2160,31 +2235,21 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
}
// Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
- return nullptr;
- }
SMLoc MemEnd = Parser.getTok().getEndLoc();
- Parser.Lex(); // Eat the ')'.
-
- // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
- // and then only in non-64-bit modes. Except for DX, which is a special case
- // because an unofficial form of in/out instructions uses it.
- if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
- (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP &&
- BaseReg != X86::SI && BaseReg != X86::DI)) &&
- BaseReg != X86::DX) {
- Error(BaseLoc, "invalid 16-bit base register");
- return nullptr;
- }
- if (BaseReg == 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
- Error(IndexLoc, "16-bit memory operand may not include only index register");
+ if (parseToken(AsmToken::RParen, "unexpected token in memory operand"))
return nullptr;
- }
+
+ // This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
+ // "outb %al, %dx". Out doesn't take a memory form, but this is a widely
+ // documented form in various unofficial manuals, so a lot of code uses it.
+ if (BaseReg == X86::DX && IndexReg == 0 && Scale == 1 &&
+ SegReg == 0 && isa<MCConstantExpr>(Disp) &&
+ cast<MCConstantExpr>(Disp)->getValue() == 0)
+ return X86Operand::CreateDXReg(BaseLoc, BaseLoc);
StringRef ErrMsg;
- if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg)) {
+ if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
+ ErrMsg)) {
Error(BaseLoc, ErrMsg);
return nullptr;
}
@@ -2195,6 +2260,25 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd);
}
+// Parse either a standard expression or a register.
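+// This lets assembler assignments (e.g. '.set sym, %reg') bind a register to
+// a symbol: the register is wrapped in an X86MCExpr so that consumers such as
+// ParseMemOperand can recover it.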
+bool X86AsmParser::parseAssignmentExpression(const MCExpr *&Res,
+ SMLoc &EndLoc) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.parseExpression(Res, EndLoc)) {
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ // The normal expression parse failed; check whether it could be a register.
+ unsigned RegNo;
+ if (Parser.getTargetParser().ParseRegister(RegNo, StartLoc, EndLoc))
+ return true;
+ // Clear previous parse error and return correct expression.
+ Parser.clearPendingErrors();
+ Res = X86MCExpr::create(RegNo, Parser.getContext());
+ return false;
+ }
+
+ return false;
+}
+
bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
@@ -2358,21 +2442,22 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Cases("acquire", "release", isParsingIntelSyntax())
.Default(false);
- auto isLockRepeatPrefix = [](StringRef N) {
+ auto isLockRepeatNtPrefix = [](StringRef N) {
return StringSwitch<bool>(N)
- .Cases("lock", "rep", "repe", "repz", "repne", "repnz", true)
+ .Cases("lock", "rep", "repe", "repz", "repne", "repnz", "notrack", true)
.Default(false);
};
bool CurlyAsEndOfStatement = false;
unsigned Flags = X86::IP_NO_PREFIX;
- while (isLockRepeatPrefix(Name.lower())) {
+ while (isLockRepeatNtPrefix(Name.lower())) {
unsigned Prefix =
StringSwitch<unsigned>(Name)
.Cases("lock", "lock", X86::IP_HAS_LOCK)
.Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT)
.Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
+ .Cases("notrack", "notrack", X86::IP_HAS_NOTRACK)
.Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
Flags |= Prefix;
if (getLexer().is(AsmToken::EndOfStatement)) {
@@ -2396,6 +2481,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
if (Flags)
PatchedName = Name;
+
+ // Hacks to handle 'data16' and 'data32'
+ if (PatchedName == "data16" && is16BitMode()) {
+ return Error(NameLoc, "redundant data16 prefix");
+ }
+ if (PatchedName == "data32") {
+ if (is32BitMode())
+ return Error(NameLoc, "redundant data32 prefix");
+ if (is64BitMode())
+ return Error(NameLoc, "'data32' is not supported in 64-bit mode");
+ // Map it to 'data16' for the table lookup.
+ PatchedName = "data16";
+ }
+
Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
// This does the actual operand parsing. Don't parse any more if we have a
@@ -2430,7 +2529,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
(getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
return TokError("unexpected token in argument list");
- }
+ }
// Consume the EndOfStatement or the prefix separator Slash
if (getLexer().is(AsmToken::EndOfStatement) ||
@@ -2486,26 +2585,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Name == "outl" || Name == "outsl" || Name == "out" || Name == "outs") &&
Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands.back();
- if (Op.isMem() && Op.Mem.SegReg == 0 &&
- isa<MCConstantExpr>(Op.Mem.Disp) &&
- cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
- Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
- SMLoc Loc = Op.getEndLoc();
- Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- }
+ if (Op.isDXReg())
+ Operands.back() = X86Operand::CreateReg(X86::DX, Op.getStartLoc(),
+ Op.getEndLoc());
}
// Same hack for "in[s]?[bwl]? (%dx), %al" -> "inb %dx, %al".
if ((Name == "inb" || Name == "insb" || Name == "inw" || Name == "insw" ||
Name == "inl" || Name == "insl" || Name == "in" || Name == "ins") &&
Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands[1];
- if (Op.isMem() && Op.Mem.SegReg == 0 &&
- isa<MCConstantExpr>(Op.Mem.Disp) &&
- cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
- Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
- SMLoc Loc = Op.getEndLoc();
- Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- }
+ if (Op.isDXReg())
+ Operands[1] = X86Operand::CreateReg(X86::DX, Op.getStartLoc(),
+ Op.getEndLoc());
}
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 2> TmpOperands;
@@ -2710,6 +2801,39 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
"should be distinct");
break;
}
+ case X86::V4FMADDPSrm:
+ case X86::V4FMADDPSrmk:
+ case X86::V4FMADDPSrmkz:
+ case X86::V4FMADDSSrm:
+ case X86::V4FMADDSSrmk:
+ case X86::V4FMADDSSrmkz:
+ case X86::V4FNMADDPSrm:
+ case X86::V4FNMADDPSrmk:
+ case X86::V4FNMADDPSrmkz:
+ case X86::V4FNMADDSSrm:
+ case X86::V4FNMADDSSrmk:
+ case X86::V4FNMADDSSrmkz:
+ case X86::VP4DPWSSDSrm:
+ case X86::VP4DPWSSDSrmk:
+ case X86::VP4DPWSSDSrmkz:
+ case X86::VP4DPWSSDrm:
+ case X86::VP4DPWSSDrmk:
+ case X86::VP4DPWSSDrmkz: {
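+ // These instructions implicitly read a group of four consecutive registers
+ // starting at Src2, so the first register's encoding should be a multiple
+ // of 4; warn when it is not.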
+ unsigned Src2 = Inst.getOperand(Inst.getNumOperands() -
+ X86::AddrNumOperands - 1).getReg();
+ unsigned Src2Enc = MRI->getEncodingValue(Src2);
+ if (Src2Enc % 4 != 0) {
+ StringRef RegName = X86IntelInstPrinter::getRegisterName(Src2);
+ unsigned GroupStart = (Src2Enc / 4) * 4;
+ unsigned GroupEnd = GroupStart + 3;
+ return Warning(Ops[0]->getStartLoc(),
+ "source register '" + RegName + "' implicitly denotes '" +
+ RegName.take_front(3) + Twine(GroupStart) + "' to '" +
+ RegName.take_front(3) + Twine(GroupEnd) +
+ "' source group");
+ }
+ break;
+ }
}
return false;
@@ -3153,9 +3277,7 @@ bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
MCAsmParser &Parser = getParser();
StringRef IDVal = DirectiveID.getIdentifier();
- if (IDVal == ".word")
- return ParseDirectiveWord(2, DirectiveID.getLoc());
- else if (IDVal.startswith(".code"))
+ if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
else if (IDVal.startswith(".att_syntax")) {
getParser().setParsingInlineAsm(false);
@@ -3202,10 +3324,9 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
/// parseDirectiveEven
/// ::= .even
bool X86AsmParser::parseDirectiveEven(SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- TokError("unexpected token in directive");
- return false;
- }
+ if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+ return false;
+
const MCSection *Section = getStreamer().getCurrentSectionOnly();
if (!Section) {
getStreamer().InitSections(false);
@@ -3217,42 +3338,6 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
getStreamer().EmitValueToAlignment(2, 0, 1, 0);
return false;
}
-/// ParseDirectiveWord
-/// ::= .word [ expression (, expression)* ]
-bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
- MCAsmParser &Parser = getParser();
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
- const MCExpr *Value;
- SMLoc ExprLoc = getLexer().getLoc();
- if (getParser().parseExpression(Value))
- return false;
-
- if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
- assert(Size <= 8 && "Invalid size");
- uint64_t IntValue = MCE->getValue();
- if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
- return Error(ExprLoc, "literal value out of range for directive");
- getStreamer().EmitIntValue(IntValue, Size);
- } else {
- getStreamer().EmitValue(Value, Size, ExprLoc);
- }
-
- if (getLexer().is(AsmToken::EndOfStatement))
- break;
-
- // FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma)) {
- Error(L, "unexpected token in directive");
- return false;
- }
- Parser.Lex();
- }
- }
-
- Parser.Lex();
- return false;
-}
/// ParseDirectiveCode
/// ::= .code16 | .code32 | .code64
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 43a0561e769b..4d4aae0a1c6a 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86AsmParserCommon.h"
#include "llvm/ADT/STLExtras.h"
@@ -28,8 +29,8 @@ namespace llvm {
/// X86Operand - Instances of this class represent a parsed X86 machine
/// instruction.
-struct X86Operand : public MCParsedAsmOperand {
- enum KindTy { Token, Register, Immediate, Memory, Prefix } Kind;
+struct X86Operand final : public MCParsedAsmOperand {
+ enum KindTy { Token, Register, Immediate, Memory, Prefix, DXRegister } Kind;
SMLoc StartLoc, EndLoc;
SMLoc OffsetOfLoc;
@@ -77,7 +78,7 @@ struct X86Operand : public MCParsedAsmOperand {
};
X86Operand(KindTy K, SMLoc Start, SMLoc End)
- : Kind(K), StartLoc(Start), EndLoc(End) {}
+ : Kind(K), StartLoc(Start), EndLoc(End) {}
StringRef getSymName() override { return SymName; }
void *getOpDecl() override { return OpDecl; }
@@ -95,7 +96,55 @@ struct X86Operand : public MCParsedAsmOperand {
/// getOffsetOfLoc - Get the location of the offset operator.
SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
- void print(raw_ostream &OS) const override {}
+ void print(raw_ostream &OS) const override {
+
+ auto PrintImmValue = [&](const MCExpr *Val, const char *VName) {
+ if (Val->getKind() == MCExpr::Constant) {
+ if (auto Imm = cast<MCConstantExpr>(Val)->getValue())
+ OS << VName << Imm;
+ } else if (Val->getKind() == MCExpr::SymbolRef) {
+ if (auto *SRE = dyn_cast<MCSymbolRefExpr>(Val)) {
+ const MCSymbol &Sym = SRE->getSymbol();
+ if (auto SymName = Sym.getName().data())
+ OS << VName << SymName;
+ }
+ }
+ };
+
+ switch (Kind) {
+ case Token:
+ OS << Tok.Data;
+ break;
+ case Register:
+ OS << "Reg:" << X86IntelInstPrinter::getRegisterName(Reg.RegNo);
+ break;
+ case DXRegister:
+ OS << "DXReg";
+ break;
+ case Immediate:
+ PrintImmValue(Imm.Val, "Imm:");
+ break;
+ case Prefix:
+ OS << "Prefix:" << Pref.Prefixes;
+ break;
+ case Memory:
+ OS << "Memory: ModeSize=" << Mem.ModeSize;
+ if (Mem.Size)
+ OS << ",Size=" << Mem.Size;
+ if (Mem.BaseReg)
+ OS << ",BaseReg=" << X86IntelInstPrinter::getRegisterName(Mem.BaseReg);
+ if (Mem.IndexReg)
+ OS << ",IndexReg="
+ << X86IntelInstPrinter::getRegisterName(Mem.IndexReg);
+ if (Mem.Scale)
+ OS << ",Scale=" << Mem.Scale;
+ if (Mem.Disp)
+ PrintImmValue(Mem.Disp, ",Disp=");
+ if (Mem.SegReg)
+ OS << ",SegReg=" << X86IntelInstPrinter::getRegisterName(Mem.SegReg);
+ break;
+ }
+ }
StringRef getToken() const {
assert(Kind == Token && "Invalid access!");
@@ -395,6 +444,7 @@ struct X86Operand : public MCParsedAsmOperand {
bool isPrefix() const { return Kind == Prefix; }
bool isReg() const override { return Kind == Register; }
+ bool isDXReg() const { return Kind == DXRegister; }
bool isGR32orGR64() const {
return Kind == Register &&
@@ -415,34 +465,11 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(getReg()));
}
- static unsigned getGR32FromGR64(unsigned RegNo) {
- switch (RegNo) {
- default: llvm_unreachable("Unexpected register");
- case X86::RAX: return X86::EAX;
- case X86::RCX: return X86::ECX;
- case X86::RDX: return X86::EDX;
- case X86::RBX: return X86::EBX;
- case X86::RBP: return X86::EBP;
- case X86::RSP: return X86::ESP;
- case X86::RSI: return X86::ESI;
- case X86::RDI: return X86::EDI;
- case X86::R8: return X86::R8D;
- case X86::R9: return X86::R9D;
- case X86::R10: return X86::R10D;
- case X86::R11: return X86::R11D;
- case X86::R12: return X86::R12D;
- case X86::R13: return X86::R13D;
- case X86::R14: return X86::R14D;
- case X86::R15: return X86::R15D;
- case X86::RIP: return X86::EIP;
- }
- }
-
void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
unsigned RegNo = getReg();
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
- RegNo = getGR32FromGR64(RegNo);
+ RegNo = getX86SubSuperRegister(RegNo, 32);
Inst.addOperand(MCOperand::createReg(RegNo));
}
@@ -517,6 +544,11 @@ struct X86Operand : public MCParsedAsmOperand {
}
static std::unique_ptr<X86Operand>
+ CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) {
+ return llvm::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc);
+ }
+
+ static std::unique_ptr<X86Operand>
CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) {
auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
Res->Pref.Prefixes = Prefixes;
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index b3c491b3de5e..62312777318e 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -103,7 +103,7 @@ StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode,
return MII->getName(Opcode);
}
-#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
+#define debug(s) LLVM_DEBUG(Debug(__FILE__, __LINE__, s));
namespace llvm {
@@ -247,6 +247,8 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
// It should not be 'pause' f3 90
InternalInstr.opcode != 0x90)
Flags |= X86::IP_HAS_REPEAT;
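+ // Carry an explicit lock prefix through to the MCInst flags.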
+ if (InternalInstr.hasLockPrefix)
+ Flags |= X86::IP_HAS_LOCK;
}
Instr.setFlags(Flags);
}
@@ -661,8 +663,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case TYPE_ZMM:
mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
return;
- case TYPE_BNDR:
- mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
default:
// operand is 64 bits wide. Do nothing.
break;
@@ -758,7 +758,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
#undef ENTRY
}
} else {
- baseReg = MCOperand::createReg(0);
+ baseReg = MCOperand::createReg(X86::NoRegister);
}
if (insn.sibIndex != SIB_INDEX_NONE) {
@@ -777,7 +777,22 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
#undef ENTRY
}
} else {
- indexReg = MCOperand::createReg(0);
+ // Use EIZ/RIZ for a few ambiguous cases where the SIB byte is present,
+ // but no index is used and modrm alone should have been enough.
+ // -No base register in 32-bit mode. In 64-bit mode this is used to
+ // avoid rip-relative addressing.
+ // -Any base register used other than ESP/RSP/R12D/R12. Using these as a
+ // base always requires a SIB byte.
+ // -A scale other than 1 is used.
+ if (insn.sibScale != 1 ||
+ (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
+ (insn.sibBase != SIB_BASE_NONE &&
+ insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
+ insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12)) {
+ indexReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIZ :
+ X86::RIZ);
+ } else
+ indexReg = MCOperand::createReg(X86::NoRegister);
}
scaleAmount = MCOperand::createImm(insn.sibScale);
@@ -794,12 +809,14 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
tryAddingPcLoadReferenceComment(insn.startLocation +
insn.displacementOffset,
insn.displacement + pcrel, Dis);
- baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6
+ // Section 2.2.1.6
+ baseReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIP :
+ X86::RIP);
}
else
- baseReg = MCOperand::createReg(0);
+ baseReg = MCOperand::createReg(X86::NoRegister);
- indexReg = MCOperand::createReg(0);
+ indexReg = MCOperand::createReg(X86::NoRegister);
break;
case EA_BASE_BX_SI:
baseReg = MCOperand::createReg(X86::BX);
@@ -818,7 +835,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
indexReg = MCOperand::createReg(X86::DI);
break;
default:
- indexReg = MCOperand::createReg(0);
+ indexReg = MCOperand::createReg(X86::NoRegister);
switch (insn.eaBase) {
default:
debug("Unexpected eaBase");
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 843d037ad3cd..1ac304f3be03 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -103,6 +103,9 @@ static int modRMRequired(OpcodeType type,
case XOPA_MAP:
decision = &XOPA_MAP_SYM;
break;
+ case THREEDNOW_MAP:
+ decision = &THREEDNOW_MAP_SYM;
+ break;
}
return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
@@ -147,6 +150,9 @@ static InstrUID decode(OpcodeType type,
case XOPA_MAP:
dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
+ case THREEDNOW_MAP:
+ dec = &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
}
switch (dec->modrm_type) {
@@ -292,6 +298,9 @@ static bool isREX(struct InternalInstruction *insn, uint8_t prefix) {
static void setPrefixPresent(struct InternalInstruction *insn, uint8_t prefix) {
uint8_t nextByte;
switch (prefix) {
+ case 0xf0:
+ insn->hasLockPrefix = true;
+ break;
case 0xf2:
case 0xf3:
if (lookAtByte(insn, &nextByte))
@@ -623,6 +632,8 @@ static int readPrefixes(struct InternalInstruction* insn) {
return 0;
}
+static int readModRM(struct InternalInstruction* insn);
+
/*
* readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
* extended or escape opcodes).
@@ -715,6 +726,17 @@ static int readOpcode(struct InternalInstruction* insn) {
return -1;
insn->opcodeType = THREEBYTE_3A;
+ } else if (current == 0x0f) {
+ dbgprintf(insn, "Found a 3dnow escape prefix (0x%hhx)", current);
+
+ // Consume operands before the opcode to comply with the 3DNow encoding
+ if (readModRM(insn))
+ return -1;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEDNOW_MAP;
} else {
dbgprintf(insn, "Didn't find a three-byte escape prefix");
@@ -735,8 +757,6 @@ static int readOpcode(struct InternalInstruction* insn) {
return 0;
}
-static int readModRM(struct InternalInstruction* insn);
-
/*
* getIDWithAttrMask - Determines the ID of an instruction, consuming
* the ModR/M byte as appropriate for extended and escape opcodes,
@@ -947,6 +967,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
attrMask |= ATTR_ADSIZE;
break;
}
+
}
if (insn->rexPrefix & 0x08) {
@@ -1039,13 +1060,15 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
}
/*
- * Absolute moves need special handling.
+ * Absolute moves, umonitor, and movdir64b need special handling.
* -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are
* inverted w.r.t.
* -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
* any position.
*/
- if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
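+ /* TWOBYTE 0xAE covers umonitor; THREEBYTE_38 0xF8 covers movdir64b. */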
+ if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) ||
+ (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) ||
+ (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) {
/* Make sure we observed the prefixes in any position. */
if (insn->hasAdSize)
attrMask |= ATTR_ADSIZE;
@@ -1053,8 +1076,13 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
attrMask |= ATTR_OPSIZE;
/* In 16-bit, invert the attributes. */
- if (insn->mode == MODE_16BIT)
- attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+ if (insn->mode == MODE_16BIT) {
+ attrMask ^= ATTR_ADSIZE;
+
+ /* The OpSize attribute is only valid with the absolute moves. */
+ if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0))
+ attrMask ^= ATTR_OPSIZE;
+ }
if (getIDWithAttrMask(&instructionID, insn, attrMask))
return -1;
@@ -1279,7 +1307,7 @@ static int readDisplacement(struct InternalInstruction* insn) {
* @return - 0 if the information was successfully read; nonzero otherwise.
*/
static int readModRM(struct InternalInstruction* insn) {
- uint8_t mod, rm, reg;
+ uint8_t mod, rm, reg, evexrm;
dbgprintf(insn, "readModRM()");
@@ -1316,16 +1344,18 @@ static int readModRM(struct InternalInstruction* insn) {
reg |= rFromREX(insn->rexPrefix) << 3;
rm |= bFromREX(insn->rexPrefix) << 3;
- if (insn->vectorExtensionType == TYPE_EVEX) {
+
+ evexrm = 0;
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) {
reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
- rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
}
insn->reg = (Reg)(insn->regBase + reg);
switch (insn->addressSize) {
- case 2:
- insn->eaBaseBase = EA_BASE_BX_SI;
+ case 2: {
+ EABase eaBaseBase = EA_BASE_BX_SI;
switch (mod) {
case 0x0:
@@ -1335,19 +1365,19 @@ static int readModRM(struct InternalInstruction* insn) {
if (readDisplacement(insn))
return -1;
} else {
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
insn->eaDisplacement = EA_DISP_NONE;
}
break;
case 0x1:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
insn->eaDisplacement = EA_DISP_8;
insn->displacementSize = 1;
if (readDisplacement(insn))
return -1;
break;
case 0x2:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
insn->eaDisplacement = EA_DISP_16;
if (readDisplacement(insn))
return -1;
@@ -1359,9 +1389,10 @@ static int readModRM(struct InternalInstruction* insn) {
break;
}
break;
+ }
case 4:
- case 8:
- insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+ case 8: {
+ EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
switch (mod) {
case 0x0:
@@ -1383,7 +1414,7 @@ static int readModRM(struct InternalInstruction* insn) {
return -1;
break;
default:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
break;
}
break;
@@ -1399,7 +1430,7 @@ static int readModRM(struct InternalInstruction* insn) {
return -1;
break;
default:
- insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaBase = (EABase)(eaBaseBase + rm);
if (readDisplacement(insn))
return -1;
break;
@@ -1407,16 +1438,17 @@ static int readModRM(struct InternalInstruction* insn) {
break;
case 0x3:
insn->eaDisplacement = EA_DISP_NONE;
- insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm);
break;
}
break;
+ }
} /* switch (insn->addressSize) */
return 0;
}
-#define GENERIC_FIXUP_FUNC(name, base, prefix) \
+#define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \
static uint16_t name(struct InternalInstruction *insn, \
OperandType type, \
uint8_t index, \
@@ -1430,6 +1462,9 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_Rv: \
return base + index; \
case TYPE_R8: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
if (insn->rexPrefix && \
index >= 4 && index <= 7) { \
return prefix##_SPL + (index - 4); \
@@ -1437,10 +1472,19 @@ static int readModRM(struct InternalInstruction* insn) {
return prefix##_AL + index; \
} \
case TYPE_R16: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
return prefix##_AX + index; \
case TYPE_R32: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
return prefix##_EAX + index; \
case TYPE_R64: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
return prefix##_RAX + index; \
case TYPE_ZMM: \
return prefix##_ZMM0 + index; \
@@ -1449,6 +1493,7 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_XMM: \
return prefix##_XMM0 + index; \
case TYPE_VK: \
+ index &= 0xf; \
if (index > 7) \
*valid = 0; \
return prefix##_K0 + index; \
@@ -1488,8 +1533,8 @@ static int readModRM(struct InternalInstruction* insn) {
* field is valid for the register class; 0 if not.
* @return - The proper value.
*/
-GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG)
-GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f)
+GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf)
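+// The reg field, extended by EVEX.R', can encode 32 values (mask 0x1f); the
+// r/m field encodes at most 16 (mask 0xf). Masked GPR indices above 15 are
+// rejected inside the macro.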
/*
* fixupReg - Consults an operand specifier to determine which of the
@@ -1670,7 +1715,7 @@ static int readVVVV(struct InternalInstruction* insn) {
return -1;
if (insn->mode != MODE_64BIT)
- vvvv &= 0x7;
+ vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later.
insn->vvvv = static_cast<Reg>(vvvv);
return 0;
@@ -1731,10 +1776,10 @@ static int readOperands(struct InternalInstruction* insn) {
// If sibIndex was set to SIB_INDEX_NONE, index offset is 4.
if (insn->sibIndex == SIB_INDEX_NONE)
- insn->sibIndex = (SIBIndex)4;
+ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4);
// If EVEX.v2 is set this is one of the 16-31 registers.
- if (insn->vectorExtensionType == TYPE_EVEX &&
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT &&
v2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
insn->sibIndex = (SIBIndex)(insn->sibIndex + 16);
@@ -1835,6 +1880,8 @@ static int readOperands(struct InternalInstruction* insn) {
needVVVV = 0; /* Mark that we have found a VVVV operand. */
if (!hasVVVV)
return -1;
+ if (insn->mode != MODE_64BIT)
+ insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7);
if (fixupReg(insn, &Op))
return -1;
break;
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index ecd9d8dccafa..3b8a4f732eed 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -16,8 +16,8 @@
#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
-#include "X86DisassemblerDecoderCommon.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/X86DisassemblerDecoderCommon.h"
namespace llvm {
namespace X86Disassembler {
@@ -400,7 +400,7 @@ namespace X86Disassembler {
REGS_BOUND \
ENTRY(RIP)
-/// \brief All possible values of the base field for effective-address
+/// All possible values of the base field for effective-address
/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
/// We distinguish between bases (EA_BASE_*) and registers that just happen
/// to be referred to when Mod == 0b11 (EA_REG_*).
@@ -415,7 +415,7 @@ enum EABase {
EA_max
};
-/// \brief All possible values of the SIB index field.
+/// All possible values of the SIB index field.
/// borrows entries from ALL_EA_BASES with the special case that
/// sib is synonymous with NONE.
/// Vector SIB: index can be XMM or YMM.
@@ -430,7 +430,7 @@ enum SIBIndex {
SIB_INDEX_max
};
-/// \brief All possible values of the SIB base field.
+/// All possible values of the SIB base field.
enum SIBBase {
SIB_BASE_NONE,
#define ENTRY(x) SIB_BASE_##x,
@@ -439,7 +439,7 @@ enum SIBBase {
SIB_BASE_max
};
-/// \brief Possible displacement types for effective-address computations.
+/// Possible displacement types for effective-address computations.
typedef enum {
EA_DISP_NONE,
EA_DISP_8,
@@ -447,7 +447,7 @@ typedef enum {
EA_DISP_32
} EADisplacement;
-/// \brief All possible values of the reg field in the ModR/M byte.
+/// All possible values of the reg field in the ModR/M byte.
enum Reg {
#define ENTRY(x) MODRM_REG_##x,
ALL_REGS
@@ -455,7 +455,7 @@ enum Reg {
MODRM_REG_max
};
-/// \brief All possible segment overrides.
+/// All possible segment overrides.
enum SegmentOverride {
SEG_OVERRIDE_NONE,
SEG_OVERRIDE_CS,
@@ -467,7 +467,7 @@ enum SegmentOverride {
SEG_OVERRIDE_max
};
-/// \brief Possible values for the VEX.m-mmmm field
+/// Possible values for the VEX.m-mmmm field
enum VEXLeadingOpcodeByte {
VEX_LOB_0F = 0x1,
VEX_LOB_0F38 = 0x2,
@@ -480,7 +480,7 @@ enum XOPMapSelect {
XOP_MAP_SELECT_A = 0xA
};
-/// \brief Possible values for the VEX.pp/EVEX.pp field
+/// Possible values for the VEX.pp/EVEX.pp field
enum VEXPrefixCode {
VEX_PREFIX_NONE = 0x0,
VEX_PREFIX_66 = 0x1,
@@ -496,7 +496,7 @@ enum VectorExtensionType {
TYPE_XOP = 0x4
};
-/// \brief Type for the byte reader that the consumer must provide to
+/// Type for the byte reader that the consumer must provide to
/// the decoder. Reads a single byte from the instruction's address space.
/// \param arg A baton that the consumer can associate with any internal
/// state that it needs.
@@ -507,7 +507,7 @@ enum VectorExtensionType {
/// \return -1 if the byte cannot be read for any reason; 0 otherwise.
typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
-/// \brief Type for the logging function that the consumer can provide to
+/// Type for the logging function that the consumer can provide to
/// get debugging output from the decoder.
/// \param arg A baton that the consumer can associate with any internal
/// state that it needs.
@@ -563,6 +563,8 @@ struct InternalInstruction {
bool hasAdSize;
// Operand-size override
bool hasOpSize;
+ // Lock prefix
+ bool hasLockPrefix;
// The repeat prefix if any
uint8_t repeatPrefix;
@@ -627,7 +629,6 @@ struct InternalInstruction {
// These fields determine the allowable values for the ModR/M fields, which
// depend on operand and address widths.
- EABase eaBaseBase;
EABase eaRegBase;
Reg regBase;
@@ -650,7 +651,7 @@ struct InternalInstruction {
ArrayRef<OperandSpecifier> operands;
};
-/// \brief Decode one instruction and store the decoding results in
+/// Decode one instruction and store the decoding results in
/// a buffer provided by the consumer.
/// \param insn The buffer to store the instruction in. Allocated by the
/// consumer.
@@ -674,7 +675,7 @@ int decodeInstruction(InternalInstruction *insn,
uint64_t startLoc,
DisassemblerMode mode);
-/// \brief Print a message to debugs()
+/// Print a message to debugs()
/// \param file The name of the file printing the debug message.
/// \param line The line number that printed the debug message.
/// \param s The message to print.
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 0c99dbbe328b..82e82fe1efd9 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -17,7 +17,6 @@
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
@@ -42,24 +41,11 @@ void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot, const MCSubtargetInfo &STI) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- uint64_t TSFlags = Desc.TSFlags;
-
// If verbose assembly is enabled, we can print some informative comments.
if (CommentStream)
- HasCustomInstComment =
- EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
-
- unsigned Flags = MI->getFlags();
- if (TSFlags & X86II::LOCK)
- OS << "\tlock\t";
- if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK)
- OS << "\tlock\t";
+ HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
- if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\t";
- else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\t";
+ printInstFlags(MI, OS);
// Output CALLpcrel32 as "callq" in 64-bit mode.
// In Intel annotation it's always emitted as "call".
@@ -78,10 +64,8 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
// 0x66 to be interpreted as "data16" by the asm printer.
// Thus we add an adjustment here in order to print the "right" instruction.
else if (MI->getOpcode() == X86::DATA16_PREFIX &&
- (STI.getFeatureBits()[X86::Mode16Bit])) {
- MCInst Data32MI(*MI);
- Data32MI.setOpcode(X86::DATA32_PREFIX);
- printInstruction(&Data32MI, OS);
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
}
// Try to print any aliases first.
else if (!printAliasInstr(MI, OS))
@@ -91,97 +75,6 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
printAnnotation(OS, Annot);
}
-void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid ssecc/avxcc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
- case 8: O << "eq_uq"; break;
- case 9: O << "nge"; break;
- case 0xa: O << "ngt"; break;
- case 0xb: O << "false"; break;
- case 0xc: O << "neq_oq"; break;
- case 0xd: O << "ge"; break;
- case 0xe: O << "gt"; break;
- case 0xf: O << "true"; break;
- case 0x10: O << "eq_os"; break;
- case 0x11: O << "lt_oq"; break;
- case 0x12: O << "le_oq"; break;
- case 0x13: O << "unord_s"; break;
- case 0x14: O << "neq_us"; break;
- case 0x15: O << "nlt_uq"; break;
- case 0x16: O << "nle_uq"; break;
- case 0x17: O << "ord_s"; break;
- case 0x18: O << "eq_us"; break;
- case 0x19: O << "nge_uq"; break;
- case 0x1a: O << "ngt_uq"; break;
- case 0x1b: O << "false_os"; break;
- case 0x1c: O << "neq_os"; break;
- case 0x1d: O << "ge_oq"; break;
- case 0x1e: O << "gt_oq"; break;
- case 0x1f: O << "true_us"; break;
- }
-}
-
-void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid xopcc argument!");
- case 0: O << "lt"; break;
- case 1: O << "le"; break;
- case 2: O << "gt"; break;
- case 3: O << "ge"; break;
- case 4: O << "eq"; break;
- case 5: O << "neq"; break;
- case 6: O << "false"; break;
- case 7: O << "true"; break;
- }
-}
-
-void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
- switch (Imm) {
- case 0: O << "{rn-sae}"; break;
- case 1: O << "{rd-sae}"; break;
- case 2: O << "{ru-sae}"; break;
- case 3: O << "{rz-sae}"; break;
- }
-}
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value (e.g. for jumps and calls). These
-/// print slightly differently than normal immediates. For example, a $ is not
-/// emitted.
-void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
- assert(Op.isExpr() && "unknown pcrel immediate operand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << formatHex((uint64_t)Address);
- } else {
- // Otherwise, just print the expression.
- Op.getExpr()->print(O, &MAI);
- }
- }
-}
-
void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -220,15 +113,11 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
- const MCOperand &SegReg = MI->getOperand(Op + X86::AddrSegmentReg);
O << markup("<mem:");
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op + X86::AddrSegmentReg, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
if (DispSpec.isImm()) {
int64_t DispVal = DispSpec.getImm();
@@ -261,15 +150,10 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- const MCOperand &SegReg = MI->getOperand(Op + 1);
-
O << markup("<mem:");
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op + 1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
O << "(";
printOperand(MI, Op, O);
@@ -292,15 +176,11 @@ void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &DispSpec = MI->getOperand(Op);
- const MCOperand &SegReg = MI->getOperand(Op + 1);
O << markup("<mem:");
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op + 1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
if (DispSpec.isImm()) {
O << formatImm(DispSpec.getImm());
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 946c1c73f088..57422bc9a0b2 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -14,15 +14,15 @@
#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
-#include "llvm/MC/MCInstPrinter.h"
+#include "X86InstPrinterCommon.h"
namespace llvm {
-class X86ATTInstPrinter final : public MCInstPrinter {
+class X86ATTInstPrinter final : public X86InstPrinterCommon {
public:
X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : MCInstPrinter(MAI, MII, MRI) {}
+ : X86InstPrinterCommon(MAI, MII, MRI) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
@@ -38,21 +38,16 @@ public:
void printInstruction(const MCInst *MI, raw_ostream &OS);
static const char *getRegisterName(unsigned RegNo);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS) override;
void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
-
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
index a46f22ff40f5..37bed37b0994 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -13,10 +13,12 @@
//===----------------------------------------------------------------------===//
#include "X86InstComments.h"
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "Utils/X86ShuffleDecode.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -158,6 +160,46 @@ using namespace llvm;
CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+#define CASE_AVX512_FMA(Inst, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, suf)
+
+#define CASE_FMA(Inst, suf) \
+ CASE_AVX512_FMA(Inst, suf) \
+ CASE_AVX_INS_COMMON(Inst, , suf) \
+ CASE_AVX_INS_COMMON(Inst, Y, suf)
+
+#define CASE_FMA_PACKED_REG(Inst) \
+ CASE_FMA(Inst##PD, r) \
+ CASE_FMA(Inst##PS, r)
+
+#define CASE_FMA_PACKED_MEM(Inst) \
+ CASE_FMA(Inst##PD, m) \
+ CASE_FMA(Inst##PS, m) \
+ CASE_AVX512_FMA(Inst##PD, mb) \
+ CASE_AVX512_FMA(Inst##PS, mb)
+
+#define CASE_FMA_SCALAR_REG(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD, , r) \
+ CASE_AVX_INS_COMMON(Inst##SS, , r) \
+ CASE_AVX_INS_COMMON(Inst##SD, , r_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS, , r_Int) \
+ CASE_AVX_INS_COMMON(Inst##SD, Z, r) \
+ CASE_AVX_INS_COMMON(Inst##SS, Z, r) \
+ CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int) \
+ CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int)
+
+#define CASE_FMA_SCALAR_MEM(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD, , m) \
+ CASE_AVX_INS_COMMON(Inst##SS, , m) \
+ CASE_AVX_INS_COMMON(Inst##SD, , m_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS, , m_Int) \
+ CASE_AVX_INS_COMMON(Inst##SD, Z, m) \
+ CASE_AVX_INS_COMMON(Inst##SS, Z, m) \
+ CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \
+ CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int)
+
static unsigned getVectorRegSize(unsigned RegNo) {
if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
return 512;
@@ -171,230 +213,32 @@ static unsigned getVectorRegSize(unsigned RegNo) {
llvm_unreachable("Unknown vector reg!");
}
-static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
- unsigned OperandIndex) {
+static unsigned getRegOperandNumElts(const MCInst *MI, unsigned ScalarSize,
+ unsigned OperandIndex) {
unsigned OpReg = MI->getOperand(OperandIndex).getReg();
- return MVT::getVectorVT(ScalarVT,
- getVectorRegSize(OpReg)/ScalarVT.getSizeInBits());
+ return getVectorRegSize(OpReg) / ScalarSize;
}
-/// \brief Extracts the dst type for a given zero extension instruction.
-static MVT getZeroExtensionResultType(const MCInst *MI) {
- switch (MI->getOpcode()) {
- default:
- llvm_unreachable("Unknown zero extension instruction");
- // zero extension to i16
- CASE_PMOVZX(PMOVZXBW, m)
- CASE_PMOVZX(PMOVZXBW, r)
- return getRegOperandVectorVT(MI, MVT::i16, 0);
- // zero extension to i32
- CASE_PMOVZX(PMOVZXBD, m)
- CASE_PMOVZX(PMOVZXBD, r)
- CASE_PMOVZX(PMOVZXWD, m)
- CASE_PMOVZX(PMOVZXWD, r)
- return getRegOperandVectorVT(MI, MVT::i32, 0);
- // zero extension to i64
- CASE_PMOVZX(PMOVZXBQ, m)
- CASE_PMOVZX(PMOVZXBQ, r)
- CASE_PMOVZX(PMOVZXWQ, m)
- CASE_PMOVZX(PMOVZXWQ, r)
- CASE_PMOVZX(PMOVZXDQ, m)
- CASE_PMOVZX(PMOVZXDQ, r)
- return getRegOperandVectorVT(MI, MVT::i64, 0);
- }
+static const char *getRegName(unsigned Reg) {
+ return X86ATTInstPrinter::getRegisterName(Reg);
}
/// Wraps the destination register name with AVX512 mask/maskz filtering.
static void printMasking(raw_ostream &OS, const MCInst *MI,
- const char *(*getRegName)(unsigned)) {
- bool MaskWithZero = false;
- const char *MaskRegName = nullptr;
+ const MCInstrInfo &MCII) {
+ const MCInstrDesc &Desc = MCII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
- switch (MI->getOpcode()) {
- default:
+ if (!(TSFlags & X86II::EVEX_K))
return;
- CASE_MASKZ_MOVDUP(MOVDDUP, m)
- CASE_MASKZ_MOVDUP(MOVDDUP, r)
- CASE_MASKZ_MOVDUP(MOVSHDUP, m)
- CASE_MASKZ_MOVDUP(MOVSHDUP, r)
- CASE_MASKZ_MOVDUP(MOVSLDUP, m)
- CASE_MASKZ_MOVDUP(MOVSLDUP, r)
- CASE_MASKZ_PMOVZX(PMOVZXBD, m)
- CASE_MASKZ_PMOVZX(PMOVZXBD, r)
- CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXBQ, r)
- CASE_MASKZ_PMOVZX(PMOVZXBW, m)
- CASE_MASKZ_PMOVZX(PMOVZXBW, r)
- CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXDQ, r)
- CASE_MASKZ_PMOVZX(PMOVZXWD, m)
- CASE_MASKZ_PMOVZX(PMOVZXWD, r)
- CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXWQ, r)
- CASE_MASKZ_UNPCK(PUNPCKHBW, m)
- CASE_MASKZ_UNPCK(PUNPCKHBW, r)
- CASE_MASKZ_UNPCK(PUNPCKHWD, m)
- CASE_MASKZ_UNPCK(PUNPCKHWD, r)
- CASE_MASKZ_UNPCK(PUNPCKHDQ, m)
- CASE_MASKZ_UNPCK(PUNPCKHDQ, r)
- CASE_MASKZ_UNPCK(PUNPCKLBW, m)
- CASE_MASKZ_UNPCK(PUNPCKLBW, r)
- CASE_MASKZ_UNPCK(PUNPCKLWD, m)
- CASE_MASKZ_UNPCK(PUNPCKLWD, r)
- CASE_MASKZ_UNPCK(PUNPCKLDQ, m)
- CASE_MASKZ_UNPCK(PUNPCKLDQ, r)
- CASE_MASKZ_UNPCK(UNPCKHPD, m)
- CASE_MASKZ_UNPCK(UNPCKHPD, r)
- CASE_MASKZ_UNPCK(UNPCKHPS, m)
- CASE_MASKZ_UNPCK(UNPCKHPS, r)
- CASE_MASKZ_UNPCK(UNPCKLPD, m)
- CASE_MASKZ_UNPCK(UNPCKLPD, r)
- CASE_MASKZ_UNPCK(UNPCKLPS, m)
- CASE_MASKZ_UNPCK(UNPCKLPS, r)
- CASE_MASKZ_SHUF(PALIGNR, r)
- CASE_MASKZ_SHUF(PALIGNR, m)
- CASE_MASKZ_SHUF(ALIGNQ, r)
- CASE_MASKZ_SHUF(ALIGNQ, m)
- CASE_MASKZ_SHUF(ALIGND, r)
- CASE_MASKZ_SHUF(ALIGND, m)
- CASE_MASKZ_SHUF(SHUFPD, m)
- CASE_MASKZ_SHUF(SHUFPD, r)
- CASE_MASKZ_SHUF(SHUFPS, m)
- CASE_MASKZ_SHUF(SHUFPS, r)
- CASE_MASKZ_VPERMILPI(PERMILPD, m)
- CASE_MASKZ_VPERMILPI(PERMILPD, r)
- CASE_MASKZ_VPERMILPI(PERMILPS, m)
- CASE_MASKZ_VPERMILPI(PERMILPS, r)
- CASE_MASKZ_VPERMILPI(PSHUFD, m)
- CASE_MASKZ_VPERMILPI(PSHUFD, r)
- CASE_MASKZ_VPERMILPI(PSHUFHW, m)
- CASE_MASKZ_VPERMILPI(PSHUFHW, r)
- CASE_MASKZ_VPERMILPI(PSHUFLW, m)
- CASE_MASKZ_VPERMILPI(PSHUFLW, r)
- CASE_MASKZ_VPERM(PERMPD, m)
- CASE_MASKZ_VPERM(PERMPD, r)
- CASE_MASKZ_VPERM(PERMQ, m)
- CASE_MASKZ_VPERM(PERMQ, r)
- CASE_MASKZ_VSHUF(64X2, m)
- CASE_MASKZ_VSHUF(64X2, r)
- CASE_MASKZ_VSHUF(32X4, m)
- CASE_MASKZ_VSHUF(32X4, r)
- CASE_MASKZ_INS_COMMON(BROADCASTF64X2, Z128, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI64X2, Z128, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF64X2, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI64X2, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF64X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI64X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X4, Z256, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X4, Z256, rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, r)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, m)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, m)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, r)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, r)
- CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, m)
- CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, m)
- MaskWithZero = true;
- MaskRegName = getRegName(MI->getOperand(1).getReg());
- break;
- CASE_MASK_MOVDUP(MOVDDUP, m)
- CASE_MASK_MOVDUP(MOVDDUP, r)
- CASE_MASK_MOVDUP(MOVSHDUP, m)
- CASE_MASK_MOVDUP(MOVSHDUP, r)
- CASE_MASK_MOVDUP(MOVSLDUP, m)
- CASE_MASK_MOVDUP(MOVSLDUP, r)
- CASE_MASK_PMOVZX(PMOVZXBD, m)
- CASE_MASK_PMOVZX(PMOVZXBD, r)
- CASE_MASK_PMOVZX(PMOVZXBQ, m)
- CASE_MASK_PMOVZX(PMOVZXBQ, r)
- CASE_MASK_PMOVZX(PMOVZXBW, m)
- CASE_MASK_PMOVZX(PMOVZXBW, r)
- CASE_MASK_PMOVZX(PMOVZXDQ, m)
- CASE_MASK_PMOVZX(PMOVZXDQ, r)
- CASE_MASK_PMOVZX(PMOVZXWD, m)
- CASE_MASK_PMOVZX(PMOVZXWD, r)
- CASE_MASK_PMOVZX(PMOVZXWQ, m)
- CASE_MASK_PMOVZX(PMOVZXWQ, r)
- CASE_MASK_UNPCK(PUNPCKHBW, m)
- CASE_MASK_UNPCK(PUNPCKHBW, r)
- CASE_MASK_UNPCK(PUNPCKHWD, m)
- CASE_MASK_UNPCK(PUNPCKHWD, r)
- CASE_MASK_UNPCK(PUNPCKHDQ, m)
- CASE_MASK_UNPCK(PUNPCKHDQ, r)
- CASE_MASK_UNPCK(PUNPCKLBW, m)
- CASE_MASK_UNPCK(PUNPCKLBW, r)
- CASE_MASK_UNPCK(PUNPCKLWD, m)
- CASE_MASK_UNPCK(PUNPCKLWD, r)
- CASE_MASK_UNPCK(PUNPCKLDQ, m)
- CASE_MASK_UNPCK(PUNPCKLDQ, r)
- CASE_MASK_UNPCK(UNPCKHPD, m)
- CASE_MASK_UNPCK(UNPCKHPD, r)
- CASE_MASK_UNPCK(UNPCKHPS, m)
- CASE_MASK_UNPCK(UNPCKHPS, r)
- CASE_MASK_UNPCK(UNPCKLPD, m)
- CASE_MASK_UNPCK(UNPCKLPD, r)
- CASE_MASK_UNPCK(UNPCKLPS, m)
- CASE_MASK_UNPCK(UNPCKLPS, r)
- CASE_MASK_SHUF(PALIGNR, r)
- CASE_MASK_SHUF(PALIGNR, m)
- CASE_MASK_SHUF(ALIGNQ, r)
- CASE_MASK_SHUF(ALIGNQ, m)
- CASE_MASK_SHUF(ALIGND, r)
- CASE_MASK_SHUF(ALIGND, m)
- CASE_MASK_SHUF(SHUFPD, m)
- CASE_MASK_SHUF(SHUFPD, r)
- CASE_MASK_SHUF(SHUFPS, m)
- CASE_MASK_SHUF(SHUFPS, r)
- CASE_MASK_VPERMILPI(PERMILPD, m)
- CASE_MASK_VPERMILPI(PERMILPD, r)
- CASE_MASK_VPERMILPI(PERMILPS, m)
- CASE_MASK_VPERMILPI(PERMILPS, r)
- CASE_MASK_VPERMILPI(PSHUFD, m)
- CASE_MASK_VPERMILPI(PSHUFD, r)
- CASE_MASK_VPERMILPI(PSHUFHW, m)
- CASE_MASK_VPERMILPI(PSHUFHW, r)
- CASE_MASK_VPERMILPI(PSHUFLW, m)
- CASE_MASK_VPERMILPI(PSHUFLW, r)
- CASE_MASK_VPERM(PERMPD, m)
- CASE_MASK_VPERM(PERMPD, r)
- CASE_MASK_VPERM(PERMQ, m)
- CASE_MASK_VPERM(PERMQ, r)
- CASE_MASK_VSHUF(64X2, m)
- CASE_MASK_VSHUF(64X2, r)
- CASE_MASK_VSHUF(32X4, m)
- CASE_MASK_VSHUF(32X4, r)
- CASE_MASK_INS_COMMON(BROADCASTF64X2, Z128, rm)
- CASE_MASK_INS_COMMON(BROADCASTI64X2, Z128, rm)
- CASE_MASK_INS_COMMON(BROADCASTF64X2, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI64X2, , rm)
- CASE_MASK_INS_COMMON(BROADCASTF64X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI64X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTF32X4, Z256, rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X4, Z256, rm)
- CASE_MASK_INS_COMMON(BROADCASTF32X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm)
- CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, r)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, m)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, m)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, r)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, r)
- CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, m)
- CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, m)
- MaskRegName = getRegName(MI->getOperand(2).getReg());
- break;
- }
+
+ bool MaskWithZero = (TSFlags & X86II::EVEX_Z);
+ unsigned MaskOp = Desc.getNumDefs();
+
+ if (Desc.getOperandConstraint(MaskOp, MCOI::TIED_TO) != -1)
+ ++MaskOp;
+
+ const char *MaskRegName = getRegName(MI->getOperand(MaskOp).getReg());
// MASK: zmmX {%kY}
OS << " {%" << MaskRegName << "}";
@@ -404,6 +248,248 @@ static void printMasking(raw_ostream &OS, const MCInst *MI,
OS << " {z}";
}
+static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) {
+ const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr;
+ unsigned NumOperands = MI->getNumOperands();
+ bool RegForm = false;
+ bool Negate = false;
+ StringRef AccStr = "+";
+
+ // The operands for FMA instructions without rounding fall into two forms.
+ // dest, src1, src2, src3
+ // dest, src1, mask, src2, src3
+ // Where src3 is either a register or 5 memory address operands. So to find
+ // dest and src1 we can index from the front. To find src2 and src3 we can
+ // index from the end by taking into account memory vs register form when
+ // finding src2.
+
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ CASE_FMA_PACKED_REG(FMADD132)
+ CASE_FMA_SCALAR_REG(FMADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD132)
+ CASE_FMA_SCALAR_MEM(FMADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMADD213)
+ CASE_FMA_SCALAR_REG(FMADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD213)
+ CASE_FMA_SCALAR_MEM(FMADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMADD231)
+ CASE_FMA_SCALAR_REG(FMADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD231)
+ CASE_FMA_SCALAR_MEM(FMADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB132)
+ CASE_FMA_SCALAR_REG(FMSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB132)
+ CASE_FMA_SCALAR_MEM(FMSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB213)
+ CASE_FMA_SCALAR_REG(FMSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB213)
+ CASE_FMA_SCALAR_MEM(FMSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB231)
+ CASE_FMA_SCALAR_REG(FMSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB231)
+ CASE_FMA_SCALAR_MEM(FMSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD132)
+ CASE_FMA_SCALAR_REG(FNMADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD132)
+ CASE_FMA_SCALAR_MEM(FNMADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD213)
+ CASE_FMA_SCALAR_REG(FNMADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD213)
+ CASE_FMA_SCALAR_MEM(FNMADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD231)
+ CASE_FMA_SCALAR_REG(FNMADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD231)
+ CASE_FMA_SCALAR_MEM(FNMADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB132)
+ CASE_FMA_SCALAR_REG(FNMSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB132)
+ CASE_FMA_SCALAR_MEM(FNMSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB213)
+ CASE_FMA_SCALAR_REG(FNMSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB213)
+ CASE_FMA_SCALAR_MEM(FNMSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB231)
+ CASE_FMA_SCALAR_REG(FNMSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB231)
+ CASE_FMA_SCALAR_MEM(FNMSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+ }
+
+ const char *DestName = getRegName(MI->getOperand(0).getReg());
+
+ if (!Mul1Name) Mul1Name = "mem";
+ if (!Mul2Name) Mul2Name = "mem";
+ if (!AccName) AccName = "mem";
+
+ OS << DestName << " = ";
+ // TODO: Print masking information?
+
+ if (Negate)
+ OS << '-';
+
+ OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' '
+ << AccName;
+
+ return true;
+}
+
+
//===----------------------------------------------------------------------===//
// Top Level Entrypoint
//===----------------------------------------------------------------------===//
@@ -412,13 +498,16 @@ static void printMasking(raw_ostream &OS, const MCInst *MI,
/// newline terminated strings to the specified string if desired. This
/// information is shown in disassembly dumps when verbose assembly is enabled.
bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
- const char *(*getRegName)(unsigned)) {
+ const MCInstrInfo &MCII) {
// If this is a shuffle operation, the switch should fill in this state.
SmallVector<int, 8> ShuffleMask;
const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
+ if (printFMA3Comments(MI, OS))
+ return true;
+
switch (MI->getOpcode()) {
default:
// Not an instruction for which we can decode comments.
@@ -433,7 +522,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBLENDPDrmi:
case X86::VBLENDPDYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -449,7 +538,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBLENDPSrmi:
case X86::VBLENDPSYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -465,7 +554,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPBLENDWrmi:
case X86::VPBLENDWYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 16, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -479,7 +568,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPBLENDDrmi:
case X86::VPBLENDDYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -524,7 +613,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVHPDZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v2f64, 1, 1, ShuffleMask);
+ DecodeInsertElementMask(2, 1, 1, ShuffleMask);
break;
case X86::MOVHPSrm:
@@ -532,7 +621,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVHPSZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v4f32, 2, 2, ShuffleMask);
+ DecodeInsertElementMask(4, 2, 2, ShuffleMask);
break;
case X86::MOVLPDrm:
@@ -540,7 +629,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVLPDZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v2f64, 0, 1, ShuffleMask);
+ DecodeInsertElementMask(2, 0, 1, ShuffleMask);
break;
case X86::MOVLPSrm:
@@ -548,7 +637,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVLPSZ128rm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeInsertElementMask(MVT::v4f32, 0, 2, ShuffleMask);
+ DecodeInsertElementMask(4, 0, 2, ShuffleMask);
break;
CASE_MOVDUP(MOVSLDUP, r)
@@ -557,7 +646,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVSLDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeMOVSLDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask);
break;
CASE_MOVDUP(MOVSHDUP, r)
@@ -566,7 +655,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVSHDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeMOVSHDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask);
break;
CASE_MOVDUP(MOVDDUP, r)
@@ -575,7 +664,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVDDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ DecodeMOVDDUPMask(getRegOperandNumElts(MI, 64, 0), ShuffleMask);
break;
case X86::PSLLDQri:
@@ -591,7 +680,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSLLDQZrm:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -609,7 +698,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSRLDQZrm:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -623,7 +712,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+ DecodePALIGNRMask(getRegOperandNumElts(MI, 8, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -641,7 +730,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ DecodeVALIGNMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -659,7 +748,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ DecodeVALIGNMask(getRegOperandNumElts(MI, 32, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -671,7 +760,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFD, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -683,7 +772,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFHW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ DecodePSHUFHWMask(getRegOperandNumElts(MI, 16, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -695,7 +784,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFLW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+ DecodePSHUFLWMask(getRegOperandNumElts(MI, 16, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -707,8 +796,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PSHUFWmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(MVT::v4i16,
- MI->getOperand(NumOperands - 1).getImm(),
+ DecodePSHUFMask(4, 16, MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -718,7 +806,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSWAPDrm:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSWAPMask(MVT::v2i32, ShuffleMask);
+ DecodePSWAPMask(2, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHBW, r)
@@ -731,7 +819,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHBWirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHWD, r)
@@ -744,7 +832,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHWDirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHDQ, r)
@@ -757,7 +845,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHDQirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
break;
CASE_UNPCK(PUNPCKHQDQ, r)
@@ -768,7 +856,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(PUNPCKHQDQ, m)
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLBW, r)
@@ -781,7 +869,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLBWirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLWD, r)
@@ -794,7 +882,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLWDirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLDQ, r)
@@ -807,7 +895,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLDQirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
break;
CASE_UNPCK(PUNPCKLQDQ, r)
@@ -818,7 +906,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(PUNPCKLQDQ, m)
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
break;
CASE_SHUF(SHUFPD, rri)
@@ -828,9 +916,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(SHUFPD, rmi)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
- MI->getOperand(NumOperands - 1).getImm(),
- ShuffleMask);
+ DecodeSHUFPMask(getRegOperandNumElts(MI, 64, 0), 64,
+ MI->getOperand(NumOperands - 1).getImm(), ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -842,7 +929,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(SHUFPS, rmi)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ DecodeSHUFPMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
@@ -855,7 +942,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_VSHUF(64X2, m)
- decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 64, 0), 64,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
@@ -868,7 +955,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_VSHUF(32X4, m)
- decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
@@ -881,7 +968,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKLPD, m)
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -892,7 +979,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKLPS, m)
- DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -903,7 +990,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKHPD, m)
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -914,7 +1001,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_UNPCK(UNPCKHPS, m)
- DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -925,7 +1012,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERMILPI(PERMILPS, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+ DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -937,7 +1024,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERMILPI(PERMILPD, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ DecodePSHUFMask(getRegOperandNumElts(MI, 64, 0), 64,
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -952,8 +1039,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPERM2I128rm:
    // For instruction comment purposes, assume the 256-bit vector is v4i64.
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVPERM2X128Mask(MVT::v4i64,
- MI->getOperand(NumOperands - 1).getImm(),
+ DecodeVPERM2X128Mask(4, MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -965,7 +1051,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERM(PERMPD, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -977,7 +1063,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERM(PERMQ, m)
if (MI->getOperand(NumOperands - 1).isImm())
- DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0),
MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -993,7 +1079,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVSDrm:
case X86::VMOVSDrm:
case X86::VMOVSDZrm:
- DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+ DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1007,13 +1093,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVSSrm:
case X86::VMOVSSrm:
case X86::VMOVSSZrm:
- DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+ DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::MOVPQI2QIrr:
case X86::MOVZPQILo2PQIrr:
case X86::VMOVPQI2QIrr:
+ case X86::VMOVPQI2QIZrr:
case X86::VMOVZPQILo2PQIrr:
case X86::VMOVZPQILo2PQIZrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -1022,23 +1109,22 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVQI2PQIrm:
case X86::VMOVQI2PQIrm:
case X86::VMOVQI2PQIZrm:
- DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+ DecodeZeroMoveLowMask(2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::MOVDI2PDIrm:
case X86::VMOVDI2PDIrm:
case X86::VMOVDI2PDIZrm:
- DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+ DecodeZeroMoveLowMask(4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::EXTRQI:
if (MI->getOperand(2).isImm() &&
MI->getOperand(3).isImm())
- DecodeEXTRQIMask(MVT::v16i8, MI->getOperand(2).getImm(),
- MI->getOperand(3).getImm(),
- ShuffleMask);
+ DecodeEXTRQIMask(16, 8, MI->getOperand(2).getImm(),
+ MI->getOperand(3).getImm(), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -1047,9 +1133,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::INSERTQI:
if (MI->getOperand(3).isImm() &&
MI->getOperand(4).isImm())
- DecodeINSERTQIMask(MVT::v16i8, MI->getOperand(3).getImm(),
- MI->getOperand(4).getImm(),
- ShuffleMask);
+ DecodeINSERTQIMask(16, 8, MI->getOperand(3).getImm(),
+ MI->getOperand(4).getImm(), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -1060,39 +1145,39 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBROADCASTI128:
CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm)
CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm)
- DecodeSubVectorBroadcast(MVT::v4f64, MVT::v2f64, ShuffleMask);
+ DecodeSubVectorBroadcast(4, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm)
- DecodeSubVectorBroadcast(MVT::v8f64, MVT::v2f64, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm)
- DecodeSubVectorBroadcast(MVT::v8f64, MVT::v4f64, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm)
CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm)
- DecodeSubVectorBroadcast(MVT::v8f32, MVT::v4f32, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm)
- DecodeSubVectorBroadcast(MVT::v16f32, MVT::v4f32, ShuffleMask);
+ DecodeSubVectorBroadcast(16, 4, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm)
CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm)
- DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask);
+ DecodeSubVectorBroadcast(16, 8, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m)
- DecodeSubVectorBroadcast(MVT::v4f32, MVT::v2f32, ShuffleMask);
+ DecodeSubVectorBroadcast(4, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
@@ -1101,7 +1186,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
- DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask);
+ DecodeSubVectorBroadcast(8, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
@@ -1110,40 +1195,55 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
- DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask);
+ DecodeSubVectorBroadcast(16, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_PMOVZX(PMOVZXBW, r)
- CASE_PMOVZX(PMOVZXBD, r)
- CASE_PMOVZX(PMOVZXBQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
-
CASE_PMOVZX(PMOVZXBW, m)
+ DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBD, m)
+ DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBQ, m)
- DecodeZeroExtendMask(MVT::i8, getZeroExtensionResultType(MI), ShuffleMask);
+ DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_PMOVZX(PMOVZXWD, r)
- CASE_PMOVZX(PMOVZXWQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
-
CASE_PMOVZX(PMOVZXWD, m)
+ DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXWQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXWQ, m)
- DecodeZeroExtendMask(MVT::i16, getZeroExtensionResultType(MI), ShuffleMask);
+ DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_PMOVZX(PMOVZXDQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
-
CASE_PMOVZX(PMOVZXDQ, m)
- DecodeZeroExtendMask(MVT::i32, getZeroExtensionResultType(MI), ShuffleMask);
+ DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
}
@@ -1156,7 +1256,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
if (!DestName) DestName = Src1Name;
if (DestName) {
OS << DestName;
- printMasking(OS, MI, getRegName);
+ printMasking(OS, MI, MCII);
} else
OS << "mem";
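
The operand indexing that printFMA3Comments relies on can be illustrated in isolation. The sketch below is not LLVM code: the FMASources struct and locateFMASources helper are hypothetical names, and the optional mask operand of the masked form is ignored, which is precisely why the source positions are computed from the end of the operand list rather than the front.

// Minimal, self-contained sketch of the index arithmetic described in the
// comment inside printFMA3Comments above (assumed layouts, not MCInst):
//   reg form:  dst, src1, src2, src3
//   mem form:  dst, src1, src2, <5 memory address operands>
#include <cstdio>

struct FMASources {
  unsigned Src1, Src2; // always register operands
  long Src3;           // register operand index, or -1 when src3 is memory
};

static FMASources locateFMASources(unsigned NumOperands, bool RegForm) {
  FMASources S;
  S.Src1 = 1;                                    // tied to the destination
  S.Src2 = NumOperands - (RegForm ? 2 : 6);      // skip the 5 memory operands
  S.Src3 = RegForm ? long(NumOperands - 1) : -1; // memory in the mem form
  return S;
}

int main() {
  FMASources R = locateFMASources(4, /*RegForm=*/true);  // dst, src1, src2, src3
  std::printf("reg form: src1=%u src2=%u src3=%ld\n", R.Src1, R.Src2, R.Src3);
  FMASources M = locateFMASources(8, /*RegForm=*/false); // 3 regs + 5 mem ops
  std::printf("mem form: src1=%u src2=%u src3=%ld\n", M.Src1, M.Src2, M.Src3);
  return 0;
}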
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
index 629c02c95c7f..40dffa5fbb8a 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -15,19 +15,13 @@
#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
-#include "llvm/CodeGen/MachineInstr.h"
-
namespace llvm {
- enum AsmComments {
- // For instr that was compressed from EVEX to VEX.
- AC_EVEX_2_VEX = MachineInstr::TAsmComments
- };
-
class MCInst;
+ class MCInstrInfo;
class raw_ostream;
bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
- const char *(*getRegName)(unsigned));
+ const MCInstrInfo &MCII);
}
#endif
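
The interface change above (passing an MCInstrInfo instead of a getRegName callback) exists so that printMasking can read the EVEX mask bits from the instruction description rather than a per-opcode switch. A rough, self-contained sketch of that check follows; the demo namespace, flag values, and Descriptor struct are stand-ins for illustration only, not the real X86II encoding or MCInstrDesc API.

// Sketch of a table-driven mask check in the spirit of the new printMasking.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

namespace demo {
constexpr uint64_t EVEX_K = 1ull << 0; // instruction takes a mask register
constexpr uint64_t EVEX_Z = 1ull << 1; // zeroing rather than merging

struct Descriptor {
  uint64_t TSFlags;
  unsigned NumDefs;
  bool FirstUseTiedToDef; // stands in for getOperandConstraint(.., TIED_TO)
};

void printMasking(std::ostream &OS, const Descriptor &Desc,
                  const std::vector<std::string> &Operands) {
  if (!(Desc.TSFlags & EVEX_K))
    return;                       // not a masked instruction
  unsigned MaskOp = Desc.NumDefs;
  if (Desc.FirstUseTiedToDef)     // skip a source tied to the destination
    ++MaskOp;
  OS << " {%" << Operands[MaskOp] << "}";
  if (Desc.TSFlags & EVEX_Z)
    OS << " {z}";
}
} // namespace demo

int main() {
  demo::Descriptor D{demo::EVEX_K | demo::EVEX_Z, 1, false};
  demo::printMasking(std::cout, D, {"zmm0", "k1", "zmm2"}); // " {%k1} {z}"
  std::cout << '\n';
  return 0;
}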
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
new file mode 100644
index 000000000000..432cd47ae499
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
@@ -0,0 +1,142 @@
+//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstPrinterCommon.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include <cstdint>
+#include <cassert>
+
+using namespace llvm;
+
+void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+void X86InstPrinterCommon::printXOPCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid xopcc argument!");
+ case 0: O << "lt"; break;
+ case 1: O << "le"; break;
+ case 2: O << "gt"; break;
+ case 3: O << "ge"; break;
+ case 4: O << "eq"; break;
+ case 5: O << "neq"; break;
+ case 6: O << "false"; break;
+ case 7: O << "true"; break;
+ }
+}
+
+void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+ switch (Imm) {
+ case 0: O << "{rn-sae}"; break;
+ case 1: O << "{rd-sae}"; break;
+ case 2: O << "{ru-sae}"; break;
+ case 3: O << "{rz-sae}"; break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). These
+/// print slightly differently from normal immediates; for example, in AT&T
+/// syntax a '$' is not emitted.
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << formatImm(Op.getImm());
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+ O << formatHex((uint64_t)Address);
+ } else {
+ // Otherwise, just print the expression.
+ Op.getExpr()->print(O, &MAI);
+ }
+ }
+}
+
+void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getReg()) {
+ printOperand(MI, OpNo, O);
+ O << ':';
+ }
+}
+
+void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned Flags = MI->getFlags();
+
+ if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
+ O << "\tlock\t";
+
+ if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
+ O << "\tnotrack\t";
+
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ O << "\trepne\t";
+ else if (Flags & X86::IP_HAS_REPEAT)
+ O << "\trep\t";
+}
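
The shared printPCRelImm above distinguishes three cases: a plain immediate, a branch target that folded to a constant expression (printed in hex), and any other expression (printed symbolically). The standalone sketch below mirrors that policy with simplified, hypothetical types in place of MCOperand and MCExpr.

// Illustrative only: the PCRelOperand struct is an assumption, not MC code.
#include <cstdint>
#include <iostream>
#include <string>

struct PCRelOperand {
  bool IsImm;
  int64_t Imm;         // valid when IsImm
  bool IsConstantExpr; // valid when !IsImm
  int64_t Address;     // valid when IsConstantExpr
  std::string Symbol;  // valid otherwise
};

void printPCRelImm(std::ostream &OS, const PCRelOperand &Op) {
  if (Op.IsImm)
    OS << Op.Imm;                                            // displacement
  else if (Op.IsConstantExpr)
    OS << "0x" << std::hex << uint64_t(Op.Address) << std::dec; // hex target
  else
    OS << Op.Symbol;                                         // e.g. a label
}

int main() {
  printPCRelImm(std::cout, {true, 16, false, 0, ""});       std::cout << '\n';
  printPCRelImm(std::cout, {false, 0, true, 0x401000, ""}); std::cout << '\n';
  printPCRelImm(std::cout, {false, 0, false, 0, "callee"}); std::cout << '\n';
  return 0;
}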
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h
new file mode 100644
index 000000000000..f2875e71f22c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h
@@ -0,0 +1,38 @@
+//===-- X86InstPrinterCommon.h - X86 assembly instruction printing --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code common for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class X86InstPrinterCommon : public MCInstPrinter {
+public:
+ using MCInstPrinter::MCInstPrinter;
+
+ virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0;
+ void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+protected:
+ void printInstFlags(const MCInst *MI, raw_ostream &O);
+ void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
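
The new header hangs both syntax printers off a shared base whose helpers call a pure virtual printOperand. A minimal standalone sketch of that shape, with hypothetical class and operand types in place of the MC layer, might look like the following; the same printOptionalSegReg produces "%fs:" or "fs:" depending on which derived printer it runs through.

// Illustrative only: PrinterCommon/ATTPrinter/IntelPrinter are made-up names.
#include <iostream>
#include <string>
#include <vector>

class PrinterCommon {
public:
  virtual ~PrinterCommon() = default;
  virtual void printOperand(const std::vector<std::string> &Ops, unsigned OpNo,
                            std::ostream &OS) = 0;
  // Shared helper: only emits anything when a segment override is present.
  void printOptionalSegReg(const std::vector<std::string> &Ops, unsigned OpNo,
                           std::ostream &OS) {
    if (!Ops[OpNo].empty()) {
      printOperand(Ops, OpNo, OS);
      OS << ':';
    }
  }
};

class ATTPrinter final : public PrinterCommon {
  void printOperand(const std::vector<std::string> &Ops, unsigned OpNo,
                    std::ostream &OS) override {
    OS << '%' << Ops[OpNo]; // AT&T registers carry a % sigil
  }
};

class IntelPrinter final : public PrinterCommon {
  void printOperand(const std::vector<std::string> &Ops, unsigned OpNo,
                    std::ostream &OS) override {
    OS << Ops[OpNo]; // Intel syntax prints the bare register name
  }
};

int main() {
  std::vector<std::string> Ops = {"rax", "fs"}; // operand 1 is the segment
  ATTPrinter ATT;
  IntelPrinter Intel;
  ATT.printOptionalSegReg(Ops, 1, std::cout);   // "%fs:"
  std::cout << '\n';
  Intel.printOptionalSegReg(Ops, 1, std::cout); // "fs:"
  std::cout << '\n';
  return 0;
}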
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 79a8e3049702..044b71564152 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
@@ -37,116 +38,21 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot,
const MCSubtargetInfo &STI) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- uint64_t TSFlags = Desc.TSFlags;
- unsigned Flags = MI->getFlags();
+ printInstFlags(MI, OS);
- if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
- OS << "\tlock\t";
-
- if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\t";
- else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\t";
-
- printInstruction(MI, OS);
+ // In 16-bit mode, print data16 as data32.
+ if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ } else
+ printInstruction(MI, OS);
// Next always print the annotation.
printAnnotation(OS, Annot);
// If verbose assembly is enabled, we can print some informative comments.
if (CommentStream)
- EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
-}
-
-void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid avxcc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
- case 8: O << "eq_uq"; break;
- case 9: O << "nge"; break;
- case 0xa: O << "ngt"; break;
- case 0xb: O << "false"; break;
- case 0xc: O << "neq_oq"; break;
- case 0xd: O << "ge"; break;
- case 0xe: O << "gt"; break;
- case 0xf: O << "true"; break;
- case 0x10: O << "eq_os"; break;
- case 0x11: O << "lt_oq"; break;
- case 0x12: O << "le_oq"; break;
- case 0x13: O << "unord_s"; break;
- case 0x14: O << "neq_us"; break;
- case 0x15: O << "nlt_uq"; break;
- case 0x16: O << "nle_uq"; break;
- case 0x17: O << "ord_s"; break;
- case 0x18: O << "eq_us"; break;
- case 0x19: O << "nge_uq"; break;
- case 0x1a: O << "ngt_uq"; break;
- case 0x1b: O << "false_os"; break;
- case 0x1c: O << "neq_os"; break;
- case 0x1d: O << "ge_oq"; break;
- case 0x1e: O << "gt_oq"; break;
- case 0x1f: O << "true_us"; break;
- }
-}
-
-void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid xopcc argument!");
- case 0: O << "lt"; break;
- case 1: O << "le"; break;
- case 2: O << "gt"; break;
- case 3: O << "ge"; break;
- case 4: O << "eq"; break;
- case 5: O << "neq"; break;
- case 6: O << "false"; break;
- case 7: O << "true"; break;
- }
-}
-
-void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
- switch (Imm) {
- case 0: O << "{rn-sae}"; break;
- case 1: O << "{rd-sae}"; break;
- case 2: O << "{ru-sae}"; break;
- case 3: O << "{rz-sae}"; break;
- }
-}
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value.
-void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
- assert(Op.isExpr() && "unknown pcrel immediate operand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << formatHex((uint64_t)Address);
- }
- else {
- // Otherwise, just print the expression.
- Op.getExpr()->print(O, &MAI);
- }
- }
+ EmitAnyX86InstComments(MI, *CommentStream, MII);
}
void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -169,13 +75,9 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
- const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op+X86::AddrSegmentReg, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
O << '[';
@@ -217,13 +119,8 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- const MCOperand &SegReg = MI->getOperand(Op+1);
-
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op+1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
O << '[';
printOperand(MI, Op, O);
O << ']';
@@ -240,13 +137,9 @@ void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &DispSpec = MI->getOperand(Op);
- const MCOperand &SegReg = MI->getOperand(Op+1);
// If this has a segment register, print it.
- if (SegReg.getReg()) {
- printOperand(MI, Op+1, O);
- O << ':';
- }
+ printOptionalSegReg(MI, Op + 1, O);
O << '[';
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index ace31186a054..3b34a8052bec 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -14,16 +14,16 @@
#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
-#include "llvm/MC/MCInstPrinter.h"
+#include "X86InstPrinterCommon.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
-class X86IntelInstPrinter final : public MCInstPrinter {
+class X86IntelInstPrinter final : public X86InstPrinterCommon {
public:
X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : MCInstPrinter(MAI, MII, MRI) {}
+ : X86InstPrinterCommon(MAI, MII, MRI) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
@@ -33,15 +33,11 @@ public:
void printInstruction(const MCInst *MI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) override;
void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
@@ -49,7 +45,6 @@ public:
}
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "opaque ptr ";
printMemReference(MI, OpNo, O);
}
@@ -90,7 +85,7 @@ public:
printMemReference(MI, OpNo, O);
}
void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "xword ptr ";
+ O << "tbyte ptr ";
printMemReference(MI, OpNo, O);
}
void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 34db5918926b..0e4c4398e49d 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -46,6 +46,7 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
case X86::reloc_signed_4byte:
case X86::reloc_signed_4byte_relax:
case X86::reloc_global_offset_table:
+ case X86::reloc_branch_4byte_pcrel:
case FK_SecRel_4:
case FK_Data_4:
return 2;
@@ -67,19 +68,10 @@ public:
};
class X86AsmBackend : public MCAsmBackend {
- const StringRef CPU;
- bool HasNopl;
- const uint64_t MaxNopLength;
+ const MCSubtargetInfo &STI;
public:
- X86AsmBackend(const Target &T, StringRef CPU)
- : MCAsmBackend(), CPU(CPU),
- MaxNopLength((CPU == "slm" || CPU == "silvermont") ? 7 : 15) {
- HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
- CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
- CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
- CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
- CPU != "c3" && CPU != "c3-2" && CPU != "lakemont" && CPU != "";
- }
+ X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
+ : MCAsmBackend(support::little), STI(STI) {}
unsigned getNumFixupKinds() const override {
return X86::NumTargetFixupKinds;
@@ -95,6 +87,7 @@ public:
{"reloc_signed_4byte_relax", 0, 32, 0},
{"reloc_global_offset_table", 0, 32, 0},
{"reloc_global_offset_table8", 0, 64, 0},
+ {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
};
if (Kind < FirstTargetFixupKind)
@@ -102,12 +95,14 @@ public:
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
+ assert(Infos[Kind - FirstTargetFixupKind].Name && "Empty fixup name!");
return Infos[Kind - FirstTargetFixupKind];
}
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) const override {
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
@@ -123,7 +118,8 @@ public:
Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
}
- bool mayNeedRelaxation(const MCInst &Inst) const override;
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
@@ -132,7 +128,7 @@ public:
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
@@ -270,7 +266,8 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
return getRelaxedOpcodeBranch(Inst, is16BitMode);
}
-bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
// Branches can always be relaxed in either mode.
if (getRelaxedOpcodeBranch(Inst, false) != Inst.getOpcode())
return true;
@@ -318,52 +315,61 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst,
Res.setOpcode(RelaxedOp);
}
-/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// Write a sequence of optimal nops to the output, covering \p Count
/// bytes.
/// \return - true on success, false on failure
-bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- static const uint8_t Nops[10][10] = {
+bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+ static const char Nops[10][11] = {
// nop
- {0x90},
+ "\x90",
// xchg %ax,%ax
- {0x66, 0x90},
+ "\x66\x90",
// nopl (%[re]ax)
- {0x0f, 0x1f, 0x00},
+ "\x0f\x1f\x00",
// nopl 0(%[re]ax)
- {0x0f, 0x1f, 0x40, 0x00},
+ "\x0f\x1f\x40\x00",
// nopl 0(%[re]ax,%[re]ax,1)
- {0x0f, 0x1f, 0x44, 0x00, 0x00},
+ "\x0f\x1f\x44\x00\x00",
// nopw 0(%[re]ax,%[re]ax,1)
- {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+ "\x66\x0f\x1f\x44\x00\x00",
// nopl 0L(%[re]ax)
- {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+ "\x0f\x1f\x80\x00\x00\x00\x00",
// nopl 0L(%[re]ax,%[re]ax,1)
- {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ "\x0f\x1f\x84\x00\x00\x00\x00\x00",
// nopw 0L(%[re]ax,%[re]ax,1)
- {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ "\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
// nopw %cs:0L(%[re]ax,%[re]ax,1)
- {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00",
};
// This CPU doesn't support long nops. If needed add more.
- // FIXME: Can we get this from the subtarget somehow?
// FIXME: We could generated something better than plain 0x90.
- if (!HasNopl) {
+ if (!STI.getFeatureBits()[X86::FeatureNOPL]) {
for (uint64_t i = 0; i < Count; ++i)
- OW->write8(0x90);
+ OS << '\x90';
return true;
}
- // 15 is the longest single nop instruction. Emit as many 15-byte nops as
- // needed, then emit a nop of the remaining length.
+ // 15 bytes is the longest single NOP instruction, but 10 bytes is
+ // commonly the longest that can be efficiently decoded.
+ uint64_t MaxNopLength = 10;
+ if (STI.getFeatureBits()[X86::ProcIntelSLM])
+ MaxNopLength = 7;
+ else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ MaxNopLength = 15;
+ else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ MaxNopLength = 11;
+
+ // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
+ // length.
do {
const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
for (uint8_t i = 0; i < Prefixes; i++)
- OW->write8(0x66);
+ OS << '\x66';
const uint8_t Rest = ThisNopLength - Prefixes;
- for (uint8_t i = 0; i < Rest; i++)
- OW->write8(Nops[Rest - 1][i]);
+ if (Rest != 0)
+ OS.write(Nops[Rest - 1], Rest);
Count -= ThisNopLength;
} while (Count != 0);
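
The loop above pads with the longest NOP this CPU decodes efficiently, and lengths above ten bytes are reached by prepending 0x66 prefixes to the ten-byte form. A small standalone sketch of the same splitting arithmetic (the driver is illustrative only; MaxNopLength would come from the feature bits exactly as above):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Report how 'Count' bytes of padding are split into NOP instructions.
    static void describeNopPadding(uint64_t Count, uint64_t MaxNopLength) {
      if (Count == 0)
        return;
      do {
        uint8_t ThisNopLength = (uint8_t)std::min(Count, MaxNopLength);
        // Anything above 10 bytes is 0x66 prefixes plus the 10-byte NOP.
        uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
        std::printf("%u-byte NOP (%u prefixes + %u-byte encoding)\n",
                    (unsigned)ThisNopLength, (unsigned)Prefixes,
                    (unsigned)(ThisNopLength - Prefixes));
        Count -= ThisNopLength;
      } while (Count != 0);
    }

    // describeNopPadding(23, 15): one 15-byte NOP (5 x 0x66 + the 10-byte form)
    // followed by one 8-byte NOP. With MaxNopLength = 7 (Silvermont) the same
    // 23 bytes become 7 + 7 + 7 + 2.
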
@@ -377,53 +383,57 @@ namespace {
class ELFX86AsmBackend : public X86AsmBackend {
public:
uint8_t OSABI;
- ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : X86AsmBackend(T, CPU), OSABI(OSABI) {}
+ ELFX86AsmBackend(const Target &T, uint8_t OSABI, const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI), OSABI(OSABI) {}
};
class ELFX86_32AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI, ELF::EM_386);
}
};
class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI,
ELF::EM_X86_64);
}
};
class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI,
ELF::EM_IAMCU);
}
};
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
- : ELFX86AsmBackend(T, OSABI, CPU) {}
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ true, OSABI, ELF::EM_X86_64);
}
};
@@ -431,8 +441,9 @@ class WindowsX86AsmBackend : public X86AsmBackend {
bool Is64Bit;
public:
- WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
- : X86AsmBackend(T, CPU)
+ WindowsX86AsmBackend(const Target &T, bool is64Bit,
+ const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI)
, Is64Bit(is64Bit) {
}
@@ -444,9 +455,9 @@ public:
.Default(MCAsmBackend::getFixupKind(Name));
}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86WinCOFFObjectWriter(OS, Is64Bit);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86WinCOFFObjectWriter(Is64Bit);
}
};
@@ -479,7 +490,7 @@ namespace CU {
class DarwinX86AsmBackend : public X86AsmBackend {
const MCRegisterInfo &MRI;
- /// \brief Number of registers that can be saved in a compact unwind encoding.
+ /// Number of registers that can be saved in a compact unwind encoding.
enum { CU_NUM_SAVED_REGS = 6 };
mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
@@ -489,7 +500,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
unsigned MoveInstrSize; ///< Size of a "move" instruction.
unsigned StackDivide; ///< Amount to adjust stack size by.
protected:
- /// \brief Size of a "push" instruction for the given register.
+ /// Size of a "push" instruction for the given register.
unsigned PushInstrSize(unsigned Reg) const {
switch (Reg) {
case X86::EBX:
@@ -510,7 +521,7 @@ protected:
return 1;
}
- /// \brief Implementation of algorithm to generate the compact unwind encoding
+ /// Implementation of algorithm to generate the compact unwind encoding
/// for the CFI instructions.
uint32_t
generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
@@ -655,8 +666,7 @@ protected:
// instruction.
CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
- // Encode any extra stack stack adjustments (done via push
- // instructions).
+ // Encode any extra stack adjustments (done via push instructions).
CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
}
@@ -678,7 +688,7 @@ protected:
}
private:
- /// \brief Get the compact unwind number for a given register. The number
+ /// Get the compact unwind number for a given register. The number
/// corresponds to the enum lists in compact_unwind_encoding.h.
int getCompactUnwindRegNum(unsigned Reg) const {
static const MCPhysReg CU32BitRegs[7] = {
@@ -695,7 +705,7 @@ private:
return -1;
}
- /// \brief Return the registers encoded for a compact encoding with a frame
+ /// Return the registers encoded for a compact encoding with a frame
/// pointer.
uint32_t encodeCompactUnwindRegistersWithFrame() const {
// Encode the registers in the order they were saved --- 3-bits per
@@ -719,7 +729,7 @@ private:
return RegEnc;
}
- /// \brief Create the permutation encoding used with frameless stacks. It is
+ /// Create the permutation encoding used with frameless stacks. It is
/// passed the number of registers to be saved and an array of the registers
/// saved.
uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
@@ -790,9 +800,9 @@ private:
}
public:
- DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
- bool Is64Bit)
- : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI, bool Is64Bit)
+ : X86AsmBackend(T, STI), MRI(MRI), Is64Bit(Is64Bit) {
memset(SavedRegs, 0, sizeof(SavedRegs));
OffsetSize = Is64Bit ? 8 : 4;
MoveInstrSize = Is64Bit ? 3 : 2;
@@ -803,17 +813,17 @@ public:
class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
public:
DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU)
- : DarwinX86AsmBackend(T, MRI, CPU, false) {}
+ const MCSubtargetInfo &STI)
+ : DarwinX86AsmBackend(T, MRI, STI, false) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86MachObjectWriter(/*Is64Bit=*/false,
MachO::CPU_TYPE_I386,
MachO::CPU_SUBTYPE_I386_ALL);
}
- /// \brief Generate the compact unwind encoding for the CFI instructions.
+ /// Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
return generateCompactUnwindEncodingImpl(Instrs);
@@ -824,16 +834,16 @@ class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
const MachO::CPUSubTypeX86 Subtype;
public:
DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU, MachO::CPUSubTypeX86 st)
- : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
+ const MCSubtargetInfo &STI, MachO::CPUSubTypeX86 st)
+ : DarwinX86AsmBackend(T, MRI, STI, true), Subtype(st) {}
- std::unique_ptr<MCObjectWriter>
- createObjectWriter(raw_pwrite_stream &OS) const override {
- return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
- MachO::CPU_TYPE_X86_64, Subtype);
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86MachObjectWriter(/*Is64Bit=*/true, MachO::CPU_TYPE_X86_64,
+ Subtype);
}
- /// \brief Generate the compact unwind encoding for the CFI instructions.
+ /// Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
return generateCompactUnwindEncodingImpl(Instrs);
@@ -847,19 +857,18 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
- StringRef CPU = STI.getCPU();
if (TheTriple.isOSBinFormatMachO())
- return new DarwinX86_32AsmBackend(T, MRI, CPU);
+ return new DarwinX86_32AsmBackend(T, MRI, STI);
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
- return new WindowsX86AsmBackend(T, false, CPU);
+ return new WindowsX86AsmBackend(T, false, STI);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
if (TheTriple.isOSIAMCU())
- return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU);
+ return new ELFX86_IAMCUAsmBackend(T, OSABI, STI);
- return new ELFX86_32AsmBackend(T, OSABI, CPU);
+ return new ELFX86_32AsmBackend(T, OSABI, STI);
}
MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
@@ -867,21 +876,20 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
- StringRef CPU = STI.getCPU();
if (TheTriple.isOSBinFormatMachO()) {
MachO::CPUSubTypeX86 CS =
StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
.Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
.Default(MachO::CPU_SUBTYPE_X86_64_ALL);
- return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
+ return new DarwinX86_64AsmBackend(T, MRI, STI, CS);
}
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
- return new WindowsX86AsmBackend(T, true, CPU);
+ return new WindowsX86AsmBackend(T, true, STI);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
if (TheTriple.getEnvironment() == Triple::GNUX32)
- return new ELFX86_X32AsmBackend(T, OSABI, CPU);
- return new ELFX86_64AsmBackend(T, OSABI, CPU);
+ return new ELFX86_X32AsmBackend(T, OSABI, STI);
+ return new ELFX86_64AsmBackend(T, OSABI, STI);
}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 07cc488d047e..497e29fe628e 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -60,8 +60,9 @@ namespace X86 {
IP_HAS_REPEAT_NE = 4,
IP_HAS_REPEAT = 8,
IP_HAS_LOCK = 16,
- NO_SCHED_INFO = 32 // Don't add sched comment to the current instr because
- // it was already added
+ NO_SCHED_INFO = 32, // Don't add sched comment to the current instr because
+ // it was already added
+ IP_HAS_NOTRACK = 64
};
} // end namespace X86;
@@ -368,15 +369,13 @@ namespace X86II {
// OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
// OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
// 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66
- // prefix in 16-bit mode. OpSizeIgnore means that the instruction may
- // take a optional 0x66 byte but should not emit with one.
+ // prefix in 16-bit mode.
OpSizeShift = 7,
OpSizeMask = 0x3 << OpSizeShift,
OpSizeFixed = 0 << OpSizeShift,
OpSize16 = 1 << OpSizeShift,
OpSize32 = 2 << OpSizeShift,
- OpSizeIgnore = 3 << OpSizeShift,
// AsSize - AdSizeX implies this instruction determines its need of 0x67
// prefix from a normal ModRM memory operand. The other types indicate that
@@ -385,7 +384,7 @@ namespace X86II {
AdSizeShift = OpSizeShift + 2,
AdSizeMask = 0x3 << AdSizeShift,
- AdSizeX = 1 << AdSizeShift,
+ AdSizeX = 0 << AdSizeShift,
AdSize16 = 1 << AdSizeShift,
AdSize32 = 2 << AdSizeShift,
AdSize64 = 3 << AdSizeShift,
@@ -396,21 +395,21 @@ namespace X86II {
// no prefix.
//
OpPrefixShift = AdSizeShift + 2,
- OpPrefixMask = 0x7 << OpPrefixShift,
+ OpPrefixMask = 0x3 << OpPrefixShift,
- // PS, PD - Prefix code for packed single and double precision vector
- // floating point operations performed in the SSE registers.
- PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift,
+ // PD - Prefix code for packed double precision vector floating point
+ // operations performed in the SSE registers.
+ PD = 1 << OpPrefixShift,
// XS, XD - These prefix codes are for single and double precision scalar
// floating point operations performed in the SSE registers.
- XS = 3 << OpPrefixShift, XD = 4 << OpPrefixShift,
+ XS = 2 << OpPrefixShift, XD = 3 << OpPrefixShift,
//===------------------------------------------------------------------===//
// OpMap - This field determines which opcode map this instruction
// belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
//
- OpMapShift = OpPrefixShift + 3,
+ OpMapShift = OpPrefixShift + 2,
OpMapMask = 0x7 << OpMapShift,
// OB - OneByte - Set if this instruction has a one byte opcode.
@@ -432,6 +431,14 @@ namespace X86II {
// XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
XOPA = 6 << OpMapShift,
+ /// ThreeDNow - This indicates that the instruction uses the
+ /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
+ /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+ /// storing a classifier in the imm8 field. To simplify our implementation,
+ /// we handle this by storing the classifier in the opcode field and using
+ /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+ ThreeDNow = 7 << OpMapShift,
+
//===------------------------------------------------------------------===//
// REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
// They are used to specify GPRs and SSE registers, 64-bit operand size,
@@ -561,24 +568,19 @@ namespace X86II {
CD8_Scale_Shift = EVEX_BShift + 1,
CD8_Scale_Mask = 127ULL << CD8_Scale_Shift,
- /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
- /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
- /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
- /// storing a classifier in the imm8 field. To simplify our implementation,
- /// we handle this by storeing the classifier in the opcode field and using
- /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
- Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
- Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift,
-
/// Explicitly specified rounding control
- EVEX_RCShift = Has3DNow0F0FOpcodeShift + 1,
- EVEX_RC = 1ULL << EVEX_RCShift
+ EVEX_RCShift = CD8_Scale_Shift + 7,
+ EVEX_RC = 1ULL << EVEX_RCShift,
+
+ // NOTRACK prefix
+ NoTrackShift = EVEX_RCShift + 1,
+ NOTRACK = 1ULL << NoTrackShift
};
// getBaseOpcodeFor - This function returns the "base" X86 opcode for the
// specified machine instruction.
//
- inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) {
return TSFlags >> X86II::OpcodeShift;
}
@@ -641,30 +643,44 @@ namespace X86II {
}
}
- /// getOperandBias - compute any additional adjustment needed to
- /// the offset to the start of the memory operand
- /// in this instruction.
- /// If this is a two-address instruction,skip one of the register operands.
- /// FIXME: This should be handled during MCInst lowering.
- inline unsigned getOperandBias(const MCInstrDesc& Desc)
- {
+ /// getOperandBias - compute whether all of the def operands are repeated
+ /// in the uses and therefore should be skipped.
+ /// This determines the start of the unique operand list. We need to determine
+ /// if all of the defs have a corresponding tied operand in the uses.
+ /// Unfortunately, the tied operand information is encoded in the uses not
+ /// the defs so we have to use some heuristics to find which operands to
+ /// query.
+ inline unsigned getOperandBias(const MCInstrDesc& Desc) {
+ unsigned NumDefs = Desc.getNumDefs();
unsigned NumOps = Desc.getNumOperands();
- if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
- return 1;
- if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
- // Special case for AVX-512 GATHER with 2 TIED_TO operands
- // Skip the first 2 operands: dst, mask_wb
- return 2;
- if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
- // Special case for GATHER with 2 TIED_TO operands
- // Skip the first 2 operands: dst, mask_wb
- return 2;
- if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
- // SCATTER
- return 1;
- return 0;
+ switch (NumDefs) {
+ default: llvm_unreachable("Unexpected number of defs");
+ case 0:
+ return 0;
+ case 1:
+ // Common two addr case.
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ return 1;
+ // Check for AVX-512 scatter which has a TIED_TO in the second to last
+ // operand.
+ if (NumOps == 8 &&
+ Desc.getOperandConstraint(6, MCOI::TIED_TO) == 0)
+ return 1;
+ return 0;
+ case 2:
+ // XCHG/XADD have two destinations and two sources.
+ if (NumOps >= 4 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ return 2;
+ // Check for gather. AVX-512 has the second tied operand early. AVX2
+ // has it as the last op.
+ if (NumOps == 9 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ (Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1 ||
+ Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1) &&
+ "Instruction with 2 defs isn't gather?")
+ return 2;
+ return 0;
+ }
}
/// getMemoryOperandNo - The function returns the MCInst operand # for the
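
The bias is therefore the index of the first operand that is actually encoded; callers add it to getMemoryOperandNo() before indexing into the MCInst. A hedged usage sketch (the opcode names in the comments are illustrative):

    #include "MCTargetDesc/X86BaseInfo.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/MC/MCInstrInfo.h"

    // Sketch of how an encoder consumes the bias (cf. X86MCCodeEmitter).
    static int memoryOperandIndex(const llvm::MCInstrInfo &MCII,
                                  const llvm::MCInst &MI) {
      const llvm::MCInstrDesc &Desc = MCII.get(MI.getOpcode());
      unsigned CurOp = llvm::X86II::getOperandBias(Desc);
      //   ADD32rr          dst tied to src1           -> bias 1
      //   AVX-512 gather   dst and mask_wb both tied  -> bias 2
      //   MOV32rm          def not tied to any use    -> bias 0
      int MemOp = llvm::X86II::getMemoryOperandNo(Desc.TSFlags);
      return MemOp == -1 ? -1 : MemOp + (int)CurOp;
    }
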
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 4cdbae4d0d96..b724a89f81d2 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -75,6 +75,9 @@ static X86_64RelType getType64(unsigned Kind,
case X86::reloc_riprel_4byte_relax_rex:
case X86::reloc_riprel_4byte_movq_load:
return RT64_32;
+ case X86::reloc_branch_4byte_pcrel:
+ Modifier = MCSymbolRefExpr::VK_PLT;
+ return RT64_32;
case FK_PCRel_2:
case FK_Data_2:
return RT64_16;
@@ -298,9 +301,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
return getRelocType32(Ctx, Modifier, getType32(Type), IsPCRel, Kind);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
- uint8_t OSABI, uint16_t EMachine) {
- auto MOTW = llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
- return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) {
+ return llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index dfdc9ec29aec..3c04b13e002e 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -30,6 +30,7 @@ enum Fixups {
// of the instruction. Used only
// for _GLOBAL_OFFSET_TABLE_.
reloc_global_offset_table8, // 64-bit variant.
+ reloc_branch_4byte_pcrel, // 32-bit PC relative branch.
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 4ddc1f0ba429..f5371db9e77a 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -152,6 +152,8 @@ public:
uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
int MemOperand, const MCInstrDesc &Desc) const;
+
+ bool isPCRel32Branch(const MCInst &MI) const;
};
} // end anonymous namespace
@@ -217,6 +219,8 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
return true;
}
+ if (IndexReg.getReg() == X86::EIZ)
+ return true;
return false;
}
@@ -276,6 +280,22 @@ static bool HasSecRelSymbolRef(const MCExpr *Expr) {
return false;
}
+bool X86MCCodeEmitter::isPCRel32Branch(const MCInst &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) ||
+ getImmFixupKind(Desc.TSFlags) != FK_PCRel_4)
+ return false;
+
+ unsigned CurOp = X86II::getOperandBias(Desc);
+ const MCOperand &Op = MI.getOperand(CurOp);
+ if (!Op.isExpr())
+ return false;
+
+ const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(Op.getExpr());
+ return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None;
+}
+
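
isPCRel32Branch singles out 64-bit direct call/jmp instructions whose target is a bare symbol reference (VK_None); the RawFrm case later in this file then emits their immediate with the new reloc_branch_4byte_pcrel fixup, which the X86ELFObjectWriter hunk above maps to a PLT-flavored 32-bit PC-relative relocation. The effect, summarized in comment form (the assembly is illustrative):

    // call foo         bare MCSymbolRefExpr in 64-bit mode
    //                    -> fixup X86::reloc_branch_4byte_pcrel
    //                    -> ELF: 32-bit PC-relative type with the VK_PLT modifier
    // call foo@PLT     explicit modifier, so not VK_None
    //                    -> ordinary FK_PCRel_4 handling
    // call rax / call [mem] / call with a non-expression operand
    //                    -> different opcodes or non-expr operands, never taken
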
void X86MCCodeEmitter::
EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
@@ -331,8 +351,15 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) ||
FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) ||
- FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex))
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex) ||
+ FixupKind == MCFixupKind(X86::reloc_branch_4byte_pcrel)) {
ImmOffset -= 4;
+ // If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_:
+ // leaq _GLOBAL_OFFSET_TABLE_(%rip), %r15
+ // this needs to be a GOTPC32 relocation.
+ if (StartsWithGlobalOffsetTable(Expr) != GOT_None)
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ }
if (FixupKind == FK_PCRel_2)
ImmOffset -= 2;
if (FixupKind == FK_PCRel_1)
@@ -380,6 +407,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
return X86::reloc_riprel_4byte_movq_load;
case X86::CALL64m:
case X86::JMP64m:
+ case X86::TAILJMPm64:
case X86::TEST64mr:
case X86::ADC64rm:
case X86::ADD64rm:
@@ -450,7 +478,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
}
if (Disp.isImm() && isDisp8(Disp.getImm())) {
- if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ if (Disp.getImm() == 0 && RMfield != 6) {
// There is no displacement; just the register.
EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
return;
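
The guard changes from the 32-bit oriented BaseRegNo != N86::EBP test to RMfield != 6 because this branch is on the 16-bit addressing path: in a 16-bit ModRM byte, mod=00 with r/m=110 encodes a bare disp16 rather than [bp], so [bp] still needs mod=01 with an explicit zero disp8, while every other register combination may drop the displacement entirely. Illustrative ModRM encodings (reg field = 000):

    // mov ax, [si]       ModRM 0x04               (mod=00, r/m=100)
    // mov ax, [bp+si]    ModRM 0x02               (mod=00, r/m=010)
    // mov ax, [bp]       ModRM 0x46, disp8 0x00   (mod=01, r/m=110)
    // mov ax, [0x1234]   ModRM 0x06, disp16       (mod=00, r/m=110)
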
@@ -681,10 +709,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0b10: F3
// 0b11: F2
//
- uint8_t VEX_PP;
+ uint8_t VEX_PP = 0;
switch (TSFlags & X86II::OpPrefixMask) {
- default: llvm_unreachable("Invalid op prefix!");
- case X86II::PS: VEX_PP = 0x0; break; // none
case X86II::PD: VEX_PP = 0x1; break; // 66
case X86II::XS: VEX_PP = 0x2; break; // F3
case X86II::XD: VEX_PP = 0x3; break; // F2
@@ -1115,6 +1141,10 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK)
EmitByte(0xF0, CurByte, OS);
+ // Emit the NOTRACK opcode prefix.
+ if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK)
+ EmitByte(0x3E, CurByte, OS);
+
switch (TSFlags & X86II::OpPrefixMask) {
case X86II::PD: // 66
EmitByte(0x66, CurByte, OS);
@@ -1140,9 +1170,10 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0x0F escape code must be emitted just before the opcode.
switch (TSFlags & X86II::OpMapMask) {
- case X86II::TB: // Two-byte opcode map
- case X86II::T8: // 0F 38
- case X86II::TA: // 0F 3A
+ case X86II::TB: // Two-byte opcode map
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller.
EmitByte(0x0F, CurByte, OS);
break;
}
@@ -1238,7 +1269,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
- if (TSFlags & X86II::Has3DNow0F0FOpcode)
+ if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
BaseOpcode = 0x0F; // Weird 3DNow! encoding.
uint64_t Form = TSFlags & X86II::FormMask;
@@ -1287,9 +1318,18 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
break;
}
- case X86II::RawFrm:
+ case X86II::RawFrm: {
EmitByte(BaseOpcode, CurByte, OS);
+
+ if (!is64BitMode(STI) || !isPCRel32Branch(MI))
+ break;
+
+ const MCOperand &Op = MI.getOperand(CurOp++);
+ EmitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags),
+ MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS,
+ Fixups);
break;
+ }
case X86II::RawFrmMemOffs:
// Emit segment override opcode prefix as needed.
EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
@@ -1523,7 +1563,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
}
- if (TSFlags & X86II::Has3DNow0F0FOpcode)
+ if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
#ifndef NDEBUG
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h
new file mode 100644
index 000000000000..f1438cd24960
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h
@@ -0,0 +1,75 @@
+//=--- X86MCExpr.h - X86 specific MC expression classes ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes X86-specific MCExprs, i.e., registers used for
+// extended variable assignments.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
+
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class X86MCExpr : public MCTargetExpr {
+
+private:
+ const int64_t RegNo; // The X86 register number this expression wraps.
+
+ explicit X86MCExpr(int64_t R) : RegNo(R) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const X86MCExpr *create(int64_t RegNo, MCContext &Ctx) {
+ return new (Ctx) X86MCExpr(RegNo);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getRegNo - Get the register number wrapped by this expression.
+ int64_t getRegNo() const { return RegNo; }
+
+ /// @}
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override {
+ if (MAI->getAssemblerDialect() == 0)
+ OS << '%';
+ OS << X86ATTInstPrinter::getRegisterName(RegNo);
+ }
+
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override {
+ return false;
+ }
+ // Register values should be inlined as they are not valid .set expressions.
+ bool inlineAssignedExpr() const override { return true; }
+ void visitUsedExpr(MCStreamer &Streamer) const override{};
+ MCFragment *findAssociatedFragment() const override { return nullptr; }
+
+ // There are no TLS X86MCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+
+} // end namespace llvm
+
+#endif
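
X86MCExpr lets a register stand in as the value of an assembler symbol assignment (for example an alias defined with "=" or .set), printing itself back with the dialect-appropriate '%' sigil and refusing relocatable evaluation so the value is always inlined. A hedged usage sketch; only create(), getRegNo() and classof() come from the header above, the wrapper functions are illustrative:

    #include "X86MCExpr.h"
    #include "llvm/Support/Casting.h"

    using namespace llvm;

    // Wrap a parsed register so an assignment like "rax_alias = %rax" can
    // carry the register symbolically through the MC layer.
    static const MCExpr *wrapRegister(int64_t RegNo, MCContext &Ctx) {
      return X86MCExpr::create(RegNo, Ctx);
    }

    // Recover the register number on the consuming side.
    static bool registerFromExpr(const MCExpr *E, int64_t &RegNo) {
      if (const auto *RE = dyn_cast<X86MCExpr>(E)) {
        RegNo = RE->getRegNo();
        return true;
      }
      return false;
    }
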
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index cdd43478baed..d030f26d98de 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -14,7 +14,9 @@
#include "X86MCTargetDesc.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86BaseInfo.h"
#include "X86MCAsmInfo.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/MC/MCInstrAnalysis.h"
@@ -37,6 +39,7 @@ using namespace llvm;
#include "X86GenRegisterInfo.inc"
#define GET_INSTRINFO_MC_DESC
+#define GET_GENINSTRINFO_MC_HELPERS
#include "X86GenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
@@ -78,120 +81,120 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
codeview::RegisterId CVReg;
MCPhysReg Reg;
} RegMap[] = {
- { codeview::RegisterId::AL, X86::AL},
- { codeview::RegisterId::CL, X86::CL},
- { codeview::RegisterId::DL, X86::DL},
- { codeview::RegisterId::BL, X86::BL},
- { codeview::RegisterId::AH, X86::AH},
- { codeview::RegisterId::CH, X86::CH},
- { codeview::RegisterId::DH, X86::DH},
- { codeview::RegisterId::BH, X86::BH},
- { codeview::RegisterId::AX, X86::AX},
- { codeview::RegisterId::CX, X86::CX},
- { codeview::RegisterId::DX, X86::DX},
- { codeview::RegisterId::BX, X86::BX},
- { codeview::RegisterId::SP, X86::SP},
- { codeview::RegisterId::BP, X86::BP},
- { codeview::RegisterId::SI, X86::SI},
- { codeview::RegisterId::DI, X86::DI},
- { codeview::RegisterId::EAX, X86::EAX},
- { codeview::RegisterId::ECX, X86::ECX},
- { codeview::RegisterId::EDX, X86::EDX},
- { codeview::RegisterId::EBX, X86::EBX},
- { codeview::RegisterId::ESP, X86::ESP},
- { codeview::RegisterId::EBP, X86::EBP},
- { codeview::RegisterId::ESI, X86::ESI},
- { codeview::RegisterId::EDI, X86::EDI},
-
- { codeview::RegisterId::EFLAGS, X86::EFLAGS},
-
- { codeview::RegisterId::ST0, X86::FP0},
- { codeview::RegisterId::ST1, X86::FP1},
- { codeview::RegisterId::ST2, X86::FP2},
- { codeview::RegisterId::ST3, X86::FP3},
- { codeview::RegisterId::ST4, X86::FP4},
- { codeview::RegisterId::ST5, X86::FP5},
- { codeview::RegisterId::ST6, X86::FP6},
- { codeview::RegisterId::ST7, X86::FP7},
-
- { codeview::RegisterId::XMM0, X86::XMM0},
- { codeview::RegisterId::XMM1, X86::XMM1},
- { codeview::RegisterId::XMM2, X86::XMM2},
- { codeview::RegisterId::XMM3, X86::XMM3},
- { codeview::RegisterId::XMM4, X86::XMM4},
- { codeview::RegisterId::XMM5, X86::XMM5},
- { codeview::RegisterId::XMM6, X86::XMM6},
- { codeview::RegisterId::XMM7, X86::XMM7},
-
- { codeview::RegisterId::XMM8, X86::XMM8},
- { codeview::RegisterId::XMM9, X86::XMM9},
- { codeview::RegisterId::XMM10, X86::XMM10},
- { codeview::RegisterId::XMM11, X86::XMM11},
- { codeview::RegisterId::XMM12, X86::XMM12},
- { codeview::RegisterId::XMM13, X86::XMM13},
- { codeview::RegisterId::XMM14, X86::XMM14},
- { codeview::RegisterId::XMM15, X86::XMM15},
-
- { codeview::RegisterId::SIL, X86::SIL},
- { codeview::RegisterId::DIL, X86::DIL},
- { codeview::RegisterId::BPL, X86::BPL},
- { codeview::RegisterId::SPL, X86::SPL},
- { codeview::RegisterId::RAX, X86::RAX},
- { codeview::RegisterId::RBX, X86::RBX},
- { codeview::RegisterId::RCX, X86::RCX},
- { codeview::RegisterId::RDX, X86::RDX},
- { codeview::RegisterId::RSI, X86::RSI},
- { codeview::RegisterId::RDI, X86::RDI},
- { codeview::RegisterId::RBP, X86::RBP},
- { codeview::RegisterId::RSP, X86::RSP},
- { codeview::RegisterId::R8, X86::R8},
- { codeview::RegisterId::R9, X86::R9},
- { codeview::RegisterId::R10, X86::R10},
- { codeview::RegisterId::R11, X86::R11},
- { codeview::RegisterId::R12, X86::R12},
- { codeview::RegisterId::R13, X86::R13},
- { codeview::RegisterId::R14, X86::R14},
- { codeview::RegisterId::R15, X86::R15},
- { codeview::RegisterId::R8B, X86::R8B},
- { codeview::RegisterId::R9B, X86::R9B},
- { codeview::RegisterId::R10B, X86::R10B},
- { codeview::RegisterId::R11B, X86::R11B},
- { codeview::RegisterId::R12B, X86::R12B},
- { codeview::RegisterId::R13B, X86::R13B},
- { codeview::RegisterId::R14B, X86::R14B},
- { codeview::RegisterId::R15B, X86::R15B},
- { codeview::RegisterId::R8W, X86::R8W},
- { codeview::RegisterId::R9W, X86::R9W},
- { codeview::RegisterId::R10W, X86::R10W},
- { codeview::RegisterId::R11W, X86::R11W},
- { codeview::RegisterId::R12W, X86::R12W},
- { codeview::RegisterId::R13W, X86::R13W},
- { codeview::RegisterId::R14W, X86::R14W},
- { codeview::RegisterId::R15W, X86::R15W},
- { codeview::RegisterId::R8D, X86::R8D},
- { codeview::RegisterId::R9D, X86::R9D},
- { codeview::RegisterId::R10D, X86::R10D},
- { codeview::RegisterId::R11D, X86::R11D},
- { codeview::RegisterId::R12D, X86::R12D},
- { codeview::RegisterId::R13D, X86::R13D},
- { codeview::RegisterId::R14D, X86::R14D},
- { codeview::RegisterId::R15D, X86::R15D},
- { codeview::RegisterId::AMD64_YMM0, X86::YMM0},
- { codeview::RegisterId::AMD64_YMM1, X86::YMM1},
- { codeview::RegisterId::AMD64_YMM2, X86::YMM2},
- { codeview::RegisterId::AMD64_YMM3, X86::YMM3},
- { codeview::RegisterId::AMD64_YMM4, X86::YMM4},
- { codeview::RegisterId::AMD64_YMM5, X86::YMM5},
- { codeview::RegisterId::AMD64_YMM6, X86::YMM6},
- { codeview::RegisterId::AMD64_YMM7, X86::YMM7},
- { codeview::RegisterId::AMD64_YMM8, X86::YMM8},
- { codeview::RegisterId::AMD64_YMM9, X86::YMM9},
- { codeview::RegisterId::AMD64_YMM10, X86::YMM10},
- { codeview::RegisterId::AMD64_YMM11, X86::YMM11},
- { codeview::RegisterId::AMD64_YMM12, X86::YMM12},
- { codeview::RegisterId::AMD64_YMM13, X86::YMM13},
- { codeview::RegisterId::AMD64_YMM14, X86::YMM14},
- { codeview::RegisterId::AMD64_YMM15, X86::YMM15},
+ { codeview::RegisterId::CVRegAL, X86::AL},
+ { codeview::RegisterId::CVRegCL, X86::CL},
+ { codeview::RegisterId::CVRegDL, X86::DL},
+ { codeview::RegisterId::CVRegBL, X86::BL},
+ { codeview::RegisterId::CVRegAH, X86::AH},
+ { codeview::RegisterId::CVRegCH, X86::CH},
+ { codeview::RegisterId::CVRegDH, X86::DH},
+ { codeview::RegisterId::CVRegBH, X86::BH},
+ { codeview::RegisterId::CVRegAX, X86::AX},
+ { codeview::RegisterId::CVRegCX, X86::CX},
+ { codeview::RegisterId::CVRegDX, X86::DX},
+ { codeview::RegisterId::CVRegBX, X86::BX},
+ { codeview::RegisterId::CVRegSP, X86::SP},
+ { codeview::RegisterId::CVRegBP, X86::BP},
+ { codeview::RegisterId::CVRegSI, X86::SI},
+ { codeview::RegisterId::CVRegDI, X86::DI},
+ { codeview::RegisterId::CVRegEAX, X86::EAX},
+ { codeview::RegisterId::CVRegECX, X86::ECX},
+ { codeview::RegisterId::CVRegEDX, X86::EDX},
+ { codeview::RegisterId::CVRegEBX, X86::EBX},
+ { codeview::RegisterId::CVRegESP, X86::ESP},
+ { codeview::RegisterId::CVRegEBP, X86::EBP},
+ { codeview::RegisterId::CVRegESI, X86::ESI},
+ { codeview::RegisterId::CVRegEDI, X86::EDI},
+
+ { codeview::RegisterId::CVRegEFLAGS, X86::EFLAGS},
+
+ { codeview::RegisterId::CVRegST0, X86::FP0},
+ { codeview::RegisterId::CVRegST1, X86::FP1},
+ { codeview::RegisterId::CVRegST2, X86::FP2},
+ { codeview::RegisterId::CVRegST3, X86::FP3},
+ { codeview::RegisterId::CVRegST4, X86::FP4},
+ { codeview::RegisterId::CVRegST5, X86::FP5},
+ { codeview::RegisterId::CVRegST6, X86::FP6},
+ { codeview::RegisterId::CVRegST7, X86::FP7},
+
+ { codeview::RegisterId::CVRegXMM0, X86::XMM0},
+ { codeview::RegisterId::CVRegXMM1, X86::XMM1},
+ { codeview::RegisterId::CVRegXMM2, X86::XMM2},
+ { codeview::RegisterId::CVRegXMM3, X86::XMM3},
+ { codeview::RegisterId::CVRegXMM4, X86::XMM4},
+ { codeview::RegisterId::CVRegXMM5, X86::XMM5},
+ { codeview::RegisterId::CVRegXMM6, X86::XMM6},
+ { codeview::RegisterId::CVRegXMM7, X86::XMM7},
+
+ { codeview::RegisterId::CVRegXMM8, X86::XMM8},
+ { codeview::RegisterId::CVRegXMM9, X86::XMM9},
+ { codeview::RegisterId::CVRegXMM10, X86::XMM10},
+ { codeview::RegisterId::CVRegXMM11, X86::XMM11},
+ { codeview::RegisterId::CVRegXMM12, X86::XMM12},
+ { codeview::RegisterId::CVRegXMM13, X86::XMM13},
+ { codeview::RegisterId::CVRegXMM14, X86::XMM14},
+ { codeview::RegisterId::CVRegXMM15, X86::XMM15},
+
+ { codeview::RegisterId::CVRegSIL, X86::SIL},
+ { codeview::RegisterId::CVRegDIL, X86::DIL},
+ { codeview::RegisterId::CVRegBPL, X86::BPL},
+ { codeview::RegisterId::CVRegSPL, X86::SPL},
+ { codeview::RegisterId::CVRegRAX, X86::RAX},
+ { codeview::RegisterId::CVRegRBX, X86::RBX},
+ { codeview::RegisterId::CVRegRCX, X86::RCX},
+ { codeview::RegisterId::CVRegRDX, X86::RDX},
+ { codeview::RegisterId::CVRegRSI, X86::RSI},
+ { codeview::RegisterId::CVRegRDI, X86::RDI},
+ { codeview::RegisterId::CVRegRBP, X86::RBP},
+ { codeview::RegisterId::CVRegRSP, X86::RSP},
+ { codeview::RegisterId::CVRegR8, X86::R8},
+ { codeview::RegisterId::CVRegR9, X86::R9},
+ { codeview::RegisterId::CVRegR10, X86::R10},
+ { codeview::RegisterId::CVRegR11, X86::R11},
+ { codeview::RegisterId::CVRegR12, X86::R12},
+ { codeview::RegisterId::CVRegR13, X86::R13},
+ { codeview::RegisterId::CVRegR14, X86::R14},
+ { codeview::RegisterId::CVRegR15, X86::R15},
+ { codeview::RegisterId::CVRegR8B, X86::R8B},
+ { codeview::RegisterId::CVRegR9B, X86::R9B},
+ { codeview::RegisterId::CVRegR10B, X86::R10B},
+ { codeview::RegisterId::CVRegR11B, X86::R11B},
+ { codeview::RegisterId::CVRegR12B, X86::R12B},
+ { codeview::RegisterId::CVRegR13B, X86::R13B},
+ { codeview::RegisterId::CVRegR14B, X86::R14B},
+ { codeview::RegisterId::CVRegR15B, X86::R15B},
+ { codeview::RegisterId::CVRegR8W, X86::R8W},
+ { codeview::RegisterId::CVRegR9W, X86::R9W},
+ { codeview::RegisterId::CVRegR10W, X86::R10W},
+ { codeview::RegisterId::CVRegR11W, X86::R11W},
+ { codeview::RegisterId::CVRegR12W, X86::R12W},
+ { codeview::RegisterId::CVRegR13W, X86::R13W},
+ { codeview::RegisterId::CVRegR14W, X86::R14W},
+ { codeview::RegisterId::CVRegR15W, X86::R15W},
+ { codeview::RegisterId::CVRegR8D, X86::R8D},
+ { codeview::RegisterId::CVRegR9D, X86::R9D},
+ { codeview::RegisterId::CVRegR10D, X86::R10D},
+ { codeview::RegisterId::CVRegR11D, X86::R11D},
+ { codeview::RegisterId::CVRegR12D, X86::R12D},
+ { codeview::RegisterId::CVRegR13D, X86::R13D},
+ { codeview::RegisterId::CVRegR14D, X86::R14D},
+ { codeview::RegisterId::CVRegR15D, X86::R15D},
+ { codeview::RegisterId::CVRegAMD64_YMM0, X86::YMM0},
+ { codeview::RegisterId::CVRegAMD64_YMM1, X86::YMM1},
+ { codeview::RegisterId::CVRegAMD64_YMM2, X86::YMM2},
+ { codeview::RegisterId::CVRegAMD64_YMM3, X86::YMM3},
+ { codeview::RegisterId::CVRegAMD64_YMM4, X86::YMM4},
+ { codeview::RegisterId::CVRegAMD64_YMM5, X86::YMM5},
+ { codeview::RegisterId::CVRegAMD64_YMM6, X86::YMM6},
+ { codeview::RegisterId::CVRegAMD64_YMM7, X86::YMM7},
+ { codeview::RegisterId::CVRegAMD64_YMM8, X86::YMM8},
+ { codeview::RegisterId::CVRegAMD64_YMM9, X86::YMM9},
+ { codeview::RegisterId::CVRegAMD64_YMM10, X86::YMM10},
+ { codeview::RegisterId::CVRegAMD64_YMM11, X86::YMM11},
+ { codeview::RegisterId::CVRegAMD64_YMM12, X86::YMM12},
+ { codeview::RegisterId::CVRegAMD64_YMM13, X86::YMM13},
+ { codeview::RegisterId::CVRegAMD64_YMM14, X86::YMM14},
+ { codeview::RegisterId::CVRegAMD64_YMM15, X86::YMM15},
};
for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
@@ -293,8 +296,79 @@ static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
return llvm::createMCRelocationInfo(TheTriple, Ctx);
}
+namespace llvm {
+namespace X86_MC {
+
+class X86MCInstrAnalysis : public MCInstrAnalysis {
+ X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete;
+ X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete;
+ virtual ~X86MCInstrAnalysis() = default;
+
+public:
+ X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}
+
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override;
+};
+
+bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
+ const MCInst &Inst,
+ APInt &Mask) const {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.getNumImplicitDefs();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+
+ bool HasVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::VEX;
+ bool HasEVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX;
+ bool HasXOP = (Desc.TSFlags & X86II::EncodingMask) == X86II::XOP;
+
+ const MCRegisterClass &GR32RC = MRI.getRegClass(X86::GR32RegClassID);
+ const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID);
+ const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // On X86-64, a general purpose integer register is viewed as a 64-bit
+ // register internal to the processor.
+ // An update to the lower 32 bits of a 64 bit integer register is
+ // architecturally defined to zero extend the upper 32 bits.
+ if (GR32RC.contains(RegID))
+ return true;
+
+ // Early exit if this instruction has no vex/evex/xop prefix.
+ if (!HasEVEX && !HasVEX && !HasXOP)
+ return false;
+
+ // All VEX and EVEX encoded instructions are defined to zero the high bits
+ // of the destination register up to VLMAX (i.e. the maximum vector register
+ // width pertaining to the instruction).
+ // We assume the same behavior for XOP instructions too.
+ return VR128XRC.contains(RegID) || VR256XRC.contains(RegID);
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.getImplicitDefs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+}
+
+} // end of namespace X86_MC
+
+} // end of namespace llvm
+
static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
- return new MCInstrAnalysis(Info);
+ return new X86_MC::X86MCInstrAnalysis(Info);
}
// Force static initialization.
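
clearsSuperRegisters reports, one bit per def, whether a write is architecturally defined to clear the rest of the super-register: 32-bit GPR writes zero-extend to 64 bits, and VEX/EVEX (and, per the comment above, by assumption XOP) destinations are zeroed up to VLMAX. A sketch of how a disassembler-style client would consume the mask (the mnemonics in the comments are illustrative):

    #include "llvm/ADT/APInt.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/MC/MCInstrAnalysis.h"
    #include "llvm/MC/MCInstrInfo.h"
    #include "llvm/MC/MCRegisterInfo.h"

    using namespace llvm;

    static void noteClearedSuperRegs(const MCInstrAnalysis &MCIA,
                                     const MCInstrInfo &MCII,
                                     const MCRegisterInfo &MRI,
                                     const MCInst &MI) {
      const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
      unsigned NumBits = Desc.getNumDefs() + Desc.getNumImplicitDefs();
      if (NumBits == 0)
        return;
      // One bit per explicit def followed by one bit per implicit def.
      APInt Mask(NumBits, 0);
      if (!MCIA.clearsSuperRegisters(MRI, MI, Mask))
        return; // e.g. "mov al, 1": no super-register is cleared.
      // e.g. "mov eax, 1" sets bit 0: the upper half of RAX is zeroed;
      // "vxorps xmm0, xmm0, xmm0" sets bit 0: lanes above bit 127 are zeroed.
    }
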
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index d758c0588cb1..595c26d31e3f 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -23,6 +23,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
+class MCObjectTargetWriter;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
@@ -95,25 +96,21 @@ MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &OS,
/// Takes ownership of \p AB and \p CE.
MCStreamer *createX86WinCOFFStreamer(MCContext &C,
std::unique_ptr<MCAsmBackend> &&AB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll,
bool IncrementalLinkerCompatible);
/// Construct an X86 Mach-O object writer.
-std::unique_ptr<MCObjectWriter> createX86MachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectTargetWriter>
+createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype);
/// Construct an X86 ELF object writer.
-std::unique_ptr<MCObjectWriter> createX86ELFObjectWriter(raw_pwrite_stream &OS,
- bool IsELF64,
- uint8_t OSABI,
- uint16_t EMachine);
+std::unique_ptr<MCObjectTargetWriter>
+createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
/// Construct an X86 Win COFF object writer.
-std::unique_ptr<MCObjectWriter>
-createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit);
+std::unique_ptr<MCObjectTargetWriter>
+createX86WinCOFFObjectWriter(bool Is64Bit);
/// Returns the sub or super register of a specific X86 register.
/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
@@ -137,6 +134,7 @@ unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
// Defines symbolic names for the X86 instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_GENINSTRINFO_MC_DECL
#include "X86GenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 965f7de809b3..883278b7bc1f 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -94,6 +94,7 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_signed_4byte:
case X86::reloc_signed_4byte_relax:
+ case X86::reloc_branch_4byte_pcrel:
case FK_Data_4: return 2;
case FK_Data_8: return 3;
}
@@ -597,10 +598,8 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-std::unique_ptr<MCObjectWriter>
-llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType, uint32_t CPUSubtype) {
- return createMachObjectWriter(
- llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
- /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 5139bb46b561..a5e115e5ff4d 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -62,6 +62,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_riprel_4byte_relax:
case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_branch_4byte_pcrel:
return COFF::IMAGE_REL_AMD64_REL32;
case FK_Data_4:
case X86::reloc_signed_4byte:
@@ -105,8 +106,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("Unsupported COFF machine type.");
}
-std::unique_ptr<MCObjectWriter>
-llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit) {
- auto MOTW = llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
- return createWinCOFFObjectWriter(std::move(MOTW), OS);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86WinCOFFObjectWriter(bool Is64Bit) {
+ return llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 5b1357ae4a7b..0085787e576a 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -11,6 +11,7 @@
#include "X86TargetStreamer.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCWin64EH.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
@@ -21,8 +22,9 @@ class X86WinCOFFStreamer : public MCWinCOFFStreamer {
Win64EH::UnwindEmitter EHStreamer;
public:
X86WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
- std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS)
- : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+ std::unique_ptr<MCCodeEmitter> CE,
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
@@ -60,12 +62,12 @@ void X86WinCOFFStreamer::FinishImpl() {
MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
std::unique_ptr<MCAsmBackend> &&AB,
- raw_pwrite_stream &OS,
+ std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&CE,
bool RelaxAll,
bool IncrementalLinkerCompatible) {
X86WinCOFFStreamer *S =
- new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), OS);
+ new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW));
S->getAssembler().setRelaxAll(RelaxAll);
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
diff --git a/contrib/llvm/lib/Target/X86/ShadowCallStack.cpp b/contrib/llvm/lib/Target/X86/ShadowCallStack.cpp
new file mode 100644
index 000000000000..9a39455f9dd5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/ShadowCallStack.cpp
@@ -0,0 +1,326 @@
+//===------- ShadowCallStack.cpp - Shadow Call Stack pass -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ShadowCallStack pass instruments function prologs/epilogs to check that
+// the return address has not been corrupted during the execution of the
+// function. The return address is stored in a 'shadow call stack' addressed
+// using the %gs segment register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeShadowCallStackPass(PassRegistry &);
+}
+
+namespace {
+
+class ShadowCallStack : public MachineFunctionPass {
+public:
+ static char ID;
+
+ ShadowCallStack() : MachineFunctionPass(ID) {
+ initializeShadowCallStackPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ // Do not instrument leaf functions with this many or fewer instructions. The
+ // shadow call stack instrumented prolog/epilog are slightly racy when reading
+ // and checking the saved return address, so it is better not to instrument
+ // functions that have fewer instructions than the instrumented prolog/epilog
+ // themselves.
+ static const size_t SkipLeafInstructions = 3;
+};
+
+char ShadowCallStack::ID = 0;
+} // end anonymous namespace.
+
+static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL);
+static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL,
+ MCPhysReg FreeRegister);
+
+static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB);
+static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB,
+ MCPhysReg FreeRegister);
+// Generate a longer epilog that only uses r10 when a tailcall branches to r11.
+static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB);
+
+// Helper function to add ModR/M references for [Seg: Reg + Offset] memory
+// accesses
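+// (e.g. addSegmentedMem(MIB, X86::GS, X86::R11, 8) appends the five memory
+// operands for a gs:[r11 + 8] access: base r11, scale 1, no index register,
+// displacement 8 and segment gs).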
+static inline const MachineInstrBuilder &
+addSegmentedMem(const MachineInstrBuilder &MIB, MCPhysReg Seg, MCPhysReg Reg,
+ int Offset = 0) {
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset).addReg(Seg);
+}
+
+static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL) {
+ const MCPhysReg ReturnReg = X86::R10;
+ const MCPhysReg OffsetReg = X86::R11;
+
+ auto MBBI = MBB.begin();
+ // mov r10, [rsp]
+ addDirectMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(ReturnReg),
+ X86::RSP);
+ // xor r11, r11
+ BuildMI(MBB, MBBI, DL, TII->get(X86::XOR64rr))
+ .addDef(OffsetReg)
+ .addReg(OffsetReg, RegState::Undef)
+ .addReg(OffsetReg, RegState::Undef);
+ // add QWORD [gs:r11], 8
+ addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::ADD64mi8)), X86::GS,
+ OffsetReg)
+ .addImm(8);
+ // mov r11, [gs:r11]
+ addSegmentedMem(
+ BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(OffsetReg), X86::GS,
+ OffsetReg);
+ // mov [gs:r11], r10
+ addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64mr)), X86::GS,
+ OffsetReg)
+ .addReg(ReturnReg);
+}
+
+static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB, const DebugLoc &DL,
+ MCPhysReg FreeRegister) {
+ // mov REG, [rsp]
+ addDirectMem(BuildMI(MBB, MBB.begin(), DL, TII->get(X86::MOV64rm))
+ .addDef(FreeRegister),
+ X86::RSP);
+}
+
+static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB) {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // xor r11, r11
+ BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
+ .addDef(X86::R11)
+ .addReg(X86::R11, RegState::Undef)
+ .addReg(X86::R11, RegState::Undef);
+ // mov r10, [gs:r11]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R11);
+ // mov r10, [gs:r10]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R10);
+ // sub QWORD [gs:r11], 8
+ // This instruction should not be moved up to avoid a signal race.
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)),
+ X86::GS, X86::R11)
+ .addImm(8);
+ // cmp [rsp], r10
+ addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
+ .addReg(X86::R10);
+ // jne trap
+ BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
+ MBB.addSuccessor(&TrapBB);
+}
+
+static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB,
+ MCPhysReg FreeRegister) {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // cmp [rsp], REG
+ addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
+ .addReg(FreeRegister);
+ // jne trap
+ BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
+ MBB.addSuccessor(&TrapBB);
+}
+
+static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineInstr &MI, MachineBasicBlock &TrapBB) {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // xor r10, r10
+ BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
+ .addDef(X86::R10)
+ .addReg(X86::R10, RegState::Undef)
+ .addReg(X86::R10, RegState::Undef);
+ // mov r10, [gs:r10]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R10);
+ // mov r10, [gs:r10]
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
+ X86::GS, X86::R10);
+ // sub QWORD [gs:0], 8
+ // This instruction should not be moved up to avoid a signal race.
+ addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), X86::GS, 0)
+ .addImm(8);
+ // cmp [rsp], r10
+ addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
+ .addReg(X86::R10);
+ // jne trap
+ BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
+ MBB.addSuccessor(&TrapBB);
+}
+
+bool ShadowCallStack::runOnMachineFunction(MachineFunction &Fn) {
+ if (!Fn.getFunction().hasFnAttribute(Attribute::ShadowCallStack) ||
+ Fn.getFunction().hasFnAttribute(Attribute::Naked))
+ return false;
+
+ if (Fn.empty() || !Fn.getRegInfo().tracksLiveness())
+ return false;
+
+ // FIXME: Skip functions that have r10 or r11 live on entry (r10 can be live
+ // on entry for parameters with the nest attribute.)
+ if (Fn.front().isLiveIn(X86::R10) || Fn.front().isLiveIn(X86::R11))
+ return false;
+
+ // FIXME: Skip functions with conditional and r10 tail calls for now.
+ bool HasReturn = false;
+ for (auto &MBB : Fn) {
+ if (MBB.empty())
+ continue;
+
+ const MachineInstr &MI = MBB.instr_back();
+ if (MI.isReturn())
+ HasReturn = true;
+
+ if (MI.isReturn() && MI.isCall()) {
+ if (MI.findRegisterUseOperand(X86::EFLAGS))
+ return false;
+ // This should only be possible on Windows 64 (see GR64_TC versus
+ // GR64_TCW64.)
+ if (MI.findRegisterUseOperand(X86::R10) ||
+ MI.hasRegisterImplicitUseOperand(X86::R10))
+ return false;
+ }
+ }
+
+ if (!HasReturn)
+ return false;
+
+ // For leaf functions:
+ // 1. Do not instrument very short functions where it would not improve that
+ // function's security.
+ // 2. Detect if there is an unused caller-saved register we can reserve to
+ // hold the return address instead of writing/reading it from the shadow
+ // call stack.
+ MCPhysReg LeafFuncRegister = X86::NoRegister;
+ if (!Fn.getFrameInfo().adjustsStack()) {
+ size_t InstructionCount = 0;
+ std::bitset<X86::NUM_TARGET_REGS> UsedRegs;
+ for (auto &MBB : Fn) {
+ for (auto &LiveIn : MBB.liveins())
+ UsedRegs.set(LiveIn.PhysReg);
+ for (auto &MI : MBB) {
+ if (!MI.isDebugValue() && !MI.isCFIInstruction() && !MI.isLabel())
+ InstructionCount++;
+ for (auto &Op : MI.operands())
+ if (Op.isReg() && Op.isDef())
+ UsedRegs.set(Op.getReg());
+ }
+ }
+
+ if (InstructionCount <= SkipLeafInstructions)
+ return false;
+
+ std::bitset<X86::NUM_TARGET_REGS> CalleeSavedRegs;
+ const MCPhysReg *CSRegs = Fn.getRegInfo().getCalleeSavedRegs();
+ for (size_t i = 0; CSRegs[i]; i++)
+ CalleeSavedRegs.set(CSRegs[i]);
+
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ for (auto &Reg : X86::GR64_NOSPRegClass.getRegisters()) {
+ // FIXME: Optimization opportunity: spill/restore a callee-saved register
+ // if a caller-saved register is unavailable.
+ if (CalleeSavedRegs.test(Reg))
+ continue;
+
+ bool Used = false;
+ for (MCSubRegIterator SR(Reg, TRI, true); SR.isValid(); ++SR)
+ if ((Used = UsedRegs.test(*SR)))
+ break;
+
+ if (!Used) {
+ LeafFuncRegister = Reg;
+ break;
+ }
+ }
+ }
+
+ const bool LeafFuncOptimization = LeafFuncRegister != X86::NoRegister;
+ if (LeafFuncOptimization)
+ // Mark the leaf function register live-in for all MBBs except the entry MBB
+ for (auto I = ++Fn.begin(), E = Fn.end(); I != E; ++I)
+ I->addLiveIn(LeafFuncRegister);
+
+ MachineBasicBlock &MBB = Fn.front();
+ const MachineBasicBlock *NonEmpty = MBB.empty() ? MBB.getFallThrough() : &MBB;
+ const DebugLoc &DL = NonEmpty->front().getDebugLoc();
+
+ const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+ if (LeafFuncOptimization)
+ addPrologLeaf(Fn, TII, MBB, DL, LeafFuncRegister);
+ else
+ addProlog(Fn, TII, MBB, DL);
+
+ MachineBasicBlock *Trap = nullptr;
+ for (auto &MBB : Fn) {
+ if (MBB.empty())
+ continue;
+
+ MachineInstr &MI = MBB.instr_back();
+ if (MI.isReturn()) {
+ if (!Trap) {
+ Trap = Fn.CreateMachineBasicBlock();
+ BuildMI(Trap, MI.getDebugLoc(), TII->get(X86::TRAP));
+ Fn.push_back(Trap);
+ }
+
+ if (LeafFuncOptimization)
+ addEpilogLeaf(TII, MBB, MI, *Trap, LeafFuncRegister);
+ else if (MI.findRegisterUseOperand(X86::R11))
+ addEpilogOnlyR10(TII, MBB, MI, *Trap);
+ else
+ addEpilog(TII, MBB, MI, *Trap);
+ }
+ }
+
+ return true;
+}
+
+INITIALIZE_PASS(ShadowCallStack, "shadow-call-stack", "Shadow Call Stack",
+ false, false)
+
+FunctionPass *llvm::createShadowCallStackPass() {
+ return new ShadowCallStack();
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 8a0fbfb45b22..fe567f4cece8 100644
--- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -14,7 +14,6 @@
#include "X86ShuffleDecode.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/CodeGen/MachineValueType.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
@@ -45,9 +44,8 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
}
-void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
assert((Idx + Len) <= NumElts && "Insertion out of range");
for (unsigned i = 0; i != NumElts; ++i)
@@ -74,41 +72,31 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(NElts + i);
}
-void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
for (int i = 0, e = NumElts / 2; i < e; ++i) {
ShuffleMask.push_back(2 * i);
ShuffleMask.push_back(2 * i);
}
}
-void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
for (int i = 0, e = NumElts / 2; i < e; ++i) {
ShuffleMask.push_back(2 * i + 1);
ShuffleMask.push_back(2 * i + 1);
}
}
-void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned VectorSizeInBits = VT.getSizeInBits();
- unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VectorSizeInBits / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
- unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 2;
for (unsigned l = 0; l < NumElts; l += NumLaneElts)
- for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
- for (unsigned s = 0; s != NumLaneSubElts; s++)
- ShuffleMask.push_back(l + s);
+ for (unsigned i = 0; i < NumLaneElts; ++i)
+ ShuffleMask.push_back(l);
}
-void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned VectorSizeInBits = VT.getSizeInBits();
- unsigned NumElts = VectorSizeInBits / 8;
- unsigned NumLanes = VectorSizeInBits / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+void DecodePSLLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 16;
for (unsigned l = 0; l < NumElts; l += NumLaneElts)
for (unsigned i = 0; i < NumLaneElts; ++i) {
@@ -118,11 +106,9 @@ void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned VectorSizeInBits = VT.getSizeInBits();
- unsigned NumElts = VectorSizeInBits / 8;
- unsigned NumLanes = VectorSizeInBits / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+void DecodePSRLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 16;
for (unsigned l = 0; l < NumElts; l += NumLaneElts)
for (unsigned i = 0; i < NumLaneElts; ++i) {
@@ -133,58 +119,50 @@ void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodePALIGNRMask(MVT VT, unsigned Imm,
+void DecodePALIGNRMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
-
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+ const unsigned NumLaneElts = 16;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
for (unsigned i = 0; i != NumLaneElts; ++i) {
- unsigned Base = i + Offset;
- // if i+offset is out of this lane then we actually need the other source
+ unsigned Base = i + Imm;
+ // if i+imm is out of this lane then we actually need the other source
if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
ShuffleMask.push_back(Base + l);
}
}
}
-void DecodeVALIGNMask(MVT VT, unsigned Imm,
+void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- int NumElts = VT.getVectorNumElements();
// Not all bits of the immediate are used so mask it.
assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2");
Imm = Imm & (NumElts - 1);
- for (int i = 0; i != NumElts; ++i)
+ for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i + Imm);
}
/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned NumLanes = VT.getSizeInBits() / 128;
+void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned Size = NumElts * ScalarBits;
+ unsigned NumLanes = Size / 128;
if (NumLanes == 0) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
- unsigned NewImm = Imm;
+ uint32_t SplatImm = (Imm & 0xff) * 0x01010101;
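+ // e.g. Imm = 0x1B decodes to <3,2,1,0> in each 128-bit lane; splatting the
+ // low byte means Imm no longer needs to be reloaded at each lane boundary.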
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
for (unsigned i = 0; i != NumLaneElts; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + l);
- NewImm /= NumLaneElts;
+ ShuffleMask.push_back(SplatImm % NumLaneElts + l);
+ SplatImm /= NumLaneElts;
}
- if (NumLaneElts == 4) NewImm = Imm; // reload imm
}
}
-void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
for (unsigned l = 0; l != NumElts; l += 8) {
unsigned NewImm = Imm;
for (unsigned i = 0, e = 4; i != e; ++i) {
@@ -197,10 +175,8 @@ void DecodePSHUFHWMask(MVT VT, unsigned Imm,
}
}
-void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
for (unsigned l = 0; l != NumElts; l += 8) {
unsigned NewImm = Imm;
for (unsigned i = 0, e = 4; i != e; ++i) {
@@ -213,8 +189,7 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm,
}
}
-void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumHalfElts = NumElts / 2;
for (unsigned l = 0; l != NumHalfElts; ++l)
@@ -226,11 +201,9 @@ void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
-void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
+void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits,
+ unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumLaneElts = 128 / ScalarBits;
unsigned NewImm = Imm;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
@@ -248,12 +221,11 @@ void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
/// and punpckh*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
+void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLanes = (NumElts * ScalarBits) / 128;
if (NumLanes == 0) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
@@ -268,12 +240,11 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
/// and punpckl*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
+void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLanes = (NumElts * ScalarBits) / 128;
if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
@@ -286,47 +257,44 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
}
/// Decodes a broadcast of the first element of a vector.
-void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = DstVT.getVectorNumElements();
+void DecodeVectorBroadcast(unsigned NumElts,
+ SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.append(NumElts, 0);
}
/// Decodes a broadcast of a subvector to a larger vector type.
-void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
SmallVectorImpl<int> &ShuffleMask) {
- assert(SrcVT.getScalarType() == DstVT.getScalarType() &&
- "Non matching vector element types");
- unsigned NumElts = SrcVT.getVectorNumElements();
- unsigned Scale = DstVT.getSizeInBits() / SrcVT.getSizeInBits();
+ unsigned Scale = DstNumElts / SrcNumElts;
for (unsigned i = 0; i != Scale; ++i)
- for (unsigned j = 0; j != NumElts; ++j)
+ for (unsigned j = 0; j != SrcNumElts; ++j)
ShuffleMask.push_back(j);
}
-/// \brief Decode a shuffle packed values at 128-bit granularity
+/// Decode a shuffle packed values at 128-bit granularity
/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
/// immediate mask into a shuffle mask.
-void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
- SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
- unsigned ControlBitsMask = NumLanes - 1;
- unsigned NumControlBits = NumLanes / 2;
-
- for (unsigned l = 0; l != NumLanes; ++l) {
- unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
+ unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElementsInLane = 128 / ScalarSize;
+ unsigned NumLanes = NumElts / NumElementsInLane;
+
+ for (unsigned l = 0; l != NumElts; l += NumElementsInLane) {
+ unsigned Index = (Imm % NumLanes) * NumElementsInLane;
+ Imm /= NumLanes; // Discard the bits we just used.
// We actually need the other source.
- if (l >= NumLanes / 2)
- LaneMask += NumLanes;
+ if (l >= (NumElts / 2))
+ Index += NumElts;
for (unsigned i = 0; i != NumElementsInLane; ++i)
- ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+ ShuffleMask.push_back(Index + i);
}
}
-void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned HalfSize = VT.getVectorNumElements() / 2;
+ unsigned HalfSize = NumElts / 2;
for (unsigned l = 0; l != 2; ++l) {
unsigned HalfMask = Imm >> (l * 4);
@@ -358,17 +326,13 @@ void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
}
}
-void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- int ElementBits = VT.getScalarSizeInBits();
- int NumElements = VT.getVectorNumElements();
- for (int i = 0; i < NumElements; ++i) {
+void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i < NumElts; ++i) {
// If there are more than 8 elements in the vector, then any immediate blend
- // mask applies to each 128-bit lane. There can never be more than
- // 8 elements in a 128-bit lane with an immediate blend.
- int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
- assert(Bit < 8 &&
- "Immediate blends only operate over 8 elements at a time!");
- ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+ // mask wraps around.
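+ // e.g. for a 16-element VPBLENDW the same bit 0 of Imm selects both
+ // element 0 and element 8.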
+ unsigned Bit = i % 8;
+ ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElts + i : i);
}
}
@@ -412,19 +376,15 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
}
/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
-void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- assert((VT.is256BitVector() || VT.is512BitVector()) &&
- (VT.getScalarSizeInBits() == 64) && "Unexpected vector value type");
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
for (unsigned l = 0; l != NumElts; l += 4)
for (unsigned i = 0; i != 4; ++i)
ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3));
}
-void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
- unsigned NumDstElts = DstVT.getVectorNumElements();
- unsigned SrcScalarBits = SrcScalarVT.getSizeInBits();
- unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
+ unsigned NumDstElts, SmallVectorImpl<int> &Mask) {
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
"Expected zero extension mask to increase scalar size");
@@ -436,27 +396,24 @@ void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT, SmallVectorImpl<int> &Mask
}
}
-void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
+void DecodeZeroMoveLowMask(unsigned NumElts,
+ SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(0);
for (unsigned i = 1; i < NumElts; i++)
ShuffleMask.push_back(SM_SentinelZero);
}
-void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
+ SmallVectorImpl<int> &Mask) {
// First element comes from the first element of second source.
// Remaining elements: Load zero extends / Move copies from first source.
- unsigned NumElts = VT.getVectorNumElements();
Mask.push_back(NumElts);
for (unsigned i = 1; i < NumElts; i++)
Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
}
-void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
+void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask) {
- assert(VT.is128BitVector() && "Expected 128-bit vector");
- unsigned NumElts = VT.getVectorNumElements();
- unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfElts = NumElts / 2;
// Only the bottom 6 bits are valid for each immediate.
@@ -492,11 +449,8 @@ void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
ShuffleMask.push_back(SM_SentinelUndef);
}
-void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
+void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask) {
- assert(VT.is128BitVector() && "Expected 128-bit vector");
- unsigned NumElts = VT.getVectorNumElements();
- unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfElts = NumElts / 2;
// Only the bottom 6 bits are valid for each immediate.
@@ -535,33 +489,32 @@ void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
ShuffleMask.push_back(SM_SentinelUndef);
}
-void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned VecSize = VT.getSizeInBits();
- unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned VecSize = NumElts * ScalarBits;
unsigned NumLanes = VecSize / 128;
- unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
+ unsigned NumEltsPerLane = NumElts / NumLanes;
assert((VecSize == 128 || VecSize == 256 || VecSize == 512) &&
"Unexpected vector size");
- assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+ assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
uint64_t M = RawMask[i];
- M = (EltSize == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
+ M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
ShuffleMask.push_back((int)(LaneOffset + M));
}
}
-void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask) {
- unsigned VecSize = VT.getSizeInBits();
- unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned VecSize = NumElts * ScalarBits;
unsigned NumLanes = VecSize / 128;
- unsigned NumElts = VT.getVectorNumElements();
unsigned NumEltsPerLane = NumElts / NumLanes;
assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size");
- assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+ assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
assert((NumElts == RawMask.size()) && "Unexpected mask size");
for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
@@ -584,7 +537,7 @@ void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
}
int Index = i & ~(NumEltsPerLane - 1);
- if (EltSize == 64)
+ if (ScalarBits == 64)
Index += (Selector >> 1) & 0x1;
else
Index += Selector & 0x3;
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
index 251c9f7558ec..6d13bd58a127 100644
--- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -23,7 +23,6 @@
namespace llvm {
template <typename T> class ArrayRef;
-class MVT;
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
@@ -32,7 +31,7 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
// Insert the bottom Len elements from a second source into a vector starting at
// element Idx.
-void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
@@ -43,58 +42,68 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
/// i.e. <0,2> or <0,1,4,5>
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSLLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSRLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePALIGNRMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodeVALIGNMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufhw.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshuflw.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes a PSWAPD 3DNow! instruction.
-void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for shufp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decodes a broadcast of the first element of a vector.
-void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decodes a broadcast of a subvector to a larger vector type.
-void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a PSHUFB mask from a raw array of constants such as from
@@ -103,18 +112,20 @@ void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a BLEND immediate mask into a shuffle mask.
-void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
-void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a shuffle packed values at 128-bit granularity
/// immediate mask into a shuffle mask.
-void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
- SmallVectorImpl<int> &ShuffleMask);
+void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
+ unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for VPERMQ/VPERMPD.
-void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPPERM mask from a raw array of constants such as from
/// BUILD_VECTOR.
@@ -124,30 +135,33 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a zero extension instruction as a shuffle mask.
-void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT,
+void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
+ unsigned NumDstElts,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a move lower and zero upper instruction as a shuffle mask.
-void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decode a scalar float move instruction as a shuffle mask.
-void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a SSE4A EXTRQ instruction as a shuffle mask.
-void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
+void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a SSE4A INSERTQ instruction as a shuffle mask.
-void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
+void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
-void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
-void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
+ ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
index 642dda8f4225..73bb0f2af285 100644
--- a/contrib/llvm/lib/Target/X86/X86.h
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -50,6 +50,15 @@ FunctionPass *createX86FloatingPointStackifierPass();
/// transition penalty between functions encoded with AVX and SSE.
FunctionPass *createX86IssueVZeroUpperPass();
+/// This pass instruments the function prolog to save the return address to a
+/// 'shadow call stack' and the function epilog to check that the return address
+/// did not change during function execution.
+FunctionPass *createShadowCallStackPass();
+
+/// This pass inserts ENDBR instructions before indirect jump/call
+/// destinations as part of the CET IBT mechanism.
+FunctionPass *createX86IndirectBranchTrackingPass();
+
/// Return a pass that pads short functions with NOOPs.
/// This will prevent a stall when returning on the Atom.
FunctionPass *createX86PadShortFunctions();
@@ -66,6 +75,9 @@ FunctionPass *createX86OptimizeLEAs();
/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
FunctionPass *createX86FixupSetCC();
+/// Return a pass that avoids creating store-forwarding block issues in the hardware.
+FunctionPass *createX86AvoidStoreForwardingBlocks();
+
/// Return a pass that lowers EFLAGS copy pseudo instructions.
FunctionPass *createX86FlagsCopyLoweringPass();
@@ -115,6 +127,8 @@ InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
void initializeEvexToVexInstPassPass(PassRegistry &);
+FunctionPass *createX86SpeculativeLoadHardeningPass();
+
} // End llvm namespace
#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
index cc4c8823c3da..63c2dc4da6cc 100644
--- a/contrib/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -34,6 +34,9 @@ def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
"Enable X87 float instructions">;
+def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
+ "Enable NOPL instruction">;
+
def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
"Enable conditional move instructions">;
@@ -215,8 +218,6 @@ def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
[FeatureSSE2]>;
def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
"Support CET Shadow-Stack instructions">;
-def FeatureIBT : SubtargetFeature<"ibt", "HasIBT", "true",
- "Support CET Indirect-Branch-Tracking instructions">;
def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
@@ -227,6 +228,10 @@ def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
"Enable Cache Line Zero">;
+def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
+ "Enable Cache Demote">;
+def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
+ "Support ptwrite instruction">;
def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
"Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
@@ -240,12 +245,20 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
+def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true",
+ "Invalidate Process-Context Identifier">;
def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
"Enable Software Guard Extensions">;
def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
"Flush A Cache Line Optimized">;
def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
"Cache Line Write Back">;
+def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true",
+ "Write Back No Invalidate">;
+def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
+ "Support RDPID instructions">;
+def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
+ "Wait and pause enhancements">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -263,6 +276,14 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
+def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
+ "HasPOPCNTFalseDeps", "true",
+ "POPCNT has a false dependency on dest register">;
+def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
+ "HasLZCNTFalseDeps", "true",
+ "LZCNT/TZCNT have a false dependency on dest register">;
+def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
+ "platform configuration instruction">;
// On recent X86 (port-bound) processors, it's preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
def FeatureFastVariableShuffle
@@ -294,8 +315,16 @@ def FeatureFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
-
-
+// If the target can efficiently decode NOPs up to 11 bytes in length.
+def FeatureFast11ByteNOP
+ : SubtargetFeature<
+ "fast-11bytenop", "HasFast11ByteNOP", "true",
+ "Target can quickly decode up to 11 byte NOPs">;
+// If the target can efficiently decode NOPs up to 15 bytes in length.
+def FeatureFast15ByteNOP
+ : SubtargetFeature<
+ "fast-15bytenop", "HasFast15ByteNOP", "true",
+ "Target can quickly decode up to 15 byte NOPs">;
// Sandy Bridge and newer processors can use SHLD with the same source on both
// inputs to implement rotate to avoid the partial flag update of the normal
// rotate instructions.
@@ -329,6 +358,10 @@ def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast.">;
+def FeaturePrefer256Bit
+ : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
+ "Prefer 256-bit AVX instructions">;
+
// Enable mitigation of some aspects of speculative execution related
// vulnerabilities by removing speculatable indirect branches. This disables
// jump-table formation, rewrites explicit `indirectbr` instructions into
@@ -350,6 +383,12 @@ def FeatureRetpolineExternalThunk
"Enable retpoline, but with an externally provided thunk.",
[FeatureRetpoline]>;
+// Direct Move instructions.
+def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
+ "Support movdiri instruction">;
+def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
+ "Support movdir64b instruction">;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -363,6 +402,7 @@ include "X86RegisterBanks.td"
include "X86Schedule.td"
include "X86InstrInfo.td"
+include "X86SchedPredicates.td"
def X86InstrInfo : InstrInfo;
@@ -386,6 +426,10 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
"Intel Silvermont processors">;
def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
"Intel Goldmont processors">;
+def ProcIntelGLP : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP",
+ "Intel Goldmont Plus processors">;
+def ProcIntelTRM : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM",
+ "Intel Tremont processors">;
def ProcIntelHSW : SubtargetFeature<"haswell", "X86ProcFamily",
"IntelHaswell", "Intel Haswell processors">;
def ProcIntelBDW : SubtargetFeature<"broadwell", "X86ProcFamily",
@@ -398,8 +442,10 @@ def ProcIntelSKX : SubtargetFeature<"skx", "X86ProcFamily",
"IntelSKX", "Intel Skylake Server processors">;
def ProcIntelCNL : SubtargetFeature<"cannonlake", "X86ProcFamily",
"IntelCannonlake", "Intel Cannonlake processors">;
-def ProcIntelICL : SubtargetFeature<"icelake", "X86ProcFamily",
- "IntelIcelake", "Intel Icelake processors">;
+def ProcIntelICL : SubtargetFeature<"icelake-client", "X86ProcFamily",
+ "IntelIcelakeClient", "Intel Icelake processors">;
+def ProcIntelICX : SubtargetFeature<"icelake-server", "X86ProcFamily",
+ "IntelIcelakeServer", "Intel Icelake Server processors">;
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
@@ -411,16 +457,16 @@ def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-foreach P = ["i686", "pentiumpro"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
-}
+def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV,
+ FeatureNOPL]>;
def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureCMOV, FeatureFXSR]>;
+ FeatureCMOV, FeatureFXSR, FeatureNOPL]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
- FeatureFXSR]>;
+ FeatureFXSR, FeatureNOPL]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -435,12 +481,12 @@ foreach P = ["pentium3", "pentium3m"] in {
def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR]>;
+ FeatureSSE2, FeatureFXSR, FeatureNOPL]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR]>;
+ FeatureSSE2, FeatureFXSR, FeatureNOPL]>;
}
// Intel Quark.
@@ -449,18 +495,19 @@ def : Proc<"lakemont", []>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR]>;
+ FeatureFXSR, FeatureNOPL]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR]>;
+ FeatureFXSR, FeatureNOPL]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
FeatureSSE3,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B
]>;
@@ -471,6 +518,7 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion
@@ -481,6 +529,7 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureMMX,
FeatureSSE41,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion
@@ -494,6 +543,7 @@ class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeatureLEAForSP,
@@ -513,6 +563,7 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeaturePOPCNT,
@@ -524,17 +575,29 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureSlowPMULLD,
- FeatureLAHFSAHF
+ FeatureRDRAND,
+ FeatureLAHFSAHF,
+ FeaturePOPCNTFalseDeps
]>;
def : SilvermontProc<"silvermont">;
def : SilvermontProc<"slm">; // Legacy alias.
-class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
- ProcIntelGLM,
+class ProcessorFeatures<list<SubtargetFeature> Inherited,
+ list<SubtargetFeature> NewFeatures> {
+ list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
+}
+
+class ProcModel<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> ProcFeatures,
+ list<SubtargetFeature> OtherFeatures> :
+ ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
+
+def GLMFeatures : ProcessorFeatures<[], [
FeatureX87,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeaturePOPCNT,
@@ -556,14 +619,44 @@ class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureCLFLUSHOPT,
FeatureFSGSBase
]>;
+
+class GoldmontProc<string Name> : ProcModel<Name, SLMModel,
+ GLMFeatures.Value, [
+ ProcIntelGLM,
+ FeaturePOPCNTFalseDeps
+]>;
def : GoldmontProc<"goldmont">;
+def GLPFeatures : ProcessorFeatures<GLMFeatures.Value, [
+ FeaturePTWRITE,
+ FeatureRDPID,
+ FeatureSGX
+]>;
+
+class GoldmontPlusProc<string Name> : ProcModel<Name, SLMModel,
+ GLPFeatures.Value, [
+ ProcIntelGLP
+]>;
+def : GoldmontPlusProc<"goldmont-plus">;
+
+class TremontProc<string Name> : ProcModel<Name, SLMModel,
+ GLPFeatures.Value, [
+ ProcIntelTRM,
+ FeatureCLDEMOTE,
+ FeatureGFNI,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureWAITPKG
+]>;
+def : TremontProc<"tremont">;
+
// "Arrandale" along with corei3 and corei5
class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureX87,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureLAHFSAHF,
@@ -579,6 +672,7 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
@@ -588,16 +682,6 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
]>;
def : WestmereProc<"westmere">;
-class ProcessorFeatures<list<SubtargetFeature> Inherited,
- list<SubtargetFeature> NewFeatures> {
- list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
-}
-
-class ProcModel<string Name, SchedMachineModel Model,
- list<SubtargetFeature> ProcFeatures,
- list<SubtargetFeature> OtherFeatures> :
- ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
-
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
def SNBFeatures : ProcessorFeatures<[], [
@@ -605,6 +689,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
@@ -622,7 +707,8 @@ def SNBFeatures : ProcessorFeatures<[], [
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
SNBFeatures.Value, [
- FeatureSlowUAMem32
+ FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps
]>;
def : SandyBridgeProc<"sandybridge">;
def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
@@ -635,7 +721,8 @@ def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
IVBFeatures.Value, [
- FeatureSlowUAMem32
+ FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps
]>;
def : IvyBridgeProc<"ivybridge">;
def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
@@ -646,6 +733,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureBMI2,
FeatureERMSB,
FeatureFMA,
+ FeatureINVPCID,
FeatureLZCNT,
FeatureMOVBE,
FeatureFastVariableShuffle
@@ -653,7 +741,9 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
HSWFeatures.Value, [
- ProcIntelHSW
+ ProcIntelHSW,
+ FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps
]>;
def : HaswellProc<"haswell">;
def : HaswellProc<"core-avx2">; // Legacy alias.
@@ -665,7 +755,9 @@ def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
]>;
class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
BDWFeatures.Value, [
- ProcIntelBDW
+ ProcIntelBDW,
+ FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps
]>;
def : BroadwellProc<"broadwell">;
@@ -674,7 +766,6 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureRTM,
FeatureXSAVEC,
FeatureXSAVES,
- FeatureSGX,
FeatureCLFLUSHOPT,
FeatureFastVectorFSQRT
]>;
@@ -682,7 +773,9 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
SKLFeatures.Value, [
ProcIntelSKL,
- FeatureHasFastGather
+ FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps,
+ FeatureSGX
]>;
def : SkylakeClientProc<"skylake">;
@@ -735,7 +828,8 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
SKXFeatures.Value, [
ProcIntelSKX,
- FeatureHasFastGather
+ FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps
]>;
def : SkylakeServerProc<"skylake-avx512">;
def : SkylakeServerProc<"skx">; // Legacy alias.
@@ -749,7 +843,8 @@ def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [
FeaturePKU,
FeatureVBMI,
FeatureIFMA,
- FeatureSHA
+ FeatureSHA,
+ FeatureSGX
]>;
class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
@@ -767,15 +862,25 @@ def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
FeatureVPCLMULQDQ,
FeatureVPOPCNTDQ,
FeatureGFNI,
- FeatureCLWB
+ FeatureCLWB,
+ FeatureRDPID
]>;
-class IcelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
- ICLFeatures.Value, [
+class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel,
+ ICLFeatures.Value, [
ProcIntelICL,
FeatureHasFastGather
]>;
-def : IcelakeProc<"icelake">;
+def : IcelakeClientProc<"icelake-client">;
+
+class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
+ ICLFeatures.Value, [
+ ProcIntelICX,
+ FeaturePCONFIG,
+ FeatureWBNOINVD,
+ FeatureHasFastGather
+]>;
+def : IcelakeServerProc<"icelake-server">;
// AMD CPUs.
@@ -784,27 +889,28 @@ def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
foreach P = ["athlon", "athlon-tbird"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, FeatureSlowSHLD]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
+ FeatureNOPL, FeatureSlowSHLD]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
- Feature3DNowA, FeatureFXSR, FeatureSlowSHLD]>;
+ Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureSlowSHLD]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, Feature64Bit, FeatureSlowSHLD]>;
+ FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlowSHLD]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowSHLD]>;
+ FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureSlowSHLD]>;
}
foreach P = ["amdfam10", "barcelona"] in {
def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR,
- FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
+ FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
FeatureSlowSHLD, FeatureLAHFSAHF]>;
}
@@ -815,12 +921,14 @@ def : Proc<"btver1", [
FeatureSSSE3,
FeatureSSE4A,
FeatureFXSR,
+ FeatureNOPL,
FeatureCMPXCHG16B,
FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFast15ByteNOP
]>;
// Jaguar
@@ -829,6 +937,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureCMPXCHG16B,
FeaturePRFCHW,
@@ -844,6 +953,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast15ByteNOP,
FeatureFastPartialYMMorZMMWrite
]>;
@@ -859,6 +969,7 @@ def : Proc<"bdver1", [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureLZCNT,
FeaturePOPCNT,
@@ -866,6 +977,7 @@ def : Proc<"bdver1", [
FeatureLWP,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
// Piledriver
@@ -880,6 +992,7 @@ def : Proc<"bdver2", [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureF16C,
FeatureLZCNT,
@@ -891,6 +1004,7 @@ def : Proc<"bdver2", [
FeatureFMA,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
@@ -906,6 +1020,7 @@ def : Proc<"bdver3", [
FeatureMMX,
FeatureAVX,
FeatureFXSR,
+ FeatureNOPL,
FeatureSSE4A,
FeatureF16C,
FeatureLZCNT,
@@ -919,6 +1034,7 @@ def : Proc<"bdver3", [
FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
@@ -928,6 +1044,7 @@ def : Proc<"bdver4", [
FeatureMMX,
FeatureAVX2,
FeatureFXSR,
+ FeatureNOPL,
FeatureXOP,
FeatureFMA4,
FeatureCMPXCHG16B,
@@ -947,6 +1064,7 @@ def : Proc<"bdver4", [
FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMWAITX,
FeatureMacroFusion
]>;
@@ -965,9 +1083,11 @@ def: ProcessorModel<"znver1", Znver1Model, [
FeatureFMA,
FeatureFSGSBase,
FeatureFXSR,
+ FeatureNOPL,
FeatureFastLZCNT,
FeatureLAHFSAHF,
FeatureLZCNT,
+ FeatureFast15ByteNOP,
FeatureMacroFusion,
FeatureMMX,
FeatureMOVBE,
@@ -1009,6 +1129,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureMMX,
FeatureSSE2,
FeatureFXSR,
+ FeatureNOPL,
Feature64Bit,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
@@ -1072,4 +1193,11 @@ def X86 : Target {
let InstructionSet = X86InstrInfo;
let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+ let AllowRegisterRenaming = 1;
}
+
+//===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "X86PfmCounters.td"
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 2a501efbc1bf..7d8f7b9dfe46 100644
--- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -19,9 +19,9 @@
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Mangler.h"
@@ -31,11 +31,13 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -372,6 +374,12 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
unsigned Reg = MO.getReg();
bool EmitPercent = true;
+ if (!X86::GR8RegClass.contains(Reg) &&
+ !X86::GR16RegClass.contains(Reg) &&
+ !X86::GR32RegClass.contains(Reg) &&
+ !X86::GR64RegClass.contains(Reg))
+ return true;
+
switch (Mode) {
default: return true; // Unknown mode.
case 'b': // Print QImode register
@@ -482,7 +490,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
printPCRelImm(*this, MI, OpNo, O);
return false;
- case 'n': // Negate the immediate or print a '-' before the operand.
+ case 'n': // Negate the immediate or print a '-' before the operand.
// Note: this is a temporary solution. It should be handled target
// independently as part of the 'MC' work.
if (MO.isImm()) {
@@ -533,6 +541,42 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
+ if (TT.isOSBinFormatELF()) {
+ // Assemble feature flags that may require creation of a note section.
+ unsigned FeatureFlagsAnd = 0;
+ if (M.getModuleFlag("cf-protection-branch"))
+ FeatureFlagsAnd |= ELF::GNU_PROPERTY_X86_FEATURE_1_IBT;
+ if (M.getModuleFlag("cf-protection-return"))
+ FeatureFlagsAnd |= ELF::GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+
+ if (FeatureFlagsAnd) {
+ // Emit a .note.gnu.property section with the flags.
+ if (!TT.isArch32Bit() && !TT.isArch64Bit())
+ llvm_unreachable("CFProtection used on invalid architecture!");
+ MCSection *Cur = OutStreamer->getCurrentSectionOnly();
+ MCSection *Nt = MMI->getContext().getELFSection(
+ ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
+ OutStreamer->SwitchSection(Nt);
+
+ // Emitting note header.
+ int WordSize = TT.isArch64Bit() ? 8 : 4;
+ EmitAlignment(WordSize == 4 ? 2 : 3);
+ OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0"
+ OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
+ OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
+ OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name
+
+ // Emitting an Elf_Prop for the CET properties.
+ OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4);
+ OutStreamer->EmitIntValue(WordSize, 4); // data size
+ OutStreamer->EmitIntValue(FeatureFlagsAnd, WordSize); // data
+ EmitAlignment(WordSize == 4 ? 2 : 3); // padding
+
+ OutStreamer->endSection(Nt);
+ OutStreamer->SwitchSection(Cur);
+ }
+ }
+
if (TT.isOSBinFormatMachO())
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
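The new EmitStartOfAsmFile code above lays out a .note.gnu.property note by hand: a 4-byte namesz, a descsz of 8 plus the word size, the NT_GNU_PROPERTY_TYPE_0 type, the "GNU\0" name, and then a single Elf_Prop carrying the CET feature bits. A hedged sketch of the resulting 64-bit byte layout, expressed as a plain struct (the struct and its field names are illustrative, not part of the patch; the literals 5 and 0xc0000002 mirror the named ELF constants used above):

#include <cstdint>

// 64-bit layout of the note emitted above; the offsets happen to need no padding.
struct GnuPropertyNote64 {
  uint32_t n_namesz = 4;                   // strlen("GNU") + NUL
  uint32_t n_descsz = 8 + 8;               // Elf_Prop header + 8-byte data word
  uint32_t n_type = 5;                     // ELF::NT_GNU_PROPERTY_TYPE_0
  char n_name[4] = {'G', 'N', 'U', '\0'};
  uint32_t pr_type = 0xc0000002;           // ELF::GNU_PROPERTY_X86_FEATURE_1_AND
  uint32_t pr_datasz = 8;                  // WordSize on a 64-bit target
  uint64_t pr_data = 0;                    // IBT and/or SHSTK feature bits
};
static_assert(sizeof(GnuPropertyNote64) == 32, "matches the 32 bytes emitted above");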
@@ -587,64 +631,48 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
4 /*size*/);
}
-MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
- if (Subtarget->isTargetKnownWindowsMSVC()) {
- const MachineConstantPoolEntry &CPE =
- MF->getConstantPool()->getConstants()[CPID];
- if (!CPE.isMachineConstantPoolEntry()) {
- const DataLayout &DL = MF->getDataLayout();
- SectionKind Kind = CPE.getSectionKind(&DL);
- const Constant *C = CPE.Val.ConstVal;
- unsigned Align = CPE.Alignment;
- if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
- getObjFileLowering().getSectionForConstant(DL, Kind, C, Align))) {
- if (MCSymbol *Sym = S->getCOMDATSymbol()) {
- if (Sym->isUndefined())
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
- return Sym;
- }
- }
- }
- }
+static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
+
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(MMI->getContext().getMachOSection(
+ "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata()));
+
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
- return AsmPrinter::GetCPISymbol(CPID);
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
}
void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
- // All darwin targets use mach-o.
- MachineModuleInfoMachO &MMIMacho =
- MMI->getObjFileInfo<MachineModuleInfoMachO>();
-
- // Output stubs for dynamically-linked functions.
- MachineModuleInfoMachO::SymbolListTy Stubs;
-
- // Output stubs for external and common global variables.
- Stubs = MMIMacho.GetGVStubList();
- if (!Stubs.empty()) {
- MCSection *TheSection = OutContext.getMachOSection(
- "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
- SectionKind::getMetadata());
- OutStreamer->SwitchSection(TheSection);
-
- for (auto &Stub : Stubs)
- emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
-
- Stubs.clear();
- OutStreamer->AddBlankLine();
- }
+    // Mach-O uses non-lazy symbol stubs to encode per-TU information into
+    // a global table for symbol lookup.
+ emitNonLazyStubs(MMI, *OutStreamer);
+ // Emit stack and fault map information.
SM.serializeToStackMapSection();
FM.serializeToFaultMapSection();
- // Funny Darwin hack: This flag tells the linker that no global symbols
- // contain code that falls through to other global symbols (e.g. the obvious
- // implementation of multiple entry points). If this doesn't occur, the
- // linker can safely perform dead code stripping. Since LLVM never
- // generates code that does this, it is always safe to set.
+    // This flag tells the linker that no global symbols contain code that falls
+ // through to other global symbols (e.g. an implementation of multiple entry
+ // points). If this doesn't occur, the linker can safely perform dead code
+ // stripping. Since LLVM never generates code that does this, it is always
+ // safe to set.
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ return;
}
if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
@@ -652,36 +680,18 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
(TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ return;
}
if (TT.isOSBinFormatCOFF()) {
- const TargetLoweringObjectFileCOFF &TLOFCOFF =
- static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
-
- std::string Flags;
- raw_string_ostream FlagsOS(Flags);
-
- for (const auto &Function : M)
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function);
- for (const auto &Global : M.globals())
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global);
- for (const auto &Alias : M.aliases())
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias);
-
- FlagsOS.flush();
-
- // Output collected flags.
- if (!Flags.empty()) {
- OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
- }
-
SM.serializeToStackMapSection();
+ return;
}
if (TT.isOSBinFormatELF()) {
SM.serializeToStackMapSection();
FM.serializeToFaultMapSection();
+ return;
}
}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
index 31328e6aea95..55abdf2ba601 100644
--- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -95,6 +95,8 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL);
void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
@@ -128,9 +130,6 @@ public:
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS) override;
- /// \brief Return the symbol for the specified constant pool entry.
- MCSymbol *GetCPISymbol(unsigned CPID) const override;
-
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
SM.reset();
diff --git a/contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
new file mode 100644
index 000000000000..ab2cbfc33e17
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -0,0 +1,732 @@
+//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// If a load follows a store and reloads data that the store has written to
+// memory, Intel microarchitectures can in many cases forward the data directly
+// from the store to the load. This "store forwarding" saves cycles by letting
+// the load obtain the data directly instead of reading it from cache or
+// memory.
+// A "store forward block" occurs when a store cannot be forwarded to the
+// load. The most typical case on Intel Core microarchitectures is a small
+// store that cannot be forwarded to a larger load.
+// The estimated penalty for a store forward block is ~13 cycles.
+//
+// This pass tries to recognize and handle cases where "store forward block"
+// is created by the compiler when lowering memcpy calls to a sequence
+// of a load and a store.
+//
+// The pass currently only handles cases where the memcpy is lowered to
+// XMM/YMM registers; it tries to break the memcpy into smaller copies.
+// Breaking up the memcpy should be safe since there is no atomicity
+// guarantee for loads and stores to XMM/YMM.
+//
+// It could be better for performance to solve the problem by loading to
+// XMM/YMM, then inserting the partial store before storing back from XMM/YMM
+// to memory, but that would be a more conservative optimization since it
+// requires proving that all memory accesses between the blocking store and the
+// load do not alias before the store can be moved, whereas the transformation
+// done here is correct regardless of other memory accesses.
+//===----------------------------------------------------------------------===//
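To make the comment above concrete, here is a hypothetical source-level pattern (the struct and function are illustrative, not taken from the patch) in which a lowered memcpy reloads bytes that a smaller store has just written, which is exactly the store-forwarding block this pass targets:

#include <cstdint>
#include <cstring>

struct Packet { uint32_t header; uint8_t payload[12]; };  // 16 bytes

void forward(Packet *dst, Packet *src) {
  src->header = 0xdeadbeef;   // 4-byte store into the buffer
  std::memcpy(dst, src, 16);  // typically lowered to one 16-byte XMM load/store;
                              // the XMM load reloads the 4 bytes just stored
}

The pass rewrites the second line's load/store pair into smaller copies so that the bytes written by the small store are reloaded by a load of matching or smaller size, which the hardware can forward.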
+
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrDesc.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-avoid-SFB"
+
+namespace llvm {
+void initializeX86AvoidSFBPassPass(PassRegistry &);
+} // end namespace llvm
+
+static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
+ "x86-disable-avoid-SFB", cl::Hidden,
+ cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
+
+static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
+ "x86-sfb-inspection-limit",
+ cl::desc("X86: Number of instructions backward to "
+ "inspect for store forwarding blocks."),
+ cl::init(20), cl::Hidden);
+
+namespace {
+
+using DisplacementSizeMap = std::map<int64_t, unsigned>;
+
+class X86AvoidSFBPass : public MachineFunctionPass {
+public:
+ static char ID;
+ X86AvoidSFBPass() : MachineFunctionPass(ID) {
+ initializeX86AvoidSFBPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Avoid Store Forwarding Blocks";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
+ BlockedLoadsStoresPairs;
+ SmallVector<MachineInstr *, 2> ForRemoval;
+ AliasAnalysis *AA;
+
+  /// Collects pairs of a load followed by a store to memory which
+  /// together look like a memcpy.
+ void findPotentiallylBlockedCopies(MachineFunction &MF);
+ /// Break the memcpy's load and store into smaller copies
+ /// such that each memory load that was blocked by a smaller store
+ /// would now be copied separately.
+ void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
+ const DisplacementSizeMap &BlockingStoresDispSizeMap);
+ /// Break a copy of size Size to smaller copies.
+ void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
+ MachineInstr *StoreInst, int64_t StDispImm,
+ int64_t LMMOffset, int64_t SMMOffset);
+
+ void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
+ MachineInstr *StoreInst, unsigned NStoreOpcode,
+ int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
+ int64_t SMMOffset);
+
+ bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
+
+ unsigned getRegSizeInBytes(MachineInstr *Inst);
+};
+
+} // end anonymous namespace
+
+char X86AvoidSFBPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
+                      "X86 Avoid Store Forwarding Blocks", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
+                    "X86 Avoid Store Forwarding Blocks", false, false)
+
+FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
+ return new X86AvoidSFBPass();
+}
+
+static bool isXMMLoadOpcode(unsigned Opcode) {
+ return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
+ Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
+ Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
+ Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
+ Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
+ Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
+ Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
+ Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
+}
+static bool isYMMLoadOpcode(unsigned Opcode) {
+ return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
+ Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
+ Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
+ Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
+ Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
+ Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
+ Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
+}
+
+static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
+ return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
+}
+
+static bool isPotentialBlockedMemCpyPair(int LdOpcode, int StOpcode) {
+ switch (LdOpcode) {
+ case X86::MOVUPSrm:
+ case X86::MOVAPSrm:
+ return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPSrm:
+ return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
+ case X86::VMOVUPDrm:
+ case X86::VMOVAPDrm:
+ return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
+ case X86::VMOVDQUrm:
+ case X86::VMOVDQArm:
+ return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm:
+ return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVAPDZ128rm:
+ return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm:
+ return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVAPDZ256rm:
+ return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
+ default:
+ return false;
+ }
+}
+
+static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
+ bool PBlock = false;
+ PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
+ Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
+ Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
+ Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
+ if (isYMMLoadOpcode(LoadOpcode))
+ PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
+ Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
+ Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
+ Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
+ Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
+ Opcode == X86::VMOVDQU64Z128mr ||
+ Opcode == X86::VMOVDQA64Z128mr ||
+ Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
+ return PBlock;
+}
+
+static const int MOV128SZ = 16;
+static const int MOV64SZ = 8;
+static const int MOV32SZ = 4;
+static const int MOV16SZ = 2;
+static const int MOV8SZ = 1;
+
+static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
+ switch (LoadOpcode) {
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ return X86::VMOVUPSrm;
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ return X86::VMOVUPDrm;
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ return X86::VMOVDQUrm;
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm:
+ return X86::VMOVUPSZ128rm;
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVAPDZ256rm:
+ return X86::VMOVUPDZ128rm;
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ return X86::VMOVDQU64Z128rm;
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ return X86::VMOVDQU32Z128rm;
+ default:
+ llvm_unreachable("Unexpected Load Instruction Opcode");
+ }
+ return 0;
+}
+
+static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
+ switch (StoreOpcode) {
+ case X86::VMOVUPSYmr:
+ case X86::VMOVAPSYmr:
+ return X86::VMOVUPSmr;
+ case X86::VMOVUPDYmr:
+ case X86::VMOVAPDYmr:
+ return X86::VMOVUPDmr;
+ case X86::VMOVDQUYmr:
+ case X86::VMOVDQAYmr:
+ return X86::VMOVDQUmr;
+ case X86::VMOVUPSZ256mr:
+ case X86::VMOVAPSZ256mr:
+ return X86::VMOVUPSZ128mr;
+ case X86::VMOVUPDZ256mr:
+ case X86::VMOVAPDZ256mr:
+ return X86::VMOVUPDZ128mr;
+ case X86::VMOVDQU64Z256mr:
+ case X86::VMOVDQA64Z256mr:
+ return X86::VMOVDQU64Z128mr;
+ case X86::VMOVDQU32Z256mr:
+ case X86::VMOVDQA32Z256mr:
+ return X86::VMOVDQU32Z128mr;
+ default:
+    llvm_unreachable("Unexpected Store Instruction Opcode");
+ }
+ return 0;
+}
+
+static int getAddrOffset(MachineInstr *MI) {
+ const MCInstrDesc &Descl = MI->getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
+ assert(AddrOffset != -1 && "Expected Memory Operand");
+ AddrOffset += X86II::getOperandBias(Descl);
+ return AddrOffset;
+}
+
+static MachineOperand &getBaseOperand(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ return MI->getOperand(AddrOffset + X86::AddrBaseReg);
+}
+
+static MachineOperand &getDispOperand(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ return MI->getOperand(AddrOffset + X86::AddrDisp);
+}
+
+// Relevant addressing modes contain only base register and immediate
+// displacement or frameindex and immediate displacement.
+// TODO: Consider expanding to other addressing modes in the future
+static bool isRelevantAddressingMode(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ MachineOperand &Base = getBaseOperand(MI);
+ MachineOperand &Disp = getDispOperand(MI);
+ MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
+ MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
+
+ if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
+ return false;
+ if (!Disp.isImm())
+ return false;
+ if (Scale.getImm() != 1)
+ return false;
+ if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
+ return false;
+ if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
+ return false;
+ return true;
+}
+
+// Collect potentially blocking stores.
+// Limit the number of instructions we inspect backwards, since the effect of
+// the store block won't be visible if the store and load instructions have
+// enough instructions in between to keep the core busy.
+static SmallVector<MachineInstr *, 2>
+findPotentialBlockers(MachineInstr *LoadInst) {
+ SmallVector<MachineInstr *, 2> PotentialBlockers;
+ unsigned BlockCount = 0;
+ const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
+ for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
+ E = LoadInst->getParent()->rend();
+ PBInst != E; ++PBInst) {
+ BlockCount++;
+ if (BlockCount >= InspectionLimit)
+ break;
+ MachineInstr &MI = *PBInst;
+ if (MI.getDesc().isCall())
+ return PotentialBlockers;
+ PotentialBlockers.push_back(&MI);
+ }
+  // If we did not reach the instruction limit, try the predecessor blocks.
+  // Ideally we should traverse the predecessor blocks in depth with some
+  // coloring algorithm, but for now let's just look at the first-order
+  // predecessors.
+ if (BlockCount < InspectionLimit) {
+ MachineBasicBlock *MBB = LoadInst->getParent();
+ int LimitLeft = InspectionLimit - BlockCount;
+ for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
+ PE = MBB->pred_end();
+ PB != PE; ++PB) {
+ MachineBasicBlock *PMBB = *PB;
+ int PredCount = 0;
+ for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
+ PME = PMBB->rend();
+ PBInst != PME; ++PBInst) {
+ PredCount++;
+ if (PredCount >= LimitLeft)
+ break;
+ if (PBInst->getDesc().isCall())
+ break;
+ PotentialBlockers.push_back(&*PBInst);
+ }
+ }
+ }
+ return PotentialBlockers;
+}
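findPotentialBlockers above walks backwards from the load, bounded by x86-sfb-inspection-limit, stops at calls, and then spends whatever budget remains on first-order predecessor blocks. A standalone sketch of the bounded backward scan within a single block, with plain strings standing in for MachineInstrs (an assumption-laden illustration, not the pass's data structures):

#include <string>
#include <vector>

static std::vector<std::string>
collectBlockersSketch(const std::vector<std::string> &Block, size_t LoadIdx,
                      unsigned Limit) {
  std::vector<std::string> Blockers;
  unsigned Count = 0;
  for (size_t I = LoadIdx; I > 0;) {
    --I;                         // inspect the instruction just before the load
    if (++Count >= Limit)
      break;                     // stay within the inspection budget
    if (Block[I] == "call")
      break;                     // a call clobbers memory; stop scanning here
    Blockers.push_back(Block[I]);
  }
  return Blockers;
}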
+
+void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
+ int64_t LoadDisp, MachineInstr *StoreInst,
+ unsigned NStoreOpcode, int64_t StoreDisp,
+ unsigned Size, int64_t LMMOffset,
+ int64_t SMMOffset) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ MachineBasicBlock *MBB = LoadInst->getParent();
+ MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
+ MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
+
+ unsigned Reg1 = MRI->createVirtualRegister(
+ TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
+ MachineInstr *NewLoad =
+ BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
+ Reg1)
+ .add(LoadBase)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(LoadDisp)
+ .addReg(X86::NoRegister)
+ .addMemOperand(
+ MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
+ if (LoadBase.isReg())
+ getBaseOperand(NewLoad).setIsKill(false);
+ LLVM_DEBUG(NewLoad->dump());
+ // If the load and store are consecutive, use the loadInst location to
+ // reduce register pressure.
+ MachineInstr *StInst = StoreInst;
+ if (StoreInst->getPrevNode() == LoadInst)
+ StInst = LoadInst;
+ MachineInstr *NewStore =
+ BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
+ .add(StoreBase)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(StoreDisp)
+ .addReg(X86::NoRegister)
+ .addReg(Reg1)
+ .addMemOperand(
+ MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
+ if (StoreBase.isReg())
+ getBaseOperand(NewStore).setIsKill(false);
+ MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
+ assert(StoreSrcVReg.isReg() && "Expected virtual register");
+ NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
+ LLVM_DEBUG(NewStore->dump());
+}
+
+void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
+ int64_t LdDispImm, MachineInstr *StoreInst,
+ int64_t StDispImm, int64_t LMMOffset,
+ int64_t SMMOffset) {
+ int LdDisp = LdDispImm;
+ int StDisp = StDispImm;
+ while (Size > 0) {
+ if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
+ Size = Size - MOV128SZ;
+ buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
+ StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
+ StDisp, MOV128SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV128SZ;
+ StDisp += MOV128SZ;
+ LMMOffset += MOV128SZ;
+ SMMOffset += MOV128SZ;
+ continue;
+ }
+ if (Size - MOV64SZ >= 0) {
+ Size = Size - MOV64SZ;
+ buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
+ MOV64SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV64SZ;
+ StDisp += MOV64SZ;
+ LMMOffset += MOV64SZ;
+ SMMOffset += MOV64SZ;
+ continue;
+ }
+ if (Size - MOV32SZ >= 0) {
+ Size = Size - MOV32SZ;
+ buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
+ MOV32SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV32SZ;
+ StDisp += MOV32SZ;
+ LMMOffset += MOV32SZ;
+ SMMOffset += MOV32SZ;
+ continue;
+ }
+ if (Size - MOV16SZ >= 0) {
+ Size = Size - MOV16SZ;
+ buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
+ MOV16SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV16SZ;
+ StDisp += MOV16SZ;
+ LMMOffset += MOV16SZ;
+ SMMOffset += MOV16SZ;
+ continue;
+ }
+ if (Size - MOV8SZ >= 0) {
+ Size = Size - MOV8SZ;
+ buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
+ MOV8SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV8SZ;
+ StDisp += MOV8SZ;
+ LMMOffset += MOV8SZ;
+ SMMOffset += MOV8SZ;
+ continue;
+ }
+ }
+ assert(Size == 0 && "Wrong size division");
+}
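The loop in buildCopies greedily peels off the largest chunk that still fits: 16 bytes (only when the original copy used a YMM register), then 8, 4, 2, and 1. A detached sketch of that chunking logic with the MachineInstr building replaced by a printout (the helper below is an illustration under that assumption, not the pass's API):

#include <cstdio>

static void buildCopiesSketch(int Size, int LdDisp, int StDisp, bool IsYMM) {
  const int Chunks[] = {16, 8, 4, 2, 1};
  while (Size > 0) {
    for (int C : Chunks) {
      if (C == 16 && !IsYMM)
        continue;                 // 16-byte pieces only when splitting a YMM copy
      if (Size - C < 0)
        continue;                 // try the next smaller chunk
      std::printf("copy %2d bytes: load disp %d -> store disp %d\n", C, LdDisp, StDisp);
      Size -= C;
      LdDisp += C;
      StDisp += C;
      break;
    }
  }
}

int main() { buildCopiesSketch(32, 0, 64, /*IsYMM=*/true); }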
+
+static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ if (LoadBase.isReg()) {
+ MachineInstr *LastLoad = LoadInst->getPrevNode();
+ // If the original load and store to xmm/ymm were consecutive
+ // then the partial copies were also created in
+ // a consecutive order to reduce register pressure,
+ // and the location of the last load is before the last store.
+ if (StoreInst->getPrevNode() == LoadInst)
+ LastLoad = LoadInst->getPrevNode()->getPrevNode();
+ getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
+ }
+ if (StoreBase.isReg()) {
+ MachineInstr *StInst = StoreInst;
+ if (StoreInst->getPrevNode() == LoadInst)
+ StInst = LoadInst;
+ getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
+ }
+}
+
+bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
+ const MachineMemOperand &Op2) const {
+ if (!Op1.getValue() || !Op2.getValue())
+ return true;
+
+ int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
+ int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
+ int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
+
+ AliasResult AAResult =
+ AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
+ MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
+ return AAResult != NoAlias;
+}
+
+void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
+ continue;
+ int DefVR = MI.getOperand(0).getReg();
+ if (!MRI->hasOneUse(DefVR))
+ continue;
+ for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
+ UI != UE;) {
+ MachineOperand &StoreMO = *UI++;
+ MachineInstr &StoreMI = *StoreMO.getParent();
+ // Skip cases where the memcpy may overlap.
+ if (StoreMI.getParent() == MI.getParent() &&
+ isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
+ isRelevantAddressingMode(&MI) &&
+ isRelevantAddressingMode(&StoreMI)) {
+ assert(MI.hasOneMemOperand() &&
+ "Expected one memory operand for load instruction");
+ assert(StoreMI.hasOneMemOperand() &&
+ "Expected one memory operand for store instruction");
+ if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
+ BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
+ }
+ }
+ }
+}
+
+unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
+ auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
+ *LoadInst->getParent()->getParent());
+ return TRI->getRegSizeInBits(*TRC) / 8;
+}
+
+void X86AvoidSFBPass::breakBlockedCopies(
+ MachineInstr *LoadInst, MachineInstr *StoreInst,
+ const DisplacementSizeMap &BlockingStoresDispSizeMap) {
+ int64_t LdDispImm = getDispOperand(LoadInst).getImm();
+ int64_t StDispImm = getDispOperand(StoreInst).getImm();
+ int64_t LMMOffset = 0;
+ int64_t SMMOffset = 0;
+
+ int64_t LdDisp1 = LdDispImm;
+ int64_t LdDisp2 = 0;
+ int64_t StDisp1 = StDispImm;
+ int64_t StDisp2 = 0;
+ unsigned Size1 = 0;
+ unsigned Size2 = 0;
+ int64_t LdStDelta = StDispImm - LdDispImm;
+
+ for (auto DispSizePair : BlockingStoresDispSizeMap) {
+ LdDisp2 = DispSizePair.first;
+ StDisp2 = DispSizePair.first + LdStDelta;
+ Size2 = DispSizePair.second;
+ // Avoid copying overlapping areas.
+ if (LdDisp2 < LdDisp1) {
+ int OverlapDelta = LdDisp1 - LdDisp2;
+ LdDisp2 += OverlapDelta;
+ StDisp2 += OverlapDelta;
+ Size2 -= OverlapDelta;
+ }
+ Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1));
+
+    // Build a copy for the range up to the current blocking store's
+    // displacement.
+ buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
+ SMMOffset);
+ // Build a copy for the current blocking store.
+ buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
+ SMMOffset + Size1);
+ LdDisp1 = LdDisp2 + Size2;
+ StDisp1 = StDisp2 + Size2;
+ LMMOffset += Size1 + Size2;
+ SMMOffset += Size1 + Size2;
+ }
+ unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
+ buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
+ LMMOffset);
+}
+
+static bool hasSameBaseOpValue(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ if (LoadBase.isReg() != StoreBase.isReg())
+ return false;
+ if (LoadBase.isReg())
+ return LoadBase.getReg() == StoreBase.getReg();
+ return LoadBase.getIndex() == StoreBase.getIndex();
+}
+
+static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
+ int64_t StoreDispImm, unsigned StoreSize) {
+ return ((StoreDispImm >= LoadDispImm) &&
+ (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
+}
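isBlockingStore treats a store as blocking only when it lies entirely inside the byte range the load reads. A quick standalone check of that predicate (the helper simply restates the inequality above; the sample offsets are illustrative):

#include <cassert>
#include <cstdint>

static bool blocks(int64_t LdDisp, unsigned LdSize, int64_t StDisp, unsigned StSize) {
  return StDisp >= LdDisp && StDisp <= LdDisp + (int64_t)(LdSize - StSize);
}

int main() {
  assert(blocks(0, 32, 8, 4));    // 4-byte store at offset 8, inside a 32-byte load
  assert(!blocks(0, 32, 30, 4));  // store straddles the end of the load
  assert(!blocks(0, 32, -4, 4));  // store entirely before the load
}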
+
+// Keep track of all stores blocking a load
+static void
+updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
+ int64_t DispImm, unsigned Size) {
+ if (BlockingStoresDispSizeMap.count(DispImm)) {
+ // Choose the smallest blocking store starting at this displacement.
+ if (BlockingStoresDispSizeMap[DispImm] > Size)
+ BlockingStoresDispSizeMap[DispImm] = Size;
+
+ } else
+ BlockingStoresDispSizeMap[DispImm] = Size;
+}
+
+// Remove blocking stores contained in each other.
+static void
+removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
+ if (BlockingStoresDispSizeMap.size() <= 1)
+ return;
+
+ int64_t PrevDisp = BlockingStoresDispSizeMap.begin()->first;
+ unsigned PrevSize = BlockingStoresDispSizeMap.begin()->second;
+ SmallVector<int64_t, 2> ForRemoval;
+ for (auto DispSizePair = std::next(BlockingStoresDispSizeMap.begin());
+ DispSizePair != BlockingStoresDispSizeMap.end(); ++DispSizePair) {
+ int64_t CurrDisp = DispSizePair->first;
+ unsigned CurrSize = DispSizePair->second;
+ if (CurrDisp + CurrSize <= PrevDisp + PrevSize) {
+ ForRemoval.push_back(PrevDisp);
+ }
+ PrevDisp = CurrDisp;
+ PrevSize = CurrSize;
+ }
+ for (auto Disp : ForRemoval)
+ BlockingStoresDispSizeMap.erase(Disp);
+}
+
+bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+
+ if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
+ !MF.getSubtarget<X86Subtarget>().is64Bit())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ assert(MRI->isSSA() && "Expected MIR to be in SSA form");
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
+ // Look for a load then a store to XMM/YMM which look like a memcpy
+ findPotentiallylBlockedCopies(MF);
+
+ for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
+ MachineInstr *LoadInst = LoadStoreInstPair.first;
+ int64_t LdDispImm = getDispOperand(LoadInst).getImm();
+ DisplacementSizeMap BlockingStoresDispSizeMap;
+
+ SmallVector<MachineInstr *, 2> PotentialBlockers =
+ findPotentialBlockers(LoadInst);
+ for (auto PBInst : PotentialBlockers) {
+ if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
+ LoadInst->getOpcode()) ||
+ !isRelevantAddressingMode(PBInst))
+ continue;
+ int64_t PBstDispImm = getDispOperand(PBInst).getImm();
+ assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
+ unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
+      // This check doesn't cover all cases, but it will suffice for now.
+      // TODO: take branch probability into consideration; if the blocking
+      // store is in a rarely reached block, breaking the memcpy could hurt
+      // performance.
+ if (hasSameBaseOpValue(LoadInst, PBInst) &&
+ isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
+ PBstSize))
+ updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
+ PBstSize);
+ }
+
+ if (BlockingStoresDispSizeMap.empty())
+ continue;
+
+    // We found a store forward block; break the memcpy's load and store
+    // into smaller copies such that each smaller store that was causing
+    // the block is now copied separately.
+ MachineInstr *StoreInst = LoadStoreInstPair.second;
+ LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
+ LLVM_DEBUG(LoadInst->dump());
+ LLVM_DEBUG(StoreInst->dump());
+ LLVM_DEBUG(dbgs() << "Replaced with:\n");
+ removeRedundantBlockingStores(BlockingStoresDispSizeMap);
+ breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
+ updateKillStatus(LoadInst, StoreInst);
+ ForRemoval.push_back(LoadInst);
+ ForRemoval.push_back(StoreInst);
+ }
+ for (auto RemovedInst : ForRemoval) {
+ RemovedInst->eraseFromParent();
+ }
+ ForRemoval.clear();
+ BlockedLoadsStoresPairs.clear();
+ LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index 086e828e0f56..dea95f56f4d5 100644
--- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -380,7 +380,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// Skip over DEBUG_VALUE.
// For globals in PIC mode, we can have some LEAs here. Skip them as well.
// TODO: Extend this to something that covers more cases.
- while (I->getOpcode() == X86::LEA32r || I->isDebugValue())
+ while (I->getOpcode() == X86::LEA32r || I->isDebugInstr())
++I;
unsigned StackPtr = RegInfo.getStackRegister();
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
index ccb982f9ac16..96ea64dc8c48 100644
--- a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -33,7 +33,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -43,6 +42,7 @@
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MachineValueType.h"
#include <cassert>
#include <cstdint>
@@ -126,7 +126,25 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- unsigned ExtReg = extendRegister(ValVReg, VA);
+
+ unsigned ExtReg;
+      // If we are copying the value to a physical register whose size is
+      // larger than the size of the value itself, build an AnyExt to the
+      // size of the register first and only then do the copy.
+      // An example would be copying from s32 to xmm0, in which case
+      // ValVT == LocVT == MVT::f32. If LocSize and ValSize are not equal,
+      // we expect the normal extendRegister mechanism to handle it.
+ unsigned PhysRegSize =
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
+ unsigned ValSize = VA.getValVT().getSizeInBits();
+ unsigned LocSize = VA.getLocVT().getSizeInBits();
+ if (PhysRegSize > ValSize && LocSize == ValSize) {
+        assert((PhysRegSize == 128 || PhysRegSize == 80) &&
+               "We expect the reg to be 128 or 80 bits");
+ auto MIB = MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg);
+ ExtReg = MIB->getOperand(0).getReg();
+ } else
+ ExtReg = extendRegister(ValVReg, VA);
+
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
@@ -229,10 +247,28 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
+
switch (VA.getLocInfo()) {
- default:
+ default: {
+      // If we are copying the value from a physical register whose size is
+      // larger than the size of the value itself, build the copy of the
+      // phys reg first and then truncate that copy.
+      // An example would be copying from xmm0 to s32, in which case
+      // ValVT == LocVT == MVT::f32. If LocSize and ValSize are not equal,
+      // we expect this to be handled by the SExt/ZExt/AExt cases.
+ unsigned PhysRegSize =
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
+ unsigned ValSize = VA.getValVT().getSizeInBits();
+ unsigned LocSize = VA.getLocVT().getSizeInBits();
+ if (PhysRegSize > ValSize && LocSize == ValSize) {
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(PhysRegSize), PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ return;
+ }
+
MIRBuilder.buildCopy(ValVReg, PhysReg);
break;
+ }
case CCValAssign::LocInfo::SExt:
case CCValAssign::LocInfo::ZExt:
case CCValAssign::LocInfo::AExt: {
@@ -402,8 +438,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(),
- Callee.getReg(), 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
index 5d806fe60b86..fcc9a296de93 100644
--- a/contrib/llvm/lib/Target/X86/X86CallingConv.td
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -18,6 +18,12 @@ class CCIfSubtarget<string F, CCAction A>
"(State.getMachineFunction().getSubtarget()).", F),
A>;
+/// CCIfNotSubtarget - Match if the current subtarget doesn't have a feature F.
+class CCIfNotSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("!static_cast<const X86Subtarget&>"
+ "(State.getMachineFunction().getSubtarget()).", F),
+ A>;
+
// Register classes for RegCall
class RC_X86_RegCall {
list<Register> GPR_8 = [];
@@ -246,8 +252,9 @@ def RetCC_X86Common : CallingConv<[
// MM0, it doesn't support these vector types.
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
- // Long double types are always returned in FP0 (even with SSE).
- CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
+ // Long double types are always returned in FP0 (even with SSE),
+ // except on Win64.
+ CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>>
]>;
// X86-32 C return-value convention.
@@ -535,7 +542,7 @@ def CC_X86_64_C : CallingConv<[
// fixed arguments to vararg functions are supposed to be passed in
// registers. Actually modeling that would be a lot of work, though.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasFp256()",
+ CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
YMM4, YMM5, YMM6, YMM7]>>>>,
@@ -586,8 +593,8 @@ def CC_X86_Win64_C : CallingConv<[
// FIXME: Handle byval stuff.
// FIXME: Handle varargs.
- // Promote i1/i8/i16/v1i1 arguments to i32.
- CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+ // Promote i1/v1i1 arguments to i8.
+ CCIfType<[i1, v1i1], CCPromoteToType<i8>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
@@ -605,10 +612,17 @@ def CC_X86_Win64_C : CallingConv<[
// 512 bit vectors are passed by pointer
CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+ // Long doubles are passed by pointer
+ CCIfType<[f80], CCPassIndirect<i64>>,
+
// The first 4 MMX vector arguments are passed in GPRs.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,
// The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i8 ], CCAssignToRegWithShadow<[CL , DL , R8B , R9B ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[i16], CCAssignToRegWithShadow<[CX , DX , R8W , R9W ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
[XMM0, XMM1, XMM2, XMM3]>>,
@@ -628,11 +642,7 @@ def CC_X86_Win64_C : CallingConv<[
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
-
- // Long doubles get stack slots whose size and alignment depends on the
- // subtarget.
- CCIfType<[f80], CCAssignToStack<0, 0>>
+ CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>>
]>;
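The Win64 hunk above stops promoting small integers to i32, passes them in the byte/word views of the usual argument registers, and sends x86 long doubles indirectly. A hypothetical C declaration (not from the patch) showing where the revised rules place things, assuming a target where long double is the 80-bit x87 type (e.g. mingw-w64):

// k lands in CL and m in DX (the second integer slot) rather than ECX/EDX,
// while x is passed indirectly: the caller stores the f80 to a temporary
// and passes its address in R8, the third integer argument register.
extern "C" void scale(unsigned char k, unsigned short m, long double x);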
def CC_X86_Win64_VectorCall : CallingConv<[
@@ -731,7 +741,7 @@ def CC_X86_32_Vector_Standard : CallingConv<[
// AVX 256-bit vector arguments are passed in YMM registers.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasFp256()",
+ CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
// AVX 512-bit vector arguments are passed in ZMM registers.
@@ -750,7 +760,7 @@ def CC_X86_32_Vector_Darwin : CallingConv<[
// AVX 256-bit vector arguments are passed in YMM registers.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasFp256()",
+ CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
// AVX 512-bit vector arguments are passed in ZMM registers.
@@ -841,13 +851,15 @@ def CC_X86_32_MCU : CallingConv<[
]>;
def CC_X86_32_FastCall : CallingConv<[
- // Promote i1/i8/i16/v1i1 arguments to i32.
- CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+ // Promote i1 to i8.
+ CCIfType<[i1], CCPromoteToType<i8>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
// The first 2 integer arguments are passed in ECX/EDX
+ CCIfInReg<CCIfType<[ i8], CCAssignToReg<[ CL, DL]>>>,
+ CCIfInReg<CCIfType<[i16], CCAssignToReg<[ CX, DX]>>>,
CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
// Otherwise, same as everything else.
diff --git a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp
index 489d9d86e254..f73455cc31b8 100644
--- a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -169,8 +169,8 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
if (!EnableCmovConverter)
return false;
- DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
- << "**********\n");
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << "**********\n");
bool Changed = false;
MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
@@ -178,7 +178,7 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = STI.getInstrInfo();
TRI = STI.getRegisterInfo();
- TSchedModel.init(STI.getSchedModel(), &STI, TII);
+ TSchedModel.init(&STI);
// Before we handle the more subtle cases of register-register CMOVs inside
// of potentially hot loops, we want to quickly remove all CMOVs with
@@ -295,7 +295,7 @@ bool X86CmovConverterPass::collectCmovCandidates(
for (auto &I : *MBB) {
// Skip debug instructions.
- if (I.isDebugValue())
+ if (I.isDebugInstr())
continue;
X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
// Check if we found a X86::CMOVrr instruction.
@@ -435,7 +435,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
RegDefMaps[PhyRegType].clear();
for (MachineInstr &MI : *MBB) {
// Skip debug instructions.
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
unsigned MIDepth = 0;
unsigned MIDepthOpt = 0;
@@ -605,7 +605,7 @@ static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
SmallVector<MachineInstr *, 2> DBGInstructions;
for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
- if (I->isDebugValue())
+ if (I->isDebugInstr())
DBGInstructions.push_back(&*I);
}
@@ -776,7 +776,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
auto *NewCMOV = NewMIs.pop_back_val();
assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
"Last new instruction isn't the expected CMOV!");
- DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
+ LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
if (&*MIItBegin == &MI)
MIItBegin = MachineBasicBlock::iterator(NewCMOV);
@@ -784,7 +784,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Sink whatever instructions were needed to produce the unfolded operand
// into the false block.
for (auto *NewMI : NewMIs) {
- DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
+ LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
FalseMBB->insert(FalseInsertionPoint, NewMI);
// Re-map any operands that are from other cmovs to the inputs for this block.
for (auto &MOp : NewMI->uses()) {
@@ -846,8 +846,8 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
.addReg(Op2Reg)
.addMBB(MBB);
(void)MIB;
- DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
- DEBUG(dbgs() << "\tTo: "; MIB->dump());
+ LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
+ LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
diff --git a/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp
index ffe176ad4770..5196446b39e9 100644
--- a/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -340,7 +340,7 @@ public:
if (!First)
dbgs() << ", ";
First = false;
- dbgs() << printReg(Reg, MRI->getTargetRegisterInfo());
+ dbgs() << printReg(Reg, MRI->getTargetRegisterInfo(), 0, MRI);
}
dbgs() << "\n" << "Instructions:";
for (MachineInstr *MI : Instrs) {
@@ -708,8 +708,9 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
if (DisableX86DomainReassignment)
return false;
- DEBUG(dbgs() << "***** Machine Function before Domain Reassignment *****\n");
- DEBUG(MF.print(dbgs()));
+ LLVM_DEBUG(
+ dbgs() << "***** Machine Function before Domain Reassignment *****\n");
+ LLVM_DEBUG(MF.print(dbgs()));
STI = &MF.getSubtarget<X86Subtarget>();
// GPR->K is the only transformation currently supported, bail out early if no
@@ -752,7 +753,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
}
for (Closure &C : Closures) {
- DEBUG(C.dump(MRI));
+ LLVM_DEBUG(C.dump(MRI));
if (isReassignmentProfitable(C, MaskDomain)) {
reassign(C, MaskDomain);
++NumClosuresConverted;
@@ -762,8 +763,9 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
DeleteContainerSeconds(Converters);
- DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n");
- DEBUG(MF.print(dbgs()));
+ LLVM_DEBUG(
+ dbgs() << "***** Machine Function after Domain Reassignment *****\n");
+ LLVM_DEBUG(MF.print(dbgs()));
return Changed;
}
diff --git a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
index 6dd4631a4844..80674c7251fe 100755
--- a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -25,7 +25,6 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -42,6 +41,15 @@ using namespace llvm;
struct X86EvexToVexCompressTableEntry {
uint16_t EvexOpcode;
uint16_t VexOpcode;
+
+ bool operator<(const X86EvexToVexCompressTableEntry &RHS) const {
+ return EvexOpcode < RHS.EvexOpcode;
+ }
+
+ friend bool operator<(const X86EvexToVexCompressTableEntry &TE,
+ unsigned Opc) {
+ return TE.EvexOpcode < Opc;
+ }
};
#include "X86GenEVEX2VEXTables.inc"
@@ -54,35 +62,15 @@ namespace {
class EvexToVexInstPass : public MachineFunctionPass {
- /// X86EvexToVexCompressTable - Evex to Vex encoding opcode map.
- using EvexToVexTableType = DenseMap<unsigned, uint16_t>;
- EvexToVexTableType EvexToVex128Table;
- EvexToVexTableType EvexToVex256Table;
-
/// For EVEX instructions that can be encoded using VEX encoding, replace
/// them by the VEX encoding in order to reduce size.
bool CompressEvexToVexImpl(MachineInstr &MI) const;
- /// For initializing the hash map tables of all AVX-512 EVEX
- /// corresponding to AVX/AVX2 opcodes.
- void AddTableEntry(EvexToVexTableType &EvexToVexTable, uint16_t EvexOp,
- uint16_t VexOp);
-
public:
static char ID;
EvexToVexInstPass() : MachineFunctionPass(ID) {
initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
-
- // Initialize the EVEX to VEX 128 table map.
- for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex128CompressTable) {
- AddTableEntry(EvexToVex128Table, Entry.EvexOpcode, Entry.VexOpcode);
- }
-
- // Initialize the EVEX to VEX 256 table map.
- for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex256CompressTable) {
- AddTableEntry(EvexToVex256Table, Entry.EvexOpcode, Entry.VexOpcode);
- }
}
StringRef getPassName() const override { return EVEX2VEX_DESC; }
@@ -127,11 +115,6 @@ bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
-void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
- uint16_t EvexOp, uint16_t VexOp) {
- EvexToVexTable[EvexOp] = VexOp;
-}
-
static bool usesExtendedRegister(const MachineInstr &MI) {
auto isHiRegIdx = [](unsigned Reg) {
// Check for XMM register with indexes between 16 - 31.
@@ -164,7 +147,7 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
}
// Do any custom cleanup needed to finalize the conversion.
-static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
@@ -197,7 +180,31 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
break;
}
+ case X86::VRNDSCALEPDZ128rri:
+ case X86::VRNDSCALEPDZ128rmi:
+ case X86::VRNDSCALEPSZ128rri:
+ case X86::VRNDSCALEPSZ128rmi:
+ case X86::VRNDSCALEPDZ256rri:
+ case X86::VRNDSCALEPDZ256rmi:
+ case X86::VRNDSCALEPSZ256rri:
+ case X86::VRNDSCALEPSZ256rmi:
+ case X86::VRNDSCALESDZr:
+ case X86::VRNDSCALESDZm:
+ case X86::VRNDSCALESSZr:
+ case X86::VRNDSCALESSZm:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZm_Int:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZm_Int:
+ const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ int64_t ImmVal = Imm.getImm();
+ // Ensure that only bits 3:0 of the immediate are used.
+ if ((ImmVal & 0xf) != ImmVal)
+ return false;
+ break;
}
+
+ return true;
}
@@ -224,46 +231,44 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
return false;
- // Check for non EVEX_V512 instrs only.
- // EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0.
- if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L))
+ // Check for EVEX instructions with L2 set. These instructions are 512-bits
+ // and can't be converted to VEX.
+ if (Desc.TSFlags & X86II::EVEX_L2)
return false;
- // EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0.
- bool IsEVEX_V128 =
- (!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L));
-
- // EVEX_V256 instr: bit EVEX_L2 = 0, bit VEX_L = 1.
- bool IsEVEX_V256 =
- (!(Desc.TSFlags & X86II::EVEX_L2) && (Desc.TSFlags & X86II::VEX_L));
-
- unsigned NewOpc = 0;
-
- // Check for EVEX_V256 instructions.
- if (IsEVEX_V256) {
- // Search for opcode in the EvexToVex256 table.
- auto It = EvexToVex256Table.find(MI.getOpcode());
- if (It != EvexToVex256Table.end())
- NewOpc = It->second;
- }
- // Check for EVEX_V128 or Scalar instructions.
- else if (IsEVEX_V128) {
- // Search for opcode in the EvexToVex128 table.
- auto It = EvexToVex128Table.find(MI.getOpcode());
- if (It != EvexToVex128Table.end())
- NewOpc = It->second;
+#ifndef NDEBUG
+ // Make sure the tables are sorted.
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
+ assert(std::is_sorted(std::begin(X86EvexToVex128CompressTable),
+ std::end(X86EvexToVex128CompressTable)) &&
+ "X86EvexToVex128CompressTable is not sorted!");
+ assert(std::is_sorted(std::begin(X86EvexToVex256CompressTable),
+ std::end(X86EvexToVex256CompressTable)) &&
+ "X86EvexToVex256CompressTable is not sorted!");
+ TableChecked.store(true, std::memory_order_relaxed);
}
+#endif
+
+ // Use the VEX.L bit to select the 128 or 256-bit table.
+ ArrayRef<X86EvexToVexCompressTableEntry> Table =
+ (Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable)
+ : makeArrayRef(X86EvexToVex128CompressTable);
- if (!NewOpc)
+ auto I = std::lower_bound(Table.begin(), Table.end(), MI.getOpcode());
+ if (I == Table.end() || I->EvexOpcode != MI.getOpcode())
return false;
+ unsigned NewOpc = I->VexOpcode;
+
if (usesExtendedRegister(MI))
return false;
- performCustomAdjustments(MI, NewOpc);
+ if (!performCustomAdjustments(MI, NewOpc))
+ return false;
MI.setDesc(TII->get(NewOpc));
- MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
+ MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
}
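The rewrite above drops the two DenseMaps built in the pass constructor and instead binary-searches the generated tables directly, selecting the 128- or 256-bit table from the VEX.L bit. A minimal sketch of that lookup pattern with a simplified entry type (the entry struct and the use of std::vector are placeholders, not the generated tables):

#include <algorithm>
#include <cstdint>
#include <vector>

struct Entry {
  uint16_t EvexOpcode;
  uint16_t VexOpcode;
  friend bool operator<(const Entry &E, unsigned Opc) { return E.EvexOpcode < Opc; }
};

// Returns the VEX opcode paired with Opc, or 0 if the sorted table has no entry.
static unsigned lookupVexOpcode(const std::vector<Entry> &Table, unsigned Opc) {
  auto I = std::lower_bound(Table.begin(), Table.end(), Opc);
  if (I == Table.end() || I->EvexOpcode != Opc)
    return 0;
  return I->VexOpcode;
}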
diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index ab2ef26d1cc9..1dd73163080b 100644
--- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -59,12 +59,112 @@ public:
}
private:
+ void ExpandICallBranchFunnel(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MBBI);
+
bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool ExpandMBB(MachineBasicBlock &MBB);
};
char X86ExpandPseudo::ID = 0;
} // End anonymous namespace.
+void X86ExpandPseudo::ExpandICallBranchFunnel(
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock *JTMBB = MBB;
+ MachineInstr *JTInst = &*MBBI;
+ MachineFunction *MF = MBB->getParent();
+ const BasicBlock *BB = MBB->getBasicBlock();
+ auto InsPt = MachineFunction::iterator(MBB);
+ ++InsPt;
+
+ std::vector<std::pair<MachineBasicBlock *, unsigned>> TargetMBBs;
+ DebugLoc DL = JTInst->getDebugLoc();
+ MachineOperand Selector = JTInst->getOperand(0);
+ const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal();
+
+ auto CmpTarget = [&](unsigned Target) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addGlobalAddress(CombinedGlobal,
+ JTInst->getOperand(2 + 2 * Target).getImm())
+ .addReg(0);
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::CMP64rr))
+ .add(Selector)
+ .addReg(X86::R11);
+ };
+
+ auto CreateMBB = [&]() {
+ auto *NewMBB = MF->CreateMachineBasicBlock(BB);
+ MBB->addSuccessor(NewMBB);
+ return NewMBB;
+ };
+
+ auto EmitCondJump = [&](unsigned Opcode, MachineBasicBlock *ThenMBB) {
+ BuildMI(*MBB, MBBI, DL, TII->get(Opcode)).addMBB(ThenMBB);
+
+ auto *ElseMBB = CreateMBB();
+ MF->insert(InsPt, ElseMBB);
+ MBB = ElseMBB;
+ MBBI = MBB->end();
+ };
+
+ auto EmitCondJumpTarget = [&](unsigned Opcode, unsigned Target) {
+ auto *ThenMBB = CreateMBB();
+ TargetMBBs.push_back({ThenMBB, Target});
+ EmitCondJump(Opcode, ThenMBB);
+ };
+
+ auto EmitTailCall = [&](unsigned Target) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::TAILJMPd64))
+ .add(JTInst->getOperand(3 + 2 * Target));
+ };
+
+ std::function<void(unsigned, unsigned)> EmitBranchFunnel =
+ [&](unsigned FirstTarget, unsigned NumTargets) {
+ if (NumTargets == 1) {
+ EmitTailCall(FirstTarget);
+ return;
+ }
+
+ if (NumTargets == 2) {
+ CmpTarget(FirstTarget + 1);
+ EmitCondJumpTarget(X86::JB_1, FirstTarget);
+ EmitTailCall(FirstTarget + 1);
+ return;
+ }
+
+ if (NumTargets < 6) {
+ CmpTarget(FirstTarget + 1);
+ EmitCondJumpTarget(X86::JB_1, FirstTarget);
+ EmitCondJumpTarget(X86::JE_1, FirstTarget + 1);
+ EmitBranchFunnel(FirstTarget + 2, NumTargets - 2);
+ return;
+ }
+
+ auto *ThenMBB = CreateMBB();
+ CmpTarget(FirstTarget + (NumTargets / 2));
+ EmitCondJump(X86::JB_1, ThenMBB);
+ EmitCondJumpTarget(X86::JE_1, FirstTarget + (NumTargets / 2));
+ EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1,
+ NumTargets - (NumTargets / 2) - 1);
+
+ MF->insert(InsPt, ThenMBB);
+ MBB = ThenMBB;
+ MBBI = MBB->end();
+ EmitBranchFunnel(FirstTarget, NumTargets / 2);
+ };
+
+ EmitBranchFunnel(0, (JTInst->getNumOperands() - 2) / 2);
+ for (auto P : TargetMBBs) {
+ MF->insert(InsPt, P.first);
+ BuildMI(P.first, DL, TII->get(X86::TAILJMPd64))
+ .add(JTInst->getOperand(3 + 2 * P.second));
+ }
+ JTMBB->erase(JTInst);
+}
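A minimal standalone sketch of the branch-funnel shape emitted above, printing pseudo-assembly instead of building MachineInstrs. Targets are assumed to be sorted by address; the funnel binary-searches them with compare/branch pairs and tail-calls once a single candidate remains. The real pass also has a linear case for fewer than six targets, which is omitted here, and the label scheme is invented for readability.

#include <cstdio>

void emitBranchFunnel(unsigned FirstTarget, unsigned NumTargets) {
  if (NumTargets == 1) {
    std::printf("  jmp target%u\n", FirstTarget);
    return;
  }
  if (NumTargets == 2) {
    std::printf("  cmp selector, target%u\n", FirstTarget + 1);
    std::printf("  jb  target%u\n", FirstTarget);
    std::printf("  jmp target%u\n", FirstTarget + 1);
    return;
  }
  unsigned Mid = NumTargets / 2;
  std::printf("  cmp selector, target%u\n", FirstTarget + Mid);
  std::printf("  jb  L_%u_%u\n", FirstTarget, NumTargets);
  std::printf("  je  target%u\n", FirstTarget + Mid);
  emitBranchFunnel(FirstTarget + Mid + 1, NumTargets - Mid - 1); // upper half
  std::printf("L_%u_%u:\n", FirstTarget, NumTargets);
  emitBranchFunnel(FirstTarget, Mid);                            // lower half
}

int main() { emitBranchFunnel(0, 7); }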
+
/// If \p MBBI is a pseudo instruction, this method expands
/// it to the corresponding (sequence of) actual instruction(s).
/// \returns true if \p MBBI has been expanded.
@@ -106,7 +206,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
- X86FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
}
// Jump to label or value in register.
@@ -186,7 +286,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::IRET: {
// Adjust stack to erase error code
int64_t StackAdj = MBBI->getOperand(0).getImm();
- X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, true);
// Replace pseudo with machine iret
BuildMI(MBB, MBBI, DL,
TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
@@ -210,7 +310,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// A ret can only handle immediates as big as 2**16-1. If we need to pop
// off bytes before the return address, we must do it manually.
BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
- X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, /*InEpilogue=*/true);
BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
}
@@ -259,6 +359,9 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBBI->eraseFromParent();
return true;
}
+ case TargetOpcode::ICALL_BRANCH_FUNNEL:
+ ExpandICallBranchFunnel(&MBB, MBBI);
+ return true;
}
llvm_unreachable("Previous switch has a fallthrough?");
}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
index dca6c592614c..de8b40f28a86 100644
--- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -68,7 +68,7 @@ public:
bool fastSelectInstruction(const Instruction *I) override;
- /// \brief The specified machine instr operand is a vreg, and that
+ /// The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
/// possible.
@@ -134,6 +134,8 @@ private:
bool X86SelectFPExt(const Instruction *I);
bool X86SelectFPTrunc(const Instruction *I);
bool X86SelectSIToFP(const Instruction *I);
+ bool X86SelectUIToFP(const Instruction *I);
+ bool X86SelectIntToFP(const Instruction *I, bool IsSigned);
const X86InstrInfo *getInstrInfo() const {
return Subtarget->getInstrInfo();
@@ -217,7 +219,7 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) {
return std::make_pair(CC, NeedSwap);
}
-/// \brief Adds a complex addressing mode to the given machine instr builder.
+/// Adds a complex addressing mode to the given machine instr builder.
/// Note, this will constrain the index register. If it's not possible to
/// constrain the given index register, then a new one will be created. The
/// IndexReg field of the addressing mode will be updated to match in this case.
@@ -231,7 +233,7 @@ X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
return ::addFullAddress(MIB, AM);
}
-/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
const Value *Cond) {
@@ -2019,7 +2021,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
return true;
}
-/// \brief Emit a conditional move instruction (if the are supported) to lower
+/// Emit a conditional move instruction (if they are supported) to lower
/// the select.
bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// Check if the subtarget supports these instructions.
@@ -2148,7 +2150,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
return true;
}
-/// \brief Emit SSE or AVX instructions to lower the select.
+/// Emit SSE or AVX instructions to lower the select.
///
/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
@@ -2410,15 +2412,19 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
return false;
}
-bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+// Common code for X86SelectSIToFP and X86SelectUIToFP.
+bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
// The target-independent selection algorithm in FastISel already knows how
// to select a SINT_TO_FP if the target is SSE but not AVX.
// Early exit if the subtarget doesn't have AVX.
- if (!Subtarget->hasAVX())
+ // Unsigned conversion requires AVX512.
+ bool HasAVX512 = Subtarget->hasAVX512();
+ if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))
return false;
- Type *InTy = I->getOperand(0)->getType();
- if (!InTy->isIntegerTy(32) && !InTy->isIntegerTy(64))
+ // TODO: We could sign extend narrower types.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
return false;
// Select integer to float/double conversion.
@@ -2426,20 +2432,31 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
if (OpReg == 0)
return false;
- const TargetRegisterClass *RC = nullptr;
unsigned Opcode;
+ static const uint16_t SCvtOpc[2][2][2] = {
+ { { X86::VCVTSI2SSrr, X86::VCVTSI642SSrr },
+ { X86::VCVTSI2SDrr, X86::VCVTSI642SDrr } },
+ { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },
+ { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },
+ };
+ static const uint16_t UCvtOpc[2][2] = {
+ { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },
+ { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },
+ };
+ bool Is64Bit = SrcVT == MVT::i64;
+
if (I->getType()->isDoubleTy()) {
- // sitofp int -> double
- Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SDrr : X86::VCVTSI2SDrr;
- RC = &X86::FR64RegClass;
+ // s/uitofp int -> double
+ Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];
} else if (I->getType()->isFloatTy()) {
- // sitofp int -> float
- Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SSrr : X86::VCVTSI2SSrr;
- RC = &X86::FR32RegClass;
+ // s/uitofp int -> float
+ Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];
} else
return false;
+ MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
unsigned ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2449,6 +2466,14 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
return true;
}
+bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+ return X86SelectIntToFP(I, /*IsSigned*/true);
+}
+
+bool X86FastISel::X86SelectUIToFP(const Instruction *I) {
+ return X86SelectIntToFP(I, /*IsSigned*/false);
+}
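A minimal standalone sketch (illustrative opcode names as strings, not real opcode values) of the conversion-opcode selection above: signed int-to-fp picks from a table indexed by [HasAVX512][IsDouble][Is64Bit], while the unsigned conversions only exist with AVX-512 and index a smaller [IsDouble][Is64Bit] table.

#include <cstdio>

const char *selectIntToFPOpc(bool IsSigned, bool HasAVX512, bool IsDouble,
                             bool Is64Bit) {
  static const char *SCvt[2][2][2] = {
      {{"VCVTSI2SSrr", "VCVTSI642SSrr"}, {"VCVTSI2SDrr", "VCVTSI642SDrr"}},
      {{"VCVTSI2SSZrr", "VCVTSI642SSZrr"}, {"VCVTSI2SDZrr", "VCVTSI642SDZrr"}}};
  static const char *UCvt[2][2] = {
      {"VCVTUSI2SSZrr", "VCVTUSI642SSZrr"}, {"VCVTUSI2SDZrr", "VCVTUSI642SDZrr"}};
  if (!IsSigned && !HasAVX512)
    return nullptr; // uitofp is only handled with AVX-512 here
  return IsSigned ? SCvt[HasAVX512][IsDouble][Is64Bit] : UCvt[IsDouble][Is64Bit];
}

int main() {
  std::printf("%s\n", selectIntToFPOpc(true, false, true, true));   // VCVTSI642SDrr
  std::printf("%s\n", selectIntToFPOpc(false, true, false, false)); // VCVTUSI2SSZrr
}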
+
// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
unsigned TargetOpc,
@@ -2682,7 +2707,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
- // Always make a copy of the frame register to to a vreg first, so that we
+ // Always make a copy of the frame register to a vreg first, so that we
// never directly reference the frame register (the TwoAddressInstruction-
// Pass doesn't like that).
unsigned SrcReg = createResultReg(RC);
@@ -2733,7 +2758,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
return false;
- return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -2748,7 +2773,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (MSI->getDestAddressSpace() > 255)
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
}
case Intrinsic::stackprotector: {
// Emit code to store the stack guard onto the stack.
@@ -2799,17 +2824,19 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
// is not generated by FastISel yet.
// FIXME: Update this code once tablegen can handle it.
- static const uint16_t SqrtOpc[2][2] = {
- {X86::SQRTSSr, X86::VSQRTSSr},
- {X86::SQRTSDr, X86::VSQRTSDr}
+ static const uint16_t SqrtOpc[3][2] = {
+ { X86::SQRTSSr, X86::SQRTSDr },
+ { X86::VSQRTSSr, X86::VSQRTSDr },
+ { X86::VSQRTSSZr, X86::VSQRTSDZr },
};
- bool HasAVX = Subtarget->hasAVX();
+ unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
+ Subtarget->hasAVX() ? 1 :
+ 0;
unsigned Opc;
- const TargetRegisterClass *RC;
switch (VT.SimpleTy) {
default: return false;
- case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
- case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+ case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;
+ case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;
}
const Value *SrcVal = II->getArgOperand(0);
@@ -2818,8 +2845,9 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (SrcReg == 0)
return false;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
unsigned ImplicitDefReg = 0;
- if (HasAVX) {
+ if (AVXLevel > 0) {
ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2996,18 +3024,22 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!isTypeLegal(RetTy, VT))
return false;
- static const uint16_t CvtOpc[2][2][2] = {
- { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
- { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
- { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
- { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
+ static const uint16_t CvtOpc[3][2][2] = {
+ { { X86::CVTTSS2SIrr, X86::CVTTSS2SI64rr },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SI64rr } },
+ { { X86::VCVTTSS2SIrr, X86::VCVTTSS2SI64rr },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SI64rr } },
+ { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },
+ { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },
};
- bool HasAVX = Subtarget->hasAVX();
+ unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
+ Subtarget->hasAVX() ? 1 :
+ 0;
unsigned Opc;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected result type.");
- case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
- case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+ case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;
+ case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;
}
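A minimal standalone sketch of the AVXLevel indexing used for the sqrt and cvtt tables above: the first table dimension is 0 for SSE, 1 for AVX and 2 for AVX-512, so adding the AVX-512 forms only grows each table by one row. The opcode names here are illustrative strings, not real enum values.

#include <cstdio>

const char *selectSqrtOpc(bool HasAVX, bool HasAVX512, bool IsDouble) {
  static const char *SqrtOpc[3][2] = {{"SQRTSSr", "SQRTSDr"},
                                      {"VSQRTSSr", "VSQRTSDr"},
                                      {"VSQRTSSZr", "VSQRTSDZr"}};
  unsigned AVXLevel = HasAVX512 ? 2 : HasAVX ? 1 : 0;
  return SqrtOpc[AVXLevel][IsDouble];
}

int main() {
  std::printf("%s\n", selectSqrtOpc(true, false, true));  // VSQRTSDr
  std::printf("%s\n", selectSqrtOpc(true, true, false));  // VSQRTSSZr
}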
// Check if we can fold insertelement instructions into the convert.
@@ -3174,6 +3206,13 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
+ // Call / invoke instructions with NoCfCheck attribute require special
+ // handling.
+ const auto *II =
+ CLI.CS ? dyn_cast<InvokeInst>(CLI.CS->getInstruction()) : nullptr;
+ if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()))
+ return false;
+
// Functions with no_caller_saved_registers that need special handling.
if ((CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
@@ -3609,6 +3648,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return X86SelectFPTrunc(I);
case Instruction::SIToFP:
return X86SelectSIToFP(I);
+ case Instruction::UIToFP:
+ return X86SelectUIToFP(I);
case Instruction::IntToPtr: // Deliberate fall-through.
case Instruction::PtrToInt: {
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
diff --git a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 855ea683a8af..d9bf60c2c9fb 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -155,18 +155,18 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
MLI = &getAnalysis<MachineLoopInfo>();
LiveRegs.init(TII->getRegisterInfo());
- DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
+ LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
// Process all basic blocks.
for (auto &MBB : MF)
processBasicBlock(MF, MBB);
- DEBUG(dbgs() << "End X86FixupBWInsts\n";);
+ LLVM_DEBUG(dbgs() << "End X86FixupBWInsts\n";);
return true;
}
-/// \brief Check if after \p OrigMI the only portion of super register
+/// Check if after \p OrigMI the only portion of super register
/// of the destination register of \p OrigMI that is alive is that
/// destination register.
///
@@ -249,15 +249,16 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
- for (MCSuperRegIterator Supers(OrigDestReg, TRI, true); Supers.isValid();
- ++Supers) {
- if (*Supers == MO.getReg()) {
- if (MO.isDef())
- IsDefined = true;
- else
- return false; // SuperReg Imp-used' -> live before the MI
- }
- }
+ if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg()))
+ IsDefined = true;
+
+ // If MO is a use of any part of the destination register but is not equal
+ // to OrigDestReg or one of its subregisters, we cannot use SuperDestReg.
+ // For example, if OrigDestReg is %al then an implicit use of %ah, %ax,
+ // %eax, or %rax will prevent us from using the %eax register.
+ if (MO.isUse() && !TRI->isSubRegisterEq(OrigDestReg, MO.getReg()) &&
+ TRI->regsOverlap(SuperDestReg, MO.getReg()))
+ return false;
}
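A minimal standalone model (with made-up helpers, not the TargetRegisterInfo API) of the operand check above for the concrete case OrigDestReg = AL, SuperDestReg = EAX: an implicit use of AH, AX, EAX or RAX overlaps EAX without being covered by AL, so widening the def to EAX would clobber live bits and the transform is refused.

#include <cstdio>
#include <set>
#include <string>
#include <utility>

using Reg = std::string;

bool isSubRegisterEq(const Reg &A, const Reg &B) { // B == A or a subreg of A
  static const std::set<std::pair<Reg, Reg>> Sub = {{"AL", "AL"}};
  return Sub.count({A, B}) != 0;
}

bool regsOverlap(const Reg &A, const Reg &B) { // registers share any bits
  static const std::set<Reg> XFamily = {"AL", "AH", "AX", "EAX", "RAX"};
  return A == B || (XFamily.count(A) && XFamily.count(B));
}

bool useBlocksWidening(const Reg &OrigDest, const Reg &SuperDest, const Reg &Use) {
  return !isSubRegisterEq(OrigDest, Use) && regsOverlap(SuperDest, Use);
}

int main() {
  std::printf("%d\n", useBlocksWidening("AL", "EAX", "AL")); // 0: covered by the def
  std::printf("%d\n", useBlocksWidening("AL", "EAX", "AH")); // 1: blocks widening
}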
// Reg is not Imp-def'ed -> it's live both before/after the instruction.
if (!IsDefined)
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
index b41bf99f19b2..d85389a0a7f1 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -40,13 +40,13 @@ namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- /// \brief Loop over all of the instructions in the basic block
+ /// Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- /// \brief Given a machine register, look for the instruction
+ /// Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
/// try to replace it with an equivalent LEA instruction.
/// If replacement succeeds, then also process the newly created
@@ -54,20 +54,20 @@ class FixupLEAPass : public MachineFunctionPass {
void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief Given a memory access or LEA instruction
+ /// Given a memory access or LEA instruction
/// whose address mode uses a base and/or index register, look for
/// an opportunity to replace the instruction which sets the base or index
/// register with an equivalent LEA instruction.
void processInstruction(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief Given a LEA instruction which is unprofitable
+ /// Given a LEA instruction which is unprofitable
/// on Silvermont try to replace it with an equivalent ADD instruction
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief Given a LEA instruction which is unprofitable
+ /// Given a LEA instruction which is unprofitable
/// on SNB+ try to replace it with other instructions.
/// According to Intel's Optimization Reference Manual:
/// " For LEA instructions with three source operands and some specific
@@ -82,23 +82,23 @@ class FixupLEAPass : public MachineFunctionPass {
MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
MachineFunction::iterator MFI);
- /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
+ /// Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) const;
- /// \brief Determine if an instruction references a machine register
+ /// Determine if an instruction references a machine register
/// and, if so, whether it reads or writes the register.
RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
- /// \brief Step backwards through a basic block, looking
+ /// Step backwards through a basic block, looking
/// for an instruction which writes a register within
/// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
- /// \brief if an instruction can be converted to an
+ /// If an instruction can be converted to an
/// equivalent LEA, insert the new instruction into the basic block
/// and return a pointer to it. Otherwise, return zero.
MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
@@ -113,7 +113,7 @@ public:
initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
}
- /// \brief Loop over all of the basic blocks,
+ /// Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
/// if needed and when possible.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -125,6 +125,7 @@ public:
}
private:
+ TargetSchedModel TSM;
MachineFunction *MF;
const X86InstrInfo *TII; // Machine instruction info.
bool OptIncDec;
@@ -202,13 +203,14 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
if (!OptLEA && !OptIncDec)
return false;
+ TSM.init(&Func.getSubtarget());
TII = ST.getInstrInfo();
- DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+ LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
// Process all basic blocks.
for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
processBasicBlock(Func, I);
- DEBUG(dbgs() << "End X86FixupLEAs\n";);
+ LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";);
return true;
}
@@ -264,8 +266,7 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
if (usesRegister(p, CurInst) == RU_Write) {
return CurInst;
}
- InstrDistance += TII->getInstrLatency(
- MF->getSubtarget().getInstrItineraryData(), *CurInst);
+ InstrDistance += TSM.computeInstrLatency(&*CurInst);
Found = getPreviousInstr(CurInst, MFI);
}
return MachineBasicBlock::iterator();
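A minimal standalone sketch of the backwards search above: walk up the block from the LEA, summing per-instruction latencies from the scheduling model, and stop either at the instruction that writes the register of interest or once the accumulated latency exceeds the distance threshold. The Instr struct and latencies are invented for illustration.

#include <cstdio>
#include <vector>

struct Instr { bool WritesReg; unsigned Latency; };

// Returns the index of the defining instruction, or -1 if none is close enough.
int searchBackwards(const std::vector<Instr> &Block, unsigned StartIdx,
                    unsigned Threshold) {
  unsigned Distance = 0;
  for (int I = static_cast<int>(StartIdx) - 1; I >= 0 && Distance < Threshold; --I) {
    if (Block[I].WritesReg)
      return I;
    Distance += Block[I].Latency; // TSM.computeInstrLatency in the pass
  }
  return -1;
}

int main() {
  std::vector<Instr> B = {{true, 1}, {false, 3}, {false, 2}, {false, 1}};
  std::printf("%d\n", searchBackwards(B, 4, 7)); // 0: writer found within budget
  std::printf("%d\n", searchBackwards(B, 4, 3)); // -1: too far away, give up
}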
@@ -285,6 +286,8 @@ static inline bool isRegOperand(const MachineOperand &Op) {
}
/// hasInefficientLEABaseReg - LEA that uses base and index registers
/// where the base is EBP, RBP, or R13
+// TODO: use a variant scheduling class to model the latency profile
+// of LEA instructions, and implement this logic as a scheduling predicate.
static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
const MachineOperand &Index) {
return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
@@ -295,13 +298,6 @@ static inline bool hasLEAOffset(const MachineOperand &Offset) {
return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
}
-// LEA instruction that has all three operands: offset, base and index
-static inline bool isThreeOperandsLEA(const MachineOperand &Base,
- const MachineOperand &Index,
- const MachineOperand &Offset) {
- return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
-}
-
static inline int getADDrrFromLEA(int LEAOpcode) {
switch (LEAOpcode) {
default:
@@ -407,9 +403,9 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
if (NewMI) {
++NumLEAs;
- DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
// now to replace with an equivalent LEA...
- DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
MFI->erase(MBI);
MachineBasicBlock::iterator J =
static_cast<MachineBasicBlock::iterator>(NewMI);
@@ -434,8 +430,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
- DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
@@ -443,7 +439,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
NewMI =
BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
@@ -453,7 +449,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
@@ -476,7 +472,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
const MachineOperand &Offset = MI.getOperand(4);
const MachineOperand &Segment = MI.getOperand(5);
- if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ if (!(TII->isThreeOperandsLEA(MI) ||
hasInefficientLEABaseReg(Base, Index)) ||
!TII->isSafeToClobberEFLAGS(*MFI, MI) ||
Segment.getReg() != X86::NoRegister)
@@ -503,8 +499,8 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
- DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
- DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
// First try to replace LEA with one or two (for the 3-op LEA case)
// add instructions:
@@ -514,11 +510,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
const MachineOperand &Src = DstR == BaseR ? Index : Base;
MachineInstr *NewMI =
BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
}
@@ -534,11 +530,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
.add(IsInefficientBase ? Base : Index)
.addImm(0)
.add(Segment);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
}
@@ -548,12 +544,13 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
// lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
if (IsScale1 && !hasLEAOffset(Offset)) {
- TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
- DEBUG(MI.getPrevNode()->dump(););
+ bool BIK = Base.isKill() && BaseR != IndexR;
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, BIK);
+ LLVM_DEBUG(MI.getPrevNode()->dump(););
MachineInstr *NewMI =
BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
// lea offset(%base,%index,scale), %dst =>
@@ -565,10 +562,10 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
.add(Index)
.add(Offset)
.add(Segment);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
- DEBUG(NewMI->dump(););
+ LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
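A minimal standalone sketch of the simplest slow-LEA split handled above, printing the replacement sequence instead of building MachineInstrs: a scale-1 LEA with no offset becomes a register copy plus an add. The cases with an offset or a scaled index keep an LEA for part of the address and add the remaining operand; only the no-offset case is shown, and the register names are arbitrary.

#include <cstdio>
#include <string>

void splitSlowLEA1(const std::string &Dst, const std::string &Base,
                   const std::string &Index) {
  // lea (Base,Index,1), Dst  ==>  mov Base, Dst ; add Index, Dst
  std::printf("  mov %s, %s\n", Base.c_str(), Dst.c_str());
  std::printf("  add %s, %s\n", Index.c_str(), Dst.c_str());
}

int main() { splitSlowLEA1("%rdx", "%rbp", "%rcx"); }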
diff --git a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index a6fccd134740..1ba08d39c595 100644
--- a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -27,6 +27,7 @@
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -102,7 +103,7 @@ private:
MachineDominatorTree *MDT;
CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
- MachineInstr &CopyDefI);
+ MachineBasicBlock::iterator CopyDefI);
unsigned promoteCondToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator TestPos,
@@ -342,8 +343,8 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
}
bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
- << " **********\n");
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
auto &Subtarget = MF.getSubtarget<X86Subtarget>();
MRI = &MF.getRegInfo();
@@ -356,9 +357,14 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// Nothing to do for a degenerate empty function...
return false;
+ // Collect the copies in RPO so that when there are chains where a copy is in
+ // turn copied again we visit the first one first. This ensures we can find
+ // viable locations for testing the original EFLAGS that dominate all the
+ // uses across complex CFGs.
SmallVector<MachineInstr *, 4> Copies;
- for (MachineBasicBlock &MBB : MF)
- for (MachineInstr &MI : MBB)
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT)
+ for (MachineInstr &MI : *MBB)
if (MI.getOpcode() == TargetOpcode::COPY &&
MI.getOperand(0).getReg() == X86::EFLAGS)
Copies.push_back(&MI);
@@ -385,8 +391,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// instructions. Until we have a motivating test case and fail to avoid
// it by changing other parts of LLVM's lowering, we refuse to handle
// this complex case here.
- DEBUG(dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
- CopyDefI.dump());
+ LLVM_DEBUG(
+ dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
+ CopyDefI.dump());
report_fatal_error(
"Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
}
@@ -406,15 +413,102 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
if (DOp.isDead())
continue;
- MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+ MachineBasicBlock *TestMBB = CopyDefI.getParent();
auto TestPos = CopyDefI.getIterator();
DebugLoc TestLoc = CopyDefI.getDebugLoc();
- DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+ LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+
+ // Walk up across live-in EFLAGS to find where they were actually def'ed.
+ //
+ // This copy's def may just be part of a region of blocks covered by
+ // a single def of EFLAGS and we want to find the top of that region where
+ // possible.
+ //
+ // This is essentially a search for a *candidate* reaching definition
+ // location. We don't need to ever find the actual reaching definition here,
+ // but we want to walk up the dominator tree to find the highest point which
+ // would be viable for such a definition.
+ auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) {
+ // Scan backwards as we expect these to be relatively short and often find
+ // a clobber near the end.
+ return llvm::any_of(
+ llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+ // Flag any instruction (other than the copy we are
+ // currently rewriting) that defs EFLAGS.
+ return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ };
+ auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+ MachineBasicBlock *EndMBB) {
+ assert(MDT->dominates(BeginMBB, EndMBB) &&
+ "Only support paths down the dominator tree!");
+ SmallPtrSet<MachineBasicBlock *, 4> Visited;
+ SmallVector<MachineBasicBlock *, 4> Worklist;
+ // We terminate at the beginning. No need to scan it.
+ Visited.insert(BeginMBB);
+ Worklist.push_back(EndMBB);
+ do {
+ auto *MBB = Worklist.pop_back_val();
+ for (auto *PredMBB : MBB->predecessors()) {
+ if (!Visited.insert(PredMBB).second)
+ continue;
+ if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+ return true;
+ // Enqueue this block to walk its predecessors.
+ Worklist.push_back(PredMBB);
+ }
+ } while (!Worklist.empty());
+ // No clobber found along a path from the begin to end.
+ return false;
+ };
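A minimal standalone sketch of the HasEFLAGSClobberPath walk above on a toy CFG: starting from the end block, visit predecessors with a worklist and a visited set, and report a clobber if any block strictly between the begin and end blocks clobbers the flags. Block 0 plays the role of BeginMBB and is pre-visited so it is never scanned; the adjacency-list representation is just for illustration.

#include <cstdio>
#include <set>
#include <vector>

bool hasClobberPath(const std::vector<std::vector<int>> &Preds,
                    const std::vector<bool> &Clobbers, int Begin, int End) {
  std::set<int> Visited = {Begin};
  std::vector<int> Worklist = {End};
  while (!Worklist.empty()) {
    int MBB = Worklist.back();
    Worklist.pop_back();
    for (int Pred : Preds[MBB]) {
      if (!Visited.insert(Pred).second)
        continue;           // already scanned this predecessor
      if (Clobbers[Pred])
        return true;        // flags are rewritten somewhere on the path
      Worklist.push_back(Pred);
    }
  }
  return false;
}

int main() {
  // 0 -> 1 -> 3 and 0 -> 2 -> 3; block 2 clobbers EFLAGS.
  std::vector<std::vector<int>> Preds = {{}, {0}, {0}, {1, 2}};
  std::printf("%d\n", hasClobberPath(Preds, {false, false, true, false}, 0, 3));  // 1
  std::printf("%d\n", hasClobberPath(Preds, {false, false, false, false}, 0, 3)); // 0
}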
+ while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+ !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+ // Find the nearest common dominator of the predecessors, as
+ // that will be the best candidate to hoist into.
+ MachineBasicBlock *HoistMBB =
+ std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+ *TestMBB->pred_begin(),
+ [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+ return MDT->findNearestCommonDominator(LHS, RHS);
+ });
+
+ // Now we need to scan all predecessors that may be reached along paths to
+ // the hoist block. A clobber anywhere in any of these blocks prevents the hoist.
+ // Note that this even handles loops because we require *no* clobbers.
+ if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+ break;
+
+ // We also need the terminators to not sneakily clobber flags.
+ if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+ HoistMBB->instr_end()))
+ break;
+
+ // We found a viable location, hoist our test position to it.
+ TestMBB = HoistMBB;
+ TestPos = TestMBB->getFirstTerminator()->getIterator();
+ // Clear the debug location as it would just be confusing after hoisting.
+ TestLoc = DebugLoc();
+ }
+ LLVM_DEBUG({
+ auto DefIt = llvm::find_if(
+ llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+ [&](MachineInstr &MI) {
+ return MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ if (DefIt.base() != TestMBB->instr_begin()) {
+ dbgs() << " Using EFLAGS defined by: ";
+ DefIt->dump();
+ } else {
+ dbgs() << " Using live-in flags for BB:\n";
+ TestMBB->dump();
+ }
+ });
- // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer
- // jumps because their usage is very constrained.
- bool FlagsKilled = false;
+ // While rewriting uses, we buffer jumps and rewrite them in a second pass
+ // because doing so will perturb the CFG that we are walking to find the
+ // uses in the first place.
SmallVector<MachineInstr *, 4> JmpIs;
// Gather the condition flags that have already been preserved in
@@ -422,7 +516,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// very few of them and we expect to not revisit the same copy definition
// many times. If either of those change sufficiently we could build a map
// of these up front instead.
- CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
+ CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
// Collect the basic blocks we need to scan. Typically this will just be
// a single basic block but we may have to scan multiple blocks if the
@@ -430,41 +524,39 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineBasicBlock *, 2> Blocks;
SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
Blocks.push_back(&MBB);
- VisitedBlocks.insert(&MBB);
do {
MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
- // We currently don't do any PHI insertion and so we require that the
- // test basic block dominates all of the use basic blocks.
- //
- // We could in theory do PHI insertion here if it becomes useful by just
- // taking undef values in along every edge that we don't trace this
- // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
- // but still seems like a great deal of complexity.
- //
- // Because it is theoretically possible that some earlier MI pass or
- // other lowering transformation could induce this to happen, we do
- // a hard check even in non-debug builds here.
- if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
- DEBUG({
- dbgs() << "ERROR: Encountered use that is not dominated by our test "
- "basic block! Rewriting this would require inserting PHI "
- "nodes to track the flag state across the CFG.\n\nTest "
- "block:\n";
- TestMBB.dump();
- dbgs() << "Use block:\n";
- UseMBB.dump();
- });
- report_fatal_error("Cannot lower EFLAGS copy when original copy def "
- "does not dominate all uses.");
- }
-
- for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
- : UseMBB.instr_begin(),
+ // Track if/when we find a kill of the flags in this block.
+ bool FlagsKilled = false;
+
+ // In most cases, we walk from the beginning to the end of the block. But
+ // when the block is the same block as the copy is from, we will visit it
+ // twice. The first time we start from the copy and go to the end. The
+ // second time we start from the beginning and go to the copy. This lets
+ // us handle copies inside of cycles.
+ // FIXME: This loop is *super* confusing. This is at least in part
+ // a symptom of all of this routine needing to be refactored into
+ // documentable components. Once done, there may be a better way to write
+ // this loop.
+ for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+ ? std::next(CopyI->getIterator())
+ : UseMBB.instr_begin(),
MIE = UseMBB.instr_end();
MII != MIE;) {
MachineInstr &MI = *MII++;
+ // If we are in the original copy block and encounter either the copy
+ // def or the copy itself, break so that we don't re-process any part of
+ // the block or process the instructions in the range that was copied
+ // over.
+ if (&MI == CopyI || &MI == &CopyDefI) {
+ assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+ "Should only encounter these on the second pass over the "
+ "original block.");
+ break;
+ }
+
MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
if (!FlagUse) {
if (MI.findRegisterDefOperand(X86::EFLAGS)) {
@@ -481,7 +573,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- DEBUG(dbgs() << " Rewriting use: "; MI.dump());
+ LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump());
// Check the kill flag before we rewrite as that may change it.
if (FlagUse->isKill())
@@ -508,10 +600,10 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// Otherwise we can just rewrite in-place.
if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
- rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
X86::COND_INVALID) {
- rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (MI.getOpcode() == TargetOpcode::COPY) {
rewriteCopy(MI, *FlagUse, CopyDefI);
} else {
@@ -534,13 +626,13 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
case X86::SETB_C64r:
// Use custom lowering for arithmetic that is merely extending the
// carry flag. We model this as the SETB_C* pseudo instructions.
- rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
CondRegs);
break;
default:
// Generically handle remaining uses as arithmetic instructions.
- rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
CondRegs);
break;
}
@@ -554,14 +646,44 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// If the flags were killed, we're done with this block.
if (FlagsKilled)
- break;
+ continue;
// Otherwise we need to scan successors for ones where the flags live-in
// and queue those up for processing.
for (MachineBasicBlock *SuccMBB : UseMBB.successors())
if (SuccMBB->isLiveIn(X86::EFLAGS) &&
- VisitedBlocks.insert(SuccMBB).second)
+ VisitedBlocks.insert(SuccMBB).second) {
+ // We currently don't do any PHI insertion and so we require that the
+ // test basic block dominates all of the use basic blocks. Further, we
+ // can't have a cycle from the test block back to itself as that would
+ // create a cycle requiring a PHI to break it.
+ //
+ // We could in theory do PHI insertion here if it becomes useful by
+ // just taking undef values in along every edge that we don't trace
+ // this EFLAGS copy along. This isn't as bad as fully general PHI
+ // insertion, but still seems like a great deal of complexity.
+ //
+ // Because it is theoretically possible that some earlier MI pass or
+ // other lowering transformation could induce this to happen, we do
+ // a hard check even in non-debug builds here.
+ if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+ LLVM_DEBUG({
+ dbgs()
+ << "ERROR: Encountered use that is not dominated by our test "
+ "basic block! Rewriting this would require inserting PHI "
+ "nodes to track the flag state across the CFG.\n\nTest "
+ "block:\n";
+ TestMBB->dump();
+ dbgs() << "Use block:\n";
+ SuccMBB->dump();
+ });
+ report_fatal_error(
+ "Cannot lower EFLAGS copy when original copy def "
+ "does not dominate all uses.");
+ }
+
Blocks.push_back(SuccMBB);
+ }
} while (!Blocks.empty());
// Now rewrite the jumps that use the flags. These we handle specially
@@ -576,7 +698,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
else
LastJmpMBB = JmpI->getParent();
- rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+ rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
}
// FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -589,7 +711,8 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
if (MI.getOpcode() == TargetOpcode::COPY &&
(MI.getOperand(0).getReg() == X86::EFLAGS ||
MI.getOperand(1).getReg() == X86::EFLAGS)) {
- DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; MI.dump());
+ LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
+ MI.dump());
llvm_unreachable("Unlowered EFLAGS copy!");
}
#endif
@@ -599,14 +722,13 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
/// Collect any conditions that have already been set in registers so that we
/// can re-use them rather than adding duplicates.
-CondRegArray
-X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
- MachineInstr &CopyDefI) {
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
CondRegArray CondRegs = {};
// Scan backwards across the range of instructions with live EFLAGS.
- for (MachineInstr &MI : llvm::reverse(
- llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+ for (MachineInstr &MI :
+ llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
TRI->isVirtualRegister(MI.getOperand(0).getReg()))
@@ -627,7 +749,7 @@ unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
TII->get(X86::getSETFromCond(Cond)), Reg);
(void)SetI;
- DEBUG(dbgs() << " save cond: "; SetI->dump());
+ LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
++NumSetCCsInserted;
return Reg;
}
@@ -649,15 +771,10 @@ std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Pos,
DebugLoc Loc, unsigned Reg) {
- // We emit test instructions as register/immediate test against -1. This
- // allows register allocation to fold a memory operand if needed (that will
- // happen often due to the places this code is emitted). But hopefully will
- // also allow us to select a shorter encoding of `testb %reg, %reg` when that
- // would be equivalent.
auto TestI =
BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
(void)TestI;
- DEBUG(dbgs() << " test cond: "; TestI->dump());
+ LLVM_DEBUG(dbgs() << " test cond: "; TestI->dump());
++NumTestsInserted;
}
@@ -709,7 +826,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
.addReg(CondReg)
.addImm(Addend);
(void)AddI;
- DEBUG(dbgs() << " add cond: "; AddI->dump());
+ LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump());
++NumAddsInserted;
FlagUse.setIsKill(true);
}
@@ -739,7 +856,7 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
!CMovI.memoperands_empty())));
FlagUse.setIsKill(true);
- DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
+ LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
}
void X86FlagsCopyLoweringPass::rewriteCondJmp(
@@ -763,13 +880,13 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp(
X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
const int ImplicitEFLAGSOpIdx = 1;
JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
- DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
+ LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
}
void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
MachineOperand &FlagUse,
MachineInstr &CopyDefI) {
- // Just replace this copy with the the original copy def.
+ // Just replace this copy with the original copy def.
MRI->replaceRegWith(MI.getOperand(0).getReg(),
CopyDefI.getOperand(0).getReg());
MI.eraseFromParent();
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
index 9a72e7114be0..ae748901164a 100644
--- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -39,6 +39,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -434,7 +435,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
PrevMI = &*std::prev(I);
++NumFP; // Keep track of # of pseudo instrs
- DEBUG(dbgs() << "\nFPInst:\t" << MI);
+ LLVM_DEBUG(dbgs() << "\nFPInst:\t" << MI);
// Get dead variables list now because the MI pointer may be deleted as part
// of processing!
@@ -464,13 +465,13 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
// is in the clobber list and marked dead might not be live on the stack.
static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
- DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+ LLVM_DEBUG(dbgs() << "Register FP#" << Reg - X86::FP0 << " is dead!\n");
freeStackSlotAfter(I, Reg-X86::FP0);
}
}
// Print out all of the instructions expanded to if -debug
- DEBUG({
+ LLVM_DEBUG({
MachineBasicBlock::iterator PrevI = PrevMI;
if (I == PrevI) {
dbgs() << "Just deleted pseudo instruction\n";
@@ -499,15 +500,15 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
/// setupBlockStack - Use the live bundles to set up our model of the stack
/// to match predecessors' live out stack.
void FPS::setupBlockStack() {
- DEBUG(dbgs() << "\nSetting up live-ins for " << printMBBReference(*MBB)
- << " derived from " << MBB->getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "\nSetting up live-ins for " << printMBBReference(*MBB)
+ << " derived from " << MBB->getName() << ".\n");
StackTop = 0;
// Get the live-in bundle for MBB.
const LiveBundle &Bundle =
LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
if (!Bundle.Mask) {
- DEBUG(dbgs() << "Block has no FP live-ins.\n");
+ LLVM_DEBUG(dbgs() << "Block has no FP live-ins.\n");
return;
}
@@ -516,8 +517,8 @@ void FPS::setupBlockStack() {
// Push the fixed live-in registers.
for (unsigned i = Bundle.FixCount; i > 0; --i) {
- DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %fp"
- << unsigned(Bundle.FixStack[i-1]) << '\n');
+ LLVM_DEBUG(dbgs() << "Live-in st(" << (i - 1) << "): %fp"
+ << unsigned(Bundle.FixStack[i - 1]) << '\n');
pushReg(Bundle.FixStack[i-1]);
}
@@ -526,7 +527,7 @@ void FPS::setupBlockStack() {
// to be revived at the end of a short block. It might save a few instrs.
unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true);
adjustLiveRegs(Mask, MBB->begin());
- DEBUG(MBB->dump());
+ LLVM_DEBUG(MBB->dump());
}
/// finishBlockStack - Revive live-outs that are implicitly defined out of
@@ -538,8 +539,8 @@ void FPS::finishBlockStack() {
if (MBB->succ_empty())
return;
- DEBUG(dbgs() << "Setting up live-outs for " << printMBBReference(*MBB)
- << " derived from " << MBB->getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "Setting up live-outs for " << printMBBReference(*MBB)
+ << " derived from " << MBB->getName() << ".\n");
// Get MBB's live-out bundle.
unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
@@ -551,18 +552,18 @@ void FPS::finishBlockStack() {
adjustLiveRegs(Bundle.Mask, Term);
if (!Bundle.Mask) {
- DEBUG(dbgs() << "No live-outs.\n");
+ LLVM_DEBUG(dbgs() << "No live-outs.\n");
return;
}
// Has the stack order been fixed yet?
- DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+ LLVM_DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
if (Bundle.isFixed()) {
- DEBUG(dbgs() << "Shuffling stack to match.\n");
+ LLVM_DEBUG(dbgs() << "Shuffling stack to match.\n");
shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
} else {
// Not fixed yet, we get to choose.
- DEBUG(dbgs() << "Fixing stack order now.\n");
+ LLVM_DEBUG(dbgs() << "Fixing stack order now.\n");
Bundle.FixCount = StackTop;
for (unsigned i = 0; i < StackTop; ++i)
Bundle.FixStack[i] = getStackEntry(i);
@@ -599,13 +600,14 @@ static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
#ifdef NDEBUG
#define ASSERT_SORTED(TABLE)
#else
-#define ASSERT_SORTED(TABLE) \
- { static bool TABLE##Checked = false; \
- if (!TABLE##Checked) { \
- assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
- "All lookup tables must be sorted for efficient access!"); \
- TABLE##Checked = true; \
- } \
+#define ASSERT_SORTED(TABLE) \
+ { \
+ static std::atomic<bool> TABLE##Checked(false); \
+ if (!TABLE##Checked.load(std::memory_order_relaxed)) { \
+ assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked.store(true, std::memory_order_relaxed); \
+ } \
}
#endif
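A minimal standalone sketch of the one-shot sortedness check above: the plain static bool becomes a std::atomic<bool> with relaxed loads and stores so concurrent compilations can race on the flag without a data race; the worst case is that more than one thread runs the debug-only assert. The table contents are just an example.

#include <algorithm>
#include <atomic>
#include <cassert>
#include <iterator>

static const int Table[] = {1, 4, 9, 16};

void checkTableOnce() {
  static std::atomic<bool> Checked(false);
  if (!Checked.load(std::memory_order_relaxed)) {
    assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
           "lookup table must be sorted");
    Checked.store(true, std::memory_order_relaxed);
  }
}

int main() { checkTableOnce(); }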
@@ -893,7 +895,8 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
while (Kills && Defs) {
unsigned KReg = countTrailingZeros(Kills);
unsigned DReg = countTrailingZeros(Defs);
- DEBUG(dbgs() << "Renaming %fp" << KReg << " as imp %fp" << DReg << "\n");
+ LLVM_DEBUG(dbgs() << "Renaming %fp" << KReg << " as imp %fp" << DReg
+ << "\n");
std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
std::swap(RegMap[KReg], RegMap[DReg]);
Kills &= ~(1 << KReg);
@@ -907,7 +910,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
unsigned KReg = getStackEntry(0);
if (!(Kills & (1 << KReg)))
break;
- DEBUG(dbgs() << "Popping %fp" << KReg << "\n");
+ LLVM_DEBUG(dbgs() << "Popping %fp" << KReg << "\n");
popStackAfter(I2);
Kills &= ~(1 << KReg);
}
@@ -916,7 +919,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Manually kill the rest.
while (Kills) {
unsigned KReg = countTrailingZeros(Kills);
- DEBUG(dbgs() << "Killing %fp" << KReg << "\n");
+ LLVM_DEBUG(dbgs() << "Killing %fp" << KReg << "\n");
freeStackSlotBefore(I, KReg);
Kills &= ~(1 << KReg);
}
@@ -924,14 +927,14 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Load zeros for all the imp-defs.
while(Defs) {
unsigned DReg = countTrailingZeros(Defs);
- DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n");
+ LLVM_DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n");
BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
pushReg(DReg);
Defs &= ~(1 << DReg);
}
// Now we should have the correct registers live.
- DEBUG(dumpStack());
+ LLVM_DEBUG(dumpStack());
assert(StackTop == countPopulation(Mask) && "Live count mismatch");
}
@@ -954,7 +957,7 @@ void FPS::shuffleStackTop(const unsigned char *FixStack,
if (FixCount > 0)
moveToTop(OldReg, I);
}
- DEBUG(dumpStack());
+ LLVM_DEBUG(dumpStack());
}
@@ -1466,7 +1469,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
case TargetOpcode::IMPLICIT_DEF: {
// All FP registers must be explicitly defined, so load a 0 instead.
unsigned Reg = MI.getOperand(0).getReg() - X86::FP0;
- DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+ LLVM_DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
BuildMI(*MBB, Inst, MI.getDebugLoc(), TII->get(X86::LD_F0));
pushReg(Reg);
break;
@@ -1571,8 +1574,9 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
MI.emitError("implicitly popped regs must be last on the x87 stack");
unsigned NumSTPopped = countTrailingOnes(STPopped);
- DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
- << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+ LLVM_DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+ << NumSTPopped << ", and defines " << NumSTDefs
+ << " regs.\n");
#ifndef NDEBUG
// If any input operand uses constraint "f", all output register
@@ -1610,7 +1614,10 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
STUsesArray[I] = I;
shuffleStackTop(STUsesArray, NumSTUses, Inst);
- DEBUG({dbgs() << "Before asm: "; dumpStack();});
+ LLVM_DEBUG({
+ dbgs() << "Before asm: ";
+ dumpStack();
+ });
// With the stack layout fixed, rewrite the FP registers.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
@@ -1658,7 +1665,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
// We want to leave I pointing to the previous instruction, but what if we
// just erased the first instruction?
if (Inst == MBB->begin()) {
- DEBUG(dbgs() << "Inserting dummy KILL\n");
+ LLVM_DEBUG(dbgs() << "Inserting dummy KILL\n");
Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
} else
--Inst;
@@ -1673,7 +1680,7 @@ void FPS::setKillFlags(MachineBasicBlock &MBB) const {
for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
I != E; ++I) {
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
std::bitset<8> Defs;
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
index 11808f8995fe..a257ec41f75b 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -248,6 +248,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
/// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &DL,
int64_t NumBytes, bool InEpilogue) const {
bool isSub = NumBytes < 0;
uint64_t Offset = isSub ? -NumBytes : NumBytes;
@@ -255,7 +256,6 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
uint64_t Chunk = (1LL << 31) - 1;
- DebugLoc DL = MBB.findDebugLoc(MBBI);
if (Offset > Chunk) {
// Rather than emit a long series of instructions for large offsets,
@@ -399,28 +399,30 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
return 0;
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
- MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
- : std::next(MBBI);
+
PI = skipDebugInstructionsBackward(PI, MBB.begin());
- if (NI != nullptr)
- NI = skipDebugInstructionsForward(NI, MBB.end());
+ // It is assumed that the ADD/SUB/LEA instruction is succeeded by one CFI
+ // instruction, and that there are no DBG_VALUE or other instructions between
+ // ADD/SUB/LEA and its corresponding CFI instruction.
+ /* TODO: Add support for the case where there are multiple CFI instructions
+ below the ADD/SUB/LEA, e.g.:
+ ...
+ add
+ cfi_def_cfa_offset
+ cfi_offset
+ ...
+ */
+ if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
+ PI = std::prev(PI);
unsigned Opc = PI->getOpcode();
int Offset = 0;
- if (!doMergeWithPrevious && NI != MBB.end() &&
- NI->getOpcode() == TargetOpcode::CFI_INSTRUCTION) {
- // Don't merge with the next instruction if it has CFI.
- return Offset;
- }
-
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
PI->getOperand(0).getReg() == StackPtr){
assert(PI->getOperand(1).getReg() == StackPtr);
- Offset += PI->getOperand(2).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
+ Offset = PI->getOperand(2).getImm();
} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
PI->getOperand(0).getReg() == StackPtr &&
PI->getOperand(1).getReg() == StackPtr &&
@@ -428,17 +430,19 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
PI->getOperand(3).getReg() == X86::NoRegister &&
PI->getOperand(5).getReg() == X86::NoRegister) {
// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
- Offset += PI->getOperand(4).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
+ Offset = PI->getOperand(4).getImm();
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
- Offset -= PI->getOperand(2).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
- }
+ Offset = -PI->getOperand(2).getImm();
+ } else
+ return 0;
+
+ PI = MBB.erase(PI);
+ if (PI != MBB.end() && PI->isCFIInstruction()) PI = MBB.erase(PI);
+ if (!doMergeWithPrevious)
+ MBBI = skipDebugInstructionsForward(PI, MBB.end());
return Offset;
}
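The rewritten mergeSPUpdates above extracts the pending adjustment from a preceding
ADD/SUB/LEA on the stack pointer and then erases that instruction together with the
single CFI instruction assumed to follow it. A toy model of just the offset-extraction
rule (opcode names here are invented for illustration and are not the real
MachineInstr API):

    #include <cstdint>
    #include <cstdio>
    #include <optional>

    // An ADD/SUB/LEA of the stack pointer contributes +Imm/-Imm to the pending
    // SP adjustment; anything else cannot be merged.
    enum class Opc { AddSP, SubSP, LeaFromSP, Other };

    std::optional<int64_t> mergeableOffset(Opc Op, int64_t Imm) {
      switch (Op) {
      case Opc::AddSP:     return Imm;   // add sp, Imm
      case Opc::SubSP:     return -Imm;  // sub sp, Imm
      case Opc::LeaFromSP: return Imm;   // lea sp, [sp + Imm]
      case Opc::Other:     return std::nullopt;
      }
      return std::nullopt;
    }

    int main() {
      if (auto Off = mergeableOffset(Opc::SubSP, 40))
        std::printf("fold %lld into the next SP update\n", (long long)*Off);
    }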
@@ -998,7 +1002,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
Fn.arg_size() == 2) {
StackSize += 8;
MFI.setStackSize(StackSize);
- emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false);
+ emitSPUpdate(MBB, MBBI, DL, -8, /*InEpilogue=*/false);
}
// If this is x86-64 and the Red Zone is not disabled, if we are a leaf
@@ -1213,30 +1217,34 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
bool isEAXAlive = isEAXLiveIn(MBB);
if (isEAXAlive) {
- // Sanity check that EAX is not livein for this function.
- // It should not be, so throw an assert.
- assert(!Is64Bit && "EAX is livein in x64 case!");
-
- // Save EAX
- BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
- .addReg(X86::EAX, RegState::Kill)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (Is64Bit) {
+ // Save RAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+ .addReg(X86::RAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
if (Is64Bit) {
// Handle the 64-bit Windows ABI case where we need to call __chkstk.
// Function prologue is responsible for adjusting the stack pointer.
- if (isUInt<32>(NumBytes)) {
+ int Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
+ if (isUInt<32>(Alloc)) {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
- .addImm(NumBytes)
+ .addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
- } else if (isInt<32>(NumBytes)) {
+ } else if (isInt<32>(Alloc)) {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
- .addImm(NumBytes)
+ .addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
} else {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
- .addImm(NumBytes)
+ .addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
}
} else {
@@ -1251,15 +1259,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
emitStackProbe(MF, MBB, MBBI, DL, true);
if (isEAXAlive) {
- // Restore EAX
- MachineInstr *MI =
- addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
- StackPtr, false, NumBytes - 4);
+ // Restore RAX/EAX
+ MachineInstr *MI;
+ if (Is64Bit)
+ MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),
+ StackPtr, false, NumBytes - 8);
+ else
+ MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+ StackPtr, false, NumBytes - 4);
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
} else if (NumBytes) {
- emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+ emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);
}
if (NeedsWinCFI && NumBytes) {
@@ -1565,6 +1577,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
+ bool NeedsDwarfCFI =
+ (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+ !MF.getTarget().getTargetTriple().isOSWindows()) &&
+ (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry());
+
if (IsFunclet) {
assert(HasFP && "EH funclets without FP not yet implemented");
NumBytes = getWinEHFuncletFrameSize(MF);
@@ -1587,6 +1604,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
+ if (NeedsDwarfCFI) {
+ unsigned DwarfStackPtr =
+ TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
+ nullptr, DwarfStackPtr, -SlotSize));
+ --MBBI;
+ }
}
MachineBasicBlock::iterator FirstCSPop = MBBI;
@@ -1649,7 +1673,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
- emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+ emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ // Define the current CFA rule to use the provided offset.
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+ nullptr, -CSSize - SlotSize));
+ }
--MBBI;
}
@@ -1662,6 +1691,23 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsWin64CFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ MBBI = FirstCSPop;
+ int64_t Offset = -CSSize - SlotSize;
+ // Mark callee-saved pop instruction.
+ // Define the current CFA rule to use the provided offset.
+ while (MBBI != MBB.end()) {
+ MachineBasicBlock::iterator PI = MBBI;
+ unsigned Opc = PI->getOpcode();
+ ++MBBI;
+ if (Opc == X86::POP32r || Opc == X86::POP64r) {
+ Offset += SlotSize;
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+ }
+ }
+ }
+
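The pop-walking loop above keeps the DWARF CFA in sync while callee-saved registers
are restored: the offset starts at -CSSize - SlotSize and moves toward zero by
SlotSize per POP, with one def_cfa_offset emitted after each. A standalone check of
that bookkeeping (CSSize/SlotSize values invented for illustration; this assumes
createDefCfaOffset's convention that the argument is the negated directive value):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t SlotSize = 8;  // x86-64 GPR slot
      const int64_t CSSize = 24;   // e.g. three callee-saved GPRs spilled
      int64_t Offset = -CSSize - SlotSize;
      for (int Pop = 0; Pop < 3; ++Pop) {   // one directive per POP
        Offset += SlotSize;
        std::printf(".cfi_def_cfa_offset %lld\n", (long long)-Offset);
      }
      // Prints 24, 16, 8: after the last pop only the return address remains,
      // so CFA = RSP + 8.
    }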
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -1669,7 +1715,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += mergeSPUpdates(MBB, Terminator, true);
- emitSPUpdate(MBB, Terminator, Offset, /*InEpilogue=*/true);
+ emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
}
@@ -1860,6 +1906,32 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
unsigned CalleeSavedFrameSize = 0;
int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
+ int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MFI.CreateFixedObject(-TailCallReturnAddrDelta,
+ TailCallReturnAddrDelta - SlotSize, true);
+ }
+
+ // Spill the BasePtr if it's used.
+ if (this->TRI->hasBasePointer(MF)) {
+ // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
+ if (MF.hasEHFunclets()) {
+ int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
+ X86FI->setHasSEHFramePtrSave(true);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ }
+ }
+
if (hasFP(MF)) {
// emitPrologue always spills frame register the first thing.
SpillSlotOffset -= SlotSize;
@@ -1899,7 +1971,12 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ // If this is k-register make sure we lookup via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
unsigned Size = TRI->getSpillSize(*RC);
unsigned Align = TRI->getSpillAlignment(*RC);
// ensure alignment
@@ -1966,9 +2043,15 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
unsigned Reg = CSI[i-1].getReg();
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
+
+ // If this is k-register make sure we lookup via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
TRI);
@@ -2042,7 +2125,12 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
X86::GR32RegClass.contains(Reg))
continue;
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ // If this is k-register make sure we lookup via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
}
@@ -2065,35 +2153,12 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- MachineFrameInfo &MFI = MF.getFrameInfo();
-
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
-
- if (TailCallReturnAddrDelta < 0) {
- // create RETURNADDR area
- // arg
- // arg
- // RETADDR
- // { ...
- // RETADDR area
- // ...
- // }
- // [EBP]
- MFI.CreateFixedObject(-TailCallReturnAddrDelta,
- TailCallReturnAddrDelta - SlotSize, true);
- }
-
// Spill the BasePtr if it's used.
- if (TRI->hasBasePointer(MF)) {
- SavedRegs.set(TRI->getBaseRegister());
-
- // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
- if (MF.hasEHFunclets()) {
- int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
- X86FI->setHasSEHFramePtrSave(true);
- X86FI->setSEHFramePtrSaveIndex(FI);
- }
+ if (TRI->hasBasePointer(MF)) {
+ unsigned BasePtr = TRI->getBaseRegister();
+ if (STI.isTarget64BitILP32())
+ BasePtr = getX86SubSuperRegister(BasePtr, 64);
+ SavedRegs.set(BasePtr);
}
}
@@ -2176,8 +2241,10 @@ void X86FrameLowering::adjustForSegmentedStacks(
// prologue.
StackSize = MFI.getStackSize();
- // Do not generate a prologue for functions with a stack of size zero
- if (StackSize == 0)
+ // Do not generate a prologue for leaf functions with a stack of size zero.
+ // For non-leaf functions we have to allow for the possibility that the
+ // call is to a non-split function, as in PR37807.
+ if (StackSize == 0 && !MFI.hasTailCall())
return;
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
@@ -2692,7 +2759,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// Add Amount to SP to destroy a frame, or subtract to setup.
int64_t StackAdjustment = isDestroy ? Amount : -Amount;
- int64_t CfaAdjustment = -StackAdjustment;
if (StackAdjustment) {
// Merge with any previous or following adjustment instruction. Note: the
@@ -2717,6 +2783,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// offset to be correct at each call site, while for debugging we want
// it to be more precise.
+ int64_t CfaAdjustment = -StackAdjustment;
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
if (CfaAdjustment) {
@@ -2847,6 +2914,15 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
return MBBI;
}
+int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ return TRI->getSlotSize();
+}
+
+unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
+ const {
+ return TRI->getDwarfRegNum(StackPtr, true);
+}
+
namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
@@ -2951,7 +3027,7 @@ void X86FrameLowering::orderFrameObjects(
// Count the number of uses for each object.
for (auto &MBB : MF) {
for (auto &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
for (const MachineOperand &MO : MI.operands()) {
// Check to see if it's a local stack symbol.
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
index 909319fc18fc..3bd805aae123 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -125,7 +125,7 @@ public:
/// Emit a series of instructions to increment / decrement the stack
/// pointer by a constant value.
void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- int64_t NumBytes, bool InEpilogue) const;
+ const DebugLoc &DL, int64_t NumBytes, bool InEpilogue) const;
/// Check that LEA can be used on SP in an epilogue sequence for \p MF.
bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
@@ -168,6 +168,10 @@ public:
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool RestoreSP = false) const;
+ int getInitialCFAOffset(const MachineFunction &MF) const override;
+
+ unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d79fd0ca4daa..a28d4eac8393 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -100,11 +101,11 @@ namespace {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump() {
+ void dump(SelectionDAG *DAG = nullptr) {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
if (Base_Reg.getNode())
- Base_Reg.getNode()->dump();
+ Base_Reg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
if (BaseType == FrameIndexBase)
@@ -112,7 +113,7 @@ namespace {
dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
if (IndexReg.getNode())
- IndexReg.getNode()->dump();
+ IndexReg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
dbgs() << " Disp " << Disp << '\n'
@@ -181,6 +182,7 @@ namespace {
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
void PreprocessISelDAG() override;
+ void PostprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
@@ -213,7 +215,7 @@ namespace {
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool selectScalarSSELoad(SDNode *Root, SDValue N,
+ bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment,
@@ -225,7 +227,7 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment);
- // Convience method where P is also root.
+ // Convenience method where P is also root.
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
@@ -233,6 +235,12 @@ namespace {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
+ // Try to fold a vector load. This makes sure the load isn't non-temporal.
+ bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -368,6 +376,11 @@ namespace {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
+ /// Return a target constant with the specified value, of type i64.
+ inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
+ }
+
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
const SDLoc &DL) {
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
@@ -401,7 +414,7 @@ namespace {
return Subtarget->getInstrInfo();
}
- /// \brief Address-mode matching performs shift-of-and to and-of-shift
+ /// Address-mode matching performs shift-of-and to and-of-shift
/// reassociation in order to expose more scaled addressing
/// opportunities.
bool ComplexPatternFuncMutatesDAG() const override {
@@ -440,10 +453,15 @@ namespace {
}
bool foldLoadStoreIntoMemOperand(SDNode *Node);
-
bool matchBEXTRFromAnd(SDNode *Node);
-
+ bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
+
+ MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node);
+ MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node,
+ SDValue &InFlag);
};
}
@@ -452,19 +470,21 @@ namespace {
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
- if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
- Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
- Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU ||
- Opcode == X86ISD::CMPM_RND) {
+ if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
+ Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
EVT OpVT = N->getOperand(0).getValueType();
- if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
+ if (OpVT.is256BitVector() || OpVT.is128BitVector())
return Subtarget->hasVLX();
return true;
}
+ // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
+ if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
+ Opcode == X86ISD::FSETCCM_RND)
+ return true;
return false;
}
@@ -518,10 +538,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
// addl 4(%esp), %eax
// The former is 2 bytes shorter. In case where the increment is 1, then
// the saving can be 4 bytes (by using incl %eax).
- if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
if (Imm->getAPIntValue().isSignedIntN(8))
return false;
+ // If this is a 64-bit AND with an immediate that fits in 32-bits,
+ // prefer using the smaller and over folding the load. This is needed to
+ // make sure immediates created by shrinkAndImmediate are always folded.
+ // Ideally we would narrow the load during DAG combine and get the
+ // best of both worlds.
+ if (U->getOpcode() == ISD::AND &&
+ Imm->getAPIntValue().getBitWidth() == 64 &&
+ Imm->getAPIntValue().isIntN(32))
+ return false;
+ }
+
// If the other operand is a TLS address, we should fold it instead.
// This produces
// movl %gs:0, %eax
@@ -537,10 +568,60 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
}
+
+ // Don't fold load if this matches the BTS/BTR/BTC patterns.
+ // BTS: (or X, (shl 1, n))
+ // BTR: (and X, (rotl -2, n))
+ // BTC: (xor X, (shl 1, n))
+ if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
+ if (U->getOperand(0).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(0).getOperand(0)))
+ return false;
+
+ if (U->getOperand(1).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(1).getOperand(0)))
+ return false;
+ }
+ if (U->getOpcode() == ISD::AND) {
+ SDValue U0 = U->getOperand(0);
+ SDValue U1 = U->getOperand(1);
+ if (U0.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+
+ if (U1.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+ }
+
+ break;
}
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Don't fold a load into a shift by immediate. The BMI2 instructions
+ // support folding a load, but not an immediate. The legacy instructions
+ // support folding an immediate, but can't fold a load. Folding an
+ // immediate is preferable to folding a load.
+ if (isa<ConstantSDNode>(U->getOperand(1)))
+ return false;
+
+ break;
}
}
+ // Prevent folding a load if this can be implemented with an insert_subreg or
+ // a move that implicitly zeroes.
+ if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isNullConstant(Root->getOperand(2)) &&
+ (Root->getOperand(0).isUndef() ||
+ ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
+ return false;
+
return true;
}
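The BTS/BTR/BTC patterns this check guards against are the usual single-bit
identities: (or X, (shl 1, n)) sets bit n, (and X, (rotl -2, n)) clears it (rotating
the all-ones-except-bit-0 mask places the zero at bit n), and (xor X, (shl 1, n))
toggles it. A standalone illustration on plain 32-bit integers rather than SDNodes:

    #include <cassert>
    #include <cstdint>

    static uint32_t rotl32(uint32_t V, unsigned N) {
      N &= 31;
      return (V << N) | (V >> ((32 - N) & 31));
    }

    int main() {
      uint32_t X = 0b1010;
      unsigned n = 2;
      uint32_t bts = X | (1u << n);                // set bit n    -> BTS
      uint32_t btr = X & rotl32(~1u /* -2 */, n);  // clear bit n  -> BTR
      uint32_t btc = X ^ (1u << n);                // toggle bit n -> BTC
      assert(bts == 0b1110 && btr == 0b1010 && btc == 0b1110);
      (void)bts; (void)btr; (void)btc;
    }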
@@ -628,6 +709,18 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+ // If this is a target specific AND node with no flag usages, turn it back
+ // into ISD::AND to enable test instruction matching.
+ if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
+ SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target can fold the load into the call or
// jmp.
@@ -735,6 +828,70 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}
+void X86DAGToDAGISel::PostprocessISelDAG() {
+ // Skip peepholes at -O0.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+ // Attempt to remove vector moves that were inserted to zero upper bits.
+
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
+ continue;
+
+ unsigned SubRegIdx = N->getConstantOperandVal(2);
+ if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
+ continue;
+
+ SDValue Move = N->getOperand(1);
+ if (!Move.isMachineOpcode())
+ continue;
+
+ // Make sure it's one of the move opcodes we recognize.
+ switch (Move.getMachineOpcode()) {
+ default:
+ continue;
+ case X86::VMOVAPDrr: case X86::VMOVUPDrr:
+ case X86::VMOVAPSrr: case X86::VMOVUPSrr:
+ case X86::VMOVDQArr: case X86::VMOVDQUrr:
+ case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
+ case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
+ case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
+ case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
+ case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
+ case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
+ case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
+ case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
+ case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
+ case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
+ case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
+ break;
+ }
+
+ SDValue In = Move.getOperand(0);
+ if (!In.isMachineOpcode() ||
+ In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
+ continue;
+
+ // The producing instruction is another vector instruction, so we can drop
+ // the move.
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
+
+ // If the move is now dead, delete it.
+ if (Move.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(Move.getNode());
+ }
+}
+
+
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
@@ -771,9 +928,14 @@ static bool isDispSafeForFrameIndex(int64_t Val) {
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
+ // If there's no offset to fold, we don't need to do any work.
+ if (Offset == 0)
+ return false;
+
// Cannot combine ExternalSymbol displacements with integer offsets.
- if (Offset != 0 && (AM.ES || AM.MCSym))
+ if (AM.ES || AM.MCSym)
return true;
+
int64_t Val = AM.Disp + Offset;
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit()) {
@@ -827,94 +989,60 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
if (AM.hasSymbolicDisplacement())
return true;
- SDValue N0 = N.getOperand(0);
+ bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+
+ // We can't use an addressing mode in the 64-bit large code model. In the
+ // medium code model, we can use such a mode when RIP wrappers are present.
+ // That signifies access to globals that are known to be "near", such as the
+ // GOT itself.
CodeModel::Model M = TM.getCodeModel();
+ if (Subtarget->is64Bit() &&
+ (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
+ return true;
- // Handle X86-64 rip-relative addresses. We check this before checking direct
- // folding because RIP is preferable to non-RIP accesses.
- if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
- // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
- // they cannot be folded into immediate fields.
- // FIXME: This can be improved for kernel and other models?
- (M == CodeModel::Small || M == CodeModel::Kernel)) {
- // Base and index reg must be 0 in order to use %rip as base.
- if (AM.hasBaseOrIndexReg())
- return true;
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.GV = G->getGlobal();
- AM.SymbolFlags = G->getTargetFlags();
- if (foldOffsetIntoAddress(G->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
- AM.SymbolFlags = CP->getTargetFlags();
- if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
- AM.ES = S->getSymbol();
- AM.SymbolFlags = S->getTargetFlags();
- } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
- AM.MCSym = S->getMCSymbol();
- } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
- AM.JT = J->getIndex();
- AM.SymbolFlags = J->getTargetFlags();
- } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.BlockAddr = BA->getBlockAddress();
- AM.SymbolFlags = BA->getTargetFlags();
- if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else
- llvm_unreachable("Unhandled symbol reference node.");
+ // Base and index reg must be 0 in order to use %rip as base.
+ if (IsRIPRel && AM.hasBaseOrIndexReg())
+ return true;
- if (N.getOpcode() == X86ISD::WrapperRIP)
- AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
- return false;
- }
+ // Make a local copy in case we can't do this fold.
+ X86ISelAddressMode Backup = AM;
- // Handle the case when globals fit in our immediate field: This is true for
- // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
- // mode, this only applies to a non-RIP-relative computation.
- if (!Subtarget->is64Bit() ||
- M == CodeModel::Small || M == CodeModel::Kernel) {
- assert(N.getOpcode() != X86ISD::WrapperRIP &&
- "RIP-relative addressing already handled");
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
- AM.GV = G->getGlobal();
- AM.Disp += G->getOffset();
- AM.SymbolFlags = G->getTargetFlags();
- } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
- AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
- AM.Disp += CP->getOffset();
- AM.SymbolFlags = CP->getTargetFlags();
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
- AM.ES = S->getSymbol();
- AM.SymbolFlags = S->getTargetFlags();
- } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
- AM.MCSym = S->getMCSymbol();
- } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
- AM.JT = J->getIndex();
- AM.SymbolFlags = J->getTargetFlags();
- } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
- AM.BlockAddr = BA->getBlockAddress();
- AM.Disp += BA->getOffset();
- AM.SymbolFlags = BA->getTargetFlags();
- } else
- llvm_unreachable("Unhandled symbol reference node.");
- return false;
+ int64_t Offset = 0;
+ SDValue N0 = N.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ Offset = G->getOffset();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.SymbolFlags = CP->getTargetFlags();
+ Offset = CP->getOffset();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ Offset = BA->getOffset();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ if (foldOffsetIntoAddress(Offset, AM)) {
+ AM = Backup;
+ return true;
}
- return true;
+ if (IsRIPRel)
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+
+ // Commit the changes now that we know this fold is safe.
+ return false;
}
/// Add the specified node to the specified addressing mode, returning true if
@@ -988,10 +1116,16 @@ bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
- if (N.getNode()->getNodeId() == -1 ||
- N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
- DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
- N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+ if (N->getNodeId() == -1 ||
+ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
+ SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
+ DAG.RepositionNode(Pos->getIterator(), N.getNode());
+ // Mark Node as invalid for pruning, since after this it may be a successor
+ // to a selected node but otherwise be in the same position as Pos.
+ // Conservatively mark it with the same -abs(Id) to ensure the node id
+ // invariant is preserved.
+ N->setNodeId(Pos->getNodeId());
+ SelectionDAGISel::InvalidateNodeId(N.getNode());
}
}
@@ -1196,10 +1330,10 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
SDLoc dl(N);
- DEBUG({
- dbgs() << "MatchAddress: ";
- AM.dump();
- });
+ LLVM_DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump(CurDAG);
+ });
// Limit recursion.
if (Depth > 5)
return matchAddressBase(N, AM);
@@ -1508,6 +1642,12 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
// TODO: Support other operations.
switch (N.getOpcode()) {
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!foldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
case X86ISD::Wrapper:
if (!matchWrapper(N, AM))
return false;
@@ -1523,7 +1663,7 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
X86ISelAddressMode AM;
auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
AM.IndexReg = Mgs->getIndex();
- AM.Scale = Mgs->getValue().getScalarValueSizeInBits() / 8;
+ AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
@@ -1534,14 +1674,8 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == 258)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
- // If Base is 0, the whole address is in index and the Scale is 1
- if (isa<ConstantSDNode>(N)) {
- assert(cast<ConstantSDNode>(N)->isNullValue() &&
- "Unexpected base in gather/scatter");
- AM.Scale = 1;
- }
- // Otherwise, try to match into the base and displacement fields.
- else if (matchVectorAddress(N, AM))
+ // Try to match into the base and displacement fields.
+ if (matchVectorAddress(N, AM))
return false;
MVT VT = N.getSimpleValueType();
@@ -1604,8 +1738,7 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
-static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
- SDNode *User = *N->use_begin();
+static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
while (User != Root) {
if (!User->hasOneUse())
return false;
@@ -1622,17 +1755,19 @@ static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
/// We also return:
/// PatternChainNode: this is the matched node that has a chain input and
/// output.
-bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
SDValue &PatternNodeWithChain) {
+ if (!hasSingleUsesFromRoot(Root, Parent))
+ return false;
+
// We can allow a full vector load here since narrowing a load is ok.
if (ISD::isNON_EXTLoad(N.getNode())) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1643,8 +1778,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1658,8 +1792,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1675,8 +1808,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0).getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
@@ -1699,10 +1831,10 @@ bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
}
// In static codegen with small code model, we can get the address of a label
- // into a register with 'movl'. TableGen has already made sure we're looking
- // at a label of some kind.
- assert(N->getOpcode() == X86ISD::Wrapper &&
- "Unexpected node type for MOV32ri64");
+ // into a register with 'movl'
+ if (N->getOpcode() != X86ISD::Wrapper)
+ return false;
+
N = N.getOperand(0);
// At least GNU as does not accept 'movl' for TPOFF relocations.
@@ -1907,6 +2039,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
+bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ useNonTemporalLoad(cast<LoadSDNode>(N)) ||
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -2092,50 +2238,84 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoadNode->getOffset() != StoreNode->getOffset())
return false;
- // Check if the chain is produced by the load or is a TokenFactor with
- // the load output chain as an operand. Return InputChain by reference.
+ bool FoundLoad = false;
+ SmallVector<SDValue, 4> ChainOps;
+ SmallVector<const SDNode *, 4> LoopWorklist;
+ SmallPtrSet<const SDNode *, 16> Visited;
+ const unsigned int Max = 1024;
+
+ // Visualization of Load-Op-Store fusion:
+ // -------------------------
+ // Legend:
+ // *-lines = Chain operand dependencies.
+ // |-lines = Normal operand dependencies.
+ // Dependencies flow down and right. n-suffix references multiple nodes.
+ //
+ // C Xn C
+ // * * *
+ // * * *
+ // Xn A-LD Yn TF Yn
+ // * * \ | * |
+ // * * \ | * |
+ // * * \ | => A--LD_OP_ST
+ // * * \| \
+ // TF OP \
+ // * | \ Zn
+ // * | \
+ // A-ST Zn
+ //
+
+ // This merge induces dependences from: #1: Xn -> LD, OP, Zn
+ // #2: Yn -> LD
+ // #3: ST -> Zn
+
+ // Ensure the transform is safe by checking for the dual
+ // dependencies to make sure we do not induce a loop.
+
+ // As LD is a predecessor to both OP and ST we can do this by checking:
+ // a). if LD is a predecessor to a member of Xn or Yn.
+ // b). if a Zn is a predecessor to ST.
+
+ // However, (b) can only occur through being a chain predecessor to
+ // ST, which is the same as Zn being a member or predecessor of Xn,
+ // which is a subset of LD being a predecessor of Xn. So it's
+ // subsumed by check (a).
+
SDValue Chain = StoreNode->getChain();
- bool ChainCheck = false;
+ // Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
- ChainCheck = true;
- InputChain = LoadNode->getChain();
+ FoundLoad = true;
+ ChainOps.push_back(Load.getOperand(0));
} else if (Chain.getOpcode() == ISD::TokenFactor) {
- SmallVector<SDValue, 4> ChainOps;
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
- ChainCheck = true;
+ FoundLoad = true;
// Drop Load, but keep its chain. No cycle check necessary.
ChainOps.push_back(Load.getOperand(0));
continue;
}
-
- // Make sure using Op as part of the chain would not cause a cycle here.
- // In theory, we could check whether the chain node is a predecessor of
- // the load. But that can be very expensive. Instead visit the uses and
- // make sure they all have smaller node id than the load.
- int LoadId = LoadNode->getNodeId();
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = UI->use_end(); UI != UE; ++UI) {
- if (UI.getUse().getResNo() != 0)
- continue;
- if (UI->getNodeId() > LoadId)
- return false;
- }
-
+ LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
-
- if (ChainCheck)
- // Make a new TokenFactor with all the other input chains except
- // for the load.
- InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
- MVT::Other, ChainOps);
}
- if (!ChainCheck)
+
+ if (!FoundLoad)
return false;
+ // Worklist is currently Xn. Add Yn to worklist.
+ for (SDValue Op : StoredVal->ops())
+ if (Op.getNode() != LoadNode)
+ LoopWorklist.push_back(Op.getNode());
+
+ // Check (a) if Load is a predecessor to Xn + Yn
+ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
+ true))
+ return false;
+
+ InputChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
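Check (a) described above is plain reachability: the fusion is rejected if the load
can be reached from any other operand of the store's chain (Xn) or of the fused
operation (Yn), because folding would then close a cycle. A minimal sketch of that
test on an adjacency-list DAG with toy integer node ids (the real code uses
SDNode::hasPredecessorHelper with a visit cap of 1024):

    #include <cstdio>
    #include <vector>

    // True if Target is reachable from any node in Worklist by following
    // operand edges, i.e. Target is a predecessor of one of them.
    bool isPredecessorOfAny(const std::vector<std::vector<int>> &Operands,
                            std::vector<int> Worklist, int Target) {
      std::vector<bool> Visited(Operands.size(), false);
      while (!Worklist.empty()) {
        int N = Worklist.back();
        Worklist.pop_back();
        if (N == Target)
          return true;
        if (Visited[N])
          continue;
        Visited[N] = true;
        for (int Op : Operands[N])
          Worklist.push_back(Op);
      }
      return false;
    }

    int main() {
      // Node 0 is the load; node 1 is a chain operand Xn that (indirectly)
      // depends on the load, so fusing would create a cycle.
      std::vector<std::vector<int>> Operands = {{}, {0}};
      std::printf("would create a cycle: %d\n",
                  isPredecessorOfAny(Operands, {1}, /*Load=*/0));
    }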
@@ -2177,7 +2357,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::INC:
case X86ISD::DEC:
case X86ISD::ADD:
+ case X86ISD::ADC:
case X86ISD::SUB:
+ case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR:
@@ -2225,7 +2407,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
break;
}
case X86ISD::ADD:
+ case X86ISD::ADC:
case X86ISD::SUB:
+ case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR: {
@@ -2234,9 +2418,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
X86::ADD8mr);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
+ X86::ADC8mr);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
X86::SUB8mr);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
+ X86::SBB8mr);
case X86ISD::AND:
return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
X86::AND8mr);
@@ -2253,8 +2443,12 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
case X86ISD::OR:
@@ -2270,9 +2464,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
X86::ADD8mi);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
+ X86::ADC8mi);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
X86::SUB8mi);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
+ X86::SBB8mi);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
X86::AND8mi);
@@ -2320,10 +2520,21 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
}
}
- const SDValue Ops[] = {Base, Scale, Index, Disp,
- Segment, Operand, InputChain};
- Result =
- CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
+ if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
+ SDValue CopyTo =
+ CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
+ StoredVal.getOperand(2), SDValue());
+
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, CopyTo, CopyTo.getValue(1)};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ } else {
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ }
break;
}
default:
@@ -2335,6 +2546,8 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
MemOp[1] = LoadNode->getMemOperand();
Result->setMemRefs(MemOp, MemOp + 2);
+ // Update Load Chain uses as well.
+ ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
CurDAG->RemoveDeadNode(Node);
@@ -2388,57 +2601,169 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
if (Shift + MaskSize > NVT.getSizeInBits())
return false;
- SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
- unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
- unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ // Create a BEXTR node and run it through selection.
+ SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT);
+ SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT,
+ N0->getOperand(0), C);
+ ReplaceNode(Node, New.getNode());
+ SelectCode(New.getNode());
+ return true;
+}
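For reference, the control word built above (Shift | (MaskSize << 8)) matches the
BEXTR encoding: bits [7:0] give the start bit and bits [15:8] the length. A
standalone model of the 32-bit semantics as I read them (simplified; flag effects
are ignored):

    #include <cstdint>
    #include <cstdio>

    // BEXTR: extract Len bits of Src starting at bit Start.
    uint32_t bextr32(uint32_t Src, uint32_t Control) {
      unsigned Start = Control & 0xff;
      unsigned Len = (Control >> 8) & 0xff;
      if (Start >= 32 || Len == 0)
        return 0;
      uint64_t Shifted = (uint64_t)Src >> Start;
      uint64_t Mask = Len >= 64 ? ~0ull : ((1ull << Len) - 1);
      return (uint32_t)(Shifted & Mask);
    }

    int main() {
      unsigned Shift = 4, MaskSize = 8;               // as in the node above
      uint32_t Control = Shift | (MaskSize << 8);
      std::printf("0x%x\n", bextr32(0xABCD1234u, Control)); // bits [11:4] = 0x23
    }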
- // BMI requires the immediate to placed in a register.
- if (!Subtarget->hasTBM()) {
- ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
- MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
- New = SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New), 0);
- if (NVT == MVT::i64) {
- New =
- SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
- CurDAG->getTargetConstant(0, dl, MVT::i64), New,
- CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
- 0);
- }
+// Emit a PCMPISTR(I/M) instruction.
+MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue Imm = Node->getOperand(2);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // If there is a load, it will be behind a bitcast. We don't need to check
+ // alignment on this load.
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
+ tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4)) {
+ SDValue Load = N1.getOperand(0);
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ Load.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ return CNode;
}
- MachineSDNode *NewNode;
- SDValue Input = N0->getOperand(0);
+ SDValue Ops[] = { N0, N1, Imm };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ return CNode;
+}
+
+// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
+// to emit a second instruction after this one. This is needed since we have two
+// copyToReg nodes glued before this and we need to continue that glue through.
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node,
+ SDValue &InFlag) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N2 = Node->getOperand(2);
+ SDValue Imm = Node->getOperand(4);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // If there is a load, it will be behind a bitcast. We don't need to check
+ // alignment on this load.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
- SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
- NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
+ tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4)) {
+ SDValue Load = N2.getOperand(0);
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ Load.getOperand(0), InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 3);
// Update the chain.
- ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<LoadSDNode>(Input)->getMemOperand();
- NewNode->setMemRefs(MemOp, MemOp + 1);
- } else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+ MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ return CNode;
}
- ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
+ SDValue Ops[] = { N0, N2, Imm, InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 2);
+ return CNode;
+}
+
+/// If the high bits of an 'and' operand are known zero, try setting the
+/// high bits of an 'and' constant operand to produce a smaller encoding by
+/// creating a small, sign-extended negative immediate rather than a large
+/// positive one. This reverses a transform in SimplifyDemandedBits that
+/// shrinks mask constants by clearing bits. There is also a possibility that
+/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
+/// case, just replace the 'and'. Return 'true' if the node is replaced.
+bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
+ // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
+ // have immediate operands.
+ MVT VT = And->getSimpleValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
+ if (!And1C)
+ return false;
+
+ // Bail out if the mask constant is already negative. It can't shrink any more.
+ // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
+ // patterns to use a 32-bit and instead of a 64-bit and by relying on the
+ // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
+ // are negative too.
+ APInt MaskVal = And1C->getAPIntValue();
+ unsigned MaskLZ = MaskVal.countLeadingZeros();
+ if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
+ return false;
+
+ // Don't extend into the upper 32 bits of a 64 bit mask.
+ if (VT == MVT::i64 && MaskLZ >= 32) {
+ MaskLZ -= 32;
+ MaskVal = MaskVal.trunc(32);
+ }
+
+ SDValue And0 = And->getOperand(0);
+ APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
+ APInt NegMaskVal = MaskVal | HighZeros;
+
+ // If a negative constant would not allow a smaller encoding, there's no need
+ // to continue. Only change the constant when we know it's a win.
+ unsigned MinWidth = NegMaskVal.getMinSignedBits();
+ if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
+ return false;
+
+ // Extend masks if we truncated above.
+ if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
+ NegMaskVal = NegMaskVal.zext(64);
+ HighZeros = HighZeros.zext(64);
+ }
+
+ // The variable operand must be all zeros in the top bits to allow using the
+ // new, negative constant as the mask.
+ if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
+ return false;
+
+ // Check if the mask is -1. In that case, this is an unnecessary instruction
+ // that escaped earlier analysis.
+ if (NegMaskVal.isAllOnesValue()) {
+ ReplaceNode(And, And0.getNode());
+ return true;
+ }
+
+ // A negative mask allows a smaller encoding. Create a new 'and' node.
+ SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
+ SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
+ ReplaceNode(And, NewAnd.getNode());
+ SelectCode(NewAnd.getNode());
return true;
}
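The encoding test above only accepts the widened mask when its minimum signed width
drops to at most 8 bits, or to at most 32 bits for a mask that previously needed more
than 32; those are exactly the cases where a sign-extended imm8/imm32 becomes usable.
A standalone model of the constant arithmetic with plain 64-bit integers (this skips
the i64 upper-half special cases and the MaskedValueIsZero query handled above, and
assumes arithmetic right shift for negative values):

    #include <cstdint>
    #include <cstdio>

    static unsigned minSignedBits(int64_t V) {
      // Smallest two's-complement width that can hold V.
      unsigned Bits = 64;
      while (Bits > 1 && (V >> (Bits - 2)) == (V >> (Bits - 1)))
        --Bits;
      return Bits;
    }

    // Given a mask and the bits known zero in the other AND operand, return
    // true and the widened (possibly negative) mask if that gives a smaller
    // immediate encoding.
    bool shrinkMask(uint64_t Mask, uint64_t KnownZeroHighBits, int64_t &NewMask) {
      uint64_t Widened = Mask | KnownZeroHighBits;
      unsigned OldWidth = minSignedBits((int64_t)Mask);
      unsigned NewWidth = minSignedBits((int64_t)Widened);
      if (NewWidth > 32 || (NewWidth > 8 && OldWidth <= 32))
        return false; // no encoding win
      NewMask = (int64_t)Widened;
      return true;
    }

    int main() {
      // and rax, 0x00000000fffffff0 cannot use a sign-extended imm32; if bits
      // 63..32 of the other operand are known zero, the mask can become -16,
      // which fits a sign-extended imm8.
      int64_t NewMask;
      if (shrinkMask(0x00000000fffffff0ull, 0xffffffff00000000ull, NewMask))
        std::printf("new mask: %lld\n", (long long)NewMask);
    }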
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
- unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
SDLoc dl(Node);
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
-
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
Node->setNodeId(-1);
return; // Already selected.
}
@@ -2483,9 +2808,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
case ISD::AND:
- // Try to match BEXTR/BEXTRI instruction.
if (matchBEXTRFromAnd(Node))
return;
+ if (shrinkAndImmediate(Node))
+ return;
LLVM_FALLTHROUGH;
case ISD::OR:
@@ -2577,7 +2903,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
+ unsigned Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
N0, SDValue()).getValue(1);
@@ -2594,7 +2920,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- unsigned LoReg;
+ unsigned LoReg, Opc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
// MVT::i8 is handled by X86ISD::UMUL8.
@@ -2619,13 +2945,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
+ unsigned Opc, MOpc;
bool isSigned = Opcode == ISD::SMUL_LOHI;
bool hasBMI2 = Subtarget->hasBMI2();
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
- case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
@@ -2634,8 +2959,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
- case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
}
@@ -2644,14 +2967,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned SrcReg, LoReg, HiReg;
switch (Opc) {
default: llvm_unreachable("Unknown MUL opcode!");
- case X86::IMUL8r:
- case X86::MUL8r:
- SrcReg = LoReg = X86::AL; HiReg = X86::AH;
- break;
- case X86::IMUL16r:
- case X86::MUL16r:
- SrcReg = LoReg = X86::AX; HiReg = X86::DX;
- break;
case X86::IMUL32r:
case X86::MUL32r:
SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
@@ -2721,27 +3036,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
}
- // Prevent use of AH in a REX instruction by referencing AX instead.
- if (HiReg == X86::AH && Subtarget->is64Bit() &&
- !SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::AX, MVT::i16, InFlag);
- InFlag = Result.getValue(2);
- // Get the low part if needed. Don't use getCopyFromReg for aliasing
- // registers.
- if (!SDValue(Node, 0).use_empty())
- ReplaceUses(SDValue(Node, 0),
- CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
-
- // Shift AX down 8 bits.
- Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
- Result,
- CurDAG->getTargetConstant(8, dl, MVT::i8)),
- 0);
- // Then truncate it down to i8.
- ReplaceUses(SDValue(Node, 1),
- CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
- }
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
if (!ResLo.getNode()) {
@@ -2751,7 +3045,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = ResLo.getValue(2);
}
ReplaceUses(SDValue(Node, 0), ResLo);
- DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -2762,7 +3057,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = ResHi.getValue(2);
}
ReplaceUses(SDValue(Node, 1), ResHi);
- DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
@@ -2776,6 +3072,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
+ unsigned Opc, MOpc;
bool isSigned = (Opcode == ISD::SDIVREM ||
Opcode == X86ISD::SDIVREM8_SEXT_HREG);
if (!isSigned) {
@@ -2909,7 +3206,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
unsigned AHExtOpcode =
- isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
+ isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
MVT::Glue, AHCopy, InFlag);
@@ -2924,7 +3221,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
}
ReplaceUses(SDValue(Node, 1), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the division (low) result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
@@ -2932,7 +3230,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the remainder (high) result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -2940,18 +3239,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
HiReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 1), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
return;
}
- case X86ISD::CMP:
- case X86ISD::SUB: {
- // Sometimes a SUB is used to perform comparison.
- if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
- // This node is not a CMP.
- break;
+ case X86ISD::CMP: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
@@ -2962,8 +3257,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
// use a smaller encoding.
// Look past the truncate if CMP is the only use of it.
- if ((N0.getOpcode() == ISD::AND ||
- (N0.getResNo() == 0 && N0.getOpcode() == X86ISD::AND)) &&
+ if (N0.getOpcode() == ISD::AND &&
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
@@ -2971,98 +3265,119 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (!C) break;
uint64_t Mask = C->getZExtValue();
- // For example, convert "testl %eax, $8" to "testb %al, $8"
+ MVT VT;
+ int SubRegOp;
+ unsigned Op;
+
if (isUInt<8>(Mask) &&
(!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the l-register.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
- MVT::i8, Reg);
-
- // Emit a testb.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ VT = MVT::i8;
+ SubRegOp = X86::sub_8bit;
+ Op = X86::TEST8ri;
+ } else if (OptForMinSize && isUInt<16>(Mask) &&
+ (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ // NOTE: We only want to form TESTW instructions if optimizing for
+ // min size. Otherwise we only save one byte and possibly get a length
+ // changing prefix penalty in the decoders.
+ VT = MVT::i16;
+ SubRegOp = X86::sub_16bit;
+ Op = X86::TEST16ri;
+ } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
+ (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
+ // Otherwise, we find ourselves in a position where we have to do
+ // promotion. If previous passes did not promote the and, we assume
+ // they had a good reason not to and do not promote here.
+ VT = MVT::i32;
+ SubRegOp = X86::sub_32bit;
+ Op = X86::TEST32ri;
+ } else {
+ // No eligible transformation was found.
+ break;
}
- // For example, "testl %eax, $2048" to "testb %ah, $8".
- if (isShiftedUInt<8, 8>(Mask) &&
- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
- // Shift the immediate right by 8 bits.
- SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the h-register.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
- MVT::i8, Reg);
-
- // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
- // target GR8_NOREX registers, so make sure the register class is
- // forced.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
- MVT::i32, Subreg, ShiftedImm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
+ SDValue Reg = N0.getOperand(0);
- // For example, "testl %eax, $32776" to "testw %ax, $32776".
- // NOTE: We only want to form TESTW instructions if optimizing for
- // min size. Otherwise we only save one byte and possibly get a length
- // changing prefix penalty in the decoders.
- if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the 16-bit subregister.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
- MVT::i16, Reg);
-
- // Emit a testw.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ // Extract the subregister if necessary.
+ if (N0.getValueType() != VT)
+ Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
- // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
- if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
- (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the 32-bit subregister.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
- MVT::i32, Reg);
-
- // Emit a testl.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ // Emit a testb, testw, or testl.
+ SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
+ // Replace CMP with TEST.
+ ReplaceNode(Node, NewNode);
+ return;
}
break;
}
+ case X86ISD::PCMPISTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::PCMPESTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ // Copy the two implicit register inputs.
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
+ Node->getOperand(1),
+ SDValue()).getValue(1);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
+ Node->getOperand(3), InFlag).getValue(1);
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
+ InFlag);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index c1ddb771e2fa..7dcdb7967058 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -103,7 +103,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
- MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+ MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
@@ -216,6 +216,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
@@ -235,7 +237,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
@@ -611,7 +613,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Long double always uses X87, except f128 in MMX.
if (UseX87) {
if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
- addRegisterClass(MVT::f128, &X86::FR128RegClass);
+ addRegisterClass(MVT::f128, &X86::VR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
setOperationAction(ISD::FNEG , MVT::f128, Custom);
@@ -790,19 +792,33 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
- setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
- setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
+ }
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ // Provide custom widening for v2f32 setcc. This is really for VLX, where
+ // the setcc result type is v2i1/v4i1 for v2f32/v4f32, leading type
+ // legalization to change the result type to v4i1 during widening.
+ // It works fine for SSE2 and is probably faster, so there is no need to
+ // qualify it with VLX support.
+ setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
+
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
@@ -874,6 +890,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+ if (!Subtarget.hasAVX512())
+ setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
@@ -886,6 +904,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
+
+ setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -967,7 +989,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
@@ -1003,6 +1025,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+ if (!Subtarget.hasAVX512())
+ setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
+
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
@@ -1014,6 +1039,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
}
+ setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
+
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
@@ -1034,6 +1063,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasAnyFMA()) {
@@ -1060,6 +1094,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
+
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
@@ -1137,13 +1176,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ // This block controls legalization of the mask vector sizes that are
+ // available with AVX512. 512-bit vectors are in a separate block controlled
+ // by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
- addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
- addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
- addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
- addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
-
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+ addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
@@ -1151,35 +1190,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
- setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32);
- setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32);
- setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32);
- setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
-
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
- if (Subtarget.hasVLX()) {
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+
+ // There is no byte sized k-register load or store without AVX512DQ.
+ if (!Subtarget.hasDQI()) {
+ setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::STORE, MVT::v1i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i1, Custom);
}
- // Extends of v16i1/v8i1 to 128-bit vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);
+ // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ }
- for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
@@ -1195,10 +1233,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
- for (auto VT : { MVT::v1i1, MVT::v8i1 })
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
+
+ // This block controls legalization for 512-bit operations with 32/64-bit
+ // elements. 512-bit registers can be disabled based on the
+ // prefer-vector-width and required-vector-width function attributes.
+ if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+ addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
@@ -1211,16 +1263,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
- for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
- MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
- MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
- setTruncStoreAction(VT, MaskVT, Custom);
- }
-
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
@@ -1231,7 +1273,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
@@ -1306,6 +1350,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
}
// Need to promote to 64-bit even though we have 32-bit masked instructions
@@ -1320,6 +1370,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
@@ -1359,10 +1411,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
}
+
+ // Need to custom split v32i16/v64i8 bitcasts.
+ if (!Subtarget.hasBWI()) {
+ setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
+ }
}// has AVX-512
- if (!Subtarget.useSoftFloat() &&
- (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
+ // This block controls legalization for operations that don't have
+ // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
+ // narrower widths.
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
@@ -1386,6 +1446,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Custom);
}
+ // Custom legalize 2x32 to get a little better code.
+ setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
+ setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
+
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MSCATTER, VT, Custom);
@@ -1396,6 +1460,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, VT, Legal);
setOperationAction(ISD::FP_TO_SINT, VT, Legal);
setOperationAction(ISD::FP_TO_UINT, VT, Legal);
+
+ setOperationAction(ISD::MUL, VT, Legal);
}
}
@@ -1412,10 +1478,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ // This block controls legalization of v32i1/v64i1, which are available with
+ // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
+ // useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
- addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
- addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
-
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
@@ -1445,6 +1511,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
+ }
+
+ // This block controls legalization for v32i16 and v64i8. 512-bit registers
+ // can be disabled based on the prefer-vector-width and required-vector-width
+ // function attributes.
+ if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
@@ -1494,6 +1569,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::SETCC, VT, Custom);
setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
@@ -1510,8 +1586,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
- (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
@@ -1528,41 +1603,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
- addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
- addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
-
- for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
- setOperationAction(ISD::ADD, VT, Custom);
- setOperationAction(ISD::SUB, VT, Custom);
- setOperationAction(ISD::MUL, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Expand);
-
- setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- }
-
- // TODO: v8i1 concat should be legal without VLX to support concats of
- // v1i1, but we won't legalize it correctly currently without introducing
- // a v4i1 concat in the middle.
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- for (auto VT : { MVT::v2i1, MVT::v4i1 })
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
- // Extends from v2i1/v4i1 masks to 128-bit vectors.
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);
-
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
@@ -1662,6 +1702,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
@@ -1747,6 +1788,9 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return TypeSplitVector;
+
if (ExperimentalVectorWideningLegalization &&
VT.getVectorNumElements() != 1 &&
VT.getVectorElementType().getSimpleVT() != MVT::i1)
@@ -1755,6 +1799,20 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return MVT::v32i8;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
+}
+
+unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return 1;
+ return TargetLowering::getNumRegistersForCallingConv(Context, VT);
+}
+
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
@@ -1951,7 +2009,7 @@ void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
// Mark the first N int arguments as having reg
for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
Type *T = Args[Idx].Ty;
- if (T->isPointerTy() || T->isIntegerTy())
+ if (T->isIntOrPtrTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
unsigned numRegs = 1;
if (MF->getDataLayout().getTypeAllocSize(T) > 4)
@@ -2065,7 +2123,8 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
- if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
@@ -2087,15 +2146,19 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
- if (Subtarget.getTargetTriple().isOSMSVCRT())
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getGlobalVariable("__security_cookie");
+ }
return TargetLowering::getSDagStackGuard(M);
}
Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
- if (Subtarget.getTargetTriple().isOSMSVCRT())
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getFunction("__security_check_cookie");
+ }
return TargetLowering::getSSPStackGuardCheck(M);
}
@@ -2168,13 +2231,16 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
if (ValLoc == MVT::i32)
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
return ValToCopy;
- } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
- (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
+ }
+
+ if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
+ (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
// One stage lowering is required
// bitcast: v32i1 -> i32 / v64i1 -> i64
return DAG.getBitcast(ValLoc, ValArg);
- } else
- return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
+ }
+
+ return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
/// Breaks v64i1 value into two registers and adds the new node to the DAG
@@ -2492,10 +2558,10 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = &X86::GR32RegClass;
- // Read a 32 bit value from the registers
+ // Read a 32 bit value from the registers.
if (nullptr == InFlag) {
// When no physical register is present,
- // create an intermediate virtual register
+ // create an intermediate virtual register.
Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
@@ -2511,13 +2577,13 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
*InFlag = ArgValueHi.getValue(2);
}
- // Convert the i32 type into v32i1 type
+ // Convert the i32 type into v32i1 type.
Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
- // Convert the i32 type into v32i1 type
+ // Convert the i32 type into v32i1 type.
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
- // Concatenate the two values together
+ // Concatenate the two values together.
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
@@ -2658,7 +2724,7 @@ enum StructReturnType {
StackStructReturn
};
static StructReturnType
-callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
+callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
@@ -2672,7 +2738,7 @@ callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
/// Determines whether a function uses struct return semantics.
static StructReturnType
-argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
+argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
@@ -2792,7 +2858,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
- int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+
+ // FIXME: For now, all byval parameter objects are marked as aliasing. This
+ // can be improved with deeper analysis.
+ int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
+ /*isAliased=*/true);
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
@@ -2916,7 +2986,7 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
}
#ifndef NDEBUG
-static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
@@ -2993,7 +3063,11 @@ SDValue X86TargetLowering::LowerFormalArguments(
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
- if (RegVT == MVT::i32)
+ if (RegVT == MVT::i8)
+ RC = &X86::GR8RegClass;
+ else if (RegVT == MVT::i16)
+ RC = &X86::GR16RegClass;
+ else if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
@@ -3004,7 +3078,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
- RC = &X86::FR128RegClass;
+ RC = &X86::VR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
@@ -3379,6 +3453,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
+ bool HasNoCfCheck =
+ (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
+ const Module *M = MF.getMMI().getModule();
+ Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3761,6 +3840,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(
S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
+
+ if (OpFlags == X86II::MO_GOTPCREL) {
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+ getPointerTy(DAG.getDataLayout()), Callee);
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -3822,9 +3909,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
- RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
- unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
- memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
+ RegMask = MF.allocateRegMask();
+ unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
@@ -3854,7 +3941,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
- Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+ if (HasNoCfCheck && IsCFProtectionSupported) {
+ Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
+ } else {
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+ }
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
@@ -4278,8 +4369,6 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVHLPS:
- case X86ISD::MOVLPS:
- case X86ISD::MOVLPD:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
@@ -4291,12 +4380,12 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
+ case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
- case X86ISD::VPERMIV3:
case X86ISD::VZEXT_MOVL:
return true;
}
@@ -4312,7 +4401,6 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
- case X86ISD::VPERMIV3:
return true;
// 'Faux' Target Shuffles.
case ISD::AND:
@@ -4389,7 +4477,7 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
}
}
-/// \brief Return true if the condition is an unsigned comparison operation.
+/// Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
switch (X86CC) {
default:
@@ -4536,20 +4624,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = 0;
switch (IntrData->Type) {
- case EXPAND_FROM_MEM: {
- Info.ptrVal = I.getArgOperand(0);
- Info.memVT = MVT::getVT(I.getType());
- Info.align = 1;
- Info.flags |= MachineMemOperand::MOLoad;
- break;
- }
- case COMPRESS_TO_MEM: {
- Info.ptrVal = I.getArgOperand(0);
- Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
- Info.align = 1;
- Info.flags |= MachineMemOperand::MOStore;
- break;
- }
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
@@ -4598,7 +4672,7 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
return true;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -4675,14 +4749,52 @@ bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
}
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ if (VT.isVector())
+ return false;
+
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
- EVT VT = Y.getValueType();
if (VT != MVT::i32 && VT != MVT::i64)
return false;
+ // A mask and compare against constant is ok for an 'andn' too
+ // even though the BMI instruction doesn't have an immediate form.
+
+ return true;
+}
+
+bool X86TargetLowering::hasAndNot(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
+ return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
+
+ // Vector.
+
+ if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
+ return false;
+
+ if (VT == MVT::v4i32)
+ return true;
+
+ return Subtarget.hasSSE2();
+}
+
+bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ // For vectors, we don't have a preference, but we probably want a mask.
+ if (VT.isVector())
+ return false;
+
+ // 64-bit shifts on 32-bit targets produce really bad bloated code.
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ return false;
+
return true;
}
@@ -4725,10 +4837,24 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return true;
}
+/// Return true if Val falls within the specified range [Low, Hi).
+static bool isInRange(int Val, int Low, int Hi) {
+ return (Val >= Low && Val < Hi);
+}
+
+/// Return true if the value of any element in Mask falls within the specified
+/// range [Low, Hi).
+static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ for (int M : Mask)
+ if (isInRange(M, Low, Hi))
+ return true;
+ return false;
+}
+
/// Return true if Val is undef or if its value falls within the
/// specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
- return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
+ return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef or if its value
@@ -4744,7 +4870,7 @@ static bool isUndefOrInRange(ArrayRef<int> Mask,
/// Return true if Val is undef, zero or if its value falls within the
/// specified range (L, H].
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
- return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
+ return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
@@ -4757,11 +4883,11 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
}
/// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (Low, Low+Size]. or is undef.
-static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
- unsigned Pos, unsigned Size, int Low) {
- for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+/// from position Pos and ending in Pos + Size, falls within the specified
+/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size, int Low, int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
@@ -4788,7 +4914,7 @@ static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
return true;
}
-/// \brief Helper function to test whether a shuffle mask could be
+/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
@@ -4847,6 +4973,24 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
return true;
}
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SmallVectorImpl<int> &WidenedMask) {
+ SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
+ for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
+ if (TargetMask[i] == SM_SentinelUndef)
+ continue;
+ if (Zeroable[i])
+ TargetMask[i] = SM_SentinelZero;
+ }
+ return canWidenShuffleElements(TargetMask, WidenedMask);
+}
+
+static bool canWidenShuffleElements(ArrayRef<int> Mask) {
+ SmallVector<int, 32> WidenedMask;
+ return canWidenShuffleElements(Mask, WidenedMask);
+}
+
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -4942,8 +5086,6 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
- assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
- "Unexpected vector type");
Vec = DAG.getConstant(0, dl, VT);
} else {
unsigned Num32BitElts = VT.getSizeInBits() / 32;
@@ -5033,10 +5175,66 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
-static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, const SDLoc &dl) {
- assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
- return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+/// Widen a vector to a larger size with the same scalar type, with the new
+/// elements either zero or undef.
+static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
+ Vec.getValueType().getScalarType() == VT.getScalarType() &&
+ "Unsupported vector widening type");
+ SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
+ : DAG.getUNDEF(VT);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+// Helper for splitting operands of an operation to legal target size and
+// applying a function on each part.
+// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
+// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
+// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
+// The argument Builder is a function that will be applied on each split part:
+// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
+template <typename F>
+SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
+ F Builder, bool CheckBWI = true) {
+ assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
+ unsigned NumSubs = 1;
+ if ((CheckBWI && Subtarget.useBWIRegs()) ||
+ (!CheckBWI && Subtarget.useAVX512Regs())) {
+ if (VT.getSizeInBits() > 512) {
+ NumSubs = VT.getSizeInBits() / 512;
+ assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
+ }
+ } else if (Subtarget.hasAVX2()) {
+ if (VT.getSizeInBits() > 256) {
+ NumSubs = VT.getSizeInBits() / 256;
+ assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
+ }
+ } else {
+ if (VT.getSizeInBits() > 128) {
+ NumSubs = VT.getSizeInBits() / 128;
+ assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
+ }
+ }
+
+ if (NumSubs == 1)
+ return Builder(DAG, DL, Ops);
+
+ SmallVector<SDValue, 4> Subs;
+ for (unsigned i = 0; i != NumSubs; ++i) {
+ SmallVector<SDValue, 2> SubOps;
+ for (SDValue Op : Ops) {
+ EVT OpVT = Op.getValueType();
+ unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
+ unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
+ SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
+ }
+ Subs.push_back(Builder(DAG, DL, SubOps));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
// Return true if the instruction zeroes the unused upper part of the
@@ -5045,13 +5243,9 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
switch (Opcode) {
default:
return false;
- case X86ISD::TESTM:
- case X86ISD::TESTNM:
- case X86ISD::PCMPEQM:
- case X86ISD::PCMPGTM:
case X86ISD::CMPM:
- case X86ISD::CMPMU:
case X86ISD::CMPM_RND:
+ case ISD::SETCC:
return true;
}
}
@@ -5192,22 +5386,11 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
-/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
-/// instructions. This is used because creating CONCAT_VECTOR nodes of
-/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
-/// large BUILD_VECTORS.
-static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- const SDLoc &dl) {
- SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
-}
-
-static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- const SDLoc &dl) {
- SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
+static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned VectorWidth) {
+ SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
+ return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
}
/// Returns a vector of specified type with all bits set.
@@ -5291,6 +5474,13 @@ static SDValue peekThroughOneUseBitcasts(SDValue V) {
return V;
}
+// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
+static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
+ while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ V = V.getOperand(0);
+ return V;
+}
+
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
@@ -5415,6 +5605,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
+ if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt UndefSrcElts = APInt::getNullValue(1);
+ APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
+ SmallVector<APInt, 64> SrcEltBits(1, RawBits);
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
@@ -5551,14 +5747,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
+ cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
@@ -5574,7 +5771,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
- DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
+ DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
+ Mask);
IsUnary = true;
}
break;
@@ -5585,20 +5783,21 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
- DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
+ DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
+ Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- DecodeUNPCKHMask(VT, Mask);
+ DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- DecodeUNPCKLMask(VT, Mask);
+ DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
@@ -5618,7 +5817,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
@@ -5627,38 +5827,43 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
+ cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeZeroMoveLowMask(VT, Mask);
+ DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
@@ -5674,7 +5879,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
// came from an extract from the original width. If we found one, we
// pushed it the Ops vector above.
if (N0.getValueType() == VT || !Ops.empty()) {
- DecodeVectorBroadcast(VT, Mask);
+ DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
break;
}
@@ -5687,7 +5892,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
- DecodeVPERMILPMask(VT, RawMask, Mask);
+ DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
@@ -5716,41 +5921,47 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
+ DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::SHUF128:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
+ cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeMOVSLDUPMask(VT, Mask);
+ DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeMOVSHDUPMask(VT, Mask);
+ DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- DecodeMOVDDUPMask(VT, Mask);
+ DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
break;
- case X86ISD::MOVLPD:
- case X86ISD::MOVLPS:
- // Not yet implemented
- return false;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
@@ -5762,7 +5973,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
unsigned CtrlImm = CtrlOp->getZExtValue();
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
- DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
+ DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
+ RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
@@ -5821,21 +6033,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
}
return false;
}
- case X86ISD::VPERMIV3: {
- assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
- IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
- // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
- Ops.push_back(N->getOperand(1));
- Ops.push_back(N->getOperand(2));
- SDValue MaskNode = N->getOperand(0);
- unsigned MaskEltSize = VT.getScalarSizeInBits();
- if (auto *C = getTargetConstantFromNode(MaskNode)) {
- DecodeVPERMV3Mask(C, MaskEltSize, Mask);
- break;
- }
- return false;
- }
default: llvm_unreachable("unknown target shuffle node");
}
@@ -5953,7 +6150,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- SelectionDAG &DAG) {
+ const SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
@@ -5966,6 +6163,17 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned Opcode = N.getOpcode();
switch (Opcode) {
+ case ISD::VECTOR_SHUFFLE: {
+ // ISD::VECTOR_SHUFFLE isn't treated as a target shuffle, so decode it here.
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
+ if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
+ Mask.append(ShuffleMask.begin(), ShuffleMask.end());
+ Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ return true;
+ }
+ return false;
+ }
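// Editorial sketch (not part of this patch): for two hypothetical v4i32
// operands t0 and t1, a generic vector_shuffle<0,5,2,7> t0, t1 decodes here
// as Mask = {0, 5, 2, 7} with Ops = {t0, t1}; indices 4-7 refer to the
// second operand, which is why only values in [0, 2 * NumElts) (or undef)
// are accepted above.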
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
@@ -6027,8 +6235,11 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
case X86ISD::PINSRW: {
SDValue InVec = N.getOperand(0);
SDValue InScl = N.getOperand(1);
+ SDValue InIndex = N.getOperand(2);
+ if (!isa<ConstantSDNode>(InIndex) ||
+ cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
+ return false;
uint64_t InIdx = N.getConstantOperandVal(2);
- assert(InIdx < NumElts && "Illegal insertion index");
// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
if (X86::isZeroNode(InScl)) {
@@ -6046,8 +6257,12 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
SDValue ExVec = InScl.getOperand(0);
+ SDValue ExIndex = InScl.getOperand(1);
+ if (!isa<ConstantSDNode>(ExIndex) ||
+ cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
+ return false;
uint64_t ExIdx = InScl.getConstantOperandVal(1);
- assert(ExIdx < NumElts && "Illegal extraction index");
+
Ops.push_back(InVec);
Ops.push_back(ExVec);
for (unsigned i = 0; i != NumElts; ++i)
@@ -6123,7 +6338,8 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
MVT SrcVT = Src.getSimpleValueType();
if (NumSizeInBits != SrcVT.getSizeInBits())
break;
- DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
+ DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
+ VT.getVectorNumElements(), Mask);
Ops.push_back(Src);
return true;
}
@@ -6167,7 +6383,7 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG) {
+ const SelectionDAG &DAG) {
if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
return false;
@@ -6477,9 +6693,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
- MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
- SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
+ SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
@@ -6831,17 +7046,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
BOperand = ZeroExtended.getOperand(0);
else
BOperand = Ld.getOperand(0).getOperand(0);
- if (BOperand.getValueType().isVector() &&
- BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
- if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
- NumElts == 8)) || // for broadcastmb2q
- (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
- NumElts == 16))) { // for broadcastmw2d
- SDValue Brdcst =
- DAG.getNode(X86ISD::VBROADCASTM, dl,
- MVT::getVectorVT(EltType, NumElts), BOperand);
- return DAG.getBitcast(VT, Brdcst);
- }
+ MVT MaskVT = BOperand.getSimpleValueType();
+ if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
+ (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
+ SDValue Brdcst =
+ DAG.getNode(X86ISD::VBROADCASTM, dl,
+ MVT::getVectorVT(EltType, NumElts), BOperand);
+ return DAG.getBitcast(VT, Brdcst);
}
}
}
@@ -7008,7 +7219,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
return SDValue();
}
-/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
@@ -7221,7 +7432,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
return DstVec;
}
-/// \brief Return true if \p N implements a horizontal binop and return the
+/// Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
@@ -7318,7 +7529,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
return CanFold;
}
-/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
@@ -7386,18 +7597,18 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
}
/// Returns true iff \p BV builds a vector with the result equivalent to
-/// the result of ADDSUB operation.
-/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
-/// are written to the parameters \p Opnd0 and \p Opnd1.
-static bool isAddSub(const BuildVectorSDNode *BV,
- const X86Subtarget &Subtarget, SelectionDAG &DAG,
- SDValue &Opnd0, SDValue &Opnd1,
- unsigned &NumExtracts) {
+/// the result of an ADDSUB/SUBADD operation.
+/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
+/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
+/// \p Opnd0 and \p Opnd1.
+static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1,
+ unsigned &NumExtracts,
+ bool &IsSubAdd) {
MVT VT = BV->getSimpleValueType(0);
- if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
- (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
return false;
unsigned NumElts = VT.getVectorNumElements();
@@ -7407,26 +7618,20 @@ static bool isAddSub(const BuildVectorSDNode *BV,
NumExtracts = 0;
// Odd-numbered elements in the input build vector are obtained from
- // adding two integer/float elements.
+ // adding/subtracting two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
- // subtracting two integer/float elements.
- unsigned ExpectedOpcode = ISD::FSUB;
- unsigned NextExpectedOpcode = ISD::FADD;
- bool AddFound = false;
- bool SubFound = false;
-
+ // subtracting/adding two integer/float elements.
+ unsigned Opc[2] {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
- if (Opcode == ISD::UNDEF) {
- std::swap(ExpectedOpcode, NextExpectedOpcode);
+ if (Opcode == ISD::UNDEF)
continue;
- }
// Early exit if we found an unexpected opcode.
- if (Opcode != ExpectedOpcode)
+ if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
return false;
SDValue Op0 = Op.getOperand(0);
@@ -7446,11 +7651,11 @@ static bool isAddSub(const BuildVectorSDNode *BV,
if (I0 != i)
return false;
- // We found a valid add/sub node. Update the information accordingly.
- if (i & 1)
- AddFound = true;
- else
- SubFound = true;
+ // We found a valid add/sub node; make sure it's the same opcode as previous
+ // elements for this parity.
+ if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
+ return false;
+ Opc[i % 2] = Opcode;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
@@ -7467,7 +7672,7 @@ static bool isAddSub(const BuildVectorSDNode *BV,
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
- if (ExpectedOpcode == ISD::FSUB)
+ if (Opcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
@@ -7480,17 +7685,19 @@ static bool isAddSub(const BuildVectorSDNode *BV,
if (InVec1 != Op1.getOperand(0))
return false;
- // Update the pair of expected opcodes.
- std::swap(ExpectedOpcode, NextExpectedOpcode);
-
// Increment the number of extractions done.
++NumExtracts;
}
- // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
- if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+ // Ensure we have found an opcode for both parities and that they are
+ // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
+ // inputs are undef.
+ if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
+ InVec0.isUndef() || InVec1.isUndef())
return false;
+ IsSubAdd = Opc[0] == ISD::FADD;
+
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
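// Editorial sketch (not part of this patch): for a hypothetical v4f32
// build_vector (a0-b0, a1+b1, a2-b2, a3+b3), the even lanes use FSUB and
// the odd lanes use FADD, so Opc[0] == ISD::FSUB and IsSubAdd is false
// (an ADDSUB pattern, as produced by addsubps). The mirrored pattern
// (a0+b0, a1-b1, ...) gives Opc[0] == ISD::FADD and IsSubAdd = true, which
// the caller only keeps if it can be folded into an FMSUBADD.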
@@ -7547,14 +7754,17 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
return true;
}
-/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
-/// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
+/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
+/// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB
+/// or X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
unsigned NumExtracts;
- if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
+ bool IsSubAdd;
+ if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
+ IsSubAdd))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
@@ -7562,10 +7772,14 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- // TODO: According to coverage reports, the FMADDSUB transform is not
- // triggered by any tests.
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
- return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
+ unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
+ }
+
+ // We only support ADDSUB; there is no non-FMA SUBADD node.
+ if (IsSubAdd)
+ return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
@@ -7792,66 +8006,268 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
-// reasoned to be a permutation of a vector by indices in a non-constant vector.
-// (build_vector (extract_elt V, (extract_elt I, 0)),
-// (extract_elt V, (extract_elt I, 1)),
-// ...
-// ->
-// (vpermv I, V)
-//
-// TODO: Handle undefs
-// TODO: Utilize pshufb and zero mask blending to support more efficient
-// construction of vectors with constant-0 elements.
-// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
-// when no native operation available.
-static SDValue
-LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- // Look for VPERMV and PSHUFB opportunities.
- MVT VT = V.getSimpleValueType();
+/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
+/// from a vector of source values and a vector of extraction indices.
+/// The vectors might be manipulated to match the type of the permute op.
+static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
+ SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT ShuffleVT = VT;
+ EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+
+ // Adjust IndicesVec to match VT size.
+ assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
+ "Illegal variable permute mask size");
+ if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
+ IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
+ NumElts * VT.getScalarSizeInBits());
+ IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+
+ // Handle a SrcVec whose type doesn't match VT.
+ if (SrcVec.getValueSizeInBits() != SizeInBits) {
+ if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
+ // Handle larger SrcVec by treating it as a larger permute.
+ unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
+ VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
+ IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
+ IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
+ Subtarget, DAG, SDLoc(IndicesVec));
+ return extractSubVector(
+ createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
+ DAG, DL, SizeInBits);
+ } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
+ // Widen smaller SrcVec to match VT.
+ SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
+ } else
+ return SDValue();
+ }
+
+ auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
+ assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
+ EVT SrcVT = Idx.getValueType();
+ unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
+ uint64_t IndexScale = 0;
+ uint64_t IndexOffset = 0;
+
+ // If we're scaling a smaller permute op, then we need to repeat the
+ // indices, scaling and offsetting them as well.
+ // e.g. v4i32 -> v16i8 (Scale = 4)
+ // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
+ // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
+ for (uint64_t i = 0; i != Scale; ++i) {
+ IndexScale |= Scale << (i * NumDstBits);
+ IndexOffset |= i << (i * NumDstBits);
+ }
+
+ Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
+ DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
+ Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
+ DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
+ return Idx;
+ };
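// Editorial sketch (not part of this patch): with SrcVT = v4i32 and
// Scale = 4, the constants above are IndexScale = 0x04040404 and
// IndexOffset = 0x03020100, so an i32 index of 2 becomes
// 2 * 0x04040404 + 0x03020100 = 0x0B0A0908, i.e. the byte indices
// {8, 9, 10, 11} of source element 2 once the vector is bitcast to v16i8.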
+
+ unsigned Opcode = 0;
switch (VT.SimpleTy) {
default:
- return SDValue();
+ break;
case MVT::v16i8:
- if (!Subtarget.hasSSE3())
- return SDValue();
+ if (Subtarget.hasSSSE3())
+ Opcode = X86ISD::PSHUFB;
+ break;
+ case MVT::v8i16:
+ if (Subtarget.hasVLX() && Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasSSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
+ }
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ if (Subtarget.hasAVX()) {
+ Opcode = X86ISD::VPERMILPV;
+ ShuffleVT = MVT::v4f32;
+ } else if (Subtarget.hasSSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
+ }
+ break;
+ case MVT::v2f64:
+ case MVT::v2i64:
+ if (Subtarget.hasAVX()) {
+ // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
+ IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
+ Opcode = X86ISD::VPERMILPV;
+ ShuffleVT = MVT::v2f64;
+ } else if (Subtarget.hasSSE41()) {
+ // SSE41 can compare v2i64 - select between indices 0 and 1.
+ return DAG.getSelectCC(
+ DL, IndicesVec,
+ getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
+ DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
+ DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
+ ISD::CondCode::SETEQ);
+ }
+ break;
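// Editorial sketch (not part of this patch): VPERMILPD reads its selector
// from bit 1 of each 64-bit index element, so the ADD above doubles each
// index: 0 stays 0 (bit 1 clear, element 0) and 1 becomes 2 (bit 1 set,
// element 1). On SSE4.1-only targets the SETEQ select above picks between
// the <0,0> and <1,1> splats instead.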
+ case MVT::v32i8:
+ if (Subtarget.hasVLX() && Subtarget.hasVBMI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasXOP()) {
+ SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
+ SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
+ SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
+ SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
+ DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
+ } else if (Subtarget.hasAVX()) {
+ SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
+ SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
+ SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
+ auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Permute Lo and Hi and then select based on index range.
+ // This works as PSHUFB only uses bits[3:0] to permute elements, and we
+ // don't care about bit[7] as it's just an index vector.
+ SDValue Idx = Ops[2];
+ EVT VT = Idx.getValueType();
+ return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
+ DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
+ DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
+ ISD::CondCode::SETGT);
+ };
+ SDValue Ops[] = {LoLo, HiHi, IndicesVec};
+ return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
+ PSHUFBBuilder);
+ }
+ break;
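// Editorial sketch (not part of this patch): LoLo/HiHi duplicate the two
// 128-bit halves of the source, and PSHUFB only uses bits[3:0] of each
// index. An index of 20, for example, is greater than 15, so the select
// takes the HiHi permute, where 20 & 15 == 4 picks byte 4 of the high
// half, i.e. original byte 20.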
+ case MVT::v16i16:
+ if (Subtarget.hasVLX() && Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasAVX()) {
+ // Scale to v32i8 and perform as v32i8.
+ IndicesVec = ScaleIndices(IndicesVec, 2);
+ return DAG.getBitcast(
+ VT, createVariablePermute(
+ MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
+ DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
+ }
break;
case MVT::v8f32:
case MVT::v8i32:
- if (!Subtarget.hasAVX2())
- return SDValue();
+ if (Subtarget.hasAVX2())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasAVX()) {
+ SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
+ SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
+ {0, 1, 2, 3, 0, 1, 2, 3});
+ SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
+ {4, 5, 6, 7, 4, 5, 6, 7});
+ if (Subtarget.hasXOP())
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
+ LoLo, HiHi, IndicesVec,
+ DAG.getConstant(0, DL, MVT::i8)));
+ // Permute Lo and Hi and then select based on index range.
+ // This works as VPERMILPS only uses index bits[0:1] to permute elements.
+ SDValue Res = DAG.getSelectCC(
+ DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
+ ISD::CondCode::SETGT);
+ return DAG.getBitcast(VT, Res);
+ }
break;
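// Editorial sketch (not part of this patch): LoLo repeats source elements
// 0-3 in both 128-bit lanes and HiHi repeats elements 4-7. VPERMILPS only
// uses bits[1:0] of each index within its lane, so an index of 6 (> 3)
// selects the HiHi permute, where 6 & 3 == 2 picks lane element 2, i.e.
// original element 6.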
case MVT::v4i64:
case MVT::v4f64:
- if (!Subtarget.hasVLX())
- return SDValue();
+ if (Subtarget.hasAVX512()) {
+ if (!Subtarget.hasVLX()) {
+ MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
+ SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
+ SDLoc(SrcVec));
+ IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
+ DAG, SDLoc(IndicesVec));
+ SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
+ DAG, Subtarget);
+ return extract256BitVector(Res, 0, DAG, DL);
+ }
+ Opcode = X86ISD::VPERMV;
+ } else if (Subtarget.hasAVX()) {
+ SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
+ SDValue LoLo =
+ DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
+ SDValue HiHi =
+ DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
+ // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
+ IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
+ if (Subtarget.hasXOP())
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
+ LoLo, HiHi, IndicesVec,
+ DAG.getConstant(0, DL, MVT::i8)));
+ // Permute Lo and Hi and then select based on index range.
+ // This works as VPERMILPD only uses index bit[1] to permute elements.
+ SDValue Res = DAG.getSelectCC(
+ DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
+ ISD::CondCode::SETGT);
+ return DAG.getBitcast(VT, Res);
+ }
break;
- case MVT::v16f32:
- case MVT::v8f64:
- case MVT::v16i32:
- case MVT::v8i64:
- if (!Subtarget.hasAVX512())
- return SDValue();
+ case MVT::v64i8:
+ if (Subtarget.hasVBMI())
+ Opcode = X86ISD::VPERMV;
break;
case MVT::v32i16:
- if (!Subtarget.hasBWI())
- return SDValue();
- break;
- case MVT::v8i16:
- case MVT::v16i16:
- if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
- return SDValue();
- break;
- case MVT::v64i8:
- if (!Subtarget.hasVBMI())
- return SDValue();
+ if (Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
break;
- case MVT::v32i8:
- if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
- return SDValue();
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ Opcode = X86ISD::VPERMV;
break;
}
+ if (!Opcode)
+ return SDValue();
+
+ assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
+ (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
+ "Illegal variable permute shuffle type");
+
+ uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
+ if (Scale > 1)
+ IndicesVec = ScaleIndices(IndicesVec, Scale);
+
+ EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
+ IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
+
+ SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
+ SDValue Res = Opcode == X86ISD::VPERMV
+ ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
+ : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
+ return DAG.getBitcast(VT, Res);
+}
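// Editorial sketch for createVariablePermute above (not part of this
// patch): the switch picks the cheapest legal permute for the subtarget.
// For example, a v8i16 request on an SSSE3-only target takes the PSHUFB
// path with ShuffleVT = v16i8: the word indices are scaled by 2 via
// ScaleIndices, so index k expands to the byte pair {2k, 2k+1}, and the
// v16i8 PSHUFB result is bitcast back to v8i16.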
+
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+// (extract_elt V, (extract_elt I, 1)),
+// ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue SrcVec, IndicesVec;
// Check for a match of the permute source vector and permute index elements.
// This is done by checking that the i-th build_vector operand is of the form:
@@ -7888,19 +8304,10 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
if (!PermIdx || PermIdx->getZExtValue() != Idx)
return SDValue();
}
- MVT IndicesVT = VT;
- if (VT.isFloatingPoint())
- IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
- VT.getVectorNumElements());
- IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
- if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
- SrcVec =
- DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
- SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
- }
- if (VT == MVT::v16i8)
- return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
- return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
+
+ SDLoc DL(V);
+ MVT VT = V.getSimpleValueType();
+ return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
SDValue
@@ -7908,7 +8315,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- MVT ExtVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
@@ -7919,8 +8326,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
- // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
- // transform here.
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
@@ -7930,7 +8335,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
- unsigned EVTBits = ExtVT.getSizeInBits();
+ unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
@@ -7966,13 +8371,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// supported, we assume that we will fall back to a shuffle to get the scalar
// blended with the constants. Insertion into a zero vector is handled as a
// special-case somewhere below here.
- LLVMContext &Context = *DAG.getContext();
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
+ LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
@@ -8011,27 +8416,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
- // If this is an insertion of an i64 value on x86-32, and if the top bits of
- // the value are obviously zero, truncate the value to i32 and do the
- // insertion that way. Only do this if the value is non-constant or if the
- // value is a constant being inserted into element 0. It is cheaper to do
- // a constant pool load than it is to do a movd + shuffle.
- if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
- (!IsAllConstants || Idx == 0)) {
- if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
- // Handle SSE only.
- assert(VT == MVT::v2i64 && "Expected an SSE value type!");
- MVT VecVT = MVT::v4i32;
-
- // Truncate the value (which may itself be a constant) to i32, and
- // convert it to a vector with movd (S2V+shuffle to zero extend).
- Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
- return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
- Item, Idx * 2, true, Subtarget, DAG));
- }
- }
-
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
@@ -8040,8 +8424,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
- if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
- (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ (EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
@@ -8052,7 +8436,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
- if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
if (VT.getSizeInBits() >= 256) {
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
@@ -8124,17 +8508,43 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return V;
// See if we can use a vector load to get all of the elements.
- if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+ {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
+ // If this is a splat of pairs of 32-bit elements, we can use a narrower
+ // build_vector and broadcast it.
+ // TODO: We could probably generalize this more.
+ if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
+ SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
+ DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+ auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
+ // Make sure all the even/odd operands match.
+ for (unsigned i = 2; i != NumElems; ++i)
+ if (Ops[i % 2] != Op.getOperand(i))
+ return false;
+ return true;
+ };
+ if (CanSplat(Op, NumElems, Ops)) {
+ MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+ MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
+ // Create a new build vector and cast to v2i64/v2f64.
+ SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
+ DAG.getBuildVector(NarrowVT, dl, Ops));
+ // Broadcast from v2i64/v2f64 and cast to final VT.
+ MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
+ NewBV));
+ }
+ }
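// Editorial sketch (not part of this patch): a v8i32 build_vector of the
// form (a, b, a, b, a, b, a, b) is rebuilt here as a v4i32
// (a, b, undef, undef), bitcast to v2i64, broadcast to v4i64 (replicating
// the 64-bit (a, b) pair) and bitcast back to v8i32.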
+
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
- if (VT.is256BitVector() || VT.is512BitVector()) {
- EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
+ if (VT.getSizeInBits() > 128) {
+ MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
@@ -8143,9 +8553,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
- if (VT.is256BitVector())
- return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
- return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
+ VT.getSizeInBits() / 2);
}
// Let legalizer expand 2-wide build_vectors.
@@ -8270,30 +8679,60 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
-static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+// TODO: Detect subvector broadcast here instead of DAG combine?
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- unsigned NumElems = ResVT.getVectorNumElements();
- if (ResVT.is256BitVector())
- return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ unsigned NumOperands = Op.getNumOperands();
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ SDValue SubVec = Op.getOperand(i);
+ if (SubVec.isUndef())
+ continue;
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ ++NumZero;
+ else {
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ NonZeros |= 1 << i;
+ ++NumNonZero;
+ }
+ }
- if (Op.getNumOperands() == 4) {
+ // If we have more than 2 non-zeros, build each half separately.
+ if (NumNonZero > 2) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
- SDValue V3 = Op.getOperand(2);
- SDValue V4 = Op.getOperand(3);
- return concat256BitVectors(
- concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
- concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
- NumElems, DAG, dl);
+ ArrayRef<SDUse> Ops = Op->ops();
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(0, NumOperands/2));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(NumOperands/2));
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ // Otherwise, build it up through insert_subvectors.
+ SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
+ : DAG.getUNDEF(ResVT);
+
+ MVT SubVT = Op.getOperand(0).getSimpleValueType();
+ unsigned NumSubElems = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ if ((NonZeros & (1 << i)) == 0)
+ continue;
+
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
+ Op.getOperand(i),
+ DAG.getIntPtrConstant(i * NumSubElems, dl));
}
- return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+
+ return Vec;
}
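// Editorial sketch for the CONCAT_VECTORS lowering above (not part of this
// patch): a hypothetical (concat_vectors v2i64 X, zero, v2i64 Y, zero)
// producing v8i64 has NumNonZero == 2, so it takes the insert_subvector
// path: Vec starts as a zero v8i64 and X and Y are inserted at element
// offsets 0 and 4. With more than two non-zero operands the node is split
// into two half-width CONCAT_VECTORS and recombined instead.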
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
@@ -8350,6 +8789,7 @@ static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
return SDValue();
}
+// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
@@ -8364,12 +8804,8 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
// of a node with instruction that zeroes all upper (irrelevant) bits of the
// output register, mark it as legal and catch the pattern in instruction
// selection to avoid emitting extra instructions (for zeroing upper bits).
- if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
- SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
- SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
- ZeroC);
- }
+ if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
+ return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
unsigned NumZero = 0;
unsigned NumNonZero = 0;
@@ -8440,7 +8876,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
// from two other 128-bit ones.
// 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
- return LowerAVXCONCAT_VECTORS(Op, DAG);
+ return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
@@ -8454,7 +8890,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
// patterns.
//===----------------------------------------------------------------------===//
-/// \brief Tiny helper function to identify a no-op mask.
+/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
@@ -8470,7 +8906,7 @@ static bool isNoopShuffleMask(ArrayRef<int> Mask) {
return true;
}
-/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
@@ -8484,7 +8920,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return false;
}
-/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
+/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
@@ -8530,6 +8966,12 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ SmallVector<int, 32> RepeatedMask;
+ return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
@@ -8573,7 +9015,7 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
return true;
}
-/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
+/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
@@ -8670,7 +9112,7 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
return IsUnpackwdMask;
}
-/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
@@ -8698,7 +9140,7 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
-/// \brief Compute whether each element of a shuffle is zeroable.
+/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
@@ -8895,8 +9337,8 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
unsigned &UnpackOpcode, bool IsUnary,
- ArrayRef<int> TargetMask, SDLoc &DL,
- SelectionDAG &DAG,
+ ArrayRef<int> TargetMask,
+ const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
@@ -9005,6 +9447,99 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
+static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+ int Delta) {
+ int Size = (int)Mask.size();
+ int Split = Size / Delta;
+ int TruncatedVectorStart = SwappedOps ? Size : 0;
+
+ // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
+ if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
+ return false;
+
+ // The rest of the mask should not refer to the truncated vector's elements.
+ if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
+ TruncatedVectorStart + Size))
+ return false;
+
+ return true;
+}
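// Editorial sketch for matchVectorShuffleAsVPMOV above (not part of this
// patch): for a v8i16 mask <0, 2, 4, 6, 12, 13, 14, 15> with Delta = 2 and
// SwappedOps = false, Split == 4; the first four entries step by 2 from 0
// (every second element of the truncated vector) and the remaining entries
// only reference the all-zeros operand (indices 8-15), so the match
// succeeds.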
+
+// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
+//
+// An example is the following:
+//
+// t0: ch = EntryToken
+// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
+// t25: v4i32 = truncate t2
+// t41: v8i16 = bitcast t25
+// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
+// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
+// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
+// t18: v2i64 = bitcast t51
+//
+// Without avx512vl, this is lowered to:
+//
+// vpmovqd %zmm0, %ymm0
+// vpshufb {{.*#+}} xmm0 =
+// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+//
+// But when avx512vl is available, one can just use a single vpmovdw
+// instruction.
+static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (VT != MVT::v16i8 && VT != MVT::v8i16)
+ return SDValue();
+
+ if (Mask.size() != VT.getVectorNumElements())
+ return SDValue();
+
+ bool SwappedOps = false;
+
+ if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
+ if (!ISD::isBuildVectorAllZeros(V1.getNode()))
+ return SDValue();
+
+ std::swap(V1, V2);
+ SwappedOps = true;
+ }
+
+ // Look for:
+ //
+ // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
+ // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
+ //
+ // and similar ones.
+ if (V1.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue Src = V1.getOperand(0).getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // The vptrunc** instructions truncating 128 bit and 256 bit vectors
+ // are only available with avx512vl.
+ if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
+ return SDValue();
+
+ // Down Convert Word to Byte is only available with avx512bw. The case with
+ // 256-bit output doesn't contain a shuffle and is therefore not handled here.
+ if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
+ !Subtarget.hasBWI())
+ return SDValue();
+
+ // The first half/quarter of the mask should refer to every second/fourth
+ // element of the truncated and bitcast vector.
+ if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+ !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+ return SDValue();
+
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+}
+
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
@@ -9020,15 +9555,6 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
auto MatchPACK = [&](SDValue N1, SDValue N2) {
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
- V1 = VV1;
- V2 = VV2;
- SrcVT = PackVT;
- PackOpcode = X86ISD::PACKSS;
- return true;
- }
-
if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
@@ -9040,7 +9566,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
return true;
}
}
-
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+ V1 = VV1;
+ V2 = VV2;
+ SrcVT = PackVT;
+ PackOpcode = X86ISD::PACKSS;
+ return true;
+ }
return false;
};
@@ -9075,7 +9608,7 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
return SDValue();
}
-/// \brief Try to emit a bitmask instruction for a shuffle.
+/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
@@ -9108,7 +9641,7 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
-/// \brief Try to emit a blend instruction for a shuffle using bit math.
+/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
@@ -9195,7 +9728,7 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
return ScaledMask;
}
-/// \brief Try to emit a blend instruction for a shuffle.
+/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
@@ -9341,7 +9874,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
}
}
-/// \brief Try to lower as a blend of elements from two inputs followed by
+/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
@@ -9373,7 +9906,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
-/// \brief Generic routine to decompose a shuffle and blend into independent
+/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
@@ -9414,7 +9947,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
-/// \brief Try to lower a vector shuffle as a rotation.
+/// Try to lower a vector shuffle as a rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
@@ -9486,7 +10019,7 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
return Rotation;
}
-/// \brief Try to lower a vector shuffle as a byte rotation.
+/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
@@ -9570,7 +10103,7 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
-/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; This routine will
@@ -9601,7 +10134,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
DAG.getConstant(Rotation, DL, MVT::i8));
}
-/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
@@ -9845,7 +10378,7 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
return false;
}
-/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
@@ -9865,7 +10398,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
}
-/// \brief Lower a vector shuffle as a zero or any extension.
+/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
@@ -10020,7 +10553,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
return DAG.getBitcast(VT, InputV);
}
-/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
@@ -10148,7 +10681,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue();
}
-/// \brief Try to get a scalar value for a specific element of a vector.
+/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
@@ -10175,7 +10708,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
return SDValue();
}
-/// \brief Helper to test for a load that can be folded with x86 shuffles.
+/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
@@ -10184,7 +10717,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
return ISD::isNON_EXTLoad(V.getNode());
}
-/// \brief Try to lower insertion of a single element into a zero vector.
+/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
@@ -10275,9 +10808,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
- DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
- DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
- DAG.getDataLayout(), VT)));
+ DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
@@ -10331,13 +10862,13 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
if (const int OffsetIdx = BroadcastIdx % Scale)
Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
- DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
+ DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
-/// \brief Try to lower broadcast of a single element.
+/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
@@ -10662,7 +11193,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
-/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
@@ -10774,7 +11305,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
return SDValue();
}
-/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
@@ -10813,22 +11344,23 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
- assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
- assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+ assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
- // If we have a single input, insert that into V1 if we can do so cheaply.
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
- return Insertion;
- // Try inverting the insertion since for v2 masks it is easy to do and we
- // can't reliably sort the mask one way or the other.
- int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
- Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
- return Insertion;
- }
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+ Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+ return Insertion;
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
@@ -10838,8 +11370,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We can either use a special instruction to load over the low double or
// to move just the low double.
return DAG.getNode(
- isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
- DL, MVT::v2f64, V2,
+ X86ISD::MOVSD, DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
@@ -10857,7 +11388,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
-/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
@@ -10954,7 +11485,7 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
-/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
@@ -10976,7 +11507,7 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
return true;
}
-/// \brief Lower a vector shuffle using the SHUFPS instruction.
+/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
@@ -11063,7 +11594,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
-/// \brief Lower 4-lane 32-bit floating point shuffles.
+/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
@@ -11159,7 +11690,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
-/// \brief Lower 4-lane i32 vector shuffles.
+/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
@@ -11271,7 +11802,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getBitcast(MVT::v4i32, ShufPS);
}
-/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
@@ -11313,11 +11844,11 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
- std::sort(LoInputs.begin(), LoInputs.end());
+ array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
- std::sort(HiInputs.begin(), HiInputs.end());
+ array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL =
std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
@@ -11836,7 +12367,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
return DAG.getBitcast(VT, V);
}
-/// \brief Generic lowering of 8-lane i16 shuffles.
+/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
@@ -11969,7 +12500,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, DAG);
}
-/// \brief Check whether a compaction lowering can be done by dropping even
+/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
@@ -12048,7 +12579,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
-/// \brief Generic lowering of v16i8 shuffles.
+/// Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
@@ -12120,12 +12651,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
- std::sort(LoInputs.begin(), LoInputs.end());
+ array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
- std::sort(HiInputs.begin(), HiInputs.end());
+ array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
@@ -12348,7 +12879,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
-/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
@@ -12376,7 +12907,7 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
-/// \brief Generic routine to split vector shuffle into half-sized shuffles.
+/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
@@ -12499,7 +13030,7 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
-/// \brief Either split a vector in halves or decompose the shuffles and the
+/// Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
@@ -12557,7 +13088,7 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
-/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
+/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
@@ -12615,7 +13146,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
-/// \brief Handle lowering 2-lane 128-bit shuffles.
+/// Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
@@ -12626,9 +13157,22 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, WidenedMask))
+ if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
return SDValue();
+ bool IsLowZero = (Zeroable & 0x3) == 0x3;
+ bool IsHighZero = (Zeroable & 0xc) == 0xc;
+
+ // Try to use an insert into a zero vector.
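+ // For illustration: a v4i64 shuffle whose widened mask is {0, Z}, i.e. V1's
+ // low 128 bits in lane 0 with a zeroable high lane, becomes an extract of
+ // V1's low v2i64 inserted into an all-zero vector.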
+ if (WidenedMask[0] == 0 && IsHighZero) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
@@ -12638,9 +13182,6 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
Zeroable, Subtarget, DAG))
return Blend;
- bool IsLowZero = (Zeroable & 0x3) == 0x3;
- bool IsHighZero = (Zeroable & 0xc) == 0xc;
-
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
@@ -12652,14 +13193,12 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0, DL));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+ DAG.getIntPtrConstant(2, DL));
}
}
@@ -12687,7 +13226,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// [6] - ignore
// [7] - zero high half of destination
- assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
+ assert((WidenedMask[0] >= 0 || IsLowZero) &&
+ (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
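// For illustration: WidenedMask[0] == 1 selects V1's high 128 bits for the
// destination's low lane via imm bits 1:0, while IsLowZero instead sets bit 3
// (0x08) so that lane is zeroed regardless of the sources.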
@@ -12703,7 +13243,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
DAG.getConstant(PermMask, DL, MVT::i8));
}
-/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
@@ -12906,7 +13446,7 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
DAG.getIntPtrConstant(Offset, DL));
}
-/// \brief Test whether the specified input (0 or 1) is in-place blended by the
+/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
@@ -13142,7 +13682,7 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
DAG.getConstant(Immediate, DL, MVT::i8));
}
-/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
@@ -13184,7 +13724,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13209,7 +13749,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Op;
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13239,7 +13779,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
-/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
@@ -13312,6 +13852,12 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
@@ -13327,7 +13873,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, DAG);
}
-/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
+/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
@@ -13377,7 +13923,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13426,7 +13972,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
-/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
+/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
@@ -13539,7 +14085,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, DAG);
}
-/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
+/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
@@ -13590,7 +14136,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13630,7 +14176,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
-/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
+/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
@@ -13681,7 +14227,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -13710,7 +14256,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
-/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
+/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
@@ -13780,10 +14326,13 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
-/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
+/// Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+ ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
@@ -13791,10 +14340,23 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
// function lowerV2X128VectorShuffle() is a better solution.
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
+ // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ // Try to use an insert into a zero vector.
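+ // For illustration: when the top 256 bits are zeroable, this keeps V1's low
+ // 256 bits (or only its low 128 bits, if lanes 1-3 are all zeroable) and
+ // inserts them into an all-zero vector.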
+ if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+ (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+ unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
@@ -13802,12 +14364,11 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+ DAG.getIntPtrConstant(4, DL));
}
assert(WidenedMask.size() == 4);
@@ -13842,7 +14403,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
- // Try to lower to to vshuf64x2/vshuf32x4.
+ // Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
// Ensure elements came from the same Op.
@@ -13867,7 +14428,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
DAG.getConstant(PermMask, DL, MVT::i8));
}
-/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
+/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -13900,7 +14461,8 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
+ Subtarget, DAG))
return Shuf128;
if (SDValue Unpck =
@@ -13923,7 +14485,7 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
+/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -13978,7 +14540,7 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
+/// Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -14010,7 +14572,8 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
+ V1, V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
@@ -14043,7 +14606,7 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
+/// Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -14114,7 +14677,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
+/// Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -14169,7 +14732,7 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
-/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
+/// Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
@@ -14211,7 +14774,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -14224,7 +14787,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
-/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
+/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
@@ -14286,8 +14849,36 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vector, shuffle and then truncate it back.
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ unsigned NumElts = Mask.size();
+
+ // Try to recognize shuffles that are just padding a subvector with zeros.
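+ // For illustration: an 8-lane mask whose low four elements are the identity
+ // {0, 1, 2, 3} and whose upper four lanes take elements of an all-zero V2 is
+ // handled below as an extract of the low v4i1 of V1 inserted into a zero
+ // vector.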
+ unsigned SubvecElts = 0;
+ for (int i = 0; i != (int)NumElts; ++i) {
+ if (Mask[i] >= 0 && Mask[i] != i)
+ break;
+
+ ++SubvecElts;
+ }
+ assert(SubvecElts != NumElts && "Identity shuffle?");
+
+ // Clip to a power of 2.
+ SubvecElts = PowerOf2Floor(SubvecElts);
+
+ // Make sure the number of zeroable bits in the top at least covers the bits
+ // not covered by the subvector.
+ if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
+ MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
+ V1, DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL),
+ Extract, DAG.getIntPtrConstant(0, DL));
+ }
+
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
MVT ExtVT;
@@ -14306,38 +14897,31 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
break;
case MVT::v16i1:
- ExtVT = MVT::v16i32;
+ // Take 512-bit type, unless we are avoiding 512-bit types and have the
+ // 256-bit operation available.
+ ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
break;
case MVT::v32i1:
- ExtVT = MVT::v32i16;
+ // Take 512-bit type, unless we are avoiding 512-bit types and have the
+ // 256-bit operation available.
+ assert(Subtarget.hasBWI() && "Expected AVX512BW support");
+ ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
ExtVT = MVT::v64i8;
break;
}
- if (ISD::isBuildVectorAllZeros(V1.getNode()))
- V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
- else if (ISD::isBuildVectorAllOnes(V1.getNode()))
- V1 = getOnesVector(ExtVT, DAG, DL);
- else
- V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
-
- if (V2.isUndef())
- V2 = DAG.getUNDEF(ExtVT);
- else if (ISD::isBuildVectorAllZeros(V2.getNode()))
- V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
- else if (ISD::isBuildVectorAllOnes(V2.getNode()))
- V2 = getOnesVector(ExtVT, DAG, DL);
- else
- V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+ V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+ V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
// Since i1 was sign extended we can convert the result back to a mask.
int NumElems = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
(Subtarget.hasDQI() && (NumElems < 32)))
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
+ Shuffle, ISD::SETGT);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
@@ -14406,7 +14990,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
return false;
}
-/// \brief Top-level lowering for x86 vector shuffles.
+/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
@@ -14464,20 +15048,49 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
+ bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // Create an alternative mask with info about zeroable elements.
+ // Here we do not set undef elements as zeroable.
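+ // For illustration: a v4i32 shuffle with Mask {0, 1, 4, 4} against an
+ // all-zero V2 yields ZeroableMask {0, 1, Z, Z}, which widens cleanly to a
+ // two-element mask {0, Z} below.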
+ SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ if (V2IsZero) {
+ assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ for (int i = 0; i != NumElements; ++i)
+ if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ ZeroableMask[i] = SM_SentinelZero;
+ }
+
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
- canWidenShuffleElements(Mask, WidenedMask)) {
+ canWidenShuffleElements(ZeroableMask, WidenedMask)) {
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
- MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+ int NewNumElts = NumElements / 2;
+ MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+ if (V2IsZero) {
+ // Modify the new Mask to take all zeros from the all-zero vector.
+ // Choose indices that are blend-friendly.
+ bool UsedZeroVector = false;
+ assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
+ "V2's non-undef elements are used?!");
+ for (int i = 0; i != NewNumElts; ++i)
+ if (WidenedMask[i] == SM_SentinelZero) {
+ WidenedMask[i] = i + NewNumElts;
+ UsedZeroVector = true;
+ }
+ // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
+ // some elements to be undef.
+ if (UsedZeroVector)
+ V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
+ }
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
@@ -14489,6 +15102,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
+ if (SDValue V =
+ lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
+ return V;
+
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
@@ -14503,12 +15120,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
DAG);
if (Is1BitVector)
- return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
+ return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
llvm_unreachable("Unimplemented!");
}
-/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+/// Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -14527,9 +15145,12 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
SmallVector<int, 32> Mask;
for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
SDValue CondElt = CondBV->getOperand(i);
- Mask.push_back(
- isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
- : -1);
+ int M = i;
+ // We can't map undef to undef here. They have different meanings. Treat
+ // it the same as zero.
+ if (CondElt.isUndef() || isNullConstant(CondElt))
+ M += Size;
+ Mask.push_back(M);
}
return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
@@ -14569,9 +15190,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
assert(Cond.getValueType().getScalarSizeInBits() ==
VT.getScalarSizeInBits() &&
"Should have a size-matched integer condition!");
- // Build a mask by testing the condition against itself (tests for zero).
+ // Build a mask by testing the condition against zero.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+ SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
+ getZeroVector(VT, Subtarget, DAG, dl),
+ ISD::SETNE);
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
}
@@ -14592,10 +15215,15 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
case MVT::v8i16:
- case MVT::v16i16:
- // FIXME: We should custom lower this by fixing the condition and using i8
- // blends.
- return SDValue();
+ case MVT::v16i16: {
+ // Bitcast everything to the vXi8 type and use a vXi8 vselect.
+ MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+ SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
+ SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
+ SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, Select);
+ }
}
}
@@ -14667,36 +15295,35 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
- // Canonicalize result type to MVT::i32.
- if (EltVT != MVT::i32) {
- SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- Vec, Idx);
- return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
- }
-
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- // Extracts from element 0 are always allowed.
- if (IdxVal == 0)
- return Op;
-
// If the kshift instructions of the correct width aren't natively supported
// then we need to promote the vector to the native size to get the correct
// zeroing behavior.
- if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
- (VecVT.getVectorNumElements() < 8)) {
+ if (VecVT.getVectorNumElements() < 16) {
VecVT = MVT::v16i1;
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
- DAG.getUNDEF(VecVT),
- Vec,
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getUNDEF(VecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
- // Use kshiftr instruction to move to the lower element.
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
- DAG.getIntPtrConstant(0, dl));
+ // Extracts from element 0 are always allowed.
+ if (IdxVal != 0) {
+ // Use kshiftr instruction to move to the lower element.
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ }
+
+ // Shrink to v16i1 since that's always legal.
+ if (VecVT.getVectorNumElements() > 16) {
+ VecVT = MVT::v16i1;
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // Convert to a bitcast+aext/trunc.
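+ // For illustration: extracting element 5 of a v8i1 mask widens it to v16i1,
+ // shifts with KSHIFTR by 5 so the wanted bit lands in bit 0, bitcasts the
+ // mask to i16 and then any-extends or truncates to the scalar result type.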
+ MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
+ return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
}
SDValue
@@ -14799,7 +15426,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
- DAG.getConstant(ShiftVal, dl, MVT::i32));
+ DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
@@ -14810,7 +15437,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
- DAG.getConstant(ShiftVal, dl, MVT::i16));
+ DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
@@ -14866,74 +15493,11 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- unsigned NumElems = VecVT.getVectorNumElements();
-
- // If the kshift instructions of the correct width aren't natively supported
- // then we need to promote the vector to the native size to get the correct
- // zeroing behavior.
- if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
- // Need to promote to v16i1, do the insert, then extract back.
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
- DAG.getUNDEF(MVT::v16i1), Vec,
- DAG.getIntPtrConstant(0, dl));
- Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
- DAG.getIntPtrConstant(0, dl));
- }
-
- SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
-
- if (Vec.isUndef()) {
- if (IdxVal)
- EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- return EltInVec;
- }
-
- // Insertion of one bit into first position
- if (IdxVal == 0 ) {
- // Clean top bits of vector.
- EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
- DAG.getConstant(NumElems - 1, dl, MVT::i8));
- EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
- DAG.getConstant(NumElems - 1, dl, MVT::i8));
- // Clean the first bit in source vector.
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(1 , dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
- DAG.getConstant(1, dl, MVT::i8));
-
- return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
- }
- // Insertion of one bit into last position
- if (IdxVal == NumElems - 1) {
- // Move the bit to the last position inside the vector.
- EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- // Clean the last bit in the source vector.
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
- DAG.getConstant(1, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(1 , dl, MVT::i8));
-
- return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
- }
+ // Copy into a k-register, extract to v1i1 and insert_subvector.
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
- // Move the current value of the bit to be replace to bit 0.
- SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- // Xor with the new bit.
- Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
- // Shift to MSB, filling bottom bits with 0.
- Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
- DAG.getConstant(NumElems - 1, dl, MVT::i8));
- // Shift to the final position, filling upper bits with 0.
- Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
- DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
- // Xor with original vector to cancel out the original bit value that's still
- // present.
- return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
+ Op.getOperand(2));
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -15143,7 +15707,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
}
// Returns the appropriate wrapper opcode for a global reference.
-unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
+unsigned X86TargetLowering::getGlobalWrapperKind(
+ const GlobalValue *GV, const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
@@ -15153,6 +15718,10 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
+ // GOTPCREL references must always use RIP.
+ if (OpFlags == X86II::MO_GOTPCREL)
+ return X86ISD::WrapperRIP;
+
return X86ISD::Wrapper;
}
@@ -15276,7 +15845,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
- Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
+ Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
@@ -15458,7 +16027,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
@@ -15578,7 +16147,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
auto &DL = DAG.getDataLayout();
SDValue Scale =
- DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
@@ -15634,24 +16203,47 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
// values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i8));
- SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- AndNode, DAG.getConstant(0, dl, MVT::i8));
+ SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
+ DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
SDValue Hi, Lo;
- SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
- SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
- SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
-
if (Op.getOpcode() == ISD::SHL_PARTS) {
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
} else {
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
}
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
+ return DAG.getMergeValues({ Lo, Hi }, dl);
+}
+
+// Try to use a packed vector operation to handle i64 on 32-bit targets when
+// AVX512DQ is enabled.
+static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT VT = Op.getSimpleValueType();
+
+ if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
+ (VT != MVT::f32 && VT != MVT::f64))
+ return SDValue();
+
+ // Pack the i64 into a vector, do the operation and extract.
+
+ // Use a 256-bit input vector so the result is 128 bits for the f32 case.
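+ // For illustration: with VLX, an i64 to f32 conversion packs the scalar into
+ // a v4i64, converts to v4f32 and extracts element 0; without VLX the same is
+ // done through v8i64 and v8f32.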
+ unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
+ MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
+ MVT VecVT = MVT::getVectorVT(VT, NumElts);
+
+ SDLoc dl(Op);
+ SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -15667,14 +16259,6 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
- if (SrcVT == MVT::v2i1) {
- // For v2i1, we need to widen to v4i1 first.
- assert(VT == MVT::v2f64 && "Unexpected type");
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
- DAG.getUNDEF(MVT::v2i1));
- return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(),
- DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src));
- }
return SDValue();
}
@@ -15683,15 +16267,17 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
// These are really Legal; return the operand so the caller accepts it as
// Legal.
- if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
return Op;
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
- Subtarget.is64Bit()) {
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
return Op;
}
+ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+ return V;
+
SDValue ValueToStore = Op.getOperand(0);
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
!Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
@@ -15876,7 +16462,8 @@ static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
}
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget, SDLoc &DL) {
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
@@ -16010,15 +16597,6 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
- if (SrcVT == MVT::v2i1) {
- // For v2i1, we need to widen to v4i1 first.
- assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
- N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0,
- DAG.getUNDEF(MVT::v2i1));
- return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64,
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0));
- }
-
switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
@@ -16050,6 +16628,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return Op;
}
+ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+ return V;
+
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
@@ -16315,15 +16896,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
- if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
- (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
- (VT != MVT::v32i16 || InVT != MVT::v32i8))
- return SDValue();
+ assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Expected same number of elements");
+ assert((VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::i64) &&
+ "Unexpected element type");
+ assert((InVT.getVectorElementType() == MVT::i8 ||
+ InVT.getVectorElementType() == MVT::i16 ||
+ InVT.getVectorElementType() == MVT::i32) &&
+ "Unexpected element type");
if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
@@ -16356,6 +16939,20 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
+static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
+ const SDLoc &dl, SelectionDAG &DAG) {
+ assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
+ DAG.getIntPtrConstant(8, dl));
+ Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
+ Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+}
+
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -16366,11 +16963,23 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
- // Extend VT if the scalar type is v8/v16 and BWI is not supported.
+ // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
+ // avoids a constant pool load.
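+ // For illustration: zero-extending a v8i1 mask to v8i32 sign-extends each
+ // lane to all-ones or all-zeros and then shifts each lane right by 31,
+ // leaving 0 or 1 per element.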
+ if (VT.getVectorElementType() != MVT::i8) {
+ SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
+ return DAG.getNode(ISD::SRL, DL, VT, Extend,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
+ }
+
+ // Extend VT if BWI is not supported.
MVT ExtVT = VT;
- if (!Subtarget.hasBWI() &&
- (VT.getVectorElementType().getSizeInBits() <= 16))
+ if (!Subtarget.hasBWI()) {
+ // If v16i32 is to be avoided, we'll need to split and concatenate.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
+ return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
+
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+ }
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
@@ -16388,9 +16997,9 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
- // Truncate if we had to extend i16/i8 above.
+ // Truncate if we had to extend above.
if (VT != ExtVT) {
- WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ WideVT = MVT::getVectorVT(MVT::i8, NumElts);
SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
}
@@ -16410,14 +17019,8 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
if (SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
- if (Subtarget.hasFp256())
- if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
- return Res;
-
- assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
- Op.getSimpleValueType().getVectorNumElements() !=
- SVT.getVectorNumElements());
- return SDValue();
+ assert(Subtarget.hasAVX() && "Expected AVX support");
+ return LowerAVXExtend(Op, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
@@ -16431,8 +17034,8 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode");
- // Requires SSE2 but AVX512 has fast truncate.
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ // Requires SSE2 but AVX512 has fast vector truncate.
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
return SDValue();
EVT SrcVT = In.getValueType();
@@ -16441,40 +17044,53 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
if (SrcVT == DstVT)
return In;
- // We only support vector truncation to 128bits or greater from a
- // 256bits or greater source.
+ // We only support vector truncation to 64bits or greater from a
+ // 128bits or greater source.
unsigned DstSizeInBits = DstVT.getSizeInBits();
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
- if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
+ if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
return SDValue();
- LLVMContext &Ctx = *DAG.getContext();
unsigned NumElems = SrcVT.getVectorNumElements();
+ if (!isPowerOf2_32(NumElems))
+ return SDValue();
+
+ LLVMContext &Ctx = *DAG.getContext();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
- // Extract lower/upper subvectors.
- unsigned NumSubElts = NumElems / 2;
- SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
- SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
-
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
EVT InVT = MVT::i16, OutVT = MVT::i8;
- if (DstVT.getScalarSizeInBits() > 8 &&
+ if (SrcVT.getScalarSizeInBits() > 16 &&
(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
InVT = MVT::i32;
OutVT = MVT::i16;
}
+ // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
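+ // For illustration: a v8i16 to v8i8 truncation packs the source against
+ // itself (PACKUSWB or PACKSSWB) into a v16i8 whose low eight lanes hold the
+ // truncated values, then keeps only the low 64 bits.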
+ if (SrcVT.is128BitVector()) {
+ InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
+ OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
+ In = DAG.getBitcast(InVT, In);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
+ Res = extractSubVector(Res, 0, DAG, DL, 64);
+ return DAG.getBitcast(DstVT, Res);
+ }
+
+ // Extract lower/upper subvectors.
+ unsigned NumSubElts = NumElems / 2;
+ SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+ SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
- if (SrcVT.is256BitVector()) {
+ if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
@@ -16503,7 +17119,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
}
// Recursively pack lower/upper subvectors, concat result and pack again.
- assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
+ assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
@@ -16537,12 +17153,40 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(ShiftInx, DL, ExtVT));
In = DAG.getBitcast(InVT, In);
}
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In);
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+ In, ISD::SETGT);
}
// Use TESTD/Q, extended vector to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
+ assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
+ // We need to change to a wider element type that we have support for.
+ // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
+ // For 16 element vectors we extend to v16i32 unless we are explicitly
+ // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
+ // we need to split into two 8 element vectors which we can extend to v8i32,
+ // truncate and concat the results. There's an additional complication if
+ // the original type is v16i8. In that case we can't split the v16i8 so
+ // first we pre-extend it to v16i16 which we can split to v8i16, then extend
+ // to v8i32, truncate that to v8i1 and concat the two halves.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
+ if (InVT == MVT::v16i8) {
+ // First we need to sign extend up to 256-bits so we can split that.
+ InVT = MVT::v16i16;
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
+ }
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, 8, DAG, DL);
+ // We're split now, just emit two truncates and a concat. The two
+ // truncates will trigger legalization to come back to this function.
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+ // We either have 8 elements or we're allowed to use 512-bit vectors.
+ // If we have VLX, we want to use the narrowest vector that can get the
+ // job done so we use vXi32.
MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
@@ -16555,7 +17199,12 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
In = DAG.getNode(ISD::SHL, DL, InVT, In,
DAG.getConstant(ShiftInx, DL, InVT));
}
- return DAG.getNode(X86ISD::TESTM, DL, VT, In, In);
+ // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
+ if (Subtarget.hasDQI())
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+ In, ISD::SETGT);
+ return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
+ ISD::SETNE);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
@@ -16574,31 +17223,36 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
// word to byte only under BWI
- if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
- return DAG.getNode(X86ISD::VTRUNC, DL, VT,
- getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+ if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
+ // Make sure we're allowed to promote to 512 bits.
+ if (Subtarget.canExtendTo512DQ())
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ } else {
+ return Op;
+ }
}
- // Truncate with PACKSS if we are truncating a vector with sign-bits that
- // extend all the way to the packed/truncated value.
- unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
- if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
- if (SDValue V =
- truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
- return V;
+ unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
KnownBits Known;
DAG.computeKnownBits(In, Known);
- NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
- if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
+ if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
+ // Truncate with PACKSS if we are truncating a vector with sign-bits that
+ // extend all the way to the packed/truncated value.
+ if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
+ if (SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
+ return V;
+
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
@@ -16665,10 +17319,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
// Handle truncation of V256 to V128 using shuffles.
- if (!VT.is128BitVector() || !InVT.is256BitVector())
- return SDValue();
+ assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
- assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
+ assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
@@ -16907,8 +17560,16 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
return Res;
}
+/// Helper for creating a X86ISD::SETCC node.
+static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
+}
+
// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
@@ -16995,10 +17656,12 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
+ SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+ VecIns.back(), VecIns.back());
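+ // PTEST with identical operands sets ZF exactly when the value is all
+ // zeros, so EQ/NE against zero maps to COND_E/COND_NE.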
+ return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
}
-/// \brief return true if \c Op has a use that doesn't just read flags.
+/// return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
++UI) {
@@ -17021,11 +17684,6 @@ static bool hasNonFlagsUse(SDValue Op) {
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1) {
- SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
- DAG.getConstant(0, dl, MVT::i8));
- }
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
@@ -17234,14 +17892,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
- case ISD::OR: {
- if (!NeedTruncation && ZeroCheck) {
- if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
- return EFLAGS;
- }
- Opcode = X86ISD::OR;
- break;
- }
+ case ISD::OR: Opcode = X86ISD::OR; break;
}
NumOperands = 2;
@@ -17283,7 +17934,8 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
- Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
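+ // The X86ISD arithmetic nodes also produce an i32 EFLAGS result, so
+ // rebuild the node with both value types.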
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
}
}
}
@@ -17297,7 +17949,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
- DAG.ReplaceAllUsesWith(Op, New);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
return SDValue(New.getNode(), 1);
}
@@ -17382,7 +18034,6 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
- // TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
@@ -17393,12 +18044,15 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
- return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ // There is no FRSQRT for 512-bits, but there is RSQRT14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
@@ -17411,7 +18065,6 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
- // TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
@@ -17420,7 +18073,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
@@ -17430,7 +18084,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
- return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+ // There is no FRCP for 512-bits, but there is RCP14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
@@ -17445,13 +18101,6 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
-/// Helper for creating a X86ISD::SETCC node.
-static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
- SelectionDAG &DAG) {
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
-}
-
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
@@ -17519,12 +18168,15 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
LHS = AndLHS.getOperand(0);
RHS = AndLHS.getOperand(1);
- }
-
- // Use BT if the immediate can't be encoded in a TEST instruction.
- if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
- LHS = AndLHS;
- RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
+ } else {
+ // Use BT if the immediate can't be encoded in a TEST instruction or we
+ // are optimizing for size and the immediate won't fit in a byte.
+ bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
+ isPowerOf2_64(AndRHSVal)) {
+ LHS = AndLHS;
+ RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
+ }
}
}
@@ -17609,49 +18261,6 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
-static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- SDValue CC = Op.getOperand(2);
- MVT VT = Op.getSimpleValueType();
- SDLoc dl(Op);
-
- assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
- "Unexpected type for boolean compare operation");
- ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
- DAG.getConstant(-1, dl, VT));
- SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
- DAG.getConstant(-1, dl, VT));
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETEQ:
- // (x == y) -> ~(x ^ y)
- return DAG.getNode(ISD::XOR, dl, VT,
- DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
- DAG.getConstant(-1, dl, VT));
- case ISD::SETNE:
- // (x != y) -> (x ^ y)
- return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
- case ISD::SETUGT:
- case ISD::SETGT:
- // (x > y) -> (x & ~y)
- return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
- case ISD::SETULT:
- case ISD::SETLT:
- // (x < y) -> (~x & y)
- return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
- case ISD::SETULE:
- case ISD::SETLE:
- // (x <= y) -> (~x | y)
- return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
- case ISD::SETUGE:
- case ISD::SETGE:
- // (x >=y) -> (x | ~y)
- return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
- }
-}
-
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
@@ -17664,48 +18273,24 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- unsigned Opc = 0;
- bool Unsigned = false;
- bool Swap = false;
- unsigned SSECC;
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETNE: SSECC = 4; break;
- case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
- case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
- case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
- case ISD::SETULT: SSECC = 1; Unsigned = true; break;
- case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
- case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
- case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
- case ISD::SETLE: SSECC = 2; break;
- }
- if (Swap)
+ // If this is a seteq/setne, make sure any build vectors of all zeros are on the RHS.
+ // This helps with vptestm matching.
+ // TODO: Should we just canonicalize the setcc during DAG combine?
+ if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
+ ISD::isBuildVectorAllZeros(Op0.getNode()))
std::swap(Op0, Op1);
- // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
- if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
- SDValue A = peekThroughBitcasts(Op0);
- if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
- ISD::isBuildVectorAllZeros(Op1.getNode())) {
- MVT VT0 = Op0.getSimpleValueType();
- SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
- SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
- return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
- dl, VT, RHS, LHS);
- }
+ // Prefer SETGT over SETLT.
+ if (SetCCOpcode == ISD::SETLT) {
+ SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(Op0, Op1);
}
- if (Opc)
- return DAG.getNode(Opc, dl, VT, Op0, Op1);
- Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
- return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, dl, MVT::i8));
+ return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
-/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
@@ -17735,6 +18320,51 @@ static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
return DAG.getBuildVector(VT, dl, ULTOp1);
}
+/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+/// Op0 u<= Op1:
+/// t = psubus Op0, Op1
+/// pcmpeq t, <0..0>
+static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
+ ISD::CondCode Cond, const SDLoc &dl,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ MVT VET = VT.getVectorElementType();
+ if (VET != MVT::i8 && VET != MVT::i16)
+ return SDValue();
+
+ switch (Cond) {
+ default:
+ return SDValue();
+ case ISD::SETULT: {
+ // If the comparison is against a constant we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in the register is no longer
+ // destructed as the destination so it can be hoisted out of a loop.
+ // Only do this pre-AVX since vpcmp* is no longer destructive.
+ if (Subtarget.hasAVX())
+ return SDValue();
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
+ if (!ULEOp1)
+ return SDValue();
+ Op1 = ULEOp1;
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE:
+ std::swap(Op0, Op1);
+ break;
+ case ISD::SETULE:
+ break;
+ }
+
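+ // psubus saturates at zero, so the subtraction result is all zeros exactly
+ // when Op0 u<= Op1 per element; PCMPEQ with zero then produces the mask.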
+ SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
+ return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ getZeroVector(VT, Subtarget, DAG, dl));
+}
+
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
@@ -17808,23 +18438,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
- if (VT.is128BitVector() && VTOp0.is256BitVector()) {
- // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
- // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
- // legalizer firstly checks if the first operand in input to the setcc has
- // a legal type. If so, then it promotes the return type to that same type.
- // Otherwise, the return type is promoted to the 'next legal type' which,
- // for a vector of MVT::i1 is always a 128-bit integer vector type.
- //
- // We reach this code only if the following two conditions are met:
- // 1. Both return type and operand type have been promoted to wider types
- // by the type legalizer.
- // 2. The original operand type has been promoted to a 256-bit vector.
- //
- // Note that condition 2. only applies for AVX targets.
- SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
- return DAG.getZExtOrTrunc(NewOp, dl, VT);
- }
+ // This is being called by type legalization because v2i32 is marked custom
+ // for result type legalization for v2f32.
+ if (VTOp0 == MVT::v2i32)
+ return SDValue();
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
@@ -17835,31 +18452,17 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
- // Operands are boolean (vectors of i1)
- MVT OpVT = Op1.getSimpleValueType();
- if (OpVT.getVectorElementType() == MVT::i1)
- return LowerBoolVSETCC_AVX512(Op, DAG);
-
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
// But there is no compare instruction for i8 and i16 elements in KNL.
- // In this case use SSE compare
- bool UseAVX512Inst =
- (OpVT.is512BitVector() ||
- OpVT.getScalarSizeInBits() >= 32 ||
- (Subtarget.hasBWI() && Subtarget.hasVLX()));
-
- if (UseAVX512Inst)
- return LowerIntVSETCC_AVX512(Op, DAG);
-
- return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+ assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
+ "Unexpected operand type");
+ return LowerIntVSETCC_AVX512(Op, DAG);
}
// Lower using XOP integer comparisons.
- if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
- VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
+ if (VT.is128BitVector() && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
@@ -17902,15 +18505,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // We are handling one of the integer comparisons here. Since SSE only has
- // GT and EQ comparisons for integer, swapping operands and multiple
- // operations may be required for some comparisons.
- unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
- : X86ISD::PCMPGT;
- bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
- Cond == ISD::SETGE || Cond == ISD::SETUGE;
- bool Invert = Cond == ISD::SETNE ||
- (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+ // If this is a SETNE against the signed minimum value, change it to SETGT.
+ // If this is a SETNE against the signed maximum value, change it to SETLT,
+ // which will later be swapped to SETGT.
+ // Otherwise we use PCMPEQ+invert.
+ APInt ConstValue;
+ if (Cond == ISD::SETNE &&
+ ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
+ if (ConstValue.isMinSignedValue())
+ Cond = ISD::SETGT;
+ else if (ConstValue.isMaxSignedValue())
+ Cond = ISD::SETLT;
+ }
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
@@ -17919,58 +18525,47 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
- // Special case: Use min/max operations for SETULE/SETUGE
- MVT VET = VT.getVectorElementType();
- bool HasMinMax =
- (Subtarget.hasAVX512() && VET == MVT::i64) ||
- (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
- (Subtarget.hasSSE2() && (VET == MVT::i8));
- bool MinMax = false;
- if (HasMinMax) {
+ // Special case: Use min/max operations for unsigned compares. We only want
+ // to do this for unsigned compares if we need to flip signs or if it allows
+ // us to avoid an invert.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (ISD::isUnsignedIntSetCC(Cond) &&
+ (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
+ TLI.isOperationLegal(ISD::UMIN, VT)) {
+ bool Invert = false;
+ unsigned Opc;
switch (Cond) {
- default: break;
- case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
- case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
+ default: llvm_unreachable("Unexpected condition code");
+ case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETULE: Opc = ISD::UMIN; break;
+ case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGE: Opc = ISD::UMAX; break;
}
- if (MinMax)
- Swap = Invert = FlipSigns = false;
- }
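+ // E.g. Op0 u<= Op1 iff Op0 == umin(Op0, Op1); SETUGE uses umax, and the
+ // strict forms (SETUGT/SETULT) invert the result below.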
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
- bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
- bool Subus = false;
- if (!MinMax && HasSubus) {
- // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
- // Op0 u<= Op1:
- // t = psubus Op0, Op1
- // pcmpeq t, <0..0>
- switch (Cond) {
- default: break;
- case ISD::SETULT: {
- // If the comparison is against a constant we can turn this into a
- // setule. With psubus, setule does not require a swap. This is
- // beneficial because the constant in the register is no longer
- // destructed as the destination so it can be hoisted out of a loop.
- // Only do this pre-AVX since vpcmp* is no longer destructive.
- if (Subtarget.hasAVX())
- break;
- if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
- Op1 = ULEOp1;
- Subus = true; Invert = false; Swap = false;
- }
- break;
- }
- // Psubus is better than flip-sign because it requires no inversion.
- case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
- case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
- }
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
- if (Subus) {
- Opc = X86ISD::SUBUS;
- FlipSigns = false;
- }
+ return Result;
}
+ // Try to use SUBUS and PCMPEQ.
+ if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
+ return V;
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+ : X86ISD::PCMPGT;
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+ Cond == ISD::SETGE || Cond == ISD::SETUGE;
+ bool Invert = Cond == ISD::SETNE ||
+ (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+
if (Swap)
std::swap(Op0, Op1);
@@ -18058,13 +18653,6 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
- if (MinMax)
- Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
-
- if (Subus)
- Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
- getZeroVector(VT, Subtarget, DAG, dl));
-
return Result;
}
@@ -18082,18 +18670,30 @@ static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
Op0 = Op0.getOperand(0);
MVT VT = Op0.getSimpleValueType();
- if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
+ if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
+ !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
!(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
return SDValue();
X86::CondCode X86CC;
if (isNullConstant(Op1)) {
X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ } else if (isAllOnesConstant(Op1)) {
+ // C flag is set for all ones.
+ X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
} else
return SDValue();
- SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
- return getSETCC(X86CC, KTEST, dl, DAG);
+ // If the input is an OR, we can combine its operands into the KORTEST.
+ SDValue LHS = Op0;
+ SDValue RHS = Op0;
+ if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
+ LHS = Op0.getOperand(0);
+ RHS = Op0.getOperand(1);
+ }
+
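+ // KORTEST sets ZF when the OR of its operands is all zeros and CF when it
+ // is all ones, which is what the condition codes chosen above rely on.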
+ SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+ return getSETCC(X86CC, KORTEST, dl, DAG);
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -18118,6 +18718,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return NewSetCC;
}
+ // Try to use PTEST for a tree of ORs equality-compared with 0.
+ // TODO: We could do AND tree with all 1s as well by using the C flag.
+ if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
+ return NewSetCC;
+ }
+
// Try to lower using KTEST.
if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
return NewSetCC;
@@ -18213,7 +18821,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
- ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ ((Subtarget.hasSSE2() && VT == MVT::f64) ||
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
@@ -18534,6 +19142,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // Promote i16 cmovs if doing so won't prevent folding a load.
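+ // The 32-bit form avoids the 16-bit operand-size prefix; only do this when
+ // neither operand might be a folded load, since extending would block the fold.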
+ if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
+ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -18554,8 +19171,13 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
// Extend VT if the scalar type is v8/v16 and BWI is not supported.
MVT ExtVT = VT;
- if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
+ if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
+ // If v16i32 is to be avoided, we'll need to split and concatenate.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
+ return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
+
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+ }
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
@@ -18571,7 +19193,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
MVT WideEltVT = WideVT.getVectorElementType();
if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
- V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
+ V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
} else {
SDValue NegOne = getOnesVector(WideVT, DAG, dl);
SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
@@ -18600,11 +19222,8 @@ static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
- if (Subtarget.hasFp256())
- if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
- return Res;
-
- return SDValue();
+ assert(Subtarget.hasAVX() && "Expected AVX support");
+ return LowerAVXExtend(Op, DAG, Subtarget);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
@@ -18704,15 +19323,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
- if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
- (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
- (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
- (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
- (VT != MVT::v32i16 || InVT != MVT::v32i8))
- return SDValue();
+ assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Expected same number of elements");
+ assert((VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::i64) &&
+ "Unexpected element type");
+ assert((InVT.getVectorElementType() == MVT::i8 ||
+ InVT.getVectorElementType() == MVT::i16 ||
+ InVT.getVectorElementType() == MVT::i32) &&
+ "Unexpected element type");
if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
@@ -18750,165 +19371,29 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
-// Lower truncating store. We need a special lowering to vXi1 vectors
-static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
+static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
SDLoc dl(St);
- EVT MemVT = St->getMemoryVT();
- assert(St->isTruncatingStore() && "We only custom truncating store.");
- assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
- "Expected truncstore of i1 vector");
-
- SDValue Op = St->getValue();
- MVT OpVT = Op.getValueType().getSimpleVT();
- unsigned NumElts = OpVT.getVectorNumElements();
- if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
- NumElts == 16) {
- // Truncate and store - everything is legal
- Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
- if (MemVT.getSizeInBits() < 8)
- Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
- DAG.getUNDEF(MVT::v8i1), Op,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
- St->getMemOperand());
- }
-
- // A subset, assume that we have only AVX-512F
- if (NumElts <= 8) {
- if (NumElts < 8) {
- // Extend to 8-elts vector
- MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
- Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
- DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
- }
- Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
- Op = DAG.getBitcast(MVT::i8, Op);
- return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
- St->getMemOperand());
- }
- // v32i8
- assert(OpVT == MVT::v32i8 && "Unexpected operand type");
- // Divide the vector into 2 parts and store each part separately
- SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
- DAG.getIntPtrConstant(0, dl));
- Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
- SDValue BasePtr = St->getBasePtr();
- SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
- St->getMemOperand());
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
- DAG.getIntPtrConstant(16, dl));
- Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
-
- SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);
-
- SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
- BasePtrHi, St->getPointerInfo().getWithOffset(2),
- MinAlign(St->getAlignment(), 2U),
- St->getMemOperand()->getFlags());
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
-}
-
-static SDValue LowerExtended1BitVectorLoad(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
-
- LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
- SDLoc dl(Ld);
- EVT MemVT = Ld->getMemoryVT();
- assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
- "Expected i1 vector load");
- unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
- ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
- MVT VT = Op.getValueType().getSimpleVT();
- unsigned NumElts = VT.getVectorNumElements();
-
- if ((Subtarget.hasBWI() && NumElts >= 32) ||
- (Subtarget.hasDQI() && NumElts < 16) ||
- NumElts == 16) {
- // Load and extend - everything is legal
- if (NumElts < 8) {
- SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
- if (Subtarget.hasVLX()) {
- // Extract to v4i1/v2i1.
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
- DAG.getIntPtrConstant(0, dl));
- // Finally, do a normal sign-extend to the desired register.
- return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
- }
-
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
- SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
-
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
- SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
-
- // Finally, do a normal sign-extend to the desired register.
- return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
- }
-
- if (NumElts <= 8) {
- // A subset, assume that we have only AVX-512F
- SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
-
- SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);
-
- if (NumElts == 8)
- return DAG.getNode(ExtOpcode, dl, VT, BitVec);
-
- if (Subtarget.hasVLX()) {
- // Extract to v4i1/v2i1.
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
- DAG.getIntPtrConstant(0, dl));
- // Finally, do a normal sign-extend to the desired register.
- return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
- }
-
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
- SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
-
- assert(VT == MVT::v32i8 && "Unexpected extload type");
-
- SDValue BasePtr = Ld->getBasePtr();
- SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getMemOperand());
-
- SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);
-
- SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
- Ld->getPointerInfo().getWithOffset(2),
- MinAlign(Ld->getAlignment(), 2U),
- Ld->getMemOperand()->getFlags());
-
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- LoadLo.getValue(1), LoadHi.getValue(1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+ SDValue StoredVal = St->getValue();
+
+ // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
+ assert(StoredVal.getValueType().isVector() &&
+ StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
+ StoredVal.getValueType().getVectorNumElements() <= 8 &&
+ "Unexpected VT");
+ assert(!St->isTruncatingStore() && "Expected non-truncating store");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
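+ // Widen the mask to v8i1 and bitcast it to i8 so it can be stored with a
+ // plain scalar store.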
+ StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+ StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
- SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
- SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
@@ -18918,21 +19403,40 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector sext loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector sext loads.");
- // Nothing useful we can do without SSE2 shuffles.
- assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
- if (MemVT.getScalarType() == MVT::i1)
- return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
+
+ // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+ if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
+ assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+ assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
+ SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+
+ // Replace chain users with the new chain.
+ assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
+
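+ // Bitcast the loaded byte to v8i1 and extract the original narrow mask
+ // type, returning it together with the new chain.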
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
+ DAG.getBitcast(MVT::v8i1, NewLd),
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
+ }
+
+ // Nothing useful we can do without SSE2 shuffles.
+ assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
@@ -19775,7 +20279,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
-/// \brief Return Mask with the necessary casting or extending
+/// Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
@@ -19793,27 +20297,19 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
}
if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
- if (MaskVT == MVT::v64i1) {
- assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
- // In case 32bit mode, bitcast i64 is illegal, extend/split it.
- SDValue Lo, Hi;
- Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
- DAG.getConstant(0, dl, MVT::i32));
- Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
- DAG.getConstant(1, dl, MVT::i32));
-
- Lo = DAG.getBitcast(MVT::v32i1, Lo);
- Hi = DAG.getBitcast(MVT::v32i1, Hi);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
- } else {
- // MaskVT require < 64bit. Truncate mask (should succeed in any case),
- // and bitcast.
- MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
- return DAG.getBitcast(MaskVT,
- DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
- }
-
+ assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+ // In 32-bit mode, bitcasting i64 is illegal; split it into two i32 halves instead.
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(0, dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
@@ -19825,7 +20321,7 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
}
}
-/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
@@ -19846,11 +20342,10 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
default: break;
case X86ISD::CMPM:
case X86ISD::CMPM_RND:
- case X86ISD::CMPMU:
case X86ISD::VPSHUFBITQMB:
- return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
case X86ISD::VFPCLASS:
- return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+ return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+ case ISD::TRUNCATE:
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
@@ -19866,7 +20361,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
-/// \brief Creates an SDNode for a predicated scalar operation.
+/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
@@ -19885,12 +20380,12 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND)
+ Op.getOpcode() == X86ISD::FSETCCM_RND ||
+ Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
- if (Op.getOpcode() == X86ISD::VFPCLASSS)
- return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
@@ -19975,14 +20470,67 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
- case INTR_TYPE_1OP:
+ case INTR_TYPE_1OP: {
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(2);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Op.getOperand(1), Rnd);
+ }
+ }
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+ }
case INTR_TYPE_2OP:
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2));
+ case INTR_TYPE_2OP_IMM8: {
+ SDValue Src2 = Op.getOperand(2);
+
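+ // The *_IMM8 intrinsics pass their immediate in a wider integer type;
+ // truncate it to i8 for the target node.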
+ if (IntrData->Type == INTR_TYPE_2OP_IMM8)
+ Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(3);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Op.getOperand(1), Src2, Rnd);
+ }
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Src2);
+ }
case INTR_TYPE_3OP:
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_3OP_IMM8: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8)
+ Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd);
+ }
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Src1, Src2, Src3);
+ }
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
@@ -20083,16 +20631,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK:
- case INTR_TYPE_2OP_IMM8_MASK: {
+ case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
- Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
-
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -20147,26 +20691,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_MASK_RM: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Imm = Op.getOperand(3);
- SDValue PassThru = Op.getOperand(4);
- SDValue Mask = Op.getOperand(5);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (7 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 7)
- Rnd = Op.getOperand(6);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Imm, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- case INTR_TYPE_3OP_IMM8_MASK:
case INTR_TYPE_3OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -20174,9 +20698,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
- Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
-
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -20194,41 +20715,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case VPERM_2OP_MASK : {
+ case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
- SDValue PassThru = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
// Swap Src1 and Src2 in the node creation
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
- Mask, PassThru, Subtarget, DAG);
+ return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
}
- case VPERM_3OP_MASKZ:
- case VPERM_3OP_MASK:{
- MVT VT = Op.getSimpleValueType();
- // Src2 is the PassThru
- SDValue Src1 = Op.getOperand(1);
- // PassThru needs to be the same type as the destination in order
- // to pattern match correctly.
- SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- SDValue PassThru = SDValue();
-
- // set PassThru element
- if (IntrData->Type == VPERM_3OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
- else
- PassThru = Src2;
-
- // Swap Src1 and Src2 in the node creation
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Src2, Src1, Src3),
- Mask, PassThru, Subtarget, DAG);
- }
- case FMA_OP_MASK3:
case FMA_OP_MASKZ:
case FMA_OP_MASK: {
SDValue Src1 = Op.getOperand(1);
@@ -20241,8 +20734,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// set PassThru element
if (IntrData->Type == FMA_OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
- else if (IntrData->Type == FMA_OP_MASK3)
- PassThru = Src3;
else
PassThru = Src1;
@@ -20263,76 +20754,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case FMA_OP_SCALAR_MASK:
- case FMA_OP_SCALAR_MASK3:
- case FMA_OP_SCALAR_MASKZ: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = SDValue();
-
- // set PassThru element
- if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
- else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
- PassThru = Src3;
- else
- PassThru = Src1;
-
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
- Op.getValueType(), Src1, Src2,
- Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
-
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
- Op.getValueType(), Src1, Src2,
- Src3),
- Mask, PassThru, Subtarget, DAG);
- }
- case IFMA_OP_MASKZ:
- case IFMA_OP_MASK: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = Src1;
-
- // set PassThru element
- if (IntrData->Type == IFMA_OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
- // Node we need to swizzle the operands to pass the multiply operands
+ case IFMA_OP:
+ // NOTE: We need to swizzle the operands to pass the multiply operands
// first.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Src2, Src3, Src1),
- Mask, PassThru, Subtarget, DAG);
- }
- case TERLOG_OP_MASK:
- case TERLOG_OP_MASKZ: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
- SDValue Mask = Op.getOperand(5);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = Src1;
- // Set PassThru element.
- if (IntrData->Type == TERLOG_OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Src4),
- Mask, PassThru, Subtarget, DAG);
- }
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case CVTPD2PS:
// ISD::FP_ROUND has a second argument that indicates if the truncation
// does not change the value. Set it to 0 since it can change.
@@ -20363,21 +20789,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Mask, PassThru, Subtarget, DAG);
}
case FPCLASS: {
- // FPclass intrinsics with mask
- SDValue Src1 = Op.getOperand(1);
- MVT VT = Src1.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue Imm = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
- SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
- Subtarget, DAG);
- SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), FPclassMask,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getBitcast(Op.getValueType(), Res);
+ // FPclass intrinsics
+ SDValue Src1 = Op.getOperand(1);
+ MVT MaskVT = Op.getSimpleValueType();
+ SDValue Imm = Op.getOperand(2);
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
}
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
@@ -20386,17 +20802,20 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
- DAG.getIntPtrConstant(0, dl));
- }
- case CMP_MASK:
- case CMP_MASK_CC: {
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ FPclassMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
+ }
+ case CMP_MASK: {
// Comparison intrinsics with masks.
// Example of transformation:
// (i8 (int_x86_avx512_mask_pcmpeq_q_128
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast
- // (v8i1 (insert_subvector undef,
+ // (v8i1 (insert_subvector zero,
// (v2i1 (and (PCMPEQM %a, %b),
// (extract_subvector
// (v8i1 (bitcast %mask)), 0))), 0))))
@@ -20405,36 +20824,39 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
- SDValue Cmp;
- if (IntrData->Type == CMP_MASK_CC) {
- SDValue CC = Op.getOperand(3);
- CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC, Rnd);
- }
- //default rounding mode
- if(!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC);
-
- } else {
- assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2));
- }
+ SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2));
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits in the v2i1/v4i1 case.
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), CmpMask,
- DAG.getIntPtrConstant(0, dl));
+ DAG.getConstant(0, dl, BitcastVT),
+ CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
+
+ case CMP_MASK_CC: {
+ MVT MaskVT = Op.getSimpleValueType();
+ SDValue Cmp;
+ SDValue CC = Op.getOperand(3);
+ CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ if (!isRoundModeCurDirection(Rnd))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Rnd);
+ }
+ // Default rounding mode.
+ if (!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC);
+
+ return Cmp;
+ }
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -20453,8 +20875,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
- DAG.getIntPtrConstant(0, dl));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ CmpMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -20507,8 +20933,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
- DAG.getIntPtrConstant(0, dl));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getConstant(0, dl, MVT::v16i1),
+ FCmp, DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
+ DAG.getBitcast(MVT::i16, Ins));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -20525,34 +20956,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DataToCompress),
Mask, PassThru, Subtarget, DAG);
}
- case BROADCASTM: {
- SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- Mask = DAG.getBitcast(MaskVT, Mask);
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
- }
- case KUNPCK: {
- MVT VT = Op.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
-
- SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
- SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
- // Arguments should be swapped.
- SDValue Res = DAG.getNode(IntrData->Opc0, dl,
- MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
- Src2, Src1);
- return DAG.getBitcast(VT, Res);
- }
- case MASK_BINOP: {
- MVT VT = Op.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
-
- SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
- SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
- SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
- return DAG.getBitcast(VT, Res);
- }
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
@@ -20582,18 +20985,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src1, Src2, Src3, Imm, Rnd),
Mask, Passthru, Subtarget, DAG);
}
- case CONVERT_TO_MASK: {
- MVT SrcVT = Op.getOperand(1).getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
- MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
-
- SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
- Op.getOperand(1));
- SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), CvtMask,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getBitcast(Op.getValueType(), Res);
- }
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
@@ -20622,13 +21013,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps:
- // Operands intentionally swapped. Mask is last operand to intrinsic,
- // but second operand for node/instruction.
- return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
- Op.getOperand(2), Op.getOperand(1));
-
// ptest and testp intrinsics. The intrinsics these come from are designed to
// return an integer value, not just an instruction, so lower it to the ptest
// or testp pattern and a setcc for the result.
@@ -20696,43 +21080,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- case Intrinsic::x86_avx512_kortestz_w:
- case Intrinsic::x86_avx512_kortestc_w: {
- X86::CondCode X86CC =
- (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
- SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
- }
-
- case Intrinsic::x86_avx512_knot_w: {
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
- SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
- return DAG.getBitcast(MVT::i16, Res);
- }
-
- case Intrinsic::x86_avx512_kandn_w: {
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- // Invert LHS for the not.
- LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
- DAG.getConstant(1, dl, MVT::v16i1));
- SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
- SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
- return DAG.getBitcast(MVT::i16, Res);
- }
-
- case Intrinsic::x86_avx512_kxnor_w: {
- SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
- SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
- SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
- // Invert result for the not.
- Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
- DAG.getConstant(1, dl, MVT::v16i1));
- return DAG.getBitcast(MVT::i16, Res);
- }
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
@@ -20749,50 +21096,50 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
- SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
+ SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -20800,15 +21147,28 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
else
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
+ case Intrinsic::x86_sse42_pcmpistrm128:
+ case Intrinsic::x86_sse42_pcmpestrm128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
+ Opcode = X86ISD::PCMPISTR;
+ else
+ Opcode = X86ISD::PCMPESTR;
+
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
+ }
+
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -20876,7 +21236,7 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
- // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ // TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
@@ -20904,7 +21264,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
- // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ // TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
@@ -21197,17 +21557,35 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
case Intrinsic::x86_lwpins32:
- case Intrinsic::x86_lwpins64: {
+ case Intrinsic::x86_lwpins64:
+ case Intrinsic::x86_umwait:
+ case Intrinsic::x86_tpause: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
- SDValue LwpIns =
- DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_umwait:
+ Opcode = X86ISD::UMWAIT;
+ break;
+ case Intrinsic::x86_tpause:
+ Opcode = X86ISD::TPAUSE;
+ break;
+ case Intrinsic::x86_lwpins32:
+ case Intrinsic::x86_lwpins64:
+ Opcode = X86ISD::LWPINS;
+ break;
+ }
+
+ SDValue Operation =
+ DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
- SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
+ SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
- LwpIns.getValue(1));
+ Operation.getValue(1));
}
}
return SDValue();
@@ -21323,27 +21701,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue Results[] = { SetCC, Store };
return DAG.getMergeValues(Results, dl);
}
- case COMPRESS_TO_MEM: {
- SDValue Mask = Op.getOperand(4);
- SDValue DataToCompress = Op.getOperand(3);
- SDValue Addr = Op.getOperand(2);
- SDValue Chain = Op.getOperand(0);
- MVT VT = DataToCompress.getSimpleValueType();
-
- MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
- assert(MemIntr && "Expected MemIntrinsicSDNode!");
-
- if (isAllOnesConstant(Mask)) // return just a store
- return DAG.getStore(Chain, dl, DataToCompress, Addr,
- MemIntr->getMemOperand());
-
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
-
- return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
- MemIntr->getMemOperand(),
- false /* truncating */, true /* compressing */);
- }
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
@@ -21387,28 +21744,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
-
- case EXPAND_FROM_MEM: {
- SDValue Mask = Op.getOperand(4);
- SDValue PassThru = Op.getOperand(3);
- SDValue Addr = Op.getOperand(2);
- SDValue Chain = Op.getOperand(0);
- MVT VT = Op.getSimpleValueType();
-
- MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
- assert(MemIntr && "Expected MemIntrinsicSDNode!");
-
- if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
- return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
- if (X86::isZeroNode(Mask))
- return DAG.getUNDEF(VT);
-
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
- MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
- true /* expanding */);
- }
}
}
@@ -21825,14 +22160,16 @@ static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+ SDValue Src = Op.getOperand(0);
+ assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
+ "Src and Op should have the same element type!");
// Extract the Lo/Hi vectors
SDLoc dl(Op);
- SDValue Src = Op.getOperand(0);
SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
- MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
@@ -21855,13 +22192,14 @@ static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
return LowerVectorIntUnary(Op, DAG);
}
-/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
+/// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
//
// i8/i16 vector implemented using dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts, and
// concatenate the results.
-static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
@@ -21872,7 +22210,8 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
"Unsupported element type");
// Split the vector; its Lo and Hi parts will be handled in the next iteration.
- if (16 < NumElems)
+ if (NumElems > 16 ||
+ (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return LowerVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
@@ -21977,8 +22316,10 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- if (Subtarget.hasCDI())
- return LowerVectorCTLZ_AVX512CDI(Op, DAG);
+ if (Subtarget.hasCDI() &&
+ // vXi8 vectors need to be promoted to vXi32, which requires 512-bit vectors.
+ (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
+ return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
@@ -22167,10 +22508,42 @@ static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is256BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 256-bit vector integer operation");
- return Lower256IntArith(Op, DAG);
+ MVT VT = Op.getSimpleValueType();
+
+ // For AVX1 cases, split to use legal ops (everything but v4i64).
+ if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+
+ SDLoc DL(Op);
+ unsigned Opcode = Op.getOpcode();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+
+ // For pre-SSE41, we can perform UMIN/UMAX on v8i16 by flipping the sign bit,
+ // using the SMIN/SMAX instructions and flipping the sign bit back.
+ if (VT == MVT::v8i16) {
+ assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
+ "Unexpected MIN/MAX opcode");
+ SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
+ N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
+ N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
+ Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
+ SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
+ return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
+ }
+
+ // Else, expand to a compare/select.
+ ISD::CondCode CC;
+ switch (Opcode) {
+ case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
+ case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
+ case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
+ case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
+ default: llvm_unreachable("Unknown MINMAX opcode");
+ }
+
+ SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
+ return DAG.getSelect(DL, VT, Cond, N0, N1);
}
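// A minimal scalar sketch (hypothetical helper, assumes <cstdint>) of the
// pre-SSE41 v8i16 UMIN trick above: XOR with the sign bit maps unsigned order
// onto signed order, so SMIN can stand in for UMIN (UMAX/SMAX is analogous).
#include <cstdint>
static inline uint16_t umin_via_smin_sketch(uint16_t a, uint16_t b) {
  int16_t sa = (int16_t)(a ^ 0x8000u);   // flip the sign bit
  int16_t sb = (int16_t)(b ^ 0x8000u);
  int16_t m = sa < sb ? sa : sb;         // SMIN on the remapped values
  return (uint16_t)(m ^ 0x8000);         // flip the sign bit back
}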
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
@@ -22216,40 +22589,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
MVT ExVT = MVT::v8i16;
// Extract the lo parts and sign extend to i16
- SDValue ALo, BLo;
- if (Subtarget.hasSSE41()) {
- ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
- BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
- } else {
- const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
- -1, 4, -1, 5, -1, 6, -1, 7};
- ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
- BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- ALo = DAG.getBitcast(ExVT, ALo);
- BLo = DAG.getBitcast(ExVT, BLo);
- ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
- BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
- }
+ // We only keep the low byte of each result element of the pmullw (the
+ // high byte is masked off), so it doesn't matter what's in the high byte
+ // of each 16-bit element.
+ const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
+ 4, -1, 5, -1, 6, -1, 7, -1};
+ SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
+ SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
// Extract the hi parts and sign extend to i16
- SDValue AHi, BHi;
- if (Subtarget.hasSSE41()) {
- const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1};
- AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
- BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
- BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
- } else {
- const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
- -1, 12, -1, 13, -1, 14, -1, 15};
- AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
- BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getBitcast(ExVT, AHi);
- BHi = DAG.getBitcast(ExVT, BHi);
- AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
- BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
- }
+ // We only keep the low byte of each result element of the pmullw (the
+ // high byte is masked off), so it doesn't matter what's in the high byte
+ // of each 16-bit element.
+ const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
+ 12, -1, 13, -1, 14, -1, 15, -1};
+ SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
+ SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
// Multiply, mask the lower 8 bits of the lo/hi results and pack
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
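// A minimal scalar sketch (hypothetical helper, assumes <cstdint>) of the
// per-lane arithmetic above: widen each byte to a 16-bit lane, multiply with
// pmullw-style i16 lanes, keep only the low byte of each product, and repack.
#include <cstdint>
static inline uint8_t mul_i8_via_i16_sketch(uint8_t a, uint8_t b) {
  uint16_t p = (uint16_t)a * (uint16_t)b; // one pmullw lane
  return (uint8_t)(p & 0xff);             // mask the lower 8 bits and pack
}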
@@ -22264,22 +22623,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
- // If the upper 17 bits of each element are zero then we can use PMADD.
- APInt Mask17 = APInt::getHighBitsSet(32, 17);
- if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
- return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
- DAG.getBitcast(MVT::v8i16, A),
- DAG.getBitcast(MVT::v8i16, B));
-
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
- SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, A),
+ DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
- SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, Aodds),
+ DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
@@ -22292,17 +22648,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
-
- // 32-bit vector types used for MULDQ/MULUDQ.
- MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
-
- // MULDQ returns the 64-bit result of the signed multiplication of the lower
- // 32-bits. We can lower with this if the sign bits stretch that far.
- if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
- DAG.ComputeNumSignBits(B) > 32) {
- return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
- DAG.getBitcast(MulVT, B));
- }
+ assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
@@ -22313,42 +22659,35 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
//
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
+ KnownBits AKnown, BKnown;
+ DAG.computeKnownBits(A, AKnown);
+ DAG.computeKnownBits(B, BKnown);
+
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
- bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
- bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
+ bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
+ bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
- bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
- bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
-
- // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
- // the high bits are known to be zero.
- if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
- return Op;
-
- // Bit cast to 32-bit vectors for MULUDQ.
- SDValue Alo = DAG.getBitcast(MulVT, A);
- SDValue Blo = DAG.getBitcast(MulVT, B);
+ bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
+ bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
- AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
+ AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
- Bhi = DAG.getBitcast(MulVT, Bhi);
- AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
+ AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
- Ahi = DAG.getBitcast(MulVT, Ahi);
- AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
+ AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
}
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
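// A scalar sketch (hypothetical helper, assumes <cstdint>) of the pmuludq
// decomposition in the pseudocode above: a 64x64->64 multiply built from
// 32x32->64 half products; the AhiBhi term only affects bits >= 64, so it
// is dropped.
#include <cstdint>
static inline uint64_t mul64_from_halves_sketch(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;                // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;                // pmuludq(a, Bhi)
  uint64_t AhiBlo = AHi * BLo;                // pmuludq(Ahi, b)
  return AloBlo + ((AloBhi + AhiBlo) << 32);  // Hi = psllqi(AloBhi + AhiBlo, 32)
}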
@@ -22394,7 +22733,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
if (VT == MVT::v32i8) {
- if (Subtarget.hasBWI()) {
+ if (Subtarget.canExtendTo512BW()) {
SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
@@ -22445,13 +22784,14 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
assert(VT == MVT::v16i8 &&
"Pre-AVX2 support only supports v16i8 multiplication");
MVT ExVT = MVT::v8i16;
- unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
+ unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG;
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
- BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
+ ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
+ BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -22470,8 +22810,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
- BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
+ AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -22606,10 +22946,14 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
- SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Op0),
+ DAG.getBitcast(MulVT, Op1)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
- SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Odd0),
+ DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
SmallVector<int, 16> HighMask(NumElts);
@@ -22769,7 +23113,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
if (VT.is512BitVector()) {
assert(VT == MVT::v64i8 && "Unexpected element type!");
- SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
+ SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
+ ISD::SETGT);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
}
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
@@ -22879,57 +23224,81 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Determine if V is a splat value, and return the scalar.
+static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned Opcode) {
+ V = peekThroughEXTRACT_SUBVECTORs(V);
+
+ // Check if this is a splat build_vector node.
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
+ SDValue SplatAmt = BV->getSplatValue();
+ if (SplatAmt && SplatAmt.isUndef())
+ return SDValue();
+ return SplatAmt;
+ }
+
+ // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
+ if (V.getOpcode() == ISD::SUB &&
+ !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
+ SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
+ SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
+
+ // Ensure that the corresponding splat BV element is not UNDEF.
+ BitVector UndefElts;
+ BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
+ ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
+ if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
+ unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
+ if (!UndefElts[SplatIdx])
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ VT.getVectorElementType(), V,
+ DAG.getIntPtrConstant(SplatIdx, dl));
+ }
+ }
+
+ // Check if this is a shuffle node doing a splat.
+ ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
+ if (!SVN || !SVN->isSplat())
+ return SDValue();
+
+ unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
+ SDValue InVec = V.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ assert((SplatIdx < VT.getVectorNumElements()) &&
+ "Unexpected shuffle index found!");
+ return InVec.getOperand(SplatIdx);
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
+ if (C->getZExtValue() == SplatIdx)
+ return InVec.getOperand(1);
+ }
+
+ // Avoid introducing an extract element from a shuffle.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ VT.getVectorElementType(), InVec,
+ DAG.getIntPtrConstant(SplatIdx, dl));
+}
+
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
- unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
- (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
-
- unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
- (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
-
- if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
- SDValue BaseShAmt;
- MVT EltVT = VT.getVectorElementType();
+ unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
+ (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
- if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
- // Check if this build_vector node is doing a splat.
- // If so, then set BaseShAmt equal to the splat value.
- BaseShAmt = BV->getSplatValue();
- if (BaseShAmt && BaseShAmt.isUndef())
- BaseShAmt = SDValue();
- } else {
- if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- Amt = Amt.getOperand(0);
+ unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
+ (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
- ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
- if (SVN && SVN->isSplat()) {
- unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
- SDValue InVec = Amt.getOperand(0);
- if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
- assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
- "Unexpected shuffle index found!");
- BaseShAmt = InVec.getOperand(SplatIdx);
- } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
- if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
- if (C->getZExtValue() == SplatIdx)
- BaseShAmt = InVec.getOperand(1);
- }
- }
+ Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
- if (!BaseShAmt)
- // Avoid introducing an extract element from a shuffle.
- BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
- DAG.getIntPtrConstant(SplatIdx, dl));
- }
- }
-
- if (BaseShAmt.getNode()) {
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
+ if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
+ MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
@@ -22961,6 +23330,70 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Convert a shift/rotate left amount to a multiplication scale factor.
+static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Amt.getSimpleValueType();
+ if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (Subtarget.hasInt256() && VT == MVT::v16i16) ||
+ (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
+ return SDValue();
+
+ if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ MVT SVT = VT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ APInt One(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->isUndef()) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
+ }
+ return DAG.getBuildVector(VT, dl, Elts);
+ }
+
+ // If the target doesn't support variable shifts, use either FP conversion
+ // or integer multiplication to avoid shifting each element individually.
+ if (VT == MVT::v4i32) {
+ Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
+ DAG.getConstant(0x3f800000U, dl, VT));
+ Amt = DAG.getBitcast(MVT::v4f32, Amt);
+ return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
+ }
+
+ // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
+ if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
+ SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
+ Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
+ Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
+ if (Subtarget.hasSSE41())
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+
+ return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
+ DAG.getBitcast(VT, Hi),
+ {0, 2, 4, 6, 8, 10, 12, 14});
+ }
+
+ return SDValue();
+}
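// A scalar sketch (hypothetical helper, assumes <cstdint> and <cstring>) of
// the v4i32 trick above: for 0 <= n < 32, the integer (n << 23) + 0x3f800000
// is the IEEE-754 bit pattern of the float 2^n, so converting that float back
// to an integer yields the scale factor 1 << n.
#include <cstdint>
#include <cstring>
static inline uint32_t shift_amount_to_scale_sketch(uint32_t n) {
  uint32_t bits = (n << 23) + 0x3f800000u; // exponent field becomes 127 + n
  float f;
  std::memcpy(&f, &bits, sizeof(f));       // the bitcast in the DAG code above
  return (uint32_t)f;                      // FP_TO_SINT equivalent: 1u << n
}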
+
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -22983,11 +23416,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
- if (Subtarget.hasXOP() &&
- (VT == MVT::v2i64 || VT == MVT::v4i32 ||
- VT == MVT::v8i16 || VT == MVT::v16i8)) {
+ if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
- SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
@@ -23020,51 +23452,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return R;
}
- // If possible, lower this packed shift into a vector multiply instead of
- // expanding it into a sequence of scalar shifts.
- // Do this only if the vector shift count is a constant build_vector.
- if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
- (VT == MVT::v8i16 || VT == MVT::v4i32 ||
- (Subtarget.hasInt256() && VT == MVT::v16i16))) {
- SmallVector<SDValue, 8> Elts;
- MVT SVT = VT.getVectorElementType();
- unsigned SVTBits = SVT.getSizeInBits();
- APInt One(SVTBits, 1);
- unsigned NumElems = VT.getVectorNumElements();
-
- for (unsigned i=0; i !=NumElems; ++i) {
- SDValue Op = Amt->getOperand(i);
- if (Op->isUndef()) {
- Elts.push_back(Op);
- continue;
- }
-
- ConstantSDNode *ND = cast<ConstantSDNode>(Op);
- APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
- uint64_t ShAmt = C.getZExtValue();
- if (ShAmt >= SVTBits) {
- Elts.push_back(DAG.getUNDEF(SVT));
- continue;
- }
- Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
- }
- SDValue BV = DAG.getBuildVector(VT, dl, Elts);
- return DAG.getNode(ISD::MUL, dl, VT, R, BV);
- }
-
- // Lower SHL with variable shift amount.
- if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
- Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
-
- Op = DAG.getNode(ISD::ADD, dl, VT, Op,
- DAG.getConstant(0x3f800000U, dl, VT));
- Op = DAG.getBitcast(MVT::v4f32, Op);
- Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
- return DAG.getNode(ISD::MUL, dl, VT, Op, R);
- }
-
// If possible, lower this shift as a sequence of two shifts by
- // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
+ // constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
@@ -23072,67 +23461,54 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//
// The advantage is that the two shifts from the example would be
- // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
- // the vector shift into four scalar shifts plus four pairs of vector
- // insert/extract.
- if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
- bool UseMOVSD = false;
- bool CanBeSimplified;
- // The splat value for the first packed shift (the 'X' from the example).
- SDValue Amt1 = Amt->getOperand(0);
- // The splat value for the second packed shift (the 'Y' from the example).
- SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
-
- // See if it is possible to replace this node with a sequence of
- // two shifts followed by a MOVSS/MOVSD/PBLEND.
- if (VT == MVT::v4i32) {
- // Check if it is legal to use a MOVSS.
- CanBeSimplified = Amt2 == Amt->getOperand(2) &&
- Amt2 == Amt->getOperand(3);
- if (!CanBeSimplified) {
- // Otherwise, check if we can still simplify this node using a MOVSD.
- CanBeSimplified = Amt1 == Amt->getOperand(1) &&
- Amt->getOperand(2) == Amt->getOperand(3);
- UseMOVSD = true;
- Amt2 = Amt->getOperand(2);
+ // lowered as X86ISD::VSRLI nodes in parallel before blending.
+ if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
+ SDValue Amt1, Amt2;
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 8> ShuffleMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue A = Amt->getOperand(i);
+ if (A.isUndef()) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
}
- } else {
- // Do similar checks for the case where the machine value type
- // is MVT::v8i16.
- CanBeSimplified = Amt1 == Amt->getOperand(1);
- for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
- CanBeSimplified = Amt2 == Amt->getOperand(i);
-
- if (!CanBeSimplified) {
- UseMOVSD = true;
- CanBeSimplified = true;
- Amt2 = Amt->getOperand(4);
- for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
- CanBeSimplified = Amt1 == Amt->getOperand(i);
- for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
- CanBeSimplified = Amt2 == Amt->getOperand(j);
+ if (!Amt1 || Amt1 == A) {
+ ShuffleMask.push_back(i);
+ Amt1 = A;
+ continue;
+ }
+ if (!Amt2 || Amt2 == A) {
+ ShuffleMask.push_back(i + NumElts);
+ Amt2 = A;
+ continue;
}
+ break;
}
- if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
- isa<ConstantSDNode>(Amt2)) {
- // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
+ // Only use this blend if it can be done without loading a mask.
+ if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
+ isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
+ (VT != MVT::v16i16 ||
+ is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
+ (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
+ Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
- SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
- SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
- if (UseMOVSD)
- return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
- BitCast2, {0, 1, 6, 7}));
- return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
- BitCast2, {0, 5, 6, 7}));
+ return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
+ if (Op.getOpcode() == ISD::SHL)
+ if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
+ return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
+
// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
@@ -23162,31 +23538,56 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
break;
}
// The SSE2 shifts use the lower i64 as the same shift amount for
- // all lanes and the upper i64 is ignored. These shuffle masks
- // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
- SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
- Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
- Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
- Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
- Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ // all lanes and the upper i64 is ignored. On AVX we're better off
+ // just zero-extending, but for SSE just duplicating the top 16 bits is
+ // cheaper and has the same effect for out-of-range values.
+ if (Subtarget.hasAVX()) {
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ } else {
+ SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
+ SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {4, 5, 6, 7, -1, -1, -1, -1});
+ Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {0, 1, 1, 1, -1, -1, -1, -1});
+ Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {2, 3, 3, 3, -1, -1, -1, -1});
+ Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
+ {0, 1, 1, 1, -1, -1, -1, -1});
+ Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
+ {2, 3, 3, 3, -1, -1, -1, -1});
+ }
}
- SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
- SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
- SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
- SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
- SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
- SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
- return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
+ SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
+ SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
+ SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));
+
+ // Merge the shifted lane results optimally with/without PBLENDW.
+ // TODO - ideally shuffle combining would handle this.
+ if (Subtarget.hasSSE41()) {
+ SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+ SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ }
+ SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
+ SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
+ // NOTE: We honor the preferred vector width before promoting to 512 bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
- (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
- (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
- (Subtarget.hasBWI() && VT == MVT::v32i8)) {
+ (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
+ (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
+ (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
@@ -23214,7 +23615,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
+ Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
+ ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
@@ -23410,13 +23812,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ assert(VT.isVector() && "Custom lowering only for vector rotates!");
+
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
- if (Subtarget.hasAVX512()) {
+ if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
@@ -23435,31 +23839,178 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return Op;
}
- assert(VT.isVector() && "Custom lowering only for vector rotates!");
- assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
+ if (Subtarget.hasXOP()) {
+ // Split 256-bit integers.
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+ assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
- // Split 256-bit integers.
- if (VT.is256BitVector())
+ // Attempt to rotate by immediate.
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+ uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+ assert(RotateAmt < EltSizeInBits && "Rotation out of range");
+ return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
+ DAG.getConstant(RotateAmt, DL, MVT::i8));
+ }
+ }
+
+ // Use general rotate by variable (per-element).
+ return Op;
+ }
+
+ // Split 256-bit integers on pre-AVX2 targets.
+ if (VT.is256BitVector() && !Subtarget.hasAVX2())
return Lower256IntArith(Op, DAG);
- assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
+ assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
+ ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
+ Subtarget.hasAVX2())) &&
+ "Only vXi32/vXi16/vXi8 vector rotates supported");
- // Attempt to rotate by immediate.
+ // Rotate by a uniform constant - expand back to shifts.
+ // TODO - legalizers should be able to handle this.
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
assert(RotateAmt < EltSizeInBits && "Rotation out of range");
- return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
- DAG.getConstant(RotateAmt, DL, MVT::i8));
+ if (RotateAmt == 0)
+ return R;
+
+ SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
}
}
- // Use general rotate by variable (per-element).
- return Op;
+ // Rotate by splat - expand back to shifts.
+ // TODO - legalizers should be able to handle this.
+ if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
+ IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
+ // v16i8/v32i8: Split the rotation into rot4/rot2/rot1 stages and select each
+ // stage by the corresponding bit of the rotation amount.
+ if (EltSizeInBits == 8) {
+ if (Subtarget.hasBWI()) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+ auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+ if (Subtarget.hasSSE41()) {
+ // On SSE41 targets we make use of the fact that VSELECT lowers
+ // to PBLENDVB which selects bytes based just on the sign bit.
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we test for the sign bit by comparing to
+ // zero - a negative value will set all bits of the lanes to true
+ // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
+ SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
+ return DAG.getSelect(DL, SelVT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+ // We can safely do this using i16 shifts as we're only interested in
+ // the 3 lower bits of each byte.
+ Amt = DAG.getBitcast(ExtVT, Amt);
+ Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
+ Amt = DAG.getBitcast(VT, Amt);
+
+ // r = VSELECT(r, rot(r, 4), a);
+ SDValue M;
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
+
+ // r = VSELECT(r, rot(r, 2), a);
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
+
+ // return VSELECT(r, rot(r, 1), a);
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
+ return SignBitSelect(VT, Amt, M, R);
+ }
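// A scalar sketch (hypothetical helper, assumes <cstdint>) of the staged
// vXi8 rotate above: each VSELECT applies one power-of-two rotation, gated
// by the corresponding bit of the rotation amount.
#include <cstdint>
static inline uint8_t rotl8_staged_sketch(uint8_t x, uint8_t n) {
  if (n & 4) x = (uint8_t)((x << 4) | (x >> 4)); // rot(r, 4) stage
  if (n & 2) x = (uint8_t)((x << 2) | (x >> 6)); // rot(r, 2) stage
  if (n & 1) x = (uint8_t)((x << 1) | (x >> 7)); // rot(r, 1) stage
  return x;
}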
+
+ bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+ bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
+ SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
+
+ // Best to fall back to shifts for all supported variable shifts.
+ // AVX2 - best to fall back for non-constants as well.
+ // TODO - legalizers should be able to handle this.
+ if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
+ // As with shifts, convert the rotation amount to a multiplication factor.
+ SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
+ assert(Scale && "Failed to convert ROTL amount to scale");
+
+ // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
+ if (EltSizeInBits == 16) {
+ SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
+ SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+ }
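// A scalar sketch (hypothetical helper, assumes <cstdint>) of the i16 case
// above: the low half of x * 2^n is x << n and the unsigned-high half is
// x >> (16 - n), so OR'ing the MUL and MULHU results gives the rotate.
#include <cstdint>
static inline uint16_t rotl16_via_mulhi_sketch(uint16_t x, unsigned n) {
  uint32_t p = (uint32_t)x << n;       // x * (1u << n), n in [0, 15]
  uint16_t lo = (uint16_t)p;           // MUL   (low 16 bits)
  uint16_t hi = (uint16_t)(p >> 16);   // MULHU (high 16 bits)
  return (uint16_t)(lo | hi);
}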
+
+ // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
+ // to v2i64 results at a time. The upper 32 bits contain the wrapped bits
+ // that can then be OR'd with the lower 32 bits.
+ assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
+ static const int OddMask[] = {1, -1, 3, -1};
+ SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
+ SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
+
+ SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, R),
+ DAG.getBitcast(MVT::v2i64, Scale));
+ SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, R13),
+ DAG.getBitcast(MVT::v2i64, Scale13));
+ Res02 = DAG.getBitcast(VT, Res02);
+ Res13 = DAG.getBitcast(VT, Res13);
+
+ return DAG.getNode(ISD::OR, DL, VT,
+ DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
+ DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}
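// A scalar sketch (hypothetical helper, assumes <cstdint>) of the v4i32 case
// above: one widening multiply produces both halves of the rotate, which are
// then OR'd together, mirroring the PMULUDQ + shuffle + OR sequence.
#include <cstdint>
static inline uint32_t rotl32_via_mul_sketch(uint32_t x, unsigned n) {
  uint64_t p = (uint64_t)x * ((uint64_t)1 << n); // n in [0, 31]
  return (uint32_t)p | (uint32_t)(p >> 32);      // low half | wrapped high half
}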
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -23521,9 +24072,6 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
- if (N->getValueType(1) == MVT::i1)
- SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
-
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
@@ -23534,9 +24082,6 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
- if (N->getValueType(1) == MVT::i1)
- SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
-
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
@@ -23740,11 +24285,68 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
+// Create MOVMSKB, taking into account whether we need to split for AVX1.
+static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT InVT = V.getSimpleValueType();
+
+ if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
+ Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
+ Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+ DAG.getConstant(16, DL, MVT::i8));
+ return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+ }
+
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+}
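// An intrinsics sketch (hypothetical helper, assumes <immintrin.h> and SSE2)
// of the AVX1 split above: PMOVMSKB each 16-byte half and merge the two
// 16-bit masks into a single 32-bit mask.
#include <immintrin.h>
static inline unsigned movmsk_v32i8_sse2_sketch(__m128i Lo, __m128i Hi) {
  unsigned LoMask = (unsigned)_mm_movemask_epi8(Lo); // bits 0..15
  unsigned HiMask = (unsigned)_mm_movemask_epi8(Hi); // becomes bits 16..31
  return LoMask | (HiMask << 16);
}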
+
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
+ // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
+ // half to v32i1 and concatenating the result.
+ if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
+ assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
+ assert(Subtarget.hasBWI() && "Expected BWI target");
+ SDLoc dl(Op);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
+ DAG.getIntPtrConstant(0, dl));
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
+ DAG.getIntPtrConstant(1, dl));
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ }
+
+ // Custom splitting for BWI types when AVX512F is available but BWI isn't.
+ if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
+ SDLoc dl(Op);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
+ EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
+ DstVT.getVectorNumElements() / 2);
+ Lo = DAG.getBitcast(CastVT, Lo);
+ Hi = DAG.getBitcast(CastVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
+ }
+
+ // Use MOVMSK for vector to scalar conversion to prevent scalarization.
+ if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
+ assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
+ MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
+ SDLoc DL(Op);
+ SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
+ V = getPMOVMSKB(DL, V, DAG, Subtarget);
+ return DAG.getZExtOrTrunc(V, DL, DstVT);
+ }
+
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
@@ -23752,7 +24354,6 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
// This conversion needs to be expanded.
return SDValue();
- SDValue Op0 = Op->getOperand(0);
SmallVector<SDValue, 16> Elts;
SDLoc dl(Op);
unsigned NumElts;
@@ -23764,14 +24365,14 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
// Widen the input vector in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
for (unsigned i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
DAG.getIntPtrConstant(i, dl)));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
- Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(0, dl)));
- Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(1, dl)));
NumElts = 2;
SVT = MVT::i32;
@@ -24010,7 +24611,7 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16) && "Unexpected type");
- if (NumElems <= 16) {
+ if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
@@ -24392,76 +24993,81 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
+ SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
- MVT MemVT = N->getMemoryVT().getSimpleVT();
+
+ if (VT == MVT::v2f32) {
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ // If the index is v2i64 and we have VLX we can use xmm for data and index.
+ if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 1);
+ }
+ return SDValue();
+ }
+
+ if (VT == MVT::v2i32) {
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(MVT::v2i32));
+ // If the index is v2i64 and we have VLX we can use xmm for data and index.
+ if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
+ SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 1);
+ }
+ // Custom widen all the operands to avoid promotion.
+ EVT NewIndexVT = EVT::getVectorVT(
+ *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
+ Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
+ DAG.getUNDEF(Index.getValueType()));
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getConstant(0, dl, MVT::v2i1));
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
+ Ops, N->getMemOperand());
+ }
+
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
- if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
- // The v2i32 value was promoted to v2i64.
- // Now we "redo" the type legalizer's work and widen the original
- // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
- // with a shuffle.
- assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
- "Unexpected memory type");
- int ShuffleMask[] = {0, 2, -1, -1};
- Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
- DAG.getUNDEF(MVT::v4i32), ShuffleMask);
- // Now we have 4 elements instead of 2.
- // Expand the index.
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
- Index = ExtendToType(Index, NewIndexVT, DAG);
-
- // Expand the mask with zeroes
- // Mask may be <2 x i64> or <2 x i1> at this moment
- assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
- "Unexpected mask type");
- MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
- Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
- VT = MVT::v4i32;
- }
+ // If the index is v2i32, we're being called by type legalization and we
+ // should just let the default handling take care of it.
+ if (IndexVT == MVT::v2i32)
+ return SDValue();
- unsigned NumElts = VT.getVectorNumElements();
+ // If we don't have VLX and neither the passthru nor the index is 512 bits,
+ // we need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
- // AVX512F supports only 512-bit vectors. Or data or index should
- // be 512 bit wide. If now the both index and data are 256-bit, but
- // the vector contains 8 elements, we just sign-extend the index
- if (IndexVT == MVT::v8i32)
- // Just extend index
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- else {
- // The minimal number of elts in scatter is 8
- NumElts = 8;
- // Index
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
- // Use original index here, do not modify the index twice
- Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
- if (IndexVT.getScalarType() == MVT::i32)
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
- // Mask
- // At this point we have promoted mask operand
- assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
- MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
- // Use the original mask here, do not modify the mask twice
- Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
-
- // The value that should be stored
- MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
- Src = ExtendToType(Src, NewVT, DAG);
- }
- }
- // If the mask is "wide" at this point - truncate it to i1 vector
- MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
- Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
-
- // The mask is killed by scatter, add it to the values
- SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
- SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+ Src = ExtendToType(Src, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
+ }
+
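
A quick worked example of the widening factor computed above, written as a hypothetical helper that mirrors the arithmetic (not code from the patch):

#include <algorithm>

// Given the bit widths of the data and index vectors, how much do we widen
// each of them to reach a 512-bit type?
unsigned widenFactor(unsigned DataBits, unsigned IndexBits) {
  return std::min(512 / DataBits, 512 / IndexBits);
}
// widenFactor(256, 256) == 2: v8f32/v8i32 become v16f32/v16i32 (+ v16i1 mask).
// widenFactor(128, 256) == 2 as well, capped by the wider 256-bit index.
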
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
@@ -24483,11 +25089,6 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
- // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
- // VLX. These types for exp-loads are handled here.
- if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
- return Op;
-
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
@@ -24504,16 +25105,12 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
Src0 = ExtendToType(Src0, WideDataVT, DAG);
// Mask element has to be i1.
- MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
- assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this case");
+ assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
+ "Unexpected mask type");
- MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- if (MaskEltTy != MVT::i1)
- Mask = DAG.getNode(ISD::TRUNCATE, dl,
- MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
N->getBasePtr(), Mask, Src0,
N->getMemoryVT(), N->getMemOperand(),
@@ -24542,10 +25139,6 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
- // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
- if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
- return Op;
-
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked store op.");
@@ -24560,17 +25153,13 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
// Mask element has to be i1.
- MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
- assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this case");
+ assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
+ "Unexpected mask type");
- MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- if (MaskEltTy != MVT::i1)
- Mask = DAG.getNode(ISD::TRUNCATE, dl,
- MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
Mask, N->getMemoryVT(), N->getMemOperand(),
N->isTruncatingStore(), N->isCompressingStore());
@@ -24590,63 +25179,40 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
- unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
// If the index is v2i32, we're being called by type legalization.
if (IndexVT == MVT::v2i32)
return SDValue();
+ // If we don't have VLX and neither the passthru nor the index is 512 bits,
+ // we need to widen until one is.
+ MVT OrigVT = VT;
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
- !Index.getSimpleValueType().is512BitVector()) {
- // AVX512F supports only 512-bit vectors. Or data or index should
- // be 512 bit wide. If now the both index and data are 256-bit, but
- // the vector contains 8 elements, we just sign-extend the index
- if (NumElts == 8) {
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
- return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
- }
-
- // Minimal number of elements in Gather
- NumElts = 8;
- // Index
- MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
- Index = ExtendToType(Index, NewIndexVT, DAG);
- if (IndexVT.getScalarType() == MVT::i32)
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
- // Mask
- MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
- // At this point we have promoted mask operand
- assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
- MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
- Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
- Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
-
- // The pass-through value
- MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
- Src0 = ExtendToType(Src0, NewVT, DAG);
-
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewGather.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Extract, NewGather.getValue(2)};
- return DAG.getMergeValues(RetOps, dl);
+ !IndexVT.is512BitVector()) {
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+ Src0 = ExtendToType(Src0, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
+ N->getScale() };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
N->getMemOperand());
- return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
+ NewGather, DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
@@ -24735,7 +25301,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
+ case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
+ case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -24804,7 +25371,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::GC_TRANSITION_START:
return LowerGC_TRANSITION_START(Op, DAG);
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
- case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
}
}
@@ -24845,19 +25411,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
auto InVT = N->getValueType(0);
- auto InVTSize = InVT.getSizeInBits();
- const unsigned RegSize =
- (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
- assert((Subtarget.hasBWI() || RegSize < 512) &&
- "512-bit vector requires AVX512BW");
- assert((Subtarget.hasAVX2() || RegSize < 256) &&
- "256-bit vector requires AVX2");
-
- auto ElemVT = InVT.getVectorElementType();
- auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
- RegSize / ElemVT.getSizeInBits());
- assert(RegSize % InVT.getSizeInBits() == 0);
- unsigned NumConcat = RegSize / InVT.getSizeInBits();
+ assert(InVT.getSizeInBits() < 128);
+ assert(128 % InVT.getSizeInBits() == 0);
+ unsigned NumConcat = 128 / InVT.getSizeInBits();
+
+ EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(),
+ NumConcat * InVT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
@@ -24866,12 +25426,32 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
+ case ISD::SETCC: {
+ // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
+ // setCC result type is v2i1 because type legalzation will end up with
+ // a v4i1 setcc plus an extend.
+ assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
+ if (N->getOperand(0).getValueType() != MVT::v2f32)
+ return;
+ SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
+ N->getOperand(2));
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ return;
+ }
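
Purely as an illustration of what the widened compare computes (the SETLT condition code is an arbitrary choice here, and the upper lanes, which are undefined in the DAG, are zeroed only for determinism):

#include <immintrin.h>
#include <cstdint>

// The two f32 elements are placed in the low lanes of 128-bit vectors,
// compared as v4f32, and only the low two i32 results are meaningful.
void setccV2f32(const float A[2], const float B[2], int32_t Out[2]) {
  __m128 LHS = _mm_setr_ps(A[0], A[1], 0.0f, 0.0f);
  __m128 RHS = _mm_setr_ps(B[0], B[1], 0.0f, 0.0f);
  __m128i Res = _mm_castps_si128(_mm_cmplt_ps(LHS, RHS));
  alignas(16) int32_t Tmp[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(Tmp), Res);
  Out[0] = Tmp[0];
  Out[1] = Tmp[1];
}
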
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
@@ -24900,12 +25480,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
- if (N->getValueType(0) == MVT::v2i32) {
+ if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- SDValue Src = N->getOperand(0);
if (Src.getValueType() == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -24918,20 +25500,21 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Src, DAG.getIntPtrConstant(0, dl));
}
SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
- ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
- : MVT::v2i32;
+ bool WidenType = getTypeAction(*DAG.getContext(),
+ MVT::v2i32) == TypeWidenVector;
+ ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
- if (Src.getValueType() == MVT::v2f32) {
+ if (SrcVT == MVT::v2f32) {
SDValue Idx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
@@ -24942,11 +25525,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
+ if (Subtarget.hasDQI() && VT == MVT::i64 &&
+ (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
+ assert(!Subtarget.is64Bit() && "i64 should be legal");
+ unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
+ // Using a 256-bit input here to guarantee 128-bit input for f32 case.
+ // TODO: Use 128-bit vectors for f64 case?
+ // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
+ MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
+ MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
+ DAG.getConstantFP(0.0, dl, VecInVT), Src,
+ ZeroIdx);
+ Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
+ Results.push_back(Res);
+ return;
+ }
+
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
if (FIST.getNode()) {
- EVT VT = N->getValueType(0);
// Return a load from the stack slot.
if (StackSlot.getNode())
Results.push_back(
@@ -25132,6 +25734,32 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
+ // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
+ // we can split using the k-register rather than memory.
+ if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
+ assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ Lo = DAG.getBitcast(MVT::i32, Lo);
+ Hi = DAG.getBitcast(MVT::i32, Hi);
+ SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+
+ // Custom splitting for BWI types when AVX512F is available but BWI isn't.
+ if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
+ SrcVT.isVector() && isTypeLegal(SrcVT)) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
+ Lo = DAG.getBitcast(CastVT, Lo);
+ Hi = DAG.getBitcast(CastVT, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+
if (SrcVT != MVT::f64 ||
(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
return;
@@ -25143,7 +25771,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MVT::v2f64, N->getOperand(0));
SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
- if (ExperimentalVectorWideningLegalization) {
+ if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
// If we are legalizing vectors by widening, we already have the desired
// legal vector type, just return it.
Results.push_back(ToVecInt);
@@ -25178,7 +25806,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
- Index };
+ Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
@@ -25205,12 +25833,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
- Index };
+ Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
SDValue Chain = Res.getValue(2);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
@@ -25226,12 +25854,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getConstant(0, dl, MVT::v2i1));
SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
- Index };
+ Index, Gather->getScale() };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
Gather->getMemoryVT(), dl, Ops,
Gather->getMemOperand());
SDValue Chain = Res.getValue(1);
- if (!ExperimentalVectorWideningLegalization)
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
@@ -25270,7 +25898,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::CMPMU: return "X86ISD::CMPMU";
case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
@@ -25361,7 +25988,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
- case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
@@ -25377,8 +26003,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
- case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
- case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
@@ -25395,14 +26019,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
- case X86ISD::TESTM: return "X86ISD::TESTM";
- case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::KADD: return "X86ISD::KADD";
case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
@@ -25420,8 +26044,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SHUF128: return "X86ISD::SHUF128";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
- case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
- case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
@@ -25437,7 +26059,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
- case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
@@ -25477,26 +26098,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
- case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
- case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
- case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
- case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
- case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
- case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
- case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
- case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
- case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
- case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
- case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
- case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
- case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
- case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
- case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
- case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
- case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
- case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
- case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
- case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
@@ -25511,8 +26112,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
- case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
- case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
+ case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
+ case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
@@ -25581,6 +26182,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
+ case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
+ case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
+ case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
+ case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
}
return nullptr;
}
@@ -25647,11 +26252,20 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
if (Bits == 8)
return false;
+ // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
+ if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
+ (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
+ return false;
+
// AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
// shifts just as cheap as scalar ones.
if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
return false;
+ // AVX512BW has shifts such as vpsllvw.
+ if (Subtarget.hasBWI() && Bits == 16)
+ return false;
+
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
// fully general vector.
return true;
@@ -25730,7 +26344,15 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
-bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ EVT SrcVT = ExtVal.getOperand(0).getValueType();
+
+ // There is no extending load for vXi1.
+ if (SrcVT.getScalarType() == MVT::i1)
+ return false;
+
+ return true;
+}
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
@@ -25779,9 +26401,14 @@ bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
return isTypeLegal(VT.getSimpleVT());
}
-bool
-X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
- EVT VT) const {
+bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
+ EVT VT) const {
+ // Don't convert an 'and' into a shuffle that we don't directly support.
+ // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
+ if (!Subtarget.hasAVX2())
+ if (VT == MVT::v32i8 || VT == MVT::v16i16)
+ return false;
+
// Just delegate to the generic legality, clear masks aren't special.
return isShuffleMaskLegal(Mask, VT);
}
@@ -25875,79 +26502,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
-// or XMM0_V32I8 in AVX all of this code can be replaced with that
-// in the .td file.
-static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
- case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
- case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
- case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
- case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
- case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
- case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
- case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
- }
-
- DebugLoc dl = MI.getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
- unsigned NumArgs = MI.getNumOperands();
- for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!(Op.isReg() && Op.isImplicit()))
- MIB.add(Op);
- }
- if (MI.hasOneMemOperand())
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::XMM0);
-
- MI.eraseFromParent();
- return BB;
-}
-
-// FIXME: Custom handling because TableGen doesn't support multiple implicit
-// defs in an instruction pattern
-static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
- case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
- case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
- case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
- case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
- case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
- case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
- case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
- }
-
- DebugLoc dl = MI.getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
- unsigned NumArgs = MI.getNumOperands(); // remove the results
- for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!(Op.isReg() && Op.isImplicit()))
- MIB.add(Op);
- }
- if (MI.hasOneMemOperand())
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::ECX);
-
- MI.eraseFromParent();
- return BB;
-}
-
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget) {
DebugLoc dl = MI.getDebugLoc();
@@ -26336,7 +26890,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
!MI.getOperand(MI.getNumOperands() - 1).isReg() ||
MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
"Expected last argument to be EFLAGS");
- unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
@@ -27221,6 +27775,60 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
return BB;
}
+/// A SetJmp implies a future control-flow change upon calling the
+/// corresponding LongJmp.
+/// Instead of using the 'return' instruction, the long jump fixes the stack
+/// and performs an indirect branch. To do so it uses the registers that were
+/// stored in the jump buffer (when calling SetJmp).
+/// If the shadow stack is enabled, we need to fix it as well, because some
+/// return addresses will be skipped.
+/// The function will save the SSP for future fixing in the function
+/// emitLongJmpShadowStackFix.
+/// \sa emitLongJmpShadowStackFix
+/// \param [in] MI The temporary Machine Instruction for the builtin.
+/// \param [in] MBB The Machine Basic Block that will be modified.
+void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB;
+
+ // Memory Reference.
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+ // Initialize a register with zero.
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
+ BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
+ .addDef(ZReg)
+ .addReg(ZReg, RegState::Undef)
+ .addReg(ZReg, RegState::Undef);
+
+ // Read the current SSP Register value to the zeroed register.
+ unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
+ BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
+
+ // Write the SSP register value to offset 3 in input memory buffer.
+ unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
+ const int64_t SSPOffset = 3 * PVT.getStoreSize();
+ const unsigned MemOpndSlot = 1;
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
+ else
+ MIB.add(MI.getOperand(MemOpndSlot + i));
+ }
+ MIB.addReg(SSPCopyReg);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+}
+
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -27330,6 +27938,11 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
else
MIB.addMBB(restoreMBB);
MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+ emitSetJmpShadowStackFix(MI, thisMBB);
+ }
+
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
@@ -27371,6 +27984,183 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
return sinkMBB;
}
+/// Fix the shadow stack using the previously saved SSP pointer.
+/// \sa emitSetJmpShadowStackFix
+/// \param [in] MI The temporary Machine Instruction for the builtin.
+/// \param [in] MBB The Machine Basic Block that will be modified.
+/// \return The sink MBB that will perform the future indirect branch.
+MachineBasicBlock *
+X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+
+ // checkSspMBB:
+ // xor vreg1, vreg1
+ // rdssp vreg1
+ // test vreg1, vreg1
+ // je sinkMBB # Jump if Shadow Stack is not supported
+ // fallMBB:
+ // mov buf+24/12(%rip), vreg2
+ // sub vreg1, vreg2
+ // jbe sinkMBB # No need to fix the Shadow Stack
+ // fixShadowMBB:
+ // shr 3/2, vreg2
+ // incssp vreg2 # fix the SSP according to the lower 8 bits
+ // shr 8, vreg2
+ // je sinkMBB
+ // fixShadowLoopPrepareMBB:
+ // shl vreg2
+ // mov 128, vreg3
+ // fixShadowLoopMBB:
+ // incssp vreg3
+ // dec vreg2
+ // jne fixShadowLoopMBB # Iterate until you finish fixing
+ // # the Shadow Stack
+ // sinkMBB:
+
+ MachineFunction::iterator I = ++MBB->getIterator();
+ const BasicBlock *BB = MBB->getBasicBlock();
+
+ MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, checkSspMBB);
+ MF->insert(I, fallMBB);
+ MF->insert(I, fixShadowMBB);
+ MF->insert(I, fixShadowLoopPrepareMBB);
+ MF->insert(I, fixShadowLoopMBB);
+ MF->insert(I, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
+ MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ MBB->addSuccessor(checkSspMBB);
+
+ // Initialize a register with zero.
+ unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
+ BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
+ .addDef(ZReg)
+ .addReg(ZReg, RegState::Undef)
+ .addReg(ZReg, RegState::Undef);
+
+ // Read the current SSP Register value to the zeroed register.
+ unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
+ BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
+
+ // Check whether the result of the SSP register is zero and jump directly
+ // to the sink.
+ unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
+ BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
+ .addReg(SSPCopyReg)
+ .addReg(SSPCopyReg);
+ BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ checkSspMBB->addSuccessor(sinkMBB);
+ checkSspMBB->addSuccessor(fallMBB);
+
+ // Reload the previously saved SSP register value.
+ unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ const int64_t SPPOffset = 3 * PVT.getStoreSize();
+ MachineInstrBuilder MIB =
+ BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI.getOperand(i), SPPOffset);
+ else
+ MIB.add(MI.getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ // Subtract the current SSP from the previous SSP.
+ unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
+ unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
+ BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
+ .addReg(PrevSSPReg)
+ .addReg(SSPCopyReg);
+
+ // Jump to sink in case PrevSSPReg <= SSPCopyReg.
+ BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
+ fallMBB->addSuccessor(sinkMBB);
+ fallMBB->addSuccessor(fixShadowMBB);
+
+ // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
+ unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
+ unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
+ unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
+ .addReg(SspSubReg)
+ .addImm(Offset);
+
+ // Increase the SSP using only the lower 8 bits of the delta.
+ unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
+ BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
+
+ // Reset the lower 8 bits.
+ unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
+ .addReg(SspFirstShrReg)
+ .addImm(8);
+
+ // Jump if the result of the shift is zero.
+ BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ fixShadowMBB->addSuccessor(sinkMBB);
+ fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
+
+ // Do a single shift left.
+ unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
+ unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
+ .addReg(SspSecondShrReg);
+
+ // Save the value 128 to a register (will be used next with incssp).
+ unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
+ unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
+ BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
+ .addImm(128);
+ fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
+
+ // Since incssp only looks at the lower 8 bits, we might need to do several
+ // iterations of incssp until we finish fixing the shadow stack.
+ unsigned DecReg = MRI.createVirtualRegister(PtrRC);
+ unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
+ .addReg(SspAfterShlReg)
+ .addMBB(fixShadowLoopPrepareMBB)
+ .addReg(DecReg)
+ .addMBB(fixShadowLoopMBB);
+
+ // Every iteration we increase the SSP by 128.
+ BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
+
+ // Every iteration we decrement the counter by 1.
+ unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
+ BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
+
+ // Jump if the counter is not zero yet.
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
+ fixShadowLoopMBB->addSuccessor(sinkMBB);
+ fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
+
+ return sinkMBB;
+}
+
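
A minimal C-level sketch of the control flow the basic blocks above implement, assuming hypothetical rdssp/incssp callbacks that stand in for the RDSSP and INCSSP instructions; this only shows the arithmetic, not the MachineIR emission.

#include <cstdint>
#include <functional>

// incssp(N) advances the shadow stack pointer by N slots; the hardware only
// looks at the low 8 bits of N, which is why the loop below exists.
using RdsspFn  = std::function<uint64_t()>;
using IncsspFn = std::function<void(uint64_t)>;

// Sketch of emitLongJmpShadowStackFix for the 64-bit case (slot size 8 bytes,
// so the byte delta is shifted right by 3).
void fixShadowStackForLongJmp(uint64_t PrevSSP, RdsspFn Rdssp, IncsspFn Incssp) {
  uint64_t CurSSP = Rdssp();
  if (CurSSP == 0)            // Shadow stacks not supported or not enabled.
    return;
  if (PrevSSP <= CurSSP)      // Nothing to unwind.
    return;
  uint64_t Slots = (PrevSSP - CurSSP) >> 3;
  Incssp(Slots);              // Consumes the low 8 bits of the slot count.
  uint64_t Rem = Slots >> 8;
  for (uint64_t I = Rem << 1; I != 0; --I)
    Incssp(128);              // Each iteration advances 128 slots.
}
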
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -27403,13 +28193,21 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+ MachineBasicBlock *thisMBB = MBB;
+
+ // When CET and the shadow stack are enabled, we need to fix the shadow stack.
+ if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+ thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
+ }
+
// Reload FP
- MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
MIB.add(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
+
// Reload IP
- MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), LabelOffset);
@@ -27417,8 +28215,9 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
+
// Reload SP
- MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
@@ -27426,11 +28225,12 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
+
// Jump
- BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+ BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
MI.eraseFromParent();
- return MBB;
+ return thisMBB;
}
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
@@ -27503,7 +28303,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
- if (MI.isDebugValue())
+ if (MI.isDebugInstr())
continue;
assert(MI.isEHLabel() && "expected EH_LABEL");
@@ -27721,16 +28521,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
- case X86::TAILJMPd64:
- case X86::TAILJMPr64:
- case X86::TAILJMPm64:
- case X86::TAILJMPr64_REX:
- case X86::TAILJMPm64_REX:
- llvm_unreachable("TAILJMP64 would not be touched here.");
- case X86::TCRETURNdi64:
- case X86::TCRETURNri64:
- case X86::TCRETURNmi64:
- return BB;
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
@@ -27753,7 +28543,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR64:
- case X86::CMOV_FR128:
+ case X86::CMOV_F128:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
@@ -27873,32 +28663,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
- // String/text processing lowering.
- case X86::PCMPISTRM128REG:
- case X86::VPCMPISTRM128REG:
- case X86::PCMPISTRM128MEM:
- case X86::VPCMPISTRM128MEM:
- case X86::PCMPESTRM128REG:
- case X86::VPCMPESTRM128REG:
- case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- assert(Subtarget.hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
- return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
-
- // String/text processing lowering.
- case X86::PCMPISTRIREG:
- case X86::VPCMPISTRIREG:
- case X86::PCMPISTRIMEM:
- case X86::VPCMPISTRIMEM:
- case X86::PCMPESTRIREG:
- case X86::VPCMPESTRIREG:
- case X86::PCMPESTRIMEM:
- case X86::VPCMPESTRIMEM:
- assert(Subtarget.hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
- return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
-
// Thread synchronization.
case X86::MONITOR:
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
@@ -27945,8 +28709,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
- // Do nothing here, handle in xray instrumentation pass.
- return BB;
+ return emitXRayCustomEvent(MI, BB);
+
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return emitXRayTypedEvent(MI, BB);
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -28014,6 +28780,65 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
+bool
+X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
+ const APInt &Demanded,
+ TargetLoweringOpt &TLO) const {
+ // Only optimize Ands to prevent shrinking a constant that could be
+ // matched by movzx.
+ if (Op.getOpcode() != ISD::AND)
+ return false;
+
+ EVT VT = Op.getValueType();
+
+ // Ignore vectors.
+ if (VT.isVector())
+ return false;
+
+ unsigned Size = VT.getSizeInBits();
+
+ // Make sure the RHS really is a constant.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C)
+ return false;
+
+ const APInt &Mask = C->getAPIntValue();
+
+ // Clear all non-demanded bits initially.
+ APInt ShrunkMask = Mask & Demanded;
+
+ // Find the width of the shrunk mask.
+ unsigned Width = ShrunkMask.getActiveBits();
+
+ // If the mask is all 0s there's nothing to do here.
+ if (Width == 0)
+ return false;
+
+ // Find the next power of 2 width, rounding up to a byte.
+ Width = PowerOf2Ceil(std::max(Width, 8U));
+ // Truncate the width to size to handle illegal types.
+ Width = std::min(Width, Size);
+
+ // Calculate a possible zero extend mask for this constant.
+ APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+
+ // If we aren't changing the mask, just return true to keep it and prevent
+ // the caller from optimizing.
+ if (ZeroExtendMask == Mask)
+ return true;
+
+ // Make sure the new mask can be represented by a combination of mask bits
+ // and non-demanded bits.
+ if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
+ return false;
+
+ // Replace the constant with the zero extend mask.
+ SDLoc DL(Op);
+ SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
+ SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+}
+
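
The same policy can be sketched as a standalone function on plain 32-bit integers; the helper name is made up for illustration and the return convention differs from the TargetLowering hook (here, true means "replace the mask").

#include <algorithm>
#include <cstdint>

// Instead of shrinking an AND mask, try to round it *up* to a zero-extend
// style mask (0xFF, 0xFFFF, ...) so the AND can later be matched as movzx.
bool shrinkAndConstant(uint32_t Mask, uint32_t Demanded, uint32_t &NewMask) {
  uint32_t Shrunk = Mask & Demanded;
  if (Shrunk == 0)
    return false;
  unsigned Width = 32 - __builtin_clz(Shrunk);      // active bits
  Width = std::max(Width, 8u);
  unsigned P = 8;                                    // next power of two,
  while (P < Width) P *= 2;                          // rounded up to a byte
  Width = std::min(P, 32u);
  uint32_t ZExtMask = (Width == 32) ? 0xFFFFFFFFu : ((1u << Width) - 1);
  if (ZExtMask == Mask)
    return false;                                    // already ideal, keep it
  // Only legal if every bit we add back is either already set in the mask
  // or not demanded.
  if ((ZExtMask & ~(Mask | ~Demanded)) != 0)
    return false;
  NewMask = ZExtMask;
  return true;
}
// Example: Mask = 0x1FF, Demanded = 0xFF -> Shrunk = 0xFF, Width = 8,
// ZExtMask = 0xFF; bit 8 is not demanded, so the AND becomes a movzx-style
// mask of 0xFF.
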
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
@@ -28075,6 +28900,19 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case X86ISD::PACKUS: {
+ // PACKUS is just a truncation if the upper half is zero.
+ // TODO: Add DemandedElts support.
+ KnownBits Known2;
+ DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ if (Known.countMinLeadingZeros() < BitWidth)
+ Known.resetAll();
+ Known = Known.trunc(BitWidth);
+ break;
+ }
case X86ISD::VZEXT: {
// TODO: Add DemandedElts support.
SDValue N0 = Op.getOperand(0);
@@ -28113,6 +28951,57 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero.setBitsFrom(8);
break;
}
+
+ // Handle target shuffles.
+ // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+ if (isTargetShuffle(Opc)) {
+ bool IsUnary;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+ IsUnary)) {
+ unsigned NumOps = Ops.size();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (Mask.size() == NumElts) {
+ SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ // For UNDEF elements, we don't know anything about the common state
+ // of the shuffle result.
+ Known.resetAll();
+ break;
+ } else if (M == SM_SentinelZero) {
+ Known.One.clearAllBits();
+ continue;
+ }
+ assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+ "Shuffle index out of range");
+
+ unsigned OpIdx = (unsigned)M / NumElts;
+ unsigned EltIdx = (unsigned)M % NumElts;
+ if (Ops[OpIdx].getValueType() != VT) {
+ // TODO - handle target shuffle ops with different value types.
+ Known.resetAll();
+ break;
+ }
+ DemandedOps[OpIdx].setBit(EltIdx);
+ }
+ // Known bits are the values that are shared by every demanded element.
+ for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
+ if (!DemandedOps[i])
+ continue;
+ KnownBits Known2;
+ DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ }
+ }
+ }
+ }
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
@@ -28229,12 +29118,21 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
+ SDValue &V1, const SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
+ // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
+ if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
// Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
@@ -28477,7 +29375,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, SDValue &V2, SDLoc &DL,
+ SDValue &V1, SDValue &V2, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
@@ -28487,27 +29385,28 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
- Shuffle = X86ISD::MOVLHPS;
- SrcVT = DstVT = MVT::v4f32;
+ V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
+ Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
+ SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
- Shuffle = X86ISD::MOVHLPS;
- SrcVT = DstVT = MVT::v4f32;
+ Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
+ SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
- SrcVT = DstVT = MaskVT;
+ SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
- SrcVT = DstVT = MaskVT;
+ SrcVT = DstVT = MVT::v4f32;
return true;
}
}
@@ -28540,15 +29439,11 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return false;
}
-static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- const APInt &Zeroable,
- bool AllowFloatDomain,
- bool AllowIntDomain,
- SDValue &V1, SDValue &V2, SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT,
- unsigned &PermuteImm) {
+static bool matchBinaryPermuteVectorShuffle(
+ MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
+ bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
+ const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
@@ -28697,7 +29592,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return false;
}
-/// \brief Combine an arbitrary chain of shuffles into a single instruction if
+/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
@@ -28709,7 +29604,6 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
@@ -28742,6 +29636,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
+ (RootVT.isFloatingPoint() && Depth >= 2) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are an AVX512/EVEX target and the mask element size
@@ -28770,11 +29665,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
Res = DAG.getBitcast(ShuffleVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
DAG.getConstant(PermMask, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28840,9 +29733,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28853,10 +29744,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getConstant(PermuteImm, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
}
@@ -28870,11 +29759,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
- DCI.AddToWorklist(NewV1.getNode());
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
- DCI.AddToWorklist(NewV2.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28887,12 +29773,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
- DCI.AddToWorklist(NewV1.getNode());
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
- DCI.AddToWorklist(NewV2.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getConstant(PermuteImm, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28908,11 +29791,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28920,13 +29801,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(IntMaskVT, V2);
- DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
}
@@ -28956,11 +29834,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -28983,13 +29858,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
- DCI.AddToWorklist(Zero.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -29006,13 +29877,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPermMask.getNode());
V1 = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
- DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
@@ -29038,13 +29905,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
- DCI.AddToWorklist(BitMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
unsigned AndOpcode =
FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -29061,11 +29925,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
- DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -29097,14 +29958,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPerm2Idx.push_back(Index);
}
V1 = DAG.getBitcast(MaskVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
- DCI.AddToWorklist(V2.getNode());
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
- DCI.AddToWorklist(VPerm2MaskOp.getNode());
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -29136,11 +29993,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
- DCI.AddToWorklist(Res.getNode());
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
- DCI.AddToWorklist(PSHUFBMaskOp.getNode());
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -29169,13 +30023,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
- DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ByteVT, V2);
- DCI.AddToWorklist(V2.getNode());
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
- DCI.AddToWorklist(VPPERMMaskOp.getNode());
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
- DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
@@ -29186,11 +30036,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Attempt to constant fold all of the constant source ops.
// Returns the folded constant node if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
-static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask,
SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
@@ -29266,11 +30115,10 @@ static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
- DCI.AddToWorklist(CstOp.getNode());
return DAG.getBitcast(VT, CstOp);
}
-/// \brief Fully generic combining of x86 shuffle instructions.
+/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
@@ -29301,12 +30149,12 @@ static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
/// combining in this recursive walk.
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
- ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
- bool HasVariableMask, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
+ ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
+ bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
- if (Depth > 8)
+ const unsigned MaxRecursionDepth = 8;
+ if (Depth > MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
@@ -29459,17 +30307,21 @@ static SDValue combineX86ShufflesRecursively(
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should only be combined if it either has a
// single use (i.e. current Op) or all its users have already been combined.
- for (int i = 0, e = Ops.size(); i < e; ++i)
- if (Ops[i].getNode()->hasOneUse() ||
- SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
- if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
- DAG, DCI, Subtarget))
- return Res;
+ // Don't recurse if we already have more source ops than we can combine in
+ // the remaining recursion depth.
+ if (Ops.size() < (MaxRecursionDepth - Depth)) {
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (Ops[i].getNode()->hasOneUse() ||
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+ if (SDValue Res = combineX86ShufflesRecursively(
+ Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
+ DAG, Subtarget))
+ return Res;
+ }
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
- Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
+ Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
// We can only combine unary and binary shuffle mask cases.
@@ -29495,10 +30347,10 @@ static SDValue combineX86ShufflesRecursively(
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
- DCI, Subtarget);
+ Subtarget);
}
-/// \brief Get the PSHUF-style mask from PSHUF node.
+/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
@@ -29541,7 +30393,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
}
}
-/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
@@ -29674,7 +30526,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
return V;
}
-/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
@@ -29742,7 +30594,7 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
return true;
}
-/// \brief Try to combine x86 target specific shuffles.
+/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -29775,12 +30627,33 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
}
SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- DCI.AddToWorklist(Horiz.getNode());
return DAG.getBitcast(VT, Horiz);
}
}
switch (Opcode) {
+ case X86ISD::VBROADCAST: {
+ // If broadcasting from another shuffle, attempt to simplify it.
+ // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
+ SDValue Src = N.getOperand(0);
+ SDValue BC = peekThroughBitcasts(Src);
+ EVT SrcVT = Src.getValueType();
+ EVT BCVT = BC.getValueType();
+ if (isTargetShuffle(BC.getOpcode()) &&
+ VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
+ unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
+ SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
+ SM_SentinelUndef);
+ for (unsigned i = 0; i != Scale; ++i)
+ DemandedMask[i] = i;
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getBitcast(SrcVT, Res));
+ }
+ return SDValue();
+ }
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -29821,53 +30694,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
- case X86ISD::BLENDI: {
- SDValue V0 = N->getOperand(0);
- SDValue V1 = N->getOperand(1);
- assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
- "Unexpected input vector types");
-
- // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
- // operands and changing the mask to 1. This saves us a bunch of
- // pattern-matching possibilities related to scalar math ops in SSE/AVX.
- // x86InstrInfo knows how to commute this back after instruction selection
- // if it would help register allocation.
-
- // TODO: If optimizing for size or a processor that doesn't suffer from
- // partial register update stalls, this should be transformed into a MOVSD
- // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
-
- if (VT == MVT::v2f64)
- if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
- if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
- SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
- }
-
- return SDValue();
- }
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
- SDValue V0 = peekThroughBitcasts(N->getOperand(0));
- SDValue V1 = peekThroughBitcasts(N->getOperand(1));
- bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
- bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
- if (isZero0 && isZero1)
- return SDValue();
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
- // We often lower to MOVSD/MOVSS from integer as well as native float
- // types; remove unnecessary domain-crossing bitcasts if we can to make it
- // easier to combine shuffles later on. We've already accounted for the
- // domain switching cost when we decided to lower with it.
- bool isFloat = VT.isFloatingPoint();
- bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
- bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
- if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
- MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
- : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
- V0 = DAG.getBitcast(NewVT, V0);
- V1 = DAG.getBitcast(NewVT, V1);
- return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
+ // Canonicalize scalar FPOps:
+ // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
+ // If commutable, allow OP(N1[0], N0[0]).
+ unsigned Opcode1 = N1.getOpcode();
+ if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
+ Opcode1 == ISD::FDIV) {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ if (N10 == N0 ||
+ (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
+ if (N10 != N0)
+ std::swap(N10, N11);
+ MVT SVT = VT.getVectorElementType();
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+ N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
+ N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
+ SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
+ return DAG.getNode(Opcode, DL, VT, N0, SclVec);
+ }
}
return SDValue();
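
Illustrative sketch (not part of the patch): the canonicalization above is sound because MOVSS/MOVSD only take lane 0 from their second operand, so the vector FADD/FMUL/FSUB/FDIV can be shrunk to a single scalar op. A minimal standalone model in plain C++ (movss4 and fadd4 are ad-hoc names, not LLVM APIs):

#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

// Ad-hoc scalar models (not LLVM APIs): MOVSS takes lane 0 from B and lanes
// 1..3 from A; fadd4 is a plain lane-wise add.
static V4 movss4(const V4 &A, const V4 &B) { return {B[0], A[1], A[2], A[3]}; }
static V4 fadd4(const V4 &A, const V4 &B) {
  return {A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]};
}

int main() {
  V4 N0 = {1.0f, 2.0f, 3.0f, 4.0f}, N1 = {10.0f, 20.0f, 30.0f, 40.0f};
  // Original form: MOVSS(N0, FADD(N0, N1)); the upper lanes of the add are dead.
  V4 WideOp = movss4(N0, fadd4(N0, N1));
  // Canonical form: only the scalar lane-0 add survives.
  V4 ScalarOp = movss4(N0, {N0[0] + N1[0], 0.0f, 0.0f, 0.0f});
  assert(WideOp == ScalarOp);
  return 0;
}
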
@@ -29963,7 +30814,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))
- return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+ return N.getOperand(0);
// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
@@ -29987,10 +30838,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
DMask[DOffset + 1] = DOffset + 0;
MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
V = DAG.getBitcast(DVT, V);
- DCI.AddToWorklist(V.getNode());
V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
- DCI.AddToWorklist(V.getNode());
return DAG.getBitcast(VT, V);
}
@@ -30021,7 +30870,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
V = DAG.getBitcast(VT, D.getOperand(0));
- DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
DL, VT, V, V);
@@ -30041,6 +30889,37 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
+/// Checks if the shuffle mask takes subsequent elements
+/// alternately from two vectors.
+/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
+static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
+
+ int ParitySrc[2] = {-1, -1};
+ unsigned Size = Mask.size();
+ for (unsigned i = 0; i != Size; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+
+ // Make sure we are using the matching element from the input.
+ if ((M % Size) != i)
+ return false;
+
+ // Make sure we use the same input for all elements of the same parity.
+ int Src = M / Size;
+ if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
+ return false;
+ ParitySrc[i % 2] = Src;
+ }
+
+ // Make sure each input is used.
+ if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
+ return false;
+
+ Op0Even = ParitySrc[0] == 0;
+ return true;
+}
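
Illustrative sketch (not part of the patch): isAddSubOrSubAddMask accepts a mask only if every defined lane keeps its matching element, all even lanes come from one source, all odd lanes from the other, and both sources are used. A standalone re-implementation over std::vector<int> (isAddSubMask is a hypothetical name):

#include <cassert>
#include <vector>

// Standalone version of the parity check above. Op0Even reports whether
// operand 0 feeds the even lanes (which distinguishes ADDSUB from SUBADD).
static bool isAddSubMask(const std::vector<int> &Mask, bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                           // undef lane, ignore
    if ((M % Size) != i)
      return false;                       // must keep the matching element
    int Src = M / Size;                   // 0 = first operand, 1 = second
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;                       // one source per parity class
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;                         // both sources must be used
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even = false;
  assert(isAddSubMask({0, 5, 2, 7}, Op0Even) && Op0Even);
  assert(isAddSubMask({8, 1, 10, 3, 12, 5, 14, 7}, Op0Even) && !Op0Even);
  assert(!isAddSubMask({0, 1, 2, 3}, Op0Even)); // only one source used
  return 0;
}
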
+
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
@@ -30051,13 +30930,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
/// by this operation to try to flow through the rest of the combiner
/// the fact that they're unused.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
- SDValue &Opnd0, SDValue &Opnd1,
- bool matchSubAdd = false) {
+ SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
+ bool &IsSubAdd) {
EVT VT = N->getValueType(0);
- if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
- (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
+ !VT.getSimpleVT().isFloatingPoint())
return false;
// We only handle target-independent shuffles.
@@ -30066,21 +30945,13 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
- ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
- SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
-
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
- unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
- unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;
-
- // We require the first shuffle operand to be the ExpectedOpcode node,
- // and the second to be the NextExpectedOpcode node.
- if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) {
- ShuffleVectorSDNode::commuteMask(Mask);
- std::swap(V1, V2);
- } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode)
+ // Make sure we have an FADD and an FSUB.
+ if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
+ (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
+ V1.getOpcode() == V2.getOpcode())
return false;
// If there are other uses of these operations we can't fold them.
@@ -30089,41 +30960,101 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
- SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
- if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
- (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
- return false;
+ SDValue LHS, RHS;
+ if (V1.getOpcode() == ISD::FSUB) {
+ LHS = V1->getOperand(0); RHS = V1->getOperand(1);
+ if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
+ (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
+ return false;
+ } else {
+ assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
+ LHS = V2->getOperand(0); RHS = V2->getOperand(1);
+ if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
+ (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
+ return false;
+ }
- // We're looking for blends between FADD and FSUB nodes. We insist on these
- // nodes being lined up in a specific expected pattern.
- if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
- 8, 25, 10, 27, 12, 29, 14, 31})))
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ bool Op0Even;
+ if (!isAddSubOrSubAddMask(Mask, Op0Even))
return false;
+ // It's a subadd if the vector in the even parity is an FADD.
+ IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
+ : V2->getOpcode() == ISD::FADD;
+
Opnd0 = LHS;
Opnd1 = RHS;
return true;
}
-/// \brief Try to combine a shuffle into a target-specific add-sub or
+/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
+static SDValue combineShuffleToFMAddSub(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // We only handle target-independent shuffles.
+ // FIXME: It would be easy and harmless to use the target shuffle mask
+ // extraction tool to support more.
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue FMAdd = Op0, FMSub = Op1;
+ if (FMSub.getOpcode() != X86ISD::FMSUB)
+ std::swap(FMAdd, FMSub);
+
+ if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
+ FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
+ FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
+ FMAdd.getOperand(2) != FMSub.getOperand(2))
+ return SDValue();
+
+ // Check for correct shuffle mask.
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ bool Op0Even;
+ if (!isAddSubOrSubAddMask(Mask, Op0Even))
+ return SDValue();
+
+ // FMAddSub takes zeroth operand from FMSub node.
+ SDLoc DL(N);
+ bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
+ unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
+ FMAdd.getOperand(2));
+}
+
+/// Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
+ return V;
+
SDValue Opnd0, Opnd1;
- if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1))
+ bool IsSubAdd;
+ if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
return SDValue();
- EVT VT = N->getValueType(0);
+ MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
- return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
+ unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
+ }
+
+ if (IsSubAdd)
+ return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
@@ -30134,26 +31065,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
-/// \brief Try to combine a shuffle into a target-specific
-/// mul-sub-add node.
-static SDValue combineShuffleToFMSubAdd(SDNode *N,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDValue Opnd0, Opnd1;
- if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true))
- return SDValue();
-
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
-
- // Try to generate X86ISD::FMSUBADD node here.
- SDValue Opnd2;
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
- return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
-
- return SDValue();
-}
-
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
@@ -30213,8 +31124,8 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
- // TODO: 256-bit is not the same because...x86.
- if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
+ // TODO: Handle UNDEF operands.
+ if (HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// When the operands of a horizontal math op are identical, the low half of
@@ -30225,9 +31136,17 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
- if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
- isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
- isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
+ if (HOp.getValueSizeInBits() == 128 &&
+ (isTargetShuffleEquivalent(Mask, {0, 0}) ||
+ isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
+ isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
+ return HOp;
+
+ if (HOp.getValueSizeInBits() == 256 &&
+ (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
+ isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
+ isTargetShuffleEquivalent(
+ Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return HOp;
return SDValue();
@@ -30245,9 +31164,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
- if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
- return FMSubAdd;
-
if (SDValue HAddSub = foldShuffleOfHorizOp(N))
return HAddSub;
}
@@ -30351,10 +31267,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
return SDValue();
@@ -30538,17 +31452,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDLoc DL(BitCast);
SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
- if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
- // Handle pre-AVX2 cases by splitting to two v16i1's.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
- SDValue Lo = extract128BitVector(V, 0, DAG, DL);
- SDValue Hi = extract128BitVector(V, 16, DAG, DL);
- Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
- Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
- Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
- DAG.getConstant(16, DL, ShiftTy));
- V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+ if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
+ V = getPMOVMSKB(DL, V, DAG, Subtarget);
return DAG.getZExtOrTrunc(V, DL, VT);
}
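
Illustrative sketch (not part of the patch): the getPMOVMSKB path works because (V)PMOVMSKB gathers the sign bit of every byte into a scalar mask, which is exactly what the bitcast of a sign-extended vXi1 vector needs. A scalar model (movemask_bytes is a hypothetical helper, not an intrinsic or LLVM name):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar model of (V)PMOVMSKB: bit i of the result is the sign bit of byte i.
// A vXi1 mask sign-extended to bytes therefore bitcasts to exactly this value.
static std::uint32_t movemask_bytes(const std::vector<std::int8_t> &Bytes) {
  std::uint32_t Mask = 0;
  for (std::size_t i = 0; i < Bytes.size(); ++i)
    if (Bytes[i] < 0)                 // byte's sign bit is set
      Mask |= 1u << i;
  return Mask;
}

int main() {
  // v16i1 <1,0,1,1,0,...> sign-extended to v16i8 becomes 0xFF / 0x00 bytes.
  std::vector<std::int8_t> V(16, 0);
  V[0] = V[2] = V[3] = -1;
  assert(movemask_bytes(V) == 0xDu);
  return 0;
}
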
@@ -30565,6 +31470,153 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
return DAG.getZExtOrTrunc(V, DL, VT);
}
+// Convert a vXi1 constant build vector to the same width scalar integer.
+static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
+ EVT SrcVT = Op.getValueType();
+ assert(SrcVT.getVectorElementType() == MVT::i1 &&
+ "Expected a vXi1 vector");
+ assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+ "Expected a constant build vector");
+
+ APInt Imm(SrcVT.getVectorNumElements(), 0);
+ for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
+ SDValue In = Op.getOperand(Idx);
+ if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
+ Imm.setBit(Idx);
+ }
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
+ return DAG.getConstant(Imm, SDLoc(Op), IntVT);
+}
+
+static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
+
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Only do this if we have k-registers.
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ EVT DstVT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT SrcVT = Op.getValueType();
+
+ if (!Op.hasOneUse())
+ return SDValue();
+
+ // Look for logic ops.
+ if (Op.getOpcode() != ISD::AND &&
+ Op.getOpcode() != ISD::OR &&
+ Op.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ // Make sure we have a bitcast between mask registers and a scalar type.
+ if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ DstVT.isScalarInteger()) &&
+ !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
+ SrcVT.isScalarInteger()))
+ return SDValue();
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
+ LHS.getOperand(0).getValueType() == DstVT)
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
+ DAG.getBitcast(DstVT, RHS));
+
+ if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
+ RHS.getOperand(0).getValueType() == DstVT)
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+ DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
+
+ // If the RHS is a vXi1 build vector, this is a good reason to flip too.
+ // Most of these have to move a constant from the scalar domain anyway.
+ if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
+ RHS = combinevXi1ConstantToInteger(RHS, DAG);
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+ DAG.getBitcast(DstVT, LHS), RHS);
+ }
+
+ return SDValue();
+}
+
+static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ unsigned NumElts = N.getNumOperands();
+
+ auto *BV = cast<BuildVectorSDNode>(N);
+ SDValue Splat = BV->getSplatValue();
+
+ // Build MMX element from integer GPR or SSE float values.
+ auto CreateMMXElement = [&](SDValue V) {
+ if (V.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+ if (V.getValueType().isFloatingPoint()) {
+ if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
+ V = DAG.getBitcast(MVT::v2i64, V);
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
+ }
+ V = DAG.getBitcast(MVT::i32, V);
+ } else {
+ V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
+ }
+ return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
+ };
+
+ // Convert build vector ops to MMX data in the bottom elements.
+ SmallVector<SDValue, 8> Ops;
+
+ // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
+ if (Splat) {
+ if (Splat.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+
+ Splat = CreateMMXElement(Splat);
+
+ if (Subtarget.hasSSE1()) {
+ // Unpack v8i8 to splat i8 elements to lowest 16-bits.
+ if (NumElts == 8)
+ Splat = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
+ Splat);
+
+ // Use PSHUFW to repeat 16-bit elements.
+ unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
+ DAG.getConstant(ShufMask, DL, MVT::i8));
+ }
+ Ops.append(NumElts, Splat);
+ } else {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Ops.push_back(CreateMMXElement(N.getOperand(i)));
+ }
+
+ // Use tree of PUNPCKLs to build up general MMX vector.
+ while (Ops.size() > 1) {
+ unsigned NumOps = Ops.size();
+ unsigned IntrinOp =
+ (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
+ : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
+ : Intrinsic::x86_mmx_punpcklbw));
+ SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
+ for (unsigned i = 0; i != NumOps; i += 2)
+ Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
+ Ops[i], Ops[i + 1]);
+ Ops.resize(NumOps / 2);
+ }
+
+ return Ops[0];
+}
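
Illustrative sketch (not part of the patch): the loop above assembles the MMX value with a log2(N)-level tree, PUNPCKLBW pairing bytes, PUNPCKLWD pairing words, PUNPCKLDQ pairing dwords. A generic model of that pairwise-reduction shape (reducePairwise is a hypothetical helper; it assumes a power-of-two operand count, as in the MMX case):

#include <cassert>
#include <cstddef>
#include <vector>

// Generic shape of the "tree of PUNPCKLs": combine adjacent pairs until a
// single value remains, giving log2(N) levels for N leaves (illustrative only).
template <typename T, typename F>
static T reducePairwise(std::vector<T> Ops, F Combine) {
  assert(!Ops.empty() && (Ops.size() & (Ops.size() - 1)) == 0);
  while (Ops.size() > 1) {
    for (std::size_t i = 0; i != Ops.size(); i += 2)
      Ops[i / 2] = Combine(Ops[i], Ops[i + 1]);
    Ops.resize(Ops.size() / 2);
  }
  return Ops[0];
}

int main() {
  // Model each MMX register by the meaningful low elements it holds; a low
  // unpack of two such registers simply concatenates those low parts.
  std::vector<std::vector<int>> Regs = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}};
  auto Unpack = [](std::vector<int> A, const std::vector<int> &B) {
    A.insert(A.end(), B.begin(), B.end());
    return A;
  };
  std::vector<int> Out = reducePairwise(Regs, Unpack);
  assert(Out.size() == 8 && Out[0] == 0 && Out[7] == 7);
  return 0;
}
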
+
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -30585,7 +31637,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
- Subtarget.hasVLX()) {
+ Subtarget.hasAVX512()) {
SDLoc dl(N);
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
@@ -30596,7 +31648,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
- Subtarget.hasVLX()) {
+ Subtarget.hasAVX512()) {
SDLoc dl(N);
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
@@ -30610,36 +31662,92 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
+ if (VT == MVT::x86mmx) {
+ // Detect MMX constant vectors.
+ APInt UndefElts;
+ SmallVector<APInt, 1> EltBits;
+ if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
+ SDLoc DL(N0);
+ // Handle zero-extension of i32 with MOVD.
+ if (EltBits[0].countLeadingZeros() >= 32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
+ DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
+ // Else, bitcast to a double.
+ // TODO - investigate supporting sext 32-bit immediates on x86_64.
+ APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
+ return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
+ }
+
+ // Detect bitcasts to x86mmx low word.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+ (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
+ N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
+ bool LowUndef = true, AllUndefOrZero = true;
+ for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
+ SDValue Op = N0.getOperand(i);
+ LowUndef &= Op.isUndef() || (i >= e/2);
+ AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
+ }
+ if (AllUndefOrZero) {
+ SDValue N00 = N0.getOperand(0);
+ SDLoc dl(N00);
+ N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
+ : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
+ return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
+ }
+ }
+
+ // Detect bitcasts of 64-bit build vectors and convert to a
+ // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
+ // lowest element.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+ (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
+ SrcVT == MVT::v8i8))
+ return createMMXBuildVector(N0, DAG, Subtarget);
+
+ // Detect bitcasts between element or subvector extraction to x86mmx.
+ if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getValueType().is128BitVector())
+ return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+ DAG.getBitcast(MVT::v2i64, N00));
+ }
- // Detect bitcasts between i32 to x86mmx low word.
- if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
- SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
- SDValue N00 = N0->getOperand(0);
- if (N00.getValueType() == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+ // Detect bitcasts from FP_TO_SINT to x86mmx.
+ if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
+ SDLoc DL(N0);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+ DAG.getBitcast(MVT::v2i64, Res));
+ }
}
- // Detect bitcasts between element or subvector extraction to x86mmx.
- if (VT == MVT::x86mmx &&
- (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
- N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
- isNullConstant(N0.getOperand(1))) {
- SDValue N00 = N0->getOperand(0);
- if (N00.getValueType().is128BitVector())
- return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
- DAG.getBitcast(MVT::v2i64, N00));
+ // Try to remove a bitcast of constant vXi1 vector. We have to legalize
+ // most of these to scalar anyway.
+ if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
+ SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ return combinevXi1ConstantToInteger(N0, DAG);
}
- // Detect bitcasts from FP_TO_SINT to x86mmx.
- if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
- N0.getOpcode() == ISD::FP_TO_SINT) {
- SDLoc DL(N0);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
- DAG.getUNDEF(MVT::v2i32));
- return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
- DAG.getBitcast(MVT::v2i64, Res));
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isa<ConstantSDNode>(N0)) {
+ auto *C = cast<ConstantSDNode>(N0);
+ if (C->isAllOnesValue())
+ return DAG.getConstant(1, SDLoc(N0), VT);
+ if (C->isNullValue())
+ return DAG.getConstant(0, SDLoc(N0), VT);
}
+ // Try to remove bitcasts from input and output of mask arithmetic to
+ // remove GPR<->K-register crossings.
+ if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
+ return V;
+
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
@@ -30812,8 +31920,8 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
- const SDValue &Zext1, const SDLoc &DL) {
-
+ const SDValue &Zext1, const SDLoc &DL,
+ const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
unsigned RegSize = std::max(128u, InVT.getSizeInBits());
@@ -30828,9 +31936,15 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
Ops[0] = Zext1.getOperand(0);
SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
- // Actually build the SAD
+ // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+ auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
+ return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
+ };
MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
- return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
+ return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
+ PSADBWBuilder);
}
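
Illustrative sketch (not part of the patch): SplitOpsAndApply lets one builder lambda serve SSE/AVX2/AVX512BW by cutting over-wide operands into legal-width pieces and concatenating the partial results. A rough standalone model of that pattern (splitOpsAndApply below works on byte vectors and is not the LLVM helper; the abs-diff builder merely stands in for PSADBW):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

using Vec = std::vector<std::uint8_t>;

// Illustrative model of the split-and-apply idea: cut over-wide operands into
// legal-width pieces, run the builder on each piece, concatenate the results.
static Vec splitOpsAndApply(const Vec &A, const Vec &B, std::size_t LegalWidth,
                            const std::function<Vec(const Vec &, const Vec &)> &Builder) {
  assert(A.size() == B.size() && A.size() % LegalWidth == 0);
  Vec Result;
  for (std::size_t Lo = 0; Lo < A.size(); Lo += LegalWidth) {
    Vec PartA(A.begin() + Lo, A.begin() + Lo + LegalWidth);
    Vec PartB(B.begin() + Lo, B.begin() + Lo + LegalWidth);
    Vec Part = Builder(PartA, PartB);
    Result.insert(Result.end(), Part.begin(), Part.end());
  }
  return Result;
}

int main() {
  // Stand-in builder: per-byte absolute difference (PSADBW itself also sums).
  auto AbsDiff = [](const Vec &X, const Vec &Y) {
    Vec R(X.size());
    for (std::size_t i = 0; i < X.size(); ++i)
      R[i] = X[i] > Y[i] ? X[i] - Y[i] : Y[i] - X[i];
    return R;
  };
  Vec A(32, 9), B(32, 4);
  Vec R = splitOpsAndApply(A, B, 16, AbsDiff); // two 16-byte "registers"
  assert(R.size() == 32 && R[0] == 5 && R[31] == 5);
  return 0;
}
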
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
@@ -30997,12 +32111,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
return SDValue();
unsigned RegSize = 128;
- if (Subtarget.hasBWI())
+ if (Subtarget.useBWIRegs())
RegSize = 512;
- else if (Subtarget.hasAVX2())
+ else if (Subtarget.hasAVX())
RegSize = 256;
- // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
+ // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (RegSize / VT.getVectorNumElements() < 8)
@@ -31037,7 +32151,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// Create the SAD instruction.
SDLoc DL(Extract);
- SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+ SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
@@ -31208,8 +32322,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
isa<ConstantSDNode>(EltIdx) &&
isa<ConstantSDNode>(InputVector.getOperand(0))) {
uint64_t ExtractedElt = N->getConstantOperandVal(1);
- uint64_t InputValue = InputVector.getConstantOperandVal(0);
- uint64_t Res = (InputValue >> ExtractedElt) & 1;
+ auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
+ const APInt &InputValue = InputC->getAPIntValue();
+ uint64_t Res = InputValue[ExtractedElt];
return DAG.getConstant(Res, dl, MVT::i1);
}
@@ -31227,102 +32342,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
- // Only operate on vectors of 4 elements, where the alternative shuffling
- // gets to be more expensive.
- if (SrcVT != MVT::v4i32)
- return SDValue();
-
- // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
- // single use which is a sign-extend or zero-extend, and all elements are
- // used.
- SmallVector<SDNode *, 4> Uses;
- unsigned ExtractedElements = 0;
- for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
- UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
- if (UI.getUse().getResNo() != InputVector.getResNo())
- return SDValue();
-
- SDNode *Extract = *UI;
- if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- if (Extract->getValueType(0) != MVT::i32)
- return SDValue();
- if (!Extract->hasOneUse())
- return SDValue();
- if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
- Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
- return SDValue();
- if (!isa<ConstantSDNode>(Extract->getOperand(1)))
- return SDValue();
-
- // Record which element was extracted.
- ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
- Uses.push_back(Extract);
- }
-
- // If not all the elements were used, this may not be worthwhile.
- if (ExtractedElements != 15)
- return SDValue();
-
- // Ok, we've now decided to do the transformation.
- // If 64-bit shifts are legal, use the extract-shift sequence,
- // otherwise bounce the vector off the cache.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Vals[4];
-
- if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
- SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
- auto &DL = DAG.getDataLayout();
- EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
- SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(0, dl, VecIdxTy));
- SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(1, dl, VecIdxTy));
-
- SDValue ShAmt = DAG.getConstant(
- 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
- Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
- Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
- DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
- Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
- Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
- DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
- } else {
- // Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
- SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
- MachinePointerInfo());
-
- EVT ElementType = SrcVT.getVectorElementType();
- unsigned EltSize = ElementType.getSizeInBits() / 8;
-
- // Replace each use (extract) with a load of the appropriate element.
- for (unsigned i = 0; i < 4; ++i) {
- uint64_t Offset = EltSize * i;
- auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
-
- SDValue ScalarAddr =
- DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
-
- // Load the scalar.
- Vals[i] =
- DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
- }
- }
-
- // Replace the extracts
- for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
- UE = Uses.end(); UI != UE; ++UI) {
- SDNode *Extract = *UI;
-
- uint64_t IdxVal = Extract->getConstantOperandVal(1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
- }
-
- // The replacement was made in place; return N so it won't be revisited.
- return SDValue(N, 0);
+ return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
@@ -31351,8 +32371,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
- DAG.getAllOnesConstant(DL, CondVT));
+ SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
@@ -31491,68 +32510,77 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// If this is a bitcasted op that can be represented as another type, push the
-// the bitcast to the inputs. This allows more opportunities for pattern
-// matching masked instructions. This is called when we know that the operation
-// is used as one of the inputs of a vselect.
-static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Make sure we have a bitcast.
- if (OrigOp.getOpcode() != ISD::BITCAST)
- return false;
-
- SDValue Op = OrigOp.getOperand(0);
-
- // If the operation is used by anything other than the bitcast, we shouldn't
- // do this combine as that would replicate the operation.
- if (!Op.hasOneUse())
- return false;
+/// If this is a *dynamic* select (non-constant condition) and we can match
+/// this node with one of the variable blend instructions, restructure the
+/// condition so that blends can use the high (sign) bit of each element.
+static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Cond = N->getOperand(0);
+ if (N->getOpcode() != ISD::VSELECT ||
+ ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ // Don't optimize before the condition has been transformed to a legal type
+ // and don't ever optimize vector selects that map to AVX512 mask-registers.
+ unsigned BitWidth = Cond.getScalarValueSizeInBits();
+ if (BitWidth < 8 || BitWidth > 64)
+ return SDValue();
+
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = N->getValueType(0);
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getVectorElementType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.is128BitVector() && !Subtarget.hasSSE41())
+ return SDValue();
+ // Byte blends are only available in AVX2
+ if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
+ return SDValue();
+ // There are no 512-bit blend instructions that use sign bits.
+ if (VT.is512BitVector())
+ return SDValue();
- MVT VT = OrigOp.getSimpleValueType();
- MVT EltVT = VT.getVectorElementType();
- SDLoc DL(Op.getNode());
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+ UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
+ return SDValue();
- auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
- SDValue Op2) {
- Op0 = DAG.getBitcast(VT, Op0);
- DCI.AddToWorklist(Op0.getNode());
- Op1 = DAG.getBitcast(VT, Op1);
- DCI.AddToWorklist(Op1.getNode());
- DCI.CombineTo(OrigOp.getNode(),
- DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
- return true;
- };
+ APInt DemandedMask(APInt::getSignMask(BitWidth));
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ return SDValue();
- unsigned Opcode = Op.getOpcode();
- switch (Opcode) {
- case X86ISD::SHUF128: {
- if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
- return false;
- // Only change element size, not type.
- if (VT.isInteger() != Op.getSimpleValueType().isInteger())
- return false;
- return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
- Op.getOperand(2));
- }
- case X86ISD::SUBV_BROADCAST: {
- unsigned EltSize = EltVT.getSizeInBits();
- if (EltSize != 32 && EltSize != 64)
- return false;
- // Only change element size, not type.
- if (VT.isInteger() != Op.getSimpleValueType().isInteger())
- return false;
- SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = MVT::getVectorVT(EltVT,
- Op0.getSimpleValueType().getSizeInBits() / EltSize);
- Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
- DCI.AddToWorklist(Op0.getNode());
- DCI.CombineTo(OrigOp.getNode(),
- DAG.getNode(Opcode, DL, VT, Op0));
- return true;
- }
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Update all the nodes so that we do not use
+ // the generic VSELECT anymore. Otherwise, we may perform wrong
+ // optimizations as we messed with the actual expectation for the vector
+ // boolean values.
+ for (SDNode *U : Cond->uses()) {
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
+ Cond, U->getOperand(1), U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
}
-
- return false;
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
}
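
Illustrative sketch (not part of the patch): the SimplifyDemandedBits call above asks for only the sign bit of each condition element because the variable blend instructions (e.g. PBLENDVB) select per lane on that single bit. A scalar model of a byte blend (blendv is an ad-hoc name, not an intrinsic):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

using Bytes = std::vector<std::uint8_t>;

// Scalar model of a variable byte blend (PBLENDVB): each result byte comes
// from B when the sign bit of the matching condition byte is set, else from A.
// Only bit 7 of each condition byte is ever observed.
static Bytes blendv(const Bytes &Cond, const Bytes &A, const Bytes &B) {
  Bytes R(A.size());
  for (std::size_t i = 0; i < A.size(); ++i)
    R[i] = (Cond[i] & 0x80) ? B[i] : A[i];
  return R;
}

int main() {
  Bytes A(16, 1), B(16, 2);
  Bytes AllHigh(16, 0x80), NoHigh(16, 0x7F);
  assert(blendv(AllHigh, A, B) == B); // sign bit set everywhere, take B
  assert(blendv(NoHigh, A, B) == A);  // low bits alone never flip the choice
  return 0;
}
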
/// Do target-specific dag combines on SELECT and VSELECT nodes.
@@ -31568,6 +32596,23 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Convert vselects with constant condition into shuffles.
+ if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
+ DCI.isBeforeLegalizeOps()) {
+ SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
+ for (int i = 0, Size = Mask.size(); i != Size; ++i) {
+ SDValue CondElt = Cond->getOperand(i);
+ Mask[i] = i;
+ // Arbitrarily choose from the 2nd operand if the select condition element
+ // is undef.
+ // TODO: Can we do better by matching patterns such as even/odd?
+ if (CondElt.isUndef() || isNullConstant(CondElt))
+ Mask[i] += Size;
+ }
+
+ return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+ }
+
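
Illustrative sketch (not part of the patch): the new block turns a constant-condition vselect into a shuffle over the concatenation <LHS, RHS>, where a true lane keeps index i and a false or undef lane becomes i + Size. A small model of that mask construction (selectMaskToShuffleMask is a hypothetical name; undef condition lanes are represented as 0/false here):

#include <cassert>
#include <vector>

// How a constant-condition vselect maps onto a shuffle of <LHS, RHS>: a true
// lane keeps index i (from LHS), a false or undef lane uses i + Size (the
// matching RHS element).
static std::vector<int> selectMaskToShuffleMask(const std::vector<int> &Cond) {
  int Size = static_cast<int>(Cond.size());
  std::vector<int> Mask(Cond.size());
  for (int i = 0; i < Size; ++i)
    Mask[i] = Cond[i] ? i : i + Size;
  return Mask;
}

int main() {
  // vselect <1,0,0,1>, LHS, RHS --> shuffle LHS, RHS, <0,5,6,3>
  std::vector<int> Mask = selectMaskToShuffleMask({1, 0, 0, 1});
  assert((Mask == std::vector<int>{0, 5, 6, 3}));
  return 0;
}
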
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
@@ -31592,7 +32637,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
@@ -31602,7 +32648,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
break;
@@ -31621,7 +32667,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
break;
@@ -31631,7 +32677,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
@@ -31658,7 +32705,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
@@ -31694,7 +32742,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+ !DAG.isKnownNeverZeroFloat(LHS) &&
+ !DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
@@ -31718,19 +32767,38 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
+ // Some mask scalar intrinsics rely on checking if only one bit is set
+ // and implement it in C code like this:
+ // A[0] = (U & 1) ? A[0] : W[0];
+ // This creates some redundant instructions that break pattern matching.
+ // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
+ if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
+ Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ SDValue AndNode = Cond.getOperand(0);
+ if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
+ isNullConstant(Cond.getOperand(1)) &&
+ isOneConstant(AndNode.getOperand(1))) {
+ // LHS and RHS are swapped because the setcc outputs 1 when the AND
+ // result is 0, and vice versa.
+ AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
+ return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
+ }
+ }
+
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
- // The same situation for all 128 and 256-bit vectors of i8 and i16.
+ // The same situation applies to all vectors of i8 and i16 without BWI.
+ // Make sure we extend these even before type legalization gets a chance to
+ // split wide vectors.
// Since SKX these selects have a proper lowering.
- if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
- (VT.is128BitVector() || VT.is256BitVector()) &&
+ VT.getVectorNumElements() > 4 &&
(VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16) &&
- !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
+ VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
- DCI.AddToWorklist(Cond.getNode());
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
@@ -31776,7 +32844,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
- (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
@@ -31794,40 +32862,50 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
+ auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+ };
+
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
- return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+ SUBUSBuilder);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
- if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
- if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
- if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
- // If the RHS is a constant we have to reverse the const
- // canonicalization.
- // x > C-1 ? x+-C : 0 --> subus x, C
- if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- CondRHSConst->getAPIntValue() ==
- (-OpRHSConst->getAPIntValue() - 1))
- return DAG.getNode(
- X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
+ if (isa<BuildVectorSDNode>(CondRHS)) {
+ // If the RHS is a constant we have to reverse the const
+ // canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+ return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
+ };
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
+ OpRHS = DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT), OpRHS);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+ SUBUSBuilder);
+ }
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
- if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
- OpRHSConst->getAPIntValue().isSignMask())
- // Note that we have to rebuild the RHS constant here to ensure we
- // don't rely on particular values of undef lanes.
- return DAG.getNode(
- X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
+ if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
+ if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ OpRHSConst->getAPIntValue().isSignMask()) {
+ OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
+ // Note that we have to rebuild the RHS constant here to ensure we
+ // don't rely on particular values of undef lanes.
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+ SUBUSBuilder);
+ }
}
}
}
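The subus folds above all reduce to unsigned saturating subtraction per lane. A standalone scalar sketch (illustrative C++ only, not DAG code) checking the matched forms against that semantics:

#include <cassert>
#include <cstdint>

// Scalar model of unsigned saturating subtract, the operation PSUBUS
// performs independently on each vector lane.
static uint8_t subus8(uint8_t X, uint8_t Y) { return X > Y ? X - Y : 0; }

int main() {
  for (unsigned XI = 0; XI < 256; ++XI)
    for (unsigned YI = 0; YI < 256; ++YI) {
      uint8_t X = XI, Y = YI;
      // x >= y ? x - y : 0 (and the strict form) agree with subus.
      assert((uint8_t)(X >= Y ? X - Y : 0) == subus8(X, Y));
      // x > C - 1 ? x + -C : 0 --> subus x, C   (C == Y here, Y != 0).
      if (Y != 0)
        assert((uint8_t)(X > (uint8_t)(Y - 1) ? (uint8_t)(X - Y) : 0) ==
               subus8(X, Y));
    }
  return 0;
}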
@@ -31835,100 +32913,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
- // If this is a *dynamic* select (non-constant condition) and we can match
- // this node with one of the variable blend instructions, restructure the
- // condition so that blends can use the high (sign) bit of each element and
- // use SimplifyDemandedBits to simplify the condition operand.
- if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
- !DCI.isBeforeLegalize() &&
- !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
- unsigned BitWidth = Cond.getScalarValueSizeInBits();
-
- // Don't optimize vector selects that map to mask-registers.
- if (BitWidth == 1)
- return SDValue();
-
- // We can only handle the cases where VSELECT is directly legal on the
- // subtarget. We custom lower VSELECT nodes with constant conditions and
- // this makes it hard to see whether a dynamic VSELECT will correctly
- // lower, so we both check the operation's status and explicitly handle the
- // cases where a *dynamic* blend will fail even though a constant-condition
- // blend could be custom lowered.
- // FIXME: We should find a better way to handle this class of problems.
- // Potentially, we should combine constant-condition vselect nodes
- // pre-legalization into shuffles and not mark as many types as custom
- // lowered.
- if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
- return SDValue();
- // FIXME: We don't support i16-element blends currently. We could and
- // should support them by making *all* the bits in the condition be set
- // rather than just the high bit and using an i8-element blend.
- if (VT.getVectorElementType() == MVT::i16)
- return SDValue();
- // Dynamic blending was only available from SSE4.1 onward.
- if (VT.is128BitVector() && !Subtarget.hasSSE41())
- return SDValue();
- // Byte blends are only available in AVX2
- if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
- return SDValue();
- // There are no 512-bit blend instructions that use sign bits.
- if (VT.is512BitVector())
- return SDValue();
-
- assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
- APInt DemandedMask(APInt::getSignMask(BitWidth));
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
- TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
- // If we changed the computation somewhere in the DAG, this change will
- // affect all users of Cond. Make sure it is fine and update all the nodes
- // so that we do not use the generic VSELECT anymore. Otherwise, we may
- // perform wrong optimizations as we messed with the actual expectation
- // for the vector boolean values.
- if (Cond != TLO.Old) {
- // Check all uses of the condition operand to check whether it will be
- // consumed by non-BLEND instructions. Those may require that all bits
- // are set properly.
- for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
- UI != UE; ++UI) {
- // TODO: Add other opcodes eventually lowered into BLEND.
- if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
- return SDValue();
- }
-
- // Update all users of the condition before committing the change, so
- // that the VSELECT optimizations that expect the correct vector boolean
- // value will not be triggered.
- for (SDNode *U : Cond->uses()) {
- SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
- U->getValueType(0), Cond, U->getOperand(1),
- U->getOperand(2));
- DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
- }
- DCI.CommitTargetLoweringOpt(TLO);
- return SDValue();
- }
- // Only Cond (rather than other nodes in the computation chain) was
- // changed. Change the condition just for N to keep the opportunity to
- // optimize all other users their own way.
- SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
- return SDValue();
- }
- }
-
- // Look for vselects with LHS/RHS being bitcasted from an operation that
- // can be executed on another type. Push the bitcast to the inputs of
- // the operation. This exposes opportunities for using masking instructions.
- if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
- CondVT.getVectorElementType() == MVT::i1) {
- if (combineBitcastForMaskedOp(LHS, DAG, DCI))
- return SDValue(N, 0);
- if (combineBitcastForMaskedOp(RHS, DAG, DCI))
- return SDValue(N, 0);
- }
+ if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
+ return V;
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
@@ -32270,17 +33256,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
- if (CC == X86::COND_E || CC == X86::COND_NE) {
- switch (Cond.getOpcode()) {
- default: break;
- case X86ISD::BSR:
- case X86ISD::BSF:
- // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
- if (DAG.isKnownNeverZero(Cond.getOperand(0)))
- return (CC == X86::COND_E) ? FalseOp : TrueOp;
- }
- }
-
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
@@ -32450,6 +33425,36 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
}
}
+ // Handle (CMOV C-1, (ADD (CTTZ X), C), (X != 0)) ->
+ // (ADD (CMOV (CTTZ X), -1, (X != 0)), C) or
+ // (CMOV (ADD (CTTZ X), C), C-1, (X == 0)) ->
+ // (ADD (CMOV C-1, (CTTZ X), (X == 0)), C)
+ if (CC == X86::COND_NE || CC == X86::COND_E) {
+ auto *Cnst = CC == X86::COND_E ? dyn_cast<ConstantSDNode>(TrueOp)
+ : dyn_cast<ConstantSDNode>(FalseOp);
+ SDValue Add = CC == X86::COND_E ? FalseOp : TrueOp;
+
+ if (Cnst && Add.getOpcode() == ISD::ADD && Add.hasOneUse()) {
+ auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ SDValue AddOp2 = Add.getOperand(0);
+ if (AddOp1 && (AddOp2.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
+ AddOp2.getOpcode() == ISD::CTTZ)) {
+ APInt Diff = Cnst->getAPIntValue() - AddOp1->getAPIntValue();
+ if (CC == X86::COND_E) {
+ Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(), AddOp2,
+ DAG.getConstant(Diff, DL, Add.getValueType()),
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
+ } else {
+ Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(),
+ DAG.getConstant(Diff, DL, Add.getValueType()),
+ AddOp2, DAG.getConstant(CC, DL, MVT::i8), Cond);
+ }
+ return DAG.getNode(X86ISD::ADD, DL, Add.getValueType(), Add,
+ SDValue(AddOp1, 0));
+ }
+ }
+ }
+
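The CMOV/CTTZ rewrite above is plain reassociation: "x != 0 ? cttz(x) + C : C - 1" equals "(x != 0 ? cttz(x) : -1) + C", which moves the ADD out of the conditional value. A small hedged check (C == 1 is the ffs()-style case that commonly produces this pattern):

#include <cassert>
#include <cstdint>

int main() {
  const int C = 1; // e.g. ffs(x) = x ? cttz(x) + 1 : 0
  for (uint32_t X = 0; X < 1024; ++X) {
    int Before = X != 0 ? __builtin_ctz(X) + C : C - 1;
    int After = (X != 0 ? __builtin_ctz(X) : -1) + C;
    assert(Before == After);
  }
  return 0;
}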
return SDValue();
}
@@ -32577,13 +33582,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
if ((NumElts % 2) != 0)
return SDValue();
- // If the upper 17 bits of each element are zero then we can use PMADD.
- APInt Mask17 = APInt::getHighBitsSet(32, 17);
- if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
- DAG.MaskedValueIsZero(N1, Mask17))
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
- DAG.getBitcast(MVT::v8i16, N1));
-
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
@@ -32679,7 +33677,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
- EVT VT, SDLoc DL) {
+ EVT VT, const SDLoc &DL) {
auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
@@ -32691,10 +33689,11 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
return Result;
};
- auto combineMulMulAddOrSub = [&](bool isAdd) {
+ auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
- DAG.getConstant(9, DL, VT));
- Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+ DAG.getConstant(Mul1, DL, VT));
+ Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
+ DAG.getConstant(Mul2, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
@@ -32709,43 +33708,137 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+ case 41:
+ // mul x, 41 => add ((shl (mul x, 5), 3), x)
+ return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
- // mul x, 19 => sub ((shl (mul x, 5), 2), x)
- return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
+ // mul x, 19 => add ((shl (mul x, 9), 1), x)
+ return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
+ case 37:
+ // mul x, 37 => add ((shl (mul x, 9), 2), x)
+ return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
+ case 73:
+ // mul x, 73 => add ((shl (mul x, 9), 3), x)
+ return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
- // mul x, 13 => sub ((shl (mul x, 3), 3), x)
+ // mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
- case 14:
- // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
- return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
case 26:
- // mul x, 26 => sub ((mul (mul x, 9), 3), x)
- return combineMulMulAddOrSub(/*isAdd*/ false);
+ // mul x, 26 => add ((mul (mul x, 5), 5), x)
+ return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
- return combineMulMulAddOrSub(/*isAdd*/ true);
+ return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- combineMulMulAddOrSub(/*isAdd*/ true));
- case 30:
- // mul x, 30 => sub (sub ((shl x, 5), x), x)
- return DAG.getNode(
- ISD::SUB, DL, VT,
- DAG.getNode(ISD::SUB, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(5, DL, MVT::i8)),
- N->getOperand(0)),
- N->getOperand(0));
+ combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
+ }
+
+ // Another trick. If this is a power of 2 plus 2, 4 or 8, we can use a shift
+ // followed by a single LEA.
+ // First check whether this is a sum of two powers of 2, because that's easy.
+ // Then count the trailing zeros up to the first set bit.
+ // TODO: We can do this even without LEA at a cost of two shifts and an add.
+ if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+ unsigned ScaleShift = countTrailingZeros(MulAmt);
+ if (ScaleShift >= 1 && ScaleShift < 4) {
+ unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
+ SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(ScaleShift, DL, MVT::i8));
+ return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
+ }
+ }
+
+ return SDValue();
+}
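A worked check of the new two-shift decomposition, under the same conditions the code tests (exactly two set bits, with the low bit at position 1..3). Sketch only; the names mirror the variables above but the helpers are local:

#include <cassert>
#include <cstdint>

static bool isPow2(uint64_t V) { return V && !(V & (V - 1)); }

int main() {
  uint64_t X = 0x123456789ULL;
  for (uint64_t MulAmt = 3; MulAmt < (1u << 16); ++MulAmt) {
    uint64_t Rest = MulAmt & (MulAmt - 1);         // clear the lowest set bit
    unsigned ScaleShift = __builtin_ctzll(MulAmt); // position of the low bit
    if (!isPow2(Rest) || ScaleShift < 1 || ScaleShift >= 4)
      continue;
    unsigned ShiftAmt = __builtin_ctzll(Rest);     // log2 of the remaining bit
    // x * (2^ShiftAmt + 2^ScaleShift) == (x << ShiftAmt) + (x << ScaleShift)
    assert(X * MulAmt == (X << ShiftAmt) + (X << ScaleShift));
  }
  return 0;
}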
+
+// If the upper 17 bits of each element are zero then we can use PMADDWD,
+// which is always at least as quick as PMULLD, except on KNL.
+static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Only support vXi32 vectors.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
+ return SDValue();
+
+ // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
+ MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ APInt Mask17 = APInt::getHighBitsSet(32, 17);
+ if (!DAG.MaskedValueIsZero(N1, Mask17) ||
+ !DAG.MaskedValueIsZero(N0, Mask17))
+ return SDValue();
+
+ // Use SplitOpsAndApply to handle AVX splitting.
+ auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
+ PMADDWDBuilder);
+}
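Why the "upper 17 bits zero" condition is enough for combineMulToPMADDWD: each i32 lane splits into two i16 halves, the high halves are zero and the low halves are non-negative as i16, so the pairwise multiply-add reproduces the full 32-bit product. A scalar model of one lane (assumption: this mirrors PMADDWD's documented per-lane behaviour):

#include <cassert>
#include <cstdint>

// One 32-bit PMADDWD lane: signed 16-bit halves multiplied pairwise, summed.
static uint32_t pmaddwdLane(uint32_t A, uint32_t B) {
  int32_t ALo = (int16_t)(A & 0xFFFF), AHi = (int16_t)(A >> 16);
  int32_t BLo = (int16_t)(B & 0xFFFF), BHi = (int16_t)(B >> 16);
  return (uint32_t)(ALo * BLo + AHi * BHi);
}

int main() {
  // Upper 17 bits of both inputs zero -> the lane equals A * B exactly.
  for (uint32_t A = 0; A < 0x8000; A += 97)
    for (uint32_t B = 0; B < 0x8000; B += 101)
      assert(pmaddwdLane(A, B) == A * B);
  return 0;
}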
+
+static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Only support vXi64 vectors.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // PMULDQ returns the 64-bit result of the signed multiplication of the
+ // lower 32 bits. We can lower with this if the sign bits stretch that far.
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
+ DAG.ComputeNumSignBits(N1) > 32) {
+ auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULDQBuilder, /*CheckBWI*/false);
}
+
+ // If the upper bits are zero we can use a single pmuludq.
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
+ auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULUDQBuilder, /*CheckBWI*/false);
+ }
+
return SDValue();
}
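Scalar models of the two cases combineMulToPMULDQ handles: PMULUDQ multiplies the low 32 bits of each i64 lane as unsigned, PMULDQ as signed, each producing a full 64-bit product. Illustrative sketch, not the lowering itself:

#include <cassert>
#include <cstdint>

static uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return (uint64_t)(uint32_t)A * (uint64_t)(uint32_t)B;
}
static uint64_t pmuldqLane(uint64_t A, uint64_t B) {
  return (uint64_t)((int64_t)(int32_t)A * (int64_t)(int32_t)B);
}

int main() {
  // Upper 32 bits known zero -> pmuludq matches the i64 multiply.
  uint64_t A = 0x12345678, B = 0x0FEDCBA9;
  assert(pmuludqLane(A, B) == A * B);
  // More than 32 sign bits (value fits in i32) -> pmuldq matches too.
  int64_t C = -123456789, D = 987654;
  assert(pmuldqLane((uint64_t)C, (uint64_t)D) == (uint64_t)(C * D));
  return 0;
}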
@@ -32755,6 +33848,13 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
+
+ if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
+ return V;
+
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
@@ -32774,9 +33874,14 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (!C)
return SDValue();
uint64_t MulAmt = C->getZExtValue();
- if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ if (isPowerOf2_64(MulAmt))
return SDValue();
+ SDLoc DL(N);
+ if (MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ N->getOperand(1));
+
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((MulAmt % 9) == 0) {
@@ -32790,7 +33895,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
MulAmt2 = MulAmt / 3;
}
- SDLoc DL(N);
SDValue NewMul;
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
@@ -32824,39 +33928,47 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
"Both cases that could cause potential overflows should have "
"already been handled.");
int64_t SignMulAmt = C->getSExtValue();
- if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
- (SignMulAmt != -INT64_MAX)) {
- int NumSign = SignMulAmt > 0 ? 1 : -1;
- bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
- bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
- if (IsPowerOf2_64PlusOne) {
- // (mul x, 2^N + 1) => (add (shl x, N), x)
- NewMul = DAG.getNode(
- ISD::ADD, DL, VT, N->getOperand(0),
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
- MVT::i8)));
- } else if (IsPowerOf2_64MinusOne) {
- // (mul x, 2^N - 1) => (sub (shl x, N), x)
- NewMul = DAG.getNode(
- ISD::SUB, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
- MVT::i8)),
- N->getOperand(0));
- }
+ assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
+ uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
+ if (isPowerOf2_64(AbsMulAmt - 1)) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
+ MVT::i8)));
// To negate, subtract the number from zero
- if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
- NewMul =
- DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT), NewMul);
+ } else if (isPowerOf2_64(AbsMulAmt + 1)) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 1),
+ DL, MVT::i8));
+ // To negate, reverse the operands of the subtract.
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
+ else
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
+ // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt - 2),
+ DL, MVT::i8));
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
+ // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 2),
+ DL, MVT::i8));
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
}
}
- if (NewMul)
- // Do not add new nodes to DAG combiner worklist.
- DCI.CombineTo(N, NewMul, false);
-
- return SDValue();
+ return NewMul;
}
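A quick check of the shift-and-add/sub decompositions combineMul now emits for constants of the form 2^N plus or minus 1 or 2, including the negated variants (sketch only; wrap-around arithmetic makes the identities hold for any x):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0xDEADBEEFCAFEBABEULL;
  for (unsigned N = 2; N < 63; ++N) {
    uint64_t P = 1ULL << N;
    assert(X * (P + 1) == (X << N) + X);       // 2^N + 1
    assert(X * (P - 1) == (X << N) - X);       // 2^N - 1
    assert(X * (P + 2) == ((X << N) + X) + X); // 2^N + 2
    assert(X * (P - 2) == ((X << N) - X) - X); // 2^N - 2
    // Negative amounts: subtract from zero, or swap the final subtract.
    assert(X * (0 - (P + 1)) == 0 - ((X << N) + X));
    assert(X * (0 - (P - 1)) == X - (X << N));
  }
  return 0;
}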
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
@@ -32971,11 +34083,17 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ // Only do this on the last DAG combine as it can interfere with other
+ // combines.
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
// Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
// TODO: This is a generic DAG combine that became an x86-only combine to
// avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
@@ -32992,6 +34110,14 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
// transform should reduce code size. It may also enable secondary transforms
// from improved known-bits analysis or instruction selection.
APInt MaskVal = AndC->getAPIntValue();
+
+ // If this can be matched by a zero extend, don't optimize.
+ if (MaskVal.isMask()) {
+ unsigned TO = MaskVal.countTrailingOnes();
+ if (TO >= 8 && isPowerOf2_32(TO))
+ return SDValue();
+ }
+
APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
unsigned OldMaskSize = MaskVal.getMinSignedBits();
unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
@@ -33018,7 +34144,7 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return V;
if (N->getOpcode() == ISD::SRL)
- if (SDValue V = combineShiftRightLogical(N, DAG))
+ if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
return V;
return SDValue();
@@ -33098,12 +34224,10 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Attempt to combine as shuffle.
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ if (SDValue Res =
+ combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
return SDValue();
}
@@ -33162,10 +34286,8 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
// Constant Folding.
@@ -33201,12 +34323,10 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ if (SDValue Res =
+ combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
return SDValue();
}
@@ -33274,9 +34394,13 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- N->getSimpleValueType(0), FSetCC,
- DAG.getIntPtrConstant(0, DL));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
+ DAG.getConstant(0, DL, MVT::v16i1),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
+ return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
+ N->getSimpleValueType(0));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
@@ -33313,25 +34437,40 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
+static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
+ if (N->getOpcode() != ISD::AND)
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
+ X = N0.getOperand(0);
+ Y = N1;
+ return true;
+ }
+ if (N1.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
+ X = N1.getOperand(0);
+ Y = N0;
+ return true;
+ }
+
+ return false;
+}
+
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
EVT VT = N->getValueType(0);
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
- if (N0.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
-
- if (N1.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+ SDValue X, Y;
+ if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
+ return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
return SDValue();
}
@@ -33343,8 +34482,7 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
@@ -33515,7 +34653,7 @@ static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
// It's equivalent to performing bzhi (zero high bits) on the input, with the
// same index of the load.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Node->getSimpleValueType(0);
SDLoc dl(Node);
@@ -33570,15 +34708,16 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
// <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
// that will be replaced with one bzhi instruction.
SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
- SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
+ SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
// Get the Node which indexes into the array.
SDValue Index = getIndexFromUnindexedLoad(Ld);
if (!Index)
return SDValue();
- Index = DAG.getZExtOrTrunc(Index, dl, VT);
+ Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
- SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
+ Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
@@ -33604,6 +34743,20 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
+ // Use a 32-bit and+zext if upper bits known zero.
+ if (VT == MVT::i64 && Subtarget.is64Bit() &&
+ !isa<ConstantSDNode>(N->getOperand(1))) {
+ APInt HiMask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
+ DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
+ SDLoc dl(N);
+ SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
+ DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
+ }
+ }
+
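The narrowing above relies on a simple identity: if the upper 32 bits of either operand are zero, a 64-bit AND equals a 32-bit AND of the truncated operands zero-extended back to 64 bits. A tiny hedged check:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 0xAABBCCDD11223344ULL;
  uint64_t B = 0x00000000F0F0F0F0ULL; // upper 32 bits known zero
  uint64_t Narrow = (uint64_t)((uint32_t)A & (uint32_t)B);
  assert((A & B) == Narrow); // andl plus implicit zero extension is enough
  return 0;
}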
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -33627,10 +34780,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
@@ -33666,7 +34817,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
- /*HasVarMask*/ false, DAG, DCI, Subtarget))
+ /*HasVarMask*/ false, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
}
@@ -33675,6 +34826,38 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
+static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
+ if (N->getOpcode() != ISD::OR)
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
+ std::swap(N0, N1);
+
+ // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
+ return false;
+
+ Mask = N1.getOperand(0);
+ X = N1.getOperand(1);
+
+ // Check to see if the mask appeared in both the AND and ANDNP.
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ else if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
+ else
+ return false;
+
+ // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
+ // ANDNP combine first allows other combines to happen that prevent matching.
+ return true;
+}
+
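The pattern matchLogicBlend extracts is a bitwise select: each result bit comes from Y where the mask bit is set and from X where it is clear, which is why it can lower to (V)PBLENDVB once the mask elements are known to be all-ones or all-zero. Scalar sketch of that identity:

#include <cassert>
#include <cstdint>

// OR(AND(M,Y), ANDNP(M,X)) == bitwise select of Y (mask set) and X (clear).
static uint64_t logicBlend(uint64_t M, uint64_t X, uint64_t Y) {
  return (M & Y) | (~M & X);
}

int main() {
  uint64_t M = 0x00FF00FF00FF00FFULL;
  uint64_t X = 0x1111111111111111ULL, Y = 0x2222222222222222ULL;
  assert(logicBlend(M, X, Y) == 0x1122112211221122ULL);
  // Per-element blend when each mask element is all-ones or all-zero.
  assert(logicBlend(0, X, Y) == X && logicBlend(~0ULL, X, Y) == Y);
  return 0;
}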
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -33687,33 +34870,13 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
-
if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
- // Canonicalize AND to LHS.
- if (N1.getOpcode() == ISD::AND)
- std::swap(N0, N1);
-
- // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
- // ANDNP combine allows other combines to happen that prevent matching.
- if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
- return SDValue();
-
- SDValue Mask = N1.getOperand(0);
- SDValue X = N1.getOperand(1);
- SDValue Y;
- if (N0.getOperand(0) == Mask)
- Y = N0.getOperand(1);
- if (N0.getOperand(1) == Mask)
- Y = N0.getOperand(0);
-
- // Check to see if the mask appeared in both the AND and ANDNP.
- if (!Y.getNode())
+ SDValue X, Y, Mask;
+ if (!matchLogicBlend(N, X, Y, Mask))
return SDValue();
// Validate that X, Y, and Mask are bitcasts, and see through them.
@@ -33810,7 +34973,7 @@ static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
// encoding of shr and lzcnt is more desirable.
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
- DAG.getConstant(Log2b, dl, VT));
+ DAG.getConstant(Log2b, dl, MVT::i8));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
@@ -34130,63 +35293,180 @@ static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
return false;
// FIXME: Scalar type may be supported if we move it to vector register.
- if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ if (!SrcVT.isVector())
return false;
EVT SrcElVT = SrcVT.getScalarType();
EVT DstElVT = DstVT.getScalarType();
- if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
- return false;
- if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
+ if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
return false;
if (SrcVT.is512BitVector() || Subtarget.hasVLX())
return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
return false;
}
-/// Detect a pattern of truncation with saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Detect patterns of truncation with unsigned saturation:
+///
+/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value x to be truncated or SDValue() if the pattern was
+/// not matched.
+///
+/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
+/// where C1 >= 0 and C2 is unsigned max of destination type.
+///
+/// (truncate (smax (smin (x, C2), C1)) to dest_type)
+/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
+///
+/// These two patterns are equivalent to:
+/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
+/// So return the smax(x, C1) value to be truncated or SDValue() if the
+/// pattern was not matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ EVT InVT = In.getValueType();
+
+ // Saturation with truncation. We truncate from InVT to VT.
+ assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
+ "Unexpected types for truncate operation");
+
+ // Match min/max and return limit value as a parameter.
+ auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
+ if (V.getOpcode() == Opcode &&
+ ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
+ return V.getOperand(0);
+ return SDValue();
+ };
+
+ APInt C1, C2;
+ if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
+ // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+ // the element size of the destination type.
+ if (C2.isMask(VT.getScalarSizeInBits()))
+ return UMin;
+
+ if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
+ if (MatchMinMax(SMin, ISD::SMAX, C1))
+ if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
+ return SMin;
+
+ if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
+ if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
+ if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
+ C2.uge(C1)) {
+ return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
+ }
+
+ return SDValue();
+}
+
+/// Detect patterns of truncation with signed saturation:
+/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
+/// signed_max_of_dest_type)) to dest_type)
+/// or:
+/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
+/// signed_min_of_dest_type)) to dest_type).
+/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
-static SDValue detectUSatPattern(SDValue In, EVT VT) {
- if (In.getOpcode() != ISD::UMIN)
+static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
+ unsigned NumDstBits = VT.getScalarSizeInBits();
+ unsigned NumSrcBits = In.getScalarValueSizeInBits();
+ assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
+
+ auto MatchMinMax = [](SDValue V, unsigned Opcode,
+ const APInt &Limit) -> SDValue {
+ APInt C;
+ if (V.getOpcode() == Opcode &&
+ ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
+ return V.getOperand(0);
return SDValue();
+ };
- //Saturation with truncation. We truncate from InVT to VT.
- assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
- "Unexpected types for truncate operation");
-
- APInt C;
- if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
- // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
- // the element size of the destination type.
- return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
- SDValue();
+ APInt SignedMax, SignedMin;
+ if (MatchPackUS) {
+ SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
+ SignedMin = APInt(NumSrcBits, 0);
+ } else {
+ SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
+ SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
}
+
+ if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
+ if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
+ return SMax;
+
+ if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
+ if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
+ return SMin;
+
return SDValue();
}
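Both detect*SatPattern helpers recognize clamp-then-truncate, which is exactly a saturating truncate. A scalar check for the i32 to i8 case (assumption: this mirrors the semantics of the VPMOVS/VPMOVUS and PACKSS/PACKUS lowerings used below):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Reference signed saturating truncate from i32 to i8.
static int8_t truncSSat8(int32_t V) {
  return (int8_t)std::min(std::max(V, -128), 127);
}

int main() {
  for (int32_t V = -70000; V <= 70000; V += 7) {
    // Both clamp orderings matched by detectSSatPattern agree.
    assert((int8_t)std::max(std::min(V, 127), -128) == truncSSat8(V));
    // The MatchPackUS form clamps to [0, 255]: unsigned saturation of a
    // signed input.
    int32_t U = std::min(std::max(V, 0), 255);
    assert(U == (V < 0 ? 0 : V > 255 ? 255 : V));
  }
  return 0;
}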
+/// Detect a pattern of truncation with signed saturation.
+/// The types should allow using the VPMOVSS* instruction on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget,
+ const TargetLowering &TLI) {
+ if (!TLI.isTypeLegal(In.getValueType()))
+ return SDValue();
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return SDValue();
+ return detectSSatPattern(In, VT);
+}
+
/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow to use VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
- const X86Subtarget &Subtarget) {
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ const TargetLowering &TLI) {
+ if (!TLI.isTypeLegal(In.getValueType()))
+ return SDValue();
if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
- return detectUSatPattern(In, VT);
+ return detectUSatPattern(In, VT, DAG, DL);
}
-static SDValue
-combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT SVT = VT.getScalarType();
+ EVT InVT = In.getValueType();
+ EVT InSVT = InVT.getScalarType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
- return SDValue();
- if (auto USatVal = detectUSatPattern(In, VT))
- if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
+ isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
+ if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ }
+ if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
+ (SVT == MVT::i8 || SVT == MVT::i16) &&
+ (InSVT == MVT::i16 || InSVT == MVT::i32)) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
+ if (SVT == MVT::i8 && InSVT == MVT::i32) {
+ EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
+ DAG, Subtarget);
+ if (Mid)
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
+ Subtarget);
+ } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
+ Subtarget);
+ }
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
+ Subtarget);
+ }
return SDValue();
}
@@ -34196,7 +35476,7 @@ combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
- if (!VT.isVector() || !VT.isSimple())
+ if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
@@ -34238,42 +35518,13 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
- uint64_t Val = C->getZExtValue();
- if (Val < Min || Val > Max)
+ const APInt &Val = C->getAPIntValue();
+ if (Val.ult(Min) || Val.ugt(Max))
return false;
}
return true;
};
- // Split vectors to legal target size and apply AVG.
- auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
- unsigned NumSubs = 1;
- if (Subtarget.hasBWI()) {
- if (VT.getSizeInBits() > 512)
- NumSubs = VT.getSizeInBits() / 512;
- } else if (Subtarget.hasAVX2()) {
- if (VT.getSizeInBits() > 256)
- NumSubs = VT.getSizeInBits() / 256;
- } else {
- if (VT.getSizeInBits() > 128)
- NumSubs = VT.getSizeInBits() / 128;
- }
-
- if (NumSubs == 1)
- return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);
-
- SmallVector<SDValue, 4> Subs;
- EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
- VT.getVectorNumElements() / NumSubs);
- for (unsigned i = 0; i != NumSubs; ++i) {
- unsigned Idx = i * SubVT.getVectorNumElements();
- SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
- SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
- Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
- };
-
// Check if each element of the vector is left-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
@@ -34287,6 +35538,11 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
+ auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
+ };
+
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256].
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
@@ -34297,7 +35553,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
- return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT,
+ { Operands[0].getOperand(0), Operands[1] },
+ AVGBuilder);
}
if (Operands[0].getOpcode() == ISD::ADD)
@@ -34320,8 +35578,10 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
- // The pattern is detected, emit X86ISD::AVG instruction.
- return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
+ // The pattern is detected, emit X86ISD::AVG instruction(s).
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT,
+ { Operands[0].getOperand(0),
+ Operands[1].getOperand(0) }, AVGBuilder);
}
return SDValue();
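What detectAVGPattern recognizes is the rounding average computed in a wider type and truncated back, i.e. PAVGB/PAVGW semantics. A per-element scalar model (illustrative only):

#include <cassert>
#include <cstdint>

// PAVGB on one byte: (a + b + 1) >> 1 without losing the carry.
static uint8_t pavgb(uint8_t A, uint8_t B) {
  return (uint8_t)(((uint16_t)A + (uint16_t)B + 1) >> 1);
}

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint32_t Wide = (A + B + 1) >> 1; // the widened IR computation
      assert((uint8_t)Wide == pavgb((uint8_t)A, (uint8_t)B));
    }
  return 0;
}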
@@ -34752,6 +36012,63 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
+ // This will avoid a copy to k-register.
+ if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
+ StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ StoredVal.getOperand(0).getValueType() == MVT::i8) {
+ return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
+ St->getBasePtr(), St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ }
+
+ // Widen v2i1/v4i1 stores to v8i1.
+ if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+ Subtarget.hasAVX512()) {
+ unsigned NumConcats = 8 / VT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
+ Ops[0] = StoredVal;
+ StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+
+ // Turn vXi1 stores of constants into a scalar store.
+ if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
+ VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
+ ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
+ // If it's a v64i1 store without 64-bit support, we need two stores.
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
+ StoredVal->ops().slice(0, 32));
+ Lo = combinevXi1ConstantToInteger(Lo, DAG);
+ SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
+ StoredVal->ops().slice(32, 32));
+ Hi = combinevXi1ConstantToInteger(Hi, DAG);
+
+ unsigned Alignment = St->getAlignment();
+
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
+
+ SDValue Ch0 =
+ DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
+ Alignment, St->getMemOperand()->getFlags());
+ SDValue Ch1 =
+ DAG.getStore(St->getChain(), dl, Hi, Ptr1,
+ St->getPointerInfo().getWithOffset(4),
+ MinAlign(Alignment, 4U),
+ St->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
+
+ StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+
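The constant-mask store rewrite assumes the usual x86 mask layout, element i in bit i, so a constant v8i1 can be stored as one byte (and a v64i1 as two i32 halves at offsets 0 and 4 on 32-bit targets). A hedged sketch of that packing; packV8i1 is a made-up local helper, not the combinevXi1ConstantToInteger routine itself:

#include <cassert>
#include <cstdint>

static uint8_t packV8i1(const bool (&Elts)[8]) {
  uint8_t Bits = 0;
  for (unsigned I = 0; I != 8; ++I)
    Bits |= (uint8_t)Elts[I] << I; // element I -> bit I
  return Bits;
}

int main() {
  bool Elts[8] = {1, 0, 1, 1, 0, 0, 0, 1};
  assert(packV8i1(Elts) == 0x8D); // stored as the single byte 0x8D
  return 0;
}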
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
bool Fast;
@@ -34794,13 +36111,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SDValue Val =
- detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
+ TLI))
+ return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+ if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
+ DAG, dl, Subtarget, TLI))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
@@ -35113,7 +36436,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, IsFadd)) {
auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
@@ -35126,7 +36449,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
- SDLoc &DL) {
+ const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
unsigned Opcode = Src.getOpcode();
@@ -35199,7 +36522,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
- !Subtarget.hasDQI())
+ !TLI.isOperationLegal(Opcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
@@ -35216,88 +36539,50 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
-static SDValue
-combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
- SmallVector<SDValue, 8> &Regs) {
- assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
- Regs[0].getValueType() == MVT::v2i64));
+/// Truncate using ISD::AND mask and X86ISD::PACKUS.
+static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT InSVT = InVT.getVectorElementType();
EVT OutVT = N->getValueType(0);
EVT OutSVT = OutVT.getVectorElementType();
- EVT InVT = Regs[0].getValueType();
- EVT InSVT = InVT.getVectorElementType();
- SDLoc DL(N);
- // First, use mask to unset all bits that won't appear in the result.
- assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
- "OutSVT can only be either i8 or i16.");
+ // Split a long vector into vectors of legal type and mask to unset all bits
+ // that won't appear in the result to prevent saturation.
+ // TODO - we should be doing this at the maximum legal size but this is
+ // causing regressions where we're concatenating back to max width just to
+ // perform the AND and then extracting back again.
+ unsigned NumSubRegs = InVT.getSizeInBits() / 128;
+ unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
+ EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
+ SmallVector<SDValue, 8> SubVecs(NumSubRegs);
+
APInt Mask =
APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
- SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
- for (auto &Reg : Regs)
- Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
-
- MVT UnpackedVT, PackedVT;
- if (OutSVT == MVT::i8) {
- UnpackedVT = MVT::v8i16;
- PackedVT = MVT::v16i8;
- } else {
- UnpackedVT = MVT::v4i32;
- PackedVT = MVT::v8i16;
- }
-
- // In each iteration, truncate the type by a half size.
- auto RegNum = Regs.size();
- for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
- j < e; j *= 2, RegNum /= 2) {
- for (unsigned i = 0; i < RegNum; i++)
- Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
- for (unsigned i = 0; i < RegNum / 2; i++)
- Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
- Regs[i * 2 + 1]);
- }
-
- // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
- // then extract a subvector as the result since v8i8 is not a legal type.
- if (OutVT == MVT::v8i8) {
- Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
- Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
- DAG.getIntPtrConstant(0, DL));
- return Regs[0];
- } else if (RegNum > 1) {
- Regs.resize(RegNum);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
- } else
- return Regs[0];
-}
+ SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
-/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
-static SDValue
-combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
- SelectionDAG &DAG,
- SmallVector<SDValue, 8> &Regs) {
- assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
- EVT OutVT = N->getValueType(0);
- SDLoc DL(N);
-
- // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
- SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
- for (auto &Reg : Regs) {
- Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
- Subtarget, DAG);
- Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
- Subtarget, DAG);
+ for (unsigned i = 0; i < NumSubRegs; i++) {
+ SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
+ DAG.getIntPtrConstant(i * NumSubRegElts, DL));
+ SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
}
+ In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
- for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
- Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
- Regs[i * 2 + 1]);
+ return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
+}
- if (Regs.size() > 2) {
- Regs.resize(Regs.size() / 2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
- } else
- return Regs[0];
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT OutVT = N->getValueType(0);
+ In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
+ DAG.getValueType(OutVT));
+ return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
@@ -35338,32 +36623,21 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDLoc DL(N);
-
- // Split a long vector into vectors of legal type.
- unsigned RegNum = InVT.getSizeInBits() / 128;
- SmallVector<SDValue, 8> SubVec(RegNum);
- unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
- EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
-
- for (unsigned i = 0; i < RegNum; i++)
- SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
- DAG.getIntPtrConstant(i * NumSubRegElts, DL));
-
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
- return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
- else if (InSVT == MVT::i32)
- return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
- else
- return SDValue();
+ return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
+ if (InSVT == MVT::i32)
+ return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
+
+ return SDValue();
}
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
-static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
+static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2 but AVX512 has fast truncate.
@@ -35383,7 +36657,7 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
- // Check we have a truncation suited for PACKSS.
+ // Check we have a truncation suited for PACKSS/PACKUS.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
@@ -35391,25 +36665,79 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
- // Use PACKSS if the input has sign-bits that extend all the way to the
- // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
- unsigned NumSignBits = DAG.ComputeNumSignBits(In);
- unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
- if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
- return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+ unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Use PACKUS if the input has zero-bits that extend all the way to the
// packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known;
DAG.computeKnownBits(In, Known);
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
- NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
- if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+ if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
+ // Use PACKSS if the input has sign-bits that extend all the way to the
+ // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+ if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
return SDValue();
}
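
The choice above between PACKUS and PACKSS follows from the saturating pack semantics: if the bits being dropped are known zero, the unsigned-saturating pack is an exact truncation; if they are all sign copies, the signed-saturating pack is. A rough scalar sketch of one i16 -> i8 lane, with illustrative names only:

    #include <algorithm>
    #include <cstdint>

    // One lane of PACKSSWB (signed saturation) and PACKUSWB (unsigned saturation).
    static inline int8_t packss_w2b(int16_t V) {
      return (int8_t)std::min<int16_t>(std::max<int16_t>(V, -128), 127);
    }
    static inline uint8_t packus_w2b(int16_t V) {
      return (uint8_t)std::min<int16_t>(std::max<int16_t>(V, 0), 255);
    }
    // If V has at least 9 sign bits, packss_w2b(V) == (int8_t)V; if its top
    // 8 bits are zero, packus_w2b(V) == (uint8_t)V, i.e. the pack is exactly
    // the desired truncation.
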
+// Try to form a MULHU or MULHS node by looking for
+// (trunc (srl (mul ext, ext), 16))
+// TODO: This is X86 specific because we want to be able to handle wide types
+// before type legalization. But we can only do it if the vector will be
+// legalized via widening/splitting. Type legalization can't handle promotion
+// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
+// combiner.
+static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ // First instruction should be a right shift of a multiply.
+ if (Src.getOpcode() != ISD::SRL ||
+ Src.getOperand(0).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ // Only handle vXi16 types that are at least 128-bits.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
+ VT.getVectorNumElements() < 8)
+ return SDValue();
+
+ // Input type should be vXi32.
+ EVT InVT = Src.getValueType();
+ if (InVT.getVectorElementType() != MVT::i32)
+ return SDValue();
+
+ // Need a shift by 16.
+ APInt ShiftAmt;
+ if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
+ ShiftAmt != 16)
+ return SDValue();
+
+ SDValue LHS = Src.getOperand(0).getOperand(0);
+ SDValue RHS = Src.getOperand(0).getOperand(1);
+
+ unsigned ExtOpc = LHS.getOpcode();
+ if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
+ RHS.getOpcode() != ExtOpc)
+ return SDValue();
+
+ // Peek through the extends.
+ LHS = LHS.getOperand(0);
+ RHS = RHS.getOperand(0);
+
+ // Ensure the input types match.
+ if (LHS.getValueType() != VT || RHS.getValueType() != VT)
+ return SDValue();
+
+ unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+}
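
As a rough sketch (not part of the patch), the scalar computation that the matched DAG shape and the resulting MULHS/MULHU both describe is the high half of a widening 16x16 multiply; names below are illustrative only:

    #include <cstdint>

    // High 16 bits of a widening 16x16 multiply, i.e. what PMULHW (signed)
    // and PMULHUW (unsigned) compute per lane.
    static inline int16_t mulh_s16(int16_t A, int16_t B) {
      return (int16_t)(((int32_t)A * (int32_t)B) >> 16);
    }
    static inline uint16_t mulh_u16(uint16_t A, uint16_t B) {
      return (uint16_t)(((uint32_t)A * (uint32_t)B) >> 16);
    }
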
+
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
@@ -35424,10 +36752,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
- // Try to combine truncation with unsigned saturation.
- if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ // Try to combine truncation with signed/unsigned saturation.
+ if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
return Val;
+ // Try to combine PMULHUW/PMULHW for vXi16.
+ if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
+ return V;
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
@@ -35621,6 +36953,39 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ unsigned NumBits = VT.getSizeInBits();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+
+ // TODO - Constant Folding.
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // Reduce Cst1 to the bottom 16-bits.
+ // NOTE: SimplifyDemandedBits won't do this for constants.
+ const APInt &Val1 = Cst1->getAPIntValue();
+ APInt MaskedVal1 = Val1 & 0xFFFF;
+ if (MaskedVal1 != Val1)
+ return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
+ DAG.getConstant(MaskedVal1, SDLoc(N), VT));
+ }
+
+ // Only bottom 16-bits of the control bits are required.
+ KnownBits Known;
+ APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
+ if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
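
The demanded-bits reasoning above assumes the usual BEXTR control layout (start index in bits 7:0, length in bits 15:8, everything higher ignored). A minimal scalar sketch of that assumption, illustrative only:

    #include <cstdint>

    // Scalar model of BEXTR: bits 7:0 of Ctrl give the start bit, bits 15:8
    // the length; bits above 15 do not affect the result, which is why the
    // combine can mask the constant control operand down to 16 bits.
    static inline uint64_t bextr64(uint64_t Src, uint64_t Ctrl) {
      unsigned Start = Ctrl & 0xFF;
      unsigned Len = (Ctrl >> 8) & 0xFF;
      if (Start >= 64)
        return 0;
      uint64_t Shifted = Src >> Start;
      if (Len >= 64)
        return Shifted;
      return Shifted & ((1ULL << Len) - 1);
    }
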
static bool isNullFPScalarOrVectorConst(SDValue V) {
return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
@@ -35751,8 +37116,6 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: Check for global or instruction-level "nnan". In that case, we
- // should be able to lower to FMAX/FMIN alone.
// TODO: If an operand is already known to be a NaN or not a NaN, this
// should be an optional swap and FMAX/FMIN.
@@ -35762,14 +37125,21 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
- // This takes at least 3 instructions, so favor a library call when operating
- // on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
- return SDValue();
-
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+ // If we don't have to respect NaN inputs, this is a direct translation to x86
+ // min/max instructions.
+ if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+ // If we have to respect NaN inputs, this takes at least 3 instructions.
+ // Favor a library call when operating on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ return SDValue();
+
EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
DAG.getDataLayout(), *DAG.getContext(), VT);
@@ -35792,9 +37162,8 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
- auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
- SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+ SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
@@ -35820,10 +37189,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
- DCI.CombineTo(N, Res);
- return SDValue();
- }
+ /*HasVarMask*/ false, DAG, Subtarget))
+ return Res;
}
return SDValue();
@@ -35843,12 +37210,54 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+// Try to combine sext_in_reg of a cmov of constants by extending the constants.
+static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- if (!VT.isVector())
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+
+ if (ExtraVT != MVT::i16)
+ return SDValue();
+
+ // Look through single use any_extends.
+ if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
+ N0 = N0.getOperand(0);
+
+ // See if we have a single use cmov.
+ if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
return SDValue();
+ SDValue CMovOp0 = N0.getOperand(0);
+ SDValue CMovOp1 = N0.getOperand(1);
+
+ // Make sure both operands are constants.
+ if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
+ !isa<ConstantSDNode>(CMovOp1.getNode()))
+ return SDValue();
+
+ SDLoc DL(N);
+
+  // If we looked through an any_extend above, extend the constants to match.
+ if (N0.getValueType() != VT) {
+ CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
+ CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
+ }
+
+ CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
+ CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
+
+ return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
+ N0.getOperand(2), N0.getOperand(3));
+}
+
+static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (SDValue V = combineSextInRegCmov(N, DAG))
+ return V;
+
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
@@ -35987,7 +37396,7 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
// promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovN = Extend->getOperand(0);
- if (CMovN.getOpcode() != X86ISD::CMOV)
+ if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
return SDValue();
EVT TargetVT = Extend->getValueType(0);
@@ -35998,20 +37407,36 @@ static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovOp0 = CMovN.getOperand(0);
SDValue CMovOp1 = CMovN.getOperand(1);
- bool DoPromoteCMOV =
- (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
- CMovN.hasOneUse() &&
- (isa<ConstantSDNode>(CMovOp0.getNode()) &&
- isa<ConstantSDNode>(CMovOp1.getNode()));
+ if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
+ !isa<ConstantSDNode>(CMovOp1.getNode()))
+ return SDValue();
+
+ // Only extend to i32 or i64.
+ if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
+ return SDValue();
- if (!DoPromoteCMOV)
+  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
+ // are free.
+ if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
return SDValue();
- CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
- CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);
+  // If this is a zero extend to i64, we should only extend to i32 and use a free
+ // zero extend to finish.
+ EVT ExtendVT = TargetVT;
+ if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
+ ExtendVT = MVT::i32;
- return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
- CMovN.getOperand(2), CMovN.getOperand(3));
+ CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
+
+ SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
+ CMovN.getOperand(2), CMovN.getOperand(3));
+
+ // Finish extending if needed.
+ if (ExtendVT != TargetVT)
+ Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
+
+ return Res;
}
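
The i64 special case above leans on the x86-64 rule that writing a 32-bit register zeroes bits 63:32, so a zero/any extend of a 32-bit CMOV costs nothing. A hedged source-level sketch of that idea, names illustrative:

    #include <cstdint>

    // A 32-bit conditional move already leaves the upper half zero, so the
    // widening to 64 bits needs no extra instruction.
    static inline uint64_t select_zext(bool Cond, uint32_t C1, uint32_t C2) {
      uint32_t R = Cond ? C1 : C2; // cmovcc on 32-bit registers
      return R;                    // implicit zero extension to 64 bits
    }
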
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
@@ -36167,7 +37592,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
// Also use this if we don't have SSE41 to allow the legalizer do its job.
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
(VT.is256BitVector() && Subtarget.hasInt256()) ||
- (VT.is512BitVector() && Subtarget.hasAVX512())) {
+ (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
return Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
@@ -36200,12 +37625,55 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
// On pre-AVX512 targets, split into 256-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
+ if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
return SplitAndExtendInReg(256);
return SDValue();
}
+// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
+// result type.
+static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // Only do this combine with AVX512 for vector extends.
+ if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ // Only combine legal element types.
+ EVT SVT = VT.getVectorElementType();
+ if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
+ SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
+ return SDValue();
+
+  // We can only do this if the vector size is 256 bits or less.
+ unsigned Size = VT.getSizeInBits();
+ if (Size > 256)
+ return SDValue();
+
+ // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
+  // those are the only integer compares we have.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
+ if (ISD::isUnsignedIntSetCC(CC))
+ return SDValue();
+
+ // Only do this combine if the extension will be fully consumed by the setcc.
+ EVT N00VT = N0.getOperand(0).getValueType();
+ EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+ if (Size != MatchingVecType.getSizeInBits())
+ return SDValue();
+
+ SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
+
+ return Res;
+}
+
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -36223,6 +37691,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
+ return V;
+
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Invert and sign-extend a boolean is the same as zero-extend and subtract
@@ -36240,7 +37711,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return V;
if (VT.isVector())
- if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+ if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
@@ -36249,9 +37720,40 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
+ if (NegMul) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ }
+ }
+
+ if (NegAcc) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ }
+ }
+
+ return Opcode;
+}
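
The mapping above is just the sign algebra of fused multiply-add; a scalar sketch of the four shapes it moves between, using std::fma and illustrative names:

    #include <cmath>

    // fmadd(a,b,c)  =   a*b  + c
    // fmsub(a,b,c)  =   a*b  - c    (negate the accumulator)
    // fnmadd(a,b,c) = -(a*b) + c    (negate the multiply)
    // fnmsub(a,b,c) = -(a*b) - c    (negate both)
    static inline double fmsub(double A, double B, double C)  { return std::fma(A, B, -C); }
    static inline double fnmadd(double A, double B, double C) { return std::fma(-A, B, C); }
    static inline double fnmsub(double A, double B, double C) { return std::fma(-A, B, -C); }
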
+
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -36267,96 +37769,41 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
- auto invertIfNegative = [](SDValue &V) {
+ auto invertIfNegative = [&DAG](SDValue &V) {
if (SDValue NegVal = isFNEG(V.getNode())) {
- V = NegVal;
+ V = DAG.getBitcast(V.getValueType(), NegVal);
return true;
}
+ // Look through extract_vector_elts. If it comes from an FNEG, create a
+ // new extract from the FNEG input.
+ if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(V.getOperand(1)) &&
+ cast<ConstantSDNode>(V.getOperand(1))->getZExtValue() == 0) {
+ if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) {
+ NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
+ V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
+ NegVal, V.getOperand(1));
+ return true;
+ }
+ }
+
return false;
};
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
- bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
- N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+ bool NegA = invertIfNegative(A);
bool NegB = invertIfNegative(B);
- bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
- N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
-
- // Negative multiplication when NegA xor NegB
- bool NegMul = (NegA != NegB);
- bool HasNeg = NegA || NegB || NegC;
+ bool NegC = invertIfNegative(C);
- unsigned NewOpcode;
- if (!NegMul)
- NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
- else
- NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
-
- // For FMA, we risk reconstructing the node we started with.
- // In order to avoid this, we check for negation or opcode change. If
- // one of the two happened, then it is a new node and we return it.
- if (N->getOpcode() == ISD::FMA) {
- if (HasNeg || NewOpcode != N->getOpcode())
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
- return SDValue();
- }
-
- if (N->getOpcode() == X86ISD::FMADD_RND) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS1) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS3) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
- }
- } else if (N->getOpcode() == X86ISD::FMADD4S) {
- switch (NewOpcode) {
- case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
- }
- } else {
- llvm_unreachable("Unexpected opcode!");
- }
+ if (!NegA && !NegB && !NegC)
+ return SDValue();
- // Only return the node is the opcode was changed or one of the
- // operand was negated. If not, we'll just recreate the same node.
- if (HasNeg || NewOpcode != N->getOpcode()) {
- if (N->getNumOperands() == 4)
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
- }
+ unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
- return SDValue();
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
@@ -36425,6 +37872,10 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
+ if (DCI.isBeforeLegalizeOps())
+ if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
+ return V;
+
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -36432,7 +37883,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return V;
if (VT.isVector())
- if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+ if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue DivRem8 = getDivRem8(N, DAG))
@@ -36495,13 +37946,13 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
- SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
- SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
+ SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
+ SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
} else {
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
- Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
+ Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
}
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
@@ -36523,10 +37974,10 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
+ EVT OpVT = LHS.getValueType();
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
- EVT OpVT = LHS.getValueType();
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
@@ -36575,6 +38026,20 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
}
}
+  // If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
+ // pre-promote its result type since vXi1 vectors don't get promoted
+ // during type legalization.
+ // NOTE: The element count check is to ignore operand types that need to
+ // go through type promotion to a 128-bit vector.
+ if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
+ (OpVT.getVectorElementType() == MVT::i8 ||
+ OpVT.getVectorElementType() == MVT::i16)) {
+ SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
+ N->getOperand(2));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
+ }
+
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
// to avoid scalarization via legalization because v4i32 is not a legal type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
@@ -36589,6 +38054,19 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
+ // Perform constant folding.
+ if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
+ assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
+ APInt Imm(32, 0);
+ for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
+ SDValue In = Src.getOperand(Idx);
+ if (!In.isUndef() &&
+ cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
+ Imm.setBit(Idx);
+ }
+ return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
+ }
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
@@ -36620,12 +38098,14 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- // The original sign extend has less users, add back to worklist in case
- // it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N) {
+        // The original sign extend has fewer users; add it back to the worklist
+        // in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ }
+ return SDValue(Res, 0);
}
}
@@ -36638,9 +38118,10 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index;
- DAG.UpdateNodeOperands(N, NewOps);
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N)
+ DCI.AddToWorklist(N);
+ return SDValue(Res, 0);
}
// Try to remove zero extends from 32->64 if we know the sign bit of
@@ -36651,32 +38132,24 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
if (DAG.SignBitIsZero(Index.getOperand(0))) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- // The original zero extend has less users, add back to worklist in case
- // it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N) {
+        // The original zero extend has fewer users; add it back to the worklist
+        // in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ }
+ return SDValue(Res, 0);
}
}
}
- // Gather and Scatter instructions use k-registers for masks. The type of
- // the masks is v*i1. So the mask will be truncated anyway.
- // The SIGN_EXTEND_INREG my be dropped.
- SDValue Mask = N->getOperand(2);
- if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[2] = Mask.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- return SDValue(N, 0);
- }
-
// With AVX2 we only demand the upper bit of the mask.
if (!Subtarget.hasAVX512()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
+ SDValue Mask = N->getOperand(2);
KnownBits Known;
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
@@ -36773,11 +38246,11 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- EVT InSVT = InVT.getScalarType();
+ // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
- if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
@@ -36807,14 +38280,11 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- EVT InSVT = InVT.getScalarType();
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
- if (InVT.isVector() &&
- (InSVT == MVT::i8 || InSVT == MVT::i16 ||
- (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
@@ -36849,6 +38319,11 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
+ // If we have AVX512DQ we can use packed conversion instructions unless
+ // the VT is f80.
+ if (Subtarget.hasDQI() && VT != MVT::f80)
+ return SDValue();
+
if (!Ld->isVolatile() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
@@ -37103,15 +38578,9 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
- unsigned RegSize = 128;
- if (Subtarget.hasBWI())
- RegSize = 512;
- else if (Subtarget.hasAVX2())
- RegSize = 256;
- unsigned VectorSize = VT.getVectorNumElements() * 16;
// If the vector size is less than 128, or greater than the supported RegSize,
// do not use PMADD.
- if (VectorSize < 128 || VectorSize > RegSize)
+ if (VT.getVectorNumElements() < 8)
return SDValue();
SDLoc DL(N);
@@ -37125,7 +38594,13 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
// Madd vector size is half of the original vector size
- SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+ auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ };
+ SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
+ PMADDWDBuilder);
// Fill the rest of the output with 0
SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
@@ -37149,12 +38624,12 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return SDValue();
unsigned RegSize = 128;
- if (Subtarget.hasBWI())
+ if (Subtarget.useBWIRegs())
RegSize = 512;
- else if (Subtarget.hasAVX2())
+ else if (Subtarget.hasAVX())
RegSize = 256;
- // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (VT.getSizeInBits() / 4 > RegSize)
@@ -37180,7 +38655,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
// reduction. Note that the number of elements of the result of SAD is less
// than the number of elements of its input. Therefore, we could only update
// part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
+ SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
// The output of PSADBW is a vector of i64.
// We need to turn the vector of i64 into a vector of i32.
@@ -37230,6 +38705,236 @@ static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}
+static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
+ const SDLoc &DL, EVT VT,
+ const X86Subtarget &Subtarget) {
+ // Example of pattern we try to detect:
+ // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
+ //(add (build_vector (extract_elt t, 0),
+ // (extract_elt t, 2),
+ // (extract_elt t, 4),
+ // (extract_elt t, 6)),
+ // (build_vector (extract_elt t, 1),
+ // (extract_elt t, 3),
+ // (extract_elt t, 5),
+ // (extract_elt t, 7)))
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
+ Op1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
+ VT.getVectorNumElements() < 4 ||
+ !isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ // Check if one of Op0,Op1 is of the form:
+ // (build_vector (extract_elt Mul, 0),
+ // (extract_elt Mul, 2),
+ // (extract_elt Mul, 4),
+ // ...
+ // the other is of the form:
+ // (build_vector (extract_elt Mul, 1),
+ // (extract_elt Mul, 3),
+ // (extract_elt Mul, 5),
+ // ...
+ // and identify Mul.
+ SDValue Mul;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
+ SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
+ Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
+ // TODO: Be more tolerant to undefs.
+ if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
+ auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
+ auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
+ auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
+ if (!Const0L || !Const1L || !Const0H || !Const1H)
+ return SDValue();
+ unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
+ Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
+ // Commutativity of mul allows factors of a product to reorder.
+ if (Idx0L > Idx1L)
+ std::swap(Idx0L, Idx1L);
+ if (Idx0H > Idx1H)
+ std::swap(Idx0H, Idx1H);
+ // Commutativity of add allows pairs of factors to reorder.
+ if (Idx0L > Idx0H) {
+ std::swap(Idx0L, Idx0H);
+ std::swap(Idx1L, Idx1H);
+ }
+ if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
+ Idx1H != 2 * i + 3)
+ return SDValue();
+ if (!Mul) {
+ // First time an extract_elt's source vector is visited. Must be a MUL
+ // with 2X number of vector elements than the BUILD_VECTOR.
+ // Both extracts must be from same MUL.
+ Mul = Op0L->getOperand(0);
+ if (Mul->getOpcode() != ISD::MUL ||
+ Mul.getValueType().getVectorNumElements() != 2 * e)
+ return SDValue();
+ }
+ // Check that the extract is from the same MUL previously seen.
+ if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
+ Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
+ return SDValue();
+ }
+
+ // Check if the Mul source can be safely shrunk.
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
+ return SDValue();
+
+ auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Shrink by adding truncate nodes and let DAGCombine fold with the
+ // sources.
+ EVT InVT = Ops[0].getValueType();
+ assert(InVT.getScalarType() == MVT::i32 &&
+ "Unexpected scalar element type");
+ assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements() / 2);
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ InVT.getVectorNumElements());
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
+ DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
+ DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+ };
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT,
+ { Mul.getOperand(0), Mul.getOperand(1) },
+ PMADDBuilder);
+}
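
Both this matcher and matchPMADDWD_2 below target the same instruction; for reference, a rough scalar model of one 32-bit PMADDWD output lane (illustrative only):

    #include <cstdint>

    // One output lane of PMADDWD: two adjacent signed 16x16 products summed
    // into a 32-bit result (wrapping, no intermediate saturation).
    static inline int32_t pmaddwd_lane(const int16_t *A, const int16_t *B,
                                       unsigned I) {
      int64_t Lo = (int32_t)A[2 * I] * B[2 * I];
      int64_t Hi = (int32_t)A[2 * I + 1] * B[2 * I + 1];
      return (int32_t)(Lo + Hi);
    }
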
+
+// Attempt to turn this pattern into PMADDWD.
+// (mul (add (zext (build_vector)), (zext (build_vector))),
+// (add (zext (build_vector)), (zext (build_vector)))
+static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
+ const SDLoc &DL, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
+ VT.getVectorNumElements() < 4 ||
+ !isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+
+ // All inputs need to be sign extends.
+ // TODO: Support ZERO_EXTEND from known positive?
+ if (N00.getOpcode() != ISD::SIGN_EXTEND ||
+ N01.getOpcode() != ISD::SIGN_EXTEND ||
+ N10.getOpcode() != ISD::SIGN_EXTEND ||
+ N11.getOpcode() != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ // Peek through the extends.
+ N00 = N00.getOperand(0);
+ N01 = N01.getOperand(0);
+ N10 = N10.getOperand(0);
+ N11 = N11.getOperand(0);
+
+ // Must be extending from vXi16.
+ EVT InVT = N00.getValueType();
+ if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
+ N10.getValueType() != InVT || N11.getValueType() != InVT)
+ return SDValue();
+
+ // All inputs should be build_vectors.
+ if (N00.getOpcode() != ISD::BUILD_VECTOR ||
+ N01.getOpcode() != ISD::BUILD_VECTOR ||
+ N10.getOpcode() != ISD::BUILD_VECTOR ||
+ N11.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+  // For each output element we need the even element of one input vector
+  // multiplied by the even element of the other, plus the odd element of one
+  // multiplied by the odd element of the other. That is, for each element i
+  // this operation is performed:
+ // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
+ SDValue In0, In1;
+ for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
+ SDValue N00Elt = N00.getOperand(i);
+ SDValue N01Elt = N01.getOperand(i);
+ SDValue N10Elt = N10.getOperand(i);
+ SDValue N11Elt = N11.getOperand(i);
+ // TODO: Be more tolerant to undefs.
+ if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
+ auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
+ auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
+ auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
+ if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
+ return SDValue();
+ unsigned IdxN00 = ConstN00Elt->getZExtValue();
+ unsigned IdxN01 = ConstN01Elt->getZExtValue();
+ unsigned IdxN10 = ConstN10Elt->getZExtValue();
+ unsigned IdxN11 = ConstN11Elt->getZExtValue();
+ // Add is commutative so indices can be reordered.
+ if (IdxN00 > IdxN10) {
+ std::swap(IdxN00, IdxN10);
+ std::swap(IdxN01, IdxN11);
+ }
+    // N0 indices must be the even element. N1 indices must be the next odd element.
+ if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
+ IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+ return SDValue();
+ SDValue N00In = N00Elt.getOperand(0);
+ SDValue N01In = N01Elt.getOperand(0);
+ SDValue N10In = N10Elt.getOperand(0);
+ SDValue N11In = N11Elt.getOperand(0);
+    // The first time we find an input, capture it.
+ if (!In0) {
+ In0 = N00In;
+ In1 = N01In;
+ }
+ // Mul is commutative so the input vectors can be in any order.
+ // Canonicalize to make the compares easier.
+ if (In0 != N00In)
+ std::swap(N00In, N01In);
+ if (In0 != N10In)
+ std::swap(N10In, N11In);
+ if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
+ return SDValue();
+ }
+
+ auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Shrink by adding truncate nodes and let DAGCombine fold with the
+ // sources.
+ EVT InVT = Ops[0].getValueType();
+ assert(InVT.getScalarType() == MVT::i16 &&
+ "Unexpected scalar element type");
+ assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements() / 2);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
+ PMADDBuilder);
+}
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
@@ -37243,11 +38948,22 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ return MAdd;
+ if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ return MAdd;
+
// Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
- isHorizontalBinOp(Op0, Op1, true))
- return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
+ VT == MVT::v8i32) &&
+ Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
+ auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
+ HADDBuilder);
+ }
if (SDValue V = combineIncDecVector(N, DAG))
return V;
@@ -37261,20 +38977,19 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- // PSUBUS is supported, starting from SSE2, but special preprocessing
- // for v8i32 requires umin, which appears in SSE41.
+ // PSUBUS is supported, starting from SSE2, but truncation for v8i32
+ // is only worth it with SSSE3 (PSHUFB).
if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
- !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
- !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
- !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
- (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
- VT == MVT::v8i64)))
+ !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
+ !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
+ !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
+ VT == MVT::v16i32 || VT == MVT::v8i64)))
return SDValue();
SDValue SubusLHS, SubusRHS;
// Try to find umax(a,b) - b or a - umin(a,b) patterns
// they may be converted to subus(a,b).
- // TODO: Need to add IR cannonicialization for this code.
+ // TODO: Need to add IR canonicalization for this code.
if (Op0.getOpcode() == ISD::UMAX) {
SubusRHS = Op1;
SDValue MaxLHS = Op0.getOperand(0);
@@ -37298,10 +39013,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
} else
return SDValue();
+ auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+ };
+
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
- return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ { SubusLHS, SubusRHS }, SUBUSBuilder);
// Special preprocessing case can be only applied
// if the value was zero extended from 16 bit,
@@ -37331,8 +39052,9 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
- SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
- NewSubusLHS, NewSubusRHS);
+ SDValue Psubus =
+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
+ { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
// Zero extend the result, it may be used somewhere as 32 bit,
// if not zext and following trunc will shrink.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
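
The two shapes matched above, umax(a,b) - b and a - umin(a,b), are both equivalent to the unsigned saturating subtract that PSUBUS performs; a one-lane scalar sketch, illustrative only:

    #include <cstdint>

    // One u16 lane of PSUBUSW: unsigned saturating subtract.
    static inline uint16_t psubus_u16(uint16_t A, uint16_t B) {
      return (uint16_t)(A > B ? A - B : 0); // == max(A,B) - B == A - min(A,B)
    }
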
@@ -37363,10 +39085,16 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal subs from subs of shuffles.
EVT VT = N->getValueType(0);
- if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
- isHorizontalBinOp(Op0, Op1, false))
- return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
+ VT == MVT::v8i32) &&
+ Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
+ auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
+ HSUBBuilder);
+ }
if (SDValue V = combineIncDecVector(N, DAG))
return V;
@@ -37470,28 +39198,6 @@ static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
- MVT VT = N->getSimpleValueType(0);
- SDLoc DL(N);
-
- // TEST (AND a, b) ,(AND a, b) -> TEST a, b
- if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
- return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
- Op0->getOperand(1));
-
- // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
- // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
- if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
- ISD::isBuildVectorAllZeros(Op1.getNode()))
- return getZeroVector(VT, Subtarget, DAG, DL);
-
- return SDValue();
-}
-
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
@@ -37515,9 +39221,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
MVT OpVT = N->getSimpleValueType(0);
- // Early out for mask vectors.
- if (OpVT.getVectorElementType() == MVT::i1)
- return SDValue();
+ bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
@@ -37529,23 +39233,40 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// Inserting zeros into zeros is a nop.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return Vec;
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
unsigned Idx2Val = SubVec.getConstantOperandVal(2);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
}
+ // If we're inserting into a zero vector and our input was extracted from an
+ // insert into a zero vector of the same type and the extraction was at
+    // least as large as the original insertion, just insert the original
+ // subvector into a zero vector.
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
+ SubVec.getConstantOperandVal(1) == 0 &&
+ SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Ins = SubVec.getOperand(0);
+ if (Ins.getConstantOperandVal(2) == 0 &&
+ ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
+ Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ Ins.getOperand(1), N->getOperand(2));
+ }
+
// If we're inserting a bitcast into zeros, rewrite the insert and move the
// bitcast to the other side. This helps with detecting zero extending
// during isel.
// TODO: Is this useful for other indices than 0?
- if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
+ if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
@@ -37556,6 +39277,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
}
}
+ // Stop here if this is an i1 vector.
+ if (IsI1Vector)
+ return SDValue();
+
// If this is an insert of an extract, combine to a shuffle. Don't do this
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
@@ -37642,7 +39367,6 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
SubVec2, Vec.getOperand(2));
- DCI.AddToWorklist(Vec.getNode());
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
N->getOperand(2));
@@ -37677,6 +39401,75 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
OpVT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
+  // If we're extracting the lowest subvector and this extract is the source's
+  // only user, we may be able to perform this with a smaller vector width.
+ if (IdxVal == 0 && InVec.hasOneUse()) {
+ unsigned InOpcode = InVec.getOpcode();
+ if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
+ // v2f64 CVTDQ2PD(v4i32).
+ if (InOpcode == ISD::SINT_TO_FP &&
+ InVec.getOperand(0).getValueType() == MVT::v4i32) {
+ return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
+ }
+ // v2f64 CVTPS2PD(v4f32).
+ if (InOpcode == ISD::FP_EXTEND &&
+ InVec.getOperand(0).getValueType() == MVT::v4f32) {
+ return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
+ }
+ }
+ if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
+ OpVT.is128BitVector() &&
+ InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
+ unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG;
+ return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+
+ // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
+ // This occurs frequently in our masked scalar intrinsic code and our
+ // floating point select lowering with AVX512.
+ // TODO: SimplifyDemandedBits instead?
+ if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
+ if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
+ if (C->getAPIntValue().isOneValue())
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
+ Src.getOperand(0));
+
+ return SDValue();
+}
+
+// Simplify PMULDQ and PMULUDQ operations.
+static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ APInt DemandedMask(APInt::getLowBitsSet(64, 32));
+
+ // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
+ KnownBits LHSKnown;
+ if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ KnownBits RHSKnown;
+ if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
return SDValue();
}
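
The SimplifyDemandedBits calls above rely on PMULUDQ/PMULDQ reading only the low 32 bits of each 64-bit element; a scalar sketch of one lane under that assumption, names illustrative:

    #include <cstdint>

    // One lane of PMULUDQ / PMULDQ: only bits 31:0 of each source element
    // participate, so the upper halves can be simplified away.
    static inline uint64_t pmuludq_lane(uint64_t A, uint64_t B) {
      return (uint64_t)(uint32_t)A * (uint32_t)B;
    }
    static inline int64_t pmuldq_lane(uint64_t A, uint64_t B) {
      return (int64_t)(int32_t)(uint32_t)A * (int32_t)(uint32_t)B;
    }
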
@@ -37685,6 +39478,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
+ case ISD::SCALAR_TO_VECTOR:
+ return combineScalarToVector(N, DAG);
case ISD::EXTRACT_VECTOR_ELT:
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
@@ -37709,6 +39504,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
+ case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, Subtarget);
@@ -37774,20 +39570,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
- case X86ISD::VPERMIV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
+ case X86ISD::SHUF128:
case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
- case X86ISD::FMADDS1_RND:
- case X86ISD::FMADDS3_RND:
- case X86ISD::FMADDS1:
- case X86ISD::FMADDS3:
- case X86ISD::FMADD4S:
- case ISD::FMA: return combineFMA(N, DAG, Subtarget);
+ case X86ISD::FMSUB:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB:
+ case X86ISD::FNMSUB_RND:
+ case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
@@ -37797,9 +39594,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::MSCATTER:
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
- case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
}
return SDValue();
@@ -37812,6 +39610,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
+
+ // There are no vXi8 shifts.
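+ // (x86 only has packed shifts for 16/32/64-bit elements, e.g. psllw/pslld/psllq.)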
+ if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
+ return false;
+
if (VT != MVT::i16)
return true;
@@ -37834,6 +39637,22 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
}
}
+SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
+ SDValue Value, SDValue Addr,
+ SelectionDAG &DAG) const {
+ const Module *M = DAG.getMachineFunction().getMMI().getModule();
+ Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+ if (IsCFProtectionSupported) {
+ // When control-flow branch protection is enabled, we need to add the
+ // notrack prefix to the indirect branch.
+ // To do that we create an NT_BRIND SDNode.
+ // During ISel, the pattern converts it to a jmp with the NoTrack prefix.
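+ // For example, the branch is then emitted as "notrack jmpq *%rax" rather
+ // than a plain "jmpq *%rax".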
+ return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
+ }
+
+ return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
+}
+
/// This method query the target whether it is beneficial for dag combiner to
/// promote the specified node. If true, it should return the desired promotion
/// type by reference.
@@ -37842,22 +39661,30 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
if (VT != MVT::i16)
return false;
- bool Promote = false;
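+ // Helper: returns true when Op's only use is a store back to the same base
+ // pointer that Load reads from, i.e. the load-op-store sequence is a
+ // foldable read-modify-write.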
+ auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
+ if (!Op.hasOneUse())
+ return false;
+ SDNode *User = *Op->use_begin();
+ if (!ISD::isNormalStore(User))
+ return false;
+ auto *Ld = cast<LoadSDNode>(Load);
+ auto *St = cast<StoreSDNode>(User);
+ return Ld->getBasePtr() == St->getBasePtr();
+ };
+
bool Commute = false;
switch (Op.getOpcode()) {
- default: break;
+ default: return false;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
- Promote = true;
break;
case ISD::SHL:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
- if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+ if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
return false;
- Promote = true;
break;
}
case ISD::ADD:
@@ -37870,19 +39697,20 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
- if (!Commute && MayFoldLoad(N1))
- return false;
// Avoid disabling potential load folding opportunities.
- if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+ if (MayFoldLoad(N1) &&
+ (!Commute || !isa<ConstantSDNode>(N0) ||
+ (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
- if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+ if (MayFoldLoad(N0) &&
+ ((Commute && !isa<ConstantSDNode>(N1)) ||
+ (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
- Promote = true;
}
}
PVT = MVT::i32;
- return Promote;
+ return true;
}
bool X86TargetLowering::
@@ -38168,7 +39996,7 @@ TargetLowering::ConstraintWeight
LLVM_FALLTHROUGH;
case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
- ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
weight = CW_Register;
break;
case 'k':
@@ -38659,6 +40487,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return Res;
}
+ // Make sure it isn't a register that requires 64-bit mode.
+ if (!Subtarget.is64Bit() &&
+ (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
+ TRI->getEncodingValue(Res.first) >= 8) {
+ // Register requires REX prefix, but we're in 32-bit mode.
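+ // (For example, %r8 or %xmm8 cannot be encoded without a REX prefix.)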
+ Res.first = 0;
+ Res.second = nullptr;
+ return Res;
+ }
+
+ // Make sure it isn't a register that requires AVX512.
+ if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
+ TRI->getEncodingValue(Res.first) & 0x10) {
+ // Register requires EVEX prefix.
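+ // (For example, %xmm16-%xmm31 can only be encoded with an EVEX prefix.)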
+ Res.first = 0;
+ Res.second = nullptr;
+ return Res;
+ }
+
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
// turn into {ax},{dx}.
@@ -38727,7 +40574,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
// will take 2 allocations in the out of order engine instead of 1
// for plain addressing mode, i.e. inst (reg1).
// E.g.,
- // vaddps (%rsi,%drx), %ymm0, %ymm1
+ // vaddps (%rsi,%rdx), %ymm0, %ymm1
// Requires two allocations (one for the load, one for the computation)
// whereas:
// vaddps (%rsi), %ymm0, %ymm1
@@ -38822,7 +40669,8 @@ StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
- if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
+ if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
return "";
// We need a stack probe to conform to the Windows ABI. Choose the right
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
index 7820c3e032e5..32215b170a8c 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -75,6 +75,9 @@ namespace llvm {
///
CALL,
+ /// Same as call except it adds the NoTrack prefix.
+ NT_CALL,
+
/// This operation implements the lowering for readcyclecounter.
RDTSC_DAG,
@@ -122,6 +125,10 @@ namespace llvm {
/// or TEST instruction.
BRCOND,
+ /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+ /// operand 1 is the target address.
+ NT_BRIND,
+
/// Return with a flag operand. Operand 0 is the chain operand, operand
/// 1 is the number of bytes of stack to pop.
RET_FLAG,
@@ -304,9 +311,6 @@ namespace llvm {
// Vector FP round.
VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
- // Convert a vector to mask, set bits base on MSB.
- CVT2MASK,
-
// 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
@@ -332,8 +336,6 @@ namespace llvm {
// Vector integer comparisons.
PCMPEQ, PCMPGT,
- // Vector integer comparisons, the result is in a mask vector.
- PCMPEQM, PCMPGTM,
// v8i16 Horizontal minimum and position.
PHMINPOS,
@@ -343,7 +345,6 @@ namespace llvm {
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
- CMPMU,
// Vector comparison with rounding mode for FP values
CMPM_RND,
@@ -351,6 +352,9 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
+ // Bit field extract.
+ BEXTR,
+
// LOW, HI, FLAGS = umul LHS, RHS.
UMUL,
@@ -373,14 +377,13 @@ namespace llvm {
// Vector packed fp sign bitwise comparisons.
TESTP,
- // Vector "test" in AVX-512, the result is in a mask vector.
- TESTM,
- TESTNM,
-
// OR/AND test for masks.
KORTEST,
KTEST,
+ // ADD for masks.
+ KADD,
+
// Several flavors of instructions with vector shuffle behaviors.
// Saturated signed/unnsigned packing.
PACKSS,
@@ -405,8 +408,6 @@ namespace llvm {
MOVSLDUP,
MOVLHPS,
MOVHLPS,
- MOVLPS,
- MOVLPD,
MOVSD,
MOVSS,
UNPCKL,
@@ -424,10 +425,6 @@ namespace llvm {
// Res = VPERMV3 V0, MaskV, V1
VPERMV3,
- // 3-op Variable Permute overwriting the index (VPERMI2).
- // Res = VPERMIV3 V0, MaskV, V1
- VPERMIV3,
-
// Bitwise ternary logic.
VPTERNLOG,
// Fix Up Special Packed Float32/64 values.
@@ -502,22 +499,6 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
- // FMA4 specific scalar intrinsics bits that zero the non-scalar bits.
- FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,
-
- // Scalar intrinsic FMA.
- FMADDS1, FMADDS3,
- FNMADDS1, FNMADDS3,
- FMSUBS1, FMSUBS3,
- FNMSUBS1, FNMSUBS3,
-
- // Scalar intrinsic FMA with rounding mode.
- // Two versions, passthru bits on op1 or op3.
- FMADDS1_RND, FMADDS3_RND,
- FNMADDS1_RND, FNMADDS3_RND,
- FMSUBS1_RND, FMSUBS3_RND,
- FNMSUBS1_RND, FNMSUBS3_RND,
-
// Compress and expand.
COMPRESS,
EXPAND,
@@ -572,8 +553,13 @@ namespace llvm {
RDSEED,
// SSE42 string comparisons.
- PCMPISTRI,
- PCMPESTRI,
+ // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
+ // will emit one or two instructions based on which results are used. If
+ // both flags and index/mask are used, this allows us to use a single
+ // instruction since we won't have to pick an opcode for flags. Instead
+ // we can rely on the
+ // DAG to CSE everything and decide at isel.
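+ // For example, using the index result selects PCMPISTRI/PCMPESTRI, while
+ // using the mask result selects PCMPISTRM/PCMPESTRM.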
+ PCMPISTR,
+ PCMPESTR,
// Test if in transactional execution.
XTEST,
@@ -590,6 +576,9 @@ namespace llvm {
// LWP insert record.
LWPINS,
+ // User-level wait instructions.
+ UMWAIT, TPAUSE,
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
@@ -822,6 +811,28 @@ namespace llvm {
bool hasAndNotCompare(SDValue Y) const override;
+ bool hasAndNot(SDValue Y) const override;
+
+ bool preferShiftsToClearExtremeBits(SDValue Y) const override;
+
+ bool
+ shouldTransformSignedTruncationCheck(EVT XVT,
+ unsigned KeptBits) const override {
+ // For vectors, we don't have a preference.
+ if (XVT.isVector())
+ return false;
+
+ auto VTIsOk = [](EVT VT) -> bool {
+ return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64;
+ };
+
+ // We are OK with KeptBitsVT being byte/word/dword, which is what MOVSX supports.
+ // XVT will be larger than KeptBitsVT.
+ MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
+ return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
+ }
+
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
@@ -829,10 +840,18 @@ namespace llvm {
/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
MVT hasFastEqualityCompare(unsigned NumBits) const override;
+ /// Allow multiple load pairs per block for smaller and faster code.
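+ /// For example, a 16-byte memcmp()==0 can expand into two 8-byte load pairs
+ /// in a single block.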
+ unsigned getMemcmpEqZeroLoadsPerBlock() const override {
+ return 2;
+ }
+
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ TargetLoweringOpt &TLO) const override;
+
/// Determine which of the bits specified in Mask are known to be either
/// zero or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
@@ -913,7 +932,7 @@ namespace llvm {
/// the immediate into a register.
bool isLegalAddImmediate(int64_t Imm) const override;
- /// \brief Return the cost of the scaling factor used in the addressing
+ /// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
@@ -976,11 +995,10 @@ namespace llvm {
/// be legal.
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
- /// Similar to isShuffleMaskLegal. This is used by Targets can use this to
- /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
- /// replace a VAND with a constant pool entry.
- bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
- EVT VT) const override;
+ /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
+ /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
+ /// constant pool entry.
+ bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
/// Returns true if lowering to a jump table is allowed.
bool areJTsAllowed(const Function *Fn) const override;
@@ -1007,7 +1025,7 @@ namespace llvm {
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
- /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
@@ -1076,29 +1094,41 @@ namespace llvm {
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
- /// \brief Customize the preferred legalization strategy for certain types.
+ /// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+ bool hasVectorBlend() const override { return true; }
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- /// \brief Lower interleaved load(s) into target specific
+ /// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- /// \brief Lower interleaved store(s) into target specific
+ /// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
+ SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
+ SDValue Addr, SelectionDAG &DAG)
+ const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -1173,7 +1203,8 @@ namespace llvm {
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
+ unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
+ const unsigned char OpFlags = 0) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
@@ -1300,9 +1331,15 @@ namespace llvm {
MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ void emitSetJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
MachineBasicBlock *MBB) const;
@@ -1443,6 +1480,7 @@ namespace llvm {
const SDValue &getIndex() const { return getOperand(4); }
const SDValue &getMask() const { return getOperand(2); }
const SDValue &getValue() const { return getOperand(1); }
+ const SDValue &getScale() const { return getOperand(5); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MGATHER ||
diff --git a/contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
new file mode 100644
index 000000000000..7c00c9260d15
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -0,0 +1,121 @@
+//===---- X86IndirectBranchTracking.cpp - Enables CET IBT mechanism -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that enables Indirect Branch Tracking (IBT) as part
+// of Control-Flow Enforcement Technology (CET).
+// The pass adds ENDBR (End Branch) machine instructions at the beginning of
+// each basic block or function that is referenced by an indirect jump/call
+// instruction.
+// The ENDBR instructions have a NOP encoding and as such are ignored on
+// targets that do not support the CET IBT mechanism.
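+// For example, an address-taken function gets an ENDBR64 (or ENDBR32 in
+// 32-bit mode) inserted as the first instruction of its entry block.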
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-indirect-branch-tracking"
+
+static cl::opt<bool> IndirectBranchTracking(
+ "x86-indirect-branch-tracking", cl::init(false), cl::Hidden,
+ cl::desc("Enable X86 indirect branch tracking pass."));
+
+STATISTIC(NumEndBranchAdded, "Number of ENDBR instructions added");
+
+namespace {
+class X86IndirectBranchTrackingPass : public MachineFunctionPass {
+public:
+ X86IndirectBranchTrackingPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "X86 Indirect Branch Tracking";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ static char ID;
+
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII;
+
+ /// Endbr opcode for the current machine function.
+ unsigned int EndbrOpcode;
+
+ /// Adds a new ENDBR instruction to the beginning of the MBB.
+ /// The function will not add it if one already exists.
+ /// It will use the ENDBR32 or ENDBR64 opcode, depending on the target.
+ /// \returns true if the ENDBR was added and false otherwise.
+ bool addENDBR(MachineBasicBlock &MBB) const;
+};
+
+} // end anonymous namespace
+
+char X86IndirectBranchTrackingPass::ID = 0;
+
+FunctionPass *llvm::createX86IndirectBranchTrackingPass() {
+ return new X86IndirectBranchTrackingPass();
+}
+
+bool X86IndirectBranchTrackingPass::addENDBR(MachineBasicBlock &MBB) const {
+ assert(TII && "Target instruction info was not initialized");
+ assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) &&
+ "Unexpected Endbr opcode");
+
+ auto MI = MBB.begin();
+ // If the MBB is empty or the first instruction is not ENDBR,
+ // add the ENDBR instruction to the beginning of the MBB.
+ if (MI == MBB.end() || EndbrOpcode != MI->getOpcode()) {
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(EndbrOpcode));
+ NumEndBranchAdded++;
+ return true;
+ }
+
+ return false;
+}
+
+bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>();
+
+ // Check that the cf-protection-branch is enabled.
+ Metadata *isCFProtectionSupported =
+ MF.getMMI().getModule()->getModuleFlag("cf-protection-branch");
+ if (!isCFProtectionSupported && !IndirectBranchTracking)
+ return false;
+
+ // True if the current MF was changed and false otherwise.
+ bool Changed = false;
+
+ TII = SubTarget.getInstrInfo();
+ EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32;
+
+ // A non-internal function, or a function whose address was taken, can be
+ // reached through indirect calls. Mark its first BB with an ENDBR
+ // instruction unless the nocf_check attribute is used.
+ if ((MF.getFunction().hasAddressTaken() ||
+ !MF.getFunction().hasLocalLinkage()) &&
+ !MF.getFunction().doesNoCfCheck()) {
+ auto MBB = MF.begin();
+ Changed |= addENDBR(*MBB);
+ }
+
+ for (auto &MBB : MF)
+ // Find all basic blocks whose address was taken (for example, targets of
+ // an indirect jump) and add an ENDBR instruction.
+ if (MBB.hasAddressTaken())
+ Changed |= addENDBR(MBB);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
index 0d30b7d47f3e..46dc6bf7661a 100644
--- a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -12,109 +12,71 @@
//
//===----------------------------------------------------------------------===//
-let Sched = WriteFAdd in {
-def I3DNOW_FALU_ITINS : OpndItins<
- IIC_3DNOW_FALU_RR, IIC_3DNOW_FALU_RM
->;
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+ : I<o, F, outs, ins, asm, pat>, Requires<[Has3DNow]> {
}
-let Sched = WriteCvtF2I in {
-def I3DNOW_FCVT_F2I_ITINS : OpndItins<
- IIC_3DNOW_FCVT_F2I_RR, IIC_3DNOW_FCVT_F2I_RM
->;
-}
-
-let Sched = WriteCvtI2F in {
-def I3DNOW_FCVT_I2F_ITINS : OpndItins<
- IIC_3DNOW_FCVT_I2F_RR, IIC_3DNOW_FCVT_I2F_RM
->;
-}
-
-let Sched = WriteVecIMul in {
-def I3DNOW_MISC_FUNC_ITINS : OpndItins<
- IIC_3DNOW_MISC_FUNC_REG, IIC_3DNOW_MISC_FUNC_MEM
->;
-}
-
-let Sched = WriteShuffle in {
-def I3DNOW_PSHUF_ITINS : OpndItins<
- IIC_MMX_PSHUF, IIC_MMX_PSHUF
->;
-}
-
-class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat,
- InstrItinClass itin>
- : I<o, F, outs, ins, asm, pat, itin>, TB, Requires<[Has3DNow]> {
-}
-
-class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat,
- InstrItinClass itin>
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
: I3DNow<o, F, (outs VR64:$dst), ins,
- !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat, itin>,
- Has3DNow0F0FOpcode {
- // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
- let isAsmParserOnly = 1;
+ !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, ThreeDNow {
let Constraints = "$src1 = $dst";
}
-class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat,
- InstrItinClass itin>
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
: I3DNow<o, F, (outs VR64:$dst), ins,
- !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat, itin>,
- Has3DNow0F0FOpcode {
- // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
- let isAsmParserOnly = 1;
-}
+ !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, ThreeDNow;
-multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, OpndItins itins,
- bit Commutable = 0, string Ver = ""> {
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn,
+ X86FoldableSchedWrite sched, bit Commutable = 0,
+ string Ver = ""> {
let isCommutable = Commutable in
def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))],
- itins.rr>, Sched<[itins.Sched]>;
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, OpndItins itins,
- string Ver = ""> {
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn,
+ X86FoldableSchedWrite sched, string Ver = ""> {
def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))], itins.rr>,
- Sched<[itins.Sched]>;
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>,
+ Sched<[sched]>;
def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn))
- (bitconvert (load_mmx addr:$src))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", I3DNOW_MISC_FUNC_ITINS, 1>;
-defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", I3DNOW_FCVT_F2I_ITINS>;
-defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", I3DNOW_FALU_ITINS>;
-defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", I3DNOW_FALU_ITINS, 1>;
-defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", I3DNOW_FALU_ITINS, 1>;
-defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", I3DNOW_FALU_ITINS>;
-defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", I3DNOW_FALU_ITINS>;
-defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", I3DNOW_FALU_ITINS>;
-defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", I3DNOW_FALU_ITINS>;
-defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", I3DNOW_FALU_ITINS, 1>;
-defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", I3DNOW_FALU_ITINS>;
-defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", I3DNOW_FALU_ITINS>;
-defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", I3DNOW_FALU_ITINS>;
-defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", I3DNOW_FALU_ITINS>;
-defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", I3DNOW_FALU_ITINS>;
-defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", I3DNOW_FALU_ITINS, 1>;
-defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", I3DNOW_FALU_ITINS, 1>;
-defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", I3DNOW_FCVT_I2F_ITINS>;
-defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>;
-
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>;
+defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", WriteCvtPS2I>;
+defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", WriteFAdd>;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", WriteFAdd, 1>;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", WriteFAdd, 1>;
+defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", WriteFAdd>;
+defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", WriteFAdd>;
+defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", WriteFAdd>;
+defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", WriteFAdd>;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", WriteFAdd, 1>;
+defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", WriteFAdd>;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", WriteFAdd>;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", WriteFAdd>;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", WriteFAdd>;
+defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", WriteFAdd>;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", WriteFAdd, 1>;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>;
+defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
+
+let SchedRW = [WriteEMMS] in
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
- [(int_x86_mmx_femms)], IIC_MMX_EMMS>;
+ [(int_x86_mmx_femms)]>, TB;
// PREFETCHWT1 is supported we want to use it for everything but T0.
def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{
@@ -130,21 +92,20 @@ let SchedRW = [WriteLoad] in {
let Predicates = [Has3DNow, NoSSEPrefetch] in
def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
"prefetch\t$addr",
- [(prefetch addr:$addr, imm, imm, (i32 1))],
- IIC_SSE_PREFETCH>;
+ [(prefetch addr:$addr, imm, imm, (i32 1))]>, TB;
def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))],
- IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>;
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))]>,
+ TB, Requires<[HasPrefetchW]>;
def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))],
- IIC_SSE_PREFETCH>, TB, Requires<[HasPREFETCHWT1]>;
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))]>,
+ TB, Requires<[HasPREFETCHWT1]>;
}
// "3DNowA" instructions
-defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", I3DNOW_FCVT_F2I_ITINS, "a">;
-defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", I3DNOW_FCVT_I2F_ITINS, "a">;
-defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", I3DNOW_FALU_ITINS, 0, "a">;
-defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", I3DNOW_FALU_ITINS, 0, "a">;
-defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", I3DNOW_PSHUF_ITINS, "a">;
+defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", WriteCvtPS2I, "a">;
+defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", WriteCvtI2PS, "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", WriteFAdd, 0, "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", WriteFAdd, 0, "a">;
+defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", SchedWriteShuffle.MMX, "a">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
index 458f68072d6c..2d95061a8213 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -90,22 +90,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
!cast<ComplexPattern>("sse_load_f64"),
?));
- // The corresponding float type, e.g. v16f32 for v16i32
- // Note: For EltSize < 32, FloatVT is illegal and TableGen
- // fails to compile, so we choose FloatVT = VT
- ValueType FloatVT = !cast<ValueType>(
- !if (!eq (!srl(EltSize,5),0),
- VTName,
- !if (!eq(TypeVariantName, "i"),
- "v" # NumElts # "f" # EltSize,
- VTName)));
-
- ValueType IntVT = !cast<ValueType>(
- !if (!eq (!srl(EltSize,5),0),
- VTName,
- !if (!eq(TypeVariantName, "f"),
- "v" # NumElts # "i" # EltSize,
- VTName)));
// The string to specify embedded broadcast in assembly.
string BroadcastStr = "{1to" # NumElts # "}";
@@ -212,22 +196,22 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
list<dag> Pattern,
list<dag> MaskingPattern,
list<dag> ZeroMaskingPattern,
- InstrItinClass itin,
string MaskingConstraint = "",
bit IsCommutable = 0,
- bit IsKCommutable = 0> {
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
- Pattern, itin>;
+ Pattern>;
// Prefer over VMOV*rrk Pat<>
let isCommutable = IsKCommutable in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
- MaskingPattern, itin>,
+ MaskingPattern>,
EVEX_K {
// In case of the 3src subclass this is overridden with a let.
string Constraints = MaskingConstraint;
@@ -235,12 +219,11 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
// Zero mask does not add any restrictions to commute operands transformation.
// So, it is Ok to use IsCommutable instead of IsKCommutable.
- let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+ let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
- ZeroMaskingPattern,
- itin>,
+ ZeroMaskingPattern>,
EVEX_KZ;
}
@@ -252,19 +235,19 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- InstrItinClass itin,
SDNode Select = vselect,
string MaskingConstraint = "",
bit IsCommutable = 0,
- bit IsKCommutable = 0> :
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
- itin, MaskingConstraint, IsCommutable,
- IsKCommutable>;
+ MaskingConstraint, IsCommutable,
+ IsKCommutable, IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -274,7 +257,6 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskRHS,
- InstrItinClass itin,
bit IsCommutable = 0, bit IsKCommutable = 0,
SDNode Select = vselect> :
AVX512_maskable_custom<O, F, Outs, Ins,
@@ -286,7 +268,7 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
(Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
- itin, "$src0 = $dst", IsCommutable, IsKCommutable>;
+ "$src0 = $dst", IsCommutable, IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -295,15 +277,16 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
- InstrItinClass itin,
bit IsCommutable = 0, bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable,
SDNode Select = vselect> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (Select _.KRCWM:$mask, RHS, _.RC:$src0), itin,
- Select, "$src0 = $dst", IsCommutable, IsKCommutable>;
+ (Select _.KRCWM:$mask, RHS, _.RC:$src0),
+ Select, "$src0 = $dst", IsCommutable, IsKCommutable,
+ IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
@@ -311,10 +294,9 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
- InstrItinClass itin,
bit IsCommutable = 0> :
AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
- RHS, itin, IsCommutable, 0, X86selects>;
+ RHS, IsCommutable, 0, IsCommutable, X86selects>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -323,7 +305,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, InstrItinClass itin,
+ dag RHS,
bit IsCommutable = 0,
bit IsKCommutable = 0,
SDNode Select = vselect,
@@ -334,32 +316,60 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm,
!if(MaskOnly, (null_frag), RHS),
- (Select _.KRCWM:$mask, RHS, _.RC:$src1), itin,
+ (Select _.KRCWM:$mask, RHS, _.RC:$src1),
Select, "", IsCommutable, IsKCommutable>;
+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
+// operand differs from the output VT. This requires a bitconvert on
+// the preserved vector going into the vselect.
+// NOTE: The unmasked pattern is disabled.
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
+ X86VectorVTInfo InVT,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, bit IsCommutable = 0> :
+ AVX512_maskable_common<O, F, OutVT, Outs,
+ !con((ins InVT.RC:$src1), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, (null_frag),
+ (vselect InVT.KRCWM:$mask, RHS,
+ (bitconvert InVT.RC:$src1)),
+ vselect, "", IsCommutable>;
+
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, InstrItinClass itin,
+ dag RHS,
bit IsCommutable = 0,
bit IsKCommutable = 0,
bit MaskOnly = 0> :
AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm,
- IntelSrcAsm, RHS, itin, IsCommutable, IsKCommutable,
+ IntelSrcAsm, RHS, IsCommutable, IsKCommutable,
X86selects, MaskOnly>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- list<dag> Pattern,
- InstrItinClass itin> :
+ list<dag> Pattern> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
- itin, "$src0 = $dst">;
-
+ "$src0 = $dst">;
+
+multiclass AVX512_maskable_3src_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern> :
+ AVX512_maskable_custom<O, F, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
+ "">;
// Instruction with mask that puts result in mask register,
// like "compare" and "vptest"
@@ -370,18 +380,17 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
string AttSrcAsm, string IntelSrcAsm,
list<dag> Pattern,
list<dag> MaskingPattern,
- InstrItinClass itin,
bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
- Pattern, itin>;
+ Pattern>;
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
- MaskingPattern, itin>, EVEX_K;
+ MaskingPattern>, EVEX_K;
}
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
@@ -390,30 +399,27 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- InstrItinClass itin,
bit IsCommutable = 0> :
AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.KRC:$dst, RHS)],
- [(set _.KRC:$dst, MaskingRHS)], itin, IsCommutable>;
+ [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, InstrItinClass itin,
- bit IsCommutable = 0> :
+ dag RHS, bit IsCommutable = 0> :
AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (and _.KRCWM:$mask, RHS), itin, IsCommutable>;
+ (and _.KRCWM:$mask, RHS), IsCommutable>;
multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm,
- InstrItinClass itin> :
+ string AttSrcAsm, string IntelSrcAsm> :
AVX512_maskable_custom_cmp<O, F, Outs,
Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
- AttSrcAsm, IntelSrcAsm, [],[], itin>;
+ AttSrcAsm, IntelSrcAsm, [], []>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -422,7 +428,6 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskedRHS,
- InstrItinClass itin,
bit IsCommutable = 0, SDNode Select = vselect> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
@@ -434,12 +439,12 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskedRHS,
_.ImmAllZerosV))],
- itin, "$src0 = $dst", IsCommutable>;
+ "$src0 = $dst", IsCommutable>;
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
-// swizzled by ExecutionDepsFix to pxor.
+// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -494,7 +499,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vinsert_insert,
SDPatternOperator vinsert_for_mask,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
@@ -505,8 +510,8 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
- (iPTR imm)), itins.rr>,
- AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>;
+ (iPTR imm))>,
+ AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in
defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
@@ -517,9 +522,9 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT (bitconvert (From.LdFrag addr:$src2))),
- (iPTR imm)), itins.rm>, AVX512AIi8Base, EVEX_4V,
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
EVEX_CD8<From.EltSize, From.CD8TupleForm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -527,8 +532,8 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vinsert_insert,
- OpndItins itins> :
- vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, itins>;
+ X86FoldableSchedWrite sched> :
+ vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, sched>;
multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vinsert_insert,
@@ -552,60 +557,51 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
ValueType EltVT64, int Opcode256,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
- vinsert128_insert, itins>, EVEX_V256;
+ vinsert128_insert, sched>, EVEX_V256;
defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo<16, EltVT32, VR512>,
- vinsert128_insert, itins>, EVEX_V512;
+ vinsert128_insert, sched>, EVEX_V512;
defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- vinsert256_insert, itins>, VEX_W, EVEX_V512;
+ vinsert256_insert, sched>, VEX_W, EVEX_V512;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasVLX, HasDQI] in
defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
- null_frag, vinsert128_insert, itins>,
- VEX_W, EVEX_V256;
+ null_frag, vinsert128_insert, sched>,
+ VEX_W1X, EVEX_V256;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- null_frag, vinsert128_insert, itins>,
+ null_frag, vinsert128_insert, sched>,
VEX_W, EVEX_V512;
defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo<16, EltVT32, VR512>,
- null_frag, vinsert256_insert, itins>,
+ null_frag, vinsert256_insert, sched>,
EVEX_V512;
}
}
-// FIXME: Is there a better scheduler itinerary for VINSERTF/VINSERTI?
-let Sched = WriteFShuffle256 in
-def AVX512_VINSERTF : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-let Sched = WriteShuffle256 in
-def AVX512_VINSERTI : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, AVX512_VINSERTF>;
-defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, AVX512_VINSERTI>;
+// FIXME: Is there a better scheduler class for VINSERTF/VINSERTI?
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, WriteFShuffle256>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, WriteShuffle256>;
// Codegen pattern with the alternative types,
// Even with AVX512DQ we'll still use these for unmasked operations.
@@ -778,15 +774,16 @@ let ExeDomain = SSEPackedSingle in {
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))],
- IIC_SSE_INSERTPS_RR>, EVEX_4V, Sched<[WriteFShuffle]>;
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))], IIC_SSE_INSERTPS_RM>, EVEX_4V,
- EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ EVEX_4V, EVEX_CD8<32, CD8VT1>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}
//===----------------------------------------------------------------------===//
@@ -799,7 +796,7 @@ multiclass vextract_for_size_split<int Opcode,
X86VectorVTInfo From, X86VectorVTInfo To,
SDPatternOperator vextract_extract,
SDPatternOperator vextract_for_mask,
- OpndItins itins> {
+ SchedWrite SchedRR, SchedWrite SchedMR> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst),
@@ -807,8 +804,8 @@ multiclass vextract_for_size_split<int Opcode,
"vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
(vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)),
- (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm)),
- itins.rr>, AVX512AIi8Base, EVEX, Sched<[itins.Sched]>;
+ (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>,
+ AVX512AIi8Base, EVEX, Sched<[SchedRR]>;
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
@@ -816,8 +813,8 @@ multiclass vextract_for_size_split<int Opcode,
"\t{$idx, $src1, $dst|$dst, $src1, $idx}",
[(store (To.VT (vextract_extract:$idx
(From.VT From.RC:$src1), (iPTR imm))),
- addr:$dst)], itins.rm>, EVEX,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ addr:$dst)]>, EVEX,
+ Sched<[SchedMR]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
@@ -825,9 +822,8 @@ multiclass vextract_for_size_split<int Opcode,
From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst {${mask}}|"
- "$dst {${mask}}, $src1, $idx}",
- [], itins.rm>, EVEX_K, EVEX,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, $idx}", []>,
+ EVEX_K, EVEX, Sched<[SchedMR]>, NotMemoryFoldable;
}
}
@@ -835,8 +831,8 @@ multiclass vextract_for_size_split<int Opcode,
multiclass vextract_for_size<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vextract_extract,
- OpndItins itins> :
- vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, itins>;
+ SchedWrite SchedRR, SchedWrite SchedMR> :
+ vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, SchedRR, SchedMR>;
// Codegen pattern for the alternative types
multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
@@ -856,24 +852,24 @@ multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
ValueType EltVT64, int Opcode256,
- OpndItins itins> {
+ SchedWrite SchedRR, SchedWrite SchedMR> {
let Predicates = [HasAVX512] in {
defm NAME # "32x4Z" : vextract_for_size<Opcode128,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract, itins>,
+ vextract128_extract, SchedRR, SchedMR>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm NAME # "64x4Z" : vextract_for_size<Opcode256,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
- vextract256_extract, itins>,
+ vextract256_extract, SchedRR, SchedMR>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
}
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract, itins>,
+ vextract128_extract, SchedRR, SchedMR>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
// Even with DQI we'd like to only use these instructions for masking.
@@ -881,36 +877,27 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- null_frag, vextract128_extract, itins>,
- VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ null_frag, vextract128_extract, SchedRR, SchedMR>,
+ VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- null_frag, vextract128_extract, itins>,
+ null_frag, vextract128_extract, SchedRR, SchedMR>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
- null_frag, vextract256_extract, itins>,
+ null_frag, vextract256_extract, SchedRR, SchedMR>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
}
-// FIXME: Is there a better scheduler itinerary for VEXTRACTF/VEXTRACTI?
-let Sched = WriteFShuffle256 in
-def AVX512_VEXTRACTF : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-let Sched = WriteShuffle256 in
-def AVX512_VEXTRACTI : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, AVX512_VEXTRACTF>;
-defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, AVX512_VEXTRACTI>;
+// TODO - replace WriteFStore/WriteVecStore with X86SchedWriteMoveLSWidths types.
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, WriteFShuffle256, WriteFStore>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, WriteShuffle256, WriteVecStore>;
// extract_subvector codegen patterns with the alternative types.
// Even with AVX512DQ we'll still use these for unmasked operations.
@@ -1116,41 +1103,43 @@ defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
(ins VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))],
- IIC_SSE_EXTRACTPS_RR>, EVEX, VEX_WIG, Sched<[WriteFShuffle]>;
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ EVEX, VEX_WIG, Sched<[WriteVecExtract]>;
def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
(ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
- addr:$dst)], IIC_SSE_EXTRACTPS_RM>,
- EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd]>;
+ addr:$dst)]>,
+ EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecExtractSt]>;
//===---------------------------------------------------------------------===//
// AVX-512 BROADCAST
//---
// broadcast with a scalar argument.
multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
+ string Name,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#r)
- (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#r)
+ (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast SrcInfo.FRC:$src),
DestInfo.RC:$src0)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#rk)
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
- (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast SrcInfo.FRC:$src),
DestInfo.ImmAllZerosV)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz)
- DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rkz)
+ DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
}
// Split version to allow mask and broadcast node to be different types. This
// helps support the 32x2 broadcasts.
multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
+ string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo MaskInfo,
X86VectorVTInfo DestInfo,
@@ -1167,8 +1156,8 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
- NoItinerary>, T8PD, EVEX, Sched<[SchedRR]>;
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>,
+ T8PD, EVEX, Sched<[SchedRR]>;
let mayLoad = 1 in
defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo,
(outs MaskInfo.RC:$dst),
@@ -1180,8 +1169,8 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(MaskInfo.VT
(bitconvert
(DestInfo.VT (X86VBroadcast
- (SrcInfo.ScalarLdFrag addr:$src))))),
- NoItinerary>, T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
+ (SrcInfo.ScalarLdFrag addr:$src)))))>,
+ T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
Sched<[SchedRM]>;
}
@@ -1190,7 +1179,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(DestInfo.VT (UnmaskedOp
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src))))))),
- (!cast<Instruction>(NAME#MaskInfo.ZSuffix#m) addr:$src)>;
+ (!cast<Instruction>(Name#MaskInfo.ZSuffix#m) addr:$src)>;
def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
(bitconvert
(DestInfo.VT
@@ -1198,7 +1187,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))))),
MaskInfo.RC:$src0)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#mk)
MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>;
def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
(bitconvert
@@ -1207,62 +1196,64 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))))),
MaskInfo.ImmAllZerosV)),
- (!cast<Instruction>(NAME#MaskInfo.ZSuffix#mkz)
+ (!cast<Instruction>(Name#MaskInfo.ZSuffix#mkz)
MaskInfo.KRCWM:$mask, addr:$src)>;
}
// Helper class to force mask and broadcast result to same type.
-multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo> :
- avx512_broadcast_rm_split<opc, OpcodeStr, SchedRR, SchedRM,
+ avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM,
DestInfo, DestInfo, SrcInfo>;
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
- let Predicates = [HasAVX512] in
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
- EVEX_V512;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
+ _.info128>,
+ EVEX_V512;
+ }
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
- EVEX_V256;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
+ _.info128>,
+ EVEX_V256;
}
}
multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
- let Predicates = [HasAVX512] in
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
- EVEX_V512;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
+ _.info128>,
+ EVEX_V512;
+ }
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
- EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
+ _.info128>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info128, _.info128>,
- avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>,
- EVEX_V128;
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128,
+ _.info128>,
+ EVEX_V128;
}
}
defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
avx512vl_f32_info>;
defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
- avx512vl_f64_info>, VEX_W;
-
-def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
- (VBROADCASTSSZm addr:$src)>;
-def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
- (VBROADCASTSDZm addr:$src)>;
+ avx512vl_f64_info>, VEX_W1X;
multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
@@ -1271,7 +1262,7 @@ multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins SrcRC:$src),
"vpbroadcast"##_.Suffix, "$src", "$src",
- (_.VT (OpNode SrcRC:$src)), NoItinerary>, T8PD, EVEX,
+ (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX,
Sched<[SchedRR]>;
}
@@ -1284,7 +1275,7 @@ multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite Sched
!con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
!con((ins _.KRCWM:$mask), (ins GR32:$src)),
"vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
- NoItinerary, "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
+ "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
def : Pat <(_.VT (OpNode SrcRC:$src)),
(!cast<Instruction>(Name#r)
@@ -1337,37 +1328,34 @@ defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
X86VBroadcast, GR64, HasAVX512>, VEX_W;
-def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
- (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
-def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
- (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
-
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
-multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
- X86VectorVTInfo SrcInfo> {
+multiclass avx512_int_broadcast_rm_lowering<string Name,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo ExtInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r")
- (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>;
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#"r")
+ (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>;
}
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd> {
let Predicates = [prd] in {
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _.info512, _.info128>,
- avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
+ avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info256, _.info128>,
EVEX_V512;
// Defined separately to avoid redefinition.
- defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info512, _.info128>;
}
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _.info256, _.info128>,
- avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
+ avx512_int_broadcast_rm_lowering<NAME, _.info256, _.info256, _.info128>,
EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle,
- WriteShuffleLd, _.info128, _.info128>,
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle,
+ WriteShuffleXLd, _.info128, _.info128>,
EVEX_V128;
}
}
@@ -1379,16 +1367,16 @@ defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
avx512vl_i32_info, HasAVX512>;
defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
- avx512vl_i64_info, HasAVX512>, VEX_W;
+ avx512vl_i64_info, HasAVX512>, VEX_W1X;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(_Dst.VT (X86SubVBroadcast
- (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))),
- NoItinerary>, AVX5128IBase, EVEX,
- Sched<[WriteShuffleLd]>;
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ Sched<[SchedWriteShuffle.YMM.Folded]>,
+ AVX5128IBase, EVEX;
}
// This should be used for the AVX512DQ broadcast instructions. It disables
@@ -1401,9 +1389,9 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(null_frag),
(_Dst.VT (X86SubVBroadcast
- (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))),
- NoItinerary>, AVX5128IBase, EVEX,
- Sched<[WriteShuffleLd]>;
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ Sched<[SchedWriteShuffle.YMM.Folded]>,
+ AVX5128IBase, EVEX;
}
let Predicates = [HasAVX512] in {
@@ -1490,6 +1478,41 @@ def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v16f32 (v16i32 immAllZerosV))),
+ (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
+ (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ (bc_v8f64 (v16i32 immAllZerosV))),
+ (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+ (bc_v8i64 (v16i32 immAllZerosV))),
+ (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+ VR512:$src0),
+ (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
let Predicates = [HasVLX] in {
@@ -1509,6 +1532,25 @@ def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4Z256rm addr:$src)>;
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v8f32 (v8i32 immAllZerosV))),
+ (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR256X:$src0),
+ (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v8i32 immAllZerosV)),
+ (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR256X:$src0),
+ (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+
+
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
@@ -1533,11 +1575,29 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
- v4i64x_info, v2i64x_info>, VEX_W,
+ v4i64x_info, v2i64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
- v4f64x_info, v2f64x_info>, VEX_W,
+ v4f64x_info, v2f64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v4f64 (v8i32 immAllZerosV))),
+ (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR256X:$src0),
+ (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ (bc_v4i64 (v8i32 immAllZerosV))),
+ (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ VR256X:$src0),
+ (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
}
let Predicates = [HasDQI] in {
@@ -1553,17 +1613,52 @@ defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ (bc_v16f32 (v16i32 immAllZerosV))),
+ (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
+ (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v8f64 (v16i32 immAllZerosV))),
+ (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR512:$src0),
+ (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ (bc_v8i64 (v16i32 immAllZerosV))),
+ (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+ VR512:$src0),
+ (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
let Predicates = [HasDQI] in
- defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
+ defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info512,
_Src.info512, _Src.info128, null_frag>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
- defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
+ defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info256,
_Src.info256, _Src.info128, null_frag>,
EVEX_V256;
@@ -1574,8 +1669,8 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
let Predicates = [HasDQI, HasVLX] in
- defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle,
- WriteShuffleLd, _Dst.info128,
+ defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle,
+ WriteShuffleXLd, _Dst.info128,
_Src.info128, _Src.info128, null_frag>,
EVEX_V128;
}
@@ -1587,20 +1682,20 @@ defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
let Predicates = [HasVLX] in {
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZ256r (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZ256r (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
}
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
- (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+ (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>;
def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
- (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+ (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>;
def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+ (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
@@ -1609,8 +1704,8 @@ multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, RegisterClass KRC> {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))],
- IIC_SSE_PSHUF_RI>, EVEX, Sched<[WriteShuffle]>;
+ [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>,
+ EVEX, Sched<[WriteShuffle]>;
}
multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
@@ -1630,111 +1725,146 @@ defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
//===----------------------------------------------------------------------===//
// -- VPERMI2 - 3 source operands form --
-
-let Sched = WriteFShuffle256 in
-def AVX512_PERM2_F : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
-let Sched = WriteShuffle256 in
-def AVX512_PERM2_I : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
-let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
- // The index operand in the pattern should really be an integer type. However,
- // if we do that and it happens to come from a bitcast, then it becomes
- // difficult to find the bitcast needed to convert the index to the
- // destination type for the passthru since it will be folded with the bitcast
- // of the index operand.
- defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0 in {
+ defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)),
- itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>;
+ (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched]>;
- defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ let mayLoad = 1 in
+ defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
- (_.VT (bitconvert (_.LdFrag addr:$src3))))), itins.rm, 1>,
- EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
+ (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
- defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0, mayLoad = 1 in
+ defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
- (_.VT (X86VPermi2X _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
- itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (X86VPermt2 _.RC:$src2,
+ IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo> {
- defm NAME: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>,
- avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>,
- avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>,
- avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
}
}
multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
- OpndItins itins,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx,
Predicate Prd> {
let Predicates = [Prd] in
- defm NAME: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+ defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
}
}
-defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", AVX512_PERM2_I,
- avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", AVX512_PERM2_I,
- avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", AVX512_PERM2_I,
- avx512vl_i16_info, HasBWI>,
+defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", WriteVarShuffle256,
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", WriteVarShuffle256,
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", WriteVarShuffle256,
+ avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", AVX512_PERM2_I,
- avx512vl_i8_info, HasVBMI>,
+defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", WriteVarShuffle256,
+ avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
-defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", AVX512_PERM2_F,
- avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", AVX512_PERM2_F,
- avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", WriteFVarShuffle256,
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256,
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Extra patterns to deal with extra bitcasts due to passthru and index being
+// different types on the fp versions.
+multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
+ X86VectorVTInfo IdxVT,
+ X86VectorVTInfo CastVT> {
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86VPermt2 (_.VT _.RC:$src2),
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), _.RC:$src3),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
+ (_.LdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3)>;
+}
+
+// TODO: Should we add more casts? The vXi64 case is common due to ABI.
+defm : avx512_perm_i_lowering<"VPERMI2PS", v16f32_info, v16i32_info, v8i64_info>;
+defm : avx512_perm_i_lowering<"VPERMI2PS256", v8f32x_info, v8i32x_info, v4i64x_info>;
+defm : avx512_perm_i_lowering<"VPERMI2PS128", v4f32x_info, v4i32x_info, v2i64x_info>;
// VPERMT2
-multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)),
- itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>;
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched]>;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
- (bitconvert (_.LdFrag addr:$src3)))), itins.rm, 1>,
- EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_.LdFrag addr:$src3)))), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -1742,167 +1872,176 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
- IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
- itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
AVX512VLVectorVTInfo ShuffleMask> {
- defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512,
+ defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512,
ShuffleMask.info512>,
- avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info512,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info512,
ShuffleMask.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128,
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128,
ShuffleMask.info128>,
- avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info128,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info128,
ShuffleMask.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256,
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256,
ShuffleMask.info256>,
- avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info256,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info256,
ShuffleMask.info256>, EVEX_V256;
}
}
-multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo,
- AVX512VLVectorVTInfo Idx,
- Predicate Prd> {
+multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx, Predicate Prd> {
let Predicates = [Prd] in
- defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512,
+ defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512,
Idx.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
- defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128,
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128,
Idx.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256,
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256,
Idx.info256>, EVEX_V256;
}
}
-defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", AVX512_PERM2_I,
+defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", WriteVarShuffle256,
avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", AVX512_PERM2_I,
+defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", WriteVarShuffle256,
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", AVX512_PERM2_I,
+defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", WriteVarShuffle256,
avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", AVX512_PERM2_I,
+defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", WriteVarShuffle256,
avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
-defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", AVX512_PERM2_F,
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", WriteFVarShuffle256,
avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", AVX512_PERM2_F,
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", WriteFVarShuffle256,
avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
-let Sched = WriteFVarBlend in
-def AVX512_BLENDM : OpndItins<
- IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
->;
-
-let Sched = WriteVarBlend in
-def AVX512_PBLENDM : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
+multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>,
+ EVEX_4V, Sched<[sched]>;
def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ []>, EVEX_4V, EVEX_K, Sched<[sched]>;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- [], itins.rr>, EVEX_4V, EVEX_KZ, Sched<[itins.Sched]>;
+ []>, EVEX_4V, EVEX_KZ, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- [], itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [], itins.rm>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- [], itins.rm>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
}
}
}
-multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _> {
+multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
+
+ def rmbkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- [], itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass blendmask_dq <bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo> {
- defm Z : avx512_blendmask <opc, OpcodeStr, itins, VTInfo.info512>,
- avx512_blendmask_rmb <opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+multiclass blendmask_dq<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>,
- avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>,
- avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
+ defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
}
}
-multiclass blendmask_bw <bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo> {
+multiclass blendmask_bw<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasBWI] in
- defm Z : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
+ defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
- defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
+ defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
}
}
-
-defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", AVX512_BLENDM, avx512vl_f32_info>;
-defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", AVX512_BLENDM, avx512vl_f64_info>, VEX_W;
-defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", AVX512_PBLENDM, avx512vl_i32_info>;
-defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", AVX512_PBLENDM, avx512vl_i64_info>, VEX_W;
-defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", AVX512_PBLENDM, avx512vl_i8_info>;
-defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", AVX512_PBLENDM, avx512vl_i16_info>, VEX_W;
-
+defm VBLENDMPS : blendmask_dq<0x65, "vblendmps", SchedWriteFVarBlend,
+ avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq<0x65, "vblendmpd", SchedWriteFVarBlend,
+ avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq<0x64, "vpblendmd", SchedWriteVarBlend,
+ avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq<0x64, "vpblendmq", SchedWriteVarBlend,
+ avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw<0x66, "vpblendmb", SchedWriteVarBlend,
+ avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend,
+ avx512vl_i16_info>, VEX_W;
//===----------------------------------------------------------------------===//
// Compare Instructions
@@ -1911,7 +2050,7 @@ defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", AVX512_PBLENDM, avx512vl_i16_i
// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
@@ -1919,7 +2058,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- imm:$cc), itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ imm:$cc)>, EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -1927,8 +2066,8 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc), itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
@@ -1938,31 +2077,31 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_4V, EVEX_B, Sched<[sched]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs VK1:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>, EVEX_4V,
- Sched<[itins.Sched]>;
+ "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V,
+ Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", itins.rr>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
+ "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
+ EVEX_4V, EVEX_B, Sched<[sched]>, NotMemoryFoldable;
}// let isAsmParserOnly = 1, hasSideEffects = 0
let isCodeGenOnly = 1 in {
@@ -1973,8 +2112,8 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
- imm:$cc))],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ imm:$cc))]>,
+ EVEX_4V, Sched<[sched]>;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
@@ -1982,43 +2121,44 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
- imm:$cc))],
- itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc))]>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
let Predicates = [HasAVX512] in {
let ExeDomain = SSEPackedSingle in
defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd,
- SSE_ALU_F32S>, AVX512XSIi8Base;
+ SchedWriteFCmp.Scl>, AVX512XSIi8Base;
let ExeDomain = SSEPackedDouble in
defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd,
- SSE_ALU_F64S>, AVX512XDIi8Base, VEX_W;
+ SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
}
-multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _, bit IsCommutable> {
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))]>,
+ EVEX_4V, Sched<[sched]>;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2)))))],
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
let isCommutable = IsCommutable in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
- itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched]>;
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
@@ -2026,20 +2166,21 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert
- (_.LdFrag addr:$src2))))))],
- itins.rm>, EVEX_4V, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.LdFrag addr:$src2))))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _, bit IsCommutable> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, _, IsCommutable> {
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ bit IsCommutable> :
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, sched, _, IsCommutable> {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
- itins.rm>, EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
@@ -2049,112 +2190,110 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))],
- itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))))]>,
+ EVEX_4V, EVEX_K, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd, bit IsCommutable = 0> {
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd,
+ bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info512,
- IsCommutable>, EVEX_V512;
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.ZMM,
+ VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info256,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info128,
- IsCommutable>, EVEX_V128;
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.YMM,
+ VTInfo.info256, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.XMM,
+ VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ PatFrag OpNode, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512,
- IsCommutable>, EVEX_V512;
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.ZMM,
+ VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128,
- IsCommutable>, EVEX_V128;
- }
-}
-
-// FIXME: Is there a better scheduler itinerary for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i8_info, HasBWI, 1>,
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.YMM,
+ VTInfo.info256, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.XMM,
+ VTInfo.info128, IsCommutable>, EVEX_V128;
+ }
+}
+
+// This fragment treats X86cmpm as commutable to help match loads in both
+// operands for PCMPEQ.
+def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>;
+def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
+ (X86setcc_commute node:$src1, node:$src2, SETEQ)>;
+def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
+ (setcc node:$src1, node:$src2, SETGT)>;
+
+// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+// increase the pattern complexity the way an immediate would.
+let AddedComplexity = 2 in {
+// FIXME: Is there a better scheduler class for VPCMP?
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i16_info, HasBWI, 1>,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i32_info, HasAVX512, 1>,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
- SSE_ALU_F32P, avx512vl_i64_info, HasAVX512, 1>,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c,
+ SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i8_info, HasBWI>,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i16_info, HasBWI>,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i32_info, HasAVX512>,
+ SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
- SSE_ALU_F32P, avx512vl_i64_info, HasAVX512>,
+ SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+}
-// Transforms to swizzle an immediate to help matching memory operand in first
-// operand.
-def CommutePCMPCC : SDNodeXForm<imm, [{
- uint8_t Imm = N->getZExtValue() & 0x7;
- switch (Imm) {
- default: llvm_unreachable("Unreachable!");
- case 0x01: Imm = 0x06; break; // LT -> NLE
- case 0x02: Imm = 0x05; break; // LE -> NLT
- case 0x05: Imm = 0x02; break; // NLT -> LE
- case 0x06: Imm = 0x01; break; // NLE -> LT
- case 0x00: // EQ
- case 0x03: // FALSE
- case 0x04: // NE
- case 0x07: // TRUE
- break;
- }
- return getI8Imm(Imm, SDLoc(N));
-}]>;
-
-multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> {
let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc))],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond)))]>,
+ EVEX_4V, Sched<[sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc))],
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set _.KRC:$dst, (_.KVT
+ (Frag:$cc
+ (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ cond)))]>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
@@ -2163,9 +2302,10 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
"\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)))],
- itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
AVX512ICC:$cc),
@@ -2173,69 +2313,74 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
"\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc)))],
- itins.rm>, EVEX_4V, EVEX_K,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.KVT
+ (Frag:$cc
+ (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))),
+ cond))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"),
- [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ "$dst, $src1, $src2, $cc}"), []>,
+ EVEX_4V, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"),
- [], itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst, $src1, $src2, $cc}"), []>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"),
- [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
+ "$dst {${mask}}, $src1, $src2, $cc}"), []>,
+ EVEX_4V, EVEX_K, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"),
- [], itins.rm>, EVEX_4V, EVEX_K,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, $src2, $cc}"), []>,
+ EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
- def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi")
+ _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
- def : Pat<(and _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask,
- _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(and _.KRCWM:$mask,
+ (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond))),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik")
+ _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+ (CommFrag.OperandTransform $cc))>;
}
-multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> :
- avx512_icmp_cc<opc, Suffix, OpNode, itins, _> {
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> :
+ avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- imm:$cc))],
- itins.rm>, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set _.KRC:$dst, (_.KVT (Frag:$cc
+ (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ cond)))]>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, AVX512ICC:$cc),
@@ -2243,11 +2388,12 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- imm:$cc)))],
- itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.KVT (Frag:$cc
+ (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ cond))))]>,
+ EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
@@ -2256,99 +2402,142 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
- [], itins.rm>, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
- [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
+ EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
- def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmib")
+ _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
- def : Pat<(and _.KRCWM:$mask, (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1), imm:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmibk") _.KRCWM:$mask,
- _.RC:$src1, addr:$src2,
- (CommutePCMPCC imm:$cc))>;
+ def : Pat<(and _.KRCWM:$mask,
+ (_.KVT (CommFrag:$cc (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1), cond))),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmibk")
+ _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+ (CommFrag.OperandTransform $cc))>;
}
-multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd> {
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info512>,
- EVEX_V512;
+ defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM,
+ VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM,
+ VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM,
+ VTInfo.info128, NAME>, EVEX_V128;
}
}
-multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd> {
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag CommFrag, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info512>,
- EVEX_V512;
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM,
+ VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM,
+ VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM,
+ VTInfo.info128, NAME>, EVEX_V128;
}
}
-// FIXME: Is there a better scheduler itinerary for VPCMP/VPCMPU?
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SSE_ALU_F32P,
- avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+def X86pcmpm_imm : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
+
+// Swapped operand version of the above.
+def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ SSECC = X86::getSwappedVPCMPImm(SSECC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SSE_ALU_F32P,
- avx512vl_i16_info, HasBWI>,
+def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
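// Illustrative note (assumes the standard 3-bit VPCMP predicate encoding;
// the exact table lives in X86::getVPCMPImmForCond): the SDNodeXForms above
// turn the ISD::CondCode operand into the immediate roughly as
//   SETEQ -> 0, SETLT -> 1, SETLE -> 2, SETNE -> 4, SETGE -> 5, SETGT -> 6,
// with the unsigned condition codes mapping to the same slots (signedness is
// carried by choosing VPCMP vs. VPCMPU). X86pcmpm_imm_commute additionally
// swaps the predicate for reversed operands (1 <-> 6, 2 <-> 5; EQ and NE are
// unchanged), which is what makes the *_commute fragments safe to use when
// folding a load into the second source operand.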
+// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i16_info, HasBWI>,
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SSE_ALU_F32P,
- avx512vl_i32_info, HasAVX512>,
- EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i32_info, HasAVX512>,
- EVEX_CD8<32, CD8VF>;
-
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SSE_ALU_F32P,
- avx512vl_i64_info, HasAVX512>,
- VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SSE_ALU_F32P,
- avx512vl_i64_info, HasAVX512>,
- VEX_W, EVEX_CD8<64, CD8VF>;
-
-
-multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute,
+ SchedWriteVecALU, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute,
+ SchedWriteVecALU, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- imm:$cc), itins.rr, 1>,
- Sched<[itins.Sched]>;
+ imm:$cc), 1>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
@@ -2356,8 +2545,8 @@ multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc)>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -2367,63 +2556,65 @@ multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {
"$src1, ${src2}"##_.BroadcastStr,
(X86cmpm (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc), itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ imm:$cc)>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>,
- Sched<[itins.Sched]>;
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
+ Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in {
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
+ Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $cc", itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "$src1, ${src2}"##_.BroadcastStr##", $cc">,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
}
// Patterns for selecting with loads in other operand.
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
CommutableCMPCC:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
imm:$cc)>;
def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2),
(_.VT _.RC:$src1),
CommutableCMPCC:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1), CommutableCMPCC:$cc),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
imm:$cc)>;
def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast
(_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1),
CommutableCMPCC:$cc)),
- (!cast<Instruction>(NAME#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
}
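// Note on the swapped-load patterns above: they are gated on CommutableCMPCC,
// i.e. only FP predicates that are genuinely symmetric (EQ, NE, ORD, UNORD
// and, presumably, their quiet/signaling variants) may have their operands
// exchanged; unlike the integer compares, there is no general immediate
// rewrite here for a reversed FP comparison.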
-multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...])
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
@@ -2432,8 +2623,8 @@ multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> {
(X86cmpmRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
@@ -2441,29 +2632,28 @@ multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> {
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
- "$src1, $src2, {sae}, $cc", itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ "$src1, $src2, {sae}, $cc">,
+ EVEX_B, Sched<[sched]>, NotMemoryFoldable;
}
}
-multiclass avx512_vcmp<OpndItins itins, AVX512VLVectorVTInfo _> {
+multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcmp_common<itins, _.info512>,
- avx512_vcmp_sae<itins, _.info512>, EVEX_V512;
+ defm Z : avx512_vcmp_common<sched.ZMM, _.info512, NAME>,
+ avx512_vcmp_sae<sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [HasAVX512,HasVLX] in {
- defm Z128 : avx512_vcmp_common<itins, _.info128>, EVEX_V128;
- defm Z256 : avx512_vcmp_common<itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vcmp_common<sched.XMM, _.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_vcmp_common<sched.YMM, _.info256, NAME>, EVEX_V256;
}
}
-defm VCMPPD : avx512_vcmp<SSE_ALU_F64P, avx512vl_f64_info>,
+defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>,
AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-defm VCMPPS : avx512_vcmp<SSE_ALU_F32P, avx512vl_f32_info>,
+defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-
// Patterns to select fp compares with load as first operand.
let Predicates = [HasAVX512] in {
def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1,
@@ -2480,39 +2670,39 @@ let Predicates = [HasAVX512] in {
//handle fpclass instruction mask = op(reg_scalar,imm)
// op(mem_scalar,imm)
multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
Predicate prd> {
let Predicates = [prd], ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))))], itins.rr>,
- EVEX_K, Sched<[itins.Sched]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
(OpNode _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
(OpNode _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2))))], itins.rm>,
- EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -2520,39 +2710,39 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
// fpclass(reg_vec, mem_vec, imm)
// fpclass(reg_vec, broadcast(eltVt), imm)
multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
string mem, string broadcast>{
let ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))))], itins.rr>,
- EVEX_K, Sched<[itins.Sched]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))))], itins.rm>,
- EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))))]>,
+ EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
@@ -2561,56 +2751,58 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst,(OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)))], itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2)))]>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
_.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst,(and _.KRCWM:$mask, (OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))))], itins.rm>,
- EVEX_B, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))))]>,
+ EVEX_B, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
bits<8> opc, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
string broadcast>{
let Predicates = [prd] in {
- defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.ZMM,
_.info512, "{z}", broadcast>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.XMM,
_.info128, "{x}", broadcast>, EVEX_V128;
- defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.YMM,
_.info256, "{y}", broadcast>, EVEX_V256;
}
}
-// FIXME: Is there a better scheduler itinerary for VFPCLASS?
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
- bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
+ bits<8> opcScalar, SDNode VecOpNode,
+ SDNode ScalarOpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
- VecOpNode, SSE_ALU_F32P, prd, "{l}">,
+ VecOpNode, sched, prd, "{l}">,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
- VecOpNode, SSE_ALU_F64P, prd, "{q}">,
+ VecOpNode, sched, prd, "{q}">,
EVEX_CD8<64, CD8VF> , VEX_W;
- defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- SSE_ALU_F32S, f32x_info, prd>,
- EVEX_CD8<32, CD8VT1>;
- defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- SSE_ALU_F64S, f64x_info, prd>,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ sched.Scl, f32x_info, prd>,
+ EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ sched.Scl, f64x_info, prd>,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
}
defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
- X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX;
+ X86Vfpclasss, SchedWriteFCmp, HasDQI>,
+ AVX512AIi8Base, EVEX;
//-----------------------------------------------------------------
// Mask register copy, including
@@ -2621,16 +2813,18 @@ defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vvt, X86MemOperand x86memop> {
- let hasSideEffects = 0, SchedRW = [WriteMove] in
+ let isMoveReg = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- IIC_SSE_MOVDQ>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (vvt (load addr:$src)))], IIC_SSE_MOVDQ>;
+ [(set KRC:$dst, (vvt (load addr:$src)))]>,
+ Sched<[WriteLoad]>;
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store KRC:$src, addr:$dst)], IIC_SSE_MOVDQ>;
+ [(store KRC:$src, addr:$dst)]>,
+ Sched<[WriteStore]>;
}
multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
@@ -2638,11 +2832,11 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
RegisterClass KRC, RegisterClass GRC> {
let hasSideEffects = 0 in {
def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
}
}
@@ -2684,8 +2878,6 @@ def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
(COPY_TO_REGCLASS VK16:$src, GR32)>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>;
-def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
(COPY_TO_REGCLASS VK8:$src, GR32)>;
@@ -2701,47 +2893,18 @@ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
// Load/store kreg
let Predicates = [HasDQI] in {
- def : Pat<(store VK4:$src, addr:$dst),
- (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
- def : Pat<(store VK2:$src, addr:$dst),
- (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
def : Pat<(store VK1:$src, addr:$dst),
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+ def : Pat<(v1i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
def : Pat<(v2i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
def : Pat<(v4i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
}
-let Predicates = [HasAVX512, NoDQI] in {
- def : Pat<(store VK1:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK2:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK4:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK8:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
- sub_8bit)))>;
-
- def : Pat<(v8i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
- def : Pat<(v2i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
- def : Pat<(v4i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
-}
let Predicates = [HasAVX512] in {
- def : Pat<(v1i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK1)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
}
@@ -2751,9 +2914,6 @@ let Predicates = [HasAVX512] in {
def : Pat<(maskVT (scalar_to_vector GR32:$src)),
(COPY_TO_REGCLASS GR32:$src, maskRC)>;
- def : Pat<(i32 (X86kextract maskRC:$src, (iPTR 0))),
- (COPY_TO_REGCLASS maskRC:$src, GR32)>;
-
def : Pat<(maskVT (scalar_to_vector GR8:$src)),
(COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
}
@@ -2766,46 +2926,41 @@ let Predicates = [HasAVX512] in {
defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
- def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))), VK1)>;
- def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)),
(COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))), VK16)>;
- def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))), VK8)>;
-
+ (KMOVWkr (AND32ri8
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit),
+ (i32 1))), VK16)>;
}
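// In the pattern above, the AND32ri8 with immediate 1 isolates bit 0 of the
// GR8 source before the KMOVWkr, so all remaining bits of the resulting VK16
// mask are known zero - exactly what inserting a v1i1 into immAllZerosV
// requires.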
// Mask unary operation
// - KNOT
multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
- OpndItins itins, Predicate prd> {
+ X86FoldableSchedWrite sched, Predicate prd> {
let Predicates = [prd] in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (OpNode KRC:$src))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set KRC:$dst, (OpNode KRC:$src))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, OpndItins itins> {
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched> {
defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- itins, HasDQI>, VEX, PD;
+ sched, HasDQI>, VEX, PD;
defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- itins, HasAVX512>, VEX, PS;
+ sched, HasAVX512>, VEX, PS;
defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- itins, HasBWI>, VEX, PD, VEX_W;
+ sched, HasBWI>, VEX, PD, VEX_W;
defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- itins, HasBWI>, VEX, PS, VEX_W;
+ sched, HasBWI>, VEX, PS, VEX_W;
}
-defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SSE_BIT_ITINS_P>;
+// TODO - do we need an X86SchedWriteWidths::KMASK type?
+defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SchedWriteVecLogic.XMM>;
// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
let Predicates = [HasAVX512, NoDQI] in
@@ -2821,26 +2976,28 @@ def : Pat<(vnot VK2:$src),
// - KAND, KANDN, KOR, KXNOR, KXOR
multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
- OpndItins itins, Predicate prd, bit IsCommutable> {
+ X86FoldableSchedWrite sched, Predicate prd,
+ bit IsCommutable> {
let Predicates = [prd], isCommutable = IsCommutable in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, OpndItins itins,
- bit IsCommutable, Predicate prdW = HasAVX512> {
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched, bit IsCommutable,
+ Predicate prdW = HasAVX512> {
defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- itins, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+ sched, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- itins, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+ sched, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
+ sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
+ sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
}
def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
@@ -2849,12 +3006,13 @@ def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
-defm KAND : avx512_mask_binop_all<0x41, "kand", and, SSE_BIT_ITINS_P, 1>;
-defm KOR : avx512_mask_binop_all<0x45, "kor", or, SSE_BIT_ITINS_P, 1>;
-defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SSE_BIT_ITINS_P, 1>;
-defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SSE_BIT_ITINS_P, 1>;
-defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SSE_BIT_ITINS_P, 0>;
-defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, SSE_BIT_ITINS_P, 1, HasDQI>;
+// TODO - do we need an X86SchedWriteWidths::KMASK type?
+defm KAND : avx512_mask_binop_all<0x41, "kand", and, SchedWriteVecLogic.XMM, 1>;
+defm KOR : avx512_mask_binop_all<0x45, "kor", or, SchedWriteVecLogic.XMM, 1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SchedWriteVecLogic.XMM, 1>;
+defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XMM, 1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>;
+defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>;
multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
Instruction Inst> {
@@ -2889,13 +3047,14 @@ defm : avx512_binop_pat<xor, xor, KXORWrr>;
// Mask unpacking
multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
- RegisterClass KRCSrc, OpndItins itins, Predicate prd> {
+ RegisterClass KRCSrc, X86FoldableSchedWrite sched,
+ Predicate prd> {
let Predicates = [prd] in {
let hasSideEffects = 0 in
def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
(ins KRC:$src1, KRC:$src2),
- "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- itins.rr>, VEX_4V, VEX_L, Sched<[itins.Sched]>;
+ "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_L, Sched<[sched]>;
def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
(!cast<Instruction>(NAME##rr)
@@ -2904,104 +3063,199 @@ multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
}
}
-defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, SSE_UNPCK, HasAVX512>, PD;
-defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, SSE_UNPCK, HasBWI>, PS;
-defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, SSE_UNPCK, HasBWI>, PS, VEX_W;
+defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, WriteShuffle, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, WriteShuffle, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, WriteShuffle, HasBWI>, PS, VEX_W;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
- SDNode OpNode, OpndItins itins, Predicate prd> {
+ SDNode OpNode, X86FoldableSchedWrite sched,
+ Predicate prd> {
let Predicates = [prd], Defs = [EFLAGS] in
def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prdW = HasAVX512> {
- defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, itins, HasDQI>,
+ X86FoldableSchedWrite sched,
+ Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, sched, HasDQI>,
VEX, PD;
- defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, itins, prdW>,
+ defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, sched, prdW>,
VEX, PS;
- defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, itins, HasBWI>,
+ defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, sched, HasBWI>,
VEX, PS, VEX_W;
- defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, itins, HasBWI>,
+ defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, sched, HasBWI>,
VEX, PD, VEX_W;
}
-defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SSE_PTEST>;
-defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SSE_PTEST, HasDQI>;
+// TODO - do we need an X86SchedWriteWidths::KMASK type?
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SchedWriteVecLogic.XMM>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SchedWriteVecLogic.XMM, HasDQI>;
// Mask shift
multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let Predicates = [HasAVX512] in
def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
!strconcat(OpcodeStr,
"\t{$imm, $src, $dst|$dst, $src, $imm}"),
- [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>,
+ Sched<[sched]>;
}
multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- itins>, VEX, TAPD, VEX_W;
+ sched>, VEX, TAPD, VEX_W;
let Predicates = [HasDQI] in
defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- itins>, VEX, TAPD;
+ sched>, VEX, TAPD;
let Predicates = [HasBWI] in {
defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- itins>, VEX, TAPD, VEX_W;
+ sched>, VEX, TAPD, VEX_W;
defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- itins>, VEX, TAPD;
+ sched>, VEX, TAPD;
}
}
-defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, SSE_PSHUF>;
-defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, SSE_PSHUF>;
-
-multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> {
-def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrr)
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
-def : Pat<(v8i1 (and VK8:$mask,
- (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))),
+// Patterns for comparing 128/256-bit integer vectors using a 512-bit instruction.
+multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+ def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrr")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
+ Narrow.KRC)>;
+
+ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Frag (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2)))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrrk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
+ Narrow.KRC)>;
+}
+
+// Patterns for comparing 128/256-bit integer vectors using a 512-bit instruction.
+multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag,
+ string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), cond)),
(COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr##Zrrk)
- (COPY_TO_REGCLASS VK8:$mask, VK16),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
- VK8)>;
-}
-
-multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
- AVX512VLVectorVTInfo _> {
-def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrri)
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
-
-def : Pat<(v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1),
- (_.info256.VT VR256X:$src2), imm:$cc))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
- (COPY_TO_REGCLASS VK8:$mask, VK16),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
+ (!cast<Instruction>(InstStr##Zrri)
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ (Frag.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+// Same as above, but for fp types which don't use PatFrags.
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr##Zrri)
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ imm:$cc), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (OpNode (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), imm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ imm:$cc), Narrow.KRC)>;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD">;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD">;
+ // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+ // increase the pattern complexity the way an immediate would.
+ let AddedComplexity = 2 in {
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", avx512vl_f32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", avx512vl_i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", avx512vl_i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>;
+ }
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v8i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v4i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v4i64x_info, v8i64_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v2i64x_info, v8i64_info>;
+
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>;
+}
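// Worked example of the widening trick used above (a sketch, not additional
// patterns): with AVX512F but no VLX, a compare such as
//   (v4i1 (X86pcmpgtm (v4i64 VR256X:$a), (v4i64 VR256X:$b)))
// is selected by INSERT_SUBREGing both YMM sources into undef ZMM registers,
// issuing the 512-bit VPCMPGTQZrr, and copying the VK8 result down to VK4.
// The upper ZMM lanes contain undefined data, but that is harmless: only the
// low four bits of the mask are observable through the narrow VK4 class.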
+
+let Predicates = [HasBWI, NoVLX] in {
+ // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+ // increase the pattern complexity the way an immediate would.
+ let AddedComplexity = 2 in {
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>;
+
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>;
+ }
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v32i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v16i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v16i16x_info, v32i16_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v8i16x_info, v32i16_info>;
}
// Mask setting all 0s or 1s
@@ -3074,23 +3328,25 @@ defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
// AVX-512 - Aligned and unaligned load and store
//
-
-multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload,
+ X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
bit NoRMPattern = 0,
SDPatternOperator SelectOprr = vselect> {
let hasSideEffects = 0 in {
+ let isMoveReg = 1 in
def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- _.ExeDomain, itins.rr>, EVEX, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX, Sched<[Sched.RR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
[(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src),
- _.ImmAllZerosV)))], _.ExeDomain,
- itins.rr>, EVEX, EVEX_KZ, Sched<[WriteMove]>;
+ _.ImmAllZerosV)))], _.ExeDomain>,
+ EVEX, EVEX_KZ, Sched<[Sched.RR]>;
let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
@@ -3098,7 +3354,8 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
!if(NoRMPattern, [],
[(set _.RC:$dst,
(_.VT (bitconvert (ld_frag addr:$src))))]),
- _.ExeDomain, itins.rm>, EVEX, Sched<[WriteLoad]>;
+ _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
@@ -3107,8 +3364,8 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
"${dst} {${mask}}, $src1}"),
[(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src1),
- (_.VT _.RC:$src0))))], _.ExeDomain,
- itins.rr>, EVEX, EVEX_K, Sched<[WriteMove]>;
+ (_.VT _.RC:$src0))))], _.ExeDomain>,
+ EVEX, EVEX_K, Sched<[Sched.RR]>;
def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -3116,8 +3373,8 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
[(set _.RC:$dst, (_.VT
(vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src1))),
- (_.VT _.RC:$src0))))], _.ExeDomain, itins.rm>,
- EVEX, EVEX_K, Sched<[WriteLoad]>;
+ (_.VT _.RC:$src0))))], _.ExeDomain>,
+ EVEX, EVEX_K, Sched<[Sched.RM]>;
}
def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src),
@@ -3125,77 +3382,83 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
"${dst} {${mask}} {z}, $src}",
[(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
- _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
+ _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
}
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
- (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0,
+ (!cast<Instruction>(Name#_.ZSuffix##rmk) _.RC:$src0,
_.KRCWM:$mask, addr:$ptr)>;
}
multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _,
- Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoRMPattern = 0> {
let Predicates = [prd] in
- defm Z : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info512,
- _.info512.AlignedLdFrag, masked_load_aligned512>,
- EVEX_V512;
+ defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512,
+ _.info512.AlignedLdFrag, masked_load_aligned512,
+ Sched.ZMM, "", NoRMPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info256,
- _.info256.AlignedLdFrag, masked_load_aligned256>,
- EVEX_V256;
- defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info128,
- _.info128.AlignedLdFrag, masked_load_aligned128>,
- EVEX_V128;
+ defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256,
+ _.info256.AlignedLdFrag, masked_load_aligned256,
+ Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128,
+ _.info128.AlignedLdFrag, masked_load_aligned128,
+ Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128;
}
}
multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _,
- Predicate prd,
- bit NoRMPattern = 0,
- SDPatternOperator SelectOprr = vselect> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoRMPattern = 0,
+ SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
- defm Z : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info512, _.info512.LdFrag,
- masked_load_unaligned, NoRMPattern,
- SelectOprr>, EVEX_V512;
+ defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag,
+ masked_load_unaligned, Sched.ZMM, "",
+ NoRMPattern, SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info256, _.info256.LdFrag,
- masked_load_unaligned, NoRMPattern,
- SelectOprr>, EVEX_V256;
- defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info128, _.info128.LdFrag,
- masked_load_unaligned, NoRMPattern,
- SelectOprr>, EVEX_V128;
+ defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag,
+ masked_load_unaligned, Sched.YMM, EVEX2VEXOvrd#"Y",
+ NoRMPattern, SelectOprr>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag,
+ masked_load_unaligned, Sched.XMM, EVEX2VEXOvrd,
+ NoRMPattern, SelectOprr>, EVEX_V128;
}
}
-multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,
X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore,
- string Name, bit NoMRPattern = 0> {
- let hasSideEffects = 0 in {
+ X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
+ bit NoMRPattern = 0> {
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ let isMoveReg = 1 in
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
- OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
- [], _.ExeDomain, itins.rr>, EVEX, FoldGenData<Name#rr>,
- Sched<[WriteMove]>;
+ OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ [], _.ExeDomain>, EVEX,
+ FoldGenData<BaseName#_.ZSuffix#rr>, Sched<[Sched.RR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rr_REV">;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
- OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
+ OpcodeStr # "\t{$src, ${dst} {${mask}}|"#
"${dst} {${mask}}, $src}",
- [], _.ExeDomain, itins.rr>, EVEX, EVEX_K,
- FoldGenData<Name#rrk>, Sched<[WriteMove]>;
+ [], _.ExeDomain>, EVEX, EVEX_K,
+ FoldGenData<BaseName#_.ZSuffix#rrk>,
+ Sched<[Sched.RR]>;
def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
- OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
+ OpcodeStr # "\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
- [], _.ExeDomain, itins.rr>, EVEX, EVEX_KZ,
- FoldGenData<Name#rrkz>, Sched<[WriteMove]>;
+ [], _.ExeDomain>, EVEX, EVEX_KZ,
+ FoldGenData<BaseName#_.ZSuffix#rrkz>,
+ Sched<[Sched.RR]>;
}
let hasSideEffects = 0, mayStore = 1 in
@@ -3203,132 +3466,154 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
!if(NoMRPattern, [],
[(st_frag (_.VT _.RC:$src), addr:$dst)]),
- _.ExeDomain, itins.mr>, EVEX, Sched<[WriteStore]>;
+ _.ExeDomain>, EVEX, Sched<[Sched.MR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"mr">;
def mrk : AVX512PI<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
- [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[WriteStore]>;
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.MR]>,
+ NotMemoryFoldable;
def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
- (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
- _.KRCWM:$mask, _.RC:$src)>;
-}
+ (!cast<Instruction>(BaseName#_.ZSuffix#mrk) addr:$ptr,
+ _.KRCWM:$mask, _.RC:$src)>;
+ def : InstAlias<OpcodeStr#".s\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rr_REV")
+ _.RC:$dst, _.RC:$src), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rrk_REV")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rrkz_REV")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>;
+}
multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
- string Name, bit NoMRPattern = 0> {
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
- defm Z : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info512, store,
- masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512;
-
+ defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store,
+ masked_store_unaligned, Sched.ZMM, "",
+ NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info256, store,
- masked_store_unaligned, Name#Z256,
- NoMRPattern>, EVEX_V256;
- defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info128, store,
- masked_store_unaligned, Name#Z128,
+ defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store,
+ masked_store_unaligned, Sched.YMM,
+ EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store,
+ masked_store_unaligned, Sched.XMM, EVEX2VEXOvrd,
NoMRPattern>, EVEX_V128;
}
}
multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd,
- string Name> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
- defm Z : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info512, alignedstore,
- masked_store_aligned512, Name#Z>, EVEX_V512;
+ defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore,
+ masked_store_aligned512, Sched.ZMM, "",
+ NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info256, alignedstore,
- masked_store_aligned256, Name#Z256>, EVEX_V256;
- defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info128, alignedstore,
- masked_store_aligned128, Name#Z128>, EVEX_V128;
+ defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore,
+ masked_store_aligned256, Sched.YMM,
+ EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore,
+ masked_store_aligned128, Sched.XMM, EVEX2VEXOvrd,
+ NoMRPattern>, EVEX_V128;
}
}
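// A note on the EVEX2VEXOvrd string threaded through the load/store
// multiclasses (an interpretation based on the overrides passed below): it
// names the VEX-encoded base instruction that the EVEX-to-VEX compression
// pass may rewrite the 128-bit form to, with "Y" appended for the 256-bit
// form; the 512-bit variants pass "" since they have no VEX equivalent.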
defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
- HasAVX512>,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
- HasAVX512, "VMOVAPS">,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
PS, EVEX_CD8<32, CD8VF>;
defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
- HasAVX512>,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
- HasAVX512, "VMOVAPD">,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
- 0, null_frag>,
+ SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
- "VMOVUPS">,
- PS, EVEX_CD8<32, CD8VF>;
+ SchedWriteFMoveLS, "VMOVUPS">,
+ PS, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
- 0, null_frag>,
+ SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
- "VMOVUPD">,
+ SchedWriteFMoveLS, "VMOVUPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512>,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA", 1>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512, "VMOVDQA32">,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA", 1>,
PD, EVEX_CD8<32, CD8VF>;
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512>,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA">,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512, "VMOVDQA64">,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,
- avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
- HasBWI, "VMOVDQU8", 1>,
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
XD, EVEX_CD8<8, CD8VF>;
-defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,
- avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
- HasBWI, "VMOVDQU16", 1>,
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
XD, VEX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
- 0, null_frag>,
- avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
- HasAVX512, "VMOVDQU32">,
+ SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>,
+ avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
- 0, null_frag>,
- avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
- HasAVX512, "VMOVDQU64">,
+ SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>,
+ avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU">,
XS, VEX_W, EVEX_CD8<64, CD8VF>;
// Special instructions to help with spilling when we don't have VLX. We need
// to load or store from a ZMM register instead. These are converted in
// expandPostRAPseudos.
let isReMaterializable = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
+ isPseudo = 1, mayLoad = 1, hasSideEffects = 0 in {
def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadX]>;
def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadY]>;
def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadX]>;
def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
- "", [], IIC_SSE_MOVA_P_RM>;
+ "", []>, Sched<[WriteFLoadY]>;
}
-let isPseudo = 1, SchedRW = [WriteStore], mayStore = 1, hasSideEffects = 0 in {
+let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreX]>;
def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreY]>;
def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreX]>;
def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
- "", [], IIC_SSE_MOVA_P_MR>;
+ "", []>, Sched<[WriteFStoreY]>;
}
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
@@ -3376,42 +3661,69 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
// available. Use a 512-bit operation and extract.
let Predicates = [HasAVX512, NoVLX] in {
+ defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
+ defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
defm : mask_move_lowering<"VMOVDQA32Z", v8i32x_info, v16i32_info>;
+
+ defm : mask_move_lowering<"VMOVAPDZ", v2f64x_info, v8f64_info>;
+ defm : mask_move_lowering<"VMOVDQA64Z", v2i64x_info, v8i64_info>;
+ defm : mask_move_lowering<"VMOVAPDZ", v4f64x_info, v8f64_info>;
+ defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+ defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
+ defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
+
+ defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>;
+ defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>;
}
let Predicates = [HasAVX512] in {
// 512-bit store.
+ def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst),
- (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst),
- (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v32i16 VR512:$src), addr:$dst),
- (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v64i8 VR512:$src), addr:$dst),
- (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
}
let Predicates = [HasVLX] in {
// 128-bit store.
+ def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v4i32 VR128X:$src), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
// 256-bit store.
+ def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst),
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst),
- (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst),
- (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v8i32 VR256X:$src), addr:$dst),
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
- (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
- (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
@@ -3423,7 +3735,7 @@ multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
To.RC:$src0)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrk")
Cast.RC:$src0, Cast.KRCWM:$mask,
- (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>;
+ (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
(bitconvert
@@ -3432,7 +3744,7 @@ multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
Cast.ImmAllZerosV)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
Cast.KRCWM:$mask,
- (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>;
+ (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
}
@@ -3489,40 +3801,40 @@ let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- EVEX, Sched<[WriteMove]>;
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ EVEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteLoad]>;
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>,
- EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteLoad]>;
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in {
def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set FR64X:$dst, (bitconvert GR64:$src))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+ [(set FR64X:$dst, (bitconvert GR64:$src))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>;
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64X:$src))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+ [(set GR64:$dst, (bitconvert FR64X:$src))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+ [(store (i64 (bitconvert FR64X:$src)), addr:$dst)]>,
+ EVEX, VEX_W, Sched<[WriteVecStore]>,
EVEX_CD8<64, CD8VT1>;
}
} // ExeDomain = SSEPackedInt
@@ -3532,13 +3844,13 @@ def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, EVEX, Sched<[WriteMove]>;
+ [(set FR32X:$dst, (bitconvert GR32:$src))]>,
+ EVEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteLoad]>;
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move doubleword from xmm register to r/m32
@@ -3547,14 +3859,14 @@ let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
- (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
- EVEX, Sched<[WriteMove]>;
+ (iPTR 0)))]>,
+ EVEX, Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (extractelt (v4i32 VR128X:$src),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>;
+ (iPTR 0))), addr:$dst)]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
// Move quadword from xmm1 register to r/m64
@@ -3563,44 +3875,47 @@ let ExeDomain = SSEPackedInt in {
def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
- (iPTR 0)))],
- IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteMove]>,
- Requires<[HasAVX512, In64BitMode]>;
+ (iPTR 0)))]>,
+ PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>,
+ Requires<[HasAVX512]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteStore]>,
+ "vmovq\t{$src, $dst|$dst, $src}", []>, PD,
+ EVEX, VEX_W, Sched<[WriteVecStore]>,
Requires<[HasAVX512, In64BitMode]>;
def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
(ins i64mem:$dst, VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
- addr:$dst)], IIC_SSE_MOVDQ>,
+ addr:$dst)]>,
EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
- Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+ Sched<[WriteVecStore]>, Requires<[HasAVX512]>;
-let hasSideEffects = 0 in
+let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src),
- "vmovq.s\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>,
- EVEX, VEX_W, Sched<[WriteMove]>;
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, Sched<[SchedWriteVecLogic.XMM]>;
} // ExeDomain = SSEPackedInt
+def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
+
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32X:$src))],
- IIC_SSE_MOVD_ToGP>, EVEX, Sched<[WriteMove]>;
+ [(set GR32:$dst, (bitconvert FR32X:$src))]>,
+ EVEX, Sched<[WriteVecMoveToGpr]>;
def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>;
+ [(store (i32 (bitconvert FR32X:$src)), addr:$dst)]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move Quadword Int to Packed Quadword Int
@@ -3611,20 +3926,27 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>;
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt
+// Allow "vmovd" but print "vmovq".
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOV64toPQIZrr VR128X:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOVPQIto64Zrr GR64:$dst, VR128X:$src), 0>;
+
//===----------------------------------------------------------------------===//
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode,
X86VectorVTInfo _> {
+ let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -3632,7 +3954,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
_.ImmAllZerosV)))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX_4V, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -3641,34 +3963,35 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT _.RC:$src0))))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K, Sched<[WriteMove]>;
+ _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
- _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, Sched<[WriteLoad]>;
+ _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
let mayLoad = 1, hasSideEffects = 0 in {
let Constraints = "$src0 = $dst" in
def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|",
"$dst {${mask}}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K, Sched<[WriteLoad]>;
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFLoad]>;
def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
+ [], _.ExeDomain>, EVEX, EVEX_KZ, Sched<[WriteFLoad]>;
}
def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
- EVEX, Sched<[WriteStore]>;
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain>,
+ EVEX, Sched<[WriteFStore]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk: AVX512PI<0x11, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K, Sched<[WriteStore]>;
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
+ NotMemoryFoldable;
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
@@ -3683,24 +4006,24 @@ multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
+ (_.EltVT (X86selects VK1WM:$mask,
(_.EltVT _.FRC:$src1),
(_.EltVT _.FRC:$src2))))))),
(!cast<Instruction>(InstrStr#rrk)
- (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
- (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, _.RC)),
+ VK1WM:$mask,
(_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>;
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
+ (_.EltVT (X86selects VK1WM:$mask,
(_.EltVT _.FRC:$src1),
(_.EltVT ZeroFP))))))),
(!cast<Instruction>(InstrStr#rrkz)
- (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ VK1WM:$mask,
(_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>;
}
multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -3708,9 +4031,7 @@ multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT _.info128.RC:$src),
- (iPTR 0))),
+ (_.info128.VT _.info128.RC:$src),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
@@ -3725,14 +4046,37 @@ multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT _.info128.RC:$src),
- (iPTR 0))),
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0)))),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
+// This matches the more recent codegen from clang that avoids emitting a
+// 512-bit masked store directly. Codegen will widen a 128-bit masked store
+// to 512 bits on AVX512F-only targets.
+multiclass avx512_store_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+// AVX512F pattern.
+def : Pat<(masked_store addr:$dst, Mask512,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+// AVX512VL pattern.
+def : Pat<(masked_store addr:$dst, Mask128, (_.info128.VT _.info128.RC:$src)),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
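A minimal illustration of the kind of source these store patterns target (an assumption for context, not part of the patch): recent clang emits the AVX512F scalar masked-store intrinsic as a 128-bit masked store in IR, and on a target without VLX the legalizer widens it to 512 bits, which the "AVX512F pattern" above then selects down to the masked VMOVSS store.

#include <immintrin.h>

/* Assumed to be built with something like: clang -O2 -mavx512f (no -mavx512vl).
 * _mm_mask_store_ss produces a 128-bit llvm.masked.store; without VLX it is
 * widened to a 512-bit masked store and matched as a masked VMOVSS store. */
void store_low_lane(float *dst, __mmask8 k, __m128 v) {
  _mm_mask_store_ss(dst, k, v);
}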
multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -3750,9 +4094,7 @@ def : Pat<(_.info128.VT (extract_subvector
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT (X86vzmovl _.info128.RC:$src)),
- (iPTR 0))),
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
@@ -3778,9 +4120,7 @@ def : Pat<(_.info128.VT (extract_subvector
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (insert_subvector undef,
- (_.info256.VT (insert_subvector undef,
- (_.info128.VT (X86vzmovl _.info128.RC:$src)),
- (iPTR 0))),
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
@@ -3789,6 +4129,48 @@ def : Pat<(_.info128.VT (extract_subvector
}
+// This matches the more recent codegen from clang that avoids emitting a
+// 512-bit masked load directly. Codegen will widen a 128-bit masked load
+// to 512 bits on AVX512F-only targets.
+multiclass avx512_load_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+// AVX512F patterns.
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ (_.info512.VT (bitconvert
+ (v16i32 immAllZerosV))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+// AVX512VL patterns.
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ (_.info128.VT (bitconvert (v4i32 immAllZerosV))))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+}
+
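The load lowering above has the same motivation; a hedged sketch of the corresponding source (again an assumption, not part of the patch): the AVX512F scalar masked-load intrinsic becomes a 128-bit masked load that is widened to 512 bits when VLX is unavailable, and the AVX512F patterns extract the low 128 bits back out.

#include <immintrin.h>

/* Assumed to be built with clang -O2 -mavx512f only. The 128-bit
 * llvm.masked.load from _mm_mask_load_ss is widened to 512 bits and
 * matched by the masked (rmk/rmkz) VMOVSS load patterns above. */
__m128 load_low_lane(__m128 src, __mmask8 k, const float *p) {
  return _mm_mask_load_ss(src, k, p);
}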
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
@@ -3799,6 +4181,31 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
+
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -3806,121 +4213,203 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
-def : Pat<(f32 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))),
- (f32 FR32X:$src1), (f32 FR32X:$src2))),
- (COPY_TO_REGCLASS
- (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
- (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- GR8:$mask, sub_8bit)), VK1WM),
- (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- FR32X)>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
- (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)),
VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
-def : Pat<(f64 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))),
- (f64 FR64X:$src1), (f64 FR64X:$src2))),
- (COPY_TO_REGCLASS
- (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
- (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- GR8:$mask, sub_8bit)), VK1WM),
- (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- FR64X)>;
+def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
- (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
-def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
- (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
- (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
- "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrr">, Sched<[WriteMove]>;
+ "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrr">,
+ Sched<[SchedWriteFShuffle.XMM]>;
-let Constraints = "$src0 = $dst" in
+ let Constraints = "$src0 = $dst" in
def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask,
VR128X:$src1, VR128X:$src2),
- "vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
+ "vmovss\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_K, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrrk">, Sched<[WriteMove]>;
+ []>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrk">,
+ Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
- "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "vmovss\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrrkz">, Sched<[WriteMove]>;
+ []>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrkz">,
+ Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
- "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, XD, EVEX_4V, VEX_LIG, VEX_W,
- FoldGenData<"VMOVSDZrr">, Sched<[WriteMove]>;
+ "vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XD, EVEX_4V, VEX_LIG, VEX_W,
+ FoldGenData<"VMOVSDZrr">,
+ Sched<[SchedWriteFShuffle.XMM]>;
-let Constraints = "$src0 = $dst" in
+ let Constraints = "$src0 = $dst" in
def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask,
VR128X:$src1, VR128X:$src2),
- "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
+ "vmovsd\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_K, XD, EVEX_4V, VEX_LIG,
- VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteMove]>;
+ []>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrk">,
+ Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.KRCWM:$mask, VR128X:$src1,
VR128X:$src2),
- "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
- VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteMove]>;
-}
+ []>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrkz">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+}
+
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSSZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ (VMOVSSZrrk_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ (VMOVSSZrrkz_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSDZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ (VMOVSDZrrk_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
-let Predicates = [HasAVX512] in {
- let AddedComplexity = 15 in {
+let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
(VMOVSSZrr (v4i32 (AVX512_128_SET0)), VR128X:$src)>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (COPY_TO_REGCLASS FR64X:$src, VR128))>;
- }
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
+
+}
+
+// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
+// VMOVSS/SD. Unfortunately, this loses the ability to use XMM16-31.
+let Predicates = [HasAVX512, OptForSpeed] in {
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
+ (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
+ (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
+ (i8 3))), sub_xmm)>;
+
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
+ (i8 0xf))), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512] in {
- let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
@@ -3930,8 +4419,6 @@ let Predicates = [HasAVX512] in {
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
@@ -3943,7 +4430,7 @@ let Predicates = [HasAVX512] in {
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
@@ -3959,7 +4446,7 @@ let Predicates = [HasAVX512] in {
// 512-bit types
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
@@ -3970,164 +4457,127 @@ let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
- }
+
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
-
- // Move low f64 and clear high bits.
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
-
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
-
- // Shuffle with VMOVSS
- def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
- (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
-
- def : Pat<(v4f32 (X86Movss VR128X:$src1, (scalar_to_vector FR32X:$src2))),
- (VMOVSSZrr VR128X:$src1,
- (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
-
- // Shuffle with VMOVSD
- def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
-
- def : Pat<(v2f64 (X86Movsd VR128X:$src1, (scalar_to_vector FR64X:$src2))),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
-
- def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
- def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
}
-let AddedComplexity = 15 in
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v2i64 (X86vzmovl
- (v2i64 VR128X:$src))))],
- IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+ (v2i64 VR128X:$src))))]>,
+ EVEX, VEX_W;
+}
let Predicates = [HasAVX512] in {
- let AddedComplexity = 15 in {
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
- (VMOVDI2PDIZrr GR32:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIZrr GR32:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
- (VMOV64toPQIZrr GR64:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIZrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
+
+ def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
- def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
- }
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
- let AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVQI2PQIZrm addr:$src)>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
- (VMOVZPQILo2PQIZrr VR128X:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVQI2PQIZrm addr:$src)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
- }
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVQI2PQIZrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVQI2PQIZrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
def : Pat<(v16i32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
}
+
//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//
-let SchedRW = [WriteLoad] in {
- def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
- (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
- [], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
- EVEX_CD8<64, CD8VF>;
- let Predicates = [HasVLX] in {
- def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
- (ins i256mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [], SSEPackedInt>, EVEX, T8PD, EVEX_V256,
- EVEX_CD8<64, CD8VF>;
+def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.ZMM.RM]>,
+ EVEX, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>;
- def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
- (ins i128mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [], SSEPackedInt>, EVEX, T8PD, EVEX_V128,
- EVEX_CD8<64, CD8VF>;
- }
+let Predicates = [HasVLX] in {
+ def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
+ (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ EVEX, T8PD, EVEX_V256, EVEX_CD8<64, CD8VF>;
+
+ def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
+ (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.XMM.RM]>,
+ EVEX, T8PD, EVEX_V128, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- PatFrag st_frag = alignednontemporalstore,
- InstrItinClass itin = IIC_SSE_MOVNT> {
- let SchedRW = [WriteStore], AddedComplexity = 400 in
+ X86SchedWriteMoveLS Sched,
+ PatFrag st_frag = alignednontemporalstore> {
+ let SchedRW = [Sched.MR], AddedComplexity = 400 in
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(st_frag (_.VT _.RC:$src), addr:$dst)],
- _.ExeDomain, itin>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
+ _.ExeDomain>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo VTInfo> {
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteMoveLSWidths Sched> {
let Predicates = [HasAVX512] in
- defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512, Sched.ZMM>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256, Sched.YMM>, EVEX_V256;
+ defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128, Sched.XMM>, EVEX_V128;
}
}
-defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info>, PD;
-defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info>, PD, VEX_W;
-defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info>, PS;
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info,
+ SchedWriteVecMoveLSNT>, PD;
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info,
+ SchedWriteFMoveLSNT>, PD, VEX_W;
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info,
+ SchedWriteFMoveLSNT>, PS;
let Predicates = [HasAVX512], AddedComplexity = 400 in {
def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
@@ -4179,131 +4629,135 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
// AVX-512 - Integer arithmetic
//
multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched]>;
+ IsCommutable>, AVX512BIBase, EVEX_4V,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src2)))),
- itins.rm>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_.LdFrag addr:$src2))))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable = 0> :
- avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))),
- itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2))))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, OpndItins itins,
- Predicate prd, bit IsCommutable = 0> {
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
- IsCommutable>, EVEX_V128;
+ defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256,
+ sched.YMM, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128,
+ sched.XMM, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, OpndItins itins,
- Predicate prd, bit IsCommutable = 0> {
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
- IsCommutable>, EVEX_V128;
+ defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
+ sched.YMM, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
+ sched.XMM, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
- itins, prd, IsCommutable>,
- VEX_W, EVEX_CD8<64, CD8VF>;
+ sched, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
- itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+ sched, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
}
multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
- itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>,
- VEX_WIG;
+ sched, prd, IsCommutable>, EVEX_CD8<16, CD8VF>,
+ VEX_WIG;
}
multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
+ X86SchedWriteWidths sched, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
- itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>,
- VEX_WIG;
+ sched, prd, IsCommutable>, EVEX_CD8<8, CD8VF>,
+ VEX_WIG;
}
multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd,
- bit IsCommutable = 0> {
- defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd, bit IsCommutable = 0> {
+ defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, sched, prd,
IsCommutable>;
- defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+ defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, sched, prd,
IsCommutable>;
}
multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd,
- bit IsCommutable = 0> {
- defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd, bit IsCommutable = 0> {
+ defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, sched, prd,
IsCommutable>;
- defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd,
+ defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, sched, prd,
IsCommutable>;
}
multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
bits<8> opc_d, bits<8> opc_q,
string OpcodeStr, SDNode OpNode,
- OpndItins itins, bit IsCommutable = 0> {
+ X86SchedWriteWidths sched,
+ bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
- itins, HasAVX512, IsCommutable>,
+ sched, HasAVX512, IsCommutable>,
avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
- itins, HasBWI, IsCommutable>;
+ sched, HasBWI, IsCommutable>;
}
-multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
SDNode OpNode,X86VectorVTInfo _Src,
X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct,
bit IsCommutable = 0> {
@@ -4313,15 +4767,15 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- itins.rr, IsCommutable>,
- AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>;
+ IsCommutable>,
+ AVX512BIBase, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2)))),
- itins.rm>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
@@ -4330,71 +4784,72 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
"$src1, ${src2}"##_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (X86VBroadcast
- (_Brdct.ScalarLdFrag addr:$src2)))))),
- itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_Brdct.ScalarLdFrag addr:$src2))))))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
- SSE_INTALU_ITINS_P, 1>;
+ SchedWriteVecALU, 1>;
defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
- SSE_INTALU_ITINS_P, 0>;
+ SchedWriteVecALU, 0>;
defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
- SSE_INTALU_ITINS_P, HasBWI, 0>;
+ SchedWriteVecALU, HasBWI, 0>;
defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
- SSE_INTALU_ITINS_P, HasBWI, 0>;
+ SchedWriteVecALU, HasBWI, 0>;
defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
- SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWritePMULLD, HasAVX512, 1>, T8PD;
defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
- SSE_INTMUL_ITINS_P, HasBWI, 1>;
+ SchedWriteVecIMul, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
- SSE_INTMUL_ITINS_P, HasDQI, 1>, T8PD;
-defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTMUL_ITINS_P,
+ SchedWriteVecIMul, HasDQI, 1>, T8PD,
+ NotEVEX2VEXConvertible;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul,
HasBWI, 1>;
-defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P,
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul,
HasBWI, 1>;
-defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P,
- HasBWI, 1>, T8PD;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs,
+ SchedWriteVecIMul, HasBWI, 1>, T8PD;
defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-
-multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq,
+ SchedWriteVecIMul, HasAVX512, 1>, T8PD;
+defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq,
+ SchedWriteVecIMul, HasAVX512, 1>;
+
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _SrcVTInfo,
+ AVX512VLVectorVTInfo _DstVTInfo,
SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
_SrcVTInfo.info512, _DstVTInfo.info512,
v8i64_info, IsCommutable>,
EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
let Predicates = [HasVLX, prd] in {
- defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
_SrcVTInfo.info256, _DstVTInfo.info256,
v4i64x_info, IsCommutable>,
EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
- defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode,
_SrcVTInfo.info128, _DstVTInfo.info128,
v2i64x_info, IsCommutable>,
EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
}
}
-defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTMUL_ITINS_P,
- avx512vl_i32_info, avx512vl_i64_info,
- X86pmuldq, HasAVX512, 1>,T8PD;
-defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
- avx512vl_i32_info, avx512vl_i64_info,
- X86pmuludq, HasAVX512, 1>;
-defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
+defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU,
avx512vl_i8_info, avx512vl_i8_info,
X86multishift, HasVBMI, 0>, T8PD;
multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _Src, X86VectorVTInfo _Dst,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr,
@@ -4402,14 +4857,14 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2)))))),
- itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_Src.ScalarLdFrag addr:$src2))))))>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
SDNode OpNode,X86VectorVTInfo _Src,
- X86VectorVTInfo _Dst, OpndItins itins,
+ X86VectorVTInfo _Dst, X86FoldableSchedWrite sched,
bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
@@ -4417,45 +4872,49 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- itins.rr, IsCommutable>,
- EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[itins.Sched]>;
+ IsCommutable>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2)))), itins.rm>,
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
- v32i16_info, SSE_PACK>,
+ v32i16_info, SchedWriteShuffle.ZMM>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
- v32i16_info, SSE_PACK>, EVEX_V512;
+ v32i16_info, SchedWriteShuffle.ZMM>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
- v16i16x_info, SSE_PACK>,
+ v16i16x_info, SchedWriteShuffle.YMM>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
- v16i16x_info, SSE_PACK>, EVEX_V256;
+ v16i16x_info, SchedWriteShuffle.YMM>,
+ EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
- v8i16x_info, SSE_PACK>,
+ v8i16x_info, SchedWriteShuffle.XMM>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
- v8i16x_info, SSE_PACK>, EVEX_V128;
+ v8i16x_info, SchedWriteShuffle.XMM>,
+ EVEX_V128;
}
}
multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
- defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
- v64i8_info, SSE_PACK>, EVEX_V512, VEX_WIG;
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, v64i8_info,
+ SchedWriteShuffle.ZMM>, EVEX_V512, VEX_WIG;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
- v32i8x_info, SSE_PACK>, EVEX_V256, VEX_WIG;
+ v32i8x_info, SchedWriteShuffle.YMM>,
+ EVEX_V256, VEX_WIG;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
- v16i8x_info, SSE_PACK>, EVEX_V128, VEX_WIG;
+ v16i8x_info, SchedWriteShuffle.XMM>,
+ EVEX_V128, VEX_WIG;
}
}
@@ -4464,12 +4923,15 @@ multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
- _Dst.info512, SSE_PMADD, IsCommutable>, EVEX_V512;
+ _Dst.info512, SchedWriteVecIMul.ZMM,
+ IsCommutable>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
- _Dst.info256, SSE_PMADD, IsCommutable>, EVEX_V256;
+ _Dst.info256, SchedWriteVecIMul.YMM,
+ IsCommutable>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
- _Dst.info128, SSE_PMADD, IsCommutable>, EVEX_V128;
+ _Dst.info128, SchedWriteVecIMul.XMM,
+ IsCommutable>, EVEX_V128;
}
}
@@ -4484,32 +4946,44 @@ defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, VEX_WIG;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SchedWriteVecALU, HasBWI, 1>;
defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
- SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
let Predicates = [HasDQI, NoVLX] in {
@@ -4576,8 +5050,8 @@ let Predicates = [HasAVX512, NoVLX] in {
// be set to null_frag for 32-bit elements.
multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
- bit IsCommutable = 0> {
+ SDNode OpNodeMsk, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, bit IsCommutable = 0> {
let hasSideEffects = 0 in
defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -4586,8 +5060,8 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
(bitconvert (_.VT _.RC:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
_.RC:$src2)))),
- itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched]>;
+ IsCommutable>, AVX512BIBase, EVEX_4V,
+ Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4596,18 +5070,18 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
(_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
(bitconvert (_.LdFrag addr:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src2)))))),
- itins.rm>, AVX512BIBase, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (_.LdFrag addr:$src2))))))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
// OpNodeMsk is the OpNode to use where element size is important. So use
// for all of the broadcast patterns.
multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
+ SDNode OpNodeMsk, X86FoldableSchedWrite sched, X86VectorVTInfo _,
bit IsCommutable = 0> :
- avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, itins, _,
+ avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, sched, _,
IsCommutable> {
defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
@@ -4620,327 +5094,350 @@ multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
(_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
(bitconvert
(_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))))),
- itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2))))))))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SDNode OpNodeMsk, OpndItins itins,
+ SDNode OpNodeMsk, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
bit IsCommutable = 0> {
let Predicates = [HasAVX512] in
- defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ SDNode OpNode, X86SchedWriteWidths sched,
bit IsCommutable = 0> {
- defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, itins,
+ defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, sched,
avx512vl_i64_info, IsCommutable>,
VEX_W, EVEX_CD8<64, CD8VF>;
- defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, itins,
+ defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, sched,
avx512vl_i32_info, IsCommutable>,
EVEX_CD8<32, CD8VF>;
}
-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, SSE_BIT_ITINS_P, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, SSE_BIT_ITINS_P, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, SSE_BIT_ITINS_P, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, SSE_BIT_ITINS_P>;
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+ SchedWriteVecLogic, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+ SchedWriteVecLogic, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+ SchedWriteVecLogic, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+ SchedWriteVecLogic>;
//===----------------------------------------------------------------------===//
// AVX-512 FP arithmetic
//===----------------------------------------------------------------------===//
+
multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode, SDNode VecNode, OpndItins itins,
- bit IsCommutable> {
+ SDNode OpNode, SDNode VecNode,
+ X86FoldableSchedWrite sched, bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1, _.RC:$src2,
- (i32 FROUND_CURRENT))),
- itins.rr>, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT)))>,
+ Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
_.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))),
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT)))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>, Sched<[itins.Sched]> {
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
}
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
+ SDNode VecNode, X86FoldableSchedWrite sched,
+ bit IsCommutable = 0> {
let ExeDomain = _.ExeDomain in
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$rc)), itins.rr, IsCommutable>,
- EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (i32 imm:$rc)), IsCommutable>,
+ EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
- OpndItins itins, bit IsCommutable> {
+ X86FoldableSchedWrite sched, bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (VecNode _.RC:$src1, _.RC:$src2)),
- itins.rr>, Sched<[itins.Sched]>;
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
+ Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2)),
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.ScalarIntMemCPat:$src2))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>, Sched<[itins.Sched]> {
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), itins.rr>, EVEX_B,
- Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>, EVEX_B,
+ Sched<[sched]>;
}
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode,
- SizeItins itins, bit IsCommutable> {
+ SDNode VecNode, X86SchedWriteSizes sched,
+ bit IsCommutable> {
defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
- itins.s, IsCommutable>,
+ sched.PS.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
- itins.s, IsCommutable>,
+ sched.PS.Scl, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
- itins.d, IsCommutable>,
+ sched.PD.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
- itins.d, IsCommutable>,
+ sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode, SDNode SaeNode,
- SizeItins itins, bit IsCommutable> {
+ SDNode VecNode, SDNode SaeNode,
+ X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
- VecNode, SaeNode, itins.s, IsCommutable>,
+ VecNode, SaeNode, sched.PS.Scl, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
- VecNode, SaeNode, itins.d, IsCommutable>,
+ VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, SSE_ALU_ITINS_S, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, SSE_MUL_ITINS_S, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, SSE_ALU_ITINS_S, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, SSE_DIV_ITINS_S, 0>;
-defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
- SSE_ALU_ITINS_S, 0>;
-defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
- SSE_ALU_ITINS_S, 0>;
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds,
+ SchedWriteFAddSizes, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds,
+ SchedWriteFMulSizes, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds,
+ SchedWriteFAddSizes, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds,
+ SchedWriteFDivSizes, 0>;
+defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+ SchedWriteFCmpSizes, 0>;
+defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+ SchedWriteFCmpSizes, 0>;
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
+ X86VectorVTInfo _, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>, Sched<[itins.Sched]> {
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]> {
let isCommutable = 1;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
- SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl>, XS, EVEX_4V,
+ VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
- SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
- EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
+ VEX_LIG, EVEX_CD8<64, CD8VT1>;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
- SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl>, XS, EVEX_4V,
+ VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
- SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
- EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
+ VEX_LIG, EVEX_CD8<64, CD8VT1>;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
- X86VectorVTInfo _, OpndItins itins,
- bit IsCommutable> {
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
+ bit IsCommutable,
+ bit IsKZCommutable = IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr,
- IsCommutable>, EVEX_4V, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0,
+ IsKZCommutable>,
+ EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>,
- EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))),
- itins.rm>, EVEX_4V, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
}
-multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNodeRnd,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc))), itins.rr>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNodeRnd,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC))), itins.rr>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, EVEX_B, Sched<[sched]>;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
- Predicate prd, SizeItins itins,
- bit IsCommutable = 0> {
+ Predicate prd, X86SchedWriteSizes sched,
+ bit IsCommutable = 0,
+ bit IsPD128Commutable = IsCommutable> {
let Predicates = [prd] in {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
- itins.s, IsCommutable>, EVEX_V512, PS,
+ sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
- itins.d, IsCommutable>, EVEX_V512, PD, VEX_W,
+ sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
// Define only if AVX512VL feature is present.
let Predicates = [prd, HasVLX] in {
defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
- itins.s, IsCommutable>, EVEX_V128, PS,
+ sched.PS.XMM, IsCommutable>, EVEX_V128, PS,
EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
- itins.s, IsCommutable>, EVEX_V256, PS,
+ sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
- itins.d, IsCommutable>, EVEX_V128, PD, VEX_W,
+ sched.PD.XMM, IsPD128Commutable,
+ IsCommutable>, EVEX_V128, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
- itins.d, IsCommutable>, EVEX_V256, PD, VEX_W,
+ sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
}
multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
- SizeItins itins> {
- defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>,
- EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
+ v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
- SizeItins itins> {
- defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>,
- EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
+ v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
- SSE_ALU_ITINS_P, 1>,
- avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SSE_ALU_ITINS_P>;
+ SchedWriteFAddSizes, 1>,
+ avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
- SSE_MUL_ITINS_P, 1>,
- avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SSE_MUL_ITINS_P>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>,
- avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SSE_ALU_ITINS_P>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>,
- avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SSE_DIV_ITINS_P>;
+ SchedWriteFMulSizes, 1>,
+ avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>;
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512,
+ SchedWriteFAddSizes>,
+ avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512,
+ SchedWriteFDivSizes>,
+ avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
- SSE_ALU_ITINS_P, 0>,
- avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SSE_ALU_ITINS_P>;
+ SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SchedWriteFCmpSizes>;
defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
- SSE_ALU_ITINS_P, 0>,
- avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SSE_ALU_ITINS_P>;
+ SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SchedWriteFCmpSizes>;
let isCodeGenOnly = 1 in {
defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFCmpSizes, 1>;
defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFCmpSizes, 1>;
}
defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFLogicSizes, 1>;
defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 0>;
+ SchedWriteFLogicSizes, 0>;
defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFLogicSizes, 1>;
defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
- SSE_ALU_ITINS_P, 1>;
+ SchedWriteFLogicSizes, 1>;
// Patterns catch floating point selects with bitcasted integer logic ops.
multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
@@ -5012,370 +5509,444 @@ defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
let Predicates = [HasVLX,HasDQI] in {
// Use packed logical operations for scalar ops.
def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VANDPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VORPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VXORPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS (VANDNPDZ128rr
- (COPY_TO_REGCLASS FR64X:$src1, VR128X),
- (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
+ FR64X)>;
def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VANDPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VORPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VXORPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS (VANDNPSZ128rr
- (COPY_TO_REGCLASS FR32X:$src1, VR128X),
- (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
+ FR32X)>;
}
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))),
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT)),
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT)), itins.rm>,
- EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))), itins.rr>,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ Sched<[sched]>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
- defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>,
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeScal,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>,
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
- defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F32S, f32x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>,
- EVEX_4V,EVEX_CD8<32, CD8VT1>;
- defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F64S, f64x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>,
- EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+ defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, sched.Scl>,
+ EVEX_4V,EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, sched.Scl>,
+ EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v4f32x_info>,
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v8f32x_info>,
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v2f64x_info>,
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v4f64x_info>,
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs>, T8PD;
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs,
+ SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name> {
let ExeDomain = _.ExeDomain in {
let isCommutable = 1 in
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>,
- EVEX_4V, Sched<[itins.Sched]>;
+ (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV)>,
+ EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2)))), itins.rm>,
+ (OpNode (bitconvert
+ (_.i64VT (and _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2))))),
+ _.ImmAllZerosV)>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
+
+ // Patterns for compare with 0 that just use the same source twice.
+ def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
+ (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr")
+ _.RC:$src, _.RC:$src))>;
+
+ def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
+ (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk")
+ _.KRC:$mask, _.RC:$src, _.RC:$src))>;
}
-multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))),
- itins.rm>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode (and _.RC:$src1,
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))),
+ _.ImmAllZerosV)>,
+ EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
// Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
- X86VectorVTInfo _, string Suffix> {
- def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(NAME # Suffix # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-}
-
-multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _,
- string Suffix> {
+multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
+ X86VectorVTInfo _, string Name> {
+ def : Pat<(_.KVT (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV)),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src1, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src2, _.SubRegIdx)),
+ _.KRC))>;
+
+ def : Pat<(_.KVT (and _.KRC:$mask,
+ (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrrk")
+ (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src1, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src2, _.SubRegIdx)),
+ _.KRC)>;
+
+ def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx)),
+ _.KRC))>;
+
+ def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name # "Zrrk")
+ (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx)),
+ _.KRC)>;
+}
+
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_vptest<opc, OpcodeStr, OpNode, sched.ZMM, _.info512, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info256>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode,itins, _.info256>, EVEX_V256;
- defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info128>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
+ defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, sched.YMM, _.info256, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, sched.XMM, _.info128, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
- defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
+ defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>;
+ defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>;
}
}
-multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, itins,
- avx512vl_i32_info, "D">;
- defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, itins,
- avx512vl_i64_info, "Q">, VEX_W;
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ X86SchedWriteWidths sched> {
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, sched,
+ avx512vl_i32_info>;
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, sched,
+ avx512vl_i64_info>, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ PatFrag OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in {
- defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v32i16_info>,
- EVEX_V512, VEX_W;
- defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v64i8_info>,
- EVEX_V512;
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.ZMM,
+ v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.ZMM,
+ v64i8_info, NAME#"B">, EVEX_V512;
}
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v16i16x_info>,
- EVEX_V256, VEX_W;
- defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v8i16x_info>,
- EVEX_V128, VEX_W;
- defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v32i8x_info>,
- EVEX_V256;
- defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v16i8x_info>,
- EVEX_V128;
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.YMM,
+ v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.XMM,
+ v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.YMM,
+ v32i8x_info, NAME#"B">, EVEX_V256;
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.XMM,
+ v16i8x_info, NAME#"B">, EVEX_V128;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
- defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
- defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
- defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
+ defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">;
+ defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">;
+ defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">;
+ defm WZ128_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v8i16x_info, NAME#"W">;
}
}
-multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
- SDNode OpNode, OpndItins itins> :
- avx512_vptest_wb <opc_wb, OpcodeStr, OpNode, itins>,
- avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, itins>;
+// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
+// as commutable here because we already canonicalized all zeros vectors to the
+// RHS during lowering.
+def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
+ (setcc node:$src1, node:$src2, SETEQ)>;
+def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
+ (setcc node:$src1, node:$src2, SETNE)>;
-defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm,
- SSE_BIT_ITINS_P>, T8PD;
-defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm,
- SSE_BIT_ITINS_P>, T8XS;
+multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
+ PatFrag OpNode, X86SchedWriteWidths sched> :
+ avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, sched>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, sched>;
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
+ SchedWriteVecLogic>, T8PD;
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
+ SchedWriteVecLogic>, T8XS;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
+
multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, OpndItins itins,
- X86VectorVTInfo _> {
+ string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
- itins.rr>, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, (i8 imm:$src2)))>,
+ Sched<[sched]>;
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i8 imm:$src2))),
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ (i8 imm:$src2)))>,
+ Sched<[sched.Folded]>;
}
}
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, OpndItins itins,
- X86VectorVTInfo _> {
+ string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
- (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
- itins.rm>, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2)))>,
+ EVEX_B, Sched<[sched.Folded]>;
}
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, ValueType SrcVT, PatFrag bc_frag,
- X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, ValueType SrcVT,
+ PatFrag bc_frag, X86VectorVTInfo _> {
// src2 is always 128-bit
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
- itins.rr>, AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>,
+ AVX512BIBase, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
- itins.rm>, AVX512BIBase,
- EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2))))>,
+ AVX512BIBase,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, ValueType SrcVT, PatFrag bc_frag,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ X86SchedWriteWidths sched, ValueType SrcVT,
+ PatFrag bc_frag, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
- VTInfo.info512>, EVEX_V512,
- EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
+ defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.ZMM, SrcVT,
+ bc_frag, VTInfo.info512>, EVEX_V512,
+ EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
- VTInfo.info256>, EVEX_V256,
- EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
- defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
- VTInfo.info128>, EVEX_V128,
- EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
+ defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.YMM, SrcVT,
+ bc_frag, VTInfo.info256>, EVEX_V256,
+ EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
+ defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.XMM, SrcVT,
+ bc_frag, VTInfo.info128>, EVEX_V128,
+ EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
}
}
multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, itins, v4i32,
+ X86SchedWriteWidths sched,
+ bit NotEVEX2VEXConvertibleQ = 0> {
+ defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32,
bc_v4i32, avx512vl_i32_info, HasAVX512>;
- defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, itins, v2i64,
+ let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
+ defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64,
bc_v2i64, avx512vl_i64_info, HasAVX512>, VEX_W;
- defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, itins, v8i16,
+ defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16,
bc_v2i64, avx512vl_i16_info, HasBWI>;
}
multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
- defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info512>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info512>, EVEX_V512;
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.ZMM, VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.ZMM,
+ VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info256>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info256>, EVEX_V256;
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.YMM, VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.YMM,
+ VTInfo.info256>, EVEX_V256;
defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info128>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
- VTInfo.info128>, EVEX_V128;
+ sched.XMM, VTInfo.info128>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.XMM,
+ VTInfo.info128>, EVEX_V128;
}
}
-multiclass avx512_shift_rmi_w<bits<8> opcw,
- Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in
defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, v32i16_info>, EVEX_V512, VEX_WIG;
+ sched.ZMM, v32i16_info>, EVEX_V512, VEX_WIG;
let Predicates = [HasVLX, HasBWI] in {
defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, v16i16x_info>, EVEX_V256, VEX_WIG;
+ sched.YMM, v16i16x_info>, EVEX_V256, VEX_WIG;
defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, v8i16x_info>, EVEX_V128, VEX_WIG;
+ sched.XMM, v8i16x_info>, EVEX_V128, VEX_WIG;
}
}
multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
- Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, OpndItins itins> {
+ Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ bit NotEVEX2VEXConvertibleQ = 0> {
defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
- itins, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
- itins, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+ sched, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli,
- SSE_INTSHIFT_P>,
+ SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,
- SSE_INTSHIFT_P>,
+ SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai,
- SSE_INTSHIFT_P>,
+ SchedWriteVecShiftImm, 1>,
avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,
- SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
-defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SSE_INTSHIFT_P>;
-defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, SSE_INTSHIFT_P>;
-defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SSE_INTSHIFT_P>;
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl,
+ SchedWriteVecShift>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra,
+ SchedWriteVecShift, 1>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
+ SchedWriteVecShift>;
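
// Note: the X86SchedWriteWidths bundles threaded through these multiclasses
// replace the old per-instruction OpndItins itineraries. A minimal sketch of
// the scheduling classes, assuming the definitions in X86Schedule.td mirror
// the member accesses used in this patch (sched.Scl/XMM/YMM/ZMM and
// sched.Folded), looks like:
//
//   class X86FoldableSchedWrite : SchedWrite {
//     // SchedWrite to use once a load is folded into the instruction,
//     // i.e. what Sched<[sched.Folded, ReadAfterLd]> resolves against.
//     SchedWrite Folded;
//   }
//
//   class X86SchedWriteWidths<X86FoldableSchedWrite sScl,
//                             X86FoldableSchedWrite s128,
//                             X86FoldableSchedWrite s256,
//                             X86FoldableSchedWrite s512> {
//     X86FoldableSchedWrite Scl = sScl; // scalar form (e.g. SchedWriteFMA.Scl)
//     X86FoldableSchedWrite XMM = s128; // 128-bit form (sched.XMM)
//     X86FoldableSchedWrite YMM = s256; // 256-bit form (sched.YMM)
//     X86FoldableSchedWrite ZMM = s512; // 512-bit form (sched.ZMM)
//   }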
// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
@@ -5407,59 +5978,57 @@ let Predicates = [HasAVX512, NoVLX] in {
//===-------------------------------------------------------------------===//
// Variable Bit Shifts
//===-------------------------------------------------------------------===//
+
multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
- itins.rr>, AVX5128IBase, EVEX_4V,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>,
+ AVX5128IBase, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
- (_.VT (bitconvert (_.LdFrag addr:$src2))))),
- itins.rm>, AVX5128IBase, EVEX_4V,
- EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))))>,
+ AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))),
- itins.rm>, AVX5128IBase, EVEX_B,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.ScalarLdFrag addr:$src2)))))>,
+ AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
- defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128;
}
}
multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
- defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, sched,
avx512vl_i32_info>;
- defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, itins,
+ defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, sched,
avx512vl_i64_info>, VEX_W;
}
@@ -5485,30 +6054,30 @@ multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
}
}
multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in
- defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i16_info>,
+ defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v32i16_info>,
EVEX_V512, VEX_W;
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i16x_info>,
+ defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v16i16x_info>,
EVEX_V256, VEX_W;
- defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v8i16x_info>,
+ defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v8i16x_info>,
EVEX_V128, VEX_W;
}
}
-defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SSE_INTSHIFT_P>,
- avx512_var_shift_w<0x12, "vpsllvw", shl, SSE_INTSHIFT_P>;
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x12, "vpsllvw", shl, SchedWriteVarVecShift>;
-defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SSE_INTSHIFT_P>,
- avx512_var_shift_w<0x11, "vpsravw", sra, SSE_INTSHIFT_P>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x11, "vpsravw", sra, SchedWriteVarVecShift>;
-defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SSE_INTSHIFT_P>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl, SSE_INTSHIFT_P>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x10, "vpsrlvw", srl, SchedWriteVarVecShift>;
-defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SSE_INTSHIFT_P>;
-defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SSE_INTSHIFT_P>;
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
@@ -5579,7 +6148,6 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
-
// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
@@ -5685,87 +6253,89 @@ let Predicates = [HasAVX512, NoVLX] in {
//===-------------------------------------------------------------------===//
// 1-src variable permutation VPERMW/D/Q
//===-------------------------------------------------------------------===//
+
multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
- defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info256>, EVEX_V256;
}
multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info512>,
+ sched, VTInfo.info512>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info512>, EVEX_V512;
+ sched, VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info256>,
+ sched, VTInfo.info256>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
- itins, VTInfo.info256>, EVEX_V256;
+ sched, VTInfo.info256>, EVEX_V256;
}
multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
Predicate prd, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> {
let Predicates = [prd] in
- defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>,
EVEX_V512 ;
let Predicates = [HasVLX, prd] in {
- defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>,
EVEX_V256 ;
- defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>,
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info128>,
EVEX_V128 ;
}
}
defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
- AVX2_PERMV_I, avx512vl_i16_info>, VEX_W;
+ WriteVarShuffle256, avx512vl_i16_info>, VEX_W;
defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
- AVX2_PERMV_I, avx512vl_i8_info>;
+ WriteVarShuffle256, avx512vl_i8_info>;
defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
- AVX2_PERMV_I, avx512vl_i32_info>;
+ WriteVarShuffle256, avx512vl_i32_info>;
defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
- AVX2_PERMV_I, avx512vl_i64_info>, VEX_W;
+ WriteVarShuffle256, avx512vl_i64_info>, VEX_W;
defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
- AVX2_PERMV_F, avx512vl_f32_info>;
+ WriteFVarShuffle256, avx512vl_f32_info>;
defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
- AVX2_PERMV_F, avx512vl_f64_info>, VEX_W;
+ WriteFVarShuffle256, avx512vl_f64_info>, VEX_W;
defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
- X86VPermi, AVX2_PERMV_I, avx512vl_i64_info>,
+ X86VPermi, WriteShuffle256, avx512vl_i64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
- X86VPermi, AVX2_PERMV_F, avx512vl_f64_info>,
+ X86VPermi, WriteFShuffle256, avx512vl_f64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+
//===----------------------------------------------------------------------===//
// AVX-512 - VPERMIL
//===----------------------------------------------------------------------===//
multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo Ctrl> {
defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
- (Ctrl.VT Ctrl.RC:$src2))), itins.rr>,
- T8PD, EVEX_4V, Sched<[itins.Sched]>;
+ (Ctrl.VT Ctrl.RC:$src2)))>,
+ T8PD, EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode
_.RC:$src1,
- (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2))))),
- itins.rm>, T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -5773,31 +6343,33 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (X86VBroadcast
- (Ctrl.ScalarLdFrag addr:$src2))))),
- itins.rm>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
- OpndItins itins, AVX512VLVectorVTInfo _,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _,
AVX512VLVectorVTInfo Ctrl> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.ZMM,
_.info512, Ctrl.info512>, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.XMM,
_.info128, Ctrl.info128>, EVEX_V128;
- defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.YMM,
_.info256, Ctrl.info256>, EVEX_V256;
}
}
multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
- defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, AVX_VPERMILV, _, Ctrl>;
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, SchedWriteFVarShuffle,
+ _, Ctrl>;
defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
- X86VPermilpi, AVX_VPERMILV, _>,
+ X86VPermilpi, SchedWriteFShuffle, _>,
EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
@@ -5806,54 +6378,68 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
avx512vl_i32_info>;
let ExeDomain = SSEPackedDouble in
defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
- avx512vl_i64_info>, VEX_W;
+ avx512vl_i64_info>, VEX_W1X;
//===----------------------------------------------------------------------===//
// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
//===----------------------------------------------------------------------===//
defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
- X86PShufd, SSE_PSHUF, avx512vl_i32_info>,
+ X86PShufd, SchedWriteShuffle, avx512vl_i32_info>,
EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
- X86PShufhw, SSE_PSHUF>, EVEX, AVX512XSIi8Base;
+ X86PShufhw, SchedWriteShuffle>,
+ EVEX, AVX512XSIi8Base;
defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
- X86PShuflw, SSE_PSHUF>, EVEX, AVX512XDIi8Base;
+ X86PShuflw, SchedWriteShuffle>,
+ EVEX, AVX512XDIi8Base;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFB
+//===----------------------------------------------------------------------===//
multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in
- defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v64i8_info>, EVEX_V512;
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v64i8_info>,
+ EVEX_V512;
let Predicates = [HasVLX, HasBWI] in {
- defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i8x_info>, EVEX_V256;
- defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i8x_info>, EVEX_V128;
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v32i8x_info>,
+ EVEX_V256;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v16i8x_info>,
+ EVEX_V128;
}
}
-defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb, SSE_PSHUFB>, VEX_WIG;
+defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb,
+ SchedWriteVarShuffle>, VEX_WIG;
//===----------------------------------------------------------------------===//
// Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
+
def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
- IIC_SSE_MOV_LH>, EVEX_4V;
+ [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+let isCommutable = 1 in
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
- IIC_SSE_MOV_LH>, EVEX_4V;
+ [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V, NotMemoryFoldable;
//===----------------------------------------------------------------------===//
// VMOVHPS/PD VMOVLPS Instructions
// All patterns were taken from the SSE implementation.
//===----------------------------------------------------------------------===//
-multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+
+multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let hasSideEffects = 0, mayLoad = 1, ExeDomain = _.ExeDomain in
def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, f64mem:$src2),
!strconcat(OpcodeStr,
@@ -5861,71 +6447,57 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.RC:$dst,
(OpNode _.RC:$src1,
(_.VT (bitconvert
- (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
- IIC_SSE_MOV_LH>, EVEX_4V;
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))]>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>, EVEX_4V;
}
-defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
+// No patterns for MOVLPS/MOVHPS, as the Movlhps node should only be created in
+// SSE1, and the MOVLPS pattern is even more complex.
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl,
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
-defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
+defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", null_frag,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
-defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd,
+defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd,
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
let Predicates = [HasAVX512] in {
- // VMOVHPS patterns
- def : Pat<(X86Movlhps VR128X:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
- (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(X86Movlhps VR128X:$src1,
- (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
- (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
// VMOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
- // VMOVLPS patterns
- def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
- (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
- // VMOVLPD patterns
- def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
- (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movsd VR128X:$src1,
- (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
+let SchedRW = [WriteFStore] in {
def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
(bc_v2f64 (v4f32 VR128X:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128X:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>,
+ (iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+} // SchedRW
let Predicates = [HasAVX512] in {
// VMOVHPD patterns
@@ -5933,77 +6505,75 @@ let Predicates = [HasAVX512] in {
(v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
- // VMOVLPS patterns
- def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
- addr:$src1),
- (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
- // VMOVLPD patterns
- def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
- addr:$src1),
- (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
}
//===----------------------------------------------------------------------===//
// FMA - Fused Multiply Operations
//
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), NoItinerary, 1, 1>,
- AVX512FMA3Base, Sched<[WriteFMA]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
- NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
- _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))),
- NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))),
- NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
- string Suff> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
- avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512,
- Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.ZMM,
+ _.info512, Suff>,
+ avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.YMM,
+ _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.XMM,
+ _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
+ SDNode OpNodeRnd> {
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info, "PS">;
+ SchedWriteFMA, avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info, "PD">, VEX_W;
+ SchedWriteFMA, avx512vl_f64_info, "PD">,
+ VEX_W;
}
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
@@ -6015,19 +6585,20 @@ defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubR
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), NoItinerary, 1, 1,
- vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1,
+ vselect, 1>, AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
- NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6035,34 +6606,39 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1)), NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))),
- NoItinerary, 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
+ 1, 1, vselect, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
- string Suff> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
- avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512,
- Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.ZMM,
+ _.info512, Suff>,
+ avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.YMM,
+ _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.XMM,
+ _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
@@ -6070,9 +6646,10 @@ multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd > {
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info, "PS">;
+ SchedWriteFMA, avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info, "PD">, VEX_W;
+ SchedWriteFMA, avx512vl_f64_info, "PD">,
+ VEX_W;
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
@@ -6083,21 +6660,22 @@ defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddR
defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), NoItinerary,
- 1, 1, vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
  // Pattern is in 312 order so that the load is in a different place from the
  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
- NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
  // Pattern is in 312 order so that the load is in a different place from the
  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -6106,34 +6684,39 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1, _.RC:$src2)), NoItinerary, 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[WriteFMALd, ReadAfterLd]>;
+ _.RC:$src1, _.RC:$src2)), 1, 0>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))),
- NoItinerary, 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
+ 1, 1, vselect, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
- string Suff> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
- avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512,
- Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.ZMM,
+ _.info512, Suff>,
+ avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.YMM,
+ _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.XMM,
+ _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
@@ -6141,9 +6724,10 @@ multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd > {
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info, "PS">;
+ SchedWriteFMA, avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info, "PD">, VEX_W;
+ SchedWriteFMA, avx512vl_f64_info, "PD">,
+ VEX_W;
}
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
@@ -6155,129 +6739,337 @@ defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubR
// Scalar FMA
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb,
- dag RHS_r, dag RHS_m, bit MaskOnlyReg> {
+ dag RHS_r, dag RHS_m, dag RHS_b, bit MaskOnlyReg> {
let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_r, NoItinerary, 1, 1>,
- AVX512FMA3Base, Sched<[WriteFMA]>;
+ "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>;
+ let mayLoad = 1 in
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_m, NoItinerary, 1, 1>,
- AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, ReadAfterLd]>;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
- OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb,
- NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC,
- Sched<[WriteFMA]>;
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
let isCodeGenOnly = 1, isCommutable = 1 in {
def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !if(MaskOnlyReg, [], [RHS_r])>, Sched<[WriteFMA]>;
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>;
def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_m]>, Sched<[WriteFMALd, ReadAfterLd]>;
+ [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, ReadAfterLd]>;
+
+ def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
+ Sched<[SchedWriteFMA.Scl]>;
}// isCodeGenOnly = 1
}// Constraints = "$src1 = $dst"
}
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
- SDNode OpNodeRnds1, SDNode OpNodes3,
- SDNode OpNodeRnds3, X86VectorVTInfo _,
- string SUFF> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd,
+ X86VectorVTInfo _, string SUFF> {
let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
                // Operands for intrinsic are in 123 order to preserve passthru
// semantics.
- (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)),
- (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2,
- _.ScalarIntMemCPat:$src3)),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
- (i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
_.FRC:$src3))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
- (_.ScalarLdFrag addr:$src3)))), 0>;
+ (_.ScalarLdFrag addr:$src3)))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1,
+ _.FRC:$src3, (i32 imm:$rc)))), 0>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
- (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)),
- (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
- _.RC:$src1)),
- (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
- (i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
_.FRC:$src1))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
- (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), 1>;
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src1))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3,
+ _.FRC:$src1, (i32 imm:$rc)))), 1>;
  // One pattern is in 312 order so that the load is in a different place from the
  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
- (null_frag),
- (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
- _.RC:$src2)),
- (null_frag),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
(set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
- _.FRC:$src1, _.FRC:$src2))), 1>;
+ _.FRC:$src1, _.FRC:$src2))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3,
+ _.FRC:$src2, (i32 imm:$rc)))), 1>;
}
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
- SDNode OpNodeRnds1, SDNode OpNodes3,
- SDNode OpNodeRnds3> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
- f32x_info, "SS">,
+ OpNodeRnd, f32x_info, "SS">,
EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
- f64x_info, "SD">,
+ OpNodeRnd, f64x_info, "SD">,
EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1,
- X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1,
- X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
- X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
- X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+
+multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
+ string Suffix, SDNode Move,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ // Patterns with rounding mode.
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 imm:$rc)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 imm:$rc)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 imm:$rc)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 imm:$rc)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 imm:$rc)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 imm:$rc)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ }
+}
+
+defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
+ X86Movss, v4f32x_info, fp32imm0>;
+
+defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD",
+ X86Movsd, v2f64x_info, fp64imm0>;
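
// Illustrative expansion (not part of the patch): with the "SS" instantiation
// above, Prefix="VFMADD", Suffix="SS" and _ = v4f32x_info (so _.FRC = FR32X),
// the first pattern in avx512_scalar_fma_patterns re-selects the existing
// VFMADD213SSZr_Int instruction when a scalar FMA result is blended back into
// a vector through X86Movss:
//
//   (v4f32 (X86Movss VR128X:$src1,
//            (scalar_to_vector (X86Fmadd FR32X:$src2,
//                                        (extractelt VR128X:$src1, (iPTR 0)),
//                                        FR32X:$src3))))
//     -> (VFMADD213SSZr_Int VR128X:$src1,
//                           (COPY_TO_REGCLASS FR32X:$src2, VR128X),
//                           (COPY_TO_REGCLASS FR32X:$src3, VR128X))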
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
//===----------------------------------------------------------------------===//
let Constraints = "$src1 = $dst" in {
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
  // NOTE: The SDNode has the multiply operands first with the add last.
// This enables commuted load patterns to be autogenerated by tablegen.
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), itins.rr, 1, 1>,
- AVX512FMA3Base, Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
- itins.rm>, AVX512FMA3Base, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+ AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6285,48 +7077,50 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
- _.RC:$src1), itins.rm>,
- AVX512FMA3Base, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.RC:$src1)>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
} // Constraints = "$src1 = $dst"
multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo _> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasIFMA] in {
- defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info512>,
+ defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasIFMA] in {
- defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info256>,
+ defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info128>,
+ defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
- SSE_PMADD, avx512vl_i64_info>, VEX_W;
+ SchedWriteVecIMul, avx512vl_i64_info>,
+ VEX_W;
defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
- SSE_PMADD, avx512vl_i64_info>, VEX_W;
+ SchedWriteVecIMul, avx512vl_i64_info>,
+ VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from sign integer to float/double
//===----------------------------------------------------------------------===//
-multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins,
+multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched,
RegisterClass SrcRC, X86VectorVTInfo DstVT,
X86MemOperand x86memop, PatFrag ld_frag, string asm> {
let hasSideEffects = 0 in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
- itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
- itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
let isCodeGenOnly = 1 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
@@ -6335,8 +7129,8 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 FROUND_CURRENT)))], itins.rr>,
- EVEX_4V, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT)))]>,
+ EVEX_4V, Sched<[sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, x86memop:$src2),
@@ -6344,13 +7138,14 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
(ld_frag addr:$src2),
- (i32 FROUND_CURRENT)))], itins.rm>,
- EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT)))]>,
+ EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
}//isCodeGenOnly = 1
}
-multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, OpndItins itins,
- RegisterClass SrcRC, X86VectorVTInfo DstVT, string asm> {
+multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
+ X86FoldableSchedWrite sched, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, string asm> {
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
!strconcat(asm,
@@ -6358,36 +7153,37 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, OpndItins itins,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 imm:$rc)))], itins.rr>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (i32 imm:$rc)))]>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, OpndItins itins,
- RegisterClass SrcRC, X86VectorVTInfo DstVT,
- X86MemOperand x86memop, PatFrag ld_frag, string asm> {
- defm NAME : avx512_vcvtsi_round<opc, OpNode, itins, SrcRC, DstVT, asm>,
- avx512_vcvtsi<opc, OpNode, itins, SrcRC, DstVT, x86memop,
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ RegisterClass SrcRC, X86VectorVTInfo DstVT,
+ X86MemOperand x86memop, PatFrag ld_frag, string asm> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNode, sched, SrcRC, DstVT, asm>,
+ avx512_vcvtsi<opc, OpNode, sched, SrcRC, DstVT, x86memop,
ld_frag, asm>, VEX_LIG;
}
let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR32,
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR64,
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR32,
+defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR32,
v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
XD, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR64,
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -6407,23 +7203,23 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
-defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR32,
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32,
"cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR64,
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR32, v2f64x_info,
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR32, v2f64x_info,
i32mem, loadi32, "cvtusi2sd{l}">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR64,
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+ (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -6450,50 +7246,70 @@ def : Pat<(f64 (uint_to_fp GR64:$src)),
multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
X86VectorVTInfo DstVT, SDNode OpNode,
- OpndItins itins, string asm> {
+ X86FoldableSchedWrite sched, string asm,
+ string aliasStr,
+ bit CodeGenOnly = 1> {
let Predicates = [HasAVX512] in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))],
- itins.rr>, EVEX, VEX_LIG, Sched<[itins.Sched]>;
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched]>;
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
- [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))],
- itins.rr>, EVEX, VEX_LIG, EVEX_B, EVEX_RC,
- Sched<[itins.Sched]>;
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
+ EVEX, VEX_LIG, EVEX_B, EVEX_RC,
+ Sched<[sched]>;
+ let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src),
- (i32 FROUND_CURRENT)))], itins.rm>,
- EVEX, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>;
+
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}",
+ (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">;
+ } // Predicates = [HasAVX512]
+}
+
+multiclass avx512_cvt_s_int_round_aliases<bits<8> opc, X86VectorVTInfo SrcVT,
+ X86VectorVTInfo DstVT, SDNode OpNode,
+ X86FoldableSchedWrite sched, string asm,
+ string aliasStr> :
+ avx512_cvt_s_int_round<opc, SrcVT, DstVT, OpNode, sched, asm, aliasStr, 0> {
+ let Predicates = [HasAVX512] in {
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst,
+ SrcVT.IntScalarMemOp:$src), 0, "att">;
} // Predicates = [HasAVX512]
}
// Convert float/double to signed/unsigned int 32/64
defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
- X86cvts2si, SSE_CVT_SS2SI_32, "cvtss2si">,
+ X86cvts2si, WriteCvtSS2I, "cvtss2si", "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
- X86cvts2si, SSE_CVT_SS2SI_64, "cvtss2si">,
+ X86cvts2si, WriteCvtSS2I, "cvtss2si", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
- X86cvts2usi, SSE_CVT_SS2SI_32, "cvtss2usi">,
+defm VCVTSS2USIZ: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i32x_info,
+ X86cvts2usi, WriteCvtSS2I, "cvtss2usi", "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
- X86cvts2usi, SSE_CVT_SS2SI_64, "cvtss2usi">,
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i64x_info,
+ X86cvts2usi, WriteCvtSS2I, "cvtss2usi", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
- X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
+ X86cvts2si, WriteCvtSD2I, "cvtsd2si", "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
- X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
+ X86cvts2si, WriteCvtSD2I, "cvtsd2si", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
- X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
+defm VCVTSD2USIZ: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i32x_info,
+ X86cvts2usi, WriteCvtSD2I, "cvtsd2usi", "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
- X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i64x_info,
+ X86cvts2usi, WriteCvtSD2I, "cvtsd2usi", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
// The SSE versions of these instructions are disabled for AVX512.
@@ -6517,29 +7333,6 @@ let Predicates = [HasAVX512] in {
(VCVTSD2SI64Zrm_Int sse_load_f64:$src)>;
} // HasAVX512
-let Predicates = [HasAVX512] in {
- def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, GR32:$src2),
- (VCVTSI2SSZrr_Int VR128X:$src1, GR32:$src2)>;
- def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, (loadi32 addr:$src2)),
- (VCVTSI2SSZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, GR64:$src2),
- (VCVTSI642SSZrr_Int VR128X:$src1, GR64:$src2)>;
- def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, (loadi64 addr:$src2)),
- (VCVTSI642SSZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, GR32:$src2),
- (VCVTSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, (loadi32 addr:$src2)),
- (VCVTSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, GR64:$src2),
- (VCVTSI642SDZrr_Int VR128X:$src1, GR64:$src2)>;
- def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, (loadi64 addr:$src2)),
- (VCVTSI642SDZrm_Int VR128X:$src1, addr:$src2)>;
- def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, GR32:$src2),
- (VCVTUSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
- def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, (loadi32 addr:$src2)),
- (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
-} // Predicates = [HasAVX512]
-
// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang
// which produce unnecessary vmovs{s,d} instructions
let Predicates = [HasAVX512] in {
@@ -6550,9 +7343,19 @@ def : Pat<(v4f32 (X86Movss
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
(VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SSZrm_Int VR128X:$dst, addr:$src)>;
+
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
@@ -6560,83 +7363,143 @@ def : Pat<(v2f64 (X86Movsd
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
(VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp GR64:$src)))))),
+ (VCVTUSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi64 addr:$src))))))),
+ (VCVTUSI642SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp GR32:$src)))))),
+ (VCVTUSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi32 addr:$src))))))),
+ (VCVTUSI2SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp GR64:$src)))))),
+ (VCVTUSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi64 addr:$src))))))),
+ (VCVTUSI642SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp GR32:$src)))))),
+ (VCVTUSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi32 addr:$src))))))),
+ (VCVTUSI2SDZrm_Int VR128X:$dst, addr:$src)>;
} // Predicates = [HasAVX512]
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, string aliasStr>{
+ SDNode OpNodeRnd, X86FoldableSchedWrite sched,
+ string aliasStr, bit CodeGenOnly = 1>{
let Predicates = [HasAVX512] in {
+ let isCodeGenOnly = 1 in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))], itins.rr>,
- EVEX, Sched<[itins.Sched]>;
- let hasSideEffects = 0 in
- def rrb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
- !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [], itins.rr>, EVEX, EVEX_B, Sched<[itins.Sched]>;
+ [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>,
+ EVEX, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))],
- itins.rm>, EVEX, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+ EVEX, Sched<[sched.Folded, ReadAfterLd]>;
+ }
+
+ def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched]>;
+ def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_NO_EXC)))]>,
+ EVEX,VEX_LIG , EVEX_B, Sched<[sched]>;
+ let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
+ def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ (ins _SrcRC.IntScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd
+ (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src),
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>;
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
- def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}",
- (!cast<Instruction>(NAME # "rrb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
- def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst,
- _SrcRC.ScalarMemOp:$src), 0>;
-
- let isCodeGenOnly = 1 in {
- def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_CURRENT)))], itins.rr>,
- EVEX, VEX_LIG, Sched<[itins.Sched]>;
- def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
- !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_NO_EXC)))], itins.rr>,
- EVEX,VEX_LIG , EVEX_B, Sched<[itins.Sched]>;
- let mayLoad = 1, hasSideEffects = 0 in
- def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
- (ins _SrcRC.IntScalarMemOp:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [], itins.rm>, EVEX, VEX_LIG,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
- } // isCodeGenOnly = 1
+ (!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
+ def : InstAlias<asm # aliasStr # "\t{{sae}, $src, $dst|$dst, $src, {sae}}",
+ (!cast<Instruction>(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
} //HasAVX512
}
+multiclass avx512_cvt_s_all_unsigned<bits<8> opc, string asm,
+ X86VectorVTInfo _SrcRC,
+ X86VectorVTInfo _DstRC, SDNode OpNode,
+ SDNode OpNodeRnd, X86FoldableSchedWrite sched,
+ string aliasStr> :
+ avx512_cvt_s_all<opc, asm, _SrcRC, _DstRC, OpNode, OpNodeRnd, sched,
+ aliasStr, 0> {
+let Predicates = [HasAVX512] in {
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") _DstRC.RC:$dst,
+ _SrcRC.IntScalarMemOp:$src), 0, "att">;
+}
+}
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_32, "{l}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSS2I, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_64, "{q}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSS2I, "{q}">,
VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{l}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSD2I, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
- fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{q}">,
+ fp_to_sint, X86cvtts2IntRnd, WriteCvtSD2I, "{q}">,
VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_32, "{l}">,
+defm VCVTTSS2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSS2I, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_64, "{q}">,
+defm VCVTTSS2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSS2I, "{q}">,
XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{l}">,
+defm VCVTTSD2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSD2I, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
- fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{q}">,
+defm VCVTTSD2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, WriteCvtSD2I, "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
(VCVTTSS2SIZrr_Int VR128X:$src)>;
@@ -6661,93 +7524,94 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins> {
+ X86VectorVTInfo _Src, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
- (i32 FROUND_CURRENT))), itins.rr>,
- EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT)))>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.ScalarIntMemCPat:$src2),
- (i32 FROUND_CURRENT))), itins.rm>,
+ (i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.FRC:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- itins.rr>, EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- itins.rm>, EVEX_4V, VEX_LIG,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>;
}
}
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
- (i32 FROUND_NO_EXC))), itins.rr>,
- EVEX_4V, VEX_LIG, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
}
// Scalar Conversion with rounding control (RC)
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2), (i32 imm:$rc))),
- itins.rr>,
- EVEX_4V, VEX_LIG, Sched<[itins.Sched]>,
+ (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, OpndItins itins,
+ SDNode OpNodeRnd, X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
- OpNodeRnd, itins>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+ OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
}
}
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, OpndItins itins,
- X86VectorVTInfo _src, X86VectorVTInfo _dst> {
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
- avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
EVEX_CD8<32, CD8VT1>, XS;
}
}
defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
- X86froundRnd, SSE_CVT_SD2SS, f64x_info,
- f32x_info>, NotMemoryFoldable;
+ X86froundRnd, WriteCvtSD2SS, f64x_info,
+ f32x_info>;
defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
- X86fpextRnd, SSE_CVT_SS2SD, f32x_info,
- f64x_info>, NotMemoryFoldable;
+ X86fpextRnd, WriteCvtSS2SD, f32x_info,
+ f64x_info>;
def : Pat<(f64 (fpextend FR32X:$src)),
(VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX512]>;
+ Requires<[HasAVX512, OptForSize]>;
def : Pat<(f64 (extloadf32 addr:$src)),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -6781,110 +7645,109 @@ def : Pat<(v2f64 (X86Movsd
//===----------------------------------------------------------------------===//
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins,
- string Broadcast = _.BroadcastStr,
- string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
+ X86VectorVTInfo _Src, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string Broadcast = _.BroadcastStr,
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_Src.VT _Src.RC:$src))), itins.rr>,
- EVEX, Sched<[itins.Sched]>;
+ (_.VT (OpNode (_Src.VT _Src.RC:$src)))>,
+ EVEX, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
(_.VT (OpNode (_Src.VT
- (bitconvert (_Src.LdFrag addr:$src))))), itins.rm>,
- EVEX, Sched<[itins.Sched.Folded]>;
+ (bitconvert (_Src.LdFrag addr:$src)))))>,
+ EVEX, Sched<[sched.Folded]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _Src.ScalarMemOp:$src), OpcodeStr,
"${src}"##Broadcast, "${src}"##Broadcast,
(_.VT (OpNode (_Src.VT
(X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
- )), itins.rm>, EVEX, EVEX_B,
- Sched<[itins.Sched.Folded]>;
+ ))>, EVEX, EVEX_B,
+ Sched<[sched.Folded]>;
}
// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
(_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
- (i32 FROUND_NO_EXC))), itins.rr>,
- EVEX, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC)))>,
+ EVEX, EVEX_B, Sched<[sched]>;
}
// Conversion with rounding control (RC)
multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
"$rc, $src", "$src, $rc",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc))),
- itins.rr>, EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
// Extend Float to Double
multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
- OpndItins itins> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info,
- fpextend, itins>,
+ fpextend, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
- X86vfpextRnd, itins>, EVEX_V512;
+ X86vfpextRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86vfpext, itins, "{1to2}", "", f64mem>, EVEX_V128;
+ X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Truncate Double to Float
-multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, OpndItins itins> {
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, itins>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
- X86vfproundRnd, itins>, EVEX_V512;
+ X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
- X86vfpround, itins, "{1to2}", "{x}">, EVEX_V128;
+ X86vfpround, sched.XMM, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
}
-defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SSE_CVT_PD2PS>,
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
VEX_W, PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SSE_CVT_PS2PD>,
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>,
PS, EVEX_CD8<32, CD8VH>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
let Predicates = [HasVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
- (VCVTPD2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSZ128rm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+ (VCVTPD2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+ (VCVTPD2PSZ128rm addr:$src)>;
def : Pat<(v2f64 (extloadv2f32 addr:$src)),
(VCVTPS2PDZ128rm addr:$src)>;
def : Pat<(v4f64 (extloadv4f32 addr:$src)),
@@ -6893,80 +7756,79 @@ let Predicates = [HasVLX] in {
// Convert Signed/Unsigned Doubleword to Double
multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, OpndItins itins> {
+ SDNode OpNode128, X86SchedWriteWidths sched> {
// No rounding in this op
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode,
- itins>, EVEX_V512;
+ sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, itins, "{1to2}", "", i64mem>, EVEX_V128;
+ OpNode128, sched.XMM, "{1to2}", "", i64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Doubleword to Float
multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword
multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd,
- OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -6974,29 +7836,29 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
- OpNode128, itins, "{1to2}", "{x}">, EVEX_V128;
+ OpNode, sched.XMM, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
}
// Convert Double to Signed/Unsigned Doubleword
multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7004,118 +7866,118 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
- itins, "{1to2}", "{x}">, EVEX_V128;
+ sched.XMM, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
}
// Convert Double to Signed/Unsigned Quadword
multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
- OpNodeRnd,itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Double
multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode,
- itins>, EVEX_V128;
+ sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
}
}
// Convert Float to Signed/Unsigned Quadword
multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- itins, "{1to2}", "", f64mem>, EVEX_V128;
+ sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
- itins>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128,
- itins, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- itins>, EVEX_V256;
+ sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> {
+ SDNode OpNode128, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
- itins>,
+ sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
- OpNodeRnd, itins>, EVEX_V512;
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7123,116 +7985,226 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
- itins, "{1to2}", "{x}">, EVEX_V128;
+ sched.XMM, "{1to2}", "{x}">, EVEX_V128,
+ NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
- itins, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256,
+ NotEVEX2VEXConvertible;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
}
defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP,
- SSE_CVT_I2PD>, XS, EVEX_CD8<32, CD8VH>;
+ SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
- X86VSintToFpRnd, SSE_CVT_I2PS>,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
PS, EVEX_CD8<32, CD8VF>;
-defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
- X86cvttp2siRnd, SSE_CVT_PS2I>,
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPS2DQ>,
XS, EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si,
- X86cvttp2siRnd, SSE_CVT_PD2I>,
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPD2DQ>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
- X86cvttp2uiRnd, SSE_CVT_PS2I>, PS,
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PS,
EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
- X86cvttp2ui, X86cvttp2uiRnd, SSE_CVT_PD2I>,
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPD2DQ>,
PS, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp,
- X86VUintToFP, SSE_CVT_I2PD>, XS,
+ X86VUintToFP, SchedWriteCvtDQ2PD>, XS,
EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
- X86VUintToFpRnd, SSE_CVT_I2PS>, XD,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD,
EVEX_CD8<32, CD8VF>;
defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PS2I>, PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VF>;
defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PD2I>, XD,
+ X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD,
VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PS2I>,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W,
+ X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PS, EVEX_CD8<64, CD8VF>;
defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PD2I>, VEX_W,
+ X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
- X86cvtp2IntRnd, SSE_CVT_PS2I>, PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W,
+ X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
- X86cvtp2UIntRnd, SSE_CVT_PS2I>, PD,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
- X86cvttp2siRnd, SSE_CVT_PD2I>, VEX_W,
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si,
- X86cvttp2siRnd, SSE_CVT_PS2I>, PD,
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si,
+ X86cvttp2siRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
- X86cvttp2uiRnd, SSE_CVT_PD2I>, VEX_W,
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui,
- X86cvttp2uiRnd, SSE_CVT_PS2I>, PD,
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui,
+ X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
- X86VSintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
- X86VUintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
- X86VSintToFpRnd, SSE_CVT_I2PS>, VEX_W, PS,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS,
EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
- X86VUintToFpRnd, SSE_CVT_I2PS>, VEX_W, XD,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD,
EVEX_CD8<64, CD8VF>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i32 (fp_to_sint (v16f32 VR512:$src))),
+ (VCVTTPS2DQZrr VR512:$src)>;
+ def : Pat<(v16i32 (fp_to_sint (loadv16f32 addr:$src))),
+ (VCVTTPS2DQZrm addr:$src)>;
+
+ def : Pat<(v16i32 (fp_to_uint (v16f32 VR512:$src))),
+ (VCVTTPS2UDQZrr VR512:$src)>;
+ def : Pat<(v16i32 (fp_to_uint (loadv16f32 addr:$src))),
+ (VCVTTPS2UDQZrm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_sint (v8f64 VR512:$src))),
+ (VCVTTPD2DQZrr VR512:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (loadv8f64 addr:$src))),
+ (VCVTTPD2DQZrm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_uint (v8f64 VR512:$src))),
+ (VCVTTPD2UDQZrr VR512:$src)>;
+ def : Pat<(v8i32 (fp_to_uint (loadv8f64 addr:$src))),
+ (VCVTTPD2UDQZrm addr:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128X:$src))),
+ (VCVTTPS2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
+ (VCVTTPS2DQZ128rm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src))),
+ (VCVTTPS2UDQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (fp_to_uint (loadv4f32 addr:$src))),
+ (VCVTTPS2UDQZ128rm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_sint (v8f32 VR256X:$src))),
+ (VCVTTPS2DQZ256rr VR256X:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
+ (VCVTTPS2DQZ256rm addr:$src)>;
+
+ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src))),
+ (VCVTTPS2UDQZ256rr VR256X:$src)>;
+ def : Pat<(v8i32 (fp_to_uint (loadv8f32 addr:$src))),
+ (VCVTTPS2UDQZ256rm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256X:$src))),
+ (VCVTTPD2DQZ256rr VR256X:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2DQZ256rm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src))),
+ (VCVTTPD2UDQZ256rr VR256X:$src)>;
+ def : Pat<(v4i32 (fp_to_uint (loadv4f64 addr:$src))),
+ (VCVTTPD2UDQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i64 (fp_to_sint (v8f32 VR256X:$src))),
+ (VCVTTPS2QQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (fp_to_sint (loadv8f32 addr:$src))),
+ (VCVTTPS2QQZrm addr:$src)>;
+
+ def : Pat<(v8i64 (fp_to_uint (v8f32 VR256X:$src))),
+ (VCVTTPS2UQQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (fp_to_uint (loadv8f32 addr:$src))),
+ (VCVTTPS2UQQZrm addr:$src)>;
+
+ def : Pat<(v8i64 (fp_to_sint (v8f64 VR512:$src))),
+ (VCVTTPD2QQZrr VR512:$src)>;
+ def : Pat<(v8i64 (fp_to_sint (loadv8f64 addr:$src))),
+ (VCVTTPD2QQZrm addr:$src)>;
+
+ def : Pat<(v8i64 (fp_to_uint (v8f64 VR512:$src))),
+ (VCVTTPD2UQQZrr VR512:$src)>;
+ def : Pat<(v8i64 (fp_to_uint (loadv8f64 addr:$src))),
+ (VCVTTPD2UQQZrm addr:$src)>;
+}
+
+let Predicates = [HasDQI, HasVLX] in {
+ def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))),
+ (VCVTTPS2QQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))),
+ (VCVTTPS2QQZ256rm addr:$src)>;
+
+ def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src))),
+ (VCVTTPS2UQQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (fp_to_uint (loadv4f32 addr:$src))),
+ (VCVTTPS2UQQZ256rm addr:$src)>;
+
+ def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src))),
+ (VCVTTPD2QQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (fp_to_sint (loadv2f64 addr:$src))),
+ (VCVTTPD2QQZ128rm addr:$src)>;
+
+ def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src))),
+ (VCVTTPD2UQQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (fp_to_uint (loadv2f64 addr:$src))),
+ (VCVTTPD2UQQZ128rm addr:$src)>;
+
+ def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src))),
+ (VCVTTPD2QQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2QQZ256rm addr:$src)>;
+
+ def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src))),
+ (VCVTTPD2UQQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (fp_to_uint (loadv4f64 addr:$src))),
+ (VCVTTPD2UQQZ256rm addr:$src)>;
+}
+
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
@@ -7271,26 +8243,24 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
}
let Predicates = [HasAVX512, HasVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
- (VCVTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
- (VCVTPD2UDQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
- (VCVTTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
- (VCVTTPD2UDQZ128rr VR128X:$src)>;
- }
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
+ (VCVTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+ (VCVTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
+ (VCVTPD2UDQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
+ (VCVTTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+ (VCVTTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
+ (VCVTTPD2UDQZ128rr VR128X:$src)>;
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTDQ2PDZ128rm addr:$src)>;
@@ -7311,14 +8281,12 @@ let Predicates = [HasAVX512] in {
}
let Predicates = [HasDQI, HasVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
- (VCVTQQ2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
- (VCVTUQQ2PSZ128rr VR128X:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+ (VCVTQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+ (VCVTUQQ2PSZ128rr VR128X:$src)>;
}
let Predicates = [HasDQI, NoVLX] in {
@@ -7389,41 +8357,41 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, PatFrag ld_frag,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT _src.RC:$src)),itins.rr>,
- T8PD, Sched<[itins.Sched]>;
+ (X86cvtph2ps (_src.VT _src.RC:$src))>,
+ T8PD, Sched<[sched]>;
defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
(ins x86memop:$src), "vcvtph2ps", "$src", "$src",
(X86cvtph2ps (_src.VT
(bitconvert
- (ld_frag addr:$src)))), itins.rm>,
- T8PD, Sched<[itins.Sched.Folded]>;
+ (ld_frag addr:$src))))>,
+ T8PD, Sched<[sched.Folded]>;
}
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps",
"{sae}, $src", "$src, {sae}",
(X86cvtph2psRnd (_src.VT _src.RC:$src),
- (i32 FROUND_NO_EXC)), itins.rr>,
- T8PD, EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ T8PD, EVEX_B, Sched<[sched]>;
}
let Predicates = [HasAVX512] in
defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64,
- SSE_CVT_PH2PS>,
- avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, SSE_CVT_PH2PS>,
+ WriteCvtPH2PSZ>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
- loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V256,
+ loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256,
EVEX_CD8<32, CD8VH>;
defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
- loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V128,
+ loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128,
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
@@ -7437,48 +8405,47 @@ let Predicates = [HasVLX] in {
}
multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- X86MemOperand x86memop, OpndItins itins> {
+ X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> {
defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, $src1", "$src1, $src2",
(X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2)),
- itins.rr, 0, 0>, AVX512AIi8Base, Sched<[itins.Sched]>;
+ (i32 imm:$src2)), 0, 0>,
+ AVX512AIi8Base, Sched<[RR]>;
let hasSideEffects = 0, mayStore = 1 in {
def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[MR]>;
def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [], itins.rm>, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", []>,
+ EVEX_K, Sched<[MR]>, NotMemoryFoldable;
}
}
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- OpndItins itins> {
+ SchedWrite Sched> {
let hasSideEffects = 0 in
defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
- [], itins.rr>, EVEX_B, AVX512AIi8Base, Sched<[itins.Sched]>;
+ "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>,
+ EVEX_B, AVX512AIi8Base, Sched<[Sched]>;
}
let Predicates = [HasAVX512] in {
defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem,
- SSE_CVT_PS2PH>,
- avx512_cvtps2ph_sae<v16i16x_info, v16f32_info,
- SSE_CVT_PS2PH>, EVEX, EVEX_V512,
- EVEX_CD8<32, CD8VH>;
+ WriteCvtPS2PHZ, WriteCvtPS2PHZSt>,
+ avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, WriteCvtPS2PHZ>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
- SSE_CVT_PS2PH>, EVEX, EVEX_V256,
- EVEX_CD8<32, CD8VH>;
+ WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
- SSE_CVT_PS2PH>, EVEX, EVEX_V128,
- EVEX_CD8<32, CD8VH>;
+ WriteCvtPS2PH, WriteCvtPS2PHSt>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
def : Pat<(store (f64 (extractelt
@@ -7503,431 +8470,430 @@ let Predicates = [HasVLX] in {
// more consistent with other instructions, which are always controlled by it.
// It's encoded as 0b100.
def : Pat<(fp_to_f16 FR32X:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr
- (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>;
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (v8i16 (VCVTPS2PHZ128rr
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4))), sub_16bit))>;
def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
- (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
+ (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)))), FR32X)) >;
def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
- (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
+ (v8i16 (VCVTPS2PHZ128rr
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >;
}
// Unordered/Ordered scalar fp compare with Sae and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, OpndItins itins> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
- !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
- [], itins.rr>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
- Sched<[itins.Sched]>;
+ !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>,
+ EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
- defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSE_COMIS>,
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", WriteFCom>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSE_COMIS>,
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", WriteFCom>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSE_COMIS>,
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", WriteFCom>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSE_COMIS>,
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", WriteFCom>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
+ "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSE_COMIS>, PD, EVEX,
+ "ucomisd", WriteFCom>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
let Pattern = []<dag> in {
defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
- "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
+ "comiss", WriteFCom>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
- "comisd", SSE_COMIS>, PD, EVEX,
+ "comisd", WriteFCom>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSE_COMIS>, PD, EVEX,
- VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", WriteFCom>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSE_COMIS>, PD, EVEX,
- VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", WriteFCom>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", WriteFCom>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
}
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>,
- EVEX_4V, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2), itins.rm>, EVEX_4V,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.ScalarIntMemCPat:$src2)>, EVEX_4V,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SSE_RCPS, f32x_info>,
- EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SSE_RCPS, f64x_info>,
- VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, SSE_RSQRTSS, f32x_info>,
- EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, SSE_RSQRTSS, f64x_info>,
- VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
+defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl,
+ f32x_info>, EVEX_CD8<32, CD8VT1>,
+ T8PD;
+defm VRCP14SDZ : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SchedWriteFRcp.Scl,
+ f64x_info>, VEX_W, EVEX_CD8<64, CD8VT1>,
+ T8PD;
+defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f64x_info>, VEX_W,
+ EVEX_CD8<64, CD8VT1>, T8PD;
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.FloatVT (OpNode _.RC:$src)), itins.rr>, EVEX, T8PD,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src))>, EVEX, T8PD,
+ Sched<[sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.FloatVT
- (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX, T8PD,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode (_.VT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (OpNode (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>,
- EVEX, T8PD, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpNode (_.VT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, T8PD, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
- defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, itins.s,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM,
v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, itins.d,
+ defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, sched.ZMM,
v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, itins.s, v4f32x_info>,
+ OpNode, sched.XMM, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, itins.s, v8f32x_info>,
+ OpNode, sched.YMM, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, itins.d, v2f64x_info>,
+ OpNode, sched.XMM, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, itins.d, v4f64x_info>,
+ OpNode, sched.YMM, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SSE_RSQRT_P>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SSE_RCP_P>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SchedWriteFRsqrt>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
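// A minimal illustrative sketch of the recurring rewrite in this patch
// (hypothetical multiclass and operand names, shown only as a guide): the
// OpndItins parameter becomes an X86FoldableSchedWrite, register forms are
// tagged Sched<[sched]>, and folded-load forms Sched<[sched.Folded, ReadAfterLd]>:
//
//   multiclass example_unary<bits<8> opc, string OpcodeStr, SDNode OpNode,
//                            X86FoldableSchedWrite sched, X86VectorVTInfo _> {
//     defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
//                              (ins _.RC:$src), OpcodeStr, "$src", "$src",
//                              (_.VT (OpNode _.RC:$src))>, EVEX, Sched<[sched]>;
//     defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
//                              (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
//                              (_.VT (OpNode (bitconvert (_.LdFrag addr:$src))))>,
//                              EVEX, Sched<[sched.Folded, ReadAfterLd]>;
//   }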
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT)), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched]>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC)), itins.rm>, EVEX_B,
- Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>, EVEX_B,
+ Sched<[sched]>;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
- defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, itins.s>,
- EVEX_CD8<32, CD8VT1>;
- defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, itins.d>,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ X86FoldableSchedWrite sched> {
+ defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, sched>,
+ EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, sched>,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
}
let Predicates = [HasERI] in {
- defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SSE_RCP_S>,
- T8PD, EVEX_4V;
- defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, SSE_RSQRT_S>,
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SchedWriteFRcp.Scl>,
T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s,
+ SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V;
}
-defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds, SSE_ALU_ITINS_S>,
- T8PD, EVEX_4V;
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds,
+ SchedWriteFRnd.Scl>, T8PD, EVEX_4V;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT)),
- itins.rr>, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>,
+ Sched<[sched]>;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.FloatVT
+ (OpNode (_.VT
(bitconvert (_.LdFrag addr:$src))),
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (OpNode (_.FloatVT
+ (OpNode (_.VT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
- (i32 FROUND_CURRENT)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)),
- itins.rr>, EVEX_B, Sched<[itins.Sched]>;
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
- defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>,
- avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>,
- T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>,
- avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>,
- T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SizeItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, itins.s>,
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, sched.XMM>,
EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, itins.s>,
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, sched.YMM>,
EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, itins.d>,
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, sched.XMM>,
EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, itins.d>,
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, sched.YMM>,
EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
}
}
-let Predicates = [HasERI] in {
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SSE_RSQRT_P>, EVEX;
- defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SSE_RCP_P>, EVEX;
- defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SSE_ALU_ITINS_P>, EVEX;
+let Predicates = [HasERI] in {
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SchedWriteFRsqrt>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SchedWriteFRcp>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SchedWriteFAdd>, EVEX;
}
-defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SSE_ALU_ITINS_P>,
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SchedWriteFRnd>,
avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd,
- SSE_ALU_ITINS_P>, EVEX;
+ SchedWriteFRnd>, EVEX;
-multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _>{
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
- (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc))), itins.rr>,
- EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
+ (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
-multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _>{
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.FloatVT (fsqrt _.RC:$src)), itins.rr>, EVEX,
- Sched<[itins.Sched]>;
+ (_.VT (fsqrt _.RC:$src))>, EVEX,
+ Sched<[sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (fsqrt (_.FloatVT
- (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (fsqrt (_.VT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (fsqrt (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>,
- EVEX, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (fsqrt (_.VT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr> {
- defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), SSE_SQRTPS, v16f32_info>,
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.ZMM, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD, v8f64_info>,
+ defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.ZMM, v8f64_info>,
EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
- SSE_SQRTPS, v4f32x_info>,
+ sched.PS.XMM, v4f32x_info>,
EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
- SSE_SQRTPS, v8f32x_info>,
+ sched.PS.YMM, v8f32x_info>,
EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
- SSE_SQRTPD, v2f64x_info>,
+ sched.PD.XMM, v2f64x_info>,
EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
- SSE_SQRTPD, v4f64x_info>,
+ sched.PD.YMM, v4f64x_info>,
EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
}
-multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr> {
- defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), SSE_SQRTPS,
- v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD,
- v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.ZMM, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.ZMM, v8f64_info>,
+ EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
-multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, OpndItins itins,
- X86VectorVTInfo _, string SUFF, Intrinsic Intr> {
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> {
let ExeDomain = _.ExeDomain in {
- defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 FROUND_CURRENT)), itins.rr>,
- Sched<[itins.Sched]>;
- defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
- (X86fsqrtRnds (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
- defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched]>;
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (X86fsqrtRnds (_.VT _.RC:$src1),
+ _.ScalarIntMemCPat:$src2,
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$rc)), itins.rr>,
- EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
-
- let isCodeGenOnly = 1, hasSideEffects = 0 in {
- def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.FRC:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], itins.rr>,
- Sched<[itins.Sched]>;
- let mayLoad = 1 in
- def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
- }
+ (i32 imm:$rc))>,
+ EVEX_B, EVEX_RC, Sched<[sched]>;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in {
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[sched]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[sched.Folded, ReadAfterLd]>;
+ }
}
-let Predicates = [HasAVX512] in {
- def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
- (!cast<Instruction>(NAME#SUFF#Zr)
- (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
-
- def : Pat<(Intr VR128X:$src),
- (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src,
- VR128X:$src)>;
-}
-
-let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(_.EltVT (fsqrt (load addr:$src))),
- (!cast<Instruction>(NAME#SUFF#Zm)
- (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
-
- def : Pat<(Intr _.ScalarIntMemCPat:$src2),
- (!cast<Instruction>(NAME#SUFF#Zm_Int)
- (_.VT (IMPLICIT_DEF)), addr:$src2)>;
-}
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
+ (!cast<Instruction>(Name#Zr)
+ (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+ }
+ let Predicates = [HasAVX512, OptForSize] in {
+ def : Pat<(_.EltVT (fsqrt (load addr:$src))),
+ (!cast<Instruction>(Name#Zm)
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+ }
}
-multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
- defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", SSE_SQRTPS, f32x_info, "SS",
- int_x86_sse_sqrt_ss>,
- EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable;
- defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", SSE_SQRTPD, f64x_info, "SD",
- int_x86_sse2_sqrt_sd>,
- EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
- NotMemoryFoldable;
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
+ EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
+ EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
}
-defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt">,
- avx512_sqrt_packed_all_round<0x51, "vsqrt">;
+defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>,
+ avx512_sqrt_packed_all_round<0x51, "vsqrt", SchedWriteFSqrtSizes>;
-defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, VEX_LIG;
multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3))), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3)))>,
+ Sched<[sched]>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
(_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_NO_EXC))), itins.rr>, EVEX_B,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B,
+ Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales _.RC:$src1,
- _.ScalarIntMemCPat:$src2, (i32 imm:$src3))), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
- let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [], itins.rr>, Sched<[itins.Sched]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -7968,344 +8934,397 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
}
-defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", SSE_ALU_F32S,
- f32x_info>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
+ SchedWriteFRnd.Scl, f32x_info>,
+ AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd",
+ SchedWriteFRnd.Scl, f64x_info>,
+ VEX_W, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<64, CD8VT1>;
+
+multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
+ dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
+ dag OutMask, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ (extractelt _.VT:$dst, (iPTR 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Intk)
+ _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>;
+
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ ZeroFP))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Intkz)
+ OutMask, _.VT:$src2, _.VT:$src1)>;
+ }
+}
+
+defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info,
+ fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info,
+ fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+
+multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,
+ X86VectorVTInfo _, PatLeaf ZeroFP,
+ bits<8> ImmV, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ (extractelt _.VT:$dst, (iPTR 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Intk)
+ _.VT:$dst, VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
+
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Intkz)
+ VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
+ }
+}
+
+defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
+ v4f32x_info, fp32imm0, 0x01, HasAVX512>;
+defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
+ v4f32x_info, fp32imm0, 0x02, HasAVX512>;
+defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
+ v2f64x_info, fp64imm0, 0x01, HasAVX512>;
+defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
+ v2f64x_info, fp64imm0, 0x02, HasAVX512>;
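// Note on the four patterns above (explanatory only): the VRNDSCALE immediate
// selects the rounding mode, with 0x01 rounding toward negative infinity and
// 0x02 toward positive infinity, so a masked ffloor/fceil of the low scalar
// element can be selected to the corresponding *_Intk / *_Intkz form of
// VRNDSCALESSZ or VRNDSCALESDZ carrying that immediate.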
-defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", SSE_ALU_F64S,
- f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V,
- EVEX_CD8<64, CD8VT1>;
//-------------------------------------------------
// Integer truncate and extend operations
//-------------------------------------------------
-let Sched = WriteShuffle256 in
-def AVX512_EXTEND : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
-let Sched = WriteShuffle256 in
-def AVX512_TRUNCATE : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo SrcInfo,
+ X86FoldableSchedWrite sched, X86VectorVTInfo SrcInfo,
X86VectorVTInfo DestInfo, X86MemOperand x86memop> {
let ExeDomain = DestInfo.ExeDomain in
defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
- itins.rr>, EVEX, T8XS, Sched<[itins.Sched]>;
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
+ EVEX, T8XS, Sched<[sched]>;
- let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
- ExeDomain = DestInfo.ExeDomain in {
+ let mayStore = 1, hasSideEffects = 0, ExeDomain = DestInfo.ExeDomain in {
def mr : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.RC:$src),
- OpcodeStr # "\t{$src, $dst|$dst, $src}",
- [], itins.rm>, EVEX, Sched<[itins.Sched.Folded]>;
+ OpcodeStr # "\t{$src, $dst|$dst, $src}", []>,
+ EVEX, Sched<[sched.Folded]>;
def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
- OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
- [], itins.rm>, EVEX, EVEX_K, Sched<[itins.Sched.Folded]>;
- }//mayStore = 1, mayLoad = 1, hasSideEffects = 0
+ OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>,
+ EVEX, EVEX_K, Sched<[sched.Folded]>, NotMemoryFoldable;
+ }//mayStore = 1, hasSideEffects = 0
}
multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
X86VectorVTInfo DestInfo,
- PatFrag truncFrag, PatFrag mtruncFrag > {
+ PatFrag truncFrag, PatFrag mtruncFrag,
+ string Name> {
def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix##mr)
addr:$dst, SrcInfo.RC:$src)>;
def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
(SrcInfo.VT SrcInfo.RC:$src)),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix##mrk)
addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
}
-multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
- X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
- X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
- X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
- Predicate prd = HasAVX512>{
+multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode128,
+ SDNode OpNode256, SDNode OpNode512, X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTSrcInfo,
+ X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, PatFrag truncFrag,
+ PatFrag mtruncFrag, Predicate prd = HasAVX512>{
let Predicates = [HasVLX, prd] in {
- defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode128, sched,
VTSrcInfo.info128, DestInfoZ128, x86memopZ128>,
avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
- truncFrag, mtruncFrag>, EVEX_V128;
+ truncFrag, mtruncFrag, NAME>, EVEX_V128;
- defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode256, sched,
VTSrcInfo.info256, DestInfoZ256, x86memopZ256>,
avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
- truncFrag, mtruncFrag>, EVEX_V256;
+ truncFrag, mtruncFrag, NAME>, EVEX_V256;
}
let Predicates = [prd] in
- defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode512, sched,
VTSrcInfo.info512, DestInfoZ, x86memopZ>,
avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
- truncFrag, mtruncFrag>, EVEX_V512;
+ truncFrag, mtruncFrag, NAME>, EVEX_V512;
}
multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
- v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, InVecNode, sched,
+ avx512vl_i64_info, v16i8x_info, v16i8x_info,
+ v16i8x_info, i16mem, i32mem, i64mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
}
multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
- v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode, sched,
+ avx512vl_i64_info, v8i16x_info, v8i16x_info,
+ v8i16x_info, i32mem, i64mem, i128mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
}
multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
- v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode, sched,
+ avx512vl_i64_info, v4i32x_info, v4i32x_info,
+ v8i32x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
}
multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info,
- v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode, sched,
+ avx512vl_i32_info, v16i8x_info, v16i8x_info,
+ v16i8x_info, i32mem, i64mem, i128mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
}
multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info,
- v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
- StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode, sched,
+ avx512vl_i32_info, v8i16x_info, v8i16x_info,
+ v16i16x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
}
multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, PatFrag StoreNode,
- PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i16_info,
- v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
- StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode,
+ sched, avx512vl_i16_info, v16i8x_info, v16i8x_info,
+ v32i8x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
}
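// Suffix note with one worked example (for orientation only): the two-letter
// suffix in these truncate multiclasses names source and destination element
// widths, q/d/w/b for qword/dword/word/byte. For instance, the ZMM form of
// VPMOVQB below (avx512vl_i64_info source, v16i8x_info destination) narrows
// eight i64 elements to eight i8 results in the low 8 bytes of an XMM
// register, and the masked "mrk" store writes only the lanes selected by the
// k-mask.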
-defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", trunc, WriteShuffle256,
+ truncstorevi8, masked_truncstorevi8, X86vtrunc>;
+defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, WriteShuffle256,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, WriteShuffle256,
truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", trunc, WriteShuffle256,
+ truncstorevi16, masked_truncstorevi16, X86vtrunc>;
+defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, WriteShuffle256,
truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, WriteShuffle256,
truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi32, masked_truncstorevi32>;
-defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", trunc, WriteShuffle256,
+ truncstorevi32, masked_truncstorevi32, X86vtrunc>;
+defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, WriteShuffle256,
truncstore_s_vi32, masked_truncstore_s_vi32>;
-defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, WriteShuffle256,
truncstore_us_vi32, masked_truncstore_us_vi32>;
-defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", trunc, WriteShuffle256,
+ truncstorevi8, masked_truncstorevi8, X86vtrunc>;
+defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, WriteShuffle256,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, WriteShuffle256,
truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", trunc, WriteShuffle256,
+ truncstorevi16, masked_truncstorevi16, X86vtrunc>;
+defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, WriteShuffle256,
truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, WriteShuffle256,
truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc, AVX512_TRUNCATE,
- truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, AVX512_TRUNCATE,
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", trunc, WriteShuffle256,
+ truncstorevi8, masked_truncstorevi8, X86vtrunc>;
+defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, WriteShuffle256,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, AVX512_TRUNCATE,
+defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, WriteShuffle256,
truncstore_us_vi8, masked_truncstore_us_vi8>;
let Predicates = [HasAVX512, NoVLX] in {
-def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
+def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
(v8i16 (EXTRACT_SUBREG
(v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
-def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
+def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
(v4i32 (EXTRACT_SUBREG
(v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
}
let Predicates = [HasBWI, NoVLX] in {
-def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
+def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
(v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm))), sub_xmm))>;
}
-multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, OpndItins itins,
+multiclass WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
- X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{
+ X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{
let ExeDomain = DestInfo.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src))), itins.rr>,
- EVEX, Sched<[itins.Sched]>;
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
+ EVEX, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins x86memop:$src), OpcodeStr ,"$src", "$src",
- (DestInfo.VT (LdFrag addr:$src)), itins.rm>,
- EVEX, Sched<[itins.Sched.Folded]>;
+ (DestInfo.VT (LdFrag addr:$src))>,
+ EVEX, Sched<[sched.Folded]>;
}
}
-multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+multiclass WriteShuffle256_BW<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasBWI] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v8i16x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v8i16x_info,
v16i8x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v16i16x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v16i16x_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasBWI] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v32i16_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v32i16_info,
v32i8x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info,
v16i8x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
v16i8x_info, i16mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
v16i8x_info, i32mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+multiclass WriteShuffle256_WD<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info,
v8i16x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info,
v16i16x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+multiclass WriteShuffle256_WQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
v8i16x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
v8i16x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
}
}
-multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
- OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
+multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
v4i32x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
- defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
v4i32x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
v8i32x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
}
}
-defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
+defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
+defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", WriteShuffle256>;
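// The extend multiclasses follow the same source-to-destination suffix scheme:
// BW/BD/BQ extend bytes to words/dwords/qwords, WD/WQ extend words, and DQ
// extends dwords to qwords. Reading one instantiation above as an example,
// the Z128 form of VPMOVZXBW zero-extends the low eight bytes of its XMM
// source (an i64mem operand in the load form) to eight i16 results.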
multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
- SDNode InVecOp, PatFrag ExtLoad16> {
+ SDNode InVecOp> {
// 128-bit patterns
let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -8329,7 +9348,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
@@ -8445,8 +9464,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
}
}
-defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>;
-defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>;
+defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec>;
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
@@ -8470,7 +9489,7 @@ multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
- vy512mem, mgatherv8i32>, EVEX_V512, VEX_W;
+ vy512xmem, mgatherv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
vz512mem, mgatherv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
@@ -8489,7 +9508,7 @@ multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
mgatherv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256xmem,
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256mem,
mgatherv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
@@ -8512,16 +9531,17 @@ defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q
avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86MemOperand memop, PatFrag ScatterNode> {
+ X86MemOperand memop, PatFrag ScatterNode,
+ RegisterClass MaskRC = _.KRCWM> {
let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
- def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb),
- (ins memop:$dst, _.KRCWM:$mask, _.RC:$src),
+ def mr : AVX5128I<opc, MRMDestMem, (outs MaskRC:$mask_wb),
+ (ins memop:$dst, MaskRC:$mask, _.RC:$src),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
- [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
- _.KRCWM:$mask, vectoraddr:$dst))]>,
+ [(set MaskRC:$mask_wb, (ScatterNode (_.VT _.RC:$src),
+ MaskRC:$mask, vectoraddr:$dst))]>,
EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[WriteStore]>;
}
@@ -8529,7 +9549,7 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
- vy512mem, mscatterv8i32>, EVEX_V512, VEX_W;
+ vy512xmem, mscatterv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
vz512mem, mscatterv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
@@ -8548,7 +9568,7 @@ multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
mscatterv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256xmem,
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256mem,
mscatterv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
@@ -8558,7 +9578,8 @@ let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mscatterv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mscatterv2i64>, EVEX_V128;
+ vx64xmem, mscatterv2i64, VK2WM>,
+ EVEX_V128;
}
}
@@ -8571,20 +9592,20 @@ defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter",
// prefetch
multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
RegisterClass KRC, X86MemOperand memop> {
- let Predicates = [HasPFI], hasSideEffects = 1 in
+ let Predicates = [HasPFI], mayLoad = 1, mayStore = 1 in
def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
- !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
- [], IIC_SSE_PREFETCH>, EVEX, EVEX_K, Sched<[WriteLoad]>;
+ !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>,
+ EVEX, EVEX_K, Sched<[WriteLoad]>;
}
defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8593,10 +9614,10 @@ defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8605,10 +9626,10 @@ defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8617,10 +9638,10 @@ defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
- VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
- VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -8628,8 +9649,8 @@ defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
!strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
- [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))],
- IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>;
+ [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>,
+ EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc?
}
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
@@ -8651,17 +9672,18 @@ defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))],
- IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>;
+ [(set _.KRC:$dst, (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src)))]>,
+ EVEX, Sched<[WriteMove]>;
}
// Use the 512-bit version to implement 128/256-bit in the NoVLX case.
multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _,
+ string Name> {
- def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
+ def : Pat<(_.KVT (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src))),
(_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"Zrr")
+ (!cast<Instruction>(Name#"Zrr")
(INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
_.RC:$src, _.SubRegIdx)),
_.KRC))>;
@@ -8680,8 +9702,8 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
EVEX_V128;
}
let Predicates = [prd, NoVLX] in {
- defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256>;
- defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128>;
+ defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
+ defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
}
}
@@ -8694,125 +9716,131 @@ defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m",
defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
avx512vl_i64_info, HasDQI>, VEX_W;
+// Patterns for handling sext from a mask register to v16i8/v16i16 when DQI
+// is available, but BWI is not. We can't handle this in lowering because
+// a target independent DAG combine likes to combine sext and trunc.
+let Predicates = [HasDQI, NoBWI] in {
+ def : Pat<(v16i8 (sext (v16i1 VK16:$src))),
+ (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+ def : Pat<(v16i16 (sext (v16i1 VK16:$src))),
+ (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+}
+
//===----------------------------------------------------------------------===//
// AVX-512 - COMPRESS and EXPAND
//
-// FIXME: Is there a better scheduler itinerary for VPCOMPRESS/VPEXPAND?
-let Sched = WriteShuffle256 in {
-def AVX512_COMPRESS : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-def AVX512_EXPAND : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-}
-
multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, OpndItins itins> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86compress _.RC:$src1)), itins.rr>, AVX5128IBase,
- Sched<[itins.Sched]>;
+ (_.VT (X86compress _.RC:$src1))>, AVX5128IBase,
+ Sched<[sched]>;
let mayStore = 1, hasSideEffects = 0 in
def mr : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
[]>, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
def mrk : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
[]>,
EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
}
-multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > {
+multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
(_.VT _.RC:$src)),
- (!cast<Instruction>(NAME#_.ZSuffix##mrk)
+ (!cast<Instruction>(Name#_.ZSuffix##mrk)
addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
}
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
- OpndItins itins,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
Predicate Pred = HasAVX512> {
let Predicates = [Pred] in
- defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, itins>,
- compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+ defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [Pred, HasVLX] in {
- defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, itins>,
- compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
- defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, itins>,
- compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+ defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128;
}
}
-defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", AVX512_COMPRESS,
- avx512vl_i32_info>, EVEX;
-defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", AVX512_COMPRESS,
- avx512vl_i64_info>, EVEX, VEX_W;
-defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", AVX512_COMPRESS,
- avx512vl_f32_info>, EVEX;
-defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", AVX512_COMPRESS,
- avx512vl_f64_info>, EVEX, VEX_W;
+// FIXME: Is there a better scheduler class for VPCOMPRESS?
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", WriteVarShuffle256,
+ avx512vl_i32_info>, EVEX, NotMemoryFoldable;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", WriteVarShuffle256,
+ avx512vl_i64_info>, EVEX, VEX_W, NotMemoryFoldable;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", WriteVarShuffle256,
+ avx512vl_f32_info>, EVEX, NotMemoryFoldable;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", WriteVarShuffle256,
+ avx512vl_f64_info>, EVEX, VEX_W, NotMemoryFoldable;
// expand
multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, OpndItins itins> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86expand _.RC:$src1)), itins.rr>, AVX5128IBase,
- Sched<[itins.Sched]>;
+ (_.VT (X86expand _.RC:$src1))>, AVX5128IBase,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand (_.VT (bitconvert
- (_.LdFrag addr:$src1))))), itins.rm>,
+ (_.LdFrag addr:$src1)))))>,
AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > {
+multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(NAME#_.ZSuffix##rmkz)
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz)
+ _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix##rmkz)
_.KRCWM:$mask, addr:$src)>;
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
(_.VT _.RC:$src0))),
- (!cast<Instruction>(NAME#_.ZSuffix##rmk)
+ (!cast<Instruction>(Name#_.ZSuffix##rmk)
_.RC:$src0, _.KRCWM:$mask, addr:$src)>;
}
multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
- OpndItins itins,
+ X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo VTInfo,
Predicate Pred = HasAVX512> {
let Predicates = [Pred] in
- defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, itins>,
- expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [Pred, HasVLX] in {
- defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, itins>,
- expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
- defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, itins>,
- expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128;
}
}
-defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", AVX512_EXPAND,
+// FIXME: Is there a better scheduler class for VPEXPAND?
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", WriteVarShuffle256,
avx512vl_i32_info>, EVEX;
-defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", AVX512_EXPAND,
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", WriteVarShuffle256,
avx512vl_i64_info>, EVEX, VEX_W;
-defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", AVX512_EXPAND,
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", WriteVarShuffle256,
avx512vl_f32_info>, EVEX;
-defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", AVX512_EXPAND,
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256,
avx512vl_f64_info>, EVEX, VEX_W;
//handle instruction reg_vec1 = op(reg_vec,imm)
@@ -8820,32 +9848,32 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", AVX512_EXPAND,
// op(broadcast(eltVt),imm)
// all instructions are created with FROUND_CURRENT
multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)), itins.rr>, Sched<[itins.Sched]>;
+ (i32 imm:$src2))>, Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr##", $src2",
(OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src2))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ SDNode OpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -8854,23 +9882,23 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM,
_.info512>,
avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd,
- itins, _.info512>, EVEX_V512;
+ sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM,
_.info128>, EVEX_V128;
- defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM,
_.info256>, EVEX_V256;
}
}
@@ -8880,37 +9908,37 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
// op(reg_vec2,broadcast(eltVt),imm)
// all instructions are created with FROUND_CURRENT
multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3)), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3))>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i32 imm:$src3)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src3))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_vec,imm)
multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo DestInfo,
+ X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo>{
let ExeDomain = DestInfo.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
@@ -8918,16 +9946,16 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT SrcInfo.RC:$src2),
- (i8 imm:$src3))), itins.rr>,
- Sched<[itins.Sched]>;
+ (i8 imm:$src3)))>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT (bitconvert
(SrcInfo.LdFrag addr:$src2))),
- (i8 imm:$src3))), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3)))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -8935,8 +9963,8 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
// op(reg_vec2,mem_vec,imm)
// op(reg_vec2,broadcast(eltVt),imm)
multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>:
- avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, itins, _, _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>:
+ avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, sched, _, _>{
let ExeDomain = _.ExeDomain in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -8945,36 +9973,36 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_scalar,imm)
multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3)), itins.rr>,
- Sched<[itins.Sched]>;
+ (i32 imm:$src3))>,
+ Sched<[sched]>;
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 imm:$src3))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
+ SDNode OpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -8984,13 +10012,13 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
@@ -8999,203 +10027,379 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, itins, _.info512>,
+ defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512>,
EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info128>,
+ defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
EVEX_V128;
- defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info256>,
+ defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
EVEX_V256;
}
}
multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
- OpndItins itins, AVX512VLVectorVTInfo DestInfo,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo,
AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> {
let Predicates = [Pred] in {
- defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info512,
+ defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.ZMM, DestInfo.info512,
SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
}
let Predicates = [Pred, HasVLX] in {
- defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info128,
+ defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.XMM, DestInfo.info128,
SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
- defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info256,
+ defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.YMM, DestInfo.info256,
SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
}
}
multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
- bits<8> opc, SDNode OpNode, OpndItins itins,
+ bits<8> opc, SDNode OpNode, X86SchedWriteWidths sched,
Predicate Pred = HasAVX512> {
let Predicates = [Pred] in {
- defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ EVEX_V512;
}
let Predicates = [Pred, HasVLX] in {
- defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
- defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ EVEX_V256;
}
}
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd> {
let Predicates = [prd] in {
- defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, itins, _>,
- avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, itins, _>;
+ defm Z : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, sched.XMM, _>,
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, sched.XMM, _>;
}
}
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
- SDNode OpNodeRnd, SizeItins itins, Predicate prd>{
+ SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
- opcPs, OpNode, OpNodeRnd, itins.s, prd>,
+ opcPs, OpNode, OpNodeRnd, sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
- opcPd, OpNode, OpNodeRnd, itins.d, prd>,
+ opcPd, OpNode, OpNodeRnd, sched, prd>,
EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
- X86VReduce, X86VReduceRnd, SSE_ALU_ITINS_P, HasDQI>,
+ X86VReduce, X86VReduceRnd, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86VRndScale, X86VRndScaleRnd, SSE_ALU_ITINS_P, HasAVX512>,
+ X86VRndScale, X86VRndScaleRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
- X86VGetMant, X86VGetMantRnd, SSE_ALU_ITINS_P, HasAVX512>,
+ X86VGetMant, X86VGetMantRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeRnd,
- SSE_ALU_F64P, HasDQI>,
+ SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
0x50, X86VRange, X86VRangeRnd,
- SSE_ALU_F32P, HasDQI>,
+ SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
- f64x_info, 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F64S, HasDQI>,
+ f64x_info, 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
- 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F32S, HasDQI>,
+ 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
- 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F64S, HasDQI>,
+ 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
- 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F32S, HasDQI>,
+ 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F64S, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F32S, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
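// A rough summary of the VRNDSCALE immediates used in the lowering patterns
// below (the scale field in bits 7:4 is zero here): bits 1:0 select the
// rounding mode (00 = nearest, 01 = down, 10 = up, 11 = toward zero), bit 2
// selects the MXCSR rounding mode instead, and bit 3 suppresses the
// precision (inexact) exception. Hence 0x9 lowers ffloor, 0xA fceil,
// 0xB ftrunc, 0xC fnearbyint (MXCSR mode, no inexact) and 0x4 frint
// (MXCSR mode, inexact allowed).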
+multiclass AVX512_rndscale_lowering<X86VectorVTInfo _, string Suffix> {
+ // Register
+ def : Pat<(_.VT (ffloor _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xB))>;
+
+ // Merge-masking
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
+
+ // Zero-masking
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
+
+ // Load
+ def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xB))>;
+
+ // Merge-masking + load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Zero-masking + load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Broadcast load
+ def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xB))>;
+
+ // Merge-masking + broadcast load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Zero-masking + broadcast load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+}
+
let Predicates = [HasAVX512] in {
-def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
-def : Pat<(v16f32 (fnearbyint VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
-def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
-def : Pat<(v16f32 (frint VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
-def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>;
-
-def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
-def : Pat<(v8f64 (fnearbyint VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
-def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
-def : Pat<(v8f64 (frint VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
-def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
+ defm : AVX512_rndscale_lowering<v16f32_info, "PS">;
+ defm : AVX512_rndscale_lowering<v8f64_info, "PD">;
}
let Predicates = [HasVLX] in {
-def : Pat<(v4f32 (ffloor VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>;
-def : Pat<(v4f32 (fnearbyint VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>;
-def : Pat<(v4f32 (fceil VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>;
-def : Pat<(v4f32 (frint VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>;
-def : Pat<(v4f32 (ftrunc VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xB))>;
-
-def : Pat<(v2f64 (ffloor VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>;
-def : Pat<(v2f64 (fnearbyint VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>;
-def : Pat<(v2f64 (fceil VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>;
-def : Pat<(v2f64 (frint VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>;
-def : Pat<(v2f64 (ftrunc VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xB))>;
-
-def : Pat<(v8f32 (ffloor VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>;
-def : Pat<(v8f32 (fnearbyint VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xC))>;
-def : Pat<(v8f32 (fceil VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>;
-def : Pat<(v8f32 (frint VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>;
-def : Pat<(v8f32 (ftrunc VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xB))>;
-
-def : Pat<(v4f64 (ffloor VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>;
-def : Pat<(v4f64 (fnearbyint VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>;
-def : Pat<(v4f64 (fceil VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>;
-def : Pat<(v4f64 (frint VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>;
-def : Pat<(v4f64 (ftrunc VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xB))>;
-}
-
-multiclass avx512_shuff_packed_128<string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo _, bits<8> opc>{
- let Predicates = [HasAVX512] in {
- defm Z : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info512>, EVEX_V512;
+ defm : AVX512_rndscale_lowering<v8f32x_info, "PS">;
+ defm : AVX512_rndscale_lowering<v4f64x_info, "PD">;
+ defm : AVX512_rndscale_lowering<v4f32x_info, "PS">;
+ defm : AVX512_rndscale_lowering<v2f64x_info, "PD">;
+}
- }
- let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info256>, EVEX_V256;
+multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _,
+ X86VectorVTInfo CastInfo,
+ string EVEX2VEXOvrd> {
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (bitconvert
+ (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2,
+ (i8 imm:$src3)))))>,
+ Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT
+ (bitconvert
+ (CastInfo.VT (X86Shuf128 _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)),
+ (i8 imm:$src3)))))>,
+ Sched<[sched.Folded, ReadAfterLd]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (_.VT
+ (bitconvert
+ (CastInfo.VT
+ (X86Shuf128 _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (i8 imm:$src3)))))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", SSE_SHUFP,
- avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", SSE_SHUFP,
- avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", SSE_SHUFP,
- avx512vl_i32_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", SSE_SHUFP,
- avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo CastInfo, bits<8> opc,
+ string EVEX2VEXOvrd>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
+ _.info512, CastInfo.info512, "">, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256 : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
+ _.info256, CastInfo.info256,
+ EVEX2VEXOvrd>, EVEX_V256;
+}
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256,
+ avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256,
+ avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256,
+ avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256,
+ avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
let Predicates = [HasAVX512] in {
// Provide fallback in case the load node that is used in the broadcast
@@ -9230,20 +10434,61 @@ def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
0)>;
}
-multiclass avx512_valign<string OpcodeStr, OpndItins itins,
- AVX512VLVectorVTInfo VTInfo_I> {
- defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign, itins>,
- AVX512AIi8Base, EVEX_4V;
+multiclass avx512_valign<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
+ // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the
+ // instantiation of this class.
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$src3)))>,
+ Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86VAlign _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)),
+ (i8 imm:$src3)))>,
+ Sched<[sched.Folded, ReadAfterLd]>,
+ EVEX2VEXOverride<"VPALIGNRrmi">;
+
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (X86VAlign _.RC:$src1,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i8 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
+ }
+}
+
+multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_valign<0x03, OpcodeStr, sched.ZMM, _.info512>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ // We can't really override the 256-bit version so change it back to unset.
+ let EVEX2VEXOverride = ? in
+ defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ }
}
-defm VALIGND: avx512_valign<"valignd", SSE_PALIGN, avx512vl_i32_info>,
- EVEX_CD8<32, CD8VF>;
-defm VALIGNQ: avx512_valign<"valignq", SSE_PALIGN, avx512vl_i64_info>,
- EVEX_CD8<64, CD8VF>, VEX_W;
+defm VALIGND: avx512_valign_common<"valignd", SchedWriteShuffle,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign_common<"valignq", SchedWriteShuffle,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>,
+ VEX_W;
-defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", SSE_PALIGN,
- avx512vl_i8_info, avx512vl_i8_info>,
- EVEX_CD8<8, CD8VF>;
+defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr",
+ SchedWriteShuffle, avx512vl_i8_info,
+ avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
// Fragments to help convert valignq into masked valignd. Or valignq/valignd
// into vpalignr.
@@ -9363,97 +10608,100 @@ let Predicates = [HasVLX, HasBWI] in {
}
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",
- SSE_INTMUL_ITINS_P, avx512vl_i16_info, avx512vl_i8_info>,
- EVEX_CD8<8, CD8VF>;
+ SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>,
+ EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible;
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode _.RC:$src1)), itins.rr>, EVEX, AVX5128IBase,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1)))), itins.rm>,
+ (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
}
}
multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> :
- avx512_unary_rm<opc, OpcodeStr, OpNode, itins, _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> :
+ avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1), OpcodeStr,
"${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr,
(_.VT (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src1)))), itins.rm>,
+ (_.ScalarLdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded]>;
+ Sched<[sched.Folded]>;
}
multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
- Predicate prd> {
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info512>,
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info256>,
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info128>,
+ defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo,
Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512>,
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256>,
+ defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128>,
+ defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd> {
- defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
+ defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, sched,
avx512vl_i64_info, prd>, VEX_W;
- defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, itins,
+ defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, sched,
avx512vl_i32_info, prd>;
}
multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd> {
- defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
+ defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, sched,
avx512vl_i16_info, prd>, VEX_WIG;
- defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, itins,
+ defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, sched,
avx512vl_i8_info, prd>, VEX_WIG;
}
multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
bits<8> opc_d, bits<8> opc_q,
string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, itins,
+ X86SchedWriteWidths sched> {
+ defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, sched,
HasAVX512>,
- avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, itins,
+ avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, sched,
HasBWI>;
}
-defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs, SSE_PABS>;
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
+ SchedWriteVecALU>;
// VPABS: Use the 512-bit version to implement 128/256-bit in the NoVLX case.
let Predicates = [HasAVX512, NoVLX] in {
@@ -9491,13 +10739,12 @@ multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
}
}
-// FIXME: Is there a better scheduler itinerary for VPLZCNT?
defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz,
- SSE_INTALU_ITINS_P, HasCDI>;
+ SchedWriteVecIMul, HasCDI>;
-// FIXME: Is there a better scheduler itinerary for VPCONFLICT?
+// FIXME: Is there a better scheduler class for VPCONFLICT?
defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict,
- SSE_INTALU_ITINS_P, HasCDI>;
+ SchedWriteVecALU, HasCDI>;
// VPLZCNT: Use the 512-bit version to implement 128/256-bit in the NoVLX case.
defm : avx512_unary_lowering<"VPLZCNTQ", ctlz, avx512vl_i64_info, HasCDI>;
@@ -9507,9 +10754,9 @@ defm : avx512_unary_lowering<"VPLZCNTD", ctlz, avx512vl_i32_info, HasCDI>;
// Counts number of ones - VPOPCNTD and VPOPCNTQ
//===---------------------------------------------------------------------===//
-// FIXME: Is there a better scheduler itinerary for VPOPCNTD/VPOPCNTQ?
+// FIXME: Is there a better scheduler class for VPOPCNTD/VPOPCNTQ?
defm VPOPCNT : avx512_unary_rm_vl_dq<0x55, 0x55, "vpopcnt", ctpop,
- SSE_INTALU_ITINS_P, HasVPOPCNTDQ>;
+ SchedWriteVecALU, HasVPOPCNTDQ>;
defm : avx512_unary_lowering<"VPOPCNTQ", ctpop, avx512vl_i64_info, HasVPOPCNTDQ>;
defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
@@ -9517,71 +10764,74 @@ defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>
//===---------------------------------------------------------------------===//
// Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
+
multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, itins,
+ X86SchedWriteWidths sched> {
+ defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, sched,
avx512vl_f32_info, HasAVX512>, XS;
}
-defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup, SSE_MOVDDUP>;
-defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, SSE_MOVDDUP>;
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup,
+ SchedWriteFShuffle>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup,
+ SchedWriteFShuffle>;
//===----------------------------------------------------------------------===//
// AVX-512 - MOVDDUP
//===----------------------------------------------------------------------===//
multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT _.RC:$src))), itins.rr>, EVEX,
- Sched<[itins.Sched]>;
+ (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX,
+ Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src))))),
- itins.rm>, EVEX, EVEX_CD8<_.EltSize, CD8VH>,
- Sched<[itins.Sched.Folded]>;
+ (_.ScalarLdFrag addr:$src)))))>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VH>,
+ Sched<[sched.Folded]>;
}
}
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
-
- defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info512>, EVEX_V512;
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo> {
+ defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.ZMM,
+ VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, itins, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM,
+ VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, sched.XMM,
+ VTInfo.info128>, EVEX_V128;
}
}
multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, itins,
+ X86SchedWriteWidths sched> {
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, sched,
avx512vl_f64_info>, XD, VEX_W;
}
-defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SSE_MOVDDUP>;
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>;
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
- (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+ (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
- (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(bitconvert (v4i32 immAllZerosV))),
- (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+ (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(v2f64 VR128X:$src0)),
@@ -9601,28 +10851,29 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$sr
//===----------------------------------------------------------------------===//
// AVX-512 - Unpack Instructions
//===----------------------------------------------------------------------===//
+
defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
- SSE_ALU_ITINS_S>;
+ SchedWriteFShuffleSizes, 0, 1>;
defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
- SSE_ALU_ITINS_S>;
+ SchedWriteFShuffleSizes>;
defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
- SSE_INTALU_ITINS_P, HasBWI>;
+ SchedWriteShuffle, HasBWI>;
defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
- SSE_INTALU_ITINS_P, HasAVX512>;
+ SchedWriteShuffle, HasAVX512>;
//===----------------------------------------------------------------------===//
// AVX-512 - Extract & Insert Integer Instructions
@@ -9635,7 +10886,7 @@ multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), imm:$src2))),
addr:$dst)]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd]>;
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>;
}
multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
@@ -9645,7 +10896,7 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
(X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, TAPD, Sched<[WriteShuffle]>;
+ EVEX, TAPD, Sched<[WriteVecExtract]>;
defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
}
@@ -9657,15 +10908,15 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
- (X86pextrw (_.VT _.RC:$src1), imm:$src2))],
- IIC_SSE_PEXTRW>, EVEX, PD, Sched<[WriteShuffle]>;
+ (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, PD, Sched<[WriteVecExtract]>;
- let hasSideEffects = 0 in
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
- OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_PEXTRW>, EVEX, TAPD, FoldGenData<NAME#rr>,
- Sched<[WriteShuffle]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX, TAPD, FoldGenData<NAME#rr>,
+ Sched<[WriteVecExtract]>;
defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
}
@@ -9679,7 +10930,7 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GRC:$dst,
(extractelt (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, TAPD, Sched<[WriteShuffle]>;
+ EVEX, TAPD, Sched<[WriteVecExtract]>;
def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
@@ -9687,7 +10938,7 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
[(store (extractelt (_.VT _.RC:$src1),
imm:$src2),addr:$dst)]>,
EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteVecExtractSt]>;
}
}
@@ -9703,7 +10954,7 @@ multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -9714,7 +10965,7 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
}
@@ -9728,7 +10979,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
- EVEX_4V, TAPD, Sched<[WriteShuffle]>;
+ EVEX_4V, TAPD, Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
_.ScalarLdFrag>, TAPD;
@@ -9747,10 +10998,11 @@ defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
//===----------------------------------------------------------------------===//
multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
- AVX512VLVectorVTInfo VTInfo_FP>{
+ AVX512VLVectorVTInfo VTInfo_FP>{
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp,
- SSE_SHUFP>, EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
- AVX512AIi8Base, EVEX_4V;
+ SchedWriteFShuffle>,
+ EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX_4V;
}
defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
@@ -9760,85 +11012,80 @@ defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD,
// AVX-512 - Byte shift Left/Right
//===----------------------------------------------------------------------===//
-let Sched = WriteVecShift in
-def AVX512_BYTESHIFT : OpndItins<
- IIC_SSE_INTSHDQ_P_RI, IIC_SSE_INTSHDQ_P_RI
->;
-
+// FIXME: The SSE/AVX names are PSLLDQri etc. - should we add the i here as well?
multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
def rr : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
+ Sched<[sched]>;
def rm : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i8 imm:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr,
- OpndItins itins, Predicate prd>{
+ X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in
- defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, itins, v64i8_info>, EVEX_V512;
+ defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.ZMM, v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, itins, v32i8x_info>, EVEX_V256;
- defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, itins, v16i8x_info>, EVEX_V128;
+ defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.YMM, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.XMM, v16i8x_info>, EVEX_V128;
}
}
defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
- AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base,
- EVEX_4V, VEX_WIG;
+ SchedWriteShuffle, HasBWI>,
+ AVX512PDIi8Base, EVEX_4V, VEX_WIG;
defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
- AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base,
- EVEX_4V, VEX_WIG;
-
+ SchedWriteShuffle, HasBWI>,
+ AVX512PDIi8Base, EVEX_4V, VEX_WIG;
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
- string OpcodeStr, OpndItins itins,
+ string OpcodeStr, X86FoldableSchedWrite sched,
X86VectorVTInfo _dst, X86VectorVTInfo _src> {
def rr : AVX512BI<opc, MRMSrcReg,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
- (_src.VT _src.RC:$src2))))], itins.rr>,
- Sched<[itins.Sched]>;
+ (_src.VT _src.RC:$src2))))]>,
+ Sched<[sched]>;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT (bitconvert
- (_src.LdFrag addr:$src2))))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (_src.LdFrag addr:$src2))))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
- string OpcodeStr, OpndItins itins,
+ string OpcodeStr, X86SchedWriteWidths sched,
Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v8i64_info,
- v64i8_info>, EVEX_V512;
+ defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.ZMM,
+ v8i64_info, v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v4i64x_info,
- v32i8x_info>, EVEX_V256;
- defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v2i64x_info,
- v16i8x_info>, EVEX_V128;
+ defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.YMM,
+ v4i64x_info, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.XMM,
+ v2i64x_info, v16i8x_info>, EVEX_V128;
}
}
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
- SSE_MPSADBW_ITINS, HasBWI>, EVEX_4V, VEX_WIG;
+ SchedWritePSADBW, HasBWI>, EVEX_4V, VEX_WIG;
// Transforms to swizzle an immediate to enable better matching when
// memory operand isn't in the right place.
@@ -9903,7 +11150,8 @@ def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
}]>;
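A rough sketch in C of what these immediate-swizzle transforms do (not part of the patch; it assumes the first vpternlog operand drives the most significant truth-table index bit, and the helper name is invented): when two operands of the logical function trade places, the 8-bit truth table must be permuted the same way so the instruction still computes the original function.

#include <stdint.h>

/* Remap a vpternlog truth-table immediate for a function whose first and
 * third operands have been exchanged, i.e. build G with G(x,y,z) = F(z,y,x).
 * Index convention assumed here: bit (op1<<2)|(op2<<1)|op3 of the immediate
 * holds F(op1,op2,op3). */
static uint8_t ternlog_swap_ops_1_3(uint8_t imm) {
    uint8_t out = 0;
    for (int a = 0; a < 2; ++a)
        for (int b = 0; b < 2; ++b)
            for (int c = 0; c < 2; ++c)
                if (imm & (1u << ((a << 2) | (b << 1) | c)))
                    out |= (uint8_t)(1u << ((c << 2) | (b << 1) | a));
    return out;
}

For example, immediate 0xF0 (result = operand 1) remaps to 0xAA (result = operand 3). The VPTERNLOGxyz_imm8 transforms here are the TableGen equivalents of such remappings for the specific operand orders that the extra load and broadcast patterns match.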
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
@@ -9911,17 +11159,17 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
- (i8 imm:$src4)), itins.rr, 1, 1>,
- AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>;
+ (i8 imm:$src4)), 1, 1>,
+ AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
- (i8 imm:$src4)), itins.rm, 1, 0>,
+ (i8 imm:$src4)), 1, 0>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
@@ -9929,32 +11177,32 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- (i8 imm:$src4)), itins.rm, 1, 0>, EVEX_B,
+ (i8 imm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
// Additional patterns for matching loads in other positions.
def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching zero masking with loads in other
@@ -9963,13 +11211,13 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching masked loads with different
@@ -9978,42 +11226,42 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
// Additional patterns for matching broadcasts in other positions.
def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4))),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching zero masking with broadcasts in other
@@ -10022,7 +11270,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
@@ -10030,7 +11278,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.ImmAllZerosV)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG132_imm8 imm:$src4))>;
@@ -10041,90 +11289,129 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
(i8 imm:$src4)), _.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
}
-multiclass avx512_common_ternlog<string OpcodeStr, OpndItins itins,
+multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info512>, EVEX_V512;
+ defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.ZMM,
+ _.info512, NAME>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info128>, EVEX_V128;
- defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.XMM,
+ _.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.YMM,
+ _.info256, NAME>, EVEX_V256;
}
}
-defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SSE_INTALU_ITINS_P,
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
avx512vl_i32_info>;
-defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SSE_INTALU_ITINS_P,
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
avx512vl_i64_info>, VEX_W;
+// Patterns to implement vnot using vpternlog instead of creating all ones
+// using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen
+// so that the result is only dependent on src0. But we use the same source
+// for all operands to prevent a false dependency.
+// TODO: We should maybe have a more generalized algorithm for folding to
+// vpternlog.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8i64 (xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV)))),
+ (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (i8 15)), sub_xmm)>;
+ def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (i8 15)), sub_ymm)>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+ (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+ def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+ (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+}
+
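As a side note on the vnot-via-vpternlog idiom the comment above describes: with all three sources tied to the same register and immediate 15 (0x0F), each result bit is 1 exactly when the corresponding source bit is 0, so no all-ones constant needs to be materialized first. A minimal C sketch using the Intel intrinsics (not part of the patch; assumes an AVX-512F target, e.g. compiled with -mavx512f):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise NOT of a 512-bit vector in one instruction: vpternlog with all
 * three sources equal and truth table 0x0F (1 where the source bit is 0). */
static inline __m512i vnot512(__m512i x) {
    return _mm512_ternarylogic_epi64(x, x, x, 0x0F);
}

int main(void) {
    uint64_t out[8];
    __m512i x = _mm512_set1_epi64(0x00FF00FF00FF00FFLL);
    _mm512_storeu_si512(out, vnot512(x));
    printf("%016llx\n", (unsigned long long)out[0]); /* ff00ff00ff00ff00 */
    return 0;
}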
//===----------------------------------------------------------------------===//
// AVX-512 - FixupImm
//===----------------------------------------------------------------------===//
multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _>{
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ X86VectorVTInfo TblVT>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT _.RC:$src3),
+ (TblVT.VT _.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT (bitconvert (_.LdFrag addr:$src3))),
+ (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
}
multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins,
- X86VectorVTInfo _>{
+ SDNode OpNode, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo TblVT>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
@@ -10132,15 +11419,15 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
"$src2, $src3, {sae}, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.IntVT _.RC:$src3),
+ (TblVT.VT _.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_NO_EXC)), itins.rr>,
- EVEX_B, Sched<[itins.Sched]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched]>;
}
}
multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo _src3VT> {
let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
ExeDomain = _.ExeDomain in {
@@ -10151,7 +11438,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>;
+ (i32 FROUND_CURRENT))>, Sched<[sched]>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -10160,8 +11447,8 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_NO_EXC)), itins.rm>,
- EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_NO_EXC))>,
+ EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -10170,37 +11457,40 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_src3VT.VT (scalar_to_vector
(_src3VT.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT)), itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_fixupimm_packed_all<OpndItins itins, AVX512VLVectorVTInfo _Vec> {
+multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _Vec,
+ AVX512VLVectorVTInfo _Tbl> {
let Predicates = [HasAVX512] in
- defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info512>,
- avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info512>, AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
+ _Vec.info512, _Tbl.info512>,
+ avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
+ _Vec.info512, _Tbl.info512>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info128>, AVX512AIi8Base, EVEX_4V, EVEX_V128;
- defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
- _Vec.info256>, AVX512AIi8Base, EVEX_4V, EVEX_V256;
- }
-}
-
-defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
- SSE_ALU_F32S, f32x_info, v4i32x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
- SSE_ALU_F64S, f64x_info, v2i64x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
-defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SSE_ALU_F32P, avx512vl_f32_info>,
- EVEX_CD8<32, CD8VF>;
-defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SSE_ALU_F64P, avx512vl_f64_info>,
- EVEX_CD8<64, CD8VF>, VEX_W;
-
-
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.XMM,
+ _Vec.info128, _Tbl.info128>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V128;
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.YMM,
+ _Vec.info256, _Tbl.info256>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V256;
+ }
+}
+
+defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ SchedWriteFAdd.Scl, f32x_info, v4i32x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ SchedWriteFAdd.Scl, f64x_info, v2i64x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
@@ -10244,69 +11534,85 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SSE_ALU_F64P, avx512vl_f64_info>,
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode MoveNode,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
- FR32X:$src))))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+ def : Pat<(MoveNode
+ (_.VT VR128X:$dst),
+ (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+ _.FRC:$src)))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
- // vector math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
- (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+ // extracted masked scalar math op with insert via movss
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2),
+ _.FRC:$src0))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrr_Intk)
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+ VK1WM:$mask, _.VT:$src1,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
// extracted masked scalar math op with insert via movss
- def : Pat<(X86Movss (v4f32 VR128X:$src1),
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
(X86selects VK1WM:$mask,
- (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))),
- FR32X:$src2),
- FR32X:$src0))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X),
- VK1WM:$mask, v4f32:$src1,
- (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#Zrr_Intkz)
+ VK1WM:$mask, _.VT:$src1,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
}
}
-defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">;
-defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">;
-defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">;
-defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">;
+defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
-multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
+
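For context on what the scalar-math patterns above are matching: the generic DAG shape (X86Movss dst, (scalar_to_vector (fadd (extractelt dst, 0), b))) is what front ends produce when only lane 0 of a vector is updated. A hedged C sketch of such source code, written with SSE intrinsics rather than the TableGen of this patch (the function name is illustrative):

#include <immintrin.h>

/* Add a scalar into lane 0 of a vector, spelled out as extract / scalar add /
 * re-insert; with optimization this is expected to collapse into a single
 * vaddss-style instruction via patterns like the ones above. */
__m128 add_into_lane0(__m128 dst, float b) {
    float lo = _mm_cvtss_f32(dst);   /* extractelt dst, 0             */
    __m128 t = _mm_set_ss(lo + b);   /* scalar_to_vector (fadd lo, b) */
    return _mm_move_ss(dst, t);      /* X86Movss dst, t               */
}

The masked variants added above cover the same shape wrapped in an X86selects, so a predicated scalar op can still be selected to the Zrr_Intk/Zrr_Intkz instruction forms.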
+multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
+ SDNode Move, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
- // extracted scalar math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
- FR64X:$src))))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
- // vector math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
- (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+ def : Pat<(_.VT (Move _.VT:$dst,
+ (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src)>;
+ }
+}
- // extracted masked scalar math op with insert via movss
- def : Pat<(X86Movsd (v2f64 VR128X:$src1),
- (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))),
- FR64X:$src2),
- FR64X:$src0))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X),
- VK1WM:$mask, v2f64:$src1,
- (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
+defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
+defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
+
+multiclass AVX512_scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix,
+ SDNode Move, X86VectorVTInfo _,
+ bits<8> ImmV> {
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.VT (Move _.VT:$dst,
+ (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src,
+ (i32 ImmV))>;
}
}
-defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
-defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
-defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
-defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
+defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESS", X86Movss,
+ v4f32x_info, 0x01>;
+defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESS", X86Movss,
+ v4f32x_info, 0x02>;
+defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESD", X86Movsd,
+ v2f64x_info, 0x01>;
+defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESD", X86Movsd,
+ v2f64x_info, 0x02>;
//===----------------------------------------------------------------------===//
// AES instructions
@@ -10362,27 +11668,27 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
//===----------------------------------------------------------------------===//
multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in {
defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
- (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3)),
- itins.rr>, AVX512FMA3Base, Sched<[itins.Sched]>;
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+ AVX512FMA3Base, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (bitconvert (VTI.LdFrag addr:$src3))))),
- itins.rm>, AVX512FMA3Base,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>,
+ AVX512FMA3Base,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI>
- : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
+ : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in
defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -10390,66 +11696,74 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3)))),
- itins.rm>, AVX512FMA3Base, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>,
+ AVX512FMA3Base, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTI> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512;
+ defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256;
- defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128;
+ defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ EVEX_V128;
}
}
multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, AVX512VLVectorVTInfo VTI> {
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512;
+ defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256;
- defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128;
+ defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ EVEX_V128;
}
}
multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
- SDNode OpNode, OpndItins itins> {
- defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, itins,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, sched,
avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
- defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, itins,
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, sched,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
- defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, itins,
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, sched,
avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
- SDNode OpNode, OpndItins itins> {
- defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", itins,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", sched,
avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp,
- OpNode, itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode,
- itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+ sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
}
// Concat & Shift
-defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SSE_INTMUL_ITINS_P>;
-defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SSE_INTMUL_ITINS_P>;
-defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SSE_INTMUL_ITINS_P>;
-defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SSE_INTMUL_ITINS_P>;
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>;
+defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>;
// Compress
-defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", AVX512_COMPRESS,
- avx512vl_i8_info, HasVBMI2>, EVEX;
-defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", AVX512_COMPRESS,
- avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W;
+defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", WriteVarShuffle256,
+ avx512vl_i8_info, HasVBMI2>, EVEX,
+ NotMemoryFoldable;
+defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", WriteVarShuffle256,
+ avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W,
+ NotMemoryFoldable;
// Expand
-defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", AVX512_EXPAND,
+defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", WriteVarShuffle256,
avx512vl_i8_info, HasVBMI2>, EVEX;
-defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", AVX512_EXPAND,
+defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256,
avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W;
//===----------------------------------------------------------------------===//
@@ -10458,113 +11772,116 @@ defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", AVX512_EXPAND,
let Constraints = "$src1 = $dst" in
multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1,
- VTI.RC:$src2, VTI.RC:$src3)),
- itins.rr>, EVEX_4V, T8PD, Sched<[itins.Sched]>;
+ VTI.RC:$src2, VTI.RC:$src3))>,
+ EVEX_4V, T8PD, Sched<[sched]>;
defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (bitconvert
- (VTI.LdFrag addr:$src3))))),
- itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.LdFrag addr:$src3)))))>,
+ EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
+ Sched<[sched.Folded, ReadAfterLd]>;
defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3),
OpStr, "${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (X86VBroadcast
- (VTI.ScalarLdFrag addr:$src3)))),
- itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
- T8PD, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.ScalarLdFrag addr:$src3))))>,
+ EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
+ T8PD, Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, OpndItins itins> {
+multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
let Predicates = [HasVNNI] in
- defm Z : VNNI_rmb<Op, OpStr, OpNode, itins, v16i32_info>, EVEX_V512;
+ defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info>, EVEX_V512;
let Predicates = [HasVNNI, HasVLX] in {
- defm Z256 : VNNI_rmb<Op, OpStr, OpNode, itins, v8i32x_info>, EVEX_V256;
- defm Z128 : VNNI_rmb<Op, OpStr, OpNode, itins, v4i32x_info>, EVEX_V128;
+ defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info>, EVEX_V256;
+ defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info>, EVEX_V128;
}
}
-// FIXME: Is there a better scheduler itinerary for VPDP?
-defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SSE_PMADD>;
-defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SSE_PMADD>;
-defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SSE_PMADD>;
-defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SSE_PMADD>;
+// FIXME: Is there a better scheduler class for VPDP?
+defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>;
+defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>;
+defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>;
+defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>;
//===----------------------------------------------------------------------===//
// Bit Algorithms
//===----------------------------------------------------------------------===//
-// FIXME: Is there a better scheduler itinerary for VPOPCNTB/VPOPCNTW?
-defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SSE_INTALU_ITINS_P,
+// FIXME: Is there a better scheduler class for VPOPCNTB/VPOPCNTW?
+defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SchedWriteVecALU,
avx512vl_i8_info, HasBITALG>;
-defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SSE_INTALU_ITINS_P,
+defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU,
avx512vl_i16_info, HasBITALG>, VEX_W;
defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;
-multiclass VPSHUFBITQMB_rm<OpndItins itins, X86VectorVTInfo VTI> {
+multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.RC:$src2),
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
- (VTI.VT VTI.RC:$src2)), itins.rr>, EVEX_4V, T8PD,
- Sched<[itins.Sched]>;
+ (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
+ Sched<[sched]>;
defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.MemOp:$src2),
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
- (VTI.VT (bitconvert (VTI.LdFrag addr:$src2)))),
- itins.rm>, EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (VTI.VT (bitconvert (VTI.LdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
-multiclass VPSHUFBITQMB_common<OpndItins itins, AVX512VLVectorVTInfo VTI> {
+multiclass VPSHUFBITQMB_common<X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasBITALG] in
- defm Z : VPSHUFBITQMB_rm<itins, VTI.info512>, EVEX_V512;
+ defm Z : VPSHUFBITQMB_rm<sched.ZMM, VTI.info512>, EVEX_V512;
let Predicates = [HasBITALG, HasVLX] in {
- defm Z256 : VPSHUFBITQMB_rm<itins, VTI.info256>, EVEX_V256;
- defm Z128 : VPSHUFBITQMB_rm<itins, VTI.info128>, EVEX_V128;
+ defm Z256 : VPSHUFBITQMB_rm<sched.YMM, VTI.info256>, EVEX_V256;
+ defm Z128 : VPSHUFBITQMB_rm<sched.XMM, VTI.info128>, EVEX_V128;
}
}
-// FIXME: Is there a better scheduler itinerary for VPSHUFBITQMB?
-defm VPSHUFBITQMB : VPSHUFBITQMB_common<SSE_INTMUL_ITINS_P, avx512vl_i8_info>;
+// FIXME: Is there a better scheduler class for VPSHUFBITQMB?
+defm VPSHUFBITQMB : VPSHUFBITQMB_common<SchedWriteVecIMul, avx512vl_i8_info>;
//===----------------------------------------------------------------------===//
// GFNI
//===----------------------------------------------------------------------===//
-multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode> {
+multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
let Predicates = [HasGFNI, HasAVX512, HasBWI] in
- defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info,
- SSE_INTALU_ITINS_P, 1>, EVEX_V512;
+ defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info, sched.ZMM, 1>,
+ EVEX_V512;
let Predicates = [HasGFNI, HasVLX, HasBWI] in {
- defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info,
- SSE_INTALU_ITINS_P, 1>, EVEX_V256;
- defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info,
- SSE_INTALU_ITINS_P, 1>, EVEX_V128;
+ defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info, sched.YMM, 1>,
+ EVEX_V256;
+ defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info, sched.XMM, 1>,
+ EVEX_V128;
}
}
-defm GF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb>,
- EVEX_CD8<8, CD8VF>, T8PD;
+defm VGF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb,
+ SchedWriteVecALU>,
+ EVEX_CD8<8, CD8VF>, T8PD;
multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins, X86VectorVTInfo VTI,
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
X86VectorVTInfo BcstVTI>
- : avx512_3Op_rm_imm8<Op, OpStr, OpNode, itins, VTI, VTI> {
+ : avx512_3Op_rm_imm8<Op, OpStr, OpNode, sched, VTI, VTI> {
let ExeDomain = VTI.ExeDomain in
defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3),
@@ -10572,27 +11889,78 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
"$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3",
(OpNode (VTI.VT VTI.RC:$src1),
(bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))),
- (i8 imm:$src3)), itins.rm>, EVEX_B,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
- OpndItins itins> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasGFNI, HasAVX512, HasBWI] in
- defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v64i8_info,
- v8i64_info>, EVEX_V512;
+ defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.ZMM,
+ v64i8_info, v8i64_info>, EVEX_V512;
let Predicates = [HasGFNI, HasVLX, HasBWI] in {
- defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v32i8x_info,
- v4i64x_info>, EVEX_V256;
- defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v16i8x_info,
- v2i64x_info>, EVEX_V128;
+ defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.YMM,
+ v32i8x_info, v4i64x_info>, EVEX_V256;
+ defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.XMM,
+ v16i8x_info, v2i64x_info>, EVEX_V128;
}
}
-defm GF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
- X86GF2P8affineinvqb, SSE_INTMUL_ITINS_P>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
-defm GF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
- X86GF2P8affineqb, SSE_INTMUL_ITINS_P>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
+ X86GF2P8affineinvqb, SchedWriteVecIMul>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
+ X86GF2P8affineqb, SchedWriteVecIMul>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+
+
+//===----------------------------------------------------------------------===//
+// AVX5124FMAPS
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle,
+ Constraints = "$src1 = $dst" in {
+defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "v4fmaddps", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "v4fnmaddps", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
+ "v4fmaddss", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ Sched<[SchedWriteFMA.Scl.Folded]>;
+
+defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
+ "v4fnmaddss", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ Sched<[SchedWriteFMA.Scl.Folded]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX5124VNNIW
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt,
+ Constraints = "$src1 = $dst" in {
+defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "vp4dpwssd", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "vp4dpwssds", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
index 98cc8fb7439e..c444fa761960 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -18,24 +18,24 @@ let SchedRW = [WriteLEA] in {
let hasSideEffects = 0 in
def LEA16r : I<0x8D, MRMSrcMem,
(outs GR16:$dst), (ins anymem:$src),
- "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
+ "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16;
let isReMaterializable = 1 in
def LEA32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins anymem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ [(set GR32:$dst, lea32addr:$src)]>,
OpSize32, Requires<[Not64BitMode]>;
def LEA64_32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins lea64_32mem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+ [(set GR32:$dst, lea64_32addr:$src)]>,
OpSize32, Requires<[In64BitMode]>;
let isReMaterializable = 1 in
def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
"lea{q}\t{$src|$dst}, {$dst|$src}",
- [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+ [(set GR64:$dst, lea64addr:$src)]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -63,24 +63,24 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
[(set AL, (mul AL, GR8:$src)),
- (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+ (implicit EFLAGS)]>, Sched<[WriteIMul]>;
// AX,DX = AX*GR16
let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
"mul{w}\t$src",
- [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
+ []>, OpSize16, Sched<[WriteIMul]>;
// EAX,EDX = EAX*GR32
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
"mul{l}\t$src",
- [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
- IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
+ [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>,
+ OpSize32, Sched<[WriteIMul]>;
// RAX,RDX = RAX*GR64
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
"mul{q}\t$src",
- [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
- IIC_MUL64>, Sched<[WriteIMul]>;
+ [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>,
+ Sched<[WriteIMul64]>;
// AL,AH = AL*[mem8]
let Defs = [AL,EFLAGS,AX], Uses = [AL] in
def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
@@ -89,62 +89,58 @@ def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
[(set AL, (mul AL, (loadi8 addr:$src))),
- (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+ (implicit EFLAGS)]>, SchedLoadReg<WriteIMul.Folded>;
// AX,DX = AX*[mem16]
let mayLoad = 1, hasSideEffects = 0 in {
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
- "mul{w}\t$src",
- [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>;
+ "mul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul.Folded>;
// EAX,EDX = EAX*[mem32]
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
- "mul{l}\t$src",
- [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>;
+ "mul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul.Folded>;
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
- "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>,
+ "mul{q}\t$src", []>, SchedLoadReg<WriteIMul64.Folded>,
Requires<[In64BitMode]>;
}
let hasSideEffects = 0 in {
// AL,AH = AL*GR8
let Defs = [AL,EFLAGS,AX], Uses = [AL] in
-def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [],
- IIC_IMUL8>, Sched<[WriteIMul]>;
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>,
+ Sched<[WriteIMul]>;
// AX,DX = AX*GR16
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
-def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [],
- IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>;
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>,
+ OpSize16, Sched<[WriteIMul]>;
// EAX,EDX = EAX*GR32
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
-def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [],
- IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>;
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>,
+ OpSize32, Sched<[WriteIMul]>;
// RAX,RDX = RAX*GR64
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
-def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
- IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", []>,
+ Sched<[WriteIMul64]>;
let mayLoad = 1 in {
// AL,AH = AL*[mem8]
let Defs = [AL,EFLAGS,AX], Uses = [AL] in
def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
- "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+ "imul{b}\t$src", []>, SchedLoadReg<WriteIMul.Folded>;
// AX,DX = AX*[mem16]
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
- "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16,
- SchedLoadReg<WriteIMulLd>;
+ "imul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul.Folded>;
// EAX,EDX = EAX*[mem32]
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
- "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32,
- SchedLoadReg<WriteIMulLd>;
+ "imul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul.Folded>;
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
- "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>,
+ "imul{q}\t$src", []>, SchedLoadReg<WriteIMul64.Folded>,
Requires<[In64BitMode]>;
}
} // hasSideEffects
@@ -153,218 +149,195 @@ def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst" in {
-let isCommutable = 1, SchedRW = [WriteIMul] in {
+let isCommutable = 1 in {
// X = IMUL Y, Z --> X = IMUL Z, Y
// Register-Register Signed Integer Multiply
def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
"imul{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
- TB, OpSize16;
+ (X86smul_flag GR16:$src1, GR16:$src2))]>,
+ Sched<[WriteIMul]>, TB, OpSize16;
def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
- TB, OpSize32;
+ (X86smul_flag GR32:$src1, GR32:$src2))]>,
+ Sched<[WriteIMul]>, TB, OpSize32;
def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
- TB;
-} // isCommutable, SchedRW
+ (X86smul_flag GR64:$src1, GR64:$src2))]>,
+ Sched<[WriteIMul64]>, TB;
+} // isCommutable
// Register-Memory Signed Integer Multiply
-let SchedRW = [WriteIMulLd, ReadAfterLd] in {
def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$src1, i16mem:$src2),
"imul{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, (load addr:$src2)))],
- IIC_IMUL16_RM>,
- TB, OpSize16;
+ (X86smul_flag GR16:$src1, (loadi16 addr:$src2)))]>,
+ Sched<[WriteIMul.Folded, ReadAfterLd]>, TB, OpSize16;
def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, (load addr:$src2)))],
- IIC_IMUL32_RM>,
- TB, OpSize32;
+ (X86smul_flag GR32:$src1, (loadi32 addr:$src2)))]>,
+ Sched<[WriteIMul.Folded, ReadAfterLd]>, TB, OpSize32;
def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i64mem:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, (load addr:$src2)))],
- IIC_IMUL64_RM>,
- TB;
-} // SchedRW
+ (X86smul_flag GR64:$src1, (loadi64 addr:$src2)))]>,
+ Sched<[WriteIMul64.Folded, ReadAfterLd]>, TB;
} // Constraints = "$src1 = $dst"
} // Defs = [EFLAGS]
// Surprisingly enough, these are not two address instructions!
let Defs = [EFLAGS] in {
-let SchedRW = [WriteIMul] in {
// Register-Integer Signed Integer Multiply
def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, imm:$src2))],
- IIC_IMUL16_RRI>, OpSize16;
+ (X86smul_flag GR16:$src1, imm:$src2))]>,
+ Sched<[WriteIMul]>, OpSize16;
def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
- IIC_IMUL16_RRI>, OpSize16;
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>,
+ Sched<[WriteIMul]>, OpSize16;
def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
(outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, imm:$src2))],
- IIC_IMUL32_RRI>, OpSize32;
+ (X86smul_flag GR32:$src1, imm:$src2))]>,
+ Sched<[WriteIMul]>, OpSize32;
def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
(outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
- IIC_IMUL32_RRI>, OpSize32;
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>,
+ Sched<[WriteIMul]>, OpSize32;
def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
(outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
- IIC_IMUL64_RRI>;
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64]>;
def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
(outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
- IIC_IMUL64_RRI>;
-} // SchedRW
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>,
+ Sched<[WriteIMul64]>;
// Memory-Integer Signed Integer Multiply
-let SchedRW = [WriteIMulLd] in {
def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
(outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1), imm:$src2))],
- IIC_IMUL16_RMI>,
- OpSize16;
+ (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize16;
def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i16immSExt8:$src2))], IIC_IMUL16_RMI>,
- OpSize16;
+ (X86smul_flag (loadi16 addr:$src1),
+ i16immSExt8:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize16;
def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
(outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1), imm:$src2))],
- IIC_IMUL32_RMI>, OpSize32;
+ (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize32;
def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i32immSExt8:$src2))],
- IIC_IMUL32_RMI>, OpSize32;
+ (X86smul_flag (loadi32 addr:$src1),
+ i32immSExt8:$src2))]>,
+ Sched<[WriteIMul.Folded]>, OpSize32;
def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
(outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i64immSExt32:$src2))],
- IIC_IMUL64_RMI>;
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64.Folded]>;
def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
(outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1),
- i64immSExt8:$src2))],
- IIC_IMUL64_RMI>;
-} // SchedRW
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt8:$src2))]>,
+ Sched<[WriteIMul64.Folded]>;
} // Defs = [EFLAGS]
-
-
-
// unsigned division/remainder
let hasSideEffects = 1 in { // so that we don't speculatively execute
-let SchedRW = [WriteIDiv] in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
- "div{b}\t$src", [], IIC_DIV8_REG>;
+ "div{b}\t$src", []>, Sched<[WriteDiv8]>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "div{w}\t$src", [], IIC_DIV16>, OpSize16;
+ "div{w}\t$src", []>, Sched<[WriteDiv16]>, OpSize16;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "div{l}\t$src", [], IIC_DIV32>, OpSize32;
+ "div{l}\t$src", []>, Sched<[WriteDiv32]>, OpSize32;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
- "div{q}\t$src", [], IIC_DIV64>;
-} // SchedRW
+ "div{q}\t$src", []>, Sched<[WriteDiv64]>;
let mayLoad = 1 in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
- "div{b}\t$src", [], IIC_DIV8_MEM>,
- SchedLoadReg<WriteIDivLd>;
+ "div{b}\t$src", []>, SchedLoadReg<WriteDiv8.Folded>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "div{w}\t$src", [], IIC_DIV16>, OpSize16,
- SchedLoadReg<WriteIDivLd>;
+ "div{w}\t$src", []>, OpSize16, SchedLoadReg<WriteDiv16.Folded>;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
- "div{l}\t$src", [], IIC_DIV32>,
- SchedLoadReg<WriteIDivLd>, OpSize32;
+ "div{l}\t$src", []>, SchedLoadReg<WriteDiv32.Folded>, OpSize32;
// RDX:RAX/[mem64] = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
- "div{q}\t$src", [], IIC_DIV64>,
- SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>;
+ "div{q}\t$src", []>, SchedLoadReg<WriteDiv64.Folded>,
+ Requires<[In64BitMode]>;
}
// Signed division/remainder.
-let SchedRW = [WriteIDiv] in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
- "idiv{b}\t$src", [], IIC_IDIV8>;
+ "idiv{b}\t$src", []>, Sched<[WriteIDiv8]>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16;
+ "idiv{w}\t$src", []>, Sched<[WriteIDiv16]>, OpSize16;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32;
+ "idiv{l}\t$src", []>, Sched<[WriteIDiv32]>, OpSize32;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
- "idiv{q}\t$src", [], IIC_IDIV64>;
-} // SchedRW
+ "idiv{q}\t$src", []>, Sched<[WriteIDiv64]>;
let mayLoad = 1 in {
let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
- "idiv{b}\t$src", [], IIC_IDIV8>,
- SchedLoadReg<WriteIDivLd>;
+ "idiv{b}\t$src", []>, SchedLoadReg<WriteIDiv8.Folded>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16,
- SchedLoadReg<WriteIDivLd>;
+ "idiv{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIDiv16.Folded>;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
- "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32,
- SchedLoadReg<WriteIDivLd>;
+ "idiv{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIDiv32.Folded>;
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
- "idiv{q}\t$src", [], IIC_IDIV64>,
- SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>;
+ "idiv{q}\t$src", []>, SchedLoadReg<WriteIDiv64.Folded>,
+ Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
@@ -379,37 +352,37 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
"neg{b}\t$dst",
[(set GR8:$dst, (ineg GR8:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>;
+ (implicit EFLAGS)]>;
def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"neg{w}\t$dst",
[(set GR16:$dst, (ineg GR16:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"neg{l}\t$dst",
[(set GR32:$dst, (ineg GR32:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
[(set GR64:$dst, (ineg GR64:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>;
+ (implicit EFLAGS)]>;
} // Constraints = "$src1 = $dst", SchedRW
// Read-modify-write negate.
-let SchedRW = [WriteALULd, WriteRMW] in {
+let SchedRW = [WriteALURMW] in {
def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
"neg{b}\t$dst",
[(store (ineg (loadi8 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
"neg{w}\t$dst",
[(store (ineg (loadi16 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
"neg{l}\t$dst",
[(store (ineg (loadi32 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
[(store (ineg (loadi64 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>,
+ (implicit EFLAGS)]>,
Requires<[In64BitMode]>;
} // SchedRW
} // Defs = [EFLAGS]
@@ -418,36 +391,33 @@ def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
// Note: NOT does not set EFLAGS!
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-// Match xor -1 to not. Favors these over a move imm + xor to save code size.
-let AddedComplexity = 15 in {
def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
"not{b}\t$dst",
- [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+ [(set GR8:$dst, (not GR8:$src1))]>;
def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"not{w}\t$dst",
- [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16;
+ [(set GR16:$dst, (not GR16:$src1))]>, OpSize16;
def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"not{l}\t$dst",
- [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32;
+ [(set GR32:$dst, (not GR32:$src1))]>, OpSize32;
def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
- [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
-}
+ [(set GR64:$dst, (not GR64:$src1))]>;
} // Constraints = "$src1 = $dst", SchedRW
-let SchedRW = [WriteALULd, WriteRMW] in {
+let SchedRW = [WriteALURMW] in {
def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
"not{b}\t$dst",
- [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+ [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
"not{w}\t$dst",
- [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ [(store (not (loadi16 addr:$dst)), addr:$dst)]>,
OpSize16;
def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
"not{l}\t$dst",
- [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ [(store (not (loadi32 addr:$dst)), addr:$dst)]>,
OpSize32;
def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
- [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ [(store (not (loadi64 addr:$dst)), addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
} // CodeSize
@@ -458,49 +428,45 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
let CodeSize = 2 in
def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"inc{b}\t$dst",
- [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
- IIC_UNARY_REG>;
+ [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))]>;
let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
- IIC_UNARY_REG>, OpSize16;
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>, OpSize16;
def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
- IIC_UNARY_REG>, OpSize32;
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>, OpSize32;
def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
- IIC_UNARY_REG>;
+ [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))]>;
} // isConvertibleToThreeAddress = 1, CodeSize = 2
// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
let CodeSize = 1, hasSideEffects = 0 in {
def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
- "inc{w}\t$dst", [], IIC_UNARY_REG>,
+ "inc{w}\t$dst", []>,
OpSize16, Requires<[Not64BitMode]>;
def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
- "inc{l}\t$dst", [], IIC_UNARY_REG>,
+ "inc{l}\t$dst", []>,
OpSize32, Requires<[Not64BitMode]>;
} // CodeSize = 1, hasSideEffects = 0
} // Constraints = "$src1 = $dst", SchedRW
-let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+let CodeSize = 2, SchedRW = [WriteALURMW] in {
let Predicates = [UseIncDec] in {
def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
[(store (add (loadi8 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
[(store (add (loadi16 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
[(store (add (loadi32 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
} // Predicates
let Predicates = [UseIncDec, In64BitMode] in {
def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
[(store (add (loadi64 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
} // Predicates
} // CodeSize = 2, SchedRW
@@ -508,50 +474,46 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
let CodeSize = 2 in
def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"dec{b}\t$dst",
- [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
- IIC_UNARY_REG>;
+ [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))]>;
let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
- IIC_UNARY_REG>, OpSize16;
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>, OpSize16;
def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
- IIC_UNARY_REG>, OpSize32;
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>, OpSize32;
def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
- IIC_UNARY_REG>;
+ [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))]>;
} // isConvertibleToThreeAddress = 1, CodeSize = 2
// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
let CodeSize = 1, hasSideEffects = 0 in {
def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
- "dec{w}\t$dst", [], IIC_UNARY_REG>,
+ "dec{w}\t$dst", []>,
OpSize16, Requires<[Not64BitMode]>;
def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
- "dec{l}\t$dst", [], IIC_UNARY_REG>,
+ "dec{l}\t$dst", []>,
OpSize32, Requires<[Not64BitMode]>;
} // CodeSize = 1, hasSideEffects = 0
} // Constraints = "$src1 = $dst", SchedRW
-let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+let CodeSize = 2, SchedRW = [WriteALURMW] in {
let Predicates = [UseIncDec] in {
def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
[(store (add (loadi8 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
[(store (add (loadi16 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+ (implicit EFLAGS)]>, OpSize16;
def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
[(store (add (loadi32 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+ (implicit EFLAGS)]>, OpSize32;
} // Predicates
let Predicates = [UseIncDec, In64BitMode] in {
def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
[(store (add (loadi64 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)]>;
} // Predicates
} // CodeSize = 2, SchedRW
} // Defs = [EFLAGS]
@@ -649,13 +611,11 @@ def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
/// or 1 (for i16,i32,i64 operations).
class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
- string mnemonic, string args, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ string mnemonic, string args, list<dag> pattern>
: I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
f, outs, ins,
- !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
- itin> {
+ !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> {
// Infer instruction prefixes from type info.
let OpSize = typeinfo.OpSize;
@@ -664,47 +624,45 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
// BinOpRR - Instructions like "add reg, reg, reg".
class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, list<dag> pattern, InstrItinClass itin>
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, MRMDestReg, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALU]>;
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]>;
// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has
// just a EFLAGS as a result.
class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs), WriteALU,
[(set EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
- IIC_BIN_NONMEM>;
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
// both a regclass and EFLAGS as a result.
class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
- IIC_BIN_NONMEM>;
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
// both a regclass and EFLAGS as a result, and has EFLAGS as input.
class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
- EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+ EFLAGS))]>;
// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ X86FoldableSchedWrite sched = WriteALU>
: ITy<opcode, MRMSrcReg, typeinfo,
(outs typeinfo.RegClass:$dst),
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
- Sched<[WriteALU]> {
+ mnemonic, "{$src2, $dst|$dst, $src2}", []>,
+ Sched<[sched]> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
let ForceDisassemble = 1;
@@ -713,13 +671,13 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
- : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+ : BinOpRR_Rev<opcode, mnemonic, typeinfo, WriteADC>;
// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
: ITy<opcode, MRMSrcReg, typeinfo, (outs),
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+ mnemonic, "{$src2, $src1|$src1, $src2}", []>,
Sched<[WriteALU]> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
@@ -729,137 +687,134 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
// BinOpRM - Instructions like "add reg, reg, [mem]".
class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_MEM>
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, MRMSrcMem, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALULd, ReadAfterLd]>;
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched.Folded, ReadAfterLd]>;
// BinOpRM_F - Instructions like "cmp reg, [mem]".
class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs), WriteALU,
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
- EFLAGS))], IIC_BIN_CARRY_MEM>;
+ EFLAGS))]>;
// BinOpRI - Instructions like "add reg, reg, imm".
class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, f, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALU]> {
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]> {
let ImmT = typeinfo.ImmEncoding;
}
// BinOpRI_F - Instructions like "cmp reg, imm".
class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), WriteALU,
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
// BinOpRI_RF - Instructions like "add reg, reg, imm".
class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
- EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+ EFLAGS))]>;
// BinOpRI8 - Instructions like "add reg, reg, imm8".
class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
: ITy<opcode, f, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
- Sched<[WriteALU]> {
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]> {
let ImmT = Imm8; // Always 8-bit immediate.
}
// BinOpRI8_F - Instructions like "cmp reg, imm8".
class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), WriteALU,
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
- EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+ EFLAGS))]>;
// BinOpMR - Instructions like "add [mem], reg".
class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
+ list<dag> pattern>
: ITy<opcode, MRMDestMem, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
- Sched<[WriteALULd, WriteRMW]>;
+ mnemonic, "{$src, $dst|$dst, $src}", pattern>;
// BinOpMR_RMW - Instructions like "add [mem], reg".
class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
[(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
- [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
- addr:$dst),
- (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+ addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
// BinOpMR_F - Instructions like "cmp [mem], reg".
class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
- [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.RegClass:$src))]>,
+ Sched<[WriteALULd, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault, ReadAfterLd]>;
// BinOpMI - Instructions like "add [mem], imm".
class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_MEM>
+ Format f, list<dag> pattern>
: ITy<opcode, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
- Sched<[WriteALULd, WriteRMW]> {
+ mnemonic, "{$src, $dst|$dst, $src}", pattern> {
let ImmT = typeinfo.ImmEncoding;
}
@@ -869,30 +824,29 @@ class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpMI<opcode, mnemonic, typeinfo, f,
[(store (opnode (typeinfo.VT (load addr:$dst)),
typeinfo.ImmOperator:$src), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
: BinOpMI<opcode, mnemonic, typeinfo, f,
[(store (opnode (typeinfo.VT (load addr:$dst)),
- typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
- (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+ typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
// BinOpMI_F - Instructions like "cmp [mem], imm".
class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
: BinOpMI<opcode, mnemonic, typeinfo, f,
- [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
- typeinfo.ImmOperator:$src))]>;
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.ImmOperator:$src))]>,
+ Sched<[WriteALULd]>;
// BinOpMI8 - Instructions like "add [mem], imm8".
class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern,
- InstrItinClass itin = IIC_BIN_MEM>
+ Format f, list<dag> pattern>
: ITy<0x82, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
- Sched<[WriteALULd, WriteRMW]> {
+ mnemonic, "{$src, $dst|$dst, $src}", pattern> {
let ImmT = Imm8; // Always 8-bit immediate.
}
@@ -902,7 +856,7 @@ class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
: BinOpMI8<mnemonic, typeinfo, f,
[(store (opnode (load addr:$dst),
typeinfo.Imm8Operator:$src), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
@@ -910,22 +864,22 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
: BinOpMI8<mnemonic, typeinfo, f,
[(store (opnode (load addr:$dst),
typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
- (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
// BinOpMI8_F - Instructions like "cmp [mem], imm8".
class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f>
: BinOpMI8<mnemonic, typeinfo, f,
- [(set EFLAGS, (opnode (load addr:$dst),
- typeinfo.Imm8Operator:$src))]>;
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.Imm8Operator:$src))]>,
+ Sched<[WriteALULd]>;
// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg, string operands,
- InstrItinClass itin = IIC_BIN_NONMEM>
+ Register areg, string operands, X86FoldableSchedWrite sched = WriteALU>
: ITy<opcode, RawFrm, typeinfo,
(outs), (ins typeinfo.ImmOperand:$src),
- mnemonic, operands, [], itin>, Sched<[WriteALU]> {
+ mnemonic, operands, []>, Sched<[sched]> {
let ImmT = typeinfo.ImmEncoding;
let Uses = [areg];
let Defs = [areg, EFLAGS];
@@ -936,8 +890,7 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// and use EFLAGS.
class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
Register areg, string operands>
- : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
- IIC_BIN_CARRY_NONMEM> {
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, WriteADC> {
let Uses = [areg, EFLAGS];
}
@@ -1257,14 +1210,6 @@ let isCompare = 1 in {
def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
let Predicates = [In64BitMode] in
def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
-
- // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
- // register class is constrained to GR8_NOREX. This pseudo is explicitly
- // marked side-effect free, since it doesn't have an isel pattern like
- // other test instructions.
- let isPseudo = 1, hasSideEffects = 0 in
- def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
- "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
} // Defs = [EFLAGS]
def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
@@ -1284,21 +1229,22 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
PatFrag ld_frag> {
def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
- IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
+ Sched<[WriteALU]>;
def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
- (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+ (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
Sched<[WriteALULd, ReadAfterLd]>;
}
-let Predicates = [HasBMI], Defs = [EFLAGS] in {
+// Complexity is reduced to give and with immediate a chance to match first.
+let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in {
defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
}
-let Predicates = [HasBMI] in {
+let Predicates = [HasBMI], AddedComplexity = -6 in {
def : Pat<(and (not GR32:$src1), GR32:$src2),
(ANDN32rr GR32:$src1, GR32:$src2)>;
def : Pat<(and (not GR64:$src1), GR64:$src2),
@@ -1312,74 +1258,81 @@ let Predicates = [HasBMI] in {
//===----------------------------------------------------------------------===//
// MULX Instruction
//
-multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
let isCommutable = 1 in
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
+ []>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
+ []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
}
}
let Predicates = [HasBMI2] in {
let Uses = [EDX] in
- defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+ defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteIMul>;
let Uses = [RDX] in
- defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+ defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteIMul64>, VEX_W;
}
//===----------------------------------------------------------------------===//
// ADCX and ADOX Instructions
//
let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
- Constraints = "$src0 = $dst", AddedComplexity = 10 in {
- let SchedRW = [WriteALU] in {
+ Constraints = "$src1 = $dst", AddedComplexity = 10 in {
+ let SchedRW = [WriteADC] in {
def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
- (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS,
- (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))],
- IIC_BIN_CARRY_NONMEM>, T8PD;
+ (ins GR32:$src1, GR32:$src2),
+ "adcx{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src1, GR32:$src2, EFLAGS))]>, T8PD;
def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
- (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS,
- (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))],
- IIC_BIN_CARRY_NONMEM>, T8PD;
+ (ins GR64:$src1, GR64:$src2),
+ "adcx{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src1, GR64:$src2, EFLAGS))]>, T8PD;
// We don't have patterns for ADOX yet.
let hasSideEffects = 0 in {
- def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src0, GR32:$src),
- "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+ def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
- def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src0, GR64:$src),
- "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+ def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
} // hasSideEffects = 0
} // SchedRW
- let mayLoad = 1, SchedRW = [WriteALULd] in {
+ let mayLoad = 1, SchedRW = [WriteADCLd, ReadAfterLd] in {
def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS,
- (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))],
- IIC_BIN_CARRY_MEM>, T8PD;
+ (ins GR32:$src1, i32mem:$src2),
+ "adcx{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86adc_flag GR32:$src1, (loadi32 addr:$src2), EFLAGS))]>,
+ T8PD;
def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
- (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS,
- (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))],
- IIC_BIN_CARRY_MEM>, T8PD;
+ (ins GR64:$src1, i64mem:$src2),
+ "adcx{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86adc_flag GR64:$src1, (loadi64 addr:$src2), EFLAGS))]>,
+ T8PD;
// We don't have patterns for ADOX yet.
let hasSideEffects = 0 in {
- def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src0, i32mem:$src),
- "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
+ def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
- def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src0, i64mem:$src),
- "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
- }
+ def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
} // hasSideEffects = 0
+ } // mayLoad = 1, SchedRW = [WriteADCLd]
}
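(Note on the X86InstrArithmetic.td hunks above: the recurring change is dropping the trailing InstrItinClass operand (IIC_*) and expressing scheduling through X86FoldableSchedWrite classes instead — register forms take the base write (WriteALU, WriteADC, WriteIMul, WriteDiv*, ...) and load-folding forms take its .Folded variant plus ReadAfterLd for the operand read after the load. A minimal sketch of the before/after shape, not part of the diff; FOO32rr/FOO32rm are made-up names.)
  // Old style: itinerary class as a trailing operand of the instruction class.
  //   def FOO32rr : I<opc, MRMSrcReg, (outs GR32:$dst),
  //                   (ins GR32:$src1, GR32:$src2),
  //                   "foo{l}\t{$src2, $dst|$dst, $src2}", [...], IIC_BIN_NONMEM>;
  // New style: SchedWrite attached via Sched<[...]>, folded variant on the load form.
  let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
  def FOO32rr : I<0x01, MRMSrcReg, (outs GR32:$dst),
                  (ins GR32:$src1, GR32:$src2),
                  "foo{l}\t{$src2, $dst|$dst, $src2}", []>,
                  Sched<[WriteALU]>;
  let mayLoad = 1 in
  def FOO32rm : I<0x03, MRMSrcMem, (outs GR32:$dst),
                  (ins GR32:$src1, i32mem:$src2),
                  "foo{l}\t{$src2, $dst|$dst, $src2}", []>,
                  Sched<[WriteALU.Folded, ReadAfterLd]>;
  }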
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 8dd5e1c0626b..eda4ba5ae6f0 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -14,69 +14,67 @@
// CMOV instructions.
-multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+multiclass CMOV<bits<8> opc, string Mnemonic, X86FoldableSchedWrite Sched,
+ PatLeaf CondNode> {
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- isCommutable = 1, SchedRW = [WriteALU] in {
+ isCommutable = 1, SchedRW = [Sched] in {
def NAME#16rr
: I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
- IIC_CMOV16_RR>, TB, OpSize16;
+ (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>,
+ TB, OpSize16;
def NAME#32rr
: I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
- IIC_CMOV32_RR>, TB, OpSize32;
+ (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>,
+ TB, OpSize32;
def NAME#64rr
:RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst,
- (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
- IIC_CMOV32_RR>, TB;
+ (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB;
}
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- SchedRW = [WriteALULd, ReadAfterLd] in {
+ SchedRW = [Sched.Folded, ReadAfterLd] in {
def NAME#16rm
: I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
- CondNode, EFLAGS))], IIC_CMOV16_RM>,
- TB, OpSize16;
+ CondNode, EFLAGS))]>, TB, OpSize16;
def NAME#32rm
: I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- CondNode, EFLAGS))], IIC_CMOV32_RM>,
- TB, OpSize32;
+ CondNode, EFLAGS))]>, TB, OpSize32;
def NAME#64rm
:RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
- CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+ CondNode, EFLAGS))]>, TB;
} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
} // end multiclass
// Conditional Moves.
-defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>;
-defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
-defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>;
-defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
-defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>;
-defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
-defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
-defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>;
-defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>;
-defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
-defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>;
-defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
-defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>;
-defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
-defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
-defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>;
+defm CMOVO : CMOV<0x40, "cmovo" , WriteCMOV, X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", WriteCMOV, X86_COND_NO>;
+defm CMOVB : CMOV<0x42, "cmovb" , WriteCMOV, X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", WriteCMOV, X86_COND_AE>;
+defm CMOVE : CMOV<0x44, "cmove" , WriteCMOV, X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", WriteCMOV, X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", WriteCMOV2, X86_COND_BE>;
+defm CMOVA : CMOV<0x47, "cmova" , WriteCMOV2, X86_COND_A>;
+defm CMOVS : CMOV<0x48, "cmovs" , WriteCMOV, X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", WriteCMOV, X86_COND_NS>;
+defm CMOVP : CMOV<0x4A, "cmovp" , WriteCMOV, X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", WriteCMOV, X86_COND_NP>;
+defm CMOVL : CMOV<0x4C, "cmovl" , WriteCMOV, X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", WriteCMOV, X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", WriteCMOV, X86_COND_LE>;
+defm CMOVG : CMOV<0x4F, "cmovg" , WriteCMOV, X86_COND_G>;
// SetCC instructions.
@@ -84,12 +82,12 @@ multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
let Uses = [EFLAGS] in {
def r : I<opc, MRMXr, (outs GR8:$dst), (ins),
!strconcat(Mnemonic, "\t$dst"),
- [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
- IIC_SET_R>, TB, Sched<[WriteALU]>;
+ [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>,
+ TB, Sched<[WriteSETCC]>;
def m : I<opc, MRMXm, (outs), (ins i8mem:$dst),
!strconcat(Mnemonic, "\t$dst"),
- [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
- IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+ [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>,
+ TB, Sched<[WriteSETCCStore]>;
} // Uses = [EFLAGS]
}
@@ -114,5 +112,5 @@ defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
// here http://www.rcollins.org/secrets/opcodes/SALC.html
// Set AL if carry.
let Uses = [EFLAGS], Defs = [AL], SchedRW = [WriteALU] in {
- def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", [], IIC_AHF>, Requires<[Not64BitMode]>;
+ def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>;
}
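(Note on the X86InstrCMovSetCC.td hunks above: the CMOV multiclass is now parameterized on an X86FoldableSchedWrite, so most condition codes instantiate with WriteCMOV while CMOVBE/CMOVA get WriteCMOV2, and the register forms use SchedRW = [Sched] while the memory forms use [Sched.Folded, ReadAfterLd]. A minimal sketch of threading a sched parameter through a multiclass, not part of the diff; "BARCC"/"barcc" are made-up names.)
  // Sketch only: rr and rm forms share one scheduling parameter.
  multiclass BARCC<bits<8> opc, string Mnemonic, X86FoldableSchedWrite Sched> {
    let Uses = [EFLAGS], Constraints = "$src1 = $dst", SchedRW = [Sched] in
    def rr : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
               !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), []>,
               TB, OpSize32;
    let Uses = [EFLAGS], Constraints = "$src1 = $dst",
        SchedRW = [Sched.Folded, ReadAfterLd] in
    def rm : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
               !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), []>,
               TB, OpSize32;
  }
  // Example instantiation (hypothetical): defm BARNE : BARCC<0x45, "barne", WriteCMOV>;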
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
index b3371c96cc29..373f85020372 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -17,7 +17,7 @@
def GetLo32XForm : SDNodeXForm<imm, [{
// Transformation function: get the low 32 bits.
- return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N));
+ return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N));
}]>;
def GetLo8XForm : SDNodeXForm<imm, [{
@@ -35,8 +35,12 @@ def GetLo8XForm : SDNodeXForm<imm, [{
let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
SchedRW = [WriteJump] in
def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
- "", [], IIC_CALL_RI>;
+ "", []>;
+// 64-bit large code model PIC base construction.
+let hasSideEffects = 0, mayLoad = 1, isNotDuplicable = 1, SchedRW = [WriteJump] in
+ def MOVGOT64r : PseudoI<(outs GR64:$reg),
+ (ins GR64:$scratch, i64i32imm_pcrel:$got), []>;
// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
// a stack adjustment and the codegen must know that they may modify the stack
@@ -46,12 +50,11 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
- "#ADJCALLSTACKDOWN", [], IIC_ALU_NONMEM>,
- Requires<[NotLP64]>;
+ "#ADJCALLSTACKDOWN", []>, Requires<[NotLP64]>;
def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)],
- IIC_ALU_NONMEM>, Requires<[NotLP64]>;
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[NotLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
@@ -65,12 +68,11 @@ def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
- "#ADJCALLSTACKDOWN",
- [], IIC_ALU_NONMEM>, Requires<[IsLP64]>;
+ "#ADJCALLSTACKDOWN", []>, Requires<[IsLP64]>;
def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)],
- IIC_ALU_NONMEM>, Requires<[IsLP64]>;
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[IsLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
@@ -148,10 +150,10 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
// frame register after register allocation.
let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
- "xorl\t$$FP, $src", [], IIC_BIN_NONMEM>,
+ "xorl\t$$FP, $src", []>,
Requires<[NotLP64]>, Sched<[WriteALU]>;
def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
- "xorq\t$$FP $src", [], IIC_BIN_NONMEM>,
+ "xorq\t$$FP $src", []>,
Requires<[In64BitMode]>, Sched<[WriteALU]>;
}
@@ -163,7 +165,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
"ret\t#eh_return, addr: $addr",
- [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+ [(X86ehret GR32:$addr)]>, Sched<[WriteJumpLd]>;
}
@@ -171,7 +173,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
"ret\t#eh_return, addr: $addr",
- [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+ [(X86ehret GR64:$addr)]>, Sched<[WriteJumpLd]>;
}
@@ -256,14 +258,12 @@ let isPseudo = 1, SchedRW = [WriteSystem] in {
// this so that we don't have to have a MachineBasicBlock which ends
// with a RET and also has successors.
let isPseudo = 1, SchedRW = [WriteJumpLd] in {
-def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
- "", [], IIC_RET>;
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins), "", []>;
// This instruction is lowered to a RET followed by a MOV. The two
// instructions are not generated on a higher level since then the
// verifier sees a MachineBasicBlock ending with a non-terminator.
-def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
- "", [], IIC_RET>;
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
}
//===----------------------------------------------------------------------===//
@@ -275,7 +275,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
isPseudo = 1, AddedComplexity = 10 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+ [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
// Other widths can also make use of the 32-bit xor, which may have a smaller
// encoding and avoid partial register updates.
@@ -292,9 +292,9 @@ let Predicates = [OptForSize, Not64BitMode],
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, 1)], IIC_ALU_NONMEM>;
+ [(set GR32:$dst, 1)]>;
def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, -1)], IIC_ALU_NONMEM>;
+ [(set GR32:$dst, -1)]>;
}
} // SchedRW
@@ -307,10 +307,10 @@ let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5,
SchedRW = [WriteALU] in {
// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
- [(set GR32:$dst, i32immSExt8:$src)], IIC_ALU_NONMEM>,
+ [(set GR32:$dst, i32immSExt8:$src)]>,
Requires<[OptForMinSize, NotWin64WithoutFP]>;
def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
- [(set GR64:$dst, i64immSExt8:$src)], IIC_ALU_NONMEM>,
+ [(set GR64:$dst, i64immSExt8:$src)]>,
Requires<[OptForMinSize, NotWin64WithoutFP]>;
}
@@ -318,9 +318,8 @@ def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
let isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteALU] in
-def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", [],
- IIC_ALU_NONMEM>;
+ isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
+def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>;
// This 64-bit pseudo-move can be used for both a 64-bit constant that is
// actually the zero-extension of a 32-bit constant and for labels in the
@@ -398,28 +397,28 @@ def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
let SchedRW = [WriteMicrocoded] in {
let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
- [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ [(X86rep_movs i8)]>, REP,
Requires<[Not64BitMode]>;
def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ [(X86rep_movs i16)]>, REP, OpSize16,
Requires<[Not64BitMode]>;
def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ [(X86rep_movs i32)]>, REP, OpSize32,
Requires<[Not64BitMode]>;
}
let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
- [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+ [(X86rep_movs i8)]>, REP,
Requires<[In64BitMode]>;
def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ [(X86rep_movs i16)]>, REP, OpSize16,
Requires<[In64BitMode]>;
def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ [(X86rep_movs i32)]>, REP, OpSize32,
Requires<[In64BitMode]>;
def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
- [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+ [(X86rep_movs i64)]>, REP,
Requires<[In64BitMode]>;
}
@@ -427,36 +426,36 @@ def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
let Uses = [AL,ECX,EDI] in
def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
- [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+ [(X86rep_stos i8)]>, REP,
Requires<[Not64BitMode]>;
let Uses = [AX,ECX,EDI] in
def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+ [(X86rep_stos i16)]>, REP, OpSize16,
Requires<[Not64BitMode]>;
let Uses = [EAX,ECX,EDI] in
def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+ [(X86rep_stos i32)]>, REP, OpSize32,
Requires<[Not64BitMode]>;
}
let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
let Uses = [AL,RCX,RDI] in
def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
- [(X86rep_stos i8)], IIC_REP_STOS>, REP,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i8)]>, REP,
+ Requires<[In64BitMode]>;
let Uses = [AX,RCX,RDI] in
def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i16)]>, REP, OpSize16,
+ Requires<[In64BitMode]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i32)]>, REP, OpSize32,
+ Requires<[In64BitMode]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
- [(X86rep_stos i64)], IIC_REP_STOS>, REP,
- Requires<[In64BitMode]>;
+ [(X86rep_stos i64)]>, REP,
+ Requires<[In64BitMode]>;
}
} // SchedRW
@@ -568,7 +567,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
- defm _FR128 : CMOVrr_PSEUDO<FR128, f128>;
+ defm _F128 : CMOVrr_PSEUDO<VR128, f128>;
defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>;
defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>;
defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>;
@@ -595,9 +594,9 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
// TODO: Get this to fold the constant into the instruction.
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
- "or{l}\t{$zero, $dst|$dst, $zero}", [],
- IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK,
- Sched<[WriteALULd, WriteRMW]>;
+ "or{l}\t{$zero, $dst|$dst, $zero}", []>,
+ Requires<[Not64BitMode]>, OpSize32, LOCK,
+ Sched<[WriteALULd, WriteRMW]>;
let hasSideEffects = 1 in
def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
@@ -618,89 +617,85 @@ def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR8:$src2))],
- IIC_ALU_NONMEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR8:$src2))]>, LOCK;
def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR16:$src2))],
- IIC_ALU_NONMEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR16:$src2))]>,
+ OpSize16, LOCK;
def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR32:$src2))],
- IIC_ALU_NONMEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR32:$src2))]>,
+ OpSize32, LOCK;
def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, GR64:$src2))],
- IIC_ALU_NONMEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK;
def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))],
- IIC_ALU_MEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))]>, LOCK;
def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))],
- IIC_ALU_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))]>,
+ OpSize16, LOCK;
def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))],
- IIC_ALU_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))]>,
+ OpSize32, LOCK;
def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))],
- IIC_ALU_MEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>,
+ LOCK;
def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))],
- IIC_ALU_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
+ OpSize16, LOCK;
def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))],
- IIC_ALU_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
+ OpSize32, LOCK;
def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))],
- IIC_ALU_MEM>, LOCK;
-
+ [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
+ LOCK;
}
}
@@ -717,20 +712,20 @@ let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
!strconcat(mnemonic, "{b}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))],
- IIC_UNARY_MEM>, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))]>,
+ LOCK;
def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
!strconcat(mnemonic, "{w}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))],
- IIC_UNARY_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))]>,
+ OpSize16, LOCK;
def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
!strconcat(mnemonic, "{l}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))],
- IIC_UNARY_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))]>,
+ OpSize32, LOCK;
def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
!strconcat(mnemonic, "{q}\t$dst"),
- [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))],
- IIC_UNARY_MEM>, LOCK;
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))]>,
+ LOCK;
}
}
@@ -761,43 +756,39 @@ defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "X86lock_dec", "dec">;
// Atomic compare and swap.
multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
- SDPatternOperator frag, X86MemOperand x86memop,
- InstrItinClass itin> {
+ SDPatternOperator frag, X86MemOperand x86memop> {
let isCodeGenOnly = 1, usesCustomInserter = 1 in {
def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
!strconcat(mnemonic, "\t$ptr"),
- [(frag addr:$ptr)], itin>, TB, LOCK;
+ [(frag addr:$ptr)]>, TB, LOCK;
}
}
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
- string mnemonic, SDPatternOperator frag,
- InstrItinClass itin8, InstrItinClass itin> {
+ string mnemonic, SDPatternOperator frag> {
let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
let Defs = [AL, EFLAGS], Uses = [AL] in
def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
!strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ [(frag addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
let Defs = [AX, EFLAGS], Uses = [AX] in
def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
!strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
+ [(frag addr:$ptr, GR16:$swap, 2)]>, TB, OpSize16, LOCK;
let Defs = [EAX, EFLAGS], Uses = [EAX] in
def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
!strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
+ [(frag addr:$ptr, GR32:$swap, 4)]>, TB, OpSize32, LOCK;
let Defs = [RAX, EFLAGS], Uses = [RAX] in
def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
!strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+ [(frag addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
}
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
SchedRW = [WriteALULd, WriteRMW] in {
-defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
- X86cas8, i64mem,
- IIC_CMPX_LOCK_8B>;
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
}
// This pseudo must be used when the frame uses RBX as
@@ -827,16 +818,14 @@ def LCMPXCHG8B_SAVE_EBX :
(ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
!strconcat("cmpxchg8b", "\t$ptr"),
[(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
- GR32:$ebx_save))],
- IIC_CMPX_LOCK_8B>;
+ GR32:$ebx_save))]>;
}
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
- X86cas16, i128mem,
- IIC_CMPX_LOCK_16B>, REX_W;
+ X86cas16, i128mem>, REX_W;
}
// Same as LCMPXCHG8B_SAVE_RBX but for the 16-byte variant.
@@ -849,52 +838,45 @@ def LCMPXCHG16B_SAVE_RBX :
(ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
!strconcat("cmpxchg16b", "\t$ptr"),
[(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
- GR64:$rbx_save))],
- IIC_CMPX_LOCK_16B>;
+ GR64:$rbx_save))]>;
}
-defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
- X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>;
// Atomic exchange and add
multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
- string frag,
- InstrItinClass itin8, InstrItinClass itin> {
+ string frag> {
let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
- itin8>;
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize16;
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ OpSize16;
def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
- itin>, OpSize32;
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ OpSize32;
def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
- itin>;
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
}
}
-defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
- IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
- TB, LOCK;
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add">, TB, LOCK;
/* The following multiclass tries to make sure that in code like
* x.store (immediate op x.load(acquire), release)
@@ -1376,12 +1358,50 @@ def ADD64ri8_DB : I<0, Pseudo,
i64immSExt8:$src2))]>;
def ADD64ri32_DB : I<0, Pseudo,
(outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
- "", // orq/addq REG, imm
- [(set GR64:$dst, (or_is_add GR64:$src1,
- i64immSExt32:$src2))]>;
+ "", // orq/addq REG, imm
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt32:$src2))]>;
}
} // AddedComplexity, SchedRW
+//===----------------------------------------------------------------------===//
+// Pattern match SUB as XOR
+//===----------------------------------------------------------------------===//
+
+// An immediate in the LHS of a subtract can't be encoded in the instruction.
+// If there is no possibility of a borrow we can use an XOR instead of a SUB
+// to enable the immediate to be folded.
+// TODO: Move this to a DAG combine?
+
+def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ KnownBits Known;
+ CurDAG->computeKnownBits(N->getOperand(1), Known);
+
+ // If all possible ones in the RHS are set in the LHS then there can't be
+ // a borrow and we can use xor.
+ return (~Known.Zero).isSubsetOf(CN->getAPIntValue());
+ }
+
+ return false;
+}]>;
+
+let AddedComplexity = 5 in {
+def : Pat<(sub_is_xor imm:$src2, GR8:$src1),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub_is_xor imm:$src2, GR16:$src1),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(sub_is_xor imm:$src2, GR32:$src1),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+}
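
A minimal sketch of the borrow-free condition that sub_is_xor checks above, using plain integers in place of APInt/KnownBits (PossibleOnes stands in for ~Known.Zero): when every bit that may be set in the register operand is already set in the constant, the subtraction cannot borrow and equals a bitwise XOR, which is what allows the immediate to be folded.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t C = 0xFF;            // constant LHS of the subtract
  uint64_t PossibleOnes = 0x0F; // bits that may be 1 in the RHS (~Known.Zero)
  // Same test as (~Known.Zero).isSubsetOf(CN->getAPIntValue()).
  bool NoBorrow = (PossibleOnes & ~C) == 0;
  assert(NoBorrow);
  // With no possible borrow, sub and xor agree for every admissible RHS value.
  for (uint64_t X = 0; X <= PossibleOnes; ++X)
    if ((X & ~PossibleOnes) == 0)
      assert(C - X == (C ^ X));
  return 0;
}
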
//===----------------------------------------------------------------------===//
// Some peepholes
@@ -1471,6 +1491,37 @@ def : Pat<(and GR64:$src, 0xff),
} // AddedComplexity = 1
+// Try to use BTS/BTR/BTC for single bit operations on the upper 32-bits.
+
+def BTRXForm : SDNodeXForm<imm, [{
+ // Transformation function: Find the lowest 0.
+ return getI64Imm((uint8_t)N->getAPIntValue().countTrailingOnes(), SDLoc(N));
+}]>;
+
+def BTCBTSXForm : SDNodeXForm<imm, [{
+ // Transformation function: Find the lowest 1.
+ return getI64Imm((uint8_t)N->getAPIntValue().countTrailingZeros(), SDLoc(N));
+}]>;
+
+def BTRMask64 : ImmLeaf<i64, [{
+ return !isUInt<32>(Imm) && !isInt<32>(Imm) && isPowerOf2_64(~Imm);
+}]>;
+
+def BTCBTSMask64 : ImmLeaf<i64, [{
+ return !isInt<32>(Imm) && isPowerOf2_64(Imm);
+}]>;
+
+// For now only do this for optsize.
+let AddedComplexity = 1, Predicates=[OptForSize] in {
+ def : Pat<(and GR64:$src1, BTRMask64:$mask),
+ (BTR64ri8 GR64:$src1, (BTRXForm imm:$mask))>;
+ def : Pat<(or GR64:$src1, BTCBTSMask64:$mask),
+ (BTS64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
+ def : Pat<(xor GR64:$src1, BTCBTSMask64:$mask),
+ (BTC64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
+}
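
These patterns only fire for single-bit masks whose affected bit lies above bit 31, where a plain AND/OR/XOR would first need the full 64-bit mask materialized (e.g. with a movabs); BTRXForm/BTCBTSXForm recover the bit index so BTR64ri8/BTS64ri8/BTC64ri8 can encode it as an 8-bit immediate. A small sketch of the index computation, using the __builtin_ctzll builtin as a stand-in for the APInt countTrailingOnes/countTrailingZeros helpers:

#include <cassert>
#include <cstdint>

int main() {
  // AND with ~(1 << 40): the mask is not a sign-extended 32-bit immediate,
  // but its complement is a power of two, so BTRMask64 matches and the
  // index of the lowest clear bit becomes the BTR immediate.
  uint64_t AndMask = ~(1ULL << 40);
  unsigned BtrIndex = __builtin_ctzll(~AndMask);  // 40 -> BTR64ri8 $src, 40
  // OR/XOR with (1 << 40): BTCBTSMask64 matches and the index of the lowest
  // set bit becomes the BTS/BTC immediate.
  uint64_t OrMask = 1ULL << 40;
  unsigned BtsIndex = __builtin_ctzll(OrMask);    // 40 -> BTS64ri8 / BTC64ri8
  uint64_t X = ~0ULL;
  assert(((X & AndMask) >> BtrIndex & 1) == 0);   // bit cleared, like btr
  assert(((X | OrMask) >> BtsIndex & 1) == 1);    // bit set, like bts
  return 0;
}
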
+
+
// sext_inreg patterns
def : Pat<(sext_inreg GR32:$src, i16),
(MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
@@ -1522,6 +1573,10 @@ def : Pat<(i8 (trunc GR16:$src)),
(EXTRACT_SUBREG GR16:$src, sub_8bit)>,
Requires<[In64BitMode]>;
+def immff00_ffff : ImmLeaf<i32, [{
+ return Imm >= 0xff00 && Imm <= 0xffff;
+}]>;
+
// h-register tricks
def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
@@ -1534,16 +1589,16 @@ def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
Requires<[Not64BitMode]>;
def : Pat<(srl GR16:$src, (i8 8)),
(EXTRACT_SUBREG
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_16bit)>;
def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
-def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
+def : Pat<(srl (and_su GR32:$src, immff00_ffff), (i8 8)),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
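
The widened immff00_ffff leaf is sound because any mask in that range keeps all of bits 15..8 and nothing above bit 15, so after the shift by 8 the result is still exactly the high byte of the low word, i.e. the h-register extract. A quick sanity sketch in plain C++:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xCAFEBABE;
  // Every mask accepted by immff00_ffff yields the same value as the plain
  // h-register extract (x >> 8) & 0xff once the low byte is shifted out.
  for (uint32_t Mask = 0xff00; Mask <= 0xffff; ++Mask)
    assert(((X & Mask) >> 8) == ((X >> 8) & 0xff));
  return 0;
}
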
// h-register tricks.
// For now, be conservative on x86-64 and use an h-register extract only if the
@@ -1556,19 +1611,19 @@ def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
(SUBREG_TO_REG
(i64 0),
- (MOVZX32_NOREXrr8
+ (MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR64:$src, sub_8bit_hi)),
sub_32bit)>;
def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
- (MOVZX32_NOREXrr8
+ (MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
- (MOVZX32_NOREXrr8
+ (MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
@@ -1719,36 +1774,65 @@ let Predicates = [HasBMI2] in {
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
- let AddedComplexity = -20 in {
- def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
- (SARX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
- (SARX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-
- def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
- (SHRX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
- (SHRX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+}
- def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
- (SHLX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
- (SHLX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- }
+// Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location.
+multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
+ Instruction BTS, Instruction BTC,
+ ImmLeaf ImmShift> {
+ def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)),
+ (BTR RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(or RC:$src1, (shl 1, GR8:$src2)),
+ (BTS RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(xor RC:$src1, (shl 1, GR8:$src2)),
+ (BTC RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ // Similar to above, but removing unneeded masking of the shift amount.
+ def : Pat<(and RC:$src1, (rotl -2, (and GR8:$src2, ImmShift))),
+ (BTR RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(or RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ (BTS RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(xor RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ (BTC RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
+defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, immShift16>;
+defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, immShift32>;
+defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, immShift64>;
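
The left-hand sides above are the canonical single-variable-bit idioms: rotl(-2, n) is all ones with only bit n clear, and (shl 1, n) has only bit n set, so the and/or/xor clear, set and toggle exactly one bit, which is what btr/bts/btc do; the second group of patterns can drop the explicit (and GR8:$src2, ImmShift) mask because the reg-reg bit-test instructions already take the bit index modulo the operand size. A short C++ sketch of those identities (rotl32 is a local helper, not an LLVM API):

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned N) {
  return (V << N) | (V >> ((32 - N) & 31)); // avoids the undefined shift by 32
}

int main() {
  uint32_t X = 0xDEADBEEF;
  for (unsigned N = 0; N < 32; ++N) {
    uint32_t Cleared = X & rotl32(~1u, N); // form matched to BTR32rr
    uint32_t Set     = X | (1u << N);      // form matched to BTS32rr
    uint32_t Toggled = X ^ (1u << N);      // form matched to BTC32rr
    assert(((Cleared >> N) & 1) == 0);
    assert(((Set >> N) & 1) == 1);
    assert(((Toggled >> N) & 1) == (((X >> N) & 1) ^ 1u));
  }
  return 0;
}
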
+
+
// (anyext (setcc_carry)) -> (setcc_carry)
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C16r)>;
@@ -1765,6 +1849,7 @@ def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>;
// add reg, mem
def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
@@ -1773,6 +1858,8 @@ def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
(ADD16rm GR16:$src1, addr:$src2)>;
def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
(ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
// add reg, imm
def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
@@ -1782,11 +1869,16 @@ def : Pat<(add GR16:$src1, i16immSExt8:$src2),
(ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(add GR32:$src1, i32immSExt8:$src2),
(ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
// sub reg, reg
def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>;
// sub reg, mem
def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
@@ -1795,6 +1887,8 @@ def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
(SUB16rm GR16:$src1, addr:$src2)>;
def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
(SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
// sub reg, imm
def : Pat<(sub GR8:$src1, imm:$src2),
@@ -1807,6 +1901,10 @@ def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
(SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
(SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
// sub 0, reg
def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
@@ -1825,12 +1923,16 @@ def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
def : Pat<(mul GR32:$src1, GR32:$src2),
(IMUL32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(mul GR64:$src1, GR64:$src2),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
// mul reg, mem
def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
(IMUL16rm GR16:$src1, addr:$src2)>;
def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
(IMUL32rm GR32:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
// mul reg, imm
def : Pat<(mul GR16:$src1, imm:$src2),
@@ -1841,6 +1943,10 @@ def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
(IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
(IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
// reg = mul mem, imm
def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
@@ -1851,38 +1957,6 @@ def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
(IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
(IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
-
-// Patterns for nodes that do not produce flags, for instructions that do.
-
-// addition
-def : Pat<(add GR64:$src1, GR64:$src2),
- (ADD64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt8:$src2),
- (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt32:$src2),
- (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
-def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
- (ADD64rm GR64:$src1, addr:$src2)>;
-
-// subtraction
-def : Pat<(sub GR64:$src1, GR64:$src2),
- (SUB64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
- (SUB64rm GR64:$src1, addr:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
- (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
- (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// Multiply
-def : Pat<(mul GR64:$src1, GR64:$src2),
- (IMUL64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
- (IMUL64rm GR64:$src1, addr:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
- (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
- (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
(IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
@@ -2007,3 +2081,23 @@ def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
let Predicates = [HasMOVBE] in {
def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
}
+
+// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that
+// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch
+// of manual code for folding loads.
+let Predicates = [HasBMI, NoTBM] in {
+ def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+ (BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+ (BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>;
+ def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2),
+ (BEXTR64rr GR64:$src1,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri64 mov64imm32:$src2),
+ sub_32bit))>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2),
+ (BEXTR64rm addr:$src1,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri64 mov64imm32:$src2),
+ sub_32bit))>;
+} // HasBMI, NoTBM
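
For reference, the BEXTR control operand packs the start bit position in bits 7:0 and the field length in bits 15:8, and the BMI1 form takes that control in a register rather than as an immediate (the immediate form is TBM's BEXTRI, hence the NoTBM predicate), which is why the patterns above materialize the constant with MOV32ri/MOV32ri64 first. A simplified model of the extraction, assuming an in-range start position; bextr32 here is a local illustration, not a compiler API:

#include <cassert>
#include <cstdint>

static uint32_t bextr32(uint32_t X, uint32_t Ctrl) {
  unsigned Start = Ctrl & 0xFF;        // bits 7:0 of the control
  unsigned Len   = (Ctrl >> 8) & 0xFF; // bits 15:8 of the control
  if (Len >= 32)                       // length saturates at the operand size
    return X >> Start;
  return (X >> Start) & ((1u << Len) - 1);
}

int main() {
  uint32_t X = 0xDEADBEEF;
  uint32_t Ctrl = 4 | (8 << 8);                   // start = 4, length = 8
  assert(bextr32(X, Ctrl) == ((X >> 4) & 0xFF));  // extracts bits 11:4
  return 0;
}
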
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
index 7932686ebc87..650bce74dcf2 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrControl.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -22,47 +22,37 @@
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
- "ret{l}", [], IIC_RET>, OpSize32,
- Requires<[Not64BitMode]>;
+ "ret{l}", []>, OpSize32, Requires<[Not64BitMode]>;
def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
- "ret{q}", [], IIC_RET>, OpSize32,
- Requires<[In64BitMode]>;
+ "ret{q}", []>, OpSize32, Requires<[In64BitMode]>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
- "ret{w}",
- [], IIC_RET>, OpSize16;
+ "ret{w}", []>, OpSize16;
def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
- "ret{l}\t$amt",
- [], IIC_RET_IMM>, OpSize32,
- Requires<[Not64BitMode]>;
+ "ret{l}\t$amt", []>, OpSize32, Requires<[Not64BitMode]>;
def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
- "ret{q}\t$amt",
- [], IIC_RET_IMM>, OpSize32,
- Requires<[In64BitMode]>;
+ "ret{q}\t$amt", []>, OpSize32, Requires<[In64BitMode]>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
- "ret{w}\t$amt",
- [], IIC_RET_IMM>, OpSize16;
+ "ret{w}\t$amt", []>, OpSize16;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
- "{l}ret{l|f}", [], IIC_RET>, OpSize32;
+ "{l}ret{l|f}", []>, OpSize32;
def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
- "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
+ "{l}ret{|f}q", []>, Requires<[In64BitMode]>;
def LRETW : I <0xCB, RawFrm, (outs), (ins),
- "{l}ret{w|f}", [], IIC_RET>, OpSize16;
+ "{l}ret{w|f}", []>, OpSize16;
def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
+ "{l}ret{l|f}\t$amt", []>, OpSize32;
def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
+ "{l}ret{|f}q\t$amt", []>, Requires<[In64BitMode]>;
def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+ "{l}ret{w|f}\t$amt", []>, OpSize16;
// The machine return from interrupt instruction, but sometimes we need to
// perform a post-epilogue stack adjustment. Codegen emits the pseudo form
// which expands to include an SP adjustment if necessary.
- def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
+ def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", []>,
OpSize16;
- def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
- IIC_IRET>, OpSize32;
- def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
- IIC_IRET>, Requires<[In64BitMode]>;
+ def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>, OpSize32;
+ def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", []>, Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
@@ -71,12 +61,12 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
// Unconditional branches.
let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
- "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+ "jmp\t$dst", [(br bb:$dst)]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
- "jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
+ "jmp\t$dst", []>, OpSize16;
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
- "jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
+ "jmp\t$dst", []>, OpSize32;
}
}
@@ -84,12 +74,12 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
- [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
+ [(X86brcond bb:$dst, Cond, EFLAGS)]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
- [], IIC_Jcc>, OpSize16, TB;
+ []>, OpSize16, TB;
def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
- [], IIC_Jcc>, TB, OpSize32;
+ []>, TB, OpSize32;
}
}
}
@@ -118,69 +108,91 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in
// jecxz.
let Uses = [CX] in
def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
- Requires<[Not64BitMode]>;
+ "jcxz\t$dst", []>, AdSize16, Requires<[Not64BitMode]>;
let Uses = [ECX] in
def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
+ "jecxz\t$dst", []>, AdSize32;
let Uses = [RCX] in
def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
- Requires<[In64BitMode]>;
+ "jrcxz\t$dst", []>, AdSize64, Requires<[In64BitMode]>;
}
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
- [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
- OpSize16, Sched<[WriteJump]>;
+ [(brind GR16:$dst)]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>;
def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
- [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
- Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
+ [(brind (loadi16 addr:$dst))]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJumpLd]>;
def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
- [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
- OpSize32, Sched<[WriteJump]>;
+ [(brind GR32:$dst)]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
- [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
- Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
+ [(brind (loadi32 addr:$dst))]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJumpLd]>;
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
- [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
- Sched<[WriteJump]>;
+ [(brind GR64:$dst)]>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>;
def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
- [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
- Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+ [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>,
+ Sched<[WriteJumpLd]>;
+
+ // Non-tracking jumps for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst",
+ [(X86NoTrackBrind GR16 : $dst)]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>, NOTRACK;
+
+ def JMP16m_NT : I<0xFF, MRM4m, (outs), (ins i16mem : $dst), "jmp{w}\t{*}$dst",
+ [(X86NoTrackBrind (loadi16 addr : $dst))]>,
+ Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>,
+ NOTRACK;
+
+ def JMP32r_NT : I<0xFF, MRM4r, (outs), (ins GR32 : $dst), "jmp{l}\t{*}$dst",
+ [(X86NoTrackBrind GR32 : $dst)]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>, NOTRACK;
+ def JMP32m_NT : I<0xFF, MRM4m, (outs), (ins i32mem : $dst), "jmp{l}\t{*}$dst",
+ [(X86NoTrackBrind (loadi32 addr : $dst))]>,
+ Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>,
+ NOTRACK;
+
+ def JMP64r_NT : I<0xFF, MRM4r, (outs), (ins GR64 : $dst), "jmp{q}\t{*}$dst",
+ [(X86NoTrackBrind GR64 : $dst)]>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>, NOTRACK;
+ def JMP64m_NT : I<0xFF, MRM4m, (outs), (ins i64mem : $dst), "jmp{q}\t{*}$dst",
+ [(X86NoTrackBrind(loadi64 addr : $dst))]>,
+ Requires<[In64BitMode]>, Sched<[WriteJumpLd]>, NOTRACK;
+ }
- let Predicates = [Not64BitMode] in {
+ let Predicates = [Not64BitMode], AsmVariantName = "att" in {
def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
- "ljmp{w}\t$seg, $off", [],
- IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ "ljmp{w}\t$seg, $off", []>,
+ OpSize16, Sched<[WriteJump]>;
def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
- "ljmp{l}\t$seg, $off", [],
- IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ "ljmp{l}\t$seg, $off", []>,
+ OpSize32, Sched<[WriteJump]>;
}
- def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
- "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
- Sched<[WriteJump]>;
-
- def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
- "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
- Sched<[WriteJumpLd]>;
- def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
- "{l}jmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
- Sched<[WriteJumpLd]>;
+ def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
+
+ let AsmVariantName = "att" in
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
}
-
// Loop instructions
let SchedRW = [WriteJump] in {
-def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
-def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
-def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
+def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
}
//===----------------------------------------------------------------------===//
@@ -194,48 +206,62 @@ let isCall = 1 in
let Uses = [ESP, SSP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_pcrel:$dst),
- "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ "call{l}\t$dst", []>, OpSize32,
Requires<[Not64BitMode]>, Sched<[WriteJump]>;
let hasSideEffects = 0 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
(outs), (ins i16imm_pcrel:$dst),
- "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+ "call{w}\t$dst", []>, OpSize16,
Sched<[WriteJump]>;
def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
- "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
+ "call{w}\t{*}$dst", [(X86call GR16:$dst)]>,
OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
- "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
- IIC_CALL_MEM>, OpSize16,
- Requires<[Not64BitMode,FavorMemIndirectCall]>,
- Sched<[WriteJumpLd]>;
+ "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))]>,
+ OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
- "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
- OpSize32, Requires<[Not64BitMode,NotUseRetpoline]>,
- Sched<[WriteJump]>;
+ "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32,
+ Requires<[Not64BitMode,NotUseRetpoline]>, Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
- "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
- IIC_CALL_MEM>, OpSize32,
- Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>,
- Sched<[WriteJumpLd]>;
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
+ OpSize32,
+ Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>,
+ Sched<[WriteJumpLd]>;
+
+ // Non-tracking calls for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def CALL16r_NT : I<0xFF, MRM2r, (outs), (ins GR16 : $dst),
+ "call{w}\t{*}$dst",[(X86NoTrackCall GR16 : $dst)]>,
+ OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
+ def CALL16m_NT : I<0xFF, MRM2m, (outs), (ins i16mem : $dst),
+ "call{w}\t{*}$dst",[(X86NoTrackCall(loadi16 addr : $dst))]>,
+ OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>, NOTRACK;
+ def CALL32r_NT : I<0xFF, MRM2r, (outs), (ins GR32 : $dst),
+ "call{l}\t{*}$dst",[(X86NoTrackCall GR32 : $dst)]>,
+ OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
+ def CALL32m_NT : I<0xFF, MRM2m, (outs), (ins i32mem : $dst),
+ "call{l}\t{*}$dst",[(X86NoTrackCall(loadi32 addr : $dst))]>,
+ OpSize32, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>, NOTRACK;
+ }
- let Predicates = [Not64BitMode] in {
+ let Predicates = [Not64BitMode], AsmVariantName = "att" in {
def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
- "lcall{w}\t$seg, $off", [],
- IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+ "lcall{w}\t$seg, $off", []>,
+ OpSize16, Sched<[WriteJump]>;
def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
- "lcall{l}\t$seg, $off", [],
- IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+ "lcall{l}\t$seg, $off", []>,
+ OpSize32, Sched<[WriteJump]>;
}
- def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
- "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
- Sched<[WriteJumpLd]>;
- def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
- "{l}call{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
- Sched<[WriteJumpLd]>;
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
}
@@ -254,15 +280,13 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
// FIXME: These should be pseudo instructions that are lowered when going to
// mcinst.
def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i32imm_pcrel:$dst),
- "jmp\t$dst",
- [], IIC_JMP_REL>;
+ (ins i32imm_pcrel:$dst), "jmp\t$dst", []>;
def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
+ "", []>; // FIXME: Remove encoding when JIT is dead.
let mayLoad = 1 in
def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
- "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
+ "jmp{l}\t{*}$dst", []>;
}
// Conditional tail calls are similar to the above, but they are branches
@@ -275,9 +299,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i32imm_pcrel:$dst, i32imm:$cond),
- "",
- [], IIC_JMP_REL>;
+ (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>;
}
@@ -294,25 +316,33 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i64i32imm_pcrel:$dst),
- "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ "call{q}\t$dst", []>, OpSize32,
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
- "call{q}\t{*}$dst", [(X86call GR64:$dst)],
- IIC_CALL_RI>,
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
Requires<[In64BitMode,NotUseRetpoline]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
- "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
- IIC_CALL_MEM>,
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
NotUseRetpoline]>;
- def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
- "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+ // Non-tracking calls for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def CALL64r_NT : I<0xFF, MRM2r, (outs), (ins GR64 : $dst),
+ "call{q}\t{*}$dst",[(X86NoTrackCall GR64 : $dst)]>,
+ Requires<[In64BitMode]>, NOTRACK;
+ def CALL64m_NT : I<0xFF, MRM2m, (outs), (ins i64mem : $dst),
+ "call{q}\t{*}$dst",
+ [(X86NoTrackCall(loadi64 addr : $dst))]>,
+ Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK;
+ }
+
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{q}\t{*}$dst", []>;
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1, Uses = [RSP, SSP], usesCustomInserter = 1,
- SchedRW = [WriteJump] in {
+ isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
def TCRETURNdi64 : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset),
[]>;
@@ -323,23 +353,23 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
(ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable;
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
- "jmp\t$dst", [], IIC_JMP_REL>;
+ "jmp\t$dst", []>;
def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "jmp{q}\t{*}$dst", []>;
let mayLoad = 1 in
def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "jmp{q}\t{*}$dst", []>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
let hasREX_WPrefix = 1 in {
def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "rex64 jmp{q}\t{*}$dst", []>;
let mayLoad = 1 in
def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+ "rex64 jmp{q}\t{*}$dst", []>;
}
}
@@ -375,7 +405,5 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$cond),
- "",
- [], IIC_JMP_REL>;
+ (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>;
}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
index 2a8ab0069b1e..421792c5599f 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
@@ -14,104 +14,124 @@
let hasSideEffects = 0 in {
let Defs = [AX], Uses = [AL] in // AX = signext(AL)
def CBW : I<0x98, RawFrm, (outs), (ins),
- "{cbtw|cbw}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>;
+ "{cbtw|cbw}", []>, OpSize16, Sched<[WriteALU]>;
let Defs = [EAX], Uses = [AX] in // EAX = signext(AX)
def CWDE : I<0x98, RawFrm, (outs), (ins),
- "{cwtl|cwde}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>;
+ "{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>;
let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX)
def CWD : I<0x99, RawFrm, (outs), (ins),
- "{cwtd|cwd}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>;
+ "{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>;
let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX)
def CDQ : I<0x99, RawFrm, (outs), (ins),
- "{cltd|cdq}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>;
+ "{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>;
let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", [], IIC_CBW>, Sched<[WriteALU]>;
+ "{cltq|cdqe}", []>, Sched<[WriteALU]>;
let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
def CQO : RI<0x99, RawFrm, (outs), (ins),
- "{cqto|cqo}", [], IIC_CBW>, Sched<[WriteALU]>;
+ "{cqto|cqo}", []>, Sched<[WriteALU]>;
}
// Sign/Zero extenders
let hasSideEffects = 0 in {
def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
- "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
- "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALULd]>;
} // hasSideEffects = 0
def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ [(set GR32:$dst, (sext GR8:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB,
OpSize32, Sched<[WriteALULd]>;
def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ [(set GR32:$dst, (sext GR16:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))]>,
OpSize32, TB, Sched<[WriteALULd]>;
let hasSideEffects = 0 in {
def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
- "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
- "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>,
TB, OpSize16, Sched<[WriteALULd]>;
} // hasSideEffects = 0
def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+ [(set GR32:$dst, (zext GR8:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB,
OpSize32, Sched<[WriteALULd]>;
def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+ [(set GR32:$dst, (zext GR16:$src))]>, TB,
OpSize32, Sched<[WriteALU]>;
def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))]>,
TB, OpSize32, Sched<[WriteALULd]>;
+// These instructions exist as a consequence of operand size prefix having
+// control of the destination size, but not the input size. Only support them
+// for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVSX16rr16: I<0xBF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "movs{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable;
+def MOVZX16rr16: I<0xB7, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "movz{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable;
+let mayLoad = 1 in {
+def MOVSX16rm16: I<0xBF, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movs{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, OpSize16, TB, Sched<[WriteALULd]>, NotMemoryFoldable;
+def MOVZX16rm16: I<0xB7, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movz{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALULd]>, NotMemoryFoldable;
+} // mayLoad = 1
+} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0
+
// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
// except that they use GR32_NOREX for the output operand register class
// instead of GR32. This allows them to operate on h registers on x86-64.
let hasSideEffects = 0, isCodeGenOnly = 1 in {
-def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+def MOVZX32rr8_NOREX : I<0xB6, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
- "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>;
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALU]>;
let mayLoad = 1 in
-def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+def MOVZX32rm8_NOREX : I<0xB6, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
- "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>;
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALULd]>;
-def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
+def MOVSX32rr8_NOREX : I<0xBE, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
- "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>;
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALU]>;
let mayLoad = 1 in
-def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
+def MOVSX32rm8_NOREX : I<0xBE, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
- "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
- [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>;
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALULd]>;
}
// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
@@ -120,44 +140,44 @@ def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
// were generalized, this would require a special register class.
def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ [(set GR64:$dst, (sext GR8:$src))]>, TB,
Sched<[WriteALU]>;
def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))]>,
TB, Sched<[WriteALULd]>;
def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ [(set GR64:$dst, (sext GR16:$src))]>, TB,
Sched<[WriteALU]>;
def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))]>,
TB, Sched<[WriteALULd]>;
def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sext GR32:$src))]>,
Sched<[WriteALU]>, Requires<[In64BitMode]>;
def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))]>,
Sched<[WriteALULd]>, Requires<[In64BitMode]>;
// movzbq and movzwq encodings for the disassembler
let hasSideEffects = 0 in {
def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
- "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
- "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALULd]>;
def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
- "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>,
TB, Sched<[WriteALULd]>;
}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
index 35fa45590fc6..a559f62c8f38 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -36,13 +36,13 @@
multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op> {
+ SDNode Op, X86FoldableSchedWrite sched> {
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>,
- Sched<[WriteFMA]>;
+ Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
@@ -51,18 +51,18 @@ multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op RC:$src2, RC:$src1,
(MemFrag addr:$src3))))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op> {
+ SDNode Op, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
@@ -70,18 +70,19 @@ multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3),
- RC:$src1)))]>, Sched<[WriteFMALd, ReadAfterLd]>;
+ RC:$src1)))]>,
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
- SDNode Op> {
+ SDNode Op, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
// Pattern is in 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -91,71 +92,77 @@ multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1,
- RC:$src2)))]>, Sched<[WriteFMALd, ReadAfterLd]>;
+ RC:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy, string Suff,
PatFrag MemFrag128, PatFrag MemFrag256,
- SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+ SDNode Op, ValueType OpTy128, ValueType OpTy256,
+ X86SchedWriteWidths sched> {
defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
- VR128, OpTy128, f128mem, MemFrag128, Op>;
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
- VR128, OpTy128, f128mem, MemFrag128, Op>;
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
- VR128, OpTy128, f128mem, MemFrag128, Op>;
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
- VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
VEX_L;
defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
- VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
VEX_L;
defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
- VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
VEX_L;
}
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32,
+ SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
+ SchedWriteFMA>;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
+ SchedWriteFMA>;
defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32,
+ SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
loadv2f64, loadv4f64, X86Fmadd, v2f64,
- v4f64>, VEX_W;
+ v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmsub, v2f64,
- v4f64>, VEX_W;
+ v4f64, SchedWriteFMA>, VEX_W;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmaddsub,
- v2f64, v4f64>, VEX_W;
+ v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD",
loadv2f64, loadv4f64, X86Fmsubadd,
- v2f64, v4f64>, VEX_W;
+ v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmadd, v4f32, v8f32>;
+ loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmsub, v4f32, v8f32>;
+ loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+ loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmsub, v2f64, v4f64>, VEX_W;
+ loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
@@ -169,13 +176,14 @@ let ExeDomain = SSEPackedDouble in {
// defining FMA3 opcodes above.
multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched> {
def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>,
- Sched<[WriteFMA]>;
+ Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
@@ -184,18 +192,18 @@ multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
@@ -204,18 +212,18 @@ multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src2, (load addr:$src3), RC:$src1))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in
def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
// Pattern is in 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -226,20 +234,20 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpNode (load addr:$src3), RC:$src1, RC:$src2))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched> {
defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpNode>;
+ x86memop, RC, OpNode, sched>;
defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC, OpNode>;
+ x86memop, RC, OpNode, sched>;
defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy),
- x86memop, RC, OpNode>;
+ x86memop, RC, OpNode, sched>;
}
// These FMA*_Int instructions are defined specially for being used when
@@ -258,19 +266,20 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
hasSideEffects = 0 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
- Operand memopr, RegisterClass RC> {
+ Operand memopr, RegisterClass RC,
+ X86FoldableSchedWrite sched> {
def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMA]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, memopr:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>, Sched<[WriteFMALd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
}
// The FMA 213 form is created for lowering of scalar FMA intrinsics
@@ -284,82 +293,101 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
// such analysis will be implemented eventually.
multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
- RegisterClass RC, Operand memop> {
+ RegisterClass RC, Operand memop,
+ X86FoldableSchedWrite sched> {
defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
- memop, RC>;
+ memop, RC, sched>;
defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
- memop, RC>;
+ memop, RC, sched>;
defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
- memop, RC>;
+ memop, RC, sched>;
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, SDNode OpNodeIntrin, SDNode OpNode> {
+ string OpStr, SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
- FR32, f32mem>,
+ FR32, f32mem, sched>,
fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS",
- VR128, ssmem>;
+ VR128, ssmem, sched>;
let ExeDomain = SSEPackedDouble in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode,
- FR64, f64mem>,
+ FR64, f64mem, sched>,
fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD",
- VR128, sdmem>, VEX_W;
+ VR128, sdmem, sched>, VEX_W;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadd,
+ SchedWriteFMA.Scl>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub,
+ SchedWriteFMA.Scl>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd,
+ SchedWriteFMA.Scl>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub,
+ SchedWriteFMA.Scl>, VEX_LIG;
- // These patterns use the 123 ordering, instead of 213, even though
- // they match the intrinsic to the 213 version of the instruction.
- // This is because src1 is tied to dest, and the scalar intrinsics
- // require the pass-through values to come from the first source
- // operand, not the second.
+multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
+ SDNode Move, ValueType VT, ValueType EltVT,
+ RegisterClass RC, PatFrag mem_frag> {
let Predicates = [HasFMA, NoAVX512] in {
- def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
- (!cast<Instruction>(NAME#"213SSr_Int")
- VR128:$src1, VR128:$src2, VR128:$src3)>;
-
- def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
- (!cast<Instruction>(NAME#"213SDr_Int")
- VR128:$src1, VR128:$src2, VR128:$src3)>;
-
- def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2,
- sse_load_f32:$src3)),
- (!cast<Instruction>(NAME#"213SSm_Int")
- VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
-
- def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2,
- sse_load_f64:$src3)),
- (!cast<Instruction>(NAME#"213SDm_Int")
- VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
-
- def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3,
- VR128:$src2)),
- (!cast<Instruction>(NAME#"132SSm_Int")
- VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
-
- def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3,
- VR128:$src2)),
- (!cast<Instruction>(NAME#"132SDm_Int")
- VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ RC:$src3))))),
+ (!cast<Instruction>(Prefix#"213"#Suffix#"r_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2, RC:$src3,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
+ (!cast<Instruction>(Prefix#"231"#Suffix#"r_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ (mem_frag addr:$src3)))))),
+ (!cast<Instruction>(Prefix#"213"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ (mem_frag addr:$src3), RC:$src2))))),
+ (!cast<Instruction>(Prefix#"132"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2, (mem_frag addr:$src3),
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
+ (!cast<Instruction>(Prefix#"231"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
}
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG;
-
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>,
- VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>,
- VEX_LIG;
+defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
-
multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
- PatFrag mem_frag> {
+ PatFrag mem_frag, X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
@@ -367,66 +395,74 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG,
- Sched<[WriteFMA]>;
+ Sched<[sched]>;
def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2,
(mem_frag addr:$src3)))]>, VEX_W, VEX_LIG,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_LIG, FoldGenData<NAME#rr>, Sched<[WriteFMA]>;
+ VEX_LIG, FoldGenData<NAME#rr>, Sched<[sched]>;
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
- ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
-let isCodeGenOnly = 1 in {
+ ValueType VT, X86FoldableSchedWrite sched> {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
- VEX_LIG, Sched<[WriteFMA]>;
+ []>, VEX_W, VEX_LIG, Sched<[sched]>;
+ let mayLoad = 1 in
def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
- mem_cpat:$src3)))]>, VEX_W, VEX_LIG,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ []>, VEX_W, VEX_LIG,
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
+ let mayLoad = 1 in
def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
- VEX_LIG, Sched<[WriteFMALd, ReadAfterLd]>;
-let hasSideEffects = 0 in
+ []>,
+ VEX_LIG, Sched<[sched.Folded, ReadAfterLd,
+ // memop:$src2
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ // VR128::$src3
+ ReadAfterLd]>;
def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[WriteFMA]>;
+ []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[sched]>;
} // isCodeGenOnly = 1
}
multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT128, ValueType OpVT256,
- PatFrag ld_frag128, PatFrag ld_frag256> {
+ PatFrag ld_frag128, PatFrag ld_frag256,
+ X86SchedWriteWidths sched> {
let isCommutable = 1 in
def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -434,21 +470,26 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
- VEX_W, Sched<[WriteFMA]>;
+ VEX_W, Sched<[sched.XMM]>;
def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
(ld_frag128 addr:$src3)))]>, VEX_W,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.XMM.Folded, ReadAfterLd, ReadAfterLd]>;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.XMM.Folded, ReadAfterLd,
+ // f128mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR128::$src3
+ ReadAfterLd]>;
let isCommutable = 1 in
def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
@@ -456,95 +497,140 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
(OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX_W, VEX_L, Sched<[WriteFMA]>;
+ VEX_W, VEX_L, Sched<[sched.YMM]>;
def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
(ld_frag256 addr:$src3)))]>, VEX_W, VEX_L,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.YMM.Folded, ReadAfterLd, ReadAfterLd]>;
def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1,
(ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L,
- Sched<[WriteFMALd, ReadAfterLd]>;
+ Sched<[sched.YMM.Folded, ReadAfterLd,
+ // f256mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR256::$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- Sched<[WriteFMA]>, FoldGenData<NAME#rr>;
+ Sched<[sched.XMM]>, FoldGenData<NAME#rr>;
def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_L, Sched<[WriteFMA]>, FoldGenData<NAME#Yrr>;
+ VEX_L, Sched<[sched.YMM]>, FoldGenData<NAME#Yrr>;
} // isCodeGenOnly = 1
}
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
- defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
- fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
- X86Fmadd4s>;
- defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
- fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
- X86Fmsub4s>;
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
- X86Fnmadd, loadf32>,
- fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
- X86Fnmadd4s>;
+ X86Fnmadd, loadf32, SchedWriteFMA.Scl>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
- X86Fnmsub, loadf32>,
- fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
- X86Fnmsub4s>;
+ X86Fnmsub, loadf32, SchedWriteFMA.Scl>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
- loadv4f32, loadv8f32>;
+ loadv4f32, loadv8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
- defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
- fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
- X86Fmadd4s>;
- defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
- fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
- X86Fmsub4s>;
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
- X86Fnmadd, loadf64>,
- fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
- X86Fnmadd4s>;
+ X86Fnmadd, loadf64, SchedWriteFMA.Scl>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
- X86Fnmsub, loadf64>,
- fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
- X86Fnmsub4s>;
+ X86Fnmsub, loadf64, SchedWriteFMA.Scl>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
- loadv2f64, loadv4f64>;
+ loadv2f64, loadv4f64, SchedWriteFMA>;
}
+multiclass scalar_fma4_patterns<SDNode Op, string Name,
+ ValueType VT, ValueType EltVT,
+ RegisterClass RC, PatFrag mem_frag> {
+ let Predicates = [HasFMA4] in {
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, RC:$src2, RC:$src3))))),
+ (!cast<Instruction>(Name#"rr_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))))),
+ (!cast<Instruction>(Name#"rm_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>;
+
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, (mem_frag addr:$src2),
+ RC:$src3))))),
+ (!cast<Instruction>(Name#"mr_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)), addr:$src2,
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+ }
+}
+
+defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+
+defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 00ef65cdb6bd..def732a2dd00 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -21,268 +21,150 @@
using namespace llvm;
-/// This flag is used in the method llvm::call_once() used below to make the
-/// initialization of the map 'OpcodeToGroup' thread safe.
-static llvm::once_flag InitGroupsOnceFlag;
-
-static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
-X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
- return &*X86InstrFMA3InfoObj;
-}
-
-void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
- const uint16_t *MemOpcodes, unsigned Attr) {
- // Create a new instance of this class that would hold a group of FMA opcodes.
- X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
-
- // Add the references from indvidual opcodes to the group holding them.
- assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
- !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] &&
- !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) &&
- "Duplication or rewrite of elements in OpcodeToGroup.");
- OpcodeToGroup[RegOpcodes[0]] = G;
- OpcodeToGroup[RegOpcodes[1]] = G;
- OpcodeToGroup[RegOpcodes[2]] = G;
- OpcodeToGroup[MemOpcodes[0]] = G;
- OpcodeToGroup[MemOpcodes[1]] = G;
- OpcodeToGroup[MemOpcodes[2]] = G;
-}
-
-void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
- // Create a new instance of this class that would hold a group of FMA opcodes.
- X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
-
- // Add the references from indvidual opcodes to the group holding them.
- assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
- !OpcodeToGroup[RegOpcodes[2]]) &&
- "Duplication or rewrite of elements in OpcodeToGroup.");
- OpcodeToGroup[RegOpcodes[0]] = G;
- OpcodeToGroup[RegOpcodes[1]] = G;
- OpcodeToGroup[RegOpcodes[2]] = G;
-}
-
-void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
- // Create a new instance of this class that would hold a group of FMA opcodes.
- X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
-
- // Add the references from indvidual opcodes to the group holding them.
- assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] &&
- !OpcodeToGroup[MemOpcodes[2]]) &&
- "Duplication or rewrite of elements in OpcodeToGroup.");
- OpcodeToGroup[MemOpcodes[0]] = G;
- OpcodeToGroup[MemOpcodes[1]] = G;
- OpcodeToGroup[MemOpcodes[2]] = G;
-}
-
-#define FMA3RM(R132, R213, R231, M132, M213, M231) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
- initRMGroup(Reg##R132, Mem##R132);
-
-#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
- initRMGroup(Reg##R132, Mem##R132, (Attrs));
-
-#define FMA3R(R132, R213, R231) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- initRGroup(Reg##R132);
-
-#define FMA3RA(R132, R213, R231, Attrs) \
- static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
- initRGroup(Reg##R132, (Attrs));
-
-#define FMA3M(M132, M213, M231) \
- static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
- initMGroup(Mem##M132);
-
-#define FMA3MA(M132, M213, M231, Attrs) \
- static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
- initMGroup(Mem##M132, (Attrs));
-
-#define FMA3_AVX2_VECTOR_GROUP(Name) \
- FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr, \
- Name##132PSm, Name##213PSm, Name##231PSm); \
- FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr, \
- Name##132PDm, Name##213PDm, Name##231PDm); \
- FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr, \
- Name##132PSYm, Name##213PSYm, Name##231PSYm); \
- FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr, \
- Name##132PDYm, Name##213PDYm, Name##231PDYm);
-
-#define FMA3_AVX2_SCALAR_GROUP(Name) \
- FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr, \
- Name##132SSm, Name##213SSm, Name##231SSm); \
- FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr, \
- Name##132SDm, Name##213SDm, Name##231SDm); \
- FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int, \
- Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int, \
- Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic);
-
-#define FMA3_AVX2_FULL_GROUP(Name) \
- FMA3_AVX2_VECTOR_GROUP(Name); \
- FMA3_AVX2_SCALAR_GROUP(Name);
-
-#define FMA3_AVX512_VECTOR_GROUP(Name) \
- FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r, \
- Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m); \
- FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r, \
- Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m); \
- FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r, \
- Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m); \
- FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r, \
- Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m); \
- FMA3RM(Name##132PSZr, Name##213PSZr, Name##231PSZr, \
- Name##132PSZm, Name##213PSZm, Name##231PSZm); \
- FMA3RM(Name##132PDZr, Name##213PDZr, Name##231PDZr, \
- Name##132PDZm, Name##213PDZm, Name##231PDZm); \
- FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk, \
- Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk, \
- Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk, \
- Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk, \
- Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PSZrk, Name##213PSZrk, Name##231PSZrk, \
- Name##132PSZmk, Name##213PSZmk, Name##231PSZmk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PDZrk, Name##213PDZrk, Name##231PDZrk, \
- Name##132PDZmk, Name##213PDZmk, Name##231PDZmk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz, \
- Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz, \
- Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz, \
- Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz, \
- Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PSZrkz, Name##213PSZrkz, Name##231PSZrkz, \
- Name##132PSZmkz, Name##213PSZmkz, Name##231PSZmkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132PDZrkz, Name##213PDZrkz, Name##231PDZrkz, \
- Name##132PDZmkz, Name##213PDZmkz, Name##231PDZmkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb); \
- FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb); \
- FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb); \
- FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb); \
- FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb); \
- FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb); \
- FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb); \
- FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb); \
- FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PSZmbk, Name##213PSZmbk, Name##231PSZmbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PDZmbk, Name##213PDZmbk, Name##231PDZmbk, \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz, \
- X86InstrFMA3Group::X86FMA3KZeroMasked);
-
-#define FMA3_AVX512_SCALAR_GROUP(Name) \
- FMA3RM(Name##132SSZr, Name##213SSZr, Name##231SSZr, \
- Name##132SSZm, Name##213SSZm, Name##231SSZm); \
- FMA3RM(Name##132SDZr, Name##213SDZr, Name##231SDZr, \
- Name##132SDZm, Name##213SDZm, Name##231SDZm); \
- FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int, \
- Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int, \
- Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk, \
- Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk, \
- Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz, \
- Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz, \
- Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int, \
- X86InstrFMA3Group::X86FMA3Intrinsic); \
- FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KMergeMasked); \
- FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked); \
- FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz, \
- X86InstrFMA3Group::X86FMA3Intrinsic | \
- X86InstrFMA3Group::X86FMA3KZeroMasked);
-
-#define FMA3_AVX512_FULL_GROUP(Name) \
- FMA3_AVX512_VECTOR_GROUP(Name); \
- FMA3_AVX512_SCALAR_GROUP(Name);
-
-void X86InstrFMA3Info::initGroupsOnceImpl() {
- FMA3_AVX2_FULL_GROUP(VFMADD);
- FMA3_AVX2_FULL_GROUP(VFMSUB);
- FMA3_AVX2_FULL_GROUP(VFNMADD);
- FMA3_AVX2_FULL_GROUP(VFNMSUB);
-
- FMA3_AVX2_VECTOR_GROUP(VFMADDSUB);
- FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);
-
- FMA3_AVX512_FULL_GROUP(VFMADD);
- FMA3_AVX512_FULL_GROUP(VFMSUB);
- FMA3_AVX512_FULL_GROUP(VFNMADD);
- FMA3_AVX512_FULL_GROUP(VFNMSUB);
-
- FMA3_AVX512_VECTOR_GROUP(VFMADDSUB);
- FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
+#define FMA3GROUP(Name, Suf, Attrs) \
+ { { X86::Name##132##Suf, X86::Name##213##Suf, X86::Name##231##Suf }, Attrs },
+
+#define FMA3GROUP_MASKED(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \
+ FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked)
+
+#define FMA3GROUP_PACKED_WIDTHS(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Ym, Attrs) \
+ FMA3GROUP(Name, Suf##Yr, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z256m, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z256r, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zm, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zr, Attrs) \
+ FMA3GROUP(Name, Suf##m, Attrs) \
+ FMA3GROUP(Name, Suf##r, Attrs)
+
+#define FMA3GROUP_PACKED(Name, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS(Name, PD, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS(Name, PS, Attrs)
+
+#define FMA3GROUP_SCALAR_WIDTHS(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Zm, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##Zr, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##m, Attrs) \
+ FMA3GROUP(Name, Suf##m_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##r, Attrs) \
+ FMA3GROUP(Name, Suf##r_Int, Attrs | X86InstrFMA3Group::Intrinsic)
+
+#define FMA3GROUP_SCALAR(Name, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) \
+
+#define FMA3GROUP_FULL(Name, Attrs) \
+ FMA3GROUP_PACKED(Name, Attrs) \
+ FMA3GROUP_SCALAR(Name, Attrs)
+
+static const X86InstrFMA3Group Groups[] = {
+ FMA3GROUP_FULL(VFMADD, 0)
+ FMA3GROUP_PACKED(VFMADDSUB, 0)
+ FMA3GROUP_FULL(VFMSUB, 0)
+ FMA3GROUP_PACKED(VFMSUBADD, 0)
+ FMA3GROUP_FULL(VFNMADD, 0)
+ FMA3GROUP_FULL(VFNMSUB, 0)
+};
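(Editorial sketch, not part of the patch.) The FMA3GROUP* macros above are plain token pasting, so each table entry is just a brace-initialized X86InstrFMA3Group row of three opcode enumerators plus an attribute mask. A minimal standalone illustration of the mechanics, using a simplified local Group struct and made-up enumerator values in place of the TableGen-generated X86:: opcodes:

    #include <cstdint>

    // Stand-in for X86InstrFMA3Group; the real X86::VFMADD*PSr values are
    // generated by TableGen, so the numbers below are placeholders only.
    namespace X86 { enum : uint16_t { VFMADD132PSr = 100, VFMADD213PSr, VFMADD231PSr }; }

    struct Group { uint16_t Opcodes[3]; uint16_t Attributes; };

    #define FMA3GROUP(Name, Suf, Attrs) \
      { { X86::Name##132##Suf, X86::Name##213##Suf, X86::Name##231##Suf }, Attrs },

    static const Group Demo[] = {
      FMA3GROUP(VFMADD, PSr, 0) // -> { { X86::VFMADD132PSr, X86::VFMADD213PSr, X86::VFMADD231PSr }, 0 },
    };

    int main() { return Demo[0].Opcodes[0] == X86::VFMADD132PSr ? 0 : 1; }

FMA3GROUP_MASKED then emits three such rows per suffix: the plain Suf row, a Suf##k row tagged KMergeMasked, and a Suf##kz row tagged KZeroMasked, which is how the masked AVX-512 variants end up adjacent in the Groups table.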
+
+#define FMA3GROUP_PACKED_AVX512_WIDTHS(Name, Type, Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z128##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z256##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z##Suf, Attrs)
+
+#define FMA3GROUP_PACKED_AVX512(Name, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
+
+#define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PSZ##Suf, Attrs)
+
+#define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \
+ FMA3GROUP(Name, SDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \
+ FMA3GROUP(Name, SSZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
+
+static const X86InstrFMA3Group BroadcastGroups[] = {
+ FMA3GROUP_PACKED_AVX512(VFMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMADDSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMSUBADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFNMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFNMSUB, mb, 0)
+};
+
+static const X86InstrFMA3Group RoundGroups[] = {
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMADD, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFMADD, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMADDSUB, rb, 0)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMSUB, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFMSUB, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMSUBADD, rb, 0)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFNMADD, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFNMADD, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFNMSUB, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFNMSUB, rb, X86InstrFMA3Group::Intrinsic)
+};
+
+static void verifyTables() {
+#ifndef NDEBUG
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
+ assert(std::is_sorted(std::begin(Groups), std::end(Groups)) &&
+ std::is_sorted(std::begin(RoundGroups), std::end(RoundGroups)) &&
+ std::is_sorted(std::begin(BroadcastGroups),
+ std::end(BroadcastGroups)) &&
+ "FMA3 tables not sorted!");
+ TableChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
}
-void X86InstrFMA3Info::initGroupsOnce() {
- llvm::call_once(InitGroupsOnceFlag,
- []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); });
+/// Returns a reference to the group of FMA3 opcodes in which the given
+/// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
+/// and is not included in any FMA3 group, then nullptr is returned.
+const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) {
+
+ // FMA3 instructions have a well-defined encoding pattern that we can exploit.
+ uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+ bool IsFMA3 = ((TSFlags & X86II::EncodingMask) == X86II::VEX ||
+ (TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
+ (TSFlags & X86II::OpMapMask) == X86II::T8 &&
+ (TSFlags & X86II::OpPrefixMask) == X86II::PD &&
+ ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) ||
+ (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) ||
+ (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF));
+ if (!IsFMA3)
+ return nullptr;
+
+ verifyTables();
+
+ ArrayRef<X86InstrFMA3Group> Table;
+ if (TSFlags & X86II::EVEX_RC)
+ Table = makeArrayRef(RoundGroups);
+ else if (TSFlags & X86II::EVEX_B)
+ Table = makeArrayRef(BroadcastGroups);
+ else
+ Table = makeArrayRef(Groups);
+
+ // FMA 132 instructions have an opcode of 0x96-0x9F
+ // FMA 213 instructions have an opcode of 0xA6-0xAF
+ // FMA 231 instructions have an opcode of 0xB6-0xBF
+ unsigned FormIndex = ((BaseOpcode - 0x90) >> 4) & 0x3;
+
+ auto I = std::lower_bound(Table.begin(), Table.end(), Opcode,
+ [FormIndex](const X86InstrFMA3Group &Group,
+ unsigned Opcode) {
+ return Group.Opcodes[FormIndex] < Opcode;
+ });
+ assert(I != Table.end() && I->Opcodes[FormIndex] == Opcode &&
+ "Couldn't find FMA3 opcode!");
+ return I;
}
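(Editorial sketch, not part of the patch.) getFMA3Group() above first checks the encoding signature shared by all FMA3 instructions (VEX or EVEX, the T8/0F38 opcode map, the PD/0x66 prefix, and a base opcode in 0x96-0x9F, 0xA6-0xAF, or 0xB6-0xBF) and then derives the {132, 213, 231} form index from the base opcode. A tiny standalone check of that arithmetic, using the VFMADD packed-single opcodes (0x98/0xA8/0xB8) from the X86InstrFMA.td hunk earlier in this diff; formIndex is a hypothetical helper that mirrors the expression in the patch:

    #include <cassert>
    #include <cstdint>

    // Mirrors ((BaseOpcode - 0x90) >> 4) & 0x3 from getFMA3Group().
    static unsigned formIndex(uint8_t BaseOpcode) {
      return ((BaseOpcode - 0x90) >> 4) & 0x3;
    }

    int main() {
      assert(formIndex(0x98) == 0); // 132 forms: 0x96-0x9F
      assert(formIndex(0xA8) == 1); // 213 forms: 0xA6-0xAF
      assert(formIndex(0xB8) == 2); // 231 forms: 0xB6-0xBF
      return 0;
    }

The std::lower_bound lookup that follows relies on the Groups/RoundGroups/BroadcastGroups arrays being sorted, which is what verifyTables() asserts in debug builds.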
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
index e3568160da46..6eec1db98bf8 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -24,294 +24,78 @@
namespace llvm {
/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
-/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
-/// or 6 register and memory opcodes. Also, each group has an attrubutes field
-/// describing it.
-class X86InstrFMA3Group {
-private:
- /// Reference to an array holding 3 forms of register FMA opcodes.
- /// It may be set to nullptr if the group of FMA opcodes does not have
- /// any register form opcodes.
- const uint16_t *RegOpcodes;
-
- /// Reference to an array holding 3 forms of memory FMA opcodes.
- /// It may be set to nullptr if the group of FMA opcodes does not have
- /// any register form opcodes.
- const uint16_t *MemOpcodes;
+/// Each of the groups has 3 opcodes. Also, each group has an attributes
+/// field describing it.
+struct X86InstrFMA3Group {
+ /// An array holding 3 forms of FMA opcodes.
+ uint16_t Opcodes[3];
/// This bitfield specifies the attributes associated with the created
/// FMA groups of opcodes.
- unsigned Attributes;
-
- static const unsigned Form132 = 0;
- static const unsigned Form213 = 1;
- static const unsigned Form231 = 2;
-
-public:
- /// This bit must be set in the 'Attributes' field of FMA group if such
- /// group of FMA opcodes consists of FMA intrinsic opcodes.
- static const unsigned X86FMA3Intrinsic = 0x1;
+ uint16_t Attributes;
- /// This bit must be set in the 'Attributes' field of FMA group if such
- /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
- /// passing the elements from the 1st operand to the result of the operation
- /// when the correpondings bits in the k-mask are unset.
- static const unsigned X86FMA3KMergeMasked = 0x2;
-
- /// This bit must be set in the 'Attributes' field of FMA group if such
- /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
- static const unsigned X86FMA3KZeroMasked = 0x4;
-
- /// Constructor. Creates a new group of FMA opcodes with three register form
- /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes.
- /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr,
- /// which means that the created group of FMA opcodes does not have the
- /// corresponding (register or memory) opcodes.
- /// The parameter \p Attr specifies the attributes describing the created
- /// group.
- X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
- unsigned Attr)
- : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
- assert((RegOpcodes || MemOpcodes) &&
- "Cannot create a group not having any opcodes.");
- }
+ enum {
+ Form132,
+ Form213,
+ Form231,
+ };
- /// Returns a memory form opcode that is the equivalent of the given register
- /// form opcode \p RegOpcode. 0 is returned if the group does not have
- /// either register of memory opcodes.
- unsigned getMemOpcode(unsigned RegOpcode) const {
- if (!RegOpcodes || !MemOpcodes)
- return 0;
- for (unsigned Form = 0; Form < 3; Form++)
- if (RegOpcodes[Form] == RegOpcode)
- return MemOpcodes[Form];
- return 0;
- }
+ enum : uint16_t {
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of FMA intrinsic opcodes.
+ Intrinsic = 0x1,
- /// Returns the 132 form of FMA register opcode.
- unsigned getReg132Opcode() const {
- assert(RegOpcodes && "The group does not have register opcodes.");
- return RegOpcodes[Form132];
- }
-
- /// Returns the 213 form of FMA register opcode.
- unsigned getReg213Opcode() const {
- assert(RegOpcodes && "The group does not have register opcodes.");
- return RegOpcodes[Form213];
- }
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
+ /// passing the elements from the 1st operand to the result of the operation
+ /// when the corresponding bits in the k-mask are unset.
+ KMergeMasked = 0x2,
- /// Returns the 231 form of FMA register opcode.
- unsigned getReg231Opcode() const {
- assert(RegOpcodes && "The group does not have register opcodes.");
- return RegOpcodes[Form231];
- }
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
+ KZeroMasked = 0x4,
+ };
- /// Returns the 132 form of FMA memory opcode.
- unsigned getMem132Opcode() const {
- assert(MemOpcodes && "The group does not have memory opcodes.");
- return MemOpcodes[Form132];
+ /// Returns the 132 form of FMA opcode.
+ unsigned get132Opcode() const {
+ return Opcodes[Form132];
}
- /// Returns the 213 form of FMA memory opcode.
- unsigned getMem213Opcode() const {
- assert(MemOpcodes && "The group does not have memory opcodes.");
- return MemOpcodes[Form213];
+ /// Returns the 213 form of FMA opcode.
+ unsigned get213Opcode() const {
+ return Opcodes[Form213];
}
- /// Returns the 231 form of FMA memory opcode.
- unsigned getMem231Opcode() const {
- assert(MemOpcodes && "The group does not have memory opcodes.");
- return MemOpcodes[Form231];
+ /// Returns the 231 form of FMA opcode.
+ unsigned get231Opcode() const {
+ return Opcodes[Form231];
}
/// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
- bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; }
+ bool isIntrinsic() const { return (Attributes & Intrinsic) != 0; }
/// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
bool isKMergeMasked() const {
- return (Attributes & X86FMA3KMergeMasked) != 0;
+ return (Attributes & KMergeMasked) != 0;
}
/// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
- bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; }
+ bool isKZeroMasked() const { return (Attributes & KZeroMasked) != 0; }
/// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
bool isKMasked() const {
- return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0;
- }
-
- /// Returns true iff the given \p Opcode is a register opcode from the
- /// groups of FMA opcodes.
- bool isRegOpcodeFromGroup(unsigned Opcode) const {
- if (!RegOpcodes)
- return false;
- for (unsigned Form = 0; Form < 3; Form++)
- if (Opcode == RegOpcodes[Form])
- return true;
- return false;
+ return (Attributes & (KMergeMasked | KZeroMasked)) != 0;
}
- /// Returns true iff the given \p Opcode is a memory opcode from the
- /// groups of FMA opcodes.
- bool isMemOpcodeFromGroup(unsigned Opcode) const {
- if (!MemOpcodes)
- return false;
- for (unsigned Form = 0; Form < 3; Form++)
- if (Opcode == MemOpcodes[Form])
- return true;
- return false;
+ bool operator<(const X86InstrFMA3Group &RHS) const {
+ return Opcodes[0] < RHS.Opcodes[0];
}
};
-/// This class provides information about all existing FMA3 opcodes
-///
-class X86InstrFMA3Info {
-private:
- /// A map that is used to find the group of FMA opcodes using any FMA opcode
- /// from the group.
- DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
-
- /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
- /// This method can be called many times, but the actual initialization is
- /// called only once.
- static void initGroupsOnce();
-
- /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
- /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
- /// call is not thread safe.
- void initGroupsOnceImpl();
-
- /// Creates one group of FMA opcodes having the register opcodes
- /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
- /// specifies the attributes describing the created group.
- void initRMGroup(const uint16_t *RegOpcodes,
- const uint16_t *MemOpcodes, unsigned Attr = 0);
-
- /// Creates one group of FMA opcodes having only the register opcodes
- /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
- /// the created group.
- void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
-
- /// Creates one group of FMA opcodes having only the memory opcodes
- /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
- /// the created group.
- void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
-
-public:
- /// Returns the reference to an object of this class. It is assumed that
- /// only one object may exist.
- static X86InstrFMA3Info *getX86InstrFMA3Info();
-
- /// Constructor. Just creates an object of the class.
- X86InstrFMA3Info() = default;
-
- /// Destructor. Deallocates the memory used for FMA3 Groups.
- ~X86InstrFMA3Info() {
- std::set<const X86InstrFMA3Group *> DeletedGroups;
- auto E = OpcodeToGroup.end();
- for (auto I = OpcodeToGroup.begin(); I != E; I++) {
- const X86InstrFMA3Group *G = I->second;
- if (DeletedGroups.find(G) == DeletedGroups.end()) {
- DeletedGroups.insert(G);
- delete G;
- }
- }
- }
-
- /// Returns a reference to a group of FMA3 opcodes to where the given
- /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
- /// and not included into any FMA3 group, then nullptr is returned.
- static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
- // Ensure that the groups of opcodes are initialized.
- initGroupsOnce();
-
- // Find the group including the given opcode.
- const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
- auto I = FMA3Info->OpcodeToGroup.find(Opcode);
- if (I == FMA3Info->OpcodeToGroup.end())
- return nullptr;
-
- return I->second;
- }
-
- /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
- static bool isFMA3(unsigned Opcode) {
- return getFMA3Group(Opcode) != nullptr;
- }
-
- /// Iterator that is used to walk on FMA register opcodes having memory
- /// form equivalents.
- class rm_iterator {
- private:
- /// Iterator associated with the OpcodeToGroup map. It must always be
- /// initialized with an entry from OpcodeToGroup for which I->first
- /// points to a register FMA opcode and I->second points to a group of
- /// FMA opcodes having memory form equivalent of I->first.
- DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I;
-
- public:
- /// Constructor. Creates rm_iterator. The parameter \p I must be an
- /// iterator to OpcodeToGroup map entry having I->first pointing to
- /// register form FMA opcode and I->second pointing to a group of FMA
- /// opcodes holding memory form equivalent for I->fist.
- rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I)
- : I(I) {}
-
- /// Returns the register form FMA opcode.
- unsigned getRegOpcode() const { return I->first; };
-
- /// Returns the memory form equivalent opcode for FMA register opcode
- /// referenced by I->first.
- unsigned getMemOpcode() const {
- unsigned Opcode = I->first;
- const X86InstrFMA3Group *Group = I->second;
- return Group->getMemOpcode(Opcode);
- }
-
- /// Returns a reference to a group of FMA opcodes.
- const X86InstrFMA3Group *getGroup() const { return I->second; }
-
- bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
- bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
-
- /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
- /// having I->first pointing to register form FMA and I->second pointing
- /// to a group of FMA opcodes holding memory form equivalen for I->first.
- rm_iterator &operator++() {
- auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
- for (++I; I != E; ++I) {
- unsigned RegOpcode = I->first;
- const X86InstrFMA3Group *Group = I->second;
- if (Group->getMemOpcode(RegOpcode) != 0)
- break;
- }
- return *this;
- }
- };
-
- /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map
- /// with a register FMA opcode having memory form opcode equivalent.
- static rm_iterator rm_begin() {
- initGroupsOnce();
- const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
- auto I = FMA3Info->OpcodeToGroup.begin();
- auto E = FMA3Info->OpcodeToGroup.end();
- while (I != E) {
- unsigned Opcode = I->first;
- const X86InstrFMA3Group *G = I->second;
- if (G->getMemOpcode(Opcode) != 0)
- break;
- I++;
- }
- return rm_iterator(I);
- }
-
- /// Returns the last rm_iterator.
- static rm_iterator rm_end() {
- initGroupsOnce();
- return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
- }
-};
+/// Returns a pointer to the group of FMA3 opcodes that contains the given
+/// \p Opcode. If the given \p Opcode is not recognized as FMA3 and is not
+/// included in any FMA3 group, nullptr is returned.
+const X86InstrFMA3Group *getFMA3Group(unsigned Opcode, uint64_t TSFlags);
} // end namespace llvm
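
Editor's note: the hunk above replaces the old per-group register/memory opcode arrays with a single three-entry Opcodes array plus an attribute bitmask. A minimal, self-contained C++ sketch of that pattern follows; the struct name, opcode numbers, and main() are illustrative placeholders, not the actual X86InstrFMA3Group class or X86 opcode values.

// Sketch of the "three opcode forms + attribute bitmask" pattern shown above.
// All names and numeric opcodes here are hypothetical.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct FMA3GroupSketch {
  enum : uint16_t { Intrinsic = 0x1, KMergeMasked = 0x2, KZeroMasked = 0x4 };
  enum { Form132, Form213, Form231 };

  uint16_t Opcodes[3];  // one opcode per FMA form; groups sort on Opcodes[0]
  uint16_t Attributes;  // bitwise OR of the flags above

  unsigned get132Opcode() const { return Opcodes[Form132]; }
  unsigned get213Opcode() const { return Opcodes[Form213]; }
  unsigned get231Opcode() const { return Opcodes[Form231]; }

  bool isIntrinsic() const { return (Attributes & Intrinsic) != 0; }
  bool isKMergeMasked() const { return (Attributes & KMergeMasked) != 0; }
  bool isKZeroMasked() const { return (Attributes & KZeroMasked) != 0; }
  bool isKMasked() const {
    return (Attributes & (KMergeMasked | KZeroMasked)) != 0;
  }

  bool operator<(const FMA3GroupSketch &RHS) const {
    return Opcodes[0] < RHS.Opcodes[0];
  }
};

int main() {
  // Hypothetical opcode numbers; a real group would hold X86::VFMADD* values.
  FMA3GroupSketch G = {{0x100, 0x101, 0x102}, FMA3GroupSketch::KMergeMasked};
  assert(G.get213Opcode() == 0x101);
  std::printf("k-masked: %d\n", G.isKMasked());  // prints 1
  return 0;
}
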
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
index 619b399ef8d8..cc81a919ec99 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -118,12 +118,10 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
// f80 instructions cannot use SSE and use neither of these.
-class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin = NoItinerary> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32]>;
-class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin = NoItinerary> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64]>;
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
// Factoring for arithmetic.
multiclass FPBinary_rr<SDNode OpNode> {
@@ -279,6 +277,8 @@ def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+} // SchedRW
+let SchedRW = [WriteFCom] in {
def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
} // SchedRW
@@ -297,46 +297,45 @@ def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
} // SchedRW
// Unary operations.
-multiclass FPUnary<SDNode OpNode, Format fp, string asmstring,
- InstrItinClass itin> {
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
- [(set RFP32:$dst, (OpNode RFP32:$src))], itin>;
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
- [(set RFP64:$dst, (OpNode RFP64:$src))], itin>;
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
- [(set RFP80:$dst, (OpNode RFP80:$src))], itin>;
-def _F : FPI<0xD9, fp, (outs), (ins), asmstring, itin>;
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
}
let Defs = [FPSW] in {
-let SchedRW = [WriteVecLogic] in {
-defm CHS : FPUnary<fneg, MRM_E0, "fchs", IIC_FSIGN>;
-defm ABS : FPUnary<fabs, MRM_E1, "fabs", IIC_FSIGN>;
+let SchedRW = [WriteFSign] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
}
-let SchedRW = [WriteFSqrt] in
-defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt", IIC_FSQRT>;
+let SchedRW = [WriteFSqrt80] in
+defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
let SchedRW = [WriteMicrocoded] in {
-defm SIN : FPUnary<fsin, MRM_FE, "fsin", IIC_FSINCOS>;
-defm COS : FPUnary<fcos, MRM_FF, "fcos", IIC_FSINCOS>;
+defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
+defm COS : FPUnary<fcos, MRM_FF, "fcos">;
}
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCom] in {
let hasSideEffects = 0 in {
def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
} // hasSideEffects
-def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst", IIC_FCOMI>;
+def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
} // SchedRW
} // Defs = [FPSW]
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
-let SchedRW = [WriteFAddLd] in {
+let SchedRW = [WriteFComLd] in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
@@ -363,31 +362,29 @@ def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
} // SchedRW
// Floating point cmovs.
-class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32, HasCMov]>;
-class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin> :
- FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64, HasCMov]>;
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
multiclass FPCMov<PatLeaf cc> {
def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
CondMovFP,
[(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
- cc, EFLAGS))], IIC_FCMOV>;
+ cc, EFLAGS))]>;
def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
CondMovFP,
[(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
- cc, EFLAGS))], IIC_FCMOV>;
+ cc, EFLAGS))]>;
def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
CondMovFP,
[(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
- cc, EFLAGS))], IIC_FCMOV>,
+ cc, EFLAGS))]>,
Requires<[HasCMov]>;
}
let Defs = [FPSW] in {
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCMOV] in {
let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
defm CMOVB : FPCMov<X86_COND_B>;
defm CMOVBE : FPCMov<X86_COND_BE>;
@@ -402,21 +399,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
- "fcmovb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovb\t{$op, %st(0)|st(0), $op}">;
def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
- "fcmovbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovbe\t{$op, %st(0)|st(0), $op}">;
def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
- "fcmove\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmove\t{$op, %st(0)|st(0), $op}">;
def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
- "fcmovu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovu\t{$op, %st(0)|st(0), $op}">;
def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
- "fcmovnb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovnb\t{$op, %st(0)|st(0), $op}">;
def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
- "fcmovnbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
- "fcmovne\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovne\t{$op, %st(0)|st(0), $op}">;
def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
- "fcmovnu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
+ "fcmovnu\t{$op, %st(0)|st(0), $op}">;
} // Predicates = [HasCMov]
} // SchedRW
@@ -495,40 +492,24 @@ def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
} // SchedRW
let mayLoad = 1, SchedRW = [WriteLoad] in {
-def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
- IIC_FLD>;
-def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
- IIC_FLD>;
-def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
- IIC_FLD80>;
-def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
- IIC_FILD>;
-def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
- IIC_FILD>;
-def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
- IIC_FILD>;
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
}
let mayStore = 1, SchedRW = [WriteStore] in {
-def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
- IIC_FST>;
-def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
- IIC_FST>;
-def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
- IIC_FST>;
-def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
- IIC_FST>;
-def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
- IIC_FST80>;
-def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
- IIC_FIST>;
-def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
- IIC_FIST>;
-def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
- IIC_FIST>;
-def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
- IIC_FIST>;
-def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
- IIC_FIST>;
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
}
// FISTTP requires SSE3 even though it's a FPStack op.
@@ -554,20 +535,17 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
} // Predicates = [HasSSE3]
let mayStore = 1, SchedRW = [WriteStore] in {
-def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
- IIC_FST>;
-def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
- IIC_FST>;
-def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
- "fisttp{ll}\t$dst", IIC_FST>;
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">;
}
// FP Stack manipulation instructions.
let SchedRW = [WriteMove] in {
-def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>;
-def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>;
-def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>;
-def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op">;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op">;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op">;
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op">;
}
// Floating point constant loads.
@@ -586,13 +564,22 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-let SchedRW = [WriteZero] in {
-def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>;
-def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>;
-}
+let SchedRW = [WriteFLD0] in
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
+
+let SchedRW = [WriteFLD1] in
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
+
+let SchedRW = [WriteFLDC], Defs = [FPSW] in {
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", []>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
+} // SchedRW
// Floating point compares.
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCom] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
@@ -602,7 +589,7 @@ def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
} // SchedRW
} // Defs = [FPSW]
-let SchedRW = [WriteFAdd] in {
+let SchedRW = [WriteFCom] in {
// CC = ST(0) cmp ST(i)
let Defs = [EFLAGS, FPSW] in {
def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
@@ -615,25 +602,23 @@ def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
let Defs = [FPSW], Uses = [ST0] in {
def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>;
+ (outs), (ins RST:$reg), "fucom\t$reg">;
def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>;
+ (outs), (ins RST:$reg), "fucomp\t$reg">;
def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
- (outs), (ins), "fucompp", IIC_FUCOM>;
+ (outs), (ins), "fucompp">;
}
let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>;
+ (outs), (ins RST:$reg), "fucomi\t$reg">;
def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>;
+ (outs), (ins RST:$reg), "fucompi\t$reg">;
}
let Defs = [EFLAGS, FPSW] in {
-def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg),
- "fcomi\t$reg", IIC_FCOMI>;
-def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg),
- "fcompi\t$reg", IIC_FCOMI>;
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), "fcomi\t$reg">;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), "fcompi\t$reg">;
}
} // SchedRW
@@ -642,71 +627,64 @@ let SchedRW = [WriteALU] in {
let Defs = [AX], Uses = [FPSW] in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
(outs), (ins), "fnstsw\t{%ax|ax}",
- [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
+ [(set AX, (X86fp_stsw FPSW))]>;
let Defs = [FPSW] in
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
- [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+ [(X86fp_cwd_get16 addr:$dst)]>;
} // SchedRW
let Defs = [FPSW], mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16]
- (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", []>,
Sched<[WriteLoad]>;
// FPU control instructions
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
-def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
-def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
- "ffree\t$reg", IIC_FFREE>;
-def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg),
- "ffreep\t$reg", IIC_FFREE>;
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg">;
+def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg">;
// Clear exceptions
-def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>;
} // Defs = [FPSW]
} // SchedRW
-// Operandless floating-point instructions for the disassembler.
-let SchedRW = [WriteMicrocoded] in {
-def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+// Operand-less floating-point instructions for the disassembler.
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", []>, Sched<[WriteNop]>;
+let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
-def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
-def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
-def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
-def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
-def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>;
-def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>;
-def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>;
-def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>;
-def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>;
-def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>;
-def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>;
-def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>;
-def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>;
-def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>;
-def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>;
-def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>;
-def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>;
-def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
-def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
-def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
-def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", []>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", []>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", []>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", []>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", []>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", []>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", []>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", []>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", []>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", []>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", []>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", []>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
} // Defs = [FPSW]
-def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB,
+def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB,
Requires<[HasFXSR]>;
-def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
- IIC_FXSAVE>, TB, Requires<[HasFXSR, In64BitMode]>;
-def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>,
+def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
+ TB, Requires<[HasFXSR, In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
TB, Requires<[HasFXSR]>;
-def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
- IIC_FXRSTOR>, TB, Requires<[HasFXSR, In64BitMode]>;
+def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
+ TB, Requires<[HasFXSR, In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp
new file mode 100644
index 000000000000..5d8400595bfa
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -0,0 +1,5412 @@
+//===-- X86InstrFoldTables.cpp - X86 Instruction Folding Tables -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 memory folding tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFoldTables.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include <vector>
+
+using namespace llvm;
+
+// These tables are sorted by their RegOp value, allowing them to be binary
+// searched at runtime without the need for additional storage. The enum
+// values are currently emitted in X86GenInstrInfo.inc in alphabetical order,
+// which makes sorting these tables a simple matter of alphabetizing the
+// entries.
+//
+// We also have a tablegen emitter that tries to autogenerate these tables
+// by comparing encoding information. This can be enabled by passing
+// X86_GEN_FOLD_TABLES=ON to cmake, which will produce X86GenFoldTables.inc
+// in the build area. There are currently some bugs in the autogenerated
+// table, so entries must be manually reviewed before being copied from the
+// autogenerated table into this one. It is unclear if we will ever be able
+// to fully automate this, because as new instructions are added into holes
+// in the X86 opcode map they potentially pair up with old instructions and
+// create new, incorrect entries in the tables. The manual review process
+// gives us a chance to catch these before they become observable bugs.
+static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADC16ri, X86::ADC16mi, 0 },
+ { X86::ADC16ri8, X86::ADC16mi8, 0 },
+ { X86::ADC16rr, X86::ADC16mr, 0 },
+ { X86::ADC32ri, X86::ADC32mi, 0 },
+ { X86::ADC32ri8, X86::ADC32mi8, 0 },
+ { X86::ADC32rr, X86::ADC32mr, 0 },
+ { X86::ADC64ri32, X86::ADC64mi32, 0 },
+ { X86::ADC64ri8, X86::ADC64mi8, 0 },
+ { X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADC8ri, X86::ADC8mi, 0 },
+ { X86::ADC8ri8, X86::ADC8mi8, 0 },
+ { X86::ADC8rr, X86::ADC8mr, 0 },
+ { X86::ADD16ri, X86::ADD16mi, 0 },
+ { X86::ADD16ri8, X86::ADD16mi8, 0 },
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16rr, X86::ADD16mr, 0 },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri, X86::ADD32mi, 0 },
+ { X86::ADD32ri8, X86::ADD32mi8, 0 },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32mr, 0 },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32, X86::ADD64mi32, 0 },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8, X86::ADD64mi8, 0 },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64mr, 0 },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8ri8, X86::ADD8mi8, 0 },
+ { X86::ADD8rr, X86::ADD8mr, 0 },
+ { X86::AND16ri, X86::AND16mi, 0 },
+ { X86::AND16ri8, X86::AND16mi8, 0 },
+ { X86::AND16rr, X86::AND16mr, 0 },
+ { X86::AND32ri, X86::AND32mi, 0 },
+ { X86::AND32ri8, X86::AND32mi8, 0 },
+ { X86::AND32rr, X86::AND32mr, 0 },
+ { X86::AND64ri32, X86::AND64mi32, 0 },
+ { X86::AND64ri8, X86::AND64mi8, 0 },
+ { X86::AND64rr, X86::AND64mr, 0 },
+ { X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8ri8, X86::AND8mi8, 0 },
+ { X86::AND8rr, X86::AND8mr, 0 },
+ { X86::BTC16ri8, X86::BTC16mi8, 0 },
+ { X86::BTC32ri8, X86::BTC32mi8, 0 },
+ { X86::BTC64ri8, X86::BTC64mi8, 0 },
+ { X86::BTR16ri8, X86::BTR16mi8, 0 },
+ { X86::BTR32ri8, X86::BTR32mi8, 0 },
+ { X86::BTR64ri8, X86::BTR64mi8, 0 },
+ { X86::BTS16ri8, X86::BTS16mi8, 0 },
+ { X86::BTS32ri8, X86::BTS32mi8, 0 },
+ { X86::BTS64ri8, X86::BTS64mi8, 0 },
+ { X86::DEC16r, X86::DEC16m, 0 },
+ { X86::DEC32r, X86::DEC32m, 0 },
+ { X86::DEC64r, X86::DEC64m, 0 },
+ { X86::DEC8r, X86::DEC8m, 0 },
+ { X86::INC16r, X86::INC16m, 0 },
+ { X86::INC32r, X86::INC32m, 0 },
+ { X86::INC64r, X86::INC64m, 0 },
+ { X86::INC8r, X86::INC8m, 0 },
+ { X86::NEG16r, X86::NEG16m, 0 },
+ { X86::NEG32r, X86::NEG32m, 0 },
+ { X86::NEG64r, X86::NEG64m, 0 },
+ { X86::NEG8r, X86::NEG8m, 0 },
+ { X86::NOT16r, X86::NOT16m, 0 },
+ { X86::NOT32r, X86::NOT32m, 0 },
+ { X86::NOT64r, X86::NOT64m, 0 },
+ { X86::NOT8r, X86::NOT8m, 0 },
+ { X86::OR16ri, X86::OR16mi, 0 },
+ { X86::OR16ri8, X86::OR16mi8, 0 },
+ { X86::OR16rr, X86::OR16mr, 0 },
+ { X86::OR32ri, X86::OR32mi, 0 },
+ { X86::OR32ri8, X86::OR32mi8, 0 },
+ { X86::OR32rr, X86::OR32mr, 0 },
+ { X86::OR64ri32, X86::OR64mi32, 0 },
+ { X86::OR64ri8, X86::OR64mi8, 0 },
+ { X86::OR64rr, X86::OR64mr, 0 },
+ { X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8ri8, X86::OR8mi8, 0 },
+ { X86::OR8rr, X86::OR8mr, 0 },
+ { X86::RCL16r1, X86::RCL16m1, 0 },
+ { X86::RCL16rCL, X86::RCL16mCL, 0 },
+ { X86::RCL16ri, X86::RCL16mi, 0 },
+ { X86::RCL32r1, X86::RCL32m1, 0 },
+ { X86::RCL32rCL, X86::RCL32mCL, 0 },
+ { X86::RCL32ri, X86::RCL32mi, 0 },
+ { X86::RCL64r1, X86::RCL64m1, 0 },
+ { X86::RCL64rCL, X86::RCL64mCL, 0 },
+ { X86::RCL64ri, X86::RCL64mi, 0 },
+ { X86::RCL8r1, X86::RCL8m1, 0 },
+ { X86::RCL8rCL, X86::RCL8mCL, 0 },
+ { X86::RCL8ri, X86::RCL8mi, 0 },
+ { X86::RCR16r1, X86::RCR16m1, 0 },
+ { X86::RCR16rCL, X86::RCR16mCL, 0 },
+ { X86::RCR16ri, X86::RCR16mi, 0 },
+ { X86::RCR32r1, X86::RCR32m1, 0 },
+ { X86::RCR32rCL, X86::RCR32mCL, 0 },
+ { X86::RCR32ri, X86::RCR32mi, 0 },
+ { X86::RCR64r1, X86::RCR64m1, 0 },
+ { X86::RCR64rCL, X86::RCR64mCL, 0 },
+ { X86::RCR64ri, X86::RCR64mi, 0 },
+ { X86::RCR8r1, X86::RCR8m1, 0 },
+ { X86::RCR8rCL, X86::RCR8mCL, 0 },
+ { X86::RCR8ri, X86::RCR8mi, 0 },
+ { X86::ROL16r1, X86::ROL16m1, 0 },
+ { X86::ROL16rCL, X86::ROL16mCL, 0 },
+ { X86::ROL16ri, X86::ROL16mi, 0 },
+ { X86::ROL32r1, X86::ROL32m1, 0 },
+ { X86::ROL32rCL, X86::ROL32mCL, 0 },
+ { X86::ROL32ri, X86::ROL32mi, 0 },
+ { X86::ROL64r1, X86::ROL64m1, 0 },
+ { X86::ROL64rCL, X86::ROL64mCL, 0 },
+ { X86::ROL64ri, X86::ROL64mi, 0 },
+ { X86::ROL8r1, X86::ROL8m1, 0 },
+ { X86::ROL8rCL, X86::ROL8mCL, 0 },
+ { X86::ROL8ri, X86::ROL8mi, 0 },
+ { X86::ROR16r1, X86::ROR16m1, 0 },
+ { X86::ROR16rCL, X86::ROR16mCL, 0 },
+ { X86::ROR16ri, X86::ROR16mi, 0 },
+ { X86::ROR32r1, X86::ROR32m1, 0 },
+ { X86::ROR32rCL, X86::ROR32mCL, 0 },
+ { X86::ROR32ri, X86::ROR32mi, 0 },
+ { X86::ROR64r1, X86::ROR64m1, 0 },
+ { X86::ROR64rCL, X86::ROR64mCL, 0 },
+ { X86::ROR64ri, X86::ROR64mi, 0 },
+ { X86::ROR8r1, X86::ROR8m1, 0 },
+ { X86::ROR8rCL, X86::ROR8mCL, 0 },
+ { X86::ROR8ri, X86::ROR8mi, 0 },
+ { X86::SAR16r1, X86::SAR16m1, 0 },
+ { X86::SAR16rCL, X86::SAR16mCL, 0 },
+ { X86::SAR16ri, X86::SAR16mi, 0 },
+ { X86::SAR32r1, X86::SAR32m1, 0 },
+ { X86::SAR32rCL, X86::SAR32mCL, 0 },
+ { X86::SAR32ri, X86::SAR32mi, 0 },
+ { X86::SAR64r1, X86::SAR64m1, 0 },
+ { X86::SAR64rCL, X86::SAR64mCL, 0 },
+ { X86::SAR64ri, X86::SAR64mi, 0 },
+ { X86::SAR8r1, X86::SAR8m1, 0 },
+ { X86::SAR8rCL, X86::SAR8mCL, 0 },
+ { X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB16ri, X86::SBB16mi, 0 },
+ { X86::SBB16ri8, X86::SBB16mi8, 0 },
+ { X86::SBB16rr, X86::SBB16mr, 0 },
+ { X86::SBB32ri, X86::SBB32mi, 0 },
+ { X86::SBB32ri8, X86::SBB32mi8, 0 },
+ { X86::SBB32rr, X86::SBB32mr, 0 },
+ { X86::SBB64ri32, X86::SBB64mi32, 0 },
+ { X86::SBB64ri8, X86::SBB64mi8, 0 },
+ { X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SBB8ri, X86::SBB8mi, 0 },
+ { X86::SBB8ri8, X86::SBB8mi8, 0 },
+ { X86::SBB8rr, X86::SBB8mr, 0 },
+ { X86::SHL16r1, X86::SHL16m1, 0 },
+ { X86::SHL16rCL, X86::SHL16mCL, 0 },
+ { X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32r1, X86::SHL32m1, 0 },
+ { X86::SHL32rCL, X86::SHL32mCL, 0 },
+ { X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64r1, X86::SHL64m1, 0 },
+ { X86::SHL64rCL, X86::SHL64mCL, 0 },
+ { X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8r1, X86::SHL8m1, 0 },
+ { X86::SHL8rCL, X86::SHL8mCL, 0 },
+ { X86::SHL8ri, X86::SHL8mi, 0 },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
+ { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
+ { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
+ { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
+ { X86::SHR16r1, X86::SHR16m1, 0 },
+ { X86::SHR16rCL, X86::SHR16mCL, 0 },
+ { X86::SHR16ri, X86::SHR16mi, 0 },
+ { X86::SHR32r1, X86::SHR32m1, 0 },
+ { X86::SHR32rCL, X86::SHR32mCL, 0 },
+ { X86::SHR32ri, X86::SHR32mi, 0 },
+ { X86::SHR64r1, X86::SHR64m1, 0 },
+ { X86::SHR64rCL, X86::SHR64mCL, 0 },
+ { X86::SHR64ri, X86::SHR64mi, 0 },
+ { X86::SHR8r1, X86::SHR8m1, 0 },
+ { X86::SHR8rCL, X86::SHR8mCL, 0 },
+ { X86::SHR8ri, X86::SHR8mi, 0 },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
+ { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
+ { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
+ { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
+ { X86::SUB16ri, X86::SUB16mi, 0 },
+ { X86::SUB16ri8, X86::SUB16mi8, 0 },
+ { X86::SUB16rr, X86::SUB16mr, 0 },
+ { X86::SUB32ri, X86::SUB32mi, 0 },
+ { X86::SUB32ri8, X86::SUB32mi8, 0 },
+ { X86::SUB32rr, X86::SUB32mr, 0 },
+ { X86::SUB64ri32, X86::SUB64mi32, 0 },
+ { X86::SUB64ri8, X86::SUB64mi8, 0 },
+ { X86::SUB64rr, X86::SUB64mr, 0 },
+ { X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8ri8, X86::SUB8mi8, 0 },
+ { X86::SUB8rr, X86::SUB8mr, 0 },
+ { X86::XOR16ri, X86::XOR16mi, 0 },
+ { X86::XOR16ri8, X86::XOR16mi8, 0 },
+ { X86::XOR16rr, X86::XOR16mr, 0 },
+ { X86::XOR32ri, X86::XOR32mi, 0 },
+ { X86::XOR32ri8, X86::XOR32mi8, 0 },
+ { X86::XOR32rr, X86::XOR32mr, 0 },
+ { X86::XOR64ri32, X86::XOR64mi32, 0 },
+ { X86::XOR64ri8, X86::XOR64mi8, 0 },
+ { X86::XOR64rr, X86::XOR64mr, 0 },
+ { X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8ri8, X86::XOR8mi8, 0 },
+ { X86::XOR8rr, X86::XOR8mr, 0 }
+};
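
Editor's note: the file-level comment above explains that these tables stay sorted by their register opcode so they can be binary searched without extra storage. A small self-contained sketch of such a lookup follows; the entry layout, helper name, and opcode numbers are assumptions made for illustration, whereas the real code uses the X86MemoryFoldTableEntry type and X86 opcode enums.

// Sketch of a binary-search lookup over a fold table sorted by register
// opcode. Struct layout, helper name, and values are illustrative only.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>

struct FoldEntrySketch {
  uint16_t RegOp;  // key: register-form opcode
  uint16_t MemOp;  // value: memory-form opcode
  uint16_t Flags;  // TB_*-style flags
};

// The table must be sorted by RegOp for std::lower_bound to be valid.
static const FoldEntrySketch TableSketch[] = {
    {10, 110, 0},
    {20, 120, 0},
    {30, 130, 0},
};

static const FoldEntrySketch *lookupFoldEntrySketch(uint16_t RegOp) {
  const FoldEntrySketch *First = std::begin(TableSketch);
  const FoldEntrySketch *Last = std::end(TableSketch);
  const FoldEntrySketch *I = std::lower_bound(
      First, Last, RegOp,
      [](const FoldEntrySketch &E, uint16_t Op) { return E.RegOp < Op; });
  if (I != Last && I->RegOp == RegOp)
    return I;
  return nullptr;  // opcode has no memory-folded form in this table
}

int main() {
  if (const FoldEntrySketch *E = lookupFoldEntrySketch(20))
    std::printf("mem opcode: %u\n", E->MemOp);  // prints 120
  return 0;
}
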
+
+static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
+ { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
+ { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
+ { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
+ { X86::CALL16r, X86::CALL16m, TB_FOLDED_LOAD },
+ { X86::CALL16r_NT, X86::CALL16m_NT, TB_FOLDED_LOAD },
+ { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
+ { X86::CALL32r_NT, X86::CALL32m_NT, TB_FOLDED_LOAD },
+ { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
+ { X86::CALL64r_NT, X86::CALL64m_NT, TB_FOLDED_LOAD },
+ { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
+ { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
+ { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
+ { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
+ { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
+ { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
+ { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
+ { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
+ { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
+ { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
+ { X86::CMP8ri8, X86::CMP8mi8, TB_FOLDED_LOAD },
+ { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
+ { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
+ { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
+ { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
+ { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
+ { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
+ { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
+ { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
+ { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
+ { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
+ { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
+ { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
+ { X86::JMP16r, X86::JMP16m, TB_FOLDED_LOAD },
+ { X86::JMP16r_NT, X86::JMP16m_NT, TB_FOLDED_LOAD },
+ { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
+ { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD },
+ { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
+ { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD },
+ { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
+ { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
+ { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
+ { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
+ { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
+ { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
+ { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+ { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
+ { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
+ { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
+ { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
+ { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
+ { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
+ { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PTWRITE64r, X86::PTWRITE64m, TB_FOLDED_LOAD },
+ { X86::PTWRITEr, X86::PTWRITEm, TB_FOLDED_LOAD },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
+ { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
+ { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
+ { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
+ { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
+ { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
+ { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
+ { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
+ { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
+ { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
+ { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
+ { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
+ { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
+ { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
+ { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
+ { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
+ { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
+ { X86::TCRETURNri, X86::TCRETURNmi, TB_FOLDED_LOAD | TB_NO_FORWARD },
+ { X86::TCRETURNri64, X86::TCRETURNmi64, TB_FOLDED_LOAD | TB_NO_FORWARD },
+ { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD },
+ { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST32rr, X86::TEST32mr, TB_FOLDED_LOAD },
+ { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST64rr, X86::TEST64mr, TB_FOLDED_LOAD },
+ { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+ { X86::TEST8rr, X86::TEST8mr, TB_FOLDED_LOAD },
+ { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHZ256rr, X86::VCVTPS2PHZ256mr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHZrr, X86::VCVTPS2PHZmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x4Z256rr, X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x4Zrr, X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x8Zrr, X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Z256rr, X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Zrr, X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x4Zrr, X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Z256rr, X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Zrr, X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x8Zrr, X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Z256rr, X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Zrr, X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
+ { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVSDto64rr, X86::VMOVSDto64mr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
+ { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+ { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
+ { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
+ { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
+ { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
+ { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
+ { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
+ { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
+ { X86::BLCI32rr, X86::BLCI32rm, 0 },
+ { X86::BLCI64rr, X86::BLCI64rm, 0 },
+ { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
+ { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
+ { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
+ { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
+ { X86::BLCS32rr, X86::BLCS32rm, 0 },
+ { X86::BLCS64rr, X86::BLCS64rm, 0 },
+ { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
+ { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
+ { X86::BLSI32rr, X86::BLSI32rm, 0 },
+ { X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
+ { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
+ { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
+ { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
+ { X86::BLSR32rr, X86::BLSR32rm, 0 },
+ { X86::BLSR64rr, X86::BLSR64rm, 0 },
+ { X86::BSF16rr, X86::BSF16rm, 0 },
+ { X86::BSF32rr, X86::BSF32rm, 0 },
+ { X86::BSF64rr, X86::BSF64rm, 0 },
+ { X86::BSR16rr, X86::BSR16rm, 0 },
+ { X86::BSR32rr, X86::BSR32rm, 0 },
+ { X86::BSR64rr, X86::BSR64rm, 0 },
+ { X86::BZHI32rr, X86::BZHI32rm, 0 },
+ { X86::BZHI64rr, X86::BZHI64rm, 0 },
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::COMISDrr, X86::COMISDrm, 0 },
+ { X86::COMISDrr_Int, X86::COMISDrm_Int, TB_NO_REVERSE },
+ { X86::COMISSrr, X86::COMISSrm, 0 },
+ { X86::COMISSrr_Int, X86::COMISSrm_Int, TB_NO_REVERSE },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
+ { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
+ { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SI64rr_Int, X86::CVTTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSD2SIrr_Int, X86::CVTTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SI64rr_Int, X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::LWPINS32rri, X86::LWPINS32rmi, 0 },
+ { X86::LWPINS64rri, X86::LWPINS64rmi, 0 },
+ { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 },
+ { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 },
+ { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
+ { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
+ { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
+ { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, TB_ALIGN_16 },
+ { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
+ { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, TB_NO_REVERSE },
+ { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, TB_ALIGN_16 },
+ { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, TB_NO_REVERSE },
+ { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
+ { X86::MMX_PABSBrr, X86::MMX_PABSBrm, 0 },
+ { X86::MMX_PABSDrr, X86::MMX_PABSDrm, 0 },
+ { X86::MMX_PABSWrr, X86::MMX_PABSWrm, 0 },
+ { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX32rr8_NOREX, X86::MOVSX32rm8_NOREX, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::MOVZX32rr8_NOREX, X86::MOVZX32rm8_NOREX, 0 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
+ { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
+ { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
+ { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
+ { X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0 },
+ { X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0 },
+ { X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0 },
+ { X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0 },
+ { X86::PF2IDrr, X86::PF2IDrm, 0 },
+ { X86::PF2IWrr, X86::PF2IWrm, 0 },
+ { X86::PFRCPrr, X86::PFRCPrm, 0 },
+ { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
+ { X86::PHMINPOSUWrr, X86::PHMINPOSUWrm, TB_ALIGN_16 },
+ { X86::PI2FDrr, X86::PI2FDrm, 0 },
+ { X86::PI2FWrr, X86::PI2FWrm, 0 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
+ { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
+ { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
+ { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
+ { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
+ { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
+ { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
+ { X86::RCPSSr, X86::RCPSSm, 0 },
+ { X86::RORX32ri, X86::RORX32mi, 0 },
+ { X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
+ { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
+ { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
+ { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
+ { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
+ { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
+ { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
+ { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISDrr_Int, X86::UCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
+ { X86::UCOMISSrr_Int, X86::UCOMISSrm_Int, TB_NO_REVERSE },
+ { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
+ { X86::VAESKEYGENASSIST128rr,X86::VAESKEYGENASSIST128rm,0 },
+ { X86::VBROADCASTF32X2Z256r, X86::VBROADCASTF32X2Z256m, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zr, X86::VBROADCASTF32X2Zm, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128r, X86::VBROADCASTI32X2Z128m, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256r, X86::VBROADCASTI32X2Z256m, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zr, X86::VBROADCASTI32X2Zm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+ { X86::VCOMISDZrr, X86::VCOMISDZrm, 0 },
+ { X86::VCOMISDZrr_Int, X86::VCOMISDZrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISDrr, X86::VCOMISDrm, 0 },
+ { X86::VCOMISDrr_Int, X86::VCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISSZrr, X86::VCOMISSZrm, 0 },
+ { X86::VCOMISSZrr_Int, X86::VCOMISSZrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISSrr, X86::VCOMISSrm, 0 },
+ { X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
+ { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
+ { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
+ { X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rm, 0 },
+ { X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rm, 0 },
+ { X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0 },
+ { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
+ { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
+ { X86::VCVTPD2DQZ128rr, X86::VCVTPD2DQZ128rm, 0 },
+ { X86::VCVTPD2DQZ256rr, X86::VCVTPD2DQZ256rm, 0 },
+ { X86::VCVTPD2DQZrr, X86::VCVTPD2DQZrm, 0 },
+ { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
+ { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
+ { X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rm, 0 },
+ { X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rm, 0 },
+ { X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrm, 0 },
+ { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
+ { X86::VCVTPD2QQZ128rr, X86::VCVTPD2QQZ128rm, 0 },
+ { X86::VCVTPD2QQZ256rr, X86::VCVTPD2QQZ256rm, 0 },
+ { X86::VCVTPD2QQZrr, X86::VCVTPD2QQZrm, 0 },
+ { X86::VCVTPD2UDQZ128rr, X86::VCVTPD2UDQZ128rm, 0 },
+ { X86::VCVTPD2UDQZ256rr, X86::VCVTPD2UDQZ256rm, 0 },
+ { X86::VCVTPD2UDQZrr, X86::VCVTPD2UDQZrm, 0 },
+ { X86::VCVTPD2UQQZ128rr, X86::VCVTPD2UQQZ128rm, 0 },
+ { X86::VCVTPD2UQQZ256rr, X86::VCVTPD2UQQZ256rm, 0 },
+ { X86::VCVTPD2UQQZrr, X86::VCVTPD2UQQZrm, 0 },
+ { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
+ { X86::VCVTPH2PSZ128rr, X86::VCVTPH2PSZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rr, X86::VCVTPH2PSZ256rm, 0 },
+ { X86::VCVTPH2PSZrr, X86::VCVTPH2PSZrm, 0 },
+ { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, TB_NO_REVERSE },
+ { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
+ { X86::VCVTPS2DQZ128rr, X86::VCVTPS2DQZ128rm, 0 },
+ { X86::VCVTPS2DQZ256rr, X86::VCVTPS2DQZ256rm, 0 },
+ { X86::VCVTPS2DQZrr, X86::VCVTPS2DQZrm, 0 },
+ { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
+ { X86::VCVTPS2PDZ128rr, X86::VCVTPS2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rr, X86::VCVTPS2PDZ256rm, 0 },
+ { X86::VCVTPS2PDZrr, X86::VCVTPS2PDZrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ128rr, X86::VCVTPS2QQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rr, X86::VCVTPS2QQZ256rm, 0 },
+ { X86::VCVTPS2QQZrr, X86::VCVTPS2QQZrm, 0 },
+ { X86::VCVTPS2UDQZ128rr, X86::VCVTPS2UDQZ128rm, 0 },
+ { X86::VCVTPS2UDQZ256rr, X86::VCVTPS2UDQZ256rm, 0 },
+ { X86::VCVTPS2UDQZrr, X86::VCVTPS2UDQZrm, 0 },
+ { X86::VCVTPS2UQQZ128rr, X86::VCVTPS2UQQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rr, X86::VCVTPS2UQQZ256rm, 0 },
+ { X86::VCVTPS2UQQZrr, X86::VCVTPS2UQQZrm, 0 },
+ { X86::VCVTQQ2PDZ128rr, X86::VCVTQQ2PDZ128rm, 0 },
+ { X86::VCVTQQ2PDZ256rr, X86::VCVTQQ2PDZ256rm, 0 },
+ { X86::VCVTQQ2PDZrr, X86::VCVTQQ2PDZrm, 0 },
+ { X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rm, 0 },
+ { X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rm, 0 },
+ { X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrm, 0 },
+ { X86::VCVTSD2SI64Zrr_Int, X86::VCVTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIZrr_Int, X86::VCVTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2USI64Zrr_Int, X86::VCVTSD2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2USIZrr_Int, X86::VCVTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64Zrr_Int, X86::VCVTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIZrr_Int, X86::VCVTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2USI64Zrr_Int, X86::VCVTSS2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2USIZrr_Int, X86::VCVTSS2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
+ { X86::VCVTTPD2DQZ128rr, X86::VCVTTPD2DQZ128rm, 0 },
+ { X86::VCVTTPD2DQZ256rr, X86::VCVTTPD2DQZ256rm, 0 },
+ { X86::VCVTTPD2DQZrr, X86::VCVTTPD2DQZrm, 0 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
+ { X86::VCVTTPD2QQZ128rr, X86::VCVTTPD2QQZ128rm, 0 },
+ { X86::VCVTTPD2QQZ256rr, X86::VCVTTPD2QQZ256rm, 0 },
+ { X86::VCVTTPD2QQZrr, X86::VCVTTPD2QQZrm, 0 },
+ { X86::VCVTTPD2UDQZ128rr, X86::VCVTTPD2UDQZ128rm, 0 },
+ { X86::VCVTTPD2UDQZ256rr, X86::VCVTTPD2UDQZ256rm, 0 },
+ { X86::VCVTTPD2UDQZrr, X86::VCVTTPD2UDQZrm, 0 },
+ { X86::VCVTTPD2UQQZ128rr, X86::VCVTTPD2UQQZ128rm, 0 },
+ { X86::VCVTTPD2UQQZ256rr, X86::VCVTTPD2UQQZ256rm, 0 },
+ { X86::VCVTTPD2UQQZrr, X86::VCVTTPD2UQQZrm, 0 },
+ { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
+ { X86::VCVTTPS2DQZ128rr, X86::VCVTTPS2DQZ128rm, 0 },
+ { X86::VCVTTPS2DQZ256rr, X86::VCVTTPS2DQZ256rm, 0 },
+ { X86::VCVTTPS2DQZrr, X86::VCVTTPS2DQZrm, 0 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
+ { X86::VCVTTPS2QQZ128rr, X86::VCVTTPS2QQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rr, X86::VCVTTPS2QQZ256rm, 0 },
+ { X86::VCVTTPS2QQZrr, X86::VCVTTPS2QQZrm, 0 },
+ { X86::VCVTTPS2UDQZ128rr, X86::VCVTTPS2UDQZ128rm, 0 },
+ { X86::VCVTTPS2UDQZ256rr, X86::VCVTTPS2UDQZ256rm, 0 },
+ { X86::VCVTTPS2UDQZrr, X86::VCVTTPS2UDQZrm, 0 },
+ { X86::VCVTTPS2UQQZ128rr, X86::VCVTTPS2UQQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rr, X86::VCVTTPS2UQQZ256rm, 0 },
+ { X86::VCVTTPS2UQQZrr, X86::VCVTTPS2UQQZrm, 0 },
+ { X86::VCVTTSD2SI64Zrr, X86::VCVTTSD2SI64Zrm, 0 },
+ { X86::VCVTTSD2SI64Zrr_Int, X86::VCVTTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::VCVTTSD2SI64rr_Int, X86::VCVTTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SIZrm, 0 },
+ { X86::VCVTTSD2SIZrr_Int, X86::VCVTTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSD2SIrr_Int, X86::VCVTTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2USI64Zrr, X86::VCVTTSD2USI64Zrm, 0 },
+ { X86::VCVTTSD2USI64Zrr_Int, X86::VCVTTSD2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2USIZrr, X86::VCVTTSD2USIZrm, 0 },
+ { X86::VCVTTSD2USIZrr_Int, X86::VCVTTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64Zrr, X86::VCVTTSS2SI64Zrm, 0 },
+ { X86::VCVTTSS2SI64Zrr_Int, X86::VCVTTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::VCVTTSS2SI64rr_Int, X86::VCVTTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SIZrm, 0 },
+ { X86::VCVTTSS2SIZrr_Int, X86::VCVTTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::VCVTTSS2SIrr_Int, X86::VCVTTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2USI64Zrr, X86::VCVTTSS2USI64Zrm, 0 },
+ { X86::VCVTTSS2USI64Zrr_Int, X86::VCVTTSS2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2USIZrr, X86::VCVTTSS2USIZrm, 0 },
+ { X86::VCVTTSS2USIZrr_Int, X86::VCVTTSS2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rm, 0 },
+ { X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrm, 0 },
+ { X86::VCVTUDQ2PSZ128rr, X86::VCVTUDQ2PSZ128rm, 0 },
+ { X86::VCVTUDQ2PSZ256rr, X86::VCVTUDQ2PSZ256rm, 0 },
+ { X86::VCVTUDQ2PSZrr, X86::VCVTUDQ2PSZrm, 0 },
+ { X86::VCVTUQQ2PDZ128rr, X86::VCVTUQQ2PDZ128rm, 0 },
+ { X86::VCVTUQQ2PDZ256rr, X86::VCVTUQQ2PDZ256rm, 0 },
+ { X86::VCVTUQQ2PDZrr, X86::VCVTUQQ2PDZrm, 0 },
+ { X86::VCVTUQQ2PSZ128rr, X86::VCVTUQQ2PSZ128rm, 0 },
+ { X86::VCVTUQQ2PSZ256rr, X86::VCVTUQQ2PSZ256rm, 0 },
+ { X86::VCVTUQQ2PSZrr, X86::VCVTUQQ2PSZrm, 0 },
+ { X86::VEXP2PDZr, X86::VEXP2PDZm, 0 },
+ { X86::VEXP2PSZr, X86::VEXP2PSZm, 0 },
+ { X86::VEXPANDPDZ128rr, X86::VEXPANDPDZ128rm, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rr, X86::VEXPANDPDZ256rm, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrr, X86::VEXPANDPDZrm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rr, X86::VEXPANDPSZ128rm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rr, X86::VEXPANDPSZ256rm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrr, X86::VEXPANDPSZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0 },
+ { X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0 },
+ { X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0 },
+ { X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rm, 0 },
+ { X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rm, 0 },
+ { X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrm, 0 },
+ { X86::VFPCLASSSDZrr, X86::VFPCLASSSDZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSSSZrr, X86::VFPCLASSSSZrm, TB_NO_REVERSE },
+ { X86::VFRCZPDYrr, X86::VFRCZPDYrm, 0 },
+ { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
+ { X86::VFRCZPSYrr, X86::VFRCZPSYrm, 0 },
+ { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
+ { X86::VFRCZSDrr, X86::VFRCZSDrm, TB_NO_REVERSE },
+ { X86::VFRCZSSrr, X86::VFRCZSSrm, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0 },
+ { X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0 },
+ { X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0 },
+ { X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0 },
+ { X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0 },
+ { X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0 },
+ { X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0 },
+ { X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0 },
+ { X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0 },
+ { X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 },
+ { X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 },
+ { X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
+ { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
+ { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
+ { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
+ { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
+ { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rm, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rr, X86::VMOVDDUPZ256rm, 0 },
+ { X86::VMOVDDUPZrr, X86::VMOVDDUPZrm, 0 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
+ { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
+ { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
+ { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
+ { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
+ { X86::VMOVSHDUPZ128rr, X86::VMOVSHDUPZ128rm, 0 },
+ { X86::VMOVSHDUPZ256rr, X86::VMOVSHDUPZ256rm, 0 },
+ { X86::VMOVSHDUPZrr, X86::VMOVSHDUPZrm, 0 },
+ { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
+ { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
+ { X86::VMOVSLDUPZ128rr, X86::VMOVSLDUPZ128rm, 0 },
+ { X86::VMOVSLDUPZ256rr, X86::VMOVSLDUPZ256rm, 0 },
+ { X86::VMOVSLDUPZrr, X86::VMOVSLDUPZrm, 0 },
+ { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOVZPQILo2PQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
+ { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
+ { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
+ { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
+ { X86::VPABSBrr, X86::VPABSBrm, 0 },
+ { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
+ { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
+ { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
+ { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
+ { X86::VPABSDrr, X86::VPABSDrm, 0 },
+ { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
+ { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
+ { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
+ { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
+ { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
+ { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
+ { X86::VPABSWrr, X86::VPABSWrm, 0 },
+ { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128r, X86::VPBROADCASTBZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256r, X86::VPBROADCASTBZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZr, X86::VPBROADCASTBZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128r, X86::VPBROADCASTDZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256r, X86::VPBROADCASTDZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZr, X86::VPBROADCASTDZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128r, X86::VPBROADCASTQZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256r, X86::VPBROADCASTQZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZr, X86::VPBROADCASTQZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128r, X86::VPBROADCASTWZ128m, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256r, X86::VPBROADCASTWZ256m, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZr, X86::VPBROADCASTWZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
+ { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
+ { X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 },
+ { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
+ { X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0 },
+ { X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0 },
+ { X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0 },
+ { X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0 },
+ { X86::VPCONFLICTQZ128rr, X86::VPCONFLICTQZ128rm, 0 },
+ { X86::VPCONFLICTQZ256rr, X86::VPCONFLICTQZ256rm, 0 },
+ { X86::VPCONFLICTQZrr, X86::VPCONFLICTQZrm, 0 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
+ { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
+ { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
+ { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
+ { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMQYri, X86::VPERMQYmi, 0 },
+ { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPEXPANDBZ128rr, X86::VPEXPANDBZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rr, X86::VPEXPANDBZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrr, X86::VPEXPANDBZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rr, X86::VPEXPANDDZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rr, X86::VPEXPANDDZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrr, X86::VPEXPANDDZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rr, X86::VPEXPANDQZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rr, X86::VPEXPANDQZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrr, X86::VPEXPANDQZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rr, X86::VPEXPANDWZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rr, X86::VPEXPANDWZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrr, X86::VPEXPANDWZrm, TB_NO_REVERSE },
+ { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
+ { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
+ { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
+ { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
+ { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
+ { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
+ { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
+ { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
+ { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
+ { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
+ { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
+ { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
+ { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 },
+ { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
+ { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
+ { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
+ { X86::VPLZCNTDZ128rr, X86::VPLZCNTDZ128rm, 0 },
+ { X86::VPLZCNTDZ256rr, X86::VPLZCNTDZ256rm, 0 },
+ { X86::VPLZCNTDZrr, X86::VPLZCNTDZrm, 0 },
+ { X86::VPLZCNTQZ128rr, X86::VPLZCNTQZ128rm, 0 },
+ { X86::VPLZCNTQZ256rr, X86::VPLZCNTQZ256rm, 0 },
+ { X86::VPLZCNTQZrr, X86::VPLZCNTQZrm, 0 },
+ { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
+ { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
+ { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
+ { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
+ { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
+ { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
+ { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
+ { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
+ { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
+ { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
+ { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
+ { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
+ { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
+ { X86::VPOPCNTBZ128rr, X86::VPOPCNTBZ128rm, 0 },
+ { X86::VPOPCNTBZ256rr, X86::VPOPCNTBZ256rm, 0 },
+ { X86::VPOPCNTBZrr, X86::VPOPCNTBZrm, 0 },
+ { X86::VPOPCNTDZ128rr, X86::VPOPCNTDZ128rm, 0 },
+ { X86::VPOPCNTDZ256rr, X86::VPOPCNTDZ256rm, 0 },
+ { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 },
+ { X86::VPOPCNTQZ128rr, X86::VPOPCNTQZ128rm, 0 },
+ { X86::VPOPCNTQZ256rr, X86::VPOPCNTQZ256rm, 0 },
+ { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 },
+ { X86::VPOPCNTWZ128rr, X86::VPOPCNTWZ128rm, 0 },
+ { X86::VPOPCNTWZ256rr, X86::VPOPCNTWZ256rm, 0 },
+ { X86::VPOPCNTWZrr, X86::VPOPCNTWZrm, 0 },
+ { X86::VPROLDZ128ri, X86::VPROLDZ128mi, 0 },
+ { X86::VPROLDZ256ri, X86::VPROLDZ256mi, 0 },
+ { X86::VPROLDZri, X86::VPROLDZmi, 0 },
+ { X86::VPROLQZ128ri, X86::VPROLQZ128mi, 0 },
+ { X86::VPROLQZ256ri, X86::VPROLQZ256mi, 0 },
+ { X86::VPROLQZri, X86::VPROLQZmi, 0 },
+ { X86::VPRORDZ128ri, X86::VPRORDZ128mi, 0 },
+ { X86::VPRORDZ256ri, X86::VPRORDZ256mi, 0 },
+ { X86::VPRORDZri, X86::VPRORDZmi, 0 },
+ { X86::VPRORQZ128ri, X86::VPRORQZ128mi, 0 },
+ { X86::VPRORQZ256ri, X86::VPRORQZ256mi, 0 },
+ { X86::VPRORQZri, X86::VPRORQZmi, 0 },
+ { X86::VPROTBri, X86::VPROTBmi, 0 },
+ { X86::VPROTBrr, X86::VPROTBmr, 0 },
+ { X86::VPROTDri, X86::VPROTDmi, 0 },
+ { X86::VPROTDrr, X86::VPROTDmr, 0 },
+ { X86::VPROTQri, X86::VPROTQmi, 0 },
+ { X86::VPROTQrr, X86::VPROTQmr, 0 },
+ { X86::VPROTWri, X86::VPROTWmi, 0 },
+ { X86::VPROTWrr, X86::VPROTWmr, 0 },
+ { X86::VPSHABrr, X86::VPSHABmr, 0 },
+ { X86::VPSHADrr, X86::VPSHADmr, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
+ { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
+ { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
+ { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
+ { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
+ { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
+ { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
+ { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+ { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
+ { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
+ { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
+ { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 },
+ { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
+ { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
+ { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
+ { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
+ { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
+ { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
+ { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
+ { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
+ { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
+ { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
+ { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
+ { X86::VPSRADZri, X86::VPSRADZmi, 0 },
+ { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
+ { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
+ { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
+ { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
+ { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
+ { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
+ { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
+ { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
+ { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 },
+ { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
+ { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
+ { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
+ { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
+ { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
+ { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
+ { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
+ { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
+ { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
+ { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
+ { X86::VPTESTrr, X86::VPTESTrm, 0 },
+ { X86::VRCP14PDZ128r, X86::VRCP14PDZ128m, 0 },
+ { X86::VRCP14PDZ256r, X86::VRCP14PDZ256m, 0 },
+ { X86::VRCP14PDZr, X86::VRCP14PDZm, 0 },
+ { X86::VRCP14PSZ128r, X86::VRCP14PSZ128m, 0 },
+ { X86::VRCP14PSZ256r, X86::VRCP14PSZ256m, 0 },
+ { X86::VRCP14PSZr, X86::VRCP14PSZm, 0 },
+ { X86::VRCP28PDZr, X86::VRCP28PDZm, 0 },
+ { X86::VRCP28PSZr, X86::VRCP28PSZm, 0 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
+ { X86::VRCPPSr, X86::VRCPPSm, 0 },
+ { X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0 },
+ { X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0 },
+ { X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0 },
+ { X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0 },
+ { X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0 },
+ { X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0 },
+ { X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0 },
+ { X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0 },
+ { X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0 },
+ { X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0 },
+ { X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0 },
+ { X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0 },
+ { X86::VROUNDPDYr, X86::VROUNDPDYm, 0 },
+ { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
+ { X86::VROUNDPSYr, X86::VROUNDPSYm, 0 },
+ { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
+ { X86::VRSQRT14PDZ128r, X86::VRSQRT14PDZ128m, 0 },
+ { X86::VRSQRT14PDZ256r, X86::VRSQRT14PDZ256m, 0 },
+ { X86::VRSQRT14PDZr, X86::VRSQRT14PDZm, 0 },
+ { X86::VRSQRT14PSZ128r, X86::VRSQRT14PSZ128m, 0 },
+ { X86::VRSQRT14PSZ256r, X86::VRSQRT14PSZ256m, 0 },
+ { X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0 },
+ { X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0 },
+ { X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
+ { X86::VSQRTPDZ128r, X86::VSQRTPDZ128m, 0 },
+ { X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0 },
+ { X86::VSQRTPDZr, X86::VSQRTPDZm, 0 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
+ { X86::VSQRTPSZ128r, X86::VSQRTPSZ128m, 0 },
+ { X86::VSQRTPSZ256r, X86::VSQRTPSZ256m, 0 },
+ { X86::VSQRTPSZr, X86::VSQRTPSZm, 0 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
+ { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
+ { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
+ { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
+ { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
+ { X86::VUCOMISDZrr, X86::VUCOMISDZrm, 0 },
+ { X86::VUCOMISDZrr_Int, X86::VUCOMISDZrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
+ { X86::VUCOMISDrr_Int, X86::VUCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISSZrr, X86::VUCOMISSZrm, 0 },
+ { X86::VUCOMISSZrr_Int, X86::VUCOMISSZrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+ { X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE },
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+ { X86::ADC16rr, X86::ADC16rm, 0 },
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADC8rr, X86::ADC8rm, 0 },
+ { X86::ADCX32rr, X86::ADCX32rm, 0 },
+ { X86::ADCX64rr, X86::ADCX64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
+ { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
+ { X86::ADOX32rr, X86::ADOX32rm, 0 },
+ { X86::ADOX64rr, X86::ADOX64rm, 0 },
+ { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
+ { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
+ { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
+ { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDN32rr, X86::ANDN32rm, 0 },
+ { X86::ANDN64rr, X86::ANDN64rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
+ { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
+ { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
+ { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
+ { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
+ { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSDrr_Int, X86::CMPSDrm_Int, TB_NO_REVERSE },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE },
+ { X86::CRC32r32r16, X86::CRC32r32m16, 0 },
+ { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
+ { X86::CRC32r32r8, X86::CRC32r32m8, 0 },
+ { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
+ { X86::CRC32r64r8, X86::CRC32r64m8, 0 },
+ { X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE },
+ { X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0 },
+ { X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0 },
+ { X86::CVTSI642SDrr_Int, X86::CVTSI642SDrm_Int, 0 },
+ { X86::CVTSI642SSrr_Int, X86::CVTSI642SSrm_Int, 0 },
+ { X86::CVTSS2SDrr_Int, X86::CVTSS2SDrm_Int, TB_NO_REVERSE },
+ { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
+ { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
+ { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
+ { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
+ { X86::GF2P8AFFINEINVQBrri, X86::GF2P8AFFINEINVQBrmi, TB_ALIGN_16 },
+ { X86::GF2P8AFFINEQBrri, X86::GF2P8AFFINEQBrmi, TB_ALIGN_16 },
+ { X86::GF2P8MULBrr, X86::GF2P8MULBrm, TB_ALIGN_16 },
+ { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
+ { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
+ { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
+ { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
+ { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
+ { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
+ { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
+ { X86::MINCSDrr, X86::MINCSDrm, 0 },
+ { X86::MINCSSrr, X86::MINCSSrm, 0 },
+ { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
+ { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
+ { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
+ { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
+ { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
+ { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
+ { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
+ { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
+ { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
+ { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
+ { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
+ { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
+ { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
+ { X86::MMX_PALIGNRrri, X86::MMX_PALIGNRrmi, 0 },
+ { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
+ { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
+ { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
+ { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
+ { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
+ { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
+ { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
+ { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
+ { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
+ { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
+ { X86::MMX_PHADDDrr, X86::MMX_PHADDDrm, 0 },
+ { X86::MMX_PHADDSWrr, X86::MMX_PHADDSWrm, 0 },
+ { X86::MMX_PHADDWrr, X86::MMX_PHADDWrm, 0 },
+ { X86::MMX_PHSUBDrr, X86::MMX_PHSUBDrm, 0 },
+ { X86::MMX_PHSUBSWrr, X86::MMX_PHSUBSWrm, 0 },
+ { X86::MMX_PHSUBWrr, X86::MMX_PHSUBWrm, 0 },
+ { X86::MMX_PINSRWrr, X86::MMX_PINSRWrm, TB_NO_REVERSE },
+ { X86::MMX_PMADDUBSWrr, X86::MMX_PMADDUBSWrm, 0 },
+ { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
+ { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
+ { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
+ { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
+ { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
+ { X86::MMX_PMULHRSWrr, X86::MMX_PMULHRSWrm, 0 },
+ { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
+ { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
+ { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
+ { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
+ { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
+ { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
+ { X86::MMX_PSHUFBrr, X86::MMX_PSHUFBrm, 0 },
+ { X86::MMX_PSIGNBrr, X86::MMX_PSIGNBrm, 0 },
+ { X86::MMX_PSIGNDrr, X86::MMX_PSIGNDrm, 0 },
+ { X86::MMX_PSIGNWrr, X86::MMX_PSIGNWrm, 0 },
+ { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
+ { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
+ { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
+ { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
+ { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
+ { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
+ { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
+ { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
+ { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
+ { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
+ { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
+ { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
+ { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
+ { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
+ { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
+ { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
+ { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
+ { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
+ { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
+ { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, TB_NO_REVERSE },
+ { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, TB_NO_REVERSE },
+ { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE },
+ { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
+ { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
+ { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
+ { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
+ { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
+ { X86::MULX32rr, X86::MULX32rm, 0 },
+ { X86::MULX64rr, X86::MULX64rm, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
+ { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
+ { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
+ { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
+ { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
+ { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
+ { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
+ { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
+ { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
+ { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
+ { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
+ { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
+ { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
+ { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
+ { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
+ { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
+ { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
+ { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
+ { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
+ { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
+ { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
+ { X86::PDEP32rr, X86::PDEP32rm, 0 },
+ { X86::PDEP64rr, X86::PDEP64rm, 0 },
+ { X86::PEXT32rr, X86::PEXT32rm, 0 },
+ { X86::PEXT64rr, X86::PEXT64rm, 0 },
+ { X86::PFACCrr, X86::PFACCrm, 0 },
+ { X86::PFADDrr, X86::PFADDrm, 0 },
+ { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
+ { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
+ { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
+ { X86::PFMAXrr, X86::PFMAXrm, 0 },
+ { X86::PFMINrr, X86::PFMINrm, 0 },
+ { X86::PFMULrr, X86::PFMULrm, 0 },
+ { X86::PFNACCrr, X86::PFNACCrm, 0 },
+ { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
+ { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
+ { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
+ { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
+ { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
+ { X86::PFSUBrr, X86::PFSUBrm, 0 },
+ { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
+ { X86::PHADDSWrr, X86::PHADDSWrm, TB_ALIGN_16 },
+ { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
+ { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
+ { X86::PHSUBSWrr, X86::PHSUBSWrm, TB_ALIGN_16 },
+ { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
+ { X86::PINSRBrr, X86::PINSRBrm, TB_NO_REVERSE },
+ { X86::PINSRDrr, X86::PINSRDrm, 0 },
+ { X86::PINSRQrr, X86::PINSRQrm, 0 },
+ { X86::PINSRWrr, X86::PINSRWrm, TB_NO_REVERSE },
+ { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
+ { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
+ { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
+ { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
+ { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
+ { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
+ { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
+ { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
+ { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
+ { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
+ { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
+ { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
+ { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
+ { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
+ { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
+ { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
+ { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
+ { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
+ { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
+ { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
+ { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
+ { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
+ { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
+ { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
+ { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
+ { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
+ { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::SBB16rr, X86::SBB16rm, 0 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SBB8rr, X86::SBB8rm, 0 },
+ { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
+ { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
+ { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
+ { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
+ { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
+ { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
+ { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
+ { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
+ // FIXME: TEST*rr -> swapped operand of TEST *mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
+ { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
+ { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
+ { X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
+ { X86::VADDPSrr, X86::VADDPSrm, 0 },
+ { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
+ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
+ { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
+ { X86::VAESDECLASTYrr, X86::VAESDECLASTYrm, 0 },
+ { X86::VAESDECLASTZ128rr, X86::VAESDECLASTZ128rm, 0 },
+ { X86::VAESDECLASTZ256rr, X86::VAESDECLASTZ256rm, 0 },
+ { X86::VAESDECLASTZrr, X86::VAESDECLASTZrm, 0 },
+ { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
+ { X86::VAESDECYrr, X86::VAESDECYrm, 0 },
+ { X86::VAESDECZ128rr, X86::VAESDECZ128rm, 0 },
+ { X86::VAESDECZ256rr, X86::VAESDECZ256rm, 0 },
+ { X86::VAESDECZrr, X86::VAESDECZrm, 0 },
+ { X86::VAESDECrr, X86::VAESDECrm, 0 },
+ { X86::VAESENCLASTYrr, X86::VAESENCLASTYrm, 0 },
+ { X86::VAESENCLASTZ128rr, X86::VAESENCLASTZ128rm, 0 },
+ { X86::VAESENCLASTZ256rr, X86::VAESENCLASTZ256rm, 0 },
+ { X86::VAESENCLASTZrr, X86::VAESENCLASTZrm, 0 },
+ { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
+ { X86::VAESENCYrr, X86::VAESENCYrm, 0 },
+ { X86::VAESENCZ128rr, X86::VAESENCZ128rm, 0 },
+ { X86::VAESENCZ256rr, X86::VAESENCZ256rm, 0 },
+ { X86::VAESENCZrr, X86::VAESENCZrm, 0 },
+ { X86::VAESENCrr, X86::VAESENCrm, 0 },
+ { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
+ { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
+ { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
+ { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
+ { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
+ { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
+ { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
+ { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
+ { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
+ { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
+ { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
+ { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
+ { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
+ { X86::VANDPDrr, X86::VANDPDrm, 0 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
+ { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
+ { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
+ { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
+ { X86::VANDPSrr, X86::VANDPSrm, 0 },
+ { X86::VBLENDMPDZ128rr, X86::VBLENDMPDZ128rm, 0 },
+ { X86::VBLENDMPDZ256rr, X86::VBLENDMPDZ256rm, 0 },
+ { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
+ { X86::VBLENDMPSZ128rr, X86::VBLENDMPSZ128rm, 0 },
+ { X86::VBLENDMPSZ256rr, X86::VBLENDMPSZ256rm, 0 },
+ { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
+ { X86::VBROADCASTF32X2Z256rkz, X86::VBROADCASTF32X2Z256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrkz, X86::VBROADCASTF32X2Zmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rkz, X86::VBROADCASTI32X2Z128mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rkz, X86::VBROADCASTI32X2Z256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrkz, X86::VBROADCASTI32X2Zmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
+ { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
+ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
+ { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VCMPSSrr_Int, X86::VCMPSSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmkz, 0 },
+ { X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmkz, 0 },
+ { X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0 },
+ { X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0 },
+ { X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0 },
+ { X86::VCVTPD2DQZ128rrkz, X86::VCVTPD2DQZ128rmkz, 0 },
+ { X86::VCVTPD2DQZ256rrkz, X86::VCVTPD2DQZ256rmkz, 0 },
+ { X86::VCVTPD2DQZrrkz, X86::VCVTPD2DQZrmkz, 0 },
+ { X86::VCVTPD2PSZ128rrkz, X86::VCVTPD2PSZ128rmkz, 0 },
+ { X86::VCVTPD2PSZ256rrkz, X86::VCVTPD2PSZ256rmkz, 0 },
+ { X86::VCVTPD2PSZrrkz, X86::VCVTPD2PSZrmkz, 0 },
+ { X86::VCVTPD2QQZ128rrkz, X86::VCVTPD2QQZ128rmkz, 0 },
+ { X86::VCVTPD2QQZ256rrkz, X86::VCVTPD2QQZ256rmkz, 0 },
+ { X86::VCVTPD2QQZrrkz, X86::VCVTPD2QQZrmkz, 0 },
+ { X86::VCVTPD2UDQZ128rrkz, X86::VCVTPD2UDQZ128rmkz, 0 },
+ { X86::VCVTPD2UDQZ256rrkz, X86::VCVTPD2UDQZ256rmkz, 0 },
+ { X86::VCVTPD2UDQZrrkz, X86::VCVTPD2UDQZrmkz, 0 },
+ { X86::VCVTPD2UQQZ128rrkz, X86::VCVTPD2UQQZ128rmkz, 0 },
+ { X86::VCVTPD2UQQZ256rrkz, X86::VCVTPD2UQQZ256rmkz, 0 },
+ { X86::VCVTPD2UQQZrrkz, X86::VCVTPD2UQQZrmkz, 0 },
+ { X86::VCVTPH2PSZ128rrkz, X86::VCVTPH2PSZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rrkz, X86::VCVTPH2PSZ256rmkz, 0 },
+ { X86::VCVTPH2PSZrrkz, X86::VCVTPH2PSZrmkz, 0 },
+ { X86::VCVTPS2DQZ128rrkz, X86::VCVTPS2DQZ128rmkz, 0 },
+ { X86::VCVTPS2DQZ256rrkz, X86::VCVTPS2DQZ256rmkz, 0 },
+ { X86::VCVTPS2DQZrrkz, X86::VCVTPS2DQZrmkz, 0 },
+ { X86::VCVTPS2PDZ128rrkz, X86::VCVTPS2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rrkz, X86::VCVTPS2PDZ256rmkz, 0 },
+ { X86::VCVTPS2PDZrrkz, X86::VCVTPS2PDZrmkz, 0 },
+ { X86::VCVTPS2QQZ128rrkz, X86::VCVTPS2QQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rrkz, X86::VCVTPS2QQZ256rmkz, 0 },
+ { X86::VCVTPS2QQZrrkz, X86::VCVTPS2QQZrmkz, 0 },
+ { X86::VCVTPS2UDQZ128rrkz, X86::VCVTPS2UDQZ128rmkz, 0 },
+ { X86::VCVTPS2UDQZ256rrkz, X86::VCVTPS2UDQZ256rmkz, 0 },
+ { X86::VCVTPS2UDQZrrkz, X86::VCVTPS2UDQZrmkz, 0 },
+ { X86::VCVTPS2UQQZ128rrkz, X86::VCVTPS2UQQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rrkz, X86::VCVTPS2UQQZ256rmkz, 0 },
+ { X86::VCVTPS2UQQZrrkz, X86::VCVTPS2UQQZrmkz, 0 },
+ { X86::VCVTQQ2PDZ128rrkz, X86::VCVTQQ2PDZ128rmkz, 0 },
+ { X86::VCVTQQ2PDZ256rrkz, X86::VCVTQQ2PDZ256rmkz, 0 },
+ { X86::VCVTQQ2PDZrrkz, X86::VCVTQQ2PDZrmkz, 0 },
+ { X86::VCVTQQ2PSZ128rrkz, X86::VCVTQQ2PSZ128rmkz, 0 },
+ { X86::VCVTQQ2PSZ256rrkz, X86::VCVTQQ2PSZ256rmkz, 0 },
+ { X86::VCVTQQ2PSZrrkz, X86::VCVTQQ2PSZrmkz, 0 },
+ { X86::VCVTSD2SSZrr, X86::VCVTSD2SSZrm, 0 },
+ { X86::VCVTSD2SSZrr_Int, X86::VCVTSD2SSZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
+ { X86::VCVTSD2SSrr_Int, X86::VCVTSD2SSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSI2SDZrr, X86::VCVTSI2SDZrm, 0 },
+ { X86::VCVTSI2SDZrr_Int, X86::VCVTSI2SDZrm_Int, 0 },
+ { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
+ { X86::VCVTSI2SDrr_Int, X86::VCVTSI2SDrm_Int, 0 },
+ { X86::VCVTSI2SSZrr, X86::VCVTSI2SSZrm, 0 },
+ { X86::VCVTSI2SSZrr_Int, X86::VCVTSI2SSZrm_Int, 0 },
+ { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
+ { X86::VCVTSI2SSrr_Int, X86::VCVTSI2SSrm_Int, 0 },
+ { X86::VCVTSI642SDZrr, X86::VCVTSI642SDZrm, 0 },
+ { X86::VCVTSI642SDZrr_Int, X86::VCVTSI642SDZrm_Int, 0 },
+ { X86::VCVTSI642SDrr, X86::VCVTSI642SDrm, 0 },
+ { X86::VCVTSI642SDrr_Int, X86::VCVTSI642SDrm_Int, 0 },
+ { X86::VCVTSI642SSZrr, X86::VCVTSI642SSZrm, 0 },
+ { X86::VCVTSI642SSZrr_Int, X86::VCVTSI642SSZrm_Int, 0 },
+ { X86::VCVTSI642SSrr, X86::VCVTSI642SSrm, 0 },
+ { X86::VCVTSI642SSrr_Int, X86::VCVTSI642SSrm_Int, 0 },
+ { X86::VCVTSS2SDZrr, X86::VCVTSS2SDZrm, 0 },
+ { X86::VCVTSS2SDZrr_Int, X86::VCVTSS2SDZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
+ { X86::VCVTSS2SDrr_Int, X86::VCVTSS2SDrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQZ128rrkz, X86::VCVTTPD2DQZ128rmkz, 0 },
+ { X86::VCVTTPD2DQZ256rrkz, X86::VCVTTPD2DQZ256rmkz, 0 },
+ { X86::VCVTTPD2DQZrrkz, X86::VCVTTPD2DQZrmkz, 0 },
+ { X86::VCVTTPD2QQZ128rrkz, X86::VCVTTPD2QQZ128rmkz, 0 },
+ { X86::VCVTTPD2QQZ256rrkz, X86::VCVTTPD2QQZ256rmkz, 0 },
+ { X86::VCVTTPD2QQZrrkz, X86::VCVTTPD2QQZrmkz, 0 },
+ { X86::VCVTTPD2UDQZ128rrkz, X86::VCVTTPD2UDQZ128rmkz, 0 },
+ { X86::VCVTTPD2UDQZ256rrkz, X86::VCVTTPD2UDQZ256rmkz, 0 },
+ { X86::VCVTTPD2UDQZrrkz, X86::VCVTTPD2UDQZrmkz, 0 },
+ { X86::VCVTTPD2UQQZ128rrkz, X86::VCVTTPD2UQQZ128rmkz, 0 },
+ { X86::VCVTTPD2UQQZ256rrkz, X86::VCVTTPD2UQQZ256rmkz, 0 },
+ { X86::VCVTTPD2UQQZrrkz, X86::VCVTTPD2UQQZrmkz, 0 },
+ { X86::VCVTTPS2DQZ128rrkz, X86::VCVTTPS2DQZ128rmkz, 0 },
+ { X86::VCVTTPS2DQZ256rrkz, X86::VCVTTPS2DQZ256rmkz, 0 },
+ { X86::VCVTTPS2DQZrrkz, X86::VCVTTPS2DQZrmkz, 0 },
+ { X86::VCVTTPS2QQZ128rrkz, X86::VCVTTPS2QQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rrkz, X86::VCVTTPS2QQZ256rmkz, 0 },
+ { X86::VCVTTPS2QQZrrkz, X86::VCVTTPS2QQZrmkz, 0 },
+ { X86::VCVTTPS2UDQZ128rrkz, X86::VCVTTPS2UDQZ128rmkz, 0 },
+ { X86::VCVTTPS2UDQZ256rrkz, X86::VCVTTPS2UDQZ256rmkz, 0 },
+ { X86::VCVTTPS2UDQZrrkz, X86::VCVTTPS2UDQZrmkz, 0 },
+ { X86::VCVTTPS2UQQZ128rrkz, X86::VCVTTPS2UQQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rrkz, X86::VCVTTPS2UQQZ256rmkz, 0 },
+ { X86::VCVTTPS2UQQZrrkz, X86::VCVTTPS2UQQZrmkz, 0 },
+ { X86::VCVTUDQ2PDZ128rrkz, X86::VCVTUDQ2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rrkz, X86::VCVTUDQ2PDZ256rmkz, 0 },
+ { X86::VCVTUDQ2PDZrrkz, X86::VCVTUDQ2PDZrmkz, 0 },
+ { X86::VCVTUDQ2PSZ128rrkz, X86::VCVTUDQ2PSZ128rmkz, 0 },
+ { X86::VCVTUDQ2PSZ256rrkz, X86::VCVTUDQ2PSZ256rmkz, 0 },
+ { X86::VCVTUDQ2PSZrrkz, X86::VCVTUDQ2PSZrmkz, 0 },
+ { X86::VCVTUQQ2PDZ128rrkz, X86::VCVTUQQ2PDZ128rmkz, 0 },
+ { X86::VCVTUQQ2PDZ256rrkz, X86::VCVTUQQ2PDZ256rmkz, 0 },
+ { X86::VCVTUQQ2PDZrrkz, X86::VCVTUQQ2PDZrmkz, 0 },
+ { X86::VCVTUQQ2PSZ128rrkz, X86::VCVTUQQ2PSZ128rmkz, 0 },
+ { X86::VCVTUQQ2PSZ256rrkz, X86::VCVTUQQ2PSZ256rmkz, 0 },
+ { X86::VCVTUQQ2PSZrrkz, X86::VCVTUQQ2PSZrmkz, 0 },
+ { X86::VCVTUSI2SDZrr, X86::VCVTUSI2SDZrm, 0 },
+ { X86::VCVTUSI2SDZrr_Int, X86::VCVTUSI2SDZrm_Int, 0 },
+ { X86::VCVTUSI2SSZrr, X86::VCVTUSI2SSZrm, 0 },
+ { X86::VCVTUSI2SSZrr_Int, X86::VCVTUSI2SSZrm_Int, 0 },
+ { X86::VCVTUSI642SDZrr, X86::VCVTUSI642SDZrm, 0 },
+ { X86::VCVTUSI642SDZrr_Int, X86::VCVTUSI642SDZrm_Int, 0 },
+ { X86::VCVTUSI642SSZrr, X86::VCVTUSI642SSZrm, 0 },
+ { X86::VCVTUSI642SSZrr_Int, X86::VCVTUSI642SSZrm_Int, 0 },
+ { X86::VDBPSADBWZ128rri, X86::VDBPSADBWZ128rmi, 0 },
+ { X86::VDBPSADBWZ256rri, X86::VDBPSADBWZ256rmi, 0 },
+ { X86::VDBPSADBWZrri, X86::VDBPSADBWZrmi, 0 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
+ { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
+ { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
+ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
+ { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
+ { X86::VDPPDrri, X86::VDPPDrmi, 0 },
+ { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
+ { X86::VDPPSrri, X86::VDPPSrmi, 0 },
+ { X86::VEXP2PDZrkz, X86::VEXP2PDZmkz, 0 },
+ { X86::VEXP2PSZrkz, X86::VEXP2PSZmkz, 0 },
+ { X86::VEXPANDPDZ128rrkz, X86::VEXPANDPDZ128rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rrkz, X86::VEXPANDPDZ256rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrrkz, X86::VEXPANDPDZrmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rrkz, X86::VEXPANDPSZ128rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rrkz, X86::VEXPANDPSZ256rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrrkz, X86::VEXPANDPSZrmkz, TB_NO_REVERSE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, 0 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, 0 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, 0 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, 0 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, 0 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, 0 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, 0 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, 0 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0 },
+ { X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0 },
+ { X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0 },
+ { X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmk, 0 },
+ { X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmk, 0 },
+ { X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmk, 0 },
+ { X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE },
+ { X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0 },
+ { X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0 },
+ { X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0 },
+ { X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mkz, 0 },
+ { X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mkz, 0 },
+ { X86::VGETEXPPSZrkz, X86::VGETEXPPSZmkz, 0 },
+ { X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE },
+ { X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE },
+ { X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0 },
+ { X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0 },
+ { X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0 },
+ { X86::VGETMANTPSZ128rrikz, X86::VGETMANTPSZ128rmikz, 0 },
+ { X86::VGETMANTPSZ256rrikz, X86::VGETMANTPSZ256rmikz, 0 },
+ { X86::VGETMANTPSZrrikz, X86::VGETMANTPSZrmikz, 0 },
+ { X86::VGETMANTSDZrri, X86::VGETMANTSDZrmi, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrri, X86::VGETMANTSSZrmi, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBYrri, X86::VGF2P8AFFINEINVQBYrmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZ128rri, X86::VGF2P8AFFINEINVQBZ128rmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rri, X86::VGF2P8AFFINEINVQBZ256rmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZrri, X86::VGF2P8AFFINEINVQBZrmi, 0 },
+ { X86::VGF2P8AFFINEINVQBrri, X86::VGF2P8AFFINEINVQBrmi, 0 },
+ { X86::VGF2P8AFFINEQBYrri, X86::VGF2P8AFFINEQBYrmi, 0 },
+ { X86::VGF2P8AFFINEQBZ128rri, X86::VGF2P8AFFINEQBZ128rmi, 0 },
+ { X86::VGF2P8AFFINEQBZ256rri, X86::VGF2P8AFFINEQBZ256rmi, 0 },
+ { X86::VGF2P8AFFINEQBZrri, X86::VGF2P8AFFINEQBZrmi, 0 },
+ { X86::VGF2P8AFFINEQBrri, X86::VGF2P8AFFINEQBrmi, 0 },
+ { X86::VGF2P8MULBYrr, X86::VGF2P8MULBYrm, 0 },
+ { X86::VGF2P8MULBZ128rr, X86::VGF2P8MULBZ128rm, 0 },
+ { X86::VGF2P8MULBZ256rr, X86::VGF2P8MULBZ256rm, 0 },
+ { X86::VGF2P8MULBZrr, X86::VGF2P8MULBZrm, 0 },
+ { X86::VGF2P8MULBrr, X86::VGF2P8MULBrm, 0 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VINSERTF32x4Z256rr, X86::VINSERTF32x4Z256rm, 0 },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
+ { X86::VINSERTF64x2Z256rr, X86::VINSERTF64x2Z256rm, 0 },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
+ { X86::VINSERTI32x4Z256rr, X86::VINSERTI32x4Z256rm, 0 },
+ { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
+ { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
+ { X86::VINSERTI64x2Z256rr, X86::VINSERTI64x2Z256rm, 0 },
+ { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
+ { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
+ { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
+ { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
+ { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
+ { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
+ { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
+ { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
+ { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
+ { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
+ { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
+ { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
+ { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
+ { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
+ { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
+ { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
+ { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
+ { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
+ { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
+ { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
+ { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
+ { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
+ { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINPSrr, X86::VMINPSrm, 0 },
+ { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
+ { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
+ { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
+ { X86::VMOVAPDZ128rrkz, X86::VMOVAPDZ128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rrkz, X86::VMOVAPDZ256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrrkz, X86::VMOVAPDZrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVAPSZ128rrkz, X86::VMOVAPSZ128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rrkz, X86::VMOVAPSZ256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrrkz, X86::VMOVAPSZrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDDUPZ128rrkz, X86::VMOVDDUPZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rrkz, X86::VMOVDDUPZ256rmkz, 0 },
+ { X86::VMOVDDUPZrrkz, X86::VMOVDDUPZrmkz, 0 },
+ { X86::VMOVDQA32Z128rrkz, X86::VMOVDQA32Z128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rrkz, X86::VMOVDQA32Z256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrrkz, X86::VMOVDQA32Zrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rrkz, X86::VMOVDQA64Z128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rrkz, X86::VMOVDQA64Z256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrrkz, X86::VMOVDQA64Zrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQU16Z128rrkz, X86::VMOVDQU16Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU16Z256rrkz, X86::VMOVDQU16Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU16Zrrkz, X86::VMOVDQU16Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z128rrkz, X86::VMOVDQU32Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z256rrkz, X86::VMOVDQU32Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Zrrkz, X86::VMOVDQU32Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z128rrkz, X86::VMOVDQU64Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z256rrkz, X86::VMOVDQU64Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Zrrkz, X86::VMOVDQU64Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z128rrkz, X86::VMOVDQU8Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z256rrkz, X86::VMOVDQU8Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Zrrkz, X86::VMOVDQU8Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
+ { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
+ { X86::VMOVSHDUPZ128rrkz, X86::VMOVSHDUPZ128rmkz, 0 },
+ { X86::VMOVSHDUPZ256rrkz, X86::VMOVSHDUPZ256rmkz, 0 },
+ { X86::VMOVSHDUPZrrkz, X86::VMOVSHDUPZrmkz, 0 },
+ { X86::VMOVSLDUPZ128rrkz, X86::VMOVSLDUPZ128rmkz, 0 },
+ { X86::VMOVSLDUPZ256rrkz, X86::VMOVSLDUPZ256rmkz, 0 },
+ { X86::VMOVSLDUPZrrkz, X86::VMOVSLDUPZrmkz, 0 },
+ { X86::VMOVUPDZ128rrkz, X86::VMOVUPDZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPDZ256rrkz, X86::VMOVUPDZ256rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPDZrrkz, X86::VMOVUPDZrmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZ128rrkz, X86::VMOVUPSZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZ256rrkz, X86::VMOVUPSZ256rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZrrkz, X86::VMOVUPSZrmkz, TB_NO_REVERSE },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
+ { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULPSrr, X86::VMULPSrm, 0 },
+ { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
+ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
+ { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
+ { X86::VORPDYrr, X86::VORPDYrm, 0 },
+ { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
+ { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
+ { X86::VORPDZrr, X86::VORPDZrm, 0 },
+ { X86::VORPDrr, X86::VORPDrm, 0 },
+ { X86::VORPSYrr, X86::VORPSYrm, 0 },
+ { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
+ { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
+ { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
+ { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
+ { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
+ { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
+ { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
+ { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
+ { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
+ { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
+ { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
+ { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
+ { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
+ { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
+ { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
+ { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
+ { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
+ { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
+ { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
+ { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
+ { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
+ { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
+ { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
+ { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
+ { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
+ { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
+ { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
+ { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
+ { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
+ { X86::VPADDBrr, X86::VPADDBrm, 0 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
+ { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
+ { X86::VPADDDrr, X86::VPADDDrm, 0 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
+ { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
+ { X86::VPADDQrr, X86::VPADDQrm, 0 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
+ { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
+ { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
+ { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
+ { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
+ { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
+ { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
+ { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
+ { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
+ { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
+ { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
+ { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
+ { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
+ { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
+ { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
+ { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
+ { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
+ { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
+ { X86::VPADDWrr, X86::VPADDWrm, 0 },
+ { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
+ { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
+ { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
+ { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
+ { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
+ { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
+ { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
+ { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
+ { X86::VPANDNrr, X86::VPANDNrm, 0 },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPANDYrr, X86::VPANDYrm, 0 },
+ { X86::VPANDrr, X86::VPANDrm, 0 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
+ { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
+ { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
+ { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
+ { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
+ { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
+ { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
+ { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
+ { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
+ { X86::VPBLENDMBZ128rr, X86::VPBLENDMBZ128rm, 0 },
+ { X86::VPBLENDMBZ256rr, X86::VPBLENDMBZ256rm, 0 },
+ { X86::VPBLENDMBZrr, X86::VPBLENDMBZrm, 0 },
+ { X86::VPBLENDMDZ128rr, X86::VPBLENDMDZ128rm, 0 },
+ { X86::VPBLENDMDZ256rr, X86::VPBLENDMDZ256rm, 0 },
+ { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
+ { X86::VPBLENDMQZ128rr, X86::VPBLENDMQZ128rm, 0 },
+ { X86::VPBLENDMQZ256rr, X86::VPBLENDMQZ256rm, 0 },
+ { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
+ { X86::VPBLENDMWZ128rr, X86::VPBLENDMWZ128rm, 0 },
+ { X86::VPBLENDMWZ256rr, X86::VPBLENDMWZ256rm, 0 },
+ { X86::VPBLENDMWZrr, X86::VPBLENDMWZrm, 0 },
+ { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
+ { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPBROADCASTBZ128rkz, X86::VPBROADCASTBZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rkz, X86::VPBROADCASTBZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrkz, X86::VPBROADCASTBZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rkz, X86::VPBROADCASTDZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rkz, X86::VPBROADCASTDZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrkz, X86::VPBROADCASTDZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rkz, X86::VPBROADCASTQZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rkz, X86::VPBROADCASTQZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrkz, X86::VPBROADCASTQZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rkz, X86::VPBROADCASTWZ128mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rkz, X86::VPBROADCASTWZ256mkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrkz, X86::VPBROADCASTWZmkz, TB_NO_REVERSE },
+ { X86::VPCLMULQDQYrr, X86::VPCLMULQDQYrm, 0 },
+ { X86::VPCLMULQDQZ128rr, X86::VPCLMULQDQZ128rm, 0 },
+ { X86::VPCLMULQDQZ256rr, X86::VPCLMULQDQZ256rm, 0 },
+ { X86::VPCLMULQDQZrr, X86::VPCLMULQDQZrm, 0 },
+ { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
+ { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
+ { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
+ { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
+ { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
+ { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
+ { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
+ { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
+ { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
+ { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
+ { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
+ { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
+ { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
+ { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
+ { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
+ { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
+ { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
+ { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
+ { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
+ { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
+ { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
+ { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
+ { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
+ { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
+ { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
+ { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
+ { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
+ { X86::VPCOMBri, X86::VPCOMBmi, 0 },
+ { X86::VPCOMDri, X86::VPCOMDmi, 0 },
+ { X86::VPCOMQri, X86::VPCOMQmi, 0 },
+ { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
+ { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
+ { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
+ { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
+ { X86::VPCOMWri, X86::VPCOMWmi, 0 },
+ { X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmkz, 0 },
+ { X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmkz, 0 },
+ { X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmkz, 0 },
+ { X86::VPCONFLICTQZ128rrkz, X86::VPCONFLICTQZ128rmkz, 0 },
+ { X86::VPCONFLICTQZ256rrkz, X86::VPCONFLICTQZ256rmkz, 0 },
+ { X86::VPCONFLICTQZrrkz, X86::VPCONFLICTQZrmkz, 0 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
+ { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
+ { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
+ { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
+ { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
+ { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
+ { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
+ { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
+ { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
+ { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
+ { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
+ { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
+ { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
+ { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
+ { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
+ { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
+ { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
+ { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
+ { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
+ { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
+ { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
+ { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
+ { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
+ { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPEXPANDBZ128rrkz, X86::VPEXPANDBZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rrkz, X86::VPEXPANDBZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrrkz, X86::VPEXPANDBZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rrkz, X86::VPEXPANDDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rrkz, X86::VPEXPANDDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrrkz, X86::VPEXPANDDZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rrkz, X86::VPEXPANDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rrkz, X86::VPEXPANDQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrrkz, X86::VPEXPANDQZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rrkz, X86::VPEXPANDWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rrkz, X86::VPEXPANDWZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrrkz, X86::VPEXPANDWZrmkz, TB_NO_REVERSE },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
+ { X86::VPHADDSWYrr, X86::VPHADDSWYrm, 0 },
+ { X86::VPHADDSWrr, X86::VPHADDSWrm, 0 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
+ { X86::VPHSUBSWYrr, X86::VPHSUBSWYrm, 0 },
+ { X86::VPHSUBSWrr, X86::VPHSUBSWrm, 0 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
+ { X86::VPINSRBZrr, X86::VPINSRBZrm, TB_NO_REVERSE },
+ { X86::VPINSRBrr, X86::VPINSRBrm, TB_NO_REVERSE },
+ { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
+ { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
+ { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
+ { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
+ { X86::VPINSRWZrr, X86::VPINSRWZrm, TB_NO_REVERSE },
+ { X86::VPINSRWrr, X86::VPINSRWrm, TB_NO_REVERSE },
+ { X86::VPLZCNTDZ128rrkz, X86::VPLZCNTDZ128rmkz, 0 },
+ { X86::VPLZCNTDZ256rrkz, X86::VPLZCNTDZ256rmkz, 0 },
+ { X86::VPLZCNTDZrrkz, X86::VPLZCNTDZrmkz, 0 },
+ { X86::VPLZCNTQZ128rrkz, X86::VPLZCNTQZ128rmkz, 0 },
+ { X86::VPLZCNTQZ256rrkz, X86::VPLZCNTQZ256rmkz, 0 },
+ { X86::VPLZCNTQZrrkz, X86::VPLZCNTQZrmkz, 0 },
+ { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
+ { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
+ { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
+ { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
+ { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
+ { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
+ { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
+ { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
+ { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
+ { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
+ { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
+ { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
+ { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
+ { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
+ { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
+ { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
+ { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
+ { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
+ { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
+ { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
+ { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
+ { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
+ { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
+ { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
+ { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
+ { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
+ { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
+ { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
+ { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
+ { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
+ { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
+ { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
+ { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
+ { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
+ { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
+ { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
+ { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
+ { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
+ { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
+ { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
+ { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
+ { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
+ { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
+ { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
+ { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
+ { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
+ { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
+ { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
+ { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
+ { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
+ { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
+ { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
+ { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
+ { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
+ { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
+ { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
+ { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
+ { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
+ { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
+ { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
+ { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
+ { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
+ { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
+ { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
+ { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
+ { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
+ { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
+ { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
+ { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
+ { X86::VPMULHRSWZ128rr, X86::VPMULHRSWZ128rm, 0 },
+ { X86::VPMULHRSWZ256rr, X86::VPMULHRSWZ256rm, 0 },
+ { X86::VPMULHRSWZrr, X86::VPMULHRSWZrm, 0 },
+ { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
+ { X86::VPMULHUWZ128rr, X86::VPMULHUWZ128rm, 0 },
+ { X86::VPMULHUWZ256rr, X86::VPMULHUWZ256rm, 0 },
+ { X86::VPMULHUWZrr, X86::VPMULHUWZrm, 0 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
+ { X86::VPMULHWZ128rr, X86::VPMULHWZ128rm, 0 },
+ { X86::VPMULHWZ256rr, X86::VPMULHWZ256rm, 0 },
+ { X86::VPMULHWZrr, X86::VPMULHWZrm, 0 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
+ { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
+ { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
+ { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
+ { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
+ { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
+ { X86::VPMULTISHIFTQBZ128rr, X86::VPMULTISHIFTQBZ128rm, 0 },
+ { X86::VPMULTISHIFTQBZ256rr, X86::VPMULTISHIFTQBZ256rm, 0 },
+ { X86::VPMULTISHIFTQBZrr, X86::VPMULTISHIFTQBZrm, 0 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
+ { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
+ { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
+ { X86::VPOPCNTBZ128rrkz, X86::VPOPCNTBZ128rmkz, 0 },
+ { X86::VPOPCNTBZ256rrkz, X86::VPOPCNTBZ256rmkz, 0 },
+ { X86::VPOPCNTBZrrkz, X86::VPOPCNTBZrmkz, 0 },
+ { X86::VPOPCNTDZ128rrkz, X86::VPOPCNTDZ128rmkz, 0 },
+ { X86::VPOPCNTDZ256rrkz, X86::VPOPCNTDZ256rmkz, 0 },
+ { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 },
+ { X86::VPOPCNTQZ128rrkz, X86::VPOPCNTQZ128rmkz, 0 },
+ { X86::VPOPCNTQZ256rrkz, X86::VPOPCNTQZ256rmkz, 0 },
+ { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 },
+ { X86::VPOPCNTWZ128rrkz, X86::VPOPCNTWZ128rmkz, 0 },
+ { X86::VPOPCNTWZ256rrkz, X86::VPOPCNTWZ256rmkz, 0 },
+ { X86::VPOPCNTWZrrkz, X86::VPOPCNTWZrmkz, 0 },
+ { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
+ { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
+ { X86::VPORDZrr, X86::VPORDZrm, 0 },
+ { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
+ { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPORYrr, X86::VPORYrm, 0 },
+ { X86::VPORrr, X86::VPORrm, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
+ { X86::VPROLDZ128rikz, X86::VPROLDZ128mikz, 0 },
+ { X86::VPROLDZ256rikz, X86::VPROLDZ256mikz, 0 },
+ { X86::VPROLDZrikz, X86::VPROLDZmikz, 0 },
+ { X86::VPROLQZ128rikz, X86::VPROLQZ128mikz, 0 },
+ { X86::VPROLQZ256rikz, X86::VPROLQZ256mikz, 0 },
+ { X86::VPROLQZrikz, X86::VPROLQZmikz, 0 },
+ { X86::VPROLVDZ128rr, X86::VPROLVDZ128rm, 0 },
+ { X86::VPROLVDZ256rr, X86::VPROLVDZ256rm, 0 },
+ { X86::VPROLVDZrr, X86::VPROLVDZrm, 0 },
+ { X86::VPROLVQZ128rr, X86::VPROLVQZ128rm, 0 },
+ { X86::VPROLVQZ256rr, X86::VPROLVQZ256rm, 0 },
+ { X86::VPROLVQZrr, X86::VPROLVQZrm, 0 },
+ { X86::VPRORDZ128rikz, X86::VPRORDZ128mikz, 0 },
+ { X86::VPRORDZ256rikz, X86::VPRORDZ256mikz, 0 },
+ { X86::VPRORDZrikz, X86::VPRORDZmikz, 0 },
+ { X86::VPRORQZ128rikz, X86::VPRORQZ128mikz, 0 },
+ { X86::VPRORQZ256rikz, X86::VPRORQZ256mikz, 0 },
+ { X86::VPRORQZrikz, X86::VPRORQZmikz, 0 },
+ { X86::VPRORVDZ128rr, X86::VPRORVDZ128rm, 0 },
+ { X86::VPRORVDZ256rr, X86::VPRORVDZ256rm, 0 },
+ { X86::VPRORVDZrr, X86::VPRORVDZrm, 0 },
+ { X86::VPRORVQZ128rr, X86::VPRORVQZ128rm, 0 },
+ { X86::VPRORVQZ256rr, X86::VPRORVQZ256rm, 0 },
+ { X86::VPRORVQZrr, X86::VPRORVQZrm, 0 },
+ { X86::VPROTBrr, X86::VPROTBrm, 0 },
+ { X86::VPROTDrr, X86::VPROTDrm, 0 },
+ { X86::VPROTQrr, X86::VPROTQrm, 0 },
+ { X86::VPROTWrr, X86::VPROTWrm, 0 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
+ { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
+ { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
+ { X86::VPSADBWZrr, X86::VPSADBWZrm, 0 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
+ { X86::VPSHABrr, X86::VPSHABrm, 0 },
+ { X86::VPSHADrr, X86::VPSHADrm, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
+ { X86::VPSHLDDZ128rri, X86::VPSHLDDZ128rmi, 0 },
+ { X86::VPSHLDDZ256rri, X86::VPSHLDDZ256rmi, 0 },
+ { X86::VPSHLDDZrri, X86::VPSHLDDZrmi, 0 },
+ { X86::VPSHLDQZ128rri, X86::VPSHLDQZ128rmi, 0 },
+ { X86::VPSHLDQZ256rri, X86::VPSHLDQZ256rmi, 0 },
+ { X86::VPSHLDQZrri, X86::VPSHLDQZrmi, 0 },
+ { X86::VPSHLDWZ128rri, X86::VPSHLDWZ128rmi, 0 },
+ { X86::VPSHLDWZ256rri, X86::VPSHLDWZ256rmi, 0 },
+ { X86::VPSHLDWZrri, X86::VPSHLDWZrmi, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
+ { X86::VPSHRDDZ128rri, X86::VPSHRDDZ128rmi, 0 },
+ { X86::VPSHRDDZ256rri, X86::VPSHRDDZ256rmi, 0 },
+ { X86::VPSHRDDZrri, X86::VPSHRDDZrmi, 0 },
+ { X86::VPSHRDQZ128rri, X86::VPSHRDQZ128rmi, 0 },
+ { X86::VPSHRDQZ256rri, X86::VPSHRDQZ256rmi, 0 },
+ { X86::VPSHRDQZrri, X86::VPSHRDQZrmi, 0 },
+ { X86::VPSHRDWZ128rri, X86::VPSHRDWZ128rmi, 0 },
+ { X86::VPSHRDWZ256rri, X86::VPSHRDWZ256rmi, 0 },
+ { X86::VPSHRDWZrri, X86::VPSHRDWZrmi, 0 },
+ { X86::VPSHUFBITQMBZ128rr, X86::VPSHUFBITQMBZ128rm, 0 },
+ { X86::VPSHUFBITQMBZ256rr, X86::VPSHUFBITQMBZ256rm, 0 },
+ { X86::VPSHUFBITQMBZrr, X86::VPSHUFBITQMBZrm, 0 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
+ { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
+ { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
+ { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
+ { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
+ { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
+ { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
+ { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
+ { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
+ { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
+ { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+ { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+ { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
+ { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
+ { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
+ { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
+ { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
+ { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
+ { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
+ { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
+ { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
+ { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
+ { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
+ { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
+ { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
+ { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
+ { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
+ { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
+ { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
+ { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
+ { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
+ { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
+ { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
+ { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
+ { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
+ { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
+ { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
+ { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
+ { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
+ { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
+ { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
+ { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
+ { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
+ { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
+ { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
+ { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
+ { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
+ { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
+ { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
+ { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
+ { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
+ { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
+ { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
+ { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
+ { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
+ { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
+ { X86::VPSRADrr, X86::VPSRADrm, 0 },
+ { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
+ { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
+ { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
+ { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
+ { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
+ { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
+ { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
+ { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
+ { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
+ { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
+ { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
+ { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
+ { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
+ { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
+ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
+ { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
+ { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
+ { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
+ { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
+ { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
+ { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
+ { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
+ { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
+ { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
+ { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
+ { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
+ { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
+ { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
+ { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
+ { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
+ { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
+ { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
+ { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
+ { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
+ { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
+ { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
+ { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
+ { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
+ { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
+ { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
+ { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
+ { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
+ { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
+ { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
+ { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
+ { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
+ { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
+ { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
+ { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
+ { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
+ { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
+ { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
+ { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
+ { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
+ { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
+ { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
+ { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
+ { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
+ { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
+ { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
+ { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
+ { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
+ { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
+ { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
+ { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
+ { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
+ { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
+ { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
+ { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
+ { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
+ { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
+ { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
+ { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
+ { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
+ { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
+ { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
+ { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
+ { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
+ { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
+ { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
+ { X86::VPTESTMBZ128rr, X86::VPTESTMBZ128rm, 0 },
+ { X86::VPTESTMBZ256rr, X86::VPTESTMBZ256rm, 0 },
+ { X86::VPTESTMBZrr, X86::VPTESTMBZrm, 0 },
+ { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rm, 0 },
+ { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rm, 0 },
+ { X86::VPTESTMDZrr, X86::VPTESTMDZrm, 0 },
+ { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rm, 0 },
+ { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rm, 0 },
+ { X86::VPTESTMQZrr, X86::VPTESTMQZrm, 0 },
+ { X86::VPTESTMWZ128rr, X86::VPTESTMWZ128rm, 0 },
+ { X86::VPTESTMWZ256rr, X86::VPTESTMWZ256rm, 0 },
+ { X86::VPTESTMWZrr, X86::VPTESTMWZrm, 0 },
+ { X86::VPTESTNMBZ128rr, X86::VPTESTNMBZ128rm, 0 },
+ { X86::VPTESTNMBZ256rr, X86::VPTESTNMBZ256rm, 0 },
+ { X86::VPTESTNMBZrr, X86::VPTESTNMBZrm, 0 },
+ { X86::VPTESTNMDZ128rr, X86::VPTESTNMDZ128rm, 0 },
+ { X86::VPTESTNMDZ256rr, X86::VPTESTNMDZ256rm, 0 },
+ { X86::VPTESTNMDZrr, X86::VPTESTNMDZrm, 0 },
+ { X86::VPTESTNMQZ128rr, X86::VPTESTNMQZ128rm, 0 },
+ { X86::VPTESTNMQZ256rr, X86::VPTESTNMQZ256rm, 0 },
+ { X86::VPTESTNMQZrr, X86::VPTESTNMQZrm, 0 },
+ { X86::VPTESTNMWZ128rr, X86::VPTESTNMWZ128rm, 0 },
+ { X86::VPTESTNMWZ256rr, X86::VPTESTNMWZ256rm, 0 },
+ { X86::VPTESTNMWZrr, X86::VPTESTNMWZrm, 0 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
+ { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
+ { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
+ { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
+ { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
+ { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
+ { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
+ { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
+ { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
+ { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
+ { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
+ { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
+ { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
+ { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
+ { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
+ { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
+ { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
+ { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
+ { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
+ { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
+ { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
+ { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
+ { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
+ { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
+ { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
+ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
+ { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
+ { X86::VPXORYrr, X86::VPXORYrm, 0 },
+ { X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VRANGEPDZ128rri, X86::VRANGEPDZ128rmi, 0 },
+ { X86::VRANGEPDZ256rri, X86::VRANGEPDZ256rmi, 0 },
+ { X86::VRANGEPDZrri, X86::VRANGEPDZrmi, 0 },
+ { X86::VRANGEPSZ128rri, X86::VRANGEPSZ128rmi, 0 },
+ { X86::VRANGEPSZ256rri, X86::VRANGEPSZ256rmi, 0 },
+ { X86::VRANGEPSZrri, X86::VRANGEPSZrmi, 0 },
+ { X86::VRANGESDZrri, X86::VRANGESDZrmi, TB_NO_REVERSE },
+ { X86::VRANGESSZrri, X86::VRANGESSZrmi, TB_NO_REVERSE },
+ { X86::VRCP14PDZ128rkz, X86::VRCP14PDZ128mkz, 0 },
+ { X86::VRCP14PDZ256rkz, X86::VRCP14PDZ256mkz, 0 },
+ { X86::VRCP14PDZrkz, X86::VRCP14PDZmkz, 0 },
+ { X86::VRCP14PSZ128rkz, X86::VRCP14PSZ128mkz, 0 },
+ { X86::VRCP14PSZ256rkz, X86::VRCP14PSZ256mkz, 0 },
+ { X86::VRCP14PSZrkz, X86::VRCP14PSZmkz, 0 },
+ { X86::VRCP14SDZrr, X86::VRCP14SDZrm, TB_NO_REVERSE },
+ { X86::VRCP14SSZrr, X86::VRCP14SSZrm, TB_NO_REVERSE },
+ { X86::VRCP28PDZrkz, X86::VRCP28PDZmkz, 0 },
+ { X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0 },
+ { X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE },
+ { X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
+ { X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0 },
+ { X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0 },
+ { X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0 },
+ { X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmikz, 0 },
+ { X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmikz, 0 },
+ { X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmikz, 0 },
+ { X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE },
+ { X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE },
+ { X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0 },
+ { X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0 },
+ { X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0 },
+ { X86::VRNDSCALEPSZ128rrikz, X86::VRNDSCALEPSZ128rmikz, 0 },
+ { X86::VRNDSCALEPSZ256rrikz, X86::VRNDSCALEPSZ256rmikz, 0 },
+ { X86::VRNDSCALEPSZrrikz, X86::VRNDSCALEPSZrmikz, 0 },
+ { X86::VRNDSCALESDZr, X86::VRNDSCALESDZm, 0 },
+ { X86::VRNDSCALESDZr_Int, X86::VRNDSCALESDZm_Int, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0 },
+ { X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
+ { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::VRSQRT14PDZ128rkz, X86::VRSQRT14PDZ128mkz, 0 },
+ { X86::VRSQRT14PDZ256rkz, X86::VRSQRT14PDZ256mkz, 0 },
+ { X86::VRSQRT14PDZrkz, X86::VRSQRT14PDZmkz, 0 },
+ { X86::VRSQRT14PSZ128rkz, X86::VRSQRT14PSZ128mkz, 0 },
+ { X86::VRSQRT14PSZ256rkz, X86::VRSQRT14PSZ256mkz, 0 },
+ { X86::VRSQRT14PSZrkz, X86::VRSQRT14PSZmkz, 0 },
+ { X86::VRSQRT14SDZrr, X86::VRSQRT14SDZrm, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrr, X86::VRSQRT14SSZrm, TB_NO_REVERSE },
+ { X86::VRSQRT28PDZrkz, X86::VRSQRT28PDZmkz, 0 },
+ { X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0 },
+ { X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0 },
+ { X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0 },
+ { X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0 },
+ { X86::VSCALEFPSZ128rr, X86::VSCALEFPSZ128rm, 0 },
+ { X86::VSCALEFPSZ256rr, X86::VSCALEFPSZ256rm, 0 },
+ { X86::VSCALEFPSZrr, X86::VSCALEFPSZrm, 0 },
+ { X86::VSCALEFSDZrr, X86::VSCALEFSDZrm, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrr, X86::VSCALEFSSZrm, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rri, X86::VSHUFF32X4Z256rmi, 0 },
+ { X86::VSHUFF32X4Zrri, X86::VSHUFF32X4Zrmi, 0 },
+ { X86::VSHUFF64X2Z256rri, X86::VSHUFF64X2Z256rmi, 0 },
+ { X86::VSHUFF64X2Zrri, X86::VSHUFF64X2Zrmi, 0 },
+ { X86::VSHUFI32X4Z256rri, X86::VSHUFI32X4Z256rmi, 0 },
+ { X86::VSHUFI32X4Zrri, X86::VSHUFI32X4Zrmi, 0 },
+ { X86::VSHUFI64X2Z256rri, X86::VSHUFI64X2Z256rmi, 0 },
+ { X86::VSHUFI64X2Zrri, X86::VSHUFI64X2Zrmi, 0 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
+ { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
+ { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
+ { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
+ { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
+ { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
+ { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0 },
+ { X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0 },
+ { X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0 },
+ { X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mkz, 0 },
+ { X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mkz, 0 },
+ { X86::VSQRTPSZrkz, X86::VSQRTPSZmkz, 0 },
+ { X86::VSQRTSDZr, X86::VSQRTSDZm, 0 },
+ { X86::VSQRTSDZr_Int, X86::VSQRTSDZm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSZr, X86::VSQRTSSZm, 0 },
+ { X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
+ { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
+ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
+ { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
+ { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
+ { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
+ { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
+ { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
+ { X86::VXORPDrr, X86::VXORPDrm, 0 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
+ { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
+ { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
+ { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
+ { X86::VXORPSrr, X86::VXORPSrm, 0 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
+ { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
+};
+
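The entries above pair a register-form opcode with its memory-form equivalent plus folding flags (e.g. TB_NO_REVERSE to forbid unfolding, TB_ALIGN_16/32/64 for alignment requirements). A minimal, purely illustrative sketch of how such a sorted table could be consulted is below; the struct, flag values, and opcodes are hypothetical stand-ins and not the actual LLVM sources, which only assume that lookups binary-search a table kept sorted by the register-form opcode.

    // Illustrative sketch only: lookup in a fold table sorted by RegOp.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    namespace sketch {

    // Hypothetical flag values for the example; real values live in LLVM.
    enum : uint16_t { TB_NONE = 0, TB_NO_REVERSE = 1 << 0, TB_ALIGN_16 = 1 << 1 };

    struct FoldEntry {
      unsigned RegOp;  // register-register form
      unsigned MemOp;  // register-memory (folded load) form
      uint16_t Flags;  // folding constraints
    };

    // Must stay sorted by RegOp for the binary search below.
    static const FoldEntry Table[] = {
        {100, 200, TB_NONE},
        {101, 201, TB_ALIGN_16},
        {105, 205, TB_NO_REVERSE},
    };

    // Returns the matching entry or nullptr.
    const FoldEntry *lookup(unsigned RegOp) {
      auto It = std::lower_bound(std::begin(Table), std::end(Table), RegOp,
                                 [](const FoldEntry &E, unsigned Op) {
                                   return E.RegOp < Op;
                                 });
      if (It != std::end(Table) && It->RegOp == RegOp)
        return It;
      return nullptr;
    }

    } // namespace sketch

    int main() {
      if (const sketch::FoldEntry *E = sketch::lookup(101))
        std::printf("fold %u -> %u (flags %u)\n", E->RegOp, E->MemOp, E->Flags);
    }

MemoryFoldTable3 that follows covers three-operand forms (FMA, masked AVX-512 variants, and similar), where folding replaces the last register operand with a memory operand.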
+static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
+ { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
+ { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
+ { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
+ { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
+ { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
+ { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
+ { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
+ { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
+ { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
+ { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
+ { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
+ { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
+ { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
+ { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
+ { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
+ { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
+ { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
+ { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
+ { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
+ { X86::VBLENDMPDZ128rrk, X86::VBLENDMPDZ128rmk, 0 },
+ { X86::VBLENDMPDZ256rrk, X86::VBLENDMPDZ256rmk, 0 },
+ { X86::VBLENDMPDZrrk, X86::VBLENDMPDZrmk, 0 },
+ { X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0 },
+ { X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0 },
+ { X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0 },
+ { X86::VBROADCASTF32X2Z256rk, X86::VBROADCASTF32X2Z256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrk, X86::VBROADCASTF32X2Zmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rk, X86::VBROADCASTI32X2Z128mk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rk, X86::VBROADCASTI32X2Z256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrk, X86::VBROADCASTI32X2Zmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
+ { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
+ { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
+ { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
+ { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
+ { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
+ { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmk, 0 },
+ { X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmk, 0 },
+ { X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0 },
+ { X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0 },
+ { X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0 },
+ { X86::VCVTPD2DQZ128rrk, X86::VCVTPD2DQZ128rmk, 0 },
+ { X86::VCVTPD2DQZ256rrk, X86::VCVTPD2DQZ256rmk, 0 },
+ { X86::VCVTPD2DQZrrk, X86::VCVTPD2DQZrmk, 0 },
+ { X86::VCVTPD2PSZ128rrk, X86::VCVTPD2PSZ128rmk, 0 },
+ { X86::VCVTPD2PSZ256rrk, X86::VCVTPD2PSZ256rmk, 0 },
+ { X86::VCVTPD2PSZrrk, X86::VCVTPD2PSZrmk, 0 },
+ { X86::VCVTPD2QQZ128rrk, X86::VCVTPD2QQZ128rmk, 0 },
+ { X86::VCVTPD2QQZ256rrk, X86::VCVTPD2QQZ256rmk, 0 },
+ { X86::VCVTPD2QQZrrk, X86::VCVTPD2QQZrmk, 0 },
+ { X86::VCVTPD2UDQZ128rrk, X86::VCVTPD2UDQZ128rmk, 0 },
+ { X86::VCVTPD2UDQZ256rrk, X86::VCVTPD2UDQZ256rmk, 0 },
+ { X86::VCVTPD2UDQZrrk, X86::VCVTPD2UDQZrmk, 0 },
+ { X86::VCVTPD2UQQZ128rrk, X86::VCVTPD2UQQZ128rmk, 0 },
+ { X86::VCVTPD2UQQZ256rrk, X86::VCVTPD2UQQZ256rmk, 0 },
+ { X86::VCVTPD2UQQZrrk, X86::VCVTPD2UQQZrmk, 0 },
+ { X86::VCVTPH2PSZ128rrk, X86::VCVTPH2PSZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rrk, X86::VCVTPH2PSZ256rmk, 0 },
+ { X86::VCVTPH2PSZrrk, X86::VCVTPH2PSZrmk, 0 },
+ { X86::VCVTPS2DQZ128rrk, X86::VCVTPS2DQZ128rmk, 0 },
+ { X86::VCVTPS2DQZ256rrk, X86::VCVTPS2DQZ256rmk, 0 },
+ { X86::VCVTPS2DQZrrk, X86::VCVTPS2DQZrmk, 0 },
+ { X86::VCVTPS2PDZ128rrk, X86::VCVTPS2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rrk, X86::VCVTPS2PDZ256rmk, 0 },
+ { X86::VCVTPS2PDZrrk, X86::VCVTPS2PDZrmk, 0 },
+ { X86::VCVTPS2QQZ128rrk, X86::VCVTPS2QQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rrk, X86::VCVTPS2QQZ256rmk, 0 },
+ { X86::VCVTPS2QQZrrk, X86::VCVTPS2QQZrmk, 0 },
+ { X86::VCVTPS2UDQZ128rrk, X86::VCVTPS2UDQZ128rmk, 0 },
+ { X86::VCVTPS2UDQZ256rrk, X86::VCVTPS2UDQZ256rmk, 0 },
+ { X86::VCVTPS2UDQZrrk, X86::VCVTPS2UDQZrmk, 0 },
+ { X86::VCVTPS2UQQZ128rrk, X86::VCVTPS2UQQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rrk, X86::VCVTPS2UQQZ256rmk, 0 },
+ { X86::VCVTPS2UQQZrrk, X86::VCVTPS2UQQZrmk, 0 },
+ { X86::VCVTQQ2PDZ128rrk, X86::VCVTQQ2PDZ128rmk, 0 },
+ { X86::VCVTQQ2PDZ256rrk, X86::VCVTQQ2PDZ256rmk, 0 },
+ { X86::VCVTQQ2PDZrrk, X86::VCVTQQ2PDZrmk, 0 },
+ { X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmk, 0 },
+ { X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmk, 0 },
+ { X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmk, 0 },
+ { X86::VCVTSD2SSZrr_Intkz, X86::VCVTSD2SSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VCVTSS2SDZrr_Intkz, X86::VCVTSS2SDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQZ128rrk, X86::VCVTTPD2DQZ128rmk, 0 },
+ { X86::VCVTTPD2DQZ256rrk, X86::VCVTTPD2DQZ256rmk, 0 },
+ { X86::VCVTTPD2DQZrrk, X86::VCVTTPD2DQZrmk, 0 },
+ { X86::VCVTTPD2QQZ128rrk, X86::VCVTTPD2QQZ128rmk, 0 },
+ { X86::VCVTTPD2QQZ256rrk, X86::VCVTTPD2QQZ256rmk, 0 },
+ { X86::VCVTTPD2QQZrrk, X86::VCVTTPD2QQZrmk, 0 },
+ { X86::VCVTTPD2UDQZ128rrk, X86::VCVTTPD2UDQZ128rmk, 0 },
+ { X86::VCVTTPD2UDQZ256rrk, X86::VCVTTPD2UDQZ256rmk, 0 },
+ { X86::VCVTTPD2UDQZrrk, X86::VCVTTPD2UDQZrmk, 0 },
+ { X86::VCVTTPD2UQQZ128rrk, X86::VCVTTPD2UQQZ128rmk, 0 },
+ { X86::VCVTTPD2UQQZ256rrk, X86::VCVTTPD2UQQZ256rmk, 0 },
+ { X86::VCVTTPD2UQQZrrk, X86::VCVTTPD2UQQZrmk, 0 },
+ { X86::VCVTTPS2DQZ128rrk, X86::VCVTTPS2DQZ128rmk, 0 },
+ { X86::VCVTTPS2DQZ256rrk, X86::VCVTTPS2DQZ256rmk, 0 },
+ { X86::VCVTTPS2DQZrrk, X86::VCVTTPS2DQZrmk, 0 },
+ { X86::VCVTTPS2QQZ128rrk, X86::VCVTTPS2QQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rrk, X86::VCVTTPS2QQZ256rmk, 0 },
+ { X86::VCVTTPS2QQZrrk, X86::VCVTTPS2QQZrmk, 0 },
+ { X86::VCVTTPS2UDQZ128rrk, X86::VCVTTPS2UDQZ128rmk, 0 },
+ { X86::VCVTTPS2UDQZ256rrk, X86::VCVTTPS2UDQZ256rmk, 0 },
+ { X86::VCVTTPS2UDQZrrk, X86::VCVTTPS2UDQZrmk, 0 },
+ { X86::VCVTTPS2UQQZ128rrk, X86::VCVTTPS2UQQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rrk, X86::VCVTTPS2UQQZ256rmk, 0 },
+ { X86::VCVTTPS2UQQZrrk, X86::VCVTTPS2UQQZrmk, 0 },
+ { X86::VCVTUDQ2PDZ128rrk, X86::VCVTUDQ2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rrk, X86::VCVTUDQ2PDZ256rmk, 0 },
+ { X86::VCVTUDQ2PDZrrk, X86::VCVTUDQ2PDZrmk, 0 },
+ { X86::VCVTUDQ2PSZ128rrk, X86::VCVTUDQ2PSZ128rmk, 0 },
+ { X86::VCVTUDQ2PSZ256rrk, X86::VCVTUDQ2PSZ256rmk, 0 },
+ { X86::VCVTUDQ2PSZrrk, X86::VCVTUDQ2PSZrmk, 0 },
+ { X86::VCVTUQQ2PDZ128rrk, X86::VCVTUQQ2PDZ128rmk, 0 },
+ { X86::VCVTUQQ2PDZ256rrk, X86::VCVTUQQ2PDZ256rmk, 0 },
+ { X86::VCVTUQQ2PDZrrk, X86::VCVTUQQ2PDZrmk, 0 },
+ { X86::VCVTUQQ2PSZ128rrk, X86::VCVTUQQ2PSZ128rmk, 0 },
+ { X86::VCVTUQQ2PSZ256rrk, X86::VCVTUQQ2PSZ256rmk, 0 },
+ { X86::VCVTUQQ2PSZrrk, X86::VCVTUQQ2PSZrmk, 0 },
+ { X86::VDBPSADBWZ128rrikz, X86::VDBPSADBWZ128rmikz, 0 },
+ { X86::VDBPSADBWZ256rrikz, X86::VDBPSADBWZ256rmikz, 0 },
+ { X86::VDBPSADBWZrrikz, X86::VDBPSADBWZrmikz, 0 },
+ { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
+ { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
+ { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
+ { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0 },
+ { X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0 },
+ { X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rrk, X86::VEXPANDPDZ256rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrrk, X86::VEXPANDPDZrmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rrk, X86::VEXPANDPSZ128rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rrk, X86::VEXPANDPSZ256rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrrk, X86::VEXPANDPSZrmk, TB_NO_REVERSE },
+ { X86::VFIXUPIMMPDZ128rri, X86::VFIXUPIMMPDZ128rmi, 0 },
+ { X86::VFIXUPIMMPDZ256rri, X86::VFIXUPIMMPDZ256rmi, 0 },
+ { X86::VFIXUPIMMPDZrri, X86::VFIXUPIMMPDZrmi, 0 },
+ { X86::VFIXUPIMMPSZ128rri, X86::VFIXUPIMMPSZ128rmi, 0 },
+ { X86::VFIXUPIMMPSZ256rri, X86::VFIXUPIMMPSZ256rmi, 0 },
+ { X86::VFIXUPIMMPSZrri, X86::VFIXUPIMMPSZrmi, 0 },
+ { X86::VFIXUPIMMSDZrri, X86::VFIXUPIMMSDZrmi, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrri, X86::VFIXUPIMMSSZrmi, TB_NO_REVERSE },
+ { X86::VFMADD132PDYr, X86::VFMADD132PDYm, 0 },
+ { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128m, 0 },
+ { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0 },
+ { X86::VFMADD132PDZr, X86::VFMADD132PDZm, 0 },
+ { X86::VFMADD132PDr, X86::VFMADD132PDm, 0 },
+ { X86::VFMADD132PSYr, X86::VFMADD132PSYm, 0 },
+ { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, 0 },
+ { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, 0 },
+ { X86::VFMADD132PSZr, X86::VFMADD132PSZm, 0 },
+ { X86::VFMADD132PSr, X86::VFMADD132PSm, 0 },
+ { X86::VFMADD132SDZr, X86::VFMADD132SDZm, 0 },
+ { X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SDr, X86::VFMADD132SDm, 0 },
+ { X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr, X86::VFMADD132SSZm, 0 },
+ { X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SSr, X86::VFMADD132SSm, 0 },
+ { X86::VFMADD132SSr_Int, X86::VFMADD132SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213PDYr, X86::VFMADD213PDYm, 0 },
+ { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128m, 0 },
+ { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0 },
+ { X86::VFMADD213PDZr, X86::VFMADD213PDZm, 0 },
+ { X86::VFMADD213PDr, X86::VFMADD213PDm, 0 },
+ { X86::VFMADD213PSYr, X86::VFMADD213PSYm, 0 },
+ { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, 0 },
+ { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, 0 },
+ { X86::VFMADD213PSZr, X86::VFMADD213PSZm, 0 },
+ { X86::VFMADD213PSr, X86::VFMADD213PSm, 0 },
+ { X86::VFMADD213SDZr, X86::VFMADD213SDZm, 0 },
+ { X86::VFMADD213SDZr_Int, X86::VFMADD213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SDr, X86::VFMADD213SDm, 0 },
+ { X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr, X86::VFMADD213SSZm, 0 },
+ { X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SSr, X86::VFMADD213SSm, 0 },
+ { X86::VFMADD213SSr_Int, X86::VFMADD213SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231PDYr, X86::VFMADD231PDYm, 0 },
+ { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128m, 0 },
+ { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0 },
+ { X86::VFMADD231PDZr, X86::VFMADD231PDZm, 0 },
+ { X86::VFMADD231PDr, X86::VFMADD231PDm, 0 },
+ { X86::VFMADD231PSYr, X86::VFMADD231PSYm, 0 },
+ { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, 0 },
+ { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, 0 },
+ { X86::VFMADD231PSZr, X86::VFMADD231PSZm, 0 },
+ { X86::VFMADD231PSr, X86::VFMADD231PSm, 0 },
+ { X86::VFMADD231SDZr, X86::VFMADD231SDZm, 0 },
+ { X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SDr, X86::VFMADD231SDm, 0 },
+ { X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr, X86::VFMADD231SSZm, 0 },
+ { X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SSr, X86::VFMADD231SSm, 0 },
+ { X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, 0 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, 0 },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDSUB132PDYr, X86::VFMADDSUB132PDYm, 0 },
+ { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128m, 0 },
+ { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, 0 },
+ { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, 0 },
+ { X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, 0 },
+ { X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, 0 },
+ { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, 0 },
+ { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, 0 },
+ { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZm, 0 },
+ { X86::VFMADDSUB132PSr, X86::VFMADDSUB132PSm, 0 },
+ { X86::VFMADDSUB213PDYr, X86::VFMADDSUB213PDYm, 0 },
+ { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128m, 0 },
+ { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, 0 },
+ { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, 0 },
+ { X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, 0 },
+ { X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, 0 },
+ { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, 0 },
+ { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, 0 },
+ { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZm, 0 },
+ { X86::VFMADDSUB213PSr, X86::VFMADDSUB213PSm, 0 },
+ { X86::VFMADDSUB231PDYr, X86::VFMADDSUB231PDYm, 0 },
+ { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128m, 0 },
+ { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, 0 },
+ { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, 0 },
+ { X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, 0 },
+ { X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, 0 },
+ { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, 0 },
+ { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, 0 },
+ { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZm, 0 },
+ { X86::VFMADDSUB231PSr, X86::VFMADDSUB231PSm, 0 },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, 0 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, 0 },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, 0 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, 0 },
+ { X86::VFMSUB132PDYr, X86::VFMSUB132PDYm, 0 },
+ { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128m, 0 },
+ { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0 },
+ { X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, 0 },
+ { X86::VFMSUB132PDr, X86::VFMSUB132PDm, 0 },
+ { X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, 0 },
+ { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, 0 },
+ { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, 0 },
+ { X86::VFMSUB132PSZr, X86::VFMSUB132PSZm, 0 },
+ { X86::VFMSUB132PSr, X86::VFMSUB132PSm, 0 },
+ { X86::VFMSUB132SDZr, X86::VFMSUB132SDZm, 0 },
+ { X86::VFMSUB132SDZr_Int, X86::VFMSUB132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SDr, X86::VFMSUB132SDm, 0 },
+ { X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, 0 },
+ { X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0 },
+ { X86::VFMSUB132SSr_Int, X86::VFMSUB132SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213PDYr, X86::VFMSUB213PDYm, 0 },
+ { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128m, 0 },
+ { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0 },
+ { X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, 0 },
+ { X86::VFMSUB213PDr, X86::VFMSUB213PDm, 0 },
+ { X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, 0 },
+ { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, 0 },
+ { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, 0 },
+ { X86::VFMSUB213PSZr, X86::VFMSUB213PSZm, 0 },
+ { X86::VFMSUB213PSr, X86::VFMSUB213PSm, 0 },
+ { X86::VFMSUB213SDZr, X86::VFMSUB213SDZm, 0 },
+ { X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SDr, X86::VFMSUB213SDm, 0 },
+ { X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, 0 },
+ { X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0 },
+ { X86::VFMSUB213SSr_Int, X86::VFMSUB213SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231PDYr, X86::VFMSUB231PDYm, 0 },
+ { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128m, 0 },
+ { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0 },
+ { X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, 0 },
+ { X86::VFMSUB231PDr, X86::VFMSUB231PDm, 0 },
+ { X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, 0 },
+ { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, 0 },
+ { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, 0 },
+ { X86::VFMSUB231PSZr, X86::VFMSUB231PSZm, 0 },
+ { X86::VFMSUB231PSr, X86::VFMSUB231PSm, 0 },
+ { X86::VFMSUB231SDZr, X86::VFMSUB231SDZm, 0 },
+ { X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SDr, X86::VFMSUB231SDm, 0 },
+ { X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, 0 },
+ { X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SSr, X86::VFMSUB231SSm, 0 },
+ { X86::VFMSUB231SSr_Int, X86::VFMSUB231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBADD132PDYr, X86::VFMSUBADD132PDYm, 0 },
+ { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128m, 0 },
+ { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, 0 },
+ { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, 0 },
+ { X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, 0 },
+ { X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, 0 },
+ { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, 0 },
+ { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, 0 },
+ { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZm, 0 },
+ { X86::VFMSUBADD132PSr, X86::VFMSUBADD132PSm, 0 },
+ { X86::VFMSUBADD213PDYr, X86::VFMSUBADD213PDYm, 0 },
+ { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128m, 0 },
+ { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, 0 },
+ { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, 0 },
+ { X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, 0 },
+ { X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, 0 },
+ { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, 0 },
+ { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, 0 },
+ { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZm, 0 },
+ { X86::VFMSUBADD213PSr, X86::VFMSUBADD213PSm, 0 },
+ { X86::VFMSUBADD231PDYr, X86::VFMSUBADD231PDYm, 0 },
+ { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128m, 0 },
+ { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, 0 },
+ { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, 0 },
+ { X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, 0 },
+ { X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, 0 },
+ { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, 0 },
+ { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, 0 },
+ { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZm, 0 },
+ { X86::VFMSUBADD231PSr, X86::VFMSUBADD231PSm, 0 },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, 0 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, 0 },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, 0 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, 0 },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, 0 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, 0 },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0 },
+ { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0 },
+ { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 },
+ { X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, 0 },
+ { X86::VFNMADD132PDr, X86::VFNMADD132PDm, 0 },
+ { X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, 0 },
+ { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, 0 },
+ { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, 0 },
+ { X86::VFNMADD132PSZr, X86::VFNMADD132PSZm, 0 },
+ { X86::VFNMADD132PSr, X86::VFNMADD132PSm, 0 },
+ { X86::VFNMADD132SDZr, X86::VFNMADD132SDZm, 0 },
+ { X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SDr, X86::VFNMADD132SDm, 0 },
+ { X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, 0 },
+ { X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0 },
+ { X86::VFNMADD132SSr_Int, X86::VFNMADD132SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213PDYr, X86::VFNMADD213PDYm, 0 },
+ { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128m, 0 },
+ { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0 },
+ { X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, 0 },
+ { X86::VFNMADD213PDr, X86::VFNMADD213PDm, 0 },
+ { X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, 0 },
+ { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, 0 },
+ { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, 0 },
+ { X86::VFNMADD213PSZr, X86::VFNMADD213PSZm, 0 },
+ { X86::VFNMADD213PSr, X86::VFNMADD213PSm, 0 },
+ { X86::VFNMADD213SDZr, X86::VFNMADD213SDZm, 0 },
+ { X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SDr, X86::VFNMADD213SDm, 0 },
+ { X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, 0 },
+ { X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0 },
+ { X86::VFNMADD213SSr_Int, X86::VFNMADD213SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231PDYr, X86::VFNMADD231PDYm, 0 },
+ { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128m, 0 },
+ { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0 },
+ { X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, 0 },
+ { X86::VFNMADD231PDr, X86::VFNMADD231PDm, 0 },
+ { X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, 0 },
+ { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, 0 },
+ { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, 0 },
+ { X86::VFNMADD231PSZr, X86::VFNMADD231PSZm, 0 },
+ { X86::VFNMADD231PSr, X86::VFNMADD231PSm, 0 },
+ { X86::VFNMADD231SDZr, X86::VFNMADD231SDZm, 0 },
+ { X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SDr, X86::VFNMADD231SDm, 0 },
+ { X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, 0 },
+ { X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SSr, X86::VFNMADD231SSm, 0 },
+ { X86::VFNMADD231SSr_Int, X86::VFNMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, 0 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, 0 },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132PDYr, X86::VFNMSUB132PDYm, 0 },
+ { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128m, 0 },
+ { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0 },
+ { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, 0 },
+ { X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, 0 },
+ { X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, 0 },
+ { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, 0 },
+ { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, 0 },
+ { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZm, 0 },
+ { X86::VFNMSUB132PSr, X86::VFNMSUB132PSm, 0 },
+ { X86::VFNMSUB132SDZr, X86::VFNMSUB132SDZm, 0 },
+ { X86::VFNMSUB132SDZr_Int, X86::VFNMSUB132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, 0 },
+ { X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr, X86::VFNMSUB132SSZm, 0 },
+ { X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0 },
+ { X86::VFNMSUB132SSr_Int, X86::VFNMSUB132SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213PDYr, X86::VFNMSUB213PDYm, 0 },
+ { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128m, 0 },
+ { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0 },
+ { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, 0 },
+ { X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, 0 },
+ { X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, 0 },
+ { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, 0 },
+ { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, 0 },
+ { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZm, 0 },
+ { X86::VFNMSUB213PSr, X86::VFNMSUB213PSm, 0 },
+ { X86::VFNMSUB213SDZr, X86::VFNMSUB213SDZm, 0 },
+ { X86::VFNMSUB213SDZr_Int, X86::VFNMSUB213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, 0 },
+ { X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr, X86::VFNMSUB213SSZm, 0 },
+ { X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0 },
+ { X86::VFNMSUB213SSr_Int, X86::VFNMSUB213SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231PDYr, X86::VFNMSUB231PDYm, 0 },
+ { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128m, 0 },
+ { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0 },
+ { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, 0 },
+ { X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, 0 },
+ { X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, 0 },
+ { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, 0 },
+ { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, 0 },
+ { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZm, 0 },
+ { X86::VFNMSUB231PSr, X86::VFNMSUB231PSm, 0 },
+ { X86::VFNMSUB231SDZr, X86::VFNMSUB231SDZm, 0 },
+ { X86::VFNMSUB231SDZr_Int, X86::VFNMSUB231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, 0 },
+ { X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr, X86::VFNMSUB231SSZm, 0 },
+ { X86::VFNMSUB231SSZr_Int, X86::VFNMSUB231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, 0 },
+ { X86::VFNMSUB231SSr_Int, X86::VFNMSUB231SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, 0 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, 0 },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0 },
+ { X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0 },
+ { X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0 },
+ { X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mk, 0 },
+ { X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mk, 0 },
+ { X86::VGETEXPPSZrk, X86::VGETEXPPSZmk, 0 },
+ { X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE },
+ { X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE },
+ { X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0 },
+ { X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0 },
+ { X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0 },
+ { X86::VGETMANTPSZ128rrik, X86::VGETMANTPSZ128rmik, 0 },
+ { X86::VGETMANTPSZ256rrik, X86::VGETMANTPSZ256rmik, 0 },
+ { X86::VGETMANTPSZrrik, X86::VGETMANTPSZrmik, 0 },
+ { X86::VGETMANTSDZrrikz, X86::VGETMANTSDZrmikz, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrrikz, X86::VGETMANTSSZrmikz, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBZ128rrikz, X86::VGF2P8AFFINEINVQBZ128rmikz, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rrikz, X86::VGF2P8AFFINEINVQBZ256rmikz, 0 },
+ { X86::VGF2P8AFFINEINVQBZrrikz, X86::VGF2P8AFFINEINVQBZrmikz, 0 },
+ { X86::VGF2P8AFFINEQBZ128rrikz, X86::VGF2P8AFFINEQBZ128rmikz, 0 },
+ { X86::VGF2P8AFFINEQBZ256rrikz, X86::VGF2P8AFFINEQBZ256rmikz, 0 },
+ { X86::VGF2P8AFFINEQBZrrikz, X86::VGF2P8AFFINEQBZrmikz, 0 },
+ { X86::VGF2P8MULBZ128rrkz, X86::VGF2P8MULBZ128rmkz, 0 },
+ { X86::VGF2P8MULBZ256rrkz, X86::VGF2P8MULBZ256rmkz, 0 },
+ { X86::VGF2P8MULBZrrkz, X86::VGF2P8MULBZrmkz, 0 },
+ { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
+ { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
+ { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
+ { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
+ { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
+ { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
+ { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
+ { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
+ { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
+ { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
+ { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
+ { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
+ { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
+ { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
+ { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
+ { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
+ { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
+ { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
+ { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
+ { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
+ { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
+ { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
+ { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
+ { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
+ { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
+ { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
+ { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
+ { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
+ { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
+ { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
+ { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
+ { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrrk, X86::VMOVAPDZrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVAPSZ128rrk, X86::VMOVAPSZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rrk, X86::VMOVAPSZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrrk, X86::VMOVAPSZrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDDUPZ128rrk, X86::VMOVDDUPZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rrk, X86::VMOVDDUPZ256rmk, 0 },
+ { X86::VMOVDDUPZrrk, X86::VMOVDDUPZrmk, 0 },
+ { X86::VMOVDQA32Z128rrk, X86::VMOVDQA32Z128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rrk, X86::VMOVDQA32Z256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrrk, X86::VMOVDQA32Zrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rrk, X86::VMOVDQA64Z128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rrk, X86::VMOVDQA64Z256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrrk, X86::VMOVDQA64Zrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQU16Z128rrk, X86::VMOVDQU16Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU16Z256rrk, X86::VMOVDQU16Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU16Zrrk, X86::VMOVDQU16Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z128rrk, X86::VMOVDQU32Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z256rrk, X86::VMOVDQU32Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Zrrk, X86::VMOVDQU32Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z128rrk, X86::VMOVDQU64Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z256rrk, X86::VMOVDQU64Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Zrrk, X86::VMOVDQU64Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z128rrk, X86::VMOVDQU8Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z256rrk, X86::VMOVDQU8Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Zrrk, X86::VMOVDQU8Zrmk, TB_NO_REVERSE },
+ { X86::VMOVSHDUPZ128rrk, X86::VMOVSHDUPZ128rmk, 0 },
+ { X86::VMOVSHDUPZ256rrk, X86::VMOVSHDUPZ256rmk, 0 },
+ { X86::VMOVSHDUPZrrk, X86::VMOVSHDUPZrmk, 0 },
+ { X86::VMOVSLDUPZ128rrk, X86::VMOVSLDUPZ128rmk, 0 },
+ { X86::VMOVSLDUPZ256rrk, X86::VMOVSLDUPZ256rmk, 0 },
+ { X86::VMOVSLDUPZrrk, X86::VMOVSLDUPZrmk, 0 },
+ { X86::VMOVUPDZ128rrk, X86::VMOVUPDZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVUPDZ256rrk, X86::VMOVUPDZ256rmk, TB_NO_REVERSE },
+ { X86::VMOVUPDZrrk, X86::VMOVUPDZrmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZ128rrk, X86::VMOVUPSZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZ256rrk, X86::VMOVUPSZ256rmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZrrk, X86::VMOVUPSZrmk, TB_NO_REVERSE },
+ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
+ { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
+ { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
+ { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
+ { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
+ { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
+ { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
+ { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
+ { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
+ { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
+ { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
+ { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
+ { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
+ { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
+ { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
+ { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
+ { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
+ { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
+ { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
+ { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
+ { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
+ { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
+ { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
+ { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
+ { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
+ { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
+ { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
+ { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
+ { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
+ { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
+ { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
+ { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+ { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+ { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
+ { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+ { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+ { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
+ { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+ { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+ { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
+ { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+ { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+ { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
+ { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+ { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+ { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
+ { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+ { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+ { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
+ { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+ { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+ { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
+ { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
+ { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
+ { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
+ { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
+ { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
+ { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
+ { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
+ { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
+ { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
+ { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
+ { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
+ { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
+ { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
+ { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
+ { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
+ { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
+ { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
+ { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
+ { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
+ { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
+ { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
+ { X86::VPBLENDMBZ128rrk, X86::VPBLENDMBZ128rmk, 0 },
+ { X86::VPBLENDMBZ256rrk, X86::VPBLENDMBZ256rmk, 0 },
+ { X86::VPBLENDMBZrrk, X86::VPBLENDMBZrmk, 0 },
+ { X86::VPBLENDMDZ128rrk, X86::VPBLENDMDZ128rmk, 0 },
+ { X86::VPBLENDMDZ256rrk, X86::VPBLENDMDZ256rmk, 0 },
+ { X86::VPBLENDMDZrrk, X86::VPBLENDMDZrmk, 0 },
+ { X86::VPBLENDMQZ128rrk, X86::VPBLENDMQZ128rmk, 0 },
+ { X86::VPBLENDMQZ256rrk, X86::VPBLENDMQZ256rmk, 0 },
+ { X86::VPBLENDMQZrrk, X86::VPBLENDMQZrmk, 0 },
+ { X86::VPBLENDMWZ128rrk, X86::VPBLENDMWZ128rmk, 0 },
+ { X86::VPBLENDMWZ256rrk, X86::VPBLENDMWZ256rmk, 0 },
+ { X86::VPBLENDMWZrrk, X86::VPBLENDMWZrmk, 0 },
+ { X86::VPBROADCASTBZ128rk, X86::VPBROADCASTBZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rk, X86::VPBROADCASTBZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrk, X86::VPBROADCASTBZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rk, X86::VPBROADCASTDZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rk, X86::VPBROADCASTDZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrk, X86::VPBROADCASTDZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rk, X86::VPBROADCASTQZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rk, X86::VPBROADCASTQZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrk, X86::VPBROADCASTQZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rk, X86::VPBROADCASTWZ128mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rk, X86::VPBROADCASTWZ256mk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrk, X86::VPBROADCASTWZmk, TB_NO_REVERSE },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
+ { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
+ { X86::VPCMPBZ256rrik, X86::VPCMPBZ256rmik, 0 },
+ { X86::VPCMPBZrrik, X86::VPCMPBZrmik, 0 },
+ { X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmik, 0 },
+ { X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmik, 0 },
+ { X86::VPCMPDZrrik, X86::VPCMPDZrmik, 0 },
+ { X86::VPCMPEQBZ128rrk, X86::VPCMPEQBZ128rmk, 0 },
+ { X86::VPCMPEQBZ256rrk, X86::VPCMPEQBZ256rmk, 0 },
+ { X86::VPCMPEQBZrrk, X86::VPCMPEQBZrmk, 0 },
+ { X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmk, 0 },
+ { X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmk, 0 },
+ { X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmk, 0 },
+ { X86::VPCMPEQQZ128rrk, X86::VPCMPEQQZ128rmk, 0 },
+ { X86::VPCMPEQQZ256rrk, X86::VPCMPEQQZ256rmk, 0 },
+ { X86::VPCMPEQQZrrk, X86::VPCMPEQQZrmk, 0 },
+ { X86::VPCMPEQWZ128rrk, X86::VPCMPEQWZ128rmk, 0 },
+ { X86::VPCMPEQWZ256rrk, X86::VPCMPEQWZ256rmk, 0 },
+ { X86::VPCMPEQWZrrk, X86::VPCMPEQWZrmk, 0 },
+ { X86::VPCMPGTBZ128rrk, X86::VPCMPGTBZ128rmk, 0 },
+ { X86::VPCMPGTBZ256rrk, X86::VPCMPGTBZ256rmk, 0 },
+ { X86::VPCMPGTBZrrk, X86::VPCMPGTBZrmk, 0 },
+ { X86::VPCMPGTDZ128rrk, X86::VPCMPGTDZ128rmk, 0 },
+ { X86::VPCMPGTDZ256rrk, X86::VPCMPGTDZ256rmk, 0 },
+ { X86::VPCMPGTDZrrk, X86::VPCMPGTDZrmk, 0 },
+ { X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmk, 0 },
+ { X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmk, 0 },
+ { X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmk, 0 },
+ { X86::VPCMPGTWZ128rrk, X86::VPCMPGTWZ128rmk, 0 },
+ { X86::VPCMPGTWZ256rrk, X86::VPCMPGTWZ256rmk, 0 },
+ { X86::VPCMPGTWZrrk, X86::VPCMPGTWZrmk, 0 },
+ { X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmik, 0 },
+ { X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmik, 0 },
+ { X86::VPCMPQZrrik, X86::VPCMPQZrmik, 0 },
+ { X86::VPCMPUBZ128rrik, X86::VPCMPUBZ128rmik, 0 },
+ { X86::VPCMPUBZ256rrik, X86::VPCMPUBZ256rmik, 0 },
+ { X86::VPCMPUBZrrik, X86::VPCMPUBZrmik, 0 },
+ { X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmik, 0 },
+ { X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmik, 0 },
+ { X86::VPCMPUDZrrik, X86::VPCMPUDZrmik, 0 },
+ { X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmik, 0 },
+ { X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmik, 0 },
+ { X86::VPCMPUQZrrik, X86::VPCMPUQZrmik, 0 },
+ { X86::VPCMPUWZ128rrik, X86::VPCMPUWZ128rmik, 0 },
+ { X86::VPCMPUWZ256rrik, X86::VPCMPUWZ256rmik, 0 },
+ { X86::VPCMPUWZrrik, X86::VPCMPUWZrmik, 0 },
+ { X86::VPCMPWZ128rrik, X86::VPCMPWZ128rmik, 0 },
+ { X86::VPCMPWZ256rrik, X86::VPCMPWZ256rmik, 0 },
+ { X86::VPCMPWZrrik, X86::VPCMPWZrmik, 0 },
+ { X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmk, 0 },
+ { X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmk, 0 },
+ { X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmk, 0 },
+ { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
+ { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
+ { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
+ { X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
+ { X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
+ { X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
+ { X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
+ { X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
+ { X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
+ { X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
+ { X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
+ { X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
+ { X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
+ { X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
+ { X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
+ { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
+ { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
+ { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
+ { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
+ { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
+ { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
+ { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
+ { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
+ { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
+ { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
+ { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
+ { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
+ { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
+ { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
+ { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
+ { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
+ { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
+ { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
+ { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
+ { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
+ { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
+ { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
+ { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
+ { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
+ { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
+ { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
+ { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
+ { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
+ { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
+ { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
+ { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
+ { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
+ { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
+ { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
+ { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
+ { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
+ { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
+ { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
+ { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
+ { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
+ { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
+ { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
+ { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
+ { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
+ { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
+ { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
+ { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
+ { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
+ { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
+ { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
+ { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
+ { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
+ { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
+ { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
+ { X86::VPEXPANDBZ128rrk, X86::VPEXPANDBZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rrk, X86::VPEXPANDBZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrrk, X86::VPEXPANDBZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rrk, X86::VPEXPANDDZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rrk, X86::VPEXPANDDZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrrk, X86::VPEXPANDDZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rrk, X86::VPEXPANDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rrk, X86::VPEXPANDQZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrrk, X86::VPEXPANDQZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rrk, X86::VPEXPANDWZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rrk, X86::VPEXPANDWZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrrk, X86::VPEXPANDWZrmk, TB_NO_REVERSE },
+ { X86::VPLZCNTDZ128rrk, X86::VPLZCNTDZ128rmk, 0 },
+ { X86::VPLZCNTDZ256rrk, X86::VPLZCNTDZ256rmk, 0 },
+ { X86::VPLZCNTDZrrk, X86::VPLZCNTDZrmk, 0 },
+ { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
+ { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
+ { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
+ { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 },
+ { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 },
+ { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 },
+ { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 },
+ { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 },
+ { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 },
+ { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
+ { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
+ { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
+ { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
+ { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
+ { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
+ { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
+ { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
+ { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
+ { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
+ { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
+ { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
+ { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
+ { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
+ { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
+ { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
+ { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
+ { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
+ { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
+ { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
+ { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
+ { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
+ { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
+ { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
+ { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
+ { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
+ { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
+ { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
+ { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
+ { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
+ { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
+ { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
+ { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
+ { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
+ { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
+ { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
+ { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
+ { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
+ { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
+ { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
+ { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
+ { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
+ { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
+ { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
+ { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
+ { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
+ { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
+ { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
+ { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
+ { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
+ { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
+ { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
+ { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
+ { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
+ { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
+ { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
+ { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
+ { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
+ { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
+ { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
+ { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
+ { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
+ { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
+ { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
+ { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
+ { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
+ { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+ { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
+ { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
+ { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
+ { X86::VPMULHRSWZ128rrkz, X86::VPMULHRSWZ128rmkz, 0 },
+ { X86::VPMULHRSWZ256rrkz, X86::VPMULHRSWZ256rmkz, 0 },
+ { X86::VPMULHRSWZrrkz, X86::VPMULHRSWZrmkz, 0 },
+ { X86::VPMULHUWZ128rrkz, X86::VPMULHUWZ128rmkz, 0 },
+ { X86::VPMULHUWZ256rrkz, X86::VPMULHUWZ256rmkz, 0 },
+ { X86::VPMULHUWZrrkz, X86::VPMULHUWZrmkz, 0 },
+ { X86::VPMULHWZ128rrkz, X86::VPMULHWZ128rmkz, 0 },
+ { X86::VPMULHWZ256rrkz, X86::VPMULHWZ256rmkz, 0 },
+ { X86::VPMULHWZrrkz, X86::VPMULHWZrmkz, 0 },
+ { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
+ { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
+ { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
+ { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
+ { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
+ { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
+ { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
+ { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
+ { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
+ { X86::VPMULTISHIFTQBZ128rrkz, X86::VPMULTISHIFTQBZ128rmkz, 0 },
+ { X86::VPMULTISHIFTQBZ256rrkz, X86::VPMULTISHIFTQBZ256rmkz, 0 },
+ { X86::VPMULTISHIFTQBZrrkz, X86::VPMULTISHIFTQBZrmkz, 0 },
+ { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
+ { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
+ { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
+ { X86::VPOPCNTBZ128rrk, X86::VPOPCNTBZ128rmk, 0 },
+ { X86::VPOPCNTBZ256rrk, X86::VPOPCNTBZ256rmk, 0 },
+ { X86::VPOPCNTBZrrk, X86::VPOPCNTBZrmk, 0 },
+ { X86::VPOPCNTDZ128rrk, X86::VPOPCNTDZ128rmk, 0 },
+ { X86::VPOPCNTDZ256rrk, X86::VPOPCNTDZ256rmk, 0 },
+ { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 },
+ { X86::VPOPCNTQZ128rrk, X86::VPOPCNTQZ128rmk, 0 },
+ { X86::VPOPCNTQZ256rrk, X86::VPOPCNTQZ256rmk, 0 },
+ { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 },
+ { X86::VPOPCNTWZ128rrk, X86::VPOPCNTWZ128rmk, 0 },
+ { X86::VPOPCNTWZ256rrk, X86::VPOPCNTWZ256rmk, 0 },
+ { X86::VPOPCNTWZrrk, X86::VPOPCNTWZrmk, 0 },
+ { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
+ { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
+ { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
+ { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+ { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+ { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
+ { X86::VPROLDZ128rik, X86::VPROLDZ128mik, 0 },
+ { X86::VPROLDZ256rik, X86::VPROLDZ256mik, 0 },
+ { X86::VPROLDZrik, X86::VPROLDZmik, 0 },
+ { X86::VPROLQZ128rik, X86::VPROLQZ128mik, 0 },
+ { X86::VPROLQZ256rik, X86::VPROLQZ256mik, 0 },
+ { X86::VPROLQZrik, X86::VPROLQZmik, 0 },
+ { X86::VPROLVDZ128rrkz, X86::VPROLVDZ128rmkz, 0 },
+ { X86::VPROLVDZ256rrkz, X86::VPROLVDZ256rmkz, 0 },
+ { X86::VPROLVDZrrkz, X86::VPROLVDZrmkz, 0 },
+ { X86::VPROLVQZ128rrkz, X86::VPROLVQZ128rmkz, 0 },
+ { X86::VPROLVQZ256rrkz, X86::VPROLVQZ256rmkz, 0 },
+ { X86::VPROLVQZrrkz, X86::VPROLVQZrmkz, 0 },
+ { X86::VPRORDZ128rik, X86::VPRORDZ128mik, 0 },
+ { X86::VPRORDZ256rik, X86::VPRORDZ256mik, 0 },
+ { X86::VPRORDZrik, X86::VPRORDZmik, 0 },
+ { X86::VPRORQZ128rik, X86::VPRORQZ128mik, 0 },
+ { X86::VPRORQZ256rik, X86::VPRORQZ256mik, 0 },
+ { X86::VPRORQZrik, X86::VPRORQZmik, 0 },
+ { X86::VPRORVDZ128rrkz, X86::VPRORVDZ128rmkz, 0 },
+ { X86::VPRORVDZ256rrkz, X86::VPRORVDZ256rmkz, 0 },
+ { X86::VPRORVDZrrkz, X86::VPRORVDZrmkz, 0 },
+ { X86::VPRORVQZ128rrkz, X86::VPRORVQZ128rmkz, 0 },
+ { X86::VPRORVQZ256rrkz, X86::VPRORVQZ256rmkz, 0 },
+ { X86::VPRORVQZrrkz, X86::VPRORVQZrmkz, 0 },
+ { X86::VPSHLDDZ128rrikz, X86::VPSHLDDZ128rmikz, 0 },
+ { X86::VPSHLDDZ256rrikz, X86::VPSHLDDZ256rmikz, 0 },
+ { X86::VPSHLDDZrrikz, X86::VPSHLDDZrmikz, 0 },
+ { X86::VPSHLDQZ128rrikz, X86::VPSHLDQZ128rmikz, 0 },
+ { X86::VPSHLDQZ256rrikz, X86::VPSHLDQZ256rmikz, 0 },
+ { X86::VPSHLDQZrrikz, X86::VPSHLDQZrmikz, 0 },
+ { X86::VPSHLDVDZ128r, X86::VPSHLDVDZ128m, 0 },
+ { X86::VPSHLDVDZ256r, X86::VPSHLDVDZ256m, 0 },
+ { X86::VPSHLDVDZr, X86::VPSHLDVDZm, 0 },
+ { X86::VPSHLDVQZ128r, X86::VPSHLDVQZ128m, 0 },
+ { X86::VPSHLDVQZ256r, X86::VPSHLDVQZ256m, 0 },
+ { X86::VPSHLDVQZr, X86::VPSHLDVQZm, 0 },
+ { X86::VPSHLDVWZ128r, X86::VPSHLDVWZ128m, 0 },
+ { X86::VPSHLDVWZ256r, X86::VPSHLDVWZ256m, 0 },
+ { X86::VPSHLDVWZr, X86::VPSHLDVWZm, 0 },
+ { X86::VPSHLDWZ128rrikz, X86::VPSHLDWZ128rmikz, 0 },
+ { X86::VPSHLDWZ256rrikz, X86::VPSHLDWZ256rmikz, 0 },
+ { X86::VPSHLDWZrrikz, X86::VPSHLDWZrmikz, 0 },
+ { X86::VPSHRDDZ128rrikz, X86::VPSHRDDZ128rmikz, 0 },
+ { X86::VPSHRDDZ256rrikz, X86::VPSHRDDZ256rmikz, 0 },
+ { X86::VPSHRDDZrrikz, X86::VPSHRDDZrmikz, 0 },
+ { X86::VPSHRDQZ128rrikz, X86::VPSHRDQZ128rmikz, 0 },
+ { X86::VPSHRDQZ256rrikz, X86::VPSHRDQZ256rmikz, 0 },
+ { X86::VPSHRDQZrrikz, X86::VPSHRDQZrmikz, 0 },
+ { X86::VPSHRDVDZ128r, X86::VPSHRDVDZ128m, 0 },
+ { X86::VPSHRDVDZ256r, X86::VPSHRDVDZ256m, 0 },
+ { X86::VPSHRDVDZr, X86::VPSHRDVDZm, 0 },
+ { X86::VPSHRDVQZ128r, X86::VPSHRDVQZ128m, 0 },
+ { X86::VPSHRDVQZ256r, X86::VPSHRDVQZ256m, 0 },
+ { X86::VPSHRDVQZr, X86::VPSHRDVQZm, 0 },
+ { X86::VPSHRDVWZ128r, X86::VPSHRDVWZ128m, 0 },
+ { X86::VPSHRDVWZ256r, X86::VPSHRDVWZ256m, 0 },
+ { X86::VPSHRDVWZr, X86::VPSHRDVWZm, 0 },
+ { X86::VPSHRDWZ128rrikz, X86::VPSHRDWZ128rmikz, 0 },
+ { X86::VPSHRDWZ256rrikz, X86::VPSHRDWZ256rmikz, 0 },
+ { X86::VPSHRDWZrrikz, X86::VPSHRDWZrmikz, 0 },
+ { X86::VPSHUFBITQMBZ128rrk, X86::VPSHUFBITQMBZ128rmk, 0 },
+ { X86::VPSHUFBITQMBZ256rrk, X86::VPSHUFBITQMBZ256rmk, 0 },
+ { X86::VPSHUFBITQMBZrrk, X86::VPSHUFBITQMBZrmk, 0 },
+ { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
+ { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
+ { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
+ { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
+ { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
+ { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
+ { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
+ { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+ { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+ { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
+ { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
+ { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
+ { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
+ { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
+ { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
+ { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
+ { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
+ { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
+ { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
+ { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
+ { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
+ { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
+ { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
+ { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
+ { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
+ { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
+ { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
+ { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
+ { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
+ { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
+ { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
+ { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
+ { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
+ { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
+ { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
+ { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
+ { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
+ { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
+ { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
+ { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
+ { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
+ { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
+ { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
+ { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
+ { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
+ { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
+ { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
+ { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
+ { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
+ { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
+ { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
+ { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
+ { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
+ { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
+ { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
+ { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
+ { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
+ { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
+ { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
+ { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
+ { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
+ { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
+ { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
+ { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
+ { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
+ { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
+ { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
+ { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
+ { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
+ { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
+ { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
+ { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
+ { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
+ { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
+ { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
+ { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
+ { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
+ { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
+ { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
+ { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
+ { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
+ { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
+ { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
+ { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
+ { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
+ { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
+ { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
+ { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
+ { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
+ { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
+ { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+ { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+ { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
+ { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+ { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+ { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
+ { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+ { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+ { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
+ { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+ { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+ { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
+ { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+ { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+ { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
+ { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+ { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+ { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
+ { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+ { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+ { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
+ { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
+ { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
+ { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
+ { X86::VPTESTMBZ128rrk, X86::VPTESTMBZ128rmk, 0 },
+ { X86::VPTESTMBZ256rrk, X86::VPTESTMBZ256rmk, 0 },
+ { X86::VPTESTMBZrrk, X86::VPTESTMBZrmk, 0 },
+ { X86::VPTESTMDZ128rrk, X86::VPTESTMDZ128rmk, 0 },
+ { X86::VPTESTMDZ256rrk, X86::VPTESTMDZ256rmk, 0 },
+ { X86::VPTESTMDZrrk, X86::VPTESTMDZrmk, 0 },
+ { X86::VPTESTMQZ128rrk, X86::VPTESTMQZ128rmk, 0 },
+ { X86::VPTESTMQZ256rrk, X86::VPTESTMQZ256rmk, 0 },
+ { X86::VPTESTMQZrrk, X86::VPTESTMQZrmk, 0 },
+ { X86::VPTESTMWZ128rrk, X86::VPTESTMWZ128rmk, 0 },
+ { X86::VPTESTMWZ256rrk, X86::VPTESTMWZ256rmk, 0 },
+ { X86::VPTESTMWZrrk, X86::VPTESTMWZrmk, 0 },
+ { X86::VPTESTNMBZ128rrk, X86::VPTESTNMBZ128rmk, 0 },
+ { X86::VPTESTNMBZ256rrk, X86::VPTESTNMBZ256rmk, 0 },
+ { X86::VPTESTNMBZrrk, X86::VPTESTNMBZrmk, 0 },
+ { X86::VPTESTNMDZ128rrk, X86::VPTESTNMDZ128rmk, 0 },
+ { X86::VPTESTNMDZ256rrk, X86::VPTESTNMDZ256rmk, 0 },
+ { X86::VPTESTNMDZrrk, X86::VPTESTNMDZrmk, 0 },
+ { X86::VPTESTNMQZ128rrk, X86::VPTESTNMQZ128rmk, 0 },
+ { X86::VPTESTNMQZ256rrk, X86::VPTESTNMQZ256rmk, 0 },
+ { X86::VPTESTNMQZrrk, X86::VPTESTNMQZrmk, 0 },
+ { X86::VPTESTNMWZ128rrk, X86::VPTESTNMWZ128rmk, 0 },
+ { X86::VPTESTNMWZ256rrk, X86::VPTESTNMWZ256rmk, 0 },
+ { X86::VPTESTNMWZrrk, X86::VPTESTNMWZrmk, 0 },
+ { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
+ { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
+ { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
+ { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
+ { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
+ { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
+ { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
+ { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
+ { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
+ { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
+ { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
+ { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
+ { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
+ { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
+ { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
+ { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
+ { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
+ { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
+ { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
+ { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
+ { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
+ { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
+ { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
+ { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
+ { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
+ { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
+ { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
+ { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VRANGEPDZ128rrikz, X86::VRANGEPDZ128rmikz, 0 },
+ { X86::VRANGEPDZ256rrikz, X86::VRANGEPDZ256rmikz, 0 },
+ { X86::VRANGEPDZrrikz, X86::VRANGEPDZrmikz, 0 },
+ { X86::VRANGEPSZ128rrikz, X86::VRANGEPSZ128rmikz, 0 },
+ { X86::VRANGEPSZ256rrikz, X86::VRANGEPSZ256rmikz, 0 },
+ { X86::VRANGEPSZrrikz, X86::VRANGEPSZrmikz, 0 },
+ { X86::VRANGESDZrrikz, X86::VRANGESDZrmikz, TB_NO_REVERSE },
+ { X86::VRANGESSZrrikz, X86::VRANGESSZrmikz, TB_NO_REVERSE },
+ { X86::VRCP14PDZ128rk, X86::VRCP14PDZ128mk, 0 },
+ { X86::VRCP14PDZ256rk, X86::VRCP14PDZ256mk, 0 },
+ { X86::VRCP14PDZrk, X86::VRCP14PDZmk, 0 },
+ { X86::VRCP14PSZ128rk, X86::VRCP14PSZ128mk, 0 },
+ { X86::VRCP14PSZ256rk, X86::VRCP14PSZ256mk, 0 },
+ { X86::VRCP14PSZrk, X86::VRCP14PSZmk, 0 },
+ { X86::VRCP14SDZrrkz, X86::VRCP14SDZrmkz, TB_NO_REVERSE },
+ { X86::VRCP14SSZrrkz, X86::VRCP14SSZrmkz, TB_NO_REVERSE },
+ { X86::VRCP28PDZrk, X86::VRCP28PDZmk, 0 },
+ { X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0 },
+ { X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE },
+ { X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE },
+ { X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0 },
+ { X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmik, 0 },
+ { X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0 },
+ { X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmik, 0 },
+ { X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmik, 0 },
+ { X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmik, 0 },
+ { X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE },
+ { X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE },
+ { X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0 },
+ { X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0 },
+ { X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0 },
+ { X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0 },
+ { X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0 },
+ { X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0 },
+ { X86::VRNDSCALESDZr_Intkz, X86::VRNDSCALESDZm_Intkz, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr_Intkz, X86::VRNDSCALESSZm_Intkz, TB_NO_REVERSE },
+ { X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0 },
+ { X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0 },
+ { X86::VRSQRT14PDZrk, X86::VRSQRT14PDZmk, 0 },
+ { X86::VRSQRT14PSZ128rk, X86::VRSQRT14PSZ128mk, 0 },
+ { X86::VRSQRT14PSZ256rk, X86::VRSQRT14PSZ256mk, 0 },
+ { X86::VRSQRT14PSZrk, X86::VRSQRT14PSZmk, 0 },
+ { X86::VRSQRT14SDZrrkz, X86::VRSQRT14SDZrmkz, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrrkz, X86::VRSQRT14SSZrmkz, TB_NO_REVERSE },
+ { X86::VRSQRT28PDZrk, X86::VRSQRT28PDZmk, 0 },
+ { X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0 },
+ { X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0 },
+ { X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0 },
+ { X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0 },
+ { X86::VSCALEFPSZ128rrkz, X86::VSCALEFPSZ128rmkz, 0 },
+ { X86::VSCALEFPSZ256rrkz, X86::VSCALEFPSZ256rmkz, 0 },
+ { X86::VSCALEFPSZrrkz, X86::VSCALEFPSZrmkz, 0 },
+ { X86::VSCALEFSDZrrkz, X86::VSCALEFSDZrmkz, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrrkz, X86::VSCALEFSSZrmkz, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rrikz, X86::VSHUFF32X4Z256rmikz, 0 },
+ { X86::VSHUFF32X4Zrrikz, X86::VSHUFF32X4Zrmikz, 0 },
+ { X86::VSHUFF64X2Z256rrikz, X86::VSHUFF64X2Z256rmikz, 0 },
+ { X86::VSHUFF64X2Zrrikz, X86::VSHUFF64X2Zrmikz, 0 },
+ { X86::VSHUFI32X4Z256rrikz, X86::VSHUFI32X4Z256rmikz, 0 },
+ { X86::VSHUFI32X4Zrrikz, X86::VSHUFI32X4Zrmikz, 0 },
+ { X86::VSHUFI64X2Z256rrikz, X86::VSHUFI64X2Z256rmikz, 0 },
+ { X86::VSHUFI64X2Zrrikz, X86::VSHUFI64X2Zrmikz, 0 },
+ { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
+ { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
+ { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
+ { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
+ { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
+ { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
+ { X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0 },
+ { X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0 },
+ { X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0 },
+ { X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0 },
+ { X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0 },
+ { X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0 },
+ { X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE },
+ { X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
+ { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
+ { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+ { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+ { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
+ { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
+ { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
+ { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
+ { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
+ { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
+ { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
+ { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
+ { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
+ { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
+ { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
+ { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
+ { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
+ { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
+ { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
+ { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
+ { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
+ { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
+ { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
+ { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
+ { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
+ { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
+ { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
+ { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
+ { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
+ { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
+ { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
+ { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
+ { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
+ { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
+ { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
+ { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
+ { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
+ { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
+ { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
+ { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
+ { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
+ { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
+ { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
+ { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
+ { X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 },
+ { X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0 },
+ { X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0 },
+ { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
+ { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
+ { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
+ { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 },
+ { X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 },
+ { X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 },
+ { X86::VFIXUPIMMPDZ256rrikz, X86::VFIXUPIMMPDZ256rmikz, 0 },
+ { X86::VFIXUPIMMPDZrrik, X86::VFIXUPIMMPDZrmik, 0 },
+ { X86::VFIXUPIMMPDZrrikz, X86::VFIXUPIMMPDZrmikz, 0 },
+ { X86::VFIXUPIMMPSZ128rrik, X86::VFIXUPIMMPSZ128rmik, 0 },
+ { X86::VFIXUPIMMPSZ128rrikz, X86::VFIXUPIMMPSZ128rmikz, 0 },
+ { X86::VFIXUPIMMPSZ256rrik, X86::VFIXUPIMMPSZ256rmik, 0 },
+ { X86::VFIXUPIMMPSZ256rrikz, X86::VFIXUPIMMPSZ256rmikz, 0 },
+ { X86::VFIXUPIMMPSZrrik, X86::VFIXUPIMMPSZrmik, 0 },
+ { X86::VFIXUPIMMPSZrrikz, X86::VFIXUPIMMPSZrmikz, 0 },
+ { X86::VFIXUPIMMSDZrrik, X86::VFIXUPIMMSDZrmik, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSDZrrikz, X86::VFIXUPIMMSDZrmikz, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrrik, X86::VFIXUPIMMSSZrmik, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrrikz, X86::VFIXUPIMMSSZrmikz, TB_NO_REVERSE },
+ { X86::VFMADD132PDZ128rk, X86::VFMADD132PDZ128mk, 0 },
+ { X86::VFMADD132PDZ128rkz, X86::VFMADD132PDZ128mkz, 0 },
+ { X86::VFMADD132PDZ256rk, X86::VFMADD132PDZ256mk, 0 },
+ { X86::VFMADD132PDZ256rkz, X86::VFMADD132PDZ256mkz, 0 },
+ { X86::VFMADD132PDZrk, X86::VFMADD132PDZmk, 0 },
+ { X86::VFMADD132PDZrkz, X86::VFMADD132PDZmkz, 0 },
+ { X86::VFMADD132PSZ128rk, X86::VFMADD132PSZ128mk, 0 },
+ { X86::VFMADD132PSZ128rkz, X86::VFMADD132PSZ128mkz, 0 },
+ { X86::VFMADD132PSZ256rk, X86::VFMADD132PSZ256mk, 0 },
+ { X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mkz, 0 },
+ { X86::VFMADD132PSZrk, X86::VFMADD132PSZmk, 0 },
+ { X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0 },
+ { X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0 },
+ { X86::VFMADD213PDZ128rkz, X86::VFMADD213PDZ128mkz, 0 },
+ { X86::VFMADD213PDZ256rk, X86::VFMADD213PDZ256mk, 0 },
+ { X86::VFMADD213PDZ256rkz, X86::VFMADD213PDZ256mkz, 0 },
+ { X86::VFMADD213PDZrk, X86::VFMADD213PDZmk, 0 },
+ { X86::VFMADD213PDZrkz, X86::VFMADD213PDZmkz, 0 },
+ { X86::VFMADD213PSZ128rk, X86::VFMADD213PSZ128mk, 0 },
+ { X86::VFMADD213PSZ128rkz, X86::VFMADD213PSZ128mkz, 0 },
+ { X86::VFMADD213PSZ256rk, X86::VFMADD213PSZ256mk, 0 },
+ { X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mkz, 0 },
+ { X86::VFMADD213PSZrk, X86::VFMADD213PSZmk, 0 },
+ { X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0 },
+ { X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0 },
+ { X86::VFMADD231PDZ128rkz, X86::VFMADD231PDZ128mkz, 0 },
+ { X86::VFMADD231PDZ256rk, X86::VFMADD231PDZ256mk, 0 },
+ { X86::VFMADD231PDZ256rkz, X86::VFMADD231PDZ256mkz, 0 },
+ { X86::VFMADD231PDZrk, X86::VFMADD231PDZmk, 0 },
+ { X86::VFMADD231PDZrkz, X86::VFMADD231PDZmkz, 0 },
+ { X86::VFMADD231PSZ128rk, X86::VFMADD231PSZ128mk, 0 },
+ { X86::VFMADD231PSZ128rkz, X86::VFMADD231PSZ128mkz, 0 },
+ { X86::VFMADD231PSZ256rk, X86::VFMADD231PSZ256mk, 0 },
+ { X86::VFMADD231PSZ256rkz, X86::VFMADD231PSZ256mkz, 0 },
+ { X86::VFMADD231PSZrk, X86::VFMADD231PSZmk, 0 },
+ { X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0 },
+ { X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 },
+ { X86::VFMADDSUB132PDZ128rkz, X86::VFMADDSUB132PDZ128mkz, 0 },
+ { X86::VFMADDSUB132PDZ256rk, X86::VFMADDSUB132PDZ256mk, 0 },
+ { X86::VFMADDSUB132PDZ256rkz, X86::VFMADDSUB132PDZ256mkz, 0 },
+ { X86::VFMADDSUB132PDZrk, X86::VFMADDSUB132PDZmk, 0 },
+ { X86::VFMADDSUB132PDZrkz, X86::VFMADDSUB132PDZmkz, 0 },
+ { X86::VFMADDSUB132PSZ128rk, X86::VFMADDSUB132PSZ128mk, 0 },
+ { X86::VFMADDSUB132PSZ128rkz, X86::VFMADDSUB132PSZ128mkz, 0 },
+ { X86::VFMADDSUB132PSZ256rk, X86::VFMADDSUB132PSZ256mk, 0 },
+ { X86::VFMADDSUB132PSZ256rkz, X86::VFMADDSUB132PSZ256mkz, 0 },
+ { X86::VFMADDSUB132PSZrk, X86::VFMADDSUB132PSZmk, 0 },
+ { X86::VFMADDSUB132PSZrkz, X86::VFMADDSUB132PSZmkz, 0 },
+ { X86::VFMADDSUB213PDZ128rk, X86::VFMADDSUB213PDZ128mk, 0 },
+ { X86::VFMADDSUB213PDZ128rkz, X86::VFMADDSUB213PDZ128mkz, 0 },
+ { X86::VFMADDSUB213PDZ256rk, X86::VFMADDSUB213PDZ256mk, 0 },
+ { X86::VFMADDSUB213PDZ256rkz, X86::VFMADDSUB213PDZ256mkz, 0 },
+ { X86::VFMADDSUB213PDZrk, X86::VFMADDSUB213PDZmk, 0 },
+ { X86::VFMADDSUB213PDZrkz, X86::VFMADDSUB213PDZmkz, 0 },
+ { X86::VFMADDSUB213PSZ128rk, X86::VFMADDSUB213PSZ128mk, 0 },
+ { X86::VFMADDSUB213PSZ128rkz, X86::VFMADDSUB213PSZ128mkz, 0 },
+ { X86::VFMADDSUB213PSZ256rk, X86::VFMADDSUB213PSZ256mk, 0 },
+ { X86::VFMADDSUB213PSZ256rkz, X86::VFMADDSUB213PSZ256mkz, 0 },
+ { X86::VFMADDSUB213PSZrk, X86::VFMADDSUB213PSZmk, 0 },
+ { X86::VFMADDSUB213PSZrkz, X86::VFMADDSUB213PSZmkz, 0 },
+ { X86::VFMADDSUB231PDZ128rk, X86::VFMADDSUB231PDZ128mk, 0 },
+ { X86::VFMADDSUB231PDZ128rkz, X86::VFMADDSUB231PDZ128mkz, 0 },
+ { X86::VFMADDSUB231PDZ256rk, X86::VFMADDSUB231PDZ256mk, 0 },
+ { X86::VFMADDSUB231PDZ256rkz, X86::VFMADDSUB231PDZ256mkz, 0 },
+ { X86::VFMADDSUB231PDZrk, X86::VFMADDSUB231PDZmk, 0 },
+ { X86::VFMADDSUB231PDZrkz, X86::VFMADDSUB231PDZmkz, 0 },
+ { X86::VFMADDSUB231PSZ128rk, X86::VFMADDSUB231PSZ128mk, 0 },
+ { X86::VFMADDSUB231PSZ128rkz, X86::VFMADDSUB231PSZ128mkz, 0 },
+ { X86::VFMADDSUB231PSZ256rk, X86::VFMADDSUB231PSZ256mk, 0 },
+ { X86::VFMADDSUB231PSZ256rkz, X86::VFMADDSUB231PSZ256mkz, 0 },
+ { X86::VFMADDSUB231PSZrk, X86::VFMADDSUB231PSZmk, 0 },
+ { X86::VFMADDSUB231PSZrkz, X86::VFMADDSUB231PSZmkz, 0 },
+ { X86::VFMSUB132PDZ128rk, X86::VFMSUB132PDZ128mk, 0 },
+ { X86::VFMSUB132PDZ128rkz, X86::VFMSUB132PDZ128mkz, 0 },
+ { X86::VFMSUB132PDZ256rk, X86::VFMSUB132PDZ256mk, 0 },
+ { X86::VFMSUB132PDZ256rkz, X86::VFMSUB132PDZ256mkz, 0 },
+ { X86::VFMSUB132PDZrk, X86::VFMSUB132PDZmk, 0 },
+ { X86::VFMSUB132PDZrkz, X86::VFMSUB132PDZmkz, 0 },
+ { X86::VFMSUB132PSZ128rk, X86::VFMSUB132PSZ128mk, 0 },
+ { X86::VFMSUB132PSZ128rkz, X86::VFMSUB132PSZ128mkz, 0 },
+ { X86::VFMSUB132PSZ256rk, X86::VFMSUB132PSZ256mk, 0 },
+ { X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mkz, 0 },
+ { X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmk, 0 },
+ { X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0 },
+ { X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0 },
+ { X86::VFMSUB213PDZ128rkz, X86::VFMSUB213PDZ128mkz, 0 },
+ { X86::VFMSUB213PDZ256rk, X86::VFMSUB213PDZ256mk, 0 },
+ { X86::VFMSUB213PDZ256rkz, X86::VFMSUB213PDZ256mkz, 0 },
+ { X86::VFMSUB213PDZrk, X86::VFMSUB213PDZmk, 0 },
+ { X86::VFMSUB213PDZrkz, X86::VFMSUB213PDZmkz, 0 },
+ { X86::VFMSUB213PSZ128rk, X86::VFMSUB213PSZ128mk, 0 },
+ { X86::VFMSUB213PSZ128rkz, X86::VFMSUB213PSZ128mkz, 0 },
+ { X86::VFMSUB213PSZ256rk, X86::VFMSUB213PSZ256mk, 0 },
+ { X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mkz, 0 },
+ { X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmk, 0 },
+ { X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0 },
+ { X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0 },
+ { X86::VFMSUB231PDZ128rkz, X86::VFMSUB231PDZ128mkz, 0 },
+ { X86::VFMSUB231PDZ256rk, X86::VFMSUB231PDZ256mk, 0 },
+ { X86::VFMSUB231PDZ256rkz, X86::VFMSUB231PDZ256mkz, 0 },
+ { X86::VFMSUB231PDZrk, X86::VFMSUB231PDZmk, 0 },
+ { X86::VFMSUB231PDZrkz, X86::VFMSUB231PDZmkz, 0 },
+ { X86::VFMSUB231PSZ128rk, X86::VFMSUB231PSZ128mk, 0 },
+ { X86::VFMSUB231PSZ128rkz, X86::VFMSUB231PSZ128mkz, 0 },
+ { X86::VFMSUB231PSZ256rk, X86::VFMSUB231PSZ256mk, 0 },
+ { X86::VFMSUB231PSZ256rkz, X86::VFMSUB231PSZ256mkz, 0 },
+ { X86::VFMSUB231PSZrk, X86::VFMSUB231PSZmk, 0 },
+ { X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0 },
+ { X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0 },
+ { X86::VFMSUBADD132PDZ128rkz, X86::VFMSUBADD132PDZ128mkz, 0 },
+ { X86::VFMSUBADD132PDZ256rk, X86::VFMSUBADD132PDZ256mk, 0 },
+ { X86::VFMSUBADD132PDZ256rkz, X86::VFMSUBADD132PDZ256mkz, 0 },
+ { X86::VFMSUBADD132PDZrk, X86::VFMSUBADD132PDZmk, 0 },
+ { X86::VFMSUBADD132PDZrkz, X86::VFMSUBADD132PDZmkz, 0 },
+ { X86::VFMSUBADD132PSZ128rk, X86::VFMSUBADD132PSZ128mk, 0 },
+ { X86::VFMSUBADD132PSZ128rkz, X86::VFMSUBADD132PSZ128mkz, 0 },
+ { X86::VFMSUBADD132PSZ256rk, X86::VFMSUBADD132PSZ256mk, 0 },
+ { X86::VFMSUBADD132PSZ256rkz, X86::VFMSUBADD132PSZ256mkz, 0 },
+ { X86::VFMSUBADD132PSZrk, X86::VFMSUBADD132PSZmk, 0 },
+ { X86::VFMSUBADD132PSZrkz, X86::VFMSUBADD132PSZmkz, 0 },
+ { X86::VFMSUBADD213PDZ128rk, X86::VFMSUBADD213PDZ128mk, 0 },
+ { X86::VFMSUBADD213PDZ128rkz, X86::VFMSUBADD213PDZ128mkz, 0 },
+ { X86::VFMSUBADD213PDZ256rk, X86::VFMSUBADD213PDZ256mk, 0 },
+ { X86::VFMSUBADD213PDZ256rkz, X86::VFMSUBADD213PDZ256mkz, 0 },
+ { X86::VFMSUBADD213PDZrk, X86::VFMSUBADD213PDZmk, 0 },
+ { X86::VFMSUBADD213PDZrkz, X86::VFMSUBADD213PDZmkz, 0 },
+ { X86::VFMSUBADD213PSZ128rk, X86::VFMSUBADD213PSZ128mk, 0 },
+ { X86::VFMSUBADD213PSZ128rkz, X86::VFMSUBADD213PSZ128mkz, 0 },
+ { X86::VFMSUBADD213PSZ256rk, X86::VFMSUBADD213PSZ256mk, 0 },
+ { X86::VFMSUBADD213PSZ256rkz, X86::VFMSUBADD213PSZ256mkz, 0 },
+ { X86::VFMSUBADD213PSZrk, X86::VFMSUBADD213PSZmk, 0 },
+ { X86::VFMSUBADD213PSZrkz, X86::VFMSUBADD213PSZmkz, 0 },
+ { X86::VFMSUBADD231PDZ128rk, X86::VFMSUBADD231PDZ128mk, 0 },
+ { X86::VFMSUBADD231PDZ128rkz, X86::VFMSUBADD231PDZ128mkz, 0 },
+ { X86::VFMSUBADD231PDZ256rk, X86::VFMSUBADD231PDZ256mk, 0 },
+ { X86::VFMSUBADD231PDZ256rkz, X86::VFMSUBADD231PDZ256mkz, 0 },
+ { X86::VFMSUBADD231PDZrk, X86::VFMSUBADD231PDZmk, 0 },
+ { X86::VFMSUBADD231PDZrkz, X86::VFMSUBADD231PDZmkz, 0 },
+ { X86::VFMSUBADD231PSZ128rk, X86::VFMSUBADD231PSZ128mk, 0 },
+ { X86::VFMSUBADD231PSZ128rkz, X86::VFMSUBADD231PSZ128mkz, 0 },
+ { X86::VFMSUBADD231PSZ256rk, X86::VFMSUBADD231PSZ256mk, 0 },
+ { X86::VFMSUBADD231PSZ256rkz, X86::VFMSUBADD231PSZ256mkz, 0 },
+ { X86::VFMSUBADD231PSZrk, X86::VFMSUBADD231PSZmk, 0 },
+ { X86::VFMSUBADD231PSZrkz, X86::VFMSUBADD231PSZmkz, 0 },
+ { X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0 },
+ { X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0 },
+ { X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0 },
+ { X86::VFNMADD132PDZ256rkz, X86::VFNMADD132PDZ256mkz, 0 },
+ { X86::VFNMADD132PDZrk, X86::VFNMADD132PDZmk, 0 },
+ { X86::VFNMADD132PDZrkz, X86::VFNMADD132PDZmkz, 0 },
+ { X86::VFNMADD132PSZ128rk, X86::VFNMADD132PSZ128mk, 0 },
+ { X86::VFNMADD132PSZ128rkz, X86::VFNMADD132PSZ128mkz, 0 },
+ { X86::VFNMADD132PSZ256rk, X86::VFNMADD132PSZ256mk, 0 },
+ { X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mkz, 0 },
+ { X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmk, 0 },
+ { X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0 },
+ { X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0 },
+ { X86::VFNMADD213PDZ128rkz, X86::VFNMADD213PDZ128mkz, 0 },
+ { X86::VFNMADD213PDZ256rk, X86::VFNMADD213PDZ256mk, 0 },
+ { X86::VFNMADD213PDZ256rkz, X86::VFNMADD213PDZ256mkz, 0 },
+ { X86::VFNMADD213PDZrk, X86::VFNMADD213PDZmk, 0 },
+ { X86::VFNMADD213PDZrkz, X86::VFNMADD213PDZmkz, 0 },
+ { X86::VFNMADD213PSZ128rk, X86::VFNMADD213PSZ128mk, 0 },
+ { X86::VFNMADD213PSZ128rkz, X86::VFNMADD213PSZ128mkz, 0 },
+ { X86::VFNMADD213PSZ256rk, X86::VFNMADD213PSZ256mk, 0 },
+ { X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mkz, 0 },
+ { X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmk, 0 },
+ { X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0 },
+ { X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0 },
+ { X86::VFNMADD231PDZ128rkz, X86::VFNMADD231PDZ128mkz, 0 },
+ { X86::VFNMADD231PDZ256rk, X86::VFNMADD231PDZ256mk, 0 },
+ { X86::VFNMADD231PDZ256rkz, X86::VFNMADD231PDZ256mkz, 0 },
+ { X86::VFNMADD231PDZrk, X86::VFNMADD231PDZmk, 0 },
+ { X86::VFNMADD231PDZrkz, X86::VFNMADD231PDZmkz, 0 },
+ { X86::VFNMADD231PSZ128rk, X86::VFNMADD231PSZ128mk, 0 },
+ { X86::VFNMADD231PSZ128rkz, X86::VFNMADD231PSZ128mkz, 0 },
+ { X86::VFNMADD231PSZ256rk, X86::VFNMADD231PSZ256mk, 0 },
+ { X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mkz, 0 },
+ { X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmk, 0 },
+ { X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0 },
+ { X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0 },
+ { X86::VFNMSUB132PDZ128rkz, X86::VFNMSUB132PDZ128mkz, 0 },
+ { X86::VFNMSUB132PDZ256rk, X86::VFNMSUB132PDZ256mk, 0 },
+ { X86::VFNMSUB132PDZ256rkz, X86::VFNMSUB132PDZ256mkz, 0 },
+ { X86::VFNMSUB132PDZrk, X86::VFNMSUB132PDZmk, 0 },
+ { X86::VFNMSUB132PDZrkz, X86::VFNMSUB132PDZmkz, 0 },
+ { X86::VFNMSUB132PSZ128rk, X86::VFNMSUB132PSZ128mk, 0 },
+ { X86::VFNMSUB132PSZ128rkz, X86::VFNMSUB132PSZ128mkz, 0 },
+ { X86::VFNMSUB132PSZ256rk, X86::VFNMSUB132PSZ256mk, 0 },
+ { X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mkz, 0 },
+ { X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmk, 0 },
+ { X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0 },
+ { X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0 },
+ { X86::VFNMSUB213PDZ128rkz, X86::VFNMSUB213PDZ128mkz, 0 },
+ { X86::VFNMSUB213PDZ256rk, X86::VFNMSUB213PDZ256mk, 0 },
+ { X86::VFNMSUB213PDZ256rkz, X86::VFNMSUB213PDZ256mkz, 0 },
+ { X86::VFNMSUB213PDZrk, X86::VFNMSUB213PDZmk, 0 },
+ { X86::VFNMSUB213PDZrkz, X86::VFNMSUB213PDZmkz, 0 },
+ { X86::VFNMSUB213PSZ128rk, X86::VFNMSUB213PSZ128mk, 0 },
+ { X86::VFNMSUB213PSZ128rkz, X86::VFNMSUB213PSZ128mkz, 0 },
+ { X86::VFNMSUB213PSZ256rk, X86::VFNMSUB213PSZ256mk, 0 },
+ { X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mkz, 0 },
+ { X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmk, 0 },
+ { X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0 },
+ { X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0 },
+ { X86::VFNMSUB231PDZ128rkz, X86::VFNMSUB231PDZ128mkz, 0 },
+ { X86::VFNMSUB231PDZ256rk, X86::VFNMSUB231PDZ256mk, 0 },
+ { X86::VFNMSUB231PDZ256rkz, X86::VFNMSUB231PDZ256mkz, 0 },
+ { X86::VFNMSUB231PDZrk, X86::VFNMSUB231PDZmk, 0 },
+ { X86::VFNMSUB231PDZrkz, X86::VFNMSUB231PDZmkz, 0 },
+ { X86::VFNMSUB231PSZ128rk, X86::VFNMSUB231PSZ128mk, 0 },
+ { X86::VFNMSUB231PSZ128rkz, X86::VFNMSUB231PSZ128mkz, 0 },
+ { X86::VFNMSUB231PSZ256rk, X86::VFNMSUB231PSZ256mk, 0 },
+ { X86::VFNMSUB231PSZ256rkz, X86::VFNMSUB231PSZ256mkz, 0 },
+ { X86::VFNMSUB231PSZrk, X86::VFNMSUB231PSZmk, 0 },
+ { X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0 },
+ { X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE },
+ { X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE },
+ { X86::VGETMANTSDZrrik, X86::VGETMANTSDZrmik, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrrik, X86::VGETMANTSSZrmik, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBZ128rrik, X86::VGF2P8AFFINEINVQBZ128rmik, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rrik, X86::VGF2P8AFFINEINVQBZ256rmik, 0 },
+ { X86::VGF2P8AFFINEINVQBZrrik, X86::VGF2P8AFFINEINVQBZrmik, 0 },
+ { X86::VGF2P8AFFINEQBZ128rrik, X86::VGF2P8AFFINEQBZ128rmik, 0 },
+ { X86::VGF2P8AFFINEQBZ256rrik, X86::VGF2P8AFFINEQBZ256rmik, 0 },
+ { X86::VGF2P8AFFINEQBZrrik, X86::VGF2P8AFFINEQBZrmik, 0 },
+ { X86::VGF2P8MULBZ128rrk, X86::VGF2P8MULBZ128rmk, 0 },
+ { X86::VGF2P8MULBZ256rrk, X86::VGF2P8MULBZ256rmk, 0 },
+ { X86::VGF2P8MULBZrrk, X86::VGF2P8MULBZrmk, 0 },
+ { X86::VINSERTF32x4Z256rrk, X86::VINSERTF32x4Z256rmk, 0 },
+ { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
+ { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
+ { X86::VINSERTF64x2Z256rrk, X86::VINSERTF64x2Z256rmk, 0 },
+ { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
+ { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
+ { X86::VINSERTI32x4Z256rrk, X86::VINSERTI32x4Z256rmk, 0 },
+ { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
+ { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
+ { X86::VINSERTI64x2Z256rrk, X86::VINSERTI64x2Z256rmk, 0 },
+ { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
+ { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
+ { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
+ { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
+ { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
+ { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
+ { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
+ { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
+ { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
+ { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
+ { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
+ { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
+ { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
+ { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
+ { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
+ { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
+ { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
+ { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
+ { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
+ { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
+ { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
+ { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
+ { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
+ { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
+ { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
+ { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
+ { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
+ { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
+ { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
+ { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
+ { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
+ { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
+ { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
+ { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
+ { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
+ { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
+ { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
+ { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
+ { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
+ { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
+ { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+ { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+ { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
+ { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+ { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+ { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
+ { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+ { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+ { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
+ { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+ { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+ { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
+ { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+ { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+ { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
+ { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+ { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+ { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
+ { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+ { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+ { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
+ { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
+ { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
+ { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
+ { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
+ { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
+ { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
+ { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
+ { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
+ { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
+ { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
+ { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
+ { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
+ { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
+ { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
+ { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
+ { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
+ { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
+ { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
+ { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
+ { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
+ { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
+ { X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mk, 0 },
+ { X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mkz, 0 },
+ { X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mk, 0 },
+ { X86::VPDPBUSDSZ256rkz, X86::VPDPBUSDSZ256mkz, 0 },
+ { X86::VPDPBUSDSZrk, X86::VPDPBUSDSZmk, 0 },
+ { X86::VPDPBUSDSZrkz, X86::VPDPBUSDSZmkz, 0 },
+ { X86::VPDPBUSDZ128rk, X86::VPDPBUSDZ128mk, 0 },
+ { X86::VPDPBUSDZ128rkz, X86::VPDPBUSDZ128mkz, 0 },
+ { X86::VPDPBUSDZ256rk, X86::VPDPBUSDZ256mk, 0 },
+ { X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mkz, 0 },
+ { X86::VPDPBUSDZrk, X86::VPDPBUSDZmk, 0 },
+ { X86::VPDPBUSDZrkz, X86::VPDPBUSDZmkz, 0 },
+ { X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mk, 0 },
+ { X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mkz, 0 },
+ { X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mk, 0 },
+ { X86::VPDPWSSDSZ256rkz, X86::VPDPWSSDSZ256mkz, 0 },
+ { X86::VPDPWSSDSZrk, X86::VPDPWSSDSZmk, 0 },
+ { X86::VPDPWSSDSZrkz, X86::VPDPWSSDSZmkz, 0 },
+ { X86::VPDPWSSDZ128rk, X86::VPDPWSSDZ128mk, 0 },
+ { X86::VPDPWSSDZ128rkz, X86::VPDPWSSDZ128mkz, 0 },
+ { X86::VPDPWSSDZ256rk, X86::VPDPWSSDZ256mk, 0 },
+ { X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mkz, 0 },
+ { X86::VPDPWSSDZrk, X86::VPDPWSSDZmk, 0 },
+ { X86::VPDPWSSDZrkz, X86::VPDPWSSDZmkz, 0 },
+ { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
+ { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
+ { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
+ { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
+ { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
+ { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
+ { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
+ { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
+ { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
+ { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
+ { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
+ { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
+ { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
+ { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
+ { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
+ { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
+ { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
+ { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
+ { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
+ { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
+ { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
+ { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
+ { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
+ { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
+ { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
+ { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
+ { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
+ { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
+ { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
+ { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
+ { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
+ { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
+ { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
+ { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
+ { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
+ { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
+ { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
+ { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
+ { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
+ { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
+ { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
+ { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
+ { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
+ { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
+ { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
+ { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
+ { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
+ { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
+ { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
+ { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
+ { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
+ { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
+ { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
+ { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
+ { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
+ { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
+ { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
+ { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
+ { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
+ { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
+ { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
+ { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
+ { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
+ { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
+ { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
+ { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
+ { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
+ { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
+ { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
+ { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
+ { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
+ { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
+ { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
+ { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
+ { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
+ { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
+ { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
+ { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
+ { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
+ { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
+ { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
+ { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
+ { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
+ { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
+ { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
+ { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
+ { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
+ { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
+ { X86::VPMADD52HUQZ128rk, X86::VPMADD52HUQZ128mk, 0 },
+ { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 },
+ { X86::VPMADD52HUQZ256rk, X86::VPMADD52HUQZ256mk, 0 },
+ { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 },
+ { X86::VPMADD52HUQZrk, X86::VPMADD52HUQZmk, 0 },
+ { X86::VPMADD52HUQZrkz, X86::VPMADD52HUQZmkz, 0 },
+ { X86::VPMADD52LUQZ128rk, X86::VPMADD52LUQZ128mk, 0 },
+ { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 },
+ { X86::VPMADD52LUQZ256rk, X86::VPMADD52LUQZ256mk, 0 },
+ { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 },
+ { X86::VPMADD52LUQZrk, X86::VPMADD52LUQZmk, 0 },
+ { X86::VPMADD52LUQZrkz, X86::VPMADD52LUQZmkz, 0 },
+ { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
+ { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
+ { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
+ { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
+ { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
+ { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
+ { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
+ { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
+ { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
+ { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
+ { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
+ { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
+ { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
+ { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
+ { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
+ { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
+ { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
+ { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
+ { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
+ { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
+ { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
+ { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
+ { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
+ { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
+ { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
+ { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
+ { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
+ { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
+ { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
+ { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
+ { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
+ { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
+ { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
+ { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
+ { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
+ { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
+ { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
+ { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
+ { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
+ { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
+ { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
+ { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
+ { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
+ { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
+ { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
+ { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
+ { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
+ { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
+ { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
+ { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
+ { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
+ { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
+ { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
+ { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
+ { X86::VPMULHRSWZ128rrk, X86::VPMULHRSWZ128rmk, 0 },
+ { X86::VPMULHRSWZ256rrk, X86::VPMULHRSWZ256rmk, 0 },
+ { X86::VPMULHRSWZrrk, X86::VPMULHRSWZrmk, 0 },
+ { X86::VPMULHUWZ128rrk, X86::VPMULHUWZ128rmk, 0 },
+ { X86::VPMULHUWZ256rrk, X86::VPMULHUWZ256rmk, 0 },
+ { X86::VPMULHUWZrrk, X86::VPMULHUWZrmk, 0 },
+ { X86::VPMULHWZ128rrk, X86::VPMULHWZ128rmk, 0 },
+ { X86::VPMULHWZ256rrk, X86::VPMULHWZ256rmk, 0 },
+ { X86::VPMULHWZrrk, X86::VPMULHWZrmk, 0 },
+ { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
+ { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
+ { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
+ { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
+ { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
+ { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
+ { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
+ { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
+ { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
+ { X86::VPMULTISHIFTQBZ128rrk, X86::VPMULTISHIFTQBZ128rmk, 0 },
+ { X86::VPMULTISHIFTQBZ256rrk, X86::VPMULTISHIFTQBZ256rmk, 0 },
+ { X86::VPMULTISHIFTQBZrrk, X86::VPMULTISHIFTQBZrmk, 0 },
+ { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
+ { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
+ { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
+ { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
+ { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
+ { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
+ { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+ { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+ { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
+ { X86::VPROLVDZ128rrk, X86::VPROLVDZ128rmk, 0 },
+ { X86::VPROLVDZ256rrk, X86::VPROLVDZ256rmk, 0 },
+ { X86::VPROLVDZrrk, X86::VPROLVDZrmk, 0 },
+ { X86::VPROLVQZ128rrk, X86::VPROLVQZ128rmk, 0 },
+ { X86::VPROLVQZ256rrk, X86::VPROLVQZ256rmk, 0 },
+ { X86::VPROLVQZrrk, X86::VPROLVQZrmk, 0 },
+ { X86::VPRORVDZ128rrk, X86::VPRORVDZ128rmk, 0 },
+ { X86::VPRORVDZ256rrk, X86::VPRORVDZ256rmk, 0 },
+ { X86::VPRORVDZrrk, X86::VPRORVDZrmk, 0 },
+ { X86::VPRORVQZ128rrk, X86::VPRORVQZ128rmk, 0 },
+ { X86::VPRORVQZ256rrk, X86::VPRORVQZ256rmk, 0 },
+ { X86::VPRORVQZrrk, X86::VPRORVQZrmk, 0 },
+ { X86::VPSHLDDZ128rrik, X86::VPSHLDDZ128rmik, 0 },
+ { X86::VPSHLDDZ256rrik, X86::VPSHLDDZ256rmik, 0 },
+ { X86::VPSHLDDZrrik, X86::VPSHLDDZrmik, 0 },
+ { X86::VPSHLDQZ128rrik, X86::VPSHLDQZ128rmik, 0 },
+ { X86::VPSHLDQZ256rrik, X86::VPSHLDQZ256rmik, 0 },
+ { X86::VPSHLDQZrrik, X86::VPSHLDQZrmik, 0 },
+ { X86::VPSHLDVDZ128rk, X86::VPSHLDVDZ128mk, 0 },
+ { X86::VPSHLDVDZ128rkz, X86::VPSHLDVDZ128mkz, 0 },
+ { X86::VPSHLDVDZ256rk, X86::VPSHLDVDZ256mk, 0 },
+ { X86::VPSHLDVDZ256rkz, X86::VPSHLDVDZ256mkz, 0 },
+ { X86::VPSHLDVDZrk, X86::VPSHLDVDZmk, 0 },
+ { X86::VPSHLDVDZrkz, X86::VPSHLDVDZmkz, 0 },
+ { X86::VPSHLDVQZ128rk, X86::VPSHLDVQZ128mk, 0 },
+ { X86::VPSHLDVQZ128rkz, X86::VPSHLDVQZ128mkz, 0 },
+ { X86::VPSHLDVQZ256rk, X86::VPSHLDVQZ256mk, 0 },
+ { X86::VPSHLDVQZ256rkz, X86::VPSHLDVQZ256mkz, 0 },
+ { X86::VPSHLDVQZrk, X86::VPSHLDVQZmk, 0 },
+ { X86::VPSHLDVQZrkz, X86::VPSHLDVQZmkz, 0 },
+ { X86::VPSHLDVWZ128rk, X86::VPSHLDVWZ128mk, 0 },
+ { X86::VPSHLDVWZ128rkz, X86::VPSHLDVWZ128mkz, 0 },
+ { X86::VPSHLDVWZ256rk, X86::VPSHLDVWZ256mk, 0 },
+ { X86::VPSHLDVWZ256rkz, X86::VPSHLDVWZ256mkz, 0 },
+ { X86::VPSHLDVWZrk, X86::VPSHLDVWZmk, 0 },
+ { X86::VPSHLDVWZrkz, X86::VPSHLDVWZmkz, 0 },
+ { X86::VPSHLDWZ128rrik, X86::VPSHLDWZ128rmik, 0 },
+ { X86::VPSHLDWZ256rrik, X86::VPSHLDWZ256rmik, 0 },
+ { X86::VPSHLDWZrrik, X86::VPSHLDWZrmik, 0 },
+ { X86::VPSHRDDZ128rrik, X86::VPSHRDDZ128rmik, 0 },
+ { X86::VPSHRDDZ256rrik, X86::VPSHRDDZ256rmik, 0 },
+ { X86::VPSHRDDZrrik, X86::VPSHRDDZrmik, 0 },
+ { X86::VPSHRDQZ128rrik, X86::VPSHRDQZ128rmik, 0 },
+ { X86::VPSHRDQZ256rrik, X86::VPSHRDQZ256rmik, 0 },
+ { X86::VPSHRDQZrrik, X86::VPSHRDQZrmik, 0 },
+ { X86::VPSHRDVDZ128rk, X86::VPSHRDVDZ128mk, 0 },
+ { X86::VPSHRDVDZ128rkz, X86::VPSHRDVDZ128mkz, 0 },
+ { X86::VPSHRDVDZ256rk, X86::VPSHRDVDZ256mk, 0 },
+ { X86::VPSHRDVDZ256rkz, X86::VPSHRDVDZ256mkz, 0 },
+ { X86::VPSHRDVDZrk, X86::VPSHRDVDZmk, 0 },
+ { X86::VPSHRDVDZrkz, X86::VPSHRDVDZmkz, 0 },
+ { X86::VPSHRDVQZ128rk, X86::VPSHRDVQZ128mk, 0 },
+ { X86::VPSHRDVQZ128rkz, X86::VPSHRDVQZ128mkz, 0 },
+ { X86::VPSHRDVQZ256rk, X86::VPSHRDVQZ256mk, 0 },
+ { X86::VPSHRDVQZ256rkz, X86::VPSHRDVQZ256mkz, 0 },
+ { X86::VPSHRDVQZrk, X86::VPSHRDVQZmk, 0 },
+ { X86::VPSHRDVQZrkz, X86::VPSHRDVQZmkz, 0 },
+ { X86::VPSHRDVWZ128rk, X86::VPSHRDVWZ128mk, 0 },
+ { X86::VPSHRDVWZ128rkz, X86::VPSHRDVWZ128mkz, 0 },
+ { X86::VPSHRDVWZ256rk, X86::VPSHRDVWZ256mk, 0 },
+ { X86::VPSHRDVWZ256rkz, X86::VPSHRDVWZ256mkz, 0 },
+ { X86::VPSHRDVWZrk, X86::VPSHRDVWZmk, 0 },
+ { X86::VPSHRDVWZrkz, X86::VPSHRDVWZmkz, 0 },
+ { X86::VPSHRDWZ128rrik, X86::VPSHRDWZ128rmik, 0 },
+ { X86::VPSHRDWZ256rrik, X86::VPSHRDWZ256rmik, 0 },
+ { X86::VPSHRDWZrrik, X86::VPSHRDWZrmik, 0 },
+ { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
+ { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
+ { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
+ { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
+ { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
+ { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
+ { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
+ { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
+ { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
+ { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
+ { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
+ { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
+ { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
+ { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
+ { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
+ { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
+ { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
+ { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
+ { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
+ { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
+ { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
+ { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
+ { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
+ { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
+ { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
+ { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
+ { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
+ { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
+ { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
+ { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
+ { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
+ { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
+ { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
+ { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
+ { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
+ { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
+ { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
+ { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
+ { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
+ { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
+ { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
+ { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
+ { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
+ { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
+ { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
+ { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
+ { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
+ { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
+ { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
+ { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
+ { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
+ { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
+ { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
+ { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
+ { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+ { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+ { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
+ { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+ { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+ { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
+ { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+ { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+ { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
+ { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+ { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+ { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
+ { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+ { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+ { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
+ { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+ { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+ { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
+ { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+ { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+ { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
+ { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
+ { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
+ { X86::VPSUBWZrrk, X86::VPSUBWZrmk, 0 },
+ { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
+ { X86::VPTERNLOGDZ128rrikz, X86::VPTERNLOGDZ128rmikz, 0 },
+ { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
+ { X86::VPTERNLOGDZ256rrikz, X86::VPTERNLOGDZ256rmikz, 0 },
+ { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
+ { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
+ { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
+ { X86::VPTERNLOGQZ128rrikz, X86::VPTERNLOGQZ128rmikz, 0 },
+ { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
+ { X86::VPTERNLOGQZ256rrikz, X86::VPTERNLOGQZ256rmikz, 0 },
+ { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
+ { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
+ { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
+ { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
+ { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
+ { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
+ { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
+ { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
+ { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
+ { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
+ { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
+ { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
+ { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
+ { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
+ { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
+ { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
+ { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
+ { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
+ { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
+ { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
+ { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
+ { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
+ { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
+ { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
+ { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
+ { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
+ { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
+ { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
+ { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
+ { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VRANGEPDZ128rrik, X86::VRANGEPDZ128rmik, 0 },
+ { X86::VRANGEPDZ256rrik, X86::VRANGEPDZ256rmik, 0 },
+ { X86::VRANGEPDZrrik, X86::VRANGEPDZrmik, 0 },
+ { X86::VRANGEPSZ128rrik, X86::VRANGEPSZ128rmik, 0 },
+ { X86::VRANGEPSZ256rrik, X86::VRANGEPSZ256rmik, 0 },
+ { X86::VRANGEPSZrrik, X86::VRANGEPSZrmik, 0 },
+ { X86::VRANGESDZrrik, X86::VRANGESDZrmik, TB_NO_REVERSE },
+ { X86::VRANGESSZrrik, X86::VRANGESSZrmik, TB_NO_REVERSE },
+ { X86::VRCP14SDZrrk, X86::VRCP14SDZrmk, TB_NO_REVERSE },
+ { X86::VRCP14SSZrrk, X86::VRCP14SSZrmk, TB_NO_REVERSE },
+ { X86::VRCP28SDZrk, X86::VRCP28SDZmk, TB_NO_REVERSE },
+ { X86::VRCP28SSZrk, X86::VRCP28SSZmk, TB_NO_REVERSE },
+ { X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE },
+ { X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE },
+ { X86::VRNDSCALESDZr_Intk, X86::VRNDSCALESDZm_Intk, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr_Intk, X86::VRNDSCALESSZm_Intk, TB_NO_REVERSE },
+ { X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE },
+ { X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0 },
+ { X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0 },
+ { X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0 },
+ { X86::VSCALEFPSZ128rrk, X86::VSCALEFPSZ128rmk, 0 },
+ { X86::VSCALEFPSZ256rrk, X86::VSCALEFPSZ256rmk, 0 },
+ { X86::VSCALEFPSZrrk, X86::VSCALEFPSZrmk, 0 },
+ { X86::VSCALEFSDZrrk, X86::VSCALEFSDZrmk, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrrk, X86::VSCALEFSSZrmk, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rrik, X86::VSHUFF32X4Z256rmik, 0 },
+ { X86::VSHUFF32X4Zrrik, X86::VSHUFF32X4Zrmik, 0 },
+ { X86::VSHUFF64X2Z256rrik, X86::VSHUFF64X2Z256rmik, 0 },
+ { X86::VSHUFF64X2Zrrik, X86::VSHUFF64X2Zrmik, 0 },
+ { X86::VSHUFI32X4Z256rrik, X86::VSHUFI32X4Z256rmik, 0 },
+ { X86::VSHUFI32X4Zrrik, X86::VSHUFI32X4Zrmik, 0 },
+ { X86::VSHUFI64X2Z256rrik, X86::VSHUFI64X2Z256rmik, 0 },
+ { X86::VSHUFI64X2Zrrik, X86::VSHUFI64X2Zrmik, 0 },
+ { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
+ { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
+ { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
+ { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
+ { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
+ { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
+ { X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE },
+ { X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE },
+ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
+ { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
+ { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+ { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+ { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
+ { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
+ { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
+ { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
+ { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
+ { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
+ { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
+ { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
+ { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
+ { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
+ { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
+ { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
+ { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
+ { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
+ { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
+ { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
+ { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
+ { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
+};
+
+static const X86MemoryFoldTableEntry *
+lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
+#ifndef NDEBUG
+ // Make sure the tables are sorted.
+ static std::atomic<bool> FoldTablesChecked(false);
+ if (!FoldTablesChecked.load(std::memory_order_relaxed)) {
+ assert(std::is_sorted(std::begin(MemoryFoldTable2Addr),
+ std::end(MemoryFoldTable2Addr)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable2Addr),
+ std::end(MemoryFoldTable2Addr)) ==
+ std::end(MemoryFoldTable2Addr) &&
+ "MemoryFoldTable2Addr is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable0),
+ std::end(MemoryFoldTable0)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable0),
+ std::end(MemoryFoldTable0)) ==
+ std::end(MemoryFoldTable0) &&
+ "MemoryFoldTable0 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable1),
+ std::end(MemoryFoldTable1)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable1),
+ std::end(MemoryFoldTable1)) ==
+ std::end(MemoryFoldTable1) &&
+ "MemoryFoldTable1 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable2),
+ std::end(MemoryFoldTable2)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable2),
+ std::end(MemoryFoldTable2)) ==
+ std::end(MemoryFoldTable2) &&
+ "MemoryFoldTable2 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable3),
+ std::end(MemoryFoldTable3)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable3),
+ std::end(MemoryFoldTable3)) ==
+ std::end(MemoryFoldTable3) &&
+ "MemoryFoldTable3 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(MemoryFoldTable4),
+ std::end(MemoryFoldTable4)) &&
+ std::adjacent_find(std::begin(MemoryFoldTable4),
+ std::end(MemoryFoldTable4)) ==
+ std::end(MemoryFoldTable4) &&
+ "MemoryFoldTable4 is not sorted and unique!");
+ FoldTablesChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
+
+ const X86MemoryFoldTableEntry *Data = std::lower_bound(Table.begin(),
+ Table.end(),
+ RegOp);
+ if (Data != Table.end() && Data->KeyOp == RegOp &&
+ !(Data->Flags & TB_NO_FORWARD))
+ return Data;
+ return nullptr;
+}
+
+const X86MemoryFoldTableEntry *
+llvm::lookupTwoAddrFoldTable(unsigned RegOp) {
+ return lookupFoldTableImpl(MemoryFoldTable2Addr, RegOp);
+}
+
+const X86MemoryFoldTableEntry *
+llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
+ ArrayRef<X86MemoryFoldTableEntry> FoldTable;
+ if (OpNum == 0)
+ FoldTable = makeArrayRef(MemoryFoldTable0);
+ else if (OpNum == 1)
+ FoldTable = makeArrayRef(MemoryFoldTable1);
+ else if (OpNum == 2)
+ FoldTable = makeArrayRef(MemoryFoldTable2);
+ else if (OpNum == 3)
+ FoldTable = makeArrayRef(MemoryFoldTable3);
+ else if (OpNum == 4)
+ FoldTable = makeArrayRef(MemoryFoldTable4);
+ else
+ return nullptr;
+
+ return lookupFoldTableImpl(FoldTable, RegOp);
+}
+
+namespace {
+
+// This class stores the memory unfolding tables. It is instantiated as a
+// ManagedStatic to lazily initialize the unfolding table.
+struct X86MemUnfoldTable {
+ // Stores memory unfolding table entries sorted by opcode.
+ std::vector<X86MemoryFoldTableEntry> Table;
+
+ X86MemUnfoldTable() {
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable2Addr)
+ // Index 0, folded load and store, no alignment requirement.
+ addTableEntry(Entry, TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable0)
+ // Index 0, mix of loads and stores.
+ addTableEntry(Entry, TB_INDEX_0);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable1)
+ // Index 1, folded load
+ addTableEntry(Entry, TB_INDEX_1 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable2)
+ // Index 2, folded load
+ addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable3)
+ // Index 3, folded load
+ addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable4)
+ // Index 4, folded load
+ addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD);
+
+ // Sort the memory->reg unfold table.
+ array_pod_sort(Table.begin(), Table.end());
+
+ // Now that it's sorted, ensure it's unique.
+ assert(std::adjacent_find(Table.begin(), Table.end()) == Table.end() &&
+ "Memory unfolding table is not unique!");
+ }
+
+ void addTableEntry(const X86MemoryFoldTableEntry &Entry,
+ uint16_t ExtraFlags) {
+ // NOTE: This swaps the KeyOp and DstOp in the table so we can sort it.
+ if ((Entry.Flags & TB_NO_REVERSE) == 0)
+ Table.push_back({Entry.DstOp, Entry.KeyOp,
+ static_cast<uint16_t>(Entry.Flags | ExtraFlags) });
+ }
+};
+}
+
+static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable;
+
+const X86MemoryFoldTableEntry *
+llvm::lookupUnfoldTable(unsigned MemOp) {
+ auto &Table = MemUnfoldTable->Table;
+ auto I = std::lower_bound(Table.begin(), Table.end(), MemOp);
+ if (I != Table.end() && I->KeyOp == MemOp)
+ return &*I;
+ return nullptr;
+}
+
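For illustration only (not part of this patch): a minimal C++ sketch of how a client might round-trip an opcode through these tables, assuming the declarations from X86InstrFoldTables.h added below. The function name and its arguments are hypothetical.

// Minimal sketch: fold a register-form opcode into its memory form and back.
#include "X86InstrFoldTables.h"

unsigned foldAndUnfold(unsigned RegOpc, unsigned OpNum) {
  using namespace llvm;
  // RegOp -> MemOp: look up the load-folding entry for operand OpNum.
  if (const X86MemoryFoldTableEntry *Fold = lookupFoldTable(RegOpc, OpNum)) {
    unsigned MemOpc = Fold->DstOp;
    // MemOp -> RegOp: the unfold table holds the reverse entry unless the
    // folding entry was marked TB_NO_REVERSE.
    if (const X86MemoryFoldTableEntry *Unfold = lookupUnfoldTable(MemOpc))
      return Unfold->DstOp; // the opcode we started from
    return MemOpc;          // forward-only mapping
  }
  return RegOpc;            // no folding entry for this opcode/operand
}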
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.h b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.h
new file mode 100644
index 000000000000..90016baead96
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -0,0 +1,85 @@
+//===-- X86InstrFoldTables.h - X86 Instruction Folding Tables ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the interface to query the X86 memory folding tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
+#define LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+enum {
+ // Select which memory operand is being unfolded.
+ // (stored in bits 0 - 3)
+ TB_INDEX_0 = 0,
+ TB_INDEX_1 = 1,
+ TB_INDEX_2 = 2,
+ TB_INDEX_3 = 3,
+ TB_INDEX_4 = 4,
+ TB_INDEX_MASK = 0xf,
+
+ // Do not insert the reverse map (MemOp -> RegOp) into the table.
+ // This may be needed because there is a many -> one mapping.
+ TB_NO_REVERSE = 1 << 4,
+
+ // Do not insert the forward map (RegOp -> MemOp) into the table.
+ // This is needed for Native Client, which prohibits branch
+ // instructions from using a memory operand.
+ TB_NO_FORWARD = 1 << 5,
+
+ TB_FOLDED_LOAD = 1 << 6,
+ TB_FOLDED_STORE = 1 << 7,
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
+};
+
+// This struct is used for both the folding and unfolding tables. The KeyOp
+// is used to determine the sorting order.
+struct X86MemoryFoldTableEntry {
+ uint16_t KeyOp;
+ uint16_t DstOp;
+ uint16_t Flags;
+
+ bool operator<(const X86MemoryFoldTableEntry &RHS) const {
+ return KeyOp < RHS.KeyOp;
+ }
+ bool operator==(const X86MemoryFoldTableEntry &RHS) const {
+ return KeyOp == RHS.KeyOp;
+ }
+ friend bool operator<(const X86MemoryFoldTableEntry &TE, unsigned Opcode) {
+ return TE.KeyOp < Opcode;
+ }
+};
+
+// Look up the memory folding table entry for folding a load and a store into
+// operand 0.
+const X86MemoryFoldTableEntry *lookupTwoAddrFoldTable(unsigned RegOp);
+
+// Look up the memory folding table entry for folding a load or store with
+// operand OpNum.
+const X86MemoryFoldTableEntry *lookupFoldTable(unsigned RegOp, unsigned OpNum);
+
+// Look up the memory unfolding table entry for this instruction.
+const X86MemoryFoldTableEntry *lookupUnfoldTable(unsigned MemOp);
+
+} // namespace llvm
+
+#endif
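As a hedged illustration of the flag layout above (not part of this patch): the operand index lives in bits 0-3 and the minimum alignment in bits 8-15, so an entry's Flags field can be decoded roughly as follows. The helper names are hypothetical.

// Minimal sketch: decode an X86MemoryFoldTableEntry::Flags value using the
// TB_* constants declared above.
#include "X86InstrFoldTables.h"

static unsigned getFoldIndex(uint16_t Flags) {
  return Flags & llvm::TB_INDEX_MASK;                           // bits 0-3
}

static unsigned getMinAlignment(uint16_t Flags) {
  return (Flags & llvm::TB_ALIGN_MASK) >> llvm::TB_ALIGN_SHIFT; // in bytes
}

static bool foldsLoad(uint16_t Flags)  { return Flags & llvm::TB_FOLDED_LOAD; }
static bool foldsStore(uint16_t Flags) { return Flags & llvm::TB_FOLDED_STORE; }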
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
index 0b266e5591b4..47d4719d3060 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -127,22 +127,28 @@ class Prefix<bits<3> val> {
bits<3> Value = val;
}
def NoPrfx : Prefix<0>;
-def PS : Prefix<1>;
-def PD : Prefix<2>;
-def XS : Prefix<3>;
-def XD : Prefix<4>;
+def PD : Prefix<1>;
+def XS : Prefix<2>;
+def XD : Prefix<3>;
+def PS : Prefix<4>; // Similar to NoPrfx, but disassembler uses this to know
+ // that other instructions with this opcode use PD/XS/XD
+ // and if any of those is not supported they shouldn't
+ // decode to this instruction. e.g. ANDSS/ANDSD don't
+ // exist, but the 0xf2/0xf3 encodings shouldn't
+ // decode to ANDPS.
// Class specifying the opcode map.
class Map<bits<3> val> {
bits<3> Value = val;
}
-def OB : Map<0>;
-def TB : Map<1>;
-def T8 : Map<2>;
-def TA : Map<3>;
-def XOP8 : Map<4>;
-def XOP9 : Map<5>;
-def XOPA : Map<6>;
+def OB : Map<0>;
+def TB : Map<1>;
+def T8 : Map<2>;
+def TA : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+def ThreeDNow : Map<7>;
// Class specifying the encoding
class Encoding<bits<2> val> {
@@ -160,7 +166,6 @@ class OperandSize<bits<2> val> {
def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
-def OpSizeIgnore : OperandSize<3>; // Takes 0x66 prefix, never emits.
// Address size for encodings that change based on mode.
class AddressSize<bits<2> val> {
@@ -175,7 +180,6 @@ def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
// emitter that various prefix bytes are required.
class OpSize16 { OperandSize OpSize = OpSize16; }
class OpSize32 { OperandSize OpSize = OpSize32; }
-class OpSizeIgnore { OperandSize OpSize = OpSizeIgnore; }
class AdSize16 { AddressSize AdSize = AdSize16; }
class AdSize32 { AddressSize AdSize = AdSize32; }
class AdSize64 { AddressSize AdSize = AdSize64; }
@@ -188,6 +192,7 @@ class TA { Map OpMap = TA; }
class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class ThreeDNow { Map OpMap = ThreeDNow; }
class OBXS { Prefix OpPrefix = XS; }
class PS : TB { Prefix OpPrefix = PS; }
class PD : TB { Prefix OpPrefix = PD; }
@@ -203,11 +208,16 @@ class TAXD : TA { Prefix OpPrefix = XD; }
class VEX { Encoding OpEnc = EncVEX; }
class VEX_W { bits<2> VEX_WPrefix = 1; }
class VEX_WIG { bits<2> VEX_WPrefix = 2; }
+// Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX.
+// FIXME: We should consider adding separate bits for VEX_WIG and the extra
+// part of W1X. This would probably simplify the tablegen emitters and
+// the TSFlags creation below.
+class VEX_W1X { bits<2> VEX_WPrefix = 3; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
-class EVEX : VEX { Encoding OpEnc = EncEVEX; }
-class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; }
+class EVEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : EVEX { bit hasVEX_4V = 1; }
class EVEX_K { bit hasEVEX_K = 1; }
class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
class EVEX_B { bit hasEVEX_B = 1; }
@@ -215,6 +225,7 @@ class EVEX_RC { bit hasEVEX_RC = 1; }
class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+class NOTRACK { bit hasNoTrackPrefix = 1; }
// Specify AVX512 8-bit compressed displacement encoding based on the vector
// element size in bits (8, 16, 32, 64) and the CDisp8 form.
@@ -223,23 +234,28 @@ class EVEX_CD8<int esize, CD8VForm form> {
bits<3> CD8_Form = form.Value;
}
-class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class XOP { Encoding OpEnc = EncXOP; }
class XOP_4V : XOP { bit hasVEX_4V = 1; }
// Specify the alternative register form instruction to replace the current
// instruction in case it was picked during generation of memory folding tables
class FoldGenData<string _RegisterForm> {
- string FoldGenRegForm = _RegisterForm;
+ string FoldGenRegForm = _RegisterForm;
+}
+
+// Provide a specific instruction to be used by the EVEX2VEX conversion.
+class EVEX2VEXOverride<string VEXInstrName> {
+ string EVEX2VEXOverride = VEXInstrName;
}
// Mark the instruction as "illegal to memory fold/unfold"
class NotMemoryFoldable { bit isMemoryFoldable = 0; }
+// Prevent EVEX->VEX conversion from considering this instruction.
+class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
- string AsmStr,
- InstrItinClass itin,
- Domain d = GenericDomain>
+ string AsmStr, Domain d = GenericDomain>
: Instruction {
let Namespace = "X86";
@@ -255,8 +271,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
- let Itinerary = itin;
-
//
// Attributes specific to X86 instructions...
//
@@ -294,8 +308,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// Declare it int rather than bits<4> so that all bits are defined when
// assigning to bits<7>.
int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes.
- bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
+ bit hasNoTrackPrefix = 0; // Does this inst have a 0x3E (NoTrack) prefix?
bits<2> EVEX_LL;
let EVEX_LL{0} = hasVEX_L;
@@ -319,112 +333,118 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// instruction to replace the current one in case it got picked during generation.
string FoldGenRegForm = ?;
+ // Used to specify an explicit EVEX2VEX override for this instruction.
+ string EVEX2VEXOverride = ?;
+
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
+ bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
let TSFlags{8-7} = OpSizeBits;
let TSFlags{10-9} = AdSizeBits;
- let TSFlags{13-11} = OpPrefixBits;
- let TSFlags{16-14} = OpMapBits;
- let TSFlags{17} = hasREX_WPrefix;
- let TSFlags{21-18} = ImmT.Value;
- let TSFlags{24-22} = FPForm.Value;
- let TSFlags{25} = hasLockPrefix;
- let TSFlags{26} = hasREPPrefix;
- let TSFlags{28-27} = ExeDomain.Value;
- let TSFlags{30-29} = OpEncBits;
- let TSFlags{38-31} = Opcode;
+ // No need for a 3rd bit; we don't need to distinguish NoPrfx from PS.
+ let TSFlags{12-11} = OpPrefixBits{1-0};
+ let TSFlags{15-13} = OpMapBits;
+ let TSFlags{16} = hasREX_WPrefix;
+ let TSFlags{20-17} = ImmT.Value;
+ let TSFlags{23-21} = FPForm.Value;
+ let TSFlags{24} = hasLockPrefix;
+ let TSFlags{25} = hasREPPrefix;
+ let TSFlags{27-26} = ExeDomain.Value;
+ let TSFlags{29-28} = OpEncBits;
+ let TSFlags{37-30} = Opcode;
// Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
- let TSFlags{39} = VEX_WPrefix{0};
- let TSFlags{40} = hasVEX_4V;
- let TSFlags{41} = hasVEX_L;
- let TSFlags{42} = hasEVEX_K;
- let TSFlags{43} = hasEVEX_Z;
- let TSFlags{44} = hasEVEX_L2;
- let TSFlags{45} = hasEVEX_B;
+ let TSFlags{38} = VEX_WPrefix{0};
+ let TSFlags{39} = hasVEX_4V;
+ let TSFlags{40} = hasVEX_L;
+ let TSFlags{41} = hasEVEX_K;
+ let TSFlags{42} = hasEVEX_Z;
+ let TSFlags{43} = hasEVEX_L2;
+ let TSFlags{44} = hasEVEX_B;
// If we run out of TSFlags bits, it's possible to encode this in 3 bits.
- let TSFlags{52-46} = CD8_Scale;
- let TSFlags{53} = has3DNow0F0FOpcode;
- let TSFlags{54} = hasEVEX_RC;
+ let TSFlags{51-45} = CD8_Scale;
+ let TSFlags{52} = hasEVEX_RC;
+ let TSFlags{53} = hasNoTrackPrefix;
}
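A hypothetical sketch (not from this patch) of reading back two of the repacked TSFlags fields, assuming masks that simply mirror the new bit positions above; the real accessors live in X86BaseInfo.h, which the layout comment says must be kept in sync, and the helper names here are illustrative only.

// Minimal sketch: extract fields from the repacked TSFlags layout.
#include <cstdint>

static inline unsigned getBaseOpcode(uint64_t TSFlags) {
  return (TSFlags >> 30) & 0xFF; // TSFlags{37-30} = Opcode
}

static inline bool hasNoTrackPrefix(uint64_t TSFlags) {
  return (TSFlags >> 53) & 1;    // TSFlags{53} = hasNoTrackPrefix
}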
-class PseudoI<dag oops, dag iops, list<dag> pattern,
- InstrItinClass itin = NoItinerary>
- : X86Inst<0, Pseudo, NoImm, oops, iops, "", itin> {
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, oops, iops, ""> {
let Pattern = pattern;
}
class I<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, d> {
let Pattern = pattern;
let CodeSize = 3;
}
-class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+class Ii8<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : X86Inst<o, f, Imm8Reg, outs, ins, asm, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8Reg, outs, ins, asm, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32S, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32S, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm64, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16PCRel, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
// FPStack Instruction Templates:
// FPI - Floating Point Instruction template.
-class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
- InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, [], itin> {}
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
+ : I<o, F, outs, ins, asm, []> {}
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
-class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin = NoItinerary>
- : PseudoI<outs, ins, pattern, itin> {
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
+ : PseudoI<outs, ins, pattern> {
let FPForm = fp;
}
@@ -435,24 +455,23 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
// Iseg32 - 16-bit segment selector, 32-bit offset
class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
let Pattern = pattern;
let CodeSize = 3;
}
// SI - SSE 1 & 2 scalar instructions
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
@@ -468,9 +487,8 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SI - SSE 1 & 2 scalar intrinsics - vex form available on AVX512
class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary,
- Domain d = GenericDomain>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
+ list<dag> pattern, Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
@@ -485,8 +503,8 @@ class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
}
// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512
class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin> {
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
@@ -500,8 +518,8 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// PI - SSE 1 & 2 packed instructions
class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
- InstrItinClass itin, Domain d>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
+ Domain d>
+ : I<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
!if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
@@ -515,16 +533,16 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
// MMXPI - SSE 1 & 2 packed instructions with MMX operands
class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
- InstrItinClass itin, Domain d>
- : I<o, F, outs, ins, asm, pattern, itin, d> {
- let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2],
- [HasSSE1]);
+ Domain d>
+ : I<o, F, outs, ins, asm, pattern, d> {
+ let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasMMX, HasSSE2],
+ [HasMMX, HasSSE1]);
}
// PIi8 - SSE 1 & 2 packed instructions with immediate
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin, Domain d>
- : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+ list<dag> pattern, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
!if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
!if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
@@ -545,26 +563,26 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// VPSI - SSE1 instructions with PS prefix in AVX form, packed single.
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
Requires<[UseSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
Requires<[UseSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
Requires<[HasAVX]>;
class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, PS,
Requires<[HasAVX]>;
// SSE2 Instruction Templates:
@@ -586,50 +604,50 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMX operands.
class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
+ list<dag> pattern>
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[UseSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[UseSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD,
Requires<[UseAVX]>;
class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
Requires<[HasAVX]>;
class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>,
PD, Requires<[HasAVX]>;
class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, PD,
Requires<[UseAVX]>;
class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PD, Requires<[UseSSE2]>;
class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX, HasSSE2]>;
class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX, HasSSE2]>;
// SSE3 Instruction Templates:
//
@@ -638,16 +656,16 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// S3DI - SSE3 instructions with XD prefix.
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS,
Requires<[UseSSE3]>;
class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD,
Requires<[UseSSE3]>;
class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[UseSSE3]>;
@@ -663,21 +681,21 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
// classes. They need to be enabled even if AVX is enabled.
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[UseSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[UseSSSE3]>;
class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS,
- Requires<[HasSSSE3]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PS,
+ Requires<[HasMMX, HasSSSE3]>;
class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS,
- Requires<[HasSSSE3]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPS,
+ Requires<[HasMMX, HasSSSE3]>;
// SSE4.1 Instruction Templates:
//
@@ -685,32 +703,32 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
//
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[UseSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[UseSSE41]>;
// SSE4.2 Instruction Templates:
//
// SS428I - SSE 4.2 instructions with T8 prefix.
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[UseSSE42]>;
// SS42FI - SSE 4.2 instructions with T8XD prefix.
// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasSSE42]>;
// SS42AI = SSE 4.2 instructions with TA prefix
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[UseSSE42]>;
// AVX Instruction Templates:
@@ -719,12 +737,12 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX8I - AVX instructions with T8PD prefix.
// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[HasAVX]>;
class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[HasAVX]>;
// AVX2 Instruction Templates:
@@ -733,12 +751,12 @@ class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX28I - AVX2 instructions with T8PD prefix.
// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[HasAVX2]>;
class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[HasAVX2]>;
@@ -755,34 +773,34 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX512SI - AVX-512 scalar instructions with PD prefix.
class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[HasAVX512]>;
class AVX5128IBase : T8PD {
Domain ExeDomain = SSEPackedInt;
}
class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8XS,
Requires<[HasAVX512]>;
class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, XS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS,
Requires<[HasAVX512]>;
class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, XD,
Requires<[HasAVX512]>;
class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
Requires<[HasAVX512]>;
class AVX512BIBase : PD {
Domain ExeDomain = SSEPackedInt;
}
class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
Requires<[HasAVX512]>;
class AVX512BIi8Base : PD {
Domain ExeDomain = SSEPackedInt;
@@ -805,149 +823,138 @@ class AVX512PDIi8Base : PD {
ImmType ImmT = Imm8;
}
class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[HasAVX512]>;
class AVX512AIi8Base : TAPD {
ImmType ImmT = Imm8;
}
class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>,
Requires<[HasAVX512]>;
class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
Requires<[HasAVX512]>;
class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
Requires<[HasAVX512]>;
class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+ list<dag> pattern, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>;
class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+ list<dag> pattern, Domain d>
+ : I<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>;
class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
EVEX_4V, Requires<[HasAVX512]>;
class AVX512FMA3Base : T8PD, EVEX_4V;
class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>;
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, Requires<[HasAVX512]>;
// AES Instruction Templates:
//
// AES8I
// These use the same encoding as the SSE4.2 T8 and TA encodings.
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_AES>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[NoAVX, HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[NoAVX, HasAES]>;
// PCLMUL Instruction Templates
class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD;
+ list<dag>pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>;
class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
XOP9, Requires<[HasXOP]>;
// XOP 2 and 3 Operand Instruction Templates with imm byte
class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
XOP8, Requires<[HasXOP]>;
// XOP 4 Operand Instruction Templates with imm byte
class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ list<dag> pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
XOP8, Requires<[HasXOP]>;
// XOP 5 operand instruction (VEX encoding!)
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
VEX_4V, Requires<[HasXOP]>;
// X86-64 Instruction templates...
//
class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, REX_W;
class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : Ii16<o, F, outs, ins, asm, pattern>, REX_W;
class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W;
-
-class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
- let Pattern = pattern;
- let CodeSize = 3;
-}
-
-class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
- let Pattern = pattern;
- let CodeSize = 3;
-}
+ list<dag> pattern>
+ : Ii32S<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii64<o, F, outs, ins, asm, pattern>, REX_W;
class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+ list<dag> pattern>
+ : S2I<o, F, outs, ins, asm, pattern>, REX_W;
class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+ list<dag> pattern>
+ : VS2I<o, F, outs, ins, asm, pattern>, VEX_W;
// MMX Instruction templates
//
@@ -961,26 +968,26 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMXID - MMX instructions with XD prefix.
// MMXIS - MMX instructions with XS prefix.
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,Not64BitMode]>;
class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,In64BitMode]>;
class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, REX_W, Requires<[HasMMX]>;
class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PD, Requires<[HasMMX]>;
class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index ebbef00c01d9..739275907978 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -153,12 +153,6 @@ def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
-def X86IntCmpMask : SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisSameAs<1, 2>, SDTCisInt<1>,
- SDTCisSameNumEltsAs<0, 1>]>;
-def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
-def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
-
def X86CmpMaskCC :
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
@@ -177,8 +171,9 @@ def X86CmpMaskCCScalarRound :
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+// Hack to make CMPM commutable in tablegen patterns for load folding.
+def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>;
def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
-def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
@@ -211,6 +206,8 @@ def X86kshiftr : SDNode<"X86ISD::KSHIFTR",
SDTCisSameAs<0, 1>,
SDTCisVT<2, i8>]>>;
+def X86kadd : SDNode<"X86ISD::KADD", SDTIntBinOp, [SDNPCommutative]>;
+
def X86vrotli : SDNode<"X86ISD::VROTLI", X86vshiftimm>;
def X86vrotri : SDNode<"X86ISD::VROTRI", X86vshiftimm>;
@@ -228,9 +225,9 @@ def X86vpcomu : SDNode<"X86ISD::VPCOMU",
def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
- SDTCisSameSizeAs<0,3>,
- SDTCisSameNumEltsAs<0, 3>,
SDTCisFP<0>, SDTCisInt<3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisSameSizeAs<0,3>,
SDTCisVT<4, i8>]>>;
def X86vpperm : SDNode<"X86ISD::VPPERM",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
@@ -240,10 +237,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>;
-def SDTX86Testm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>,
- SDTCisSameNumEltsAs<0, 1>]>;
-
def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>;
def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
@@ -254,8 +247,6 @@ def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
-def X86testm : SDNode<"X86ISD::TESTM", SDTX86Testm, [SDNPCommutative]>;
-def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>;
def X86movmsk : SDNode<"X86ISD::MOVMSK",
SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;
@@ -267,14 +258,12 @@ def X86selects : SDNode<"X86ISD::SELECTS",
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
- SDTCVecEltisVT<1, i32>,
- SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>]>,
[SDNPCommutative]>;
def X86pmuldq : SDNode<"X86ISD::PMULDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
- SDTCVecEltisVT<1, i32>,
- SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>]>,
[SDNPCommutative]>;
@@ -292,11 +281,13 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI",
def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>;
+def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>;
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameSizeAs<0,2>,
+ SDTCisFP<0>, SDTCisInt<2>,
SDTCisSameNumEltsAs<0,2>,
- SDTCisFP<0>, SDTCisInt<2>]>;
+ SDTCisSameSizeAs<0,2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -379,15 +370,11 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
-def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
-def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
-def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
-def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
-def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
-
-def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
-def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
SDTCisVec<1>, SDTCisInt<1>,
@@ -427,15 +414,6 @@ def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
SDTCisSameSizeAs<0,2>,
SDTCisSameAs<0,3>]>, []>;
-// Even though the index operand should be integer, we need to make it match the
-// destination type so that we can pattern match the masked version where the
-// index is also the passthru operand.
-def X86VPermi2X : SDNode<"X86ISD::VPERMIV3",
- SDTypeProfile<1, 3, [SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisSameAs<0,3>]>, []>;
-
def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
@@ -465,10 +443,6 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86kextract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
- SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
- SDTCVecEltisVT<1, i1>,
- SDTCisPtrTy<2>]>>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
@@ -507,35 +481,6 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
-// Scalar FMA4 intrinsics which zero the non-scalar bits.
-def X86Fmadd4s : SDNode<"X86ISD::FMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fmsub4s : SDNode<"X86ISD::FMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
-
-// Scalar FMA intrinsics with passthru bits in operand 1.
-def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
-def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
-def X86Fmsubs1 : SDNode<"X86ISD::FMSUBS1", SDTFPTernaryOp>;
-def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>;
-
-// Scalar FMA intrinsics with passthru bits in operand 1.
-def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
-def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
-def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
-def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
-
-def X86Fmadds3 : SDNode<"X86ISD::FMADDS3", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fmsubs3 : SDNode<"X86ISD::FMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>;
-
-// Scalar FMA intrinsics with passthru bits in operand 3.
-def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
-def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
-def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>;
-def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>;
-
def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
@@ -569,17 +514,6 @@ def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>;
def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>;
def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>;
-def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
- SDTCisVT<4, i8>]>;
-def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
- SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
- SDTCisVT<6, i8>]>;
-
-def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
-def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
-
def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
@@ -671,8 +605,6 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
SDTCisOpSmallerThanOp<0, 1>,
SDTCisVT<2, i32>]>>;
-def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
-
// galois field arithmetic
def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
@@ -687,10 +619,10 @@ def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>;
// forms.
def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot]>;
+ SDNPWantRoot, SDNPWantParent]>;
def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot]>;
+ SDNPWantRoot, SDNPWantParent]>;
def ssmem : Operand<v4f32> {
let PrintMethod = "printf32mem";
@@ -892,6 +824,7 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>;
// 512-bit bitconvert pattern fragments
def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
@@ -924,10 +857,8 @@ def I8Imm : SDNodeXForm<imm, [{
return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
}]>;
-def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>;
-def FROUND_CURRENT : ImmLeaf<i32, [{
- return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION;
-}]>;
+def FROUND_NO_EXC : PatLeaf<(i32 8)>;
+def FROUND_CURRENT : PatLeaf<(i32 4)>;
// BYTE_imm - Transform bit immediates into byte immediates.
def BYTE_imm : SDNodeXForm<imm, [{
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
index 11ada51a8704..1b61accfb42b 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -14,10 +14,12 @@
#include "X86InstrInfo.h"
#include "X86.h"
#include "X86InstrBuilder.h"
+#include "X86InstrFoldTables.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -71,44 +73,6 @@ UndefRegClearance("undef-reg-clearance",
"certain undef register reads"),
cl::init(128), cl::Hidden);
-enum {
- // Select which memory operand is being unfolded.
- // (stored in bits 0 - 3)
- TB_INDEX_0 = 0,
- TB_INDEX_1 = 1,
- TB_INDEX_2 = 2,
- TB_INDEX_3 = 3,
- TB_INDEX_4 = 4,
- TB_INDEX_MASK = 0xf,
-
- // Do not insert the reverse map (MemOp -> RegOp) into the table.
- // This may be needed because there is a many -> one mapping.
- TB_NO_REVERSE = 1 << 4,
-
- // Do not insert the forward map (RegOp -> MemOp) into the table.
- // This is needed for Native Client, which prohibits branch
- // instructions from using a memory operand.
- TB_NO_FORWARD = 1 << 5,
-
- TB_FOLDED_LOAD = 1 << 6,
- TB_FOLDED_STORE = 1 << 7,
-
- // Minimum alignment required for load/store.
- // Used for RegOp->MemOp conversion.
- // (stored in bits 8 - 15)
- TB_ALIGN_SHIFT = 8,
- TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
- TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
- TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
- TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
- TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
-};
-
-struct X86MemoryFoldTableEntry {
- uint16_t RegOp;
- uint16_t MemOp;
- uint16_t Flags;
-};
// Pin the vtable to this file.
void X86InstrInfo::anchor() {}
@@ -121,3631 +85,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
X86::CATCHRET,
(STI.is64Bit() ? X86::RETQ : X86::RETL)),
Subtarget(STI), RI(STI.getTargetTriple()) {
-
- static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
- { X86::ADC16ri, X86::ADC16mi, 0 },
- { X86::ADC16ri8, X86::ADC16mi8, 0 },
- { X86::ADC16rr, X86::ADC16mr, 0 },
- { X86::ADC32ri, X86::ADC32mi, 0 },
- { X86::ADC32ri8, X86::ADC32mi8, 0 },
- { X86::ADC32rr, X86::ADC32mr, 0 },
- { X86::ADC64ri32, X86::ADC64mi32, 0 },
- { X86::ADC64ri8, X86::ADC64mi8, 0 },
- { X86::ADC64rr, X86::ADC64mr, 0 },
- { X86::ADC8ri, X86::ADC8mi, 0 },
- { X86::ADC8ri8, X86::ADC8mi8, 0 },
- { X86::ADC8rr, X86::ADC8mr, 0 },
- { X86::ADD16ri, X86::ADD16mi, 0 },
- { X86::ADD16ri8, X86::ADD16mi8, 0 },
- { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
- { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
- { X86::ADD16rr, X86::ADD16mr, 0 },
- { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
- { X86::ADD32ri, X86::ADD32mi, 0 },
- { X86::ADD32ri8, X86::ADD32mi8, 0 },
- { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
- { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
- { X86::ADD32rr, X86::ADD32mr, 0 },
- { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
- { X86::ADD64ri32, X86::ADD64mi32, 0 },
- { X86::ADD64ri8, X86::ADD64mi8, 0 },
- { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
- { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
- { X86::ADD64rr, X86::ADD64mr, 0 },
- { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
- { X86::ADD8ri, X86::ADD8mi, 0 },
- { X86::ADD8ri8, X86::ADD8mi8, 0 },
- { X86::ADD8rr, X86::ADD8mr, 0 },
- { X86::AND16ri, X86::AND16mi, 0 },
- { X86::AND16ri8, X86::AND16mi8, 0 },
- { X86::AND16rr, X86::AND16mr, 0 },
- { X86::AND32ri, X86::AND32mi, 0 },
- { X86::AND32ri8, X86::AND32mi8, 0 },
- { X86::AND32rr, X86::AND32mr, 0 },
- { X86::AND64ri32, X86::AND64mi32, 0 },
- { X86::AND64ri8, X86::AND64mi8, 0 },
- { X86::AND64rr, X86::AND64mr, 0 },
- { X86::AND8ri, X86::AND8mi, 0 },
- { X86::AND8ri8, X86::AND8mi8, 0 },
- { X86::AND8rr, X86::AND8mr, 0 },
- { X86::BTC16ri8, X86::BTC16mi8, 0 },
- { X86::BTC32ri8, X86::BTC32mi8, 0 },
- { X86::BTC64ri8, X86::BTC64mi8, 0 },
- { X86::BTR16ri8, X86::BTR16mi8, 0 },
- { X86::BTR32ri8, X86::BTR32mi8, 0 },
- { X86::BTR64ri8, X86::BTR64mi8, 0 },
- { X86::BTS16ri8, X86::BTS16mi8, 0 },
- { X86::BTS32ri8, X86::BTS32mi8, 0 },
- { X86::BTS64ri8, X86::BTS64mi8, 0 },
- { X86::DEC16r, X86::DEC16m, 0 },
- { X86::DEC32r, X86::DEC32m, 0 },
- { X86::DEC64r, X86::DEC64m, 0 },
- { X86::DEC8r, X86::DEC8m, 0 },
- { X86::INC16r, X86::INC16m, 0 },
- { X86::INC32r, X86::INC32m, 0 },
- { X86::INC64r, X86::INC64m, 0 },
- { X86::INC8r, X86::INC8m, 0 },
- { X86::NEG16r, X86::NEG16m, 0 },
- { X86::NEG32r, X86::NEG32m, 0 },
- { X86::NEG64r, X86::NEG64m, 0 },
- { X86::NEG8r, X86::NEG8m, 0 },
- { X86::NOT16r, X86::NOT16m, 0 },
- { X86::NOT32r, X86::NOT32m, 0 },
- { X86::NOT64r, X86::NOT64m, 0 },
- { X86::NOT8r, X86::NOT8m, 0 },
- { X86::OR16ri, X86::OR16mi, 0 },
- { X86::OR16ri8, X86::OR16mi8, 0 },
- { X86::OR16rr, X86::OR16mr, 0 },
- { X86::OR32ri, X86::OR32mi, 0 },
- { X86::OR32ri8, X86::OR32mi8, 0 },
- { X86::OR32rr, X86::OR32mr, 0 },
- { X86::OR64ri32, X86::OR64mi32, 0 },
- { X86::OR64ri8, X86::OR64mi8, 0 },
- { X86::OR64rr, X86::OR64mr, 0 },
- { X86::OR8ri, X86::OR8mi, 0 },
- { X86::OR8ri8, X86::OR8mi8, 0 },
- { X86::OR8rr, X86::OR8mr, 0 },
- { X86::RCL16r1, X86::RCL16m1, 0 },
- { X86::RCL16rCL, X86::RCL16mCL, 0 },
- { X86::RCL16ri, X86::RCL16mi, 0 },
- { X86::RCL32r1, X86::RCL32m1, 0 },
- { X86::RCL32rCL, X86::RCL32mCL, 0 },
- { X86::RCL32ri, X86::RCL32mi, 0 },
- { X86::RCL64r1, X86::RCL64m1, 0 },
- { X86::RCL64rCL, X86::RCL64mCL, 0 },
- { X86::RCL64ri, X86::RCL64mi, 0 },
- { X86::RCL8r1, X86::RCL8m1, 0 },
- { X86::RCL8rCL, X86::RCL8mCL, 0 },
- { X86::RCL8ri, X86::RCL8mi, 0 },
- { X86::RCR16r1, X86::RCR16m1, 0 },
- { X86::RCR16rCL, X86::RCR16mCL, 0 },
- { X86::RCR16ri, X86::RCR16mi, 0 },
- { X86::RCR32r1, X86::RCR32m1, 0 },
- { X86::RCR32rCL, X86::RCR32mCL, 0 },
- { X86::RCR32ri, X86::RCR32mi, 0 },
- { X86::RCR64r1, X86::RCR64m1, 0 },
- { X86::RCR64rCL, X86::RCR64mCL, 0 },
- { X86::RCR64ri, X86::RCR64mi, 0 },
- { X86::RCR8r1, X86::RCR8m1, 0 },
- { X86::RCR8rCL, X86::RCR8mCL, 0 },
- { X86::RCR8ri, X86::RCR8mi, 0 },
- { X86::ROL16r1, X86::ROL16m1, 0 },
- { X86::ROL16rCL, X86::ROL16mCL, 0 },
- { X86::ROL16ri, X86::ROL16mi, 0 },
- { X86::ROL32r1, X86::ROL32m1, 0 },
- { X86::ROL32rCL, X86::ROL32mCL, 0 },
- { X86::ROL32ri, X86::ROL32mi, 0 },
- { X86::ROL64r1, X86::ROL64m1, 0 },
- { X86::ROL64rCL, X86::ROL64mCL, 0 },
- { X86::ROL64ri, X86::ROL64mi, 0 },
- { X86::ROL8r1, X86::ROL8m1, 0 },
- { X86::ROL8rCL, X86::ROL8mCL, 0 },
- { X86::ROL8ri, X86::ROL8mi, 0 },
- { X86::ROR16r1, X86::ROR16m1, 0 },
- { X86::ROR16rCL, X86::ROR16mCL, 0 },
- { X86::ROR16ri, X86::ROR16mi, 0 },
- { X86::ROR32r1, X86::ROR32m1, 0 },
- { X86::ROR32rCL, X86::ROR32mCL, 0 },
- { X86::ROR32ri, X86::ROR32mi, 0 },
- { X86::ROR64r1, X86::ROR64m1, 0 },
- { X86::ROR64rCL, X86::ROR64mCL, 0 },
- { X86::ROR64ri, X86::ROR64mi, 0 },
- { X86::ROR8r1, X86::ROR8m1, 0 },
- { X86::ROR8rCL, X86::ROR8mCL, 0 },
- { X86::ROR8ri, X86::ROR8mi, 0 },
- { X86::SAR16r1, X86::SAR16m1, 0 },
- { X86::SAR16rCL, X86::SAR16mCL, 0 },
- { X86::SAR16ri, X86::SAR16mi, 0 },
- { X86::SAR32r1, X86::SAR32m1, 0 },
- { X86::SAR32rCL, X86::SAR32mCL, 0 },
- { X86::SAR32ri, X86::SAR32mi, 0 },
- { X86::SAR64r1, X86::SAR64m1, 0 },
- { X86::SAR64rCL, X86::SAR64mCL, 0 },
- { X86::SAR64ri, X86::SAR64mi, 0 },
- { X86::SAR8r1, X86::SAR8m1, 0 },
- { X86::SAR8rCL, X86::SAR8mCL, 0 },
- { X86::SAR8ri, X86::SAR8mi, 0 },
- { X86::SBB16ri, X86::SBB16mi, 0 },
- { X86::SBB16ri8, X86::SBB16mi8, 0 },
- { X86::SBB16rr, X86::SBB16mr, 0 },
- { X86::SBB32ri, X86::SBB32mi, 0 },
- { X86::SBB32ri8, X86::SBB32mi8, 0 },
- { X86::SBB32rr, X86::SBB32mr, 0 },
- { X86::SBB64ri32, X86::SBB64mi32, 0 },
- { X86::SBB64ri8, X86::SBB64mi8, 0 },
- { X86::SBB64rr, X86::SBB64mr, 0 },
- { X86::SBB8ri, X86::SBB8mi, 0 },
- { X86::SBB8ri8, X86::SBB8mi8, 0 },
- { X86::SBB8rr, X86::SBB8mr, 0 },
- { X86::SHL16r1, X86::SHL16m1, 0 },
- { X86::SHL16rCL, X86::SHL16mCL, 0 },
- { X86::SHL16ri, X86::SHL16mi, 0 },
- { X86::SHL32r1, X86::SHL32m1, 0 },
- { X86::SHL32rCL, X86::SHL32mCL, 0 },
- { X86::SHL32ri, X86::SHL32mi, 0 },
- { X86::SHL64r1, X86::SHL64m1, 0 },
- { X86::SHL64rCL, X86::SHL64mCL, 0 },
- { X86::SHL64ri, X86::SHL64mi, 0 },
- { X86::SHL8r1, X86::SHL8m1, 0 },
- { X86::SHL8rCL, X86::SHL8mCL, 0 },
- { X86::SHL8ri, X86::SHL8mi, 0 },
- { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
- { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
- { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
- { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
- { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
- { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
- { X86::SHR16r1, X86::SHR16m1, 0 },
- { X86::SHR16rCL, X86::SHR16mCL, 0 },
- { X86::SHR16ri, X86::SHR16mi, 0 },
- { X86::SHR32r1, X86::SHR32m1, 0 },
- { X86::SHR32rCL, X86::SHR32mCL, 0 },
- { X86::SHR32ri, X86::SHR32mi, 0 },
- { X86::SHR64r1, X86::SHR64m1, 0 },
- { X86::SHR64rCL, X86::SHR64mCL, 0 },
- { X86::SHR64ri, X86::SHR64mi, 0 },
- { X86::SHR8r1, X86::SHR8m1, 0 },
- { X86::SHR8rCL, X86::SHR8mCL, 0 },
- { X86::SHR8ri, X86::SHR8mi, 0 },
- { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
- { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
- { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
- { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
- { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
- { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
- { X86::SUB16ri, X86::SUB16mi, 0 },
- { X86::SUB16ri8, X86::SUB16mi8, 0 },
- { X86::SUB16rr, X86::SUB16mr, 0 },
- { X86::SUB32ri, X86::SUB32mi, 0 },
- { X86::SUB32ri8, X86::SUB32mi8, 0 },
- { X86::SUB32rr, X86::SUB32mr, 0 },
- { X86::SUB64ri32, X86::SUB64mi32, 0 },
- { X86::SUB64ri8, X86::SUB64mi8, 0 },
- { X86::SUB64rr, X86::SUB64mr, 0 },
- { X86::SUB8ri, X86::SUB8mi, 0 },
- { X86::SUB8ri8, X86::SUB8mi8, 0 },
- { X86::SUB8rr, X86::SUB8mr, 0 },
- { X86::XOR16ri, X86::XOR16mi, 0 },
- { X86::XOR16ri8, X86::XOR16mi8, 0 },
- { X86::XOR16rr, X86::XOR16mr, 0 },
- { X86::XOR32ri, X86::XOR32mi, 0 },
- { X86::XOR32ri8, X86::XOR32mi8, 0 },
- { X86::XOR32rr, X86::XOR32mr, 0 },
- { X86::XOR64ri32, X86::XOR64mi32, 0 },
- { X86::XOR64ri8, X86::XOR64mi8, 0 },
- { X86::XOR64rr, X86::XOR64mr, 0 },
- { X86::XOR8ri, X86::XOR8mi, 0 },
- { X86::XOR8ri8, X86::XOR8mi8, 0 },
- { X86::XOR8rr, X86::XOR8mr, 0 }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
- AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 0, folded load and store, no alignment requirement.
- Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
- { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
- { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
- { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
- { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
- { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
- { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
- { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
- { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
- { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
- { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
- { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
- { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
- { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
- { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
- { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
- { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
- { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
- { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
- { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
- { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
- { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
- { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
- { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
- { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
- { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
- { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
- { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
- { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
- { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
- { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
- { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
- { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
- { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
- { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
- { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
- { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
- { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
- { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
- { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
- { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
- { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
- { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
- { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
- { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
- { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
- { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
- { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
- { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
- { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
- { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
- { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
- { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
- { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
- { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
- { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
- { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
- { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
- { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
- { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
- { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
- { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
- { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
- { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
- { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
- { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
- { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
- { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
- { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
- { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
- { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
- { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
- { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
- { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
- { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
- { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
- { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
- { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD },
- { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
- { X86::TEST32rr, X86::TEST32mr, TB_FOLDED_LOAD },
- { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
- { X86::TEST64rr, X86::TEST64mr, TB_FOLDED_LOAD },
- { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
- { X86::TEST8rr, X86::TEST8mr, TB_FOLDED_LOAD },
-
- // AVX 128-bit versions of foldable instructions
- { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
- { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
- { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
- { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
- { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
- { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
-
- // AVX 256-bit foldable instructions
- { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
- { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
- { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions
- { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
- { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
- { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
- { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
- { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
- { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
- { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
- { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions (256-bit versions)
- { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
- { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
- { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
- { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions (128-bit versions)
- { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
- { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
-
- // F16C foldable instructions
- { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
- { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
- AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
- { X86::BSF16rr, X86::BSF16rm, 0 },
- { X86::BSF32rr, X86::BSF32rm, 0 },
- { X86::BSF64rr, X86::BSF64rm, 0 },
- { X86::BSR16rr, X86::BSR16rm, 0 },
- { X86::BSR32rr, X86::BSR32rm, 0 },
- { X86::BSR64rr, X86::BSR64rm, 0 },
- { X86::CMP16rr, X86::CMP16rm, 0 },
- { X86::CMP32rr, X86::CMP32rm, 0 },
- { X86::CMP64rr, X86::CMP64rm, 0 },
- { X86::CMP8rr, X86::CMP8rm, 0 },
- { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
- { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
- { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
- { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
- { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
- { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
- { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
- { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
- { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
- { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
- { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
- { X86::CVTTSD2SI64rr_Int,X86::CVTTSD2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
- { X86::CVTTSD2SIrr_Int, X86::CVTTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTTSS2SI64rr_Int,X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE },
- { X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
- { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
- { X86::IMUL16rri, X86::IMUL16rmi, 0 },
- { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
- { X86::IMUL32rri, X86::IMUL32rmi, 0 },
- { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
- { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
- { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
- { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
- { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
- { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
- { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
- { X86::MOV16rr, X86::MOV16rm, 0 },
- { X86::MOV32rr, X86::MOV32rm, 0 },
- { X86::MOV64rr, X86::MOV64rm, 0 },
- { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
- { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
- { X86::MOV8rr, X86::MOV8rm, 0 },
- { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
- { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
- { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
- { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
- { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
- { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
- { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
- { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
- { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
- { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
- { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
- { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
- { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
- { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
- { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
- { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
- { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
- { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
- { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
- { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
- { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
- { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
- { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
- { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
- { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
- { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
- { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
- { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
- { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
- { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
- { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
- { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
- { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
- { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
- { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
- { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
- { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
- { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
- { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
- { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
- { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
- { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
- { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
- { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
- { X86::RCPSSr, X86::RCPSSm, 0 },
- { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
- { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
- { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
- { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
- { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
- { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
- { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
- { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
- { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
- { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
- { X86::SQRTSDr, X86::SQRTSDm, 0 },
- { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
- { X86::SQRTSSr, X86::SQRTSSm, 0 },
- { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
- // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
- { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
- { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
-
- // MMX version of foldable instructions
- { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
- { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
- { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
- { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
- { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
- { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
- { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
- { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
- { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
- { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
-
- // 3DNow! version of foldable instructions
- { X86::PF2IDrr, X86::PF2IDrm, 0 },
- { X86::PF2IWrr, X86::PF2IWrm, 0 },
- { X86::PFRCPrr, X86::PFRCPrm, 0 },
- { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
- { X86::PI2FDrr, X86::PI2FDrm, 0 },
- { X86::PI2FWrr, X86::PI2FWrm, 0 },
- { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
-
- // AVX 128-bit versions of foldable instructions
- { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
- { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
- { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
- { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
- { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::VCVTTSD2SI64rr_Int,X86::VCVTTSD2SI64rm_Int,TB_NO_REVERSE },
- { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::VCVTTSD2SIrr_Int,X86::VCVTTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::VCVTTSS2SI64rr_Int,X86::VCVTTSS2SI64rm_Int,TB_NO_REVERSE },
- { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::VCVTTSS2SIrr_Int,X86::VCVTTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
- { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
- { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
- { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
- { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
- { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
- { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
- { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
- { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
- { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
- { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
- { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
- { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
- { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
- { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
- { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
- { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
- { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
- { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
- { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
- { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
- { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
- { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
- { X86::VPABSBrr, X86::VPABSBrm, 0 },
- { X86::VPABSDrr, X86::VPABSDrm, 0 },
- { X86::VPABSWrr, X86::VPABSWrm, 0 },
- { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
- { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
- { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
- { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
- { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
- { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
- { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
- { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
- { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
- { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
- { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
- { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
- { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
- { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
- { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
- { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
- { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
- { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
- { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
- { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
- { X86::VPTESTrr, X86::VPTESTrm, 0 },
- { X86::VRCPPSr, X86::VRCPPSm, 0 },
- { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
- { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
- { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
- { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
- { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
- { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
- { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
- { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
- { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
-
- // AVX 256-bit foldable instructions
- { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
- { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
- { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
- { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
- { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
- { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
- { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
- { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
- { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
- { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
- { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
- { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
- { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
- { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
- { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
- { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
- { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
- { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
- { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
- { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
- { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
- { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
- { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
- { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
- { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
-
- // AVX2 foldable instructions
-
- // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
- // VBROADCASTS{SD}rm memory instructions were available from AVX1.
- // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
- // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
- // so they don't need an equivalent limitation.
- { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
- { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
- { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
- { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
- { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
- { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
- { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
- { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
- { X86::VPERMQYri, X86::VPERMQYmi, 0 },
- { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
- { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
- { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
- { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
- { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
- { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
- { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
- { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
- { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
- { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
- { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
-
- // XOP foldable instructions
- { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
- { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
- { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
- { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
- { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
- { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
- { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
- { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
- { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
- { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
- { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
- { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
- { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
- { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
- { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
- { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
- { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
- { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
- { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
- { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
- { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
- { X86::VPROTBri, X86::VPROTBmi, 0 },
- { X86::VPROTBrr, X86::VPROTBmr, 0 },
- { X86::VPROTDri, X86::VPROTDmi, 0 },
- { X86::VPROTDrr, X86::VPROTDmr, 0 },
- { X86::VPROTQri, X86::VPROTQmi, 0 },
- { X86::VPROTQrr, X86::VPROTQmr, 0 },
- { X86::VPROTWri, X86::VPROTWmi, 0 },
- { X86::VPROTWrr, X86::VPROTWmr, 0 },
- { X86::VPSHABrr, X86::VPSHABmr, 0 },
- { X86::VPSHADrr, X86::VPSHADmr, 0 },
- { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
- { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
- { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
- { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
- { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
- { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
-
- // LWP foldable instructions
- { X86::LWPINS32rri, X86::LWPINS32rmi, 0 },
- { X86::LWPINS64rri, X86::LWPINS64rmi, 0 },
- { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 },
- { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 },
-
- // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
- { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
- { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
- { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
- { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
- { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
- { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
- { X86::BLCI32rr, X86::BLCI32rm, 0 },
- { X86::BLCI64rr, X86::BLCI64rm, 0 },
- { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
- { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
- { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
- { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
- { X86::BLCS32rr, X86::BLCS32rm, 0 },
- { X86::BLCS64rr, X86::BLCS64rm, 0 },
- { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
- { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
- { X86::BLSI32rr, X86::BLSI32rm, 0 },
- { X86::BLSI64rr, X86::BLSI64rm, 0 },
- { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
- { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
- { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
- { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
- { X86::BLSR32rr, X86::BLSR32rm, 0 },
- { X86::BLSR64rr, X86::BLSR64rm, 0 },
- { X86::BZHI32rr, X86::BZHI32rm, 0 },
- { X86::BZHI64rr, X86::BZHI64rm, 0 },
- { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
- { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
- { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
- { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
- { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
- { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
- { X86::RORX32ri, X86::RORX32mi, 0 },
- { X86::RORX64ri, X86::RORX64mi, 0 },
- { X86::SARX32rr, X86::SARX32rm, 0 },
- { X86::SARX64rr, X86::SARX64rm, 0 },
- { X86::SHRX32rr, X86::SHRX32rm, 0 },
- { X86::SHRX64rr, X86::SHRX64rm, 0 },
- { X86::SHLX32rr, X86::SHLX32rm, 0 },
- { X86::SHLX64rr, X86::SHLX64rm, 0 },
- { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
- { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
- { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
- { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
- { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
- { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
- { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
-
- // AVX-512 foldable instructions
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
- { X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrm, 0 },
- { X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrm, 0 },
- { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
- { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
- { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
- { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
- { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
- { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
- { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
- { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
- { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
- { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
- { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
- { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
- { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
- { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
- { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
- { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
- { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
- { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
- { X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0 },
- { X86::VPCONFLICTQZrr, X86::VPCONFLICTQZrm, 0 },
- { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
- { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
- { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
- { X86::VPERMQZri, X86::VPERMQZmi, 0 },
- { X86::VPLZCNTDZrr, X86::VPLZCNTDZrm, 0 },
- { X86::VPLZCNTQZrr, X86::VPLZCNTQZrm, 0 },
- { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
- { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
- { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
- { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
- { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
- { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
- { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
- { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
- { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
- { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
- { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 },
- { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 },
- { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
- { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
- { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
- { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 },
- { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
- { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
- { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
- { X86::VPSRADZri, X86::VPSRADZmi, 0 },
- { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
- { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
- { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 },
- { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
- { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
- { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
-
- // AVX-512 foldable instructions (256-bit versions)
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
- { X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rm, 0 },
- { X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rm, 0 },
- { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
- { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
- { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
- { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
- { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
- { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
- { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
- { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
- { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
- { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
- { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
- { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
- { X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0 },
- { X86::VPCONFLICTQZ256rr, X86::VPCONFLICTQZ256rm, 0 },
- { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
- { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
- { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
- { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
- { X86::VPLZCNTDZ256rr, X86::VPLZCNTDZ256rm, 0 },
- { X86::VPLZCNTQZ256rr, X86::VPLZCNTQZ256rm, 0 },
- { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
- { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
- { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
- { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
- { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
- { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
- { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
- { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
- { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
- { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
- { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
- { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
- { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
- { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
- { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
- { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
- { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
- { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
- { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
- { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
- { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
-
- // AVX-512 foldable instructions (128-bit versions)
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
- { X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rm, 0 },
- { X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rm, TB_NO_REVERSE },
- { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
- { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
- { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
- { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
- { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
- { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
- { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
- { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
- { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
- { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
- { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
- { X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0 },
- { X86::VPCONFLICTQZ128rr, X86::VPCONFLICTQZ128rm, 0 },
- { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
- { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
- { X86::VPLZCNTDZ128rr, X86::VPLZCNTDZ128rm, 0 },
- { X86::VPLZCNTQZ128rr, X86::VPLZCNTQZ128rm, 0 },
- { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
- { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
- { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
- { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
- { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
- { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
- { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
- { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
- { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
- { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
- { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
- { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
- { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
- { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
- { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
-
- // F16C foldable instructions
- { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
- { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
-
- // AES foldable instructions
- { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
- { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
- { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
- { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
- AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 1, folded load
- Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
- { X86::ADC32rr, X86::ADC32rm, 0 },
- { X86::ADC64rr, X86::ADC64rm, 0 },
- { X86::ADD16rr, X86::ADD16rm, 0 },
- { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
- { X86::ADD32rr, X86::ADD32rm, 0 },
- { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
- { X86::ADD64rr, X86::ADD64rm, 0 },
- { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
- { X86::ADD8rr, X86::ADD8rm, 0 },
- { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
- { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
- { X86::ADDSDrr, X86::ADDSDrm, 0 },
- { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
- { X86::ADDSSrr, X86::ADDSSrm, 0 },
- { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
- { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
- { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
- { X86::AND16rr, X86::AND16rm, 0 },
- { X86::AND32rr, X86::AND32rm, 0 },
- { X86::AND64rr, X86::AND64rm, 0 },
- { X86::AND8rr, X86::AND8rm, 0 },
- { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
- { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
- { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
- { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
- { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
- { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
- { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
- { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
- { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
- { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
- { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
- { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
- { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
- { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
- { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
- { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
- { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
- { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
- { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
- { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
- { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
- { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
- { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
- { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
- { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
- { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
- { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
- { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
- { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
- { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
- { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
- { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
- { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
- { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
- { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
- { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
- { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
- { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
- { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
- { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
- { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
- { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
- { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
- { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
- { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
- { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
- { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
- { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
- { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
- { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
- { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
- { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
- { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
- { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
- { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
- { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
- { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
- { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
- { X86::CMPSDrr, X86::CMPSDrm, 0 },
- { X86::CMPSDrr_Int, X86::CMPSDrm_Int, TB_NO_REVERSE },
- { X86::CMPSSrr, X86::CMPSSrm, 0 },
- { X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE },
- { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
- { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
- { X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE },
- { X86::CVTSS2SDrr_Int, X86::CVTSS2SDrm_Int, TB_NO_REVERSE },
- { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
- { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
- { X86::DIVSDrr, X86::DIVSDrm, 0 },
- { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
- { X86::DIVSSrr, X86::DIVSSrm, 0 },
- { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
- { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
- { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
- { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
- { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
- { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
- { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
- { X86::IMUL16rr, X86::IMUL16rm, 0 },
- { X86::IMUL32rr, X86::IMUL32rm, 0 },
- { X86::IMUL64rr, X86::IMUL64rm, 0 },
- { X86::CVTSI642SDrr_Int,X86::CVTSI642SDrm_Int, 0 },
- { X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0 },
- { X86::CVTSI642SSrr_Int,X86::CVTSI642SSrm_Int, 0 },
- { X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0 },
- { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
- { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
- { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
- { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
- { X86::MAXSDrr, X86::MAXSDrm, 0 },
- { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
- { X86::MAXSSrr, X86::MAXSSrm, 0 },
- { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
- { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
- { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
- { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
- { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
- { X86::MINSDrr, X86::MINSDrm, 0 },
- { X86::MINCSDrr, X86::MINCSDrm, 0 },
- { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
- { X86::MINSSrr, X86::MINSSrm, 0 },
- { X86::MINCSSrr, X86::MINCSSrm, 0 },
- { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
- { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
- { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
- { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
- { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
- { X86::MULSDrr, X86::MULSDrm, 0 },
- { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
- { X86::MULSSrr, X86::MULSSrm, 0 },
- { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
- { X86::OR16rr, X86::OR16rm, 0 },
- { X86::OR32rr, X86::OR32rm, 0 },
- { X86::OR64rr, X86::OR64rm, 0 },
- { X86::OR8rr, X86::OR8rm, 0 },
- { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
- { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
- { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
- { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
- { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
- { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
- { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
- { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
- { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
- { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
- { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
- { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
- { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
- { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
- { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
- { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
- { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
- { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
- { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
- { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
- { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
- { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
- { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
- { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
- { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
- { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
- { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
- { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
- { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
- { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
- { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
- { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
- { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
- { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
- { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
- { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
- { X86::PINSRBrr, X86::PINSRBrm, 0 },
- { X86::PINSRDrr, X86::PINSRDrm, 0 },
- { X86::PINSRQrr, X86::PINSRQrm, 0 },
- { X86::PINSRWrri, X86::PINSRWrmi, 0 },
- { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
- { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
- { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
- { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
- { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
- { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
- { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
- { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
- { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
- { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
- { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
- { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
- { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
- { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
- { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
- { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
- { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
- { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
- { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
- { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
- { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
- { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
- { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
- { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
- { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
- { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
- { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
- { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
- { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
- { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
- { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
- { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
- { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
- { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
- { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
- { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
- { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
- { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
- { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
- { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
- { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
- { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
- { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
- { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
- { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
- { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
- { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
- { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
- { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
- { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
- { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
- { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
- { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
- { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
- { X86::SBB32rr, X86::SBB32rm, 0 },
- { X86::SBB64rr, X86::SBB64rm, 0 },
- { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
- { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
- { X86::SUB16rr, X86::SUB16rm, 0 },
- { X86::SUB32rr, X86::SUB32rm, 0 },
- { X86::SUB64rr, X86::SUB64rm, 0 },
- { X86::SUB8rr, X86::SUB8rm, 0 },
- { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
- { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
- { X86::SUBSDrr, X86::SUBSDrm, 0 },
- { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
- { X86::SUBSSrr, X86::SUBSSrm, 0 },
- { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
- // FIXME: TEST*rr -> swapped operand of TEST*mr.
- { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
- { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
- { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
- { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
- { X86::XOR16rr, X86::XOR16rm, 0 },
- { X86::XOR32rr, X86::XOR32rm, 0 },
- { X86::XOR64rr, X86::XOR64rm, 0 },
- { X86::XOR8rr, X86::XOR8rm, 0 },
- { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
- { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
-
- // MMX version of foldable instructions
- { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
- { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
- { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
- { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
- { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
- { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
- { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
- { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
- { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
- { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
- { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
- { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
- { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
- { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
- { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
- { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
- { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
- { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
- { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
- { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
- { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
- { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
- { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
- { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
- { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
- { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
- { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
- { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
- { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
- { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
- { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
- { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
- { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
- { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
- { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
- { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
- { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
- { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
- { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
- { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
- { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
- { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
- { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
- { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
- { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
- { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
- { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
- { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
- { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
- { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
- { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
- { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
- { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
- { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
- { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
- { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
- { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
- { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
- { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
- { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
- { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
- { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
- { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
- { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
- { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
- { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
- { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
- { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
- { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
- { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
-
- // 3DNow! version of foldable instructions
- { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
- { X86::PFACCrr, X86::PFACCrm, 0 },
- { X86::PFADDrr, X86::PFADDrm, 0 },
- { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
- { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
- { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
- { X86::PFMAXrr, X86::PFMAXrm, 0 },
- { X86::PFMINrr, X86::PFMINrm, 0 },
- { X86::PFMULrr, X86::PFMULrm, 0 },
- { X86::PFNACCrr, X86::PFNACCrm, 0 },
- { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
- { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
- { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
- { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
- { X86::PFSUBrr, X86::PFSUBrm, 0 },
- { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
- { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
-
- // AVX 128-bit versions of foldable instructions
- { X86::VCVTSI642SDrr, X86::VCVTSI642SDrm, 0 },
- { X86::VCVTSI642SDrr_Int, X86::VCVTSI642SDrm_Int, 0 },
- { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
- { X86::VCVTSI2SDrr_Int, X86::VCVTSI2SDrm_Int, 0 },
- { X86::VCVTSI642SSrr, X86::VCVTSI642SSrm, 0 },
- { X86::VCVTSI642SSrr_Int, X86::VCVTSI642SSrm_Int, 0 },
- { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
- { X86::VCVTSI2SSrr_Int, X86::VCVTSI2SSrm_Int, 0 },
- { X86::VADDPDrr, X86::VADDPDrm, 0 },
- { X86::VADDPSrr, X86::VADDPSrm, 0 },
- { X86::VADDSDrr, X86::VADDSDrm, 0 },
- { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
- { X86::VADDSSrr, X86::VADDSSrm, 0 },
- { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
- { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
- { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
- { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
- { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
- { X86::VANDPDrr, X86::VANDPDrm, 0 },
- { X86::VANDPSrr, X86::VANDPSrm, 0 },
- { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
- { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
- { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
- { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
- { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
- { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
- { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
- { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE },
- { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
- { X86::VCMPSSrr_Int, X86::VCMPSSrm_Int, TB_NO_REVERSE },
- { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
- { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
- { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
- { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
- { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
- { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
- { X86::VDPPDrri, X86::VDPPDrmi, 0 },
- { X86::VDPPSrri, X86::VDPPSrmi, 0 },
- { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
- { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
- { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
- { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
- { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
- { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
- { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
- { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
- { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
- { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
- { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
- { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
- { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
- { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
- { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
- { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
- { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
- { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
- { X86::VMINPDrr, X86::VMINPDrm, 0 },
- { X86::VMINPSrr, X86::VMINPSrm, 0 },
- { X86::VMINSDrr, X86::VMINSDrm, 0 },
- { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
- { X86::VMINSSrr, X86::VMINSSrm, 0 },
- { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
- { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
- { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
- { X86::VMULPDrr, X86::VMULPDrm, 0 },
- { X86::VMULPSrr, X86::VMULPSrm, 0 },
- { X86::VMULSDrr, X86::VMULSDrm, 0 },
- { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
- { X86::VMULSSrr, X86::VMULSSrm, 0 },
- { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
- { X86::VORPDrr, X86::VORPDrm, 0 },
- { X86::VORPSrr, X86::VORPSrm, 0 },
- { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
- { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
- { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
- { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
- { X86::VPADDBrr, X86::VPADDBrm, 0 },
- { X86::VPADDDrr, X86::VPADDDrm, 0 },
- { X86::VPADDQrr, X86::VPADDQrm, 0 },
- { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
- { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
- { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
- { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
- { X86::VPADDWrr, X86::VPADDWrm, 0 },
- { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
- { X86::VPANDNrr, X86::VPANDNrm, 0 },
- { X86::VPANDrr, X86::VPANDrm, 0 },
- { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
- { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
- { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
- { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
- { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
- { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
- { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
- { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
- { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
- { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
- { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
- { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
- { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
- { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
- { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
- { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
- { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
- { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
- { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
- { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
- { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
- { X86::VPINSRBrr, X86::VPINSRBrm, 0 },
- { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
- { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
- { X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
- { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
- { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
- { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
- { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
- { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
- { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
- { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
- { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
- { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
- { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
- { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
- { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
- { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
- { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
- { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
- { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
- { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
- { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
- { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
- { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
- { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
- { X86::VPORrr, X86::VPORrm, 0 },
- { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
- { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
- { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 },
- { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 },
- { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 },
- { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
- { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
- { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
- { X86::VPSRADrr, X86::VPSRADrm, 0 },
- { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
- { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
- { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
- { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
- { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
- { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
- { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
- { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
- { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
- { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
- { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
- { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
- { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
- { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
- { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
- { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
- { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
- { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
- { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
- { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
- { X86::VPXORrr, X86::VPXORrm, 0 },
- { X86::VRCPSSr, X86::VRCPSSm, 0 },
- { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
- { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
- { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
- { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
- { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
- { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
- { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
- { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
- { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
- { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
- { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
- { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
- { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
- { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
- { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
- { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
- { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
- { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
- { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
- { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
- { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
- { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
- { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
- { X86::VXORPDrr, X86::VXORPDrm, 0 },
- { X86::VXORPSrr, X86::VXORPSrm, 0 },
-
- // AVX 256-bit foldable instructions
- { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
- { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
- { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
- { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
- { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
- { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
- { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
- { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
- { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
- { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
- { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
- { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
- { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
- { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
- { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
- { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
- { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
- { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
- { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
- { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
- { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
- { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
- { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
- { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
- { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
- { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
- { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
- { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
- { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
- { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
- { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
- { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
- { X86::VORPDYrr, X86::VORPDYrm, 0 },
- { X86::VORPSYrr, X86::VORPSYrm, 0 },
- { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
- { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
- { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
- { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
- { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
- { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
- { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
- { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
- { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
- { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
- { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
- { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
- { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
-
- // AVX2 foldable instructions
- { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
- { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
- { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
- { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
- { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
- { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
- { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
- { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
- { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
- { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
- { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
- { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
- { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
- { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
- { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
- { X86::VPANDYrr, X86::VPANDYrm, 0 },
- { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
- { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
- { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
- { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
- { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
- { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
- { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
- { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
- { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
- { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
- { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
- { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
- { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
- { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
- { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
- { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
- { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
- { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
- { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
- { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
- { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
- { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
- { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
- { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
- { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
- { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
- { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
- { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
- { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
- { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
- { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
- { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
- { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
- { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
- { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
- { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
- { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
- { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
- { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
- { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
- { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
- { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
- { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
- { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
- { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
- { X86::VPORYrr, X86::VPORYrm, 0 },
- { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
- { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
- { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 },
- { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 },
- { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 },
- { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
- { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
- { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
- { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
- { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
- { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
- { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
- { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
- { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
- { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
- { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
- { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
- { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
- { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
- { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
- { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
- { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
- { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
- { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
- { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
- { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
- { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
- { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
- { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
- { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
- { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
- { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
- { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
- { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
- { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
- { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
- { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
- { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
- { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
- { X86::VPXORYrr, X86::VPXORYrm, 0 },
-
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
- { X86::VPCOMBri, X86::VPCOMBmi, 0 },
- { X86::VPCOMDri, X86::VPCOMDmi, 0 },
- { X86::VPCOMQri, X86::VPCOMQmi, 0 },
- { X86::VPCOMWri, X86::VPCOMWmi, 0 },
- { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
- { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
- { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
- { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
- { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
- { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
- { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
- { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
- { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
- { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
- { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
- { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
- { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
- { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
- { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
- { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
- { X86::VPROTBrr, X86::VPROTBrm, 0 },
- { X86::VPROTDrr, X86::VPROTDrm, 0 },
- { X86::VPROTQrr, X86::VPROTQrm, 0 },
- { X86::VPROTWrr, X86::VPROTWrm, 0 },
- { X86::VPSHABrr, X86::VPSHABrm, 0 },
- { X86::VPSHADrr, X86::VPSHADrm, 0 },
- { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
- { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
- { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
- { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
- { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
- { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
-
- // BMI/BMI2 foldable instructions
- { X86::ANDN32rr, X86::ANDN32rm, 0 },
- { X86::ANDN64rr, X86::ANDN64rm, 0 },
- { X86::MULX32rr, X86::MULX32rm, 0 },
- { X86::MULX64rr, X86::MULX64rm, 0 },
- { X86::PDEP32rr, X86::PDEP32rm, 0 },
- { X86::PDEP64rr, X86::PDEP64rm, 0 },
- { X86::PEXT32rr, X86::PEXT32rm, 0 },
- { X86::PEXT64rr, X86::PEXT64rm, 0 },
-
- // ADX foldable instructions
- { X86::ADCX32rr, X86::ADCX32rm, 0 },
- { X86::ADCX64rr, X86::ADCX64rm, 0 },
- { X86::ADOX32rr, X86::ADOX32rm, 0 },
- { X86::ADOX64rr, X86::ADOX64rm, 0 },
-
- // AVX-512 foldable instructions
- { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
- { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
- { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
- { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
- { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
- { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
- { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
- { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
- { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
- { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
- { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
- { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
- { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
- { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
- { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
- { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
- { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
- { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
- { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
- { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
- { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
- { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
- { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
- { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
- { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
- { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
- { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
- { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
- { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
- { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
- { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
- { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
- { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
- { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
- { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
- { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
- { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
- { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
- { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
- { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
- { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
- { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
- { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
- { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
- { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
- { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
- { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
- { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
- { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
- { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
- { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
- { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
- { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
- { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
- { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
- { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
- { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
- { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
- { X86::VORPDZrr, X86::VORPDZrm, 0 },
- { X86::VORPSZrr, X86::VORPSZrm, 0 },
- { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
- { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
- { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
- { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
- { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
- { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
- { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
- { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
- { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
- { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
- { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
- { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
- { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
- { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
- { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
- { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
- { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
- { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
- { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
- { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
- { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
- { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
- { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
- { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
- { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
- { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
- { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
- { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
- { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
- { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
- { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
- { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
- { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
- { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
- { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
- { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
- { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
- { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
- { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
- { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
- { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
- { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
- { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
- { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 },
- { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
- { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
- { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 },
- { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
- { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
- { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
- { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
- { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
- { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
- { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
- { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
- { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
- { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
- { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
- { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
- { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
- { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
- { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
- { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
- { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
- { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
- { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
- { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
- { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
- { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
- { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
- { X86::VPORDZrr, X86::VPORDZrm, 0 },
- { X86::VPORQZrr, X86::VPORQZrm, 0 },
- { X86::VPSADBWZrr, X86::VPSADBWZrm, 0 },
- { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
- { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
- { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
- { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
- { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
- { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
- { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
- { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
- { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
- { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
- { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
- { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
- { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
- { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
- { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
- { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
- { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
- { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
- { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
- { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
- { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
- { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
- { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
- { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
- { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
- { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
- { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
- { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
- { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
- { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
- { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
- { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
- { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
- { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
- { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
- { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
- { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
- { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
- { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
- { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
- { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
- { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
- { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
- { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
- { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
- { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
- { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
- { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
- { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
-
- // AVX-512{F,VL} foldable instructions
- { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
- { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
- { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
- { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
- { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
- { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
- { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
- { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
- { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
- { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
- { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
- { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
- { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
- { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
- { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
- { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
- { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
- { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
- { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
- { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
- { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
- { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
- { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
- { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
- { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
- { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
- { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
- { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
- { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
- { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
- { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
- { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
- { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
- { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
- { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
- { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
- { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
- { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
- { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
- { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
- { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
- { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
- { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
- { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
- { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
- { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
- { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
- { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
- { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
- { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
- { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
- { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
- { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
- { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
- { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
- { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
- { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
- { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
- { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
- { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
- { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
- { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
- { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
- { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
- { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
- { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
- { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
- { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
- { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
- { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
- { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
- { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
- { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
- { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
- { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
- { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
- { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
- { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
- { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
- { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
- { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
- { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
- { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
- { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
- { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
- { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
- { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
- { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
- { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
- { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
- { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
- { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
- { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
- { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
- { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
- { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
- { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
- { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
- { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
- { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
- { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
- { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
- { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
- { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
- { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
- { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
- { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
- { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
- { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
- { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
- { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
- { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
- { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
- { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
- { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
- { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
- { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
- { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
- { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
- { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
- { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
- { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
- { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
- { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
- { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
- { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
- { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
- { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
- { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
- { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
- { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
- { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
- { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
- { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
- { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
- { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
- { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
- { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
- { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
- { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
- { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
- { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
- { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
- { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
- { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
- { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
- { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
- { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
- { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
- { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
- { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
- { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
- { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
- { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
- { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
- { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
- { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
- { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
- { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
- { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
- { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
- { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
- { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
- { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
- { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
- { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
- { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
- { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
- { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
- { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
- { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
- { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
- { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
- { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
- { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
- { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
- { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
- { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
- { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
- { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
- { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
- { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
- { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
- { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
- { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
- { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
- { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
- { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
- { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
- { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
- { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
- { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
- { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
- { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
- { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
- { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
- { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
- { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
- { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
- { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
- { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
- { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
- { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
- { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
- { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
- { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
- { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
- { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
- { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
- { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
- { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
- { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
- { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
- { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
- { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
- { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
- { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
- { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
- { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
- { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
- { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
- { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
- { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
- { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
- { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
- { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
- { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
- { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
- { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
- { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
- { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
- { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
- { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
- { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
- { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
- { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
- { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
- { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
- { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
- { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
- { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
- { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
- { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
- { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
- { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
- { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
- { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
- { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
- { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
- { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
- { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
- { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
- { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
- { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
- { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
- { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
- { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
- { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
- { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
- { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
- { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
- { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
- { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
- { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
- { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
- { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
- { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
- { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
- { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
- { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
- { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
- { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
- { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
- { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
- { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
- { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
- { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
- { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
- { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
- { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
-
- // AVX-512 masked foldable instructions
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
- { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
- { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
- { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
- { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
- { X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmkz, 0 },
- { X86::VPCONFLICTQZrrkz, X86::VPCONFLICTQZrmkz, 0 },
- { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
- { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
- { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
- { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
- { X86::VPLZCNTDZrrkz, X86::VPLZCNTDZrmkz, 0 },
- { X86::VPLZCNTQZrrkz, X86::VPLZCNTQZrmkz, 0 },
- { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
- { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
- { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
- { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
- { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
- { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
- { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
- { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
- { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
- { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
- { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 },
- { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 },
- { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
- { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
- { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
- { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
- { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
- { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
- { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
- { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
- { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
- { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
- { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
- { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
-
- // AVX-512VL 256-bit masked foldable instructions
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
- { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
- { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
- { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
- { X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmkz, 0 },
- { X86::VPCONFLICTQZ256rrkz, X86::VPCONFLICTQZ256rmkz, 0 },
- { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
- { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
- { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
- { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
- { X86::VPLZCNTDZ256rrkz, X86::VPLZCNTDZ256rmkz, 0 },
- { X86::VPLZCNTQZ256rrkz, X86::VPLZCNTQZ256rmkz, 0 },
- { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
- { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
- { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
- { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
- { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
- { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
- { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
- { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
- { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
- { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
- { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
- { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
- { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
- { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
- { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
- { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
- { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
- { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
- { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
-
- // AVX-512VL 128-bit masked foldable instructions
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
- { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
- { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
- { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
- { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
- { X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmkz, 0 },
- { X86::VPCONFLICTQZ128rrkz, X86::VPCONFLICTQZ128rmkz, 0 },
- { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
- { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
- { X86::VPLZCNTDZ128rrkz, X86::VPLZCNTDZ128rmkz, 0 },
- { X86::VPLZCNTQZ128rrkz, X86::VPLZCNTQZ128rmkz, 0 },
- { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
- { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
- { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
- { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
- { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
- { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
- { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
- { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
- { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
- { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
- { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
- { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
- { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
-
- // AES foldable instructions
- { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
- { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
- { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
- { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
- { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
- { X86::VAESDECrr, X86::VAESDECrm, 0 },
- { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
- { X86::VAESENCrr, X86::VAESENCrm, 0 },
-
- // SHA foldable instructions
- { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
- { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
- { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
- { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
- { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
- { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
- { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
- AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 2, folded load
- Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
-
- // AVX-512 instructions with 3 source operands.
- { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
- { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
- { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
- { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
- { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
- { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
- { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
- { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
- { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
- { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
- { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
- { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
- { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 },
- { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 },
- { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
- { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
-
- // AVX-512VL 256-bit instructions with 3 source operands.
- { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
- { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
- { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
- { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
- { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
- { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
- { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
- { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
- { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
- { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
- { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
- { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
- { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 },
- { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 },
- { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
- { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
-
- // AVX-512VL 128-bit instructions with 3 source operands.
- { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
- { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
- { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
- { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
- { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
- { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
- { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
- { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
- { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
- { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
- { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
- { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
- { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 },
- { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 },
- { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
- { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
-
- // AVX-512 masked instructions
- { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
- { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
- { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
- { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
- { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
- { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
- { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
- { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
- { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
- { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
- { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
- { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
- { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
- { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
- { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
- { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
- { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
- { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
- { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
- { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
- { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
- { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
- { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 },
- { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 },
- { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
- { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
- { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
- { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
- { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 },
- { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 },
- { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
- { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
- { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
- { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
- { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
- { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
- { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
- { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
- { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
- { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
- { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
- { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
- { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
- { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
- { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
- { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
- { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
- { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
- { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
- { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
- { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
- { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
- { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
- { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
- { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
- { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
- { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
- { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
- { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
- { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
- { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
- { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
- { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
- { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
- { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
- { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
- { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
- { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
- { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
- { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
- { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
- { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
- { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
- { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
- { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
- { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
- { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
- { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
- { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
- { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
- { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
- { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
- { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
- { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
- { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
- { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
- { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
- { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
- { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
- { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
- { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
- { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
- { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
- { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
- { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
- { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
- { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
- { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
- { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
- { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
- { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
- { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
- { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
- { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
- { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
- { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
- { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
- { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
- { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
- { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
- { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
- { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
- { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
- { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
- { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
- { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
- { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
- { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
- { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
- { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
- { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
- { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
- { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
- { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
- { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
- { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
- { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
- { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
- { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
- { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
- { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
- { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
- { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
-
- // AVX-512{F,VL} masked arithmetic instructions 256-bit
- { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
- { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
- { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
- { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
- { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
- { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
- { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
- { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
- { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
- { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
- { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
- { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
- { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
- { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
- { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
- { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
- { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
- { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
- { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
- { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
- { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
- { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
- { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
- { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
- { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
- { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
- { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
- { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
- { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
- { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
- { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
- { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
- { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
- { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
- { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
- { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
- { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
- { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
- { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
- { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
- { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
- { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
- { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
- { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
- { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
- { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
- { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
- { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
- { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
- { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
- { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
- { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
- { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
- { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
- { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
- { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
- { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
- { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
- { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
- { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
- { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
- { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
- { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
- { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
- { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
- { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
- { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
- { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
- { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
- { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
- { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
- { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
- { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
- { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
- { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
- { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
- { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
- { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
- { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
- { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
- { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
- { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
- { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
- { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
- { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
- { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
- { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
- { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
- { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
- { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
- { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
- { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
- { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
- { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
- { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
- { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
- { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
- { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
- { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
- { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
- { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
- { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
- { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
- { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
- { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
- { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
- { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
- { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
- { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
- { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
- { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
- { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
- { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
- { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
- { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
- { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
- { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
- { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
- { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
- { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
- { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
- { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
- { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
- { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
- { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
-
- // AVX-512{F,VL} masked arithmetic instructions 128-bit
- { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
- { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
- { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
- { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
- { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
- { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
- { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
- { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
- { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
- { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
- { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
- { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
- { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
- { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
- { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
- { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
- { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
- { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
- { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
- { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
- { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
- { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
- { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
- { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
- { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
- { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
- { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
- { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
- { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
- { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
- { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
- { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
- { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
- { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
- { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
- { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
- { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
- { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
- { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
- { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
- { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
- { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
- { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
- { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
- { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
- { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
- { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
- { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
- { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
- { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
- { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
- { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
- { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
- { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
- { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
- { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
- { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
- { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
- { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
- { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
- { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
- { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
- { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
- { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
- { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
- { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
- { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
- { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
- { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
- { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
- { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
- { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
- { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
- { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
- { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
- { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
- { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
- { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
- { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
- { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
- { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
- { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
- { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
- { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
- { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
- { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
- { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
- { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
- { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
- { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
- { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
- { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
- { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
- { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
- { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
- { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
- { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
- { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
- { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
- { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
- { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
- { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
- { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
- { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
- { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
- { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
- { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
- { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
- { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
- { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
- { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
- { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
- { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
- { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
- { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
- { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
- { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
-
- // AVX-512 masked foldable instructions
- { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
- { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
- { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
- { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
- { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
- { X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmk, 0 },
- { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
- { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
- { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
- { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
- { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
- { X86::VPLZCNTDZrrk, X86::VPLZCNTDZrmk, 0 },
- { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
- { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
- { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
- { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
- { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
- { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
- { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
- { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
- { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
- { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
- { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
- { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 },
- { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 },
- { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
- { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
- { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
- { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
- { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
- { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
- { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
- { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
- { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
- { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
- { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
- { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
-
- // AVX-512VL 256-bit masked foldable instructions
- { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
- { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
- { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
- { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
- { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
- { X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmk, 0 },
- { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
- { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
- { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
- { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
- { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
- { X86::VPLZCNTDZ256rrk, X86::VPLZCNTDZ256rmk, 0 },
- { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
- { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
- { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
- { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
- { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
- { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
- { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
- { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
- { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
- { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
- { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
- { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
- { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
- { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
- { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
- { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
- { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
- { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
- { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
- { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
-
- // AVX-512VL 128-bit masked foldable instructions
- { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
- { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
- { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
- { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
- { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
- { X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmk, 0 },
- { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
- { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
- { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
- { X86::VPLZCNTDZ128rrk, X86::VPLZCNTDZ128rmk, 0 },
- { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
- { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
- { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
- { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
- { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
- { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
- { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
- { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
- { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
- { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
- { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
- { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
- { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
- { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
-
- // AVX-512 masked compare instructions
- { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
- { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
- { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
- { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
- { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
- { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
- { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
- { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
- { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
- { X86::VPCMPBZ256rrik, X86::VPCMPBZ256rmik, 0 },
- { X86::VPCMPBZrrik, X86::VPCMPBZrmik, 0 },
- { X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmik, 0 },
- { X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmik, 0 },
- { X86::VPCMPDZrrik, X86::VPCMPDZrmik, 0 },
- { X86::VPCMPEQBZ128rrk, X86::VPCMPEQBZ128rmk, 0 },
- { X86::VPCMPEQBZ256rrk, X86::VPCMPEQBZ256rmk, 0 },
- { X86::VPCMPEQBZrrk, X86::VPCMPEQBZrmk, 0 },
- { X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmk, 0 },
- { X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmk, 0 },
- { X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmk, 0 },
- { X86::VPCMPEQQZ128rrk, X86::VPCMPEQQZ128rmk, 0 },
- { X86::VPCMPEQQZ256rrk, X86::VPCMPEQQZ256rmk, 0 },
- { X86::VPCMPEQQZrrk, X86::VPCMPEQQZrmk, 0 },
- { X86::VPCMPEQWZ128rrk, X86::VPCMPEQWZ128rmk, 0 },
- { X86::VPCMPEQWZ256rrk, X86::VPCMPEQWZ256rmk, 0 },
- { X86::VPCMPEQWZrrk, X86::VPCMPEQWZrmk, 0 },
- { X86::VPCMPGTBZ128rrk, X86::VPCMPGTBZ128rmk, 0 },
- { X86::VPCMPGTBZ256rrk, X86::VPCMPGTBZ256rmk, 0 },
- { X86::VPCMPGTBZrrk, X86::VPCMPGTBZrmk, 0 },
- { X86::VPCMPGTDZ128rrk, X86::VPCMPGTDZ128rmk, 0 },
- { X86::VPCMPGTDZ256rrk, X86::VPCMPGTDZ256rmk, 0 },
- { X86::VPCMPGTDZrrk, X86::VPCMPGTDZrmk, 0 },
- { X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmk, 0 },
- { X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmk, 0 },
- { X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmk, 0 },
- { X86::VPCMPGTWZ128rrk, X86::VPCMPGTWZ128rmk, 0 },
- { X86::VPCMPGTWZ256rrk, X86::VPCMPGTWZ256rmk, 0 },
- { X86::VPCMPGTWZrrk, X86::VPCMPGTWZrmk, 0 },
- { X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmik, 0 },
- { X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmik, 0 },
- { X86::VPCMPQZrrik, X86::VPCMPQZrmik, 0 },
- { X86::VPCMPUBZ128rrik, X86::VPCMPUBZ128rmik, 0 },
- { X86::VPCMPUBZ256rrik, X86::VPCMPUBZ256rmik, 0 },
- { X86::VPCMPUBZrrik, X86::VPCMPUBZrmik, 0 },
- { X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmik, 0 },
- { X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmik, 0 },
- { X86::VPCMPUDZrrik, X86::VPCMPUDZrmik, 0 },
- { X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmik, 0 },
- { X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmik, 0 },
- { X86::VPCMPUQZrrik, X86::VPCMPUQZrmik, 0 },
- { X86::VPCMPUWZ128rrik, X86::VPCMPUWZ128rmik, 0 },
- { X86::VPCMPUWZ256rrik, X86::VPCMPUWZ256rmik, 0 },
- { X86::VPCMPUWZrrik, X86::VPCMPUWZrmik, 0 },
- { X86::VPCMPWZ128rrik, X86::VPCMPWZ128rmik, 0 },
- { X86::VPCMPWZ256rrik, X86::VPCMPWZ256rmik, 0 },
- { X86::VPCMPWZrrik, X86::VPCMPWZrmik, 0 },
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 3, folded load
- Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
- }
- auto I = X86InstrFMA3Info::rm_begin();
- auto E = X86InstrFMA3Info::rm_end();
- for (; I != E; ++I) {
- if (!I.getGroup()->isKMasked()) {
- // Intrinsic forms need to pass TB_NO_REVERSE.
- if (I.getGroup()->isIntrinsic()) {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE);
- } else {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
- }
- }
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
- // AVX-512 foldable masked instructions
- { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
- { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
- { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
- { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
- { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
- { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
- { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
- { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
- { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
- { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
- { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
- { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
- { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
- { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
- { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
- { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
- { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
- { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
- { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
- { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
- { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
- { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
- { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
- { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
- { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
- { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 },
- { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 },
- { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
- { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
- { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
- { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
- { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 },
- { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 },
- { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
- { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
- { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
- { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
- { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
- { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
- { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
- { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
- { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
- { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
- { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
- { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
- { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
- { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
- { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
- { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
- { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
- { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
- { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
- { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
- { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
- { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
- { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
- { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
- { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
- { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
- { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
- { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
- { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
- { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
- { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
- { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
- { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
- { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
- { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
- { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
- { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
- { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
- { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
- { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
- { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
- { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
- { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
- { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
- { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
- { X86::VPMADD52HUQZrk, X86::VPMADD52HUQZmk, 0 },
- { X86::VPMADD52LUQZrk, X86::VPMADD52LUQZmk, 0 },
- { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
- { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
- { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
- { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
- { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
- { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
- { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
- { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
- { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
- { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
- { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
- { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
- { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
- { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
- { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
- { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
- { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
- { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
- { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
- { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
- { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
- { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
- { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
- { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
- { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
- { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
- { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
- { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
- { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
- { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
- { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
- { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
- { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
- { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
- { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
- { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
- { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
- { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
- { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
- { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
- { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
- { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
- { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
- { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
- { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
- { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
- { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
- { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
- { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
- { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
- { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
- { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
- { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
- { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
- { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
- { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
- { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
- { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
- { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
- { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
- { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
- { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
- { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
- { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
- { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
- { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
- { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
- { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
- { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
- { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
- { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
- { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
- { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
- { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
-
- // AVX-512{F,VL} foldable masked instructions 256-bit
- { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
- { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
- { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
- { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
- { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
- { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
- { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
- { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
- { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
- { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
- { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 },
- { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 },
- { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 },
- { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 },
- { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
- { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
- { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
- { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
- { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
- { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
- { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
- { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
- { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
- { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
- { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
- { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
- { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
- { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
- { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
- { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
- { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
- { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
- { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
- { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
- { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
- { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
- { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
- { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
- { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
- { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
- { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
- { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
- { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
- { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
- { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
- { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
- { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
- { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
- { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
- { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
- { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
- { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
- { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
- { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
- { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
- { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
- { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
- { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
- { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
- { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
- { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
- { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
- { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
- { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
- { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
- { X86::VPMADD52HUQZ256rk, X86::VPMADD52HUQZ256mk, 0 },
- { X86::VPMADD52LUQZ256rk, X86::VPMADD52LUQZ256mk, 0 },
- { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
- { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
- { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
- { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
- { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
- { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
- { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
- { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
- { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
- { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
- { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
- { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
- { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
- { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
- { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
- { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
- { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
- { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
- { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
- { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
- { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
- { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
- { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
- { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
- { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
- { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
- { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
- { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
- { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
- { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
- { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
- { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
- { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
- { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
- { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
- { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
- { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
- { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
- { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
- { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
- { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
- { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
- { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
- { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
- { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
- { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
- { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
- { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
- { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
- { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
- { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
- { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
- { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
- { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
- { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
- { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
- { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
- { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
- { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
- { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
- { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
- { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
- { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
- { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
- { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
- { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
- { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
- { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
- { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
- { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
- { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
- { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
- { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
- { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
-
- // AVX-512{F,VL} foldable instructions 128-bit
- { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
- { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
- { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
- { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
- { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
- { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
- { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
- { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
- { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
- { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
- { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
- { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
- { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
- { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
- { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
- { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
- { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
- { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
- { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
- { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
- { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
- { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
- { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
- { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
- { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
- { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
- { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
- { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
- { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
- { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
- { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
- { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
- { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
- { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
- { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
- { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
- { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
- { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
- { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
- { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
- { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
- { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
- { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
- { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
- { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
- { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
- { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
- { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
- { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
- { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
- { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
- { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
- { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
- { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
- { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
- { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
- { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
- { X86::VPMADD52HUQZ128rk, X86::VPMADD52HUQZ128mk, 0 },
- { X86::VPMADD52LUQZ128rk, X86::VPMADD52LUQZ128mk, 0 },
- { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
- { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
- { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
- { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
- { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
- { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
- { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
- { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
- { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
- { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
- { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
- { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
- { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
- { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
- { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
- { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
- { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
- { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
- { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
- { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
- { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
- { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
- { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
- { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
- { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
- { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
- { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
- { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
- { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
- { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
- { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
- { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
- { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
- { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
- { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
- { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
- { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
- { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
- { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
- { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
- { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
- { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
- { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
- { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
- { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
- { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
- { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
- { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
- { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
- { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
- { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
- { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
- { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
- { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
- { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
- { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
- { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
- { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
- { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
- { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
- { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
- { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
- { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
- { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
- { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
- { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
- { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
- { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
- { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
- { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
- { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
- { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
- { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
- { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
-
- // 512-bit three source instructions with zero masking.
- { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
- { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
- { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
- { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
- { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
- { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
- { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
- { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
- { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
- { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
- { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
- { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
- { X86::VPMADD52HUQZrkz, X86::VPMADD52HUQZmkz, 0 },
- { X86::VPMADD52LUQZrkz, X86::VPMADD52LUQZmkz, 0 },
- { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
- { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
-
- // 256-bit three source instructions with zero masking.
- { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
- { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
- { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
- { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
- { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
- { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
- { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
- { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
- { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
- { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
- { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
- { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
- { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 },
- { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 },
- { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
- { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
-
- // 128-bit three source instructions with zero masking.
- { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
- { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
- { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
- { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
- { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
- { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
- { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
- { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
- { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
- { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
- { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
- { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
- { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 },
- { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 },
- { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
- { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
- };
-
- for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- Entry.RegOp, Entry.MemOp,
- // Index 4, folded load
- Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
- }
- for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
- if (I.getGroup()->isKMasked()) {
- // Intrinsics need to pass TB_NO_REVERSE.
- if (I.getGroup()->isIntrinsic()) {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
- } else {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
- }
- }
- }
-}
-
-void
-X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
- MemOp2RegOpTableType &M2RTable,
- uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
- if ((Flags & TB_NO_FORWARD) == 0) {
- assert(!R2MTable.count(RegOp) && "Duplicate entry!");
- R2MTable[RegOp] = std::make_pair(MemOp, Flags);
- }
- if ((Flags & TB_NO_REVERSE) == 0) {
- assert(!M2RTable.count(MemOp) &&
- "Duplicated entries in unfolding maps?");
- M2RTable[MemOp] = std::make_pair(RegOp, Flags);
- }
}
bool
@@ -3867,156 +206,183 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
return false;
}
-static bool isFrameLoadOpcode(int Opcode) {
+static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
switch (Opcode) {
default:
return false;
case X86::MOV8rm:
+ case X86::KMOVBkm:
+ MemBytes = 1;
+ return true;
case X86::MOV16rm:
+ case X86::KMOVWkm:
+ MemBytes = 2;
+ return true;
case X86::MOV32rm:
+ case X86::MOVSSrm:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSrm:
+ case X86::KMOVDkm:
+ MemBytes = 4;
+ return true;
case X86::MOV64rm:
case X86::LD_Fp64m:
- case X86::MOVSSrm:
case X86::MOVSDrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVSDZrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::KMOVQkm:
+ MemBytes = 8;
+ return true;
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
- case X86::VMOVSSrm:
- case X86::VMOVSDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
- case X86::VMOVUPSYrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ MemBytes = 16;
+ return true;
case X86::VMOVAPSYrm:
- case X86::VMOVUPDYrm:
+ case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
- case X86::VMOVDQUYrm:
+ case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
- case X86::MMX_MOVD64rm:
- case X86::MMX_MOVQ64rm:
- case X86::VMOVSSZrm:
- case X86::VMOVSDZrm:
- case X86::VMOVAPSZrm:
- case X86::VMOVAPSZ128rm:
+ case X86::VMOVDQUYrm:
case X86::VMOVAPSZ256rm:
- case X86::VMOVAPSZ128rm_NOVLX:
- case X86::VMOVAPSZ256rm_NOVLX:
- case X86::VMOVUPSZrm:
- case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
- case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
- case X86::VMOVAPDZrm:
- case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
- case X86::VMOVUPDZrm:
- case X86::VMOVUPDZ128rm:
case X86::VMOVUPDZ256rm:
- case X86::VMOVDQA32Zrm:
- case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Z256rm:
case X86::VMOVDQA32Z256rm:
- case X86::VMOVDQU32Zrm:
- case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU32Z256rm:
- case X86::VMOVDQA64Zrm:
- case X86::VMOVDQA64Z128rm:
case X86::VMOVDQA64Z256rm:
- case X86::VMOVDQU64Zrm:
- case X86::VMOVDQU64Z128rm:
case X86::VMOVDQU64Z256rm:
+ MemBytes = 32;
+ return true;
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVUPDZrm:
case X86::VMOVDQU8Zrm:
- case X86::VMOVDQU8Z128rm:
- case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU16Zrm:
- case X86::VMOVDQU16Z128rm:
- case X86::VMOVDQU16Z256rm:
- case X86::KMOVBkm:
- case X86::KMOVWkm:
- case X86::KMOVDkm:
- case X86::KMOVQkm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU64Zrm:
+ MemBytes = 64;
return true;
}
}
-static bool isFrameStoreOpcode(int Opcode) {
+static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
switch (Opcode) {
- default: break;
+ default:
+ return false;
case X86::MOV8mr:
+ case X86::KMOVBmk:
+ MemBytes = 1;
+ return true;
case X86::MOV16mr:
+ case X86::KMOVWmk:
+ MemBytes = 2;
+ return true;
case X86::MOV32mr:
+ case X86::MOVSSmr:
+ case X86::VMOVSSmr:
+ case X86::VMOVSSZmr:
+ case X86::KMOVDmk:
+ MemBytes = 4;
+ return true;
case X86::MOV64mr:
case X86::ST_FpP64m:
- case X86::MOVSSmr:
case X86::MOVSDmr:
+ case X86::VMOVSDmr:
+ case X86::VMOVSDZmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
+ case X86::KMOVQmk:
+ MemBytes = 8;
+ return true;
case X86::MOVAPSmr:
case X86::MOVUPSmr:
case X86::MOVAPDmr:
case X86::MOVUPDmr:
case X86::MOVDQAmr:
case X86::MOVDQUmr:
- case X86::VMOVSSmr:
- case X86::VMOVSDmr:
case X86::VMOVAPSmr:
case X86::VMOVUPSmr:
case X86::VMOVAPDmr:
case X86::VMOVUPDmr:
case X86::VMOVDQAmr:
case X86::VMOVDQUmr:
+ case X86::VMOVUPSZ128mr:
+ case X86::VMOVAPSZ128mr:
+ case X86::VMOVUPSZ128mr_NOVLX:
+ case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVUPDZ128mr:
+ case X86::VMOVAPDZ128mr:
+ case X86::VMOVDQA32Z128mr:
+ case X86::VMOVDQU32Z128mr:
+ case X86::VMOVDQA64Z128mr:
+ case X86::VMOVDQU64Z128mr:
+ case X86::VMOVDQU8Z128mr:
+ case X86::VMOVDQU16Z128mr:
+ MemBytes = 16;
+ return true;
case X86::VMOVUPSYmr:
case X86::VMOVAPSYmr:
case X86::VMOVUPDYmr:
case X86::VMOVAPDYmr:
case X86::VMOVDQUYmr:
case X86::VMOVDQAYmr:
- case X86::VMOVSSZmr:
- case X86::VMOVSDZmr:
- case X86::VMOVUPSZmr:
- case X86::VMOVUPSZ128mr:
case X86::VMOVUPSZ256mr:
- case X86::VMOVUPSZ128mr_NOVLX:
- case X86::VMOVUPSZ256mr_NOVLX:
- case X86::VMOVAPSZmr:
- case X86::VMOVAPSZ128mr:
case X86::VMOVAPSZ256mr:
- case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVUPSZ256mr_NOVLX:
case X86::VMOVAPSZ256mr_NOVLX:
- case X86::VMOVUPDZmr:
- case X86::VMOVUPDZ128mr:
case X86::VMOVUPDZ256mr:
- case X86::VMOVAPDZmr:
- case X86::VMOVAPDZ128mr:
case X86::VMOVAPDZ256mr:
- case X86::VMOVDQA32Zmr:
- case X86::VMOVDQA32Z128mr:
+ case X86::VMOVDQU8Z256mr:
+ case X86::VMOVDQU16Z256mr:
case X86::VMOVDQA32Z256mr:
- case X86::VMOVDQU32Zmr:
- case X86::VMOVDQU32Z128mr:
case X86::VMOVDQU32Z256mr:
- case X86::VMOVDQA64Zmr:
- case X86::VMOVDQA64Z128mr:
case X86::VMOVDQA64Z256mr:
- case X86::VMOVDQU64Zmr:
- case X86::VMOVDQU64Z128mr:
case X86::VMOVDQU64Z256mr:
+ MemBytes = 32;
+ return true;
+ case X86::VMOVUPSZmr:
+ case X86::VMOVAPSZmr:
+ case X86::VMOVUPDZmr:
+ case X86::VMOVAPDZmr:
case X86::VMOVDQU8Zmr:
- case X86::VMOVDQU8Z128mr:
- case X86::VMOVDQU8Z256mr:
case X86::VMOVDQU16Zmr:
- case X86::VMOVDQU16Z128mr:
- case X86::VMOVDQU16Z256mr:
- case X86::MMX_MOVD64mr:
- case X86::MMX_MOVQ64mr:
- case X86::MMX_MOVNTQmr:
- case X86::KMOVBmk:
- case X86::KMOVWmk:
- case X86::KMOVDmk:
- case X86::KMOVQmk:
+ case X86::VMOVDQA32Zmr:
+ case X86::VMOVDQU32Zmr:
+ case X86::VMOVDQA64Zmr:
+ case X86::VMOVDQU64Zmr:
+ MemBytes = 64;
return true;
}
return false;
@@ -4024,7 +390,14 @@ static bool isFrameStoreOpcode(int Opcode) {
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameLoadOpcode(MI.getOpcode()))
+ unsigned Dummy;
+ return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
return MI.getOperand(0).getReg();
return 0;
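
For illustration, a caller of the new MemBytes-returning overload might look like the sketch below; the TII, MFI and MI names are assumed to be in scope and are not part of this change.

    // Check that a reload reads back the entire spilled object.
    int FrameIndex;
    unsigned MemBytes;
    if (unsigned Reg = TII->isLoadFromStackSlot(MI, FrameIndex, MemBytes)) {
      // MemBytes is the access width classified above (1..64 bytes).
      if ((int64_t)MemBytes == MFI.getObjectSize(FrameIndex)) {
        // The load covers the whole slot, so Reg is a full reload of the
        // spilled value rather than a partial access.
      }
    }
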
@@ -4032,7 +405,8 @@ unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameLoadOpcode(MI.getOpcode())) {
+ unsigned Dummy;
+ if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
unsigned Reg;
if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
return Reg;
@@ -4045,7 +419,14 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameStoreOpcode(MI.getOpcode()))
+ unsigned Dummy;
+ return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
isFrameOperand(MI, 0, FrameIndex))
return MI.getOperand(X86::AddrNumOperands).getReg();
@@ -4054,7 +435,8 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameStoreOpcode(MI.getOpcode())) {
+ unsigned Dummy;
+ if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
unsigned Reg;
if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
return Reg;
@@ -4225,8 +607,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
// This instruction defines EFLAGS, no need to look any further.
return true;
++Iter;
- // Skip over DBG_VALUE.
- while (Iter != E && Iter->isDebugValue())
+ // Skip over debug instructions.
+ while (Iter != E && Iter->isDebugInstr())
++Iter;
}
@@ -4248,8 +630,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
return !MBB.isLiveIn(X86::EFLAGS);
--Iter;
- // Skip over DBG_VALUE.
- while (Iter != B && Iter->isDebugValue())
+ // Skip over debug instructions.
+ while (Iter != B && Iter->isDebugInstr())
--Iter;
bool SawKill = false;
@@ -4928,34 +1310,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
/// Case 0 - Possible to commute the first and second operands.
/// Case 1 - Possible to commute the first and third operands.
/// Case 2 - Possible to commute the second and third operands.
-static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
- unsigned SrcOpIdx2) {
+static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
// Put the lowest index to SrcOpIdx1 to simplify the checks below.
if (SrcOpIdx1 > SrcOpIdx2)
std::swap(SrcOpIdx1, SrcOpIdx2);
unsigned Op1 = 1, Op2 = 2, Op3 = 3;
if (X86II::isKMasked(TSFlags)) {
- // The k-mask operand cannot be commuted.
- if (SrcOpIdx1 == 2)
- return -1;
-
- // For k-zero-masked operations it is Ok to commute the first vector
- // operand.
- // For regular k-masked operations a conservative choice is done as the
- // elements of the first vector operand, for which the corresponding bit
- // in the k-mask operand is set to 0, are copied to the result of the
- // instruction.
- // TODO/FIXME: The commute still may be legal if it is known that the
- // k-mask operand is set to either all ones or all zeroes.
- // It is also Ok to commute the 1st operand if all users of MI use only
- // the elements enabled by the k-mask operand. For example,
- // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
- // : v1[i];
- // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
- // // Ok, to commute v1 in FMADD213PSZrk.
- if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
- return -1;
Op2++;
Op3++;
}
@@ -4966,7 +1328,7 @@ static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
return 1;
if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
return 2;
- return -1;
+ llvm_unreachable("Unknown three src commute case.");
}
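
Restated for clarity: the helper below is only an illustration of the mapping the function above encodes (it assumes, as the function does, that a k-mask operand sits at index 2 and that the lower index is passed first); it is not code from this change.

    // Map a pair of source-operand indices (Lo < Hi) to the commute case:
    //   0: swap sources 1 and 2, 1: swap sources 1 and 3, 2: swap sources 2 and 3.
    static unsigned commuteCaseFor(unsigned Lo, unsigned Hi, bool KMasked) {
      unsigned Op1 = 1, Op2 = 2, Op3 = 3;
      if (KMasked) { ++Op2; ++Op3; } // the k-mask occupies operand index 2
      if (Lo == Op1 && Hi == Op2) return 0;
      if (Lo == Op1 && Hi == Op3) return 1;
      return 2;                      // Lo == Op2 && Hi == Op3
    }
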
unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
@@ -4975,23 +1337,19 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
unsigned Opc = MI.getOpcode();
- // Put the lowest index to SrcOpIdx1 to simplify the checks below.
- if (SrcOpIdx1 > SrcOpIdx2)
- std::swap(SrcOpIdx1, SrcOpIdx2);
-
// TODO: Commuting the 1st operand of FMA*_Int requires some additional
// analysis. The commute optimization is legal only if all users of FMA*_Int
// use only the lowest element of the FMA*_Int instruction. Such analysis are
// not implemented yet. So, just return 0 in that case.
// When such analysis are available this place will be the right place for
// calling it.
- if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
- return 0;
+ assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
+ "Intrinsic instructions can't commute operand 1");
// Determine which case this commute is or if it can't be done.
- int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
- if (Case < 0)
- return 0;
+ unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
+ SrcOpIdx2);
+ assert(Case < 3 && "Unexpected case number!");
// Define the FMA forms mapping array that helps to map input FMA form
// to output FMA form to preserve the operation semantics after
@@ -5018,15 +1376,9 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
};
unsigned FMAForms[3];
- if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
- FMAForms[0] = FMA3Group.getReg132Opcode();
- FMAForms[1] = FMA3Group.getReg213Opcode();
- FMAForms[2] = FMA3Group.getReg231Opcode();
- } else {
- FMAForms[0] = FMA3Group.getMem132Opcode();
- FMAForms[1] = FMA3Group.getMem213Opcode();
- FMAForms[2] = FMA3Group.getMem231Opcode();
- }
+ FMAForms[0] = FMA3Group.get132Opcode();
+ FMAForms[1] = FMA3Group.get213Opcode();
+ FMAForms[2] = FMA3Group.get231Opcode();
unsigned FormIndex;
for (FormIndex = 0; FormIndex < 3; FormIndex++)
if (Opc == FMAForms[FormIndex])
@@ -5037,14 +1389,12 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
return FMAForms[FormIndex];
}
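
The forms-mapping array mentioned above (its body is elided from this hunk) can be reconstructed from the semantics of the three FMA forms; the table below is such a reconstruction, shown for illustration rather than copied from the source. With d the destination and s1..s3 the sources, 132 computes d = s1*s3 + s2, 213 computes d = s2*s1 + s3, and 231 computes d = s2*s3 + s1, so swapping the two multiplied sources keeps the form while swapping a multiplied source with the added one switches forms.

    // Indices are 0 = 132, 1 = 213, 2 = 231 (matching FMAForms above).
    static const unsigned FormMapping[3][3] = {
      { 2, 1, 0 },  // case 0 (swap s1,s2): 132->231, 213->213, 231->132
      { 0, 2, 1 },  // case 1 (swap s1,s3): 132->132, 213->231, 231->213
      { 1, 0, 2 },  // case 2 (swap s2,s3): 132->213, 213->132, 231->231
    };
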
-static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
unsigned SrcOpIdx2) {
- uint64_t TSFlags = MI.getDesc().TSFlags;
-
// Determine which case this commute is or if it can't be done.
- int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
- if (Case < 0)
- return false;
+ unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
+ SrcOpIdx2);
+ assert(Case < 3 && "Unexpected case value!");
// For each case we need to swap two pairs of bits in the final immediate.
static const uint8_t SwapMasks[3][4] = {
@@ -5063,11 +1413,9 @@ static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
-
- return true;
}
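
For background, the 8-bit VPTERNLOG immediate is a truth table indexed by the three source bits, so commuting two sources only permutes table entries; the SwapMasks pairs above pick out exactly the entries whose indices differ under that permutation. A generic (and slower) way to compute the same result, shown purely as an illustration and assuming <cstdint> and <utility> are available:

    // Recompute a ternary-logic truth table after swapping two of the three
    // sources.  Bit i of Imm is the result for inputs
    // (src1, src2, src3) = ((i >> 2) & 1, (i >> 1) & 1, i & 1).
    static uint8_t swapTernlogSources(uint8_t Imm, unsigned A, unsigned B) {
      uint8_t New = 0;
      for (unsigned i = 0; i != 8; ++i) {
        unsigned In[3] = {(i >> 2) & 1, (i >> 1) & 1, i & 1};
        std::swap(In[A], In[B]);      // A and B are 0-based source numbers
        unsigned j = (In[0] << 2) | (In[1] << 1) | In[2];
        if (Imm & (1 << i))
          New |= 1 << j;
      }
      return New;
    }
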
-// Returns true if this is a VPERMI2 or VPERMT2 instrution that can be
+// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
// commuted.
static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
#define VPERM_CASES(Suffix) \
@@ -5108,7 +1456,7 @@ static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
}
// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
-// from the I opcod to the T opcode and vice versa.
+// from the I opcode to the T opcode and vice versa.
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
#define VPERM_CASES(Orig, New) \
case X86::Orig##128rr: return X86::New##128rr; \
@@ -5200,9 +1548,29 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
- case X86::PBLENDWrri:
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
+ // If we're optimizing for size, try to use MOVSD/MOVSS.
+ if (MI.getParent()->getParent()->getFunction().optForSize()) {
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
+ case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
+ case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
+ case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
+ }
+ if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.RemoveOperand(3);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
+ /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ }
+ LLVM_FALLTHROUGH;
+ case X86::PBLENDWrri:
case X86::VBLENDPDYrri:
case X86::VBLENDPSYrri:
case X86::VPBLENDDrri:
@@ -5236,8 +1604,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VMOVSDrr:
case X86::VMOVSSrr:{
// On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
- if (!Subtarget.hasSSE41())
- return nullptr;
+ assert(Subtarget.hasSSE41() && "Commuting MOVSD/MOVSS requires SSE41!");
unsigned Mask, Opc;
switch (MI.getOpcode()) {
@@ -5270,37 +1637,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
- case X86::CMPSDrr:
- case X86::CMPSSrr:
- case X86::CMPPDrri:
- case X86::CMPPSrri:
- case X86::VCMPSDrr:
- case X86::VCMPSSrr:
- case X86::VCMPPDrri:
- case X86::VCMPPSrri:
- case X86::VCMPPDYrri:
- case X86::VCMPPSYrri:
- case X86::VCMPSDZrr:
- case X86::VCMPSSZrr:
- case X86::VCMPPDZrri:
- case X86::VCMPPSZrri:
- case X86::VCMPPDZ128rri:
- case X86::VCMPPSZ128rri:
- case X86::VCMPPDZ256rri:
- case X86::VCMPPSZ256rri: {
- // Float comparison can be safely commuted for
- // Ordered/Unordered/Equal/NotEqual tests
- unsigned Imm = MI.getOperand(3).getImm() & 0x7;
- switch (Imm) {
- case 0x00: // EQUAL
- case 0x03: // UNORDERED
- case 0x04: // NOT EQUAL
- case 0x07: // ORDERED
- return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
- default:
- return nullptr;
- }
- }
case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
@@ -5327,18 +1663,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
- switch (Imm) {
- default: llvm_unreachable("Unreachable!");
- case 0x01: Imm = 0x06; break; // LT -> NLE
- case 0x02: Imm = 0x05; break; // LE -> NLT
- case 0x05: Imm = 0x02; break; // NLT -> LE
- case 0x06: Imm = 0x01; break; // NLE -> LT
- case 0x00: // EQ
- case 0x03: // FALSE
- case 0x04: // NE
- case 0x07: // TRUE
- break;
- }
+ Imm = X86::getSwappedVPCMPImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -5350,18 +1675,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPCOMWri: case X86::VPCOMUWri: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
- switch (Imm) {
- default: llvm_unreachable("Unreachable!");
- case 0x00: Imm = 0x02; break; // LT -> GT
- case 0x01: Imm = 0x03; break; // LE -> GE
- case 0x02: Imm = 0x00; break; // GT -> LT
- case 0x03: Imm = 0x01; break; // GE -> LE
- case 0x04: // EQ
- case 0x05: // NE
- case 0x06: // FALSE
- case 0x07: // TRUE
- break;
- }
+ Imm = X86::getSwappedVPCOMImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -5379,15 +1693,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
OpIdx1, OpIdx2);
}
case X86::MOVHLPSrr:
- case X86::UNPCKHPDrr: {
- if (!Subtarget.hasSSE2())
- return nullptr;
+ case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr: {
+ assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
unsigned Opc = MI.getOpcode();
switch (Opc) {
- default: llvm_unreachable("Unreachable!");
- case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
- case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+ case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
+ case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
+ case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
+ case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
@@ -5498,8 +1819,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz: {
auto &WorkingMI = cloneIfNew(MI);
- if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
- return nullptr;
+ commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
@@ -5512,13 +1832,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
OpIdx1, OpIdx2);
}
- const X86InstrFMA3Group *FMA3Group =
- X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
+ MI.getDesc().TSFlags);
if (FMA3Group) {
unsigned Opc =
getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
- if (Opc == 0)
- return nullptr;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -5530,27 +1848,32 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
}
}
-bool X86InstrInfo::findFMA3CommutedOpIndices(
- const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
- const X86InstrFMA3Group &FMA3Group) const {
-
- if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2))
- return false;
-
- // Check if we can adjust the opcode to preserve the semantics when
- // commute the register operands.
- return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
-}
-
-bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const {
+bool
+X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2,
+ bool IsIntrinsic) const {
uint64_t TSFlags = MI.getDesc().TSFlags;
unsigned FirstCommutableVecOp = 1;
unsigned LastCommutableVecOp = 3;
- unsigned KMaskOp = 0;
+ unsigned KMaskOp = -1U;
if (X86II::isKMasked(TSFlags)) {
+ // For k-zero-masked operations it is Ok to commute the first vector
+ // operand.
+ // For regular k-masked operations a conservative choice is done as the
+ // elements of the first vector operand, for which the corresponding bit
+ // in the k-mask operand is set to 0, are copied to the result of the
+ // instruction.
+ // TODO/FIXME: The commute still may be legal if it is known that the
+ // k-mask operand is set to either all ones or all zeroes.
+ // It is also Ok to commute the 1st operand if all users of MI use only
+ // the elements enabled by the k-mask operand. For example,
+ // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+ // : v1[i];
+ // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+ // // Ok, to commute v1 in FMADD213PSZrk.
+
// The k-mask operand has index = 2 for masked and zero-masked operations.
KMaskOp = 2;
@@ -5560,6 +1883,10 @@ bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
FirstCommutableVecOp = 3;
LastCommutableVecOp++;
+ } else if (IsIntrinsic) {
+ // Commuting the first operand of an intrinsic instruction isn't possible
+ // unless we can prove that only the lowest element of the result is used.
+ FirstCommutableVecOp = 2;
}
if (isMem(MI, LastCommutableVecOp))
@@ -5666,11 +1993,19 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::MOVSDrr:
case X86::MOVSSrr:
case X86::VMOVSDrr:
- case X86::VMOVSSrr: {
+ case X86::VMOVSSrr:
if (Subtarget.hasSSE41())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
- }
+ case X86::MOVHLPSrr:
+ case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr:
+ if (Subtarget.hasSSE2())
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
@@ -5722,7 +2057,7 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VPMADD52LUQZrkz: {
unsigned CommutableOpIdx1 = 2;
unsigned CommutableOpIdx2 = 3;
- if (Desc.TSFlags & X86II::EVEX_K) {
+ if (X86II::isKMasked(Desc.TSFlags)) {
// Skip the mask register.
++CommutableOpIdx1;
++CommutableOpIdx2;
@@ -5738,14 +2073,15 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
}
default:
- const X86InstrFMA3Group *FMA3Group =
- X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
+ MI.getDesc().TSFlags);
if (FMA3Group)
- return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
+ FMA3Group->isIntrinsic());
    // Handle masked instructions since we need to skip over the mask input
// and the preserved input.
- if (Desc.TSFlags & X86II::EVEX_K) {
+ if (X86II::isKMasked(Desc.TSFlags)) {
// First assume that the first input is the mask operand and skip past it.
unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
@@ -5758,11 +2094,11 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
// be a 3 input instruction and we want the first two non-mask inputs.
// Otherwise this is a 2 input instruction with a preserved input and
// mask, so we need to move the indices to skip one more input.
- if (Desc.TSFlags & X86II::EVEX_Z)
- --CommutableOpIdx1;
- else {
+ if (X86II::isKMergeMasked(Desc.TSFlags)) {
++CommutableOpIdx1;
++CommutableOpIdx2;
+ } else {
+ --CommutableOpIdx1;
}
}
@@ -6061,6 +2397,59 @@ unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
}
}
+/// Get the VPCMP immediate for the given condition.
+unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: return 4;
+ case ISD::SETEQ: return 0;
+ case ISD::SETULT:
+ case ISD::SETLT: return 1;
+ case ISD::SETUGT:
+ case ISD::SETGT: return 6;
+ case ISD::SETUGE:
+ case ISD::SETGE: return 5;
+ case ISD::SETULE:
+ case ISD::SETLE: return 2;
+ }
+}
+
+/// Get the VPCMP immediate if the opcodes are swapped.
+unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x01: Imm = 0x06; break; // LT -> NLE
+ case 0x02: Imm = 0x05; break; // LE -> NLT
+ case 0x05: Imm = 0x02; break; // NLT -> LE
+ case 0x06: Imm = 0x01; break; // NLE -> LT
+ case 0x00: // EQ
+ case 0x03: // FALSE
+ case 0x04: // NE
+ case 0x07: // TRUE
+ break;
+ }
+
+ return Imm;
+}
+
+/// Get the VPCOM immediate if the opcodes are swapped.
+unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x00: Imm = 0x02; break; // LT -> GT
+ case 0x01: Imm = 0x03; break; // LE -> GE
+ case 0x02: Imm = 0x00; break; // GT -> LT
+ case 0x03: Imm = 0x01; break; // GE -> LE
+ case 0x04: // EQ
+ case 0x05: // NE
+ case 0x06: // FALSE
+ case 0x07: // TRUE
+ break;
+ }
+
+ return Imm;
+}
+
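
These helpers encode the fact that a VPCMP / VPCOM predicate stays correct under operand swapping once the immediate is replaced by its mirrored counterpart (LT becomes NLE, LE becomes NLT, and the symmetric predicates are unchanged). A scalar model of that property, shown only as an illustration and not taken from this change:

    // Scalar model of the signed VPCMP predicates.  For all A, B and Imm:
    //   vpcmpScalar(A, B, Imm) == vpcmpScalar(B, A, X86::getSwappedVPCMPImm(Imm))
    static bool vpcmpScalar(int A, int B, unsigned Imm) {
      switch (Imm & 0x7) {
      case 0x0: return A == B;  // EQ
      case 0x1: return A <  B;  // LT
      case 0x2: return A <= B;  // LE
      case 0x3: return false;   // FALSE
      case 0x4: return A != B;  // NE
      case 0x5: return A >= B;  // NLT
      case 0x6: return A >  B;  // NLE
      default:  return true;    // TRUE
      }
    }
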
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
if (!MI.isTerminator()) return false;
@@ -6125,7 +2514,7 @@ void X86InstrInfo::replaceBranchWithTailCall(
MachineBasicBlock::iterator I = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (!I->isBranch())
assert(0 && "Can't find the branch to replace!");
@@ -6193,7 +2582,7 @@ bool X86InstrInfo::AnalyzeBranchImpl(
MachineBasicBlock::iterator UnCondBrIter = MBB.end();
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
// Working from the bottom, when we see a non-terminator instruction, we're
@@ -6430,7 +2819,7 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
while (I != MBB.begin()) {
--I;
- if (I->isDebugValue())
+ if (I->isDebugInstr())
continue;
if (I->getOpcode() != X86::JMP_1 &&
X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
@@ -6562,7 +2951,7 @@ static bool isHReg(unsigned Reg) {
}
// Try and copy between VR128/VR64 and GR64 registers.
-static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
const X86Subtarget &Subtarget) {
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
@@ -6718,11 +3107,22 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
report_fatal_error("Unable to copy EFLAGS physical register!");
}
- DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
- << " to " << RI.getName(DestReg) << '\n');
+ LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
+ << RI.getName(DestReg) << '\n');
llvm_unreachable("Cannot emit physreg copy instruction");
}
+bool X86InstrInfo::isCopyInstr(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
+ if (MI.isMoveReg()) {
+ Dest = &MI.getOperand(0);
+ Src = &MI.getOperand(1);
+ return true;
+ }
+ return false;
+}
+
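
A hypothetical use of the new hook (the TII and MI names are assumed, not part of this change): generic passes can ask whether an instruction behaves like a plain register-to-register move and, if so, retrieve its source and destination operands without knowing the target opcode.

    const MachineOperand *Src = nullptr, *Dst = nullptr;
    if (TII->isCopyInstr(MI, Src, Dst)) {
      // MI behaves like a COPY; for example, a move onto itself is dead.
      if (Src->getReg() == Dst->getReg() && Src->getSubReg() == Dst->getSubReg()) {
        // Candidate for removal.
      }
    }
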
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
@@ -6757,8 +3157,10 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
(HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
- if (X86::VK32RegClass.hasSubClassEq(RC))
+ if (X86::VK32RegClass.hasSubClassEq(RC)) {
+ assert(STI.hasBWI() && "KMOVD requires BWI");
return load ? X86::KMOVDkm : X86::KMOVDmk;
+ }
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
@@ -6771,8 +3173,10 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp64m : X86::ST_Fp64m;
- if (X86::VK64RegClass.hasSubClassEq(RC))
+ if (X86::VK64RegClass.hasSubClassEq(RC)) {
+ assert(STI.hasBWI() && "KMOVQ requires BWI");
return load ? X86::KMOVQkm : X86::KMOVQmk;
+ }
llvm_unreachable("Unknown 8-byte regclass");
case 10:
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
@@ -6803,9 +3207,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
}
if (X86::BNDRRegClass.hasSubClassEq(RC)) {
if (STI.is64Bit())
- return load ? X86::BNDMOVRM64rm : X86::BNDMOVMR64mr;
+ return load ? X86::BNDMOV64rm : X86::BNDMOV64mr;
else
- return load ? X86::BNDMOVRM32rm : X86::BNDMOVMR32mr;
+ return load ? X86::BNDMOV32rm : X86::BNDMOV32mr;
}
llvm_unreachable("Unknown 16-byte regclass");
}
@@ -7200,6 +3604,13 @@ static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
case X86::TZCNT32rr: case X86::TZCNT32rm:
case X86::TZCNT64rr: case X86::TZCNT64rm:
return X86::COND_B;
+ case X86::BSF16rr:
+ case X86::BSF16rm:
+ case X86::BSF32rr:
+ case X86::BSF32rm:
+ case X86::BSF64rr:
+ case X86::BSF64rm:
+ return X86::COND_E;
}
}
@@ -7751,6 +4162,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(X86::SBB32rr));
case X86::SETB_C64r:
return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+ case X86::MMX_SET0:
+ return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
@@ -7854,9 +4267,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::VMOVUPSZ256mr_NOVLX:
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
- case X86::TEST8ri_NOREX:
- MI.setDesc(get(X86::TEST8ri));
- return true;
case X86::MOV32ri64:
MI.setDesc(get(X86::MOV32ri));
return true;
@@ -7900,7 +4310,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
///
/// FIXME: This should be turned into a TSFlags.
///
-static bool hasPartialRegUpdate(unsigned Opcode) {
+static bool hasPartialRegUpdate(unsigned Opcode,
+ const X86Subtarget &Subtarget) {
switch (Opcode) {
case X86::CVTSI2SSrr:
case X86::CVTSI2SSrm:
@@ -7939,17 +4350,32 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::SQRTSDr_Int:
case X86::SQRTSDm_Int:
return true;
+ // GPR
+ case X86::POPCNT32rm:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rm:
+ case X86::POPCNT64rr:
+ return Subtarget.hasPOPCNTFalseDeps();
+ case X86::LZCNT32rm:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rm:
+ case X86::LZCNT64rr:
+ case X86::TZCNT32rm:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rm:
+ case X86::TZCNT64rr:
+ return Subtarget.hasLZCNTFalseDeps();
}
return false;
}
-/// Inform the ExecutionDepsFix pass how many idle
+/// Inform the BreakFalseDeps pass how many idle
/// instructions we would like before a partial register update.
unsigned X86InstrInfo::getPartialRegUpdateClearance(
const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
- if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
+ if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
return 0;
// If MI is marked as reading Reg, the partial register update is wanted.
@@ -8071,20 +4497,51 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTSS2SDZrrb_Int:
case X86::VCVTSS2SDZrm:
case X86::VCVTSS2SDZrm_Int:
- case X86::VRNDSCALESDr:
- case X86::VRNDSCALESDr_Int:
- case X86::VRNDSCALESDrb_Int:
- case X86::VRNDSCALESDm:
- case X86::VRNDSCALESDm_Int:
- case X86::VRNDSCALESSr:
- case X86::VRNDSCALESSr_Int:
- case X86::VRNDSCALESSrb_Int:
- case X86::VRNDSCALESSm:
- case X86::VRNDSCALESSm_Int:
- case X86::VRCP14SSrr:
- case X86::VRCP14SSrm:
- case X86::VRSQRT14SSrr:
- case X86::VRSQRT14SSrm:
+ case X86::VGETEXPSDZr:
+ case X86::VGETEXPSDZrb:
+ case X86::VGETEXPSDZm:
+ case X86::VGETEXPSSZr:
+ case X86::VGETEXPSSZrb:
+ case X86::VGETEXPSSZm:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrib:
+ case X86::VGETMANTSDZrmi:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrib:
+ case X86::VGETMANTSSZrmi:
+ case X86::VRNDSCALESDZr:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZrb_Int:
+ case X86::VRNDSCALESDZm:
+ case X86::VRNDSCALESDZm_Int:
+ case X86::VRNDSCALESSZr:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZrb_Int:
+ case X86::VRNDSCALESSZm:
+ case X86::VRNDSCALESSZm_Int:
+ case X86::VRCP14SDZrr:
+ case X86::VRCP14SDZrm:
+ case X86::VRCP14SSZrr:
+ case X86::VRCP14SSZrm:
+ case X86::VRCP28SDZr:
+ case X86::VRCP28SDZrb:
+ case X86::VRCP28SDZm:
+ case X86::VRCP28SSZr:
+ case X86::VRCP28SSZrb:
+ case X86::VRCP28SSZm:
+ case X86::VREDUCESSZrmi:
+ case X86::VREDUCESSZrri:
+ case X86::VREDUCESSZrrib:
+ case X86::VRSQRT14SDZrr:
+ case X86::VRSQRT14SDZrm:
+ case X86::VRSQRT14SSZrr:
+ case X86::VRSQRT14SSZrm:
+ case X86::VRSQRT28SDZr:
+ case X86::VRSQRT28SDZrb:
+ case X86::VRSQRT28SDZm:
+ case X86::VRSQRT28SSZr:
+ case X86::VRSQRT28SSZrb:
+ case X86::VRSQRT28SSZm:
case X86::VSQRTSSZr:
case X86::VSQRTSSZr_Int:
case X86::VSQRTSSZrb_Int:
@@ -8101,7 +4558,7 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
return false;
}
-/// Inform the ExecutionDepsFix pass how many idle instructions we would like
+/// Inform the BreakFalseDeps pass how many idle instructions we would like
/// before certain undef register reads.
///
/// This catches the VCVTSI2SD family of instructions:
@@ -8155,6 +4612,20 @@ void X86InstrInfo::breakPartialRegDependency(
.addReg(XReg, RegState::Undef)
.addReg(Reg, RegState::ImplicitDefine);
MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::GR64RegClass.contains(Reg)) {
+    // Use XOR32rr because it has a shorter encoding and also zeroes the
+    // upper 32 bits.
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::GR32RegClass.contains(Reg)) {
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
}
}
@@ -8182,6 +4653,32 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
}
}
+static void updateOperandRegConstraints(MachineFunction &MF,
+ MachineInstr &NewMI,
+ const TargetInstrInfo &TII) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+
+ for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
+ MachineOperand &MO = NewMI.getOperand(Idx);
+ // We only need to update constraints on virtual register operands.
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TRI.isVirtualRegister(Reg))
+ continue;
+
+ auto *NewRC = MRI.constrainRegClass(
+ Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
+ if (!NewRC) {
+ LLVM_DEBUG(
+ dbgs() << "WARNING: Unable to update register constraint for operand "
+ << Idx << " of instruction:\n";
+ NewMI.dump(); dbgs() << "\n");
+ }
+ }
+}
+
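
The reason for this pass over the operands: folding rewrites the instruction to a different opcode, and the new opcode may demand a narrower register class for a virtual-register operand than the one it was created with, so each class must be re-derived and intersected. A minimal sketch of the same idea for a single operand (names assumed, for illustration only):

    // Re-constrain operand 1 of a rewritten instruction to the class its new
    // opcode requires; constrainRegClass returns null if the classes are
    // incompatible, in which case the rewrite is not valid as-is.
    const TargetRegisterClass *Required =
        TII.getRegClass(NewMI.getDesc(), /*OpNum=*/1, &TRI, MF);
    if (!MRI.constrainRegClass(NewMI.getOperand(1).getReg(), Required)) {
      // Incompatible register classes; caller must insert a copy or bail out.
    }
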
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
@@ -8205,6 +4702,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
MIB.add(MO);
}
+ updateOperandRegConstraints(MF, *NewMI, TII);
+
MachineBasicBlock *MBB = InsertPt->getParent();
MBB->insert(InsertPt, NewMI);
@@ -8231,6 +4730,8 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
}
}
+ updateOperandRegConstraints(MF, *NewMI, TII);
+
MachineBasicBlock *MBB = InsertPt->getParent();
MBB->insert(InsertPt, NewMI);
@@ -8306,12 +4807,29 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
return nullptr;
}
+static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
+                                               MachineInstr &MI) {
+ if (MF.getFunction().optForSize() || !hasUndefRegUpdate(MI.getOpcode()) ||
+ !MI.getOperand(1).isReg())
+ return false;
+
+  // There are two cases we need to handle depending on where in the pipeline
+ // the folding attempt is being made.
+ // -Register has the undef flag set.
+ // -Register is produced by the IMPLICIT_DEF instruction.
+
+ if (MI.getOperand(1).isUndef())
+ return true;
+
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
+ return VRegDef && VRegDef->isImplicitDef();
+}
+
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Align, bool AllowCommute) const {
- const DenseMap<unsigned,
- std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
bool isTwoAddrFold = false;
@@ -8324,9 +4842,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MI.getOpcode() == X86::PUSH64r))
return nullptr;
- // Avoid partial register update stalls unless optimizing for size.
- // TODO: we should block undef reg update as well.
- if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().optForSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
unsigned NumOps = MI.getDesc().getNumOperands();
@@ -8339,6 +4858,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
return nullptr;
+ // GOTTPOFF relocation loads can only be folded into add instructions.
+ // FIXME: Need to exclude other relocations that only support specific
+ // instructions.
+ if (MOs.size() == X86::AddrNumOperands &&
+ MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
+ MI.getOpcode() != X86::ADD64rr)
+ return nullptr;
+
MachineInstr *NewMI = nullptr;
// Attempt to fold any custom cases we have.
@@ -8346,79 +4873,70 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
return CustomMI;
+ const X86MemoryFoldTableEntry *I = nullptr;
+
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
MI.getOperand(1).isReg() &&
MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
- OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ I = lookupTwoAddrFoldTable(MI.getOpcode());
isTwoAddrFold = true;
- } else if (OpNum == 0) {
- if (MI.getOpcode() == X86::MOV32r0) {
- NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
- if (NewMI)
- return NewMI;
+ } else {
+ if (OpNum == 0) {
+ if (MI.getOpcode() == X86::MOV32r0) {
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+ if (NewMI)
+ return NewMI;
+ }
}
- OpcodeTablePtr = &RegOp2MemOpTable0;
- } else if (OpNum == 1) {
- OpcodeTablePtr = &RegOp2MemOpTable1;
- } else if (OpNum == 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2;
- } else if (OpNum == 3) {
- OpcodeTablePtr = &RegOp2MemOpTable3;
- } else if (OpNum == 4) {
- OpcodeTablePtr = &RegOp2MemOpTable4;
- }
-
- // If table selected...
- if (OpcodeTablePtr) {
- // Find the Opcode to fuse
- auto I = OpcodeTablePtr->find(MI.getOpcode());
- if (I != OpcodeTablePtr->end()) {
- unsigned Opcode = I->second.first;
- unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
- if (Align < MinAlign)
- return nullptr;
- bool NarrowToMOV32rm = false;
- if (Size) {
- const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
- &RI, MF);
- unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if (Size < RCSize) {
- // Check if it's safe to fold the load. If the size of the object is
- // narrower than the load width, then it's not.
- if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
- return nullptr;
- // If this is a 64-bit load, but the spill slot is 32, then we can do
- // a 32-bit load which is implicitly zero-extended. This likely is
- // due to live interval analysis remat'ing a load from stack slot.
- if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
- return nullptr;
- Opcode = X86::MOV32rm;
- NarrowToMOV32rm = true;
- }
+ I = lookupFoldTable(MI.getOpcode(), OpNum);
+ }
+
+ if (I != nullptr) {
+ unsigned Opcode = I->DstOp;
+ unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+ if (Align < MinAlign)
+ return nullptr;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
+ &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ if (Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return nullptr;
+ // If this is a 64-bit load, but the spill slot is 32, then we can do
+ // a 32-bit load which is implicitly zero-extended. This likely is
+ // due to live interval analysis remat'ing a load from stack slot.
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ return nullptr;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
}
+ }
- if (isTwoAddrFold)
- NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
+
+ if (NarrowToMOV32rm) {
+ // If this is the special case where we use a MOV32rm to load a 32-bit
+ // value and zero-extend the top bits. Change the destination register
+ // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
- NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
-
- if (NarrowToMOV32rm) {
- // If this is the special case where we use a MOV32rm to load a 32-bit
- // value and zero-extend the top bits. Change the destination register
- // to a 32-bit one.
- unsigned DstReg = NewMI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
- NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
- else
- NewMI->getOperand(0).setSubReg(X86::sub_32bit);
- }
- return NewMI;
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
}
+ return NewMI;
}
// If the instruction and target operand are commutable, commute the
@@ -8492,10 +5010,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
if (NoFusing)
return nullptr;
- // Unless optimizing for size, don't fold to avoid partial
- // register update stalls
- // TODO: we should block undef reg update as well.
- if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().optForSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
// Don't fold subreg spills, or reloads that use a high subreg.
@@ -8692,9 +5210,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// Check switch flag
if (NoFusing) return nullptr;
- // Avoid partial register update stalls unless optimizing for size.
- // TODO: we should block undef reg update as well.
- if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().optForSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
// Determine the alignment of the load.
@@ -8718,6 +5237,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0:
Alignment = 16;
break;
+ case X86::MMX_SET0:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
Alignment = 8;
@@ -8751,6 +5271,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
switch (LoadMI.getOpcode()) {
+ case X86::MMX_SET0:
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
@@ -8798,6 +5319,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
+ else if (Opc == X86::MMX_SET0)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);
@@ -8833,13 +5356,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
bool X86InstrInfo::unfoldMemoryOperand(
MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
- auto I = MemOp2RegOpTable.find(MI.getOpcode());
- if (I == MemOp2RegOpTable.end())
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
+ if (I == nullptr)
return false;
- unsigned Opc = I->second.first;
- unsigned Index = I->second.second & TB_INDEX_MASK;
- bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
- bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ unsigned Opc = I->DstOp;
+ unsigned Index = I->Flags & TB_INDEX_MASK;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -8955,13 +5478,13 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
if (!N->isMachineOpcode())
return false;
- auto I = MemOp2RegOpTable.find(N->getMachineOpcode());
- if (I == MemOp2RegOpTable.end())
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
+ if (I == nullptr)
return false;
- unsigned Opc = I->second.first;
- unsigned Index = I->second.second & TB_INDEX_MASK;
- bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
- bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ unsigned Opc = I->DstOp;
+ unsigned Index = I->Flags & TB_INDEX_MASK;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -9025,6 +5548,30 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
if (Load)
BeforeOps.push_back(SDValue(Load, 0));
BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ switch (Opc) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ if (isNullConstant(BeforeOps[1])) {
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: Opc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: Opc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: Opc = X86::TEST16rr; break;
+ case X86::CMP8ri: Opc = X86::TEST8rr; break;
+ }
+ BeforeOps[1] = BeforeOps[0];
+ }
+ }
SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
NewNodes.push_back(NewNode);
@@ -9062,18 +5609,18 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
unsigned *LoadRegIndex) const {
- auto I = MemOp2RegOpTable.find(Opc);
- if (I == MemOp2RegOpTable.end())
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(Opc);
+ if (I == nullptr)
return 0;
- bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
- bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return 0;
if (UnfoldStore && !FoldedStore)
return 0;
if (LoadRegIndex)
- *LoadRegIndex = I->second.second & TB_INDEX_MASK;
- return I->second.first;
+ *LoadRegIndex = I->Flags & TB_INDEX_MASK;
+ return I->DstOp;
}
bool
@@ -9335,7 +5882,9 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
///
unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
- assert(!Subtarget.is64Bit() &&
+ assert((!Subtarget.is64Bit() ||
+ MF->getTarget().getCodeModel() == CodeModel::Medium ||
+ MF->getTarget().getCodeModel() == CodeModel::Large) &&
"X86-64 PIC uses RIP relative addressing");
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
@@ -9346,7 +5895,8 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
// Create the register. The code to initialize it is inserted
// later, by the CGBR pass (below).
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ GlobalBaseReg = RegInfo.createVirtualRegister(
+ Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
X86FI->setGlobalBaseReg(GlobalBaseReg);
return GlobalBaseReg;
}
@@ -9536,8 +6086,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
{ X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
- { X86::VBLENDPSrri, X86::VBLENDPSrri, X86::VPBLENDDrri },
- { X86::VBLENDPSrmi, X86::VBLENDPSrmi, X86::VPBLENDDrmi },
{ X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri },
{ X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi },
{ X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi },
@@ -9791,6 +6339,47 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
};
+// NOTE: These should only be used by the custom domain methods.
+static const uint16_t ReplaceableCustomInstrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
+ { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
+ { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDWrmi },
+ { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDWrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
+ { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
+};
+static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
+ { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDDYrmi },
+ { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDDYrri },
+};
+
+// Special table for changing EVEX logic instructions to VEX.
+// TODO: Should we run EVEX->VEX earlier?
+static const uint16_t ReplaceableCustomAVX512LogicInstrs[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+ { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+ { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+ { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+ { X86::VORPSrm, X86::VORPDrm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+ { X86::VORPSrr, X86::VORPDrr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+ { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+ { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+ { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+ { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+ { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+ { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+ { X86::VORPSYrm, X86::VORPDYrm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+ { X86::VORPSYrr, X86::VORPDYrr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+ { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+ { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+};
+
// FIXME: Some shuffle and unpack instructions have equivalents in different
// domains, but they require a bit more work than just switching opcodes.
@@ -9811,13 +6400,239 @@ static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
return nullptr;
}
+// Helper to attempt to widen/narrow blend masks.
+static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
+ unsigned NewWidth, unsigned *pNewMask = nullptr) {
+ assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
+ "Illegal blend mask scale");
+ unsigned NewMask = 0;
+
+ if ((OldWidth % NewWidth) == 0) {
+ unsigned Scale = OldWidth / NewWidth;
+ unsigned SubMask = (1u << Scale) - 1;
+ for (unsigned i = 0; i != NewWidth; ++i) {
+ unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
+ if (Sub == SubMask)
+ NewMask |= (1u << i);
+ else if (Sub != 0x0)
+ return false;
+ }
+ } else {
+ unsigned Scale = NewWidth / OldWidth;
+ unsigned SubMask = (1u << Scale) - 1;
+ for (unsigned i = 0; i != OldWidth; ++i) {
+ if (OldMask & (1 << i)) {
+ NewMask |= (SubMask << (i * Scale));
+ }
+ }
+ }
+
+ if (pNewMask)
+ *pNewMask = NewMask;
+ return true;
+}
+
+uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+
+ auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
+ uint16_t validDomains = 0;
+ if (MI.getOperand(NumOperands - 1).isImm()) {
+ unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
+ validDomains |= 0x2; // PackedSingle
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
+ validDomains |= 0x4; // PackedDouble
+ if (!Is256 || Subtarget.hasAVX2())
+ validDomains |= 0x8; // PackedInt
+ }
+ return validDomains;
+ };
+
+ switch (Opcode) {
+ case X86::BLENDPDrmi:
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDrri:
+ return GetBlendDomains(2, false);
+ case X86::VBLENDPDYrmi:
+ case X86::VBLENDPDYrri:
+ return GetBlendDomains(4, true);
+ case X86::BLENDPSrmi:
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSrri:
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDrri:
+ return GetBlendDomains(4, false);
+ case X86::VBLENDPSYrmi:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDYrmi:
+ case X86::VPBLENDDYrri:
+ return GetBlendDomains(8, true);
+ case X86::PBLENDWrmi:
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWrri:
+ // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
+ case X86::VPBLENDWYrmi:
+ case X86::VPBLENDWYrri:
+ return GetBlendDomains(8, false);
+ case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
+ case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
+ case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
+ case X86::VPANDQZ256rr: case X86::VPANDQZ256rm:
+ case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
+ case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
+ case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
+ case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
+ case X86::VPORDZ128rr: case X86::VPORDZ128rm:
+ case X86::VPORDZ256rr: case X86::VPORDZ256rm:
+ case X86::VPORQZ128rr: case X86::VPORQZ128rm:
+ case X86::VPORQZ256rr: case X86::VPORQZ256rm:
+ case X86::VPXORDZ128rr: case X86::VPXORDZ128rm:
+ case X86::VPXORDZ256rr: case X86::VPXORDZ256rm:
+ case X86::VPXORQZ128rr: case X86::VPXORQZ128rm:
+ case X86::VPXORQZ256rr: case X86::VPXORQZ256rm:
+    // If we don't have DQI, see if we can still switch from an EVEX integer
+    // instruction to a VEX floating point instruction.
+ if (Subtarget.hasDQI())
+ return 0;
+
+ if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
+ return 0;
+ if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
+ return 0;
+    // Register forms will have 3 operands. Memory forms will have more.
+ if (NumOperands == 3 &&
+ RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+ return 0;
+
+ // All domains are valid.
+ return 0xe;
+ }
+ return 0;
+}
+
+bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
+ unsigned Domain) const {
+ assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
+ uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ assert(dom && "Not an SSE instruction");
+
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+
+ auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
+ if (MI.getOperand(NumOperands - 1).isImm()) {
+ unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
+ Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
+ unsigned NewImm = Imm;
+
+ const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
+ if (!table)
+ table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+
+ if (Domain == 1) { // PackedSingle
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
+ } else if (Domain == 2) { // PackedDouble
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
+ } else if (Domain == 3) { // PackedInt
+ if (Subtarget.hasAVX2()) {
+ // If we are already VPBLENDW use that, else use VPBLENDD.
+ if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
+ table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
+ }
+ } else {
+ assert(!Is256 && "128-bit vector expected");
+ AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
+ }
+ }
+
+ assert(table && table[Domain - 1] && "Unknown domain op");
+ MI.setDesc(get(table[Domain - 1]));
+ MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
+ }
+ return true;
+ };
+
+ switch (Opcode) {
+ case X86::BLENDPDrmi:
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDrri:
+ return SetBlendDomain(2, false);
+ case X86::VBLENDPDYrmi:
+ case X86::VBLENDPDYrri:
+ return SetBlendDomain(4, true);
+ case X86::BLENDPSrmi:
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSrri:
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDrri:
+ return SetBlendDomain(4, false);
+ case X86::VBLENDPSYrmi:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDYrmi:
+ case X86::VPBLENDDYrri:
+ return SetBlendDomain(8, true);
+ case X86::PBLENDWrmi:
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWrri:
+ return SetBlendDomain(8, false);
+ case X86::VPBLENDWYrmi:
+ case X86::VPBLENDWYrri:
+ return SetBlendDomain(16, true);
+ case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
+ case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
+ case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
+ case X86::VPANDQZ256rr: case X86::VPANDQZ256rm:
+ case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
+ case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
+ case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
+ case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
+ case X86::VPORDZ128rr: case X86::VPORDZ128rm:
+ case X86::VPORDZ256rr: case X86::VPORDZ256rm:
+ case X86::VPORQZ128rr: case X86::VPORQZ128rm:
+ case X86::VPORQZ256rr: case X86::VPORQZ256rm:
+ case X86::VPXORDZ128rr: case X86::VPXORDZ128rm:
+ case X86::VPXORDZ256rr: case X86::VPXORDZ256rm:
+ case X86::VPXORQZ128rr: case X86::VPXORQZ128rm:
+ case X86::VPXORQZ256rr: case X86::VPXORQZ256rm: {
+ // Without DQI, convert EVEX instructions to VEX instructions.
+ if (Subtarget.hasDQI())
+ return false;
+
+ const uint16_t *table = lookupAVX512(MI.getOpcode(), dom,
+ ReplaceableCustomAVX512LogicInstrs);
+ assert(table && "Instruction not found in table?");
+    // Don't change integer Q instructions to D instructions, and
+    // use D instructions if we started with a PS instruction.
+ if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
+ MI.setDesc(get(table[Domain - 1]));
+ return true;
+ }
+ }
+ return false;
+}
+
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
unsigned opcode = MI.getOpcode();
uint16_t validDomains = 0;
if (domain) {
- if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
+ // Attempt to match for custom instructions.
+ validDomains = getExecutionDomainCustom(MI);
+ if (validDomains)
+ return std::make_pair(domain, validDomains);
+
+ if (lookup(opcode, domain, ReplaceableInstrs)) {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
@@ -9849,6 +6664,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert(Domain>0 && Domain<4 && "Invalid execution domain");
uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
assert(dom && "Not an SSE instruction");
+
+ // Attempt to match for custom instructions.
+ if (setExecutionDomainCustom(MI, Domain))
+ return;
+
const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
if (!table) { // try the other table
assert((Subtarget.hasAVX2() || Domain < 3) &&
@@ -10535,9 +7355,10 @@ namespace {
static_cast<const X86TargetMachine *>(&MF.getTarget());
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
- // Don't do anything if this is 64-bit as 64-bit PIC
- // uses RIP relative addressing.
- if (STI.is64Bit())
+ // Don't do anything in the 64-bit small and kernel code models. They use
+ // RIP-relative addressing for everything.
+ if (STI.is64Bit() && (TM->getCodeModel() == CodeModel::Small ||
+ TM->getCodeModel() == CodeModel::Kernel))
return false;
// Only emit a global base reg in PIC mode.
@@ -10564,17 +7385,41 @@ namespace {
else
PC = GlobalBaseReg;
- // Operand of MovePCtoStack is completely ignored by asm printer. It's
- // only used in JIT code emission as displacement to pc.
- BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
-
- // If we're using vanilla 'GOT' PIC style, we should use relative addressing
- // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
- if (STI.isPICStyleGOT()) {
- // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
- BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
- .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
- X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ if (STI.is64Bit()) {
+ if (TM->getCodeModel() == CodeModel::Medium) {
+ // In the medium code model, use a RIP-relative LEA to materialize the
+ // GOT.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
+ .addReg(0);
+ } else if (TM->getCodeModel() == CodeModel::Large) {
+ // Loading the GOT in the large code model requires math with labels,
+ // so we use a pseudo instruction and expand it during MC emission.
+ unsigned Scratch = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVGOT64r), PC)
+ .addReg(Scratch, RegState::Undef | RegState::Define)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+ } else {
+ llvm_unreachable("unexpected code model");
+ }
+ } else {
+ // Operand of MovePCtoStack is completely ignored by asm printer. It's
+ // only used in JIT code emission as displacement to pc.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+ // If we're using vanilla 'GOT' PIC style, we should use relative
+ // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+ if (STI.isPICStyleGOT()) {
+ // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
+ // %some_register
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ }
}
return true;
@@ -10743,21 +7588,36 @@ enum MachineOutlinerClass {
MachineOutlinerTailCall
};
-X86GenInstrInfo::MachineOutlinerInfo
-X86InstrInfo::getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const {
-
- if (RepeatedSequenceLocs[0].second->isTerminator())
- return MachineOutlinerInfo(1, // Number of instructions to emit call.
- 0, // Number of instructions to emit frame.
- MachineOutlinerTailCall, // Type of call.
- MachineOutlinerTailCall // Type of frame.
- );
-
- return MachineOutlinerInfo(1, 1, MachineOutlinerDefault,
- MachineOutlinerDefault);
+outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+ unsigned SequenceSize =
+ std::accumulate(RepeatedSequenceLocs[0].front(),
+ std::next(RepeatedSequenceLocs[0].back()), 0,
+ [](unsigned Sum, const MachineInstr &MI) {
+ // FIXME: x86 doesn't implement getInstSizeInBytes, so
+ // we can't tell the cost. Just assume each instruction
+ // is one byte.
+ if (MI.isDebugInstr() || MI.isKill())
+ return Sum;
+ return Sum + 1;
+ });
+
+ // FIXME: Use real size in bytes for call and ret instructions.
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) {
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(MachineOutlinerTailCall, 1);
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
+ 0, // Number of bytes to emit frame.
+ MachineOutlinerTailCall // Type of frame.
+ );
+ }
+
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(MachineOutlinerDefault, 1);
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 1,
+ MachineOutlinerDefault);
}
bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
@@ -10766,8 +7626,12 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
// Does the function use a red zone? If it does, then we can't risk messing
// with the stack.
- if (!F.hasFnAttribute(Attribute::NoRedZone))
+ if (!F.hasFnAttribute(Attribute::NoRedZone)) {
+ // It could have a red zone. If it does, then we don't want to touch it.
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (!X86FI || X86FI->getUsesRedZone())
return false;
+ }
// If we *don't* want to outline from things that could potentially be deduped
// then return false.
@@ -10778,26 +7642,31 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
return true;
}
-X86GenInstrInfo::MachineOutlinerInstrType
-X86InstrInfo::getOutliningType(MachineInstr &MI) const {
-
+outliner::InstrType
+X86InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const {
+ MachineInstr &MI = *MIT;
// Don't allow debug values to impact outlining type.
- if (MI.isDebugValue() || MI.isIndirectDebugValue())
- return MachineOutlinerInstrType::Invisible;
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+ // At this point, KILL instructions don't really tell us much so we can go
+ // ahead and skip over them.
+ if (MI.isKill())
+ return outliner::InstrType::Invisible;
// Is this a tail call? If yes, we can outline as a tail call.
if (isTailCall(MI))
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
// Is this the terminator of a basic block?
if (MI.isTerminator() || MI.isReturn()) {
// Does its parent have any successors in its MachineFunction?
if (MI.getParent()->succ_empty())
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
// It does, so we can't tail call it.
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
}
// Don't outline anything that modifies or reads from the stack pointer.
@@ -10812,33 +7681,33 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const {
if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Outlined calls change the instruction pointer, so don't read from it.
if (MI.readsRegister(X86::RIP, &RI) ||
MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Positions can't safely be outlined.
if (MI.isPosition())
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
// Make sure none of the operands of this instruction do anything tricky.
for (const MachineOperand &MOP : MI.operands())
if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
MOP.isTargetIndex())
- return MachineOutlinerInstrType::Illegal;
+ return outliner::InstrType::Illegal;
- return MachineOutlinerInstrType::Legal;
+ return outliner::InstrType::Legal;
}
-void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB,
MachineFunction &MF,
- const MachineOutlinerInfo &MInfo)
+ const outliner::OutlinedFunction &OF)
const {
// If we're a tail call, we already have a return, so don't do anything.
- if (MInfo.FrameConstructionID == MachineOutlinerTailCall)
+ if (OF.FrameConstructionID == MachineOutlinerTailCall)
return;
// We're a normal call, so our sequence doesn't have a return instruction.
@@ -10847,18 +7716,13 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
MBB.insert(MBB.end(), retq);
}
-void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
- MachineFunction &MF,
- const MachineOutlinerInfo &MInfo)
- const {}
-
MachineBasicBlock::iterator
X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It,
MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const {
+ const outliner::Candidate &C) const {
// Is it a tail call?
- if (MInfo.CallConstructionID == MachineOutlinerTailCall) {
+ if (C.CallConstructionID == MachineOutlinerTailCall) {
// Yes, just insert a JMP.
It = MBB.insert(It,
BuildMI(MF, DebugLoc(), get(X86::JMP_1))
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
index 2b5ad934f9b1..b1ceb767cce4 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -17,8 +17,9 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86InstrFMA3Info.h"
#include "X86RegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include <vector>
#define GET_INSTRINFO_HEADER
#include "X86GenInstrInfo.inc"
@@ -29,6 +30,12 @@ class X86RegisterInfo;
class X86Subtarget;
namespace X86 {
+
+enum AsmComments {
+ // For instr that was compressed from EVEX to VEX.
+ AC_EVEX_2_VEX = MachineInstr::TAsmComments
+};
+
// X86 specific condition code. These correspond to X86_*_COND in
// X86InstrInfo.td. They must be kept in synch.
enum CondCode {
@@ -64,15 +71,15 @@ enum CondCode {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
-/// \brief Return a pair of condition code for the given predicate and whether
+/// Return a pair of condition code for the given predicate and whether
/// the instruction operands should be swapped to match the condition code.
std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
-/// \brief Return a set opcode for the given condition and whether it has
+/// Return a set opcode for the given condition and whether it has
/// a memory operand.
unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
-/// \brief Return a cmov opcode for the given condition, register size in
+/// Return a cmov opcode for the given condition, register size in
/// bytes, and operand type.
unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
bool HasMemoryOperand = false);
@@ -89,6 +96,16 @@ CondCode getCondFromCMovOpc(unsigned Opc);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(CondCode CC);
+
+/// Get the VPCMP immediate for the given condition.
+unsigned getVPCMPImmForCond(ISD::CondCode CC);
+
+/// Get the VPCMP immediate if the opcodes are swapped.
+unsigned getSwappedVPCMPImm(unsigned Imm);
+
+/// Get the VPCOM immediate if the opcodes are swapped.
+unsigned getSwappedVPCOMImm(unsigned Imm);
+
} // namespace X86
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
@@ -151,28 +168,6 @@ class X86InstrInfo final : public X86GenInstrInfo {
X86Subtarget &Subtarget;
const X86RegisterInfo RI;
- /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
- /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
- ///
- typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>>
- RegOp2MemOpTableType;
- RegOp2MemOpTableType RegOp2MemOpTable2Addr;
- RegOp2MemOpTableType RegOp2MemOpTable0;
- RegOp2MemOpTableType RegOp2MemOpTable1;
- RegOp2MemOpTableType RegOp2MemOpTable2;
- RegOp2MemOpTableType RegOp2MemOpTable3;
- RegOp2MemOpTableType RegOp2MemOpTable4;
-
- /// MemOp2RegOpTable - Load / store unfolding opcode map.
- ///
- typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>>
- MemOp2RegOpTableType;
- MemOp2RegOpTableType MemOp2RegOpTable;
-
- static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
- MemOp2RegOpTableType &M2RTable, uint16_t RegOp,
- uint16_t MemOp, uint16_t Flags);
-
virtual void anchor();
bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -225,6 +220,9 @@ public:
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
@@ -233,6 +231,9 @@ public:
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
@@ -291,34 +292,6 @@ public:
bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
- /// Returns true if the routine could find two commutable operands
- /// in the given FMA instruction \p MI. Otherwise, returns false.
- ///
- /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
- /// The output indices of the commuted operands are returned in these
- /// arguments. Also, the input values of these arguments may be preset either
- /// to indices of operands that must be commuted or be equal to a special
- /// value 'CommuteAnyOperandIndex' which means that the corresponding
- /// operand index is not set and this method is free to pick any of
- /// available commutable operands.
- /// The parameter \p FMA3Group keeps the reference to the group of relative
- /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
- ///
- /// For example, calling this method this way:
- /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
- /// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
- /// can be interpreted as a query asking if the operand #1 can be swapped
- /// with any other available operand (e.g. operand #2, operand #3, etc.).
- ///
- /// The returned FMA opcode may differ from the opcode in the given MI.
- /// For example, commuting the operands #1 and #3 in the following FMA
- /// FMA213 #1, #2, #3
- /// results into instruction with adjusted opcode:
- /// FMA231 #3, #2, #1
- bool findFMA3CommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2,
- const X86InstrFMA3Group &FMA3Group) const;
-
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
/// performs the same computations as the given \p MI but which has the
/// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
@@ -375,6 +348,8 @@ public:
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
+ const MachineOperand *&Dest) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -496,8 +471,12 @@ public:
std::pair<uint16_t, uint16_t>
getExecutionDomain(const MachineInstr &MI) const override;
+ uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
+
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+ bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
+
unsigned
getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
@@ -565,27 +544,22 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
- virtual MachineOutlinerInfo getOutlininingCandidateInfo(
- std::vector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
- &RepeatedSequenceLocs) const override;
+ virtual outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
bool OutlineFromLinkOnceODRs) const override;
- llvm::X86GenInstrInfo::MachineOutlinerInstrType
- getOutliningType(MachineInstr &MI) const override;
-
- void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ outliner::InstrType
+ getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
- void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const override;
MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
- const MachineOutlinerInfo &MInfo) const override;
+ const outliner::Candidate &C) const override;
protected:
/// Commutes the operands in the given instruction by changing the operands
@@ -637,9 +611,12 @@ private:
/// findThreeSrcCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
+ ///
+ /// If IsIntrinsic is set, operand 1 will be ignored for commuting.
bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const;
+ unsigned &SrcOpIdx2,
+ bool IsIntrinsic = false) const;
};
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
index 68f40c28d527..7509b312c100 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -94,6 +94,8 @@ def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_X86NtBrind : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
SDTCisVT<1, iPTR>,
SDTCisVT<2, iPTR>]>;
@@ -196,6 +198,12 @@ def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
+def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+def X86NoTrackBrind : SDNode<"X86ISD::NT_BRIND", SDT_X86NtBrind,
+ [SDNPHasChain]>;
+
def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
@@ -281,6 +289,8 @@ def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
+def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
+
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -297,6 +307,16 @@ def X86lwpins : SDNode<"X86ISD::LWPINS",
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>;
+def X86umwait : SDNode<"X86ISD::UMWAIT",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tpause : SDNode<"X86ISD::TPAUSE",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -358,10 +378,9 @@ class X86VMemOperand<RegisterClass RC, string printMethod,
def anymem : X86MemOperand<"printanymem">;
-def opaque32mem : X86MemOperand<"printopaquemem">;
-def opaque48mem : X86MemOperand<"printopaquemem">;
-def opaque80mem : X86MemOperand<"printopaquemem">;
-def opaque512mem : X86MemOperand<"printopaquemem">;
+// FIXME: Right now we allow any size during parsing, but we might want to
+// restrict to only unsized memory.
+def opaquemem : X86MemOperand<"printopaquemem">;
def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>;
def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
@@ -391,8 +410,8 @@ def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
-def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
-def vz256xmem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
+def vy512xmem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
+def vz256mem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
@@ -807,36 +826,26 @@ def NoAVX : Predicate<"!Subtarget->hasAVX()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasAVX512 : Predicate<"Subtarget->hasAVX512()">,
- AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
-def HasCDI : Predicate<"Subtarget->hasCDI()">,
- AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
-def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">,
- AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">;
-def HasPFI : Predicate<"Subtarget->hasPFI()">,
- AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
-def HasERI : Predicate<"Subtarget->hasERI()">,
- AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">;
-def HasDQI : Predicate<"Subtarget->hasDQI()">,
- AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">;
+def HasCDI : Predicate<"Subtarget->hasCDI()">;
+def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">;
+def HasPFI : Predicate<"Subtarget->hasPFI()">;
+def HasERI : Predicate<"Subtarget->hasERI()">;
+def HasDQI : Predicate<"Subtarget->hasDQI()">;
def NoDQI : Predicate<"!Subtarget->hasDQI()">;
-def HasBWI : Predicate<"Subtarget->hasBWI()">,
- AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
+def HasBWI : Predicate<"Subtarget->hasBWI()">;
def NoBWI : Predicate<"!Subtarget->hasBWI()">;
-def HasVLX : Predicate<"Subtarget->hasVLX()">,
- AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
+def HasVLX : Predicate<"Subtarget->hasVLX()">;
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
def PKU : Predicate<"Subtarget->hasPKU()">;
-def HasVNNI : Predicate<"Subtarget->hasVNNI()">,
- AssemblerPredicate<"FeatureVNNI", "AVX-512 VNNI ISA">;
+def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
-def HasBITALG : Predicate<"Subtarget->hasBITALG()">,
- AssemblerPredicate<"FeatureBITALG", "AVX-512 BITALG ISA">;
+def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
def HasVAES : Predicate<"Subtarget->hasVAES()">;
@@ -866,15 +875,13 @@ def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">;
-def HasVBMI : Predicate<"Subtarget->hasVBMI()">,
- AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">;
-def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">,
- AssemblerPredicate<"FeatureVBMI2", "AVX-512 VBMI2 ISA">;
-def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
- AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
+def HasVBMI : Predicate<"Subtarget->hasVBMI()">;
+def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">;
+def HasIFMA : Predicate<"Subtarget->hasIFMA()">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
+def HasSGX : Predicate<"Subtarget->hasSGX()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
@@ -884,14 +891,22 @@ def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
+def HasCLDEMOTE : Predicate<"Subtarget->hasCLDEMOTE()">;
+def HasMOVDIRI : Predicate<"Subtarget->hasMOVDIRI()">;
+def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">;
+def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasMPX : Predicate<"Subtarget->hasMPX()">;
def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
-def HasIBT : Predicate<"Subtarget->hasIBT()">;
def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
+def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
+def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
+def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
+def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -929,6 +944,8 @@ let RecomputePerFunction = 1 in {
def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
"MF->getFunction().optForSize()">;
+ def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || "
+ "!Subtarget->hasSSE41()">;
}
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
@@ -1040,6 +1057,17 @@ def i64immZExt32SExt8 : ImmLeaf<i64, [{
}]>;
// Helper fragments for loads.
+
+// It's safe to fold a zextload/extload from i1 as a regular i8 load. The
+// upper bits are guaranteed to be zero and we were going to emit a MOV8rm
+// which might get folded during peephole anyway.
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ return ExtType == ISD::NON_EXTLOAD || ExtType == ISD::EXTLOAD ||
+ ExtType == ISD::ZEXTLOAD;
+}]>;
+
// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
// known to be 32-bit aligned or better. Ditto for i8 to i16.
def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
@@ -1052,14 +1080,6 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
return false;
}]>;
-def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
- LoadSDNode *LD = cast<LoadSDNode>(N);
- ISD::LoadExtType ExtType = LD->getExtensionType();
- if (ExtType == ISD::EXTLOAD)
- return LD->getAlignment() >= 2 && !LD->isVolatile();
- return false;
-}]>;
-
def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
@@ -1070,12 +1090,20 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
return false;
}]>;
-def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
+def alignedloadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
+def memopf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ return Subtarget->hasSSEUnalignedMem() ||
+ Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
@@ -1125,39 +1153,37 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
//
// Nop
-let hasSideEffects = 0, SchedRW = [WriteZero] in {
- def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+let hasSideEffects = 0, SchedRW = [WriteNop] in {
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
- "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+ "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
- "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+ "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero),
- "nop{q}\t$zero", [], IIC_NOP>, TB,
+ "nop{q}\t$zero", []>, TB, NotMemoryFoldable,
Requires<[In64BitMode]>;
// Also allow register so we can assemble/disassemble
def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero),
- "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+ "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero),
- "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+ "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero),
- "nop{q}\t$zero", [], IIC_NOP>, TB,
+ "nop{q}\t$zero", []>, TB, NotMemoryFoldable,
Requires<[In64BitMode]>;
}
// Constructing a stack frame.
def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
- "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+ "enter\t$len, $lvl", []>, Sched<[WriteMicrocoded]>;
let SchedRW = [WriteALU] in {
let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
-def LEAVE : I<0xC9, RawFrm,
- (outs), (ins), "leave", [], IIC_LEAVE>,
+def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
Requires<[Not64BitMode]>;
let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
-def LEAVE64 : I<0xC9, RawFrm,
- (outs), (ins), "leave", [], IIC_LEAVE>,
+def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1172,50 +1198,56 @@ let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1,
let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
-def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
- IIC_POP_REG16>, OpSize16;
-def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
-def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
- IIC_POP_REG>, OpSize16;
-def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize16;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize16, NotMemoryFoldable;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
+ OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayLoad, SchedRW
let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in {
-def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
- IIC_POP_MEM>, OpSize16;
-def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
- IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", []>,
+ OpSize16;
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", []>,
+ OpSize32, Requires<[Not64BitMode]>;
} // mayStore, mayLoad, WriteRMW
let mayStore = 1, SchedRW = [WriteStore] in {
-def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
- IIC_PUSH_REG>, OpSize16;
-def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
- IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
-def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
- IIC_PUSH_REG>, OpSize16;
-def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
- IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize16;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
+ OpSize32, Requires<[Not64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize16, NotMemoryFoldable;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
+ OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+ "push{w}\t$imm", []>, OpSize16;
def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+ "push{w}\t$imm", []>, OpSize16;
def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
- "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{l}\t$imm", []>, OpSize32,
Requires<[Not64BitMode]>;
def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
- "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{l}\t$imm", []>, OpSize32,
Requires<[Not64BitMode]>;
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
-def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
- IIC_PUSH_MEM>, OpSize16;
-def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
- IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src", []>,
+ OpSize16;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src", []>,
+ OpSize32, Requires<[Not64BitMode]>;
} // mayLoad, mayStore, SchedRW
}
@@ -1248,186 +1280,199 @@ let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
SchedRW = [WriteLoad] in {
-def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
- OpSize16;
-def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
- OpSize32, Requires<[Not64BitMode]>;
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize16;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
}
let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0,
SchedRW = [WriteStore] in {
-def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
- OpSize16;
-def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
- OpSize32, Requires<[Not64BitMode]>;
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize16;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
}
let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
-def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
-def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
- IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayLoad, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in
-def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
- IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", []>,
+ OpSize32, Requires<[In64BitMode]>;
let mayStore = 1, SchedRW = [WriteStore] in {
-def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
- IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
-def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
- IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
-def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
- IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>,
+ OpSize32, Requires<[In64BitMode]>;
} // mayLoad, mayStore, SchedRW
}
let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteStore] in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{q}\t$imm", []>, OpSize32,
Requires<[In64BitMode]>;
def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ "push{q}\t$imm", []>, OpSize32,
Requires<[In64BitMode]>;
}
let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
-def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>,
OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in
-def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>,
OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
+def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", []>,
OpSize16, Requires<[Not64BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
+def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", []>,
OpSize16, Requires<[Not64BitMode]>;
}
-let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+let Constraints = "$src = $dst", SchedRW = [WriteBSWAP32] in {
+// This instruction is a consequence of BSWAP32r observing operand size. The
+// encoding is valid, but the behavior is undefined.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+def BSWAP16r_BAD : I<0xC8, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "bswap{w}\t$dst", []>, OpSize16, TB;
// GR32 = bswap GR32
-def BSWAP32r : I<0xC8, AddRegFrm,
- (outs GR32:$dst), (ins GR32:$src),
+def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
"bswap{l}\t$dst",
- [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;
+ [(set GR32:$dst, (bswap GR32:$src))]>, OpSize32, TB;
+let SchedRW = [WriteBSWAP64] in
def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"bswap{q}\t$dst",
- [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+ [(set GR64:$dst, (bswap GR64:$src))]>, TB;
} // Constraints = "$src = $dst", SchedRW
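Context for the BSWAP16r_BAD def above: with a 16-bit operand size the bswap encoding is architecturally undefined, so a 16-bit byte swap is normally produced with a rotate instead. A minimal C++ sketch (the bswap16 helper is hypothetical, not part of LLVM); compilers typically lower it to "rolw $8" rather than the undefined encoding:

    #include <cstdint>
    #include <cstdio>

    // Swap the bytes of a 16-bit value without relying on the undefined
    // 16-bit bswap encoding.
    static inline uint16_t bswap16(uint16_t v) {
      return static_cast<uint16_t>((v << 8) | (v >> 8));
    }

    int main() {
      std::printf("%#x\n", static_cast<unsigned>(bswap16(0x1234)));  // 0x3412
      return 0;
    }
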
// Bit scan instructions.
let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
+ PS, OpSize16, Sched<[WriteBSF]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
+ PS, OpSize16, Sched<[WriteBSFLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
+ PS, OpSize32, Sched<[WriteBSF]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
+ PS, OpSize32, Sched<[WriteBSFLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
- IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
+ PS, Sched<[WriteBSF]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
+ PS, Sched<[WriteBSFLd]>;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
+ PS, OpSize16, Sched<[WriteBSR]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
+ PS, OpSize16, Sched<[WriteBSRLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
- IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
+ PS, OpSize32, Sched<[WriteBSR]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
+ PS, OpSize32, Sched<[WriteBSRLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
- IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
+ PS, Sched<[WriteBSR]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
- IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
+ PS, Sched<[WriteBSRLd]>;
} // Defs = [EFLAGS]
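As a rough illustration of where the X86bsf/X86bsr patterns above come from (a sketch, assuming the usual builtins and a non-zero input, since bsf/bsr leave the destination undefined for zero):

    #include <cstdint>
    #include <cstdio>

    // Index of the lowest / highest set bit; on plain x86-64 these lower to
    // bsf and bsr (tzcnt/lzcnt on BMI targets). Inputs must be non-zero.
    static unsigned lowest_set(uint32_t x)  { return __builtin_ctz(x); }
    static unsigned highest_set(uint32_t x) { return 31u - __builtin_clz(x); }

    int main() {
      std::printf("%u %u\n", lowest_set(0x50u), highest_set(0x50u));  // 4 6
      return 0;
    }
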
let SchedRW = [WriteMicrocoded] in {
let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in {
def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
- "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+ "movsb\t{$src, $dst|$dst, $src}", []>;
def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
- "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
+ "movsw\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
- "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
+ "movs{l|d}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
- "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+ "movsq\t{$src, $dst|$dst, $src}", []>,
+ Requires<[In64BitMode]>;
}
let Defs = [EDI], Uses = [AL,EDI,DF] in
def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
- "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
+ "stosb\t{%al, $dst|$dst, al}", []>;
let Defs = [EDI], Uses = [AX,EDI,DF] in
def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
- "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
+ "stosw\t{%ax, $dst|$dst, ax}", []>, OpSize16;
let Defs = [EDI], Uses = [EAX,EDI,DF] in
def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
- "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
+ "stos{l|d}\t{%eax, $dst|$dst, eax}", []>, OpSize32;
let Defs = [RDI], Uses = [RAX,RDI,DF] in
def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
- "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
+ "stosq\t{%rax, $dst|$dst, rax}", []>,
+ Requires<[In64BitMode]>;
let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in
def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
- "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>;
+ "scasb\t{$dst, %al|al, $dst}", []>;
let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in
def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
- "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16;
+ "scasw\t{$dst, %ax|ax, $dst}", []>, OpSize16;
let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in
def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
- "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32;
+ "scas{l|d}\t{$dst, %eax|eax, $dst}", []>, OpSize32;
let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in
def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
- "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>;
+ "scasq\t{$dst, %rax|rax, $dst}", []>,
+ Requires<[In64BitMode]>;
let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in {
def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
- "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+ "cmpsb\t{$dst, $src|$src, $dst}", []>;
def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
- "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16;
+ "cmpsw\t{$dst, $src|$src, $dst}", []>, OpSize16;
def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
- "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32;
+ "cmps{l|d}\t{$dst, $src|$src, $dst}", []>, OpSize32;
def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
- "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+ "cmpsq\t{$dst, $src|$src, $dst}", []>,
+ Requires<[In64BitMode]>;
}
} // SchedRW
@@ -1435,47 +1480,47 @@ def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
// Move Instructions.
//
let SchedRW = [WriteMove] in {
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, isMoveReg = 1 in {
def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{b}\t{$src, $dst|$dst, $src}", []>;
def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, imm:$src)], IIC_MOV>;
+ [(set GR8:$dst, imm:$src)]>;
def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
+ [(set GR16:$dst, imm:$src)]>, OpSize16;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32;
+ [(set GR32:$dst, relocImm:$src)]>, OpSize32;
def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+ [(set GR64:$dst, i64immSExt32:$src)]>;
}
let isReMaterializable = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, relocImm:$src)], IIC_MOV>;
+ [(set GR64:$dst, relocImm:$src)]>;
}
// Longer forms that use a ModR/M byte. Needed for disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV8ri">;
def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
FoldGenData<"MOV16ri">;
def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
FoldGenData<"MOV32ri">;
}
} // SchedRW
@@ -1483,16 +1528,16 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
+ [(store (i8 imm8_su:$src), addr:$dst)]>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+ [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+ [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>,
+ [(store i64immSExt32_su:$src, addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1504,183 +1549,200 @@ let SchedRW = [WriteALU] in {
let mayLoad = 1 in {
let Defs = [AL] in
def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
- "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+ "mov{b}\t{$src, %al|al, $src}", []>,
AdSize32;
let Defs = [AX] in
def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
- "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ "mov{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize32;
let Defs = [EAX] in
def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
- "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ "mov{l}\t{$src, %eax|eax, $src}", []>,
OpSize32, AdSize32;
let Defs = [RAX] in
def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
- "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+ "mov{q}\t{$src, %rax|rax, $src}", []>,
AdSize32;
let Defs = [AL] in
def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
- "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16;
+ "mov{b}\t{$src, %al|al, $src}", []>, AdSize16;
let Defs = [AX] in
def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
- "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ "mov{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize16;
let Defs = [EAX] in
def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
- "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ "mov{l}\t{$src, %eax|eax, $src}", []>,
AdSize16, OpSize32;
-}
+} // mayLoad
let mayStore = 1 in {
let Uses = [AL] in
def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
- "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
+ "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize32;
let Uses = [AX] in
def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
- "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ "mov{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize32;
let Uses = [EAX] in
def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
- "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ "mov{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize32;
let Uses = [RAX] in
def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
- "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+ "mov{q}\t{%rax, $dst|$dst, rax}", []>,
AdSize32;
let Uses = [AL] in
def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst),
- "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
+ "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize16;
let Uses = [AX] in
def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
- "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ "mov{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize16;
let Uses = [EAX] in
def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
- "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ "mov{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize16;
-}
-}
+} // mayStore
// These forms all have full 64-bit absolute addresses in their instructions
// and use the movabs mnemonic to indicate this specific form.
let mayLoad = 1 in {
let Defs = [AL] in
-def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
- "movabs{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
- AdSize64;
+def MOV8ao64 : Ii64<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
+ "movabs{b}\t{$src, %al|al, $src}", []>,
+ AdSize64;
let Defs = [AX] in
-def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
- "movabs{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+def MOV16ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize64;
let Defs = [EAX] in
-def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
- "movabs{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+def MOV32ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>,
OpSize32, AdSize64;
let Defs = [RAX] in
def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
- "movabs{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+ "movabs{q}\t{$src, %rax|rax, $src}", []>,
AdSize64;
-}
+} // mayLoad
let mayStore = 1 in {
let Uses = [AL] in
-def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
- "movabs{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
- AdSize64;
+def MOV8o64a : Ii64<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
+ "movabs{b}\t{%al, $dst|$dst, al}", []>,
+ AdSize64;
let Uses = [AX] in
-def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
- "movabs{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+def MOV16o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize64;
let Uses = [EAX] in
-def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
- "movabs{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+def MOV32o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize64;
let Uses = [RAX] in
def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
- "movabs{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+ "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
AdSize64;
-}
+} // mayStore
+} // SchedRW
} // hasSideEffects = 0
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteMove] in {
+ SchedRW = [WriteMove], isMoveReg = 1 in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV8rr">;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
FoldGenData<"MOV16rr">;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
FoldGenData<"MOV32rr">;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV64rr">;
}
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"mov{b}.s\t{$src, $dst|$dst, $src}",
+ (MOV8rr_REV GR8:$dst, GR8:$src), 0>;
+def : InstAlias<"mov{w}.s\t{$src, $dst|$dst, $src}",
+ (MOV16rr_REV GR16:$dst, GR16:$src), 0>;
+def : InstAlias<"mov{l}.s\t{$src, $dst|$dst, $src}",
+ (MOV32rr_REV GR32:$dst, GR32:$src), 0>;
+def : InstAlias<"mov{q}.s\t{$src, $dst|$dst, $src}",
+ (MOV64rr_REV GR64:$dst, GR64:$src), 0>;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV8rr_REV GR8:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV16rr_REV GR16:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV32rr_REV GR32:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV64rr_REV GR64:$dst, GR64:$src), 0, "att">;
+
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+ [(set GR8:$dst, (loadi8 addr:$src))]>;
def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
+ [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize16;
def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
+ [(set GR32:$dst, (loadi32 addr:$src))]>, OpSize32;
def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+ [(set GR64:$dst, (load addr:$src))]>;
}
let SchedRW = [WriteStore] in {
def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store GR8:$src, addr:$dst)]>;
def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16;
+ [(store GR16:$src, addr:$dst)]>, OpSize16;
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32;
+ [(store GR32:$src, addr:$dst)]>, OpSize32;
def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store GR64:$src, addr:$dst)]>;
} // SchedRW
// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
// that they can be used for copying and storing h registers, which can't be
// encoded when a REX prefix is present.
let isCodeGenOnly = 1 in {
-let hasSideEffects = 0 in
+let hasSideEffects = 0, isMoveReg = 1 in
def MOV8rr_NOREX : I<0x88, MRMDestReg,
(outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteMove]>;
let mayStore = 1, hasSideEffects = 0 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
- IIC_MOV_MEM>, Sched<[WriteStore]>;
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteStore]>;
let mayLoad = 1, hasSideEffects = 0,
canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
- IIC_MOV_MEM>, Sched<[WriteLoad]>;
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteLoad]>;
}
// Condition code ops, incl. set if equal/not equal/...
-let SchedRW = [WriteALU] in {
+let SchedRW = [WriteLAHFSAHF] in {
let Defs = [EFLAGS], Uses = [AH] in
def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
- [(set EFLAGS, (X86sahf AH))], IIC_AHF>,
- Requires<[HasLAHFSAHF]>;
+ [(set EFLAGS, (X86sahf AH))]>,
+ Requires<[HasLAHFSAHF]>;
let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
-def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
- IIC_AHF>, // AH = flags
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags
Requires<[HasLAHFSAHF]>;
} // SchedRW
@@ -1691,15 +1753,15 @@ let Defs = [EFLAGS] in {
let SchedRW = [WriteALU] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>,
OpSize16, TB, NotMemoryFoldable;
def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>,
OpSize32, TB, NotMemoryFoldable;
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB,
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB,
NotMemoryFoldable;
} // SchedRW
@@ -1712,189 +1774,180 @@ def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [], IIC_BT_MR
- >, OpSize16, TB, NotMemoryFoldable;
+ []>, OpSize16, TB, NotMemoryFoldable;
def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [], IIC_BT_MR
- >, OpSize32, TB, NotMemoryFoldable;
+ []>, OpSize32, TB, NotMemoryFoldable;
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [], IIC_BT_MR
- >, TB, NotMemoryFoldable;
+ []>, TB, NotMemoryFoldable;
}
let SchedRW = [WriteALU] in {
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
- IIC_BT_RI>, OpSize16, TB;
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize16, TB;
def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
- IIC_BT_RI>, OpSize32, TB;
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>,
+ OpSize32, TB;
def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
- IIC_BT_RI>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB;
} // SchedRW
// Note that these instructions aren't slow because that only applies when the
// other operand is in a register. When it's an immediate, bt is still fast.
let SchedRW = [WriteALU] in {
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
- ], IIC_BT_MI>, OpSize16, TB;
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi16 addr:$src1),
+ i16immSExt8:$src2))]>,
+ OpSize16, TB;
def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
- ], IIC_BT_MI>, OpSize32, TB;
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi32 addr:$src1),
+ i32immSExt8:$src2))]>,
+ OpSize32, TB;
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
- i64immSExt8:$src2))], IIC_BT_MI>, TB,
+ i64immSExt8:$src2))]>, TB,
Requires<[In64BitMode]>;
} // SchedRW
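The note above about bt only being slow with a register operand comes down to bit-string addressing: a register index may select a bit outside the addressed word, while an immediate index is truncated to the operand size. A small C++ model of the difference (bt_mem_reg and bt_mem_imm32 are illustrative names, not LLVM APIs):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Register-operand form: the bit index can reach any byte relative to the
    // base address, so the access may fall outside the nominal i32 operand.
    static bool bt_mem_reg(const uint8_t *base, int64_t bit) {
      return (base[bit >> 3] >> (bit & 7)) & 1;
    }

    // Immediate form: the index is reduced modulo 32, so only the addressed
    // dword is ever read and no extra address generation is needed.
    static bool bt_mem_imm32(const uint8_t *base, uint8_t imm) {
      uint32_t word;
      std::memcpy(&word, base, sizeof(word));
      return (word >> (imm & 31)) & 1;
    }

    int main() {
      uint8_t buf[8] = {0, 0, 0, 0, 0x01, 0, 0, 0};
      std::printf("%d %d\n", bt_mem_reg(buf, 32), bt_mem_imm32(buf, 32)); // 1 0
      return 0;
    }
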
let hasSideEffects = 0 in {
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB,
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize16, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize32, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize16, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize32, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, NotMemoryFoldable;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB,
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize16, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize32, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize16, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize32, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
@@ -1907,143 +1960,154 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
// Atomic swap. These are just normal xchg instructions. But since a memory
// operand is referenced, the atomicity is ensured.
-multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
- InstrItinClass itin> {
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag> {
let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set
GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
- itin>;
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize16;
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ OpSize16;
def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
- itin>, OpSize32;
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ OpSize32;
def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
- itin>;
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
}
}
-defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap">, NotMemoryFoldable;
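The atomicity comment above (xchg with a memory operand is implicitly locked) is what lets an atomic exchange be emitted without a lock prefix; a minimal sketch:

    #include <atomic>
    #include <cstdio>

    int main() {
      std::atomic<int> flag{0};
      // Typically compiles to a single "xchg %reg, (mem)" with no explicit
      // lock prefix, because xchg against memory is implicitly locked.
      int old = flag.exchange(1);
      std::printf("previous value: %d\n", old);
      return 0;
    }
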
// Swap between registers.
let SchedRW = [WriteALU] in {
-let Constraints = "$val = $dst" in {
-def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
- "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
-def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
- "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
- OpSize16;
-def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
- "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
- OpSize32;
-def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
- "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+let Constraints = "$src1 = $dst1, $src2 = $dst2", hasSideEffects = 0 in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst1, GR8:$dst2),
+ (ins GR8:$src1, GR8:$src2),
+ "xchg{b}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst1, GR16:$dst2),
+ (ins GR16:$src1, GR16:$src2),
+ "xchg{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, NotMemoryFoldable;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst1, GR32:$dst2),
+ (ins GR32:$src1, GR32:$src2),
+ "xchg{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, NotMemoryFoldable;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst1, GR64:$dst2),
+ (ins GR64:$src1 ,GR64:$src2),
+ "xchg{q}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
}
// Swap between EAX and other registers.
+let Constraints = "$src = $dst", hasSideEffects = 0 in {
let Uses = [AX], Defs = [AX] in
-def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16;
-let Uses = [EAX], Defs = [EAX] in
-def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
- "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
- OpSize32, Requires<[Not64BitMode]>;
+def XCHG16ar : I<0x90, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "xchg{w}\t{$src, %ax|ax, $src}", []>, OpSize16;
let Uses = [EAX], Defs = [EAX] in
-// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
-// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
-def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
- "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
- OpSize32, Requires<[In64BitMode]>;
+def XCHG32ar : I<0x90, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+ "xchg{l}\t{$src, %eax|eax, $src}", []>, OpSize32;
let Uses = [RAX], Defs = [RAX] in
-def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
- "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
+def XCHG64ar : RI<0x90, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "xchg{q}\t{$src, %rax|rax, $src}", []>;
+}
} // SchedRW
-let SchedRW = [WriteALU] in {
-def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
-def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
- OpSize16;
-def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
- OpSize32;
-def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+let hasSideEffects = 0, Constraints = "$src1 = $dst1, $src2 = $dst2",
+ Defs = [EFLAGS], SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst1, GR8:$dst2),
+ (ins GR8:$src1, GR8:$src2),
+ "xadd{b}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst1, GR16:$dst2),
+ (ins GR16:$src1, GR16:$src2),
+ "xadd{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst1, GR32:$dst2),
+ (ins GR32:$src1, GR32:$src2),
+ "xadd{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst1, GR64:$dst2),
+ (ins GR64:$src1, GR64:$src2),
+ "xadd{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
-let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
-def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
-def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+let mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$val = $dst",
+ Defs = [EFLAGS], SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm : I<0xC0, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ "xadd{b}\t{$val, $ptr|$ptr, $val}", []>, TB;
+def XADD16rm : I<0xC1, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ "xadd{w}\t{$val, $ptr|$ptr, $val}", []>, TB,
OpSize16;
-def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+def XADD32rm : I<0xC1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ "xadd{l}\t{$val, $ptr|$ptr, $val}", []>, TB,
OpSize32;
-def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD64rm : RI<0xC1, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ "xadd{q}\t{$val, $ptr|$ptr, $val}", []>, TB;
}
-let SchedRW = [WriteALU] in {
+let SchedRW = [WriteALU], hasSideEffects = 0 in {
+let Defs = [AL, EFLAGS], Uses = [AL] in
def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG8>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+let Defs = [AX, EFLAGS], Uses = [AX] in
def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB, OpSize16;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
+ NotMemoryFoldable;
+let Defs = [EAX, EFLAGS], Uses = [EAX] in
def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB, OpSize32;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
+ NotMemoryFoldable;
+let Defs = [RAX, EFLAGS], Uses = [RAX] in
def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB;
-} // SchedRW
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+} // SchedRW, hasSideEffects
-let SchedRW = [WriteALULd, WriteRMW] in {
-let mayLoad = 1, mayStore = 1 in {
+let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1,
+ hasSideEffects = 0 in {
+let Defs = [AL, EFLAGS], Uses = [AL] in
def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM8>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+let Defs = [AX, EFLAGS], Uses = [AX] in
def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB, OpSize16;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
+ NotMemoryFoldable;
+let Defs = [EAX, EFLAGS], Uses = [EAX] in
def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB, OpSize32;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
+ NotMemoryFoldable;
+let Defs = [RAX, EFLAGS], Uses = [RAX] in
def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB;
-}
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
- "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+ "cmpxchg8b\t$dst", []>, TB;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
- "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+ "cmpxchg16b\t$dst", []>,
TB, Requires<[HasCmpxchg16b, In64BitMode]>;
-} // SchedRW
+} // SchedRW, mayLoad, mayStore, hasSideEffects
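The Defs/Uses of AL/AX/EAX/RAX added to the CMPXCHG defs above reflect the implicit accumulator: the expected value goes in, and the observed memory value comes back out on failure. A hedged C++ sketch of the operation these instructions back:

    #include <atomic>
    #include <cstdio>

    int main() {
      std::atomic<int> v{5};
      int expected = 5;
      // Usually a single "lock cmpxchg"; 'expected' plays the role of EAX and
      // is overwritten with the observed value when the exchange fails.
      bool ok = v.compare_exchange_strong(expected, 7);
      std::printf("ok=%d value=%d expected=%d\n", ok, v.load(), expected);
      return 0;
    }
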
// Lock instruction prefix
@@ -2053,16 +2117,11 @@ def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
let SchedRW = [WriteNop] in {
// Rex64 instruction prefix
-def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", [], IIC_NOP>,
+def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", [], IIC_NOP>,
- Requires<[Not16BitMode]>;
-
-// Data instruction prefix
-def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>,
- Requires<[In16BitMode]>;
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
} // SchedRW
// Repeat string operation instruction prefixes
@@ -2077,108 +2136,106 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
let SchedRW = [WriteMicrocoded] in {
let Defs = [AL,ESI], Uses = [ESI,DF] in
def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
- "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>;
+ "lodsb\t{$src, %al|al, $src}", []>;
let Defs = [AX,ESI], Uses = [ESI,DF] in
def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
- "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16;
+ "lodsw\t{$src, %ax|ax, $src}", []>, OpSize16;
let Defs = [EAX,ESI], Uses = [ESI,DF] in
def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
- "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32;
+ "lods{l|d}\t{$src, %eax|eax, $src}", []>, OpSize32;
let Defs = [RAX,ESI], Uses = [ESI,DF] in
def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
- "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>;
+ "lodsq\t{$src, %rax|rax, $src}", []>,
+ Requires<[In64BitMode]>;
}
let SchedRW = [WriteSystem] in {
let Defs = [ESI], Uses = [DX,ESI,DF] in {
def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
- "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>;
+ "outsb\t{$src, %dx|dx, $src}", []>;
def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
- "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16;
+ "outsw\t{$src, %dx|dx, $src}", []>, OpSize16;
def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
- "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32;
+ "outs{l|d}\t{$src, %dx|dx, $src}", []>, OpSize32;
}
let Defs = [EDI], Uses = [DX,EDI,DF] in {
def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
- "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
+ "insb\t{%dx, $dst|$dst, dx}", []>;
def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
- "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16;
+ "insw\t{%dx, $dst|$dst, dx}", []>, OpSize16;
def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
- "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
+ "ins{l|d}\t{%dx, $dst|$dst, dx}", []>, OpSize32;
}
}
// EFLAGS management instructions.
let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in {
-def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC_CMC_STC>;
-def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_CLC_CMC_STC>;
-def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CLC_CMC_STC>;
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
}
// DF management instructions.
-// FIXME: These are a bit more expensive than CLC and STC. We should consider
-// adjusting their schedule bucket.
let SchedRW = [WriteALU], Defs = [DF] in {
-def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
-def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
}
-
// Table lookup instructions
let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
-def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
- Sched<[WriteLoad]>;
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>, Sched<[WriteLoad]>;
let SchedRW = [WriteMicrocoded] in {
// ASCII Adjust After Addition
let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
-def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>,
Requires<[Not64BitMode]>;
// ASCII Adjust AX Before Division
let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
- "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
+ "aad\t$src", []>, Requires<[Not64BitMode]>;
// ASCII Adjust AX After Multiply
let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
- "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
+ "aam\t$src", []>, Requires<[Not64BitMode]>;
// ASCII Adjust AL After Subtraction - sets AF and CF on a decimal borrow.
let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
-def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Addition
let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
-def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Subtraction
let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
-def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>,
Requires<[Not64BitMode]>;
} // SchedRW
let SchedRW = [WriteSystem] in {
// Check Array Index Against Bounds
+// Note: "bound" does not have reversed operands in at&t syntax.
def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16,
+ "bound\t$dst, $src", []>, OpSize16,
Requires<[Not64BitMode]>;
def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32,
+ "bound\t$dst, $src", []>, OpSize32,
Requires<[Not64BitMode]>;
// Adjust RPL Field of Segment Selector
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
- Requires<[Not64BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", []>,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
let mayStore = 1 in
def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
- Requires<[Not64BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", []>,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -2188,29 +2245,29 @@ let Predicates = [HasMOVBE] in {
let SchedRW = [WriteALULd] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>,
OpSize16, T8PS;
def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>,
OpSize32, T8PS;
def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>,
T8PS;
}
let SchedRW = [WriteStore] in {
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+ [(store (bswap GR16:$src), addr:$dst)]>,
OpSize16, T8PS;
def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+ [(store (bswap GR32:$src), addr:$dst)]>,
OpSize32, T8PS;
def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+ [(store (bswap GR64:$src), addr:$dst)]>,
T8PS;
}
}
@@ -2220,33 +2277,26 @@ let Predicates = [HasMOVBE] in {
//
let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
- "rdrand{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>,
+ "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>,
OpSize16, PS;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
- "rdrand{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>,
+ "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>,
OpSize32, PS;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
- "rdrand{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, PS;
+ "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>,
+ PS;
}
//===----------------------------------------------------------------------===//
// RDSEED Instruction
//
let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
- def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
- "rdseed{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>,
- OpSize16, PS;
- def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
- "rdseed{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>,
- OpSize32, PS;
- def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
- "rdseed{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, PS;
+ def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS;
+ def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS;
+ def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS;
}
//===----------------------------------------------------------------------===//
@@ -2255,33 +2305,30 @@ let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)],
- IIC_LZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>;
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>,
+ XS, OpSize16, Sched<[WriteLZCNT]>;
def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz (loadi16 addr:$src))),
- (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize16,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>;
def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)],
- IIC_LZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>;
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>,
+ XS, OpSize32, Sched<[WriteLZCNT]>;
def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz (loadi32 addr:$src))),
- (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize32,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>;
def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)],
- IIC_LZCNT_RR>, XS, Sched<[WriteIMul]>;
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+ XS, Sched<[WriteLZCNT]>;
def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctlz (loadi64 addr:$src))),
- (implicit EFLAGS)], IIC_LZCNT_RM>, XS,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>;
}
//===----------------------------------------------------------------------===//
@@ -2290,45 +2337,42 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
let Predicates = [HasBMI], Defs = [EFLAGS] in {
def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)],
- IIC_TZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>;
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>,
+ XS, OpSize16, Sched<[WriteTZCNT]>;
def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz (loadi16 addr:$src))),
- (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize16,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>;
def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)],
- IIC_TZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>;
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>,
+ XS, OpSize32, Sched<[WriteTZCNT]>;
def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz (loadi32 addr:$src))),
- (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize32,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>;
def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)],
- IIC_TZCNT_RR>, XS, Sched<[WriteIMul]>;
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+ XS, Sched<[WriteTZCNT]>;
def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (cttz (loadi64 addr:$src))),
- (implicit EFLAGS)], IIC_TZCNT_RM>, XS,
- Sched<[WriteIMulLd]>;
+ (implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>;
}
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
RegisterClass RC, X86MemOperand x86memop> {
let hasSideEffects = 0 in {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
- !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [], IIC_UNARY_REG>, T8PS, VEX_4V, Sched<[WriteALU]>;
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+ T8PS, VEX_4V, Sched<[WriteALU]>;
let mayLoad = 1 in
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [], IIC_UNARY_MEM>, T8PS, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+ T8PS, VEX_4V, Sched<[WriteALULd]>;
}
}
@@ -2363,32 +2407,56 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
}
-multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic Int,
- PatFrag ld_frag> {
+multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode,
+ PatFrag ld_frag, X86FoldableSchedWrite Sched> {
def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)], IIC_BIN_NONMEM>,
- T8PS, VEX, Sched<[WriteALU]>;
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX, Sched<[Sched]>;
def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
- (implicit EFLAGS)], IIC_BIN_MEM>, T8PS, VEX,
- Sched<[WriteALULd, ReadAfterLd]>;
+ [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX,
+ Sched<[Sched.Folded,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2
+ ReadAfterLd]>;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
- defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
- int_x86_bmi_bextr_32, loadi32>;
- defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
- int_x86_bmi_bextr_64, loadi64>, VEX_W;
+ defm BEXTR32 : bmi_bextr<0xF7, "bextr{l}", GR32, i32mem,
+ X86bextr, loadi32, WriteBEXTR>;
+ defm BEXTR64 : bmi_bextr<0xF7, "bextr{q}", GR64, i64mem,
+ X86bextr, loadi64, WriteBEXTR>, VEX_W;
+}
+
+multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag, X86FoldableSchedWrite Sched> {
+ def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX, Sched<[Sched]>;
+ def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX,
+ Sched<[Sched.Folded,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2
+ ReadAfterLd]>;
}
let Predicates = [HasBMI2], Defs = [EFLAGS] in {
- defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
- int_x86_bmi_bzhi_32, loadi32>;
- defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
- int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+ defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+ int_x86_bmi_bzhi_32, loadi32, WriteBZHI>;
+ defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+ int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W;
}
def CountTrailingOnes : SDNodeXForm<imm, [{
@@ -2402,7 +2470,7 @@ def BEXTRMaskXForm : SDNodeXForm<imm, [{
}]>;
def AndMask64 : ImmLeaf<i64, [{
- return isMask_64(Imm) && Imm > UINT32_MAX;
+ return isMask_64(Imm) && !isUInt<32>(Imm);
}]>;
// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
@@ -2430,21 +2498,49 @@ let Predicates = [HasBMI2, NoTBM] in {
}
let Predicates = [HasBMI2] in {
- def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
- (BZHI32rr GR32:$src,
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
-
- def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
- (BZHI32rm addr:$src,
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ multiclass _bmi_bzhi_pattern<dag regpattern, dag mempattern, RegisterClass RC,
+ ValueType VT, Instruction DstInst,
+ Instruction DstMemInst> {
+ def : Pat<regpattern,
+ (DstInst RC:$src,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ def : Pat<mempattern,
+ (DstMemInst addr:$src,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ }
- def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
- (BZHI64rr GR64:$src,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ multiclass bmi_bzhi_patterns<RegisterClass RC, int bitwidth, ValueType VT,
+ Instruction DstInst, X86MemOperand x86memop,
+ Instruction DstMemInst> {
+ // x & ((1 << y) - 1)
+ defm : _bmi_bzhi_pattern<(and RC:$src, (add (shl 1, GR8:$lz), -1)),
+ (and (x86memop addr:$src),
+ (add (shl 1, GR8:$lz), -1)),
+ RC, VT, DstInst, DstMemInst>;
+
+ // x & ~(-1 << y)
+ defm : _bmi_bzhi_pattern<(and RC:$src, (xor (shl -1, GR8:$lz), -1)),
+ (and (x86memop addr:$src),
+ (xor (shl -1, GR8:$lz), -1)),
+ RC, VT, DstInst, DstMemInst>;
+
+ // x & (-1 >> (bitwidth - y))
+ defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))),
+ (and (x86memop addr:$src),
+ (srl -1, (sub bitwidth, GR8:$lz))),
+ RC, VT, DstInst, DstMemInst>;
+
+ // x << (bitwidth - y) >> (bitwidth - y)
+ defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)),
+ (sub bitwidth, GR8:$lz)),
+ (srl (shl (x86memop addr:$src),
+ (sub bitwidth, GR8:$lz)),
+ (sub bitwidth, GR8:$lz)),
+ RC, VT, DstInst, DstMemInst>;
+ }
- def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
- (BZHI64rm addr:$src,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+ defm : bmi_bzhi_patterns<GR32, 32, i32, BZHI32rr, loadi32, BZHI32rm>;
+ defm : bmi_bzhi_patterns<GR64, 64, i64, BZHI64rr, loadi64, BZHI64rm>;
// x & (-1 >> (32 - y))
def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
@@ -2484,12 +2580,12 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
PatFrag ld_frag> {
def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))], IIC_BIN_NONMEM>,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
VEX_4V, Sched<[WriteALU]>;
def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))],
- IIC_BIN_MEM>, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>,
+ VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
}
let Predicates = [HasBMI2] in {
@@ -2508,61 +2604,63 @@ let Predicates = [HasBMI2] in {
//
let Predicates = [HasTBM], Defs = [EFLAGS] in {
-multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag,
- Intrinsic Int, Operand immtype,
- SDPatternOperator immoperator> {
+multiclass tbm_ternary_imm<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ SDNode OpNode, Operand immtype,
+ SDPatternOperator immoperator,
+ X86FoldableSchedWrite Sched> {
def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
- [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))],
- IIC_BIN_NONMEM>, XOP, XOPA, Sched<[WriteALU]>;
+ [(set RC:$dst, (OpNode RC:$src1, immoperator:$cntl))]>,
+ XOP, XOPA, Sched<[Sched]>;
def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
- [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))],
- IIC_BIN_MEM>, XOP, XOPA, Sched<[WriteALULd, ReadAfterLd]>;
+ [(set RC:$dst, (OpNode (ld_frag addr:$src1), immoperator:$cntl))]>,
+ XOP, XOPA, Sched<[Sched.Folded]>;
}
-defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
- int_x86_tbm_bextri_u32, i32imm, imm>;
+defm BEXTRI32 : tbm_ternary_imm<0x10, GR32, "bextr{l}", i32mem, loadi32,
+ X86bextr, i32imm, imm, WriteBEXTR>;
let ImmT = Imm32S in
-defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
- int_x86_tbm_bextri_u64, i64i32imm,
- i64immSExt32>, VEX_W;
+defm BEXTRI64 : tbm_ternary_imm<0x10, GR64, "bextr{q}", i64mem, loadi64,
+ X86bextr, i64i32imm,
+ i64immSExt32, WriteBEXTR>, VEX_W;
multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
RegisterClass RC, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag> {
+ X86MemOperand x86memop, X86FoldableSchedWrite Sched> {
let hasSideEffects = 0 in {
def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- [], IIC_BIN_NONMEM>, XOP_4V, XOP9, Sched<[WriteALU]>;
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
+ XOP_4V, XOP9, Sched<[Sched]>;
let mayLoad = 1 in
def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- [], IIC_BIN_MEM>, XOP_4V, XOP9, Sched<[WriteALULd, ReadAfterLd]>;
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
+ XOP_4V, XOP9, Sched<[Sched.Folded]>;
}
}
multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite Sched,
Format FormReg, Format FormMem> {
- defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
- loadi32>;
- defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
- loadi64>, VEX_W;
-}
-
-defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
-defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
-defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
-defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
-defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
-defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
-defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
-defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
-defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+ defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr#"{l}",
+ i32mem, Sched>;
+ defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr#"{q}",
+ i64mem, Sched>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", WriteALU, MRM1r, MRM1m>;
+defm BLCI : tbm_binary_intr<0x02, "blci", WriteALU, MRM6r, MRM6m>;
+defm BLCIC : tbm_binary_intr<0x01, "blcic", WriteALU, MRM5r, MRM5m>;
+defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", WriteALU, MRM1r, MRM1m>;
+defm BLCS : tbm_binary_intr<0x01, "blcs", WriteALU, MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", WriteALU, MRM2r, MRM2m>;
+defm BLSIC : tbm_binary_intr<0x01, "blsic", WriteALU, MRM6r, MRM6m>;
+defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", WriteALU, MRM7r, MRM7m>;
+defm TZMSK : tbm_binary_intr<0x01, "tzmsk", WriteALU, MRM4r, MRM4m>;
} // HasTBM, EFLAGS
// Use BEXTRI for 64-bit 'and' with large immediate 'mask'.
@@ -2580,28 +2678,24 @@ let Predicates = [HasTBM] in {
let Predicates = [HasLWP], SchedRW = [WriteSystem] in {
def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src",
- [(int_x86_llwpcb GR32:$src)], IIC_LWP>,
- XOP, XOP9;
+ [(int_x86_llwpcb GR32:$src)]>, XOP, XOP9;
def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst",
- [(set GR32:$dst, (int_x86_slwpcb))], IIC_LWP>,
- XOP, XOP9;
+ [(set GR32:$dst, (int_x86_slwpcb))]>, XOP, XOP9;
def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src",
- [(int_x86_llwpcb GR64:$src)], IIC_LWP>,
- XOP, XOP9, VEX_W;
+ [(int_x86_llwpcb GR64:$src)]>, XOP, XOP9, VEX_W;
def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
- [(set GR64:$dst, (int_x86_slwpcb))], IIC_LWP>,
- XOP, XOP9, VEX_W;
+ [(set GR64:$dst, (int_x86_slwpcb))]>, XOP, XOP9, VEX_W;
multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))], IIC_LWP>,
+ [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>,
XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))], IIC_LWP>,
+ [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>,
XOP_4V, XOPA;
}
@@ -2613,12 +2707,11 @@ let Defs = [EFLAGS] in {
multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, GR32:$src1, imm:$cntl)], IIC_LWP>,
- XOP_4V, XOPA;
+ [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)], IIC_LWP>,
+ [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>,
XOP_4V, XOPA;
}
@@ -2638,13 +2731,13 @@ let SchedRW = [ WriteSystem ] in {
}
let Uses = [ EAX, ECX, EDX ] in {
- def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], IIC_SSE_MONITORX>,
+ def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
TB, Requires<[ HasMWAITX ]>;
}
let Uses = [ ECX, EAX, EBX ] in {
def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
- [(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
+ [(int_x86_mwaitx ECX, EAX, EBX)]>,
TB, Requires<[ HasMWAITX ]>;
}
} // SchedRW
@@ -2660,11 +2753,67 @@ def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
Requires<[ In64BitMode ]>;
//===----------------------------------------------------------------------===//
+// WAITPKG Instructions
+//
+let SchedRW = [WriteSystem] in {
+ def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR16:$src)]>,
+ XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>;
+ def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR32:$src)]>,
+ XS, AdSize32, Requires<[HasWAITPKG]>;
+ def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR64:$src)]>,
+ XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>;
+ let Uses = [EAX, EDX], Defs = [EFLAGS] in {
+ def UMWAIT : I<0xAE, MRM6r,
+ (outs), (ins GR32orGR64:$src), "umwait\t$src",
+ [(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>,
+ XD, Requires<[HasWAITPKG]>;
+ def TPAUSE : I<0xAE, MRM6r,
+ (outs), (ins GR32orGR64:$src), "tpause\t$src",
+ [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>,
+ PD, Requires<[HasWAITPKG]>, NotMemoryFoldable;
+ }
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVDIRI - Move doubleword/quadword as direct store
+//
+let SchedRW = [WriteStore] in {
+def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movdiri\t{$src, $dst|$dst, $src}",
+ [(int_x86_directstore32 addr:$dst, GR32:$src)]>,
+ T8, Requires<[HasMOVDIRI]>;
+def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movdiri\t{$src, $dst|$dst, $src}",
+ [(int_x86_directstore64 addr:$dst, GR64:$src)]>,
+ T8, Requires<[In64BitMode, HasMOVDIRI]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVDIR64B - Move 64 bytes as direct store
+//
+let SchedRW = [WriteStore] in {
+def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}", []>,
+ T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>;
+def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}",
+ [(int_x86_movdir64b GR32:$dst, addr:$src)]>,
+ T8PD, AdSize32, Requires<[HasMOVDIR64B]>;
+def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}",
+ [(int_x86_movdir64b GR64:$dst, addr:$src)]>,
+ T8PD, AdSize64, Requires<[HasMOVDIR64B, In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
let SchedRW = [WriteSystem] in {
let Uses = [EAX] in
- def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>,
+ def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
TB, Requires<[HasCLZERO]>;
let usesCustomInserter = 1 in {
@@ -2740,12 +2889,15 @@ let Predicates = [HasTBM] in {
let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflushopt\t$src", [(int_x86_clflushopt addr:$src)],
- IIC_SSE_PREFETCH>, PD;
+ "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
- [(int_x86_clwb addr:$src)], IIC_SSE_PREFETCH>, PD;
+ [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable;
+
+let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
+def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
+ [(int_x86_cldemote addr:$src)]>, TB;
//===----------------------------------------------------------------------===//
// Subsystems.
@@ -2899,6 +3051,14 @@ def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtd", "intel">, Requires<[In32BitMode]>;
// Floating point stack aliases.
@@ -2976,19 +3136,19 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
// Likewise for btc/btr/bts.
def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
- (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
- (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
- (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
- (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+ (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
// clr aliases.
-def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
-def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
-def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
-def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clr{w}\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clr{l}\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clr{q}\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
// lods aliases. Accept the destination being omitted because it's implicit
// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
@@ -3001,10 +3161,10 @@ def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0>;
-def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0>;
-def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0>;
-def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// stos aliases. Accept the source being omitted because it's implicit in
@@ -3018,10 +3178,10 @@ def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0>;
-def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0>;
-def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0>;
-def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
// scas aliases. Accept the destination being omitted because it's implicit
@@ -3035,24 +3195,24 @@ def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0>;
-def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0>;
-def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0>;
-def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
// cmps aliases. Mnemonic suffix being omitted because it's implicit
// in the destination.
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0>;
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0>;
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0>;
-def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// movs aliases. Mnemonic suffix being omitted because it's implicit
// in the destination.
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0>;
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0>;
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0>;
-def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// div and idiv aliases for explicit A register.
def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
@@ -3074,7 +3234,7 @@ def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
-// Various unary fpstack operations default to operating on on ST1.
+// Various unary fpstack operations default to operating on ST1.
// For example, "fxch" -> "fxch %st(1)"
def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
def : InstAlias<"fadd", (ADD_FPrST0 ST1), 0>;
@@ -3133,28 +3293,22 @@ def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
-// We accept "fnstsw %eax" even though it only writes %ax.
-def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
-def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
-def : InstAlias<"fnstsw" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw" , (FNSTSW16r), 0>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0, "att">, Requires<[In64BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0, "att">, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0, "att">, Requires<[In16BitMode]>;
// "imul <imm>, B" is an alias for "imul <imm>, B, B".
@@ -3167,15 +3321,15 @@ def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i6
// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
// in the destination.
-def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0>;
-def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0>;
-def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0, "intel">;
// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
// in the source.
-def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0>;
-def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0>;
-def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0, "intel">;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0, "intel">;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0, "intel">;
// inb %dx -> inb %al, %dx
def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
@@ -3196,37 +3350,33 @@ def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Req
def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
-// Force mov without a suffix with a segment and mem to prefer the 'l' form of
-// the move. All segment/mem forms are equivalent, this has the shortest
-// encoding.
-def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV16sm SEGMENT_REG:$seg, i16mem:$mem), 0>;
-def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV16ms i16mem:$mem, SEGMENT_REG:$seg), 0>;
-
// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
-// Match 'movq GR64, MMX' as an alias for movd.
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas,
+// which supports this due to an old AMD documentation bug when 64-bit mode was
+// created.
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
// movsx aliases
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
-def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0, "att">;
// movzx aliases
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0, "att">;
// Note: No GR32->GR64 movzx form.
// outb %dx -> outb %al, %dx
@@ -3307,12 +3457,19 @@ def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
- (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
- (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src), 0>;
def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
+// In 64-bit mode, xchg %eax, %eax can't be encoded with the 0x90 opcode we
+// would get by default because it's defined as NOP. But xchg %eax, %eax implies
+// implicit zeroing of the upper 32 bits. So alias to the longer encoding.
+def : InstAlias<"xchg{l}\t{%eax, %eax|eax, eax}",
+ (XCHG32rr EAX, EAX), 0>, Requires<[In64BitMode]>;
+
+// xchg %rax, %rax is a nop in x86-64 and can be encoded as such. Without this
+// we emit an unneeded REX.w prefix.
+def : InstAlias<"xchg{q}\t{%rax, %rax|rax, rax}", (NOOP), 0>;
+
// These aliases exist to get the parser to prioritize matching 8-bit
// immediate encodings over matching the implicit ax/eax/rax encodings. By
// explicitly mentioning the A register here, these entries will be ordered
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
index a481644efdd6..aefeffedfc1a 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -20,179 +20,120 @@
// MMX Multiclasses
//===----------------------------------------------------------------------===//
-let Sched = WriteVecALU in {
-def MMX_INTALU_ITINS : OpndItins<
- IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
->;
-
-def MMX_INTALUQ_ITINS : OpndItins<
- IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
->;
-
-def MMX_PHADDSUBW : OpndItins<
- IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
->;
-
-def MMX_PHADDSUBD : OpndItins<
- IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
->;
-}
-
-let Sched = WriteVecLogic in
-def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins<
- IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
->;
-
-let Sched = WriteVecIMul in
-def MMX_PMUL_ITINS : OpndItins<
- IIC_MMX_PMUL, IIC_MMX_PMUL
->;
-
-let Sched = WriteVecIMul in {
-def MMX_PSADBW_ITINS : OpndItins<
- IIC_MMX_PSADBW, IIC_MMX_PSADBW
->;
-
-def MMX_MISC_FUNC_ITINS : OpndItins<
- IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
->;
-}
-
-def MMX_SHIFT_ITINS : ShiftOpndItins<
- IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
->;
-
-let Sched = WriteShuffle in {
-def MMX_UNPCK_H_ITINS : OpndItins<
- IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
->;
-
-def MMX_UNPCK_L_ITINS : OpndItins<
- IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
->;
-
-def MMX_PCK_ITINS : OpndItins<
- IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
->;
-
-def MMX_PSHUF_ITINS : OpndItins<
- IIC_MMX_PSHUF, IIC_MMX_PSHUF
->;
-} // Sched
-
-let Sched = WriteCvtF2I in {
-def MMX_CVT_PD_ITINS : OpndItins<
- IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
->;
-
-def MMX_CVT_PS_ITINS : OpndItins<
- IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
->;
+// Alias instruction that maps zero vector to pxor mmx.
+// This is expanded by ExpandPostRAPseudos to a pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
}
let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
// When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
- OpndItins itins, bit Commutable = 0,
+ X86FoldableSchedWrite sched, bit Commutable = 0,
X86MemOperand OType = i64mem> {
def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
- Sched<[itins.Sched]> {
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]> {
let isCommutable = Commutable;
}
def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, OType:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, Intrinsic IntId,
- Intrinsic IntId2, ShiftOpndItins itins> {
+ Intrinsic IntId2, X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite schedImm> {
def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
- Sched<[WriteVecShift]>;
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))],
- itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>,
- Sched<[WriteVecShift]>;
+ [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))]>,
+ Sched<[schedImm]>;
}
}
/// Unary MMX instructions requiring SSSE3.
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64, OpndItins itins> {
- def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
- Sched<[itins.Sched]>;
-
- def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst,
- (IntId64 (bitconvert (load_mmx addr:$src))))],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ Intrinsic IntId64, X86FoldableSchedWrite sched> {
+ def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>,
+ Sched<[sched]>;
+
+ def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (load_mmx addr:$src))))]>,
+ Sched<[sched.Folded]>;
}
/// Binary MMX instructions requiring SSSE3.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64, OpndItins itins,
+ Intrinsic IntId64, X86FoldableSchedWrite sched,
bit Commutable = 0> {
let isCommutable = Commutable in
- def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
+ def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
(IntId64 VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
/// PALIGN MMX instructions (require SSSE3).
-multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
- def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
+ X86FoldableSchedWrite sched> {
+ def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
- Sched<[WriteShuffle]>;
- def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ Sched<[sched]>;
+ def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1,
(bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, OpndItins itins, Domain d> {
+ string asm, X86FoldableSchedWrite sched, Domain d> {
def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set DstRC:$dst, (Int SrcRC:$src))], d>,
+ Sched<[sched]>;
def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
- Sched<[itins.Sched.Folded]>;
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -200,20 +141,20 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
PatFrag ld_frag, string asm, Domain d> {
def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
(ins DstRC:$src1, SrcRC:$src2), asm,
- [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
- NoItinerary, d>, Sched<[WriteCvtI2F]>;
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>,
+ Sched<[WriteCvtI2PS]>;
def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2), asm,
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
- NoItinerary, d>, Sched<[WriteCvtI2FLd]>;
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>,
+ Sched<[WriteCvtI2PS.Folded]>;
}
//===----------------------------------------------------------------------===//
// MMX EMMS Instruction
//===----------------------------------------------------------------------===//
-def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
- [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
+let SchedRW = [WriteEMMS] in
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
//===----------------------------------------------------------------------===//
// MMX Scalar Instructions
@@ -223,405 +164,407 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector GR32:$src)))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+ (x86mmx (scalar_to_vector GR32:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+ (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>,
+ Sched<[WriteVecLoad]>;
let Predicates = [HasMMX] in {
- let AddedComplexity = 15 in
- def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
- (MMX_MOVD64rr GR32:$src)>;
- let AddedComplexity = 20 in
- def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
- (MMX_MOVD64rm addr:$src)>;
+ def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
+ (MMX_MOVD64rr GR32:$src)>;
+ def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
+ (MMX_SET0)>;
+ def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
+ (MMX_MOVD64rm addr:$src)>;
}
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
- Sched<[WriteStore]>;
+ "movd\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecStore]>;
def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst,
- (MMX_X86movd2w (x86mmx VR64:$src)))],
- IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>,
- FoldGenData<"MMX_MOVD64rr">;
+ (MMX_X86movd2w (x86mmx VR64:$src)))]>,
+ Sched<[WriteVecMoveToGpr]>, FoldGenData<"MMX_MOVD64rr">;
let isBitcast = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (bitconvert GR64:$src))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (bitconvert GR64:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
- (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>;
+ (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}",
+ []>, Sched<[SchedWriteVecMoveLS.MMX.RM]>;
-// These are 64 bit moves, but since the OS X assembler doesn't
-// recognize a register-register movq, we write them as
-// movd.
-let SchedRW = [WriteMove], isBitcast = 1 in {
+let isBitcast = 1 in {
def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
(outs GR64:$dst), (ins VR64:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst,
- (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
-let hasSideEffects = 0 in
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert VR64:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
+let SchedRW = [WriteVecMove], hasSideEffects = 0, isMoveReg = 1 in {
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
- "movq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
-let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+ "movq\t{$src, $dst|$dst, $src}", []>;
+let isCodeGenOnly = 1, ForceDisassemble = 1 in
def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
- "movq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>, FoldGenData<"MMX_MOVQ64rr">;
-}
-} // SchedRW
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MMX_MOVQ64rr">;
+} // SchedRW, hasSideEffects, isMoveReg
+} // isBitcast
+
+def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
+ (MMX_MOVQ64rr_REV VR64:$dst, VR64:$src), 0>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
(outs), (ins i64mem:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.MMX.MR]>;
-let SchedRW = [WriteLoad] in {
+let SchedRW = [SchedWriteVecMoveLS.MMX.RM] in {
let canFoldAsLoad = 1 in
def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (load_mmx addr:$src))],
- IIC_MMX_MOVQ_RM>;
+ [(set VR64:$dst, (load_mmx addr:$src))]>;
} // SchedRW
-let SchedRW = [WriteStore] in
+let SchedRW = [SchedWriteVecMoveLS.MMX.MR] in
def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (x86mmx VR64:$src), addr:$dst)],
- IIC_MMX_MOVQ_RM>;
+ [(store (x86mmx VR64:$src), addr:$dst)]>;
-let SchedRW = [WriteMove] in {
+let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (bitconvert
(i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))))))],
- IIC_MMX_MOVQ_RR>;
+ (iPTR 0))))))]>;
def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64
(scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))],
- IIC_MMX_MOVQ_RR>;
+ (i64 (bitconvert (x86mmx VR64:$src))))))]>;
let isCodeGenOnly = 1, hasSideEffects = 1 in {
def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOVQ_RR>;
+ []>;
def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOVQ_RR>;
+ []>;
}
} // SchedRW
-let Predicates = [HasSSE1] in
+let Predicates = [HasMMX, HasSSE1] in
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
- [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
- IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>,
+ Sched<[SchedWriteVecMoveLSNT.MMX.MR]>;
let Predicates = [HasMMX] in {
- let AddedComplexity = 15 in
// movd to MMX register zero-extends
def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
(MMX_MOVD64rr GR32:$src)>;
- let AddedComplexity = 20 in
def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
(MMX_MOVD64rm addr:$src)>;
}
// Arithmetic Instructions
defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
// -- Addition
defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
- MMX_INTALU_ITINS, 1>;
-let Predicates = [HasSSE2] in
+ SchedWriteVecALU.MMX, 1>;
+let Predicates = [HasMMX, HasSSE2] in
defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
- MMX_INTALUQ_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
- MMX_INTALU_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
- MMX_PHADDSUBW>;
-defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
- MMX_PHADDSUBD>;
+ SchedWritePHAdd.MMX>;
+defm MMX_PHADDD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ SchedWritePHAdd.MMX>;
defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
- MMX_PHADDSUBW>;
+ SchedWritePHAdd.MMX>;
// -- Subtraction
defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
- MMX_INTALU_ITINS>;
-let Predicates = [HasSSE2] in
+ SchedWriteVecALU.MMX>;
+let Predicates = [HasMMX, HasSSE2] in
defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
- MMX_INTALUQ_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
- MMX_PHADDSUBW>;
+ SchedWritePHAdd.MMX>;
defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
- MMX_PHADDSUBD>;
+ SchedWritePHAdd.MMX>;
defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
- MMX_PHADDSUBW>;
+ SchedWritePHAdd.MMX>;
// -- Multiplication
defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
- MMX_PMUL_ITINS, 1>;
-let Predicates = [HasSSE1] in
+ SchedWriteVecIMul.MMX, 1>;
+let Predicates = [HasMMX, HasSSE1] in
defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
- MMX_PMUL_ITINS, 1>;
-let Predicates = [HasSSE2] in
+ SchedWriteVecIMul.MMX, 1>;
+let Predicates = [HasMMX, HasSSE2] in
defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
int_x86_ssse3_pmul_hr_sw,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
// -- Miscellanea
defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
- MMX_PMUL_ITINS, 1>;
+ SchedWriteVecIMul.MMX, 1>;
defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
- int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
-let Predicates = [HasSSE1] in {
+ int_x86_ssse3_pmadd_ub_sw,
+ SchedWriteVecIMul.MMX>;
+let Predicates = [HasMMX, HasSSE1] in {
defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
- MMX_MISC_FUNC_ITINS, 1>;
+ SchedWriteVecALU.MMX, 1>;
defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
- MMX_PSADBW_ITINS, 1>;
+ SchedWritePSADBW.MMX, 1>;
}
defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
- MMX_MISC_FUNC_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
- MMX_MISC_FUNC_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
- MMX_MISC_FUNC_ITINS>;
+ SchedWriteVecALU.MMX>;
let Constraints = "$src1 = $dst" in
- defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+ defm MMX_PALIGNR : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b,
+ SchedWriteShuffle.MMX>;
// Logical Instructions
defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
- MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+ SchedWriteVecLogic.MMX, 1>;
defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
- MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+ SchedWriteVecLogic.MMX, 1>;
defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
- MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+ SchedWriteVecLogic.MMX, 1>;
defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
- MMX_INTALU_ITINS_VECLOGICSCHED>;
+ SchedWriteVecLogic.MMX>;
// Shift Instructions
defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
- MMX_SHIFT_ITINS>;
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
// Comparison Instructions
defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
- MMX_INTALU_ITINS>;
+ SchedWriteVecALU.MMX>;
// -- Unpack Instructions
defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
int_x86_mmx_punpckhbw,
- MMX_UNPCK_H_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
int_x86_mmx_punpckhwd,
- MMX_UNPCK_H_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
int_x86_mmx_punpckhdq,
- MMX_UNPCK_H_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
int_x86_mmx_punpcklbw,
- MMX_UNPCK_L_ITINS,
+ SchedWriteShuffle.MMX,
0, i32mem>;
defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
int_x86_mmx_punpcklwd,
- MMX_UNPCK_L_ITINS,
+ SchedWriteShuffle.MMX,
0, i32mem>;
defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
int_x86_mmx_punpckldq,
- MMX_UNPCK_L_ITINS,
+ SchedWriteShuffle.MMX,
0, i32mem>;
// -- Pack Instructions
defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
- MMX_PCK_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
- MMX_PCK_ITINS>;
+ SchedWriteShuffle.MMX>;
defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
- MMX_PCK_ITINS>;
+ SchedWriteShuffle.MMX>;
// -- Shuffle Instructions
defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
- MMX_PSHUF_ITINS>;
+ SchedWriteVarShuffle.MMX>;
def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
- (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
- IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+ (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>,
+ Sched<[SchedWriteShuffle.MMX]>;
def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
- imm:$src2))],
- IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
+ imm:$src2))]>,
+ Sched<[SchedWriteShuffle.MMX.Folded]>;
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+ WriteCvtPS2I, SSEPackedSingle>, PS;
defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+ WriteCvtPD2I, SSEPackedDouble>, PD;
defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+ WriteCvtPS2I, SSEPackedSingle>, PS;
defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+ WriteCvtPD2I, SSEPackedDouble>, PD;
defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+ WriteCvtI2PD, SSEPackedDouble>, PD;
let Constraints = "$src1 = $dst" in {
defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
int_x86_sse_cvtpi2ps,
i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, PS;
+ SSEPackedSingle>, PS;
}
// Extract / Insert
-let Predicates = [HasSSE1] in
-def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
- (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
- "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
- imm:$src2))],
- IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Predicates = [HasMMX, HasSSE1] in
+def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+ imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
let Constraints = "$src1 = $dst" in {
-let Predicates = [HasSSE1] in {
- def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
- (outs VR64:$dst),
- (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
- "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- GR32orGR64:$src2, imm:$src3))],
- IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
-
- def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
- (outs VR64:$dst),
- (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
- "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- (i32 (anyext (loadi16 addr:$src2))),
- imm:$src3))],
- IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+let Predicates = [HasMMX, HasSSE1] in {
+ def MMX_PINSRWrr : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst),
+ (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ GR32orGR64:$src2, imm:$src3))]>,
+ Sched<[WriteVecInsert]>;
+
+ def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst),
+ (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ (i32 (anyext (loadi16 addr:$src2))),
+ imm:$src3))]>,
+ Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
}
// Mask creation
-let Predicates = [HasSSE1] in
+let Predicates = [HasMMX, HasSSE1] in
def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR64:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst,
- (int_x86_mmx_pmovmskb VR64:$src))],
- IIC_MMX_MOVMSK>, Sched<[WriteVecLogic]>;
+ (int_x86_mmx_pmovmskb VR64:$src))]>,
+ Sched<[WriteMMXMOVMSK]>;
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
@@ -634,29 +577,30 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
(x86mmx (MMX_MOVQ64rm addr:$src))>;
// Misc.
-let SchedRW = [WriteShuffle] in {
-let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in
+let SchedRW = [SchedWriteShuffle.MMX] in {
+let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in
def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
- IIC_MMX_MASKMOV>;
-let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+let Uses = [RDI], Predicates = [HasMMX, HasSSE1,In64BitMode] in
def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
- IIC_MMX_MASKMOV>;
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
}
// 64-bit bit convert.
-let Predicates = [HasSSE2] in {
+let Predicates = [HasMMX, HasSSE2] in {
def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
(MMX_MOVQ2FR64rr VR64:$src)>;
def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
(MMX_MOVFR642Qrr FR64:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
- (bc_v2i64 (v4i32 (int_x86_sse2_cvtps2dq VR128:$src))))),
+ (bc_v2i64 (v4i32 (X86cvtp2Int (v4f32 VR128:$src)))))),
(MMX_CVTPS2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))),
+ (MMX_CVTTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
(bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
(MMX_CVTTPS2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
index cb2b47b4f0c9..c1a8cc7c5fbf 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrMPX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
@@ -13,70 +13,68 @@
//
//===----------------------------------------------------------------------===//
-// FIXME: Investigate a better scheduler itinerary once MPX is used inside LLVM.
+// FIXME: Investigate a better scheduler class once MPX is used inside LLVM.
let SchedRW = [WriteSystem] in {
multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
-let mayLoad = 1 in {
- def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>,
+ def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, Not64BitMode]>;
- def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>,
+ def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, In64BitMode]>;
}
-}
defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
-let mayLoad = 1 in {
- def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
- def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, In64BitMode]>;
-}
+
def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
- def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
+ def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, In64BitMode]>;
}
-defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
-defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
-defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable;
-def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX]>;
+def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>, NotMemoryFoldable;
let mayLoad = 1 in {
-def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, Not64BitMode]>;
-def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, In64BitMode]>;
+def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
}
-def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1 in
+def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX]>, NotMemoryFoldable;
let mayStore = 1 in {
-def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, Not64BitMode]>;
-def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
- Requires<[HasMPX, In64BitMode]>;
+def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
-def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndstx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS,
+def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src),
+ "bndstx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
}
let mayLoad = 1 in
def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
- "bndldx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS,
+ "bndldx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
} // SchedRW
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
index f4331c5e2d93..488cc4438076 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSGX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
@@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//
// SGX instructions
-let SchedRW = [WriteSystem] in {
+let SchedRW = [WriteSystem], Predicates = [HasSGX] in {
// ENCLS - Execute an Enclave System Function of Specified Leaf Number
def ENCLS : I<0x01, MRM_CF, (outs), (ins),
"encls", []>, TB;
@@ -23,4 +23,8 @@ def ENCLS : I<0x01, MRM_CF, (outs), (ins),
// ENCLU - Execute an Enclave User Function of Specified Leaf Number
def ENCLU : I<0x01, MRM_D7, (outs), (ins),
"enclu", []>, TB;
+
+// ENCLV - Execute an Enclave VMM Function of Specified Leaf Number
+def ENCLV : I<0x01, MRM_C0, (outs), (ins),
+ "enclv", []>, TB;
} // SchedRW
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
index cb84f9aecf79..6a9b20998210 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -13,246 +13,6 @@
//
//===----------------------------------------------------------------------===//
-class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
- InstrItinClass rr = arg_rr;
- InstrItinClass rm = arg_rm;
- // InstrSchedModel info.
- X86FoldableSchedWrite Sched = WriteFAdd;
-}
-
-class SizeItins<OpndItins arg_s, OpndItins arg_d> {
- OpndItins s = arg_s;
- OpndItins d = arg_d;
-}
-
-class MoveLoadStoreItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
- InstrItinClass arg_mr> {
- InstrItinClass rr = arg_rr;
- InstrItinClass rm = arg_rm;
- InstrItinClass mr = arg_mr;
-}
-
-class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
- InstrItinClass arg_ri> {
- InstrItinClass rr = arg_rr;
- InstrItinClass rm = arg_rm;
- InstrItinClass ri = arg_ri;
-}
-
-// scalar
-let Sched = WriteFAdd in {
-def SSE_ALU_F32S : OpndItins<
- IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
->;
-
-def SSE_ALU_F64S : OpndItins<
- IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
->;
-}
-
-def SSE_ALU_ITINS_S : SizeItins<
- SSE_ALU_F32S, SSE_ALU_F64S
->;
-
-let Sched = WriteFMul in {
-def SSE_MUL_F32S : OpndItins<
- IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
->;
-
-def SSE_MUL_F64S : OpndItins<
- IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
->;
-}
-
-def SSE_MUL_ITINS_S : SizeItins<
- SSE_MUL_F32S, SSE_MUL_F64S
->;
-
-let Sched = WriteFDiv in {
-def SSE_DIV_F32S : OpndItins<
- IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
->;
-
-def SSE_DIV_F64S : OpndItins<
- IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
->;
-}
-
-def SSE_DIV_ITINS_S : SizeItins<
- SSE_DIV_F32S, SSE_DIV_F64S
->;
-
-// parallel
-let Sched = WriteFAdd in {
-def SSE_ALU_F32P : OpndItins<
- IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
->;
-
-def SSE_ALU_F64P : OpndItins<
- IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
->;
-}
-
-def SSE_ALU_ITINS_P : SizeItins<
- SSE_ALU_F32P, SSE_ALU_F64P
->;
-
-let Sched = WriteFMul in {
-def SSE_MUL_F32P : OpndItins<
- IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
->;
-
-def SSE_MUL_F64P : OpndItins<
- IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
->;
-}
-
-def SSE_MUL_ITINS_P : SizeItins<
- SSE_MUL_F32P, SSE_MUL_F64P
->;
-
-let Sched = WriteFDiv in {
-def SSE_DIV_F32P : OpndItins<
- IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
->;
-
-def SSE_DIV_F64P : OpndItins<
- IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
->;
-}
-
-def SSE_DIV_ITINS_P : SizeItins<
- SSE_DIV_F32P, SSE_DIV_F64P
->;
-
-let Sched = WriteVecLogic in
-def SSE_BIT_ITINS_P : OpndItins<
- IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
->;
-
-let Sched = WriteVecALU in {
-def SSE_INTALU_ITINS_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-def SSE_INTALUQ_ITINS_P : OpndItins<
- IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
->;
-}
-
-let Sched = WriteVecIMul in
-def SSE_INTMUL_ITINS_P : OpndItins<
- IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
->;
-
-// FIXME: Merge SSE_INTSHIFT_P + SSE_INTSHIFT_ITINS_P.
-def SSE_INTSHIFT_P : OpndItins<
- IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM
->;
-
-def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
- IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
->;
-
-def SSE_MOVA_ITINS : OpndItins<
- IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
->;
-
-def SSE_MOVA : MoveLoadStoreItins<
- IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM, IIC_SSE_MOVA_P_MR
->;
-
-def SSE_MOVU_ITINS : OpndItins<
- IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
->;
-
-def SSE_MOVU : MoveLoadStoreItins<
- IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM, IIC_SSE_MOVU_P_MR
->;
-
-def SSE_DPPD_ITINS : OpndItins<
- IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
->;
-
-def SSE_DPPS_ITINS : OpndItins<
- IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
->;
-
-def DEFAULT_ITINS : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-def SSE_EXTRACT_ITINS : OpndItins<
- IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
->;
-
-def SSE_INSERT_ITINS : OpndItins<
- IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
->;
-
-let Sched = WriteMPSAD in
-def SSE_MPSADBW_ITINS : OpndItins<
- IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
->;
-
-let Sched = WriteVecIMul in
-def SSE_PMULLD_ITINS : OpndItins<
- IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
->;
-
-// Definitions for backward compatibility.
-// The instructions mapped on these definitions uses a different itinerary
-// than the actual scheduling model.
-let Sched = WriteShuffle in
-def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteVecIMul in
-def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteShuffle in
-def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-let Sched = WriteShuffle in
-def SSE_PACK : OpndItins<
- IIC_SSE_PACK, IIC_SSE_PACK
->;
-
-let Sched = WriteMPSAD in
-def DEFAULT_ITINS_MPSADSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteFBlend in
-def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteBlend in
-def DEFAULT_ITINS_BLENDSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteVarBlend in
-def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
- IIC_ALU_NONMEM, IIC_ALU_MEM
->;
-
-let Sched = WriteFBlend in
-def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
-let Sched = WriteBlend in
-def SSE_INTALU_ITINS_BLEND_P : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//
@@ -260,21 +20,22 @@ def SSE_INTALU_ITINS_BLEND_P : OpndItins<
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, X86MemOperand x86memop,
- Domain d, OpndItins itins, bit Is2Addr = 1> {
+ Domain d, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
let isCommutable = 1 in {
def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
+ Sched<[sched]>;
}
def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
@@ -282,21 +43,21 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, RegisterClass RC,
ValueType VT, string asm, Operand memopr,
ComplexPattern mem_cpat, Domain d,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -304,27 +65,29 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in {
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag,
- Domain d, OpndItins itins, bit Is2Addr = 1> {
+ Domain d, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
- itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
string OpcodeStr, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched,
list<dag> pat_rr, list<dag> pat_rm,
bit Is2Addr = 1> {
let isCommutable = 1, hasSideEffects = 0 in
@@ -332,15 +95,15 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rr, IIC_SSE_BIT_P_RR, d>,
- Sched<[WriteVecLogic]>;
+ pat_rr, d>,
+ Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rm, IIC_SSE_BIT_P_RM, d>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>;
+ pat_rm, d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
@@ -360,7 +123,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
-// swizzled by ExecutionDepsFix to pxor.
+// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -415,22 +178,22 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(base_opc, asm_opr),
- [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
+ Sched<[SchedWriteFShuffle.XMM]>;
// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
- !strconcat(base_opc, asm_opr),
- [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>,
- FoldGenData<Name#rr>;
+ !strconcat(base_opc, asm_opr), []>,
+ Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string OpcodeStr,
- Domain d, string Name> {
+ Domain d, string Name, Predicate pred> {
// AVX
+ let Predicates = [UseAVX, OptForSize] in
defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
"V"#Name>,
@@ -438,18 +201,26 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
- VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG;
+ [(store RC:$src, addr:$dst)], d>,
+ VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
+ let Predicates = [pred, NoSSE41_Or_OptForSize] in
defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
"\t{$src2, $dst|$dst, $src2}", d, Name>;
}
def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
- Sched<[WriteStore]>;
+ [(store RC:$src, addr:$dst)], d>,
+ Sched<[WriteFStore]>;
+
+ def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>("V"#NAME#"rr_REV")
+ VR128:$dst, VR128:$src1, VR128:$src2), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
+ (!cast<Instruction>(NAME#"rr_REV")
+ VR128:$dst, VR128:$src2), 0>;
}
// Loading from memory automatically zeroing upper bits.
@@ -457,37 +228,32 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_pat, string OpcodeStr, Domain d> {
def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))],
- IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>, VEX_WIG;
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))],
- IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ Sched<[WriteFLoad]>;
}
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
- SSEPackedSingle, "MOVSS">, XS;
+ SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
- SSEPackedDouble, "MOVSD">, XD;
+ SSEPackedDouble, "MOVSD", UseSSE2>, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
SSEPackedSingle>, XS;
-
- let AddedComplexity = 20 in
- defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
- SSEPackedDouble>, XD;
+ defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+ SSEPackedDouble>, XD;
}
// Patterns
let Predicates = [UseAVX] in {
- let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
@@ -497,8 +263,6 @@ let Predicates = [UseAVX] in {
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
@@ -518,43 +282,45 @@ let Predicates = [UseAVX] in {
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
- }
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+}
- // Shuffle with VMOVSS
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (VMOVSSrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (VMOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>;
-
- // Shuffle with VMOVSD
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
+let Predicates = [UseAVX, OptForSize] in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>;
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
- // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
- // is during lowering, where it's not possible to recognize the fold cause
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
}
let Predicates = [UseSSE1] in {
- let Predicates = [NoSSE41], AddedComplexity = 15 in {
+ let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -563,72 +329,30 @@ let Predicates = [UseSSE1] in {
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}
- let AddedComplexity = 20 in {
// MOVSSrm already zeros the high parts of the register.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- }
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
-
- // Shuffle with MOVSS
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>;
}
let Predicates = [UseSSE2] in {
- let Predicates = [NoSSE41], AddedComplexity = 15 in {
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSD to the lower bits.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (MOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>;
- }
-
- let AddedComplexity = 20 in {
// MOVSDrm already zeros the high parts of the register.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- }
-
- // Shuffle with MOVSD
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>;
-
- // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
- // is during lowering, where it's not possible to recognize the fold because
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -645,142 +369,144 @@ def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d,
- OpndItins itins> {
-let hasSideEffects = 0 in
+ X86SchedWriteMoveLS sched> {
+let hasSideEffects = 0, isMoveReg = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
- Sched<[WriteFShuffle]>;
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+ Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
- Sched<[WriteLoad]>;
+ [(set RC:$dst, (ld_frag addr:$src))], d>,
+ Sched<[sched.RM]>;
}
let Predicates = [HasAVX, NoVLX] in {
-defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
- "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX, VEX_WIG;
-defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
- "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX, VEX_WIG;
-defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
- "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX, VEX_WIG;
-defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX, VEX_WIG;
-
-defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
- "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX, VEX_L, VEX_WIG;
-defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
- "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX, VEX_L, VEX_WIG;
-defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
- "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX, VEX_L, VEX_WIG;
-defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX, VEX_L, VEX_WIG;
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS, VEX, VEX_WIG;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD, VEX, VEX_WIG;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS, VEX, VEX_WIG;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD, VEX, VEX_WIG;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.YMM>,
+ PS, VEX, VEX_L, VEX_WIG;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.YMM>,
+ PD, VEX, VEX_L, VEX_WIG;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.YMM>,
+ PS, VEX, VEX_L, VEX_WIG;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.YMM>,
+ PD, VEX, VEX_L, VEX_WIG;
}
let Predicates = [UseSSE1] in {
-defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
- "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS;
-defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
- "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS;
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS;
}
let Predicates = [UseSSE2] in {
-defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
- "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD;
-defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD;
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD;
}
-let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
+ [(store (v4f32 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
+ [(store (v2f64 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
+} // SchedRW
+
+let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(store (v8f32 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
+ [(store (v4f64 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
} // SchedRW
+} // Predicate
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteFShuffle] in {
+ isMoveReg = 1 in {
+let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVAPSrr">;
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVAPDrr">;
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVUPSrr">;
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
- FoldGenData<"VMOVUPDrr">;
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
+} // SchedRW
+
+let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVAPSYrr">;
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVAPDYrr">;
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVUPSYrr">;
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVUPDYrr">;
-}
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
+} // SchedRW
+} // Predicate
// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
@@ -801,42 +527,66 @@ def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
(VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
-let SchedRW = [WriteStore] in {
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
+
+let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>;
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>;
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>;
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>;
+ [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteFShuffle] in {
+ isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">;
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVAPSrr">;
def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">;
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVAPDrr">;
def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">;
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVUPSrr">;
def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">;
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVUPDrr">;
}
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
+ (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
+ (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
+ (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
+ (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
+
let Predicates = [HasAVX, NoVLX] in {
// 256-bit load/store need to use floating point load/store in case we don't
// have AVX2. Execution domain fixing will convert to integer if AVX2 is
@@ -894,135 +644,82 @@ let Predicates = [UseSSE1] in {
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
-multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
- string base_opc, string asm_opr,
- InstrItinClass itin> {
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
+ string base_opc, string asm_opr> {
+  // No pattern as they need to be special cased between high and low.
+ let hasSideEffects = 0, mayLoad = 1 in
def PSrm : PI<opc, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
- !strconcat(base_opc, "s", asm_opr),
- [(set VR128:$dst,
- (psnode VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
- itin, SSEPackedSingle>, PS,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "s", asm_opr),
+ [], SSEPackedSingle>, PS,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
def PDrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "d", asm_opr),
[(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
- itin, SSEPackedDouble>, PD,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
-
+ SSEPackedDouble>, PD,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}
-multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
- string base_opc, InstrItinClass itin> {
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
+ string base_opc> {
let Predicates = [UseAVX] in
- defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- itin>, VEX_4V, VEX_WIG;
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+ VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
- defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
- "\t{$src2, $dst|$dst, $src2}",
- itin>;
+ defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
+ "\t{$src2, $dst|$dst, $src2}">;
}
-let AddedComplexity = 20 in {
- defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
- IIC_SSE_MOV_LH>;
-}
+defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movlpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (v2f64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
} // SchedRW
-let Predicates = [UseAVX] in {
- // Shuffle with VMOVLPS
- def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (VMOVLPSrm VR128:$src1, addr:$src2)>;
-
- // Shuffle with VMOVLPD
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1,
- (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (VMOVLPSmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (VMOVLPDmr addr:$src1, VR128:$src2)>;
-}
-
let Predicates = [UseSSE1] in {
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
(iPTR 0))), addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
- // Shuffle with MOVLPS
- def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlps VR128:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
+  // end up with a movsd or blend instead of shufp.
+  // No need for an aligned load; we're only loading 64 bits.
+ def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-}
-
-let Predicates = [UseSSE2] in {
- // Shuffle with MOVLPD
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1,
- (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//
-let AddedComplexity = 20 in {
- defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Unpckl, "movhp",
- IIC_SSE_MOV_LH>;
-}
+defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
@@ -1031,35 +728,27 @@ def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
+ (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+ (iPTR 0))), addr:$dst)]>;
} // SchedRW
let Predicates = [UseAVX] in {
- // VMOVHPS patterns
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
- (VMOVHPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
- (VMOVHPSrm VR128:$src1, addr:$src2)>;
-
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1067,23 +756,16 @@ let Predicates = [UseAVX] in {
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
- (iPTR 0))), addr:$dst),
- (VMOVHPDmr addr:$dst, VR128:$src)>;
-
- def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE1] in {
- // MOVHPS patterns
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
- (MOVHPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
+  // end up with a movsd or blend instead of shufp.
+  // No need for an aligned load; we're only loading 64 bits.
+ def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
(MOVHPSrm VR128:$src1, addr:$src2)>;
}
@@ -1097,11 +779,6 @@ let Predicates = [UseSSE2] in {
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
- (iPTR 0))), addr:$dst),
- (MOVHPDmr addr:$dst, VR128:$src)>;
-
- def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
@@ -1111,206 +788,149 @@ let Predicates = [UseSSE2] in {
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
-let AddedComplexity = 20, Predicates = [UseAVX] in {
+let Predicates = [UseAVX] in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
+ VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
+ let isCommutable = 1 in
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
+ VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
+ NotMemoryFoldable;
}
-let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+let Constraints = "$src1 = $dst" in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>;
let isCommutable = 1 in
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
-}
-
-//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Conversion Instructions
-//===----------------------------------------------------------------------===//
-
-let Sched = WriteCvtF2I in {
-def SSE_CVT_SS2SI_32 : OpndItins<
- IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
->;
-
-let Sched = WriteCvtF2I in
-def SSE_CVT_SS2SI_64 : OpndItins<
- IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
->;
-
-def SSE_CVT_SD2SI : OpndItins<
- IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
->;
-
-def SSE_CVT_PS2I : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
-
-def SSE_CVT_PD2I : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}
-let Sched = WriteCvtI2F in {
-def SSE_CVT_SI2SS : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_SI2SD : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_I2PS : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
+// TODO: This is largely to trick fastisel into ignoring the pattern.
+def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2),
+ (X86Unpckh node:$src1, node:$src2), [{
+ return N->getOperand(0) == N->getOperand(1);
+}]>;
-def SSE_CVT_I2PD : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
+let Predicates = [UseSSE2] in {
+ // TODO: This is a hack pattern to allow lowering to emit unpckh instead of
+ // movhlps for sse2 without changing a bunch of tests.
+ def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)),
+ (MOVHLPSrr VR128:$src, VR128:$src)>;
}
-let Sched = WriteCvtF2F in {
-def SSE_CVT_SD2SS : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_SS2SD : OpndItins<
- IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
->;
-
-def SSE_CVT_PD2PS : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
-
-def SSE_CVT_PS2PD : OpndItins<
- IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
->;
-
-def SSE_CVT_PH2PS : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
-
-def SSE_CVT_PS2PH : OpndItins<
- IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
->;
-}
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
-// FIXME: We probably want to match the rm form only when optimizing for
-// size, to avoid false depenendecies (see sse_fp_unop_s for details)
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, OpndItins itins> {
+ string asm, X86FoldableSchedWrite sched> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (OpNode SrcRC:$src))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
+ Sched<[sched]>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
- string asm, Domain d, OpndItins itins> {
+ string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
- [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
- itins.rr, d>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
[(set RC:$dst, (DstTy (sint_to_fp
- (SrcTy (bitconvert (ld_frag addr:$src))))))],
- itins.rm, d>, Sched<[itins.Sched.Folded]>;
+ (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
+ Sched<[sched.Folded]>;
}
}
-// FIXME: We probably want to match the rm form only when optimizing for
-// size, to avoid false depenendecies (see sse_fp_unop_s for details)
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm, OpndItins itins> {
+ X86MemOperand x86memop, string asm,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
- itins.rr>, Sched<[itins.Sched]>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
}
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_32>,
+ WriteCvtSS2I>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_64>,
+ WriteCvtSS2I>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>,
+ WriteCvtSD2I>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>,
+ WriteCvtSD2I>,
XD, VEX, VEX_W, VEX_LIG;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+ (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+ (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+ (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+ (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+ (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+ (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+ (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+ (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands, so
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
- SSE_CVT_SI2SS>, XS, VEX_4V, VEX_LIG;
+ WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
- SSE_CVT_SI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
+ WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
- SSE_CVT_SI2SD>, XD, VEX_4V, VEX_LIG;
+ WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
- SSE_CVT_SI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
+ WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+ (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+ (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -1333,50 +953,50 @@ let Predicates = [UseAVX] in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_32>, XS;
+ WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_64>, XS, REX_W;
+ WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>, XD;
+ WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SD2SI>, XD, REX_W;
+ WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SS>, XS;
+ WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SS>, XS, REX_W;
+ WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SD>, XD;
+ WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SI2SD>, XD, REX_W;
+ WriteCvtI2SD>, XD, REX_W;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+ (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+ (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+ (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+ (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+ (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+ (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+ (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+ (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
- (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
+ (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
- (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
+ (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
@@ -1384,81 +1004,72 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details)
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
- string asm, OpndItins itins> {
+ Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+ string asm, X86FoldableSchedWrite sched> {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set DstRC:$dst, (Int SrcRC:$src))]>,
+ Sched<[sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set DstRC:$dst, (Int mem_cpat:$src))]>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
- RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
- PatFrag ld_frag, string asm, OpndItins itins,
+ RegisterClass DstRC, X86MemOperand x86memop,
+ string asm, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
+let hasSideEffects = 0 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
- itins.rr>, Sched<[itins.Sched]>;
+ []>, Sched<[sched]>;
+ let mayLoad = 1 in
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
+}
}
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
- SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+ WriteCvtSD2I>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
- SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+ WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
- sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
- sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
- SSE_CVT_SI2SS, 0>, XS, VEX_4V;
+ i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
- SSE_CVT_SI2SS, 0>, XS, VEX_4V,
- VEX_W;
+ i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
- SSE_CVT_SI2SD, 0>, XD, VEX_4V;
+ i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
- SSE_CVT_SI2SD, 0>, XD,
- VEX_4V, VEX_W;
+ i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
}
let Constraints = "$src1 = $dst" in {
defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32,
- "cvtsi2ss{l}", SSE_CVT_SI2SS>, XS;
+ i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64,
- "cvtsi2ss{q}", SSE_CVT_SI2SS>, XS, REX_W;
+ i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32,
- "cvtsi2sd{l}", SSE_CVT_SI2SD>, XD;
+ i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64,
- "cvtsi2sd{q}", SSE_CVT_SI2SD>, XD, REX_W;
+ i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
}
} // isCodeGenOnly = 1
@@ -1469,113 +1080,113 @@ let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
- SSE_CVT_SS2SI_32>, XS, VEX;
+ WriteCvtSS2I>, XS, VEX;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si", SSE_CVT_SS2SI_64>,
+ "cvttss2si", WriteCvtSS2I>,
XS, VEX, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
- SSE_CVT_SD2SI>, XD, VEX;
+ WriteCvtSS2I>, XD, VEX;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si", SSE_CVT_SD2SI>,
+ "cvttsd2si", WriteCvtSS2I>,
XD, VEX, VEX_W;
}
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
- SSE_CVT_SS2SI_32>, XS;
+ WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+ "cvttss2si", WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
- SSE_CVT_SD2SI>, XD;
+ WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+ "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
} // isCodeGenOnly = 1
let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+ WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+ WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_32>, XS;
+ WriteCvtSS2I>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si",
- SSE_CVT_SS2SI_64>, XS, REX_W;
+ WriteCvtSS2I>, XS, REX_W;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_I2PS>,
+ SSEPackedSingle, WriteCvtI2PS>,
PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_I2PS>,
+ SSEPackedSingle, WriteCvtI2PSY>,
PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_I2PS>,
+ SSEPackedSingle, WriteCvtI2PS>,
PS, Requires<[UseSSE2]>;
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>;
+ (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>;
+ (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>;
+ (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>;
+ (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
}
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>;
+ (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>;
+ (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>;
+ (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>;
+ (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>;
+ (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>;
+ (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
/// SSE 2 Only
// Convert scalar double to scalar single
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
- (ins FR32:$src1, FR64:$src2),
- "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
+ (ins FR32:$src1, FR64:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
- (ins FR32:$src1, f64mem:$src2),
- "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
+ (ins FR32:$src1, f64mem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
def : Pat<(f32 (fpround FR64:$src)),
@@ -1584,69 +1195,67 @@ def : Pat<(f32 (fpround FR64:$src)),
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fpround FR64:$src))],
- IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+ [(set FR32:$dst, (fpround FR64:$src))]>,
+ Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
- "cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
- IIC_SSE_CVT_Scalar_RM>,
- XD,
- Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
+ XD, Requires<[UseSSE2, OptForSize]>,
+ Sched<[WriteCvtSD2SS.Folded]>;
let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
+ XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
-
+ VR128:$src1, sse_load_f64:$src2))]>,
+ XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2F]>;
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
+ XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ VR128:$src1, sse_load_f64:$src2))]>,
+ XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
-let hasSideEffects = 0, Predicates = [UseAVX] in {
+let hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>,
+ Requires<[UseAVX, OptForSize]>;
}
def : Pat<(f64 (fpextend FR32:$src)),
(VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -1657,14 +1266,13 @@ def : Pat<(extloadf32 addr:$src),
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (fpextend FR32:$src))],
- IIC_SSE_CVT_Scalar_RR>, XS,
- Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+ [(set FR64:$dst, (fpextend FR32:$src))]>,
+ XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (extloadf32 addr:$src))],
- IIC_SSE_CVT_Scalar_RM>, XS,
- Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+ [(set FR64:$dst, (extloadf32 addr:$src))]>,
+ XS, Requires<[UseSSE2, OptForSize]>,
+ Sched<[WriteCvtSS2SD.Folded]>;
// extload f32 -> f64. This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
@@ -1672,40 +1280,34 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
// Since these loads aren't folded into the fpextend, we have to match it
// explicitly here.
def : Pat<(fpextend (loadf32 addr:$src)),
- (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+ (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
+ []>, XS, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
+let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG,
- Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2F]>;
+ []>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSS2SD]>;
+let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ []>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
@@ -1732,9 +1334,19 @@ def : Pat<(v4f32 (X86Movss
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
(VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
+
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
@@ -1742,8 +1354,18 @@ def : Pat<(v2f64 (X86Movsd
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
(VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]
let Predicates = [UseSSE2] in {
@@ -1766,8 +1388,18 @@ def : Pat<(v2f64 (X86Movsd
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
(CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]
let Predicates = [UseSSE1] in {
@@ -1778,39 +1410,51 @@ def : Pat<(v4f32 (X86Movss
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
(CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]
+let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
+}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
+ Sched<[WriteCvtPS2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
+ Sched<[WriteCvtPS2ILd]>;
// Convert Packed Double FP to Packed DW Integers
@@ -1822,7 +1466,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
- VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
@@ -1831,37 +1475,37 @@ def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
- Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+ (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+ (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
}
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
+ Sched<[WriteCvtPD2ILd]>;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
+ Sched<[WriteCvtPD2I]>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
@@ -1869,43 +1513,61 @@ let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
- Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
+ VEX, VEX_L,
+ Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (VCVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
+ (VCVTTPS2DQrm addr:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+ (VCVTTPS2DQYrr VR256:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
+ (VCVTTPS2DQYrm addr:$src)>;
}
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
+ Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
+ Sched<[WriteCvtPS2ILd]>;
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (CVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ (CVTTPS2DQrm addr:$src)>;
+}
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
@@ -1914,76 +1576,80 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+ (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
+ (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+ (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
let Predicates = [HasAVX, NoVLX] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (VCVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (VCVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQrm addr:$src)>;
- }
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (VCVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+ (VCVTPD2DQrm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (VCVTTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+ (VCVTTPD2DQrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
+ Sched<[WriteCvtPD2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
+ Sched<[WriteCvtPD2ILd]>;
let Predicates = [UseSSE2] in {
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (CVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
- (CVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (CVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
- (CVTTPD2DQrm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (CVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
+ (CVTPD2DQrm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (CVTTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
+ (CVTTPD2DQrm addr:$src)>;
} // Predicates = [UseSSE2]
// Convert packed single to packed double
@@ -1991,31 +1657,31 @@ let Predicates = [HasAVX, NoVLX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
+ PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
+ PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
+ PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
+ PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
+ PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
+ PS, Sched<[WriteCvtPS2PD.Folded]>;
}
// Convert Packed DW Integers to Packed Double FP
@@ -2025,35 +1691,36 @@ def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG;
+ VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
- VEX, Sched<[WriteCvtI2F]>, VEX_WIG;
+ VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2FLd]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
+ VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2F]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+ Sched<[WriteCvtI2PD]>;
// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
@@ -2078,8 +1745,8 @@ let Predicates = [UseSSE2] in {
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
+ VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
@@ -2087,35 +1754,35 @@ def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
+ VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
+ (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
+ [(set VR128:$dst, (fpround VR256:$src))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
+ [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
}
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;
+ (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
- IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
+ Sched<[WriteCvtPD2PS]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+ [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
+ Sched<[WriteCvtPD2PS.Folded]>;
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
@@ -2123,64 +1790,53 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
let Predicates = [HasAVX, NoVLX] in {
// Match fpround and fpextend for 128/256-bit conversions
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (VCVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSrm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (VCVTPD2PSrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+ (VCVTPD2PSrm addr:$src)>;
}
let Predicates = [UseSSE2] in {
// Match fpround and fpextend for 128 conversions
- let AddedComplexity = 15 in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (CVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
- (CVTPD2PSrm addr:$src)>;
- }
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (CVTPD2PSrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
+ (CVTPD2PSrm addr:$src)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//
-let Sched = WriteFAdd in
-def SSE_COMIS : OpndItins<
- IIC_SSE_COMIS_RR, IIC_SSE_COMIS_RM
->;
-
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
+ Sched<[sched]>;
def rm : SIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), imm:$cc))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (ld_frag addr:$src2), imm:$cc))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
- IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
+ (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
+ Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
- IIC_SSE_ALU_F32S_RM>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
+ Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
}
}
@@ -2188,43 +1844,41 @@ let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S>, // same latency as 32 bit compare
+ SchedWriteFCmpSizes.PD.Scl>,
XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
- XS;
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl>, XS;
let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSE_ALU_F64S>, XD;
+ SchedWriteFCmpSizes.PD.Scl>, XD;
}
multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
- Intrinsic Int, string asm, OpndItins itins,
+ Intrinsic Int, string asm, X86FoldableSchedWrite sched,
ComplexPattern mem_cpat> {
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, imm:$cc))],
- itins.rr>,
- Sched<[itins.Sched]>;
+ VR128:$src, imm:$cc))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- mem_cpat:$src, imm:$cc))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ mem_cpat:$src, imm:$cc))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let isCodeGenOnly = 1 in {
@@ -2232,174 +1886,168 @@ let isCodeGenOnly = 1 in {
let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, sse_load_f32>, XS, VEX_4V;
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, sse_load_f64>, // same latency as f32
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SSE_ALU_F32S, sse_load_f32>, XS;
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SSE_ALU_F64S, sse_load_f64>, XD;
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}
}
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
- ValueType vt, X86MemOperand x86memop,
- PatFrag ld_frag, string OpcodeStr,
- OpndItins itins> {
+ ValueType vt, X86MemOperand x86memop,
+ PatFrag ld_frag, string OpcodeStr,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- itins.rr>,
- Sched<[itins.Sched]>;
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
- (ld_frag addr:$src2)))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (ld_frag addr:$src2)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
- ValueType vt, Operand memop,
- ComplexPattern mem_cpat, string OpcodeStr,
- OpndItins itins> {
- def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ ValueType vt, Operand memop,
+ ComplexPattern mem_cpat, string OpcodeStr,
+ X86FoldableSchedWrite sched> {
+ def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- itins.rr>,
- Sched<[itins.Sched]>;
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
- def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
+ def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
- mem_cpat:$src2))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ mem_cpat:$src2))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG;
+ "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG;
+ "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG;
+ "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG;
+ "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSE_COMIS>, PS, VEX, VEX_WIG;
- defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSE_COMIS>, PD, VEX, VEX_WIG;
-
- defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSE_COMIS>, PS, VEX, VEX_WIG;
- defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSE_COMIS>, PD, VEX, VEX_WIG;
+ defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
+ defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;
+
+ defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
+ defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSE_COMIS>, PS;
+ "ucomiss", WriteFCom>, PS;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSE_COMIS>, PD;
+ "ucomisd", WriteFCom>, PD;
let Pattern = []<dag> in {
defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss", SSE_COMIS>, PS;
+ "comiss", WriteFCom>, PS;
defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd", SSE_COMIS>, PD;
+ "comisd", WriteFCom>, PD;
}
let isCodeGenOnly = 1 in {
- defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSE_COMIS>, PS;
- defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSE_COMIS>, PD;
-
- defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSE_COMIS>, PS;
- defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSE_COMIS>, PD;
+ defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", WriteFCom>, PS;
+ defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", WriteFCom>, PD;
+
+ defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", WriteFCom>, PS;
+ defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", WriteFCom>, PD;
}
} // Defs = [EFLAGS]
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
Operand CC, ValueType VT, string asm,
- string asm_alt, Domain d,
- PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
+ string asm_alt, X86FoldableSchedWrite sched,
+ Domain d, PatFrag ld_frag> {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))],
- itins.rr, d>,
- Sched<[WriteFAdd]>;
+ [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
+ Sched<[sched]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst,
- (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))],
- itins.rm, d>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
- asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
+ asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
let mayLoad = 1 in
def rmi_alt : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
- asm_alt, [], itins.rm, d>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
}
}
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L;
+ SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L;
+ SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedSingle, memopv4f32, SSE_ALU_F32P>, PS;
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
"cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedDouble, memopv2f64, SSE_ALU_F64P>, PD;
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}
def CommutableCMPCC : PatLeaf<(imm), [{
- return (N->getZExtValue() == 0x00 || N->getZExtValue() == 0x03 ||
- N->getZExtValue() == 0x04 || N->getZExtValue() == 0x07);
+ uint64_t Imm = N->getZExtValue() & 0x7;
+ return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
// Patterns to select compares with loads in first operand.
@@ -2453,120 +2101,114 @@ let Predicates = [UseSSE1] in {
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
-let Sched = WriteFShuffle in
-def SSE_SHUFP : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
- OpndItins itins, Domain d> {
+ X86FoldableSchedWrite sched, Domain d> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
- (i8 imm:$src3))))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
- (i8 imm:$src3))))], itins.rr, d>,
- Sched<[itins.Sched]>;
+ (i8 imm:$src3))))], d>,
+ Sched<[sched]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
+ PS, VEX_4V, VEX_WIG;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv8f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
+ PS, VEX_4V, VEX_L, VEX_WIG;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv2f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
+ PD, VEX_4V, VEX_WIG;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
+ PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv4f32, SSE_SHUFP, SSEPackedSingle>, PS;
+ memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SSE_SHUFP, SSEPackedDouble>, PD;
+ memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//
-let Sched = WriteFShuffle in
-def SSE_UNPCK : OpndItins<
- IIC_SSE_UNPCK, IIC_SSE_UNPCK
->;
-
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
PatFrag mem_frag, RegisterClass RC,
X86MemOperand x86memop, string asm,
- OpndItins itins, Domain d, bit IsCommutable = 0> {
+ X86FoldableSchedWrite sched, Domain d,
+ bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def rr : PI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
- (vt (OpNode RC:$src1, RC:$src2)))],
- itins.rr, d>, Sched<[itins.Sched]>;
+ (vt (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
def rm : PI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
asm, [(set RC:$dst,
(vt (OpNode RC:$src1,
- (mem_frag addr:$src2))))],
- itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (mem_frag addr:$src2))))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedDouble, 1>, PD;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedSingle>, PS;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
- SSE_UNPCK, SSEPackedDouble>, PD;
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
let Predicates = [HasAVX1Only] in {
@@ -2598,8 +2240,8 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
string asm, Domain d> {
def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
- Sched<[WriteVecLogic]>;
+ [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
+ Sched<[WriteFMOVMSK]>;
}
let Predicates = [HasAVX] in {
@@ -2627,7 +2269,7 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, OpndItins itins,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
bit IsCommutable, bit Is2Addr> {
let isCommutable = IsCommutable in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
@@ -2635,47 +2277,48 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
ValueType OpVT128, ValueType OpVT256,
- OpndItins itins, bit IsCommutable = 0, Predicate prd> {
+ X86SchedWriteWidths sched, bit IsCommutable,
+ Predicate prd> {
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
- VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V, VEX_WIG;
+ VR128, loadv2i64, i128mem, sched.XMM,
+ IsCommutable, 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
- memopv2i64, i128mem, itins, IsCommutable, 1>;
+ memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
- OpVT256, VR256, loadv4i64, i256mem, itins,
+ OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}
// These are ordered here for pattern ordering requirements with the fp versions
defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
- SSE_BIT_ITINS_P, 1, NoVLX>;
+ SchedWriteVecLogic, 1, NoVLX>;
defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
- SSE_BIT_ITINS_P, 1, NoVLX>;
+ SchedWriteVecLogic, 1, NoVLX>;
defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
- SSE_BIT_ITINS_P, 1, NoVLX>;
+ SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
- SSE_BIT_ITINS_P, 0, NoVLX>;
+ SchedWriteVecLogic, 0, NoVLX>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
@@ -2686,41 +2329,41 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f256mem,
+ !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
[], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f256mem,
+ !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
[], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f128mem,
+ !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
[], [], 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f128mem,
+ !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
[], [], 0>, PD, VEX_4V, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f128mem,
+ !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
[], []>, PS;
defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f128mem,
+ !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
[], []>, PD;
}
}
-defm AND : sse12_fp_packed_logical<0x54, "and", and>;
-defm OR : sse12_fp_packed_logical<0x56, "or", or>;
-defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
+defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
+defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
+defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
let isCommutable = 0 in
- defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+ defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
@@ -2747,78 +2390,94 @@ let Predicates = [HasAVX1Only] in {
let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
// Use packed logical operations for scalar ops.
def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VANDPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VXORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (VANDNPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VANDPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VXORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (VANDNPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
}
let Predicates = [UseSSE1] in {
// Use packed logical operations for scalar ops.
def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (ANDPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (ORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (XORPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS (ANDNPSrr
- (COPY_TO_REGCLASS FR32:$src1, VR128),
- (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ (COPY_TO_REGCLASS
+ (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
+ (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
+ FR32)>;
}
let Predicates = [UseSSE2] in {
// Use packed logical operations for scalar ops.
def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (ANDPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (ORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (XORPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS (ANDNPDrr
- (COPY_TO_REGCLASS FR64:$src1, VR128),
- (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ (COPY_TO_REGCLASS
+ (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
+ (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
+ FR64)>;
}
// Patterns for packed operations when we don't have integer type available.
@@ -2858,99 +2517,99 @@ def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SizeItins itins> {
+ SDNode OpNode, X86SchedWriteSizes sched> {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_WIG;
+ SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_WIG;
+ SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, VR256, v8f32, f256mem, loadv8f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, VR256, v4f64, f256mem, loadv4f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
v4f32, f128mem, memopv4f32, SSEPackedSingle,
- itins.s>, PS;
+ sched.PS.XMM>, PS;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
v2f64, f128mem, memopv2f64, SSEPackedDouble,
- itins.d>, PD;
+ sched.PD.XMM>, PD;
}
}
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins> {
+ X86SchedWriteSizes sched> {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
- OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
+ OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
XS, VEX_4V, VEX_LIG, VEX_WIG;
defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
- OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
+ OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle,
- itins.s>, XS;
+ sched.PS.Scl>, XS;
defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
OpNode, FR64, f64mem, SSEPackedDouble,
- itins.d>, XD;
+ sched.PD.Scl>, XD;
}
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
- SizeItins itins> {
+ X86SchedWriteSizes sched> {
defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
+ SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, itins.s>, XS;
+ SSEPackedSingle, sched.PS.Scl>, XS;
defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, itins.d>, XD;
+ SSEPackedDouble, sched.PD.Scl>, XD;
}
}
// Binary Arithmetic instructions
-defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SSE_ALU_ITINS_S>;
-defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
- basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SSE_MUL_ITINS_S>;
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
+ basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
- defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag,SSE_ALU_ITINS_S>;
- defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
- basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5E, "div", null_frag,SSE_DIV_ITINS_S>;
- defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SSE_ALU_ITINS_S>;
- defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SSE_ALU_ITINS_S>;
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
+ basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}
let isCodeGenOnly = 1 in {
- defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
- defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}
// Patterns used to select SSE scalar fp arithmetic instructions from
@@ -2995,79 +2654,41 @@ let isCodeGenOnly = 1 in {
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
- let Predicates = [UseSSE1] in {
- // extracted scalar math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- // vector math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
- }
-
- // Repeat everything for AVX.
- let Predicates = [UseAVX] in {
- // extracted scalar math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- // vector math op with insert via movss
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
- }
-}
-
-defm : scalar_math_f32_patterns<fadd, "ADD">;
-defm : scalar_math_f32_patterns<fsub, "SUB">;
-defm : scalar_math_f32_patterns<fmul, "MUL">;
-defm : scalar_math_f32_patterns<fdiv, "DIV">;
-
-multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
- let Predicates = [UseSSE2] in {
- // extracted scalar math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- // vector math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
+ ValueType VT, ValueType EltTy,
+ RegisterClass RC, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ // extracted scalar math op with insert via movss/movsd
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ RC:$src))))),
+ (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
+ (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
}
- // Repeat everything for AVX.
+ // Repeat for AVX versions of the instructions.
let Predicates = [UseAVX] in {
- // extracted scalar math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- // vector math op with insert via movsd
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ // extracted scalar math op with insert via movss/movsd
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ RC:$src))))),
+ (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
+ (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
}
}
-defm : scalar_math_f64_patterns<fadd, "ADD">;
-defm : scalar_math_f64_patterns<fsub, "SUB">;
-defm : scalar_math_f64_patterns<fmul, "MUL">;
-defm : scalar_math_f64_patterns<fdiv, "DIV">;
-
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
@@ -3076,98 +2697,46 @@ defm : scalar_math_f64_patterns<fdiv, "DIV">;
///
/// And, we have a special variant form for a full-vector intrinsic form.
-let Sched = WriteFSqrt in {
-def SSE_SQRTPS : OpndItins<
- IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
->;
-
-def SSE_SQRTSS : OpndItins<
- IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
->;
-
-def SSE_SQRTPD : OpndItins<
- IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
->;
-
-def SSE_SQRTSD : OpndItins<
- IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
->;
-}
-
-let Sched = WriteFRsqrt in {
-def SSE_RSQRTPS : OpndItins<
- IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
->;
-
-def SSE_RSQRTSS : OpndItins<
- IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
->;
-}
-
-def SSE_RSQRT_P : SizeItins<
- SSE_RSQRTPS, SSE_RSQRTPS
->;
-
-def SSE_RSQRT_S : SizeItins<
- SSE_RSQRTSS, SSE_RSQRTSS
->;
-
-let Sched = WriteFRcp in {
-def SSE_RCPP : OpndItins<
- IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
->;
-
-def SSE_RCPS : OpndItins<
- IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
->;
-}
-
-def SSE_RCP_P : SizeItins<
- SSE_RCPP, SSE_RCPP
->;
-
-def SSE_RCP_S : SizeItins<
- SSE_RCPS, SSE_RCPS
->;
-
/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop,
- Operand intmemop, ComplexPattern int_cpat,
- Intrinsic Intr,
- SDNode OpNode, Domain d, OpndItins itins,
- Predicate target, string Suffix> {
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
+ X86FoldableSchedWrite sched, Predicate target> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
- [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
+ [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
Requires<[target]>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
!strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
- [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>,
+ [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
+ Sched<[sched.Folded, ReadAfterLd]>,
Requires<[target, OptForSize]>;
let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
+ Sched<[sched]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
+}
+
+multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
+ ComplexPattern int_cpat, Intrinsic Intr,
+ Predicate target, string Suffix> {
let Predicates = [target] in {
// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
def : Pat<(Intr VR128:$src),
- (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
+ (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
}
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
@@ -3178,35 +2747,47 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
def : Pat<(Intr int_cpat:$src2),
- (!cast<Instruction>(NAME#Suffix##m_Int)
+ (!cast<Instruction>(NAME#m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
}
+multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
+ Intrinsic Intr, Predicate target> {
+ let Predicates = [target] in {
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>(NAME#r_Int) VR128:$src,
+ VR128:$src)>;
+ }
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr int_cpat:$src2),
+ (!cast<Instruction>(NAME#m_Int)
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
+ }
+}
+
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop,
- Operand intmemop, ComplexPattern int_cpat,
- Intrinsic Intr, SDNode OpNode, Domain d,
- OpndItins itins, Predicate target, string Suffix> {
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
+ X86FoldableSchedWrite sched, Predicate target> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins.rr, d>, Sched<[itins.Sched]>;
+ [], d>, Sched<[sched]>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [], d>, Sched<[sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[itins.Sched.Folded]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, intmemop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
}
}
@@ -3218,164 +2799,191 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// which has a clobber before the rcp, vs.
// vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
- // the partial register store, either in ExecutionDepsFix or with smarter RA.
+ // the partial register store, either in BreakFalseDeps or with smarter RA.
let Predicates = [target] in {
- def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
+ def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
- def : Pat<(Intr VR128:$src),
- (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
- VR128:$src)>;
}
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr int_cpat:$src2),
- (!cast<Instruction>("V"#NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), addr:$src2)>;
def : Pat<(ScalarVT (OpNode (load addr:$src))),
- (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
+ (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
}
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, list<Predicate> prds> {
+ X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
+ VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
+ Sched<[sched.XMM.Folded]>;
}
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
+ VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
+ [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}
def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
- Sched<[itins.Sched]>;
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
+ Sched<[sched.XMM.Folded]>;
+}
+
+multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ UseSSE1, "SS">, XS;
+ defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate AVXTarget> {
- defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
- ssmem, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, UseSSE1, "SS">, XS;
- defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
- f32mem, ssmem, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
- VEX_LIG, VEX_WIG, NotMemoryFoldable;
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
+ ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
+ f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate AVXTarget> {
- defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
- sdmem, sse_load_f64,
- !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
- defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
- f64mem, sdmem, sse_load_f64,
- !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
- XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
+ sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
+ f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
}
// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>,
- sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
// There is no f64 version of the reciprocal approximation instructions.
-// TODO: We should add *scalar* op patterns for these just like we have for
-// the binops above. If the binop and unop patterns could all be unified
-// that would be even better.
+multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
+ ValueType VT, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+}
+
+multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
+ ValueType VT, bits<8> ImmV,
+ Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
+ }
+}
+
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
-multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
- SDNode Move, ValueType VT,
- Predicate BasePredicate> {
+multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
+ SDNode Move, ValueType VT,
+ Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
- (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
// Repeat for AVX versions of the instructions.
let Predicates = [HasAVX] in {
def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
- (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
}
-defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
- v4f32, UseSSE1>;
-defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
- v4f32, UseSSE1>;
-defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
- v4f32, UseSSE1>;
-defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
- v2f64, UseSSE2>;
+defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
+ v4f32, UseSSE1>;
//===----------------------------------------------------------------------===//
@@ -3383,77 +2991,74 @@ defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
//===----------------------------------------------------------------------===//
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteStore] in {
let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_WIG;
+ addr:$dst)]>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2f64 VR128:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_WIG;
-
-let ExeDomain = SSEPackedInt in
-def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
- (ins i128mem:$dst, VR128:$src),
- "movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v2i64 VR128:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_WIG;
+ addr:$dst)]>, VEX, VEX_WIG;
+} // SchedRW
+let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f32 VR256:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-let ExeDomain = SSEPackedInt in
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG;
+} // SchedRW
+
+let ExeDomain = SSEPackedInt in {
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)]>, VEX, VEX_WIG,
+ Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4i64 VR256:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-}
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG,
+ Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
+} // ExeDomain
+} // Predicates
+let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVNT>;
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVNT>;
+ [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
+} // SchedRW
-let ExeDomain = SSEPackedInt in
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVNT>;
+ [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
+let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti{l}\t{$src, $dst|$dst, $src}",
- [(nontemporalstore (i32 GR32:$src), addr:$dst)],
- IIC_SSE_MOVNT>,
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movnti{q}\t{$src, $dst|$dst, $src}",
- [(nontemporalstore (i64 GR64:$src), addr:$dst)],
- IIC_SSE_MOVNT>,
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
PS, Requires<[HasSSE2]>;
-} // SchedRW = [WriteStore]
+} // SchedRW = [WriteStoreNT]
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
@@ -3489,47 +3094,40 @@ let Predicates = [UseSSE2] in {
// Prefetch intrinsic.
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
- "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
- "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
- "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
- "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
- IIC_SSE_PREFETCH>, TB;
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}
// FIXME: How should the flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
- IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+ PS, Requires<[HasSSE2]>;
}
let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
- "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS;
+ "pause", [(int_x86_sse2_pause)]>, OBXS;
}
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
-def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
- "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
PS, Requires<[HasSSE1]>;
-def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
- "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
- TB, Requires<[HasSSE2]>;
-def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
- "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
- TB, Requires<[HasMFence]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
+ PS, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
+ PS, Requires<[HasMFence]>;
} // SchedRW
def : Pat<(X86MFence), (MFENCE)>;
@@ -3539,18 +3137,18 @@ def : Pat<(X86MFence), (MFENCE)>;
//===----------------------------------------------------------------------===//
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>, VEX_WIG;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
+ VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>, VEX_WIG;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
+ VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
+ TB, Sched<[WriteLDMXCSR]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
+ TB, Sched<[WriteSTMXCSR]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -3558,128 +3156,122 @@ def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
let ExeDomain = SSEPackedInt in { // SSE integer instructions
-let hasSideEffects = 0, SchedRW = [WriteMove] in {
+let hasSideEffects = 0 in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX, VEX_WIG;
-def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX, VEX_L, VEX_WIG;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX, VEX_WIG;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX, VEX_L, VEX_WIG;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}
// For Disassembler
-let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>,
- VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>,
+ VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVDQAYrr">;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>,
- VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>,
+ VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
- FoldGenData<"VMOVDQUYrr">;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
- hasSideEffects = 0, SchedRW = [WriteLoad] in {
-let Predicates = [HasAVX,NoVLX] in
+ hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (alignedloadv2i64 addr:$src))],
- IIC_SSE_MOVA_P_RM>, VEX, VEX_WIG;
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX, VEX_L, VEX_WIG;
-let Predicates = [HasAVX,NoVLX] in
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ VEX, VEX_L, VEX_WIG;
def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (loadv2i64 addr:$src))],
- IIC_SSE_MOVU_P_RM>, XS, VEX, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2i64 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>,
+ XS, VEX, VEX_WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX, VEX_L, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ XS, VEX, VEX_L, VEX_WIG;
}
-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
-let Predicates = [HasAVX,NoVLX] in
+let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
- (ins i128mem:$dst, VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2i64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
+ (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
- (ins i256mem:$dst, VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX, VEX_L, VEX_WIG;
-let Predicates = [HasAVX,NoVLX] in
+ (ins i256mem:$dst, VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",
- [(store (v2i64 VR128:$src), addr:$dst)], IIC_SSE_MOVU_P_MR>,
- XS, VEX, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(store (v2i64 VR128:$src), addr:$dst)]>,
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX, VEX_L, VEX_WIG;
+ "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
+ Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
}
-let SchedRW = [WriteMove] in {
+let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+ "movdqa\t{$src, $dst|$dst, $src}", []>;
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ XS, Requires<[UseSSE2]>;
}
// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">;
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVDQArr">;
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>,
- FoldGenData<"MOVDQUrr">;
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
}
} // SchedRW
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
- hasSideEffects = 0, SchedRW = [WriteLoad] in {
+ hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqa\t{$src, $dst|$dst, $src}",
- [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
- IIC_SSE_MOVA_P_RM>;
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
- IIC_SSE_MOVU_P_RM>,
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
XS, Requires<[UseSSE2]>;
}
-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+let mayStore = 1, hasSideEffects = 0,
+ SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}",
- [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
- IIC_SSE_MOVA_P_MR>;
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [/*(store (v2i64 VR128:$src), addr:$dst)*/],
- IIC_SSE_MOVU_P_MR>,
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
XS, Requires<[UseSSE2]>;
}
@@ -3696,6 +3288,22 @@ def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
(VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
+
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
+ (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
+ (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
+
let Predicates = [HasAVX, NoVLX] in {
// Additional patterns for other integer sizes.
def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
@@ -3716,123 +3324,109 @@ let Predicates = [HasAVX, NoVLX] in {
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteVecIMul in
-def SSE_PMADD : OpndItins<
- IIC_SSE_PMADD, IIC_SSE_PMADD
->;
-
let ExeDomain = SSEPackedInt in { // SSE integer instructions
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 1, NoVLX>;
+ SchedWriteVecALU, 1, NoVLX>;
defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
- SSE_INTALUQ_ITINS_P, 1, NoVLX>;
+ SchedWriteVecALU, 1, NoVLX>;
defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 0, NoVLX>;
+ SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
- SSE_INTALUQ_ITINS_P, 0, NoVLX>;
+ SchedWriteVecALU, 0, NoVLX>;
defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
+ SchedWriteVecIMul, 1, NoVLX>;
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
- loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V, VEX_WIG;
+ loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+ VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
- VR256, loadv4i64, i256mem, SSE_PMADD,
+ VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
- memopv2i64, i128mem, SSE_PMADD>;
+ memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
+ loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
- loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
+ loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
- memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
-
-let Predicates = [HasAVX, NoVLX] in
-defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V, VEX_WIG;
-let Predicates = [HasAVX2, NoVLX] in
-defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
- VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG;
-let Constraints = "$src1 = $dst" in
-defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
- memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
+ memopv2i64, i128mem, SchedWritePSADBW.XMM>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
@@ -3841,6 +3435,8 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, SDNode OpNode,
SDNode OpNode2, RegisterClass RC,
+ X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite schedImm,
ValueType DstVT, ValueType SrcVT,
PatFrag ld_frag, bit Is2Addr = 1> {
// src2 is always 128-bit
@@ -3849,89 +3445,103 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
- SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (DstVT (OpNode RC:$src1,
- (SrcVT (bitconvert (ld_frag addr:$src2))))))],
- SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
- SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
+ Sched<[schedImm]>;
}
multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, SDNode OpNode,
SDNode OpNode2, ValueType DstVT128,
ValueType DstVT256, ValueType SrcVT,
- Predicate prd> {
+ X86SchedWriteWidths sched,
+ X86SchedWriteWidths schedImm, Predicate prd> {
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
- OpNode, OpNode2, VR128, DstVT128, SrcVT,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
+ OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
+ DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
- OpNode, OpNode2, VR256, DstVT256, SrcVT,
- loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
+ DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
+ VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
- VR128, DstVT128, SrcVT, memopv2i64>;
+ VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
+ memopv2i64>;
}
multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
SDNode OpNode, RegisterClass RC, ValueType VT,
- bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
- IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ Sched<[sched]>;
}
multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR128, v16i8, 0>, VEX_4V, VEX_WIG;
+ VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR256, v32i8, 0>, VEX_4V, VEX_L, VEX_WIG;
+ VR256, v32i8, sched.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
- defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
+ defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
+ sched.XMM>;
}
let ExeDomain = SSEPackedInt in {
defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
- v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
- v4i32, v8i32, v4i32, NoVLX>;
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
- v2i64, v4i64, v2i64, NoVLX>;
+ v2i64, v4i64, v2i64, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
- v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
- v4i32, v8i32, v4i32, NoVLX>;
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
- v2i64, v4i64, v2i64, NoVLX>;
+ v2i64, v4i64, v2i64, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
- v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
- v4i32, v8i32, v4i32, NoVLX>;
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
- defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
- defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
- // PSRADQri doesn't exist in SSE[1-3].
+ defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
+ SchedWriteShuffle>;
+ defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
+ SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
@@ -3939,46 +3549,42 @@ let ExeDomain = SSEPackedInt in {
//===---------------------------------------------------------------------===//
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ SchedWriteVecALU, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ SchedWriteVecALU, 0, TruePredicate>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteShuffle in
-def SSE_PSHUF : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
- SDNode OpNode, OpndItins itins, Predicate prd> {
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
let Predicates = [HasAVX, prd] in {
def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
- (i8 imm:$src2))))], itins.rm>, VEX,
- Sched<[itins.Sched.Folded]>, VEX_WIG;
+ (i8 imm:$src2))))]>, VEX,
+ Sched<[sched.XMM.Folded]>, VEX_WIG;
}
let Predicates = [HasAVX2, prd] in {
@@ -3987,16 +3593,16 @@ let Predicates = [HasAVX2, prd] in {
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
+ (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
- (i8 imm:$src2))))], itins.rm>, VEX, VEX_L,
- Sched<[itins.Sched.Folded]>, VEX_WIG;
+ (i8 imm:$src2))))]>, VEX, VEX_L,
+ Sched<[sched.YMM.Folded]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
@@ -4004,27 +3610,27 @@ let Predicates = [UseSSE2] in {
(outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ Sched<[sched.XMM]>;
def mi : Ii8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
- (i8 imm:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))]>,
+ Sched<[sched.XMM.Folded]>;
}
}
} // ExeDomain = SSEPackedInt
-defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, SSE_PSHUF,
- NoVLX>, PD;
-defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, SSE_PSHUF,
- NoVLX_Or_NoBWI>, XS;
-defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, SSE_PSHUF,
- NoVLX_Or_NoBWI>, XD;
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
+ SchedWriteShuffle, NoVLX>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
@@ -4033,8 +3639,8 @@ defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, SSE_PSHUF,
let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
ValueType ArgVT, SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag,
- bit Is2Addr = 1> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -4042,8 +3648,8 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))],
- itins.rr>, Sched<[itins.Sched]>;
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -4052,14 +3658,14 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(OutVT (OpNode (ArgVT RC:$src1),
- (bitconvert (ld_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
ValueType ArgVT, SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag,
- bit Is2Addr = 1> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
def rr : SS48I<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -4067,8 +3673,8 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))],
- itins.rr>, Sched<[itins.Sched]>;
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS48I<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -4077,49 +3683,53 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(OutVT (OpNode (ArgVT RC:$src1),
- (bitconvert (ld_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
- i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
- VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
- VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
- VR256,i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
- VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
- i128mem, SSE_PACK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -4127,107 +3737,106 @@ let Constraints = "$src1 = $dst" in {
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteShuffle in
-def SSE_PUNPCK : OpndItins<
- IIC_SSE_UNPCK, IIC_SSE_UNPCK
->;
-
let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
- OpndItins itins, PatFrag ld_frag, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : PDI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1,
- (bitconvert (ld_frag addr:$src2)))))],
- itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
- i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
- i128mem, SSE_PUNPCK, memopv2i64>;
+ i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -4237,41 +3846,41 @@ let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
- def rri : Ii8<0xC4, MRMSrcReg,
+ def rr : Ii8<0xC4, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1,
GR32orGR64:$src2, u8imm:$src3),
!if(Is2Addr,
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
- IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
- def rmi : Ii8<0xC4, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1,
- i16mem:$src2, u8imm:$src3),
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+ Sched<[WriteVecInsert]>;
+ def rm : Ii8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, u8imm:$src3),
!if(Is2Addr,
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
- imm:$src3))], IIC_SSE_PINSRW>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
// Extract
let Predicates = [HasAVX, NoBWI] in
-def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>, PD, VEX,
- Sched<[WriteShuffle]>;
-def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ imm:$src2))]>,
+ PD, VEX, Sched<[WriteVecExtract]>;
+def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))], IIC_SSE_PEXTRW>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
// Insert
let Predicates = [HasAVX, NoBWI] in
@@ -4286,26 +3895,26 @@ defm PINSRW : sse2_pinsrw, PD;
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+let ExeDomain = SSEPackedInt in {
def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
- IIC_SSE_MOVMSK>, VEX, VEX_WIG;
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
+ Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
- VEX, VEX_L, VEX_WIG;
+ Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
- IIC_SSE_MOVMSK>;
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
+ Sched<[WriteVecMOVMSK]>;
} // ExeDomain = SSEPackedInt
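// The pmovmskb defs above move from one block-level SchedRW (WriteVecLogic)
// to per-instruction writes, so the 128-bit and 256-bit forms can be costed
// separately. Shape only:
//
//   let SchedRW = [WriteVecLogic] in { def ... }   // before: one shared class
//   def ... >, Sched<[WriteVecMOVMSK]>;            // after: XMM form
//   def ... >, Sched<[WriteVecMOVMSKY]>;           // after: YMM form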
@@ -4313,31 +3922,28 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
-
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
- IIC_SSE_MASKMOV>, VEX, VEX_WIG;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+ VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
- IIC_SSE_MASKMOV>, VEX, VEX_WIG;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
+ VEX, VEX_WIG;
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
- IIC_SSE_MASKMOV>;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
- IIC_SSE_MASKMOV>;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
} // ExeDomain = SSEPackedInt
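// The conditional stores above keep a block-level SchedRW, but the generic
// WriteStore is replaced by the store member of a vector move/load-store
// bundle. Assuming the usual X86 operand-order suffixes (MR = memory-from-
// register store, RM = register-from-memory load):
//
//   let SchedRW = [WriteStore] in { ... }                   // before
//   let SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { ... }   // after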
@@ -4350,55 +3956,54 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteMove]>;
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteLoad]>;
-def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
+ "movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ VEX, Sched<[WriteVecLoad]>;
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert GR64:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- Sched<[WriteMove]>;
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert GR64:$src))],
- IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt
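// For the GPR-to-XMM moves above, the generic WriteMove/WriteLoad classes are
// replaced by vector-specific ones. Mapping as used in the hunk (sketch):
//
//   Sched<[WriteMove]>  ->  Sched<[WriteVecMoveFromGpr]>   // reg-reg movd/movq
//   Sched<[WriteLoad]>  ->  Sched<[WriteVecLoad]>          // memory forms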
//===---------------------------------------------------------------------===//
@@ -4407,23 +4012,22 @@ def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteLoad]>;
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ VEX, Sched<[WriteVecLoad]>;
def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4431,55 +4035,54 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
- (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
- Sched<[WriteMove]>;
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))]>, VEX,
+ Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
- (ins i32mem:$dst, VR128:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (extractelt (v4i32 VR128:$src),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteStore]>;
+ (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128:$src),
- (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
- Sched<[WriteMove]>;
+ (iPTR 0)))]>,
+ Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (extractelt (v4i32 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ (iPTR 0))), addr:$dst)]>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
+
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
-let SchedRW = [WriteMove] in {
+let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128:$src),
- (iPTR 0)))],
- IIC_SSE_MOVD_ToGP>,
+ (iPTR 0)))]>,
VEX;
def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128:$src),
- (iPTR 0)))],
- IIC_SSE_MOVD_ToGP>;
+ (iPTR 0)))]>;
} //SchedRW
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
@@ -4490,28 +4093,28 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
- VEX, Sched<[WriteLoad]>;
+ VEX, Sched<[WriteVecLoad]>;
def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ [(set GR64:$dst, (bitconvert FR64:$src))]>,
+ VEX, Sched<[WriteVecMoveToGpr]>;
def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
+ VEX, Sched<[WriteVecStore]>;
def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ Sched<[WriteVecLoad]>;
def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ [(set GR64:$dst, (bitconvert FR64:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4520,79 +4123,67 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+ [(set GR32:$dst, (bitconvert FR32:$src))]>,
+ VEX, Sched<[WriteVecMoveToGpr]>;
def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
+ VEX, Sched<[WriteVecStore]>;
def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ [(set GR32:$dst, (bitconvert FR32:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
+ Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
let Predicates = [UseAVX] in {
- let AddedComplexity = 15 in {
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
- (VMOVDI2PDIrr GR32:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIrr GR32:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
- (VMOV64toPQIrr GR64:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
- }
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
- let AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
- }
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}
let Predicates = [UseSSE2] in {
- let AddedComplexity = 15 in {
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
- (MOVDI2PDIrr GR32:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (MOVDI2PDIrr GR32:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
- (MOV64toPQIrr GR64:$src)>;
- }
- let AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (MOVDI2PDIrm addr:$src)>;
- }
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (MOV64toPQIrr GR64:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (MOVDI2PDIrm addr:$src)>;
}
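// Two pattern cleanups run through the blocks above: the AddedComplexity
// wrappers (15/20) are dropped, and the instruction result fed into
// SUBREG_TO_REG gains an explicit value type. Illustrative shape:
//
//   (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)           // before
//   (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)   // after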
// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
@@ -4616,7 +4207,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
// Move Quadword Int to Packed Quadword Int
//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4625,34 +4216,32 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
- IIC_SSE_MOVDQ>, XS,
- Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW
//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, VEX_WIG;
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>;
+ (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW
// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteVecLogic] in {
+ SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX, VEX_WIG;
+ "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
+ "movq\t{$src, $dst|$dst, $src}", []>;
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -4660,29 +4249,26 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
-let Predicates = [UseAVX], AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VMOVQI2PQIrm addr:$src)>;
+def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
+ (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
+
+let Predicates = [UseAVX] in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
}
-let Predicates = [UseSSE2], AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (MOVQI2PQIrm addr:$src)>;
+let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (MOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
}
@@ -4690,62 +4276,61 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in {
// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
// IA32 document. movq xmm1, xmm2 does clear the high bits.
//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
-let AddedComplexity = 15 in
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
- IIC_SSE_MOVQ_RR>,
- XS, VEX, Requires<[UseAVX]>, VEX_WIG;
-let AddedComplexity = 15 in
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
- IIC_SSE_MOVQ_RR>,
- XS, Requires<[UseSSE2]>;
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
-let AddedComplexity = 20 in {
- let Predicates = [UseAVX] in {
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
- (VMOVZPQILo2PQIrr VR128:$src)>;
- }
- let Predicates = [UseSSE2] in {
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
- (MOVZPQILo2PQIrr VR128:$src)>;
- }
+let Predicates = [UseAVX] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (VMOVZPQILo2PQIrr VR128:$src)>;
+}
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
}
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
+
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
ValueType vt, RegisterClass RC, PatFrag mem_frag,
- X86MemOperand x86memop> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (vt (OpNode RC:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ [(set RC:$dst, (vt (OpNode RC:$src)))]>,
+ Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
+ v4f32, VR128, loadv4f32, f128mem,
+ SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
+ v4f32, VR128, loadv4f32, f128mem,
+ SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
+ v8f32, VR256, loadv8f32, f256mem,
+ SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
+ v8f32, VR256, loadv8f32, f256mem,
+ SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
- memopv4f32, f128mem>;
+ memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
- memopv4f32, f128mem>;
+ memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (X86Movshdup VR128:$src)),
@@ -4781,44 +4366,40 @@ let Predicates = [UseSSE3] in {
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//
-// FIXME: Improve MOVDDUP/BROADCAST reg/mem scheduling itineraries.
-let Sched = WriteFShuffle in
-def SSE_MOVDDUP : OpndItins<
- IIC_SSE_MOV_LH, IIC_SSE_MOV_LH
->;
-
-multiclass sse3_replicate_dfp<string OpcodeStr> {
+multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(v2f64 (X86Movddup
- (scalar_to_vector (loadf64 addr:$src)))))],
- IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+ (scalar_to_vector (loadf64 addr:$src)))))]>,
+ Sched<[sched.XMM.Folded]>;
}
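// sse3_replicate_dfp now takes an X86SchedWriteWidths rather than a single
// write, so one parameter covers both vector widths. Member usage as in the
// defs above (sketch):
//
//   Sched<[sched.XMM]>          // 128-bit register form
//   Sched<[sched.XMM.Folded]>   // 128-bit load form
//   Sched<[sched.YMM]>          // 256-bit forms in the _y multiclass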
-// FIXME: Merge with above classe when there're patterns for the ymm version
-multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+// FIXME: Merge with above classes when there are patterns for the ymm version
+multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
- Sched<[WriteFShuffle]>;
+ Sched<[sched.YMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
- Sched<[WriteLoad]>;
+ Sched<[sched.YMM.Folded]>;
}
let Predicates = [HasAVX, NoVLX] in {
- defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX, VEX_WIG;
- defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L, VEX_WIG;
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
+ VEX, VEX_WIG;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
+ VEX, VEX_L, VEX_WIG;
}
-defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
@@ -4836,152 +4417,149 @@ let Predicates = [UseSSE3] in {
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//
-let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX, VEX_WIG;
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
- VEX, VEX_L, VEX_WIG;
-}
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
+} // Predicates
+
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
- IIC_SSE_LDDQU>;
-}
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>;
//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, OpndItins itins,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
PatFrag ld_frag, bit Is2Addr = 1> {
def rr : I<0xD0, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : I<0xD0, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))],
- itins.rr>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
- SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V,
- VEX_WIG;
+ SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
+ XD, VEX_4V, VEX_WIG;
defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
- SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V,
- VEX_L, VEX_WIG;
+ SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
+ XD, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
- SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V,
- VEX_WIG;
+ SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
+ PD, VEX_4V, VEX_WIG;
defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
- SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V,
- VEX_L, VEX_WIG;
+ SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
+ PD, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
let ExeDomain = SSEPackedSingle in
- defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, SSE_ALU_F32P,
- memopv4f32>, XD;
+ defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
+ SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
let ExeDomain = SSEPackedDouble in
- defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, SSE_ALU_F64P,
- memopv2f64>, PD;
+ defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
+ SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}
//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteFHAdd in
-def SSE_HADDSUB : OpndItins<
- IIC_SSE_HADDSUB_RR, IIC_SSE_HADDSUB_RM
->;
-
// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, SDNode OpNode, OpndItins itins,
- PatFrag ld_frag, bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode,
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, SDNode OpNode, OpndItins itins,
- PatFrag ld_frag, bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode,
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- X86fhadd, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- X86fhsub, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
- defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
- X86fhadd, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG;
- defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
- X86fhsub, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG;
- defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
- defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
+ X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
+ X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
+ X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
+ X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in {
defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
- SSE_HADDSUB, memopv4f32>;
+ WriteFHAdd, memopv4f32>;
defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
- SSE_HADDSUB, memopv4f32>;
+ WriteFHAdd, memopv4f32>;
}
let ExeDomain = SSEPackedDouble in {
defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
- SSE_HADDSUB, memopv2f64>;
+ WriteFHAdd, memopv2f64>;
defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
- SSE_HADDSUB, memopv2f64>;
+ WriteFHAdd, memopv2f64>;
}
}
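// The horizontal-add conversions above pass individual writes rather than a
// widths bundle: WriteFHAdd for the 128-bit forms and WriteFHAddY for the
// 256-bit ones, replacing the removed SSE_HADDSUB itinerary. Sketch:
//
//   S3D_Int<..., X86fhadd, SSE_HADDSUB, loadv4f32, 0>   // before
//   S3D_Int<..., X86fhadd, WriteFHAdd,  loadv4f32, 0>   // after (XMM)
//   S3D_Int<..., X86fhadd, WriteFHAddY, loadv8f32, 0>   // after (YMM)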
@@ -4989,105 +4567,85 @@ let Constraints = "$src1 = $dst" in {
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteVecALU in
-def SSE_PABS : OpndItins<
- IIC_SSE_PABS_RR, IIC_SSE_PABS_RM
->;
-
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, OpndItins itins, PatFrag ld_frag> {
+ SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (vt (OpNode VR128:$src)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
+ Sched<[sched.XMM.Folded]>;
}
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, OpndItins itins> {
+ SDNode OpNode, X86SchedWriteWidths sched> {
def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
+ Sched<[sched.YMM]>;
def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+ Sched<[sched.YMM.Folded]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
- defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
+ loadv2i64>, VEX, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
+ loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
- defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
+ loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
- defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
- defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
}
-defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SSE_PABS, memopv2i64>;
-defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SSE_PABS, memopv2i64>;
-defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SSE_PABS, memopv2i64>;
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
+ memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
+ memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
+ memopv2i64>;
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//
-let Sched = WritePHAdd in {
-def SSE_PHADDSUBD : OpndItins<
- IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
->;
-def SSE_PHADDSUBSW : OpndItins<
- IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
->;
-def SSE_PHADDSUBW : OpndItins<
- IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
->;
-}
-let Sched = WriteShuffle in
-def SSE_PSHUFB : OpndItins<
- IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
->;
-let Sched = WriteVecALU in
-def SSE_PSIGN : OpndItins<
- IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
->;
-let Sched = WriteVecIMul in
-def SSE_PMULHRSW : OpndItins<
- IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
->;
-
/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType OpVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -5095,93 +4653,93 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(DstVT (OpNode (OpVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128, OpndItins itins,
+ Intrinsic IntId128, X86FoldableSchedWrite sched,
PatFrag ld_frag, bit Is2Addr = 1> {
let isCommutable = 1 in
- def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ Sched<[sched]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
(IntId128 VR128:$src1,
- (bitconvert (ld_frag addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
Intrinsic IntId256,
- X86FoldableSchedWrite Sched> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
- def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
- Sched<[Sched]>;
- def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ Sched<[sched]>;
+ def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
- Sched<[Sched.Folded, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
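// Alongside the scheduling change, the SSSE3 intrinsic multiclasses adopt the
// def-name shape used elsewhere in this diff (compare the pinsrw/pextrw
// renames earlier): the size suffix is dropped on 128-bit forms and a Y
// prefix marks the 256-bit forms. Shape only:
//
//   def rr128 : ...   ->   def rr  : ...
//   def rm128 : ...   ->   def rm  : ...
//   def rr256 : ...   ->   def Yrr : ...
//   def rm256 : ...   ->   def Yrm : ...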
let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
VR128, loadv2i64, i128mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, loadv2i64, i128mem,
- SSE_PMADD, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, loadv2i64, i128mem,
- SSE_PMULHRSW, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBD, 0>, VEX_4V;
+ SchedWritePHAdd.XMM, 0>, VEX_4V;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
int_x86_ssse3_psign_b_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
int_x86_ssse3_psign_w_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
}
@@ -5189,42 +4747,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
VR256, loadv4i64, i256mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
v32i8, VR256, loadv4i64, i256mem,
- SSE_PMADD, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PMULHRSW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
- int_x86_avx2_phadd_sw,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
- defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
- int_x86_avx2_phsub_sw,
- WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
+ defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+ int_x86_avx2_phadd_sw,
+ SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+ int_x86_avx2_phsub_sw,
+ SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}
@@ -5232,47 +4790,42 @@ let isCommutable = 0 in {
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBW>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBD>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBW>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
- memopv2i64, i128mem, SSE_PHADDSUBD>;
+ memopv2i64, i128mem, SchedWritePHAdd.XMM>;
defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
- SSE_PSIGN, memopv2i64>;
+ SchedWriteVecALU.XMM, memopv2i64>;
defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
- SSE_PSIGN, memopv2i64>;
+ SchedWriteVecALU.XMM, memopv2i64>;
defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
- SSE_PSIGN, memopv2i64>;
+ SchedWriteVecALU.XMM, memopv2i64>;
defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
- memopv2i64, i128mem, SSE_PSHUFB>;
+ memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
int_x86_ssse3_phadd_sw_128,
- SSE_PHADDSUBSW, memopv2i64>;
+ SchedWritePHAdd.XMM, memopv2i64>;
defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
int_x86_ssse3_phsub_sw_128,
- SSE_PHADDSUBSW, memopv2i64>;
+ SchedWritePHAdd.XMM, memopv2i64>;
defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, memopv2i64, i128mem,
- SSE_PMADD>;
+ SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
- VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
+ VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
}
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
-let Sched = WriteShuffle in
-def SSE_PALIGN : OpndItins<
- IIC_SSE_PALIGNRR, IIC_SSE_PALIGNRM
->;
-
multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins, bit Is2Addr = 1> {
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -5280,8 +4833,8 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
+ Sched<[sched]>;
let mayLoad = 1 in
def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5291,20 +4844,20 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst, (VT (X86PAlignr RC:$src1,
(bitconvert (memop_frag addr:$src2)),
- (i8 imm:$src3))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (i8 imm:$src3))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
- defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64,
- i128mem, SSE_PALIGN, 0>, VEX_4V, VEX_WIG;
+ defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
+ SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
- defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64,
- i256mem, SSE_PALIGN, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
+ SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
- defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64,
- i128mem, SSE_PALIGN>;
+ defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
+ SchedWriteShuffle.XMM>;
//===---------------------------------------------------------------------===//
// SSSE3 - Thread synchronization
@@ -5318,13 +4871,12 @@ def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
}
let Uses = [EAX, ECX, EDX] in
-def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
- TB, Requires<[HasSSE3]>;
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
- [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
- TB, Requires<[HasSSE3]>;
+ [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
@@ -5340,45 +4892,39 @@ def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
//===----------------------------------------------------------------------===//
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
- RegisterClass OutRC, RegisterClass InRC,
- OpndItins itins> {
+ RegisterClass OutRC, RegisterClass InRC,
+ X86FoldableSchedWrite sched> {
def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [], itins.rr>,
- Sched<[itins.Sched]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [],
- itins.rm>, Sched<[itins.Sched.Folded]>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[sched.Folded]>;
}
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
- X86MemOperand MemOp, X86MemOperand MemYOp,
- OpndItins SSEItins, OpndItins AVXItins,
- OpndItins AVX2Itins, Predicate prd> {
- defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
+ X86MemOperand MemOp, X86MemOperand MemYOp,
+ Predicate prd> {
+ defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
+ SchedWriteShuffle.XMM>;
let Predicates = [HasAVX, prd] in
defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
- VR128, VR128, AVXItins>, VEX, VEX_WIG;
+ VR128, VR128, SchedWriteShuffle.XMM>,
+ VEX, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
- VR256, VR128, AVX2Itins>, VEX, VEX_L, VEX_WIG;
+ VR256, VR128, WriteShuffle256>,
+ VEX, VEX_L, VEX_WIG;
}
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
X86MemOperand MemYOp, Predicate prd> {
defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
- MemOp, MemYOp,
- SSE_INTALU_ITINS_SHUFF_P,
- DEFAULT_ITINS_SHUFFLESCHED,
- DEFAULT_ITINS_SHUFFLESCHED, prd>;
+ MemOp, MemYOp, prd>;
defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
!strconcat("pmovzx", OpcodeStr),
- MemOp, MemYOp,
- SSE_INTALU_ITINS_SHUFF_P,
- DEFAULT_ITINS_SHUFFLESCHED,
- DEFAULT_ITINS_SHUFFLESCHED, prd>;
+ MemOp, MemYOp, prd>;
}
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
@@ -5490,7 +5036,7 @@ defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
- SDNode ExtOp, PatFrag ExtLoad16> {
+ SDNode ExtOp> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
@@ -5549,7 +5095,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
@@ -5591,12 +5137,12 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
}
}
-defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec, extloadi32i16>;
-defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>;
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
let Predicates = [UseSSE41] in {
- defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>;
- defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>;
+ defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
+ defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}
//===----------------------------------------------------------------------===//
@@ -5611,15 +5157,14 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
imm:$src2))]>,
- Sched<[WriteShuffle]>;
- let hasSideEffects = 0, mayStore = 1,
- SchedRW = [WriteShuffleLd, WriteRMW] in
+ Sched<[WriteVecExtract]>;
+ let hasSideEffects = 0, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i8mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -5634,17 +5179,16 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteShuffle]>, FoldGenData<NAME#ri>;
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
- let hasSideEffects = 0, mayStore = 1,
- SchedRW = [WriteShuffleLd, WriteRMW] in
+ let hasSideEffects = 0, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i16mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -5661,14 +5205,13 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32:$dst,
(extractelt (v4i32 VR128:$src1), imm:$src2))]>,
- Sched<[WriteShuffle]>;
- let SchedRW = [WriteShuffleLd, WriteRMW] in
+ Sched<[WriteVecExtract]>;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v4i32 VR128:$src1), imm:$src2),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5684,14 +5227,13 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR64:$dst,
(extractelt (v2i64 VR128:$src1), imm:$src2))]>,
- Sched<[WriteShuffle]>;
- let SchedRW = [WriteShuffleLd, WriteRMW] in
+ Sched<[WriteVecExtract]>;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v2i64 VR128:$src1), imm:$src2),
- addr:$dst)]>;
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5701,28 +5243,26 @@ defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
/// SS41I_extractf32 - SSE 4.1 extract of a 32-bit FP value to an int reg or
/// memory destination
-multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
- OpndItins itins = DEFAULT_ITINS> {
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
- (ins VR128:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set GR32orGR64:$dst,
- (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
- itins.rr>, Sched<[WriteFBlend]>;
- let SchedRW = [WriteFBlendLd, WriteRMW] in
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
- (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
- addr:$dst)], itins.rm>;
+ (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
- defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
+ defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}
// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
@@ -5750,7 +5290,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5759,7 +5299,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
- imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -5776,7 +5316,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5785,7 +5325,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
- imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5802,7 +5342,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteVecInsert]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5811,7 +5351,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
- imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoDQI] in
@@ -5823,8 +5363,7 @@ let Constraints = "$src1 = $dst" in
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
-multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
- OpndItins itins = DEFAULT_ITINS> {
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5832,8 +5371,8 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
- Sched<[WriteFShuffle]>;
+ (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[SchedWriteFShuffle.XMM]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5843,15 +5382,16 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
[(set VR128:$dst,
(X86insertps VR128:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))], itins.rm>,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
- defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG;
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
+ VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
- defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
let Predicates = [UseAVX] in {
@@ -5869,66 +5409,44 @@ let Predicates = [UseAVX] in {
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
-multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
- ValueType VT32, ValueType VT64,
- PatFrag mem_frag32, PatFrag mem_frag64,
- SDNode OpNode> {
-let ExeDomain = SSEPackedSingle in {
+ ValueType VT, PatFrag mem_frag, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
- def PSr : SS4AIi8<opcps, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT32 (OpNode RC:$src1, imm:$src2)))],
- IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
-
- // Vector intrinsic operation, mem
- def PSm : SS4AIi8<opcps, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst,
- (VT32 (OpNode (mem_frag32 addr:$src1),imm:$src2)))],
- IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
-} // ExeDomain = SSEPackedSingle
-
-let ExeDomain = SSEPackedDouble in {
- // Vector intrinsic operation, reg
- def PDr : SS4AIi8<opcpd, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT64 (OpNode RC:$src1, imm:$src2)))],
- IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAdd]>;
+ def r : SS4AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
+ Sched<[sched]>;
// Vector intrinsic operation, mem
- def PDm : SS4AIi8<opcpd, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst,
- (VT64 (OpNode (mem_frag64 addr:$src1),imm:$src2)))],
- IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAddLd]>;
-} // ExeDomain = SSEPackedDouble
+ def m : SS4AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
+ Sched<[sched.Folded]>;
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SSm : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
@@ -5936,32 +5454,32 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SDm : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr> {
+ string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SSm : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
@@ -5969,19 +5487,20 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
(outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAdd]>;
+ []>, Sched<[sched]>;
let mayLoad = 1 in
def SDm : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr, ValueType VT32, ValueType VT64,
+ string OpcodeStr, X86FoldableSchedWrite sched,
+ ValueType VT32, ValueType VT64,
SDNode OpNode, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
@@ -5992,7 +5511,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
- Sched<[WriteFAdd]>;
+ Sched<[sched]>;
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
@@ -6003,7 +5522,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
@@ -6015,7 +5534,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
- Sched<[WriteFAdd]>;
+ Sched<[sched]>;
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
@@ -6026,49 +5545,87 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
- // Intrinsic form
- defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, v4f32,
- v2f64, loadv4f32, loadv2f64, X86VRndScale>,
- VEX, VEX_WIG;
- defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, v8f32,
- v4f64, loadv8f32, loadv4f64, X86VRndScale>,
- VEX, VEX_L, VEX_WIG;
+ let ExeDomain = SSEPackedSingle in {
+ // Intrinsic form
+ defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
+ loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
+ VEX, VEX_WIG;
+ defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
+ loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
+ VEX, VEX_L, VEX_WIG;
+ }
+
+ let ExeDomain = SSEPackedDouble in {
+ defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
+ loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
+ VEX, VEX_WIG;
+ defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
+ loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
+ VEX, VEX_L, VEX_WIG;
+ }
}
let Predicates = [HasAVX, NoAVX512] in {
- defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", v4f32, v2f64,
- X86RndScales, 0>, VEX_4V, VEX_LIG, VEX_WIG;
- defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG, VEX_WIG;
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales, 0>,
+ VEX_4V, VEX_LIG, VEX_WIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
+ VEX_4V, VEX_LIG, VEX_WIG;
}
let Predicates = [UseAVX] in {
def : Pat<(ffloor FR32:$src),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
- def : Pat<(f64 (ffloor FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
- def : Pat<(f64 (frint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
def : Pat<(f64 (ftrunc FR64:$src)),
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
+let Predicates = [UseAVX, OptForSize] in {
+ def : Pat<(ffloor (loadf32 addr:$src)),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
+ def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
+}
+
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f32 (ffloor VR128:$src)),
(VROUNDPSr VR128:$src, (i32 0x9))>;
@@ -6081,6 +5638,17 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f32 (ftrunc VR128:$src)),
(VROUNDPSr VR128:$src, (i32 0xB))>;
+ def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
+ (VROUNDPSm addr:$src, (i32 0xB))>;
+
def : Pat<(v2f64 (ffloor VR128:$src)),
(VROUNDPDr VR128:$src, (i32 0x9))>;
def : Pat<(v2f64 (fnearbyint VR128:$src)),
@@ -6092,59 +5660,124 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v2f64 (ftrunc VR128:$src)),
(VROUNDPDr VR128:$src, (i32 0xB))>;
+ def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
+ (VROUNDPDm addr:$src, (i32 0xB))>;
+
def : Pat<(v8f32 (ffloor VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0x9))>;
+ (VROUNDPSYr VR256:$src, (i32 0x9))>;
def : Pat<(v8f32 (fnearbyint VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0xC))>;
+ (VROUNDPSYr VR256:$src, (i32 0xC))>;
def : Pat<(v8f32 (fceil VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0xA))>;
+ (VROUNDPSYr VR256:$src, (i32 0xA))>;
def : Pat<(v8f32 (frint VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0x4))>;
+ (VROUNDPSYr VR256:$src, (i32 0x4))>;
def : Pat<(v8f32 (ftrunc VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0xB))>;
+ (VROUNDPSYr VR256:$src, (i32 0xB))>;
+
+ def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0x9))>;
+ def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0xC))>;
+ def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0xA))>;
+ def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0x4))>;
+ def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
+ (VROUNDPSYm addr:$src, (i32 0xB))>;
def : Pat<(v4f64 (ffloor VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0x9))>;
+ (VROUNDPDYr VR256:$src, (i32 0x9))>;
def : Pat<(v4f64 (fnearbyint VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0xC))>;
+ (VROUNDPDYr VR256:$src, (i32 0xC))>;
def : Pat<(v4f64 (fceil VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0xA))>;
+ (VROUNDPDYr VR256:$src, (i32 0xA))>;
def : Pat<(v4f64 (frint VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0x4))>;
+ (VROUNDPDYr VR256:$src, (i32 0x4))>;
def : Pat<(v4f64 (ftrunc VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0xB))>;
+ (VROUNDPDYr VR256:$src, (i32 0xB))>;
+
+ def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0x9))>;
+ def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0xC))>;
+ def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0xA))>;
+ def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0x4))>;
+ def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
+ (VROUNDPDYm addr:$src, (i32 0xB))>;
}
-defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, v4f32, v2f64,
- memopv4f32, memopv2f64, X86VRndScale>;
+let ExeDomain = SSEPackedSingle in
+defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
+ memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
+let ExeDomain = SSEPackedDouble in
+defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
+ memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
-defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", v4f32, v2f64, X86RndScales>;
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
(ROUNDSSr FR32:$src, (i32 0x9))>;
- def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0xC))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0xA))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0x4))>;
- def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
(ROUNDSSr FR32:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f64 (ftrunc FR64:$src)),
(ROUNDSDr FR64:$src, (i32 0xB))>;
+}
+
+let Predicates = [UseSSE41, OptForSize] in {
+ def : Pat<(ffloor (loadf32 addr:$src)),
+ (ROUNDSSm addr:$src, (i32 0x9))>;
+ def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0xA))>;
+ def : Pat<(f32 (frint (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
+ (ROUNDSSm addr:$src, (i32 0xB))>;
+
+ def : Pat<(f64 (ffloor (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0x9))>;
+ def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0xC))>;
+ def : Pat<(f64 (fceil (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0xA))>;
+ def : Pat<(f64 (frint (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0x4))>;
+ def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
+ (ROUNDSDm addr:$src, (i32 0xB))>;
+}
+let Predicates = [UseSSE41] in {
def : Pat<(v4f32 (ffloor VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0x9))>;
def : Pat<(v4f32 (fnearbyint VR128:$src)),
@@ -6156,6 +5789,17 @@ let Predicates = [UseSSE41] in {
def : Pat<(v4f32 (ftrunc VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0xB))>;
+ def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0x9))>;
+ def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0xC))>;
+ def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0xA))>;
+ def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0x4))>;
+ def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
+ (ROUNDPSm addr:$src, (i32 0xB))>;
+
def : Pat<(v2f64 (ffloor VR128:$src)),
(ROUNDPDr VR128:$src, (i32 0x9))>;
def : Pat<(v2f64 (fnearbyint VR128:$src)),
@@ -6166,73 +5810,93 @@ let Predicates = [UseSSE41] in {
(ROUNDPDr VR128:$src, (i32 0x4))>;
def : Pat<(v2f64 (ftrunc VR128:$src)),
(ROUNDPDr VR128:$src, (i32 0xB))>;
-}
+
+ def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0x9))>;
+ def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0xC))>;
+ def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0xA))>;
+ def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0x4))>;
+ def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
+ (ROUNDPDm addr:$src, (i32 0xB))>;
+}
+
+defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
+ v4f32, 0x01, UseSSE41>;
+defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
+ v4f32, 0x02, UseSSE41>;
+defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
+ v2f64, 0x01, UseSSE41>;
+defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
+ v2f64, 0x02, UseSSE41>;
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
-let Sched = WriteVecLogic in
-def SSE_PTEST : OpndItins<
- IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
->;
-
// The ptest instruction is lowered to this in X86ISelLowering, primarily from
// the Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX, VEX_WIG;
+ Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_WIG;
+ Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>,
+ VEX, VEX_WIG;
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX, VEX_L, VEX_WIG;
+ Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L, VEX_WIG;
+ Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>,
+ VEX, VEX_L, VEX_WIG;
}
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"ptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- Sched<[WriteVecLogic]>;
+ Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"ptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>;
+ Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>;
}
// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+ X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
+ X86FoldableSchedWrite sched> {
def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX;
+ Sched<[sched]>, VEX;
def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+ Sched<[sched.Folded, ReadAfterLd]>, VEX;
}
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
-defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
- VEX_L;
+defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
+ SchedWriteFTest.XMM>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
+ SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
-defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
- VEX_L;
+defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
+ SchedWriteFTest.XMM>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
+ SchedWriteFTest.YMM>, VEX_L;
}
}
@@ -6243,229 +5907,201 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
- OpSize16, XS;
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, OpSize16, XS;
def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctpop (loadi16 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
- Sched<[WriteFAddLd]>, OpSize16, XS;
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
- OpSize32, XS;
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, OpSize32, XS;
def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctpop (loadi32 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
- Sched<[WriteFAddLd]>, OpSize32, XS;
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, XS;
def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctpop (loadi64 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
- Sched<[WriteFAddLd]>, XS;
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, XS;
}
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
SDNode OpNode, PatFrag ld_frag,
X86FoldableSchedWrite Sched> {
- def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
- Sched<[Sched]>;
- def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins i128mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst,
- (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
- Sched<[Sched.Folded]>;
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
+ Sched<[Sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
+ Sched<[Sched.Folded]>;
}
// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
-defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
X86phminpos, loadv2i64,
- WriteVecIMul>, VEX, VEX_WIG;
-defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ WritePHMINPOS>, VEX, VEX_WIG;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
X86phminpos, memopv2i64,
- WriteVecIMul>;
+ WritePHMINPOS>;
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit Is2Addr = 1,
- OpndItins itins = SSE_INTALU_ITINS_P> {
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-}
-
-/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
-/// types.
-multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType DstVT, ValueType SrcVT, RegisterClass RC,
- PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0, bit Is2Addr = 1> {
- let isCommutable = IsCommutable in
- def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
+ loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
VEX_4V, VEX_WIG;
- defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
- VR128, loadv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
+ loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
- VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
- memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
- VR128, memopv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
+ memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
}
let Predicates = [HasAVX, NoVLX] in
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
- loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
+ loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
- loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
+ loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
- memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
+ memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
- memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
}
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -6474,8 +6110,8 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -6485,15 +6121,15 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(IntId RC:$src1,
- (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -6502,8 +6138,8 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -6513,8 +6149,8 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
def BlendCommuteImm2 : SDNodeXForm<imm, [{
@@ -6536,53 +6172,53 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
+ SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
}
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
+ SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
VR128, loadv2f64, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
+ SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, loadv8f32, i256mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
- VR128, memopv2i64, i128mem,
- 1, SSE_MPSADBW_ITINS>;
+ VR128, memopv2i64, i128mem, 1,
+ SchedWriteMPSAD.XMM>;
}
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
VR128, memopv4f32, f128mem, 1,
- SSE_DPPS_ITINS>;
+ SchedWriteDPPS.XMM>;
let ExeDomain = SSEPackedDouble in
defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
VR128, memopv2f64, f128mem, 1,
- SSE_DPPD_ITINS>;
+ SchedWriteDPPD.XMM>;
}
/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr, Domain d,
- OpndItins itins, SDNodeXForm commuteXForm> {
+ X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -6592,8 +6228,8 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -6603,8 +6239,8 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
// Pattern to commute if load is in first source.
@@ -6617,42 +6253,42 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
let Predicates = [HasAVX] in {
defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ SchedWriteFBlend.XMM, BlendCommuteImm4>,
VEX_4V, VEX_WIG;
defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm8>,
+ SchedWriteFBlend.YMM, BlendCommuteImm8>,
VEX_4V, VEX_L, VEX_WIG;
defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm2>,
+ SchedWriteFBlend.XMM, BlendCommuteImm2>,
VEX_4V, VEX_WIG;
defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
- DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ SchedWriteFBlend.YMM, BlendCommuteImm4>,
VEX_4V, VEX_L, VEX_WIG;
defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, loadv2i64, i128mem, 0, SSEPackedInt,
- DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ SchedWriteBlend.XMM, BlendCommuteImm8>,
VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2] in {
defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, loadv4i64, i256mem, 0, SSEPackedInt,
- DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ SchedWriteBlend.YMM, BlendCommuteImm8>,
VEX_4V, VEX_L, VEX_WIG;
}
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
- SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm4>;
+ SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
- SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm2>;
+ SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
VR128, memopv2i64, i128mem, 1, SSEPackedInt,
- SSE_INTALU_ITINS_BLEND_P, BlendCommuteImm8>;
+ SchedWriteBlend.XMM, BlendCommuteImm8>;
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
@@ -6671,14 +6307,14 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_frag, Intrinsic IntId,
- X86FoldableSchedWrite Sched> {
+ X86FoldableSchedWrite sched> {
def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
- Sched<[Sched]>;
+ SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[sched]>;
def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
@@ -6686,37 +6322,41 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
- RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
- Sched<[Sched.Folded, ReadAfterLd]>;
+ RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[sched.Folded, ReadAfterLd,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC::$src3
+ ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
loadv2f64, int_x86_sse41_blendvpd,
- WriteFVarBlend>;
+ SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
loadv4f64, int_x86_avx_blendv_pd_256,
- WriteFVarBlend>, VEX_L;
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
loadv4f32, int_x86_sse41_blendvps,
- WriteFVarBlend>;
+ SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
loadv8f32, int_x86_avx_blendv_ps_256,
- WriteFVarBlend>, VEX_L;
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
loadv2i64, int_x86_sse41_pblendvb,
- WriteVarBlend>;
+ SchedWriteVarBlend.XMM>;
}
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
loadv4i64, int_x86_avx2_pblendvb,
- WriteVarBlend>, VEX_L;
+ SchedWriteVarBlend.YMM>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -6755,48 +6395,76 @@ let Predicates = [HasAVX2] in {
(VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
-// Patterns
-// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
-// on targets where they have equal performance. These were changed to use
-// blends because blends have better throughput on SandyBridge and Haswell, but
-// movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseAVX] in {
- let AddedComplexity = 15 in {
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on SandyBridge
+// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [HasAVX, OptForSpeed] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (VMOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
+ (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
+ (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
+ (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
+ (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
- (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
+ (i8 3))), sub_xmm)>;
- // Move low f64 and clear high bits.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
- }
-
- // These will incur an FP/int domain crossing penalty, but it may be the only
- // way without AVX2. Do not add any complexity because we may be able to match
- // more optimal patterns defined earlier in this file.
- def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
- (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
+ (i8 0xf))), sub_xmm)>;
}
-// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
-// on targets where they have equal performance. These were changed to use
-// blends because blends have better throughput on SandyBridge and Haswell, but
-// movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41], AddedComplexity = 15 in {
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on SandyBridge
+// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseSSE41, OptForSpeed] in {
// With SSE41 we can use blends for these patterns.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
+ (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
+ (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
+ (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
+ (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}
@@ -6804,13 +6472,13 @@ let Predicates = [UseSSE41], AddedComplexity = 15 in {
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
X86MemOperand x86memop, Intrinsic IntId,
- OpndItins itins> {
+ X86FoldableSchedWrite sched> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
- itins.rr>, Sched<[itins.Sched]>;
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ Sched<[sched]>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, x86memop:$src2),
@@ -6818,22 +6486,19 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst,
(IntId VR128:$src1,
- (bitconvert (mem_frag addr:$src2)), XMM0))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bitconvert (mem_frag addr:$src2)), XMM0))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
}
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
- int_x86_sse41_blendvpd,
- DEFAULT_ITINS_FBLENDSCHED>;
+ int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
- int_x86_sse41_blendvps,
- DEFAULT_ITINS_FBLENDSCHED>;
+ int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
- int_x86_sse41_pblendvb,
- DEFAULT_ITINS_VARBLENDSCHED>;
+ int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
@@ -6868,18 +6533,18 @@ let Predicates = [UseSSE41] in {
}
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteLoad] in {
+
let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
- VEX, VEX_WIG;
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}", []>,
- VEX, VEX_L, VEX_WIG;
+ Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movntdqa\t{$src, $dst|$dst, $src}", []>;
-} // SchedRW
+ "movntdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v8f32 (alignednontemporalload addr:$src)),
@@ -6917,62 +6582,43 @@ let Predicates = [UseSSE41] in {
/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, OpndItins itins,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))], itins.rm>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in
defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
- loadv2i64, i128mem, SSE_INTALU_ITINS_P, 0>,
+ loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
VEX_4V, VEX_WIG;
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
- loadv4i64, i256mem, SSE_INTALU_ITINS_P, 0>,
+ loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
- memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
+ memopv2i64, i128mem, SchedWriteVecALU.XMM>;
//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//
-// Packed Compare Implicit Length Strings, Return Mask
-multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
- imm:$src3))]>;
- def MEM : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
- (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
- Requires<[HasAVX]>, VEX_WIG;
- defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
- Requires<[UseSSE42]>;
-}
-
multiclass pcmpistrm_SS42AI<string asm> {
def rr : SS42AI<0x62, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
@@ -6982,32 +6628,13 @@ multiclass pcmpistrm_SS42AI<string asm> {
def rm :SS42AI<0x62, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>;
}
let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
- defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
-}
-
-// Packed Compare Explicit Length Strings, Return Mask
-multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, VR128:$src3, u8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
- VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
- def MEM : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
- (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
- Requires<[HasAVX]>;
- defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
- Requires<[UseSSE42]>;
+ defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
multiclass SS42AI_pcmpestrm<string asm> {
@@ -7019,32 +6646,13 @@ multiclass SS42AI_pcmpestrm<string asm> {
def rm : SS42AI<0x60, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>;
}
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
- defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
-}
-
-// Packed Compare Implicit Length Strings, Return Index
-multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
- def MEM : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
- (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
- Requires<[HasAVX]>, VEX_WIG;
- defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
- Requires<[UseSSE42]>;
+ defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}
multiclass SS42AI_pcmpistri<string asm> {
@@ -7056,7 +6664,7 @@ multiclass SS42AI_pcmpistri<string asm> {
def rm : SS42AI<0x63, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>;
}
let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
@@ -7065,26 +6673,6 @@ let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
-// Packed Compare Explicit Length Strings, Return Index
-multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, VR128:$src3, u8imm:$src5),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
- def MEM : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
- imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in {
- defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
- Requires<[HasAVX]>;
- defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
- Requires<[UseSSE42]>;
-}
-
multiclass SS42AI_pcmpestri<string asm> {
def rr : SS42AI<0x61, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src3, u8imm:$src5),
@@ -7094,7 +6682,7 @@ multiclass SS42AI_pcmpestri<string asm> {
def rm : SS42AI<0x61, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
+ []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>;
}
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
@@ -7116,15 +6704,15 @@ class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
RegisterClass RCIn, SDPatternOperator Int> :
SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
- [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
- Sched<[WriteFAdd]>;
+ [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
+ Sched<[WriteCRC32]>;
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
X86MemOperand x86memop, SDPatternOperator Int> :
SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
- [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
- IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
+ Sched<[WriteCRC32.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
@@ -7156,9 +6744,9 @@ let Constraints = "$src1 = $dst" in {
// SHA-NI Instructions
//===----------------------------------------------------------------------===//
-// FIXME: Is there a better scheduler itinerary for SHA than WriteVecIMul?
+// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
- OpndItins itins, bit UsesXMM0 = 0> {
+ X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!if(UsesXMM0,
@@ -7166,8 +6754,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
- (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))], itins.rr>,
- T8, Sched<[itins.Sched]>;
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
+ T8, Sched<[sched]>;
def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
@@ -7178,8 +6766,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
(set VR128:$dst, (IntId VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
(set VR128:$dst, (IntId VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))))], itins.rm>, T8,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
@@ -7188,32 +6776,32 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RR>, TA,
- Sched<[WriteVecIMul]>;
+ (i8 imm:$src3)))]>, TA,
+ Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)),
- (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RM>, TA,
- Sched<[WriteVecIMulLd, ReadAfterLd]>;
+ (i8 imm:$src3)))]>, TA,
+ Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>;
defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
let Uses=[XMM0] in
defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
- SSE_INTMUL_ITINS_P, 1>;
+ SchedWriteVecIMul.XMM, 1>;
defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
- SSE_INTMUL_ITINS_P>;
+ SchedWriteVecIMul.XMM>;
}
// Aliases with explicit %xmm0
@@ -7240,7 +6828,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, MemOp:$src2), "",
[(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
- Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+ Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>;
}
}
@@ -7294,7 +6882,7 @@ let Predicates = [HasAVX, HasAES] in {
(ins i128mem:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
- Sched<[WriteAESIMCLd]>, VEX, VEX_WIG;
+ Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1),
@@ -7305,7 +6893,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"aesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
- Sched<[WriteAESIMCLd]>;
+ Sched<[WriteAESIMC.Folded]>;
// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
@@ -7320,7 +6908,7 @@ let Predicates = [HasAVX, HasAES] in {
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
- Sched<[WriteAESKeyGenLd]>, VEX, VEX_WIG;
+ Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
@@ -7333,7 +6921,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
- Sched<[WriteAESKeyGenLd]>;
+ Sched<[WriteAESKeyGen.Folded]>;
//===----------------------------------------------------------------------===//
// PCLMUL Instructions
@@ -7353,16 +6941,16 @@ let Predicates = [NoAVX, HasPCLMUL] in {
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
- IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteCLMul]>;
def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
- imm:$src3))],
- IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMulLd, ReadAfterLd]>;
+ imm:$src3))]>,
+ Sched<[WriteCLMul.Folded, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
@@ -7398,7 +6986,7 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set RC:$dst,
(IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
- Sched<[WriteCLMulLd, ReadAfterLd]>;
+ Sched<[WriteCLMul.Folded, ReadAfterLd]>;
// We can commute a load in the first operand by swapping the sources and
// rotating the immediate.
@@ -7449,45 +7037,45 @@ def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
[(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
- imm:$idx))], IIC_SSE_INTALU_P_RR>,
- PD, Sched<[WriteVecALU]>;
+ imm:$idx))]>,
+ PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"extrq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
- VR128:$mask))], IIC_SSE_INTALU_P_RR>,
- PD, Sched<[WriteVecALU]>;
+ VR128:$mask))]>,
+ PD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
"insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
- imm:$len, imm:$idx))], IIC_SSE_INTALU_P_RR>,
- XD, Sched<[WriteVecALU]>;
+ imm:$len, imm:$idx))]>,
+ XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"insertq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
- VR128:$mask))], IIC_SSE_INTALU_P_RR>,
- XD, Sched<[WriteVecALU]>;
+ VR128:$mask))]>,
+ XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt
// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in {
+let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
- "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
+ "movntss\t{$src, $dst|$dst, $src}", []>, XS;
def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
+ "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
} // SchedRW
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
- (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
def : Pat<(nontemporalstore FR64:$src, addr:$dst),
- (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
} // AddedComplexity
} // HasSSE4A
@@ -7518,18 +7106,20 @@ class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
- f32mem, v4f32, loadf32, WriteLoad>;
+ f32mem, v4f32, loadf32,
+ SchedWriteFShuffle.XMM.Folded>;
def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
- f32mem, v8f32, loadf32,
- WriteFShuffleLd>, VEX_L;
+ f32mem, v8f32, loadf32,
+ SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
- v4f64, loadf64, WriteFShuffleLd>, VEX_L;
+ v4f64, loadf64,
+ SchedWriteFShuffle.XMM.Folded>, VEX_L;
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
- v4f32, v4f32, WriteFShuffle>;
+ v4f32, v4f32, SchedWriteFShuffle.XMM>;
def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
@@ -7554,13 +7144,14 @@ let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
(ins i128mem:$src),
"vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
- Sched<[WriteLoad]>, VEX, VEX_L;
+ Sched<[WriteShuffleLd]>, VEX, VEX_L;
-let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
+ ExeDomain = SSEPackedSingle in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
(ins f128mem:$src),
"vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
- Sched<[WriteFShuffleLd]>, VEX, VEX_L;
+ Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
@@ -7598,12 +7189,12 @@ let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f128mem:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
// To create a 256-bit all ones value, we should produce VCMPTRUEPS
@@ -7645,12 +7236,12 @@ let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
(ins VR256:$src1, u8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
+ []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
(ins f128mem:$dst, VR256:$src1, u8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, Sched<[WriteStore]>, VEX, VEX_L;
+ []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}
multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
@@ -7686,23 +7277,23 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+ [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+ VEX_4V, Sched<[WriteFMaskedLoad]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, Sched<[WriteStore]>;
+ [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
+ VEX_4V, Sched<[WriteFMaskedStore]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, VEX_L, Sched<[WriteStore]>;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
+ VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
}
let ExeDomain = SSEPackedSingle in
@@ -7722,63 +7313,60 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
// VPERMIL - Permute Single and Double Floating-Point Values
//
-let Sched = WriteFShuffle in
-def AVX_VPERMILV : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
-let Sched = WriteFShuffle in
-def AVX_VPERMIL : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop_f,
X86MemOperand x86memop_i, PatFrag i_frag,
- ValueType f_vt, ValueType i_vt> {
+ ValueType f_vt, ValueType i_vt,
+ X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite varsched> {
let Predicates = [HasAVX, NoVLX] in {
def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
- Sched<[WriteFShuffle]>;
+ Sched<[varsched]>;
def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop_i:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
(i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[varsched.Folded, ReadAfterLd]>;
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
- Sched<[WriteFShuffle]>;
+ Sched<[sched]>;
def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
- Sched<[WriteFShuffleLd]>;
+ Sched<[sched.Folded]>;
}// Predicates = [HasAVX, NoVLX]
}
let ExeDomain = SSEPackedSingle in {
defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
- loadv2i64, v4f32, v4i32>;
+ loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
- loadv4i64, v8f32, v8i32>, VEX_L;
+ loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
- loadv2i64, v2f64, v2i64>;
+ loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
- loadv4i64, v4f64, v4i64>, VEX_L;
+ loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
+
let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
@@ -7786,13 +7374,13 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
(i8 imm:$src3))))]>, VEX_4V, VEX_L,
- Sched<[WriteFShuffle]>;
+ Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
(i8 imm:$src3)))]>, VEX_4V, VEX_L,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[WriteFShuffle256Ld, ReadAfterLd]>;
}
// Immediate transform to help with commuting.
@@ -7821,58 +7409,63 @@ def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
+// Note: These instructions do not affect YMM16-YMM31.
//
-// Note, these instruction do not affect the YMM16-YMM31.
+
let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
// Zero All YMM registers
def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)], IIC_AVX_ZERO>, PS, VEX, VEX_L,
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
Requires<[HasAVX]>, VEX_WIG;
// Zero Upper bits of YMM registers
def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)], IIC_AVX_ZERO>, PS, VEX,
+ [(int_x86_avx_vzeroupper)]>, PS, VEX,
Requires<[HasAVX]>, VEX_WIG;
} // Defs
} // SchedRW
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
-//===----------------------------------------------------------------------===//
-multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> {
+//
+
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
- T8PD, VEX, Sched<[WriteCvtF2F]>;
+ T8PD, VEX, Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (X86cvtph2ps (bc_v8i16
(loadv2i64 addr:$src))))]>,
- T8PD, VEX, Sched<[WriteCvtF2FLd]>;
+ T8PD, VEX, Sched<[sched.Folded]>;
}
-multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> {
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
+ SchedWrite RR, SchedWrite MR> {
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
- TAPD, VEX, Sched<[WriteCvtF2F]>;
- let hasSideEffects = 0, mayStore = 1,
- SchedRW = [WriteCvtF2FLd, WriteRMW] in
+ TAPD, VEX, Sched<[RR]>;
+ let hasSideEffects = 0, mayStore = 1 in
def mr : Ii8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- TAPD, VEX;
+ TAPD, VEX, Sched<[MR]>;
}
let Predicates = [HasF16C, NoVLX] in {
- defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L;
- defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>;
- defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L;
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
+ WriteCvtPS2PHSt>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
+ WriteCvtPS2PHYSt>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
@@ -7903,16 +7496,16 @@ let Predicates = [HasF16C, NoVLX] in {
// more consistent with other instructions, which are always controlled by it.
// It's encoded as 0b100.
def : Pat<(fp_to_f16 FR32:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
- (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
+ (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
- (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
+ (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
- (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
- (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >;
+ (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
+ (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
}
//===----------------------------------------------------------------------===//
@@ -7921,7 +7514,8 @@ let Predicates = [HasF16C, NoVLX] in {
/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ ValueType OpVT, X86FoldableSchedWrite sched,
+ RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, SDNodeXForm commuteXForm> {
let isCommutable = 1 in
def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -7929,7 +7523,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
- Sched<[WriteBlend]>, VEX_4V;
+ Sched<[sched]>, VEX_4V;
def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
@@ -7937,7 +7531,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
- Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+ Sched<[sched.Folded, ReadAfterLd]>, VEX_4V;
// Pattern to commute if load is in first source.
def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
@@ -7947,10 +7541,11 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
- VR128, loadv2i64, i128mem, BlendCommuteImm4>;
+ SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
+ BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
- VR256, loadv4i64, i256mem, BlendCommuteImm8>,
- VEX_L;
+ SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
+ BlendCommuteImm8>, VEX_L;
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
@@ -8004,12 +7599,12 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
- Sched<[WriteShuffle]>, VEX;
+ Sched<[SchedWriteShuffle.XMM]>, VEX;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
- Sched<[WriteLoad]>, VEX;
+ Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
@@ -8019,7 +7614,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
- Sched<[WriteLoad]>, VEX, VEX_L;
+ Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
@@ -8084,45 +7679,45 @@ let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
- (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
- (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
- (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBrr (COPY_TO_REGCLASS
+ (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src, sub_8bit)),
- VR128))>;
+ VR128)))>;
def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBYrr (COPY_TO_REGCLASS
+ (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src, sub_8bit)),
- VR128))>;
+ VR128)))>;
def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWrr (COPY_TO_REGCLASS
+ (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR16:$src, sub_16bit)),
- VR128))>;
+ VR128)))>;
def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWYrr (COPY_TO_REGCLASS
+ (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR16:$src, sub_16bit)),
- VR128))>;
+ VR128)))>;
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
}
// AVX1 broadcast patterns
@@ -8140,7 +7735,7 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
let Predicates = [HasAVX, NoVLX] in {
// 128bit broadcasts:
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
- (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
@@ -8152,29 +7747,29 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [HasAVX1Only] in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
- (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
- (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
- (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
+ (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
- (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm),
- (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>;
+ (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
+ (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+ (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
+ (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
- (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>;
+ (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
}
@@ -8183,16 +7778,6 @@ let Predicates = [HasAVX1Only] in {
// VPERM - Permute instructions
//
-let Sched = WriteFShuffle256 in
-def AVX2_PERMV_F : OpndItins<
- IIC_SSE_SHUFP, IIC_SSE_SHUFP
->;
-
-let Sched = WriteShuffle256 in
-def AVX2_PERMV_I : OpndItins<
- IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
->;
-
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
ValueType OpVT, X86FoldableSchedWrite Sched,
X86MemOperand memOp> {
@@ -8215,10 +7800,10 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
}
}
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256,
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
i256mem>;
let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256,
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
f256mem>;
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
@@ -8305,7 +7890,7 @@ let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
(ins i128mem:$dst, VR256:$src1, u8imm:$src2),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- Sched<[WriteStore]>, VEX, VEX_L;
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
@@ -8323,23 +7908,23 @@ multiclass avx2_pmovmask<string OpcodeStr,
def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+ [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
+ VEX_4V, Sched<[WriteVecMaskedLoad]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, Sched<[WriteStore]>;
+ [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
+ VEX_4V, Sched<[WriteVecMaskedStore]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, VEX_L, Sched<[WriteStore]>;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
+ VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -8367,7 +7952,7 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
(!cast<Instruction>(BlendStr#"rr")
RC:$src0,
- (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
+ (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
RC:$mask)>;
}
let Predicates = [HasAVX] in {
@@ -8444,27 +8029,27 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
- VEX_4V, Sched<[WriteVarVecShift]>;
+ VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1,
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
- VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
- VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
+ VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
(vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
- VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX2, NoVLX] in {
@@ -8547,60 +8132,49 @@ let Predicates = [UseAVX2] in {
}
//===----------------------------------------------------------------------===//
-// Extra selection patterns for FR128, f128, f128mem
+// Extra selection patterns for f128, f128mem
// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
-def : Pat<(store (f128 FR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
+def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
+def : Pat<(store (f128 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
+def : Pat<(alignedloadf128 addr:$src),
+ (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
def : Pat<(loadf128 addr:$src),
- (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
+ (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
-def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
- (COPY_TO_REGCLASS
- (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
- FR128)>;
-
-def : Pat<(X86fand FR128:$src1, FR128:$src2),
- (COPY_TO_REGCLASS
- (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
-
-def : Pat<(and FR128:$src1, FR128:$src2),
- (COPY_TO_REGCLASS
- (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
-
-def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
+def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
- (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
- FR128)>;
+ (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
+ VR128)>;
-def : Pat<(X86for FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
- (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
+ (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
-def : Pat<(or FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
- (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
+ VR128)>;
-def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
+def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
- (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
- FR128)>;
+ (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
+ (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
-def : Pat<(X86fxor FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
- (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
+ VR128)>;
-def : Pat<(xor FR128:$src1, FR128:$src2),
+def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
- (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
- (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+ (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
+ (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
//===----------------------------------------------------------------------===//
// GFNI instructions
@@ -8615,15 +8189,13 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
let isCommutable = 1 in
def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
- [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))],
- SSE_INTALU_ITINS_P.rr>,
- Sched<[SSE_INTALU_ITINS_P.Sched]>, T8PD;
+ [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
+ Sched<[SchedWriteVecALU.XMM]>, T8PD;
def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
[(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
- (bitconvert (MemOpFrag addr:$src2)))))],
- SSE_INTALU_ITINS_P.rm>,
- Sched<[SSE_INTALU_ITINS_P.Sched.Folded, ReadAfterLd]>, T8PD;
+ (bitconvert (MemOpFrag addr:$src2)))))]>,
+ Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD;
}
}
@@ -8636,15 +8208,13 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
- SSE_INTALU_ITINS_P.rr, SSEPackedInt>,
- Sched<[WriteVecALU]>;
+ SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1,
(bitconvert (MemOpFrag addr:$src2)),
- imm:$src3)))],
- SSE_INTALU_ITINS_P.rm, SSEPackedInt>,
- Sched<[WriteVecALU.Folded, ReadAfterLd]>;
+ imm:$src3)))], SSEPackedInt>,
+ Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>;
}
}
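The X86InstrSSE.td hunks above all apply one mechanical change: the per-instruction itinerary argument (SSE_INTALU_ITINS_P.rr/.rm and the like) is dropped, and the generic Sched<[Write...]> annotations are replaced by the per-width scheduling bundles, with the .Folded member plus ReadAfterLd on the folded-load forms. A minimal sketch of the new shape, using a hypothetical GFNI-style pair (the name, the v16i8 value type, and the i128mem/loadv2i64 operands are assumptions for illustration, mirroring the GF2P8MULB multiclass above; the empty asm string is filled in by the enclosing let in the real multiclass):

    def SKETCHrr : PDI<0xCF, MRMSrcReg, (outs VR128:$dst),
                       (ins VR128:$src1, VR128:$src2), "",
                       [(set VR128:$dst, (v16i8 (X86GF2P8mulb VR128:$src1, VR128:$src2)))]>,
                   Sched<[SchedWriteVecALU.XMM]>, T8PD;
    def SKETCHrm : PDI<0xCF, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, i128mem:$src2), "",
                       [(set VR128:$dst, (v16i8 (X86GF2P8mulb VR128:$src1,
                                           (bitconvert (loadv2i64 addr:$src2)))))]>,
                   Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD;

The register form names the XMM member of the bundle directly (YMM for the VEX_L variants earlier in the hunk); the memory form adds .Folded for the folded load and ReadAfterLd, which marks the register operand as read only after the load.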
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
index bdf478600279..2dc6e8b43667 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSVM.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
@@ -17,47 +17,47 @@
let SchedRW = [WriteSystem] in {
// 0F 01 D9
-def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", [], IIC_SVM>, TB;
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
// 0F 01 DC
-def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", [], IIC_STGI>, TB;
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
// 0F 01 DD
-def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", [], IIC_CLGI>, TB;
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
// 0F 01 DE
let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", [], IIC_SKINIT>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
// 0F 01 D8
let Uses = [EAX] in
-def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%eax|eax}", []>, TB,
+ Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%rax|rax}", []>, TB,
+ Requires<[In64BitMode]>;
// 0F 01 DA
let Uses = [EAX] in
-def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%eax|eax}", []>, TB,
+ Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%rax|rax}", []>, TB,
+ Requires<[In64BitMode]>;
// 0F 01 DB
let Uses = [EAX] in
-def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%eax|eax}", []>, TB,
+ Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%rax|rax}", []>, TB,
+ Requires<[In64BitMode]>;
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %eax|eax, ecx}", [], IIC_INVLPG>, TB, Requires<[Not64BitMode]>;
+ "invlpga\t{%eax, %ecx|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %rax|rax, ecx}", [], IIC_INVLPG>, TB, Requires<[In64BitMode]>;
+ "invlpga\t{%rax, %ecx|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
} // SchedRW
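With the itinerary classes (IIC_SVM, IIC_STGI, IIC_INVLPG, ...) removed, the SVM definitions above carry no per-instruction scheduling data at all; they inherit it from the surrounding "let SchedRW = [WriteSystem]" block. A minimal sketch of that shape, with a hypothetical mnemonic and encoding:

    let SchedRW = [WriteSystem] in
    // No itinerary operand and no Sched<[...]> on the def itself;
    // the enclosing block supplies WriteSystem.
    def SKETCHOP : I<0x01, MRM_D9, (outs), (ins), "sketchop", []>, TB;

The remaining change in this hunk is textual: the AT&T templates for invlpga now list the address register (%eax/%rax) ahead of %ecx.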
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
index 43e1752f2df2..ee3b01159174 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -19,49 +19,48 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
"shl{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (shl GR8:$src1, CL))]>;
def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
"shl{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize16;
def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
"shl{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (shl GR32:$src1, CL))]>, OpSize32;
def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
"shl{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (shl GR64:$src1, CL))]>;
} // Uses = [CL]
def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>,
OpSize16;
def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>,
OpSize32;
def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
} // isConvertibleToThreeAddress = 1
// NOTE: We don't include patterns for shifts of a register by one, because
// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
let hasSideEffects = 0 in {
def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
- "shl{b}\t$dst", [], IIC_SR>;
+ "shl{b}\t$dst", []>;
def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
- "shl{w}\t$dst", [], IIC_SR>, OpSize16;
+ "shl{w}\t$dst", []>, OpSize16;
def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
- "shl{l}\t$dst", [], IIC_SR>, OpSize32;
+ "shl{l}\t$dst", []>, OpSize32;
def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
- "shl{q}\t$dst", [], IIC_SR>;
+ "shl{q}\t$dst", []>;
} // hasSideEffects = 0
} // Constraints = "$src = $dst", SchedRW
@@ -72,100 +71,98 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
"shl{b}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>,
OpSize16;
def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>,
OpSize32;
def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>,
Requires<[In64BitMode]>;
}
def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
"shl{b}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src),
"shl{w}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
"shl{l}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
"shl{q}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Shift by 1
def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
"shl{b}\t$dst",
- [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t$dst",
- [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t$dst",
- [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t$dst",
- [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
"shr{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (srl GR8:$src1, CL))]>;
def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize16;
def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (srl GR32:$src1, CL))]>, OpSize32;
def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (srl GR64:$src1, CL))]>;
}
def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2),
"shr{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shr{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"shr{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
"shr{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
// Shift right by 1
def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
"shr{b}\t$dst",
- [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t$dst",
- [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize16;
def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t$dst",
- [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>, OpSize32;
def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t$dst",
- [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
@@ -173,111 +170,101 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
"shr{b}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
OpSize16;
def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>,
OpSize32;
def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>,
Requires<[In64BitMode]>;
}
def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
"shr{b}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src),
"shr{w}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
"shr{l}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
"shr{q}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Shift by 1
def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
"shr{b}\t$dst",
- [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t$dst",
- [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t$dst",
- [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t$dst",
- [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
"sar{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (sra GR8:$src1, CL))],
- IIC_SR>;
+ [(set GR8:$dst, (sra GR8:$src1, CL))]>;
def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (sra GR16:$src1, CL))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (sra GR16:$src1, CL))]>,
+ OpSize16;
def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (sra GR32:$src1, CL))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (sra GR32:$src1, CL))]>,
+ OpSize32;
def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
"sar{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (sra GR64:$src1, CL))],
- IIC_SR>;
+ [(set GR64:$dst, (sra GR64:$src1, CL))]>;
}
def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"sar{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"sar{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"sar{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"sar{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
// Shift by 1
def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
"sar{b}\t$dst",
- [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
- IIC_SR>;
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t$dst",
- [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize16;
def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t$dst",
- [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>, OpSize32;
def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
- "sar{q}\t$dst",
- [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
- IIC_SR>;
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
@@ -285,55 +272,52 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
"sar{b}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize16;
def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>,
+ OpSize32;
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t{%cl, $dst|$dst, cl}",
- [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
}
def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
"sar{b}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src),
"sar{w}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
"sar{l}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
"sar{q}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Shift by 1
def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
"sar{b}\t$dst",
- [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t$dst",
- [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t$dst",
- [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t$dst",
- [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -345,62 +329,62 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL, EFLAGS] in {
def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", []>;
def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{q}\t{%cl, $dst|$dst, cl}", []>;
} // Uses = [CL, EFLAGS]
let Uses = [EFLAGS] in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t$dst", [], IIC_SR>;
+ "rcl{b}\t$dst", []>;
def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
- "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcl{w}\t$dst", []>, OpSize16;
def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcl{l}\t$dst", []>, OpSize32;
def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t$dst", [], IIC_SR>;
+ "rcl{q}\t$dst", []>;
def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
- "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
} // Uses = [EFLAGS]
let Uses = [CL, EFLAGS] in {
def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", []>;
def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{q}\t{%cl, $dst|$dst, cl}", []>;
} // Uses = [CL, EFLAGS]
let Uses = [EFLAGS] in {
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t$dst", [], IIC_SR>;
+ "rcr{b}\t$dst", []>;
def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
- "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcr{w}\t$dst", []>, OpSize16;
def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcr{l}\t$dst", []>, OpSize32;
def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t$dst", [], IIC_SR>;
+ "rcr{q}\t$dst", []>;
def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
- "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
} // Uses = [EFLAGS]
} // Constraints = "$src = $dst"
@@ -408,61 +392,61 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
let SchedRW = [WriteShiftLd, WriteRMW], mayStore = 1 in {
let Uses = [EFLAGS] in {
def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t$dst", [], IIC_SR>;
+ "rcl{b}\t$dst", []>;
def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
- "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcl{w}\t$dst", []>, OpSize16;
def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcl{l}\t$dst", []>, OpSize32;
def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>;
+ "rcl{q}\t$dst", []>, Requires<[In64BitMode]>;
def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
- "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>,
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>,
Requires<[In64BitMode]>;
def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t$dst", [], IIC_SR>;
+ "rcr{b}\t$dst", []>;
def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt),
- "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+ "rcr{w}\t$dst", []>, OpSize16;
def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+ "rcr{l}\t$dst", []>, OpSize32;
def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>;
+ "rcr{q}\t$dst", []>, Requires<[In64BitMode]>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
- "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>,
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>,
Requires<[In64BitMode]>;
} // Uses = [EFLAGS]
let Uses = [CL, EFLAGS] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", []>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>,
+ "rcl{q}\t{%cl, $dst|$dst, cl}", []>,
Requires<[In64BitMode]>;
def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", []>;
def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>,
+ "rcr{q}\t{%cl, $dst|$dst, cl}", []>,
Requires<[In64BitMode]>;
} // Uses = [CL, EFLAGS]
} // SchedRW
@@ -473,215 +457,192 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"rol{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, CL))]>;
def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize16;
def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, CL))]>, OpSize32;
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, CL))]>;
}
def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
// Rotate by 1
def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"rol{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
- IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize16;
def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>, OpSize32;
def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
"rol{b}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16;
def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32;
def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t{%cl, $dst|$dst, cl}",
- [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
}
def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
"rol{b}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)]>;
def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1),
"rol{w}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ OpSize16;
def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
"rol{l}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ OpSize32;
def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
"rol{q}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Rotate by 1
def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
"rol{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t{%cl, $dst|$dst, cl}",
- [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+ [(set GR8:$dst, (rotr GR8:$src1, CL))]>;
def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize16;
def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotr GR32:$src1, CL))]>, OpSize32;
def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t{%cl, $dst|$dst, cl}",
- [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+ [(set GR64:$dst, (rotr GR64:$src1, CL))]>;
}
def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"ror{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))],
- IIC_SR>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))]>;
def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))]>,
+ OpSize16;
def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))]>,
+ OpSize32;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))]>;
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))],
- IIC_SR>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))],
- IIC_SR>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))],
- IIC_SR>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))],
- IIC_SR>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>;
} // Constraints = "$src = $dst", SchedRW
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16;
def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32;
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t{%cl, $dst|$dst, cl}",
- [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
}
def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
"ror{b}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src),
"ror{w}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
"ror{l}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
"ror{q}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)],
- IIC_SR>;
+ [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)],
- IIC_SR>, OpSize16;
+ [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>,
+ OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)],
- IIC_SR>, OpSize32;
+ [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>,
+ OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)],
- IIC_SR>, Requires<[In64BitMode]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
} // SchedRW
@@ -689,42 +650,38 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
// Double shift instructions (generalizations of rotate)
//===----------------------------------------------------------------------===//
-let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShiftDouble] in {
let Uses = [CL] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
- IIC_SHD16_REG_CL>,
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
TB, OpSize16;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
- IIC_SHD16_REG_CL>,
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
TB, OpSize16;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
- IIC_SHD32_REG_CL>, TB, OpSize32;
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+ TB, OpSize32;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
- IIC_SHD32_REG_CL>, TB, OpSize32;
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+ TB, OpSize32;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
- IIC_SHD64_REG_CL>,
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
- IIC_SHD64_REG_CL>,
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
TB;
}
@@ -734,119 +691,113 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
- (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize16;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
- (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize16;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
- (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize32;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
- (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+ (i8 imm:$src3)))]>,
TB, OpSize32;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
- (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ (i8 imm:$src3)))]>,
TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
- (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+ (i8 imm:$src3)))]>,
TB;
}
} // Constraints = "$src = $dst", SchedRW
-let SchedRW = [WriteShiftLd, WriteRMW] in {
+let SchedRW = [WriteShiftDoubleLd, WriteRMW] in {
let Uses = [CL] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+ addr:$dst)]>, TB, OpSize16;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+ addr:$dst)]>, TB, OpSize16;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+ addr:$dst)]>, TB, OpSize32;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+ addr:$dst)]>, TB, OpSize32;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+ addr:$dst)]>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+ addr:$dst)]>, TB;
}
def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD16_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD16_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD32_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD32_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD64_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)],
- IIC_SHD64_MEM_IM>,
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
} // SchedRW
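Two scheduling details stand out in the double-shift hunks: the register forms move from WriteShift to a dedicated WriteShiftDouble class, and the memory forms pair WriteShiftDoubleLd with WriteRMW because they load, shift, and store the same memory operand. A sketch of the memory-form shape (hypothetical name, otherwise mirroring SHLD32mri8 above):

    let SchedRW = [WriteShiftDoubleLd, WriteRMW] in
    def SKETCH_SHLDmri8 : Ii8<0xA4, MRMDestMem,
                              (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
                              "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                              [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
                                               (i8 imm:$src3)), addr:$dst)]>, TB, OpSize32;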
@@ -897,7 +848,7 @@ let hasSideEffects = 0 in {
// x86memop:$src1
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
- // RC:$src1
+ // RC:$src2
ReadAfterLd]>;
}
}
@@ -967,7 +918,7 @@ let Predicates = [HasBMI2] in {
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
- // Artificially lower the complexity so that we'll favor
+ // We prefer to use
// mov (%ecx), %esi
// shl $imm, $esi
//
@@ -975,32 +926,32 @@ let Predicates = [HasBMI2] in {
//
// movb $imm, %al
// shlx %al, (%ecx), %esi
- let AddedComplexity = -20 in {
- def : Pat<(sra (loadi32 addr:$src1), GR8:$src2),
- (SARX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra (loadi64 addr:$src1), GR8:$src2),
- (SARX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-
- def : Pat<(srl (loadi32 addr:$src1), GR8:$src2),
- (SHRX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi64 addr:$src1), GR8:$src2),
- (SHRX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-
- def : Pat<(shl (loadi32 addr:$src1), GR8:$src2),
- (SHLX32rm addr:$src1,
- (INSERT_SUBREG
- (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi64 addr:$src1), GR8:$src2),
- (SHLX64rm addr:$src1,
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- }
+ //
+ // This priority is enforced by IsProfitableToFoldLoad.
+ def : Pat<(sra (loadi32 addr:$src1), GR8:$src2),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), GR8:$src2),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), GR8:$src2),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), GR8:$src2),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), GR8:$src2),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), GR8:$src2),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
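The final hunk drops the AddedComplexity = -20 wrapper around the BMI2 load-folded shift patterns; per the updated comment, the preference for a plain load plus an immediate-count shift over materialising the count and using a load-folded shlx is now enforced by IsProfitableToFoldLoad rather than by de-prioritising these patterns. For reference, the removed arrangement looked like this (one pattern shown, copied from the deleted lines):

    let AddedComplexity = -20 in
    def : Pat<(shl (loadi32 addr:$src1), GR8:$src2),
              (SHLX32rm addr:$src1,
                (INSERT_SUBREG
                  (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;

The negative AddedComplexity lowered the pattern's selection score so competing patterns won; with the patterns restored to default priority, the load-folding decision lives in the C++ hook instead.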
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
index 576f87b13ab4..35ee00b9e016 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -15,28 +15,26 @@
let SchedRW = [WriteSystem] in {
let Defs = [RAX, RDX] in
- def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
- TB;
+ def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB;
let Defs = [RAX, RCX, RDX] in
- def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)],
- IIC_RDTSCP>, TB;
+ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
// CPU flow control instructions
-let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in {
+let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in {
def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
}
-def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
-def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
// Interrupt and SysCall Instructions.
let Uses = [EFLAGS] in
def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>, Requires<[Not64BitMode]>;
-def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
- [(int_x86_int (i8 3))], IIC_INT3>;
+
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
} // SchedRW
// The long form of "int $3" turns into int3 as a size optimization.
@@ -46,21 +44,19 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
let SchedRW = [WriteSystem] in {
def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
- [(int_x86_int imm:$trap)], IIC_INT>;
+ [(int_x86_int imm:$trap)]>;
-def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
-def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
-def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB,
Requires<[In64BitMode]>;
-def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
- IIC_SYS_ENTER_EXIT>, TB;
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB;
-def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
- IIC_SYS_ENTER_EXIT>, TB;
-def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
- IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB,
+ Requires<[In64BitMode]>;
} // SchedRW
def : Pat<(debugtrap),
@@ -73,44 +69,42 @@ def : Pat<(debugtrap),
//
let SchedRW = [WriteSystem] in {
let Defs = [AL], Uses = [DX] in
-def IN8rr : I<0xEC, RawFrm, (outs), (ins),
- "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
+def IN8rr : I<0xEC, RawFrm, (outs), (ins), "in{b}\t{%dx, %al|al, dx}", []>;
let Defs = [AX], Uses = [DX] in
-def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize16;
+def IN16rr : I<0xED, RawFrm, (outs), (ins), "in{w}\t{%dx, %ax|ax, dx}", []>,
+ OpSize16;
let Defs = [EAX], Uses = [DX] in
-def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
+def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", []>,
+ OpSize32;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port),
- "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
+ "in{b}\t{$port, %al|al, $port}", []>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
- "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
+ "in{w}\t{$port, %ax|ax, $port}", []>, OpSize16;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
- "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
+ "in{l}\t{$port, %eax|eax, $port}", []>, OpSize32;
let Uses = [DX, AL] in
-def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
- "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins), "out{b}\t{%al, %dx|dx, al}", []>;
let Uses = [DX, AX] in
-def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16;
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins), "out{w}\t{%ax, %dx|dx, ax}", []>,
+ OpSize16;
let Uses = [DX, EAX] in
-def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", []>,
+ OpSize32;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port),
- "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
+ "out{b}\t{%al, $port|$port, al}", []>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
- "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
+ "out{w}\t{%ax, $port|$port, ax}", []>, OpSize16;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
- "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
+ "out{l}\t{%eax, $port|$port, eax}", []>, OpSize32;
} // SchedRW
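As a side note on how the register constraints above are used in practice: the definitions pin the data to AL/AX/EAX and the port to DX (or an 8-bit immediate). A minimal user-side sketch, assuming GCC/Clang extended inline asm and ring-0 (or iopl) privileges; the helper names are ours, not part of the source:

#include <stdint.h>

/* Byte-wide port input: the compiler picks the imm8 form for constant
   ports below 256 ("N") and the DX form otherwise ("d"), matching the
   IN8ri / IN8rr pair above. */
static inline uint8_t inb(uint16_t port) {
    uint8_t value;
    __asm__ volatile("inb %w1, %b0" : "=a"(value) : "Nd"(port));
    return value;
}

/* Byte-wide port output, matching OUT8ir / OUT8rr. */
static inline void outb(uint16_t port, uint8_t value) {
    __asm__ volatile("outb %b0, %w1" : : "a"(value), "Nd"(port));
}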
@@ -119,17 +113,17 @@ def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
let SchedRW = [WriteSystem] in {
def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
} // SchedRW
@@ -138,17 +132,17 @@ def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
let SchedRW = [WriteSystem] in {
def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[Not64BitMode]>;
def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
Requires<[In64BitMode]>;
} // SchedRW
@@ -156,12 +150,12 @@ def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
// Segment override instruction prefixes
let SchedRW = [WriteNop] in {
-def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", [], IIC_NOP>;
-def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", [], IIC_NOP>;
-def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", [], IIC_NOP>;
-def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", [], IIC_NOP>;
-def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", [], IIC_NOP>;
-def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", [], IIC_NOP>;
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -170,24 +164,24 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", [], IIC_NOP>;
let SchedRW = [WriteMove] in {
def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
let mayStore = 1 in {
def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSizeIgnore;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
}
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in {
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSizeIgnore;
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
}
} // SchedRW
@@ -195,198 +189,168 @@ def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
// Segmentation support instructions.
let SchedRW = [WriteSystem] in {
-def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB;
let mayLoad = 1 in
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
- OpSize16;
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
- OpSize16;
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
let mayLoad = 1 in
def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
- OpSize32;
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
- OpSize32;
-// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
+// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo.
let mayLoad = 1 in
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+// i16mem operand in LSL32rm and GR32 operand in LSL32rr is not a typo.
let mayLoad = 1 in
def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
- OpSize16;
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
- OpSize16;
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
+// i16mem operand in LSL64rm and GR32 operand in LSL64rr is not a typo.
let mayLoad = 1 in
-def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
- OpSize32;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
- OpSize32;
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
let mayLoad = 1 in
-def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
-def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
-def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
- [], IIC_INVLPG>, TB;
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB;
def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
- "str{w}\t$dst", [], IIC_STR>, TB, OpSize16;
+ "str{w}\t$dst", []>, TB, OpSize16;
def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
- "str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
+ "str{l}\t$dst", []>, TB, OpSize32;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
- "str{q}\t$dst", [], IIC_STR>, TB;
+ "str{q}\t$dst", []>, TB;
let mayStore = 1 in
-def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst),
- "str{w}\t$dst", [], IIC_STR>, TB;
+def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst), "str{w}\t$dst", []>, TB;
-def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
- "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in
-def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
- "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable;
-def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>,
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), "push{w}\t{%cs|cs}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>,
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), "push{l}\t{%cs|cs}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>,
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), "push{w}\t{%ss|ss}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>,
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), "push{l}\t{%ss|ss}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>,
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), "push{w}\t{%ds|ds}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>,
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), "push{l}\t{%ds|ds}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t{%es|es}", [], IIC_PUSH_SR>,
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins), "push{w}\t{%es|es}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t{%es|es}", [], IIC_PUSH_SR>,
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins), "push{l}\t{%es|es}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), "push{w}\t{%fs|fs}", []>,
+ OpSize16, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), "push{l}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), "push{w}\t{%gs|gs}", []>,
+ OpSize16, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), "push{l}\t{%gs|gs}", []>, TB,
OpSize32, Requires<[Not64BitMode]>;
-def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB;
-def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[Not64BitMode]>;
-def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB;
-def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[Not64BitMode]>;
-def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
-def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), "push{q}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), "push{q}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
// No "pop cs" instruction.
-def POPSS16 : I<0x17, RawFrm, (outs), (ins),
- "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+def POPSS16 : I<0x17, RawFrm, (outs), (ins), "pop{w}\t{%ss|ss}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def POPSS32 : I<0x17, RawFrm, (outs), (ins),
- "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+def POPSS32 : I<0x17, RawFrm, (outs), (ins), "pop{l}\t{%ss|ss}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
- "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins), "pop{w}\t{%ds|ds}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
- "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins), "pop{l}\t{%ds|ds}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPES16 : I<0x07, RawFrm, (outs), (ins),
- "pop{w}\t{%es|es}", [], IIC_POP_SR>,
+def POPES16 : I<0x07, RawFrm, (outs), (ins), "pop{w}\t{%es|es}", []>,
OpSize16, Requires<[Not64BitMode]>;
-def POPES32 : I<0x07, RawFrm, (outs), (ins),
- "pop{l}\t{%es|es}", [], IIC_POP_SR>,
+def POPES32 : I<0x07, RawFrm, (outs), (ins), "pop{l}\t{%es|es}", []>,
OpSize32, Requires<[Not64BitMode]>;
-def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB;
-def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB,
- OpSize32, Requires<[Not64BitMode]>;
-def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
-
-def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB;
-def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB,
- OpSize32, Requires<[Not64BitMode]>;
-def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB,
- OpSize32, Requires<[In64BitMode]>;
-
-
-def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins), "pop{w}\t{%fs|fs}", []>,
+ OpSize16, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins), "pop{l}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins), "pop{q}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins), "pop{w}\t{%gs|gs}", []>,
+ OpSize16, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins), "pop{l}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins), "pop{q}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
Requires<[Not64BitMode]>;
-def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lds{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
Requires<[Not64BitMode]>;
-def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
-def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
-def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lss{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lss{q}\t{$src, $dst|$dst, $src}", []>, TB;
-def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
Requires<[Not64BitMode]>;
-def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "les{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
Requires<[Not64BitMode]>;
-def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
-def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
-def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB;
-def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
-def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
-def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB;
-
-def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
- "verr\t$seg", [], IIC_VERR>, TB;
-def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
- "verw\t$seg", [], IIC_VERW_MEM>, TB;
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in {
-def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
- "verr\t$seg", [], IIC_VERR>, TB;
-def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
- "verw\t$seg", [], IIC_VERW_REG>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable;
}
} // SchedRW
@@ -394,97 +358,100 @@ def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
// Descriptor-table support instructions
let SchedRW = [WriteSystem] in {
-def SGDT16m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
- "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def SGDT32m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
- "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>;
-def SGDT64m : I<0x01, MRM0m, (outs), (ins opaque80mem:$dst),
- "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>;
-def SIDT16m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
- "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def SIDT32m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
- "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
-def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst),
- "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SGDT16m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdtw\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdt{l|d}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SIDT16m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidtw\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidt{l|d}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
- "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+ "sldt{w}\t$dst", []>, TB, OpSize16;
let mayStore = 1 in
def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
- "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+ "sldt{w}\t$dst", []>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
- "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
+ "sldt{l}\t$dst", []>, OpSize32, TB;
// LLDT is not interpreted specially in 64-bit mode because there is no sign
// extension.
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
- "sldt{q}\t$dst", [], IIC_SLDT>, TB;
-let mayStore = 1 in
-def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst),
- "sldt{q}\t$dst", [], IIC_SLDT>, TB;
-
-def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
-def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src),
- "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>;
-def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
-def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
- "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
+ "sldt{q}\t$dst", []>, TB, Requires<[In64BitMode]>;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdtw\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdt{l|d}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdt{q}\t$src", []>, TB, Requires<[In64BitMode]>;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidtw\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidt{l|d}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidt{q}\t$src", []>, TB, Requires<[In64BitMode]>;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
- "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+ "lldt{w}\t$src", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
- "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+ "lldt{w}\t$src", []>, TB, NotMemoryFoldable;
} // SchedRW
//===----------------------------------------------------------------------===//
// Specialized register support
let SchedRW = [WriteSystem] in {
let Uses = [EAX, ECX, EDX] in
-def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
let Defs = [EAX, EDX], Uses = [ECX] in
-def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
let Defs = [RAX, RDX], Uses = [ECX] in
- def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
- TB;
+ def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)]>, TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
- "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
+ "smsw{w}\t$dst", []>, OpSize16, TB;
def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
- "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
+ "smsw{l}\t$dst", []>, OpSize32, TB;
// no m form encodable; use SMSW16m
def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
- "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+ "smsw{q}\t$dst", []>, TB;
// For memory operands, there is only a 16-bit form
def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
- "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+ "smsw{w}\t$dst", []>, TB;
def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
- "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+ "lmsw{w}\t$src", []>, TB, NotMemoryFoldable;
let mayLoad = 1 in
def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
- "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
+ "lmsw{w}\t$src", []>, TB, NotMemoryFoldable;
let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
- def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
} // SchedRW
//===----------------------------------------------------------------------===//
// Cache instructions
let SchedRW = [WriteSystem] in {
-def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
-def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, TB;
+
+// wbnoinvd is like wbinvd, except it does not invalidate the caches.
+// Encoding: wbinvd with an 0xF3 prefix.
+def WBNOINVD : I<0x09, RawFrm, (outs), (ins), "wbnoinvd",
+ [(int_x86_wbnoinvd)]>, XS,
+ Requires<[HasWBNOINVD]>;
} // SchedRW
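A hedged sketch of how the two cache instructions above are reached from C, assuming a GNU-style toolchain; both are privileged, and the .byte form is only there for assemblers that predate the wbnoinvd mnemonic (the encoding is wbinvd's 0F 09 with an F3 prefix, as the comment notes). Helper names are ours:

/* Write back and invalidate all caches (CPL 0 only). */
static inline void cache_wbinvd(void) {
    __asm__ volatile("wbinvd" : : : "memory");
}

/* Write back without invalidating; on CPUs lacking WBNOINVD the F3
   prefix is ignored and this degrades to plain wbinvd. */
static inline void cache_wbnoinvd(void) {
    __asm__ volatile(".byte 0xf3, 0x0f, 0x09" : : : "memory");
}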
//===----------------------------------------------------------------------===//
// CET instructions
-let SchedRW = [WriteSystem], Predicates = [HasSHSTK] in{
+// Use with caution; availability is not predicated on CPU features.
+let SchedRW = [WriteSystem] in {
let Uses = [SSP] in {
let Defs = [SSP] in {
def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src",
@@ -534,7 +501,12 @@ let SchedRW = [WriteSystem], Predicates = [HasSHSTK] in{
"clrssbsy\t$src",
[(int_x86_clrssbsy addr:$src)]>, XS;
} // Defs SSP
-} // SchedRW && HasSHSTK
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+ def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, XS;
+ def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, XS;
+} // SchedRW
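For context on where these land in generated code: with CET indirect branch tracking enabled (e.g. -fcf-protection), compilers place endbr64/endbr32 at every indirect branch target; on CPUs without CET the bytes execute as a NOP. A small hand-written sketch, assuming 64-bit GNU asm (the symbol name is ours):

/* A manually written indirect-branch target that stays valid under
   CET IBT. endbr64 encodes as F3 0F 1E FA and is a NOP elsewhere. */
void ibt_safe_target(void);
__asm__(".globl ibt_safe_target\n"
        "ibt_safe_target:\n"
        "    endbr64\n"
        "    ret\n");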
//===----------------------------------------------------------------------===//
// XSAVE instructions
@@ -551,40 +523,40 @@ let Uses = [EDX, EAX, ECX] in
} // HasXSAVE
let Uses = [EDX, EAX] in {
-def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
"xsave\t$dst",
[(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
-def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
"xsave64\t$dst",
[(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
-def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
"xrstor\t$dst",
[(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
-def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
"xrstor64\t$dst",
[(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
-def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
"xsaveopt\t$dst",
[(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>;
-def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
"xsaveopt64\t$dst",
[(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>;
-def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec\t$dst",
[(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>;
-def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec64\t$dst",
[(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>;
-def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves\t$dst",
[(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
-def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves64\t$dst",
[(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>;
-def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors\t$dst",
[(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
-def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors64\t$dst",
[(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>;
} // Uses
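One detail worth spelling out from the patterns above: every xsave/xrstor intrinsic takes the requested-feature bitmap already split into EDX:EAX, which is exactly how the instruction consumes it. A sketch, assuming inline asm and a sufficiently large, 64-byte-aligned save area; the wrapper name is ours:

#include <stdint.h>

/* Save the state components selected by feature_mask into area.
   Bits 31:0 of the mask go to EAX, bits 63:32 to EDX. */
static inline void xsave_area(unsigned char *area, uint64_t feature_mask) {
    uint32_t eax = (uint32_t)feature_mask;
    uint32_t edx = (uint32_t)(feature_mask >> 32);
    __asm__ volatile("xsave %0"
                     : "+m"(*area)
                     : "a"(eax), "d"(edx)
                     : "memory");
}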
@@ -625,9 +597,9 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
- def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", [], IIC_PKU>, TB;
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
let Uses = [EAX, ECX, EDX] in
- def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", [], IIC_PKU>, TB;
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -635,100 +607,134 @@ let Uses = [EAX, ECX, EDX] in
let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in {
def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
"rdfsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdfsbase_32))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
"rdfsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdfsbase_64))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
"rdgsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdgsbase_32))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
"rdgsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdgsbase_64))],
- IIC_SEGMENT_BASE_R>, XS;
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
"wrfsbase{l}\t$src",
- [(int_x86_wrfsbase_32 GR32:$src)],
- IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
"wrfsbase{q}\t$src",
- [(int_x86_wrfsbase_64 GR64:$src)],
- IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
"wrgsbase{l}\t$src",
- [(int_x86_wrgsbase_32 GR32:$src)], IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
"wrgsbase{q}\t$src",
- [(int_x86_wrgsbase_64 GR64:$src)],
- IIC_SEGMENT_BASE_W>, XS;
+ [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
}
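These definitions are only selected in 64-bit mode with the FSGSBASE feature; at run time the instructions additionally require the OS to have set CR4.FSGSBASE, otherwise they raise #UD. A minimal inline-asm sketch (helper names are ours):

#include <stdint.h>

static inline uint64_t read_fs_base(void) {
    uint64_t base;
    __asm__ volatile("rdfsbase %0" : "=r"(base));
    return base;
}

static inline void write_gs_base(uint64_t base) {
    __asm__ volatile("wrgsbase %0" : : "r"(base));
}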
//===----------------------------------------------------------------------===//
// INVPCID Instruction
let SchedRW = [WriteSystem] in {
def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD,
- Requires<[Not64BitMode]>;
+ "invpcid\t{$src2, $src1|$src1, $src2}",
+ [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8PD,
+ Requires<[Not64BitMode, HasINVPCID]>;
def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD,
- Requires<[In64BitMode]>;
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode, HasINVPCID]>;
} // SchedRW
+let Predicates = [In64BitMode, HasINVPCID] in {
+ // The instruction can only use a 64 bit register as the register argument
+ // in 64 bit mode, while the intrinsic only accepts a 32 bit argument
+ // corresponding to it.
+  // The accepted values for now are 0, 1, 2, and 3 anyway (see Intel SDM --
+  // INVPCID type), so it doesn't hurt us that one can't supply a 64 bit value
+  // here.
+ def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2),
+ (INVPCID64
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit),
+ addr:$src2)>;
+}
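To make the operand shapes above concrete: the register operand carries the invalidation type (0-3), and the 128-bit memory operand is a descriptor holding the PCID and a linear address; in 64-bit mode the register must be 64-bit, which is what the SUBREG_TO_REG zero-extension in the pattern compensates for. A sketch assuming inline asm and the SDM descriptor layout (struct and helper names are ours):

#include <stdint.h>

struct invpcid_desc {
    uint64_t pcid;    /* only bits 11:0 are used; the rest must be zero */
    uint64_t address; /* linear address, consulted by type 0 only */
};

static inline void invpcid(uint64_t type, uint64_t pcid, uint64_t address) {
    struct invpcid_desc desc = { pcid, address };
    /* AT&T operand order: memory descriptor first, type register second. */
    __asm__ volatile("invpcid %1, %0" : : "r"(type), "m"(desc) : "memory");
}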
+
+
//===----------------------------------------------------------------------===//
// SMAP Instruction
let Defs = [EFLAGS], SchedRW = [WriteSystem] in {
- def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", [], IIC_SMAP>, TB;
- def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", [], IIC_SMAP>, TB;
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
}
//===----------------------------------------------------------------------===//
// SMX Instruction
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
- def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", [], IIC_SMX>, TB;
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
} // Uses, Defs
} // SchedRW
//===----------------------------------------------------------------------===//
// TS flag control instruction.
let SchedRW = [WriteSystem] in {
-def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
}
//===----------------------------------------------------------------------===//
// IF (inside EFLAGS) management instructions.
let SchedRW = [WriteSystem], Uses = [EFLAGS], Defs = [EFLAGS] in {
-def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
-def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
}
//===----------------------------------------------------------------------===//
// RDPID Instruction
let SchedRW = [WriteSystem] in {
-def RDPID32 : I<0xC7, MRM7r, (outs GR32:$src), (ins),
- "rdpid\t$src", [], IIC_RDPID>, XS,
- Requires<[Not64BitMode]>;
-def RDPID64 : I<0xC7, MRM7r, (outs GR64:$src), (ins),
- "rdpid\t$src", [], IIC_RDPID>, XS,
- Requires<[In64BitMode]>;
+def RDPID32 : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+ "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, XS,
+ Requires<[Not64BitMode, HasRDPID]>;
+def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, XS,
+ Requires<[In64BitMode, HasRDPID]>;
} // SchedRW
+let Predicates = [In64BitMode, HasRDPID] in {
+  // Due to the silly instruction definition, we have to compensate for the
+  // instruction outputting a 64-bit register.
+ def : Pat<(int_x86_rdpid),
+ (EXTRACT_SUBREG (RDPID64), sub_32bit)>;
+}
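The pattern above exists because the 64-bit form writes a whole 64-bit register even though the value it returns, the contents of IA32_TSC_AUX, fits in 32 bits. A user-level sketch of the same truncation (the helper name is ours; what TSC_AUX actually holds is up to the OS):

#include <stdint.h>

static inline uint32_t rdpid32(void) {
    uint64_t tsc_aux;
    __asm__ volatile("rdpid %0" : "=r"(tsc_aux));
    return (uint32_t)tsc_aux; /* mirrors the EXTRACT_SUBREG in the pattern */
}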
+
+
//===----------------------------------------------------------------------===//
-// PTWRITE Instruction
+// PTWRITE Instruction - Write Data to a Processor Trace Packet
let SchedRW = [WriteSystem] in {
-
def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst),
- "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS;
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, XS,
+ Requires<[HasPTWRITE]>;
def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst),
- "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS,
- Requires<[In64BitMode]>;
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, XS,
+ Requires<[In64BitMode, HasPTWRITE]>;
def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst),
- "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS;
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, XS,
+ Requires<[HasPTWRITE]>;
def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
- "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS,
- Requires<[In64BitMode]>;
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, XS,
+ Requires<[In64BitMode, HasPTWRITE]>;
+} // SchedRW
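For reference, the operand of PTWRITE ends up in a PTW packet in the Intel Processor Trace stream when tracing is active and PTWEn is set; when tracing is off the instruction has no visible effect, though it still raises #UD on CPUs without the feature. A sketch using the suffixed AT&T mnemonics that match the asm strings above (helper names are ours):

#include <stdint.h>

static inline void pt_write64(uint64_t value) {
    __asm__ volatile("ptwriteq %0" : : "r"(value));
}

static inline void pt_write32(uint32_t value) {
    __asm__ volatile("ptwritel %0" : : "r"(value));
}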
+
+//===----------------------------------------------------------------------===//
+// Platform Configuration instruction
+
+// From ISA docs:
+// "This instruction is used to execute functions for configuring platform
+// features.
+// EAX: Leaf function to be invoked.
+// RBX/RCX/RDX: Leaf-specific purpose."
+// "Successful execution of the leaf clears RAX (set to zero) and ZF, CF, PF,
+// AF, OF, and SF are cleared. In case of failure, the failure reason is
+// indicated in RAX with ZF set to 1 and CF, PF, AF, OF, and SF are cleared."
+// Thus all these mentioned registers are considered clobbered.
+
+let SchedRW = [WriteSystem] in {
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in
+ def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB,
+ Requires<[HasPCONFIG]>;
} // SchedRW
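A sketch of the register contract the comment block above quotes: EAX selects the leaf, RBX/RCX/RDX carry leaf-specific arguments, and RAX comes back as zero on success or a failure reason otherwise, so all four plus EFLAGS are treated as clobbered. Assuming inline asm in 64-bit mode; the helper name is ours, and older assemblers may need the raw 0F 01 C5 bytes instead of the mnemonic:

#include <stdint.h>

static inline uint64_t pconfig_leaf(uint32_t leaf, uint64_t rbx_in,
                                    uint64_t rcx_in, uint64_t rdx_in) {
    uint64_t rax = leaf;
    uint64_t rbx = rbx_in, rcx = rcx_in, rdx = rdx_in;
    __asm__ volatile("pconfig"
                     : "+a"(rax), "+b"(rbx), "+c"(rcx), "+d"(rdx)
                     :
                     : "cc");
    return rax; /* 0 on success, otherwise the failure reason */
}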
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
index 4bb2c204b368..06a438ebfcad 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrVMX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -18,59 +18,67 @@
let SchedRW = [WriteSystem] in {
// 66 0F 38 80
def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[Not64BitMode]>;
def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[In64BitMode]>;
// 66 0F 38 81
def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[Not64BitMode]>;
def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[In64BitMode]>;
// 0F 01 C1
-def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", [], IIC_VMX>, TB;
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmclear\t$vmcs", []>, PD;
// OF 01 D4
-def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", [], IIC_VMX>, TB;
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
// 0F 01 C2
-def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", [], IIC_VMX>, TB;
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
// 0F 01 C3
-def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", [], IIC_VMX>, TB;
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
- "vmptrld\t$vmcs", [], IIC_VMX>, PS;
+ "vmptrld\t$vmcs", []>, PS;
def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
- "vmptrst\t$vmcs", [], IIC_VMX>, PS;
+ "vmptrst\t$vmcs", []>, PS;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
let mayStore = 1 in {
def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
} // mayStore
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
let mayLoad = 1 in {
def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
} // mayLoad
// 0F 01 C4
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td
index c1cb4dcb16be..322bdb74e2de 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -48,8 +48,6 @@ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
-def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
-def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion
@@ -111,7 +109,6 @@ def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
@@ -148,7 +145,6 @@ multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
(subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
- let AddedComplexity = 25 in // to give priority over vinsertf128rm
def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
(VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
}
@@ -217,13 +213,13 @@ let Predicates = [HasVLX] in {
sub_xmm>;
defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v2i64,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
v4i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v4i32,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
v8i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v8i16,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
v16i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v16i8,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
v32i8, sub_xmm>;
// Special patterns for storing subvector extracts of lower 128-bits of 512.
@@ -232,13 +228,13 @@ let Predicates = [HasVLX] in {
sub_xmm>;
defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v2i64,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
v8i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v4i32,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32,
v16i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v8i16,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16,
v32i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v16i8,
+ defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8,
v64i8, sub_xmm>;
// Special patterns for storing subvector extracts of lower 256-bits of 512.
@@ -247,186 +243,83 @@ let Predicates = [HasVLX] in {
sub_ymm>;
defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v4i64,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
v8i64, sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v8i32,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32,
v16i32, sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v16i16,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16,
v32i16, sub_ymm>;
- defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v32i8,
+ defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8,
v64i8, sub_ymm>;
}
// If we're inserting into an all zeros vector, just use a plain move which
-// will zero the upper bits.
-// TODO: Is there a safe way to detect whether the producing instruction
-// already zeroed the upper bits?
-multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
- ValueType DstTy, ValueType SrcTy,
- ValueType ZeroTy, PatFrag memop,
- SubRegIndex SubIdx> {
+// will zero the upper bits. A post-isel hook will take care of removing
+// any moves that we can prove are unnecessary.
+multiclass subvec_zero_lowering<string MoveStr,
+ RegisterClass RC, ValueType DstTy,
+ ValueType SrcTy, ValueType ZeroTy,
+ SubRegIndex SubIdx> {
def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
(SrcTy RC:$src), (iPTR 0))),
(SUBREG_TO_REG (i64 0),
- (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src), SubIdx)>;
-
- def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
- (SrcTy (bitconvert (memop addr:$src))),
- (iPTR 0))),
- (SUBREG_TO_REG (i64 0),
- (!cast<Instruction>("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>;
+ (SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>;
}
let Predicates = [HasAVX, NoVLX] in {
- defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64,
- sub_xmm>;
- defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64,
- sub_xmm>;
+ defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, sub_xmm>;
}
let Predicates = [HasVLX] in {
- defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32,
- loadv2f64, sub_xmm>;
- defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32,
- loadv4f32, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32,
- loadv2i64, sub_xmm>;
-
- defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32,
- loadv2f64, sub_xmm>;
- defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32,
- loadv4f32, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32,
- loadv2i64, sub_xmm>;
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32,
- loadv2i64, sub_xmm>;
-
- defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32,
- loadv4f64, sub_ymm>;
- defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32,
- loadv8f32, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32,
- loadv4i64, sub_ymm>;
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, sub_ymm>;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64,
- sub_xmm>;
- defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64,
- sub_xmm>;
- defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64,
- sub_xmm>;
-
- defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32,
- loadv4f64, sub_ymm>;
- defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32,
- loadv8f32, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32,
- loadv4i64, sub_ymm>;
- defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
- loadv4i64, sub_ymm>;
+ defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>;
}
-// List of opcodes that guaranteed to zero the upper elements of vector regs.
-// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
-// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
-// this difficult. So starting with a couple opcodes used by reduction loops
-// where we explicitly insert zeros.
-class veczeroupper<ValueType vt, RegisterClass RC> :
- PatLeaf<(vt RC:$src), [{
- return N->getOpcode() == X86ISD::VPMADDWD ||
- N->getOpcode() == X86ISD::PSADBW;
- }]>;
-
-def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
-def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
-def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
-def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
-def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
-def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
-
-def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
-def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
-def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
-def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
-def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
-def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
-
-
-// If we can guarantee the upper elements have already been zeroed we can elide
-// an explicit zeroing.
-multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
- ValueType SrcTy, ValueType ZeroTy,
- SubRegIndex SubIdx, PatLeaf Zeroupper> {
- def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
- Zeroupper:$src, (iPTR 0))),
- (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
-}
-
-// 128->256
-defm: subvector_zero_ellision<VR128, v4f64, v2f64, v8i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v4i64, v2i64, v8i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v8i32, v4i32, v8i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v16i16, v8i16, v8i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v32i8, v16i8, v8i32, sub_xmm, zeroupperv16i8>;
-
-// 128->512
-defm: subvector_zero_ellision<VR128, v8f64, v2f64, v16i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v16f32, v4f32, v16i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v8i64, v2i64, v16i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v16i32, v4i32, v16i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v32i16, v8i16, v16i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v64i8, v16i8, v16i32, sub_xmm, zeroupperv16i8>;
-
-// 256->512
-defm: subvector_zero_ellision<VR256, v8f64, v4f64, v16i32, sub_ymm, zeroupperv4f64>;
-defm: subvector_zero_ellision<VR256, v16f32, v8f32, v16i32, sub_ymm, zeroupperv8f32>;
-defm: subvector_zero_ellision<VR256, v8i64, v4i64, v16i32, sub_ymm, zeroupperv4i64>;
-defm: subvector_zero_ellision<VR256, v16i32, v8i32, v16i32, sub_ymm, zeroupperv8i32>;
-defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>;
-defm: subvector_zero_ellision<VR256, v64i8, v32i8, v16i32, sub_ymm, zeroupperv32i8>;
-
-
class maskzeroupper<ValueType vt, RegisterClass RC> :
PatLeaf<(vt RC:$src), [{
return isMaskZeroExtended(N);
}]>;
+def maskzeroupperv1i1 : maskzeroupper<v1i1, VK1>;
def maskzeroupperv2i1 : maskzeroupper<v2i1, VK2>;
def maskzeroupperv4i1 : maskzeroupper<v4i1, VK4>;
def maskzeroupperv8i1 : maskzeroupper<v8i1, VK8>;
@@ -438,11 +331,18 @@ def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>;
// zeroing.
let Predicates = [HasBWI] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK32)>;
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
maskzeroupperv16i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK16:$src, VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK64)>;
@@ -456,10 +356,19 @@ let Predicates = [HasBWI] in {
let Predicates = [HasAVX512] in {
def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK16)>;
}
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
+}
+
let Predicates = [HasVLX, HasDQI] in {
def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
maskzeroupperv2i1:$src, (iPTR 0))),
@@ -495,6 +404,23 @@ let Predicates = [HasBWI, HasVLX] in {
// If the bits are not zero we have to fall back to explicitly zeroing by
// using shifts.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK1:$mask, VK16),
+ (i8 15)), (i8 15))>;
+
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
+ (i8 14)), (i8 14))>;
+
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
+ (i8 12)), (i8 12))>;
+}
+
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
(v8i1 VK8:$mask), (iPTR 0))),
@@ -506,9 +432,11 @@ let Predicates = [HasDQI] in {
def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
(v8i1 VK8:$mask), (iPTR 0))),
(COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>;
-}
-let Predicates = [HasVLX, HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK1:$mask, VK8),
+ (i8 7)), (i8 7))>;
def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
(v2i1 VK2:$mask), (iPTR 0))),
(KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8),
@@ -519,17 +447,6 @@ let Predicates = [HasVLX, HasDQI] in {
(i8 4)), (i8 4))>;
}
-let Predicates = [HasVLX] in {
- def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
- (v2i1 VK2:$mask), (iPTR 0))),
- (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
- (i8 14)), (i8 14))>;
- def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
- (v4i1 VK4:$mask), (iPTR 0))),
- (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
- (i8 12)), (i8 12))>;
-}
-
let Predicates = [HasBWI] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
(v16i1 VK16:$mask), (iPTR 0))),
@@ -567,6 +484,10 @@ let Predicates = [HasBWI, HasDQI] in {
let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32),
+ (i8 31)), (i8 31))>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
(v2i1 VK2:$mask), (iPTR 0))),
(KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32),
(i8 30)), (i8 30))>;
@@ -576,6 +497,10 @@ let Predicates = [HasBWI, HasVLX] in {
(i8 28)), (i8 28))>;
def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK1:$mask, VK64),
+ (i8 63)), (i8 63))>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
(v2i1 VK2:$mask), (iPTR 0))),
(KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64),
(i8 62)), (i8 62))>;
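The KSHIFTL/KSHIFTR pairs added above zero-extend a narrow mask into a wider mask register by shifting the payload up to the top of the register and back down, which clears every bit above it. A minimal standalone sketch of the same trick on a plain 16-bit mask (my own illustration, not code from the patch; the shift amount mirrors the i8 15/14/12 immediates in the patterns):

#include <cstdint>

// Zero-extend the low NarrowBits of Mask into a 16-bit mask by shifting the
// payload to the top of the register and back down again.
uint16_t zextMask16(uint16_t Mask, unsigned NarrowBits) {
  unsigned Shift = 16 - NarrowBits; // e.g. 15 for a v1i1 payload, 14 for v2i1
  return static_cast<uint16_t>(static_cast<uint16_t>(Mask << Shift) >> Shift);
}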
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
index c4b8e3e90d29..ff3e3be48a24 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -14,11 +14,11 @@
multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WritePHAdd]>;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
- Sched<[WritePHAddLd, ReadAfterLd]>;
+ Sched<[SchedWritePHAdd.XMM.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
@@ -41,123 +41,133 @@ let ExeDomain = SSEPackedInt in {
// Scalar load 2 addr operand instructions
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
- Operand memop, ComplexPattern mem_cpat> {
+ Operand memop, ComplexPattern mem_cpat,
+ X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
- PatFrag memop> {
+ PatFrag memop, X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
- PatFrag memop> {
- def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ PatFrag memop, X86FoldableSchedWrite sched> {
+ def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[WriteFAdd]>;
- def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>;
+ def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedSingle in {
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
- ssmem, sse_load_f32>;
- defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
- defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
+ ssmem, sse_load_f32, SchedWriteFRnd.Scl>;
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32,
+ SchedWriteFRnd.XMM>;
+ defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32,
+ SchedWriteFRnd.YMM>;
}
let ExeDomain = SSEPackedDouble in {
defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
- sdmem, sse_load_f64>;
- defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
- defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
+ sdmem, sse_load_f64, SchedWriteFRnd.Scl>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64,
+ SchedWriteFRnd.XMM>;
+ defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64,
+ SchedWriteFRnd.YMM>;
}
multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128> {
+ ValueType vt128, X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
- XOP, Sched<[WriteVarVecShift]>;
+ XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
- XOP_4V, VEX_W, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd]>;
def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
(vt128 VR128:$src2))))]>,
- XOP, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+ XOP, Sched<[sched.Folded, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>,
- XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData<NAME#rr>;
+ XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedInt in {
- defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8>;
- defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32>;
- defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64>;
- defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16>;
- defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
- defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
- defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
- defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
- defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
- defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
- defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
- defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
+ defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16, SchedWriteVarVecShift.XMM>;
+ defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16, SchedWriteVarVecShift.XMM>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16, SchedWriteVarVecShift.XMM>;
}
multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128> {
+ ValueType vt128, X86FoldableSchedWrite sched> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>,
- XOP, Sched<[WriteVecShift]>;
+ XOP, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>,
- XOP, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ XOP, Sched<[sched.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
- defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8>;
- defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32>;
- defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64>;
- defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16>;
+ defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16,
+ SchedWriteVecShiftImm.XMM>;
}
// Instruction where second source can be memory, but third must be register
-multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -165,29 +175,41 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V,
- Sched<[WriteVecIMul]>;
+ Sched<[sched]>;
def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- VR128:$src3))]>, XOP_4V, Sched<[WriteVecIMulLd, ReadAfterLd]>;
+ VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
- defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
- defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
- defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
- defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
- defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
- defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
- defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
- defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
- defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
- defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
- defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
- defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+ defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd",
+ int_x86_xop_vpmadcswd, SchedWriteVecIMul.XMM>;
+ defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd",
+ int_x86_xop_vpmadcsswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSWW : xop4opm2<0x95, "vpmacsww",
+ int_x86_xop_vpmacsww, SchedWriteVecIMul.XMM>;
+ defm VPMACSWD : xop4opm2<0x96, "vpmacswd",
+ int_x86_xop_vpmacswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSSWW : xop4opm2<0x85, "vpmacssww",
+ int_x86_xop_vpmacssww, SchedWriteVecIMul.XMM>;
+ defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd",
+ int_x86_xop_vpmacsswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql",
+ int_x86_xop_vpmacssdql, SchedWritePMULLD.XMM>;
+ defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh",
+ int_x86_xop_vpmacssdqh, SchedWritePMULLD.XMM>;
+ defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd",
+ int_x86_xop_vpmacssdd, SchedWritePMULLD.XMM>;
+ defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql",
+ int_x86_xop_vpmacsdql, SchedWritePMULLD.XMM>;
+ defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh",
+ int_x86_xop_vpmacsdqh, SchedWritePMULLD.XMM>;
+ defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd",
+ int_x86_xop_vpmacsdd, SchedWritePMULLD.XMM>;
}
// IFMA patterns - for cases where we can safely ignore the overflow bits from
@@ -199,11 +221,11 @@ let Predicates = [HasXOP] in {
def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
(v4i32 VR128:$src3))),
(VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(v2i64 (add (X86pmuldq (X86PShufd (v4i32 VR128:$src1), (i8 -11)),
- (X86PShufd (v4i32 VR128:$src2), (i8 -11))),
+ def : Pat<(v2i64 (add (X86pmuldq (bc_v2i64 (X86PShufd (v4i32 VR128:$src1), (i8 -11))),
+ (bc_v2i64 (X86PShufd (v4i32 VR128:$src2), (i8 -11)))),
(v2i64 VR128:$src3))),
(VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(v2i64 (add (X86pmuldq (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ def : Pat<(v2i64 (add (X86pmuldq (v2i64 VR128:$src1), (v2i64 VR128:$src2)),
(v2i64 VR128:$src3))),
(VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>;
def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
@@ -211,54 +233,69 @@ let Predicates = [HasXOP] in {
(VPMADCSWDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
}
+// Transform to swizzle the comparison immediate so that a memory operand in
+// the first source position can still be matched.
+def CommuteVPCOMCC : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x7;
+ Imm = X86::getSwappedVPCOMImm(Imm);
+ return getI8Imm(Imm, SDLoc(N));
+}]>;
+
// Instruction where second source can be memory, third must be imm8
-multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
- let isCommutable = 1 in
- def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- imm:$cc)))]>,
- XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
- def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1),
- (vt128 (bitconvert (loadv2i64 addr:$src2))),
- imm:$cc)))]>,
- XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
- let mayLoad = 1 in
- def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
+multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = SSEPackedInt in { // SSE integer instructions
+ let isCommutable = 1 in
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ imm:$cc)))]>,
+ XOP_4V, Sched<[sched]>;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
+ !strconcat("vpcom${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))),
+ imm:$cc)))]>,
+ XOP_4V, Sched<[sched.Folded, ReadAfterLd]>;
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, Sched<[sched]>, NotMemoryFoldable;
+ let mayLoad = 1 in
+ def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat("vpcom", Suffix,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, Sched<[sched.Folded, ReadAfterLd]>,
+ NotMemoryFoldable;
+ }
}
-}
-let ExeDomain = SSEPackedInt in { // SSE integer instructions
- defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>;
- defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>;
- defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>;
- defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>;
- defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>;
- defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>;
- defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>;
- defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>;
+ def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)),
+ (vt128 VR128:$src1), imm:$cc),
+ (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
+ (CommuteVPCOMCC imm:$cc))>;
}
+defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>;
+defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16, SchedWriteVecALU.XMM>;
+defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32, SchedWriteVecALU.XMM>;
+defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64, SchedWriteVecALU.XMM>;
+defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8, SchedWriteVecALU.XMM>;
+defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16, SchedWriteVecALU.XMM>;
+defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32, SchedWriteVecALU.XMM>;
+defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64, SchedWriteVecALU.XMM>;
+
multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128> {
+ ValueType vt128, X86FoldableSchedWrite sched> {
def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -266,7 +303,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[WriteShuffle]>;
+ XOP_4V, Sched<[sched]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
@@ -274,7 +311,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
- XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -282,69 +319,83 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, Sched<[sched.Folded, ReadAfterLd,
+ // 128mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR128:$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>;
+ []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
- defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8>;
+ defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8,
+ SchedWriteVarShuffle.XMM>;
}
// Instruction where either second or third source can be memory
multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, ValueType VT> {
+ X86MemOperand x86memop, ValueType VT,
+ X86FoldableSchedWrite sched> {
def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
- Sched<[WriteShuffle]>;
+ Sched<[sched]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
(X86andnp (load addr:$src3), RC:$src2))))]>,
- XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, (load addr:$src2)))))]>,
- XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ XOP_4V, Sched<[sched.Folded, ReadAfterLd,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC::$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>;
+ []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
- defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>;
- defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L;
+ defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64,
+ SchedWriteShuffle.XMM>;
+ defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64,
+ SchedWriteShuffle.YMM>, VEX_L;
}
multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
X86MemOperand intmemop, X86MemOperand fpmemop,
- ValueType VT, PatFrag FPLdFrag,
- PatFrag IntLdFrag> {
+ ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag,
+ X86FoldableSchedWrite sched> {
def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>,
- Sched<[WriteFShuffle]>;
+ Sched<[sched]>;
def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
@@ -353,7 +404,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
(VT (X86vpermil2 RC:$src1, RC:$src2,
(bitconvert (IntLdFrag addr:$src3)),
(i8 imm:$src4))))]>, VEX_W,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
@@ -361,27 +412,35 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
RC:$src3, (i8 imm:$src4))))]>,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ Sched<[sched.Folded, ReadAfterLd,
+ // fpmemop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // RC:$src3
+ ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W, Sched<[WriteFShuffle]>, FoldGenData<NAME#rr>;
+ []>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem,
- v2f64, loadv2f64, loadv2i64>;
+ v2f64, loadv2f64, loadv2i64,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem,
- v4f64, loadv4f64, loadv4i64>, VEX_L;
+ v4f64, loadv4f64, loadv4i64,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedSingle in {
defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
- v4f32, loadv4f32, loadv2i64>;
+ v4f32, loadv4f32, loadv2i64,
+ SchedWriteFVarShuffle.XMM>;
defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
- v8f32, loadv8f32, loadv4i64>, VEX_L;
+ v8f32, loadv8f32, loadv4i64,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
}
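The commuted-load pattern added for the VPCOM instructions above relies on X86::getSwappedVPCOMImm to rewrite the predicate immediate when the two sources are exchanged. A minimal standalone sketch of that mapping, assuming the usual XOP predicate encoding (0=LT, 1=LE, 2=GT, 3=GE, 4=EQ, 5=NEQ, 6=FALSE, 7=TRUE); the in-tree helper is authoritative and may differ in detail:

#include <cstdint>

// Illustration only: swap a VPCOM comparison predicate for commuted operands.
// Ordered predicates flip direction; EQ/NEQ/FALSE/TRUE are symmetric.
uint8_t swapVPCOMImm(uint8_t Imm) {
  switch (Imm & 0x7) {
  case 0: return 2;          // LT -> GT
  case 1: return 3;          // LE -> GE
  case 2: return 0;          // GT -> LT
  case 3: return 1;          // GE -> LE
  default: return Imm & 0x7; // EQ, NEQ, FALSE, TRUE unchanged
  }
}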
diff --git a/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp b/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp
index 44bbc3f1b3fa..36d36cb11d72 100644
--- a/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -81,8 +81,8 @@ private:
MachineFunction &MF) const;
bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
- bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
+ bool selectTruncOrPtrToInt(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectAnyext(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -104,9 +104,18 @@ private:
MachineFunction &MF) const;
bool selectCondBranch(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectTurnIntoCOPY(MachineInstr &I, MachineRegisterInfo &MRI,
+ const unsigned DstReg,
+ const TargetRegisterClass *DstRC,
+ const unsigned SrcReg,
+ const TargetRegisterClass *SrcRC) const;
bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectShift(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectSDiv(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
// emit insert subreg instruction and insert it before MachineInstr &I
bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
@@ -287,8 +296,8 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg);
if (!OldRC || !DstRC->hasSubClassEq(OldRC)) {
if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
}
@@ -324,7 +333,7 @@ bool X86InstructionSelector::select(MachineInstr &I,
if (selectImpl(I, CoverageInfo))
return true;
- DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
+ LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
// TODO: This should be implemented by tblgen.
switch (I.getOpcode()) {
@@ -342,8 +351,11 @@ bool X86InstructionSelector::select(MachineInstr &I,
return selectConstant(I, MRI, MF);
case TargetOpcode::G_FCONSTANT:
return materializeFP(I, MRI, MF);
+ case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_TRUNC:
- return selectTrunc(I, MRI, MF);
+ return selectTruncOrPtrToInt(I, MRI, MF);
+ case TargetOpcode::G_INTTOPTR:
+ return selectCopy(I, MRI);
case TargetOpcode::G_ZEXT:
return selectZext(I, MRI, MF);
case TargetOpcode::G_ANYEXT:
@@ -365,6 +377,12 @@ bool X86InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_IMPLICIT_DEF:
case TargetOpcode::G_PHI:
return selectImplicitDefOrPHI(I, MRI);
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ return selectShift(I, MRI, MF);
+ case TargetOpcode::G_SDIV:
+ return selectSDiv(I, MRI, MF);
}
return false;
@@ -485,7 +503,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
@@ -640,10 +658,37 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
-bool X86InstructionSelector::selectTrunc(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
- assert((I.getOpcode() == TargetOpcode::G_TRUNC) && "unexpected instruction");
+// Helper function for selectTruncOrPtrToInt and selectAnyext.
+// Returns true if DstRC lives in a floating-point register class and
+// SrcRC lives in a 128-bit vector register class.
+static bool canTurnIntoCOPY(const TargetRegisterClass *DstRC,
+ const TargetRegisterClass *SrcRC) {
+ return (DstRC == &X86::FR32RegClass || DstRC == &X86::FR32XRegClass ||
+ DstRC == &X86::FR64RegClass || DstRC == &X86::FR64XRegClass) &&
+ (SrcRC == &X86::VR128RegClass || SrcRC == &X86::VR128XRegClass);
+}
+
+bool X86InstructionSelector::selectTurnIntoCOPY(
+ MachineInstr &I, MachineRegisterInfo &MRI, const unsigned DstReg,
+ const TargetRegisterClass *DstRC, const unsigned SrcReg,
+ const TargetRegisterClass *SrcRC) const {
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
+
+bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_TRUNC ||
+ I.getOpcode() == TargetOpcode::G_PTRTOINT) &&
+ "unexpected instruction");
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
@@ -655,19 +700,24 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
- DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+ LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode())
+ << " input/output on different banks\n");
return false;
}
- if (DstRB.getID() != X86::GPRRegBankID)
- return false;
-
const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
- if (!DstRC)
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+
+ if (!DstRC || !SrcRC)
return false;
- const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
- if (!SrcRC)
+  // If this is a truncation of a value that lives in a vector register class
+  // and goes into a floating-point class, just replace it with a copy, as we
+  // are able to select it as a regular move.
+ if (canTurnIntoCOPY(DstRC, SrcRC))
+ return selectTurnIntoCOPY(I, MRI, DstReg, DstRC, SrcReg, SrcRC);
+
+ if (DstRB.getID() != X86::GPRRegBankID)
return false;
unsigned SubIdx;
@@ -688,7 +738,8 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << "\n");
return false;
}
@@ -709,6 +760,70 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(32)) &&
+ "8=>32 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(32)) &&
+ "16=>32 Zext is handled by tablegen");
+
+ const static struct ZextEntry {
+ LLT SrcTy;
+ LLT DstTy;
+ unsigned MovOp;
+ bool NeedSubregToReg;
+ } OpTable[] = {
+ {LLT::scalar(8), LLT::scalar(16), X86::MOVZX16rr8, false}, // i8 => i16
+ {LLT::scalar(8), LLT::scalar(64), X86::MOVZX32rr8, true}, // i8 => i64
+ {LLT::scalar(16), LLT::scalar(64), X86::MOVZX32rr16, true}, // i16 => i64
+ {LLT::scalar(32), LLT::scalar(64), 0, true} // i32 => i64
+ };
+
+ auto ZextEntryIt =
+ std::find_if(std::begin(OpTable), std::end(OpTable),
+ [SrcTy, DstTy](const ZextEntry &El) {
+ return El.DstTy == DstTy && El.SrcTy == SrcTy;
+ });
+
+ // Here we try to select Zext into a MOVZ and/or SUBREG_TO_REG instruction.
+ if (ZextEntryIt != std::end(OpTable)) {
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+
+ unsigned TransitRegTo = DstReg;
+ unsigned TransitRegFrom = SrcReg;
+ if (ZextEntryIt->MovOp) {
+ // If we select Zext into MOVZ + SUBREG_TO_REG, we need to have
+ // a transit register in between: create it here.
+ if (ZextEntryIt->NeedSubregToReg) {
+ TransitRegFrom = MRI.createVirtualRegister(
+ getRegClass(LLT::scalar(32), DstReg, MRI));
+ TransitRegTo = TransitRegFrom;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ZextEntryIt->MovOp))
+ .addDef(TransitRegTo)
+ .addReg(SrcReg);
+ }
+ if (ZextEntryIt->NeedSubregToReg) {
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(DstReg)
+ .addImm(0)
+ .addReg(TransitRegFrom)
+ .addImm(X86::sub_32bit);
+ }
+ I.eraseFromParent();
+ return true;
+ }
+
if (SrcTy != LLT::scalar(1))
return false;
@@ -765,16 +880,22 @@ bool X86InstructionSelector::selectAnyext(MachineInstr &I,
assert(DstTy.getSizeInBits() > SrcTy.getSizeInBits() &&
"G_ANYEXT incorrect operand size");
- if (DstRB.getID() != X86::GPRRegBankID)
- return false;
-
const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+  // If this is an ANY_EXT of a value that lives in a floating-point register
+  // class and goes into a vector class, just replace it with a copy, as we are
+  // able to select it as a regular move.
+ if (canTurnIntoCOPY(SrcRC, DstRC))
+ return selectTurnIntoCOPY(I, MRI, SrcReg, SrcRC, DstReg, DstRC);
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
@@ -990,7 +1111,7 @@ bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
return false;
}
@@ -1027,7 +1148,7 @@ bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
return false;
}
@@ -1271,8 +1392,8 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
const TargetRegisterClass *RC = getRegClass(DstTy, DstReg, MRI);
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
return false;
}
}
@@ -1285,6 +1406,165 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
return true;
}
+// Currently GlobalIsel TableGen generates patterns for shift-by-immediate and
+// shift-by-1, but with an i8 shift count. In G_LSHR/G_ASHR/G_SHL, as in LLVM
+// IR, both arguments have the same type, so for now only i8 shifts can use the
+// auto-generated TableGen patterns.
+bool X86InstructionSelector::selectShift(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ assert((I.getOpcode() == TargetOpcode::G_SHL ||
+ I.getOpcode() == TargetOpcode::G_ASHR ||
+ I.getOpcode() == TargetOpcode::G_LSHR) &&
+ "unexpected instruction");
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ const static struct ShiftEntry {
+ unsigned SizeInBits;
+ unsigned CReg;
+ unsigned OpLSHR;
+ unsigned OpASHR;
+ unsigned OpSHL;
+ } OpTable[] = {
+ {8, X86::CL, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8
+ {16, X86::CX, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16
+ {32, X86::ECX, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32
+ {64, X86::RCX, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64
+ };
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ auto ShiftEntryIt = std::find_if(
+ std::begin(OpTable), std::end(OpTable), [DstTy](const ShiftEntry &El) {
+ return El.SizeInBits == DstTy.getSizeInBits();
+ });
+ if (ShiftEntryIt == std::end(OpTable))
+ return false;
+
+ unsigned CReg = ShiftEntryIt->CReg;
+ unsigned Opcode = 0;
+ switch (I.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ Opcode = ShiftEntryIt->OpSHL;
+ break;
+ case TargetOpcode::G_ASHR:
+ Opcode = ShiftEntryIt->OpASHR;
+ break;
+ case TargetOpcode::G_LSHR:
+ Opcode = ShiftEntryIt->OpLSHR;
+ break;
+ default:
+ return false;
+ }
+
+ unsigned Op0Reg = I.getOperand(1).getReg();
+ unsigned Op1Reg = I.getOperand(2).getReg();
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+ ShiftEntryIt->CReg)
+ .addReg(Op1Reg);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::KILL),
+ X86::CL)
+ .addReg(CReg, RegState::Kill);
+
+ MachineInstr &ShiftInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
+ .addReg(Op0Reg);
+
+ constrainSelectedInstRegOperands(ShiftInst, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectSDiv(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ assert(I.getOpcode() == TargetOpcode::G_SDIV && "unexpected instruction");
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned DividentReg = I.getOperand(1).getReg();
+ const unsigned DiviserReg = I.getOperand(2).getReg();
+
+ const LLT RegTy = MRI.getType(DstReg);
+ assert(RegTy == MRI.getType(DividentReg) &&
+ RegTy == MRI.getType(DiviserReg) &&
+ "Arguments and return value types must match");
+
+ const RegisterBank &RegRB = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ // For the X86 IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended into highreg. The
+ // exception is i8, where the dividend is defined as a single register rather
+ // than a register pair, and we therefore directly sign-extend the dividend
+ // into lowreg, instead of copying, and ignore the highreg.
+ const static struct SDivEntry {
+ unsigned SizeInBits;
+ unsigned QuotientReg;
+ unsigned DividentRegUpper;
+ unsigned DividentRegLower;
+ unsigned OpSignExtend;
+ unsigned OpCopy;
+ unsigned OpDiv;
+ } OpTable[] = {
+ {8, X86::AL, X86::NoRegister, X86::AX, 0, X86::MOVSX16rr8,
+ X86::IDIV8r}, // i8
+ {16, X86::AX, X86::DX, X86::AX, X86::CWD, TargetOpcode::COPY,
+ X86::IDIV16r}, // i16
+ {32, X86::EAX, X86::EDX, X86::EAX, X86::CDQ, TargetOpcode::COPY,
+ X86::IDIV32r}, // i32
+ {64, X86::RAX, X86::RDX, X86::RAX, X86::CQO, TargetOpcode::COPY,
+ X86::IDIV64r} // i64
+ };
+
+ if (RegRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ auto SDivEntryIt = std::find_if(
+ std::begin(OpTable), std::end(OpTable), [RegTy](const SDivEntry &El) {
+ return El.SizeInBits == RegTy.getSizeInBits();
+ });
+
+ if (SDivEntryIt == std::end(OpTable))
+ return false;
+
+ const TargetRegisterClass *RegRC = getRegClass(RegTy, RegRB);
+ if (!RBI.constrainGenericRegister(DividentReg, *RegRC, MRI) ||
+ !RBI.constrainGenericRegister(DiviserReg, *RegRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *RegRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SDivEntryIt->OpCopy),
+ SDivEntryIt->DividentRegLower)
+ .addReg(DividentReg);
+ if (SDivEntryIt->DividentRegUpper != X86::NoRegister)
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(SDivEntryIt->OpSignExtend));
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SDivEntryIt->OpDiv))
+ .addReg(DiviserReg);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+ DstReg)
+ .addReg(SDivEntryIt->QuotientReg);
+
+ I.eraseFromParent();
+ return true;
+}
+
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
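selectSDiv above sets up the classic x86 IDIV idiom: copy the dividend into the low register, sign-extend it into the high register (CWD/CDQ/CQO), issue the IDIVr, and read the quotient back out of the low register. A minimal sketch, in plain C++ rather than MachineInstrs, of what the CDQ + IDIV32r pair computes for the 32-bit case (my own illustration, not code from the patch):

#include <cassert>
#include <cstdint>

int32_t idiv32_quotient(int32_t Dividend, int32_t Divisor) {
  assert(Divisor != 0 && "IDIV raises #DE on division by zero");
  assert(!(Dividend == INT32_MIN && Divisor == -1) &&
         "IDIV also faults when the quotient overflows");
  // CDQ: sign-extend EAX into EDX, forming the 64-bit dividend EDX:EAX.
  int64_t Wide = static_cast<int64_t>(Dividend);
  // IDIV32r: the quotient is left in EAX (the remainder, in EDX, is unused here).
  return static_cast<int32_t>(Wide / Divisor);
}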
diff --git a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index cdb24b9d40a6..6c7fb9c339ac 100644
--- a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -30,6 +29,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cmath>
@@ -39,7 +39,7 @@ using namespace llvm;
namespace {
-/// \brief This class holds necessary information to represent an interleaved
+/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
@@ -48,32 +48,32 @@ namespace {
/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
- /// \brief Reference to the wide-load instruction of an interleaved access
+ /// Reference to the wide-load instruction of an interleaved access
/// group.
Instruction *const Inst;
- /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
+ /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
ArrayRef<ShuffleVectorInst *> Shuffles;
- /// \brief Reference to the starting index of each user-shuffle.
+ /// Reference to the starting index of each user-shuffle.
ArrayRef<unsigned> Indices;
- /// \brief Reference to the interleaving stride in terms of elements.
+ /// Reference to the interleaving stride in terms of elements.
const unsigned Factor;
- /// \brief Reference to the underlying target.
+ /// Reference to the underlying target.
const X86Subtarget &Subtarget;
const DataLayout &DL;
IRBuilder<> &Builder;
- /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
+ /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);
- /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
+ /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
/// returns the transposed-vectors in \p TransposedVectors.
/// E.g.
/// InputVectors:
@@ -115,11 +115,11 @@ public:
: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
DL(Inst->getModule()->getDataLayout()), Builder(B) {}
- /// \brief Returns true if this interleaved access group can be lowered into
+ /// Returns true if this interleaved access group can be lowered into
/// x86-specific instructions/intrinsics, false otherwise.
bool isSupported() const;
- /// \brief Lowers this interleaved access group into X86-specific
+ /// Lowers this interleaved access group into X86-specific
/// instructions/intrinsics.
bool lowerIntoOptimizedSequence();
};
diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index fae0889950b2..2dd60a1b8b5a 100644
--- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -20,24 +20,21 @@
namespace llvm {
enum IntrinsicType : uint16_t {
- INTR_NO_TYPE,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASS, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
+ INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP_IMM8,
CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
CVTPD2PS, CVTPD2PS_MASK,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
- INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
- INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
- FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
- FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
- IFMA_OP_MASK, IFMA_OP_MASKZ,
- VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+ INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
+ INTR_TYPE_3OP_MASK,
+ FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_SCALAR,
+ IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
- COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
+ COMPRESS_EXPAND_IN_REG,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
- EXPAND_FROM_MEM,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
+ FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+ FIXUPIMMS_MASKZ, GATHER_AVX2,
ROUNDP, ROUNDS
};
@@ -54,6 +51,9 @@ struct IntrinsicData {
bool operator==(const IntrinsicData &RHS) const {
return RHS.Id == Id;
}
+ friend bool operator<(const IntrinsicData &LHS, unsigned Id) {
+ return LHS.Id < Id;
+ }
};
#define X86_INTRINSIC_DATA(id, type, op0, op1) \
@@ -120,78 +120,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_b_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_b_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_b_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_w_128,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_w_256,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_store_w_512,
- COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_b_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_b_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_b_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_w_128,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_w_256,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_load_w_512,
- EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
@@ -352,13 +280,11 @@ static const IntrinsicData IntrinsicsWithChain[] = {
/*
* Find Intrinsic data by intrinsic ID
*/
-static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
-
- IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 };
+static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain),
std::end(IntrinsicsWithChain),
- IntrinsicToFind);
- if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind)
+ IntNo);
+ if (Data != std::end(IntrinsicsWithChain) && Data->Id == IntNo)
return Data;
return nullptr;
}
@@ -374,9 +300,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
- X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
- X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
@@ -391,8 +317,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@@ -405,6 +329,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -412,11 +338,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
- X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -449,15 +373,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
@@ -472,33 +395,23 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_128, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_256, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
- X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
- X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
- X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
- X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
- X86ISD::FADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
- X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FADDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FADDS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
- X86ISD::CMPM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
- X86ISD::CMPM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
@@ -552,22 +465,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CONFLICT, 0),
X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
- X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK,
X86ISD::VFPROUND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, CVTPD2PS_MASK,
- ISD::FP_ROUND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_MASK,
ISD::FP_ROUND, X86ISD::VFPROUND_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
@@ -594,10 +499,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
- X86ISD::VFPEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
- ISD::FP_EXTEND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
@@ -618,10 +519,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
@@ -636,62 +533,48 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VFPEXTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
@@ -700,16 +583,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::DBPSADBW, 0),
- X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::DBPSADBW, 0),
- X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::DBPSADBW, 0),
- X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
- X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
- X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FDIVS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -758,12 +631,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
@@ -798,26 +665,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK,
X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
- X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
- X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMAXS, X86ISD::FMAXS_RND),
X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMAXS, X86ISD::FMAXS_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
- X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
- X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMINS, X86ISD::FMINS_RND),
X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FMINS, X86ISD::FMINS_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
- X86ISD::FMUL_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
- X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FMULS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -834,58 +689,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_di_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_di_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_hi_128, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_hi_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_hi_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_qi_128, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_qi_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_qi_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_sf_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_sf_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_si_256, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_permvar_si_512, VPERM_2OP_MASK,
- X86ISD::VPERMV, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDUBSW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDUBSW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDUBSW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDWD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDWD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
- X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK,
@@ -895,21 +710,21 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNC, 0),
+ ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK,
@@ -982,45 +797,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
@@ -1033,18 +815,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
- X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
@@ -1087,22 +857,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SCALEFS, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
- X86ISD::FSQRT_RND),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
- X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSQRTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSQRTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
- X86ISD::FSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
- X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -1119,151 +877,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, ISD::FMA,
- X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
- X86ISD::FMADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
- X86ISD::FNMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
- X86ISD::FNMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_128, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_256, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_512, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_128, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_256, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_512, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_128, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_256, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_512, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_128, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_256, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_512, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
-
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK,
- X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
-
- X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshld_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
@@ -1273,15 +887,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
@@ -1299,56 +904,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK,
X86ISD::VPSHUFBITQMB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, ISD::FMA,
- X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
- X86ISD::FMADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
- X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
- X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
- X86ISD::FMSUBADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
- X86ISD::FMSUBADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1365,99 +920,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
- X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, ISD::FMA,
- X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, ISD::FMA, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
- X86ISD::FMADD_RND),
-
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_128, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_256, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_512, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_128, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_256, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_512, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_128, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_256, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_512, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_128, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_256, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_512, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
-
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_128, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_256, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
@@ -1478,12 +940,57 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
- X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_di_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_128, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_128, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_sf_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_si_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_pmaddubs_w_512, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_pmaddw_d_512, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_pmul_hr_sw_512, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_pmulh_w_512, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_pmulhu_w_512, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_prol_d_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_d_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_d_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_q_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_q_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prol_q_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_d_128, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_d_256, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_d_512, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_q_128, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_q_256, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_prolv_q_512, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(avx512_pror_d_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_d_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_d_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_q_128, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_q_256, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_pror_q_512, INTR_TYPE_2OP_IMM8, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_d_128, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_d_256, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_d_512, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_q_128, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_q_256, INTR_TYPE_2OP, ISD::ROTR, 0),
+ X86_INTRINSIC_DATA(avx512_prorv_q_512, INTR_TYPE_2OP, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -1525,6 +1032,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
@@ -1549,6 +1062,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+ X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
@@ -1559,42 +1076,74 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vfmadd_f32, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_f64, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_pd_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_ps_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmaddsub_pd_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_vfmaddsub_ps_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_128, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_256, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_512, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_128, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_256, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_512, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_128, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_256, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_512, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_128, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_256, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_512, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
- X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
- X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
- X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
- X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_128 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_256 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_512 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshld_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
@@ -1609,7 +1158,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1623,11 +1171,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
- X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(sse2_cvtpd2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(sse2_cvtps2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
@@ -1644,7 +1192,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -1666,7 +1213,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1682,7 +1228,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
- X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
@@ -1696,6 +1241,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
@@ -1755,12 +1302,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
* Retrieve data for Intrinsic without chain.
* Return nullptr if intrinsic is not defined in the table.
*/
-static const IntrinsicData* getIntrinsicWithoutChain(uint16_t IntNo) {
- IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 };
+static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
std::end(IntrinsicsWithoutChain),
- IntrinsicToFind);
- if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind)
+ IntNo);
+ if (Data != std::end(IntrinsicsWithoutChain) && Data->Id == IntNo)
return Data;
return nullptr;
}
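
For context, the lookup change above drops the temporary IntrinsicData key and searches the sorted table directly by the intrinsic number. A minimal standalone sketch of that pattern, assuming a heterogeneous operator< between a table entry and the raw key (the names and table contents here are illustrative, not the LLVM definitions):

    #include <algorithm>
    #include <cstdint>
    #include <iterator>

    struct IntrinsicData {
      uint16_t Id; // intrinsic number; the table is sorted by this field
      // type, opcode and flag fields elided for the sketch
    };

    // Heterogeneous comparison so std::lower_bound can search on the key alone.
    static bool operator<(const IntrinsicData &LHS, unsigned Id) {
      return LHS.Id < Id;
    }

    static const IntrinsicData Table[] = {{1}, {4}, {9}};

    static const IntrinsicData *lookup(unsigned IntNo) {
      const IntrinsicData *Data =
          std::lower_bound(std::begin(Table), std::end(Table), IntNo);
      if (Data != std::end(Table) && Data->Id == IntNo)
        return Data;
      return nullptr;
    }

    int main() { return lookup(4) ? 0 : 1; }
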
diff --git a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp
index 4108a58fa7a5..d372cada8de8 100644
--- a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -21,6 +21,7 @@
using namespace llvm;
using namespace TargetOpcode;
+using namespace LegalizeActions;
/// FIXME: The following static functions are SizeChangeStrategy functions
/// that are meant to temporarily mimic the behaviour of the old legalization
@@ -38,7 +39,7 @@ addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
result.push_back(v[i]);
if (i + 1 < v[i].first && i + 1 < v.size() &&
v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ result.push_back({v[i].first + 1, Unsupported});
}
}
@@ -46,11 +47,11 @@ static LegalizerInfo::SizeAndActionsVec
widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
assert(v.size() >= 1);
assert(v[0].first > 1);
- LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
- {2, LegalizerInfo::Unsupported}};
+ LegalizerInfo::SizeAndActionsVec result = {{1, WidenScalar},
+ {2, Unsupported}};
addAndInterleaveWithUnsupported(result, v);
auto Largest = result.back().first;
- result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ result.push_back({Largest + 1, Unsupported});
return result;
}
@@ -81,16 +82,18 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
computeTables();
+ verify(*STI.getInstrInfo());
}
void X86LegalizerInfo::setLegalizerInfo32bit() {
- const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8);
+ const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
const LLT s1 = LLT::scalar(1);
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
+ const LLT s128 = LLT::scalar(128);
for (auto Ty : {p0, s1, s8, s16, s32})
setAction({G_IMPLICIT_DEF, Ty}, Legal);
@@ -122,6 +125,19 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_GEP, p0}, Legal);
setAction({G_GEP, 1, s32}, Legal);
+ if (!Subtarget.is64Bit()) {
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32}, {p0})
+ .maxScalar(0, s32)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
+
+ // Shifts and SDIV
+ getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR, G_SDIV})
+ .legalFor({s8, s16, s32})
+ .clampScalar(0, s8, s32);
+ }
+
// Control-flow
setAction({G_BRCOND, s1}, Legal);
@@ -135,6 +151,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_SEXT, Ty}, Legal);
setAction({G_ANYEXT, Ty}, Legal);
}
+ setAction({G_ANYEXT, s128}, Legal);
// Comparison
setAction({G_ICMP, s1}, Legal);
@@ -158,10 +175,18 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
if (!Subtarget.is64Bit())
return;
+ const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
setAction({G_IMPLICIT_DEF, s64}, Legal);
+ // We need this, as tryFoldImplicitDef will create this pattern:
+ // s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF
+ setAction({G_IMPLICIT_DEF, s128}, Legal);
setAction({G_PHI, s64}, Legal);
@@ -173,6 +198,11 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
// Pointer-handling
setAction({G_GEP, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
+ .maxScalar(0, s64)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s64}});
// Constants
setAction({TargetOpcode::G_CONSTANT, s64}, Legal);
@@ -182,9 +212,21 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
setAction({extOp, s64}, Legal);
}
+ getActionDefinitionsBuilder(G_SITOFP)
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
+
// Comparison
setAction({G_ICMP, 1, s64}, Legal);
+ // Shifts and SDIV
+ getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR, G_SDIV})
+ .legalFor({s8, s16, s32, s64})
+ .clampScalar(0, s8, s64);
+
// Merge/Unmerge
setAction({G_MERGE_VALUES, s128}, Legal);
setAction({G_UNMERGE_VALUES, 1, s128}, Legal);
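
The new G_PTRTOINT, shift, and G_SITOFP rules above rely on clampScalar and widenScalarToNextPow2 to funnel odd scalar widths onto legal ones. A rough, self-contained model of that width mapping (an illustration of the rule semantics, not the GlobalISel implementation):

    #include <algorithm>
    #include <cassert>

    // Clamp a bit width into [Min, Max], then round up to a power of two,
    // roughly mirroring clampScalar(...) followed by widenScalarToNextPow2(...).
    static unsigned legalizeScalarWidth(unsigned Bits, unsigned Min, unsigned Max) {
      unsigned Clamped = std::min(std::max(Bits, Min), Max);
      unsigned Pow2 = 1;
      while (Pow2 < Clamped)
        Pow2 <<= 1;
      return std::min(Pow2, Max);
    }

    int main() {
      assert(legalizeScalarWidth(1, 8, 32) == 8);   // s1  -> s8
      assert(legalizeScalarWidth(24, 8, 32) == 32); // s24 -> s32
      assert(legalizeScalarWidth(64, 8, 32) == 32); // s64 -> clamped to s32
      return 0;
    }
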
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
index 730ba745eb70..d38c7b497965 100644
--- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
@@ -44,6 +43,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
@@ -56,6 +56,7 @@ class X86MCInstLower {
const TargetMachine &TM;
const MCAsmInfo &MAI;
X86AsmPrinter &AsmPrinter;
+
public:
X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
@@ -115,13 +116,12 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
}
-
/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
/// operand to an MCSymbol.
-MCSymbol *X86MCInstLower::
-GetSymbolFromOperand(const MachineOperand &MO) const {
+MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
const DataLayout &DL = MF.getDataLayout();
- assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) &&
+ "Isn't a symbol reference");
MCSymbol *Sym = nullptr;
SmallString<128> Name;
@@ -158,17 +158,17 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
// If the target flags on the operand changes the name of the symbol, do that
// before we return the symbol.
switch (MO.getTargetFlags()) {
- default: break;
+ default:
+ break;
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
MachineModuleInfoImpl::StubValueTy &StubSym =
- getMachOMMI().getGVStubEntry(Sym);
+ getMachOMMI().getGVStubEntry(Sym);
if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
- StubSym =
- MachineModuleInfoImpl::
- StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
- !MO.getGlobal()->hasInternalLinkage());
+ StubSym = MachineModuleInfoImpl::StubValueTy(
+ AsmPrinter.getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
}
break;
}
@@ -185,44 +185,74 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
switch (MO.getTargetFlags()) {
- default: llvm_unreachable("Unknown target flag on GV operand");
- case X86II::MO_NO_FLAG: // No flag.
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
// These affect the name of the symbol, not any suffix.
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DLLIMPORT:
break;
- case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
+ case X86II::MO_TLVP:
+ RefKind = MCSymbolRefExpr::VK_TLVP;
+ break;
case X86II::MO_TLVP_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
// Subtract the pic base.
- Expr = MCBinaryExpr::createSub(Expr,
- MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
- Ctx),
- Ctx);
- break;
- case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
- case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
- case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
- case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
- case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
- case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
- case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
- case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
- case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
- case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
- case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
- case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
- case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
- case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
- case X86II::MO_ABS8: RefKind = MCSymbolRefExpr::VK_X86_ABS8; break;
+ Expr = MCBinaryExpr::createSub(
+ Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
+ break;
+ case X86II::MO_SECREL:
+ RefKind = MCSymbolRefExpr::VK_SECREL;
+ break;
+ case X86II::MO_TLSGD:
+ RefKind = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86II::MO_TLSLD:
+ RefKind = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ case X86II::MO_TLSLDM:
+ RefKind = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86II::MO_GOTTPOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTTPOFF;
+ break;
+ case X86II::MO_INDNTPOFF:
+ RefKind = MCSymbolRefExpr::VK_INDNTPOFF;
+ break;
+ case X86II::MO_TPOFF:
+ RefKind = MCSymbolRefExpr::VK_TPOFF;
+ break;
+ case X86II::MO_DTPOFF:
+ RefKind = MCSymbolRefExpr::VK_DTPOFF;
+ break;
+ case X86II::MO_NTPOFF:
+ RefKind = MCSymbolRefExpr::VK_NTPOFF;
+ break;
+ case X86II::MO_GOTNTPOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTNTPOFF;
+ break;
+ case X86II::MO_GOTPCREL:
+ RefKind = MCSymbolRefExpr::VK_GOTPCREL;
+ break;
+ case X86II::MO_GOT:
+ RefKind = MCSymbolRefExpr::VK_GOT;
+ break;
+ case X86II::MO_GOTOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTOFF;
+ break;
+ case X86II::MO_PLT:
+ RefKind = MCSymbolRefExpr::VK_PLT;
+ break;
+ case X86II::MO_ABS8:
+ RefKind = MCSymbolRefExpr::VK_X86_ABS8;
+ break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, Ctx);
// Subtract the pic base.
- Expr = MCBinaryExpr::createSub(Expr,
- MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
- Ctx);
+ Expr = MCBinaryExpr::createSub(
+ Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
if (MO.isJTI()) {
assert(MAI.doesSetDirectiveSuppressReloc());
// If .set directive is supported, use it to reduce the number of
@@ -240,14 +270,12 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
- Expr = MCBinaryExpr::createAdd(Expr,
- MCConstantExpr::create(MO.getOffset(), Ctx),
- Ctx);
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
return MCOperand::createExpr(Expr);
}
-
-/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
+/// Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
/// a short fixed-register form.
static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
unsigned ImmOp = Inst.getNumOperands() - 1;
@@ -255,7 +283,8 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
(Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
- Inst.getNumOperands() == 2) && "Unexpected instruction!");
+ Inst.getNumOperands() == 2) &&
+ "Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(0).getReg();
@@ -269,7 +298,7 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
Inst.addOperand(Saved);
}
-/// \brief If a movsx instruction has a shorter encoding for the used register
+/// If a movsx instruction has a shorter encoding for the used register
/// simplify the instruction to use it instead.
static void SimplifyMOVSX(MCInst &Inst) {
unsigned NewOpcode = 0;
@@ -277,7 +306,7 @@ static void SimplifyMOVSX(MCInst &Inst) {
switch (Inst.getOpcode()) {
default:
llvm_unreachable("Unexpected instruction!");
- case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
+ case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
if (Op0 == X86::AX && Op1 == X86::AL)
NewOpcode = X86::CBW;
break;
@@ -297,7 +326,7 @@ static void SimplifyMOVSX(MCInst &Inst) {
}
}
-/// \brief Simplify things like MOV32rm to MOV32o32a.
+/// Simplify things like MOV32rm to MOV32o32a.
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned Opcode) {
// Don't make these simplifications in 64-bit mode; other assemblers don't
@@ -309,14 +338,14 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned AddrBase = IsStore;
unsigned RegOp = IsStore ? 0 : 5;
unsigned AddrOp = AddrBase + 3;
- assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
- Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
- Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
- Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
- Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
- (Inst.getOperand(AddrOp).isExpr() ||
- Inst.getOperand(AddrOp).isImm()) &&
- "Unexpected instruction!");
+ assert(
+ Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+ (Inst.getOperand(AddrOp).isExpr() || Inst.getOperand(AddrOp).isImm()) &&
+ "Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(RegOp).getReg();
@@ -401,9 +430,9 @@ ReSimplify:
case X86::LEA16r:
case X86::LEA32r:
// LEA should have a segment register, but it must be empty.
- assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+ assert(OutMI.getNumOperands() == 1 + X86::AddrNumOperands &&
"Unexpected # of LEA operands");
- assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+ assert(OutMI.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
"LEA has segment specified!");
break;
@@ -452,8 +481,8 @@ ReSimplify:
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
- case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
}
OutMI.setOpcode(NewOpc);
}
@@ -499,24 +528,30 @@ ReSimplify:
break;
}
- // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump instruction.
- { unsigned Opcode;
- case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
- case X86::TAILJMPd:
- case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
- case X86::TAILJMPd_CC:
- case X86::TAILJMPd64_CC:
- Opcode = X86::GetCondBranchFromCond(
- static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
- goto SetTailJmpOpcode;
-
- SetTailJmpOpcode:
- MCOperand Saved = OutMI.getOperand(0);
- OutMI = MCInst();
- OutMI.setOpcode(Opcode);
- OutMI.addOperand(Saved);
- break;
- }
+ // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump
+ // instruction.
+ {
+ unsigned Opcode;
+ case X86::TAILJMPr:
+ Opcode = X86::JMP32r;
+ goto SetTailJmpOpcode;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ Opcode = X86::JMP_1;
+ goto SetTailJmpOpcode;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::GetCondBranchFromCond(
+ static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
+ goto SetTailJmpOpcode;
+
+ SetTailJmpOpcode:
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
case X86::DEC16r:
case X86::DEC32r:
@@ -539,63 +574,63 @@ ReSimplify:
// These are pseudo-ops for OR to help with the OR->ADD transformation. We do
// this with an ugly goto in case the resultant OR uses EAX and needs the
// short form.
- case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
- case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
- case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
- case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
- case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+ case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+ case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+ case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+ case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
- case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
- case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
- case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+ case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+ case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+ case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
// Atomic load and store require a separate pseudo-inst because Acquire
// implies mayStore and Release implies mayLoad; fix these to regular MOV
// instructions here
- case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
- case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
- case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
- case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
- case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
- case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
- case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
- case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
- case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
- case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
- case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
+ case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+ case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+ case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+ case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+ case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+ case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
+ case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
+ case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
- case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
- case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
- case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
- case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
+ case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+ case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
+ case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+ case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
- case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
- case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
- case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
- case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
- case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
+ case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
+ case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+ case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
+ case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+ case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
- case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
- case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
- case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
- case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
- case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
- case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
- case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
- case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
- case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
- case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
- case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
+ case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
+ case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+ case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
+ case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+ case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
+ case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+ case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
+ case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+ case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
+ case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+ case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
- case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
- case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
- case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
- case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
- case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
- case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
- case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
- case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
- case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
+ case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
+ case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
+ case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
+ case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
+ case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
+ case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
+ case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
+ case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
+ case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
// We don't currently select the correct instruction form for instructions
// which have a short %eax, etc. form. Handle this by custom lowering, for
@@ -616,13 +651,13 @@ ReSimplify:
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::MOV8mr_NOREX:
- case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
+ case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
case X86::MOV8rm_NOREX:
- case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
- case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
- case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
- case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
- case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
+ case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
+ case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
+ case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
+ case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
+ case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
}
SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
break;
@@ -705,18 +740,18 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
- case X86::TLS_addr32:
- case X86::TLS_addr64:
- SRVK = MCSymbolRefExpr::VK_TLSGD;
- break;
- case X86::TLS_base_addr32:
- SRVK = MCSymbolRefExpr::VK_TLSLDM;
- break;
- case X86::TLS_base_addr64:
- SRVK = MCSymbolRefExpr::VK_TLSLD;
- break;
- default:
- llvm_unreachable("unexpected opcode");
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
}
MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
@@ -759,16 +794,14 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
const MCSymbolRefExpr *tlsRef =
- MCSymbolRefExpr::create(tlsGetAddr,
- MCSymbolRefExpr::VK_PLT,
- context);
+ MCSymbolRefExpr::create(tlsGetAddr, MCSymbolRefExpr::VK_PLT, context);
- EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
- : X86::CALLpcrel32)
- .addExpr(tlsRef));
+ EmitAndCountInstruction(
+ MCInstBuilder(is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
+ .addExpr(tlsRef));
}
-/// \brief Emit the largest nop instruction smaller than or equal to \p NumBytes
+/// Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of nop emitted.
static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
@@ -782,22 +815,62 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
BaseReg = X86::RAX;
ScaleVal = 1;
switch (NumBytes) {
- case 0: llvm_unreachable("Zero nops?"); break;
- case 1: NopSize = 1; Opc = X86::NOOP; break;
- case 2: NopSize = 2; Opc = X86::XCHG16ar; break;
- case 3: NopSize = 3; Opc = X86::NOOPL; break;
- case 4: NopSize = 4; Opc = X86::NOOPL; Displacement = 8; break;
- case 5: NopSize = 5; Opc = X86::NOOPL; Displacement = 8;
- IndexReg = X86::RAX; break;
- case 6: NopSize = 6; Opc = X86::NOOPW; Displacement = 8;
- IndexReg = X86::RAX; break;
- case 7: NopSize = 7; Opc = X86::NOOPL; Displacement = 512; break;
- case 8: NopSize = 8; Opc = X86::NOOPL; Displacement = 512;
- IndexReg = X86::RAX; break;
- case 9: NopSize = 9; Opc = X86::NOOPW; Displacement = 512;
- IndexReg = X86::RAX; break;
- default: NopSize = 10; Opc = X86::NOOPW; Displacement = 512;
- IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+ case 0:
+ llvm_unreachable("Zero nops?");
+ break;
+ case 1:
+ NopSize = 1;
+ Opc = X86::NOOP;
+ break;
+ case 2:
+ NopSize = 2;
+ Opc = X86::XCHG16ar;
+ break;
+ case 3:
+ NopSize = 3;
+ Opc = X86::NOOPL;
+ break;
+ case 4:
+ NopSize = 4;
+ Opc = X86::NOOPL;
+ Displacement = 8;
+ break;
+ case 5:
+ NopSize = 5;
+ Opc = X86::NOOPL;
+ Displacement = 8;
+ IndexReg = X86::RAX;
+ break;
+ case 6:
+ NopSize = 6;
+ Opc = X86::NOOPW;
+ Displacement = 8;
+ IndexReg = X86::RAX;
+ break;
+ case 7:
+ NopSize = 7;
+ Opc = X86::NOOPL;
+ Displacement = 512;
+ break;
+ case 8:
+ NopSize = 8;
+ Opc = X86::NOOPL;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ break;
+ case 9:
+ NopSize = 9;
+ Opc = X86::NOOPW;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ break;
+ default:
+ NopSize = 10;
+ Opc = X86::NOOPW;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ SegmentReg = X86::CS;
+ break;
}
unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
@@ -806,14 +879,12 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
OS.EmitBytes("\x66");
switch (Opc) {
- default:
- llvm_unreachable("Unexpected opcode");
- break;
+ default: llvm_unreachable("Unexpected opcode");
case X86::NOOP:
OS.EmitInstruction(MCInstBuilder(Opc), STI);
break;
case X86::XCHG16ar:
- OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), STI);
break;
case X86::NOOPL:
case X86::NOOPW:
@@ -830,7 +901,7 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
return NopSize;
}
-/// \brief Emit the optimal amount of multi-byte nops on X86.
+/// Emit the optimal amount of multi-byte nops on X86.
static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
unsigned NopsToEmit = NumBytes;
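
EmitNop above picks the longest single nop it can encode (up to a 10-byte base encoding plus at most five 0x66 prefixes, i.e. 15 bytes, the x86 instruction-length limit), and EmitNops, whose start is shown here, drives it in a loop until the requested byte count is covered. A rough model of that driver under those assumptions (not the AsmPrinter code itself):

    #include <algorithm>
    #include <cstdio>

    // Stand-in for EmitNop: report how many bytes one emitted nop would cover.
    static unsigned emitOneNop(unsigned NumBytes) {
      return std::min(NumBytes, 15u); // 10-byte NOOPW + up to five 0x66 prefixes
    }

    int main() {
      unsigned NopsToEmit = 38;
      while (NopsToEmit) {
        unsigned Emitted = emitOneNop(NopsToEmit);
        std::printf("emitting a %u-byte nop\n", Emitted);
        NopsToEmit -= Emitted;
      }
      return 0;
    }
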
@@ -971,7 +1042,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
getSubtargetInfo());
assert(NopSize == MinSize && "Could not implement MinSize!");
- (void) NopSize;
+ (void)NopSize;
}
}
@@ -1016,9 +1087,8 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
break;
case MachineOperand::MO_ExternalSymbol:
case MachineOperand::MO_GlobalAddress:
- CalleeMCOp =
- MCIL.LowerSymbolOperand(CalleeMO,
- MCIL.GetSymbolFromOperand(CalleeMO));
+ CalleeMCOp = MCIL.LowerSymbolOperand(CalleeMO,
+ MCIL.GetSymbolFromOperand(CalleeMO));
break;
}
@@ -1084,8 +1154,10 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// The default C calling convention will place two arguments into %rcx and
// %rdx -- so we only work with those.
- unsigned UsedRegs[] = {X86::RDI, X86::RSI};
+ unsigned DestRegs[] = {X86::RDI, X86::RSI};
bool UsedMask[] = {false, false};
+ // Filled out in loop.
+ unsigned SrcRegs[] = {0, 0};
// Then we put the operands in the %rdi and %rsi registers. We spill the
// values in the register before we clobber them, and mark them as used in
@@ -1095,18 +1167,22 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
assert(Op->isReg() && "Only support arguments in registers");
- if (Op->getReg() != UsedRegs[I]) {
+ SrcRegs[I] = Op->getReg();
+ if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
EmitAndCountInstruction(
- MCInstBuilder(X86::PUSH64r).addReg(UsedRegs[I]));
- EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
- .addReg(UsedRegs[I])
- .addReg(Op->getReg()));
+ MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
}
}
+ // Now that the register values are stashed, mov arguments into place.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (SrcRegs[I] != DestRegs[I])
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
+
// We emit a hard dependency on the __xray_CustomEvent symbol, which is the
// name of the trampoline to be implemented by the XRay runtime.
auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
@@ -1121,7 +1197,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// Restore caller-saved and used registers.
for (unsigned I = sizeof UsedMask; I-- > 0;)
if (UsedMask[I])
- EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(UsedRegs[I]));
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
@@ -1133,6 +1209,102 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1);
}
+void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64");
+
+ // We want to emit the following pattern, which follows the x86 calling
+ // convention to prepare for the trampoline call to be patched in.
+ //
+ // .p2align 1, ...
+ // .Lxray_event_sled_N:
+ // jmp +N // jump across the instrumentation sled
+ // ... // set up arguments in register
+ // callq __xray_TypedEvent@plt // force dependency to symbol
+ // ...
+ // <jump here>
+ //
+ // After patching, it would look something like:
+ //
+ // nopw (2-byte nop)
+ // ...
+ // callq __xray_TypedEvent // already lowered
+ // ...
+ //
+ // ---
+ // First we emit the label and the jump.
+ auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true);
+ OutStreamer->AddComment("# XRay Typed Event Log");
+ OutStreamer->EmitCodeAlignment(2);
+ OutStreamer->EmitLabel(CurSled);
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+ // FIXME: Find another, less hacky way to force the relative jump.
+ OutStreamer->EmitBinaryData("\xeb\x14");
+
+ // An x86-64 convention may place three arguments into %rcx, %rdx, and %r8,
+ // so we'll work with those. Or we may be called via SystemV, in which case
+ // we don't have to do any translation.
+ unsigned DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
+ bool UsedMask[] = {false, false, false};
+
+ // Will fill out src regs in the loop.
+ unsigned SrcRegs[] = {0, 0, 0};
+
+ // Then we put the operands in the SystemV registers. We spill the values in
+ // the registers before we clobber them, and mark them as used in UsedMask.
+ // In case the arguments are already in the correct register, we emit nops
+ // appropriately sized to keep the sled the same size in every situation.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
+ // TODO: Is register-only support adequate?
+ assert(Op->isReg() && "Only supports arguments in registers");
+ SrcRegs[I] = Op->getReg();
+ if (SrcRegs[I] != DestRegs[I]) {
+ UsedMask[I] = true;
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
+ } else {
+ EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+ }
+ }
+
+ // In the loop above we only stash the destination registers, or emit nops
+ // if the arguments are already in the right place. The actual moving is
+ // postponed until after all the registers are stashed, so nothing gets
+ // clobbered. We have already added nops to account for the size of mov and
+ // push when a register is in the right place, so we only have to worry about
+ // emitting the movs.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (UsedMask[I])
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
+
+ // We emit a hard dependency on the __xray_TypedEvent symbol, which is the
+ // name of the trampoline to be implemented by the XRay runtime.
+ auto TSym = OutContext.getOrCreateSymbol("__xray_TypedEvent");
+ MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
+ if (isPositionIndependent())
+ TOp.setTargetFlags(X86II::MO_PLT);
+
+ // Emit the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
+ .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
+
+ // Restore caller-saved and used registers.
+ for (unsigned I = sizeof UsedMask; I-- > 0;)
+ if (UsedMask[I])
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
+ else
+ EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+
+ OutStreamer->AddComment("xray typed event end.");
+
+ // Record the sled version.
+ recordSled(CurSled, MI, SledKind::TYPED_EVENT, 0);
+}
+
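
Both the custom-event fix above and this typed-event lowering use the same two-pass shuffle: first push every destination register that is about to be clobbered (padding with nops otherwise), then emit the movs once everything is stashed, and finally pop in reverse order. A toy sketch of that ordering with made-up register assignments (illustrative only):

    #include <cstdio>
    #include <cstring>

    int main() {
      const char *Dest[] = {"rdi", "rsi", "rdx"};
      const char *Src[] = {"rcx", "rsi", "r8"}; // rsi is already in place
      bool UsedMask[3] = {false, false, false};

      // Pass 1: stash destinations whose current value would be overwritten.
      for (int I = 0; I < 3; ++I)
        if (std::strcmp(Src[I], Dest[I]) != 0) {
          UsedMask[I] = true;
          std::printf("push %%%s\n", Dest[I]);
        } else {
          std::printf("nop  (padding, argument already in place)\n");
        }

      // Pass 2: nothing live can be clobbered now, so move the arguments in.
      for (int I = 0; I < 3; ++I)
        if (UsedMask[I])
          std::printf("mov  %%%s, %%%s\n", Src[I], Dest[I]);

      // Epilogue: restore the stashed registers in reverse order.
      for (int I = 3; I-- > 0;)
        if (UsedMask[I])
          std::printf("pop  %%%s\n", Dest[I]);
        else
          std::printf("nop\n");
      return 0;
    }
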
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// We want to emit the following pattern:
@@ -1190,7 +1362,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
}
-void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) {
+void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
// Like PATCHABLE_RET, we have the actual instruction in the operands to this
// instruction so we lower that particular instruction and its operands.
// Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
@@ -1244,8 +1417,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
ArrayRef<MachineConstantPoolEntry> Constants =
MI.getParent()->getParent()->getConstantPool()->getConstants();
- const MachineConstantPoolEntry &ConstantEntry =
- Constants[Op.getIndex()];
+ const MachineConstantPoolEntry &ConstantEntry = Constants[Op.getIndex()];
// Bail if this is a machine constant pool entry, we won't be able to dig out
// anything useful.
@@ -1258,10 +1430,8 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
return C;
}
-static std::string getShuffleComment(const MachineInstr *MI,
- unsigned SrcOp1Idx,
- unsigned SrcOp2Idx,
- ArrayRef<int> Mask) {
+static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx,
+ unsigned SrcOp2Idx, ArrayRef<int> Mask) {
std::string Comment;
// Compute the name for a register. This is really goofy because we have
@@ -1449,12 +1619,13 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
- const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
// Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
// are compressed from EVEX encoding to VEX encoding.
if (TM.Options.MCOptions.ShowMCEncoding) {
- if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
+ if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
OutStreamer->AddComment("EVEX TO VEX Compression ", false);
}
@@ -1467,7 +1638,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutStreamer->emitRawComment("MEMBARRIER");
return;
-
case X86::EH_RETURN:
case X86::EH_RETURN64: {
// Lower these as normal, but add some comments.
@@ -1519,13 +1689,14 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *PICBase = MF->getPICBaseSymbol();
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
- EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
- .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
- const X86FrameLowering* FrameLowering =
+ const X86FrameLowering *FrameLowering =
MF->getSubtarget<X86Subtarget>().getFrameLowering();
bool hasFP = FrameLowering->hasFP(*MF);
-
+
// TODO: This is needed only if we require precise CFA.
bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
!OutStreamer->getDwarfFrameInfos().back().End;
@@ -1540,8 +1711,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutStreamer->EmitLabel(PICBase);
// popl $reg
- EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
- .addReg(MI->getOperand(0).getReg()));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
if (HasActiveDwarfFrame && !hasFP) {
OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
@@ -1549,6 +1720,41 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ case X86::MOVGOT64r: {
+ // Materializes the GOT for the 64-bit large code model.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned ScratchReg = MI->getOperand(1).getReg();
+ MCSymbol *GOTSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ // .LtmpN: leaq .LtmpN(%rip), %dst
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
+ .addReg(DstReg) // dest
+ .addReg(X86::RIP) // base
+ .addImm(1) // scale
+ .addReg(0) // index
+ .addExpr(DotExpr) // disp
+ .addReg(0)); // seg
+
+ // movq $_GLOBAL_OFFSET_TABLE_ - .LtmpN, %scratch
+ const MCExpr *GOTSymExpr = MCSymbolRefExpr::create(GOTSym, OutContext);
+ const MCExpr *GOTDiffExpr =
+ MCBinaryExpr::createSub(GOTSymExpr, DotExpr, OutContext);
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
+ .addReg(ScratchReg) // dest
+ .addExpr(GOTDiffExpr)); // disp
+
+ // addq %scratch, %dst
+ EmitAndCountInstruction(MCInstBuilder(X86::ADD64rr)
+ .addReg(DstReg) // dest
+ .addReg(DstReg) // dest
+ .addReg(ScratchReg)); // src
+ return;
+ }
+
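
The MOVGOT64r expansion above builds the GOT address position-independently: the lea captures the runtime address of .LtmpN, the movq loads the link-time difference _GLOBAL_OFFSET_TABLE_ - .LtmpN, and the add combines the two. A tiny arithmetic check of that identity (the addresses below are made up):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t LoadBias = 0x7f0000000000; // arbitrary runtime load offset
      uint64_t LtmpLink = 0x401000;       // link-time address of .LtmpN
      uint64_t GotLink = 0x600000;        // link-time address of _GLOBAL_OFFSET_TABLE_

      uint64_t Dst = LoadBias + LtmpLink;    // leaq .LtmpN(%rip), %dst
      uint64_t Scratch = GotLink - LtmpLink; // movq $_GLOBAL_OFFSET_TABLE_ - .LtmpN, %scratch
      Dst += Scratch;                        // addq %scratch, %dst

      assert(Dst == LoadBias + GotLink); // %dst holds the GOT's runtime address
      return 0;
    }
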
case X86::ADD32ri: {
// Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
@@ -1569,16 +1775,16 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
const MCExpr *PICBase =
- MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
- DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
- DotExpr, OutContext);
+ DotExpr = MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
- .addReg(MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addExpr(DotExpr));
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
return;
}
case TargetOpcode::STATEPOINT:
@@ -1607,10 +1813,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::PATCHABLE_TAIL_CALL:
return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
-
+
case TargetOpcode::PATCHABLE_EVENT_CALL:
return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
+
case X86::MORESTACK_RET:
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
return;
@@ -1618,9 +1827,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::MORESTACK_RET_RESTORE_R10:
// Return, then restore R10.
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
- EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
- .addReg(X86::R10)
- .addReg(X86::RAX));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
return;
case X86::SEH_PushReg:
@@ -1822,37 +2030,55 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
break;
}
-#define MOV_CASE(Prefix, Suffix) \
- case X86::Prefix##MOVAPD##Suffix##rm: \
- case X86::Prefix##MOVAPS##Suffix##rm: \
- case X86::Prefix##MOVUPD##Suffix##rm: \
- case X86::Prefix##MOVUPS##Suffix##rm: \
- case X86::Prefix##MOVDQA##Suffix##rm: \
+ case X86::MMX_MOVQ64rm: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ if (MI->getNumOperands() <= 4)
+ break;
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ if (auto *CF = dyn_cast<ConstantFP>(C)) {
+ CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
+ OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ }
+ }
+ break;
+ }
+
+#define MOV_CASE(Prefix, Suffix) \
+ case X86::Prefix##MOVAPD##Suffix##rm: \
+ case X86::Prefix##MOVAPS##Suffix##rm: \
+ case X86::Prefix##MOVUPD##Suffix##rm: \
+ case X86::Prefix##MOVUPS##Suffix##rm: \
+ case X86::Prefix##MOVDQA##Suffix##rm: \
case X86::Prefix##MOVDQU##Suffix##rm:
-#define MOV_AVX512_CASE(Suffix) \
- case X86::VMOVDQA64##Suffix##rm: \
- case X86::VMOVDQA32##Suffix##rm: \
- case X86::VMOVDQU64##Suffix##rm: \
- case X86::VMOVDQU32##Suffix##rm: \
- case X86::VMOVDQU16##Suffix##rm: \
- case X86::VMOVDQU8##Suffix##rm: \
- case X86::VMOVAPS##Suffix##rm: \
- case X86::VMOVAPD##Suffix##rm: \
- case X86::VMOVUPS##Suffix##rm: \
+#define MOV_AVX512_CASE(Suffix) \
+ case X86::VMOVDQA64##Suffix##rm: \
+ case X86::VMOVDQA32##Suffix##rm: \
+ case X86::VMOVDQU64##Suffix##rm: \
+ case X86::VMOVDQU32##Suffix##rm: \
+ case X86::VMOVDQU16##Suffix##rm: \
+ case X86::VMOVDQU8##Suffix##rm: \
+ case X86::VMOVAPS##Suffix##rm: \
+ case X86::VMOVAPD##Suffix##rm: \
+ case X86::VMOVUPS##Suffix##rm: \
case X86::VMOVUPD##Suffix##rm:
-#define CASE_ALL_MOV_RM() \
- MOV_CASE(, ) /* SSE */ \
- MOV_CASE(V, ) /* AVX-128 */ \
- MOV_CASE(V, Y) /* AVX-256 */ \
- MOV_AVX512_CASE(Z) \
- MOV_AVX512_CASE(Z256) \
+#define CASE_ALL_MOV_RM() \
+ MOV_CASE(, ) /* SSE */ \
+ MOV_CASE(V, ) /* AVX-128 */ \
+ MOV_CASE(V, Y) /* AVX-256 */ \
+ MOV_AVX512_CASE(Z) \
+ MOV_AVX512_CASE(Z256) \
MOV_AVX512_CASE(Z128)
- // For loads from a constant pool to a vector register, print the constant
- // loaded.
- CASE_ALL_MOV_RM()
+ // For loads from a constant pool to a vector register, print the constant
+ // loaded.
+ CASE_ALL_MOV_RM()
case X86::VBROADCASTF128:
case X86::VBROADCASTI128:
case X86::VBROADCASTF32X4Z256rm:
@@ -1875,20 +2101,20 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
int NumLanes = 1;
// Override NumLanes for the broadcast instructions.
switch (MI->getOpcode()) {
- case X86::VBROADCASTF128: NumLanes = 2; break;
- case X86::VBROADCASTI128: NumLanes = 2; break;
- case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
- case X86::VBROADCASTF32X4rm: NumLanes = 4; break;
- case X86::VBROADCASTF32X8rm: NumLanes = 2; break;
- case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
- case X86::VBROADCASTF64X2rm: NumLanes = 4; break;
- case X86::VBROADCASTF64X4rm: NumLanes = 2; break;
- case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
- case X86::VBROADCASTI32X4rm: NumLanes = 4; break;
- case X86::VBROADCASTI32X8rm: NumLanes = 2; break;
- case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
- case X86::VBROADCASTI64X2rm: NumLanes = 4; break;
- case X86::VBROADCASTI64X4rm: NumLanes = 2; break;
+ case X86::VBROADCASTF128: NumLanes = 2; break;
+ case X86::VBROADCASTI128: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTF32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTF64X4rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTI32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTI64X4rm: NumLanes = 2; break;
}
std::string Comment;
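
The broadcast cases above choose NumLanes, and the comment-printing loops in the following hunks emit the pool constant once per lane. A stripped-down sketch of that lane repetition (illustrative, not the printer code):

    #include <cstdio>
    #include <string>

    // Repeat the elements NumLanes times, comma-separated, the way the asm
    // comment does for a broadcast load of a constant-pool value.
    static std::string laneComment(const int *Elts, int NumElts, int NumLanes) {
      std::string CS = "[";
      for (int L = 0; L != NumLanes; ++L)
        for (int I = 0; I != NumElts; ++I) {
          if (I != 0 || L != 0)
            CS += ",";
          CS += std::to_string(Elts[I]);
        }
      return CS + "]";
    }

    int main() {
      int Elts[] = {1, 2, 3, 4};
      std::printf("%s\n", laneComment(Elts, 4, 2).c_str()); // [1,2,3,4,1,2,3,4]
      return 0;
    }
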
@@ -1898,7 +2124,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
CS << "[";
for (int l = 0; l != NumLanes; ++l) {
- for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
+ for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements;
+ ++i) {
if (i != 0 || l != 0)
CS << ",";
if (CDS->getElementType()->isIntegerTy())
@@ -1916,7 +2143,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int l = 0; l != NumLanes; ++l) {
- for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
+ for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands;
+ ++i) {
if (i != 0 || l != 0)
CS << ",";
printConstant(CV->getOperand(i), CS);
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index d517d82537a7..e1183bd14796 100644
--- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -16,7 +16,7 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -49,7 +49,7 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// ReturnAddrIndex - FrameIndex for return slot.
int ReturnAddrIndex = 0;
- /// \brief FrameIndex for return slot.
+ /// FrameIndex for return slot.
int FrameAddrIndex = 0;
/// TailCallReturnAddrDelta - The number of bytes by which return address
diff --git a/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp b/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp
index 67d95c2233de..df3abb17014d 100644
--- a/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -19,7 +19,7 @@
using namespace llvm;
-/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
@@ -86,7 +86,6 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::TEST16mr:
case X86::TEST32mr:
case X86::TEST64mr:
- case X86::TEST8ri_NOREX:
case X86::AND16i16:
case X86::AND16ri:
case X86::AND16ri8:
diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 1fc6f07b79fa..42db51b3cf01 100644
--- a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -60,17 +60,17 @@ static cl::opt<bool>
STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
-/// \brief Returns true if two machine operands are identical and they are not
+/// Returns true if two machine operands are identical and they are not
/// physical registers.
static inline bool isIdenticalOp(const MachineOperand &MO1,
const MachineOperand &MO2);
-/// \brief Returns true if two address displacement operands are of the same
+/// Returns true if two address displacement operands are of the same
/// type and use the same symbol/index/address regardless of the offset.
static bool isSimilarDispOp(const MachineOperand &MO1,
const MachineOperand &MO2);
-/// \brief Returns true if the instruction is LEA.
+/// Returns true if the instruction is LEA.
static inline bool isLEA(const MachineInstr &MI);
namespace {
@@ -184,7 +184,7 @@ template <> struct DenseMapInfo<MemOpKey> {
} // end namespace llvm
-/// \brief Returns a hash table key based on memory operands of \p MI. The
+/// Returns a hash table key based on memory operands of \p MI. The
/// number of the first memory operand of \p MI is specified through \p N.
static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
assert((isLEA(MI) || MI.mayLoadOrStore()) &&
@@ -242,7 +242,7 @@ public:
StringRef getPassName() const override { return "X86 LEA Optimize"; }
- /// \brief Loop over all of the basic blocks, replacing address
+ /// Loop over all of the basic blocks, replacing address
/// calculations in load and store instructions, if it's already
/// been calculated by LEA. Also, remove redundant LEAs.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -250,11 +250,11 @@ public:
private:
using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
- /// \brief Returns a distance between two instructions inside one basic block.
+ /// Returns a distance between two instructions inside one basic block.
/// Negative result means that instructions occur in reverse order.
int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
- /// \brief Choose the best \p LEA instruction from the \p List to replace
+ /// Choose the best \p LEA instruction from the \p List to replace
/// address calculation in \p MI instruction. Return the address displacement
/// and the distance between \p MI and the chosen \p BestLEA in
/// \p AddrDispShift and \p Dist.
@@ -262,25 +262,25 @@ private:
const MachineInstr &MI, MachineInstr *&BestLEA,
int64_t &AddrDispShift, int &Dist);
- /// \brief Returns the difference between addresses' displacements of \p MI1
+ /// Returns the difference between addresses' displacements of \p MI1
/// and \p MI2. The numbers of the first memory operands for the instructions
/// are specified through \p N1 and \p N2.
int64_t getAddrDispShift(const MachineInstr &MI1, unsigned N1,
const MachineInstr &MI2, unsigned N2) const;
- /// \brief Returns true if the \p Last LEA instruction can be replaced by the
+ /// Returns true if the \p Last LEA instruction can be replaced by the
/// \p First. The difference between displacements of the addresses calculated
/// by these LEAs is returned in \p AddrDispShift. It'll be used for proper
/// replacement of the \p Last LEA's uses with the \p First's def register.
bool isReplaceable(const MachineInstr &First, const MachineInstr &Last,
int64_t &AddrDispShift) const;
- /// \brief Find all LEA instructions in the basic block. Also, assign position
+ /// Find all LEA instructions in the basic block. Also, assign position
/// numbers to all instructions in the basic block to speed up calculation of
/// distance between them.
void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
- /// \brief Removes redundant address calculations.
+ /// Removes redundant address calculations.
bool removeRedundantAddrCalc(MemOpMap &LEAs);
/// Replace debug value MI with a new debug value instruction using register
@@ -289,7 +289,7 @@ private:
MachineInstr *replaceDebugValue(MachineInstr &MI, unsigned VReg,
int64_t AddrDispShift);
- /// \brief Removes LEAs which calculate similar addresses.
+ /// Removes LEAs which calculate similar addresses.
bool removeRedundantLEAs(MemOpMap &LEAs);
DenseMap<const MachineInstr *, unsigned> InstrPos;
@@ -541,7 +541,7 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
MRI->clearKillFlags(DefMI->getOperand(0).getReg());
++NumSubstLEAs;
- DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
// Change instruction operands.
MI.getOperand(MemOpNo + X86::AddrBaseReg)
@@ -553,7 +553,7 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
MI.getOperand(MemOpNo + X86::AddrSegmentReg)
.ChangeToRegister(X86::NoRegister, false);
- DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
Changed = true;
}
@@ -649,7 +649,8 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
MRI->clearKillFlags(FirstVReg);
++NumRedundantLEAs;
- DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump(););
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: ";
+ Last.dump(););
// By this moment, all of the Last LEA's uses must be replaced. So we
// can freely remove it.
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
index 1da0fad8b6cf..85b9aecc2106 100644
--- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -21,7 +21,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -49,7 +49,7 @@ namespace {
struct PadShortFunc : public MachineFunctionPass {
static char ID;
PadShortFunc() : MachineFunctionPass(ID)
- , Threshold(4), STI(nullptr), TII(nullptr) {}
+ , Threshold(4) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -82,8 +82,7 @@ namespace {
// VisitedBBs - Cache of previously visited BBs.
DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
- const X86Subtarget *STI;
- const TargetInstrInfo *TII;
+ TargetSchedModel TSM;
};
char PadShortFunc::ID = 0;
@@ -99,15 +98,13 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- if (MF.getFunction().optForSize()) {
+ if (MF.getFunction().optForSize())
return false;
- }
- STI = &MF.getSubtarget<X86Subtarget>();
- if (!STI->padShortFunctions())
+ if (!MF.getSubtarget<X86Subtarget>().padShortFunctions())
return false;
- TII = STI->getInstrInfo();
+ TSM.init(&MF.getSubtarget());
// Search through basic blocks and mark the ones that have early returns
ReturnBBs.clear();
@@ -132,7 +129,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
"Basic block should contain at least a RET but is empty");
MachineBasicBlock::iterator ReturnLoc = --MBB->end();
- while (ReturnLoc->isDebugValue())
+ while (ReturnLoc->isDebugInstr())
--ReturnLoc;
assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
"Basic block does not end with RET");
@@ -195,7 +192,7 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
return true;
}
- CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI);
+ CyclesToEnd += TSM.computeInstrLatency(&MI);
}
VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
@@ -209,9 +206,8 @@ void PadShortFunc::addPadding(MachineBasicBlock *MBB,
MachineBasicBlock::iterator &MBBI,
unsigned int NOOPsToAdd) {
DebugLoc DL = MBBI->getDebugLoc();
+ unsigned IssueWidth = TSM.getIssueWidth();
- while (NOOPsToAdd-- > 0) {
- BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
- BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
- }
+ for (unsigned i = 0, e = IssueWidth * NOOPsToAdd; i != e; ++i)
+ BuildMI(*MBB, MBBI, DL, TSM.getInstrInfo()->get(X86::NOOP));
}
diff --git a/contrib/llvm/lib/Target/X86/X86PfmCounters.td b/contrib/llvm/lib/Target/X86/X86PfmCounters.td
new file mode 100644
index 000000000000..093fbafa3fba
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86PfmCounters.td
@@ -0,0 +1,77 @@
+//===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for various subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedModel = SandyBridgeModel in {
+def SBCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def SBPort0Counter : PfmIssueCounter<SBPort0, ["uops_dispatched_port:port_0"]>;
+def SBPort1Counter : PfmIssueCounter<SBPort1, ["uops_dispatched_port:port_1"]>;
+def SBPort23Counter : PfmIssueCounter<SBPort23,
+ ["uops_dispatched_port:port_2",
+ "uops_dispatched_port:port_3"]>;
+def SBPort4Counter : PfmIssueCounter<SBPort4, ["uops_dispatched_port:port_4"]>;
+def SBPort5Counter : PfmIssueCounter<SBPort5, ["uops_dispatched_port:port_5"]>;
+}
+
+let SchedModel = HaswellModel in {
+def HWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def HWPort0Counter : PfmIssueCounter<HWPort0, ["uops_dispatched_port:port_0"]>;
+def HWPort1Counter : PfmIssueCounter<HWPort1, ["uops_dispatched_port:port_1"]>;
+def HWPort2Counter : PfmIssueCounter<HWPort2, ["uops_dispatched_port:port_2"]>;
+def HWPort3Counter : PfmIssueCounter<HWPort3, ["uops_dispatched_port:port_3"]>;
+def HWPort4Counter : PfmIssueCounter<HWPort4, ["uops_dispatched_port:port_4"]>;
+def HWPort5Counter : PfmIssueCounter<HWPort5, ["uops_dispatched_port:port_5"]>;
+def HWPort6Counter : PfmIssueCounter<HWPort6, ["uops_dispatched_port:port_6"]>;
+def HWPort7Counter : PfmIssueCounter<HWPort7, ["uops_dispatched_port:port_7"]>;
+}
+
+let SchedModel = BroadwellModel in {
+def BWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def BWPort0Counter : PfmIssueCounter<BWPort0, ["uops_executed_port:port_0"]>;
+def BWPort1Counter : PfmIssueCounter<BWPort1, ["uops_executed_port:port_1"]>;
+def BWPort2Counter : PfmIssueCounter<BWPort2, ["uops_executed_port:port_2"]>;
+def BWPort3Counter : PfmIssueCounter<BWPort3, ["uops_executed_port:port_3"]>;
+def BWPort4Counter : PfmIssueCounter<BWPort4, ["uops_executed_port:port_4"]>;
+def BWPort5Counter : PfmIssueCounter<BWPort5, ["uops_executed_port:port_5"]>;
+def BWPort6Counter : PfmIssueCounter<BWPort6, ["uops_executed_port:port_6"]>;
+def BWPort7Counter : PfmIssueCounter<BWPort7, ["uops_executed_port:port_7"]>;
+}
+
+let SchedModel = SkylakeClientModel in {
+def SKLCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def SKLPort0Counter : PfmIssueCounter<SKLPort0, ["uops_dispatched_port:port_0"]>;
+def SKLPort1Counter : PfmIssueCounter<SKLPort1, ["uops_dispatched_port:port_1"]>;
+def SKLPort2Counter : PfmIssueCounter<SKLPort2, ["uops_dispatched_port:port_2"]>;
+def SKLPort3Counter : PfmIssueCounter<SKLPort3, ["uops_dispatched_port:port_3"]>;
+def SKLPort4Counter : PfmIssueCounter<SKLPort4, ["uops_dispatched_port:port_4"]>;
+def SKLPort5Counter : PfmIssueCounter<SKLPort5, ["uops_dispatched_port:port_5"]>;
+def SKLPort6Counter : PfmIssueCounter<SKLPort6, ["uops_dispatched_port:port_6"]>;
+def SKLPort7Counter : PfmIssueCounter<SKLPort7, ["uops_dispatched_port:port_7"]>;
+}
+
+let SchedModel = SkylakeServerModel in {
+def SKXCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
+def SKXPort0Counter : PfmIssueCounter<SKXPort0, ["uops_dispatched_port:port_0"]>;
+def SKXPort1Counter : PfmIssueCounter<SKXPort1, ["uops_dispatched_port:port_1"]>;
+def SKXPort2Counter : PfmIssueCounter<SKXPort2, ["uops_dispatched_port:port_2"]>;
+def SKXPort3Counter : PfmIssueCounter<SKXPort3, ["uops_dispatched_port:port_3"]>;
+def SKXPort4Counter : PfmIssueCounter<SKXPort4, ["uops_dispatched_port:port_4"]>;
+def SKXPort5Counter : PfmIssueCounter<SKXPort5, ["uops_dispatched_port:port_5"]>;
+def SKXPort6Counter : PfmIssueCounter<SKXPort6, ["uops_dispatched_port:port_6"]>;
+def SKXPort7Counter : PfmIssueCounter<SKXPort7, ["uops_dispatched_port:port_7"]>;
+}
+
+let SchedModel = BtVer2Model in {
+def JCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">;
+def JFPU0Counter : PfmIssueCounter<JFPU0, ["dispatched_fpu:pipe0"]>;
+def JFPU1Counter : PfmIssueCounter<JFPU1, ["dispatched_fpu:pipe1"]>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
index aa0e3743c948..246d6d5a58d0 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -73,6 +73,8 @@ X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
return PMI_GPR32;
case 64:
return PMI_GPR64;
+ case 128:
+ return PMI_VEC128;
break;
default:
llvm_unreachable("Unsupported register size.");
@@ -83,6 +85,8 @@ X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
return PMI_FP32;
case 64:
return PMI_FP64;
+ case 128:
+ return PMI_VEC128;
default:
llvm_unreachable("Unsupported register size.");
}
@@ -169,6 +173,10 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
return getSameOperandsMapping(MI, false);
break;
case TargetOpcode::G_FADD:
@@ -190,6 +198,34 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Instruction having only floating-point operands (all scalars in VECRReg)
getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
break;
+ case TargetOpcode::G_SITOFP: {
+ // Some of the floating-point instructions have mixed GPR and FP operands:
+ // fine-tune the computed mapping.
+ auto &Op0 = MI.getOperand(0);
+ auto &Op1 = MI.getOperand(1);
+ const LLT Ty0 = MRI.getType(Op0.getReg());
+ const LLT Ty1 = MRI.getType(Op1.getReg());
+ OpRegBankIdx[0] = getPartialMappingIdx(Ty0, /* isFP */ true);
+ OpRegBankIdx[1] = getPartialMappingIdx(Ty1, /* isFP */ false);
+ break;
+ }
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ANYEXT: {
+ auto &Op0 = MI.getOperand(0);
+ auto &Op1 = MI.getOperand(1);
+ const LLT Ty0 = MRI.getType(Op0.getReg());
+ const LLT Ty1 = MRI.getType(Op1.getReg());
+
+ bool isFPTrunc = (Ty0.getSizeInBits() == 32 || Ty0.getSizeInBits() == 64) &&
+ Ty1.getSizeInBits() == 128 && Opc == TargetOpcode::G_TRUNC;
+ bool isFPAnyExt =
+ Ty0.getSizeInBits() == 128 &&
+ (Ty1.getSizeInBits() == 32 || Ty1.getSizeInBits() == 64) &&
+ Opc == TargetOpcode::G_ANYEXT;
+
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ isFPTrunc || isFPAnyExt,
+ OpRegBankIdx);
+ } break;
default:
// Track the bank of each register, use NotFP mapping (all scalars in GPRs)
getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
index bc31e95aa6b5..55842a4a2091 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -75,7 +75,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
bool
X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // ExecutionDepsFixer and PostRAScheduler require liveness.
+ // ExecutionDomainFix, BreakFalseDeps and PostRAScheduler require liveness.
return true;
}
@@ -552,6 +552,10 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::DIL);
Reserved.set(X86::BPL);
Reserved.set(X86::SPL);
+ Reserved.set(X86::SIH);
+ Reserved.set(X86::DIH);
+ Reserved.set(X86::BPH);
+ Reserved.set(X86::SPH);
for (unsigned n = 0; n != 8; ++n) {
// R8, R9, ...
@@ -571,7 +575,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
assert(checkAllSuperRegsMarked(Reserved,
- {X86::SIL, X86::DIL, X86::BPL, X86::SPL}));
+ {X86::SIL, X86::DIL, X86::BPL, X86::SPL,
+ X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
return Reserved;
}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
index 1a776dcd04eb..ee9e7891f9f6 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -21,12 +21,14 @@ class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n>
// Subregister indices.
let Namespace = "X86" in {
- def sub_8bit : SubRegIndex<8>;
- def sub_8bit_hi : SubRegIndex<8, 8>;
- def sub_16bit : SubRegIndex<16>;
- def sub_32bit : SubRegIndex<32>;
- def sub_xmm : SubRegIndex<128>;
- def sub_ymm : SubRegIndex<256>;
+ def sub_8bit : SubRegIndex<8>;
+ def sub_8bit_hi : SubRegIndex<8, 8>;
+ def sub_8bit_hi_phony : SubRegIndex<8, 8>;
+ def sub_16bit : SubRegIndex<16>;
+ def sub_16bit_hi : SubRegIndex<16, 16>;
+ def sub_32bit : SubRegIndex<32>;
+ def sub_xmm : SubRegIndex<128>;
+ def sub_ymm : SubRegIndex<256>;
}
//===----------------------------------------------------------------------===//
@@ -73,6 +75,40 @@ def R14B : X86Reg<"r14b", 14>;
def R15B : X86Reg<"r15b", 15>;
}
+let isArtificial = 1 in {
+// High byte of the low 16 bits of the super-register:
+def SIH : X86Reg<"", -1>;
+def DIH : X86Reg<"", -1>;
+def BPH : X86Reg<"", -1>;
+def SPH : X86Reg<"", -1>;
+def R8BH : X86Reg<"", -1>;
+def R9BH : X86Reg<"", -1>;
+def R10BH : X86Reg<"", -1>;
+def R11BH : X86Reg<"", -1>;
+def R12BH : X86Reg<"", -1>;
+def R13BH : X86Reg<"", -1>;
+def R14BH : X86Reg<"", -1>;
+def R15BH : X86Reg<"", -1>;
+// High word of the low 32 bits of the super-register:
+def HAX : X86Reg<"", -1>;
+def HDX : X86Reg<"", -1>;
+def HCX : X86Reg<"", -1>;
+def HBX : X86Reg<"", -1>;
+def HSI : X86Reg<"", -1>;
+def HDI : X86Reg<"", -1>;
+def HBP : X86Reg<"", -1>;
+def HSP : X86Reg<"", -1>;
+def HIP : X86Reg<"", -1>;
+def R8WH : X86Reg<"", -1>;
+def R9WH : X86Reg<"", -1>;
+def R10WH : X86Reg<"", -1>;
+def R11WH : X86Reg<"", -1>;
+def R12WH : X86Reg<"", -1>;
+def R13WH : X86Reg<"", -1>;
+def R14WH : X86Reg<"", -1>;
+def R15WH : X86Reg<"", -1>;
+}
+
// 16-bit registers
let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
def AX : X86Reg<"ax", 0, [AL,AH]>;
@@ -80,49 +116,52 @@ def DX : X86Reg<"dx", 2, [DL,DH]>;
def CX : X86Reg<"cx", 1, [CL,CH]>;
def BX : X86Reg<"bx", 3, [BL,BH]>;
}
-let SubRegIndices = [sub_8bit] in {
-def SI : X86Reg<"si", 6, [SIL]>;
-def DI : X86Reg<"di", 7, [DIL]>;
-def BP : X86Reg<"bp", 5, [BPL]>;
-def SP : X86Reg<"sp", 4, [SPL]>;
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CoveredBySubRegs = 1 in {
+def SI : X86Reg<"si", 6, [SIL,SIH]>;
+def DI : X86Reg<"di", 7, [DIL,DIH]>;
+def BP : X86Reg<"bp", 5, [BPL,BPH]>;
+def SP : X86Reg<"sp", 4, [SPL,SPH]>;
}
def IP : X86Reg<"ip", 0>;
// X86-64 only, requires REX.
-let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
-def R8W : X86Reg<"r8w", 8, [R8B]>;
-def R9W : X86Reg<"r9w", 9, [R9B]>;
-def R10W : X86Reg<"r10w", 10, [R10B]>;
-def R11W : X86Reg<"r11w", 11, [R11B]>;
-def R12W : X86Reg<"r12w", 12, [R12B]>;
-def R13W : X86Reg<"r13w", 13, [R13B]>;
-def R14W : X86Reg<"r14w", 14, [R14B]>;
-def R15W : X86Reg<"r15w", 15, [R15B]>;
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = 1,
+ CoveredBySubRegs = 1 in {
+def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>;
+def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>;
+def R10W : X86Reg<"r10w", 10, [R10B,R10BH]>;
+def R11W : X86Reg<"r11w", 11, [R11B,R11BH]>;
+def R12W : X86Reg<"r12w", 12, [R12B,R12BH]>;
+def R13W : X86Reg<"r13w", 13, [R13B,R13BH]>;
+def R14W : X86Reg<"r14w", 14, [R14B,R14BH]>;
+def R15W : X86Reg<"r15w", 15, [R15B,R15BH]>;
}
// 32-bit registers
-let SubRegIndices = [sub_16bit] in {
-def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
-def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
-def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
-def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
-def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
-def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
-def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
-def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
-def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CoveredBySubRegs = 1 in {
+def EAX : X86Reg<"eax", 0, [AX, HAX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX, HDX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX, HCX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX, HBX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI, HSI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI, HDI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP, HBP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP, HSP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP, HIP]>, DwarfRegNum<[-2, 8, 8]>;
+}
// X86-64 only, requires REX
-let CostPerUse = 1 in {
-def R8D : X86Reg<"r8d", 8, [R8W]>;
-def R9D : X86Reg<"r9d", 9, [R9W]>;
-def R10D : X86Reg<"r10d", 10, [R10W]>;
-def R11D : X86Reg<"r11d", 11, [R11W]>;
-def R12D : X86Reg<"r12d", 12, [R12W]>;
-def R13D : X86Reg<"r13d", 13, [R13W]>;
-def R14D : X86Reg<"r14d", 14, [R14W]>;
-def R15D : X86Reg<"r15d", 15, [R15W]>;
-}}
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = 1,
+ CoveredBySubRegs = 1 in {
+def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>;
+def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>;
+def R10D : X86Reg<"r10d", 10, [R10W,R10WH]>;
+def R11D : X86Reg<"r11d", 11, [R11W,R11WH]>;
+def R12D : X86Reg<"r12d", 12, [R12W,R12WH]>;
+def R13D : X86Reg<"r13d", 13, [R13W,R13WH]>;
+def R14D : X86Reg<"r14d", 14, [R14W,R14WH]>;
+def R15D : X86Reg<"r15d", 15, [R15W,R15WH]>;
+}
// 64-bit registers, X86-64 only
let SubRegIndices = [sub_32bit] in {
@@ -261,7 +300,7 @@ def FPSW : X86Reg<"fpsw", 0>;
def EFLAGS : X86Reg<"flags", 0>;
// The direction flag.
-def DF : X86Reg<"DF", 0>;
+def DF : X86Reg<"dirflag", 0>;
// Segment registers
@@ -347,10 +386,21 @@ def GR8 : RegisterClass<"X86", [i8], 8,
}];
}
+let isAllocatable = 0 in
+def GRH8 : RegisterClass<"X86", [i8], 8,
+ (add SIH, DIH, BPH, SPH, R8BH, R9BH, R10BH, R11BH,
+ R12BH, R13BH, R14BH, R15BH)>;
+
def GR16 : RegisterClass<"X86", [i16], 16,
(add AX, CX, DX, SI, DI, BX, BP, SP,
R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+let isAllocatable = 0 in
+def GRH16 : RegisterClass<"X86", [i16], 16,
+ (add HAX, HCX, HDX, HSI, HDI, HBX, HBP, HSP, HIP,
+ R8WH, R9WH, R10WH, R11WH, R12WH, R13WH, R14WH,
+ R15WH)>;
+
def GR32 : RegisterClass<"X86", [i32], 32,
(add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
@@ -410,11 +460,6 @@ def GR32_NOREX : RegisterClass<"X86", [i32], 32,
def GR64_NOREX : RegisterClass<"X86", [i64], 64,
(add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
-// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
-// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
-// to clear upper 32-bits of RAX so is not a NOP.
-def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
-
// GR32_NOSP - GR32 registers except ESP.
def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
@@ -459,8 +504,6 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
-def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>;
-
// FIXME: This sets up the floating point register files as though they are f64
// values, though they really are f80 values. This will cause us to spill
@@ -482,16 +525,16 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
// Generic vector registers: VR64 and VR128.
// Ensure that float types are declared first - only float is legal on SSE1.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
-def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (add FR32)>;
def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 15)>;
// Special classes that help the assembly parser choose some alternate
// instructions to favor 2-byte VEX encodings.
-def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (sequence "XMM%u", 0, 7)>;
-def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (sequence "XMM%u", 8, 15)>;
def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 7)>;
@@ -522,7 +565,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
// Extended VR128 and VR256 for AVX-512 instructions
-def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
128, (add FR32X)>;
def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
diff --git a/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp b/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp
index d03826bbe992..250deb3523b4 100644
--- a/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp
@@ -91,7 +91,7 @@ bool X86RetpolineThunks::doInitialization(Module &M) {
}
bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << getPassName() << '\n');
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
TM = &MF.getTarget();;
STI = &MF.getSubtarget<X86Subtarget>();
@@ -214,6 +214,15 @@ void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) {
IRBuilder<> Builder(Entry);
Builder.CreateRetVoid();
+
+ // MachineFunctions/MachineBasicBlocks aren't created automatically for the
+ // IR-level constructs we already made. Create them and insert them into the
+ // module.
+ MachineFunction &MF = MMI->getOrCreateMachineFunction(*F);
+ MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry);
+
+ // Insert EntryMBB into MF. It's not in the module until we do this.
+ MF.insert(MF.end(), EntryMBB);
}
void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
diff --git a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td
index e4e0ed435103..c7713fea70fa 100755
--- a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -11,8 +11,9 @@
// scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
+
def BroadwellModel : SchedMachineModel {
- // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // All x86 instructions are modeled as a single micro-op, and BW can decode 4
// instructions per cycle.
let IssueWidth = 4;
let MicroOpBufferSize = 192; // Based on the reorder buffer.
@@ -22,7 +23,7 @@ def BroadwellModel : SchedMachineModel {
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
- // This flag is set to allow the scheduler to assign a default model to
+ // This flag is set to allow the scheduler to assign a default model to
// unrecognized opcodes.
let CompleteModel = 0;
}
@@ -66,6 +67,11 @@ def BWPortAny : ProcResGroup<[BWPort0, BWPort1, BWPort2, BWPort3, BWPort4,
let BufferSize=60;
}
+// Integer division issued on port 0.
+def BWDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def BWFPDivider : ProcResource<1>;
+
// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 5>;
@@ -76,45 +82,84 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [BWPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([BWPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [BWPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [BWPort237,BWPort4]>;
// Arithmetic.
-defm : BWWriteResPair<WriteALU, BWPort0156, 1>; // Simple integer ALU op.
-defm : BWWriteResPair<WriteIMul, BWPort1, 3>; // Integer multiplication.
+defm : BWWriteResPair<WriteALU, [BWPort0156], 1>; // Simple integer ALU op.
+defm : BWWriteResPair<WriteADC, [BWPort06], 1>; // Integer ALU + flags op.
+defm : BWWriteResPair<WriteIMul, [BWPort1], 3>; // Integer multiplication.
+defm : BWWriteResPair<WriteIMul64, [BWPort1], 3>; // Integer 64-bit multiplication.
+defm : BWWriteResPair<WriteDiv8, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteDiv16, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteDiv32, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteDiv64, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv8, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv16, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv32, [BWPort0, BWDivider], 25, [1, 10]>;
+defm : BWWriteResPair<WriteIDiv64, [BWPort0, BWDivider], 25, [1, 10]>;
+
+defm : BWWriteResPair<WriteBSWAP32,[BWPort15], 1>; //
+defm : BWWriteResPair<WriteBSWAP64,[BWPort06, BWPort15], 2, [1, 1], 2>; //
+
+defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
-def BWDivider : ProcResource<1>; // Integer division issued on port 0.
-def : WriteRes<WriteIDiv, [BWPort0, BWDivider]> { // Integer division.
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [BWPort23, BWPort0, BWDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
+defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move.
+defm : BWWriteResPair<WriteCMOV2, [BWPort06,BWPort0156], 2, [1,1], 2>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [BWPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [BWPort06,BWPort4,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [BWPort06]>;
+
+// Bit counts.
+defm : BWWriteResPair<WriteBSF, [BWPort1], 3>;
+defm : BWWriteResPair<WriteBSR, [BWPort1], 3>;
+defm : BWWriteResPair<WriteLZCNT, [BWPort1], 3>;
+defm : BWWriteResPair<WriteTZCNT, [BWPort1], 3>;
+defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>;
+
// Integer shifts and rotates.
-defm : BWWriteResPair<WriteShift, BWPort06, 1>;
+defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
+
+// Double shift instructions.
+defm : BWWriteResPair<WriteShiftDouble, [BWPort06], 1>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>;
+defm : BWWriteResPair<WriteBZHI, [BWPort15], 1>;
// Loads, stores, and moves, not folded with other operations.
-def : WriteRes<WriteLoad, [BWPort23]> { let Latency = 5; }
-def : WriteRes<WriteStore, [BWPort237, BWPort4]>;
-def : WriteRes<WriteMove, [BWPort0156]>;
+defm : X86WriteRes<WriteLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [BWPort237, BWPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
@@ -125,153 +170,367 @@ def : InstRW<[WriteMove], (instrs COPY)>;
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : BWWriteResPair<WriteJump, BWPort06, 1>;
+defm : BWWriteResPair<WriteJump, [BWPort06], 1>;
// Floating point. This covers both scalar and vector operations.
-defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare.
-defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication.
-defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : BWWriteResPair<WriteFSqrt, BWPort0, 15>; // Floating point square root.
-defm : BWWriteResPair<WriteFRcp, BWPort0, 5>; // Floating point reciprocal estimate.
-defm : BWWriteResPair<WriteFRsqrt, BWPort0, 5>; // Floating point reciprocal square root estimate.
-defm : BWWriteResPair<WriteFMA, BWPort01, 5>; // Fused Multiply Add.
-defm : BWWriteResPair<WriteFShuffle, BWPort5, 1>; // Floating point vector shuffles.
-defm : BWWriteResPair<WriteFBlend, BWPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [BWPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [BWPort5, BWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [BWPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [BWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [BWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;
+
+defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub.
+defm : BWWriteResPair<WriteFAddX, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub (XMM).
+defm : BWWriteResPair<WriteFAddY, [BWPort1], 3, [1], 1, 6>; // Floating point add/sub (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : BWWriteResPair<WriteFAdd64, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub.
+defm : BWWriteResPair<WriteFAdd64X, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub (XMM).
+defm : BWWriteResPair<WriteFAdd64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double add/sub (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : BWWriteResPair<WriteFCmp, [BWPort1], 3, [1], 1, 5>; // Floating point compare.
+defm : BWWriteResPair<WriteFCmpX, [BWPort1], 3, [1], 1, 5>; // Floating point compare (XMM).
+defm : BWWriteResPair<WriteFCmpY, [BWPort1], 3, [1], 1, 6>; // Floating point compare (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : BWWriteResPair<WriteFCmp64, [BWPort1], 3, [1], 1, 5>; // Floating point double compare.
+defm : BWWriteResPair<WriteFCmp64X, [BWPort1], 3, [1], 1, 5>; // Floating point double compare (XMM).
+defm : BWWriteResPair<WriteFCmp64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double compare (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags.
+
+defm : BWWriteResPair<WriteFMul, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication.
+defm : BWWriteResPair<WriteFMulX, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication (XMM).
+defm : BWWriteResPair<WriteFMulY, [BWPort01], 3, [1], 1, 6>; // Floating point multiplication (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : BWWriteResPair<WriteFMul64, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication.
+defm : BWWriteResPair<WriteFMul64X, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication (XMM).
+defm : BWWriteResPair<WriteFMul64Y, [BWPort01], 3, [1], 1, 6>; // Floating point double multiplication (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+//defm : BWWriteResPair<WriteFDiv, [BWPort0,BWFPDivider], 11, [1,3], 1, 5>; // Floating point division.
+defm : BWWriteResPair<WriteFDivX, [BWPort0,BWFPDivider], 11, [1,5], 1, 5>; // Floating point division (XMM).
+defm : BWWriteResPair<WriteFDivY, [BWPort0,BWPort015,BWFPDivider], 17, [2,1,10], 3, 6>; // Floating point division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+//defm : BWWriteResPair<WriteFDiv64, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division.
+defm : BWWriteResPair<WriteFDiv64X, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division (XMM).
+defm : BWWriteResPair<WriteFDiv64Y, [BWPort0,BWPort015,BWFPDivider], 23, [2,1,16], 3, 6>; // Floating point division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : X86WriteRes<WriteFSqrt, [BWPort0,BWFPDivider], 11, [1,4], 1>; // Floating point square root.
+defm : X86WriteRes<WriteFSqrtLd, [BWPort0,BWPort23,BWFPDivider], 16, [1,1,7], 2>;
+defm : BWWriteResPair<WriteFSqrtX, [BWPort0,BWFPDivider], 11, [1,7], 1, 5>; // Floating point square root (XMM).
+defm : BWWriteResPair<WriteFSqrtY, [BWPort0,BWPort015,BWFPDivider], 21, [2,1,14], 3, 6>; // Floating point square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : X86WriteRes<WriteFSqrt64, [BWPort0,BWFPDivider], 16, [1,8], 1>; // Floating point double square root.
+defm : X86WriteRes<WriteFSqrt64Ld, [BWPort0,BWPort23,BWFPDivider], 21, [1,1,14], 2>;
+defm : BWWriteResPair<WriteFSqrt64X, [BWPort0,BWFPDivider], 16, [1,14],1, 5>; // Floating point double square root (XMM).
+defm : BWWriteResPair<WriteFSqrt64Y, [BWPort0,BWPort015,BWFPDivider], 29, [2,1,28], 3, 6>; // Floating point double square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : BWWriteResPair<WriteFSqrt80, [BWPort0,BWFPDivider], 23, [1,9]>; // Floating point long double square root.
+
+defm : BWWriteResPair<WriteFRcp, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : BWWriteResPair<WriteFRcpX, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate (XMM).
+defm : BWWriteResPair<WriteFRcpY, [BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : BWWriteResPair<WriteFRsqrtX,[BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate (XMM).
+defm : BWWriteResPair<WriteFRsqrtY,[BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal square root estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : BWWriteResPair<WriteFMA, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add.
+defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add (XMM).
+defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : BWWriteResPair<WriteDPPD, [BWPort0,BWPort1,BWPort5], 9, [1,1,1], 3, 5>; // Floating point double dot product.
+defm : BWWriteResPair<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 5>; // Floating point single dot product.
+defm : BWWriteResPair<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 6>; // Floating point single dot product (YMM).
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs.
+defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding.
+defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>;
+defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>;
+defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals.
+defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : BWWriteResPair<WriteFTest, [BWPort0], 1, [1], 1, 5>; // Floating point TEST instructions.
+defm : BWWriteResPair<WriteFTestY, [BWPort0], 1, [1], 1, 6>; // Floating point TEST instructions (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : BWWriteResPair<WriteFShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector shuffles.
+defm : BWWriteResPair<WriteFShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : BWWriteResPair<WriteFVarShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector variable shuffles.
+defm : BWWriteResPair<WriteFVarShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : BWWriteResPair<WriteFBlend, [BWPort015], 1, [1], 1, 5>; // Floating point vector blends.
+defm : BWWriteResPair<WriteFBlendY, [BWPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : BWWriteResPair<WriteFVarBlend, [BWPort5], 2, [2], 2, 5>; // Fp vector variable blends.
+defm : BWWriteResPair<WriteFVarBlendY, [BWPort5], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
-defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals.
-defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts.
-defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply.
-defm : BWWriteResPair<WriteShuffle, BWPort5, 1>; // Vector shuffles.
-defm : BWWriteResPair<WriteBlend, BWPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [BWPort5]> { // Vector variable blends.
+defm : X86WriteRes<WriteVecLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [BWPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [BWPort5], 1, [1], 1>;
+
+defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>;
+
+defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUX, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUY, [BWPort15], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : BWWriteResPair<WriteVecLogicX,[BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : BWWriteResPair<WriteVecLogicY,[BWPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : BWWriteResPair<WriteVecTest, [BWPort0,BWPort5], 2, [1,1], 2, 5>; // Vector integer TEST instructions.
+defm : BWWriteResPair<WriteVecTestY, [BWPort0,BWPort5], 4, [1,1], 2, 6>; // Vector integer TEST instructions (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteVecIMulX, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteVecIMulY, [BWPort0], 5, [1], 1, 6>; // Vector integer multiply.
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : BWWriteResPair<WritePMULLD, [BWPort0], 10, [2], 2, 5>; // Vector PMULLD.
+defm : BWWriteResPair<WritePMULLDY, [BWPort0], 10, [2], 2, 6>; // Vector PMULLD (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : BWWriteResPair<WriteShuffle, [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteShuffleX, [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteShuffleY, [BWPort5], 1, [1], 1, 6>; // Vector shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : BWWriteResPair<WriteVarShuffle, [BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : BWWriteResPair<WriteVarShuffleX,[BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : BWWriteResPair<WriteVarShuffleY,[BWPort5], 1, [1], 1, 6>; // Vector variable shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : BWWriteResPair<WriteBlend, [BWPort5], 1, [1], 1, 5>; // Vector blends.
+defm : BWWriteResPair<WriteBlendY, [BWPort5], 1, [1], 1, 6>; // Vector blends (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2], 2, 5>; // Vector variable blends.
+defm : BWWriteResPair<WriteVarBlendY, [BWPort5], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : BWWriteResPair<WriteMPSAD, [BWPort0, BWPort5], 7, [1, 2], 3, 5>; // Vector MPSAD.
+defm : BWWriteResPair<WriteMPSADY, [BWPort0, BWPort5], 7, [1, 2], 3, 6>; // Vector MPSAD.
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : BWWriteResPair<WritePSADBW, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW.
+defm : BWWriteResPair<WritePSADBWX, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW.
+defm : BWWriteResPair<WritePSADBWY, [BWPort0], 5, [1], 1, 6>; // Vector PSADBW (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : BWWriteResPair<WritePHMINPOS, [BWPort0], 5>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : BWWriteResPair<WriteVecShift, [BWPort0], 1, [1], 1, 5>;
+defm : BWWriteResPair<WriteVecShiftX, [BWPort0,BWPort5], 2, [1,1], 2, 5>;
+defm : X86WriteRes<WriteVecShiftY, [BWPort0,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [BWPort0,BWPort23], 7, [1,1], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : BWWriteResPair<WriteVecShiftImm, [BWPort0], 1, [1], 1, 5>;
+defm : BWWriteResPair<WriteVecShiftImmX, [BWPort0], 1, [1], 1, 5>; // Vector integer immediate shifts (XMM).
+defm : BWWriteResPair<WriteVecShiftImmY, [BWPort0], 1, [1], 1, 6>; // Vector integer immediate shifts (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : BWWriteResPair<WriteVarVecShift, [BWPort0, BWPort5], 3, [2,1], 3, 5>; // Variable vector shifts.
+defm : BWWriteResPair<WriteVarVecShiftY, [BWPort0, BWPort5], 3, [2,1], 3, 6>; // Variable vector shifts (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [BWPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [BWPort5, BWPort23]> {
+def : WriteRes<WriteVecInsertLd, [BWPort5,BWPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSAD, [BWPort0, BWPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
+def : WriteRes<WriteVecExtract, [BWPort0,BWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [BWPort23, BWPort0, BWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [BWPort4,BWPort5,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm : BWWriteResPair<WriteVecLogic, BWPort015, 1>; // Vector and/or/xor.
-
// Conversion between integer and float.
-defm : BWWriteResPair<WriteCvtF2I, BWPort1, 3>; // Float -> Integer.
-defm : BWWriteResPair<WriteCvtI2F, BWPort1, 4>; // Integer -> Float.
-defm : BWWriteResPair<WriteCvtF2F, BWPort1, 3>; // Float -> Float size conversion.
+defm : BWWriteResPair<WriteCvtSS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : BWWriteResPair<WriteCvtSD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : BWWriteResPair<WriteCvtI2SS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PSY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : BWWriteResPair<WriteCvtI2SD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PDY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : BWWriteResPair<WriteCvtSS2SD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PDY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : BWWriteResPair<WriteCvtSD2SS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PSY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [BWPort1,BWPort5], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [BWPort1,BWPort4,BWPort237], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [BWPort1,BWPort4,BWPort237], 7, [1,1,1], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
// Strings instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
-// String instructions.
def : WriteRes<WritePCmpIStrM, [BWPort0]> {
- let Latency = 10;
+ let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [BWPort0, BWPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort16, BWPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort5, BWPort015, BWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [BWPort05, BWPort16, BWPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
-}
- // Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrMLd, [BWPort0, BWPort5, BWPort23, BWPort015, BWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [BWPort0]> {
let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [BWPort0, BWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [BWPort05, BWPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [BWPort0, BWPort5, BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort16, BWPort5, BWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSKY, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteMMXMOVMSK, [BWPort0]> { let Latency = 1; }
+
// AES instructions.
def : WriteRes<WriteAESDecEnc, [BWPort5]> { // Decryption, encryption.
let Latency = 7;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
def : WriteRes<WriteAESDecEncLd, [BWPort5, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
+
def : WriteRes<WriteAESIMC, [BWPort5]> { // InvMixColumn.
let Latency = 14;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
def : WriteRes<WriteAESIMCLd, [BWPort5, BWPort23]> {
- let Latency = 14;
- let ResourceCycles = [2, 1];
+ let Latency = 19;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5]> { // Key Generation.
- let Latency = 10;
- let ResourceCycles = [2, 8];
+
+def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5, BWPort015]> { // Key Generation.
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
}
-def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23, BWPort015]> {
+ let Latency = 33;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [BWPort0, BWPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteCLMulLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
-}
+defm : BWWriteResPair<WriteCLMul, [BWPort0], 5>;
// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : BWWriteResPair<WriteFShuffle256, BWPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : BWWriteResPair<WriteShuffle256, BWPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [BWPort0, BWPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector variable shuffles.
+defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector variable shuffles.
// Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
@@ -279,33 +538,22 @@ def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def Writ
// Fence instructions.
def : WriteRes<WriteFence, [BWPort23, BWPort4]>;
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [BWPort0,BWPort23,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [BWPort4,BWPort5,BWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
// Nop, not very useful except that it provides a model for nops!
def : WriteRes<WriteNop, []>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [BWPort1]> {
- let Latency = 3;
-}
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [BWPort1, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [BWPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [BWPort15, BWPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : BWWriteResPair<WriteFHAdd, [BWPort1,BWPort5], 5, [1,2], 3, 5>;
+defm : BWWriteResPair<WriteFHAddY, [BWPort1,BWPort5], 5, [1,2], 3, 6>;
+defm : BWWriteResPair<WritePHAdd, [BWPort5,BWPort15], 3, [2,1], 3, 5>;
+defm : BWWriteResPair<WritePHAddX, [BWPort5,BWPort15], 3, [2,1], 3, 5>;
+defm : BWWriteResPair<WritePHAddY, [BWPort5,BWPort15], 3, [2,1], 3, 6>;
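// Editorial note (sketch, assuming the common X86 WriteResPair layout): in the
// long form above, e.g. BWWriteResPair<WriteFHAdd, [BWPort1,BWPort5], 5, [1,2], 3, 5>,
// the arguments are taken to be ports, latency, per-port resource cycles,
// micro-op count, and the extra latency charged to the folded-load variant,
// so the load form would come out roughly as:
//   def : WriteRes<WriteFHAddLd, [BWPort23, BWPort1, BWPort5]> {
//     let Latency = 10; let NumMicroOps = 4; let ResourceCycles = [1,1,2];
//   }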
// Remaining instrs.
@@ -314,264 +562,23 @@ def BWWriteResGroup1 : SchedWriteRes<[BWPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "MOVPQIto64rr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSLLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSLLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSLLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRADri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRAWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "PSRLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLDYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRADYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRADri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWYri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWri")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSYrr")>;
-def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQ(Y?)rr",
+ "VPSRLVQ(Y?)rr")>;
def BWWriteResGroup2 : SchedWriteRes<[BWPort1]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup2], (instregex "COMP_FST0r")>;
-def: InstRW<[BWWriteResGroup2], (instregex "COM_FST0r")>;
-def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[BWWriteResGroup2], (instregex "UCOM_FPr")>;
-def: InstRW<[BWWriteResGroup2], (instregex "UCOM_Fr")>;
-def: InstRW<[BWWriteResGroup2], (instregex "VMASKMOVDQU")>;
+def: InstRW<[BWWriteResGroup2], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
def BWWriteResGroup3 : SchedWriteRes<[BWPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup3], (instregex "ANDNPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ANDNPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ANDPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ANDPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "INSERTPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOV64toPQIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVAPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVAPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVDDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVHLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSHDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "ORPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKSSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKSSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKUSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PACKUSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PALIGNRrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PBLENDWrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFDri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFHWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSHUFLWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSLLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PSRLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "SHUFPDrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "SHUFPSrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VANDPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VINSERTPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVHLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VORPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQYri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSYrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSrri")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPDYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPSYrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VXORPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "XORPDrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "XORPSrr")>;
def BWWriteResGroup4 : SchedWriteRes<[BWPort6]> {
let Latency = 1;
@@ -585,561 +592,93 @@ def BWWriteResGroup5 : SchedWriteRes<[BWPort01]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup5], (instregex "FINCSTP")>;
-def: InstRW<[BWWriteResGroup5], (instregex "FNOP")>;
+def: InstRW<[BWWriteResGroup5], (instrs FINCSTP, FNOP)>;
def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADCX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADOX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CDQ")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "CQO")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JAE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JAE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JA_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JA_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JBE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JBE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JB_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JB_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JGE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JGE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JG_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JG_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JLE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JLE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JL_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JL_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JMP_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JMP_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNE_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNE_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNO_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNO_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNP_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNP_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNS_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JNS_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JO_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JO_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JP_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JP_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JS_1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "JS_4")>;
-def: InstRW<[BWWriteResGroup6], (instregex "RORX(32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR8r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SAR8ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SARX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETAEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETBr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETGEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETGr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETLEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETLr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNEr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNOr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNPr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETNSr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETOr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETPr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SETSr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL8r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHL8ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR8r1")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHR8ri")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SHRX(32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instrs CDQ, CQO)>;
+def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "LEA(16|32|64)(_32)?r")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDQirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PABSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PABSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PABSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PADDWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PAVGBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PAVGWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMAXUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PMINUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSIGNBrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSIGNDrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSIGNWrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "PSUBWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPABSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDQYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPADDWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWrr128")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWYrr")>;
-def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def BWWriteResGroup8 : SchedWriteRes<[BWPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup8], (instregex "BLENDPDrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "BLENDPSrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDNirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PORirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_PXORirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PANDNrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PANDrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PORrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "PXORrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDYrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSYrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDNYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDNrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPANDrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDDYrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDDrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPORYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPORrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPXORYrr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VPXORrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDD(Y?)rri")>;
def BWWriteResGroup9 : SchedWriteRes<[BWPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CBW")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CLC")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMC")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CWDE")>;
-def: InstRW<[BWWriteResGroup9], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "DEC8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "INC(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "INC8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "LAHF")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NEG8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NOOP")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "NOT8r")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SAHF")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SIDT64m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SLDT64m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SMSW16m")>;
-def: InstRW<[BWWriteResGroup9], (instregex "STC")>;
-def: InstRW<[BWWriteResGroup9], (instregex "STRm")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SYSCALL")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "TEST8rr")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)rr(_REV)?")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR8i8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def BWWriteResGroup10 : SchedWriteRes<[BWPort4,BWPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup10], (instregex "FBSTPm")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOV8mi")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOV8mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVAPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVAPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVDQAmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVDQUmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVHPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVHPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVLPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVLPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTDQmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTI_64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVPDI2DImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVPQI2QImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVPQIto64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVSDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVSSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVUPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "MOVUPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "ST_FP32m")>;
-def: InstRW<[BWWriteResGroup10], (instregex "ST_FP64m")>;
-def: InstRW<[BWWriteResGroup10], (instregex "ST_FP80m")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVLPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVLPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVSDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVSSmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSYmr")>;
-def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSmr")>;
-
-def BWWriteResGroup11 : SchedWriteRes<[BWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPDrr0")>;
-def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPSrr0")>;
-def: InstRW<[BWWriteResGroup11], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PBLENDVBrr0")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRBrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRDrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRQrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "PINSRWrri")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRBrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRDrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRQrr")>;
-def: InstRW<[BWWriteResGroup11], (instregex "VPINSRWrri")>;
+def: InstRW<[BWWriteResGroup10], (instregex "FBSTPm",
+ "ST_FP(32|64|80)m")>;
def BWWriteResGroup12 : SchedWriteRes<[BWPort01]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[BWWriteResGroup12], (instregex "FDECSTP")>;
+def: InstRW<[BWWriteResGroup12], (instrs FDECSTP)>;
def BWWriteResGroup13 : SchedWriteRes<[BWPort06]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROL8r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROL8ri")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR8r1")>;
-def: InstRW<[BWWriteResGroup13], (instregex "ROR8ri")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri")>;
def BWWriteResGroup14 : SchedWriteRes<[BWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[BWWriteResGroup14], (instregex "LFENCE")>;
-def: InstRW<[BWWriteResGroup14], (instregex "MFENCE")>;
-def: InstRW<[BWWriteResGroup14], (instregex "WAIT")>;
-def: InstRW<[BWWriteResGroup14], (instregex "XGETBV")>;
+def: InstRW<[BWWriteResGroup14], (instrs LFENCE,
+ MFENCE,
+ WAIT,
+ XGETBV)>;
def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup15], (instregex "CVTPS2PDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "CVTSS2SDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "EXTRACTPSrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRBrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWri")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSLLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSLLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSLLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRADrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRAWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PSRLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "PTESTrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRBrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWri")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSLLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSLLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSLLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRADrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRAWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRLDrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRLQrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPSRLWrr")>;
-def: InstRW<[BWWriteResGroup15], (instregex "VPTESTrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "(V?)CVTPS2PDrr",
+ "(V?)CVTSS2SDrr")>;
def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 2;
@@ -1160,76 +699,27 @@ def BWWriteResGroup18 : SchedWriteRes<[BWPort237,BWPort0156]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup18], (instregex "SFENCE")>;
-
-def BWWriteResGroup19 : SchedWriteRes<[BWPort06,BWPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup19], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[BWWriteResGroup19], (instregex "BSWAP(16|32|64)r")>;
+def: InstRW<[BWWriteResGroup18], (instrs SFENCE)>;
def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup20], (instregex "ADC8i8")>;
-def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri")>;
-def: InstRW<[BWWriteResGroup20], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup20], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup20], (instregex "CWD")>;
-def: InstRW<[BWWriteResGroup20], (instregex "JRCXZ")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SBB8i8")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SBB8ri")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SETAr")>;
-def: InstRW<[BWWriteResGroup20], (instregex "SETBEr")>;
-
-def BWWriteResGroup21 : SchedWriteRes<[BWPort4,BWPort5,BWPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup21], (instregex "EXTRACTPSmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRBmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRDmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRQmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "PEXTRWmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "STMXCSR")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRBmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRDmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRQmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRWmr")>;
-def: InstRW<[BWWriteResGroup21], (instregex "VSTMXCSR")>;
+def: InstRW<[BWWriteResGroup20], (instrs CWD)>;
+def: InstRW<[BWWriteResGroup20], (instrs JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[BWWriteResGroup20], (instregex "ADC8i8",
+ "ADC8ri",
+ "SBB8i8",
+ "SBB8ri",
+ "SET(A|BE)r")>;
def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup22], (instregex "FNSTCW16m")>;
-
-def BWWriteResGroup23 : SchedWriteRes<[BWPort4,BWPort237,BWPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup23], (instregex "SETAEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETBm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETGEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETGm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETLEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETLm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNEm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNOm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNPm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETNSm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETOm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETPm")>;
-def: InstRW<[BWWriteResGroup23], (instregex "SETSm")>;
+def: InstRW<[BWWriteResGroup22], (instrs FNSTCW16m)>;
def BWWriteResGroup24 : SchedWriteRes<[BWPort4,BWPort237,BWPort15]> {
let Latency = 2;
@@ -1243,247 +733,55 @@ def BWWriteResGroup25 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[BWWriteResGroup25], (instregex "PUSH64i8")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSB")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSL")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSQ")>;
-def: InstRW<[BWWriteResGroup25], (instregex "STOSW")>;
-
-def BWWriteResGroup26 : SchedWriteRes<[BWPort0]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPDrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPSrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "PMOVMSKBrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[BWWriteResGroup25], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;
def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup27], (instregex "ADDPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADD_FPrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADD_FST0r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "ADD_FrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPPDrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPPSrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CMPSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "COMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "COMISSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CVTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "IMUL(32|64)rr(i8)?")>;
-def: InstRW<[BWWriteResGroup27], (instregex "IMUL8r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "MUL8r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FPrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FST0r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUBSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUB_FPrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUB_FST0r")>;
-def: InstRW<[BWWriteResGroup27], (instregex "SUB_FrST0")>;
-def: InstRW<[BWWriteResGroup27], (instregex "TZCNT(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "UCOMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "UCOMISSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDYrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSYrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSrri")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCMPSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCOMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCOMISSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSYrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBSDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VSUBSSrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISDrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr",
+ "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8",
+ "(V?)CVTDQ2PS(Y?)rr")>;
def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> {
- let Latency = 3;
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup27_16], (instregex "IMUL16rr(i8)?")>;
+def: InstRW<[BWWriteResGroup27_16], (instrs IMUL16rri, IMUL16rri8)>;
def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VINSERTF128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VINSERTI128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERM2F128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERM2I128rr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMPDYri")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMPSYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPERMQYri")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWQYrr")>;
-
-def BWWriteResGroup29 : SchedWriteRes<[BWPort01]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup29], (instregex "MULPDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "MULPSrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "MULSDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "MULSSrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPDYrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPSYrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULPSrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULSDrr")>;
-def: InstRW<[BWWriteResGroup29], (instregex "VMULSSrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBrr",
+ "VPBROADCASTWrr")>;
def BWWriteResGroup30 : SchedWriteRes<[BWPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[BWWriteResGroup30], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup30], (instregex "XADD8rr")>;
-def: InstRW<[BWWriteResGroup30], (instregex "XCHG8rr")>;
-
-def BWWriteResGroup31 : SchedWriteRes<[BWPort0,BWPort5]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDYrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDYrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDYrr")>;
-def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDrr")>;
-
-def BWWriteResGroup32 : SchedWriteRes<[BWPort5,BWPort15]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBSWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBWrr64")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHADDDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHADDSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHADDWrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHSUBDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHSUBSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "PHSUBWrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr256")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr256")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWYrr")>;
-def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWrr")>;
+def: InstRW<[BWWriteResGroup30], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;
def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 3;
@@ -1497,30 +795,21 @@ def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCL8r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCL8ri")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR8r1")>;
-def: InstRW<[BWWriteResGroup35], (instregex "RCR8ri")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;
def BWWriteResGroup36 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup36], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "ROL8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "ROR8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SAR8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHL8rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup36], (instregex "SHR8rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> {
let Latency = 3;
@@ -1534,31 +823,18 @@ def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup38], (instregex "CALL64pcrel32")>;
-def: InstRW<[BWWriteResGroup38], (instregex "SETAm")>;
-def: InstRW<[BWWriteResGroup38], (instregex "SETBEm")>;
+def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>;
+def: InstRW<[BWWriteResGroup38], (instregex "SET(A|BE)m")>;
def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "(V?)CVT(T?)SD2SI64rr",
+ "(V?)CVT(T?)SD2SIrr",
+ "(V?)CVT(T?)SS2SI64rr",
+ "(V?)CVT(T?)SS2SIrr")>;
def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> {
let Latency = 4;
@@ -1566,241 +842,98 @@ def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> {
let ResourceCycles = [1,1];
}
def: InstRW<[BWWriteResGroup40], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSLLDYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSLLQYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSLLWYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRADYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRAWYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRLDYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRLQYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPSRLWYrr")>;
-def: InstRW<[BWWriteResGroup40], (instregex "VPTESTYrr")>;
def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup41], (instregex "FNSTSW16r")>;
+def: InstRW<[BWWriteResGroup41], (instrs FNSTSW16r)>;
def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup42], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2DQrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2PSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSD2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSI642SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "IMUL(32|64)r")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MUL(32|64)r")>;
-def: InstRW<[BWWriteResGroup42], (instregex "MULX64rr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTDQ2PDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[BWWriteResGroup42], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[BWWriteResGroup42], (instrs IMUL64r, MUL64r, MULX64rr)>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr",
+ "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVTDQ2PDrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI642SDrr",
+ "(V?)CVTSI2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVT(T?)PD2DQrr")>;
def BWWriteResGroup42_16 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[BWWriteResGroup42_16], (instregex "IMUL16r")>;
-def: InstRW<[BWWriteResGroup42_16], (instregex "MUL16r")>;
+def: InstRW<[BWWriteResGroup42_16], (instrs IMUL16r, MUL16r)>;
def BWWriteResGroup43 : SchedWriteRes<[BWPort0,BWPort4,BWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup43], (instregex "FNSTSWm")>;
+def: InstRW<[BWWriteResGroup43], (instrs FNSTSWm)>;
def BWWriteResGroup44 : SchedWriteRes<[BWPort1,BWPort4,BWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP16m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP32m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP64m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_F16m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_F32m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_FP16m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_FP32m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "IST_FP64m")>;
-def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHYmr")>;
-def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHmr")>;
+def: InstRW<[BWWriteResGroup44], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[BWWriteResGroup45], (instregex "FNCLEX")>;
+def: InstRW<[BWWriteResGroup45], (instrs FNCLEX)>;
def BWWriteResGroup46 : SchedWriteRes<[BWPort015,BWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[BWWriteResGroup46], (instregex "VZEROUPPER")>;
+def: InstRW<[BWWriteResGroup46], (instrs VZEROUPPER)>;
def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MUL_FPrST0")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MUL_FST0r")>;
-def: InstRW<[BWWriteResGroup47], (instregex "MUL_FrST0")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PCLMULQDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PCMPGTQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMADDUBSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMADDWDrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULHRSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULHUWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULHWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULLWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PMULUDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "PSADBWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RCPPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RCPSSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RSQRTPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "RSQRTSSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPCLMULQDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWYrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWrr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRCPPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRCPSSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTPSr")>;
-def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTSSr")>;
-
-def BWWriteResGroup48 : SchedWriteRes<[BWPort01]> {
- let Latency = 5;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup48],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
+def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr",
+ "MUL_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup49], (instregex "LDDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOV64toPQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOV8rm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVAPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVAPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVNTDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSHDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSLDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVUPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVUPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[BWWriteResGroup49], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHNTA")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT0")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT1")>;
-def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT2")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VLDDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQUrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVNTDQArm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVSSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPSrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTQrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "VBROADCASTSSrm",
+ "(V?)MOVDDUPrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCASTDrm",
+ "VPBROADCASTQrm")>;
def BWWriteResGroup50 : SchedWriteRes<[BWPort1,BWPort5]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[BWWriteResGroup50], (instregex "CVTSI642SSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HADDPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HADDPSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HSUBPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "HSUBPSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSYrr")>;
-def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "(V?)CVTSI642SSrr")>;
def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> {
let Latency = 5;
@@ -1810,482 +943,125 @@ def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> {
def: InstRW<[BWWriteResGroup51], (instregex "STR(16|32|64)r")>;
def BWWriteResGroup52 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup52], (instregex "MULX32rr")>;
-
-def BWWriteResGroup53 : SchedWriteRes<[BWPort0,BWPort4,BWPort237,BWPort15]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVQmr")>;
+def: InstRW<[BWWriteResGroup52], (instrs IMUL32r, MUL32r, MULX32rr)>;
def BWWriteResGroup54 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[BWWriteResGroup54], (instregex "PAUSE")>;
+def: InstRW<[BWWriteResGroup54], (instrs PAUSE)>;
def BWWriteResGroup55 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[BWWriteResGroup55], (instregex "XSETBV")>;
+def: InstRW<[BWWriteResGroup55], (instrs XSETBV)>;
def BWWriteResGroup56 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG8rr")>;
+def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG(8|16|32|64)rr")>;
def BWWriteResGroup57 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[BWWriteResGroup57], (instregex "PUSHF16")>;
-def: InstRW<[BWWriteResGroup57], (instregex "PUSHF64")>;
+def: InstRW<[BWWriteResGroup57], (instregex "PUSHF(16|64)")>;
def BWWriteResGroup58 : SchedWriteRes<[BWPort23]> {
let Latency = 6;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup58], (instregex "LD_F32m")>;
-def: InstRW<[BWWriteResGroup58], (instregex "LD_F64m")>;
-def: InstRW<[BWWriteResGroup58], (instregex "LD_F80m")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTF128")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTI128")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VLDDQUYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPSYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQAYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQUYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPSYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTQYrm")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSSr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPDr")>;
-def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPSr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCASTDYrm",
+ "VPBROADCASTQYrm")>;
def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup59], (instregex "CVTPS2PDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "CVTSS2SDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRADrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VPSLLVQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VPSRLVQrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VTESTPDrm")>;
-def: InstRW<[BWWriteResGroup59], (instregex "VTESTPSrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "(V?)CVTPS2PDrm",
+ "(V?)CVTSS2SDrm",
+ "VPSLLVQrm",
+ "VPSRLVQrm")>;
def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[BWWriteResGroup60], (instregex "VCVTTPD2DQYrr")>;
-
-def BWWriteResGroup61 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup61], (instregex "ANDNPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ANDNPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ANDPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ANDPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "INSERTPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "MOVLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "ORPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKSSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKSSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKUSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PACKUSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PALIGNRrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PBLENDWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PINSRWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFDmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFHWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PSHUFLWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "SHUFPDrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "SHUFPSrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDNPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDNPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VANDPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VINSERTPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VORPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSDWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSWBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPALIGNRrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPBLENDWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPINSRWrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFBrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFDmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFHWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFLWmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPDrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPSrmi")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VXORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "VXORPSrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "XORPDrm")>;
-def: InstRW<[BWWriteResGroup61], (instregex "XORPSrm")>;
+def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr",
+ "VCVTPD2PSYrr",
+ "VCVT(T?)PD2DQYrr")>;
def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup62], (instregex "FARJMP64")>;
-def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup62], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;
def BWWriteResGroup63 : SchedWriteRes<[BWPort23,BWPort06]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup63], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "ADC8rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "ADCX(32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "ADOX(32|64)rm")>;
def: InstRW<[BWWriteResGroup63], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "RORX(32|64)mi")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SARX(32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SBB8rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SHLX(32|64)rm")>;
-def: InstRW<[BWWriteResGroup63], (instregex "SHRX(32|64)rm")>;
def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDQirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "MOVBE(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PABSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PABSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PABSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PADDWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PAVGBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PAVGWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMAXUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PMINUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSIGNBrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSIGNDrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSIGNWrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "PSUBWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPABSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPABSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPABSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPADDWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPAVGBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPAVGWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINSDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINUBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINUDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPMINUWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNBrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNDrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNWrm128")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBDrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBQrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSBrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSWrm")>;
-def: InstRW<[BWWriteResGroup64], (instregex "VPSUBWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
def BWWriteResGroup65 : SchedWriteRes<[BWPort23,BWPort015]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup65], (instregex "BLENDPDrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "BLENDPSrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDNirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PORirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "MMX_PXORirm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PANDNrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PANDrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PORrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "PXORrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPDrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPSrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VINSERTF128rm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VINSERTI128rm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPANDNrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPANDrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPBLENDDrmi")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPORrm")>;
-def: InstRW<[BWWriteResGroup65], (instregex "VPXORrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VINSERTF128rm",
+ "VINSERTI128rm",
+ "VPBLENDDrmi")>;
def BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup66], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "ADD8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "AND8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP8mi")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP8mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "CMP8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "OR8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[BWWriteResGroup66], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "SUB8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "TEST8mi")>;
-def: InstRW<[BWWriteResGroup66], (instregex "TEST8mr")>;
-def: InstRW<[BWWriteResGroup66], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "XOR8rm")>;
+def: InstRW<[BWWriteResGroup66], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)rmr")>;
def BWWriteResGroup67 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[BWWriteResGroup67], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> {
let Latency = 6;
@@ -2299,665 +1075,209 @@ def BWWriteResGroup69 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup69], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup69], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup69], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR8m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SAR8mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL8m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHL8mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR8m1")>;
-def: InstRW<[BWWriteResGroup69], (instregex "SHR8mi")>;
+def: InstRW<[BWWriteResGroup69], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def BWWriteResGroup70 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "ADD8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "ADD8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "AND8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "DEC8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "INC(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "INC8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NEG8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "NOT8m")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "OR8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[BWWriteResGroup70], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "SUB8mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR8mi")>;
-def: InstRW<[BWWriteResGroup70], (instregex "XOR8mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
def BWWriteResGroup71 : SchedWriteRes<[BWPort6,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[BWWriteResGroup71], (instregex "STD")>;
-
-def BWWriteResGroup72 : SchedWriteRes<[BWPort5]> {
- let Latency = 7;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup72], (instregex "AESDECLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "AESDECrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "AESENCLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "AESENCrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESDECLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESDECrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESENCLASTrr")>;
-def: InstRW<[BWWriteResGroup72], (instregex "VAESENCrr")>;
+def: InstRW<[BWWriteResGroup71], (instrs STD)>;
def BWWriteResGroup73 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLDYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSLLWYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRADYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRAWYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLDYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLVQYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VPSRLWYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VTESTPDYrm")>;
-def: InstRW<[BWWriteResGroup73], (instregex "VTESTPSYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm",
+ "VPSRLVQYrm")>;
def BWWriteResGroup74 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup74], (instregex "FCOM32m")>;
-def: InstRW<[BWWriteResGroup74], (instregex "FCOM64m")>;
-def: InstRW<[BWWriteResGroup74], (instregex "FCOMP32m")>;
-def: InstRW<[BWWriteResGroup74], (instregex "FCOMP64m")>;
-
-def BWWriteResGroup75 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup75], (instregex "VANDNPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VANDNPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VANDPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VANDPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VORPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VORPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFBYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFDYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VXORPDYrm")>;
-def: InstRW<[BWWriteResGroup75], (instregex "VXORPSYrm")>;
-
-def BWWriteResGroup76 : SchedWriteRes<[BWPort23,BWPort15]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup76], (instregex "VPABSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPABSDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPABSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDQYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPADDWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPAVGBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPAVGWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINSDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINUBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINUDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPMINUWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBDYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBQYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSWYrm")>;
-def: InstRW<[BWWriteResGroup76], (instregex "VPSUBWYrm")>;
+def: InstRW<[BWWriteResGroup74], (instregex "FCOM(P?)(32|64)m")>;
def BWWriteResGroup77 : SchedWriteRes<[BWPort23,BWPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPANDNYrm")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPANDYrm")>;
def: InstRW<[BWWriteResGroup77], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPORYrm")>;
-def: InstRW<[BWWriteResGroup77], (instregex "VPXORYrm")>;
-
-def BWWriteResGroup78 : SchedWriteRes<[BWPort0,BWPort5]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[BWWriteResGroup78], (instregex "MPSADBWrri")>;
-def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWYrri")>;
-def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWrri")>;
def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPDrm0")>;
-def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPSrm0")>;
-def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKUSWBirm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "PBLENDVBrm0")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPDrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPSrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VPBLENDVBrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVQrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;
def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[BWWriteResGroup80], (instregex "LEAVE64")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASB")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASL")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASQ")>;
-def: InstRW<[BWWriteResGroup80], (instregex "SCASW")>;
-
-def BWWriteResGroup81 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup81], (instregex "PSLLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSLLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSLLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRADrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRAWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PSRLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "PTESTrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSLLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSLLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSLLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRADrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRAWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRLDrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRLQrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPSRLWrm")>;
-def: InstRW<[BWWriteResGroup81], (instregex "VPTESTrm")>;
+def: InstRW<[BWWriteResGroup80], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
def BWWriteResGroup82 : SchedWriteRes<[BWPort0,BWPort01,BWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup82], (instregex "FLDCW16m")>;
-
-def BWWriteResGroup83 : SchedWriteRes<[BWPort0,BWPort23,BWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup83], (instregex "LDMXCSR")>;
-def: InstRW<[BWWriteResGroup83], (instregex "VLDMXCSR")>;
+def: InstRW<[BWWriteResGroup82], (instrs FLDCW16m)>;
def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup84], (instregex "LRETQ")>;
-def: InstRW<[BWWriteResGroup84], (instregex "RETQ")>;
-
-def BWWriteResGroup85 : SchedWriteRes<[BWPort23,BWPort06,BWPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup85], (instregex "BEXTR(32|64)rm")>;
-
-def BWWriteResGroup86 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup86], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup86], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup84], (instrs LRETQ, RETQ)>;
def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROL8m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROL8mi")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR8m1")>;
-def: InstRW<[BWWriteResGroup87], (instregex "ROR8mi")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[BWWriteResGroup88], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup88], (instregex "XADD8rm")>;
+def: InstRW<[BWWriteResGroup88], (instregex "XADD(8|16|32|64)rm")>;
def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup89], (instregex "FARCALL64")>;
+def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;
def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 7;
let ResourceCycles = [2,2,1,2];
}
-def: InstRW<[BWWriteResGroup90], (instregex "LOOP")>;
+def: InstRW<[BWWriteResGroup90], (instrs LOOP)>;
def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup91], (instregex "ADDPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPPDrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPPSrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CMPSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "COMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "COMISSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CVTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "IMUL64m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "IMUL(32|64)rm(i8)?")>;
-def: InstRW<[BWWriteResGroup91], (instregex "IMUL8m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MUL64m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "MUL8m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "SUBSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "TZCNT(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "UCOMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "UCOMISSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPPDrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPPSrmi")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCMPSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCOMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCOMISSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBPDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBPSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBSDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VSUBSSrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISDrm")>;
-def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPI2PSirm",
+ "PDEP(32|64)rm",
+ "PEXT(32|64)rm",
+ "(V?)CVTDQ2PSrm")>;
def BWWriteResGroup91_16 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup91_16], (instregex "IMUL16rm(i8)?")>;
+def: InstRW<[BWWriteResGroup91_16], (instrs IMUL16rmi, IMUL16rmi8)>;
-def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
- let Latency = 8;
+def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort06, BWPort0156, BWPort23]> {
+ let Latency = 9;
let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[BWWriteResGroup91_16_2], (instregex "IMUL16m")>;
-def: InstRW<[BWWriteResGroup91_16_2], (instregex "MUL16m")>;
-
-def BWWriteResGroup91_32 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup91_32], (instregex "IMUL32m")>;
-def: InstRW<[BWWriteResGroup91_32], (instregex "MUL32m")>;
+def: InstRW<[BWWriteResGroup91_16_2], (instrs IMUL16m, MUL16m)>;
def BWWriteResGroup92 : SchedWriteRes<[BWPort5,BWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[BWWriteResGroup92], (instregex "VPMOVZXWDYrm")>;
-
-def BWWriteResGroup93 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup93], (instregex "MULPDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "MULPSrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "MULSDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "MULSSrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULPDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULPSrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULSDrm")>;
-def: InstRW<[BWWriteResGroup93], (instregex "VMULSSrm")>;
-
-def BWWriteResGroup94 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPSYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVQYrm")>;
-
-def BWWriteResGroup95 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup95], (instregex "VPSLLVDrm")>;
-def: InstRW<[BWWriteResGroup95], (instregex "VPSRAVDrm")>;
-def: InstRW<[BWWriteResGroup95], (instregex "VPSRLVDrm")>;
-
-def BWWriteResGroup96 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBSWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBWrm64")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHADDDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHADDSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHADDWrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHSUBDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHSUBSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "PHSUBWrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHADDDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHADDSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHADDWrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBDrm")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBSWrm128")>;
-def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBWrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXBWYrm",
+ "VPMOVSXDQYrm",
+ "VPMOVSXWDYrm",
+ "VPMOVSXWQYrm",
+ "VPMOVZXWDYrm")>;
def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCL8m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCL8mi")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR8m1")>;
-def: InstRW<[BWWriteResGroup97], (instregex "RCR8mi")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;
def BWWriteResGroup98 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[BWWriteResGroup98], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup98], (instregex "ROR8mCL")>;
+def: InstRW<[BWWriteResGroup98], (instregex "ROR(8|16|32|64)mCL")>;
def BWWriteResGroup99 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[BWWriteResGroup99], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "ADC8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "ADD8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "AND8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "OR8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "SUB8mi")>;
-def: InstRW<[BWWriteResGroup99], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup99], (instregex "XCHG8rm")>;
-def: InstRW<[BWWriteResGroup99], (instregex "XOR8mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "XCHG(8|16|32|64)rm")>;
def BWWriteResGroup100 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[BWWriteResGroup100], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "ADC8mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG8rm")>;
-def: InstRW<[BWWriteResGroup100], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "ROL8mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SAR8mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB8mi")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SBB8mr")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHL8mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup100], (instregex "SHR8mCL")>;
+def : SchedAlias<WriteADCRMW, BWWriteResGroup100>;
+def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG(8|16|32|64)rm",
+ "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
def BWWriteResGroup101 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup101], (instregex "ADD_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ADD_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ILD_F16m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ILD_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "ILD_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUB_F32m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "SUB_F64m")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDPDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDPSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCMPPDYrmi")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCMPPSYrmi")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VSUBPDYrm")>;
-def: InstRW<[BWWriteResGroup101], (instregex "VSUBPSYrm")>;
-
-def BWWriteResGroup102 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup102], (instregex "VPERM2F128rm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERM2I128rm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMDYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMPDYmi")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMPSYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPERMQYmi")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXWQYrm")>;
-
-def BWWriteResGroup103 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup103], (instregex "VMULPDYrm")>;
-def: InstRW<[BWWriteResGroup103], (instregex "VMULPSYrm")>;
-
-def BWWriteResGroup104 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup104], (instregex "DPPDrri")>;
-def: InstRW<[BWWriteResGroup104], (instregex "VDPPDrri")>;
+def: InstRW<[BWWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VCVTPS2DQYrm",
+ "VCVTTPS2DQYrm")>;
def BWWriteResGroup105 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "(V?)CVTSS2SI(64)?rm",
+ "(V?)CVT(T?)SD2SI64rm",
+ "(V?)CVT(T?)SD2SIrm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVTTSS2SIrm")>;
def BWWriteResGroup106 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
let Latency = 9;
@@ -2971,56 +1291,29 @@ def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup107], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2DQrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2PSrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTSD2SSrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTTPD2PIirm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "MULX64rm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "VCVTDQ2PDrm")>;
-def: InstRW<[BWWriteResGroup107], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[BWWriteResGroup107], (instrs IMUL64m, MUL64m, MULX64rm)>;
+def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVTPI2PDirm",
+ "MMX_CVT(T?)PD2PIirm",
+ "(V?)CVTDQ2PDrm",
+ "(V?)CVTSD2SSrm")>;
def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWrm")>;
-
-def BWWriteResGroup109 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup109], (instregex "VPSLLVDYrm")>;
-def: InstRW<[BWWriteResGroup109], (instregex "VPSRAVDYrm")>;
-def: InstRW<[BWWriteResGroup109], (instregex "VPSRLVDYrm")>;
-
-def BWWriteResGroup110 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup110], (instregex "VPHADDDYrm")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHADDSWrm256")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHADDWYrm")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBDYrm")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBSWrm256")>;
-def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBWYrm")>;
+def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm",
+ "VPBROADCASTW(Y?)rm")>;
def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[BWWriteResGroup111], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
let Latency = 9;
@@ -3034,103 +1327,22 @@ def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[BWWriteResGroup113], (instregex "LSL(16|32|64)rm")>;
-
-def BWWriteResGroup114 : SchedWriteRes<[BWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[BWWriteResGroup114], (instregex "PMULLDrr")>;
-def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDYrr")>;
-def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDrr")>;
+def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PCLMULQDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PCMPGTQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMADDUBSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMADDWDrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULHRSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULHUWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULHWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULLWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PMULUDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "PSADBWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RCPPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RCPSSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RSQRTPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "RSQRTSSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPCLMULQDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPCMPGTQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMADDWDrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULHRSWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULHUWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULHWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULLWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPMULUDQrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VPSADBWrm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRCPPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRCPSSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTPSm")>;
-def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTSSm")>;
-
-def BWWriteResGroup116 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup116],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+def: InstRW<[BWWriteResGroup115], (instregex "(V?)PCMPGTQrm")>;
def BWWriteResGroup117 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup117], (instregex "FICOM16m")>;
-def: InstRW<[BWWriteResGroup117], (instregex "FICOM32m")>;
-def: InstRW<[BWWriteResGroup117], (instregex "FICOMP16m")>;
-def: InstRW<[BWWriteResGroup117], (instregex "FICOMP32m")>;
-
-def BWWriteResGroup118 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[BWWriteResGroup118], (instregex "VPTESTYrm")>;
-
-def BWWriteResGroup119 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup119], (instregex "HADDPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "HADDPSrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "HSUBPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "HSUBPSrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHADDPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHADDPSrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPDrm")>;
-def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPSrm")>;
+def: InstRW<[BWWriteResGroup117], (instregex "FICOM(P?)(16|32)m")>;
def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
let Latency = 10;
@@ -3140,79 +1352,26 @@ def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
def: InstRW<[BWWriteResGroup120], (instregex "CVTTSS2SI64rm")>;
def BWWriteResGroup121 : SchedWriteRes<[BWPort1,BWPort23,BWPort06,BWPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[BWWriteResGroup121], (instregex "MULX32rm")>;
+def: InstRW<[BWWriteResGroup121], (instrs IMUL32m, MUL32m, MULX32rm)>;
-def BWWriteResGroup122 : SchedWriteRes<[BWPort0]> {
+def BWWriteResGroup122_1 : SchedWriteRes<[BWPort0,BWFPDivider]> {
let Latency = 11;
let NumMicroOps = 1;
- let ResourceCycles = [1];
+ let ResourceCycles = [1,3]; // Really 2.5 cycle throughput
}
-def: InstRW<[BWWriteResGroup122], (instregex "DIVPSrr")>;
-def: InstRW<[BWWriteResGroup122], (instregex "DIVSSrr")>;
-def: InstRW<[BWWriteResGroup122], (instregex "VDIVPSrr")>;
-def: InstRW<[BWWriteResGroup122], (instregex "VDIVSSrr")>;
+def : SchedAlias<WriteFDiv, BWWriteResGroup122_1>; // TODO - convert to ZnWriteResFpuPair
def BWWriteResGroup123 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup123], (instregex "MUL_F32m")>;
-def: InstRW<[BWWriteResGroup123], (instregex "MUL_F64m")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMADDWDYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULDQYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULHUWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULHWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULLWYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPMULUDQYrm")>;
-def: InstRW<[BWWriteResGroup123], (instregex "VPSADBWYrm")>;
-
-def BWWriteResGroup124 : SchedWriteRes<[BWPort01,BWPort23]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup124],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
-
-def BWWriteResGroup125 : SchedWriteRes<[BWPort0]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRIrr")>;
-def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRM128rr")>;
-
-def BWWriteResGroup126 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup126], (instregex "VRCPPSYr")>;
-def: InstRW<[BWWriteResGroup126], (instregex "VRSQRTPSYr")>;
-
-def BWWriteResGroup127 : SchedWriteRes<[BWPort1,BWPort23]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPSm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSSm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPSm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSDm")>;
-def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSSm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "MUL_F(32|64)m",
+ "VPCMPGTQYrm")>;
def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
let Latency = 11;
@@ -3221,31 +1380,21 @@ def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
}
def: InstRW<[BWWriteResGroup128], (instregex "VCVTDQ2PDYrm")>;
-def BWWriteResGroup129 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup129], (instregex "VHADDPDYrm")>;
-def: InstRW<[BWWriteResGroup129], (instregex "VHADDPSYrm")>;
-def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPDYrm")>;
-def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPSYrm")>;
-
def BWWriteResGroup130 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 11;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,1,2];
}
-def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[BWWriteResGroup130], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,2,3];
}
-def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[BWWriteResGroup131], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def BWWriteResGroup132 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> {
let Latency = 11;
@@ -3259,104 +1408,29 @@ def BWWriteResGroup133 : SchedWriteRes<[BWPort06,BWPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[BWWriteResGroup133], (instregex "LOOPE")>;
-def: InstRW<[BWWriteResGroup133], (instregex "LOOPNE")>;
-
-def BWWriteResGroup134 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup134], (instregex "AESDECLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "AESDECrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "AESENCLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "AESENCrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESDECLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESDECrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESENCLASTrm")>;
-def: InstRW<[BWWriteResGroup134], (instregex "VAESENCrm")>;
+def: InstRW<[BWWriteResGroup133], (instrs LOOPE)>;
+def: InstRW<[BWWriteResGroup133], (instrs LOOPNE)>;
def BWWriteResGroup135 : SchedWriteRes<[BWPort1,BWPort23]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI16m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI32m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI16m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI32m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUB_FI16m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "SUB_FI32m")>;
-def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPDm")>;
-def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPSm")>;
+def: InstRW<[BWWriteResGroup135], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
-def BWWriteResGroup136 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup136], (instregex "MPSADBWrmi")>;
-def: InstRW<[BWWriteResGroup136], (instregex "VMPSADBWrmi")>;
-
-def BWWriteResGroup137 : SchedWriteRes<[BWPort0]> {
- let Latency = 13;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup137], (instregex "SQRTPSr")>;
-def: InstRW<[BWWriteResGroup137], (instregex "SQRTSSr")>;
-
-def BWWriteResGroup138 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[BWWriteResGroup138], (instregex "VMPSADBWYrmi")>;
-
-def BWWriteResGroup139 : SchedWriteRes<[BWPort0]> {
+def BWWriteResGroup139_1 : SchedWriteRes<[BWPort0,BWFPDivider]> {
let Latency = 14;
let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup139], (instregex "DIVPDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "DIVSDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VDIVPDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VDIVSDrr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VSQRTPSr")>;
-def: InstRW<[BWWriteResGroup139], (instregex "VSQRTSSr")>;
-
-def BWWriteResGroup140 : SchedWriteRes<[BWPort5]> {
- let Latency = 14;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
+ let ResourceCycles = [1,4];
}
-def: InstRW<[BWWriteResGroup140], (instregex "AESIMCrr")>;
-def: InstRW<[BWWriteResGroup140], (instregex "VAESIMCrr")>;
+def : SchedAlias<WriteFDiv64, BWWriteResGroup139_1>; // TODO - convert to ZnWriteResFpuPair
def BWWriteResGroup141 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI16m")>;
-def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI32m")>;
-
-def BWWriteResGroup142 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup142], (instregex "DPPSrri")>;
-def: InstRW<[BWWriteResGroup142], (instregex "VDPPSYrri")>;
-def: InstRW<[BWWriteResGroup142], (instregex "VDPPSrri")>;
-
-def BWWriteResGroup143 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[BWWriteResGroup143], (instregex "DPPDrmi")>;
-def: InstRW<[BWWriteResGroup143], (instregex "VDPPDrmi")>;
+def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI(16|32)m")>;
def BWWriteResGroup144 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
let Latency = 14;
@@ -3377,213 +1451,92 @@ def BWWriteResGroup146 : SchedWriteRes<[BWPort0,BWPort1,BWPort6,BWPort0156]> {
let NumMicroOps = 12;
let ResourceCycles = [2,1,4,5];
}
-def: InstRW<[BWWriteResGroup146], (instregex "XCH_F")>;
+def: InstRW<[BWWriteResGroup146], (instrs XCH_F)>;
def BWWriteResGroup147 : SchedWriteRes<[BWPort0]> {
let Latency = 15;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FPrST0")>;
-def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FST0r")>;
-def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FrST0")>;
-
-def BWWriteResGroup148 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup148], (instregex "PMULLDrm")>;
-def: InstRW<[BWWriteResGroup148], (instregex "VPMULLDrm")>;
+def: InstRW<[BWWriteResGroup147], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,4,1,2];
}
-def: InstRW<[BWWriteResGroup149], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup149], (instregex "RCL8mCL")>;
+def: InstRW<[BWWriteResGroup149], (instregex "RCL(8|16|32|64)mCL")>;
-def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23]> {
+def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> {
let Latency = 16;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,5];
}
-def: InstRW<[BWWriteResGroup150], (instregex "DIVPSrm")>;
-def: InstRW<[BWWriteResGroup150], (instregex "DIVSSrm")>;
-def: InstRW<[BWWriteResGroup150], (instregex "VDIVPSrm")>;
-def: InstRW<[BWWriteResGroup150], (instregex "VDIVSSrm")>;
-
-def BWWriteResGroup151 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup151], (instregex "VPMULLDYrm")>;
-
-def BWWriteResGroup152 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRIrm")>;
-def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRM128rm")>;
+def : SchedAlias<WriteFDivLd, BWWriteResGroup150>; // TODO - convert to ZnWriteResFpuPair
def BWWriteResGroup153 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 16;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[BWWriteResGroup153], (instregex "CMPXCHG8B")>;
+def: InstRW<[BWWriteResGroup153], (instrs CMPXCHG8B)>;
def BWWriteResGroup154 : SchedWriteRes<[BWPort5]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[BWWriteResGroup154], (instregex "VZEROALL")>;
-
-def BWWriteResGroup155 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 17;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup155], (instregex "VDIVPSYrr")>;
-
-def BWWriteResGroup156 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup156], (instregex "VRCPPSYm")>;
-def: InstRW<[BWWriteResGroup156], (instregex "VRSQRTPSYm")>;
-
-def BWWriteResGroup157 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup157], (instregex "SQRTPSm")>;
-def: InstRW<[BWWriteResGroup157], (instregex "SQRTSSm")>;
-
-def BWWriteResGroup158 : SchedWriteRes<[BWPort0,BWPort5,BWPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[BWWriteResGroup158], (instregex "PCMPESTRIrr")>;
-def: InstRW<[BWWriteResGroup158], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[BWWriteResGroup154], (instrs VZEROALL)>;
def BWWriteResGroup159 : SchedWriteRes<[BWPort5,BWPort6,BWPort06,BWPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[BWWriteResGroup159], (instregex "CPUID")>;
-def: InstRW<[BWWriteResGroup159], (instregex "RDTSC")>;
+def: InstRW<[BWWriteResGroup159], (instrs CPUID)>;
+def: InstRW<[BWWriteResGroup159], (instrs RDTSC)>;
def BWWriteResGroup160 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 18;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,3,1,3];
}
-def: InstRW<[BWWriteResGroup160], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[BWWriteResGroup160], (instregex "RCR8mCL")>;
+def: InstRW<[BWWriteResGroup160], (instregex "RCR(8|16|32|64)mCL")>;
-def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23]> {
+def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> {
let Latency = 19;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,8];
}
-def: InstRW<[BWWriteResGroup161], (instregex "DIVPDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "DIVSDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VDIVPDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VDIVSDrm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VSQRTPSm")>;
-def: InstRW<[BWWriteResGroup161], (instregex "VSQRTSSm")>;
-
-def BWWriteResGroup162 : SchedWriteRes<[BWPort5,BWPort23]> {
- let Latency = 19;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup162], (instregex "AESIMCrm")>;
-def: InstRW<[BWWriteResGroup162], (instregex "VAESIMCrm")>;
-
-def BWWriteResGroup163 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
- let Latency = 19;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[BWWriteResGroup163], (instregex "DPPSrmi")>;
-def: InstRW<[BWWriteResGroup163], (instregex "VDPPSrmi")>;
-
-def BWWriteResGroup164 : SchedWriteRes<[BWPort0,BWPort5,BWPort015,BWPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[BWWriteResGroup164], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[BWWriteResGroup164], (instregex "VPCMPESTRM128rr")>;
+def : SchedAlias<WriteFDiv64Ld, BWWriteResGroup161>; // TODO - convert to ZnWriteResFpuPair
def BWWriteResGroup165 : SchedWriteRes<[BWPort0]> {
let Latency = 20;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup165], (instregex "DIV_FPrST0")>;
-def: InstRW<[BWWriteResGroup165], (instregex "DIV_FST0r")>;
-def: InstRW<[BWWriteResGroup165], (instregex "DIV_FrST0")>;
-def: InstRW<[BWWriteResGroup165], (instregex "SQRTPDr")>;
-def: InstRW<[BWWriteResGroup165], (instregex "SQRTSDr")>;
-
-def BWWriteResGroup166 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[BWWriteResGroup166], (instregex "VDPPSYrmi")>;
+def: InstRW<[BWWriteResGroup165], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup167 : SchedWriteRes<[BWPort4,BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 20;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[BWWriteResGroup167], (instregex "INSB")>;
-def: InstRW<[BWWriteResGroup167], (instregex "INSL")>;
-def: InstRW<[BWWriteResGroup167], (instregex "INSW")>;
-
-def BWWriteResGroup168 : SchedWriteRes<[BWPort0]> {
- let Latency = 21;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[BWWriteResGroup168], (instregex "VSQRTPDr")>;
-def: InstRW<[BWWriteResGroup168], (instregex "VSQRTSDr")>;
+def: InstRW<[BWWriteResGroup167], (instrs INSB, INSL, INSW)>;
def BWWriteResGroup169 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 21;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup169], (instregex "DIV_F32m")>;
-def: InstRW<[BWWriteResGroup169], (instregex "DIV_F64m")>;
-
-def BWWriteResGroup170 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 21;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup170], (instregex "VSQRTPSYr")>;
+def: InstRW<[BWWriteResGroup169], (instregex "DIV_F(32|64)m")>;
def BWWriteResGroup171 : SchedWriteRes<[BWPort0,BWPort4,BWPort5,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 21;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[BWWriteResGroup171], (instregex "CMPXCHG16B")>;
+def: InstRW<[BWWriteResGroup171], (instrs CMPXCHG16B)>;
def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
let Latency = 22;
@@ -3592,28 +1545,6 @@ def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
}
def: InstRW<[BWWriteResGroup172], (instregex "POPF64")>;
-def BWWriteResGroup173 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 23;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup173], (instregex "VDIVPDYrr")>;
-
-def BWWriteResGroup174 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 23;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup174], (instregex "VDIVPSYrm")>;
-
-def BWWriteResGroup175 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort0156]> {
- let Latency = 23;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[BWWriteResGroup175], (instregex "PCMPESTRIrm")>;
-def: InstRW<[BWWriteResGroup175], (instregex "VPCMPESTRIrm")>;
-
def BWWriteResGroup176 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
let Latency = 23;
let NumMicroOps = 19;
@@ -3626,56 +1557,21 @@ def BWWriteResGroup177 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI16m")>;
-def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI32m")>;
-
-def BWWriteResGroup178 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015,BWPort0156]> {
- let Latency = 24;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[BWWriteResGroup178], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[BWWriteResGroup178], (instregex "VPCMPESTRM128rm")>;
-
-def BWWriteResGroup179 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup179], (instregex "SQRTPDm")>;
-def: InstRW<[BWWriteResGroup179], (instregex "SQRTSDm")>;
+def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI(16|32)m")>;
def BWWriteResGroup180 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 26;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F32m")>;
-def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F64m")>;
-def: InstRW<[BWWriteResGroup180], (instregex "VSQRTPDm")>;
-def: InstRW<[BWWriteResGroup180], (instregex "VSQRTSDm")>;
-
-def BWWriteResGroup181 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 27;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup181], (instregex "VSQRTPSYm")>;
+def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F(32|64)m")>;
def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
let Latency = 29;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI16m")>;
-def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI32m")>;
-
-def BWWriteResGroup183 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 29;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup183], (instregex "VDIVPDYrm")>;
+def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI(16|32)m")>;
def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
let Latency = 22;
@@ -3716,7 +1612,7 @@ def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>;
def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
let Latency = 26;
let NumMicroOps = 14;
- let ResourceCycles = [1,4,8,1];
+ let ResourceCycles = [1,4,8,1];
}
def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>;
@@ -3727,128 +1623,85 @@ def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156
}
def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>;
-def BWWriteResGroup184 : SchedWriteRes<[BWPort0,BWPort5,BWPort015]> {
- let Latency = 29;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,2];
-}
-def: InstRW<[BWWriteResGroup184], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[BWWriteResGroup184], (instregex "VAESKEYGENASSIST128rr")>;
-
def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 29;
let NumMicroOps = 27;
let ResourceCycles = [1,5,1,1,19];
}
-def: InstRW<[BWWriteResGroup185], (instregex "XSAVE64")>;
+def: InstRW<[BWWriteResGroup185], (instrs XSAVE64)>;
def BWWriteResGroup186 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 30;
let NumMicroOps = 28;
let ResourceCycles = [1,6,1,1,19];
}
-def: InstRW<[BWWriteResGroup186], (instregex "XSAVE(OPT)?")>;
-
-def BWWriteResGroup187 : SchedWriteRes<[BWPort01,BWPort15,BWPort015,BWPort0156]> {
- let Latency = 31;
- let NumMicroOps = 31;
- let ResourceCycles = [8,1,21,1];
-}
-def: InstRW<[BWWriteResGroup187], (instregex "MMX_EMMS")>;
-
-def BWWriteResGroup188 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015]> {
- let Latency = 33;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,1,1];
-}
-def: InstRW<[BWWriteResGroup188], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[BWWriteResGroup188], (instregex "VAESKEYGENASSIST128rm")>;
-
-def BWWriteResGroup189 : SchedWriteRes<[BWPort0,BWPort015]> {
- let Latency = 34;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup189], (instregex "VSQRTPDYr")>;
+def: InstRW<[BWWriteResGroup186], (instrs XSAVE)>;
+def: InstRW<[BWWriteResGroup186], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
def BWWriteResGroup190 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> {
let Latency = 34;
let NumMicroOps = 8;
let ResourceCycles = [2,2,2,1,1];
}
-def: InstRW<[BWWriteResGroup190], (instregex "DIV(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup190], (instregex "DIV8m")>;
+def: InstRW<[BWWriteResGroup190], (instregex "DIV(8|16|32|64)m")>;
def BWWriteResGroup191 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort06,BWPort0156]> {
let Latency = 34;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)ri")>;
-def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)rr")>;
-def: InstRW<[BWWriteResGroup191], (instregex "IN8ri")>;
-def: InstRW<[BWWriteResGroup191], (instregex "IN8rr")>;
+def: InstRW<[BWWriteResGroup191], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def BWWriteResGroup193 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> {
let Latency = 35;
let NumMicroOps = 8;
let ResourceCycles = [2,2,2,1,1];
}
-def: InstRW<[BWWriteResGroup193], (instregex "IDIV(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup193], (instregex "IDIV8m")>;
+def: InstRW<[BWWriteResGroup193], (instregex "IDIV(8|16|32|64)m")>;
def BWWriteResGroup194 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)ir")>;
-def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)rr")>;
-def: InstRW<[BWWriteResGroup194], (instregex "OUT8ir")>;
-def: InstRW<[BWWriteResGroup194], (instregex "OUT8rr")>;
-
-def BWWriteResGroup195 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
- let Latency = 40;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[BWWriteResGroup195], (instregex "VSQRTPDYm")>;
+def: InstRW<[BWWriteResGroup194], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def BWWriteResGroup196 : SchedWriteRes<[BWPort5,BWPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[BWWriteResGroup196], (instregex "RDTSCP")>;
+def: InstRW<[BWWriteResGroup196], (instrs RDTSCP)>;
def BWWriteResGroup197 : SchedWriteRes<[BWPort0,BWPort01,BWPort23,BWPort05,BWPort06,BWPort015,BWPort0156]> {
let Latency = 60;
let NumMicroOps = 64;
let ResourceCycles = [2,2,8,1,10,2,39];
}
-def: InstRW<[BWWriteResGroup197], (instregex "FLDENVm")>;
-def: InstRW<[BWWriteResGroup197], (instregex "FLDENVm")>;
+def: InstRW<[BWWriteResGroup197], (instrs FLDENVm)>;
def BWWriteResGroup198 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
let Latency = 63;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[BWWriteResGroup198], (instregex "FXRSTOR64")>;
+def: InstRW<[BWWriteResGroup198], (instrs FXRSTOR64)>;
def BWWriteResGroup199 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
let Latency = 63;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[BWWriteResGroup199], (instregex "FXRSTOR")>;
+def: InstRW<[BWWriteResGroup199], (instrs FXRSTOR)>;
def BWWriteResGroup200 : SchedWriteRes<[BWPort5,BWPort01,BWPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[BWWriteResGroup200], (instregex "FNINIT")>;
+def: InstRW<[BWWriteResGroup200], (instrs FNINIT)>;
def BWWriteResGroup201 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156]> {
let Latency = 80;
@@ -3862,8 +1715,8 @@ def BWWriteResGroup202 : SchedWriteRes<[BWPort0,BWPort1,BWPort4,BWPort5,BWPort6,
let NumMicroOps = 100;
let ResourceCycles = [9,9,11,8,1,11,21,30];
}
-def: InstRW<[BWWriteResGroup202], (instregex "FSTENVm")>;
-def: InstRW<[BWWriteResGroup202], (instregex "FSTENVm")>;
+def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>;
-} // SchedModel
+def: InstRW<[WriteZero], (instrs CLC)>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
index 46612554b1fa..189dd4183839 100644
--- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -10,6 +10,11 @@
// This file defines the machine model for Haswell to support instruction
// scheduling and other instruction cost heuristics.
//
+// Note that we define some instructions here that are not supported by Haswell,
+// but we still have to define them because KNL uses the HSW model.
+// They are currently tagged with a comment `Unsupported = 1`.
+// FIXME: Use Unsupported = 1 once KNL has its own model.
+//
//===----------------------------------------------------------------------===//
def HaswellModel : SchedMachineModel {
@@ -23,7 +28,7 @@ def HaswellModel : SchedMachineModel {
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
- // This flag is set to allow the scheduler to assign a default model to
+ // This flag is set to allow the scheduler to assign a default model to
// unrecognized opcodes.
let CompleteModel = 0;
}
@@ -69,6 +74,8 @@ def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
// Integer division issued on port 0.
def HWDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def HWFPDivider : ProcResource<1>;
// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
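// Editor's note (illustrative, not part of this change): with the 5-cycle
// ReadAdvance, a load-op such as ADD32rm does not need its register operand
// until 5 cycles after issue, so a producer that completes within that window
// adds no extra latency to the dependent load-op.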
@@ -80,189 +87,451 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([HWPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
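// Editor's illustration (not part of this change; a sketch derived from the
// multiclass above, assuming the defaults Res = [1], UOps = 1, LoadLat = 5):
//   defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
// should expand to roughly
//   def : WriteRes<WriteALU, [HWPort0156]> {
//     let Latency = 1; let ResourceCycles = [1]; let NumMicroOps = 1;
//   }
//   def : WriteRes<WriteALULd, [HWPort23, HWPort0156]> {
//     let Latency = 6; let ResourceCycles = [1, 1]; let NumMicroOps = 2;
//   }
// i.e. the folded-load form picks up one extra uop on port 2/3 and LoadLat
// cycles of load latency on top of the register form.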
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [HWPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort237,HWPort4]>;
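// Editor's note (illustrative, not part of this change): for a
// read-modify-write instruction such as ADD32mr, the load and ALU work are
// covered by other SchedWrites; WriteRMW accounts only for the store half,
// hence one store-address uop on port 2/3/7 and one store-data uop on port 4.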
// Store_addr on 237.
// Store_data on 4.
-def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
-def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 5; }
-def : WriteRes<WriteMove, [HWPort0156]>;
-def : WriteRes<WriteZero, []>;
+defm : X86WriteRes<WriteStore, [HWPort237, HWPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [HWPort237, HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>;
+def : WriteRes<WriteZero, []>;
+
+defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
+defm : HWWriteResPair<WriteADC, [HWPort06,HWPort0156], 2, [1,1], 2>;
+defm : HWWriteResPair<WriteIMul, [HWPort1], 3>;
+defm : HWWriteResPair<WriteIMul64, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteBSWAP32,[HWPort15], 1>;
+defm : HWWriteResPair<WriteBSWAP64,[HWPort06, HWPort15], 2, [1,1], 2>;
-defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
-defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-defm : HWWriteResPair<WriteShift, HWPort06, 1>;
-defm : HWWriteResPair<WriteJump, HWPort06, 1>;
+defm : HWWriteResPair<WriteShift, [HWPort06], 1>;
+defm : HWWriteResPair<WriteShiftDouble, [HWPort06], 1>;
+defm : HWWriteResPair<WriteJump, [HWPort06], 1>;
+defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move.
+defm : HWWriteResPair<WriteCMOV2, [HWPort06,HWPort0156], 3, [1,2], 3>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [HWPort06]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
// the port to read all inputs. We don't model that.
def : WriteRes<WriteLEA, [HWPort15]>;
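// Editor's illustration (not from this change): "simple" here means one- or
// two-input forms such as `lea rax, [rdi + 8]` or `lea rax, [rdi + rsi]`;
// a three-component form like `lea rax, [rdi + rsi*4 + 16]` is the complex
// case that is restricted to port 1 and is intentionally left unmodelled.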
-// This is quite rough, latency depends on the dividend.
-def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
+// Bit counts.
+defm : HWWriteResPair<WriteBSF, [HWPort1], 3>;
+defm : HWWriteResPair<WriteBSR, [HWPort1], 3>;
+defm : HWWriteResPair<WriteLZCNT, [HWPort1], 3>;
+defm : HWWriteResPair<WriteTZCNT, [HWPort1], 3>;
+defm : HWWriteResPair<WritePOPCNT, [HWPort1], 3>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : HWWriteResPair<WriteBEXTR, [HWPort06,HWPort15], 2, [1,1], 2>;
+defm : HWWriteResPair<WriteBZHI, [HWPort15], 1>;
+
+defm : HWWriteResPair<WriteDiv8, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteDiv16, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteDiv32, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteDiv64, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv8, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv16, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv32, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
+defm : HWWriteResPair<WriteIDiv64, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
// Scalar and vector floating point.
-defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
-defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
-defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
-defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
-defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
-defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
-defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
-defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
-defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
-defm : HWWriteResPair<WriteFMA, HWPort01, 5>;
-defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>;
-defm : HWWriteResPair<WriteFBlend, HWPort015, 1>;
-defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>;
-
-def : WriteRes<WriteFVarBlend, [HWPort5]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [HWPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [HWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [HWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [HWPort01,HWPort15,HWPort015,HWPort0156], 31, [8,1,21,1], 31>;
+
+defm : HWWriteResPair<WriteFAdd, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFAddX, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFAddY, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFAddZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFAdd64, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFAdd64X, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFAdd64Y, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFAdd64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFCmp, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFCmpX, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFCmpY, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFCmpZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFCmp64, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFCmp64X, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFCmp64Y, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFCmp64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFCom, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteFMul, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMulX, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMulY, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMulZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFMul64, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMul64X, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMul64Y, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMul64Z, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFDiv, [HWPort0,HWFPDivider], 13, [1,7], 1, 5>;
+defm : HWWriteResPair<WriteFDivX, [HWPort0,HWFPDivider], 13, [1,7], 1, 6>;
+defm : HWWriteResPair<WriteFDivY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>;
+defm : HWWriteResPair<WriteFDivZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFDiv64, [HWPort0,HWFPDivider], 20, [1,14], 1, 5>;
+defm : HWWriteResPair<WriteFDiv64X, [HWPort0,HWFPDivider], 20, [1,14], 1, 6>;
+defm : HWWriteResPair<WriteFDiv64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>;
+defm : HWWriteResPair<WriteFDiv64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFRcp, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFRcpX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFRcpY, [HWPort0,HWPort015], 11, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteFRcpZ, [HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFRsqrt, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFRsqrtX,[HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFRsqrtY,[HWPort0,HWPort015], 11, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteFRsqrtZ,[HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFSqrt, [HWPort0,HWFPDivider], 11, [1,7], 1, 5>;
+defm : HWWriteResPair<WriteFSqrtX, [HWPort0,HWFPDivider], 11, [1,7], 1, 6>;
+defm : HWWriteResPair<WriteFSqrtY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>;
+defm : HWWriteResPair<WriteFSqrtZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSqrt64, [HWPort0,HWFPDivider], 16, [1,14], 1, 5>;
+defm : HWWriteResPair<WriteFSqrt64X, [HWPort0,HWFPDivider], 16, [1,14], 1, 6>;
+defm : HWWriteResPair<WriteFSqrt64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>;
+defm : HWWriteResPair<WriteFSqrt64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSqrt80, [HWPort0,HWFPDivider], 23, [1,17]>;
+
+defm : HWWriteResPair<WriteFMA, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>;
+defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>;
+defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>;
+defm : HWWriteResPair<WriteDPPSZ, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSign, [HWPort0], 1>;
+defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1
+defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>;
+defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>;
+defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFTest, [HWPort0], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFTestY, [HWPort0], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFTestZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFShuffle, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFVarShuffle, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFVarShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFBlend, [HWPort015], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFBlendY, [HWPort015], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFBlendZ, [HWPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarBlend, [HWPort5], 2, [2], 2, 6>;
+defm : HWWriteResPair<WriteFVarBlendY, [HWPort5], 2, [2], 2, 7>;
+defm : HWWriteResPair<WriteFVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1
+
+// Conversion between integer and float.
+defm : HWWriteResPair<WriteCvtSD2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2IY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2IZ, [HWPort1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtSS2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2IY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2IZ, [HWPort1], 3>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteCvtI2SD, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PD, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PDY, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PDZ, [HWPort1], 4>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtI2SS, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PS, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PSY, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PSZ, [HWPort1], 4>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteCvtSS2SD, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PD, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PDY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PDZ, [HWPort1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtSD2SS, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PS, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PSY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PSZ, [HWPort1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPH2PS, [HWPort0,HWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [HWPort0,HWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZ, [HWPort0,HWPort5], 2, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPH2PSLd, [HWPort0,HWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [HWPort0,HWPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZLd, [HWPort0,HWPort23], 7, [1,1], 2>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPS2PH, [HWPort1,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [HWPort1,HWPort5], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [HWPort1,HWPort5], 6, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PHSt, [HWPort1,HWPort4,HWPort5,HWPort237], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>; // Unsupported = 1
// Vector integer operations.
-defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
-defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
-defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
-defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
-defm : HWWriteResPair<WriteShuffle, HWPort5, 1>;
-defm : HWWriteResPair<WriteBlend, HWPort15, 1>;
-defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>;
-
-def : WriteRes<WriteVarBlend, [HWPort5]> {
+defm : X86WriteRes<WriteVecLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [HWPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [HWPort5], 1, [1], 1>;
+
+defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecLogicX,[HWPort015], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecLogicY,[HWPort015], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecLogicZ,[HWPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecTest, [HWPort0,HWPort5], 2, [1,1], 2, 6>;
+defm : HWWriteResPair<WriteVecTestY, [HWPort0,HWPort5], 4, [1,1], 2, 7>;
+defm : HWWriteResPair<WriteVecTestZ, [HWPort0,HWPort5], 4, [1,1], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecALU, [HWPort15], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecALUX, [HWPort15], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecALUY, [HWPort15], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecALUZ, [HWPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecIMulX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecIMulY, [HWPort0], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecIMulZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePMULLD, [HWPort0], 10, [2], 2, 6>;
+defm : HWWriteResPair<WritePMULLDY, [HWPort0], 10, [2], 2, 7>;
+defm : HWWriteResPair<WritePMULLDZ, [HWPort0], 10, [2], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteShuffle, [HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteShuffleX, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVarShuffle, [HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVarShuffleX,[HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVarShuffleY,[HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffleZ,[HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteBlend, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteBlendY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteBlendZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2], 2, 6>;
+defm : HWWriteResPair<WriteVarBlendY, [HWPort5], 2, [2], 2, 7>;
+defm : HWWriteResPair<WriteVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteMPSAD, [HWPort0, HWPort5], 7, [1, 2], 3, 6>;
+defm : HWWriteResPair<WriteMPSADY, [HWPort0, HWPort5], 7, [1, 2], 3, 7>;
+defm : HWWriteResPair<WriteMPSADZ, [HWPort0, HWPort5], 7, [1, 2], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePSADBW, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WritePSADBWX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WritePSADBWY, [HWPort0], 5, [1], 1, 7>;
+defm : HWWriteResPair<WritePSADBWZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePHMINPOS, [HWPort0], 5, [1], 1, 6>;
+
+// Vector integer shifts.
+defm : HWWriteResPair<WriteVecShift, [HWPort0], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecShiftX, [HWPort0,HWPort5], 2, [1,1], 2, 6>;
+defm : X86WriteRes<WriteVecShiftY, [HWPort0,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZ, [HWPort0,HWPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteVecShiftYLd, [HWPort0,HWPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZLd, [HWPort0,HWPort23], 8, [1,1], 2>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteVecShiftImm, [HWPort0], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecShiftImmX, [HWPort0], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecShiftImmY, [HWPort0], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecShiftImmZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVarVecShift, [HWPort0, HWPort5], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WriteVarVecShiftY, [HWPort0, HWPort5], 3, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteVarVecShiftZ, [HWPort0, HWPort5], 3, [2,1], 3, 7>; // Unsupported = 1
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [HWPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
+def : WriteRes<WriteVecInsertLd, [HWPort5,HWPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
-def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
+def : WriteRes<WriteVecExtract, [HWPort0,HWPort5]> {
let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
-
-def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 2];
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [HWPort4,HWPort5,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
// String instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
def : WriteRes<WritePCmpIStrM, [HWPort0]> {
- let Latency = 10;
+ let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
}
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort5, HWPort015, HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
+def : WriteRes<WritePCmpEStrMLd, [HWPort0, HWPort5, HWPort23, HWPort015, HWPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
}
// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [HWPort0]> {
let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
}
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [HWPort0, HWPort5, HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort5, HWPort23, HWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSKY, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteMMXMOVMSK, [HWPort0]> { let Latency = 1; }
+
// AES Instructions.
def : WriteRes<WriteAESDecEnc, [HWPort5]> {
let Latency = 7;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+ let Latency = 13;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
def : WriteRes<WriteAESIMC, [HWPort5]> {
let Latency = 14;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
- let Latency = 14;
- let ResourceCycles = [2, 1];
+ let Latency = 20;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> {
- let Latency = 10;
- let ResourceCycles = [2, 8];
+def : WriteRes<WriteAESKeyGen, [HWPort0,HWPort5,HWPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
}
-def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [HWPort0,HWPort5,HWPort23,HWPort015]> {
+ let Latency = 34;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
}
// Carry-less multiplication instructions.
def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [HWPort0,HWPort23,HWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [HWPort4,HWPort5,HWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
def : WriteRes<WriteFence, [HWPort23, HWPort4]>;
@@ -273,149 +542,34 @@ def : WriteRes<WriteNop, []>;
//-- Specific Scheduling Models --//
// Starting with P0.
-def WriteP0 : SchedWriteRes<[HWPort0]>;
-
-def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
-
-def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
+def HWWriteP0 : SchedWriteRes<[HWPort0]>;
-def WriteP01 : SchedWriteRes<[HWPort01]>;
+def HWWriteP01 : SchedWriteRes<[HWPort01]>;
-def Write2P01 : SchedWriteRes<[HWPort01]> {
+def HWWrite2P01 : SchedWriteRes<[HWPort01]> {
let NumMicroOps = 2;
}
-def Write3P01 : SchedWriteRes<[HWPort01]> {
+def HWWrite3P01 : SchedWriteRes<[HWPort01]> {
let NumMicroOps = 3;
}
-def WriteP015 : SchedWriteRes<[HWPort015]>;
-
-def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> {
- let NumMicroOps = 2;
-}
-def WriteP06 : SchedWriteRes<[HWPort06]>;
-
-def Write2P06 : SchedWriteRes<[HWPort06]> {
- let Latency = 1;
+def HWWriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
let NumMicroOps = 2;
- let ResourceCycles = [2];
}
-def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-
-def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
- let NumMicroOps = 2;
-}
-
-def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+def HWWrite2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [2, 1];
}
-def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-def Write5P0156 : SchedWriteRes<[HWPort0156]> {
- let NumMicroOps = 5;
- let ResourceCycles = [5];
-}
-
-def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [1, 2, 1];
-}
-
-def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [2, 2, 1];
-}
-
-def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [3, 2, 1];
-}
-
// Starting with P1.
-def WriteP1 : SchedWriteRes<[HWPort1]>;
+def HWWriteP1 : SchedWriteRes<[HWPort1]>;
-def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
- let NumMicroOps = 2;
-}
-def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> {
- let Latency = 3;
-}
-def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> {
- let Latency = 7;
-}
-def Write2P1 : SchedWriteRes<[HWPort1]> {
+def HWWrite2P1 : SchedWriteRes<[HWPort1]> {
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
-}
-def WriteP15 : SchedWriteRes<[HWPort15]>;
-def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> {
- let Latency = 4;
-}
-
-def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
-
-def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
-
-def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
-
-def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
-
-// Starting with P2.
-def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> {
- let Latency = 1;
- let ResourceCycles = [2, 1];
-}
-
-// Starting with P5.
-def WriteP5 : SchedWriteRes<[HWPort5]>;
-def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
// Notation:
// - r: register.
@@ -429,284 +583,215 @@ def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
//=== Integer Instructions ===//
//-- Move instructions --//
-// MOV.
-// r16,m.
-def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
-
-// MOVSX, MOVZX.
-// r,m.
-def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm8")>;
-
// XLAT.
-def WriteXLAT : SchedWriteRes<[]> {
+def HWWriteXLAT : SchedWriteRes<[]> {
let Latency = 7;
let NumMicroOps = 3;
}
-def : InstRW<[WriteXLAT], (instregex "XLAT")>;
-
-// PUSH.
-// m.
-def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
+def : InstRW<[HWWriteXLAT], (instrs XLAT)>;
// PUSHA.
-def WritePushA : SchedWriteRes<[]> {
+def HWWritePushA : SchedWriteRes<[]> {
let NumMicroOps = 19;
}
-def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
-
-// POP.
-// m.
-def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
+def : InstRW<[HWWritePushA], (instregex "PUSHA(16|32)")>;
// POPA.
-def WritePopA : SchedWriteRes<[]> {
+def HWWritePopA : SchedWriteRes<[]> {
let NumMicroOps = 18;
}
-def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
+def : InstRW<[HWWritePopA], (instregex "POPA(16|32)")>;
//-- Arithmetic instructions --//
// DIV.
// r8.
-def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+def HWWriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
let Latency = 22;
let NumMicroOps = 9;
}
-def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
+def : InstRW<[HWWriteDiv8], (instregex "DIV8r")>;
// IDIV.
// r8.
-def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+def HWWriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
let Latency = 23;
let NumMicroOps = 9;
}
-def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
+def : InstRW<[HWWriteIDiv8], (instregex "IDIV8r")>;
// BT.
// m,r.
-def WriteBTmr : SchedWriteRes<[]> {
+def HWWriteBTmr : SchedWriteRes<[]> {
let NumMicroOps = 10;
}
-def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
+def : InstRW<[HWWriteBTmr], (instregex "BT(16|32|64)mr")>;
// BTR BTS BTC.
// m,r.
-def WriteBTRSCmr : SchedWriteRes<[]> {
+def HWWriteBTRSCmr : SchedWriteRes<[]> {
let NumMicroOps = 11;
}
-def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+def : InstRW<[HWWriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
//-- Control transfer instructions --//
// CALL.
// i.
-def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+def HWWriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
let NumMicroOps = 4;
let ResourceCycles = [1, 2, 1];
}
-def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+def : InstRW<[HWWriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
// BOUND.
// r,m.
-def WriteBOUND : SchedWriteRes<[]> {
+def HWWriteBOUND : SchedWriteRes<[]> {
let NumMicroOps = 15;
}
-def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+def : InstRW<[HWWriteBOUND], (instregex "BOUNDS(16|32)rm")>;
// INTO.
-def WriteINTO : SchedWriteRes<[]> {
+def HWWriteINTO : SchedWriteRes<[]> {
let NumMicroOps = 4;
}
-def : InstRW<[WriteINTO], (instregex "INTO")>;
+def : InstRW<[HWWriteINTO], (instrs INTO)>;
//-- String instructions --//
// LODSB/W.
-def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
+def : InstRW<[HWWrite2P0156_P23], (instregex "LODS(B|W)")>;
// LODSD/Q.
-def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
+def : InstRW<[HWWriteP0156_P23], (instregex "LODS(L|Q)")>;
// MOVS.
-def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+def HWWriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
let Latency = 4;
let NumMicroOps = 5;
let ResourceCycles = [2, 1, 2];
}
-def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
+def : InstRW<[HWWriteMOVS], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>;
// CMPS.
-def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+def HWWriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
let Latency = 4;
let NumMicroOps = 5;
let ResourceCycles = [2, 3];
}
-def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+def : InstRW<[HWWriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
//-- Other --//
// RDPMC.f
-def WriteRDPMC : SchedWriteRes<[]> {
+def HWWriteRDPMC : SchedWriteRes<[]> {
let NumMicroOps = 34;
}
-def : InstRW<[WriteRDPMC], (instregex "RDPMC")>;
+def : InstRW<[HWWriteRDPMC], (instrs RDPMC)>;
// RDRAND.
-def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+def HWWriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
let NumMicroOps = 17;
let ResourceCycles = [1, 16];
}
-def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+def : InstRW<[HWWriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
//=== Floating Point x87 Instructions ===//
//-- Move instructions --//
// FLD.
// m80.
-def : InstRW<[WriteP01], (instregex "LD_Frr")>;
+def : InstRW<[HWWriteP01], (instregex "LD_Frr")>;
// FBLD.
// m80.
-def WriteFBLD : SchedWriteRes<[]> {
+def HWWriteFBLD : SchedWriteRes<[]> {
let Latency = 47;
let NumMicroOps = 43;
}
-def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
+def : InstRW<[HWWriteFBLD], (instregex "FBLDm")>;
// FST(P).
// r.
-def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
-
-// FLDZ.
-def : InstRW<[WriteP01], (instregex "LD_F0")>;
-
-// FLDPI FLDL2E etc.
-def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>;
+def : InstRW<[HWWriteP01], (instregex "ST_(F|FP)rr")>;
// FFREE.
-def : InstRW<[WriteP01], (instregex "FFREE")>;
+def : InstRW<[HWWriteP01], (instregex "FFREE")>;
// FNSAVE.
-def WriteFNSAVE : SchedWriteRes<[]> {
+def HWWriteFNSAVE : SchedWriteRes<[]> {
let NumMicroOps = 147;
}
-def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>;
+def : InstRW<[HWWriteFNSAVE], (instregex "FSAVEm")>;
// FRSTOR.
-def WriteFRSTOR : SchedWriteRes<[]> {
+def HWWriteFRSTOR : SchedWriteRes<[]> {
let NumMicroOps = 90;
}
-def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>;
+def : InstRW<[HWWriteFRSTOR], (instregex "FRSTORm")>;
//-- Arithmetic instructions --//
-// FABS.
-def : InstRW<[WriteP0], (instregex "ABS_F")>;
-
-// FCHS.
-def : InstRW<[WriteP0], (instregex "CHS_F")>;
-
// FCOMPP FUCOMPP.
// r.
-def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
+def : InstRW<[HWWrite2P01], (instrs FCOMPP, UCOM_FPPr)>;
// FCOMI(P) FUCOMI(P).
// m.
-def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
- "UCOM_FIPr")>;
+def : InstRW<[HWWrite3P01], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
// FTST.
-def : InstRW<[WriteP1], (instregex "TST_F")>;
+def : InstRW<[HWWriteP1], (instregex "TST_F")>;
// FXAM.
-def : InstRW<[Write2P1], (instregex "FXAM")>;
+def : InstRW<[HWWrite2P1], (instrs FXAM)>;
// FPREM.
-def WriteFPREM : SchedWriteRes<[]> {
+def HWWriteFPREM : SchedWriteRes<[]> {
let Latency = 19;
let NumMicroOps = 28;
}
-def : InstRW<[WriteFPREM], (instregex "FPREM")>;
+def : InstRW<[HWWriteFPREM], (instrs FPREM)>;
// FPREM1.
-def WriteFPREM1 : SchedWriteRes<[]> {
+def HWWriteFPREM1 : SchedWriteRes<[]> {
let Latency = 27;
let NumMicroOps = 41;
}
-def : InstRW<[WriteFPREM1], (instregex "FPREM1")>;
+def : InstRW<[HWWriteFPREM1], (instrs FPREM1)>;
// FRNDINT.
-def WriteFRNDINT : SchedWriteRes<[]> {
+def HWWriteFRNDINT : SchedWriteRes<[]> {
let Latency = 11;
let NumMicroOps = 17;
}
-def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>;
+def : InstRW<[HWWriteFRNDINT], (instrs FRNDINT)>;
//-- Math instructions --//
// FSCALE.
-def WriteFSCALE : SchedWriteRes<[]> {
+def HWWriteFSCALE : SchedWriteRes<[]> {
let Latency = 75; // 49-125
let NumMicroOps = 50; // 25-75
}
-def : InstRW<[WriteFSCALE], (instregex "FSCALE")>;
+def : InstRW<[HWWriteFSCALE], (instrs FSCALE)>;
// FXTRACT.
-def WriteFXTRACT : SchedWriteRes<[]> {
+def HWWriteFXTRACT : SchedWriteRes<[]> {
let Latency = 15;
let NumMicroOps = 17;
}
-def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>;
-
-//-- Other instructions --//
-
-// FNOP.
-def : InstRW<[WriteP01], (instregex "FNOP")>;
-
-// WAIT.
-def : InstRW<[Write2P01], (instregex "WAIT")>;
-
-// FNCLEX.
-def : InstRW<[Write5P0156], (instregex "FNCLEX")>;
-
-// FNINIT.
-def WriteFNINIT : SchedWriteRes<[]> {
- let NumMicroOps = 26;
-}
-def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
+def : InstRW<[HWWriteFXTRACT], (instrs FXTRACT)>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2, 1];
-}
+defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1,2], 3, 6>;
+defm : HWWriteResPair<WriteFHAddY, [HWPort1, HWPort5], 5, [1,2], 3, 7>;
+defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 5>;
+defm : HWWriteResPair<WritePHAddX, [HWPort5, HWPort15], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WritePHAddY, [HWPort5, HWPort15], 3, [2,1], 3, 7>;
//=== Floating Point XMM and YMM Instructions ===//
@@ -717,429 +802,69 @@ def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup0], (instregex "LDDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVAPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVAPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVNTDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVSHDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVSLDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVUPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "MOVUPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQArm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQrm")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSSr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDYPDr")>;
-def: InstRW<[HWWriteResGroup0], (instregex "VROUNDYPSr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCAST(D|Q)rm")>;
def HWWriteResGroup0_1 : SchedWriteRes<[HWPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F32m")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F64m")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F80m")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTF128")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTI128")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VLDDQUYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPSYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQAYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQUYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPSYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCAST(D|Q)Yrm")>;
def HWWriteResGroup0_2 : SchedWriteRes<[HWPort23]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOV64toPQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOV8rm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDDUPrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSDrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSSrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHNTA")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT0")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT1")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT2")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDDUPrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSDrm")>;
-def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSSrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "(V?)MOVDDUPrm")>;
def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOV8mi")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOV8mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVAPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVAPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVDQAmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVDQUmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVHPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVHPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVLPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVLPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTDQmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTI_64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVPDI2DImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVPQI2QImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVPQIto64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVSDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVSSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVUPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "MOVUPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "ST_FP32m")>;
-def: InstRW<[HWWriteResGroup1], (instregex "ST_FP64m")>;
-def: InstRW<[HWWriteResGroup1], (instregex "ST_FP80m")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVSDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVSSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSYmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSmr")>;
-def: InstRW<[HWWriteResGroup1], (instregex "VMPTRSTm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm",
+ "ST_FP(32|64|80)m",
+ "VMPTRSTm")>;
def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "MOVPQIto64rr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSLLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSLLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSLLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRADri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRAWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "PSRLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRADYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRADri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWYri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWri")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSYrr")>;
-def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQ(Y?)rr",
+ "VPSRLVQ(Y?)rr")>;
def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup3], (instregex "COMP_FST0r")>;
-def: InstRW<[HWWriteResGroup3], (instregex "COM_FST0r")>;
-def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[HWWriteResGroup3], (instregex "UCOM_FPr")>;
-def: InstRW<[HWWriteResGroup3], (instregex "UCOM_Fr")>;
-def: InstRW<[HWWriteResGroup3], (instregex "VMASKMOVDQU")>;
+def: InstRW<[HWWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup4], (instregex "ANDNPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ANDNPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ANDPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ANDPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "INSERTPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64to64rr")>;
def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOV64toPQIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVAPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVAPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVDDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKSSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKUSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PACKUSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PALIGNRrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PBLENDWrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFDri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFHWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSHUFLWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSLLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PSRLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "SHUFPDrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "SHUFPSrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VANDPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VORPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQYri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSYrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSrri")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPDYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPSYrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VXORPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "XORPDrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "XORPSrr")>;
def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> {
let Latency = 1;
@@ -1153,661 +878,128 @@ def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup6], (instregex "FINCSTP")>;
-def: InstRW<[HWWriteResGroup6], (instregex "FNOP")>;
+def: InstRW<[HWWriteResGroup6], (instrs FINCSTP, FNOP)>;
def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "CDQ")>;
-def: InstRW<[HWWriteResGroup7], (instregex "CQO")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JAE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JAE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JA_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JA_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JBE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JBE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JB_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JB_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JGE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JGE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JG_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JG_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JLE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JLE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JL_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JL_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JMP_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JMP_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNE_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNE_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNO_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNO_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNP_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNP_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNS_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JNS_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JO_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JO_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JP_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JP_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JS_1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "JS_4")>;
-def: InstRW<[HWWriteResGroup7], (instregex "RORX(32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR8r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SAR8ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SARX(32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETAEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETBr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETGEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETGr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETLEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETLr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNEr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNOr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNPr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETNSr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETOr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETPr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SETSr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL8r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHL8ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR8r1")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHR8ri")>;
-def: InstRW<[HWWriteResGroup7], (instregex "SHRX(32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instrs CDQ, CQO)>;
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDQirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PABSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PABSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PABSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PADDWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PAVGBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PAVGWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMAXUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PMINUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSIGNBrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSIGNDrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSIGNWrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "PSUBWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPABSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDQYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPADDWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWrr128")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWYrr")>;
-def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PORrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "PXORrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDYrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDNrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPANDrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDYrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPORYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPORrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPXORYrr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VPXORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDD(Y?)rri")>;
def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CBW")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CLC")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMC")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>;
-def: InstRW<[HWWriteResGroup10], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "INC(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NEG8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NOOP")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SLDT64m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SMSW16m")>;
-def: InstRW<[HWWriteResGroup10], (instregex "STC")>;
-def: InstRW<[HWWriteResGroup10], (instregex "STRm")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "TEST8rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR8i8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "XOR8rr")>;
+def: InstRW<[HWWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC)>;
+def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup11], (instregex "CVTPS2PDrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRADrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[HWWriteResGroup11], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "(V?)CVTPS2PDrm")>;
def HWWriteResGroup11_1 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup11_1], (instregex "CVTSS2SDrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VPSLLVQrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VPSRLVQrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPDrm")>;
-def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPSrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "(V?)CVTSS2SDrm",
+ "VPSLLVQrm",
+ "VPSRLVQrm")>;
def HWWriteResGroup11_2 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLDYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLWYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRADYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRAWYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLDYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLVQYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLWYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPDYrm")>;
-def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPSYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm",
+ "VPSRLVQYrm")>;
def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup12], (instregex "ADDSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "ADDSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "CMPSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "CMPSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "COMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "COMISSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOM32m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOM64m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOMP32m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "FCOMP64m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)rm(i8)?")>;
-def: InstRW<[HWWriteResGroup12], (instregex "IMUL8m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MUL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "MUL8m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "SUBSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "SUBSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "TZCNT(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "UCOMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "UCOMISSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VADDSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VADDSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCMPSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCMPSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCOMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VCOMISSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VSUBSDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VSUBSSrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISDrm")>;
-def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPI2PSirm",
+ "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
+
+def HWWriteResGroup12_1 : SchedWriteRes<[HWPort1,HWPort0156,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup12_1], (instrs IMUL16rmi, IMUL16rmi8)>;
+
+def HWWriteResGroup12_2 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup12_2], (instrs IMUL16m, MUL16m)>;
def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 7;
+ let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup13], (instregex "ANDNPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ANDNPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ANDPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ANDPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "INSERTPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "ORPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKSSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKSSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKUSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PACKUSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PALIGNRrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PBLENDWrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFDmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFHWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PSHUFLWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "SHUFPDrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "SHUFPSrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VANDPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VINSERTPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VORPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSrmi")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VXORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "VXORPSrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "XORPDrm")>;
-def: InstRW<[HWWriteResGroup13], (instregex "XORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VORPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VORPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFBYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFDYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPDYrm")>;
-def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPSYrm")>;
-
-def HWWriteResGroup13_2 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRBrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRWrmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPSrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRBrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRWrmi")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXWQYrm")>;
def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64")>;
-def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;
def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> {
let Latency = 6;
@@ -1815,323 +1007,55 @@ def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> {
let ResourceCycles = [1,1];
}
def: InstRW<[HWWriteResGroup15], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup15], (instregex "RORX32mi")>;
-def: InstRW<[HWWriteResGroup15], (instregex "RORX64mi")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SARX32rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SARX64rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHLX32rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHLX64rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHRX32rm")>;
-def: InstRW<[HWWriteResGroup15], (instregex "SHRX64rm")>;
def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDQirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[HWWriteResGroup16], (instregex "MOVBE(16|32|64)rm")>;
-
-def HWWriteResGroup16_1 : SchedWriteRes<[HWPort23,HWPort15]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup16_1], (instregex "PABSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PABSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PABSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PADDWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNBrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNDrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNWrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNBrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNDrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNWrm128")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBDrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBQrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSBrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSWrm")>;
-def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBWrm")>;
-
-def HWWriteResGroup16_2 : SchedWriteRes<[HWPort23,HWPort15]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDQYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBDYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBQYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSWYrm")>;
-def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup17], (instregex "BLENDPDrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "BLENDPSrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PANDNrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PANDrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PORrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "PXORrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPDrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPSrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VINSERTI128rm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPANDNrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPANDrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPBLENDDrmi")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPORrm")>;
-def: InstRW<[HWWriteResGroup17], (instregex "VPXORrm")>;
-
-def HWWriteResGroup17_1 : SchedWriteRes<[HWPort23,HWPort015]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDNirm")>;
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDirm")>;
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PORirm")>;
-def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PXORirm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm",
+ "VINSERTI128rm",
+ "VPBLENDDrmi")>;
def HWWriteResGroup17_2 : SchedWriteRes<[HWPort23,HWPort015]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDNYrm")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDYrm")>;
def: InstRW<[HWWriteResGroup17_2], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPORYrm")>;
-def: InstRW<[HWWriteResGroup17_2], (instregex "VPXORYrm")>;
def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup18], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "ADD8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "AND8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP8mi")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP8mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "CMP8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "OR8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[HWWriteResGroup18], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "SUB8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "TEST8mi")>;
-def: InstRW<[HWWriteResGroup18], (instregex "TEST8mr")>;
-def: InstRW<[HWWriteResGroup18], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "XOR8rm")>;
+def: InstRW<[HWWriteResGroup18], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)rmr")>;
def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup19], (instregex "SFENCE")>;
-
-def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup20], (instregex "EXTRACTPSmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRBmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRDmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRQmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "PEXTRWmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "STMXCSR")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRBmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRDmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRQmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRWmr")>;
-def: InstRW<[HWWriteResGroup20], (instregex "VSTMXCSR")>;
+def: InstRW<[HWWriteResGroup19], (instrs SFENCE)>;
def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup21], (instregex "FNSTCW16m")>;
-
-def HWWriteResGroup22 : SchedWriteRes<[HWPort4,HWPort237,HWPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup22], (instregex "SETAEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETBm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETGEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETGm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETLEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETLm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNEm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNOm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNPm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETNSm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETOm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETPm")>;
-def: InstRW<[HWWriteResGroup22], (instregex "SETSm")>;
+def: InstRW<[HWWriteResGroup21], (instrs FNSTCW16m)>;
def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> {
let Latency = 2;
@@ -2145,174 +1069,75 @@ def HWWriteResGroup23_16 : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup23_16], (instregex "MOVBE16mr")>;
+def: InstRW<[HWWriteResGroup23_16], (instrs MOVBE16mr)>;
def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[HWWriteResGroup24], (instregex "PUSH64i8")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSB")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSL")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSQ")>;
-def: InstRW<[HWWriteResGroup24], (instregex "STOSW")>;
+def: InstRW<[HWWriteResGroup24], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;
def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup25], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup25], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR8m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SAR8mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL8m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHL8mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR8m1")>;
-def: InstRW<[HWWriteResGroup25], (instregex "SHR8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "ADD8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "ADD8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "AND8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "DEC8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "INC(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "INC8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NEG8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "NOT8m")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "OR8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[HWWriteResGroup26], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "SUB8mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR8mi")>;
-def: InstRW<[HWWriteResGroup26], (instregex "XOR8mr")>;
-
-def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPDrr0")>;
-def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPSrr0")>;
-def: InstRW<[HWWriteResGroup27], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PBLENDVBrr0")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRBrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRDrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRQrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "PINSRWrri")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRBrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRDrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRQrr")>;
-def: InstRW<[HWWriteResGroup27], (instregex "VPINSRWrri")>;
+def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[HWWriteResGroup28], (instregex "FDECSTP")>;
+def: InstRW<[HWWriteResGroup28], (instrs FDECSTP)>;
def HWWriteResGroup29 : SchedWriteRes<[HWPort06]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROL8r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROL8ri")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR8r1")>;
-def: InstRW<[HWWriteResGroup29], (instregex "ROR8ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri")>;
def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[HWWriteResGroup30], (instregex "LFENCE")>;
-def: InstRW<[HWWriteResGroup30], (instregex "MFENCE")>;
-def: InstRW<[HWWriteResGroup30], (instregex "WAIT")>;
-def: InstRW<[HWWriteResGroup30], (instregex "XGETBV")>;
+def: InstRW<[HWWriteResGroup30], (instrs LFENCE,
+ MFENCE,
+ WAIT,
+ XGETBV)>;
def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup31], (instregex "CVTPS2PDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "CVTSS2SDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "EXTRACTPSrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRBrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWri")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSLLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSLLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSLLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRADrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRAWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PSRLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "PTESTrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRBrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWri")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSLLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSLLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSLLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRADrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRAWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRLDrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRLQrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPSRLWrr")>;
-def: InstRW<[HWWriteResGroup31], (instregex "VPTESTrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "(V?)CVTPS2PDrr",
+ "(V?)CVTSS2SDrr")>;
def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> {
let Latency = 2;
@@ -2328,175 +1153,44 @@ def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> {
}
def: InstRW<[HWWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>;
-def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup34], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>;
-
def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC8i8")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC8ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "CWD")>;
-def: InstRW<[HWWriteResGroup35], (instregex "JRCXZ")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB8i8")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB8ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SETAr")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SETBEr")>;
-
-def HWWriteResGroup36 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPDrm0")>;
-def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPSrm0")>;
-def: InstRW<[HWWriteResGroup36], (instregex "PBLENDVBrm0")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPDrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPSrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VPBLENDVBrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVQrm")>;
-
-def HWWriteResGroup36_1 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPSYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVQYrm")>;
+def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[HWWriteResGroup35], (instregex "SET(A|BE)r")>;
def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKUSWBirm")>;
+def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;
def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[HWWriteResGroup37], (instregex "LEAVE64")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASB")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASL")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASQ")>;
-def: InstRW<[HWWriteResGroup37], (instregex "SCASW")>;
-
-def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup38], (instregex "PSLLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSLLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSLLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRADrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRAWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PSRLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "PTESTrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSLLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSLLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSLLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRADrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRAWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRLDrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRLQrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPSRLWrm")>;
-def: InstRW<[HWWriteResGroup38], (instregex "VPTESTrm")>;
+def: InstRW<[HWWriteResGroup37], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup39], (instregex "FLDCW16m")>;
-
-def HWWriteResGroup40 : SchedWriteRes<[HWPort0,HWPort23,HWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup40], (instregex "LDMXCSR")>;
-def: InstRW<[HWWriteResGroup40], (instregex "VLDMXCSR")>;
+def: InstRW<[HWWriteResGroup39], (instrs FLDCW16m)>;
def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup41], (instregex "LRETQ")>;
-def: InstRW<[HWWriteResGroup41], (instregex "RETL")>;
-def: InstRW<[HWWriteResGroup41], (instregex "RETQ")>;
-
-def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup42], (instregex "BEXTR(32|64)rm")>;
-
-def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup43], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "ADC8rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup43], (instregex "SBB8rm")>;
+def: InstRW<[HWWriteResGroup41], (instrs LRETQ, RETL, RETQ)>;
def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
let Latency = 3;
@@ -2510,356 +1204,106 @@ def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup45], (instregex "CALL64pcrel32")>;
-def: InstRW<[HWWriteResGroup45], (instregex "SETAm")>;
-def: InstRW<[HWWriteResGroup45], (instregex "SETBEm")>;
+def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>;
+def: InstRW<[HWWriteResGroup45], (instregex "SET(A|BE)m")>;
def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROL8m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROL8mi")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR8m1")>;
-def: InstRW<[HWWriteResGroup46], (instregex "ROR8mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[HWWriteResGroup47], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup47], (instregex "XADD8rm")>;
+def: InstRW<[HWWriteResGroup47], (instregex "XADD(8|16|32|64)rm")>;
def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup48], (instregex "FARCALL64")>;
-
-def HWWriteResGroup49 : SchedWriteRes<[HWPort0]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPDrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPSrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "PMOVMSKBrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;
def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup50], (instregex "ADDPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADD_FPrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADD_FST0r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "ADD_FrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPPDrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPPSrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CMPSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "COMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "COMISSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8)?")>;
-def: InstRW<[HWWriteResGroup50], (instregex "IMUL8r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "MUL8r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FPrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FST0r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUBSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUB_FPrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUB_FST0r")>;
-def: InstRW<[HWWriteResGroup50], (instregex "SUB_FrST0")>;
-def: InstRW<[HWWriteResGroup50], (instregex "TZCNT(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "UCOMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "UCOMISSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDYrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSYrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSrri")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCMPSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCOMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCOMISSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSYrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBSDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrr")>;
-
-def HWWriteResGroup50_16 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8)?")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr",
+ "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8",
+ "(V?)CVTDQ2PS(Y?)rr")>;
-def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 3;
- let NumMicroOps = 3;
+def HWWriteResGroup50_16i : SchedWriteRes<[HWPort1, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8)?")>;
+def: InstRW<[HWWriteResGroup50_16i], (instrs IMUL16rri, IMUL16rri8)>;
def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VINSERTF128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VINSERTI128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERM2F128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERM2I128rr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMPDYri")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMPSYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPERMQYri")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCAST(B|W)rr")>;
def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup52], (instregex "ADDPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "ADDPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CMPPDrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CMPPSrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CVTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "SUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "SUBPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCMPPSrmi")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDrm")>;
-def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "(V?)CVTPS2DQrm",
+ "(V?)CVTTPS2DQrm")>;
def HWWriteResGroup52_1 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F16m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F32m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F64m")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPDYrmi")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPSYrmi")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPDYrm")>;
-def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPSYrm")>;
-
-def HWWriteResGroup53 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERM2I128rm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMDYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMPDYmi")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMPSYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPERMQYmi")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWQYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VCVTDQ2PSYrm",
+ "VCVTPS2DQYrm",
+ "VCVTTPS2DQYrm")>;
def HWWriteResGroup53_1 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVZXWDYrm")>;
+def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXBWYrm",
+ "VPMOVSXDQYrm",
+ "VPMOVSXWDYrm",
+ "VPMOVZXWDYrm")>;
def HWWriteResGroup54 : SchedWriteRes<[HWPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[HWWriteResGroup54], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup54], (instregex "XADD8rr")>;
-def: InstRW<[HWWriteResGroup54], (instregex "XCHG8rr")>;
-
-def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDYrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDYrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDYrr")>;
-def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDrr")>;
-
-def HWWriteResGroup56 : SchedWriteRes<[HWPort5,HWPort15]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBSWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBWrr64")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHADDDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHADDSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHADDWrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHSUBDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHSUBSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "PHSUBWrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr256")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr256")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWYrr")>;
-def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWrr")>;
+def: InstRW<[HWWriteResGroup54], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;
def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> {
let Latency = 3;
@@ -2873,202 +1317,80 @@ def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[HWWriteResGroup59], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup59], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL8r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCL8ri")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR8r1")>;
-def: InstRW<[HWWriteResGroup59], (instregex "RCR8ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;
def HWWriteResGroup60 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup60], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "ROL8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "ROR8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SAR8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHL8rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup60], (instregex "SHR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup61], (instregex "FNSTSWm")>;
+def: InstRW<[HWWriteResGroup61], (instrs FNSTSWm)>;
def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP16m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP32m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP64m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_F16m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_F32m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_FP16m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_FP32m")>;
-def: InstRW<[HWWriteResGroup62], (instregex "IST_FP64m")>;
-
-def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm")>;
-def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDYrm")>;
-def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDYrm")>;
-
-def HWWriteResGroup63_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup63_1], (instregex "VPSLLVDrm")>;
-def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRAVDrm")>;
-def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRLVDrm")>;
-
-def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBSWrm64")>;
-def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBWrm64")>;
-
-def HWWriteResGroup64_1 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDDYrm")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDSWrm256")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDWYrm")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBDYrm")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBSWrm256")>;
-def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBWYrm")>;
-
-def HWWriteResGroup64_2 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDWrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBWrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDWrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBDrm")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBSWrm128")>;
-def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBWrm")>;
-
-def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[HWWriteResGroup65], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup65], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCL8m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCL8mi")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR8m1")>;
-def: InstRW<[HWWriteResGroup66], (instregex "RCR8mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;
def HWWriteResGroup67 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[HWWriteResGroup67], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup67], (instregex "ROR8mCL")>;
+def: InstRW<[HWWriteResGroup67], (instregex "ROR(8|16|32|64)mCL")>;
def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[HWWriteResGroup68], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "ADC8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "ADD8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "AND8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "OR8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "SUB8mi")>;
-def: InstRW<[HWWriteResGroup68], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup68], (instregex "XCHG8rm")>;
-def: InstRW<[HWWriteResGroup68], (instregex "XOR8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG(8|16|32|64)rm")>;
def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[HWWriteResGroup69], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "ADC8mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG8rm")>;
-def: InstRW<[HWWriteResGroup69], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "ROL8mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SAR8mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB8mi")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SBB8mr")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHL8mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup69], (instregex "SHR8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(8|16|32|64)rm",
+ "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+def: SchedAlias<WriteADCRMW, HWWriteResGroup69>;
def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "(V?)CVT(T?)SD2SI(64)?rr",
+ "(V?)CVT(T?)SS2SI(64)?rr")>;
def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> {
let Latency = 4;
@@ -3076,104 +1398,60 @@ def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> {
let ResourceCycles = [1,1];
}
def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSLLDYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSLLQYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSLLWYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRADYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRAWYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRLDYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRLQYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPSRLWYrr")>;
-def: InstRW<[HWWriteResGroup71], (instregex "VPTESTYrr")>;
def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup72], (instregex "FNSTSW16r")>;
+def: InstRW<[HWWriteResGroup72], (instrs FNSTSW16r)>;
def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup73], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2DQrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2PSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSD2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSI642SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTDQ2PDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[HWWriteResGroup73], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr",
+ "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVTDQ2PDrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI(64)?2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVT(T?)PD2DQrr")>;
def HWWriteResGroup74 : SchedWriteRes<[HWPort1,HWPort6]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup74], (instregex "IMUL64r")>;
-def: InstRW<[HWWriteResGroup74], (instregex "MUL64r")>;
-def: InstRW<[HWWriteResGroup74], (instregex "MULX64rr")>;
+def: InstRW<[HWWriteResGroup74], (instrs IMUL64r, MUL64r, MULX64rr)>;
-def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1, HWPort0156]> {
+def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1, HWPort06, HWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[HWWriteResGroup74_16], (instregex "IMUL16r")>;
-def: InstRW<[HWWriteResGroup74_16], (instregex "MUL16r")>;
-
-def HWWriteResGroup74_32 : SchedWriteRes<[HWPort1,HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 3;
-}
-def: InstRW<[HWWriteResGroup74_32], (instregex "IMUL32r")>;
-def: InstRW<[HWWriteResGroup74_32], (instregex "MUL32r")>;
+def: InstRW<[HWWriteResGroup74_16], (instrs IMUL16r, MUL16r)>;
def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup75], (instregex "FICOM16m")>;
-def: InstRW<[HWWriteResGroup75], (instregex "FICOM32m")>;
-def: InstRW<[HWWriteResGroup75], (instregex "FICOMP16m")>;
-def: InstRW<[HWWriteResGroup75], (instregex "FICOMP32m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM(P?)(16|32)m")>;
def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "(V?)CVTSD2SI(64)?rm",
+ "(V?)CVTSS2SI(64)?rm",
+ "(V?)CVTTSD2SI(64)?rm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVTTSS2SIrm")>;
def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
let Latency = 10;
@@ -3182,65 +1460,51 @@ def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
}
def: InstRW<[HWWriteResGroup77], (instregex "VCVTPS2PDYrm")>;
-def HWWriteResGroup77_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup77_1], (instregex "VPTESTYrm")>;
-
def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup78], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2DQrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTTPD2PIirm")>;
-def: InstRW<[HWWriteResGroup78], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVT(T?)PD2PIirm",
+ "(V?)CVTDQ2PDrm")>;
def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup78_1], (instregex "CVTSD2SSrm")>;
-def: InstRW<[HWWriteResGroup78_1], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[HWWriteResGroup78_1], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[HWWriteResGroup78_1], (instregex "MMX_CVTPI2PDirm",
+ "(V?)CVTSD2SSrm")>;
def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup79], (instregex "MULX64rm")>;
+def: InstRW<[HWWriteResGroup79], (instrs IMUL64m, MUL64m, MULX64rm)>;
def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCAST(B|W)(Y?)rm")>;
def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[HWWriteResGroup81], (instregex "FNCLEX")>;
+def: InstRW<[HWWriteResGroup81], (instrs FNCLEX)>;
def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[HWWriteResGroup82], (instregex "VZEROUPPER")>;
+def: InstRW<[HWWriteResGroup82], (instrs VZEROUPPER)>;
def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
let Latency = 4;
@@ -3249,256 +1513,58 @@ def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
}
def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>;
-def HWWriteResGroup84 : SchedWriteRes<[HWPort0,HWPort4,HWPort237,HWPort15]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQmr")>;
-
-def HWWriteResGroup85 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup85], (instregex "VCVTPS2PHmr")>;
-
def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> {
let Latency = 10;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[HWWriteResGroup86], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[HWWriteResGroup87], (instregex "LSL(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[HWWriteResGroup88], (instregex "PUSHF16")>;
-def: InstRW<[HWWriteResGroup88], (instregex "PUSHF64")>;
+def: InstRW<[HWWriteResGroup88], (instregex "PUSHF(16|64)")>;
def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MUL_FPrST0")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MUL_FST0r")>;
-def: InstRW<[HWWriteResGroup89], (instregex "MUL_FrST0")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PCMPGTQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMADDUBSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMADDWDrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULHRSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULHUWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULHWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULLWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PMULUDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "PSADBWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RCPPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RCPSSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RSQRTPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "RSQRTSSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWYrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWrr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRCPPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRCPSSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTPSr")>;
-def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTSSr")>;
-
-def HWWriteResGroup90 : SchedWriteRes<[HWPort01]> {
- let Latency = 5;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup90], (instregex "MULPDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "MULPSrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "MULSDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "MULSSrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPDYrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPSYrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULPSrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULSDrr")>;
-def: InstRW<[HWWriteResGroup90], (instregex "VMULSSrr")>;
-def: InstRW<[HWWriteResGroup90],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
-
-def HWWriteResGroup91 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "RCPSSm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "RSQRTSSm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "VRCPSSm")>;
-def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTSSm")>;
-
-def HWWriteResGroup91_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup91_1], (instregex "SQRTSSm")>;
-def: InstRW<[HWWriteResGroup91_1], (instregex "VDIVSSrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr",
+ "MUL_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup91_2], (instregex "PCMPGTQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDUBSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDWDrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHRSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHUWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULLWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PMULUDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "PSADBWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "RCPPSm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "RSQRTPSm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPCMPGTQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDWDrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHRSWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHUWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULLWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULUDQrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VPSADBWrm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VRCPPSm")>;
-def: InstRW<[HWWriteResGroup91_2], (instregex "VRSQRTPSm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "(V?)PCMPGTQrm")>;
def HWWriteResGroup91_3 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 12;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F32m")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F64m")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDWDYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULDQYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHUWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULLWYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULUDQYrm")>;
-def: InstRW<[HWWriteResGroup91_3], (instregex "VPSADBWYrm")>;
-
-def HWWriteResGroup92 : SchedWriteRes<[HWPort01,HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup92], (instregex "MULPDrm")>;
-def: InstRW<[HWWriteResGroup92], (instregex "MULPSrm")>;
-def: InstRW<[HWWriteResGroup92], (instregex "VMULPDrm")>;
-def: InstRW<[HWWriteResGroup92], (instregex "VMULPSrm")>;
-def: InstRW<[HWWriteResGroup92],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>;
-
-def HWWriteResGroup92_1 : SchedWriteRes<[HWPort01,HWPort23]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPDYrm")>;
-def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPSYrm")>;
-def: InstRW<[HWWriteResGroup92_1],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
-
-def HWWriteResGroup92_2 : SchedWriteRes<[HWPort01,HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup92_2], (instregex "MULSDrm")>;
-def: InstRW<[HWWriteResGroup92_2], (instregex "MULSSrm")>;
-def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSDrm")>;
-def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSSrm")>;
-def: InstRW<[HWWriteResGroup92_2],
- (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F(32|64)m",
+ "VPCMPGTQYrm")>;
def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[HWWriteResGroup93], (instregex "CVTSI642SSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HADDPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HADDPSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HSUBPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "HSUBPSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSYrr")>;
-def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "(V?)CVTSI642SSrr")>;
def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> {
let Latency = 5;
@@ -3508,35 +1574,11 @@ def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> {
def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>;
def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup95], (instregex "MULX32rr")>;
-
-def HWWriteResGroup96 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup96], (instregex "HADDPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "HADDPSrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "HSUBPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "HSUBPSrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDrm")>;
-def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSrm")>;
-
-def HWWriteResGroup96_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPDYrm")>;
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPSYrm")>;
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPDYrm")>;
-def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup95], (instrs IMUL32r, MUL32r, MULX32rr)>;
def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
let Latency = 10;
@@ -3546,72 +1588,48 @@ def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>;
def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort23,HWPort06,HWPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[HWWriteResGroup98], (instregex "MULX32rm")>;
+def: InstRW<[HWWriteResGroup98], (instrs IMUL32m, MUL32m, MULX32rm)>;
def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[HWWriteResGroup99], (instregex "PAUSE")>;
+def: InstRW<[HWWriteResGroup99], (instrs PAUSE)>;
def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[HWWriteResGroup100], (instregex "XSETBV")>;
+def: InstRW<[HWWriteResGroup100], (instrs XSETBV)>;
def HWWriteResGroup101 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG8rr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(8|16|32|64)rr")>;
def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[HWWriteResGroup102], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr",
+ "VCVTPD2PSYrr",
+ "VCVT(T?)PD2DQYrr")>;
def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> {
let Latency = 13;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI16m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI32m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI16m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI32m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI16m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI32m")>;
-def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPDm")>;
-def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPSm")>;
-
-def HWWriteResGroup103_1 : SchedWriteRes<[HWPort1,HWPort23]> {
- let Latency = 12;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPSm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSSm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPSm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSDm")>;
-def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let Latency = 12;
@@ -3625,15 +1643,8 @@ def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[HWWriteResGroup105], (instregex "SHRD(16|32|64)rrCL")>;
-
-def HWWriteResGroup106 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
- let Latency = 7;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup106], (instregex "VCVTPS2PHYmr")>;
+def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> {
let Latency = 6;
@@ -3647,212 +1658,44 @@ def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> {
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[HWWriteResGroup108], (instregex "STD")>;
+def: InstRW<[HWWriteResGroup108], (instrs STD)>;
def HWWriteResGroup109 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 12;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,1,2];
}
-def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[HWWriteResGroup109], (instregex "SHRD(16|32|64)mrCL")>;
-
-def HWWriteResGroup110 : SchedWriteRes<[HWPort5]> {
- let Latency = 7;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup110], (instregex "AESDECLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "AESDECrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "AESENCLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "AESENCrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESDECLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESDECrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESENCLASTrr")>;
-def: InstRW<[HWWriteResGroup110], (instregex "VAESENCrr")>;
-
-def HWWriteResGroup111 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup111], (instregex "AESDECLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "AESDECrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "AESENCLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "AESENCrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESDECLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESDECrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESENCLASTrm")>;
-def: InstRW<[HWWriteResGroup111], (instregex "VAESENCrm")>;
-
-def HWWriteResGroup112 : SchedWriteRes<[HWPort0,HWPort5]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[HWWriteResGroup112], (instregex "MPSADBWrri")>;
-def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWYrri")>;
-def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWrri")>;
-
-def HWWriteResGroup113 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup113], (instregex "MPSADBWrmi")>;
-def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWrmi")>;
-
-def HWWriteResGroup113_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[HWWriteResGroup113_1], (instregex "VMPSADBWYrmi")>;
+def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 7;
let ResourceCycles = [2,2,1,2];
}
-def: InstRW<[HWWriteResGroup114], (instregex "LOOP")>;
+def: InstRW<[HWWriteResGroup114], (instrs LOOP)>;
def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI16m")>;
-def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI32m")>;
-
-def HWWriteResGroup116 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup116], (instregex "DPPDrri")>;
-def: InstRW<[HWWriteResGroup116], (instregex "VDPPDrri")>;
-
-def HWWriteResGroup117 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[HWWriteResGroup117], (instregex "DPPDrmi")>;
-def: InstRW<[HWWriteResGroup117], (instregex "VDPPDrmi")>;
-
-def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup118], (instregex "PMULLDrr")>;
-def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDYrr")>;
-def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDrr")>;
-
-def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup119], (instregex "PMULLDrm")>;
-def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDrm")>;
-
-def HWWriteResGroup119_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup119_1], (instregex "VPMULLDYrm")>;
+def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI(16|32)m")>;
def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
let Latency = 16;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,4,1,2];
}
-def: InstRW<[HWWriteResGroup120], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup120], (instregex "RCL8mCL")>;
-
-def HWWriteResGroup121 : SchedWriteRes<[HWPort0]> {
- let Latency = 11;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup121], (instregex "DIVPSrr")>;
-def: InstRW<[HWWriteResGroup121], (instregex "DIVSSrr")>;
-
-def HWWriteResGroup122 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup122], (instregex "DIVPSrm")>;
-
-def HWWriteResGroup122_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 16;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup122_1], (instregex "DIVSSrm")>;
-
-def HWWriteResGroup123 : SchedWriteRes<[HWPort0]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRIrr")>;
-def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRM128rr")>;
-
-def HWWriteResGroup124 : SchedWriteRes<[HWPort0,HWPort5]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup124], (instregex "PCLMULQDQrr")>;
-def: InstRW<[HWWriteResGroup124], (instregex "VPCLMULQDQrr")>;
-
-def HWWriteResGroup125 : SchedWriteRes<[HWPort0,HWPort015]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup125], (instregex "VRCPPSYr")>;
-def: InstRW<[HWWriteResGroup125], (instregex "VRSQRTPSYr")>;
-
-def HWWriteResGroup126 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRIrm")>;
-def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRM128rm")>;
-
-def HWWriteResGroup127 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup127], (instregex "PCLMULQDQrm")>;
-def: InstRW<[HWWriteResGroup127], (instregex "VPCLMULQDQrm")>;
-
-def HWWriteResGroup128 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
- let Latency = 18;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup128], (instregex "VRCPPSYm")>;
-def: InstRW<[HWWriteResGroup128], (instregex "VRSQRTPSYm")>;
+def: InstRW<[HWWriteResGroup120], (instregex "RCL(8|16|32|64)mCL")>;
def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,2,3];
}
-def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[HWWriteResGroup129], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
let Latency = 11;
@@ -3866,101 +1709,21 @@ def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[HWWriteResGroup131], (instregex "LOOPE")>;
-def: InstRW<[HWWriteResGroup131], (instregex "LOOPNE")>;
+def: InstRW<[HWWriteResGroup131], (instrs LOOPE, LOOPNE)>;
def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
let Latency = 17;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[HWWriteResGroup132], (instregex "CMPXCHG8B")>;
-
-def HWWriteResGroup133 : SchedWriteRes<[HWPort0]> {
- let Latency = 13;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup133], (instregex "SQRTPSr")>;
-def: InstRW<[HWWriteResGroup133], (instregex "SQRTSSr")>;
-def: InstRW<[HWWriteResGroup133], (instregex "VDIVPSrr")>;
-def: InstRW<[HWWriteResGroup133], (instregex "VDIVSSrr")>;
-
-def HWWriteResGroup134 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 19;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup134], (instregex "DIVSDrm")>;
-def: InstRW<[HWWriteResGroup134], (instregex "SQRTPSm")>;
-def: InstRW<[HWWriteResGroup134], (instregex "VDIVPSrm")>;
-def: InstRW<[HWWriteResGroup134], (instregex "VSQRTSSm")>;
+def: InstRW<[HWWriteResGroup132], (instrs CMPXCHG8B)>;
def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
let Latency = 19;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,3,1,3];
}
-def: InstRW<[HWWriteResGroup135], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[HWWriteResGroup135], (instregex "RCR8mCL")>;
-
-def HWWriteResGroup136 : SchedWriteRes<[HWPort0]> {
- let Latency = 14;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup136], (instregex "DIVPDrr")>;
-def: InstRW<[HWWriteResGroup136], (instregex "DIVSDrr")>;
-def: InstRW<[HWWriteResGroup136], (instregex "VSQRTPSr")>;
-def: InstRW<[HWWriteResGroup136], (instregex "VSQRTSSr")>;
-
-def HWWriteResGroup137 : SchedWriteRes<[HWPort5]> {
- let Latency = 14;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup137], (instregex "AESIMCrr")>;
-def: InstRW<[HWWriteResGroup137], (instregex "VAESIMCrr")>;
-
-def HWWriteResGroup138 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 20;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup138], (instregex "DIVPDrm")>;
-def: InstRW<[HWWriteResGroup138], (instregex "VSQRTPSm")>;
-
-def HWWriteResGroup139 : SchedWriteRes<[HWPort5,HWPort23]> {
- let Latency = 20;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup139], (instregex "AESIMCrm")>;
-def: InstRW<[HWWriteResGroup139], (instregex "VAESIMCrm")>;
-
-def HWWriteResGroup140 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup140], (instregex "DPPSrri")>;
-def: InstRW<[HWWriteResGroup140], (instregex "VDPPSYrri")>;
-def: InstRW<[HWWriteResGroup140], (instregex "VDPPSrri")>;
-
-def HWWriteResGroup141 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[HWWriteResGroup141], (instregex "DPPSrmi")>;
-def: InstRW<[HWWriteResGroup141], (instregex "VDPPSrmi")>;
-
-def HWWriteResGroup141_1 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
- let Latency = 21;
- let NumMicroOps = 5;
- let ResourceCycles = [2,1,1,1];
-}
-def: InstRW<[HWWriteResGroup141_1], (instregex "VDPPSYrmi")>;
+def: InstRW<[HWWriteResGroup135], (instregex "RCR(8|16|32|64)mCL")>;
def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
let Latency = 14;
@@ -3981,54 +1744,35 @@ def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort2
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[HWWriteResGroup144], (instregex "INSB")>;
-def: InstRW<[HWWriteResGroup144], (instregex "INSL")>;
-def: InstRW<[HWWriteResGroup144], (instregex "INSW")>;
+def: InstRW<[HWWriteResGroup144], (instrs INSB, INSL, INSW)>;
def HWWriteResGroup145 : SchedWriteRes<[HWPort5]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[HWWriteResGroup145], (instregex "VZEROALL")>;
+def: InstRW<[HWWriteResGroup145], (instrs VZEROALL)>;
def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 22;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[HWWriteResGroup146], (instregex "CMPXCHG16B")>;
+def: InstRW<[HWWriteResGroup146], (instrs CMPXCHG16B)>;
def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
let Latency = 17;
let NumMicroOps = 15;
let ResourceCycles = [2,1,2,4,2,4];
}
-def: InstRW<[HWWriteResGroup147], (instregex "XCH_F")>;
-
-def HWWriteResGroup148 : SchedWriteRes<[HWPort0,HWPort5,HWPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[HWWriteResGroup148], (instregex "PCMPESTRIrr")>;
-def: InstRW<[HWWriteResGroup148], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[HWWriteResGroup147], (instrs XCH_F)>;
def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[HWWriteResGroup149], (instregex "CPUID")>;
-def: InstRW<[HWWriteResGroup149], (instregex "RDTSC")>;
-
-def HWWriteResGroup150 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort0156]> {
- let Latency = 24;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[HWWriteResGroup150], (instregex "PCMPESTRIrm")>;
-def: InstRW<[HWWriteResGroup150], (instregex "VPCMPESTRIrm")>;
+def: InstRW<[HWWriteResGroup149], (instrs CPUID, RDTSC)>;
def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
let Latency = 23;
@@ -4037,240 +1781,127 @@ def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
}
def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>;
-def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[HWWriteResGroup152], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[HWWriteResGroup152], (instregex "VPCMPESTRM128rr")>;
-
-def HWWriteResGroup153 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015,HWPort0156]> {
- let Latency = 25;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[HWWriteResGroup153], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[HWWriteResGroup153], (instregex "VPCMPESTRM128rm")>;
-
def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> {
let Latency = 20;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup154], (instregex "DIV_FPrST0")>;
-def: InstRW<[HWWriteResGroup154], (instregex "DIV_FST0r")>;
-def: InstRW<[HWWriteResGroup154], (instregex "DIV_FrST0")>;
-def: InstRW<[HWWriteResGroup154], (instregex "SQRTPDr")>;
-def: InstRW<[HWWriteResGroup154], (instregex "SQRTSDr")>;
-def: InstRW<[HWWriteResGroup154], (instregex "VDIVPDrr")>;
-def: InstRW<[HWWriteResGroup154], (instregex "VDIVSDrr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 27;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F32m")>;
-def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F64m")>;
-def: InstRW<[HWWriteResGroup155], (instregex "VSQRTPDm")>;
-
-def HWWriteResGroup155_1 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 26;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup155_1], (instregex "SQRTPDm")>;
-def: InstRW<[HWWriteResGroup155_1], (instregex "VDIVPDrm")>;
-def: InstRW<[HWWriteResGroup155_1], (instregex "VSQRTSDm")>;
-
-def HWWriteResGroup155_2 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[HWWriteResGroup155_2], (instregex "SQRTSDm")>;
-def: InstRW<[HWWriteResGroup155_2], (instregex "VDIVSDrm")>;
+def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F(32|64)m")>;
def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> {
let Latency = 20;
let NumMicroOps = 10;
let ResourceCycles = [1,2,7];
}
-def: InstRW<[HWWriteResGroup156], (instregex "MWAITrr")>;
-
-def HWWriteResGroup157 : SchedWriteRes<[HWPort0]> {
- let Latency = 21;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[HWWriteResGroup157], (instregex "VSQRTPDr")>;
-def: InstRW<[HWWriteResGroup157], (instregex "VSQRTSDr")>;
-
-def HWWriteResGroup159 : SchedWriteRes<[HWPort0,HWPort015]> {
- let Latency = 21;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup159], (instregex "VDIVPSYrr")>;
-def: InstRW<[HWWriteResGroup159], (instregex "VSQRTPSYr")>;
-
-def HWWriteResGroup160 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
- let Latency = 28;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup160], (instregex "VDIVPSYrm")>;
-def: InstRW<[HWWriteResGroup160], (instregex "VSQRTPSYm")>;
+def: InstRW<[HWWriteResGroup156], (instrs MWAITrr)>;
def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 30;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI16m")>;
-def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI32m")>;
+def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI(16|32)m")>;
def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> {
let Latency = 24;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FPrST0")>;
-def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FST0r")>;
-def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FrST0")>;
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 31;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup163], (instregex "DIV_F32m")>;
-def: InstRW<[HWWriteResGroup163], (instregex "DIV_F64m")>;
+def: InstRW<[HWWriteResGroup163], (instregex "DIV_F(32|64)m")>;
def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
let Latency = 30;
let NumMicroOps = 27;
let ResourceCycles = [1,5,1,1,19];
}
-def: InstRW<[HWWriteResGroup164], (instregex "XSAVE64")>;
+def: InstRW<[HWWriteResGroup164], (instrs XSAVE64)>;
def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
let Latency = 31;
let NumMicroOps = 28;
let ResourceCycles = [1,6,1,1,19];
}
-def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT)?")>;
+def: InstRW<[HWWriteResGroup165], (instrs XSAVE)>;
+def: InstRW<[HWWriteResGroup165], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
let Latency = 34;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI16m")>;
-def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI32m")>;
-
-def HWWriteResGroup167 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015]> {
- let Latency = 34;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,1,1];
-}
-def: InstRW<[HWWriteResGroup167], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[HWWriteResGroup167], (instregex "VAESKEYGENASSIST128rm")>;
-
-def HWWriteResGroup168 : SchedWriteRes<[HWPort0,HWPort5,HWPort015]> {
- let Latency = 29;
- let NumMicroOps = 11;
- let ResourceCycles = [2,7,2];
-}
-def: InstRW<[HWWriteResGroup168], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[HWWriteResGroup168], (instregex "VAESKEYGENASSIST128rr")>;
+def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI(16|32)m")>;
def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)ri")>;
-def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)rr")>;
-def: InstRW<[HWWriteResGroup170], (instregex "IN8ri")>;
-def: InstRW<[HWWriteResGroup170], (instregex "IN8rr")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def HWWriteResGroup171 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
let Latency = 36;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)ir")>;
-def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)rr")>;
-def: InstRW<[HWWriteResGroup171], (instregex "OUT8ir")>;
-def: InstRW<[HWWriteResGroup171], (instregex "OUT8rr")>;
-
-def HWWriteResGroup172 : SchedWriteRes<[HWPort01,HWPort15,HWPort015,HWPort0156]> {
- let Latency = 31;
- let NumMicroOps = 31;
- let ResourceCycles = [8,1,21,1];
-}
-def: InstRW<[HWWriteResGroup172], (instregex "MMX_EMMS")>;
-
-def HWWriteResGroup173 : SchedWriteRes<[HWPort0,HWPort015]> {
- let Latency = 35;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup173], (instregex "VDIVPDYrr")>;
-def: InstRW<[HWWriteResGroup173], (instregex "VSQRTPDYr")>;
-
-def HWWriteResGroup174 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
- let Latency = 42;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[HWWriteResGroup174], (instregex "VDIVPDYrm")>;
-def: InstRW<[HWWriteResGroup174], (instregex "VSQRTPDYm")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> {
let Latency = 41;
let NumMicroOps = 18;
let ResourceCycles = [1,1,2,3,1,1,1,8];
}
-def: InstRW<[HWWriteResGroup175], (instregex "VMCLEARm")>;
+def: InstRW<[HWWriteResGroup175], (instrs VMCLEARm)>;
def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[HWWriteResGroup176], (instregex "RDTSCP")>;
+def: InstRW<[HWWriteResGroup176], (instrs RDTSCP)>;
def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> {
let Latency = 61;
let NumMicroOps = 64;
let ResourceCycles = [2,2,8,1,10,2,39];
}
-def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>;
-def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>;
+def: InstRW<[HWWriteResGroup177], (instrs FLDENVm)>;
def HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
let Latency = 64;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[HWWriteResGroup178], (instregex "FXRSTOR64")>;
+def: InstRW<[HWWriteResGroup178], (instrs FXRSTOR64)>;
def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
let Latency = 64;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[HWWriteResGroup179], (instregex "FXRSTOR")>;
+def: InstRW<[HWWriteResGroup179], (instrs FXRSTOR)>;
def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[HWWriteResGroup180], (instregex "FNINIT")>;
+def: InstRW<[HWWriteResGroup180], (instrs FNINIT)>;
def HWWriteResGroup181 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
let Latency = 98;
@@ -4291,8 +1922,7 @@ def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,
let NumMicroOps = 100;
let ResourceCycles = [9,9,11,8,1,11,21,30];
}
-def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>;
-def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>;
+def: InstRW<[HWWriteResGroup183], (instrs FSTENVm)>;
def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> {
let Latency = 26;
@@ -4364,4 +1994,6 @@ def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HW
def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm,
VGATHERDPSrm)>;
+def: InstRW<[WriteZero], (instrs CLC)>;
+
} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedPredicates.td b/contrib/llvm/lib/Target/X86/X86SchedPredicates.td
new file mode 100644
index 000000000000..27aaeb193583
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedPredicates.td
@@ -0,0 +1,49 @@
+//===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are common to
+// all X86 subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+// A predicate used to identify dependency-breaking instructions that clear the
+// content of the destination register. Note that this predicate only checks if
+// input registers are the same. This predicate doesn't make any assumptions on
+// the expected instruction opcodes, because different processors may implement
+// different zero-idioms.
+def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>;
+
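The zero-idiom predicate above looks only at the operands: an instruction is treated as dependency-breaking when its two source registers are identical (the classic `XORPS xmm0, xmm0` pattern), and which opcodes actually count as zero idioms is left to each processor model. A minimal standalone C++ sketch of that same-register check follows; the Operand/Inst records are invented for illustration and are not LLVM's MachineInstr/MachineOperand API.

#include <cassert>
#include <vector>

struct Operand {
  bool IsReg;     // true if this operand is a register
  unsigned Reg;   // register id (illustrative numbering)
};

struct Inst {
  std::vector<Operand> Ops;   // Ops[0] = destination, Ops[1..] = sources
};

// CheckSameRegOperand<1, 2>: compare the two source registers; the opcode
// itself is not inspected here, exactly as the predicate comment says.
bool isZeroIdiom(const Inst &MI) {
  const Operand &A = MI.Ops[1];
  const Operand &B = MI.Ops[2];
  return A.IsReg && B.IsReg && A.Reg == B.Reg;
}

int main() {
  Inst XorSame{{{true, 17}, {true, 17}, {true, 17}}};   // xor r17, r17
  Inst XorMixed{{{true, 17}, {true, 17}, {true, 18}}};  // xor r17, r18
  assert(isZeroIdiom(XorSame));
  assert(!isZeroIdiom(XorMixed));
  return 0;
}
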
+// A predicate used to check if an instruction is a LEA, and if it uses all
+// three source operands: base, index, and offset.
+def IsThreeOperandsLEAPredicate: CheckAll<[
+ CheckOpcode<[LEA32r, LEA64r, LEA64_32r, LEA16r]>,
+
+ // isRegOperand(Base)
+ CheckIsRegOperand<1>,
+ CheckNot<CheckInvalidRegOperand<1>>,
+
+ // isRegOperand(Index)
+ CheckIsRegOperand<3>,
+ CheckNot<CheckInvalidRegOperand<3>>,
+
+ // hasLEAOffset(Offset)
+ CheckAny<[
+ CheckAll<[
+ CheckIsImmOperand<4>,
+ CheckNot<CheckZeroOperand<4>>
+ ]>,
+ CheckNonPortable<"MI.getOperand(4).isGlobal()">
+ ]>
+]>;
+
+// This predicate evaluates to true only if the input machine instruction is a
+// 3-operand LEA. TableGen automatically generates a new method for it in
+// X86GenInstrInfo.
+def IsThreeOperandsLEAFn :
+ TIIPredicate<"X86", "isThreeOperandsLEA", IsThreeOperandsLEAPredicate>;
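For readers following the new predicate file: IsThreeOperandsLEAPredicate accepts an LEA only when the base register is a real register, the index register is a real register, and the displacement is either a non-zero immediate or a global address. A standalone C++ sketch of that operand test is below; the operand layout (0 = dst, 1 = base, 2 = scale, 3 = index, 4 = displacement, 5 = segment) mirrors the operand indices the predicate checks, but the Operand/Inst types are invented for the example and the opcode check (LEA16r/LEA32r/LEA64r/LEA64_32r) is omitted.

#include <cassert>
#include <vector>

struct Operand {
  enum Kind { Reg, Imm, Global } K;
  unsigned RegNo;     // register id, 0 meaning "no register"
  long long ImmVal;   // immediate value when K == Imm
};

struct Inst { std::vector<Operand> Ops; };

// CheckIsRegOperand + CheckNot<CheckInvalidRegOperand> for one operand.
static bool isValidReg(const Operand &Op) {
  return Op.K == Operand::Reg && Op.RegNo != 0;
}

// True only when the LEA really uses base, index and a non-trivial offset.
bool isThreeOperandsLEA(const Inst &MI) {
  const Operand &Base = MI.Ops[1];
  const Operand &Index = MI.Ops[3];
  const Operand &Disp = MI.Ops[4];
  bool HasOffset = (Disp.K == Operand::Imm && Disp.ImmVal != 0) ||
                   Disp.K == Operand::Global;
  return isValidReg(Base) && isValidReg(Index) && HasOffset;
}

int main() {
  // lea rax, [rbx + 4*rcx + 8]: base, index and offset are all present.
  Inst Full{{{Operand::Reg, 1, 0}, {Operand::Reg, 2, 0}, {Operand::Imm, 0, 4},
             {Operand::Reg, 3, 0}, {Operand::Imm, 0, 8}, {Operand::Reg, 0, 0}}};
  // lea rax, [rbx + 4*rcx]: zero displacement, so not a 3-operand LEA.
  Inst NoDisp{{{Operand::Reg, 1, 0}, {Operand::Reg, 2, 0}, {Operand::Imm, 0, 4},
               {Operand::Reg, 3, 0}, {Operand::Imm, 0, 0}, {Operand::Reg, 0, 0}}};
  assert(isThreeOperandsLEA(Full));
  assert(!isThreeOperandsLEA(NoDisp));
  return 0;
}
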
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 4466d30f14c7..3b543c680ef4 100644
--- a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -10,6 +10,10 @@
// This file defines the machine model for Sandy Bridge to support instruction
// scheduling and other instruction cost heuristics.
//
+// Note that we define some instructions here that are not supported by SNB,
+// but we still have to define them because SNB is the default subtarget for
+// X86. These instructions are tagged with a comment `Unsupported = 1`.
+//
//===----------------------------------------------------------------------===//
def SandyBridgeModel : SchedMachineModel {
@@ -18,7 +22,7 @@ def SandyBridgeModel : SchedMachineModel {
// FIXME: Identify instructions that aren't a single fused micro-op.
let IssueWidth = 4;
let MicroOpBufferSize = 168; // Based on the reorder buffer.
- let LoadLatency = 4;
+ let LoadLatency = 5;
let MispredictPenalty = 16;
// Based on the LSD (loop-stream detector) queue size.
@@ -60,10 +64,12 @@ def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
// Integer division issued on port 0.
def SBDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def SBFPDivider : ProcResource<1>;
-// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
-def : ReadAdvance<ReadAfterLd, 4>;
+def : ReadAdvance<ReadAfterLd, 5>;
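As a quick illustration of what the ReadAdvance above buys: a value read through ReadAfterLd is not needed until 5 cycles after the consuming load-op instruction issues, so the visible latency of the producing instruction shrinks by up to 5 cycles. The short C++ calculation below is a conceptual model of that adjustment, not LLVM's scheduler code, and the example latencies are only illustrative.

#include <algorithm>
#include <cstdio>

// Visible latency of a dependence whose consumer reads the value through a
// ReadAdvance: the operand is needed only `Advance` cycles after issue.
int effectiveLatency(int ProducerLatency, int Advance) {
  return std::max(0, ProducerLatency - Advance);
}

int main() {
  const int ReadAfterLd = 5;   // matches ReadAdvance<ReadAfterLd, 5> above
  // A 1-cycle ALU producer feeding the register operand of a folded-load
  // instruction is completely hidden behind the load.
  std::printf("1-cycle producer:  %d visible cycle(s)\n",
              effectiveLatency(1, ReadAfterLd));
  // A 3-cycle producer (e.g. an integer multiply) is hidden as well.
  std::printf("3-cycle producer:  %d visible cycle(s)\n",
              effectiveLatency(3, ReadAfterLd));
  // Only producers slower than 5 cycles keep the consumer waiting.
  std::printf("10-cycle producer: %d visible cycle(s)\n",
              effectiveLatency(10, ReadAfterLd));
  return 0;
}
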
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
@@ -71,129 +77,362 @@ def : ReadAdvance<ReadAfterLd, 4>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
- let Latency = !add(Lat, 4);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SBPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
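In other words, the multiclass derives the folded-load variant mechanically from the register variant: prepend SBPort23 to the port list, prepend a single cycle to ResourceCycles, add LoadLat (default 5) to the latency, and count one extra micro-op for the micro-fused load. A small standalone C++ sketch of that derivation follows; the WriteRes struct is invented for the example, and the sample numbers are taken from the WriteFDiv definition added later in this file.

#include <cstdio>
#include <string>
#include <vector>

struct WriteRes {
  std::vector<std::string> Ports;
  std::vector<int> ResourceCycles;
  int Latency;
  int NumMicroOps;
};

// Mirror of the !listconcat/!add logic in SBWriteResPair.
WriteRes foldedLoadVariant(WriteRes Reg, int LoadLat = 5) {
  WriteRes Mem = Reg;
  Mem.Ports.insert(Mem.Ports.begin(), "SBPort23");          // load port
  Mem.ResourceCycles.insert(Mem.ResourceCycles.begin(), 1); // one load cycle
  Mem.Latency += LoadLat;                                    // !add(Lat, LoadLat)
  Mem.NumMicroOps += 1;                                      // micro-fused load uop
  return Mem;
}

int main() {
  // WriteFDiv: ports [SBPort0, SBFPDivider], cycles [1,14], latency 14,
  // 1 uop, LoadLat 6 -- so the folded-load form ends up latency 20, 2 uops.
  WriteRes FDiv{{"SBPort0", "SBFPDivider"}, {1, 14}, 14, 1};
  WriteRes FDivLd = foldedLoadVariant(FDiv, 6);
  std::printf("WriteFDivLd: latency %d, uops %d, ports:", FDivLd.Latency,
              FDivLd.NumMicroOps);
  for (const std::string &P : FDivLd.Ports)
    std::printf(" %s", P.c_str());
  std::printf("\n");
  return 0;
}
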
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [SBPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort23,SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteStoreNT, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; }
+def : WriteRes<WriteMove, [SBPort015]>;
+def : WriteRes<WriteZero, []>;
-def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
-def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; }
-def : WriteRes<WriteMove, [SBPort015]>;
-def : WriteRes<WriteZero, []>;
+defm : SBWriteResPair<WriteALU, [SBPort015], 1>;
+defm : SBWriteResPair<WriteADC, [SBPort05,SBPort015], 2, [1,1], 2>;
+defm : SBWriteResPair<WriteIMul, [SBPort1], 3>;
+defm : SBWriteResPair<WriteIMul64, [SBPort1], 3>;
+
+defm : SBWriteResPair<WriteBSWAP32,[SBPort1], 1>;
+defm : SBWriteResPair<WriteBSWAP64,[SBPort1,SBPort05], 2, [1,1], 2>;
+
+defm : SBWriteResPair<WriteDiv8, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv16, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv32, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv64, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv8, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv16, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv32, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv64, [SBPort0, SBDivider], 25, [1, 10]>;
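One way to read the [1, 10] ResourceCycles above: each division occupies port 0 for a single cycle but keeps the SBDivider busy for ten, so in this model independent divisions can only start about once every ten cycles even though the latency is 25. The standalone C++ sketch below computes that reciprocal-throughput bound under the simplifying assumption of one unit per listed resource; it is an illustration of the model, not LLVM's llvm-mca implementation.

#include <algorithm>
#include <cstdio>
#include <vector>

// The busiest resource bounds how often independent operations can start,
// assuming each listed resource has a single unit.
double reciprocalThroughput(const std::vector<int> &ResourceCycles) {
  int Busiest = 0;
  for (int Cycles : ResourceCycles)
    Busiest = std::max(Busiest, Cycles);
  return static_cast<double>(Busiest);
}

int main() {
  // WriteIDiv32 above: [SBPort0, SBDivider] with ResourceCycles [1, 10].
  std::printf("IDIV32 rthroughput ~ %.1f cycles\n",
              reciprocalThroughput({1, 10}));
  // WriteFDiv (defined further down): [SBPort0, SBFPDivider] with [1, 14].
  std::printf("DIVSS  rthroughput ~ %.1f cycles\n",
              reciprocalThroughput({1, 14}));
  return 0;
}
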
-defm : SBWriteResPair<WriteALU, SBPort015, 1>;
-defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-defm : SBWriteResPair<WriteShift, SBPort05, 1>;
-defm : SBWriteResPair<WriteJump, SBPort5, 1>;
+
+defm : SBWriteResPair<WriteShift, [SBPort05], 1>;
+defm : SBWriteResPair<WriteShiftDouble, [SBPort05], 1>;
+defm : SBWriteResPair<WriteJump, [SBPort5], 1>;
+defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>;
+
+defm : SBWriteResPair<WriteCMOV, [SBPort05,SBPort015], 2, [1,1], 2>; // Conditional move.
+defm : SBWriteResPair<WriteCMOV2, [SBPort05,SBPort015], 3, [2,1], 3>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [SBPort5,SBPort05], 3, [2,1], 3>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SBPort05]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [SBPort05]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
// the port to read all inputs. We don't model that.
-def : WriteRes<WriteLEA, [SBPort15]>;
+def : WriteRes<WriteLEA, [SBPort01]>;
-// This is quite rough, latency depends on the dividend.
-def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
+// Bit counts.
+defm : SBWriteResPair<WriteBSF, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteBSR, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteLZCNT, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteTZCNT, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WritePOPCNT, [SBPort1], 3, [1], 1, 6>;
+
+// BMI1 BEXTR, BMI2 BZHI
+// NOTE: These don't exist on Sandy Bridge. Ports are guesses.
+defm : SBWriteResPair<WriteBEXTR, [SBPort05,SBPort1], 2, [1,1], 2>;
+defm : SBWriteResPair<WriteBZHI, [SBPort1], 1>;
// Scalar and vector floating point.
-defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
-defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteFDiv, SBPort0, 24>;
-defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
-defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
-defm : SBWriteResPair<WriteFSqrt, SBPort0, 14>;
-defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
-defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
-defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
-defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>;
-defm : SBWriteResPair<WriteFBlend, SBPort05, 1>;
-def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
- let Latency = 2;
- let ResourceCycles = [1, 1];
-}
-def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1];
-}
+defm : X86WriteRes<WriteFLD0, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SBPort0,SBPort5], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFLDC, [SBPort0,SBPort1], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFLoad, [SBPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SBPort015], 31, [31], 31>;
+
+defm : SBWriteResPair<WriteFAdd, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAddX, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAddY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFAddZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFAdd64, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAdd64X, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAdd64Y, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFAdd64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFCmp, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmpX, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmpY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFCmpZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFCmp64, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmp64X, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmp64Y, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFCmp64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFCom, [SBPort1], 3>;
+
+defm : SBWriteResPair<WriteFMul, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMulX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMulY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFMul64, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMul64X, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMul64Y, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMul64Z, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFDiv, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFDivX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFDivY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>;
+defm : SBWriteResPair<WriteFDivZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFDiv64, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>;
+defm : SBWriteResPair<WriteFDiv64X, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>;
+defm : SBWriteResPair<WriteFDiv64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>;
+defm : SBWriteResPair<WriteFDiv64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFRcp, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRcpX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRcpY, [SBPort0,SBPort05], 7, [2,1], 3, 7>;
+defm : SBWriteResPair<WriteFRcpZ, [SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFRsqrt, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRsqrtX,[SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRsqrtY,[SBPort0,SBPort05], 7, [2,1], 3, 7>;
+defm : SBWriteResPair<WriteFRsqrtZ,[SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFSqrt, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFSqrtX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFSqrtY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>;
+defm : SBWriteResPair<WriteFSqrtZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSqrt64, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>;
+defm : SBWriteResPair<WriteFSqrt64X, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>;
+defm : SBWriteResPair<WriteFSqrt64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>;
+defm : SBWriteResPair<WriteFSqrt64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSqrt80, [SBPort0,SBFPDivider], 24, [1,24], 1, 6>;
+
+defm : SBWriteResPair<WriteDPPD, [SBPort0,SBPort1,SBPort5], 9, [1,1,1], 3, 6>;
+defm : SBWriteResPair<WriteDPPS, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 6>;
+defm : SBWriteResPair<WriteDPPSY, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>;
+defm : SBWriteResPair<WriteDPPSZ, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSign, [SBPort5], 1>;
+defm : SBWriteResPair<WriteFRnd, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRndY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFRndZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFLogic, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFLogicY, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFLogicZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFTest, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFTestY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFTestZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFShuffleY,[SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFVarShuffle, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFVarShuffleY,[SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFVarShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFBlend, [SBPort05], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFBlendY, [SBPort05], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFBlendZ, [SBPort05], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFVarBlend, [SBPort05], 2, [2], 2, 6>;
+defm : SBWriteResPair<WriteFVarBlendY,[SBPort05], 2, [2], 2, 7>;
+defm : SBWriteResPair<WriteFVarBlendZ,[SBPort05], 2, [2], 2, 7>; // Unsupported = 1
+
+// Conversion between integer and float.
+defm : SBWriteResPair<WriteCvtSS2I, [SBPort0,SBPort1], 5, [1,1], 2>;
+defm : SBWriteResPair<WriteCvtPS2I, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteCvtPS2IY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteCvtPS2IZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteCvtSD2I, [SBPort0,SBPort1], 5, [1,1], 2>;
+defm : SBWriteResPair<WriteCvtPD2I, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : X86WriteRes<WriteCvtPD2IY, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPD2IZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPD2IYLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPD2IZLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtI2SS, [SBPort1,SBPort5], 5, [1,2], 3>;
+defm : X86WriteRes<WriteCvtI2SSLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : SBWriteResPair<WriteCvtI2PS, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteCvtI2PSY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteCvtI2PSZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2SD, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PD, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDY, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2SDLd, [SBPort1,SBPort23], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtI2PDYLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtI2PDZLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteCvtSS2SD, [SBPort0], 1, [1], 1, 6>;
+defm : X86WriteRes<WriteCvtPS2PD, [SBPort0,SBPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDY, [SBPort0,SBPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZ, [SBPort0,SBPort5], 2, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PDLd, [SBPort0,SBPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDYLd, [SBPort0,SBPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZLd, [SBPort0,SBPort23], 7, [1,1], 2>; // Unsupported = 1
+defm : SBWriteResPair<WriteCvtSD2SS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteCvtPD2PS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteCvtPD2PSY, [SBPort1,SBPort5], 4, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteCvtPD2PSZ, [SBPort1,SBPort5], 4, [1,1], 2, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteCvtPH2PS, [SBPort1], 3>;
+defm : SBWriteResPair<WriteCvtPH2PSY, [SBPort1], 3>;
+defm : SBWriteResPair<WriteCvtPH2PSZ, [SBPort1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPS2PH, [SBPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SBPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [SBPort1], 3, [1], 1>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PHSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>; // Unsupported = 1
// Vector integer operations.
-defm : SBWriteResPair<WriteVecShift, SBPort5, 1>;
-defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>;
-defm : SBWriteResPair<WriteVecALU, SBPort1, 3>;
-defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteShuffle, SBPort5, 1>;
-defm : SBWriteResPair<WriteBlend, SBPort15, 1>;
-def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
+defm : X86WriteRes<WriteVecLoad, [SBPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SBPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SBPort5], 1, [1], 1>;
+
+defm : SBWriteResPair<WriteVecLogic, [SBPort015], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecLogicX,[SBPort015], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecLogicY,[SBPort015], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecLogicZ,[SBPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecTest, [SBPort0,SBPort5], 2, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteVecTestY, [SBPort0,SBPort5], 2, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteVecTestZ, [SBPort0,SBPort5], 2, [1,1], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecALU, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecALUX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecALUY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecALUZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecIMulX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecIMulY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecIMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePMULLD, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WritePMULLDY, [SBPort0], 5, [1], 1, 7>; // TODO this is probably wrong for 256/512-bit for the "generic" model
+defm : SBWriteResPair<WritePMULLDZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteShuffle, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteShuffleX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteShuffleY, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteShuffleZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarShuffle, [SBPort15], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVarShuffleX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVarShuffleY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffleZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteBlend, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteBlendY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteBlendZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarBlend, [SBPort15], 2, [2], 2, 6>;
+defm : SBWriteResPair<WriteVarBlendY,[SBPort15], 2, [2], 2, 7>;
+defm : SBWriteResPair<WriteVarBlendZ,[SBPort15], 2, [2], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 7, [1,2], 3, 6>;
+defm : SBWriteResPair<WriteMPSADY, [SBPort0, SBPort15], 7, [1,2], 3, 7>;
+defm : SBWriteResPair<WriteMPSADZ, [SBPort0, SBPort15], 7, [1,2], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePSADBW, [SBPort0], 5, [1], 1, 5>;
+defm : SBWriteResPair<WritePSADBWX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WritePSADBWY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WritePSADBWZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePHMINPOS, [SBPort0], 5, [1], 1, 6>;
+
+// Vector integer shifts.
+defm : SBWriteResPair<WriteVecShift, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecShiftX, [SBPort0,SBPort15], 2, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteVecShiftY, [SBPort0,SBPort15], 4, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteVecShiftZ, [SBPort0,SBPort15], 4, [1,1], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecShiftImm, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecShiftImmX, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecShiftImmY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecShiftImmZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarVecShift, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVarVecShiftY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarVecShiftZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SBPort5,SBPort15]> {
let Latency = 2;
- let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1];
+def : WriteRes<WriteVecInsertLd, [SBPort23,SBPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSAD, [SBPort0,SBPort15]> {
+
+def : WriteRes<WriteVecExtract, [SBPort0,SBPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [SBPort4,SBPort23,SBPort15]> {
let Latency = 5;
let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def : WriteRes<WriteMPSADLd, [SBPort0,SBPort23,SBPort15]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
}
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SBPort1]> {
- let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SBPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SBWriteResPair<WriteFHAdd, [SBPort1,SBPort5], 5, [1,2], 3, 6>;
+defm : SBWriteResPair<WriteFHAddY, [SBPort1,SBPort5], 5, [1,2], 3, 7>;
+defm : SBWriteResPair<WriteFHAddZ, [SBPort1,SBPort5], 5, [1,2], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePHAdd, [SBPort15], 3, [3], 3, 5>;
+defm : SBWriteResPair<WritePHAddX, [SBPort15], 3, [3], 3, 6>;
+defm : SBWriteResPair<WritePHAddY, [SBPort15], 3, [3], 3, 7>;
+defm : SBWriteResPair<WritePHAddZ, [SBPort15], 3, [3], 3, 7>; // Unsupported = 1
+////////////////////////////////////////////////////////////////////////////////
// String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
// Packed Compare Implicit Length Strings, Return Mask
-def : WriteRes<WritePCmpIStrM, [SBPort015]> {
+def : WriteRes<WritePCmpIStrM, [SBPort0]> {
let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
+def : WriteRes<WritePCmpIStrMLd, [SBPort0, SBPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
}
// Packed Compare Explicit Length Strings, Return Mask
@@ -228,6 +467,12 @@ def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
let ResourceCycles = [7, 1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SBPort0]> { let Latency = 1; }
+
// AES Instructions.
def : WriteRes<WriteAESDecEnc, [SBPort5,SBPort015]> {
let Latency = 7;
@@ -270,6 +515,10 @@ def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
let ResourceCycles = [17, 1];
}
+// Load/store MXCSR.
+// FIXME: This is probably wrong. Only STMXCSR should require Port4.
+def : WriteRes<WriteLDMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; }
def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
@@ -278,624 +527,107 @@ def : WriteRes<WriteNop, []>;
// AVX2/FMA is not supported on that architecture, but we should define the basic
// scheduling resources anyway.
-defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>;
-defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>;
-defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>;
-defm : SBWriteResPair<WriteFMA, SBPort01, 5>;
+defm : SBWriteResPair<WriteFShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFVarShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMA, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAX, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAY, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAZ, [SBPort01], 5>; // Unsupported = 1
// Remaining SNB instrs.
-def SBWriteResGroup0 : SchedWriteRes<[SBPort0]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup0], (instregex "CVTSS2SDrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSLLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSLLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSLLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRADri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRAWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "PSRLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPMOVMSKBrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSLLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSLLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSLLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRADri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRAWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRLDri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRLQri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VPSRLWri")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDYrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSYrr")>;
-def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSrr")>;
-
def SBWriteResGroup1 : SchedWriteRes<[SBPort1]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup1], (instregex "COMP_FST0r")>;
-def: InstRW<[SBWriteResGroup1], (instregex "COM_FST0r")>;
-def: InstRW<[SBWriteResGroup1], (instregex "UCOM_FPr")>;
-def: InstRW<[SBWriteResGroup1], (instregex "UCOM_Fr")>;
+def: InstRW<[SBWriteResGroup1], (instrs COMP_FST0r,
+ COM_FST0r,
+ UCOM_FPr,
+ UCOM_Fr)>;
def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup2], (instregex "ANDNPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ANDNPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ANDPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ANDPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FDECSTP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FFREE")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FINCSTP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "FNOP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "INSERTPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JAE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JAE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JA_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JA_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JBE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JBE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JB_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JB_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JGE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JGE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JG_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JG_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JLE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JLE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JL_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JL_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JMP64r")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JMP_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JMP_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNE_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNE_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNO_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNO_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNP_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNP_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNS_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JNS_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JO_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JO_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JP_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JP_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JS_1")>;
-def: InstRW<[SBWriteResGroup2], (instregex "JS_4")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LD_Frr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LOOP")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LOOPE")>;
-def: InstRW<[SBWriteResGroup2], (instregex "LOOPNE")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOV64toPQIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVAPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVAPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVDDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVHLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVLHPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSHDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVSSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVUPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "MOVUPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ORPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "RETQ")>;
-def: InstRW<[SBWriteResGroup2], (instregex "SHUFPDrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "SHUFPSrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ST_FPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "ST_Frr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VANDPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VINSERTF128rr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VINSERTPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVSSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VORPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERM2F128rr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDYrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSYrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSrri")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPDYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPSYrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "VXORPSrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "XORPDrr")>;
-def: InstRW<[SBWriteResGroup2], (instregex "XORPSrr")>;
-
-def SBWriteResGroup3 : SchedWriteRes<[SBPort01]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup3], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[SBWriteResGroup2], (instrs FDECSTP, FINCSTP, FFREE, FFREEP, FNOP,
+ LD_Frr, ST_Frr, ST_FPrr)>;
+def: InstRW<[SBWriteResGroup2], (instrs LOOP, LOOPE, LOOPNE)>; // FIXME: This seems wrong compared to other Intel CPUs.
+def: InstRW<[SBWriteResGroup2], (instrs RETQ)>;
def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup4], (instregex "BLENDPDrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BLENDPSrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "CDQ")>;
-def: InstRW<[SBWriteResGroup4], (instregex "CQO")>;
-def: InstRW<[SBWriteResGroup4], (instregex "LAHF")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SAHF")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SAR8ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETAEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETBr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETGEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETGr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETLEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETLr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNEr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNOr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNPr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETNSr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETOr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETPr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SETSr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL8r1")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHL8ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "SHR8ri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDYrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSYrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSrri")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQAYrr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQArr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUYrr")>;
-def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUrr")>;
+def: InstRW<[SBWriteResGroup4], (instrs CDQ, CQO)>;
+def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PADDQirr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PABSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PABSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PABSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKSSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKSSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKUSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PACKUSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PADDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PALIGNRrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PAVGBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PAVGWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PBLENDWrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMAXUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMINUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFDri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFHWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSHUFLWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSIGNBrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSIGNDrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSIGNWrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSLLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSRLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PSUBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPABSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPABSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPABSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSWBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPADDWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPALIGNRrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPAVGBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPAVGWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPBLENDWrri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINSDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINUBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINUDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMINUWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFDri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFHWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFLWri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNBrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNDrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNWrr128")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSLLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSRLDQri")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSBrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPSUBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLWDrr")>;
-
-def SBWriteResGroup6 : SchedWriteRes<[SBPort015]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "ADD8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "AND8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CBW")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMC")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CMP8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "CWDE")>;
-def: InstRW<[SBWriteResGroup6], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "DEC8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "INC(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "INC8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOV(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOV8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOV8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVDQArr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVDQUrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NEG8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "NOT8r")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "OR8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PANDNrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PANDrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "PXORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "STC")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "SUB8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "TEST8rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPANDNrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPANDrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "VPXORrr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR8i8")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR8ri")>;
-def: InstRW<[SBWriteResGroup6], (instregex "XOR8rr")>;
-
-def SBWriteResGroup7 : SchedWriteRes<[SBPort0]> {
- let Latency = 2;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPDrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPSrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "MOVPQIto64rr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "PMOVMSKBrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[SBWriteResGroup7], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABS(B|D|W)rr",
+ "MMX_PADDQirr",
+ "MMX_PALIGNRrri",
+ "MMX_PSIGN(B|D|W)rr")>;
def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPDrr0")>;
-def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPSrr0")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROL8ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "ROR8ri")>;
-def: InstRW<[SBWriteResGroup9], (instregex "SETAr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "SETBEr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDrr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSrr")>;
-
-def SBWriteResGroup10 : SchedWriteRes<[SBPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SBWriteResGroup10], (instregex "VPBLENDVBrr")>;
+def: InstRW<[SBWriteResGroup9], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri",
+ "SET(A|BE)r")>;
def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SBWriteResGroup11], (instregex "SCASB")>;
-def: InstRW<[SBWriteResGroup11], (instregex "SCASL")>;
-def: InstRW<[SBWriteResGroup11], (instregex "SCASQ")>;
-def: InstRW<[SBWriteResGroup11], (instregex "SCASW")>;
+def: InstRW<[SBWriteResGroup11], (instrs SCASB,
+ SCASL,
+ SCASQ,
+ SCASW)>;
def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort1]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup12], (instregex "COMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "COMISSrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "UCOMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "UCOMISSrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VCOMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VCOMISSrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISDrr")>;
-def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISSrr")>;
-
-def SBWriteResGroup13 : SchedWriteRes<[SBPort0,SBPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup13], (instregex "CVTPS2PDrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "PTESTrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VPTESTYrr")>;
-def: InstRW<[SBWriteResGroup13], (instregex "VPTESTrr")>;
-
-def SBWriteResGroup14 : SchedWriteRes<[SBPort0,SBPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup14], (instregex "PSLLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSLLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSLLWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRADrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRAWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "PSRLWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSLLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSLLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSLLWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRADrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRAWrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRLDrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRLQrr")>;
-def: InstRW<[SBWriteResGroup14], (instregex "VPSRLWrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "(V?)COMISDrr",
+ "(V?)COMISSrr",
+ "(V?)UCOMISDrr",
+ "(V?)UCOMISSrr")>;
def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup15], (instregex "CWD")>;
-def: InstRW<[SBWriteResGroup15], (instregex "FNSTSW16r")>;
-
-def SBWriteResGroup16 : SchedWriteRes<[SBPort1,SBPort05]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup16], (instregex "BSWAP(16|32|64)r")>;
-
-def SBWriteResGroup17 : SchedWriteRes<[SBPort5,SBPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRBrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRDrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRQrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "PINSRWrri")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRBrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRDrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRQrr")>;
-def: InstRW<[SBWriteResGroup17], (instregex "VPINSRWrri")>;
+def: InstRW<[SBWriteResGroup15], (instrs CWD,
+ FNSTSW16r)>;
def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup18], (instregex "JRCXZ")>;
+def: InstRW<[SBWriteResGroup18], (instrs JCXZ, JECXZ, JRCXZ)>;
def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>;
def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> {
@@ -903,300 +635,84 @@ def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "ADC8ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "ADC8rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB8ri")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SBB8rr")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[SBWriteResGroup19], (instregex "SHRD(16|32|64)rri8")>;
-
-def SBWriteResGroup20 : SchedWriteRes<[SBPort0]> {
- let Latency = 3;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMADDUBSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMADDWDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULHRSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULHUWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULHWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULLDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULLWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PMULUDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "PSADBWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMADDWDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULHRSWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULHUWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULHWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULLDrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULLWrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPMULUDQrr")>;
-def: InstRW<[SBWriteResGroup20], (instregex "VPSADBWrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8")>;
def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup21], (instregex "ADDPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADD_FPrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADD_FST0r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ADD_FrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPPDrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPPSrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CMPSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r8")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r64")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CVTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "MUL8r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "POPCNT(16|32|64)rr")>;
def: InstRW<[SBWriteResGroup21], (instregex "PUSHFS64")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FPrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FST0r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUBSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUB_FPrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUB_FST0r")>;
-def: InstRW<[SBWriteResGroup21], (instregex "SUB_FrST0")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDYrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSYrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSrri")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCMPSSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPDr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPSr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSYrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBSDrr")>;
-def: InstRW<[SBWriteResGroup21], (instregex "VSUBSSrr")>;
-def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
- let Latency = 3;
+def SBWriteResGroup21_16i : SchedWriteRes<[SBPort1, SBPort015]> {
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup22], (instregex "EXTRACTPSrr")>;
-def: InstRW<[SBWriteResGroup22], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[SBWriteResGroup21_16i], (instrs IMUL16rri, IMUL16rri8)>;
-def SBWriteResGroup23 : SchedWriteRes<[SBPort0,SBPort15]> {
+def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
let Latency = 3;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRBrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRDrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRQrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "PEXTRWri")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRBrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRDrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRQrr")>;
-def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRWri")>;
+def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>;
def SBWriteResGroup23_2 : SchedWriteRes<[SBPort05]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROL8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "ROR8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SAR8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHL8rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[SBWriteResGroup23_2], (instregex "SHR8rCL")>;
-
-def SBWriteResGroup24 : SchedWriteRes<[SBPort15]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBSWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBWrr64")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHADDDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHADDSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHADDWrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHSUBDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHSUBSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "PHSUBWrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHADDDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHADDSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHADDWrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBDrr")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBWrr")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def SBWriteResGroup25 : SchedWriteRes<[SBPort015]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SBWriteResGroup25], (instregex "ADC8i8")>;
-def: InstRW<[SBWriteResGroup25], (instregex "LEAVE64")>;
-def: InstRW<[SBWriteResGroup25], (instregex "OUT32rr")>;
-def: InstRW<[SBWriteResGroup25], (instregex "OUT8rr")>;
-def: InstRW<[SBWriteResGroup25], (instregex "SBB8i8")>;
-def: InstRW<[SBWriteResGroup25], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup25], (instregex "XADD8rr")>;
-
-def SBWriteResGroup25_2 : SchedWriteRes<[SBPort5,SBPort05]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVBE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVB_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNBE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNB_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNE_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNP_F")>;
-def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVP_F")>;
+def: InstRW<[SBWriteResGroup25], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
-def SBWriteResGroup26 : SchedWriteRes<[SBPort05,SBPort015]> {
- let Latency = 3;
+def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup26], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup26], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup25_1], (instrs LEAVE, LEAVE64)>;
def SBWriteResGroup26_2 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIPr")>;
-def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIr")>;
-def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIPr")>;
-def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIr")>;
+def: InstRW<[SBWriteResGroup26_2], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
def SBWriteResGroup27 : SchedWriteRes<[SBPort0,SBPort1]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup27], (instregex "MUL(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup27], (instrs IMUL64r, MUL64r)>;
-def SBWriteResGroup28 : SchedWriteRes<[SBPort1,SBPort5]> {
+def SBWriteResGroup27_1 : SchedWriteRes<[SBPort1,SBPort05,SBPort015]> {
let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup28], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2DQrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2PSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTSD2SSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTSI642SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTSI2SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQYrr")>;
-def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup27_1], (instrs IMUL32r, MUL32r)>;
+
+def SBWriteResGroup27_2 : SchedWriteRes<[SBPort1,SBPort05,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup27_2], (instrs IMUL16r, MUL16r)>;
def SBWriteResGroup29 : SchedWriteRes<[SBPort1,SBPort015]> {
let Latency = 4;
@@ -1210,288 +726,97 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> {
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup29_2], (instregex "OUT32ir")>;
-def: InstRW<[SBWriteResGroup29_2], (instregex "OUT8ir")>;
-def: InstRW<[SBWriteResGroup29_2], (instregex "PAUSE")>;
+def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>;
def SBWriteResGroup29_3 : SchedWriteRes<[SBPort05,SBPort015]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [3,1];
}
-def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[SBWriteResGroup29_3], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup30], (instregex "MULPDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MULPSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MULSDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MULSSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MUL_FPrST0")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MUL_FST0r")>;
-def: InstRW<[SBWriteResGroup30], (instregex "MUL_FrST0")>;
-def: InstRW<[SBWriteResGroup30], (instregex "PCMPGTQrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RCPPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RCPSSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RSQRTPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "RSQRTSSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPDYrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPSYrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULPSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULSDrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VMULSSrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VPCMPGTQrr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRCPPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRCPSSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTPSr")>;
-def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTSSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "(V?)PCMPGTQrr")>;
def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> {
let Latency = 5;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup31], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOV8rm")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[SBWriteResGroup31], (instregex "PREFETCH")>;
-
-def SBWriteResGroup32 : SchedWriteRes<[SBPort0,SBPort1]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)")>;
def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup33], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOV8mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVAPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVAPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVDQAmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVDQUmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVHPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVHPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVLPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVLPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTDQmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTI_64mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVPDI2DImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVPQI2QImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVPQIto64mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVSDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVSSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVUPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "MOVUPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "PUSH64i8")>;
-def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVSDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVSSmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSYmr")>;
-def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSmr")>;
-
-def SBWriteResGroup34 : SchedWriteRes<[SBPort0,SBPort15]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SBWriteResGroup34], (instregex "MPSADBWrri")>;
-def: InstRW<[SBWriteResGroup34], (instregex "VMPSADBWrri")>;
+def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16r|32r|64r|64i8)")>;
def SBWriteResGroup35 : SchedWriteRes<[SBPort1,SBPort5]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup35], (instregex "CLI")>;
-def: InstRW<[SBWriteResGroup35], (instregex "CVTSI642SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "CVTSI2SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HADDPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HADDPSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HSUBPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "HSUBPSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSYrr")>;
-def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSrr")>;
+def: InstRW<[SBWriteResGroup35], (instrs CLI)>;
def SBWriteResGroup35_2 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP16m")>;
-def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP32m")>;
-def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP64m")>;
-def: InstRW<[SBWriteResGroup35_2], (instregex "PUSHGS64")>;
+def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP(16|32|64)m",
+ "PUSHGS64")>;
def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup36], (instregex "CALL64pcrel32")>;
-def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup36], (instregex "EXTRACTPSmr")>;
-def: InstRW<[SBWriteResGroup36], (instregex "VEXTRACTPSmr")>;
-
-def SBWriteResGroup37 : SchedWriteRes<[SBPort4,SBPort01,SBPort23]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSmr")>;
-
-def SBWriteResGroup38 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup38], (instregex "SETAEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETBm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETGEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETGm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETLEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETLm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNEm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNOm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNPm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETNSm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETOm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETPm")>;
-def: InstRW<[SBWriteResGroup38], (instregex "SETSm")>;
-
-def SBWriteResGroup39 : SchedWriteRes<[SBPort4,SBPort23,SBPort15]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup39], (instregex "PEXTRBmr")>;
-def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRBmr")>;
-def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRDmr")>;
-def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRWmr")>;
+def: InstRW<[SBWriteResGroup36], (instrs CALL64pcrel32)>;
+def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r",
+ "(V?)EXTRACTPSmr")>;
def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup40], (instregex "MOV8mi")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSB")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSL")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSQ")>;
-def: InstRW<[SBWriteResGroup40], (instregex "STOSW")>;
+def: InstRW<[SBWriteResGroup40], (instrs STOSB, STOSL, STOSQ, STOSW)>;
def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup41], (instregex "FNINIT")>;
+def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>;
def SBWriteResGroup42 : SchedWriteRes<[SBPort05,SBPort015]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG8rr")>;
+def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(8|16|32|64)rr")>;
def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
- let Latency = 5;
+ let Latency = 3;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SBWriteResGroup43], (instregex "SETAm")>;
-def: InstRW<[SBWriteResGroup43], (instregex "SETBEm")>;
-
-def SBWriteResGroup44 : SchedWriteRes<[SBPort0,SBPort4,SBPort5,SBPort23]> {
- let Latency = 5;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SBWriteResGroup44], (instregex "LDMXCSR")>;
-def: InstRW<[SBWriteResGroup44], (instregex "STMXCSR")>;
-def: InstRW<[SBWriteResGroup44], (instregex "VLDMXCSR")>;
-def: InstRW<[SBWriteResGroup44], (instregex "VSTMXCSR")>;
+def: InstRW<[SBWriteResGroup43], (instregex "SET(A|BE)m")>;
def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SBWriteResGroup45], (instregex "PEXTRDmr")>;
-def: InstRW<[SBWriteResGroup45], (instregex "PEXTRQmr")>;
-def: InstRW<[SBWriteResGroup45], (instregex "VPEXTRQmr")>;
-def: InstRW<[SBWriteResGroup45], (instregex "PUSHF16")>;
-def: InstRW<[SBWriteResGroup45], (instregex "PUSHF64")>;
+def: InstRW<[SBWriteResGroup45], (instregex "(V?)PEXTR(D|Q)mr",
+ "PUSHF(16|64)")>;
def SBWriteResGroup46 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
let Latency = 5;
@@ -1512,49 +837,23 @@ def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup48], (instregex "LDDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOV64toPQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVAPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVAPSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVNTDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSHDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSLDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVSSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVUPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "MOVUPSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUYrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQUrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVNTDQArm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVSSrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPDrm")>;
-def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm",
+ "POP(16|32|64)r",
+ "VBROADCASTSSrm",
+ "(V?)MOV64toPQIrm",
+ "(V?)MOVDDUPrm",
+ "(V?)MOVDI2PDIrm",
+ "(V?)MOVQI2PQIrm",
+ "(V?)MOVSDrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "(V?)MOVSSrm")>;
def SBWriteResGroup49 : SchedWriteRes<[SBPort5,SBPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup49], (instregex "JMP(16|32|64)m")>;
def: InstRW<[SBWriteResGroup49], (instregex "MOV16sm")>;
def SBWriteResGroup50 : SchedWriteRes<[SBPort23,SBPort05]> {
@@ -1569,153 +868,42 @@ def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABS(B|D|W)rm",
+ "MMX_PALIGNRrmi",
+ "MMX_PSIGN(B|D|W)rm")>;
def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup52], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "ADD8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "AND8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP8mi")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP8mr")>;
-def: InstRW<[SBWriteResGroup52], (instregex "CMP8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "LODSL")>;
-def: InstRW<[SBWriteResGroup52], (instregex "LODSQ")>;
-def: InstRW<[SBWriteResGroup52], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "OR8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "SUB8rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup52], (instregex "XOR8rm")>;
+def: InstRW<[SBWriteResGroup52], (instrs LODSL, LODSQ)>;
def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup53], (instregex "ST_F32m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_F64m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_FP32m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_FP64m")>;
-def: InstRW<[SBWriteResGroup53], (instregex "ST_FP80m")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ST_F(32|64)m",
+ "ST_FP(32|64|80)m")>;
def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPDYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPSYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQAYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQUYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPDYrm")>;
-def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPSYrm")>;
-
-def SBWriteResGroup55 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup55], (instregex "CVTPS2PDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "CVTSS2SDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDYrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VTESTPDrm")>;
-def: InstRW<[SBWriteResGroup55], (instregex "VTESTPSrm")>;
-
-def SBWriteResGroup56 : SchedWriteRes<[SBPort5,SBPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup56], (instregex "ANDNPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ANDNPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ANDPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ANDPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "INSERTPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "MOVLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "ORPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "SHUFPDrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "SHUFPSrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDNPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDNPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VANDPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VBROADCASTF128")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VINSERTPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VORPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPDrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPSrmi")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VXORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "VXORPSrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "XORPDrm")>;
-def: InstRW<[SBWriteResGroup56], (instregex "XORPSrm")>;
-
-def SBWriteResGroup57 : SchedWriteRes<[SBPort5,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup57], (instregex "AESDECLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "AESDECrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "AESENCLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "AESENCrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESDECLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESDECrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESENCLASTrr")>;
-def: InstRW<[SBWriteResGroup57], (instregex "VAESENCrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm")>;
def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup58], (instregex "BLENDPDrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "BLENDPSrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPDrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPSrmi")>;
-def: InstRW<[SBWriteResGroup58], (instregex "VINSERTF128rm")>;
+def: InstRW<[SBWriteResGroup58], (instrs VINSERTF128rm)>;
def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
let Latency = 7;
@@ -1723,1136 +911,282 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PABSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PABSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PABSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKSSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKSSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKUSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PACKUSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PADDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PALIGNRrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PAVGBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PAVGWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PBLENDWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PINSRWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMAXUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMINUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFDmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFHWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSHUFLWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSIGNBrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSIGNDrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSIGNWrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PSUBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPABSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPABSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPABSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSWBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPADDWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPALIGNRrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPAVGBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPAVGWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPBLENDWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPINSRWrmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINSDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINUBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINUDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMINUWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFDmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFHWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFLWmi")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNBrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNDrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNWrm128")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSBrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPSUBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLWDrm")>;
-
-def SBWriteResGroup60 : SchedWriteRes<[SBPort23,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup60], (instregex "PANDNrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "PANDrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "PORrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "PXORrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPANDNrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPANDrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPORrm")>;
-def: InstRW<[SBWriteResGroup60], (instregex "VPXORrm")>;
-
-def SBWriteResGroup61 : SchedWriteRes<[SBPort0,SBPort05]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup61], (instregex "VRCPPSYr")>;
-def: InstRW<[SBWriteResGroup61], (instregex "VRSQRTPSYr")>;
def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SBWriteResGroup62], (instregex "VERRm")>;
-def: InstRW<[SBWriteResGroup62], (instregex "VERWm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "VER(R|W)m")>;
def SBWriteResGroup63 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SBWriteResGroup63], (instregex "LODSB")>;
-def: InstRW<[SBWriteResGroup63], (instregex "LODSW")>;
+def: InstRW<[SBWriteResGroup63], (instrs LODSB, LODSW)>;
def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup64], (instregex "FARJMP64")>;
-
-def SBWriteResGroup65 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup65], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "ADC8rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup65], (instregex "SBB8rm")>;
+def: InstRW<[SBWriteResGroup64], (instrs FARJMP64)>;
def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SBWriteResGroup66], (instregex "FNSTSWm")>;
+def: InstRW<[SBWriteResGroup66], (instrs FNSTSWm)>;
def SBWriteResGroup67 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r")>;
-def: InstRW<[SBWriteResGroup67], (instregex "STR(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r",
+ "STR(16|32|64)r")>;
def SBWriteResGroup68 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
+def: InstRW<[SBWriteResGroup68], (instrs FNSTCW16m)>;
def: InstRW<[SBWriteResGroup68], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup68], (instregex "FNSTCW16m")>;
def SBWriteResGroup69 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SBWriteResGroup69], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[SBWriteResGroup69], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[SBWriteResGroup69], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SAR8mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL8m1")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHL8mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup69], (instregex "SHR8mi")>;
-
-def SBWriteResGroup70 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
- let Latency = 7;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "ADD8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "ADD8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "AND8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "DEC8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "INC(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "INC8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NEG8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "NOT8m")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "OR8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "SUB8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "TEST8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "TEST8mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR8mi")>;
-def: InstRW<[SBWriteResGroup70], (instregex "XOR8mr")>;
-
-def SBWriteResGroup71 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[SBWriteResGroup71], (instregex "VTESTPDYrm")>;
-def: InstRW<[SBWriteResGroup71], (instregex "VTESTPSYrm")>;
-
-def SBWriteResGroup72 : SchedWriteRes<[SBPort1,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup72], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup72], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m64")>;
-def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m8")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOM32m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOM64m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOMP32m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "FCOMP64m")>;
-def: InstRW<[SBWriteResGroup72], (instregex "MUL8m")>;
-
-def SBWriteResGroup73 : SchedWriteRes<[SBPort5,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup73], (instregex "VANDNPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VANDNPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VANDPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VANDPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VORPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VORPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERM2F128rm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VXORPDYrm")>;
-def: InstRW<[SBWriteResGroup73], (instregex "VXORPSYrm")>;
-
-def SBWriteResGroup74 : SchedWriteRes<[SBPort23,SBPort05]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPSYrmi")>;
-
-def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort05]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPDrm0")>;
-def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPSrm0")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPDrm")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPSrm")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPSrm")>;
-
-def SBWriteResGroup76 : SchedWriteRes<[SBPort23,SBPort15]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SBWriteResGroup76], (instregex "PBLENDVBrr0")>;
-def: InstRW<[SBWriteResGroup76], (instregex "VPBLENDVBrm")>;
+def: InstRW<[SBWriteResGroup69], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 8;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup77], (instregex "COMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "COMISSrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "UCOMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "UCOMISSrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VCOMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VCOMISSrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISDrm")>;
-def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISSrm")>;
-
-def SBWriteResGroup78 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup78], (instregex "PTESTrm")>;
-def: InstRW<[SBWriteResGroup78], (instregex "VPTESTrm")>;
-
-def SBWriteResGroup79 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup79], (instregex "PSLLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSLLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSLLWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRADrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRAWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "PSRLWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSLLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSLLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSLLWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRADrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRAWrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRLDrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRLQrm")>;
-def: InstRW<[SBWriteResGroup79], (instregex "VPSRLWrm")>;
-
-def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBSWrm64")>;
-def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBWrm64")>;
+def: InstRW<[SBWriteResGroup77], (instregex "(V?)(U?)COMI(SD|SS)rm")>;
def SBWriteResGroup81 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 8;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG8rm")>;
-
-def SBWriteResGroup82 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup82], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup82], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(8|16|32|64)rm")>;
def SBWriteResGroup83 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSB")>;
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSL")>;
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSQ")>;
-def: InstRW<[SBWriteResGroup83], (instregex "CMPSW")>;
+def: InstRW<[SBWriteResGroup83], (instrs CMPSB,
+ CMPSL,
+ CMPSQ,
+ CMPSW)>;
def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,2];
}
-def: InstRW<[SBWriteResGroup84], (instregex "FLDCW16m")>;
+def: InstRW<[SBWriteResGroup84], (instrs FLDCW16m)>;
def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,2];
}
-def: InstRW<[SBWriteResGroup85], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup85], (instregex "ROL8mi")>;
-def: InstRW<[SBWriteResGroup85], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup85], (instregex "ROR8mi")>;
+def: InstRW<[SBWriteResGroup85], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def SBWriteResGroup86 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,2];
}
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSB")>;
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSL")>;
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSQ")>;
-def: InstRW<[SBWriteResGroup86], (instregex "MOVSW")>;
-def: InstRW<[SBWriteResGroup86], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup86], (instregex "XADD8rm")>;
+def: InstRW<[SBWriteResGroup86], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>;
+def: InstRW<[SBWriteResGroup86], (instregex "XADD(8|16|32|64)rm")>;
def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SBWriteResGroup87], (instregex "FARCALL64")>;
+def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>;
def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[SBWriteResGroup88], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
-def SBWriteResGroup89 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup89], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMADDUBSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMADDWDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULHRSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULHUWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULHWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULLDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULLWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PMULUDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "PSADBWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMADDWDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULHRSWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULHUWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULHWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULLDrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULLWrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPMULUDQrm")>;
-def: InstRW<[SBWriteResGroup89], (instregex "VPSADBWrm")>;
-
-def SBWriteResGroup90 : SchedWriteRes<[SBPort1,SBPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup90], (instregex "ADDPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPPDrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPPSrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CMPSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTSI642SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTSI2SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "SUBSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPPDrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPPSrmi")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCMPSSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI642SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI2SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSDm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSSm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBPDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBPSrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBSDrm")>;
-def: InstRW<[SBWriteResGroup90], (instregex "VSUBSSrm")>;
-
-def SBWriteResGroup91 : SchedWriteRes<[SBPort23,SBPort05]> {
+def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 9;
let NumMicroOps = 3;
- let ResourceCycles = [1,2];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPSYrm")>;
-def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVT(T?)SD2SI(64)?rm",
+ "CVT(T?)SS2SI(64)?rm")>;
-def SBWriteResGroup92 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+def SBWriteResGroup93_1 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup92], (instregex "DPPDrri")>;
-def: InstRW<[SBWriteResGroup92], (instregex "VDPPDrri")>;
+def: InstRW<[SBWriteResGroup93_1], (instrs IMUL64m, MUL64m)>;
-def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+def SBWriteResGroup93_2 : SchedWriteRes<[SBPort1,SBPort23,SBPort05,SBPort015]> {
let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[SBWriteResGroup93], (instregex "MUL(16|32|64)m")>;
-
-def SBWriteResGroup94 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
+def: InstRW<[SBWriteResGroup93_2], (instrs IMUL32m, MUL32m)>;
+
+def SBWriteResGroup93_3 : SchedWriteRes<[SBPort1,SBPort05,SBPort015,SBPort23]> {
let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[SBWriteResGroup93_3], (instrs IMUL16m, MUL16m)>;
+
+def SBWriteResGroup93_4 : SchedWriteRes<[SBPort1,SBPort015,SBPort23]> {
+ let Latency = 8;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup94], (instregex "VPTESTYrm")>;
+def: InstRW<[SBWriteResGroup93_4], (instrs IMUL16rmi, IMUL16rmi8)>;
def SBWriteResGroup95 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup95], (instregex "LD_F32m")>;
-def: InstRW<[SBWriteResGroup95], (instregex "LD_F64m")>;
-def: InstRW<[SBWriteResGroup95], (instregex "LD_F80m")>;
-
-def SBWriteResGroup96 : SchedWriteRes<[SBPort23,SBPort15]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SBWriteResGroup96], (instregex "PHADDDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHADDSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHADDWrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHSUBDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHSUBSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "PHSUBWrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHADDDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHADDSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHADDWrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBDrm")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBSWrm128")>;
-def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBWrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "LD_F(32|64|80)m")>;
def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SBWriteResGroup97], (instregex "IST_F16m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_F32m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_FP16m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_FP32m")>;
-def: InstRW<[SBWriteResGroup97], (instregex "IST_FP64m")>;
+def: InstRW<[SBWriteResGroup97], (instregex "IST_F(16|32)m",
+ "IST_FP(16|32|64)m")>;
def SBWriteResGroup97_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,2,3];
}
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROL8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "ROR8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SAR8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHL8mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[SBWriteResGroup97_2], (instregex "SHR8mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
def SBWriteResGroup98 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,2,3];
}
-def: InstRW<[SBWriteResGroup98], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup98], (instregex "ADC8mi")>;
-def: InstRW<[SBWriteResGroup98], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[SBWriteResGroup98], (instregex "SBB8mi")>;
+def: SchedAlias<WriteADCRMW, SBWriteResGroup98>;
def SBWriteResGroup99 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,2,2,1];
}
-def: InstRW<[SBWriteResGroup99], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup99], (instregex "ADC8mr")>;
-def: InstRW<[SBWriteResGroup99], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup99], (instregex "SBB8mr")>;
+def: InstRW<[SBWriteResGroup99, ReadAfterLd], (instrs ADC8mr, ADC16mr, ADC32mr, ADC64mr,
+ SBB8mr, SBB16mr, SBB32mr, SBB64mr)>;
def SBWriteResGroup100 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort05,SBPort015]> {
let Latency = 9;
let NumMicroOps = 6;
let ResourceCycles = [1,1,2,1,1];
}
-def: InstRW<[SBWriteResGroup100], (instregex "BT(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup100], (instregex "BTC(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup100], (instregex "BTR(16|32|64)mr")>;
-def: InstRW<[SBWriteResGroup100], (instregex "BTS(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup100], (instregex "BT(16|32|64)mr",
+ "BTC(16|32|64)mr",
+ "BTR(16|32|64)mr",
+ "BTS(16|32|64)mr")>;
def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup101], (instregex "ADD_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ADD_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ILD_F16m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ILD_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "ILD_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUB_F32m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "SUB_F64m")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDPDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDPSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCMPPDYrmi")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCMPPSYrmi")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPDm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPSm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VSUBPDYrm")>;
-def: InstRW<[SBWriteResGroup101], (instregex "VSUBPSYrm")>;
-
-def SBWriteResGroup102 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SIrm")>;
-
-def SBWriteResGroup103 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup103], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2DQrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2PSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTSD2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTSI642SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTSI2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTTPD2PIirm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDYrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2DQrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2PSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTSD2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI642SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI2SSrm")>;
-def: InstRW<[SBWriteResGroup103], (instregex "VCVTTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m")>;
def SBWriteResGroup103_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
let Latency = 10;
let NumMicroOps = 7;
let ResourceCycles = [1,2,3,1];
}
-def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[SBWriteResGroup103_2], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup104], (instregex "MULPDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "MULPSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "MULSDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "MULSSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "PCMPGTQrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RCPPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RCPSSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RSQRTPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "RSQRTSSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULPDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULPSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULSDrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VMULSSrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VPCMPGTQrm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRCPPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRCPSSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTPSm")>;
-def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTSSm")>;
-
-def SBWriteResGroup105 : SchedWriteRes<[SBPort0]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRIrr")>;
-def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRM128rr")>;
+def: InstRW<[SBWriteResGroup104], (instregex "(V?)PCMPGTQrm")>;
def SBWriteResGroup106 : SchedWriteRes<[SBPort1,SBPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SBWriteResGroup106], (instregex "FICOM16m")>;
-def: InstRW<[SBWriteResGroup106], (instregex "FICOM32m")>;
-def: InstRW<[SBWriteResGroup106], (instregex "FICOMP16m")>;
-def: InstRW<[SBWriteResGroup106], (instregex "FICOMP32m")>;
-
-def SBWriteResGroup107 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2DQYrm")>;
-def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2PSYrm")>;
-def: InstRW<[SBWriteResGroup107], (instregex "VCVTTPD2DQYrm")>;
-
-def SBWriteResGroup108 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SBWriteResGroup108], (instregex "MPSADBWrmi")>;
-def: InstRW<[SBWriteResGroup108], (instregex "VMPSADBWrmi")>;
-
-def SBWriteResGroup109 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup109], (instregex "HADDPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "HADDPSrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "HSUBPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "HSUBPSrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHADDPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHADDPSrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPDrm")>;
-def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPSrm")>;
-
-def SBWriteResGroup110 : SchedWriteRes<[SBPort5]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SBWriteResGroup110], (instregex "AESIMCrr")>;
-def: InstRW<[SBWriteResGroup110], (instregex "VAESIMCrr")>;
+def: InstRW<[SBWriteResGroup106], (instregex "FICOM(P?)(16|32)m")>;
def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> {
let Latency = 12;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup111], (instregex "MUL_F32m")>;
-def: InstRW<[SBWriteResGroup111], (instregex "MUL_F64m")>;
-def: InstRW<[SBWriteResGroup111], (instregex "VMULPDYrm")>;
-def: InstRW<[SBWriteResGroup111], (instregex "VMULPSYrm")>;
-
-def SBWriteResGroup112 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup112], (instregex "DPPSrri")>;
-def: InstRW<[SBWriteResGroup112], (instregex "VDPPSYrri")>;
-def: InstRW<[SBWriteResGroup112], (instregex "VDPPSrri")>;
-
-def SBWriteResGroup113 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SBWriteResGroup113], (instregex "VHADDPDYrm")>;
-def: InstRW<[SBWriteResGroup113], (instregex "VHADDPSYrm")>;
-def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPDYrm")>;
-def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPSYrm")>;
+def: InstRW<[SBWriteResGroup111], (instregex "MUL_F(32|64)m")>;
def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> {
let Latency = 13;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI16m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI32m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI16m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI32m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI16m")>;
-def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI32m")>;
-
-def SBWriteResGroup115 : SchedWriteRes<[SBPort5,SBPort23,SBPort015]> {
- let Latency = 13;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SBWriteResGroup115], (instregex "AESDECLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "AESDECrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "AESENCLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "AESENCrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESDECLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESDECrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESENCLASTrm")>;
-def: InstRW<[SBWriteResGroup115], (instregex "VAESENCrm")>;
-
-def SBWriteResGroup116 : SchedWriteRes<[SBPort0]> {
- let Latency = 14;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup116], (instregex "DIVPSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "DIVSSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "SQRTPSr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "SQRTSSr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "VDIVPSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "VDIVSSrr")>;
-def: InstRW<[SBWriteResGroup116], (instregex "VSQRTPSr")>;
-
-def SBWriteResGroup117 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 14;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup117], (instregex "VSQRTSSm")>;
-
-def SBWriteResGroup118 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SBWriteResGroup118], (instregex "VRCPPSYm")>;
-def: InstRW<[SBWriteResGroup118], (instregex "VRSQRTPSYm")>;
+def: InstRW<[SBWriteResGroup114], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI16m")>;
-def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI32m")>;
-
-def SBWriteResGroup120 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SBWriteResGroup120], (instregex "DPPDrmi")>;
-def: InstRW<[SBWriteResGroup120], (instregex "VDPPDrmi")>;
-
-def SBWriteResGroup121 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 17;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRIrm")>;
-def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRM128rm")>;
-
-def SBWriteResGroup122 : SchedWriteRes<[SBPort5,SBPort23]> {
- let Latency = 18;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup122], (instregex "AESIMCrm")>;
-def: InstRW<[SBWriteResGroup122], (instregex "VAESIMCrm")>;
-
-def SBWriteResGroup123 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 20;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup123], (instregex "DIVPSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "DIVSSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "SQRTPSm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "SQRTSSm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "VDIVPSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "VDIVSSrm")>;
-def: InstRW<[SBWriteResGroup123], (instregex "VSQRTPSm")>;
-
-def SBWriteResGroup124 : SchedWriteRes<[SBPort0]> {
- let Latency = 21;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup124], (instregex "VSQRTSDr")>;
-
-def SBWriteResGroup125 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 21;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup125], (instregex "VSQRTSDm")>;
-
-def SBWriteResGroup126 : SchedWriteRes<[SBPort0]> {
- let Latency = 22;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup126], (instregex "DIVPDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "DIVSDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "SQRTPDr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "SQRTSDr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "VDIVPDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "VDIVSDrr")>;
-def: InstRW<[SBWriteResGroup126], (instregex "VSQRTPDr")>;
-
-def SBWriteResGroup127 : SchedWriteRes<[SBPort0]> {
- let Latency = 24;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FPrST0")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FST0r")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FrST0")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIV_FPrST0")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIV_FST0r")>;
-def: InstRW<[SBWriteResGroup127], (instregex "DIV_FrST0")>;
-
-def SBWriteResGroup128 : SchedWriteRes<[SBPort0,SBPort23]> {
- let Latency = 28;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup128], (instregex "DIVPDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "DIVSDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "SQRTPDm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "SQRTSDm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "VDIVPDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "VDIVSDrm")>;
-def: InstRW<[SBWriteResGroup128], (instregex "VSQRTPDm")>;
-
-def SBWriteResGroup129 : SchedWriteRes<[SBPort0,SBPort05]> {
- let Latency = 29;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup129], (instregex "VDIVPSYrr")>;
-def: InstRW<[SBWriteResGroup129], (instregex "VSQRTPSYr")>;
+def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI(16|32)m")>;
def SBWriteResGroup130 : SchedWriteRes<[SBPort0,SBPort23]> {
let Latency = 31;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F32m")>;
-def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F64m")>;
-def: InstRW<[SBWriteResGroup130], (instregex "DIV_F32m")>;
-def: InstRW<[SBWriteResGroup130], (instregex "DIV_F64m")>;
+def: InstRW<[SBWriteResGroup130], (instregex "DIV(R?)_F(32|64)m")>;
def SBWriteResGroup131 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 34;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI16m")>;
-def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI32m")>;
-def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI16m")>;
-def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI32m")>;
+def: InstRW<[SBWriteResGroup131], (instregex "DIV(R?)_FI(16|32)m")>;
-def SBWriteResGroup132 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
- let Latency = 36;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SBWriteResGroup132], (instregex "VDIVPSYrm")>;
-def: InstRW<[SBWriteResGroup132], (instregex "VSQRTPSYm")>;
-
-def SBWriteResGroup133 : SchedWriteRes<[SBPort0,SBPort05]> {
- let Latency = 45;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SBWriteResGroup133], (instregex "VDIVPDYrr")>;
-def: InstRW<[SBWriteResGroup133], (instregex "VSQRTPDYr")>;
-
-def SBWriteResGroup134 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
- let Latency = 52;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SBWriteResGroup134], (instregex "VDIVPDYrm")>;
-def: InstRW<[SBWriteResGroup134], (instregex "VSQRTPDYm")>;
-
-def SBWriteResGroup135 : SchedWriteRes<[SBPort0]> {
- let Latency = 114;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SBWriteResGroup135], (instregex "VSQRTSSr")>;
+def: InstRW<[WriteZero], (instrs CLC)>;
} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 9a417b2d3e82..1417799d76be 100644
--- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -19,7 +19,7 @@ def SkylakeClientModel : SchedMachineModel {
let MicroOpBufferSize = 224; // Based on the reorder buffer.
let LoadLatency = 5;
let MispredictPenalty = 14;
-
+
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
@@ -61,6 +61,10 @@ def SKLPort015 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5]>;
def SKLPort056 : ProcResGroup<[SKLPort0, SKLPort5, SKLPort6]>;
def SKLPort0156: ProcResGroup<[SKLPort0, SKLPort1, SKLPort5, SKLPort6]>;
+def SKLDivider : ProcResource<1>; // Integer division issued on port 0.
+// FP division and sqrt on port 0.
+def SKLFPDivider : ProcResource<1>;
+
// 60 Entry Unified Scheduler
def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4,
SKLPort5, SKLPort6, SKLPort7]> {
@@ -77,45 +81,84 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [SKLPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SKLPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [SKLPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SKLPort237,SKLPort4]>;
// Arithmetic.
-defm : SKLWriteResPair<WriteALU, SKLPort0156, 1>; // Simple integer ALU op.
-defm : SKLWriteResPair<WriteIMul, SKLPort1, 3>; // Integer multiplication.
+defm : SKLWriteResPair<WriteALU, [SKLPort0156], 1>; // Simple integer ALU op.
+defm : SKLWriteResPair<WriteADC, [SKLPort06], 1>; // Integer ALU + flags op.
+defm : SKLWriteResPair<WriteIMul, [SKLPort1], 3>; // Integer multiplication.
+defm : SKLWriteResPair<WriteIMul64, [SKLPort1], 3>; // Integer 64-bit multiplication.
+
+defm : SKLWriteResPair<WriteBSWAP32,[SKLPort15], 1>; //
+defm : SKLWriteResPair<WriteBSWAP64,[SKLPort06, SKLPort15], 2, [1,1], 2>; //
+
+defm : SKLWriteResPair<WriteDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteDiv32, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteDiv64, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv32, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+defm : SKLWriteResPair<WriteIDiv64, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>;
+
+defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>;
+
def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
-def SKLDivider : ProcResource<1>; // Integer division issued on port 0.
-def : WriteRes<WriteIDiv, [SKLPort0, SKLDivider]> { // Integer division.
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [SKLPort23, SKLPort0, SKLDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
+def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
+
+defm : SKLWriteResPair<WriteCMOV, [SKLPort06], 1, [1], 1>; // Conditional move.
+defm : SKLWriteResPair<WriteCMOV2, [SKLPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [SKLPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SKLPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
+def : WriteRes<WriteLAHFSAHF, [SKLPort06]>;
-def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
+// Bit counts.
+defm : SKLWriteResPair<WriteBSF, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteBSR, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteLZCNT, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteTZCNT, [SKLPort1], 3>;
+defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>;
// Integer shifts and rotates.
-defm : SKLWriteResPair<WriteShift, SKLPort06, 1>;
+defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>;
+
+// Double shift instructions.
+defm : SKLWriteResPair<WriteShiftDouble, [SKLPort06], 1>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : SKLWriteResPair<WriteBEXTR, [SKLPort06,SKLPort15], 2, [1,1], 2>;
+defm : SKLWriteResPair<WriteBZHI, [SKLPort15], 1>;
// Loads, stores, and moves, not folded with other operations.
-def : WriteRes<WriteLoad, [SKLPort23]> { let Latency = 5; }
-def : WriteRes<WriteStore, [SKLPort237, SKLPort4]>;
-def : WriteRes<WriteMove, [SKLPort0156]>;
+defm : X86WriteRes<WriteLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [SKLPort237, SKLPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [SKLPort237, SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [SKLPort0156], 1, [1], 1>;
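// The X86WriteRes arguments appear to follow the same shape as SKLWriteResPair above
// (write, ports, latency, resource cycles, uops); e.g. WriteStore is a single uop
// with latency 1 that takes one cycle on each of SKLPort237 and SKLPort4.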
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
@@ -123,153 +166,373 @@ def : WriteRes<WriteZero, []>;
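// The empty resource list on WriteZero above is what models the bypass: zero
// idioms take no execution-port cycles at all.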
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : SKLWriteResPair<WriteJump, SKLPort06, 1>;
+defm : SKLWriteResPair<WriteJump, [SKLPort06], 1>;
// Floating point. This covers both scalar and vector operations.
-defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare.
-defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication.
-defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : SKLWriteResPair<WriteFSqrt, SKLPort0, 15>; // Floating point square root.
-defm : SKLWriteResPair<WriteFRcp, SKLPort0, 5>; // Floating point reciprocal estimate.
-defm : SKLWriteResPair<WriteFRsqrt, SKLPort0, 5>; // Floating point reciprocal square root estimate.
-defm : SKLWriteResPair<WriteFMA, SKLPort01, 4>; // Fused Multiply Add.
-defm : SKLWriteResPair<WriteFShuffle, SKLPort5, 1>; // Floating point vector shuffles.
-defm : SKLWriteResPair<WriteFBlend, SKLPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [SKLPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [SKLPort5, SKLPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [SKLPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SKLPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [SKLPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFStore, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SKLPort05,SKLPort0156], 10, [9,1], 10>;
+
+defm : SKLWriteResPair<WriteFAdd, [SKLPort01], 4, [1], 1, 5>; // Floating point add/sub.
+defm : SKLWriteResPair<WriteFAddX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFAddY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : SKLWriteResPair<WriteFAdd64, [SKLPort01], 4, [1], 1, 5>; // Floating point double add/sub.
+defm : SKLWriteResPair<WriteFAdd64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFAdd64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : SKLWriteResPair<WriteFCmp, [SKLPort01], 4, [1], 1, 5>; // Floating point compare.
+defm : SKLWriteResPair<WriteFCmpX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFCmpY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : SKLWriteResPair<WriteFCmp64, [SKLPort01], 4, [1], 1, 5>; // Floating point double compare.
+defm : SKLWriteResPair<WriteFCmp64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFCmp64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags.
+
+defm : SKLWriteResPair<WriteFMul, [SKLPort01], 4, [1], 1, 5>; // Floating point multiplication.
+defm : SKLWriteResPair<WriteFMulX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMulY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : SKLWriteResPair<WriteFMul64, [SKLPort01], 4, [1], 1, 5>; // Floating point double multiplication.
+defm : SKLWriteResPair<WriteFMul64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMul64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+defm : SKLWriteResPair<WriteFDiv, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 5>; // Floating point division.
+//defm : SKLWriteResPair<WriteFDivX, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 6>;
+defm : SKLWriteResPair<WriteFDivY, [SKLPort0,SKLFPDivider], 11, [1,5], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+//defm : SKLWriteResPair<WriteFDiv64, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 5>; // Floating point double division.
+//defm : SKLWriteResPair<WriteFDiv64X, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 6>;
+//defm : SKLWriteResPair<WriteFDiv64Y, [SKLPort0,SKLFPDivider], 14, [1,5], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : SKLWriteResPair<WriteFSqrt, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 5>; // Floating point square root.
+defm : SKLWriteResPair<WriteFSqrtX, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 6>;
+defm : SKLWriteResPair<WriteFSqrtY, [SKLPort0,SKLFPDivider], 12, [1,6], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : SKLWriteResPair<WriteFSqrt64, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 5>; // Floating point double square root.
+defm : SKLWriteResPair<WriteFSqrt64X, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 6>;
+defm : SKLWriteResPair<WriteFSqrt64Y, [SKLPort0,SKLFPDivider], 18, [1,12],1, 7>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : SKLWriteResPair<WriteFSqrt80, [SKLPort0,SKLFPDivider], 21, [1,7]>; // Floating point long double square root.
+
+defm : SKLWriteResPair<WriteFRcp, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : SKLWriteResPair<WriteFRcpX, [SKLPort0], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFRcpY, [SKLPort0], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : SKLWriteResPair<WriteFRsqrt, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : SKLWriteResPair<WriteFRsqrtX,[SKLPort0], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFRsqrtY,[SKLPort0], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : SKLWriteResPair<WriteFMA, [SKLPort01], 4, [1], 1, 5>; // Fused Multiply Add.
+defm : SKLWriteResPair<WriteFMAX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMAY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : SKLWriteResPair<WriteDPPD, [SKLPort5,SKLPort01], 9, [1,2], 3, 6>; // Floating point double dot product.
+defm : SKLWriteResPair<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4, 6>;
+defm : SKLWriteResPair<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4, 7>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : SKLWriteResPair<WriteFSign, [SKLPort0], 1>; // Floating point fabs/fchs.
+defm : SKLWriteResPair<WriteFRnd, [SKLPort01], 8, [2], 2, 6>; // Floating point rounding.
+defm : SKLWriteResPair<WriteFRndY, [SKLPort01], 8, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : SKLWriteResPair<WriteFLogic, [SKLPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
+defm : SKLWriteResPair<WriteFLogicY, [SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : SKLWriteResPair<WriteFTest, [SKLPort0], 2, [1], 1, 6>; // Floating point TEST instructions.
+defm : SKLWriteResPair<WriteFTestY, [SKLPort0], 2, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : SKLWriteResPair<WriteFShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : SKLWriteResPair<WriteFVarShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : SKLWriteResPair<WriteFVarShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : SKLWriteResPair<WriteFBlend, [SKLPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : SKLWriteResPair<WriteFBlendY, [SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : SKLWriteResPair<WriteFVarBlend, [SKLPort015], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : SKLWriteResPair<WriteFVarBlendY,[SKLPort015], 2, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
-defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals.
-defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts.
-defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply.
-defm : SKLWriteResPair<WriteShuffle, SKLPort5, 1>; // Vector shuffles.
-defm : SKLWriteResPair<WriteBlend, SKLPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [SKLPort5]> { // Vector variable blends.
+defm : X86WriteRes<WriteVecLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecStore, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SKLPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SKLPort5], 1, [1], 1>;
+
+defm : SKLWriteResPair<WriteVecALU, [SKLPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : SKLWriteResPair<WriteVecALUX, [SKLPort01], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecALUY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : SKLWriteResPair<WriteVecLogic, [SKLPort05], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : SKLWriteResPair<WriteVecLogicX,[SKLPort015], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecLogicY,[SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
+defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 4, [1], 1, 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD.
+defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKLWriteResPair<WriteShuffleX, [SKLPort5], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : SKLWriteResPair<WriteVarShuffle,  [SKLPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : SKLWriteResPair<WriteVarShuffleX, [SKLPort5], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVarShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : SKLWriteResPair<WriteBlend, [SKLPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKLWriteResPair<WriteBlendY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : SKLWriteResPair<WriteVarBlend, [SKLPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKLWriteResPair<WriteVarBlendY, [SKLPort015], 2, [2], 2, 6>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : SKLWriteResPair<WriteMPSAD, [SKLPort5], 4, [2], 2, 6>; // Vector MPSAD.
+defm : SKLWriteResPair<WriteMPSADY, [SKLPort5], 4, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW.
+defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>;
+defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1, [1], 1, 5>;
+defm : X86WriteRes<WriteVecShiftX, [SKLPort5,SKLPort01], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftY, [SKLPort5,SKLPort01], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftXLd, [SKLPort01,SKLPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [SKLPort01,SKLPort23], 8, [1,1], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : SKLWriteResPair<WriteVecShiftImm, [SKLPort0], 1, [1], 1, 5>; // Vector integer immediate shifts.
+defm : SKLWriteResPair<WriteVecShiftImmX, [SKLPort01], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecShiftImmY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : SKLWriteResPair<WriteVarVecShift, [SKLPort01], 1, [1], 1, 6>; // Variable vector shifts.
+defm : SKLWriteResPair<WriteVarVecShiftY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SKLPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [SKLPort5, SKLPort23]> {
+def : WriteRes<WriteVecInsertLd, [SKLPort5,SKLPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
-def : WriteRes<WriteMPSAD, [SKLPort0, SKLPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
+def : WriteRes<WriteVecExtract, [SKLPort0,SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [SKLPort23, SKLPort0, SKLPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [SKLPort4,SKLPort5,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm : SKLWriteResPair<WriteVecLogic, SKLPort015, 1>; // Vector and/or/xor.
-
// Conversion between integer and float.
-defm : SKLWriteResPair<WriteCvtF2I, SKLPort1, 3>; // Float -> Integer.
-defm : SKLWriteResPair<WriteCvtI2F, SKLPort1, 4>; // Integer -> Float.
-defm : SKLWriteResPair<WriteCvtF2F, SKLPort1, 3>; // Float -> Float size conversion.
+defm : SKLWriteResPair<WriteCvtSS2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2IY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : SKLWriteResPair<WriteCvtSD2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2IY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : SKLWriteResPair<WriteCvtI2SS, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PS, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PSY, [SKLPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : SKLWriteResPair<WriteCvtI2SD, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PD, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PDY, [SKLPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : SKLWriteResPair<WriteCvtSS2SD, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2PD, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2PDY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : SKLWriteResPair<WriteCvtSD2SS, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2PS, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2PSY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [SKLPort5,SKLPort015], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [SKLPort5,SKLPort01], 7, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [SKLPort23,SKLPort01], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [SKLPort23,SKLPort01], 10, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [SKLPort5,SKLPort015], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SKLPort5,SKLPort01], 7, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 6, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 8, [1,1,1,1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
 // String instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
-// String instructions.
def : WriteRes<WritePCmpIStrM, [SKLPort0]> {
let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [SKLPort0, SKLPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort16, SKLPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort5, SKLPort015, SKLPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [SKLPort05, SKLPort16, SKLPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
-}
- // Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrMLd, [SKLPort0, SKLPort5,SKLPort23, SKLPort015, SKLPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [SKLPort0]> {
- let Latency = 11;
+ let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [SKLPort0, SKLPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [SKLPort05, SKLPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [SKLPort0, SKLPort5, SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort16, SKLPort5, SKLPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort5, SKLPort23, SKLPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SKLPort0]> { let Latency = 2; }
+
// AES instructions.
-def : WriteRes<WriteAESDecEnc, [SKLPort5]> { // Decryption, encryption.
- let Latency = 7;
+def : WriteRes<WriteAESDecEnc, [SKLPort0]> { // Decryption, encryption.
+ let Latency = 4;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def : WriteRes<WriteAESDecEncLd, [SKLPort5, SKLPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+def : WriteRes<WriteAESDecEncLd, [SKLPort0, SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : WriteRes<WriteAESIMC, [SKLPort5]> { // InvMixColumn.
- let Latency = 14;
+
+def : WriteRes<WriteAESIMC, [SKLPort0]> { // InvMixColumn.
+ let Latency = 8;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteAESIMCLd, [SKLPort5, SKLPort23]> {
+def : WriteRes<WriteAESIMCLd, [SKLPort0, SKLPort23]> {
let Latency = 14;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5]> { // Key Generation.
- let Latency = 10;
- let ResourceCycles = [2, 8];
+
+def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5, SKLPort015]> { // Key Generation.
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
}
-def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23, SKLPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [SKLPort0, SKLPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
+def : WriteRes<WriteCLMul, [SKLPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : WriteRes<WriteCLMulLd, [SKLPort0, SKLPort5, SKLPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
+def : WriteRes<WriteCLMulLd, [SKLPort5, SKLPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : SKLWriteResPair<WriteFShuffle256, SKLPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : SKLWriteResPair<WriteShuffle256, SKLPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [SKLPort0, SKLPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [SKLPort0, SKLPort5, SKLPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteFVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
 // Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
@@ -277,33 +540,22 @@ def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def Wri
// Fence instructions.
def : WriteRes<WriteFence, [SKLPort23, SKLPort4]>;
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [SKLPort0,SKLPort23,SKLPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SKLPort4,SKLPort5,SKLPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
 // Nop, not very useful except that it provides a model for nops!
def : WriteRes<WriteNop, []>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SKLPort1]> {
- let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SKLPort1, SKLPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SKLPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SKLPort15, SKLPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SKLWriteResPair<WriteFHAdd, [SKLPort5,SKLPort01], 6, [2,1], 3, 6>;
+defm : SKLWriteResPair<WriteFHAddY, [SKLPort5,SKLPort01], 6, [2,1], 3, 7>;
+defm : SKLWriteResPair<WritePHAdd, [SKLPort5,SKLPort05], 3, [2,1], 3, 5>;
+defm : SKLWriteResPair<WritePHAddX, [SKLPort5,SKLPort015], 3, [2,1], 3, 6>;
+defm : SKLWriteResPair<WritePHAddY, [SKLPort5,SKLPort015], 3, [2,1], 3, 7>;
// Remaining instrs.
@@ -312,210 +564,23 @@ def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWri")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSWirr")>;
-
-def SKLWriteResGroup2 : SchedWriteRes<[SKLPort1]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)irr",
+ "MMX_PADDUS(B|W)irr",
+ "MMX_PAVG(B|W)irr",
+ "MMX_PCMPEQ(B|D|W)irr",
+ "MMX_PCMPGT(B|D|W)irr",
+ "MMX_P(MAX|MIN)SWirr",
+ "MMX_P(MAX|MIN)UBirr",
+ "MMX_PSUBS(B|W)irr",
+ "MMX_PSUBUS(B|W)irr")>;
def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup3], (instregex "COMP_FST0r")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "COM_FST0r")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "INSERTPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOV64toPQIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVDDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVHLPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVSHDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PALIGNRrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PBLENDWrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFDri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFHWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFLWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSLLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PSRLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPDrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPSrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_FPr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_Fr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VINSERTPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSLLDQYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSLLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQYri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSYrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSrri")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> {
let Latency = 1;
@@ -524,557 +589,68 @@ def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> {
}
def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>;
-def SKLWriteResGroup5 : SchedWriteRes<[SKLPort01]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup5], (instregex "PABSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PABSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PABSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PAVGBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PAVGWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PMINUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNBrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNDrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNWrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSLLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSLLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSLLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRADri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRAWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSRLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWrr128")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWYri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWri")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSWrr")>;
-
def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup6], (instregex "FINCSTP")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "FNOP")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDBirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDDirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDQirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDWirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDNirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PORirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PXORirr")>;
+def: InstRW<[SKLWriteResGroup6], (instrs FINCSTP, FNOP)>;
def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADCX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADOX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CDQ")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CLAC")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "CQO")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JAE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JAE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JA_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JA_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JBE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JBE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JB_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JB_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JGE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JGE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JG_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JG_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JLE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JLE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JL_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JL_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JMP_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JMP_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNE_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNE_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNO_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNO_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNP_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNP_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNS_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JNS_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JO_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JO_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JP_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JP_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JS_1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "JS_4")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "RORX(32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR8r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SAR8ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SARX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETAEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETBr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETGEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETGr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETLEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETLr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNEr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNOr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNPr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETNSr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETOr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETPr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SETSr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL8r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHL8ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR8r1")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHR8ri")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SHRX(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "STAC")>;
+def: InstRW<[SKLWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ANDPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPDrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPSrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "ORPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PADDWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PANDNrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PANDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PSUBWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "PXORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDYrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSYrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VORPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPANDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDYrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPORYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPXORYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VPXORrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSYrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "XORPDrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "XORPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr",
+ "VPBLENDD(Y?)rri",
+ "(V?)PSUB(B|D|Q|W)(Y?)rr")>;
def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CBW")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CLC")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMC")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CWDE")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "DEC8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "INC(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "INC8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NEG8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NOOP")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "NOT8r")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SAHF")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SIDT64m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SLDT64m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SMSW16m")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "STC")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "STRm")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SYSCALL")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "TEST8rr")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR8i8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC)>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def SKLWriteResGroup11 : SchedWriteRes<[SKLPort4,SKLPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup11], (instregex "FBSTPm")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mi")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQAmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQUmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTDQmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTI_64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVPDI2DImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQI2QImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQIto64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVSDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVSSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP32m")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP64m")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP80m")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSYmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSmr")>;
-def: InstRW<[SKLWriteResGroup11], (instregex "VMPTRSTm")>;
-
-def SKLWriteResGroup12 : SchedWriteRes<[SKLPort0]> {
- let Latency = 2;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup12], (instregex "COMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "COMISSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "MOVPQIto64rr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "PMOVMSKBrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSYrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISDrr")>;
-def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISSrr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "FBSTPm",
+ "ST_FP(32|64|80)m",
+ "VMPTRSTm")>;
def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> {
let Latency = 2;
@@ -1082,22 +658,13 @@ def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> {
let ResourceCycles = [2];
}
def: InstRW<[SKLWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRBrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRDrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRQrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "PINSRWrri")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRBrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRDrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRQrr")>;
-def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRWrri")>;
def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKLWriteResGroup14], (instregex "FDECSTP")>;
+def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP)>;
def: InstRW<[SKLWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>;
def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> {
@@ -1105,80 +672,20 @@ def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> {
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKLWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL8r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROL8ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR8r1")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "ROR8ri")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "SETAr")>;
-def: InstRW<[SKLWriteResGroup15], (instregex "SETBEr")>;
-
-def SKLWriteResGroup16 : SchedWriteRes<[SKLPort015]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPDrr0")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPSrr0")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "PBLENDVBrr0")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBrr")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri",
+ "SET(A|BE)r")>;
def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKLWriteResGroup17], (instregex "LFENCE")>;
-def: InstRW<[SKLWriteResGroup17], (instregex "WAIT")>;
-def: InstRW<[SKLWriteResGroup17], (instregex "XGETBV")>;
-
-def SKLWriteResGroup18 : SchedWriteRes<[SKLPort0,SKLPort237]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup18], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVDQU")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQmr")>;
-
-def SKLWriteResGroup19 : SchedWriteRes<[SKLPort5,SKLPort01]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup19], (instregex "PSLLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSLLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSLLWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRADrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRAWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "PSRLWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRADrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRAWrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLDrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLQrr")>;
-def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLWrr")>;
+def: InstRW<[SKLWriteResGroup17], (instrs LFENCE,
+ WAIT,
+ XGETBV)>;
def SKLWriteResGroup20 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 2;
@@ -1192,72 +699,26 @@ def SKLWriteResGroup21 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup21], (instregex "SFENCE")>;
-
-def SKLWriteResGroup22 : SchedWriteRes<[SKLPort06,SKLPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup22], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup21], (instrs SFENCE)>;
def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup23], (instregex "ADC8i8")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "CWD")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "JRCXZ")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "SBB8i8")>;
-def: InstRW<[SKLWriteResGroup23], (instregex "SBB8ri")>;
-
-def SKLWriteResGroup24 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup24], (instregex "EXTRACTPSmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRBmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRDmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRQmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRWmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "STMXCSR")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRBmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRDmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRQmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRWmr")>;
-def: InstRW<[SKLWriteResGroup24], (instregex "VSTMXCSR")>;
+def: InstRW<[SKLWriteResGroup23], (instrs CWD)>;
+def: InstRW<[SKLWriteResGroup23], (instrs JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[SKLWriteResGroup23], (instregex "ADC8i8",
+ "ADC8ri",
+ "SBB8i8",
+ "SBB8ri")>;
def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup25], (instregex "FNSTCW16m")>;
-
-def SKLWriteResGroup26 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup26], (instregex "SETAEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETBm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETGEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETGm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETLEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETLm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNEm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNOm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNPm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETNSm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETOm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETPm")>;
-def: InstRW<[SKLWriteResGroup26], (instregex "SETSm")>;
+def: InstRW<[SKLWriteResGroup25], (instrs FNSTCW16m)>;
def SKLWriteResGroup27 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> {
let Latency = 2;
@@ -1271,206 +732,88 @@ def SKLWriteResGroup28 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "PUSH64i8")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSB")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSL")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSQ")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "STOSW")>;
+def: InstRW<[SKLWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;
def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup29], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "IMUL64rr(i8)?")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "IMUL8r")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "MUL8r")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "TZCNT(16|32|64)rr")>;
-
-def SKLWriteResGroup29_16 : SchedWriteRes<[SKLPort1, SKLPort0156]> {
- let Latency = 3;
+def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8")>;
+
+def SKLWriteResGroup29_16i : SchedWriteRes<[SKLPort1, SKLPort0156]> {
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup29_16], (instregex "IMUL16rr(i8)?")>;
-
-def SKLWriteResGroup29_32 : SchedWriteRes<[SKLPort1]> {
- let Latency = 3;
- let NumMicroOps = 1;
-}
-def: InstRW<[SKLWriteResGroup29_32], (instregex "IMUL32rr(i8)?")>;
+def: InstRW<[SKLWriteResGroup29_16i], (instrs IMUL16rri, IMUL16rri8)>;
def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FPrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FST0r")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "PCMPGTQrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "PSADBWrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FPrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FST0r")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FPrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FST0r")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FrST0")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTF128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTI128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2F128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2I128rr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMPDYri")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMPSYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPERMQYri")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWQYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWYrr")>;
-def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWrr")>;
-
-def SKLWriteResGroup31 : SchedWriteRes<[SKLPort0,SKLPort5]> {
- let Latency = 3;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup31], (instregex "EXTRACTPSrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRBrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRDrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRQrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWri")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "PTESTrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRBrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRDrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRQrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWri")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTYrr")>;
-def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
+ "VPBROADCASTBrr",
+ "VPBROADCASTWrr",
+ "(V?)PCMPGTQ(Y?)rr")>;
def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup32], (instregex "FNSTSW16r")>;
+def: InstRW<[SKLWriteResGroup32], (instrs FNSTSW16r)>;
def SKLWriteResGroup33 : SchedWriteRes<[SKLPort06]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKLWriteResGroup33], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "ROL8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "ROR8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SAR8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHL8rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup33], (instregex "SHR8rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def SKLWriteResGroup34 : SchedWriteRes<[SKLPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKLWriteResGroup34], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup34], (instregex "XADD8rr")>;
-def: InstRW<[SKLWriteResGroup34], (instregex "XCHG8rr")>;
+def: InstRW<[SKLWriteResGroup34], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
def SKLWriteResGroup35 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PH(ADD|SUB)SWrr")>;
def SKLWriteResGroup36 : SchedWriteRes<[SKLPort5,SKLPort01]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup36], (instregex "PHADDSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "PHSUBSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr256")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr256")>;
-
-def SKLWriteResGroup37 : SchedWriteRes<[SKLPort5,SKLPort05]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBWrr64")>;
-
-def SKLWriteResGroup38 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup38], (instregex "PHADDDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "PHADDWrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBWrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWYrr")>;
-def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWrr")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "(V?)PHADDSW(Y?)rr",
+ "(V?)PHSUBSW(Y?)rr")>;
def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;
def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 3;
@@ -1484,36 +827,31 @@ def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup41], (instregex "MFENCE")>;
+def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>;
def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL8r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL8ri")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR8r1")>;
-def: InstRW<[SKLWriteResGroup42], (instregex "RCR8ri")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;
def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup43], (instregex "FNSTSWm")>;
+def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>;
def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
let Latency = 3;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKLWriteResGroup44], (instregex "SETAm")>;
-def: InstRW<[SKLWriteResGroup44], (instregex "SETBEm")>;
+def: InstRW<[SKLWriteResGroup44], (instregex "SET(A|BE)m")>;
def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> {
let Latency = 3;
@@ -1527,232 +865,65 @@ def SKLWriteResGroup46 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06,SKLPort015
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup46], (instregex "CALL64pcrel32")>;
+def: InstRW<[SKLWriteResGroup46], (instrs CALL64pcrel32)>;
def SKLWriteResGroup47 : SchedWriteRes<[SKLPort0]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup47], (instregex "AESDECLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "AESDECrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "AESENCLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "AESENCrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FPrST0")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FST0r")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FrST0")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RCPPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RCPSSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTSSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCLASTrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCrr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSYr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRCPSSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSYr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSr")>;
-def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTSSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
def SKLWriteResGroup48 : SchedWriteRes<[SKLPort01]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "MULSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "SUBSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VMULSSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSYrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSDrr")>;
-def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSSrr")>;
-def: InstRW<[SKLWriteResGroup48],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
-
-def SKLWriteResGroup49 : SchedWriteRes<[SKLPort015]> {
- let Latency = 4;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPPDrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPPSrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPSDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CMPSSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CVTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMADDUBSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMADDWDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULDQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULHRSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULHUWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULHWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULLWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "PMULUDQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDYrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSYrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSrri")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQYrr")>;
-def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQrr")>;
-
-def SKLWriteResGroup50 : SchedWriteRes<[SKLPort5]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup50], (instregex "MPSADBWrri")>;
-def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWYrri")>;
-def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWrri")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "(V?)CVTDQ2PS(Y?)rr",
+ "(V?)CVT(T?)PS2DQ(Y?)rr")>;
def SKLWriteResGroup51 : SchedWriteRes<[SKLPort1,SKLPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup51], (instregex "IMUL64r")>;
-def: InstRW<[SKLWriteResGroup51], (instregex "MUL64r")>;
-def: InstRW<[SKLWriteResGroup51], (instregex "MULX64rr")>;
+def: InstRW<[SKLWriteResGroup51], (instrs IMUL64r, MUL64r, MULX64rr)>;
def SKLWriteResGroup51_16 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKLWriteResGroup51_16], (instregex "IMUL16r")>;
-def: InstRW<[SKLWriteResGroup51_16], (instregex "MUL16r")>;
-
-def SKLWriteResGroup52 : SchedWriteRes<[SKLPort5,SKLPort01]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLDYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLQYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLWYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRADYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRAWYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLDYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLQYrr")>;
-def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLWYrr")>;
+def: InstRW<[SKLWriteResGroup51_16], (instrs IMUL16r, MUL16r)>;
def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP16m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP32m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP64m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_F16m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_F32m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP16m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP32m")>;
-def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP64m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
def SKLWriteResGroup54 : SchedWriteRes<[SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[SKLWriteResGroup54], (instregex "FNCLEX")>;
+def: InstRW<[SKLWriteResGroup54], (instrs FNCLEX)>;
def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup55], (instregex "PAUSE")>;
+def: InstRW<[SKLWriteResGroup55], (instrs PAUSE)>;
def SKLWriteResGroup56 : SchedWriteRes<[SKLPort015,SKLPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup56], (instregex "VZEROUPPER")>;
+def: InstRW<[SKLWriteResGroup56], (instrs VZEROUPPER)>;
def SKLWriteResGroup57 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort0156]> {
let Latency = 4;
@@ -1766,72 +937,36 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOV64toPQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOV8rm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVDDUPrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSDrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSSrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHNTA")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT0")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT1")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT2")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDDUPrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSDrm")>;
-def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSSrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67?
def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup59], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[SKLWriteResGroup59], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[SKLWriteResGroup59], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[SKLWriteResGroup59], (instregex "MMX_CVTPI2PDirr",
+ "(V?)CVTDQ2PDrr")>;
def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2DQrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2PSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTPS2PDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSD2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI642SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTSS2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[SKLWriteResGroup60], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVT(T?)PD2DQrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTPS2PDrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI642SDrr",
+ "(V?)CVTSI2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVTSS2SDrr")>;
def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> {
let Latency = 5;
@@ -1841,73 +976,43 @@ def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> {
def: InstRW<[SKLWriteResGroup61], (instregex "STR(16|32|64)r")>;
def SKLWriteResGroup62 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup62], (instregex "IMUL32r")>;
-def: InstRW<[SKLWriteResGroup62], (instregex "MUL32r")>;
-def: InstRW<[SKLWriteResGroup62], (instregex "MULX32rr")>;
+def: InstRW<[SKLWriteResGroup62], (instrs IMUL32r, MUL32r, MULX32rr)>;
def SKLWriteResGroup63 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[SKLWriteResGroup63], (instregex "XSETBV")>;
+def: InstRW<[SKLWriteResGroup63], (instrs XSETBV)>;
def SKLWriteResGroup64 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG8rr")>;
+def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG(8|16|32|64)rr")>;
def SKLWriteResGroup65 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF16")>;
-def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF64")>;
-
-def SKLWriteResGroup66 : SchedWriteRes<[SKLPort5]> {
- let Latency = 6;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup66], (instregex "PCLMULQDQrr")>;
-def: InstRW<[SKLWriteResGroup66], (instregex "VPCLMULQDQrr")>;
+def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF(16|64)")>;
def SKLWriteResGroup67 : SchedWriteRes<[SKLPort23]> {
let Latency = 6;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup67], (instregex "LDDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVNTDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVSHDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVSLDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VLDDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQUrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVNTDQArm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPSrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTQrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VBROADCASTSSrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCASTDrm",
+ "VPBROADCASTQrm")>;
def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> {
let Latency = 6;
@@ -1921,247 +1026,83 @@ def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRADrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSWirm")>;
-
-def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort015]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
-
-def SKLWriteResGroup71 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSBirm",
+ "MMX_PADDSWirm",
+ "MMX_PADDUSBirm",
+ "MMX_PADDUSWirm",
+ "MMX_PAVGBirm",
+ "MMX_PAVGWirm",
+ "MMX_PCMPEQBirm",
+ "MMX_PCMPEQDirm",
+ "MMX_PCMPEQWirm",
+ "MMX_PCMPGTBirm",
+ "MMX_PCMPGTDirm",
+ "MMX_PCMPGTWirm",
+ "MMX_PMAXSWirm",
+ "MMX_PMAXUBirm",
+ "MMX_PMINSWirm",
+ "MMX_PMINUBirm",
+ "MMX_PSUBSBirm",
+ "MMX_PSUBSWirm",
+ "MMX_PSUBUSBirm",
+ "MMX_PSUBUSWirm")>;
+
+def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort01]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRBrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PINSRWrmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPSrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRBrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRWrmi")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "(V?)CVTSS2SI(64)?rr",
+ "(V?)CVT(T?)SD2SI(64)?rr")>;
def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64")>;
-def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>;
-
-def SKLWriteResGroup73 : SchedWriteRes<[SKLPort23,SKLPort05]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDBirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDDirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDQirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDWirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDNirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PORirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PXORirm")>;
+def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;
def SKLWriteResGroup74 : SchedWriteRes<[SKLPort23,SKLPort06]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup74], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "ADC8rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "ADCX(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "ADOX(32|64)rm")>;
def: InstRW<[SKLWriteResGroup74], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "RORX32mi")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "RORX64mi")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SARX32rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SARX64rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SBB8rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHLX32rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHLX64rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHRX32rm")>;
-def: InstRW<[SKLWriteResGroup74], (instregex "SHRX64rm")>;
def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup75], (instregex "MOVBE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup76], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "ADD8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "AND8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mi")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "CMP8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "OR8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "SUB8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mi")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mr")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "XOR8rm")>;
-
-def SKLWriteResGroup77 : SchedWriteRes<[SKLPort5,SKLPort01]> {
- let Latency = 6;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup77], (instregex "HADDPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "HADDPSrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPSrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPDYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPDrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSYrr")>;
-def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSrr")>;
-
-def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+def: InstRW<[SKLWriteResGroup76], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)rmr")>;
+
+def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort01]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup78], (instregex "CVTSI642SSrr")>;
-def: InstRW<[SKLWriteResGroup78], (instregex "VCVTSI642SSrr")>;
+def: InstRW<[SKLWriteResGroup78], (instregex "(V?)CVTSI642SSrr")>;
def SKLWriteResGroup79 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[SKLWriteResGroup79], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> {
let Latency = 6;
@@ -2170,102 +1111,51 @@ def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]
}
def: InstRW<[SKLWriteResGroup80], (instregex "SLDT(16|32|64)r")>;
-def SKLWriteResGroup81 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> {
- let Latency = 6;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKLWriteResGroup81], (instregex "VCVTPS2PHmr")>;
-
def SKLWriteResGroup82 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup82], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR8m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SAR8mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL8m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHL8mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR8m1")>;
-def: InstRW<[SKLWriteResGroup82], (instregex "SHR8mi")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def SKLWriteResGroup83 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "AND8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "DEC8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "INC(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "INC8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NEG8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "NOT8m")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "OR8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mi")>;
-def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
def SKLWriteResGroup84 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[SKLWriteResGroup84], (instregex "STD")>;
+def: InstRW<[SKLWriteResGroup84], (instrs STD)>;
def SKLWriteResGroup85 : SchedWriteRes<[SKLPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup85], (instregex "LD_F32m")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "LD_F64m")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "LD_F80m")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTF128")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTI128")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VLDDQUYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPSYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQAYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQUYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPSYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCASTDYrm",
+ "VPBROADCASTQYrm")>;
def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 7;
@@ -2274,654 +1164,192 @@ def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> {
}
def: InstRW<[SKLWriteResGroup86], (instregex "VCVTDQ2PDYrr")>;
-def SKLWriteResGroup87 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup87], (instregex "COMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "COMISSrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISSrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISSrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISDrm")>;
-def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISSrm")>;
-
def SKLWriteResGroup88 : SchedWriteRes<[SKLPort5,SKLPort23]> {
- let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup88], (instregex "INSERTPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKUSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PACKUSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PALIGNRrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PBLENDWrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFDmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFHWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFLWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPDrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPSrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VINSERTPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSDWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSWBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPALIGNRrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPBLENDWrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFBrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFDmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFHWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFLWmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPDrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPSrmi")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPSrm")>;
-
-def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 7;
+ let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[SKLWriteResGroup89], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
-def SKLWriteResGroup90 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup90], (instregex "PABSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PABSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PABSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PAVGBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PAVGWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PMINUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNBrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNDrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNWrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSLLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSLLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSLLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRADrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRAWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSRLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPABSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPABSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPABSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPAVGBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPAVGWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNBrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNDrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNWrm128")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRADrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAVDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVDrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVQrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSWrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSBrm")>;
-def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSWrm")>;
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr",
+ "VCVTPS2PDYrr",
+ "VCVT(T?)PD2DQYrr")>;
def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ANDPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPDrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPSrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "ORPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PADDWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PANDNrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PANDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PSUBWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "PXORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VANDPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPDrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPSrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTF128rm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTI128rm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VORPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPADDWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPANDNrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPANDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPBLENDDrmi")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBBrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBQrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBWrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VPXORrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VXORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "VXORPSrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "XORPDrm")>;
-def: InstRW<[SKLWriteResGroup91], (instregex "XORPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "(V?)INSERTF128rm",
+ "(V?)INSERTI128rm",
+ "(V?)PADD(B|D|Q|W)rm",
+ "(V?)PBLENDDrmi",
+ "(V?)PSUB(B|D|Q|W)rm")>;
def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKUSWBirm")>;
-
-def SKLWriteResGroup93 : SchedWriteRes<[SKLPort23,SKLPort06]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup93], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup93], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;
def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup94], (instregex "LEAVE64")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASB")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASL")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASQ")>;
-def: InstRW<[SKLWriteResGroup94], (instregex "SCASW")>;
+def: InstRW<[SKLWriteResGroup94], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
-def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> {
+def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort01]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[SKLWriteResGroup95], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[SKLWriteResGroup95], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[SKLWriteResGroup95], (instregex "(V?)CVTTSS2SI(64)?rr")>;
def SKLWriteResGroup96 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup96], (instregex "FLDCW16m")>;
-
-def SKLWriteResGroup97 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup97], (instregex "LDMXCSR")>;
-def: InstRW<[SKLWriteResGroup97], (instregex "VLDMXCSR")>;
+def: InstRW<[SKLWriteResGroup96], (instrs FLDCW16m)>;
def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup98], (instregex "LRETQ")>;
-def: InstRW<[SKLWriteResGroup98], (instregex "RETQ")>;
-
-def SKLWriteResGroup99 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup99], (instregex "BEXTR(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup98], (instrs LRETQ, RETQ)>;
def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL8m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROL8mi")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR8m1")>;
-def: InstRW<[SKLWriteResGroup100], (instregex "ROR8mi")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup101], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup101], (instregex "XADD8rm")>;
+def: InstRW<[SKLWriteResGroup101], (instregex "XADD(8|16|32|64)rm")>;
def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup102], (instregex "FARCALL64")>;
+def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;
def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 7;
let ResourceCycles = [1,3,1,2];
}
-def: InstRW<[SKLWriteResGroup103], (instregex "LOOP")>;
-
-def SKLWriteResGroup104 : SchedWriteRes<[SKLPort0]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup104], (instregex "AESIMCrr")>;
-def: InstRW<[SKLWriteResGroup104], (instregex "VAESIMCrr")>;
-
-def SKLWriteResGroup105 : SchedWriteRes<[SKLPort015]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup105], (instregex "PMULLDrr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDYrr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDrr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDSDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDSSr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPDr")>;
-def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPSr")>;
-
-def SKLWriteResGroup106 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPDrm")>;
-def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPSrm")>;
+def: InstRW<[SKLWriteResGroup103], (instrs LOOP)>;
def SKLWriteResGroup107 : SchedWriteRes<[SKLPort1,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup107], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "IMUL64m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "IMUL(32|64)rm(i8)?")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "IMUL8m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "MUL(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "MUL8m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
def SKLWriteResGroup107_16 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
- let Latency = 3;
+ let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup107_16], (instregex "IMUL16rm(i8)?")>;
+def: InstRW<[SKLWriteResGroup107_16], (instrs IMUL16rmi, IMUL16rmi8)>;
-def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
- let Latency = 3;
+def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort06, SKLPort0156, SKLPort23]> {
+ let Latency = 9;
let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[SKLWriteResGroup107_16_2], (instregex "IMUL16m")>;
-def: InstRW<[SKLWriteResGroup107_16_2], (instregex "MUL16m")>;
-
-def SKLWriteResGroup107_32 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup107_32], (instregex "IMUL32m")>;
-def: InstRW<[SKLWriteResGroup107_32], (instregex "MUL32m")>;
+def: InstRW<[SKLWriteResGroup107_16_2], (instrs IMUL16m, MUL16m)>;
def SKLWriteResGroup108 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOM32m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOM64m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP32m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP64m")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFBYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFDYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPSYrm")>;
-
-def SKLWriteResGroup109 : SchedWriteRes<[SKLPort01,SKLPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup109], (instregex "VPABSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPABSDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPABSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRADYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAVDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVDYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVQYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSWYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSWYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "FCOM(P?)(32|64)m",
+ "VPBROADCASTBYrm",
+ "VPBROADCASTWYrm",
+ "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXWQYrm")>;
def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VANDPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VORPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VORPSYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDBYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDQYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADDWYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPANDNYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPANDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVQYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPORYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBBYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBQYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBWYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPXORYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VXORPDYrm")>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VXORPSYrm")>;
-
-def SKLWriteResGroup111 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPDrm0")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPSrm0")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "PBLENDVBrm0")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPDrm")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPSrm")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPADD(B|D|Q|W)Yrm",
+ "VPBLENDDYrmi",
+ "VPSUB(B|D|Q|W)Yrm")>;
def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHSUBSWrm64")>;
-
-def SKLWriteResGroup113 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort05]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBWrm64")>;
-
-def SKLWriteResGroup114 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKLWriteResGroup114], (instregex "VCVTPS2PHYmr")>;
+def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PH(ADD|SUB)SWrm")>;
def SKLWriteResGroup115 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,3];
}
-def: InstRW<[SKLWriteResGroup115], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup115], (instregex "ROR8mCL")>;
+def: InstRW<[SKLWriteResGroup115], (instregex "ROR(8|16|32|64)mCL")>;
def SKLWriteResGroup116 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL8m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCL8mi")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR8m1")>;
-def: InstRW<[SKLWriteResGroup116], (instregex "RCR8mi")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;
def SKLWriteResGroup117 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[SKLWriteResGroup117], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "ROL8mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SAR8mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHL8mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup117], (instregex "SHR8mCL")>;
-
-def SKLWriteResGroup118 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
- let Latency = 8;
- let NumMicroOps = 6;
- let ResourceCycles = [1,1,1,3];
-}
-def: InstRW<[SKLWriteResGroup118], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup118], (instregex "ADC8mi")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
def SKLWriteResGroup119 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKLWriteResGroup119], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "ADC8mr")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG8rm")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mi")>;
-def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mr")>;
+def: SchedAlias<WriteADCRMW, SKLWriteResGroup119>;
+def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG(8|16|32|64)rm")>;
def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 9;
@@ -2929,280 +1357,75 @@ def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SKLWriteResGroup120], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "RCPSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "RSQRTSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VRCPSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VRSQRTSSm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPDYrm")>;
-def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPSYrm")>;
def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup121], (instregex "PCMPGTQrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "PSADBWrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPCMPGTQrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVZXWDYrm")>;
-def: InstRW<[SKLWriteResGroup121], (instregex "VPSADBWrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "(V?)PCMPGTQrm",
+ "VPMOVSXBWYrm",
+ "VPMOVSXDQYrm",
+ "VPMOVSXWDYrm",
+ "VPMOVZXWDYrm")>;
-def SKLWriteResGroup122 : SchedWriteRes<[SKLPort01,SKLPort23]> {
- let Latency = 9;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup122], (instregex "ADDSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "ADDSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "MULSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "MULSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "SUBSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "SUBSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VADDSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VADDSSrm")>;
-def: InstRW<[SKLWriteResGroup122],
- (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VMULSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VMULSSrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSDrm")>;
-def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSSrm")>;
-
-def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort01]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup123], (instregex "CMPSDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "CMPSSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "CVTPS2PDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[SKLWriteResGroup123], (instregex "VMIN(C?)SSrm")>;
-
-def SKLWriteResGroup124 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup124], (instregex "DPPDrri")>;
-def: InstRW<[SKLWriteResGroup124], (instregex "VDPPDrri")>;
-
-def SKLWriteResGroup125 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPSYrm")>;
-
-def SKLWriteResGroup126 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKLWriteResGroup126], (instregex "PTESTrm")>;
-def: InstRW<[SKLWriteResGroup126], (instregex "VPTESTrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIirm",
+ "(V?)CVTPS2PDrm")>;
def SKLWriteResGroup127 : SchedWriteRes<[SKLPort1,SKLPort5,SKLPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup127], (instregex "MULX64rm")>;
+def: InstRW<[SKLWriteResGroup127], (instrs IMUL64m, MUL64m, MULX64rm)>;
def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKLWriteResGroup128], (instregex "PHADDSWrm128")>;
-def: InstRW<[SKLWriteResGroup128], (instregex "PHSUBSWrm128")>;
-def: InstRW<[SKLWriteResGroup128], (instregex "VPHADDSWrm128")>;
-def: InstRW<[SKLWriteResGroup128], (instregex "VPHSUBSWrm128")>;
-
-def SKLWriteResGroup129 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup129], (instregex "PHADDDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "PHADDWrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBWrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDWrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBDrm")>;
-def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBWrm")>;
+def: InstRW<[SKLWriteResGroup128], (instregex "(V?)PHADDSWrm",
+ "(V?)PHSUBSWrm")>;
def SKLWriteResGroup130 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[SKLWriteResGroup130], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup131], (instregex "LSL(16|32|64)rm")>;
-
-def SKLWriteResGroup132 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup132], (instregex "AESDECLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "AESDECrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "AESENCLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "AESENCrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "RCPPSm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "RSQRTPSm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESENCLASTrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VAESENCrm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VRCPPSm")>;
-def: InstRW<[SKLWriteResGroup132], (instregex "VRSQRTPSm")>;
+def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
def SKLWriteResGroup133 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F16m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F32m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F64m")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2F128rm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2I128rm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMDYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPDYmi")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPSYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPERMQYmi")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXWQYrm")>;
-def: InstRW<[SKLWriteResGroup133], (instregex "VPSADBWYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VPCMPGTQYrm")>;
def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "MULPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "MULPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "SUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "SUBPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPSrm")>;
-def: InstRW<[SKLWriteResGroup134],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VMULPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VMULPSrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPSrm")>;
-
-def SKLWriteResGroup135 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup135], (instregex "CMPPDrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CMPPSrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTSS2SDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMADDUBSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMADDWDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULDQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULHRSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULHUWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULHWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULLWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "PMULUDQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPDrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPSrmi")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDWDrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULDQrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHRSWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHUWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULLWrm")>;
-def: InstRW<[SKLWriteResGroup135], (instregex "VPMULUDQrm")>;
-
-def SKLWriteResGroup136 : SchedWriteRes<[SKLPort0]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRIrr")>;
-def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRM128rr")>;
-
-def SKLWriteResGroup137 : SchedWriteRes<[SKLPort5,SKLPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKLWriteResGroup137], (instregex "MPSADBWrmi")>;
-def: InstRW<[SKLWriteResGroup137], (instregex "VMPSADBWrmi")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "(V?)CVTDQ2PSrm",
+ "(V?)CVTPS2DQrm",
+ "(V?)CVTSS2SDrm",
+ "(V?)CVTTPS2DQrm")>;
def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 10;
@@ -3210,188 +1433,107 @@ def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let ResourceCycles = [1,1,1];
}
def: InstRW<[SKLWriteResGroup138], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[SKLWriteResGroup138], (instregex "VPTESTYrm")>;
-def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup139], (instregex "CVTSD2SSrm")>;
-def: InstRW<[SKLWriteResGroup139], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[SKLWriteResGroup139], (instregex "(V?)CVTSD2SSrm")>;
def SKLWriteResGroup140 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
let Latency = 10;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWrm256")>;
-def: InstRW<[SKLWriteResGroup140], (instregex "VPHSUBSWrm256")>;
-
-def SKLWriteResGroup141 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDDYrm")>;
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDWYrm")>;
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBDYrm")>;
-def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBWYrm")>;
+def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWYrm",
+ "VPHSUBSWYrm")>;
def SKLWriteResGroup142 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort06,SKLPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKLWriteResGroup142], (instregex "MULX32rm")>;
+def: InstRW<[SKLWriteResGroup142], (instrs IMUL32m, MUL32m, MULX32rm)>;
def SKLWriteResGroup143 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 10;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,3];
}
-def: InstRW<[SKLWriteResGroup143], (instregex "ADD8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "AND8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "OR8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "SUB8mi")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "XCHG8rm")>;
-def: InstRW<[SKLWriteResGroup143], (instregex "XOR8mi")>;
-
-def SKLWriteResGroup144 : SchedWriteRes<[SKLPort05,SKLPort0156]> {
- let Latency = 10;
- let NumMicroOps = 10;
- let ResourceCycles = [9,1];
-}
-def: InstRW<[SKLWriteResGroup144], (instregex "MMX_EMMS")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(8|16|32|64)rm")>;
-def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0]> {
+def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
let Latency = 11;
let NumMicroOps = 1;
- let ResourceCycles = [1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup145], (instregex "DIVPSrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "DIVSSrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSYrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSrr")>;
-def: InstRW<[SKLWriteResGroup145], (instregex "VDIVSSrr")>;
+def : SchedAlias<WriteFDivX, SKLWriteResGroup145>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup146 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F32m")>;
-def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F64m")>;
-def: InstRW<[SKLWriteResGroup146], (instregex "VRCPPSYm")>;
-def: InstRW<[SKLWriteResGroup146], (instregex "VRSQRTPSYm")>;
+def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F(32|64)m")>;
def SKLWriteResGroup147 : SchedWriteRes<[SKLPort01,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDPSYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[SKLWriteResGroup147],
- (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VMULPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VMULPSYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPDYrm")>;
-def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPSYrm")>;
-
-def SKLWriteResGroup148 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPDYrmi")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPSYrmi")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2PDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDWDYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULDQYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHUWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULLWYrm")>;
-def: InstRW<[SKLWriteResGroup148], (instregex "VPMULUDQYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VCVTDQ2PSYrm",
+ "VCVTPS2PDYrm",
+ "VCVT(T?)PS2DQYrm")>;
def SKLWriteResGroup149 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOM16m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOM32m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP16m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP32m")>;
-def: InstRW<[SKLWriteResGroup149], (instregex "VMPSADBWYrmi")>;
+def: InstRW<[SKLWriteResGroup149], (instregex "FICOM(P?)(16|32)m")>;
def SKLWriteResGroup150 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup150], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[SKLWriteResGroup150], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[SKLWriteResGroup150], (instregex "(V?)CVTDQ2PDrm")>;
-def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort015]> {
+def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort01]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SIrm")>;
-
-def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+def: InstRW<[SKLWriteResGroup151], (instregex "(V?)CVTSS2SI64rm",
+ "(V?)CVT(T?)SD2SI(64)?rm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVT(T?)SS2SIrm")>;
+
+def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2DQrm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVT(T?)PD2PIirm")>;
def SKLWriteResGroup153 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 11;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[SKLWriteResGroup153], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,3,2];
}
-def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[SKLWriteResGroup154], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def SKLWriteResGroup155 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 11;
@@ -3405,66 +1547,21 @@ def SKLWriteResGroup156 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[SKLWriteResGroup156], (instregex "LOOPE")>;
-def: InstRW<[SKLWriteResGroup156], (instregex "LOOPNE")>;
-
-def SKLWriteResGroup157 : SchedWriteRes<[SKLPort0]> {
- let Latency = 12;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSYr")>;
-def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSr")>;
-def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTSSr")>;
-
-def SKLWriteResGroup158 : SchedWriteRes<[SKLPort5,SKLPort23]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup158], (instregex "PCLMULQDQrm")>;
-def: InstRW<[SKLWriteResGroup158], (instregex "VPCLMULQDQrm")>;
+def: InstRW<[SKLWriteResGroup156], (instrs LOOPE, LOOPNE)>;
-def SKLWriteResGroup159 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup159], (instregex "HADDPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "HADDPSrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPSrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPSrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPDrm")>;
-def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPSrm")>;
-
-def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> {
+def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort01]> {
let Latency = 12;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
def: InstRW<[SKLWriteResGroup160], (instregex "CVTTSS2SI64rm")>;
-def SKLWriteResGroup161 : SchedWriteRes<[SKLPort0]> {
- let Latency = 13;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup161], (instregex "SQRTPSr")>;
-def: InstRW<[SKLWriteResGroup161], (instregex "SQRTSSr")>;
-
def SKLWriteResGroup162 : SchedWriteRes<[SKLPort5,SKLPort23]> {
let Latency = 13;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI16m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI32m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI16m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI32m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI16m")>;
-def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI32m")>;
+def: InstRW<[SKLWriteResGroup162], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 13;
@@ -3473,67 +1570,27 @@ def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
}
def: InstRW<[SKLWriteResGroup163], (instregex "VCVTDQ2PDYrm")>;
-def SKLWriteResGroup164 : SchedWriteRes<[SKLPort5,SKLPort015]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SKLWriteResGroup164], (instregex "DPPSrri")>;
-def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSYrri")>;
-def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSrri")>;
-
-def SKLWriteResGroup165 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPDYrm")>;
-def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPSYrm")>;
-def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPDYrm")>;
-def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPSYrm")>;
-
-def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0]> {
+def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
let Latency = 14;
let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup166], (instregex "DIVPDrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "DIVSDrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDYrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDrr")>;
-def: InstRW<[SKLWriteResGroup166], (instregex "VDIVSDrr")>;
-
-def SKLWriteResGroup167 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKLWriteResGroup167], (instregex "AESIMCrm")>;
-def: InstRW<[SKLWriteResGroup167], (instregex "VAESIMCrm")>;
+def : SchedAlias<WriteFDiv64, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair
+def : SchedAlias<WriteFDiv64X, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair
-def SKLWriteResGroup168 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+def SKLWriteResGroup166_1 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,5];
}
-def: InstRW<[SKLWriteResGroup168], (instregex "PMULLDrm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPSm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSSm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VPMULLDrm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPSm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSDm")>;
-def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSSm")>;
+def : SchedAlias<WriteFDiv64Y, SKLWriteResGroup166_1>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI16m")>;
-def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI32m")>;
+def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI(16|32)m")>;
def SKLWriteResGroup170 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 14;
@@ -3547,215 +1604,105 @@ def SKLWriteResGroup171 : SchedWriteRes<[SKLPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FPrST0")>;
-def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FST0r")>;
-def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FrST0")>;
-
-def SKLWriteResGroup172 : SchedWriteRes<[SKLPort23,SKLPort015]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup172], (instregex "VPMULLDYrm")>;
-def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPDm")>;
-def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPSm")>;
-
-def SKLWriteResGroup173 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKLWriteResGroup173], (instregex "DPPDrmi")>;
-def: InstRW<[SKLWriteResGroup173], (instregex "VDPPDrmi")>;
+def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def SKLWriteResGroup174 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,5,1,1];
}
-def: InstRW<[SKLWriteResGroup174], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup174], (instregex "RCL8mCL")>;
-
-def SKLWriteResGroup175 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 16;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup175], (instregex "DIVSSrm")>;
-def: InstRW<[SKLWriteResGroup175], (instregex "VDIVSSrm")>;
-
-def SKLWriteResGroup176 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRIrm")>;
-def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRM128rm")>;
+def: InstRW<[SKLWriteResGroup174], (instregex "RCL(8|16|32|64)mCL")>;
def SKLWriteResGroup177 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 16;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[SKLWriteResGroup177], (instregex "CMPXCHG8B")>;
+def: InstRW<[SKLWriteResGroup177], (instrs CMPXCHG8B)>;
def SKLWriteResGroup178 : SchedWriteRes<[SKLPort0156]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[SKLWriteResGroup178], (instregex "VZEROALL")>;
+def: InstRW<[SKLWriteResGroup178], (instrs VZEROALL)>;
-def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 17;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,5];
}
-def: InstRW<[SKLWriteResGroup179], (instregex "DIVPSrm")>;
-def: InstRW<[SKLWriteResGroup179], (instregex "VDIVPSrm")>;
-def: InstRW<[SKLWriteResGroup179], (instregex "VSQRTSSm")>;
+def : SchedAlias<WriteFDivXLd, SKLWriteResGroup179>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup180 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> {
let Latency = 17;
let NumMicroOps = 15;
let ResourceCycles = [2,1,2,4,2,4];
}
-def: InstRW<[SKLWriteResGroup180], (instregex "XCH_F")>;
-
-def SKLWriteResGroup181 : SchedWriteRes<[SKLPort0]> {
- let Latency = 18;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDYr")>;
-def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDr")>;
-def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTSDr")>;
-
-def SKLWriteResGroup182 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup182], (instregex "SQRTSSm")>;
-def: InstRW<[SKLWriteResGroup182], (instregex "VDIVPSYrm")>;
-def: InstRW<[SKLWriteResGroup182], (instregex "VSQRTPSm")>;
-
-def SKLWriteResGroup183 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[SKLWriteResGroup183], (instregex "PCMPESTRIrr")>;
-def: InstRW<[SKLWriteResGroup183], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[SKLWriteResGroup180], (instrs XCH_F)>;
def SKLWriteResGroup184 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[SKLWriteResGroup184], (instregex "CPUID")>;
-def: InstRW<[SKLWriteResGroup184], (instregex "RDTSC")>;
+def: InstRW<[SKLWriteResGroup184], (instrs CPUID, RDTSC)>;
def SKLWriteResGroup185 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 18;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,4,1,2];
}
-def: InstRW<[SKLWriteResGroup185], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[SKLWriteResGroup185], (instregex "RCR8mCL")>;
+def: InstRW<[SKLWriteResGroup185], (instregex "RCR(8|16|32|64)mCL")>;
-def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 19;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup186], (instregex "DIVSDrm")>;
-def: InstRW<[SKLWriteResGroup186], (instregex "SQRTPSm")>;
-def: InstRW<[SKLWriteResGroup186], (instregex "VDIVSDrm")>;
-def: InstRW<[SKLWriteResGroup186], (instregex "VSQRTPSYm")>;
-
-def SKLWriteResGroup187 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 19;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
-}
-def: InstRW<[SKLWriteResGroup187], (instregex "DPPSrmi")>;
-def: InstRW<[SKLWriteResGroup187], (instregex "VDPPSrmi")>;
-
-def SKLWriteResGroup188 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015,SKLPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKLWriteResGroup188], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[SKLWriteResGroup188], (instregex "VPCMPESTRM128rr")>;
+def : SchedAlias<WriteFDiv64Ld, SKLWriteResGroup186>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup189 : SchedWriteRes<[SKLPort0]> {
let Latency = 20;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FPrST0")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FST0r")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FrST0")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "SQRTPDr")>;
-def: InstRW<[SKLWriteResGroup189], (instregex "SQRTSDr")>;
+def: InstRW<[SKLWriteResGroup189], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
-def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 20;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup190], (instregex "DIVPDrm")>;
-def: InstRW<[SKLWriteResGroup190], (instregex "VDIVPDrm")>;
-
-def SKLWriteResGroup191 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKLWriteResGroup191], (instregex "VDPPSYrmi")>;
+def : SchedAlias<WriteFDiv64XLd, SKLWriteResGroup190>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup192 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 20;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[SKLWriteResGroup192], (instregex "INSB")>;
-def: InstRW<[SKLWriteResGroup192], (instregex "INSL")>;
-def: InstRW<[SKLWriteResGroup192], (instregex "INSW")>;
+def: InstRW<[SKLWriteResGroup192], (instrs INSB, INSL, INSW)>;
def SKLWriteResGroup193 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort0156]> {
let Latency = 20;
let NumMicroOps = 10;
let ResourceCycles = [1,2,7];
}
-def: InstRW<[SKLWriteResGroup193], (instregex "MWAITrr")>;
+def: InstRW<[SKLWriteResGroup193], (instrs MWAITrr)>;
-def SKLWriteResGroup194 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> {
- let Latency = 20;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,2];
-}
-def: InstRW<[SKLWriteResGroup194], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[SKLWriteResGroup194], (instregex "VAESKEYGENASSIST128rr")>;
-
-def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
let Latency = 21;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,8];
}
-def: InstRW<[SKLWriteResGroup195], (instregex "VDIVPDYrm")>;
+def : SchedAlias<WriteFDiv64YLd, SKLWriteResGroup195>; // TODO - convert to ZnWriteResFpuPair
def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 22;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F32m")>;
-def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F64m")>;
+def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>;
def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
let Latency = 22;
@@ -3785,117 +1732,56 @@ def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm,
VPGATHERQQYrm,
VGATHERDPDYrm)>;
-def SKLWriteResGroup197 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 23;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup197], (instregex "VSQRTSDm")>;
-
def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 23;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[SKLWriteResGroup198], (instregex "CMPXCHG16B")>;
-
-def SKLWriteResGroup199 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 24;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup199], (instregex "VSQRTPDm")>;
-
-def SKLWriteResGroup200 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> {
- let Latency = 24;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[SKLWriteResGroup200], (instregex "PCMPESTRIrm")>;
-def: InstRW<[SKLWriteResGroup200], (instregex "VPCMPESTRIrm")>;
-
-def SKLWriteResGroup201 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup201], (instregex "SQRTSDm")>;
-def: InstRW<[SKLWriteResGroup201], (instregex "VSQRTPDYm")>;
+def: InstRW<[SKLWriteResGroup198], (instrs CMPXCHG16B)>;
def SKLWriteResGroup202 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 25;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI16m")>;
-def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI32m")>;
-
-def SKLWriteResGroup203 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015,SKLPort0156]> {
- let Latency = 25;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[SKLWriteResGroup203], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[SKLWriteResGroup203], (instregex "VPCMPESTRM128rm")>;
-
-def SKLWriteResGroup204 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> {
- let Latency = 25;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,1,1];
-}
-def: InstRW<[SKLWriteResGroup204], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[SKLWriteResGroup204], (instregex "VAESKEYGENASSIST128rm")>;
-
-def SKLWriteResGroup205 : SchedWriteRes<[SKLPort0,SKLPort23]> {
- let Latency = 26;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKLWriteResGroup205], (instregex "SQRTPDm")>;
+def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI(16|32)m")>;
def SKLWriteResGroup206 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 27;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F32m")>;
-def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F64m")>;
+def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F(32|64)m")>;
def SKLWriteResGroup207 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> {
let Latency = 28;
let NumMicroOps = 8;
let ResourceCycles = [2,4,1,1];
}
-def: InstRW<[SKLWriteResGroup207], (instregex "IDIV(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup207], (instregex "IDIV8m")>;
+def: InstRW<[SKLWriteResGroup207], (instregex "IDIV(8|16|32|64)m")>;
def SKLWriteResGroup208 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 30;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI16m")>;
-def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI32m")>;
+def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI(16|32)m")>;
def SKLWriteResGroup209 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort06,SKLPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)ri")>;
-def: InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)rr")>;
-def: InstRW<[SKLWriteResGroup209], (instregex "IN8ri")>;
-def: InstRW<[SKLWriteResGroup209], (instregex "IN8rr")>;
+def: InstRW<[SKLWriteResGroup209], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def SKLWriteResGroup210 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)ir")>;
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)rr")>;
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT8ir")>;
-def: InstRW<[SKLWriteResGroup210], (instregex "OUT8rr")>;
+def: InstRW<[SKLWriteResGroup210], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def SKLWriteResGroup211 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
let Latency = 37;
@@ -3909,28 +1795,29 @@ def SKLWriteResGroup212 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKL
let NumMicroOps = 18;
let ResourceCycles = [1,1,2,3,1,1,1,8];
}
-def: InstRW<[SKLWriteResGroup212], (instregex "VMCLEARm")>;
+def: InstRW<[SKLWriteResGroup212], (instrs VMCLEARm)>;
def SKLWriteResGroup213 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 41;
let NumMicroOps = 39;
let ResourceCycles = [1,10,1,1,26];
}
-def: InstRW<[SKLWriteResGroup213], (instregex "XSAVE64")>;
+def: InstRW<[SKLWriteResGroup213], (instrs XSAVE64)>;
def SKLWriteResGroup214 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[SKLWriteResGroup214], (instregex "RDTSCP")>;
+def: InstRW<[SKLWriteResGroup214], (instrs RDTSCP)>;
def SKLWriteResGroup215 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 42;
let NumMicroOps = 40;
let ResourceCycles = [1,11,1,1,26];
}
-def: InstRW<[SKLWriteResGroup215], (instregex "^XSAVE$", "XSAVEC", "XSAVES")>;
+def: InstRW<[SKLWriteResGroup215], (instrs XSAVE)>;
+def: InstRW<[SKLWriteResGroup215], (instregex "XSAVEC", "XSAVES")>;
def SKLWriteResGroup216 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 46;
@@ -3944,29 +1831,28 @@ def SKLWriteResGroup217 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05,SKLPort06,
let NumMicroOps = 64;
let ResourceCycles = [2,8,5,10,39];
}
-def: InstRW<[SKLWriteResGroup217], (instregex "FLDENVm")>;
-def: InstRW<[SKLWriteResGroup217], (instregex "FLDENVm")>;
+def: InstRW<[SKLWriteResGroup217], (instrs FLDENVm)>;
def SKLWriteResGroup218 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 63;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[SKLWriteResGroup218], (instregex "FXRSTOR64")>;
+def: InstRW<[SKLWriteResGroup218], (instrs FXRSTOR64)>;
def SKLWriteResGroup219 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 63;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[SKLWriteResGroup219], (instregex "FXRSTOR")>;
+def: InstRW<[SKLWriteResGroup219], (instrs FXRSTOR)>;
def SKLWriteResGroup220 : SchedWriteRes<[SKLPort5,SKLPort05,SKLPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[SKLWriteResGroup220], (instregex "FNINIT")>;
+def: InstRW<[SKLWriteResGroup220], (instrs FNINIT)>;
def SKLWriteResGroup221 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> {
let Latency = 76;
@@ -3987,7 +1873,8 @@ def SKLWriteResGroup223 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort4,SKLPort5,SKL
let NumMicroOps = 100;
let ResourceCycles = [9,1,11,16,1,11,21,30];
}
-def: InstRW<[SKLWriteResGroup223], (instregex "FSTENVm")>;
-def: InstRW<[SKLWriteResGroup223], (instregex "FSTENVm")>;
+def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 439a2ffa36a4..7095ec081bd9 100755
--- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -19,7 +19,7 @@ def SkylakeServerModel : SchedMachineModel {
let MicroOpBufferSize = 224; // Based on the reorder buffer.
let LoadLatency = 5;
let MispredictPenalty = 14;
-
+
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
@@ -61,6 +61,10 @@ def SKXPort015 : ProcResGroup<[SKXPort0, SKXPort1, SKXPort5]>;
def SKXPort056 : ProcResGroup<[SKXPort0, SKXPort5, SKXPort6]>;
def SKXPort0156: ProcResGroup<[SKXPort0, SKXPort1, SKXPort5, SKXPort6]>;
+def SKXDivider : ProcResource<1>; // Integer division issued on port 0.
+// FP division and sqrt on port 0.
+def SKXFPDivider : ProcResource<1>;
+
// 60 Entry Unified Scheduler
def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4,
SKXPort5, SKXPort6, SKXPort7]> {
@@ -77,45 +81,84 @@ def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [SKXPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SKXPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
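[Illustrative sketch, not part of the patch: the multiclass above turns every later `defm : SKXWriteResPair<...>` line into two scheduling records, a register form and a folded-load form on SKXPort23. Assuming the default LoadLat of 5 and borrowing the WriteCRC32 pair defined further down as the example (the folded class name follows the usual `...Ld` suffix convention), the expansion is roughly:

    def : WriteRes<WriteCRC32, [SKXPort1]> {
      let Latency = 3;           // register form: 3-cycle latency on port 1
      let ResourceCycles = [1];
      let NumMicroOps = 1;
    }
    def : WriteRes<WriteCRC32Ld, [SKXPort23, SKXPort1]> {
      let Latency = 8;           // 3 + default LoadLat of 5 for the folded load
      let ResourceCycles = [1, 1];
      let NumMicroOps = 2;       // one extra uop for the load
    }
]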
-// A folded store needs a cycle on port 4 for the store data, but it does not
-// need an extra port 2/3 cycle to recompute the address.
-def : WriteRes<WriteRMW, [SKXPort4]>;
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SKXPort237,SKXPort4]>;
// Arithmetic.
-defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>; // Simple integer ALU op.
-defm : SKXWriteResPair<WriteIMul, SKXPort1, 3>; // Integer multiplication.
-def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
-def SKXDivider : ProcResource<1>; // Integer division issued on port 0.
-def : WriteRes<WriteIDiv, [SKXPort0, SKXDivider]> { // Integer division.
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [SKXPort23, SKXPort0, SKXDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
+defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>; // Simple integer ALU op.
+defm : SKXWriteResPair<WriteADC, [SKXPort06], 1>; // Integer ALU + flags op.
+defm : SKXWriteResPair<WriteIMul, [SKXPort1], 3>; // Integer multiplication.
+defm : SKXWriteResPair<WriteIMul64, [SKXPort1], 3>; // Integer 64-bit multiplication.
+defm : SKXWriteResPair<WriteBSWAP32,[SKXPort15], 1>; //
+defm : SKXWriteResPair<WriteBSWAP64,[SKXPort06, SKXPort15], 2, [1,1], 2>; //
+
+defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteDiv32, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteDiv64, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv32, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : SKXWriteResPair<WriteIDiv64, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
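[Worked reading of the division entries above, offered as an interpretation rather than something the patch states: ResourceCycles = [1,10] means each (I)DIV occupies SKXPort0 for one cycle but holds the single non-pipelined SKXDivider unit for ten, so the model allows at most one independent division per 10 cycles even though the result latency is 25. The explicit LoadLat of 4 overrides the default 5, giving the folded-load variants a latency of 25 + 4 = 29 on [SKXPort23, SKXPort0, SKXDivider] with ResourceCycles [1,1,10].]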
+
+defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>;
+
+def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads.
+defm : SKXWriteResPair<WriteCMOV, [SKXPort06], 1, [1], 1>; // Conditional move.
+defm : SKXWriteResPair<WriteCMOV2, [SKXPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [SKXPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SKXPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def : WriteRes<WriteLAHFSAHF, [SKXPort06]>;
+
// Integer shifts and rotates.
-defm : SKXWriteResPair<WriteShift, SKXPort06, 1>;
+defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
+
+// Double shift instructions.
+defm : SKXWriteResPair<WriteShiftDouble, [SKXPort06], 1>;
+
+// Bit counts.
+defm : SKXWriteResPair<WriteBSF, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteBSR, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteLZCNT, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteTZCNT, [SKXPort1], 3>;
+defm : SKXWriteResPair<WritePOPCNT, [SKXPort1], 3>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : SKXWriteResPair<WriteBEXTR, [SKXPort06,SKXPort15], 2, [1,1], 2>;
+defm : SKXWriteResPair<WriteBZHI, [SKXPort15], 1>;
// Loads, stores, and moves, not folded with other operations.
-def : WriteRes<WriteLoad, [SKXPort23]> { let Latency = 5; }
-def : WriteRes<WriteStore, [SKXPort237, SKXPort4]>;
-def : WriteRes<WriteMove, [SKXPort0156]>;
+defm : X86WriteRes<WriteLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [SKXPort237, SKXPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [SKXPort237, SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [SKXPort0156], 1, [1], 1>;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
@@ -123,153 +166,374 @@ def : WriteRes<WriteZero, []>;
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : SKXWriteResPair<WriteJump, SKXPort06, 1>;
+defm : SKXWriteResPair<WriteJump, [SKXPort06], 1>;
// Floating point. This covers both scalar and vector operations.
-defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare.
-defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication.
-defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : SKXWriteResPair<WriteFSqrt, SKXPort0, 15>; // Floating point square root.
-defm : SKXWriteResPair<WriteFRcp, SKXPort0, 5>; // Floating point reciprocal estimate.
-defm : SKXWriteResPair<WriteFRsqrt, SKXPort0, 5>; // Floating point reciprocal square root estimate.
-defm : SKXWriteResPair<WriteFMA, SKXPort015, 4>; // Fused Multiply Add.
-defm : SKXWriteResPair<WriteFShuffle, SKXPort5, 1>; // Floating point vector shuffles.
-defm : SKXWriteResPair<WriteFBlend, SKXPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [SKXPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [SKXPort5, SKXPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : X86WriteRes<WriteFLD0, [SKXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SKXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [SKXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFStore, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SKXPort05,SKXPort0156], 10, [9,1], 10>;
+
+defm : SKXWriteResPair<WriteFAdd, [SKXPort01], 4, [1], 1, 5>; // Floating point add/sub.
+defm : SKXWriteResPair<WriteFAddX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFAddY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAddZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAdd64, [SKXPort01], 4, [1], 1, 5>; // Floating point double add/sub.
+defm : SKXWriteResPair<WriteFAdd64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFAdd64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAdd64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFCmp, [SKXPort01], 4, [1], 1, 5>; // Floating point compare.
+defm : SKXWriteResPair<WriteFCmpX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFCmpY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmpZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmp64, [SKXPort01], 4, [1], 1, 5>; // Floating point double compare.
+defm : SKXWriteResPair<WriteFCmp64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFCmp64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmp64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags.
+
+defm : SKXWriteResPair<WriteFMul, [SKXPort01], 4, [1], 1, 5>; // Floating point multiplication.
+defm : SKXWriteResPair<WriteFMulX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMulY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMulZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMul64, [SKXPort01], 4, [1], 1, 5>; // Floating point double multiplication.
+defm : SKXWriteResPair<WriteFMul64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMul64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMul64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFDiv, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : SKXWriteResPair<WriteFDivX, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 6>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDivY, [SKXPort0,SKXFPDivider], 11, [1,5], 1, 7>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDivZ, [SKXPort0,SKXPort5,SKXFPDivider], 18, [2,1,10], 3, 7>; // 10-14 cycles.
+//defm : SKXWriteResPair<WriteFDiv64, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : SKXWriteResPair<WriteFDiv64X, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 6>; // 10-14 cycles.
+//defm : SKXWriteResPair<WriteFDiv64Y, [SKXPort0,SKXFPDivider], 14, [1,5], 1, 7>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDiv64Z, [SKXPort0,SKXPort5,SKXFPDivider], 23, [2,1,16], 3, 7>; // 10-14 cycles.
+
+defm : SKXWriteResPair<WriteFSqrt, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 5>; // Floating point square root.
+defm : SKXWriteResPair<WriteFSqrtX, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 6>;
+defm : SKXWriteResPair<WriteFSqrtY, [SKXPort0,SKXFPDivider], 12, [1,6], 1, 7>;
+defm : SKXWriteResPair<WriteFSqrtZ, [SKXPort0,SKXPort5,SKXFPDivider], 20, [2,1,12], 3, 7>;
+defm : SKXWriteResPair<WriteFSqrt64, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 5>; // Floating point double square root.
+defm : SKXWriteResPair<WriteFSqrt64X, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 6>;
+defm : SKXWriteResPair<WriteFSqrt64Y, [SKXPort0,SKXFPDivider], 18, [1,12],1, 7>;
+defm : SKXWriteResPair<WriteFSqrt64Z, [SKXPort0,SKXPort5,SKXFPDivider], 32, [2,1,24], 3, 7>;
+defm : SKXWriteResPair<WriteFSqrt80, [SKXPort0,SKXFPDivider], 21, [1,7]>; // Floating point long double square root.
+
+defm : SKXWriteResPair<WriteFRcp, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : SKXWriteResPair<WriteFRcpX, [SKXPort0], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFRcpY, [SKXPort0], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFRcpZ, [SKXPort0,SKXPort5], 4, [2,1], 3, 7>;
+
+defm : SKXWriteResPair<WriteFRsqrt, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : SKXWriteResPair<WriteFRsqrtX,[SKXPort0], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFRsqrtY,[SKXPort0], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFRsqrtZ,[SKXPort0,SKXPort5], 9, [2,1], 3, 7>;
+
+defm : SKXWriteResPair<WriteFMA, [SKXPort01], 4, [1], 1, 5>; // Fused Multiply Add.
+defm : SKXWriteResPair<WriteFMAX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMAY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMAZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteDPPD, [SKXPort5,SKXPort015], 9, [1,2], 3, 6>; // Floating point double dot product.
+defm : SKXWriteResPair<WriteDPPS, [SKXPort5,SKXPort015], 13, [1,3], 4, 6>;
+defm : SKXWriteResPair<WriteDPPSY,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : SKXWriteResPair<WriteDPPSZ,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : SKXWriteResPair<WriteFSign, [SKXPort0], 1>; // Floating point fabs/fchs.
+defm : SKXWriteResPair<WriteFRnd, [SKXPort01], 8, [2], 2, 6>; // Floating point rounding.
+defm : SKXWriteResPair<WriteFRndY, [SKXPort01], 8, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFRndZ, [SKXPort05], 8, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFLogic, [SKXPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
+defm : SKXWriteResPair<WriteFLogicY, [SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFLogicZ, [SKXPort05], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFTest, [SKXPort0], 2, [1], 1, 6>; // Floating point TEST instructions.
+defm : SKXWriteResPair<WriteFTestY, [SKXPort0], 2, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFTestZ, [SKXPort0], 2, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKXWriteResPair<WriteFShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : SKXWriteResPair<WriteFVarShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFBlend, [SKXPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : SKXWriteResPair<WriteFBlendY,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFBlendZ,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarBlend, [SKXPort015], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : SKXWriteResPair<WriteFVarBlendY,[SKXPort015], 2, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFVarBlendZ,[SKXPort015], 2, [2], 2, 7>;
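The SKXWriteResPair lines above each describe one scheduling class: the SchedWrite being modelled, the execution ports it occupies, its latency, the per-port resource cycles, the micro-op count, and the extra latency charged to the folded-load form. As a rough sketch only (the multiclass itself is defined elsewhere in the .td file; the name ExampleWriteResPair, the parameter names, the defaults, and the SKXPort23 load port below are illustrative assumptions, not taken from this diff), such a pair multiclass typically expands into a register variant and a memory variant:

multiclass ExampleWriteResPair<X86FoldableSchedWrite SchedRW,
                               list<ProcResourceKind> ExePorts,
                               int Lat, list<int> Res = [1], int UOps = 1,
                               int LoadLat = 5> {
  // Register variant: issues on ExePorts with the given latency and uop count.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }
  // Folded-load variant: adds a load-port cycle and LoadLat cycles of latency.
  def : WriteRes<SchedRW.Folded, !listconcat([SKXPort23], ExePorts)> {
    let Latency = !add(Lat, LoadLat);
    let ResourceCycles = !listconcat([1], Res);
    let NumMicroOps = !add(UOps, 1);
  }
}

Under that assumption, a line such as defm : SKXWriteResPair<WriteFAdd, [SKXPort01], 4, [1], 1, 5>; would produce both the register form (latency 4 on port 01) and its folded-load form (latency 4+5 with an extra load micro-op), which is why a single defm per operation suffices in this hunk.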
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
-defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals.
-defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts.
-defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply.
-defm : SKXWriteResPair<WriteShuffle, SKXPort5, 1>; // Vector shuffles.
-defm : SKXWriteResPair<WriteBlend, SKXPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [SKXPort5]> { // Vector variable blends.
+defm : X86WriteRes<WriteVecLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecStore, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SKXPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SKXPort5], 1, [1], 1>;
+
+defm : SKXWriteResPair<WriteVecALU, [SKXPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : SKXWriteResPair<WriteVecALUX, [SKXPort01], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecALUY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecALUZ, [SKXPort0], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecLogic, [SKXPort05], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : SKXWriteResPair<WriteVecLogicX,[SKXPort015], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecLogicY,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecLogicZ,[SKXPort05], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
+defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
+defm : SKXWriteResPair<WriteVecTestZ, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
+defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 4, [1], 1, 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WritePMULLD, [SKXPort01], 10, [2], 2, 6>; // Vector PMULLD.
+defm : SKXWriteResPair<WritePMULLDY, [SKXPort01], 10, [2], 2, 7>;
+defm : SKXWriteResPair<WritePMULLDZ, [SKXPort05], 10, [2], 2, 7>;
+defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKXWriteResPair<WriteShuffleX, [SKXPort5], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : SKXWriteResPair<WriteVarShuffleX, [SKXPort5], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVarShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteBlend, [SKXPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKXWriteResPair<WriteBlendY,[SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteBlendZ,[SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarBlend, [SKXPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKXWriteResPair<WriteVarBlendY,[SKXPort015], 2, [2], 2, 6>;
+defm : SKXWriteResPair<WriteVarBlendZ,[SKXPort05], 2, [1], 1, 6>;
+defm : SKXWriteResPair<WriteMPSAD, [SKXPort5], 4, [2], 2, 6>; // Vector MPSAD.
+defm : SKXWriteResPair<WriteMPSADY, [SKXPort5], 4, [2], 2, 7>;
+defm : SKXWriteResPair<WriteMPSADZ, [SKXPort5], 4, [2], 2, 7>;
+defm : SKXWriteResPair<WritePSADBW, [SKXPort5], 3, [1], 1, 5>; // Vector PSADBW.
+defm : SKXWriteResPair<WritePSADBWX, [SKXPort5], 3, [1], 1, 6>;
+defm : SKXWriteResPair<WritePSADBWY, [SKXPort5], 3, [1], 1, 7>;
+defm : SKXWriteResPair<WritePSADBWZ, [SKXPort5], 3, [1], 1, 7>;
+defm : SKXWriteResPair<WritePHMINPOS, [SKXPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1, [1], 1, 5>;
+defm : X86WriteRes<WriteVecShiftX, [SKXPort5,SKXPort01], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftY, [SKXPort5,SKXPort01], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZ, [SKXPort5,SKXPort0], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftXLd, [SKXPort01,SKXPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [SKXPort01,SKXPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZLd, [SKXPort0,SKXPort23], 8, [1,1], 2>;
+
+defm : SKXWriteResPair<WriteVecShiftImm, [SKXPort0], 1, [1], 1, 5>;
+defm : SKXWriteResPair<WriteVecShiftImmX, [SKXPort01], 1, [1], 1, 6>; // Vector integer immediate shifts.
+defm : SKXWriteResPair<WriteVecShiftImmY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecShiftImmZ, [SKXPort0], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarVecShift, [SKXPort01], 1, [1], 1, 6>; // Variable vector shifts.
+defm : SKXWriteResPair<WriteVarVecShiftY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarVecShiftZ, [SKXPort0], 1, [1], 1, 7>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SKXPort5]> {
let Latency = 2;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteVarBlendLd, [SKXPort5, SKXPort23]> {
+def : WriteRes<WriteVecInsertLd, [SKXPort5,SKXPort23]> {
let Latency = 6;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
-def : WriteRes<WriteMPSAD, [SKXPort0, SKXPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
+def : WriteRes<WriteVecExtract, [SKXPort0,SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
}
-def : WriteRes<WriteMPSADLd, [SKXPort23, SKXPort0, SKXPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
+def : WriteRes<WriteVecExtractSt, [SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
}
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm : SKXWriteResPair<WriteVecLogic, SKXPort015, 1>; // Vector and/or/xor.
-
// Conversion between integer and float.
-defm : SKXWriteResPair<WriteCvtF2I, SKXPort1, 3>; // Float -> Integer.
-defm : SKXWriteResPair<WriteCvtI2F, SKXPort1, 4>; // Integer -> Float.
-defm : SKXWriteResPair<WriteCvtF2F, SKXPort1, 3>; // Float -> Float size conversion.
+defm : SKXWriteResPair<WriteCvtSS2I, [SKXPort01], 6, [2], 2>; // Needs more work: DD vs DQ.
+defm : SKXWriteResPair<WriteCvtPS2I, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPS2IY, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPS2IZ, [SKXPort05], 3>;
+defm : SKXWriteResPair<WriteCvtSD2I, [SKXPort01], 6, [2], 2>;
+defm : SKXWriteResPair<WriteCvtPD2I, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPD2IY, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPD2IZ, [SKXPort05], 3>;
+
+defm : SKXWriteResPair<WriteCvtI2SS, [SKXPort1], 4>;
+defm : SKXWriteResPair<WriteCvtI2PS, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PSY, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PSZ, [SKXPort05], 4>; // Needs more work: DD vs DQ.
+defm : SKXWriteResPair<WriteCvtI2SD, [SKXPort1], 4>;
+defm : SKXWriteResPair<WriteCvtI2PD, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PDY, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PDZ, [SKXPort05], 4>;
+
+defm : SKXWriteResPair<WriteCvtSS2SD, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPS2PD, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPS2PDY, [SKXPort5,SKXPort01], 3, [1,1], 2>;
+defm : SKXWriteResPair<WriteCvtPS2PDZ, [SKXPort05], 3, [2], 2>;
+defm : SKXWriteResPair<WriteCvtSD2SS, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPD2PS, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPD2PSY, [SKXPort5,SKXPort01], 3, [1,1], 2>;
+defm : SKXWriteResPair<WriteCvtPD2PSZ, [SKXPort05], 3, [2], 2>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [SKXPort5,SKXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [SKXPort5,SKXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZ, [SKXPort5,SKXPort0], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [SKXPort23,SKXPort01], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [SKXPort23,SKXPort01], 10, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZLd, [SKXPort23,SKXPort05], 10, [1,1], 2>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [SKXPort5,SKXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SKXPort5,SKXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [SKXPort5,SKXPort05], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 6, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 8, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort05], 8, [1,1,1,1], 4>;
 // String instructions.
+
// Packed Compare Implicit Length Strings, Return Mask
-// String instructions.
def : WriteRes<WritePCmpIStrM, [SKXPort0]> {
let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrMLd, [SKXPort0, SKXPort23]> {
- let Latency = 10;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort16, SKXPort5]> {
- let Latency = 10;
- let ResourceCycles = [3, 2, 4];
+def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort5, SKXPort015, SKXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : WriteRes<WritePCmpEStrMLd, [SKXPort05, SKXPort16, SKXPort23]> {
- let Latency = 10;
- let ResourceCycles = [6, 2, 1];
-}
- // Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrMLd, [SKXPort0, SKXPort5, SKXPort23, SKXPort015, SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
def : WriteRes<WritePCmpIStrI, [SKXPort0]> {
- let Latency = 11;
+ let Latency = 10;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
def : WriteRes<WritePCmpIStrILd, [SKXPort0, SKXPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 1];
-}
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [SKXPort05, SKXPort16]> {
- let Latency = 11;
- let ResourceCycles = [6, 2];
+def : WriteRes<WritePCmpEStrI, [SKXPort0,SKXPort5,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort16, SKXPort5, SKXPort23]> {
- let Latency = 11;
- let ResourceCycles = [3, 2, 2, 1];
+def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort5, SKXPort23, SKXPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SKXPort0]> { let Latency = 2; }
+
// AES instructions.
-def : WriteRes<WriteAESDecEnc, [SKXPort5]> { // Decryption, encryption.
- let Latency = 7;
+def : WriteRes<WriteAESDecEnc, [SKXPort0]> { // Decryption, encryption.
+ let Latency = 4;
+ let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def : WriteRes<WriteAESDecEncLd, [SKXPort5, SKXPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
+def : WriteRes<WriteAESDecEncLd, [SKXPort0, SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : WriteRes<WriteAESIMC, [SKXPort5]> { // InvMixColumn.
- let Latency = 14;
+
+def : WriteRes<WriteAESIMC, [SKXPort0]> { // InvMixColumn.
+ let Latency = 8;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteAESIMCLd, [SKXPort5, SKXPort23]> {
+def : WriteRes<WriteAESIMCLd, [SKXPort0, SKXPort23]> {
let Latency = 14;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : WriteRes<WriteAESKeyGen, [SKXPort0, SKXPort5]> { // Key Generation.
- let Latency = 10;
- let ResourceCycles = [2, 8];
+
+def : WriteRes<WriteAESKeyGen, [SKXPort0,SKXPort5,SKXPort015]> { // Key Generation.
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
}
-def : WriteRes<WriteAESKeyGenLd, [SKXPort0, SKXPort5, SKXPort23]> {
- let Latency = 10;
- let ResourceCycles = [2, 7, 1];
+def : WriteRes<WriteAESKeyGenLd, [SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [SKXPort0, SKXPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
+def : WriteRes<WriteCLMul, [SKXPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : WriteRes<WriteCLMulLd, [SKXPort0, SKXPort5, SKXPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
+def : WriteRes<WriteCLMulLd, [SKXPort5, SKXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : SKXWriteResPair<WriteFShuffle256, SKXPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : SKXWriteResPair<WriteShuffle256, SKXPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [SKXPort0, SKXPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [SKXPort0, SKXPort5, SKXPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteFVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
 // Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
@@ -277,33 +541,22 @@ def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def Wri
// Fence instructions.
def : WriteRes<WriteFence, [SKXPort23, SKXPort4]>;
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [SKXPort0,SKXPort23,SKXPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SKXPort4,SKXPort5,SKXPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
 // Nop, not very useful except that it provides a model for nops!
def : WriteRes<WriteNop, []>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SKXPort1]> {
- let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SKXPort1, SKXPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SKXPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SKXPort15, SKXPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>;
+defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>;
+defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>;
+defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>;
+defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>;
// Remaining instrs.
@@ -312,358 +565,35 @@ def SKXWriteResGroup1 : SchedWriteRes<[SKXPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDNWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KANDWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVBkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVDkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVQkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KMOVWkk")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KNOTWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KORWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXNORWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORBrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "KXORWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQDirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTDirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXUBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINUBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWri")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWrr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSBirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSWirr")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZrr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup2 : SchedWriteRes<[SKXPort1]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr",
+ "KANDN(B|D|Q|W)rr",
+ "KMOV(B|D|Q|W)kk",
+ "KNOT(B|D|Q|W)rr",
+ "KOR(B|D|Q|W)rr",
+ "KXNOR(B|D|Q|W)rr",
+ "KXOR(B|D|Q|W)rr",
+ "MMX_PADDS(B|W)irr",
+ "MMX_PADDUS(B|W)irr",
+ "MMX_PAVG(B|W)irr",
+ "MMX_PCMPEQ(B|D|W)irr",
+ "MMX_PCMPGT(B|D|W)irr",
+ "MMX_P(MAX|MIN)SWirr",
+ "MMX_P(MAX|MIN)UBirr",
+ "MMX_PSUBS(B|W)irr",
+ "MMX_PSUBUS(B|W)irr",
+ "VPMOVB2M(Z|Z128|Z256)rr",
+ "VPMOVD2M(Z|Z128|Z256)rr",
+ "VPMOVQ2M(Z|Z128|Z256)rr",
+ "VPMOVW2M(Z|Z128|Z256)rr")>;
def SKXWriteResGroup3 : SchedWriteRes<[SKXPort5]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup3], (instregex "COMP_FST0r")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "COM_FST0r")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "INSERTPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVBkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVDkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVQkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "KMOVWkr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64rr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOV64toPQIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVDDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVDI2PDIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVHLPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVSHDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PALIGNRrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PBLENDWrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFDri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFHWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFLWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSLLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PSRLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPDrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPSrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_FPr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_Fr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTI32X2Z128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTSSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSSZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQYri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSYrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSrri")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "KMOV(B|D|Q|W)kr",
+ "UCOM_F(P?)r")>;
def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> {
let Latency = 1;
@@ -672,907 +602,79 @@ def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> {
}
def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>;
-def SKXWriteResGroup5 : SchedWriteRes<[SKXPort01]> {
- let Latency = 1;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup5], (instregex "PABSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PABSDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PABSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PAVGBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PAVGWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PMINUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNBrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNDrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNWrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSLLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSLLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSLLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRADri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRAWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSRLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBYrr256")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDYrr256")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWYrr256")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWrr128")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ128ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ256ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ128ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ256ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ128r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWYri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ128ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ256ri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWri")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWYrr")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWrr")>;
-
def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup6], (instregex "FINCSTP")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "FNOP")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSBrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSDrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSWrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDBirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDDirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDQirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDWirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDNirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PORirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNBrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNDrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNWrr64")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBBirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBDirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBQirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBWirr")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PXORirr")>;
+def: InstRW<[SKXWriteResGroup6], (instrs FINCSTP, FNOP)>;
def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADCX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADOX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CDQ")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CLAC")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "CQO")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JAE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JAE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JA_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JA_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JBE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JBE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JB_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JB_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JGE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JGE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JG_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JG_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JLE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JLE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JL_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JL_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JMP_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JMP_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNE_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNE_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNO_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNO_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNP_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNP_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNS_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JNS_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JO_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JO_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JP_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JP_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JS_1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "JS_4")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "RORX(32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR8r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SAR8ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SARX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETAEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETBr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETGEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETGr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETLEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETLr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNEr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNOr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNPr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETNSr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETOr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETPr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SETSr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL8r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHL8ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHLX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR8r1")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHR8ri")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SHRX(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "STAC")>;
+def: InstRW<[SKXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8",
+ "BT(16|32|64)rr",
+ "BTC(16|32|64)ri8",
+ "BTC(16|32|64)rr",
+ "BTR(16|32|64)ri8",
+ "BTR(16|32|64)rr",
+ "BTS(16|32|64)ri8",
+ "BTS(16|32|64)rr")>;
def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BLSI(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BLSR(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "BZHI(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr",
+ "BLSI(32|64)rr",
+ "BLSMSK(32|64)rr",
+ "BLSR(32|64)rr")>;
def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ANDPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPDrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPSrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "ORPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PADDWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PANDNrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PANDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PSUBWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "PXORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDYrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSYrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Zrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI(2Q|Lo2PQ)IZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ128rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ256rr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZrr(b?)(k?)(z?)(_REV)?")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VORPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPANDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDYrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VPXORrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSYrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "XORPDrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "XORPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr",
+ "VBLENDMPS(Z128|Z256)rr",
+ "VPADD(B|D|Q|W)(Y|Z|Z128|Z256)rr",
+ "(V?)PADD(B|D|Q|W)rr",
+ "VPBLENDD(Y?)rri",
+ "VPBLENDMB(Z128|Z256)rr",
+ "VPBLENDMD(Z128|Z256)rr",
+ "VPBLENDMQ(Z128|Z256)rr",
+ "VPBLENDMW(Z128|Z256)rr",
+ "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rr",
+ "(V?)PSUB(B|D|Q|W)rr",
+ "VPTERNLOGD(Z|Z128|Z256)rri",
+ "VPTERNLOGQ(Z|Z128|Z256)rri")>;
def SKXWriteResGroup10 : SchedWriteRes<[SKXPort0156]> {
let Latency = 1;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CBW")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CLC")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMC")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CWDE")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "DEC(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "DEC8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "INC(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "INC8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NEG(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NEG8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NOOP")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NOT(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "NOT8r")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SAHF")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SIDT64m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SLDT64m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SMSW16m")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "STC")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "STRm")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SYSCALL")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "TEST8rr")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR8i8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC)>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m",
+ "SIDT64m",
+ "SMSW16m",
+ "STRm",
+ "SYSCALL")>;
def SKXWriteResGroup11 : SchedWriteRes<[SKXPort4,SKXPort237]> {
let Latency = 1;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVBmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVDmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVQmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "KMOVWmk")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVNTQmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVQ64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOV(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mi")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQAmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQUmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTDQmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTI_64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVPDI2DImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQI2QImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQIto64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVSDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVSSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP32m")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP64m")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP80m")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF128mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x8Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI128mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x8Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x4Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DIZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI(2QI|to64)Zmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI2QImr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQIto64mr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSYmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSmr")>;
-def: InstRW<[SKXWriteResGroup11], (instregex "VMPTRSTm")>;
-
-def SKXWriteResGroup12 : SchedWriteRes<[SKXPort0]> {
- let Latency = 2;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup12], (instregex "COMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "COMISSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64grr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVPDI2DIrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "MOVPQIto64rr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "PMOVMSKBrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64rr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSYrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDrr")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSrr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm",
+ "KMOV(B|D|Q|W)mk",
+ "ST_FP(32|64|80)m",
+ "VMPTRSTm")>;
def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> {
let Latency = 2;
@@ -1580,26 +682,13 @@ def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> {
let ResourceCycles = [2];
}
def: InstRW<[SKXWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "MMX_PINSRWirri")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRBrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRDrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRQrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "PINSRWrri")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQrr")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWrri")>;
def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup14], (instregex "FDECSTP")>;
+def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP)>;
def: InstRW<[SKXWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>;
def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
@@ -1607,88 +696,20 @@ def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL8r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROL8ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR8r1")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "ROR8ri")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "SETAr")>;
-def: InstRW<[SKXWriteResGroup15], (instregex "SETBEr")>;
-
-def SKXWriteResGroup16 : SchedWriteRes<[SKXPort015]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPDrr0")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPSrr0")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "PBLENDVBrr0")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDYrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSYrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBYrr")>;
-def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBrr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL(8|16|32|64)r1",
+ "ROL(8|16|32|64)ri",
+ "ROR(8|16|32|64)r1",
+ "ROR(8|16|32|64)ri",
+ "SET(A|BE)r")>;
def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup17], (instregex "LFENCE")>;
-def: InstRW<[SKXWriteResGroup17], (instregex "WAIT")>;
-def: InstRW<[SKXWriteResGroup17], (instregex "XGETBV")>;
-
-def SKXWriteResGroup18 : SchedWriteRes<[SKXPort0,SKXPort237]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup18], (instregex "MMX_MASKMOVQ64")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVDQU")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQYmr")>;
-def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQmr")>;
-
-def SKXWriteResGroup19 : SchedWriteRes<[SKXPort5,SKXPort01]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup19], (instregex "PSLLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSLLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSLLWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRADrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRAWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "PSRLWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQrr")>;
-def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLWrr")>;
+def: InstRW<[SKXWriteResGroup17], (instrs LFENCE,
+ WAIT,
+ XGETBV)>;
def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 2;
@@ -1702,77 +723,26 @@ def SKXWriteResGroup21 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup21], (instregex "SFENCE")>;
-
-def SKXWriteResGroup22 : SchedWriteRes<[SKXPort06,SKXPort15]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup22], (instregex "BEXTR(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;
+def: InstRW<[SKXWriteResGroup21], (instrs SFENCE)>;
def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup23], (instregex "ADC8i8")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "CWD")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "JRCXZ")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "SBB8i8")>;
-def: InstRW<[SKXWriteResGroup23], (instregex "SBB8ri")>;
-
-def SKXWriteResGroup24 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup24], (instregex "EXTRACTPSmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRBmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRDmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRQmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRWmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "STMXCSR")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWmr")>;
-def: InstRW<[SKXWriteResGroup24], (instregex "VSTMXCSR")>;
+def: InstRW<[SKXWriteResGroup23], (instrs CWD)>;
+def: InstRW<[SKXWriteResGroup23], (instrs JCXZ, JECXZ, JRCXZ)>;
+def: InstRW<[SKXWriteResGroup23], (instregex "ADC8i8",
+ "ADC8ri",
+ "SBB8i8",
+ "SBB8ri")>;
def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup25], (instregex "FNSTCW16m")>;
-
-def SKXWriteResGroup26 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup26], (instregex "SETAEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETBm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETGEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETGm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETLEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETLm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNEm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNOm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNPm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETNSm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETOm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETPm")>;
-def: InstRW<[SKXWriteResGroup26], (instregex "SETSm")>;
+def: InstRW<[SKXWriteResGroup25], (instrs FNSTCW16m)>;
def SKXWriteResGroup27 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
let Latency = 2;
@@ -1786,497 +756,131 @@ def SKXWriteResGroup28 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "PUSH64i8")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSB")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSL")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSQ")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "STOSW")>;
+def: InstRW<[SKXWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)rmr",
+ "PUSH64i8")>;
def SKXWriteResGroup29 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
let Latency = 2;
let NumMicroOps = 5;
let ResourceCycles = [2,2,1];
}
-def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)")>;
def SKXWriteResGroup30 : SchedWriteRes<[SKXPort0]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDBrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDDrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDQrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KADDWrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVBrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVDrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVQrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KMOVWrk")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTBrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTDrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTQrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTWrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTBrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTDrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTQrr")>;
-def: InstRW<[SKXWriteResGroup30], (instregex "KTESTWrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KMOV(B|D|Q|W)rk",
+ "KORTEST(B|D|Q|W)rr",
+ "KTEST(B|D|Q|W)rr")>;
def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup31], (instregex "BSF(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "IMUL64rr(i8)?")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "IMUL8r")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "LZCNT(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "MUL8r")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "PEXT(32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "POPCNT(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "SHRD(16|32|64)rri8")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "TZCNT(16|32|64)rr")>;
-
-def SKXWriteResGroup31_16 : SchedWriteRes<[SKXPort1, SKXPort0156]> {
- let Latency = 3;
+def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr",
+ "PEXT(32|64)rr",
+ "SHLD(16|32|64)rri8",
+ "SHRD(16|32|64)rri8")>;
+
+def SKXWriteResGroup31_16i : SchedWriteRes<[SKXPort1, SKXPort0156]> {
+ let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup31_16], (instregex "IMUL16rr(i8)?")>;
+def: InstRW<[SKXWriteResGroup31_16i], (instrs IMUL16rri, IMUL16rri8)>;
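// The 16-bit immediate multiplies are modeled as two uops with latency 4: the
// multiply on port 1 plus one uop on any integer port, presumably covering the
// partial 16-bit register merge. With their per-instruction overrides removed,
// the 32/64-bit forms fall back to the default multiply scheduling classes.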
-def SKXWriteResGroup31_32 : SchedWriteRes<[SKXPort1]> {
- let Latency = 3;
- let NumMicroOps = 1;
-}
-def: InstRW<[SKXWriteResGroup31_32], (instregex "IMUL32rr(i8)?")>;
def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
let Latency = 3;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FPrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FST0r")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLBri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLDri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLQri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLWri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRBri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRDri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRQri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRWri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKBWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKDQrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKWDrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "MMX_PSADBWirr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "PCMPGTQrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "PSADBWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FPrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FST0r")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FPrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FST0r")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FrST0")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Z256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Zr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Z256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Zr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x8Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x4Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2F128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2I128rr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Drr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Qrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDYri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQYri")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256r(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Drr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Qrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWYrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWrr")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Zrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF64X2Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF64X2Zrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Zrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Z256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Zrri(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup33 : SchedWriteRes<[SKXPort0,SKXPort5]> {
- let Latency = 3;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup33], (instregex "EXTRACTPSrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "MMX_PEXTRWirri")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRBrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRDrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRQrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWri")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWrr_REV")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "PTESTrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWZrr(_REV)?")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWri")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWrr_REV")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTYrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
+ "KADD(B|D|Q|W)rr",
+ "KSHIFTL(B|D|Q|W)ri",
+ "KSHIFTR(B|D|Q|W)ri",
+ "KUNPCKBWrr",
+ "KUNPCKDQrr",
+ "KUNPCKWDrr",
+ "VALIGND(Z|Z128|Z256)rri",
+ "VALIGNQ(Z|Z128|Z256)rri",
+ "VCMPPD(Z|Z128|Z256)rri",
+ "VCMPPS(Z|Z128|Z256)rri",
+ "VCMPSDZrr",
+ "VCMPSSZrr",
+ "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VFPCLASSPD(Z|Z128|Z256)rr",
+ "VFPCLASSPS(Z|Z128|Z256)rr",
+ "VFPCLASSSDZrr",
+ "VFPCLASSSSZrr",
+ "VPBROADCASTBrr",
+ "VPBROADCASTWrr",
+ "VPCMPB(Z|Z128|Z256)rri",
+ "VPCMPD(Z|Z128|Z256)rri",
+ "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
+ "VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr",
+ "(V?)PCMPGTQ(Y?)rr",
+ "VPCMPQ(Z|Z128|Z256)rri",
+ "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
+ "VPCMPW(Z|Z128|Z256)rri",
+ "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr",
+ "VPSADBWZrr", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>;
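// The consolidated patterns above are not anchored at the end of the
// instruction name, which is why the "(b?)(k?)(z?)" suffixes spelled out by
// the removed per-instruction entries can be dropped: the shorter prefix
// still matches the broadcast and masked variants.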
def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup34], (instregex "FNSTSW16r")>;
+def: InstRW<[SKXWriteResGroup34], (instrs FNSTSW16r)>;
def SKXWriteResGroup35 : SchedWriteRes<[SKXPort06]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup35], (instregex "ROL(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "ROL8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "ROR(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "ROR8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SAR(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SAR8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHL(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHL8rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHR(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup35], (instregex "SHR8rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "ROL(8|16|32|64)rCL",
+ "ROR(8|16|32|64)rCL",
+ "SAR(8|16|32|64)rCL",
+ "SHL(8|16|32|64)rCL",
+ "SHR(8|16|32|64)rCL")>;
def SKXWriteResGroup36 : SchedWriteRes<[SKXPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup36], (instregex "XADD(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup36], (instregex "XADD8rr")>;
-def: InstRW<[SKXWriteResGroup36], (instregex "XCHG8rr")>;
+def: InstRW<[SKXWriteResGroup36], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr,
+ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr,
+ XCHG16ar, XCHG32ar, XCHG64ar)>;
def SKXWriteResGroup37 : SchedWriteRes<[SKXPort0,SKXPort5]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHADDSWrr64")>;
-def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>;
def SKXWriteResGroup38 : SchedWriteRes<[SKXPort5,SKXPort01]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup38], (instregex "PHADDSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "PHSUBSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr256")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr128")>;
-def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr256")>;
-
-def SKXWriteResGroup39 : SchedWriteRes<[SKXPort5,SKXPort05]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDWrr64")>;
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDrr64")>;
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBDrr64")>;
-def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBWrr64")>;
-
-def SKXWriteResGroup40 : SchedWriteRes<[SKXPort5,SKXPort015]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup40], (instregex "PHADDDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "PHADDWrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBWrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWYrr")>;
-def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWrr")>;
+def: InstRW<[SKXWriteResGroup38], (instregex "(V?)PH(ADD|SUB)SW(Y?)rr")>;
def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSDWirr")>;
-def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSWBirr")>;
-def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKUSWBirr")>;
+def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr",
+ "MMX_PACKUSWBirr")>;
def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 3;
@@ -2290,36 +894,31 @@ def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup43], (instregex "MFENCE")>;
+def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>;
def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL8r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL8ri")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)ri")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR8r1")>;
-def: InstRW<[SKXWriteResGroup44], (instregex "RCR8ri")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r1",
+ "RCL(8|16|32|64)ri",
+ "RCR(8|16|32|64)r1",
+ "RCR(8|16|32|64)ri")>;
def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
let Latency = 3;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup45], (instregex "FNSTSWm")>;
+def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>;
def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
let Latency = 3;
let NumMicroOps = 4;
let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKXWriteResGroup46], (instregex "SETAm")>;
-def: InstRW<[SKXWriteResGroup46], (instregex "SETBEm")>;
+def: InstRW<[SKXWriteResGroup46], (instregex "SET(A|BE)m")>;
def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> {
let Latency = 3;
@@ -2333,474 +932,116 @@ def SKXWriteResGroup48 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06,SKXPort015
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup48], (instregex "CALL64pcrel32")>;
+def: InstRW<[SKXWriteResGroup48], (instrs CALL64pcrel32)>;
def SKXWriteResGroup49 : SchedWriteRes<[SKXPort0]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup49], (instregex "AESDECLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "AESDECrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "AESENCLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "AESENCrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDUBSWrr64")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDWDirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHRSWrr64")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHUWirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHWirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULLWirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULUDQirr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FPrST0")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FST0r")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FrST0")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RCPPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RCPSSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTSSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCLASTrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCrr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSYr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRCPSSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SDrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SSrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSYr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSr")>;
-def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTSSr")>;
-
-def SKXWriteResGroup50 : SchedWriteRes<[SKXPort015]> {
+def: InstRW<[SKXWriteResGroup49], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
+
+def SKXWriteResGroup50 : SchedWriteRes<[SKXPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PS(Y|Z128|Z256)rr",
+ "(V?)CVTDQ2PSrr",
+ "VCVTPD2QQ(Z128|Z256)rr",
+ "VCVTPD2UQQ(Z128|Z256)rr",
+ "VCVTPS2DQ(Y|Z128|Z256)rr",
+ "(V?)CVTPS2DQrr",
+ "VCVTPS2UDQ(Z128|Z256)rr",
+ "VCVTQQ2PD(Z128|Z256)rr",
+ "VCVTTPD2QQ(Z128|Z256)rr",
+ "VCVTTPD2UQQ(Z128|Z256)rr",
+ "VCVTTPS2DQ(Z128|Z256)rr",
+ "(V?)CVTTPS2DQrr",
+ "VCVTTPS2UDQ(Z128|Z256)rr",
+ "VCVTUDQ2PS(Z128|Z256)rr",
+ "VCVTUQQ2PD(Z128|Z256)rr")>;
+
+def SKXWriteResGroup50z : SchedWriteRes<[SKXPort05]> {
let Latency = 4;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPPDrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPPSrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CMPSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CVTDQ2PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CVTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "MULSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PHMINPOSUWrr128")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMADDUBSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMADDWDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULHRSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULHUWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULHWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULLWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "PMULUDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "SUBSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDYrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPSYrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPSrri")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSDrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSSrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Yr",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128r(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256r(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zr(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)r",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zr(b?)(_Int)?(k?)(z?)",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSDr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSSr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPHMINPOSUWrr128")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSYrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSrr")>;
+def: InstRW<[SKXWriteResGroup50z], (instrs VCVTDQ2PSZrr,
+ VCVTPD2QQZrr,
+ VCVTPD2UQQZrr,
+ VCVTPS2DQZrr,
+ VCVTPS2UDQZrr,
+ VCVTQQ2PDZrr,
+ VCVTTPD2QQZrr,
+ VCVTTPD2UQQZrr,
+ VCVTTPS2DQZrr,
+ VCVTTPS2UDQZrr,
+ VCVTUDQ2PSZrr,
+ VCVTUQQ2PDZrr)>;
def SKXWriteResGroup51 : SchedWriteRes<[SKXPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup51], (instregex "MPSADBWrri")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWYrri")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWrri")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPD(Z|Z128|Z256)rr",
+ "VEXPANDPS(Z|Z128|Z256)rr",
+ "VPEXPANDD(Z|Z128|Z256)rr",
+ "VPEXPANDQ(Z|Z128|Z256)rr",
+ "VPMOVDB(Z|Z128|Z256)rr",
+ "VPMOVDW(Z|Z128|Z256)rr",
+ "VPMOVQB(Z|Z128|Z256)rr",
+ "VPMOVQW(Z|Z128|Z256)rr",
+ "VPMOVSDB(Z|Z128|Z256)rr",
+ "VPMOVSDW(Z|Z128|Z256)rr",
+ "VPMOVSQB(Z|Z128|Z256)rr",
+ "VPMOVSQD(Z|Z128|Z256)rr",
+ "VPMOVSQW(Z|Z128|Z256)rr",
+ "VPMOVSWB(Z|Z128|Z256)rr",
+ "VPMOVUSDB(Z|Z128|Z256)rr",
+ "VPMOVUSDW(Z|Z128|Z256)rr",
+ "VPMOVUSQB(Z|Z128|Z256)rr",
+ "VPMOVUSQD(Z|Z128|Z256)rr",
+ "VPMOVUSWB(Z|Z128|Z256)rr",
+ "VPMOVWB(Z|Z128|Z256)rr")>;
def SKXWriteResGroup52 : SchedWriteRes<[SKXPort1,SKXPort5]> {
let Latency = 4;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup52], (instregex "IMUL(32|64)r")>;
-def: InstRW<[SKXWriteResGroup52], (instregex "MUL(32|64)r")>;
-def: InstRW<[SKXWriteResGroup52], (instregex "MULX64rr")>;
+def: InstRW<[SKXWriteResGroup52], (instrs IMUL64r, MUL64r, MULX64rr)>;
def SKXWriteResGroup52_16 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def: InstRW<[SKXWriteResGroup52_16], (instregex "IMUL16r")>;
-def: InstRW<[SKXWriteResGroup52_16], (instregex "MUL16r")>;
-
-def SKXWriteResGroup53 : SchedWriteRes<[SKXPort5,SKXPort01]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWYrr")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup52_16], (instrs IMUL16r, MUL16r)>;
def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP16m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP32m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP64m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_F16m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_F32m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP16m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP32m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP64m")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m",
+ "VPMOVQD(Z|Z128|Z256)mr(b?)")>;
def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [4];
}
-def: InstRW<[SKXWriteResGroup55], (instregex "FNCLEX")>;
+def: InstRW<[SKXWriteResGroup55], (instrs FNCLEX)>;
def SKXWriteResGroup56 : SchedWriteRes<[SKXPort015,SKXPort0156]> {
let Latency = 4;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup56], (instregex "VZEROUPPER")>;
+def: InstRW<[SKXWriteResGroup56], (instrs VZEROUPPER)>;
def SKXWriteResGroup57 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort0156]> {
let Latency = 4;
@@ -2814,109 +1055,53 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64from64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64to64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVQ64rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOV(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOV64toPQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOV8rm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVDDUPrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVDI2PDIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVQI2PQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSDrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSSrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHNTA")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT0")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT1")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT2")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOV64toPQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDDUPrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDI2PDIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVQI2PQIrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSDrm")>;
-def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSSrm")>;
-
-def SKXWriteResGroup59 : SchedWriteRes<[SKXPort015]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup59], (instregex "VCVTSD2SSZrr(b?)(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup60 : SchedWriteRes<[SKXPort0,SKXPort5]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup60], (instregex "CVTDQ2PDrr")>;
-def: InstRW<[SKXWriteResGroup60], (instregex "MMX_CVTPI2PDirr")>;
-def: InstRW<[SKXWriteResGroup60], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm16",
+ "MOVSX(16|32|64)rm32",
+ "MOVSX(16|32|64)rm8",
+ "MOVZX(16|32|64)rm16",
+ "MOVZX(16|32|64)rm8",
+ "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71?
def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2PSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTPS2PDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSD2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI642SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTSS2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "CVTTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPD2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPS2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPD2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPS2PIirr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTDQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTQQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSD2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQrr")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2UDQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2QQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2UQQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUDQ2PDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUQQ2PSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI642SDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "VCVTDQ2PDZ128rr",
+ "VCVTPD2DQZ128rr",
+ "(V?)CVT(T?)PD2DQrr",
+ "VCVTPD2PSZ128rr",
+ "(V?)CVTPD2PSrr",
+ "VCVTPD2UDQZ128rr",
+ "VCVTPS2PDZ128rr",
+ "(V?)CVTPS2PDrr",
+ "VCVTPS2QQZ128rr",
+ "VCVTPS2UQQZ128rr",
+ "VCVTQQ2PSZ128rr",
+ "(V?)CVTSD2SS(Z?)rr",
+ "(V?)CVTSI(64)?2SDrr",
+ "VCVTSI2SSZrr",
+ "(V?)CVTSI2SSrr",
+ "VCVTSI(64)?2SDZrr",
+ "VCVTSS2SDZrr",
+ "(V?)CVTSS2SDrr",
+ "VCVTTPD2DQZ128rr",
+ "VCVTTPD2UDQZ128rr",
+ "VCVTTPS2QQZ128rr",
+ "VCVTTPS2UQQZ128rr",
+ "VCVTUDQ2PDZ128rr",
+ "VCVTUQQ2PSZ128rr",
+ "VCVTUSI2SSZrr",
+ "VCVTUSI(64)?2SDZrr")>;
def SKXWriteResGroup62 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr")>;
def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> {
let Latency = 5;
@@ -2926,426 +1111,172 @@ def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> {
def: InstRW<[SKXWriteResGroup63], (instregex "STR(16|32|64)r")>;
def SKXWriteResGroup64 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
- let Latency = 5;
+ let Latency = 4;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup64], (instregex "MULX32rr")>;
+def: InstRW<[SKXWriteResGroup64], (instrs IMUL32r, MUL32r, MULX32rr)>;
def SKXWriteResGroup65 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort015]> {
let Latency = 5;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)",
+ "VCVTPS2PHZ256mr(b?)",
+ "VCVTPS2PHZmr(b?)")>;
def SKXWriteResGroup66 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVWB(Z|Z128|Z256)mr(b?)")>;
def SKXWriteResGroup67 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [1,4];
}
-def: InstRW<[SKXWriteResGroup67], (instregex "XSETBV")>;
+def: InstRW<[SKXWriteResGroup67], (instrs XSETBV)>;
def SKXWriteResGroup68 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 5;
let NumMicroOps = 5;
let ResourceCycles = [2,3];
}
-def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG8rr")>;
+def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG(8|16|32|64)rr")>;
def SKXWriteResGroup69 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 5;
let NumMicroOps = 6;
let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF16")>;
-def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF64")>;
-
-def SKXWriteResGroup70 : SchedWriteRes<[SKXPort5]> {
- let Latency = 6;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup70], (instregex "PCLMULQDQrr")>;
-def: InstRW<[SKXWriteResGroup70], (instregex "VPCLMULQDQrr")>;
+def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF(16|64)")>;
def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> {
let Latency = 6;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup71], (instregex "LDDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVNTDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVSHDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVSLDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VLDDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQUrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVNTDQArm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSHDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSLDUPrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPSrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTDrm")>;
-def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTQrm")>;
-
-def SKXWriteResGroup72 : SchedWriteRes<[SKXPort0]> {
+def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCASTDrm",
+ "VPBROADCASTQrm")>;
+
+def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def: InstRW<[SKXWriteResGroup72], (instregex "MMX_CVTPI2PSirr")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "MMX_CVTPI2PSirr",
+ "VCOMPRESSPD(Z|Z128|Z256)rr",
+ "VCOMPRESSPS(Z|Z128|Z256)rr",
+ "VPCOMPRESSD(Z|Z128|Z256)rr",
+ "VPCOMPRESSQ(Z|Z128|Z256)rr",
+ "VPERMW(Z|Z128|Z256)rr")>;
def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQDirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTDirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXUBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINUBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLDrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLQrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLWrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRADrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRAWrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLDrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLQrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLWrm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSWirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSBirm")>;
-def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSWirm")>;
-
-def SKXWriteResGroup74 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2USIZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64Zrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64rr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIrr")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USI64Zrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USIZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSS2USIZrr(b?)")>;
-
-def SKXWriteResGroup75 : SchedWriteRes<[SKXPort5,SKXPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PALIGNR64irm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PINSRWirmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFBrm64")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFWmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHBWirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHDQirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHWDirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLBWirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLDQirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLWDirm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRBrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PINSRWrmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRWrmi")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBWrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXDQrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWDrm")>;
-def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSBirm",
+ "MMX_PADDSWirm",
+ "MMX_PADDUSBirm",
+ "MMX_PADDUSWirm",
+ "MMX_PAVGBirm",
+ "MMX_PAVGWirm",
+ "MMX_PCMPEQBirm",
+ "MMX_PCMPEQDirm",
+ "MMX_PCMPEQWirm",
+ "MMX_PCMPGTBirm",
+ "MMX_PCMPGTDirm",
+ "MMX_PCMPGTWirm",
+ "MMX_PMAXSWirm",
+ "MMX_PMAXUBirm",
+ "MMX_PMINSWirm",
+ "MMX_PMINUBirm",
+ "MMX_PSUBSBirm",
+ "MMX_PSUBSWirm",
+ "MMX_PSUBUSBirm",
+ "MMX_PSUBUSWirm")>;
def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64")>;
-def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>;
-
-def SKXWriteResGroup77 : SchedWriteRes<[SKXPort23,SKXPort05]> {
- let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSBrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSDrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSWrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDBirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDDirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDQirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDWirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDNirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PORirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNBrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNDrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNWrm64")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBBirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBDirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBQirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBWirm")>;
-def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PXORirm")>;
+def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64",
+ "JMP(16|32|64)m")>;
def SKXWriteResGroup78 : SchedWriteRes<[SKXPort23,SKXPort06]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup78], (instregex "ADC(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "ADC8rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "ADCX(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "ADOX(32|64)rm")>;
def: InstRW<[SKXWriteResGroup78], (instregex "BT(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVAE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVB(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVG(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVGE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVL(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVLE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNE(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNO(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNP(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNS(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVO(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVP(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "CMOVS(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "RORX(32|64)mi")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SARX(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SBB(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SBB8rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SHLX(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup78], (instregex "SHRX(32|64)rm")>;
def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BLSI(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BLSMSK(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BLSR(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "BZHI(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup79], (instregex "MOVBE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm",
+ "BLSI(32|64)rm",
+ "BLSMSK(32|64)rm",
+ "BLSR(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup80], (instregex "VMOVDI2PDIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)",
+ "VMOVDI2PDIZrm(b?)")>;
def SKXWriteResGroup81 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup81], (instregex "ADD(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "ADD8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "AND(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "AND8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mi")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "CMP8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "OR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "OR8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)r(mr)?")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "SUB(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "SUB8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "TEST(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mi")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mr")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "XOR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "XOR8rm")>;
+def: InstRW<[SKXWriteResGroup81], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)rmr")>;
def SKXWriteResGroup82 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup82], (instregex "CVTSI642SSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HADDPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HADDPSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VCVTUSI642SSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSYrr")>;
-def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "(V?)CVTSI642SSrr",
+ "VCVTSI642SSZrr",
+ "VCVTUSI642SSZrr")>;
def SKXWriteResGroup83 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[SKXWriteResGroup83], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL",
+ "SHRD(16|32|64)rrCL")>;
def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> {
let Latency = 6;
@@ -3354,675 +1285,249 @@ def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]
}
def: InstRW<[SKXWriteResGroup84], (instregex "SLDT(16|32|64)r")>;
-def SKXWriteResGroup85 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> {
- let Latency = 6;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKXWriteResGroup85], (instregex "VCVTPS2PHmr")>;
-
def SKXWriteResGroup86 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup86], (instregex "BTC(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "BTR(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "BTS(16|32|64)mi8")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR8m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SAR8mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL8m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHL8mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR8m1")>;
-def: InstRW<[SKXWriteResGroup86], (instregex "SHR8mi")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "BTC(16|32|64)mi8",
+ "BTR(16|32|64)mi8",
+ "BTS(16|32|64)mi8",
+ "SAR(8|16|32|64)m1",
+ "SAR(8|16|32|64)mi",
+ "SHL(8|16|32|64)m1",
+ "SHL(8|16|32|64)mi",
+ "SHR(8|16|32|64)m1",
+ "SHR(8|16|32|64)mi")>;
def SKXWriteResGroup87 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "AND8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "DEC(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "DEC8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "INC(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "INC8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NEG(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NEG8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NOT(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "NOT8m")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "OR8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "PUSH(16|32|64)rmm")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mi")>;
-def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
def SKXWriteResGroup88 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 6;
let ResourceCycles = [1,5];
}
-def: InstRW<[SKXWriteResGroup88], (instregex "STD")>;
+def: InstRW<[SKXWriteResGroup88], (instrs STD)>;
def SKXWriteResGroup89 : SchedWriteRes<[SKXPort23]> {
let Latency = 7;
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup89], (instregex "LD_F32m")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "LD_F64m")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "LD_F80m")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTF128")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTI128")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSSYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VLDDQUYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPSYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDDUPYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQAYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQUYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSHDUPYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSLDUPYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPSYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTDYrm")>;
-def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTQYrm")>;
-
-def SKXWriteResGroup90 : SchedWriteRes<[SKXPort0,SKXPort5]> {
+def: InstRW<[SKXWriteResGroup89], (instregex "LD_F(32|64|80)m",
+ "VBROADCASTF128",
+ "VBROADCASTI128",
+ "VBROADCASTSDYrm",
+ "VBROADCASTSSYrm",
+ "VMOVDDUPYrm",
+ "VMOVSHDUPYrm",
+ "VMOVSLDUPYrm",
+ "VPBROADCASTDYrm",
+ "VPBROADCASTQYrm")>;
+
+def SKXWriteResGroup90 : SchedWriteRes<[SKXPort01,SKXPort5]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
def: InstRW<[SKXWriteResGroup90], (instregex "VCVTDQ2PDYrr")>;
-def SKXWriteResGroup91 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup91], (instregex "COMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "COMISSrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISSrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISDrm")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)",
+ "VMOVSSZrm(b?)")>;
-def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> {
- let Latency = 7;
+def SKXWriteResGroup92a : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 6;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup92], (instregex "INSERTPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PALIGNRrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PBLENDWrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFDmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFHWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFLWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPDrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPSrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBLENDWrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSLLDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPSRLDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSrmi")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDrm")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[SKXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQYrr")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZrr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup94 : SchedWriteRes<[SKXPort01,SKXPort23]> {
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr",
+ "VCVTPD2DQ(Y|Z256)rr",
+ "VCVTPD2PS(Y|Z256)rr",
+ "VCVTPD2UDQZ256rr",
+ "VCVTPS2PD(Y|Z256)rr",
+ "VCVTPS2QQZ256rr",
+ "VCVTPS2UQQZ256rr",
+ "VCVTQQ2PSZ256rr",
+ "VCVTTPD2DQ(Y|Z256)rr",
+ "VCVTTPD2UDQZ256rr",
+ "VCVTTPS2QQZ256rr",
+ "VCVTTPS2UQQZ256rr",
+ "VCVTUDQ2PDZ256rr",
+ "VCVTUQQ2PSZ256rr")>;
+
+def SKXWriteResGroup93z : SchedWriteRes<[SKXPort5,SKXPort05]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup94], (instregex "PABSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PABSDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PABSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PAVGBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PAVGWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PMINUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNBrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNDrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNWrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSLLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSLLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSLLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRADrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRAWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSRLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAX(C?)SDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNBrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNDrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNWrm128")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBrm")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWrm")>;
+def: InstRW<[SKXWriteResGroup93z], (instrs VCVTDQ2PDZrr,
+ VCVTPD2DQZrr,
+ VCVTPD2PSZrr,
+ VCVTPD2UDQZrr,
+ VCVTPS2PDZrr,
+ VCVTPS2QQZrr,
+ VCVTPS2UQQZrr,
+ VCVTQQ2PSZrr,
+ VCVTTPD2DQZrr,
+ VCVTTPD2UDQZrr,
+ VCVTTPS2QQZrr,
+ VCVTTPS2UQQZrr,
+ VCVTUDQ2PDZrr,
+ VCVTUQQ2PSZrr)>;
def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ANDPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPDrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPSrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "ORPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PADDWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PANDNrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PANDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PSUBWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "PXORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPDrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPSrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTI32X2Z128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTSSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTF128rm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTI128rm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDDUPZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA32Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA64Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU16Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU32Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU64Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU8Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVNTDQAZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSHDUPZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSLDUPZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VORPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPANDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDDrmi")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTQZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPORDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPORQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGQZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPXORDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPXORQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VPXORrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "XORPDrm")>;
-def: InstRW<[SKXWriteResGroup95], (instregex "XORPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)",
+ "VBLENDMPSZ128rm(b?)",
+ "VBROADCASTI32X2Z128m(b?)",
+ "VBROADCASTSSZ128m(b?)",
+ "VINSERTF128rm",
+ "VINSERTI128rm",
+ "VMOVAPDZ128rm(b?)",
+ "VMOVAPSZ128rm(b?)",
+ "VMOVDDUPZ128rm(b?)",
+ "VMOVDQA32Z128rm(b?)",
+ "VMOVDQA64Z128rm(b?)",
+ "VMOVDQU16Z128rm(b?)",
+ "VMOVDQU32Z128rm(b?)",
+ "VMOVDQU64Z128rm(b?)",
+ "VMOVDQU8Z128rm(b?)",
+ "VMOVNTDQAZ128rm(b?)",
+ "VMOVSHDUPZ128rm(b?)",
+ "VMOVSLDUPZ128rm(b?)",
+ "VMOVUPDZ128rm(b?)",
+ "VMOVUPSZ128rm(b?)",
+ "VPADD(B|D|Q|W)Z128rm(b?)",
+ "(V?)PADD(B|D|Q|W)rm",
+ "VPBLENDDrmi",
+ "VPBLENDM(B|D|Q|W)Z128rm(b?)",
+ "VPBROADCASTDZ128m(b?)",
+ "VPBROADCASTQZ128m(b?)",
+ "VPSUB(B|D|Q|W)Z128rm(b?)",
+ "(V?)PSUB(B|D|Q|W)rm",
+ "VPTERNLOGDZ128rm(b?)i",
+ "VPTERNLOGQZ128rm(b?)i")>;
def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSDWirm")>;
-def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSWBirm")>;
-def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKUSWBirm")>;
+def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm",
+ "MMX_PACKUSWBirm")>;
def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2Wrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2Wrr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup98 : SchedWriteRes<[SKXPort23,SKXPort06]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup98], (instregex "CMOVA(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup98], (instregex "CMOVBE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr",
+ "VPERMI2W256rr",
+ "VPERMI2Wrr",
+ "VPERMT2W128rr",
+ "VPERMT2W256rr",
+ "VPERMT2Wrr")>;
def SKXWriteResGroup99 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup99], (instregex "LEAVE64")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASB")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASL")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASQ")>;
-def: InstRW<[SKXWriteResGroup99], (instregex "SCASW")>;
+def: InstRW<[SKXWriteResGroup99], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
def SKXWriteResGroup100 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64Zrr(b?)")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64rr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIZrr(b?)")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIrr")>;
-def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2USI64Zrr(b?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr",
+ "(V?)CVTSS2SI64(Z?)rr",
+ "(V?)CVTTSS2SI64(Z?)rr",
+ "VCVTTSS2USI64Zrr")>;
def SKXWriteResGroup101 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup101], (instregex "FLDCW16m")>;
-
-def SKXWriteResGroup102 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort0156]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup102], (instregex "LDMXCSR")>;
-def: InstRW<[SKXWriteResGroup102], (instregex "VLDMXCSR")>;
+def: InstRW<[SKXWriteResGroup101], (instrs FLDCW16m)>;
def SKXWriteResGroup103 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVBkm")>;
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVDkm")>;
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVQkm")>;
-def: InstRW<[SKXWriteResGroup103], (instregex "KMOVWkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOV(B|D|Q|W)km")>;
def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup104], (instregex "LRETQ")>;
-def: InstRW<[SKXWriteResGroup104], (instregex "RETQ")>;
-
-def SKXWriteResGroup105 : SchedWriteRes<[SKXPort23,SKXPort06,SKXPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup105], (instregex "BEXTR(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup104], (instrs LRETQ, RETQ)>;
def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZmr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ128mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ256mr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPD(Z|Z128|Z256)mr(b?)",
+ "VCOMPRESSPS(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSD(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSQ(Z|Z128|Z256)mr(b?)")>;
def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL8m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROL8mi")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR8m1")>;
-def: InstRW<[SKXWriteResGroup107], (instregex "ROR8mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m1",
+ "ROL(8|16|32|64)mi",
+ "ROR(8|16|32|64)m1",
+ "ROR(8|16|32|64)mi")>;
def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup108], (instregex "XADD(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup108], (instregex "XADD8rm")>;
+def: InstRW<[SKXWriteResGroup108], (instregex "XADD(8|16|32|64)rm")>;
def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup109], (instregex "FARCALL64")>;
+def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m",
+ "FARCALL64")>;
def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 7;
@@ -4039,7 +1544,7 @@ def SKXWriteResGroup111 : SchedWriteRes<[SKXPort6,SKXPort06,SKXPort15,SKXPort015
let NumMicroOps = 7;
let ResourceCycles = [1,3,1,2];
}
-def: InstRW<[SKXWriteResGroup111], (instregex "LOOP")>;
+def: InstRW<[SKXWriteResGroup111], (instrs LOOP)>;
def SKXWriteResGroup112 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 7;
@@ -4068,629 +1573,142 @@ def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,S
}
def: InstRW<[SKXWriteResGroup114], (instrs VSCATTERDPSZmr)>;
-def SKXWriteResGroup115 : SchedWriteRes<[SKXPort0]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup115], (instregex "AESIMCrr")>;
-def: InstRW<[SKXWriteResGroup115], (instregex "VAESIMCrr")>;
-
-def SKXWriteResGroup116 : SchedWriteRes<[SKXPort015]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup116], (instregex "PMULLDrr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDYrr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDrr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ128rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ256rri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESDr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESSr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSSr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPDr")>;
-def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPSr")>;
-
-def SKXWriteResGroup117 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPDrm")>;
-def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPSrm")>;
-
def SKXWriteResGroup118 : SchedWriteRes<[SKXPort1,SKXPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup118], (instregex "BSF(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "BSR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "IMUL64m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "IMUL(32|64)rm(i8)?")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "IMUL8m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "LZCNT(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "MUL(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "MUL8m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "PEXT(32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "POPCNT(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
def SKXWriteResGroup118_16_1 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
+ let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup118_16_1], (instregex "IMUL16rm(i8)?")>;
+def: InstRW<[SKXWriteResGroup118_16_1], (instrs IMUL16rm, IMUL16rmi, IMUL16rmi8)>;
-def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
- let Latency = 8;
+def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort06, SKXPort0156, SKXPort23]> {
+ let Latency = 9;
let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
}
-def: InstRW<[SKXWriteResGroup118_16_2], (instregex "IMUL16m")>;
-def: InstRW<[SKXWriteResGroup118_16_2], (instregex "MUL16m")>;
-
-def SKXWriteResGroup118_32 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup118_32], (instregex "IMUL32m")>;
-def: InstRW<[SKXWriteResGroup118_32], (instregex "MUL32m")>;
+def: InstRW<[SKXWriteResGroup118_16_2], (instrs IMUL16m, MUL16m)>;
def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOM32m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOM64m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP32m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP64m")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "MMX_PSADBWirm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VFPCLASSSDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBLENDWYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXWQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWYmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSYrmi")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSYrm")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup120 : SchedWriteRes<[SKXPort01,SKXPort23]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNBYrm256")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNDYrm256")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNWYrm256")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256mi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWYrm")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)(32|64)m",
+ "VFPCLASSSDZrm(b?)",
+ "VPBROADCASTBYrm",
+ "VPBROADCASTB(Z|Z256)m(b?)",
+ "VPBROADCASTWYrm",
+ "VPBROADCASTW(Z|Z256)m(b?)",
+ "VPMOVSXBDYrm",
+ "VPMOVSXBQYrm",
+ "VPMOVSXWQYrm")>;
def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 8;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPDYrmi")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPSYrmi")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Z256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Zm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X8rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Z256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Zm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X8rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2Z128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X4rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x8Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x8Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x4Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU32Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU32Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Z256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVNTDQAZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPANDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDDYrmi")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVQYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPORYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VPXORYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSYrm")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup122 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPDrm0")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPSrm0")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "PBLENDVBrm0")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPDrm")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPSrm")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBYrm")>;
-def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPD(Z|Z256)rm(b?)",
+ "VBLENDMPS(Z|Z256)rm(b?)",
+ "VBROADCASTF32X2Z256m(b?)",
+ "VBROADCASTF32X2Zm(b?)",
+ "VBROADCASTF32X4Z256rm(b?)",
+ "VBROADCASTF32X4rm(b?)",
+ "VBROADCASTF32X8rm(b?)",
+ "VBROADCASTF64X2Z128rm(b?)",
+ "VBROADCASTF64X2rm(b?)",
+ "VBROADCASTF64X4rm(b?)",
+ "VBROADCASTI32X2Z256m(b?)",
+ "VBROADCASTI32X2Zm(b?)",
+ "VBROADCASTI32X4Z256rm(b?)",
+ "VBROADCASTI32X4rm(b?)",
+ "VBROADCASTI32X8rm(b?)",
+ "VBROADCASTI64X2Z128rm(b?)",
+ "VBROADCASTI64X2rm(b?)",
+ "VBROADCASTI64X4rm(b?)",
+ "VBROADCASTSD(Z|Z256)m(b?)",
+ "VBROADCASTSS(Z|Z256)m(b?)",
+ "VINSERTF32x4(Z|Z256)rm(b?)",
+ "VINSERTF32x8Zrm(b?)",
+ "VINSERTF64x2(Z|Z256)rm(b?)",
+ "VINSERTF64x4Zrm(b?)",
+ "VINSERTI32x4(Z|Z256)rm(b?)",
+ "VINSERTI32x8Zrm(b?)",
+ "VINSERTI64x2(Z|Z256)rm(b?)",
+ "VINSERTI64x4Zrm(b?)",
+ "VMOVAPD(Z|Z256)rm(b?)",
+ "VMOVAPS(Z|Z256)rm(b?)",
+ "VMOVDDUP(Z|Z256)rm(b?)",
+ "VMOVDQA32(Z|Z256)rm(b?)",
+ "VMOVDQA64(Z|Z256)rm(b?)",
+ "VMOVDQU16(Z|Z256)rm(b?)",
+ "VMOVDQU32(Z|Z256)rm(b?)",
+ "VMOVDQU64(Z|Z256)rm(b?)",
+ "VMOVDQU8(Z|Z256)rm(b?)",
+ "VMOVNTDQAZ256rm(b?)",
+ "VMOVSHDUP(Z|Z256)rm(b?)",
+ "VMOVSLDUP(Z|Z256)rm(b?)",
+ "VMOVUPD(Z|Z256)rm(b?)",
+ "VMOVUPS(Z|Z256)rm(b?)",
+ "VPADD(B|D|Q|W)Yrm",
+ "VPADD(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBLENDDYrmi",
+ "VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBROADCASTD(Z|Z256)m(b?)",
+ "VPBROADCASTQ(Z|Z256)m(b?)",
+ "VPSUB(B|D|Q|W)Yrm",
+ "VPSUB(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPTERNLOGD(Z|Z256)rm(b?)i",
+ "VPTERNLOGQ(Z|Z256)rm(b?)i")>;
def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 8;
let NumMicroOps = 4;
let ResourceCycles = [1,2,1];
}
-def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHADDSWrm64")>;
-def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHSUBSWrm64")>;
-
-def SKXWriteResGroup124 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort05]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDWrm64")>;
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDrm64")>;
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBDrm64")>;
-def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBWrm64")>;
-
-def SKXWriteResGroup125 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKXWriteResGroup125], (instregex "VCVTPS2PHYmr")>;
+def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PH(ADD|SUB)SWrm")>;
def SKXWriteResGroup126 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,3];
}
-def: InstRW<[SKXWriteResGroup126], (instregex "ROR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup126], (instregex "ROR8mCL")>;
+def: InstRW<[SKXWriteResGroup126], (instregex "ROR(8|16|32|64)mCL")>;
def SKXWriteResGroup127 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL8m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCL8mi")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR8m1")>;
-def: InstRW<[SKXWriteResGroup127], (instregex "RCR8mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL(8|16|32|64)m1",
+ "RCL(8|16|32|64)mi",
+ "RCR(8|16|32|64)m1",
+ "RCR(8|16|32|64)mi")>;
def SKXWriteResGroup128 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,3];
}
-def: InstRW<[SKXWriteResGroup128], (instregex "ROL(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "ROL8mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SAR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SAR8mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHL(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHL8mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup128], (instregex "SHR8mCL")>;
-
-def SKXWriteResGroup129 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
- let Latency = 8;
- let NumMicroOps = 6;
- let ResourceCycles = [1,1,1,3];
-}
-def: InstRW<[SKXWriteResGroup129], (instregex "ADC(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup129], (instregex "ADC8mi")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "ROL(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
def SKXWriteResGroup130 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 8;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKXWriteResGroup130], (instregex "ADC(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "ADC8mr")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG8rm")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mi")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mr")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mi")>;
-def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mr")>;
+def: SchedAlias<WriteADCRMW, SKXWriteResGroup130>;
+def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(8|16|32|64)rm")>;
def SKXWriteResGroup131 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
let Latency = 8;
@@ -4731,886 +1749,305 @@ def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SKXWriteResGroup135], (instregex "MMX_CVTPI2PSirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDUBSWrm64")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDWDirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHRSWrm64")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHUWirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHWirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULLWirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULUDQirm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "RCPSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "RSQRTSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VRCPSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VRSQRTSSm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPDYrm")>;
-def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPSYrm")>;
def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup136], (instregex "PCMPGTQrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "PSADBWrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNQZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VDBPSADBWZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VFPCLASSSSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPBZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPQZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUBZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUQZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUWZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPWZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2D128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PD128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PS128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2Q128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2D128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PD128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PS128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2Q128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXSQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXUQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMINSQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMINUQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDYrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWrm")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMBZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNDZ128rm(b?)i",
+ "VALIGNQZ128rm(b?)i",
+ "VCMPPDZ128rm(b?)i",
+ "VCMPPSZ128rm(b?)i",
+ "VCMPSDZrm",
+ "VCMPSSZrm",
+ "VFPCLASSSSZrm(b?)",
+ "VPCMPBZ128rmi(b?)",
+ "VPCMPDZ128rmi(b?)",
+ "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
+ "VPCMPGT(B|D|Q|W)Z128rm(b?)",
+ "(V?)PCMPGTQrm",
+ "VPCMPQZ128rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z128rmi(b?)",
+ "VPCMPWZ128rmi(b?)",
+ "VPERMI2D128rm(b?)",
+ "VPERMI2PD128rm(b?)",
+ "VPERMI2PS128rm(b?)",
+ "VPERMI2Q128rm(b?)",
+ "VPERMT2D128rm(b?)",
+ "VPERMT2PD128rm(b?)",
+ "VPERMT2PS128rm(b?)",
+ "VPERMT2Q128rm(b?)",
+ "VPMAXSQZ128rm(b?)",
+ "VPMAXUQZ128rm(b?)",
+ "VPMINSQZ128rm(b?)",
+ "VPMINUQZ128rm(b?)",
+ "VPMOVSXBDZ128rm(b?)",
+ "VPMOVSXBQZ128rm(b?)",
+ "VPMOVSXBWYrm",
+ "VPMOVSXBWZ128rm(b?)",
+ "VPMOVSXDQYrm",
+ "VPMOVSXDQZ128rm(b?)",
+ "VPMOVSXWDYrm",
+ "VPMOVSXWDZ128rm(b?)",
+ "VPMOVSXWQZ128rm(b?)",
+ "VPMOVZXBDZ128rm(b?)",
+ "VPMOVZXBQZ128rm(b?)",
+ "VPMOVZXBWZ128rm(b?)",
+ "VPMOVZXDQZ128rm(b?)",
+ "VPMOVZXWDYrm",
+ "VPMOVZXWDZ128rm(b?)",
+ "VPMOVZXWQZ128rm(b?)",
+ "VPTESTMBZ128rm(b?)",
+ "VPTESTMDZ128rm(b?)",
+ "VPTESTMQZ128rm(b?)",
+ "VPTESTMWZ128rm(b?)",
+ "VPTESTNMBZ128rm(b?)",
+ "VPTESTNMDZ128rm(b?)",
+ "VPTESTNMQZ128rm(b?)",
+ "VPTESTNMWZ128rm(b?)")>;
def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 9;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup137], (instregex "ADDSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "ADDSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "CMPSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "CMPSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "CVTPS2PDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTPS2PIirm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTTPS2PIirm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MULSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "MULSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "SUBSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "SUBSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VADDSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VADDSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPH2PSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPS2PDrm")>;
-def: InstRW<[SKXWriteResGroup137],
- (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMULSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VMULSSrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSDrm")>;
-def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSSrm")>;
-
-def SKXWriteResGroup138 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PSZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PDZr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PSZr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup139 : SchedWriteRes<[SKXPort5,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup139], (instregex "DPPDrri")>;
-def: InstRW<[SKXWriteResGroup139], (instregex "VDPPDrri")>;
-
-def SKXWriteResGroup140 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPDYrm")>;
-def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPSYrm")>;
-
-def SKXWriteResGroup141 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup141], (instregex "PTESTrm")>;
-def: InstRW<[SKXWriteResGroup141], (instregex "VPTESTrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm",
+ "(V?)CVTPS2PDrm")>;
def SKXWriteResGroup142 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort23]> {
let Latency = 9;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup142], (instregex "MULX64rm")>;
+def: InstRW<[SKXWriteResGroup142], (instrs IMUL64m, MUL64m, MULX64rm)>;
def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup143], (instregex "PHADDSWrm128")>;
-def: InstRW<[SKXWriteResGroup143], (instregex "PHSUBSWrm128")>;
-def: InstRW<[SKXWriteResGroup143], (instregex "VPHADDSWrm128")>;
-def: InstRW<[SKXWriteResGroup143], (instregex "VPHSUBSWrm128")>;
-
-def SKXWriteResGroup144 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup144], (instregex "PHADDDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "PHADDWrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBWrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDWrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBDrm")>;
-def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBWrm")>;
+def: InstRW<[SKXWriteResGroup143], (instregex "(V?)PHADDSWrm",
+ "(V?)PHSUBSWrm")>;
def SKXWriteResGroup145 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8")>;
-def: InstRW<[SKXWriteResGroup145], (instregex "SHRD(16|32|64)mri8")>;
+def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8",
+ "SHRD(16|32|64)mri8")>;
def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup146], (instregex "LSL(16|32|64)rm")>;
-
-def SKXWriteResGroup147 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup147], (instregex "AESDECLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "AESDECrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "AESENCLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "AESENCrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "RCPPSm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "RSQRTPSm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCLASTrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCrm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRCPPSm")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRTPSm")>;
+def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
def SKXWriteResGroup148 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup148], (instregex "ADD_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ADD_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F16m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F32m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F64m")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUWZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUWZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZ256rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2F128rm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2I128rm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2D256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Drm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PD256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PS256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Q256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Qrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDYmi")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQYmi")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256m(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2D256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Drm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PD256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PDrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PS256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PSrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Q256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Qrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWYrm")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Zrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Zrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Zrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Z256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Zrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VALIGND(Z|Z256)rm(b?)i",
+ "VALIGNQ(Z|Z256)rm(b?)i",
+ "VCMPPD(Z|Z256)rm(b?)i",
+ "VCMPPS(Z|Z256)rm(b?)i",
+ "VPCMPB(Z|Z256)rmi(b?)",
+ "VPCMPD(Z|Z256)rmi(b?)",
+ "VPCMPEQB(Z|Z256)rm(b?)",
+ "VPCMPEQD(Z|Z256)rm(b?)",
+ "VPCMPEQQ(Z|Z256)rm(b?)",
+ "VPCMPEQW(Z|Z256)rm(b?)",
+ "VPCMPGTB(Z|Z256)rm(b?)",
+ "VPCMPGTD(Z|Z256)rm(b?)",
+ "VPCMPGTQYrm",
+ "VPCMPGTQ(Z|Z256)rm(b?)",
+ "VPCMPGTW(Z|Z256)rm(b?)",
+ "VPCMPQ(Z|Z256)rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z256rmi(b?)",
+ "VPCMPU(B|D|Q|W)Zrmi(b?)",
+ "VPCMPW(Z|Z256)rmi(b?)",
+ "VPMAXSQ(Z|Z256)rm(b?)",
+ "VPMAXUQ(Z|Z256)rm(b?)",
+ "VPMINSQ(Z|Z256)rm(b?)",
+ "VPMINUQ(Z|Z256)rm(b?)",
+ "VPTESTM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTM(B|D|Q|W)Zrm(b?)",
+ "VPTESTNM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 10;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CMPPDrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CMPPSrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTDQ2PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTSS2SDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "CVTTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MULPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "MULPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PHMINPOSUWrm128")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMADDUBSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMADDWDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULHRSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULHUWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULHWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULLWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "PMULUDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "SUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "SUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPDrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPSrmi")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSYrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2QQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UQQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSDrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSSrmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128m(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m",
- "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSDm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSSm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSSZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VMULSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPHMINPOSUWrm128")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESSZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESDZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESSZ128rmi(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSDZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSSZrm(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup150 : SchedWriteRes<[SKXPort0]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRIrr")>;
-def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRM128rr")>;
-def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRIrr")>;
-def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRM128rr")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)",
+ "VCVTDQ2PSZ128rm(b?)",
+ "(V?)CVTDQ2PSrm",
+ "VCVTPD2QQZ128rm(b?)",
+ "VCVTPD2UQQZ128rm(b?)",
+ "VCVTPH2PSZ128rm(b?)",
+ "VCVTPS2DQZ128rm(b?)",
+ "(V?)CVTPS2DQrm",
+ "VCVTPS2PDZ128rm(b?)",
+ "VCVTPS2QQZ128rm(b?)",
+ "VCVTPS2UDQZ128rm(b?)",
+ "VCVTPS2UQQZ128rm(b?)",
+ "VCVTQQ2PDZ128rm(b?)",
+ "VCVTQQ2PSZ128rm(b?)",
+ "VCVTSS2SDZrm",
+ "(V?)CVTSS2SDrm",
+ "VCVTTPD2QQZ128rm(b?)",
+ "VCVTTPD2UQQZ128rm(b?)",
+ "VCVTTPS2DQZ128rm(b?)",
+ "(V?)CVTTPS2DQrm",
+ "VCVTTPS2QQZ128rm(b?)",
+ "VCVTTPS2UDQZ128rm(b?)",
+ "VCVTTPS2UQQZ128rm(b?)",
+ "VCVTUDQ2PDZ128rm(b?)",
+ "VCVTUDQ2PSZ128rm(b?)",
+ "VCVTUQQ2PDZ128rm(b?)",
+ "VCVTUQQ2PSZ128rm(b?)")>;
def SKXWriteResGroup151 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup151], (instregex "MPSADBWrmi")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VMPSADBWrmi")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDQZ128rm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup152 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup152], (instregex "MMX_CVTPI2PDirm")>;
-def: InstRW<[SKXWriteResGroup152], (instregex "VPTESTYrm")>;
+def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)",
+ "VEXPANDPSZ128rm(b?)",
+ "VPEXPANDDZ128rm(b?)",
+ "VPEXPANDQZ128rm(b?)")>;
def SKXWriteResGroup153 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 10;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup153], (instregex "CVTSD2SSrm")>;
-def: InstRW<[SKXWriteResGroup153], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[SKXWriteResGroup153], (instregex "(V?)CVTSD2SSrm")>;
def SKXWriteResGroup154 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
let Latency = 10;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWrm256")>;
-def: InstRW<[SKXWriteResGroup154], (instregex "VPHSUBSWrm256")>;
-
-def SKXWriteResGroup155 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 10;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDDYrm")>;
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDWYrm")>;
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBDYrm")>;
-def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBWYrm")>;
+def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWYrm",
+ "VPHSUBSWYrm")>;
def SKXWriteResGroup156 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort06,SKXPort0156]> {
- let Latency = 10;
+ let Latency = 9;
let NumMicroOps = 4;
let ResourceCycles = [1,1,1,1];
}
-def: InstRW<[SKXWriteResGroup156], (instregex "MULX32rm")>;
+def: InstRW<[SKXWriteResGroup156], (instrs IMUL32m, MUL32m, MULX32rm)>;
def SKXWriteResGroup157 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 10;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,3];
}
-def: InstRW<[SKXWriteResGroup157], (instregex "ADD8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "AND8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "OR8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "SUB8mi")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(16|32|64)rm")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "XCHG8rm")>;
-def: InstRW<[SKXWriteResGroup157], (instregex "XOR8mi")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(8|16|32|64)rm")>;
-def SKXWriteResGroup158 : SchedWriteRes<[SKXPort05,SKXPort0156]> {
- let Latency = 10;
- let NumMicroOps = 10;
- let ResourceCycles = [9,1];
-}
-def: InstRW<[SKXWriteResGroup158], (instregex "MMX_EMMS")>;
-
-def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0]> {
+def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
let Latency = 11;
let NumMicroOps = 1;
- let ResourceCycles = [1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup159], (instregex "DIVPSrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "DIVSSrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSYrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSrr")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSrr")>;
+def : SchedAlias<WriteFDivX, SKXWriteResGroup159>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup160 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F32m")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F64m")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRCPPSYm")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRTPSYm")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F(32|64)m")>;
def SKXWriteResGroup161 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPDYrmi")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPSYrmi")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2QQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UQQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161],
- (instregex
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256m(b?)(k?)(z?)",
- "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSZ256m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSYrm")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PD(Z|Z256)rm(b?)",
+ "VCVTDQ2PSYrm",
+ "VCVTDQ2PS(Z|Z256)rm(b?)",
+ "VCVTPH2PS(Z|Z256)rm(b?)",
+ "VCVTPS2PDYrm",
+ "VCVTPS2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PSZ256rm(b?)",
+ "VCVT(T?)PD2QQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PD2UQQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2DQYrm",
+ "VCVT(T?)PS2DQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2QQZ256rm(b?)",
+ "VCVT(T?)PS2UDQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2UQQZ256rm(b?)",
+ "VCVTUDQ2PD(Z|Z256)rm(b?)",
+ "VCVTUDQ2PS(Z|Z256)rm(b?)",
+ "VCVTUQQ2PD(Z|Z256)rm(b?)",
+ "VCVTUQQ2PSZ256rm(b?)")>;
def SKXWriteResGroup162 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOM16m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOM32m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP16m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP32m")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VMPSADBWYrmi")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "FICOM(P?)(16|32)m",
+ "VEXPANDPD(Z|Z256)rm(b?)",
+ "VEXPANDPS(Z|Z256)rm(b?)",
+ "VPEXPANDD(Z|Z256)rm(b?)",
+ "VPEXPANDQ(Z|Z256)rm(b?)")>;
def SKXWriteResGroup163 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm")>;
def SKXWriteResGroup164 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup164], (instregex "CVTDQ2PDrm")>;
-def: InstRW<[SKXWriteResGroup164], (instregex "VCVTDQ2PDrm")>;
-
-def SKXWriteResGroup165 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 11;
- let NumMicroOps = 3;
- let ResourceCycles = [1,1,1];
-}
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2USI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2USIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2USI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64rm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIrm")>;
-def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2USIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup164], (instregex "(V?)CVTDQ2PDrm")>;
def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2DQrm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2PSrm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "CVTTPD2DQrm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTPD2PIirm")>;
-def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2PSrm",
+ "CVT(T?)PD2DQrm",
+ "MMX_CVT(T?)PD2PIirm")>;
def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 11;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)")>;
def SKXWriteResGroup168 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 11;
let NumMicroOps = 6;
let ResourceCycles = [1,1,1,2,1];
}
-def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL")>;
-def: InstRW<[SKXWriteResGroup168], (instregex "SHRD(16|32|64)mrCL")>;
+def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL",
+ "SHRD(16|32|64)mrCL")>;
def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
let ResourceCycles = [2,3,2];
}
-def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL")>;
-def: InstRW<[SKXWriteResGroup169], (instregex "RCR(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
def SKXWriteResGroup170 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 11;
@@ -5624,80 +2061,44 @@ def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let NumMicroOps = 11;
let ResourceCycles = [2,9];
}
-def: InstRW<[SKXWriteResGroup171], (instregex "LOOPE")>;
-def: InstRW<[SKXWriteResGroup171], (instregex "LOOPNE")>;
-
-def SKXWriteResGroup172 : SchedWriteRes<[SKXPort0]> {
- let Latency = 12;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup172], (instregex "SQRTPSr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "SQRTSSr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSYr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSr")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSZr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSr")>;
+def: InstRW<[SKXWriteResGroup171], (instrs LOOPE, LOOPNE)>;
-def SKXWriteResGroup173 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+def SKXWriteResGroup174 : SchedWriteRes<[SKXPort01]> {
let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup173], (instregex "PCLMULQDQrm")>;
-def: InstRW<[SKXWriteResGroup173], (instregex "VPCLMULQDQrm")>;
+def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>;
-def SKXWriteResGroup174 : SchedWriteRes<[SKXPort015]> {
+def SKXWriteResGroup174z : SchedWriteRes<[SKXPort05]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup174z], (instregex "VPMULLQZrr")>;
def SKXWriteResGroup175 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)")>;
def SKXWriteResGroup176 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSD2USIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSS2USI64Zrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSD2USIZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSS2USI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup176], (instregex "VCVT(T?)SD2USIZrm(b?)",
+ "VCVT(T?)SS2USI64Zrm(b?)")>;
def SKXWriteResGroup177 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 12;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2UQQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2QQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2UQQZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup178 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 12;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup178], (instregex "HADDPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "HADDPSrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPSrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPSrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPDrm")>;
-def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPSrm")>;
+def: InstRW<[SKXWriteResGroup177], (instregex "VCVT(T?)PS2QQZrm(b?)",
+ "VCVT(T?)PS2UQQZrm(b?)")>;
def SKXWriteResGroup179 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 12;
@@ -5711,14 +2112,9 @@ def SKXWriteResGroup180 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [2,1];
}
-def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI16m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI32m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI16m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI32m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI16m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI32m")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "(ADD|SUB|SUBR)_FI(16|32)m",
+ "VPERMWZ256rm(b?)",
+ "VPERMWZrm(b?)")>;
def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 13;
@@ -5727,100 +2123,58 @@ def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup181], (instregex "VCVTDQ2PDYrm")>;
-def SKXWriteResGroup182 : SchedWriteRes<[SKXPort5,SKXPort015]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1,3];
-}
-def: InstRW<[SKXWriteResGroup182], (instregex "DPPSrri")>;
-def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSYrri")>;
-def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSrri")>;
-
def SKXWriteResGroup183 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 13;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPDYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPSYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPDYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPSYrm")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup183], (instregex "VPERMT2W128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)",
+ "VPERMT2W128rm(b?)")>;
-def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0]> {
+def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
let Latency = 14;
let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup184], (instregex "DIVPDrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "DIVSDrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDYrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDrr")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDZrr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDrr")>;
-
-def SKXWriteResGroup185 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup185], (instregex "AESIMCrm")>;
-def: InstRW<[SKXWriteResGroup185], (instregex "VAESIMCrm")>;
+def : SchedAlias<WriteFDiv64, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
+def : SchedAlias<WriteFDiv64X, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
-def SKXWriteResGroup186 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+def SKXWriteResGroup184_1 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
let Latency = 14;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,5];
}
-def: InstRW<[SKXWriteResGroup186], (instregex "PMULLDrm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPSm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSSm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDrm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPDZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPSZ128rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESDm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESSm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPSm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSDm")>;
-def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSSm")>;
+def : SchedAlias<WriteFDiv64Y, SKXWriteResGroup184_1>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup187 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI16m")>;
-def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI32m")>;
+def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI(16|32)m")>;
def SKXWriteResGroup188 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 14;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTQQ2PSZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2DQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2UDQZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup188], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)",
+ "VCVTPD2PSZrm(b?)",
+ "VCVTPD2UDQZrm(b?)",
+ "VCVTQQ2PSZrm(b?)",
+ "VCVTTPD2DQZrm(b?)",
+ "VCVTTPD2UDQZrm(b?)",
+ "VCVTUQQ2PSZrm(b?)")>;
def SKXWriteResGroup189 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
let Latency = 14;
let NumMicroOps = 4;
let ResourceCycles = [2,1,1];
}
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2Wrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2W256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2Wrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)",
+ "VPERMI2Wrm(b?)",
+ "VPERMT2W256rm(b?)",
+ "VPERMT2Wrm(b?)")>;
def SKXWriteResGroup190 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 14;
@@ -5834,206 +2188,85 @@ def SKXWriteResGroup191 : SchedWriteRes<[SKXPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FPrST0")>;
-def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FST0r")>;
-def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FrST0")>;
-
-def SKXWriteResGroup192 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDYrm")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZrm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZ256rm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZrm(b?)i(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPDm")>;
-def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPSm")>;
-
-def SKXWriteResGroup193 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 15;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKXWriteResGroup193], (instregex "DPPDrmi")>;
-def: InstRW<[SKXWriteResGroup193], (instregex "VDPPDrmi")>;
+def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
def SKXWriteResGroup194 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 15;
let NumMicroOps = 8;
let ResourceCycles = [1,2,2,1,2];
}
-def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)")>;
def SKXWriteResGroup195 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
let ResourceCycles = [1,1,1,5,1,1];
}
-def: InstRW<[SKXWriteResGroup195], (instregex "RCL(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup195], (instregex "RCL8mCL")>;
-
-def SKXWriteResGroup196 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 16;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup196], (instregex "DIVSSrm")>;
-def: InstRW<[SKXWriteResGroup196], (instregex "VDIVSSrm")>;
-
-def SKXWriteResGroup197 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [3,1];
-}
-def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRIrm")>;
-def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRM128rm")>;
-def: InstRW<[SKXWriteResGroup197], (instregex "VPCMPISTRIrm")>;
-def: InstRW<[SKXWriteResGroup197], (instregex "VPCMPISTRM128rm")>;
-
-def SKXWriteResGroup198 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 16;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PSZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PDZm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PSZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup195], (instregex "RCL(8|16|32|64)mCL")>;
def SKXWriteResGroup199 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 16;
let NumMicroOps = 14;
let ResourceCycles = [1,1,1,4,2,5];
}
-def: InstRW<[SKXWriteResGroup199], (instregex "CMPXCHG8B")>;
+def: InstRW<[SKXWriteResGroup199], (instrs CMPXCHG8B)>;
def SKXWriteResGroup200 : SchedWriteRes<[SKXPort0156]> {
let Latency = 16;
let NumMicroOps = 16;
let ResourceCycles = [16];
}
-def: InstRW<[SKXWriteResGroup200], (instregex "VZEROALL")>;
+def: InstRW<[SKXWriteResGroup200], (instrs VZEROALL)>;
-def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 17;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,5];
}
-def: InstRW<[SKXWriteResGroup201], (instregex "DIVPSrm")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "SQRTSSm")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSrm")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VDIVSSZrm(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup201], (instregex "VSQRTSSm")>;
+def : SchedAlias<WriteFDivXLd, SKXWriteResGroup201>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> {
let Latency = 17;
let NumMicroOps = 15;
let ResourceCycles = [2,1,2,4,2,4];
}
-def: InstRW<[SKXWriteResGroup202], (instregex "XCH_F")>;
-
-def SKXWriteResGroup203 : SchedWriteRes<[SKXPort0]> {
- let Latency = 18;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-def: InstRW<[SKXWriteResGroup203], (instregex "SQRTPDr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "SQRTSDr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDYr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ128r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ256r(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDr")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDZr(b?)(_Int)?(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDr")>;
-
-def SKXWriteResGroup204 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 18;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup204], (instregex "SQRTPSm")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSYrm")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSm")>;
-def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTSSZm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup202], (instrs XCH_F)>;
def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 18;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup206 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort0156]> {
- let Latency = 18;
- let NumMicroOps = 8;
- let ResourceCycles = [4,3,1];
-}
-def: InstRW<[SKXWriteResGroup206], (instregex "PCMPESTRIrr")>;
-def: InstRW<[SKXWriteResGroup206], (instregex "VPCMPESTRIrr")>;
+def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)")>;
def SKXWriteResGroup207 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> {
let Latency = 18;
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,5];
}
-def: InstRW<[SKXWriteResGroup207], (instregex "CPUID")>;
-def: InstRW<[SKXWriteResGroup207], (instregex "RDTSC")>;
+def: InstRW<[SKXWriteResGroup207], (instrs CPUID, RDTSC)>;
def SKXWriteResGroup208 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 18;
let NumMicroOps = 11;
let ResourceCycles = [2,1,1,4,1,2];
}
-def: InstRW<[SKXWriteResGroup208], (instregex "RCR(16|32|64)mCL")>;
-def: InstRW<[SKXWriteResGroup208], (instregex "RCR8mCL")>;
+def: InstRW<[SKXWriteResGroup208], (instregex "RCR(8|16|32|64)mCL")>;
-def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 19;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup209], (instregex "DIVSDrm")>;
-def: InstRW<[SKXWriteResGroup209], (instregex "VDIVSDrm")>;
-def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSYm")>;
-def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSZ256m(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup210 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 19;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKXWriteResGroup210], (instregex "VSQRTPSZr(b?)(k?)(z?)")>;
+def : SchedAlias<WriteFDiv64Ld, SKXWriteResGroup209>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 19;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup212 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 19;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
-}
-def: InstRW<[SKXWriteResGroup212], (instregex "DPPSrmi")>;
-def: InstRW<[SKXWriteResGroup212], (instregex "VDPPSrmi")>;
-
-def SKXWriteResGroup213 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015,SKXPort0156]> {
- let Latency = 19;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[SKXWriteResGroup213], (instregex "PCMPESTRM128rr")>;
-def: InstRW<[SKXWriteResGroup213], (instregex "VPCMPESTRM128rr")>;
+def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)",
+ "VPMULLQZrm(b?)")>;
def SKXWriteResGroup214 : SchedWriteRes<[]> {
let Latency = 20;
@@ -6048,26 +2281,14 @@ def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FPrST0")>;
-def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FST0r")>;
-def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FrST0")>;
+def: InstRW<[SKXWriteResGroup215], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
-def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 20;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup216], (instregex "DIVPDrm")>;
-def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDZ128rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDrm")>;
-def: InstRW<[SKXWriteResGroup216], (instregex "VDIVSDZrm(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup217 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 20;
- let NumMicroOps = 5;
- let ResourceCycles = [1,1,3];
+ let ResourceCycles = [1,1,4];
}
-def: InstRW<[SKXWriteResGroup217], (instregex "VDPPSYrmi")>;
+def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 20;
@@ -6084,40 +2305,28 @@ def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SK
let NumMicroOps = 8;
let ResourceCycles = [1,1,1,1,1,1,2];
}
-def: InstRW<[SKXWriteResGroup219], (instregex "INSB")>;
-def: InstRW<[SKXWriteResGroup219], (instregex "INSL")>;
-def: InstRW<[SKXWriteResGroup219], (instregex "INSW")>;
+def: InstRW<[SKXWriteResGroup219], (instrs INSB, INSL, INSW)>;
def SKXWriteResGroup220 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort0156]> {
let Latency = 20;
let NumMicroOps = 10;
let ResourceCycles = [1,2,7];
}
-def: InstRW<[SKXWriteResGroup220], (instregex "MWAITrr")>;
+def: InstRW<[SKXWriteResGroup220], (instrs MWAITrr)>;
-def SKXWriteResGroup221 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
- let Latency = 20;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,2];
-}
-def: InstRW<[SKXWriteResGroup221], (instregex "AESKEYGENASSIST128rr")>;
-def: InstRW<[SKXWriteResGroup221], (instregex "VAESKEYGENASSIST128rr")>;
-
-def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
let Latency = 21;
let NumMicroOps = 2;
- let ResourceCycles = [1,1];
+ let ResourceCycles = [1,1,8];
}
-def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDYrm")>;
-def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDZ256rm(b?)(k?)(z?)")>;
+def : SchedAlias<WriteFDiv64YLd, SKXWriteResGroup222>; // TODO - convert to ZnWriteResFpuPair
def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 22;
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F32m")>;
-def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F64m")>;
+def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>;
def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 22;
@@ -6176,72 +2385,22 @@ def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let NumMicroOps = 14;
let ResourceCycles = [5,5,4];
}
-def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTQZ256rr(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup226 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 23;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup226], (instregex "SQRTSDm")>;
-def: InstRW<[SKXWriteResGroup226], (instregex "VSQRTSDm")>;
-
-def SKXWriteResGroup227 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 23;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr",
+ "VPCONFLICTQZ256rr")>;
def SKXWriteResGroup228 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 23;
let NumMicroOps = 19;
let ResourceCycles = [2,1,4,1,1,4,6];
}
-def: InstRW<[SKXWriteResGroup228], (instregex "CMPXCHG16B")>;
-
-def SKXWriteResGroup229 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 24;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup229], (instregex "SQRTPDm")>;
-def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDZ128m(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDm")>;
-def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTSDZm(_Int)?(k?)(z?)")>;
-
-def SKXWriteResGroup230 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 24;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup230], (instregex "VDIVPSZrm(b?)(k?)(z?)")>;
-
-def SKXWriteResGroup231 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156]> {
- let Latency = 24;
- let NumMicroOps = 9;
- let ResourceCycles = [4,3,1,1];
-}
-def: InstRW<[SKXWriteResGroup231], (instregex "PCMPESTRIrm")>;
-def: InstRW<[SKXWriteResGroup231], (instregex "VPCMPESTRIrm")>;
-
-def SKXWriteResGroup232 : SchedWriteRes<[SKXPort0,SKXPort23]> {
- let Latency = 25;
- let NumMicroOps = 2;
- let ResourceCycles = [1,1];
-}
-def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDYm")>;
-def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup228], (instrs CMPXCHG16B)>;
def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 25;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI16m")>;
-def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI32m")>;
+def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>;
def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 25;
@@ -6254,29 +2413,6 @@ def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm,
VPGATHERQDZrm,
VPGATHERQQZ256rm)>;
-def SKXWriteResGroup235 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 25;
- let NumMicroOps = 10;
- let ResourceCycles = [4,3,1,1,1];
-}
-def: InstRW<[SKXWriteResGroup235], (instregex "PCMPESTRM128rm")>;
-def: InstRW<[SKXWriteResGroup235], (instregex "VPCMPESTRM128rm")>;
-
-def SKXWriteResGroup236 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
- let Latency = 25;
- let NumMicroOps = 11;
- let ResourceCycles = [3,6,1,1];
-}
-def: InstRW<[SKXWriteResGroup236], (instregex "AESKEYGENASSIST128rm")>;
-def: InstRW<[SKXWriteResGroup236], (instregex "VAESKEYGENASSIST128rm")>;
-
-def SKXWriteResGroup237 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 26;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup237], (instregex "VSQRTPSZm(b?)(k?)(z?)")>;
-
def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 26;
let NumMicroOps = 5;
@@ -6292,8 +2428,7 @@ def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F32m")>;
-def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F64m")>;
+def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>;
def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 27;
@@ -6308,30 +2443,21 @@ def SKXWriteResGroup241 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156
let NumMicroOps = 8;
let ResourceCycles = [2,4,1,1];
}
-def: InstRW<[SKXWriteResGroup241], (instregex "IDIV(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup241], (instregex "IDIV8m")>;
+def: InstRW<[SKXWriteResGroup241], (instregex "IDIV(8|16|32|64)m")>;
def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 29;
let NumMicroOps = 15;
let ResourceCycles = [5,5,1,4];
}
-def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)")>;
def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
let Latency = 30;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI16m")>;
-def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI32m")>;
-
-def SKXWriteResGroup244 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 30;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup244], (instregex "VDIVPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>;
def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
let Latency = 30;
@@ -6341,40 +2467,29 @@ def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort01
def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm,
VPGATHERDDZrm)>;
-def SKXWriteResGroup246 : SchedWriteRes<[SKXPort0,SKXPort015]> {
- let Latency = 31;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[SKXWriteResGroup246], (instregex "VSQRTPDZr(b?)(k?)(z?)")>;
-
def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,3,4,10];
}
-def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)ri")>;
-def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)rr")>;
-def: InstRW<[SKXWriteResGroup247], (instregex "IN8ri")>;
-def: InstRW<[SKXWriteResGroup247], (instregex "IN8rr")>;
+def: InstRW<[SKXWriteResGroup247], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
def SKXWriteResGroup248 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
let ResourceCycles = [1,5,2,1,4,10];
}
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)ir")>;
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)rr")>;
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT8ir")>;
-def: InstRW<[SKXWriteResGroup248], (instregex "OUT8rr")>;
+def: InstRW<[SKXWriteResGroup248], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
def SKXWriteResGroup249 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let Latency = 37;
let NumMicroOps = 21;
let ResourceCycles = [9,7,5];
}
-def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr",
+ "VPCONFLICTQZrr")>;
def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
let Latency = 37;
@@ -6383,91 +2498,84 @@ def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156
}
def: InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64)?")>;
-def SKXWriteResGroup251 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
- let Latency = 38;
- let NumMicroOps = 4;
- let ResourceCycles = [2,1,1];
-}
-def: InstRW<[SKXWriteResGroup251], (instregex "VSQRTPDZm(b?)(k?)(z?)")>;
-
def SKXWriteResGroup252 : SchedWriteRes<[SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort15,SKXPort0156]> {
let Latency = 40;
let NumMicroOps = 18;
let ResourceCycles = [1,1,2,3,1,1,1,8];
}
-def: InstRW<[SKXWriteResGroup252], (instregex "VMCLEARm")>;
+def: InstRW<[SKXWriteResGroup252], (instrs VMCLEARm)>;
def SKXWriteResGroup253 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 41;
let NumMicroOps = 39;
let ResourceCycles = [1,10,1,1,26];
}
-def: InstRW<[SKXWriteResGroup253], (instregex "XSAVE64")>;
+def: InstRW<[SKXWriteResGroup253], (instrs XSAVE64)>;
def SKXWriteResGroup254 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
let Latency = 42;
let NumMicroOps = 22;
let ResourceCycles = [2,20];
}
-def: InstRW<[SKXWriteResGroup254], (instregex "RDTSCP")>;
+def: InstRW<[SKXWriteResGroup254], (instrs RDTSCP)>;
def SKXWriteResGroup255 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 42;
let NumMicroOps = 40;
let ResourceCycles = [1,11,1,1,26];
}
-def: InstRW<[SKXWriteResGroup255], (instregex "XSAVE")>;
+def: InstRW<[SKXWriteResGroup255], (instrs XSAVE)>;
+def: InstRW<[SKXWriteResGroup255], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
def SKXWriteResGroup256 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 44;
let NumMicroOps = 22;
let ResourceCycles = [9,7,1,5];
}
-def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)",
+ "VPCONFLICTQZrm(b?)")>;
def SKXWriteResGroup258 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05,SKXPort06,SKXPort0156]> {
let Latency = 62;
let NumMicroOps = 64;
let ResourceCycles = [2,8,5,10,39];
}
-def: InstRW<[SKXWriteResGroup258], (instregex "FLDENVm")>;
-def: InstRW<[SKXWriteResGroup258], (instregex "FLDENVm")>;
+def: InstRW<[SKXWriteResGroup258], (instrs FLDENVm)>;
def SKXWriteResGroup259 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 63;
let NumMicroOps = 88;
let ResourceCycles = [4,4,31,1,2,1,45];
}
-def: InstRW<[SKXWriteResGroup259], (instregex "FXRSTOR64")>;
+def: InstRW<[SKXWriteResGroup259], (instrs FXRSTOR64)>;
def SKXWriteResGroup260 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
let Latency = 63;
let NumMicroOps = 90;
let ResourceCycles = [4,2,33,1,2,1,47];
}
-def: InstRW<[SKXWriteResGroup260], (instregex "FXRSTOR")>;
+def: InstRW<[SKXWriteResGroup260], (instrs FXRSTOR)>;
def SKXWriteResGroup261 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let Latency = 67;
let NumMicroOps = 35;
let ResourceCycles = [17,11,7];
}
-def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr")>;
def SKXWriteResGroup262 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 74;
let NumMicroOps = 36;
let ResourceCycles = [17,11,1,7];
}
-def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)")>;
def SKXWriteResGroup263 : SchedWriteRes<[SKXPort5,SKXPort05,SKXPort0156]> {
let Latency = 75;
let NumMicroOps = 15;
let ResourceCycles = [6,3,6];
}
-def: InstRW<[SKXWriteResGroup263], (instregex "FNINIT")>;
+def: InstRW<[SKXWriteResGroup263], (instrs FNINIT)>;
def SKXWriteResGroup264 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> {
let Latency = 76;
@@ -6488,13 +2596,15 @@ def SKXWriteResGroup266 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort4,SKXPort5,SKX
let NumMicroOps = 100;
let ResourceCycles = [9,1,11,16,1,11,21,30];
}
-def: InstRW<[SKXWriteResGroup266], (instregex "FSTENVm")>;
-def: InstRW<[SKXWriteResGroup266], (instregex "FSTENVm")>;
+def: InstRW<[SKXWriteResGroup266], (instrs FSTENVm)>;
def SKXWriteResGroup267 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
let Latency = 140;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup267], (instregex "PAUSE")>;
+def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
index 078d459634ce..d0167753ccd4 100644
--- a/contrib/llvm/lib/Target/X86/X86Schedule.td
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -7,9 +7,8 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
// InstrSchedModel annotations for out-of-order CPUs.
-//
-// These annotations are independent of the itinerary classes defined below.
// Instructions with folded loads need to read the memory operand immediately,
// but other register operands don't have to be read until the load is ready.
@@ -20,6 +19,17 @@ def ReadAfterLd : SchedRead;
// load + WriteRMW.
def WriteRMW : SchedWrite;
+// Helper to set SchedWrite ExePorts/Latency/ResourceCycles/NumMicroOps.
+multiclass X86WriteRes<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res, int UOps> {
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+}
+
// Most instructions can fold loads, so almost every SchedWrite comes in two
// variants: With and without a folded load.
// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
@@ -39,20 +49,108 @@ multiclass X86SchedWritePair {
}
}
+// Helpers to mark SchedWrites as unsupported.
+multiclass X86WriteResUnsupported<SchedWrite SchedRW> {
+ let Unsupported = 1 in {
+ def : WriteRes<SchedRW, []>;
+ }
+}
+multiclass X86WriteResPairUnsupported<X86FoldableSchedWrite SchedRW> {
+ let Unsupported = 1 in {
+ def : WriteRes<SchedRW, []>;
+ def : WriteRes<SchedRW.Folded, []>;
+ }
+}
+
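// (Illustrative sketch, not from this patch: the unsupported-write helpers
// above would presumably be instantiated by CPU models that lack a feature.
// The writes named here are defined further down and are chosen arbitrarily.)
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;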
+// Multiclass that wraps X86FoldableSchedWrite for each vector width.
+class X86SchedWriteWidths<X86FoldableSchedWrite sScl,
+ X86FoldableSchedWrite s128,
+ X86FoldableSchedWrite s256,
+ X86FoldableSchedWrite s512> {
+ X86FoldableSchedWrite Scl = sScl; // Scalar float/double operations.
+ X86FoldableSchedWrite MMX = sScl; // MMX operations.
+ X86FoldableSchedWrite XMM = s128; // XMM operations.
+ X86FoldableSchedWrite YMM = s256; // YMM operations.
+ X86FoldableSchedWrite ZMM = s512; // ZMM operations.
+}
+
+// Multiclass that wraps X86SchedWriteWidths for each fp vector type.
+class X86SchedWriteSizes<X86SchedWriteWidths sPS,
+ X86SchedWriteWidths sPD> {
+ X86SchedWriteWidths PS = sPS;
+ X86SchedWriteWidths PD = sPD;
+}
+
+// Multiclass that wraps move/load/store triple for a vector width.
+class X86SchedWriteMoveLS<SchedWrite MoveRR,
+ SchedWrite LoadRM,
+ SchedWrite StoreMR> {
+ SchedWrite RR = MoveRR;
+ SchedWrite RM = LoadRM;
+ SchedWrite MR = StoreMR;
+}
+
+// Multiclass that wraps X86SchedWriteMoveLS for each vector width.
+class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl,
+ X86SchedWriteMoveLS s128,
+ X86SchedWriteMoveLS s256,
+ X86SchedWriteMoveLS s512> {
+ X86SchedWriteMoveLS Scl = sScl; // Scalar float/double operations.
+ X86SchedWriteMoveLS MMX = sScl; // MMX operations.
+ X86SchedWriteMoveLS XMM = s128; // XMM operations.
+ X86SchedWriteMoveLS YMM = s256; // YMM operations.
+ X86SchedWriteMoveLS ZMM = s512; // ZMM operations.
+}
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteStoreNT : SchedWrite;
+def WriteMove : SchedWrite;
+
// Arithmetic.
-defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
-defm WriteIMul : X86SchedWritePair; // Integer multiplication.
-def WriteIMulH : SchedWrite; // Integer multiplication, high part.
-defm WriteIDiv : X86SchedWritePair; // Integer division.
-def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteADC : X86SchedWritePair; // Integer ALU + flags op.
+def WriteALURMW : WriteSequence<[WriteALULd, WriteStore]>;
+def WriteADCRMW : WriteSequence<[WriteADCLd, WriteStore]>;
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+defm WriteBSWAP32: X86SchedWritePair; // Byte Order (Endianness) Swap
+defm WriteBSWAP64: X86SchedWritePair; // Byte Order (Endianness) Swap
+
+// Integer division.
+defm WriteDiv8 : X86SchedWritePair;
+defm WriteDiv16 : X86SchedWritePair;
+defm WriteDiv32 : X86SchedWritePair;
+defm WriteDiv64 : X86SchedWritePair;
+defm WriteIDiv8 : X86SchedWritePair;
+defm WriteIDiv16 : X86SchedWritePair;
+defm WriteIDiv32 : X86SchedWritePair;
+defm WriteIDiv64 : X86SchedWritePair;
+
+defm WriteBSF : X86SchedWritePair; // Bit scan forward.
+defm WriteBSR : X86SchedWritePair; // Bit scan reverse.
+defm WritePOPCNT : X86SchedWritePair; // Bit population count.
+defm WriteLZCNT : X86SchedWritePair; // Leading zero count.
+defm WriteTZCNT : X86SchedWritePair; // Trailing zero count.
+defm WriteCMOV : X86SchedWritePair; // Conditional move.
+defm WriteCMOV2 : X86SchedWritePair; // Conditional (CF + ZF flag) move.
+def WriteFCMOV : SchedWrite; // X87 conditional move.
+def WriteSETCC : SchedWrite; // Set register based on condition code.
+def WriteSETCCStore : SchedWrite;
+def WriteLAHFSAHF : SchedWrite; // Load/Store flags in AH.
// Integer shifts and rotates.
defm WriteShift : X86SchedWritePair;
+// Double shift instructions.
+defm WriteShiftDouble : X86SchedWritePair;
-// Loads, stores, and moves, not folded with other operations.
-def WriteLoad : SchedWrite;
-def WriteStore : SchedWrite;
-def WriteMove : SchedWrite;
+// BMI1 BEXTR, BMI2 BZHI
+defm WriteBEXTR : X86SchedWritePair;
+defm WriteBZHI : X86SchedWritePair;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
@@ -63,41 +161,244 @@ def WriteZero : SchedWrite;
defm WriteJump : X86SchedWritePair;
// Floating point. This covers both scalar and vector operations.
-defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
-defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
-defm WriteFDiv : X86SchedWritePair; // Floating point division.
+def WriteFLD0 : SchedWrite;
+def WriteFLD1 : SchedWrite;
+def WriteFLDC : SchedWrite;
+def WriteFLoad : SchedWrite;
+def WriteFLoadX : SchedWrite;
+def WriteFLoadY : SchedWrite;
+def WriteFMaskedLoad : SchedWrite;
+def WriteFMaskedLoadY : SchedWrite;
+def WriteFStore : SchedWrite;
+def WriteFStoreX : SchedWrite;
+def WriteFStoreY : SchedWrite;
+def WriteFStoreNT : SchedWrite;
+def WriteFStoreNTX : SchedWrite;
+def WriteFStoreNTY : SchedWrite;
+def WriteFMaskedStore : SchedWrite;
+def WriteFMaskedStoreY : SchedWrite;
+def WriteFMove : SchedWrite;
+def WriteFMoveX : SchedWrite;
+def WriteFMoveY : SchedWrite;
+
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub.
+defm WriteFAddX : X86SchedWritePair; // Floating point add/sub (XMM).
+defm WriteFAddY : X86SchedWritePair; // Floating point add/sub (YMM).
+defm WriteFAddZ : X86SchedWritePair; // Floating point add/sub (ZMM).
+defm WriteFAdd64 : X86SchedWritePair; // Floating point double add/sub.
+defm WriteFAdd64X : X86SchedWritePair; // Floating point double add/sub (XMM).
+defm WriteFAdd64Y : X86SchedWritePair; // Floating point double add/sub (YMM).
+defm WriteFAdd64Z : X86SchedWritePair; // Floating point double add/sub (ZMM).
+defm WriteFCmp : X86SchedWritePair; // Floating point compare.
+defm WriteFCmpX : X86SchedWritePair; // Floating point compare (XMM).
+defm WriteFCmpY : X86SchedWritePair; // Floating point compare (YMM).
+defm WriteFCmpZ : X86SchedWritePair; // Floating point compare (ZMM).
+defm WriteFCmp64 : X86SchedWritePair; // Floating point double compare.
+defm WriteFCmp64X : X86SchedWritePair; // Floating point double compare (XMM).
+defm WriteFCmp64Y : X86SchedWritePair; // Floating point double compare (YMM).
+defm WriteFCmp64Z : X86SchedWritePair; // Floating point double compare (ZMM).
+defm WriteFCom : X86SchedWritePair; // Floating point compare to flags.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFMulX : X86SchedWritePair; // Floating point multiplication (XMM).
+defm WriteFMulY : X86SchedWritePair; // Floating point multiplication (YMM).
+defm WriteFMulZ : X86SchedWritePair; // Floating point multiplication (ZMM).
+defm WriteFMul64 : X86SchedWritePair; // Floating point double multiplication.
+defm WriteFMul64X : X86SchedWritePair; // Floating point double multiplication (XMM).
+defm WriteFMul64Y : X86SchedWritePair; // Floating point double multiplication (YMM).
+defm WriteFMul64Z : X86SchedWritePair; // Floating point double multiplication (ZMM).
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFDivX : X86SchedWritePair; // Floating point division (XMM).
+defm WriteFDivY : X86SchedWritePair; // Floating point division (YMM).
+defm WriteFDivZ : X86SchedWritePair; // Floating point division (ZMM).
+defm WriteFDiv64 : X86SchedWritePair; // Floating point double division.
+defm WriteFDiv64X : X86SchedWritePair; // Floating point double division (XMM).
+defm WriteFDiv64Y : X86SchedWritePair; // Floating point double division (YMM).
+defm WriteFDiv64Z : X86SchedWritePair; // Floating point double division (ZMM).
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFSqrtX : X86SchedWritePair; // Floating point square root (XMM).
+defm WriteFSqrtY : X86SchedWritePair; // Floating point square root (YMM).
+defm WriteFSqrtZ : X86SchedWritePair; // Floating point square root (ZMM).
+defm WriteFSqrt64 : X86SchedWritePair; // Floating point double square root.
+defm WriteFSqrt64X : X86SchedWritePair; // Floating point double square root (XMM).
+defm WriteFSqrt64Y : X86SchedWritePair; // Floating point double square root (YMM).
+defm WriteFSqrt64Z : X86SchedWritePair; // Floating point double square root (ZMM).
+defm WriteFSqrt80 : X86SchedWritePair; // Floating point long double square root.
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRcpX : X86SchedWritePair; // Floating point reciprocal estimate (XMM).
+defm WriteFRcpY : X86SchedWritePair; // Floating point reciprocal estimate (YMM).
+defm WriteFRcpZ : X86SchedWritePair; // Floating point reciprocal estimate (ZMM).
defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFRsqrtX: X86SchedWritePair; // Floating point reciprocal square root estimate (XMM).
+defm WriteFRsqrtY: X86SchedWritePair; // Floating point reciprocal square root estimate (YMM).
+defm WriteFRsqrtZ: X86SchedWritePair; // Floating point reciprocal square root estimate (ZMM).
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFMAX : X86SchedWritePair; // Fused Multiply Add (XMM).
+defm WriteFMAY : X86SchedWritePair; // Fused Multiply Add (YMM).
+defm WriteFMAZ : X86SchedWritePair; // Fused Multiply Add (ZMM).
+defm WriteDPPD : X86SchedWritePair; // Floating point double dot product.
+defm WriteDPPS : X86SchedWritePair; // Floating point single dot product.
+defm WriteDPPSY : X86SchedWritePair; // Floating point single dot product (YMM).
+defm WriteDPPSZ : X86SchedWritePair; // Floating point single dot product (ZMM).
+defm WriteFSign : X86SchedWritePair; // Floating point fabs/fchs.
+defm WriteFRnd : X86SchedWritePair; // Floating point rounding.
+defm WriteFRndY : X86SchedWritePair; // Floating point rounding (YMM).
+defm WriteFRndZ : X86SchedWritePair; // Floating point rounding (ZMM).
+defm WriteFLogic : X86SchedWritePair; // Floating point and/or/xor logicals.
+defm WriteFLogicY : X86SchedWritePair; // Floating point and/or/xor logicals (YMM).
+defm WriteFLogicZ : X86SchedWritePair; // Floating point and/or/xor logicals (ZMM).
+defm WriteFTest : X86SchedWritePair; // Floating point TEST instructions.
+defm WriteFTestY : X86SchedWritePair; // Floating point TEST instructions (YMM).
+defm WriteFTestZ : X86SchedWritePair; // Floating point TEST instructions (ZMM).
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFShuffleY : X86SchedWritePair; // Floating point vector shuffles (YMM).
+defm WriteFShuffleZ : X86SchedWritePair; // Floating point vector shuffles (ZMM).
+defm WriteFVarShuffle : X86SchedWritePair; // Floating point vector variable shuffles.
+defm WriteFVarShuffleY : X86SchedWritePair; // Floating point vector variable shuffles (YMM).
+defm WriteFVarShuffleZ : X86SchedWritePair; // Floating point vector variable shuffles (ZMM).
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFBlendY : X86SchedWritePair; // Floating point vector blends (YMM).
+defm WriteFBlendZ : X86SchedWritePair; // Floating point vector blends (ZMM).
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
+defm WriteFVarBlendY : X86SchedWritePair; // Fp vector variable blends (YMM).
+defm WriteFVarBlendZ : X86SchedWritePair; // Fp vector variable blends (ZMM).
// FMA Scheduling helper class.
class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Horizontal Add/Sub (float and integer)
defm WriteFHAdd : X86SchedWritePair;
-defm WritePHAdd : X86SchedWritePair;
+defm WriteFHAddY : X86SchedWritePair;
+defm WriteFHAddZ : X86SchedWritePair;
+defm WritePHAdd : X86SchedWritePair;
+defm WritePHAddX : X86SchedWritePair;
+defm WritePHAddY : X86SchedWritePair;
+defm WritePHAddZ : X86SchedWritePair;
// Vector integer operations.
-defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
-defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
-defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+def WriteVecLoad : SchedWrite;
+def WriteVecLoadX : SchedWrite;
+def WriteVecLoadY : SchedWrite;
+def WriteVecLoadNT : SchedWrite;
+def WriteVecLoadNTY : SchedWrite;
+def WriteVecMaskedLoad : SchedWrite;
+def WriteVecMaskedLoadY : SchedWrite;
+def WriteVecStore : SchedWrite;
+def WriteVecStoreX : SchedWrite;
+def WriteVecStoreY : SchedWrite;
+def WriteVecStoreNT : SchedWrite;
+def WriteVecStoreNTY : SchedWrite;
+def WriteVecMaskedStore : SchedWrite;
+def WriteVecMaskedStoreY : SchedWrite;
+def WriteVecMove : SchedWrite;
+def WriteVecMoveX : SchedWrite;
+def WriteVecMoveY : SchedWrite;
+def WriteVecMoveToGpr : SchedWrite;
+def WriteVecMoveFromGpr : SchedWrite;
+
+defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecALUX : X86SchedWritePair; // Vector integer ALU op, no logicals (XMM).
+defm WriteVecALUY : X86SchedWritePair; // Vector integer ALU op, no logicals (YMM).
+defm WriteVecALUZ : X86SchedWritePair; // Vector integer ALU op, no logicals (ZMM).
+defm WriteVecLogic : X86SchedWritePair; // Vector integer and/or/xor logicals.
+defm WriteVecLogicX : X86SchedWritePair; // Vector integer and/or/xor logicals (XMM).
+defm WriteVecLogicY : X86SchedWritePair; // Vector integer and/or/xor logicals (YMM).
+defm WriteVecLogicZ : X86SchedWritePair; // Vector integer and/or/xor logicals (ZMM).
+defm WriteVecTest : X86SchedWritePair; // Vector integer TEST instructions.
+defm WriteVecTestY : X86SchedWritePair; // Vector integer TEST instructions (YMM).
+defm WriteVecTestZ : X86SchedWritePair; // Vector integer TEST instructions (ZMM).
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts (default).
+defm WriteVecShiftX : X86SchedWritePair; // Vector integer shifts (XMM).
+defm WriteVecShiftY : X86SchedWritePair; // Vector integer shifts (YMM).
+defm WriteVecShiftZ : X86SchedWritePair; // Vector integer shifts (ZMM).
+defm WriteVecShiftImm : X86SchedWritePair; // Vector integer immediate shifts (default).
+defm WriteVecShiftImmX: X86SchedWritePair; // Vector integer immediate shifts (XMM).
+defm WriteVecShiftImmY: X86SchedWritePair; // Vector integer immediate shifts (YMM).
+defm WriteVecShiftImmZ: X86SchedWritePair; // Vector integer immediate shifts (ZMM).
+defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply (default).
+defm WriteVecIMulX : X86SchedWritePair; // Vector integer multiply (XMM).
+defm WriteVecIMulY : X86SchedWritePair; // Vector integer multiply (YMM).
+defm WriteVecIMulZ : X86SchedWritePair; // Vector integer multiply (ZMM).
+defm WritePMULLD : X86SchedWritePair; // Vector PMULLD.
+defm WritePMULLDY : X86SchedWritePair; // Vector PMULLD (YMM).
+defm WritePMULLDZ : X86SchedWritePair; // Vector PMULLD (ZMM).
defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
+defm WriteShuffleX : X86SchedWritePair; // Vector shuffles (XMM).
+defm WriteShuffleY : X86SchedWritePair; // Vector shuffles (YMM).
+defm WriteShuffleZ : X86SchedWritePair; // Vector shuffles (ZMM).
+defm WriteVarShuffle : X86SchedWritePair; // Vector variable shuffles.
+defm WriteVarShuffleX : X86SchedWritePair; // Vector variable shuffles (XMM).
+defm WriteVarShuffleY : X86SchedWritePair; // Vector variable shuffles (YMM).
+defm WriteVarShuffleZ : X86SchedWritePair; // Vector variable shuffles (ZMM).
defm WriteBlend : X86SchedWritePair; // Vector blends.
+defm WriteBlendY : X86SchedWritePair; // Vector blends (YMM).
+defm WriteBlendZ : X86SchedWritePair; // Vector blends (ZMM).
defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
-defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
-
-// Vector bitwise operations.
-// These are often used on both floating point and integer vectors.
-defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+defm WriteVarBlendY : X86SchedWritePair; // Vector variable blends (YMM).
+defm WriteVarBlendZ : X86SchedWritePair; // Vector variable blends (ZMM).
+defm WritePSADBW : X86SchedWritePair; // Vector PSADBW.
+defm WritePSADBWX : X86SchedWritePair; // Vector PSADBW (XMM).
+defm WritePSADBWY : X86SchedWritePair; // Vector PSADBW (YMM).
+defm WritePSADBWZ : X86SchedWritePair; // Vector PSADBW (ZMM).
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+defm WriteMPSADY : X86SchedWritePair; // Vector MPSAD (YMM).
+defm WriteMPSADZ : X86SchedWritePair; // Vector MPSAD (ZMM).
+defm WritePHMINPOS : X86SchedWritePair; // Vector PHMINPOS.
+
+// Vector insert/extract operations.
+defm WriteVecInsert : X86SchedWritePair; // Insert gpr to vector element.
+def WriteVecExtract : SchedWrite; // Extract vector element to gpr.
+def WriteVecExtractSt : SchedWrite; // Extract vector element and store.
+
+// MOVMSK operations.
+def WriteFMOVMSK : SchedWrite;
+def WriteVecMOVMSK : SchedWrite;
+def WriteVecMOVMSKY : SchedWrite;
+def WriteMMXMOVMSK : SchedWrite;
// Conversion between integer and float.
-defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
-defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
-defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+defm WriteCvtSD2I : X86SchedWritePair; // Double -> Integer.
+defm WriteCvtPD2I : X86SchedWritePair; // Double -> Integer (XMM).
+defm WriteCvtPD2IY : X86SchedWritePair; // Double -> Integer (YMM).
+defm WriteCvtPD2IZ : X86SchedWritePair; // Double -> Integer (ZMM).
+
+defm WriteCvtSS2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtPS2I : X86SchedWritePair; // Float -> Integer (XMM).
+defm WriteCvtPS2IY : X86SchedWritePair; // Float -> Integer (YMM).
+defm WriteCvtPS2IZ : X86SchedWritePair; // Float -> Integer (ZMM).
+
+defm WriteCvtI2SD : X86SchedWritePair; // Integer -> Double.
+defm WriteCvtI2PD : X86SchedWritePair; // Integer -> Double (XMM).
+defm WriteCvtI2PDY : X86SchedWritePair; // Integer -> Double (YMM).
+defm WriteCvtI2PDZ : X86SchedWritePair; // Integer -> Double (ZMM).
+
+defm WriteCvtI2SS : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtI2PS : X86SchedWritePair; // Integer -> Float (XMM).
+defm WriteCvtI2PSY : X86SchedWritePair; // Integer -> Float (YMM).
+defm WriteCvtI2PSZ : X86SchedWritePair; // Integer -> Float (ZMM).
+
+defm WriteCvtSS2SD : X86SchedWritePair; // Float -> Double size conversion.
+defm WriteCvtPS2PD : X86SchedWritePair; // Float -> Double size conversion (XMM).
+defm WriteCvtPS2PDY : X86SchedWritePair; // Float -> Double size conversion (YMM).
+defm WriteCvtPS2PDZ : X86SchedWritePair; // Float -> Double size conversion (ZMM).
+
+defm WriteCvtSD2SS : X86SchedWritePair; // Double -> Float size conversion.
+defm WriteCvtPD2PS : X86SchedWritePair; // Double -> Float size conversion (XMM).
+defm WriteCvtPD2PSY : X86SchedWritePair; // Double -> Float size conversion (YMM).
+defm WriteCvtPD2PSZ : X86SchedWritePair; // Double -> Float size conversion (ZMM).
+
+defm WriteCvtPH2PS : X86SchedWritePair; // Half -> Float size conversion.
+defm WriteCvtPH2PSY : X86SchedWritePair; // Half -> Float size conversion (YMM).
+defm WriteCvtPH2PSZ : X86SchedWritePair; // Half -> Float size conversion (ZMM).
+
+def WriteCvtPS2PH : SchedWrite; // Float -> Half size conversion.
+def WriteCvtPS2PHY : SchedWrite; // Float -> Half size conversion (YMM).
+def WriteCvtPS2PHZ : SchedWrite; // Float -> Half size conversion (ZMM).
+def WriteCvtPS2PHSt : SchedWrite; // Float -> Half + store size conversion.
+def WriteCvtPS2PHYSt : SchedWrite; // Float -> Half + store size conversion (YMM).
+def WriteCvtPS2PHZSt : SchedWrite; // Float -> Half + store size conversion (ZMM).
+
+// CRC32 instruction.
+defm WriteCRC32 : X86SchedWritePair;
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
@@ -117,13 +418,24 @@ defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
// Carry-less multiplication instructions.
defm WriteCLMul : X86SchedWritePair;
+// EMMS/FEMMS
+def WriteEMMS : SchedWrite;
+
+// Load/store MXCSR
+def WriteLDMXCSR : SchedWrite;
+def WriteSTMXCSR : SchedWrite;
+
// Catch-all for expensive system instructions.
def WriteSystem : SchedWrite;
// AVX2.
defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteFVarShuffle256 : X86SchedWritePair; // Fp 256-bit width variable shuffles.
defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
-defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+defm WriteVarShuffle256 : X86SchedWritePair; // 256-bit width vector variable shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+defm WriteVarVecShiftY : X86SchedWritePair; // Variable vector shifts (YMM).
+defm WriteVarVecShiftZ : X86SchedWritePair; // Variable vector shifts (ZMM).
// Old microcoded instructions that nobody uses.
def WriteMicrocoded : SchedWrite;
@@ -134,529 +446,182 @@ def WriteFence : SchedWrite;
// Nop, not very useful except it provides a model for nops!
def WriteNop : SchedWrite;
-//===----------------------------------------------------------------------===//
-// Instruction Itinerary classes used for X86
-def IIC_ALU_MEM : InstrItinClass;
-def IIC_ALU_NONMEM : InstrItinClass;
-def IIC_LEA : InstrItinClass;
-def IIC_LEA_16 : InstrItinClass;
-def IIC_MUL8 : InstrItinClass;
-def IIC_MUL16_MEM : InstrItinClass;
-def IIC_MUL16_REG : InstrItinClass;
-def IIC_MUL32_MEM : InstrItinClass;
-def IIC_MUL32_REG : InstrItinClass;
-def IIC_MUL64 : InstrItinClass;
-// imul by al, ax, eax, tax
-def IIC_IMUL8 : InstrItinClass;
-def IIC_IMUL16_MEM : InstrItinClass;
-def IIC_IMUL16_REG : InstrItinClass;
-def IIC_IMUL32_MEM : InstrItinClass;
-def IIC_IMUL32_REG : InstrItinClass;
-def IIC_IMUL64 : InstrItinClass;
-// imul reg by reg|mem
-def IIC_IMUL16_RM : InstrItinClass;
-def IIC_IMUL16_RR : InstrItinClass;
-def IIC_IMUL32_RM : InstrItinClass;
-def IIC_IMUL32_RR : InstrItinClass;
-def IIC_IMUL64_RM : InstrItinClass;
-def IIC_IMUL64_RR : InstrItinClass;
-// imul reg = reg/mem * imm
-def IIC_IMUL16_RMI : InstrItinClass;
-def IIC_IMUL16_RRI : InstrItinClass;
-def IIC_IMUL32_RMI : InstrItinClass;
-def IIC_IMUL32_RRI : InstrItinClass;
-def IIC_IMUL64_RMI : InstrItinClass;
-def IIC_IMUL64_RRI : InstrItinClass;
-// div
-def IIC_DIV8_MEM : InstrItinClass;
-def IIC_DIV8_REG : InstrItinClass;
-def IIC_DIV16 : InstrItinClass;
-def IIC_DIV32 : InstrItinClass;
-def IIC_DIV64 : InstrItinClass;
-// idiv
-def IIC_IDIV8 : InstrItinClass;
-def IIC_IDIV16 : InstrItinClass;
-def IIC_IDIV32 : InstrItinClass;
-def IIC_IDIV64 : InstrItinClass;
-// neg/not/inc/dec
-def IIC_UNARY_REG : InstrItinClass;
-def IIC_UNARY_MEM : InstrItinClass;
-// add/sub/and/or/xor/sbc/cmp/test
-def IIC_BIN_MEM : InstrItinClass;
-def IIC_BIN_NONMEM : InstrItinClass;
-// adc/sbc
-def IIC_BIN_CARRY_MEM : InstrItinClass;
-def IIC_BIN_CARRY_NONMEM : InstrItinClass;
-// shift/rotate
-def IIC_SR : InstrItinClass;
-// shift double
-def IIC_SHD16_REG_IM : InstrItinClass;
-def IIC_SHD16_REG_CL : InstrItinClass;
-def IIC_SHD16_MEM_IM : InstrItinClass;
-def IIC_SHD16_MEM_CL : InstrItinClass;
-def IIC_SHD32_REG_IM : InstrItinClass;
-def IIC_SHD32_REG_CL : InstrItinClass;
-def IIC_SHD32_MEM_IM : InstrItinClass;
-def IIC_SHD32_MEM_CL : InstrItinClass;
-def IIC_SHD64_REG_IM : InstrItinClass;
-def IIC_SHD64_REG_CL : InstrItinClass;
-def IIC_SHD64_MEM_IM : InstrItinClass;
-def IIC_SHD64_MEM_CL : InstrItinClass;
-// cmov
-def IIC_CMOV16_RM : InstrItinClass;
-def IIC_CMOV16_RR : InstrItinClass;
-def IIC_CMOV32_RM : InstrItinClass;
-def IIC_CMOV32_RR : InstrItinClass;
-def IIC_CMOV64_RM : InstrItinClass;
-def IIC_CMOV64_RR : InstrItinClass;
-// set
-def IIC_SET_R : InstrItinClass;
-def IIC_SET_M : InstrItinClass;
-// jmp/jcc/jcxz
-def IIC_Jcc : InstrItinClass;
-def IIC_JCXZ : InstrItinClass;
-def IIC_JMP_REL : InstrItinClass;
-def IIC_JMP_REG : InstrItinClass;
-def IIC_JMP_MEM : InstrItinClass;
-def IIC_JMP_FAR_MEM : InstrItinClass;
-def IIC_JMP_FAR_PTR : InstrItinClass;
-// loop
-def IIC_LOOP : InstrItinClass;
-def IIC_LOOPE : InstrItinClass;
-def IIC_LOOPNE : InstrItinClass;
-// call
-def IIC_CALL_RI : InstrItinClass;
-def IIC_CALL_MEM : InstrItinClass;
-def IIC_CALL_FAR_MEM : InstrItinClass;
-def IIC_CALL_FAR_PTR : InstrItinClass;
-// ret
-def IIC_RET : InstrItinClass;
-def IIC_RET_IMM : InstrItinClass;
-//sign extension movs
-def IIC_MOVSX : InstrItinClass;
-def IIC_MOVSX_R16_R8 : InstrItinClass;
-def IIC_MOVSX_R16_M8 : InstrItinClass;
-def IIC_MOVSX_R16_R16 : InstrItinClass;
-def IIC_MOVSX_R32_R32 : InstrItinClass;
-//zero extension movs
-def IIC_MOVZX : InstrItinClass;
-def IIC_MOVZX_R16_R8 : InstrItinClass;
-def IIC_MOVZX_R16_M8 : InstrItinClass;
-
-def IIC_REP_MOVS : InstrItinClass;
-def IIC_REP_STOS : InstrItinClass;
-
-// SSE scalar/parallel binary operations
-def IIC_SSE_ALU_F32S_RR : InstrItinClass;
-def IIC_SSE_ALU_F32S_RM : InstrItinClass;
-def IIC_SSE_ALU_F64S_RR : InstrItinClass;
-def IIC_SSE_ALU_F64S_RM : InstrItinClass;
-def IIC_SSE_MUL_F32S_RR : InstrItinClass;
-def IIC_SSE_MUL_F32S_RM : InstrItinClass;
-def IIC_SSE_MUL_F64S_RR : InstrItinClass;
-def IIC_SSE_MUL_F64S_RM : InstrItinClass;
-def IIC_SSE_DIV_F32S_RR : InstrItinClass;
-def IIC_SSE_DIV_F32S_RM : InstrItinClass;
-def IIC_SSE_DIV_F64S_RR : InstrItinClass;
-def IIC_SSE_DIV_F64S_RM : InstrItinClass;
-def IIC_SSE_ALU_F32P_RR : InstrItinClass;
-def IIC_SSE_ALU_F32P_RM : InstrItinClass;
-def IIC_SSE_ALU_F64P_RR : InstrItinClass;
-def IIC_SSE_ALU_F64P_RM : InstrItinClass;
-def IIC_SSE_MUL_F32P_RR : InstrItinClass;
-def IIC_SSE_MUL_F32P_RM : InstrItinClass;
-def IIC_SSE_MUL_F64P_RR : InstrItinClass;
-def IIC_SSE_MUL_F64P_RM : InstrItinClass;
-def IIC_SSE_DIV_F32P_RR : InstrItinClass;
-def IIC_SSE_DIV_F32P_RM : InstrItinClass;
-def IIC_SSE_DIV_F64P_RR : InstrItinClass;
-def IIC_SSE_DIV_F64P_RM : InstrItinClass;
-
-def IIC_SSE_COMIS_RR : InstrItinClass;
-def IIC_SSE_COMIS_RM : InstrItinClass;
-
-def IIC_SSE_HADDSUB_RR : InstrItinClass;
-def IIC_SSE_HADDSUB_RM : InstrItinClass;
-
-def IIC_SSE_BIT_P_RR : InstrItinClass;
-def IIC_SSE_BIT_P_RM : InstrItinClass;
-
-def IIC_SSE_INTALU_P_RR : InstrItinClass;
-def IIC_SSE_INTALU_P_RM : InstrItinClass;
-def IIC_SSE_INTALUQ_P_RR : InstrItinClass;
-def IIC_SSE_INTALUQ_P_RM : InstrItinClass;
-
-def IIC_SSE_INTMUL_P_RR : InstrItinClass;
-def IIC_SSE_INTMUL_P_RM : InstrItinClass;
-
-def IIC_SSE_INTSH_P_RR : InstrItinClass;
-def IIC_SSE_INTSH_P_RM : InstrItinClass;
-def IIC_SSE_INTSH_P_RI : InstrItinClass;
-
-def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
-
-def IIC_SSE_SHUFP : InstrItinClass;
-def IIC_SSE_PSHUF_RI : InstrItinClass;
-def IIC_SSE_PSHUF_MI : InstrItinClass;
-
-def IIC_SSE_PACK : InstrItinClass;
-def IIC_SSE_UNPCK : InstrItinClass;
-
-def IIC_SSE_MOVMSK : InstrItinClass;
-def IIC_SSE_MASKMOV : InstrItinClass;
-
-def IIC_SSE_PEXTRW : InstrItinClass;
-def IIC_SSE_PINSRW : InstrItinClass;
-
-def IIC_SSE_PABS_RR : InstrItinClass;
-def IIC_SSE_PABS_RM : InstrItinClass;
-
-def IIC_SSE_SQRTPS_RR : InstrItinClass;
-def IIC_SSE_SQRTPS_RM : InstrItinClass;
-def IIC_SSE_SQRTSS_RR : InstrItinClass;
-def IIC_SSE_SQRTSS_RM : InstrItinClass;
-def IIC_SSE_SQRTPD_RR : InstrItinClass;
-def IIC_SSE_SQRTPD_RM : InstrItinClass;
-def IIC_SSE_SQRTSD_RR : InstrItinClass;
-def IIC_SSE_SQRTSD_RM : InstrItinClass;
-
-def IIC_SSE_RSQRTPS_RR : InstrItinClass;
-def IIC_SSE_RSQRTPS_RM : InstrItinClass;
-def IIC_SSE_RSQRTSS_RR : InstrItinClass;
-def IIC_SSE_RSQRTSS_RM : InstrItinClass;
-
-def IIC_SSE_RCPP_RR : InstrItinClass;
-def IIC_SSE_RCPP_RM : InstrItinClass;
-def IIC_SSE_RCPS_RR : InstrItinClass;
-def IIC_SSE_RCPS_RM : InstrItinClass;
-
-def IIC_SSE_MOV_S_RR : InstrItinClass;
-def IIC_SSE_MOV_S_RM : InstrItinClass;
-def IIC_SSE_MOV_S_MR : InstrItinClass;
-
-def IIC_SSE_MOVA_P_RR : InstrItinClass;
-def IIC_SSE_MOVA_P_RM : InstrItinClass;
-def IIC_SSE_MOVA_P_MR : InstrItinClass;
-
-def IIC_SSE_MOVU_P_RR : InstrItinClass;
-def IIC_SSE_MOVU_P_RM : InstrItinClass;
-def IIC_SSE_MOVU_P_MR : InstrItinClass;
-
-def IIC_SSE_MOVDQ : InstrItinClass;
-def IIC_SSE_MOVD_ToGP : InstrItinClass;
-def IIC_SSE_MOVQ_RR : InstrItinClass;
-
-def IIC_SSE_MOV_LH : InstrItinClass;
-
-def IIC_SSE_LDDQU : InstrItinClass;
-
-def IIC_SSE_MOVNT : InstrItinClass;
-
-def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
-def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
-def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
-def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
-def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
-def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
-def IIC_SSE_PSHUFB_RR : InstrItinClass;
-def IIC_SSE_PSHUFB_RM : InstrItinClass;
-def IIC_SSE_PSIGN_RR : InstrItinClass;
-def IIC_SSE_PSIGN_RM : InstrItinClass;
-
-def IIC_SSE_PMADD : InstrItinClass;
-def IIC_SSE_PMULHRSW : InstrItinClass;
-def IIC_SSE_PALIGNRR : InstrItinClass;
-def IIC_SSE_PALIGNRM : InstrItinClass;
-def IIC_SSE_MWAIT : InstrItinClass;
-def IIC_SSE_MONITOR : InstrItinClass;
-def IIC_SSE_MWAITX : InstrItinClass;
-def IIC_SSE_MONITORX : InstrItinClass;
-def IIC_SSE_CLZERO : InstrItinClass;
-
-def IIC_SSE_PREFETCH : InstrItinClass;
-def IIC_SSE_PAUSE : InstrItinClass;
-def IIC_SSE_LFENCE : InstrItinClass;
-def IIC_SSE_MFENCE : InstrItinClass;
-def IIC_SSE_SFENCE : InstrItinClass;
-def IIC_SSE_LDMXCSR : InstrItinClass;
-def IIC_SSE_STMXCSR : InstrItinClass;
-
-def IIC_SSE_CVT_PD_RR : InstrItinClass;
-def IIC_SSE_CVT_PD_RM : InstrItinClass;
-def IIC_SSE_CVT_PS_RR : InstrItinClass;
-def IIC_SSE_CVT_PS_RM : InstrItinClass;
-def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
-def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
-def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
-def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
-def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
-def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
-def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
-def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
-
-def IIC_AVX_ZERO : InstrItinClass;
-
-// MMX
-def IIC_MMX_MOV_MM_RM : InstrItinClass;
-def IIC_MMX_MOV_REG_MM : InstrItinClass;
-def IIC_MMX_MOVQ_RM : InstrItinClass;
-def IIC_MMX_MOVQ_RR : InstrItinClass;
-
-def IIC_MMX_ALU_RM : InstrItinClass;
-def IIC_MMX_ALU_RR : InstrItinClass;
-def IIC_MMX_ALUQ_RM : InstrItinClass;
-def IIC_MMX_ALUQ_RR : InstrItinClass;
-def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
-def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
-def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
-def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
-def IIC_MMX_PMUL : InstrItinClass;
-def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
-def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
-def IIC_MMX_PSADBW : InstrItinClass;
-def IIC_MMX_SHIFT_RI : InstrItinClass;
-def IIC_MMX_SHIFT_RM : InstrItinClass;
-def IIC_MMX_SHIFT_RR : InstrItinClass;
-def IIC_MMX_UNPCK_H_RM : InstrItinClass;
-def IIC_MMX_UNPCK_H_RR : InstrItinClass;
-def IIC_MMX_UNPCK_L : InstrItinClass;
-def IIC_MMX_PCK_RM : InstrItinClass;
-def IIC_MMX_PCK_RR : InstrItinClass;
-def IIC_MMX_PSHUF : InstrItinClass;
-def IIC_MMX_PEXTR : InstrItinClass;
-def IIC_MMX_PINSRW : InstrItinClass;
-def IIC_MMX_MASKMOV : InstrItinClass;
-def IIC_MMX_MOVMSK : InstrItinClass;
-def IIC_MMX_CVT_PD_RR : InstrItinClass;
-def IIC_MMX_CVT_PD_RM : InstrItinClass;
-def IIC_MMX_CVT_PS_RR : InstrItinClass;
-def IIC_MMX_CVT_PS_RM : InstrItinClass;
-
-def IIC_3DNOW_FALU_RM : InstrItinClass;
-def IIC_3DNOW_FALU_RR : InstrItinClass;
-def IIC_3DNOW_FCVT_F2I_RM : InstrItinClass;
-def IIC_3DNOW_FCVT_F2I_RR : InstrItinClass;
-def IIC_3DNOW_FCVT_I2F_RM : InstrItinClass;
-def IIC_3DNOW_FCVT_I2F_RR : InstrItinClass;
-def IIC_3DNOW_MISC_FUNC_REG : InstrItinClass;
-def IIC_3DNOW_MISC_FUNC_MEM : InstrItinClass;
-
-def IIC_CMPX_LOCK : InstrItinClass;
-def IIC_CMPX_LOCK_8 : InstrItinClass;
-def IIC_CMPX_LOCK_8B : InstrItinClass;
-def IIC_CMPX_LOCK_16B : InstrItinClass;
-
-def IIC_XADD_LOCK_MEM : InstrItinClass;
-def IIC_XADD_LOCK_MEM8 : InstrItinClass;
-
-def IIC_FCMOV : InstrItinClass;
-def IIC_FILD : InstrItinClass;
-def IIC_FLD : InstrItinClass;
-def IIC_FLD80 : InstrItinClass;
-def IIC_FST : InstrItinClass;
-def IIC_FST80 : InstrItinClass;
-def IIC_FIST : InstrItinClass;
-def IIC_FLDZ : InstrItinClass;
-def IIC_FUCOM : InstrItinClass;
-def IIC_FUCOMI : InstrItinClass;
-def IIC_FCOMI : InstrItinClass;
-def IIC_FNSTSW : InstrItinClass;
-def IIC_FNSTCW : InstrItinClass;
-def IIC_FLDCW : InstrItinClass;
-def IIC_FNINIT : InstrItinClass;
-def IIC_FFREE : InstrItinClass;
-def IIC_FNCLEX : InstrItinClass;
-def IIC_WAIT : InstrItinClass;
-def IIC_FXAM : InstrItinClass;
-def IIC_FNOP : InstrItinClass;
-def IIC_FLDL : InstrItinClass;
-def IIC_F2XM1 : InstrItinClass;
-def IIC_FYL2X : InstrItinClass;
-def IIC_FPTAN : InstrItinClass;
-def IIC_FPATAN : InstrItinClass;
-def IIC_FXTRACT : InstrItinClass;
-def IIC_FPREM1 : InstrItinClass;
-def IIC_FPSTP : InstrItinClass;
-def IIC_FPREM : InstrItinClass;
-def IIC_FSIGN : InstrItinClass;
-def IIC_FSQRT : InstrItinClass;
-def IIC_FYL2XP1 : InstrItinClass;
-def IIC_FSINCOS : InstrItinClass;
-def IIC_FRNDINT : InstrItinClass;
-def IIC_FSCALE : InstrItinClass;
-def IIC_FCOMPP : InstrItinClass;
-def IIC_FXSAVE : InstrItinClass;
-def IIC_FXRSTOR : InstrItinClass;
-
-def IIC_FXCH : InstrItinClass;
-
-// System instructions
-def IIC_CPUID : InstrItinClass;
-def IIC_INT : InstrItinClass;
-def IIC_INT3 : InstrItinClass;
-def IIC_INVD : InstrItinClass;
-def IIC_INVLPG : InstrItinClass;
-def IIC_INVPCID : InstrItinClass;
-def IIC_IRET : InstrItinClass;
-def IIC_HLT : InstrItinClass;
-def IIC_LXS : InstrItinClass;
-def IIC_LTR : InstrItinClass;
-def IIC_MPX : InstrItinClass;
-def IIC_PKU : InstrItinClass;
-def IIC_PTWRITE : InstrItinClass;
-def IIC_RDPID : InstrItinClass;
-def IIC_RDRAND : InstrItinClass;
-def IIC_RDSEED : InstrItinClass;
-def IIC_RDTSC : InstrItinClass;
-def IIC_RDTSCP : InstrItinClass;
-def IIC_RSM : InstrItinClass;
-def IIC_SIDT : InstrItinClass;
-def IIC_SGDT : InstrItinClass;
-def IIC_SLDT : InstrItinClass;
-def IIC_SMAP : InstrItinClass;
-def IIC_SMX : InstrItinClass;
-def IIC_STR : InstrItinClass;
-def IIC_SKINIT : InstrItinClass;
-def IIC_SVM : InstrItinClass;
-def IIC_VMX : InstrItinClass;
-def IIC_CLGI : InstrItinClass;
-def IIC_STGI : InstrItinClass;
-def IIC_SWAPGS : InstrItinClass;
-def IIC_SYSCALL : InstrItinClass;
-def IIC_SYS_ENTER_EXIT : InstrItinClass;
-def IIC_IN_RR : InstrItinClass;
-def IIC_IN_RI : InstrItinClass;
-def IIC_OUT_RR : InstrItinClass;
-def IIC_OUT_IR : InstrItinClass;
-def IIC_INS : InstrItinClass;
-def IIC_LWP : InstrItinClass;
-def IIC_MOV_REG_DR : InstrItinClass;
-def IIC_MOV_DR_REG : InstrItinClass;
-def IIC_MOV_REG_CR : InstrItinClass;
-def IIC_MOV_CR_REG : InstrItinClass;
-def IIC_MOV_REG_SR : InstrItinClass;
-def IIC_MOV_MEM_SR : InstrItinClass;
-def IIC_MOV_SR_REG : InstrItinClass;
-def IIC_MOV_SR_MEM : InstrItinClass;
-def IIC_LAR_RM : InstrItinClass;
-def IIC_LAR_RR : InstrItinClass;
-def IIC_LSL_RM : InstrItinClass;
-def IIC_LSL_RR : InstrItinClass;
-def IIC_LGDT : InstrItinClass;
-def IIC_LIDT : InstrItinClass;
-def IIC_LLDT_REG : InstrItinClass;
-def IIC_LLDT_MEM : InstrItinClass;
-def IIC_PUSH_CS : InstrItinClass;
-def IIC_PUSH_SR : InstrItinClass;
-def IIC_POP_SR : InstrItinClass;
-def IIC_POP_SR_SS : InstrItinClass;
-def IIC_SEGMENT_BASE_R : InstrItinClass;
-def IIC_SEGMENT_BASE_W : InstrItinClass;
-def IIC_VERR : InstrItinClass;
-def IIC_VERW_REG : InstrItinClass;
-def IIC_VERW_MEM : InstrItinClass;
-def IIC_WRMSR : InstrItinClass;
-def IIC_RDMSR : InstrItinClass;
-def IIC_RDPMC : InstrItinClass;
-def IIC_SMSW : InstrItinClass;
-def IIC_LMSW_REG : InstrItinClass;
-def IIC_LMSW_MEM : InstrItinClass;
-def IIC_ENTER : InstrItinClass;
-def IIC_LEAVE : InstrItinClass;
-def IIC_POP_MEM : InstrItinClass;
-def IIC_POP_REG16 : InstrItinClass;
-def IIC_POP_REG : InstrItinClass;
-def IIC_POP_F : InstrItinClass;
-def IIC_POP_FD : InstrItinClass;
-def IIC_POP_A : InstrItinClass;
-def IIC_PUSH_IMM : InstrItinClass;
-def IIC_PUSH_MEM : InstrItinClass;
-def IIC_PUSH_REG : InstrItinClass;
-def IIC_PUSH_F : InstrItinClass;
-def IIC_PUSH_A : InstrItinClass;
-def IIC_BSWAP : InstrItinClass;
-def IIC_BIT_SCAN_MEM : InstrItinClass;
-def IIC_BIT_SCAN_REG : InstrItinClass;
-def IIC_LZCNT_RR : InstrItinClass;
-def IIC_LZCNT_RM : InstrItinClass;
-def IIC_TZCNT_RR : InstrItinClass;
-def IIC_TZCNT_RM : InstrItinClass;
-def IIC_MOVS : InstrItinClass;
-def IIC_STOS : InstrItinClass;
-def IIC_SCAS : InstrItinClass;
-def IIC_CMPS : InstrItinClass;
-def IIC_MOV : InstrItinClass;
-def IIC_MOV_MEM : InstrItinClass;
-def IIC_AHF : InstrItinClass;
-def IIC_BT_MI : InstrItinClass;
-def IIC_BT_MR : InstrItinClass;
-def IIC_BT_RI : InstrItinClass;
-def IIC_BT_RR : InstrItinClass;
-def IIC_BTX_MI : InstrItinClass;
-def IIC_BTX_MR : InstrItinClass;
-def IIC_BTX_RI : InstrItinClass;
-def IIC_BTX_RR : InstrItinClass;
-def IIC_XCHG_REG : InstrItinClass;
-def IIC_XCHG_MEM : InstrItinClass;
-def IIC_XADD_REG : InstrItinClass;
-def IIC_XADD_MEM : InstrItinClass;
-def IIC_CMPXCHG_MEM : InstrItinClass;
-def IIC_CMPXCHG_REG : InstrItinClass;
-def IIC_CMPXCHG_MEM8 : InstrItinClass;
-def IIC_CMPXCHG_REG8 : InstrItinClass;
-def IIC_CMPXCHG_8B : InstrItinClass;
-def IIC_CMPXCHG_16B : InstrItinClass;
-def IIC_LODS : InstrItinClass;
-def IIC_OUTS : InstrItinClass;
-def IIC_CLC_CMC_STC : InstrItinClass;
-def IIC_CLD : InstrItinClass;
-def IIC_CLI : InstrItinClass;
-def IIC_CLTS : InstrItinClass;
-def IIC_STI : InstrItinClass;
-def IIC_STD : InstrItinClass;
-def IIC_XLAT : InstrItinClass;
-def IIC_AAA : InstrItinClass;
-def IIC_AAD : InstrItinClass;
-def IIC_AAM : InstrItinClass;
-def IIC_AAS : InstrItinClass;
-def IIC_DAA : InstrItinClass;
-def IIC_DAS : InstrItinClass;
-def IIC_BOUND : InstrItinClass;
-def IIC_ARPL_REG : InstrItinClass;
-def IIC_ARPL_MEM : InstrItinClass;
-def IIC_MOVBE : InstrItinClass;
-def IIC_AES : InstrItinClass;
-def IIC_BLEND_MEM : InstrItinClass;
-def IIC_BLEND_NOMEM : InstrItinClass;
-def IIC_CBW : InstrItinClass;
-def IIC_CRC32_REG : InstrItinClass;
-def IIC_CRC32_MEM : InstrItinClass;
-def IIC_SSE_DPPD_RR : InstrItinClass;
-def IIC_SSE_DPPD_RM : InstrItinClass;
-def IIC_SSE_DPPS_RR : InstrItinClass;
-def IIC_SSE_DPPS_RM : InstrItinClass;
-def IIC_MMX_EMMS : InstrItinClass;
-def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
-def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
-def IIC_SSE_INSERTPS_RR : InstrItinClass;
-def IIC_SSE_INSERTPS_RM : InstrItinClass;
-def IIC_SSE_MPSADBW_RR : InstrItinClass;
-def IIC_SSE_MPSADBW_RM : InstrItinClass;
-def IIC_SSE_PMULLD_RR : InstrItinClass;
-def IIC_SSE_PMULLD_RM : InstrItinClass;
-def IIC_SSE_ROUNDPS_REG : InstrItinClass;
-def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
-def IIC_SSE_ROUNDPD_REG : InstrItinClass;
-def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
-def IIC_SSE_POPCNT_RR : InstrItinClass;
-def IIC_SSE_POPCNT_RM : InstrItinClass;
-def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
-def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
-
-def IIC_NOP : InstrItinClass;
+// Move/Load/Store wrappers.
+def WriteFMoveLS
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>;
+def WriteFMoveLSX
+ : X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>;
+def WriteFMoveLSY
+ : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>;
+def SchedWriteFMoveLS
+ : X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX,
+ WriteFMoveLSY, WriteFMoveLSY>;
+
+def WriteFMoveLSNT
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNT>;
+def WriteFMoveLSNTX
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNTX>;
+def WriteFMoveLSNTY
+ : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreNTY>;
+def SchedWriteFMoveLSNT
+ : X86SchedWriteMoveLSWidths<WriteFMoveLSNT, WriteFMoveLSNTX,
+ WriteFMoveLSNTY, WriteFMoveLSNTY>;
+
+def WriteVecMoveLS
+ : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoad, WriteVecStore>;
+def WriteVecMoveLSX
+ : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadX, WriteVecStoreX>;
+def WriteVecMoveLSY
+ : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadY, WriteVecStoreY>;
+def SchedWriteVecMoveLS
+ : X86SchedWriteMoveLSWidths<WriteVecMoveLS, WriteVecMoveLSX,
+ WriteVecMoveLSY, WriteVecMoveLSY>;
+
+def WriteVecMoveLSNT
+ : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoadNT, WriteVecStoreNT>;
+def WriteVecMoveLSNTX
+ : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadNT, WriteVecStoreNT>;
+def WriteVecMoveLSNTY
+ : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadNTY, WriteVecStoreNTY>;
+def SchedWriteVecMoveLSNT
+ : X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX,
+ WriteVecMoveLSNTY, WriteVecMoveLSNTY>;
+
+// Vector width wrappers.
+def SchedWriteFAdd
+ : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>;
+def SchedWriteFAdd64
+ : X86SchedWriteWidths<WriteFAdd64, WriteFAdd64X, WriteFAdd64Y, WriteFAdd64Z>;
+def SchedWriteFHAdd
+ : X86SchedWriteWidths<WriteFHAdd, WriteFHAdd, WriteFHAddY, WriteFHAddZ>;
+def SchedWriteFCmp
+ : X86SchedWriteWidths<WriteFCmp, WriteFCmpX, WriteFCmpY, WriteFCmpZ>;
+def SchedWriteFCmp64
+ : X86SchedWriteWidths<WriteFCmp64, WriteFCmp64X, WriteFCmp64Y, WriteFCmp64Z>;
+def SchedWriteFMul
+ : X86SchedWriteWidths<WriteFMul, WriteFMulX, WriteFMulY, WriteFMulZ>;
+def SchedWriteFMul64
+ : X86SchedWriteWidths<WriteFMul64, WriteFMul64X, WriteFMul64Y, WriteFMul64Z>;
+def SchedWriteFMA
+ : X86SchedWriteWidths<WriteFMA, WriteFMAX, WriteFMAY, WriteFMAZ>;
+def SchedWriteDPPD
+ : X86SchedWriteWidths<WriteDPPD, WriteDPPD, WriteDPPD, WriteDPPD>;
+def SchedWriteDPPS
+ : X86SchedWriteWidths<WriteDPPS, WriteDPPS, WriteDPPSY, WriteDPPSZ>;
+def SchedWriteFDiv
+ : X86SchedWriteWidths<WriteFDiv, WriteFDivX, WriteFDivY, WriteFDivZ>;
+def SchedWriteFDiv64
+ : X86SchedWriteWidths<WriteFDiv64, WriteFDiv64X, WriteFDiv64Y, WriteFDiv64Z>;
+def SchedWriteFSqrt
+ : X86SchedWriteWidths<WriteFSqrt, WriteFSqrtX,
+ WriteFSqrtY, WriteFSqrtZ>;
+def SchedWriteFSqrt64
+ : X86SchedWriteWidths<WriteFSqrt64, WriteFSqrt64X,
+ WriteFSqrt64Y, WriteFSqrt64Z>;
+def SchedWriteFRcp
+ : X86SchedWriteWidths<WriteFRcp, WriteFRcpX, WriteFRcpY, WriteFRcpZ>;
+def SchedWriteFRsqrt
+ : X86SchedWriteWidths<WriteFRsqrt, WriteFRsqrtX, WriteFRsqrtY, WriteFRsqrtZ>;
+def SchedWriteFRnd
+ : X86SchedWriteWidths<WriteFRnd, WriteFRnd, WriteFRndY, WriteFRndZ>;
+def SchedWriteFLogic
+ : X86SchedWriteWidths<WriteFLogic, WriteFLogic, WriteFLogicY, WriteFLogicZ>;
+def SchedWriteFTest
+ : X86SchedWriteWidths<WriteFTest, WriteFTest, WriteFTestY, WriteFTestZ>;
+
+def SchedWriteFShuffle
+ : X86SchedWriteWidths<WriteFShuffle, WriteFShuffle,
+ WriteFShuffleY, WriteFShuffleZ>;
+def SchedWriteFVarShuffle
+ : X86SchedWriteWidths<WriteFVarShuffle, WriteFVarShuffle,
+ WriteFVarShuffleY, WriteFVarShuffleZ>;
+def SchedWriteFBlend
+ : X86SchedWriteWidths<WriteFBlend, WriteFBlend, WriteFBlendY, WriteFBlendZ>;
+def SchedWriteFVarBlend
+ : X86SchedWriteWidths<WriteFVarBlend, WriteFVarBlend,
+ WriteFVarBlendY, WriteFVarBlendZ>;
+
+def SchedWriteCvtDQ2PD
+ : X86SchedWriteWidths<WriteCvtI2SD, WriteCvtI2PD,
+ WriteCvtI2PDY, WriteCvtI2PDZ>;
+def SchedWriteCvtDQ2PS
+ : X86SchedWriteWidths<WriteCvtI2SS, WriteCvtI2PS,
+ WriteCvtI2PSY, WriteCvtI2PSZ>;
+def SchedWriteCvtPD2DQ
+ : X86SchedWriteWidths<WriteCvtSD2I, WriteCvtPD2I,
+ WriteCvtPD2IY, WriteCvtPD2IZ>;
+def SchedWriteCvtPS2DQ
+ : X86SchedWriteWidths<WriteCvtSS2I, WriteCvtPS2I,
+ WriteCvtPS2IY, WriteCvtPS2IZ>;
+def SchedWriteCvtPS2PD
+ : X86SchedWriteWidths<WriteCvtSS2SD, WriteCvtPS2PD,
+ WriteCvtPS2PDY, WriteCvtPS2PDZ>;
+def SchedWriteCvtPD2PS
+ : X86SchedWriteWidths<WriteCvtSD2SS, WriteCvtPD2PS,
+ WriteCvtPD2PSY, WriteCvtPD2PSZ>;
+
+def SchedWriteVecALU
+ : X86SchedWriteWidths<WriteVecALU, WriteVecALUX, WriteVecALUY, WriteVecALUZ>;
+def SchedWritePHAdd
+ : X86SchedWriteWidths<WritePHAdd, WritePHAddX, WritePHAddY, WritePHAddZ>;
+def SchedWriteVecLogic
+ : X86SchedWriteWidths<WriteVecLogic, WriteVecLogicX,
+ WriteVecLogicY, WriteVecLogicZ>;
+def SchedWriteVecTest
+ : X86SchedWriteWidths<WriteVecTest, WriteVecTest,
+ WriteVecTestY, WriteVecTestZ>;
+def SchedWriteVecShift
+ : X86SchedWriteWidths<WriteVecShift, WriteVecShiftX,
+ WriteVecShiftY, WriteVecShiftZ>;
+def SchedWriteVecShiftImm
+ : X86SchedWriteWidths<WriteVecShiftImm, WriteVecShiftImmX,
+ WriteVecShiftImmY, WriteVecShiftImmZ>;
+def SchedWriteVarVecShift
+ : X86SchedWriteWidths<WriteVarVecShift, WriteVarVecShift,
+ WriteVarVecShiftY, WriteVarVecShiftZ>;
+def SchedWriteVecIMul
+ : X86SchedWriteWidths<WriteVecIMul, WriteVecIMulX,
+ WriteVecIMulY, WriteVecIMulZ>;
+def SchedWritePMULLD
+ : X86SchedWriteWidths<WritePMULLD, WritePMULLD,
+ WritePMULLDY, WritePMULLDZ>;
+def SchedWriteMPSAD
+ : X86SchedWriteWidths<WriteMPSAD, WriteMPSAD,
+ WriteMPSADY, WriteMPSADZ>;
+def SchedWritePSADBW
+ : X86SchedWriteWidths<WritePSADBW, WritePSADBWX,
+ WritePSADBWY, WritePSADBWZ>;
+
+def SchedWriteShuffle
+ : X86SchedWriteWidths<WriteShuffle, WriteShuffleX,
+ WriteShuffleY, WriteShuffleZ>;
+def SchedWriteVarShuffle
+ : X86SchedWriteWidths<WriteVarShuffle, WriteVarShuffleX,
+ WriteVarShuffleY, WriteVarShuffleZ>;
+def SchedWriteBlend
+ : X86SchedWriteWidths<WriteBlend, WriteBlend, WriteBlendY, WriteBlendZ>;
+def SchedWriteVarBlend
+ : X86SchedWriteWidths<WriteVarBlend, WriteVarBlend,
+ WriteVarBlendY, WriteVarBlendZ>;
+
+// Vector size wrappers.
+def SchedWriteFAddSizes
+ : X86SchedWriteSizes<SchedWriteFAdd, SchedWriteFAdd64>;
+def SchedWriteFCmpSizes
+ : X86SchedWriteSizes<SchedWriteFCmp, SchedWriteFCmp64>;
+def SchedWriteFMulSizes
+ : X86SchedWriteSizes<SchedWriteFMul, SchedWriteFMul64>;
+def SchedWriteFDivSizes
+ : X86SchedWriteSizes<SchedWriteFDiv, SchedWriteFDiv64>;
+def SchedWriteFSqrtSizes
+ : X86SchedWriteSizes<SchedWriteFSqrt, SchedWriteFSqrt64>;
+def SchedWriteFLogicSizes
+ : X86SchedWriteSizes<SchedWriteFLogic, SchedWriteFLogic>;
+def SchedWriteFShuffleSizes
+ : X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle>;
//===----------------------------------------------------------------------===//
-// Processor instruction itineraries.
+// Generic Processor Scheduler Models.
// IssueWidth is analogous to the number of decode units. Core and its
// descendants, including Nehalem and SandyBridge, have 4 decoders.
@@ -673,7 +638,7 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
-// The GenericX86Model contains no instruction itineraries
+// The GenericX86Model contains no instruction schedules
// and disables PostRAScheduler.
class GenericX86Model : SchedMachineModel {
let IssueWidth = 4;
@@ -690,4 +655,3 @@ def GenericModel : GenericX86Model;
def GenericPostRAModel : GenericX86Model {
let PostRAScheduler = 1;
}
-
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
index 460b9823a7e7..d1e902e6c43f 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the itinerary class data for the Intel Atom
+// This file defines the schedule class data for the Intel Atom
// in-order (Saltwell-32nm/Bonnell-45nm) processors.
//
//===----------------------------------------------------------------------===//
@@ -15,540 +15,907 @@
//
// Scheduling information derived from the "Intel 64 and IA32 Architectures
// Optimization Reference Manual", Chapter 13, Section 4.
-// Functional Units
-// Port 0
-def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
- // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide
-def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
- // SIMD/FP: SIMD ALU, FP Adder
-
-def AtomItineraries : ProcessorItineraries<
- [ Port0, Port1 ],
- [], [
- // P0 only
- // InstrItinData<class, [InstrStage<N, [P0]>] >,
- // P0 or P1
- // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
- // P0 and P1
- // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >,
- //
- // Default is 1 cycle, port0 or port1
- InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
- // mul
- InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
- // imul by al, ax, eax, rax
- InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
- // imul reg by reg|mem
- InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
- // imul reg = reg/mem * imm
- InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
- // idiv
- InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
- // div
- InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
- // neg/not/inc/dec
- InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
- // add/sub/and/or/xor/cmp/test
- InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
- // adc/sbc
- InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
- // shift/rotate
- InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
- // shift double
- InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
- // cmov
- InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
- // set
- InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
- // jcc
- InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
- // jcxz/jecxz/jrcxz
- InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
- // jmp rel
- InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
- // jmp indirect
- InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
- // jmp far
- InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
- InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
- // loop/loope/loopne
- InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
- // call - all but reg/imm
- InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
- InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
- InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
- //ret
- InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
- InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
- //sign extension movs
- InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >,
- InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
- //zero extension movs
- InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
-
- InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
- InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
-
- // SSE binary operations
- // arithmetic fp scalar
- InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
-
- // arithmetic fp parallel
- InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
-
- // bitwise parallel
- InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
-
- // arithmetic int parallel
- InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
-
- // multiply int parallel
- InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
-
- // shift parallel
- InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PACK, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
-
- InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
- InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
- InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
-
- InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
-
- InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
-
- // conversions
- // to/from PD ...
- InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
- // to/from PS except to/from PD and PS2PI
- InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
-
- // MMX MOVs
- InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
- InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
- // other MMX
- InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
- InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MMX_MOVMSK, [InstrStage<3, [Port0]>] >,
- // conversions
- // from/to PD
- InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
- // from/to PI
- InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
- InstrStage<5, [Port1]>]>,
-
- InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
-
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >,
-
- InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
- InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
-
- InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
-
- InstrItinData<IIC_FCMOV, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
- InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
- InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
- InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
- InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
- InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
- InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
- InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
- InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
- InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
- InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
- InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
- InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
- InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
- InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
- InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
- InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
- InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FSIGN, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_FSQRT, [InstrStage<71, [Port0, Port1]>] >,
-
- // System instructions
- InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
- InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
- InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
- InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
- InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
- InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
- InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
- InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
- InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
- InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
- InstrItinData<IIC_RDTSCP, [InstrStage<30, [Port0, Port1]>] >,
- InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
- InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
- InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
- InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
-
- InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >,
- InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >,
- InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
- InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
- InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >,
-
- InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
- // worst case for mov REG_CRx
- InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
-
- InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
- // LAR
- InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >,
- InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >,
- // LSL
- InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >,
- InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >,
-
- InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
- InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
- InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
- InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
- // push control register, segment registers
- InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
- // pop control register, segment registers
- InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
- // VERR, VERW
- InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >,
- InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
- InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
- // WRMSR, RDMSR
- InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
- InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
- InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
- // SMSW, LMSW
- InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
- InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
-
- InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
- InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
-
- InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
- InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
-
- InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
- InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
-
- InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
- InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
- InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
- InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
- InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
- InstrItinData<IIC_CLC_CMC_STC, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
- InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
- InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
- InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
- InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
- InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
- InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
- InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
- InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
- InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
- InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
- InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
- InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
- InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
- InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
-
- InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
- ]>;
// Atom machine model.
def AtomModel : SchedMachineModel {
let IssueWidth = 2; // Allows 2 instructions per scheduling group.
let MicroOpBufferSize = 0; // In-order execution, always hide latency.
- let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
- let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+ let LoadLatency = 3; // Expected cycles, may be overridden.
+ let HighLatency = 30; // Expected, may be overridden.
// On the Atom, the throughput for taken branches is 2 cycles. For small
// simple loops, expand by a small factor to hide the backedge cost.
let LoopMicroOpBufferSize = 10;
let PostRAScheduler = 1;
let CompleteModel = 0;
+}
+
+let SchedModel = AtomModel in {
+
+// Functional Units
+def AtomPort0 : ProcResource<1>; // ALU: ALU0, shift/rotate, load/store
+ // SIMD/FP: SIMD ALU, Shuffle, SIMD/FP multiply, divide
+def AtomPort1 : ProcResource<1>; // ALU: ALU1, bit processing, jump, and LEA
+ // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomPort01 : ProcResGroup<[AtomPort0, AtomPort1]>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
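A rough illustration of the effect (a sketch, not something this patch adds): for a
load-folding instruction form the folded load itself takes 3 cycles, so the register
source tagged ReadAfterLd can arrive late without penalty, e.g.

  addl (%rdi), %eax   # the ReadAfterLd operand %eax may be produced up to
                      # 3 cycles after the add issues without stalling it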
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass AtomWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> RRPorts,
+ list<ProcResourceKind> RMPorts,
+ int RRLat = 1, int RMLat = 1,
+ list<int> RRRes = [1],
+ list<int> RMRes = [1]> {
+ // Register variant: RRLat cycles of latency, occupying RRPorts for RRRes cycles.
+ def : WriteRes<SchedRW, RRPorts> {
+ let Latency = RRLat;
+ let ResourceCycles = RRRes;
+ }
+
+ // Memory variant: RMLat cycles of latency, occupying RMPorts for RMRes
+ // cycles.
+ def : WriteRes<SchedRW.Folded, RMPorts> {
+ let Latency = RMLat;
+ let ResourceCycles = RMRes;
+ }
+}
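As a sketch of how this is used (assuming WriteALU's folded-load counterpart is the
WriteALULd class from X86Schedule.td), the WriteALU defm a few lines below expands to
roughly:

  def : WriteRes<WriteALU,   [AtomPort01]> { let Latency = 1; let ResourceCycles = [1]; }
  def : WriteRes<WriteALULd, [AtomPort0]>  { let Latency = 1; let ResourceCycles = [1]; }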
+
+// A folded store needs a cycle on Port0 for the store data.
+def : WriteRes<WriteRMW, [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteALU, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteADC, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteIMul, [AtomPort01], [AtomPort01], 7, 7, [7], [7]>;
+defm : AtomWriteResPair<WriteIMul64, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
+
+defm : AtomWriteResPair<WriteBSWAP32, [AtomPort0], [AtomPort0]>;
+defm : AtomWriteResPair<WriteBSWAP64, [AtomPort0], [AtomPort0]>;
+
+defm : AtomWriteResPair<WriteDiv8, [AtomPort01], [AtomPort01], 50, 68, [50], [68]>;
+defm : AtomWriteResPair<WriteDiv16, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv32, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+defm : AtomWriteResPair<WriteIDiv8, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv16, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv32, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+
+defm : X86WriteResPairUnsupported<WriteCRC32>;
+
+defm : AtomWriteResPair<WriteCMOV, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteCMOV2, [AtomPort01], [AtomPort0]>;
+defm : X86WriteRes<WriteFCMOV, [AtomPort01], 9, [9], 1>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [AtomPort01]>;
+def : WriteRes<WriteSETCCStore, [AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteLAHFSAHF, [AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+defm : X86WriteResUnsupported<WriteIMulH>;
+
+// This is for simple LEAs with one or two input operands.
+def : WriteRes<WriteLEA, [AtomPort1]>;
+
+def AtomWriteIMul16Ld : SchedWriteRes<[AtomPort01]> {
+ let Latency = 8;
+ let ResourceCycles = [8];
+}
+def : InstRW<[AtomWriteIMul16Ld], (instrs MUL16m, IMUL16m)>;
+
+def AtomWriteIMul32 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 6;
+ let ResourceCycles = [6];
+}
+def : InstRW<[AtomWriteIMul32], (instrs MUL32r, IMUL32r)>;
+
+def AtomWriteIMul64I : SchedWriteRes<[AtomPort01]> {
+ let Latency = 14;
+ let ResourceCycles = [14];
+}
+def : InstRW<[AtomWriteIMul64I], (instrs IMUL64rri8, IMUL64rri32,
+ IMUL64rmi8, IMUL64rmi32)>;
+
+// Bit counts.
+defm : AtomWriteResPair<WriteBSF, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
+defm : AtomWriteResPair<WriteBSR, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
+defm : X86WriteResPairUnsupported<WritePOPCNT>;
+defm : X86WriteResPairUnsupported<WriteLZCNT>;
+defm : X86WriteResPairUnsupported<WriteTZCNT>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : X86WriteResPairUnsupported<WriteBEXTR>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteShift, [AtomPort0], [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Double shift instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteShiftDouble, [AtomPort0], [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad, [AtomPort0]>;
+def : WriteRes<WriteStore, [AtomPort0]>;
+def : WriteRes<WriteStoreNT, [AtomPort0]>;
+def : WriteRes<WriteMove, [AtomPort01]>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteJump, [AtomPort1], [AtomPort1]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteSystem, [AtomPort01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [AtomPort01]> { let Latency = 100; }
+def : WriteRes<WriteFence, [AtomPort0]>;
+
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [AtomPort01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteRes<WriteFLD0, [AtomPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [AtomPort01], 6, [6], 1>;
+def : WriteRes<WriteFLoad, [AtomPort0]>;
+def : WriteRes<WriteFLoadX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFLoadY>;
+defm : X86WriteResUnsupported<WriteFMaskedLoad>;
+defm : X86WriteResUnsupported<WriteFMaskedLoadY>;
+
+def : WriteRes<WriteFStore, [AtomPort0]>;
+def : WriteRes<WriteFStoreX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFStoreY>;
+def : WriteRes<WriteFStoreNT, [AtomPort0]>;
+def : WriteRes<WriteFStoreNTX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFStoreNTY>;
+defm : X86WriteResUnsupported<WriteFMaskedStore>;
+defm : X86WriteResUnsupported<WriteFMaskedStoreY>;
+
+def : WriteRes<WriteFMove, [AtomPort01]>;
+def : WriteRes<WriteFMoveX, [AtomPort01]>;
+defm : X86WriteResUnsupported<WriteFMoveY>;
+
+defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>;
+
+defm : AtomWriteResPair<WriteFAdd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFAddX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFAddY>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : AtomWriteResPair<WriteFAdd64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFAdd64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Y>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : AtomWriteResPair<WriteFCmp, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFCmpX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFCmpY>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : AtomWriteResPair<WriteFCmp64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Y>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFMulY>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : AtomWriteResPair<WriteFMul64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFMul64X, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Y>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : AtomWriteResPair<WriteFRcp, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFRcpX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFRcpY>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : AtomWriteResPair<WriteFRsqrt, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFRsqrtX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtY>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : AtomWriteResPair<WriteFDiv, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
+defm : AtomWriteResPair<WriteFDivX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : X86WriteResPairUnsupported<WriteFDivY>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : AtomWriteResPair<WriteFDiv64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteFDiv64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Y>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : AtomWriteResPair<WriteFSqrt, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
+defm : AtomWriteResPair<WriteFSqrtX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : X86WriteResPairUnsupported<WriteFSqrtY>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : AtomWriteResPair<WriteFSqrt64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteFSqrt64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Y>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : AtomWriteResPair<WriteFSqrt80, [AtomPort01], [AtomPort01], 71, 71, [71], [71]>;
+defm : AtomWriteResPair<WriteFSign, [AtomPort1], [AtomPort1]>;
+defm : AtomWriteResPair<WriteFRnd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFRndY>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : AtomWriteResPair<WriteFLogic, [AtomPort01], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFLogicY>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : AtomWriteResPair<WriteFTest, [AtomPort01], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFTestY>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : AtomWriteResPair<WriteFShuffle, [AtomPort0], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFShuffleY>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : X86WriteResPairUnsupported<WriteDPPD>;
+defm : X86WriteResPairUnsupported<WriteDPPS>;
+defm : X86WriteResPairUnsupported<WriteDPPSY>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : X86WriteResPairUnsupported<WriteFBlend>;
+defm : X86WriteResPairUnsupported<WriteFBlendY>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFVarBlend>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFShuffle256>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IY>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IY>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDY>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDY>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PH>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector integer operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteVecLoad, [AtomPort0]>;
+def : WriteRes<WriteVecLoadX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecLoadY>;
+def : WriteRes<WriteVecLoadNT, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecLoadNTY>;
+defm : X86WriteResUnsupported<WriteVecMaskedLoad>;
+defm : X86WriteResUnsupported<WriteVecMaskedLoadY>;
+
+def : WriteRes<WriteVecStore, [AtomPort0]>;
+def : WriteRes<WriteVecStoreX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecStoreY>;
+def : WriteRes<WriteVecStoreNT, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecStoreNTY>;
+def : WriteRes<WriteVecMaskedStore, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecMaskedStoreY>;
+
+def : WriteRes<WriteVecMove, [AtomPort0]>;
+def : WriteRes<WriteVecMoveX, [AtomPort01]>;
+defm : X86WriteResUnsupported<WriteVecMoveY>;
+defm : X86WriteRes<WriteVecMoveToGpr, [AtomPort0], 3, [3], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [AtomPort0], 1, [1], 1>;
+
+defm : AtomWriteResPair<WriteVecALU, [AtomPort01], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVecALUX, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : AtomWriteResPair<WriteVecLogic, [AtomPort01], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVecLogicX, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : AtomWriteResPair<WriteVecTest, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestY>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : AtomWriteResPair<WriteVecShift, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
+defm : AtomWriteResPair<WriteVecShiftX, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : AtomWriteResPair<WriteVecShiftImm, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
+defm : AtomWriteResPair<WriteVecShiftImmX, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : AtomWriteResPair<WriteVecIMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteVecIMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : X86WriteResPairUnsupported<WritePMULLD>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : X86WriteResPairUnsupported<WritePHMINPOS>;
+defm : X86WriteResPairUnsupported<WriteMPSAD>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : AtomWriteResPair<WritePSADBW, [AtomPort01], [AtomPort01], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WritePSADBWX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : AtomWriteResPair<WriteShuffle, [AtomPort0], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteShuffleX, [AtomPort0], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleY>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : AtomWriteResPair<WriteVarShuffle, [AtomPort0], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort01], [AtomPort01], 4, 5, [4], [5]>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteBlend>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : X86WriteResPairUnsupported<WriteVarBlend>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarVecShift>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector insert/extract operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteVecInsert, [AtomPort0], [AtomPort0], 1, 1>;
+def : WriteRes<WriteVecExtract, [AtomPort0]>;
+def : WriteRes<WriteVecExtractSt, [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE42 String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WritePCmpIStrI>;
+defm : X86WriteResPairUnsupported<WritePCmpIStrM>;
+defm : X86WriteResPairUnsupported<WritePCmpEStrI>;
+defm : X86WriteResPairUnsupported<WritePCmpEStrM>;
+
+////////////////////////////////////////////////////////////////////////////////
+// MOVMSK Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteFMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+def : WriteRes<WriteVecMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+def : WriteRes<WriteMMXMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+
+////////////////////////////////////////////////////////////////////////////////
+// AES instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WriteAESIMC>;
+defm : X86WriteResPairUnsupported<WriteAESKeyGen>;
+defm : X86WriteResPairUnsupported<WriteAESDecEnc>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteFHAdd, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteFHAddY, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WritePHAdd, [AtomPort01], [AtomPort01], 3, 4, [3], [4]>;
+defm : AtomWriteResPair<WritePHAddX, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WritePHAddY, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WriteCLMul>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Load/store MXCSR.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLDMXCSR, [AtomPort01]> { let Latency = 5; let ResourceCycles = [5]; }
+def : WriteRes<WriteSTMXCSR, [AtomPort01]> { let Latency = 15; let ResourceCycles = [15]; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Special Cases.
+////////////////////////////////////////////////////////////////////////////////
+
+// Port0
+def AtomWrite0_1 : SchedWriteRes<[AtomPort0]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[AtomWrite0_1], (instrs FXAM, LD_Frr,
+ MOVSX64rr32)>;
+def : SchedAlias<WriteALURMW, AtomWrite0_1>;
+def : SchedAlias<WriteADCRMW, AtomWrite0_1>;
+def : InstRW<[AtomWrite0_1], (instregex "(RCL|RCR|ROL|ROR|SAR|SHL|SHR)(8|16|32|64)m",
+ "MOV(S|Z)X(32|64)rr(8|8_NOREX|16)")>;
+
+def AtomWrite0_5 : SchedWriteRes<[AtomPort0]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite0_5], (instregex "IMUL32(rm|rr)")>;
+
+// Port1
+def AtomWrite1_1 : SchedWriteRes<[AtomPort1]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[AtomWrite1_1], (instrs FCOMPP)>;
+def : InstRW<[AtomWrite1_1], (instregex "UCOM_F(P|PP)?r",
+ "BT(C|R|S)?(16|32|64)(rr|ri8)")>;
+
+def AtomWrite1_5 : SchedWriteRes<[AtomPort1]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSirr, MMX_CVTPI2PSirm,
+ MMX_CVTPS2PIirr, MMX_CVTTPS2PIirr)>;
+
+// Port0 and Port1
+def AtomWrite0_1_1 : SchedWriteRes<[AtomPort0, AtomPort1]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[AtomWrite0_1_1], (instrs POP32r, POP64r,
+ POP16rmr, POP32rmr, POP64rmr,
+ PUSH16r, PUSH32r, PUSH64r,
+ PUSHi16, PUSHi32,
+ PUSH16rmr, PUSH32rmr, PUSH64rmr,
+ PUSH16i8, PUSH32i8, PUSH64i8, PUSH64i32,
+ XCH_F)>;
+def : InstRW<[AtomWrite0_1_1], (instregex "RETI(L|Q|W)$",
+ "IRET(16|32|64)?")>;
+
+def AtomWrite0_1_5 : SchedWriteRes<[AtomPort0, AtomPort1]> {
+ let Latency = 5;
+ let ResourceCycles = [5, 5];
+}
+def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIirm, MMX_CVTTPS2PIirm)>;
+def : InstRW<[AtomWrite0_1_5], (instregex "ILD_F(16|32|64)")>;
- let Itineraries = AtomItineraries;
+// Port0 or Port1
+def AtomWrite01_1 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
}
+def : InstRW<[AtomWrite01_1], (instrs FDECSTP, FFREE, FFREEP, FINCSTP, WAIT,
+ LFENCE,
+ STOSB, STOSL, STOSQ, STOSW,
+ MOVSSrr, MOVSSrr_REV,
+ PSLLDQri, PSRLDQri)>;
+def : InstRW<[AtomWrite01_1], (instregex "MMX_PACK(SSDW|SSWB|USWB)irr",
+ "MMX_PUNPCKH(BW|DQ|WD)irr")>;
+
+def AtomWrite01_2 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r,
+ PUSH16rmm, PUSH32rmm, PUSH64rmm,
+ LODSB, LODSL, LODSQ, LODSW,
+ SCASB, SCASL, SCASQ, SCASW,
+ SHLD32rrCL, SHRD32rrCL,
+ SHLD32rri8, SHRD32rri8)>;
+def : InstRW<[AtomWrite01_2], (instregex "BT(C|R|S)(16|32|64)mi8",
+ "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)",
+ "XADD(8|16|32|64)rr",
+ "XCHG(8|16|32|64)(ar|rr)",
+ "(ST|ISTT)_F(P)?(16|32|64)?(m|rr)",
+ "MMX_P(ADD|SUB)Qirr",
+ "MOV(S|Z)X16rr8",
+ "MOV(UPS|UPD|DQU)mr",
+ "MASKMOVDQU(64)?",
+ "P(ADD|SUB)Qrr")>;
+
+def AtomWrite01_3 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+}
+def : InstRW<[AtomWrite01_3], (instrs CLD, LDDQUrm,
+ CMPSB, CMPSL, CMPSQ, CMPSW,
+ MOVSB, MOVSL, MOVSQ, MOVSW,
+ POP16rmm, POP32rmm, POP64rmm)>;
+def : InstRW<[AtomWrite01_3], (instregex "XADD(8|16|32|64)rm",
+ "XCHG(8|16|32|64)rm",
+ "PH(ADD|SUB)Drr",
+ "MOV(S|Z)X16rm8",
+ "MMX_P(ADD|SUB)Qirm",
+ "MOV(UPS|UPD|DQU)rm",
+ "P(ADD|SUB)Qrm")>;
+
+def AtomWrite01_4 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 4;
+ let ResourceCycles = [4];
+}
+def : InstRW<[AtomWrite01_4], (instrs CBW, CWD, CWDE, CDQ, CDQE, CQO,
+ JCXZ, JECXZ, JRCXZ,
+ SHLD32mrCL, SHRD32mrCL,
+ SHLD32mri8, SHRD32mri8,
+ LD_F80m)>;
+def : InstRW<[AtomWrite01_4], (instregex "PH(ADD|SUB)Drm",
+ "(MMX_)?PEXTRWrr(_REV)?")>;
+
+def AtomWrite01_5 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite01_5], (instrs FLDCW16m, ST_FP80m)>;
+def : InstRW<[AtomWrite01_5], (instregex "MMX_PH(ADD|SUB)S?Wrr")>;
+
+def AtomWrite01_6 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 6;
+ let ResourceCycles = [6];
+}
+def : InstRW<[AtomWrite01_6], (instrs CMPXCHG8rm, INTO, XLAT,
+ SHLD16rrCL, SHRD16rrCL,
+ SHLD16rri8, SHRD16rri8,
+ SHLD16mrCL, SHRD16mrCL,
+ SHLD16mri8, SHRD16mri8)>;
+def : InstRW<[AtomWrite01_6], (instregex "IMUL16rr",
+ "IST_F(P)?(16|32|64)?m",
+ "MMX_PH(ADD|SUB)S?Wrm")>;
+
+def AtomWrite01_7 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 7;
+ let ResourceCycles = [7];
+}
+def : InstRW<[AtomWrite01_7], (instrs AAD8i8)>;
+
+def AtomWrite01_8 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 8;
+ let ResourceCycles = [8];
+}
+def : InstRW<[AtomWrite01_8], (instrs LOOPE,
+ PUSHA16, PUSHA32,
+ SHLD64rrCL, SHRD64rrCL,
+ FNSTCW16m)>;
+
+def AtomWrite01_9 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 9;
+ let ResourceCycles = [9];
+}
+def : InstRW<[AtomWrite01_9], (instrs BT16mr, BT32mr, BT64mr,
+ POPA16, POPA32,
+ PUSHF16, PUSHF32, PUSHF64,
+ SHLD64mrCL, SHRD64mrCL,
+ SHLD64mri8, SHRD64mri8,
+ SHLD64rri8, SHRD64rri8,
+ CMPXCHG8rr)>;
+def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F",
+ "(U)?COMIS(D|S)rr",
+ "CVT(T)?SS2SI64rr(_Int)?")>;
+
+def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : SchedAlias<WriteFLDC, AtomWrite01_10>;
+def : InstRW<[AtomWrite01_10], (instregex "(U)?COMIS(D|S)rm",
+ "CVT(T)?SS2SI64rm(_Int)?")>;
+
+def AtomWrite01_11 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 11;
+ let ResourceCycles = [11];
+}
+def : InstRW<[AtomWrite01_11], (instrs BOUNDS16rm, BOUNDS32rm)>;
+def : InstRW<[AtomWrite01_11], (instregex "BT(C|R|S)(16|32|64)mr")>;
+
+def AtomWrite01_13 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : InstRW<[AtomWrite01_13], (instrs AAA, AAS)>;
+
+def AtomWrite01_14 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 14;
+ let ResourceCycles = [14];
+}
+def : InstRW<[AtomWrite01_14], (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
+
+def AtomWrite01_15 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 15;
+ let ResourceCycles = [15];
+}
+def : InstRW<[AtomWrite01_15], (instrs CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;
+
+def AtomWrite01_17 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : InstRW<[AtomWrite01_17], (instrs LOOPNE, PAUSE)>;
+
+def AtomWrite01_18 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 18;
+ let ResourceCycles = [18];
+}
+def : InstRW<[AtomWrite01_18], (instrs CMPXCHG8B, DAA, LOOP)>;
+
+def AtomWrite01_20 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 20;
+ let ResourceCycles = [20];
+}
+def : InstRW<[AtomWrite01_20], (instrs DAS)>;
+
+def AtomWrite01_21 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 21;
+ let ResourceCycles = [21];
+}
+def : InstRW<[AtomWrite01_21], (instrs AAM8i8, STD)>;
+
+def AtomWrite01_22 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 22;
+ let ResourceCycles = [22];
+}
+def : InstRW<[AtomWrite01_22], (instrs CMPXCHG16B)>;
+
+def AtomWrite01_23 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 23;
+ let ResourceCycles = [23];
+}
+def : InstRW<[AtomWrite01_23], (instrs ARPL16mr, ARPL16rr)>;
+
+def AtomWrite01_25 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 25;
+ let ResourceCycles = [25];
+}
+def : InstRW<[AtomWrite01_25], (instrs FNCLEX, FXTRACT)>;
+
+def AtomWrite01_26 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 26;
+ let ResourceCycles = [26];
+}
+def : InstRW<[AtomWrite01_26], (instrs POPF32, POPF64)>;
+
+def AtomWrite01_29 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 29;
+ let ResourceCycles = [29];
+}
+def : InstRW<[AtomWrite01_29], (instregex "POP(DS|ES|FS|GS)(16|32|64)")>;
+
+def AtomWrite01_30 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 30;
+ let ResourceCycles = [30];
+}
+def : InstRW<[AtomWrite01_30], (instrs RDTSC, RDTSCP)>;
+
+def AtomWrite01_32 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 32;
+ let ResourceCycles = [32];
+}
+def : InstRW<[AtomWrite01_32], (instrs ENTER, POPF16)>;
+
+def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 45;
+ let ResourceCycles = [45];
+}
+def : InstRW<[AtomWrite01_45], (instrs MONITORrrr)>;
+
+def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 46;
+ let ResourceCycles = [46];
+}
+def : InstRW<[AtomWrite01_46], (instrs FRNDINT, MWAITrr, RDPMC)>;
+
+def AtomWrite01_48 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 48;
+ let ResourceCycles = [48];
+}
+def : InstRW<[AtomWrite01_48], (instrs POPSS16, POPSS32)>;
+
+def AtomWrite01_55 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 55;
+ let ResourceCycles = [55];
+}
+def : InstRW<[AtomWrite01_55], (instrs FPREM)>;
+
+def AtomWrite01_59 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 59;
+ let ResourceCycles = [59];
+}
+def : InstRW<[AtomWrite01_59], (instrs INSB, INSL, INSW)>;
+
+def AtomWrite01_63 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 63;
+ let ResourceCycles = [63];
+}
+def : InstRW<[AtomWrite01_63], (instrs FNINIT)>;
+
+def AtomWrite01_68 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 68;
+ let ResourceCycles = [68];
+}
+def : InstRW<[AtomWrite01_68], (instrs OUT8rr, OUT16rr, OUT32rr)>;
+
+def AtomWrite01_71 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 71;
+ let ResourceCycles = [71];
+}
+def : InstRW<[AtomWrite01_71], (instrs FPREM1,
+ INVLPG, INVLPGA32, INVLPGA64)>;
+
+def AtomWrite01_72 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 72;
+ let ResourceCycles = [72];
+}
+def : InstRW<[AtomWrite01_72], (instrs OUT8ir, OUT16ir, OUT32ir)>;
+
+def AtomWrite01_74 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 74;
+ let ResourceCycles = [74];
+}
+def : InstRW<[AtomWrite01_74], (instrs OUTSB, OUTSL, OUTSW)>;
+
+def AtomWrite01_77 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 77;
+ let ResourceCycles = [77];
+}
+def : InstRW<[AtomWrite01_77], (instrs FSCALE)>;
+
+def AtomWrite01_78 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 78;
+ let ResourceCycles = [78];
+}
+def : InstRW<[AtomWrite01_78], (instrs RDMSR)>;
+
+def AtomWrite01_79 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 79;
+ let ResourceCycles = [79];
+}
+def : InstRW<[AtomWrite01_79], (instregex "RET(L|Q|W)?$",
+ "LRETI?(L|Q|W)")>;
+
+def AtomWrite01_92 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 92;
+ let ResourceCycles = [92];
+}
+def : InstRW<[AtomWrite01_92], (instrs IN8ri, IN16ri, IN32ri)>;
+
+def AtomWrite01_94 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 94;
+ let ResourceCycles = [94];
+}
+def : InstRW<[AtomWrite01_94], (instrs IN8rr, IN16rr, IN32rr)>;
+
+def AtomWrite01_99 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 99;
+ let ResourceCycles = [99];
+}
+def : InstRW<[AtomWrite01_99], (instrs F2XM1)>;
+
+def AtomWrite01_121 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 121;
+ let ResourceCycles = [121];
+}
+def : InstRW<[AtomWrite01_121], (instrs CPUID)>;
+
+def AtomWrite01_127 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 127;
+ let ResourceCycles = [127];
+}
+def : InstRW<[AtomWrite01_127], (instrs INT)>;
+
+def AtomWrite01_130 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 130;
+ let ResourceCycles = [130];
+}
+def : InstRW<[AtomWrite01_130], (instrs INT3)>;
+
+def AtomWrite01_140 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 140;
+ let ResourceCycles = [140];
+}
+def : InstRW<[AtomWrite01_140], (instrs FXSAVE, FXSAVE64)>;
+
+def AtomWrite01_141 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 141;
+ let ResourceCycles = [141];
+}
+def : InstRW<[AtomWrite01_141], (instrs FXRSTOR, FXRSTOR64)>;
+
+def AtomWrite01_146 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 146;
+ let ResourceCycles = [146];
+}
+def : InstRW<[AtomWrite01_146], (instrs FYL2X)>;
+
+def AtomWrite01_147 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 147;
+ let ResourceCycles = [147];
+}
+def : InstRW<[AtomWrite01_147], (instrs FYL2XP1)>;
+
+def AtomWrite01_168 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 168;
+ let ResourceCycles = [168];
+}
+def : InstRW<[AtomWrite01_168], (instrs FPTAN)>;
+
+def AtomWrite01_174 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 174;
+ let ResourceCycles = [174];
+}
+def : InstRW<[AtomWrite01_174], (instrs FSINCOS)>;
+def : InstRW<[AtomWrite01_174], (instregex "(COS|SIN)_F")>;
+
+def AtomWrite01_183 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 183;
+ let ResourceCycles = [183];
+}
+def : InstRW<[AtomWrite01_183], (instrs FPATAN)>;
+
+def AtomWrite01_202 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 202;
+ let ResourceCycles = [202];
+}
+def : InstRW<[AtomWrite01_202], (instrs WRMSR)>;
+
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 6ea81a25e41c..d78c343ebd5c 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -38,8 +38,27 @@ def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
-// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly
-def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>;
+// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
+// speculative version of the 64-bit integer registers.
+// Reference: www.realworldtech.com/jaguar/4/
+//
+// The processor always keeps the different parts of an integer register
+// together. An instruction that writes to a part of a register will therefore
+// have a false dependence on any previous write to the same register or any
+// part of it.
+// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
+// access" - Agner Fog's "microarchitecture.pdf".
+def JIntegerPRF : RegisterFile<64, [GR64, CCR]>;
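Concretely (an illustration of the behaviour described above, not part of the patch):

  imul %rbx, %rax     # writes all of RAX
  movb $1, %al        # writes only AL, yet still has to wait for the imul,
                      # because every part of RAX is renamed as one unit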
+
+// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: www.realworldtech.com/jaguar/4/
+def JFpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
+// retire up to two macro-ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 16h Processors"
+def JRCU : RetireControlUnit<64, 2>;
// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
@@ -56,6 +75,7 @@ def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
let BufferSize=18;
}
+// Functional units
def JDiv : ProcResource<1>; // integer division
def JMul : ProcResource<1>; // integer multiplication
def JVALU0 : ProcResource<1>; // vector integer
@@ -65,6 +85,10 @@ def JSTC : ProcResource<1>; // vector store/convert
def JFPM : ProcResource<1>; // FP multiplication
def JFPA : ProcResource<1>; // FP addition
+// Functional unit groups
+def JFPX : ProcResGroup<[JFPA, JFPM]>;
+def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
+
// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;
@@ -75,28 +99,59 @@ def : ReadAdvance<ReadAfterLd, 3>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
- let Latency = !add(Lat, 3);
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 3);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = UOps;
}
}
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
- let Latency = !add(Lat, 5);
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = UOps;
+ }
+}
+
+multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [2], int UOps = 2> {
+ // Register variant uses the given resource cycles on ExePorts.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !listconcat([2], Res);
+ let NumMicroOps = UOps;
}
}
@@ -107,66 +162,94 @@ def : WriteRes<WriteRMW, [JSAGU]>;
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResIntPair<WriteALU, JALU01, 1>;
-defm : JWriteResIntPair<WriteIMul, JALU1, 3>;
-
-def : WriteRes<WriteIMulH, [JALU1]> {
- let Latency = 6;
- let ResourceCycles = [4];
-}
-
-// FIXME 8/16 bit divisions
-def : WriteRes<WriteIDiv, [JALU1, JDiv]> {
- let Latency = 25;
- let ResourceCycles = [1, 25];
-}
-def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> {
- let Latency = 41;
- let ResourceCycles = [1, 1, 25];
-}
+defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
+defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;
+defm : JWriteResIntPair<WriteIMul, [JALU1, JMul], 3, [1, 1], 2>; // i8/i16/i32 multiplication
+defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; // i64 multiplication
+defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
+
+defm : JWriteResIntPair<WriteBSWAP32,[JALU01], 1>;
+defm : JWriteResIntPair<WriteBSWAP64,[JALU01], 1>;
+
+defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
+defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
+defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
+defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
+defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
+defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
+defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
+defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
+
+defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;
+
+defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
+defm : JWriteResIntPair<WriteCMOV2, [JALU01], 1>; // Conditional (CF + ZF flag) move.
+defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
+def : WriteRes<WriteLAHFSAHF, [JALU01]>;
// This is for simple LEAs with one or two input operands.
-// FIXME: SAGU 3-operand LEA
def : WriteRes<WriteLEA, [JALU01]>;
+// Bit counts.
+defm : JWriteResIntPair<WriteBSF, [JALU01], 5, [4], 8>;
+defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [4], 8>;
+defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
+defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
+defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2]>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResIntPair<WriteShift, JALU01, 1>;
+defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
+
+defm : JWriteResIntPair<WriteShiftDouble, [JALU01], 1>;
-def WriteSHLDrri : SchedWriteRes<[JALU01]> {
+def JWriteSHLDrri : SchedWriteRes<[JALU01]> {
let Latency = 3;
let ResourceCycles = [6];
let NumMicroOps = 6;
}
-def: InstRW<[WriteSHLDrri], (instregex "SHLD(16|32|64)rri8")>;
-def: InstRW<[WriteSHLDrri], (instregex "SHRD(16|32|64)rri8")>;
+def: InstRW<[JWriteSHLDrri], (instrs SHLD16rri8, SHLD32rri8, SHLD64rri8,
+ SHRD16rri8, SHRD32rri8, SHRD64rri8)>;
-def WriteSHLDrrCL : SchedWriteRes<[JALU01]> {
+def JWriteSHLDrrCL : SchedWriteRes<[JALU01]> {
let Latency = 4;
let ResourceCycles = [8];
let NumMicroOps = 7;
}
-def: InstRW<[WriteSHLDrrCL], (instregex "SHLD(16|32|64)rrCL")>;
-def: InstRW<[WriteSHLDrrCL], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[JWriteSHLDrrCL], (instrs SHLD16rrCL, SHLD32rrCL, SHLD64rrCL,
+ SHRD16rrCL, SHRD32rrCL, SHRD64rrCL)>;
-def WriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> {
+def JWriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> {
let Latency = 9;
let ResourceCycles = [1, 22];
let NumMicroOps = 8;
}
-def: InstRW<[WriteSHLDm], (instregex "SHLD(16|32|64)mr(i8|CL)")>;
-def: InstRW<[WriteSHLDm], (instregex "SHRD(16|32|64)mr(i8|CL)")>;
+def: InstRW<[JWriteSHLDm],(instrs SHLD16mri8, SHLD32mri8, SHLD64mri8,
+ SHLD16mrCL, SHLD32mrCL, SHLD64mrCL,
+ SHRD16mri8, SHRD32mri8, SHRD64mri8,
+ SHRD16mrCL, SHRD32mrCL, SHRD64mrCL)>;
////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
-// FIXME: Split x86 and SSE load/store/moves
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
-def : WriteRes<WriteStore, [JSAGU]>;
-def : WriteRes<WriteMove, [JALU01]>;
+def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteStore, [JSAGU]>;
+def : WriteRes<WriteStoreNT, [JSAGU]>;
+def : WriteRes<WriteMove, [JALU01]>;
+
+// Load/store MXCSR.
+// FIXME: These are copied and pasted from WriteLoad/Store.
+def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteSTMXCSR, [JSAGU]>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
@@ -183,572 +266,438 @@ def : WriteRes<WriteZero, []>;
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResIntPair<WriteJump, JALU01, 1>;
+defm : JWriteResIntPair<WriteJump, [JALU01], 1>;
////////////////////////////////////////////////////////////////////////////////
-// Floating point. This covers both scalar and vector operations.
-// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
-// FIXME: Double precision latencies
-// FIXME: SS vs PS latencies
-// FIXME: ymm latencies
-////////////////////////////////////////////////////////////////////////////////
-
-defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
-defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFMA, JFPU1, 2>; // NOTE: Doesn't exist on Jaguar.
-defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
-
-def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
- let Latency = 21;
- let ResourceCycles = [1, 1, 21];
-}
-def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> {
- let Latency = 26;
- let ResourceCycles = [1, 1, 21];
-}
-
-def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> {
- let Latency = 19;
- let ResourceCycles = [1, 1, 19];
-}
-def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> {
- let Latency = 24;
- let ResourceCycles = [1, 1, 19];
-}
-
-// FIXME: integer pipes
-defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer.
-defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float.
-defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion.
-
-def : WriteRes<WriteFVarBlend, [JFPU01]> {
- let Latency = 2;
- let ResourceCycles = [4];
- let NumMicroOps = 3;
-}
-def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> {
- let Latency = 7;
- let ResourceCycles = [1, 4];
- let NumMicroOps = 3;
-}
-
-// Vector integer operations.
-defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>;
-defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>;
-
-def : WriteRes<WriteVarBlend, [JFPU01]> {
- let Latency = 2;
- let ResourceCycles = [4];
- let NumMicroOps = 3;
-}
-def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> {
- let Latency = 7;
- let ResourceCycles = [1, 4];
- let NumMicroOps = 3;
-}
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
-// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2?
-def : WriteRes<WriteVarVecShift, [JFPU01]> {}
-def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
+def : WriteRes<WriteSystem, [JALU01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
+def : WriteRes<WriteFence, [JSAGU]>;
-def : WriteRes<WriteMPSAD, [JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
////////////////////////////////////////////////////////////////////////////////
-// String instructions.
-// Packed Compare Implicit Length Strings, Return Mask
-// FIXME: approximate latencies + pipe dependencies
+// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WritePCmpIStrM, [JFPU1,JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [2, 2];
- let NumMicroOps = 3;
-}
-def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU1, JFPU0]> {
- let Latency = 13;
- let ResourceCycles = [1, 2, 2];
- let NumMicroOps = 3;
-}
-
-// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 14;
- let ResourceCycles = [5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
-def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 19;
- let ResourceCycles = [1, 5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
-
-// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [JFPU1, JFPU0]> {
- let Latency = 7;
- let ResourceCycles = [2, 2];
-}
-def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU1, JFPU0]> {
- let Latency = 12;
- let ResourceCycles = [1, 2, 2];
-}
-
-// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 14;
- let ResourceCycles = [5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
-def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
- let Latency = 19;
- let ResourceCycles = [1, 5, 5, 5, 5, 5];
- let NumMicroOps = 9;
-}
+defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 1, 2], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
+defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>;
+defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;
+
+defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;
+
+defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;
+defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>;
+defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>;
+defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>;
+defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>;
+defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
+defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1,2]>;
+defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1,2]>;
+defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2,4], 2>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
+defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
+defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>;
+defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>;
+defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>;
+defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
+defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>;
+defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>;
+defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResYMMPair<WriteFTestY , [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 2, [1, 4], 3>;
+defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 3, [2, 6], 6>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [1, 4], 3>;
+defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [2, 6], 6>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
////////////////////////////////////////////////////////////////////////////////
-// AES Instructions.
+// Conversions.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> {
- let Latency = 3;
- let ResourceCycles = [1, 1];
-}
-def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> {
- let Latency = 8;
- let ResourceCycles = [1, 1, 1];
-}
-
-def : WriteRes<WriteAESIMC, [JVIMUL]> {
- let Latency = 2;
- let ResourceCycles = [1];
-}
-def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-def : WriteRes<WriteAESKeyGen, [JVIMUL]> {
- let Latency = 2;
- let ResourceCycles = [1];
-}
-def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
+defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPA, JALU0], 7, [1,1,1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPA, JALU0], 7, [1,1,1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+// FIXME: f+3 ST, LD+STC latency
+defm : JWriteResFpuPair<WriteCvtI2SS, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : JWriteResFpuPair<WriteCvtI2SD, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1,2], 2>;
+defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+
+defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1,2], 2>;
+defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
////////////////////////////////////////////////////////////////////////////////
-// Horizontal add/sub instructions.
+// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteFHAdd, [JFPU0]> {
- let Latency = 3;
-}
-
-def : WriteRes<WriteFHAddLd, [JLAGU, JFPU0]> {
- let Latency = 8;
-}
-
-def : WriteRes<WritePHAdd, [JFPU01]> {
- let ResourceCycles = [1];
-}
-def : WriteRes<WritePHAddLd, [JLAGU, JFPU01 ]> {
- let Latency = 6;
- let ResourceCycles = [1, 1];
-}
-
-def WriteFHAddY: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteFHAddY], (instregex "VH(ADD|SUB)P(S|D)Yrr")>;
-
-def WriteFHAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteFHAddYLd], (instregex "VH(ADD|SUB)P(S|D)Yrm")>;
+defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 1, 2], 1>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
+defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;
+
+defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : X86WriteResPairUnsupported<WriteVarVecShift>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>;
+defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : JWriteResFpuPair<WritePHMINPOS, [JFPU0, JVALU], 2>;
+defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteShuffleX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleY>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : JWriteResFpuPair<WriteVarShuffle, [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : JWriteResFpuPair<WriteVecLogic, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : JWriteResFpuPair<WriteVecTest, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
////////////////////////////////////////////////////////////////////////////////
-// Carry-less multiplication instructions.
+// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteCLMul, [JVIMUL]> {
- let Latency = 2;
- let ResourceCycles = [1];
-}
-def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-// FIXME: pipe for system/microcode?
-def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; }
-def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; }
-def : WriteRes<WriteFence, [JSAGU]>;
-def : WriteRes<WriteNop, []>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
+defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
////////////////////////////////////////////////////////////////////////////////
-// SSE4.1 instructions.
+// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteDPPS: SchedWriteRes<[JFPU0, JFPU1]> {
- let Latency = 11;
- let ResourceCycles = [3,3];
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteDPPS], (instregex "(V)?DPPSrri")>;
-
-def WriteDPPSLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
- let Latency = 16;
- let ResourceCycles = [1,3,3];
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteDPPSLd], (instregex "(V)?DPPSrmi")>;
-
-def WriteDPPD: SchedWriteRes<[JFPU0, JFPU1]> {
- let Latency = 9;
- let ResourceCycles = [3,3];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteDPPD], (instregex "(V)?DPPDrri")>;
-
-def WriteDPPDLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
- let Latency = 14;
- let ResourceCycles = [1,3,3];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteDPPDLd], (instregex "(V)?DPPDrmi")>;
+defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPA, JALU0], 7, [1, 2, 1, 1], 3>;
+defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPA, JALU0], 8, [1, 2, 1, 1], 3>;
+defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
+defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
////////////////////////////////////////////////////////////////////////////////
-// SSE4A instructions.
+// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteEXTRQ: SchedWriteRes<[JFPU01]> {
- let Latency = 1;
- let ResourceCycles = [1];
-}
-def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>;
-
-def WriteINSERTQ: SchedWriteRes<[JFPU01]> {
- let Latency = 2;
- let ResourceCycles = [4];
-}
-def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>;
+def : WriteRes<WriteFMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
////////////////////////////////////////////////////////////////////////////////
-// F16C instructions.
+// AES Instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteCVT3: SchedWriteRes<[JFPU1]> {
- let Latency = 3;
-}
-def : InstRW<[WriteCVT3], (instregex "VCVTPS2PHrr")>;
-def : InstRW<[WriteCVT3], (instregex "VCVTPH2PSrr")>;
-
-def WriteCVT3St: SchedWriteRes<[JFPU1, JSAGU]> {
- let Latency = 3;
- let ResourceCycles = [1, 1];
-}
-def : InstRW<[WriteCVT3St], (instregex "VCVTPS2PHmr")>;
-
-def WriteCVT3Ld: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 8;
- let ResourceCycles = [1, 1];
-}
-def : InstRW<[WriteCVT3Ld], (instregex "VCVTPH2PSrm")>;
-
-def WriteCVTPS2PHY: SchedWriteRes<[JFPU1, JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [2,2];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCVTPS2PHY], (instregex "VCVTPS2PHYrr")>;
-
-def WriteCVTPS2PHYSt: SchedWriteRes<[JFPU1, JFPU01, JSAGU]> {
- let Latency = 11;
- let ResourceCycles = [2,2,1];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCVTPS2PHYSt], (instregex "VCVTPS2PHYmr")>;
-
-def WriteCVTPH2PSY: SchedWriteRes<[JFPU1]> {
- let Latency = 3;
- let ResourceCycles = [2];
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteCVTPH2PSY], (instregex "VCVTPH2PSYrr")>;
-
-def WriteCVTPH2PSYLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 8;
- let ResourceCycles = [1,2];
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteCVTPH2PSYLd], (instregex "VCVTPH2PSYrm")>;
+defm : JWriteResFpuPair<WriteAESIMC, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU0, JVIMUL], 3, [1, 1], 2>;
////////////////////////////////////////////////////////////////////////////////
-// AVX instructions.
+// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-def WriteVDPPSY: SchedWriteRes<[JFPU1, JFPU0]> {
- let Latency = 12;
- let ResourceCycles = [6, 6];
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteVDPPSY], (instregex "VDPPSYrr")>;
-
-def WriteVDPPSYLd: SchedWriteRes<[JLAGU, JFPU1, JFPU0]> {
- let Latency = 17;
- let ResourceCycles = [1, 6, 6];
- let NumMicroOps = 11;
-}
-def : InstRW<[WriteVDPPSYLd, ReadAfterLd], (instregex "VDPPSYrm")>;
-
-def WriteFAddY: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteFAddY], (instregex "VADD(SUB)?P(S|D)Yrr", "VSUBP(S|D)Yrr")>;
-
-def WriteFAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteFAddYLd, ReadAfterLd], (instregex "VADD(SUB)?P(S|D)Yrm", "VSUBP(S|D)Yrm")>;
+defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WritePHAddY>;
-def WriteFDivY: SchedWriteRes<[JFPU1]> {
- let Latency = 38;
- let ResourceCycles = [38];
-}
-def : InstRW<[WriteFDivY], (instregex "VDIVP(D|S)Yrr")>;
-
-def WriteFDivYLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 43;
- let ResourceCycles = [1, 38];
-}
-def : InstRW<[WriteFDivYLd, ReadAfterLd], (instregex "VDIVP(S|D)Yrm")>;
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
-def WriteVMULYPD: SchedWriteRes<[JFPU1]> {
- let Latency = 4;
- let ResourceCycles = [4];
-}
-def : InstRW<[WriteVMULYPD], (instregex "VMULPDYrr")>;
+defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;
-def WriteVMULYPDLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 9;
- let ResourceCycles = [1, 4];
-}
-def : InstRW<[WriteVMULYPDLd, ReadAfterLd], (instregex "VMULPDYrm")>;
+////////////////////////////////////////////////////////////////////////////////
+// SSE4A instructions.
+////////////////////////////////////////////////////////////////////////////////
-def WriteVMULYPS: SchedWriteRes<[JFPU1]> {
+def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
let Latency = 2;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteVMULYPS], (instregex "VMULPSYrr", "VRCPPSYr", "VRSQRTPSYr")>;
-
-def WriteVMULYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 7;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteVMULYPSLd, ReadAfterLd], (instregex "VMULPSYrm", "VRCPPSYm", "VRSQRTPSYm")>;
-
-def WriteVCVTY: SchedWriteRes<[JSTC]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteVCVTY], (instregex "VCVTDQ2P(S|D)Yrr")>;
-def : InstRW<[WriteVCVTY], (instregex "VROUNDYP(S|D)r")>;
-def : InstRW<[WriteVCVTY], (instregex "VCVTPS2DQYrr")>;
-def : InstRW<[WriteVCVTY], (instregex "VCVTTPS2DQYrr")>;
-
-def WriteVCVTYLd: SchedWriteRes<[JLAGU, JSTC]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [1, 4];
}
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTDQ2P(S|D)Yrm")>;
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VROUNDYP(S|D)m")>;
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTPS2DQYrm")>;
-def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTTPS2DQYrm")>;
+def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
-def WriteVMONTPSt: SchedWriteRes<[JSTC, JLAGU]> {
- let Latency = 3;
- let ResourceCycles = [2,1];
-}
-def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTP(S|D)Ymr")>;
-def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTDQYmr")>;
+////////////////////////////////////////////////////////////////////////////////
+// AVX instructions.
+////////////////////////////////////////////////////////////////////////////////
-def WriteVCVTPDY: SchedWriteRes<[JSTC, JFPU01]> {
+def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
let Latency = 6;
- let ResourceCycles = [2, 4];
-}
-def : InstRW<[WriteVCVTPDY], (instregex "VCVTPD2(DQ|PS)Yrr")>;
-def : InstRW<[WriteVCVTPDY], (instregex "VCVTTPD2DQYrr")>;
-
-def WriteVCVTPDYLd: SchedWriteRes<[JLAGU, JSTC, JFPU01]> {
- let Latency = 11;
let ResourceCycles = [1, 2, 4];
+ let NumMicroOps = 2;
}
-def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTPD2(DQ|PS)Yrm")>;
-def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTTPD2DQYrm")>;
-
-def WriteVBlendVPY: SchedWriteRes<[JFPU01]> {
- let Latency = 3;
- let ResourceCycles = [6];
-}
-def : InstRW<[WriteVBlendVPY], (instregex "VBLENDVP(S|D)Yrr", "VPERMILP(D|S)Yrr")>;
+def : InstRW<[JWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
+ VBROADCASTSSYrm)>;
-def WriteVBlendVPYLd: SchedWriteRes<[JLAGU, JFPU01]> {
- let Latency = 8;
- let ResourceCycles = [1, 6];
+def JWriteJVZEROALL: SchedWriteRes<[]> {
+ let Latency = 90;
+ let NumMicroOps = 73;
}
-def : InstRW<[WriteVBlendVPYLd, ReadAfterLd], (instregex "VBLENDVP(S|D)Yrm")>;
+def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
-def WriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 4];
+def JWriteJVZEROUPPER: SchedWriteRes<[]> {
+ let Latency = 46;
+ let NumMicroOps = 37;
}
-def : InstRW<[WriteVBROADCASTYLd, ReadAfterLd], (instregex "VBROADCASTS(S|D)Yrm")>;
-
-def WriteFPAY22: SchedWriteRes<[JFPU0]> {
+def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// SchedWriteVariant definitions.
+///////////////////////////////////////////////////////////////////////////////
+
+def JWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+// Certain instructions that use the same register for both source
+// operands do not have a real dependency on the previous contents of the
+// register, and thus do not have to wait before completing. They can be
+// optimized out at the register renaming stage.
+// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
+// 15h Processors".
+// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// Section 21.8 [Dependency-breaking instructions].
+
+def JWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteALU]>
+]>;
+def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def JWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]>
+]>;
+def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
+ ANDNPSrr, VANDNPSrr,
+ ANDNPDrr, VANDNPDrr)>;
+
+def JWriteVZeroIdiomLogic : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]>
+]>;
+def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
+
+def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]>
+]>;
+def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ PANDNrr, VPANDNrr)>;
+
+def JWriteVZeroIdiomALU : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]>
+]>;
+def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
+ MMX_PSUBQirr, MMX_PSUBWirr,
+ MMX_PCMPGTBirr, MMX_PCMPGTDirr,
+ MMX_PCMPGTWirr)>;
+
+def JWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecALUX]>
+]>;
+def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTQrr, VPCMPGTQrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+// This write is used for slow LEA instructions.
+def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
let Latency = 2;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteFPAY22], (instregex "VCMPP(S|D)Yrri", "VM(AX|IN)P(D|S)Yrr")>;
-
-def WriteFPAY22Ld: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 7;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteFPAY22Ld, ReadAfterLd], (instregex "VCMPP(S|D)Yrmi", "VM(AX|IN)P(D|S)Yrm")>;
-
-def WriteVHAddSubY: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-def : InstRW<[WriteVHAddSubY], (instregex "VH(ADD|SUB)P(D|S)Yrr")>;
-
-def WriteVHAddSubYLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteVHAddSubYLd], (instregex "VH(ADD|SUB)P(D|S)Yrm")>;
-
-def WriteVMaskMovLd: SchedWriteRes<[JLAGU,JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteVMaskMovLd], (instregex "VMASKMOVP(D|S)rm")>;
-
-def WriteVMaskMovYLd: SchedWriteRes<[JLAGU,JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [1, 4];
-}
-def : InstRW<[WriteVMaskMovYLd], (instregex "VMASKMOVP(D|S)Yrm")>;
-
-def WriteVMaskMovSt: SchedWriteRes<[JFPU01,JSAGU]> {
- let Latency = 6;
- let ResourceCycles = [4, 1];
-}
-def : InstRW<[WriteVMaskMovSt], (instregex "VMASKMOVP(D|S)mr")>;
-
-def WriteVMaskMovYSt: SchedWriteRes<[JFPU01,JSAGU]> {
- let Latency = 6;
- let ResourceCycles = [4, 1];
-}
-def : InstRW<[WriteVMaskMovYSt], (instregex "VMASKMOVP(D|S)Ymr")>;
-
-// TODO: In fact we have latency '2+i'. The +i represents an additional 1 cycle transfer
-// operation which moves the floating point result to the integer unit. During this
-// additional cycle the floating point unit execution resources are not occupied
-// and ALU0 in the integer unit is occupied instead.
-def WriteVMOVMSK: SchedWriteRes<[JFPU0]> {
- let Latency = 3;
-}
-def : InstRW<[WriteVMOVMSK], (instregex "VMOVMSKP(D|S)(Y)?rr")>;
-
-// TODO: In fact we have latency '3+i'. The +i represents an additional 1 cycle transfer
-// operation which moves the floating point result to the integer unit. During this
-// additional cycle the floating point unit execution resources are not occupied
-// and ALU0 in the integer unit is occupied instead.
-def WriteVTESTY: SchedWriteRes<[JFPU01, JFPU0]> {
- let Latency = 4;
- let ResourceCycles = [2, 2];
- let NumMicroOps = 3;
}
-def : InstRW<[WriteVTESTY], (instregex "VTESTP(S|D)Yrr")>;
-def : InstRW<[WriteVTESTY], (instregex "VPTESTYrr")>;
-def WriteVTESTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPU0]> {
- let Latency = 9;
- let ResourceCycles = [1, 2, 2];
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteVTESTYLd], (instregex "VTESTP(S|D)Yrm")>;
-def : InstRW<[WriteVTESTYLd], (instregex "VPTESTYrm")>;
-
-def WriteVTEST: SchedWriteRes<[JFPU0]> {
+// On Jaguar, a slow LEA is either a three-operand LEA (base, index, offset)
+// or an LEA with a `Scale` value other than 1.
+def JSlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+ // An LEA with a "Scale" other than 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
+
+def JWriteLEA : SchedWriteVariant<[
+ SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteLEA]>
+]>;
+
+def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+
+def JSlowLEA16r : SchedWriteRes<[JALU01]> {
let Latency = 3;
+ let ResourceCycles = [4];
}
-def : InstRW<[WriteVTEST], (instregex "VTESTP(S|D)rr")>;
-def : InstRW<[WriteVTEST], (instregex "VPTESTrr")>;
-
-def WriteVTESTLd: SchedWriteRes<[JLAGU, JFPU0]> {
- let Latency = 8;
-}
-def : InstRW<[WriteVTESTLd], (instregex "VTESTP(S|D)rm")>;
-def : InstRW<[WriteVTESTLd], (instregex "VPTESTrm")>;
-
-def WriteVSQRTYPD: SchedWriteRes<[JFPU1]> {
- let Latency = 54;
- let ResourceCycles = [54];
-}
-def : InstRW<[WriteVSQRTYPD], (instregex "VSQRTPDYr")>;
-
-def WriteVSQRTYPDLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 59;
- let ResourceCycles = [1, 54];
-}
-def : InstRW<[WriteVSQRTYPDLd], (instregex "VSQRTPDYm")>;
-
-def WriteVSQRTYPS: SchedWriteRes<[JFPU1]> {
- let Latency = 42;
- let ResourceCycles = [42];
-}
-def : InstRW<[WriteVSQRTYPS], (instregex "VSQRTPSYr")>;
-
-def WriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
- let Latency = 47;
- let ResourceCycles = [1, 42];
-}
-def : InstRW<[WriteVSQRTYPSLd], (instregex "VSQRTPSYm")>;
-def WriteJVZEROALL: SchedWriteRes<[]> {
- let Latency = 90;
- let NumMicroOps = 73;
-}
-def : InstRW<[WriteJVZEROALL], (instregex "VZEROALL")>;
+def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
-def WriteJVZEROUPPER: SchedWriteRes<[]> {
- let Latency = 46;
- let NumMicroOps = 37;
-}
-def : InstRW<[WriteJVZEROUPPER], (instregex "VZEROUPPER")>;
} // SchedModel
-
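The rewritten BtVer2 model above leans on the JWriteResIntPair/JWriteResFpuPair/JWriteResYMMPair multiclasses rather than hand-written per-class records; each instantiation emits both the register form and the folded-load form of a scheduling class. As a rough sketch of the expansion, a line such as

defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;

is approximately equivalent to the two records below (the WriteFAddLd name for the folded variant is assumed from the usual X86 scheduling-model naming convention):

// Register form: 3-cycle latency, one default cycle on each listed unit.
def : WriteRes<WriteFAdd, [JFPU0, JFPA]> {
  let Latency = 3;
  let ResourceCycles = []; // empty list means one cycle per listed resource
  let NumMicroOps = 1;
}
// Folded-load form: an extra pass through the load AGU and +5 cycles of latency.
def : WriteRes<WriteFAddLd, [JLAGU, JFPU0, JFPA]> {
  let Latency = 8;
  let ResourceCycles = [];
  let NumMicroOps = 1;
}

The YMM variant (JWriteResYMMPair) differs only in its defaults of ResourceCycles = [2] and two micro-ops, matching the earlier comment that 256-bit operations are cracked into two COPs on Jaguar.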
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
index 35ec7488db72..c938a4a8939e 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -32,19 +32,19 @@ def SLMModel : SchedMachineModel {
let SchedModel = SLMModel in {
// Silvermont has 5 reservation stations for micro-ops
-def IEC_RSV0 : ProcResource<1>;
-def IEC_RSV1 : ProcResource<1>;
-def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
-def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
-def MEC_RSV : ProcResource<1>;
+def SLM_IEC_RSV0 : ProcResource<1>;
+def SLM_IEC_RSV1 : ProcResource<1>;
+def SLM_FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def SLM_FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def SLM_MEC_RSV : ProcResource<1>;
// Many micro-ops are capable of issuing on multiple ports.
-def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
-def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+def SLM_IEC_RSV01 : ProcResGroup<[SLM_IEC_RSV0, SLM_IEC_RSV1]>;
+def SLM_FPC_RSV01 : ProcResGroup<[SLM_FPC_RSV0, SLM_FPC_RSV1]>;
-def SMDivider : ProcResource<1>;
-def SMFPMultiplier : ProcResource<1>;
-def SMFPDivider : ProcResource<1>;
+def SLMDivider : ProcResource<1>;
+def SLMFPMultiplier : ProcResource<1>;
+def SLMFPDivider : ProcResource<1>;
// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
@@ -55,209 +55,426 @@ def : ReadAdvance<ReadAfterLd, 3>;
// as two micro-ops when queued in the reservation station.
// This multiclass defines the resource usage for variants with and without
// folded loads.
-multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 3> {
// Register variant is using a single cycle on ExePort.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
- // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
- // latency.
- def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
- let Latency = !add(Lat, 3);
+ // Memory variant also uses a cycle on MEC_RSV and adds LoadLat cycles to
+ // the latency (default = 3).
+ def : WriteRes<SchedRW.Folded, !listconcat([SLM_MEC_RSV], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
// A folded store needs a cycle on MEC_RSV for the store data, but it does not
// need an extra port cycle to recompute the address.
-def : WriteRes<WriteRMW, [MEC_RSV]>;
+def : WriteRes<WriteRMW, [SLM_MEC_RSV]>;
+
+def : WriteRes<WriteStore, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteStoreNT, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteZero, []>;
-def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
-def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; }
-def : WriteRes<WriteMove, [IEC_RSV01]>;
-def : WriteRes<WriteZero, []>;
+// Load/store MXCSR.
+// FIXME: These are probably wrong. They are copied and pasted from WriteStore/Load.
+def : WriteRes<WriteSTMXCSR, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteLDMXCSR, [SLM_MEC_RSV]> { let Latency = 3; }
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
-defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>;
-defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>;
-defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>;
-defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>;
+defm : SLMWriteResPair<WriteALU, [SLM_IEC_RSV01], 1>;
+defm : SLMWriteResPair<WriteADC, [SLM_IEC_RSV01], 1>;
+defm : SLMWriteResPair<WriteIMul, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>;
+
+defm : SLMWriteResPair<WriteBSWAP32,[SLM_IEC_RSV01], 1>;
+defm : SLMWriteResPair<WriteBSWAP64,[SLM_IEC_RSV01], 1>;
+
+defm : SLMWriteResPair<WriteShift, [SLM_IEC_RSV0], 1>;
+defm : SLMWriteResPair<WriteShiftDouble, [SLM_IEC_RSV0], 1>;
+defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>;
+defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>;
+
+defm : SLMWriteResPair<WriteCMOV, [SLM_IEC_RSV01], 2, [2]>;
+defm : SLMWriteResPair<WriteCMOV2, [SLM_IEC_RSV01], 2, [2]>;
+defm : X86WriteRes<WriteFCMOV, [SLM_FPC_RSV1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
+ // FIXME: Latency and NumMicroOps?
+ let ResourceCycles = [2,1];
+}
+def : WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
// the port to read all inputs. We don't model that.
-def : WriteRes<WriteLEA, [IEC_RSV1]>;
-
-// This is quite rough, latency depends on the dividend.
-def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
- let Latency = 25;
- let ResourceCycles = [1, 25];
-}
-def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 25];
-}
+def : WriteRes<WriteLEA, [SLM_IEC_RSV1]>;
+
+// Bit counts.
+defm : SLMWriteResPair<WriteBSF, [SLM_IEC_RSV01], 10, [20], 10>;
+defm : SLMWriteResPair<WriteBSR, [SLM_IEC_RSV01], 10, [20], 10>;
+defm : SLMWriteResPair<WriteLZCNT, [SLM_IEC_RSV0], 3>;
+defm : SLMWriteResPair<WriteTZCNT, [SLM_IEC_RSV0], 3>;
+defm : SLMWriteResPair<WritePOPCNT, [SLM_IEC_RSV0], 3>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : X86WriteResPairUnsupported<WriteBEXTR>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
+defm : SLMWriteResPair<WriteDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
// Scalar and vector floating point.
-defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
-defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
-defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
-defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
-defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>;
-
-// This is quite rough, latency depends on precision
-def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
- let Latency = 5;
- let ResourceCycles = [1, 2];
-}
-def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
- let Latency = 8;
- let ResourceCycles = [1, 1, 2];
-}
-
-def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
- let Latency = 34;
- let ResourceCycles = [1, 34];
-}
-def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
- let Latency = 37;
- let ResourceCycles = [1, 1, 34];
-}
+defm : X86WriteRes<WriteFLD0, [SLM_FPC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SLM_FPC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLDC, [SLM_FPC_RSV01], 1, [2], 2>;
+def : WriteRes<WriteFLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFLoadX, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>;
+defm : X86WriteRes<WriteEMMS, [SLM_FPC_RSV01], 10, [10], 9>;
+
+defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAddX, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAddY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : SLMWriteResPair<WriteFAdd64, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : SLMWriteResPair<WriteFCmp, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmpX, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmpY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : SLMWriteResPair<WriteFCmp64, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmp64X, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmp64Y, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : SLMWriteResPair<WriteFCom, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFMul, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : SLMWriteResPair<WriteFMul64, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>;
+defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
+defm : SLMWriteResPair<WriteFDivY, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : SLMWriteResPair<WriteFDiv64, [SLM_FPC_RSV0, SLMFPDivider], 34, [1,32]>;
+defm : SLMWriteResPair<WriteFDiv64X, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
+defm : SLMWriteResPair<WriteFDiv64Y, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : SLMWriteResPair<WriteFRcp, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRcpX, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRcpY, [SLM_FPC_RSV0], 5>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : SLMWriteResPair<WriteFRsqrt, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRsqrtX, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRsqrtY, [SLM_FPC_RSV0], 5>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : SLMWriteResPair<WriteFSqrt, [SLM_FPC_RSV0,SLMFPDivider], 20, [1,20], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrtX, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrtY, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : SLMWriteResPair<WriteFSqrt64, [SLM_FPC_RSV0,SLMFPDivider], 35, [1,35], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt64X, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt64Y, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : SLMWriteResPair<WriteFSqrt80, [SLM_FPC_RSV0,SLMFPDivider], 40, [1,40]>;
+defm : SLMWriteResPair<WriteDPPD, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteDPPS, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteDPPSY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : SLMWriteResPair<WriteFSign, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFRnd, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFRndY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : SLMWriteResPair<WriteFLogic, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFLogicY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : SLMWriteResPair<WriteFTest, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFTestY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : SLMWriteResPair<WriteFShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteFShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : SLMWriteResPair<WriteFVarShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteFVarShuffleY,[SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>;
+
+// Conversion between integer and float.
+defm : SLMWriteResPair<WriteCvtSS2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2IY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : SLMWriteResPair<WriteCvtSD2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2IY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : SLMWriteResPair<WriteCvtI2SS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PSY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : SLMWriteResPair<WriteCvtI2SD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PDY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : SLMWriteResPair<WriteCvtSS2SD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2PD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2PDY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : SLMWriteResPair<WriteCvtSD2SS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2PS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2PSY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
// Vector integer operations.
-defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
-defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
-defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>;
-defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
+def : WriteRes<WriteVecLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadX, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadNT, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadNTY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreNT, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreNTY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveToGpr, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteVecMoveFromGpr, [SLM_IEC_RSV01]>;
+
+defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : SLMWriteResPair<WriteVecShiftImm, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftImmX,[SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftImmY,[SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecLogicX,[SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecLogicY,[SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : SLMWriteResPair<WriteVecTest, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecTestY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : SLMWriteResPair<WriteVecALU, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+// FIXME: The below is closer to correct, but caused some perf regressions.
+//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
+defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePMULLDY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : SLMWriteResPair<WriteShuffleX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;
+defm : SLMWriteResPair<WriteMPSADY, [SLM_FPC_RSV0], 7>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : SLMWriteResPair<WritePSADBW, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePSADBWX, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePSADBWY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : SLMWriteResPair<WritePHMINPOS, [SLM_FPC_RSV0], 4>;
+
+// Vector insert/extract operations.
+defm : SLMWriteResPair<WriteVecInsert, [SLM_FPC_RSV0], 1>;
+
+def : WriteRes<WriteVecExtract, [SLM_FPC_RSV0]>;
+def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-
-def : WriteRes<WriteFHAdd, [FPC_RSV01]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-
-def : WriteRes<WriteFHAddLd, [FPC_RSV01, MEC_RSV]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-def : WriteRes<WritePHAdd, [FPC_RSV01]> {
- let Latency = 1;
- let ResourceCycles = [1];
-}
-
-def : WriteRes<WritePHAddLd, [FPC_RSV01, MEC_RSV]> {
- let Latency = 4;
- let ResourceCycles = [1, 1];
-}
+defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 3, [2]>;
+defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 3, [2]>;
+defm : X86WriteResPairUnsupported<WriteFHAddZ>;
+defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WritePHAddY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WritePHAddZ>;
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
-def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+def : WriteRes<WritePCmpIStrM, [SLM_FPC_RSV0]> {
let Latency = 13;
let ResourceCycles = [13];
}
-def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpIStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 13;
let ResourceCycles = [13, 1];
}
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+def : WriteRes<WritePCmpEStrM, [SLM_FPC_RSV0]> {
let Latency = 17;
let ResourceCycles = [17];
}
-def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpEStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 17;
let ResourceCycles = [17, 1];
}
// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+def : WriteRes<WritePCmpIStrI, [SLM_FPC_RSV0]> {
let Latency = 17;
let ResourceCycles = [17];
}
-def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpIStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 17;
let ResourceCycles = [17, 1];
}
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+def : WriteRes<WritePCmpEStrI, [SLM_FPC_RSV0]> {
let Latency = 21;
let ResourceCycles = [21];
}
-def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WritePCmpEStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 21;
let ResourceCycles = [21, 1];
}
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteVecMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteVecMOVMSKY, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteMMXMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+
// AES Instructions.
-def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+def : WriteRes<WriteAESDecEnc, [SLM_FPC_RSV0]> {
let Latency = 8;
let ResourceCycles = [5];
}
-def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteAESDecEncLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 8;
let ResourceCycles = [5, 1];
}
-def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+def : WriteRes<WriteAESIMC, [SLM_FPC_RSV0]> {
let Latency = 8;
let ResourceCycles = [5];
}
-def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteAESIMCLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 8;
let ResourceCycles = [5, 1];
}
-def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+def : WriteRes<WriteAESKeyGen, [SLM_FPC_RSV0]> {
let Latency = 8;
let ResourceCycles = [5];
}
-def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteAESKeyGenLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 8;
let ResourceCycles = [5, 1];
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+def : WriteRes<WriteCLMul, [SLM_FPC_RSV0]> {
let Latency = 10;
let ResourceCycles = [10];
}
-def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+def : WriteRes<WriteCLMulLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
let Latency = 10;
let ResourceCycles = [10, 1];
}
-
-def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; }
-def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
-def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SLM_MEC_RSV]>;
def : WriteRes<WriteNop, []>;
// AVX/FMA is not supported on that architecture, but we should define the basic
// scheduling resources anyway.
-def : WriteRes<WriteIMulH, [FPC_RSV0]>;
-defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFMA, FPC_RSV0, 1>;
+def : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>;
+defm : X86WriteResPairUnsupported<WriteFBlendY>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFShuffle256>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+defm : SLMWriteResPair<WriteVarVecShift, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+
+defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PH>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
index a4e5327213c2..d28d58580752 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -55,7 +55,6 @@ def ZnFPU2 : ProcResource<1>;
def ZnFPU3 : ProcResource<1>;
// FPU grouping
-def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>;
def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
@@ -91,6 +90,32 @@ def ZnDivider : ProcResource<1>;
// The 4-cycle load-to-use latency is captured here.
def : ReadAdvance<ReadAfterLd, 4>;
+// The Integer PRF for Zen is 168 entries, and it holds the architectural and
+// speculative versions of the 64-bit integer registers.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnIntegerPRF : RegisterFile<168, [GR64, CCR]>;
+
+// 36-entry (9x4) floating-point scheduler.
+def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]> {
+  let BufferSize = 36;
+}
+
+// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
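// A note on the cost list above (editorial illustration, not part of the
// patch): the third RegisterFile operand is assumed to give the per-definition
// physical-register cost for each listed class, so VR64 and VR128 definitions
// consume one of the 160 registers each, while a VR256 definition consumes
// two, matching the two-COP cracking described above.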
+
+// The unit can track up to 192 macro ops in flight.
+// The retire unit handles in-order commit of up to 8 macro ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+// Note that the retire unit is shared between integer and FP ops.
+// In SMT mode it is 96 entries per thread, but we do not use that conservative
+// value here because there is currently no way to fully model SMT mode,
+// so there is no point in trying.
+def ZnRCU : RetireControlUnit<192, 8>;
+
+// FIXME: there are 72 read buffers and 44 write buffers.
+
// (a folded load is an instruction that loads and does some operation)
// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops
// Instructions with folded loads are usually micro-fused, so they only appear
@@ -99,30 +124,43 @@ def : ReadAdvance<ReadAfterLd, 4>;
// b. addpd
// This multiclass is for folded loads for integer units.
multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 4, int LoadUOps = 1> {
// Register variant takes 1-cycle on Execution Port.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on ZnAGU
- // adds 4 cycles to the latency.
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let NumMicroOps = 2;
- let Latency = !add(Lat, 4);
+ // adds LoadLat cycles to the latency (default = 4).
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
}
}
// This multiclass is for folded loads for floating point units.
multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 7, int LoadUOps = 0> {
// Register variant takes 1-cycle on Execution Port.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on ZnAGU
- // adds 7 cycles to the latency.
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let Latency = !add(Lat, 7);
+ // adds LoadLat cycles to the latency (default = 7).
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
}
}
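// Editorial sketch (not part of the patch): what one ZnWriteResPair
// instantiation expands to with the defaults above (Res = [], UOps = 1,
// LoadLat = 4, LoadUOps = 1). For example,
//   defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
// produces, roughly:
//   // Register form: one micro-op, 1 cycle on ZnALU.
//   def : WriteRes<WriteALU, [ZnALU]> {
//     let Latency = 1;
//     let ResourceCycles = [];   // empty, so the default of 1 cycle/resource
//     let NumMicroOps = 1;
//   }
//   // Folded-load form (the .Folded write, here WriteALULd): adds ZnAGU,
//   // the 4-cycle load latency and one extra micro-op.
//   def : WriteRes<WriteALULd, [ZnAGU, ZnALU]> {
//     let Latency = 5;
//     let NumMicroOps = 2;
//   }
// ZnWriteResFpuPair follows the same pattern, but with a 7-cycle folded-load
// latency and no extra load micro-op by default.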
@@ -130,103 +168,310 @@ multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
// operation in codegen
def : WriteRes<WriteRMW, [ZnAGU]>;
-def : WriteRes<WriteStore, [ZnAGU]>;
-def : WriteRes<WriteMove, [ZnALU]>;
-def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
+def : WriteRes<WriteStore, [ZnAGU]>;
+def : WriteRes<WriteStoreNT, [ZnAGU]>;
+def : WriteRes<WriteMove, [ZnALU]>;
+def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [ZnALU]>;
-defm : ZnWriteResPair<WriteALU, ZnALU, 1>;
-defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
-defm : ZnWriteResPair<WriteJump, ZnALU, 1>;
+defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteADC, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>;
+defm : ZnWriteResPair<WriteIMul64, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
+
+defm : ZnWriteResPair<WriteBSWAP32,[ZnALU], 1, [4]>;
+defm : ZnWriteResPair<WriteBSWAP64,[ZnALU], 1, [4]>;
+
+defm : ZnWriteResPair<WriteShift, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteShiftDouble, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
+defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>;
+
+defm : ZnWriteResPair<WriteCMOV, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>;
+def : WriteRes<WriteSETCC, [ZnALU]>;
+def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>;
+defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>;
+
+// Bit counts.
+defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>;
+defm : ZnWriteResPair<WriteBSR, [ZnALU], 3>;
+defm : ZnWriteResPair<WriteLZCNT, [ZnALU], 2>;
+defm : ZnWriteResPair<WriteTZCNT, [ZnALU], 2>;
+defm : ZnWriteResPair<WritePOPCNT, [ZnALU], 1>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
-// IDIV
-def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> {
- let Latency = 41;
- let ResourceCycles = [1, 41];
-}
+// BMI1 BEXTR, BMI2 BZHI
+defm : ZnWriteResPair<WriteBEXTR, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteBZHI, [ZnALU], 1>;
-def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> {
- let Latency = 45;
- let ResourceCycles = [1, 4, 41];
-}
-
-// IMUL
+// DIV, IDIV.
+defm : ZnWriteResPair<WriteDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>;
+defm : ZnWriteResPair<WriteDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>;
+defm : ZnWriteResPair<WriteDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;
+defm : ZnWriteResPair<WriteDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
+defm : ZnWriteResPair<WriteIDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>;
+defm : ZnWriteResPair<WriteIDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>;
+defm : ZnWriteResPair<WriteIDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;
+defm : ZnWriteResPair<WriteIDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
+
+// IMULH
def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
let Latency = 4;
}
-def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> {
- let Latency = 4;
-}
-
-def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> {
- let Latency = 8;
-}
// Floating point operations
-defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>;
-defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
-defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
-defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
-defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>;
-defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
+defm : X86WriteRes<WriteFLoad, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [ZnAGU,ZnFPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteFStore, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;
+
+defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAddY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : ZnWriteResFpuPair<WriteFAdd64, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64X, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64Y, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : ZnWriteResFpuPair<WriteFCmp, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmpX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmpY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : ZnWriteResFpuPair<WriteFCmp64, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFVarBlendY,[ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : ZnWriteResFpuPair<WriteVarBlend, [ZnFPU0], 1>;
+defm : ZnWriteResFpuPair<WriteVarBlendY, [ZnFPU0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : ZnWriteResFpuPair<WriteCvtSS2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPS2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPS2IY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : ZnWriteResFpuPair<WriteCvtSD2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPD2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPD2IY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+defm : ZnWriteResFpuPair<WriteCvtI2SS, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PS, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PSY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : ZnWriteResFpuPair<WriteCvtI2SD, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PD, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PDY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+defm : ZnWriteResFpuPair<WriteFDiv, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFDivX, [ZnFPU3], 15>;
+//defm : ZnWriteResFpuPair<WriteFDivY, [ZnFPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : ZnWriteResFpuPair<WriteFDiv64, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFDiv64X, [ZnFPU3], 15>;
+//defm : ZnWriteResFpuPair<WriteFDiv64Y, [ZnFPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : ZnWriteResFpuPair<WriteFSign, [ZnFPU3], 2>;
+defm : ZnWriteResFpuPair<WriteFRnd, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require an extra uop?
+defm : ZnWriteResFpuPair<WriteFRndY, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require an extra uop?
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : ZnWriteResFpuPair<WriteFLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteFLogicY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : ZnWriteResFpuPair<WriteFTest, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteFTestY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFShuffleY, [ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : ZnWriteResFpuPair<WriteFVarShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFVarShuffleY,[ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : ZnWriteResFpuPair<WriteFMul, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMulX, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMulY, [ZnFPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : ZnWriteResFpuPair<WriteFMul64, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMul64X, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMul64Y, [ZnFPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : ZnWriteResFpuPair<WriteFMA, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFMAX, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFMAY, [ZnFPU03], 5>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : ZnWriteResFpuPair<WriteFRcp, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRcpX, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRcpY, [ZnFPU01], 5, [1], 1, 7, 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+//defm : ZnWriteResFpuPair<WriteFRsqrt, [ZnFPU02], 5>;
+defm : ZnWriteResFpuPair<WriteFRsqrtX, [ZnFPU01], 5, [1], 1, 7, 1>;
+//defm : ZnWriteResFpuPair<WriteFRsqrtY, [ZnFPU01], 5, [2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : ZnWriteResFpuPair<WriteFSqrt, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrtX, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrtY, [ZnFPU3], 28, [28], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : ZnWriteResFpuPair<WriteFSqrt64, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrt64X, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrt64Y, [ZnFPU3], 40, [40], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : ZnWriteResFpuPair<WriteFSqrt80, [ZnFPU3], 20, [20]>;
// Vector integer operations which use FPU units
-defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>;
-defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>;
+defm : X86WriteRes<WriteVecLoad, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [ZnAGU,ZnFPU01], 9, [1,3], 2>;
+defm : X86WriteRes<WriteVecStore, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [ZnFPU2], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [ZnFPU2], 3, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>;
+
+defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftX, [ZnFPU2], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftY, [ZnFPU2], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : ZnWriteResFpuPair<WriteVecShiftImm, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogicX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogicY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : ZnWriteResFpuPair<WriteVecTest, [ZnFPU12], 1, [2], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteVecTestY, [ZnFPU12], 1, [2], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALUX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALUY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteVecIMulX, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteVecIMulY, [ZnFPU0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4, [1], 1, 7, 1>; // FIXME
+defm : ZnWriteResFpuPair<WritePMULLDY, [ZnFPU0], 5, [2], 1, 7, 1>; // FIXME
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteShuffleX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteShuffleY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : ZnWriteResFpuPair<WriteVarShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVarShuffleX,[ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVarShuffleY,[ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteBlendY, [ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WritePSADBW, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WritePSADBWX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WritePSADBWY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : ZnWriteResFpuPair<WritePHMINPOS, [ZnFPU0], 4>;
// Vector Shift Operations
-defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShiftY, [ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+defm : ZnWriteResFpuPair<WriteVecInsert, [ZnFPU], 1>;
+
+def : WriteRes<WriteVecExtract, [ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteVecExtractSt, [ZnAGU, ZnFPU12, ZnFPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2, 3];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [ZnFPU2]>;
+def : WriteRes<WriteMMXMOVMSK, [ZnFPU2]>;
+def : WriteRes<WriteVecMOVMSK, [ZnFPU2]>;
+
+def : WriteRes<WriteVecMOVMSKY, [ZnFPU2]> {
+ let NumMicroOps = 2;
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
// AES Instructions.
-defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>;
+defm : ZnWriteResFpuPair<WriteAESDecEnc, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESIMC, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>;
def : WriteRes<WriteFence, [ZnAGU]>;
def : WriteRes<WriteNop, []>;
// Following instructions with latency=100 are microcoded.
// We set long latency so as to block the entire pipeline.
-defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>;
-
-//Microcoded Instructions
-let Latency = 100 in {
- def : WriteRes<WriteMicrocoded, []>;
- def : WriteRes<WriteSystem, []>;
- def : WriteRes<WriteMPSAD, []>;
- def : WriteRes<WriteMPSADLd, []>;
- def : WriteRes<WriteCLMul, []>;
- def : WriteRes<WriteCLMulLd, []>;
- def : WriteRes<WritePCmpIStrM, []>;
- def : WriteRes<WritePCmpIStrMLd, []>;
- def : WriteRes<WritePCmpEStrI, []>;
- def : WriteRes<WritePCmpEStrILd, []>;
- def : WriteRes<WritePCmpEStrM, []>;
- def : WriteRes<WritePCmpEStrMLd, []>;
- def : WriteRes<WritePCmpIStrI, []>;
- def : WriteRes<WritePCmpIStrILd, []>;
- }
-
-//=== Regex based itineraries ===//
+defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>;
+defm : ZnWriteResFpuPair<WriteFVarShuffle256, [ZnFPU], 100>;
+
+// Microcoded Instructions
+def ZnWriteMicrocoded : SchedWriteRes<[]> {
+ let Latency = 100;
+}
+
+def : SchedAlias<WriteMicrocoded, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFCMOV, ZnWriteMicrocoded>;
+def : SchedAlias<WriteSystem, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSAD, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADY, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADYLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCLMul, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCLMulLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrM, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrMLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrI, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrILd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrM, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrMLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrI, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrILd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteLDMXCSR, ZnWriteMicrocoded>;
+def : SchedAlias<WriteSTMXCSR, ZnWriteMicrocoded>;
+
+//=== Regex-based InstRW ===//
// Notation:
// - r: register.
// - m = memory.
@@ -247,14 +492,6 @@ def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>;
// r,m.
def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
-// CMOVcc.
-// r,r.
-def : InstRW<[WriteALU],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteALULd, ReadAfterLd],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
-
// XCHG.
// r,r.
def ZnWriteXCHG : SchedWriteRes<[ZnALU]> {
@@ -271,7 +508,7 @@ def ZnWriteXCHGrm : SchedWriteRes<[ZnAGU, ZnALU]> {
}
def : InstRW<[ZnWriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
-def : InstRW<[WriteMicrocoded], (instregex "XLAT")>;
+def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
// POP16.
// r.
@@ -302,20 +539,7 @@ def ZnWritePushA : SchedWriteRes<[ZnAGU]> {
def : InstRW<[ZnWritePushA], (instregex "PUSHA(16|32)")>;
//LAHF
-def : InstRW<[WriteMicrocoded], (instregex "LAHF")>;
-
-// SAHF.
-def ZnWriteSAHF : SchedWriteRes<[ZnALU]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteSAHF], (instregex "SAHF")>;
-
-// BSWAP.
-def ZnWriteBSwap : SchedWriteRes<[ZnALU]> {
- let ResourceCycles = [4];
-}
-def : InstRW<[ZnWriteBSwap], (instregex "BSWAP")>;
+def : InstRW<[WriteMicrocoded], (instrs LAHF)>;
// MOVBE.
// r,m.
@@ -336,16 +560,6 @@ def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
"(ADD|SUB)64mi32")>;
// ADC SBB.
-// r,r/i.
-def : InstRW<[WriteALU], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
- "(ADC|SBB)(16|32|64)ri8",
- "(ADC|SBB)64ri32",
- "(ADC|SBB)(8|16|32|64)rr_REV")>;
-
-// r,m.
-def : InstRW<[WriteALULd, ReadAfterLd],
- (instregex "(ADC|SBB)(8|16|32|64)rm")>;
-
// m,r/i.
def : InstRW<[WriteALULd],
(instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
@@ -355,59 +569,52 @@ def : InstRW<[WriteALULd],
// INC DEC NOT NEG.
// m.
def : InstRW<[WriteALULd],
- (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
- "(INC|DEC)64(16|32)m")>;
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m")>;
// MUL IMUL.
// r16.
def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteMul16], (instregex "IMUL16r", "MUL16r")>;
+def : InstRW<[ZnWriteMul16], (instrs IMUL16r, MUL16r)>;
+def : InstRW<[ZnWriteMul16], (instrs IMUL16rr, IMUL16rri, IMUL16rri8)>; // TODO: is this right?
+def : InstRW<[ZnWriteMul16], (instrs IMUL16rm, IMUL16rmi, IMUL16rmi8)>; // TODO: this is definitely wrong but matches what the instregex did.
// m16.
def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
-def : InstRW<[ZnWriteMul16Ld, ReadAfterLd], (instregex "IMUL16m", "MUL16m")>;
+def : InstRW<[ZnWriteMul16Ld, ReadAfterLd], (instrs IMUL16m, MUL16m)>;
// r32.
def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteMul32], (instregex "IMUL32r", "MUL32r")>;
+def : InstRW<[ZnWriteMul32], (instrs IMUL32r, MUL32r)>;
+def : InstRW<[ZnWriteMul32], (instrs IMUL32rr, IMUL32rri, IMUL32rri8)>; // TODO: is this right?
+def : InstRW<[ZnWriteMul32], (instrs IMUL32rm, IMUL32rmi, IMUL32rmi8)>; // TODO: this is definitely wrong but matches what the instregex did.
// m32.
def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
-def : InstRW<[ZnWriteMul32Ld, ReadAfterLd], (instregex "IMUL32m", "MUL32m")>;
+def : InstRW<[ZnWriteMul32Ld, ReadAfterLd], (instrs IMUL32m, MUL32m)>;
// r64.
def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 4;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteMul64], (instregex "IMUL64r", "MUL64r")>;
+def : InstRW<[ZnWriteMul64], (instrs IMUL64r, MUL64r)>;
+def : InstRW<[ZnWriteMul64], (instrs IMUL64rr, IMUL64rri8, IMUL64rri32)>; // TODO: is this right?
+def : InstRW<[ZnWriteMul64], (instrs IMUL64rm, IMUL64rmi32, IMUL64rmi8)>; // TODO: this is definitely wrong but matches what the instregex did.
// m64.
def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 9;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteMul64Ld, ReadAfterLd], (instregex "IMUL64m", "MUL64m")>;
-
-// r16,r16.
-def ZnWriteMul16rri : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
- let Latency = 3;
-}
-def : InstRW<[ZnWriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
-
-// r16,m16.
-def ZnWriteMul16rmi : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
- let Latency = 8;
-}
-def : InstRW<[ZnWriteMul16rmi, ReadAfterLd], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+def : InstRW<[ZnWriteMul64Ld, ReadAfterLd], (instrs IMUL64m, MUL64m)>;
// MULX.
// r32,r32,r32.
@@ -415,72 +622,43 @@ def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 3;
let ResourceCycles = [1, 2];
}
-def : InstRW<[ZnWriteMulX32], (instregex "MULX32rr")>;
+def : InstRW<[ZnWriteMulX32], (instrs MULX32rr)>;
// r32,r32,m32.
def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
let ResourceCycles = [1, 2, 2];
}
-def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instregex "MULX32rm")>;
+def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>;
// r64,r64,r64.
def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteMulX64], (instregex "MULX64rr")>;
+def : InstRW<[ZnWriteMulX64], (instrs MULX64rr)>;
// r64,r64,m64.
def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
-def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instregex "MULX64rm")>;
-
-// DIV, IDIV.
-// r8.
-def ZnWriteDiv8 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 15;
-}
-def : InstRW<[ZnWriteDiv8], (instregex "DIV8r", "IDIV8r")>;
-
-// r16.
-def ZnWriteDiv16 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 17;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteDiv16], (instregex "DIV16r", "IDIV16r")>;
-
-// r32.
-def ZnWriteDiv32 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 25;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteDiv32], (instregex "DIV32r", "IDIV32r")>;
-
-// r64.
-def ZnWriteDiv64 : SchedWriteRes<[ZnALU2, ZnDivider]> {
- let Latency = 41;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteDiv64], (instregex "DIV64r", "IDIV64r")>;
+def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>;
//-- Control transfer instructions --//
// J(E|R)CXZ.
def ZnWriteJCXZ : SchedWriteRes<[ZnALU03]>;
-def : InstRW<[ZnWriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+def : InstRW<[ZnWriteJCXZ], (instrs JCXZ, JECXZ, JRCXZ)>;
// INTO
-def : InstRW<[WriteMicrocoded], (instregex "INTO")>;
+def : InstRW<[WriteMicrocoded], (instrs INTO)>;
// LOOP.
def ZnWriteLOOP : SchedWriteRes<[ZnALU03]>;
-def : InstRW<[ZnWriteLOOP], (instregex "LOOP")>;
+def : InstRW<[ZnWriteLOOP], (instrs LOOP)>;
// LOOP(N)E, LOOP(N)Z
def ZnWriteLOOPE : SchedWriteRes<[ZnALU03]>;
-def : InstRW<[ZnWriteLOOPE], (instregex "LOOPE", "LOOPNE",
- "LOOPZ", "LOOPNZ")>;
+def : InstRW<[ZnWriteLOOPE], (instrs LOOPE, LOOPNE)>;
// CALL.
// r.
@@ -494,7 +672,7 @@ def ZnWriteRET : SchedWriteRes<[ZnALU03]> {
let NumMicroOps = 2;
}
def : InstRW<[ZnWriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
- "IRET(D|Q)", "RETF")>;
+ "IRET(16|32|64)")>;
//-- Logic instructions --//
@@ -504,12 +682,6 @@ def : InstRW<[WriteALULd],
(instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
"(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
-// ANDN.
-// r,r.
-def : InstRW<[WriteALU], (instregex "ANDN(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteALULd, ReadAfterLd], (instregex "ANDN(32|64)rm")>;
-
// Define ALU latency variants
def ZnWriteALULat2 : SchedWriteRes<[ZnALU]> {
let Latency = 2;
@@ -518,24 +690,8 @@ def ZnWriteALULat2Ld : SchedWriteRes<[ZnAGU, ZnALU]> {
let Latency = 6;
}
-def ZnWriteALULat3 : SchedWriteRes<[ZnALU]> {
- let Latency = 3;
-}
-def ZnWriteALULat3Ld : SchedWriteRes<[ZnAGU, ZnALU]> {
- let Latency = 7;
-}
-
-// BSF BSR.
-// r,r.
-def : InstRW<[ZnWriteALULat3], (instregex "BS(R|F)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[ZnWriteALULat3Ld, ReadAfterLd], (instregex "BS(R|F)(16|32|64)rm")>;
-
// BT.
-// r,r/i.
-def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
-
-def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mr")>;
+// m,i.
def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
// BTR BTS BTC.
@@ -546,7 +702,6 @@ def ZnWriteBTRSC : SchedWriteRes<[ZnALU]> {
}
def : InstRW<[ZnWriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
-
// m,r,i.
def ZnWriteBTRSCm : SchedWriteRes<[ZnAGU, ZnALU]> {
let Latency = 6;
@@ -559,79 +714,35 @@ def : InstRW<[ZnWriteBTRSCm], (instregex "BT(R|S|C)(16|32|64)m(r|i8)")>;
// r,r.
def : InstRW<[ZnWriteALULat2], (instregex "BLS(I|MSK|R)(32|64)rr")>;
// r,m.
-def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "BLS(I|MSK|R)(32|64)rm")>;
-
-// BEXTR.
-// r,r,r.
-def : InstRW<[WriteALU], (instregex "BEXTR(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BEXTR(32|64)rm")>;
-
-// BZHI.
-// r,r,r.
-def : InstRW<[WriteALU], (instregex "BZHI(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BZHI(32|64)rm")>;
+def : InstRW<[ZnWriteALULat2Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
// CLD STD.
-def : InstRW<[WriteALU], (instregex "STD", "CLD")>;
+def : InstRW<[WriteALU], (instrs STD, CLD)>;
// PDEP PEXT.
// r,r,r.
def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
-// r,m,r.
+// r,r,m.
def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
-// ROR ROL.
-def : InstRW<[WriteShift], (instregex "RO(R|L)(8|16|32|64)r1")>;
-
// RCR RCL.
-// r,1.
-def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r1")>;
-
-// m,1.
-def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m1")>;
-
-// i.
-def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
-
// m,i.
-def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(1|i|CL)")>;
// SHR SHL SAR.
// m,i.
def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
// SHRD SHLD.
-// r,r
-def : InstRW<[WriteShift], (instregex "SH(R|L)D(16|32|64)rri8")>;
-
// m,r
def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>;
// r,r,cl.
-def : InstRW<[WriteMicrocoded], (instregex "SHLD(16|32|64)rrCL")>;
-
-// r,r,cl.
-def : InstRW<[WriteMicrocoded], (instregex "SHRD(16|32|64)rrCL")>;
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)rrCL")>;
// m,r,cl.
def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>;
-// SETcc.
-// r.
-def : InstRW<[WriteShift],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
-// m.
-def : InstRW<[WriteShift],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
-
-// LZCNT TZCNT.
-// r,r.
-def : InstRW<[ZnWriteALULat2], (instregex "(LZCNT|TZCNT)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "(LZCNT|TZCNT)(16|32|64)rm")>;
-
//-- Misc instructions --//
// CMPXCHG.
def ZnWriteCMPXCHG : SchedWriteRes<[ZnAGU, ZnALU]> {
@@ -644,9 +755,9 @@ def : InstRW<[ZnWriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
def ZnWriteCMPXCHG8B : SchedWriteRes<[ZnAGU, ZnALU]> {
let NumMicroOps = 18;
}
-def : InstRW<[ZnWriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+def : InstRW<[ZnWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
-def : InstRW<[WriteMicrocoded], (instregex "CMPXCHG16B")>;
+def : InstRW<[WriteMicrocoded], (instrs CMPXCHG16B)>;
// LEAVE
def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> {
@@ -656,13 +767,13 @@ def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> {
def : InstRW<[ZnWriteLEAVE], (instregex "LEAVE")>;
// PAUSE.
-def : InstRW<[WriteMicrocoded], (instregex "PAUSE")>;
+def : InstRW<[WriteMicrocoded], (instrs PAUSE)>;
// RDTSC.
def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
// RDPMC.
-def : InstRW<[WriteMicrocoded], (instregex "RDPMC")>;
+def : InstRW<[WriteMicrocoded], (instrs RDPMC)>;
// RDRAND.
def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>;
@@ -732,7 +843,7 @@ def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>;
def ZnWriteFXCH : SchedWriteRes<[ZnFPU]>;
// FXCH.
-def : InstRW<[ZnWriteFXCH], (instregex "XCH_F")>;
+def : InstRW<[ZnWriteFXCH], (instrs XCH_F)>;
// FILD.
def ZnWriteFILD : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -756,31 +867,29 @@ def ZnWriteFPU3 : SchedWriteRes<[ZnAGU, ZnFPU3]> {
}
// FLDZ.
-def : InstRW<[ZnWriteFPU13], (instregex "LD_F0")>;
+def : SchedAlias<WriteFLD0, ZnWriteFPU13>;
// FLD1.
-def : InstRW<[ZnWriteFPU3], (instregex "LD_F1")>;
+def : SchedAlias<WriteFLD1, ZnWriteFPU3>;
// FLDPI FLDL2E etc.
-def : InstRW<[ZnWriteFPU3], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>;
-
-def : InstRW<[WriteMicrocoded], (instregex "CMOV(B|BE|E|P|NB|NBE|NE|NP)_F")>;
+def : SchedAlias<WriteFLDC, ZnWriteFPU3>;
// FNSTSW.
// AX.
-def : InstRW<[WriteMicrocoded], (instregex "FNSTSW16r")>;
+def : InstRW<[WriteMicrocoded], (instrs FNSTSW16r)>;
// m16.
-def : InstRW<[WriteMicrocoded], (instregex "FNSTSWm")>;
+def : InstRW<[WriteMicrocoded], (instrs FNSTSWm)>;
// FLDCW.
-def : InstRW<[WriteMicrocoded], (instregex "FLDCW16m")>;
+def : InstRW<[WriteMicrocoded], (instrs FLDCW16m)>;
// FNSTCW.
-def : InstRW<[WriteMicrocoded], (instregex "FNSTCW16m")>;
+def : InstRW<[WriteMicrocoded], (instrs FNSTCW16m)>;
// FINCSTP FDECSTP.
-def : InstRW<[ZnWriteFPU3], (instregex "FINCSTP", "FDECSTP")>;
+def : InstRW<[ZnWriteFPU3], (instrs FINCSTP, FDECSTP)>;
// FFREE.
def : InstRW<[ZnWriteFPU3], (instregex "FFREE")>;
@@ -793,14 +902,6 @@ def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>;
//-- Arithmetic instructions --//
-def ZnWriteFPU3Lat2 : SchedWriteRes<[ZnFPU3]> {
- let Latency = 2;
-}
-
-def ZnWriteFPU3Lat2Ld : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 9;
-}
-
def ZnWriteFPU3Lat1 : SchedWriteRes<[ZnFPU3]> ;
def ZnWriteFPU0Lat1 : SchedWriteRes<[ZnFPU0]> ;
@@ -809,22 +910,18 @@ def ZnWriteFPU0Lat1Ld : SchedWriteRes<[ZnAGU, ZnFPU0]> {
let Latency = 8;
}
-// FABS.
-def : InstRW<[ZnWriteFPU3Lat2], (instregex "ABS_F")>;
-
// FCHS.
def : InstRW<[ZnWriteFPU3Lat1], (instregex "CHS_F")>;
// FCOM(P) FUCOM(P).
// r.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
- "UCOM_FPr")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM(P?)_FST0r", "UCOM_F(P?)r")>;
// m.
-def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
+def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(P?)(32|64)m")>;
// FCOMPP FUCOMPP.
// r.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "FCOMPP", "UCOM_FPPr")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instrs FCOMPP, UCOM_FPPr)>;
def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]>
{
@@ -833,8 +930,7 @@ def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]>
// FCOMI(P) FUCOMI(P).
// m.
-def : InstRW<[ZnWriteFPU02], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
- "UCOM_FIPr")>;
+def : InstRW<[ZnWriteFPU02], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]>
{
@@ -844,92 +940,42 @@ def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]>
}
// FICOM(P).
-def : InstRW<[ZnWriteFPU03], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
+def : InstRW<[ZnWriteFPU03], (instregex "FICOM(P?)(16|32)m")>;
// FTST.
def : InstRW<[ZnWriteFPU0Lat1], (instregex "TST_F")>;
// FXAM.
-def : InstRW<[ZnWriteFPU3Lat1], (instregex "FXAM")>;
+def : InstRW<[ZnWriteFPU3Lat1], (instrs FXAM)>;
// FPREM.
-def : InstRW<[WriteMicrocoded], (instregex "FPREM")>;
+def : InstRW<[WriteMicrocoded], (instrs FPREM)>;
// FPREM1.
-def : InstRW<[WriteMicrocoded], (instregex "FPREM1")>;
+def : InstRW<[WriteMicrocoded], (instrs FPREM1)>;
// FRNDINT.
-def : InstRW<[WriteMicrocoded], (instregex "FRNDINT")>;
+def : InstRW<[WriteMicrocoded], (instrs FRNDINT)>;
// FSCALE.
-def : InstRW<[WriteMicrocoded], (instregex "FSCALE")>;
+def : InstRW<[WriteMicrocoded], (instrs FSCALE)>;
// FXTRACT.
-def : InstRW<[WriteMicrocoded], (instregex "FXTRACT")>;
+def : InstRW<[WriteMicrocoded], (instrs FXTRACT)>;
// FNOP.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "FNOP")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instrs FNOP)>;
// WAIT.
-def : InstRW<[ZnWriteFPU0Lat1], (instregex "WAIT")>;
+def : InstRW<[ZnWriteFPU0Lat1], (instrs WAIT)>;
// FNCLEX.
-def : InstRW<[WriteMicrocoded], (instregex "FNCLEX")>;
+def : InstRW<[WriteMicrocoded], (instrs FNCLEX)>;
// FNINIT.
-def : InstRW<[WriteMicrocoded], (instregex "FNINIT")>;
+def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
//=== Integer MMX and XMM Instructions ===//
-//-- Move instructions --//
-
-// Moves from GPR to FPR incurs a penalty
-def ZnWriteFPU2 : SchedWriteRes<[ZnFPU2]> {
- let Latency = 3;
-}
-
-// Move to ALU doesn't incur penalty
-def ZnWriteToALU2 : SchedWriteRes<[ZnFPU2]> {
- let Latency = 2;
-}
-
-def ZnWriteFPU : SchedWriteRes<[ZnFPU]>;
-def ZnWriteFPUY : SchedWriteRes<[ZnFPU]> {
- let NumMicroOps = 2;
- let Latency=2;
-}
-
-// MOVD.
-// r32/64 <- (x)mm.
-def : InstRW<[ZnWriteToALU2], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
- "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
-
-// (x)mm <- r32/64.
-def : InstRW<[ZnWriteFPU2], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
- "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
-
-// MOVQ.
-// r64 <- (x)mm.
-def : InstRW<[ZnWriteToALU2], (instregex "VMOVPQIto64rr")>;
-
-// (x)mm <- r64.
-def : InstRW<[ZnWriteFPU2], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
-
-// (x)mm <- (x)mm.
-def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ64rr")>;
-
-// (V)MOVDQA/U.
-// x <- x.
-def : InstRW<[ZnWriteFPU], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
- "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV")>;
-
-// y <- y.
-def : InstRW<[ZnWriteFPUY], (instregex "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
-
-// MOVDQ2Q.
-def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVDQ2Qrr")>;
-
-// MOVQ2DQ.
-def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ2DQrr")>;
// PACKSSWB/DW.
// mm <- mm.
@@ -938,15 +984,22 @@ def ZnWriteFPU12Y : SchedWriteRes<[ZnFPU12]> {
let NumMicroOps = 2;
}
def ZnWriteFPU12m : SchedWriteRes<[ZnAGU, ZnFPU12]> ;
+def ZnWriteFPU12Ym : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
-def : InstRW<[ZnWriteFPU12], (instregex "MMX_PACKSSDWirr",
- "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
-def : InstRW<[ZnWriteFPU12m], (instregex "MMX_PACKSSDWirm",
- "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
-// VPMOVSX/ZX BW BD BQ DW DQ.
+// VPMOVSX/ZX BW BD BQ WD WQ DQ.
// y <- x.
-def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
+def : InstRW<[ZnWriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>;
def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ;
def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> {
@@ -969,12 +1022,12 @@ def ZnWriteFPU013LdY : SchedWriteRes<[ZnAGU, ZnFPU013]> {
// x,x,i / v,v,v,i
def : InstRW<[ZnWriteFPU013], (instregex "(V?)PBLENDWrri")>;
// ymm
-def : InstRW<[ZnWriteFPU013Y], (instregex "(V?)PBLENDWYrri")>;
+def : InstRW<[ZnWriteFPU013Y], (instrs VPBLENDWYrri)>;
// x,m,i / v,v,m,i
def : InstRW<[ZnWriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>;
// y,m,i
-def : InstRW<[ZnWriteFPU013LdY], (instregex "(V?)PBLENDWYrmi")>;
+def : InstRW<[ZnWriteFPU013LdY], (instrs VPBLENDWYrmi)>;
def ZnWriteFPU01 : SchedWriteRes<[ZnFPU01]> ;
def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> {
@@ -983,9 +1036,9 @@ def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> {
// VPBLENDD.
// v,v,v,i.
-def : InstRW<[ZnWriteFPU01], (instregex "VPBLENDDrri")>;
+def : InstRW<[ZnWriteFPU01], (instrs VPBLENDDrri)>;
// ymm
-def : InstRW<[ZnWriteFPU01Y], (instregex "VPBLENDDYrri")>;
+def : InstRW<[ZnWriteFPU01Y], (instrs VPBLENDDYrri)>;
// v,v,m,i
def ZnWriteFPU01Op2 : SchedWriteRes<[ZnAGU, ZnFPU01]> {
@@ -998,8 +1051,8 @@ def ZnWriteFPU01Op2Y : SchedWriteRes<[ZnAGU, ZnFPU01]> {
let Latency = 9;
let ResourceCycles = [1, 3];
}
-def : InstRW<[ZnWriteFPU01Op2], (instregex "VPBLENDDrmi")>;
-def : InstRW<[ZnWriteFPU01Op2Y], (instregex "VPBLENDDYrmi")>;
+def : InstRW<[ZnWriteFPU01Op2], (instrs VPBLENDDrmi)>;
+def : InstRW<[ZnWriteFPU01Op2Y], (instrs VPBLENDDYrmi)>;
// MASKMOVQ.
def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
@@ -1007,42 +1060,13 @@ def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
// MASKMOVDQU.
def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>;
-// VPMASKMOVQ.
+// VPMASKMOVD.
// ymm
-def : InstRW<[ZnWriteFPU01Op2],(instregex "VPMASKMOVQrm")>;
-def : InstRW<[ZnWriteFPU01Op2Y],(instregex "VPMASKMOVQYrm")>;
-
def : InstRW<[WriteMicrocoded],
(instregex "VPMASKMOVD(Y?)rm")>;
// m, v,v.
def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
-// PMOVMSKB.
-def ZnWritePMOVMSKB : SchedWriteRes<[ZnFPU2]> {
- let NumMicroOps = 2;
-}
-def ZnWritePMOVMSKBY : SchedWriteRes<[ZnFPU2]> {
- let Latency = 2;
-}
-def : InstRW<[ZnWritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKBrr")>;
-def : InstRW<[ZnWritePMOVMSKBY], (instregex "(V|MMX_)?PMOVMSKBYrr")>;
-
-// PEXTR B/W/D/Q.
-// r32,x,i.
-def ZnWritePEXTRr : SchedWriteRes<[ZnFPU12, ZnFPU2]> {
- let Latency = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
-
-def ZnWritePEXTRm : SchedWriteRes<[ZnAGU, ZnFPU12, ZnFPU2]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2, 3];
-}
-// m8,x,i.
-def : InstRW<[ZnWritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
-
// VPBROADCAST B/W.
// x, m8/16.
def ZnWriteVPBROADCAST128Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
@@ -1069,13 +1093,12 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
// HADD, HSUB PS/PD
// PHADD|PHSUB (S) W/D.
-def : InstRW<[WriteMicrocoded], (instregex "MMX_PHADD(W?)r(r|m)64",
- "MMX_PHADDSWr(r|m)64",
- "MMX_PHSUB(W|D)r(r|m)64",
- "MMX_PHSUBSWrr64",
- "(V?)PH(ADD|SUB)(W|D)(Y?)r(r|m)",
- "(V?)PH(ADD|SUB)SWr(r|m)(256)?")>;
-
+def : SchedAlias<WritePHAdd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddX, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddXLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddY, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddYLd, ZnWriteMicrocoded>;
// PCMPGTQ.
def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>;
@@ -1092,69 +1115,16 @@ def ZnWritePCMPGTQYm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
let ResourceCycles = [1,2];
}
def : InstRW<[ZnWritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>;
-def : InstRW<[ZnWritePCMPGTQYm], (instregex "(V?)PCMPGTQYrm")>;
-
-// PMULLD.
-// x,x.
-def ZnWritePMULLDr : SchedWriteRes<[ZnFPU0]> {
- let Latency = 4;
-}
-// ymm.
-def ZnWritePMULLDYr : SchedWriteRes<[ZnFPU0]> {
- let Latency = 5;
- let ResourceCycles = [2];
-}
-def : InstRW<[ZnWritePMULLDr], (instregex "(V?)PMULLDrr")>;
-def : InstRW<[ZnWritePMULLDYr], (instregex "(V?)PMULLDYrr")>;
-
-// x,m.
-def ZnWritePMULLDm : SchedWriteRes<[ZnAGU, ZnFPU0]> {
- let Latency = 11;
- let NumMicroOps = 2;
-}
-// y,m.
-def ZnWritePMULLDYm : SchedWriteRes<[ZnAGU, ZnFPU0]> {
- let Latency = 12;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWritePMULLDm], (instregex "(V?)PMULLDrm")>;
-def : InstRW<[ZnWritePMULLDYm], (instregex "(V?)PMULLDYrm")>;
+def : InstRW<[ZnWritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
//-- Logic instructions --//
-// PTEST.
-// v,v.
-def ZnWritePTESTr : SchedWriteRes<[ZnFPU12]> {
- let ResourceCycles = [2];
-}
-def : InstRW<[ZnWritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
-
-// v,m.
-def ZnWritePTESTm : SchedWriteRes<[ZnAGU, ZnFPU12]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWritePTESTm], (instregex "(V?)PTEST(Y?)rm")>;
-
// PSLL,PSRL,PSRA W/D/Q.
// x,x / v,v,x.
def ZnWritePShift : SchedWriteRes<[ZnFPU2]> ;
def ZnWritePShiftY : SchedWriteRes<[ZnFPU2]> {
let Latency = 2;
}
-def ZnWritePShiftLd : SchedWriteRes<[ZnAGU,ZnFPU2]> {
- let Latency = 8;
-}
-def ZnWritePShiftYLd : SchedWriteRes<[ZnAGU, ZnFPU2]> {
- let Latency = 9;
-}
-def : InstRW<[ZnWritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rr")>;
-def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrr")>;
-
-def : InstRW<[ZnWritePShiftLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rm")>;
-def : InstRW<[ZnWritePShiftYLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrm")>;
// PSLL,PSRL DQ.
def : InstRW<[ZnWritePShift], (instregex "(V?)PS(R|L)LDQri")>;
@@ -1163,33 +1133,16 @@ def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>;
//=== Floating Point XMM and YMM Instructions ===//
//-- Move instructions --//
-// MOVMSKP S/D.
-// r32 <- x,y.
-def ZnWriteMOVMSKPr : SchedWriteRes<[ZnFPU2]> ;
-def : InstRW<[ZnWriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)(Y?)rr")>;
-
// VPERM2F128.
-def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rr")>;
-def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rm")>;
-
-// BLENDVP S/D.
-def ZnWriteFPU01Lat3 : SchedWriteRes<[ZnFPU013]> {
- let Latency = 3;
-}
-def ZnWriteFPU01Lat3Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> {
- let Latency = 11;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWriteFPU01Lat3], (instregex "BLENDVP(S|D)rr0")>;
-def : InstRW<[ZnWriteFPU01Lat3Ld, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rr)>;
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rm)>;
def ZnWriteBROADCAST : SchedWriteRes<[ZnAGU, ZnFPU13]> {
let NumMicroOps = 2;
let Latency = 8;
}
// VBROADCASTF128.
-def : InstRW<[ZnWriteBROADCAST], (instregex "VBROADCASTF128")>;
+def : InstRW<[ZnWriteBROADCAST], (instrs VBROADCASTF128)>;
// EXTRACTPS.
// r32,x,i.
@@ -1210,10 +1163,10 @@ def : InstRW<[ZnWriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
// VEXTRACTF128.
// x,y,i.
-def : InstRW<[ZnWriteFPU013], (instregex "VEXTRACTF128rr")>;
+def : InstRW<[ZnWriteFPU013], (instrs VEXTRACTF128rr)>;
// m128,y,i.
-def : InstRW<[ZnWriteFPU013m], (instregex "VEXTRACTF128mr")>;
+def : InstRW<[ZnWriteFPU013m], (instrs VEXTRACTF128mr)>;
def ZnWriteVINSERT128r: SchedWriteRes<[ZnFPU013]> {
let Latency = 2;
@@ -1226,69 +1179,27 @@ def ZnWriteVINSERT128Ld: SchedWriteRes<[ZnAGU,ZnFPU013]> {
}
// VINSERTF128.
// y,y,x,i.
-def : InstRW<[ZnWriteVINSERT128r], (instregex "VINSERTF128rr")>;
-def : InstRW<[ZnWriteVINSERT128Ld], (instregex "VINSERTF128rm")>;
-
-// VMASKMOVP S/D.
-// x,x,m.
-def ZnWriteVMASKMOVPLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 8;
-}
-// y,y,m.
-def ZnWriteVMASKMOVPLdY : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 8;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def ZnWriteVMASKMOVPm : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 4;
-}
-def : InstRW<[ZnWriteVMASKMOVPLd], (instregex "VMASKMOVP(S|D)rm")>;
-def : InstRW<[ZnWriteVMASKMOVPLdY], (instregex "VMASKMOVP(S|D)Yrm")>;
-def : InstRW<[ZnWriteVMASKMOVPm], (instregex "VMASKMOVP(S|D)mr")>;
-
-// m256,y,y.
-def ZnWriteVMASKMOVPYmr : SchedWriteRes<[ZnAGU,ZnFPU01]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
-
-// VGATHERDPS.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSrm")>;
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSYrm")>;
-
-// VGATHERQPS.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSrm")>;
+def : InstRW<[ZnWriteVINSERT128r], (instrs VINSERTF128rr)>;
+def : InstRW<[ZnWriteVINSERT128Ld], (instrs VINSERTF128rm)>;
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSYrm")>;
-
-// VGATHERDPD.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDrm")>;
-
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDYrm")>;
-
-// VGATHERQPD.
-// x.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDrm")>;
-
-// y.
-def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDYrm")>;
+// VGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHER(Q|D)(PD|PS)(Y?)rm")>;
//-- Conversion instructions --//
def ZnWriteCVTPD2PSr: SchedWriteRes<[ZnFPU3]> {
let Latency = 4;
}
+def ZnWriteCVTPD2PSYr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 5;
+}
+
// CVTPD2PS.
// x,x.
-def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(V?)CVTPD2PSrr")>;
+def : SchedAlias<WriteCvtPD2PS, ZnWriteCVTPD2PSr>;
+// y,y.
+def : SchedAlias<WriteCvtPD2PSY, ZnWriteCVTPD2PSYr>;
+// z,z.
+defm : X86WriteResUnsupported<WriteCvtPD2PSZ>;
def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> {
let Latency = 11;
@@ -1296,34 +1207,30 @@ def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> {
let ResourceCycles = [1,2];
}
// x,m128.
-def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(V?)CVTPD2PS(X?)rm")>;
-
-// x,y.
-def ZnWriteCVTPD2PSYr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteCVTPD2PSYr], (instregex "(V?)CVTPD2PSYrr")>;
+def : SchedAlias<WriteCvtPD2PSLd, ZnWriteCVTPD2PSLd>;
// x,m256.
def ZnWriteCVTPD2PSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let Latency = 11;
}
-def : InstRW<[ZnWriteCVTPD2PSYLd], (instregex "(V?)CVTPD2PSYrm")>;
+def : SchedAlias<WriteCvtPD2PSYLd, ZnWriteCVTPD2PSYLd>;
+// z,m512
+defm : X86WriteResUnsupported<WriteCvtPD2PSZLd>;
// CVTSD2SS.
// x,x.
// Same as WriteCVTPD2PSr
-def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+def : SchedAlias<WriteCvtSD2SS, ZnWriteCVTPD2PSr>;
// x,m64.
-def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+def : SchedAlias<WriteCvtSD2SSLd, ZnWriteCVTPD2PSLd>;
// CVTPS2PD.
// x,x.
def ZnWriteCVTPS2PDr : SchedWriteRes<[ZnFPU3]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteCVTPS2PDr], (instregex "(V?)CVTPS2PDrr")>;
+def : SchedAlias<WriteCvtPS2PD, ZnWriteCVTPS2PDr>;
// x,m64.
// y,m128.
@@ -1331,20 +1238,23 @@ def ZnWriteCVTPS2PDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let Latency = 10;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteCVTPS2PDLd], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+def : SchedAlias<WriteCvtPS2PDLd, ZnWriteCVTPS2PDLd>;
+def : SchedAlias<WriteCvtPS2PDYLd, ZnWriteCVTPS2PDLd>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZLd>;
// y,x.
def ZnWriteVCVTPS2PDY : SchedWriteRes<[ZnFPU3]> {
let Latency = 3;
}
-def : InstRW<[ZnWriteVCVTPS2PDY], (instregex "VCVTPS2PDYrr")>;
+def : SchedAlias<WriteCvtPS2PDY, ZnWriteVCVTPS2PDY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZ>;
// CVTSS2SD.
// x,x.
def ZnWriteCVTSS2SDr : SchedWriteRes<[ZnFPU3]> {
let Latency = 4;
}
-def : InstRW<[ZnWriteCVTSS2SDr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+def : SchedAlias<WriteCvtSS2SD, ZnWriteCVTSS2SDr>;
// x,m32.
def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -1352,7 +1262,7 @@ def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let NumMicroOps = 2;
let ResourceCycles = [1, 2];
}
-def : InstRW<[ZnWriteCVTSS2SDLd], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+def : SchedAlias<WriteCvtSS2SDLd, ZnWriteCVTSS2SDLd>;
def ZnWriteCVTDQ2PDr: SchedWriteRes<[ZnFPU12,ZnFPU3]> {
let Latency = 5;
@@ -1363,7 +1273,7 @@ def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
// Same as xmm
// y,x.
-def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "VCVTDQ2PDYrr")>;
+def : InstRW<[ZnWriteCVTDQ2PDr], (instrs VCVTDQ2PDYrr)>;
def ZnWriteCVTPD2DQr: SchedWriteRes<[ZnFPU12, ZnFPU3]> {
let Latency = 5;
@@ -1383,7 +1293,6 @@ def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>;
def : InstRW<[ZnWriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>;
// x,m256.
def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>;
-def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQ(64)?rm")>;
def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> {
let Latency = 4;
@@ -1394,7 +1303,7 @@ def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>;
// CVTPI2PD.
// x,mm.
-def : InstRW<[ZnWriteCVTPS2PDr], (instregex "MMX_CVT(T?)PI2PDirr")>;
+def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
// CVT(T)PD2PI.
// mm,x.
@@ -1403,24 +1312,21 @@ def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> {
let Latency = 5;
}
-// CVSTSI2SS.
-// x,r32.
-def : InstRW<[ZnWriteCVSTSI2SSr], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
// same as CVTPD2DQr
// CVT(T)SS2SI.
// r32,x.
-def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
// same as CVTPD2DQm
// r32,m32.
-def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
def ZnWriteCVSTSI2SDr: SchedWriteRes<[ZnFPU013, ZnFPU3]> {
let Latency = 5;
}
// CVTSI2SD.
// x,r32/64.
-def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>;
+def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(V?)CVTSI(64)?2SDrr")>;
def ZnWriteCVSTSI2SIr: SchedWriteRes<[ZnFPU3, ZnFPU2]> {
@@ -1431,34 +1337,29 @@ def ZnWriteCVSTSI2SILd: SchedWriteRes<[ZnAGU, ZnFPU3, ZnFPU2]> {
}
// CVTSD2SI.
// r32/64
-def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?CVT(T?)SD2SI(64)?rr")>;
+def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>;
// r32,m32.
-def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?CVT(T?)SD2SI(64)?rm")>;
-
-
-def ZnWriteVCVSTSI2SIr: SchedWriteRes<[ZnFPU3]> {
- let Latency = 5;
-}
-def ZnWriteVCVSTSI2SILd: SchedWriteRes<[ZnFPU3, ZnAGU]> {
- let Latency = 12;
-}
-// VCVTSD2SI.
-// r32/64
-def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rm")>;
+def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>;
// VCVTPS2PH.
// x,v,i.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)rr")>;
+def : SchedAlias<WriteCvtPS2PH, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHY, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
// m,v,i.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)mr")>;
+def : SchedAlias<WriteCvtPS2PHSt, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHYSt, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
// VCVTPH2PS.
// v,x.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rr")>;
+def : SchedAlias<WriteCvtPH2PS, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSY, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
// v,m.
-def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rm")>;
+def : SchedAlias<WriteCvtPH2PSLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSYLd, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
//-- SSE4A instructions --//
// EXTRQ
@@ -1473,12 +1374,6 @@ def ZnWriteINSERTQ: SchedWriteRes<[ZnFPU03,ZnFPU1]> {
}
def : InstRW<[ZnWriteINSERTQ], (instregex "INSERTQ")>;
-// MOVNTSS/MOVNTSD
-def ZnWriteMOVNT: SchedWriteRes<[ZnAGU,ZnFPU2]> {
- let Latency = 8;
-}
-def : InstRW<[ZnWriteMOVNT], (instregex "MOVNTS(S|D)")>;
-
//-- SHA instructions --//
// SHA256MSG2
def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>;
@@ -1544,41 +1439,19 @@ def : InstRW<[ZnWriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
//-- Arithmetic instructions --//
// HADD, HSUB PS/PD
-def : InstRW<[WriteMicrocoded], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)r(r|m)")>;
-
-// MULL SS/SD PS/PD.
-// x,x / v,v,v.
-def ZnWriteMULr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 3;
-}
-// ymm.
-def ZnWriteMULYr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 4;
-}
-def : InstRW<[ZnWriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
-def : InstRW<[ZnWriteMULYr], (instregex "(V?)MUL(P|S)(S|D)Yrr")>;
-
-// x,m / v,v,m.
-def ZnWriteMULLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 10;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteMULLd], (instregex "(V?)MUL(P|S)(S|D)rm")>;
-
-// ymm
-def ZnWriteMULYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 11;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteMULYLd], (instregex "(V?)MUL(P|S)(S|D)Yrm")>;
+def : SchedAlias<WriteFHAdd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddY, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddYLd, ZnWriteMicrocoded>;
// VDIVPS.
+// TODO - convert to ZnWriteResFpuPair
// y,y,y.
def ZnWriteVDIVPSYr : SchedWriteRes<[ZnFPU3]> {
let Latency = 12;
let ResourceCycles = [12];
}
-def : InstRW<[ZnWriteVDIVPSYr], (instregex "VDIVPSYrr")>;
+def : SchedAlias<WriteFDivY, ZnWriteVDIVPSYr>;
// y,y,m256.
def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -1586,15 +1459,16 @@ def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let NumMicroOps = 2;
let ResourceCycles = [1, 19];
}
-def : InstRW<[ZnWriteVDIVPSYLd], (instregex "VDIVPSYrm")>;
+def : SchedAlias<WriteFDivYLd, ZnWriteVDIVPSYLd>;
// VDIVPD.
+// TODO - convert to ZnWriteResFpuPair
// y,y,y.
def ZnWriteVDIVPDY : SchedWriteRes<[ZnFPU3]> {
let Latency = 15;
let ResourceCycles = [15];
}
-def : InstRW<[ZnWriteVDIVPDY], (instregex "VDIVPDYrr")>;
+def : SchedAlias<WriteFDiv64Y, ZnWriteVDIVPDY>;
// y,y,m256.
def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
@@ -1602,173 +1476,63 @@ def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
let NumMicroOps = 2;
let ResourceCycles = [1,22];
}
-def : InstRW<[ZnWriteVDIVPDYLd], (instregex "VDIVPDYrm")>;
-
-// VRCPPS.
-// y,y.
-def ZnWriteVRCPPSr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
-
-// y,m256.
-def ZnWriteVRCPPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 12;
- let NumMicroOps = 3;
-}
-def : InstRW<[ZnWriteVRCPPSLd], (instregex "VRCPPSYm(_Int)?")>;
-
-// ROUND SS/SD PS/PD.
-// v,v,i.
-def ZnWriteROUNDr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 4;
-}
-def : InstRW<[ZnWriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
-
-// VFMADD.
-// v,v,v.
-def ZnWriteFMADDr : SchedWriteRes<[ZnFPU03]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteFMADDr],
- (instregex
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(213|132|231)(Y)?r",
- "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)r",
- "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
- "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
-
-// v,v,m.
-def ZnWriteFMADDm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
- let Latency = 12;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteFMADDm],
- (instregex
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)(213|132|231)P(S|D)(Y)?m",
- "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)m",
- "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
- "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
-
-// v,m,i.
-def ZnWriteROUNDm : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 11;
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+def : SchedAlias<WriteFDiv64YLd, ZnWriteVDIVPDYLd>;
// DPPS.
// x,x,i / v,v,v,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rri")>;
+def : SchedAlias<WriteDPPS, ZnWriteMicrocoded>;
+def : SchedAlias<WriteDPPSY, ZnWriteMicrocoded>;
// x,m,i / v,v,m,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rmi")>;
+def : SchedAlias<WriteDPPSLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteDPPSYLd, ZnWriteMicrocoded>;
// DPPD.
// x,x,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrri")>;
+def : SchedAlias<WriteDPPD, ZnWriteMicrocoded>;
// x,m,i.
-def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrmi")>;
-
-// VSQRTPS.
-// y,y.
-def ZnWriteVSQRTPSYr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 28;
- let ResourceCycles = [28];
-}
-def : InstRW<[ZnWriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
-
-// y,m256.
-def ZnWriteVSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 35;
- let ResourceCycles = [1,35];
- let NumMicroOps = 2;
-}
-def : InstRW<[ZnWriteVSQRTPSYLd], (instregex "VSQRTPSYm")>;
-
-// VSQRTPD.
-// y,y.
-def ZnWriteVSQRTPDYr : SchedWriteRes<[ZnFPU3]> {
- let Latency = 40;
- let ResourceCycles = [40];
-}
-def : InstRW<[ZnWriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
-
-// y,m256.
-def ZnWriteVSQRTPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
- let Latency = 47;
- let NumMicroOps = 2;
- let ResourceCycles = [1,47];
-}
-def : InstRW<[ZnWriteVSQRTPDYLd], (instregex "VSQRTPDYm")>;
+def : SchedAlias<WriteDPPDLd, ZnWriteMicrocoded>;
// RSQRTSS
+// TODO - convert to ZnWriteResFpuPair
// x,x.
def ZnWriteRSQRTSSr : SchedWriteRes<[ZnFPU02]> {
let Latency = 5;
}
-def : InstRW<[ZnWriteRSQRTSSr], (instregex "(V?)RSQRTSS(Y?)r(_Int)?")>;
+def : SchedAlias<WriteFRsqrt, ZnWriteRSQRTSSr>;
-// RSQRTPS
-// x,x.
-def ZnWriteRSQRTPSr : SchedWriteRes<[ZnFPU01]> {
- let Latency = 5;
-}
-def : InstRW<[ZnWriteRSQRTPSr], (instregex "(V?)RSQRTPS(Y?)r(_Int)?")>;
-
-// RSQRTSSm
// x,m128.
def ZnWriteRSQRTSSLd: SchedWriteRes<[ZnAGU, ZnFPU02]> {
let Latency = 12;
let NumMicroOps = 2;
- let ResourceCycles = [1,2];
-}
-def : InstRW<[ZnWriteRSQRTSSLd], (instregex "(V?)RSQRTSSm(_Int)?")>;
-
-// RSQRTPSm
-def ZnWriteRSQRTPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
- let Latency = 12;
- let NumMicroOps = 2;
+ let ResourceCycles = [1,2]; // FIXME: Is this right?
}
-def : InstRW<[ZnWriteRSQRTPSLd], (instregex "(V?)RSQRTPSm(_Int)?")>;
+def : SchedAlias<WriteFRsqrtLd, ZnWriteRSQRTSSLd>;
-// RSQRTPS 256.
+// RSQRTPS
+// TODO - convert to ZnWriteResFpuPair
// y,y.
def ZnWriteRSQRTPSYr : SchedWriteRes<[ZnFPU01]> {
let Latency = 5;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : InstRW<[ZnWriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+def : SchedAlias<WriteFRsqrtY, ZnWriteRSQRTPSYr>;
// y,m256.
def ZnWriteRSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
let Latency = 12;
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteRSQRTPSYLd], (instregex "VRSQRTPSYm(_Int)?")>;
-
-//-- Logic instructions --//
-
-// AND, ANDN, OR, XOR PS/PD.
-// x,x / v,v,v.
-def : InstRW<[WriteVecLogic], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
-// x,m / v,v,m.
-def : InstRW<[WriteVecLogicLd],
- (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+def : SchedAlias<WriteFRsqrtYLd, ZnWriteRSQRTPSYLd>;
//-- Other instructions --//
// VZEROUPPER.
-def : InstRW<[WriteMicrocoded], (instregex "VZEROUPPER")>;
+def : InstRW<[WriteMicrocoded], (instrs VZEROUPPER)>;
// VZEROALL.
-def : InstRW<[WriteMicrocoded], (instregex "VZEROALL")>;
-
-// LDMXCSR.
-def : InstRW<[WriteMicrocoded], (instregex "(V)?LDMXCSR")>;
-
-// STMXCSR.
-def : InstRW<[WriteMicrocoded], (instregex "(V)?STMXCSR")>;
+def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>;
} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
new file mode 100644
index 000000000000..078fe1598f13
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -0,0 +1,2247 @@
+//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Provide a pass which mitigates speculative execution attacks which operate
+/// by speculating incorrectly past some predicate (a type check, bounds check,
+/// or other condition) to reach a load with invalid inputs and leak the data
+/// accessed by that load using a side channel out of the speculative domain.
+///
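+/// A sketch of the kind of code pattern this targets (purely illustrative;
+/// the names below are made up):
+///
+/// \code
+///   if (idx < len)          // branch that may be speculated past
+///     leak(array[idx]);     // load reached speculatively with a bad idx
+/// \endcode
+///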
+/// For details on the attacks, see the first variant in both the Project Zero
+/// writeup and the Spectre paper:
+/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
+/// https://spectreattack.com/spectre.pdf
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-speculative-load-hardening"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
+STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
+STATISTIC(NumAddrRegsHardened,
+ "Number of address mode used registers hardaned");
+STATISTIC(NumPostLoadRegsHardened,
+ "Number of post-load register values hardened");
+STATISTIC(NumCallsOrJumpsHardened,
+ "Number of calls or jumps requiring extra hardening");
+STATISTIC(NumInstsInserted, "Number of instructions inserted");
+STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
+
+static cl::opt<bool> HardenEdgesWithLFENCE(
+ PASS_KEY "-lfence",
+ cl::desc(
+ "Use LFENCE along each conditional edge to harden against speculative "
+ "loads rather than conditional movs and poisoned pointers."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EnablePostLoadHardening(
+ PASS_KEY "-post-load",
+ cl::desc("Harden the value loaded *after* it is loaded by "
+ "flushing the loaded bits to 1. This is hard to do "
+ "in general but can be done easily for GPRs."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> FenceCallAndRet(
+ PASS_KEY "-fence-call-and-ret",
+ cl::desc("Use a full speculation fence to harden both call and ret edges "
+ "rather than a lighter weight mitigation."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> HardenInterprocedurally(
+ PASS_KEY "-ip",
+ cl::desc("Harden interprocedurally by passing our state in and out of "
+ "functions in the high bits of the stack pointer."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+ HardenLoads(PASS_KEY "-loads",
+ cl::desc("Sanitize loads from memory. When disable, no "
+ "significant security is provided."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> HardenIndirectCallsAndJumps(
+ PASS_KEY "-indirect",
+ cl::desc("Harden indirect calls and jumps against using speculatively "
+ "stored attacker controlled addresses. This is designed to "
+ "mitigate Spectre v1.2 style attacks."),
+ cl::init(true), cl::Hidden);
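+
+// Note that all of the above flags are hidden, developer-facing knobs; for
+// example, passing -x86-speculative-load-hardening-lfence to llc selects the
+// LFENCE-based fallback implemented in hardenEdgesWithLFENCE() below.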
+
+namespace llvm {
+
+void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+
+} // end namespace llvm
+
+namespace {
+
+class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
+public:
+ X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
+ initializeX86SpeculativeLoadHardeningPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "X86 speculative load hardening";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Pass identification, replacement for typeid.
+ static char ID;
+
+private:
+ /// The information about a block's conditional terminators needed to trace
+ /// our predicate state through the exiting edges.
+ struct BlockCondInfo {
+ MachineBasicBlock *MBB;
+
+ // We mostly have one conditional branch, and in extremely rare cases have
+ // two. Three and more are so rare as to be unimportant for compile time.
+ SmallVector<MachineInstr *, 2> CondBrs;
+
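+    // The unconditional branch at the end of the block, if any; this may also
+    // be an unanalyzable terminator such as an indirect branch. Null when the
+    // block simply falls through to its layout successor.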
+ MachineInstr *UncondBr;
+ };
+
+ /// Manages the predicate state traced through the program.
+ struct PredState {
+ unsigned InitialReg;
+ unsigned PoisonReg;
+
+ const TargetRegisterClass *RC;
+ MachineSSAUpdater SSA;
+
+ PredState(MachineFunction &MF, const TargetRegisterClass *RC)
+ : RC(RC), SSA(MF) {}
+ };
+
+ const X86Subtarget *Subtarget;
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ Optional<PredState> PS;
+
+ void hardenEdgesWithLFENCE(MachineFunction &MF);
+
+ SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
+
+ SmallVector<MachineInstr *, 16>
+ tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
+
+ void unfoldCallAndJumpLoads(MachineFunction &MF);
+
+ void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
+
+ unsigned saveEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
+ void restoreEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned OFReg);
+
+ void mergePredStateIntoSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned PredStateReg);
+ unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc);
+
+ void
+ hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
+ MachineOperand &IndexMO,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
+ MachineInstr *
+ sinkPostLoadHardenedInst(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
+ bool canHardenRegister(unsigned Reg);
+ unsigned hardenValueInRegister(unsigned Reg, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc);
+ unsigned hardenPostLoad(MachineInstr &MI);
+ void hardenReturnInstr(MachineInstr &MI);
+ void tracePredStateThroughCall(MachineInstr &MI);
+ void hardenIndirectCallOrJumpInstr(
+ MachineInstr &MI,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
+};
+
+} // end anonymous namespace
+
+char X86SpeculativeLoadHardeningPass::ID = 0;
+
+void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
+ MachineBasicBlock &Succ, int SuccCount,
+ MachineInstr *Br, MachineInstr *&UncondBr,
+ const X86InstrInfo &TII) {
+ assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
+
+ MachineFunction &MF = *MBB.getParent();
+
+ MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
+
+ // We have to insert the new block immediately after the current one as we
+ // don't know what layout-successor relationships the successor has and we
+ // may not be able to (and generally don't want to) try to fix those up.
+ MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
+
+ // Update the branch instruction if necessary.
+ if (Br) {
+ assert(Br->getOperand(0).getMBB() == &Succ &&
+ "Didn't start with the right target!");
+ Br->getOperand(0).setMBB(&NewMBB);
+
+ // If this successor was reached through a branch rather than fallthrough,
+ // we might have *broken* fallthrough and so need to inject a new
+ // unconditional branch.
+ if (!UncondBr) {
+ MachineBasicBlock &OldLayoutSucc =
+ *std::next(MachineFunction::iterator(&NewMBB));
+ assert(MBB.isSuccessor(&OldLayoutSucc) &&
+ "Without an unconditional branch, the old layout successor should "
+ "be an actual successor!");
+ auto BrBuilder =
+ BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
+ // Update the unconditional branch now that we've added one.
+ UncondBr = &*BrBuilder;
+ }
+
+ // Insert unconditional "jump Succ" instruction in the new block if
+ // necessary.
+ if (!NewMBB.isLayoutSuccessor(&Succ)) {
+ SmallVector<MachineOperand, 4> Cond;
+ TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
+ }
+ } else {
+ assert(!UncondBr &&
+ "Cannot have a branchless successor and an unconditional branch!");
+ assert(NewMBB.isLayoutSuccessor(&Succ) &&
+ "A non-branch successor must have been a layout successor before "
+ "and now is a layout successor of the new block.");
+ }
+
+ // If this is the only edge to the successor, we can just replace it in the
+ // CFG. Otherwise we need to add a new entry in the CFG for the new
+ // successor.
+ if (SuccCount == 1) {
+ MBB.replaceSuccessor(&Succ, &NewMBB);
+ } else {
+ MBB.splitSuccessor(&Succ, &NewMBB);
+ }
+
+ // Hook up the edge from the new basic block to the old successor in the CFG.
+ NewMBB.addSuccessor(&Succ);
+
+ // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
+ for (MachineInstr &MI : Succ) {
+ if (!MI.isPHI())
+ break;
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2) {
+ MachineOperand &OpV = MI.getOperand(OpIdx);
+ MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
+ assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
+ if (OpMBB.getMBB() != &MBB)
+ continue;
+
+      // If this is the last edge to the successor, just replace MBB in the PHI.
+ if (SuccCount == 1) {
+ OpMBB.setMBB(&NewMBB);
+ break;
+ }
+
+ // Otherwise, append a new pair of operands for the new incoming edge.
+ MI.addOperand(MF, OpV);
+ MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
+ break;
+ }
+ }
+
+ // Inherit live-ins from the successor
+ for (auto &LI : Succ.liveins())
+ NewMBB.addLiveIn(LI);
+
+ LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
+ << Succ.getName() << "'.\n");
+ return NewMBB;
+}
+
+/// Remove duplicate PHI operands to leave the PHI in a canonical and
+/// predictable form.
+///
+/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
+/// isn't what you might expect. We may have multiple entries in PHI nodes for
+/// a single predecessor. This makes CFG-updating extremely complex, so here we
+/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
+/// one entry per predecessor, regardless of how many edges there are.
+static void canonicalizePHIOperands(MachineFunction &MF) {
+ SmallPtrSet<MachineBasicBlock *, 4> Preds;
+ SmallVector<int, 4> DupIndices;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ if (!MI.isPHI())
+ break;
+
+      // First we scan the operands of the PHI looking for duplicate entries
+      // for a particular predecessor. We retain the operand index of each
+      // duplicate entry found.
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2)
+ if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
+ DupIndices.push_back(OpIdx);
+
+      // Now walk the duplicate indices, removing both the block and value.
+      // Note that these are stored as a vector, making this element-wise
+      // removal potentially quadratic.
+ //
+ // FIXME: It is really frustrating that we have to use a quadratic
+ // removal algorithm here. There should be a better way, but the use-def
+ // updates required make that impossible using the public API.
+ //
+ // Note that we have to process these backwards so that we don't
+ // invalidate other indices with each removal.
+ while (!DupIndices.empty()) {
+ int OpIdx = DupIndices.pop_back_val();
+ // Remove both the block and value operand, again in reverse order to
+ // preserve indices.
+ MI.RemoveOperand(OpIdx + 1);
+ MI.RemoveOperand(OpIdx);
+ }
+
+ Preds.clear();
+ }
+}
+
+/// Helper to scan a function for loads vulnerable to misspeculation that we
+/// want to harden.
+///
+/// We use this to avoid making changes to functions where there is nothing we
+/// need to do to harden against misspeculation.
+static bool hasVulnerableLoad(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // Loads within this basic block after an LFENCE are not at risk of
+ // speculatively executing with invalid predicates from prior control
+ // flow. So break out of this block but continue scanning the function.
+ if (MI.getOpcode() == X86::LFENCE)
+ break;
+
+ // Looking for loads only.
+ if (!MI.mayLoad())
+ continue;
+
+ // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
+ if (MI.getOpcode() == X86::MFENCE)
+ continue;
+
+ // We found a load.
+ return true;
+ }
+ }
+
+ // No loads found.
+ return false;
+}
+
+bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+ MRI = &MF.getRegInfo();
+ TII = Subtarget->getInstrInfo();
+ TRI = Subtarget->getRegisterInfo();
+
+ // FIXME: Support for 32-bit.
+ PS.emplace(MF, &X86::GR64_NOSPRegClass);
+
+ if (MF.begin() == MF.end())
+ // Nothing to do for a degenerate empty function...
+ return false;
+
+ // We support an alternative hardening technique based on a debug flag.
+ if (HardenEdgesWithLFENCE) {
+ hardenEdgesWithLFENCE(MF);
+ return true;
+ }
+
+ // Create a dummy debug loc to use for all the generated code here.
+ DebugLoc Loc;
+
+ MachineBasicBlock &Entry = *MF.begin();
+ auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
+
+ // Do a quick scan to see if we have any checkable loads.
+ bool HasVulnerableLoad = hasVulnerableLoad(MF);
+
+ // See if we have any conditional branching blocks that we will need to trace
+ // predicate state through.
+ SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
+
+ // If we have no interesting conditions or loads, nothing to do here.
+ if (!HasVulnerableLoad && Infos.empty())
+ return true;
+
+ // The poison value is required to be an all-ones value for many aspects of
+ // this mitigation.
+ const int PoisonVal = -1;
+ PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
+ .addImm(PoisonVal);
+ ++NumInstsInserted;
+
+ // If we have loads being hardened and we've asked for call and ret edges to
+ // get a full fence-based mitigation, inject that fence.
+ if (HasVulnerableLoad && FenceCallAndRet) {
+ // We need to insert an LFENCE at the start of the function to suspend any
+ // incoming misspeculation from the caller. This helps two-fold: the caller
+ // may not have been protected as this code has been, and this code gets to
+ // not take any specific action to protect across calls.
+ // FIXME: We could skip this for functions which unconditionally return
+ // a constant.
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ }
+
+ // If we guarded the entry with an LFENCE and have no conditionals to protect
+ // in blocks, then we're done.
+ if (FenceCallAndRet && Infos.empty())
+ // We may have changed the function's code at this point to insert fences.
+ return true;
+
+  // Set up the initial predicate state that we will trace through every basic
+  // block in the function.
+ if (HardenInterprocedurally && !FenceCallAndRet) {
+ // Set up the predicate state by extracting it from the incoming stack
+ // pointer so we pick up any misspeculation in our caller.
+ PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
+ } else {
+ // Otherwise, just build the predicate state itself by zeroing a register
+ // as we don't need any initial state.
+ PS->InitialReg = MRI->createVirtualRegister(PS->RC);
+ unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
+ PredStateSubReg);
+ ++NumInstsInserted;
+ MachineOperand *ZeroEFLAGSDefOp =
+ ZeroI->findRegisterDefOperand(X86::EFLAGS);
+ assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
+ "Must have an implicit def of EFLAGS!");
+ ZeroEFLAGSDefOp->setIsDead(true);
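+    // Widen the zeroed 32-bit value to the full 64-bit predicate state
+    // register; SUBREG_TO_REG records that the upper bits are already zero.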
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
+ PS->InitialReg)
+ .addImm(0)
+ .addReg(PredStateSubReg)
+ .addImm(X86::sub_32bit);
+ }
+
+ // We're going to need to trace predicate state throughout the function's
+ // CFG. Prepare for this by setting up our initial state of PHIs with unique
+ // predecessor entries and all the initial predicate state.
+ canonicalizePHIOperands(MF);
+
+ // Track the updated values in an SSA updater to rewrite into SSA form at the
+ // end.
+ PS->SSA.Initialize(PS->InitialReg);
+ PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
+
+ // Trace through the CFG.
+ auto CMovs = tracePredStateThroughCFG(MF, Infos);
+
+ // We may also enter basic blocks in this function via exception handling
+ // control flow. Here, if we are hardening interprocedurally, we need to
+ // re-capture the predicate state from the throwing code. In the Itanium ABI,
+ // the throw will always look like a call to __cxa_throw and will have the
+ // predicate state in the stack pointer, so extract fresh predicate state from
+ // the stack pointer and make it available in SSA.
+ // FIXME: Handle non-itanium ABI EH models.
+ if (HardenInterprocedurally) {
+ for (MachineBasicBlock &MBB : MF) {
+ assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
+ assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
+ assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
+ if (!MBB.isEHPad())
+ continue;
+ PS->SSA.AddAvailableValue(
+ &MBB,
+ extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
+ }
+ }
+
+ // If we are going to harden calls and jumps we need to unfold their memory
+ // operands.
+ if (HardenIndirectCallsAndJumps)
+ unfoldCallAndJumpLoads(MF);
+
+ // Now that we have the predicate state available at the start of each block
+ // in the CFG, trace it through each block, hardening vulnerable instructions
+ // as we go.
+ tracePredStateThroughBlocksAndHarden(MF);
+
+ // Now rewrite all the uses of the pred state using the SSA updater to insert
+ // PHIs connecting the state between blocks along the CFG edges.
+ for (MachineInstr *CMovI : CMovs)
+ for (MachineOperand &Op : CMovI->operands()) {
+ if (!Op.isReg() || Op.getReg() != PS->InitialReg)
+ continue;
+
+ PS->SSA.RewriteUse(Op);
+ }
+
+ LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
+ dbgs() << "\n"; MF.verify(this));
+ return true;
+}
+
+/// Implements the naive hardening approach of putting an LFENCE after every
+/// potentially mis-predicted control flow construct.
+///
+/// We include this as an alternative mostly for the purpose of comparison. The
+/// performance impact of this is expected to be extremely severe and not
+/// practical for any real-world users.
+void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
+ MachineFunction &MF) {
+ // First, we scan the function looking for blocks that are reached along edges
+ // that we might want to harden.
+ SmallSetVector<MachineBasicBlock *, 8> Blocks;
+ for (MachineBasicBlock &MBB : MF) {
+ // If there are no or only one successor, nothing to do here.
+ if (MBB.succ_size() <= 1)
+ continue;
+
+ // Skip blocks unless their terminators start with a branch. Other
+ // terminators don't seem interesting for guarding against misspeculation.
+ auto TermIt = MBB.getFirstTerminator();
+ if (TermIt == MBB.end() || !TermIt->isBranch())
+ continue;
+
+    // Add all the non-EH-pad successors to the blocks we want to harden. We
+ // skip EH pads because there isn't really a condition of interest on
+ // entering.
+ for (MachineBasicBlock *SuccMBB : MBB.successors())
+ if (!SuccMBB->isEHPad())
+ Blocks.insert(SuccMBB);
+ }
+
+ for (MachineBasicBlock *MBB : Blocks) {
+ auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
+ BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ }
+}
+
+SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
+X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
+ SmallVector<BlockCondInfo, 16> Infos;
+
+ // Walk the function and build up a summary for each block's conditions that
+ // we need to trace through.
+ for (MachineBasicBlock &MBB : MF) {
+ // If there are no or only one successor, nothing to do here.
+ if (MBB.succ_size() <= 1)
+ continue;
+
+ // We want to reliably handle any conditional branch terminators in the
+ // MBB, so we manually analyze the branch. We can handle all of the
+    // permutations here, including ones that analyzeBranch cannot.
+ //
+ // The approach is to walk backwards across the terminators, resetting at
+ // any unconditional non-indirect branch, and track all conditional edges
+ // to basic blocks as well as the fallthrough or unconditional successor
+ // edge. For each conditional edge, we track the target and the opposite
+ // condition code in order to inject a "no-op" cmov into that successor
+ // that will harden the predicate. For the fallthrough/unconditional
+ // edge, we inject a separate cmov for each conditional branch with
+ // matching condition codes. This effectively implements an "and" of the
+ // condition flags, even if there isn't a single condition flag that would
+ // directly implement that. We don't bother trying to optimize either of
+ // these cases because if such an optimization is possible, LLVM should
+ // have optimized the conditional *branches* in that way already to reduce
+ // instruction count. This late, we simply assume the minimal number of
+ // branch instructions is being emitted and use that to guide our cmov
+ // insertion.
+
+ BlockCondInfo Info = {&MBB, {}, nullptr};
+
+ // Now walk backwards through the terminators and build up successors they
+ // reach and the conditions.
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ // Once we've handled all the terminators, we're done.
+ if (!MI.isTerminator())
+ break;
+
+ // If we see a non-branch terminator, we can't handle anything so bail.
+ if (!MI.isBranch()) {
+ Info.CondBrs.clear();
+ break;
+ }
+
+ // If we see an unconditional branch, reset our state, clear any
+ // fallthrough, and set this is the "else" successor.
+ if (MI.getOpcode() == X86::JMP_1) {
+ Info.CondBrs.clear();
+ Info.UncondBr = &MI;
+ continue;
+ }
+
+ // If we get an invalid condition, we have an indirect branch or some
+ // other unanalyzable "fallthrough" case. We model this as a nullptr for
+ // the destination so we can still guard any conditional successors.
+ // Consider code sequences like:
+ // ```
+ // jCC L1
+ // jmpq *%rax
+ // ```
+ // We still want to harden the edge to `L1`.
+ if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
+ Info.CondBrs.clear();
+ Info.UncondBr = &MI;
+ continue;
+ }
+
+ // We have a vanilla conditional branch, add it to our list.
+ Info.CondBrs.push_back(&MI);
+ }
+ if (Info.CondBrs.empty()) {
+ ++NumBranchesUntraced;
+ LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
+ MBB.dump());
+ continue;
+ }
+
+ Infos.push_back(Info);
+ }
+
+ return Infos;
+}
+
+/// Trace the predicate state through the CFG, instrumenting each conditional
+/// branch such that misspeculation through an edge will poison the predicate
+/// state.
+///
+/// Returns the list of inserted CMov instructions so that they can have their
+/// uses of the predicate state rewritten into proper SSA form once it is
+/// complete.
+SmallVector<MachineInstr *, 16>
+X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
+ MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
+ // Collect the inserted cmov instructions so we can rewrite their uses of the
+ // predicate state into SSA form.
+ SmallVector<MachineInstr *, 16> CMovs;
+
+ // Now walk all of the basic blocks looking for ones that end in conditional
+ // jumps where we need to update this register along each edge.
+ for (const BlockCondInfo &Info : Infos) {
+ MachineBasicBlock &MBB = *Info.MBB;
+ const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
+ MachineInstr *UncondBr = Info.UncondBr;
+
+ LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
+ << "\n");
+ ++NumCondBranchesTraced;
+
+ // Compute the non-conditional successor as either the target of any
+ // unconditional branch or the layout successor.
+ MachineBasicBlock *UncondSucc =
+ UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
+ ? UncondBr->getOperand(0).getMBB()
+ : nullptr)
+ : &*std::next(MachineFunction::iterator(&MBB));
+
+ // Count how many edges there are to any given successor.
+ SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
+ if (UncondSucc)
+ ++SuccCounts[UncondSucc];
+ for (auto *CondBr : CondBrs)
+ ++SuccCounts[CondBr->getOperand(0).getMBB()];
+
+ // A lambda to insert cmov instructions into a block checking all of the
+ // condition codes in a sequence.
+ auto BuildCheckingBlockForSuccAndConds =
+ [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
+ MachineInstr *Br, MachineInstr *&UncondBr,
+ ArrayRef<X86::CondCode> Conds) {
+ // First, we split the edge to insert the checking block into a safe
+ // location.
+ auto &CheckingMBB =
+ (SuccCount == 1 && Succ.pred_size() == 1)
+ ? Succ
+ : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
+
+ bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
+ if (!LiveEFLAGS)
+ CheckingMBB.addLiveIn(X86::EFLAGS);
+
+ // Now insert the cmovs to implement the checks.
+ auto InsertPt = CheckingMBB.begin();
+ assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
+ "Should never have a PHI in the initial checking block as it "
+ "always has a single predecessor!");
+
+ // We will wire each cmov to each other, but need to start with the
+ // incoming pred state.
+ unsigned CurStateReg = PS->InitialReg;
+
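+          // Each cmov below replaces the state with the poison value when its
+          // condition holds, and otherwise forwards the incoming state.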
+ for (X86::CondCode Cond : Conds) {
+ int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+ auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
+
+ unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ // Note that we intentionally use an empty debug location so that
+ // this picks up the preceding location.
+ auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
+ TII->get(CMovOp), UpdatedStateReg)
+ .addReg(CurStateReg)
+ .addReg(PS->PoisonReg);
+ // If this is the last cmov and the EFLAGS weren't originally
+ // live-in, mark them as killed.
+ if (!LiveEFLAGS && Cond == Conds.back())
+ CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
+ dbgs() << "\n");
+
+ // The first one of the cmovs will be using the top level
+ // `PredStateReg` and need to get rewritten into SSA form.
+ if (CurStateReg == PS->InitialReg)
+ CMovs.push_back(&*CMovI);
+
+ // The next cmov should start from this one's def.
+ CurStateReg = UpdatedStateReg;
+ }
+
+ // And put the last one into the available values for SSA form of our
+ // predicate state.
+ PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
+ };
+
+ std::vector<X86::CondCode> UncondCodeSeq;
+ for (auto *CondBr : CondBrs) {
+ MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
+ int &SuccCount = SuccCounts[&Succ];
+
+ X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
+ X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
+ UncondCodeSeq.push_back(Cond);
+
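+      // Reaching Succ while InvCond holds means the conditional branch was
+      // mispredicted, so the checking block poisons the state under InvCond.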
+ BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
+ {InvCond});
+
+ // Decrement the successor count now that we've split one of the edges.
+ // We need to keep the count of edges to the successor accurate in order
+ // to know above when to *replace* the successor in the CFG vs. just
+ // adding the new successor.
+ --SuccCount;
+ }
+
+ // Since we may have split edges and changed the number of successors,
+ // normalize the probabilities. This avoids doing it each time we split an
+ // edge.
+ MBB.normalizeSuccProbs();
+
+ // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
+ // need to intersect the other condition codes. We can do this by just
+ // doing a cmov for each one.
+ if (!UncondSucc)
+ // If we have no fallthrough to protect (perhaps it is an indirect jump?)
+ // just skip this and continue.
+ continue;
+
+ assert(SuccCounts[UncondSucc] == 1 &&
+ "We should never have more than one edge to the unconditional "
+ "successor at this point because every other edge must have been "
+ "split above!");
+
+ // Sort and unique the codes to minimize them.
+ llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
+ UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
+ UncondCodeSeq.end());
+
+ // Build a checking version of the successor.
+ BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
+ UncondBr, UncondBr, UncondCodeSeq);
+ }
+
+ return CMovs;
+}
+
+/// Compute the register class for the unfolded load.
+///
+/// FIXME: This should probably live in X86InstrInfo, potentially by adding
+/// a way to unfold into a newly created vreg rather than requiring a register
+/// input.
+static const TargetRegisterClass *
+getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
+ unsigned Opcode) {
+ unsigned Index;
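+  // Look up the register form of this memory-operand opcode; Index is set to
+  // the operand that will define the newly loaded register.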
+ unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
+ Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
+ const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
+ return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
+}
+
+void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
+ MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF)
+ for (auto MII = MBB.instr_begin(), MIE = MBB.instr_end(); MII != MIE;) {
+ // Grab a reference and increment the iterator so we can remove this
+ // instruction if needed without disturbing the iteration.
+ MachineInstr &MI = *MII++;
+
+ // Must either be a call or a branch.
+ if (!MI.isCall() && !MI.isBranch())
+ continue;
+ // We only care about loading variants of these instructions.
+ if (!MI.mayLoad())
+ continue;
+
+ switch (MI.getOpcode()) {
+ default: {
+ LLVM_DEBUG(
+ dbgs() << "ERROR: Found an unexpected loading branch or call "
+ "instruction:\n";
+ MI.dump(); dbgs() << "\n");
+ report_fatal_error("Unexpected loading branch or call!");
+ }
+
+ case X86::FARCALL16m:
+ case X86::FARCALL32m:
+ case X86::FARCALL64:
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64:
+ // We cannot mitigate far jumps or calls, but we also don't expect them
+ // to be vulnerable to Spectre v1.2 style attacks.
+ continue;
+
+ case X86::CALL16m:
+ case X86::CALL16m_NT:
+ case X86::CALL32m:
+ case X86::CALL32m_NT:
+ case X86::CALL64m:
+ case X86::CALL64m_NT:
+ case X86::JMP16m:
+ case X86::JMP16m_NT:
+ case X86::JMP32m:
+ case X86::JMP32m_NT:
+ case X86::JMP64m:
+ case X86::JMP64m_NT:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPm64_REX:
+ case X86::TAILJMPm:
+ case X86::TCRETURNmi64:
+ case X86::TCRETURNmi: {
+ // Use the generic unfold logic now that we know we're dealing with
+ // expected instructions.
+ // FIXME: We don't have test coverage for all of these!
+ auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
+ if (!UnfoldedRC) {
+ LLVM_DEBUG(dbgs()
+ << "ERROR: Unable to unfold load from instruction:\n";
+ MI.dump(); dbgs() << "\n");
+ report_fatal_error("Unable to unfold load!");
+ }
+ unsigned Reg = MRI->createVirtualRegister(UnfoldedRC);
+ SmallVector<MachineInstr *, 2> NewMIs;
+ // If we were able to compute an unfolded reg class, any failure here
+ // is just a programming error so just assert.
+ bool Unfolded =
+ TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
+ /*UnfoldStore*/ false, NewMIs);
+ (void)Unfolded;
+ assert(Unfolded &&
+ "Computed unfolded register class but failed to unfold");
+ // Now stitch the new instructions into place and erase the old one.
+ for (auto *NewMI : NewMIs)
+ MBB.insert(MI.getIterator(), NewMI);
+ MI.eraseFromParent();
+ LLVM_DEBUG({
+ dbgs() << "Unfolded load successfully into:\n";
+ for (auto *NewMI : NewMIs) {
+ NewMI->dump();
+ dbgs() << "\n";
+ }
+ });
+ continue;
+ }
+ }
+ llvm_unreachable("Escaped switch with default!");
+ }
+}
+
+/// Returns true if the instruction has no behavior (specified or otherwise)
+/// that is based on the value of any of its register operands.
+///
+/// A classical example of something that is inherently not data invariant is an
+/// indirect jump -- the destination is loaded into icache based on the bits set
+/// in the jump destination register.
+///
+/// FIXME: This should become part of our instruction tables.
+static bool isDataInvariant(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the instruction is not data invariant.
+ return false;
+
+ // Some target-independent operations that trivially lower to data-invariant
+ // instructions.
+ case TargetOpcode::COPY:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ return true;
+
+  // On x86 it is believed that the imul instructions are constant time w.r.t.
+  // the loaded data. However, they set flags and are perhaps the most
+  // surprisingly constant-time operations, so we call them out here separately.
+ case X86::IMUL16rr:
+ case X86::IMUL16rri8:
+ case X86::IMUL16rri:
+ case X86::IMUL32rr:
+ case X86::IMUL32rri8:
+ case X86::IMUL32rri:
+ case X86::IMUL64rr:
+ case X86::IMUL64rri32:
+ case X86::IMUL64rri8:
+
+  // Bit scanning and counting instructions scan across bits and do other
+  // fairly complex operations (like popcnt), but are believed to be constant
+  // time on x86. However, these set flags.
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rr:
+ case X86::BLCFILL64rr:
+ case X86::BLCI32rr:
+ case X86::BLCI64rr:
+ case X86::BLCIC32rr:
+ case X86::BLCIC64rr:
+ case X86::BLCMSK32rr:
+ case X86::BLCMSK64rr:
+ case X86::BLCS32rr:
+ case X86::BLCS64rr:
+ case X86::BLSFILL32rr:
+ case X86::BLSFILL64rr:
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ case X86::BLSIC32rr:
+ case X86::BLSIC64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::TZMSK32rr:
+ case X86::TZMSK64rr:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rr:
+ case X86::BEXTR64rr:
+ case X86::BEXTRI32ri:
+ case X86::BEXTRI64ri:
+ case X86::BZHI32rr:
+ case X86::BZHI64rr:
+
+ // Shift and rotate.
+ case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
+ case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
+ case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
+ case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
+ case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
+ case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
+ case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
+ case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
+ case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
+ case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
+ case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
+ case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
+ case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
+ case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
+ case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
+ case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rr: case X86::ADC8ri:
+ case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
+ case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
+ case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
+ case X86::ADD8rr: case X86::ADD8ri:
+ case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
+ case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
+ case X86::AND8rr: case X86::AND8ri:
+ case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
+ case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
+ case X86::OR8rr: case X86::OR8ri:
+ case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
+ case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
+ case X86::SBB8rr: case X86::SBB8ri:
+ case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
+ case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
+ case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
+ case X86::SUB8rr: case X86::SUB8ri:
+ case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
+ case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
+ case X86::XOR8rr: case X86::XOR8ri:
+ case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
+ case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
+ // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+ case X86::ADCX32rr: case X86::ADCX64rr:
+ case X86::ADOX32rr: case X86::ADOX64rr:
+ case X86::ANDN32rr: case X86::ANDN64rr:
+ // Unary arithmetic operations.
+ case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
+ case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
+ case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
+ // Check whether the EFLAGS implicit-def is dead. We assume that this will
+ // always find the implicit-def because this code should only be reached
+ // for instructions that do in fact implicitly def this.
+ if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
+ // If we would clobber EFLAGS that are used, just bail for now.
+ LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
+ MI.dump(); dbgs() << "\n");
+ return false;
+ }
+
+ // Otherwise, fallthrough to handle these the same as instructions that
+ // don't set EFLAGS.
+ LLVM_FALLTHROUGH;
+
+ // Unlike other arithmetic, NOT doesn't set EFLAGS.
+ case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
+
+ // Various move instructions used to zero or sign extend things. Note that we
+ // intentionally don't support the _NOREX variants as we can't handle that
+ // register constraint anyways.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr8: case X86::MOVSX32rr16:
+ case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
+ case X86::MOVZX16rr8:
+ case X86::MOVZX32rr8: case X86::MOVZX32rr16:
+ case X86::MOVZX64rr8: case X86::MOVZX64rr16:
+ case X86::MOV32rr:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32ri:
+ case X86::RORX64ri:
+ case X86::SARX32rr:
+ case X86::SARX64rr:
+ case X86::SHLX32rr:
+ case X86::SHLX64rr:
+ case X86::SHRX32rr:
+ case X86::SHRX64rr:
+
+ // LEA doesn't actually access memory, and its arithmetic is constant time.
+ case X86::LEA16r:
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return true;
+ }
+}
+
+/// Returns true if the instruction has no behavior (specified or otherwise)
+/// that is based on the value loaded from memory or the value of any
+/// non-address register operands.
+///
+/// For example, if the latency of the instruction is dependent on the
+/// particular bits set in any of the registers *or* any of the bits loaded from
+/// memory.
+///
+/// A classical example of something that is inherently not data invariant is an
+/// indirect jump -- the destination is loaded into icache based on the bits set
+/// in the jump destination register.
+///
+/// FIXME: This should become part of our instruction tables.
+static bool isDataInvariantLoad(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the load will immediately leak.
+ return false;
+
+ // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+ // However, these instructions set flags and are perhaps the most surprisingly
+ // constant-time operations, so we call them out here separately.
+ case X86::IMUL16rm:
+ case X86::IMUL16rmi8:
+ case X86::IMUL16rmi:
+ case X86::IMUL32rm:
+ case X86::IMUL32rmi8:
+ case X86::IMUL32rmi:
+ case X86::IMUL64rm:
+ case X86::IMUL64rmi32:
+ case X86::IMUL64rmi8:
+
+ // Bit scanning and counting instructions that are somewhat surprisingly
+ // constant time as they scan across bits and do other fairly complex
+ // operations like popcnt, but are believed to be constant time on x86.
+ // However, these set flags.
+ case X86::BSF16rm:
+ case X86::BSF32rm:
+ case X86::BSF64rm:
+ case X86::BSR16rm:
+ case X86::BSR32rm:
+ case X86::BSR64rm:
+ case X86::LZCNT16rm:
+ case X86::LZCNT32rm:
+ case X86::LZCNT64rm:
+ case X86::POPCNT16rm:
+ case X86::POPCNT32rm:
+ case X86::POPCNT64rm:
+ case X86::TZCNT16rm:
+ case X86::TZCNT32rm:
+ case X86::TZCNT64rm:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rm:
+ case X86::BLCFILL64rm:
+ case X86::BLCI32rm:
+ case X86::BLCI64rm:
+ case X86::BLCIC32rm:
+ case X86::BLCIC64rm:
+ case X86::BLCMSK32rm:
+ case X86::BLCMSK64rm:
+ case X86::BLCS32rm:
+ case X86::BLCS64rm:
+ case X86::BLSFILL32rm:
+ case X86::BLSFILL64rm:
+ case X86::BLSI32rm:
+ case X86::BLSI64rm:
+ case X86::BLSIC32rm:
+ case X86::BLSIC64rm:
+ case X86::BLSMSK32rm:
+ case X86::BLSMSK64rm:
+ case X86::BLSR32rm:
+ case X86::BLSR64rm:
+ case X86::TZMSK32rm:
+ case X86::TZMSK64rm:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rm:
+ case X86::BEXTR64rm:
+ case X86::BEXTRI32mi:
+ case X86::BEXTRI64mi:
+ case X86::BZHI32rm:
+ case X86::BZHI64rm:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rm:
+ case X86::ADC16rm:
+ case X86::ADC32rm:
+ case X86::ADC64rm:
+ case X86::ADCX32rm:
+ case X86::ADCX64rm:
+ case X86::ADD8rm:
+ case X86::ADD16rm:
+ case X86::ADD32rm:
+ case X86::ADD64rm:
+ case X86::ADOX32rm:
+ case X86::ADOX64rm:
+ case X86::AND8rm:
+ case X86::AND16rm:
+ case X86::AND32rm:
+ case X86::AND64rm:
+ case X86::ANDN32rm:
+ case X86::ANDN64rm:
+ case X86::OR8rm:
+ case X86::OR16rm:
+ case X86::OR32rm:
+ case X86::OR64rm:
+ case X86::SBB8rm:
+ case X86::SBB16rm:
+ case X86::SBB32rm:
+ case X86::SBB64rm:
+ case X86::SUB8rm:
+ case X86::SUB16rm:
+ case X86::SUB32rm:
+ case X86::SUB64rm:
+ case X86::XOR8rm:
+ case X86::XOR16rm:
+ case X86::XOR32rm:
+ case X86::XOR64rm:
+ // Check whether the EFLAGS implicit-def is dead. We assume that this will
+ // always find the implicit-def because this code should only be reached
+ // for instructions that do in fact implicitly def this.
+ if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
+ // If we would clobber EFLAGS that are used, just bail for now.
+ LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
+ MI.dump(); dbgs() << "\n");
+ return false;
+ }
+
+ // Otherwise, fallthrough to handle these the same as instructions that
+ // don't set EFLAGS.
+ LLVM_FALLTHROUGH;
+
+ // Integer multiply w/o affecting flags is still believed to be constant
+ // time on x86. Called out separately as this is among the most surprising
+ // instructions to exhibit that behavior.
+ case X86::MULX32rm:
+ case X86::MULX64rm:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32mi:
+ case X86::RORX64mi:
+ case X86::SARX32rm:
+ case X86::SARX64rm:
+ case X86::SHLX32rm:
+ case X86::SHLX64rm:
+ case X86::SHRX32rm:
+ case X86::SHRX64rm:
+
+ // Conversions are believed to be constant time and don't set flags.
+ case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
+ case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
+ case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
+ case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
+ case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
+ case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
+ case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
+ case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
+ case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
+ case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
+ // AVX512 added unsigned integer conversions.
+ case X86::VCVTTSD2USI64Zrm:
+ case X86::VCVTTSD2USIZrm:
+ case X86::VCVTTSS2USI64Zrm:
+ case X86::VCVTTSS2USIZrm:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI642SSZrm:
+
+ // Loads to register don't set flags.
+ case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::MOVSX16rm8:
+ case X86::MOVSX32rm16:
+ case X86::MOVSX32rm8:
+ case X86::MOVSX32rm8_NOREX:
+ case X86::MOVSX64rm16:
+ case X86::MOVSX64rm32:
+ case X86::MOVSX64rm8:
+ case X86::MOVZX16rm8:
+ case X86::MOVZX32rm16:
+ case X86::MOVZX32rm8:
+ case X86::MOVZX32rm8_NOREX:
+ case X86::MOVZX64rm16:
+ case X86::MOVZX64rm8:
+ return true;
+ }
+}
+
+static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const TargetRegisterInfo &TRI) {
+ // Check if EFLAGS are alive by seeing if there is a def of them or they are
+ // live-in, and then seeing if that def is in turn used.
+ for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
+ if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
+ // If the def is dead, then EFLAGS is not live.
+ if (DefOp->isDead())
+ return false;
+
+ // Otherwise we've def'ed it, and it is live.
+ return true;
+ }
+ // While at this instruction, also check if we use and kill EFLAGS
+ // which means it isn't live.
+ if (MI.killsRegister(X86::EFLAGS, &TRI))
+ return false;
+ }
+
+ // If we didn't find anything conclusive (neither definitely alive nor
+ // definitely dead), return whether EFLAGS are live into the block.
+ return MBB.isLiveIn(X86::EFLAGS);
+}
+
+/// Trace the predicate state through each of the blocks in the function,
+/// hardening everything necessary along the way.
+///
+/// We call this routine once the initial predicate state has been established
+/// for each basic block in the function in the SSA updater. This routine traces
+/// it through the instructions within each basic block, and for non-returning
+/// blocks informs the SSA updater about the final state that lives out of the
+/// block. Along the way, it hardens any vulnerable instruction using the
+/// currently valid predicate state. We have to do these two things together
+/// because the SSA updater only works across blocks. Within a block, we track
+/// the current predicate state directly and update it as it changes.
+///
+/// This operates in two passes over each block. First, we analyze the loads in
+/// the block to determine which strategy will be used to harden them: hardening
+/// the address or hardening the loaded value when loaded into a register
+/// amenable to hardening. We have to process these first because the two
+/// strategies may interact -- later hardening may change what strategy we wish
+/// to use. We also will analyze data dependencies between loads and avoid
+/// hardening those loads that are data dependent on a load with a hardened
+/// address. We also skip hardening loads already behind an LFENCE as that is
+/// sufficient to harden them against misspeculation.
+///
+/// Second, we actively trace the predicate state through the block, applying
+/// the hardening steps we determined necessary in the first pass as we go.
+///
+/// These two passes are applied to each basic block. We operate one block at a
+/// time to simplify reasoning about reachability and sequencing.
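+///
+/// A rough sketch of the per-block shape this produces (illustrative only):
+///
+///   for (MachineBasicBlock &MBB : MF) {
+///     // Pass 1: classify each load into HardenPostLoad or HardenLoadAddr.
+///     // Pass 2: rewrite instructions, applying the chosen strategy with the
+///     //         current predicate state.
+///   }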
+void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
+ MachineFunction &MF) {
+ SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
+ SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
+
+ SmallSet<unsigned, 16> HardenedAddrRegs;
+
+ SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
+
+ // Track the set of load-dependent registers through the basic block. Because
+ // the values of these registers have an existing data dependency on a loaded
+ // value which we would have checked, we can omit any checks on them.
+ SparseBitVector<> LoadDepRegs;
+
+ for (MachineBasicBlock &MBB : MF) {
+ // The first pass over the block: collect all the loads which can have their
+ // loaded value hardened and all the loads that instead need their address
+ // hardened. During this walk we propagate load dependence for address
+ // hardened loads and also look for LFENCE to stop hardening wherever
+ // possible. When deciding whether or not to harden the loaded value,
+ // we check to see if any registers used in the address will have been
+ // hardened at this point and if so, harden any remaining address registers
+ // as that often successfully re-uses hardened addresses and minimizes
+ // instructions.
+ //
+ // FIXME: We should consider an aggressive mode where we continue to keep as
+ // many loads value-hardened as possible even when some address register
+ // hardening would be free (due to reuse).
+ //
+ // Note that we only need this pass if we are actually hardening loads.
+ if (HardenLoads)
+ for (MachineInstr &MI : MBB) {
+ // We naively assume that all def'ed registers of an instruction have
+ // a data dependency on all of their operands.
+ // FIXME: Do a more careful analysis of x86 to build a conservative
+ // model here.
+ if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
+ return Op.isReg() && LoadDepRegs.test(Op.getReg());
+ }))
+ for (MachineOperand &Def : MI.defs())
+ if (Def.isReg())
+ LoadDepRegs.set(Def.getReg());
+
+ // Both Intel and AMD are guiding that they will change the semantics of
+ // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
+ // no more need to guard things in this block.
+ if (MI.getOpcode() == X86::LFENCE)
+ break;
+
+ // If this instruction cannot load, nothing to do.
+ if (!MI.mayLoad())
+ continue;
+
+ // Some instructions which "load" are trivially safe or unimportant.
+ if (MI.getOpcode() == X86::MFENCE)
+ continue;
+
+ // Extract the memory operand information about this instruction.
+ // FIXME: This doesn't handle loading pseudo instructions which we often
+ // could handle with similarly generic logic. We probably need to add an
+ // MI-layer routine similar to the MC-layer one we use here which maps
+ // pseudos much like this maps real instructions.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBeginIdx < 0) {
+ LLVM_DEBUG(dbgs()
+ << "WARNING: unable to harden loading instruction: ";
+ MI.dump());
+ continue;
+ }
+
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+
+ // If we have at least one (non-frame-index, non-RIP) register operand,
+ // and neither operand is load-dependent, we need to check the load.
+ unsigned BaseReg = 0, IndexReg = 0;
+ if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
+ BaseMO.getReg() != X86::NoRegister)
+ BaseReg = BaseMO.getReg();
+ if (IndexMO.getReg() != X86::NoRegister)
+ IndexReg = IndexMO.getReg();
+
+ if (!BaseReg && !IndexReg)
+ // No register operands!
+ continue;
+
+ // If any register operand is dependent, this load is dependent and we
+ // needn't check it.
+ // FIXME: Is this true in the case where we are hardening loads after
+ // they complete? Unclear, need to investigate.
+ if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
+ (IndexReg && LoadDepRegs.test(IndexReg)))
+ continue;
+
+ // If post-load hardening is enabled, this load is compatible with
+ // post-load hardening, and we aren't already going to harden one of the
+ // address registers, queue it up to be hardened post-load. Notably,
+ // even once hardened this won't introduce a useful dependency that
+ // could prune out subsequent loads.
+ if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
+ MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
+ canHardenRegister(MI.getOperand(0).getReg()) &&
+ !HardenedAddrRegs.count(BaseReg) &&
+ !HardenedAddrRegs.count(IndexReg)) {
+ HardenPostLoad.insert(&MI);
+ HardenedAddrRegs.insert(MI.getOperand(0).getReg());
+ continue;
+ }
+
+ // Record this instruction for address hardening and record its register
+ // operands as being address-hardened.
+ HardenLoadAddr.insert(&MI);
+ if (BaseReg)
+ HardenedAddrRegs.insert(BaseReg);
+ if (IndexReg)
+ HardenedAddrRegs.insert(IndexReg);
+
+ for (MachineOperand &Def : MI.defs())
+ if (Def.isReg())
+ LoadDepRegs.set(Def.getReg());
+ }
+
+ // Now re-walk the instructions in the basic block, and apply whichever
+ // hardening strategy we have elected. Note that we do this in a second
+ // pass specifically so that we have the complete set of instructions for
+ // which we will do post-load hardening and can defer it in certain
+ // circumstances.
+ //
+ // FIXME: This could probably be made even more effective by doing it
+ // across the entire function. Rather than just walking the flat list
+ // backwards here, we could walk the function in PO and each block bottom
+ // up, allowing us in some cases to sink hardening across blocks. As
+ // long as the in-block predicate state is used at the eventual hardening
+ // site, this remains safe.
+ for (MachineInstr &MI : MBB) {
+ if (HardenLoads) {
+ // We cannot both require hardening the def of a load and its address.
+ assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
+ "Requested to harden both the address and def of a load!");
+
+ // Check if this is a load whose address needs to be hardened.
+ if (HardenLoadAddr.erase(&MI)) {
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
+
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
+ continue;
+ }
+
+ // Test if this instruction is one of our post load instructions (and
+ // remove it from the set if so).
+ if (HardenPostLoad.erase(&MI)) {
+ assert(!MI.isCall() && "Must not try to post-load harden a call!");
+
+ // If this is a data-invariant load, we want to try and sink any
+ // hardening as far as possible.
+ if (isDataInvariantLoad(MI)) {
+ // Sink the instruction we'll need to harden as far as we can down
+ // the graph.
+ MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
+
+ // If we managed to sink this instruction, update everything so we
+ // harden that instruction when we reach it in the instruction
+ // sequence.
+ if (SunkMI != &MI) {
+ // If in sinking there was no instruction needing to be hardened,
+ // we're done.
+ if (!SunkMI)
+ continue;
+
+ // Otherwise, add this to the set of defs we harden.
+ HardenPostLoad.insert(SunkMI);
+ continue;
+ }
+ }
+
+ unsigned HardenedReg = hardenPostLoad(MI);
+
+ // Mark the resulting hardened register as such so we don't re-harden.
+ AddrRegToHardenedReg[HardenedReg] = HardenedReg;
+
+ continue;
+ }
+
+ // Check for an indirect call or branch that may need its input hardened
+ // even if we couldn't find the specific load used, or were able to
+ // avoid hardening it for some reason. Note that here we cannot break
+ // out afterward as we may still need to handle any call aspect of this
+ // instruction.
+ if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
+ hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
+ }
+
+ // After we finish hardening loads we handle interprocedural hardening if
+ // enabled and relevant for this instruction.
+ if (!HardenInterprocedurally)
+ continue;
+ if (!MI.isCall() && !MI.isReturn())
+ continue;
+
+ // If this is a direct return (i.e., not a tail call), just directly harden
+ // it.
+ if (MI.isReturn() && !MI.isCall()) {
+ hardenReturnInstr(MI);
+ continue;
+ }
+
+ // Otherwise we have a call. We need to handle transferring the predicate
+ // state into a call and recovering it after the call returns unless this
+ // is a tail call.
+ assert(MI.isCall() && "Should only reach here for calls!");
+ tracePredStateThroughCall(MI);
+ }
+
+ HardenPostLoad.clear();
+ HardenLoadAddr.clear();
+ HardenedAddrRegs.clear();
+ AddrRegToHardenedReg.clear();
+
+ // Currently, we only track data-dependent loads within a basic block.
+ // FIXME: We should see if this is necessary or if we could be more
+ // aggressive here without opening up attack avenues.
+ LoadDepRegs.clear();
+ }
+}
+
+/// Save EFLAGS into the returned GPR. This can in turn be restored with
+/// `restoreEFLAGS`.
+///
+/// Note that LLVM can only lower very simple patterns of saved and restored
+/// EFLAGS registers. The restore should always be within the same basic block
+/// as the save so that no PHI nodes are inserted.
+unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
+ // what instruction selection does.
+ unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ // We directly copy the FLAGS register and rely on later lowering to clean
+ // this up into the appropriate setCC instructions.
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
+ ++NumInstsInserted;
+ return Reg;
+}
+
+/// Restore EFLAGS from the provided GPR. This should be produced by
+/// `saveEFLAGS`.
+///
+/// This must be done within the same basic block as the save in order to
+/// reliably lower.
+void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned Reg) {
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
+ ++NumInstsInserted;
+}
+
+/// Takes the current predicate state (in a register) and merges it into the
+/// stack pointer. The state is essentially a single bit, but we merge this in
+/// a way that won't form non-canonical pointers and also will be preserved
+/// across normal stack adjustments.
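+///
+/// A rough sketch of the sequence this emits (64-bit only for now):
+///
+///   shlq $47, %state   # move the all-zeros/all-ones state into the high bits
+///   orq  %state, %rsp  # fold it into RSP without forming non-canonical pointers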
+void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned PredStateReg) {
+ unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+ // FIXME: This hard codes a shift distance based on the number of bits needed
+ // to stay canonical on 64-bit. We should compute this somehow and support
+ // 32-bit as part of that.
+ auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
+ .addReg(PredStateReg, RegState::Kill)
+ .addImm(47);
+ ShiftI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(TmpReg, RegState::Kill);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+}
+
+/// Extracts the predicate state stored in the high bits of the stack pointer.
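+///
+/// Roughly, this copies RSP into a temporary and arithmetic-shifts it right by
+/// 63 bits (e.g. `sarq $63, %tmp`), smearing the top bit so the recovered state
+/// is all-zeros or all-ones again.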
+unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
+ unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+
+ // We know that the stack pointer will have any preserved predicate state in
+ // its high bit. We just want to smear this across the other bits. Turns out,
+ // this is exactly what an arithmetic right shift does.
+ BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
+ .addReg(X86::RSP);
+ auto ShiftI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
+ ShiftI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+
+ return PredStateReg;
+}
+
+void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
+ MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ // Check if EFLAGS are alive by seeing if there is a def of them or they are
+ // live-in, and then seeing if that def is in turn used.
+ bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
+
+ SmallVector<MachineOperand *, 2> HardenOpRegs;
+
+ if (BaseMO.isFI()) {
+ // A frame index is never a dynamically controllable load, so only
+ // harden it if we're covering fixed address loads as well.
+ LLVM_DEBUG(
+ dbgs() << " Skipping hardening base of explicit stack frame load: ";
+ MI.dump(); dbgs() << "\n");
+ } else if (BaseMO.getReg() == X86::RIP ||
+ BaseMO.getReg() == X86::NoRegister) {
+ // For both RIP-relative addressed loads or absolute loads, we cannot
+ // meaningfully harden them because the address being loaded has no
+ // dynamic component.
+ //
+ // FIXME: When using a segment base (like TLS does) we end up with the
+ // dynamic address being the base plus -1 because we can't mutate the
+ // segment register here. This allows the signed 32-bit offset to point at
+ // valid segment-relative addresses and load them successfully.
+ LLVM_DEBUG(
+ dbgs() << " Cannot harden base of "
+ << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
+ << " address in a load!");
+ } else {
+ assert(BaseMO.isReg() &&
+ "Only allowed to have a frame index or register base.");
+ HardenOpRegs.push_back(&BaseMO);
+ }
+
+ if (IndexMO.getReg() != X86::NoRegister &&
+ (HardenOpRegs.empty() ||
+ HardenOpRegs.front()->getReg() != IndexMO.getReg()))
+ HardenOpRegs.push_back(&IndexMO);
+
+ assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
+ "Should have exactly one or two registers to harden!");
+ assert((HardenOpRegs.size() == 1 ||
+ HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
+ "Should not have two of the same registers!");
+
+ // Remove any registers that have already been checked.
+ llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
+ // See if this operand's register has already been checked.
+ auto It = AddrRegToHardenedReg.find(Op->getReg());
+ if (It == AddrRegToHardenedReg.end())
+ // Not checked, so retain this one.
+ return false;
+
+ // Otherwise, we can directly update this operand and remove it.
+ Op->setReg(It->second);
+ return true;
+ });
+ // If there are none left, we're done.
+ if (HardenOpRegs.empty())
+ return;
+
+ // Compute the current predicate state.
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+
+ auto InsertPt = MI.getIterator();
+
+ // If EFLAGS are live and we don't have access to instructions that avoid
+ // clobbering EFLAGS we need to save and restore them. This in turn makes
+ // the EFLAGS no longer live.
+ unsigned FlagsReg = 0;
+ if (EFLAGSLive && !Subtarget->hasBMI2()) {
+ EFLAGSLive = false;
+ FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+ }
+
+ for (MachineOperand *Op : HardenOpRegs) {
+ unsigned OpReg = Op->getReg();
+ auto *OpRC = MRI->getRegClass(OpReg);
+ unsigned TmpReg = MRI->createVirtualRegister(OpRC);
+
+ // If this is a vector register, we'll need somewhat custom logic to handle
+ // hardening it.
+ if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
+ assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
+ bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
+
+ // Move our state into a vector register.
+ // FIXME: We could skip this at the cost of longer encodings with AVX-512
+ // but that doesn't seem likely to be worth it.
+ unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
+ auto MovI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
+ .addReg(StateReg);
+ (void)MovI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
+
+ // Broadcast it across the vector register.
+ unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
+ auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
+ TII->get(Is128Bit ? X86::VPBROADCASTQrr
+ : X86::VPBROADCASTQYrr),
+ VBStateReg)
+ .addReg(VStateReg);
+ (void)BroadcastI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
+ dbgs() << "\n");
+
+ // Merge our potential poison state into the value with a vector or.
+ auto OrI =
+ BuildMI(MBB, InsertPt, Loc,
+ TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
+ .addReg(VBStateReg)
+ .addReg(OpReg);
+ (void)OrI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
+ assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
+ bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
+ bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
+ if (Is128Bit || Is256Bit)
+ assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
+
+ // Broadcast our state into a vector register.
+ unsigned VStateReg = MRI->createVirtualRegister(OpRC);
+ unsigned BroadcastOp =
+ Is128Bit ? X86::VPBROADCASTQrZ128r
+ : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
+ auto BroadcastI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
+ .addReg(StateReg);
+ (void)BroadcastI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
+ dbgs() << "\n");
+
+ // Merge our potential poison state into the value with a vector or.
+ unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
+ : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
+ .addReg(VStateReg)
+ .addReg(OpReg);
+ (void)OrI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else {
+ // FIXME: Need to support GR32 here for 32-bit code.
+ assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
+ "Not a supported register class for address hardening!");
+
+ if (!EFLAGSLive) {
+ // Merge our potential poison state into the value with an or.
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
+ .addReg(StateReg)
+ .addReg(OpReg);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else {
+ // We need to avoid touching EFLAGS so shift out all but the least
+ // significant bit using the instruction that doesn't update flags.
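+ // (For example, `shrxq %state, %addr, %tmp`: a zero state leaves the address
+ // unchanged, while an all-ones state yields a shift count of 63 and collapses
+ // the address to 0 or 1, which is no longer attacker controlled.)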
+ auto ShiftI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
+ .addReg(OpReg)
+ .addReg(StateReg);
+ (void)ShiftI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
+ dbgs() << "\n");
+ }
+ }
+
+ // Record this register as checked and update the operand.
+ assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
+ "Should not have checked this register yet!");
+ AddrRegToHardenedReg[Op->getReg()] = TmpReg;
+ Op->setReg(TmpReg);
+ ++NumAddrRegsHardened;
+ }
+
+ // And restore the flags if needed.
+ if (FlagsReg)
+ restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
+}
+
+MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
+ MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
+ assert(isDataInvariantLoad(InitialMI) &&
+ "Cannot get here with a non-invariant load!");
+
+ // See if we can sink hardening the loaded value.
+ auto SinkCheckToSingleUse =
+ [&](MachineInstr &MI) -> Optional<MachineInstr *> {
+ unsigned DefReg = MI.getOperand(0).getReg();
+
+ // We need to find a single use to which we can sink the check. We can
+ // primarily do this because many uses may already end up checked on their
+ // own.
+ MachineInstr *SingleUseMI = nullptr;
+ for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
+ // If we're already going to harden this use, it is data invariant and
+ // within our block.
+ if (HardenedInstrs.count(&UseMI)) {
+ if (!isDataInvariantLoad(UseMI)) {
+ // If we've already decided to harden a non-load, we must have sunk
+ // some other post-load hardened instruction to it and it must itself
+ // be data-invariant.
+ assert(isDataInvariant(UseMI) &&
+ "Data variant instruction being hardened!");
+ continue;
+ }
+
+ // Otherwise, this is a load and the load component can't be data
+ // invariant so check how this register is being used.
+ const MCInstrDesc &Desc = UseMI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ assert(MemRefBeginIdx >= 0 &&
+ "Should always have mem references here!");
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
+ (IndexMO.isReg() && IndexMO.getReg() == DefReg))
+ // The load uses the register as part of its address making it not
+ // invariant.
+ return {};
+
+ continue;
+ }
+
+ if (SingleUseMI)
+ // We already have a single use, this would make two. Bail.
+ return {};
+
+ // If this single use isn't data invariant, isn't in this block, or has
+ // interfering EFLAGS, we can't sink the hardening to it.
+ if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
+ return {};
+
+ // If this instruction defines multiple registers bail as we won't harden
+ // all of them.
+ if (UseMI.getDesc().getNumDefs() > 1)
+ return {};
+
+ // If this register isn't a virtual register we can't sanely walk its uses,
+ // so just bail. Also check that its register class is one of the ones we
+ // can harden.
+ unsigned UseDefReg = UseMI.getOperand(0).getReg();
+ if (!TRI->isVirtualRegister(UseDefReg) ||
+ !canHardenRegister(UseDefReg))
+ return {};
+
+ SingleUseMI = &UseMI;
+ }
+
+ // If SingleUseMI is still null, there is no use that needs its own
+ // checking. Otherwise, it is the single use that needs checking.
+ return {SingleUseMI};
+ };
+
+ MachineInstr *MI = &InitialMI;
+ while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
+ // Update which MI we're checking now.
+ MI = *SingleUse;
+ if (!MI)
+ break;
+ }
+
+ return MI;
+}
+
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
+ auto *RC = MRI->getRegClass(Reg);
+ int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
+ if (RegBytes > 8)
+ // We don't support post-load hardening of vectors.
+ return false;
+
+ // If this register class is explicitly constrained to a class that doesn't
+ // require REX prefix, we may not be able to satisfy that constraint when
+ // emitting the hardening instructions, so bail out here.
+ // FIXME: This seems like a pretty lame hack. The way this comes up is when we
+ // end up with both a NOREX and a REX-only register as operands to the hardening
+ // instructions. It would be better to fix that code to handle this situation
+ // rather than hack around it in this way.
+ const TargetRegisterClass *NOREXRegClasses[] = {
+ &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
+ &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
+ if (RC == NOREXRegClasses[Log2_32(RegBytes)])
+ return false;
+
+ const TargetRegisterClass *GPRRegClasses[] = {
+ &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
+ &X86::GR64RegClass};
+ return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
+}
+
+/// Harden a value in a register.
+///
+/// This is the low-level logic to fully harden a value sitting in a register
+/// against leaking during speculative execution.
+///
+/// Unlike hardening an address that is used by a load, this routine is required
+/// to hide *all* incoming bits in the register.
+///
+/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
+/// larger than the predicate state register. FIXME: We should support vector
+/// registers here by broadcasting the predicate state.
+///
+/// The new, hardened virtual register is returned. It will have the same
+/// register class as `Reg`.
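+///
+/// In the common 64-bit case this boils down to a single `orq %state, %reg`
+/// into a fresh virtual register, plus an EFLAGS save/restore if the flags are
+/// live; narrower GPRs first take a subregister copy of the predicate state.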
+unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
+ unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ assert(canHardenRegister(Reg) && "Cannot harden this register!");
+ assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
+
+ auto *RC = MRI->getRegClass(Reg);
+ int Bytes = TRI->getRegSizeInBits(*RC) / 8;
+
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+
+ // FIXME: Need to teach this about 32-bit mode.
+ if (Bytes != 8) {
+ unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
+ unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
+ unsigned NarrowStateReg = MRI->createVirtualRegister(RC);
+ BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
+ .addReg(StateReg, 0, SubRegImm);
+ StateReg = NarrowStateReg;
+ }
+
+ unsigned FlagsReg = 0;
+ if (isEFLAGSLive(MBB, InsertPt, *TRI))
+ FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
+ unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
+ .addReg(StateReg)
+ .addReg(Reg);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+
+ if (FlagsReg)
+ restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
+
+ return NewReg;
+}
+
+/// Harden a load by hardening the loaded value in the defined register.
+///
+/// We can harden a non-leaking load into a register without touching the
+/// address by just hiding all of the loaded bits during misspeculation. We use
+/// an `or` instruction to do this because we set up our poison value as all
+/// ones. The goal is just for the loaded bits not to be exposed to speculative
+/// execution, and coercing them all to one is sufficient for that.
+///
+/// Returns the newly hardened register.
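+///
+/// Conceptually (pseudo-MIR, names illustrative): `%v = some_load ...` becomes
+/// `%v = some_load ...; %v.hardened = OR %v, %state`, and all pre-existing uses
+/// of the loaded value are rewritten to use `%v.hardened`.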
+unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ auto &DefOp = MI.getOperand(0);
+ unsigned OldDefReg = DefOp.getReg();
+ auto *DefRC = MRI->getRegClass(OldDefReg);
+
+ // Because we want to completely replace the uses of this def'ed value with
+ // the hardened value, create a dedicated new register that will only be used
+ // to communicate the unhardened value to the hardening.
+ unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC);
+ DefOp.setReg(UnhardenedReg);
+
+ // Now harden this register's value, getting a hardened reg that is safe to
+ // use. Note that we insert the instructions to compute this *after* the
+ // defining instruction, not before it.
+ unsigned HardenedReg = hardenValueInRegister(
+ UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
+
+ // Finally, replace the old register (which now only has the uses of the
+ // original def) with the hardened register.
+ MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);
+
+ ++NumPostLoadRegsHardened;
+ return HardenedReg;
+}
+
+/// Harden a return instruction.
+///
+/// Returns implicitly perform a load which we need to harden. Without hardening
+/// this load, an attacker may speculatively write over the return address to
+/// steer speculation of the return to an attacker-controlled address. This is
+/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
+/// this paper:
+/// https://people.csail.mit.edu/vlk/spectre11.pdf
+///
+/// We can harden this by introducing an LFENCE that will delay any load of the
+/// return address until prior instructions have retired (and thus are not being
+/// speculated), or we can harden the address used by the implicit load: the
+/// stack pointer.
+///
+/// If we are not using an LFENCE, hardening the stack pointer has an additional
+/// benefit: it allows us to pass the predicate state accumulated in this
+/// function back to the caller. In the absence of a BCBS attack on the return,
+/// the caller will typically be resumed and speculatively executed due to the
+/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
+/// priority. It is possible that some code from the caller will be executed
+/// speculatively even during a BCBS-attacked return until the steering takes
+/// effect. Whenever this happens, the caller can recover the (poisoned)
+/// predicate state from the stack pointer and continue to harden loads.
+void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+ auto InsertPt = MI.getIterator();
+
+ if (FenceCallAndRet) {
+ // Simply forcibly block speculation of loads out of the function by using
+ // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
+ // should be secure, is simple from an ABI perspective, and the cost can be
+ // minimized through inlining.
+ //
+ // FIXME: We should investigate ways to establish a strong data-dependency
+ // on the return. However, poisoning the stack pointer is unlikely to work
+ // because the return is *predicted* rather than relying on the load of the
+ // return address to actually resolve.
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ return;
+ }
+
+ // Take our predicate state, shift it to the high 17 bits (so that we keep
+ // pointers canonical) and merge it into RSP. This will allow the caller to
+ // extract it when we return (speculatively).
+ mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
+}
+
+/// Trace the predicate state through a call.
+///
+/// There are several layers of this needed to handle the full complexity of
+/// calls.
+///
+/// First, we need to send the predicate state into the called function. We do
+/// this by merging it into the high bits of the stack pointer.
+///
+/// For tail calls, this is all we need to do.
+///
+/// For calls where we might return to control flow, we further need to extract
+/// the predicate state built up within that function from the high bits of the
+/// stack pointer, and make that the newly available predicate state.
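+///
+/// A sketch of the non-tail-call shape (pseudo-operands, illustration only):
+///
+///   mergePredStateIntoSP(state)         ; fold the state into RSP
+///   callq callee
+///   newstate = extractPredStateFromSP() ; recover it after the return
+///   (newstate becomes the available predicate state for the SSA updater)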
+void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
+ MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ auto InsertPt = MI.getIterator();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ // First, we transfer the predicate state into the called function by merging
+ // it into the stack pointer. This will kill the current def of the state.
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+ mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
+
+ // If this call is also a return, it is a tail call and we don't need anything
+ // else to handle it, so just return.
+ // FIXME: We should also handle noreturn calls.
+ if (MI.isReturn())
+ return;
+
+ // We need to step past the call and recover the predicate state from SP after
+ // the return, and make this new state available.
+ ++InsertPt;
+ unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
+ PS->SSA.AddAvailableValue(&MBB, NewStateReg);
+}
+
+/// An attacker may speculatively store over a value that is then speculatively
+/// loaded and used as the target of an indirect call or jump instruction. This
+/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
+/// in this paper:
+/// https://people.csail.mit.edu/vlk/spectre11.pdf
+///
+/// When this happens, the speculative execution of the call or jump will end up
+/// being steered to this attacker controlled address. While most such loads
+/// will be adequately hardened already, we want to ensure that they are
+/// definitively treated as needing post-load hardening. While address hardening
+/// is sufficient to prevent secret data from leaking to the attacker, it may
+/// not be sufficient to prevent an attacker from steering speculative
+/// execution. We forcibly unfolded all relevant loads above and so will always
+/// have an opportunity to post-load harden here; we just need to scan for cases
+/// not already flagged and add them.
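+///
+/// In effect, an indirect `callq *%r` or `jmpq *%r` becomes
+/// `orq %state, %r ; callq *%r`, re-using an already-hardened copy of the
+/// register from `AddrRegToHardenedReg` when one exists.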
+void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
+ MachineInstr &MI,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
+ switch (MI.getOpcode()) {
+ case X86::FARCALL16m:
+ case X86::FARCALL32m:
+ case X86::FARCALL64:
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64:
+ // We don't need to harden either far calls or far jumps as they are
+ // safe from Spectre.
+ return;
+
+ default:
+ break;
+ }
+
+ // We should never see a loading instruction at this point, as those should
+ // have been unfolded.
+ assert(!MI.mayLoad() && "Found a lingering loading instruction!");
+
+ // If the first operand isn't a register, this is a branch or call
+ // instruction with an immediate operand which doesn't need to be hardened.
+ if (!MI.getOperand(0).isReg())
+ return;
+
+ // For all of these, the target register is the first operand of the
+ // instruction.
+ auto &TargetOp = MI.getOperand(0);
+ unsigned OldTargetReg = TargetOp.getReg();
+
+ // Try to lookup a hardened version of this register. We retain a reference
+ // here as we want to update the map to track any newly computed hardened
+ // register.
+ unsigned &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg];
+
+ // If we don't have a hardened register yet, compute one. Otherwise, just use
+ // the already hardened register.
+ //
+ // FIXME: It is a little suspect that we use partially hardened registers that
+ // only feed addresses. The complexity of partial hardening with SHRX
+ // continues to pile up. Should definitively measure its value and consider
+ // eliminating it.
+ if (!HardenedTargetReg)
+ HardenedTargetReg = hardenValueInRegister(
+ OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());
+
+ // Set the target operand to the hardened register.
+ TargetOp.setReg(HardenedTargetReg);
+
+ ++NumCallsOrJumpsHardened;
+}
+
+INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
+ "X86 speculative load hardener", false, false)
+INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
+ "X86 speculative load hardener", false, false)
+
+FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
+ return new X86SpeculativeLoadHardeningPass();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
index dca98d999e58..7e84323dda4c 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -68,14 +68,36 @@ X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
unsigned char
X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
- // 64 bits can use %rip addressing for anything local.
- if (is64Bit())
+ // If we're not PIC, it's not very interesting.
+ if (!isPositionIndependent())
return X86II::MO_NO_FLAG;
- // If this is for a position dependent executable, the static linker can
- // figure it out.
- if (!isPositionIndependent())
+ if (is64Bit()) {
+ // 64-bit ELF PIC local references may use GOTOFF relocations.
+ if (isTargetELF()) {
+ switch (TM.getCodeModel()) {
+ // 64-bit small code model is simple: All rip-relative.
+ case CodeModel::Small:
+ case CodeModel::Kernel:
+ return X86II::MO_NO_FLAG;
+
+ // The large PIC code model uses GOTOFF.
+ case CodeModel::Large:
+ return X86II::MO_GOTOFF;
+
+ // Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data.
+ case CodeModel::Medium:
+ if (isa<Function>(GV))
+ return X86II::MO_NO_FLAG; // All code is RIP-relative
+ return X86II::MO_GOTOFF; // Local symbols use GOTOFF.
+ }
+ llvm_unreachable("invalid code model");
+ }
+
+ // Otherwise, this is either a RIP-relative reference or a 64-bit movabsq,
+ // both of which use MO_NO_FLAG.
return X86II::MO_NO_FLAG;
+ }
// The COFF dynamic linker just patches the executable sections.
if (isTargetCOFF())
@@ -97,8 +119,8 @@ X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
const Module &M) const {
- // Large model never uses stubs.
- if (TM.getCodeModel() == CodeModel::Large)
+ // The static large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large && !isPositionIndependent())
return X86II::MO_NO_FLAG;
// Absolute symbols can be referenced directly.
@@ -120,8 +142,14 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
if (isTargetCOFF())
return X86II::MO_DLLIMPORT;
- if (is64Bit())
+ if (is64Bit()) {
+ // ELF supports a large, truly PIC code model with non-PC relative GOT
+ // references. Other object file formats do not. Use the no-flag, 64-bit
+ // reference for them.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return isTargetELF() ? X86II::MO_GOT : X86II::MO_NO_FLAG;
return X86II::MO_GOTPCREL;
+ }
if (isTargetDarwin()) {
if (!isPositionIndependent())
@@ -157,8 +185,11 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
// In Regcall calling convention those registers are used for passing
// parameters. Thus we need to prevent lazy binding in Regcall.
return X86II::MO_GOTPCREL;
- if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit())
- return X86II::MO_GOTPCREL;
+ // If PLT must be avoided then the call should be via GOTPCREL.
+ if (((F && F->hasFnAttribute(Attribute::NonLazyBind)) ||
+ (!F && M.getRtLibUseGOT())) &&
+ is64Bit())
+ return X86II::MO_GOTPCREL;
return X86II::MO_PLT;
}
@@ -216,8 +247,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// micro-architectures respectively.
if (hasSSE42() || hasSSE4A())
IsUAMem16Slow = false;
-
- InstrItins = getInstrItineraryForCPU(CPUName);
// It's important to keep the MCSubtargetInfo feature bits in sync with
// target data structure which is shared with MC code emitter, etc.
@@ -230,9 +259,9 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
else
llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
- DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
- << ", 3DNowLevel " << X863DNowLevel
- << ", 64bit " << HasX86_64 << "\n");
+ LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel << ", 64bit "
+ << HasX86_64 << "\n");
assert((!In64BitMode || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
@@ -254,114 +283,30 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
GatherOverhead = 2;
if (hasAVX512())
ScatterOverhead = 2;
-}
-void X86Subtarget::initializeEnvironment() {
- X86SSELevel = NoSSE;
- X863DNowLevel = NoThreeDNow;
- HasX87 = false;
- HasCMov = false;
- HasX86_64 = false;
- HasPOPCNT = false;
- HasSSE4A = false;
- HasAES = false;
- HasVAES = false;
- HasFXSR = false;
- HasXSAVE = false;
- HasXSAVEOPT = false;
- HasXSAVEC = false;
- HasXSAVES = false;
- HasPCLMUL = false;
- HasVPCLMULQDQ = false;
- HasGFNI = false;
- HasFMA = false;
- HasFMA4 = false;
- HasXOP = false;
- HasTBM = false;
- HasLWP = false;
- HasMOVBE = false;
- HasRDRAND = false;
- HasF16C = false;
- HasFSGSBase = false;
- HasLZCNT = false;
- HasBMI = false;
- HasBMI2 = false;
- HasVBMI = false;
- HasVBMI2 = false;
- HasIFMA = false;
- HasRTM = false;
- HasERI = false;
- HasCDI = false;
- HasPFI = false;
- HasDQI = false;
- HasVPOPCNTDQ = false;
- HasBWI = false;
- HasVLX = false;
- HasADX = false;
- HasPKU = false;
- HasVNNI = false;
- HasBITALG = false;
- HasSHA = false;
- HasPREFETCHWT1 = false;
- HasPRFCHW = false;
- HasRDSEED = false;
- HasLAHFSAHF = false;
- HasMWAITX = false;
- HasCLZERO = false;
- HasMPX = false;
- HasSHSTK = false;
- HasIBT = false;
- HasSGX = false;
- HasCLFLUSHOPT = false;
- HasCLWB = false;
- UseRetpoline = false;
- UseRetpolineExternalThunk = false;
- IsPMULLDSlow = false;
- IsSHLDSlow = false;
- IsUAMem16Slow = false;
- IsUAMem32Slow = false;
- HasSSEUnalignedMem = false;
- HasCmpxchg16b = false;
- UseLeaForSP = false;
- HasFastVariableShuffle = false;
- HasFastPartialYMMorZMMWrite = false;
- HasFastGather = false;
- HasFastScalarFSQRT = false;
- HasFastVectorFSQRT = false;
- HasFastLZCNT = false;
- HasFastSHLDRotate = false;
- HasMacroFusion = false;
- HasERMSB = false;
- HasSlowDivide32 = false;
- HasSlowDivide64 = false;
- PadShortFunctions = false;
- SlowTwoMemOps = false;
- LEAUsesAG = false;
- SlowLEA = false;
- Slow3OpsLEA = false;
- SlowIncDec = false;
- stackAlignment = 4;
- // FIXME: this is a known good value for Yonah. How about others?
- MaxInlineSizeThreshold = 128;
- UseSoftFloat = false;
- X86ProcFamily = Others;
- GatherOverhead = 1024;
- ScatterOverhead = 1024;
+ // Consume the vector width attribute or apply any target specific limit.
+ if (PreferVectorWidthOverride)
+ PreferVectorWidth = PreferVectorWidthOverride;
+ else if (Prefer256Bit)
+ PreferVectorWidth = 256;
}
X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
- initializeEnvironment();
initSubtargetFeatures(CPU, FS);
return *this;
}
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
- unsigned StackAlignOverride)
- : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+ unsigned StackAlignOverride,
+ unsigned PreferVectorWidthOverride,
+ unsigned RequiredVectorWidth)
+ : X86GenSubtargetInfo(TT, CPU, FS),
PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
+ PreferVectorWidthOverride(PreferVectorWidthOverride),
+ RequiredVectorWidth(RequiredVectorWidth),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),
In32BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() != Triple::CODE16),
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
index 37ffac1faf68..fedb13f89e19 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -26,8 +26,8 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetMachine.h"
+#include <climits>
#include <memory>
#define GET_SUBTARGETINFO_HEADER
@@ -57,13 +57,16 @@ public:
IntelAtom,
IntelSLM,
IntelGLM,
+ IntelGLP,
+ IntelTRM,
IntelHaswell,
IntelBroadwell,
IntelSkylake,
IntelKNL,
IntelSKX,
IntelCannonlake,
- IntelIcelake,
+ IntelIcelakeClient,
+ IntelIcelakeServer,
};
protected:
@@ -76,7 +79,7 @@ protected:
};
/// X86 processor family: Intel Atom, and others
- X86ProcFamilyEnum X86ProcFamily;
+ X86ProcFamilyEnum X86ProcFamily = Others;
/// Which PIC style to use
PICStyles::Style PICStyle;
@@ -84,288 +87,330 @@ protected:
const TargetMachine &TM;
/// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
- X86SSEEnum X86SSELevel;
+ X86SSEEnum X86SSELevel = NoSSE;
/// MMX, 3DNow, 3DNow Athlon, or none supported.
- X863DNowEnum X863DNowLevel;
+ X863DNowEnum X863DNowLevel = NoThreeDNow;
/// True if the processor supports X87 instructions.
- bool HasX87;
+ bool HasX87 = false;
+
+ /// True if this processor has NOPL instruction
+ /// (generally pentium pro+).
+ bool HasNOPL = false;
/// True if this processor has conditional move instructions
/// (generally pentium pro+).
- bool HasCMov;
+ bool HasCMov = false;
/// True if the processor supports X86-64 instructions.
- bool HasX86_64;
+ bool HasX86_64 = false;
/// True if the processor supports POPCNT.
- bool HasPOPCNT;
+ bool HasPOPCNT = false;
/// True if the processor supports SSE4A instructions.
- bool HasSSE4A;
+ bool HasSSE4A = false;
/// Target has AES instructions
- bool HasAES;
- bool HasVAES;
+ bool HasAES = false;
+ bool HasVAES = false;
/// Target has FXSAVE/FXRESTOR instructions
- bool HasFXSR;
+ bool HasFXSR = false;
/// Target has XSAVE instructions
- bool HasXSAVE;
+ bool HasXSAVE = false;
/// Target has XSAVEOPT instructions
- bool HasXSAVEOPT;
+ bool HasXSAVEOPT = false;
/// Target has XSAVEC instructions
- bool HasXSAVEC;
+ bool HasXSAVEC = false;
/// Target has XSAVES instructions
- bool HasXSAVES;
+ bool HasXSAVES = false;
/// Target has carry-less multiplication
- bool HasPCLMUL;
- bool HasVPCLMULQDQ;
+ bool HasPCLMUL = false;
+ bool HasVPCLMULQDQ = false;
/// Target has Galois Field Arithmetic instructions
- bool HasGFNI;
+ bool HasGFNI = false;
/// Target has 3-operand fused multiply-add
- bool HasFMA;
+ bool HasFMA = false;
/// Target has 4-operand fused multiply-add
- bool HasFMA4;
+ bool HasFMA4 = false;
/// Target has XOP instructions
- bool HasXOP;
+ bool HasXOP = false;
/// Target has TBM instructions.
- bool HasTBM;
+ bool HasTBM = false;
/// Target has LWP instructions
- bool HasLWP;
+ bool HasLWP = false;
/// True if the processor has the MOVBE instruction.
- bool HasMOVBE;
+ bool HasMOVBE = false;
/// True if the processor has the RDRAND instruction.
- bool HasRDRAND;
+ bool HasRDRAND = false;
/// Processor has 16-bit floating point conversion instructions.
- bool HasF16C;
+ bool HasF16C = false;
/// Processor has FS/GS base instructions.
- bool HasFSGSBase;
+ bool HasFSGSBase = false;
/// Processor has LZCNT instruction.
- bool HasLZCNT;
+ bool HasLZCNT = false;
/// Processor has BMI1 instructions.
- bool HasBMI;
+ bool HasBMI = false;
/// Processor has BMI2 instructions.
- bool HasBMI2;
+ bool HasBMI2 = false;
/// Processor has VBMI instructions.
- bool HasVBMI;
+ bool HasVBMI = false;
/// Processor has VBMI2 instructions.
- bool HasVBMI2;
+ bool HasVBMI2 = false;
/// Processor has Integer Fused Multiply Add
- bool HasIFMA;
+ bool HasIFMA = false;
/// Processor has RTM instructions.
- bool HasRTM;
+ bool HasRTM = false;
/// Processor has ADX instructions.
- bool HasADX;
+ bool HasADX = false;
/// Processor has SHA instructions.
- bool HasSHA;
+ bool HasSHA = false;
/// Processor has PRFCHW instructions.
- bool HasPRFCHW;
+ bool HasPRFCHW = false;
/// Processor has RDSEED instructions.
- bool HasRDSEED;
+ bool HasRDSEED = false;
/// Processor has LAHF/SAHF instructions.
- bool HasLAHFSAHF;
+ bool HasLAHFSAHF = false;
/// Processor has MONITORX/MWAITX instructions.
- bool HasMWAITX;
+ bool HasMWAITX = false;
/// Processor has Cache Line Zero instruction
- bool HasCLZERO;
+ bool HasCLZERO = false;
+
+ /// Processor has Cache Line Demote instruction
+ bool HasCLDEMOTE = false;
+
+ /// Processor has MOVDIRI instruction (direct store integer).
+ bool HasMOVDIRI = false;
+
+ /// Processor has MOVDIR64B instruction (direct store 64 bytes).
+ bool HasMOVDIR64B = false;
+
+ /// Processor has ptwrite instruction.
+ bool HasPTWRITE = false;
/// Processor has Prefetch with intent to Write instruction
- bool HasPREFETCHWT1;
+ bool HasPREFETCHWT1 = false;
/// True if SHLD instructions are slow.
- bool IsSHLDSlow;
+ bool IsSHLDSlow = false;
/// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
// PMULUDQ.
- bool IsPMULLDSlow;
+ bool IsPMULLDSlow = false;
/// True if unaligned memory accesses of 16-bytes are slow.
- bool IsUAMem16Slow;
+ bool IsUAMem16Slow = false;
/// True if unaligned memory accesses of 32-bytes are slow.
- bool IsUAMem32Slow;
+ bool IsUAMem32Slow = false;
/// True if SSE operations can have unaligned memory operands.
/// This may require setting a configuration bit in the processor.
- bool HasSSEUnalignedMem;
+ bool HasSSEUnalignedMem = false;
/// True if this processor has the CMPXCHG16B instruction;
/// this is true for most x86-64 chips, but not the first AMD chips.
- bool HasCmpxchg16b;
+ bool HasCmpxchg16b = false;
/// True if the LEA instruction should be used for adjusting
/// the stack pointer. This is an optimization for Intel Atom processors.
- bool UseLeaForSP;
+ bool UseLeaForSP = false;
+
+ /// True if POPCNT instruction has a false dependency on the destination register.
+ bool HasPOPCNTFalseDeps = false;
+
+ /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
+ bool HasLZCNTFalseDeps = false;
/// True if its preferable to combine to a single shuffle using a variable
/// mask over multiple fixed shuffles.
- bool HasFastVariableShuffle;
+ bool HasFastVariableShuffle = false;
/// True if there is no performance penalty to writing only the lower parts
/// of a YMM or ZMM register without clearing the upper part.
- bool HasFastPartialYMMorZMMWrite;
+ bool HasFastPartialYMMorZMMWrite = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 11 bytes.
+ bool HasFast11ByteNOP = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 15 bytes.
+ bool HasFast15ByteNOP = false;
/// True if gather is reasonably fast. This is true for Skylake client and
/// all AVX-512 CPUs.
- bool HasFastGather;
+ bool HasFastGather = false;
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
- bool HasFastScalarFSQRT;
+ bool HasFastScalarFSQRT = false;
/// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
/// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
- bool HasFastVectorFSQRT;
+ bool HasFastVectorFSQRT = false;
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
- bool HasSlowDivide32;
+ bool HasSlowDivide32 = false;
/// True if 32-bit divides are significantly faster than
/// 64-bit divisions and should be used when possible.
- bool HasSlowDivide64;
+ bool HasSlowDivide64 = false;
/// True if LZCNT instruction is fast.
- bool HasFastLZCNT;
+ bool HasFastLZCNT = false;
/// True if SHLD based rotate is fast.
- bool HasFastSHLDRotate;
+ bool HasFastSHLDRotate = false;
/// True if the processor supports macrofusion.
- bool HasMacroFusion;
+ bool HasMacroFusion = false;
/// True if the processor has enhanced REP MOVSB/STOSB.
- bool HasERMSB;
+ bool HasERMSB = false;
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
- bool PadShortFunctions;
+ bool PadShortFunctions = false;
/// True if two memory operand instructions should use a temporary register
/// instead.
- bool SlowTwoMemOps;
+ bool SlowTwoMemOps = false;
/// True if the LEA instruction inputs have to be ready at address generation
/// (AG) time.
- bool LEAUsesAG;
+ bool LEAUsesAG = false;
/// True if the LEA instruction with certain arguments is slow
- bool SlowLEA;
+ bool SlowLEA = false;
/// True if the LEA instruction has all three source operands: base, index,
/// and offset or if the LEA instruction uses base and index registers where
/// the base is EBP, RBP, or R13
- bool Slow3OpsLEA;
+ bool Slow3OpsLEA = false;
/// True if INC and DEC instructions are slow when writing to flags
- bool SlowIncDec;
+ bool SlowIncDec = false;
/// Processor has AVX-512 PreFetch Instructions
- bool HasPFI;
+ bool HasPFI = false;
/// Processor has AVX-512 Exponential and Reciprocal Instructions
- bool HasERI;
+ bool HasERI = false;
/// Processor has AVX-512 Conflict Detection Instructions
- bool HasCDI;
+ bool HasCDI = false;
/// Processor has AVX-512 population count Instructions
- bool HasVPOPCNTDQ;
+ bool HasVPOPCNTDQ = false;
/// Processor has AVX-512 Doubleword and Quadword instructions
- bool HasDQI;
+ bool HasDQI = false;
/// Processor has AVX-512 Byte and Word instructions
- bool HasBWI;
+ bool HasBWI = false;
/// Processor has AVX-512 Vector Length eXtensions
- bool HasVLX;
+ bool HasVLX = false;
/// Processor has PKU extensions
- bool HasPKU;
+ bool HasPKU = false;
/// Processor has AVX-512 Vector Neural Network Instructions
- bool HasVNNI;
+ bool HasVNNI = false;
/// Processor has AVX-512 Bit Algorithms instructions
- bool HasBITALG;
+ bool HasBITALG = false;
/// Processor supports MPX - Memory Protection Extensions
- bool HasMPX;
+ bool HasMPX = false;
/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
/// using Shadow Stack
- bool HasSHSTK;
+ bool HasSHSTK = false;
- /// Processor supports CET IBT - Control-Flow Enforcement Technology
- /// using Indirect Branch Tracking
- bool HasIBT;
+ /// Processor supports Invalidate Process-Context Identifier
+ bool HasINVPCID = false;
/// Processor has Software Guard Extensions
- bool HasSGX;
+ bool HasSGX = false;
/// Processor supports Flush Cache Line instruction
- bool HasCLFLUSHOPT;
+ bool HasCLFLUSHOPT = false;
/// Processor supports Cache Line Write Back instruction
- bool HasCLWB;
+ bool HasCLWB = false;
+
+ /// Processor supports Write Back No Invalidate instruction
+ bool HasWBNOINVD = false;
+
+ /// Processor supports RDPID instruction
+ bool HasRDPID = false;
+
+ /// Processor supports WaitPKG instructions
+ bool HasWAITPKG = false;
+
+ /// Processor supports PCONFIG instruction
+ bool HasPCONFIG = false;
/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
- bool UseRetpoline;
+ bool UseRetpoline = false;
/// When using a retpoline thunk, call an externally provided thunk rather
/// than emitting one inside the compiler.
- bool UseRetpolineExternalThunk;
+ bool UseRetpolineExternalThunk = false;
/// Use software floating point for code generation.
- bool UseSoftFloat;
+ bool UseSoftFloat = false;
/// The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment;
+ unsigned stackAlignment = 4;
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
- unsigned MaxInlineSizeThreshold;
+ // FIXME: this is a known good value for Yonah. How about others?
+ unsigned MaxInlineSizeThreshold = 128;
+
+ /// Indicates target prefers 256 bit instructions.
+ bool Prefer256Bit = false;
/// What processor and OS we're targeting.
Triple TargetTriple;
- /// Instruction itineraries for scheduling
- InstrItineraryData InstrItins;
-
/// GlobalISel related APIs.
std::unique_ptr<CallLowering> CallLoweringInfo;
std::unique_ptr<LegalizerInfo> Legalizer;
@@ -376,6 +421,16 @@ private:
/// Override the stack alignment.
unsigned StackAlignOverride;
+ /// Preferred vector width from function attribute.
+ unsigned PreferVectorWidthOverride;
+
+ /// Resolved preferred vector width from function attribute and subtarget
+ /// features.
+ unsigned PreferVectorWidth = UINT32_MAX;
+
+ /// Required vector width from function attribute.
+ unsigned RequiredVectorWidth;
+
/// True if compiling for 64-bit, false for 16-bit or 32-bit.
bool In64BitMode;
@@ -386,8 +441,8 @@ private:
bool In16BitMode;
/// Contains the overhead of gather/scatter instructions
- int GatherOverhead;
- int ScatterOverhead;
+ int GatherOverhead = 1024;
+ int ScatterOverhead = 1024;
X86SelectionDAGInfo TSInfo;
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
@@ -401,7 +456,9 @@ public:
/// of the specified triple.
///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const X86TargetMachine &TM, unsigned StackAlignOverride);
+ const X86TargetMachine &TM, unsigned StackAlignOverride,
+ unsigned PreferVectorWidthOverride,
+ unsigned RequiredVectorWidth);
const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;
@@ -444,7 +501,6 @@ private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
- void initializeEnvironment();
void initSubtargetFeatures(StringRef CPU, StringRef FS);
public:
@@ -477,6 +533,7 @@ public:
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
bool hasX87() const { return HasX87; }
+ bool hasNOPL() const { return HasNOPL; }
bool hasCMov() const { return HasCMov; }
bool hasSSE1() const { return X86SSELevel >= SSE1; }
bool hasSSE2() const { return X86SSELevel >= SSE2; }
@@ -487,7 +544,6 @@ public:
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
bool hasAVX512() const { return X86SSELevel >= AVX512F; }
- bool hasFp256() const { return hasAVX(); }
bool hasInt256() const { return hasAVX2(); }
bool hasSSE4A() const { return HasSSE4A; }
bool hasMMX() const { return X863DNowLevel >= MMX; }
@@ -537,6 +593,10 @@ public:
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool hasMWAITX() const { return HasMWAITX; }
bool hasCLZERO() const { return HasCLZERO; }
+ bool hasCLDEMOTE() const { return HasCLDEMOTE; }
+ bool hasMOVDIRI() const { return HasMOVDIRI; }
+ bool hasMOVDIR64B() const { return HasMOVDIR64B; }
+ bool hasPTWRITE() const { return HasPTWRITE; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isPMULLDSlow() const { return IsPMULLDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
@@ -546,6 +606,8 @@ public:
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
+ bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;
}
@@ -579,12 +641,41 @@ public:
bool hasBITALG() const { return HasBITALG; }
bool hasMPX() const { return HasMPX; }
bool hasSHSTK() const { return HasSHSTK; }
- bool hasIBT() const { return HasIBT; }
bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
bool hasCLWB() const { return HasCLWB; }
+ bool hasWBNOINVD() const { return HasWBNOINVD; }
+ bool hasRDPID() const { return HasRDPID; }
+ bool hasWAITPKG() const { return HasWAITPKG; }
+ bool hasPCONFIG() const { return HasPCONFIG; }
+ bool hasSGX() const { return HasSGX; }
+ bool hasINVPCID() const { return HasINVPCID; }
bool useRetpoline() const { return UseRetpoline; }
bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
+ unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
+ unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
+
+ // Helper functions to determine when we should allow widening to 512-bit
+ // during codegen.
+ // TODO: Currently we're always allowing widening on CPUs without VLX,
+ // because for many cases we don't have a better option.
+ bool canExtendTo512DQ() const {
+ return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
+ }
+ bool canExtendTo512BW() const {
+ return hasBWI() && canExtendTo512DQ();
+ }
+
+ // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
+ // disable them in the legalizer.
+ bool useAVX512Regs() const {
+ return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
+ }
+
+ bool useBWIRegs() const {
+ return hasBWI() && useAVX512Regs();
+ }
+
bool isXRaySupported() const override { return is64Bit(); }
X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }
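A minimal standalone C++ sketch of how the new width queries compose, using a simplified struct rather than the real X86Subtarget API (the type, field, and method names below are only illustrative):

#include <cstdint>
#include <iostream>

struct WidthPolicy {
  bool HasAVX512 = false;
  bool HasVLX = false;
  unsigned PreferVectorWidth = UINT32_MAX;   // resolved preference
  unsigned RequiredVectorWidth = UINT32_MAX; // from the function attribute

  // Without VLX many AVX-512 operations exist only in 512-bit form, so
  // widening stays allowed; with VLX the preferred width is honored.
  bool canExtendTo512DQ() const {
    return HasAVX512 && (!HasVLX || PreferVectorWidth >= 512);
  }
  bool useAVX512Regs() const {
    return HasAVX512 && (canExtendTo512DQ() || RequiredVectorWidth > 256);
  }
};

int main() {
  WidthPolicy SKX{true, true, 256, 256};    // prefer-vector-width=256 on SKX
  std::cout << SKX.useAVX512Regs() << '\n'; // 0: 512-bit registers disabled
  SKX.RequiredVectorWidth = 512;            // an explicit 512-bit requirement
  std::cout << SKX.useAVX512Regs() << '\n'; // 1: the requirement wins
}

So a Skylake-AVX512 target that prefers 256-bit vectors keeps ZMM registers out of the legalizer unless a function explicitly requires wider vectors.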
@@ -592,6 +683,11 @@ public:
/// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
+ bool isGLM() const {
+ return X86ProcFamily == IntelGLM ||
+ X86ProcFamily == IntelGLP ||
+ X86ProcFamily == IntelTRM;
+ }
bool useSoftFloat() const { return UseSoftFloat; }
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
@@ -718,11 +814,6 @@ public:
bool enableEarlyIfConversion() const override;
- /// Return the instruction itineraries based on the subtarget selection.
- const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
- }
-
AntiDepBreakMode getAntiDepBreakMode() const override {
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
index e41e16d82d83..374bf3daaf9b 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -26,7 +26,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/ExecutionDomainFix.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -34,7 +34,6 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -44,6 +43,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include <memory>
#include <string>
@@ -54,14 +54,20 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableSpeculativeLoadHardening(
+ "x86-speculative-load-hardening",
+ cl::desc("Enable speculative load hardening"), cl::init(false), cl::Hidden);
+
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
+void initializeShadowCallStackPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
-void initializeX86ExecutionDepsFixPass(PassRegistry &);
+void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
+void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
} // end namespace llvm
@@ -77,10 +83,12 @@ extern "C" void LLVMInitializeX86Target() {
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
+ initializeShadowCallStackPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
- initializeX86ExecutionDepsFixPass(PR);
+ initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
+ initializeX86AvoidSFBPassPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
}
@@ -101,8 +109,6 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return llvm::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
return llvm::make_unique<X86ELFTargetObjectFile>();
- if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
- return llvm::make_unique<X86WindowsTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
return llvm::make_unique<TargetLoweringObjectFileCOFF>();
llvm_unreachable("unknown subtarget type");
@@ -154,9 +160,15 @@ static std::string computeDataLayout(const Triple &TT) {
}
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ bool JIT,
Optional<Reloc::Model> RM) {
bool is64Bit = TT.getArch() == Triple::x86_64;
if (!RM.hasValue()) {
+ // JIT codegen should use static relocations by default, since it's
+ // typically executed in process and not relocatable.
+ if (JIT)
+ return Reloc::Static;
+
// Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
// Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
// use static relocation model by default.
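A simplified standalone sketch of the new default (the enum and helper below are illustrative, not the LLVM types); it captures only the cases named in the surrounding comments:

#include <optional>

enum class RelocModel { Static, Pic };

RelocModel effectiveRelocModel(bool JIT, std::optional<RelocModel> Requested,
                               bool IsDarwin64, bool IsWin64) {
  if (Requested)
    return *Requested;         // an explicit request always wins
  if (JIT)
    return RelocModel::Static; // in-process code is never relocated
  if (IsDarwin64 || IsWin64)
    return RelocModel::Pic;    // rip-relative addressing is required
  return RelocModel::Static;
}
// e.g. effectiveRelocModel(/*JIT=*/true, std::nullopt, false, false) == Static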
@@ -208,7 +220,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: LLVMTargetMachine(
T, computeDataLayout(TT), TT, CPU, FS, Options,
- getEffectiveRelocModel(TT, RM),
+ getEffectiveRelocModel(TT, JIT, RM),
getEffectiveCodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL),
TLOF(createTLOF(getTargetTriple())) {
// Windows stack unwinder gets confused when execution flow "falls through"
@@ -220,8 +232,15 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
// The check here for 64-bit windows is a bit icky, but as we're unlikely
// to ever want to mix 32 and 64-bit windows code in a single module
// this should be fine.
- if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4())
+ if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4() ||
+ TT.isOSBinFormatMachO()) {
this->Options.TrapUnreachable = true;
+ this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
+ }
+
+ // Outlining is available for x86-64.
+ if (TT.getArch() == Triple::x86_64)
+ setMachineOutliner(true);
initAsmInfo();
}
@@ -257,7 +276,38 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
if (SoftFloat)
Key += FS.empty() ? "+soft-float" : ",+soft-float";
- FS = Key.substr(CPU.size());
+ // Keep track of the key width after all features are added so we can extract
+ // the feature string out later.
+ unsigned CPUFSWidth = Key.size();
+
+ // Extract prefer-vector-width attribute.
+ unsigned PreferVectorWidthOverride = 0;
+ if (F.hasFnAttribute("prefer-vector-width")) {
+ StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString();
+ unsigned Width;
+ if (!Val.getAsInteger(0, Width)) {
+ Key += ",prefer-vector-width=";
+ Key += Val;
+ PreferVectorWidthOverride = Width;
+ }
+ }
+
+ // Extract required-vector-width attribute.
+ unsigned RequiredVectorWidth = UINT32_MAX;
+ if (F.hasFnAttribute("required-vector-width")) {
+ StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString();
+ unsigned Width;
+ if (!Val.getAsInteger(0, Width)) {
+ Key += ",required-vector-width=";
+ Key += Val;
+ RequiredVectorWidth = Width;
+ }
+ }
+
+ // Extracted here so that we make sure there is backing for the StringRef. If
+ // we assigned earlier, it's possible the SmallString reallocated leaving a
+ // dangling StringRef.
+ FS = Key.slice(CPU.size(), CPUFSWidth);
auto &I = SubtargetMap[Key];
if (!I) {
@@ -266,7 +316,9 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
- Options.StackAlignmentOverride);
+ Options.StackAlignmentOverride,
+ PreferVectorWidthOverride,
+ RequiredVectorWidth);
}
return I.get();
}
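The reordering above guards against a dangling StringRef: Key is a growable SmallString, so a view taken before the later appends could be invalidated by a reallocation. A standalone sketch of the safe pattern, using std::string and std::string_view as stand-ins for SmallString and StringRef:

#include <iostream>
#include <string>
#include <string_view>

int main() {
  std::string CPU = "skylake";
  std::string Key = CPU + "+avx2,+soft-float";   // CPU followed by features
  size_t CPUFSWidth = Key.size();                // remember where features end
  Key += ",prefer-vector-width=256";             // may reallocate the buffer
  Key += ",required-vector-width=256";
  // Only now is it safe to take a view into Key: no further appends follow.
  std::string_view FS(Key.data() + CPU.size(), CPUFSWidth - CPU.size());
  std::cout << FS << '\n';                       // "+avx2,+soft-float"
}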
@@ -327,20 +379,23 @@ public:
void addPreSched2() override;
};
-class X86ExecutionDepsFix : public ExecutionDepsFix {
+class X86ExecutionDomainFix : public ExecutionDomainFix {
public:
static char ID;
- X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {}
+ X86ExecutionDomainFix() : ExecutionDomainFix(ID, X86::VR128XRegClass) {}
StringRef getPassName() const override {
return "X86 Execution Dependency Fix";
}
};
-char X86ExecutionDepsFix::ID;
+char X86ExecutionDomainFix::ID;
} // end anonymous namespace
-INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix",
- "X86 Execution Dependency Fix", false, false)
+INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix",
+ "X86 Execution Domain Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
+INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix",
+ "X86 Execution Domain Fix", false, false)
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
return new X86PassConfig(*this, PM);
@@ -415,8 +470,12 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86FixupSetCC());
addPass(createX86OptimizeLEAs());
addPass(createX86CallFrameOptimization());
+ addPass(createX86AvoidStoreForwardingBlocks());
}
+ if (EnableSpeculativeLoadHardening)
+ addPass(createX86SpeculativeLoadHardeningPass());
+
addPass(createX86FlagsCopyLoweringPass());
addPass(createX86WinAllocaExpander());
}
@@ -432,8 +491,13 @@ void X86PassConfig::addPostRegAlloc() {
void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
void X86PassConfig::addPreEmitPass() {
- if (getOptLevel() != CodeGenOpt::None)
- addPass(new X86ExecutionDepsFix());
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(new X86ExecutionDomainFix());
+ addPass(createBreakFalseDeps());
+ }
+
+ addPass(createShadowCallStackPass());
+ addPass(createX86IndirectBranchTrackingPass());
if (UseVZeroUpper)
addPass(createX86IssueVZeroUpperPass());
@@ -448,4 +512,10 @@ void X86PassConfig::addPreEmitPass() {
void X86PassConfig::addPreEmitPass2() {
addPass(createX86RetpolineThunksPass());
+ // Verify basic block incoming and outgoing cfa offset and register values and
+ // correct CFA calculation rule where needed by inserting appropriate CFI
+ // instructions.
+ const Triple &TT = TM->getTargetTriple();
+ if (!TT.isOSDarwin() && !TT.isOSWindows())
+ addPass(createCFIInstrInserter());
}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
index fb35a6b2ec1a..505c4fa07b77 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -91,100 +91,3 @@ void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx,
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
-
-const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
- const GlobalValue *LHS, const GlobalValue *RHS,
- const TargetMachine &TM) const {
- // Our symbols should exist in address space zero, cowardly no-op if
- // otherwise.
- if (LHS->getType()->getPointerAddressSpace() != 0 ||
- RHS->getType()->getPointerAddressSpace() != 0)
- return nullptr;
-
- // Both ptrtoint instructions must wrap global objects:
- // - Only global variables are eligible for image relative relocations.
- // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
- // We expect __ImageBase to be a global variable without a section, externally
- // defined.
- //
- // It should look something like this: @__ImageBase = external constant i8
- if (!isa<GlobalObject>(LHS) || !isa<GlobalVariable>(RHS) ||
- LHS->isThreadLocal() || RHS->isThreadLocal() ||
- RHS->getName() != "__ImageBase" || !RHS->hasExternalLinkage() ||
- cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection())
- return nullptr;
-
- return MCSymbolRefExpr::create(TM.getSymbol(LHS),
- MCSymbolRefExpr::VK_COFF_IMGREL32,
- getContext());
-}
-
-static std::string APIntToHexString(const APInt &AI) {
- unsigned Width = (AI.getBitWidth() / 8) * 2;
- std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
- unsigned Size = HexString.size();
- assert(Width >= Size && "hex string is too large!");
- HexString.insert(HexString.begin(), Width - Size, '0');
-
- return HexString;
-}
-
-static std::string scalarConstantToHexString(const Constant *C) {
- Type *Ty = C->getType();
- if (isa<UndefValue>(C)) {
- return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
- } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
- return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
- } else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
- return APIntToHexString(CI->getValue());
- } else {
- unsigned NumElements;
- if (isa<VectorType>(Ty))
- NumElements = Ty->getVectorNumElements();
- else
- NumElements = Ty->getArrayNumElements();
- std::string HexString;
- for (int I = NumElements - 1, E = -1; I != E; --I)
- HexString += scalarConstantToHexString(C->getAggregateElement(I));
- return HexString;
- }
-}
-
-MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
- if (Kind.isMergeableConst() && C) {
- const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_LNK_COMDAT;
- std::string COMDATSymName;
- if (Kind.isMergeableConst4()) {
- if (Align <= 4) {
- COMDATSymName = "__real@" + scalarConstantToHexString(C);
- Align = 4;
- }
- } else if (Kind.isMergeableConst8()) {
- if (Align <= 8) {
- COMDATSymName = "__real@" + scalarConstantToHexString(C);
- Align = 8;
- }
- } else if (Kind.isMergeableConst16()) {
- if (Align <= 16) {
- COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
- Align = 16;
- }
- } else if (Kind.isMergeableConst32()) {
- if (Align <= 32) {
- COMDATSymName = "__ymm@" + scalarConstantToHexString(C);
- Align = 32;
- }
- }
-
- if (!COMDATSymName.empty())
- return getContext().getCOFFSection(".rdata", Characteristics, Kind,
- COMDATSymName,
- COFF::IMAGE_COMDAT_SELECT_ANY);
- }
-
- return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
-}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
index 76e9cd5db2a0..d045094edb1e 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
@@ -37,7 +37,7 @@ namespace llvm {
MCStreamer &Streamer) const override;
};
- /// \brief This implemenatation is used for X86 ELF targets that don't
+ /// This implementation is used for X86 ELF targets that don't
/// have a further specialization.
class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
public:
@@ -45,7 +45,7 @@ namespace llvm {
PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
}
- /// \brief Describe a TLS variable address within debug info.
+ /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
@@ -55,7 +55,7 @@ namespace llvm {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
- /// \brief This implementation is used for Fuchsia on x86-64.
+ /// This implementation is used for Fuchsia on x86-64.
class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
@@ -66,24 +66,11 @@ namespace llvm {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
- /// \brief This implementation is used for Solaris on x86/x86-64.
+ /// This implementation is used for Solaris on x86/x86-64.
class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
- /// \brief This implementation is used for Windows targets on x86 and x86-64.
- class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
- const MCExpr *
- lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS,
- const TargetMachine &TM) const override;
-
- /// \brief Given a mergeable constant with the specified size and relocation
- /// information, return a section that it should be placed in.
- MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
- const Constant *C,
- unsigned &Align) const override;
- };
-
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 967d67a84bc0..bae2ef80c365 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -130,12 +130,13 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
}
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+ unsigned PreferVectorWidth = ST->getPreferVectorWidth();
if (Vector) {
- if (ST->hasAVX512())
+ if (ST->hasAVX512() && PreferVectorWidth >= 512)
return 512;
- if (ST->hasAVX())
+ if (ST->hasAVX() && PreferVectorWidth >= 256)
return 256;
- if (ST->hasSSE1())
+ if (ST->hasSSE1() && PreferVectorWidth >= 128)
return 128;
return 0;
}
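A compact restatement of the gating just added, as a standalone helper (the function name is made up; this is not the TTI interface): the widest vector register reported to the vectorizer is now capped by the resolved prefer-vector-width value.

unsigned vectorRegisterBitWidth(bool HasAVX512, bool HasAVX, bool HasSSE1,
                                unsigned PreferVectorWidth) {
  if (HasAVX512 && PreferVectorWidth >= 512)
    return 512;
  if (HasAVX && PreferVectorWidth >= 256)
    return 256;
  if (HasSSE1 && PreferVectorWidth >= 128)
    return 128;
  return 0; // no vector registers usable under this preference
}
// e.g. vectorRegisterBitWidth(true, true, true, 256) == 256 even with AVX-512.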
@@ -180,28 +181,40 @@ int X86TTIImpl::getArithmeticInstrCost(
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry GLMCostTable[] = {
+ { ISD::FDIV, MVT::f32, 18 }, // divss
+ { ISD::FDIV, MVT::v4f32, 35 }, // divps
+ { ISD::FDIV, MVT::f64, 33 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 65 }, // divpd
+ };
+
+ if (ST->isGLM())
+ if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry SLMCostTable[] = {
- { ISD::MUL, MVT::v4i32, 11 }, // pmulld
- { ISD::MUL, MVT::v8i16, 2 }, // pmullw
- { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
- { ISD::FMUL, MVT::f64, 2 }, // mulsd
- { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
- { ISD::FMUL, MVT::v4f32, 2 }, // mulps
- { ISD::FDIV, MVT::f32, 17 }, // divss
- { ISD::FDIV, MVT::v4f32, 39 }, // divps
- { ISD::FDIV, MVT::f64, 32 }, // divsd
- { ISD::FDIV, MVT::v2f64, 69 }, // divpd
- { ISD::FADD, MVT::v2f64, 2 }, // addpd
- { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
// v2i64/v4i64 mul is custom lowered as a series of long:
// multiplies(3), shifts(3) and adds(2)
// slm muldq version throughput is 2 and addq throughput 4
- // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
+ // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
// 3X4 (addq throughput) = 17
- { ISD::MUL, MVT::v2i64, 17 },
+ { ISD::MUL, MVT::v2i64, 17 },
// slm addq\subq throughput is 4
- { ISD::ADD, MVT::v2i64, 4 },
- { ISD::SUB, MVT::v2i64, 4 },
+ { ISD::ADD, MVT::v2i64, 4 },
+ { ISD::SUB, MVT::v2i64, 4 },
};
if (ST->isSLM()) {
@@ -224,30 +237,53 @@ int X86TTIImpl::getArithmeticInstrCost(
if (!signedMode && OpMinSize <= 16)
return LT.first * 5; // pmullw/pmulhw/pshuf
}
+
if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
LT.second)) {
return LT.first * Entry->Cost;
}
}
- if (ISD == ISD::SDIV &&
- Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
+ ISD == ISD::UREM) &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
- // On X86, vector signed division by constants power-of-two are
- // normally expanded to the sequence SRA + SRL + ADD + SRA.
- // The OperandValue properties many not be same as that of previous
- // operation;conservatively assume OP_None.
- int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
- Op2Info, TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ if (ISD == ISD::SDIV || ISD == ISD::SREM) {
+ // On X86, vector signed division by constants power-of-two are
+ // normally expanded to the sequence SRA + SRL + ADD + SRA.
+ // The OperandValue properties may not be the same as that of the previous
+ // operation; conservatively assume OP_None.
+ int Cost =
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::SREM) {
+ // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
+ }
+
+ return Cost;
+ }
- return Cost;
+ // Vector unsigned division/remainder will be simplified to shifts/masks.
+ if (ISD == ISD::UDIV)
+ return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::UREM)
+ return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
}
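The block above costs the SRA + SRL + ADD + SRA expansion for signed division by a power of two, with SREM costed as x - (x/C)*C on top of that. A worked standalone sketch for C = 8 (illustrative scalar code, not the DAG lowering; the right shift of a negative int assumes the usual two's-complement arithmetic shift):

#include <cassert>
#include <cstdint>

int32_t sdiv_pow2(int32_t x) {
  const int k = 3;                           // C = 1 << k = 8
  int32_t sign = x >> 31;                    // SRA: all ones if x < 0
  int32_t bias = (uint32_t)sign >> (32 - k); // SRL: C - 1 only for negatives
  return (x + bias) >> k;                    // ADD + SRA
}

int32_t srem_pow2(int32_t x) {
  return x - sdiv_pow2(x) * 8;               // the extra mul + sub costed for SREM
}

int main() {
  assert(sdiv_pow2(-17) == -17 / 8 && srem_pow2(-17) == -17 % 8);
  assert(sdiv_pow2(23) == 23 / 8 && srem_pow2(23) == 23 % 8);
}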
static const CostTblEntry AVX512BWUniformConstCostTable[] = {
@@ -256,7 +292,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -272,7 +310,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -290,9 +330,13 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -312,13 +356,21 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
{ ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -326,8 +378,12 @@ int X86TTIImpl::getArithmeticInstrCost(
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
return LT.first * 32;
+ if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 38;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
+ if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 20;
// XOP has faster vXi8 shifts.
if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
@@ -405,12 +461,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v64i8, 64*20 },
- { ISD::SDIV, MVT::v32i16, 32*20 },
- { ISD::UDIV, MVT::v64i8, 64*20 },
- { ISD::UDIV, MVT::v32i16, 32*20 }
};
// Look for AVX512BW lowering tricks for custom cases.
@@ -432,14 +482,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v16i32, 16*20 },
- { ISD::SDIV, MVT::v8i64, 8*20 },
- { ISD::UDIV, MVT::v16i32, 16*20 },
- { ISD::UDIV, MVT::v8i64, 8*20 }
+ { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+
+ { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
};
if (ST->hasAVX512())
@@ -468,7 +522,9 @@ int X86TTIImpl::getArithmeticInstrCost(
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
- return LT.first;
+ return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
@@ -571,9 +627,16 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
- { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
{ ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -617,16 +680,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v32i8, 32*20 },
- { ISD::SDIV, MVT::v16i16, 16*20 },
- { ISD::SDIV, MVT::v8i32, 8*20 },
- { ISD::SDIV, MVT::v4i64, 4*20 },
- { ISD::UDIV, MVT::v32i8, 32*20 },
- { ISD::UDIV, MVT::v16i16, 16*20 },
- { ISD::UDIV, MVT::v8i32, 8*20 },
- { ISD::UDIV, MVT::v4i64, 4*20 },
};
if (ST->hasAVX())
@@ -634,6 +687,21 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE42CostTable[] = {
+ { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
@@ -666,7 +734,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
@@ -703,21 +771,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
-
- // It is not a good idea to vectorize division. We have to scalarize it and
- // in the process we will often end up having to spilling regular
- // registers. The overhead of division is going to dominate most kernels
- // anyways so try hard to prevent vectorization of division - it is
- // generally a bad idea. Assume somewhat arbitrarily that we have to be able
- // to hide "20 cycles" for each lane.
- { ISD::SDIV, MVT::v16i8, 16*20 },
- { ISD::SDIV, MVT::v8i16, 8*20 },
- { ISD::SDIV, MVT::v4i32, 4*20 },
- { ISD::SDIV, MVT::v2i64, 2*20 },
- { ISD::UDIV, MVT::v16i8, 16*20 },
- { ISD::UDIV, MVT::v8i16, 8*20 },
- { ISD::UDIV, MVT::v4i32, 4*20 },
- { ISD::UDIV, MVT::v2i64, 2*20 },
};
if (ST->hasSSE2())
@@ -733,6 +786,20 @@ int X86TTIImpl::getArithmeticInstrCost(
if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
+ // It is not a good idea to vectorize division. We have to scalarize it and
+ // in the process we will often end up having to spill regular
+ // registers. The overhead of division is going to dominate most kernels
+ // anyways so try hard to prevent vectorization of division - it is
+ // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+ // to hide "20 cycles" for each lane.
+ if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
+ ISD == ISD::UDIV || ISD == ISD::UREM)) {
+ int ScalarCost = getArithmeticInstrCost(
+ Opcode, Ty->getScalarType(), Op1Info, Op2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
+ }
+
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
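A hedged illustration of the scalarization fallback added above (the helper name is made up): when no cost-table entry matches a vector divide or remainder, the per-lane scalar cost is inflated by the arbitrary 20-cycle factor and multiplied across lanes and type-legalization splits.

// Illustrative only: mirrors 20 * LT.first * NumElements * ScalarCost.
int scalarizedDivCost(int NumSplits, int NumElements, int ScalarCost) {
  return 20 * NumSplits * NumElements * ScalarCost;
}
// e.g. a v4i32 sdiv with scalar cost 1 and no splitting: 20 * 1 * 4 * 1 = 80,
// deliberately high enough to discourage vectorizing division.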
@@ -887,8 +954,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
- { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
- { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
+ { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
+ { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb
{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
@@ -952,15 +1019,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
// + vinsertf128
- { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
- { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd
+ { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd
+ { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps
+ { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps
+ { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
@@ -968,9 +1035,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
- { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
{ TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
- { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd
{ TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
@@ -983,12 +1050,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry SSE41ShuffleTbl[] = {
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
- { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
+ { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
+ { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb
};
if (ST->hasSSE41())
@@ -1002,8 +1069,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
+ { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
+ { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por
{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
@@ -1030,11 +1097,11 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
+ { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
+ { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
+ { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por
{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
@@ -1058,7 +1125,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
- { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
+ { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
};
@@ -1488,6 +1555,15 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SETCC, MVT::v16f32, 1 },
};
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::SETCC, MVT::v32i16, 1 },
+ { ISD::SETCC, MVT::v64i8, 1 },
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -1632,6 +1708,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};
+ static const CostTblEntry GLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
+ };
+ static const CostTblEntry SLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
+ };
static const CostTblEntry SSE42CostTbl[] = {
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
@@ -1722,6 +1810,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->isGLM())
+ if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasCDI())
if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -1887,8 +1983,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
- getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
+ Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -2146,7 +2242,7 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
}
-/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int X86TTIImpl::getIntImmCost(int64_t Val) {
@@ -2481,6 +2577,10 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+bool X86TTIImpl::canMacroFuseCmp() {
+ return ST->hasMacroFusion();
+}
+
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
// The backend can't handle a single element vector.
if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
@@ -2523,7 +2623,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
// TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
// enable gather with a -march.
return (DataWidth == 32 || DataWidth == 64) &&
- (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
+ (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 6f01a6fd11df..3df899038820 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -120,6 +120,7 @@ public:
Type *Ty);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
+ bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 224262830b12..f882b760927c 100644
--- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -264,8 +264,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
}
}
- DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
- << getBlockExitStateName(CurState) << '\n');
+ LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+ << getBlockExitStateName(CurState) << '\n');
if (CurState == EXITS_DIRTY)
for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
@@ -341,8 +341,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
// successors need to be added to the worklist (if they haven't been
// already).
if (BBState.ExitState == PASS_THROUGH) {
- DEBUG(dbgs() << "MBB #" << MBB.getNumber()
- << " was Pass-through, is now Dirty-out.\n");
+ LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+ << " was Pass-through, is now Dirty-out.\n");
for (MachineBasicBlock *Succ : MBB.successors())
addDirtySuccessor(*Succ);
}
diff --git a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
index 1046696587d9..d298aaa97ecd 100644
--- a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -62,6 +62,7 @@ private:
unsigned StackPtr;
unsigned SlotSize;
int64_t StackProbeSize;
+ bool NoStackArgProbe;
StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
static char ID;
@@ -240,13 +241,21 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
}
break;
case Probe:
- // The probe lowering expects the amount in RAX/EAX.
- BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
- .addReg(MI->getOperand(0).getReg());
-
- // Do the probe.
- STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
- /*InPrologue=*/false);
+ if (!NoStackArgProbe) {
+ // The probe lowering expects the amount in RAX/EAX.
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
+ .addReg(MI->getOperand(0).getReg());
+
+ // Do the probe.
+ STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
+ /*InPrologue=*/false);
+ } else {
+ // No stack probe requested; just subtract the allocation amount from the stack pointer.
+ BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::SUB64rr : X86::SUB32rr),
+ StackPtr)
+ .addReg(StackPtr)
+ .addReg(MI->getOperand(0).getReg());
+ }
break;
}
@@ -285,6 +294,9 @@ bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
.getValueAsString()
.getAsInteger(0, StackProbeSize);
}
+ NoStackArgProbe = MF.getFunction().hasFnAttribute("no-stack-arg-probe");
+ if (NoStackArgProbe)
+ StackProbeSize = INT64_MAX;
LoweringMap Lowerings;
computeLowerings(MF, Lowerings);
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
index 6d6dedc60736..dde9c734f492 100644
--- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -695,10 +695,10 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
Worklist.push_back(BB);
continue;
}
- DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
- << " InitialState=" << InitialState << '\n');
- DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
- << " FinalState=" << FinalState << '\n');
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " InitialState=" << InitialState << '\n');
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " FinalState=" << FinalState << '\n');
InitialStates.insert({BB, InitialState});
FinalStates.insert({BB, FinalState});
}
@@ -743,8 +743,8 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
continue;
int PrevState = getPredState(FinalStates, F, ParentBaseState, BB);
- DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
- << " PrevState=" << PrevState << '\n');
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " PrevState=" << PrevState << '\n');
for (Instruction &I : *BB) {
CallSite CS(&I);
diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 059b75ef482a..faf66e5944ab 100644
--- a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file is part of the XCore Disassembler.
+/// This file is part of the XCore Disassembler.
///
//===----------------------------------------------------------------------===//
@@ -29,7 +29,7 @@ typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
-/// \brief A disassembler class for XCore.
+/// A disassembler class for XCore.
class XCoreDisassembler : public MCDisassembler {
public:
XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index 8a7efe2e39c6..a0b480026469 100644
--- a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains the declaration of the XCoreInstPrinter class,
+/// This file contains the declaration of the XCoreInstPrinter class,
/// which is used to print XCore MCInst to a .s file.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index 0da90df6eb16..8f7c8a82380a 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -42,6 +41,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include <algorithm>
#include <cctype>
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index 62b2c8eee152..b87c149a36dc 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -151,7 +151,7 @@ static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
Offset,
FramePtr));
}
- std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+ llvm::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
}
/// Creates an ordered list of EH info register 'spills'.
@@ -170,7 +170,7 @@ static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
SpillList.push_back(
StackSlotInfo(EHSlot[0], MFI.getObjectOffset(EHSlot[1]),
TL->getExceptionSelectorRegister(PersonalityFn)));
- std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+ llvm::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
}
static MachineMemOperand *getFrameIndexMMO(MachineBasicBlock &MBB,
@@ -427,7 +427,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
DebugLoc DL;
- if (MI != MBB.end() && !MI->isDebugValue())
+ if (MI != MBB.end() && !MI->isDebugInstr())
DL = MI->getDebugLoc();
for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index 0ac5ecfa7e8c..99e76144cba3 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -91,10 +91,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
// XCore does not have the NodeTypes below.
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
// 64bit
setOperationAction(ISD::ADD, MVT::i64, Custom);
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index c885332b07ad..d5e276788f71 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -364,7 +364,7 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const
{
DebugLoc DL;
- if (I != MBB.end() && !I->isDebugValue())
+ if (I != MBB.end() && !I->isDebugInstr())
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -386,7 +386,7 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const
{
DebugLoc DL;
- if (I != MBB.end() && !I->isDebugValue())
+ if (I != MBB.end() && !I->isDebugInstr())
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -429,7 +429,7 @@ MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
MachineBasicBlock::iterator MI,
unsigned Reg, uint64_t Value) const {
DebugLoc dl;
- if (MI != MBB.end() && !MI->isDebugValue())
+ if (MI != MBB.end() && !MI->isDebugInstr())
dl = MI->getDebugLoc();
if (isImmMskBitp(Value)) {
int N = Log2_32(Value) + 1;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 87532d11ede8..1c93ba8fa14c 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains a pass that lowers thread local variables on the
+/// This file contains a pass that lowers thread local variables on the
/// XCore.
///
//===----------------------------------------------------------------------===//
@@ -129,7 +129,7 @@ createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
do {
SmallVector<WeakTrackingVH, 8> WUsers(CE->user_begin(), CE->user_end());
- std::sort(WUsers.begin(), WUsers.end());
+ llvm::sort(WUsers.begin(), WUsers.end());
WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
while (!WUsers.empty())
if (WeakTrackingVH WU = WUsers.pop_back_val()) {
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
index 7763ccc8f4af..21270192b234 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file contains code to lower XCore MachineInstrs to their
+/// This file contains code to lower XCore MachineInstrs to their
/// corresponding MCInst records.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
index 8fb1593cc6e6..abcb80fcf766 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
@@ -21,7 +21,7 @@ namespace llvm {
class Mangler;
class AsmPrinter;
-/// \brief This class is used to lower an MachineInstr into an MCInst.
+/// This class is used to lower an MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY XCoreMCInstLower {
typedef MachineOperand::MachineOperandType MachineOperandType;
MCContext *Ctx;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
index 70376d40a37f..1915aaedc35d 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -274,14 +274,13 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int StackSize = MF.getFrameInfo().getStackSize();
#ifndef NDEBUG
- DEBUG(errs() << "\nFunction : "
- << MF.getName() << "\n");
- DEBUG(errs() << "<--------->\n");
- DEBUG(MI.print(errs()));
- DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
- DEBUG(errs() << "FrameOffset : " << Offset << "\n");
- DEBUG(errs() << "StackSize : " << StackSize << "\n");
- #endif
+ LLVM_DEBUG(errs() << "\nFunction : " << MF.getName() << "\n");
+ LLVM_DEBUG(errs() << "<--------->\n");
+ LLVM_DEBUG(MI.print(errs()));
+ LLVM_DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
+ LLVM_DEBUG(errs() << "FrameOffset : " << Offset << "\n");
+ LLVM_DEBUG(errs() << "StackSize : " << StackSize << "\n");
+#endif
Offset += StackSize;
@@ -299,7 +298,8 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
assert(Offset%4 == 0 && "Misaligned stack offset");
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+ LLVM_DEBUG(errs() << "Offset : " << Offset << "\n"
+ << "<--------->\n");
Offset/=4;
unsigned Reg = MI.getOperand(0).getReg();
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
index 0e337d65a0f6..c31f5d5a7c44 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
@@ -33,6 +33,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool enableMultipleCopyHints() const override { return true; }
+
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
diff --git a/contrib/llvm/lib/Testing/Support/Error.cpp b/contrib/llvm/lib/Testing/Support/Error.cpp
index ce0da44da408..5692cdfcdf7b 100644
--- a/contrib/llvm/lib/Testing/Support/Error.cpp
+++ b/contrib/llvm/lib/Testing/Support/Error.cpp
@@ -14,9 +14,10 @@
using namespace llvm;
llvm::detail::ErrorHolder llvm::detail::TakeError(llvm::Error Err) {
- bool Succeeded = !static_cast<bool>(Err);
- std::string Message;
- if (!Succeeded)
- Message = toString(std::move(Err));
- return {Succeeded, Message};
+ std::vector<std::shared_ptr<ErrorInfoBase>> Infos;
+ handleAllErrors(std::move(Err),
+ [&Infos](std::unique_ptr<ErrorInfoBase> Info) {
+ Infos.emplace_back(std::move(Info));
+ });
+ return {std::move(Infos)};
}
diff --git a/contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
index 964844922f07..f8de7ca73924 100644
--- a/contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
+++ b/contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -158,7 +158,7 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
if (Machine == IMAGE_FILE_MACHINE_I386 && Args.getLastArg(OPT_k)) {
for (COFFShortExport& E : Def->Exports) {
- if (E.isWeak() || (!E.Name.empty() && E.Name[0] == '?'))
+ if (!E.AliasTarget.empty() || (!E.Name.empty() && E.Name[0] == '?'))
continue;
E.SymbolName = E.Name;
// Trim off the trailing decoration. Symbols will always have a
diff --git a/contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index f5cf848aa8c7..d636dca7a2c7 100644
--- a/contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
+++ b/contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -121,6 +121,12 @@ int llvm::libDriverMain(ArrayRef<const char *> ArgsArr) {
for (auto *Arg : Args.filtered(OPT_UNKNOWN))
llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n";
+ // Handle /help
+ if (Args.hasArg(OPT_help)) {
+ Table.PrintHelp(outs(), ArgsArr[0], "LLVM Lib");
+ return 0;
+ }
+
// If no input files, silently do nothing to match lib.exe.
if (!Args.hasArgNoClaim(OPT_INPUT))
return 0;
diff --git a/contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td b/contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td
index 5a56ef7468d4..dd41952b7878 100644
--- a/contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td
+++ b/contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td
@@ -12,7 +12,11 @@ class P<string name, string help> :
def libpath: P<"libpath", "Object file search path">;
def out : P<"out", "Path to file to write output">;
-def llvmlibthin : F<"llvmlibthin">;
+def llvmlibthin : F<"llvmlibthin">,
+ HelpText<"Make .lib point to .obj files instead of copying their contents">;
+
+def help : F<"help">;
+def help_q : Flag<["/?", "-?"], "">, Alias<help>;
//==============================================================================
// The flags below do nothing. They are defined only for lib.exe compatibility.
diff --git a/contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
new file mode 100644
index 000000000000..b622d018478a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -0,0 +1,257 @@
+//===- AggressiveInstCombine.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the aggressive expression pattern combiner classes.
+// Currently, it handles expression patterns for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "AggressiveInstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+namespace {
+/// Contains expression pattern combiner logic.
+/// This class provides the logic to match expression patterns and combine
+/// them. It differs from the InstCombiner class in that each pattern
+/// combiner runs only once, as opposed to InstCombine's multi-iteration,
+/// which allows a pattern combiner to have higher complexity than the O(1)
+/// required by the instruction combiner.
+class AggressiveInstCombinerLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ AggressiveInstCombinerLegacyPass() : FunctionPass(ID) {
+ initializeAggressiveInstCombinerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Run all expression pattern optimizations on the given \p F function.
+ ///
+ /// \param F function to optimize.
+ /// \returns true if the IR is changed.
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
+/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
+/// of 'and' ops, then we also need to capture the fact that we saw an
+/// "and X, 1", so that's an extra return value for that case.
+struct MaskOps {
+ Value *Root;
+ APInt Mask;
+ bool MatchAndChain;
+ bool FoundAnd1;
+
+ MaskOps(unsigned BitWidth, bool MatchAnds) :
+ Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
+ MatchAndChain(MatchAnds), FoundAnd1(false) {}
+};
+
+/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
+/// chain of 'and' or 'or' instructions looking for shift ops of a common source
+/// value. Examples:
+/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
+/// returns { X, 0x129 }
+/// and (and (X >> 1), 1), (X >> 4)
+/// returns { X, 0x12 }
+static bool matchAndOrChain(Value *V, MaskOps &MOps) {
+ Value *Op0, *Op1;
+ if (MOps.MatchAndChain) {
+ // Recurse through a chain of 'and' operands. This requires an extra check
+ // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
+ // in the chain to know that all of the high bits are cleared.
+ if (match(V, m_And(m_Value(Op0), m_One()))) {
+ MOps.FoundAnd1 = true;
+ return matchAndOrChain(Op0, MOps);
+ }
+ if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ } else {
+ // Recurse through a chain of 'or' operands.
+ if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ }
+
+ // We need a shift-right or a bare value representing a compare of bit 0 of
+ // the original source operand.
+ Value *Candidate;
+ uint64_t BitIndex = 0;
+ if (!match(V, m_LShr(m_Value(Candidate), m_ConstantInt(BitIndex))))
+ Candidate = V;
+
+ // Initialize result source operand.
+ if (!MOps.Root)
+ MOps.Root = Candidate;
+
+ // The shift constant is out-of-range? This code hasn't been simplified.
+ if (BitIndex >= MOps.Mask.getBitWidth())
+ return false;
+
+ // Fill in the mask bit derived from the shift constant.
+ MOps.Mask.setBit(BitIndex);
+ return MOps.Root == Candidate;
+}
+
+/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
+/// These will include a chain of 'or' or 'and'-shifted bits from a
+/// common source value:
+/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
+/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
+/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
+/// that differ only with a final 'not' of the result. We expect that final
+/// 'not' to be folded with the compare that we create here (invert predicate).
+static bool foldAnyOrAllBitsSet(Instruction &I) {
+ // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
+ // final "and X, 1" instruction must be the final op in the sequence.
+ bool MatchAllBitsSet;
+ if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
+ MatchAllBitsSet = true;
+ else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
+ MatchAllBitsSet = false;
+ else
+ return false;
+
+ MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
+ if (MatchAllBitsSet) {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
+ return false;
+ } else {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
+ return false;
+ }
+
+ // The pattern was found. Create a masked compare that replaces all of the
+ // shift and logic ops.
+ IRBuilder<> Builder(&I);
+ Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
+ Value *And = Builder.CreateAnd(MOps.Root, Mask);
+ Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask) :
+ Builder.CreateIsNotNull(And);
+ Value *Zext = Builder.CreateZExt(Cmp, I.getType());
+ I.replaceAllUsesWith(Zext);
+ return true;
+}
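
A rough scalar illustration of the fold above (not part of the patch; the
function names and mask values are made up for this example): a chain of
shifted single-bit tests collapses into a single masked compare.

  // "all-bits-set": and (and (X >> 1), 1), (X >> 4)  -->  (X & 0x12) == 0x12
  static bool allBitsSetBefore(unsigned X) {
    return (((X >> 1) & 1u) & ((X >> 4) & 1u)) != 0;
  }
  static bool allBitsSetAfter(unsigned X) { return (X & 0x12u) == 0x12u; }

  // "any-bits-set": and (or (or (X >> 3), (X >> 5)), (X >> 8)), 1  -->  (X & 0x128) != 0
  static bool anyBitsSetBefore(unsigned X) {
    return ((((X >> 3) | (X >> 5)) | (X >> 8)) & 1u) != 0;
  }
  static bool anyBitsSetAfter(unsigned X) { return (X & 0x128u) != 0; }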
+
+/// This is the entry point for folds that could be implemented in regular
+/// InstCombine, but they are separated because they are not expected to
+/// occur frequently and/or have more than a constant-length pattern match.
+static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ // Do not delete instructions under here and invalidate the iterator.
+ // Walk the block backwards for efficiency. We're matching a chain of
+ // use->defs, so we're more likely to succeed by starting from the bottom.
+ // Also, we want to avoid matching partial patterns.
+ // TODO: It would be more efficient if we removed dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : make_range(BB.rbegin(), BB.rend()))
+ MadeChange |= foldAnyOrAllBitsSet(I);
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
+ bool MadeChange = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TruncInstCombine TIC(TLI, DL, DT);
+ MadeChange |= TIC.run(F);
+ MadeChange |= foldUnusualPatterns(F, DT);
+ return MadeChange;
+}
+
+void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return runImpl(F, TLI, DT);
+}
+
+PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT)) {
+ // No changes, all analyses are preserved.
+ return PreservedAnalyses::all();
+ }
+ // Mark all the analyses that instcombine updates as preserved.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char AggressiveInstCombinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
+ "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
+
+// Initialization Routines
+void llvm::initializeAggressiveInstCombine(PassRegistry &Registry) {
+ initializeAggressiveInstCombinerLegacyPassPass(Registry);
+}
+
+void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R) {
+ initializeAggressiveInstCombinerLegacyPassPass(*unwrap(R));
+}
+
+FunctionPass *llvm::createAggressiveInstCombinerPass() {
+ return new AggressiveInstCombinerLegacyPass();
+}
+
+void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveInstCombinerPass());
+}
diff --git a/contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
new file mode 100644
index 000000000000..199374cdabf3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -0,0 +1,121 @@
+//===- AggressiveInstCombineInternal.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the instruction pattern combiner classes.
+// Currently, it handles pattern expressions for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TruncInstCombine - looks for expression dags post-dominated by trunc
+// instructions and, for each eligible dag, creates a reduced bit-width
+// expression, replaces the old expression with the new one, and removes the
+// old one.
+// An eligible expression dag is one that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
+// 4. All instructions in the dag must not have users outside the dag.
+// Only exception is for {ZExt, SExt}Inst with operand type equal to the
+// new reduced type chosen in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//===----------------------------------------------------------------------===//
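
A minimal sketch of the reduction described above (a C++ analogue, not taken
from the patch; the names and types are chosen only for illustration): a value
computed in a wide type and then truncated can be computed directly in the
narrow type.

  #include <cstdint>
  // Before: the add is evaluated in 32 bits and the result is truncated.
  static uint16_t wideAdd(uint8_t A, uint8_t B) {
    return static_cast<uint16_t>(static_cast<uint32_t>(A) +
                                 static_cast<uint32_t>(B));
  }
  // After: the whole expression dag is evaluated in the 16-bit destination type.
  static uint16_t reducedAdd(uint8_t A, uint8_t B) {
    return static_cast<uint16_t>(static_cast<uint16_t>(A) +
                                 static_cast<uint16_t>(B));
  }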
+
+namespace llvm {
+ class DataLayout;
+ class DominatorTree;
+ class TargetLibraryInfo;
+
+class TruncInstCombine {
+ TargetLibraryInfo &TLI;
+ const DataLayout &DL;
+ const DominatorTree &DT;
+
+ /// List of all TruncInst instructions to be processed.
+ SmallVector<TruncInst *, 4> Worklist;
+
+ /// Currently processed TruncInst instruction.
+ TruncInst *CurrentTruncInst;
+
+ /// Information per each instruction in the expression dag.
+ struct Info {
+ /// Number of LSBs that are needed to generate a valid expression.
+ unsigned ValidBitWidth = 0;
+ /// Minimum number of LSBs needed to generate the ValidBitWidth.
+ unsigned MinBitWidth = 0;
+ /// The reduced value generated to replace the old instruction.
+ Value *NewValue = nullptr;
+ };
+ /// An ordered map representing the expression dag post-dominated by the
+ /// currently processed TruncInst. It maps each instruction in the dag to its
+ /// Info structure. The map is ordered such that each instruction appears
+ /// before all other instructions in the dag that use it.
+ MapVector<Instruction *, Info> InstInfoMap;
+
+public:
+ TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL,
+ const DominatorTree &DT)
+ : TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
+
+ /// Perform TruncInst pattern optimization on given function.
+ bool run(Function &F);
+
+private:
+ /// Build an expression dag dominated by \p CurrentTruncInst and append it to
+ /// the InstInfoMap container.
+ ///
+ /// \return true only if succeed to generate an eligible sub expression dag.
+ bool buildTruncExpressionDag();
+
+ /// Calculate the minimal allowed bit-width of the chain ending with the
+ /// currently visited truncate's operand.
+ ///
+ /// \return the minimum number of bits to which the chain ending with the
+ /// truncate's operand can be shrunk.
+ unsigned getMinBitWidth();
+
+ /// Build an expression dag dominated by the currently processed TruncInst
+ /// and check if it is eligible to be reduced to a smaller type.
+ ///
+ /// \return the scalar version of the new type to be used for the reduced
+ /// expression dag, or nullptr if the expression dag is not eligible
+ /// to be reduced.
+ Type *getBestTruncatedType();
+
+ /// Given a \p V value and a \p SclTy scalar type, return the generated reduced
+ /// value of \p V based on the type \p SclTy.
+ ///
+ /// \param V value to be reduced.
+ /// \param SclTy scalar version of new type to reduce to.
+ /// \return the new reduced value.
+ Value *getReducedOperand(Value *V, Type *SclTy);
+
+ /// Create a new expression dag using the reduced \p SclTy type and replace
+ /// the old expression dag with it. Also erase all instructions in the old
+ /// dag, except those that are still needed outside the dag.
+ ///
+ /// \param SclTy scalar version of new type to reduce expression dag into.
+ void ReduceExpressionDag(Type *SclTy);
+};
+} // end namespace llvm.
diff --git a/contrib/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/contrib/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
new file mode 100644
index 000000000000..8289b2d68f8a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -0,0 +1,418 @@
+//===- TruncInstCombine.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TruncInstCombine - looks for expression dags post-dominated by TruncInst
+// and, for each eligible dag, creates a reduced bit-width expression, replaces
+// the old expression with the new one, and removes the old expression.
+// An eligible expression dag is one that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width.
+// 4. All instructions in the dag must not have users outside the dag.
+// The only exception is for {ZExt, SExt}Inst with operand type equal to
+// the new reduced type evaluated in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+/// Given an instruction and a container, it fills all the relevant operands of
+/// that instruction, with respect to the Trunc expression dag optimization.
+static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // These CastInst are considered leaves of the evaluated expression, thus,
+ // their operands are not relevant.
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ Ops.push_back(I->getOperand(0));
+ Ops.push_back(I->getOperand(1));
+ break;
+ default:
+ llvm_unreachable("Unreachable!");
+ }
+}
+
+bool TruncInstCombine::buildTruncExpressionDag() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
+ // Clear old expression dag.
+ InstInfoMap.clear();
+
+ Worklist.push_back(CurrentTruncInst->getOperand(0));
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ auto *I = dyn_cast<Instruction>(Curr);
+ if (!I)
+ return false;
+
+ if (!Stack.empty() && Stack.back() == I) {
+ // Already handled all instruction operands, can remove it from both the
+ // Worklist and the Stack, and add it to the instruction info map.
+ Worklist.pop_back();
+ Stack.pop_back();
+ // Insert I to the Info map.
+ InstInfoMap.insert(std::make_pair(I, Info()));
+ continue;
+ }
+
+ if (InstInfoMap.count(I)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ // Add the instruction to the stack before start handling its operands.
+ Stack.push_back(I);
+
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // trunc(trunc(x)) -> trunc(x)
+ // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+ // trunc(ext(x)) -> trunc(x) if the source type is larger than the new
+ // dest
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
+ for (Value *Operand : Operands)
+ Worklist.push_back(Operand);
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here:
+ // 1. select, shufflevector, extractelement, insertelement
+ // 2. udiv, urem
+ // 3. shl, lshr, ashr
+ // 4. phi node(and loop handling)
+ // ...
+ return false;
+ }
+ }
+ return true;
+}
+
+unsigned TruncInstCombine::getMinBitWidth() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
+
+ Value *Src = CurrentTruncInst->getOperand(0);
+ Type *DstTy = CurrentTruncInst->getType();
+ unsigned TruncBitWidth = DstTy->getScalarSizeInBits();
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+ if (isa<Constant>(Src))
+ return TruncBitWidth;
+
+ Worklist.push_back(Src);
+ InstInfoMap[cast<Instruction>(Src)].ValidBitWidth = TruncBitWidth;
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ // Otherwise, it must be an instruction.
+ auto *I = cast<Instruction>(Curr);
+
+ auto &Info = InstInfoMap[I];
+
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
+
+ if (!Stack.empty() && Stack.back() == I) {
+ // Already handled all instruction operands, can remove it from both the
+ // Worklist and the Stack, and update MinBitWidth.
+ Worklist.pop_back();
+ Stack.pop_back();
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand))
+ Info.MinBitWidth =
+ std::max(Info.MinBitWidth, InstInfoMap[IOp].MinBitWidth);
+ continue;
+ }
+
+ // Add the instruction to the stack before start handling its operands.
+ Stack.push_back(I);
+ unsigned ValidBitWidth = Info.ValidBitWidth;
+
+ // Update minimum bit-width before handling its operands. This is required
+ // when the instruction is part of a loop.
+ Info.MinBitWidth = std::max(Info.MinBitWidth, Info.ValidBitWidth);
+
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand)) {
+ // If we already calculated the minimum bit-width for this valid
+ // bit-width, or for a smaller valid bit-width, then just keep the
+ // answer we already calculated.
+ unsigned IOpBitwidth = InstInfoMap.lookup(IOp).ValidBitWidth;
+ if (IOpBitwidth >= ValidBitWidth)
+ continue;
+ InstInfoMap[IOp].ValidBitWidth = std::max(ValidBitWidth, IOpBitwidth);
+ Worklist.push_back(IOp);
+ }
+ }
+ unsigned MinBitWidth = InstInfoMap.lookup(cast<Instruction>(Src)).MinBitWidth;
+ assert(MinBitWidth >= TruncBitWidth);
+
+ if (MinBitWidth > TruncBitWidth) {
+ // In this case, reducing an expression with a vector type might generate a
+ // new vector type, which is not preferable as it might result in generating
+ // sub-optimal code.
+ if (DstTy->isVectorTy())
+ return OrigBitWidth;
+ // Use the smallest integer type in the range [MinBitWidth, OrigBitWidth).
+ Type *Ty = DL.getSmallestLegalIntType(DstTy->getContext(), MinBitWidth);
+ // Update the minimum bit-width with the new destination type's bit-width if
+ // we succeeded in finding one; otherwise, use the original bit-width.
+ MinBitWidth = Ty ? Ty->getScalarSizeInBits() : OrigBitWidth;
+ } else { // MinBitWidth == TruncBitWidth
+ // In this case the expression can be evaluated with the trunc instruction
+ // destination type, and the trunc instruction can be omitted. However, we
+ // should not perform the evaluation if the original type is a legal scalar
+ // type and the target type is illegal.
+ bool FromLegal = MinBitWidth == 1 || DL.isLegalInteger(OrigBitWidth);
+ bool ToLegal = MinBitWidth == 1 || DL.isLegalInteger(MinBitWidth);
+ if (!DstTy->isVectorTy() && FromLegal && !ToLegal)
+ return OrigBitWidth;
+ }
+ return MinBitWidth;
+}
+
+Type *TruncInstCombine::getBestTruncatedType() {
+ if (!buildTruncExpressionDag())
+ return nullptr;
+
+ // We don't want to duplicate instructions, which isn't profitable. Thus, we
+ // can't shrink something that has multiple users, unless all users are
+ // post-dominated by the trunc instruction, i.e., were visited during the
+ // expression evaluation.
+ unsigned DesiredBitWidth = 0;
+ for (auto Itr : InstInfoMap) {
+ Instruction *I = Itr.first;
+ if (I->hasOneUse())
+ continue;
+ bool IsExtInst = (isa<ZExtInst>(I) || isa<SExtInst>(I));
+ for (auto *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ if (UI != CurrentTruncInst && !InstInfoMap.count(UI)) {
+ if (!IsExtInst)
+ return nullptr;
+ // If this is an extension from the dest type, we can eliminate it,
+ // even if it has multiple users. Thus, update the DesiredBitWidth and
+ // validate that all extension instructions agree on the same DesiredBitWidth.
+ unsigned ExtInstBitWidth =
+ I->getOperand(0)->getType()->getScalarSizeInBits();
+ if (DesiredBitWidth && DesiredBitWidth != ExtInstBitWidth)
+ return nullptr;
+ DesiredBitWidth = ExtInstBitWidth;
+ }
+ }
+
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+ // Calculate the minimum bit-width allowed for shrinking the currently
+ // visited truncate's operand.
+ unsigned MinBitWidth = getMinBitWidth();
+
+ // Check that we can shrink to a smaller bit-width than the original one and
+ // that it matches the DesiredBitWidth, if such a width exists.
+ if (MinBitWidth >= OrigBitWidth ||
+ (DesiredBitWidth && DesiredBitWidth != MinBitWidth))
+ return nullptr;
+
+ return IntegerType::get(CurrentTruncInst->getContext(), MinBitWidth);
+}
+
+/// Given a reduced scalar type \p Ty and a \p V value, return a reduced type
+/// for \p V, according to its type: if it is a vector type, return the vector
+/// version of \p Ty; otherwise return \p Ty.
+static Type *getReducedType(Value *V, Type *Ty) {
+ assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
+ if (auto *VTy = dyn_cast<VectorType>(V->getType()))
+ return VectorType::get(Ty, VTy->getNumElements());
+ return Ty;
+}
+
+Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
+ Type *Ty = getReducedType(V, SclTy);
+ if (auto *C = dyn_cast<Constant>(V)) {
+ C = ConstantExpr::getIntegerCast(C, Ty, false);
+ // If we got a constantexpr back, try to simplify it with DL info.
+ if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI))
+ C = FoldedC;
+ return C;
+ }
+
+ auto *I = cast<Instruction>(V);
+ Info Entry = InstInfoMap.lookup(I);
+ assert(Entry.NewValue);
+ return Entry.NewValue;
+}
+
+void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
+ for (auto &Itr : InstInfoMap) { // Forward
+ Instruction *I = Itr.first;
+ TruncInstCombine::Info &NodeInfo = Itr.second;
+
+ assert(!NodeInfo.NewValue && "Instruction has been evaluated");
+
+ IRBuilder<> Builder(I);
+ Value *Res = nullptr;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt: {
+ Type *Ty = getReducedType(I, SclTy);
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty) {
+ assert(!isa<TruncInst>(I) && "Cannot reach here with TruncInst");
+ NodeInfo.NewValue = I->getOperand(0);
+ continue;
+ }
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ // This also handles the case of zext(trunc(x)) -> zext(x).
+ Res = Builder.CreateIntCast(I->getOperand(0), Ty,
+ Opc == Instruction::SExt);
+
+ // Update Worklist entries with new value if needed.
+ // There are three possible changes to the Worklist:
+ // 1. Update Old-TruncInst -> New-TruncInst.
+ // 2. Remove Old-TruncInst (if New node is not TruncInst).
+ // 3. Add New-TruncInst (if Old node was not TruncInst).
+ auto Entry = find(Worklist, I);
+ if (Entry != Worklist.end()) {
+ if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ *Entry = NewCI;
+ else
+ Worklist.erase(Entry);
+ } else if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ Worklist.push_back(NewCI);
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
+ Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
+ Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled instruction");
+ }
+
+ NodeInfo.NewValue = Res;
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(I);
+ }
+
+ Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
+ Type *DstTy = CurrentTruncInst->getType();
+ if (Res->getType() != DstTy) {
+ IRBuilder<> Builder(CurrentTruncInst);
+ Res = Builder.CreateIntCast(Res, DstTy, false);
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(CurrentTruncInst);
+ }
+ CurrentTruncInst->replaceAllUsesWith(Res);
+
+ // Erase the old expression dag, which was replaced by the reduced expression
+ // dag. We iterate backward, which means we visit an instruction before we
+ // visit any of its operands; this way, when we get to an operand, we have
+ // already removed the instructions (from the expression dag) that use it.
+ CurrentTruncInst->eraseFromParent();
+ for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
+ // We still need to check that the instruction has no users before we erase
+ // it, because a {SExt, ZExt}Inst instruction might have other users that
+ // were not reduced; in such a case, we need to keep that instruction.
+ if (I->first->use_empty())
+ I->first->eraseFromParent();
+ }
+}
+
+bool TruncInstCombine::run(Function &F) {
+ bool MadeIRChange = false;
+
+ // Collect all TruncInst in the function into the Worklist for evaluating.
+ for (auto &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ for (auto &I : BB)
+ if (auto *CI = dyn_cast<TruncInst>(&I))
+ Worklist.push_back(CI);
+ }
+
+ // Process all TruncInst in the Worklist, for each instruction:
+ // 1. Check if it dominates an eligible expression dag to be reduced.
+ // 2. Create a reduced expression dag and replace the old one with it.
+ while (!Worklist.empty()) {
+ CurrentTruncInst = Worklist.pop_back_val();
+
+ if (Type *NewDstSclTy = getBestTruncatedType()) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
+ "dominated by: "
+ << CurrentTruncInst << '\n');
+ ReduceExpressionDag(NewDstSclTy);
+ MadeIRChange = true;
+ }
+ }
+
+ return MadeIRChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index ba05896af150..ac47a06281a5 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -27,10 +27,12 @@ namespace {
class Lowerer : public coro::LowererBase {
IRBuilder<> Builder;
PointerType *const AnyResumeFnPtrTy;
+ Constant *NoopCoro = nullptr;
void lowerResumeOrDestroy(CallSite CS, CoroSubFnInst::ResumeKind);
void lowerCoroPromise(CoroPromiseInst *Intrin);
void lowerCoroDone(IntrinsicInst *II);
+ void lowerCoroNoop(IntrinsicInst *II);
public:
Lowerer(Module &M)
@@ -103,6 +105,41 @@ void Lowerer::lowerCoroDone(IntrinsicInst *II) {
II->eraseFromParent();
}
+void Lowerer::lowerCoroNoop(IntrinsicInst *II) {
+ if (!NoopCoro) {
+ LLVMContext &C = Builder.getContext();
+ Module &M = *II->getModule();
+
+ // Create a noop.frame struct type.
+ StructType *FrameTy = StructType::create(C, "NoopCoro.Frame");
+ auto *FramePtrTy = FrameTy->getPointerTo();
+ auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy,
+ /*IsVarArgs=*/false);
+ auto *FnPtrTy = FnTy->getPointerTo();
+ FrameTy->setBody({FnPtrTy, FnPtrTy});
+
+ // Create a Noop function that does nothing.
+ Function *NoopFn =
+ Function::Create(FnTy, GlobalValue::LinkageTypes::PrivateLinkage,
+ "NoopCoro.ResumeDestroy", &M);
+ NoopFn->setCallingConv(CallingConv::Fast);
+ auto *Entry = BasicBlock::Create(C, "entry", NoopFn);
+ ReturnInst::Create(C, Entry);
+
+ // Create a constant struct for the frame.
+ Constant* Values[] = {NoopFn, NoopFn};
+ Constant* NoopCoroConst = ConstantStruct::get(FrameTy, Values);
+ NoopCoro = new GlobalVariable(M, NoopCoroConst->getType(), /*isConstant=*/true,
+ GlobalVariable::PrivateLinkage, NoopCoroConst,
+ "NoopCoro.Frame.Const");
+ }
+
+ Builder.SetInsertPoint(II);
+ auto *NoopCoroVoidPtr = Builder.CreateBitCast(NoopCoro, Int8Ptr);
+ II->replaceAllUsesWith(NoopCoroVoidPtr);
+ II->eraseFromParent();
+}
+
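Conceptually, the lowering above builds one module-level constant frame whose
resume and destroy slots both point at an empty function; a rough C++ analogue
(illustrative only, not part of the patch) looks like this:

  struct NoopCoroFrame {
    void (*Resume)(NoopCoroFrame *);
    void (*Destroy)(NoopCoroFrame *);
  };
  static void noopResumeDestroy(NoopCoroFrame *) {}
  // Both slots point at the same do-nothing function, so resuming or
  // destroying the noop coroutine has no effect.
  static const NoopCoroFrame NoopFrame = {noopResumeDestroy, noopResumeDestroy};
  // llvm.coro.noop is then replaced with the address of this constant frame.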
// Prior to CoroSplit, calls to coro.begin needs to be marked as NoDuplicate,
// as CoroSplit assumes there is exactly one coro.begin. After CoroSplit,
// NoDuplicate attribute will be removed from coro.begin otherwise, it will
@@ -138,6 +175,9 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
if (cast<CoroEndInst>(&I)->isFallthrough())
CS.setCannotDuplicate();
break;
+ case Intrinsic::coro_noop:
+ lowerCoroNoop(cast<IntrinsicInst>(&I));
+ break;
case Intrinsic::coro_id:
// Mark a function that comes out of the frontend that has a coro.id
// with a coroutine attribute.
@@ -192,10 +232,10 @@ struct CoroEarly : public FunctionPass {
// This pass has work to do only if we find intrinsics we are going to lower
// in the module.
bool doInitialization(Module &M) override {
- if (coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.destroy",
- "llvm.coro.done", "llvm.coro.end",
- "llvm.coro.free", "llvm.coro.promise",
- "llvm.coro.resume", "llvm.coro.suspend"}))
+ if (coro::declaresIntrinsics(
+ M, {"llvm.coro.id", "llvm.coro.destroy", "llvm.coro.done",
+ "llvm.coro.end", "llvm.coro.noop", "llvm.coro.free",
+ "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.suspend"}))
L = llvm::make_unique<Lowerer>(M);
return false;
}
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index 42fd6d746145..dfe05c4b2a5e 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -14,6 +14,7 @@
#include "CoroInternal.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
@@ -35,8 +36,8 @@ struct Lowerer : coro::LowererBase {
Lowerer(Module &M) : LowererBase(M) {}
void elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA);
- bool shouldElide() const;
- bool processCoroId(CoroIdInst *, AAResults &AA);
+ bool shouldElide(Function *F, DominatorTree &DT) const;
+ bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT);
};
} // end anonymous namespace
@@ -77,7 +78,6 @@ static bool operandReferences(CallInst *CI, AllocaInst *Frame, AAResults &AA) {
// call implies that the function does not references anything on the stack.
static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
Function &F = *Frame->getFunction();
- MemoryLocation Mem(Frame);
for (Instruction &I : instructions(F))
if (auto *Call = dyn_cast<CallInst>(&I))
if (Call->isTailCall() && operandReferences(Call, Frame, AA)) {
@@ -142,33 +142,54 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
removeTailCallAttribute(Frame, AA);
}
-bool Lowerer::shouldElide() const {
+bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
// If no CoroAllocs, we cannot suppress allocation, so elision is not
// possible.
if (CoroAllocs.empty())
return false;
// Check that for every coro.begin there is a coro.destroy directly
- // referencing the SSA value of that coro.begin. If the value escaped, then
- // coro.destroy would have been referencing a memory location storing that
- // value and not the virtual register.
-
- SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
+ // referencing the SSA value of that coro.begin along a non-exceptional path.
+ // If the value escaped, then coro.destroy would have been referencing a
+ // memory location storing that value and not the virtual register.
+
+ // First gather all of the non-exceptional terminators for the function.
+ SmallPtrSet<Instruction *, 8> Terminators;
+ for (BasicBlock &B : *F) {
+ auto *TI = B.getTerminator();
+ if (TI->getNumSuccessors() == 0 && !TI->isExceptional() &&
+ !isa<UnreachableInst>(TI))
+ Terminators.insert(TI);
+ }
+ // Filter out the coro.destroy that lie along exceptional paths.
+ SmallPtrSet<CoroSubFnInst *, 4> DAs;
for (CoroSubFnInst *DA : DestroyAddr) {
+ for (Instruction *TI : Terminators) {
+ if (DT.dominates(DA, TI)) {
+ DAs.insert(DA);
+ break;
+ }
+ }
+ }
+
+ // Find all the coro.begin referenced by coro.destroy along happy paths.
+ SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
+ for (CoroSubFnInst *DA : DAs) {
if (auto *CB = dyn_cast<CoroBeginInst>(DA->getFrame()))
ReferencedCoroBegins.insert(CB);
else
return false;
}
- // If size of the set is the same as total number of CoroBegins, means we
- // found a coro.free or coro.destroy mentioning a coro.begin and we can
+ // If the size of the set is the same as the total number of coro.begin, that means
+ // we found a coro.free or coro.destroy referencing each coro.begin, so we can
// perform heap elision.
return ReferencedCoroBegins.size() == CoroBegins.size();
}
-bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA) {
+bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
+ DominatorTree &DT) {
CoroBegins.clear();
CoroAllocs.clear();
CoroFrees.clear();
@@ -214,7 +235,7 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA) {
replaceWithConstant(ResumeAddrConstant, ResumeAddr);
- bool ShouldElide = shouldElide();
+ bool ShouldElide = shouldElide(CoroId->getFunction(), DT);
auto *DestroyAddrConstant = ConstantExpr::getExtractValue(
Resumers,
@@ -294,14 +315,16 @@ struct CoroElide : FunctionPass {
return Changed;
AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
for (auto *CII : L->CoroIds)
- Changed |= L->processCoroId(CII, AA);
+ Changed |= L->processCoroId(CII, AA, DT);
return Changed;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
}
StringRef getPassName() const override { return "Coroutine Elision"; }
};
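
A minimal standalone sketch (illustrative only, not part of this patch) of the check that the updated shouldElide performs: a coro.destroy only counts if it dominates some normal, non-exceptional function exit, and elision is allowed only when every coro.begin is referenced by such a destroy. The Destroy struct and the dominates callback are hypothetical stand-ins for Instruction and DominatorTree::dominates.

#include <functional>
#include <set>
#include <vector>

struct Destroy { int Id; int ReferencedBegin; };

bool canElide(const std::vector<Destroy> &Destroys,
              const std::vector<int> &NormalExits,
              const std::set<int> &AllCoroBegins,
              const std::function<bool(int, int)> &dominates) {
  std::set<int> Referenced;
  for (const Destroy &D : Destroys)
    for (int Exit : NormalExits)
      if (dominates(D.Id, Exit)) {      // destroy lies on a non-exceptional path
        Referenced.insert(D.ReferencedBegin);
        break;
      }
  return Referenced == AllCoroBegins;   // every coro.begin has such a destroy
}
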
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 6334256bf03a..cf63b678b618 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -19,6 +19,8 @@
#include "CoroInternal.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -27,7 +29,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -48,7 +49,7 @@ public:
BlockToIndexMapping(Function &F) {
for (BasicBlock &BB : F)
V.push_back(&BB);
- std::sort(V.begin(), V.end());
+ llvm::sort(V.begin(), V.end());
}
size_t blockToIndex(BasicBlock *BB) const {
@@ -105,8 +106,8 @@ struct SuspendCrossingInfo {
assert(Block[UseIndex].Consumes[DefIndex] && "use must consume def");
bool const Result = Block[UseIndex].Kills[DefIndex];
- DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName()
- << " answer is " << Result << "\n");
+ LLVM_DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName()
+ << " answer is " << Result << "\n");
return Result;
}
@@ -194,8 +195,8 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
bool Changed;
do {
- DEBUG(dbgs() << "iteration " << ++Iteration);
- DEBUG(dbgs() << "==============\n");
+ LLVM_DEBUG(dbgs() << "iteration " << ++Iteration);
+ LLVM_DEBUG(dbgs() << "==============\n");
Changed = false;
for (size_t I = 0; I < N; ++I) {
@@ -239,20 +240,20 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
Changed |= (S.Kills != SavedKills) || (S.Consumes != SavedConsumes);
if (S.Kills != SavedKills) {
- DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
- << "\n");
- DEBUG(dump("S.Kills", S.Kills));
- DEBUG(dump("SavedKills", SavedKills));
+ LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
+ << "\n");
+ LLVM_DEBUG(dump("S.Kills", S.Kills));
+ LLVM_DEBUG(dump("SavedKills", SavedKills));
}
if (S.Consumes != SavedConsumes) {
- DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
- DEBUG(dump("S.Consume", S.Consumes));
- DEBUG(dump("SavedCons", SavedConsumes));
+ LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
+ LLVM_DEBUG(dump("S.Consume", S.Consumes));
+ LLVM_DEBUG(dump("SavedCons", SavedConsumes));
}
}
}
} while (Changed);
- DEBUG(dump());
+ LLVM_DEBUG(dump());
}
#undef DEBUG_TYPE // "coro-suspend-crossing"
@@ -263,8 +264,9 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
namespace {
class Spill {
- Value *Def;
- Instruction *User;
+ Value *Def = nullptr;
+ Instruction *User = nullptr;
+ unsigned FieldNo = 0;
public:
Spill(Value *Def, llvm::User *U) : Def(Def), User(cast<Instruction>(U)) {}
@@ -272,6 +274,20 @@ public:
Value *def() const { return Def; }
Instruction *user() const { return User; }
BasicBlock *userBlock() const { return User->getParent(); }
+
+ // Note that the field index is stored in the first SpillEntry for a particular
+ // definition. Subsequent mentions of a definition do not have fieldNo
+ // assigned. This works out fine as the users of Spills capture the info about
+ // the definition the first time they encounter it. Consider refactoring
+ // SpillInfo into two arrays to normalize the spill representation.
+ unsigned fieldIndex() const {
+ assert(FieldNo && "Accessing unassigned field");
+ return FieldNo;
+ }
+ void setFieldIndex(unsigned FieldNumber) {
+ assert(!FieldNo && "Reassigning field number");
+ FieldNo = FieldNumber;
+ }
};
} // namespace
@@ -294,6 +310,57 @@ static void dump(StringRef Title, SpillInfo const &Spills) {
}
#endif
+namespace {
+// We cannot rely solely on the natural alignment of a type when building a
+// coroutine frame; if the alignment specified on the Alloca instruction
+// differs from the natural alignment of the alloca type, we will need to
+// insert padding.
+struct PaddingCalculator {
+ const DataLayout &DL;
+ LLVMContext &Context;
+ unsigned StructSize = 0;
+
+ PaddingCalculator(LLVMContext &Context, DataLayout const &DL)
+ : DL(DL), Context(Context) {}
+
+ // Replicate the logic from IR/DataLayout.cpp to match field offset
+ // computation for LLVM structs.
+ void addType(Type *Ty) {
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ if ((StructSize & (TyAlign - 1)) != 0)
+ StructSize = alignTo(StructSize, TyAlign);
+
+ StructSize += DL.getTypeAllocSize(Ty); // Consume space for this data item.
+ }
+
+ void addTypes(SmallVectorImpl<Type *> const &Types) {
+ for (auto *Ty : Types)
+ addType(Ty);
+ }
+
+ unsigned computePadding(Type *Ty, unsigned ForcedAlignment) {
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ auto Natural = alignTo(StructSize, TyAlign);
+ auto Forced = alignTo(StructSize, ForcedAlignment);
+
+ // Return how many bytes of padding we need to insert.
+ if (Natural != Forced)
+ return std::max(Natural, Forced) - StructSize;
+
+ // Rely on natural alignment.
+ return 0;
+ }
+
+ // If padding is required, return the padding field type to insert.
+ ArrayType *getPaddingType(Type *Ty, unsigned ForcedAlignment) {
+ if (auto Padding = computePadding(Ty, ForcedAlignment))
+ return ArrayType::get(Type::getInt8Ty(Context), Padding);
+
+ return nullptr;
+ }
+};
+} // namespace
+
// Build a struct that will keep state for an active coroutine.
// struct f.frame {
// ResumeFnTy ResumeFnAddr;
@@ -305,6 +372,8 @@ static void dump(StringRef Title, SpillInfo const &Spills) {
static StructType *buildFrameType(Function &F, coro::Shape &Shape,
SpillInfo &Spills) {
LLVMContext &C = F.getContext();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ PaddingCalculator Padder(C, DL);
SmallString<32> Name(F.getName());
Name.append(".Frame");
StructType *FrameTy = StructType::create(C, Name);
@@ -322,8 +391,10 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
Type::getIntNTy(C, IndexBits)};
Value *CurrentDef = nullptr;
+ Padder.addTypes(Types);
+
// Create an entry for every spilled value.
- for (auto const &S : Spills) {
+ for (auto &S : Spills) {
if (CurrentDef == S.def())
continue;
@@ -333,12 +404,22 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
continue;
Type *Ty = nullptr;
- if (auto *AI = dyn_cast<AllocaInst>(CurrentDef))
+ if (auto *AI = dyn_cast<AllocaInst>(CurrentDef)) {
Ty = AI->getAllocatedType();
- else
+ if (unsigned AllocaAlignment = AI->getAlignment()) {
+ // If alignment is specified in alloca, see if we need to insert extra
+ // padding.
+ if (auto PaddingTy = Padder.getPaddingType(Ty, AllocaAlignment)) {
+ Types.push_back(PaddingTy);
+ Padder.addType(PaddingTy);
+ }
+ }
+ } else {
Ty = CurrentDef->getType();
-
+ }
+ S.setFieldIndex(Types.size());
Types.push_back(Ty);
+ Padder.addType(Ty);
}
FrameTy->setBody(Types);
@@ -399,7 +480,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
Value *CurrentValue = nullptr;
BasicBlock *CurrentBlock = nullptr;
Value *CurrentReload = nullptr;
- unsigned Index = coro::Shape::LastKnownField;
+ unsigned Index = 0; // Proper field number will be read from field definition.
// We need to keep track of any allocas that need "spilling"
// since they will live in the coroutine frame now, all access to them
@@ -414,6 +495,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
// Create a load instruction to reload the spilled value from the coroutine
// frame.
auto CreateReload = [&](Instruction *InsertBefore) {
+ assert(Index && "accessing unassigned field number");
Builder.SetInsertPoint(InsertBefore);
auto *G = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, Index,
CurrentValue->getName() +
@@ -431,7 +513,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
CurrentBlock = nullptr;
CurrentReload = nullptr;
- ++Index;
+ Index = E.fieldIndex();
if (auto *AI = dyn_cast<AllocaInst>(CurrentValue)) {
// Spilled AllocaInst will be replaced with GEP from the coroutine frame
@@ -739,6 +821,8 @@ static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills,
for (User *U : CurrentValue->users()) {
Instruction *I = cast<Instruction>(U);
if (!DT.dominates(CoroBegin, I)) {
+ LLVM_DEBUG(dbgs() << "will move: " << *I << "\n");
+
// TODO: Make this more robust. Currently if we run into a situation
// where simple instruction move won't work we panic and
// report_fatal_error.
@@ -748,7 +832,6 @@ static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills,
" dominated by CoroBegin");
}
- DEBUG(dbgs() << "will move: " << *I << "\n");
NeedsMoving.push_back(I);
}
}
@@ -823,7 +906,7 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
break;
// Rewrite materializable instructions to be materialized at the use point.
- DEBUG(dump("Materializations", Spills));
+ LLVM_DEBUG(dump("Materializations", Spills));
rewriteMaterializableInstructions(Builder, Spills);
Spills.clear();
}
@@ -853,7 +936,7 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
Spills.emplace_back(&I, U);
}
}
- DEBUG(dump("Spills", Spills));
+ LLVM_DEBUG(dump("Spills", Spills));
moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
Shape.FrameTy = buildFrameType(F, Shape, Spills);
Shape.FramePtr = insertSpills(Spills, Shape);
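
A short standalone sketch (not part of the patch) of the padding computation PaddingCalculator performs, using plain integers instead of DataLayout; alignTo mirrors llvm::alignTo under that assumption.

#include <algorithm>
#include <cstdint>
#include <iostream>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Bytes of padding needed before a field whose natural (ABI) alignment is
// NaturalAlign but whose alloca requested ForcedAlign, at offset StructSize.
static uint64_t computePadding(uint64_t StructSize, uint64_t NaturalAlign,
                               uint64_t ForcedAlign) {
  uint64_t Natural = alignTo(StructSize, NaturalAlign);
  uint64_t Forced = alignTo(StructSize, ForcedAlign);
  return Natural != Forced ? std::max(Natural, Forced) - StructSize : 0;
}

int main() {
  // e.g. an i32 (ABI align 4) alloca marked 'align 16' placed at offset 20:
  // natural placement would be offset 20, forced placement is offset 32,
  // so a [12 x i8] padding field is inserted before it.
  std::cout << computePadding(20, 4, 16) << "\n"; // prints 12
}
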
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h b/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 1eac88dbac3a..8e690d649cf5 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -76,7 +76,6 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
DestroyField,
PromiseField,
IndexField,
- LastKnownField = IndexField
};
StructType *FrameTy;
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 122f51a0d214..49acc5e93a39 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -59,7 +60,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <cstddef>
@@ -250,7 +250,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
auto *FnTy = cast<FunctionType>(FnPtrTy->getElementType());
Function *NewF =
- Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage,
+ Function::Create(FnTy, GlobalValue::LinkageTypes::ExternalLinkage,
F.getName() + Suffix, M);
NewF->addParamAttr(0, Attribute::NonNull);
NewF->addParamAttr(0, Attribute::NoAlias);
@@ -265,6 +265,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
SmallVector<ReturnInst *, 4> Returns;
CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns);
+ NewF->setLinkage(GlobalValue::LinkageTypes::InternalLinkage);
// Remove old returns.
for (ReturnInst *Return : Returns)
@@ -653,13 +654,28 @@ getNotRelocatableInstructions(CoroBeginInst *CoroBegin,
// set.
do {
Instruction *Current = Work.pop_back_val();
+ LLVM_DEBUG(dbgs() << "CoroSplit: Will not relocate: " << *Current << "\n");
DoNotRelocate.insert(Current);
for (Value *U : Current->operands()) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
continue;
- if (isa<AllocaInst>(U))
+
+ if (auto *A = dyn_cast<AllocaInst>(I)) {
+ // Stores to alloca instructions that occur before the coroutine frame
+ // is allocated should not be moved; the stored values may be used by
+ // the coroutine frame allocator. The operands to those stores must also
+ // remain in place.
+ for (const auto &User : A->users())
+ if (auto *SI = dyn_cast<llvm::StoreInst>(User))
+ if (RelocBlocks.count(SI->getParent()) != 0 &&
+ DoNotRelocate.count(SI) == 0) {
+ Work.push_back(SI);
+ DoNotRelocate.insert(SI);
+ }
continue;
+ }
+
if (DoNotRelocate.count(I) == 0) {
Work.push_back(I);
DoNotRelocate.insert(I);
@@ -834,8 +850,8 @@ struct CoroSplit : public CallGraphSCCPass {
for (Function *F : Coroutines) {
Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR);
StringRef Value = Attr.getValueAsString();
- DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F->getName()
- << "' state: " << Value << "\n");
+ LLVM_DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F->getName()
+ << "' state: " << Value << "\n");
if (Value == UNPREPARED_FOR_SPLIT) {
prepareForSplit(*F, CG);
continue;
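
A condensed standalone sketch (hypothetical Inst type, not LLVM code) of the extended "do not relocate" closure above: when an operand is an alloca, the stores into that alloca that sit in the relocatable region are pinned as well, so the values they store remain visible to the frame allocation code.

#include <set>
#include <vector>

struct Inst {
  std::vector<Inst *> Operands;
  std::vector<Inst *> Users;       // for allocas: the loads/stores using them
  bool IsAlloca = false;
  bool IsStore = false;
  bool InRelocRegion = false;
};

std::set<Inst *> collectDoNotRelocate(std::vector<Inst *> Roots) {
  std::set<Inst *> DoNotRelocate;
  std::vector<Inst *> Work(Roots);
  while (!Work.empty()) {
    Inst *Cur = Work.back();
    Work.pop_back();
    DoNotRelocate.insert(Cur);
    for (Inst *Op : Cur->Operands) {
      if (Op->IsAlloca) {
        // Pin the stores into the alloca rather than the alloca itself.
        for (Inst *U : Op->Users)
          if (U->IsStore && U->InRelocRegion && DoNotRelocate.insert(U).second)
            Work.push_back(U);
        continue;
      }
      if (DoNotRelocate.insert(Op).second)
        Work.push_back(Op);
    }
  }
  return DoNotRelocate;
}
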
diff --git a/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 10411c1bd65d..731faeb5dce4 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -11,12 +11,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Coroutines.h"
#include "CoroInstr.h"
#include "CoroInternal.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -31,10 +33,8 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Coroutines.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstddef>
#include <utility>
@@ -125,9 +125,10 @@ static bool isCoroutineIntrinsicName(StringRef Name) {
static const char *const CoroIntrinsics[] = {
"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.destroy",
"llvm.coro.done", "llvm.coro.end", "llvm.coro.frame",
- "llvm.coro.free", "llvm.coro.id", "llvm.coro.param",
- "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.save",
- "llvm.coro.size", "llvm.coro.subfn.addr", "llvm.coro.suspend",
+ "llvm.coro.free", "llvm.coro.id", "llvm.coro.noop",
+ "llvm.coro.param", "llvm.coro.promise", "llvm.coro.resume",
+ "llvm.coro.save", "llvm.coro.size", "llvm.coro.subfn.addr",
+ "llvm.coro.suspend",
};
return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1;
}
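
The reordering above splices "llvm.coro.noop" into the table in lexicographic position; the assumption here is that the lookup helper used by isCoroutineIntrinsicName expects a sorted table. A quick standalone check of that ordering (illustrative only):

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iterator>

int main() {
  static const char *const CoroIntrinsics[] = {
      "llvm.coro.alloc", "llvm.coro.begin",   "llvm.coro.destroy",
      "llvm.coro.done",  "llvm.coro.end",     "llvm.coro.frame",
      "llvm.coro.free",  "llvm.coro.id",      "llvm.coro.noop",
      "llvm.coro.param", "llvm.coro.promise", "llvm.coro.resume",
      "llvm.coro.save",  "llvm.coro.size",    "llvm.coro.subfn.addr",
      "llvm.coro.suspend",
  };
  assert(std::is_sorted(std::begin(CoroIntrinsics), std::end(CoroIntrinsics),
                        [](const char *A, const char *B) {
                          return std::strcmp(A, B) < 0;
                        }));
}
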
diff --git a/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
index a4bbc99b1f90..3b735ddd192e 100644
--- a/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -50,7 +50,8 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, ModuleAnalysisManager &) {
for (CallSite CS : Calls)
// FIXME: We really shouldn't be able to fail to inline at this point!
// We should do something to log or check the inline failures here.
- Changed |= InlineFunction(CS, IFI);
+ Changed |=
+ InlineFunction(CS, IFI, /*CalleeAAR=*/nullptr, InsertLifetime);
// Remember to try and delete this function afterward. This both avoids
// re-walking the rest of the module and avoids dealing with any iterator
@@ -129,7 +130,7 @@ Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
return new AlwaysInlinerLegacyPass(InsertLifetime);
}
-/// \brief Get the inline cost for the always-inliner.
+/// Get the inline cost for the always-inliner.
///
/// The always inliner *only* handles functions which are marked with the
/// attribute to force inlining. As such, it is dramatically simpler and avoids
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 76c4a8fbc16e..f2c2b55b1c5b 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -220,8 +220,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
NF->setSubprogram(F->getSubprogram());
F->setSubprogram(nullptr);
- DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
- << "From: " << *F);
+ LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
+ << "From: " << *F);
// Recompute the parameter attributes list based on the new arguments for
// the function.
@@ -426,8 +426,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
I2->setName(I->getName() + ".val");
LI->replaceAllUsesWith(&*I2);
LI->eraseFromParent();
- DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
- << "' in function '" << F->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n");
} else {
GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
IndicesVector Operands;
@@ -453,8 +453,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
NewName += ".val";
TheArg->setName(NewName);
- DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
- << "' of function '" << NF->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n");
// All of the uses must be load instructions. Replace them all with
// the argument specified by ArgNo.
@@ -688,11 +688,11 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
// to do.
if (ToPromote.find(Operands) == ToPromote.end()) {
if (MaxElements > 0 && ToPromote.size() == MaxElements) {
- DEBUG(dbgs() << "argpromotion not promoting argument '"
- << Arg->getName()
- << "' because it would require adding more "
- << "than " << MaxElements
- << " arguments to the function.\n");
+ LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '"
+ << Arg->getName()
+ << "' because it would require adding more "
+ << "than " << MaxElements
+ << " arguments to the function.\n");
// We limit aggregate promotion to only promoting up to a fixed number
// of elements of the aggregate.
return false;
@@ -738,7 +738,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
return true;
}
-/// \brief Checks if a type could have padding bytes.
+/// Checks if a type could have padding bytes.
static bool isDenselyPacked(Type *type, const DataLayout &DL) {
// There is no size information, so be conservative.
if (!type->isSized())
@@ -772,7 +772,7 @@ static bool isDenselyPacked(Type *type, const DataLayout &DL) {
return true;
}
-/// \brief Checks if the padding bytes of an argument could be accessed.
+/// Checks if the padding bytes of an argument could be accessed.
static bool canPaddingBeAccessed(Argument *arg) {
assert(arg->hasByValAttr());
@@ -817,6 +817,12 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
unsigned MaxElements,
Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
ReplaceCallSite) {
+ // Don't perform argument promotion for naked functions; otherwise we can end
+ // up removing parameters that are seemingly 'not used' as they are referred
+ // to in the assembly.
+ if (F->hasFnAttribute(Attribute::Naked))
+ return nullptr;
+
// Make sure that it is local to this module.
if (!F->hasLocalLinkage())
return nullptr;
@@ -895,11 +901,11 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
if (isSafeToPromote) {
if (StructType *STy = dyn_cast<StructType>(AgTy)) {
if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
- DEBUG(dbgs() << "argpromotion disable promoting argument '"
- << PtrArg->getName()
- << "' because it would require adding more"
- << " than " << MaxElements
- << " arguments to the function.\n");
+ LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '"
+ << PtrArg->getName()
+ << "' because it would require adding more"
+ << " than " << MaxElements
+ << " arguments to the function.\n");
continue;
}
@@ -973,7 +979,7 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
return FAM.getResult<AAManager>(F);
};
- Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None);
+ Function *NewF = promoteArguments(&OldF, AARGetter, MaxElements, None);
if (!NewF)
continue;
LocalChange = true;
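
A hedged illustration of the naked-function case guarded against above: at the IR level the arguments look unused, but the inline assembly reads them through the calling convention, so promoting or dropping them would be wrong. This sketch assumes Clang on x86-64; naked-attribute support and the register usage are target-specific assumptions, not part of the patch.

__attribute__((naked)) int add(int a, int b) {
  // a arrives in %edi and b in %esi under the SysV ABI; neither IR argument
  // is referenced directly, yet both are used here.
  __asm__ volatile("leal (%rdi,%rsi), %eax\n\t"
                   "ret");
}
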
diff --git a/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
index 6af104362594..05fc3dd6950c 100644
--- a/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -23,7 +23,7 @@
using namespace llvm;
namespace {
-/// \brief A nonce module pass used to place a barrier in a pass manager.
+/// A nonce module pass used to place a barrier in a pass manager.
///
/// There is no mechanism for ending a CGSCC pass manager once one is started.
/// This prevents extension points from having clear deterministic ordering
diff --git a/contrib/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/BlockExtractor.cpp
new file mode 100644
index 000000000000..ff5ee817da49
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/BlockExtractor.cpp
@@ -0,0 +1,176 @@
+//===- BlockExtractor.cpp - Extracts blocks into their own functions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts the specified basic blocks from the module into their
+// own functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "block-extractor"
+
+STATISTIC(NumExtracted, "Number of basic blocks extracted");
+
+static cl::opt<std::string> BlockExtractorFile(
+ "extract-blocks-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of basic blocks to extract"), cl::Hidden);
+
+cl::opt<bool> BlockExtractorEraseFuncs("extract-blocks-erase-funcs",
+ cl::desc("Erase the existing functions"),
+ cl::Hidden);
+
+namespace {
+class BlockExtractor : public ModulePass {
+ SmallVector<BasicBlock *, 16> Blocks;
+ bool EraseFunctions;
+ SmallVector<std::pair<std::string, std::string>, 32> BlocksByName;
+
+public:
+ static char ID;
+ BlockExtractor(const SmallVectorImpl<BasicBlock *> &BlocksToExtract,
+ bool EraseFunctions)
+ : ModulePass(ID), Blocks(BlocksToExtract.begin(), BlocksToExtract.end()),
+ EraseFunctions(EraseFunctions) {
+ if (!BlockExtractorFile.empty())
+ loadFile();
+ }
+ BlockExtractor() : BlockExtractor(SmallVector<BasicBlock *, 0>(), false) {}
+ bool runOnModule(Module &M) override;
+
+private:
+ void loadFile();
+ void splitLandingPadPreds(Function &F);
+};
+} // end anonymous namespace
+
+char BlockExtractor::ID = 0;
+INITIALIZE_PASS(BlockExtractor, "extract-blocks",
+ "Extract basic blocks from module", false, false)
+
+ModulePass *llvm::createBlockExtractorPass() { return new BlockExtractor(); }
+ModulePass *llvm::createBlockExtractorPass(
+ const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions) {
+ return new BlockExtractor(BlocksToExtract, EraseFunctions);
+}
+
+/// Gets all of the blocks specified in the input file.
+void BlockExtractor::loadFile() {
+ auto ErrOrBuf = MemoryBuffer::getFile(BlockExtractorFile);
+ if (ErrOrBuf.getError())
+ report_fatal_error("BlockExtractor couldn't load the file.");
+ // Read the file.
+ auto &Buf = *ErrOrBuf;
+ SmallVector<StringRef, 16> Lines;
+ Buf->getBuffer().split(Lines, '\n', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ for (const auto &Line : Lines) {
+ auto FBPair = Line.split(' ');
+ BlocksByName.push_back({FBPair.first, FBPair.second});
+ }
+}
+
+/// Extracts the landing pads to make sure all of them have only one
+/// predecessor.
+void BlockExtractor::splitLandingPadPreds(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (!isa<InvokeInst>(&I))
+ continue;
+ InvokeInst *II = cast<InvokeInst>(&I);
+ BasicBlock *Parent = II->getParent();
+ BasicBlock *LPad = II->getUnwindDest();
+
+ // Look through the landing pad's predecessors. If one of them ends in an
+ // 'invoke', then we want to split the landing pad.
+ bool Split = false;
+ for (auto PredBB : predecessors(LPad)) {
+ if (PredBB->isLandingPad() && PredBB != Parent &&
+ isa<InvokeInst>(Parent->getTerminator())) {
+ Split = true;
+ break;
+ }
+ }
+
+ if (!Split)
+ continue;
+
+ SmallVector<BasicBlock *, 2> NewBBs;
+ SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
+ }
+ }
+}
+
+bool BlockExtractor::runOnModule(Module &M) {
+
+ bool Changed = false;
+
+ // Get all the functions.
+ SmallVector<Function *, 4> Functions;
+ for (Function &F : M) {
+ splitLandingPadPreds(F);
+ Functions.push_back(&F);
+ }
+
+ // Get all the blocks specified in the input file.
+ for (const auto &BInfo : BlocksByName) {
+ Function *F = M.getFunction(BInfo.first);
+ if (!F)
+ report_fatal_error("Invalid function name specified in the input file");
+ auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
+ return BB.getName().equals(BInfo.second);
+ });
+ if (Res == F->end())
+ report_fatal_error("Invalid block name specified in the input file");
+ Blocks.push_back(&*Res);
+ }
+
+ // Extract basic blocks.
+ for (BasicBlock *BB : Blocks) {
+ // Check if the module contains BB.
+ if (BB->getParent()->getParent() != &M)
+ report_fatal_error("Invalid basic block");
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Extracting "
+ << BB->getParent()->getName() << ":" << BB->getName()
+ << "\n");
+ SmallVector<BasicBlock *, 2> BlocksToExtractVec;
+ BlocksToExtractVec.push_back(BB);
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ BlocksToExtractVec.push_back(II->getUnwindDest());
+ CodeExtractor(BlocksToExtractVec).extractCodeRegion();
+ ++NumExtracted;
+ Changed = true;
+ }
+
+ // Erase the functions.
+ if (EraseFunctions || BlockExtractorEraseFuncs) {
+ for (Function *F : Functions) {
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Trying to delete " << F->getName()
+ << "\n");
+ F->deleteBody();
+ }
+ // Set linkage as ExternalLinkage to avoid erasing unreachable functions.
+ for (Function &F : M)
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ Changed = true;
+ }
+
+ return Changed;
+}
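
A standalone sketch (not part of the patch) of the input format loadFile() expects: one "<function> <block>" pair per line, split on the first space. It mirrors the StringRef::split calls above using std::string only; the sample function and block names are made up.

#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // e.g. the contents of the file passed via -extract-blocks-file
  std::string Buffer = "foo entry\nfoo if.then\nbar for.body\n";
  std::vector<std::pair<std::string, std::string>> BlocksByName;
  std::istringstream In(Buffer);
  for (std::string Line; std::getline(In, Line);) {
    if (Line.empty())
      continue;
    auto Space = Line.find(' ');
    BlocksByName.emplace_back(
        Line.substr(0, Space),
        Space == std::string::npos ? std::string() : Line.substr(Space + 1));
  }
  for (auto &FB : BlocksByName)
    std::cout << FB.first << " -> " << FB.second << "\n";
}
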
diff --git a/contrib/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp b/contrib/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
index c5f6336aa2be..d642445b35de 100644
--- a/contrib/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -69,12 +69,15 @@ public:
CVPLatticeVal() : LatticeState(Undefined) {}
CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
- CVPLatticeVal(std::set<Function *, Compare> &&Functions)
- : LatticeState(FunctionSet), Functions(Functions) {}
+ CVPLatticeVal(std::vector<Function *> &&Functions)
+ : LatticeState(FunctionSet), Functions(std::move(Functions)) {
+ assert(std::is_sorted(this->Functions.begin(), this->Functions.end(),
+ Compare()));
+ }
/// Get a reference to the functions held by this lattice value. The number
/// of functions will be zero for states other than FunctionSet.
- const std::set<Function *, Compare> &getFunctions() const {
+ const std::vector<Function *> &getFunctions() const {
return Functions;
}
@@ -99,7 +102,8 @@ private:
/// MaxFunctionsPerValue. Since most LLVM values are expected to be in
/// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be
/// small and efficiently copyable.
- std::set<Function *, Compare> Functions;
+ // FIXME: This could be a TinyPtrVector and/or merge with LatticeState.
+ std::vector<Function *> Functions;
};
/// The custom lattice function used by the generic sparse propagation solver.
@@ -150,11 +154,10 @@ public:
return getOverdefinedVal();
if (X == getUndefVal() && Y == getUndefVal())
return getUndefVal();
- std::set<Function *, CVPLatticeVal::Compare> Union;
+ std::vector<Function *> Union;
std::set_union(X.getFunctions().begin(), X.getFunctions().end(),
Y.getFunctions().begin(), Y.getFunctions().end(),
- std::inserter(Union, Union.begin()),
- CVPLatticeVal::Compare{});
+ std::back_inserter(Union), CVPLatticeVal::Compare{});
if (Union.size() > MaxFunctionsPerValue)
return getOverdefinedVal();
return CVPLatticeVal(std::move(Union));
@@ -265,6 +268,10 @@ private:
// If we can't track the function's return values, there's nothing to do.
if (!F || !canTrackReturnsInterprocedurally(F)) {
+ // Void return; no need to create and update the CVPLattice state as no one
+ // can use it.
+ if (I->getType()->isVoidTy())
+ return;
ChangedValues[RegI] = getOverdefinedVal();
return;
}
@@ -280,6 +287,12 @@ private:
ChangedValues[RegFormal] =
MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
}
+
+ // Void return; no need to create and update the CVPLattice state as no one
+ // can use it.
+ if (I->getType()->isVoidTy())
+ return;
+
ChangedValues[RegI] =
MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
}
@@ -377,8 +390,7 @@ static bool runCVP(Module &M) {
CVPLatticeVal LV = Solver.getExistingValueState(RegI);
if (!LV.isFunctionSet() || LV.getFunctions().empty())
continue;
- MDNode *Callees = MDB.createCallees(SmallVector<Function *, 4>(
- LV.getFunctions().begin(), LV.getFunctions().end()));
+ MDNode *Callees = MDB.createCallees(LV.getFunctions());
C->setMetadata(LLVMContext::MD_callees, Callees);
Changed = true;
}
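
A small standalone sketch (illustrative, not part of the patch) of the lattice merge after the switch from std::set to a sorted std::vector: std::set_union over two sorted, duplicate-free ranges yields a sorted, duplicate-free union, so the invariant asserted in the CVPLatticeVal constructor holds. Integers stand in for Function pointers.

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

int main() {
  std::vector<int> X = {1, 3, 5};   // each input sorted and duplicate-free
  std::vector<int> Y = {3, 4, 5};
  std::vector<int> Union;
  std::set_union(X.begin(), X.end(), Y.begin(), Y.end(),
                 std::back_inserter(Union));
  for (int V : Union)
    std::cout << V << ' ';          // prints: 1 3 4 5
  std::cout << '\n';
}
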
diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
index 886029ea58d5..666f6cc37bfd 100644
--- a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -162,9 +162,6 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
}
bool CrossDSOCFI::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
VeryLikelyWeights =
MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index b2afa6f2c9cd..31e771da3bd3 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -240,8 +240,11 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
I2->takeName(&*I);
}
- // Patch the pointer to LLVM function in debug info descriptor.
- NF->setSubprogram(Fn.getSubprogram());
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ Fn.getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
// Fix up any BlockAddresses that refer to the function.
Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
@@ -526,8 +529,8 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
}
if (HasMustTailCalls) {
- DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
- << " has musttail calls\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail calls\n");
}
if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
@@ -535,8 +538,9 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
return;
}
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
- << F.getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
+ << F.getName() << "\n");
// Keep track of the number of live retvals, so we can skip checks once all
// of them turn out to be live.
unsigned NumLiveRetVals = 0;
@@ -603,16 +607,16 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
}
if (HasMustTailCallers) {
- DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
- << " has musttail callers\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail callers\n");
}
// Now we've inspected all callers, record the liveness of our return values.
for (unsigned i = 0; i != RetCount; ++i)
MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
- << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
+ << F.getName() << "\n");
// Now, check all of our arguments.
unsigned i = 0;
@@ -671,8 +675,8 @@ void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
/// mark any values that are used as this function's parameters or by its return
/// values (according to Uses) live as well.
void DeadArgumentEliminationPass::MarkLive(const Function &F) {
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
- << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
+ << F.getName() << "\n");
// Mark the function as live.
LiveFunctions.insert(&F);
// Mark all arguments as live.
@@ -693,8 +697,8 @@ void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
if (!LiveValues.insert(RA).second)
return; // We were already marked Live.
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
- << RA.getDescription() << " live\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
+ << RA.getDescription() << " live\n");
PropagateLiveness(RA);
}
@@ -752,9 +756,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned);
} else {
++NumArgumentsEliminated;
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i
- << " (" << I->getName() << ") from " << F->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
+ << i << " (" << I->getName() << ") from "
+ << F->getName() << "\n");
}
}
@@ -797,8 +801,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
NewRetIdxs[i] = RetTypes.size() - 1;
} else {
++NumRetValsEliminated;
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing return value "
- << i << " from " << F->getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Removing return value "
+ << i << " from " << F->getName() << "\n");
}
}
if (RetTypes.size() > 1) {
@@ -837,10 +842,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+ // Strip allocsize attributes. They might refer to the deleted arguments.
+ AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
// Reconstruct the AttributesList based on the vector we constructed.
assert(ArgAttrVec.size() == Params.size());
- AttributeList NewPAL = AttributeList::get(
- F->getContext(), PAL.getFnAttributes(), RetAttrs, ArgAttrVec);
+ AttributeList NewPAL =
+ AttributeList::get(F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
// Create the new function type based on the recomputed parameters.
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -909,8 +918,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Reconstruct the AttributesList based on the vector we constructed.
assert(ArgAttrVec.size() == Args.size());
+
+ // Again, be sure to remove any allocsize attributes, since their indices
+ // may now be incorrect.
+ AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
AttributeList NewCallPAL = AttributeList::get(
- F->getContext(), CallPAL.getFnAttributes(), RetAttrs, ArgAttrVec);
+ F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
@@ -1051,8 +1066,11 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
BB.getInstList().erase(RI);
}
- // Patch the pointer to LLVM function in debug info descriptor.
- NF->setSubprogram(F->getSubprogram());
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F->getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
// Now that the old function is dead, delete it.
F->eraseFromParent();
@@ -1068,7 +1086,7 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
// removed. We can do this if they never call va_start. This loop cannot be
// fused with the next loop, because deleting a function invalidates
// information computed while surveying other functions.
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
Function &F = *I++;
if (F.getFunctionType()->isVarArg())
@@ -1079,7 +1097,7 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
// We assume all arguments are dead unless proven otherwise (allowing us to
// determine that dead arguments passed into recursive functions are dead).
//
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
for (auto &F : M)
SurveyFunction(F);
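
A minimal standalone sketch (not part of the patch) of why the allocsize attribute is dropped rather than kept: the attribute stores an argument index, and deleting dead arguments renumbers the remaining ones. The parameter names and index are purely illustrative.

#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Params = {"ctx", "unused", "nbytes"};
  unsigned AllocSizeIdx = 2;                 // allocsize(2) refers to "nbytes"
  assert(Params[AllocSizeIdx] == "nbytes");

  Params.erase(Params.begin() + 1);          // dead-arg elimination drops "unused"
  // The stale index now names the wrong (or a nonexistent) parameter, which is
  // why the pass removes Attribute::AllocSize instead of trying to fix it up.
  assert(AllocSizeIdx >= Params.size() || Params[AllocSizeIdx] != "nbytes");
}
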
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
index 042cacb70ad0..d45a88323910 100644
--- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -51,7 +51,7 @@ static void makeVisible(GlobalValue &GV, bool Delete) {
}
namespace {
- /// @brief A pass to extract specific global values and their dependencies.
+ /// A pass to extract specific global values and their dependencies.
class GVExtractorPass : public ModulePass {
SetVector<GlobalValue *> Named;
bool deleteStuff;
diff --git a/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index 325a5d77aadb..37273f975417 100644
--- a/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -42,8 +42,10 @@ static Attribute::AttrKind parseAttrKind(StringRef Kind) {
.Case("nonlazybind", Attribute::NonLazyBind)
.Case("noredzone", Attribute::NoRedZone)
.Case("noreturn", Attribute::NoReturn)
+ .Case("nocf_check", Attribute::NoCfCheck)
.Case("norecurse", Attribute::NoRecurse)
.Case("nounwind", Attribute::NoUnwind)
+ .Case("optforfuzzing", Attribute::OptForFuzzing)
.Case("optnone", Attribute::OptimizeNone)
.Case("optsize", Attribute::OptimizeForSize)
.Case("readnone", Attribute::ReadNone)
@@ -51,6 +53,7 @@ static Attribute::AttrKind parseAttrKind(StringRef Kind) {
.Case("argmemonly", Attribute::ArgMemOnly)
.Case("returns_twice", Attribute::ReturnsTwice)
.Case("safestack", Attribute::SafeStack)
+ .Case("shadowcallstack", Attribute::ShadowCallStack)
.Case("sanitize_address", Attribute::SanitizeAddress)
.Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
.Case("sanitize_memory", Attribute::SanitizeMemory)
@@ -72,8 +75,8 @@ static void addForcedAttributes(Function &F) {
auto Kind = parseAttrKind(KV.second);
if (Kind == Attribute::None) {
- DEBUG(dbgs() << "ForcedAttribute: " << KV.second
- << " unknown or not handled!\n");
+ LLVM_DEBUG(dbgs() << "ForcedAttribute: " << KV.second
+ << " unknown or not handled!\n");
continue;
}
if (F.hasFnAttribute(Kind))
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 5352e32479bb..2797da6c0abd 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -74,6 +73,7 @@ STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
STATISTIC(NumNoAlias, "Number of function returns marked noalias");
STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
+STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
// FIXME: This is disabled by default to avoid exposing security vulnerabilities
// in C/C++ code compiled by clang:
@@ -83,6 +83,10 @@ static cl::opt<bool> EnableNonnullArgPropagation(
cl::desc("Try to propagate nonnull argument attributes from callsites to "
"caller functions."));
+static cl::opt<bool> DisableNoUnwindInference(
+ "disable-nounwind-inference", cl::Hidden,
+ cl::desc("Stop inferring nounwind attribute during function-attrs pass"));
+
namespace {
using SCCNodeSet = SmallSetVector<Function *, 8>;
@@ -401,7 +405,7 @@ static Attribute::AttrKind
determinePointerReadAttrs(Argument *A,
const SmallPtrSet<Argument *, 8> &SCCNodes) {
SmallVector<Use *, 32> Worklist;
- SmallSet<Use *, 32> Visited;
+ SmallPtrSet<Use *, 32> Visited;
// inalloca arguments are always clobbered by the call.
if (A->hasInAllocaAttr())
@@ -1008,7 +1012,8 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
if (!Speculative) {
// Mark the function eagerly since we may discover a function
// which prevents us from speculating about the entire SCC
- DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
+ LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName()
+ << " as nonnull\n");
F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
@@ -1027,7 +1032,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
!F->getReturnType()->isPointerTy())
continue;
- DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
+ LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
@@ -1037,49 +1042,214 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
return MadeChange;
}
-/// Remove the convergent attribute from all functions in the SCC if every
-/// callsite within the SCC is not convergent (except for calls to functions
-/// within the SCC). Returns true if changes were made.
-static bool removeConvergentAttrs(const SCCNodeSet &SCCNodes) {
- // For every function in SCC, ensure that either
- // * it is not convergent, or
- // * we can remove its convergent attribute.
- bool HasConvergentFn = false;
+namespace {
+
+/// Collects a set of attribute inference requests and performs them all in one
+/// go on a single SCC Node. Inference involves scanning function bodies
+/// looking for instructions that violate attribute assumptions.
+/// As soon as all the bodies are fine we are free to set the attribute.
+/// Customization of inference for individual attributes is performed by
+/// providing a handful of predicates for each attribute.
+class AttributeInferer {
+public:
+ /// Describes a request for inference of a single attribute.
+ struct InferenceDescriptor {
+
+ /// Returns true if this function does not have to be handled.
+ /// General intent for this predicate is to provide an optimization
+ /// for functions that do not need this attribute inference at all
+ /// (say, for functions that already have the attribute).
+ std::function<bool(const Function &)> SkipFunction;
+
+ /// Returns true if this instruction violates attribute assumptions.
+ std::function<bool(Instruction &)> InstrBreaksAttribute;
+
+ /// Sets the inferred attribute for this function.
+ std::function<void(Function &)> SetAttribute;
+
+ /// Attribute we derive.
+ Attribute::AttrKind AKind;
+
+ /// If true, only "exact" definitions can be used to infer this attribute.
+ /// See GlobalValue::isDefinitionExact.
+ bool RequiresExactDefinition;
+
+ InferenceDescriptor(Attribute::AttrKind AK,
+ std::function<bool(const Function &)> SkipFunc,
+ std::function<bool(Instruction &)> InstrScan,
+ std::function<void(Function &)> SetAttr,
+ bool ReqExactDef)
+ : SkipFunction(SkipFunc), InstrBreaksAttribute(InstrScan),
+ SetAttribute(SetAttr), AKind(AK),
+ RequiresExactDefinition(ReqExactDef) {}
+ };
+
+private:
+ SmallVector<InferenceDescriptor, 4> InferenceDescriptors;
+
+public:
+ void registerAttrInference(InferenceDescriptor AttrInference) {
+ InferenceDescriptors.push_back(AttrInference);
+ }
+
+ bool run(const SCCNodeSet &SCCNodes);
+};
+
+/// Perform all the requested attribute inference actions according to the
+/// attribute predicates stored before.
+bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
+ SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
+ // Go through all the functions in SCC and check corresponding attribute
+ // assumptions for each of them. Attributes that are invalid for this SCC
+ // will be removed from InferInSCC.
for (Function *F : SCCNodes) {
- if (!F->isConvergent()) continue;
- HasConvergentFn = true;
- // Can't remove convergent from function declarations.
- if (F->isDeclaration()) return false;
+ // No attributes whose assumptions are still valid - done.
+ if (InferInSCC.empty())
+ return false;
- // Can't remove convergent if any of our functions has a convergent call to a
- // function not in the SCC.
- for (Instruction &I : instructions(*F)) {
- CallSite CS(&I);
- // Bail if CS is a convergent call to a function not in the SCC.
- if (CS && CS.isConvergent() &&
- SCCNodes.count(CS.getCalledFunction()) == 0)
+ // Check if our attributes ever need scanning/can be scanned.
+ llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) {
+ if (ID.SkipFunction(*F))
return false;
+
+ // Remove from further inference (invalidate) when visiting a function
+ // that has no instructions to scan/has an unsuitable definition.
+ return F->isDeclaration() ||
+ (ID.RequiresExactDefinition && !F->hasExactDefinition());
+ });
+
+ // For each attribute still in InferInSCC that doesn't explicitly skip F,
+ // set up the F instructions scan to verify assumptions of the attribute.
+ SmallVector<InferenceDescriptor, 4> InferInThisFunc;
+ llvm::copy_if(
+ InferInSCC, std::back_inserter(InferInThisFunc),
+ [F](const InferenceDescriptor &ID) { return !ID.SkipFunction(*F); });
+
+ if (InferInThisFunc.empty())
+ continue;
+
+ // Start instruction scan.
+ for (Instruction &I : instructions(*F)) {
+ llvm::erase_if(InferInThisFunc, [&](const InferenceDescriptor &ID) {
+ if (!ID.InstrBreaksAttribute(I))
+ return false;
+ // Remove attribute from further inference on any other functions
+ // because attribute assumptions have just been violated.
+ llvm::erase_if(InferInSCC, [&ID](const InferenceDescriptor &D) {
+ return D.AKind == ID.AKind;
+ });
+ // Remove attribute from the rest of current instruction scan.
+ return true;
+ });
+
+ if (InferInThisFunc.empty())
+ break;
}
}
- // If the SCC doesn't have any convergent functions, we have nothing to do.
- if (!HasConvergentFn) return false;
+ if (InferInSCC.empty())
+ return false;
- // If we got here, all of the calls the SCC makes to functions not in the SCC
- // are non-convergent. Therefore all of the SCC's functions can also be made
- // non-convergent. We'll remove the attr from the callsites in
- // InstCombineCalls.
- for (Function *F : SCCNodes) {
- if (!F->isConvergent()) continue;
+ bool Changed = false;
+ for (Function *F : SCCNodes)
+ // At this point InferInSCC contains only functions that were either:
+ // - explicitly skipped from scan/inference, or
+ // - verified to have no instructions that break attribute assumptions.
+ // Hence we just go and force the attribute for all non-skipped functions.
+ for (auto &ID : InferInSCC) {
+ if (ID.SkipFunction(*F))
+ continue;
+ Changed = true;
+ ID.SetAttribute(*F);
+ }
+ return Changed;
+}
- DEBUG(dbgs() << "Removing convergent attr from fn " << F->getName()
- << "\n");
- F->setNotConvergent();
+} // end anonymous namespace
+
+/// Helper for non-Convergent inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonConvergent(Instruction &I,
+ const SCCNodeSet &SCCNodes) {
+ const CallSite CS(&I);
+ // Breaks non-convergent assumption if CS is a convergent call to a function
+ // not in the SCC.
+ return CS && CS.isConvergent() && SCCNodes.count(CS.getCalledFunction()) == 0;
+}
+
+/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) {
+ if (!I.mayThrow())
+ return false;
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ // I is a may-throw call to a function inside our SCC. This doesn't
+ // invalidate our current working assumption that the SCC is no-throw; we
+ // just have to scan that other function.
+ if (SCCNodes.count(Callee) > 0)
+ return false;
+ }
}
return true;
}
+/// Infer attributes from all functions in the SCC by scanning every
+/// instruction for compliance with the attribute assumptions. Currently it
+/// does:
+/// - removal of Convergent attribute
+/// - addition of NoUnwind attribute
+///
+/// Returns true if any changes to function attributes were made.
+static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
+
+ AttributeInferer AI;
+
+ // Request to remove the convergent attribute from all functions in the SCC
+ // if every callsite within the SCC is not convergent (except for calls
+ // to functions within the SCC).
+ // Note: Removal of the attr from the callsites will happen in
+ // InstCombineCalls separately.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::Convergent,
+ // Skip non-convergent functions.
+ [](const Function &F) { return !F.isConvergent(); },
+ // Instructions that break non-convergent assumption.
+ [SCCNodes](Instruction &I) {
+ return InstrBreaksNonConvergent(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from fn " << F.getName()
+ << "\n");
+ F.setNotConvergent();
+ },
+ /* RequiresExactDefinition= */ false});
+
+ if (!DisableNoUnwindInference)
+ // Request to infer nounwind attribute for all the functions in the SCC if
+ // every callsite within the SCC is not throwing (except for calls to
+ // functions within the SCC). Note that nounwind attribute suffers from
+ // derefinement - results may change depending on how functions are
+ // optimized. Thus it can be inferred only from exact definitions.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::NoUnwind,
+ // Skip non-throwing functions.
+ [](const Function &F) { return F.doesNotThrow(); },
+ // Instructions that break non-throwing assumption.
+ [SCCNodes](Instruction &I) {
+ return InstrBreaksNonThrowing(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs()
+ << "Adding nounwind attr to fn " << F.getName() << "\n");
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ },
+ /* RequiresExactDefinition= */ true});
+
+ // Perform all the requested attribute inference actions.
+ return AI.run(SCCNodes);
+}
+
static bool setDoesNotRecurse(Function &F) {
if (F.doesNotRecurse())
return false;
@@ -1136,7 +1306,8 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
bool HasUnknownCall = false;
for (LazyCallGraph::Node &N : C) {
Function &F = N.getFunction();
- if (F.hasFnAttribute(Attribute::OptimizeNone)) {
+ if (F.hasFnAttribute(Attribute::OptimizeNone) ||
+ F.hasFnAttribute(Attribute::Naked)) {
// Treat any function we're trying not to optimize as if it were an
// indirect call and omit it from the node set used below.
HasUnknownCall = true;
@@ -1167,7 +1338,7 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
if (!HasUnknownCall) {
Changed |= addNoAliasAttrs(SCCNodes);
Changed |= addNonNullAttrs(SCCNodes);
- Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= inferAttrsFromFunctionBodies(SCCNodes);
Changed |= addNoRecurseAttrs(SCCNodes);
}
@@ -1221,7 +1392,8 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
bool ExternalNode = false;
for (CallGraphNode *I : SCC) {
Function *F = I->getFunction();
- if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) {
+ if (!F || F->hasFnAttribute(Attribute::OptimizeNone) ||
+ F->hasFnAttribute(Attribute::Naked)) {
// External node or function we're trying not to optimize - we both avoid
// transform them and avoid leveraging information they provide.
ExternalNode = true;
@@ -1244,7 +1416,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
if (!ExternalNode) {
Changed |= addNoAliasAttrs(SCCNodes);
Changed |= addNonNullAttrs(SCCNodes);
- Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= inferAttrsFromFunctionBodies(SCCNodes);
Changed |= addNoRecurseAttrs(SCCNodes);
}
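
A condensed standalone model (illustrative only, not the pass itself) of the AttributeInferer flow above: each descriptor carries a skip predicate, a violation predicate, and an apply action; a descriptor survives the SCC-wide scan only if no non-skipped instruction violates it, and it is then applied to every non-skipped function. Func and Instr are hypothetical stand-ins, and the exact-definition/declaration filtering of the real code is omitted.

#include <algorithm>
#include <functional>
#include <string>
#include <vector>

struct Instr { bool MayThrow = false; };
struct Func  { std::string Name; std::vector<Instr> Body; };

struct Inference {
  std::function<bool(const Func &)> Skip;       // already has the attribute
  std::function<bool(const Instr &)> Breaks;    // violates the assumption
  std::function<void(Func &)> Apply;            // set the attribute
};

void run(std::vector<Func> &SCC, std::vector<Inference> Pending) {
  for (Func &F : SCC) {
    if (Pending.empty())
      return;
    for (const Instr &I : F.Body)
      Pending.erase(std::remove_if(Pending.begin(), Pending.end(),
                                   [&](const Inference &D) {
                                     return !D.Skip(F) && D.Breaks(I);
                                   }),
                    Pending.end());
  }
  for (Func &F : SCC)
    for (Inference &D : Pending)
      if (!D.Skip(F))
        D.Apply(F);
}

int main() {
  std::vector<Func> SCC = {{"f", {{/*MayThrow=*/false}}}};
  std::vector<Inference> Descs;
  Descs.push_back({[](const Func &) { return false; },          // never skip
                   [](const Instr &I) { return I.MayThrow; },   // may-throw breaks it
                   [](Func &) { /* would set nounwind here */ }});
  run(SCC, Descs);
}
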
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
index b6d6201cd23b..15808a073894 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -18,8 +18,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/Constants.h"
@@ -61,6 +61,7 @@ using namespace llvm;
#define DEBUG_TYPE "function-import"
STATISTIC(NumImportedFunctions, "Number of functions imported");
+STATISTIC(NumImportedGlobalVars, "Number of global variables imported");
STATISTIC(NumImportedModules, "Number of modules imported from");
STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
STATISTIC(NumLiveSymbols, "Number of live symbols in index");
@@ -70,6 +71,10 @@ static cl::opt<unsigned> ImportInstrLimit(
"import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
cl::desc("Only import functions with less than N instructions"));
+static cl::opt<int> ImportCutoff(
+ "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Only import first N functions if N>=0 (default -1)"));
+
static cl::opt<float>
ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
cl::Hidden, cl::value_desc("x"),
@@ -131,7 +136,7 @@ static cl::opt<bool>
static std::unique_ptr<Module> loadFile(const std::string &FileName,
LLVMContext &Context) {
SMDiagnostic Err;
- DEBUG(dbgs() << "Loading '" << FileName << "'\n");
+ LLVM_DEBUG(dbgs() << "Loading '" << FileName << "'\n");
// Metadata isn't loaded until functions are imported, to minimize
// the memory overhead.
std::unique_ptr<Module> Result =
@@ -163,6 +168,9 @@ selectCallee(const ModuleSummaryIndex &Index,
CalleeSummaryList,
[&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
auto *GVSummary = SummaryPtr.get();
+ if (!Index.isGlobalValueLive(GVSummary))
+ return false;
+
// For SamplePGO, in computeImportForFunction the OriginalId
// may have been used to locate the callee summary list (See
// comment there).
@@ -231,10 +239,37 @@ updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
// it, rather than needing to perform this mapping on each walk.
auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
if (GUID == 0)
- return nullptr;
+ return ValueInfo();
return Index.getValueInfo(GUID);
}
+static void computeImportForReferencedGlobals(
+ const FunctionSummary &Summary, const GVSummaryMapTy &DefinedGVSummaries,
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists) {
+ for (auto &VI : Summary.refs()) {
+ if (DefinedGVSummaries.count(VI.getGUID())) {
+ LLVM_DEBUG(
+ dbgs() << "Ref ignored! Target already in destination module.\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
+
+ for (auto &RefSummary : VI.getSummaryList())
+ if (RefSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind &&
+ // Don't try to import regular LTO summaries added to dummy module.
+ !RefSummary->modulePath().empty() &&
+ !GlobalValue::isInterposableLinkage(RefSummary->linkage()) &&
+ RefSummary->refs().empty()) {
+ ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+ if (ExportLists)
+ (*ExportLists)[RefSummary->modulePath()].insert(VI.getGUID());
+ break;
+ }
+ }
+}
+
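The helper added above walks a function summary's refs and pulls in only the simplest referenced globals: a variable whose defining copy lives in a real module (non-empty path), is not interposable, and has no references of its own. The following standalone sketch restates that selection with simplified stand-in types; none of these names are LLVM API, it only mirrors the filter in the loop above.

#include <cstdint>
#include <map>
#include <set>
#include <string>
#include <vector>

// Simplified stand-ins for summary data; every name here is illustrative.
struct GVSummaryStub {
  bool IsGlobalVar;        // summary kind is GlobalVarKind
  std::string ModulePath;  // empty for the regular-LTO dummy module
  bool Interposable;       // weak/linkonce-style linkage
  unsigned NumRefs;        // references held by the global itself
};

using GUID = uint64_t;
using ImportMapStub = std::map<std::string, std::set<GUID>>; // module -> GUIDs

// Import the first copy of a referenced global that is safe to pull in.
void importReferencedGlobal(GUID G, const std::vector<GVSummaryStub> &Summaries,
                            ImportMapStub &ImportList) {
  for (const GVSummaryStub &S : Summaries) {
    if (S.IsGlobalVar && !S.ModulePath.empty() && !S.Interposable &&
        S.NumRefs == 0) {
      ImportList[S.ModulePath].insert(G);
      break; // one defining copy is enough
    }
  }
}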
/// Compute the list of functions to import for a given caller. Mark these
/// imported functions and the symbols they reference in their source module as
/// exported from their source module.
@@ -243,18 +278,28 @@ static void computeImportForFunction(
const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
SmallVectorImpl<EdgeInfo> &Worklist,
FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists,
+ FunctionImporter::ImportThresholdsTy &ImportThresholds) {
+ computeImportForReferencedGlobals(Summary, DefinedGVSummaries, ImportList,
+ ExportLists);
+ static int ImportCount = 0;
for (auto &Edge : Summary.calls()) {
ValueInfo VI = Edge.first;
- DEBUG(dbgs() << " edge -> " << VI.getGUID() << " Threshold:" << Threshold
- << "\n");
+ LLVM_DEBUG(dbgs() << " edge -> " << VI << " Threshold:" << Threshold
+ << "\n");
+
+ if (ImportCutoff >= 0 && ImportCount >= ImportCutoff) {
+ LLVM_DEBUG(dbgs() << "ignored! import-cutoff value of " << ImportCutoff
+ << " reached.\n");
+ continue;
+ }
VI = updateValueInfoForIndirectCalls(Index, VI);
if (!VI)
continue;
if (DefinedGVSummaries.count(VI.getGUID())) {
- DEBUG(dbgs() << "ignored! Target already in destination module.\n");
+ LLVM_DEBUG(dbgs() << "ignored! Target already in destination module.\n");
continue;
}
@@ -269,20 +314,87 @@ static void computeImportForFunction(
};
const auto NewThreshold =
- Threshold * GetBonusMultiplier(Edge.second.Hotness);
-
- auto *CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
- Summary.modulePath());
- if (!CalleeSummary) {
- DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
- continue;
- }
+ Threshold * GetBonusMultiplier(Edge.second.getHotness());
+
+ auto IT = ImportThresholds.insert(
+ std::make_pair(VI.getGUID(), std::make_pair(NewThreshold, nullptr)));
+ bool PreviouslyVisited = !IT.second;
+ auto &ProcessedThreshold = IT.first->second.first;
+ auto &CalleeSummary = IT.first->second.second;
+
+ const FunctionSummary *ResolvedCalleeSummary = nullptr;
+ if (CalleeSummary) {
+ assert(PreviouslyVisited);
+ // Since the traversal of the call graph is DFS, we can revisit a function
+ // a second time with a higher threshold. In this case, it is added back
+ // to the worklist with the new threshold (so that its own callee chains
+ // can be considered with the higher threshold).
+ if (NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already imported with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
+ }
+ // Update with new larger threshold.
+ ProcessedThreshold = NewThreshold;
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+ } else {
+ // If we already rejected importing a callee at the same or higher
+ // threshold, don't waste time calling selectCallee.
+ if (PreviouslyVisited && NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already rejected with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
+ }
- // "Resolve" the summary
- const auto *ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary->getBaseObject());
+ CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
+ Summary.modulePath());
+ if (!CalleeSummary) {
+ // Update with new larger threshold if this was a retry (otherwise
+ // we would have already inserted with NewThreshold above).
+ if (PreviouslyVisited)
+ ProcessedThreshold = NewThreshold;
+ LLVM_DEBUG(
+ dbgs() << "ignored! No qualifying callee with summary found.\n");
+ continue;
+ }
- assert(ResolvedCalleeSummary->instCount() <= NewThreshold &&
- "selectCallee() didn't honor the threshold");
+ // "Resolve" the summary
+ CalleeSummary = CalleeSummary->getBaseObject();
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+
+ assert(ResolvedCalleeSummary->instCount() <= NewThreshold &&
+ "selectCallee() didn't honor the threshold");
+
+ auto ExportModulePath = ResolvedCalleeSummary->modulePath();
+ auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
+ // We previously decided to import this GUID definition if it was already
+ // inserted in the set of imports from the exporting module.
+ bool PreviouslyImported = !ILI.second;
+
+ // Make exports in the source module.
+ if (ExportLists) {
+ auto &ExportList = (*ExportLists)[ExportModulePath];
+ ExportList.insert(VI.getGUID());
+ if (!PreviouslyImported) {
+ // This is the first time this function was exported from its source
+ // module, so mark all functions and globals it references as exported
+ // to the outside if they are defined in the same source module.
+ // For efficiency, we unconditionally add all the referenced GUIDs
+ // to the ExportList for this module, and will prune out any not
+ // defined in the module later in a single pass.
+ for (auto &Edge : ResolvedCalleeSummary->calls()) {
+ auto CalleeGUID = Edge.first.getGUID();
+ ExportList.insert(CalleeGUID);
+ }
+ for (auto &Ref : ResolvedCalleeSummary->refs()) {
+ auto GUID = Ref.getGUID();
+ ExportList.insert(GUID);
+ }
+ }
+ }
+ }
auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
// Adjust the threshold for next level of imported functions.
@@ -293,44 +405,11 @@ static void computeImportForFunction(
return Threshold * ImportInstrFactor;
};
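With the defaults visible in the options above (-import-instr-limit=100, -import-instr-evolution-factor=0.7), each additional level of the import DFS shrinks the instruction-count budget, so deeper callees must be smaller to qualify; hot call sites get a bonus multiplier before this decay (that branch of the lambda is elided from the hunk). A tiny worked example of the decay only, not LLVM code:

#include <cstdio>

int main() {
  // Defaults taken from the options above: -import-instr-limit=100,
  // -import-instr-evolution-factor=0.7. Hot-callsite handling is omitted.
  const float ImportInstrFactor = 0.7f;
  unsigned Threshold = 100;
  for (int Depth = 0; Depth < 4; ++Depth) {
    std::printf("depth %d: threshold %u\n", Depth, Threshold);
    Threshold = static_cast<unsigned>(Threshold * ImportInstrFactor);
  }
  // Prints 100, 70, 49, 34: deeper callees must be smaller to be imported.
}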
- bool IsHotCallsite = Edge.second.Hotness == CalleeInfo::HotnessType::Hot;
+ bool IsHotCallsite =
+ Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
- auto ExportModulePath = ResolvedCalleeSummary->modulePath();
- auto &ProcessedThreshold = ImportList[ExportModulePath][VI.getGUID()];
- /// Since the traversal of the call graph is DFS, we can revisit a function
- /// a second time with a higher threshold. In this case, it is added back to
- /// the worklist with the new threshold.
- if (ProcessedThreshold && ProcessedThreshold >= AdjThreshold) {
- DEBUG(dbgs() << "ignored! Target was already seen with Threshold "
- << ProcessedThreshold << "\n");
- continue;
- }
- bool PreviouslyImported = ProcessedThreshold != 0;
- // Mark this function as imported in this module, with the current Threshold
- ProcessedThreshold = AdjThreshold;
-
- // Make exports in the source module.
- if (ExportLists) {
- auto &ExportList = (*ExportLists)[ExportModulePath];
- ExportList.insert(VI.getGUID());
- if (!PreviouslyImported) {
- // This is the first time this function was exported from its source
- // module, so mark all functions and globals it references as exported
- // to the outside if they are defined in the same source module.
- // For efficiency, we unconditionally add all the referenced GUIDs
- // to the ExportList for this module, and will prune out any not
- // defined in the module later in a single pass.
- for (auto &Edge : ResolvedCalleeSummary->calls()) {
- auto CalleeGUID = Edge.first.getGUID();
- ExportList.insert(CalleeGUID);
- }
- for (auto &Ref : ResolvedCalleeSummary->refs()) {
- auto GUID = Ref.getGUID();
- ExportList.insert(GUID);
- }
- }
- }
+ ImportCount++;
// Insert the newly imported function to the worklist.
Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, VI.getGUID());
@@ -347,12 +426,18 @@ static void ComputeImportForModule(
// Worklist contains the list of function imported in this module, for which
// we will analyse the callees and may import further down the callgraph.
SmallVector<EdgeInfo, 128> Worklist;
+ FunctionImporter::ImportThresholdsTy ImportThresholds;
// Populate the worklist with the import for the functions in the current
// module
for (auto &GVSummary : DefinedGVSummaries) {
+#ifndef NDEBUG
+ // FIXME: Change the GVSummaryMapTy to hold ValueInfo instead of GUID
+    // so this map lookup (and possibly others) can be avoided.

+ auto VI = Index.getValueInfo(GVSummary.first);
+#endif
if (!Index.isGlobalValueLive(GVSummary.second)) {
- DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n");
+ LLVM_DEBUG(dbgs() << "Ignores Dead GUID: " << VI << "\n");
continue;
}
auto *FuncSummary =
@@ -360,10 +445,10 @@ static void ComputeImportForModule(
if (!FuncSummary)
// Skip import for global variables
continue;
- DEBUG(dbgs() << "Initialize import for " << GVSummary.first << "\n");
+ LLVM_DEBUG(dbgs() << "Initialize import for " << VI << "\n");
computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
DefinedGVSummaries, Worklist, ImportList,
- ExportLists);
+ ExportLists, ImportThresholds);
}
// Process the newly imported functions and add callees to the worklist.
@@ -371,20 +456,37 @@ static void ComputeImportForModule(
auto FuncInfo = Worklist.pop_back_val();
auto *Summary = std::get<0>(FuncInfo);
auto Threshold = std::get<1>(FuncInfo);
- auto GUID = std::get<2>(FuncInfo);
-
- // Check if we later added this summary with a higher threshold.
- // If so, skip this entry.
- auto ExportModulePath = Summary->modulePath();
- auto &LatestProcessedThreshold = ImportList[ExportModulePath][GUID];
- if (LatestProcessedThreshold > Threshold)
- continue;
computeImportForFunction(*Summary, Index, Threshold, DefinedGVSummaries,
- Worklist, ImportList, ExportLists);
+ Worklist, ImportList, ExportLists,
+ ImportThresholds);
+ }
+}
+
+#ifndef NDEBUG
+static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
+ GlobalValue::GUID G) {
+ if (const auto &VI = Index.getValueInfo(G)) {
+ auto SL = VI.getSummaryList();
+ if (!SL.empty())
+ return SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind;
}
+ return false;
}
+static GlobalValue::GUID getGUID(GlobalValue::GUID G) { return G; }
+
+template <class T>
+static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
+ T &Cont) {
+ unsigned NumGVS = 0;
+ for (auto &V : Cont)
+ if (isGlobalVarSummary(Index, getGUID(V)))
+ ++NumGVS;
+ return NumGVS;
+}
+#endif
+
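The ImportThresholds map threaded through computeImportForFunction above memoizes, per callee GUID, the best threshold processed so far together with the callee summary chosen at that point, so a DFS revisit only does work when it arrives with a strictly larger threshold. A minimal sketch of that memo with stand-in types, collapsing the separate accepted/rejected bookkeeping of the real code:

#include <cstdint>
#include <map>
#include <utility>

using GUID = uint64_t;
struct FunctionSummaryStub {};  // stand-in for the selected callee summary

// Per-callee memo: best threshold processed so far, plus the chosen summary
// (null while the callee has only been rejected). Names are illustrative.
using ImportThresholdsStub =
    std::map<GUID, std::pair<unsigned, const FunctionSummaryStub *>>;

// Returns true if the edge still needs processing at NewThreshold.
bool shouldProcessEdge(ImportThresholdsStub &Memo, GUID Callee,
                       unsigned NewThreshold) {
  auto IT = Memo.insert({Callee, {NewThreshold, nullptr}});
  bool PreviouslyVisited = !IT.second;
  unsigned &ProcessedThreshold = IT.first->second.first;
  // The DFS can reach the same callee again with a larger threshold; only
  // then is it worth re-running callee selection and re-walking its calls.
  if (PreviouslyVisited && NewThreshold <= ProcessedThreshold)
    return false;
  ProcessedThreshold = NewThreshold;
  return true;
}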
/// Compute all the import and export for every module using the Index.
void llvm::ComputeCrossModuleImport(
const ModuleSummaryIndex &Index,
@@ -394,8 +496,8 @@ void llvm::ComputeCrossModuleImport(
// For each module that has function defined, compute the import/export lists.
for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
auto &ImportList = ImportLists[DefinedGVSummaries.first()];
- DEBUG(dbgs() << "Computing import for Module '"
- << DefinedGVSummaries.first() << "'\n");
+ LLVM_DEBUG(dbgs() << "Computing import for Module '"
+ << DefinedGVSummaries.first() << "'\n");
ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList,
&ExportLists);
}
@@ -417,32 +519,41 @@ void llvm::ComputeCrossModuleImport(
}
#ifndef NDEBUG
- DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
- << " modules:\n");
+ LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
+ << " modules:\n");
for (auto &ModuleImports : ImportLists) {
auto ModName = ModuleImports.first();
auto &Exports = ExportLists[ModName];
- DEBUG(dbgs() << "* Module " << ModName << " exports " << Exports.size()
- << " functions. Imports from " << ModuleImports.second.size()
- << " modules.\n");
+ unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
+ LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
+ << Exports.size() - NumGVS << " functions and " << NumGVS
+ << " vars. Imports from " << ModuleImports.second.size()
+ << " modules.\n");
for (auto &Src : ModuleImports.second) {
auto SrcModName = Src.first();
- DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
- << SrcModName << "\n");
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
+ << " global vars imported from " << SrcModName << "\n");
}
}
#endif
}
#ifndef NDEBUG
-static void dumpImportListForModule(StringRef ModulePath,
+static void dumpImportListForModule(const ModuleSummaryIndex &Index,
+ StringRef ModulePath,
FunctionImporter::ImportMapTy &ImportList) {
- DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
- << ImportList.size() << " modules.\n");
+ LLVM_DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
+ << ImportList.size() << " modules.\n");
for (auto &Src : ImportList) {
auto SrcModName = Src.first();
- DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
- << SrcModName << "\n");
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
+ << SrcModName << "\n");
}
}
#endif
@@ -457,11 +568,11 @@ void llvm::ComputeCrossModuleImportForModule(
Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
// Compute the import list for this module.
- DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
+ LLVM_DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
ComputeImportForModule(FunctionSummaryMap, Index, ImportList);
#ifndef NDEBUG
- dumpImportListForModule(ModulePath, ImportList);
+ dumpImportListForModule(Index, ModulePath, ImportList);
#endif
}
@@ -483,18 +594,18 @@ void llvm::ComputeCrossModuleImportForModuleFromIndex(
// e.g. record required linkage changes.
if (Summary->modulePath() == ModulePath)
continue;
- // Doesn't matter what value we plug in to the map, just needs an entry
- // to provoke importing by thinBackend.
- ImportList[Summary->modulePath()][GUID] = 1;
+ // Add an entry to provoke importing by thinBackend.
+ ImportList[Summary->modulePath()].insert(GUID);
}
#ifndef NDEBUG
- dumpImportListForModule(ModulePath, ImportList);
+ dumpImportListForModule(Index, ModulePath, ImportList);
#endif
}
void llvm::computeDeadSymbols(
ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) {
assert(!Index.withGlobalValueDeadStripping());
if (!ComputeDead)
return;
@@ -513,17 +624,18 @@ void llvm::computeDeadSymbols(
}
// Add values flagged in the index as live roots to the worklist.
- for (const auto &Entry : Index)
+ for (const auto &Entry : Index) {
+ auto VI = Index.getValueInfo(Entry);
for (auto &S : Entry.second.SummaryList)
if (S->isLive()) {
- DEBUG(dbgs() << "Live root: " << Entry.first << "\n");
- Worklist.push_back(ValueInfo(&Entry));
+ LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n");
+ Worklist.push_back(VI);
++LiveSymbols;
break;
}
+ }
// Make value live and add it to the worklist if it was not live before.
- // FIXME: we should only make the prevailing copy live here
auto visit = [&](ValueInfo VI) {
// FIXME: If we knew which edges were created for indirect call profiles,
// we could skip them here. Any that are live should be reached via
@@ -539,6 +651,28 @@ void llvm::computeDeadSymbols(
for (auto &S : VI.getSummaryList())
if (S->isLive())
return;
+
+  // A symbol known to be non-prevailing is kept live only if some copy is
+  // available_externally. Those copies are discarded later in the
+  // EliminateAvailableExternally pass, and setting them to not-live breaks
+  // downstream users of liveness information (PR36483).
+ if (isPrevailing(VI.getGUID()) == PrevailingType::No) {
+ bool AvailableExternally = false;
+ bool Interposable = false;
+ for (auto &S : VI.getSummaryList()) {
+ if (S->linkage() == GlobalValue::AvailableExternallyLinkage)
+ AvailableExternally = true;
+ else if (GlobalValue::isInterposableLinkage(S->linkage()))
+ Interposable = true;
+ }
+
+ if (!AvailableExternally)
+ return;
+
+ if (Interposable)
+ report_fatal_error("Interposable and available_externally symbol");
+ }
+
for (auto &S : VI.getSummaryList())
S->setLive(true);
++LiveSymbols;
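The new block in the visit lambda encodes a simple rule: a symbol already known to be non-prevailing is still marked live only when some copy is available_externally (those copies are dropped later by EliminateAvailableExternally), and combining such a copy with an interposable one is a fatal error. A compact restatement with a stand-in linkage enum, not LLVM's types:

#include <stdexcept>
#include <vector>

enum class LinkageStub { External, AvailableExternally, Interposable };

// Decide whether a non-prevailing symbol should still be marked live,
// mirroring the rule above (the real code inspects GlobalValueSummary lists).
bool keepNonPrevailingLive(const std::vector<LinkageStub> &Copies) {
  bool AvailableExternally = false, Interposable = false;
  for (LinkageStub L : Copies) {
    if (L == LinkageStub::AvailableExternally)
      AvailableExternally = true;
    else if (L == LinkageStub::Interposable)
      Interposable = true;
  }
  if (!AvailableExternally)
    return false; // no available_externally copy: let the symbol go dead
  if (Interposable)
    throw std::logic_error("Interposable and available_externally symbol");
  return true;    // keep live; EliminateAvailableExternally drops it later
}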
@@ -549,6 +683,8 @@ void llvm::computeDeadSymbols(
auto VI = Worklist.pop_back_val();
for (auto &Summary : VI.getSummaryList()) {
GlobalValueSummary *Base = Summary->getBaseObject();
+ // Set base value live in case it is an alias.
+ Base->setLive(true);
for (auto Ref : Base->refs())
visit(Ref);
if (auto *FS = dyn_cast<FunctionSummary>(Base))
@@ -559,8 +695,8 @@ void llvm::computeDeadSymbols(
Index.setWithGlobalValueDeadStripping();
unsigned DeadSymbols = Index.size() - LiveSymbols;
- DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
- << " symbols Dead \n");
+ LLVM_DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
+ << " symbols Dead \n");
NumDeadSymbols += DeadSymbols;
NumLiveSymbols += LiveSymbols;
}
@@ -581,47 +717,66 @@ void llvm::gatherImportedSummariesForModule(
const auto &DefinedGVSummaries =
ModuleToDefinedGVSummaries.lookup(ILI.first());
for (auto &GI : ILI.second) {
- const auto &DS = DefinedGVSummaries.find(GI.first);
+ const auto &DS = DefinedGVSummaries.find(GI);
assert(DS != DefinedGVSummaries.end() &&
"Expected a defined summary for imported global value");
- SummariesForIndex[GI.first] = DS->second;
+ SummariesForIndex[GI] = DS->second;
}
}
}
/// Emit the files \p ModulePath will import from into \p OutputFilename.
-std::error_code
-llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
- const FunctionImporter::ImportMapTy &ModuleImports) {
+std::error_code llvm::EmitImportsFiles(
+ StringRef ModulePath, StringRef OutputFilename,
+ const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
std::error_code EC;
raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None);
if (EC)
return EC;
- for (auto &ILI : ModuleImports)
- ImportsOS << ILI.first() << "\n";
+ for (auto &ILI : ModuleToSummariesForIndex)
+ // The ModuleToSummariesForIndex map includes an entry for the current
+ // Module (needed for writing out the index files). We don't want to
+ // include it in the imports file, however, so filter it out.
+ if (ILI.first != ModulePath)
+ ImportsOS << ILI.first << "\n";
return std::error_code();
}
+bool llvm::convertToDeclaration(GlobalValue &GV) {
+ LLVM_DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName()
+ << "\n");
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ F->clearMetadata();
+ F->setComdat(nullptr);
+ } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ V->setInitializer(nullptr);
+ V->setLinkage(GlobalValue::ExternalLinkage);
+ V->clearMetadata();
+ V->setComdat(nullptr);
+ } else {
+ GlobalValue *NewGV;
+ if (GV.getValueType()->isFunctionTy())
+ NewGV =
+ Function::Create(cast<FunctionType>(GV.getValueType()),
+ GlobalValue::ExternalLinkage, "", GV.getParent());
+ else
+ NewGV =
+ new GlobalVariable(*GV.getParent(), GV.getValueType(),
+ /*isConstant*/ false, GlobalValue::ExternalLinkage,
+ /*init*/ nullptr, "",
+ /*insertbefore*/ nullptr, GV.getThreadLocalMode(),
+ GV.getType()->getAddressSpace());
+ NewGV->takeName(&GV);
+ GV.replaceAllUsesWith(NewGV);
+ return false;
+ }
+ return true;
+}
+
/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis.
void llvm::thinLTOResolveWeakForLinkerModule(
Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
- auto ConvertToDeclaration = [](GlobalValue &GV) {
- DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName() << "\n");
- if (Function *F = dyn_cast<Function>(&GV)) {
- F->deleteBody();
- F->clearMetadata();
- } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
- V->setInitializer(nullptr);
- V->setLinkage(GlobalValue::ExternalLinkage);
- V->clearMetadata();
- } else
- // For now we don't resolve or drop aliases. Once we do we'll
- // need to add support here for creating either a function or
- // variable declaration, and return the new GlobalValue* for
- // the caller to use.
- llvm_unreachable("Expected function or variable");
- };
-
auto updateLinkage = [&](GlobalValue &GV) {
// See if the global summary analysis computed a new resolved linkage.
const auto &GS = DefinedGlobals.find(GV.getGUID());
@@ -651,11 +806,23 @@ void llvm::thinLTOResolveWeakForLinkerModule(
// interposable property and possibly get inlined. Simply drop
// the definition in that case.
if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
- GlobalValue::isInterposableLinkage(GV.getLinkage()))
- ConvertToDeclaration(GV);
- else {
- DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
- << GV.getLinkage() << " to " << NewLinkage << "\n");
+ GlobalValue::isInterposableLinkage(GV.getLinkage())) {
+ if (!convertToDeclaration(GV))
+ // FIXME: Change this to collect replaced GVs and later erase
+ // them from the parent module once thinLTOResolveWeakForLinkerGUID is
+ // changed to enable this for aliases.
+ llvm_unreachable("Expected GV to be converted");
+ } else {
+      // If the original symbol has global unnamed addr and linkonce_odr linkage,
+      // it should be an auto-hide symbol. Add hidden visibility to the symbol to
+      // preserve that property.
+ if (GV.hasLinkOnceODRLinkage() && GV.hasGlobalUnnamedAddr() &&
+ NewLinkage == GlobalValue::WeakODRLinkage)
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+
+ LLVM_DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName()
+ << "` from " << GV.getLinkage() << " to " << NewLinkage
+ << "\n");
GV.setLinkage(NewLinkage);
}
// Remove declarations from comdats, including available_externally
@@ -732,9 +899,9 @@ static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
// index.
Expected<bool> FunctionImporter::importFunctions(
Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
- DEBUG(dbgs() << "Starting import for Module "
- << DestModule.getModuleIdentifier() << "\n");
- unsigned ImportedCount = 0;
+ LLVM_DEBUG(dbgs() << "Starting import for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ unsigned ImportedCount = 0, ImportedGVCount = 0;
IRMover Mover(DestModule);
// Do the actual import of functions now, one Module at a time
@@ -766,9 +933,9 @@ Expected<bool> FunctionImporter::importFunctions(
continue;
auto GUID = F.getGUID();
auto Import = ImportGUIDs.count(GUID);
- DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function " << GUID
- << " " << F.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
+ << GUID << " " << F.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (Import) {
if (Error Err = F.materialize())
return std::move(Err);
@@ -788,13 +955,13 @@ Expected<bool> FunctionImporter::importFunctions(
continue;
auto GUID = GV.getGUID();
auto Import = ImportGUIDs.count(GUID);
- DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global " << GUID
- << " " << GV.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
+ << GUID << " " << GV.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (Import) {
if (Error Err = GV.materialize())
return std::move(Err);
- GlobalsToImport.insert(&GV);
+ ImportedGVCount += GlobalsToImport.insert(&GV);
}
}
for (GlobalAlias &GA : SrcModule->aliases()) {
@@ -802,9 +969,9 @@ Expected<bool> FunctionImporter::importFunctions(
continue;
auto GUID = GA.getGUID();
auto Import = ImportGUIDs.count(GUID);
- DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias " << GUID
- << " " << GA.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
+ << GUID << " " << GA.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (Import) {
if (Error Err = GA.materialize())
return std::move(Err);
@@ -813,9 +980,9 @@ Expected<bool> FunctionImporter::importFunctions(
if (Error Err = Base->materialize())
return std::move(Err);
auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
- DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
- << " " << Base->getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
+ << " " << Base->getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (EnableImportMetadata) {
// Add 'thinlto_src_module' metadata for statistics and debugging.
Fn->setMetadata(
@@ -851,10 +1018,15 @@ Expected<bool> FunctionImporter::importFunctions(
NumImportedModules++;
}
- NumImportedFunctions += ImportedCount;
+ NumImportedFunctions += (ImportedCount - ImportedGVCount);
+ NumImportedGlobalVars += ImportedGVCount;
- DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module "
- << DestModule.getModuleIdentifier() << "\n");
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
+ << " functions for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedGVCount
+ << " global variables for Module "
+ << DestModule.getModuleIdentifier() << "\n");
return ImportedCount;
}
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index e0bbf45d316a..1af7e6894777 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -17,14 +17,16 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -55,6 +57,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -63,7 +66,6 @@
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/Evaluator.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <utility>
@@ -88,6 +90,21 @@ STATISTIC(NumNestRemoved , "Number of nest attributes removed");
STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
+STATISTIC(NumInternalFunc, "Number of internal functions");
+STATISTIC(NumColdCC, "Number of functions marked coldcc");
+
+static cl::opt<bool>
+ EnableColdCCStressTest("enable-coldcc-stress-test",
+ cl::desc("Enable stress test of coldcc by adding "
+ "calling conv to all internal functions."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<int> ColdCCRelFreq(
+ "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+ cl::desc(
+ "Maximum block frequency, expressed as a percentage of caller's "
+ "entry frequency, for a call site to be considered cold for enabling"
+ "coldcc"));
/// Is this global variable possibly used by a leak checker as a root? If so,
/// we might not really want to eliminate the stores to it.
@@ -483,7 +500,6 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
StartAlignment = DL.getABITypeAlignment(GV->getType());
if (StructType *STy = dyn_cast<StructType>(Ty)) {
- uint64_t FragmentOffset = 0;
unsigned NumElements = STy->getNumElements();
NewGlobals.reserve(NumElements);
const StructLayout &Layout = *DL.getStructLayout(STy);
@@ -509,10 +525,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
NGV->setAlignment(NewAlign);
// Copy over the debug info for the variable.
- FragmentOffset = alignTo(FragmentOffset, NewAlign);
- uint64_t Size = DL.getTypeSizeInBits(NGV->getValueType());
- transferSRADebugInfo(GV, NGV, FragmentOffset, Size, NumElements);
- FragmentOffset += Size;
+ uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
+ uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(i);
+ transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, NumElements);
}
} else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
unsigned NumElements = STy->getNumElements();
@@ -522,7 +537,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
auto ElTy = STy->getElementType();
uint64_t EltSize = DL.getTypeAllocSize(ElTy);
unsigned EltAlign = DL.getABITypeAlignment(ElTy);
- uint64_t FragmentSizeInBits = DL.getTypeSizeInBits(ElTy);
+ uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
for (unsigned i = 0, e = NumElements; i != e; ++i) {
Constant *In = Init->getAggregateElement(i);
assert(In && "Couldn't get element of initializer?");
@@ -551,7 +566,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
if (NewGlobals.empty())
return nullptr;
- DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
@@ -621,7 +636,13 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
/// reprocessing them.
static bool AllUsesOfValueWillTrapIfNull(const Value *V,
SmallPtrSetImpl<const PHINode*> &PHIs) {
- for (const User *U : V->users())
+ for (const User *U : V->users()) {
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ // If null pointer is considered valid, then all uses are non-trapping.
+ // Non address-space 0 globals have already been pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
+ }
if (isa<LoadInst>(U)) {
// Will trap.
} else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
@@ -655,7 +676,7 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
//cerr << "NONTRAPPING USE: " << *U;
return false;
}
-
+ }
return true;
}
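The new early return restricts the "all uses trap if null" reasoning to places where a null dereference is actually guaranteed to trap; if null is a defined address for the enclosing function (for example in a non-zero address space, or when null-pointer checks must be preserved), every use is treated as non-trapping. A rough standalone sketch of that guard; the predicate name and inputs are assumptions, not the signature of LLVM's NullPointerIsDefined:

// Hypothetical stand-in for the guard used above: dereferencing null only
// "traps" when the pointer is in address space 0 and the surrounding code
// has not opted into treating null as a valid address.
bool nullPointerIsDefinedStub(bool NullIsValidForFunction, unsigned AddrSpace) {
  return NullIsValidForFunction || AddrSpace != 0;
}

// If null is defined, a use of a possibly-null value cannot be assumed to
// trap, so the caller must give up on this optimization.
bool mayAssumeUseTraps(bool NullIsValidForFunction, unsigned AddrSpace) {
  return !nullPointerIsDefinedStub(NullIsValidForFunction, AddrSpace);
}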
@@ -682,6 +703,10 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
bool Changed = false;
for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
Instruction *I = cast<Instruction>(*UI++);
+ // Uses are non-trapping if null pointer is considered valid.
+ // Non address-space 0 globals are already pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
LI->setOperand(0, NewV);
Changed = true;
@@ -783,7 +808,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
}
if (Changed) {
- DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
+ << "\n");
++NumGlobUses;
}
@@ -797,7 +823,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
CleanupConstantGlobalUsers(GV, nullptr, DL, TLI);
}
if (GV->use_empty()) {
- DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
+ LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
Changed = true;
GV->eraseFromParent();
++NumDeleted;
@@ -833,7 +859,8 @@ static GlobalVariable *
OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
ConstantInt *NElements, const DataLayout &DL,
TargetLibraryInfo *TLI) {
- DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n');
+ LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
+ << '\n');
Type *GlobalType;
if (NElements->getZExtValue() == 1)
@@ -1269,7 +1296,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
Value *NElems, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
+ LLVM_DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI
+ << '\n');
Type *MAT = getMallocAllocatedType(CI, TLI);
StructType *STy = cast<StructType>(MAT);
@@ -1566,7 +1594,10 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
// users of the loaded value (often calls and loads) that would trap if the
// value was null.
if (GV->getInitializer()->getType()->isPointerTy() &&
- GV->getInitializer()->isNullValue()) {
+ GV->getInitializer()->isNullValue() &&
+ !NullPointerIsDefined(
+ nullptr /* F */,
+ GV->getInitializer()->getType()->getPointerAddressSpace())) {
if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
if (GV->getInitializer()->getType() != SOVC->getType())
SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
@@ -1608,7 +1639,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
return false;
- DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
// Create the new global, initializing it to false.
GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
@@ -1652,15 +1683,11 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
// val * (ValOther - ValInit) + ValInit:
// DW_OP_deref DW_OP_constu <ValMinus>
// DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
- E = DIExpression::get(NewGV->getContext(),
- {dwarf::DW_OP_deref,
- dwarf::DW_OP_constu,
- ValMinus,
- dwarf::DW_OP_mul,
- dwarf::DW_OP_constu,
- ValInit,
- dwarf::DW_OP_plus,
- dwarf::DW_OP_stack_value});
+ SmallVector<uint64_t, 12> Ops = {
+ dwarf::DW_OP_deref, dwarf::DW_OP_constu, ValMinus,
+ dwarf::DW_OP_mul, dwarf::DW_OP_constu, ValInit,
+ dwarf::DW_OP_plus};
+ E = DIExpression::prependOpcodes(E, Ops, DIExpression::WithStackValue);
DIGlobalVariableExpression *DGVE =
DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
NewGV->addDebugInfo(DGVE);
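The prepended opcodes implement the recovery formula named in the comment, val * (ValOther - ValInit) + ValInit, so a debugger can print the original integer from the i1 shadow global. A worked example with hypothetical values 7 and 42:

#include <cassert>
#include <cstdint>

// The salvaged debug value for the boolean-shrunk global is
//   bool * (ValOther - ValInit) + ValInit
// which is what DW_OP_deref DW_OP_constu <ValMinus> DW_OP_mul
// DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value computes.
uint64_t recoverOriginal(bool Stored, uint64_t ValInit, uint64_t ValOther) {
  uint64_t ValMinus = ValOther - ValInit;
  return Stored * ValMinus + ValInit;
}

int main() {
  // Hypothetical global initialized to 7 and only ever overwritten with 42.
  assert(recoverOriginal(false, 7, 42) == 7);  // i1 false -> initial value
  assert(recoverOriginal(true, 7, 42) == 42);  // i1 true  -> stored value
}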
@@ -1732,8 +1759,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
return true;
}
-static bool deleteIfDead(GlobalValue &GV,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+static bool deleteIfDead(
+ GlobalValue &GV, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
GV.removeDeadConstantUsers();
if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
@@ -1751,7 +1778,7 @@ static bool deleteIfDead(GlobalValue &GV,
if (!Dead)
return false;
- DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
+ LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
GV.eraseFromParent();
++NumDeleted;
return true;
@@ -1917,7 +1944,7 @@ static bool processInternalGlobal(
LookupDomTree)) {
const DataLayout &DL = GV->getParent()->getDataLayout();
- DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
->getEntryBlock().begin());
Type *ElemTy = GV->getValueType();
@@ -1938,7 +1965,7 @@ static bool processInternalGlobal(
// If the global is never loaded (but may be stored to), it is dead.
// Delete it now.
if (!GS.IsLoaded) {
- DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
bool Changed;
if (isLeakCheckerRoot(GV)) {
@@ -1960,7 +1987,7 @@ static bool processInternalGlobal(
}
if (GS.StoredType <= GlobalStatus::InitializerStored) {
- DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
GV->setConstant(true);
// Clean up any obviously simplifiable users now.
@@ -1968,8 +1995,8 @@ static bool processInternalGlobal(
// If the global is dead now, just nuke it.
if (GV->use_empty()) {
- DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
- << "all users and delete global!\n");
+ LLVM_DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
+ << "all users and delete global!\n");
GV->eraseFromParent();
++NumDeleted;
return true;
@@ -1997,8 +2024,8 @@ static bool processInternalGlobal(
CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI);
if (GV->use_empty()) {
- DEBUG(dbgs() << " *** Substituting initializer allowed us to "
- << "simplify all users and delete global!\n");
+ LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
+ << "simplify all users and delete global!\n");
GV->eraseFromParent();
++NumDeleted;
}
@@ -2097,7 +2124,7 @@ static void RemoveNestAttribute(Function *F) {
/// idea here is that we don't want to mess with the convention if the user
/// explicitly requested something with performance implications like coldcc,
/// GHC, or anyregcc.
-static bool isProfitableToMakeFastCC(Function *F) {
+static bool hasChangeableCC(Function *F) {
CallingConv::ID CC = F->getCallingConv();
// FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
@@ -2126,14 +2153,113 @@ static bool isProfitableToMakeFastCC(Function *F) {
return true;
}
+/// Return true if the block containing the call site has a BlockFrequency of
+/// less than ColdCCRelFreq% of the entry block.
+static bool isColdCallSite(CallSite CS, BlockFrequencyInfo &CallerBFI) {
+ const BranchProbability ColdProb(ColdCCRelFreq, 100);
+ auto CallSiteBB = CS.getInstruction()->getParent();
+ auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
+ auto CallerEntryFreq =
+ CallerBFI.getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+ return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
+
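The comparison above says a call site is cold when its block frequency is below ColdCCRelFreq percent of the caller's entry frequency (expressed as BranchProbability(ColdCCRelFreq, 100)). A self-contained numeric sketch of the same test, ignoring BranchProbability's scaling details:

#include <cstdint>
#include <cstdio>

// Cold when CallSiteFreq < CallerEntryFreq * (ColdCCRelFreq / 100), written
// with integer cross-multiplication to avoid floating point.
bool isColdCallSiteFreq(uint64_t CallSiteFreq, uint64_t CallerEntryFreq,
                        unsigned ColdCCRelFreq = 2) {
  return CallSiteFreq * 100 < CallerEntryFreq * ColdCCRelFreq;
}

int main() {
  // With the default -coldcc-rel-freq=2, an entry frequency of 1000 puts the
  // cold threshold at 20.
  std::printf("%d\n", isColdCallSiteFreq(5, 1000));    // 1: cold
  std::printf("%d\n", isColdCallSiteFreq(200, 1000));  // 0: not cold
}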
+// This function checks if the input function F is cold at all call sites. It
+// also looks at each call site's containing function, returning false if the
+// caller function contains other non-cold calls. The input vector AllCallsCold
+// contains a list of functions that only have call sites in cold blocks.
+static bool
+isValidCandidateForColdCC(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ const std::vector<Function *> &AllCallsCold) {
+
+ if (F.user_empty())
+ return false;
+
+ for (User *U : F.users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+
+ CallSite CS(cast<Instruction>(U));
+ Function *CallerFunc = CS.getInstruction()->getParent()->getParent();
+ BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
+ if (!isColdCallSite(CS, CallerBFI))
+ return false;
+ auto It = std::find(AllCallsCold.begin(), AllCallsCold.end(), CallerFunc);
+ if (It == AllCallsCold.end())
+ return false;
+ }
+ return true;
+}
+
+static void changeCallSitesToColdCC(Function *F) {
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallSite CS(cast<Instruction>(U));
+ CS.setCallingConv(CallingConv::Cold);
+ }
+}
+
+// This function iterates over all the call instructions in the input Function
+// and checks that all call sites are in cold blocks and are allowed to use the
+// coldcc calling convention.
+static bool
+hasOnlyColdCalls(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ CallSite CS(cast<Instruction>(CI));
+        // Skip over inline asm instructions since they aren't function calls.
+ if (CI->isInlineAsm())
+ continue;
+ Function *CalledFn = CI->getCalledFunction();
+ if (!CalledFn)
+ return false;
+ if (!CalledFn->hasLocalLinkage())
+ return false;
+        // Skip over intrinsics since they won't remain as function calls.
+ if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
+ continue;
+ // Check if it's valid to use coldcc calling convention.
+ if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
+ CalledFn->hasAddressTaken())
+ return false;
+ BlockFrequencyInfo &CallerBFI = GetBFI(F);
+ if (!isColdCallSite(CS, CallerBFI))
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
static bool
OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+
bool Changed = false;
+
+ std::vector<Function *> AllCallsCold;
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
+ Function *F = &*FI++;
+ if (hasOnlyColdCalls(*F, GetBFI))
+ AllCallsCold.push_back(F);
+ }
+
// Optimize functions.
for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
Function *F = &*FI++;
+
+ // Don't perform global opt pass on naked functions; we don't want fast
+ // calling conventions for naked functions.
+ if (F->hasFnAttribute(Attribute::Naked))
+ continue;
+
// Functions without names cannot be referenced outside this module.
if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
F->setLinkage(GlobalValue::InternalLinkage);
@@ -2165,7 +2291,25 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
if (!F->hasLocalLinkage())
continue;
- if (isProfitableToMakeFastCC(F) && !F->isVarArg() &&
+
+ if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
+ NumInternalFunc++;
+ TargetTransformInfo &TTI = GetTTI(*F);
+ // Change the calling convention to coldcc if either stress testing is
+ // enabled or the target would like to use coldcc on functions which are
+ // cold at all call sites and the callers contain no other non coldcc
+ // calls.
+ if (EnableColdCCStressTest ||
+ (isValidCandidateForColdCC(*F, GetBFI, AllCallsCold) &&
+ TTI.useColdCCForColdCall(*F))) {
+ F->setCallingConv(CallingConv::Cold);
+ changeCallSitesToColdCC(F);
+ Changed = true;
+ NumColdCC++;
+ }
+ }
+
+ if (hasChangeableCC(F) && !F->isVarArg() &&
!F->hasAddressTaken()) {
// If this function has a calling convention worth changing, is not a
// varargs function, and is only called directly, promote it to use the
@@ -2191,7 +2335,7 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
static bool
OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI,
function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
bool Changed = false;
for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
@@ -2277,6 +2421,131 @@ static void CommitValueTo(Constant *Val, Constant *Addr) {
GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
}
+/// Given a map of address -> value, where addresses are expected to be some form
+/// of either a global or a constant GEP, set the initializer for the address to
+/// be the value. This performs mostly the same function as CommitValueTo()
+/// and EvaluateStoreInto() but is optimized to be more efficient for the common
+/// case where the set of addresses are GEPs sharing the same underlying global,
+/// processing the GEPs in batches rather than individually.
+///
+/// To give an example, consider the following C++ code adapted from the clang
+/// regression tests:
+/// struct S {
+/// int n = 10;
+/// int m = 2 * n;
+/// S(int a) : n(a) {}
+/// };
+///
+/// template<typename T>
+/// struct U {
+/// T *r = &q;
+/// T q = 42;
+/// U *p = this;
+/// };
+///
+/// U<S> e;
+///
+/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
+/// the outer struct, while also initializing the inner 'q' struct's 'n' and
+/// 'm' members. This batch algorithm will simply use the general
+/// CommitValueTo() method to handle the complex nested S struct initialization
+/// of 'q', before processing the outermost members in a single batch. Using
+/// CommitValueTo() to handle members in the outer struct is inefficient when
+/// the struct/array is very large, as we end up creating and destroying
+/// constant arrays for each initialization.
+/// For the above case, we expect the following IR to be generated:
+///
+/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
+/// %struct.S = type { i32, i32 }
+/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
+/// i64 0, i32 1),
+/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
+/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
+/// constant expression, while the other two elements of @e are "simple".
+static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
+ SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
+ SimpleCEs.reserve(Mem.size());
+
+ for (const auto &I : Mem) {
+ if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
+ GVs.push_back(std::make_pair(GV, I.second));
+ } else {
+ ConstantExpr *GEP = cast<ConstantExpr>(I.first);
+ // We don't handle the deeply recursive case using the batch method.
+ if (GEP->getNumOperands() > 3)
+ ComplexCEs.push_back(std::make_pair(GEP, I.second));
+ else
+ SimpleCEs.push_back(std::make_pair(GEP, I.second));
+ }
+ }
+
+ // The algorithm below doesn't handle cases like nested structs, so use the
+ // slower fully general method if we have to.
+ for (auto ComplexCE : ComplexCEs)
+ CommitValueTo(ComplexCE.second, ComplexCE.first);
+
+ for (auto GVPair : GVs) {
+ assert(GVPair.first->hasInitializer());
+ GVPair.first->setInitializer(GVPair.second);
+ }
+
+ if (SimpleCEs.empty())
+ return;
+
+ // We cache a single global's initializer elements in the case where the
+ // subsequent address/val pair uses the same one. This avoids throwing away and
+ // rebuilding the constant struct/vector/array just because one element is
+ // modified at a time.
+ SmallVector<Constant *, 32> Elts;
+ Elts.reserve(SimpleCEs.size());
+ GlobalVariable *CurrentGV = nullptr;
+
+ auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ if (Update) {
+ if (CurrentGV) {
+ assert(CurrentGV && "Expected a GV to commit to!");
+ Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
+ // We have a valid cache that needs to be committed.
+ if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
+ else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
+ else
+ CurrentGV->setInitializer(ConstantVector::get(Elts));
+ }
+ if (CurrentGV == GV)
+ return;
+ // Need to clear and set up cache for new initializer.
+ CurrentGV = GV;
+ Elts.clear();
+ unsigned NumElts;
+ if (auto *STy = dyn_cast<StructType>(Ty))
+ NumElts = STy->getNumElements();
+ else
+ NumElts = cast<SequentialType>(Ty)->getNumElements();
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+ }
+ };
+
+ for (auto CEPair : SimpleCEs) {
+ ConstantExpr *GEP = CEPair.first;
+ Constant *Val = CEPair.second;
+
+ GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
+ commitAndSetupCache(GV, GV != CurrentGV);
+ ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
+ Elts[CI->getZExtValue()] = Val;
+ }
+ // The last initializer in the list needs to be committed, others
+ // will be committed on a new initializer being processed.
+ commitAndSetupCache(CurrentGV, true);
+}
+
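The caching scheme above boils down to: keep the element list of one global mutable while consecutive stores target it, and rebuild that global's aggregate initializer only when the stream moves on to a different global (or at the end). A self-contained sketch with simplified stand-in types, no LLVM constants involved:

#include <cstddef>
#include <string>
#include <vector>

// Stand-in for a global with an aggregate initializer; names are illustrative.
struct FakeGlobal {
  std::string Name;
  std::vector<int> Init;   // aggregate initializer elements
};

struct Store { FakeGlobal *GV; size_t Index; int Val; };

void batchCommit(const std::vector<Store> &Stores) {
  FakeGlobal *CurrentGV = nullptr;
  std::vector<int> Elts;

  auto flush = [&] {
    if (CurrentGV)
      CurrentGV->Init = Elts;   // rebuild the aggregate once per global
  };

  for (const Store &S : Stores) {
    if (S.GV != CurrentGV) {
      flush();
      CurrentGV = S.GV;
      Elts = CurrentGV->Init;   // load the cache for the new global
    }
    Elts[S.Index] = S.Val;      // cheap element update, no rebuild
  }
  flush();                      // commit the final cached global
}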
/// Evaluate static constructors in the function, if we can. Return true if we
/// can, false otherwise.
static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
@@ -2291,11 +2560,10 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
++NumCtorsEvaluated;
// We succeeded at evaluation: commit the result.
- DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
- << F->getName() << "' to " << Eval.getMutatedMemory().size()
- << " stores.\n");
- for (const auto &I : Eval.getMutatedMemory())
- CommitValueTo(I.second, I.first);
+ LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+ << F->getName() << "' to "
+ << Eval.getMutatedMemory().size() << " stores.\n");
+ BatchCommitValueTo(Eval.getMutatedMemory());
for (GlobalVariable *GV : Eval.getInvariants())
GV->setConstant(true);
}
@@ -2310,7 +2578,7 @@ static int compareNames(Constant *const *A, Constant *const *B) {
}
static void setUsedInitializer(GlobalVariable &V,
- const SmallPtrSet<GlobalValue *, 8> &Init) {
+ const SmallPtrSetImpl<GlobalValue *> &Init) {
if (Init.empty()) {
V.eraseFromParent();
return;
@@ -2463,7 +2731,7 @@ static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
static bool
OptimizeGlobalAliases(Module &M,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
bool Changed = false;
LLVMUsed Used(M);
@@ -2483,7 +2751,7 @@ OptimizeGlobalAliases(Module &M,
continue;
}
- // If the aliasee may change at link time, nothing can be done - bail out.
+ // If the alias can change at link time, nothing can be done - bail out.
if (J->isInterposable())
continue;
@@ -2509,6 +2777,7 @@ OptimizeGlobalAliases(Module &M,
// Give the aliasee the name, linkage and other attributes of the alias.
Target->takeName(&*J);
Target->setLinkage(J->getLinkage());
+ Target->setDSOLocal(J->isDSOLocal());
Target->setVisibility(J->getVisibility());
Target->setDLLStorageClass(J->getDLLStorageClass());
@@ -2642,8 +2911,10 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
static bool optimizeGlobalsInModule(
Module &M, const DataLayout &DL, TargetLibraryInfo *TLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
function_ref<DominatorTree &(Function &)> LookupDomTree) {
- SmallSet<const Comdat *, 8> NotDiscardableComdats;
+ SmallPtrSet<const Comdat *, 8> NotDiscardableComdats;
bool Changed = false;
bool LocalChange = true;
while (LocalChange) {
@@ -2664,8 +2935,8 @@ static bool optimizeGlobalsInModule(
NotDiscardableComdats.insert(C);
// Delete functions that are trivially dead, ccc -> fastcc
- LocalChange |=
- OptimizeFunctions(M, TLI, LookupDomTree, NotDiscardableComdats);
+ LocalChange |= OptimizeFunctions(M, TLI, GetTTI, GetBFI, LookupDomTree,
+ NotDiscardableComdats);
// Optimize global_ctors list.
LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
@@ -2702,7 +2973,15 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
return FAM.getResult<DominatorTreeAnalysis>(F);
};
- if (!optimizeGlobalsInModule(M, DL, &TLI, LookupDomTree))
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!optimizeGlobalsInModule(M, DL, &TLI, GetTTI, GetBFI, LookupDomTree))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
@@ -2725,12 +3004,22 @@ struct GlobalOptLegacyPass : public ModulePass {
auto LookupDomTree = [this](Function &F) -> DominatorTree & {
return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
};
- return optimizeGlobalsInModule(M, DL, TLI, LookupDomTree);
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
+ return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ return optimizeGlobalsInModule(M, DL, TLI, GetTTI, GetBFI, LookupDomTree);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
}
};
@@ -2741,6 +3030,8 @@ char GlobalOptLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
"Global Variable Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
"Global Variable Optimizer", false, false)
diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
index d5d35ee89e0e..dce9ee076bc5 100644
--- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
@@ -40,7 +40,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeInferFunctionAttrsLegacyPassPass(Registry);
initializeInternalizeLegacyPassPass(Registry);
initializeLoopExtractorPass(Registry);
- initializeBlockExtractorPassPass(Registry);
+ initializeBlockExtractorPass(Registry);
initializeSingleLoopExtractorPass(Registry);
initializeLowerTypeTestsPass(Registry);
initializeMergeFunctionsPass(Registry);
@@ -48,6 +48,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializePostOrderFunctionAttrsLegacyPassPass(Registry);
initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
initializePruneEHPass(Registry);
+ initializeIPSCCPLegacyPassPass(Registry);
initializeStripDeadPrototypesLegacyPassPass(Registry);
initializeStripSymbolsPass(Registry);
initializeStripDebugDeclarePass(Registry);
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
index b259a0abd63c..82bba1e5c93b 100644
--- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
namespace {
-/// \brief Actual inliner pass implementation.
+/// Actual inliner pass implementation.
///
/// The common implementation of the inlining logic is shared between this
/// inliner pass and the always inliner pass. The two passes use different cost
diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
index 4449c87ddefa..3da0c2e83eb8 100644
--- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -35,6 +35,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -59,7 +60,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <cassert>
@@ -208,8 +208,8 @@ static void mergeInlinedArrayAllocas(
// Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
// success!
- DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
- << "\n\t\tINTO: " << *AvailableAlloca << '\n');
+ LLVM_DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
+ << "\n\t\tINTO: " << *AvailableAlloca << '\n');
// Move affected dbg.declare calls immediately after the new alloca to
// avoid the situation when a dbg.declare precedes its alloca.
@@ -379,14 +379,14 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
Function *Caller = CS.getCaller();
if (IC.isAlways()) {
- DEBUG(dbgs() << " Inlining: cost=always"
- << ", Call: " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << " Inlining: cost=always"
+ << ", Call: " << *CS.getInstruction() << "\n");
return IC;
}
if (IC.isNever()) {
- DEBUG(dbgs() << " NOT Inlining: cost=never"
- << ", Call: " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << " NOT Inlining: cost=never"
+ << ", Call: " << *CS.getInstruction() << "\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
<< NV("Callee", Callee) << " not inlined into "
@@ -397,9 +397,9 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
}
if (!IC) {
- DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
- << ", thres=" << IC.getThreshold()
- << ", Call: " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
+ << ", thres=" << IC.getThreshold()
+ << ", Call: " << *CS.getInstruction() << "\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
<< NV("Callee", Callee) << " not inlined into "
@@ -412,9 +412,9 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
int TotalSecondaryCost = 0;
if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost, GetInlineCost)) {
- DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
- << " Cost = " << IC.getCost()
- << ", outer Cost = " << TotalSecondaryCost << '\n');
+ LLVM_DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
+ << " Cost = " << IC.getCost()
+ << ", outer Cost = " << TotalSecondaryCost << '\n');
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
Call)
@@ -428,9 +428,9 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
return None;
}
- DEBUG(dbgs() << " Inlining: cost=" << IC.getCost()
- << ", thres=" << IC.getThreshold()
- << ", Call: " << *CS.getInstruction() << '\n');
+ LLVM_DEBUG(dbgs() << " Inlining: cost=" << IC.getCost()
+ << ", thres=" << IC.getThreshold()
+ << ", Call: " << *CS.getInstruction() << '\n');
return IC;
}
@@ -470,12 +470,12 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
function_ref<AAResults &(Function &)> AARGetter,
ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
SmallPtrSet<Function *, 8> SCCFunctions;
- DEBUG(dbgs() << "Inliner visiting SCC:");
+ LLVM_DEBUG(dbgs() << "Inliner visiting SCC:");
for (CallGraphNode *Node : SCC) {
Function *F = Node->getFunction();
if (F)
SCCFunctions.insert(F);
- DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
+ LLVM_DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
}
// Scan through and identify all call sites ahead of time so that we only
@@ -524,7 +524,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
}
}
- DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
+ LLVM_DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
// If there are no calls in this function, exit early.
if (CallSites.empty())
@@ -593,7 +593,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// size. This happens because IPSCCP propagates the result out of the
// call and then we're left with the dead call.
if (IsTriviallyDead) {
- DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
+ LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
// Update the call graph by deleting the edge from Callee to Caller.
CG[Caller]->removeCallEdgeFor(CS);
Instr->eraseFromParent();
@@ -657,8 +657,8 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// callgraph references to the node, we cannot delete it yet, this
// could invalidate the CGSCC iterator.
CG[Callee]->getNumReferences() == 0) {
- DEBUG(dbgs() << " -> Deleting dead function: " << Callee->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " -> Deleting dead function: "
+ << Callee->getName() << "\n");
CallGraphNode *CalleeNode = CG[Callee];
// Remove any call graph edges from the callee to its callees.
@@ -793,6 +793,14 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
return true;
}
+InlinerPass::~InlinerPass() {
+ if (ImportedFunctionsStats) {
+ assert(InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No);
+ ImportedFunctionsStats->dump(InlinerFunctionImportStats ==
+ InlinerFunctionImportStatsOpts::Verbose);
+ }
+}
+
PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
CGSCCAnalysisManager &AM, LazyCallGraph &CG,
CGSCCUpdateResult &UR) {
@@ -804,6 +812,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
Module &M = *InitialC.begin()->getFunction().getParent();
ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+ if (!ImportedFunctionsStats &&
+ InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) {
+ ImportedFunctionsStats =
+ llvm::make_unique<ImportedFunctionsInliningStatistics>();
+ ImportedFunctionsStats->setModuleInfo(M);
+ }
+
// We use a single common worklist for calls across the entire SCC. We
// process these in-order and append new calls introduced during inlining to
// the end.
@@ -830,8 +845,14 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
  // incrementally making a single function grow in a super-linear fashion.
SmallVector<std::pair<CallSite, int>, 16> Calls;
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+ .getManager();
+
// Populate the initial list of calls in this SCC.
for (auto &N : InitialC) {
+ auto &ORE =
+ FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
// We want to generally process call sites top-down in order for
// simplifications stemming from replacing the call with the returned value
// after inlining to be visible to subsequent inlining decisions.
@@ -839,9 +860,20 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Instead we should do an actual RPO walk of the function body.
for (Instruction &I : instructions(N.getFunction()))
if (auto CS = CallSite(&I))
- if (Function *Callee = CS.getCalledFunction())
+ if (Function *Callee = CS.getCalledFunction()) {
if (!Callee->isDeclaration())
Calls.push_back({CS, -1});
+ else if (!isa<IntrinsicInst>(I)) {
+ using namespace ore;
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CS.getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose();
+ });
+ }
+ }
}
if (Calls.empty())
return PreservedAnalyses::all();
@@ -879,7 +911,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (F.hasFnAttribute(Attribute::OptimizeNone))
continue;
- DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
// Get a FunctionAnalysisManager via a proxy for this particular node. We
// do this each time we visit a node as the SCC may have changed and as
@@ -931,9 +963,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// and thus hidden from the full inline history.
if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
UR.InlinedInternalEdges.count({&N, C})) {
- DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
- "previously split out of this SCC by inlining: "
- << F.getName() << " -> " << Callee.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+ "previously split out of this SCC by inlining: "
+ << F.getName() << " -> " << Callee.getName() << "\n");
continue;
}
@@ -992,6 +1024,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
Calls.push_back({CS, NewHistoryID});
}
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats->recordInline(F, Callee);
+
// Merge the attributes based on the inlining.
AttributeFuncs::mergeAttributesForInlining(F, Callee);
@@ -1052,7 +1087,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// change.
LazyCallGraph::SCC *OldC = C;
C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
- DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
RC = &C->getOuterRefSCC();
// If this causes an SCC to split apart into multiple smaller SCCs, there
@@ -1070,8 +1105,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
})) {
- DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
- "retaining this to avoid infinite inlining.\n");
+ LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+ "retaining this to avoid infinite inlining.\n");
UR.InlinedInternalEdges.insert({&N, OldC});
}
InlinedCallees.clear();
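
Most of the churn in Inliner.cpp above is the mechanical DEBUG -> LLVM_DEBUG rename. A minimal sketch of how the renamed macro is used (the helper function below is illustrative, not part of the patch):

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "inline"

static void traceCallSiteCount(unsigned NumCallSites) {
  // Compiled out unless assertions are enabled; printed only when opt is run
  // with -debug or -debug-only=inline.
  LLVM_DEBUG(llvm::dbgs() << "found " << NumCallSites << " call sites\n");
}
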
diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
index 26db1465bb26..a6542d28dfd8 100644
--- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -192,7 +192,7 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
++NumFunctions;
- DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
}
// Never internalize the llvm.used symbol. It is used to implement
@@ -221,7 +221,7 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
Changed = true;
++NumGlobals;
- DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
}
// Mark all aliases that are not in the api as internal as well.
@@ -231,7 +231,7 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
Changed = true;
++NumAliases;
- DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
}
return Changed;
diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
index 36b6bdba2cd0..8c86f7cb806a 100644
--- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include <fstream>
@@ -158,155 +159,3 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
Pass *llvm::createSingleLoopExtractorPass() {
return new SingleLoopExtractor();
}
-
-
-// BlockFile - A file which contains a list of blocks that should not be
-// extracted.
-static cl::opt<std::string>
-BlockFile("extract-blocks-file", cl::value_desc("filename"),
- cl::desc("A file containing list of basic blocks to not extract"),
- cl::Hidden);
-
-namespace {
- /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
- /// from the module into their own functions except for those specified by the
- /// BlocksToNotExtract list.
- class BlockExtractorPass : public ModulePass {
- void LoadFile(const char *Filename);
- void SplitLandingPadPreds(Function *F);
-
- std::vector<BasicBlock*> BlocksToNotExtract;
- std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
- public:
- static char ID; // Pass identification, replacement for typeid
- BlockExtractorPass() : ModulePass(ID) {
- if (!BlockFile.empty())
- LoadFile(BlockFile.c_str());
- }
-
- bool runOnModule(Module &M) override;
- };
-}
-
-char BlockExtractorPass::ID = 0;
-INITIALIZE_PASS(BlockExtractorPass, "extract-blocks",
- "Extract Basic Blocks From Module (for bugpoint use)",
- false, false)
-
-// createBlockExtractorPass - This pass extracts all blocks (except those
-// specified in the argument list) from the functions in the module.
-//
-ModulePass *llvm::createBlockExtractorPass() {
- return new BlockExtractorPass();
-}
-
-void BlockExtractorPass::LoadFile(const char *Filename) {
- // Load the BlockFile...
- std::ifstream In(Filename);
- if (!In.good()) {
- errs() << "WARNING: BlockExtractor couldn't load file '" << Filename
- << "'!\n";
- return;
- }
- while (In) {
- std::string FunctionName, BlockName;
- In >> FunctionName;
- In >> BlockName;
- if (!BlockName.empty())
- BlocksToNotExtractByName.push_back(
- std::make_pair(FunctionName, BlockName));
- }
-}
-
-/// SplitLandingPadPreds - The landing pad needs to be extracted with the invoke
-/// instruction. The critical edge breaker will refuse to break critical edges
-/// to a landing pad. So do them here. After this method runs, all landing pads
-/// should have only one predecessor.
-void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
- for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
- InvokeInst *II = dyn_cast<InvokeInst>(I);
- if (!II) continue;
- BasicBlock *Parent = II->getParent();
- BasicBlock *LPad = II->getUnwindDest();
-
- // Look through the landing pad's predecessors. If one of them ends in an
- // 'invoke', then we want to split the landing pad.
- bool Split = false;
- for (pred_iterator
- PI = pred_begin(LPad), PE = pred_end(LPad); PI != PE; ++PI) {
- BasicBlock *BB = *PI;
- if (BB->isLandingPad() && BB != Parent &&
- isa<InvokeInst>(Parent->getTerminator())) {
- Split = true;
- break;
- }
- }
-
- if (!Split) continue;
-
- SmallVector<BasicBlock*, 2> NewBBs;
- SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
- }
-}
-
-bool BlockExtractorPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- std::set<BasicBlock*> TranslatedBlocksToNotExtract;
- for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
- BasicBlock *BB = BlocksToNotExtract[i];
- Function *F = BB->getParent();
-
- // Map the corresponding function in this module.
- Function *MF = M.getFunction(F->getName());
- assert(MF->getFunctionType() == F->getFunctionType() && "Wrong function?");
-
- // Figure out which index the basic block is in its function.
- Function::iterator BBI = MF->begin();
- std::advance(BBI, std::distance(F->begin(), Function::iterator(BB)));
- TranslatedBlocksToNotExtract.insert(&*BBI);
- }
-
- while (!BlocksToNotExtractByName.empty()) {
- // There's no way to find BBs by name without looking at every BB inside
- // every Function. Fortunately, this is always empty except when used by
- // bugpoint in which case correctness is more important than performance.
-
- std::string &FuncName = BlocksToNotExtractByName.back().first;
- std::string &BlockName = BlocksToNotExtractByName.back().second;
-
- for (Function &F : M) {
- if (F.getName() != FuncName) continue;
-
- for (BasicBlock &BB : F) {
- if (BB.getName() != BlockName) continue;
-
- TranslatedBlocksToNotExtract.insert(&BB);
- }
- }
-
- BlocksToNotExtractByName.pop_back();
- }
-
- // Now that we know which blocks to not extract, figure out which ones we WANT
- // to extract.
- std::vector<BasicBlock*> BlocksToExtract;
- for (Function &F : M) {
- SplitLandingPadPreds(&F);
- for (BasicBlock &BB : F)
- if (!TranslatedBlocksToNotExtract.count(&BB))
- BlocksToExtract.push_back(&BB);
- }
-
- for (BasicBlock *BlockToExtract : BlocksToExtract) {
- SmallVector<BasicBlock*, 2> BlocksToExtractVec;
- BlocksToExtractVec.push_back(BlockToExtract);
- if (const InvokeInst *II =
- dyn_cast<InvokeInst>(BlockToExtract->getTerminator()))
- BlocksToExtractVec.push_back(II->getUnwindDest());
- CodeExtractor(BlocksToExtractVec).extractCodeRegion();
- }
-
- return !BlocksToExtract.empty();
-}
diff --git a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 8db7e1e142d2..4f7571884707 100644
--- a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -8,6 +8,8 @@
//===----------------------------------------------------------------------===//
//
// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
+// It also ensures that globals are properly laid out for the
+// llvm.icall.branch.funnel intrinsic.
// See http://llvm.org/docs/TypeMetadata.html for more information.
//
//===----------------------------------------------------------------------===//
@@ -25,6 +27,7 @@
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -291,6 +294,33 @@ public:
}
};
+struct ICallBranchFunnel final
+ : TrailingObjects<ICallBranchFunnel, GlobalTypeMember *> {
+ static ICallBranchFunnel *create(BumpPtrAllocator &Alloc, CallInst *CI,
+ ArrayRef<GlobalTypeMember *> Targets,
+ unsigned UniqueId) {
+ auto *Call = static_cast<ICallBranchFunnel *>(
+ Alloc.Allocate(totalSizeToAlloc<GlobalTypeMember *>(Targets.size()),
+ alignof(ICallBranchFunnel)));
+ Call->CI = CI;
+ Call->UniqueId = UniqueId;
+ Call->NTargets = Targets.size();
+ std::uninitialized_copy(Targets.begin(), Targets.end(),
+ Call->getTrailingObjects<GlobalTypeMember *>());
+ return Call;
+ }
+
+ CallInst *CI;
+ ArrayRef<GlobalTypeMember *> targets() const {
+ return makeArrayRef(getTrailingObjects<GlobalTypeMember *>(), NTargets);
+ }
+
+ unsigned UniqueId;
+
+private:
+ size_t NTargets;
+};
+
class LowerTypeTestsModule {
Module &M;
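
The new ICallBranchFunnel struct stores its target list inline after the object via TrailingObjects. A standalone sketch of that idiom (assumed, with int targets standing in for GlobalTypeMember *):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/TrailingObjects.h"
#include <memory>

struct Funnel final : llvm::TrailingObjects<Funnel, int> {
  size_t NumTargets;

  static Funnel *create(llvm::BumpPtrAllocator &Alloc,
                        llvm::ArrayRef<int> Targets) {
    // One allocation holds both the Funnel and its trailing target array.
    auto *F = static_cast<Funnel *>(Alloc.Allocate(
        totalSizeToAlloc<int>(Targets.size()), alignof(Funnel)));
    F->NumTargets = Targets.size();
    std::uninitialized_copy(Targets.begin(), Targets.end(),
                            F->getTrailingObjects<int>());
    return F;
  }

  llvm::ArrayRef<int> targets() const {
    return llvm::makeArrayRef(getTrailingObjects<int>(), NumTargets);
  }
};
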
@@ -372,6 +402,7 @@ class LowerTypeTestsModule {
const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
const TypeIdLowering &TIL);
+
void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalTypeMember *> Globals);
unsigned getJumpTableEntrySize();
@@ -383,19 +414,32 @@ class LowerTypeTestsModule {
void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalTypeMember *> Functions);
void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
+ ArrayRef<GlobalTypeMember *> Functions);
void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalTypeMember *> Functions);
- void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Globals);
+ void
+ buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels);
- void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT);
+ void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, bool IsDefinition);
void moveInitializerToModuleConstructor(GlobalVariable *GV);
void findGlobalVariableUsersOf(Constant *C,
SmallSetVector<GlobalVariable *, 8> &Out);
void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
+  /// replaceCfiUses - Go through Old's use list and make each use point to
+  /// New instead. Old's use list is expected to have at least one element.
+  /// Unlike replaceAllUsesWith, this function skips blockaddress and direct
+  /// call uses.
+ void replaceCfiUses(Function *Old, Value *New, bool IsDefinition);
+
+  /// replaceDirectCalls - Go through Old's use list and replace each use that
+  /// is a direct function call.
+ void replaceDirectCalls(Value *Old, Value *New);
+
public:
LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
const ModuleSummaryIndex *ImportSummary);
@@ -427,8 +471,6 @@ struct LowerTypeTests : public ModulePass {
}
bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
if (UseCommandLine)
return LowerTypeTestsModule::runForTesting(M);
return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
@@ -729,10 +771,12 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
// Compute the amount of padding required.
uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
- // Cap at 128 was found experimentally to have a good data/instruction
- // overhead tradeoff.
- if (Padding > 128)
- Padding = alignTo(InitSize, 128) - InitSize;
+  // Experiments with different caps, using Chromium on both x64 and ARM64,
+ // have shown that the 32-byte cap generates the smallest binary on
+ // both platforms while different caps yield similar performance.
+ // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html)
+ if (Padding > 32)
+ Padding = alignTo(InitSize, 32) - InitSize;
GlobalInits.push_back(
ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
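
A worked version of the new padding rule (a sketch, not part of the patch): pad a global up to the next power of two, but never spend more than it takes to reach the next 32-byte boundary.

#include "llvm/Support/MathExtras.h"
#include <cstdint>

uint64_t cfiGlobalPadding(uint64_t InitSize) {
  uint64_t Padding = llvm::NextPowerOf2(InitSize - 1) - InitSize;
  if (Padding > 32)
    Padding = llvm::alignTo(InitSize, 32) - InitSize;
  return Padding;
}
// cfiGlobalPadding(100) == 28 (pad all the way to 128), while
// cfiGlobalPadding(200) == 24 (pad to 224 rather than to 256).
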
@@ -936,14 +980,23 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
assert(F->getType()->getAddressSpace() == 0);
- // Declaration of a local function - nothing to do.
- if (F->isDeclarationForLinker() && isDefinition)
- return;
-
GlobalValue::VisibilityTypes Visibility = F->getVisibility();
std::string Name = F->getName();
- Function *FDecl;
+ if (F->isDeclarationForLinker() && isDefinition) {
+    // Non-dso_local functions may be overridden at run time; don't
+    // short-circuit them.
+ if (F->isDSOLocal()) {
+ Function *RealF = Function::Create(F->getFunctionType(),
+ GlobalValue::ExternalLinkage,
+ Name + ".cfi", &M);
+ RealF->setVisibility(GlobalVariable::HiddenVisibility);
+ replaceDirectCalls(F, RealF);
+ }
+ return;
+ }
+
+ Function *FDecl;
if (F->isDeclarationForLinker() && !isDefinition) {
// Declaration of an external function.
FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
@@ -952,10 +1005,25 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
} else if (isDefinition) {
F->setName(Name + ".cfi");
F->setLinkage(GlobalValue::ExternalLinkage);
- F->setVisibility(GlobalValue::HiddenVisibility);
FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
Name, &M);
FDecl->setVisibility(Visibility);
+ Visibility = GlobalValue::HiddenVisibility;
+
+    // Delete aliases pointing to this function; they'll be re-created in the
+    // merged output.
+ SmallVector<GlobalAlias*, 4> ToErase;
+ for (auto &U : F->uses()) {
+ if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
+ Function *AliasDecl = Function::Create(
+ F->getFunctionType(), GlobalValue::ExternalLinkage, "", &M);
+ AliasDecl->takeName(A);
+ A->replaceAllUsesWith(AliasDecl);
+ ToErase.push_back(A);
+ }
+ }
+ for (auto *A : ToErase)
+ A->eraseFromParent();
} else {
// Function definition without type metadata, where some other translation
// unit contained a declaration with type metadata. This normally happens
@@ -966,9 +1034,13 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
}
if (F->isWeakForLinker())
- replaceWeakDeclarationWithJumpTablePtr(F, FDecl);
+ replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isDefinition);
else
- F->replaceAllUsesWith(FDecl);
+ replaceCfiUses(F, FDecl, isDefinition);
+
+  // Set visibility late because it's used in replaceCfiUses() to determine
+  // whether uses need to be replaced.
+ F->setVisibility(Visibility);
}
void LowerTypeTestsModule::lowerTypeTestCalls(
@@ -980,7 +1052,7 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
for (Metadata *TypeId : TypeIds) {
// Build the bitset.
BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
- DEBUG({
+ LLVM_DEBUG({
if (auto MDS = dyn_cast<MDString>(TypeId))
dbgs() << MDS->getString() << ": ";
else
@@ -1150,7 +1222,7 @@ void LowerTypeTestsModule::findGlobalVariableUsersOf(
// Replace all uses of F with (F ? JT : 0).
void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
- Function *F, Constant *JT) {
+ Function *F, Constant *JT, bool IsDefinition) {
// The target expression can not appear in a constant initializer on most
// (all?) targets. Switch to a runtime initializer.
SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
@@ -1163,7 +1235,7 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
Function *PlaceholderFn =
Function::Create(cast<FunctionType>(F->getValueType()),
GlobalValue::ExternalWeakLinkage, "", &M);
- F->replaceAllUsesWith(PlaceholderFn);
+ replaceCfiUses(F, PlaceholderFn, IsDefinition);
Constant *Target = ConstantExpr::getSelect(
ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
@@ -1226,12 +1298,6 @@ void LowerTypeTestsModule::createJumpTable(
createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs,
cast<Function>(Functions[I]->getGlobal()));
- // Try to emit the jump table at the end of the text segment.
- // Jump table must come after __cfi_check in the cross-dso mode.
- // FIXME: this magic section name seems to do the trick.
- F->setSection(ObjectFormat == Triple::MachO
- ? "__TEXT,__text,regular,pure_instructions"
- : ".text.cfi");
// Align the whole table by entry size.
F->setAlignment(getJumpTableEntrySize());
// Skip prologue.
@@ -1248,6 +1314,8 @@ void LowerTypeTestsModule::createJumpTable(
// by Clang for -march=armv7.
F->addFnAttr("target-cpu", "cortex-a8");
}
+ // Make sure we don't emit .eh_frame for this function.
+ F->addFnAttr(Attribute::NoUnwind);
BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
IRBuilder<> IRB(BB);
@@ -1389,9 +1457,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
}
if (!IsDefinition) {
if (F->isWeakForLinker())
- replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
+ replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, IsDefinition);
else
- F->replaceAllUsesWith(CombinedGlobalElemPtr);
+ replaceCfiUses(F, CombinedGlobalElemPtr, IsDefinition);
} else {
assert(F->getType()->getAddressSpace() == 0);
@@ -1401,10 +1469,10 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
FAlias->takeName(F);
if (FAlias->hasName())
F->setName(FAlias->getName() + ".cfi");
- F->replaceUsesExceptBlockAddr(FAlias);
+ replaceCfiUses(F, FAlias, IsDefinition);
+ if (!F->hasLocalLinkage())
+ F->setVisibility(GlobalVariable::HiddenVisibility);
}
- if (!F->isDeclarationForLinker())
- F->setLinkage(GlobalValue::InternalLinkage);
}
createJumpTable(JumpTableFn, Functions);
@@ -1447,7 +1515,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
}
void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels) {
DenseMap<Metadata *, uint64_t> TypeIdIndices;
for (unsigned I = 0; I != TypeIds.size(); ++I)
TypeIdIndices[TypeIds[I]] = I;
@@ -1456,15 +1525,25 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
// the type identifier.
std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
unsigned GlobalIndex = 0;
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalIndices;
for (GlobalTypeMember *GTM : Globals) {
for (MDNode *Type : GTM->types()) {
// Type = { offset, type identifier }
- unsigned TypeIdIndex = TypeIdIndices[Type->getOperand(1)];
- TypeMembers[TypeIdIndex].insert(GlobalIndex);
+ auto I = TypeIdIndices.find(Type->getOperand(1));
+ if (I != TypeIdIndices.end())
+ TypeMembers[I->second].insert(GlobalIndex);
}
+ GlobalIndices[GTM] = GlobalIndex;
GlobalIndex++;
}
+ for (ICallBranchFunnel *JT : ICallBranchFunnels) {
+ TypeMembers.emplace_back();
+ std::set<uint64_t> &TMSet = TypeMembers.back();
+ for (GlobalTypeMember *T : JT->targets())
+ TMSet.insert(GlobalIndices[T]);
+ }
+
// Order the sets of indices by size. The GlobalLayoutBuilder works best
// when given small index sets first.
std::stable_sort(
@@ -1514,7 +1593,7 @@ LowerTypeTestsModule::LowerTypeTestsModule(
}
bool LowerTypeTestsModule::runForTesting(Module &M) {
- ModuleSummaryIndex Summary;
+ ModuleSummaryIndex Summary(/*HaveGVs=*/false);
// Handle the command-line summary arguments. This code is for testing
// purposes only, so we handle errors directly.
@@ -1549,11 +1628,71 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
return Changed;
}
+static bool isDirectCall(Use &U) {
+ auto *Usr = dyn_cast<CallInst>(U.getUser());
+ if (Usr) {
+ CallSite CS(Usr);
+ if (CS.isCallee(&U))
+ return true;
+ }
+ return false;
+}
+
+void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefinition) {
+ SmallSetVector<Constant *, 4> Constants;
+ auto UI = Old->use_begin(), E = Old->use_end();
+ for (; UI != E;) {
+ Use &U = *UI;
+ ++UI;
+
+ // Skip block addresses
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+
+    // Skip direct calls to externally defined or dso_local functions
+ if (isDirectCall(U) && (Old->isDSOLocal() || !IsDefinition))
+ continue;
+
+    // Constants must be handled specially: we cannot call replaceUsesOfWith
+    // on a constant because constants are uniqued.
+ if (auto *C = dyn_cast<Constant>(U.getUser())) {
+ if (!isa<GlobalValue>(C)) {
+ // Save unique users to avoid processing operand replacement
+ // more than once.
+ Constants.insert(C);
+ continue;
+ }
+ }
+
+ U.set(New);
+ }
+
+ // Process operand replacement of saved constants.
+ for (auto *C : Constants)
+ C->handleOperandChange(Old, New);
+}
+
+void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
+ auto UI = Old->use_begin(), E = Old->use_end();
+ for (; UI != E;) {
+ Use &U = *UI;
+ ++UI;
+
+ if (!isDirectCall(U))
+ continue;
+
+ U.set(New);
+ }
+}
+
bool LowerTypeTestsModule::lower() {
Function *TypeTestFunc =
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary &&
- !ImportSummary)
+ Function *ICallBranchFunnelFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
+ !ExportSummary && !ImportSummary)
return false;
if (ImportSummary) {
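
The new replaceCfiUses() walks a use list while rewriting it, a pattern that is easy to get wrong. A simplified sketch of the idiom (assumed, not the patch itself): advance the iterator before mutating the use, and route uniqued constants through handleOperandChange().

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

void redirectUses(llvm::Value *Old, llvm::Value *New) {
  llvm::SmallSetVector<llvm::Constant *, 4> Constants;
  for (auto UI = Old->use_begin(), E = Old->use_end(); UI != E;) {
    llvm::Use &U = *UI++; // advance first: U.set() unlinks this Use from Old
    if (auto *C = llvm::dyn_cast<llvm::Constant>(U.getUser()))
      if (!llvm::isa<llvm::GlobalValue>(C)) {
        Constants.insert(C); // uniqued constants are rewritten separately
        continue;
      }
    U.set(New);
  }
  for (llvm::Constant *C : Constants)
    C->handleOperandChange(Old, New);
}
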
@@ -1565,6 +1704,10 @@ bool LowerTypeTestsModule::lower() {
}
}
+ if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty())
+ report_fatal_error(
+ "unexpected call to llvm.icall.branch.funnel during import phase");
+
SmallVector<Function *, 8> Defs;
SmallVector<Function *, 8> Decls;
for (auto &F : M) {
@@ -1589,8 +1732,8 @@ bool LowerTypeTestsModule::lower() {
// Equivalence class set containing type identifiers and the globals that
// reference them. This is used to partition the set of type identifiers in
// the module into disjoint sets.
- using GlobalClassesTy =
- EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>;
+ using GlobalClassesTy = EquivalenceClasses<
+ PointerUnion3<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>;
GlobalClassesTy GlobalClasses;
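
GlobalClassesTy is built on EquivalenceClasses, which the rest of lower() uses to form the disjoint sets. A small standalone sketch of that container (values are illustrative):

#include "llvm/ADT/EquivalenceClasses.h"
#include <cstdio>

int main() {
  llvm::EquivalenceClasses<int> EC;
  EC.unionSets(1, 2); // {1, 2}
  EC.unionSets(3, 4); // {3, 4}
  EC.unionSets(2, 3); // {1, 2, 3, 4}
  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
    if (!I->isLeader())
      continue; // visit each disjoint set once, via its leader
    for (auto MI = EC.member_begin(I); MI != EC.member_end(); ++MI)
      std::printf("%d ", *MI);
    std::printf("\n");
  }
  return 0;
}
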
// Verify the type metadata and build a few data structures to let us
@@ -1602,33 +1745,61 @@ bool LowerTypeTestsModule::lower() {
// identifiers.
BumpPtrAllocator Alloc;
struct TIInfo {
- unsigned Index;
+ unsigned UniqueId;
std::vector<GlobalTypeMember *> RefGlobals;
};
DenseMap<Metadata *, TIInfo> TypeIdInfo;
- unsigned I = 0;
+ unsigned CurUniqueId = 0;
SmallVector<MDNode *, 2> Types;
+  // Cross-DSO CFI emits jumptable entries for exported functions as well as
+  // address-taken functions, in case they are address taken in other modules.
+ const bool CrossDsoCfi = M.getModuleFlag("Cross-DSO CFI") != nullptr;
+
struct ExportedFunctionInfo {
CfiFunctionLinkage Linkage;
MDNode *FuncMD; // {name, linkage, type[, type...]}
};
DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
if (ExportSummary) {
+ // A set of all functions that are address taken by a live global object.
+ DenseSet<GlobalValue::GUID> AddressTaken;
+ for (auto &I : *ExportSummary)
+ for (auto &GVS : I.second.SummaryList)
+ if (GVS->isLive())
+ for (auto &Ref : GVS->refs())
+ AddressTaken.insert(Ref.getGUID());
+
NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
if (CfiFunctionsMD) {
for (auto FuncMD : CfiFunctionsMD->operands()) {
assert(FuncMD->getNumOperands() >= 2);
StringRef FunctionName =
cast<MDString>(FuncMD->getOperand(0))->getString();
- if (!ExportSummary->isGUIDLive(GlobalValue::getGUID(
- GlobalValue::dropLLVMManglingEscape(FunctionName))))
- continue;
CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
cast<ConstantAsMetadata>(FuncMD->getOperand(1))
->getValue()
->getUniqueInteger()
.getZExtValue());
+ const GlobalValue::GUID GUID = GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(FunctionName));
+        // Do not emit jumptable entries for functions that are not live and
+        // have no live references (and are not exported with cross-DSO CFI).
+ if (!ExportSummary->isGUIDLive(GUID))
+ continue;
+ if (!AddressTaken.count(GUID)) {
+ if (!CrossDsoCfi || Linkage != CFL_Definition)
+ continue;
+
+ bool Exported = false;
+ if (auto VI = ExportSummary->getValueInfo(GUID))
+ for (auto &GVS : VI.getSummaryList())
+ if (GVS->isLive() && !GlobalValue::isLocalLinkage(GVS->linkage()))
+ Exported = true;
+
+ if (!Exported)
+ continue;
+ }
auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
if (!P.second && P.first->second.Linkage != CFL_Definition)
P.first->second = {Linkage, FuncMD};
@@ -1656,6 +1827,11 @@ bool LowerTypeTestsModule::lower() {
F->clearMetadata();
}
+ // Update the linkage for extern_weak declarations when a definition
+ // exists.
+ if (Linkage == CFL_Definition && F->hasExternalWeakLinkage())
+ F->setLinkage(GlobalValue::ExternalLinkage);
+
// If the function in the full LTO module is a declaration, replace its
// type metadata with the type metadata we found in cfi.functions. That
// metadata is presumed to be more accurate than the metadata attached
@@ -1673,28 +1849,37 @@ bool LowerTypeTestsModule::lower() {
}
}
+ DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers;
for (GlobalObject &GO : M.global_objects()) {
if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
continue;
Types.clear();
GO.getMetadata(LLVMContext::MD_type, Types);
- if (Types.empty())
- continue;
bool IsDefinition = !GO.isDeclarationForLinker();
bool IsExported = false;
- if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) {
- IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition;
- IsExported = true;
+ if (Function *F = dyn_cast<Function>(&GO)) {
+ if (ExportedFunctions.count(F->getName())) {
+ IsDefinition |= ExportedFunctions[F->getName()].Linkage == CFL_Definition;
+ IsExported = true;
+ // TODO: The logic here checks only that the function is address taken,
+ // not that the address takers are live. This can be updated to check
+ // their liveness and emit fewer jumptable entries once monolithic LTO
+ // builds also emit summaries.
+ } else if (!F->hasAddressTaken()) {
+ if (!CrossDsoCfi || !IsDefinition || F->hasLocalLinkage())
+ continue;
+ }
}
auto *GTM =
GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
+ GlobalTypeMembers[&GO] = GTM;
for (MDNode *Type : Types) {
verifyTypeMDNode(&GO, Type);
auto &Info = TypeIdInfo[Type->getOperand(1)];
- Info.Index = ++I;
+ Info.UniqueId = ++CurUniqueId;
Info.RefGlobals.push_back(GTM);
}
}
@@ -1731,6 +1916,44 @@ bool LowerTypeTestsModule::lower() {
}
}
+ if (ICallBranchFunnelFunc) {
+ for (const Use &U : ICallBranchFunnelFunc->uses()) {
+ if (Arch != Triple::x86_64)
+ report_fatal_error(
+ "llvm.icall.branch.funnel not supported on this target");
+
+ auto CI = cast<CallInst>(U.getUser());
+
+ std::vector<GlobalTypeMember *> Targets;
+ if (CI->getNumArgOperands() % 2 != 1)
+ report_fatal_error("number of arguments should be odd");
+
+ GlobalClassesTy::member_iterator CurSet;
+ for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) {
+ int64_t Offset;
+ auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ CI->getOperand(I), Offset, M.getDataLayout()));
+ if (!Base)
+ report_fatal_error(
+ "Expected branch funnel operand to be global value");
+
+ GlobalTypeMember *GTM = GlobalTypeMembers[Base];
+ Targets.push_back(GTM);
+ GlobalClassesTy::member_iterator NewSet =
+ GlobalClasses.findLeader(GlobalClasses.insert(GTM));
+ if (I == 1)
+ CurSet = NewSet;
+ else
+ CurSet = GlobalClasses.unionSets(CurSet, NewSet);
+ }
+
+ GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(
+ GlobalClasses.insert(ICallBranchFunnel::create(
+ Alloc, CI, Targets, ++CurUniqueId))));
+ }
+ }
+
if (ExportSummary) {
DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
for (auto &P : TypeIdInfo) {
@@ -1764,54 +1987,124 @@ bool LowerTypeTestsModule::lower() {
continue;
++NumTypeIdDisjointSets;
- unsigned MaxIndex = 0;
+ unsigned MaxUniqueId = 0;
for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
MI != GlobalClasses.member_end(); ++MI) {
- if ((*MI).is<Metadata *>())
- MaxIndex = std::max(MaxIndex, TypeIdInfo[MI->get<Metadata *>()].Index);
+ if (auto *MD = MI->dyn_cast<Metadata *>())
+ MaxUniqueId = std::max(MaxUniqueId, TypeIdInfo[MD].UniqueId);
+ else if (auto *BF = MI->dyn_cast<ICallBranchFunnel *>())
+ MaxUniqueId = std::max(MaxUniqueId, BF->UniqueId);
}
- Sets.emplace_back(I, MaxIndex);
+ Sets.emplace_back(I, MaxUniqueId);
}
- std::sort(Sets.begin(), Sets.end(),
- [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
- const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
- return S1.second < S2.second;
- });
+ llvm::sort(Sets.begin(), Sets.end(),
+ [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
+ const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
+ return S1.second < S2.second;
+ });
// For each disjoint set we found...
for (const auto &S : Sets) {
// Build the list of type identifiers in this disjoint set.
std::vector<Metadata *> TypeIds;
std::vector<GlobalTypeMember *> Globals;
+ std::vector<ICallBranchFunnel *> ICallBranchFunnels;
for (GlobalClassesTy::member_iterator MI =
GlobalClasses.member_begin(S.first);
MI != GlobalClasses.member_end(); ++MI) {
- if ((*MI).is<Metadata *>())
+ if (MI->is<Metadata *>())
TypeIds.push_back(MI->get<Metadata *>());
- else
+ else if (MI->is<GlobalTypeMember *>())
Globals.push_back(MI->get<GlobalTypeMember *>());
+ else
+ ICallBranchFunnels.push_back(MI->get<ICallBranchFunnel *>());
}
- // Order type identifiers by global index for determinism. This ordering is
- // stable as there is a one-to-one mapping between metadata and indices.
- std::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
- return TypeIdInfo[M1].Index < TypeIdInfo[M2].Index;
+ // Order type identifiers by unique ID for determinism. This ordering is
+ // stable as there is a one-to-one mapping between metadata and unique IDs.
+ llvm::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
+ return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId;
});
+ // Same for the branch funnels.
+ llvm::sort(ICallBranchFunnels.begin(), ICallBranchFunnels.end(),
+ [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) {
+ return F1->UniqueId < F2->UniqueId;
+ });
+
// Build bitsets for this disjoint set.
- buildBitSetsFromDisjointSet(TypeIds, Globals);
+ buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels);
}
allocateByteArrays();
+ // Parse alias data to replace stand-in function declarations for aliases
+ // with an alias to the intended target.
+ if (ExportSummary) {
+ if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
+ for (auto AliasMD : AliasesMD->operands()) {
+ assert(AliasMD->getNumOperands() >= 4);
+ StringRef AliasName =
+ cast<MDString>(AliasMD->getOperand(0))->getString();
+ StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(Aliasee) ||
+ ExportedFunctions[Aliasee].Linkage != CFL_Definition ||
+ !M.getNamedAlias(Aliasee))
+ continue;
+
+ GlobalValue::VisibilityTypes Visibility =
+ static_cast<GlobalValue::VisibilityTypes>(
+ cast<ConstantAsMetadata>(AliasMD->getOperand(2))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ bool Weak =
+ static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+
+ auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee));
+ Alias->setVisibility(Visibility);
+ if (Weak)
+ Alias->setLinkage(GlobalValue::WeakAnyLinkage);
+
+ if (auto *F = M.getFunction(AliasName)) {
+ Alias->takeName(F);
+ F->replaceAllUsesWith(Alias);
+ F->eraseFromParent();
+ } else {
+ Alias->setName(AliasName);
+ }
+ }
+ }
+ }
+
+ // Emit .symver directives for exported functions, if they exist.
+ if (ExportSummary) {
+ if (NamedMDNode *SymversMD = M.getNamedMetadata("symvers")) {
+ for (auto Symver : SymversMD->operands()) {
+ assert(Symver->getNumOperands() >= 2);
+ StringRef SymbolName =
+ cast<MDString>(Symver->getOperand(0))->getString();
+ StringRef Alias = cast<MDString>(Symver->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(SymbolName))
+ continue;
+
+ M.appendModuleInlineAsm(
+ (llvm::Twine(".symver ") + SymbolName + ", " + Alias).str());
+ }
+ }
+ }
+
return true;
}
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
ModuleAnalysisManager &AM) {
- bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr,
- /*ImportSummary=*/nullptr)
- .lower();
+ bool Changed = LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index 8886af90ba65..139941127dee 100644
--- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -90,7 +90,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Argument.h"
@@ -407,10 +407,10 @@ bool MergeFunctions::runOnModule(Module &M) {
std::vector<WeakTrackingVH> Worklist;
Deferred.swap(Worklist);
- DEBUG(doSanityCheck(Worklist));
+ LLVM_DEBUG(doSanityCheck(Worklist));
- DEBUG(dbgs() << "size of module: " << M.size() << '\n');
- DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
// Insert functions and merge them.
for (WeakTrackingVH &I : Worklist) {
@@ -421,7 +421,7 @@ bool MergeFunctions::runOnModule(Module &M) {
Changed |= insert(F);
}
}
- DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
} while (!Deferred.empty());
FnTree.clear();
@@ -498,19 +498,20 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
// parameter debug info, from the entry block.
void MergeFunctions::eraseInstsUnrelatedToPDI(
std::vector<Instruction *> &PDIUnrelatedWL) {
- DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in "
- "entry block) unrelated to parameter debug info from entry "
- "block: {\n");
+ LLVM_DEBUG(
+ dbgs() << " Erasing instructions (in reverse order of appearance in "
+ "entry block) unrelated to parameter debug info from entry "
+ "block: {\n");
while (!PDIUnrelatedWL.empty()) {
Instruction *I = PDIUnrelatedWL.back();
- DEBUG(dbgs() << " Deleting Instruction: ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Deleting Instruction: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
I->eraseFromParent();
PDIUnrelatedWL.pop_back();
}
- DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
- "debug info from entry block. \n");
+ LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
+ "debug info from entry block. \n");
}
// Reduce G to its entry block.
@@ -543,99 +544,100 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
BI != BIE; ++BI) {
if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
- DEBUG(dbgs() << " Deciding: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
DILocalVariable *DILocVar = DVI->getVariable();
if (DILocVar->isParameter()) {
- DEBUG(dbgs() << " Include (parameter): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(&*BI);
} else {
- DEBUG(dbgs() << " Delete (!parameter): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
} else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
- DEBUG(dbgs() << " Deciding: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
DILocalVariable *DILocVar = DDI->getVariable();
if (DILocVar->isParameter()) {
- DEBUG(dbgs() << " Parameter: ");
- DEBUG(DILocVar->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Parameter: ");
+ LLVM_DEBUG(DILocVar->print(dbgs()));
AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
if (AI) {
- DEBUG(dbgs() << " Processing alloca users: ");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Processing alloca users: ");
+ LLVM_DEBUG(dbgs() << "\n");
for (User *U : AI->users()) {
if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
if (Value *Arg = SI->getValueOperand()) {
if (dyn_cast<Argument>(Arg)) {
- DEBUG(dbgs() << " Include: ");
- DEBUG(AI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(AI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(AI);
- DEBUG(dbgs() << " Include (parameter): ");
- DEBUG(SI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(SI);
- DEBUG(dbgs() << " Include: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(&*BI);
} else {
- DEBUG(dbgs() << " Delete (!parameter): ");
- DEBUG(SI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
} else {
- DEBUG(dbgs() << " Defer: ");
- DEBUG(U->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(U->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
} else {
- DEBUG(dbgs() << " Delete (alloca NULL): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (alloca NULL): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
} else {
- DEBUG(dbgs() << " Delete (!parameter): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
} else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) {
- DEBUG(dbgs() << " Will Include Terminator: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(&*BI);
} else {
- DEBUG(dbgs() << " Defer: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
- DEBUG(dbgs()
- << " Report parameter debug info related/related instructions: {\n");
+ LLVM_DEBUG(
+ dbgs()
+      << "  Report parameter debug info related/unrelated instructions: {\n");
for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
BI != BE; ++BI) {
Instruction *I = &*BI;
if (PDIRelated.find(I) == PDIRelated.end()) {
- DEBUG(dbgs() << " !PDIRelated: ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " !PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIUnrelatedWL.push_back(I);
} else {
- DEBUG(dbgs() << " PDIRelated: ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
- DEBUG(dbgs() << " }\n");
+ LLVM_DEBUG(dbgs() << " }\n");
}
// Don't merge tiny functions using a thunk, since it can just end up
@@ -643,8 +645,8 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
static bool isThunkProfitable(Function * F) {
if (F->size() == 1) {
if (F->front().size() <= 2) {
- DEBUG(dbgs() << "isThunkProfitable: " << F->getName()
- << " is too small to bother creating a thunk for\n");
+ LLVM_DEBUG(dbgs() << "isThunkProfitable: " << F->getName()
+ << " is too small to bother creating a thunk for\n");
return false;
}
}
@@ -665,13 +667,14 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
BasicBlock *BB = nullptr;
Function *NewG = nullptr;
if (MergeFunctionsPDI) {
- DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
- "function as thunk; retain original: "
- << G->getName() << "()\n");
+ LLVM_DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
+ "function as thunk; retain original: "
+ << G->getName() << "()\n");
GEntryBlock = &G->getEntryBlock();
- DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
- "debug info for "
- << G->getName() << "() {\n");
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
+ "debug info for "
+ << G->getName() << "() {\n");
filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
GEntryBlock->getTerminator()->eraseFromParent();
BB = GEntryBlock;
@@ -710,13 +713,15 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
CI->setDebugLoc(CIDbgLoc);
RI->setDebugLoc(RIDbgLoc);
} else {
- DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
- << G->getName() << "()\n");
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
+ << G->getName() << "()\n");
}
eraseTail(G);
eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
- DEBUG(dbgs() << "} // End of parameter related debug info filtering for: "
- << G->getName() << "()\n");
+ LLVM_DEBUG(
+ dbgs() << "} // End of parameter related debug info filtering for: "
+ << G->getName() << "()\n");
} else {
NewG->copyAttributesFrom(G);
NewG->takeName(G);
@@ -725,7 +730,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
G->eraseFromParent();
}
- DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
++NumThunksWritten;
}
@@ -820,7 +825,8 @@ bool MergeFunctions::insert(Function *NewFunction) {
if (Result.second) {
assert(FNodesInTree.count(NewFunction) == 0);
FNodesInTree.insert({NewFunction, Result.first});
- DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName()
+ << '\n');
return false;
}
@@ -841,8 +847,8 @@ bool MergeFunctions::insert(Function *NewFunction) {
assert(OldF.getFunc() != F && "Must have swapped the functions.");
}
- DEBUG(dbgs() << " " << OldF.getFunc()->getName()
- << " == " << NewFunction->getName() << '\n');
+ LLVM_DEBUG(dbgs() << " " << OldF.getFunc()->getName()
+ << " == " << NewFunction->getName() << '\n');
Function *DeleteF = NewFunction;
mergeTwoFunctions(OldF.getFunc(), DeleteF);
@@ -854,7 +860,7 @@ bool MergeFunctions::insert(Function *NewFunction) {
void MergeFunctions::remove(Function *F) {
auto I = FNodesInTree.find(F);
if (I != FNodesInTree.end()) {
- DEBUG(dbgs() << "Deferred " << F->getName()<< ".\n");
+ LLVM_DEBUG(dbgs() << "Deferred " << F->getName() << ".\n");
FnTree.erase(I->second);
// I->second has been invalidated, remove it from the FNodesInTree map to
// preserve the invariant.
@@ -868,7 +874,7 @@ void MergeFunctions::remove(Function *F) {
void MergeFunctions::removeUsers(Value *V) {
std::vector<Value *> Worklist;
Worklist.push_back(V);
- SmallSet<Value*, 8> Visited;
+ SmallPtrSet<Value*, 8> Visited;
Visited.insert(V);
while (!Worklist.empty()) {
Value *V = Worklist.back();
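
The one container change in MergeFunctions.cpp is the swap shown above: for pointer keys, SmallPtrSet is the natural fit (SmallSet of a pointer type simply wraps it). A minimal sketch:

#include "llvm/ADT/SmallPtrSet.h"

struct Node {};

// Returns true the first time a node is seen, false on revisits.
bool markVisited(Node *N, llvm::SmallPtrSet<Node *, 8> &Visited) {
  return Visited.insert(N).second;
}
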
diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
index a9cfd8ded6fb..4907e4b30519 100644
--- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -202,10 +202,8 @@ struct PartialInlinerImpl {
std::function<AssumptionCache &(Function &)> *GetAC,
std::function<TargetTransformInfo &(Function &)> *GTTI,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
- ProfileSummaryInfo *ProfSI,
- std::function<OptimizationRemarkEmitter &(Function &)> *GORE)
- : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI),
- GetORE(GORE) {}
+ ProfileSummaryInfo *ProfSI)
+ : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
bool run(Module &M);
// Main part of the transformation that calls helper functions to find
@@ -217,7 +215,7 @@ struct PartialInlinerImpl {
// outline function due to code size.
std::pair<bool, Function *> unswitchFunction(Function *F);
- // This class speculatively clones the the function to be partial inlined.
+  // This class speculatively clones the function to be partially inlined.
// At the end of partial inlining, the remaining callsites to the cloned
// function that are not partially inlined will be fixed up to reference
// the original function, and the cloned function will be erased.
@@ -271,7 +269,6 @@ private:
std::function<TargetTransformInfo &(Function &)> *GetTTI;
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
ProfileSummaryInfo *PSI;
- std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;
// Return the frequency of the OutlininingBB relative to F's entry point.
// The result is no larger than 1 and is represented using BP.
@@ -282,7 +279,8 @@ private:
// Return true if the callee of CS should be partially inlined with
// profit.
bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost);
+ BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE);
// Try to inline DuplicateFunction (cloned from F with call to
// the OutlinedFunction into its callers. Return true
@@ -337,7 +335,7 @@ private:
std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
std::unique_ptr<FunctionOutliningMultiRegionInfo>
- computeOutliningColdRegionsInfo(Function *F);
+ computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};
struct PartialInlinerLegacyPass : public ModulePass {
@@ -362,7 +360,6 @@ struct PartialInlinerLegacyPass : public ModulePass {
&getAnalysis<TargetTransformInfoWrapperPass>();
ProfileSummaryInfo *PSI =
getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- std::unique_ptr<OptimizationRemarkEmitter> UPORE;
std::function<AssumptionCache &(Function &)> GetAssumptionCache =
[&ACT](Function &F) -> AssumptionCache & {
@@ -374,14 +371,7 @@ struct PartialInlinerLegacyPass : public ModulePass {
return TTIWP->getTTI(F);
};
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&UPORE](Function &F) -> OptimizationRemarkEmitter & {
- UPORE.reset(new OptimizationRemarkEmitter(&F));
- return *UPORE.get();
- };
-
- return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI,
- &GetORE)
+ return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
.run(M);
}
};
@@ -389,7 +379,8 @@ struct PartialInlinerLegacyPass : public ModulePass {
} // end anonymous namespace
std::unique_ptr<FunctionOutliningMultiRegionInfo>
-PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
+PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
+ OptimizationRemarkEmitter &ORE) {
BasicBlock *EntryBlock = &F->front();
DominatorTree DT(*F);
@@ -403,8 +394,6 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
} else
BFI = &(*GetBFI)(*F);
- auto &ORE = (*GetORE)(*F);
-
// Return if we don't have profiling information.
if (!PSI->hasInstrumentationProfile())
return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
@@ -414,8 +403,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
BasicBlock *Dom = BlockList.front();
- return BlockList.size() > 1 &&
- std::distance(pred_begin(Dom), pred_end(Dom)) == 1;
+ return BlockList.size() > 1 && pred_size(Dom) == 1;
};
auto IsSingleExit =
@@ -567,10 +555,6 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return is_contained(successors(BB), Succ);
};
- auto SuccSize = [](BasicBlock *BB) {
- return std::distance(succ_begin(BB), succ_end(BB));
- };
-
auto IsReturnBlock = [](BasicBlock *BB) {
TerminatorInst *TI = BB->getTerminator();
return isa<ReturnInst>(TI);
@@ -607,7 +591,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
break;
- if (SuccSize(CurrEntry) != 2)
+ if (succ_size(CurrEntry) != 2)
break;
BasicBlock *Succ1 = *succ_begin(CurrEntry);
@@ -681,7 +665,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
// peeling off dominating blocks from the outlining region:
while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
BasicBlock *Cand = OutliningInfo->NonReturnBlock;
- if (SuccSize(Cand) != 2)
+ if (succ_size(Cand) != 2)
break;
if (HasNonEntryPred(Cand))
@@ -766,19 +750,19 @@ PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
bool PartialInlinerImpl::shouldPartialInline(
CallSite CS, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost) {
+ BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE) {
using namespace ore;
- if (SkipCostAnalysis)
- return true;
-
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
assert(Callee == Cloner.ClonedFunc);
+ if (SkipCostAnalysis)
+ return isInlineViable(*Callee);
+
Function *Caller = CS.getCaller();
auto &CalleeTTI = (*GetTTI)(*Callee);
- auto &ORE = (*GetORE)(*Caller);
InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
*GetAssumptionCache, GetBFI, PSI, &ORE);
@@ -1270,14 +1254,14 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->user_begin() == F->user_end())
return {false, nullptr};
- auto &ORE = (*GetORE)(*F);
+ OptimizationRemarkEmitter ORE(F);
// Only try to outline cold regions if we have a profile summary, which
// implies we have profiling information.
if (PSI->hasProfileSummary() && F->hasProfileData() &&
!DisableMultiRegionPartialInline) {
std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
- computeOutliningColdRegionsInfo(F);
+ computeOutliningColdRegionsInfo(F, ORE);
if (OMRI) {
FunctionCloner Cloner(F, OMRI.get(), ORE);
@@ -1357,11 +1341,11 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
// inlining the function with outlining (The inliner uses the size increase to
// model the cost of inlining a callee).
if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
- auto &ORE = (*GetORE)(*Cloner.OrigFunc);
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
DebugLoc DLoc;
BasicBlock *Block;
std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
- ORE.emit([&]() {
+ OrigFuncORE.emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
DLoc, Block)
<< ore::NV("Function", Cloner.OrigFunc)
@@ -1384,7 +1368,8 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (CalleeEntryCount)
computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
- uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+ uint64_t CalleeEntryCountV =
+ (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
bool AnyInline = false;
for (User *User : Users) {
@@ -1393,11 +1378,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (IsLimitReached())
continue;
-
- if (!shouldPartialInline(CS, Cloner, WeightedRcost))
+ OptimizationRemarkEmitter CallerORE(CS.getCaller());
+ if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
continue;
- auto &ORE = (*GetORE)(*CS.getCaller());
// Construct remark before doing the inlining, as after successful inlining
// the callsite is removed.
OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
@@ -1412,7 +1396,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
: nullptr)))
continue;
- ORE.emit(OR);
+ CallerORE.emit(OR);
// Now update the entry count:
if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
@@ -1433,9 +1417,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (AnyInline) {
Cloner.IsFunctionInlined = true;
if (CalleeEntryCount)
- Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
- auto &ORE = (*GetORE)(*Cloner.OrigFunc);
- ORE.emit([&]() {
+ Cloner.OrigFunc->setEntryCount(
+ CalleeEntryCount.setCount(CalleeEntryCountV));
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
+ OrigFuncORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
<< "Partially inlined into at least one caller";
});
@@ -1517,14 +1502,9 @@ PreservedAnalyses PartialInlinerPass::run(Module &M,
return FAM.getResult<TargetIRAnalysis>(F);
};
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&FAM](Function &F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- };
-
ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
- if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI, &GetORE)
+ if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI)
.run(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
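
The PartialInliner changes above drop the GetORE callback in favor of constructing OptimizationRemarkEmitter objects on the stack for whichever function a remark refers to. A hedged sketch of that pattern, assuming the LLVM 7-era remark API; the helper function is invented for illustration:

    #include "llvm/Analysis/OptimizationRemarkEmitter.h"
    #include "llvm/IR/DiagnosticInfo.h"
    #include "llvm/IR/Function.h"

    static void notePartialInline(llvm::Function *OrigFunc) {
      // Building the emitter directly keeps the remark tied to the right
      // function without threading a lookup callback through every helper.
      llvm::OptimizationRemarkEmitter ORE(OrigFunc);
      ORE.emit([&]() {
        return llvm::OptimizationRemark("partial-inlining", "PartiallyInlined",
                                        OrigFunc)
               << "Partially inlined into at least one caller";
      });
    }
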
diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 3855e6245d8e..5ced6481996a 100644
--- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -29,14 +29,18 @@
#include "llvm/IR/Verifier.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
@@ -92,6 +96,10 @@ static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));
+static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
+ cl::init(false), cl::Hidden,
+ cl::desc("Enable Unroll And Jam Pass"));
+
static cl::opt<bool>
EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
cl::desc("Enable preparation for ThinLTO."));
@@ -135,10 +143,10 @@ static cl::opt<bool>
cl::Hidden,
cl::desc("Disable shrink-wrap library calls"));
-static cl::opt<bool>
- EnableSimpleLoopUnswitch("enable-simple-loop-unswitch", cl::init(false),
- cl::Hidden,
- cl::desc("Enable the simple loop unswitch pass."));
+static cl::opt<bool> EnableSimpleLoopUnswitch(
+ "enable-simple-loop-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Enable the simple loop unswitch pass. Also enables independent "
+ "cleanup passes integrated into the loop pass manager pipeline."));
static cl::opt<bool> EnableGVNSink(
"enable-gvn-sink", cl::init(false), cl::Hidden,
@@ -318,6 +326,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
// Combine silly seq's
+ if (OptLevel > 2)
+ MPM.add(createAggressiveInstCombinerPass());
addInstructionCombiningPass(MPM);
if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
MPM.add(createLibCallsShrinkWrapPass());
@@ -330,6 +340,15 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createReassociatePass()); // Reassociate expressions
+
+ // Begin the loop pass pipeline.
+ if (EnableSimpleLoopUnswitch) {
+ // The simple loop unswitch pass relies on separate cleanup passes. Schedule
+ // them first so when we re-process a loop they run before other loop
+ // passes.
+ MPM.add(createLoopInstSimplifyPass());
+ MPM.add(createLoopSimplifyCFGPass());
+ }
// Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
MPM.add(createLICMPass()); // Hoist loop invariants
@@ -337,20 +356,26 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createSimpleLoopUnswitchLegacyPass());
else
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ // FIXME: We break the loop pass pipeline here in order to do full
+ // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
+ // need for this.
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
+ // We resume loop passes creating a second loop pipeline here.
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
addExtensionsToPM(EP_LateLoopOptimizations, MPM);
MPM.add(createLoopDeletionPass()); // Delete dead loops
if (EnableLoopInterchange) {
+ // FIXME: These are function passes and break the loop pass pipeline.
MPM.add(createLoopInterchangePass()); // Interchange loops
MPM.add(createCFGSimplificationPass());
}
if (!DisableUnrollLoops)
MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+ // This ends the loop pass pipelines.
if (OptLevel > 1) {
MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
@@ -431,7 +456,7 @@ void PassManagerBuilder::populateModulePassManager(
// This has to be done after we add the extensions to the pass manager
// as there could be passes (e.g. Address sanitizer) which introduce
// new unnamed globals.
- if (PrepareForThinLTO)
+ if (PrepareForLTO || PrepareForThinLTO)
MPM.add(createNameAnonGlobalPass());
return;
}
@@ -648,6 +673,13 @@ void PassManagerBuilder::populateModulePassManager(
addInstructionCombiningPass(MPM);
if (!DisableUnrollLoops) {
+ if (EnableUnrollAndJam) {
+      // Unroll and Jam. We do this before unrolling, but it needs to be in a
+      // separate loop pass manager so that the outer loop is processed by
+      // unroll and jam before the inner loop is unrolled.
+ MPM.add(createLoopUnrollAndJamPass(OptLevel));
+ }
+
MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
// LoopUnroll may generate some redundency to cleanup.
@@ -683,7 +715,7 @@ void PassManagerBuilder::populateModulePassManager(
// result too early.
MPM.add(createLoopSinkPass());
// Get rid of LCSSA nodes.
- MPM.add(createInstructionSimplifierPass());
+ MPM.add(createInstSimplifyLegacyPass());
// This hoists/decomposes div/rem ops. It should run after other sink/hoist
// passes to avoid re-sinking, but before SimplifyCFG because it can allow
@@ -695,6 +727,10 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCFGSimplificationPass());
addExtensionsToPM(EP_OptimizerLast, MPM);
+
+ // Rename anon globals to be able to handle them in the summary
+ if (PrepareForLTO)
+ MPM.add(createNameAnonGlobalPass());
}
void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
@@ -765,6 +801,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// simplification opportunities, and both can propagate functions through
// function pointers. When this happens, we often have to resolve varargs
// calls, etc, so let instcombine do this.
+ if (OptLevel > 2)
+ PM.add(createAggressiveInstCombinerPass());
addInstructionCombiningPass(PM);
addExtensionsToPM(EP_Peephole, PM);
@@ -865,6 +903,8 @@ void PassManagerBuilder::addLateLTOOptimizationPasses(
void PassManagerBuilder::populateThinLTOPassManager(
legacy::PassManagerBase &PM) {
PerformThinLTO = true;
+ if (LibraryInfo)
+ PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
if (VerifyInput)
PM.add(createVerifierPass());
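
For context, a hedged sketch of how a client drives this legacy pipeline: with OptLevel above 2 the builder now also schedules AggressiveInstCombine, while unroll-and-jam remains opt-in behind -enable-unroll-and-jam. The driver function below is invented for illustration:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    static void runO3Pipeline(llvm::Module &M) {
      llvm::PassManagerBuilder PMB;
      PMB.OptLevel = 3;   // > 2, so createAggressiveInstCombinerPass() is added
      PMB.SizeLevel = 0;
      llvm::legacy::PassManager MPM;
      PMB.populateModulePassManager(MPM);
      MPM.run(M);
    }
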
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
index 46b088189040..27d791857314 100644
--- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -27,7 +28,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/IPO/SCCP.cpp b/contrib/llvm/lib/Transforms/IPO/SCCP.cpp
new file mode 100644
index 000000000000..cc53c4b8c46f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -0,0 +1,58 @@
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+
+using namespace llvm;
+
+PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
+ const DataLayout &DL = M.getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!runIPSCCP(M, DL, &TLI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// IPSCCP Class - This class implements interprocedural Sparse Conditional
+/// Constant Propagation.
+///
+class IPSCCPLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ IPSCCPLegacyPass() : ModulePass(ID) {
+ initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ const DataLayout &DL = M.getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runIPSCCP(M, DL, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char IPSCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
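
A hedged sketch of invoking the new-pass-manager IPSCCP wrapper added above. The analysis-manager wiring follows the usual PassBuilder registration sequence; the driver function is invented for illustration:

    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/IPO/SCCP.h"

    static void runIPSCCPOnModule(llvm::Module &M) {
      llvm::PassBuilder PB;
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      llvm::ModulePassManager MPM;
      MPM.addPass(llvm::IPSCCPPass()); // runs runIPSCCP() with TargetLibraryAnalysis
      MPM.run(M, MAM);
    }
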
diff --git a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
index a69c009e1a54..dcd24595f7ea 100644
--- a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -22,7 +22,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/SampleProfile.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
@@ -37,6 +37,8 @@
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -85,7 +87,7 @@
using namespace llvm;
using namespace sampleprof;
-
+using ProfileCount = Function::ProfileCount;
#define DEBUG_TYPE "sample-profile"
// Command line option to specify the file to read samples from. This is
@@ -109,10 +111,10 @@ static cl::opt<unsigned> SampleProfileSampleCoverage(
cl::desc("Emit a warning if less than N% of samples in the input profile "
"are matched to the IR."));
-static cl::opt<double> SampleProfileHotThreshold(
- "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"),
- cl::desc("Inlined functions that account for more than N% of all samples "
- "collected in the parent function, will be inlined again."));
+static cl::opt<bool> NoWarnSampleUnused(
+ "no-warn-sample-unused", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on warnings about function with "
+ "samples but without debug information to use those samples. "));
namespace {
@@ -130,10 +132,13 @@ public:
bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
uint32_t Discriminator, uint64_t Samples);
unsigned computeCoverage(unsigned Used, unsigned Total) const;
- unsigned countUsedRecords(const FunctionSamples *FS) const;
- unsigned countBodyRecords(const FunctionSamples *FS) const;
+ unsigned countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ unsigned countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
- uint64_t countBodySamples(const FunctionSamples *FS) const;
+ uint64_t countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
void clear() {
SampleCoverage.clear();
@@ -170,7 +175,7 @@ private:
uint64_t TotalUsedSamples = 0;
};
-/// \brief Sample profile pass.
+/// Sample profile pass.
///
/// This pass reads profile data from the file specified by
/// -sample-profile-file and annotates every affected function with the
@@ -186,7 +191,8 @@ public:
IsThinLTOPreLink(IsThinLTOPreLink) {}
bool doInitialization(Module &M);
- bool runOnModule(Module &M, ModuleAnalysisManager *AM);
+ bool runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI);
void dump() { Reader->dump(); }
@@ -217,28 +223,27 @@ protected:
void buildEdges(Function &F);
bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
void computeDominanceAndLoopInfo(Function &F);
- unsigned getOffset(const DILocation *DIL) const;
void clearFunctionData();
- /// \brief Map basic blocks to their computed weights.
+ /// Map basic blocks to their computed weights.
///
/// The weight of a basic block is defined to be the maximum
/// of all the instruction weights in that block.
BlockWeightMap BlockWeights;
- /// \brief Map edges to their computed weights.
+ /// Map edges to their computed weights.
///
/// Edge weights are computed by propagating basic block weights in
/// SampleProfile::propagateWeights.
EdgeWeightMap EdgeWeights;
- /// \brief Set of visited blocks during propagation.
+ /// Set of visited blocks during propagation.
SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
- /// \brief Set of visited edges during propagation.
+ /// Set of visited edges during propagation.
SmallSet<Edge, 32> VisitedEdges;
- /// \brief Equivalence classes for block weights.
+ /// Equivalence classes for block weights.
///
/// Two blocks BB1 and BB2 are in the same equivalence class if they
/// dominate and post-dominate each other, and they are in the same loop
@@ -252,47 +257,50 @@ protected:
/// is one-to-one mapping.
StringMap<Function *> SymbolMap;
- /// \brief Dominance, post-dominance and loop information.
+ /// Dominance, post-dominance and loop information.
std::unique_ptr<DominatorTree> DT;
- std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT;
+ std::unique_ptr<PostDominatorTree> PDT;
std::unique_ptr<LoopInfo> LI;
std::function<AssumptionCache &(Function &)> GetAC;
std::function<TargetTransformInfo &(Function &)> GetTTI;
- /// \brief Predecessors for each basic block in the CFG.
+ /// Predecessors for each basic block in the CFG.
BlockEdgeMap Predecessors;
- /// \brief Successors for each basic block in the CFG.
+ /// Successors for each basic block in the CFG.
BlockEdgeMap Successors;
SampleCoverageTracker CoverageTracker;
- /// \brief Profile reader object.
+ /// Profile reader object.
std::unique_ptr<SampleProfileReader> Reader;
- /// \brief Samples collected for the body of this function.
+ /// Samples collected for the body of this function.
FunctionSamples *Samples = nullptr;
- /// \brief Name of the profile file to load.
+ /// Name of the profile file to load.
std::string Filename;
- /// \brief Flag indicating whether the profile input loaded successfully.
+ /// Flag indicating whether the profile input loaded successfully.
bool ProfileIsValid = false;
- /// \brief Flag indicating if the pass is invoked in ThinLTO compile phase.
+ /// Flag indicating if the pass is invoked in ThinLTO compile phase.
///
/// In this phase, in annotation, we should not promote indirect calls.
/// Instead, we will mark GUIDs that need to be annotated to the function.
bool IsThinLTOPreLink;
- /// \brief Total number of samples collected in this profile.
+ /// Profile Summary Info computed from sample profile.
+ ProfileSummaryInfo *PSI = nullptr;
+
+ /// Total number of samples collected in this profile.
///
/// This is the sum of all the samples collected in all the functions executed
/// at runtime.
uint64_t TotalCollectedSamples = 0;
- /// \brief Optimization Remark Emitter used to emit diagnostic remarks.
+ /// Optimization Remark Emitter used to emit diagnostic remarks.
OptimizationRemarkEmitter *ORE = nullptr;
};
@@ -326,6 +334,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
private:
@@ -336,7 +345,7 @@ private:
} // end anonymous namespace
-/// Return true if the given callsite is hot wrt to its caller.
+/// Return true if the given callsite is hot w.r.t. the hot cutoff threshold.
///
/// Functions that were inlined in the original binary will be represented
/// in the inline stack in the sample profile. If the profile shows that
@@ -344,28 +353,17 @@ private:
/// frequently), then we will recreate the inline decision and apply the
/// profile from the inlined callsite.
///
-/// To decide whether an inlined callsite is hot, we compute the fraction
-/// of samples used by the callsite with respect to the total number of samples
-/// collected in the caller.
-///
-/// If that fraction is larger than the default given by
-/// SampleProfileHotThreshold, the callsite will be inlined again.
-static bool callsiteIsHot(const FunctionSamples *CallerFS,
- const FunctionSamples *CallsiteFS) {
+/// To decide whether an inlined callsite is hot, we compare the callsite
+/// sample count with the hot cutoff computed by ProfileSummaryInfo; the
+/// callsite is regarded as hot if its count is above the cutoff value.
+static bool callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI) {
if (!CallsiteFS)
return false; // The callsite was not inlined in the original binary.
- uint64_t ParentTotalSamples = CallerFS->getTotalSamples();
- if (ParentTotalSamples == 0)
- return false; // Avoid division by zero.
-
+ assert(PSI && "PSI is expected to be non null");
uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
- if (CallsiteTotalSamples == 0)
- return false; // Callsite is trivially cold.
-
- double PercentSamples =
- (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0;
- return PercentSamples >= SampleProfileHotThreshold;
+ return PSI->isHotCount(CallsiteTotalSamples);
}
/// Mark as used the sample record for the given function samples at
@@ -388,7 +386,8 @@ bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
///
/// This count does not include records from cold inlined callsites.
unsigned
-SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
+SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
auto I = SampleCoverage.find(FS);
// The size of the coverage map for FS represents the number of records
@@ -401,8 +400,8 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countUsedRecords(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Count += countUsedRecords(CalleeSamples, PSI);
}
return Count;
@@ -412,15 +411,16 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
///
/// This count does not include records from cold inlined callsites.
unsigned
-SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
+SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
unsigned Count = FS->getBodySamples().size();
// Only count records in hot callsites.
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countBodyRecords(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Count += countBodyRecords(CalleeSamples, PSI);
}
return Count;
@@ -430,7 +430,8 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
///
/// This count does not include samples from cold inlined callsites.
uint64_t
-SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
+SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
uint64_t Total = 0;
for (const auto &I : FS->getBodySamples())
Total += I.second.getSamples();
@@ -439,8 +440,8 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Total += countBodySamples(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Total += countBodySamples(CalleeSamples, PSI);
}
return Total;
@@ -473,15 +474,8 @@ void SampleProfileLoader::clearFunctionData() {
CoverageTracker.clear();
}
-/// Returns the line offset to the start line of the subprogram.
-/// We assume that a single function will not exceed 65535 LOC.
-unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
- return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
- 0xffff;
-}
-
#ifndef NDEBUG
-/// \brief Print the weight of edge \p E on stream \p OS.
+/// Print the weight of edge \p E on stream \p OS.
///
/// \param OS Stream to emit the output to.
/// \param E Edge to print.
@@ -490,7 +484,7 @@ void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
<< "]: " << EdgeWeights[E] << "\n";
}
-/// \brief Print the equivalence class of block \p BB on stream \p OS.
+/// Print the equivalence class of block \p BB on stream \p OS.
///
/// \param OS Stream to emit the output to.
/// \param BB Block to print.
@@ -501,7 +495,7 @@ void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
<< "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
}
-/// \brief Print the weight of block \p BB on stream \p OS.
+/// Print the weight of block \p BB on stream \p OS.
///
/// \param OS Stream to emit the output to.
/// \param BB Block to print.
@@ -513,7 +507,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
}
#endif
-/// \brief Get the weight for an instruction.
+/// Get the weight for an instruction.
///
/// The "weight" of an instruction \p Inst is the number of samples
/// collected on that instruction at runtime. To retrieve it, we
@@ -549,7 +543,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
return 0;
const DILocation *DIL = DLoc;
- uint32_t LineOffset = getOffset(DIL);
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
uint32_t Discriminator = DIL->getBaseDiscriminator();
ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
if (R) {
@@ -569,16 +563,16 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
return Remark;
});
}
- DEBUG(dbgs() << " " << DLoc.getLine() << "."
- << DIL->getBaseDiscriminator() << ":" << Inst
- << " (line offset: " << LineOffset << "."
- << DIL->getBaseDiscriminator() << " - weight: " << R.get()
- << ")\n");
+ LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
+ << ")\n");
}
return R;
}
-/// \brief Compute the weight of a basic block.
+/// Compute the weight of a basic block.
///
/// The weight of basic block \p BB is the maximum weight of all the
/// instructions in BB.
@@ -599,7 +593,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
}
-/// \brief Compute and store the weights of every basic block.
+/// Compute and store the weights of every basic block.
///
/// This populates the BlockWeights map by computing
/// the weights of every basic block in the CFG.
@@ -607,7 +601,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
/// \param F The function to query.
bool SampleProfileLoader::computeBlockWeights(Function &F) {
bool Changed = false;
- DEBUG(dbgs() << "Block weights\n");
+ LLVM_DEBUG(dbgs() << "Block weights\n");
for (const auto &BB : F) {
ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
if (Weight) {
@@ -615,13 +609,13 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) {
VisitedBlocks.insert(&BB);
Changed = true;
}
- DEBUG(printBlockWeight(dbgs(), &BB));
+ LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
}
return Changed;
}
-/// \brief Get the FunctionSamples for a call instruction.
+/// Get the FunctionSamples for a call instruction.
///
/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
/// instance in which that call instruction is calling to. It contains
@@ -649,8 +643,11 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
if (FS == nullptr)
return nullptr;
- return FS->findFunctionSamplesAt(
- LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName);
+ std::string CalleeGUID;
+ CalleeName = getRepInFormat(CalleeName, Reader->getFormat(), CalleeGUID);
+ return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL),
+ DIL->getBaseDiscriminator()),
+ CalleeName);
}
/// Returns a vector of FunctionSamples that are the indirect call targets
@@ -670,7 +667,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
if (FS == nullptr)
return R;
- uint32_t LineOffset = getOffset(DIL);
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
uint32_t Discriminator = DIL->getBaseDiscriminator();
auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
@@ -678,23 +675,23 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
if (T)
for (const auto &T_C : T.get())
Sum += T_C.second;
- if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(
- LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) {
+ if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(LineLocation(
+ FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()))) {
if (M->empty())
return R;
for (const auto &NameFS : *M) {
Sum += NameFS.second.getEntrySamples();
R.push_back(&NameFS.second);
}
- std::sort(R.begin(), R.end(),
- [](const FunctionSamples *L, const FunctionSamples *R) {
- return L->getEntrySamples() > R->getEntrySamples();
- });
+ llvm::sort(R.begin(), R.end(),
+ [](const FunctionSamples *L, const FunctionSamples *R) {
+ return L->getEntrySamples() > R->getEntrySamples();
+ });
}
return R;
}
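
The std::sort to llvm::sort switch above (and the matching one in SortCallTargets further down) uses the STLExtras helper which, in expensive-checks builds, shuffles the range before sorting to flush out comparators that depend on input order. A minimal hedged sketch with invented data:

    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    static void sortDescending(std::vector<unsigned> &Counts) {
      // Same result as std::sort; the benefit is the randomized pre-shuffle
      // when LLVM is built with expensive checks enabled.
      llvm::sort(Counts.begin(), Counts.end(),
                 [](unsigned L, unsigned R) { return L > R; });
    }
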
-/// \brief Get the FunctionSamples for an instruction.
+/// Get the FunctionSamples for an instruction.
///
/// The FunctionSamples of an instruction \p Inst is the inlined instance
/// in which that instruction is coming from. We traverse the inline stack
@@ -710,20 +707,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
if (!DIL)
return Samples;
- const DILocation *PrevDIL = DIL;
- for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
- S.push_back(std::make_pair(
- LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()),
- PrevDIL->getScope()->getSubprogram()->getLinkageName()));
- PrevDIL = DIL;
- }
- if (S.size() == 0)
- return Samples;
- const FunctionSamples *FS = Samples;
- for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) {
- FS = FS->findFunctionSamplesAt(S[i].first, S[i].second);
- }
- return FS;
+ return Samples->findFunctionSamples(DIL);
}
bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
@@ -759,7 +743,7 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
return false;
}
-/// \brief Iteratively inline hot callsites of a function.
+/// Iteratively inline hot callsites of a function.
///
/// Iteratively traverse all callsites of the function \p F, and find if
/// the corresponding inlined instance exists and is hot in profile. If
@@ -776,6 +760,7 @@ bool SampleProfileLoader::inlineHotFunctions(
Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
DenseSet<Instruction *> PromotedInsns;
bool Changed = false;
+ bool isCompact = (Reader->getFormat() == SPF_Compact_Binary);
while (true) {
bool LocalChanged = false;
SmallVector<Instruction *, 10> CIS;
@@ -787,7 +772,7 @@ bool SampleProfileLoader::inlineHotFunctions(
if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
Candidates.push_back(&I);
- if (callsiteIsHot(Samples, FS))
+ if (callsiteIsHot(FS, PSI))
Hot = true;
}
}
@@ -807,8 +792,8 @@ bool SampleProfileLoader::inlineHotFunctions(
for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
if (IsThinLTOPreLink) {
FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
- Samples->getTotalSamples() *
- SampleProfileHotThreshold / 100);
+ PSI->getOrCompHotCountThreshold(),
+ isCompact);
continue;
}
auto CalleeFunctionName = FS->getName();
@@ -817,7 +802,9 @@ bool SampleProfileLoader::inlineHotFunctions(
// clone the caller first, and inline the cloned caller if it is
// recursive. As llvm does not inline recursive calls, we will
// simply ignore it instead of handling it explicitly.
- if (CalleeFunctionName == F.getName())
+ std::string FGUID;
+ auto Fname = getRepInFormat(F.getName(), Reader->getFormat(), FGUID);
+ if (CalleeFunctionName == Fname)
continue;
const char *Reason = "Callee function not available";
@@ -836,9 +823,9 @@ bool SampleProfileLoader::inlineHotFunctions(
inlineCallInstruction(DI))
LocalChanged = true;
} else {
- DEBUG(dbgs()
- << "\nFailed to promote indirect call to "
- << CalleeFunctionName << " because " << Reason << "\n");
+ LLVM_DEBUG(dbgs()
+ << "\nFailed to promote indirect call to "
+ << CalleeFunctionName << " because " << Reason << "\n");
}
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
@@ -847,8 +834,8 @@ bool SampleProfileLoader::inlineHotFunctions(
LocalChanged = true;
} else if (IsThinLTOPreLink) {
findCalleeFunctionSamples(*I)->findInlinedFunctions(
- InlinedGUIDs, F.getParent(),
- Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
+ InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold(),
+ isCompact);
}
}
if (LocalChanged) {
@@ -860,7 +847,7 @@ bool SampleProfileLoader::inlineHotFunctions(
return Changed;
}
-/// \brief Find equivalence classes for the given block.
+/// Find equivalence classes for the given block.
///
/// This finds all the blocks that are guaranteed to execute the same
/// number of times as \p BB1. To do this, it traverses all the
@@ -917,7 +904,7 @@ void SampleProfileLoader::findEquivalencesFor(
}
}
-/// \brief Find equivalence classes.
+/// Find equivalence classes.
///
/// Since samples may be missing from blocks, we can fill in the gaps by setting
/// the weights of all the blocks in the same equivalence class to the same
@@ -928,14 +915,14 @@ void SampleProfileLoader::findEquivalencesFor(
/// \param F The function to query.
void SampleProfileLoader::findEquivalenceClasses(Function &F) {
SmallVector<BasicBlock *, 8> DominatedBBs;
- DEBUG(dbgs() << "\nBlock equivalence classes\n");
+ LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
// Find equivalence sets based on dominance and post-dominance information.
for (auto &BB : F) {
BasicBlock *BB1 = &BB;
// Compute BB1's equivalence class once.
if (EquivalenceClass.count(BB1)) {
- DEBUG(printBlockEquivalence(dbgs(), BB1));
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
continue;
}
@@ -956,7 +943,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {
DT->getDescendants(BB1, DominatedBBs);
findEquivalencesFor(BB1, DominatedBBs, PDT.get());
- DEBUG(printBlockEquivalence(dbgs(), BB1));
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
}
// Assign weights to equivalence classes.
@@ -965,17 +952,18 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {
// the same number of times. Since we know that the head block in
// each equivalence class has the largest weight, assign that weight
// to all the blocks in that equivalence class.
- DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");
+ LLVM_DEBUG(
+ dbgs() << "\nAssign the same weight to all blocks in the same class\n");
for (auto &BI : F) {
const BasicBlock *BB = &BI;
const BasicBlock *EquivBB = EquivalenceClass[BB];
if (BB != EquivBB)
BlockWeights[BB] = BlockWeights[EquivBB];
- DEBUG(printBlockWeight(dbgs(), BB));
+ LLVM_DEBUG(printBlockWeight(dbgs(), BB));
}
}
-/// \brief Visit the given edge to decide if it has a valid weight.
+/// Visit the given edge to decide if it has a valid weight.
///
/// If \p E has not been visited before, we copy to \p UnknownEdge
/// and increment the count of unknown edges.
@@ -996,7 +984,7 @@ uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
return EdgeWeights[E];
}
-/// \brief Propagate weights through incoming/outgoing edges.
+/// Propagate weights through incoming/outgoing edges.
///
/// If the weight of a basic block is known, and there is only one edge
/// with an unknown weight, we can calculate the weight of that edge.
@@ -1012,7 +1000,7 @@ uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
bool SampleProfileLoader::propagateThroughEdges(Function &F,
bool UpdateBlockCount) {
bool Changed = false;
- DEBUG(dbgs() << "\nPropagation through edges\n");
+ LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
for (const auto &BI : F) {
const BasicBlock *BB = &BI;
const BasicBlock *EC = EquivalenceClass[BB];
@@ -1084,9 +1072,9 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
if (TotalWeight > BBWeight) {
BBWeight = TotalWeight;
Changed = true;
- DEBUG(dbgs() << "All edge weights for " << BB->getName()
- << " known. Set weight for block: ";
- printBlockWeight(dbgs(), BB););
+ LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
+ << " known. Set weight for block: ";
+ printBlockWeight(dbgs(), BB););
}
} else if (NumTotalEdges == 1 &&
EdgeWeights[SingleEdge] < BlockWeights[EC]) {
@@ -1113,8 +1101,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
VisitedEdges.insert(UnknownEdge);
Changed = true;
- DEBUG(dbgs() << "Set weight for edge: ";
- printEdgeWeight(dbgs(), UnknownEdge));
+ LLVM_DEBUG(dbgs() << "Set weight for edge: ";
+ printEdgeWeight(dbgs(), UnknownEdge));
}
} else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
// If a block Weights 0, all its in/out edges should weight 0.
@@ -1140,8 +1128,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
EdgeWeights[SelfReferentialEdge] = 0;
VisitedEdges.insert(SelfReferentialEdge);
Changed = true;
- DEBUG(dbgs() << "Set self-referential edge weight to: ";
- printEdgeWeight(dbgs(), SelfReferentialEdge));
+ LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
+ printEdgeWeight(dbgs(), SelfReferentialEdge));
}
if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
BlockWeights[EC] = TotalWeight;
@@ -1154,7 +1142,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
return Changed;
}
-/// \brief Build in/out edge lists for each basic block in the CFG.
+/// Build in/out edge lists for each basic block in the CFG.
///
/// We are interested in unique edges. If a block B1 has multiple
/// edges to another block B2, we only add a single B1->B2 edge.
@@ -1190,17 +1178,17 @@ static SmallVector<InstrProfValueData, 2> SortCallTargets(
SmallVector<InstrProfValueData, 2> R;
for (auto I = M.begin(); I != M.end(); ++I)
R.push_back({Function::getGUID(I->getKey()), I->getValue()});
- std::sort(R.begin(), R.end(),
- [](const InstrProfValueData &L, const InstrProfValueData &R) {
- if (L.Count == R.Count)
- return L.Value > R.Value;
- else
- return L.Count > R.Count;
- });
+ llvm::sort(R.begin(), R.end(),
+ [](const InstrProfValueData &L, const InstrProfValueData &R) {
+ if (L.Count == R.Count)
+ return L.Value > R.Value;
+ else
+ return L.Count > R.Count;
+ });
return R;
}
-/// \brief Propagate weights into edges
+/// Propagate weights into edges
///
/// The following rules are applied to every block BB in the CFG:
///
@@ -1265,7 +1253,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
// Generate MD_prof metadata for every branch instruction using the
// edge weights computed during propagation.
- DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
+ LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
LLVMContext &Ctx = F.getContext();
MDBuilder MDB(Ctx);
for (auto &BI : F) {
@@ -1281,7 +1269,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
if (!DLoc)
continue;
const DILocation *DIL = DLoc;
- uint32_t LineOffset = getOffset(DIL);
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
uint32_t Discriminator = DIL->getBaseDiscriminator();
const FunctionSamples *FS = findFunctionSamples(I);
@@ -1311,10 +1299,10 @@ void SampleProfileLoader::propagateWeights(Function &F) {
continue;
DebugLoc BranchLoc = TI->getDebugLoc();
- DEBUG(dbgs() << "\nGetting weights for branch at line "
- << ((BranchLoc) ? Twine(BranchLoc.getLine())
- : Twine("<UNKNOWN LOCATION>"))
- << ".\n");
+ LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
+ << ((BranchLoc) ? Twine(BranchLoc.getLine())
+ : Twine("<UNKNOWN LOCATION>"))
+ << ".\n");
SmallVector<uint32_t, 4> Weights;
uint32_t MaxWeight = 0;
Instruction *MaxDestInst;
@@ -1322,12 +1310,12 @@ void SampleProfileLoader::propagateWeights(Function &F) {
BasicBlock *Succ = TI->getSuccessor(I);
Edge E = std::make_pair(BB, Succ);
uint64_t Weight = EdgeWeights[E];
- DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
+ LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
// Use uint32_t saturated arithmetic to adjust the incoming weights,
// if needed. Sample counts in profiles are 64-bit unsigned values,
// but internally branch weights are expressed as 32-bit values.
if (Weight > std::numeric_limits<uint32_t>::max()) {
- DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
+ LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
Weight = std::numeric_limits<uint32_t>::max();
}
// Weight is added by one to avoid propagation errors introduced by
@@ -1348,7 +1336,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
// annotation is done twice. If the first annotation already set the
// weights, the second pass does not need to set it.
if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
- DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
+ LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
TI->setMetadata(LLVMContext::MD_prof,
MDB.createBranchWeights(Weights));
ORE->emit([&]() {
@@ -1357,12 +1345,12 @@ void SampleProfileLoader::propagateWeights(Function &F) {
<< ore::NV("CondBranchesLoc", BranchLoc);
});
} else {
- DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+ LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
}
}
}
-/// \brief Get the line number for the function header.
+/// Get the line number for the function header.
///
/// This looks up function \p F in the current compilation unit and
/// retrieves the line number where the function is defined. This is
@@ -1377,6 +1365,9 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
if (DISubprogram *S = F.getSubprogram())
return S->getLine();
+ if (NoWarnSampleUnused)
+ return 0;
+
// If the start of \p F is missing, emit a diagnostic to inform the user
// about the missed opportunity.
F.getContext().diagnose(DiagnosticInfoSampleProfile(
@@ -1390,14 +1381,13 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
DT.reset(new DominatorTree);
DT->recalculate(F);
- PDT.reset(new PostDomTreeBase<BasicBlock>());
- PDT->recalculate(F);
+ PDT.reset(new PostDominatorTree(F));
LI.reset(new LoopInfo);
LI->analyze(*DT);
}
-/// \brief Generate branch weight metadata for all branches in \p F.
+/// Generate branch weight metadata for all branches in \p F.
///
/// Branch weights are computed out of instruction samples using a
/// propagation heuristic. Propagation proceeds in 3 phases:
@@ -1452,8 +1442,8 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
if (getFunctionLoc(F) == 0)
return false;
- DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
- << ": " << getFunctionLoc(F) << "\n");
+ LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
+ << F.getName() << ": " << getFunctionLoc(F) << "\n");
DenseSet<GlobalValue::GUID> InlinedGUIDs;
Changed |= inlineHotFunctions(F, InlinedGUIDs);
@@ -1467,7 +1457,9 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
// Sets the GUIDs that are inlined in the profiled binary. This is used
// for ThinLink to make correct liveness analysis, and also make the IR
// match the profiled binary before annotation.
- F.setEntryCount(Samples->getHeadSamples() + 1, &InlinedGUIDs);
+ F.setEntryCount(
+ ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
+ &InlinedGUIDs);
// Compute dominance and loop info needed for propagation.
computeDominanceAndLoopInfo(F);
@@ -1481,8 +1473,8 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
// If coverage checking was requested, compute it now.
if (SampleProfileRecordCoverage) {
- unsigned Used = CoverageTracker.countUsedRecords(Samples);
- unsigned Total = CoverageTracker.countBodyRecords(Samples);
+ unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
+ unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileRecordCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
@@ -1495,7 +1487,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
if (SampleProfileSampleCoverage) {
uint64_t Used = CoverageTracker.getTotalUsedSamples();
- uint64_t Total = CoverageTracker.countBodySamples(Samples);
+ uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileSampleCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
@@ -1514,6 +1506,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
@@ -1538,10 +1531,15 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
return new SampleProfileLoaderLegacyPass(Name);
}
-bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) {
+bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI) {
if (!ProfileIsValid)
return false;
+ PSI = _PSI;
+ if (M.getProfileSummary() == nullptr)
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
+
// Compute the total number of samples collected in this profile.
for (const auto &I : Reader->getProfiles())
TotalCollectedSamples += I.second.getTotalSamples();
@@ -1572,22 +1570,22 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) {
clearFunctionData();
retval |= runOnFunction(F, AM);
}
- if (M.getProfileSummary() == nullptr)
- M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
return retval;
}
bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
ACT = &getAnalysis<AssumptionCacheTracker>();
TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- return SampleLoader.runOnModule(M, nullptr);
+ ProfileSummaryInfo *PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ return SampleLoader.runOnModule(M, nullptr, PSI);
}
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
// Initialize the entry count to -1, which will be treated conservatively
// by getEntryCount as the same as unknown (None). If we have samples this
// will be overwritten in emitAnnotations.
- F.setEntryCount(-1);
+ F.setEntryCount(ProfileCount(-1, Function::PCT_Real));
std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
if (AM) {
auto &FAM =
@@ -1622,7 +1620,8 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
SampleLoader.doInitialization(M);
- if (!SampleLoader.runOnModule(M, &AM))
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+ if (!SampleLoader.runOnModule(M, &AM, PSI))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
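
Several call sites above migrate from passing a raw integer to Function::setEntryCount to passing Function::ProfileCount, which records whether the count is real (profile-derived) or synthetic. A hedged sketch of that API as it is used here; the helper below is invented for illustration:

    #include "llvm/IR/Function.h"
    #include <cstdint>

    static void tagRealEntryCount(llvm::Function &F, uint64_t HeadSamples) {
      using ProfileCount = llvm::Function::ProfileCount;
      F.setEntryCount(ProfileCount(HeadSamples + 1, llvm::Function::PCT_Real));

      ProfileCount Count = F.getEntryCount();
      if (Count.hasValue())
        (void)Count.getCount(); // 64-bit value; the kind marks real vs. synthetic
    }
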
diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
index de1b51e206ff..c9afb060a91a 100644
--- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -21,6 +21,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
@@ -30,7 +31,6 @@
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
new file mode 100644
index 000000000000..3c5ad37bced1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
@@ -0,0 +1,140 @@
+//=- SyntheticCountsPropagation.cpp - Propagate function counts --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that synthesizes entry counts for
+// functions and attaches !prof metadata to functions with the synthesized
+// counts. The presence of !prof metadata with counter name set to
+// 'synthesized_function_entry_count' indicate that the value of the counter is
+// an estimation of the likely execution count of the function. This transform
+// is applied only in non PGO mode as functions get 'real' profile-based
+// function entry counts in the PGO mode.
+//
+// The transformation works by first assigning some initial values to the entry
+// counts of all functions and then doing a top-down traversal of the
+// callgraph-scc to propagate the counts. For each function the set of callsites
+// and their relative block frequency is gathered. The relative block frequency
+// multiplied by the entry count of the caller and added to the callee's entry
+// count. For non-trivial SCCs, the new counts are computed from the previous
+// counts and updated in one shot.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/SyntheticCountsUtils.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using Scaled64 = ScaledNumber<uint64_t>;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "synthetic-counts-propagation"
+
+/// Initial synthetic count assigned to functions.
+static cl::opt<int>
+ InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10),
+ cl::ZeroOrMore,
+ cl::desc("Initial value of synthetic entry count."));
+
+/// Initial synthetic count assigned to inline functions.
+static cl::opt<int> InlineSyntheticCount(
+ "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for inline functions."));
+
+/// Initial synthetic count assigned to cold functions.
+static cl::opt<int> ColdSyntheticCount(
+ "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for cold functions."));
+
+// Assign initial synthetic entry counts to functions.
+static void
+initializeCounts(Module &M, function_ref<void(Function *, uint64_t)> SetCount) {
+ auto MayHaveIndirectCalls = [](Function &F) {
+ for (auto *U : F.users()) {
+ if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
+ return true;
+ }
+ return false;
+ };
+
+ for (Function &F : M) {
+ uint64_t InitialCount = InitialSyntheticCount;
+ if (F.isDeclaration())
+ continue;
+ if (F.hasFnAttribute(Attribute::AlwaysInline) ||
+ F.hasFnAttribute(Attribute::InlineHint)) {
+ // Use a higher value for inline functions to account for the fact that
+ // these are usually beneficial to inline.
+ InitialCount = InlineSyntheticCount;
+ } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) {
+ // Local functions without inline hints get counts only through
+ // propagation.
+ InitialCount = 0;
+ } else if (F.hasFnAttribute(Attribute::Cold) ||
+ F.hasFnAttribute(Attribute::NoInline)) {
+ // Use a lower value for noinline and cold functions.
+ InitialCount = ColdSyntheticCount;
+ }
+ SetCount(&F, InitialCount);
+ }
+}
+
+PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ DenseMap<Function *, uint64_t> Counts;
+ // Set initial entry counts.
+ initializeCounts(M, [&](Function *F, uint64_t Count) { Counts[F] = Count; });
+
+ // Compute the relative block frequency for a call edge. Use scaled numbers
+ // and not integers since the relative block frequency could be less than 1.
+ auto GetCallSiteRelFreq = [&](const CallGraphNode::CallRecord &Edge) {
+ Optional<Scaled64> Res = None;
+ if (!Edge.first)
+ return Res;
+ assert(isa<Instruction>(Edge.first));
+ CallSite CS(cast<Instruction>(Edge.first));
+ Function *Caller = CS.getCaller();
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+ BasicBlock *CSBB = CS.getInstruction()->getParent();
+ Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
+ Scaled64 BBFreq(BFI.getBlockFreq(CSBB).getFrequency(), 0);
+ BBFreq /= EntryFreq;
+ return Optional<Scaled64>(BBFreq);
+ };
+
+ CallGraph CG(M);
+  // Propagate the entry counts on the callgraph.
+ SyntheticCountsUtils<const CallGraph *>::propagate(
+ &CG, GetCallSiteRelFreq,
+ [&](const CallGraphNode *N) { return Counts[N->getFunction()]; },
+ [&](const CallGraphNode *N, uint64_t New) {
+ auto F = N->getFunction();
+ if (!F || F->isDeclaration())
+ return;
+ Counts[F] += New;
+ });
+
+ // Set the counts as metadata.
+ for (auto Entry : Counts)
+ Entry.first->setEntryCount(
+ ProfileCount(Entry.second, Function::PCT_Synthetic));
+
+ return PreservedAnalyses::all();
+}
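
A hedged numeric sketch of the propagation step the header comment describes: each call edge contributes (call-site block frequency / caller entry frequency) times the caller's count to the callee. The values and helper below are invented for illustration:

    #include "llvm/Support/ScaledNumber.h"
    #include <cstdint>

    static uint64_t propagateOneEdge(uint64_t CallerCount, uint64_t CalleeCount,
                                     uint64_t CallSiteBlockFreq,
                                     uint64_t CallerEntryFreq) {
      using Scaled64 = llvm::ScaledNumber<uint64_t>;
      // The relative frequency may be below 1, hence scaled numbers.
      Scaled64 RelFreq(CallSiteBlockFreq, 0);
      RelFreq /= Scaled64(CallerEntryFreq, 0);
      RelFreq *= Scaled64(CallerCount, 0);
      // e.g. caller count 10, call site runs half as often as entry -> +5.
      return CalleeCount + RelFreq.toInt<uint64_t>();
    }
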
diff --git a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index caffc03339c4..8fe7ae1282cc 100644
--- a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -18,11 +18,13 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Pass.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionImport.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -128,8 +130,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
}
GO.addMetadata(
LLVMContext::MD_type,
- *MDNode::get(M.getContext(),
- ArrayRef<Metadata *>{MD->getOperand(0), I->second}));
+ *MDNode::get(M.getContext(), {MD->getOperand(0), I->second}));
}
}
}
@@ -169,46 +170,17 @@ void simplifyExternals(Module &M) {
}
}
-void filterModule(
- Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
- for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
- I != E;) {
- GlobalAlias *GA = &*I++;
- if (ShouldKeepDefinition(GA))
- continue;
-
- GlobalObject *GO;
- if (GA->getValueType()->isFunctionTy())
- GO = Function::Create(cast<FunctionType>(GA->getValueType()),
- GlobalValue::ExternalLinkage, "", M);
- else
- GO = new GlobalVariable(
- *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
- nullptr, "", nullptr,
- GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
- GO->takeName(GA);
- GA->replaceAllUsesWith(GO);
- GA->eraseFromParent();
- }
-
- for (Function &F : *M) {
- if (ShouldKeepDefinition(&F))
- continue;
-
- F.deleteBody();
- F.setComdat(nullptr);
- F.clearMetadata();
- }
-
- for (GlobalVariable &GV : M->globals()) {
- if (ShouldKeepDefinition(&GV))
- continue;
-
- GV.setInitializer(nullptr);
- GV.setLinkage(GlobalValue::ExternalLinkage);
- GV.setComdat(nullptr);
- GV.clearMetadata();
- }
+static void
+filterModule(Module *M,
+ function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
+ std::vector<GlobalValue *> V;
+ for (GlobalValue &GV : M->global_values())
+ if (!ShouldKeepDefinition(&GV))
+ V.push_back(&GV);
+
+ for (GlobalValue *GV : V)
+ if (!convertToDeclaration(*GV))
+ GV->eraseFromParent();
}
void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
@@ -228,13 +200,19 @@ void splitAndWriteThinLTOBitcode(
function_ref<AAResults &(Function &)> AARGetter, Module &M) {
std::string ModuleId = getUniqueModuleId(&M);
if (ModuleId.empty()) {
- // We couldn't generate a module ID for this module, just write it out as a
- // regular LTO module.
- WriteBitcodeToFile(&M, OS);
+ // We couldn't generate a module ID for this module, write it out as a
+ // regular LTO module with an index for summary-based dead stripping.
+ ProfileSummaryInfo PSI(M);
+ M.addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, &Index);
+
if (ThinLinkOS)
// We don't have a ThinLTO part, but still write the module to the
// ThinLinkOS if requested so that the expected output file is produced.
- WriteBitcodeToFile(&M, *ThinLinkOS);
+ WriteBitcodeToFile(M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+ &Index);
+
return;
}
@@ -243,10 +221,8 @@ void splitAndWriteThinLTOBitcode(
// Returns whether a global has attached type metadata. Such globals may
// participate in CFI or whole-program devirtualization, so they need to
// appear in the merged module instead of the thin LTO module.
- auto HasTypeMetadata = [&](const GlobalObject *GO) {
- SmallVector<MDNode *, 1> MDs;
- GO->getMetadata(LLVMContext::MD_type, MDs);
- return !MDs.empty();
+ auto HasTypeMetadata = [](const GlobalObject *GO) {
+ return GO->hasMetadata(LLVMContext::MD_type);
};
// Collect the set of virtual functions that are eligible for virtual constant
@@ -287,7 +263,7 @@ void splitAndWriteThinLTOBitcode(
ValueToValueMapTy VMap;
std::unique_ptr<Module> MergedM(
- CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool {
+ CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
if (const auto *C = GV->getComdat())
if (MergedMComdats.count(C))
return true;
@@ -298,6 +274,7 @@ void splitAndWriteThinLTOBitcode(
return false;
}));
StripDebugInfo(*MergedM);
+ MergedM->setModuleInlineAsm("");
for (Function &F : *MergedM)
if (!F.isDeclaration()) {
@@ -328,13 +305,13 @@ void splitAndWriteThinLTOBitcode(
promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+ auto &Ctx = MergedM->getContext();
SmallVector<MDNode *, 8> CfiFunctionMDs;
for (auto V : CfiFunctions) {
Function &F = *cast<Function>(V);
SmallVector<MDNode *, 2> Types;
F.getMetadata(LLVMContext::MD_type, Types);
- auto &Ctx = MergedM->getContext();
SmallVector<Metadata *, 4> Elts;
Elts.push_back(MDString::get(Ctx, F.getName()));
CfiFunctionLinkage Linkage;
@@ -357,6 +334,47 @@ void splitAndWriteThinLTOBitcode(
NMD->addOperand(MD);
}
+ SmallVector<MDNode *, 8> FunctionAliases;
+ for (auto &A : M.aliases()) {
+ if (!isa<Function>(A.getAliasee()))
+ continue;
+
+ auto *F = cast<Function>(A.getAliasee());
+
+ Metadata *Elts[] = {
+ MDString::get(Ctx, A.getName()),
+ MDString::get(Ctx, F->getName()),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())),
+ };
+
+ FunctionAliases.push_back(MDTuple::get(Ctx, Elts));
+ }
+
+ if (!FunctionAliases.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases");
+ for (auto MD : FunctionAliases)
+ NMD->addOperand(MD);
+ }
+
+ SmallVector<MDNode *, 8> Symvers;
+ ModuleSymbolTable::CollectAsmSymvers(M, [&](StringRef Name, StringRef Alias) {
+ Function *F = M.getFunction(Name);
+ if (!F || F->use_empty())
+ return;
+
+ Symvers.push_back(MDTuple::get(
+ Ctx, {MDString::get(Ctx, Name), MDString::get(Ctx, Alias)}));
+ });
+
+ if (!Symvers.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("symvers");
+ for (auto MD : Symvers)
+ NMD->addOperand(MD);
+ }
+
simplifyExternals(*MergedM);
// FIXME: Try to re-use BSI and PFI from the original module here.
@@ -376,10 +394,9 @@ void splitAndWriteThinLTOBitcode(
// be used in the backends, and use that in the minimized bitcode
// produced for the full link.
ModuleHash ModHash = {{0}};
- W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ W.writeModule(M, /*ShouldPreserveUseListOrder=*/false, &Index,
/*GenerateHash=*/true, &ModHash);
- W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
- &MergedMIndex);
+ W.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false, &MergedMIndex);
W.writeSymtab();
W.writeStrtab();
OS << Buffer;
@@ -391,8 +408,8 @@ void splitAndWriteThinLTOBitcode(
Buffer.clear();
BitcodeWriter W2(Buffer);
StripDebugInfo(M);
- W2.writeThinLinkBitcode(&M, Index, ModHash);
- W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
+ W2.writeThinLinkBitcode(M, Index, ModHash);
+ W2.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false,
&MergedMIndex);
W2.writeSymtab();
W2.writeStrtab();
@@ -402,10 +419,8 @@ void splitAndWriteThinLTOBitcode(
// Returns whether this module needs to be split because it uses type metadata.
bool requiresSplit(Module &M) {
- SmallVector<MDNode *, 1> MDs;
for (auto &GO : M.global_objects()) {
- GO.getMetadata(LLVMContext::MD_type, MDs);
- if (!MDs.empty())
+ if (GO.hasMetadata(LLVMContext::MD_type))
return true;
}
@@ -425,13 +440,13 @@ void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
// be used in the backends, and use that in the minimized bitcode
// produced for the full link.
ModuleHash ModHash = {{0}};
- WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
/*GenerateHash=*/true, &ModHash);
// If a minimized bitcode module was requested for the thin link, only
// the information that is needed by thin link will be written in the
// given OS.
if (ThinLinkOS && Index)
- WriteThinLinkBitcodeToFile(&M, *ThinLinkOS, *Index, ModHash);
+ WriteThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash);
}
class WriteThinLTOBitcode : public ModulePass {
diff --git a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 5fbb001216a3..d65da2504db4 100644
--- a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -111,6 +111,12 @@ static cl::opt<std::string> ClWriteSummary(
cl::desc("Write summary to given YAML file after running pass"),
cl::Hidden);
+static cl::opt<unsigned>
+ ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
+ cl::init(10), cl::ZeroOrMore,
+ cl::desc("Maximum number of call targets per "
+ "call site to enable branch funnels"));
+
// Find the minimum offset that we may store a value of size Size bits at. If
// IsAfter is set, look for an offset before the object, otherwise look for an
// offset after the object.
@@ -281,24 +287,11 @@ struct VirtualCallSite {
DebugLoc DLoc = CS->getDebugLoc();
BasicBlock *Block = CS.getParent();
- // In the new pass manager, we can request the optimization
- // remark emitter pass on a per-function-basis, which the
- // OREGetter will do for us.
- // In the old pass manager, this is harder, so we just build
- // a optimization remark emitter on the fly, when we need it.
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- OptimizationRemarkEmitter *ORE;
- if (OREGetter)
- ORE = &OREGetter(F);
- else {
- OwnedORE = make_unique<OptimizationRemarkEmitter>(F);
- ORE = OwnedORE.get();
- }
-
using namespace ore;
- ORE->emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
- << NV("Optimization", OptName) << ": devirtualized a call to "
- << NV("FunctionName", TargetName));
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
+ << NV("Optimization", OptName)
+ << ": devirtualized a call to "
+ << NV("FunctionName", TargetName));
}
void replaceAndErase(
@@ -329,12 +322,17 @@ struct CallSiteInfo {
/// cases we are directly operating on the call sites at the IR level.
std::vector<VirtualCallSite> CallSites;
+ /// Whether all call sites represented by this CallSiteInfo, including those
+ /// in summaries, have been devirtualized. This starts off as true because a
+ /// default constructed CallSiteInfo represents no call sites.
+ bool AllCallSitesDevirted = true;
+
// These fields are used during the export phase of ThinLTO and reflect
// information collected from function summaries.
/// Whether any function summary contains an llvm.assume(llvm.type.test) for
/// this slot.
- bool SummaryHasTypeTestAssumeUsers;
+ bool SummaryHasTypeTestAssumeUsers = false;
/// CFI-specific: a vector containing the list of function summaries that use
/// the llvm.type.checked.load intrinsic and therefore will require
@@ -350,8 +348,22 @@ struct CallSiteInfo {
!SummaryTypeCheckedLoadUsers.empty();
}
- /// As explained in the comment for SummaryTypeCheckedLoadUsers.
- void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); }
+ void markSummaryHasTypeTestAssumeUsers() {
+ SummaryHasTypeTestAssumeUsers = true;
+ AllCallSitesDevirted = false;
+ }
+
+ void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
+ SummaryTypeCheckedLoadUsers.push_back(FS);
+ AllCallSitesDevirted = false;
+ }
+
+ void markDevirt() {
+ AllCallSitesDevirted = true;
+
+ // As explained in the comment for SummaryTypeCheckedLoadUsers.
+ SummaryTypeCheckedLoadUsers.clear();
+ }
};
// Call site information collected for a specific VTableSlot.
@@ -386,7 +398,9 @@ CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS,
unsigned *NumUnsafeUses) {
- findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses});
+ auto &CSI = findCallSiteInfo(CS);
+ CSI.AllCallSitesDevirted = false;
+ CSI.CallSites.push_back({VTable, CS, NumUnsafeUses});
}
struct DevirtModule {
@@ -451,6 +465,12 @@ struct DevirtModule {
VTableSlotInfo &SlotInfo,
WholeProgramDevirtResolution *Res);
+ void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
+ bool &IsExported);
+ void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
+
bool tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
ArrayRef<uint64_t> Args);
@@ -484,6 +504,8 @@ struct DevirtModule {
StringRef Name, IntegerType *IntTy,
uint32_t Storage);
+ Constant *getMemberAddr(const TypeMemberInfo *M);
+
void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
Constant *UniqueMemberAddr);
bool tryUniqueRetValOpt(unsigned BitWidth,
@@ -539,7 +561,16 @@ struct WholeProgramDevirt : public ModulePass {
if (skipModule(M))
return false;
- auto OREGetter = function_ref<OptimizationRemarkEmitter &(Function *)>();
+ // In the new pass manager, we can request the optimization
+ // remark emitter pass on a per-function-basis, which the
+ // OREGetter will do for us.
+ // In the old pass manager, this is harder, so we just build
+ // an optimization remark emitter on the fly, when we need it.
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+ ORE = make_unique<OptimizationRemarkEmitter>(F);
+ return *ORE;
+ };
if (UseCommandLine)
return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter);
@@ -580,7 +611,8 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
};
- if (!DevirtModule(M, AARGetter, OREGetter, nullptr, nullptr).run())
+ if (!DevirtModule(M, AARGetter, OREGetter, ExportSummary, ImportSummary)
+ .run())
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
@@ -588,7 +620,7 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
bool DevirtModule::runForTesting(
Module &M, function_ref<AAResults &(Function &)> AARGetter,
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
- ModuleSummaryIndex Summary;
+ ModuleSummaryIndex Summary(/*HaveGVs=*/false);
// Handle the command-line summary arguments. This code is for testing
// purposes only, so we handle errors directly.
@@ -730,10 +762,9 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
if (VCallSite.NumUnsafeUses)
--*VCallSite.NumUnsafeUses;
}
- if (CSInfo.isExported()) {
+ if (CSInfo.isExported())
IsExported = true;
- CSInfo.markDevirt();
- }
+ CSInfo.markDevirt();
};
Apply(SlotInfo.CSInfo);
for (auto &P : SlotInfo.ConstCSInfo)
@@ -789,6 +820,133 @@ bool DevirtModule::trySingleImplDevirt(
return true;
}
+void DevirtModule::tryICallBranchFunnel(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
+ Triple T(M.getTargetTriple());
+ if (T.getArch() != Triple::x86_64)
+ return;
+
+ if (TargetsForSlot.size() > ClThreshold)
+ return;
+
+ bool HasNonDevirt = !SlotInfo.CSInfo.AllCallSitesDevirted;
+ if (!HasNonDevirt)
+ for (auto &P : SlotInfo.ConstCSInfo)
+ if (!P.second.AllCallSitesDevirted) {
+ HasNonDevirt = true;
+ break;
+ }
+
+ if (!HasNonDevirt)
+ return;
+
+ FunctionType *FT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), {Int8PtrTy}, true);
+ Function *JT;
+ if (isa<MDString>(Slot.TypeID)) {
+ JT = Function::Create(FT, Function::ExternalLinkage,
+ getGlobalName(Slot, {}, "branch_funnel"), &M);
+ JT->setVisibility(GlobalValue::HiddenVisibility);
+ } else {
+ JT = Function::Create(FT, Function::InternalLinkage, "branch_funnel", &M);
+ }
+ JT->addAttribute(1, Attribute::Nest);
+
+ std::vector<Value *> JTArgs;
+ JTArgs.push_back(JT->arg_begin());
+ for (auto &T : TargetsForSlot) {
+ JTArgs.push_back(getMemberAddr(T.TM));
+ JTArgs.push_back(T.Fn);
+ }
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr);
+ Constant *Intr =
+ Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {});
+
+ auto *CI = CallInst::Create(Intr, JTArgs, "", BB);
+ CI->setTailCallKind(CallInst::TCK_MustTail);
+ ReturnInst::Create(M.getContext(), nullptr, BB);
+
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ if (IsExported)
+ Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
+}
+
+void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
+ Constant *JT, bool &IsExported) {
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ if (CSInfo.isExported())
+ IsExported = true;
+ if (CSInfo.AllCallSitesDevirted)
+ return;
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ CallSite CS = VCallSite.CS;
+
+ // Jump tables are only profitable if the retpoline mitigation is enabled.
+ Attribute FSAttr = CS.getCaller()->getFnAttribute("target-features");
+ if (FSAttr.hasAttribute(Attribute::None) ||
+ !FSAttr.getValueAsString().contains("+retpoline"))
+ continue;
+
+ if (RemarksEnabled)
+ VCallSite.emitRemark("branch-funnel", JT->getName(), OREGetter);
+
+ // Pass the address of the vtable in the nest register, which is r10 on
+ // x86_64.
+ std::vector<Type *> NewArgs;
+ NewArgs.push_back(Int8PtrTy);
+ for (Type *T : CS.getFunctionType()->params())
+ NewArgs.push_back(T);
+ PointerType *NewFT = PointerType::getUnqual(
+ FunctionType::get(CS.getFunctionType()->getReturnType(), NewArgs,
+ CS.getFunctionType()->isVarArg()));
+
+ IRBuilder<> IRB(CS.getInstruction());
+ std::vector<Value *> Args;
+ Args.push_back(IRB.CreateBitCast(VCallSite.VTable, Int8PtrTy));
+ for (unsigned I = 0; I != CS.getNumArgOperands(); ++I)
+ Args.push_back(CS.getArgOperand(I));
+
+ CallSite NewCS;
+ if (CS.isCall())
+ NewCS = IRB.CreateCall(IRB.CreateBitCast(JT, NewFT), Args);
+ else
+ NewCS = IRB.CreateInvoke(
+ IRB.CreateBitCast(JT, NewFT),
+ cast<InvokeInst>(CS.getInstruction())->getNormalDest(),
+ cast<InvokeInst>(CS.getInstruction())->getUnwindDest(), Args);
+ NewCS.setCallingConv(CS.getCallingConv());
+
+ AttributeList Attrs = CS.getAttributes();
+ std::vector<AttributeSet> NewArgAttrs;
+ NewArgAttrs.push_back(AttributeSet::get(
+ M.getContext(), ArrayRef<Attribute>{Attribute::get(
+ M.getContext(), Attribute::Nest)}));
+ for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
+ NewArgAttrs.push_back(Attrs.getParamAttributes(I));
+ NewCS.setAttributes(
+ AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs));
+
+ CS->replaceAllUsesWith(NewCS.getInstruction());
+ CS->eraseFromParent();
+
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ // Don't mark as devirtualized because there may be callers compiled without
+ // retpoline mitigation, which would mean that they are lowered to
+ // llvm.type.test and therefore require an llvm.type.test resolution for the
+ // type identifier.
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
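Conceptually, the branch funnel built by tryICallBranchFunnel and wired in here replaces an indirect virtual call with a direct-dispatch helper: the vtable address travels in the nest argument (r10 on x86_64), and the helper compares it against each known vtable and calls the matching implementation directly, so no indirect branch (and no retpoline thunk) is needed. A rough C++ analogue, purely illustrative and not the code the pass emits:

#include <cstdio>

static int implA(int X) { return X + 1; }
static int implB(int X) { return X * 2; }

// Stand-ins for two vtable addresses that are known at LTO time.
static const int VTA = 0, VTB = 0;
static const void *const VTableA = &VTA;
static const void *const VTableB = &VTB;

// 'VTable' plays the role of the nest argument carrying the vtable address.
static int branch_funnel(const void *VTable, int Arg) {
  if (VTable == VTableA)
    return implA(Arg); // direct call, no indirect branch
  if (VTable == VTableB)
    return implB(Arg);
  return 0; // unreachable in practice: all possible targets are known
}

int main() {
  std::printf("%d %d\n", branch_funnel(VTableA, 3), branch_funnel(VTableB, 3));
}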
+
bool DevirtModule::tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
ArrayRef<uint64_t> Args) {
@@ -909,7 +1067,7 @@ Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
// We only need to set metadata if the global is newly created, in which
// case it would not have hidden visibility.
- if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
+ if (GV->hasMetadata(LLVMContext::MD_absolute_symbol))
return C;
auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
@@ -941,6 +1099,12 @@ void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
CSInfo.markDevirt();
}
+Constant *DevirtModule::getMemberAddr(const TypeMemberInfo *M) {
+ Constant *C = ConstantExpr::getBitCast(M->Bits->GV, Int8PtrTy);
+ return ConstantExpr::getGetElementPtr(Int8Ty, C,
+ ConstantInt::get(Int64Ty, M->Offset));
+}
+
bool DevirtModule::tryUniqueRetValOpt(
unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
@@ -960,12 +1124,7 @@ bool DevirtModule::tryUniqueRetValOpt(
// checked for a uniform return value in tryUniformRetValOpt.
assert(UniqueMember);
- Constant *UniqueMemberAddr =
- ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy);
- UniqueMemberAddr = ConstantExpr::getGetElementPtr(
- Int8Ty, UniqueMemberAddr,
- ConstantInt::get(Int64Ty, UniqueMember->Offset));
-
+ Constant *UniqueMemberAddr = getMemberAddr(UniqueMember);
if (CSInfo.isExported()) {
Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
Res->Info = IsOne;
@@ -1352,6 +1511,14 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
break;
}
}
+
+ if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
+ auto *JT = M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
+ Type::getVoidTy(M.getContext()));
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ assert(!IsExported);
+ }
}
void DevirtModule::removeRedundantTypeTests() {
@@ -1421,14 +1588,13 @@ bool DevirtModule::run() {
// FIXME: Only add live functions.
for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers =
- true;
+ CallSlots[{MD, VF.Offset}]
+ .CSInfo.markSummaryHasTypeTestAssumeUsers();
}
}
for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}]
- .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS);
+ CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
}
}
for (const FunctionSummary::ConstVCall &VC :
@@ -1436,7 +1602,7 @@ bool DevirtModule::run() {
for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
CallSlots[{MD, VC.VFunc.Offset}]
.ConstCSInfo[VC.Args]
- .SummaryHasTypeTestAssumeUsers = true;
+ .markSummaryHasTypeTestAssumeUsers();
}
}
for (const FunctionSummary::ConstVCall &VC :
@@ -1444,7 +1610,7 @@ bool DevirtModule::run() {
for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
CallSlots[{MD, VC.VFunc.Offset}]
.ConstCSInfo[VC.Args]
- .SummaryTypeCheckedLoadUsers.push_back(FS);
+ .addSummaryTypeCheckedLoadUser(FS);
}
}
}
@@ -1468,9 +1634,12 @@ bool DevirtModule::run() {
cast<MDString>(S.first.TypeID)->getString())
.WPDRes[S.first.ByteOffset];
- if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) &&
- tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first))
- DidVirtualConstProp = true;
+ if (!trySingleImplDevirt(TargetsForSlot, S.second, Res)) {
+ DidVirtualConstProp |=
+ tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
+
+ tryICallBranchFunnel(TargetsForSlot, S.second, Res, S.first);
+ }
// Collect functions devirtualized at least for one call site for stats.
if (RemarksEnabled)
@@ -1499,23 +1668,10 @@ bool DevirtModule::run() {
for (const auto &DT : DevirtTargets) {
Function *F = DT.second;
- // In the new pass manager, we can request the optimization
- // remark emitter pass on a per-function-basis, which the
- // OREGetter will do for us.
- // In the old pass manager, this is harder, so we just build
- // a optimization remark emitter on the fly, when we need it.
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- OptimizationRemarkEmitter *ORE;
- if (OREGetter)
- ORE = &OREGetter(F);
- else {
- OwnedORE = make_unique<OptimizationRemarkEmitter>(F);
- ORE = OwnedORE.get();
- }
-
using namespace ore;
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
- << "devirtualized " << NV("FunctionName", F->getName()));
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
+ << "devirtualized "
+ << NV("FunctionName", F->getName()));
}
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 688897644848..aa31e0d850dd 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -511,7 +511,8 @@ Value *FAddCombine::performFactorization(Instruction *I) {
}
Value *FAddCombine::simplify(Instruction *I) {
- assert(I->isFast() && "Expected 'fast' instruction");
+ assert(I->hasAllowReassoc() && I->hasNoSignedZeros() &&
+ "Expected 'reassoc'+'nsz' instruction");
// Currently we are not able to handle vector type.
if (I->getType()->isVectorTy())
@@ -855,48 +856,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-/// \brief Return true if we can prove that:
-/// (sub LHS, RHS) === (sub nsw LHS, RHS)
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-/// TODO: Handle this for Vectors.
-bool InstCombiner::willNotOverflowSignedSub(const Value *LHS,
- const Value *RHS,
- const Instruction &CxtI) const {
- // If LHS and RHS each have at least two sign bits, the subtraction
- // cannot overflow.
- if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, &CxtI) > 1)
- return true;
-
- KnownBits LHSKnown = computeKnownBits(LHS, 0, &CxtI);
-
- KnownBits RHSKnown = computeKnownBits(RHS, 0, &CxtI);
-
- // Subtraction of two 2's complement numbers having identical signs will
- // never overflow.
- if ((LHSKnown.isNegative() && RHSKnown.isNegative()) ||
- (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()))
- return true;
-
- // TODO: implement logic similar to checkRippleForAdd
- return false;
-}
-
-/// \brief Return true if we can prove that:
-/// (sub LHS, RHS) === (sub nuw LHS, RHS)
-bool InstCombiner::willNotOverflowUnsignedSub(const Value *LHS,
- const Value *RHS,
- const Instruction &CxtI) const {
- // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
- KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
- KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
- if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
- return true;
-
- return false;
-}
-
// Checks if any operand is negative and we can convert add to sub.
// This function checks for following negative patterns
// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
@@ -964,7 +923,7 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) {
if (!match(Op1, m_Constant(Op1C)))
return nullptr;
- if (Instruction *NV = foldOpWithConstantIntoOperand(Add))
+ if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add))
return NV;
Value *X;
@@ -1031,17 +990,148 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) {
return nullptr;
}
-Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
+// Matches multiplication expression Op * C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns true if such a
+// match is found.
+static bool MatchMul(Value *E, Value *&Op, APInt &C) {
+ const APInt *AI;
+ if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ return false;
+}
+
+// Matches remainder expression Op % C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns the signedness of
+// the remainder operation in IsSigned. Returns true if such a match is
+// found.
+static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) {
+ const APInt *AI;
+ IsSigned = false;
+ if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) {
+ IsSigned = true;
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) {
+ C = *AI + 1;
+ return true;
+ }
+ return false;
+}
+// Matches division expression Op / C with the given signedness as indicated
+// by IsSigned, where C is a constant. Returns the constant value in C and the
+// other operand in Op. Returns true if such a match is found.
+static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) {
+ const APInt *AI;
+ if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (!IsSigned) {
+ if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns whether C0 * C1 with the given signedness overflows.
+static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
+ bool overflow;
+ if (IsSigned)
+ (void)C0.smul_ov(C1, overflow);
+ else
+ (void)C0.umul_ov(C1, overflow);
+ return overflow;
+}
+
+// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
+// does not overflow.
+Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (Value *V =
- SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
+ Value *X, *MulOpV;
+ APInt C0, MulOpC;
+ bool IsSigned;
+ // Match I = X % C0 + MulOpV * C0
+ if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) ||
+ (MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) &&
+ C0 == MulOpC) {
+ Value *RemOpV;
+ APInt C1;
+ bool Rem2IsSigned;
+ // Match MulOpC = RemOpV % C1
+ if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) &&
+ IsSigned == Rem2IsSigned) {
+ Value *DivOpV;
+ APInt DivOpC;
+ // Match RemOpV = X / C0
+ if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
+ C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
+ Value *NewDivisor =
+ ConstantInt::get(X->getType()->getContext(), C0 * C1);
+ return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
+ : Builder.CreateURem(X, NewDivisor, "urem");
+ }
+ }
+ }
+
+ return nullptr;
+}
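The fold relies on the identity X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1), valid whenever C0 * C1 does not overflow (which is why MulWillOverflow is checked above). A small standalone brute-force check of the unsigned case, independent of the pass:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t C0 = 6, C1 = 7; // C0 * C1 = 42, no overflow
  for (uint32_t X = 0; X < 100000; ++X) {
    uint32_t Lhs = X % C0 + ((X / C0) % C1) * C0; // original expression
    uint32_t Rhs = X % (C0 * C1);                 // simplified form
    assert(Lhs == Rhs);
  }
  std::puts("identity holds");
}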
+
+/// Fold
+/// (1 << NBits) - 1
+/// Into:
+/// ~(-(1 << NBits))
+/// Because a 'not' is better for bit-tracking analysis and other transforms
+/// than an 'add'. The new shl is always nsw, and is nuw if old `add` was.
+static Instruction *canonicalizeLowbitMask(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *NBits;
+ if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes())))
+ return nullptr;
+
+ Constant *MinusOne = Constant::getAllOnesValue(NBits->getType());
+ Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask");
+ // Be wary of constant folding.
+ if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) {
+ // Always NSW. But NUW propagates from `add`.
+ BOp->setHasNoSignedWrap();
+ BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ }
+
+ return BinaryOperator::CreateNot(NotMask, I.getName());
+}
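The canonicalization uses the equivalence (1 << N) - 1 == ~(-1 << N), which holds for every in-range shift amount. A quick standalone spot-check, illustrative only:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  for (unsigned N = 0; N < 32; ++N) {
    uint32_t AddForm = (uint32_t{1} << N) - 1; // (1 << N) - 1
    uint32_t NotForm = ~(~uint32_t{0} << N);   // ~(-1 << N): not of the negated shl
    assert(AddForm == NotForm);
  }
  std::puts("low-bit masks agree for all shift amounts");
}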
+
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+ if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -1051,6 +1141,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// FIXME: This should be moved into the above helper function to allow these
// transforms for general constant or constant splat vectors.
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
Type *Ty = I.getType();
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
Value *XorLHS = nullptr; ConstantInt *XorRHS = nullptr;
@@ -1123,6 +1214,14 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = checkForNegativeOperand(I, Builder))
return replaceInstUsesWith(I, V);
+ // (A + 1) + ~B --> A - B
+ // ~B + (A + 1) --> A - B
+ if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))))
+ return BinaryOperator::CreateSub(A, B);
+
+ // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
+ if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
+
// A+B --> A|B iff A and B have no bits set in common.
if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
return BinaryOperator::CreateOr(LHS, RHS);
@@ -1253,26 +1352,15 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
// (add (xor A, B) (and A, B)) --> (or A, B)
- if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
- match(RHS, m_c_And(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateOr(A, B);
-
// (add (and A, B) (xor A, B)) --> (or A, B)
- if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
- match(LHS, m_c_And(m_Specific(A), m_Specific(B))))
+ if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B)))))
return BinaryOperator::CreateOr(A, B);
// (add (or A, B) (and A, B)) --> (add A, B)
- if (match(LHS, m_Or(m_Value(A), m_Value(B))) &&
- match(RHS, m_c_And(m_Specific(A), m_Specific(B)))) {
- I.setOperand(0, A);
- I.setOperand(1, B);
- return &I;
- }
-
// (add (and A, B) (or A, B)) --> (add A, B)
- if (match(RHS, m_Or(m_Value(A), m_Value(B))) &&
- match(LHS, m_c_And(m_Specific(A), m_Specific(B)))) {
+ if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
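Both folds in this hunk follow from A + B == (A ^ B) + 2 * (A & B): adding (A & B) to (A ^ B) gives A + B - (A & B), which is A | B, and adding (A & B) to (A | B) gives A + B. An exhaustive 8-bit check, a standalone sketch rather than InstCombine code:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  for (uint32_t A = 0; A < 256; ++A)
    for (uint32_t B = 0; B < 256; ++B) {
      assert(((A ^ B) + (A & B)) == (A | B)); // (add (xor A, B), (and A, B)) --> (or A, B)
      assert(((A | B) + (A & B)) == (A + B)); // (add (or A, B), (and A, B)) --> (add A, B)
    }
  std::puts("identities hold for all 8-bit pairs");
}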
@@ -1281,6 +1369,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// TODO(jingyue): Consider willNotOverflowSignedAdd and
// willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
+ bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
@@ -1290,39 +1379,35 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
I.setHasNoUnsignedWrap(true);
}
+ if (Instruction *V = canonicalizeLowbitMask(I, Builder))
+ return V;
+
return Changed ? &I : nullptr;
}
Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(),
+ if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (isa<Constant>(RHS))
- if (Instruction *FoldedFAdd = foldOpWithConstantIntoOperand(I))
- return FoldedFAdd;
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
- // -A + B --> B - A
- // -A + -B --> -(A + B)
- if (Value *LHSV = dyn_castFNegVal(LHS)) {
- Instruction *RI = BinaryOperator::CreateFSub(RHS, LHSV);
- RI->copyFastMathFlags(&I);
- return RI;
- }
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
- // A + -B --> A - B
- if (!isa<Constant>(RHS))
- if (Value *V = dyn_castFNegVal(RHS)) {
- Instruction *RI = BinaryOperator::CreateFSub(LHS, V);
- RI->copyFastMathFlags(&I);
- return RI;
- }
+ if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
+ return FoldedFAdd;
+
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Value *X;
+ // (-X) + Y --> Y - X
+ if (match(LHS, m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFSubFMF(RHS, X, &I);
+ // Y + (-X) --> Y - X
+ if (match(RHS, m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFSubFMF(LHS, X, &I);
// Check for (fadd double (sitofp x), y), see if we can merge this into an
// integer add followed by a promotion.
@@ -1386,12 +1471,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
return replaceInstUsesWith(I, V);
- if (I.isFast()) {
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
- return Changed ? &I : nullptr;
+ return nullptr;
}
/// Optimize pointer differences into the same array into a size. Consider:
@@ -1481,21 +1566,20 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
}
Instruction *InstCombiner::visitSub(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V =
- SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// (A*B)-(A*C) -> A*(B-C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
// If this is a 'B = x-(-A)', change to B = x+A.
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = dyn_castNegVal(Op1)) {
BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
@@ -1519,12 +1603,28 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (match(Op0, m_AllOnes()))
return BinaryOperator::CreateNot(Op1);
+ // (~X) - (~Y) --> Y - X
+ Value *X, *Y;
+ if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
+ return BinaryOperator::CreateSub(Y, X);
+
if (Constant *C = dyn_cast<Constant>(Op0)) {
+ bool IsNegate = match(C, m_ZeroInt());
Value *X;
- // C - zext(bool) -> bool ? C - 1 : C
- if (match(Op1, m_ZExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ // 0 - (zext bool) --> sext bool
+ // C - (zext bool) --> bool ? C - 1 : C
+ if (IsNegate)
+ return CastInst::CreateSExtOrBitCast(X, I.getType());
return SelectInst::Create(X, SubOne(C), C);
+ }
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ // 0 - (sext bool) --> zext bool
+ // C - (sext bool) --> bool ? C + 1 : C
+ if (IsNegate)
+ return CastInst::CreateZExtOrBitCast(X, I.getType());
+ return SelectInst::Create(X, AddOne(C), C);
+ }
// C - ~X == X + (1+C)
if (match(Op1, m_Not(m_Value(X))))
@@ -1544,16 +1644,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
Constant *C2;
if (match(Op1, m_Add(m_Value(X), m_Constant(C2))))
return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
-
- // Fold (sub 0, (zext bool to B)) --> (sext bool to B)
- if (C->isNullValue() && match(Op1, m_ZExt(m_Value(X))))
- if (X->getType()->isIntOrIntVectorTy(1))
- return CastInst::CreateSExtOrBitCast(X, Op1->getType());
-
- // Fold (sub 0, (sext bool to B)) --> (zext bool to B)
- if (C->isNullValue() && match(Op1, m_SExt(m_Value(X))))
- if (X->getType()->isIntOrIntVectorTy(1))
- return CastInst::CreateZExtOrBitCast(X, Op1->getType());
}
const APInt *Op0C;
@@ -1575,6 +1665,22 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1);
return BinaryOperator::CreateLShr(X, ShAmtOp);
}
+
+ if (Op1->hasOneUse()) {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS) {
+ // This is a negate of an ABS/NABS pattern. Just swap the operands
+ // of the select.
+ SelectInst *SI = cast<SelectInst>(Op1);
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+ SI->setTrueValue(FalseVal);
+ SI->setFalseValue(TrueVal);
+ // Don't swap prof metadata, we didn't change the branch behavior.
+ return replaceInstUsesWith(I, SI);
+ }
+ }
}
// Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
@@ -1678,6 +1784,27 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
return replaceInstUsesWith(I, Res);
+ // Canonicalize a shifty way to code absolute value to the common pattern.
+ // There are 2 potential commuted variants.
+ // We're relying on the fact that we only do this transform when the shift has
+ // exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase
+ // instructions).
+ Value *A;
+ const APInt *ShAmt;
+ Type *Ty = I.getType();
+ if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) {
+ // B = ashr i32 A, 31 ; smear the sign bit
+ // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1)
+ // --> (A < 0) ? -A : A
+ Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
+ // Copy the nuw/nsw flags from the sub to the negate.
+ Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(),
+ I.hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
+
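This is the classic branch-free absolute value: with B = A >> 31 (assuming the usual arithmetic right shift for signed values), B is 0 or -1, so (A ^ B) - B yields A for non-negative A and -A otherwise, exactly the select form created above. A standalone spot-check under that assumption:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const int32_t Samples[] = {0, 1, -1, 7, -7, 123456, -123456,
                             INT32_MAX, INT32_MIN + 1};
  for (int32_t A : Samples) {
    int32_t B = A >> 31;             // smear the sign bit: 0 or -1
    int32_t Shifty = (A ^ B) - B;    // flip bits if negative, then add 1
    int32_t Select = A < 0 ? -A : A; // the canonical select form
    assert(Shifty == Select);
  }
  std::puts("abs patterns agree");
}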
bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
Changed = true;
@@ -1692,21 +1819,32 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
}
Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(),
+ if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
+ // Subtraction from -0.0 is the canonical form of fneg.
// fsub nsz 0, X ==> fsub nsz -0.0, X
- if (I.getFastMathFlags().noSignedZeros() && match(Op0, m_Zero())) {
- // Subtraction from -0.0 is the canonical form of fneg.
- Instruction *NewI = BinaryOperator::CreateFNeg(Op1);
- NewI->copyFastMathFlags(&I);
- return NewI;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (I.hasNoSignedZeros() && match(Op0, m_PosZeroFP()))
+ return BinaryOperator::CreateFNegFMF(Op1, &I);
+
+ // If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
+ // Canonicalize to fadd to make analysis easier.
+ // This can also help codegen because fadd is commutative.
+ // Note that if this fsub was really an fneg, the fadd with -0.0 will get
+ // killed later. We still limit that particular transform with 'hasOneUse'
+ // because an fneg is assumed better/cheaper than a generic fsub.
+ Value *X, *Y;
+ if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
+ if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
+ Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
+ }
}
if (isa<Constant>(Op0))
@@ -1714,34 +1852,34 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (Instruction *NV = FoldOpIntoSelect(I, SI))
return NV;
- // If this is a 'B = x-(-A)', change to B = x+A, potentially looking
- // through FP extensions/truncations along the way.
- if (Value *V = dyn_castFNegVal(Op1)) {
- Instruction *NewI = BinaryOperator::CreateFAdd(Op0, V);
- NewI->copyFastMathFlags(&I);
- return NewI;
- }
- if (FPTruncInst *FPTI = dyn_cast<FPTruncInst>(Op1)) {
- if (Value *V = dyn_castFNegVal(FPTI->getOperand(0))) {
- Value *NewTrunc = Builder.CreateFPTrunc(V, I.getType());
- Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewTrunc);
- NewI->copyFastMathFlags(&I);
- return NewI;
- }
- } else if (FPExtInst *FPEI = dyn_cast<FPExtInst>(Op1)) {
- if (Value *V = dyn_castFNegVal(FPEI->getOperand(0))) {
- Value *NewExt = Builder.CreateFPExt(V, I.getType());
- Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewExt);
- NewI->copyFastMathFlags(&I);
- return NewI;
- }
+ // X - C --> X + (-C)
+ // But don't transform constant expressions because there's an inverse fold
+ // for X + (-Y) --> X - Y.
+ Constant *C;
+ if (match(Op1, m_Constant(C)) && !isa<ConstantExpr>(Op1))
+ return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
+
+ // X - (-Y) --> X + Y
+ if (match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
+
+ // Similar to above, but look through a cast of the negated value:
+ // X - (fptrunc(-Y)) --> X + fptrunc(Y)
+ if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y)))))) {
+ Value *TruncY = Builder.CreateFPTrunc(Y, I.getType());
+ return BinaryOperator::CreateFAddFMF(Op0, TruncY, &I);
+ }
+ // X - (fpext(-Y)) --> X + fpext(Y)
+ if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y)))))) {
+ Value *ExtY = Builder.CreateFPExt(Y, I.getType());
+ return BinaryOperator::CreateFAddFMF(Op0, ExtY, &I);
}
// Handle special cases for FSub with selects feeding the operation
if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
return replaceInstUsesWith(I, V);
- if (I.isFast()) {
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 2364202e5b69..372bc41f780e 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -14,10 +14,10 @@
#include "InstCombineInternal.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@@ -75,7 +75,7 @@ static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
return Builder.CreateFCmp(Pred, LHS, RHS);
}
-/// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
+/// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
/// \param I Binary operator to transform.
/// \return Pointer to node that must replace the original binary operator, or
@@ -305,17 +305,21 @@ static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pre
}
/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
-/// Return the set of pattern classes (from MaskedICmpType) that both LHS and
-/// RHS satisfy.
-static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
- Value *&D, Value *&E, ICmpInst *LHS,
- ICmpInst *RHS,
- ICmpInst::Predicate &PredL,
- ICmpInst::Predicate &PredR) {
+/// Return the pattern classes (from MaskedICmpType) for the left hand side and
+/// the right hand side as a pair.
+/// LHS and RHS are the left hand side and the right hand side ICmps and PredL
+/// and PredR are their predicates, respectively.
+static
+Optional<std::pair<unsigned, unsigned>>
+getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+ Value *&D, Value *&E, ICmpInst *LHS,
+ ICmpInst *RHS,
+ ICmpInst::Predicate &PredL,
+ ICmpInst::Predicate &PredR) {
// vectors are not (yet?) supported. Don't support pointers either.
if (!LHS->getOperand(0)->getType()->isIntegerTy() ||
!RHS->getOperand(0)->getType()->isIntegerTy())
- return 0;
+ return None;
// Here comes the tricky part:
// LHS might be of the form L11 & L12 == X, X == L21 & L22,
@@ -346,7 +350,7 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
// Bail if LHS was a icmp that can't be decomposed into an equality.
if (!ICmpInst::isEquality(PredL))
- return 0;
+ return None;
Value *R1 = RHS->getOperand(0);
Value *R2 = RHS->getOperand(1);
@@ -360,7 +364,7 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
A = R12;
D = R11;
} else {
- return 0;
+ return None;
}
E = R2;
R1 = nullptr;
@@ -388,7 +392,7 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
// Bail if RHS was a icmp that can't be decomposed into an equality.
if (!ICmpInst::isEquality(PredR))
- return 0;
+ return None;
// Look for ANDs on the right side of the RHS icmp.
if (!Ok) {
@@ -408,11 +412,11 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
E = R1;
Ok = true;
} else {
- return 0;
+ return None;
}
}
if (!Ok)
- return 0;
+ return None;
if (L11 == A) {
B = L12;
@@ -430,7 +434,174 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
unsigned RightType = getMaskedICmpType(A, D, E, PredR);
- return LeftType & RightType;
+ return Optional<std::pair<unsigned, unsigned>>(std::make_pair(LeftType, RightType));
+}
+
+/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
+/// and the right hand side is of type BMask_Mixed. For example,
+/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
+static Value * foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+ Value *A, Value *B, Value *C, Value *D, Value *E,
+ ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
+ llvm::InstCombiner::BuilderTy &Builder) {
+ // We are given the canonical form:
+ // (icmp ne (A & B), 0) & (icmp eq (A & D), E).
+ // where D & E == E.
+ //
+ // If IsAnd is false, we get it in negated form:
+ // (icmp eq (A & B), 0) | (icmp ne (A & D), E) ->
+ // !((icmp ne (A & B), 0) & (icmp eq (A & D), E)).
+ //
+ // We currently handle the case where B, C, D and E are constants.
+ //
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+ if (!BCst)
+ return nullptr;
+ ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+ if (!CCst)
+ return nullptr;
+ ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+ if (!DCst)
+ return nullptr;
+ ConstantInt *ECst = dyn_cast<ConstantInt>(E);
+ if (!ECst)
+ return nullptr;
+
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+
+ // Update E to the canonical form when D is a power of two and RHS is
+ // canonicalized as,
+ // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or
+ // (icmp ne (A & D), D) -> (icmp eq (A & D), 0).
+ if (PredR != NewCC)
+ ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+ // If B or D is zero, skip: LHS or RHS can then be trivially folded by other
+ // folding rules, and this pattern won't apply any more.
+ if (BCst->getValue() == 0 || DCst->getValue() == 0)
+ return nullptr;
+
+ // If B and D don't intersect, ie. (B & D) == 0, no folding because we can't
+ // deduce anything from it.
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if ((BCst->getValue() & DCst->getValue()) == 0)
+ return nullptr;
+
+ // If the following two conditions are met:
+ //
+ // 1. mask B covers only a single bit that's not covered by mask D, that is,
+ // (B & (B ^ D)) is a power of 2 (in other words, B minus the intersection of
+ // B and D has only one bit set) and,
+ //
+ // 2. RHS (and E) indicates that the rest of B's bits are zero (in other
+ // words, the intersection of B and D is zero), that is, ((B & D) & E) == 0
+ //
+ // then that single bit in B must be one and thus the whole expression can be
+ // folded to
+ // (A & (B | D)) == (B & (B ^ D)) | E.
+ //
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
+ if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) &&
+ (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) {
+ APInt BorD = BCst->getValue() | DCst->getValue();
+ APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) |
+ ECst->getValue();
+ Value *NewMask = ConstantInt::get(BCst->getType(), BorD);
+ Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE);
+ Value *NewAnd = Builder.CreateAnd(A, NewMask);
+ return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue);
+ }
+
+ auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C1->getValue();
+ };
+ auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C2->getValue();
+ };
+
+ // In the following, we consider only the cases where B is a superset of D, B
+ // is a subset of D, or B == D because otherwise there's at least one bit
+ // covered by B but not D, in which case we can't deduce much from it, so
+ // no folding (aside from the single must-be-one bit case right above.)
+ // For example,
+ // (icmp ne (A & 14), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if (!IsSubSetOrEqual(BCst, DCst) && !IsSuperSetOrEqual(BCst, DCst))
+ return nullptr;
+
+ // At this point, either B is a superset of D, B is a subset of D or B == D.
+
+ // If E is zero, if B is a subset of (or equal to) D, LHS and RHS contradict
+ // and the whole expression becomes false (or true if negated), otherwise, no
+ // folding.
+ // For example,
+ // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false.
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding.
+ if (ECst->isZero()) {
+ if (IsSubSetOrEqual(BCst, DCst))
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+ return nullptr;
+ }
+
+ // At this point, B, D, E aren't zero and (B & D) == B, (B & D) == D or B ==
+ // D. If B is a superset of (or equal to) D, since E is not zero, LHS is
+ // subsumed by RHS (RHS implies LHS.) So the whole expression becomes
+ // RHS. For example,
+ // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ if (IsSuperSetOrEqual(BCst, DCst))
+ return RHS;
+ // Otherwise, B is a subset of D. If B and E have a common bit set,
+ // ie. (B & E) != 0, then LHS is subsumed by RHS. For example.
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code");
+ if ((BCst->getValue() & ECst->getValue()) != 0)
+ return RHS;
+ // Otherwise, LHS and RHS contradict and the whole expression becomes false
+ // (or true if negated.) For example,
+ // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false.
+ // (icmp ne (A & 6), 0) & (icmp eq (A & 15), 8) -> false.
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+}
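The worked examples in the comments can be verified exhaustively for small bit-widths. A standalone check of one of them, (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9), over all 8-bit values, illustrative only:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  for (uint32_t A = 0; A < 256; ++A) {
    bool Lhs = ((A & 12) != 0) && ((A & 7) == 1); // Mask_NotAllZeros & BMask_Mixed
    bool Rhs = (A & 15) == 9;                     // folded form
    assert(Lhs == Rhs);
  }
  std::puts("fold verified for all 8-bit values");
}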
+
+/// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side
+/// aren't of the common mask pattern type.
+static Value *foldLogOpOfMaskedICmpsAsymmetric(
+ ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+ Value *A, Value *B, Value *C, Value *D, Value *E,
+ ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
+ unsigned LHSMask, unsigned RHSMask,
+ llvm::InstCombiner::BuilderTy &Builder) {
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
+ // Handle Mask_NotAllZeros-BMask_Mixed cases.
+ // (icmp ne/eq (A & B), C) &/| (icmp eq/ne (A & D), E), or
+ // (icmp eq/ne (A & B), C) &/| (icmp ne/eq (A & D), E)
+ // which gets swapped to
+ // (icmp ne/eq (A & D), E) &/| (icmp eq/ne (A & B), C).
+ if (!IsAnd) {
+ LHSMask = conjugateICmpMask(LHSMask);
+ RHSMask = conjugateICmpMask(RHSMask);
+ }
+ if ((LHSMask & Mask_NotAllZeros) && (RHSMask & BMask_Mixed)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ LHS, RHS, IsAnd, A, B, C, D, E,
+ PredL, PredR, Builder)) {
+ return V;
+ }
+ } else if ((LHSMask & BMask_Mixed) && (RHSMask & Mask_NotAllZeros)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ RHS, LHS, IsAnd, A, D, E, B, C,
+ PredR, PredL, Builder)) {
+ return V;
+ }
+ }
+ return nullptr;
}
/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
@@ -439,13 +610,24 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
llvm::InstCombiner::BuilderTy &Builder) {
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- unsigned Mask =
+ Optional<std::pair<unsigned, unsigned>> MaskPair =
getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
- if (Mask == 0)
+ if (!MaskPair)
return nullptr;
-
assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
"Expected equality predicates for masked type of icmps.");
+ unsigned LHSMask = MaskPair->first;
+ unsigned RHSMask = MaskPair->second;
+ unsigned Mask = LHSMask & RHSMask;
+ if (Mask == 0) {
+ // Even if the two sides don't share a common pattern, check if folding can
+ // still happen.
+ if (Value *V = foldLogOpOfMaskedICmpsAsymmetric(
+ LHS, RHS, IsAnd, A, B, C, D, E, PredL, PredR, LHSMask, RHSMask,
+ Builder))
+ return V;
+ return nullptr;
+ }
// In full generality:
// (icmp (A & B) Op C) | (icmp (A & D) Op E)
@@ -939,8 +1121,8 @@ Value *InstCombiner::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd)
return nullptr;
// FCmp canonicalization ensures that (fcmp ord/uno X, X) and
- // (fcmp ord/uno X, C) will be transformed to (fcmp X, 0.0).
- if (match(LHS1, m_Zero()) && LHS1 == RHS1)
+ // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0).
+ if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP()))
// Ignore the constants because they are obviously not NANs:
// (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y)
// (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y)
@@ -1106,8 +1288,8 @@ static Instruction *foldAndToXor(BinaryOperator &I,
// Operand complexity canonicalization guarantees that the 'or' is Op0.
// (A | B) & ~(A & B) --> A ^ B
// (A | B) & ~(B & A) --> A ^ B
- if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B)))))
+ if (match(&I, m_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_Not(m_c_And(m_Deferred(A), m_Deferred(B))))))
return BinaryOperator::CreateXor(A, B);
// (A | ~B) & (~A | B) --> ~(A ^ B)
@@ -1115,8 +1297,8 @@ static Instruction *foldAndToXor(BinaryOperator &I,
// (~B | A) & (~A | B) --> ~(A ^ B)
// (~B | A) & (B | ~A) --> ~(A ^ B)
if (Op0->hasOneUse() || Op1->hasOneUse())
- if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B))))
+ if (match(&I, m_BinOp(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
return nullptr;
@@ -1148,18 +1330,86 @@ static Instruction *foldOrToXor(BinaryOperator &I,
return nullptr;
}
+/// Return true if a constant shift amount is always less than the specified
+/// bit-width. If not, the shift could create poison in the narrower type.
+static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
+ if (auto *ScalarC = dyn_cast<ConstantInt>(C))
+ return ScalarC->getZExtValue() < BitWidth;
+
+ if (C->getType()->isVectorTy()) {
+ // Check each element of a constant vector.
+ unsigned NumElts = C->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt)
+ return false;
+ if (isa<UndefValue>(Elt))
+ continue;
+ auto *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || CI->getZExtValue() >= BitWidth)
+ return false;
+ }
+ return true;
+ }
+
+ // The constant is a constant expression or unknown.
+ return false;
+}
+
+/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and
+/// a common zext operand: and (binop (zext X), C), (zext X).
+Instruction *InstCombiner::narrowMaskedBinOp(BinaryOperator &And) {
+ // This transform could also apply to {or, and, xor}, but there are better
+ // folds for those cases, so we don't expect those patterns here. AShr is not
+ // handled because it should always be transformed to LShr in this sequence.
+ // The subtract transform is different because it has a constant on the left.
+ // Add/mul commute the constant to RHS; sub with constant RHS becomes add.
+ Value *Op0 = And.getOperand(0), *Op1 = And.getOperand(1);
+ Constant *C;
+ if (!match(Op0, m_OneUse(m_Add(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Mul(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_LShr(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Shl(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Sub(m_Constant(C), m_Specific(Op1)))))
+ return nullptr;
+
+ Value *X;
+ if (!match(Op1, m_ZExt(m_Value(X))) || Op1->hasNUsesOrMore(3))
+ return nullptr;
+
+ Type *Ty = And.getType();
+ if (!isa<VectorType>(Ty) && !shouldChangeType(Ty, X->getType()))
+ return nullptr;
+
+ // If we're narrowing a shift, the shift amount must be safe (less than the
+ // width) in the narrower type. If the shift amount is greater, instsimplify
+ // usually handles that case, but we can't guarantee/assert it.
+ Instruction::BinaryOps Opc = cast<BinaryOperator>(Op0)->getOpcode();
+ if (Opc == Instruction::LShr || Opc == Instruction::Shl)
+ if (!canNarrowShiftAmt(C, X->getType()->getScalarSizeInBits()))
+ return nullptr;
+
+ // and (sub C, (zext X)), (zext X) --> zext (and (sub C', X), X)
+ // and (binop (zext X), C), (zext X) --> zext (and (binop X, C'), X)
+ Value *NewC = ConstantExpr::getTrunc(C, X->getType());
+ Value *NewBO = Opc == Instruction::Sub ? Builder.CreateBinOp(Opc, NewC, X)
+ : Builder.CreateBinOp(Opc, X, NewC);
+ return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty);
+}
+
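The narrowing is sound because the outer 'and' with zext(X) clears every bit where the wide and narrow computations could differ. A standalone C++ sketch of the identity, using i8/i32 as stand-ins for the narrow and wide types (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C = 0x1234ABCDu;      // wide constant
      const uint8_t NarrowC = (uint8_t)C;  // trunc of the constant
      for (unsigned i = 0; i < 256; ++i) {
        uint8_t X = (uint8_t)i;
        uint32_t ZextX = X;                // zext i8 -> i32

        // and (add (zext X), C), (zext X) == zext (and (add X, trunc C), X)
        uint32_t Wide = (ZextX + C) & ZextX;
        uint32_t Narrow = (uint8_t)((uint8_t)(X + NarrowC) & X);
        assert(Wide == Narrow);

        // Same for shl, provided the shift amount fits the narrow type,
        // which is the check canNarrowShiftAmt performs above.
        uint32_t WideShl = (ZextX << 3) & ZextX;
        uint32_t NarrowShl = (uint8_t)((uint8_t)(X << 3) & X);
        assert(WideShl == NarrowShl);
      }
      return 0;
    }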
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyAndInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
@@ -1177,6 +1427,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I, Builder))
return replaceInstUsesWith(I, V);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
const APInt *C;
if (match(Op1, m_APInt(C))) {
Value *X, *Y;
@@ -1289,9 +1540,11 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
}
}
- if (isa<Constant>(Op1))
- if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
- return FoldedLogic;
+ if (Instruction *Z = narrowMaskedBinOp(I))
+ return Z;
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
return DeMorgan;
@@ -1397,7 +1650,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, Op0, Constant::getNullValue(I.getType()));
- return Changed ? &I : nullptr;
+ return nullptr;
}
/// Given an OR instruction, check to see if this is a bswap idiom. If so,
@@ -1424,7 +1677,18 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) &&
match(Op1, m_And(m_Value(), m_Value()));
- if (!OrOfOrs && !OrOfShifts && !OrOfAnds)
+ // (A << B) | (C & D) -> bswap if possible.
+ // The bigger pattern here is ((A & C1) << C2) | ((B >> C2) & C1), which is a
+ // part of the bswap idiom for specific values of C1, C2 (e.g. C1 = 16711935,
+ // C2 = 8 for i32).
+ // This pattern can occur when the operands of the 'or' are not canonicalized
+ // for some reason (not having only one use, for example).
+ bool OrOfAndAndSh = (match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
+ match(Op1, m_And(m_Value(), m_Value()))) ||
+ (match(Op0, m_And(m_Value(), m_Value())) &&
+ match(Op1, m_LogicalShift(m_Value(), m_Value())));
+
+ if (!OrOfOrs && !OrOfShifts && !OrOfAnds && !OrOfAndAndSh)
return nullptr;
SmallVector<Instruction*, 4> Insts;
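For reference, the C1 = 16711935 (0x00FF00FF), C2 = 8 case mentioned above is one half of the classic two-step i32 byte swap. A standalone C++ sketch (illustrative only, not part of the patch; __builtin_bswap32 is assumed available, as in GCC/Clang, purely as the reference value):

    #include <cassert>
    #include <cstdint>

    static uint32_t bswapIdiom(uint32_t X) {
      // ((A & C1) << C2) | ((B >> C2) & C1) with C1 = 0x00FF00FF, C2 = 8:
      // swaps the bytes within each 16-bit half...
      X = ((X & 0x00FF00FFu) << 8) | ((X >> 8) & 0x00FF00FFu);
      // ...and a 16-bit rotate then swaps the halves, completing the byte swap.
      return (X << 16) | (X >> 16);
    }

    int main() {
      const uint32_t Samples[] = {0u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu};
      for (uint32_t V : Samples)
        assert(bswapIdiom(V) == __builtin_bswap32(V));
      return 0;
    }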
@@ -1448,7 +1712,6 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
return false;
// One element must be all ones, and the other must be all zeros.
- // FIXME: Allow undef elements.
if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
(match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
return false;
@@ -1755,14 +2018,15 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitOr(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyOrInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
@@ -1780,14 +2044,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I, Builder))
return replaceInstUsesWith(I, V);
- if (isa<Constant>(Op1))
- if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
- return FoldedLogic;
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
// Given an OR instruction, check to see if this is a bswap.
if (Instruction *BSwap = MatchBSwap(I))
return BSwap;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
{
Value *A;
const APInt *C;
@@ -2027,7 +2291,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
}
}
- return Changed ? &I : nullptr;
+ return nullptr;
}
/// A ^ B can be specified using other logic ops in a variety of patterns. We
@@ -2045,10 +2309,8 @@ static Instruction *foldXorToXor(BinaryOperator &I,
// (A & B) ^ (B | A) -> A ^ B
// (A | B) ^ (A & B) -> A ^ B
// (A | B) ^ (B & A) -> A ^ B
- if ((match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) ||
- (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_And(m_Specific(A), m_Specific(B))))) {
+ if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)),
+ m_c_Or(m_Deferred(A), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2058,10 +2320,8 @@ static Instruction *foldXorToXor(BinaryOperator &I,
// (~B | A) ^ (~A | B) -> A ^ B
// (~A | B) ^ (A | ~B) -> A ^ B
// (B | ~A) ^ (A | ~B) -> A ^ B
- if ((match(Op0, m_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) ||
- (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Not(m_Specific(B)))))) {
+ if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2071,10 +2331,8 @@ static Instruction *foldXorToXor(BinaryOperator &I,
// (~B & A) ^ (~A & B) -> A ^ B
// (~A & B) ^ (A & ~B) -> A ^ B
// (B & ~A) ^ (A & ~B) -> A ^ B
- if ((match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) ||
- (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))))) {
+ if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))),
+ m_c_And(m_Not(m_Deferred(A)), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2113,6 +2371,34 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
}
}
+ // TODO: This can be generalized to compares of non-signbits using
+ // decomposeBitTestICmp(). It could be enhanced more by using (something like)
+ // foldLogOpOfMaskedICmps().
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
+ Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
+ if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
+ LHS0->getType() == RHS0->getType()) {
+ // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
+ // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) {
+ Value *Zero = ConstantInt::getNullValue(LHS0->getType());
+ return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero);
+ }
+ // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
+ // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) {
+ Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType());
+ return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne);
+ }
+ }
+
// Instead of trying to imitate the folds for and/or, decompose this 'xor'
// into those logic ops. That is, try to turn this into an and-of-icmps
// because we have many folds for that pattern.
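Both rewrites above reduce a pair of sign tests to a single sign test of the xor. A standalone C++ sketch that checks them exhaustively on int8_t (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int x = -128; x <= 127; ++x) {
        for (int y = -128; y <= 127; ++y) {
          int8_t X = (int8_t)x, Y = (int8_t)y, XorXY = (int8_t)(X ^ Y);
          // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
          assert(((X > -1) ^ (Y > -1)) == (XorXY < 0));
          // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
          assert(((X > -1) ^ (Y < 0)) == (XorXY > -1));
        }
      }
      return 0;
    }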
@@ -2140,18 +2426,63 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
return nullptr;
}
+/// If we have a masked merge, in the canonical form of:
+/// (assuming that A only has one use.)
+///   |        A  |  |B|
+///   ((x ^ y) & M) ^ y
+///    |  D     |
+/// * If M is inverted:
+///      |  D  |
+///   ((x ^ y) & ~M) ^ y
+///   We can canonicalize by swapping the final xor operand
+///   to eliminate the 'not' of the mask.
+///   ((x ^ y) & M) ^ x
+/// * If M is a constant, and D has one use, we transform to 'and' / 'or' ops
+///   because that shortens the dependency chain and improves analysis:
+///   (x & M) | (y & ~M)
+static Instruction *visitMaskedMerge(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *B, *X, *D;
+ Value *M;
+ if (!match(&I, m_c_Xor(m_Value(B),
+ m_OneUse(m_c_And(
+ m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)),
+ m_Value(D)),
+ m_Value(M))))))
+ return nullptr;
+
+ Value *NotM;
+ if (match(M, m_Not(m_Value(NotM)))) {
+ // De-invert the mask and swap the value in B part.
+ Value *NewA = Builder.CreateAnd(D, NotM);
+ return BinaryOperator::CreateXor(NewA, X);
+ }
+
+ Constant *C;
+ if (D->hasOneUse() && match(M, m_Constant(C))) {
+ // Unfold.
+ Value *LHS = Builder.CreateAnd(X, C);
+ Value *NotC = Builder.CreateNot(C);
+ Value *RHS = Builder.CreateAnd(B, NotC);
+ return BinaryOperator::CreateOr(LHS, RHS);
+ }
+
+ return nullptr;
+}
+
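The constant-mask unfold is the usual masked-merge identity. A standalone C++ sketch (illustrative only, not part of the patch; the sample values are arbitrary):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Samples[] = {0u, 0xFFFFFFFFu, 0x12345678u,
                                  0xA5A5A5A5u, 0x0F0F0F0Fu};
      for (uint32_t X : Samples) {
        for (uint32_t Y : Samples) {
          for (uint32_t M : Samples) {
            uint32_t Merge = ((X ^ Y) & M) ^ Y;   // canonical masked merge
            uint32_t Unfold = (X & M) | (Y & ~M); // and/or form used for constant M
            assert(Merge == Unfold);
          }
        }
      }
      return 0;
    }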
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitXor(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyXorInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *NewXor = foldXorToXor(I, Builder))
return NewXor;
@@ -2168,6 +2499,11 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I, Builder))
return replaceInstUsesWith(I, V);
+ // A^B --> A|B iff A and B have no bits set in common.
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (haveNoCommonBitsSet(Op0, Op1, DL, &AC, &I, &DT))
+ return BinaryOperator::CreateOr(Op0, Op1);
+
// Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
Value *X, *Y;
@@ -2186,6 +2522,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
return BinaryOperator::CreateAnd(X, NotY);
}
+ if (Instruction *Xor = visitMaskedMerge(I, Builder))
+ return Xor;
+
// Is this a 'not' (~) fed by a binary operator?
BinaryOperator *NotVal;
if (match(&I, m_Not(m_BinOp(NotVal)))) {
@@ -2206,6 +2545,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
+ // ~(X - Y) --> ~X + Y
+ if (match(NotVal, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateAdd(Builder.CreateNot(X), Y);
+
// ~(~X >>s Y) --> (X >>s Y)
if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
return BinaryOperator::CreateAShr(X, Y);
@@ -2214,16 +2557,18 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
// the 'not' by inverting the constant and using the opposite shift type.
// Canonicalization rules ensure that only a negative constant uses 'ashr',
// but we must check that in case that transform has not fired yet.
- const APInt *C;
- if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ Constant *C;
+ if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_Negative())) {
// ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
- Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ Constant *NotC = ConstantExpr::getNot(C);
return BinaryOperator::CreateLShr(NotC, Y);
}
- if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_NonNegative())) {
// ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
- Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ Constant *NotC = ConstantExpr::getNot(C);
return BinaryOperator::CreateAShr(NotC, Y);
}
}
@@ -2305,9 +2650,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
- if (isa<Constant>(Op1))
- if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
- return FoldedLogic;
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
{
Value *A, *B;
@@ -2397,25 +2741,59 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
return CastedXor;
- // Canonicalize the shifty way to code absolute value to the common pattern.
+ // Canonicalize a shifty way to code absolute value to the common pattern.
// There are 4 potential commuted variants. Move the 'ashr' candidate to Op1.
// We're relying on the fact that we only do this transform when the shift has
// exactly 2 uses and the add has exactly 1 use (otherwise, we might increase
// instructions).
- if (Op0->getNumUses() == 2)
+ if (Op0->hasNUses(2))
std::swap(Op0, Op1);
const APInt *ShAmt;
Type *Ty = I.getType();
if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
- Op1->getNumUses() == 2 && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) {
// B = ashr i32 A, 31 ; smear the sign bit
// xor (add A, B), B ; add -1 and flip bits if negative
// --> (A < 0) ? -A : A
Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
- return SelectInst::Create(Cmp, Builder.CreateNeg(A), A);
+ // Copy the nuw/nsw flags from the add to the negate.
+ auto *Add = cast<BinaryOperator>(Op0);
+ Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
+ Add->hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
+
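The pattern being canonicalized is the branchless absolute value. A standalone C++ sketch of the equivalence with the select form (illustrative only, not part of the patch; the sign smear is written with a compare and the arithmetic is unsigned so the INT32_MIN case stays well defined in portable C++):

    #include <cassert>
    #include <cstdint>

    static uint32_t shiftyAbs(int32_t A) {
      // B = ashr i32 A, 31 -- the sign bit smeared across the word.
      uint32_t B = A < 0 ? 0xFFFFFFFFu : 0u;
      // xor (add A, B), B -- add -1 and flip the bits if A is negative.
      return ((uint32_t)A + B) ^ B;
    }

    static uint32_t selectAbs(int32_t A) {
      // (A < 0) ? -A : A, computed in unsigned arithmetic.
      return A < 0 ? 0u - (uint32_t)A : (uint32_t)A;
    }

    int main() {
      const int32_t Samples[] = {0, 1, -1, 42, -42, INT32_MAX, INT32_MIN};
      for (int32_t V : Samples)
        assert(shiftyAbs(V) == selectAbs(V));
      return 0;
    }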
+ // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
+ //
+ // %notx = xor i32 %x, -1
+ // %cmp1 = icmp sgt i32 %notx, %y
+ // %smax = select i1 %cmp1, i32 %notx, i32 %y
+ // %res = xor i32 %smax, -1
+ // =>
+ // %noty = xor i32 %y, -1
+ // %cmp2 = icmp slt %x, %noty
+ // %res = select i1 %cmp2, i32 %x, i32 %noty
+ //
+ // Same is applicable for smin/umax/umin.
+ {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor;
+ if (Op0->hasOneUse() && SelectPatternResult::isMinOrMax(SPF) &&
+ match(Op1, m_AllOnes())) {
+
+ Value *X;
+ if (match(RHS, m_Not(m_Value(X))))
+ std::swap(RHS, LHS);
+
+ if (match(LHS, m_Not(m_Value(X)))) {
+ Value *NotY = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
+ }
+ }
}
- return Changed ? &I : nullptr;
+ return nullptr;
}
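The min/max rewrite above relies on bitwise 'not' being order-reversing, so ~smax(~x, y) == smin(x, ~y). A standalone C++ sketch checked exhaustively on small integers (illustrative only, not part of the patch):

    #include <algorithm>
    #include <cassert>

    int main() {
      for (int x = -128; x <= 127; ++x) {
        for (int y = -128; y <= 127; ++y) {
          // ~smax(~x, y) == smin(x, ~y); the analogous form holds for smin.
          assert(~std::max(~x, y) == std::min(x, ~y));
          assert(~std::min(~x, y) == std::max(x, ~y));
        }
      }
      return 0;
    }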
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 2f2f0696366c..cbfbd8a53993 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -57,7 +58,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
@@ -73,11 +73,11 @@ using namespace PatternMatch;
STATISTIC(NumSimplified, "Number of library calls simplified");
-static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
- "unfold-element-atomic-memcpy-max-elements",
- cl::init(16),
- cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
- "allowed to unfold"));
+static cl::opt<unsigned> GuardWideningWindow(
+ "instcombine-guard-widening-window",
+ cl::init(3),
+ cl::desc("How wide an instruction window to bypass looking for "
+ "another guard"));
/// Return the specified type promoted as it would be to pass though a va_arg
/// area.
@@ -106,97 +106,24 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
return ConstantVector::get(BoolVec);
}
-Instruction *
-InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
- // Try to unfold this intrinsic into sequence of explicit atomic loads and
- // stores.
- // First check that number of elements is compile time constant.
- auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
- if (!LengthCI)
- return nullptr;
-
- // Check that there are not too many elements.
- uint64_t LengthInBytes = LengthCI->getZExtValue();
- uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
- uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
- if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
- return nullptr;
-
- // Only expand if there are elements to copy.
- if (NumElements > 0) {
- // Don't unfold into illegal integers
- uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
- if (!getDataLayout().isLegalInteger(ElementSizeInBits))
- return nullptr;
-
- // Cast source and destination to the correct type. Intrinsic input
- // arguments are usually represented as i8*. Often operands will be
- // explicitly casted to i8* and we can just strip those casts instead of
- // inserting new ones. However it's easier to rely on other InstCombine
- // rules which will cover trivial cases anyway.
- Value *Src = AMI->getRawSource();
- Value *Dst = AMI->getRawDest();
- Type *ElementPointerType =
- Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
- Src->getType()->getPointerAddressSpace());
-
- Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
- "memcpy_unfold.src_casted");
- Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
- "memcpy_unfold.dst_casted");
-
- for (uint64_t i = 0; i < NumElements; ++i) {
- // Get current element addresses
- ConstantInt *ElementIdxCI =
- ConstantInt::get(AMI->getContext(), APInt(64, i));
- Value *SrcElementAddr =
- Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
- Value *DstElementAddr =
- Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
- // Load from the source. Transfer alignment information and mark load as
- // unordered atomic.
- LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
- Load->setOrdering(AtomicOrdering::Unordered);
- // We know alignment of the first element. It is also guaranteed by the
- // verifier that element size is less or equal than first element
- // alignment and both of this values are powers of two. This means that
- // all subsequent accesses are at least element size aligned.
- // TODO: We can infer better alignment but there is no evidence that this
- // will matter.
- Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
- : ElementSizeInBytes);
- Load->setDebugLoc(AMI->getDebugLoc());
-
- // Store loaded value via unordered atomic store.
- StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
- Store->setOrdering(AtomicOrdering::Unordered);
- Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
- : ElementSizeInBytes);
- Store->setDebugLoc(AMI->getDebugLoc());
- }
+Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
+ unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
+ unsigned CopyDstAlign = MI->getDestAlignment();
+ if (CopyDstAlign < DstAlign){
+ MI->setDestAlignment(DstAlign);
+ return MI;
}
- // Set the number of elements of the copy to 0, it will be deleted on the
- // next iteration.
- AMI->setLength(Constant::getNullValue(LengthCI->getType()));
- return AMI;
-}
-
-Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
- unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
- unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
- unsigned MinAlign = std::min(DstAlign, SrcAlign);
- unsigned CopyAlign = MI->getAlignment();
-
- if (CopyAlign < MinAlign) {
- MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false));
+ unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+ unsigned CopySrcAlign = MI->getSourceAlignment();
+ if (CopySrcAlign < SrcAlign) {
+ MI->setSourceAlignment(SrcAlign);
return MI;
}
// If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
// load/store.
- ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
if (!MemOpLength) return nullptr;
// Source and destination pointer types are always "i8*" for intrinsic. See
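A standalone C++ sketch of the shape this rewrite produces for a known 4-byte length: one wide load feeding one wide store, equivalent to the original memcpy (illustrative only, not part of the patch; memcpy into a scalar temporary stands in for the typed load to avoid undefined type punning in portable C++):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      const unsigned char Src[4] = {0xDE, 0xAD, 0xBE, 0xEF};
      unsigned char ViaMemcpy[4] = {0}, ViaLoadStore[4] = {0};

      std::memcpy(ViaMemcpy, Src, 4);     // the original intrinsic call

      uint32_t Tmp;                       // the single i32 load ...
      std::memcpy(&Tmp, Src, 4);
      std::memcpy(ViaLoadStore, &Tmp, 4); // ... feeding the single i32 store

      assert(std::memcmp(ViaMemcpy, ViaLoadStore, 4) == 0);
      return 0;
    }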
@@ -222,7 +149,9 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
// If the memcpy has metadata describing the members, see if we can get the
// TBAA tag describing our copy.
MDNode *CopyMD = nullptr;
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
+ CopyMD = M;
+ } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
if (M->getNumOperands() == 3 && M->getOperand(0) &&
mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
@@ -234,15 +163,11 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
CopyMD = cast<MDNode>(M->getOperand(2));
}
- // If the memcpy/memmove provides better alignment info than we can
- // infer, use it.
- SrcAlign = std::max(SrcAlign, CopyAlign);
- DstAlign = std::max(DstAlign, CopyAlign);
-
Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
- LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
- L->setAlignment(SrcAlign);
+ LoadInst *L = Builder.CreateLoad(Src);
+ // Alignment from the mem intrinsic will be better, so use it.
+ L->setAlignment(CopySrcAlign);
if (CopyMD)
L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
MDNode *LoopMemParallelMD =
@@ -250,23 +175,34 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
if (LoopMemParallelMD)
L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
- S->setAlignment(DstAlign);
+ StoreInst *S = Builder.CreateStore(L, Dest);
+ // Alignment from the mem intrinsic will be better, so use it.
+ S->setAlignment(CopyDstAlign);
if (CopyMD)
S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
if (LoopMemParallelMD)
S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+ // non-atomics can be volatile
+ L->setVolatile(MT->isVolatile());
+ S->setVolatile(MT->isVolatile());
+ }
+ if (isa<AtomicMemTransferInst>(MI)) {
+ // atomics have to be unordered
+ L->setOrdering(AtomicOrdering::Unordered);
+ S->setOrdering(AtomicOrdering::Unordered);
+ }
+
// Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+ MI->setLength(Constant::getNullValue(MemOpLength->getType()));
return MI;
}
-Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
- if (MI->getAlignment() < Alignment) {
- MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
- Alignment, false));
+ if (MI->getDestAlignment() < Alignment) {
+ MI->setDestAlignment(Alignment);
return MI;
}
@@ -276,7 +212,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
return nullptr;
uint64_t Len = LenC->getLimitedValue();
- Alignment = MI->getAlignment();
+ Alignment = MI->getDestAlignment();
assert(Len && "0-sized memory setting should be removed already.");
// memset(s,c,n) -> store s, c (for n=1,2,4,8)
@@ -296,6 +232,8 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
MI->isVolatile());
S->setAlignment(Alignment);
+ if (isa<AtomicMemSetInst>(MI))
+ S->setOrdering(AtomicOrdering::Unordered);
// Set the size of the copy to 0, it will be deleted on the next iteration.
MI->setLength(Constant::getNullValue(LenC->getType()));
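For the memset-to-store rewrite, the natural fill constant is the byte value replicated across the store width, so every stored byte equals the original fill byte regardless of endianness. A standalone C++ sketch of that equivalence for the 8-byte case (illustrative only, not part of the patch; the replication formula is shown as an assumption about how Fill is built):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      for (unsigned C = 0; C < 256; ++C) {
        unsigned char ViaMemset[8];
        std::memset(ViaMemset, (int)C, 8);

        // Replicate the byte across 8 lanes: 0x0101...01 * C.
        uint64_t Fill = 0x0101010101010101ULL * C;
        unsigned char ViaStore[8];
        std::memcpy(ViaStore, &Fill, 8);  // stand-in for the single i64 store

        assert(std::memcmp(ViaMemset, ViaStore, 8) == 0);
      }
      return 0;
    }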
@@ -563,55 +501,6 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
return Builder.CreateAShr(Vec, ShiftVec);
}
-static Value *simplifyX86muldq(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Type *ResTy = II.getType();
- assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
- Arg1->getType()->getScalarSizeInBits() == 32 &&
- ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
-
- // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
- if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
- return ConstantAggregateZero::get(ResTy);
-
- // Constant folding.
- // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
- // vXi64 sext(shuffle<0,2,..>(Arg1))))
- // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
- // vXi64 zext(shuffle<0,2,..>(Arg1))))
- if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
- return nullptr;
-
- unsigned NumElts = ResTy->getVectorNumElements();
- assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
- Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
- "Unexpected muldq/muludq types");
-
- unsigned IntrinsicID = II.getIntrinsicID();
- bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
- Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
- Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
-
- SmallVector<unsigned, 16> ShuffleMask;
- for (unsigned i = 0; i != NumElts; ++i)
- ShuffleMask.push_back(i * 2);
-
- auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
- auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
-
- if (IsSigned) {
- LHS = Builder.CreateSExt(LHS, ResTy);
- RHS = Builder.CreateSExt(RHS, ResTy);
- } else {
- LHS = Builder.CreateZExt(LHS, ResTy);
- RHS = Builder.CreateZExt(RHS, ResTy);
- }
-
- return Builder.CreateMul(LHS, RHS);
-}
-
static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
@@ -687,6 +576,105 @@ static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
return ConstantVector::get(Vals);
}
+// Replace X86-specific intrinsics with generic floor-ceil where applicable.
+static Value *simplifyX86round(IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ ConstantInt *Arg = nullptr;
+ Intrinsic::ID IntrinsicID = II.getIntrinsicID();
+
+ if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+ IntrinsicID == Intrinsic::x86_sse41_round_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(4));
+ else
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ if (!Arg)
+ return nullptr;
+ unsigned RoundControl = Arg->getZExtValue();
+
+ Arg = nullptr;
+ unsigned SAE = 0;
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(4));
+ else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(5));
+ else
+ SAE = 4;
+ if (!SAE) {
+ if (!Arg)
+ return nullptr;
+ SAE = Arg->getZExtValue();
+ }
+
+ if (SAE != 4 || (RoundControl != 2 /*ceil*/ && RoundControl != 1 /*floor*/))
+ return nullptr;
+
+ Value *Src, *Dst, *Mask;
+ bool IsScalar = false;
+ if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+ IntrinsicID == Intrinsic::x86_sse41_round_sd ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ IsScalar = true;
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ Mask = II.getArgOperand(3);
+ Value *Zero = Constant::getNullValue(Mask->getType());
+ Mask = Builder.CreateAnd(Mask, 1);
+ Mask = Builder.CreateICmp(ICmpInst::ICMP_NE, Mask, Zero);
+ Dst = II.getArgOperand(2);
+ } else
+ Dst = II.getArgOperand(0);
+ Src = Builder.CreateExtractElement(II.getArgOperand(1), (uint64_t)0);
+ } else {
+ Src = II.getArgOperand(0);
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_128 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_256 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_128 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_256 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512) {
+ Dst = II.getArgOperand(2);
+ Mask = II.getArgOperand(3);
+ } else {
+ Dst = Src;
+ Mask = ConstantInt::getAllOnesValue(
+ Builder.getIntNTy(Src->getType()->getVectorNumElements()));
+ }
+ }
+
+ Intrinsic::ID ID = (RoundControl == 2) ? Intrinsic::ceil : Intrinsic::floor;
+ Value *Res = Builder.CreateIntrinsic(ID, {Src}, &II);
+ if (!IsScalar) {
+ if (auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Res;
+ auto *MaskTy = VectorType::get(
+ Builder.getInt1Ty(), cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+ unsigned Width = Src->getType()->getVectorNumElements();
+ if (MaskTy->getVectorNumElements() > Width) {
+ uint32_t Indices[4];
+ for (unsigned i = 0; i != Width; ++i)
+ Indices[i] = i;
+ Mask = Builder.CreateShuffleVector(Mask, Mask,
+ makeArrayRef(Indices, Width));
+ }
+ return Builder.CreateSelect(Mask, Res, Dst);
+ }
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ Dst = Builder.CreateExtractElement(Dst, (uint64_t)0);
+ Res = Builder.CreateSelect(Mask, Res, Dst);
+ Dst = II.getArgOperand(0);
+ }
+ return Builder.CreateInsertElement(Dst, Res, (uint64_t)0);
+}
+
static Value *simplifyX86movmsk(const IntrinsicInst &II) {
Value *Arg = II.getArgOperand(0);
Type *ResTy = II.getType();
@@ -1145,36 +1133,6 @@ static Value *simplifyX86vpcom(const IntrinsicInst &II,
return nullptr;
}
-// Emit a select instruction and appropriate bitcasts to help simplify
-// masked intrinsics.
-static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
- InstCombiner::BuilderTy &Builder) {
- unsigned VWidth = Op0->getType()->getVectorNumElements();
-
- // If the mask is all ones we don't need the select. But we need to check
- // only the bit thats will be used in case VWidth is less than 8.
- if (auto *C = dyn_cast<ConstantInt>(Mask))
- if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
- return Op0;
-
- auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
- cast<IntegerType>(Mask->getType())->getBitWidth());
- Mask = Builder.CreateBitCast(Mask, MaskTy);
-
- // If we have less than 8 elements, then the starting mask was an i8 and
- // we need to extract down to the right number of elements.
- if (VWidth < 8) {
- uint32_t Indices[4];
- for (unsigned i = 0; i != VWidth; ++i)
- Indices[i] = i;
- Mask = Builder.CreateShuffleVector(Mask, Mask,
- makeArrayRef(Indices, VWidth),
- "extract");
- }
-
- return Builder.CreateSelect(Mask, Op0, Op1);
-}
-
static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
@@ -1308,6 +1266,40 @@ static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
return nullptr;
}
+/// This function transforms launder.invariant.group and strip.invariant.group
+/// like:
+/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
+/// launder(strip(%x)) -> launder(%x)
+/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
+/// strip(launder(%x)) -> strip(%x)
+/// This is legal because it preserves the most recent information about
+/// the presence or absence of invariant.group.
+static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
+ InstCombiner &IC) {
+ auto *Arg = II.getArgOperand(0);
+ auto *StrippedArg = Arg->stripPointerCasts();
+ auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
+ if (StrippedArg == StrippedInvariantGroupsArg)
+ return nullptr; // No launders/strips to remove.
+
+ Value *Result = nullptr;
+
+ if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
+ Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
+ else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
+ Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
+ else
+ llvm_unreachable(
+ "simplifyInvariantGroupIntrinsic only handles launder and strip");
+ if (Result->getType()->getPointerAddressSpace() !=
+ II.getType()->getPointerAddressSpace())
+ Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
+ if (Result->getType() != II.getType())
+ Result = IC.Builder.CreateBitCast(Result, II.getType());
+
+ return cast<Instruction>(Result);
+}
+
static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
// If the mask is all zeros, a scatter does nothing.
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
@@ -1498,6 +1490,68 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
return maxnum(Src0, Src1);
}
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+static Value *simplifyNeonTbl1(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // Bail out if the mask is not a constant.
+ auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!C)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+
+ // Only perform this transformation for <8 x i8> vector types.
+ if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+ return nullptr;
+
+ uint32_t Indexes[8];
+
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = C->getAggregateElement(I);
+
+ if (!COp || !isa<ConstantInt>(COp))
+ return nullptr;
+
+ Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+ // Make sure the mask indices are in range.
+ if (Indexes[I] >= NumElts)
+ return nullptr;
+ }
+
+ auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
+ makeArrayRef(Indexes));
+ auto *V1 = II.getArgOperand(0);
+ auto *V2 = Constant::getNullValue(V1->getType());
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
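A scalar model of the tbl1 lookup makes the special case mentioned above concrete: with the constant mask {7,6,5,4,3,2,1,0} the lookup is a byte reverse, which is why the resulting shufflevector can lower to rev64. A standalone C++ sketch (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Scalar model of an 8-byte tbl1 with in-range indices: Out[i] = Table[Idx[i]],
    // which is exactly what the generated shufflevector expresses.
    static void tbl1(const uint8_t Table[8], const uint8_t Idx[8], uint8_t Out[8]) {
      for (int I = 0; I < 8; ++I)
        Out[I] = Table[Idx[I]];
    }

    int main() {
      const uint8_t Table[8] = {10, 20, 30, 40, 50, 60, 70, 80};
      const uint8_t RevMask[8] = {7, 6, 5, 4, 3, 2, 1, 0};
      uint8_t Out[8];
      tbl1(Table, RevMask, Out);
      for (int I = 0; I < 8; ++I)
        assert(Out[I] == Table[7 - I]); // the lookup reduces to a byte reverse
      return 0;
    }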
+/// Convert a vector load intrinsic into a simple llvm load instruction.
+/// This is beneficial when the underlying object being addressed comes
+/// from a constant, since we get constant-folding for free.
+static Value *simplifyNeonVld1(const IntrinsicInst &II,
+ unsigned MemAlign,
+ InstCombiner::BuilderTy &Builder) {
+ auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
+
+ if (!IntrAlign)
+ return nullptr;
+
+ unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
+ MemAlign : IntrAlign->getLimitedValue();
+
+ if (!isPowerOf2_32(Alignment))
+ return nullptr;
+
+ auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
+ PointerType::get(II.getType(), 0));
+ return Builder.CreateAlignedLoad(BCastInst, Alignment);
+}
+
// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -1820,7 +1874,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Intrinsics cannot occur in an invoke, so handle them here instead of in
// visitCallSite.
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
bool Changed = false;
// memmove/cpy/set of zero bytes is a noop.
@@ -1837,17 +1891,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
// No other transformations apply to volatile transfers.
- if (MI->isVolatile())
- return nullptr;
+ if (auto *M = dyn_cast<MemIntrinsic>(MI))
+ if (M->isVolatile())
+ return nullptr;
// If we have a memmove and the source operation is a constant global,
// then the source and dest pointers can't alias, so we can change this
// into a call to memcpy.
- if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+ if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
if (GVSrc->isConstant()) {
Module *M = CI.getModule();
- Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+ Intrinsic::ID MemCpyID =
+ isa<AtomicMemMoveInst>(MMI)
+ ? Intrinsic::memcpy_element_unordered_atomic
+ : Intrinsic::memcpy;
Type *Tys[3] = { CI.getArgOperand(0)->getType(),
CI.getArgOperand(1)->getType(),
CI.getArgOperand(2)->getType() };
@@ -1856,7 +1914,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
}
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
// memmove(x,x,size) -> noop.
if (MTI->getSource() == MTI->getDest())
return eraseInstFromFunction(CI);
@@ -1864,26 +1922,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// If we can determine a pointer alignment that is bigger than currently
// set, update the alignment.
- if (isa<MemTransferInst>(MI)) {
- if (Instruction *I = SimplifyMemTransfer(MI))
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemTransfer(MTI))
return I;
- } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
- if (Instruction *I = SimplifyMemSet(MSI))
+ } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemSet(MSI))
return I;
}
if (Changed) return II;
}
- if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
- if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
- if (C->isNullValue())
- return eraseInstFromFunction(*AMI);
-
- if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
- return I;
- }
-
if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
return I;
@@ -1925,7 +1974,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return simplifyMaskedGather(*II, *this);
case Intrinsic::masked_scatter:
return simplifyMaskedScatter(*II, *this);
-
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
+ return replaceInstUsesWith(*II, SkippedBarrier);
+ break;
case Intrinsic::powi:
if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
// 0 and 1 are handled in instsimplify
@@ -1991,8 +2044,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->setArgOperand(1, Arg0);
return II;
}
+
+ // FIXME: Simplifications should be in instsimplify.
if (Value *V = simplifyMinnumMaxnum(*II))
return replaceInstUsesWith(*II, V);
+
+ Value *X, *Y;
+ if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
+ (Arg0->hasOneUse() || Arg1->hasOneUse())) {
+ // If both operands are negated, invert the call and negate the result:
+ // minnum(-X, -Y) --> -(maxnum(X, Y))
+ // maxnum(-X, -Y) --> -(minnum(X, Y))
+ Intrinsic::ID NewIID = II->getIntrinsicID() == Intrinsic::maxnum ?
+ Intrinsic::minnum : Intrinsic::maxnum;
+ Value *NewCall = Builder.CreateIntrinsic(NewIID, { X, Y }, II);
+ Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
+ FNeg->copyIRFlags(II);
+ return FNeg;
+ }
break;
}
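The fold above uses the fact that negation swaps the roles of minimum and maximum. A standalone C++ sketch with std::fmin/std::fmax as stand-ins for llvm.minnum/llvm.maxnum on ordinary (non-NaN) values (illustrative only, not part of the patch):

    #include <cassert>
    #include <cmath>

    int main() {
      const double Samples[] = {-3.5, -0.0, 0.0, 1.25, 7.0};
      for (double X : Samples) {
        for (double Y : Samples) {
          // maxnum(-X, -Y) == -(minnum(X, Y)), and the symmetric form.
          assert(std::fmax(-X, -Y) == -std::fmin(X, Y));
          assert(std::fmin(-X, -Y) == -std::fmax(X, Y));
        }
      }
      return 0;
    }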
case Intrinsic::fmuladd: {
@@ -2013,37 +2082,34 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src0 = II->getArgOperand(0);
Value *Src1 = II->getArgOperand(1);
- // Canonicalize constants into the RHS.
+ // Canonicalize constant multiply operand to Src1.
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
II->setArgOperand(0, Src1);
II->setArgOperand(1, Src0);
std::swap(Src0, Src1);
}
- Value *LHS = nullptr;
- Value *RHS = nullptr;
-
// fma fneg(x), fneg(y), z -> fma x, y, z
- if (match(Src0, m_FNeg(m_Value(LHS))) &&
- match(Src1, m_FNeg(m_Value(RHS)))) {
- II->setArgOperand(0, LHS);
- II->setArgOperand(1, RHS);
+ Value *X, *Y;
+ if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
+ II->setArgOperand(0, X);
+ II->setArgOperand(1, Y);
return II;
}
// fma fabs(x), fabs(x), z -> fma x, x, z
- if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
- match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
- II->setArgOperand(0, LHS);
- II->setArgOperand(1, RHS);
+ if (match(Src0, m_FAbs(m_Value(X))) &&
+ match(Src1, m_FAbs(m_Specific(X)))) {
+ II->setArgOperand(0, X);
+ II->setArgOperand(1, X);
return II;
}
// fma x, 1, z -> fadd x, z
if (match(Src1, m_FPOne())) {
- Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
- RI->copyFastMathFlags(II);
- return RI;
+ auto *FAdd = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
+ FAdd->copyFastMathFlags(II);
+ return FAdd;
}
break;
@@ -2067,17 +2133,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::rint:
case Intrinsic::trunc: {
Value *ExtSrc;
- if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
- II->getArgOperand(0)->hasOneUse()) {
- // fabs (fpext x) -> fpext (fabs x)
- Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
- { ExtSrc->getType() });
- CallInst *NewFabs = Builder.CreateCall(F, ExtSrc);
- NewFabs->copyFastMathFlags(II);
- NewFabs->takeName(II);
- return new FPExtInst(NewFabs, II->getType());
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
+ // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
+ Value *NarrowII = Builder.CreateIntrinsic(II->getIntrinsicID(),
+ { ExtSrc }, II);
+ return new FPExtInst(NarrowII, II->getType());
}
-
break;
}
case Intrinsic::cos:
@@ -2085,7 +2146,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *SrcSrc;
Value *Src = II->getArgOperand(0);
if (match(Src, m_FNeg(m_Value(SrcSrc))) ||
- match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) {
+ match(Src, m_FAbs(m_Value(SrcSrc)))) {
// cos(-x) -> cos(x)
// cos(fabs(x)) -> cos(x)
II->setArgOperand(0, SrcSrc);
@@ -2298,6 +2359,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::x86_sse41_round_ps:
+ case Intrinsic::x86_sse41_round_pd:
+ case Intrinsic::x86_avx_round_ps_256:
+ case Intrinsic::x86_avx_round_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_128:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_512:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_128:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_512:
+ case Intrinsic::x86_avx512_mask_rndscale_ss:
+ case Intrinsic::x86_avx512_mask_rndscale_sd:
+ if (Value *V = simplifyX86round(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
@@ -2355,16 +2432,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
break;
}
- case Intrinsic::x86_avx512_mask_cmp_pd_128:
- case Intrinsic::x86_avx512_mask_cmp_pd_256:
- case Intrinsic::x86_avx512_mask_cmp_pd_512:
- case Intrinsic::x86_avx512_mask_cmp_ps_128:
- case Intrinsic::x86_avx512_mask_cmp_ps_256:
- case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+ case Intrinsic::x86_avx512_cmp_pd_128:
+ case Intrinsic::x86_avx512_cmp_pd_256:
+ case Intrinsic::x86_avx512_cmp_pd_512:
+ case Intrinsic::x86_avx512_cmp_ps_128:
+ case Intrinsic::x86_avx512_cmp_ps_256:
+ case Intrinsic::x86_avx512_cmp_ps_512: {
// Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
- bool Arg0IsZero = match(Arg0, m_Zero());
+ bool Arg0IsZero = match(Arg0, m_PosZeroFP());
if (Arg0IsZero)
std::swap(Arg0, Arg1);
Value *A, *B;
@@ -2376,7 +2453,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// The compare intrinsic uses the above assumptions and therefore
// doesn't require additional flags.
if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
- match(Arg1, m_Zero()) && isa<Instruction>(Arg0) &&
+ match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) &&
cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
if (Arg0IsZero)
std::swap(A, B);
@@ -2387,17 +2464,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
- case Intrinsic::x86_avx512_mask_add_ps_512:
- case Intrinsic::x86_avx512_mask_div_ps_512:
- case Intrinsic::x86_avx512_mask_mul_ps_512:
- case Intrinsic::x86_avx512_mask_sub_ps_512:
- case Intrinsic::x86_avx512_mask_add_pd_512:
- case Intrinsic::x86_avx512_mask_div_pd_512:
- case Intrinsic::x86_avx512_mask_mul_pd_512:
- case Intrinsic::x86_avx512_mask_sub_pd_512:
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
// IR operations.
- if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+ if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
if (R->getValue() == 4) {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
@@ -2405,27 +2482,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *V;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_avx512_mask_add_ps_512:
- case Intrinsic::x86_avx512_mask_add_pd_512:
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
V = Builder.CreateFAdd(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_sub_ps_512:
- case Intrinsic::x86_avx512_mask_sub_pd_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
V = Builder.CreateFSub(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_mul_ps_512:
- case Intrinsic::x86_avx512_mask_mul_pd_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
V = Builder.CreateFMul(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_div_ps_512:
- case Intrinsic::x86_avx512_mask_div_pd_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_div_pd_512:
V = Builder.CreateFDiv(Arg0, Arg1);
break;
}
- // Create a select for the masking.
- V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- Builder);
return replaceInstUsesWith(*II, V);
}
}
@@ -2499,32 +2573,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx512_mask_min_ss_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
- case Intrinsic::x86_avx512_mask_vfmadd_ss:
- case Intrinsic::x86_avx512_mask_vfmadd_sd:
- case Intrinsic::x86_avx512_maskz_vfmadd_ss:
- case Intrinsic::x86_avx512_maskz_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmadd_ss:
- case Intrinsic::x86_avx512_mask3_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfmsub_sd:
- case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
- case Intrinsic::x86_fma_vfmadd_ss:
- case Intrinsic::x86_fma_vfmsub_ss:
- case Intrinsic::x86_fma_vfnmadd_ss:
- case Intrinsic::x86_fma_vfnmsub_ss:
- case Intrinsic::x86_fma_vfmadd_sd:
- case Intrinsic::x86_fma_vfmsub_sd:
- case Intrinsic::x86_fma_vfnmadd_sd:
- case Intrinsic::x86_fma_vfnmsub_sd:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd:
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd: {
unsigned VWidth = II->getType()->getVectorNumElements();
@@ -2537,6 +2591,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
}
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ } else if (Value *V = simplifyX86round(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
// Constant fold ashr( <A x Bi>, Ci ).
// Constant fold lshr( <A x Bi>, Ci ).
@@ -2647,26 +2714,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_sse2_pmulu_dq:
- case Intrinsic::x86_sse41_pmuldq:
- case Intrinsic::x86_avx2_pmul_dq:
- case Intrinsic::x86_avx2_pmulu_dq:
- case Intrinsic::x86_avx512_pmul_dq_512:
- case Intrinsic::x86_avx512_pmulu_dq_512: {
- if (Value *V = simplifyX86muldq(*II, Builder))
- return replaceInstUsesWith(*II, V);
-
- unsigned VWidth = II->getType()->getVectorNumElements();
- APInt UndefElts(VWidth, 0);
- APInt DemandedElts = APInt::getAllOnesValue(VWidth);
- if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
- if (V != II)
- return replaceInstUsesWith(*II, V);
- return II;
- }
- break;
- }
-
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_avx2_packssdw:
@@ -2687,7 +2734,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_pclmulqdq: {
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512: {
if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
unsigned Imm = C->getZExtValue();
@@ -2695,27 +2744,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
unsigned VWidth = Arg0->getType()->getVectorNumElements();
- APInt DemandedElts(VWidth, 0);
APInt UndefElts1(VWidth, 0);
- DemandedElts = (Imm & 0x01) ? 2 : 1;
- if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
+ APInt DemandedElts1 = APInt::getSplat(VWidth,
+ APInt(2, (Imm & 0x01) ? 2 : 1));
+ if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1,
UndefElts1)) {
II->setArgOperand(0, V);
MadeChange = true;
}
APInt UndefElts2(VWidth, 0);
- DemandedElts = (Imm & 0x10) ? 2 : 1;
- if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
+ APInt DemandedElts2 = APInt::getSplat(VWidth,
+ APInt(2, (Imm & 0x10) ? 2 : 1));
+ if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2,
UndefElts2)) {
II->setArgOperand(1, V);
MadeChange = true;
}
- // If both input elements are undef, the result is undef.
- if (UndefElts1[(Imm & 0x01) ? 1 : 0] ||
- UndefElts2[(Imm & 0x10) ? 1 : 0])
+ // If all demanded elements of either input are undef, the result is zero.
+ if (DemandedElts1.isSubsetOf(UndefElts1) ||
+ DemandedElts2.isSubsetOf(UndefElts2))
return replaceInstUsesWith(*II,
ConstantAggregateZero::get(II->getType()));
@@ -2916,32 +2966,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
+ case Intrinsic::x86_avx512_permvar_df_256:
+ case Intrinsic::x86_avx512_permvar_df_512:
+ case Intrinsic::x86_avx512_permvar_di_256:
+ case Intrinsic::x86_avx512_permvar_di_512:
+ case Intrinsic::x86_avx512_permvar_hi_128:
+ case Intrinsic::x86_avx512_permvar_hi_256:
+ case Intrinsic::x86_avx512_permvar_hi_512:
+ case Intrinsic::x86_avx512_permvar_qi_128:
+ case Intrinsic::x86_avx512_permvar_qi_256:
+ case Intrinsic::x86_avx512_permvar_qi_512:
+ case Intrinsic::x86_avx512_permvar_sf_512:
+ case Intrinsic::x86_avx512_permvar_si_512:
if (Value *V = simplifyX86vpermv(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_avx512_mask_permvar_df_256:
- case Intrinsic::x86_avx512_mask_permvar_df_512:
- case Intrinsic::x86_avx512_mask_permvar_di_256:
- case Intrinsic::x86_avx512_mask_permvar_di_512:
- case Intrinsic::x86_avx512_mask_permvar_hi_128:
- case Intrinsic::x86_avx512_mask_permvar_hi_256:
- case Intrinsic::x86_avx512_mask_permvar_hi_512:
- case Intrinsic::x86_avx512_mask_permvar_qi_128:
- case Intrinsic::x86_avx512_mask_permvar_qi_256:
- case Intrinsic::x86_avx512_mask_permvar_qi_512:
- case Intrinsic::x86_avx512_mask_permvar_sf_256:
- case Intrinsic::x86_avx512_mask_permvar_sf_512:
- case Intrinsic::x86_avx512_mask_permvar_si_256:
- case Intrinsic::x86_avx512_mask_permvar_si_512:
- if (Value *V = simplifyX86vpermv(*II, Builder)) {
- // We simplified the permuting, now create a select for the masking.
- V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- Builder);
- return replaceInstUsesWith(*II, V);
- }
- break;
-
case Intrinsic::x86_avx_maskload_ps:
case Intrinsic::x86_avx_maskload_pd:
case Intrinsic::x86_avx_maskload_ps_256:
@@ -3042,7 +3082,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
- case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld1: {
+ unsigned MemAlign = getKnownAlignment(II->getArgOperand(0),
+ DL, II, &AC, &DT);
+ if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
+
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
@@ -3069,6 +3116,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::arm_neon_vtbl1:
+ case Intrinsic::aarch64_neon_tbl1:
+ if (Value *V = simplifyNeonTbl1(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::arm_neon_vmulls:
case Intrinsic::arm_neon_vmullu:
case Intrinsic::aarch64_neon_smull:
@@ -3107,6 +3160,23 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::arm_neon_aesd:
+ case Intrinsic::arm_neon_aese:
+ case Intrinsic::aarch64_crypto_aesd:
+ case Intrinsic::aarch64_crypto_aese: {
+ Value *DataArg = II->getArgOperand(0);
+ Value *KeyArg = II->getArgOperand(1);
+
+ // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+ Value *Data, *Key;
+ if (match(KeyArg, m_ZeroInt()) &&
+ match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+ II->setArgOperand(0, Data);
+ II->setArgOperand(1, Key);
+ return II;
+ }
+ break;
+ }
case Intrinsic::amdgcn_rcp: {
Value *Src = II->getArgOperand(0);
@@ -3382,6 +3452,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src1 = II->getArgOperand(1);
Value *Src2 = II->getArgOperand(2);
+ // Checking for NaN before canonicalization provides better fidelity when
+ // mapping other operations onto fmed3 since the order of operands is
+ // unchanged.
+ CallInst *NewCall = nullptr;
+ if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
+ NewCall = Builder.CreateMinNum(Src1, Src2);
+ } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
+ NewCall = Builder.CreateMinNum(Src0, Src2);
+ } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+ NewCall = Builder.CreateMaxNum(Src0, Src1);
+ }
+
+ if (NewCall) {
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
bool Swap = false;
// Canonicalize constants to RHS operands.
//
@@ -3408,13 +3496,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
}
- if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
- CallInst *NewCall = Builder.CreateMinNum(Src0, Src1);
- NewCall->copyFastMathFlags(II);
- NewCall->takeName(II);
- return replaceInstUsesWith(*II, NewCall);
- }
-
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
@@ -3548,13 +3629,32 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// amdgcn.kill(i1 1) is a no-op
return eraseInstFromFunction(CI);
}
+ case Intrinsic::amdgcn_update_dpp: {
+ Value *Old = II->getArgOperand(0);
+
+ auto BC = dyn_cast<ConstantInt>(II->getArgOperand(5));
+ auto RM = dyn_cast<ConstantInt>(II->getArgOperand(3));
+ auto BM = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!BC || !RM || !BM ||
+ BC->isZeroValue() ||
+ RM->getZExtValue() != 0xF ||
+ BM->getZExtValue() != 0xF ||
+ isa<UndefValue>(Old))
+ break;
+
+ // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
+ II->setOperand(0, UndefValue::get(Old->getType()));
+ return II;
+ }
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
if (SS->getIntrinsicID() == Intrinsic::stacksave) {
- if (&*++SS->getIterator() == II)
+ // Skip over debug info.
+ if (SS->getNextNonDebugInstruction() == II) {
return eraseInstFromFunction(CI);
+ }
}
}
@@ -3609,9 +3709,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::assume: {
Value *IIOperand = II->getArgOperand(0);
- // Remove an assume if it is immediately followed by an identical assume.
- if (match(II->getNextNode(),
- m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
+ // Remove an assume if it is followed by an identical assume.
+ // TODO: Do we need this? Unless there are conflicting assumptions, the
+ // computeKnownBits(IIOperand) below here eliminates redundant assumes.
+ Instruction *Next = II->getNextNonDebugInstruction();
+ if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
return eraseInstFromFunction(CI);
// Canonicalize assume(a && b) -> assume(a); assume(b);
@@ -3698,8 +3800,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
case Intrinsic::experimental_guard: {
- // Is this guard followed by another guard?
+ // Is this guard followed by another guard? We scan forward over a small
+ // fixed window of instructions to handle common cases with conditions
+ // computed between guards.
Instruction *NextInst = II->getNextNode();
+ for (unsigned i = 0; i < GuardWideningWindow; i++) {
+ // Note: Using context-free form to avoid compile time blow up
+ if (!isSafeToSpeculativelyExecute(NextInst))
+ break;
+ NextInst = NextInst->getNextNode();
+ }
Value *NextCond = nullptr;
if (match(NextInst,
m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
@@ -3710,6 +3820,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return eraseInstFromFunction(*NextInst);
// Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+ Instruction* MoveI = II->getNextNode();
+ while (MoveI != NextInst) {
+ auto *Temp = MoveI;
+ MoveI = MoveI->getNextNode();
+ Temp->moveBefore(II);
+ }
II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
return eraseInstFromFunction(*NextInst);
}
@@ -3722,7 +3838,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Fence instruction simplification
Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
// Remove identical consecutive fences.
- if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode()))
+ Instruction *Next = FI.getNextNonDebugInstruction();
+ if (auto *NFI = dyn_cast<FenceInst>(Next))
if (FI.isIdenticalTo(NFI))
return eraseInstFromFunction(FI);
return nullptr;
@@ -3899,8 +4016,8 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
// Remove the convergent attr on calls when the callee is not convergent.
if (CS.isConvergent() && !CalleeF->isConvergent() &&
!CalleeF->isIntrinsic()) {
- DEBUG(dbgs() << "Removing convergent attr from instr "
- << CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from instr "
+ << CS.getInstruction() << "\n");
CS.setNotConvergent();
return CS.getInstruction();
}
@@ -3931,7 +4048,9 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
}
}
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(CS.getInstruction()->getFunction())) ||
+ isa<UndefValue>(Callee)) {
// If CS does not return void then replaceAllUsesWith undef.
// This allows ValueHandlers and custom metadata to adjust itself.
if (!CS.getInstruction()->getType()->isVoidTy())
@@ -3998,10 +4117,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (!Callee)
return false;
- // The prototype of a thunk is a lie. Don't directly call such a function.
+ // If this is a call to a thunk function, don't remove the cast. Thunks are
+ // used to transparently forward all incoming parameters and outgoing return
+ // values, so it's important to leave the cast in place.
if (Callee->hasFnAttribute("thunk"))
return false;
+ // If this is a musttail call, the callee's prototype must match the caller's
+ // prototype with the exception of pointee types. The code below doesn't
+ // implement that, so we can't do this transform.
+ // TODO: Do the transform if it only requires adding pointer casts.
+ if (CS.isMustTailCall())
+ return false;
+
Instruction *Caller = CS.getInstruction();
const AttributeList &CallerPAL = CS.getAttributes();
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 178c8eaf2502..e8ea7396a96a 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -256,7 +257,7 @@ Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
return Instruction::CastOps(Res);
}
-/// @brief Implement the transforms common to all CastInst visitors.
+/// Implement the transforms common to all CastInst visitors.
Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
@@ -265,14 +266,27 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
// The first cast (CSrc) is eliminable so we need to fix up or replace
// the second cast (CI). CSrc will then have a good chance of being dead.
- return CastInst::Create(NewOpc, CSrc->getOperand(0), CI.getType());
+ auto *Ty = CI.getType();
+ auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
+ // Point debug users of the dying cast to the new one.
+ if (CSrc->hasOneUse())
+ replaceAllDbgUsesWith(*CSrc, *Res, CI, DT);
+ return Res;
}
}
- // If we are casting a select, then fold the cast into the select.
- if (auto *SI = dyn_cast<SelectInst>(Src))
- if (Instruction *NV = FoldOpIntoSelect(CI, SI))
- return NV;
+ if (auto *Sel = dyn_cast<SelectInst>(Src)) {
+ // We are casting a select. Try to fold the cast into the select, but only
+ // if the select does not have a compare instruction with matching operand
+ // types. Creating a select with operands that are different sizes than its
+ // condition may inhibit other folds and lead to worse codegen.
+ auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition());
+ if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType())
+ if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) {
+ replaceAllDbgUsesWith(*Sel, *NV, CI, DT);
+ return NV;
+ }
+ }
// If we are casting a PHI, then fold the cast into the PHI.
if (auto *PN = dyn_cast<PHINode>(Src)) {
@@ -287,6 +301,33 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
return nullptr;
}
+/// Constants and extensions/truncates from the destination type are always
+/// free to be evaluated in that type. This is a helper for canEvaluate*.
+static bool canAlwaysEvaluateInType(Value *V, Type *Ty) {
+ if (isa<Constant>(V))
+ return true;
+ Value *X;
+ if ((match(V, m_ZExtOrSExt(m_Value(X))) || match(V, m_Trunc(m_Value(X)))) &&
+ X->getType() == Ty)
+ return true;
+
+ return false;
+}
+
+/// Filter out values that we can not evaluate in the destination type for free.
+/// This is a helper for canEvaluate*.
+static bool canNotEvaluateInType(Value *V, Type *Ty) {
+ assert(!isa<Constant>(V) && "Constant should already be handled.");
+ if (!isa<Instruction>(V))
+ return true;
+ // We don't extend or shrink something that has multiple uses -- doing so
+ // would require duplicating the instruction which isn't profitable.
+ if (!V->hasOneUse())
+ return true;
+
+ return false;
+}
+
/// Return true if we can evaluate the specified expression tree as type Ty
/// instead of its larger type, and arrive with the same value.
/// This is used by code that tries to eliminate truncates.
@@ -300,27 +341,14 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
///
static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
Instruction *CxtI) {
- // We can always evaluate constants in another type.
- if (isa<Constant>(V))
+ if (canAlwaysEvaluateInType(V, Ty))
return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
+ auto *I = cast<Instruction>(V);
Type *OrigTy = V->getType();
-
- // If this is an extension from the dest type, we can eliminate it, even if it
- // has multiple uses.
- if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
- I->getOperand(0)->getType() == Ty)
- return true;
-
- // We can't extend or shrink something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
- unsigned Opc = I->getOpcode();
- switch (Opc) {
+ switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
@@ -336,13 +364,12 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
// UDiv and URem can be truncated if all the truncated bits are zero.
uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
uint32_t BitWidth = Ty->getScalarSizeInBits();
- if (BitWidth < OrigBitWidth) {
- APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
- if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
- IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- }
+ assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!");
+ APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
+ IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
}
break;
}
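Editorial note (illustrative only, not part of the patch): the UDiv/URem case above allows truncation only when the bits above the destination width are known zero in both operands. The reason it is sound: if both operands already fit in the narrow type, so do the quotient and remainder, so evaluating in the narrow type then truncating changes nothing. A small host-side check of that claim:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t a = 0; a < 200; ++a) {
    for (uint32_t b = 1; b < 200; ++b) {
      // Both operands have their upper 16 bits clear, so the i32 udiv/urem can
      // be evaluated as an i16 operation with an identical truncated result.
      assert(uint16_t(a / b) == uint16_t(uint16_t(a) / uint16_t(b)));
      assert(uint16_t(a % b) == uint16_t(uint16_t(a) % uint16_t(b)));
    }
  }
  return 0;
}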
@@ -365,9 +392,9 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
if (match(I->getOperand(1), m_APInt(Amt))) {
uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
uint32_t BitWidth = Ty->getScalarSizeInBits();
- if (IC.MaskedValueIsZero(I->getOperand(0),
- APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) &&
- Amt->getLimitedValue(BitWidth) < BitWidth) {
+ if (Amt->getLimitedValue(BitWidth) < BitWidth &&
+ IC.MaskedValueIsZero(I->getOperand(0),
+ APInt::getBitsSetFrom(OrigBitWidth, BitWidth), 0, CxtI)) {
return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
}
}
@@ -644,20 +671,6 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
- // Test if the trunc is the user of a select which is part of a
- // minimum or maximum operation. If so, don't do any more simplification.
- // Even simplifying demanded bits can break the canonical form of a
- // min/max.
- Value *LHS, *RHS;
- if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
- if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
- return nullptr;
-
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(CI))
- return &CI;
-
Value *Src = CI.getOperand(0);
Type *DestTy = CI.getType(), *SrcTy = Src->getType();
@@ -670,13 +683,29 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
    // If this cast is a truncate, evaluating in a different type always
// eliminates the cast, so it is always a win.
- DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid cast: " << CI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid cast: "
+ << CI << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, false);
assert(Res->getType() == DestTy);
return replaceInstUsesWith(CI, Res);
}
+ // Test if the trunc is the user of a select which is part of a
+ // minimum or maximum operation. If so, don't do any more simplification.
+ // Even simplifying demanded bits can break the canonical form of a
+ // min/max.
+ Value *LHS, *RHS;
+ if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
+ if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+
+ // See if we can simplify any instructions used by the input whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(CI))
+ return &CI;
+
// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
if (DestTy->getScalarSizeInBits() == 1) {
Constant *One = ConstantInt::get(SrcTy, 1);
@@ -916,23 +945,14 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
InstCombiner &IC, Instruction *CxtI) {
BitsToClear = 0;
- if (isa<Constant>(V))
- return true;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // If the input is a truncate from the destination type, we can trivially
- // eliminate it.
- if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+ if (canAlwaysEvaluateInType(V, Ty))
return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
- // We can't extend or shrink something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
- unsigned Opc = I->getOpcode(), Tmp;
- switch (Opc) {
+ auto *I = cast<Instruction>(V);
+ unsigned Tmp;
+ switch (I->getOpcode()) {
case Instruction::ZExt: // zext(zext(x)) -> zext(x).
case Instruction::SExt: // zext(sext(x)) -> sext(x).
case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
@@ -961,7 +981,7 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
0, CxtI)) {
// If this is an And instruction and all of the BitsToClear are
// known to be zero we can reset BitsToClear.
- if (Opc == Instruction::And)
+ if (I->getOpcode() == Instruction::And)
BitsToClear = 0;
return true;
}
@@ -1052,11 +1072,18 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
"Can't clear more bits than in SrcTy");
// Okay, we can transform this! Insert the new expression now.
- DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid zero extend: " << CI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid zero extend: "
+ << CI << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, false);
assert(Res->getType() == DestTy);
+ // Preserve debug values referring to Src if the zext is its last use.
+ if (auto *SrcOp = dyn_cast<Instruction>(Src))
+ if (SrcOp->hasOneUse())
+ replaceAllDbgUsesWith(*SrcOp, *Res, CI, DT);
+
uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
uint32_t DestBitSize = DestTy->getScalarSizeInBits();
@@ -1168,22 +1195,19 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
if (!Op1->getType()->isIntOrIntVectorTy())
return nullptr;
- if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ if ((Pred == ICmpInst::ICMP_SLT && match(Op1, m_ZeroInt())) ||
+ (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))) {
// (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative
// (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive
- if ((Pred == ICmpInst::ICMP_SLT && Op1C->isNullValue()) ||
- (Pred == ICmpInst::ICMP_SGT && Op1C->isAllOnesValue())) {
+ Value *Sh = ConstantInt::get(Op0->getType(),
+ Op0->getType()->getScalarSizeInBits() - 1);
+ Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
+ if (In->getType() != CI.getType())
+ In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
- Value *Sh = ConstantInt::get(Op0->getType(),
- Op0->getType()->getScalarSizeInBits()-1);
- Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
- if (In->getType() != CI.getType())
- In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
-
- if (Pred == ICmpInst::ICMP_SGT)
- In = Builder.CreateNot(In, In->getName() + ".not");
- return replaceInstUsesWith(CI, In);
- }
+ if (Pred == ICmpInst::ICMP_SGT)
+ In = Builder.CreateNot(In, In->getName() + ".not");
+ return replaceInstUsesWith(CI, In);
}
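Editorial note (sketch, not from the patch): the rewritten transformSExtICmp block keeps the classic fold that sext(x <s 0) is an arithmetic shift by bitwidth-1, with a bitwise not for the x >s -1 form. A minimal scalar model, assuming arithmetic right shift of negative signed values (guaranteed since C++20, the common behavior elsewhere):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t i = -1000; i <= 1000; ++i) {
    int32_t AllOnesIfNeg = i >> 31;        // (i <s 0) ? -1 : 0  ->  ashr i, 31
    assert(AllOnesIfNeg == (i < 0 ? -1 : 0));
    int32_t AllOnesIfPos = ~(i >> 31);     // (i >s -1) ? -1 : 0 ->  not (ashr i, 31)
    assert(AllOnesIfPos == (i > -1 ? -1 : 0));
  }
  return 0;
}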
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
@@ -1254,21 +1278,12 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
static bool canEvaluateSExtd(Value *V, Type *Ty) {
assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
"Can't sign extend type to a smaller type");
- // If this is a constant, it can be trivially promoted.
- if (isa<Constant>(V))
+ if (canAlwaysEvaluateInType(V, Ty))
return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // If this is a truncate from the dest type, we can trivially eliminate it.
- if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
- return true;
-
- // We can't extend or shrink something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
+ auto *I = cast<Instruction>(V);
switch (I->getOpcode()) {
case Instruction::SExt: // sext(sext(x)) -> sext(x)
case Instruction::ZExt: // sext(zext(x)) -> zext(x)
@@ -1335,8 +1350,10 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateSExtd(Src, DestTy)) {
// Okay, we can transform this! Insert the new expression now.
- DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid sign extend: " << CI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid sign extend: "
+ << CI << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, true);
assert(Res->getType() == DestTy);
@@ -1401,45 +1418,83 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
/// Return a Constant* for the specified floating-point constant if it fits
/// in the specified FP type without changing its value.
-static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
bool losesInfo;
APFloat F = CFP->getValueAPF();
(void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
- if (!losesInfo)
- return ConstantFP::get(CFP->getContext(), F);
+ return !losesInfo;
+}
+
+static Type *shrinkFPConstant(ConstantFP *CFP) {
+ if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
+ return nullptr; // No constant folding of this.
+ // See if the value can be truncated to half and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEhalf()))
+ return Type::getHalfTy(CFP->getContext());
+ // See if the value can be truncated to float and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEsingle()))
+ return Type::getFloatTy(CFP->getContext());
+ if (CFP->getType()->isDoubleTy())
+ return nullptr; // Won't shrink.
+ if (fitsInFPType(CFP, APFloat::IEEEdouble()))
+ return Type::getDoubleTy(CFP->getContext());
+ // Don't try to shrink to various long double types.
return nullptr;
}
-/// Look through floating-point extensions until we get the source value.
-static Value *lookThroughFPExtensions(Value *V) {
- while (auto *FPExt = dyn_cast<FPExtInst>(V))
- V = FPExt->getOperand(0);
+// Determine if this is a vector of ConstantFPs and if so, return the minimal
+// type we can safely truncate all elements to.
+// TODO: Make these support undef elements.
+static Type *shrinkFPConstantVector(Value *V) {
+ auto *CV = dyn_cast<Constant>(V);
+ if (!CV || !CV->getType()->isVectorTy())
+ return nullptr;
+
+ Type *MinType = nullptr;
+
+ unsigned NumElts = CV->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
+ if (!CFP)
+ return nullptr;
+
+ Type *T = shrinkFPConstant(CFP);
+ if (!T)
+ return nullptr;
+
+ // If we haven't found a type yet or this type has a larger mantissa than
+ // our previous type, this is our new minimal type.
+ if (!MinType || T->getFPMantissaWidth() > MinType->getFPMantissaWidth())
+ MinType = T;
+ }
+
+ // Make a vector type from the minimal type.
+ return VectorType::get(MinType, NumElts);
+}
+
+/// Find the minimum FP type we can safely truncate to.
+static Type *getMinimumFPType(Value *V) {
+ if (auto *FPExt = dyn_cast<FPExtInst>(V))
+ return FPExt->getOperand(0)->getType();
// If this value is a constant, return the constant in the smallest FP type
// that can accurately represent it. This allows us to turn
// (float)((double)X+2.0) into x+2.0f.
- if (auto *CFP = dyn_cast<ConstantFP>(V)) {
- if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
- return V; // No constant folding of this.
- // See if the value can be truncated to half and then reextended.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf()))
- return V;
- // See if the value can be truncated to float and then reextended.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle()))
- return V;
- if (CFP->getType()->isDoubleTy())
- return V; // Won't shrink.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble()))
- return V;
- // Don't try to shrink to various long double types.
- }
-
- return V;
+ if (auto *CFP = dyn_cast<ConstantFP>(V))
+ if (Type *T = shrinkFPConstant(CFP))
+ return T;
+
+ // Try to shrink a vector of FP constants.
+ if (Type *T = shrinkFPConstantVector(V))
+ return T;
+
+ return V->getType();
}
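Editorial note (illustrative only): shrinkFPConstant above asks whether a constant survives a round trip through a narrower format -- fitsInFPType converts with APFloat and checks losesInfo. The hypothetical helper fitsInFloat below is a rough host-side analogue of the same test for double -> float, under the assumption of IEEE-754 host arithmetic:

#include <cassert>

// True if the double value is exactly representable as float, i.e. narrowing
// and then widening gives the original value back (mirrors fitsInFPType).
static bool fitsInFloat(double D) {
  return static_cast<double>(static_cast<float>(D)) == D;
}

int main() {
  assert(fitsInFloat(2.0));          // so (float)((double)x + 2.0) can become x + 2.0f
  assert(fitsInFloat(-0.5));
  assert(!fitsInFloat(0.1));         // 0.1 rounds differently in float and double
  assert(!fitsInFloat(1.0 / 3.0));   // needs the full double mantissa
  return 0;
}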
-Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
- if (Instruction *I = commonCastTransforms(CI))
+Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
+ if (Instruction *I = commonCastTransforms(FPT))
return I;
+
// If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
// simplify this expression to avoid one or more of the trunc/extend
// operations if we can do so without changing the numerical results.
@@ -1447,15 +1502,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// The exact manner in which the widths of the operands interact to limit
// what we can and cannot do safely varies from operation to operation, and
// is explained below in the various case statements.
- BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
+ Type *Ty = FPT.getType();
+ BinaryOperator *OpI = dyn_cast<BinaryOperator>(FPT.getOperand(0));
if (OpI && OpI->hasOneUse()) {
- Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0));
- Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1));
+ Type *LHSMinType = getMinimumFPType(OpI->getOperand(0));
+ Type *RHSMinType = getMinimumFPType(OpI->getOperand(1));
unsigned OpWidth = OpI->getType()->getFPMantissaWidth();
- unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth();
- unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth();
+ unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
+ unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
- unsigned DstWidth = CI.getType()->getFPMantissaWidth();
+ unsigned DstWidth = Ty->getFPMantissaWidth();
switch (OpI->getOpcode()) {
default: break;
case Instruction::FAdd:
@@ -1479,12 +1535,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// could be tightened for those cases, but they are rare (the main
// case of interest here is (float)((double)float + float)).
if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
- if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
- if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
- Instruction *RI =
- BinaryOperator::Create(OpI->getOpcode(), LHSOrig, RHSOrig);
+ Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ Instruction *RI = BinaryOperator::Create(OpI->getOpcode(), LHS, RHS);
RI->copyFastMathFlags(OpI);
return RI;
}
@@ -1496,14 +1549,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// rounding can possibly occur; we can safely perform the operation
// in the destination format if it can represent both sources.
if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
- if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
- if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
- Instruction *RI =
- BinaryOperator::CreateFMul(LHSOrig, RHSOrig);
- RI->copyFastMathFlags(OpI);
- return RI;
+ Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ return BinaryOperator::CreateFMulFMF(LHS, RHS, OpI);
}
break;
case Instruction::FDiv:
@@ -1514,72 +1562,48 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// condition used here is a good conservative first pass.
// TODO: Tighten bound via rigorous analysis of the unbalanced case.
if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
- if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
- if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
- Instruction *RI =
- BinaryOperator::CreateFDiv(LHSOrig, RHSOrig);
- RI->copyFastMathFlags(OpI);
- return RI;
+ Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ return BinaryOperator::CreateFDivFMF(LHS, RHS, OpI);
}
break;
- case Instruction::FRem:
+ case Instruction::FRem: {
// Remainder is straightforward. Remainder is always exact, so the
// type of OpI doesn't enter into things at all. We simply evaluate
// in whichever source type is larger, then convert to the
// destination type.
if (SrcWidth == OpWidth)
break;
- if (LHSWidth < SrcWidth)
- LHSOrig = Builder.CreateFPExt(LHSOrig, RHSOrig->getType());
- else if (RHSWidth <= SrcWidth)
- RHSOrig = Builder.CreateFPExt(RHSOrig, LHSOrig->getType());
- if (LHSOrig != OpI->getOperand(0) || RHSOrig != OpI->getOperand(1)) {
- Value *ExactResult = Builder.CreateFRem(LHSOrig, RHSOrig);
- if (Instruction *RI = dyn_cast<Instruction>(ExactResult))
- RI->copyFastMathFlags(OpI);
- return CastInst::CreateFPCast(ExactResult, CI.getType());
+ Value *LHS, *RHS;
+ if (LHSWidth == SrcWidth) {
+ LHS = Builder.CreateFPTrunc(OpI->getOperand(0), LHSMinType);
+ RHS = Builder.CreateFPTrunc(OpI->getOperand(1), LHSMinType);
+ } else {
+ LHS = Builder.CreateFPTrunc(OpI->getOperand(0), RHSMinType);
+ RHS = Builder.CreateFPTrunc(OpI->getOperand(1), RHSMinType);
}
+
+ Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, OpI);
+ return CastInst::CreateFPCast(ExactResult, Ty);
+ }
}
// (fptrunc (fneg x)) -> (fneg (fptrunc x))
if (BinaryOperator::isFNeg(OpI)) {
- Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1),
- CI.getType());
- Instruction *RI = BinaryOperator::CreateFNeg(InnerTrunc);
- RI->copyFastMathFlags(OpI);
- return RI;
+ Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ return BinaryOperator::CreateFNegFMF(InnerTrunc, OpI);
}
}
- // (fptrunc (select cond, R1, Cst)) -->
- // (select cond, (fptrunc R1), (fptrunc Cst))
- //
- // - but only if this isn't part of a min/max operation, else we'll
- // ruin min/max canonical form which is to have the select and
- // compare's operands be of the same type with no casts to look through.
- Value *LHS, *RHS;
- SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0));
- if (SI &&
- (isa<ConstantFP>(SI->getOperand(1)) ||
- isa<ConstantFP>(SI->getOperand(2))) &&
- matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) {
- Value *LHSTrunc = Builder.CreateFPTrunc(SI->getOperand(1), CI.getType());
- Value *RHSTrunc = Builder.CreateFPTrunc(SI->getOperand(2), CI.getType());
- return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc);
- }
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
- if (II) {
+ if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) {
switch (II->getIntrinsicID()) {
default: break;
- case Intrinsic::fabs:
case Intrinsic::ceil:
+ case Intrinsic::fabs:
case Intrinsic::floor:
+ case Intrinsic::nearbyint:
case Intrinsic::rint:
case Intrinsic::round:
- case Intrinsic::nearbyint:
case Intrinsic::trunc: {
Value *Src = II->getArgOperand(0);
if (!Src->hasOneUse())
@@ -1590,30 +1614,26 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// truncating.
if (II->getIntrinsicID() != Intrinsic::fabs) {
FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
- if (!FPExtSrc || FPExtSrc->getOperand(0)->getType() != CI.getType())
+ if (!FPExtSrc || FPExtSrc->getSrcTy() != Ty)
break;
}
// Do unary FP operation on smaller type.
// (fptrunc (fabs x)) -> (fabs (fptrunc x))
- Value *InnerTrunc = Builder.CreateFPTrunc(Src, CI.getType());
- Type *IntrinsicType[] = { CI.getType() };
- Function *Overload = Intrinsic::getDeclaration(
- CI.getModule(), II->getIntrinsicID(), IntrinsicType);
-
+ Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty);
+ Function *Overload = Intrinsic::getDeclaration(FPT.getModule(),
+ II->getIntrinsicID(), Ty);
SmallVector<OperandBundleDef, 1> OpBundles;
II->getOperandBundlesAsDefs(OpBundles);
-
- Value *Args[] = { InnerTrunc };
- CallInst *NewCI = CallInst::Create(Overload, Args,
- OpBundles, II->getName());
+ CallInst *NewCI = CallInst::Create(Overload, { InnerTrunc }, OpBundles,
+ II->getName());
NewCI->copyFastMathFlags(II);
return NewCI;
}
}
}
- if (Instruction *I = shrinkInsertElt(CI, Builder))
+ if (Instruction *I = shrinkInsertElt(FPT, Builder))
return I;
return nullptr;
@@ -1718,7 +1738,7 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
return nullptr;
}
-/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
+/// Implement the transforms for cast of pointer (bitcast/ptrtoint)
Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
@@ -1751,7 +1771,7 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
Type *Ty = CI.getType();
unsigned AS = CI.getPointerAddressSpace();
- if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS))
+ if (Ty->getScalarSizeInBits() == DL.getIndexSizeInBits(AS))
return commonPointerCastTransforms(CI);
Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS);
@@ -2004,13 +2024,13 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
!match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
!BO->isBitwiseLogicOp())
return nullptr;
-
+
// FIXME: This transform is restricted to vector types to avoid backend
// problems caused by creating potentially illegal operations. If a fix-up is
// added to handle that situation, we can remove this check.
if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
return nullptr;
-
+
Value *X;
if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
X->getType() == DestTy && !isa<Constant>(X)) {
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 3bc7fae77cb1..6de92a4842ab 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -682,7 +682,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
// 4. Emit GEPs to get the original pointers.
// 5. Remove the original instructions.
Type *IndexType = IntegerType::get(
- Base->getContext(), DL.getPointerTypeSizeInBits(Start->getType()));
+ Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));
DenseMap<Value *, Value *> NewInsts;
NewInsts[Base] = ConstantInt::getNullValue(IndexType);
@@ -723,7 +723,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
}
auto *Op = NewInsts[GEP->getOperand(0)];
- if (isa<ConstantInt>(Op) && dyn_cast<ConstantInt>(Op)->isZero())
+ if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
NewInsts[GEP] = Index;
else
NewInsts[GEP] = Builder.CreateNSWAdd(
@@ -790,7 +790,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
static std::pair<Value *, Value *>
getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
Type *IndexType = IntegerType::get(V->getContext(),
- DL.getPointerTypeSizeInBits(V->getType()));
+ DL.getIndexTypeSizeInBits(V->getType()));
Constant *Index = ConstantInt::getNullValue(IndexType);
while (true) {
@@ -1893,11 +1893,8 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
APInt ShiftedC = C.ashr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
- if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
- // This is the same code as the SGT case, but assert the pre-condition
- // that is needed for this to work with equality predicates.
- assert(C.ashr(*ShiftAmt).shl(*ShiftAmt) == C &&
- "Compare known true or false was not folded");
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
APInt ShiftedC = C.ashr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
@@ -1926,11 +1923,8 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
APInt ShiftedC = C.lshr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
- if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
- // This is the same code as the UGT case, but assert the pre-condition
- // that is needed for this to work with equality predicates.
- assert(C.lshr(*ShiftAmt).shl(*ShiftAmt) == C &&
- "Compare known true or false was not folded");
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
APInt ShiftedC = C.lshr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
@@ -2463,6 +2457,45 @@ Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp,
return nullptr;
}
+Instruction *InstCombiner::foldICmpBitCastConstant(ICmpInst &Cmp,
+ BitCastInst *Bitcast,
+ const APInt &C) {
+ // Folding: icmp <pred> iN X, C
+ // where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
+ // and C is a splat of a K-bit pattern
+ // and SC is a constant vector = <C', C', C', ..., C'>
+ // Into:
+ // %E = extractelement <M x iK> %vec, i32 C'
+ // icmp <pred> iK %E, trunc(C)
+ if (!Bitcast->getType()->isIntegerTy() ||
+ !Bitcast->getSrcTy()->isIntOrIntVectorTy())
+ return nullptr;
+
+ Value *BCIOp = Bitcast->getOperand(0);
+ Value *Vec = nullptr; // 1st vector arg of the shufflevector
+ Constant *Mask = nullptr; // Mask arg of the shufflevector
+ if (match(BCIOp,
+ m_ShuffleVector(m_Value(Vec), m_Undef(), m_Constant(Mask)))) {
+ // Check whether every element of Mask is the same constant
+ if (auto *Elem = dyn_cast_or_null<ConstantInt>(Mask->getSplatValue())) {
+ auto *VecTy = cast<VectorType>(BCIOp->getType());
+ auto *EltTy = cast<IntegerType>(VecTy->getElementType());
+ auto Pred = Cmp.getPredicate();
+ if (C.isSplat(EltTy->getBitWidth())) {
+ // Fold the icmp based on the value of C
+ // If C is M copies of an iK sized bit pattern,
+ // then:
+ // => %E = extractelement <N x iK> %vec, i32 Elem
+ // icmp <pred> iK %SplatVal, <pattern>
+ Value *Extract = Builder.CreateExtractElement(Vec, Elem);
+ Value *NewC = ConstantInt::get(EltTy, C.trunc(EltTy->getBitWidth()));
+ return new ICmpInst(Pred, Extract, NewC);
+ }
+ }
+ }
+ return nullptr;
+}
+
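Editorial note (hypothetical values, not from the patch): foldICmpBitCastConstant above rewrites an integer compare of a bitcast splat vector into a compare of one extracted element, provided the constant is itself a splat of the element-sized pattern. For the equality case the idea can be modelled with plain integers -- broadcasting a byte into four lanes and comparing against 0xAAAAAAAA is the same as comparing the byte against 0xAA. broadcast4 is an illustrative helper:

#include <cassert>
#include <cstdint>

// Broadcast one byte into all four lanes of a 32-bit value, mimicking
// bitcast(<4 x i8> splat) to i32.
static uint32_t broadcast4(uint8_t Elt) {
  return 0x01010101u * Elt;
}

int main() {
  for (unsigned E = 0; E <= 0xFF; ++E) {
    bool WideEq = broadcast4(uint8_t(E)) == 0xAAAAAAAAu; // icmp eq i32 ..., C
    bool EltEq  = uint8_t(E) == 0xAA;                    // icmp eq i8 %elt, trunc(C)
    assert(WideEq == EltEq);
  }
  return 0;
}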
/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
/// where X is some kind of instruction.
Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
@@ -2537,6 +2570,11 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
return I;
}
+ if (auto *BCI = dyn_cast<BitCastInst>(Cmp.getOperand(0))) {
+ if (Instruction *I = foldICmpBitCastConstant(Cmp, BCI, *C))
+ return I;
+ }
+
if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, *C))
return I;
@@ -2828,6 +2866,160 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
return nullptr;
}
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy truncation.
+/// Folds:
+/// x & (-1 >> y) SrcPred x to x DstPred (-1 >> y)
+/// The Mask can be a constant, too.
+/// For some predicates, the operands are commutative.
+/// For others, x can only be on a specific side.
+static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X, *M;
+ auto m_Mask = m_CombineOr(m_LShr(m_AllOnes(), m_Value()), m_LowBitMask());
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
+ m_Deferred(X))))
+ return nullptr;
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // x & (-1 >> y) == x -> x u<= (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // x & (-1 >> y) != x -> x u> (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_UGT:
+ // x u> x & (-1 >> y) -> x u> (-1 >> y)
+ assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_UGE:
+ // x & (-1 >> y) u>= x -> x u<= (-1 >> y)
+ assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_ULT:
+ // x & (-1 >> y) u< x -> x u> (-1 >> y)
+ assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_ULE:
+ // x u<= x & (-1 >> y) -> x u<= (-1 >> y)
+ assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_SGT:
+ // x s> x & (-1 >> y) -> x s> (-1 >> y)
+ if (X != I.getOperand(0)) // X must be on LHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SGT;
+ break;
+ case ICmpInst::Predicate::ICMP_SGE:
+ // x & (-1 >> y) s>= x -> x s<= (-1 >> y)
+ if (X != I.getOperand(1)) // X must be on RHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SLE;
+ break;
+ case ICmpInst::Predicate::ICMP_SLT:
+ // x & (-1 >> y) s< x -> x s> (-1 >> y)
+ if (X != I.getOperand(1)) // X must be on RHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SGT;
+ break;
+ case ICmpInst::Predicate::ICMP_SLE:
+ // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
+ if (X != I.getOperand(0)) // X must be on LHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SLE;
+ break;
+ default:
+ llvm_unreachable("All possible folds are handled.");
+ }
+
+ return Builder.CreateICmp(DstPred, X, M);
+}
+
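Editorial note (sketch, not part of the patch): foldICmpWithLowBitMaskedVal treats x & (-1 >> y) as a "does x fit in the low bits" test. The unsigned rows of its switch are easy to verify exhaustively on a narrow type, checking every mask of the form -1 >> y against every 8-bit x:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned y = 0; y < 8; ++y) {
    uint8_t M = uint8_t(0xFFu >> y); // -1 >> y, a low-bit mask
    for (unsigned xi = 0; xi <= 0xFF; ++xi) {
      uint8_t x = uint8_t(xi);
      // x & (-1 >> y) == x  <=>  x u<= (-1 >> y)
      assert(((x & M) == x) == (x <= M));
      // x & (-1 >> y) != x  <=>  x u>  (-1 >> y)
      assert(((x & M) != x) == (x > M));
    }
  }
  return 0;
}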
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy signed truncation.
+/// Folds: (MaskedBits is a constant.)
+/// ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
+/// Into:
+/// (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
+/// Where KeptBits = bitwidth(%x) - MaskedBits
+static Value *
+foldICmpWithTruncSignExtendedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X;
+ const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
+ // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)),
+ m_APInt(C1))),
+ m_Deferred(X))))
+ return nullptr;
+
+ // Potential handling of non-splats: for each element:
+ // * if both are undef, replace with constant 0.
+ // Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
+ // * if both are not undef, and are different, bailout.
+ // * else, only one is undef, then pick the non-undef one.
+
+ // The shift amount must be equal.
+ if (*C0 != *C1)
+ return nullptr;
+ const APInt &MaskedBits = *C0;
+ assert(MaskedBits != 0 && "shift by zero should be folded away already.");
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // ((%x << MaskedBits) a>> MaskedBits) == %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_ULT;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // ((%x << MaskedBits) a>> MaskedBits) != %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_UGE;
+ break;
+ // FIXME: are more folds possible?
+ default:
+ return nullptr;
+ }
+
+ auto *XType = X->getType();
+ const unsigned XBitWidth = XType->getScalarSizeInBits();
+ const APInt BitWidth = APInt(XBitWidth, XBitWidth);
+ assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");
+
+ // KeptBits = bitwidth(%x) - MaskedBits
+ const APInt KeptBits = BitWidth - MaskedBits;
+ assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");
+ // ICmpCst = (1 << KeptBits)
+ const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
+ assert(ICmpCst.isPowerOf2());
+ // AddCst = (1 << (KeptBits-1))
+ const APInt AddCst = ICmpCst.lshr(1);
+ assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());
+
+ // T0 = add %x, AddCst
+ Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
+ // T1 = T0 DstPred ICmpCst
+ Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst));
+
+ return T1;
+}
+
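Editorial note (illustrative only): the "lossy signed truncation" fold above says that ((x << MaskedBits) a>> MaskedBits) == x -- i.e. x round-trips through a sign-extended KeptBits-wide field -- is equivalent to (x + (1 << (KeptBits-1))) u< (1 << KeptBits). An exhaustive check over i8 with MaskedBits = 4, assuming two's-complement narrowing conversions on the host:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned MaskedBits = 4, KeptBits = 8 - MaskedBits;
  for (int xi = -128; xi <= 127; ++xi) {
    int8_t x = int8_t(xi);
    // ((x << MaskedBits) a>> MaskedBits) == x, computed in 8 bits.
    uint8_t Shifted = uint8_t(uint8_t(x) << MaskedBits);
    int8_t RoundTrip = int8_t(int8_t(Shifted) >> MaskedBits);
    bool FitsSigned = (RoundTrip == x);                  // x is in [-8, 7]
    // (add x, 1 << (KeptBits-1)) u< (1 << KeptBits), with a wrapping i8 add.
    uint8_t Biased = uint8_t(x) + (1u << (KeptBits - 1));
    bool Folded = Biased < (1u << KeptBits);
    assert(FitsSigned == Folded);
  }
  return 0;
}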
/// Try to fold icmp (binop), X or icmp X, (binop).
/// TODO: A large part of this logic is duplicated in InstSimplify's
/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
@@ -3011,17 +3203,22 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
// icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
if (A == Op1 && NoOp0WrapProblem)
return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
-
// icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
if (C == Op0 && NoOp1WrapProblem)
return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+ // (A - B) >u A --> A <u B
+ if (A == Op1 && Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, A, B);
+ // C <u (C - D) --> C <u D
+ if (C == Op0 && Pred == ICmpInst::ICMP_ULT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, C, D);
+
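Editorial note (sketch, not from the patch): the two new folds just above rest on the standard unsigned identity that a - b wraps exactly when a u< b, and only then is the wrapped difference larger than a. A brute-force confirmation over i8:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned ai = 0; ai <= 0xFF; ++ai) {
    for (unsigned bi = 0; bi <= 0xFF; ++bi) {
      uint8_t A = uint8_t(ai), B = uint8_t(bi);
      // (A - B) >u A --> A <u B, and equivalently C <u (C - D) --> C <u D.
      assert((uint8_t(A - B) > A) == (A < B));
    }
  }
  return 0;
}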
// icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
// Try not to increase register pressure.
BO0->hasOneUse() && BO1->hasOneUse())
return new ICmpInst(Pred, A, C);
-
// icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
// Try not to increase register pressure.
@@ -3032,8 +3229,8 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
Value *X;
if (match(BO0, m_Neg(m_Value(X))))
- if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
- if (!RHSC->isMinValue(/*isSigned=*/true))
+ if (Constant *RHSC = dyn_cast<Constant>(Op1))
+ if (RHSC->isNotMinSignedValue())
return new ICmpInst(I.getSwappedPredicate(), X,
ConstantExpr::getNeg(RHSC));
}
@@ -3160,6 +3357,12 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
}
}
+ if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
return nullptr;
}
@@ -3414,8 +3617,15 @@ Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
// Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
// integer type is the same size as the pointer type.
+ const auto& CompatibleSizes = [&](Type* SrcTy, Type* DestTy) -> bool {
+ if (isa<VectorType>(SrcTy)) {
+ SrcTy = cast<VectorType>(SrcTy)->getElementType();
+ DestTy = cast<VectorType>(DestTy)->getElementType();
+ }
+ return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth();
+ };
if (LHSCI->getOpcode() == Instruction::PtrToInt &&
- DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
+ CompatibleSizes(SrcTy, DestTy)) {
Value *RHSOp = nullptr;
if (auto *RHSC = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
Value *RHSCIOp = RHSC->getOperand(0);
@@ -3618,7 +3828,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
return false;
}
-/// \brief Recognize and process idiom involving test for multiplication
+/// Recognize and process idiom involving test for multiplication
/// overflow.
///
/// The caller has matched a pattern of the form:
@@ -3799,7 +4009,8 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
// mul.with.overflow and adjust properly mask/size.
if (MulVal->hasNUsesOrMore(2)) {
Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
- for (User *U : MulVal->users()) {
+ for (auto UI = MulVal->user_begin(), UE = MulVal->user_end(); UI != UE;) {
+ User *U = *UI++;
if (U == &I || U == OtherVal)
continue;
if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
@@ -3890,48 +4101,33 @@ static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
}
}
-/// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst
+/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
/// should be swapped.
/// The decision is based on how many times these two operands are reused
/// as subtract operands and their positions in those instructions.
-/// The rational is that several architectures use the same instruction for
-/// both subtract and cmp, thus it is better if the order of those operands
+/// The rationale is that several architectures use the same instruction for
+/// both subtract and cmp. Thus, it is better if the order of those operands
/// match.
/// \return true if Op0 and Op1 should be swapped.
-static bool swapMayExposeCSEOpportunities(const Value * Op0,
- const Value * Op1) {
- // Filter out pointer value as those cannot appears directly in subtract.
+static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
+ // Filter out pointer values as those cannot appear directly in subtract.
// FIXME: we may want to go through inttoptrs or bitcasts.
if (Op0->getType()->isPointerTy())
return false;
- // Count every uses of both Op0 and Op1 in a subtract.
- // Each time Op0 is the first operand, count -1: swapping is bad, the
- // subtract has already the same layout as the compare.
- // Each time Op0 is the second operand, count +1: swapping is good, the
- // subtract has a different layout as the compare.
- // At the end, if the benefit is greater than 0, Op0 should come second to
- // expose more CSE opportunities.
- int GlobalSwapBenefits = 0;
+ // If a subtract already has the same operands as a compare, swapping would be
+ // bad. If a subtract has the same operands as a compare but in reverse order,
+ // then swapping is good.
+ int GoodToSwap = 0;
for (const User *U : Op0->users()) {
- const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(U);
- if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
- continue;
- // If Op0 is the first argument, this is not beneficial to swap the
- // arguments.
- int LocalSwapBenefits = -1;
- unsigned Op1Idx = 1;
- if (BinOp->getOperand(Op1Idx) == Op0) {
- Op1Idx = 0;
- LocalSwapBenefits = 1;
- }
- if (BinOp->getOperand(Op1Idx) != Op1)
- continue;
- GlobalSwapBenefits += LocalSwapBenefits;
+ if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
+ GoodToSwap++;
+ else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
+ GoodToSwap--;
}
- return GlobalSwapBenefits > 0;
+ return GoodToSwap > 0;
}
-/// \brief Check that one use is in the same block as the definition and all
+/// Check that one use is in the same block as the definition and all
/// other uses are in blocks dominated by a given block.
///
/// \param DI Definition
@@ -3976,7 +4172,7 @@ static bool isChainSelectCmpBranch(const SelectInst *SI) {
return true;
}
-/// \brief True when a select result is replaced by one of its operands
+/// True when a select result is replaced by one of its operands
/// in select-icmp sequence. This will eventually result in the elimination
/// of the select.
///
@@ -4052,7 +4248,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
// Get scalar or pointer size.
unsigned BitWidth = Ty->isIntOrIntVectorTy()
? Ty->getScalarSizeInBits()
- : DL.getTypeSizeInBits(Ty->getScalarType());
+ : DL.getIndexTypeSizeInBits(Ty->getScalarType());
if (!BitWidth)
return nullptr;
@@ -4082,13 +4278,13 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
}
- // If Min and Max are known to be the same, then SimplifyDemandedBits
- // figured out that the LHS is a constant. Constant fold this now, so that
+ // If Min and Max are known to be the same, then SimplifyDemandedBits figured
+ // out that the LHS or RHS is a constant. Constant fold this now, so that
// code below can assume that Min != Max.
if (!isa<Constant>(Op0) && Op0Min == Op0Max)
- return new ICmpInst(Pred, ConstantInt::get(Op0->getType(), Op0Min), Op1);
+ return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
if (!isa<Constant>(Op1) && Op1Min == Op1Max)
- return new ICmpInst(Pred, Op0, ConstantInt::get(Op1->getType(), Op1Min));
+ return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
// Based on the range information we know about the LHS, see if we can
// simplify this comparison. For example, (x&4) < 8 is always true.
@@ -4520,6 +4716,34 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return New;
}
+ // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
+ Value *X;
+ if (match(Op0, m_BitCast(m_SIToFP(m_Value(X))))) {
+ // icmp eq (bitcast (sitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (sitofp X)), 0 --> icmp ne X, 0
+ // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
+ // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) &&
+ match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
+ // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
+ if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));
+
+ // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
+ if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
+ return new ICmpInst(Pred, X, ConstantInt::getAllOnesValue(X->getType()));
+ }
+
+ // Zero-equality checks are preserved through unsigned floating-point casts:
+ // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
+ if (match(Op0, m_BitCast(m_UIToFP(m_Value(X)))))
+ if (I.isEquality() && match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
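Editorial note (illustrative only, requires C++20 std::bit_cast): the new sitofp/bitcast folds rely on two properties of the bit pattern of a converted integer -- it is all-zero exactly when the integer is zero (sitofp never yields -0.0), and its sign bit matches the integer's sign. A host-side check over a range where the conversion is exact:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x = -1000; x <= 1000; ++x) {
    uint32_t Bits = std::bit_cast<uint32_t>(static_cast<float>(x)); // bitcast (sitofp x)
    assert((Bits == 0) == (x == 0));            // icmp eq/ne ..., 0  --> icmp eq/ne x, 0
    assert((int32_t(Bits) < 0) == (x < 0));     // icmp slt ..., 0    --> icmp slt x, 0
    assert((int32_t(Bits) > -1) == (x > -1));   // icmp sgt ..., -1   --> icmp sgt x, -1
  }
  return 0;
}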
// Test to see if the operands of the icmp are casted versions of other
// values. If the ptr->ptr cast can be stripped off both arguments, we do so
// now.
@@ -4642,6 +4866,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
return foldICmpAddOpConst(X, Cst, I.getSwappedPredicate());
}
+
return Changed ? &I : nullptr;
}
@@ -4928,11 +5153,11 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
// If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
// then canonicalize the operand to 0.0.
if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
- if (!match(Op0, m_Zero()) && isKnownNeverNaN(Op0)) {
+ if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0)) {
I.setOperand(0, ConstantFP::getNullValue(Op0->getType()));
return &I;
}
- if (!match(Op1, m_Zero()) && isKnownNeverNaN(Op1)) {
+ if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1)) {
I.setOperand(1, ConstantFP::getNullValue(Op0->getType()));
return &I;
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index f1f66d86cb73..58ef3d41415c 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -20,6 +20,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -40,7 +41,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
@@ -122,17 +122,17 @@ static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) {
return V;
}
-/// \brief Add one to a Constant
+/// Add one to a Constant
static inline Constant *AddOne(Constant *C) {
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
}
-/// \brief Subtract one from a Constant
+/// Subtract one from a Constant
static inline Constant *SubOne(Constant *C) {
return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
}
-/// \brief Return true if the specified value is free to invert (apply ~ to).
+/// Return true if the specified value is free to invert (apply ~ to).
/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses
/// is true, work under the assumption that the caller intends to remove all
/// uses of V and only keep uses of ~V.
@@ -178,7 +178,7 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
return false;
}
-/// \brief Specific patterns of overflow check idioms that we match.
+/// Specific patterns of overflow check idioms that we match.
enum OverflowCheckFlavor {
OCF_UNSIGNED_ADD,
OCF_SIGNED_ADD,
@@ -190,7 +190,7 @@ enum OverflowCheckFlavor {
OCF_INVALID
};
-/// \brief Returns the OverflowCheckFlavor corresponding to a overflow_with_op
+/// Returns the OverflowCheckFlavor corresponding to a overflow_with_op
/// intrinsic.
static inline OverflowCheckFlavor
IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
@@ -212,7 +212,62 @@ IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
}
}
-/// \brief The core instruction combiner logic.
+/// Some binary operators require special handling to avoid poison and undefined
+/// behavior. If a constant vector has undef elements, replace those undefs with
+/// identity constants if possible because those are always safe to execute.
+/// If no identity constant exists, replace undef with some other safe constant.
+static inline Constant *getSafeVectorConstantForBinop(
+ BinaryOperator::BinaryOps Opcode, Constant *In, bool IsRHSConstant) {
+ assert(In->getType()->isVectorTy() && "Not expecting scalars here");
+
+ Type *EltTy = In->getType()->getVectorElementType();
+ auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant);
+ if (!SafeC) {
+ // TODO: Should this be available as a constant utility function? It is
+ // similar to getBinOpAbsorber().
+ if (IsRHSConstant) {
+ switch (Opcode) {
+ case Instruction::SRem: // X % 1 = 0
+ case Instruction::URem: // X %u 1 = 0
+ SafeC = ConstantInt::get(EltTy, 1);
+ break;
+ case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe)
+ SafeC = ConstantFP::get(EltTy, 1.0);
+ break;
+ default:
+ llvm_unreachable("Only rem opcodes have no identity constant for RHS");
+ }
+ } else {
+ switch (Opcode) {
+ case Instruction::Shl: // 0 << X = 0
+ case Instruction::LShr: // 0 >>u X = 0
+ case Instruction::AShr: // 0 >> X = 0
+ case Instruction::SDiv: // 0 / X = 0
+ case Instruction::UDiv: // 0 /u X = 0
+ case Instruction::SRem: // 0 % X = 0
+ case Instruction::URem: // 0 %u X = 0
+ case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe)
+ case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe)
+ case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe)
+ case Instruction::FRem: // 0.0 % X = 0
+ SafeC = Constant::getNullValue(EltTy);
+ break;
+ default:
+ llvm_unreachable("Expected to find identity constant for opcode");
+ }
+ }
+ }
+ assert(SafeC && "Must have safe constant for binop");
+ unsigned NumElts = In->getType()->getVectorNumElements();
+ SmallVector<Constant *, 16> Out(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *C = In->getAggregateElement(i);
+ Out[i] = isa<UndefValue>(C) ? SafeC : C;
+ }
+ return ConstantVector::get(Out);
+}
+
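Editor's illustration (assumed operands, not from the patch): the helper only rewrites the constant so that every lane is safe to execute; for example, an undef lane in a vector divisor is replaced with the safe constant 1, so the operation cannot divide by zero in that lane.

    ; constant divisor with an undef lane
    %r = urem <2 x i32> %x, <i32 undef, i32 8>
    ; after substituting the safe constant 1 for the undef lane
    %r = urem <2 x i32> %x, <i32 1, i32 8>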
+/// The core instruction combiner logic.
///
/// This class provides both the logic to recursively visit instructions and
/// combine them.
@@ -220,10 +275,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
: public InstVisitor<InstCombiner, Instruction *> {
// FIXME: These members shouldn't be public.
public:
- /// \brief A worklist of the instructions that need to be simplified.
+ /// A worklist of the instructions that need to be simplified.
InstCombineWorklist &Worklist;
- /// \brief An IRBuilder that automatically inserts new instructions into the
+ /// An IRBuilder that automatically inserts new instructions into the
/// worklist.
using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
BuilderTy &Builder;
@@ -261,7 +316,7 @@ public:
ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI) {}
- /// \brief Run the combiner over the entire worklist until it is empty.
+ /// Run the combiner over the entire worklist until it is empty.
///
/// \returns true if the IR is changed.
bool run();
@@ -289,8 +344,6 @@ public:
Instruction *visitSub(BinaryOperator &I);
Instruction *visitFSub(BinaryOperator &I);
Instruction *visitMul(BinaryOperator &I);
- Value *foldFMulConst(Instruction *FMulOrDiv, Constant *C,
- Instruction *InsertBefore);
Instruction *visitFMul(BinaryOperator &I);
Instruction *visitURem(BinaryOperator &I);
Instruction *visitSRem(BinaryOperator &I);
@@ -378,7 +431,6 @@ private:
bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
bool shouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
- Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices);
@@ -393,7 +445,7 @@ private:
/// if it cannot already be eliminated by some other transformation.
bool shouldOptimizeCast(CastInst *CI);
- /// \brief Try to optimize a sequence of instructions checking if an operation
+ /// Try to optimize a sequence of instructions checking if an operation
/// on LHS and RHS overflows.
///
/// If this overflow check is done via one of the overflow check intrinsics,
@@ -445,11 +497,22 @@ private:
}
bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const;
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const;
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const;
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
const Instruction &CxtI) const {
@@ -462,6 +525,7 @@ private:
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
Instruction *narrowBinOp(TruncInst &Trunc);
+ Instruction *narrowMaskedBinOp(BinaryOperator &And);
Instruction *narrowRotate(TruncInst &Trunc);
Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
@@ -490,7 +554,7 @@ private:
Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
bool JoinedByAnd, Instruction &CxtI);
public:
- /// \brief Inserts an instruction \p New before instruction \p Old
+ /// Inserts an instruction \p New before instruction \p Old
///
/// Also adds the new instruction to the worklist and returns \p New so that
/// it is suitable for use as the return from the visitation patterns.
@@ -503,13 +567,13 @@ public:
return New;
}
- /// \brief Same as InsertNewInstBefore, but also sets the debug loc.
+ /// Same as InsertNewInstBefore, but also sets the debug loc.
Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
New->setDebugLoc(Old.getDebugLoc());
return InsertNewInstBefore(New, Old);
}
- /// \brief A combiner-aware RAUW-like routine.
+ /// A combiner-aware RAUW-like routine.
///
/// This method is to be used when an instruction is found to be dead,
/// replaceable with another preexisting expression. Here we add all uses of
@@ -527,8 +591,8 @@ public:
if (&I == V)
V = UndefValue::get(I.getType());
- DEBUG(dbgs() << "IC: Replacing " << I << "\n"
- << " with " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n"
+ << " with " << *V << '\n');
I.replaceAllUsesWith(V);
return &I;
@@ -544,13 +608,13 @@ public:
return InsertValueInst::Create(Struct, Result, 0);
}
- /// \brief Combiner aware instruction erasure.
+ /// Combiner aware instruction erasure.
///
/// When dealing with an instruction that has side effects or produces a void
/// value, we can't rely on DCE to delete the instruction. Instead, visit
/// methods should return the value returned by this function.
Instruction *eraseInstFromFunction(Instruction &I) {
- DEBUG(dbgs() << "IC: ERASE " << I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
assert(I.use_empty() && "Cannot erase instruction that is used!");
salvageDebugInfo(I);
@@ -599,6 +663,12 @@ public:
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForSignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
const Value *RHS,
const Instruction *CxtI) const {
@@ -611,15 +681,26 @@ public:
return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
/// Maximum size of array considered when transforming.
uint64_t MaxArraySizeForCombine;
private:
- /// \brief Performs a few simplifications for operators which are associative
+ /// Performs a few simplifications for operators which are associative
/// or commutative.
bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
- /// \brief Tries to simplify binary operations which some other binary
+ /// Tries to simplify binary operations which some other binary
/// operation distributes over.
///
/// It does this by either by factorizing out common terms (eg "(A*B)+(A*C)"
@@ -628,6 +709,13 @@ private:
/// value, or null if it didn't simplify.
Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+ /// Tries to simplify add operations using the definition of remainder.
+ ///
+ /// The definition of remainder is X % C = X - (X / C) * C. The add
+ /// expression X % C0 + ((X / C0) % C1) * C0 can be simplified to
+ /// X % (C0 * C1).
+ Value *SimplifyAddWithRemainder(BinaryOperator &I);
+
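A worked instance of the identity above (editor's numbers): with X = 23, C0 = 4, C1 = 5, X % C0 = 3 and ((X / C0) % C1) * C0 = (5 % 5) * 4 = 0, so the sum is 3, which equals X % (C0 * C1) = 23 % 20 = 3.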
// Binary Op helper for select operations where the expression can be
// efficiently reorganized.
Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
@@ -647,7 +735,7 @@ private:
ConstantInt *&Less, ConstantInt *&Equal,
ConstantInt *&Greater);
- /// \brief Attempts to replace V with a simpler value based on the demanded
+ /// Attempts to replace V with a simpler value based on the demanded
/// bits.
Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
unsigned Depth, Instruction *CxtI);
@@ -669,15 +757,19 @@ private:
Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
- /// \brief Tries to simplify operands to an integer instruction based on its
+ /// Tries to simplify operands to an integer instruction based on its
/// demanded bits.
bool SimplifyDemandedInstructionBits(Instruction &Inst);
+ Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+ APInt DemandedElts,
+ int DmaskIdx = -1);
+
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts, unsigned Depth = 0);
- Value *SimplifyVectorOp(BinaryOperator &Inst);
-
+ /// Canonicalize the position of binops relative to shufflevector.
+ Instruction *foldShuffledBinop(BinaryOperator &Inst);
/// Given a binary operator, cast instruction, or select which has a PHI node
/// as operand #0, see if we can fold the instruction into the PHI (which is
@@ -691,11 +783,11 @@ private:
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
/// This is a convenience wrapper function for the above two functions.
- Instruction *foldOpWithConstantIntoOperand(BinaryOperator &I);
+ Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
Instruction *foldAddWithConstant(BinaryOperator &Add);
- /// \brief Try to rotate an operation below a PHI node, using PHI nodes for
+ /// Try to rotate an operation below a PHI node, using PHI nodes for
/// its operands.
Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
@@ -735,6 +827,8 @@ private:
Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select,
ConstantInt *C);
+ Instruction *foldICmpBitCastConstant(ICmpInst &Cmp, BitCastInst *Bitcast,
+ const APInt &C);
Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc,
const APInt &C);
Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
@@ -789,13 +883,12 @@ private:
Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
- Instruction *SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI);
- Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
- Instruction *SimplifyMemSet(MemSetInst *MI);
+ Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
+ Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI);
Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
- /// \brief Returns a value X such that Val = X * Scale, or null if none.
+ /// Returns a value X such that Val = X * Scale, or null if none.
///
/// If the multiplication is known not to overflow then NoSignedWrap is set.
Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index d4f06e18b957..742caf649007 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -23,7 +24,6 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@@ -270,7 +270,7 @@ void PointerReplacer::findLoadAndReplace(Instruction &I) {
auto *Inst = dyn_cast<Instruction>(&*U);
if (!Inst)
return;
- DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
+ LLVM_DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
if (isa<LoadInst>(Inst)) {
for (auto P : Path)
replace(P);
@@ -405,8 +405,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
if (AI.getAlignment() <= SourceAlign &&
isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
- DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
- DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+ LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
eraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
@@ -437,10 +437,10 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Are we allowed to form a atomic load or store of this type?
static bool isSupportedAtomicType(Type *Ty) {
- return Ty->isIntegerTy() || Ty->isPointerTy() || Ty->isFloatingPointTy();
+ return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
}
-/// \brief Helper to combine a load to a new type.
+/// Helper to combine a load to a new type.
///
/// This just does the work of combining a load to a new type. It handles
/// metadata, etc., and returns the new instruction. The \c NewTy should be the
@@ -453,15 +453,20 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
const Twine &Suffix = "") {
assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
"can't fold an atomic load to requested type");
-
+
Value *Ptr = LI.getPointerOperand();
unsigned AS = LI.getPointerAddressSpace();
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
LI.getAllMetadata(MD);
+ Value *NewPtr = nullptr;
+ if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
+ NewPtr->getType()->getPointerElementType() == NewTy &&
+ NewPtr->getType()->getPointerAddressSpace() == AS))
+ NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
+
LoadInst *NewLoad = IC.Builder.CreateAlignedLoad(
- IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
- LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
+ NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
MDBuilder MDB(NewLoad->getContext());
for (const auto &MDPair : MD) {
@@ -507,7 +512,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
return NewLoad;
}
-/// \brief Combine a store to a new type.
+/// Combine a store to a new type.
///
/// Returns the newly created store instruction.
static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) {
@@ -584,7 +589,7 @@ static bool isMinMaxWithLoads(Value *V) {
match(L2, m_Load(m_Specific(LHS))));
}
-/// \brief Combine loads to match the type of their uses' value after looking
+/// Combine loads to match the type of their uses' value after looking
/// through intervening bitcasts.
///
/// The core idea here is that if the result of a load is used in an operation,
@@ -959,23 +964,26 @@ static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr,
}
static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
- if (SI.getPointerAddressSpace() != 0)
+ if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
return false;
auto *Ptr = SI.getPointerOperand();
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
Ptr = GEPI->getOperand(0);
- return isa<ConstantPointerNull>(Ptr);
+ return (isa<ConstantPointerNull>(Ptr) &&
+ !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
}
static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
const Value *GEPI0 = GEPI->getOperand(0);
- if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0)
+ if (isa<ConstantPointerNull>(GEPI0) &&
+ !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
return true;
}
if (isa<UndefValue>(Op) ||
- (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0))
+ (isa<ConstantPointerNull>(Op) &&
+ !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
return true;
return false;
}
@@ -1071,14 +1079,16 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// load (select (cond, null, P)) -> load P
if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
- LI.getPointerAddressSpace() == 0) {
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace())) {
LI.setOperand(0, SI->getOperand(2));
return &LI;
}
// load (select (cond, P, null)) -> load P
if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
- LI.getPointerAddressSpace() == 0) {
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace())) {
LI.setOperand(0, SI->getOperand(1));
return &LI;
}
@@ -1087,7 +1097,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
return nullptr;
}
-/// \brief Look for extractelement/insertvalue sequence that acts like a bitcast.
+/// Look for extractelement/insertvalue sequence that acts like a bitcast.
///
/// \returns underlying value that was "cast", or nullptr otherwise.
///
@@ -1142,7 +1152,7 @@ static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
return U;
}
-/// \brief Combine stores to match the type of value being stored.
+/// Combine stores to match the type of value being stored.
///
/// The core idea here is that the memory does not have any intrinsic type and
/// where we can we should match the type of a store to the type of value being
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 38604830b885..63761d427235 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -33,6 +33,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -94,115 +95,52 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
return MadeChange ? V : nullptr;
}
-/// True if the multiply can not be expressed in an int this size.
-static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
- bool IsSigned) {
- bool Overflow;
- if (IsSigned)
- Product = C1.smul_ov(C2, Overflow);
- else
- Product = C1.umul_ov(C2, Overflow);
-
- return Overflow;
-}
-
-/// \brief True if C2 is a multiple of C1. Quotient contains C2/C1.
-static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
- bool IsSigned) {
- assert(C1.getBitWidth() == C2.getBitWidth() &&
- "Inconsistent width of constants!");
-
- // Bail if we will divide by zero.
- if (C2.isMinValue())
- return false;
-
- // Bail if we would divide INT_MIN by -1.
- if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
- return false;
-
- APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned);
- if (IsSigned)
- APInt::sdivrem(C1, C2, Quotient, Remainder);
- else
- APInt::udivrem(C1, C2, Quotient, Remainder);
-
- return Remainder.isMinValue();
-}
-
-/// \brief A helper routine of InstCombiner::visitMul().
+/// A helper routine of InstCombiner::visitMul().
///
-/// If C is a vector of known powers of 2, then this function returns
-/// a new vector obtained from C replacing each element with its logBase2.
+/// If C is a scalar/vector of known powers of 2, then this function returns
+/// a new scalar/vector where each element is the logBase2 of the
+/// corresponding element of C.
/// Return a null pointer otherwise.
-static Constant *getLogBase2Vector(ConstantDataVector *CV) {
+static Constant *getLogBase2(Type *Ty, Constant *C) {
const APInt *IVal;
- SmallVector<Constant *, 4> Elts;
+ if (match(C, m_APInt(IVal)) && IVal->isPowerOf2())
+ return ConstantInt::get(Ty, IVal->logBase2());
+
+ if (!Ty->isVectorTy())
+ return nullptr;
- for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) {
- Constant *Elt = CV->getElementAsConstant(I);
+ SmallVector<Constant *, 4> Elts;
+ for (unsigned I = 0, E = Ty->getVectorNumElements(); I != E; ++I) {
+ Constant *Elt = C->getAggregateElement(I);
+ if (!Elt)
+ return nullptr;
+ if (isa<UndefValue>(Elt)) {
+ Elts.push_back(UndefValue::get(Ty->getScalarType()));
+ continue;
+ }
if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2())
return nullptr;
- Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2()));
+ Elts.push_back(ConstantInt::get(Ty->getScalarType(), IVal->logBase2()));
}
return ConstantVector::get(Elts);
}
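Editor's sketch of the intended use (assumed values, not from the patch): a constant multiplier whose defined lanes are powers of two is converted lane-wise to a shift amount, with undef lanes kept as undef.

    ; before
    %m = mul <2 x i32> %x, <i32 4, i32 undef>
    ; after (log2 of each power-of-2 lane)
    %m = shl <2 x i32> %x, <i32 2, i32 undef>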
-/// \brief Return true if we can prove that:
-/// (mul LHS, RHS) === (mul nsw LHS, RHS)
-bool InstCombiner::willNotOverflowSignedMul(const Value *LHS,
- const Value *RHS,
- const Instruction &CxtI) const {
- // Multiplying n * m significant bits yields a result of n + m significant
- // bits. If the total number of significant bits does not exceed the
- // result bit width (minus 1), there is no overflow.
- // This means if we have enough leading sign bits in the operands
- // we can guarantee that the result does not overflow.
- // Ref: "Hacker's Delight" by Henry Warren
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
-
- // Note that underestimating the number of sign bits gives a more
- // conservative answer.
- unsigned SignBits =
- ComputeNumSignBits(LHS, 0, &CxtI) + ComputeNumSignBits(RHS, 0, &CxtI);
-
- // First handle the easy case: if we have enough sign bits there's
- // definitely no overflow.
- if (SignBits > BitWidth + 1)
- return true;
-
- // There are two ambiguous cases where there can be no overflow:
- // SignBits == BitWidth + 1 and
- // SignBits == BitWidth
- // The second case is difficult to check, therefore we only handle the
- // first case.
- if (SignBits == BitWidth + 1) {
- // It overflows only when both arguments are negative and the true
- // product is exactly the minimum negative number.
- // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
- // For simplicity we just check if at least one side is not negative.
- KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
- KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
- if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
- return true;
- }
- return false;
-}
-
Instruction *InstCombiner::visitMul(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyMulInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
// X * -1 == 0 - X
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (match(Op1, m_AllOnes())) {
BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
if (I.hasNoSignedWrap())
@@ -231,16 +169,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
- Constant *NewCst = nullptr;
- if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2())
- // Replace X*(2^C) with X << C, where C is either a scalar or a splat.
- NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2());
- else if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(C1))
- // Replace X*(2^C) with X << C, where C is a vector of known
- // constant powers of 2.
- NewCst = getLogBase2Vector(CV);
-
- if (NewCst) {
+ // Replace X*(2^C) with X << C, where C is either a scalar or a vector.
+ if (Constant *NewCst = getLogBase2(NewOp->getType(), C1)) {
unsigned Width = NewCst->getType()->getPrimitiveSizeInBits();
BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
@@ -282,34 +212,37 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
+
// Simplify mul instructions with a constant RHS.
if (isa<Constant>(Op1)) {
- if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
- return FoldedMul;
-
// Canonicalize (X+C1)*CI -> X*CI+C1*CI.
- {
- Value *X;
- Constant *C1;
- if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
- Value *Mul = Builder.CreateMul(C1, Op1);
- // Only go forward with the transform if C1*CI simplifies to a tidier
- // constant.
- if (!match(Mul, m_Mul(m_Value(), m_Value())))
- return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
- }
+ Value *X;
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
+ Value *Mul = Builder.CreateMul(C1, Op1);
+ // Only go forward with the transform if C1*CI simplifies to a tidier
+ // constant.
+ if (!match(Mul, m_Mul(m_Value(), m_Value())))
+ return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
}
}
- if (Value *Op0v = dyn_castNegVal(Op0)) { // -X * -Y = X*Y
- if (Value *Op1v = dyn_castNegVal(Op1)) {
- BinaryOperator *BO = BinaryOperator::CreateMul(Op0v, Op1v);
- if (I.hasNoSignedWrap() &&
- match(Op0, m_NSWSub(m_Value(), m_Value())) &&
- match(Op1, m_NSWSub(m_Value(), m_Value())))
- BO->setHasNoSignedWrap();
- return BO;
- }
+ // -X * C --> X * -C
+ Value *X, *Y;
+ Constant *Op1C;
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));
+
+ // -X * -Y --> X * Y
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) {
+ auto *NewMul = BinaryOperator::CreateMul(X, Y);
+ if (I.hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap())
+ NewMul->setHasNoSignedWrap();
+ return NewMul;
}
// (X / Y) * Y = X - (X % Y)
@@ -371,28 +304,24 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
- // If one of the operands of the multiply is a cast from a boolean value, then
- // we know the bool is either zero or one, so this is a 'masking' multiply.
- // X * Y (where Y is 0 or 1) -> X & (0-Y)
- if (!I.getType()->isVectorTy()) {
- // -2 is "-1 << 1" so it is all bits set except the low one.
- APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
-
- Value *BoolCast = nullptr, *OtherOp = nullptr;
- if (MaskedValueIsZero(Op0, Negative2, 0, &I)) {
- BoolCast = Op0;
- OtherOp = Op1;
- } else if (MaskedValueIsZero(Op1, Negative2, 0, &I)) {
- BoolCast = Op1;
- OtherOp = Op0;
- }
-
- if (BoolCast) {
- Value *V = Builder.CreateSub(Constant::getNullValue(I.getType()),
- BoolCast);
- return BinaryOperator::CreateAnd(V, OtherOp);
- }
- }
+ // (bool X) * Y --> X ? Y : 0
+ // Y * (bool X) --> X ? Y : 0
+ if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
+
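Editor's sketch of the zext-of-bool case (assumed values): multiplying by a 0/1 value is the same as selecting between the other operand and zero.

    ; before: %b is i1, so %z is 0 or 1
    %z = zext i1 %b to i32
    %m = mul i32 %z, %y
    ; after
    %m = select i1 %b, i32 %y, i32 0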
+ // (lshr X, 31) * Y --> (ashr X, 31) & Y
+ // Y * (lshr X, 31) --> (ashr X, 31) & Y
+ // TODO: We are not checking one-use because the elimination of the multiply
+ // is better for analysis?
+ // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be
+ // more similar to what we're doing above.
+ const APInt *C;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1);
+ if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0);
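Editor's sketch for the sign-bit multiplier (assumed 32-bit values): the lshr yields 0 or 1, while the ashr yields 0 or -1, so masking with it keeps %y exactly when %x is negative.

    ; before
    %s = lshr i32 %x, 31
    %m = mul i32 %s, %y
    ; after
    %a = ashr i32 %x, 31
    %m = and i32 %a, %y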
// Check for (mul (sext x), y), see if we can merge this into an
// integer mul followed by a sext.
@@ -466,6 +395,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
+ bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
@@ -479,303 +409,103 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
return Changed ? &I : nullptr;
}
-/// Detect pattern log2(Y * 0.5) with corresponding fast math flags.
-static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
- if (!Op->hasOneUse())
- return;
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
- if (!II)
- return;
- if (II->getIntrinsicID() != Intrinsic::log2 || !II->isFast())
- return;
- Log2 = II;
-
- Value *OpLog2Of = II->getArgOperand(0);
- if (!OpLog2Of->hasOneUse())
- return;
-
- Instruction *I = dyn_cast<Instruction>(OpLog2Of);
- if (!I)
- return;
-
- if (I->getOpcode() != Instruction::FMul || !I->isFast())
- return;
-
- if (match(I->getOperand(0), m_SpecificFP(0.5)))
- Y = I->getOperand(1);
- else if (match(I->getOperand(1), m_SpecificFP(0.5)))
- Y = I->getOperand(0);
-}
-
-static bool isFiniteNonZeroFp(Constant *C) {
- if (C->getType()->isVectorTy()) {
- for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
- ++I) {
- ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
- if (!CFP || !CFP->getValueAPF().isFiniteNonZero())
- return false;
- }
- return true;
- }
-
- return isa<ConstantFP>(C) &&
- cast<ConstantFP>(C)->getValueAPF().isFiniteNonZero();
-}
-
-static bool isNormalFp(Constant *C) {
- if (C->getType()->isVectorTy()) {
- for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
- ++I) {
- ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
- if (!CFP || !CFP->getValueAPF().isNormal())
- return false;
- }
- return true;
- }
-
- return isa<ConstantFP>(C) && cast<ConstantFP>(C)->getValueAPF().isNormal();
-}
-
-/// Helper function of InstCombiner::visitFMul(BinaryOperator(). It returns
-/// true iff the given value is FMul or FDiv with one and only one operand
-/// being a normal constant (i.e. not Zero/NaN/Infinity).
-static bool isFMulOrFDivWithConstant(Value *V) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || (I->getOpcode() != Instruction::FMul &&
- I->getOpcode() != Instruction::FDiv))
- return false;
-
- Constant *C0 = dyn_cast<Constant>(I->getOperand(0));
- Constant *C1 = dyn_cast<Constant>(I->getOperand(1));
-
- if (C0 && C1)
- return false;
-
- return (C0 && isFiniteNonZeroFp(C0)) || (C1 && isFiniteNonZeroFp(C1));
-}
+Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
+ if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
-/// foldFMulConst() is a helper routine of InstCombiner::visitFMul().
-/// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand
-/// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true).
-/// This function is to simplify "FMulOrDiv * C" and returns the
-/// resulting expression. Note that this function could return NULL in
-/// case the constants cannot be folded into a normal floating-point.
-Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, Constant *C,
- Instruction *InsertBefore) {
- assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid");
-
- Value *Opnd0 = FMulOrDiv->getOperand(0);
- Value *Opnd1 = FMulOrDiv->getOperand(1);
-
- Constant *C0 = dyn_cast<Constant>(Opnd0);
- Constant *C1 = dyn_cast<Constant>(Opnd1);
-
- BinaryOperator *R = nullptr;
-
- // (X * C0) * C => X * (C0*C)
- if (FMulOrDiv->getOpcode() == Instruction::FMul) {
- Constant *F = ConstantExpr::getFMul(C1 ? C1 : C0, C);
- if (isNormalFp(F))
- R = BinaryOperator::CreateFMul(C1 ? Opnd0 : Opnd1, F);
- } else {
- if (C0) {
- // (C0 / X) * C => (C0 * C) / X
- if (FMulOrDiv->hasOneUse()) {
- // It would otherwise introduce another div.
- Constant *F = ConstantExpr::getFMul(C0, C);
- if (isNormalFp(F))
- R = BinaryOperator::CreateFDiv(F, Opnd1);
- }
- } else {
- // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
- Constant *F = ConstantExpr::getFDiv(C, C1);
- if (isNormalFp(F)) {
- R = BinaryOperator::CreateFMul(Opnd0, F);
- } else {
- // (X / C1) * C => X / (C1/C)
- Constant *F = ConstantExpr::getFDiv(C1, C);
- if (isNormalFp(F))
- R = BinaryOperator::CreateFDiv(Opnd0, F);
- }
- }
- }
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
- if (R) {
- R->setFast(true);
- InsertNewInstWith(R, *InsertBefore);
- }
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
- return R;
-}
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
-Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
+ // X * -1.0 --> -X
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (match(Op1, m_SpecificFP(-1.0)))
+ return BinaryOperator::CreateFNegFMF(Op0, &I);
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
-
- if (isa<Constant>(Op0))
- std::swap(Op0, Op1);
+ // -X * -Y --> X * Y
+ Value *X, *Y;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateFMulFMF(X, Y, &I);
- if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ // -X * C --> X * -C
+ Constant *C;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
+ return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
- bool AllowReassociate = I.isFast();
+ // Sink negation: -X * Y --> -(X * Y)
+ if (match(Op0, m_OneUse(m_FNeg(m_Value(X)))))
+ return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op1, &I), &I);
- // Simplify mul instructions with a constant RHS.
- if (isa<Constant>(Op1)) {
- if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
- return FoldedMul;
-
- // (fmul X, -1.0) --> (fsub -0.0, X)
- if (match(Op1, m_SpecificFP(-1.0))) {
- Constant *NegZero = ConstantFP::getNegativeZero(Op1->getType());
- Instruction *RI = BinaryOperator::CreateFSub(NegZero, Op0);
- RI->copyFastMathFlags(&I);
- return RI;
- }
+ // Sink negation: Y * -X --> -(X * Y)
+ if (match(Op1, m_OneUse(m_FNeg(m_Value(X)))))
+ return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op0, &I), &I);
- Constant *C = cast<Constant>(Op1);
- if (AllowReassociate && isFiniteNonZeroFp(C)) {
- // Let MDC denote an expression in one of these forms:
- // X * C, C/X, X/C, where C is a constant.
- //
- // Try to simplify "MDC * Constant"
- if (isFMulOrFDivWithConstant(Op0))
- if (Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I))
- return replaceInstUsesWith(I, V);
-
- // (MDC +/- C1) * C => (MDC * C) +/- (C1 * C)
- Instruction *FAddSub = dyn_cast<Instruction>(Op0);
- if (FAddSub &&
- (FAddSub->getOpcode() == Instruction::FAdd ||
- FAddSub->getOpcode() == Instruction::FSub)) {
- Value *Opnd0 = FAddSub->getOperand(0);
- Value *Opnd1 = FAddSub->getOperand(1);
- Constant *C0 = dyn_cast<Constant>(Opnd0);
- Constant *C1 = dyn_cast<Constant>(Opnd1);
- bool Swap = false;
- if (C0) {
- std::swap(C0, C1);
- std::swap(Opnd0, Opnd1);
- Swap = true;
- }
+ // fabs(X) * fabs(X) -> X * X
+ if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X))))
+ return BinaryOperator::CreateFMulFMF(X, X, &I);
- if (C1 && isFiniteNonZeroFp(C1) && isFMulOrFDivWithConstant(Opnd0)) {
- Value *M1 = ConstantExpr::getFMul(C1, C);
- Value *M0 = isNormalFp(cast<Constant>(M1)) ?
- foldFMulConst(cast<Instruction>(Opnd0), C, &I) :
- nullptr;
- if (M0 && M1) {
- if (Swap && FAddSub->getOpcode() == Instruction::FSub)
- std::swap(M0, M1);
-
- Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd)
- ? BinaryOperator::CreateFAdd(M0, M1)
- : BinaryOperator::CreateFSub(M0, M1);
- RI->copyFastMathFlags(&I);
- return RI;
- }
- }
- }
- }
- }
+ // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
+ return replaceInstUsesWith(I, V);
- if (Op0 == Op1) {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
- // sqrt(X) * sqrt(X) -> X
- if (AllowReassociate && II->getIntrinsicID() == Intrinsic::sqrt)
- return replaceInstUsesWith(I, II->getOperand(0));
-
- // fabs(X) * fabs(X) -> X * X
- if (II->getIntrinsicID() == Intrinsic::fabs) {
- Instruction *FMulVal = BinaryOperator::CreateFMul(II->getOperand(0),
- II->getOperand(0),
- I.getName());
- FMulVal->copyFastMathFlags(&I);
- return FMulVal;
+ if (I.hasAllowReassoc()) {
+ // Reassociate constant RHS with another constant to form constant
+ // expression.
+ if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
+ // (C1 / X) * C --> (C * C1) / X
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ if (CC1->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(CC1, X, &I);
}
- }
- }
-
- // Under unsafe algebra do:
- // X * log2(0.5*Y) = X*log2(Y) - X
- if (AllowReassociate) {
- Value *OpX = nullptr;
- Value *OpY = nullptr;
- IntrinsicInst *Log2;
- detectLog2OfHalf(Op0, OpY, Log2);
- if (OpY) {
- OpX = Op1;
- } else {
- detectLog2OfHalf(Op1, OpY, Log2);
- if (OpY) {
- OpX = Op0;
+ if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
+ // (X / C1) * C --> X * (C / C1)
+ Constant *CDivC1 = ConstantExpr::getFDiv(C, C1);
+ if (CDivC1->isNormalFP())
+ return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
+
+ // If the constant was a denormal, try reassociating differently.
+ // (X / C1) * C --> X / (C1 / C)
+ Constant *C1DivC = ConstantExpr::getFDiv(C1, C);
+ if (Op0->hasOneUse() && C1DivC->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
}
- }
- // if pattern detected emit alternate sequence
- if (OpX && OpY) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(Log2->getFastMathFlags());
- Log2->setArgOperand(0, OpY);
- Value *FMulVal = Builder.CreateFMul(OpX, Log2);
- Value *FSub = Builder.CreateFSub(FMulVal, OpX);
- FSub->takeName(&I);
- return replaceInstUsesWith(I, FSub);
- }
- }
- // sqrt(a) * sqrt(b) -> sqrt(a * b)
- if (AllowReassociate &&
- Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *Opnd0 = nullptr;
- Value *Opnd1 = nullptr;
- if (match(Op0, m_Intrinsic<Intrinsic::sqrt>(m_Value(Opnd0))) &&
- match(Op1, m_Intrinsic<Intrinsic::sqrt>(m_Value(Opnd1)))) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- Value *FMulVal = Builder.CreateFMul(Opnd0, Opnd1);
- Value *Sqrt = Intrinsic::getDeclaration(I.getModule(),
- Intrinsic::sqrt, I.getType());
- Value *SqrtCall = Builder.CreateCall(Sqrt, FMulVal);
- return replaceInstUsesWith(I, SqrtCall);
- }
- }
-
- // Handle symmetric situation in a 2-iteration loop
- Value *Opnd0 = Op0;
- Value *Opnd1 = Op1;
- for (int i = 0; i < 2; i++) {
- bool IgnoreZeroSign = I.hasNoSignedZeros();
- if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
-
- Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
- Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
-
- // -X * -Y => X*Y
- if (N1) {
- Value *FMul = Builder.CreateFMul(N0, N1);
- FMul->takeName(&I);
- return replaceInstUsesWith(I, FMul);
+ // We do not need to match 'fadd C, X' and 'fsub X, C' because they are
+ // canonicalized to 'fadd X, C'. Distributing the multiply may allow
+ // further folds and (X * C) + C2 is 'fma'.
+ if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) {
+ // (X + C1) * C --> (X * C) + (C * C1)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFAddFMF(XC, CC1, &I);
}
-
- if (Opnd0->hasOneUse()) {
- // -X * Y => -(X*Y) (Promote negation as high as possible)
- Value *T = Builder.CreateFMul(N0, Opnd1);
- Value *Neg = Builder.CreateFNeg(T);
- Neg->takeName(&I);
- return replaceInstUsesWith(I, Neg);
+ if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) {
+ // (C1 - X) * C --> (C * C1) - (X * C)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFSubFMF(CC1, XC, &I);
}
}
- // Handle specials cases for FMul with selects feeding the operation
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
- return replaceInstUsesWith(I, V);
+ // sqrt(X) * sqrt(Y) -> sqrt(X * Y)
+ // nnan disallows the possibility of returning a number if both operands are
+ // negative (in that case, we should return NaN).
+ if (I.hasNoNaNs() &&
+ match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) &&
+ match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
+ Value *XY = Builder.CreateFMulFMF(X, Y, &I);
+ Value *Sqrt = Builder.CreateIntrinsic(Intrinsic::sqrt, { XY }, &I);
+ return replaceInstUsesWith(I, Sqrt);
+ }
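Editor's sketch (assumed values; both sqrt calls are one-use and the fmul carries reassoc and nnan):

    ; before
    %sx = call float @llvm.sqrt.f32(float %x)
    %sy = call float @llvm.sqrt.f32(float %y)
    %m = fmul reassoc nnan float %sx, %sy
    ; after
    %xy = fmul reassoc nnan float %x, %y
    %m = call reassoc nnan float @llvm.sqrt.f32(float %xy)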
// (X*Y) * X => (X*X) * Y where Y != X
// The purpose is two-fold:
@@ -784,34 +514,40 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
// latency of the instruction Y is amortized by the expression of X*X,
// and therefore Y is in a "less critical" position compared to what it
// was before the transformation.
- if (AllowReassociate) {
- Value *Opnd0_0, *Opnd0_1;
- if (Opnd0->hasOneUse() &&
- match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) {
- Value *Y = nullptr;
- if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1)
- Y = Opnd0_1;
- else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1)
- Y = Opnd0_0;
-
- if (Y) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- Value *T = Builder.CreateFMul(Opnd1, Opnd1);
- Value *R = Builder.CreateFMul(T, Y);
- R->takeName(&I);
- return replaceInstUsesWith(I, R);
- }
- }
+ if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) &&
+ Op1 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
}
+ if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) &&
+ Op0 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
+ }
+ }
- if (!isa<Constant>(Op1))
- std::swap(Opnd0, Opnd1);
- else
- break;
+ // log2(X * 0.5) * Y = log2(X) * Y - Y
+ if (I.isFast()) {
+ IntrinsicInst *Log2 = nullptr;
+ if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op0);
+ Y = Op1;
+ }
+ if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op1);
+ Y = Op0;
+ }
+ if (Log2) {
+ Log2->setArgOperand(0, X);
+ Log2->copyFastMathFlags(&I);
+ Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I);
+ return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
+ }
}
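The identity follows from log2(X * 0.5) = log2(X) - 1, so multiplying by Y gives log2(X) * Y - Y; the code reuses the existing log2 call by rewriting its argument to X and then emitting the fmul/fsub pair.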
- return Changed ? &I : nullptr;
+ return nullptr;
}
/// Fold a divide or remainder with a select instruction divisor when one of the
@@ -852,9 +588,9 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
Type *CondTy = SelectCond->getType();
while (BBI != BBFront) {
--BBI;
- // If we found a call to a function, we can't assume it will return, so
+ // If we found an instruction that we can't assume will return, then
// information from below it cannot be propagated above it.
- if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
break;
// Replace uses of the select or its condition with the known values.
@@ -884,12 +620,44 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
return true;
}
+/// True if the multiply can not be expressed in an int this size.
+static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
+ bool IsSigned) {
+ bool Overflow;
+ Product = IsSigned ? C1.smul_ov(C2, Overflow) : C1.umul_ov(C2, Overflow);
+ return Overflow;
+}
+
+/// True if C1 is a multiple of C2. Quotient contains C1/C2.
+static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
+ bool IsSigned) {
+ assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
+
+ // Bail if we will divide by zero.
+ if (C2.isNullValue())
+ return false;
+
+ // Bail if we would divide INT_MIN by -1.
+ if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
+ return false;
+
+ APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned);
+ if (IsSigned)
+ APInt::sdivrem(C1, C2, Quotient, Remainder);
+ else
+ APInt::udivrem(C1, C2, Quotient, Remainder);
+
+ return Remainder.isMinValue();
+}
+
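For example (editor's numbers), isMultiple(12, 4, Q, /*IsSigned=*/false) returns true with Q = 3, while isMultiple(12, 5, Q, false) returns false because the remainder is 2; the early bail-outs keep the sdivrem/udivrem itself from dividing by zero or computing INT_MIN / -1.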
/// This function implements the transforms common to both integer division
/// instructions (udiv and sdiv). It is called by the visitors to those integer
/// division instructions.
-/// @brief Common integer divide transforms
+/// Common integer divide transforms
Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ bool IsSigned = I.getOpcode() == Instruction::SDiv;
+ Type *Ty = I.getType();
// The RHS is known non-zero.
if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
@@ -902,94 +670,87 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
if (simplifyDivRemOfSelectWithZeroOp(I))
return &I;
- if (Instruction *LHS = dyn_cast<Instruction>(Op0)) {
- const APInt *C2;
- if (match(Op1, m_APInt(C2))) {
- Value *X;
- const APInt *C1;
- bool IsSigned = I.getOpcode() == Instruction::SDiv;
-
- // (X / C1) / C2 -> X / (C1*C2)
- if ((IsSigned && match(LHS, m_SDiv(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(LHS, m_UDiv(m_Value(X), m_APInt(C1))))) {
- APInt Product(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
- if (!MultiplyOverflows(*C1, *C2, Product, IsSigned))
- return BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(I.getType(), Product));
- }
-
- if ((IsSigned && match(LHS, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(LHS, m_NUWMul(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ const APInt *C2;
+ if (match(Op1, m_APInt(C2))) {
+ Value *X;
+ const APInt *C1;
+
+ // (X / C1) / C2 -> X / (C1*C2)
+ if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) {
+ APInt Product(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ if (!multiplyOverflows(*C1, *C2, Product, IsSigned))
+ return BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Product));
+ }
- // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
- if (IsMultiple(*C2, *C1, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
- BO->setIsExact(I.isExact());
- return BO;
- }
+ if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
- // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
- if (IsMultiple(*C1, *C2, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
- BO->setHasNoUnsignedWrap(
- !IsSigned &&
- cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
- BO->setHasNoSignedWrap(
- cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
- return BO;
- }
+ // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
+ if (isMultiple(*C2, *C1, Quotient, IsSigned)) {
+ auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ NewDiv->setIsExact(I.isExact());
+ return NewDiv;
}
- if ((IsSigned && match(LHS, m_NSWShl(m_Value(X), m_APInt(C1))) &&
- *C1 != C1->getBitWidth() - 1) ||
- (!IsSigned && match(LHS, m_NUWShl(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
- APInt C1Shifted = APInt::getOneBitSet(
- C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
-
- // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of C1.
- if (IsMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
- BO->setIsExact(I.isExact());
- return BO;
- }
+ // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
+ if (isMultiple(*C1, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
+ }
- // (X << C1) / C2 -> X * (C2 >> C1) if C1 is a multiple of C2.
- if (IsMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
- BO->setHasNoUnsignedWrap(
- !IsSigned &&
- cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
- BO->setHasNoSignedWrap(
- cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
- return BO;
- }
+ if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
+ *C1 != C1->getBitWidth() - 1) ||
+ (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ APInt C1Shifted = APInt::getOneBitSet(
+ C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
+
+ // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
+ if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
+ auto *BO = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ BO->setIsExact(I.isExact());
+ return BO;
}
- if (!C2->isNullValue()) // avoid X udiv 0
- if (Instruction *FoldedDiv = foldOpWithConstantIntoOperand(I))
- return FoldedDiv;
+ // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2.
+ if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
}
+
+ if (!C2->isNullValue()) // avoid X udiv 0
+ if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
+ return FoldedDiv;
}
if (match(Op0, m_One())) {
- assert(!I.getType()->isIntOrIntVectorTy(1) && "i1 divide not removed?");
- if (I.getOpcode() == Instruction::SDiv) {
+ assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?");
+ if (IsSigned) {
// If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
// result is one, if Op1 is -1 then the result is minus one, otherwise
// it's zero.
Value *Inc = Builder.CreateAdd(Op1, Op0);
- Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(I.getType(), 3));
- return SelectInst::Create(Cmp, Op1, ConstantInt::get(I.getType(), 0));
+ Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3));
+ return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0));
} else {
// If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
// result is one, otherwise it's zero.
- return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), I.getType());
+ return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty);
}
}
@@ -998,12 +759,28 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
return &I;
// (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
- Value *X = nullptr, *Z = nullptr;
- if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1
- bool isSigned = I.getOpcode() == Instruction::SDiv;
- if ((isSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
- (!isSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
+ Value *X, *Z;
+ if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1
+ if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
+ (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
return BinaryOperator::Create(I.getOpcode(), X, Op1);
+
+ // (X << Y) / X -> 1 << Y
+ Value *Y;
+ if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y);
+ if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y);
+
+ // X / (X * Y) -> 1 / Y if the multiplication does not overflow.
+ if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) {
+ bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
+ bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
+ if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
+ I.setOperand(0, ConstantInt::get(Ty, 1));
+ I.setOperand(1, Y);
+ return &I;
+ }
}
return nullptr;
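Editor's sketch of the last fold in its unsigned form (assumed values; nuw guarantees the product did not wrap):

    ; before
    %p = mul nuw i32 %x, %y
    %d = udiv i32 %x, %p
    ; after
    %d = udiv i32 1, %y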
@@ -1017,7 +794,7 @@ using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
const BinaryOperator &I,
InstCombiner &IC);
-/// \brief Used to maintain state for visitUDivOperand().
+/// Used to maintain state for visitUDivOperand().
struct UDivFoldAction {
/// Informs visitUDiv() how to fold this operand. This can be zero if this
/// action joins two actions together.
@@ -1045,23 +822,15 @@ struct UDivFoldAction {
// X udiv 2^C -> X >> C
static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
const BinaryOperator &I, InstCombiner &IC) {
- const APInt &C = cast<Constant>(Op1)->getUniqueInteger();
- BinaryOperator *LShr = BinaryOperator::CreateLShr(
- Op0, ConstantInt::get(Op0->getType(), C.logBase2()));
+ Constant *C1 = getLogBase2(Op0->getType(), cast<Constant>(Op1));
+ if (!C1)
+ llvm_unreachable("Failed to constant fold udiv -> logbase2");
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
if (I.isExact())
LShr->setIsExact();
return LShr;
}
-// X udiv C, where C >= signbit
-static Instruction *foldUDivNegCst(Value *Op0, Value *Op1,
- const BinaryOperator &I, InstCombiner &IC) {
- Value *ICI = IC.Builder.CreateICmpULT(Op0, cast<ConstantInt>(Op1));
-
- return SelectInst::Create(ICI, Constant::getNullValue(I.getType()),
- ConstantInt::get(I.getType(), 1));
-}
-
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
@@ -1070,12 +839,14 @@ static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
ShiftLeft = Op1;
- const APInt *CI;
+ Constant *CI;
Value *N;
- if (!match(ShiftLeft, m_Shl(m_APInt(CI), m_Value(N))))
+ if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
llvm_unreachable("match should never fail here!");
- if (*CI != 1)
- N = IC.Builder.CreateAdd(N, ConstantInt::get(N->getType(), CI->logBase2()));
+ Constant *Log2Base = getLogBase2(N->getType(), CI);
+ if (!Log2Base)
+ llvm_unreachable("getLogBase2 should never fail here!");
+ N = IC.Builder.CreateAdd(N, Log2Base);
if (Op1 != ShiftLeft)
N = IC.Builder.CreateZExt(N, Op1->getType());
BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
@@ -1084,7 +855,7 @@ static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
return LShr;
}
-// \brief Recursively visits the possible right hand operands of a udiv
+// Recursively visits the possible right hand operands of a udiv
// instruction, seeing through select instructions, to determine if we can
// replace the udiv with something simpler. If we find that an operand is not
// able to simplify the udiv, we abort the entire transformation.
@@ -1098,13 +869,6 @@ static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
return Actions.size();
}
- if (ConstantInt *C = dyn_cast<ConstantInt>(Op1))
- // X udiv C, where C >= signbit
- if (C->getValue().isNegative()) {
- Actions.push_back(UDivFoldAction(foldUDivNegCst, C));
- return Actions.size();
- }
-
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
@@ -1165,40 +929,65 @@ static Instruction *narrowUDivURem(BinaryOperator &I,
}
Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyUDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// Handle the integer div common cases
if (Instruction *Common = commonIDivTransforms(I))
return Common;
- // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
- {
- Value *X;
- const APInt *C1, *C2;
- if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) &&
- match(Op1, m_APInt(C2))) {
- bool Overflow;
- APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
- if (!Overflow) {
- bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
- BinaryOperator *BO = BinaryOperator::CreateUDiv(
- X, ConstantInt::get(X->getType(), C2ShlC1));
- if (IsExact)
- BO->setIsExact();
- return BO;
- }
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X;
+ const APInt *C1, *C2;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) {
+ // (X lshr C1) udiv C2 --> X udiv (C2 << C1)
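+ // e.g. (X lshr 2) udiv 3 --> X udiv 12, as long as C2 << C1 does not
+ // overflow the type.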
+ bool Overflow;
+ APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
+ if (!Overflow) {
+ bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
+ BinaryOperator *BO = BinaryOperator::CreateUDiv(
+ X, ConstantInt::get(X->getType(), C2ShlC1));
+ if (IsExact)
+ BO->setIsExact();
+ return BO;
}
}
+ // Op0 / C where C is large (negative) --> zext (Op0 >= C)
+ // TODO: Could use isKnownNegative() to handle non-constant values.
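+ // e.g. for i8: Op0 udiv 200 --> zext (Op0 uge 200), because the quotient
+ // can only be 0 or 1 when the divisor exceeds half of the unsigned range.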
+ Type *Ty = I.getType();
+ if (match(Op1, m_Negative())) {
+ Value *Cmp = Builder.CreateICmpUGE(Op0, Op1);
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+ // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined)
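+ // Rationale: sext i1 true is all-ones (the unsigned maximum), so the
+ // quotient is 1 exactly when Op0 is also all-ones.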
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+
if (Instruction *NarrowDiv = narrowUDivURem(I, Builder))
return NarrowDiv;
+ // If the udiv operands are non-overflowing multiplies with a common operand,
+ // then eliminate the common factor:
+ // (A * B) / (A * X) --> B / X (and commuted variants)
+ // TODO: The code would be reduced if we had m_c_NUWMul pattern matching.
+ // TODO: If -reassociation handled this generally, we could remove this.
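+ // e.g. (A *nuw 6) /u (A *nuw 3) --> 6 /u 3, which then folds to 2.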
+ Value *A, *B;
+ if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) {
+ if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(A))))
+ return BinaryOperator::CreateUDiv(B, X);
+ if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateUDiv(A, X);
+ }
+
// (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
SmallVector<UDivFoldAction, 6> UDivActions;
if (visitUDivOperand(Op0, Op1, I, UDivActions))
@@ -1234,24 +1023,27 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
}
Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// Handle the integer div common cases
if (Instruction *Common = commonIDivTransforms(I))
return Common;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X;
+ // sdiv Op0, -1 --> -Op0
+ // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined)
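+ // e.g. sdiv 7, -1 --> -7; sext i1 true is -1, so the same negation applies.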
+ if (match(Op1, m_AllOnes()) ||
+ (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
+ return BinaryOperator::CreateNeg(Op0);
+
const APInt *Op1C;
if (match(Op1, m_APInt(Op1C))) {
- // sdiv X, -1 == -X
- if (Op1C->isAllOnesValue())
- return BinaryOperator::CreateNeg(Op0);
-
// sdiv exact X, C --> ashr exact X, log2(C)
if (I.isExact() && Op1C->isNonNegative() && Op1C->isPowerOf2()) {
Value *ShAmt = ConstantInt::get(Op1->getType(), Op1C->exactLogBase2());
@@ -1315,166 +1107,148 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
return nullptr;
}
-/// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C not a special
-/// FP value and:
-/// 1) 1/C is exact, or
-/// 2) reciprocal is allowed.
-/// If the conversion was successful, the simplified expression "X * 1/C" is
-/// returned; otherwise, nullptr is returned.
-static Instruction *CvtFDivConstToReciprocal(Value *Dividend, Constant *Divisor,
- bool AllowReciprocal) {
- if (!isa<ConstantFP>(Divisor)) // TODO: handle vectors.
+/// Remove negation and try to convert division into multiplication.
+static Instruction *foldFDivConstantDivisor(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(1), m_Constant(C)))
return nullptr;
- const APFloat &FpVal = cast<ConstantFP>(Divisor)->getValueAPF();
- APFloat Reciprocal(FpVal.getSemantics());
- bool Cvt = FpVal.getExactInverse(&Reciprocal);
+ // -X / C --> X / -C
+ Value *X;
+ if (match(I.getOperand(0), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
- if (!Cvt && AllowReciprocal && FpVal.isFiniteNonZero()) {
- Reciprocal = APFloat(FpVal.getSemantics(), 1.0f);
- (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven);
- Cvt = !Reciprocal.isDenormal();
- }
+ // If the constant divisor has an exact inverse, this is always safe. If not,
+ // then we can still create a reciprocal if fast-math-flags allow it and the
+ // constant is a regular number (not zero, infinite, or denormal).
+ if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP())))
+ return nullptr;
- if (!Cvt)
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C);
+ if (!RecipC->isNormalFP())
return nullptr;
- ConstantFP *R;
- R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal);
- return BinaryOperator::CreateFMul(Dividend, R);
+ // X / C --> X * (1 / C)
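+ // e.g. X / 4.0 --> X * 0.25 (exact inverse); X / 3.0 is rewritten only
+ // under the allow-reciprocal flag because 1.0/3.0 is inexact.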
+ return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I);
}
-Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+/// Remove negation and try to reassociate constant math.
+static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(0), m_Constant(C)))
+ return nullptr;
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
+ // C / -X --> -C / X
+ Value *X;
+ if (match(I.getOperand(1), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
+ return nullptr;
+
+ // Try to reassociate C / X expressions where X includes another constant.
+ Constant *C2, *NewC = nullptr;
+ if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) {
+ // C / (X * C2) --> (C / C2) / X
+ NewC = ConstantExpr::getFDiv(C, C2);
+ } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) {
+ // C / (X / C2) --> (C * C2) / X
+ NewC = ConstantExpr::getFMul(C, C2);
+ }
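+ // e.g. 10.0 / (X * 2.0) --> 5.0 / X, and 10.0 / (X / 2.0) --> 20.0 / X.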
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ if (!NewC || !NewC->isNormalFP())
+ return nullptr;
+
+ return BinaryOperator::CreateFDivFMF(NewC, X, &I);
+}
- if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(),
+Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
+ if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
+ if (Instruction *R = foldFDivConstantDivisor(I))
+ return R;
+
+ if (Instruction *R = foldFDivConstantDividend(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (isa<Constant>(Op0))
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- bool AllowReassociate = I.isFast();
- bool AllowReciprocal = I.hasAllowReciprocal();
-
- if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ if (isa<Constant>(Op1))
if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- if (AllowReassociate) {
- Constant *C1 = nullptr;
- Constant *C2 = Op1C;
- Value *X;
- Instruction *Res = nullptr;
-
- if (match(Op0, m_FMul(m_Value(X), m_Constant(C1)))) {
- // (X*C1)/C2 => X * (C1/C2)
- //
- Constant *C = ConstantExpr::getFDiv(C1, C2);
- if (isNormalFp(C))
- Res = BinaryOperator::CreateFMul(X, C);
- } else if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
- // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed]
- Constant *C = ConstantExpr::getFMul(C1, C2);
- if (isNormalFp(C)) {
- Res = CvtFDivConstToReciprocal(X, C, AllowReciprocal);
- if (!Res)
- Res = BinaryOperator::CreateFDiv(X, C);
- }
- }
-
- if (Res) {
- Res->setFastMathFlags(I.getFastMathFlags());
- return Res;
- }
+ if (I.hasAllowReassoc() && I.hasAllowReciprocal()) {
+ Value *X, *Y;
+ if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op1))) {
+ // (X / Y) / Z => X / (Y * Z)
+ Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFDivFMF(X, YZ, &I);
}
-
- // X / C => X * 1/C
- if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal)) {
- T->copyFastMathFlags(&I);
- return T;
+ if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op0))) {
+ // Z / (X / Y) => (Y * Z) / X
+ Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I);
+ return BinaryOperator::CreateFDivFMF(YZ, X, &I);
}
-
- return nullptr;
}
- if (AllowReassociate && isa<Constant>(Op0)) {
- Constant *C1 = cast<Constant>(Op0), *C2;
- Constant *Fold = nullptr;
+ if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) {
+ // sin(X) / cos(X) -> tan(X)
+ // cos(X) / sin(X) -> 1/tan(X) (cotangent)
Value *X;
- bool CreateDiv = true;
-
- // C1 / (X*C2) => (C1/C2) / X
- if (match(Op1, m_FMul(m_Value(X), m_Constant(C2))))
- Fold = ConstantExpr::getFDiv(C1, C2);
- else if (match(Op1, m_FDiv(m_Value(X), m_Constant(C2)))) {
- // C1 / (X/C2) => (C1*C2) / X
- Fold = ConstantExpr::getFMul(C1, C2);
- } else if (match(Op1, m_FDiv(m_Constant(C2), m_Value(X)))) {
- // C1 / (C2/X) => (C1/C2) * X
- Fold = ConstantExpr::getFDiv(C1, C2);
- CreateDiv = false;
- }
-
- if (Fold && isNormalFp(Fold)) {
- Instruction *R = CreateDiv ? BinaryOperator::CreateFDiv(Fold, X)
- : BinaryOperator::CreateFMul(X, Fold);
- R->setFastMathFlags(I.getFastMathFlags());
- return R;
+ bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X)));
+ bool IsCot =
+ !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));
+
+ if ((IsTan || IsCot) && hasUnaryFloatFn(&TLI, I.getType(), LibFunc_tan,
+ LibFunc_tanf, LibFunc_tanl)) {
+ IRBuilder<> B(&I);
+ IRBuilder<>::FastMathFlagGuard FMFGuard(B);
+ B.setFastMathFlags(I.getFastMathFlags());
+ AttributeList Attrs = CallSite(Op0).getCalledFunction()->getAttributes();
+ Value *Res = emitUnaryFloatFnCall(X, TLI.getName(LibFunc_tan), B, Attrs);
+ if (IsCot)
+ Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
+ return replaceInstUsesWith(I, Res);
}
- return nullptr;
}
- if (AllowReassociate) {
- Value *X, *Y;
- Value *NewInst = nullptr;
- Instruction *SimpR = nullptr;
-
- if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) {
- // (X/Y) / Z => X / (Y*Z)
- if (!isa<Constant>(Y) || !isa<Constant>(Op1)) {
- NewInst = Builder.CreateFMul(Y, Op1);
- if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
- FastMathFlags Flags = I.getFastMathFlags();
- Flags &= cast<Instruction>(Op0)->getFastMathFlags();
- RI->setFastMathFlags(Flags);
- }
- SimpR = BinaryOperator::CreateFDiv(X, NewInst);
- }
- } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) {
- // Z / (X/Y) => Z*Y / X
- if (!isa<Constant>(Y) || !isa<Constant>(Op0)) {
- NewInst = Builder.CreateFMul(Op0, Y);
- if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
- FastMathFlags Flags = I.getFastMathFlags();
- Flags &= cast<Instruction>(Op1)->getFastMathFlags();
- RI->setFastMathFlags(Flags);
- }
- SimpR = BinaryOperator::CreateFDiv(NewInst, X);
- }
- }
-
- if (NewInst) {
- if (Instruction *T = dyn_cast<Instruction>(NewInst))
- T->setDebugLoc(I.getDebugLoc());
- SimpR->setFastMathFlags(I.getFastMathFlags());
- return SimpR;
- }
+ // -X / -Y -> X / Y
+ Value *X, *Y;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y)))) {
+ I.setOperand(0, X);
+ I.setOperand(1, Y);
+ return &I;
}
- Value *LHS;
- Value *RHS;
-
- // -x / -y -> x / y
- if (match(Op0, m_FNeg(m_Value(LHS))) && match(Op1, m_FNeg(m_Value(RHS)))) {
- I.setOperand(0, LHS);
- I.setOperand(1, RHS);
+ // X / (X * Y) --> 1.0 / Y
+ // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
+ // We can ignore the possibility that X is infinity because INF/INF is NaN.
+ if (I.hasNoNaNs() && I.hasAllowReassoc() &&
+ match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
+ I.setOperand(0, ConstantFP::get(I.getType(), 1.0));
+ I.setOperand(1, Y);
return &I;
}
@@ -1484,7 +1258,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
/// This function implements the transforms common to both integer remainder
/// instructions (urem and srem). It is called by the visitors to those integer
/// remainder instructions.
-/// @brief Common integer remainder transforms
+/// Common integer remainder transforms
Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -1526,13 +1300,12 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
}
Instruction *InstCombiner::visitURem(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyURemInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *common = commonIRemTransforms(I))
return common;
@@ -1541,47 +1314,55 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
return NarrowRem;
// X urem Y -> X and Y-1, where Y is a power of 2,
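+ // (e.g. X urem 8 --> X and 7)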
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
- Constant *N1 = Constant::getAllOnesValue(I.getType());
+ Constant *N1 = Constant::getAllOnesValue(Ty);
Value *Add = Builder.CreateAdd(Op1, N1);
return BinaryOperator::CreateAnd(Op0, Add);
}
// 1 urem X -> zext(X != 1)
- if (match(Op0, m_One())) {
- Value *Cmp = Builder.CreateICmpNE(Op1, Op0);
- Value *Ext = Builder.CreateZExt(Cmp, I.getType());
- return replaceInstUsesWith(I, Ext);
- }
+ if (match(Op0, m_One()))
+ return CastInst::CreateZExtOrBitCast(Builder.CreateICmpNE(Op1, Op0), Ty);
// X urem C -> X < C ? X : X - C, where C >= signbit.
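+ // e.g. for i8: X urem 200 --> (X u< 200) ? X : X - 200, since the quotient
+ // can only be 0 or 1.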
- const APInt *DivisorC;
- if (match(Op1, m_APInt(DivisorC)) && DivisorC->isNegative()) {
+ if (match(Op1, m_Negative())) {
Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
Value *Sub = Builder.CreateSub(Op0, Op1);
return SelectInst::Create(Cmp, Op0, Sub);
}
+ // If the divisor is a sext of a boolean, then the divisor must be max
+ // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also
+ // max unsigned value. In that case, the remainder is 0:
+ // urem Op0, (sext i1 X) --> (Op0 == -1) ? 0 : Op0
+ Value *X;
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0);
+ }
+
return nullptr;
}
Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySRemInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// Handle the integer rem common cases
if (Instruction *Common = commonIRemTransforms(I))
return Common;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
{
const APInt *Y;
// X % -Y -> X % Y
- if (match(Op1, m_APInt(Y)) && Y->isNegative() && !Y->isMinSignedValue()) {
+ if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue()) {
Worklist.AddValue(I.getOperand(1));
I.setOperand(1, ConstantInt::get(I.getType(), -*Y));
return &I;
@@ -1639,14 +1420,13 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
}
Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(),
+ if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
return nullptr;
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 7ee018dbc49b..e54a1dd05a24 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -15,14 +15,18 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
+static cl::opt<unsigned>
+MaxNumPhis("instcombine-max-num-phis", cl::init(512),
+ cl::desc("Maximum number phis to handle in intptr/ptrint folding"));
+
/// The PHI arguments will be folded into a single operation with a PHI node
/// as input. The debug location of the single operation will be the merged
/// locations of the original PHI node arguments.
@@ -176,8 +180,12 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {
assert(AvailablePtrVals.size() == PN.getNumIncomingValues() &&
"Not enough available ptr typed incoming values");
PHINode *MatchingPtrPHI = nullptr;
+ unsigned NumPhis = 0;
for (auto II = BB->begin(), EI = BasicBlock::iterator(BB->getFirstNonPHI());
- II != EI; II++) {
+ II != EI; II++, NumPhis++) {
+ // FIXME: consider handling this in AggressiveInstCombine
+ if (NumPhis > MaxNumPhis)
+ return nullptr;
PHINode *PtrPHI = dyn_cast<PHINode>(II);
if (!PtrPHI || PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType())
continue;
@@ -1008,10 +1016,9 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// extracted out of it. First, sort the users by their offset and size.
array_pod_sort(PHIUsers.begin(), PHIUsers.end());
- DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
- for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
- dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';
- );
+ LLVM_DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) dbgs()
+ << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';);
// PredValues - This is a temporary used when rewriting PHI nodes. It is
// hoisted out here to avoid construction/destruction thrashing.
@@ -1092,8 +1099,8 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
}
PredValues.clear();
- DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
- << *EltPHI << '\n');
+ LLVM_DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
+ << *EltPHI << '\n');
ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index c790de3505f3..4867808478a3 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -47,93 +47,51 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
-static SelectPatternFlavor
-getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) {
- switch (SPF) {
- default:
- llvm_unreachable("unhandled!");
-
- case SPF_SMIN:
- return SPF_SMAX;
- case SPF_UMIN:
- return SPF_UMAX;
- case SPF_SMAX:
- return SPF_SMIN;
- case SPF_UMAX:
- return SPF_UMIN;
- }
-}
-
-static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF,
- bool Ordered=false) {
- switch (SPF) {
- default:
- llvm_unreachable("unhandled!");
-
- case SPF_SMIN:
- return ICmpInst::ICMP_SLT;
- case SPF_UMIN:
- return ICmpInst::ICMP_ULT;
- case SPF_SMAX:
- return ICmpInst::ICMP_SGT;
- case SPF_UMAX:
- return ICmpInst::ICMP_UGT;
- case SPF_FMINNUM:
- return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT;
- case SPF_FMAXNUM:
- return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT;
- }
-}
-
-static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy &Builder,
- SelectPatternFlavor SPF, Value *A,
- Value *B) {
- CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF);
- assert(CmpInst::isIntPredicate(Pred));
+static Value *createMinMax(InstCombiner::BuilderTy &Builder,
+ SelectPatternFlavor SPF, Value *A, Value *B) {
+ CmpInst::Predicate Pred = getMinMaxPred(SPF);
+ assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate");
return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
}
-/// If one of the constants is zero (we know they can't both be) and we have an
-/// icmp instruction with zero, and we have an 'and' with the non-constant value
-/// and a power of two we can turn the select into a shift on the result of the
-/// 'and'.
/// This folds:
-/// select (icmp eq (and X, C1)), C2, C3
-/// iff C1 is a power 2 and the difference between C2 and C3 is a power of 2.
+/// select (icmp eq (and X, C1)), TC, FC
+/// iff C1 is a power of 2 and the difference between TC and FC is a power of 2.
/// To something like:
-/// (shr (and (X, C1)), (log2(C1) - log2(C2-C3))) + C3
+/// (shr (and (X, C1)), (log2(C1) - log2(TC-FC))) + FC
/// Or:
-/// (shl (and (X, C1)), (log2(C2-C3) - log2(C1))) + C3
-/// With some variations depending if C3 is larger than C2, or the shift
+/// (shl (and (X, C1)), (log2(TC-FC) - log2(C1))) + FC
+/// With some variations depending on whether FC is larger than TC, or the shift
/// isn't needed, or the bit widths don't match.
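+/// Example: select (icmp eq (and X, 4), 0), 0, 8 --> shl (and X, 4), 1.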
-static Value *foldSelectICmpAnd(Type *SelType, const ICmpInst *IC,
- APInt TrueVal, APInt FalseVal,
+static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
InstCombiner::BuilderTy &Builder) {
- assert(SelType->isIntOrIntVectorTy() && "Not an integer select?");
+ const APInt *SelTC, *SelFC;
+ if (!match(Sel.getTrueValue(), m_APInt(SelTC)) ||
+ !match(Sel.getFalseValue(), m_APInt(SelFC)))
+ return nullptr;
// If this is a vector select, we need a vector compare.
- if (SelType->isVectorTy() != IC->getType()->isVectorTy())
+ Type *SelType = Sel.getType();
+ if (SelType->isVectorTy() != Cmp->getType()->isVectorTy())
return nullptr;
Value *V;
APInt AndMask;
bool CreateAnd = false;
- ICmpInst::Predicate Pred = IC->getPredicate();
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
if (ICmpInst::isEquality(Pred)) {
- if (!match(IC->getOperand(1), m_Zero()))
+ if (!match(Cmp->getOperand(1), m_Zero()))
return nullptr;
- V = IC->getOperand(0);
-
+ V = Cmp->getOperand(0);
const APInt *AndRHS;
if (!match(V, m_And(m_Value(), m_Power2(AndRHS))))
return nullptr;
AndMask = *AndRHS;
- } else if (decomposeBitTestICmp(IC->getOperand(0), IC->getOperand(1),
+ } else if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
Pred, V, AndMask)) {
assert(ICmpInst::isEquality(Pred) && "Not equality test?");
-
if (!AndMask.isPowerOf2())
return nullptr;
@@ -142,39 +100,58 @@ static Value *foldSelectICmpAnd(Type *SelType, const ICmpInst *IC,
return nullptr;
}
- // If both select arms are non-zero see if we have a select of the form
- // 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic
- // for 'x ? 2^n : 0' and fix the thing up at the end.
- APInt Offset(TrueVal.getBitWidth(), 0);
- if (!TrueVal.isNullValue() && !FalseVal.isNullValue()) {
- if ((TrueVal - FalseVal).isPowerOf2())
- Offset = FalseVal;
- else if ((FalseVal - TrueVal).isPowerOf2())
- Offset = TrueVal;
- else
+ // In general, when both constants are non-zero, we would need an offset to
+ // replace the select. This would require more instructions than we started
+ // with. But there's one special case that we handle here because it can
+ // simplify/reduce the instructions.
+ APInt TC = *SelTC;
+ APInt FC = *SelFC;
+ if (!TC.isNullValue() && !FC.isNullValue()) {
+ // If the select constants differ by exactly one bit and that's the same
+ // bit that is masked and checked by the select condition, the select can
+ // be replaced by bitwise logic to set/clear one bit of the constant result.
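+ // e.g. (X & 8) == 0 ? 3 : 11 --> (X & 8) | 3, because the constants differ
+ // only in the tested bit (11 ^ 3 == 8).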
+ if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask)
return nullptr;
-
- // Adjust TrueVal and FalseVal to the offset.
- TrueVal -= Offset;
- FalseVal -= Offset;
+ if (CreateAnd) {
+ // If we have to create an 'and', then we must kill the cmp to not
+ // increase the instruction count.
+ if (!Cmp->hasOneUse())
+ return nullptr;
+ V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask));
+ }
+ bool ExtraBitInTC = TC.ugt(FC);
+ if (Pred == ICmpInst::ICMP_EQ) {
+ // If the masked bit in V is clear, clear or set the bit in the result:
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC
+ Constant *C = ConstantInt::get(SelType, TC);
+ return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C);
+ }
+ if (Pred == ICmpInst::ICMP_NE) {
+ // If the masked bit in V is set, set or clear the bit in the result:
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC
+ Constant *C = ConstantInt::get(SelType, FC);
+ return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C);
+ }
+ llvm_unreachable("Only expecting equality predicates");
}
- // Make sure one of the select arms is a power of 2.
- if (!TrueVal.isPowerOf2() && !FalseVal.isPowerOf2())
+ // Make sure one of the select arms is a power-of-2.
+ if (!TC.isPowerOf2() && !FC.isPowerOf2())
return nullptr;
// Determine which shift is needed to transform the result of the 'and' into
// the desired result.
- const APInt &ValC = !TrueVal.isNullValue() ? TrueVal : FalseVal;
+ const APInt &ValC = !TC.isNullValue() ? TC : FC;
unsigned ValZeros = ValC.logBase2();
unsigned AndZeros = AndMask.logBase2();
- if (CreateAnd) {
- // Insert the AND instruction on the input to the truncate.
+ // Insert the 'and' instruction on the input to the truncate.
+ if (CreateAnd)
V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), AndMask));
- }
- // If types don't match we can still convert the select by introducing a zext
+ // If types don't match, we can still convert the select by introducing a zext
// or a trunc of the 'and'.
if (ValZeros > AndZeros) {
V = Builder.CreateZExtOrTrunc(V, SelType);
@@ -182,19 +159,17 @@ static Value *foldSelectICmpAnd(Type *SelType, const ICmpInst *IC,
} else if (ValZeros < AndZeros) {
V = Builder.CreateLShr(V, AndZeros - ValZeros);
V = Builder.CreateZExtOrTrunc(V, SelType);
- } else
+ } else {
V = Builder.CreateZExtOrTrunc(V, SelType);
+ }
// Okay, now we know that everything is set up; we just don't know whether we
// have an icmp_ne or icmp_eq and whether the true or false val is the zero.
- bool ShouldNotVal = !TrueVal.isNullValue();
+ bool ShouldNotVal = !TC.isNullValue();
ShouldNotVal ^= Pred == ICmpInst::ICMP_NE;
if (ShouldNotVal)
V = Builder.CreateXor(V, ValC);
- // Apply an offset if needed.
- if (!Offset.isNullValue())
- V = Builder.CreateAdd(V, ConstantInt::get(V->getType(), Offset));
return V;
}
@@ -300,12 +275,13 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
TI->getType());
}
- // Only handle binary operators with one-use here. As with the cast case
- // above, it may be possible to relax the one-use constraint, but that needs
- // be examined carefully since it may not reduce the total number of
- // instructions.
- BinaryOperator *BO = dyn_cast<BinaryOperator>(TI);
- if (!BO || !TI->hasOneUse() || !FI->hasOneUse())
+ // Only handle binary operators (including two-operand getelementptr) with
+ // one-use here. As with the cast case above, it may be possible to relax the
+ // one-use constraint, but that needs be examined carefully since it may not
+ // reduce the total number of instructions.
+ if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 ||
+ (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) ||
+ !TI->hasOneUse() || !FI->hasOneUse())
return nullptr;
// Figure out if the operations have any operands in common.
@@ -342,7 +318,18 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
SI.getName() + ".v", &SI);
Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
- return BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
+ if (auto *BO = dyn_cast<BinaryOperator>(TI)) {
+ return BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
+ }
+ if (auto *TGEP = dyn_cast<GetElementPtrInst>(TI)) {
+ auto *FGEP = cast<GetElementPtrInst>(FI);
+ Type *ElementType = TGEP->getResultElementType();
+ return TGEP->isInBounds() && FGEP->isInBounds()
+ ? GetElementPtrInst::CreateInBounds(ElementType, Op0, {Op1})
+ : GetElementPtrInst::Create(ElementType, Op0, {Op1});
+ }
+ llvm_unreachable("Expected BinaryOperator or GEP");
+ return nullptr;
}
static bool isSelect01(const APInt &C1I, const APInt &C2I) {
@@ -424,6 +411,47 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
}
/// We want to turn:
+/// (select (icmp eq (and X, Y), 0), (and (lshr X, Z), 1), 1)
+/// into:
+/// zext (icmp ne i32 (and X, (or Y, (shl 1, Z))), 0)
+/// Note:
+/// Z may be 0 if lshr is missing.
+/// Worst-case scenario is that we will replace 5 instructions with 5 different
+/// instructions, but we get rid of the select.
+static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp,
+ Value *TVal, Value *FVal,
+ InstCombiner::BuilderTy &Builder) {
+ if (!(Cmp->hasOneUse() && Cmp->getOperand(0)->hasOneUse() &&
+ Cmp->getPredicate() == ICmpInst::ICMP_EQ &&
+ match(Cmp->getOperand(1), m_Zero()) && match(FVal, m_One())))
+ return nullptr;
+
+ // The TrueVal has the general form: and %B, 1
+ Value *B;
+ if (!match(TVal, m_OneUse(m_And(m_Value(B), m_One()))))
+ return nullptr;
+
+ // Where %B may be optionally shifted: lshr %X, %Z.
+ Value *X, *Z;
+ const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z))));
+ if (!HasShift)
+ X = B;
+
+ Value *Y;
+ if (!match(Cmp->getOperand(0), m_c_And(m_Specific(X), m_Value(Y))))
+ return nullptr;
+
+ // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 --> (X & (Y | (1 << Z))) != 0
+ // ((X & Y) == 0) ? (X & 1) : 1 --> (X & (Y | 1)) != 0
+ Constant *One = ConstantInt::get(SelType, 1);
+ Value *MaskB = HasShift ? Builder.CreateShl(One, Z) : One;
+ Value *FullMask = Builder.CreateOr(Y, MaskB);
+ Value *MaskedX = Builder.CreateAnd(X, FullMask);
+ Value *ICmpNeZero = Builder.CreateIsNotNull(MaskedX);
+ return new ZExtInst(ICmpNeZero, SelType);
+}
+
+/// We want to turn:
/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
/// into:
/// (or (shl (and X, C1), C3), Y)
@@ -526,6 +554,59 @@ static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal,
return Builder.CreateOr(V, Y);
}
+/// Transform patterns such as: (a > b) ? a - b : 0
+/// into: ((a > b) ? a : b) - b
+/// This produces a canonical max pattern that is more easily recognized by the
+/// backend and converted into saturated subtraction instructions if those
+/// exist.
+/// There are 8 commuted/swapped variants of this pattern.
+/// TODO: Also support a - UMIN(a,b) patterns.
+static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI,
+ const Value *TrueVal,
+ const Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (!ICmpInst::isUnsigned(Pred))
+ return nullptr;
+
+ // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0
+ if (match(TrueVal, m_Zero())) {
+ Pred = ICmpInst::getInversePredicate(Pred);
+ std::swap(TrueVal, FalseVal);
+ }
+ if (!match(FalseVal, m_Zero()))
+ return nullptr;
+
+ Value *A = ICI->getOperand(0);
+ Value *B = ICI->getOperand(1);
+ if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) {
+ // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0
+ std::swap(A, B);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) &&
+ "Unexpected isUnsigned predicate!");
+
+ // Account for swapped form of subtraction: ((a > b) ? b - a : 0).
+ bool IsNegative = false;
+ if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))))
+ IsNegative = true;
+ else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))))
+ return nullptr;
+
+ // If sub is used anywhere else, we wouldn't be able to eliminate it
+ // afterwards.
+ if (!TrueVal->hasOneUse())
+ return nullptr;
+
+ // All checks passed, convert to canonical unsigned saturated subtraction
+ // form: sub(max()).
+ // (a > b) ? a - b : 0 -> ((a > b) ? a : b) - b
+ Value *Max = Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
+ return IsNegative ? Builder.CreateSub(B, Max) : Builder.CreateSub(Max, B);
+}
+
/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
///
@@ -687,23 +768,18 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
// Canonicalize the compare predicate based on whether we have min or max.
Value *LHS, *RHS;
- ICmpInst::Predicate NewPred;
SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
- switch (SPR.Flavor) {
- case SPF_SMIN: NewPred = ICmpInst::ICMP_SLT; break;
- case SPF_UMIN: NewPred = ICmpInst::ICMP_ULT; break;
- case SPF_SMAX: NewPred = ICmpInst::ICMP_SGT; break;
- case SPF_UMAX: NewPred = ICmpInst::ICMP_UGT; break;
- default: return nullptr;
- }
+ if (!SelectPatternResult::isMinOrMax(SPR.Flavor))
+ return nullptr;
// Is this already canonical?
+ ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor);
if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
- Cmp.getPredicate() == NewPred)
+ Cmp.getPredicate() == CanonicalPred)
return nullptr;
// Create the canonical compare and plug it into the select.
- Sel.setCondition(Builder.CreateICmp(NewPred, LHS, RHS));
+ Sel.setCondition(Builder.CreateICmp(CanonicalPred, LHS, RHS));
// If the select operands did not change, we're done.
if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
@@ -718,6 +794,89 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
return &Sel;
}
+/// There are many select variants for each of ABS/NABS.
+/// In matchSelectPattern(), there are different compare constants, compare
+/// predicates/operands and select operands.
+/// In isKnownNegation(), there are different formats of negated operands.
+/// Canonicalize all these variants to 1 pattern.
+/// This makes CSE more likely.
+static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
+ return nullptr;
+
+ // Choose a sign-bit check for the compare (likely simpler for codegen).
+ // ABS: (X <s 0) ? -X : X
+ // NABS: (X <s 0) ? X : -X
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
+ if (SPF != SelectPatternFlavor::SPF_ABS &&
+ SPF != SelectPatternFlavor::SPF_NABS)
+ return nullptr;
+
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ assert(isKnownNegation(TVal, FVal) &&
+ "Unexpected result from matchSelectPattern");
+
+ // The compare may use the negated abs()/nabs() operand, or it may use
+ // negation in non-canonical form such as: sub A, B.
+ bool CmpUsesNegatedOp = match(Cmp.getOperand(0), m_Neg(m_Specific(TVal))) ||
+ match(Cmp.getOperand(0), m_Neg(m_Specific(FVal)));
+
+ bool CmpCanonicalized = !CmpUsesNegatedOp &&
+ match(Cmp.getOperand(1), m_ZeroInt()) &&
+ Cmp.getPredicate() == ICmpInst::ICMP_SLT;
+ bool RHSCanonicalized = match(RHS, m_Neg(m_Specific(LHS)));
+
+ // Is this already canonical?
+ if (CmpCanonicalized && RHSCanonicalized)
+ return nullptr;
+
+ // If RHS is used by instructions other than the compare and select, don't
+ // canonicalize it, to avoid increasing the instruction count.
+ if (!(RHS->hasOneUse() || (RHS->hasNUses(2) && CmpUsesNegatedOp)))
+ return nullptr;
+
+ // Create the canonical compare: icmp slt LHS 0.
+ if (!CmpCanonicalized) {
+ Cmp.setPredicate(ICmpInst::ICMP_SLT);
+ Cmp.setOperand(1, ConstantInt::getNullValue(Cmp.getOperand(0)->getType()));
+ if (CmpUsesNegatedOp)
+ Cmp.setOperand(0, LHS);
+ }
+
+ // Create the canonical RHS: RHS = sub (0, LHS).
+ if (!RHSCanonicalized) {
+ assert(RHS->hasOneUse() && "RHS use number is not right");
+ RHS = Builder.CreateNeg(LHS);
+ if (TVal == LHS) {
+ Sel.setFalseValue(RHS);
+ FVal = RHS;
+ } else {
+ Sel.setTrueValue(RHS);
+ TVal = RHS;
+ }
+ }
+
+ // If the select operands do not change, we're done.
+ if (SPF == SelectPatternFlavor::SPF_NABS) {
+ if (TVal == LHS)
+ return &Sel;
+ assert(FVal == LHS && "Unexpected results from matchSelectPattern");
+ } else {
+ if (FVal == LHS)
+ return &Sel;
+ assert(TVal == LHS && "Unexpected results from matchSelectPattern");
+ }
+
+ // We are swapping the select operands, so swap the metadata too.
+ Sel.setTrueValue(FVal);
+ Sel.setFalseValue(TVal);
+ Sel.swapProfMetadata();
+ return &Sel;
+}
+
/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
ICmpInst *ICI) {
@@ -727,59 +886,18 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder))
return NewSel;
+ if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, Builder))
+ return NewAbs;
+
bool Changed = adjustMinMax(SI, *ICI);
+ if (Value *V = foldSelectICmpAnd(SI, ICI, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ // NOTE: if we wanted to, this is where to detect integer MIN/MAX
ICmpInst::Predicate Pred = ICI->getPredicate();
Value *CmpLHS = ICI->getOperand(0);
Value *CmpRHS = ICI->getOperand(1);
-
- // Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1
- // and (X <s 0) ? C2 : C1 --> ((X >>s 31) & (C2 - C1)) + C1
- // FIXME: Type and constness constraints could be lifted, but we have to
- // watch code size carefully. We should consider xor instead of
- // sub/add when we decide to do that.
- // TODO: Merge this with foldSelectICmpAnd somehow.
- if (CmpLHS->getType()->isIntOrIntVectorTy() &&
- CmpLHS->getType() == TrueVal->getType()) {
- const APInt *C1, *C2;
- if (match(TrueVal, m_APInt(C1)) && match(FalseVal, m_APInt(C2))) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *X;
- APInt Mask;
- if (decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, X, Mask, false)) {
- if (Mask.isSignMask()) {
- assert(X == CmpLHS && "Expected to use the compare input directly");
- assert(ICmpInst::isEquality(Pred) && "Expected equality predicate");
-
- if (Pred == ICmpInst::ICMP_NE)
- std::swap(C1, C2);
-
- // This shift results in either -1 or 0.
- Value *AShr = Builder.CreateAShr(X, Mask.getBitWidth() - 1);
-
- // Check if we can express the operation with a single or.
- if (C2->isAllOnesValue())
- return replaceInstUsesWith(SI, Builder.CreateOr(AShr, *C1));
-
- Value *And = Builder.CreateAnd(AShr, *C2 - *C1);
- return replaceInstUsesWith(SI, Builder.CreateAdd(And,
- ConstantInt::get(And->getType(), *C1)));
- }
- }
- }
- }
-
- {
- const APInt *TrueValC, *FalseValC;
- if (match(TrueVal, m_APInt(TrueValC)) &&
- match(FalseVal, m_APInt(FalseValC)))
- if (Value *V = foldSelectICmpAnd(SI.getType(), ICI, *TrueValC,
- *FalseValC, Builder))
- return replaceInstUsesWith(SI, V);
- }
-
- // NOTE: if we wanted to, this is where to detect integer MIN/MAX
-
if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
// Transform (X == C) ? X : Y -> (X == C) ? C : Y
@@ -842,16 +960,22 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
}
}
+ if (Instruction *V =
+ foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
+ return V;
+
if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder))
return replaceInstUsesWith(SI, V);
if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
return replaceInstUsesWith(SI, V);
+ if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
return Changed ? &SI : nullptr;
}
-
/// SI is a select whose condition is a PHI node (but the two may be in
/// different blocks). See if the true/false values (V) are live in all of the
/// predecessor blocks of the PHI. For example, cases like this can't be mapped:
@@ -900,7 +1024,7 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
if (C == A || C == B) {
// MAX(MAX(A, B), B) -> MAX(A, B)
// MIN(MIN(a, b), a) -> MIN(a, b)
- if (SPF1 == SPF2)
+ if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1))
return replaceInstUsesWith(Outer, Inner);
// MAX(MIN(a, b), a) -> a
@@ -992,10 +1116,10 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
if (!NotC)
NotC = Builder.CreateNot(C);
- Value *NewInner = generateMinMaxSelectPattern(
- Builder, getInverseMinMaxSelectPattern(SPF1), NotA, NotB);
- Value *NewOuter = Builder.CreateNot(generateMinMaxSelectPattern(
- Builder, getInverseMinMaxSelectPattern(SPF2), NewInner, NotC));
+ Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA,
+ NotB);
+ Value *NewOuter = Builder.CreateNot(
+ createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC));
return replaceInstUsesWith(Outer, NewOuter);
}
@@ -1075,6 +1199,11 @@ static Instruction *foldAddSubSelect(SelectInst &SI,
}
Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
+ Constant *C;
+ if (!match(Sel.getTrueValue(), m_Constant(C)) &&
+ !match(Sel.getFalseValue(), m_Constant(C)))
+ return nullptr;
+
Instruction *ExtInst;
if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
!match(Sel.getFalseValue(), m_Instruction(ExtInst)))
@@ -1084,20 +1213,18 @@ Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
return nullptr;
- // TODO: Handle larger types? That requires adjusting FoldOpIntoSelect too.
+ // If we are extending from a boolean type or if we can create a select that
+ // has the same size operands as its condition, try to narrow the select.
Value *X = ExtInst->getOperand(0);
Type *SmallType = X->getType();
- if (!SmallType->isIntOrIntVectorTy(1))
- return nullptr;
-
- Constant *C;
- if (!match(Sel.getTrueValue(), m_Constant(C)) &&
- !match(Sel.getFalseValue(), m_Constant(C)))
+ Value *Cond = Sel.getCondition();
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!SmallType->isIntOrIntVectorTy(1) &&
+ (!Cmp || Cmp->getOperand(0)->getType() != SmallType))
return nullptr;
// If the constant is the same after truncation to the smaller type and
// extension to the original type, we can narrow the select.
- Value *Cond = Sel.getCondition();
Type *SelType = Sel.getType();
Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
@@ -1289,6 +1416,63 @@ static Instruction *foldSelectCmpXchg(SelectInst &SI) {
return nullptr;
}
+/// Reduce a sequence of min/max with a common operand.
+static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
+ Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max");
+ // TODO: Allow FP min/max with nnan/nsz.
+ if (!LHS->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
+ Value *A, *B, *C, *D;
+ SelectPatternResult L = matchSelectPattern(LHS, A, B);
+ SelectPatternResult R = matchSelectPattern(RHS, C, D);
+ if (SPF != L.Flavor || L.Flavor != R.Flavor)
+ return nullptr;
+
+ // Look for a common operand. The use checks are different than usual because
+ // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by
+ // the select.
+ Value *MinMaxOp = nullptr;
+ Value *ThirdOp = nullptr;
+ if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) {
+ // If the LHS is only used in this chain and the RHS is used outside of it,
+ // reuse the RHS min/max because that will eliminate the LHS.
+ if (D == A || C == A) {
+ // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+ // min(min(a, b), min(a, d)) --> min(min(a, d), b)
+ MinMaxOp = RHS;
+ ThirdOp = B;
+ } else if (D == B || C == B) {
+ // min(min(a, b), min(c, b)) --> min(min(c, b), a)
+ // min(min(a, b), min(b, d)) --> min(min(b, d), a)
+ MinMaxOp = RHS;
+ ThirdOp = A;
+ }
+ } else if (!RHS->hasNUsesOrMore(3)) {
+ // Reuse the LHS. This will eliminate the RHS.
+ if (D == A || D == B) {
+ // min(min(a, b), min(c, a)) --> min(min(a, b), c)
+ // min(min(a, b), min(c, b)) --> min(min(a, b), c)
+ MinMaxOp = LHS;
+ ThirdOp = C;
+ } else if (C == A || C == B) {
+ // min(min(a, b), min(b, d)) --> min(min(a, b), d)
+ // min(min(a, b), min(a, d)) --> min(min(a, b), d)
+ MinMaxOp = LHS;
+ ThirdOp = D;
+ }
+ }
+ if (!MinMaxOp || !ThirdOp)
+ return nullptr;
+
+ CmpInst::Predicate P = getMinMaxPred(SPF);
+ Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp);
+ return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
+}
+
Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *CondVal = SI.getCondition();
Value *TrueVal = SI.getTrueValue();
@@ -1489,7 +1673,37 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// NOTE: if we wanted to, this is where to detect MIN/MAX
}
- // NOTE: if we wanted to, this is where to detect ABS
+
+ // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
+ // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
+ // also require nnan because we do not want to unintentionally change the
+ // sign of a NaN value.
+ Value *X = FCI->getOperand(0);
+ FCmpInst::Predicate Pred = FCI->getPredicate();
+ if (match(FCI->getOperand(1), m_AnyZeroFP()) && FCI->hasNoNaNs()) {
+ // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
+ // (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
+ if ((X == FalseVal && Pred == FCmpInst::FCMP_OLE &&
+ match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(X)))) ||
+ (X == TrueVal && Pred == FCmpInst::FCMP_OGT &&
+ match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(X))))) {
+ Value *Fabs = Builder.CreateIntrinsic(Intrinsic::fabs, { X }, FCI);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // With nsz:
+ // (X < +/-0.0) ? -X : X --> fabs(X)
+ // (X <= +/-0.0) ? -X : X --> fabs(X)
+ // (X > +/-0.0) ? X : -X --> fabs(X)
+ // (X >= +/-0.0) ? X : -X --> fabs(X)
+ if (FCI->hasNoSignedZeros() &&
+ ((X == FalseVal && match(TrueVal, m_FNeg(m_Specific(X))) &&
+ (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE)) ||
+ (X == TrueVal && match(FalseVal, m_FNeg(m_Specific(X))) &&
+ (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE)))) {
+ Value *Fabs = Builder.CreateIntrinsic(Intrinsic::fabs, { X }, FCI);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ }
}
// See if we are selecting two values based on a comparison of the two values.
@@ -1532,7 +1746,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
(LHS->getType()->isFPOrFPVectorTy() &&
((CmpLHS != LHS && CmpLHS != RHS) ||
(CmpRHS != LHS && CmpRHS != RHS)))) {
- CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered);
+ CmpInst::Predicate Pred = getMinMaxPred(SPF, SPR.Ordered);
Value *Cmp;
if (CmpInst::isIntPredicate(Pred)) {
@@ -1551,6 +1765,20 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType);
return replaceInstUsesWith(SI, NewCast);
}
+
+ // MAX(~a, ~b) -> ~MIN(a, b)
+ // MIN(~a, ~b) -> ~MAX(a, b)
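+ // (min/max analogue of De Morgan's laws: e.g. smax(~a, ~b) == ~smin(a, b))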
+ Value *A, *B;
+ if (match(LHS, m_Not(m_Value(A))) && match(RHS, m_Not(m_Value(B))) &&
+ (LHS->getNumUses() <= 2 || RHS->getNumUses() <= 2)) {
+ CmpInst::Predicate InvertedPred = getInverseMinMaxPred(SPF);
+ Value *InvertedCmp = Builder.CreateICmp(InvertedPred, A, B);
+ Value *NewSel = Builder.CreateSelect(InvertedCmp, A, B);
+ return BinaryOperator::CreateNot(NewSel);
+ }
+
+ if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder))
+ return I;
}
if (SPF) {
@@ -1570,28 +1798,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return R;
}
- // MAX(~a, ~b) -> ~MIN(a, b)
- if ((SPF == SPF_SMAX || SPF == SPF_UMAX) &&
- IsFreeToInvert(LHS, LHS->hasNUses(2)) &&
- IsFreeToInvert(RHS, RHS->hasNUses(2))) {
- // For this transform to be profitable, we need to eliminate at least two
- // 'not' instructions if we're going to add one 'not' instruction.
- int NumberOfNots =
- (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) +
- (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) +
- (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value())));
-
- if (NumberOfNots >= 2) {
- Value *NewLHS = Builder.CreateNot(LHS);
- Value *NewRHS = Builder.CreateNot(RHS);
- Value *NewCmp = SPF == SPF_SMAX ? Builder.CreateICmpSLT(NewLHS, NewRHS)
- : Builder.CreateICmpULT(NewLHS, NewRHS);
- Value *NewSI =
- Builder.CreateNot(Builder.CreateSelect(NewCmp, NewLHS, NewRHS));
- return replaceInstUsesWith(SI, NewSI);
- }
- }
-
// TODO.
// ABS(-X) -> ABS(X)
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 44bbb84686ab..34f8037e519f 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -87,8 +87,7 @@ static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
// Equal shift amounts in opposite directions become bitwise 'and':
// lshr (shl X, C), C --> and X, C'
// shl (lshr X, C), C --> and X, C'
- unsigned InnerShAmt = InnerShiftConst->getZExtValue();
- if (InnerShAmt == OuterShAmt)
+ if (*InnerShiftConst == OuterShAmt)
return true;
// If the 2nd shift is bigger than the 1st, we can fold:
@@ -98,7 +97,8 @@ static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
// Also, check that the inner shift is valid (less than the type width) or
// we'll crash trying to produce the bit mask for the 'and'.
unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
- if (InnerShAmt > OuterShAmt && InnerShAmt < TypeWidth) {
+ if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) {
+ unsigned InnerShAmt = InnerShiftConst->getZExtValue();
unsigned MaskShift =
IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
@@ -135,7 +135,7 @@ static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
ConstantInt *CI = nullptr;
if ((IsLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
(!IsLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
- if (CI->getZExtValue() == NumBits) {
+ if (CI->getValue() == NumBits) {
// TODO: Check that the input bits are already zero with MaskedValueIsZero
#if 0
// If this is a truncate of a logical shr, we can truncate it to a smaller
@@ -356,8 +356,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// cast of lshr(shl(x,c1),c2) as well as other more complex cases.
if (I.getOpcode() != Instruction::AShr &&
canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
- DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
- " to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
+ LLVM_DEBUG(
+ dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+ " to eliminate shift:\n IN: "
+ << *Op0 << "\n SH: " << I << "\n");
return replaceInstUsesWith(
I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
@@ -370,7 +372,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
assert(!Op1C->uge(TypeBits) &&
"Shift over the type width should have been removed already");
- if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I))
+ if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I))
return FoldedShift;
// Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
@@ -586,23 +588,23 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
}
Instruction *InstCombiner::visitShl(BinaryOperator &I) {
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyShlInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *V = commonShiftTransforms(I))
return V;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
const APInt *ShAmtAPInt;
if (match(Op1, m_APInt(ShAmtAPInt))) {
unsigned ShAmt = ShAmtAPInt->getZExtValue();
- unsigned BitWidth = I.getType()->getScalarSizeInBits();
- Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
// shl (zext X), ShAmt --> zext (shl X, ShAmt)
// This is only valid if X would have zeros shifted out.
@@ -620,11 +622,8 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
}
- // Be careful about hiding shl instructions behind bit masks. They are used
- // to represent multiplies by a constant, and it is important that simple
- // arithmetic expressions are still recognizable by scalar evolution.
- // The inexact versions are deferred to DAGCombine, so we don't hide shl
- // behind a bit mask.
+ // FIXME: we do not yet transform non-exact shr's. The backend (DAGCombine)
+ // needs a few fixes for the rotate pattern recognition first.
const APInt *ShOp1;
if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) {
unsigned ShrAmt = ShOp1->getZExtValue();
@@ -668,6 +667,15 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
}
}
+ // Transform (x >> y) << y to x & (-1 << y)
+ // Valid for any type of right-shift.
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateShl(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
Constant *C1;
if (match(Op1, m_Constant(C1))) {
Constant *C2;
@@ -685,17 +693,17 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
}
Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyLShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *R = commonShiftTransforms(I))
return R;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
const APInt *ShAmtAPInt;
if (match(Op1, m_APInt(ShAmtAPInt))) {
@@ -800,25 +808,34 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
return &I;
}
}
+
+ // Transform (x << y) >> y to x & (-1 >> y)
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateLShr(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
return nullptr;
}
Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyAShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *R = commonShiftTransforms(I))
return R;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
unsigned BitWidth = Ty->getScalarSizeInBits();
const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
+ if (match(Op1, m_APInt(ShAmtAPInt)) && ShAmtAPInt->ult(BitWidth)) {
unsigned ShAmt = ShAmtAPInt->getZExtValue();
// If the shift amount equals the difference in width of the destination
@@ -832,7 +849,8 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
// We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
// we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
const APInt *ShOp1;
- if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1)))) {
+ if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
unsigned ShlAmt = ShOp1->getZExtValue();
if (ShlAmt < ShAmt) {
// (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
@@ -850,7 +868,8 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
}
}
- if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1)))) {
+ if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
// Oversized arithmetic shifts replicate the sign bit.
AmtSum = std::min(AmtSum, BitWidth - 1);
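
(Aside, not part of the upstream patch: the shift folds in this file rest on two mask identities over unsigned integers — equal shifts in opposite directions reduce to an 'and', which is also what the new (x >> y) << y --> x & (-1 << y) and (x << y) >> y --> x & (-1 >> y) transforms use. A minimal C++ check with an arbitrary value:)

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0xDEADBEEFu; // arbitrary test value
  for (unsigned C = 0; C < 32; ++C) {
    // lshr (shl X, C), C --> and X, C'  where C' keeps the low (32 - C) bits.
    assert(((X << C) >> C) == (X & (UINT32_MAX >> C)));
    // shl (lshr X, C), C --> and X, C'  where C' keeps the high (32 - C) bits.
    assert(((X >> C) << C) == (X & (UINT32_MAX << C)));
  }
  return 0;
}
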
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a2e757cb4273..425f5ce384be 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -23,6 +23,17 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
+namespace {
+
+struct AMDGPUImageDMaskIntrinsic {
+ unsigned Intr;
+};
+
+#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
+#include "InstCombineTables.inc"
+
+} // end anonymous namespace
+
/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
@@ -333,7 +344,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownBits InputKnown(SrcBitWidth);
if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
return I;
- Known = Known.zextOrTrunc(BitWidth);
+ Known = InputKnown.zextOrTrunc(BitWidth);
// Any top bits are known to be zero.
if (BitWidth > SrcBitWidth)
Known.Zero.setBitsFrom(SrcBitWidth);
@@ -545,6 +556,27 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
}
break;
}
+ case Instruction::UDiv: {
+ // UDiv doesn't demand low bits that are zero in the divisor.
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ // If the shift is exact, then it does demand the low bits.
+ if (cast<UDivOperator>(I)->isExact())
+ break;
+
+ // FIXME: Take the demanded mask of the result into account.
+ unsigned RHSTrailingZeros = SA->countTrailingZeros();
+ APInt DemandedMaskIn =
+ APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
+ return I;
+
+ // Propagate zero bits from the input.
+ Known.Zero.setHighBits(std::min(
+ BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
+ }
+ break;
+ }
case Instruction::SRem:
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
// X % -1 demands all the bits because we don't want to introduce
@@ -888,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
return nullptr;
}
+/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+ APInt DemandedElts,
+ int DMaskIdx) {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ if (VWidth == 1)
+ return nullptr;
+
+ ConstantInt *NewDMask = nullptr;
+
+ if (DMaskIdx < 0) {
+ // Pretend that a prefix of elements is demanded to simplify the code
+ // below.
+ DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
+ } else {
+ ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
+ if (!DMask)
+ return nullptr; // non-constant dmask is not supported by codegen
+
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+ // Mask off values that are undefined because the dmask doesn't cover them
+ DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
+
+ unsigned NewDMaskVal = 0;
+ unsigned OrigLoadIdx = 0;
+ for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
+ const unsigned Bit = 1 << SrcIdx;
+ if (!!(DMaskVal & Bit)) {
+ if (!!DemandedElts[OrigLoadIdx])
+ NewDMaskVal |= Bit;
+ OrigLoadIdx++;
+ }
+ }
+
+ if (DMaskVal != NewDMaskVal)
+ NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
+ }
+
+ // TODO: Handle 3 vectors when supported in code gen.
+ unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
+ if (!NewNumElts)
+ return UndefValue::get(II->getType());
+
+ if (NewNumElts >= VWidth && DemandedElts.isMask()) {
+ if (NewDMask)
+ II->setArgOperand(DMaskIdx, NewDMask);
+ return nullptr;
+ }
+
+ // Determine the overload types of the original intrinsic.
+ auto IID = II->getIntrinsicID();
+ SmallVector<Intrinsic::IITDescriptor, 16> Table;
+ getIntrinsicInfoTableEntries(IID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+ FunctionType *FTy = II->getCalledFunction()->getFunctionType();
+ SmallVector<Type *, 6> OverloadTys;
+ Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+ Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);
+
+ // Get the new return type overload of the intrinsic.
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *EltTy = II->getType()->getVectorElementType();
+ Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
+
+ OverloadTys[0] = NewTy;
+ Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
+
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+ Args.push_back(II->getArgOperand(I));
+
+ if (NewDMask)
+ Args[DMaskIdx] = NewDMask;
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(II);
+
+ CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
+ NewCall->takeName(II);
+ NewCall->copyMetadata(*II);
+
+ if (NewNumElts == 1) {
+ return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
+ DemandedElts.countTrailingZeros());
+ }
+
+ SmallVector<uint32_t, 8> EltMask;
+ unsigned NewLoadIdx = 0;
+ for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+ if (!!DemandedElts[OrigLoadIdx])
+ EltMask.push_back(NewLoadIdx++);
+ else
+ EltMask.push_back(NewNumElts);
+ }
+
+ Value *Shuffle =
+ Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
+
+ return Shuffle;
+}
+
/// The specified value produces a vector with any number of elements.
/// DemandedElts contains the set of elements that are actually used by the
/// caller. This method analyzes which elements of the operand are undef and
@@ -1187,7 +1323,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
- // div/rem demand all inputs, because they don't want divide by zero.
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
UndefElts2, Depth + 1);
if (TmpV) {
@@ -1247,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (!II) break;
switch (II->getIntrinsicID()) {
- default: break;
-
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
// The instructions for these intrinsics are speced to zero upper bits not
@@ -1273,8 +1406,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// Unary scalar-as-vector operations that work column-wise.
case Intrinsic::x86_sse_rcp_ss:
case Intrinsic::x86_sse_rsqrt_ss:
- case Intrinsic::x86_sse_sqrt_ss:
- case Intrinsic::x86_sse2_sqrt_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1366,18 +1497,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::x86_avx512_mask_sub_sd_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
- case Intrinsic::x86_fma_vfmadd_ss:
- case Intrinsic::x86_fma_vfmsub_ss:
- case Intrinsic::x86_fma_vfnmadd_ss:
- case Intrinsic::x86_fma_vfnmsub_ss:
- case Intrinsic::x86_fma_vfmadd_sd:
- case Intrinsic::x86_fma_vfmsub_sd:
- case Intrinsic::x86_fma_vfnmadd_sd:
- case Intrinsic::x86_fma_vfnmsub_sd:
- case Intrinsic::x86_avx512_mask_vfmadd_ss:
- case Intrinsic::x86_avx512_mask_vfmadd_sd:
- case Intrinsic::x86_avx512_maskz_vfmadd_ss:
- case Intrinsic::x86_avx512_maskz_vfmadd_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1404,68 +1523,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
- case Intrinsic::x86_avx512_mask3_vfmadd_ss:
- case Intrinsic::x86_avx512_mask3_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfmsub_sd:
- case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
- // These intrinsics get the passthru bits from operand 2.
- TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts,
- UndefElts, Depth + 1);
- if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
-
- // If lowest element of a scalar op isn't used then use Arg2.
- if (!DemandedElts[0]) {
- Worklist.Add(II);
- return II->getArgOperand(2);
- }
-
- // Only lower element is used for operand 0 and 1.
- DemandedElts = 1;
- TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
- UndefElts2, Depth + 1);
- if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
- TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
- UndefElts3, Depth + 1);
- if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
-
- // Lower element is undefined if all three lower elements are undefined.
- // Consider things like undef&0. The result is known zero, not undef.
- if (!UndefElts2[0] || !UndefElts3[0])
- UndefElts.clearBit(0);
-
- break;
-
- case Intrinsic::x86_sse2_pmulu_dq:
- case Intrinsic::x86_sse41_pmuldq:
- case Intrinsic::x86_avx2_pmul_dq:
- case Intrinsic::x86_avx2_pmulu_dq:
- case Intrinsic::x86_avx512_pmul_dq_512:
- case Intrinsic::x86_avx512_pmulu_dq_512: {
- Value *Op0 = II->getArgOperand(0);
- Value *Op1 = II->getArgOperand(1);
- unsigned InnerVWidth = Op0->getType()->getVectorNumElements();
- assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
-
- APInt InnerDemandedElts(InnerVWidth, 0);
- for (unsigned i = 0; i != VWidth; ++i)
- if (DemandedElts[i])
- InnerDemandedElts.setBit(i * 2);
-
- UndefElts2 = APInt(InnerVWidth, 0);
- TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2,
- Depth + 1);
- if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
-
- UndefElts3 = APInt(InnerVWidth, 0);
- TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3,
- Depth + 1);
- if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
-
- break;
- }
-
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_sse2_packuswb_128:
@@ -1554,124 +1611,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format:
- case Intrinsic::amdgcn_image_sample:
- case Intrinsic::amdgcn_image_sample_cl:
- case Intrinsic::amdgcn_image_sample_d:
- case Intrinsic::amdgcn_image_sample_d_cl:
- case Intrinsic::amdgcn_image_sample_l:
- case Intrinsic::amdgcn_image_sample_b:
- case Intrinsic::amdgcn_image_sample_b_cl:
- case Intrinsic::amdgcn_image_sample_lz:
- case Intrinsic::amdgcn_image_sample_cd:
- case Intrinsic::amdgcn_image_sample_cd_cl:
-
- case Intrinsic::amdgcn_image_sample_c:
- case Intrinsic::amdgcn_image_sample_c_cl:
- case Intrinsic::amdgcn_image_sample_c_d:
- case Intrinsic::amdgcn_image_sample_c_d_cl:
- case Intrinsic::amdgcn_image_sample_c_l:
- case Intrinsic::amdgcn_image_sample_c_b:
- case Intrinsic::amdgcn_image_sample_c_b_cl:
- case Intrinsic::amdgcn_image_sample_c_lz:
- case Intrinsic::amdgcn_image_sample_c_cd:
- case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
- case Intrinsic::amdgcn_image_sample_o:
- case Intrinsic::amdgcn_image_sample_cl_o:
- case Intrinsic::amdgcn_image_sample_d_o:
- case Intrinsic::amdgcn_image_sample_d_cl_o:
- case Intrinsic::amdgcn_image_sample_l_o:
- case Intrinsic::amdgcn_image_sample_b_o:
- case Intrinsic::amdgcn_image_sample_b_cl_o:
- case Intrinsic::amdgcn_image_sample_lz_o:
- case Intrinsic::amdgcn_image_sample_cd_o:
- case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
- case Intrinsic::amdgcn_image_sample_c_o:
- case Intrinsic::amdgcn_image_sample_c_cl_o:
- case Intrinsic::amdgcn_image_sample_c_d_o:
- case Intrinsic::amdgcn_image_sample_c_d_cl_o:
- case Intrinsic::amdgcn_image_sample_c_l_o:
- case Intrinsic::amdgcn_image_sample_c_b_o:
- case Intrinsic::amdgcn_image_sample_c_b_cl_o:
- case Intrinsic::amdgcn_image_sample_c_lz_o:
- case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
-
- case Intrinsic::amdgcn_image_getlod: {
- if (VWidth == 1 || !DemandedElts.isMask())
- return nullptr;
-
- // TODO: Handle 3 vectors when supported in code gen.
- unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
- if (NewNumElts == VWidth)
- return nullptr;
-
- Module *M = II->getParent()->getParent()->getParent();
- Type *EltTy = V->getType()->getVectorElementType();
-
- Type *NewTy = (NewNumElts == 1) ? EltTy :
- VectorType::get(EltTy, NewNumElts);
-
- auto IID = II->getIntrinsicID();
-
- bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
- IID == Intrinsic::amdgcn_buffer_load_format;
-
- Function *NewIntrin = IsBuffer ?
- Intrinsic::getDeclaration(M, IID, NewTy) :
- // Samplers have 3 mangled types.
- Intrinsic::getDeclaration(M, IID,
- { NewTy, II->getArgOperand(0)->getType(),
- II->getArgOperand(1)->getType()});
-
- SmallVector<Value *, 5> Args;
- for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
- Args.push_back(II->getArgOperand(I));
-
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(II);
-
- CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
- NewCall->takeName(II);
- NewCall->copyMetadata(*II);
-
- if (!IsBuffer) {
- ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
- if (DMask) {
- unsigned DMaskVal = DMask->getZExtValue() & 0xf;
-
- unsigned PopCnt = 0;
- unsigned NewDMask = 0;
- for (unsigned I = 0; I < 4; ++I) {
- const unsigned Bit = 1 << I;
- if (!!(DMaskVal & Bit)) {
- if (++PopCnt > NewNumElts)
- break;
+ return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
+ default: {
+ if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
+ return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
- NewDMask |= Bit;
- }
- }
-
- NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
- }
- }
-
-
- if (NewNumElts == 1) {
- return Builder.CreateInsertElement(UndefValue::get(V->getType()),
- NewCall, static_cast<uint64_t>(0));
- }
-
- SmallVector<uint32_t, 8> EltMask;
- for (unsigned I = 0; I < VWidth; ++I)
- EltMask.push_back(I);
-
- Value *Shuffle = Builder.CreateShuffleVector(
- NewCall, UndefValue::get(NewTy), EltMask);
-
- MadeChange = true;
- return Shuffle;
+ break;
}
}
break;
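
(Aside, not part of the upstream patch: the new Instruction::UDiv case relies on the fact that dividing by a value with K trailing zero bits cannot observe the low K bits of the dividend, so a non-exact udiv does not demand them. A small C++ check with hypothetical values:)

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t D = 40;  // divisor with 3 trailing zero bits (40 = 5 * 8)
  const unsigned K = 3;   // countTrailingZeros(D)
  for (uint32_t X = 0; X < 100000; ++X) {
    // Clearing the low K bits of the dividend never changes the quotient.
    assert(X / D == (X & ~((1u << K) - 1)) / D);
  }
  return 0;
}
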
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineTables.td b/contrib/llvm/lib/Transforms/InstCombine/InstCombineTables.td
new file mode 100644
index 000000000000..98b2adc442fa
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineTables.td
@@ -0,0 +1,11 @@
+include "llvm/TableGen/SearchableTable.td"
+include "llvm/IR/Intrinsics.td"
+
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+ let FilterClass = "AMDGPUImageDMaskIntrinsic";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+ let PrimaryKeyEarlyOut = 1;
+}
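
(Aside, not part of the upstream patch: with FilterClass/PrimaryKey set, the SearchableTable backend emits a sorted array of AMDGPUImageDMaskIntrinsic records and a getAMDGPUImageDMaskIntrinsic(Intr) lookup into InstCombineTables.inc. The generated code is not reproduced here; a rough hand-written equivalent of that lookup shape, with placeholder intrinsic IDs, might look like:)

#include <algorithm>
#include <cassert>
#include <iterator>

struct AMDGPUImageDMaskIntrinsic { unsigned Intr; };

// Hypothetical stand-in for the TableGen-emitted table, sorted by Intr.
static const AMDGPUImageDMaskIntrinsic Table[] = {{101}, {205}, {317}};

const AMDGPUImageDMaskIntrinsic *getAMDGPUImageDMaskIntrinsic(unsigned Intr) {
  auto End = std::end(Table);
  auto It = std::lower_bound(
      std::begin(Table), End, Intr,
      [](const AMDGPUImageDMaskIntrinsic &R, unsigned Key) { return R.Intr < Key; });
  return (It != End && It->Intr == Intr) ? &*It : nullptr;
}

int main() {
  assert(getAMDGPUImageDMaskIntrinsic(205) != nullptr);
  assert(getAMDGPUImageDMaskIntrinsic(4) == nullptr);
  return 0;
}
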
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index aeac8910af6b..2560feb37d66 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1140,6 +1140,216 @@ static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
return true;
}
+/// These are the ingredients in an alternate form binary operator as described
+/// below.
+struct BinopElts {
+ BinaryOperator::BinaryOps Opcode;
+ Value *Op0;
+ Value *Op1;
+ BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
+ Value *V0 = nullptr, Value *V1 = nullptr) :
+ Opcode(Opc), Op0(V0), Op1(V1) {}
+ operator bool() const { return Opcode != 0; }
+};
+
+/// Binops may be transformed into binops with different opcodes and operands.
+/// Reverse the usual canonicalization to enable folds with the non-canonical
+/// form of the binop. If a transform is possible, return the elements of the
+/// new binop. If not, return invalid elements.
+static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
+ Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
+ Type *Ty = BO->getType();
+ switch (BO->getOpcode()) {
+ case Instruction::Shl: {
+ // shl X, C --> mul X, (1 << C)
+ Constant *C;
+ if (match(BO1, m_Constant(C))) {
+ Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
+ return { Instruction::Mul, BO0, ShlOne };
+ }
+ break;
+ }
+ case Instruction::Or: {
+ // or X, C --> add X, C (when X and C have no common bits set)
+ const APInt *C;
+ if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
+ return { Instruction::Add, BO0, BO1 };
+ break;
+ }
+ default:
+ break;
+ }
+ return {};
+}
+
+static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
+ assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
+
+ // Are we shuffling together some value and that same value after it has been
+ // modified by a binop with a constant?
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ Constant *C;
+ bool Op0IsBinop;
+ if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
+ Op0IsBinop = true;
+ else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
+ Op0IsBinop = false;
+ else
+ return nullptr;
+
+ // The identity constant for a binop leaves a variable operand unchanged. For
+ // a vector, this is a splat of something like 0, -1, or 1.
+ // If there's no identity constant for this binop, we're done.
+ auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
+ BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
+ Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
+ if (!IdC)
+ return nullptr;
+
+ // Shuffle identity constants into the lanes that return the original value.
+ // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
+ // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
+ // The existing binop constant vector remains in the same operand position.
+ Constant *Mask = Shuf.getMask();
+ Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
+ ConstantExpr::getShuffleVector(IdC, C, Mask);
+
+ bool MightCreatePoisonOrUB =
+ Mask->containsUndefElement() &&
+ (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
+ if (MightCreatePoisonOrUB)
+ NewC = getSafeVectorConstantForBinop(BOpcode, NewC, true);
+
+ // shuf (bop X, C), X, M --> bop X, C'
+ // shuf X, (bop X, C), M --> bop X, C'
+ Value *X = Op0IsBinop ? Op1 : Op0;
+ Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
+ NewBO->copyIRFlags(BO);
+
+ // An undef shuffle mask element may propagate as an undef constant element in
+ // the new binop. That would produce poison where the original code might not.
+ // If we already made a safe constant, then there's no danger.
+ if (Mask->containsUndefElement() && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
+
+/// Try to fold shuffles that are the equivalent of a vector select.
+static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder,
+ const DataLayout &DL) {
+ if (!Shuf.isSelect())
+ return nullptr;
+
+ if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
+ return I;
+
+ BinaryOperator *B0, *B1;
+ if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
+ !match(Shuf.getOperand(1), m_BinOp(B1)))
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *C0, *C1;
+ bool ConstantsAreOp1;
+ if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
+ match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
+ ConstantsAreOp1 = true;
+ else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
+ match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
+ ConstantsAreOp1 = false;
+ else
+ return nullptr;
+
+ // We need matching binops to fold the lanes together.
+ BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
+ BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
+ bool DropNSW = false;
+ if (ConstantsAreOp1 && Opc0 != Opc1) {
+ // TODO: We drop "nsw" if shift is converted into multiply because it may
+ // not be correct when the shift amount is BitWidth - 1. We could examine
+ // each vector element to determine if it is safe to keep that flag.
+ if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
+ DropNSW = true;
+ if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
+ assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
+ Opc0 = AltB0.Opcode;
+ C0 = cast<Constant>(AltB0.Op1);
+ } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
+ assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
+ Opc1 = AltB1.Opcode;
+ C1 = cast<Constant>(AltB1.Op1);
+ }
+ }
+
+ if (Opc0 != Opc1)
+ return nullptr;
+
+ // The opcodes must be the same. Use a new name to make that clear.
+ BinaryOperator::BinaryOps BOpc = Opc0;
+
+ // Select the constant elements needed for the single binop.
+ Constant *Mask = Shuf.getMask();
+ Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
+
+ // We are moving a binop after a shuffle. When a shuffle has an undefined
+ // mask element, the result is undefined, but it is not poison or undefined
+ // behavior. That is not necessarily true for div/rem/shift.
+ bool MightCreatePoisonOrUB =
+ Mask->containsUndefElement() &&
+ (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
+ if (MightCreatePoisonOrUB)
+ NewC = getSafeVectorConstantForBinop(BOpc, NewC, ConstantsAreOp1);
+
+ Value *V;
+ if (X == Y) {
+ // Remove a binop and the shuffle by rearranging the constant:
+ // shuffle (op V, C0), (op V, C1), M --> op V, C'
+ // shuffle (op C0, V), (op C1, V), M --> op C', V
+ V = X;
+ } else {
+ // If there are 2 different variable operands, we must create a new shuffle
+ // (select) first, so check uses to ensure that we don't end up with more
+ // instructions than we started with.
+ if (!B0->hasOneUse() && !B1->hasOneUse())
+ return nullptr;
+
+ // If we use the original shuffle mask and op1 is *variable*, we would be
+ // putting an undef into operand 1 of div/rem/shift. This is either UB or
+ // poison. We do not have to guard against UB when *constants* are op1
+ // because safe constants guarantee that we do not overflow sdiv/srem (and
+ // there's no danger for other opcodes).
+ // TODO: To allow this case, create a new shuffle mask with no undefs.
+ if (MightCreatePoisonOrUB && !ConstantsAreOp1)
+ return nullptr;
+
+ // Note: In general, we do not create new shuffles in InstCombine because we
+ // do not know if a target can lower an arbitrary shuffle optimally. In this
+ // case, the shuffle uses the existing mask, so there is no additional risk.
+
+ // Select the variable vectors first, then perform the binop:
+ // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
+ // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
+ V = Builder.CreateShuffleVector(X, Y, Mask);
+ }
+
+ Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
+ BinaryOperator::Create(BOpc, NewC, V);
+
+ // Flags are intersected from the 2 source binops. But there are 2 exceptions:
+ // 1. If we changed an opcode, poison conditions might have changed.
+ // 2. If the shuffle had undef mask elements, the new binop might have undefs
+ // where the original code did not. But if we already made a safe constant,
+ // then there's no danger.
+ NewBO->copyIRFlags(B0);
+ NewBO->andIRFlags(B1);
+ if (DropNSW)
+ NewBO->setHasNoSignedWrap(false);
+ if (Mask->containsUndefElement() && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
+
Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *LHS = SVI.getOperand(0);
Value *RHS = SVI.getOperand(1);
@@ -1150,6 +1360,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI)))
return replaceInstUsesWith(SVI, V);
+ if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
+ return I;
+
bool MadeChange = false;
unsigned VWidth = SVI.getType()->getVectorNumElements();
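
(Aside, not part of the upstream patch: getAlternateBinop above leans on two scalar identities — a left shift by a constant is a multiply by a power of two, and 'or' equals 'add' when MaskedValueIsZero holds, i.e. the operands share no set bits. A minimal C++ check on 32-bit unsigned values:)

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0x12345678u; // arbitrary test value
  for (unsigned C = 0; C < 32; ++C) {
    // shl X, C --> mul X, (1 << C); both sides wrap modulo 2^32.
    assert((X << C) == X * (1u << C));
  }
  // or X, C --> add X, C is only valid when X and C have no common set bits.
  const uint32_t Hi = X & 0xFFFF0000u, Lo = 0x00001234u;
  assert((Hi & Lo) == 0 && (Hi | Lo) == Hi + Lo);
  return 0;
}
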
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 8fa7d0684b94..12fcc8752ea9 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -35,6 +35,7 @@
#include "InstCombineInternal.h"
#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/InstCombine.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -56,6 +57,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -73,6 +75,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
@@ -94,8 +97,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -145,12 +146,20 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) {
/// We don't want to convert from a legal to an illegal type or from a smaller
/// to a larger illegal type. A width of '1' is always treated as a legal type
/// because i1 is a fundamental type in IR, and there are many specialized
-/// optimizations for i1 types.
+/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
+/// legal to convert to, in order to open up more combining opportunities.
+/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
+/// from frontend languages.
bool InstCombiner::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
+ // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
+ // shrink types, to prevent infinite loops.
+ if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
+ return true;
+
// If this is a legal integer from type, and the result would be an illegal
// type, don't do the transformation.
if (FromLegal && !ToLegal)
@@ -397,28 +406,23 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
// if C1 and C2 are constants.
+ Value *A, *B;
+ Constant *C1, *C2;
if (Op0 && Op1 &&
Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
- isa<Constant>(Op0->getOperand(1)) &&
- isa<Constant>(Op1->getOperand(1)) &&
- Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *A = Op0->getOperand(0);
- Constant *C1 = cast<Constant>(Op0->getOperand(1));
- Value *B = Op1->getOperand(0);
- Constant *C2 = cast<Constant>(Op1->getOperand(1));
-
- Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
- BinaryOperator *New = BinaryOperator::Create(Opcode, A, B);
- if (isa<FPMathOperator>(New)) {
+ match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
+ match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
+ BinaryOperator *NewBO = BinaryOperator::Create(Opcode, A, B);
+ if (isa<FPMathOperator>(NewBO)) {
FastMathFlags Flags = I.getFastMathFlags();
Flags &= Op0->getFastMathFlags();
Flags &= Op1->getFastMathFlags();
- New->setFastMathFlags(Flags);
+ NewBO->setFastMathFlags(Flags);
}
- InsertNewInstWith(New, I);
- New->takeName(Op1);
- I.setOperand(0, New);
- I.setOperand(1, Folded);
+ InsertNewInstWith(NewBO, I);
+ NewBO->takeName(Op1);
+ I.setOperand(0, NewBO);
+ I.setOperand(1, ConstantExpr::get(Opcode, C1, C2));
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
@@ -435,72 +439,38 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
/// Return whether "X LOp (Y ROp Z)" is always equal to
/// "(X LOp Y) ROp (X LOp Z)".
-static bool LeftDistributesOverRight(Instruction::BinaryOps LOp,
+static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
Instruction::BinaryOps ROp) {
- switch (LOp) {
- default:
- return false;
+ // X & (Y | Z) <--> (X & Y) | (X & Z)
+ // X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
+ if (LOp == Instruction::And)
+ return ROp == Instruction::Or || ROp == Instruction::Xor;
- case Instruction::And:
- // And distributes over Or and Xor.
- switch (ROp) {
- default:
- return false;
- case Instruction::Or:
- case Instruction::Xor:
- return true;
- }
+ // X | (Y & Z) <--> (X | Y) & (X | Z)
+ if (LOp == Instruction::Or)
+ return ROp == Instruction::And;
- case Instruction::Mul:
- // Multiplication distributes over addition and subtraction.
- switch (ROp) {
- default:
- return false;
- case Instruction::Add:
- case Instruction::Sub:
- return true;
- }
+ // X * (Y + Z) <--> (X * Y) + (X * Z)
+ // X * (Y - Z) <--> (X * Y) - (X * Z)
+ if (LOp == Instruction::Mul)
+ return ROp == Instruction::Add || ROp == Instruction::Sub;
- case Instruction::Or:
- // Or distributes over And.
- switch (ROp) {
- default:
- return false;
- case Instruction::And:
- return true;
- }
- }
+ return false;
}
/// Return whether "(X LOp Y) ROp Z" is always equal to
/// "(X ROp Z) LOp (Y ROp Z)".
-static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
+static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
Instruction::BinaryOps ROp) {
if (Instruction::isCommutative(ROp))
- return LeftDistributesOverRight(ROp, LOp);
+ return leftDistributesOverRight(ROp, LOp);
+
+ // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
+ return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);
- switch (LOp) {
- default:
- return false;
- // (X >> Z) & (Y >> Z) -> (X&Y) >> Z for all shifts.
- // (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts.
- // (X >> Z) ^ (Y >> Z) -> (X^Y) >> Z for all shifts.
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- switch (ROp) {
- default:
- return false;
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- return true;
- }
- }
// TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
// but this requires knowing that the addition does not overflow and other
// such subtleties.
- return false;
}
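
(Aside, not part of the upstream patch: the rewritten predicates collapse the old switch statements into the distributive laws spelled out in the comments, all of which hold bit-for-bit on unsigned integers. A minimal C++ check with arbitrary values:)

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0xA5A5A5A5u, Y = 0x0F0F1234u, Z = 0x33CC55AAu;
  // leftDistributesOverRight
  assert((X & (Y | Z)) == ((X & Y) | (X & Z)));
  assert((X & (Y ^ Z)) == ((X & Y) ^ (X & Z)));
  assert((X | (Y & Z)) == ((X | Y) & (X | Z)));
  assert(X * (Y + Z) == X * Y + X * Z); // wraps consistently modulo 2^32
  assert(X * (Y - Z) == X * Y - X * Z);
  // rightDistributesOverLeft: bitwise logic distributes over any shift.
  const unsigned C = 7;
  assert(((X ^ Y) >> C) == ((X >> C) ^ (Y >> C)));
  assert(((X & Y) << C) == ((X << C) & (Y << C)));
  return 0;
}
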
/// This function returns identity value for given opcode, which can be used to
@@ -512,37 +482,27 @@ static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
}
-/// This function factors binary ops which can be combined using distributive
-/// laws. This function tries to transform 'Op' based TopLevelOpcode to enable
-/// factorization e.g for ADD(SHL(X , 2), MUL(X, 5)), When this function called
-/// with TopLevelOpcode == Instruction::Add and Op = SHL(X, 2), transforms
-/// SHL(X, 2) to MUL(X, 4) i.e. returns Instruction::Mul with LHS set to 'X' and
-/// RHS to 4.
+/// This function predicates factorization using distributive laws. By default,
+/// it just returns the 'Op' inputs. But for special-cases like
+/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
+/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
+/// allow more factorization opportunities.
static Instruction::BinaryOps
-getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
- BinaryOperator *Op, Value *&LHS, Value *&RHS) {
+getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
+ Value *&LHS, Value *&RHS) {
assert(Op && "Expected a binary operator");
-
LHS = Op->getOperand(0);
RHS = Op->getOperand(1);
-
- switch (TopLevelOpcode) {
- default:
- return Op->getOpcode();
-
- case Instruction::Add:
- case Instruction::Sub:
- if (Op->getOpcode() == Instruction::Shl) {
- if (Constant *CST = dyn_cast<Constant>(Op->getOperand(1))) {
- // The multiplier is really 1 << CST.
- RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST);
- return Instruction::Mul;
- }
+ if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
+ Constant *C;
+ if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
+ // X << C --> X * (1 << C)
+ RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
+ return Instruction::Mul;
}
- return Op->getOpcode();
+ // TODO: We can add other conversions e.g. shr => div etc.
}
-
- // TODO: We can add other conversions e.g. shr => div etc.
+ return Op->getOpcode();
}
/// This tries to simplify binary operations by factorizing out common terms
@@ -561,7 +521,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I,
bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
// Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
- if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode))
+ if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode))
// Does the instruction have the form "(A op' B) op (A op' D)" or, in the
// commutative case, "(A op' B) op (C op' A)"?
if (A == C || (InnerCommutative && A == D)) {
@@ -580,7 +540,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I,
}
// Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
- if (!SimplifiedInst && RightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+ if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
// Does the instruction have the form "(A op' B) op (C op' B)" or, in the
// commutative case, "(A op' B) op (B op' D)"?
if (B == D || (InnerCommutative && B == C)) {
@@ -665,21 +625,19 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
// term.
if (Op0)
if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
- if (Value *V =
- tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
return V;
// The instruction has the form "(B) op (C op' D)". Try to factorize common
// term.
if (Op1)
if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
- if (Value *V =
- tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
+ if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
return V;
}
// Expansion.
- if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
+ if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
// The instruction has the form "(A op' B) op C". See if expanding it out
// to "(A op C) op' (B op C)" results in simplifications.
Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
@@ -716,7 +674,7 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
}
}
- if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
+ if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
// The instruction has the form "A op (B op' C)". See if expanding it out
// to "(A op B) op' (A op C)" results in simplifications.
Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
@@ -818,23 +776,6 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const {
return nullptr;
}
-/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is
-/// a constant negative zero (which is the 'negate' form).
-Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {
- if (BinaryOperator::isFNeg(V, IgnoreZeroSign))
- return BinaryOperator::getFNegArgument(V);
-
- // Constants can be considered to be negated values if they can be folded.
- if (ConstantFP *C = dyn_cast<ConstantFP>(V))
- return ConstantExpr::getFNeg(C);
-
- if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
- if (C->getType()->getElementType()->isFloatingPointTy())
- return ConstantExpr::getFNeg(C);
-
- return nullptr;
-}
-
static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner::BuilderTy &Builder) {
if (auto *Cast = dyn_cast<CastInst>(&I))
@@ -1082,8 +1023,9 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
return replaceInstUsesWith(I, NewPN);
}
-Instruction *InstCombiner::foldOpWithConstantIntoOperand(BinaryOperator &I) {
- assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type");
+Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
+ if (!isa<Constant>(I.getOperand(1)))
+ return nullptr;
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
@@ -1108,7 +1050,7 @@ Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
- Type *IntPtrTy = DL.getIntPtrType(PtrTy);
+ Type *IndexTy = DL.getIndexType(PtrTy);
int64_t FirstIdx = 0;
if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
@@ -1123,7 +1065,7 @@ Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
}
- NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+ NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
// Index into the types. If we fail, set OrigBase to null.
while (Offset) {
@@ -1145,7 +1087,7 @@ Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
} else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
assert(EltSize && "Cannot index into a zero-sized array");
- NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
+ NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
Offset %= EltSize;
Ty = AT->getElementType();
} else {
@@ -1409,22 +1351,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
} while (true);
}
-/// \brief Creates node of binary operation with the same attributes as the
-/// specified one but with other operands.
-static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy &B) {
- Value *BO = B.CreateBinOp(Inst.getOpcode(), LHS, RHS);
- // If LHS and RHS are constant, BO won't be a binary operator.
- if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO))
- NewBO->copyIRFlags(&Inst);
- return BO;
-}
-
-/// \brief Makes transformation of binary operation specific for vector types.
-/// \param Inst Binary operator to transform.
-/// \return Pointer to node that must replace the original binary operator, or
-/// null pointer if no transformation was made.
-Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
+Instruction *InstCombiner::foldShuffledBinop(BinaryOperator &Inst) {
if (!Inst.getType()->isVectorTy()) return nullptr;
// It may not be safe to reorder shuffles and things like div, urem, etc.
@@ -1438,58 +1365,71 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth);
assert(cast<VectorType>(RHS->getType())->getNumElements() == VWidth);
+ auto createBinOpShuffle = [&](Value *X, Value *Y, Constant *M) {
+ Value *XY = Builder.CreateBinOp(Inst.getOpcode(), X, Y);
+ if (auto *BO = dyn_cast<BinaryOperator>(XY))
+ BO->copyIRFlags(&Inst);
+ return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
+ };
+
// If both arguments of the binary operation are shuffles that use the same
- // mask and shuffle within a single vector, move the shuffle after the binop:
- // Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m)
- auto *LShuf = dyn_cast<ShuffleVectorInst>(LHS);
- auto *RShuf = dyn_cast<ShuffleVectorInst>(RHS);
- if (LShuf && RShuf && LShuf->getMask() == RShuf->getMask() &&
- isa<UndefValue>(LShuf->getOperand(1)) &&
- isa<UndefValue>(RShuf->getOperand(1)) &&
- LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType()) {
- Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
- RShuf->getOperand(0), Builder);
- return Builder.CreateShuffleVector(
- NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
+ // mask and shuffle within a single vector, move the shuffle after the binop.
+ Value *V1, *V2;
+ Constant *Mask;
+ if (match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))) &&
+ match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(Mask))) &&
+ V1->getType() == V2->getType() &&
+ (LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
+ // Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
+ return createBinOpShuffle(V1, V2, Mask);
}
- // If one argument is a shuffle within one vector, the other is a constant,
- // try moving the shuffle after the binary operation.
- ShuffleVectorInst *Shuffle = nullptr;
- Constant *C1 = nullptr;
- if (isa<ShuffleVectorInst>(LHS)) Shuffle = cast<ShuffleVectorInst>(LHS);
- if (isa<ShuffleVectorInst>(RHS)) Shuffle = cast<ShuffleVectorInst>(RHS);
- if (isa<Constant>(LHS)) C1 = cast<Constant>(LHS);
- if (isa<Constant>(RHS)) C1 = cast<Constant>(RHS);
- if (Shuffle && C1 &&
- (isa<ConstantVector>(C1) || isa<ConstantDataVector>(C1)) &&
- isa<UndefValue>(Shuffle->getOperand(1)) &&
- Shuffle->getType() == Shuffle->getOperand(0)->getType()) {
- SmallVector<int, 16> ShMask = Shuffle->getShuffleMask();
- // Find constant C2 that has property:
- // shuffle(C2, ShMask) = C1
- // If such constant does not exist (example: ShMask=<0,0> and C1=<1,2>)
- // reorder is not possible.
- SmallVector<Constant*, 16> C2M(VWidth,
- UndefValue::get(C1->getType()->getScalarType()));
+ // If one argument is a shuffle within one vector and the other is a constant,
+ // try moving the shuffle after the binary operation. This canonicalization
+ // intends to move shuffles closer to other shuffles and binops closer to
+ // other binops, so they can be folded. It may also enable demanded elements
+ // transforms.
+ Constant *C;
+ if (match(&Inst, m_c_BinOp(
+ m_OneUse(m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))),
+ m_Constant(C))) &&
+ V1->getType() == Inst.getType()) {
+ // Find constant NewC that has property:
+ // shuffle(NewC, ShMask) = C
+ // If such constant does not exist (example: ShMask=<0,0> and C=<1,2>)
+ // reorder is not possible. A 1-to-1 mapping is not required. Example:
+ // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
+ SmallVector<int, 16> ShMask;
+ ShuffleVectorInst::getShuffleMask(Mask, ShMask);
+ SmallVector<Constant *, 16>
+ NewVecC(VWidth, UndefValue::get(C->getType()->getScalarType()));
bool MayChange = true;
for (unsigned I = 0; I < VWidth; ++I) {
if (ShMask[I] >= 0) {
assert(ShMask[I] < (int)VWidth);
- if (!isa<UndefValue>(C2M[ShMask[I]])) {
+ Constant *CElt = C->getAggregateElement(I);
+ Constant *NewCElt = NewVecC[ShMask[I]];
+ if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt)) {
MayChange = false;
break;
}
- C2M[ShMask[I]] = C1->getAggregateElement(I);
+ NewVecC[ShMask[I]] = CElt;
}
}
if (MayChange) {
- Constant *C2 = ConstantVector::get(C2M);
- Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0);
- Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2;
- Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder);
- return Builder.CreateShuffleVector(NewBO,
- UndefValue::get(Inst.getType()), Shuffle->getMask());
+ Constant *NewC = ConstantVector::get(NewVecC);
+ // It may not be safe to execute a binop on a vector with undef elements
+ // because the entire instruction can be folded to undef or create poison
+ // that did not exist in the original code.
+ bool ConstOp1 = isa<Constant>(Inst.getOperand(1));
+ if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
+ NewC = getSafeVectorConstantForBinop(Inst.getOpcode(), NewC, ConstOp1);
+
+ // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
+ // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
+ Value *NewLHS = isa<Constant>(LHS) ? NewC : V1;
+ Value *NewRHS = isa<Constant>(LHS) ? V1 : NewC;
+ return createBinOpShuffle(NewLHS, NewRHS, Mask);
}
}
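
(Aside, not part of the upstream patch: foldShuffledBinop moves a binop below a shuffle when both operands are shuffled by the same mask, because permuting both inputs and then combining them lane by lane gives the same result as combining first and permuting afterwards. A small C++ sketch over plain arrays with hypothetical values:)

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  const std::array<uint32_t, 4> V1 = {10, 20, 30, 40};
  const std::array<uint32_t, 4> V2 = {1, 2, 3, 4};
  const std::array<int, 4> Mask = {3, 1, 0, 2}; // same mask on both operands

  std::array<uint32_t, 4> Sum{}, ShufThenAdd{}, AddThenShuf{};
  for (int I = 0; I < 4; ++I)
    Sum[I] = V1[I] + V2[I];
  for (int I = 0; I < 4; ++I) {
    ShufThenAdd[I] = V1[Mask[I]] + V2[Mask[I]]; // Op(shuffle(V1, M), shuffle(V2, M))
    AddThenShuf[I] = Sum[Mask[I]];              // shuffle(Op(V1, V2), M)
  }
  assert(ShufThenAdd == AddThenShuf);
  return 0;
}
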
@@ -1498,9 +1438,9 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
-
- if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops,
- SQ.getWithInstruction(&GEP)))
+ Type *GEPType = GEP.getType();
+ Type *GEPEltType = GEP.getSourceElementType();
+ if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
Value *PtrOp = GEP.getOperand(0);
@@ -1508,8 +1448,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Eliminate unneeded casts for indices, and replace indices which displace
// by multiples of a zero size type with zero.
bool MadeChange = false;
- Type *IntPtrTy =
- DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType());
+
+ // Index width may not be the same width as pointer width.
+ // Data layout chooses the right type based on supported integer types.
+ Type *NewScalarIndexTy =
+ DL.getIndexType(GEP.getPointerOperandType()->getScalarType());
gep_type_iterator GTI = gep_type_begin(GEP);
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
@@ -1518,10 +1461,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (GTI.isStruct())
continue;
- // Index type should have the same width as IntPtr
Type *IndexTy = (*I)->getType();
- Type *NewIndexType = IndexTy->isVectorTy() ?
- VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy;
+ Type *NewIndexType =
+ IndexTy->isVectorTy()
+ ? VectorType::get(NewScalarIndexTy, IndexTy->getVectorNumElements())
+ : NewScalarIndexTy;
// If the element type has zero size then any index over it is equivalent
// to an index of zero, so replace it with zero if it is not zero already.
@@ -1544,8 +1488,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return &GEP;
// Check to see if the inputs to the PHI node are getelementptr instructions.
- if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) {
- GetElementPtrInst *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+ if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
+ auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
if (!Op1)
return nullptr;
@@ -1561,7 +1505,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
int DI = -1;
for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
- GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I);
+ auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
return nullptr;
@@ -1603,7 +1547,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (J > 0) {
if (J == 1) {
CurTy = Op1->getSourceElementType();
- } else if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) {
+ } else if (auto *CT = dyn_cast<CompositeType>(CurTy)) {
CurTy = CT->getTypeAtIndex(Op1->getOperand(J));
} else {
CurTy = nullptr;
@@ -1618,7 +1562,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (DI != -1 && !PN->hasOneUse())
return nullptr;
- GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+ auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
if (DI == -1) {
// All the GEPs feeding the PHI are identical. Clone one down into our
// BB so that it can be merged with the current GEP.
@@ -1653,15 +1597,64 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Combine Indices - If the source pointer to this getelementptr instruction
// is a getelementptr instruction, combine the indices of the two
// getelementptr instructions into a single instruction.
- if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) {
+ if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
return nullptr;
+ // Try to reassociate loop invariant GEP chains to enable LICM.
+ if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
+ Src->hasOneUse()) {
+ if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+ Value *GO1 = GEP.getOperand(1);
+ Value *SO1 = Src->getOperand(1);
+ // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+ // invariant: this breaks the dependence between GEPs and allows LICM
+ // to hoist the invariant part out of the loop.
+ if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+ // We have to be careful here.
+ // We have something like:
+ // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
+ // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
+        // If we just swap idx & idx2 then we could inadvertently
+ // change %src from a vector to a scalar, or vice versa.
+ // Cases:
+ // 1) %base a scalar & idx a scalar & idx2 a vector
+ // => Swapping idx & idx2 turns %src into a vector type.
+ // 2) %base a scalar & idx a vector & idx2 a scalar
+        //     => Swapping idx & idx2 turns %src into a scalar type
+ // 3) %base, %idx, and %idx2 are scalars
+ // => %src & %gep are scalars
+ // => swapping idx & idx2 is safe
+ // 4) %base a vector
+ // => %src is a vector
+ // => swapping idx & idx2 is safe.
+ auto *SO0 = Src->getOperand(0);
+ auto *SO0Ty = SO0->getType();
+ if (!isa<VectorType>(GEPType) || // case 3
+ isa<VectorType>(SO0Ty)) { // case 4
+ Src->setOperand(1, GO1);
+ GEP.setOperand(1, SO1);
+ return &GEP;
+ } else {
+ // Case 1 or 2
+ // -- have to recreate %src & %gep
+ // put NewSrc at same location as %src
+ Builder.SetInsertPoint(cast<Instruction>(PtrOp));
+ auto *NewSrc = cast<GetElementPtrInst>(
+ Builder.CreateGEP(SO0, GO1, Src->getName()));
+ NewSrc->setIsInBounds(Src->isInBounds());
+ auto *NewGEP = GetElementPtrInst::Create(nullptr, NewSrc, {SO1});
+ NewGEP->setIsInBounds(GEP.isInBounds());
+ return NewGEP;
+ }
+ }
+ }
+ }
+
// Note that if our source is a gep chain itself then we wait for that
// chain to be resolved before we perform this transformation. This
// avoids us creating a TON of code in some cases.
- if (GEPOperator *SrcGEP =
- dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
return nullptr; // Wait until our source is folded to completion.
@@ -1724,9 +1717,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (GEP.getNumIndices() == 1) {
unsigned AS = GEP.getPointerAddressSpace();
if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
- DL.getPointerSizeInBits(AS)) {
- Type *Ty = GEP.getSourceElementType();
- uint64_t TyAllocSize = DL.getTypeAllocSize(Ty);
+ DL.getIndexSizeInBits(AS)) {
+ uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType);
bool Matched = false;
uint64_t C;
@@ -1753,22 +1745,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Operator *Index = cast<Operator>(V);
Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
- return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+ return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
}
// Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
// to (bitcast Y)
Value *Y;
if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
- m_PtrToInt(m_Specific(GEP.getOperand(0)))))) {
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y,
- GEP.getType());
- }
+ m_PtrToInt(m_Specific(GEP.getOperand(0))))))
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
}
}
}
// We do not handle pointer-vector geps here.
- if (GEP.getType()->isVectorTy())
+ if (GEPType->isVectorTy())
return nullptr;
// Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
@@ -1777,7 +1767,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (StrippedPtr != PtrOp) {
bool HasZeroPointerIndex = false;
- if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
+ if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
HasZeroPointerIndex = C->isZero();
// Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
@@ -1788,8 +1778,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
//
// This occurs when the program declares an array extern like "int X[];"
if (HasZeroPointerIndex) {
- if (ArrayType *CATy =
- dyn_cast<ArrayType>(GEP.getSourceElementType())) {
+ if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
// GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == StrippedPtrTy->getElementType()) {
// -> GEP i8* X, ...
@@ -1805,11 +1794,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// ->
// %0 = GEP i8 addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
- return new AddrSpaceCastInst(Builder.Insert(Res), GEP.getType());
+ return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
}
- if (ArrayType *XATy =
- dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){
+ if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrTy->getElementType())) {
// GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == XATy->getElementType()) {
// -> GEP [10 x i8]* X, i32 0, ...
@@ -1837,7 +1825,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
nullptr, StrippedPtr, Idx, GEP.getName())
: Builder.CreateGEP(nullptr, StrippedPtr, Idx,
GEP.getName());
- return new AddrSpaceCastInst(NewGEP, GEP.getType());
+ return new AddrSpaceCastInst(NewGEP, GEPType);
}
}
}
@@ -1845,12 +1833,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Transform things like:
// %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
// into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
- Type *SrcElTy = StrippedPtrTy->getElementType();
- Type *ResElTy = GEP.getSourceElementType();
- if (SrcElTy->isArrayTy() &&
- DL.getTypeAllocSize(SrcElTy->getArrayElementType()) ==
- DL.getTypeAllocSize(ResElTy)) {
- Type *IdxType = DL.getIntPtrType(GEP.getType());
+ Type *SrcEltTy = StrippedPtrTy->getElementType();
+ if (SrcEltTy->isArrayTy() &&
+ DL.getTypeAllocSize(SrcEltTy->getArrayElementType()) ==
+ DL.getTypeAllocSize(GEPEltType)) {
+ Type *IdxType = DL.getIndexType(GEPType);
Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
Value *NewGEP =
GEP.isInBounds()
@@ -1859,28 +1846,28 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
: Builder.CreateGEP(nullptr, StrippedPtr, Idx, GEP.getName());
// V and GEP are both pointer types --> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEP.getType());
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
}
// Transform things like:
// %V = mul i64 %N, 4
// %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
// into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
- if (ResElTy->isSized() && SrcElTy->isSized()) {
+ if (GEPEltType->isSized() && SrcEltTy->isSized()) {
// Check that changing the type amounts to dividing the index by a scale
// factor.
- uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
- uint64_t SrcSize = DL.getTypeAllocSize(SrcElTy);
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType);
+ uint64_t SrcSize = DL.getTypeAllocSize(SrcEltTy);
if (ResSize && SrcSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = SrcSize / ResSize;
- // Earlier transforms ensure that the index has type IntPtrType, which
- // considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
- "Index not cast to pointer width?");
+ // Earlier transforms ensure that the index has the right type
+ // according to Data Layout, which considerably simplifies the
+ // logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
@@ -1896,7 +1883,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEP.getType());
+ GEPType);
}
}
}
@@ -1905,39 +1892,40 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
// (where tmp = 8*tmp2) into:
// getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
- if (ResElTy->isSized() && SrcElTy->isSized() && SrcElTy->isArrayTy()) {
+ if (GEPEltType->isSized() && SrcEltTy->isSized() &&
+ SrcEltTy->isArrayTy()) {
// Check that changing to the array element type amounts to dividing the
// index by a scale factor.
- uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType);
uint64_t ArrayEltSize =
- DL.getTypeAllocSize(SrcElTy->getArrayElementType());
+ DL.getTypeAllocSize(SrcEltTy->getArrayElementType());
if (ResSize && ArrayEltSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = ArrayEltSize / ResSize;
- // Earlier transforms ensure that the index has type IntPtrType, which
- // considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
- "Index not cast to pointer width?");
+ // Earlier transforms ensure that the index has the right type
+ // according to the Data Layout, which considerably simplifies
+ // the logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
- Value *Off[2] = {
- Constant::getNullValue(DL.getIntPtrType(GEP.getType())),
- NewIdx};
+ Type *IndTy = DL.getIndexType(GEPType);
+ Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
Value *NewGEP = GEP.isInBounds() && NSW
? Builder.CreateInBoundsGEP(
- SrcElTy, StrippedPtr, Off, GEP.getName())
- : Builder.CreateGEP(SrcElTy, StrippedPtr, Off,
+ SrcEltTy, StrippedPtr, Off, GEP.getName())
+ : Builder.CreateGEP(SrcEltTy, StrippedPtr, Off,
GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEP.getType());
+ GEPType);
}
}
}
@@ -1948,34 +1936,52 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// addrspacecast. To take advantage of the below bitcast + struct GEP, look
// through the addrspacecast.
Value *ASCStrippedPtrOp = PtrOp;
- if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
// X = bitcast A addrspace(1)* to B addrspace(1)*
// Y = addrspacecast A addrspace(1)* to B addrspace(2)*
// Z = gep Y, <...constant indices...>
// Into an addrspacecasted GEP of the struct.
- if (BitCastInst *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
+ if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
ASCStrippedPtrOp = BC;
}
- /// See if we can simplify:
- /// X = bitcast A* to B*
- /// Y = gep X, <...constant indices...>
- /// into a gep of the original struct. This is important for SROA and alias
- /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
- Value *Operand = BCI->getOperand(0);
- PointerType *OpType = cast<PointerType>(Operand->getType());
- unsigned OffsetBits = DL.getPointerTypeSizeInBits(GEP.getType());
- APInt Offset(OffsetBits, 0);
- if (!isa<BitCastInst>(Operand) &&
- GEP.accumulateConstantOffset(DL, Offset)) {
+ if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
+ Value *SrcOp = BCI->getOperand(0);
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
+ Type *SrcEltType = SrcType->getElementType();
+
+ // GEP directly using the source operand if this GEP is accessing an element
+ // of a bitcasted pointer to vector or array of the same dimensions:
+ // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
+ // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
+ auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy) {
+ return ArrTy->getArrayElementType() == VecTy->getVectorElementType() &&
+ ArrTy->getArrayNumElements() == VecTy->getVectorNumElements();
+ };
+ if (GEP.getNumOperands() == 3 &&
+ ((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() &&
+ areMatchingArrayAndVecTypes(GEPEltType, SrcEltType)) ||
+ (GEPEltType->isVectorTy() && SrcEltType->isArrayTy() &&
+ areMatchingArrayAndVecTypes(SrcEltType, GEPEltType)))) {
+ GEP.setOperand(0, SrcOp);
+ GEP.setSourceElementType(SrcEltType);
+ return &GEP;
+ }
+ // See if we can simplify:
+ // X = bitcast A* to B*
+ // Y = gep X, <...constant indices...>
+ // into a gep of the original struct. This is important for SROA and alias
+ // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
+ APInt Offset(OffsetBits, 0);
+ if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
// If this GEP instruction doesn't move the pointer, just replace the GEP
// with a bitcast of the real input to the dest type.
if (!Offset) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, &TLI)) {
+ if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
@@ -1987,43 +1993,43 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
- if (Operand->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(Operand, GEP.getType());
- return new BitCastInst(Operand, GEP.getType());
+ if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(SrcOp, GEPType);
+ return new BitCastInst(SrcOp, GEPType);
}
// Otherwise, if the offset is non-zero, we need to find out if there is a
// field at Offset in 'A's type. If so, we can pull the cast through the
// GEP.
SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
+ if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
Value *NGEP =
GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(nullptr, Operand, NewIndices)
- : Builder.CreateGEP(nullptr, Operand, NewIndices);
+ ? Builder.CreateInBoundsGEP(nullptr, SrcOp, NewIndices)
+ : Builder.CreateGEP(nullptr, SrcOp, NewIndices);
- if (NGEP->getType() == GEP.getType())
+ if (NGEP->getType() == GEPType)
return replaceInstUsesWith(GEP, NGEP);
NGEP->takeName(&GEP);
if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEP.getType());
- return new BitCastInst(NGEP, GEP.getType());
+ return new AddrSpaceCastInst(NGEP, GEPType);
+ return new BitCastInst(NGEP, GEPType);
}
}
}
if (!GEP.isInBounds()) {
- unsigned PtrWidth =
- DL.getPointerSizeInBits(PtrOp->getType()->getPointerAddressSpace());
- APInt BasePtrOffset(PtrWidth, 0);
+ unsigned IdxWidth =
+ DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace());
+ APInt BasePtrOffset(IdxWidth, 0);
Value *UnderlyingPtrOp =
PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
BasePtrOffset);
if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
BasePtrOffset.isNonNegative()) {
- APInt AllocSize(PtrWidth, DL.getTypeAllocSize(AI->getAllocatedType()));
+ APInt AllocSize(IdxWidth, DL.getTypeAllocSize(AI->getAllocatedType()));
if (BasePtrOffset.ule(AllocSize)) {
return GetElementPtrInst::CreateInBounds(
PtrOp, makeArrayRef(Ops).slice(1), GEP.getName());
@@ -2200,7 +2206,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
return nullptr;
}
-/// \brief Move the call to free before a NULL test.
+/// Move the call to free before a NULL test.
///
/// Check if this free is accessed after its argument has been tested
/// against NULL (property 0).
@@ -2564,6 +2570,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
case EHPersonality::MSVC_Win64SEH:
case EHPersonality::MSVC_CXX:
case EHPersonality::CoreCLR:
+ case EHPersonality::Wasm_CXX:
return TypeInfo->isNullValue();
}
llvm_unreachable("invalid enum");
@@ -2891,6 +2898,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
/// block.
static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
assert(I->hasOneUse() && "Invariants didn't hold!");
+ BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
@@ -2920,10 +2928,20 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
if (Scan->mayWriteToMemory())
return false;
}
-
BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
I->moveBefore(&*InsertPos);
++NumSunkInst;
+
+ // Also sink all related debug uses from the source basic block. Otherwise we
+ // get a debug use before the def.
+ SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, I);
+ for (auto *DII : DbgUsers) {
+ if (DII->getParent() == SrcBlock) {
+ DII->moveBefore(&*InsertPos);
+ LLVM_DEBUG(dbgs() << "SINK: " << *DII << '\n');
+ }
+ }
return true;
}
@@ -2934,7 +2952,7 @@ bool InstCombiner::run() {
// Check to see if we can DCE the instruction.
if (isInstructionTriviallyDead(I, &TLI)) {
- DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
eraseInstFromFunction(*I);
++NumDeadInst;
MadeIRChange = true;
@@ -2948,7 +2966,8 @@ bool InstCombiner::run() {
if (!I->use_empty() &&
(I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
- DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
+ << '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
@@ -2967,8 +2986,8 @@ bool InstCombiner::run() {
KnownBits Known = computeKnownBits(I, /*Depth*/0, I);
if (Known.isConstant()) {
Constant *C = ConstantInt::get(Ty, Known.getConstant());
- DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C <<
- " from: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C
+ << " from: " << *I << '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
@@ -3007,7 +3026,7 @@ bool InstCombiner::run() {
if (UserIsSuccessor && UserParent->getUniquePredecessor()) {
// Okay, the CFG is simple enough, try to sink this instruction.
if (TryToSinkInstruction(I, UserParent)) {
- DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
MadeIRChange = true;
// We'll add uses of the sunk instruction below, but since sinking
// can expose opportunities for its *operands*, add them to the
@@ -3027,15 +3046,15 @@ bool InstCombiner::run() {
#ifndef NDEBUG
std::string OrigI;
#endif
- DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
- DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
+ LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
+ LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
if (Instruction *Result = visit(*I)) {
++NumCombined;
// Should we replace the old instruction with a new one?
if (Result != I) {
- DEBUG(dbgs() << "IC: Old = " << *I << '\n'
- << " New = " << *Result << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
+ << " New = " << *Result << '\n');
if (I->getDebugLoc())
Result->setDebugLoc(I->getDebugLoc());
@@ -3062,8 +3081,8 @@ bool InstCombiner::run() {
eraseInstFromFunction(*I);
} else {
- DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
- << " New = " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
+ << " New = " << *I << '\n');
// If the instruction was modified, it's possible that it is now dead.
// if so, remove it.
@@ -3114,7 +3133,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
// DCE instruction if trivially dead.
if (isInstructionTriviallyDead(Inst, TLI)) {
++NumDeadInst;
- DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
salvageDebugInfo(*Inst);
Inst->eraseFromParent();
MadeIRChange = true;
@@ -3125,8 +3144,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
if (!Inst->use_empty() &&
(Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
- DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: "
- << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
+ << '\n');
Inst->replaceAllUsesWith(C);
++NumConstProp;
if (isInstructionTriviallyDead(Inst, TLI))
@@ -3148,9 +3167,9 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
FoldRes = C;
if (FoldRes != C) {
- DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
- << "\n Old = " << *C
- << "\n New = " << *FoldRes << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ << "\n Old = " << *C
+ << "\n New = " << *FoldRes << '\n');
U = FoldRes;
MadeIRChange = true;
}
@@ -3193,7 +3212,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
return MadeIRChange;
}
-/// \brief Populate the IC worklist from a function, and prune any dead basic
+/// Populate the IC worklist from a function, and prune any dead basic
/// blocks discovered in the process.
///
/// This also does basic constant propagation and other forward fixing to make
@@ -3253,8 +3272,8 @@ static bool combineInstructionsOverFunction(
int Iteration = 0;
while (true) {
++Iteration;
- DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
- << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+ << F.getName() << "\n");
MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
@@ -3350,3 +3369,7 @@ void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) {
return new InstructionCombiningPass(ExpensiveCombines);
}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstructionCombiningPass());
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8e39f24d819c..b3f659194558 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -25,6 +25,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/IR/Argument.h"
@@ -71,7 +72,6 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
@@ -107,10 +107,18 @@ static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
+static const uint64_t kMyriadShadowScale = 5;
+static const uint64_t kMyriadMemoryOffset32 = 0x80000000ULL;
+static const uint64_t kMyriadMemorySize32 = 0x20000000ULL;
+static const uint64_t kMyriadTagShift = 29;
+static const uint64_t kMyriadDDRTag = 4;
+static const uint64_t kMyriadCacheBitMask32 = 0x40000000ULL;
+
// The shadow memory space is dynamically allocated.
static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
@@ -145,7 +153,7 @@ static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
static const int kMaxAsanStackMallocSizeClass = 10;
static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
-static const char *const kAsanGenPrefix = "__asan_gen_";
+static const char *const kAsanGenPrefix = "___asan_gen_";
static const char *const kODRGenPrefix = "__odr_asan_gen_";
static const char *const kSanCovGenPrefix = "__sancov_gen_";
static const char *const kAsanSetShadowPrefix = "__asan_set_shadow_";
@@ -485,18 +493,17 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
bool IsSystemZ = TargetTriple.getArch() == Triple::systemz;
bool IsX86 = TargetTriple.getArch() == Triple::x86;
bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS32 = TargetTriple.getArch() == Triple::mips ||
- TargetTriple.getArch() == Triple::mipsel;
- bool IsMIPS64 = TargetTriple.getArch() == Triple::mips64 ||
- TargetTriple.getArch() == Triple::mips64el;
+ bool IsMIPS32 = TargetTriple.isMIPS32();
+ bool IsMIPS64 = TargetTriple.isMIPS64();
bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb();
bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64;
bool IsWindows = TargetTriple.isOSWindows();
bool IsFuchsia = TargetTriple.isOSFuchsia();
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
ShadowMapping Mapping;
- Mapping.Scale = kDefaultShadowScale;
+ Mapping.Scale = IsMyriad ? kMyriadShadowScale : kDefaultShadowScale;
if (ClMappingScale.getNumOccurrences() > 0) {
Mapping.Scale = ClMappingScale;
}
@@ -508,11 +515,18 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
Mapping.Offset = kMIPS32_ShadowOffset32;
else if (IsFreeBSD)
Mapping.Offset = kFreeBSD_ShadowOffset32;
+ else if (IsNetBSD)
+ Mapping.Offset = kNetBSD_ShadowOffset32;
else if (IsIOS)
// If we're targeting iOS and x86, the binary is built for iOS simulator.
Mapping.Offset = IsX86 ? kIOSSimShadowOffset32 : kIOSShadowOffset32;
else if (IsWindows)
Mapping.Offset = kWindowsShadowOffset32;
+ else if (IsMyriad) {
+ uint64_t ShadowOffset = (kMyriadMemoryOffset32 + kMyriadMemorySize32 -
+ (kMyriadMemorySize32 >> Mapping.Scale));
+ Mapping.Offset = ShadowOffset - (kMyriadMemoryOffset32 >> Mapping.Scale);
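+ // For example, with the defaults above (DDR base 0x80000000, size
+ // 0x20000000, scale 5) the shadow ends up in the top 16 MiB of DDR,
+ // [0x9F000000, 0xA0000000), and Mapping.Offset becomes 0x9B000000.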
+ }
else
Mapping.Offset = kDefaultShadowOffset32;
} else { // LongSize == 64
@@ -589,9 +603,10 @@ struct AddressSanitizer : public FunctionPass {
explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false,
bool UseAfterScope = false)
- : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan),
- Recover(Recover || ClRecover),
- UseAfterScope(UseAfterScope || ClUseAfterScope) {
+ : FunctionPass(ID), UseAfterScope(UseAfterScope || ClUseAfterScope) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+ this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
+ ClEnableKasan : CompileKernel;
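+ // Note: values given explicitly on the command line take precedence over
+ // the constructor arguments above.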
initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
}
@@ -717,8 +732,7 @@ public:
explicit AddressSanitizerModule(bool CompileKernel = false,
bool Recover = false,
bool UseGlobalsGC = true)
- : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
- Recover(Recover || ClRecover),
+ : ModulePass(ID),
UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
// Not a typo: ClWithComdat is almost completely pointless without
// ClUseGlobalsGC (because then it only works on modules without
@@ -727,7 +741,12 @@ public:
// argument is designed as workaround. Therefore, disable both
// ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
// do globals-gc.
- UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ?
+ ClRecover : Recover;
+ this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
+ ClEnableKasan : CompileKernel;
+ }
bool runOnModule(Module &M) override;
StringRef getPassName() const override { return "AddressSanitizerModule"; }
@@ -869,7 +888,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
processStaticAllocas();
if (ClDebugStack) {
- DEBUG(dbgs() << F);
+ LLVM_DEBUG(dbgs() << F);
}
return true;
}
@@ -888,13 +907,13 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
void createDynamicAllocasInitStorage();
// ----------------------- Visitors.
- /// \brief Collect all Ret instructions.
+ /// Collect all Ret instructions.
void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); }
- /// \brief Collect all Resume instructions.
+ /// Collect all Resume instructions.
void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
- /// \brief Collect all CatchReturnInst instructions.
+ /// Collect all CatchReturnInst instructions.
void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
@@ -942,7 +961,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
// requested memory, but also left, partial and right redzones.
void handleDynamicAllocaCall(AllocaInst *AI);
- /// \brief Collect Alloca instructions we want (and can) handle.
+ /// Collect Alloca instructions we want (and can) handle.
void visitAllocaInst(AllocaInst &AI) {
if (!ASan.isInterestingAlloca(AI)) {
if (AI.isStaticAlloca()) {
@@ -963,7 +982,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
AllocaVec.push_back(&AI);
}
- /// \brief Collect lifetime intrinsic calls to check for use-after-scope
+ /// Collect lifetime intrinsic calls to check for use-after-scope
/// errors.
void visitIntrinsicInst(IntrinsicInst &II) {
Intrinsic::ID ID = II.getIntrinsicID();
@@ -1081,7 +1100,7 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
return Res;
}
-// \brief Create a constant for Str so that we can pass it to the run-time lib.
+// Create a constant for Str so that we can pass it to the run-time lib.
static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
bool AllowMerging) {
Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
@@ -1095,7 +1114,7 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
return GV;
}
-/// \brief Create a global describing a source location.
+/// Create a global describing a source location.
static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
LocationMetadata MD) {
Constant *LocData[] = {
@@ -1111,7 +1130,7 @@ static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
return GV;
}
-/// \brief Check if \p G has been created by a trusted compiler pass.
+/// Check if \p G has been created by a trusted compiler pass.
static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
// Do not instrument asan globals.
if (G->getName().startswith(kAsanGenPrefix) ||
@@ -1487,6 +1506,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
uint32_t TypeSize, bool IsWrite,
Value *SizeArgument, bool UseCalls,
uint32_t Exp) {
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
+
IRBuilder<> IRB(InsertBefore);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
@@ -1501,6 +1522,23 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
return;
}
+ if (IsMyriad) {
+ // Strip the cache bit and do range check.
+ // AddrLong &= ~kMyriadCacheBitMask32
+ AddrLong = IRB.CreateAnd(AddrLong, ~kMyriadCacheBitMask32);
+ // Tag = AddrLong >> kMyriadTagShift
+ Value *Tag = IRB.CreateLShr(AddrLong, kMyriadTagShift);
+ // Tag == kMyriadDDRTag
+ Value *TagCheck =
+ IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
+
+ TerminatorInst *TagCheckTerm = SplitBlockAndInsertIfThen(
+ TagCheck, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+ assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
+ IRB.SetInsertPoint(TagCheckTerm);
+ InsertBefore = TagCheckTerm;
+ }
+
Type *ShadowTy =
IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
@@ -1609,7 +1647,7 @@ void AddressSanitizerModule::createInitializerPoisonCalls(
bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
Type *Ty = G->getValueType();
- DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
if (GlobalsMD.get(G).IsBlacklisted) return false;
if (!Ty->isSized()) return false;
@@ -1646,12 +1684,17 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
return false;
}
- // Callbacks put into the CRT initializer/terminator sections
- // should not be instrumented.
+ // On COFF, if the section name contains '$', it is highly likely that the
+ // user is using section sorting to create an array of globals similar to
+ // the way initialization callbacks are registered in .init_array and
+ // .CRT$XCU. The ATL also registers things in .ATL$__[azm]. Adding redzones
+ // to such globals is counterproductive, because the intent is that they
+ // will form an array, and out-of-bounds accesses are expected.
// See https://github.com/google/sanitizers/issues/305
// and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
- if (Section.startswith(".CRT")) {
- DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n");
+ if (TargetTriple.isOSBinFormatCOFF() && Section.contains('$')) {
+ LLVM_DEBUG(dbgs() << "Ignoring global in sorted section (contains '$'): "
+ << *G << "\n");
return false;
}
@@ -1668,7 +1711,7 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
// them.
if (ParsedSegment == "__OBJC" ||
(ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
- DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
return false;
}
// See https://github.com/google/sanitizers/issues/32
@@ -1680,13 +1723,13 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
// Therefore there's no point in placing redzones into __DATA,__cfstring.
// Moreover, it causes the linker to crash on OS X 10.7
if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
- DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
return false;
}
// The linker merges the contents of cstring_literals and removes the
// trailing zeroes.
if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
- DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
return false;
}
}
@@ -2153,11 +2196,21 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
- DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
+ LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
Initializers[i] = Initializer;
}
+ // Add instrumented globals to the llvm.compiler.used list to prevent LTO from
+ // ConstantMerge'ing them.
+ SmallVector<GlobalValue *, 16> GlobalsToAddToUsedList;
+ for (size_t i = 0; i < n; i++) {
+ GlobalVariable *G = NewGlobals[i];
+ if (G->getName().empty()) continue;
+ GlobalsToAddToUsedList.push_back(G);
+ }
+ appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
+
std::string ELFUniqueModuleId =
(UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
: "";
@@ -2177,7 +2230,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
if (HasDynamicallyInitializedGlobals)
createInitializerPoisonCalls(M, ModuleName);
- DEBUG(dbgs() << M);
+ LLVM_DEBUG(dbgs() << M);
return true;
}
@@ -2247,7 +2300,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
const std::string TypeStr = AccessIsWrite ? "store" : "load";
const std::string ExpStr = Exp ? "exp_" : "";
- const std::string SuffixStr = CompileKernel ? "N" : "_n";
const std::string EndingStr = Recover ? "_noabort" : "";
SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
@@ -2259,8 +2311,7 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
}
AsanErrorCallbackSized[AccessIsWrite][Exp] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr +
- EndingStr,
+ kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr,
FunctionType::get(IRB.getVoidTy(), Args2, false)));
AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
@@ -2420,7 +2471,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
// Leave if the function doesn't need instrumentation.
if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
- DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+ LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
initializeCallbacks(*F.getParent());
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -2435,7 +2486,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
// We want to instrument every address only once per basic block (unless there
// are calls between uses).
- SmallSet<Value *, 16> TempsToInstrument;
+ SmallPtrSet<Value *, 16> TempsToInstrument;
SmallVector<Instruction *, 16> ToInstrument;
SmallVector<Instruction *, 8> NoReturnCalls;
SmallVector<BasicBlock *, 16> AllBlocks;
@@ -2494,7 +2545,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
}
bool UseCalls =
- CompileKernel ||
(ClInstrumentationWithCallsThreshold >= 0 &&
ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -2534,8 +2584,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
if (NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty())
FunctionModified = true;
- DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
- << F << "\n");
+ LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
+ << F << "\n");
return FunctionModified;
}
@@ -2710,7 +2760,7 @@ void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
Arg.replaceAllUsesWith(AI);
uint64_t AllocSize = DL.getTypeAllocSize(Ty);
- IRB.CreateMemCpy(AI, &Arg, AllocSize, Align);
+ IRB.CreateMemCpy(AI, Align, &Arg, Align, AllocSize);
}
}
}
@@ -2851,7 +2901,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
}
auto DescriptionString = ComputeASanStackFrameDescription(SVD);
- DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
+ LLVM_DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
uint64_t LocalStackSize = L.FrameSize;
bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
LocalStackSize <= kMaxStackMallocSize;
@@ -3086,7 +3136,8 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
} else if (GetElementPtrInst *EP = dyn_cast<GetElementPtrInst>(V)) {
Res = findAllocaForValue(EP->getPointerOperand());
} else {
- DEBUG(dbgs() << "Alloca search canceled on unknown instruction: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Alloca search canceled on unknown instruction: " << *V
+ << "\n");
}
if (Res) AllocaForValue[V] = Res;
return Res;
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index be9a22a8681b..e13db08e263c 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
@@ -59,11 +60,11 @@ template <typename GetTrapBBT>
static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
const DataLayout &DL, TargetLibraryInfo &TLI,
ObjectSizeOffsetEvaluator &ObjSizeEval,
- BuilderTy &IRB,
- GetTrapBBT GetTrapBB) {
+ BuilderTy &IRB, GetTrapBBT GetTrapBB,
+ ScalarEvolution &SE) {
uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
- DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
- << " bytes\n");
+ LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+ << " bytes\n");
SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
@@ -79,6 +80,10 @@ static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
Type *IntTy = DL.getIntPtrType(Ptr->getType());
Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+ auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
+ auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
+ auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));
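+ // If these ranges prove that a check below can never fire, that check is
+ // folded to a constant and no runtime comparison is emitted for it.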
+
// three checks are required to ensure safety:
// . Offset >= 0 (since the offset is given from the base ptr)
// . Size >= Offset (unsigned)
@@ -87,10 +92,17 @@ static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
// optimization: if Size >= 0 (signed), skip 1st check
// FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows
Value *ObjSize = IRB.CreateSub(Size, Offset);
- Value *Cmp2 = IRB.CreateICmpULT(Size, Offset);
- Value *Cmp3 = IRB.CreateICmpULT(ObjSize, NeededSizeVal);
+ Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(Size, Offset);
+ Value *Cmp3 = SizeRange.sub(OffsetRange)
+ .getUnsignedMin()
+ .uge(NeededSizeRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
Value *Or = IRB.CreateOr(Cmp2, Cmp3);
- if (!SizeCI || SizeCI->getValue().slt(0)) {
+ if ((!SizeCI || SizeCI->getValue().slt(0)) &&
+ !SizeRange.getSignedMin().isNonNegative()) {
Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
Or = IRB.CreateOr(Cmp1, Or);
}
@@ -123,7 +135,8 @@ static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
return true;
}
-static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI) {
+static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
+ ScalarEvolution &SE) {
const DataLayout &DL = F.getParent()->getDataLayout();
ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(),
/*RoundToAlign=*/true);
@@ -168,19 +181,19 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI) {
BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
MadeChange |= instrumentMemAccess(LI->getPointerOperand(), LI, DL, TLI,
- ObjSizeEval, IRB, GetTrapBB);
+ ObjSizeEval, IRB, GetTrapBB, SE);
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
MadeChange |=
instrumentMemAccess(SI->getPointerOperand(), SI->getValueOperand(),
- DL, TLI, ObjSizeEval, IRB, GetTrapBB);
+ DL, TLI, ObjSizeEval, IRB, GetTrapBB, SE);
} else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
MadeChange |=
instrumentMemAccess(AI->getPointerOperand(), AI->getCompareOperand(),
- DL, TLI, ObjSizeEval, IRB, GetTrapBB);
+ DL, TLI, ObjSizeEval, IRB, GetTrapBB, SE);
} else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
MadeChange |=
instrumentMemAccess(AI->getPointerOperand(), AI->getValOperand(), DL,
- TLI, ObjSizeEval, IRB, GetTrapBB);
+ TLI, ObjSizeEval, IRB, GetTrapBB, SE);
} else {
llvm_unreachable("unknown Instruction type");
}
@@ -190,8 +203,9 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI) {
PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- if (!addBoundsChecking(F, TLI))
+ if (!addBoundsChecking(F, TLI, SE))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
@@ -207,11 +221,13 @@ struct BoundsCheckingLegacyPass : public FunctionPass {
bool runOnFunction(Function &F) override {
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return addBoundsChecking(F, TLI);
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return addBoundsChecking(F, TLI, SE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
}
};
} // namespace
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
index 075e5672cff8..cc9b149d0b6a 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
+++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
@@ -31,7 +31,7 @@
namespace llvm {
-/// \brief An union-find based Minimum Spanning Tree for CFG
+/// A union-find based Minimum Spanning Tree for CFG
///
/// Implements a Union-find algorithm to compute Minimum Spanning Tree
/// for a given CFG.
@@ -97,7 +97,7 @@ public:
// Edges with large weight will be put into MST first so they are less likely
// to be instrumented.
void buildEdges() {
- DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
const BasicBlock *Entry = &(F.getEntryBlock());
uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
@@ -107,8 +107,8 @@ public:
// Add a fake edge to the entry.
EntryIncoming = &addEdge(nullptr, Entry, EntryWeight);
- DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
- << " w = " << EntryWeight << "\n");
+ LLVM_DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
+ << " w = " << EntryWeight << "\n");
// Special handling for single BB functions.
if (succ_empty(Entry)) {
@@ -138,8 +138,8 @@ public:
Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
auto *E = &addEdge(&*BB, TargetBB, Weight);
E->IsCritical = Critical;
- DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
- << TargetBB->getName() << " w=" << Weight << "\n");
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
+ << TargetBB->getName() << " w=" << Weight << "\n");
// Keep track of entry/exit edges:
if (&*BB == Entry) {
@@ -164,8 +164,8 @@ public:
MaxExitOutWeight = BBWeight;
ExitOutgoing = ExitO;
}
- DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
- << " w = " << BBWeight << "\n");
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
+ << " w = " << BBWeight << "\n");
}
}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/contrib/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
new file mode 100644
index 000000000000..9606b3da2475
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -0,0 +1,100 @@
+//===-- CGProfile.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Transforms/Instrumentation.h"
+
+#include <array>
+
+using namespace llvm;
+
+PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
+ MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ InstrProfSymtab Symtab;
+ auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
+ Function *CalledF, uint64_t NewCount) {
+ if (!CalledF || !TTI.isLoweredToCall(CalledF))
+ return;
+ uint64_t &Count = Counts[std::make_pair(F, CalledF)];
+ Count = SaturatingAdd(Count, NewCount);
+ };
+ // Ignore error here. Indirect calls are ignored if this fails.
+ (void)(bool)Symtab.create(M);
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ if (BFI.getEntryFreq() == 0)
+ continue;
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ for (auto &BB : F) {
+ Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB);
+ if (!BBCount)
+ continue;
+ for (auto &I : BB) {
+ CallSite CS(&I);
+ if (!CS)
+ continue;
+ if (CS.isIndirectCall()) {
+ InstrProfValueData ValueData[8];
+ uint32_t ActualNumValueData;
+ uint64_t TotalC;
+ if (!getValueProfDataFromInst(*CS.getInstruction(),
+ IPVK_IndirectCallTarget, 8, ValueData,
+ ActualNumValueData, TotalC))
+ continue;
+ for (const auto &VD :
+ ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) {
+ UpdateCounts(TTI, &F, Symtab.getFunction(VD.Value), VD.Count);
+ }
+ continue;
+ }
+ UpdateCounts(TTI, &F, CS.getCalledFunction(), *BBCount);
+ }
+ }
+ }
+
+ addModuleFlags(M, Counts);
+
+ return PreservedAnalyses::all();
+}
+
+void CGProfilePass::addModuleFlags(
+ Module &M,
+ MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) const {
+ if (Counts.empty())
+ return;
+
+ LLVMContext &Context = M.getContext();
+ MDBuilder MDB(Context);
+ std::vector<Metadata *> Nodes;
+
+ for (auto E : Counts) {
+ SmallVector<Metadata *, 3> Vals;
+ Vals.push_back(ValueAsMetadata::get(E.first.first));
+ Vals.push_back(ValueAsMetadata::get(E.first.second));
+ Vals.push_back(MDB.createConstant(
+ ConstantInt::get(Type::getInt64Ty(Context), E.second)));
+ Nodes.push_back(MDNode::get(Context, Vals));
+ }
+
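+ // This appends a "CG Profile" module flag whose payload is a list of
+ // !{<caller>, <callee>, i64 <count>} nodes, one per observed call edge.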
+ M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 09bcbb282653..bb0e4379d1a8 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -56,6 +56,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -90,7 +91,6 @@
#include "llvm/Support/SpecialCaseList.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -211,6 +211,72 @@ class DFSanABIList {
}
};
+/// TransformedFunction is used to express the result of transforming one
+/// function type into another. This struct is immutable. It holds metadata
+/// useful for updating calls of the old function to the new type.
+struct TransformedFunction {
+ TransformedFunction(FunctionType* OriginalType,
+ FunctionType* TransformedType,
+ std::vector<unsigned> ArgumentIndexMapping)
+ : OriginalType(OriginalType),
+ TransformedType(TransformedType),
+ ArgumentIndexMapping(ArgumentIndexMapping) {}
+
+ // Disallow copies.
+ TransformedFunction(const TransformedFunction&) = delete;
+ TransformedFunction& operator=(const TransformedFunction&) = delete;
+
+ // Allow moves.
+ TransformedFunction(TransformedFunction&&) = default;
+ TransformedFunction& operator=(TransformedFunction&&) = default;
+
+ /// Type of the function before the transformation.
+ FunctionType* const OriginalType;
+
+ /// Type of the function after the transformation.
+ FunctionType* const TransformedType;
+
+ /// Transforming a function may change the position of arguments. This
+ /// member records the mapping from each argument's old position to its new
+ /// position. Argument positions are zero-indexed. If the transformation
+ /// from F to F' made the first argument of F into the third argument of F',
+ /// then ArgumentIndexMapping[0] will equal 2.
+ const std::vector<unsigned> ArgumentIndexMapping;
+};
+
+/// Given function attributes from a call site for the original function,
+/// return function attributes appropriate for a call to the transformed
+/// function.
+AttributeList TransformFunctionAttributes(
+ const TransformedFunction& TransformedFunction,
+ LLVMContext& Ctx, AttributeList CallSiteAttrs) {
+
+ // Construct a vector of AttributeSet for each function argument.
+ std::vector<llvm::AttributeSet> ArgumentAttributes(
+ TransformedFunction.TransformedType->getNumParams());
+
+ // Copy attributes from the parameters of the original function to the
+ // transformed version. 'ArgumentIndexMapping' holds the mapping from
+ // old argument position to new.
+ for (unsigned i=0, ie = TransformedFunction.ArgumentIndexMapping.size();
+ i < ie; ++i) {
+ unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[i];
+ ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(i);
+ }
+
+ // Copy annotations on varargs arguments.
+ for (unsigned i = TransformedFunction.OriginalType->getNumParams(),
+ ie = CallSiteAttrs.getNumAttrSets(); i<ie; ++i) {
+ ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(i));
+ }
+
+ return AttributeList::get(
+ Ctx,
+ CallSiteAttrs.getFnAttributes(),
+ CallSiteAttrs.getRetAttributes(),
+ llvm::makeArrayRef(ArgumentAttributes));
+}
+
class DataFlowSanitizer : public ModulePass {
friend struct DFSanFunction;
friend class DFSanVisitor;
@@ -294,7 +360,7 @@ class DataFlowSanitizer : public ModulePass {
bool isInstrumented(const GlobalAlias *GA);
FunctionType *getArgsFunctionType(FunctionType *T);
FunctionType *getTrampolineFunctionType(FunctionType *T);
- FunctionType *getCustomFunctionType(FunctionType *T);
+ TransformedFunction getCustomFunctionType(FunctionType *T);
InstrumentedABI getInstrumentedABI();
WrapperKind getWrapperKind(Function *F);
void addGlobalNamePrefix(GlobalValue *GV);
@@ -437,17 +503,25 @@ FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
return FunctionType::get(T->getReturnType(), ArgTypes, false);
}
-FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
SmallVector<Type *, 4> ArgTypes;
- for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
- i != e; ++i) {
+
+ // Some parameters of the custom function being constructed are
+ // parameters of T. Record the mapping from parameters of T to
+ // parameters of the custom function, so that parameter attributes
+ // at call sites can be updated.
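+ // For example, for void (i32, void (*)(i32)), the function-pointer
+ // parameter is expanded below into a trampoline pointer plus an i8*
+ // argument, so ArgumentIndexMapping is {0, 1} and any parameter after
+ // it would shift up by one position.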
+ std::vector<unsigned> ArgumentIndexMapping;
+ for (unsigned i = 0, ie = T->getNumParams(); i != ie; ++i) {
+ Type* param_type = T->getParamType(i);
FunctionType *FT;
- if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>(
- *i)->getElementType()))) {
+ if (isa<PointerType>(param_type) && (FT = dyn_cast<FunctionType>(
+ cast<PointerType>(param_type)->getElementType()))) {
+ ArgumentIndexMapping.push_back(ArgTypes.size());
ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
} else {
- ArgTypes.push_back(*i);
+ ArgumentIndexMapping.push_back(ArgTypes.size());
+ ArgTypes.push_back(param_type);
}
}
for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
@@ -457,14 +531,15 @@ FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
ArgTypes.push_back(ShadowPtrTy);
- return FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg());
+ return TransformedFunction(
+ T, FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()),
+ ArgumentIndexMapping);
}
bool DataFlowSanitizer::doInitialization(Module &M) {
Triple TargetTriple(M.getTargetTriple());
bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS64 = TargetTriple.getArch() == Triple::mips64 ||
- TargetTriple.getArch() == Triple::mips64el;
+ bool IsMIPS64 = TargetTriple.isMIPS64();
bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64 ||
TargetTriple.getArch() == Triple::aarch64_be;
@@ -783,9 +858,17 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
FunctionType *NewFT = getInstrumentedABI() == IA_Args
? getArgsFunctionType(FT)
: FT;
+
+ // If the function being wrapped has local linkage, then preserve the
+ // function's linkage in the wrapper function.
+ GlobalValue::LinkageTypes wrapperLinkage =
+ F.hasLocalLinkage()
+ ? F.getLinkage()
+ : GlobalValue::LinkOnceODRLinkage;
+
Function *NewF = buildWrapperFunction(
&F, std::string("dfsw$") + std::string(F.getName()),
- GlobalValue::LinkOnceODRLinkage, NewFT);
+ wrapperLinkage, NewFT);
if (getInstrumentedABI() == IA_TLS)
NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
@@ -1382,20 +1465,19 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
Value *LenShadow = IRB.CreateMul(
I.getLength(),
ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
- Value *AlignShadow;
- if (ClPreserveAlignment) {
- AlignShadow = IRB.CreateMul(I.getAlignmentCst(),
- ConstantInt::get(I.getAlignmentCst()->getType(),
- DFSF.DFS.ShadowWidth / 8));
- } else {
- AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(),
- DFSF.DFS.ShadowWidth / 8);
- }
Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
- IRB.CreateCall(I.getCalledValue(), {DestShadow, SrcShadow, LenShadow,
- AlignShadow, I.getVolatileCst()});
+ auto *MTI = cast<MemTransferInst>(
+ IRB.CreateCall(I.getCalledValue(),
+ {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()}));
+ if (ClPreserveAlignment) {
+ MTI->setDestAlignment(I.getDestAlignment() * (DFSF.DFS.ShadowWidth / 8));
+ MTI->setSourceAlignment(I.getSourceAlignment() * (DFSF.DFS.ShadowWidth / 8));
+ } else {
+ MTI->setDestAlignment(DFSF.DFS.ShadowWidth / 8);
+ MTI->setSourceAlignment(DFSF.DFS.ShadowWidth / 8);
+ }
}
void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
@@ -1460,11 +1542,11 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
// wrapper.
if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
FunctionType *FT = F->getFunctionType();
- FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT);
+ TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT);
std::string CustomFName = "__dfsw_";
CustomFName += F->getName();
- Constant *CustomF =
- DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT);
+ Constant *CustomF = DFSF.DFS.Mod->getOrInsertFunction(
+ CustomFName, CustomFn.TransformedType);
if (Function *CustomFn = dyn_cast<Function>(CustomF)) {
CustomFn->copyAttributesFrom(F);
@@ -1532,7 +1614,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
CustomCI->setCallingConv(CI->getCallingConv());
- CustomCI->setAttributes(CI->getAttributes());
+ CustomCI->setAttributes(TransformFunctionAttributes(CustomFn,
+ CI->getContext(), CI->getAttributes()));
// Update the parameter attributes of the custom call instruction to
// zero extend the shadow parameters. This is required for targets
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 6864d295525c..33f220a893df 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -33,7 +34,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -537,7 +537,7 @@ void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
bool EfficiencySanitizer::initOnModule(Module &M) {
Triple TargetTriple(M.getTargetTriple());
- if (TargetTriple.getArch() == Triple::mips64 || TargetTriple.getArch() == Triple::mips64el)
+ if (TargetTriple.isMIPS64())
ShadowParams = ShadowParams40;
else
ShadowParams = ShadowParams47;
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 67ca8172b0d5..acd27c2e226f 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -17,11 +17,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
@@ -35,8 +37,8 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/GCOVProfiler.h"
#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <memory>
@@ -84,7 +86,7 @@ public:
ReversedVersion[3] = Options.Version[0];
ReversedVersion[4] = '\0';
}
- bool runOnModule(Module &M);
+ bool runOnModule(Module &M, const TargetLibraryInfo &TLI);
private:
// Create the .gcno files for the Module based on DebugInfo.
@@ -130,6 +132,7 @@ private:
SmallVector<uint32_t, 4> FileChecksums;
Module *M;
+ const TargetLibraryInfo *TLI;
LLVMContext *Ctx;
SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
};
@@ -145,7 +148,14 @@ public:
}
StringRef getPassName() const override { return "GCOV Profiler"; }
- bool runOnModule(Module &M) override { return Profiler.runOnModule(M); }
+ bool runOnModule(Module &M) override {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return Profiler.runOnModule(M, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
private:
GCOVProfiler Profiler;
@@ -153,8 +163,13 @@ private:
}
char GCOVProfilerLegacyPass::ID = 0;
-INITIALIZE_PASS(GCOVProfilerLegacyPass, "insert-gcov-profiling",
- "Insert instrumentation for GCOV profiling", false, false)
+INITIALIZE_PASS_BEGIN(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
return new GCOVProfilerLegacyPass(Options);
@@ -272,7 +287,7 @@ namespace {
write(Len);
write(Number);
- std::sort(
+ llvm::sort(
SortedLinesByFile.begin(), SortedLinesByFile.end(),
[](StringMapEntry<GCOVLines> *LHS, StringMapEntry<GCOVLines> *RHS) {
return LHS->getKey() < RHS->getKey();
@@ -315,7 +330,7 @@ namespace {
ReturnBlock(1, os) {
this->os = os;
- DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
+ LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
uint32_t i = 0;
for (auto &BB : *F) {
@@ -383,7 +398,7 @@ namespace {
for (int i = 0, e = Blocks.size() + 1; i != e; ++i) {
write(0); // No flags on our blocks.
}
- DEBUG(dbgs() << Blocks.size() << " blocks.\n");
+ LLVM_DEBUG(dbgs() << Blocks.size() << " blocks.\n");
// Emit edges between blocks.
if (Blocks.empty()) return;
@@ -396,8 +411,8 @@ namespace {
write(Block.OutEdges.size() * 2 + 1);
write(Block.Number);
for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) {
- DEBUG(dbgs() << Block.Number << " -> " << Block.OutEdges[i]->Number
- << "\n");
+ LLVM_DEBUG(dbgs() << Block.Number << " -> "
+ << Block.OutEdges[i]->Number << "\n");
write(Block.OutEdges[i]->Number);
write(0); // no flags
}
@@ -461,8 +476,9 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
return CurPath.str();
}
-bool GCOVProfiler::runOnModule(Module &M) {
+bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) {
this->M = &M;
+ this->TLI = &TLI;
Ctx = &M.getContext();
if (Options.EmitNotes) emitProfileNotes();
@@ -475,7 +491,8 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M,
GCOVProfiler Profiler(GCOVOpts);
- if (!Profiler.runOnModule(M))
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!Profiler.runOnModule(M, TLI))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
@@ -503,11 +520,11 @@ static bool functionHasLines(Function &F) {
return false;
}
-static bool isUsingFuncletBasedEH(Function &F) {
+static bool isUsingScopeBasedEH(Function &F) {
if (!F.hasPersonalityFn()) return false;
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
- return isFuncletEHPersonality(Personality);
+ return isScopedEHPersonality(Personality);
}
static bool shouldKeepInEntry(BasicBlock::iterator It) {
@@ -550,8 +567,8 @@ void GCOVProfiler::emitProfileNotes() {
DISubprogram *SP = F.getSubprogram();
if (!SP) continue;
if (!functionHasLines(F)) continue;
- // TODO: Functions using funclet-based EH are currently not supported.
- if (isUsingFuncletBasedEH(F)) continue;
+ // TODO: Functions using scope-based EH are currently not supported.
+ if (isUsingScopeBasedEH(F)) continue;
// gcov expects every function to start with an entry block that has a
// single successor, so split the entry block to make sure of that.
@@ -629,8 +646,8 @@ bool GCOVProfiler::emitProfileArcs() {
DISubprogram *SP = F.getSubprogram();
if (!SP) continue;
if (!functionHasLines(F)) continue;
- // TODO: Functions using funclet-based EH are currently not supported.
- if (isUsingFuncletBasedEH(F)) continue;
+ // TODO: Functions using scope-based EH are currently not supported.
+ if (isUsingScopeBasedEH(F)) continue;
if (!Result) Result = true;
unsigned Edges = 0;
@@ -807,7 +824,12 @@ Constant *GCOVProfiler::getStartFileFunc() {
Type::getInt32Ty(*Ctx), // uint32_t checksum
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- return M->getOrInsertFunction("llvm_gcda_start_file", FTy);
+ auto *Res = M->getOrInsertFunction("llvm_gcda_start_file", FTy);
+ if (Function *FunRes = dyn_cast<Function>(Res))
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ FunRes->addParamAttr(2, AK);
+ return Res;
+
}
Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
@@ -830,7 +852,15 @@ Constant *GCOVProfiler::getEmitFunctionFunc() {
Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+ auto *Res = M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+ if (Function *FunRes = dyn_cast<Function>(Res))
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ FunRes->addParamAttr(0, AK);
+ FunRes->addParamAttr(2, AK);
+ FunRes->addParamAttr(3, AK);
+ FunRes->addParamAttr(4, AK);
+ }
+ return Res;
}
Constant *GCOVProfiler::getEmitArcsFunc() {
@@ -839,7 +869,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
Type::getInt64PtrTy(*Ctx), // uint64_t *counters
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
+ auto *Res = M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
+ if (Function *FunRes = dyn_cast<Function>(Res))
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ FunRes->addParamAttr(0, AK);
+ return Res;
}
Constant *GCOVProfiler::getSummaryInfoFunc() {
@@ -886,46 +920,205 @@ Function *GCOVProfiler::insertCounterWriteout(
Constant *SummaryInfo = getSummaryInfoFunc();
Constant *EndFile = getEndFileFunc();
- NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
- if (CU_Nodes) {
- for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
- auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+ NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (!CUNodes) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
- // Skip module skeleton (and module) CUs.
- if (CU->getDWOId())
- continue;
+ // Collect the relevant data into a large constant data structure that we can
+ // walk to write out everything.
+ StructType *StartFileCallArgsTy = StructType::create(
+ {Builder.getInt8PtrTy(), Builder.getInt8PtrTy(), Builder.getInt32Ty()});
+ StructType *EmitFunctionCallArgsTy = StructType::create(
+ {Builder.getInt32Ty(), Builder.getInt8PtrTy(), Builder.getInt32Ty(),
+ Builder.getInt8Ty(), Builder.getInt32Ty()});
+ StructType *EmitArcsCallArgsTy = StructType::create(
+ {Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()});
+ StructType *FileInfoTy =
+ StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(),
+ EmitFunctionCallArgsTy->getPointerTo(),
+ EmitArcsCallArgsTy->getPointerTo()});
+
+ Constant *Zero32 = Builder.getInt32(0);
+ // Build an explicit array of two zeros for use in ConstantExpr GEP building.
+ Constant *TwoZero32s[] = {Zero32, Zero32};
+
+ SmallVector<Constant *, 8> FileInfos;
+ for (int i : llvm::seq<int>(0, CUNodes->getNumOperands())) {
+ auto *CU = cast<DICompileUnit>(CUNodes->getOperand(i));
- std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
- uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
- Builder.CreateCall(StartFile,
- {Builder.CreateGlobalStringPtr(FilenameGcda),
- Builder.CreateGlobalStringPtr(ReversedVersion),
- Builder.getInt32(CfgChecksum)});
- for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) {
- auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second);
- uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
- Builder.CreateCall(
- EmitFunction,
- {Builder.getInt32(j),
- Options.FunctionNamesInData
- ? Builder.CreateGlobalStringPtr(getFunctionName(SP))
- : Constant::getNullValue(Builder.getInt8PtrTy()),
- Builder.getInt32(FuncChecksum),
- Builder.getInt8(Options.UseCfgChecksum),
- Builder.getInt32(CfgChecksum)});
-
- GlobalVariable *GV = CountersBySP[j].first;
- unsigned Arcs =
- cast<ArrayType>(GV->getValueType())->getNumElements();
- Builder.CreateCall(EmitArcs, {Builder.getInt32(Arcs),
- Builder.CreateConstGEP2_64(GV, 0, 0)});
- }
- Builder.CreateCall(SummaryInfo, {});
- Builder.CreateCall(EndFile, {});
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
+ std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
+ uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
+ auto *StartFileCallArgs = ConstantStruct::get(
+ StartFileCallArgsTy, {Builder.CreateGlobalStringPtr(FilenameGcda),
+ Builder.CreateGlobalStringPtr(ReversedVersion),
+ Builder.getInt32(CfgChecksum)});
+
+ SmallVector<Constant *, 8> EmitFunctionCallArgsArray;
+ SmallVector<Constant *, 8> EmitArcsCallArgsArray;
+ for (int j : llvm::seq<int>(0, CountersBySP.size())) {
+ auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second);
+ uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
+ EmitFunctionCallArgsArray.push_back(ConstantStruct::get(
+ EmitFunctionCallArgsTy,
+ {Builder.getInt32(j),
+ Options.FunctionNamesInData
+ ? Builder.CreateGlobalStringPtr(getFunctionName(SP))
+ : Constant::getNullValue(Builder.getInt8PtrTy()),
+ Builder.getInt32(FuncChecksum),
+ Builder.getInt8(Options.UseCfgChecksum),
+ Builder.getInt32(CfgChecksum)}));
+
+ GlobalVariable *GV = CountersBySP[j].first;
+ unsigned Arcs = cast<ArrayType>(GV->getValueType())->getNumElements();
+ EmitArcsCallArgsArray.push_back(ConstantStruct::get(
+ EmitArcsCallArgsTy,
+ {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr(
+ GV->getValueType(), GV, TwoZero32s)}));
}
+ // Create global arrays for the two emit calls.
+ int CountersSize = CountersBySP.size();
+ assert(CountersSize == (int)EmitFunctionCallArgsArray.size() &&
+ "Mismatched array size!");
+ assert(CountersSize == (int)EmitArcsCallArgsArray.size() &&
+ "Mismatched array size!");
+ auto *EmitFunctionCallArgsArrayTy =
+ ArrayType::get(EmitFunctionCallArgsTy, CountersSize);
+ auto *EmitFunctionCallArgsArrayGV = new GlobalVariable(
+ *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i));
+ auto *EmitArcsCallArgsArrayTy =
+ ArrayType::get(EmitArcsCallArgsTy, CountersSize);
+ EmitFunctionCallArgsArrayGV->setUnnamedAddr(
+ GlobalValue::UnnamedAddr::Global);
+ auto *EmitArcsCallArgsArrayGV = new GlobalVariable(
+ *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i));
+ EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ FileInfos.push_back(ConstantStruct::get(
+ FileInfoTy,
+ {StartFileCallArgs, Builder.getInt32(CountersSize),
+ ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArrayGV,
+ TwoZero32s),
+ ConstantExpr::getInBoundsGetElementPtr(
+ EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)}));
}
+ // If we didn't find anything to actually emit, bail on out.
+ if (FileInfos.empty()) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
+
+ // To simplify code, we cap the number of file infos we write out to fit
+ // easily in a 32-bit signed integer. This gives consistent behavior between
+ // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit
+ // operations on 32-bit systems. It also seems unreasonable to try to handle
+ // more than 2 billion files.
+ if ((int64_t)FileInfos.size() > (int64_t)INT_MAX)
+ FileInfos.resize(INT_MAX);
+
+ // Create a global for the entire data structure so we can walk it more
+ // easily.
+ auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size());
+ auto *FileInfoArrayGV = new GlobalVariable(
+ *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage,
+ ConstantArray::get(FileInfoArrayTy, FileInfos),
+ "__llvm_internal_gcov_emit_file_info");
+ FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ // Create the CFG for walking this data structure.
+ auto *FileLoopHeader =
+ BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF);
+ auto *CounterLoopHeader =
+ BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF);
+ auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF);
+ auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF);
+
+ // We always have at least one file, so just branch to the header.
+ Builder.CreateBr(FileLoopHeader);
+
+ // The index into the files structure is our loop induction variable.
+ Builder.SetInsertPoint(FileLoopHeader);
+ PHINode *IV =
+ Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2);
+ IV->addIncoming(Builder.getInt32(0), BB);
+ auto *FileInfoPtr =
+ Builder.CreateInBoundsGEP(FileInfoArrayGV, {Builder.getInt32(0), IV});
+ auto *StartFileCallArgsPtr = Builder.CreateStructGEP(FileInfoPtr, 0);
+ auto *StartFileCall = Builder.CreateCall(
+ StartFile,
+ {Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 0)),
+ Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 1)),
+ Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 2))});
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ StartFileCall->addParamAttr(2, AK);
+ auto *NumCounters =
+ Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 1));
+ auto *EmitFunctionCallArgsArray =
+ Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 2));
+ auto *EmitArcsCallArgsArray =
+ Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 3));
+ auto *EnterCounterLoopCond =
+ Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters);
+ Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch);
+
+ Builder.SetInsertPoint(CounterLoopHeader);
+ auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2);
+ JV->addIncoming(Builder.getInt32(0), FileLoopHeader);
+ auto *EmitFunctionCallArgsPtr =
+ Builder.CreateInBoundsGEP(EmitFunctionCallArgsArray, {JV});
+ auto *EmitFunctionCall = Builder.CreateCall(
+ EmitFunction,
+ {Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 0)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 1)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 2)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 3)),
+ Builder.CreateLoad(
+ Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 4))});
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ EmitFunctionCall->addParamAttr(0, AK);
+ EmitFunctionCall->addParamAttr(2, AK);
+ EmitFunctionCall->addParamAttr(3, AK);
+ EmitFunctionCall->addParamAttr(4, AK);
+ }
+ auto *EmitArcsCallArgsPtr =
+ Builder.CreateInBoundsGEP(EmitArcsCallArgsArray, {JV});
+ auto *EmitArcsCall = Builder.CreateCall(
+ EmitArcs,
+ {Builder.CreateLoad(Builder.CreateStructGEP(EmitArcsCallArgsPtr, 0)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitArcsCallArgsPtr, 1))});
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ EmitArcsCall->addParamAttr(0, AK);
+ auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1));
+ auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters);
+ Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, FileLoopLatch);
+ JV->addIncoming(NextJV, CounterLoopHeader);
+
+ Builder.SetInsertPoint(FileLoopLatch);
+ Builder.CreateCall(SummaryInfo, {});
+ Builder.CreateCall(EndFile, {});
+ auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1));
+ auto *FileLoopCond =
+ Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size()));
+ Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB);
+ IV->addIncoming(NextIV, FileLoopLatch);
+
+ Builder.SetInsertPoint(ExitBB);
Builder.CreateRetVoid();
+
return WriteoutF;
}
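The writeout change above replaces one straight-line run of calls per compile unit with constant tables plus two nested loops, which keeps the emitted function small even for very large modules. At the source level, the function the pass now builds behaves roughly like the sketch below; the struct layouts mirror StartFileCallArgsTy, EmitFunctionCallArgsTy, EmitArcsCallArgsTy and FileInfoTy, while the llvm_gcda_* prototypes are simplified and the remaining names are illustrative. The separate getExtAttrForI32Param hunks only add zero/sign-extension attributes to the i32 arguments of these declarations on targets whose ABI requires extending i32 parameters.

#include <cstdint>

void llvm_gcda_start_file(const char *Filename, const char *Version, uint32_t CfgChecksum);
void llvm_gcda_emit_function(uint32_t Ident, const char *Name, uint32_t FuncChecksum,
                             uint8_t UseCfgChecksum, uint32_t CfgChecksum);
void llvm_gcda_emit_arcs(uint32_t NumCounters, uint64_t *Counters);
void llvm_gcda_summary_info(void);
void llvm_gcda_end_file(void);

struct StartFileArgs    { const char *Filename; const char *Version; uint32_t CfgChecksum; };
struct EmitFunctionArgs { uint32_t Ident; const char *Name; uint32_t FuncChecksum;
                          uint8_t UseCfgChecksum; uint32_t CfgChecksum; };
struct EmitArcsArgs     { uint32_t NumCounters; uint64_t *Counters; };
struct FileInfo         { StartFileArgs Start; int32_t NumCounters;
                          const EmitFunctionArgs *Functions;
                          const EmitArcsArgs *Arcs; };

// Corresponds to file.loop.header, counter.loop.header and file.loop.latch above.
void writeout(const FileInfo *Files, int32_t NumFiles) {
  for (int32_t I = 0; I < NumFiles; ++I) {
    llvm_gcda_start_file(Files[I].Start.Filename, Files[I].Start.Version,
                         Files[I].Start.CfgChecksum);
    for (int32_t J = 0; J < Files[I].NumCounters; ++J) {
      const EmitFunctionArgs *F = &Files[I].Functions[J];
      llvm_gcda_emit_function(F->Ident, F->Name, F->FuncChecksum,
                              F->UseCfgChecksum, F->CfgChecksum);
      llvm_gcda_emit_arcs(Files[I].Arcs[J].NumCounters, Files[I].Arcs[J].Counters);
    }
    llvm_gcda_summary_info();
    llvm_gcda_end_file();
  }
}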
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 8e2833d22032..d62598bb5d4f 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -22,10 +22,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/IR/Function.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstVisitor.h"
@@ -34,6 +31,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -41,8 +39,11 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
using namespace llvm;
@@ -51,10 +52,15 @@ using namespace llvm;
static const char *const kHwasanModuleCtorName = "hwasan.module_ctor";
static const char *const kHwasanInitName = "__hwasan_init";
+static const char *const kHwasanShadowMemoryDynamicAddress =
+ "__hwasan_shadow_memory_dynamic_address";
+
// Accesses sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
-static const size_t kShadowScale = 4;
+static const size_t kDefaultShadowScale = 4;
+static const uint64_t kDynamicShadowSentinel =
+ std::numeric_limits<uint64_t>::max();
static const unsigned kPointerTagShift = 56;
static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
@@ -85,17 +91,57 @@ static cl::opt<bool> ClRecover(
cl::desc("Enable recovery mode (continue-after-error)."),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
+ cl::desc("instrument stack (allocas)"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClUARRetagToZero(
+ "hwasan-uar-retag-to-zero",
+ cl::desc("Clear alloca tags before returning from the function to allow "
+ "non-instrumented and instrumented function calls mix. When set "
+ "to false, allocas are retagged before returning from the "
+ "function to detect use after return."),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClGenerateTagsWithCalls(
+ "hwasan-generate-tags-with-calls",
+ cl::desc("generate new tags with runtime library calls"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<int> ClMatchAllTag(
+ "hwasan-match-all-tag",
+ cl::desc("don't report bad accesses via pointers with this tag"),
+ cl::Hidden, cl::init(-1));
+
+static cl::opt<bool> ClEnableKhwasan(
+ "hwasan-kernel",
+ cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+// These flags allow changing the shadow mapping and control how shadow memory
+// is accessed. The shadow mapping looks like:
+// Shadow = (Mem >> scale) + offset
+
+static cl::opt<unsigned long long> ClMappingOffset(
+ "hwasan-mapping-offset",
+ cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"), cl::Hidden,
+ cl::init(0));
+
namespace {
-/// \brief An instrumentation pass implementing detection of addressability bugs
+/// An instrumentation pass implementing detection of addressability bugs
/// using tagged pointers.
class HWAddressSanitizer : public FunctionPass {
public:
// Pass identification, replacement for typeid.
static char ID;
- HWAddressSanitizer(bool Recover = false)
- : FunctionPass(ID), Recover(Recover || ClRecover) {}
+ explicit HWAddressSanitizer(bool CompileKernel = false, bool Recover = false)
+ : FunctionPass(ID) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+ this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ?
+ ClEnableKhwasan : CompileKernel;
+ }
StringRef getPassName() const override { return "HWAddressSanitizer"; }
@@ -103,6 +149,11 @@ public:
bool doInitialization(Module &M) override;
void initializeCallbacks(Module &M);
+
+ void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
+
+ void untagPointerOperand(Instruction *I, Value *Addr);
+ Value *memToShadow(Value *Shadow, Type *Ty, IRBuilder<> &IRB);
void instrumentMemAccessInline(Value *PtrLong, bool IsWrite,
unsigned AccessSizeIndex,
Instruction *InsertBefore);
@@ -111,16 +162,54 @@ public:
uint64_t *TypeSize, unsigned *Alignment,
Value **MaybeMask);
+ bool isInterestingAlloca(const AllocaInst &AI);
+ bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag);
+ Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
+ Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
+ bool instrumentStack(SmallVectorImpl<AllocaInst *> &Allocas,
+ SmallVectorImpl<Instruction *> &RetVec);
+ Value *getNextTagWithCall(IRBuilder<> &IRB);
+ Value *getStackBaseTag(IRBuilder<> &IRB);
+ Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI,
+ unsigned AllocaNo);
+ Value *getUARTag(IRBuilder<> &IRB, Value *StackTag);
+
private:
LLVMContext *C;
+ Triple TargetTriple;
+
+ /// This struct defines the shadow mapping using the rule:
+ /// shadow = (mem >> Scale) + Offset.
+ /// If InGlobal is true, then
+ /// extern char __hwasan_shadow[];
+ /// shadow = (mem >> Scale) + &__hwasan_shadow
+ struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool InGlobal;
+
+ void init(Triple &TargetTriple);
+ unsigned getAllocaAlignment() const { return 1U << Scale; }
+ };
+ ShadowMapping Mapping;
+
Type *IntptrTy;
+ Type *Int8Ty;
+ bool CompileKernel;
bool Recover;
Function *HwasanCtorFunction;
Function *HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
Function *HwasanMemoryAccessCallbackSized[2];
+
+ Function *HwasanTagMemoryFunc;
+ Function *HwasanGenerateTagFunc;
+
+ Constant *ShadowGlobal;
+
+ Value *LocalDynamicShadow = nullptr;
};
} // end anonymous namespace
@@ -129,34 +218,44 @@ char HWAddressSanitizer::ID = 0;
INITIALIZE_PASS_BEGIN(
HWAddressSanitizer, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false)
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
INITIALIZE_PASS_END(
HWAddressSanitizer, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false)
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
-FunctionPass *llvm::createHWAddressSanitizerPass(bool Recover) {
- return new HWAddressSanitizer(Recover);
+FunctionPass *llvm::createHWAddressSanitizerPass(bool CompileKernel,
+ bool Recover) {
+ assert(!CompileKernel || Recover);
+ return new HWAddressSanitizer(CompileKernel, Recover);
}
-/// \brief Module-level initialization.
+/// Module-level initialization.
///
/// inserts a call to __hwasan_init to the module's constructor list.
bool HWAddressSanitizer::doInitialization(Module &M) {
- DEBUG(dbgs() << "Init " << M.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
auto &DL = M.getDataLayout();
- Triple TargetTriple(M.getTargetTriple());
+ TargetTriple = Triple(M.getTargetTriple());
+
+ Mapping.init(TargetTriple);
C = &(M.getContext());
IRBuilder<> IRB(*C);
IntptrTy = IRB.getIntPtrTy(DL);
-
- std::tie(HwasanCtorFunction, std::ignore) =
- createSanitizerCtorAndInitFunctions(M, kHwasanModuleCtorName,
- kHwasanInitName,
- /*InitArgTypes=*/{},
- /*InitArgs=*/{});
- appendToGlobalCtors(M, HwasanCtorFunction, 0);
+ Int8Ty = IRB.getInt8Ty();
+
+ HwasanCtorFunction = nullptr;
+ if (!CompileKernel) {
+ std::tie(HwasanCtorFunction, std::ignore) =
+ createSanitizerCtorAndInitFunctions(M, kHwasanModuleCtorName,
+ kHwasanInitName,
+ /*InitArgTypes=*/{},
+ /*InitArgs=*/{});
+ appendToGlobalCtors(M, HwasanCtorFunction, 0);
+ }
return true;
}
@@ -168,7 +267,7 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {
HwasanMemoryAccessCallbackSized[AccessIsWrite] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + TypeStr + EndingStr,
+ ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr,
FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false)));
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
@@ -180,16 +279,50 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {
FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false)));
}
}
+
+ HwasanTagMemoryFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__hwasan_tag_memory", IRB.getVoidTy(), IntptrTy, Int8Ty, IntptrTy));
+ HwasanGenerateTagFunc = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty));
+
+ if (Mapping.InGlobal)
+ ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow",
+ ArrayType::get(IRB.getInt8Ty(), 0));
+}
+
+void HWAddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
+ // Generate code only when dynamic addressing is needed.
+ if (Mapping.Offset != kDynamicShadowSentinel)
+ return;
+
+ IRBuilder<> IRB(&F.front().front());
+ if (Mapping.InGlobal) {
+ // An empty inline asm with input reg == output reg.
+ // An opaque pointer-to-int cast, basically.
+ InlineAsm *Asm = InlineAsm::get(
+ FunctionType::get(IntptrTy, {ShadowGlobal->getType()}, false),
+ StringRef(""), StringRef("=r,0"),
+ /*hasSideEffects=*/false);
+ LocalDynamicShadow = IRB.CreateCall(Asm, {ShadowGlobal}, ".hwasan.shadow");
+ } else {
+ Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+ kHwasanShadowMemoryDynamicAddress, IntptrTy);
+ LocalDynamicShadow = IRB.CreateLoad(GlobalDynamicAddress);
+ }
}
Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
- bool *IsWrite,
- uint64_t *TypeSize,
- unsigned *Alignment,
- Value **MaybeMask) {
+ bool *IsWrite,
+ uint64_t *TypeSize,
+ unsigned *Alignment,
+ Value **MaybeMask) {
// Skip memory accesses inserted by another instrumentation.
if (I->getMetadata("nosanitize")) return nullptr;
+ // Do not instrument the load fetching the dynamic shadow address.
+ if (LocalDynamicShadow == I)
+ return nullptr;
+
Value *PtrOperand = nullptr;
const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
@@ -219,7 +352,7 @@ Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
}
if (PtrOperand) {
- // Do not instrument acesses from different address spaces; we cannot deal
+ // Do not instrument accesses from different address spaces; we cannot deal
// with them.
Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
if (PtrTy->getPointerAddressSpace() != 0)
@@ -236,41 +369,103 @@ Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
return PtrOperand;
}
+static unsigned getPointerOperandIndex(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperandIndex();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperandIndex();
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I))
+ return RMW->getPointerOperandIndex();
+ if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I))
+ return XCHG->getPointerOperandIndex();
+ report_fatal_error("Unexpected instruction");
+ return -1;
+}
+
static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
size_t Res = countTrailingZeros(TypeSize / 8);
assert(Res < kNumberOfAccessSizes);
return Res;
}
+void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) {
+ if (TargetTriple.isAArch64())
+ return;
+
+ IRBuilder<> IRB(I);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *UntaggedPtr =
+ IRB.CreateIntToPtr(untagPointer(IRB, AddrLong), Addr->getType());
+ I->setOperand(getPointerOperandIndex(I), UntaggedPtr);
+}
+
+Value *HWAddressSanitizer::memToShadow(Value *Mem, Type *Ty, IRBuilder<> &IRB) {
+ // Mem >> Scale
+ Value *Shadow = IRB.CreateLShr(Mem, Mapping.Scale);
+ if (Mapping.Offset == 0)
+ return Shadow;
+ // (Mem >> Scale) + Offset
+ Value *ShadowBase;
+ if (LocalDynamicShadow)
+ ShadowBase = LocalDynamicShadow;
+ else
+ ShadowBase = ConstantInt::get(Ty, Mapping.Offset);
+ return IRB.CreateAdd(Shadow, ShadowBase);
+}
+
void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite,
unsigned AccessSizeIndex,
Instruction *InsertBefore) {
IRBuilder<> IRB(InsertBefore);
- Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift), IRB.getInt8Ty());
- Value *AddrLong =
- IRB.CreateAnd(PtrLong, ConstantInt::get(PtrLong->getType(),
- ~(0xFFULL << kPointerTagShift)));
- Value *ShadowLong = IRB.CreateLShr(AddrLong, kShadowScale);
- Value *MemTag = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowLong, IRB.getInt8PtrTy()));
+ Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift),
+ IRB.getInt8Ty());
+ Value *AddrLong = untagPointer(IRB, PtrLong);
+ Value *ShadowLong = memToShadow(AddrLong, PtrLong->getType(), IRB);
+ Value *MemTag =
+ IRB.CreateLoad(IRB.CreateIntToPtr(ShadowLong, IRB.getInt8PtrTy()));
Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
+ int matchAllTag = ClMatchAllTag.getNumOccurrences() > 0 ?
+ ClMatchAllTag : (CompileKernel ? 0xFF : -1);
+ if (matchAllTag != -1) {
+ Value *TagNotIgnored = IRB.CreateICmpNE(PtrTag,
+ ConstantInt::get(PtrTag->getType(), matchAllTag));
+ TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
+ }
+
TerminatorInst *CheckTerm =
SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,
MDBuilder(*C).createBranchWeights(1, 100000));
IRB.SetInsertPoint(CheckTerm);
- // The signal handler will find the data address in x0.
- InlineAsm *Asm = InlineAsm::get(
- FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
- "hlt #" +
- itostr(0x100 + Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex),
- "{x0}",
- /*hasSideEffects=*/true);
+ const int64_t AccessInfo = Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex;
+ InlineAsm *Asm;
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ // The signal handler will find the data address in rdi.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+ "int3\nnopl " + itostr(0x40 + AccessInfo) + "(%rax)",
+ "{rdi}",
+ /*hasSideEffects=*/true);
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ // The signal handler will find the data address in x0.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+ "brk #" + itostr(0x900 + AccessInfo),
+ "{x0}",
+ /*hasSideEffects=*/true);
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
IRB.CreateCall(Asm, PtrLong);
}
bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
- DEBUG(dbgs() << "Instrumenting: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Instrumenting: " << *I << "\n");
bool IsWrite = false;
unsigned Alignment = 0;
uint64_t TypeSize = 0;
@@ -288,7 +483,7 @@ bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
if (isPowerOf2_64(TypeSize) &&
(TypeSize / 8 <= (1UL << (kNumberOfAccessSizes - 1))) &&
- (Alignment >= (1UL << kShadowScale) || Alignment == 0 ||
+ (Alignment >= (1UL << Mapping.Scale) || Alignment == 0 ||
Alignment >= TypeSize / 8)) {
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
if (ClInstrumentWithCalls) {
@@ -301,10 +496,197 @@ bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
IRB.CreateCall(HwasanMemoryAccessCallbackSized[IsWrite],
{AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8)});
}
+ untagPointerOperand(I, Addr);
return true;
}
+static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
+ uint64_t ArraySize = 1;
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
+ Type *Ty = AI.getAllocatedType();
+ uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes * ArraySize;
+}
+
+bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
+ Value *Tag) {
+ size_t Size = (getAllocaSizeInBytes(*AI) + Mapping.getAllocaAlignment() - 1) &
+ ~(Mapping.getAllocaAlignment() - 1);
+
+ Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
+ if (ClInstrumentWithCalls) {
+ IRB.CreateCall(HwasanTagMemoryFunc,
+ {IRB.CreatePointerCast(AI, IntptrTy), JustTag,
+ ConstantInt::get(IntptrTy, Size)});
+ } else {
+ size_t ShadowSize = Size >> Mapping.Scale;
+ Value *ShadowPtr = IRB.CreateIntToPtr(
+ memToShadow(IRB.CreatePointerCast(AI, IntptrTy), AI->getType(), IRB),
+ IRB.getInt8PtrTy());
+ // If this memset is not inlined, it will be intercepted in the hwasan
+ // runtime library. That's OK, because the interceptor skips the checks if
+ // the address is in the shadow region.
+ // FIXME: the interceptor is not as fast as real memset. Consider lowering
+ // llvm.memset right here into either a sequence of stores, or a call to
+ // hwasan_tag_memory.
+ IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
+ }
+ return true;
+}
+
+static unsigned RetagMask(unsigned AllocaNo) {
+ // A list of 8-bit numbers that have at most one run of non-zero bits.
+ // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for these
+ // masks.
+ // The list does not include the value 255, which is used for UAR.
+ static unsigned FastMasks[] = {
+ 0, 1, 2, 3, 4, 6, 7, 8, 12, 14, 15, 16, 24,
+ 28, 30, 31, 32, 48, 56, 60, 62, 63, 64, 96, 112, 120,
+ 124, 126, 127, 128, 192, 224, 240, 248, 252, 254};
+ return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))];
+}
+
+Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) {
+ return IRB.CreateZExt(IRB.CreateCall(HwasanGenerateTagFunc), IntptrTy);
+}
+
+Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
+ if (ClGenerateTagsWithCalls)
+ return nullptr;
+ // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
+ // first).
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ auto GetStackPointerFn =
+ Intrinsic::getDeclaration(M, Intrinsic::frameaddress);
+ Value *StackPointer = IRB.CreateCall(
+ GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())});
+
+ // Extract some entropy from the stack pointer for the tags.
+ // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
+ // between functions).
+ Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy);
+ Value *StackTag =
+ IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20),
+ "hwasan.stack.base.tag");
+ return StackTag;
+}
+
+Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
+ AllocaInst *AI, unsigned AllocaNo) {
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag,
+ ConstantInt::get(IntptrTy, RetagMask(AllocaNo)));
+}
+
+Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) {
+ if (ClUARRetagToZero)
+ return ConstantInt::get(IntptrTy, 0);
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
+}
+
+// Add a tag to an address.
+Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty,
+ Value *PtrLong, Value *Tag) {
+ Value *TaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ Value *ShiftedTag = IRB.CreateOr(
+ IRB.CreateShl(Tag, kPointerTagShift),
+ ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1));
+ TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag);
+ } else {
+ // Userspace can simply do OR (tag << 56);
+ Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift);
+ TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag);
+ }
+ return IRB.CreateIntToPtr(TaggedPtrLong, Ty);
+}
+
+// Remove tag from an address.
+Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
+ Value *UntaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ UntaggedPtrLong = IRB.CreateOr(PtrLong,
+ ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift));
+ } else {
+ // Userspace addresses have 0x00.
+ UntaggedPtrLong = IRB.CreateAnd(PtrLong,
+ ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift)));
+ }
+ return UntaggedPtrLong;
+}
+
+bool HWAddressSanitizer::instrumentStack(
+ SmallVectorImpl<AllocaInst *> &Allocas,
+ SmallVectorImpl<Instruction *> &RetVec) {
+ Function *F = Allocas[0]->getParent()->getParent();
+ Instruction *InsertPt = &*F->getEntryBlock().begin();
+ IRBuilder<> IRB(InsertPt);
+
+ Value *StackTag = getStackBaseTag(IRB);
+
+ // Ideally, we want to calculate a tagged stack base pointer and rewrite all
+ // alloca addresses using that. Unfortunately, offsets are not known yet
+ // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
+ // temp, shift-OR it into each alloca address and xor with the retag mask.
+ // This generates one extra instruction per alloca use.
+ for (unsigned N = 0; N < Allocas.size(); ++N) {
+ auto *AI = Allocas[N];
+ IRB.SetInsertPoint(AI->getNextNode());
+
+ // Replace uses of the alloca with tagged address.
+ Value *Tag = getAllocaTag(IRB, StackTag, AI, N);
+ Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
+ Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag);
+ std::string Name =
+ AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
+ Replacement->setName(Name + ".hwasan");
+
+ for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) {
+ Use &U = *UI++;
+ if (U.getUser() != AILong)
+ U.set(Replacement);
+ }
+
+ tagAlloca(IRB, AI, Tag);
+
+ for (auto RI : RetVec) {
+ IRB.SetInsertPoint(RI);
+
+ // Re-tag alloca memory with the special UAR tag.
+ Value *Tag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, Tag);
+ }
+ }
+
+ return true;
+}
+
+bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+ return (AI.getAllocatedType()->isSized() &&
+ // FIXME: instrument dynamic allocas, too
+ AI.isStaticAlloca() &&
+ // alloca() may be called with 0 size, ignore it.
+ getAllocaSizeInBytes(AI) > 0 &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ !isAllocaPromotable(&AI) &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
+}
+
bool HWAddressSanitizer::runOnFunction(Function &F) {
if (&F == HwasanCtorFunction)
return false;
@@ -312,14 +694,35 @@ bool HWAddressSanitizer::runOnFunction(Function &F) {
if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
return false;
- DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
initializeCallbacks(*F.getParent());
+ assert(!LocalDynamicShadow);
+ maybeInsertDynamicShadowAtFunctionEntry(F);
+
bool Changed = false;
SmallVector<Instruction*, 16> ToInstrument;
+ SmallVector<AllocaInst*, 8> AllocasToInstrument;
+ SmallVector<Instruction*, 8> RetVec;
for (auto &BB : F) {
for (auto &Inst : BB) {
+ if (ClInstrumentStack)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+ // Realign all allocas. We don't want small uninteresting allocas to
+ // hide in instrumented alloca's padding.
+ if (AI->getAlignment() < Mapping.getAllocaAlignment())
+ AI->setAlignment(Mapping.getAllocaAlignment());
+ // Instrument some of them.
+ if (isInterestingAlloca(*AI))
+ AllocasToInstrument.push_back(AI);
+ continue;
+ }
+
+ if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
+ isa<CleanupReturnInst>(Inst))
+ RetVec.push_back(&Inst);
+
Value *MaybeMask = nullptr;
bool IsWrite;
unsigned Alignment;
@@ -331,8 +734,30 @@ bool HWAddressSanitizer::runOnFunction(Function &F) {
}
}
+ if (!AllocasToInstrument.empty())
+ Changed |= instrumentStack(AllocasToInstrument, RetVec);
+
for (auto Inst : ToInstrument)
Changed |= instrumentMemAccess(Inst);
+ LocalDynamicShadow = nullptr;
+
return Changed;
}
+
+void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) {
+ const bool IsAndroid = TargetTriple.isAndroid();
+ const bool IsAndroidWithIfuncSupport =
+ IsAndroid && !TargetTriple.isAndroidVersionLT(21);
+
+ Scale = kDefaultShadowScale;
+
+ if (ClEnableKhwasan || ClInstrumentWithCalls || !IsAndroidWithIfuncSupport)
+ Offset = 0;
+ else
+ Offset = kDynamicShadowSentinel;
+ if (ClMappingOffset.getNumOccurrences() > 0)
+ Offset = ClMappingOffset;
+
+ InGlobal = IsAndroidWithIfuncSupport;
+}
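Summarizing the arithmetic introduced above: HWASan keeps an 8-bit tag in the top byte of each pointer, mirrors one tag byte per 2^Scale-byte granule of memory in shadow, and the inlined check traps (brk on AArch64, int3 on x86_64) when the pointer tag and shadow tag disagree. A compact sketch of the userspace tag/untag/check logic, assuming Scale == 4 and a statically known shadow Offset; the helper names echo the pass's tagPointer/untagPointer but this is only a sketch:

#include <cstdint>

static constexpr unsigned kTagShift = 56;                  // kPointerTagShift
static constexpr uint64_t kTagMask  = 0xFFull << kTagShift;

inline uint64_t tagPtr(uint64_t PtrLong, uint8_t Tag) {
  return PtrLong | (uint64_t(Tag) << kTagShift);           // userspace: plain OR
}

inline uint64_t untagPtr(uint64_t PtrLong) {
  return PtrLong & ~kTagMask;                               // userspace: clear top byte
}

// The inlined check: compare the pointer's tag with the tag stored in shadow
// for the accessed granule; on mismatch the pass branches to the trap.
inline bool tagMismatch(uint64_t PtrLong, uint64_t ShadowOffset) {
  uint8_t  PtrTag = uint8_t(PtrLong >> kTagShift);
  uint64_t Shadow = (untagPtr(PtrLong) >> 4) + ShadowOffset;  // Scale == 4
  uint8_t  MemTag = *reinterpret_cast<uint8_t *>(Shadow);
  return PtrTag != MemTag;
}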
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 49b8a67a6c14..27fb0e4393af 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -45,7 +45,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include <cassert>
@@ -223,12 +223,12 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
uint64_t TotalCount, uint32_t NumCandidates) {
std::vector<PromotionCandidate> Ret;
- DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
- << " Num_targets: " << ValueDataRef.size()
- << " Num_candidates: " << NumCandidates << "\n");
+ LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
+ << " Num_targets: " << ValueDataRef.size()
+ << " Num_candidates: " << NumCandidates << "\n");
NumOfPGOICallsites++;
if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
- DEBUG(dbgs() << " Skip: User options.\n");
+ LLVM_DEBUG(dbgs() << " Skip: User options.\n");
return Ret;
}
@@ -236,11 +236,11 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
uint64_t Count = ValueDataRef[I].Count;
assert(Count <= TotalCount);
uint64_t Target = ValueDataRef[I].Value;
- DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
- << " Target_func: " << Target << "\n");
+ LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << Target << "\n");
if (ICPInvokeOnly && dyn_cast<CallInst>(Inst)) {
- DEBUG(dbgs() << " Not promote: User options.\n");
+ LLVM_DEBUG(dbgs() << " Not promote: User options.\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst)
<< " Not promote: User options";
@@ -248,7 +248,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
break;
}
if (ICPCallOnly && dyn_cast<InvokeInst>(Inst)) {
- DEBUG(dbgs() << " Not promote: User option.\n");
+ LLVM_DEBUG(dbgs() << " Not promote: User option.\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst)
<< " Not promote: User options";
@@ -256,7 +256,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
break;
}
if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
+ LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", Inst)
<< " Not promote: Cutoff reached";
@@ -266,7 +266,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
Function *TargetFunction = Symtab->getFunction(Target);
if (TargetFunction == nullptr) {
- DEBUG(dbgs() << " Not promote: Cannot find the target\n");
+ LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", Inst)
<< "Cannot promote indirect call: target not found";
@@ -387,7 +387,7 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
InstrProfSymtab Symtab;
if (Error E = Symtab.create(M, InLTO)) {
std::string SymtabFailure = toString(std::move(E));
- DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
+ LLVM_DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
(void)SymtabFailure;
return false;
}
@@ -412,12 +412,12 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO, *ORE);
bool FuncChanged = ICallPromotion.processFunction(PSI);
if (ICPDUMPAFTER && FuncChanged) {
- DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
Changed |= FuncChanged;
if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- DEBUG(dbgs() << " Stop: Cutoff reached.\n");
+ LLVM_DEBUG(dbgs() << " Stop: Cutoff reached.\n");
break;
}
}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 9b70f95480e4..22076f04d6ad 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/InstrProfiling.h"
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -271,8 +271,8 @@ public:
break;
}
- DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
- << L.getLoopDepth() << ")\n");
+ LLVM_DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
+ << L.getLoopDepth() << ")\n");
return Promoted != 0;
}
@@ -430,9 +430,24 @@ void InstrProfiling::promoteCounterLoadStores(Function *F) {
}
}
-bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
- bool MadeChange = false;
+/// Check if the module contains uses of any profiling intrinsics.
+static bool containsProfilingIntrinsics(Module &M) {
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
+ if (!F->use_empty())
+ return true;
+ return false;
+}
+bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
this->M = &M;
this->TLI = &TLI;
NamesVar = nullptr;
@@ -443,6 +458,15 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
MemOPSizeRangeLast);
TT = Triple(M.getTargetTriple());
+ // Emit the runtime hook even if no counters are present.
+ bool MadeChange = emitRuntimeHook();
+
+ // Improve compile time by avoiding linear scans when there is no work.
+ GlobalVariable *CoverageNamesVar =
+ M.getNamedGlobal(getCoverageUnusedNamesVarName());
+ if (!containsProfilingIntrinsics(M) && !CoverageNamesVar)
+ return MadeChange;
+
// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
// target value sites to enter it as field in the profile data variable.
@@ -464,8 +488,7 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
for (Function &F : M)
MadeChange |= lowerIntrinsics(&F);
- if (GlobalVariable *CoverageNamesVar =
- M.getNamedGlobal(getCoverageUnusedNamesVarName())) {
+ if (CoverageNamesVar) {
lowerCoverageData(CoverageNamesVar);
MadeChange = true;
}
@@ -476,7 +499,6 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
emitVNodes();
emitNameData();
emitRegistration();
- emitRuntimeHook();
emitUses();
emitInitialization();
return true;
@@ -669,6 +691,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Module &M) {
// Use linker script magic to get data/cnts/name start/end.
if (Triple(M.getTargetTriple()).isOSLinux() ||
Triple(M.getTargetTriple()).isOSFreeBSD() ||
+ Triple(M.getTargetTriple()).isOSFuchsia() ||
Triple(M.getTargetTriple()).isPS4CPU())
return false;
@@ -892,15 +915,15 @@ void InstrProfiling::emitRegistration() {
IRB.CreateRetVoid();
}
-void InstrProfiling::emitRuntimeHook() {
+bool InstrProfiling::emitRuntimeHook() {
// We expect the linker to be invoked with -u<hook_var> flag for linux,
// for which case there is no need to emit the user function.
if (Triple(M->getTargetTriple()).isOSLinux())
- return;
+ return false;
// If the module's provided its own runtime, we don't need to do anything.
if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
- return;
+ return false;
// Declare an external variable that will pull in the runtime initialization.
auto *Int32Ty = Type::getInt32Ty(M->getContext());
@@ -925,6 +948,7 @@ void InstrProfiling::emitRuntimeHook() {
// Mark the user variable as used so that it isn't stripped out.
UsedVars.push_back(User);
+ return true;
}
void InstrProfiling::emitUses() {
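For reference, the runtime hook whose emission is now reported as a change (so the pass returns true even when no counters are lowered) amounts to roughly the following at source level. The names __llvm_profile_runtime and __llvm_profile_runtime_user come from the profile runtime rather than this hunk, so treat them as an assumption; the point of the sketch is that a mere reference to the hook variable forces the profiling runtime into the link:

extern "C" int __llvm_profile_runtime;        // defined by the profiling runtime

extern "C" int __llvm_profile_runtime_user(void) {  // kept alive via llvm.used
  return __llvm_profile_runtime;              // the reference pulls the runtime in
}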
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b3c39b5b1665..4bcef6972786 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -101,6 +101,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -138,7 +139,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <cassert>
@@ -163,7 +163,7 @@ static const unsigned kRetvalTLSSize = 800;
// Accesses sizes are powers of two: 1, 2, 4, 8.
static const size_t kNumberOfAccessSizes = 4;
-/// \brief Track origins of uninitialized values.
+/// Track origins of uninitialized values.
///
/// Adds a section to MemorySanitizer report that points to the allocation
/// (stack or heap) the uninitialized bits came from originally.
@@ -199,6 +199,18 @@ static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
cl::desc("exact handling of relational integer ICmp"),
cl::Hidden, cl::init(false));
+// When compiling the Linux kernel, we sometimes see false positives related to
+// MSan being unable to understand that inline assembly calls may initialize
+// local variables.
+// This flag makes the compiler conservatively unpoison every memory location
+// passed into an assembly call. Note that this may cause false positives.
+// Because it's impossible to figure out the array sizes, we can only unpoison
+// the first sizeof(type) bytes for each type* pointer.
+static cl::opt<bool> ClHandleAsmConservative(
+ "msan-handle-asm-conservative",
+ cl::desc("conservative handling of inline assembly"), cl::Hidden,
+ cl::init(false));
+
// This flag controls whether we check the shadow of the address
// operand of load or store. Such bugs are very rare, since load from
// a garbage address typically results in SEGV, but still happen
@@ -234,6 +246,24 @@ static cl::opt<bool> ClWithComdat("msan-with-comdat",
cl::desc("Place MSan constructors in comdat sections"),
cl::Hidden, cl::init(false));
+// These options allow specifying custom memory map parameters
+// See MemoryMapParams for details.
+static cl::opt<unsigned long long> ClAndMask("msan-and-mask",
+ cl::desc("Define custom MSan AndMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<unsigned long long> ClXorMask("msan-xor-mask",
+ cl::desc("Define custom MSan XorMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<unsigned long long> ClShadowBase("msan-shadow-base",
+ cl::desc("Define custom MSan ShadowBase"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<unsigned long long> ClOriginBase("msan-origin-base",
+ cl::desc("Define custom MSan OriginBase"),
+ cl::Hidden, cl::init(0));
+
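These four flags override fields of the MemoryMapParams the pass otherwise selects per platform. As a rough sketch of how such parameters are applied when translating an application address to its shadow and origin addresses (an assumption based on the pass's usual mapping, not shown in this hunk; field names follow the flag descriptions above):

#include <cstdint>

struct MapParams { uint64_t AndMask, XorMask, ShadowBase, OriginBase; };

// Sketch (assumption): Shadow = ((Addr & ~AndMask) ^ XorMask) + ShadowBase,
// with origins using OriginBase and 4-byte alignment.
inline uint64_t appToShadow(uint64_t Addr, const MapParams &P) {
  uint64_t Off = Addr;
  if (P.AndMask) Off &= ~P.AndMask;
  if (P.XorMask) Off ^= P.XorMask;
  return Off + P.ShadowBase;
}

inline uint64_t appToOrigin(uint64_t Addr, const MapParams &P) {
  uint64_t Off = Addr;
  if (P.AndMask) Off &= ~P.AndMask;
  if (P.XorMask) Off ^= P.XorMask;
  return (Off + P.OriginBase) & ~3ull;
}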
static const char *const kMsanModuleCtorName = "msan.module_ctor";
static const char *const kMsanInitName = "__msan_init";
@@ -360,7 +390,7 @@ static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = {
namespace {
-/// \brief An instrumentation pass implementing detection of uninitialized
+/// An instrumentation pass implementing detection of uninitialized
/// reads.
///
/// MemorySanitizer: instrument the code in module to find
@@ -368,7 +398,7 @@ namespace {
class MemorySanitizer : public FunctionPass {
public:
// Pass identification, replacement for typeid.
- static char ID;
+ static char ID;
MemorySanitizer(int TrackOrigins = 0, bool Recover = false)
: FunctionPass(ID),
@@ -392,8 +422,9 @@ private:
friend struct VarArgPowerPC64Helper;
void initializeCallbacks(Module &M);
+ void createUserspaceApi(Module &M);
- /// \brief Track origins (allocation points) of uninitialized values.
+ /// Track origins (allocation points) of uninitialized values.
int TrackOrigins;
bool Recover;
@@ -401,60 +432,67 @@ private:
Type *IntptrTy;
Type *OriginTy;
- /// \brief Thread-local shadow storage for function parameters.
+ /// Thread-local shadow storage for function parameters.
GlobalVariable *ParamTLS;
- /// \brief Thread-local origin storage for function parameters.
+ /// Thread-local origin storage for function parameters.
GlobalVariable *ParamOriginTLS;
- /// \brief Thread-local shadow storage for function return value.
+ /// Thread-local shadow storage for function return value.
GlobalVariable *RetvalTLS;
- /// \brief Thread-local origin storage for function return value.
+ /// Thread-local origin storage for function return value.
GlobalVariable *RetvalOriginTLS;
- /// \brief Thread-local shadow storage for in-register va_arg function
+ /// Thread-local shadow storage for in-register va_arg function
/// parameters (x86_64-specific).
GlobalVariable *VAArgTLS;
- /// \brief Thread-local shadow storage for va_arg overflow area
+ /// Thread-local shadow storage for va_arg overflow area
/// (x86_64-specific).
GlobalVariable *VAArgOverflowSizeTLS;
- /// \brief Thread-local space used to pass origin value to the UMR reporting
+ /// Thread-local space used to pass origin value to the UMR reporting
/// function.
GlobalVariable *OriginTLS;
- /// \brief The run-time callback to print a warning.
- Value *WarningFn = nullptr;
+ /// Are the instrumentation callbacks set up?
+ bool CallbacksInitialized = false;
+
+ /// The run-time callback to print a warning.
+ Value *WarningFn;
// These arrays are indexed by log2(AccessSize).
Value *MaybeWarningFn[kNumberOfAccessSizes];
Value *MaybeStoreOriginFn[kNumberOfAccessSizes];
- /// \brief Run-time helper that generates a new origin value for a stack
+ /// Run-time helper that generates a new origin value for a stack
/// allocation.
Value *MsanSetAllocaOrigin4Fn;
- /// \brief Run-time helper that poisons stack on function entry.
+ /// Run-time helper that poisons stack on function entry.
Value *MsanPoisonStackFn;
- /// \brief Run-time helper that records a store (or any event) of an
+ /// Run-time helper that records a store (or any event) of an
/// uninitialized value and returns an updated origin id encoding this info.
Value *MsanChainOriginFn;
- /// \brief MSan runtime replacements for memmove, memcpy and memset.
+ /// MSan runtime replacements for memmove, memcpy and memset.
Value *MemmoveFn, *MemcpyFn, *MemsetFn;
- /// \brief Memory map parameters used in application-to-shadow calculation.
+ /// Memory map parameters used in application-to-shadow calculation.
const MemoryMapParams *MapParams;
+ /// Custom memory map parameters used when -msan-shadow-base or
+ /// -msan-origin-base is provided.
+ MemoryMapParams CustomMapParams;
+
MDNode *ColdCallWeights;
- /// \brief Branch weights for origin store.
+ /// Branch weights for origin store.
MDNode *OriginStoreWeights;
- /// \brief An empty volatile inline asm that prevents callback merge.
+ /// An empty volatile inline asm that prevents callback merge.
InlineAsm *EmptyAsm;
Function *MsanCtorFunction;
@@ -476,7 +514,7 @@ FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins, bool Recover) {
return new MemorySanitizer(TrackOrigins, Recover);
}
-/// \brief Create a non-const global initialized with the given string.
+/// Create a non-const global initialized with the given string.
///
/// Creates a writable global for Str so that we can pass it to the
/// run-time lib. Runtime uses first 4 bytes of the string to store the
@@ -488,12 +526,8 @@ static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
GlobalValue::PrivateLinkage, StrConst, "");
}
-/// \brief Insert extern declaration of runtime-provided functions and globals.
-void MemorySanitizer::initializeCallbacks(Module &M) {
- // Only do this once.
- if (WarningFn)
- return;
-
+/// Insert declarations for userspace-specific functions and globals.
+void MemorySanitizer::createUserspaceApi(Module &M) {
IRBuilder<> IRB(*C);
// Create the callback.
// FIXME: this function should have "Cold" calling conv,
@@ -502,6 +536,38 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
: "__msan_warning_noreturn";
WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
+ // Create the global TLS variables.
+ RetvalTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+
+ RetvalOriginTLS = new GlobalVariable(
+ M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
+
+ ParamTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+
+ ParamOriginTLS = new GlobalVariable(
+ M, ArrayType::get(OriginTy, kParamTLSSize / 4), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_param_origin_tls",
+ nullptr, GlobalVariable::InitialExecTLSModel);
+
+ VAArgTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ VAArgOverflowSizeTLS = new GlobalVariable(
+ M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_va_arg_overflow_size_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ OriginTLS = new GlobalVariable(
+ M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
+
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
AccessSizeIndex++) {
unsigned AccessSize = 1 << AccessSizeIndex;
@@ -522,6 +588,17 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
MsanPoisonStackFn =
M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
IRB.getInt8PtrTy(), IntptrTy);
+}
+
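A rough model of how the thread-local globals created above are used (scalar C++ standing in for the generated IR; the array size and local names are illustrative): the caller spills each argument's shadow into the parameter-TLS buffer at that argument's offset, the callee reads it back when it materializes argument shadows, and return-value shadow travels through the retval slot the same way.

    #include <cstdint>

    thread_local uint64_t ParamShadowTLS[100]; // stands in for __msan_param_tls
    thread_local uint64_t RetvalShadowTLS;     // stands in for __msan_retval_tls

    uint64_t callee(uint64_t x) {
      uint64_t ShadowOfX = ParamShadowTLS[0];  // written by the caller below
      RetvalShadowTLS = ShadowOfX;             // x + 1 is as (un)defined as x
      return x + 1;
    }

    uint64_t caller(uint64_t v, uint64_t ShadowOfV) {
      ParamShadowTLS[0] = ShadowOfV;           // pass the shadow out of band
      uint64_t r = callee(v);
      uint64_t ShadowOfR = RetvalShadowTLS;    // pick up the result's shadow
      (void)ShadowOfR;
      return r;
    }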
+/// Insert extern declaration of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+ // Only do this once.
+ if (CallbacksInitialized)
+ return;
+
+ IRBuilder<> IRB(*C);
+ // Initialize callbacks that are common for kernel and userspace
+ // instrumentation.
MsanChainOriginFn = M.getOrInsertFunction(
"__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
MemmoveFn = M.getOrInsertFunction(
@@ -533,98 +610,81 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
MemsetFn = M.getOrInsertFunction(
"__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
IntptrTy);
-
- // Create globals.
- RetvalTLS = new GlobalVariable(
- M, ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- RetvalOriginTLS = new GlobalVariable(
- M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr,
- "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
-
- ParamTLS = new GlobalVariable(
- M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- ParamOriginTLS = new GlobalVariable(
- M, ArrayType::get(OriginTy, kParamTLSSize / 4), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_param_origin_tls",
- nullptr, GlobalVariable::InitialExecTLSModel);
-
- VAArgTLS = new GlobalVariable(
- M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- VAArgOverflowSizeTLS = new GlobalVariable(
- M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
- "__msan_va_arg_overflow_size_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- OriginTLS = new GlobalVariable(
- M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
- "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
-
// We insert an empty inline asm after __msan_report* to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
/*hasSideEffects=*/true);
+
+ createUserspaceApi(M);
+ CallbacksInitialized = true;
}
-/// \brief Module-level initialization.
+/// Module-level initialization.
///
/// Inserts a call to __msan_init into the module's constructor list.
bool MemorySanitizer::doInitialization(Module &M) {
auto &DL = M.getDataLayout();
- Triple TargetTriple(M.getTargetTriple());
- switch (TargetTriple.getOS()) {
- case Triple::FreeBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = FreeBSD_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = FreeBSD_X86_MemoryMapParams.bits32;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::NetBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = NetBSD_X86_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::Linux:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = Linux_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = Linux_X86_MemoryMapParams.bits32;
- break;
- case Triple::mips64:
- case Triple::mips64el:
- MapParams = Linux_MIPS_MemoryMapParams.bits64;
- break;
- case Triple::ppc64:
- case Triple::ppc64le:
- MapParams = Linux_PowerPC_MemoryMapParams.bits64;
- break;
- case Triple::aarch64:
- case Triple::aarch64_be:
- MapParams = Linux_ARM_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- default:
- report_fatal_error("unsupported operating system");
+ bool ShadowPassed = ClShadowBase.getNumOccurrences() > 0;
+ bool OriginPassed = ClOriginBase.getNumOccurrences() > 0;
+ // Check the overrides first
+ if (ShadowPassed || OriginPassed) {
+ CustomMapParams.AndMask = ClAndMask;
+ CustomMapParams.XorMask = ClXorMask;
+ CustomMapParams.ShadowBase = ClShadowBase;
+ CustomMapParams.OriginBase = ClOriginBase;
+ MapParams = &CustomMapParams;
+ } else {
+ Triple TargetTriple(M.getTargetTriple());
+ switch (TargetTriple.getOS()) {
+ case Triple::FreeBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits32;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::NetBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = NetBSD_X86_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::Linux:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = Linux_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = Linux_X86_MemoryMapParams.bits32;
+ break;
+ case Triple::mips64:
+ case Triple::mips64el:
+ MapParams = Linux_MIPS_MemoryMapParams.bits64;
+ break;
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ MapParams = Linux_PowerPC_MemoryMapParams.bits64;
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ MapParams = Linux_ARM_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ default:
+ report_fatal_error("unsupported operating system");
+ }
}
C = &(M.getContext());
@@ -661,7 +721,7 @@ bool MemorySanitizer::doInitialization(Module &M) {
namespace {
-/// \brief A helper class that handles instrumentation of VarArg
+/// A helper class that handles instrumentation of VarArg
/// functions on a particular platform.
///
/// Implementations are expected to insert the instrumentation
@@ -672,16 +732,16 @@ namespace {
struct VarArgHelper {
virtual ~VarArgHelper() = default;
- /// \brief Visit a CallSite.
+ /// Visit a CallSite.
virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0;
- /// \brief Visit a va_start call.
+ /// Visit a va_start call.
virtual void visitVAStartInst(VAStartInst &I) = 0;
- /// \brief Visit a va_copy call.
+ /// Visit a va_copy call.
virtual void visitVACopyInst(VACopyInst &I) = 0;
- /// \brief Finalize function instrumentation.
+ /// Finalize function instrumentation.
///
/// This method is called after visiting all interesting (see above)
/// instructions in a function.
@@ -715,6 +775,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ValueMap<Value*, Value*> ShadowMap, OriginMap;
std::unique_ptr<VarArgHelper> VAHelper;
const TargetLibraryInfo *TLI;
+ BasicBlock *ActualFnStart;
// The following flags disable parts of MSan instrumentation based on
// blacklist contents and command-line options.
@@ -747,9 +808,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
CheckReturnValue = SanitizeFunction && (F.getName() == "main");
TLI = &MS.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- DEBUG(if (!InsertChecks)
- dbgs() << "MemorySanitizer is not inserting checks into '"
- << F.getName() << "'\n");
+ MS.initializeCallbacks(*F.getParent());
+ ActualFnStart = &F.getEntryBlock();
+
+ LLVM_DEBUG(if (!InsertChecks) dbgs()
+ << "MemorySanitizer is not inserting checks into '"
+ << F.getName() << "'\n");
}
Value *updateOrigin(Value *V, IRBuilder<> &IRB) {
@@ -766,7 +830,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
}
- /// \brief Fill memory range with the given origin value.
+ /// Fill memory range with the given origin value.
void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
unsigned Size, unsigned Alignment) {
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -849,13 +913,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned Alignment = SI->getAlignment();
unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment);
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment);
- DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, NewSI);
+ LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
+ (void)NewSI;
if (SI->isAtomic())
SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
@@ -866,25 +928,31 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
+ /// Helper function to insert a warning at IRB's current insert point.
+ void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
+ if (!Origin)
+ Origin = (Value *)IRB.getInt32(0);
+ if (MS.TrackOrigins) {
+ IRB.CreateStore(Origin, MS.OriginTLS);
+ }
+ IRB.CreateCall(MS.WarningFn, {});
+ IRB.CreateCall(MS.EmptyAsm, {});
+ // FIXME: Insert UnreachableInst if !MS.Recover?
+ // This may invalidate some of the following checks and needs to be done
+ // at the very end.
+ }
+
void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin,
bool AsCall) {
IRBuilder<> IRB(OrigIns);
- DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
+ LLVM_DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
- DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
+ LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow);
if (ConstantShadow) {
if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
- if (MS.TrackOrigins) {
- IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0),
- MS.OriginTLS);
- }
- IRB.CreateCall(MS.WarningFn, {});
- IRB.CreateCall(MS.EmptyAsm, {});
- // FIXME: Insert UnreachableInst if !MS.Recover?
- // This may invalidate some of the following checks and needs to be done
- // at the very end.
+ insertWarningFn(IRB, Origin);
}
return;
}
@@ -908,13 +976,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/* Unreachable */ !MS.Recover, MS.ColdCallWeights);
IRB.SetInsertPoint(CheckTerm);
- if (MS.TrackOrigins) {
- IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0),
- MS.OriginTLS);
- }
- IRB.CreateCall(MS.WarningFn, {});
- IRB.CreateCall(MS.EmptyAsm, {});
- DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
+ insertWarningFn(IRB, Origin);
+ LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
}
}
@@ -925,13 +988,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Origin = ShadowData.Origin;
materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
}
- DEBUG(dbgs() << "DONE:\n" << F);
+ LLVM_DEBUG(dbgs() << "DONE:\n" << F);
}
- /// \brief Add MemorySanitizer instrumentation to a function.
+ /// Add MemorySanitizer instrumentation to a function.
bool runOnFunction() {
- MS.initializeCallbacks(*F.getParent());
-
// In the presence of unreachable blocks, we may see Phi nodes with
// incoming nodes from such blocks. Since InstVisitor skips unreachable
// blocks, such nodes will not have any shadow value associated with them.
@@ -941,7 +1002,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Iterate all BBs in depth-first order and create shadow instructions
// for all instructions (where applicable).
// For PHI nodes we create dummy shadow PHIs which will be finalized later.
- for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
+ for (BasicBlock *BB : depth_first(ActualFnStart))
visit(*BB);
// Finalize PHI nodes.
@@ -961,22 +1022,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
InstrumentationList.size() + StoreList.size() >
(unsigned)ClInstrumentationWithCallThreshold;
- // Delayed instrumentation of StoreInst.
- // This may add new checks to be inserted later.
- materializeStores(InstrumentWithCalls);
-
// Insert shadow value checks.
materializeChecks(InstrumentWithCalls);
+ // Delayed instrumentation of StoreInst.
+ // This must not add any new address checks.
+ materializeStores(InstrumentWithCalls);
+
return true;
}
- /// \brief Compute the shadow type that corresponds to a given Value.
+ /// Compute the shadow type that corresponds to a given Value.
Type *getShadowTy(Value *V) {
return getShadowTy(V->getType());
}
- /// \brief Compute the shadow type that corresponds to a given Type.
+ /// Compute the shadow type that corresponds to a given Type.
Type *getShadowTy(Type *OrigTy) {
if (!OrigTy->isSized()) {
return nullptr;
@@ -1000,21 +1061,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
Elements.push_back(getShadowTy(ST->getElementType(i)));
StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
- DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+ LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
return Res;
}
uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
return IntegerType::get(*MS.C, TypeSize);
}
- /// \brief Flatten a vector type.
+ /// Flatten a vector type.
Type *getShadowTyNoVec(Type *ty) {
if (VectorType *vt = dyn_cast<VectorType>(ty))
return IntegerType::get(*MS.C, vt->getBitWidth());
return ty;
}
- /// \brief Convert a shadow value to it's flattened variant.
+ /// Convert a shadow value to its flattened variant.
Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
Type *Ty = V->getType();
Type *NoVecTy = getShadowTyNoVec(Ty);
@@ -1022,7 +1083,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateBitCast(V, NoVecTy);
}
- /// \brief Compute the integer shadow offset that corresponds to a given
+ /// Compute the integer shadow offset that corresponds to a given
/// application address.
///
/// Offset = (Addr & ~AndMask) ^ XorMask
@@ -1041,18 +1102,18 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return OffsetLong;
}
- /// \brief Compute the shadow and origin addresses corresponding to a given
+ /// Compute the shadow and origin addresses corresponding to a given
/// application address.
///
/// Shadow = ShadowBase + Offset
/// Origin = (OriginBase + Offset) & ~3ULL
- std::pair<Value *, Value *> getShadowOriginPtrUserspace(
- Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, unsigned Alignment,
- Instruction **FirstInsn) {
+ std::pair<Value *, Value *> getShadowOriginPtrUserspace(Value *Addr,
+ IRBuilder<> &IRB,
+ Type *ShadowTy,
+ unsigned Alignment) {
Value *ShadowOffset = getShadowPtrOffset(Addr, IRB);
Value *ShadowLong = ShadowOffset;
uint64_t ShadowBase = MS.MapParams->ShadowBase;
- *FirstInsn = dyn_cast<Instruction>(ShadowLong);
if (ShadowBase != 0) {
ShadowLong =
IRB.CreateAdd(ShadowLong,
@@ -1080,58 +1141,60 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB,
Type *ShadowTy,
- unsigned Alignment) {
- Instruction *FirstInsn = nullptr;
+ unsigned Alignment,
+ bool isStore) {
std::pair<Value *, Value *> ret =
- getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment, &FirstInsn);
+ getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment);
return ret;
}
- /// \brief Compute the shadow address for a given function argument.
+ /// Compute the shadow address for a given function argument.
///
/// Shadow = ParamTLS+ArgOffset.
Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
"_msarg");
}
- /// \brief Compute the origin address for a given function argument.
+ /// Compute the origin address for a given function argument.
Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
int ArgOffset) {
if (!MS.TrackOrigins) return nullptr;
Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
"_msarg_o");
}
- /// \brief Compute the shadow address for a retval.
+ /// Compute the shadow address for a retval.
Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
return IRB.CreatePointerCast(MS.RetvalTLS,
PointerType::get(getShadowTy(A), 0),
"_msret");
}
- /// \brief Compute the origin address for a retval.
+ /// Compute the origin address for a retval.
Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
// We keep a single origin for the entire retval. Might be too optimistic.
return MS.RetvalOriginTLS;
}
- /// \brief Set SV to be the shadow value for V.
+ /// Set SV to be the shadow value for V.
void setShadow(Value *V, Value *SV) {
assert(!ShadowMap.count(V) && "Values may only have one shadow");
ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
}
- /// \brief Set Origin to be the origin value for V.
+ /// Set Origin to be the origin value for V.
void setOrigin(Value *V, Value *Origin) {
if (!MS.TrackOrigins) return;
assert(!OriginMap.count(V) && "Values may only have one origin");
- DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+ LLVM_DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
OriginMap[V] = Origin;
}
@@ -1142,7 +1205,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return Constant::getNullValue(ShadowTy);
}
- /// \brief Create a clean shadow value for a given value.
+ /// Create a clean shadow value for a given value.
///
/// Clean shadow (all zeroes) means all bits of the value are defined
/// (initialized).
@@ -1150,7 +1213,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return getCleanShadow(V->getType());
}
- /// \brief Create a dirty shadow of a given shadow type.
+ /// Create a dirty shadow of a given shadow type.
Constant *getPoisonedShadow(Type *ShadowTy) {
assert(ShadowTy);
if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
@@ -1169,7 +1232,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
llvm_unreachable("Unexpected shadow type");
}
- /// \brief Create a dirty shadow for a given value.
+ /// Create a dirty shadow for a given value.
Constant *getPoisonedShadow(Value *V) {
Type *ShadowTy = getShadowTy(V);
if (!ShadowTy)
@@ -1177,12 +1240,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return getPoisonedShadow(ShadowTy);
}
- /// \brief Create a clean (zero) origin.
+ /// Create a clean (zero) origin.
Value *getCleanOrigin() {
return Constant::getNullValue(MS.OriginTy);
}
- /// \brief Get the shadow value for a given Value.
+ /// Get the shadow value for a given Value.
///
/// This function either returns the value set earlier with setShadow,
/// or extracts it from ParamTLS (for function arguments).
@@ -1194,7 +1257,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// For instructions the shadow is already stored in the map.
Value *Shadow = ShadowMap[V];
if (!Shadow) {
- DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+ LLVM_DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
(void)I;
assert(Shadow && "No shadow for a value");
}
@@ -1202,7 +1265,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
if (UndefValue *U = dyn_cast<UndefValue>(V)) {
Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
- DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+ LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
(void)U;
return AllOnes;
}
@@ -1212,12 +1275,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (*ShadowPtr)
return *ShadowPtr;
Function *F = A->getParent();
- IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
+ IRBuilder<> EntryIRB(ActualFnStart->getFirstNonPHI());
unsigned ArgOffset = 0;
const DataLayout &DL = F->getParent()->getDataLayout();
for (auto &FArg : F->args()) {
if (!FArg.getType()->isSized()) {
- DEBUG(dbgs() << "Arg is not sized\n");
+ LLVM_DEBUG(dbgs() << "Arg is not sized\n");
continue;
}
unsigned Size =
@@ -1237,7 +1300,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ArgAlign = DL.getABITypeAlignment(EltType);
}
Value *CpShadowPtr =
- getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign)
+ getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign,
+ /*isStore*/ true)
.first;
if (Overflow) {
// ParamTLS overflow.
@@ -1246,9 +1310,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Size, ArgAlign);
} else {
unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
- Value *Cpy =
- EntryIRB.CreateMemCpy(CpShadowPtr, Base, Size, CopyAlign);
- DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
+ Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
+ CopyAlign, Size);
+ LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
(void)Cpy;
}
*ShadowPtr = getCleanShadow(V);
@@ -1261,8 +1325,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment);
}
}
- DEBUG(dbgs() << " ARG: " << FArg << " ==> " <<
- **ShadowPtr << "\n");
+ LLVM_DEBUG(dbgs()
+ << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
if (MS.TrackOrigins && !Overflow) {
Value *OriginPtr =
getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
@@ -1280,12 +1344,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return getCleanShadow(V);
}
- /// \brief Get the shadow for i-th argument of the instruction I.
+ /// Get the shadow for i-th argument of the instruction I.
Value *getShadow(Instruction *I, int i) {
return getShadow(I->getOperand(i));
}
- /// \brief Get the origin for a value.
+ /// Get the origin for a value.
Value *getOrigin(Value *V) {
if (!MS.TrackOrigins) return nullptr;
if (!PropagateShadow) return getCleanOrigin();
@@ -1301,12 +1365,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return Origin;
}
- /// \brief Get the origin for i-th argument of the instruction I.
+ /// Get the origin for i-th argument of the instruction I.
Value *getOrigin(Instruction *I, int i) {
return getOrigin(I->getOperand(i));
}
- /// \brief Remember the place where a shadow check should be inserted.
+ /// Remember the place where a shadow check should be inserted.
///
/// This location will be later instrumented with a check that will print a
/// UMR warning at runtime if the shadow value is not 0.
@@ -1322,7 +1386,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
}
- /// \brief Remember the place where a shadow check should be inserted.
+ /// Remember the place where a shadow check should be inserted.
///
/// This location will be later instrumented with a check that will print a
/// UMR warning at runtime if the value is not fully defined.
@@ -1382,7 +1446,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
InstVisitor<MemorySanitizerVisitor>::visit(I);
}
- /// \brief Instrument LoadInst
+ /// Instrument LoadInst
///
/// Loads the corresponding shadow and (optionally) origin.
/// Optionally, checks that the load address is fully defined.
@@ -1396,7 +1460,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned Alignment = I.getAlignment();
if (PropagateShadow) {
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment);
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, Alignment, "_msld"));
} else {
setShadow(&I, getCleanShadow(&I));
@@ -1418,12 +1482,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- /// \brief Instrument StoreInst
+ /// Instrument StoreInst
///
/// Stores the corresponding shadow and (optionally) origin.
/// Optionally, checks that the store address is fully defined.
void visitStoreInst(StoreInst &I) {
StoreList.push_back(&I);
+ if (ClCheckAccessAddress)
+ insertShadowCheck(I.getPointerOperand(), &I);
}
void handleCASOrRMW(Instruction &I) {
@@ -1431,8 +1497,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *Addr = I.getOperand(0);
- Value *ShadowPtr =
- getShadowOriginPtr(Addr, IRB, I.getType(), /*Alignment*/ 1).first;
+ Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(),
+ /*Alignment*/ 1, /*isStore*/ true)
+ .first;
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
@@ -1536,7 +1603,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
- /// \brief Propagate shadow for bitwise AND.
+ /// Propagate shadow for bitwise AND.
///
/// This code is exact, i.e. if, for example, a bit in the left argument
/// is defined and 0, then neither the value nor the definedness of the
@@ -1585,7 +1652,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- /// \brief Default propagation of shadow and/or origin.
+ /// Default propagation of shadow and/or origin.
///
/// This class implements the general case of shadow propagation, used in all
/// cases where we don't know and/or don't care about what the operation
@@ -1611,7 +1678,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
: IRB(IRB), MSV(MSV) {}
- /// \brief Add a pair of shadow and origin values to the mix.
+ /// Add a pair of shadow and origin values to the mix.
Combiner &Add(Value *OpShadow, Value *OpOrigin) {
if (CombineShadow) {
assert(OpShadow);
@@ -1641,14 +1708,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return *this;
}
- /// \brief Add an application value to the mix.
+ /// Add an application value to the mix.
Combiner &Add(Value *V) {
Value *OpShadow = MSV->getShadow(V);
Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
return Add(OpShadow, OpOrigin);
}
- /// \brief Set the current combined values as the given instruction's shadow
+ /// Set the current combined values as the given instruction's shadow
/// and origin.
void Done(Instruction *I) {
if (CombineShadow) {
@@ -1666,7 +1733,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
using ShadowAndOriginCombiner = Combiner<true>;
using OriginCombiner = Combiner<false>;
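In scalar terms, the default rule these combiners implement is roughly: the combined shadow is the bitwise OR of the operand shadows (after casting them to a common shape), and the reported origin is taken from an operand whose shadow is actually non-zero. A minimal sketch under that reading:

    #include <cstdint>

    // Any poisoned bit in any operand may poison the result.
    uint32_t combineShadows(uint32_t Sa, uint32_t Sb) { return Sa | Sb; }

    // Prefer the origin of an operand that is actually poisoned.
    uint32_t combineOrigins(uint32_t Sa, uint32_t Oa, uint32_t Sb, uint32_t Ob) {
      return Sb != 0 ? Ob : Oa;
    }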
- /// \brief Propagate origin for arbitrary operation.
+ /// Propagate origin for arbitrary operation.
void setOriginForNaryOp(Instruction &I) {
if (!MS.TrackOrigins) return;
IRBuilder<> IRB(&I);
@@ -1684,7 +1751,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Ty->getPrimitiveSizeInBits();
}
- /// \brief Cast between two shadow types, extending or truncating as
+ /// Cast between two shadow types, extending or truncating as
/// necessary.
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
@@ -1706,7 +1773,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// TODO: handle struct types.
}
- /// \brief Cast an application value to the type of its own shadow.
+ /// Cast an application value to the type of its own shadow.
Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
Type *ShadowTy = getShadowTy(V);
if (V->getType() == ShadowTy)
@@ -1717,7 +1784,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateBitCast(V, ShadowTy);
}
- /// \brief Propagate shadow for arbitrary operation.
+ /// Propagate shadow for arbitrary operation.
void handleShadowOr(Instruction &I) {
IRBuilder<> IRB(&I);
ShadowAndOriginCombiner SC(this, IRB);
@@ -1726,7 +1793,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SC.Done(&I);
}
- // \brief Handle multiplication by constant.
+ // Handle multiplication by constant.
//
// Handle a special case of multiplication by constant that may have one or
// more zeros in the lower bits. This makes the corresponding number of lower bits
@@ -1788,7 +1855,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitSub(BinaryOperator &I) { handleShadowOr(I); }
void visitXor(BinaryOperator &I) { handleShadowOr(I); }
- void handleDiv(Instruction &I) {
+ void handleIntegerDiv(Instruction &I) {
IRBuilder<> IRB(&I);
// Strict on the second argument.
insertShadowCheck(I.getOperand(1), &I);
@@ -1796,14 +1863,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(&I, 0));
}
- void visitUDiv(BinaryOperator &I) { handleDiv(I); }
- void visitSDiv(BinaryOperator &I) { handleDiv(I); }
- void visitFDiv(BinaryOperator &I) { handleDiv(I); }
- void visitURem(BinaryOperator &I) { handleDiv(I); }
- void visitSRem(BinaryOperator &I) { handleDiv(I); }
- void visitFRem(BinaryOperator &I) { handleDiv(I); }
+ void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
+
+ // Floating point division is side-effect free. We cannot require that the
+ // divisor is fully initialized, so we propagate its shadow instead. See PR37523.
+ void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
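In user terms, a sketch of the observable difference this makes (not the pass's own code): an integer division by a possibly-uninitialized divisor is reported at the division itself, while a floating-point division merely taints its result, and a report fires later only if that result reaches a checked use such as a branch.

    int divide(int a, int b_maybe_uninit, float x, float y_maybe_uninit) {
      int   q = a / b_maybe_uninit;  // UMR reported here: the divisor is checked
      float r = x / y_maybe_uninit;  // no report here: shadow just flows into r
      if (r > 0.0f)                  // a report fires here, at the branch on r
        return q + 1;
      return q;
    }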
- /// \brief Instrument == and != comparisons.
+ /// Instrument == and != comparisons.
///
/// Sometimes the comparison result is known even if some of the bits of the
/// arguments are not.
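A scalar sketch of that observation (the pass works on IR values and its exact bit-level rule may differ in detail): A == B can be answered despite poisoned bits whenever some fully-defined bit position already differs between the operands.

    #include <cstdint>

    // Sa/Sb have 1s in the poisoned bits of A/B. Returns true if the result of
    // A == B must itself be considered poisoned.
    bool eqResultIsPoisoned(uint64_t A, uint64_t Sa, uint64_t B, uint64_t Sb) {
      uint64_t C  = A ^ B;                 // differing bits (garbage where poisoned)
      uint64_t Sc = Sa | Sb;               // bits of C whose value is unknown
      bool KnownDiffer  = (C & ~Sc) != 0;  // a defined bit differs: answer is known
      bool FullyDefined = (Sc == 0);       // nothing poisoned: answer is known
      return !(KnownDiffer || FullyDefined);
    }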
@@ -1841,7 +1911,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- /// \brief Build the lowest possible value of V, taking into account V's
+ /// Build the lowest possible value of V, taking into account V's
/// uninitialized bits.
Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
bool isSigned) {
@@ -1858,7 +1928,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- /// \brief Build the highest possible value of V, taking into account V's
+ /// Build the highest possible value of V, taking into account V's
/// uninitialized bits.
Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
bool isSigned) {
@@ -1875,7 +1945,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- /// \brief Instrument relational comparisons.
+ /// Instrument relational comparisons.
///
/// This function does exact shadow propagation for all relational
/// comparisons of integers, pointers and vectors of those.
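A scalar sketch of the approach for the unsigned case, assuming the usual interval construction from the helpers above (poisoned bits cleared to get the lowest possible value, set to get the highest; the signed case treats the sign bit specially): the comparison is well defined exactly when both extreme pairings give the same answer.

    #include <cstdint>

    // Shadow of (A < B) for unsigned scalars; Sa/Sb mark the poisoned bits.
    bool ultResultIsPoisoned(uint64_t A, uint64_t Sa, uint64_t B, uint64_t Sb) {
      uint64_t Amin = A & ~Sa, Amax = A | Sa;  // extremes A could really take
      uint64_t Bmin = B & ~Sb, Bmax = B | Sb;  // extremes B could really take
      bool IfFavourable   = Amin < Bmax;       // pairing most likely to satisfy "<"
      bool IfUnfavourable = Amax < Bmin;       // pairing least likely to satisfy "<"
      return IfFavourable != IfUnfavourable;   // poisoned iff the answers disagree
    }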
@@ -1908,7 +1978,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- /// \brief Instrument signed relational comparisons.
+ /// Instrument signed relational comparisons.
///
/// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
/// bit of the shadow. Everything else is delegated to handleShadowOr().
@@ -1992,7 +2062,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitAShr(BinaryOperator &I) { handleShift(I); }
void visitLShr(BinaryOperator &I) { handleShift(I); }
- /// \brief Instrument llvm.memmove
+ /// Instrument llvm.memmove
///
/// At this point we don't know if llvm.memmove will be inlined or not.
/// If we don't instrument it and it gets inlined,
@@ -2045,7 +2115,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
VAHelper->visitVACopyInst(I);
}
- /// \brief Handle vector store-like intrinsics.
+ /// Handle vector store-like intrinsics.
///
/// Instrument intrinsics that look like a simple SIMD store: writes memory,
/// has 1 pointer argument and 1 vector argument, returns void.
@@ -2057,8 +2127,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// We don't know the pointer alignment (could be unaligned SSE store!).
// Have to assume the worst case.
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, Shadow->getType(), /*Alignment*/ 1);
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), /*Alignment*/ 1, /*isStore*/ true);
IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
if (ClCheckAccessAddress)
@@ -2069,7 +2139,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return true;
}
- /// \brief Handle vector load-like intrinsics.
+ /// Handle vector load-like intrinsics.
///
/// Instrument intrinsics that look like a simple SIMD load: reads memory,
/// has 1 pointer argument, returns a vector.
@@ -2084,7 +2154,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Have to assume the worst case.
unsigned Alignment = 1;
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment);
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, Alignment, "_msld"));
} else {
setShadow(&I, getCleanShadow(&I));
@@ -2102,7 +2172,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return true;
}
- /// \brief Handle (SIMD arithmetic)-like intrinsics.
+ /// Handle (SIMD arithmetic)-like intrinsics.
///
/// Instrument intrinsics with any number of arguments of the same type,
/// equal to the return type. The type should be simple (no aggregates or
@@ -2132,7 +2202,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return true;
}
- /// \brief Heuristically instrument unknown intrinsics.
+ /// Heuristically instrument unknown intrinsics.
///
/// The main purpose of this code is to do something reasonable with all
/// random intrinsics we might encounter, most importantly - SIMD intrinsics.
@@ -2182,7 +2252,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(Op));
}
- // \brief Instrument vector convert instrinsic.
+ // Instrument vector convert intrinsic.
//
// This function instruments intrinsics like cvtsi2ss:
// %Out = int_xxx_cvtyyy(%ConvertOp)
@@ -2285,7 +2355,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateSExt(S2, T);
}
- // \brief Instrument vector shift instrinsic.
+ // Instrument vector shift intrinsic.
//
// This function instruments intrinsics like int_x86_avx2_psll_w.
// Intrinsic shifts %In by %ShiftSize bits.
@@ -2310,14 +2380,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Get an X86_MMX-sized vector type.
+ // Get an X86_MMX-sized vector type.
Type *getMMXVectorTy(unsigned EltSizeInBits) {
const unsigned X86_MMXSizeInBits = 64;
return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
X86_MMXSizeInBits / EltSizeInBits);
}
- // \brief Returns a signed counterpart for an (un)signed-saturate-and-pack
+ // Returns a signed counterpart for an (un)signed-saturate-and-pack
// intrinsic.
Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
switch (id) {
@@ -2348,7 +2418,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- // \brief Instrument vector pack instrinsic.
+ // Instrument vector pack intrinsic.
//
// This function instruments intrinsics like x86_mmx_packsswb, that
// packs elements of 2 input vectors into half as many bits with saturation.
@@ -2391,7 +2461,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument sum-of-absolute-differencies intrinsic.
+ // Instrument sum-of-absolute-differences intrinsic.
void handleVectorSadIntrinsic(IntrinsicInst &I) {
const unsigned SignificantBitsPerResultElement = 16;
bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
@@ -2410,7 +2480,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument multiply-add intrinsic.
+ // Instrument multiply-add intrinsic.
void handleVectorPmaddIntrinsic(IntrinsicInst &I,
unsigned EltSizeInBits = 0) {
bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
@@ -2425,7 +2495,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument compare-packed intrinsic.
+ // Instrument compare-packed intrinsic.
// Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
// all-ones shadow.
void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
@@ -2438,7 +2508,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument compare-scalar intrinsic.
+ // Instrument compare-scalar intrinsic.
// This handles both cmp* intrinsics which return the result in the first
// element of a vector, and comi* which return the result as i32.
void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
@@ -2453,7 +2523,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value* Addr = I.getArgOperand(0);
Type *Ty = IRB.getInt32Ty();
- Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1).first;
+ Value *ShadowPtr =
+ getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1, /*isStore*/ true)
+ .first;
IRB.CreateStore(getCleanShadow(Ty),
IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
@@ -2471,7 +2543,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned Alignment = 1;
Value *ShadowPtr, *OriginPtr;
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, Ty, Alignment);
+ getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false);
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
@@ -2482,11 +2554,98 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
insertShadowCheck(Shadow, Origin, &I);
}
+ void handleMaskedStore(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *V = I.getArgOperand(0);
+ Value *Addr = I.getArgOperand(1);
+ unsigned Align = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ Value *Mask = I.getArgOperand(3);
+ Value *Shadow = getShadow(V);
+
+ Value *ShadowPtr;
+ Value *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), Align, /*isStore*/ true);
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ // Uninitialized mask is kind of like uninitialized address, but not as
+ // scary.
+ insertShadowCheck(Mask, &I);
+ }
+
+ IRB.CreateMaskedStore(Shadow, ShadowPtr, Align, Mask);
+
+ if (MS.TrackOrigins) {
+ auto &DL = F.getParent()->getDataLayout();
+ paintOrigin(IRB, getOrigin(V), OriginPtr,
+ DL.getTypeStoreSize(Shadow->getType()),
+ std::max(Align, kMinOriginAlignment));
+ }
+ }
+
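A scalar model of why the shadow store reuses the exact same mask (sketch only; the lane count and element type are arbitrary): only the enabled lanes of the destination change, so only those lanes' shadow may change, and disabled lanes must keep both their value and their shadow.

    #include <array>
    #include <cstdint>

    void maskedStoreModel(std::array<int32_t, 4> &Dst,
                          std::array<int32_t, 4> &DstShadow,
                          const std::array<int32_t, 4> &V,
                          const std::array<int32_t, 4> &VShadow,
                          const std::array<bool, 4> &Mask) {
      for (int i = 0; i < 4; ++i) {
        if (Mask[i]) {
          Dst[i] = V[i];             // application store, enabled lanes only
          DstShadow[i] = VShadow[i]; // shadow follows the same mask
        }                            // disabled lanes keep value and shadow
      }
    }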
+ bool handleMaskedLoad(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ unsigned Align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Value *Mask = I.getArgOperand(2);
+ Value *PassThru = I.getArgOperand(3);
+
+ Type *ShadowTy = getShadowTy(&I);
+ Value *ShadowPtr, *OriginPtr;
+ if (PropagateShadow) {
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Align, /*isStore*/ false);
+ setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Align, Mask,
+ getShadow(PassThru), "_msmaskedld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ insertShadowCheck(Mask, &I);
+ }
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow) {
+ // Choose between PassThru's and the loaded value's origins.
+ Value *MaskedPassThruShadow = IRB.CreateAnd(
+ getShadow(PassThru), IRB.CreateSExt(IRB.CreateNeg(Mask), ShadowTy));
+
+ Value *Acc = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1, N = PassThru->getType()->getVectorNumElements(); i < N;
+ ++i) {
+ Value *More = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ Acc = IRB.CreateOr(Acc, More);
+ }
+
+ Value *Origin = IRB.CreateSelect(
+ IRB.CreateICmpNE(Acc, Constant::getNullValue(Acc->getType())),
+ getOrigin(PassThru), IRB.CreateLoad(OriginPtr));
+
+ setOrigin(&I, Origin);
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ return true;
+ }
+
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::bswap:
handleBswap(I);
break;
+ case Intrinsic::masked_store:
+ handleMaskedStore(I);
+ break;
+ case Intrinsic::masked_load:
+ handleMaskedLoad(I);
+ break;
case Intrinsic::x86_sse_stmxcsr:
handleStmxcsr(I);
break;
@@ -2501,20 +2660,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx512_cvttss2usi:
case Intrinsic::x86_avx512_cvttsd2usi64:
case Intrinsic::x86_avx512_cvttsd2usi:
- case Intrinsic::x86_avx512_cvtusi2sd:
case Intrinsic::x86_avx512_cvtusi2ss:
case Intrinsic::x86_avx512_cvtusi642sd:
case Intrinsic::x86_avx512_cvtusi642ss:
case Intrinsic::x86_sse2_cvtsd2si64:
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2ss:
- case Intrinsic::x86_sse2_cvtsi2sd:
- case Intrinsic::x86_sse2_cvtsi642sd:
- case Intrinsic::x86_sse2_cvtss2sd:
case Intrinsic::x86_sse2_cvttsd2si64:
case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse_cvtsi2ss:
- case Intrinsic::x86_sse_cvtsi642ss:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvttss2si64:
@@ -2715,7 +2868,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// outputs as clean. Note that any side effects of the inline asm that are
// not immediately visible in its constraints are not handled.
if (Call->isInlineAsm()) {
- visitInstruction(I);
+ if (ClHandleAsmConservative)
+ visitAsmInstruction(I);
+ else
+ visitInstruction(I);
return;
}
@@ -2738,13 +2894,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
unsigned ArgOffset = 0;
- DEBUG(dbgs() << " CallSite: " << I << "\n");
+ LLVM_DEBUG(dbgs() << " CallSite: " << I << "\n");
for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
ArgIt != End; ++ArgIt) {
Value *A = *ArgIt;
unsigned i = ArgIt - CS.arg_begin();
if (!A->getType()->isSized()) {
- DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
continue;
}
unsigned Size = 0;
@@ -2754,8 +2910,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// __msan_param_tls.
Value *ArgShadow = getShadow(A);
Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
- DEBUG(dbgs() << " Arg#" << i << ": " << *A <<
- " Shadow: " << *ArgShadow << "\n");
+ LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
+ << " Shadow: " << *ArgShadow << "\n");
bool ArgIsInitialized = false;
const DataLayout &DL = F.getParent()->getDataLayout();
if (CS.paramHasAttr(i, Attribute::ByVal)) {
@@ -2765,10 +2921,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (ArgOffset + Size > kParamTLSSize) break;
unsigned ParamAlignment = CS.getParamAlignment(i);
unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment);
- Value *AShadowPtr =
- getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment).first;
+ Value *AShadowPtr = getShadowOriginPtr(A, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ false)
+ .first;
- Store = IRB.CreateMemCpy(ArgShadowBase, AShadowPtr, Size, Alignment);
+ Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
+ Alignment, Size);
} else {
Size = DL.getTypeAllocSize(A->getType());
if (ArgOffset + Size > kParamTLSSize) break;
@@ -2782,10 +2940,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
getOriginPtrForArgument(A, IRB, ArgOffset));
(void)Store;
assert(Size != 0 && Store != nullptr);
- DEBUG(dbgs() << " Param:" << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
ArgOffset += alignTo(Size, 8);
}
- DEBUG(dbgs() << " done with call args\n");
+ LLVM_DEBUG(dbgs() << " done with call args\n");
FunctionType *FT =
cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0));
@@ -2888,8 +3046,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRB.CreateCall(MS.MsanPoisonStackFn,
{IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
} else {
- Value *ShadowBase =
- getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(), I.getAlignment()).first;
+ Value *ShadowBase = getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(),
+ I.getAlignment(), /*isStore*/ true)
+ .first;
Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment());
@@ -2991,24 +3150,24 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitExtractValueInst(ExtractValueInst &I) {
IRBuilder<> IRB(&I);
Value *Agg = I.getAggregateOperand();
- DEBUG(dbgs() << "ExtractValue: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
Value *AggShadow = getShadow(Agg);
- DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
- DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
+ LLVM_DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
setShadow(&I, ResShadow);
setOriginForNaryOp(I);
}
void visitInsertValueInst(InsertValueInst &I) {
IRBuilder<> IRB(&I);
- DEBUG(dbgs() << "InsertValue: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
Value *AggShadow = getShadow(I.getAggregateOperand());
Value *InsShadow = getShadow(I.getInsertedValueOperand());
- DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
- DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ LLVM_DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
- DEBUG(dbgs() << " Res: " << *Res << "\n");
+ LLVM_DEBUG(dbgs() << " Res: " << *Res << "\n");
setShadow(&I, Res);
setOriginForNaryOp(I);
}
@@ -3023,25 +3182,58 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void visitResumeInst(ResumeInst &I) {
- DEBUG(dbgs() << "Resume: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
// Nothing to do here.
}
void visitCleanupReturnInst(CleanupReturnInst &CRI) {
- DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
+ LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
// Nothing to do here.
}
void visitCatchReturnInst(CatchReturnInst &CRI) {
- DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
+ LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
// Nothing to do here.
}
+ void visitAsmInstruction(Instruction &I) {
+ // Conservative inline assembly handling: check for poisoned shadow of
+ // asm() arguments, then unpoison the result and all the memory locations
+ // pointed to by those arguments.
+ CallInst *CI = dyn_cast<CallInst>(&I);
+
+ for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+ Value *Operand = CI->getOperand(i);
+ if (Operand->getType()->isSized())
+ insertShadowCheck(Operand, &I);
+ }
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ IRBuilder<> IRB(&I);
+ IRB.SetInsertPoint(I.getNextNode());
+ for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+ Value *Operand = CI->getOperand(i);
+ Type *OpType = Operand->getType();
+ if (!OpType->isPointerTy())
+ continue;
+ Type *ElType = OpType->getPointerElementType();
+ if (!ElType->isSized())
+ continue;
+ Value *ShadowPtr, *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Operand, IRB, ElType, /*Alignment*/ 1, /*isStore*/ true);
+ Value *CShadow = getCleanShadow(ElType);
+ IRB.CreateStore(
+ CShadow,
+ IRB.CreatePointerCast(ShadowPtr, CShadow->getType()->getPointerTo()));
+ }
+ }
+
void visitInstruction(Instruction &I) {
// Everything else: stop propagating and check for poisoned shadow.
if (ClDumpStrictInstructions)
dumpInst(I);
- DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
Value *Operand = I.getOperand(i);
if (Operand->getType()->isSized())
@@ -3052,7 +3244,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
};
-/// \brief AMD64-specific implementation of VarArgHelper.
+/// AMD64-specific implementation of VarArgHelper.
struct VarArgAMD64Helper : public VarArgHelper {
// An unfortunate workaround for asymmetric lowering of va_arg stuff.
// See a comment in visitCallSite for more details.
@@ -3116,10 +3308,12 @@ struct VarArgAMD64Helper : public VarArgHelper {
getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset);
OverflowOffset += alignTo(ArgSize, 8);
Value *ShadowPtr, *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment);
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment,
+ /*isStore*/ false);
- IRB.CreateMemCpy(ShadowBase, ShadowPtr, ArgSize, kShadowTLSAlignment);
+ IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment, ShadowPtr,
+ kShadowTLSAlignment, ArgSize);
} else {
ArgKind AK = classifyArgument(A);
if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
@@ -3157,7 +3351,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
}
- /// \brief Compute the shadow address for a given va_arg.
+ /// Compute the shadow address for a given va_arg.
Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
@@ -3172,7 +3366,8 @@ struct VarArgAMD64Helper : public VarArgHelper {
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
// Unpoison the whole __va_list_tag.
// FIXME: magic ABI constants.
@@ -3200,13 +3395,13 @@ struct VarArgAMD64Helper : public VarArgHelper {
if (!VAStartInstrumentationList.empty()) {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize =
IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
VAArgOverflowSize);
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
// Instrument va_start.
@@ -3219,33 +3414,33 @@ struct VarArgAMD64Helper : public VarArgHelper {
Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
ConstantInt::get(MS.IntptrTy, 16)),
- Type::getInt64PtrTy(*MS.C));
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
unsigned Alignment = 16;
std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, AMD64FpEndOffset,
- Alignment);
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ AMD64FpEndOffset);
Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
ConstantInt::get(MS.IntptrTy, 8)),
- Type::getInt64PtrTy(*MS.C));
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
+ Alignment, /*isStore*/ true);
Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
AMD64FpEndOffset);
- IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize,
- Alignment);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
}
}
};
-/// \brief MIPS64-specific implementation of VarArgHelper.
+/// MIPS64-specific implementation of VarArgHelper.
struct VarArgMIPS64Helper : public VarArgHelper {
Function &F;
MemorySanitizer &MS;
@@ -3286,7 +3481,7 @@ struct VarArgMIPS64Helper : public VarArgHelper {
IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
}
- /// \brief Compute the shadow address for a given va_arg.
+ /// Compute the shadow address for a given va_arg.
Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
@@ -3301,8 +3496,8 @@ struct VarArgMIPS64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 8, Alignment, false);
}
@@ -3313,8 +3508,8 @@ struct VarArgMIPS64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 8, Alignment, false);
}
@@ -3322,7 +3517,7 @@ struct VarArgMIPS64Helper : public VarArgHelper {
void finalizeInstrumentation() override {
assert(!VAArgSize && !VAArgTLSCopy &&
"finalizeInstrumentation called twice");
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
VAArgSize);
@@ -3331,7 +3526,7 @@ struct VarArgMIPS64Helper : public VarArgHelper {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
// Instrument va_start.
@@ -3341,20 +3536,21 @@ struct VarArgMIPS64Helper : public VarArgHelper {
IRBuilder<> IRB(OrigInst->getNextNode());
Value *VAListTag = OrigInst->getArgOperand(0);
Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- Type::getInt64PtrTy(*MS.C));
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
unsigned Alignment = 8;
std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, Alignment);
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
}
}
};
-/// \brief AArch64-specific implementation of VarArgHelper.
+/// AArch64-specific implementation of VarArgHelper.
struct VarArgAArch64Helper : public VarArgHelper {
static const unsigned kAArch64GrArgSize = 64;
static const unsigned kAArch64VrArgSize = 128;
@@ -3461,8 +3657,8 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 32, Alignment, false);
}
@@ -3473,8 +3669,8 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 32, Alignment, false);
}
@@ -3506,13 +3702,13 @@ struct VarArgAArch64Helper : public VarArgHelper {
if (!VAStartInstrumentationList.empty()) {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize =
IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
VAArgOverflowSize);
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
@@ -3563,14 +3759,14 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *GrRegSaveAreaShadowPtr =
MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- /*Alignment*/ 8)
+ /*Alignment*/ 8, /*isStore*/ true)
.first;
Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
GrRegSaveAreaShadowPtrOff);
Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
- IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, GrSrcPtr, GrCopySize, 8);
+ IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, 8, GrSrcPtr, 8, GrCopySize);
// Again, but for FP/SIMD values.
Value *VrRegSaveAreaShadowPtrOff =
@@ -3578,7 +3774,7 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *VrRegSaveAreaShadowPtr =
MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- /*Alignment*/ 8)
+ /*Alignment*/ 8, /*isStore*/ true)
.first;
Value *VrSrcPtr = IRB.CreateInBoundsGEP(
@@ -3588,25 +3784,25 @@ struct VarArgAArch64Helper : public VarArgHelper {
VrRegSaveAreaShadowPtrOff);
Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
- IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, VrSrcPtr, VrCopySize, 8);
+ IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, 8, VrSrcPtr, 8, VrCopySize);
// And finally for remaining arguments.
Value *StackSaveAreaShadowPtr =
MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(),
- /*Alignment*/ 16)
+ /*Alignment*/ 16, /*isStore*/ true)
.first;
Value *StackSrcPtr =
IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
IRB.getInt32(AArch64VAEndOffset));
- IRB.CreateMemCpy(StackSaveAreaShadowPtr, StackSrcPtr,
- VAArgOverflowSize, 16);
+ IRB.CreateMemCpy(StackSaveAreaShadowPtr, 16, StackSrcPtr, 16,
+ VAArgOverflowSize);
}
}
};
-/// \brief PowerPC64-specific implementation of VarArgHelper.
+/// PowerPC64-specific implementation of VarArgHelper.
struct VarArgPowerPC64Helper : public VarArgHelper {
Function &F;
MemorySanitizer &MS;
@@ -3657,9 +3853,10 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
VAArgOffset - VAArgBase);
Value *AShadowPtr, *AOriginPtr;
std::tie(AShadowPtr, AOriginPtr) = MSV.getShadowOriginPtr(
- A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment);
+ A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment, /*isStore*/ false);
- IRB.CreateMemCpy(Base, AShadowPtr, ArgSize, kShadowTLSAlignment);
+ IRB.CreateMemCpy(Base, kShadowTLSAlignment, AShadowPtr,
+ kShadowTLSAlignment, ArgSize);
}
VAArgOffset += alignTo(ArgSize, 8);
} else {
@@ -3704,7 +3901,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
}
- /// \brief Compute the shadow address for a given va_arg.
+ /// Compute the shadow address for a given va_arg.
Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
@@ -3719,8 +3916,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 8, Alignment, false);
}
@@ -3730,8 +3927,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
// Unpoison the whole __va_list_tag.
// FIXME: magic ABI constants.
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
@@ -3741,7 +3938,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
void finalizeInstrumentation() override {
assert(!VAArgSize && !VAArgTLSCopy &&
"finalizeInstrumentation called twice");
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
VAArgSize);
@@ -3750,7 +3947,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
// Instrument va_start.
@@ -3760,20 +3957,21 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
IRBuilder<> IRB(OrigInst->getNextNode());
Value *VAListTag = OrigInst->getArgOperand(0);
Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- Type::getInt64PtrTy(*MS.C));
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
unsigned Alignment = 8;
std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, Alignment);
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
}
}
};
-/// \brief A no-op implementation of VarArgHelper.
+/// A no-op implementation of VarArgHelper.
struct VarArgNoOpHelper : public VarArgHelper {
VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
MemorySanitizerVisitor &MSV) {}
@@ -3796,8 +3994,7 @@ static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
Triple TargetTriple(Func.getParent()->getTargetTriple());
if (TargetTriple.getArch() == Triple::x86_64)
return new VarArgAMD64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::mips64 ||
- TargetTriple.getArch() == Triple::mips64el)
+ else if (TargetTriple.isMIPS64())
return new VarArgMIPS64Helper(Func, Msan, Visitor);
else if (TargetTriple.getArch() == Triple::aarch64)
return new VarArgAArch64Helper(Func, Msan, Visitor);
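Note on the MemorySanitizer hunks above: they track two LLVM API changes — getShadowOriginPtr now takes an explicit /*isStore*/ flag describing how the returned shadow pointer will be used, and IRBuilder::CreateMemCpy takes separate destination and source alignments instead of one shared alignment. A minimal sketch of the new call shapes, reusing names from the hunks (A, ShadowBase, ArgSize, kShadowTLSAlignment, MSV, IRB) rather than standalone code:

  // getShadowOriginPtr(Addr, IRB, Ty, Alignment, isStore) -> {ShadowPtr, OriginPtr}
  Value *ShadowPtr, *OriginPtr;
  std::tie(ShadowPtr, OriginPtr) =
      MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment,
                             /*isStore*/ false);      // we only read A's shadow here
  // CreateMemCpy(Dst, DstAlign, Src, SrcAlign, Size) -- alignments are now split
  IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment,   // destination + its alignment
                   ShadowPtr, kShadowTLSAlignment,    // source + its alignment
                   ArgSize);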
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index cb4b3a9c2545..307b7eaa2196 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -48,7 +48,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "CFGMST.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -119,6 +119,7 @@
#include <vector>
using namespace llvm;
+using ProfileCount = Function::ProfileCount;
#define DEBUG_TYPE "pgo-instrumentation"
@@ -223,8 +224,8 @@ static cl::opt<bool>
EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
cl::desc("When this option is on, the annotated "
"branch probability will be emitted as "
- " optimization remarks: -Rpass-analysis="
- "pgo-instr-use"));
+ "optimization remarks: -{Rpass|"
+ "pass-remarks}=pgo-instrumentation"));
// Command line option to turn on CFG dot dump after profile annotation.
// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
@@ -448,7 +449,7 @@ ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename) {
namespace {
-/// \brief An MST based instrumentation for PGO
+/// An MST based instrumentation for PGO
///
/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
/// in the function level.
@@ -545,7 +546,7 @@ public:
computeCFGHash();
if (!ComdatMembers.empty())
renameComdatFunction();
- DEBUG(dumpInfo("after CFGMST"));
+ LLVM_DEBUG(dumpInfo("after CFGMST"));
NumOfPGOBB += MST.BBInfos.size();
for (auto &E : MST.AllEdges) {
@@ -595,12 +596,12 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
(uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
(uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
- DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
- << " CRC = " << JC.getCRC()
- << ", Selects = " << SIVisitor.getNumOfSelectInsts()
- << ", Edges = " << MST.AllEdges.size()
- << ", ICSites = " << ValueSites[IPVK_IndirectCallTarget].size()
- << ", Hash = " << FunctionHash << "\n";);
+ LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
+ << " CRC = " << JC.getCRC()
+ << ", Selects = " << SIVisitor.getNumOfSelectInsts()
+ << ", Edges = " << MST.AllEdges.size() << ", ICSites = "
+ << ValueSites[IPVK_IndirectCallTarget].size()
+ << ", Hash = " << FunctionHash << "\n";);
}
// Check if we can safely rename this Comdat function.
@@ -701,8 +702,8 @@ BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
// For a critical edge, we have to split. Instrument the newly
// created BB.
NumOfPGOSplit++;
- DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> "
- << getBBInfo(DestBB).Index << "\n");
+ LLVM_DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index
+ << " --> " << getBBInfo(DestBB).Index << "\n");
unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum);
assert(InstrBB && "Critical edge is not split");
@@ -752,8 +753,8 @@ static void instrumentOneFunc(
for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) {
CallSite CS(I);
Value *Callee = CS.getCalledValue();
- DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
- << NumIndirectCallSites << "\n");
+ LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
+ << NumIndirectCallSites << "\n");
IRBuilder<> Builder(I);
assert(Builder.GetInsertPoint() != I->getParent()->end() &&
"Cannot get the Instrumentation point");
@@ -861,7 +862,7 @@ public:
// Set the branch weights based on the count values.
void setBranchWeights();
- // Annotate the value profile call sites all all value kind.
+ // Annotate the value profile call sites for all value kind.
void annotateValueSites();
// Annotate the value profile call sites for one value kind.
@@ -1041,14 +1042,14 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
NumOfPGOFunc++;
- DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
+ LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
uint64_t ValueSum = 0;
for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
- DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
+ LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
ValueSum += CountFromProfile[I];
}
- DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
+ LLVM_DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
getBBInfo(nullptr).UnknownCountOutEdge = 2;
getBBInfo(nullptr).UnknownCountInEdge = 2;
@@ -1128,7 +1129,7 @@ void PGOUseFunc::populateCounters() {
}
}
- DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
+ LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
#ifndef NDEBUG
// Assert every BB has a valid counter.
for (auto &BB : F) {
@@ -1139,7 +1140,7 @@ void PGOUseFunc::populateCounters() {
}
#endif
uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
- F.setEntryCount(FuncEntryCount);
+ F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
uint64_t FuncMaxCount = FuncEntryCount;
for (auto &BB : F) {
auto BI = findBBInfo(&BB);
@@ -1153,13 +1154,13 @@ void PGOUseFunc::populateCounters() {
FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
assert(CountPosition == ProfileCountSize);
- DEBUG(FuncInfo.dumpInfo("after reading profile."));
+ LLVM_DEBUG(FuncInfo.dumpInfo("after reading profile."));
}
// Assign the scaled count values to the BB with multiple out edges.
void PGOUseFunc::setBranchWeights() {
// Generate MD_prof metadata for every branch instruction.
- DEBUG(dbgs() << "\nSetting branch weights.\n");
+ LLVM_DEBUG(dbgs() << "\nSetting branch weights.\n");
for (auto &BB : F) {
TerminatorInst *TI = BB.getTerminator();
if (TI->getNumSuccessors() < 2)
@@ -1200,7 +1201,7 @@ static bool isIndirectBrTarget(BasicBlock *BB) {
}
void PGOUseFunc::annotateIrrLoopHeaderWeights() {
- DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
+ LLVM_DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
// Find irr loop headers
for (auto &BB : F) {
// As a heuristic also annotate indrectbr targets as they have a high chance
@@ -1333,9 +1334,9 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) {
}
for (auto &I : ValueSites) {
- DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
- << "): Index = " << ValueSiteIndex << " out of "
- << NumValueSites << "\n");
+ LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+ << "): Index = " << ValueSiteIndex << " out of "
+ << NumValueSites << "\n");
annotateValueSite(*M, *I, ProfileRecord,
static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
@@ -1431,7 +1432,7 @@ static bool annotateAllFunctions(
Module &M, StringRef ProfileFileName,
function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
- DEBUG(dbgs() << "Read in profile counters: ");
+ LLVM_DEBUG(dbgs() << "Read in profile counters: ");
auto &Ctx = M.getContext();
// Read the counter array from file.
auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName);
@@ -1517,12 +1518,13 @@ static bool annotateAllFunctions(
// inconsistent MST between prof-gen and prof-use.
for (auto &F : HotFunctions) {
F->addFnAttr(Attribute::InlineHint);
- DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
+ << "\n");
}
for (auto &F : ColdFunctions) {
F->addFnAttr(Attribute::Cold);
- DEBUG(dbgs() << "Set cold attribute to function: " << F->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
+ << "\n");
}
return true;
}
@@ -1585,22 +1587,25 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
for (const auto &ECI : EdgeCounts)
Weights.push_back(scaleBranchCount(ECI, Scale));
- DEBUG(dbgs() << "Weight is: ";
- for (const auto &W : Weights) { dbgs() << W << " "; }
- dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
+ : Weights) {
+ dbgs() << W << " ";
+ } dbgs() << "\n";);
TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
if (EmitBranchProbability) {
std::string BrCondStr = getBranchCondString(TI);
if (BrCondStr.empty())
return;
- unsigned WSum =
- std::accumulate(Weights.begin(), Weights.end(), 0,
- [](unsigned w1, unsigned w2) { return w1 + w2; });
+ uint64_t WSum =
+ std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
+ [](uint64_t w1, uint64_t w2) { return w1 + w2; });
uint64_t TotalCount =
- std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), 0,
+ std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
[](uint64_t c1, uint64_t c2) { return c1 + c2; });
- BranchProbability BP(Weights[0], WSum);
+ Scale = calculateCountScale(WSum);
+ BranchProbability BP(scaleBranchCount(Weights[0], Scale),
+ scaleBranchCount(WSum, Scale));
std::string BranchProbStr;
raw_string_ostream OS(BranchProbStr);
OS << BP;
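The final setProfMetadata hunk above is an overflow fix rather than a cleanup: BranchProbability is built from 32-bit numerator/denominator values, but the branch weights are now summed in 64 bits, so both operands are rescaled before the probability is constructed. A short sketch of the intent, using the calculateCountScale/scaleBranchCount helpers this pass already relies on:

  // Keep the sum in 64 bits, then scale numerator and denominator back into
  // 32-bit range so BranchProbability(Weights[0], WSum) cannot overflow.
  uint64_t WSum =
      std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
                      [](uint64_t A, uint64_t B) { return A + B; });
  uint64_t Scale = calculateCountScale(WSum); // 1 whenever WSum already fits in 32 bits
  BranchProbability BP(scaleBranchCount(Weights[0], Scale),
                       scaleBranchCount(WSum, Scale));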
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index 95eb3680403a..2c71e75dadcc 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -25,6 +25,8 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DomTreeUpdater.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -44,7 +46,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <cstdint>
@@ -112,6 +114,7 @@ private:
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
}
};
} // end anonymous namespace
@@ -133,8 +136,8 @@ namespace {
class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
public:
MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE)
- : Func(Func), BFI(BFI), ORE(ORE), Changed(false) {
+ OptimizationRemarkEmitter &ORE, DominatorTree *DT)
+ : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
ValueDataArray =
llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
// Get the MemOPSize range information from option MemOPSizeRange,
@@ -151,8 +154,9 @@ public:
if (perform(MI)) {
Changed = true;
++NumOfPGOMemOPOpt;
- DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
- << "is Transformed.\n");
+ LLVM_DEBUG(dbgs() << "MemOP call: "
+ << MI->getCalledFunction()->getName()
+ << "is Transformed.\n");
}
}
}
@@ -169,6 +173,7 @@ private:
Function &Func;
BlockFrequencyInfo &BFI;
OptimizationRemarkEmitter &ORE;
+ DominatorTree *DT;
bool Changed;
std::vector<MemIntrinsic *> WorkList;
// Start of the previse range.
@@ -245,9 +250,9 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
}
ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
- DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
- << "\n");
- DEBUG(
+ LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
+ << ActualCount << "\n");
+ LLVM_DEBUG(
for (auto &VD
: VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
@@ -260,8 +265,8 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
TotalCount = ActualCount;
if (MemOPScaleCount)
- DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
- << " denominator = " << SavedTotalCount << "\n");
+ LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
// Keeping track of the count of the default case:
uint64_t RemainCount = TotalCount;
@@ -310,9 +315,9 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
uint64_t SumForOpt = TotalCount - RemainCount;
- DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
- << " Versions (covering " << SumForOpt << " out of "
- << TotalCount << ")\n");
+ LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions (covering " << SumForOpt << " out of "
+ << TotalCount << ")\n");
// mem_op(..., size)
// ==>
@@ -331,19 +336,20 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
// merge_bb:
BasicBlock *BB = MI->getParent();
- DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
- DEBUG(dbgs() << *BB << "\n");
+ LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ LLVM_DEBUG(dbgs() << *BB << "\n");
auto OrigBBFreq = BFI.getBlockFreq(BB);
- BasicBlock *DefaultBB = SplitBlock(BB, MI);
+ BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
BasicBlock::iterator It(*MI);
++It;
assert(It != DefaultBB->end());
- BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
MergeBB->setName("MemOP.Merge");
BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
DefaultBB->setName("MemOP.Default");
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
auto &Ctx = Func.getContext();
IRBuilder<> IRB(BB);
BB->getTerminator()->eraseFromParent();
@@ -358,7 +364,11 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
SavedRemainCount, IPVK_MemOPSize, NumVals);
- DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+ LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DT)
+ Updates.reserve(2 * SizeIds.size());
for (uint64_t SizeId : SizeIds) {
BasicBlock *CaseBB = BasicBlock::Create(
@@ -374,13 +384,20 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
IRBuilder<> IRBCase(CaseBB);
IRBCase.CreateBr(MergeBB);
SI->addCase(CaseSizeId, CaseBB);
- DEBUG(dbgs() << *CaseBB << "\n");
+ if (DT) {
+ Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
+ Updates.push_back({DominatorTree::Insert, BB, CaseBB});
+ }
+ LLVM_DEBUG(dbgs() << *CaseBB << "\n");
}
+ DTU.applyUpdates(Updates);
+ Updates.clear();
+
setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
- DEBUG(dbgs() << *BB << "\n");
- DEBUG(dbgs() << *DefaultBB << "\n");
- DEBUG(dbgs() << *MergeBB << "\n");
+ LLVM_DEBUG(dbgs() << *BB << "\n");
+ LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
+ LLVM_DEBUG(dbgs() << *MergeBB << "\n");
ORE.emit([&]() {
using namespace ore;
@@ -396,13 +413,14 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
} // namespace
static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE) {
+ OptimizationRemarkEmitter &ORE,
+ DominatorTree *DT) {
if (DisableMemOPOPT)
return false;
if (F.hasFnAttribute(Attribute::OptimizeForSize))
return false;
- MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE);
+ MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
MemOPSizeOpt.perform();
return MemOPSizeOpt.isChanged();
}
@@ -411,7 +429,9 @@ bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
BlockFrequencyInfo &BFI =
getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return PGOMemOPSizeOptImpl(F, BFI, ORE);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
}
namespace llvm {
@@ -421,11 +441,13 @@ PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
FunctionAnalysisManager &FAM) {
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
if (!Changed)
return PreservedAnalyses::all();
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
} // namespace llvm
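The PGOMemOPSizeOpt changes thread a DominatorTree through the transform and keep it valid incrementally: SplitBlock now receives DT, and the edges to the newly created switch case blocks are announced through a DomTreeUpdater instead of invalidating the tree. A self-contained sketch of that update pattern, assuming DT may be null when no cached tree is available (as in the new-pass-manager path above); recordNewCaseEdges is a hypothetical helper name:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/DomTreeUpdater.h"
  #include "llvm/IR/Dominators.h"
  #include <vector>
  using namespace llvm;

  // Each new case block adds two edges: Pred -> CaseBB and CaseBB -> MergeBB.
  // With the Eager strategy the tree is patched as soon as applyUpdates runs.
  static void recordNewCaseEdges(DominatorTree *DT, BasicBlock *Pred,
                                 ArrayRef<BasicBlock *> CaseBBs,
                                 BasicBlock *MergeBB) {
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
    std::vector<DominatorTree::UpdateType> Updates;
    if (DT)
      Updates.reserve(2 * CaseBBs.size());
    for (BasicBlock *CaseBB : CaseBBs)
      if (DT) {
        Updates.push_back({DominatorTree::Insert, Pred, CaseBB});
        Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
      }
    DTU.applyUpdates(Updates); // harmless no-op when DT is null
  }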
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index d950e2e730f2..a4dd48c8dd6a 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -35,7 +35,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -243,6 +242,7 @@ private:
GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
GlobalVariable *FunctionPCsArray; // for pc-table.
SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
+ SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
SanitizerCoverageOptions Options;
};
@@ -405,6 +405,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
// so we need to prevent them from being dead stripped.
if (TargetTriple.isOSBinFormatMachO())
appendToUsed(M, GlobalsToAppendToUsed);
+ appendToCompilerUsed(M, GlobalsToAppendToCompilerUsed);
return true;
}
@@ -480,6 +481,8 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
if (F.getName() == "__local_stdio_printf_options" ||
F.getName() == "__local_stdio_scanf_options")
return false;
+ if (isa<UnreachableInst>(F.getEntryBlock().getTerminator()))
+ return false;
// Don't instrument functions using SEH for now. Splitting basic blocks like
// we do for coverage breaks WinEHPrepare.
// FIXME: Remove this when SEH no longer uses landingpad pattern matching.
@@ -592,11 +595,15 @@ void SanitizerCoverageModule::CreateFunctionLocalArrays(
if (Options.Inline8bitCounters) {
Function8bitCounterArray = CreateFunctionLocalArrayInSection(
AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
- GlobalsToAppendToUsed.push_back(Function8bitCounterArray);
+ GlobalsToAppendToCompilerUsed.push_back(Function8bitCounterArray);
+ MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
+ Function8bitCounterArray->addMetadata(LLVMContext::MD_associated, *MD);
}
if (Options.PCTable) {
FunctionPCsArray = CreatePCArray(F, AllBlocks);
- GlobalsToAppendToUsed.push_back(FunctionPCsArray);
+ GlobalsToAppendToCompilerUsed.push_back(FunctionPCsArray);
+ MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
+ FunctionPCsArray->addMetadata(LLVMContext::MD_associated, *MD);
}
}
@@ -659,11 +666,11 @@ void SanitizerCoverageModule::InjectTraceForSwitch(
C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
Initializers.push_back(C);
}
- std::sort(Initializers.begin() + 2, Initializers.end(),
- [](const Constant *A, const Constant *B) {
- return cast<ConstantInt>(A)->getLimitedValue() <
- cast<ConstantInt>(B)->getLimitedValue();
- });
+ llvm::sort(Initializers.begin() + 2, Initializers.end(),
+ [](const Constant *A, const Constant *B) {
+ return cast<ConstantInt>(A)->getLimitedValue() <
+ cast<ConstantInt>(B)->getLimitedValue();
+ });
ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
GlobalVariable *GV = new GlobalVariable(
*CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
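Two related changes in the SanitizerCoverage hunks: the per-function 8-bit counter and PC-table arrays are now recorded in llvm.compiler.used rather than llvm.used (the Mach-O path keeps llvm.used), and each array is tied to its owning function via !associated metadata. Together these keep the compiler from discarding the arrays while still letting section-based linker GC drop an array along with its function. A sketch of that pattern; keepFunctionLocalArrayAlive is a made-up helper name:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
  using namespace llvm;

  static void keepFunctionLocalArrayAlive(
      Function &F, GlobalVariable *Array,
      SmallVectorImpl<GlobalValue *> &GlobalsToAppendToCompilerUsed) {
    // !associated links Array's liveness to F for --gc-sections style GC.
    MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
    Array->addMetadata(LLVMContext::MD_associated, *MD);
    // Collected here, emitted once per module with appendToCompilerUsed(M, ...).
    GlobalsToAppendToCompilerUsed.push_back(Array);
  }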
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index ec6904486e10..fa1e5a157a0f 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -19,13 +19,14 @@
// The rest is handled by the run-time library.
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -44,7 +45,6 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/EscapeEnumerator.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -339,7 +339,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
void ThreadSanitizer::chooseInstructionsToInstrument(
SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
const DataLayout &DL) {
- SmallSet<Value*, 8> WriteTargets;
+ SmallPtrSet<Value*, 8> WriteTargets;
// Iterate from the end.
for (Instruction *I : reverse(Local)) {
if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
@@ -502,7 +502,7 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
if (Idx < 0)
return false;
if (IsWrite && isVtableAccess(I)) {
- DEBUG(dbgs() << " VPTR : " << *I << "\n");
+ LLVM_DEBUG(dbgs() << " VPTR : " << *I << "\n");
Value *StoredValue = cast<StoreInst>(I)->getValueOperand();
// StoredValue may be a vector type if we are storing several vptrs at once.
// In this case, just take the first element of the vector since this is
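Beyond the header reshuffling, the pervasive mechanical change visible here (and in every file of this import) is the rename of the DEBUG(...) macro to LLVM_DEBUG(...), done upstream so the short name no longer collides with other projects' DEBUG macros; behavior is unchanged. Likewise, SmallSet<Value*, 8> over a pointer key is replaced by the SmallPtrSet it already wrapped. For completeness, the renamed macro is still gated the same way:

  // Compiled out of release builds; enabled with -debug or -debug-only=<DEBUG_TYPE>.
  LLVM_DEBUG(dbgs() << "  VPTR : " << *I << "\n");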
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
index 5518b49c4095..9ade14c1177a 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -18,7 +18,7 @@
namespace llvm {
-/// \brief An associative container with fast insertion-order (deterministic)
+/// An associative container with fast insertion-order (deterministic)
/// iteration over its elements. Plus the special blot operation.
template <class KeyT, class ValueT> class BlotMapVector {
/// Map keys to indices in Vector.
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 8cc1232b18ca..0f13b02c806f 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -38,7 +38,7 @@ namespace objcarc {
class ProvenanceAnalysis;
/// \enum DependenceKind
-/// \brief Defines different dependence kinds among various ARC constructs.
+/// Defines different dependence kinds among various ARC constructs.
///
/// There are several kinds of dependence-like concepts in use here.
///
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
index cd9b3d96a14f..1dbe72c7569f 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
@@ -28,13 +28,13 @@
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/Passes.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/ObjCARC.h"
-#include "llvm/Transforms/Utils/Local.h"
namespace llvm {
class raw_ostream;
@@ -43,7 +43,7 @@ class raw_ostream;
namespace llvm {
namespace objcarc {
-/// \brief Erase the given instruction.
+/// Erase the given instruction.
///
/// Many ObjC calls return their argument verbatim,
/// so if it's such a call and the return value has users, replace them with the
@@ -82,6 +82,26 @@ static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
return dyn_cast<InvokeInst>(Opnd);
}
+/// Return the list of PHI nodes that are equivalent to PN.
+template<class PHINodeTy, class VectorTy>
+void getEquivalentPHIs(PHINodeTy &PN, VectorTy &PHIList) {
+ auto *BB = PN.getParent();
+ for (auto &P : BB->phis()) {
+ if (&P == &PN) // Do not add PN to the list.
+ continue;
+ unsigned I = 0, E = PN.getNumIncomingValues();
+ for (; I < E; ++I) {
+ auto *BB = PN.getIncomingBlock(I);
+ auto *PNOpnd = PN.getIncomingValue(I)->stripPointerCasts();
+ auto *POpnd = P.getIncomingValueForBlock(BB)->stripPointerCasts();
+ if (PNOpnd != POpnd)
+ break;
+ }
+ if (I == E)
+ PHIList.push_back(&P);
+ }
+}
+
} // end namespace objcarc
} // end namespace llvm
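getEquivalentPHIs, added above, collects the sibling PHI nodes of PN whose incoming values match PN's on every edge once pointer casts are stripped. The ObjCARC passes later in this import use it so that a rewrite proven valid for one PHI is also applied to its equivalents. A sketch of the call pattern, mirroring the call sites added further down (ReplaceArgUses stands in for whatever per-value rewrite the caller performs):

  SmallVector<Value *, 4> EquivalentPHIs;
  if (auto *PN = dyn_cast<PHINode>(Arg))
    getEquivalentPHIs(*PN, EquivalentPHIs);
  for (Value *PHI : EquivalentPHIs)
    ReplaceArgUses(PHI); // same rewrite that was applied to Arg itself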
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index b2c62a0e8eeb..8d3ef8fde534 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -36,7 +36,7 @@ using namespace llvm::objcarc;
#define DEBUG_TYPE "objc-arc-ap-elim"
namespace {
- /// \brief Autorelease pool elimination.
+ /// Autorelease pool elimination.
class ObjCARCAPElim : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnModule(Module &M) override;
@@ -103,10 +103,12 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
// zap the pair.
if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
Changed = true;
- DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
- "autorelease pair:\n"
- " Pop: " << *Inst << "\n"
- << " Push: " << *Push << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
+ "autorelease pair:\n"
+ " Pop: "
+ << *Inst << "\n"
+ << " Push: " << *Push
+ << "\n");
Inst->eraseFromParent();
Push->eraseFromParent();
}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index c4e61218f3f3..1f1ea9f58739 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -31,6 +31,7 @@
#include "ObjCARC.h"
#include "ProvenanceAnalysis.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Operator.h"
@@ -50,7 +51,7 @@ STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
//===----------------------------------------------------------------------===//
namespace {
- /// \brief Late ARC optimizations
+ /// Late ARC optimizations
///
/// These change the IR in a way that makes it difficult to be analyzed by
/// ObjCARCOpt, so it's run late.
@@ -74,11 +75,12 @@ namespace {
SmallPtrSet<CallInst *, 8> StoreStrongCalls;
/// Returns true if we eliminated Inst.
- bool tryToPeepholeInstruction(Function &F, Instruction *Inst,
- inst_iterator &Iter,
- SmallPtrSetImpl<Instruction *> &DepInsts,
- SmallPtrSetImpl<const BasicBlock *> &Visited,
- bool &TailOkForStoreStrong);
+ bool tryToPeepholeInstruction(
+ Function &F, Instruction *Inst, inst_iterator &Iter,
+ SmallPtrSetImpl<Instruction *> &DepInsts,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ bool &TailOkForStoreStrong,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors);
bool optimizeRetainCall(Function &F, Instruction *Retain);
@@ -88,8 +90,9 @@ namespace {
SmallPtrSetImpl<Instruction *> &DependingInstructions,
SmallPtrSetImpl<const BasicBlock *> &Visited);
- void tryToContractReleaseIntoStoreStrong(Instruction *Release,
- inst_iterator &Iter);
+ void tryToContractReleaseIntoStoreStrong(
+ Instruction *Release, inst_iterator &Iter,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors);
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool doInitialization(Module &M) override;
@@ -129,16 +132,18 @@ bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Transforming objc_retain => "
- "objc_retainAutoreleasedReturnValue since the operand is a "
- "return value.\nOld: "<< *Retain << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_retain => "
+ "objc_retainAutoreleasedReturnValue since the operand is a "
+ "return value.\nOld: "
+ << *Retain << "\n");
// We do not have to worry about tail calls/does not throw since
// retain/retainRV have the same properties.
Constant *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
cast<CallInst>(Retain)->setCalledFunction(Decl);
- DEBUG(dbgs() << "New: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << "New: " << *Retain << "\n");
return true;
}
@@ -177,16 +182,19 @@ bool ObjCARCContract::contractAutorelease(
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << " Fusing retain/autorelease!\n"
- " Autorelease:" << *Autorelease << "\n"
- " Retain: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << " Fusing retain/autorelease!\n"
+ " Autorelease:"
+ << *Autorelease
+ << "\n"
+ " Retain: "
+ << *Retain << "\n");
Constant *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
: ARCRuntimeEntryPointKind::RetainAutorelease);
Retain->setCalledFunction(Decl);
- DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
EraseInstruction(Autorelease);
return true;
@@ -303,6 +311,24 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
return Retain;
}
+/// Create a call instruction with the correct funclet token. Should be used
+/// instead of calling CallInst::Create directly.
+static CallInst *
+createCallInst(Value *Func, ArrayRef<Value *> Args, const Twine &NameStr,
+ Instruction *InsertBefore,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(Func, Args, OpBundles, NameStr, InsertBefore);
+}
+
/// Attempt to merge an objc_release with a store, load, and objc_retain to form
/// an objc_storeStrong. An objc_storeStrong:
///
@@ -330,8 +356,9 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
/// (4).
/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
/// safe.
-void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
- inst_iterator &Iter) {
+void ObjCARCContract::tryToContractReleaseIntoStoreStrong(
+ Instruction *Release, inst_iterator &Iter,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
// See if we are releasing something that we just loaded.
auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
if (!Load || !Load->isSimple())
@@ -365,7 +392,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
Changed = true;
++NumStoreStrongs;
- DEBUG(
+ LLVM_DEBUG(
llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n"
<< " Old:\n"
<< " Store: " << *Store << "\n"
@@ -383,7 +410,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
if (Args[1]->getType() != I8X)
Args[1] = new BitCastInst(Args[1], I8X, "", Store);
Constant *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
- CallInst *StoreStrong = CallInst::Create(Decl, Args, "", Store);
+ CallInst *StoreStrong = createCallInst(Decl, Args, "", Store, BlockColors);
StoreStrong->setDoesNotThrow();
StoreStrong->setDebugLoc(Store->getDebugLoc());
@@ -392,7 +419,8 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
// we can set the tail flag once we know it's safe.
StoreStrongCalls.insert(StoreStrong);
- DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong << "\n");
+ LLVM_DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong
+ << "\n");
if (&*Iter == Retain) ++Iter;
if (&*Iter == Store) ++Iter;
@@ -407,7 +435,8 @@ bool ObjCARCContract::tryToPeepholeInstruction(
Function &F, Instruction *Inst, inst_iterator &Iter,
SmallPtrSetImpl<Instruction *> &DependingInsts,
SmallPtrSetImpl<const BasicBlock *> &Visited,
- bool &TailOkForStoreStrongs) {
+ bool &TailOkForStoreStrongs,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
// Only these library routines return their argument. In particular,
// objc_retainBlock does not necessarily return its argument.
ARCInstKind Class = GetBasicARCInstKind(Inst);
@@ -449,15 +478,16 @@ bool ObjCARCContract::tryToPeepholeInstruction(
} while (IsNoopInstruction(&*BBI));
if (&*BBI == GetArgRCIdentityRoot(Inst)) {
- DEBUG(dbgs() << "Adding inline asm marker for the return value "
- "optimization.\n");
+ LLVM_DEBUG(dbgs() << "Adding inline asm marker for the return value "
+ "optimization.\n");
Changed = true;
InlineAsm *IA = InlineAsm::get(
FunctionType::get(Type::getVoidTy(Inst->getContext()),
/*isVarArg=*/false),
RVInstMarker->getString(),
/*Constraints=*/"", /*hasSideEffects=*/true);
- CallInst::Create(IA, "", Inst);
+
+ createCallInst(IA, None, "", Inst, BlockColors);
}
decline_rv_optimization:
return false;
@@ -471,8 +501,8 @@ bool ObjCARCContract::tryToPeepholeInstruction(
Changed = true;
new StoreInst(Null, CI->getArgOperand(0), CI);
- DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
- << " New = " << *Null << "\n");
+ LLVM_DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
+ << " New = " << *Null << "\n");
CI->replaceAllUsesWith(Null);
CI->eraseFromParent();
@@ -482,7 +512,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(
case ARCInstKind::Release:
// Try to form an objc store strong from our release. If we fail, there is
// nothing further to do below, so continue.
- tryToContractReleaseIntoStoreStrong(Inst, Iter);
+ tryToContractReleaseIntoStoreStrong(Inst, Iter, BlockColors);
return true;
case ARCInstKind::User:
// Be conservative if the function has any alloca instructions.
@@ -518,7 +548,12 @@ bool ObjCARCContract::runOnFunction(Function &F) {
PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
- DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ LLVM_DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
// Track whether it's ok to mark objc_storeStrong calls with the "tail"
// keyword. Be conservative if the function has variadic arguments.
@@ -536,12 +571,12 @@ bool ObjCARCContract::runOnFunction(Function &F) {
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
Instruction *Inst = &*I++;
- DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
// First try to peephole Inst. If there is nothing further we can do in
// terms of undoing objc-arc-expand, process the next inst.
if (tryToPeepholeInstruction(F, Inst, I, DependingInstructions, Visited,
- TailOkForStoreStrongs))
+ TailOkForStoreStrongs, BlockColors))
continue;
// Otherwise, try to undo objc-arc-expand.
@@ -568,35 +603,48 @@ bool ObjCARCContract::runOnFunction(Function &F) {
// trivially dominate itself, which would lead us to rewriting its
// argument in terms of its return value, which would lead to
// infinite loops in GetArgRCIdentityRoot.
- if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
- Changed = true;
- Instruction *Replacement = Inst;
- Type *UseTy = U.get()->getType();
- if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
- // For PHI nodes, insert the bitcast in the predecessor block.
- unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
- BasicBlock *BB = PHI->getIncomingBlock(ValNo);
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- &BB->back());
- // While we're here, rewrite all edges for this PHI, rather
- // than just one use at a time, to minimize the number of
- // bitcasts we emit.
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
- if (PHI->getIncomingBlock(i) == BB) {
- // Keep the UI iterator valid.
- if (UI != UE &&
- &PHI->getOperandUse(
- PHINode::getOperandNumForIncomingValue(i)) == &*UI)
- ++UI;
- PHI->setIncomingValue(i, Replacement);
- }
- } else {
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- cast<Instruction>(U.getUser()));
- U.set(Replacement);
+ if (!DT->isReachableFromEntry(U) || !DT->dominates(Inst, U))
+ continue;
+
+ Changed = true;
+ Instruction *Replacement = Inst;
+ Type *UseTy = U.get()->getType();
+ if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
+ // For PHI nodes, insert the bitcast in the predecessor block.
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(ValNo);
+ if (Replacement->getType() != UseTy) {
+ // A catchswitch is both a pad and a terminator, meaning a basic
+ // block with a catchswitch has no insertion point. Keep going up
+ // the dominator tree until we find a non-catchswitch.
+ BasicBlock *InsertBB = IncomingBB;
+ while (isa<CatchSwitchInst>(InsertBB->getFirstNonPHI())) {
+ InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock();
+ }
+
+ assert(DT->dominates(Inst, &InsertBB->back()) &&
+ "Invalid insertion point for bitcast");
+ Replacement =
+ new BitCastInst(Replacement, UseTy, "", &InsertBB->back());
}
+
+ // While we're here, rewrite all edges for this PHI, rather
+ // than just one use at a time, to minimize the number of
+ // bitcasts we emit.
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ // Keep the UI iterator valid.
+ if (UI != UE &&
+ &PHI->getOperandUse(
+ PHINode::getOperandNumForIncomingValue(i)) == &*UI)
+ ++UI;
+ PHI->setIncomingValue(i, Replacement);
+ }
+ } else {
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ cast<Instruction>(U.getUser()));
+ U.set(Replacement);
}
}
};
@@ -618,8 +666,17 @@ bool ObjCARCContract::runOnFunction(Function &F) {
else if (isa<GlobalAlias>(Arg) &&
!cast<GlobalAlias>(Arg)->isInterposable())
Arg = cast<GlobalAlias>(Arg)->getAliasee();
- else
+ else {
+ // If Arg is a PHI node, get PHIs that are equivalent to it and replace
+ // their uses.
+ if (PHINode *PN = dyn_cast<PHINode>(Arg)) {
+ SmallVector<Value *, 1> PHIList;
+ getEquivalentPHIs(*PN, PHIList);
+ for (Value *PHI : PHIList)
+ ReplaceArgUses(PHI);
+ }
break;
+ }
}
// Replace bitcast users of Arg that are dominated by Inst.
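The ObjCARCContract changes above make the pass funclet-aware for scoped (Windows) EH personalities: blocks are colored once with colorEHFunclets, and every call the pass synthesizes (the objc_storeStrong call, the retainRV inline-asm marker) goes through createCallInst so it carries a "funclet" operand bundle naming its enclosing pad; without the bundle, WinEH lowering cannot attribute the new call to its funclet. A condensed sketch of the pattern, reusing names from the hunks (F, Callee, Args, InsertPt) rather than standalone code:

  // Once per function: only scoped personalities need funclet coloring.
  DenseMap<BasicBlock *, ColorVector> BlockColors;
  if (F.hasPersonalityFn() &&
      isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
    BlockColors = colorEHFunclets(F);

  // Per synthesized call: tag it with the pad of the block it is inserted into.
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (!BlockColors.empty()) {
    const ColorVector &CV = BlockColors.find(InsertPt->getParent())->second;
    Instruction *EHPad = CV.front()->getFirstNonPHI();
    if (EHPad->isEHPad())
      OpBundles.emplace_back("funclet", EHPad);
  }
  CallInst *NewCall = CallInst::Create(Callee, Args, OpBundles, "", InsertPt);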
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index bb6a0a0e73db..6a345ef56e1b 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -47,7 +47,7 @@ using namespace llvm;
using namespace llvm::objcarc;
namespace {
- /// \brief Early ARC transformations.
+ /// Early ARC transformations.
class ObjCARCExpand : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool doInitialization(Module &M) override;
@@ -91,12 +91,13 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
bool Changed = false;
- DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName()
+ << "\n");
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
Instruction *Inst = &*I;
- DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
switch (GetBasicARCInstKind(Inst)) {
case ARCInstKind::Retain:
@@ -111,8 +112,10 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
// emitted here. We'll redo them in the contract pass.
Changed = true;
Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
- DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n"
- " New = " << *Value << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst
+ << "\n"
+ " New = "
+ << *Value << "\n");
Inst->replaceAllUsesWith(Value);
break;
}
@@ -121,7 +124,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
}
}
- DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 99ed6863c22e..21e2848030fc 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -38,6 +38,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
@@ -76,7 +77,7 @@ using namespace llvm::objcarc;
/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
/// @{
-/// \brief This is similar to GetRCIdentityRoot but it stops as soon
+/// This is similar to GetRCIdentityRoot but it stops as soon
/// as it finds a value with multiple uses.
static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
// ConstantData (like ConstantPointerNull and UndefValue) is used across
@@ -174,7 +175,7 @@ STATISTIC(NumReleasesAfterOpt,
namespace {
- /// \brief Per-BasicBlock state.
+ /// Per-BasicBlock state.
class BBState {
/// The number of unique control paths from the entry which can reach this
/// block.
@@ -422,7 +423,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
// Dump the pointers we are tracking.
OS << " TopDown State:\n";
if (!BBInfo.hasTopDownPtrs()) {
- DEBUG(dbgs() << " NONE!\n");
+ LLVM_DEBUG(dbgs() << " NONE!\n");
} else {
for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
I != E; ++I) {
@@ -442,7 +443,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
OS << " BottomUp State:\n";
if (!BBInfo.hasBottomUpPtrs()) {
- DEBUG(dbgs() << " NONE!\n");
+ LLVM_DEBUG(dbgs() << " NONE!\n");
} else {
for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
I != E; ++I) {
@@ -465,7 +466,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
namespace {
- /// \brief The main ARC optimization pass.
+ /// The main ARC optimization pass.
class ObjCARCOpt : public FunctionPass {
bool Changed;
ProvenanceAnalysis PA;
@@ -612,8 +613,8 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n"
- << "Erasing " << *RetainRV << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n"
+ << "Erasing " << *RetainRV << "\n");
EraseInstruction(&*I);
EraseInstruction(RetainRV);
@@ -625,14 +626,15 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
- "objc_retain since the operand is not a return value.\n"
- "Old = " << *RetainRV << "\n");
+ LLVM_DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
+ "objc_retain since the operand is not a return value.\n"
+ "Old = "
+ << *RetainRV << "\n");
Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
- DEBUG(dbgs() << "New = " << *RetainRV << "\n");
+ LLVM_DEBUG(dbgs() << "New = " << *RetainRV << "\n");
return false;
}
@@ -652,6 +654,11 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
SmallVector<const Value *, 2> Users;
Users.push_back(Ptr);
+
+ // Add PHIs that are equivalent to Ptr to Users.
+ if (const PHINode *PN = dyn_cast<PHINode>(Ptr))
+ getEquivalentPHIs(*PN, Users);
+
do {
Ptr = Users.pop_back_val();
for (const User *U : Ptr->users()) {
@@ -665,10 +672,12 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Transforming objc_autoreleaseReturnValue => "
- "objc_autorelease since its operand is not used as a return "
- "value.\n"
- "Old = " << *AutoreleaseRV << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_autoreleaseReturnValue => "
+ "objc_autorelease since its operand is not used as a return "
+ "value.\n"
+ "Old = "
+ << *AutoreleaseRV << "\n");
CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
@@ -676,23 +685,53 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
Class = ARCInstKind::Autorelease;
- DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+ LLVM_DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+}
+
+namespace {
+Instruction *
+CloneCallInstForBB(CallInst &CI, BasicBlock &BB,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned I = 0, E = CI.getNumOperandBundles(); I != E; ++I) {
+ auto Bundle = CI.getOperandBundleAt(I);
+ // Funclets will be reassociated in the future.
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&BB)->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(&CI, OpBundles);
+}
}
/// Visit each call, one at a time, and make simplifications without doing any
/// additional analysis.
void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
// Reset all the flags in preparation for recomputing them.
UsedInThisFunction = 0;
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
// Visit all objc_* calls in F.
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
Instruction *Inst = &*I++;
ARCInstKind Class = GetBasicARCInstKind(Inst);
- DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
switch (Class) {
default: break;
@@ -708,7 +747,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
case ARCInstKind::NoopCast:
Changed = true;
++NumNoops;
- DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
EraseInstruction(Inst);
continue;
@@ -726,8 +765,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
Constant::getNullValue(Ty),
CI);
Value *NewValue = UndefValue::get(CI->getType());
- DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
CI->replaceAllUsesWith(NewValue);
CI->eraseFromParent();
continue;
@@ -746,8 +787,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
CI);
Value *NewValue = UndefValue::get(CI->getType());
- DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
CI->replaceAllUsesWith(NewValue);
CI->eraseFromParent();
@@ -782,9 +825,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
MDNode::get(C, None));
- DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
- "since x is otherwise unused.\nOld: " << *Call << "\nNew: "
- << *NewCall << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
+ "since x is otherwise unused.\nOld: "
+ << *Call << "\nNew: " << *NewCall << "\n");
EraseInstruction(Call);
Inst = NewCall;
@@ -796,8 +840,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
// a tail keyword.
if (IsAlwaysTail(Class)) {
Changed = true;
- DEBUG(dbgs() << "Adding tail keyword to function since it can never be "
- "passed stack args: " << *Inst << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Adding tail keyword to function since it can never be "
+ "passed stack args: "
+ << *Inst << "\n");
cast<CallInst>(Inst)->setTailCall();
}
@@ -805,16 +851,16 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
// semantics of ARC truly do not do so.
if (IsNeverTail(Class)) {
Changed = true;
- DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst <<
- "\n");
+ LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst
+ << "\n");
cast<CallInst>(Inst)->setTailCall(false);
}
// Set nounwind as needed.
if (IsNoThrow(Class)) {
Changed = true;
- DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: "
+ << *Inst << "\n");
cast<CallInst>(Inst)->setDoesNotThrow();
}
@@ -829,8 +875,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
if (IsNullOrUndef(Arg)) {
Changed = true;
++NumNoops;
- DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
- << "\n");
+ LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
+ << "\n");
EraseInstruction(Inst);
continue;
}
@@ -922,22 +968,24 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
Value *Incoming =
GetRCIdentityRoot(PN->getIncomingValue(i));
if (!IsNullOrUndef(Incoming)) {
- CallInst *Clone = cast<CallInst>(CInst->clone());
Value *Op = PN->getIncomingValue(i);
Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+ CallInst *Clone = cast<CallInst>(CloneCallInstForBB(
+ *CInst, *InsertPos->getParent(), BlockColors));
if (Op->getType() != ParamTy)
Op = new BitCastInst(Op, ParamTy, "", InsertPos);
Clone->setArgOperand(0, Op);
Clone->insertBefore(InsertPos);
- DEBUG(dbgs() << "Cloning "
- << *CInst << "\n"
- "And inserting clone at " << *InsertPos << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning " << *CInst
+ << "\n"
+ "And inserting clone at "
+ << *InsertPos << "\n");
Worklist.push_back(std::make_pair(Clone, Incoming));
}
}
// Erase the original call.
- DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
EraseInstruction(CInst);
continue;
}
@@ -1114,7 +1162,7 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
ARCInstKind Class = GetARCInstKind(Inst);
const Value *Arg = nullptr;
- DEBUG(dbgs() << " Class: " << Class << "\n");
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
switch (Class) {
case ARCInstKind::Release: {
@@ -1137,7 +1185,7 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
// Don't do retain+release tracking for ARCInstKind::RetainRV, because
// it's better to let it remain as the first instruction after a call.
if (Class != ARCInstKind::RetainRV) {
- DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
Retains[Inst] = S.GetRRInfo();
}
S.ClearSequenceProgress();
@@ -1179,7 +1227,7 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
bool NestingDetected = false;
BBState &MyStates = BBStates[BB];
@@ -1202,8 +1250,9 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
}
}
- DEBUG(dbgs() << "Before:\n" << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
// Visit all the instructions, bottom-up.
for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
@@ -1213,7 +1262,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
if (isa<InvokeInst>(Inst))
continue;
- DEBUG(dbgs() << " Visiting " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Visiting " << *Inst << "\n");
NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
}
@@ -1228,7 +1277,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
}
- DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
+ LLVM_DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
return NestingDetected;
}
@@ -1241,7 +1290,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
ARCInstKind Class = GetARCInstKind(Inst);
const Value *Arg = nullptr;
- DEBUG(dbgs() << " Class: " << Class << "\n");
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
switch (Class) {
case ARCInstKind::RetainBlock:
@@ -1267,7 +1316,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
if (S.MatchWithRelease(MDKindCache, Inst)) {
// If we succeed, copy S's RRInfo into the Release -> {Retain Set
// Map}. Then we clear S.
- DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
Releases[Inst] = S.GetRRInfo();
S.ClearSequenceProgress();
}
@@ -1307,7 +1356,7 @@ bool
ObjCARCOpt::VisitTopDown(BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
DenseMap<Value *, RRInfo> &Releases) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
bool NestingDetected = false;
BBState &MyStates = BBStates[BB];
@@ -1329,20 +1378,21 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
}
}
- DEBUG(dbgs() << "Before:\n" << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
// Visit all the instructions, top-down.
for (Instruction &Inst : *BB) {
- DEBUG(dbgs() << " Visiting " << Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Visiting " << Inst << "\n");
NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);
}
- DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
- << BBStates[BB] << "\n\n");
+ LLVM_DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
+ << BBStates[BB] << "\n\n");
CheckForCFGHazards(BB, BBStates, MyStates);
- DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
+ LLVM_DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
return NestingDetected;
}
@@ -1465,7 +1515,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
Type *ArgTy = Arg->getType();
Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
- DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
+ LLVM_DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
// Insert the new retain and release calls.
for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
@@ -1476,8 +1526,10 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
Call->setDoesNotThrow();
Call->setTailCall();
- DEBUG(dbgs() << "Inserting new Retain: " << *Call << "\n"
- "At insertion point: " << *InsertPt << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting new Retain: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
}
for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
Value *MyArg = ArgTy == ParamTy ? Arg :
@@ -1491,20 +1543,22 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
if (ReleasesToMove.IsTailCallRelease)
Call->setTailCall();
- DEBUG(dbgs() << "Inserting new Release: " << *Call << "\n"
- "At insertion point: " << *InsertPt << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting new Release: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
}
// Delete the original retain and release calls.
for (Instruction *OrigRetain : RetainsToMove.Calls) {
Retains.blot(OrigRetain);
DeadInsts.push_back(OrigRetain);
- DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
}
for (Instruction *OrigRelease : ReleasesToMove.Calls) {
Releases.erase(OrigRelease);
DeadInsts.push_back(OrigRelease);
- DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
}
}
@@ -1538,6 +1592,7 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
assert(It != Retains.end());
const RRInfo &NewRetainRRI = It->second;
KnownSafeTD &= NewRetainRRI.KnownSafe;
+ CFGHazardAfflicted |= NewRetainRRI.CFGHazardAfflicted;
for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
auto Jt = Releases.find(NewRetainRelease);
if (Jt == Releases.end())
@@ -1710,7 +1765,7 @@ bool ObjCARCOpt::PerformCodePlacement(
DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
bool AnyPairsCompletelyEliminated = false;
SmallVector<Instruction *, 8> DeadInsts;
@@ -1724,7 +1779,7 @@ bool ObjCARCOpt::PerformCodePlacement(
Instruction *Retain = cast<Instruction>(V);
- DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
Value *Arg = GetArgRCIdentityRoot(Retain);
@@ -1769,7 +1824,7 @@ bool ObjCARCOpt::PerformCodePlacement(
/// Weak pointer optimizations.
void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
// First, do memdep-style RLE and S2L optimizations. We can't use memdep
// itself because it uses AliasAnalysis and we need to do provenance
@@ -1777,7 +1832,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
Instruction *Inst = &*I++;
- DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
ARCInstKind Class = GetBasicARCInstKind(Inst);
if (Class != ARCInstKind::LoadWeak &&
@@ -2036,7 +2091,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
if (!F.getReturnType()->isPointerTy())
return;
- DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
SmallPtrSet<Instruction *, 4> DependingInstructions;
SmallPtrSet<const BasicBlock *, 4> Visited;
@@ -2045,7 +2100,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
if (!Ret)
continue;
- DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
@@ -2083,8 +2138,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
// If so, we can zap the retain and autorelease.
Changed = true;
++NumRets;
- DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: "
- << *Autorelease << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: " << *Autorelease
+ << "\n");
EraseInstruction(Retain);
EraseInstruction(Autorelease);
}
@@ -2144,8 +2199,9 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
Changed = false;
- DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>"
- "\n");
+ LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName()
+ << " >>>"
+ "\n");
PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
@@ -2193,7 +2249,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
}
#endif
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
return Changed;
}
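
The new CloneCallInstForBB helper above ensures that a call cloned into a block colored by a funclet-based (scoped) EH personality carries a "funclet" operand bundle naming that funclet's pad. A stand-alone sketch of the bundle-retagging idea follows; retagFunclet is an illustrative name, and the pad is assumed to be supplied by the caller (the pass derives it from colorEHFunclets block colors).

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"

// Re-create Orig with its operand bundles, dropping any existing "funclet"
// bundle and attaching one that names EHPad instead. The clone is returned
// uninserted; the caller decides where it goes.
static llvm::CallInst *retagFunclet(llvm::CallInst &Orig,
                                    llvm::Instruction *EHPad) {
  llvm::SmallVector<llvm::OperandBundleDef, 2> Bundles;
  for (unsigned I = 0, E = Orig.getNumOperandBundles(); I != E; ++I) {
    auto Bundle = Orig.getOperandBundleAt(I);
    if (Bundle.getTagID() == llvm::LLVMContext::OB_funclet)
      continue; // replaced below
    Bundles.emplace_back(Bundle);
  }
  if (EHPad && EHPad->isEHPad())
    Bundles.emplace_back("funclet", EHPad);
  return llvm::CallInst::Create(&Orig, Bundles);
}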
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
index f89fc8eb62aa..3004fffb9745 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -115,14 +115,6 @@ static bool IsStoredObjCPointer(const Value *P) {
bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B,
const DataLayout &DL) {
- // Skip past provenance pass-throughs.
- A = GetUnderlyingObjCPtr(A, DL);
- B = GetUnderlyingObjCPtr(B, DL);
-
- // Quick check.
- if (A == B)
- return true;
-
// Ask regular AliasAnalysis, for a first approximation.
switch (AA->alias(A, B)) {
case NoAlias:
@@ -171,6 +163,13 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B,
bool ProvenanceAnalysis::related(const Value *A, const Value *B,
const DataLayout &DL) {
+ A = GetUnderlyingObjCPtrCached(A, DL, UnderlyingObjCPtrCache);
+ B = GetUnderlyingObjCPtrCached(B, DL, UnderlyingObjCPtrCache);
+
+ // Quick check.
+ if (A == B)
+ return true;
+
// Begin by inserting a conservative value into the map. If the insertion
// fails, we have the answer already. If it succeeds, leave it there until we
// compute the real answer to guard against recursive queries.
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 5e676167a6a1..1276f564a022 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -28,6 +28,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/ValueHandle.h"
#include <utility>
namespace llvm {
@@ -39,7 +40,7 @@ class Value;
namespace objcarc {
-/// \brief This is similar to BasicAliasAnalysis, and it uses many of the same
+/// This is similar to BasicAliasAnalysis, and it uses many of the same
/// techniques, except it uses special ObjC-specific reasoning about pointer
/// relationships.
///
@@ -56,6 +57,8 @@ class ProvenanceAnalysis {
CachedResultsTy CachedResults;
+ DenseMap<const Value *, WeakTrackingVH> UnderlyingObjCPtrCache;
+
bool relatedCheck(const Value *A, const Value *B, const DataLayout &DL);
bool relatedSelect(const SelectInst *A, const Value *B);
bool relatedPHI(const PHINode *A, const Value *B);
@@ -73,6 +76,7 @@ public:
void clear() {
CachedResults.clear();
+ UnderlyingObjCPtrCache.clear();
}
};
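
The ProvenanceAnalysis change hoists the underlying-object walk out of relatedCheck into related and memoizes it through the new UnderlyingObjCPtrCache member. A sketch of that caching idiom, assuming GetUnderlyingObjCPtr from ObjCARCAnalysisUtils.h as the uncached walk; getCachedRoot is an illustrative name, and WeakTrackingVH is what keeps a stale entry from dangling if the cached Value is later deleted.

#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/ValueHandle.h"

using namespace llvm;

static const Value *
getCachedRoot(const Value *V, const DataLayout &DL,
              DenseMap<const Value *, WeakTrackingVH> &Cache) {
  // Hit: the handle is still live, so reuse the previously computed root.
  if (Value *Cached = Cache.lookup(V))
    return Cached;
  const Value *Root = objcarc::GetUnderlyingObjCPtr(V, DL);
  Cache[V] = const_cast<Value *>(Root); // value handles hold non-const Value*
  return Root;
}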
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
index e1774b88fd35..8a7b6a74fae2 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
@@ -126,22 +126,23 @@ bool RRInfo::Merge(const RRInfo &Other) {
//===----------------------------------------------------------------------===//
void PtrState::SetKnownPositiveRefCount() {
- DEBUG(dbgs() << " Setting Known Positive.\n");
+ LLVM_DEBUG(dbgs() << " Setting Known Positive.\n");
KnownPositiveRefCount = true;
}
void PtrState::ClearKnownPositiveRefCount() {
- DEBUG(dbgs() << " Clearing Known Positive.\n");
+ LLVM_DEBUG(dbgs() << " Clearing Known Positive.\n");
KnownPositiveRefCount = false;
}
void PtrState::SetSeq(Sequence NewSeq) {
- DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq << "\n");
+ LLVM_DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq
+ << "\n");
Seq = NewSeq;
}
void PtrState::ResetSequenceProgress(Sequence NewSeq) {
- DEBUG(dbgs() << " Resetting sequence progress.\n");
+ LLVM_DEBUG(dbgs() << " Resetting sequence progress.\n");
SetSeq(NewSeq);
Partial = false;
RRI.clear();
@@ -184,7 +185,8 @@ bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
// simple and avoids adding overhead for the non-nested case.
bool NestingDetected = false;
if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
- DEBUG(dbgs() << " Found nested releases (i.e. a release pair)\n");
+ LLVM_DEBUG(
+ dbgs() << " Found nested releases (i.e. a release pair)\n");
NestingDetected = true;
}
@@ -234,8 +236,8 @@ bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
if (!CanAlterRefCount(Inst, Ptr, PA, Class))
return false;
- DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; "
+ << *Ptr << "\n");
switch (S) {
case S_Use:
SetSeq(S_CanRelease);
@@ -266,6 +268,11 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
if (isa<InvokeInst>(Inst)) {
const auto IP = BB->getFirstInsertionPt();
InsertAfter = IP == BB->end() ? std::prev(BB->end()) : IP;
+ if (isa<CatchSwitchInst>(InsertAfter))
+ // A catchswitch must be the only non-phi instruction in its basic
+ // block, so attempting to insert an instruction into such a block would
+ // produce invalid IR.
+ SetCFGHazardAfflicted(true);
} else {
InsertAfter = std::next(Inst->getIterator());
}
@@ -277,26 +284,26 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
case S_Release:
case S_MovableRelease:
if (CanUse(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
SetSeqAndInsertReverseInsertPt(S_Use);
} else if (Seq == S_Release && IsUser(Class)) {
- DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
// Non-movable releases depend on any possible objc pointer use.
SetSeqAndInsertReverseInsertPt(S_Stop);
} else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
- DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
SetSeqAndInsertReverseInsertPt(S_Stop);
}
}
break;
case S_Stop:
if (CanUse(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
SetSeq(S_Use);
}
break;
@@ -377,8 +384,8 @@ bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
Class != ARCInstKind::IntrinsicUser)
return false;
- DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
ClearKnownPositiveRefCount();
switch (GetSeq()) {
case S_Retain:
@@ -410,8 +417,8 @@ void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
case S_CanRelease:
if (!CanUse(Inst, Ptr, PA, Class))
return;
- DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
SetSeq(S_Use);
return;
case S_Retain:
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
index e1e95afcf76b..f5b9b853d8e3 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
@@ -36,7 +36,7 @@ class ProvenanceAnalysis;
/// \enum Sequence
///
-/// \brief A sequence of states that a pointer may go through in which an
+/// A sequence of states that a pointer may go through in which an
/// objc_retain and objc_release are actually needed.
enum Sequence {
S_None,
@@ -51,7 +51,7 @@ enum Sequence {
raw_ostream &operator<<(raw_ostream &OS,
const Sequence S) LLVM_ATTRIBUTE_UNUSED;
-/// \brief Unidirectional information about either a
+/// Unidirectional information about either a
/// retain-decrement-use-release sequence or release-use-decrement-retain
/// reverse sequence.
struct RRInfo {
@@ -97,7 +97,7 @@ struct RRInfo {
bool Merge(const RRInfo &Other);
};
-/// \brief This class summarizes several per-pointer runtime properties which
+/// This class summarizes several per-pointer runtime properties which
/// are propagated through the flow graph.
class PtrState {
protected:
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index 1e683db50206..ce09a477b5f5 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -174,8 +174,8 @@ class AggressiveDeadCodeElimination {
/// marked live.
void markLiveBranchesFromControlDependences();
- /// Remove instructions not marked live, return if any any instruction
- /// was removed.
+ /// Remove instructions not marked live, return if any instruction was
+ /// removed.
bool removeDeadInstructions();
/// Identify connected sections of the control flow graph which have
@@ -298,8 +298,8 @@ void AggressiveDeadCodeElimination::initialize() {
auto &Info = BlockInfo[BB];
// Real function return
if (isa<ReturnInst>(Info.Terminator)) {
- DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
- << '\n';);
+ LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
+ << '\n';);
continue;
}
@@ -356,7 +356,7 @@ void AggressiveDeadCodeElimination::markLiveInstructions() {
// where we need to mark the inputs as live.
while (!Worklist.empty()) {
Instruction *LiveInst = Worklist.pop_back_val();
- DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+ LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););
for (Use &OI : LiveInst->operands())
if (Instruction *Inst = dyn_cast<Instruction>(OI))
@@ -378,7 +378,7 @@ void AggressiveDeadCodeElimination::markLive(Instruction *I) {
if (Info.Live)
return;
- DEBUG(dbgs() << "mark live: "; I->dump());
+ LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
Info.Live = true;
Worklist.push_back(I);
@@ -402,7 +402,7 @@ void AggressiveDeadCodeElimination::markLive(Instruction *I) {
void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
if (BBInfo.Live)
return;
- DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
BBInfo.Live = true;
if (!BBInfo.CFLive) {
BBInfo.CFLive = true;
@@ -463,7 +463,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
if (BlocksWithDeadTerminators.empty())
return;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "new live blocks:\n";
for (auto *BB : NewLiveBlocks)
dbgs() << "\t" << BB->getName() << '\n';
@@ -487,7 +487,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
// Dead terminators which control live blocks are now marked live.
for (auto *BB : IDFBlocks) {
- DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
markLive(BB->getTerminator());
}
}
@@ -501,7 +501,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// Updates control and dataflow around dead blocks
updateDeadRegions();
- DEBUG({
+ LLVM_DEBUG({
for (Instruction &I : instructions(F)) {
// Check if the instruction is alive.
if (isLive(&I))
@@ -555,7 +555,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// A dead region is the set of dead blocks with a common live post-dominator.
void AggressiveDeadCodeElimination::updateDeadRegions() {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "final dead terminator blocks: " << '\n';
for (auto *BB : BlocksWithDeadTerminators)
dbgs() << '\t' << BB->getName()
@@ -607,8 +607,9 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// It might have happened that the same successor appeared multiple times
// and the CFG edge wasn't really removed.
if (Succ != PreferredSucc->BB) {
- DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
- << BB->getName() << " -> " << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
+ << BB->getName() << " -> " << Succ->getName()
+ << "\n");
DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
}
}
@@ -652,7 +653,7 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
InstInfo[PredTerm].Live = true;
return;
}
- DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
NumBranchesRemoved += 1;
IRBuilder<> Builder(PredTerm);
auto *NewTerm = Builder.CreateBr(Target);
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 99480f12da9e..fa7bcec677f7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -98,8 +98,8 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
- DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
- *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+ LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
+ << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
if (const SCEVConstant *ConstDUSCEV =
dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
@@ -139,12 +139,12 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
// address. This address is displaced by the provided offset.
DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
- DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
- *AlignSCEV << " and offset " << *OffSCEV <<
- " using diff " << *DiffSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
+ << *AlignSCEV << " and offset " << *OffSCEV
+ << " using diff " << *DiffSCEV << "\n");
unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
- DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
if (NewAlignment) {
return NewAlignment;
@@ -160,8 +160,8 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
- DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
- *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
+ << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
// Now compute the new alignment using the displacement to the value in the
// first iteration, and also the alignment using the per-iteration delta.
@@ -170,26 +170,26 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
- DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
- DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
if (!NewAlignment || !NewIncAlignment) {
return 0;
} else if (NewAlignment > NewIncAlignment) {
if (NewAlignment % NewIncAlignment == 0) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewIncAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewIncAlignment
+ << "\n");
return NewIncAlignment;
}
} else if (NewIncAlignment > NewAlignment) {
if (NewIncAlignment % NewAlignment == 0) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+ << "\n");
return NewAlignment;
}
} else if (NewIncAlignment == NewAlignment) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+ << "\n");
return NewAlignment;
}
}
@@ -339,55 +339,24 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
MI->getDest(), SE);
- // For memory transfers, we need a common alignment for both the
- // source and destination. If we have a new alignment for this
- // instruction, but only for one operand, save it. If we reach the
- // other operand through another assumption later, then we may
- // change the alignment at that point.
+ LLVM_DEBUG(dbgs() << "\tmem inst: " << NewDestAlignment << "\n";);
+ if (NewDestAlignment > MI->getDestAlignment()) {
+ MI->setDestAlignment(NewDestAlignment);
+ ++NumMemIntAlignChanged;
+ }
+
+ // For memory transfers, there is also a source alignment that
+ // can be set.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
MTI->getSource(), SE);
- DenseMap<MemTransferInst *, unsigned>::iterator DI =
- NewDestAlignments.find(MTI);
- unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
- 0 : DI->second;
-
- DenseMap<MemTransferInst *, unsigned>::iterator SI =
- NewSrcAlignments.find(MTI);
- unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
- 0 : SI->second;
-
- DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
- AltDestAlignment << " " << NewSrcAlignment <<
- " " << AltSrcAlignment << "\n");
-
- // Of these four alignments, pick the largest possible...
- unsigned NewAlignment = 0;
- if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
- NewAlignment = std::max(NewAlignment, NewDestAlignment);
- if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
- NewAlignment = std::max(NewAlignment, AltDestAlignment);
- if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
- NewAlignment = std::max(NewAlignment, NewSrcAlignment);
- if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
- NewAlignment = std::max(NewAlignment, AltSrcAlignment);
-
- if (NewAlignment > MI->getAlignment()) {
- MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
- MI->getParent()->getContext()), NewAlignment));
+ LLVM_DEBUG(dbgs() << "\tmem trans: " << NewSrcAlignment << "\n";);
+
+ if (NewSrcAlignment > MTI->getSourceAlignment()) {
+ MTI->setSourceAlignment(NewSrcAlignment);
++NumMemIntAlignChanged;
}
-
- NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
- NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
- } else if (NewDestAlignment > MI->getAlignment()) {
- assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
- "Unknown memory intrinsic");
-
- MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
- MI->getParent()->getContext()), NewDestAlignment));
- ++NumMemIntAlignChanged;
}
}
@@ -421,9 +390,6 @@ bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
SE = SE_;
DT = DT_;
- NewDestAlignments.clear();
- NewSrcAlignments.clear();
-
bool Changed = false;
for (auto &AssumeVH : AC.assumptions())
if (AssumeVH)
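
The AlignmentFromAssumptions rewrite above drops the old four-way negotiation between source and destination (and the NewDestAlignments/NewSrcAlignments maps) in favor of the per-operand alignment accessors on memory intrinsics. A sketch of that usage; the intrinsic and the two proven alignments are assumed to come from the surrounding pass.

#include "llvm/IR/IntrinsicInst.h"

// Raise destination and, for memory transfers, source alignment independently;
// each operand keeps the better of its current and newly proven alignment.
static void applyProvenAlignments(llvm::MemIntrinsic *MI,
                                  unsigned NewDestAlign,
                                  unsigned NewSrcAlign) {
  if (NewDestAlign > MI->getDestAlignment())
    MI->setDestAlignment(NewDestAlign);
  if (auto *MTI = llvm::dyn_cast<llvm::MemTransferInst>(MI))
    if (NewSrcAlign > MTI->getSourceAlignment())
      MTI->setSourceAlignment(NewSrcAlign);
}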
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
index 851efa000f65..3a8ef073cb48 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
@@ -99,7 +100,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
// For live instructions that have all dead bits, first make them dead by
// replacing all uses with something else. Then, if they don't need to
// remain live (because they have side effects, etc.) we can remove them.
- DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+ LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
clearAssumptionsOfUsers(&I, DB);
@@ -114,6 +115,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
if (!DB.isInstructionDead(&I))
continue;
+ salvageDebugInfo(I);
Worklist.push_back(&I);
I.dropAllReferences();
Changed = true;
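
The BDCE change is a single salvageDebugInfo call (declared in Transforms/Utils/Local.h, hence the new include) issued before a dead instruction's references are dropped, so that dbg.value users can, where possible, be rewritten in terms of the instruction's operands instead of silently becoming undef. The pattern in isolation, with the deletion worklist assumed to be the pass's own:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

// Queue a dead instruction for removal, salvaging debug info first so its
// dbg.value users are not simply dropped along with it.
static void queueDeadInstruction(
    llvm::Instruction &I,
    llvm::SmallVectorImpl<llvm::Instruction *> &Worklist) {
  llvm::salvageDebugInfo(I);
  Worklist.push_back(&I);
  I.dropAllReferences();
}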
diff --git a/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 7488cd5af8be..5ebfbf8a879b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -59,12 +59,14 @@
#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
using namespace PatternMatch;
@@ -73,9 +75,16 @@ using namespace PatternMatch;
STATISTIC(NumCallSiteSplit, "Number of call-site split");
-static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
- Value *Op) {
- CallSite CS(NewCallI);
+/// Only allow instructions before a call, if their CodeSize cost is below
+/// DuplicationThreshold. Those instructions need to be duplicated in all
+/// split blocks.
+static cl::opt<unsigned>
+ DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
+ cl::desc("Only allow instructions before a call, if "
+ "their cost is below DuplicationThreshold"),
+ cl::init(5));
+
+static void addNonNullAttribute(CallSite CS, Value *Op) {
unsigned ArgNo = 0;
for (auto &I : CS.args()) {
if (&*I == Op)
@@ -84,13 +93,16 @@ static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
}
}
-static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI,
- Value *Op, Constant *ConstValue) {
- CallSite CS(NewCallI);
+static void setConstantInArgument(CallSite CS, Value *Op,
+ Constant *ConstValue) {
unsigned ArgNo = 0;
for (auto &I : CS.args()) {
- if (&*I == Op)
+ if (&*I == Op) {
+ // It is possible we have already added the non-null attribute to the
+ // parameter by using an earlier constraining condition.
+ CS.removeParamAttr(ArgNo, Attribute::NonNull);
CS.setArgument(ArgNo, ConstValue);
+ }
++ArgNo;
}
}
@@ -111,11 +123,13 @@ static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
return false;
}
+typedef std::pair<ICmpInst *, unsigned> ConditionTy;
+typedef SmallVector<ConditionTy, 2> ConditionsTy;
+
/// If From has a conditional jump to To, add the condition to Conditions,
/// if it is relevant to any argument at CS.
-static void
-recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To,
+ ConditionsTy &Conditions) {
auto *BI = dyn_cast<BranchInst>(From->getTerminator());
if (!BI || !BI->isConditional())
return;
@@ -134,11 +148,10 @@ recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
}
/// Record ICmp conditions relevant to any argument in CS following Pred's
-/// single successors. If there are conflicting conditions along a path, like
+/// single predecessors. If there are conflicting conditions along a path, like
/// x == 1 and x == 0, the first condition will be used.
-static void
-recordConditions(const CallSite &CS, BasicBlock *Pred,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordConditions(CallSite CS, BasicBlock *Pred,
+ ConditionsTy &Conditions) {
recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
BasicBlock *From = Pred;
BasicBlock *To = Pred;
@@ -151,24 +164,17 @@ recordConditions(const CallSite &CS, BasicBlock *Pred,
}
}
-static Instruction *
-addConditions(CallSite &CS,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
- if (Conditions.empty())
- return nullptr;
-
- Instruction *NewCI = CS.getInstruction()->clone();
+static void addConditions(CallSite CS, const ConditionsTy &Conditions) {
for (auto &Cond : Conditions) {
Value *Arg = Cond.first->getOperand(0);
Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
if (Cond.second == ICmpInst::ICMP_EQ)
- setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal);
+ setConstantInArgument(CS, Arg, ConstVal);
else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
assert(Cond.second == ICmpInst::ICMP_NE);
- addNonNullAttribute(CS.getInstruction(), NewCI, Arg);
+ addNonNullAttribute(CS, Arg);
}
}
- return NewCI;
}
static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
@@ -177,28 +183,39 @@ static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
return Preds;
}
-static bool canSplitCallSite(CallSite CS) {
+static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
// FIXME: As of now we handle only CallInst. InvokeInst could be handled
// without too much effort.
Instruction *Instr = CS.getInstruction();
if (!isa<CallInst>(Instr))
return false;
- // Allow splitting a call-site only when there is no instruction before the
- // call-site in the basic block. Based on this constraint, we only clone the
- // call instruction, and we do not move a call-site across any other
- // instruction.
BasicBlock *CallSiteBB = Instr->getParent();
- if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
- return false;
-
// Need 2 predecessors and cannot split an edge from an IndirectBrInst.
SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
isa<IndirectBrInst>(Preds[1]->getTerminator()))
return false;
- return CallSiteBB->canSplitPredecessors();
+  // BasicBlock::canSplitPredecessors is more aggressive, so checking for
+ // BasicBlock::isEHPad as well.
+ if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
+ return false;
+
+ // Allow splitting a call-site only when the CodeSize cost of the
+  // instructions before the call is less than DuplicationThreshold. The
+ // instructions before the call will be duplicated in the split blocks and
+ // corresponding uses will be updated.
+ unsigned Cost = 0;
+ for (auto &InstBeforeCall :
+ llvm::make_range(CallSiteBB->begin(), Instr->getIterator())) {
+ Cost += TTI.getInstructionCost(&InstBeforeCall,
+ TargetTransformInfo::TCK_CodeSize);
+ if (Cost >= DuplicationThreshold)
+ return false;
+ }
+
+ return true;
}
static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
@@ -224,11 +241,11 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
auto II = std::next(CI->getIterator());
- BitCastInst *BCI = dyn_cast<BitCastInst>(&*II);
+ BitCastInst* BCI = dyn_cast<BitCastInst>(&*II);
if (BCI)
++II;
- ReturnInst *RI = dyn_cast<ReturnInst>(&*II);
+ ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
assert(RI && "`musttail` call must be followed by `ret` instruction");
TerminatorInst *TI = SplitBB->getTerminator();
@@ -241,14 +258,15 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
// that prevents doing this now.
}
-/// Return true if the CS is split into its new predecessors which are directly
-/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
-/// CallInst1 and CallInst2 will be the new call-sites placed in the new
-/// predecessors split for PredBB1 and PredBB2, respectively.
+/// For each (predecessor, conditions from predecessors) pair, it will split the
+/// basic block containing the call site, hook it up to the predecessor and
+/// replace the call instruction with new call instructions, which contain
+/// constraints based on the conditions from their predecessors.
/// For example, in the IR below with an OR condition, the call-site can
-/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the
-/// call-site placed between Header and Tail, and CallInst2 will be the
-/// call-site between TBB and Tail.
+/// be split. In this case, Preds for Tail is [(Header, a == null),
+/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
+/// CallInst1, which has constraints based on the conditions from Header and
+/// CallInst2, which has constraints based on the conditions coming from TBB.
///
/// From :
///
@@ -281,61 +299,59 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
/// Note that in case any arguments at the call-site are constrained by its
/// predecessors, new call-sites with more constrained arguments will be
/// created in createCallSitesOnPredicatedArgument().
-static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
- Instruction *CallInst1, Instruction *CallInst2) {
+static void splitCallSite(
+ CallSite CS,
+ const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
+ DominatorTree *DT) {
Instruction *Instr = CS.getInstruction();
BasicBlock *TailBB = Instr->getParent();
bool IsMustTailCall = CS.isMustTailCall();
- assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site");
-
- BasicBlock *SplitBlock1 =
- SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
- BasicBlock *SplitBlock2 =
- SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
-
- assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
- if (!CallInst1)
- CallInst1 = Instr->clone();
- if (!CallInst2)
- CallInst2 = Instr->clone();
-
- CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
- CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
-
- CallSite CS1(CallInst1);
- CallSite CS2(CallInst2);
-
- // Handle PHIs used as arguments in the call-site.
- for (PHINode &PN : TailBB->phis()) {
- unsigned ArgNo = 0;
- for (auto &CI : CS.args()) {
- if (&*CI == &PN) {
- CS1.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock1));
- CS2.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock2));
+ PHINode *CallPN = nullptr;
+
+  // `musttail` calls must be followed by an optional `bitcast` and a `ret`.
+  // The split blocks will be terminated right after that, so there are no
+  // users for this phi in `TailBB`.
+ if (!IsMustTailCall && !Instr->use_empty())
+ CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call");
+
+ LLVM_DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+
+ assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
+ // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
+ // here.
+ ValueToValueMapTy ValueToValueMaps[2];
+ for (unsigned i = 0; i < Preds.size(); i++) {
+ BasicBlock *PredBB = Preds[i].first;
+ BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+ TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i],
+ DT);
+ assert(SplitBlock && "Unexpected new basic block split.");
+
+ Instruction *NewCI =
+ &*std::prev(SplitBlock->getTerminator()->getIterator());
+ CallSite NewCS(NewCI);
+ addConditions(NewCS, Preds[i].second);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (PHINode &PN : TailBB->phis()) {
+ unsigned ArgNo = 0;
+ for (auto &CI : CS.args()) {
+ if (&*CI == &PN) {
+ NewCS.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
+ }
+ ++ArgNo;
}
- ++ArgNo;
}
+ LLVM_DEBUG(dbgs() << " " << *NewCI << " in " << SplitBlock->getName()
+ << "\n");
+ if (CallPN)
+ CallPN->addIncoming(NewCI, SplitBlock);
+
+ // Clone and place bitcast and return instructions before `TI`
+ if (IsMustTailCall)
+ copyMustTailReturn(SplitBlock, Instr, NewCI);
}
- // Clone and place bitcast and return instructions before `TI`
- if (IsMustTailCall) {
- copyMustTailReturn(SplitBlock1, CS.getInstruction(), CallInst1);
- copyMustTailReturn(SplitBlock2, CS.getInstruction(), CallInst2);
- }
-
- // Replace users of the original call with a PHI mering call-sites split.
- if (!IsMustTailCall && Instr->getNumUses()) {
- PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call",
- TailBB->getFirstNonPHI());
- PN->addIncoming(CallInst1, SplitBlock1);
- PN->addIncoming(CallInst2, SplitBlock2);
- Instr->replaceAllUsesWith(PN);
- }
- DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
- DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName()
- << "\n");
- DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName()
- << "\n");
NumCallSiteSplit++;
@@ -354,7 +370,41 @@ static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
TailBB->eraseFromParent();
return;
}
- Instr->eraseFromParent();
+
+ auto *OriginalBegin = &*TailBB->begin();
+  // Replace users of the original call with a PHI merging the split call-sites.
+ if (CallPN) {
+ CallPN->insertBefore(OriginalBegin);
+ Instr->replaceAllUsesWith(CallPN);
+ }
+
+ // Remove instructions moved to split blocks from TailBB, from the duplicated
+ // call instruction to the beginning of the basic block. If an instruction
+ // has any uses, add a new PHI node to combine the values coming from the
+ // split blocks. The new PHI nodes are placed before the first original
+ // instruction, so we do not end up deleting them. By using reverse-order, we
+ // do not introduce unnecessary PHI nodes for def-use chains from the call
+ // instruction to the beginning of the block.
+ auto I = Instr->getReverseIterator();
+ while (I != TailBB->rend()) {
+ Instruction *CurrentI = &*I++;
+ if (!CurrentI->use_empty()) {
+ // If an existing PHI has users after the call, there is no need to create
+ // a new one.
+ if (isa<PHINode>(CurrentI))
+ continue;
+ PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
+ for (auto &Mapping : ValueToValueMaps)
+ NewPN->addIncoming(Mapping[CurrentI],
+ cast<Instruction>(Mapping[CurrentI])->getParent());
+ NewPN->insertBefore(&*TailBB->begin());
+ CurrentI->replaceAllUsesWith(NewPN);
+ }
+ CurrentI->eraseFromParent();
+ // We are done once we handled the first original instruction in TailBB.
+ if (CurrentI == OriginalBegin)
+ break;
+ }
}
// Return true if the call-site has an argument which is a PHI with only
@@ -385,45 +435,59 @@ static bool isPredicatedOnPHI(CallSite CS) {
return false;
}
-static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPHIPredicatedArgument(CallSite CS, DominatorTree *DT) {
if (!isPredicatedOnPHI(CS))
return false;
auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
- splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS = {
+ {Preds[0], {}}, {Preds[1], {}}};
+ splitCallSite(CS, PredsCS, DT);
return true;
}
-static bool tryToSplitOnPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) {
auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
if (Preds[0] == Preds[1])
return false;
- SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2;
- recordConditions(CS, Preds[0], C1);
- recordConditions(CS, Preds[1], C2);
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
+ for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
+ ConditionsTy Conditions;
+ recordConditions(CS, Pred, Conditions);
+ PredsCS.push_back({Pred, Conditions});
+ }
- Instruction *CallInst1 = addConditions(CS, C1);
- Instruction *CallInst2 = addConditions(CS, C2);
- if (!CallInst1 && !CallInst2)
+ if (std::all_of(PredsCS.begin(), PredsCS.end(),
+ [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+ return P.second.empty();
+ }))
return false;
- splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1);
+ splitCallSite(CS, PredsCS, DT);
return true;
}
-static bool tryToSplitCallSite(CallSite CS) {
- if (!CS.arg_size() || !canSplitCallSite(CS))
+static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI,
+ DominatorTree *DT) {
+ if (!CS.arg_size() || !canSplitCallSite(CS, TTI))
return false;
- return tryToSplitOnPredicatedArgument(CS) ||
- tryToSplitOnPHIPredicatedArgument(CS);
+ return tryToSplitOnPredicatedArgument(CS, DT) ||
+ tryToSplitOnPHIPredicatedArgument(CS, DT);
}
-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
+ TargetTransformInfo &TTI, DominatorTree *DT) {
bool Changed = false;
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
BasicBlock &BB = *BI++;
- for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ auto II = BB.getFirstNonPHIOrDbg()->getIterator();
+ auto IE = BB.getTerminator()->getIterator();
+ // Iterate until we reach the terminator instruction. tryToSplitCallSite
+ // can replace BB's terminator in case BB is a successor of itself. In that
+ // case, IE will be invalidated and we also have to check the current
+ // terminator.
+ while (II != IE && &*II != BB.getTerminator()) {
Instruction *I = &*II++;
CallSite CS(cast<Value>(I));
if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
@@ -437,7 +501,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
// Check if such path is possible before attempting the splitting.
bool IsMustTail = CS.isMustTailCall();
- Changed |= tryToSplitCallSite(CS);
+ Changed |= tryToSplitCallSite(CS, TTI, DT);
// There're no interesting instructions after this. The call site
// itself might have been erased on splitting.
@@ -457,6 +521,8 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -465,7 +531,10 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
return false;
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return doCallSiteSplitting(F, TLI);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ return doCallSiteSplitting(F, TLI, TTI,
+ DTWP ? &DTWP->getDomTree() : nullptr);
}
};
} // namespace
@@ -474,6 +543,7 @@ char CallSiteSplittingLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
"Call-site splitting", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
"Call-site splitting", false, false)
FunctionPass *llvm::createCallSiteSplittingPass() {
@@ -483,9 +553,12 @@ FunctionPass *llvm::createCallSiteSplittingPass() {
PreservedAnalyses CallSiteSplittingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- if (!doCallSiteSplitting(F, TLI))
+ if (!doCallSiteSplitting(F, TLI, TTI, DT))
return PreservedAnalyses::all();
PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
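
With this change, call-site splitting no longer requires the call to be the first non-PHI instruction in its block; everything before the call may be duplicated into both split predecessors as long as its summed code-size cost stays under the new -callsite-splitting-duplication-threshold option. A stand-alone sketch of that gate; cheapEnoughToSplit is an illustrative name, and the call and threshold are assumed to be supplied by the caller.

#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

// Everything from the top of the block up to (but not including) the call is
// duplicated into each split predecessor, so refuse to split once the summed
// TCK_CodeSize cost of those instructions reaches Threshold.
static bool cheapEnoughToSplit(llvm::CallInst *Call,
                               llvm::TargetTransformInfo &TTI,
                               unsigned Threshold) {
  unsigned Cost = 0;
  llvm::BasicBlock *BB = Call->getParent();
  for (llvm::Instruction &I :
       llvm::make_range(BB->begin(), Call->getIterator())) {
    Cost += TTI.getInstructionCost(&I,
                                   llvm::TargetTransformInfo::TCK_CodeSize);
    if (Cost >= Threshold)
      return false;
  }
  return true;
}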
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index e4b08c5ed305..3a675b979017 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -43,8 +43,10 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
@@ -59,8 +61,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/DebugInfoMetadata.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -84,7 +84,7 @@ static cl::opt<bool> ConstHoistWithBlockFrequency(
namespace {
-/// \brief The constant hoisting pass.
+/// The constant hoisting pass.
class ConstantHoistingLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
@@ -127,13 +127,13 @@ FunctionPass *llvm::createConstantHoistingPass() {
return new ConstantHoistingLegacyPass();
}
-/// \brief Perform the constant hoisting optimization for the given function.
+/// Perform the constant hoisting optimization for the given function.
bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
if (skipFunction(Fn))
return false;
- DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
- DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
bool MadeChange =
Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
@@ -144,16 +144,16 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
Fn.getEntryBlock());
if (MadeChange) {
- DEBUG(dbgs() << "********** Function after Constant Hoisting: "
- << Fn.getName() << '\n');
- DEBUG(dbgs() << Fn);
+ LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << Fn);
}
- DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
return MadeChange;
}
-/// \brief Find the constant materialization insertion point.
+/// Find the constant materialization insertion point.
Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
unsigned Idx) const {
// If the operand is a cast instruction, then we have to materialize the
@@ -187,7 +187,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
return IDom->getBlock()->getTerminator();
}
-/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// Given \p BBs as input, find another set of BBs which collectively
/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
/// set found in \p BBs.
static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
@@ -289,7 +289,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
}
}
-/// \brief Find an insertion point that dominates all uses.
+/// Find an insertion point that dominates all uses.
SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
@@ -335,7 +335,7 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
return InsertPts;
}
-/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// Record constant integer ConstInt for instruction Inst at operand
/// index Idx.
///
/// The operand at index Idx is not necessarily the constant integer itself. It
@@ -364,18 +364,17 @@ void ConstantHoistingPass::collectConstantCandidates(
Itr->second = ConstCandVec.size() - 1;
}
ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
- DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
- dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+ LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
+ << "Collect constant " << *ConstInt << " from " << *Inst
<< " with cost " << Cost << '\n';
- else
- dbgs() << "Collect constant " << *ConstInt << " indirectly from "
- << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
- << Cost << '\n';
- );
+ else dbgs() << "Collect constant " << *ConstInt
+ << " indirectly from " << *Inst << " via "
+ << *Inst->getOperand(Idx) << " with cost " << Cost
+ << '\n';);
}
}
-/// \brief Check the operand for instruction Inst at index Idx.
+/// Check the operand for instruction Inst at index Idx.
void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
Value *Opnd = Inst->getOperand(Idx);
@@ -416,7 +415,7 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
-/// \brief Scan the instruction for expensive integer constants and record them
+/// Scan the instruction for expensive integer constants and record them
/// in the constant candidate vector.
void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType &ConstCandMap, Instruction *Inst) {
@@ -436,7 +435,7 @@ void ConstantHoistingPass::collectConstantCandidates(
} // end of for all operands
}
-/// \brief Collect all integer constants in the function that cannot be folded
+/// Collect all integer constants in the function that cannot be folded
/// into an instruction itself.
void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
ConstCandMapType ConstCandMap;
@@ -501,20 +500,21 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
return NumUses;
}
- DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
int MaxCost = -1;
for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
auto Value = ConstCand->ConstInt->getValue();
Type *Ty = ConstCand->ConstInt->getType();
int Cost = 0;
NumUses += ConstCand->Uses.size();
- DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n");
+ LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
+ << "\n");
for (auto User : ConstCand->Uses) {
unsigned Opcode = User.Inst->getOpcode();
unsigned OpndIdx = User.OpndIdx;
Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty);
- DEBUG(dbgs() << "Cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
for (auto C2 = S; C2 != E; ++C2) {
Optional<APInt> Diff = calculateOffsetDiff(
@@ -524,24 +524,24 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
const int ImmCosts =
TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
Cost -= ImmCosts;
- DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
- << "has penalty: " << ImmCosts << "\n"
- << "Adjusted cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
}
}
}
- DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
if (Cost > MaxCost) {
MaxCost = Cost;
MaxCostItr = ConstCand;
- DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
- << "\n");
+ LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
}
}
return NumUses;
}
-/// \brief Find the base constant within the given range and rebase all other
+/// Find the base constant within the given range and rebase all other
/// constants with respect to the base constant.
void ConstantHoistingPass::findAndMakeBaseConstant(
ConstCandVecType::iterator S, ConstCandVecType::iterator E) {
@@ -567,12 +567,12 @@ void ConstantHoistingPass::findAndMakeBaseConstant(
ConstantVec.push_back(std::move(ConstInfo));
}
-/// \brief Finds and combines constant candidates that can be easily
+/// Finds and combines constant candidates that can be easily
/// rematerialized with an add from a common base constant.
void ConstantHoistingPass::findBaseConstants() {
// Sort the constants by value and type. This invalidates the mapping!
- std::sort(ConstCandVec.begin(), ConstCandVec.end(),
- [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+ llvm::sort(ConstCandVec.begin(), ConstCandVec.end(),
+ [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
return LHS.ConstInt->getType()->getBitWidth() <
RHS.ConstInt->getType()->getBitWidth();
@@ -601,7 +601,7 @@ void ConstantHoistingPass::findBaseConstants() {
findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
}
-/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// Updates the operand at Idx in instruction Inst with the result of
/// instruction Mat. If the instruction is a PHI node then special
 /// handling for duplicate values from the same incoming basic block is
/// required.
@@ -629,7 +629,7 @@ static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
return true;
}
-/// \brief Emit materialization code for all rebased constants and update their
+/// Emit materialization code for all rebased constants and update their
/// users.
void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
Constant *Offset,
@@ -641,19 +641,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
"const_mat", InsertionPt);
- DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
- << " + " << *Offset << ") in BB "
- << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+ LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n'
+ << *Mat << '\n');
Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
}
Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
// Visit constant integer.
if (isa<ConstantInt>(Opnd)) {
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
Mat->eraseFromParent();
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
@@ -669,13 +670,13 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
ClonedCastInst->insertAfter(CastInst);
// Use the same debug location as the original cast instruction.
ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
- DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
- << "To : " << *ClonedCastInst << '\n');
+ LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
}
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
@@ -689,20 +690,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
// Use the same debug location as the instruction we are about to update.
ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
- DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
- << "From : " << *ConstExpr << '\n');
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
ConstExprInst->eraseFromParent();
if (Offset)
Mat->eraseFromParent();
}
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
}
-/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// Hoist and hide the base constant behind a bitcast and emit
/// materialization code for derived constants.
bool ConstantHoistingPass::emitBaseConstants() {
bool MadeChange = false;
@@ -720,9 +721,9 @@ bool ConstantHoistingPass::emitBaseConstants() {
Base->setDebugLoc(IP->getDebugLoc());
- DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
- << ") to BB " << IP->getParent()->getName() << '\n'
- << *Base << '\n');
+ LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
// Emit materialization code for all rebased constants.
unsigned Uses = 0;
@@ -765,7 +766,7 @@ bool ConstantHoistingPass::emitBaseConstants() {
return MadeChange;
}
-/// \brief Check all cast instructions we made a copy of and remove them if they
+/// Check all cast instructions we made a copy of and remove them if they
/// have no more users.
void ConstantHoistingPass::deleteDeadCastInst() const {
for (auto const &I : ClonedCastMap)
@@ -773,7 +774,7 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
I.first->eraseFromParent();
}
-/// \brief Optimize expensive integer constants in the given function.
+/// Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
DominatorTree &DT, BlockFrequencyInfo *BFI,
BasicBlock &Entry) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
index 4fa27891a974..46915889ce7c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -21,12 +21,12 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 8f468ebf8949..ea148b728a10 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -28,11 +29,11 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
@@ -43,7 +44,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -52,12 +52,14 @@ using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
STATISTIC(NumSelects, "Number of selects propagated");
STATISTIC(NumMemAccess, "Number of memory access targets propagated");
STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumUDivs, "Number of udivs whose width was decreased");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
STATISTIC(NumOverflows, "Number of overflow checks removed");
@@ -77,8 +79,10 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
}
};
@@ -88,6 +92,7 @@ char CorrelatedValuePropagation::ID = 0;
INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
@@ -101,14 +106,14 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getOperand(0))) return false;
- Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+ Constant *C = LVI->getConstant(S->getCondition(), S->getParent(), S);
if (!C) return false;
ConstantInt *CI = dyn_cast<ConstantInt>(C);
if (!CI) return false;
- Value *ReplaceWith = S->getOperand(1);
- Value *Other = S->getOperand(2);
+ Value *ReplaceWith = S->getTrueValue();
+ Value *Other = S->getFalseValue();
if (!CI->isOne()) std::swap(ReplaceWith, Other);
if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
@@ -120,7 +125,63 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
return true;
}
-static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+/// Try to simplify a phi with constant incoming values that match the edge
+/// values of a non-constant value on all other edges:
+/// bb0:
+/// %isnull = icmp eq i8* %x, null
+/// br i1 %isnull, label %bb2, label %bb1
+/// bb1:
+/// br label %bb2
+/// bb2:
+/// %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
+/// -->
+/// %r = %x
+static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
+ DominatorTree *DT) {
+ // Collect incoming constants and initialize possible common value.
+ SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
+ Value *CommonValue = nullptr;
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
+ IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
+ } else if (!CommonValue) {
+ // The potential common value is initialized to the first non-constant.
+ CommonValue = Incoming;
+ } else if (Incoming != CommonValue) {
+ // There can be only one non-constant common value.
+ return false;
+ }
+ }
+
+ if (!CommonValue || IncomingConstants.empty())
+ return false;
+
+ // The common value must be valid in all incoming blocks.
+ BasicBlock *ToBB = P->getParent();
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+ if (!DT->dominates(CommonInst, ToBB))
+ return false;
+
+ // We have a phi with exactly 1 variable incoming value and 1 or more constant
+ // incoming values. See if all constant incoming values can be mapped back to
+ // the same incoming variable value.
+ for (auto &IncomingConstant : IncomingConstants) {
+ Constant *C = IncomingConstant.first;
+ BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
+ if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
+ return false;
+ }
+
+ // All constant incoming values map to the same variable along the incoming
+ // edges of the phi. The phi is unnecessary.
+ P->replaceAllUsesWith(CommonValue);
+ P->eraseFromParent();
+ ++NumPhiCommon;
+ return true;
+}
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
const SimplifyQuery &SQ) {
bool Changed = false;
@@ -168,7 +229,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
V = SI->getTrueValue();
}
- DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
}
P->setIncomingValue(i, V);
@@ -181,6 +242,9 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
Changed = true;
}
+ if (!Changed)
+ Changed = simplifyCommonValuePhi(P, LVI, DT);
+
if (Changed)
++NumPhis;
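For contrast with the positive example in simplifyCommonValuePhi's header comment above, here is a hypothetical case (block and value names invented) where the transform must bail out: the incoming constant does not equal the value LazyValueInfo derives for the common operand on that edge, so the getConstantOnEdge check fails and the phi is left in place.

  bb0:
    %c = icmp eq i8* %y, null          ; branch condition says nothing about %x
    br i1 %c, label %bb2, label %bb1
  bb1:
    br label %bb2
  bb2:
    %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]   ; null is not %x's value on the bb0->bb2 edge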
@@ -243,7 +307,7 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
/// that cannot fire no matter what the incoming edge can safely be removed. If
/// a case fires on every incoming edge then the entire switch can be removed
/// and replaced with a branch to the case destination.
-static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI, DominatorTree *DT) {
Value *Cond = SI->getCondition();
BasicBlock *BB = SI->getParent();
@@ -258,6 +322,10 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
// Analyse each switch case in turn.
bool Changed = false;
+ DenseMap<BasicBlock*, int> SuccessorsCount;
+ for (auto *Succ : successors(BB))
+ SuccessorsCount[Succ]++;
+
for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
@@ -292,7 +360,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
- CI->getCaseSuccessor()->removePredecessor(BB);
+ BasicBlock *Succ = CI->getCaseSuccessor();
+ Succ->removePredecessor(BB);
CI = SI->removeCase(CI);
CE = SI->case_end();
@@ -302,6 +371,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++NumDeadCases;
Changed = true;
+ if (--SuccessorsCount[Succ] == 0)
+ DT->deleteEdge(BB, Succ);
continue;
}
if (State == LazyValueInfo::True) {
@@ -318,10 +389,14 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++CI;
}
- if (Changed)
+ if (Changed) {
// If the switch has been simplified to the point where it can be replaced
// by a branch then do so now.
- ConstantFoldTerminator(BB);
+ DeferredDominance DDT(*DT);
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
+ /*TLI = */ nullptr, &DDT);
+ DDT.flush();
+ }
return Changed;
}
@@ -430,9 +505,50 @@ static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
return true;
}
+/// Try to shrink a udiv/urem's width down to the smallest power of two that's
+/// sufficient to contain its operands.
+static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+ assert(Instr->getOpcode() == Instruction::UDiv ||
+ Instr->getOpcode() == Instruction::URem);
+ if (Instr->getType()->isVectorTy())
+ return false;
+
+ // Find the smallest power of two bitwidth that's sufficient to hold Instr's
+ // operands.
+ auto OrigWidth = Instr->getType()->getIntegerBitWidth();
+ ConstantRange OperandRange(OrigWidth, /*isFullset=*/false);
+ for (Value *Operand : Instr->operands()) {
+ OperandRange = OperandRange.unionWith(
+ LVI->getConstantRange(Operand, Instr->getParent()));
+ }
+ // Don't shrink below 8 bits wide.
+ unsigned NewWidth = std::max<unsigned>(
+ PowerOf2Ceil(OperandRange.getUnsignedMax().getActiveBits()), 8);
+ // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+ // two.
+ if (NewWidth >= OrigWidth)
+ return false;
+
+ ++NumUDivs;
+ auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
+ auto *LHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(0), TruncTy,
+ Instr->getName() + ".lhs.trunc", Instr);
+ auto *RHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(1), TruncTy,
+ Instr->getName() + ".rhs.trunc", Instr);
+ auto *BO =
+ BinaryOperator::Create(Instr->getOpcode(), LHS, RHS, Instr->getName(), Instr);
+ auto *Zext = CastInst::Create(Instruction::ZExt, BO, Instr->getType(),
+ Instr->getName() + ".zext", Instr);
+ if (BO->getOpcode() == Instruction::UDiv)
+ BO->setIsExact(Instr->isExact());
+
+ Instr->replaceAllUsesWith(Zext);
+ Instr->eraseFromParent();
+ return true;
+}
+
static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() ||
- !hasPositiveOperands(SDI, LVI))
+ if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
return false;
++NumSRems;
@@ -440,6 +556,10 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
SDI->getName(), SDI);
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
+
+ // Try to process our new urem.
+ processUDivOrURem(BO, LVI);
+
return true;
}
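A sketch (operand ranges and value names invented) of how the two changes above compose: when LazyValueInfo proves both srem operands non-negative, processSRem rewrites the srem to a urem, and the follow-up processUDivOrURem call can then shrink the urem to the smallest power-of-two width, never below 8 bits, that holds both operand ranges.

  ; before, with %a and %b known to lie in [0, 200) at this point
  %r = srem i32 %a, %b
  ; after processSRem and processUDivOrURem (temporary names are illustrative)
  %r.lhs.trunc = trunc i32 %a to i8
  %r.rhs.trunc = trunc i32 %b to i8
  %r.urem      = urem i8 %r.lhs.trunc, %r.rhs.trunc
  %r.zext      = zext i8 %r.urem to i32        ; replaces all uses of the original %r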
@@ -449,8 +569,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() ||
- !hasPositiveOperands(SDI, LVI))
+ if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
return false;
++NumSDivs;
@@ -460,6 +579,9 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
+ // Try to simplify our new udiv.
+ processUDivOrURem(BO, LVI);
+
return true;
}
@@ -559,7 +681,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
ConstantInt::getFalse(C->getContext());
}
-static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
+static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
+ const SimplifyQuery &SQ) {
bool FnChanged = false;
// Visiting in a pre-order depth-first traversal causes us to simplify early
// blocks before querying later blocks (which require us to analyze early
@@ -575,7 +698,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
break;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -595,6 +718,10 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
case Instruction::SDiv:
BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
break;
+ case Instruction::UDiv:
+ case Instruction::URem:
+ BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
+ break;
case Instruction::AShr:
BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
break;
@@ -607,7 +734,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
Instruction *Term = BB->getTerminator();
switch (Term->getOpcode()) {
case Instruction::Switch:
- BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
break;
case Instruction::Ret: {
auto *RI = cast<ReturnInst>(Term);
@@ -636,18 +763,22 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
return false;
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
}
PreservedAnalyses
CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
-
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
- bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index fa4806e884c3..6078967a0f94 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -20,11 +20,11 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "dce"
@@ -50,6 +50,7 @@ namespace {
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
Instruction *Inst = &*DI++;
if (isInstructionTriviallyDead(Inst, TLI)) {
+ salvageDebugInfo(*Inst);
Inst->eraseFromParent();
Changed = true;
++DIEEliminated;
@@ -76,6 +77,8 @@ static bool DCEInstruction(Instruction *I,
SmallSetVector<Instruction *, 16> &WorkList,
const TargetLibraryInfo *TLI) {
if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index b665d94a70aa..dd1a2a6adb82 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -30,6 +30,7 @@
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -56,11 +57,10 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
-#include <cstdint>
#include <cstddef>
+#include <cstdint>
#include <iterator>
#include <map>
#include <utility>
@@ -115,6 +115,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
Instruction *DeadInst = NowDeadInsts.pop_back_val();
++NumFastOther;
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+
// This instruction is dead, zap it, in stages. Start by removing it from
// MemDep, which needs to know the operands and needs it to be in the
// function.
@@ -146,7 +149,8 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
/// Does this instruction write some memory? This only returns true for things
/// that we can analyze with other helpers below.
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+static bool hasAnalyzableMemoryWrite(Instruction *I,
+ const TargetLibraryInfo &TLI) {
if (isa<StoreInst>(I))
return true;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
@@ -156,6 +160,9 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
case Intrinsic::memset:
case Intrinsic::memmove:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
case Intrinsic::init_trampoline:
case Intrinsic::lifetime_end:
return true;
@@ -180,43 +187,45 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
/// Return a Location stored to by the specified instruction. If isRemovable
/// returns true, this function and getLocForRead completely describe the memory
/// operations for this instruction.
-static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+static MemoryLocation getLocForWrite(Instruction *Inst) {
+
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return MemoryLocation::get(SI);
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
// memcpy/memmove/memset.
MemoryLocation Loc = MemoryLocation::getForDest(MI);
return Loc;
}
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (!II)
- return MemoryLocation();
-
- switch (II->getIntrinsicID()) {
- default:
- return MemoryLocation(); // Unhandled intrinsic.
- case Intrinsic::init_trampoline:
- // FIXME: We don't know the size of the trampoline, so we can't really
- // handle it here.
- return MemoryLocation(II->getArgOperand(0));
- case Intrinsic::lifetime_end: {
- uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
- return MemoryLocation(II->getArgOperand(1), Len);
- }
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return MemoryLocation(); // Unhandled intrinsic.
+ case Intrinsic::init_trampoline:
+ return MemoryLocation(II->getArgOperand(0));
+ case Intrinsic::lifetime_end: {
+ uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ return MemoryLocation(II->getArgOperand(1), Len);
+ }
+ }
}
+ if (auto CS = CallSite(Inst))
+ // All the supported TLI functions so far happen to have dest as their
+ // first argument.
+ return MemoryLocation(CS.getArgument(0));
+ return MemoryLocation();
}
-/// Return the location read by the specified "hasMemoryWrite" instruction if
-/// any.
+/// Return the location read by the specified "hasAnalyzableMemoryWrite"
+/// instruction if any.
static MemoryLocation getLocForRead(Instruction *Inst,
const TargetLibraryInfo &TLI) {
- assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+ assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
// The only instructions that both read and write are the mem transfer
// instructions (memcpy/memmove).
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
return MemoryLocation::getForSource(MTI);
return MemoryLocation();
}
@@ -230,7 +239,7 @@ static bool isRemovable(Instruction *I) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
- default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+ default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
case Intrinsic::lifetime_end:
// Never remove dead lifetime_end's, e.g. because it is followed by a
// free.
@@ -243,9 +252,14 @@ static bool isRemovable(Instruction *I) {
case Intrinsic::memcpy:
// Don't remove volatile memory intrinsics.
return !cast<MemIntrinsic>(II)->isVolatile();
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
+ return true;
}
}
+ // note: only get here for calls with analyzable writes - i.e. libcalls
if (auto CS = CallSite(I))
return CS.getInstruction()->use_empty();
@@ -264,6 +278,8 @@ static bool isShortenableAtTheEnd(Instruction *I) {
default: return false;
case Intrinsic::memset:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
// Do shorten memory intrinsics.
// FIXME: Add memmove if it's also safe to transform.
return true;
@@ -280,35 +296,27 @@ static bool isShortenableAtTheEnd(Instruction *I) {
static bool isShortenableAtTheBeginning(Instruction *I) {
// FIXME: Handle only memset for now. Supporting memcpy/memmove should be
// easily done by offsetting the source address.
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- return II && II->getIntrinsicID() == Intrinsic::memset;
+ return isa<AnyMemSetInst>(I);
}
/// Return the pointer that is being written to.
static Value *getStoredPointerOperand(Instruction *I) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return MI->getDest();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::init_trampoline:
- return II->getArgOperand(0);
- }
- }
-
- CallSite CS(I);
- // All the supported functions so far happen to have dest as their first
- // argument.
- return CS.getArgument(0);
+ // TODO: factor this to reuse getLocForWrite
+ MemoryLocation Loc = getLocForWrite(I);
+ assert(Loc.Ptr &&
+ "unable to find pointer written for analyzable instruction?");
+ // TODO: most APIs don't expect const Value *
+ return const_cast<Value*>(Loc.Ptr);
}
static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ const Function *F) {
uint64_t Size;
- if (getObjectSize(V, Size, DL, &TLI))
+ ObjectSizeOpts Opts;
+ Opts.NullIsUnknownSize = NullPointerIsDefined(F);
+
+ if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
}
@@ -338,7 +346,9 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
const TargetLibraryInfo &TLI,
int64_t &EarlierOff, int64_t &LaterOff,
Instruction *DepWrite,
- InstOverlapIntervalsTy &IOL) {
+ InstOverlapIntervalsTy &IOL,
+ AliasAnalysis &AA,
+ const Function *F) {
// If we don't know the sizes of either access, then we can't do a comparison.
if (Later.Size == MemoryLocation::UnknownSize ||
Earlier.Size == MemoryLocation::UnknownSize)
@@ -349,7 +359,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If the start pointers are the same, we just have to compare sizes to see if
// the later store was larger than the earlier store.
- if (P1 == P2) {
+ if (P1 == P2 || AA.isMustAlias(P1, P2)) {
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
return OW_Complete;
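The added isMustAlias check widens the "same start pointer" case from pointer equality of the Value operands to pointers that alias analysis proves refer to the same address. A hypothetical instance (names invented) where the old check missed a complete overwrite:

  %p1 = getelementptr inbounds i32, i32* %base, i64 1
  %p2 = getelementptr inbounds i32, i32* %base, i64 1
  store i32 1, i32* %p1        ; earlier store
  store i32 2, i32* %p2        ; later store; %p1 != %p2 as Values, but they must-alias

With the must-alias query, the later store is recognized as completely overwriting the earlier one and OW_Complete can be returned.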
@@ -367,7 +377,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
return OW_Unknown;
// If the "Later" store is to a recognizable object, get its size.
- uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
if (ObjectSize != MemoryLocation::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
return OW_Complete;
@@ -415,9 +425,10 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// Insert our part of the overlap into the map.
auto &IM = IOL[DepWrite];
- DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
- int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
- LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
+ << ", " << int64_t(EarlierOff + Earlier.Size)
+ << ") Later [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
// Make sure that we only insert non-overlapping intervals and combine
// adjacent intervals. The intervals are stored in the map with the ending
@@ -454,11 +465,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
ILI = IM.begin();
if (ILI->second <= EarlierOff &&
ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
- DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
- EarlierOff << ", " <<
- int64_t(EarlierOff + Earlier.Size) <<
- ") Composite Later [" <<
- ILI->second << ", " << ILI->first << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + Earlier.Size)
+ << ") Composite Later [" << ILI->second << ", "
+ << ILI->first << ")\n");
++NumCompletePartials;
return OW_Complete;
}
@@ -469,10 +480,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
int64_t(EarlierOff + Earlier.Size) > LaterOff &&
uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) {
- DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff
- << ", " << int64_t(EarlierOff + Earlier.Size)
- << ") by a later store [" << LaterOff << ", "
- << int64_t(LaterOff + Later.Size) << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + Earlier.Size)
+ << ") by a later store [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
// TODO: Maybe come up with a better name?
return OW_PartialEarlierWithFullLater;
}
@@ -514,8 +526,8 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
/// memory region into an identical pointer) then it doesn't actually make its
/// input dead in the traditional sense. Consider this case:
///
-/// memcpy(A <- B)
-/// memcpy(A <- A)
+/// memmove(A <- B)
+/// memmove(A <- A)
///
/// In this case, the second store to A does not make the first store to A dead.
/// The usual situation isn't an explicit A<-A store like this (which can be
@@ -531,24 +543,35 @@ static bool isPossibleSelfRead(Instruction *Inst,
// Self reads can only happen for instructions that read memory. Get the
// location read.
MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
- if (!InstReadLoc.Ptr) return false; // Not a reading instruction.
+ if (!InstReadLoc.Ptr)
+ return false; // Not a reading instruction.
// If the read and written loc obviously don't alias, it isn't a read.
- if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
-
- // Okay, 'Inst' may copy over itself. However, we can still remove a the
- // DepWrite instruction if we can prove that it reads from the same location
- // as Inst. This handles useful cases like:
- // memcpy(A <- B)
- // memcpy(A <- B)
- // Here we don't know if A/B may alias, but we do know that B/B are must
- // aliases, so removing the first memcpy is safe (assuming it writes <= #
- // bytes as the second one.
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
-
- if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
return false;
+ if (isa<AnyMemCpyInst>(Inst)) {
+ // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
+ // but in practice memcpy(A <- B) either means that A and B are disjoint or
+ // are equal (i.e. there are no partial overlaps). Given that, if we have:
+ //
+ // memcpy/memmove(A <- B) // DepWrite
+ // memcpy(A <- B) // Inst
+ //
+ // with Inst reading/writing a size >= DepWrite's size, we can reason as
+ // follows:
+ //
+ // - If A == B then both the copies are no-ops, so the DepWrite can be
+ // removed.
+ // - If A != B then A and B are disjoint locations in Inst. Since
+ // Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
+ // Therefore DepWrite can be removed.
+ MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ return false;
+ }
+
// If DepWrite doesn't read memory or if we can't prove it is a must alias,
// then it can't be considered dead.
return true;
@@ -650,7 +673,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
while (Dep.isDef() || Dep.isClobber()) {
Instruction *Dependency = Dep.getInst();
- if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))
+ if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
+ !isRemovable(Dependency))
break;
Value *DepPointer =
@@ -660,8 +684,9 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
break;
- DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
- << *Dependency << '\n');
+ LLVM_DEBUG(
+ dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
// DCE instructions only used to calculate that store.
BasicBlock::iterator BBI(Dependency);
@@ -690,7 +715,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
SmallSetVector<Value *, 16> &DeadStackObjects,
const DataLayout &DL, AliasAnalysis *AA,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ const Function *F) {
const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
// A constant can't be in the dead pointer set.
@@ -707,7 +733,7 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
// Remove objects that could alias LoadedLoc.
DeadStackObjects.remove_if([&](Value *I) {
// See if the loaded location could alias the stack location.
- MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
return !AA->isNoAlias(StackLoc, LoadedLoc);
});
}
@@ -754,7 +780,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
--BBI;
// If we find a store, check to see if it points into a dead stack value.
- if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
// See through pointer-to-pointer bitcasts
SmallVector<Value *, 4> Pointers;
GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
@@ -770,15 +796,16 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
if (AllDead) {
Instruction *Dead = &*BBI;
- DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
- << *Dead << "\n Objects: ";
- for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
- E = Pointers.end(); I != E; ++I) {
- dbgs() << **I;
- if (std::next(I) != E)
- dbgs() << ", ";
- }
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end();
+ I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ } dbgs()
+ << '\n');
// DCE instructions only used to calculate that store.
deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
@@ -790,8 +817,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Remove any dead non-memory-mutating instructions.
if (isInstructionTriviallyDead(&*BBI, TLI)) {
- DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
- << *&*BBI << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
++NumFastOther;
MadeChange = true;
@@ -820,7 +847,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// the call is live.
DeadStackObjects.remove_if([&](Value *I) {
// See if the call site touches the value.
- return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)));
+ return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI,
+ BB.getParent())));
});
// If all of the allocas were clobbered by the call then we're not going
@@ -848,8 +876,6 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
LoadedLoc = MemoryLocation::get(L);
} else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
LoadedLoc = MemoryLocation::get(V);
- } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
- LoadedLoc = MemoryLocation::getForSource(MTI);
} else if (!BBI->mayReadFromMemory()) {
// Instruction doesn't read memory. Note that stores that weren't removed
// above will hit this case.
@@ -861,7 +887,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Remove any allocas from the DeadPointer set that are loaded, as this
// makes any stores above the access live.
- removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI);
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
// If all of the allocas were clobbered by the access then we're not going
// to find anything else to process.
@@ -881,8 +907,8 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
// Power of 2 vector writes are probably always a bad idea to optimize
// as any store/memset/memcpy is likely using vector instructions so
// shortening it to not vector size is likely to be slower
- MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
- unsigned EarlierWriteAlign = EarlierIntrinsic->getAlignment();
+ auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
if (!IsOverwriteEnd)
LaterOffset = int64_t(LaterOffset + LaterSize);
@@ -890,15 +916,23 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
!((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
return false;
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
- << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
- << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
- << ")\n");
-
int64_t NewLength = IsOverwriteEnd
? LaterOffset - EarlierOffset
: EarlierSize - (LaterOffset - EarlierOffset);
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+ // When shortening an atomic memory intrinsic, the newly shortened
+ // length must remain an integer multiple of the element size.
+ const uint32_t ElementSize = AMI->getElementSizeInBytes();
+ if (0 != NewLength % ElementSize)
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
+ << *EarlierWrite << "\n KILLER (offset " << LaterOffset
+ << ", " << EarlierSize << ")\n");
+
Value *EarlierWriteLength = EarlierIntrinsic->getLength();
Value *TrimmedLength =
ConstantInt::get(EarlierWriteLength->getType(), NewLength);
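A worked instance of the new element-size guard (numbers invented): for an atomic intrinsic with ElementSize = 4 and an original length of 16 bytes, a later store that overwrites only the last 6 bytes would give NewLength = 16 - 6 = 10, and 10 % 4 != 0, so the shortening is skipped; a later store covering the last 8 bytes gives NewLength = 8, a multiple of 4, and the trim proceeds.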
@@ -966,7 +1000,7 @@ static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
bool Changed = false;
for (auto OI : IOL) {
Instruction *EarlierWrite = OI.first;
- MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA);
+ MemoryLocation Loc = getLocForWrite(EarlierWrite);
assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc");
@@ -1002,8 +1036,9 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {
- DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
- << *DepLoad << "\n STORE: " << *SI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
++NumRedundantStores;
@@ -1019,7 +1054,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
<< *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
@@ -1067,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
}
// Check to see if Inst writes to memory. If not, continue.
- if (!hasMemoryWrite(Inst, *TLI))
+ if (!hasAnalyzableMemoryWrite(Inst, *TLI))
continue;
// eliminateNoopStore will update in iterator, if necessary.
@@ -1085,7 +1120,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
continue;
// Figure out what location is being stored to.
- MemoryLocation Loc = getLocForWrite(Inst, *AA);
+ MemoryLocation Loc = getLocForWrite(Inst);
// If we didn't get a useful location, fail.
if (!Loc.Ptr)
@@ -1107,7 +1142,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
//
// Find out what memory location the dependent instruction stores.
Instruction *DepWrite = InstDep.getInst();
- MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+ if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
+ break;
+ MemoryLocation DepLoc = getLocForWrite(DepWrite);
// If we didn't get a useful location, or if it isn't a size, bail out.
if (!DepLoc.Ptr)
break;
@@ -1145,12 +1182,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR =
- isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
- DepWrite, IOL);
+ OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset,
+ InstWriteOffset, DepWrite, IOL, *AA,
+ BB.getParent());
if (OR == OW_Complete) {
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
- << *DepWrite << "\n KILLER: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
+ << "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering);
@@ -1208,9 +1245,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// store, shifted appropriately.
APInt Merged =
(EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
- DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
- << "\n Later: " << *Inst
- << "\n Merged Value: " << Merged << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
+ << "\n Later: " << *Inst
+ << "\n Merged Value: " << Merged << '\n');
auto *SI = new StoreInst(
ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 5798e1c4ee99..565745d12e99 100644
--- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -49,10 +50,10 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <deque>
#include <memory>
@@ -70,13 +71,16 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
STATISTIC(NumDSE, "Number of trivial dead stores removed");
+DEBUG_COUNTER(CSECounter, "early-cse",
+ "Controls which instructions are removed");
+
//===----------------------------------------------------------------------===//
// SimpleValue
//===----------------------------------------------------------------------===//
namespace {
-/// \brief Struct representing the available values in the scoped hash table.
+/// Struct representing the available values in the scoped hash table.
struct SimpleValue {
Instruction *Inst;
@@ -151,12 +155,15 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor;
// TODO: We should also detect FP min/max.
if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
- SPF == SPF_UMIN || SPF == SPF_UMAX ||
- SPF == SPF_ABS || SPF == SPF_NABS) {
+ SPF == SPF_UMIN || SPF == SPF_UMAX) {
if (A > B)
std::swap(A, B);
return hash_combine(Inst->getOpcode(), SPF, A, B);
}
+ if (SPF == SPF_ABS || SPF == SPF_NABS) {
+ // ABS/NABS always puts the input in A and its negation in B.
+ return hash_combine(Inst->getOpcode(), SPF, A, B);
+ }
if (CastInst *CI = dyn_cast<CastInst>(Inst))
return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
@@ -226,8 +233,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
LSPF == SPF_ABS || LSPF == SPF_NABS) {
Value *RHSA, *RHSB;
SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor;
- return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) ||
- (LHSA == RHSB && LHSB == RHSA)));
+ if (LSPF == RSPF) {
+ // Abs results are placed in a defined order by matchSelectPattern.
+ if (LSPF == SPF_ABS || LSPF == SPF_NABS)
+ return LHSA == RHSA && LHSB == RHSB;
+ return ((LHSA == RHSA && LHSB == RHSB) ||
+ (LHSA == RHSB && LHSB == RHSA));
+ }
}
return false;
@@ -239,7 +251,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
namespace {
-/// \brief Struct representing the available call values in the scoped hash
+/// Struct representing the available call values in the scoped hash
/// table.
struct CallValue {
Instruction *Inst;
@@ -305,7 +317,7 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
namespace {
-/// \brief A simple and fast domtree-based CSE pass.
+/// A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
@@ -329,7 +341,7 @@ public:
ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
AllocatorTy>;
- /// \brief A scoped hash table of the current values of all of our simple
+ /// A scoped hash table of the current values of all of our simple
/// scalar expressions.
///
/// As we walk down the domtree, we look to see if instructions are in this:
@@ -337,8 +349,8 @@ public:
/// that dominated values can succeed in their lookup.
ScopedHTType AvailableValues;
- /// A scoped hash table of the current values of previously encounted memory
- /// locations.
+ /// A scoped hash table of the current values of previously encountered
+ /// memory locations.
///
/// This allows us to get efficient access to dominating loads or stores when
/// we have a fully redundant load. In addition to the most recent load, we
@@ -356,13 +368,12 @@ public:
unsigned Generation = 0;
int MatchingId = -1;
bool IsAtomic = false;
- bool IsInvariant = false;
LoadValue() = default;
LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
- bool IsAtomic, bool IsInvariant)
+ bool IsAtomic)
: DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
- IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
+ IsAtomic(IsAtomic) {}
};
using LoadMapAllocator =
@@ -373,8 +384,19 @@ public:
LoadMapAllocator>;
LoadHTType AvailableLoads;
+
+ // A scoped hash table mapping memory locations (represented as typed
+ // addresses) to generation numbers at which that memory location became
+ // (henceforth indefinitely) invariant.
+ using InvariantMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MemoryLocation, unsigned>>;
+ using InvariantHTType =
+ ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+ InvariantMapAllocator>;
+ InvariantHTType AvailableInvariants;
- /// \brief A scoped hash table of the current values of read-only call
+ /// A scoped hash table of the current values of read-only call
/// values.
///
/// It uses the same generation count as loads.
@@ -382,10 +404,10 @@ public:
ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
CallHTType AvailableCalls;
- /// \brief This is the current generation of the memory value.
+ /// This is the current generation of the memory value.
unsigned CurrentGeneration = 0;
- /// \brief Set up the EarlyCSE runner for a particular function.
+ /// Set up the EarlyCSE runner for a particular function.
EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
const TargetTransformInfo &TTI, DominatorTree &DT,
AssumptionCache &AC, MemorySSA *MSSA)
@@ -401,15 +423,16 @@ private:
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- CallScope(AvailableCalls) {}
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;
private:
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
+ InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
};
@@ -420,10 +443,13 @@ private:
class StackNode {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
- DomTreeNode::iterator child, DomTreeNode::iterator end)
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+ DomTreeNode::iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+ EndIter(end),
+ Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls)
{}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;
@@ -455,7 +481,7 @@ private:
bool Processed = false;
};
- /// \brief Wrapper class to handle memory instructions, including loads,
+ /// Wrapper class to handle memory instructions, including loads,
/// stores and intrinsic loads and stores defined by the target.
class ParseMemoryInst {
public:
@@ -532,12 +558,7 @@ private:
Value *getPointerOperand() const {
if (IsTargetMemInst) return Info.PtrVal;
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- return LI->getPointerOperand();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- return SI->getPointerOperand();
- }
- return nullptr;
+ return getLoadStorePointerOperand(Inst);
}
bool mayReadFromMemory() const {
@@ -558,6 +579,9 @@ private:
bool processNode(DomTreeNode *Node);
+ bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
+ const BasicBlock *BB, const BasicBlock *Pred);
+
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI;
@@ -568,6 +592,10 @@ private:
ExpectedType);
}
+ /// Return true if the instruction is known to only operate on memory
+ /// provably invariant in the given "generation".
+ bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
Instruction *EarlierInst, Instruction *LaterInst);
@@ -661,6 +689,79 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
return MSSA->dominates(LaterDef, EarlierMA);
}
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+ // A location loaded from with an invariant_load is assumed to *never* change
+ // within the visible scope of the compilation.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ auto MemLocOpt = MemoryLocation::getOrNone(I);
+ if (!MemLocOpt)
+ // "target" intrinsic forms of loads aren't currently known to
+ // MemoryLocation::get. TODO
+ return false;
+ MemoryLocation MemLoc = *MemLocOpt;
+ if (!AvailableInvariants.count(MemLoc))
+ return false;
+
+ // Is the generation at which this became invariant older than the
+ // current one?
+ return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
+bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
+ const BranchInst *BI, const BasicBlock *BB,
+ const BasicBlock *Pred) {
+ assert(BI->isConditional() && "Should be a conditional branch!");
+ assert(BI->getCondition() == CondInst && "Wrong condition?");
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ auto MatchBinOp = [](Instruction *I, unsigned Opcode) {
+ if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(I))
+ return BOp->getOpcode() == Opcode;
+ return false;
+ };
+ // If the condition is AND operation, we can propagate its operands into the
+ // true branch. If it is OR operation, we can propagate them into the false
+ // branch.
+ unsigned PropagateOpcode =
+ (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or;
+
+ bool MadeChanges = false;
+ SmallVector<Instruction *, 4> WorkList;
+ SmallPtrSet<Instruction *, 4> Visited;
+ WorkList.push_back(CondInst);
+ while (!WorkList.empty()) {
+ Instruction *Curr = WorkList.pop_back_val();
+
+ AvailableValues.insert(Curr, TorF);
+ LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << Curr->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT,
+ BasicBlockEdge(Pred, BB))) {
+ NumCSECVP += Count;
+ MadeChanges = true;
+ }
+ }
+
+ if (MatchBinOp(Curr, PropagateOpcode))
+ for (auto &Op : cast<BinaryOperator>(Curr)->operands())
+ if (Instruction *OPI = dyn_cast<Instruction>(Op))
+ if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
+ WorkList.push_back(OPI);
+ }
+
+ return MadeChanges;
+}
+
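A hedged illustration of the and/or propagation implemented above (IR invented for this note):

    %a = icmp ult i32 %i, %n
    %b = icmp ne i32* %p, null
    %c = and i1 %a, %b
    br i1 %c, label %taken, label %not_taken

When EarlyCSE processes %taken (whose only predecessor ends in this branch), it records %c as true and, because the condition is an and, walks into its operands and records %a and %b as true as well, rewriting their dominated uses. For an or condition the same walk happens on the false successor, where the condition and its or operands are all known false.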
bool EarlyCSE::processNode(DomTreeNode *Node) {
bool Changed = false;
BasicBlock *BB = Node->getBlock();
@@ -684,22 +785,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
if (BI && BI->isConditional()) {
auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
- if (CondInst && SimpleValue::canHandle(CondInst)) {
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *TorF = (BI->getSuccessor(0) == BB)
- ? ConstantInt::getTrue(BB->getContext())
- : ConstantInt::getFalse(BB->getContext());
- AvailableValues.insert(CondInst, TorF);
- DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << CondInst->getName() << "' as " << *TorF << " in "
- << BB->getName() << "\n");
- // Replace all dominated uses with the known value.
- if (unsigned Count = replaceDominatedUsesWith(
- CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
- Changed = true;
- NumCSECVP += Count;
- }
- }
+ if (CondInst && SimpleValue::canHandle(CondInst))
+ Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
}
}
@@ -716,7 +803,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Dead instructions should just be removed.
if (isInstructionTriviallyDead(Inst, &TLI)) {
- DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ salvageDebugInfo(*Inst);
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -732,31 +824,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
auto *CondI =
dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
if (CondI && SimpleValue::canHandle(CondI)) {
- DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst
+ << '\n');
AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
} else
- DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
continue;
}
// Skip sideeffect intrinsics, for the same reason as assume intrinsics.
if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
- DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
continue;
}
- // Skip invariant.start intrinsics since they only read memory, and we can
- // forward values across it. Also, we dont need to consume the last store
- // since the semantics of invariant.start allow us to perform DSE of the
- // last store, if there was a store following invariant.start. Consider:
+ // We can skip all invariant.start intrinsics since they only read memory,
+ // and we can forward values across them. For invariant starts without
+ // invariant ends, we can use the fact that the invariantness never ends to
+ // start a scope in the current generation which is true for all future
+ // generations. Also, we don't need to consume the last store since the
+ // semantics of invariant.start allow us to perform DSE of the last
+ // store, if there was a store following invariant.start. Consider:
//
// store 30, i8* p
// invariant.start(p)
// store 40, i8* p
// We can DSE the store to 30, since the store 40 to invariant location p
// causes undefined behaviour.
- if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+ // If there are any uses, the scope might end.
+ if (!Inst->use_empty())
+ continue;
+ auto *CI = cast<CallInst>(Inst);
+ MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+ // Don't start a scope if we already have a better one pushed
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
continue;
+ }
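A minimal sketch of the forwarding this scope enables (IR invented for this note; @clobber stands for an arbitrary call that may write memory):

    call {}* @llvm.invariant.start.p0i8(i64 1, i8* %p)   ; result unused, so the scope never ends
    %v1 = load i8, i8* %p
    call void @clobber()        ; bumps the memory generation
    %v2 = load i8, i8* %p       ; can still be CSE'd to %v1

Here %p is recorded in AvailableInvariants at the generation of the invariant.start, so the later load passes the isOperatingOnInvariantMemAt check even though the intervening call invalidated the plain generation-based match.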
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
@@ -767,7 +872,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Is the condition known to be true?
if (isa<ConstantInt>(KnownCond) &&
cast<ConstantInt>(KnownCond)->isOne()) {
- DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs()
+ << "EarlyCSE removing guard: " << *Inst << '\n');
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -792,29 +898,39 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
if (Value *V = SimplifyInstruction(Inst, SQ)) {
- DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
- bool Killed = false;
- if (!Inst->use_empty()) {
- Inst->replaceAllUsesWith(V);
- Changed = true;
- }
- if (isInstructionTriviallyDead(Inst, &TLI)) {
- removeMSSA(Inst);
- Inst->eraseFromParent();
- Changed = true;
- Killed = true;
+ LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ bool Killed = false;
+ if (!Inst->use_empty()) {
+ Inst->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ Killed = true;
+ }
+ if (Changed)
+ ++NumSimplify;
+ if (Killed)
+ continue;
}
- if (Changed)
- ++NumSimplify;
- if (Killed)
- continue;
}
// If this is a simple instruction that we can value number, process it.
if (SimpleValue::canHandle(Inst)) {
// See if the instruction has an available value. If so, use it.
if (Value *V = AvailableValues.lookup(Inst)) {
- DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (auto *I = dyn_cast<Instruction>(V))
I->andIRFlags(Inst);
Inst->replaceAllUsesWith(V);
@@ -840,6 +956,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++CurrentGeneration;
}
+ if (MemInst.isInvariantLoad()) {
+ // If we pass an invariant load, we know that memory location is
+ // indefinitely constant from the moment of first dereferenceability.
+ // We conservatively treat the invariant_load as that moment. If we
+ // pass an invariant load after already establishing a scope, don't
+ // restart it since we want to preserve the earliest point seen.
+ auto MemLoc = MemoryLocation::get(Inst);
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
+ }
+
// If we have an available version of this load, and if it is the right
// generation or the load is known to be from an invariant location,
// replace this instruction.
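The invariant-load path added just above feeds the same table; a minimal, made-up sketch (with !0 = !{}):

    %v1 = load i32, i32* %p, !invariant.load !0   ; opens an invariant scope for %p
    call void @clobber()
    %v2 = load i32, i32* %p                       ; plain load, can be forwarded to %v1

The first load marks %p invariant from the current generation onward, so the second load can be forwarded via the AvailableInvariants lookup rather than needing the memory generations to match.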
@@ -854,13 +981,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic() &&
- (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
isSameMemGeneration(InVal.Generation, CurrentGeneration,
InVal.DefInst, Inst))) {
Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
if (Op != nullptr) {
- DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
- << " to: " << *InVal.DefInst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.DefInst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (!Inst->use_empty())
Inst->replaceAllUsesWith(Op);
removeMSSA(Inst);
@@ -875,7 +1006,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic(), MemInst.isInvariantLoad()));
+ MemInst.isAtomic()));
LastStore = nullptr;
continue;
}
@@ -898,8 +1029,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (InVal.first != nullptr &&
isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
Inst)) {
- DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
- << " to: " << *InVal.first << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (!Inst->use_empty())
Inst->replaceAllUsesWith(InVal.first);
removeMSSA(Inst);
@@ -938,8 +1073,9 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
- isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, Inst)) {
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
// It is okay to have a LastStore to a different pointer here if MemorySSA
// tells us that the load and store are from the same memory generation.
// In that case, LastStore should keep its present value since we're
@@ -949,7 +1085,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
MemInst.getPointerOperand() ||
MSSA) &&
"can't have an intervening store if not using MemorySSA!");
- DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -980,13 +1120,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
!LastStoreMemInst.isVolatile() &&
"Violated invariant");
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
- DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
- << " due to: " << *Inst << '\n');
- removeMSSA(LastStore);
- LastStore->eraseFromParent();
- Changed = true;
- ++NumDSE;
- LastStore = nullptr;
+ LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ removeMSSA(LastStore);
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
}
// fallthrough - we can exploit information about this store
}
@@ -999,7 +1143,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic(), /*IsInvariant=*/false));
+ MemInst.isAtomic()));
// Remember that this was the last unordered store we saw for DSE. We
// don't yet handle DSE on ordered or volatile stores since we don't
@@ -1031,8 +1175,9 @@ bool EarlyCSE::run() {
// Process the root node.
nodesToProcess.push_back(new StackNode(
- AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
- DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ CurrentGeneration, DT.getRootNode(),
+ DT.getRootNode()->begin(), DT.getRootNode()->end()));
// Save the current generation.
unsigned LiveOutGeneration = CurrentGeneration;
@@ -1056,9 +1201,9 @@ bool EarlyCSE::run() {
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
- NodeToProcess->childGeneration(), child, child->begin(),
- child->end()));
+ new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls, NodeToProcess->childGeneration(),
+ child, child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
@@ -1097,7 +1242,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
namespace {
-/// \brief A simple and fast domtree-based CSE pass.
+/// A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 063df779a30b..117b19fb8a42 100644
--- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -12,10 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "flattencfg"
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
index b105ece8dc7c..f2828e80bc58 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -138,7 +138,7 @@ void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
// Helper - mark I as having been traversed, having range R.
void Float2IntPass::seen(Instruction *I, ConstantRange R) {
- DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
auto IT = SeenInsts.find(I);
if (IT != SeenInsts.end())
IT->second = std::move(R);
@@ -359,7 +359,7 @@ bool Float2IntPass::validateAndTransform() {
for (User *U : I->users()) {
Instruction *UI = dyn_cast<Instruction>(U);
if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
- DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
Fail = true;
break;
}
@@ -380,7 +380,7 @@ bool Float2IntPass::validateAndTransform() {
// lower limits, plus one so it can be signed.
unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
R.getUpper().getMinSignedBits()) + 1;
- DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
// If we've run off the realms of the exactly representable integers,
// the floating point result will differ from an integer approximation.
@@ -391,11 +391,12 @@ bool Float2IntPass::validateAndTransform() {
unsigned MaxRepresentableBits
= APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
if (MinBW > MaxRepresentableBits) {
- DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
continue;
}
if (MinBW > 64) {
- DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ LLVM_DEBUG(
+ dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
continue;
}
@@ -490,7 +491,7 @@ void Float2IntPass::cleanup() {
}
bool Float2IntPass::runImpl(Function &F) {
- DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
// Clear out all state.
ECs = EquivalenceClasses<Instruction*>();
SeenInsts.clear();
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index e2c1eaf58e43..1e0a22cb14b3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -38,7 +38,9 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -69,7 +71,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
#include <algorithm>
@@ -765,6 +766,15 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
if (SSAUpdate.HasValueForBlock(BB))
continue;
+ // If the value is the load that we will be eliminating, and the block it's
+ // available in is the block that the load is in, then don't add it as
+ // SSAUpdater will resolve the value to the relevant phi which may let it
+ // avoid phi construction entirely if there's actually only one value.
+ if (BB == LI->getParent() &&
+ ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) ||
+ (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI)))
+ continue;
+
SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
}
@@ -783,9 +793,10 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (Res->getType() != LoadTy) {
Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
- << *getSimpleValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
+ << " " << *getSimpleValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isCoercedLoadValue()) {
LoadInst *Load = getCoercedLoadValue();
@@ -799,20 +810,21 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
// but then there all of the operations based on it would need to be
// rehashed. Just leave the dead load around.
gvn.getMemDep().removeInstruction(Load);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
- << *getCoercedLoadValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset
+ << " " << *getCoercedLoadValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isMemIntrinValue()) {
Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
InsertPt, DL);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
- << " " << *getMemIntrinValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+ << " " << *getMemIntrinValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
} else {
assert(isUndefValue() && "Should be UndefVal");
- DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
return UndefValue::get(LoadTy);
}
assert(Res && "failed to materialize?");
@@ -825,7 +837,7 @@ static bool isLifetimeStart(const Instruction *Inst) {
return false;
}
-/// \brief Try to locate the three instruction involved in a missed
+/// Try to locate the three instruction involved in a missed
/// load-elimination case that is due to an intervening store.
static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
DominatorTree *DT,
@@ -914,13 +926,11 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
}
}
// Nothing known about this clobber, have to be conservative
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- LI->printAsOperand(dbgs());
- Instruction *I = DepInfo.getInst();
- dbgs() << " is clobbered by " << *I << '\n';
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ Instruction *I = DepInfo.getInst();
+ dbgs() << " is clobbered by " << *I << '\n';);
if (ORE->allowExtraAnalysis(DEBUG_TYPE))
reportMayClobberedLoad(LI, DepInfo, DT, ORE);
@@ -978,12 +988,10 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
}
// Unknown def - must be conservative
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- LI->printAsOperand(dbgs());
- dbgs() << " has unknown def " << *DepInst << '\n';
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';);
return false;
}
@@ -1065,7 +1073,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// It is illegal to move the array access to any point above the guard,
// because if the index is out of bounds we should deoptimize rather than
// access the array.
- // Check that there is no guard in this block above our intruction.
+ // Check that there is no guard in this block above our instruction.
if (!IsSafeToSpeculativelyExecute) {
auto It = FirstImplicitControlFlowInsts.find(TmpBB);
if (It != FirstImplicitControlFlowInsts.end()) {
@@ -1113,9 +1121,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If any predecessor block is an EH pad that does not allow non-PHI
// instructions before the terminator, we can't PRE the load.
if (Pred->getTerminator()->isEHPad()) {
- DEBUG(dbgs()
- << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
@@ -1125,15 +1133,16 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (Pred->getTerminator()->getNumSuccessors() != 1) {
if (isa<IndirectBrInst>(Pred->getTerminator())) {
- DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
if (LoadBB->isEHPad()) {
- DEBUG(dbgs()
- << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
@@ -1161,8 +1170,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
PredLoads[NewPred] = nullptr;
- DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
- << LoadBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
}
// Check if the load can safely be moved to all the unavailable predecessors.
@@ -1186,8 +1195,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If we couldn't find or insert a computation of this phi translated value,
// we fail PRE.
if (!LoadPtr) {
- DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
- << *LI->getPointerOperand() << "\n");
+ LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
CanDoPRE = false;
break;
}
@@ -1208,10 +1217,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Okay, we can eliminate this load by inserting a reload in the predecessor
// and using PHI construction to get the value in the other predecessors, do
// it.
- DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
- DEBUG(if (!NewInsts.empty())
- dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
- << *NewInsts.back() << '\n');
+ LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ LLVM_DEBUG(if (!NewInsts.empty()) dbgs()
+ << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back()
+ << '\n');
// Assign value numbers to the new instructions.
for (Instruction *I : NewInsts) {
@@ -1262,7 +1271,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
NewLoad));
MD->invalidateCachedPointerInfo(LoadPtr);
- DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
}
// Perform PHI construction.
@@ -1320,11 +1329,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// clobber in the current block. Reject this early.
if (NumDeps == 1 &&
!Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
- DEBUG(
- dbgs() << "GVN: non-local load ";
- LI->printAsOperand(dbgs());
- dbgs() << " has unknown dependencies\n";
- );
+ LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown dependencies\n";);
return false;
}
@@ -1353,7 +1359,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// load, then it is fully redundant and we can use PHI insertion to compute
// its value. Insert PHIs and remove the fully redundant value now.
if (UnavailableBlocks.empty()) {
- DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+ LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
// Perform PHI construction.
Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
@@ -1506,12 +1512,10 @@ bool GVN::processLoad(LoadInst *L) {
// Only handle the local case below
if (!Dep.isDef() && !Dep.isClobber()) {
// This might be a NonFuncLocal or an Unknown
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- L->printAsOperand(dbgs());
- dbgs() << " has unknown dependence\n";
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; L->printAsOperand(dbgs());
+ dbgs() << " has unknown dependence\n";);
return false;
}
@@ -1695,8 +1699,8 @@ bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
if (it != ReplaceWithConstMap.end()) {
assert(!isa<Constant>(Operand) &&
"Replacing constants with constants is invalid");
- DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second
- << " in instruction " << *Instr << '\n');
+ LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
+ << *it->second << " in instruction " << *Instr << '\n');
Instr->setOperand(OpNum, it->second);
Changed = true;
}
@@ -2038,7 +2042,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
unsigned Iteration = 0;
while (ShouldContinue) {
- DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+ LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
ShouldContinue = iterateOnFunction(F);
Changed |= ShouldContinue;
++Iteration;
@@ -2104,9 +2108,10 @@ bool GVN::processBlock(BasicBlock *BB) {
const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB);
for (auto *I : InstrsToErase) {
assert(I->getParent() == BB && "Removing instruction from wrong block?");
- DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ salvageDebugInfo(*I);
if (MD) MD->removeInstruction(I);
- DEBUG(verifyRemoved(I));
+ LLVM_DEBUG(verifyRemoved(I));
if (MaybeFirstICF == I) {
// We have erased the first ICF in block. The map needs to be updated.
InvalidateImplicitCF = true;
@@ -2288,7 +2293,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
PREInstr = CurInst->clone();
if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
// If we failed insertion, make sure we remove the instruction.
- DEBUG(verifyRemoved(PREInstr));
+ LLVM_DEBUG(verifyRemoved(PREInstr));
PREInstr->deleteValue();
return false;
}
@@ -2326,10 +2331,10 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
VN.erase(CurInst);
removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
- DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
if (MD)
MD->removeInstruction(CurInst);
- DEBUG(verifyRemoved(CurInst));
+ LLVM_DEBUG(verifyRemoved(CurInst));
bool InvalidateImplicitCF =
FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst;
// FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index 026fab5dbd3b..6d2b25cf6013 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -72,7 +73,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <iterator>
@@ -534,7 +534,7 @@ private:
if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
- if (firstInBB(NewPt, UD->getMemoryInst()))
+ if (!firstInBB(UD->getMemoryInst(), NewPt))
// Cannot move the load or store to NewPt above its definition in D.
return false;
@@ -570,7 +570,7 @@ private:
// The ides is inspired from:
// "Partial Redundancy Elimination in SSA Form"
// ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
- // They use similar idea in the forward graph to to find fully redundant and
+ // They use a similar idea in the forward graph to find fully redundant and
// partially redundant expressions, here it is used in the inverse graph to
// find fully anticipable instructions at merge point (post-dominator in
// the inverse CFG).
@@ -578,7 +578,7 @@ private:
// Returns true when the values are flowing out to each edge.
bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
- if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end()))
+ if (TI->getNumSuccessors() > (unsigned)size(C))
return false; // Not enough args in this CHI.
for (auto CHI : C) {
@@ -622,7 +622,7 @@ private:
// Iterate in reverse order to keep lower ranked values on the top.
for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
// Get the value of instruction I
- DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
+ LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
RenameStack[VI.first].push_back(VI.second);
}
}
@@ -636,7 +636,7 @@ private:
if (P == CHIBBs.end()) {
continue;
}
- DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
+ LLVM_DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
// A CHI is found (BB -> Pred is an edge in the CFG)
// Pop the stack until Top(V) = Ve.
auto &VCHI = P->second;
@@ -651,9 +651,9 @@ private:
DT->properlyDominates(Pred, si->second.back()->getParent())) {
C.Dest = BB; // Assign the edge
C.I = si->second.pop_back_val(); // Assign the argument
- DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
- << *C.I << ", VN: " << C.VN.first << ", "
- << C.VN.second);
+ LLVM_DEBUG(dbgs()
+ << "\nCHI Inserted in BB: " << C.Dest->getName() << *C.I
+ << ", VN: " << C.VN.first << ", " << C.VN.second);
}
// Move to next CHI of a different value
It = std::find_if(It, VCHI.end(),
@@ -748,11 +748,11 @@ private:
// TODO: Remove fully-redundant expressions.
// Get instruction from the Map, assume that all the Instructions
// with same VNs have same rank (this is an approximation).
- std::sort(Ranks.begin(), Ranks.end(),
- [this, &Map](const VNType &r1, const VNType &r2) {
- return (rank(*Map.lookup(r1).begin()) <
- rank(*Map.lookup(r2).begin()));
- });
+ llvm::sort(Ranks.begin(), Ranks.end(),
+ [this, &Map](const VNType &r1, const VNType &r2) {
+ return (rank(*Map.lookup(r1).begin()) <
+ rank(*Map.lookup(r2).begin()));
+ });
// - Sort VNs according to their rank, and start with lowest ranked VN
// - Take a VN and for each instruction with same VN
@@ -798,8 +798,8 @@ private:
// Ignore spurious PDFs.
if (DT->properlyDominates(IDFB, V[i]->getParent())) {
OutValue[IDFB].push_back(C);
- DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
- << ", for Insn: " << *V[i]);
+ LLVM_DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
+ << ", for Insn: " << *V[i]);
}
}
}
@@ -1200,6 +1200,7 @@ INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
"Early GVN Hoisting of Expressions", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 5594c29bbd9f..28c5940db1e0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -48,6 +48,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -71,7 +72,6 @@
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -239,7 +239,7 @@ public:
SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
- std::sort(Ops.begin(), Ops.end());
+ llvm::sort(Ops.begin(), Ops.end());
for (auto &P : Ops) {
Blocks.push_back(P.first);
Values.push_back(P.second);
@@ -361,7 +361,7 @@ public:
for (auto &U : I->uses())
op_push_back(U.getUser());
- std::sort(op_begin(), op_end());
+ llvm::sort(op_begin(), op_end());
}
void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
@@ -561,7 +561,8 @@ public:
GVNSink() = default;
bool run(Function &F) {
- DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
+ << "\n");
unsigned NumSunk = 0;
ReversePostOrderTraversal<Function*> RPOT(&F);
@@ -629,15 +630,15 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
auto Insts = *LRI;
- DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
- : Insts) {
+ LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
I->dump();
} dbgs() << " ]\n";);
DenseMap<uint32_t, unsigned> VNums;
for (auto *I : Insts) {
uint32_t N = VN.lookupOrAdd(I);
- DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
+ LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
if (N == ~0U)
return None;
VNums[N]++;
@@ -749,8 +750,8 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
}
unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
- DEBUG(dbgs() << "GVNSink: running on basic block ";
- BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
SmallVector<BasicBlock *, 4> Preds;
for (auto *B : predecessors(BBEnd)) {
auto *T = B->getTerminator();
@@ -761,7 +762,7 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
}
if (Preds.size() < 2)
return 0;
- std::sort(Preds.begin(), Preds.end());
+ llvm::sort(Preds.begin(), Preds.end());
unsigned NumOrigPreds = Preds.size();
// We can only sink instructions through unconditional branches.
@@ -794,23 +795,23 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
Candidates.begin(), Candidates.end(),
[](const SinkingInstructionCandidate &A,
const SinkingInstructionCandidate &B) { return A > B; });
- DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
- : Candidates) dbgs()
- << " " << C << "\n";);
+ LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
// Pick the top candidate, as long it is positive!
if (Candidates.empty() || Candidates.front().Cost <= 0)
return 0;
auto C = Candidates.front();
- DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n");
BasicBlock *InsertBB = BBEnd;
if (C.Blocks.size() < NumOrigPreds) {
- DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " -- Splitting edge to ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
if (!InsertBB) {
- DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n");
// Edge couldn't be split.
return 0;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index c4aeccb85ca7..ad1598d7b8bf 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -40,9 +40,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include <functional>
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
@@ -53,6 +55,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -62,9 +65,14 @@ namespace {
class GuardWideningImpl {
DominatorTree &DT;
- PostDominatorTree &PDT;
+ PostDominatorTree *PDT;
LoopInfo &LI;
+ /// Together, these describe the region of interest. This might be all of
+ /// the blocks within a function, or only a given loop's blocks and preheader.
+ DomTreeNode *Root;
+ std::function<bool(BasicBlock*)> BlockFilter;
+
/// The set of guards whose conditions have been widened into dominating
/// guards.
SmallVector<IntrinsicInst *, 16> EliminatedGuards;
@@ -205,39 +213,15 @@ class GuardWideningImpl {
}
public:
- explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT,
- LoopInfo &LI)
- : DT(DT), PDT(PDT), LI(LI) {}
+
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
+ LoopInfo &LI, DomTreeNode *Root,
+ std::function<bool(BasicBlock*)> BlockFilter)
+ : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {}
/// The entry point for this pass.
bool run();
};
-
-struct GuardWideningLegacyPass : public FunctionPass {
- static char ID;
- GuardWideningPass Impl;
-
- GuardWideningLegacyPass() : FunctionPass(ID) {
- initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- return GuardWideningImpl(
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
- getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- }
-};
-
}
bool GuardWideningImpl::run() {
@@ -246,9 +230,12 @@ bool GuardWideningImpl::run() {
DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock;
bool Changed = false;
- for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
+ for (auto DFI = df_begin(Root), DFE = df_end(Root);
DFI != DFE; ++DFI) {
auto *BB = (*DFI)->getBlock();
+ if (!BlockFilter(BB))
+ continue;
+
auto &CurrentList = GuardsInBlock[BB];
for (auto &I : *BB)
@@ -259,6 +246,7 @@ bool GuardWideningImpl::run() {
Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock);
}
+ assert(EliminatedGuards.empty() || Changed);
for (auto *II : EliminatedGuards)
if (!WidenedGuards.count(II))
II->eraseFromParent();
@@ -278,6 +266,8 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
// for the most profit.
for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
auto *CurBB = DFSI.getPath(i)->getBlock();
+ if (!BlockFilter(CurBB))
+ break;
auto *CurLoop = LI.getLoopFor(CurBB);
assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
@@ -312,9 +302,9 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
for (auto *Candidate : make_range(I, E)) {
auto Score =
computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop);
- DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
- << " and " << *Candidate->getArgOperand(0) << " is "
- << scoreTypeToString(Score) << "\n");
+ LLVM_DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
+ << " and " << *Candidate->getArgOperand(0) << " is "
+ << scoreTypeToString(Score) << "\n");
if (Score > BestScoreSoFar) {
BestScoreSoFar = Score;
BestSoFar = Candidate;
@@ -323,15 +313,16 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
}
if (BestScoreSoFar == WS_IllegalOrNegative) {
- DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
+ LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
return false;
}
assert(BestSoFar != GuardInst && "Should have never visited same guard!");
assert(DT.dominates(BestSoFar, GuardInst) && "Should be!");
- DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
- << " with score " << scoreTypeToString(BestScoreSoFar) << "\n");
+ LLVM_DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar)
+ << "\n");
widenGuard(BestSoFar, GuardInst->getArgOperand(0));
GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext()));
EliminatedGuards.push_back(GuardInst);
@@ -345,6 +336,8 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
bool HoistingOutOfLoop = false;
if (DominatingGuardLoop != DominatedGuardLoop) {
+ // Be conservative and don't widen into a sibling loop. TODO: If the
+ // sibling is colder, we should consider allowing this.
if (DominatingGuardLoop &&
!DominatingGuardLoop->contains(DominatedGuardLoop))
return WS_IllegalOrNegative;
@@ -355,9 +348,14 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard))
return WS_IllegalOrNegative;
- bool HoistingOutOfIf =
- !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent());
-
+ // If the guard was conditionally executed, it may never be reached
+ // dynamically. There are two potential downsides to hoisting it out of the
+ // conditionally executed region: 1) we may spuriously deopt without need and
+ // 2) we have the extra cost of computing the guard condition in the common
+ // case. At the moment, we really only consider the second in our heuristic
+ // here. TODO: evaluate cost model for spurious deopt
+ // NOTE: As written, this also lets us hoist right over another guard which
+ // is essentially just another spelling for control flow.
if (isWideningCondProfitable(DominatedGuard->getArgOperand(0),
DominatingGuard->getArgOperand(0)))
return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
@@ -365,7 +363,26 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
if (HoistingOutOfLoop)
return WS_Positive;
- return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral;
+ // Returns true if we might be hoisting above explicit control flow. Note
+ // that this completely ignores implicit control flow (guards, calls which
+ // throw, etc...). That choice appears arbitrary.
+ auto MaybeHoistingOutOfIf = [&]() {
+ auto *DominatingBlock = DominatingGuard->getParent();
+ auto *DominatedBlock = DominatedGuard->getParent();
+
+ // Same Block?
+ if (DominatedBlock == DominatingBlock)
+ return false;
+ // Obvious successor (common loop header/preheader case)
+ if (DominatedBlock == DominatingBlock->getUniqueSuccessor())
+ return false;
+ // TODO: diamond, triangle cases
+ if (!PDT) return true;
+ return !PDT->dominates(DominatedGuard->getParent(),
+ DominatingGuard->getParent());
+ };
+
+ return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral;
}
bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc,
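To ground the MaybeHoistingOutOfIf heuristic above, a small made-up example (guard intrinsic spelled as in this release):

    define void @f(i1 %x, i1 %c1, i1 %c2) {
    entry:
      call void (i1, ...) @llvm.experimental.guard(i1 %c1) [ "deopt"() ]
      br i1 %x, label %cold, label %cont
    cold:
      call void (i1, ...) @llvm.experimental.guard(i1 %c2) [ "deopt"() ]
      br label %cont
    cont:
      ret void
    }
    declare void @llvm.experimental.guard(i1, ...)

Widening %c2 into the entry guard would evaluate %c2 (and potentially deoptimize) even on paths that never reach %cold, so unless the combined condition is cheaper or the widening hoists out of a loop, the score comes out WS_IllegalOrNegative. The lambda returns false only in the trivial same-block and unique-successor cases, or when a post-dominator tree is available and shows the dominated guard's block post-dominates the dominating one.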
@@ -581,9 +598,9 @@ bool GuardWideningImpl::combineRangeChecks(
// CurrentChecks.size() will typically be 3 here, but so far there has been
// no need to hard-code that fact.
- std::sort(CurrentChecks.begin(), CurrentChecks.end(),
- [&](const GuardWideningImpl::RangeCheck &LHS,
- const GuardWideningImpl::RangeCheck &RHS) {
+ llvm::sort(CurrentChecks.begin(), CurrentChecks.end(),
+ [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
return LHS.getOffsetValue().slt(RHS.getOffsetValue());
});
@@ -651,19 +668,6 @@ bool GuardWideningImpl::combineRangeChecks(
return RangeChecksOut.size() != OldCount;
}
-PreservedAnalyses GuardWideningPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- if (!GuardWideningImpl(DT, PDT, LI).run())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
#ifndef NDEBUG
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
switch (WS) {
@@ -681,7 +685,82 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
}
#endif
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+/// Same as above, but restricted to a single loop at a time. Can be
+/// scheduled with other loop passes w/o breaking out of LPM
+struct LoopGuardWideningLegacyPass : public LoopPass {
+ static char ID;
+
+ LoopGuardWideningLegacyPass() : LoopPass(ID) {
+ initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ BasicBlock *RootBB = L->getLoopPredecessor();
+ if (!RootBB)
+ RootBB = L->getHeader();
+ auto BlockFilter = [&](BasicBlock *BB) {
+ return BB == RootBB || L->contains(BB);
+ };
+ return GuardWideningImpl(DT, PDT, LI,
+ DT.getNode(RootBB), BlockFilter).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+};
+}
+
char GuardWideningLegacyPass::ID = 0;
+char LoopGuardWideningLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
false, false)
@@ -691,6 +770,20 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
false, false)
+INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+
FunctionPass *llvm::createGuardWideningPass() {
return new GuardWideningLegacyPass();
}
+
+Pass *llvm::createLoopGuardWideningPass() {
+ return new LoopGuardWideningLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 221fe57581ca..8656e88b79cb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -43,6 +43,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
@@ -77,7 +78,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include <cassert>
@@ -210,8 +210,8 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
if (FromBase == ToBase)
return true;
- DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
- << *FromBase << " != " << *ToBase << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: GEP rewrite bail out " << *FromBase
+ << " != " << *ToBase << "\n");
return false;
}
@@ -653,8 +653,9 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
Value *ExitVal =
expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());
- DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
- << " LoopVal = " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal
+ << '\n'
+ << " LoopVal = " << *Inst << "\n");
if (!isValidRewrite(Inst, ExitVal)) {
DeadInsts.push_back(ExitVal);
@@ -1084,7 +1085,7 @@ Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
- DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
// Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
// about the narrow operand yet so must insert a [sz]ext. It is probably loop
@@ -1115,7 +1116,7 @@ Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
- DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1;
@@ -1315,8 +1316,8 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) {
/// This IV user cannot be widened. Replace this use of the original narrow IV
/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
- DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
- << " for user " << *DU.NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user "
+ << *DU.NarrowUse << "\n");
IRBuilder<> Builder(
getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
@@ -1396,8 +1397,8 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
UsePhi->replaceAllUsesWith(Trunc);
DeadInsts.emplace_back(UsePhi);
- DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
- << " to " << *WidePhi << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to "
+ << *WidePhi << "\n");
}
return nullptr;
}
@@ -1428,15 +1429,16 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// A wider extend was hidden behind a narrower one. This may induce
// another round of IV widening in which the intermediate IV becomes
// dead. It should be very rare.
- DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
- << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+ << " not wide enough to subsume " << *DU.NarrowUse
+ << "\n");
DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
NewDef = DU.NarrowUse;
}
}
if (NewDef != DU.NarrowUse) {
- DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
- << " replaced by " << *DU.WideDef << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+ << " replaced by " << *DU.WideDef << "\n");
++NumElimExt;
DU.NarrowUse->replaceAllUsesWith(NewDef);
DeadInsts.emplace_back(DU.NarrowUse);
@@ -1491,8 +1493,9 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// absolutely guarantee it. Hence the following failsafe check. In rare cases
// where it fails, we simply throw away the newly created wide use.
if (WideAddRec.first != SE->getSCEV(WideUse)) {
- DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
- << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first << "\n");
+ LLVM_DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": "
+ << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first
+ << "\n");
DeadInsts.emplace_back(WideUse);
return nullptr;
}
@@ -1597,7 +1600,7 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
WideInc->setDebugLoc(OrigInc->getDebugLoc());
}
- DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+ LLVM_DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
++NumWidened;
// Traverse the def-use chain using a worklist starting at the original IV.
@@ -2231,12 +2234,12 @@ linearFunctionTestReplace(Loop *L,
else
P = ICmpInst::ICMP_EQ;
- DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
- << " LHS:" << *CmpIndVar << '\n'
- << " op:\t"
- << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
- << " RHS:\t" << *ExitCnt << "\n"
- << " IVCount:\t" << *IVCount << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
+ << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << " IVCount:\t" << *IVCount << "\n");
IRBuilder<> Builder(BI);
@@ -2272,7 +2275,7 @@ linearFunctionTestReplace(Loop *L,
NewLimit = Start + Count;
ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
- DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
+ LLVM_DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
} else {
// We try to extend trip count first. If that doesn't work we truncate IV.
// Zext(trunc(IV)) == IV implies equivalence of the following two:
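
Most of the IndVarSimplify hunks above are mechanical DEBUG -> LLVM_DEBUG renames. Both macros compile away in release builds; with assertions enabled the output is gated by -debug or -debug-only=<DEBUG_TYPE>. A minimal sketch of the pattern, assuming an asserts-enabled LLVM build (the helper reportWiden is invented for illustration):

// Sketch of the LLVM_DEBUG pattern used throughout the hunks above.
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "indvars"   // selects output under -debug-only=indvars

static void reportWiden(unsigned OldBits, unsigned NewBits) {
  // Prints nothing in release builds; in +Asserts builds prints only when the
  // pass runs under -debug or -debug-only=indvars.
  LLVM_DEBUG(llvm::dbgs() << "INDVARS: widening from i" << OldBits << " to i"
                          << NewBits << "\n");
}

#undef DEBUG_TYPE
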
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index cf98088111be..e2f29705f2dd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -43,6 +43,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
@@ -52,6 +53,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -179,10 +181,7 @@ public:
OS << " Step: ";
Step->print(OS);
OS << " End: ";
- if (End)
- End->print(OS);
- else
- OS << "(null)";
+ End->print(OS);
OS << "\n CheckUse: ";
getCheckUse()->getUser()->print(OS);
OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
@@ -196,7 +195,7 @@ public:
Use *getCheckUse() const { return CheckUse; }
/// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
- /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+ /// R.getEnd() le R.getBegin(), then R denotes the empty range.
class Range {
const SCEV *Begin;
@@ -238,17 +237,31 @@ public:
/// checks, and hence don't end up in \p Checks.
static void
extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
- BranchProbabilityInfo &BPI,
+ BranchProbabilityInfo *BPI,
SmallVectorImpl<InductiveRangeCheck> &Checks);
};
-class InductiveRangeCheckElimination : public LoopPass {
+class InductiveRangeCheckElimination {
+ ScalarEvolution &SE;
+ BranchProbabilityInfo *BPI;
+ DominatorTree &DT;
+ LoopInfo &LI;
+
+public:
+ InductiveRangeCheckElimination(ScalarEvolution &SE,
+ BranchProbabilityInfo *BPI, DominatorTree &DT,
+ LoopInfo &LI)
+ : SE(SE), BPI(BPI), DT(DT), LI(LI) {}
+
+ bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
+};
+
+class IRCELegacyPass : public LoopPass {
public:
static char ID;
- InductiveRangeCheckElimination() : LoopPass(ID) {
- initializeInductiveRangeCheckEliminationPass(
- *PassRegistry::getPassRegistry());
+ IRCELegacyPass() : LoopPass(ID) {
+ initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -261,14 +274,14 @@ public:
} // end anonymous namespace
-char InductiveRangeCheckElimination::ID = 0;
+char IRCELegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
+INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
"Inductive range check elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
- "Inductive range check elimination", false, false)
+INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
+ false, false)
StringRef InductiveRangeCheck::rangeCheckKindToStr(
InductiveRangeCheck::RangeCheckKind RCK) {
@@ -299,13 +312,8 @@ InductiveRangeCheck::RangeCheckKind
InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
ScalarEvolution &SE, Value *&Index,
Value *&Length, bool &IsSigned) {
- auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
- const SCEV *S = SE.getSCEV(V);
- if (isa<SCEVCouldNotCompute>(S))
- return false;
-
- return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
- SE.isKnownNonNegative(S);
+ auto IsLoopInvariant = [&SE, L](Value *V) {
+ return SE.isLoopInvariant(SE.getSCEV(V), L);
};
ICmpInst::Predicate Pred = ICI->getPredicate();
@@ -337,7 +345,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
return RANGE_CHECK_LOWER;
}
- if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ if (IsLoopInvariant(LHS)) {
Index = RHS;
Length = LHS;
return RANGE_CHECK_UPPER;
@@ -349,7 +357,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_UGT:
IsSigned = false;
- if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ if (IsLoopInvariant(LHS)) {
Index = RHS;
Length = LHS;
return RANGE_CHECK_BOTH;
@@ -394,8 +402,23 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
if (!IsAffineIndex)
return;
+ const SCEV *End = nullptr;
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (Length)
+ End = SE.getSCEV(Length);
+ else {
+ assert(RCKind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+ // So far we can only reach this point for a signed range check. This may
+ // change in the future, in which case we will need to pick the unsigned max
+ // for the unsigned range check.
+ unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ End = SIntMax;
+ }
+
InductiveRangeCheck IRC;
- IRC.End = Length ? SE.getSCEV(Length) : nullptr;
+ IRC.End = End;
IRC.Begin = IndexAddRec->getStart();
IRC.Step = IndexAddRec->getStepRecurrence(SE);
IRC.CheckUse = &ConditionUse;
@@ -405,15 +428,15 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
}
void InductiveRangeCheck::extractRangeChecksFromBranch(
- BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
SmallVectorImpl<InductiveRangeCheck> &Checks) {
if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
return;
BranchProbability LikelyTaken(15, 16);
- if (!SkipProfitabilityChecks &&
- BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ if (!SkipProfitabilityChecks && BPI &&
+ BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
return;
SmallPtrSet<Value *, 8> Visited;
@@ -504,9 +527,8 @@ struct LoopStructure {
}
static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
- BranchProbabilityInfo &BPI,
- Loop &,
- const char *&);
+ BranchProbabilityInfo *BPI,
+ Loop &, const char *&);
};
/// This class is used to constrain loops to run within a given iteration space.
@@ -573,7 +595,7 @@ class LoopConstrainer {
// Create the appropriate loop structure needed to describe a cloned copy of
// `Original`. The clone is described by `VM`.
Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM);
+ ValueToValueMapTy &VM, bool IsSubloop);
// Rewrite the iteration space of the loop denoted by (LS, Preheader). The
// iteration space of the rewritten loop ends at ExitLoopAt. The start of the
@@ -625,8 +647,8 @@ class LoopConstrainer {
LLVMContext &Ctx;
ScalarEvolution &SE;
DominatorTree &DT;
- LPPassManager &LPM;
LoopInfo &LI;
+ function_ref<void(Loop *, bool)> LPMAddNewLoop;
// Information about the original loop we started out with.
Loop &OriginalLoop;
@@ -646,12 +668,13 @@ class LoopConstrainer {
LoopStructure MainLoopStructure;
public:
- LoopConstrainer(Loop &L, LoopInfo &LI, LPPassManager &LPM,
+ LoopConstrainer(Loop &L, LoopInfo &LI,
+ function_ref<void(Loop *, bool)> LPMAddNewLoop,
const LoopStructure &LS, ScalarEvolution &SE,
DominatorTree &DT, InductiveRangeCheck::Range R)
: F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
- SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R),
- MainLoopStructure(LS) {}
+ SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L),
+ Range(R), MainLoopStructure(LS) {}
// Entry point for the algorithm. Returns true on success.
bool run();
@@ -666,56 +689,141 @@ void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
PN->setIncomingBlock(i, ReplaceBy);
}
-static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) {
- APInt Max = Signed ?
- APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) :
- APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(Max) &&
- SE.getUnsignedRange(S).contains(Max);
+static bool CannotBeMaxInLoop(const SCEV *BoundSCEV, Loop *L,
+ ScalarEvolution &SE, bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV,
+ SE.getConstant(Max));
+}
+
+/// Given a loop with a decreasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeDecreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ assert(SE.isKnownNegative(Step) && "expecting negative step");
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check to verify that the induction variable
+ // lies within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 &&
+ "LatchBrExitIdx should be either 0 or 1");
+
+ const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
+
+ const SCEV *MinusOne =
+ SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
+
+}
+
+/// Given a loop with an increasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeIncreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check to verify that the induction variable
+ // lies within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
+
+ const SCEV *StepMinusOne =
+ SE.getMinusSCEV(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
+
+ return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
+ SE.getAddExpr(BoundSCEV, Step)) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
}
-static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
- bool Signed) {
- // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX.
- assert(SE.isKnownNonNegative(S2) &&
- "We expected the 2nd arg to be non-negative!");
- const SCEV *Max = SE.getConstant(
- Signed ? APInt::getSignedMaxValue(
- cast<IntegerType>(S1->getType())->getBitWidth())
- : APInt::getMaxValue(
- cast<IntegerType>(S1->getType())->getBitWidth()));
- const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
- return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
- S1, CapForS1);
+static bool CannotBeMinInLoop(const SCEV *BoundSCEV, Loop *L,
+ ScalarEvolution &SE, bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV,
+ SE.getConstant(Min));
}
-static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) {
- APInt Min = Signed ?
- APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) :
- APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(Min) &&
- SE.getUnsignedRange(S).contains(Min);
+static bool isKnownNonNegativeInLoop(const SCEV *BoundSCEV, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(BoundSCEV->getType());
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, BoundSCEV, Zero);
}
-static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
- bool Signed) {
- // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN.
- assert(SE.isKnownNonPositive(S2) &&
- "We expected the 2nd arg to be non-positive!");
- const SCEV *Max = SE.getConstant(
- Signed ? APInt::getSignedMinValue(
- cast<IntegerType>(S1->getType())->getBitWidth())
- : APInt::getMinValue(
- cast<IntegerType>(S1->getType())->getBitWidth()));
- const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
- return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT,
- S1, CapForS1);
+static bool isKnownNegativeInLoop(const SCEV *BoundSCEV, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(BoundSCEV->getType());
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, BoundSCEV, Zero);
}
Optional<LoopStructure>
LoopStructure::parseLoopStructure(ScalarEvolution &SE,
- BranchProbabilityInfo &BPI,
- Loop &L, const char *&FailureReason) {
+ BranchProbabilityInfo *BPI, Loop &L,
+ const char *&FailureReason) {
if (!L.isLoopSimplifyForm()) {
FailureReason = "loop not in LoopSimplify form";
return None;
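
The isSafeDecreasingBound / isSafeIncreasingBound helpers introduced in this hunk handle the LatchBrExitIdx == 0 case by demanding headroom of |Step| - 1 between the loop bound and the type's extreme value, since the induction variable may step past the bound by up to |Step| - 1 before the exit test fires. A standalone arithmetic illustration of that check for the increasing, signed, 8-bit case (values are made up; this is not LLVM code):

// Illustration of the "Limit = MAX - (Step - 1)" headroom check used by
// isSafeIncreasingBound above, on 8-bit signed values.
#include <cassert>
#include <cstdint>

int main() {
  const int Max = INT8_MAX;            // 127, signed max for i8
  const int Step = 4;                  // positive step of the induction variable
  // With LatchBrExitIdx == 0 the IV may overshoot the bound by up to Step - 1,
  // so the transform requires Bound < Max - (Step - 1) to rule out wrapping.
  const int Limit = Max - (Step - 1);  // 124
  const int SafeBound = 120, UnsafeBound = 126;
  assert(SafeBound < Limit);
  assert(!(UnsafeBound < Limit));
  assert(SafeBound + (Step - 1) <= Max);    // overshoot stays representable
  assert(UnsafeBound + (Step - 1) > Max);   // overshoot would exceed i8 max
  return 0;
}
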
@@ -750,7 +858,8 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
BranchProbability ExitProbability =
- BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+ BPI ? BPI->getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx)
+ : BranchProbability::getZero();
if (!SkipProfitabilityChecks &&
ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
@@ -816,43 +925,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
};
- // Here we check whether the suggested AddRec is an induction variable that
- // can be handled (i.e. with known constant step), and if yes, calculate its
- // step and identify whether it is increasing or decreasing.
- auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing,
- ConstantInt *&StepCI) {
- if (!AR->isAffine())
- return false;
-
- // Currently we only work with induction variables that have been proved to
- // not wrap. This restriction can potentially be lifted in the future.
-
- if (!HasNoSignedWrap(AR))
- return false;
-
- if (const SCEVConstant *StepExpr =
- dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
- StepCI = StepExpr->getValue();
- assert(!StepCI->isZero() && "Zero step?");
- IsIncreasing = !StepCI->isNegative();
- return true;
- }
-
- return false;
- };
-
// `ICI` is interpreted as taking the backedge if the *next* value of the
// induction variable satisfies some constraint.
const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
- bool IsIncreasing = false;
- bool IsSignedPredicate = true;
- ConstantInt *StepCI;
- if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) {
+ if (!IndVarBase->isAffine()) {
FailureReason = "LHS in icmp not induction variable";
return None;
}
+ const SCEV* StepRec = IndVarBase->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(StepRec)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+ ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue();
+
+ if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) {
+ FailureReason = "LHS in icmp needs nsw for equality predicates";
+ return None;
+ }
+ assert(!StepCI->isZero() && "Zero step?");
+ bool IsIncreasing = !StepCI->isNegative();
+ bool IsSignedPredicate = ICmpInst::isSigned(Pred);
const SCEV *StartNext = IndVarBase->getStart();
const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
@@ -870,22 +965,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
// If both parts are known non-negative, it is profitable to use
// unsigned comparison in increasing loop. This allows us to make the
// comparison check against "RightSCEV + 1" more optimistic.
- if (SE.isKnownNonNegative(IndVarStart) &&
- SE.isKnownNonNegative(RightSCEV))
+ if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
+ isKnownNonNegativeInLoop(RightSCEV, &L, SE))
Pred = ICmpInst::ICMP_ULT;
else
Pred = ICmpInst::ICMP_SLT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
// while (true) { while (true) {
// if (++i == len) ---> if (++i > len - 1)
// break; break;
// ... ...
// } }
- // TODO: Insert ICMP_UGT if both are non-negative?
- Pred = ICmpInst::ICMP_SGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ CannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/false)) {
+ Pred = ICmpInst::ICMP_UGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ } else if (CannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/true)) {
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
}
}
@@ -899,36 +1001,18 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return None;
}
- IsSignedPredicate =
- Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
-
+ IsSignedPredicate = ICmpInst::isSigned(Pred);
if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
FailureReason = "unsigned latch conditions are explicitly prohibited";
return None;
}
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
-
+ if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe loop bounds";
+ return None;
+ }
if (LatchBrExitIdx == 0) {
- const SCEV *StepMinusOne = SE.getMinusSCEV(Step,
- SE.getOne(Step->getType()));
- if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) {
- // TODO: this restriction is easily removable -- we just have to
- // remember that the icmp was an slt and not an sle.
- FailureReason = "limit may overflow when coercing le to lt";
- return None;
- }
-
- if (!SE.isLoopEntryGuardedByCond(
- &L, BoundPred, IndVarStart,
- SE.getAddExpr(RightSCEV, Step))) {
- FailureReason = "Induction variable start not bounded by upper limit";
- return None;
- }
-
// We need to increase the right value unless we have already decreased
// it virtually when we replaced EQ with SGT.
if (!DecreasedRightValueByOne) {
@@ -936,10 +1020,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
RightValue = B.CreateAdd(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
- FailureReason = "Induction variable start not bounded by upper limit";
- return None;
- }
assert(!DecreasedRightValueByOne &&
"Right value can be decreased only for LatchBrExitIdx == 0!");
}
@@ -955,17 +1035,22 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
// that both operands are non-negative, because it will only pessimize
// our check against "RightSCEV - 1".
Pred = ICmpInst::ICMP_SGT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
// while (true) { while (true) {
// if (--i == len) ---> if (--i < len + 1)
// break; break;
// ... ...
// } }
- // TODO: Insert ICMP_ULT if both are non-negative?
- Pred = ICmpInst::ICMP_SLT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
+ Pred = ICmpInst::ICMP_ULT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ } else if (CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
}
}
@@ -988,27 +1073,13 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return None;
}
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+ if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe bounds";
+ return None;
+ }
if (LatchBrExitIdx == 0) {
- const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
- if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) {
- // TODO: this restriction is easily removable -- we just have to
- // remember that the icmp was an sgt and not an sge.
- FailureReason = "limit may overflow when coercing ge to gt";
- return None;
- }
-
- if (!SE.isLoopEntryGuardedByCond(
- &L, BoundPred, IndVarStart,
- SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
- FailureReason = "Induction variable start not bounded by lower limit";
- return None;
- }
-
// We need to decrease the right value unless we have already increased
// it virtually when we replaced EQ with SLT.
if (!IncreasedRightValueByOne) {
@@ -1016,10 +1087,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
RightValue = B.CreateSub(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
- FailureReason = "Induction variable start not bounded by lower limit";
- return None;
- }
assert(!IncreasedRightValueByOne &&
"Right value can be increased only for LatchBrExitIdx == 0!");
}
@@ -1381,13 +1448,14 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
}
Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM) {
+ ValueToValueMapTy &VM,
+ bool IsSubloop) {
Loop &New = *LI.AllocateLoop();
if (Parent)
Parent->addChildLoop(&New);
else
LI.addTopLevelLoop(&New);
- LPM.addLoop(New);
+ LPMAddNewLoop(&New, IsSubloop);
// Add all of the blocks in Original to the new loop.
for (auto *BB : Original->blocks())
@@ -1396,7 +1464,7 @@ Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
// Add all of the subloops to the new loop.
for (Loop *SubLoop : *Original)
- createClonedLoopStructure(SubLoop, &New, VM);
+ createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);
return &New;
}
@@ -1414,7 +1482,7 @@ bool LoopConstrainer::run() {
bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
if (!MaybeSR.hasValue()) {
- DEBUG(dbgs() << "irce: could not compute subranges\n");
+ LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
return false;
}
@@ -1446,19 +1514,22 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitPreLoopAtSCEV = *SR.LowLimit;
else {
- if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) {
- DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
- << "\n");
+ if (CannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = "
+ << *(*SR.HighLimit) << "\n");
return false;
}
- ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
}
if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
- DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " preloop exit limit " << *ExitPreLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " preloop exit limit " << *ExitPreLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
return false;
}
@@ -1472,19 +1543,22 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitMainLoopAtSCEV = *SR.HighLimit;
else {
- if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) {
- DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
- << "\n");
+ if (CannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = "
+ << *(*SR.LowLimit) << "\n");
return false;
}
- ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
}
if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
- DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " main loop exit limit " << *ExitMainLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " main loop exit limit " << *ExitMainLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
return false;
}
@@ -1546,13 +1620,15 @@ bool LoopConstrainer::run() {
// LI when LoopSimplifyForm is generated.
Loop *PreL = nullptr, *PostL = nullptr;
if (!PreLoop.Blocks.empty()) {
- PreL = createClonedLoopStructure(
- &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
+ PreL = createClonedLoopStructure(&OriginalLoop,
+ OriginalLoop.getParentLoop(), PreLoop.Map,
+ /* IsSubLoop */ false);
}
if (!PostLoop.Blocks.empty()) {
- PostL = createClonedLoopStructure(
- &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ PostL =
+ createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
+ PostLoop.Map, /* IsSubLoop */ false);
}
// This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
@@ -1618,32 +1694,34 @@ InductiveRangeCheck::computeSafeIterationSpace(
unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
- // Substract Y from X so that it does not go through border of the IV
+ // Subtract Y from X so that it does not go through border of the IV
// iteration space. Mathematically, it is equivalent to:
//
- // ClampedSubstract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
+ // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
//
- // In [1], 'X - Y' is a mathematical substraction (result is not bounded to
+ // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
// any width of bit grid). But after we take min/max, the result is
// guaranteed to be within [INT_MIN, INT_MAX].
//
// In [1], INT_MAX and INT_MIN are respectively signed and unsigned max/min
// values, depending on type of latch condition that defines IV iteration
// space.
- auto ClampedSubstract = [&](const SCEV *X, const SCEV *Y) {
- assert(SE.isKnownNonNegative(X) &&
- "We can only substract from values in [0; SINT_MAX]!");
+ auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
+ // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
+ // This is required to ensure that SINT_MAX - X does not overflow signed and
+ // that X - Y does not overflow unsigned if Y is negative. Can we lift this
+ // restriction and make it work for negative X as well?
if (IsLatchSigned) {
// X is a number from signed range, Y is interpreted as signed.
// Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
// thing we should care about is that we didn't cross SINT_MAX.
- // So, if Y is positive, we substract Y safely.
+ // So, if Y is positive, we subtract Y safely.
// Rule 1: Y > 0 ---> Y.
- // If 0 <= -Y <= (SINT_MAX - X), we substract Y safely.
+ // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
// Rule 2: Y >=s (X - SINT_MAX) ---> Y.
- // If 0 <= (SINT_MAX - X) < -Y, we can only substract (X - SINT_MAX).
+ // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
// Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
- // It gives us smax(Y, X - SINT_MAX) to substract in all cases.
+ // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
SCEV::FlagNSW);
@@ -1651,29 +1729,45 @@ InductiveRangeCheck::computeSafeIterationSpace(
// X is a number from unsigned range, Y is interpreted as signed.
// Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
// thing we should care about is that we didn't cross zero.
- // So, if Y is negative, we substract Y safely.
+ // So, if Y is negative, we subtract Y safely.
// Rule 1: Y <s 0 ---> Y.
- // If 0 <= Y <= X, we substract Y safely.
+ // If 0 <= Y <= X, we subtract Y safely.
// Rule 2: Y <=s X ---> Y.
- // If 0 <= X < Y, we should stop at 0 and can only substract X.
+ // If 0 <= X < Y, we should stop at 0 and can only subtract X.
// Rule 3: Y >s X ---> X.
- // It gives us smin(X, Y) to substract in all cases.
+ // It gives us smin(X, Y) to subtract in all cases.
return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
};
const SCEV *M = SE.getMinusSCEV(C, A);
const SCEV *Zero = SE.getZero(M->getType());
- const SCEV *Begin = ClampedSubstract(Zero, M);
- const SCEV *L = nullptr;
- // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
- // We can potentially do much better here.
- if (const SCEV *EndLimit = getEnd())
- L = EndLimit;
- else {
- assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
- L = SIntMax;
- }
- const SCEV *End = ClampedSubstract(L, M);
+ // This function returns a SCEV equal to 1 if X is non-negative, 0 otherwise.
+ auto SCEVCheckNonNegative = [&](const SCEV *X) {
+ const Loop *L = IndVar->getLoop();
+ const SCEV *One = SE.getOne(X->getType());
+ // Can we trivially prove that X is a non-negative or negative value?
+ if (isKnownNonNegativeInLoop(X, L, SE))
+ return One;
+ else if (isKnownNegativeInLoop(X, L, SE))
+ return Zero;
+ // If not, we will have to figure it out at run time.
+ // The function smax(smin(X, 0), -1) + 1 equals 1 if X >= 0 and 0 if X < 0.
+ const SCEV *NegOne = SE.getNegativeSCEV(One);
+ return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
+ };
+ // FIXME: Current implementation of ClampedSubtract implicitly assumes that
+ // X is non-negative (in sense of a signed value). We need to re-implement
+ // this function in a way that it will correctly handle negative X as well.
+ // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
+ // end up with a negative X and produce wrong results. So currently we ensure
+ // that if getEnd() is negative then both ends of the safe range are zero.
+ // Note that this may pessimize elimination of unsigned range checks against
+ // negative values.
+ const SCEV *REnd = getEnd();
+ const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
+
+ const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
+ const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);
return InductiveRangeCheck::Range(Begin, End);
}
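
Two bits of arithmetic from the hunks above, restated in isolation: ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX) evaluated without wrapping, and the fallback indicator smax(smin(X, 0), -1) + 1 that SCEVCheckNonNegative emits when it cannot prove the sign of X statically. A plain C++ check of both on 32-bit values using 64-bit intermediates (illustrative only, not the SCEV-based code):

// Standalone check of the two formulas used above.
#include <algorithm>
#include <cassert>
#include <cstdint>

// ClampedSubtract(X, Y) = min(max(X - Y, INT32_MIN), INT32_MAX), with the
// subtraction done in 64 bits so it matches the "mathematical" result.
static int32_t clampedSubtract(int32_t X, int32_t Y) {
  int64_t Wide = int64_t(X) - int64_t(Y);
  Wide = std::max<int64_t>(Wide, INT32_MIN);
  Wide = std::min<int64_t>(Wide, INT32_MAX);
  return int32_t(Wide);
}

// smax(smin(X, 0), -1) + 1 evaluates to 1 when X >= 0 and to 0 when X < 0.
static int32_t nonNegativeIndicator(int32_t X) {
  return std::max(std::min(X, 0), -1) + 1;
}

int main() {
  assert(clampedSubtract(10, 3) == 7);                 // ordinary case
  assert(clampedSubtract(INT32_MIN, 5) == INT32_MIN);  // clamped at the bottom
  assert(clampedSubtract(INT32_MAX, -5) == INT32_MAX); // clamped at the top
  assert(nonNegativeIndicator(0) == 1);
  assert(nonNegativeIndicator(42) == 1);
  assert(nonNegativeIndicator(-1) == 0);
  return 0;
}
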
@@ -1735,26 +1829,56 @@ IntersectUnsignedRange(ScalarEvolution &SE,
return Ret;
}
-bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+PreservedAnalyses IRCEPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function *F = L.getHeader()->getParent();
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
+ InductiveRangeCheckElimination IRCE(AR.SE, BPI, AR.DT, AR.LI);
+ auto LPMAddNewLoop = [&U](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ U.addSiblingLoops(NL);
+ };
+ bool Changed = IRCE.run(&L, LPMAddNewLoop);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+bool IRCELegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
return false;
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+ auto LPMAddNewLoop = [&LPM](Loop *NL, bool /* IsSubLoop */) {
+ LPM.addLoop(*NL);
+ };
+ return IRCE.run(L, LPMAddNewLoop);
+}
+
+bool InductiveRangeCheckElimination::run(
+ Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
if (L->getBlocks().size() >= LoopSizeCutoff) {
- DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+ LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
return false;
}
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
- DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
return false;
}
LLVMContext &Context = Preheader->getContext();
SmallVector<InductiveRangeCheck, 16> RangeChecks;
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
for (auto BBI : L->getBlocks())
if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
@@ -1772,7 +1896,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
IRC.print(OS);
};
- DEBUG(PrintRecognizedRangeChecks(dbgs()));
+ LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));
if (PrintRangeChecks)
PrintRecognizedRangeChecks(errs());
@@ -1781,8 +1905,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
Optional<LoopStructure> MaybeLoopStructure =
LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
if (!MaybeLoopStructure.hasValue()) {
- DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
- << "\n";);
+ LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
+ << FailureReason << "\n";);
return false;
}
LoopStructure LS = MaybeLoopStructure.getValue();
@@ -1820,9 +1944,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!SafeIterRange.hasValue())
return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LPM,
- LS, SE, DT, SafeIterRange.getValue());
+ LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
+ SafeIterRange.getValue());
bool Changed = LC.run();
if (Changed) {
@@ -1833,7 +1956,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
L->print(dbgs());
};
- DEBUG(PrintConstrainedLoopInfo());
+ LLVM_DEBUG(PrintConstrainedLoopInfo());
if (PrintChangedLoops)
PrintConstrainedLoopInfo();
@@ -1852,5 +1975,5 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
}
Pass *llvm::createInductiveRangeCheckEliminationPass() {
- return new InductiveRangeCheckElimination;
+ return new IRCELegacyPass();
}
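
The IRCE rewrite above uses a pattern repeated across these files: the transformation lives in a pass-manager-agnostic impl class, and the legacy LoopPass and the new-PM pass each collect their analyses and forward newly created loops through a callback (LPM.addLoop vs. LPMUpdater::addSiblingLoops). A dependency-free sketch of that shape only; every name below (MyImpl, runLegacy, runNewPM) is invented:

// Hypothetical sketch of the "shared impl + two pass-manager drivers" shape.
#include <functional>
#include <iostream>

struct Loop { int Id; };   // stand-in for llvm::Loop

// Pass-manager-agnostic implementation: reports new loops through a callback
// instead of talking to LPPassManager or LPMUpdater directly.
class MyImpl {
public:
  using AddLoopFn = std::function<void(Loop *, bool /*IsSubloop*/)>;
  bool run(Loop *L, AddLoopFn AddNewLoop) {
    Loop PreLoop{L->Id * 10};                  // pretend a pre-loop was split off
    AddNewLoop(&PreLoop, /*IsSubloop=*/false); // hand it back to the driver
    return true;
  }
};

// Legacy-PM driver: the real pass would call LPM.addLoop(*NL) here.
bool runLegacy(Loop *L) {
  return MyImpl().run(L, [](Loop *NL, bool) {
    std::cout << "legacy driver queues loop " << NL->Id << "\n";
  });
}

// New-PM driver: the real pass calls U.addSiblingLoops(NL) for top-level loops.
bool runNewPM(Loop *L) {
  return MyImpl().run(L, [](Loop *NL, bool IsSubloop) {
    if (!IsSubloop)
      std::cout << "new-PM driver registers sibling loop " << NL->Id << "\n";
  });
}

int main() {
  Loop L{1};
  runLegacy(&L);
  runNewPM(&L);
  return 0;
}
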
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 7d66c0f73821..fbbc09eb487f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -97,6 +97,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -121,7 +122,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <iterator>
@@ -140,7 +140,7 @@ namespace {
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
-/// \brief InferAddressSpaces
+/// InferAddressSpaces
class InferAddressSpaces : public FunctionPass {
/// Target-specific address space whose uses should be replaced if
/// possible.
@@ -260,7 +260,10 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:{
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile || !IsVolatile->isZero())
return false;
@@ -289,6 +292,9 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
case Intrinsic::objectsize:
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
PostorderStack, Visited);
break;
@@ -647,13 +653,13 @@ void InferAddressSpaces::inferAddressSpaces(
// Tries to update the address space of the stack top according to the
// address spaces of its operands.
- DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
if (!NewAS.hasValue())
continue;
// If any updates are made, grabs its users to the worklist because
// their address spaces can also be possibly updated.
- DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
(*InferredAddrSpace)[V] = NewAS.getValue();
for (Value *User : V->users()) {
@@ -779,7 +785,7 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
B.CreateMemSet(NewV, MSI->getValue(),
- MSI->getLength(), MSI->getAlignment(),
+ MSI->getLength(), MSI->getDestAlignment(),
false, // isVolatile
TBAA, ScopeMD, NoAliasMD);
} else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
@@ -795,14 +801,16 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
if (isa<MemCpyInst>(MTI)) {
MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
- B.CreateMemCpy(Dest, Src, MTI->getLength(),
- MTI->getAlignment(),
+ B.CreateMemCpy(Dest, MTI->getDestAlignment(),
+ Src, MTI->getSourceAlignment(),
+ MTI->getLength(),
false, // isVolatile
TBAA, TBAAStruct, ScopeMD, NoAliasMD);
} else {
assert(isa<MemMoveInst>(MTI));
- B.CreateMemMove(Dest, Src, MTI->getLength(),
- MTI->getAlignment(),
+ B.CreateMemMove(Dest, MTI->getDestAlignment(),
+ Src, MTI->getSourceAlignment(),
+ MTI->getLength(),
false, // isVolatile
TBAA, ScopeMD, NoAliasMD);
}
@@ -893,15 +901,15 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
if (NewV == nullptr)
continue;
- DEBUG(dbgs() << "Replacing the uses of " << *V
- << "\n with\n " << *NewV << '\n');
+ LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n with\n "
+ << *NewV << '\n');
if (Constant *C = dyn_cast<Constant>(V)) {
Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
C->getType());
if (C != Replace) {
- DEBUG(dbgs() << "Inserting replacement const cast: "
- << Replace << ": " << *Replace << '\n');
+ LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
+ << ": " << *Replace << '\n');
C->replaceAllUsesWith(Replace);
V = Replace;
}
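
The memset/memcpy/memmove hunks above follow an IRBuilder API change: creation now takes separate destination and source alignments instead of one shared alignment. A hedged sketch of the call shape as used in the diff; the helper name is invented, and the unsigned-alignment overload is the one at this LLVM revision (later releases moved to MaybeAlign):

// Sketch of the separate-alignment CreateMemCpy call used in the hunk above.
// Compiles only against LLVM headers of this revision.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Invented helper: re-emit a memcpy with a rewritten destination pointer.
static void cloneMemCpyWithNewDest(MemTransferInst *MTI, Value *NewDest) {
  IRBuilder<> B(MTI);   // insert the new intrinsic right before the old one
  B.CreateMemCpy(NewDest, MTI->getDestAlignment(),
                 MTI->getRawSource(), MTI->getSourceAlignment(),
                 MTI->getLength(),
                 MTI->isVolatile());
}
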
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
index f3d4f2ef38d7..05cd48d83267 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -1,4 +1,4 @@
-//===------ SimplifyInstructions.cpp - Remove redundant instructions ------===//
+//===- InstSimplifyPass.cpp -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,15 +6,8 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This is a utility pass used for testing the InstructionSimplify analysis.
-// The analysis is applied to every instruction, and if it simplifies then the
-// instruction is replaced by the simplification. If you are looking for a pass
-// that performs serious instruction folding, use the instcombine pass instead.
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/SimplifyInstructions.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
@@ -27,7 +20,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -84,58 +77,57 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ,
}
namespace {
- struct InstSimplifier : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- InstSimplifier() : FunctionPass(ID) {
- initializeInstSimplifierPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
-
- /// runOnFunction - Remove instructions that simplify.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- const DominatorTree *DT =
- &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- OptimizationRemarkEmitter *ORE =
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- const DataLayout &DL = F.getParent()->getDataLayout();
- const SimplifyQuery SQ(DL, TLI, DT, AC);
- return runImpl(F, SQ, ORE);
- }
- };
-}
-
-char InstSimplifier::ID = 0;
-INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
+struct InstSimplifyLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstSimplifyLegacyPass() : FunctionPass(ID) {
+ initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ /// runOnFunction - Remove instructions that simplify.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, TLI, DT, AC);
+ return runImpl(F, SQ, ORE);
+ }
+};
+} // namespace
+
+char InstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
"Remove redundant instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
+INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
"Remove redundant instructions", false, false)
-char &llvm::InstructionSimplifierID = InstSimplifier::ID;
// Public interface to the simplify instructions pass.
-FunctionPass *llvm::createInstructionSimplifierPass() {
- return new InstSimplifier();
+FunctionPass *llvm::createInstSimplifyLegacyPass() {
+ return new InstSimplifyLegacyPass();
}
-PreservedAnalyses InstSimplifierPass::run(Function &F,
- FunctionAnalysisManager &AM) {
+PreservedAnalyses InstSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
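
The tail of the InstSimplify rename above shows the new-PM side gathering the same analyses the legacy pass requested, but from the FunctionAnalysisManager. A hedged skeleton of that pattern in isolation; SketchPass is an invented name and the body performs no transformation:

// Hypothetical new-PM function pass mirroring the InstSimplifyPass::run shape.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

struct SketchPass : PassInfoMixin<SketchPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
    auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
    auto &AC = AM.getResult<AssumptionAnalysis>(F);
    (void)DT; (void)TLI; (void)AC;   // the analyses would drive the rewrite
    bool Changed = false;            // nothing is transformed in this sketch
    if (!Changed)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();   // a pure simplifier leaves the CFG alone
    return PA;
  }
};
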
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 2f1645433fb8..1d66472f93c8 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -30,6 +30,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -64,7 +65,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -131,10 +131,11 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (PrintLVIAfterJumpThreading)
- AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
@@ -148,6 +149,7 @@ char JumpThreading::ID = 0;
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
@@ -164,7 +166,7 @@ JumpThreadingPass::JumpThreadingPass(int T) {
}
// Update branch probability information according to conditional
-// branch probablity. This is usually made possible for cloned branches
+// branch probability. This is usually made possible for cloned branches
// in inline instances by the context specific profile in the caller.
// For instance,
//
@@ -278,8 +280,12 @@ bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+ // DT if it's available.
+ auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DeferredDominance DDT(*DT);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.hasProfileData();
@@ -289,12 +295,11 @@ bool JumpThreading::runOnFunction(Function &F) {
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
- std::move(BPI));
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DDT, HasProfileData,
+ std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
- LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- dbgs());
+ LVI->printLVI(F, *DT, dbgs());
}
return Changed;
}
@@ -302,8 +307,12 @@ bool JumpThreading::runOnFunction(Function &F) {
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+ // DT if it's available.
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
+ DeferredDominance DDT(DT);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
@@ -313,25 +322,28 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
- std::move(BPI));
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, &DDT, HasProfileData,
+ std::move(BFI), std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LazyValueAnalysis>();
return PA;
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
LazyValueInfo *LVI_, AliasAnalysis *AA_,
- bool HasProfileData_,
+ DeferredDominance *DDT_, bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
- DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
AA = AA_;
+ DDT = DDT_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
@@ -345,69 +357,66 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
BFI = std::move(BFI_);
}
- // Remove unreachable blocks from function as they may result in infinite
- // loop. We do threading if we found something profitable. Jump threading a
- // branch can create other opportunities. If these opportunities form a cycle
- // i.e. if any jump threading is undoing previous threading in the path, then
- // we will loop forever. We take care of this issue by not jump threading for
- // back edges. This works for normal cases but not for unreachable blocks as
- // they may have cycle with no back edge.
- bool EverChanged = false;
- EverChanged |= removeUnreachableBlocks(F, LVI);
+ // JumpThreading must not process blocks unreachable from entry. It's a
+ // waste of compute time and can potentially lead to hangs.
+ SmallPtrSet<BasicBlock *, 16> Unreachable;
+ DominatorTree &DT = DDT->flush();
+ for (auto &BB : F)
+ if (!DT.isReachableFromEntry(&BB))
+ Unreachable.insert(&BB);
FindLoopHeaders(F);
+ bool EverChanged = false;
bool Changed;
do {
Changed = false;
- for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
- BasicBlock *BB = &*I;
- // Thread all of the branches we can over this block.
- while (ProcessBlock(BB))
+ for (auto &BB : F) {
+ if (Unreachable.count(&BB))
+ continue;
+ while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
Changed = true;
+ // Stop processing BB if it's the entry or is now deleted. The following
+ // routines attempt to eliminate BB, and locating a suitable replacement
+ // for the entry is non-trivial.
+ if (&BB == &F.getEntryBlock() || DDT->pendingDeletedBB(&BB))
+ continue;
- ++I;
-
- // If the block is trivially dead, zap it. This eliminates the successor
- // edges which simplifies the CFG.
- if (pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock()) {
- DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
- << "' with terminator: " << *BB->getTerminator() << '\n');
- LoopHeaders.erase(BB);
- LVI->eraseBlock(BB);
- DeleteDeadBlock(BB);
+ if (pred_empty(&BB)) {
+ // When ProcessBlock makes BB unreachable it doesn't bother to fix up
+ // the instructions in it. We must remove BB to prevent invalid IR.
+ LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
+ << "' with terminator: " << *BB.getTerminator()
+ << '\n');
+ LoopHeaders.erase(&BB);
+ LVI->eraseBlock(&BB);
+ DeleteDeadBlock(&BB, DDT);
Changed = true;
continue;
}
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
-
- // Can't thread an unconditional jump, but if the block is "almost
- // empty", we can replace uses of it with uses of the successor and make
- // this dead.
- // We should not eliminate the loop header or latch either, because
- // eliminating a loop header or latch might later prevent LoopSimplify
- // from transforming nested loops into simplified form. We will rely on
- // later passes in backend to clean up empty blocks.
+ // ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
+ // is "almost empty", we attempt to merge BB with its sole successor.
+ auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
if (BI && BI->isUnconditional() &&
- BB != &BB->getParent()->getEntryBlock() &&
- // If the terminator is the only non-phi instruction, try to nuke it.
- BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) &&
- !LoopHeaders.count(BI->getSuccessor(0))) {
- // FIXME: It is always conservatively correct to drop the info
- // for a block even if it doesn't get erased. This isn't totally
- // awesome, but it allows us to use AssertingVH to prevent nasty
- // dangling pointer issues within LazyValueInfo.
- LVI->eraseBlock(BB);
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
- Changed = true;
+ // The terminator must be the only non-phi instruction in BB.
+ BB.getFirstNonPHIOrDbg()->isTerminator() &&
+ // Don't alter Loop headers and latches to ensure another pass can
+ // detect and transform nested loops later.
+ !LoopHeaders.count(&BB) && !LoopHeaders.count(BI->getSuccessor(0)) &&
+ TryToSimplifyUncondBranchFromEmptyBlock(&BB, DDT)) {
+ // BB is valid for cleanup here because we passed in DDT. F remains
+ // BB's parent until a DDT->flush() event.
+ LVI->eraseBlock(&BB);
+ Changed = true;
}
}
EverChanged |= Changed;
} while (Changed);
LoopHeaders.clear();
+ DDT->flush();
+ LVI->enableDT();
return EverChanged;
}
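With the deferred tree in hand, runImpl no longer calls removeUnreachableBlocks; it flushes the pending updates and simply skips every block the dominator tree reports as unreachable from the entry. The self-contained sketch below shows that reachability query as a plain worklist walk over an adjacency-list CFG; it is illustrative only, not the DominatorTree::isReachableFromEntry implementation.

#include <vector>

// Mark which nodes of a CFG (adjacency lists, node 'Entry' is the entry
// block) can be reached from the entry. Unreachable nodes get skipped.
std::vector<bool> reachableFromEntry(const std::vector<std::vector<int>> &Succs,
                                     int Entry) {
  std::vector<bool> Seen(Succs.size(), false);
  std::vector<int> Worklist{Entry};
  Seen[Entry] = true;
  while (!Worklist.empty()) {
    int BB = Worklist.back();
    Worklist.pop_back();
    for (int S : Succs[BB])
      if (!Seen[S]) {
        Seen[S] = true;
        Worklist.push_back(S);
      }
  }
  return Seen;
}

int main() {
  // 0 -> 1 -> 2, and an unreachable node 3 that also branches to 2.
  std::vector<std::vector<int>> Succs = {{1}, {2}, {}, {2}};
  return reachableFromEntry(Succs, 0)[3] ? 1 : 0; // 0: node 3 is unreachable
}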
@@ -600,6 +609,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
// Perhaps getConstantOnEdge should be smart enough to do this?
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
@@ -613,6 +626,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
/// If I is a PHI node, then we know the incoming values for any constants.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *InVal = PN->getIncomingValue(i);
if (Constant *KC = getKnownConstant(InVal, Preference)) {
@@ -630,11 +647,9 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
}
// Handle Cast instructions. Only see through Cast when the source operand is
- // PHI or Cmp and the source type is i1 to save the compilation time.
+ // PHI or Cmp to save the compilation time.
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Source = CI->getOperand(0);
- if (!Source->getType()->isIntegerTy(1))
- return false;
if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
return false;
ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
@@ -738,20 +753,36 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
CmpInst::Predicate Pred = Cmp->getPredicate();
PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+ if (!PN)
+ PN = dyn_cast<PHINode>(CmpRHS);
if (PN && PN->getParent() == BB) {
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
- Value *LHS = PN->getIncomingValue(i);
- Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
-
+ Value *LHS, *RHS;
+ if (PN == CmpLHS) {
+ LHS = PN->getIncomingValue(i);
+ RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+ } else {
+ LHS = CmpLHS->DoPHITranslation(BB, PredBB);
+ RHS = PN->getIncomingValue(i);
+ }
Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
if (!Res) {
if (!isa<Constant>(RHS))
continue;
+ // getPredicateOnEdge call will make no sense if LHS is defined in BB.
+ auto LHSInst = dyn_cast<Instruction>(LHS);
+ if (LHSInst && LHSInst->getParent() == BB)
+ continue;
+
LazyValueInfo::Tristate
ResT = LVI->getPredicateOnEdge(Pred, LHS,
cast<Constant>(RHS), PredBB, BB,
@@ -775,6 +806,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
if (!isa<Instruction>(CmpLHS) ||
cast<Instruction>(CmpLHS)->getParent() != BB) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
@@ -803,6 +838,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
if (!isa<Instruction>(AddLHS) ||
cast<Instruction>(AddLHS)->getParent() != BB) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a ConstantRange in
// a predecessor, use that information to try to thread this
@@ -884,6 +923,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
}
// If all else fails, see if LVI can figure out a constant value for us.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
Constant *CI = LVI->getConstant(V, BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
@@ -903,10 +946,10 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
unsigned MinSucc = 0;
BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
// Compute the successor with the minimum number of predecessors.
- unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ unsigned MinNumPreds = pred_size(TestBB);
for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
TestBB = BBTerm->getSuccessor(i);
- unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ unsigned NumPreds = pred_size(TestBB);
if (NumPreds < MinNumPreds) {
MinSucc = i;
MinNumPreds = NumPreds;
@@ -931,8 +974,8 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
- if (pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock())
+ if (DDT->pendingDeletedBB(BB) ||
+ (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
return false;
// If this block has a single predecessor, and if that pred has a single
@@ -948,7 +991,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
LoopHeaders.insert(BB);
LVI->eraseBlock(SinglePred);
- MergeBasicBlockIntoOnlyPred(BB);
+ MergeBasicBlockIntoOnlyPred(BB, nullptr, DDT);
// Now that BB is merged into SinglePred (i.e. SinglePred Code followed by
// BB code within one basic block `BB`), we need to invalidate the LVI
@@ -977,9 +1020,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// Invalidate LVI information for BB if the LVI is not provably true for
// all of BB.
- if (any_of(*BB, [](Instruction &I) {
- return !isGuaranteedToTransferExecutionToSuccessor(&I);
- }))
+ if (!isGuaranteedToTransferExecutionToSuccessor(BB))
LVI->eraseBlock(BB);
return true;
}
@@ -1031,18 +1072,23 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// successors to branch to. Let GetBestDestForJumpOnUndef decide.
if (isa<UndefValue>(Condition)) {
unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+ std::vector<DominatorTree::UpdateType> Updates;
// Fold the branch/switch.
TerminatorInst *BBTerm = BB->getTerminator();
+ Updates.reserve(BBTerm->getNumSuccessors());
for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
if (i == BestSucc) continue;
- BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+ BasicBlock *Succ = BBTerm->getSuccessor(i);
+ Succ->removePredecessor(BB, true);
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
}
- DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding undef terminator: " << *BBTerm << '\n');
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding undef terminator: " << *BBTerm << '\n');
BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
BBTerm->eraseFromParent();
+ DDT->applyUpdates(Updates);
return true;
}
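The undef-terminator folding above keeps only the "best" successor and, for every edge it drops, queues a DominatorTree::Delete update alongside the removePredecessor call. A standalone sketch of that selection-and-bookkeeping shape (toy types, no LLVM APIs):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Keep the successor with the fewest predecessors (as GetBestDestForJumpOnUndef
// does) and record a deleted edge for every other successor.
struct ToyBlock { int Id; int NumPreds; };

int main() {
  std::vector<ToyBlock> Succs = {{1, 3}, {2, 1}, {3, 2}};
  auto Best = std::min_element(Succs.begin(), Succs.end(),
                               [](const ToyBlock &A, const ToyBlock &B) {
                                 return A.NumPreds < B.NumPreds;
                               });
  std::vector<std::pair<int, int>> DeletedEdges; // queued (From, To) updates
  for (const ToyBlock &S : Succs)
    if (S.Id != Best->Id)
      DeletedEdges.push_back({0, S.Id}); // edge from block 0 to S goes away
  std::printf("kept successor %d, queued %zu edge deletions\n", Best->Id,
              DeletedEdges.size());
  return 0;
}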
@@ -1050,10 +1096,11 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// terminator to an unconditional branch. This can occur due to threading in
// other blocks.
if (getKnownConstant(Condition, Preference)) {
- DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding terminator: " << *BB->getTerminator() << '\n');
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding terminator: " << *BB->getTerminator()
+ << '\n');
++NumFolds;
- ConstantFoldTerminator(BB, true);
+ ConstantFoldTerminator(BB, true, nullptr, DDT);
return true;
}
@@ -1080,13 +1127,18 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// threading is concerned.
assert(CondBr->isConditional() && "Threading on unconditional terminator");
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
if (Ret != LazyValueInfo::Unknown) {
unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
- CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+ BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
+ ToRemoveSucc->removePredecessor(BB, true);
BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
CondBr->eraseFromParent();
if (CondCmp->use_empty())
@@ -1104,6 +1156,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
ConstantInt::getFalse(CondCmp->getType());
ReplaceFoldableUses(CondCmp, CI);
}
+ DDT->deleteEdge(BB, ToRemoveSucc);
return true;
}
@@ -1125,8 +1178,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: There are other places where load PRE would be profitable, such as
// more complex comparisons.
- if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
- if (SimplifyPartiallyRedundantLoad(LI))
+ if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
+ if (SimplifyPartiallyRedundantLoad(LoadI))
return true;
// Before threading, try to propagate profile data backwards:
@@ -1182,9 +1235,12 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
Optional<bool> Implication =
isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
if (Implication) {
- BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB);
- BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI);
+ BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
+ BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
+ RemoveSucc->removePredecessor(BB);
+ BranchInst::Create(KeepSucc, BI);
BI->eraseFromParent();
+ DDT->deleteEdge(BB, RemoveSucc);
return true;
}
CurrentBB = CurrentPred;
@@ -1202,17 +1258,17 @@ static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
return false;
}
-/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
-/// load instruction, eliminate it by replacing it with a PHI node. This is an
-/// important optimization that encourages jump threading, and needs to be run
-/// interlaced with other jump threading tasks.
-bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+/// SimplifyPartiallyRedundantLoad - If LoadI is an obviously partially
+/// redundant load instruction, eliminate it by replacing it with a PHI node.
+/// This is an important optimization that encourages jump threading, and needs
+/// to be run interlaced with other jump threading tasks.
+bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// Don't hack volatile and ordered loads.
- if (!LI->isUnordered()) return false;
+ if (!LoadI->isUnordered()) return false;
// If the load is defined in a block with exactly one predecessor, it can't be
// partially redundant.
- BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *LoadBB = LoadI->getParent();
if (LoadBB->getSinglePredecessor())
return false;
@@ -1222,7 +1278,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (LoadBB->isEHPad())
return false;
- Value *LoadedPtr = LI->getOperand(0);
+ Value *LoadedPtr = LoadI->getOperand(0);
// If the loaded operand is defined in the LoadBB and its not a phi,
// it can't be available in predecessors.
@@ -1231,26 +1287,27 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
- BasicBlock::iterator BBIt(LI);
+ BasicBlock::iterator BBIt(LoadI);
bool IsLoadCSE;
if (Value *AvailableVal = FindAvailableLoadedValue(
- LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
if (IsLoadCSE) {
- LoadInst *NLI = cast<LoadInst>(AvailableVal);
- combineMetadataForCSE(NLI, LI);
+ LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLoadI, LoadI);
};
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
- if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
- if (AvailableVal->getType() != LI->getType())
- AvailableVal =
- CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
- LI->replaceAllUsesWith(AvailableVal);
- LI->eraseFromParent();
+ if (AvailableVal == LoadI)
+ AvailableVal = UndefValue::get(LoadI->getType());
+ if (AvailableVal->getType() != LoadI->getType())
+ AvailableVal = CastInst::CreateBitOrPointerCast(
+ AvailableVal, LoadI->getType(), "", LoadI);
+ LoadI->replaceAllUsesWith(AvailableVal);
+ LoadI->eraseFromParent();
return true;
}
@@ -1263,7 +1320,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If all of the loads and stores that feed the value have the same AA tags,
// then we can propagate them onto any newly inserted loads.
AAMDNodes AATags;
- LI->getAAMetadata(AATags);
+ LoadI->getAAMetadata(AATags);
SmallPtrSet<BasicBlock*, 8> PredsScanned;
@@ -1285,16 +1342,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *PredAvailable = nullptr;
// NOTE: We don't CSE load that is volatile or anything stronger than
// unordered, that should have been checked when we entered the function.
- assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ assert(LoadI->isUnordered() &&
+ "Attempting to CSE volatile or atomic loads");
// If this is a load on a phi pointer, phi-translate it and search
// for available load/store to the pointer in predecessors.
Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
- AA, &IsLoadCSE, &NumScanedInst);
+ Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
+ DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
// If PredBB has a single predecessor, continue scanning through the
- // single precessor.
+ // single predecessor.
BasicBlock *SinglePredBB = PredBB;
while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
NumScanedInst < DefMaxInstsToScan) {
@@ -1302,7 +1360,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (SinglePredBB) {
BBIt = SinglePredBB->end();
PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
(DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
&NumScanedInst);
}
@@ -1334,15 +1392,15 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If the value is unavailable in one of predecessors, we will end up
// inserting a new instruction into them. It is only valid if all the
- // instructions before LI are guaranteed to pass execution to its successor,
- // or if LI is safe to speculate.
+ // instructions before LoadI are guaranteed to pass execution to its
+ // successor, or if LoadI is safe to speculate.
// TODO: If this logic becomes more complex, and we will perform PRE insertion
// farther than to a predecessor, we need to reuse the code from GVN's PRE.
// It requires domination tree analysis, so for this simple case it is an
// overkill.
if (PredsScanned.size() != AvailablePreds.size() &&
- !isSafeToSpeculativelyExecute(LI))
- for (auto I = LoadBB->begin(); &*I != LI; ++I)
+ !isSafeToSpeculativelyExecute(LoadI))
+ for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
return false;
@@ -1381,11 +1439,12 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal = new LoadInst(
- LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
- LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
- LI->getSyncScopeID(), UnavailablePred->getTerminator());
- NewVal->setDebugLoc(LI->getDebugLoc());
+ LoadInst *NewVal =
+ new LoadInst(LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LoadI->getName() + ".pr", false, LoadI->getAlignment(),
+ LoadI->getOrdering(), LoadI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LoadI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
@@ -1398,10 +1457,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Create a PHI node at the start of the block for the PRE'd load value.
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
- PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+ PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
&LoadBB->front());
- PN->takeName(LI);
- PN->setDebugLoc(LI->getDebugLoc());
+ PN->takeName(LoadI);
+ PN->setDebugLoc(LoadI->getDebugLoc());
// Insert new entries into the PHI for each predecessor. A single block may
// have multiple entries here.
@@ -1419,19 +1478,19 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// AvailablePreds vector as we go so that all of the PHI entries for this
// predecessor use the same bitcast.
Value *&PredV = I->second;
- if (PredV->getType() != LI->getType())
- PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ if (PredV->getType() != LoadI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
P->getTerminator());
PN->addIncoming(PredV, I->first);
}
- for (LoadInst *PredLI : CSELoads) {
- combineMetadataForCSE(PredLI, LI);
+ for (LoadInst *PredLoadI : CSELoads) {
+ combineMetadataForCSE(PredLoadI, LoadI);
}
- LI->replaceAllUsesWith(PN);
- LI->eraseFromParent();
+ LoadI->replaceAllUsesWith(PN);
+ LoadI->eraseFromParent();
return true;
}
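Renaming aside, SimplifyPartiallyRedundantLoad still performs the same partial redundancy elimination: if the loaded value is already available in some predecessors, the load is inserted only into the predecessors where it is missing and the copies are merged with a PHI. At the source level the effect is roughly the following; this is a conceptual illustration, not compiler output.

// Before: the load of *P at the join point is partially redundant, because
// the path through the 'if' already knows the value it stored.
int loadBefore(bool C, int *P) {
  if (C)
    *P = 1;      // value of *P is available on this path
  return *P;     // reloads *P on both paths
}

// After PRE: the load is placed only on the path where the value was not
// available, and the local variable V plays the role of the merging PHI.
int loadAfter(bool C, int *P) {
  int V;
  if (C) {
    *P = 1;
    V = 1;       // reuse the value that was just stored
  } else {
    V = *P;      // load inserted into the predecessor that lacked the value
  }
  return V;      // no load left at the join point
}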
@@ -1516,12 +1575,12 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
assert(!PredValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
- DEBUG(dbgs() << "IN BB: " << *BB;
- for (const auto &PredValue : PredValues) {
- dbgs() << " BB '" << BB->getName() << "': FOUND condition = "
- << *PredValue.first
- << " for pred '" << PredValue.second->getName() << "'.\n";
- });
+ LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
+ for (const auto &PredValue : PredValues) {
+ dbgs() << " BB '" << BB->getName()
+ << "': FOUND condition = " << *PredValue.first
+ << " for pred '" << PredValue.second->getName() << "'.\n";
+ });
// Decide what we want to thread through. Convert our list of known values to
// a list of known destinations for each pred. This also discards duplicate
@@ -1591,20 +1650,24 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
// not thread. By doing so, we do not need to duplicate the current block and
// also miss potential opportunities in case we don't/can't duplicate.
if (OnlyDest && OnlyDest != MultipleDestSentinel) {
- if (PredWithKnownDest ==
- (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+ if (PredWithKnownDest == (size_t)pred_size(BB)) {
bool SeenFirstBranchToOnlyDest = false;
+ std::vector <DominatorTree::UpdateType> Updates;
+ Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
for (BasicBlock *SuccBB : successors(BB)) {
- if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest)
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
- else
+ } else {
SuccBB->removePredecessor(BB, true); // This is unreachable successor.
+ Updates.push_back({DominatorTree::Delete, BB, SuccBB});
+ }
}
// Finally update the terminator.
TerminatorInst *Term = BB->getTerminator();
BranchInst::Create(OnlyDest, Term);
Term->eraseFromParent();
+ DDT->applyUpdates(Updates);
// If the condition is now dead due to the removal of the old terminator,
// erase it.
@@ -1839,15 +1902,15 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
- DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
- << "' - would thread to self!\n");
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
return false;
}
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
- DEBUG({
+ LLVM_DEBUG({
bool BBIsHeader = LoopHeaders.count(BB);
bool SuccIsHeader = LoopHeaders.count(SuccBB);
dbgs() << " Not threading across "
@@ -1861,8 +1924,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
unsigned JumpThreadCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
- DEBUG(dbgs() << " Not threading BB '" << BB->getName()
- << "' - Cost is too high: " << JumpThreadCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
return false;
}
@@ -1871,17 +1934,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
- DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
// And finally, do it!
- DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '"
- << SuccBB->getName() << "' with cost: " << JumpThreadCost
- << ", across block:\n "
- << *BB << "\n");
-
+ LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
+ << "' to '" << SuccBB->getName()
+ << "' with cost: " << JumpThreadCost
+ << ", across block:\n " << *BB << "\n");
+
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LVI->threadEdge(PredBB, BB, SuccBB);
// We are going to have to map operands from the original BB block to the new
@@ -1931,15 +1998,32 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// PHI nodes for NewBB now.
AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+ // Update the terminator of PredBB to jump to NewBB instead of BB. This
+ // eliminates predecessors from BB, which requires us to simplify any PHI
+ // nodes in BB.
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // Enqueue required DT updates.
+ DDT->applyUpdates({{DominatorTree::Insert, NewBB, SuccBB},
+ {DominatorTree::Insert, PredBB, NewBB},
+ {DominatorTree::Delete, PredBB, BB}});
+
// If there were values defined in BB that are used outside the block, then we
// now have to update all uses of the value to use either the original value,
// the cloned value, or some PHI derived value. This can require arbitrary
// PHI insertion, of which we are prepared to do, clean these up now.
SSAUpdater SSAUpdate;
SmallVector<Use*, 16> UsesToRename;
+
for (Instruction &I : *BB) {
- // Scan all uses of this instruction to see if it is used outside of its
- // block, and if so, record them in UsesToRename.
+ // Scan all uses of this instruction to see if they are no longer
+ // dominated by the previous def and, if so, record them in UsesToRename.
+ // Also, skip phi operands from PredBB - we'll remove them anyway.
for (Use &U : I.uses()) {
Instruction *User = cast<Instruction>(U.getUser());
if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
@@ -1954,8 +2038,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// If there are no uses outside the block, we're done with this instruction.
if (UsesToRename.empty())
continue;
-
- DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
@@ -1966,19 +2049,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
- // Ok, NewBB is good to go. Update the terminator of PredBB to jump to
- // NewBB instead of BB. This eliminates predecessors from BB, which requires
- // us to simplify any PHI nodes in BB.
- TerminatorInst *PredTerm = PredBB->getTerminator();
- for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
- if (PredTerm->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB, true);
- PredTerm->setSuccessor(i, NewBB);
- }
-
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
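The renaming loop above relies on the usual SSAUpdater idiom: register the original and the cloned definition as the available values in their respective blocks, then rewrite each recorded use, letting the updater insert PHI nodes at join points. The helper below is a sketch of that idiom under the assumption that it matches the surrounding (unchanged) code; the function name and signature are illustrative, not part of LLVM.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

using namespace llvm;

// I was defined in BB and cloned into NewBB as NewI; uses outside BB must be
// rewired to whichever definition (or PHI of both) reaches them.
static void renameNonLocalUses(Instruction &I, BasicBlock *BB,
                               Instruction *NewI, BasicBlock *NewBB,
                               SmallVectorImpl<Use *> &UsesToRename) {
  SSAUpdater SSAUpdate;
  SSAUpdate.Initialize(I.getType(), I.getName());
  SSAUpdate.AddAvailableValue(BB, &I);        // original definition
  SSAUpdate.AddAvailableValue(NewBB, NewI);   // cloned definition
  while (!UsesToRename.empty())
    SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); // may create PHIs
}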
@@ -1998,20 +2071,42 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix) {
+ SmallVector<BasicBlock *, 2> NewBBs;
+
// Collect the frequencies of all predecessors of BB, which will be used to
- // update the edge weight on BB->SuccBB.
- BlockFrequency PredBBFreq(0);
+ // update the edge weight of the result of splitting predecessors.
+ DenseMap<BasicBlock *, BlockFrequency> FreqMap;
if (HasProfileData)
for (auto Pred : Preds)
- PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+ FreqMap.insert(std::make_pair(
+ Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
+
+ // In the case when BB is a LandingPad block we create 2 new predecessors
+ // instead of just one.
+ if (BB->isLandingPad()) {
+ std::string NewName = std::string(Suffix) + ".split-lp";
+ SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
+ } else {
+ NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
+ }
- BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * Preds.size()) + NewBBs.size());
+ for (auto NewBB : NewBBs) {
+ BlockFrequency NewBBFreq(0);
+ Updates.push_back({DominatorTree::Insert, NewBB, BB});
+ for (auto Pred : predecessors(NewBB)) {
+ Updates.push_back({DominatorTree::Delete, Pred, BB});
+ Updates.push_back({DominatorTree::Insert, Pred, NewBB});
+ if (HasProfileData) // Update frequencies between Pred -> NewBB.
+ NewBBFreq += FreqMap.lookup(Pred);
+ }
+ if (HasProfileData) // Apply the summed frequency to NewBB.
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
- // Set the block frequency of the newly created PredBB, which is the sum of
- // frequencies of Preds.
- if (HasProfileData)
- BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
- return PredBB;
+ DDT->applyUpdates(Updates);
+ return NewBBs[0];
}
bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
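The rewritten SplitBlockPreds records one edge frequency per original predecessor up front and then, for each newly created block (one, or two when BB is a landing pad), sums the frequencies of the predecessors routed into it. Below is a small standalone sketch of that bookkeeping with made-up numbers; the types and values are illustrative, not the BlockFrequencyInfo API.

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Edge frequency of each original predecessor into BB.
  std::map<std::string, double> FreqMap = {
      {"pred1", 10.0}, {"pred2", 5.0}, {"pred3", 2.5}};
  // Each new split block and the predecessors that now feed it.
  std::vector<std::pair<std::string, std::vector<std::string>>> NewBBs = {
      {"bb.thr_comm", {"pred1", "pred2"}},        // regular split block
      {"bb.thr_comm.split-lp", {"pred3"}}};       // landing-pad split block
  for (auto &NewBB : NewBBs) {
    double NewBBFreq = 0.0;
    for (auto &Pred : NewBB.second)
      NewBBFreq += FreqMap[Pred];                 // sum the rerouted edges
    std::printf("%s gets frequency %.1f\n", NewBB.first.c_str(), NewBBFreq);
  }
  return 0;
}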
@@ -2140,42 +2235,49 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// cause us to transform this into an irreducible loop, don't do this.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB)) {
- DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
- << "' into predecessor block '" << PredBBs[0]->getName()
- << "' - it might create an irreducible loop!\n");
+ LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
return false;
}
unsigned DuplicationCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
- DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
- << "' - Cost is too high: " << DuplicationCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
return false;
}
// And finally, do it! Start by factoring the predecessors if needed.
+ std::vector<DominatorTree::UpdateType> Updates;
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
- DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
// Okay, we decided to do this! Clone all the instructions in BB onto the end
// of PredBB.
- DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '"
- << PredBB->getName() << "' to eliminate branch on phi. Cost: "
- << DuplicationCost << " block is:" << *BB << "\n");
+ LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
+ << "' into end of '" << PredBB->getName()
+ << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
// Unless PredBB ends with an unconditional branch, split the edge so that we
// can just clone the bits from BB into the end of the new PredBB.
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
- PredBB = SplitEdge(PredBB, BB);
+ BasicBlock *OldPredBB = PredBB;
+ PredBB = SplitEdge(OldPredBB, BB);
+ Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
+ Updates.push_back({DominatorTree::Insert, PredBB, BB});
+ Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
}
@@ -2217,6 +2319,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Otherwise, insert the new instruction into the block.
New->setName(BI->getName());
PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ // Update Dominance from simplified New instruction operands.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
+ Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
}
}
@@ -2252,7 +2358,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
if (UsesToRename.empty())
continue;
- DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
@@ -2263,7 +2369,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
// PredBB no longer jumps to BB, remove entries in the PHI node for the edge
@@ -2272,6 +2378,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Remove the unconditional branch at the end of the PredBB block.
OldPredBranch->eraseFromParent();
+ DDT->applyUpdates(Updates);
++NumDupes;
return true;
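For context on what DuplicateCondBranchOnPHIIntoPred buys: by copying the compare and conditional branch into a predecessor, the condition is evaluated on the value the PHI would have received from that predecessor, where it frequently folds to a constant. A rough source-level picture (conceptual illustration only):

// Before duplication: every predecessor funnels through the block whose
// branch depends on a PHI of predecessor-specific values.
int branchBefore(bool FromLeft) {
  int P = FromLeft ? 0 : 1;   // stands in for "P = phi [0, left], [1, right]"
  if (P == 0)                 // branch on the PHI
    return 10;
  return 20;
}

// After duplicating the compare and branch into each predecessor, the
// condition is constant there and the branch folds away.
int branchAfter(bool FromLeft) {
  if (FromLeft)
    return 10;                // "0 == 0" folded in the left predecessor
  return 20;                  // "1 == 0" folded in the right predecessor
}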
@@ -2314,6 +2421,10 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// Now check if one of the select values would allow us to constant fold the
// terminator in BB. We don't do the transform if both sides fold, those
// cases will be threaded in any case.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LazyValueInfo::Tristate LHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
CondRHS, Pred, BB, CondCmp);
@@ -2344,6 +2455,8 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// The select is now dead.
SI->eraseFromParent();
+ DDT->applyUpdates({{DominatorTree::Insert, NewBB, BB},
+ {DominatorTree::Insert, Pred, NewBB}});
// Update any other PHI nodes in BB.
for (BasicBlock::iterator BI = BB->begin();
PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
@@ -2409,7 +2522,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
break;
}
} else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
- // Look for a Select in BB that uses PN as condtion.
+ // Look for a Select in BB that uses PN as condition.
if (isUnfoldCandidate(SelectI, U.get())) {
SI = SelectI;
break;
@@ -2422,11 +2535,25 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
// Expand the select.
TerminatorInst *Term =
SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ BasicBlock *SplitBB = SI->getParent();
+ BasicBlock *NewBB = Term->getParent();
PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
NewPN->addIncoming(SI->getFalseValue(), BB);
SI->replaceAllUsesWith(NewPN);
SI->eraseFromParent();
+ // NewBB and SplitBB are newly created blocks which require insertion.
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
+ Updates.push_back({DominatorTree::Insert, BB, SplitBB});
+ Updates.push_back({DominatorTree::Insert, BB, NewBB});
+ Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
+ // BB's successors were moved to SplitBB, update DDT accordingly.
+ for (auto *Succ : successors(SplitBB)) {
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
+ }
+ DDT->applyUpdates(Updates);
return true;
}
return false;
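TryToUnfoldSelectInCurrBB, as updated above, turns a select fed by a PHI into explicit control flow: SplitBlockAndInsertIfThen introduces a guarded block, the select becomes a PHI over its two operands, and the new blocks and rerouted successor edges are queued on the deferred dominator updater. The source-level shape of that unfolding is roughly as follows (conceptual illustration, not generated code):

// Before: a select keeps both values on one straight-line path.
int selBefore(bool C, int A, int B) {
  return C ? A : B;           // select i1 C, A, B
}

// After unfolding: the condition becomes a real branch and the former select
// is a join-point "PHI" of the two incoming values, which jump threading can
// then thread through when a predecessor already knows C.
int selAfter(bool C, int A, int B) {
  int Result = B;             // value on the fall-through path
  if (C)
    Result = A;               // value from the new "then" block
  return Result;              // PHI of the two definitions
}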
@@ -2513,8 +2640,8 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
if (!TrueDestIsSafe && !FalseDestIsSafe)
return false;
- BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
- BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
ValueToValueMapTy UnguardedMapping, GuardedMapping;
Instruction *AfterGuard = Guard->getNextNode();
@@ -2523,18 +2650,29 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
return false;
// Duplicate all instructions before the guard and the guard itself to the
// branch where implication is not proved.
- GuardedBlock = DuplicateInstructionsInSplitBetween(
- BB, GuardedBlock, AfterGuard, GuardedMapping);
+ BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredGuardedBlock, AfterGuard, GuardedMapping);
assert(GuardedBlock && "Could not create the guarded block?");
// Duplicate all instructions before the guard in the unguarded branch.
// Since we have successfully duplicated the guarded block and this block
// has fewer instructions, we expect it to succeed.
- UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
- Guard, UnguardedMapping);
+ BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredUnguardedBlock, Guard, UnguardedMapping);
assert(UnguardedBlock && "Could not create the unguarded block?");
- DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
- << GuardedBlock->getName() << "\n");
-
+ LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+ // DuplicateInstructionsInSplitBetween inserts a new block "BB.split" between
+ // PredBB and BB. We need to perform two inserts and one delete for each of
+ // the above calls to update Dominators.
+ DDT->applyUpdates(
+ {// Guarded block split.
+ {DominatorTree::Delete, PredGuardedBlock, BB},
+ {DominatorTree::Insert, PredGuardedBlock, GuardedBlock},
+ {DominatorTree::Insert, GuardedBlock, BB},
+ // Unguarded block split.
+ {DominatorTree::Delete, PredUnguardedBlock, BB},
+ {DominatorTree::Insert, PredUnguardedBlock, UnguardedBlock},
+ {DominatorTree::Insert, UnguardedBlock, BB}});
// Some instructions before the guard may still have uses. For them, we need
// to create Phi nodes merging their copies in both guarded and unguarded
// branches. Those instructions that have no uses can be just removed.
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index 946474fef062..ff66632f0391 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -47,6 +47,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -64,7 +65,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@@ -170,7 +170,8 @@ struct LegacyLICMPass : public LoopPass {
/// loop preheaders be inserted into the CFG...
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
if (EnableMSSALoopDependency)
AU.addRequired<MemorySSAWrapperPass>();
@@ -220,7 +221,10 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
+
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+
return PA;
}
@@ -392,7 +396,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// If the instruction is dead, we would try to sink it because it isn't
// used in the loop, instead, just delete it.
if (isInstructionTriviallyDead(&I, TLI)) {
- DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ salvageDebugInfo(I);
++II;
CurAST->deleteValue(&I);
I.eraseFromParent();
@@ -445,101 +450,78 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
BasicBlock *BB = DTN->getBlock();
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
- if (!inSubLoop(BB, CurLoop, LI))
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
- Instruction &I = *II++;
- // Try constant folding this instruction. If all the operands are
- // constants, it is technically hoistable, but it would be better to
- // just fold it.
- if (Constant *C = ConstantFoldInstruction(
- &I, I.getModule()->getDataLayout(), TLI)) {
- DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
- CurAST->copyValue(&I, C);
- I.replaceAllUsesWith(C);
- if (isInstructionTriviallyDead(&I, TLI)) {
- CurAST->deleteValue(&I);
- I.eraseFromParent();
- }
- Changed = true;
- continue;
- }
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
- // Attempt to remove floating point division out of the loop by
- // converting it to a reciprocal multiplication.
- if (I.getOpcode() == Instruction::FDiv &&
- CurLoop->isLoopInvariant(I.getOperand(1)) &&
- I.hasAllowReciprocal()) {
- auto Divisor = I.getOperand(1);
- auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
- auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
- ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
- ReciprocalDivisor->insertBefore(&I);
-
- auto Product =
- BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
- Product->setFastMathFlags(I.getFastMathFlags());
- Product->insertAfter(&I);
- I.replaceAllUsesWith(Product);
+ // Keep track of whether the prefix of instructions visited so far is such
+ // that the next instruction visited is guaranteed to execute if the loop
+ // is entered.
+ bool IsMustExecute = CurLoop->getHeader() == BB;
+
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to
+ // just fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
+ << '\n');
+ CurAST->copyValue(&I, C);
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ CurAST->deleteValue(&I);
I.eraseFromParent();
-
- hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
- Changed = true;
- continue;
}
+ Changed = true;
+ continue;
+ }
+
+ // Try hoisting the instruction out to the preheader. We can only do
+ // this if all of the operands of the instruction are loop invariant and
+ // if it is safe to hoist the instruction.
+ //
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
+ (IsMustExecute ||
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator()))) {
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ continue;
+ }
+
+ // Attempt to remove floating point division out of the loop by
+ // converting it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product =
+ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
- // Try hoisting the instruction out to the preheader. We can only do
- // this if all of the operands of the instruction are loop invariant and
- // if it is safe to hoist the instruction.
- //
- if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
- isSafeToExecuteUnconditionally(
- I, DT, CurLoop, SafetyInfo, ORE,
- CurLoop->getLoopPreheader()->getTerminator()))
- Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
}
+
+ if (IsMustExecute)
+ IsMustExecute = isGuaranteedToTransferExecutionToSuccessor(&I);
+ }
}
return Changed;
}
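The hoisting loop above still contains the reciprocal rewrite: a floating-point division by a loop-invariant divisor, when the allow-reciprocal fast-math flag permits it, becomes a multiplication by a reciprocal that can be hoisted to the preheader. The effect at the source level is roughly the following sketch (illustrative; the two forms can differ in the last bits of rounding, which is exactly why the flag is required):

#include <cstddef>

// Division by the loop-invariant D on every iteration.
double scaleSlow(const double *A, std::size_t N, double D) {
  double Sum = 0.0;
  for (std::size_t I = 0; I != N; ++I)
    Sum += A[I] / D;                 // fdiv inside the loop
  return Sum;
}

// What the reciprocal transformation amounts to: compute 1/D once (hoistable
// to the preheader) and multiply inside the loop.
double scaleFast(const double *A, std::size_t N, double D) {
  const double Recip = 1.0 / D;      // hoisted out of the loop
  double Sum = 0.0;
  for (std::size_t I = 0; I != N; ++I)
    Sum += A[I] * Recip;             // fmul inside the loop
  return Sum;
}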
-/// Computes loop safety information, checks loop body & header
-/// for the possibility of may throw exception.
-///
-void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
- assert(CurLoop != nullptr && "CurLoop cant be null");
- BasicBlock *Header = CurLoop->getHeader();
- // Setting default safety values.
- SafetyInfo->MayThrow = false;
- SafetyInfo->HeaderMayThrow = false;
- // Iterate over header and compute safety info.
- for (BasicBlock::iterator I = Header->begin(), E = Header->end();
- (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
- SafetyInfo->HeaderMayThrow |=
- !isGuaranteedToTransferExecutionToSuccessor(&*I);
-
- SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
- // Iterate over loop instructions and compute safety info.
- // Skip header as it has been computed and stored in HeaderMayThrow.
- // The first block in loopinfo.Blocks is guaranteed to be the header.
- assert(Header == *CurLoop->getBlocks().begin() &&
- "First block must be header");
- for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
- BBE = CurLoop->block_end();
- (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
- for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
- (I != E) && !SafetyInfo->MayThrow; ++I)
- SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I);
-
- // Compute funclet colors if we might sink/hoist in a function with a funclet
- // personality routine.
- Function *Fn = CurLoop->getHeader()->getParent();
- if (Fn->hasPersonalityFn())
- if (Constant *PersonalityFn = Fn->getPersonalityFn())
- if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
- SafetyInfo->BlockColors = colorEHFunclets(*Fn);
-}
-
// Return true if LI is invariant within scope of the loop. LI is invariant if
// CurLoop is dominated by an invariant.start representing the same memory
// location and size as the memory location LI loads from, and also the
@@ -708,7 +690,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
/// This is true when all incoming values are that instruction.
/// This pattern occurs most often with LCSSA PHI nodes.
///
-static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
for (const Value *IncValue : PN.incoming_values())
if (IncValue != &I)
return false;
@@ -838,12 +820,12 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
return New;
}
-static Instruction *sinkThroughTriviallyReplacablePHI(
+static Instruction *sinkThroughTriviallyReplaceablePHI(
PHINode *TPN, Instruction *I, LoopInfo *LI,
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
- assert(isTriviallyReplacablePHI(*TPN, *I) &&
- "Expect only trivially replacalbe PHI");
+ assert(isTriviallyReplaceablePHI(*TPN, *I) &&
+ "Expect only trivially replaceable PHI");
BasicBlock *ExitBlock = TPN->getParent();
Instruction *New;
auto It = SunkCopies.find(ExitBlock);
@@ -886,7 +868,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
// Split predecessors of the loop exit to make instructions in the loop are
- // exposed to exit blocks through trivially replacable PHIs while keeping the
+ // exposed to exit blocks through trivially replaceable PHIs while keeping the
// loop in the canonical form where each predecessor of each exit block should
// be contained within the loop. For example, this will convert the loop below
// from
@@ -898,7 +880,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// %v2 =
// br %LE, %LB1
// LE:
- // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+ // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
//
// to
//
@@ -909,10 +891,10 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// %v2 =
// br %LE.split2, %LB1
// LE.split:
- // %p1 = phi [%v1, %LB1] <-- trivially replacable
+ // %p1 = phi [%v1, %LB1] <-- trivially replaceable
// br %LE
// LE.split2:
- // %p2 = phi [%v2, %LB2] <-- trivially replacable
+ // %p2 = phi [%v2, %LB2] <-- trivially replaceable
// br %LE
// LE:
// %p = phi [%p1, %LE.split], [%p2, %LE.split2]
@@ -929,8 +911,14 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// Since we do not allow splitting EH-block with BlockColors in
// canSplitPredecessors(), we can simply assign predecessor's color to
// the new block.
- if (!BlockColors.empty())
- BlockColors[NewPred] = BlockColors[PredBB];
+ if (!BlockColors.empty()) {
+ // Grab a reference to the ColorVector to be inserted before getting the
+ // reference to the vector we are copying because inserting the new
+ // element in BlockColors might cause the map to be reallocated.
+ ColorVector &ColorsForNewBlock = BlockColors[NewPred];
+ ColorVector &ColorsForOldBlock = BlockColors[PredBB];
+ ColorsForNewBlock = ColorsForOldBlock;
+ }
}
PredBBs.remove(PredBB);
}
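The BlockColors change above is about reference stability: inserting the entry for the new predecessor may grow the DenseMap and invalidate any reference already taken into it, so the possibly-reallocating insertion must happen before the reference to the existing entry is read. The toy map below makes that hazard concrete with a vector-backed operator[]; it is an illustration of the pitfall, not llvm::DenseMap.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Toy map whose operator[] appends to a vector; like DenseMap, growing the
// storage invalidates references handed out earlier.
struct FlatMap {
  std::vector<std::pair<int, std::string>> Slots;
  std::string &operator[](int Key) {
    for (auto &S : Slots)
      if (S.first == Key)
        return S.second;
    Slots.push_back({Key, ""});     // may reallocate and move every slot
    return Slots.back().second;
  }
};

int main() {
  FlatMap BlockColors;
  BlockColors[1] = "red";
  // Unsafe shape: BlockColors[2] = BlockColors[1]; the reference to the
  // existing entry can be invalidated when inserting the new key reallocates.
  // Safe shape (what the patch does): do the inserting lookup first, then
  // take the reference to the entry being copied.
  std::string &ColorsForNewBlock = BlockColors[2]; // may reallocate
  std::string &ColorsForOldBlock = BlockColors[1]; // stable after the insert
  ColorsForNewBlock = ColorsForOldBlock;
  std::printf("%s\n", BlockColors[2].c_str());
  return 0;
}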
@@ -944,7 +932,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
- DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
<< "sinking " << ore::NV("Inst", &I);
@@ -987,14 +975,14 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
}
VisitedUsers.insert(PN);
- if (isTriviallyReplacablePHI(*PN, I))
+ if (isTriviallyReplaceablePHI(*PN, I))
continue;
if (!canSplitPredecessors(PN, SafetyInfo))
return Changed;
// Split predecessors of the PHI so that we can make users trivially
- // replacable.
+ // replaceable.
splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo);
// Should rebuild the iterators, as they may be invalidated by
@@ -1029,9 +1017,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
PHINode *PN = cast<PHINode>(User);
assert(ExitBlockSet.count(PN->getParent()) &&
"The LCSSA PHI is not in an exit block!");
- // The PHI must be trivially replacable.
- Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
- SafetyInfo, CurLoop);
+ // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplaceablePHI(PN, &I, LI, SunkCopies,
+ SafetyInfo, CurLoop);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
Changed = true;
@@ -1046,8 +1034,8 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE) {
auto *Preheader = CurLoop->getLoopPreheader();
- DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
+ << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
<< ore::NV("Inst", &I);
@@ -1236,7 +1224,7 @@ bool llvm::promoteLoopAccessesToScalars(
Value *SomePtr = *PointerMustAliases.begin();
BasicBlock *Preheader = CurLoop->getLoopPreheader();
- // It isn't safe to promote a load/store from the loop if the load/store is
+ // It is not safe to promote a load/store from the loop if the load/store is
// conditional. For example, turning:
//
// for () { if (c) *P += 1; }
@@ -1365,7 +1353,7 @@ bool llvm::promoteLoopAccessesToScalars(
// If a store dominates all exit blocks, it is safe to sink.
// As explained above, if an exit block was executed, a dominating
- // store must have been been executed at least once, so we are not
+ // store must have been executed at least once, so we are not
// introducing stores on paths that did not have them.
// Note that this only looks at explicit exit blocks. If we ever
// start sinking stores into unwind edges (see above), this will break.
@@ -1427,8 +1415,8 @@ bool llvm::promoteLoopAccessesToScalars(
return false;
// Otherwise, this is safe to promote, lets do it!
- DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
- << '\n');
+ LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
LoopUses[0])
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 7f7c6de76450..3b41b5d96c86 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -71,7 +71,7 @@ public:
private:
bool runOnLoop(Loop *L);
- /// \brief Check if the the stride of the accesses is large enough to
+ /// Check if the stride of the accesses is large enough to
/// warrant a prefetch.
bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
@@ -244,9 +244,9 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
for (const auto BB : L->blocks()) {
@@ -275,7 +275,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (!LSCEVAddRec)
continue;
- // Check if the the stride of the accesses is large enough to warrant a
+ // Check if the stride of the accesses is large enough to warrant a
// prefetch.
if (!isStrideLargeEnough(LSCEVAddRec))
continue;
@@ -320,8 +320,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
++NumPrefetches;
- DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
- << "\n");
+ LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
+ << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
<< "prefetched memory access";
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index 15cd1086f209..d412025d7e94 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -142,14 +142,15 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// of trouble.
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader || !L->hasDedicatedExits()) {
- DEBUG(dbgs()
- << "Deletion requires Loop with preheader and dedicated exits.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
return LoopDeletionResult::Unmodified;
}
// We can't remove loops that contain subloops. If the subloops were dead,
// they would already have been removed in earlier executions of this pass.
if (L->begin() != L->end()) {
- DEBUG(dbgs() << "Loop contains subloops.\n");
+ LLVM_DEBUG(dbgs() << "Loop contains subloops.\n");
return LoopDeletionResult::Unmodified;
}
@@ -157,7 +158,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
BasicBlock *ExitBlock = L->getUniqueExitBlock();
if (ExitBlock && isLoopNeverExecuted(L)) {
- DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
// Set incoming value to undef for phi nodes in the exit block.
for (PHINode &P : ExitBlock->phis()) {
std::fill(P.incoming_values().begin(), P.incoming_values().end(),
@@ -178,13 +179,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
if (!ExitBlock) {
- DEBUG(dbgs() << "Deletion requires single exit block\n");
+ LLVM_DEBUG(dbgs() << "Deletion requires single exit block\n");
return LoopDeletionResult::Unmodified;
}
// Finally, we have to check that the loop really is dead.
bool Changed = false;
if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
- DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
+ LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
return Changed ? LoopDeletionResult::Modified
: LoopDeletionResult::Unmodified;
}
@@ -193,12 +194,12 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// They could be infinite, in which case we'd be changing program behavior.
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S)) {
- DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
+ LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
return Changed ? LoopDeletionResult::Modified
: LoopDeletionResult::Unmodified;
}
- DEBUG(dbgs() << "Loop is invariant, delete it!");
+ LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
deleteDeadLoop(L, &DT, &SE, &LI);
++NumDeleted;
@@ -209,8 +210,8 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &Updater) {
- DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- DEBUG(L.dump());
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L.dump());
std::string LoopName = L.getName();
auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI);
if (Result == LoopDeletionResult::Unmodified)
@@ -255,8 +256,8 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- DEBUG(L->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L->dump());
LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI);
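
Taken together, the checks in deleteLoopIfDead above form a narrow contract: the loop needs a preheader and dedicated exits, must contain no subloops, must have a unique exit block, must not compute anything used afterwards that is not loop-invariant, and its maximum backedge-taken count must be computable so it is known to terminate. A hedged source-level illustration of loops on either side of that line:

int g;

void deletable(int n) {
  // No side effects, nothing used after the loop, computable trip count:
  // the entire loop can be erased.
  int t = 0;
  for (int i = 0; i < n; ++i)
    t += i;
  (void)t;
}

void not_deletable(bool (*poll)()) {
  // The backedge-taken count is not computable and the loop may be infinite
  // (and the call has side effects), so removing it would change behavior.
  while (poll())
    ;
  g = 1;
}
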
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 0d7e3db901cb..06083a4f5086 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -111,7 +111,7 @@ STATISTIC(NumLoopsDistributed, "Number of loops distributed");
namespace {
-/// \brief Maintains the set of instructions of the loop for a partition before
+/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
using InstructionSet = SmallPtrSet<Instruction *, 8>;
@@ -122,20 +122,20 @@ public:
Set.insert(I);
}
- /// \brief Returns whether this partition contains a dependence cycle.
+ /// Returns whether this partition contains a dependence cycle.
bool hasDepCycle() const { return DepCycle; }
- /// \brief Adds an instruction to this partition.
+ /// Adds an instruction to this partition.
void add(Instruction *I) { Set.insert(I); }
- /// \brief Collection accessors.
+ /// Collection accessors.
InstructionSet::iterator begin() { return Set.begin(); }
InstructionSet::iterator end() { return Set.end(); }
InstructionSet::const_iterator begin() const { return Set.begin(); }
InstructionSet::const_iterator end() const { return Set.end(); }
bool empty() const { return Set.empty(); }
- /// \brief Moves this partition into \p Other. This partition becomes empty
+ /// Moves this partition into \p Other. This partition becomes empty
/// after this.
void moveTo(InstPartition &Other) {
Other.Set.insert(Set.begin(), Set.end());
@@ -143,7 +143,7 @@ public:
Other.DepCycle |= DepCycle;
}
- /// \brief Populates the partition with a transitive closure of all the
+ /// Populates the partition with a transitive closure of all the
/// instructions that the seeded instructions depend on.
void populateUsedSet() {
// FIXME: We currently don't use control-dependence but simply include all
@@ -166,7 +166,7 @@ public:
}
}
- /// \brief Clones the original loop.
+ /// Clones the original loop.
///
/// Updates LoopInfo and DominatorTree using the information that block \p
/// LoopDomBB dominates the loop.
@@ -179,27 +179,27 @@ public:
return ClonedLoop;
}
- /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// The cloned loop. If this partition is mapped to the original loop,
/// this is null.
const Loop *getClonedLoop() const { return ClonedLoop; }
- /// \brief Returns the loop where this partition ends up after distribution.
+ /// Returns the loop where this partition ends up after distribution.
/// If this partition is mapped to the original loop then use the block from
/// the loop.
const Loop *getDistributedLoop() const {
return ClonedLoop ? ClonedLoop : OrigLoop;
}
- /// \brief The VMap that is populated by cloning and then used in
+ /// The VMap that is populated by cloning and then used in
/// remapInstructions to remap the cloned instructions.
ValueToValueMapTy &getVMap() { return VMap; }
- /// \brief Remaps the cloned instructions using VMap.
+ /// Remaps the cloned instructions using VMap.
void remapInstructions() {
remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
}
- /// \brief Based on the set of instructions selected for this partition,
+ /// Based on the set of instructions selected for this partition,
/// removes the unnecessary ones.
void removeUnusedInsts() {
SmallVector<Instruction *, 8> Unused;
@@ -239,30 +239,30 @@ public:
}
private:
- /// \brief Instructions from OrigLoop selected for this partition.
+ /// Instructions from OrigLoop selected for this partition.
InstructionSet Set;
- /// \brief Whether this partition contains a dependence cycle.
+ /// Whether this partition contains a dependence cycle.
bool DepCycle;
- /// \brief The original loop.
+ /// The original loop.
Loop *OrigLoop;
- /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// The cloned loop. If this partition is mapped to the original loop,
/// this is null.
Loop *ClonedLoop = nullptr;
- /// \brief The blocks of ClonedLoop including the preheader. If this
+ /// The blocks of ClonedLoop including the preheader. If this
/// partition is mapped to the original loop, this is empty.
SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
- /// \brief These gets populated once the set of instructions have been
+ /// These get populated once the set of instructions has been
/// finalized. If this partition is mapped to the original loop, these are not
/// set.
ValueToValueMapTy VMap;
};
-/// \brief Holds the set of Partitions. It populates them, merges them and then
+/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
using InstToPartitionIdT = DenseMap<Instruction *, int>;
@@ -271,10 +271,10 @@ public:
InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
: L(L), LI(LI), DT(DT) {}
- /// \brief Returns the number of partitions.
+ /// Returns the number of partitions.
unsigned getSize() const { return PartitionContainer.size(); }
- /// \brief Adds \p Inst into the current partition if that is marked to
+ /// Adds \p Inst into the current partition if that is marked to
/// contain cycles. Otherwise start a new partition for it.
void addToCyclicPartition(Instruction *Inst) {
// If the current partition is non-cyclic. Start a new one.
@@ -284,7 +284,7 @@ public:
PartitionContainer.back().add(Inst);
}
- /// \brief Adds \p Inst into a partition that is not marked to contain
+ /// Adds \p Inst into a partition that is not marked to contain
/// dependence cycles.
///
// Initially we isolate memory instructions into as many partitions as
@@ -293,7 +293,7 @@ public:
PartitionContainer.emplace_back(Inst, L);
}
- /// \brief Merges adjacent non-cyclic partitions.
+ /// Merges adjacent non-cyclic partitions.
///
/// The idea is that we currently only want to isolate the non-vectorizable
/// partition. We could later allow more distribution among these partition
@@ -303,7 +303,7 @@ public:
[](const InstPartition *P) { return !P->hasDepCycle(); });
}
- /// \brief If a partition contains only conditional stores, we won't vectorize
+ /// If a partition contains only conditional stores, we won't vectorize
/// it. Try to merge it with a previous cyclic partition.
void mergeNonIfConvertible() {
mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
@@ -323,14 +323,14 @@ public:
});
}
- /// \brief Merges the partitions according to various heuristics.
+ /// Merges the partitions according to various heuristics.
void mergeBeforePopulating() {
mergeAdjacentNonCyclic();
if (!DistributeNonIfConvertible)
mergeNonIfConvertible();
}
- /// \brief Merges partitions in order to ensure that no loads are duplicated.
+ /// Merges partitions in order to ensure that no loads are duplicated.
///
/// We can't duplicate loads because that could potentially reorder them.
/// LoopAccessAnalysis provides dependency information with the context that
@@ -362,9 +362,11 @@ public:
std::tie(LoadToPart, NewElt) =
LoadToPartition.insert(std::make_pair(Inst, PartI));
if (!NewElt) {
- DEBUG(dbgs() << "Merging partitions due to this load in multiple "
- << "partitions: " << PartI << ", "
- << LoadToPart->second << "\n" << *Inst << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", " << LoadToPart->second
+ << "\n"
+ << *Inst << "\n");
auto PartJ = I;
do {
@@ -398,7 +400,7 @@ public:
return true;
}
- /// \brief Sets up the mapping between instructions to partitions. If the
+ /// Sets up the mapping between instructions to partitions. If the
/// instruction is duplicated across multiple partitions, set the entry to -1.
void setupPartitionIdOnInstructions() {
int PartitionID = 0;
@@ -416,14 +418,14 @@ public:
}
}
- /// \brief Populates the partition with everything that the seeding
+ /// Populates the partition with everything that the seeding
/// instructions require.
void populateUsedSet() {
for (auto &P : PartitionContainer)
P.populateUsedSet();
}
- /// \brief This performs the main chunk of the work of cloning the loops for
+ /// This performs the main chunk of the work of cloning the loops for
/// the partitions.
void cloneLoops() {
BasicBlock *OrigPH = L->getLoopPreheader();
@@ -470,13 +472,13 @@ public:
Curr->getDistributedLoop()->getExitingBlock());
}
- /// \brief Removes the dead instructions from the cloned loops.
+ /// Removes the dead instructions from the cloned loops.
void removeUnusedInsts() {
for (auto &Partition : PartitionContainer)
Partition.removeUnusedInsts();
}
- /// \brief For each memory pointer, it computes the partitionId the pointer is
+ /// For each memory pointer, it computes the partitionId the pointer is
/// used in.
///
/// This returns an array of int where the I-th entry corresponds to I-th
@@ -543,10 +545,10 @@ public:
private:
using PartitionContainerT = std::list<InstPartition>;
- /// \brief List of partitions.
+ /// List of partitions.
PartitionContainerT PartitionContainer;
- /// \brief Mapping from Instruction to partition Id. If the instruction
+ /// Mapping from Instruction to partition Id. If the instruction
/// belongs to multiple partitions the entry contains -1.
InstToPartitionIdT InstToPartitionId;
@@ -554,7 +556,7 @@ private:
LoopInfo *LI;
DominatorTree *DT;
- /// \brief The control structure to merge adjacent partitions if both satisfy
+ /// The control structure to merge adjacent partitions if both satisfy
/// the \p Predicate.
template <class UnaryPredicate>
void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
@@ -575,7 +577,7 @@ private:
}
};
-/// \brief For each memory instruction, this class maintains difference of the
+/// For each memory instruction, this class maintains difference of the
/// number of unsafe dependences that start out from this instruction minus
/// those that end here.
///
@@ -602,7 +604,7 @@ public:
const SmallVectorImpl<Dependence> &Dependences) {
Accesses.append(Instructions.begin(), Instructions.end());
- DEBUG(dbgs() << "Backward dependences:\n");
+ LLVM_DEBUG(dbgs() << "Backward dependences:\n");
for (auto &Dep : Dependences)
if (Dep.isPossiblyBackward()) {
// Note that the designations source and destination follow the program
@@ -611,7 +613,7 @@ public:
++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
--Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
- DEBUG(Dep.print(dbgs(), 2, Instructions));
+ LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
}
}
@@ -619,7 +621,7 @@ private:
AccessesType Accesses;
};
-/// \brief The actual class performing the per-loop work.
+/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
@@ -628,12 +630,13 @@ public:
setForced();
}
- /// \brief Try to distribute an inner-most loop.
+ /// Try to distribute an inner-most loop.
bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
assert(L->empty() && "Only process inner loops.");
- DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nLDist: In \""
+ << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
if (!L->getExitBlock())
return fail("MultipleExitBlocks", "multiple exit blocks");
@@ -705,7 +708,7 @@ public:
for (auto *Inst : DefsUsedOutside)
Partitions.addToNewNonCyclicPartition(Inst);
- DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
@@ -713,20 +716,20 @@ public:
// Run the merge heuristics: Merge non-cyclic adjacent partitions since we
// should be able to vectorize these together.
Partitions.mergeBeforePopulating();
- DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
// Now, populate the partitions with non-memory operations.
Partitions.populateUsedSet();
- DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
// In order to preserve original lexical order for loads, keep them in the
// partition that we set up in the MemoryInstructionDependences loop.
if (Partitions.mergeToAvoidDuplicatedLoads()) {
- DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
- << Partitions);
+ LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+ << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
@@ -740,7 +743,7 @@ public:
return fail("TooManySCEVRuntimeChecks",
"too many SCEV run-time checks needed.\n");
- DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
// We're done forming the partitions; set up the reverse mapping from
// instructions to partitions.
Partitions.setupPartitionIdOnInstructions();
@@ -759,8 +762,8 @@ public:
RtPtrChecking);
if (!Pred.isAlwaysTrue() || !Checks.empty()) {
- DEBUG(dbgs() << "\nPointers:\n");
- DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LLVM_DEBUG(dbgs() << "\nPointers:\n");
+ LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
LVer.setAliasChecks(std::move(Checks));
LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
@@ -775,12 +778,12 @@ public:
// Now, we remove the instructions from each loop that don't belong to that
// partition.
Partitions.removeUnusedInsts();
- DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
- DEBUG(Partitions.printBlocks());
+ LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ LLVM_DEBUG(Partitions.printBlocks());
if (LDistVerify) {
LI->verify(*DT);
- DT->verifyDomTree();
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
++NumLoopsDistributed;
@@ -793,12 +796,12 @@ public:
return true;
}
- /// \brief Provide diagnostics then \return with false.
+ /// Provide diagnostics then \return with false.
bool fail(StringRef RemarkName, StringRef Message) {
LLVMContext &Ctx = F->getContext();
bool Forced = isForced().getValueOr(false);
- DEBUG(dbgs() << "Skipping; " << Message << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
// With Rpass-missed report that distribution failed.
ORE->emit([&]() {
@@ -826,7 +829,7 @@ public:
return false;
}
- /// \brief Return if distribution forced to be enabled/disabled for the loop.
+ /// Return if distribution forced to be enabled/disabled for the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
/// to be enabled (true) or disabled (false). If the optional has no value
@@ -834,7 +837,7 @@ public:
const Optional<bool> &isForced() const { return IsForced; }
private:
- /// \brief Filter out checks between pointers from the same partition.
+ /// Filter out checks between pointers from the same partition.
///
/// \p PtrToPartition contains the partition number for pointers. Partition
/// number -1 means that the pointer is used in multiple partitions. In this
@@ -873,7 +876,7 @@ private:
return Checks;
}
- /// \brief Check whether the loop metadata is forcing distribution to be
+ /// Check whether the loop metadata is forcing distribution to be
/// enabled/disabled.
void setForced() {
Optional<const MDOperand *> Value =
@@ -896,7 +899,7 @@ private:
ScalarEvolution *SE;
OptimizationRemarkEmitter *ORE;
- /// \brief Indicates whether distribution is forced to be enabled/disabled for
+ /// Indicates whether distribution is forced to be enabled/disabled for
/// the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
@@ -939,7 +942,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
namespace {
-/// \brief The pass class.
+/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
static char ID;
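
As a concrete picture of what the partitioning machinery above buys: distribution isolates a dependence cycle into its own loop so that the remaining, cycle-free partition can be vectorized on its own. A minimal source-level sketch, not taken from the patch:

void before(int *a, int *b, int *c, int *d, int n) {
  for (int i = 1; i < n; ++i) {
    a[i] = a[i - 1] + b[i];   // loop-carried dependence: blocks vectorization
    c[i] = d[i] * 2;          // independent work
  }
}

void after(int *a, int *b, int *c, int *d, int n) {
  // Partition containing the dependence cycle stays sequential.
  for (int i = 1; i < n; ++i)
    a[i] = a[i - 1] + b[i];
  // Cycle-free partition can now be vectorized separately.
  for (int i = 1; i < n; ++i)
    c[i] = d[i] * 2;
}

When the pointers may alias, the pass versions the loop with the runtime checks built in the hunks above (LoopVersioning) and only runs the distributed form when those checks pass.
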
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 21551f0a0825..d8692198f7a3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -37,7 +37,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -57,6 +56,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -87,8 +87,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
@@ -188,8 +188,9 @@ private:
PHINode *CntPhi, Value *Var);
bool recognizeAndInsertCTLZ();
void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
- PHINode *CntPhi, Value *Var, const DebugLoc DL,
- bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
+ PHINode *CntPhi, Value *Var, Instruction *DefX,
+ const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -310,9 +311,9 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
SmallVector<BasicBlock *, 8> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
- DEBUG(dbgs() << "loop-idiom Scanning: F["
- << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
- << CurLoop->getHeader()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "loop-idiom Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
bool MadeChange = false;
@@ -756,8 +757,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
MSIs.insert(MSI);
bool NegStride = SizeInBytes == -Stride;
return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
- MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
- BECount, NegStride, /*IsLoopMemset=*/true);
+ MSI->getDestAlignment(), SplatValue, MSI, MSIs,
+ Ev, BECount, NegStride, /*IsLoopMemset=*/true);
}
/// mayLoopAccessLocation - Return true if the specified loop might access the
@@ -936,8 +937,9 @@ bool LoopIdiomRecognize::processLoopStridedStore(
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
}
- DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
- << " from store to: " << *Ev << " at: " << *TheStore << "\n");
+ LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore
+ << "\n");
NewCall->setDebugLoc(TheStore->getDebugLoc());
// Okay, the memset has been formed. Zap the original store and anything that
@@ -1037,16 +1039,17 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
- unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
CallInst *NewCall = nullptr;
// Check whether to generate an unordered atomic memcpy:
- // If the load or store are atomic, then they must neccessarily be unordered
+ // If the load or store are atomic, then they must necessarily be unordered
// by previous checks.
if (!SI->isAtomic() && !LI->isAtomic())
- NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
+ LoadBasePtr, LI->getAlignment(), NumBytes);
else {
// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.
+ unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
if (Align < StoreSize)
return false;
@@ -1066,9 +1069,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
}
NewCall->setDebugLoc(SI->getDebugLoc());
- DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
+ << "\n");
// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.
@@ -1084,9 +1088,9 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
bool IsLoopMemset) {
if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
- DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
- << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
- << " avoided: multi-block top-level loop\n");
+ LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
return true;
}
}
@@ -1195,14 +1199,13 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
VarX1 = DefX2->getOperand(0);
SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
}
- if (!SubOneOp)
+ if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
return false;
- Instruction *SubInst = cast<Instruction>(SubOneOp);
- ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
if (!Dec ||
- !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
- (SubInst->getOpcode() == Instruction::Add &&
+ !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubOneOp->getOpcode() == Instruction::Add &&
Dec->isMinusOne()))) {
return false;
}
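
The pattern matched here is the classic "clear the lowest set bit" popcount loop. An illustration of the idiom and of the countable form the pass effectively rewrites it into (shown at the source level as an approximation, not the exact IR that is produced):

// Idiom recognized by detectPopcountIdiom: cnt counts iterations while
// x &= x - 1 clears one set bit per trip.
unsigned popcount_loop(unsigned x) {
  unsigned cnt = 0;
  while (x) {
    x &= x - 1;
    ++cnt;
  }
  return cnt;
}

// Once the trip count is known to equal the population count, the loop is
// countable and, if counting is all it does, it collapses to the intrinsic.
unsigned popcount_intrinsic(unsigned x) {
  return static_cast<unsigned>(__builtin_popcount(x));
}
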
@@ -1314,7 +1317,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
return false;
// step 2: detect instructions corresponding to "x.next = x >> 1"
- if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ if (!DefX || (DefX->getOpcode() != Instruction::AShr &&
+ DefX->getOpcode() != Instruction::LShr))
return false;
ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
if (!Shft || !Shft->isOne())
@@ -1372,13 +1376,13 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
bool IsCntPhiUsedOutsideLoop = false;
for (User *U : CntPhi->users())
- if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntPhiUsedOutsideLoop = true;
break;
}
bool IsCntInstUsedOutsideLoop = false;
for (User *U : CntInst->users())
- if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntInstUsedOutsideLoop = true;
break;
}
@@ -1395,16 +1399,27 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
// parent function RunOnLoop.
BasicBlock *PH = CurLoop->getLoopPreheader();
Value *InitX = PhiX->getIncomingValueForBlock(PH);
- // If we check X != 0 before entering the loop we don't need a zero
- // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
- // loop (if it is used we count CTLZ(X >> 1)).
- if (!IsCntPhiUsedOutsideLoop)
- if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
- if (BranchInst *PreCondBr =
- dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
- if (matchCondition(PreCondBr, PH) == InitX)
- ZeroCheck = true;
- }
+
+ // Make sure the initial value can't be negative otherwise the ashr in the
+ // loop might never reach zero which would make the loop infinite.
+ if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL))
+ return false;
+
+ // If we are using the count instruction outside the loop, make sure we
+ // have a zero check as a precondition. Without the check the loop would run
+ // one iteration before any check of the input value. This means 0 and 1
+ // would have identical behavior in the original loop, and thus the
+ // ctlz-based count introduced by the transform could disagree with the
+ // original loop's count for an input of 0 unless the zero check guards it.
+ if (!IsCntPhiUsedOutsideLoop) {
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI)
+ return false;
+ if (matchCondition(PreCondBI, PH) != InitX)
+ return false;
+ ZeroCheck = true;
+ }
// Check if CTLZ intrinsic is profitable. Assume it is always profitable
// if we delete the loop (the loop has only 6 instructions):
@@ -1415,17 +1430,16 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
// %inc = add nsw %i.0, 1
// br i1 %tobool
- IRBuilder<> Builder(PH->getTerminator());
- SmallVector<const Value *, 2> Ops =
- {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
- ArrayRef<const Value *> Args(Ops);
+ const Value *Args[] =
+ {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
+ : ConstantInt::getFalse(InitX->getContext())};
if (CurLoop->getHeader()->size() != 6 &&
TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
TargetTransformInfo::TCC_Basic)
return false;
- const DebugLoc DL = DefX->getDebugLoc();
- transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX,
+ DefX->getDebugLoc(), ZeroCheck,
IsCntPhiUsedOutsideLoop);
return true;
}
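
The shift-until-zero loop handled here becomes countable once its trip count is expressed with ctlz: for a W-bit value the loop runs W - ctlz(x) times, with the zero input covered by the precondition discussed above. An illustrative before/after at the source level, assuming a 32-bit unsigned value:

// Idiom recognized by detectCTLZIdiom / recognizeAndInsertCTLZ:
// count how many shifts it takes for x to reach zero.
unsigned bit_length_loop(unsigned x) {
  unsigned cnt = 0;
  while (x) {
    x >>= 1;   // lshr, or ashr when the value is known non-negative
    ++cnt;
  }
  return cnt;
}

// Countable form: the trip count is 32 - ctlz(x); the x != 0 guard plays the
// role of the zero-check precondition.
unsigned bit_length_ctlz(unsigned x) {
  return x ? 32u - static_cast<unsigned>(__builtin_clz(x)) : 0u;
}
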
@@ -1461,7 +1475,7 @@ bool LoopIdiomRecognize::recognizePopcount() {
if (!EntryBI || EntryBI->isConditional())
return false;
- // It should have a precondition block where the generated popcount instrinsic
+ // It should have a precondition block where the generated popcount intrinsic
// function can be inserted.
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
@@ -1539,8 +1553,9 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
void LoopIdiomRecognize::transformLoopToCountable(
BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
- const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
- BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+ Instruction *DefX, const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
// Step 1: Insert the CTLZ instruction at the end of the preheader block
// Count = BitWidth - CTLZ(InitX);
@@ -1550,10 +1565,16 @@ void LoopIdiomRecognize::transformLoopToCountable(
Builder.SetCurrentDebugLocation(DL);
Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
- if (IsCntPhiUsedOutsideLoop)
- InitXNext = Builder.CreateAShr(InitX,
- ConstantInt::get(InitX->getType(), 1));
- else
+ if (IsCntPhiUsedOutsideLoop) {
+ if (DefX->getOpcode() == Instruction::AShr)
+ InitXNext =
+ Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else if (DefX->getOpcode() == Instruction::LShr)
+ InitXNext =
+ Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else
+ llvm_unreachable("Unexpected opcode!");
+ } else
InitXNext = InitX;
CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
Count = Builder.CreateSub(
@@ -1588,7 +1609,7 @@ void LoopIdiomRecognize::transformLoopToCountable(
// ...
// Br: loop if (Dec != 0)
BasicBlock *Body = *(CurLoop->block_begin());
- auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = Count->getType();
@@ -1625,8 +1646,8 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
BasicBlock *PreHead = CurLoop->getLoopPreheader();
- auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- const DebugLoc DL = CntInst->getDebugLoc();
+ auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc &DL = CntInst->getDebugLoc();
// Assume that before the transformation, the loop looks like the following:
// if (x) // the precondition
@@ -1675,7 +1696,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
}
// Step 3: Note that the population count is exactly the trip count of the
- // loop in question, which enable us to to convert the loop from noncountable
+ // loop in question, which enables us to convert the loop from noncountable
// loop into a countable one. The benefit is twofold:
//
// - If the loop only counts population, the entire loop becomes dead after
@@ -1696,7 +1717,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// do { cnt++; x &= x-1; t--) } while (t > 0);
BasicBlock *Body = *(CurLoop->block_begin());
{
- auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = TripCnt->getType();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 40d468a084d4..71859efbf4bd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -20,8 +20,10 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
@@ -34,7 +36,6 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <utility>
@@ -45,118 +46,116 @@ using namespace llvm;
STATISTIC(NumSimplified, "Number of redundant instructions simplified");
-static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
- AssumptionCache *AC,
- const TargetLibraryInfo *TLI) {
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
- array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
-
+static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC,
+ const TargetLibraryInfo &TLI) {
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+
+ // On the first pass over the loop body we try to simplify every instruction.
+ // On subsequent passes, we can restrict this to only simplifying instructions
+ // where the inputs have been updated. We end up needing two sets: one
+ // containing the instructions we are simplifying in *this* pass, and one for
+ // the instructions we will want to simplify in the *next* pass. We use
+ // pointers so we can swap between two stably allocated sets.
SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
- // The bit we are stealing from the pointer represents whether this basic
- // block is the header of a subloop, in which case we only process its phis.
- using WorklistItem = PointerIntPair<BasicBlock *, 1>;
- SmallVector<WorklistItem, 16> VisitStack;
- SmallPtrSet<BasicBlock *, 32> Visited;
-
- bool Changed = false;
- bool LocalChanged;
- do {
- LocalChanged = false;
-
- VisitStack.clear();
- Visited.clear();
+ // Track the PHI nodes that have already been visited during each iteration so
+ // that we can identify when it is necessary to iterate.
+ SmallPtrSet<PHINode *, 4> VisitedPHIs;
- VisitStack.push_back(WorklistItem(L->getHeader(), false));
+ // While simplifying we may discover dead code or cause code to become dead.
+ // Keep track of all such instructions and we will delete them at the end.
+ SmallVector<Instruction *, 8> DeadInsts;
- while (!VisitStack.empty()) {
- WorklistItem Item = VisitStack.pop_back_val();
- BasicBlock *BB = Item.getPointer();
- bool IsSubloopHeader = Item.getInt();
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ // First we want to create an RPO traversal of the loop body. By processing in
+ // RPO we can ensure that definitions are processed prior to uses (for non PHI
+ // uses) in all cases. This ensures we maximize the simplifications in each
+ // iteration over the loop and minimizes the possible causes for continuing to
+ // iterate.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
- // Simplify instructions in the current basic block.
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *I = &*BI++;
-
- // The first time through the loop ToSimplify is empty and we try to
- // simplify all instructions. On later iterations ToSimplify is not
- // empty and we only bother simplifying instructions that are in it.
- if (!ToSimplify->empty() && !ToSimplify->count(I))
+ bool Changed = false;
+ for (;;) {
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ if (auto *PI = dyn_cast<PHINode>(&I))
+ VisitedPHIs.insert(PI);
+
+ if (I.use_empty()) {
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
continue;
-
- // Don't bother simplifying unused instructions.
- if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
- if (V && LI->replacementPreservesLCSSAForm(I, V)) {
- // Mark all uses for resimplification next time round the loop.
- for (User *U : I->users())
- Next->insert(cast<Instruction>(U));
-
- I->replaceAllUsesWith(V);
- LocalChanged = true;
- ++NumSimplified;
- }
- }
- if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
- // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
- // instruction, so simply incrementing the iterator does not work.
- // When instructions get deleted re-iterate instead.
- BI = BB->begin();
- BE = BB->end();
- LocalChanged = true;
}
- if (IsSubloopHeader && !isa<PHINode>(I))
- break;
- }
+ // We special case the first iteration which we can detect due to the
+ // empty `ToSimplify` set.
+ bool IsFirstIteration = ToSimplify->empty();
- // Add all successors to the worklist, except for loop exit blocks and the
- // bodies of subloops. We visit the headers of loops so that we can
- // process
- // their phis, but we contract the rest of the subloop body and only
- // follow
- // edges leading back to the original loop.
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
- ++SI) {
- BasicBlock *SuccBB = *SI;
- if (!Visited.insert(SuccBB).second)
+ if (!IsFirstIteration && !ToSimplify->count(&I))
continue;
- const Loop *SuccLoop = LI->getLoopFor(SuccBB);
- if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
- L->contains(SuccLoop)) {
- VisitStack.push_back(WorklistItem(SuccBB, true));
-
- SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
- SuccLoop->getExitBlocks(SubLoopExitBlocks);
-
- for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
- BasicBlock *ExitBB = SubLoopExitBlocks[i];
- if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
- VisitStack.push_back(WorklistItem(ExitBB, false));
- }
-
+ Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
+ if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
continue;
- }
- bool IsExitBlock =
- std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
- if (IsExitBlock)
- continue;
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *UserI = cast<Instruction>(U.getUser());
+ U.set(V);
+
+ // If the instruction is used by a PHI node we have already processed
+ // we'll need to iterate on the loop body to converge, so add it to
+ // the next set.
+ if (auto *UserPI = dyn_cast<PHINode>(UserI))
+ if (VisitedPHIs.count(UserPI)) {
+ Next->insert(UserPI);
+ continue;
+ }
+
+ // If we are only simplifying targeted instructions and the user is an
+ // instruction in the loop body, add it to our set of targeted
+ // instructions. Because we process defs before uses (outside of PHIs)
+ // we won't have visited it yet.
+ //
+ // We also skip any uses outside of the loop being simplified. Those
+ // should always be PHI nodes due to LCSSA form, and we don't want to
+ // try to simplify those away.
+ assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
+ "Uses outside the loop should be PHI nodes due to LCSSA!");
+ if (!IsFirstIteration && L.contains(UserI))
+ ToSimplify->insert(UserI);
+ }
- VisitStack.push_back(WorklistItem(SuccBB, false));
+ assert(I.use_empty() && "Should always have replaced all uses!");
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
+ ++NumSimplified;
+ Changed = true;
}
}
- // Place the list of instructions to simplify on the next loop iteration
- // into ToSimplify.
- std::swap(ToSimplify, Next);
- Next->clear();
+ // Delete any dead instructions found thus far now that we've finished an
+ // iteration over all instructions in all the loop blocks.
+ if (!DeadInsts.empty()) {
+ Changed = true;
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI);
+ }
+
+ // If we never found a PHI that needs to be simplified in the next
+ // iteration, we're done.
+ if (Next->empty())
+ break;
- Changed |= LocalChanged;
- } while (LocalChanged);
+ // Otherwise, put the next set in place for the next iteration and reset it
+ // and the visited PHIs for that iteration.
+ std::swap(Next, ToSimplify);
+ Next->clear();
+ VisitedPHIs.clear();
+ DeadInsts.clear();
+ }
return Changed;
}
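
Structurally, the rewritten simplifyLoopInst above is a general fixed-point pattern: sweep the blocks in RPO so defs are seen before (non-PHI) uses, simplify, and queue only the things whose operands changed for the next sweep, swapping between two stably allocated sets. A stripped-down, generic sketch of that control structure (not the pass itself; Item and trySimplify are placeholders):

#include <unordered_set>
#include <vector>

// trySimplify(I) simplifies one item and returns the items whose inputs
// changed as a result and therefore may need revisiting.
template <typename Item, typename SimplifyFn>
void runToFixedPoint(const std::vector<Item> &RPOOrder, SimplifyFn trySimplify) {
  std::unordered_set<Item> S1, S2;
  auto *ToSimplify = &S1, *Next = &S2;

  for (;;) {
    bool FirstIteration = ToSimplify->empty();
    for (const Item &I : RPOOrder) {
      if (!FirstIteration && !ToSimplify->count(I))
        continue;                  // later sweeps only revisit affected items
      for (const Item &User : trySimplify(I))
        Next->insert(User);        // inputs changed; look at these next sweep
    }
    if (Next->empty())
      break;                       // nothing left to revisit: converged
    std::swap(ToSimplify, Next);
    Next->clear();
  }
}
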
@@ -174,21 +173,20 @@ public:
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (skipLoop(L))
return false;
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return SimplifyLoopInst(L, DT, LI, AC, TLI);
+ return simplifyLoopInst(*L, DT, LI, AC, TLI);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesCFG();
getLoopAnalysisUsage(AU);
@@ -200,7 +198,7 @@ public:
PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
+ if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 4f8dafef230a..2978165ed8a9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/DependenceAnalysis.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
@@ -50,6 +52,8 @@ using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
+STATISTIC(LoopsInterchanged, "Number of loops interchanged");
+
static cl::opt<int> LoopInterchangeCostThreshold(
"loop-interchange-threshold", cl::init(0), cl::Hidden,
cl::desc("Interchange if you gain more than this number"));
@@ -73,8 +77,8 @@ static const unsigned MaxLoopNestDepth = 10;
static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
for (auto D : Row)
- DEBUG(dbgs() << D << " ");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << D << " ");
+ LLVM_DEBUG(dbgs() << "\n");
}
}
#endif
@@ -103,8 +107,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
}
}
- DEBUG(dbgs() << "Found " << MemInstr.size()
- << " Loads and Stores to analyze\n");
+ LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
ValueVector::iterator I, IE, J, JE;
@@ -121,11 +125,11 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
// Track Output, Flow, and Anti dependencies.
if (auto D = DI->depends(Src, Dst, true)) {
assert(D->isOrdered() && "Expected an output, flow or anti dep.");
- DEBUG(StringRef DepType =
- D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
- dbgs() << "Found " << DepType
- << " dependency between Src and Dst\n"
- << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ LLVM_DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
unsigned Levels = D->getLevels();
char Direction;
for (unsigned II = 1; II <= Levels; ++II) {
@@ -165,17 +169,14 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
DepMatrix.push_back(Dep);
if (DepMatrix.size() > MaxMemInstrCount) {
- DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
- << " dependencies inside loop\n");
+ LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
return false;
}
}
}
}
- // We don't have a DepMatrix to check legality return false.
- if (DepMatrix.empty())
- return false;
return true;
}
@@ -271,9 +272,9 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
}
static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
- DEBUG(dbgs() << "Calling populateWorklist on Func: "
- << L.getHeader()->getParent()->getName() << " Loop: %"
- << L.getHeader()->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
LoopVector LoopList;
Loop *CurrentLoop = &L;
const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
@@ -404,7 +405,9 @@ public:
/// Interchange OuterLoop and InnerLoop.
bool transform();
- void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ void restructureLoops(Loop *NewInner, Loop *NewOuter,
+ BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader);
void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
private:
@@ -453,6 +456,9 @@ struct LoopInterchange : public FunctionPass {
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
}
bool runOnFunction(Function &F) override {
@@ -462,8 +468,7 @@ struct LoopInterchange : public FunctionPass {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
@@ -473,7 +478,7 @@ struct LoopInterchange : public FunctionPass {
for (Loop *L : *LI)
populateWorklist(*L, Worklist);
- DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+ LLVM_DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
bool Changed = true;
while (!Worklist.empty()) {
LoopVector LoopList = Worklist.pop_back_val();
@@ -486,15 +491,15 @@ struct LoopInterchange : public FunctionPass {
for (Loop *L : LoopList) {
const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
if (ExitCountOuter == SE->getCouldNotCompute()) {
- DEBUG(dbgs() << "Couldn't compute backedge count\n");
+ LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
return false;
}
if (L->getNumBackEdges() != 1) {
- DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
return false;
}
if (!L->getExitingBlock()) {
- DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
+ LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
return false;
}
}
@@ -511,53 +516,38 @@ struct LoopInterchange : public FunctionPass {
bool Changed = false;
unsigned LoopNestDepth = LoopList.size();
if (LoopNestDepth < 2) {
- DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
return false;
}
if (LoopNestDepth > MaxLoopNestDepth) {
- DEBUG(dbgs() << "Cannot handle loops of depth greater than "
- << MaxLoopNestDepth << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
return false;
}
if (!isComputableLoopNest(LoopList)) {
- DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
+ LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
return false;
}
- DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
+ LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
+ << "\n");
CharMatrix DependencyMatrix;
Loop *OuterMostLoop = *(LoopList.begin());
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
OuterMostLoop, DI)) {
- DEBUG(dbgs() << "Populating dependency matrix failed\n");
+ LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
return false;
}
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence before interchange\n");
+ LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
printDepMatrix(DependencyMatrix);
#endif
- BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
- BranchInst *OuterMostLoopLatchBI =
- dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
- if (!OuterMostLoopLatchBI)
- return false;
-
- // Since we currently do not handle LCSSA PHI's any failure in loop
- // condition will now branch to LoopNestExit.
- // TODO: This should be removed once we handle LCSSA PHI nodes.
-
// Get the Outermost loop exit.
- BasicBlock *LoopNestExit;
- if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
- LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
- else
- LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
-
- if (isa<PHINode>(LoopNestExit->begin())) {
- DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
- "since on failure all loops branch to loop nest exit.\n");
+ BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
+ if (!LoopNestExit) {
+ LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
return false;
}
@@ -573,9 +563,8 @@ struct LoopInterchange : public FunctionPass {
// Update the DependencyMatrix
interChangeDependencies(DependencyMatrix, i, i - 1);
- DT->recalculate(F);
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence after interchange\n");
+ LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
printDepMatrix(DependencyMatrix);
#endif
Changed |= Interchanged;
@@ -586,21 +575,21 @@ struct LoopInterchange : public FunctionPass {
bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
unsigned OuterLoopId, BasicBlock *LoopNestExit,
std::vector<std::vector<char>> &DependencyMatrix) {
- DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId << "\n");
+ LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
Loop *InnerLoop = LoopList[InnerLoopId];
Loop *OuterLoop = LoopList[OuterLoopId];
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
PreserveLCSSA, ORE);
if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+ LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
return false;
}
- DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- DEBUG(dbgs() << "Interchanging loops not profitable\n");
+ LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
@@ -614,7 +603,8 @@ struct LoopInterchange : public FunctionPass {
LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
LoopNestExit, LIL.hasInnerLoopReduction());
LIT.transform();
- DEBUG(dbgs() << "Loops interchanged\n");
+ LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
+ LoopsInterchanged++;
return true;
}
};
@@ -631,13 +621,13 @@ bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
BasicBlock *BB) {
- for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ for (Instruction &I : *BB) {
// Load corresponding to reduction PHI's are safe while concluding if
// tightly nested.
- if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
if (!areAllUsesReductions(L, InnerLoop))
return true;
- } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
return true;
}
return false;
@@ -645,13 +635,13 @@ bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
BasicBlock *BB) {
- for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ for (Instruction &I : *BB) {
// Stores corresponding to reductions are safe while concluding if tightly
// nested.
- if (StoreInst *L = dyn_cast<StoreInst>(I)) {
+ if (StoreInst *L = dyn_cast<StoreInst>(&I)) {
if (!isa<PHINode>(L->getOperand(0)))
return true;
- } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
return true;
}
return false;
@@ -662,7 +652,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- DEBUG(dbgs() << "Checking if loops are tightly nested\n");
+ LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
// A perfectly nested loop will not have any branch in between the outer and
// inner block i.e. outer header will branch to either inner preheader and
@@ -676,14 +666,14 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch)
return false;
- DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+ LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
// We do not have any basic block in between; now make sure the outer header
// and outer loop latch don't contain any unsafe instructions.
if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
containsUnsafeInstructionsInLatch(OuterLoopLatch))
return false;
- DEBUG(dbgs() << "Loops are perfectly nested\n");
+ LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
// We have a perfect loop nest.
return true;
}
@@ -717,16 +707,15 @@ bool LoopInterchangeLegality::findInductionAndReductions(
SmallVector<PHINode *, 8> &Reductions) {
if (!L->getLoopLatch() || !L->getLoopPredecessor())
return false;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ for (PHINode &PHI : L->getHeader()->phis()) {
RecurrenceDescriptor RD;
InductionDescriptor ID;
- PHINode *PHI = cast<PHINode>(I);
- if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
- Inductions.push_back(PHI);
- else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
- Reductions.push_back(PHI);
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+ Inductions.push_back(&PHI);
+ else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD))
+ Reductions.push_back(&PHI);
else {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
return false;
}
@@ -735,12 +724,11 @@ bool LoopInterchangeLegality::findInductionAndReductions(
}
static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
- for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = cast<PHINode>(I);
+ for (PHINode &PHI : Block->phis()) {
// Reduction lcssa phi will have only 1 incoming block that from loop latch.
- if (PHI->getNumIncomingValues() > 1)
+ if (PHI.getNumIncomingValues() > 1)
return false;
- Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
+ Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0));
if (!Ins)
return false;
// Incoming value for lcssa phi's in outer loop exit can only be inner loop
@@ -751,35 +739,38 @@ static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
return true;
}
-static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
- BasicBlock *LoopHeader) {
- if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
- assert(BI->getNumSuccessors() == 2 &&
- "Branch leaving loop latch must have 2 successors");
- for (BasicBlock *Succ : BI->successors()) {
- if (Succ == LoopHeader)
- continue;
- return Succ;
- }
- }
- return nullptr;
-}
-
// This function indicates the current limitations in the transform as a result
// of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+ // The transform currently expects the loop latches to also be the exiting
+ // blocks.
+ if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
+ OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
+ !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
+ !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "Loops where the latch is not the exiting block are not"
+ << " supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Loops where the latch is not the exiting block cannot be"
+ " interchanged currently.";
+ });
+ return true;
+ }
PHINode *InnerInductionVar;
SmallVector<PHINode *, 8> Inductions;
SmallVector<PHINode *, 8> Reductions;
if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
- DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
+ LLVM_DEBUG(
+ dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
InnerLoop->getStartLoc(),
@@ -792,8 +783,9 @@ bool LoopInterchangeLegality::currentLimitations() {
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
- DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
- << "Failed to interchange due to current limitation\n");
+ LLVM_DEBUG(
+ dbgs() << "We currently only support loops with 1 induction variable."
+ << " Failed to interchange due to current limitation\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
InnerLoop->getStartLoc(),
@@ -809,8 +801,9 @@ bool LoopInterchangeLegality::currentLimitations() {
InnerInductionVar = Inductions.pop_back_val();
Reductions.clear();
if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
- DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
+ LLVM_DEBUG(
+ dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
OuterLoop->getStartLoc(),
@@ -824,8 +817,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// Outer loop cannot have reduction because then loops will not be tightly
// nested.
if (!Reductions.empty()) {
- DEBUG(dbgs() << "Outer loops with reductions are not supported "
- << "currently.\n");
+ LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported "
+ << "currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
OuterLoop->getStartLoc(),
@@ -837,8 +830,8 @@ bool LoopInterchangeLegality::currentLimitations() {
}
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
- DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
- << "supported currently.\n");
+ LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variable are not "
+ << "supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
OuterLoop->getStartLoc(),
@@ -851,7 +844,7 @@ bool LoopInterchangeLegality::currentLimitations() {
// TODO: Triangular loops are not handled for now.
if (!isLoopStructureUnderstood(InnerInductionVar)) {
- DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
InnerLoop->getStartLoc(),
@@ -862,23 +855,10 @@ bool LoopInterchangeLegality::currentLimitations() {
}
// TODO: We only handle LCSSA PHI's corresponding to reduction for now.
- BasicBlock *LoopExitBlock =
- getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
- DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with LCSSA PHIs can be interchange "
- "currently.";
- });
- return true;
- }
-
- LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
- DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+ BasicBlock *InnerExit = InnerLoop->getExitBlock();
+ if (!containsSafePHI(InnerExit, false)) {
+ LLVM_DEBUG(
+ dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
InnerLoop->getStartLoc(),
@@ -908,8 +888,9 @@ bool LoopInterchangeLegality::currentLimitations() {
dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
if (!InnerIndexVarInc) {
- DEBUG(dbgs() << "Did not find an instruction to increment the induction "
- << "variable.\n");
+ LLVM_DEBUG(
+ dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
InnerLoop->getStartLoc(),
@@ -924,7 +905,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// instruction.
bool FoundInduction = false;
- for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) {
+ for (const Instruction &I :
+ llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
isa<ZExtInst>(I))
continue;
@@ -932,8 +914,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// We found an instruction. If this is not induction variable then it is not
// safe to split this loop latch.
if (!I.isIdenticalTo(InnerIndexVarInc)) {
- DEBUG(dbgs() << "Found unsupported instructions between induction "
- << "variable increment and branch.\n");
+ LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(
DEBUG_TYPE, "UnsupportedInsBetweenInduction",
@@ -950,7 +932,7 @@ bool LoopInterchangeLegality::currentLimitations() {
// The loop latch ended and we didn't find the induction variable return as
// current limitation.
if (!FoundInduction) {
- DEBUG(dbgs() << "Did not find the induction variable.\n");
+ LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
InnerLoop->getStartLoc(),
@@ -962,13 +944,50 @@ bool LoopInterchangeLegality::currentLimitations() {
return false;
}
+// We currently support LCSSA PHI nodes in the outer loop exit, if their
+// incoming values do not come from the outer loop latch or if the
+// outer loop latch has a single predecessor. In that case, the value will
+// be available if both the inner and outer loop conditions are true, which
+// will still be true after interchanging. If we have multiple predecessors,
+// that may not be the case, e.g. because the outer loop latch may be executed
+// if the inner loop is not executed.
+static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
+ for (PHINode &PHI : LoopNestExit->phis()) {
+ // FIXME: We currently are not able to detect floating point reductions
+ // and have to use floating point PHIs as a proxy to prevent
+ // interchanging in the presence of floating point reductions.
+ if (PHI.getType()->isFloatingPointTy())
+ return false;
+ for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+ Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+ if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+ continue;
+
+ // The incoming value is defined in the outer loop latch. Currently we
+ // only support that in case the outer loop latch has a single predecessor.
+ // This guarantees that the outer loop latch is executed if and only if
+ // the inner loop is executed (because tightlyNested() guarantees that the
+ // outer loop header only branches to the inner loop or the outer loop
+ // latch).
+ // FIXME: We could weaken this logic and allow multiple predecessors,
+ // if the values are produced outside the loop latch. We would need
+ // additional logic to update the PHI nodes in the exit block as
+ // well.
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+ return false;
+ }
+ }
+ return true;
+}
+
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix) {
if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
- DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId
- << " due to dependence\n");
+ LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
InnerLoop->getStartLoc(),
@@ -977,16 +996,23 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
});
return false;
}
-
// Check if outer and inner loop contain legal instructions only.
for (auto *BB : OuterLoop->blocks())
- for (Instruction &I : *BB)
+ for (Instruction &I : BB->instructionsWithoutDebug())
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// readnone functions do not prevent interchanging.
if (CI->doesNotReadMemory())
continue;
- DEBUG(dbgs() << "Loops with call instructions cannot be interchanged "
- << "safely.");
+ LLVM_DEBUG(
+ dbgs() << "Loops with call instructions cannot be interchanged "
+ << "safely.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
+ CI->getDebugLoc(),
+ CI->getParent())
+ << "Cannot interchange loops due to call instruction.";
+ });
+
return false;
}
@@ -1015,13 +1041,13 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
// TODO: The loops could not be interchanged due to current limitations in the
// transform module.
if (currentLimitations()) {
- DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
return false;
}
// Check if the loops are tightly nested.
if (!tightlyNested(OuterLoop, InnerLoop)) {
- DEBUG(dbgs() << "Loops not tightly nested\n");
+ LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
InnerLoop->getStartLoc(),
@@ -1032,6 +1058,17 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return false;
}
+ if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
return true;
}
@@ -1100,7 +1137,8 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
}
// If outer loop has dependence and inner loop is loop independent then it is
// profitable to interchange to enable parallelism.
- return true;
+ // If there are no dependences, interchanging will not improve anything.
+ return !DepMatrix.empty();
}
bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
@@ -1115,7 +1153,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
// of induction variables in the instruction and allows reordering if number
// of bad orders is more than good.
int Cost = getInstrOrderCost();
- DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
if (Cost < -LoopInterchangeCostThreshold)
return true;
@@ -1138,33 +1176,88 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
Loop *InnerLoop) {
- for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
- ++I) {
- if (*I == InnerLoop) {
- OuterLoop->removeChildLoop(I);
+ for (Loop *L : *OuterLoop)
+ if (L == InnerLoop) {
+ OuterLoop->removeChildLoop(L);
return;
}
- }
llvm_unreachable("Couldn't find loop");
}
-void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
- Loop *OuterLoop) {
+/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
+/// new inner and outer loop after interchanging: NewInner is the original
+/// outer loop and NewOuter is the original inner loop.
+///
+/// Before interchanging, we have the following structure
+/// Outer preheader
+// Outer header
+// Inner preheader
+// Inner header
+// Inner body
+// Inner latch
+// outer bbs
+// Outer latch
+//
+// After interchanging:
+// Inner preheader
+// Inner header
+// Outer preheader
+// Outer header
+// Inner body
+// outer bbs
+// Outer latch
+// Inner latch
+void LoopInterchangeTransform::restructureLoops(
+ Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader) {
Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ // The original inner loop preheader moves from the new inner loop to
+ // the parent loop, if there is one.
+ NewInner->removeBlockFromLoop(OrigInnerPreHeader);
+ LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
+
+ // Switch the loop levels.
if (OuterLoopParent) {
// Remove the loop from its parent loop.
- removeChildLoop(OuterLoopParent, OuterLoop);
- removeChildLoop(OuterLoop, InnerLoop);
- OuterLoopParent->addChildLoop(InnerLoop);
+ removeChildLoop(OuterLoopParent, NewInner);
+ removeChildLoop(NewInner, NewOuter);
+ OuterLoopParent->addChildLoop(NewOuter);
} else {
- removeChildLoop(OuterLoop, InnerLoop);
- LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ removeChildLoop(NewInner, NewOuter);
+ LI->changeTopLevelLoop(NewInner, NewOuter);
+ }
+ while (!NewOuter->empty())
+ NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
+ NewOuter->addChildLoop(NewInner);
+
+ // BBs from the original inner loop.
+ SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
+
+ // Add BBs from the original outer loop to the original inner loop (excluding
+ // BBs already in inner loop)
+ for (BasicBlock *BB : NewInner->blocks())
+ if (LI->getLoopFor(BB) == NewInner)
+ NewOuter->addBlockEntry(BB);
+
+ // Now remove inner loop header and latch from the new inner loop and move
+ // other BBs (the loop body) to the new inner loop.
+ BasicBlock *OuterHeader = NewOuter->getHeader();
+ BasicBlock *OuterLatch = NewOuter->getLoopLatch();
+ for (BasicBlock *BB : OrigInnerBBs) {
+ // Nothing will change for BBs in child loops.
+ if (LI->getLoopFor(BB) != NewOuter)
+ continue;
+ // Remove the new outer loop header and latch from the new inner loop.
+ if (BB == OuterHeader || BB == OuterLatch)
+ NewInner->removeBlockFromLoop(BB);
+ else
+ LI->changeLoopFor(BB, NewInner);
}
- while (!InnerLoop->empty())
- OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
-
- InnerLoop->addChildLoop(OuterLoop);
+ // The preheader of the original outer loop becomes part of the new
+ // outer loop.
+ NewOuter->addBlockEntry(OrigOuterPreHeader);
+ LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
}
bool LoopInterchangeTransform::transform() {
@@ -1173,10 +1266,10 @@ bool LoopInterchangeTransform::transform() {
if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n");
PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
if (!InductionPHI) {
- DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
return false;
}
@@ -1185,8 +1278,7 @@ bool LoopInterchangeTransform::transform() {
else
InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
- // Ensure that InductionPHI is the first Phi node as required by
- // splitInnerLoopHeader
+ // Ensure that InductionPHI is the first Phi node.
if (&InductionPHI->getParent()->front() != InductionPHI)
InductionPHI->moveBefore(&InductionPHI->getParent()->front());
@@ -1194,20 +1286,20 @@ bool LoopInterchangeTransform::transform() {
// incremented/decremented.
// TODO: This splitting logic may not work always. Fix this.
splitInnerLoopLatch(InnerIndexVar);
- DEBUG(dbgs() << "splitInnerLoopLatch done\n");
+ LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n");
// Splits the inner loops phi nodes out into a separate basic block.
- splitInnerLoopHeader();
- DEBUG(dbgs() << "splitInnerLoopHeader done\n");
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+ LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
}
Transformed |= adjustLoopLinks();
if (!Transformed) {
- DEBUG(dbgs() << "adjustLoopLinks failed\n");
+ LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
return false;
}
- restructureLoops(InnerLoop, OuterLoop);
return true;
}
@@ -1217,38 +1309,6 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
}
-void LoopInterchangeTransform::splitInnerLoopHeader() {
- // Split the inner loop header out. Here make sure that the reduction PHI's
- // stay in the innerloop body.
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- if (InnerLoopHasReduction) {
- // Note: The induction PHI must be the first PHI for this to work
- BasicBlock *New = InnerLoopHeader->splitBasicBlock(
- ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
- if (LI)
- if (Loop *L = LI->getLoopFor(InnerLoopHeader))
- L->addBasicBlockToLoop(New, *LI);
-
- // Adjust Reduction PHI's in the block.
- SmallVector<PHINode *, 8> PHIVec;
- for (auto I = New->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = dyn_cast<PHINode>(I);
- Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
- PHI->replaceAllUsesWith(V);
- PHIVec.push_back((PHI));
- }
- for (PHINode *P : PHIVec) {
- P->eraseFromParent();
- }
- } else {
- SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
- }
-
- DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
- "InnerLoopHeader\n");
-}
-
/// \brief Move all instructions except the terminator from FromBB right before
/// InsertBefore
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
@@ -1262,18 +1322,40 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
BasicBlock *OldPred,
BasicBlock *NewPred) {
- for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = cast<PHINode>(I);
- unsigned Num = PHI->getNumIncomingValues();
+ for (PHINode &PHI : CurrBlock->phis()) {
+ unsigned Num = PHI.getNumIncomingValues();
for (unsigned i = 0; i < Num; ++i) {
- if (PHI->getIncomingBlock(i) == OldPred)
- PHI->setIncomingBlock(i, NewPred);
+ if (PHI.getIncomingBlock(i) == OldPred)
+ PHI.setIncomingBlock(i, NewPred);
+ }
+ }
+}
+
+/// Update BI to jump to NewBB instead of OldBB. Records updates to
+/// the dominator tree in DTUpdates, if DT should be preserved.
+static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ std::vector<DominatorTree::UpdateType> &DTUpdates) {
+ assert(llvm::count_if(BI->successors(),
+ [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 &&
+ "BI must jump to OldBB at most once.");
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) {
+ if (BI->getSuccessor(i) == OldBB) {
+ BI->setSuccessor(i, NewBB);
+
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
+ break;
}
}
}
bool LoopInterchangeTransform::adjustLoopBranches() {
- DEBUG(dbgs() << "adjustLoopBranches called\n");
+ LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
+ std::vector<DominatorTree::UpdateType> DTUpdates;
+
// Adjust the loop preheader
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
@@ -1313,27 +1395,18 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
return false;
// Adjust Loop Preheader and headers
-
- unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
- OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
- }
-
- NumSucc = OuterLoopHeaderBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
- OuterLoopHeaderBI->setSuccessor(i, LoopExit);
- else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
- OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
- }
+ updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
+ InnerLoopPreHeader, DTUpdates);
+ updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates);
+ updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
+ InnerLoopHeaderSuccessor, DTUpdates);
// Adjust reduction PHI's now that the incoming block has changed.
updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
OuterLoopHeader);
- BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
- InnerLoopHeaderBI->eraseFromParent();
+ updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
+ OuterLoopPreHeader, DTUpdates);
// -------------Adjust loop latches-----------
if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
@@ -1341,19 +1414,15 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
else
InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
- NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
- InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
- }
+ updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
+ InnerLoopLatchSuccessor, DTUpdates);
// Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
// the value and remove this PHI node from inner loop.
SmallVector<PHINode *, 8> LcssaVec;
- for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
- PHINode *LcssaPhi = cast<PHINode>(I);
- LcssaVec.push_back(LcssaPhi);
- }
+ for (PHINode &P : InnerLoopLatchSuccessor->phis())
+ LcssaVec.push_back(&P);
+
for (PHINode *P : LcssaVec) {
Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
P->replaceAllUsesWith(Incoming);
@@ -1365,19 +1434,52 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
else
OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
- if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
- InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
- else
- InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+ updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
+ OuterLoopLatchSuccessor, DTUpdates);
+ updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
+ DTUpdates);
updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
- if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
- OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
- } else {
- OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ DT->applyUpdates(DTUpdates);
+ restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
+ OuterLoopPreHeader);
+
+ // Now update the reduction PHIs in the inner and outer loop headers.
+ SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
+ for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
+ InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
+ OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+
+ for (PHINode *PHI : OuterLoopPHIs)
+ PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
+
+ // Move the PHI nodes from the inner loop header to the outer loop header.
+ // We have to deal with one kind of PHI nodes:
+ // We have to deal with one kind of PHI node:
+ // We only have to move the PHI node and update the incoming blocks.
+ for (PHINode *PHI : InnerLoopPHIs) {
+ PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
+ for (BasicBlock *InBB : PHI->blocks()) {
+ if (InnerLoop->contains(InBB))
+ continue;
+
+ assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) &&
+ "Unexpected incoming PHI node, reductions in outer loop are not "
+ "supported yet");
+ PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB));
+ PHI->eraseFromParent();
+ break;
+ }
}
+ // Update the incoming blocks for moved PHI nodes.
+ updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
+ updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch);
+ updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader);
+ updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch);
+
return true;
}
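
The branch and LoopInfo surgery above implements classic loop interchange; as a rough illustration (a hand-written C++ sketch, not taken from this patch, with assumed names and array sizes), the source-level effect is:

  // Before interchange: the inner loop walks down a column of A, touching a
  // new cache line on almost every iteration.
  void columnMajorSum(float A[1024][1024], float B[1024][1024]) {
    for (unsigned J = 0; J < 1024; ++J)      // outer loop, induction J
      for (unsigned I = 0; I < 1024; ++I)    // inner loop, induction I
        A[I][J] += B[I][J];
  }

  // After interchange: the two loop levels swap, the innermost accesses become
  // consecutive in memory, and the loop body is unchanged.
  void rowMajorSum(float A[1024][1024], float B[1024][1024]) {
    for (unsigned I = 0; I < 1024; ++I)
      for (unsigned J = 0; J < 1024; ++J)
        A[I][J] += B[I][J];
  }
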
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index dfa5ec1f354d..19bd9ebcc15b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -25,7 +25,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -52,6 +52,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <algorithm>
#include <cassert>
@@ -79,7 +80,7 @@ STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
namespace {
-/// \brief Represent a store-to-forwarding candidate.
+/// Represent a store-to-forwarding candidate.
struct StoreToLoadForwardingCandidate {
LoadInst *Load;
StoreInst *Store;
@@ -87,7 +88,7 @@ struct StoreToLoadForwardingCandidate {
StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
: Load(Load), Store(Store) {}
- /// \brief Return true if the dependence from the store to the load has a
+ /// Return true if the dependence from the store to the load has a
/// distance of one. E.g. A[i+1] = A[i]
bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
Loop *L) const {
@@ -136,7 +137,7 @@ struct StoreToLoadForwardingCandidate {
} // end anonymous namespace
-/// \brief Check if the store dominates all latches, so as long as there is no
+/// Check if the store dominates all latches, so as long as there is no
/// intervening store this value will be loaded in the next iteration.
static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
DominatorTree *DT) {
@@ -147,21 +148,21 @@ static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
});
}
-/// \brief Return true if the load is not executed on all paths in the loop.
+/// Return true if the load is not executed on all paths in the loop.
static bool isLoadConditional(LoadInst *Load, Loop *L) {
return Load->getParent() != L->getHeader();
}
namespace {
-/// \brief The per-loop class that does most of the work.
+/// The per-loop class that does most of the work.
class LoadEliminationForLoop {
public:
LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
DominatorTree *DT)
: L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
- /// \brief Look through the loop-carried and loop-independent dependences in
+ /// Look through the loop-carried and loop-independent dependences in
/// this loop and find store->load dependences.
///
/// Note that no candidate is returned if LAA has failed to analyze the loop
@@ -178,7 +179,7 @@ public:
// forward and backward dependences qualify. Disqualify loads that have
// other unknown dependences.
- SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+ SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
for (const auto &Dep : *Deps) {
Instruction *Source = Dep.getSource(LAI);
@@ -222,14 +223,14 @@ public:
return Candidates;
}
- /// \brief Return the index of the instruction according to program order.
+ /// Return the index of the instruction according to program order.
unsigned getInstrIndex(Instruction *Inst) {
auto I = InstOrder.find(Inst);
assert(I != InstOrder.end() && "No index for instruction");
return I->second;
}
- /// \brief If a load has multiple candidates associated (i.e. different
+ /// If a load has multiple candidates associated (i.e. different
/// stores), it means that it could be forwarding from multiple stores
/// depending on control flow. Remove these candidates.
///
@@ -284,22 +285,24 @@ public:
Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
if (LoadToSingleCand[Cand.Load] != &Cand) {
- DEBUG(dbgs() << "Removing from candidates: \n" << Cand
- << " The load may have multiple stores forwarding to "
- << "it\n");
+ LLVM_DEBUG(
+ dbgs() << "Removing from candidates: \n"
+ << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
return true;
}
return false;
});
}
- /// \brief Given two pointers operations by their RuntimePointerChecking
+ /// Given two pointers operations by their RuntimePointerChecking
/// indices, return true if they require an alias check.
///
/// We need a check if one is a pointer for a candidate load and the other is
/// a pointer for a possibly intervening store.
bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
- const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+ const SmallPtrSet<Value *, 4> &PtrsWrittenOnFwdingPath,
const std::set<Value *> &CandLoadPtrs) {
Value *Ptr1 =
LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
@@ -309,11 +312,11 @@ public:
(PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
}
- /// \brief Return pointers that are possibly written to on the path from a
+ /// Return pointers that are possibly written to on the path from a
/// forwarding store to a load.
///
/// These pointers need to be alias-checked against the forwarding candidates.
- SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
// From FirstStore to LastLoad neither of the elimination candidate loads
// should overlap with any of the stores.
@@ -351,7 +354,7 @@ public:
// We're looking for stores after the first forwarding store until the end
// of the loop, then from the beginning of the loop until the last
// forwarded-to load. Collect the pointer for the stores.
- SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
auto InsertStorePtr = [&](Instruction *I) {
if (auto *S = dyn_cast<StoreInst>(I))
@@ -366,16 +369,16 @@ public:
return PtrsWrittenOnFwdingPath;
}
- /// \brief Determine the pointer alias checks to prove that there are no
+ /// Determine the pointer alias checks to prove that there are no
/// intervening stores.
SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
- SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
findPointersWrittenOnForwardingPath(Candidates);
// Collect the pointers of the candidate loads.
- // FIXME: SmallSet does not work with std::inserter.
+ // FIXME: SmallPtrSet does not work with std::inserter.
std::set<Value *> CandLoadPtrs;
transform(Candidates,
std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
@@ -394,13 +397,14 @@ public:
return false;
});
- DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
- DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
+ << "):\n");
+ LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
return Checks;
}
- /// \brief Perform the transformation for a candidate.
+ /// Perform the transformation for a candidate.
void
propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
SCEVExpander &SEE) {
@@ -436,11 +440,11 @@ public:
Cand.Load->replaceAllUsesWith(PHI);
}
- /// \brief Top-level driver for each loop: find store->load forwarding
+ /// Top-level driver for each loop: find store->load forwarding
/// candidates, add run-time checks and perform transformation.
bool processLoop() {
- DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
// Look for store-to-load forwarding cases across the
// backedge. E.g.:
@@ -479,7 +483,7 @@ public:
SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
unsigned NumForwarding = 0;
for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
- DEBUG(dbgs() << "Candidate " << Cand);
+ LLVM_DEBUG(dbgs() << "Candidate " << Cand);
// Make sure that the stored values is available everywhere in the loop in
// the next iteration.
@@ -498,9 +502,10 @@ public:
continue;
++NumForwarding;
- DEBUG(dbgs()
- << NumForwarding
- << ". Valid store-to-load forwarding across the loop backedge\n");
+ LLVM_DEBUG(
+ dbgs()
+ << NumForwarding
+ << ". Valid store-to-load forwarding across the loop backedge\n");
Candidates.push_back(Cand);
}
if (Candidates.empty())
@@ -513,25 +518,26 @@ public:
// Too many checks are likely to outweigh the benefits of forwarding.
if (Checks.size() > Candidates.size() * CheckPerElim) {
- DEBUG(dbgs() << "Too many run-time checks needed.\n");
+ LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
return false;
}
if (LAI.getPSE().getUnionPredicate().getComplexity() >
LoadElimSCEVCheckThreshold) {
- DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+ LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
return false;
}
if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
if (L->getHeader()->getParent()->optForSize()) {
- DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing "
- "for size.\n");
+ LLVM_DEBUG(
+ dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
return false;
}
if (!L->isLoopSimplifyForm()) {
- DEBUG(dbgs() << "Loop is not is loop-simplify form");
+ LLVM_DEBUG(dbgs() << "Loop is not in loop-simplify form");
return false;
}
@@ -558,7 +564,7 @@ public:
private:
Loop *L;
- /// \brief Maps the load/store instructions to their index according to
+ /// Maps the load/store instructions to their index according to
/// program order.
DenseMap<Instruction *, unsigned> InstOrder;
@@ -599,7 +605,7 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
namespace {
-/// \brief The pass. Most of the work is delegated to the per-loop
+/// The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
public:
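
The store-to-load forwarding this pass performs is easiest to picture on a one-element loop-carried dependence; a hedged C++ sketch (names and preconditions are assumptions, not taken from the pass):

  // Before: each iteration re-loads the value the previous iteration stored
  // (dependence distance of one, as in the A[i+1] = A[i] comment above).
  void forwardThroughMemory(float *A, const float *B, int N) {
    for (int I = 0; I < N; ++I)
      A[I + 1] = A[I] + B[I];
  }

  // After, roughly what the rewrite achieves: the forwarded value travels in a
  // scalar (standing in for the PHI the pass inserts) and the in-loop load is
  // gone. Assumes A and B do not alias and N >= 1; the pass would establish
  // this with run-time checks and loop versioning.
  void forwardThroughScalar(float *A, const float *B, int N) {
    float Carried = A[0];          // initial load, hoisted to the preheader
    for (int I = 0; I < N; ++I) {
      Carried += B[I];
      A[I + 1] = Carried;
    }
  }
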
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 2e4c7b19e476..561ceea1d880 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -155,7 +155,7 @@
// When S = -1 (i.e. reverse iterating loop), the transformation is supported
// when:
// * The loop has a single latch with the condition of the form:
-// B(X) = X <pred> latchLimit, where <pred> is u> or s>.
+// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=.
// * The guard condition is of the form
// G(X) = X - 1 u< guardLimit
//
@@ -171,9 +171,14 @@
// guardStart u< guardLimit && latchLimit u>= 1.
// Similarly for sgt condition the widened condition is:
// guardStart u< guardLimit && latchLimit s>= 1.
+// For uge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit u> 1.
+// For sge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s> 1.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -198,6 +203,20 @@ static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ SkipProfitabilityChecks("loop-predication-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+// This is the scale factor for the latch probability. We use this during
+// profitability analysis to find other exiting blocks that have a much higher
+// probability of exiting the loop instead of exiting via the latch.
+// This value should be greater than 1 for a sane profitability check.
+static cl::opt<float> LatchExitProbabilityScale(
+ "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0),
+ cl::desc("scale factor for the latch probability. Value should be greater "
+ "than 1. Lower values are ignored"));
+
namespace {
class LoopPredication {
/// Represents an induction variable check:
@@ -217,6 +236,7 @@ class LoopPredication {
};
ScalarEvolution *SE;
+ BranchProbabilityInfo *BPI;
Loop *L;
const DataLayout *DL;
@@ -250,6 +270,12 @@ class LoopPredication {
IRBuilder<> &Builder);
bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+ // If the loop always exits through another block in the loop, we should not
+ // predicate based on the latch check. For example, the latch check can be a
+ // very coarse grained check and there can be more fine grained exit checks
+ // within the loop. We identify such unprofitable loops through BPI.
+ bool isLoopProfitableToPredicate();
+
// When the IV type is wider than the range operand type, we can still do loop
// predication, by generating SCEVs for the range and latch that are of the
// same type. We achieve this by generating a SCEV truncate expression for the
@@ -266,8 +292,10 @@ class LoopPredication {
// Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
// so.
Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
+
public:
- LoopPredication(ScalarEvolution *SE) : SE(SE){};
+ LoopPredication(ScalarEvolution *SE, BranchProbabilityInfo *BPI)
+ : SE(SE), BPI(BPI){};
bool runOnLoop(Loop *L);
};
@@ -279,6 +307,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
@@ -286,7 +315,9 @@ public:
if (skipLoop(L))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopPredication LP(SE);
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ LoopPredication LP(SE, &BPI);
return LP.runOnLoop(L);
}
};
@@ -296,6 +327,7 @@ char LoopPredicationLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
"Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
"Loop predication", false, false)
@@ -307,7 +339,11 @@ Pass *llvm::createLoopPredicationPass() {
PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
- LoopPredication LP(&AR.SE);
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+ auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
+ LoopPredication LP(&AR.SE, BPI);
if (!LP.runOnLoop(&L))
return PreservedAnalyses::all();
@@ -375,11 +411,11 @@ LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
if (!NewLatchCheck.IV)
return None;
NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
- DEBUG(dbgs() << "IV of type: " << *LatchType
- << "can be represented as range check type:" << *RangeCheckType
- << "\n");
- DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
- DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:"
+ << *RangeCheckType << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
return NewLatchCheck;
}
@@ -412,30 +448,15 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
!CanExpand(LatchLimit) || !CanExpand(RHS)) {
- DEBUG(dbgs() << "Can't expand limit check!\n");
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
- ICmpInst::Predicate LimitCheckPred;
- switch (LatchCheck.Pred) {
- case ICmpInst::ICMP_ULT:
- LimitCheckPred = ICmpInst::ICMP_ULE;
- break;
- case ICmpInst::ICMP_ULE:
- LimitCheckPred = ICmpInst::ICMP_ULT;
- break;
- case ICmpInst::ICMP_SLT:
- LimitCheckPred = ICmpInst::ICMP_SLE;
- break;
- case ICmpInst::ICMP_SLE:
- LimitCheckPred = ICmpInst::ICMP_SLT;
- break;
- default:
- llvm_unreachable("Unsupported loop latch!");
- }
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
- DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
- DEBUG(dbgs() << "RHS: " << *RHS << "\n");
- DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+ LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+ LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+ LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
Instruction *InsertAt = Preheader->getTerminator();
auto *LimitCheck =
@@ -454,16 +475,16 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
const SCEV *LatchLimit = LatchCheck.Limit;
if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
!CanExpand(LatchLimit)) {
- DEBUG(dbgs() << "Can't expand limit check!\n");
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
// The decrement of the latch check IV should be the same as the
// rangeCheckIV.
auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
if (RangeCheck.IV != PostDecLatchCheckIV) {
- DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
- << *PostDecLatchCheckIV
- << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
+ << *PostDecLatchCheckIV
+ << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
return None;
}
@@ -472,9 +493,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
// latchLimit <pred> 1.
// See the header comment for reasoning of the checks.
Instruction *InsertAt = Preheader->getTerminator();
- auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred)
- ? ICmpInst::ICMP_SGE
- : ICmpInst::ICMP_UGE;
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT,
GuardStart, GuardLimit, InsertAt);
auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit,
@@ -488,8 +508,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
SCEVExpander &Expander,
IRBuilder<> &Builder) {
- DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
- DEBUG(ICI->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ LLVM_DEBUG(ICI->dump());
// parseLoopStructure guarantees that the latch condition is:
// ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
@@ -497,34 +517,34 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
// i u< guardLimit
auto RangeCheck = parseLoopICmp(ICI);
if (!RangeCheck) {
- DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
return None;
}
- DEBUG(dbgs() << "Guard check:\n");
- DEBUG(RangeCheck->dump());
+ LLVM_DEBUG(dbgs() << "Guard check:\n");
+ LLVM_DEBUG(RangeCheck->dump());
if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
- DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
- << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported range check predicate("
+ << RangeCheck->Pred << ")!\n");
return None;
}
auto *RangeCheckIV = RangeCheck->IV;
if (!RangeCheckIV->isAffine()) {
- DEBUG(dbgs() << "Range check IV is not affine!\n");
+ LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n");
return None;
}
auto *Step = RangeCheckIV->getStepRecurrence(*SE);
// We cannot just compare with latch IV step because the latch and range IVs
// may have different types.
if (!isSupportedStep(Step)) {
- DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+ LLVM_DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
return None;
}
auto *Ty = RangeCheckIV->getType();
auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
if (!CurrLatchCheckOpt) {
- DEBUG(dbgs() << "Failed to generate a loop latch check "
- "corresponding to range type: "
- << *Ty << "\n");
+ LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
return None;
}
@@ -535,7 +555,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
"Range and latch steps should be of same type!");
if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
- DEBUG(dbgs() << "Range and latch have different step values!\n");
+ LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n");
return None;
}
@@ -551,14 +571,14 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
SCEVExpander &Expander) {
- DEBUG(dbgs() << "Processing guard:\n");
- DEBUG(Guard->dump());
+ LLVM_DEBUG(dbgs() << "Processing guard:\n");
+ LLVM_DEBUG(Guard->dump());
IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
// The guard condition is expected to be in form of:
// cond1 && cond2 && cond3 ...
- // Iterate over subconditions looking for for icmp conditions which can be
+ // Iterate over subconditions looking for icmp conditions which can be
// widened across loop iterations. Widening these conditions remember the
// resulting list of subconditions in Checks vector.
SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
@@ -605,7 +625,7 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
LastCheck = Builder.CreateAnd(LastCheck, Check);
Guard->setOperand(0, LastCheck);
- DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
return true;
}
@@ -614,7 +634,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
BasicBlock *LoopLatch = L->getLoopLatch();
if (!LoopLatch) {
- DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+ LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
return None;
}
@@ -625,7 +645,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
if (!match(LoopLatch->getTerminator(),
m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest,
FalseDest))) {
- DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+ LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
return None;
}
assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) &&
@@ -635,20 +655,20 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
auto Result = parseLoopICmp(Pred, LHS, RHS);
if (!Result) {
- DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
return None;
}
// Check affine first, so if it's not we don't try to compute the step
// recurrence.
if (!Result->IV->isAffine()) {
- DEBUG(dbgs() << "The induction variable is not affine!\n");
+ LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n");
return None;
}
auto *Step = Result->IV->getStepRecurrence(*SE);
if (!isSupportedStep(Step)) {
- DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
return None;
}
@@ -658,13 +678,14 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
} else {
assert(Step->isAllOnesValue() && "Step should be -1!");
- return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT;
+ return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE;
}
};
if (IsUnsupportedPredicate(Step, Result->Pred)) {
- DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
- << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
return None;
}
return Result;
@@ -700,11 +721,65 @@ bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
}
+bool LoopPredication::isLoopProfitableToPredicate() {
+ if (SkipProfitabilityChecks || !BPI)
+ return true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 8> ExitEdges;
+ L->getExitEdges(ExitEdges);
+ // If there is only one exiting edge in the loop, it is always profitable to
+ // predicate the loop.
+ if (ExitEdges.size() == 1)
+ return true;
+
+ // Calculate the exiting probabilities of all exiting edges from the loop,
+ // starting with the LatchExitProbability.
+ // Heuristic for profitability: If any of the exiting blocks' probability of
+ // exiting the loop is larger than exiting through the latch block, it's not
+ // profitable to predicate the loop.
+ auto *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "Should have a single latch at this point!");
+ auto *LatchTerm = LatchBlock->getTerminator();
+ assert(LatchTerm->getNumSuccessors() == 2 &&
+ "expected to be an exiting block with 2 succs!");
+ unsigned LatchBrExitIdx =
+ LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
+ BranchProbability LatchExitProbability =
+ BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
+
+ // Protect against degenerate inputs provided by the user. Providing a value
+ // less than one can invert the definition of profitable loop predication.
+ float ScaleFactor = LatchExitProbabilityScale;
+ if (ScaleFactor < 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Ignored user setting for loop-predication-latch-probability-scale: "
+ << LatchExitProbabilityScale << "\n");
+ LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
+ ScaleFactor = 1.0;
+ }
+ const auto LatchProbabilityThreshold =
+ LatchExitProbability * ScaleFactor;
+
+ for (const auto &ExitEdge : ExitEdges) {
+ BranchProbability ExitingBlockProbability =
+ BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
+ // Some exiting edge has higher probability than the latch exiting edge.
+ // No longer profitable to predicate.
+ if (ExitingBlockProbability > LatchProbabilityThreshold)
+ return false;
+ }
+ // Using BPI, we have concluded that the most probable way to exit from the
+ // loop is through the latch (or there's no profile information and all
+ // exits are equally likely).
+ return true;
+}
+
bool LoopPredication::runOnLoop(Loop *Loop) {
L = Loop;
- DEBUG(dbgs() << "Analyzing ");
- DEBUG(L->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing ");
+ LLVM_DEBUG(L->dump());
Module *M = L->getHeader()->getModule();
@@ -725,9 +800,13 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
return false;
LatchCheck = *LatchCheckOpt;
- DEBUG(dbgs() << "Latch check:\n");
- DEBUG(LatchCheck.dump());
+ LLVM_DEBUG(dbgs() << "Latch check:\n");
+ LLVM_DEBUG(LatchCheck.dump());
+ if (!isLoopProfitableToPredicate()) {
+ LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n");
+ return false;
+ }
// Collect all the guards into a vector and process later, so as not
// to invalidate the instruction iterator.
SmallVector<IntrinsicInst *, 4> Guards;
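
The reverse-iterating case described in the header comment above can be pictured with a small sketch (hand-written C++; guard() stands in for the llvm.experimental.guard intrinsic, and the names and bounds are assumptions, not from the pass):

  extern void guard(bool Cond);   // stand-in for llvm.experimental.guard

  // Before predication: the range check runs on every iteration of a
  // count-down loop (latch condition X u> Limit, guard X - 1 u< Len).
  void beforePredication(int *A, unsigned Start, unsigned Limit, unsigned Len) {
    for (unsigned X = Start; X > Limit; --X) {
      guard(X - 1 < Len);
      A[X - 1] = 0;
    }
  }

  // After predication, following the u> case documented above
  // (guardStart u< guardLimit && latchLimit u>= 1): one widened check in the
  // preheader covers every iteration and the in-loop guard disappears.
  void afterPredication(int *A, unsigned Start, unsigned Limit, unsigned Len) {
    guard(Start - 1 < Len && Limit >= 1);
    for (unsigned X = Start; X > Limit; --X)
      A[X - 1] = 0;
  }
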
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index d1a54b877950..9a99e5925572 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -17,7 +17,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -28,6 +28,7 @@
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -51,8 +52,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
#include <cstddef>
@@ -69,10 +70,6 @@ using namespace llvm;
STATISTIC(NumRerolledLoops, "Number of rerolled loops");
static cl::opt<unsigned>
-MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
- cl::desc("The maximum increment for loop rerolling"));
-
-static cl::opt<unsigned>
NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
cl::Hidden,
cl::desc("The maximum number of failures to tolerate"
@@ -188,7 +185,7 @@ namespace {
bool PreserveLCSSA;
using SmallInstructionVector = SmallVector<Instruction *, 16>;
- using SmallInstructionSet = SmallSet<Instruction *, 16>;
+ using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
// Map between induction variable and its increment
DenseMap<Instruction *, int64_t> IVToIncMap;
@@ -397,8 +394,8 @@ namespace {
/// Stage 3: Assuming validate() returned true, perform the
/// replacement.
- /// @param IterCount The maximum iteration count of L.
- void replace(const SCEV *IterCount);
+ /// @param BackedgeTakenCount The backedge-taken count of L.
+ void replace(const SCEV *BackedgeTakenCount);
protected:
using UsesTy = MapVector<Instruction *, BitVector>;
@@ -428,8 +425,7 @@ namespace {
bool instrDependsOn(Instruction *I,
UsesTy::iterator Start,
UsesTy::iterator End);
- void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount);
- void updateNonLoopCtrlIncr();
+ void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
LoopReroll *Parent;
@@ -482,8 +478,8 @@ namespace {
void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
void collectPossibleReductions(Loop *L,
ReductionTracker &Reductions);
- bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
- ReductionTracker &Reductions);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
};
} // end anonymous namespace
@@ -510,48 +506,6 @@ static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
return false;
}
-static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE,
- const SCEV *SCEVExpr,
- Instruction &IV) {
- const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr);
-
- // If StepRecurrence of a SCEVExpr is a constant (c1 * c2, c2 = sizeof(ptr)),
- // Return c1.
- if (!MulSCEV && IV.getType()->isPointerTy())
- if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) {
- const PointerType *PTy = cast<PointerType>(IV.getType());
- Type *ElTy = PTy->getElementType();
- const SCEV *SizeOfExpr =
- SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy);
- if (IncSCEV->getValue()->getValue().isNegative()) {
- const SCEV *NewSCEV =
- SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr);
- return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV));
- } else {
- return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr));
- }
- }
-
- if (!MulSCEV)
- return nullptr;
-
- // If StepRecurrence of a SCEVExpr is a c * sizeof(x), where c is constant,
- // Return c.
- const SCEVConstant *CIncSCEV = nullptr;
- for (const SCEV *Operand : MulSCEV->operands()) {
- if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) {
- CIncSCEV = Constant;
- } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) {
- Type *AllocTy;
- if (!Unknown->isSizeOf(AllocTy))
- break;
- } else {
- return nullptr;
- }
- }
- return CIncSCEV;
-}
-
// Check if an IV is only used to control the loop. There are two cases:
// 1. It only has one use which is the loop increment, and the increment is
// only used by the comparison and the PHI (could have a sext with nsw in between), and the
@@ -632,25 +586,17 @@ void LoopReroll::collectPossibleIVs(Loop *L,
continue;
if (!PHISCEV->isAffine())
continue;
- const SCEVConstant *IncSCEV = nullptr;
- if (I->getType()->isPointerTy())
- IncSCEV =
- getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I);
- else
- IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
if (IncSCEV) {
- const APInt &AInt = IncSCEV->getValue()->getValue().abs();
- if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
- continue;
IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
- DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
- << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+ << "\n");
if (isLoopControlIV(L, &*I)) {
assert(!LoopControlIV && "Found two loop control only IV");
LoopControlIV = &(*I);
- DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = "
- << *PHISCEV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
+ << " = " << *PHISCEV << "\n");
} else
PossibleIVs.push_back(&*I);
}
@@ -717,8 +663,8 @@ void LoopReroll::collectPossibleReductions(Loop *L,
if (!SLR.valid())
continue;
- DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
- SLR.size() << " chained instructions)\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
+ << SLR.size() << " chained instructions)\n");
Reductions.addSLR(SLR);
}
}
@@ -856,7 +802,8 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
BaseUsers.push_back(II);
continue;
} else {
- DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
+ << "\n");
return false;
}
}
@@ -878,7 +825,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
// away.
if (BaseUsers.size()) {
if (Roots.find(0) != Roots.end()) {
- DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+ LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
return false;
}
Roots[0] = Base;
@@ -894,9 +841,9 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
if (KV.first == 0)
continue;
if (!KV.second->hasNUses(NumBaseUses)) {
- DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
- << "#Base=" << NumBaseUses << ", #Root=" <<
- KV.second->getNumUses() << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses
+ << ", #Root=" << KV.second->getNumUses() << "\n");
return false;
}
}
@@ -1024,13 +971,14 @@ bool LoopReroll::DAGRootTracker::findRoots() {
// Ensure all sets have the same size.
if (RootSets.empty()) {
- DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
return false;
}
for (auto &V : RootSets) {
if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
- DEBUG(dbgs()
- << "LRR: Aborting because not all root sets have the same size\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
return false;
}
}
@@ -1038,13 +986,14 @@ bool LoopReroll::DAGRootTracker::findRoots() {
Scale = RootSets[0].Roots.size() + 1;
if (Scale > IL_MaxRerollIterations) {
- DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
- << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
- << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale
+ << ", #Max=" << IL_MaxRerollIterations << "\n");
return false;
}
- DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
+ << "\n");
return true;
}
@@ -1078,7 +1027,7 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po
// While we're here, check the use sets are the same size.
if (V.size() != VBase.size()) {
- DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
return false;
}
@@ -1235,17 +1184,17 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// set.
for (auto &KV : Uses) {
if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
- DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
- << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ LLVM_DEBUG(
+ dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
return false;
}
}
- DEBUG(
- for (auto &KV : Uses) {
- dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
- }
- );
+ LLVM_DEBUG(for (auto &KV : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ });
for (unsigned Iter = 1; Iter < Scale; ++Iter) {
// In addition to regular aliasing information, we need to look for
@@ -1304,8 +1253,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
if (TryIt == Uses.end() || TryIt == RootIt ||
instrDependsOn(TryIt->first, RootIt, TryIt)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst << "\n");
return false;
}
@@ -1341,8 +1290,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// root instruction, does not also belong to the base set or the set of
// some other root instruction.
if (RootIt->second.count() > 1) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (prev. case overlap)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (prev. case overlap)\n");
return false;
}
@@ -1352,8 +1301,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
if (RootInst->mayReadFromMemory())
for (auto &K : AST) {
if (K.aliasesUnknownInst(RootInst, *AA)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (depends on future store)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst
+ << " (depends on future store)\n");
return false;
}
}
@@ -1366,9 +1316,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
!isSafeToSpeculativelyExecute(BaseInst)) ||
(!isUnorderedLoadStore(RootInst) &&
!isSafeToSpeculativelyExecute(RootInst)))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst <<
- " (side effects prevent reordering)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst
+ << " (side effects prevent reordering)\n");
return false;
}
@@ -1419,8 +1369,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
BaseInst->getOperand(!j) == Op2) {
Swapped = true;
} else {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (operand " << j << ")\n");
+ LLVM_DEBUG(dbgs()
+ << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
return false;
}
}
@@ -1433,8 +1384,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
hasUsesOutsideLoop(BaseInst, L)) ||
(!PossibleRedLastSet.count(RootInst) &&
hasUsesOutsideLoop(RootInst, L))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (uses outside loop)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (uses outside loop)\n");
return false;
}
@@ -1451,20 +1402,32 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
"Mismatched set sizes!");
}
- DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
- *IV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
+ << "\n");
return true;
}
-void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
BasicBlock *Header = L->getHeader();
+
+ // Compute the start and increment for each BaseInst before we start erasing
+ // instructions.
+ SmallVector<const SCEV *, 8> StartExprs;
+ SmallVector<const SCEV *, 8> IncrExprs;
+ for (auto &DRS : RootSets) {
+ const SCEVAddRecExpr *IVSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ StartExprs.push_back(IVSCEV->getStart());
+ IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
+ }
+
// Remove instructions associated with non-base iterations.
for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
J != JE;) {
unsigned I = Uses[&*J].find_first();
if (I > 0 && I < IL_All) {
- DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
J++->eraseFromParent();
continue;
}
@@ -1472,74 +1435,47 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
++J;
}
- bool HasTwoIVs = LoopControlIV && LoopControlIV != IV;
+ // Rewrite each BaseInst using SCEV.
+ for (size_t i = 0, e = RootSets.size(); i != e; ++i)
+ // Insert the new induction variable.
+ replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
- if (HasTwoIVs) {
- updateNonLoopCtrlIncr();
- replaceIV(LoopControlIV, LoopControlIV, IterCount);
- } else
- // We need to create a new induction variable for each different BaseInst.
- for (auto &DRS : RootSets)
- // Insert the new induction variable.
- replaceIV(DRS.BaseInst, IV, IterCount);
+ { // Limit the lifetime of SCEVExpander.
+ BranchInst *BI = cast<BranchInst>(Header->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ auto Zero = SE->getZero(BackedgeTakenCount->getType());
+ auto One = SE->getOne(BackedgeTakenCount->getType());
+ auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
+ Value *NewIV =
+ Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ // FIXME: This arithmetic can overflow.
+ auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
+ auto ScaledTripCount = SE->getMulExpr(
+ TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
+ auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
+ Value *TakenCount =
+ Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
SimplifyInstructionsInBlock(Header, TLI);
DeleteDeadPHIs(Header, TLI);
}
-// For non-loop-control IVs, we only need to update the last increment
-// with right amount, then we are done.
-void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() {
- const SCEV *NewInc = nullptr;
- for (auto *LoopInc : LoopIncs) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc);
- const SCEVConstant *COp = nullptr;
- if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) {
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
- } else {
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0)));
- if (!COp)
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
- }
-
- assert(COp && "Didn't find constant operand of LoopInc!\n");
-
- const APInt &AInt = COp->getValue()->getValue();
- const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale);
- if (AInt.isNegative()) {
- NewInc = SE->getNegativeSCEV(COp);
- NewInc = SE->getUDivExpr(NewInc, ScaleSCEV);
- NewInc = SE->getNegativeSCEV(NewInc);
- } else
- NewInc = SE->getUDivExpr(COp, ScaleSCEV);
-
- LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue());
- }
-}
-
-void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
- Instruction *InstIV,
- const SCEV *IterCount) {
+void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
+ const SCEV *Start,
+ const SCEV *IncrExpr) {
BasicBlock *Header = L->getHeader();
- int64_t Inc = IVToIncMap[InstIV];
- bool NeedNewIV = InstIV == LoopControlIV;
- bool Negative = !NeedNewIV && Inc < 0;
-
- const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst));
- const SCEV *Start = RealIVSCEV->getStart();
-
- if (NeedNewIV)
- Start = SE->getConstant(Start->getType(), 0);
-
- const SCEV *SizeOfExpr = nullptr;
- const SCEV *IncrExpr =
- SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1);
- if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) {
- Type *ElTy = PTy->getElementType();
- SizeOfExpr =
- SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy);
- IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr);
- }
+ Instruction *Inst = DRS.BaseInst;
+
const SCEV *NewIVSCEV =
SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
@@ -1552,54 +1488,6 @@ void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
for (auto &KV : Uses)
if (KV.second.find_first() == 0)
KV.first->replaceUsesOfWith(Inst, NewIV);
-
- if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
- // FIXME: Why do we need this check?
- if (Uses[BI].find_first() == IL_All) {
- const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
-
- if (NeedNewIV)
- ICSCEV = SE->getMulExpr(IterCount,
- SE->getConstant(IterCount->getType(), Scale));
-
- // Iteration count SCEV minus or plus 1
- const SCEV *MinusPlus1SCEV =
- SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1);
- if (Inst->getType()->isPointerTy()) {
- assert(SizeOfExpr && "SizeOfExpr is not initialized");
- MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr);
- }
-
- const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV);
- // Iteration count minus 1
- Instruction *InsertPtr = nullptr;
- if (isa<SCEVConstant>(ICMinusPlus1SCEV)) {
- InsertPtr = BI;
- } else {
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader)
- Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
- InsertPtr = Preheader->getTerminator();
- }
-
- if (!isa<PointerType>(NewIV->getType()) && NeedNewIV &&
- (SE->getTypeSizeInBits(NewIV->getType()) <
- SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) {
- IRBuilder<> Builder(BI);
- Builder.SetCurrentDebugLocation(BI->getDebugLoc());
- NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType());
- }
- Value *ICMinusPlus1 = Expander.expandCodeFor(
- ICMinusPlus1SCEV, NewIV->getType(), InsertPtr);
-
- Value *Cond =
- new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond");
- BI->setCondition(Cond);
-
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
- }
- }
}
}
@@ -1617,17 +1505,17 @@ bool LoopReroll::ReductionTracker::validateSelected() {
int Iter = PossibleRedIter[J];
if (Iter != PrevIter && Iter != PrevIter + 1 &&
!PossibleReds[i].getReducedValue()->isAssociative()) {
- DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
- J << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
+ << J << "\n");
return false;
}
if (Iter != PrevIter) {
if (Count != BaseCount) {
- DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
- " reduction use count " << Count <<
- " is not equal to the base use count " <<
- BaseCount << "\n");
+ LLVM_DEBUG(dbgs()
+ << "LRR: Iteration " << PrevIter << " reduction use count "
+ << Count << " is not equal to the base use count "
+ << BaseCount << "\n");
return false;
}
@@ -1716,15 +1604,15 @@ void LoopReroll::ReductionTracker::replaceSelected() {
// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
// have been validated), then we reroll the loop.
bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *IterCount,
+ const SCEV *BackedgeTakenCount,
ReductionTracker &Reductions) {
DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
IVToIncMap, LoopControlIV);
if (!DAGRoots.findRoots())
return false;
- DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
- *IV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
+ << "\n");
if (!DAGRoots.validate(Reductions))
return false;
@@ -1734,7 +1622,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
// making changes!
Reductions.replaceSelected();
- DAGRoots.replace(IterCount);
+ DAGRoots.replace(BackedgeTakenCount);
++NumRerolledLoops;
return true;
@@ -1752,9 +1640,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
BasicBlock *Header = L->getHeader();
- DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
- "] Loop %" << Header->getName() << " (" <<
- L->getNumBlocks() << " block(s))\n");
+ LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
+ << Header->getName() << " (" << L->getNumBlocks()
+ << " block(s))\n");
// For now, we'll handle only single BB loops.
if (L->getNumBlocks() > 1)
@@ -1763,10 +1651,10 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!SE->hasLoopInvariantBackedgeTakenCount(L))
return false;
- const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
- const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
- DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
- DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
+ << "\n");
// First, we need to find the induction variable with respect to which we can
// reroll (there may be several possible options).
@@ -1776,7 +1664,7 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
collectPossibleIVs(L, PossibleIVs);
if (PossibleIVs.empty()) {
- DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
return false;
}
@@ -1787,11 +1675,11 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// For each possible IV, collect the associated possible set of 'root' nodes
// (i+1, i+2, etc.).
for (Instruction *PossibleIV : PossibleIVs)
- if (reroll(PossibleIV, L, Header, IterCount, Reductions)) {
+ if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
Changed = true;
break;
}
- DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
// Trip count of L has changed so SE must be re-evaluated.
if (Changed)
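The rewritten DAGRootTracker::replace above no longer patches individual increments; it builds one canonical induction variable {0,+,1} and rescales the exit test. A small sketch of that trip-count arithmetic, assuming unsigned 64-bit counts (the FIXME in the patch notes that this can overflow):

#include <cassert>
#include <cstdint>

// New backedge-taken count after rerolling: the rerolled loop runs Scale times
// as many iterations, each doing 1/Scale of the original body.
uint64_t scaledBackedgeTakenCount(uint64_t BackedgeTakenCount, uint64_t Scale) {
  uint64_t TripCount = BackedgeTakenCount + 1;
  uint64_t ScaledTripCount = TripCount * Scale;
  return ScaledTripCount - 1;
}

int main() {
  // A loop with backedge-taken count 3 (4 iterations) and Scale 3 becomes a
  // loop of 12 iterations, i.e. backedge-taken count 11.
  assert(scaledBackedgeTakenCount(3, 3) == 11);
  return 0;
}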
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index a91f53ba663f..eeaad39dc1d1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,33 +13,15 @@
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
#define DEBUG_TYPE "loop-rotate"
@@ -48,595 +30,6 @@ static cl::opt<unsigned> DefaultRotationThreshold(
"rotation-max-header-size", cl::init(16), cl::Hidden,
cl::desc("The default maximum header size for automatic loop rotation"));
-STATISTIC(NumRotated, "Number of loops rotated");
-
-namespace {
-/// A simple loop rotation transformation.
-class LoopRotate {
- const unsigned MaxHeaderSize;
- LoopInfo *LI;
- const TargetTransformInfo *TTI;
- AssumptionCache *AC;
- DominatorTree *DT;
- ScalarEvolution *SE;
- const SimplifyQuery &SQ;
-
-public:
- LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ)
- : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
- SQ(SQ) {}
- bool processLoop(Loop *L);
-
-private:
- bool rotateLoop(Loop *L, bool SimplifiedLatch);
- bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader. If there were uses of the values produced by
-/// these instruction that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values. Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
- BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap,
- SmallVectorImpl<PHINode*> *InsertedPHIs) {
- // Remove PHI node entries that are no longer live.
- BasicBlock::iterator I, E = OrigHeader->end();
- for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
- PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
- // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
- // as necessary.
- SSAUpdater SSA(InsertedPHIs);
- for (I = OrigHeader->begin(); I != E; ++I) {
- Value *OrigHeaderVal = &*I;
-
- // If there are no uses of the value (e.g. because it returns void), there
- // is nothing to rewrite.
- if (OrigHeaderVal->use_empty())
- continue;
-
- Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
- // The value now exits in two versions: the initial value in the preheader
- // and the loop "next" value in the original header.
- SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
- SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
- SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
- // Visit each use of the OrigHeader instruction.
- for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
- UE = OrigHeaderVal->use_end();
- UI != UE;) {
- // Grab the use before incrementing the iterator.
- Use &U = *UI;
-
- // Increment the iterator before removing the use from the list.
- ++UI;
-
- // SSAUpdater can't handle a non-PHI use in the same block as an
- // earlier def. We can easily handle those cases manually.
- Instruction *UserInst = cast<Instruction>(U.getUser());
- if (!isa<PHINode>(UserInst)) {
- BasicBlock *UserBB = UserInst->getParent();
-
- // The original users in the OrigHeader are already using the
- // original definitions.
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped.
- if (UserBB == OrigPreheader) {
- U = OrigPreHeaderVal;
- continue;
- }
- }
-
- // Anything else can be handled by SSAUpdater.
- SSA.RewriteUse(U);
- }
-
- // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
- // intrinsics.
- SmallVector<DbgValueInst *, 1> DbgValues;
- llvm::findDbgValues(DbgValues, OrigHeaderVal);
- for (auto &DbgValue : DbgValues) {
- // The original users in the OrigHeader are already using the original
- // definitions.
- BasicBlock *UserBB = DbgValue->getParent();
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped and anything else can be handled by
- // the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
- Value *NewVal;
- if (UserBB == OrigPreheader)
- NewVal = OrigPreHeaderVal;
- else if (SSA.HasValueForBlock(UserBB))
- NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
- else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
- DbgValue->setOperand(0,
- MetadataAsValue::get(OrigHeaderVal->getContext(),
- ValueAsMetadata::get(NewVal)));
- }
- }
-}
-
-/// Propagate dbg.value intrinsics through the newly inserted Phis.
-static void insertDebugValues(BasicBlock *OrigHeader,
- SmallVectorImpl<PHINode*> &InsertedPHIs) {
- ValueToValueMapTy DbgValueMap;
-
- // Map existing PHI nodes to their dbg.values.
- for (auto &I : *OrigHeader) {
- if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
- if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
- DbgValueMap.insert({Loc, DbgII});
- }
- }
-
- // Then iterate through the new PHIs and look to see if they use one of the
- // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
- // propagate the info through the new PHI.
- LLVMContext &C = OrigHeader->getContext();
- for (auto PHI : InsertedPHIs) {
- for (auto VI : PHI->operand_values()) {
- auto V = DbgValueMap.find(VI);
- if (V != DbgValueMap.end()) {
- auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
- Instruction *NewDbgII = DbgII->clone();
- auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
- NewDbgII->setOperand(0, PhiMAV);
- BasicBlock *Parent = PHI->getParent();
- NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
- }
- }
- }
-}
-
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
- // If the loop has only one block then there is not much to rotate.
- if (L->getBlocks().size() == 1)
- return false;
-
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return false;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
- return false;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
- return false;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
- return false;
-
- // Check size of original header and reject loop if it is very big or we can't
- // duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
- if (Metrics.notDuplicatable) {
- DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return false;
- }
- if (Metrics.convergent) {
- DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return false;
- }
- if (Metrics.NumInsts > MaxHeaderSize)
- return false;
- }
-
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
-
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader)
- return false;
-
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated.
- if (SE)
- SE->forgetLoop(L);
-
- DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
-
- // Find new Loop header. NewHeader is a Header's one and only successor
- // that is inside loop. Header's other successor is outside the
- // loop. Otherwise loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- if (L->contains(Exit))
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
- using DbgIntrinsicHash =
- std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash {
- return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
- };
- SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
- for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
- I != E; ++I) {
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I))
- DbgIntrinsics.insert(makeHash(DII));
- else
- break;
- }
-
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
- !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
- Inst->moveBefore(LoopEntryBranch);
- continue;
- }
-
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
-
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Avoid inserting the same intrinsic twice.
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
- if (DbgIntrinsics.count(makeHash(DII))) {
- C->deleteValue();
- continue;
- }
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifyable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = SimplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- ValueMap[Inst] = V;
- if (!C->mayHaveSideEffects()) {
- C->deleteValue();
- C = nullptr;
- }
- } else {
- ValueMap[Inst] = C;
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
- C->insertBefore(LoopEntryBranch);
-
- if (auto *II = dyn_cast<IntrinsicInst>(C))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
-
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- TerminatorInst *TI = OrigHeader->getTerminator();
- for (BasicBlock *SuccBB : TI->successors())
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
-
-
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
- &InsertedPHIs);
-
- // Attach dbg.value intrinsics to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValues(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
- // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
- // the DT about the removed edge to the OrigHeader (that got removed).
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
- Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
- Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
- DT->applyUpdates(Updates);
- }
-
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
- // then we fold away the cond branch to an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- if (!isa<ConstantInt>(PHBI->getCondition()) ||
- PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
- NewHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit))
- continue;
- if (isa<IndirectBrInst>(ExitPred->getTerminator()))
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
- }
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- } else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
-
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
- }
-
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
-
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- MergeBlockIntoPredecessor(OrigHeader, DT, LI);
-
- DEBUG(dbgs() << "LoopRotation: into "; L->dump());
-
- ++NumRotated;
- return true;
-}
-
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
- BasicBlock::iterator End, Loop *L) {
- bool seenIncrement = false;
- bool MultiExitLoop = false;
-
- if (!L->getExitingBlock())
- MultiExitLoop = true;
-
- for (BasicBlock::iterator I = Begin; I != End; ++I) {
-
- if (!isSafeToSpeculativelyExecute(&*I))
- return false;
-
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::GetElementPtr:
- // GEPs are cheap if all indices are constant.
- if (!cast<GEPOperator>(I)->hasAllConstantIndices())
- return false;
- // fall-thru to increment case
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr: {
- Value *IVOpnd =
- !isa<Constant>(I->getOperand(0))
- ? I->getOperand(0)
- : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
- if (!IVOpnd)
- return false;
-
- // If increment operand is used outside of the loop, this speculation
- // could cause extra live range interference.
- if (MultiExitLoop) {
- for (User *UseI : IVOpnd->users()) {
- auto *UserInst = cast<Instruction>(UseI);
- if (!L->contains(UserInst))
- return false;
- }
- }
-
- if (seenIncrement)
- return false;
- seenIncrement = true;
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // ignore type conversions
- break;
- }
- }
- return true;
-}
-
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- if (!Latch || Latch->hasAddressTaken())
- return false;
-
- BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!Jmp || !Jmp->isUnconditional())
- return false;
-
- BasicBlock *LastExit = Latch->getSinglePredecessor();
- if (!LastExit || !L->isLoopExiting(LastExit))
- return false;
-
- BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
- if (!BI)
- return false;
-
- if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
- return false;
-
- DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
- << LastExit->getName() << "\n");
-
- // Hoist the instructions from Latch into LastExit.
- LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
- Latch->begin(), Jmp->getIterator());
-
- unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
- BasicBlock *Header = Jmp->getSuccessor(0);
- assert(Header == L->getHeader() && "expected a backward branch");
-
- // Remove Latch from the CFG so that LastExit becomes the new Latch.
- BI->setSuccessor(FallThruPath, Header);
- Latch->replaceSuccessorsPhiUsesWith(LastExit);
- Jmp->eraseFromParent();
-
- // Nuke the Latch block.
- assert(Latch->empty() && "unable to evacuate Latch");
- LI->removeBlock(Latch);
- if (DT)
- DT->eraseNode(Latch);
- Latch->eraseFromParent();
- return true;
-}
-
-/// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
- // Save the loop metadata.
- MDNode *LoopMD = L->getLoopID();
-
- // Simplify the loop latch before attempting to rotate the header
- // upward. Rotation may not be needed if the loop tail can be folded into the
- // loop exit.
- bool SimplifiedLatch = simplifyLoopLatch(L);
-
- bool MadeChange = rotateLoop(L, SimplifiedLatch);
- assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
- "Loop latch should be exiting after loop-rotate.");
-
- // Restore the loop metadata.
- // NB! We presume LoopRotation DOESN'T ADD its own metadata.
- if ((MadeChange || SimplifiedLatch) && LoopMD)
- L->setLoopID(LoopMD);
-
- return MadeChange || SimplifiedLatch;
-}
-
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
: EnableHeaderDuplication(EnableHeaderDuplication) {}
@@ -646,10 +39,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
- LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
- SQ);
- bool Changed = LR.processLoop(&L);
+ bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, SQ,
+ false, Threshold, false);
+
if (!Changed)
return PreservedAnalyses::all();
@@ -691,8 +84,8 @@ public:
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
auto *SE = SEWP ? &SEWP->getSE() : nullptr;
const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
- LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ);
- return LR.processLoop(L);
+ return LoopRotation(L, LI, TTI, AC, DT, SE, SQ, false, MaxHeaderSize,
+ false);
}
};
}
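The rotation implementation deleted here moves to llvm/Transforms/Utils/LoopRotationUtils; the pass now only forwards to the LoopRotation utility. As a rough source-level picture of what a single rotation does (illustrative C++, not the pass's own code):

// Before rotation: the header both tests the condition and starts the body,
// so every iteration branches through the header test.
void beforeRotation(int n, void (*body)(int)) {
  for (int i = 0; i < n; ++i)
    body(i);
}

// After rotation: the test is cloned into a guard, and the loop becomes
// bottom-tested (the latch is the exiting block), a shape that later passes
// such as LICM and unrolling generally prefer.
void afterRotation(int n, void (*body)(int)) {
  if (0 < n) {
    int i = 0;
    do {
      body(i);
      ++i;
    } while (i < n);
  }
}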
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 35c05e84fd68..2b83d3dc5f1b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -30,13 +30,16 @@
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loop-simplifycfg"
-static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE) {
bool Changed = false;
// Copy blocks into a temporary array to avoid iterator invalidation issues
// as we remove them.
@@ -53,11 +56,10 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
continue;
- // Pred is going to disappear, so we need to update the loop info.
- if (L.getHeader() == Pred)
- L.moveToHeader(Succ);
- LI.removeBlock(Pred);
- MergeBasicBlockIntoOnlyPred(Succ, &DT);
+ // Merge Succ into Pred and delete Succ.
+ MergeBlockIntoPredecessor(Succ, &DT, &LI);
+
+ SE.forgetLoop(&L);
Changed = true;
}
@@ -67,7 +69,7 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- if (!simplifyLoopCFG(L, AR.DT, AR.LI))
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
@@ -87,7 +89,8 @@ public:
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- return simplifyLoopCFG(*L, DT, LI);
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return simplifyLoopCFG(*L, DT, LI, SE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
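The merge performed by simplifyLoopCFG is driven by a simple structural test on the edge between Pred and Succ; everything else is delegated to MergeBlockIntoPredecessor, with ScalarEvolution invalidated afterwards. A toy sketch of that test, assuming a hypothetical Block type rather than LLVM's BasicBlock:

#include <vector>

struct Block {
  std::vector<Block *> Preds, Succs;
};

// A block can be folded into its predecessor when the edge between them is
// the only way in and the only way out: Succ has exactly one predecessor and
// that predecessor has exactly one successor.
bool canMergeIntoPredecessor(const Block &Succ) {
  if (Succ.Preds.size() != 1)
    return false;
  const Block *Pred = Succ.Preds.front();
  return Pred->Succs.size() == 1;
}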
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
index 430a7085d93f..760177c9c5e9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -42,6 +42,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
@@ -49,7 +50,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -200,17 +200,19 @@ static bool sinkInstruction(Loop &L, Instruction &I,
SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
BBsToSinkInto.end());
- std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
- [&](BasicBlock *A, BasicBlock *B) {
- return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B);
- });
+ llvm::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return LoopBlockNumber.find(A)->second <
+ LoopBlockNumber.find(B)->second;
+ });
BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
// FIXME: Optimize the efficiency for cloned value replacement. The current
// implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
- for (BasicBlock *N : SortedBBsToSinkInto) {
- if (N == MoveBB)
- continue;
+ for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
+ assert(LoopBlockNumber.find(N)->second >
+ LoopBlockNumber.find(MoveBB)->second &&
+ "BBs not sorted!");
// Clone I and replace its uses.
Instruction *IC = I.clone();
IC->setName(I.getName());
@@ -224,11 +226,11 @@ static bool sinkInstruction(Loop &L, Instruction &I,
}
// Replaces uses of I with IC in blocks dominated by N
replaceDominatedUsesWith(&I, IC, DT, N);
- DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
- << '\n');
+ LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
NumLoopSunkCloned++;
}
- DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
NumLoopSunk++;
I.moveBefore(&*MoveBB->getFirstInsertionPt());
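The sink placement above depends on a stable ordering of candidate blocks, which is why the comparator now compares the stored block numbers rather than the dereferenced map iterators. A reduced sketch of the same ordering with standard containers (the stub types are illustrative, not LLVM's):

#include <algorithm>
#include <unordered_map>
#include <vector>

struct BasicBlockStub {};

// Sort blocks by their precomputed loop block number so the chosen insertion
// point does not depend on pointer values or map iteration order.
void sortByLoopBlockNumber(
    std::vector<BasicBlockStub *> &Blocks,
    const std::unordered_map<BasicBlockStub *, int> &LoopBlockNumber) {
  std::sort(Blocks.begin(), Blocks.end(),
            [&](BasicBlockStub *A, BasicBlockStub *B) {
              return LoopBlockNumber.at(A) < LoopBlockNumber.at(B);
            });
}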
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index ff3e9eef16d9..fa83b48210bc 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -75,6 +75,8 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -105,8 +107,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -121,7 +123,7 @@ using namespace llvm;
#define DEBUG_TYPE "loop-reduce"
-/// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.
@@ -185,6 +187,8 @@ struct MemAccessTy {
unsigned AS = UnknownAddressSpace) {
return MemAccessTy(Type::getVoidTy(Ctx), AS);
}
+
+ Type *getType() { return MemTy; }
};
/// This class holds data which is used to order reuse candidates.
@@ -327,7 +331,7 @@ struct Formula {
/// #2 enforces that 1 * reg is reg.
/// #3 ensures invariant regs with respect to current loop can be combined
/// together in LSR codegen.
- /// This invariant can be temporarly broken while building a formula.
+ /// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
SmallVector<const SCEV *, 4> BaseRegs;
@@ -442,7 +446,7 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
canonicalize(*L);
}
-/// \brief Check whether or not this formula statisfies the canonical
+/// Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
bool Formula::isCanonical(const Loop &L) const {
@@ -470,7 +474,7 @@ bool Formula::isCanonical(const Loop &L) const {
return I == BaseRegs.end();
}
-/// \brief Helper method to morph a formula into its canonical representation.
+/// Helper method to morph a formula into its canonical representation.
/// \see Formula::BaseRegs.
/// Every formula having more than one base register, must use the ScaledReg
/// field. Otherwise, we would have to do special cases everywhere in LSR
@@ -505,7 +509,7 @@ void Formula::canonicalize(const Loop &L) {
}
}
-/// \brief Get rid of the scale in the formula.
+/// Get rid of the scale in the formula.
/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
/// \return true if it was possible to get rid of the scale, false otherwise.
/// \note After this operation the formula may not be in the canonical form.
@@ -818,7 +822,7 @@ static bool isAddressUse(const TargetTransformInfo &TTI,
/// Return the type of the memory being accessed.
static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
- Instruction *Inst) {
+ Instruction *Inst, Value *OperandVal) {
MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
AccessTy.MemTy = SI->getOperand(0)->getType();
@@ -832,7 +836,14 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
switch (II->getIntrinsicID()) {
case Intrinsic::prefetch:
+ case Intrinsic::memset:
AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
break;
default: {
MemIntrinsicInfo IntrInfo;
@@ -937,7 +948,7 @@ static bool isHighCostExpansion(const SCEV *S,
return true;
}
-/// If any of the instructions is the specified set are trivially dead, delete
+/// If any of the instructions in the specified set are trivially dead, delete
/// them and see if this makes any of their operands subsequently dead.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
@@ -970,7 +981,7 @@ class LSRUse;
} // end anonymous namespace
-/// \brief Check if the addressing mode defined by \p F is completely
+/// Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
/// This includes address-mode folding and special icmp tricks.
/// This function returns true if \p LU can accommodate what \p F
@@ -1040,12 +1051,14 @@ private:
void RateRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT);
+ ScalarEvolution &SE, DominatorTree &DT,
+ const TargetTransformInfo &TTI);
void RatePrimaryRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs);
+ SmallPtrSetImpl<const SCEV *> *LoserRegs,
+ const TargetTransformInfo &TTI);
};
/// An operand value in an instruction which is to be replaced with some
@@ -1194,7 +1207,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
void Cost::RateRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT) {
+ ScalarEvolution &SE, DominatorTree &DT,
+ const TargetTransformInfo &TTI) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
// If this is an addrec for another loop, it should be an invariant
// with respect to L since L is the innermost loop (at least
@@ -1215,13 +1229,28 @@ void Cost::RateRegister(const SCEV *Reg,
++C.NumRegs;
return;
}
- C.AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+ unsigned LoopCost = 1;
+ if (TTI.shouldFavorPostInc()) {
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (isa<SCEVConstant>(LoopStep)) {
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) &&
+ SE.isLoopInvariant(LoopStart, L))
+ LoopCost = 0;
+ }
+ }
+ }
+ C.AddRecCost += LoopCost;
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
- RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+ RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
if (isLoser())
return;
}
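The zero-cost case added above is aimed at address recurrences that a post-indexed access can absorb. A minimal sketch of the kind of loop it favors, assuming a target such as ARM where shouldFavorPostInc() and the MIM_PostInc legality checks succeed (the function is hypothetical):

    // The pointer recurrence {p,+,4} has a constant step and a loop-invariant,
    // non-constant start, so the increment can fold into a post-indexed load
    // and the AddRec needs no extra add instruction of its own.
    int sumInts(const int *p, int n) {
      int s = 0;
      for (int i = 0; i < n; ++i)
        s += *p++;
      return s;
    }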
@@ -1249,13 +1278,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ SmallPtrSetImpl<const SCEV *> *LoserRegs,
+ const TargetTransformInfo &TTI) {
if (LoserRegs && LoserRegs->count(Reg)) {
Lose();
return;
}
if (Regs.insert(Reg).second) {
- RateRegister(Reg, Regs, L, SE, DT);
+ RateRegister(Reg, Regs, L, SE, DT, TTI);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
@@ -1279,7 +1309,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+ RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@@ -1288,7 +1318,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+ RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@@ -1343,14 +1373,15 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
// If ICmpZero formula ends with not 0, it could not be replaced by
// just add or sub. We'll need to compare final result of AddRec.
- // That means we'll need an additional instruction.
+ // That means we'll need an additional instruction. But if the target can
+ // macro-fuse a compare with a branch, don't count this extra instruction.
// For -10 + {0, +, 1}:
// i = i + 1;
// cmp i, 10
//
// For {-10, +, 1}:
// i = i + 1;
- if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
C.Insns++;
// Each new AddRec adds 1 instruction to calculation.
C.Insns += (C.AddRecCost - PrevAddRecCost);
@@ -1456,7 +1487,7 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
return Uniquifier.count(Key);
}
@@ -1480,7 +1511,7 @@ bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
if (!Uniquifier.insert(Key).second)
return false;
@@ -2384,24 +2415,27 @@ LSRInstance::OptimizeLoopTermCond() {
C->getValue().isMinSignedValue())
goto decline_post_inc;
// Check for possible scaled-address reuse.
- MemAccessTy AccessTy = getAccessType(TTI, UI->getUser());
- int64_t Scale = C->getSExtValue();
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
- Scale = -Scale;
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
+ if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
+ MemAccessTy AccessTy = getAccessType(
+ TTI, UI->getUser(), UI->getOperandValToReplace());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
}
}
- DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
- << *Cond << '\n');
+ LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
// It's possible for the setcc instruction to be anywhere in the loop, and
// possible for it to have multiple users. If it is not immediately before
@@ -2642,7 +2676,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
if (Types.size() == 1)
Types.clear();
- DEBUG(print_factors_and_types(dbgs()));
+ LLVM_DEBUG(print_factors_and_types(dbgs()));
}
/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
@@ -2666,7 +2700,7 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE,
return OI;
}
-/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in
+/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
/// a convenient helper.
static Value *getWideOperand(Value *Oper) {
if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
@@ -2773,10 +2807,9 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
return false;
if (!Users.empty()) {
- DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
- for (Instruction *Inst : Users) {
- dbgs() << " " << *Inst << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst
+ : Users) { dbgs() << " " << *Inst << "\n"; });
return false;
}
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
@@ -2829,8 +2862,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
// the stride.
cost -= NumReusedIncrements;
- DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
- << "\n");
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
return cost < 0;
}
@@ -2883,7 +2916,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
if (isa<PHINode>(UserInst))
return;
if (NChains >= MaxChains && !StressIVChain) {
- DEBUG(dbgs() << "IV Chain Limit\n");
+ LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
return;
}
LastIncExpr = OperExpr;
@@ -2896,11 +2929,11 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
OperExprBase));
ChainUsersVec.resize(NChains);
- DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
- << ") IV=" << *LastIncExpr << "\n");
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
} else {
- DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
- << ") IV+" << *LastIncExpr << "\n");
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
// Add this IV user to the end of the chain.
IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
}
@@ -2970,7 +3003,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
/// loop latch. This will discover chains on side paths, but requires
/// maintaining multiple copies of the Chains state.
void LSRInstance::CollectChains() {
- DEBUG(dbgs() << "Collecting IV Chains.\n");
+ LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
SmallVector<ChainUsers, 8> ChainUsersVec;
SmallVector<BasicBlock *,8> LatchPath;
@@ -3039,10 +3072,10 @@ void LSRInstance::CollectChains() {
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
- DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+ LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
for (const IVInc &Inc : Chain) {
- DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
IVIncSet.insert(UseI);
@@ -3059,7 +3092,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
if (IncConst->getAPInt().getMinSignedBits() > 64)
return false;
- MemAccessTy AccessTy = getAccessType(TTI, UserInst);
+ MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
int64_t IncOffset = IncConst->getValue()->getSExtValue();
if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
IncOffset, /*HasBaseReg=*/false))
@@ -3099,11 +3132,11 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
}
if (IVOpIter == IVOpEnd) {
// Gracefully give up on this chain.
- DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
return;
}
- DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
Type *IVTy = IVSrc->getType();
Type *IntTy = SE.getEffectiveSCEVType(IVTy);
const SCEV *LeftOverExpr = nullptr;
@@ -3179,7 +3212,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
if (IVIncSet.count(UseI)) {
- DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
+ LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
continue;
}
@@ -3187,7 +3220,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
MemAccessTy AccessTy;
if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
Kind = LSRUse::Address;
- AccessTy = getAccessType(TTI, UserInst);
+ AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
}
const SCEV *S = IU.getExpr(U);
@@ -3255,7 +3288,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
}
}
- DEBUG(print_fixups(dbgs()));
+ LLVM_DEBUG(print_fixups(dbgs()));
}
/// Insert a formula for the given expression into the given use, separating out
@@ -3464,12 +3497,45 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
return S;
}
-/// \brief Helper function for LSRInstance::GenerateReassociations.
+/// Return true if the SCEV represents a value that may end up as a
+/// post-increment operation.
+static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
+ LSRUse &LU, const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ if (LU.Kind != LSRUse::Address ||
+ !LU.AccessTy.getType()->isIntOrIntVectorTy())
+ return false;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AR)
+ return false;
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(LoopStep))
+ return false;
+ if (LU.AccessTy.getType()->getScalarSizeInBits() !=
+ LoopStep->getType()->getScalarSizeInBits())
+ return false;
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
+ return true;
+ }
+ return false;
+}
+
+/// Helper function for LSRInstance::GenerateReassociations.
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base,
unsigned Depth, size_t Idx,
bool IsScaledReg) {
const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ // Don't generate reassociations for the base register of a value that
+ // may generate a post-increment operator. The reason is that the
+ // reassociations cause extra base+register formulas to be created,
+ // and possibly chosen, but the post-increment is more efficient.
+ if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
+ return;
SmallVector<const SCEV *, 8> AddOps;
const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
if (Remainder)
@@ -3542,7 +3608,12 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
- GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+ // Add a check on Log16(AddOps.size()), which is the same as
+ // Log2_32(AddOps.size()) >> 2, because Depth alone is not enough to bound
+ // compile time. This means that every time AddOps.size() exceeds 16^x we
+ // add x to Depth. For example, 300 operands give Log2_32 == 8, so the
+ // recursion is charged two extra levels of depth.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
+ Depth + 1 + (Log2_32(AddOps.size()) >> 2));
}
}
@@ -3596,7 +3667,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
}
}
-/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
+/// Helper function for LSRInstance::GenerateSymbolicOffsets.
void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base, size_t Idx,
bool IsScaledReg) {
@@ -3628,7 +3699,7 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
/* IsScaledReg */ true);
}
-/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
+/// Helper function for LSRInstance::GenerateConstantOffsets.
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
@@ -3938,10 +4009,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (Imms.size() == 1)
continue;
- DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
- for (const auto &Entry : Imms)
- dbgs() << ' ' << Entry.first;
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+ for (const auto &Entry
+ : Imms) dbgs()
+ << ' ' << Entry.first;
+ dbgs() << '\n');
// Examine each offset.
for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
@@ -3953,7 +4025,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (!isa<SCEVConstant>(OrigReg) &&
UsedByIndicesMap[Reg].count() == 1) {
- DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+ LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+ << '\n');
continue;
}
@@ -4038,6 +4111,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, NewF)) {
+ if (TTI.shouldFavorPostInc() &&
+ mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
+ continue;
if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
continue;
NewF = F;
@@ -4099,9 +4175,9 @@ LSRInstance::GenerateAllReuseFormulae() {
GenerateCrossUseConstantOffsets();
- DEBUG(dbgs() << "\n"
- "After generating reuse formulae:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n"
+ "After generating reuse formulae:\n";
+ print_uses(dbgs()));
}
/// If there are multiple formulae with the same set of registers used
@@ -4123,7 +4199,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
- DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
bool Any = false;
for (size_t FIdx = 0, NumForms = LU.Formulae.size();
@@ -4147,8 +4224,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// as the basis of rediscovering the desired formula that uses an AddRec
// corresponding to the existing phi. Once all formulae have been
// generated, these initial losers may be pruned.
- DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
}
else {
SmallVector<const SCEV *, 4> Key;
@@ -4161,7 +4238,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for
// uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
std::pair<BestFormulaeTy::const_iterator, bool> P =
BestFormulae.insert(std::make_pair(Key, FIdx));
@@ -4175,10 +4252,10 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
if (CostF.isLess(CostBest, TTI))
std::swap(F, Best);
- DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula "; Best.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
}
#ifndef NDEBUG
ChangedFormulae = true;
@@ -4197,11 +4274,11 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
BestFormulae.clear();
}
- DEBUG(if (ChangedFormulae) {
- dbgs() << "\n"
- "After filtering out undesirable candidates:\n";
- print_uses(dbgs());
- });
+ LLVM_DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
}
// This is a rough guess that seems to work fairly well.
@@ -4230,11 +4307,11 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
/// register pressure); remove it to simplify the system.
void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
- DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
- "which use a superset of registers used by other "
- "formulae.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+ "which use a superset of registers used by other "
+ "formulae.\n");
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
@@ -4252,7 +4329,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
@@ -4267,8 +4345,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
@@ -4283,8 +4361,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
LU.RecomputeRegs(LUIdx, RegUses);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4294,9 +4371,10 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
- DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by assuming that uses separated "
- "by a constant offset will use the same registers.\n");
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
// This is especially useful for unrolled loops.
@@ -4314,7 +4392,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LU.Kind, LU.AccessTy))
continue;
- DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
@@ -4322,7 +4400,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
for (LSRFixup &Fixup : LU.Fixups) {
Fixup.Offset += F.BaseOffset;
LUThatHas->pushFixup(Fixup);
- DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
}
// Delete formulae from the new use which are no longer legal.
@@ -4331,8 +4409,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
Formula &F = LUThatHas->Formulae[i];
if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
LUThatHas->Kind, LUThatHas->AccessTy, F)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
LUThatHas->DeleteFormula(F);
--i;
--e;
@@ -4351,7 +4428,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
}
}
- DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
@@ -4359,15 +4436,14 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
/// eliminate.
void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
- DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
- "undesirable dedicated registers.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+ "undesirable dedicated registers.\n");
FilterOutUndesirableDedicatedRegisters();
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4378,15 +4454,16 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
/// The benefit is that it is more likely to find out a better solution
/// from a formulae set with more Scale and ScaledReg variations than
/// a formulae set with the same Scale and ScaledReg. The picking winner
-/// reg heurstic will often keep the formulae with the same Scale and
+/// reg heuristic will often keep the formulae with the same Scale and
/// ScaledReg and filter others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
- DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by choosing the best Formula "
- "from the Formulae with the same Scale and ScaledReg.\n");
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
// Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
@@ -4400,7 +4477,8 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
- DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
// Return true if Formula FA is better than Formula FB.
auto IsBetterThan = [&](Formula &FA, Formula &FB) {
@@ -4444,10 +4522,10 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
Formula &Best = LU.Formulae[P.first->second];
if (IsBetterThan(F, Best))
std::swap(F, Best);
- DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula ";
- Best.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
ChangedFormulae = true;
#endif
@@ -4463,7 +4541,7 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
BestFormulae.clear();
}
- DEBUG(if (ChangedFormulae) {
+ LLVM_DEBUG(if (ChangedFormulae) {
dbgs() << "\n"
"After filtering out undesirable candidates:\n";
print_uses(dbgs());
@@ -4522,7 +4600,7 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
// Used in each formula of a solution (in example above this is reg(c)).
// We can skip them in calculations.
SmallPtrSet<const SCEV *, 4> UniqRegs;
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
// Map each register to probability of not selecting
DenseMap <const SCEV *, float> RegNumMap;
@@ -4542,7 +4620,8 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
RegNumMap.insert(std::make_pair(Reg, PNotSel));
}
- DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+ LLVM_DEBUG(
+ dbgs() << "Narrowing the search space by deleting costly formulas\n");
// Delete formulas where registers number expectation is high.
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
@@ -4584,26 +4663,25 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
MinIdx = i;
}
}
- DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
- dbgs() << " with min reg num " << FMinRegNum << '\n');
+ LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
if (MinIdx != 0)
std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
while (LU.Formulae.size() != 1) {
- DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
LU.Formulae.pop_back();
}
LU.RecomputeRegs(LUIdx, RegUses);
assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
Formula &F = LU.Formulae[0];
- DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
// When we choose the formula, the regs become unique.
UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
if (F.ScaledReg)
UniqRegs.insert(F.ScaledReg);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
/// Pick a register which seems likely to be profitable, and then in any use
@@ -4616,7 +4694,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
// Ok, we have too many of formulae on our hands to conveniently handle.
// Use a rough heuristic to thin out the list.
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
// Pick the register which is used by the most LSRUses, which is likely
// to be a good reuse register candidate.
@@ -4637,8 +4715,8 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
}
}
- DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
- << " will yield profitable reuse.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+ << " will yield profitable reuse.\n");
Taken.insert(Best);
// In any use with formulae which references this register, delete formulae
@@ -4651,7 +4729,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
Formula &F = LU.Formulae[i];
if (!F.referencesReg(Best)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
LU.DeleteFormula(F);
--e;
--i;
@@ -4665,8 +4743,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
LU.RecomputeRegs(LUIdx, RegUses);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4748,11 +4825,11 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
if (F.getNumRegs() == 1 && Workspace.size() == 1)
VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
} else {
- DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
- dbgs() << ".\n Regs:";
- for (const SCEV *S : NewRegs)
- dbgs() << ' ' << *S;
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\n Regs:"; for (const SCEV *S
+ : NewRegs) dbgs()
+ << ' ' << *S;
+ dbgs() << '\n');
SolutionCost = NewCost;
Solution = Workspace;
@@ -4777,22 +4854,22 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
CurRegs, VisitedRegs);
if (Solution.empty()) {
- DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
return;
}
// Ok, we've now made all our decisions.
- DEBUG(dbgs() << "\n"
- "The chosen solution requires "; SolutionCost.print(dbgs());
- dbgs() << ":\n";
- for (size_t i = 0, e = Uses.size(); i != e; ++i) {
- dbgs() << " ";
- Uses[i].print(dbgs());
- dbgs() << "\n"
- " ";
- Solution[i]->print(dbgs());
- dbgs() << '\n';
- });
+ LLVM_DEBUG(dbgs() << "\n"
+ "The chosen solution requires ";
+ SolutionCost.print(dbgs()); dbgs() << ":\n";
+ for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+ dbgs() << " ";
+ Uses[i].print(dbgs());
+ dbgs() << "\n"
+ " ";
+ Solution[i]->print(dbgs());
+ dbgs() << '\n';
+ });
assert(Solution.size() == Uses.size() && "Malformed solution!");
}
@@ -4993,7 +5070,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
// Unless the addressing mode will not be folded.
if (!Ops.empty() && LU.Kind == LSRUse::Address &&
isAMCompletelyFolded(TTI, LU, F)) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
Ops.clear();
Ops.push_back(SE.getUnknown(FullV));
}
@@ -5266,7 +5343,8 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
for (const IVStrideUse &U : IU) {
if (++NumUsers > MaxIVUsers) {
(void)U;
- DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
+ LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
+ << "\n");
return;
}
// Bail out if we have a PHI on an EHPad that gets a value from a
@@ -5299,9 +5377,9 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
}
#endif // DEBUG
- DEBUG(dbgs() << "\nLSR on loop ";
- L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
- dbgs() << ":\n");
+ LLVM_DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
// First, perform some low-level loop optimizations.
OptimizeShadowIV();
@@ -5312,7 +5390,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// Skip nested loops until we can model them better with formulae.
if (!L->empty()) {
- DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
return;
}
@@ -5322,9 +5400,11 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
CollectFixupsAndInitialFormulae();
CollectLoopInvariantFixupsAndFormulae();
- assert(!Uses.empty() && "IVUsers reported at least one use");
- DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
- print_uses(dbgs()));
+ if (Uses.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
// Now use the reuse data to generate a bunch of interesting ways
// to formulate the values needed for the uses.
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
new file mode 100644
index 000000000000..86c99aed4417
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -0,0 +1,447 @@
+//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an unroll and jam pass. Most of the work is done by
+// Utils/UnrollLoopAndJam.cpp.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+static cl::opt<bool>
+ AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
+ cl::desc("Allows loops to be unroll-and-jammed."));
+
+static cl::opt<unsigned> UnrollAndJamCount(
+ "unroll-and-jam-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_and_jam_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollAndJamThreshold(
+ "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
+ cl::desc("Threshold to use for inner loop when doing unroll and jam."));
+
+static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
+ "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
+ "unroll_count pragma."));
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has any metadata starting with Prefix. For example, a
+// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
+static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
+ if (MDNode *LoopID = L->getLoopID()) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (S->getString().startswith(Prefix))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if the loop has an unroll_and_jam(enable) pragma.
+static bool HasUnrollAndJamEnablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
+}
+
+// Returns true if the loop has an unroll_and_jam(disable) pragma.
+static bool HasUnrollAndJamDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable");
+}
+
+// If the loop has an unroll_and_jam_count pragma, return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
+ MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
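These hints normally come from source-level loop pragmas rather than hand-written IR metadata. A small sketch, assuming a Clang that supports the unroll_and_jam loop pragmas, of source that should produce the llvm.loop.unroll_and_jam.count hint read above (the function is hypothetical):

    void scaleRows(float *a, int n, int m) {
      // The pragma attaches to the outer loop; the inner loop is the one
      // that gets jammed.
    #pragma clang loop unroll_and_jam_count(4)
      for (int i = 0; i < n; ++i)
        for (int j = 0; j < m; ++j)
          a[i * m + j] *= 2.0f;
    }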
+
+// Returns loop size estimation for unrolled loop.
+static uint64_t
+getUnrollAndJammedLoopSize(unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
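For instance, with LoopSize = 10, UP.BEInsns = 2 and UP.Count = 4 this estimates (10 - 2) * 4 + 2 = 34 instructions: the body is replicated four times while the backedge bookkeeping is counted only once.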
+
+// Calculates unroll and jam count and writes it to UP.Count. Returns true if
+// unroll count was set explicitly.
+static bool computeUnrollAndJamCount(
+ Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
+ LoopInfo *LI, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
+ unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
+ unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
+ // Check for explicit Count from the "unroll-and-jam-count" option.
+ bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollAndJamCount;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ // Check for unroll_and_jam pragmas
+ unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.Force = true;
+ if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ // Use computeUnrollCount from the loop unroller to get a sensible count
+ // for unrolling the outer loop. This uses UP.Threshold /
+ // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+ // We have already checked that the loop has no unroll.* pragmas.
+ unsigned MaxTripCount = 0;
+ bool UseUpperBound = false;
+ bool ExplicitUnroll = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+ OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+ if (ExplicitUnroll || UseUpperBound) {
+ // If the user explicitly set the loop as unrolled, don't unroll and jam it.
+ // Leave it for the unroller instead.
+ UP.Count = 0;
+ return false;
+ }
+
+ bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
+ ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
+
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits.
+ if (ExplicitUnroll && OuterTripCount != 0)
+ UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
+
+ if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // If the inner loop count is known and small, leave the entire loop nest
+ // to the unroller.
+ if (!ExplicitUnroll && InnerTripCount &&
+ InnerLoopSize * InnerTripCount < UP.Threshold) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // We have a sensible limit for the outer loop; now adjust it for the inner
+ // loop and UP.UnrollAndJamInnerLoopThreshold.
+ while (UP.Count != 0 && UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold)
+ UP.Count--;
+
+ if (!ExplicitUnroll) {
+ // Check for situations where unroll and jam is likely to be unprofitable,
+ // including subloops with more than one block.
+ if (SubLoop->getBlocks().size() != 1) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Limit to loops where there is something to gain from unrolling and
+ // jamming the loop. In this case, look for loads that are invariant in the
+ // outer loop and can become shared.
+ unsigned NumInvariant = 0;
+ for (BasicBlock *BB : SubLoop->getBlocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ Value *V = Ld->getPointerOperand();
+ const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+ if (SE.isLoopInvariant(LSCEV, L))
+ NumInvariant++;
+ }
+ }
+ }
+ if (NumInvariant == 0) {
+ UP.Count = 0;
+ return false;
+ }
+ }
+
+ return ExplicitUnroll;
+}
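The NumInvariant requirement above captures the main source of profit from unroll and jam: inner-loop loads that do not vary with the outer induction variable, so one load can be shared by several jammed outer iterations. A rough, hypothetical illustration:

    // x[j] does not depend on i, so after unrolling i by 4 and fusing the four
    // copies of the j loop, a single load of x[j] feeds four accumulations.
    void outerAccumulate(float *acc, const float *w, const float *x, int n, int m) {
      for (int i = 0; i < n; ++i)
        for (int j = 0; j < m; ++j)
          acc[i] += w[i] * x[j];
    }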
+
+static LoopUnrollResult
+tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ AssumptionCache &AC, DependenceInfo &DI,
+ OptimizationRemarkEmitter &ORE, int OptLevel) {
+ // Quick checks of the correct loop form
+ if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
+ return LoopUnrollResult::Unmodified;
+ Loop *SubLoop = L->getSubLoops()[0];
+ if (!SubLoop->isLoopSimplifyForm())
+ return LoopUnrollResult::Unmodified;
+
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Exit = L->getExitingBlock();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
+
+ if (Latch != Exit || SubLoopLatch != SubLoopExit)
+ return LoopUnrollResult::Unmodified;
+
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, SE, TTI, OptLevel, None, None, None, None, None, None);
+ if (AllowUnrollAndJam.getNumOccurrences() > 0)
+ UP.UnrollAndJam = AllowUnrollAndJam;
+ if (UnrollAndJamThreshold.getNumOccurrences() > 0)
+ UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
+ // Exit early if unrolling is disabled.
+ if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
+ return LoopUnrollResult::Unmodified;
+
+ LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
+
+ // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
+ // the unroller, so long as it does not explicitly have unroll_and_jam
+ // metadata. This means #pragma nounroll will disable unroll and jam as well
+ // as unrolling.
+ if (HasUnrollAndJamDisablePragma(L) ||
+ (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+ !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
+ LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+ LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Approximate the loop size and collect useful info
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ unsigned InnerLoopSize =
+ ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
+ Convergent, TTI, EphValues, UP.BEInsns);
+ unsigned OuterLoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
+ LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
+ "instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (NumInlineCandidates != 0) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (Convergent) {
+ LLVM_DEBUG(
+ dbgs() << " Not unrolling loop with convergent instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Find trip count and trip multiple
+ unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
+ unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
+ unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
+
+ // Decide if, and by how much, to unroll
+ bool IsCountSetExplicitly = computeUnrollAndJamCount(
+ L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
+ OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
+ if (UP.Count <= 1)
+ return LoopUnrollResult::Unmodified;
+ // Unroll factor (Count) must be less or equal to TripCount.
+ if (OuterTripCount && UP.Count > OuterTripCount)
+ UP.Count = OuterTripCount;
+
+ LoopUnrollResult UnrollResult =
+ UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
+ UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
+
+ // If the loop has an unroll count pragma or was unrolled with an explicitly
+ // set count, mark it as unrolled to prevent unrolling beyond that requested.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
+
+namespace {
+
+class LoopUnrollAndJam : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ unsigned OptLevel;
+
+ LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) {
+ initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
+
+ LoopUnrollResult Result =
+ tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
+
+ if (Result == LoopUnrollResult::FullyUnrolled)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopUnrollResult::Unmodified;
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnrollAndJam::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+
+Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
+ return new LoopUnrollAndJam(OptLevel);
+}
+
+PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error(
+ "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at "
+ "a higher level");
+
+ DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
+
+ LoopUnrollResult Result = tryToUnrollAndJamLoop(
+ &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel);
+
+ if (Result == LoopUnrollResult::Unmodified)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 15e7da5e1a7a..634215c9770f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -53,6 +53,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -164,7 +165,7 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
-static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
@@ -191,6 +192,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Force = false;
UP.UpperBound = false;
UP.AllowPeeling = true;
+ UP.UnrollAndJam = false;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP);
@@ -285,17 +288,17 @@ struct UnrolledInstStateKeyInfo {
};
struct EstimatedUnrollCost {
- /// \brief The estimated cost after unrolling.
+ /// The estimated cost after unrolling.
unsigned UnrolledCost;
- /// \brief The estimated dynamic cost of executing the instructions in the
+ /// The estimated dynamic cost of executing the instructions in the
/// rolled form.
unsigned RolledDynamicCost;
};
} // end anonymous namespace
-/// \brief Figure out if the loop is worth full unrolling.
+/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
@@ -308,10 +311,10 @@ struct EstimatedUnrollCost {
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
-static Optional<EstimatedUnrollCost>
-analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
- ScalarEvolution &SE, const TargetTransformInfo &TTI,
- unsigned MaxUnrolledLoopSize) {
+static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
+ const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
// We want to be able to scale offsets by the trip count and add more offsets
// to them without checking for overflows, and we already don't want to
// analyze *massive* trip counts, so we force the max to be reasonably small.
@@ -405,9 +408,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// First accumulate the cost of this instruction.
if (!Cost.IsFree) {
UnrolledCost += TTI.getUserCost(I);
- DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
- << "): ");
- DEBUG(I->dump());
+ LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
+ << Iteration << "): ");
+ LLVM_DEBUG(I->dump());
}
// We must count the cost of every operand which is not free,
@@ -442,14 +445,14 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
assert(L->isLCSSAForm(DT) &&
"Must have loops in LCSSA form to track live-out values.");
- DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+ LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
// Simulate execution of each iteration of the loop counting instructions,
// which would be simplified.
// Since the same load will take different values on different iterations,
// we literally have to go through all loop's iterations.
for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
- DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+ LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
// Prepare for the iteration by collecting any simplified entry or backedge
// inputs.
@@ -490,7 +493,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// it. We don't change the actual IR, just count optimization
// opportunities.
for (Instruction &I : *BB) {
- if (isa<DbgInfoIntrinsic>(I))
+ // These won't get into the final code - don't even try calculating the
+ // cost for them.
+ if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
continue;
// Track this instruction's expected baseline cost when executing the
@@ -512,8 +517,13 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// Can't properly model a cost of a call.
// FIXME: With a proper cost model we should be able to do it.
- if(isa<CallInst>(&I))
- return None;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const Function *Callee = CI->getCalledFunction();
+ if (!Callee || TTI.isLoweredToCall(Callee)) {
+ LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
+ return None;
+ }
+ }
// If the instruction might have a side-effect recursively account for
// the cost of it and all the instructions leading up to it.
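The relaxed check above still gives up on genuine calls but lets the full-unroll analysis continue across calls the target expands inline, such as most intrinsics. A rough example, assuming llvm.bswap is not lowered to a library call on the target (the function is hypothetical):

    // __builtin_bswap32 becomes an llvm.bswap.i32 intrinsic, for which
    // isLoweredToCall() is expected to be false, so the loop can still be
    // analyzed for full unrolling.
    unsigned checksum(const unsigned *p, unsigned n) {
      unsigned s = 0;
      for (unsigned i = 0; i < n; ++i)
        s ^= __builtin_bswap32(p[i]);
      return s;
    }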
@@ -522,10 +532,10 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// If unrolled body turns out to be too big, bail out.
if (UnrolledCost > MaxUnrolledLoopSize) {
- DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost
- << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
- << "\n");
+ LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
return None;
}
}
@@ -578,8 +588,8 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// If we found no optimization opportunities on the first iteration, we
// won't find them on later ones too.
if (UnrolledCost == RolledDynamicCost) {
- DEBUG(dbgs() << " No opportunities found.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost << "\n");
+ LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
return None;
}
}
@@ -600,20 +610,17 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
}
}
- DEBUG(dbgs() << "Analysis finished:\n"
- << "UnrolledCost: " << UnrolledCost << ", "
- << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ LLVM_DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
return {{UnrolledCost, RolledDynamicCost}};
}
/// ApproximateLoopSize - Approximate the size of the loop.
-static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- bool &NotDuplicatable, bool &Convergent,
- const TargetTransformInfo &TTI,
- AssumptionCache *AC, unsigned BEInsns) {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
+unsigned llvm::ApproximateLoopSize(
+ const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
Metrics.analyzeBasicBlock(BB, TTI, EphValues);
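ApproximateLoopSize is now exported and expects the caller to supply the set of ephemeral values, i.e. values used only (directly or transitively) by llvm.assume, which generate no code and should not count towards the loop size. A hedged sketch of the expected caller-side setup, reusing tryToUnrollLoop's locals (AC is the caller's AssumptionCache); the same set is then shared with computeUnrollCount and analyzeLoopUnrollCost so all three agree on what is free:

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues); // assumption-only values
  unsigned LoopSize =
      llvm::ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable,
                                Convergent, TTI, EphValues, UP.BEInsns);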
@@ -706,10 +713,11 @@ static uint64_t getUnrolledLoopSize(
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
-static bool computeUnrollCount(
+bool llvm::computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
- unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+ ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
+ unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.
@@ -729,7 +737,7 @@ static bool computeUnrollCount(
UP.Runtime = true;
UP.AllowExpensiveTripCount = true;
UP.Force = true;
- if (UP.AllowRemainder &&
+ if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return true;
}
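The widened condition also covers the case where no remainder loop would be generated at all: if the pragma-requested count evenly divides the known trip multiple, every unrolled body executes a whole number of times, so the transform is acceptable even when UP.AllowRemainder is false. A small illustration with the names from the hunk:

  // TripMultiple = 8, PragmaCount = 4  ->  8 % 4 == 0, no remainder needed
  // TripMultiple = 8, PragmaCount = 3  ->  8 % 3 != 0, remainder still required
  if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
      getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
    return true;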
@@ -746,8 +754,8 @@ static bool computeUnrollCount(
if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits. Set thresholds to at least the PragmaThreshold value
- // which is larger than the default limits.
+ // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
+ // value which is larger than the default limits.
UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
UP.PartialThreshold =
std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
@@ -763,7 +771,7 @@ static bool computeUnrollCount(
// compute the former when the latter is zero.
unsigned ExactTripCount = TripCount;
assert((ExactTripCount == 0 || MaxTripCount == 0) &&
- "ExtractTripCound and MaxTripCount cannot both be non zero.");
+ "ExtractTripCount and MaxTripCount cannot both be non zero.");
unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
UP.Count = FullUnrollTripCount;
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
@@ -779,7 +787,7 @@ static bool computeUnrollCount(
// helps to remove a significant number of instructions.
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, TTI,
+ L, FullUnrollTripCount, DT, SE, EphValues, TTI,
UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
@@ -794,7 +802,7 @@ static bool computeUnrollCount(
}
// 4th priority is loop peeling
- computePeelCount(L, LoopSize, UP, TripCount);
+ computePeelCount(L, LoopSize, UP, TripCount, SE);
if (UP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
@@ -802,12 +810,12 @@ static bool computeUnrollCount(
}
// 5th priority is partial unrolling.
- // Try partial unroll only when TripCount could be staticaly calculated.
+ // Try partial unroll only when TripCount could be statically calculated.
if (TripCount) {
UP.Partial |= ExplicitUnroll;
if (!UP.Partial) {
- DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
+ LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
UP.Count = 0;
return false;
}
@@ -894,8 +902,9 @@ static bool computeUnrollCount(
// Reduce count based on the type of unrolling and the threshold values.
UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
if (!UP.Runtime) {
- DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
- << "-unroll-runtime not given\n");
+ LLVM_DEBUG(
+ dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
UP.Count = 0;
return false;
}
@@ -915,12 +924,13 @@ static bool computeUnrollCount(
if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
while (UP.Count != 0 && TripMultiple % UP.Count != 0)
UP.Count >>= 1;
- DEBUG(dbgs() << "Remainder loop is restricted (that could architecture "
- "specific or because the loop contains a convergent "
- "instruction), so unroll count must divide the trip "
- "multiple, "
- << TripMultiple << ". Reducing unroll count from "
- << OrigCount << " to " << UP.Count << ".\n");
+ LLVM_DEBUG(
+ dbgs() << "Remainder loop is restricted (that could be architecture "
+ "specific or because the loop contains a convergent "
+ "instruction), so unroll count must divide the trip "
+ "multiple, "
+ << TripMultiple << ". Reducing unroll count from " << OrigCount
+ << " to " << UP.Count << ".\n");
using namespace ore;
@@ -942,7 +952,8 @@ static bool computeUnrollCount(
if (UP.Count > UP.MaxCount)
UP.Count = UP.MaxCount;
- DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
if (UP.Count < 2)
UP.Count = 0;
return ExplicitUnroll;
@@ -955,12 +966,13 @@ static LoopUnrollResult tryToUnrollLoop(
Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
- DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
- << "] Loop %" << L->getHeader()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Loop Unroll: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
if (HasUnrollDisablePragma(L))
return LoopUnrollResult::Unmodified;
if (!L->isLoopSimplifyForm()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
return LoopUnrollResult::Unmodified;
}
@@ -975,16 +987,21 @@ static LoopUnrollResult tryToUnrollLoop(
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
return LoopUnrollResult::Unmodified;
- unsigned LoopSize = ApproximateLoopSize(
- L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
- DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+
+ unsigned LoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
if (NotDuplicatable) {
- DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
- << " instructions.\n");
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
return LoopUnrollResult::Unmodified;
}
if (NumInlineCandidates != 0) {
- DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return LoopUnrollResult::Unmodified;
}
@@ -1030,7 +1047,7 @@ static LoopUnrollResult tryToUnrollLoop(
// loop tests remains the same compared to the non-unrolled version, whereas
// the generic upper bound unrolling keeps all but the last loop test so the
// number of loop tests goes up which may end up being worse on targets with
- // constriained branch predictor resources so is controlled by an option.)
+ // constrained branch predictor resources so is controlled by an option.)
// In addition we only unroll small upper bounds.
if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
MaxTripCount = 0;
@@ -1040,9 +1057,9 @@ static LoopUnrollResult tryToUnrollLoop(
// computeUnrollCount() decides whether it is beneficial to use upper bound to
// fully unroll the loop.
bool UseUpperBound = false;
- bool IsCountSetExplicitly =
- computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
- TripMultiple, LoopSize, UP, UseUpperBound);
+ bool IsCountSetExplicitly = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
+ TripMultiple, LoopSize, UP, UseUpperBound);
if (!UP.Count)
return LoopUnrollResult::Unmodified;
// Unroll factor (Count) must be less or equal to TripCount.
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index f2405d9b0c03..b12586758925 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -28,7 +28,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -39,6 +39,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -66,7 +67,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -298,9 +298,9 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
if (Metrics.notDuplicatable) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << L->getHeader()->getName() << ", contents cannot be "
- << "duplicated!\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName()
+ << ", contents cannot be "
+ << "duplicated!\n");
return false;
}
}
@@ -635,6 +635,12 @@ bool LoopUnswitch::processCurrentLoop() {
return true;
}
+ // Do not do non-trivial unswitch while optimizing for size.
+ // FIXME: Use Function::optForSize().
+ if (OptimizeForSize ||
+ loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+
// Run through the instructions in the loop, keeping track of three things:
//
// - That we do not unswitch loops containing convergent operations, as we
@@ -666,12 +672,6 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
- // Do not do non-trivial unswitch while optimizing for size.
- // FIXME: Use Function::optForSize().
- if (OptimizeForSize ||
- loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
- return false;
-
for (IntrinsicInst *Guard : Guards) {
Value *LoopCond =
FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
@@ -856,20 +856,20 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
TerminatorInst *TI) {
// Check to see if it would be profitable to unswitch current loop.
if (!BranchesInfo.CostAllowsUnswitching()) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Cost too high.\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Cost too high.\n");
return false;
}
if (hasBranchDivergence &&
getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Condition is divergent.\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
return false;
}
@@ -910,6 +910,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BranchInst *OldBranch,
TerminatorInst *TI) {
assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
+ assert(TrueDest != FalseDest && "Branch targets should be different");
// Insert a conditional branch on LIC to the two preheaders. The original
// code is the true version and the new code is the false version.
Value *BranchVal = LIC;
@@ -942,9 +943,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
if (DT) {
// First, add both successors.
SmallVector<DominatorTree::UpdateType, 3> Updates;
- if (TrueDest != OldBranchParent)
+ if (TrueDest != OldBranchSucc)
Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
- if (FalseDest != OldBranchParent)
+ if (FalseDest != OldBranchSucc)
Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
// If both of the new successors are different from the old one, inform the
// DT that the edge was deleted.
@@ -970,11 +971,15 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
BasicBlock *ExitBlock,
TerminatorInst *TI) {
- DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function "
- << L->getHeader()->getParent()->getName() << " on cond: " << *Val
- << " == " << *Cond << "\n");
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName()
+ << " on cond: " << *Val << " == " << *Cond << "\n");
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetTopmostLoop(L);
// First step, split the preheader, so that we know that there is a safe place
// to insert the conditional branch. We will change loopPreheader to have a
@@ -1038,7 +1043,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// until it finds the trivial condition candidate (condition that is not a
// constant). Since unswitching generates branches with constant conditions,
// this scenario could be very common in practice.
- SmallSet<BasicBlock*, 8> Visited;
+ SmallPtrSet<BasicBlock*, 8> Visited;
while (true) {
// If we exit loop or reach a previous visited block, then
@@ -1196,13 +1201,15 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
Loop *L, TerminatorInst *TI) {
Function *F = loopHeader->getParent();
- DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function " << F->getName()
- << " when '" << *Val << "' == " << *LIC << "\n");
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName() << " when '"
+ << *Val << "' == " << *LIC << "\n");
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
- SEWP->getSE().forgetLoop(L);
+ SEWP->getSE().forgetTopmostLoop(L);
LoopBlocks.clear();
NewBlocks.clear();
@@ -1355,7 +1362,7 @@ static void RemoveFromWorklist(Instruction *I,
static void ReplaceUsesOfWith(Instruction *I, Value *V,
std::vector<Instruction*> &Worklist,
Loop *L, LPPassManager *LPM) {
- DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1524,7 +1531,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// Simple DCE.
if (isInstructionTriviallyDead(I)) {
- DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1557,8 +1564,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
if (!SinglePred) continue; // Nothing to do.
assert(SinglePred == Pred && "CFG broken");
- DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
- << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+ << Succ->getName() << "\n");
// Resolve any single entry PHI nodes in Succ.
while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
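Both the trivial and non-trivial unswitching paths above now drop ScalarEvolution's cached results before rewriting the CFG. As I understand forgetTopmostLoop, it forgets the outermost loop containing L, since changing an inner loop's control flow can also invalidate trip counts cached for its parents. The idiom, as used in both places:

  // We are about to change the CFG; cached SCEV data for L and for any
  // enclosing loop may no longer be valid.
  if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
    SEWP->getSE().forgetTopmostLoop(L); // previously: forgetLoop(L)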
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 53b25e688e82..06e86081e8a0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -68,6 +68,7 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -85,6 +86,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <cassert>
@@ -111,7 +113,7 @@ static cl::opt<unsigned> LVLoopDepthThreshold(
"LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
cl::init(2), cl::Hidden);
-/// \brief Create MDNode for input string.
+/// Create MDNode for input string.
static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
LLVMContext &Context = TheLoop->getHeader()->getContext();
Metadata *MDs[] = {
@@ -120,7 +122,7 @@ static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
return MDNode::get(Context, MDs);
}
-/// \brief Set input string into loop metadata by keeping other values intact.
+/// Set input string into loop metadata by keeping other values intact.
void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
unsigned V) {
SmallVector<Metadata *, 4> MDs(1);
@@ -166,6 +168,7 @@ struct LoopVersioningLICM : public LoopPass {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
StringRef getPassName() const override { return "Loop Versioning for LICM"; }
@@ -178,6 +181,7 @@ struct LoopVersioningLICM : public LoopPass {
LoadAndStoreCounter = 0;
InvariantCounter = 0;
IsReadOnlyLoop = true;
+ ORE = nullptr;
CurAST.reset();
}
@@ -207,7 +211,7 @@ private:
Loop *CurLoop = nullptr;
// AliasSet information for the current loop.
- std::unique_ptr<AliasSetTracker> CurAST;
+ std::unique_ptr<AliasSetTracker> CurAST;
// Maximum loop nest threshold
unsigned LoopDepthThreshold;
@@ -224,6 +228,9 @@ private:
// Read only loop marker.
bool IsReadOnlyLoop = true;
+ // OptimizationRemarkEmitter
+ OptimizationRemarkEmitter *ORE;
+
bool isLegalForVersioning();
bool legalLoopStructure();
bool legalLoopInstructions();
@@ -235,58 +242,57 @@ private:
} // end anonymous namespace
-/// \brief Check loop structure and confirms it's good for LoopVersioningLICM.
+/// Check loop structure and confirms it's good for LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopStructure() {
// Loop must be in loop simplify form.
if (!CurLoop->isLoopSimplifyForm()) {
- DEBUG(
- dbgs() << " loop is not in loop-simplify form.\n");
+ LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n");
return false;
}
// Loop should be innermost loop, if not return false.
if (!CurLoop->getSubLoops().empty()) {
- DEBUG(dbgs() << " loop is not innermost\n");
+ LLVM_DEBUG(dbgs() << " loop is not innermost\n");
return false;
}
// Loop should have a single backedge, if not return false.
if (CurLoop->getNumBackEdges() != 1) {
- DEBUG(dbgs() << " loop has multiple backedges\n");
+ LLVM_DEBUG(dbgs() << " loop has multiple backedges\n");
return false;
}
// Loop must have a single exiting block, if not return false.
if (!CurLoop->getExitingBlock()) {
- DEBUG(dbgs() << " loop has multiple exiting block\n");
+ LLVM_DEBUG(dbgs() << " loop has multiple exiting blocks\n");
return false;
}
// We only handle bottom-tested loops, i.e. loops in which the condition is
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
- DEBUG(dbgs() << " loop is not bottom tested\n");
+ LLVM_DEBUG(dbgs() << " loop is not bottom tested\n");
return false;
}
// Parallel loops must not have aliasing loop-invariant memory accesses.
// Hence we don't need to version anything in this case.
if (CurLoop->isAnnotatedParallel()) {
- DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
return false;
}
// Loop depth greater than LoopDepthThreshold is not allowed
if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
- DEBUG(dbgs() << " loop depth is more then threshold\n");
+ LLVM_DEBUG(dbgs() << " loop depth is more than threshold\n");
return false;
}
// We need to be able to compute the loop trip count in order
// to generate the bound checks.
const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
if (ExitCount == SE->getCouldNotCompute()) {
- DEBUG(dbgs() << " loop does not has trip count\n");
+ LLVM_DEBUG(dbgs() << " loop does not have a trip count\n");
return false;
}
return true;
}
-/// \brief Check memory accesses in loop and confirms it's good for
+/// Check memory accesses in loop and confirms it's good for
/// LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopMemoryAccesses() {
bool HasMayAlias = false;
@@ -328,24 +334,24 @@ bool LoopVersioningLICM::legalLoopMemoryAccesses() {
}
// Ensure all accesses are of the same type.
if (!TypeSafety) {
- DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n");
return false;
}
// Ensure the loop body isn't read-only.
if (!HasMod) {
- DEBUG(dbgs() << " No memory modified in loop body\n");
+ LLVM_DEBUG(dbgs() << " No memory modified in loop body\n");
return false;
}
// Make sure alias set has may alias case.
// If there is no alias memory ambiguity, return false.
if (!HasMayAlias) {
- DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n");
return false;
}
return true;
}
-/// \brief Check loop instructions safe for Loop versioning.
+/// Check loop instructions safe for Loop versioning.
/// It returns true if it's safe else returns false.
/// Consider following:
/// 1) Check all load store in loop body are non atomic & non volatile.
@@ -355,12 +361,12 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
assert(I != nullptr && "Null instruction found!");
// Check function call safety
if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) {
- DEBUG(dbgs() << " Unsafe call site found.\n");
+ LLVM_DEBUG(dbgs() << " Unsafe call site found.\n");
return false;
}
// Avoid loops with possibility of throw
if (I->mayThrow()) {
- DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n");
return false;
}
// If the current instruction is a load instruction
@@ -368,7 +374,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
if (I->mayReadFromMemory()) {
LoadInst *Ld = dyn_cast<LoadInst>(I);
if (!Ld || !Ld->isSimple()) {
- DEBUG(dbgs() << " Found a non-simple load.\n");
+ LLVM_DEBUG(dbgs() << " Found a non-simple load.\n");
return false;
}
LoadAndStoreCounter++;
@@ -382,7 +388,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
else if (I->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(I);
if (!St || !St->isSimple()) {
- DEBUG(dbgs() << " Found a non-simple store.\n");
+ LLVM_DEBUG(dbgs() << " Found a non-simple store.\n");
return false;
}
LoadAndStoreCounter++;
@@ -396,59 +402,87 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
return true;
}
-/// \brief Check loop instructions and confirms it's good for
+/// Check loop instructions and confirms it's good for
/// LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopInstructions() {
// Resetting counters.
LoadAndStoreCounter = 0;
InvariantCounter = 0;
IsReadOnlyLoop = true;
+ using namespace ore;
// Iterate over loop blocks and instructions of each block and check
// instruction safety.
for (auto *Block : CurLoop->getBlocks())
for (auto &Inst : *Block) {
// If instruction is unsafe just return false.
- if (!instructionSafeForVersioning(&Inst))
+ if (!instructionSafeForVersioning(&Inst)) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst)
+ << " Unsafe Loop Instruction";
+ });
return false;
+ }
}
// Get LoopAccessInfo from current loop.
LAI = &LAA->getInfo(CurLoop);
// Check LoopAccessInfo for need of runtime check.
if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
- DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
return false;
}
// Number of runtime-checks should be less than RuntimeMemoryCheckThreshold
if (LAI->getNumRuntimePointerChecks() >
VectorizerParams::RuntimeMemoryCheckThreshold) {
- DEBUG(dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ LLVM_DEBUG(
+ dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Number of runtime checks "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks())
+ << " exceeds threshold "
+ << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold);
+ });
return false;
}
// Loop should have at least one invariant load or store instruction.
if (!InvariantCounter) {
- DEBUG(dbgs() << " Invariant not found !!\n");
+ LLVM_DEBUG(dbgs() << " Invariant not found !!\n");
return false;
}
// Read only loop not allowed.
if (IsReadOnlyLoop) {
- DEBUG(dbgs() << " Found a read-only loop!\n");
+ LLVM_DEBUG(dbgs() << " Found a read-only loop!\n");
return false;
}
// Profitability check:
// Check invariant threshold, should be in limit.
if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
- DEBUG(dbgs()
- << " Invariant load & store are less then defined threshold\n");
- DEBUG(dbgs() << " Invariant loads & stores: "
- << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n");
- DEBUG(dbgs() << " Invariant loads & store threshold: "
- << InvariantThreshold << "%\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Invariant load & store are less than defined threshold\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter)
+ << "%\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Invariant load & store "
+ << NV("LoadAndStoreCounter",
+ ((InvariantCounter * 100) / LoadAndStoreCounter))
+ << " are less than defined threshold "
+ << NV("Threshold", InvariantThreshold);
+ });
return false;
}
return true;
}
-/// \brief It checks loop is already visited or not.
+/// It checks loop is already visited or not.
/// check loop meta data, if loop revisited return true
/// else false.
bool LoopVersioningLICM::isLoopAlreadyVisited() {
@@ -459,42 +493,64 @@ bool LoopVersioningLICM::isLoopAlreadyVisited() {
return false;
}
-/// \brief Checks legality for LoopVersioningLICM by considering following:
+/// Checks legality for LoopVersioningLICM by considering following:
/// a) loop structure legality b) loop instruction legality
/// c) loop memory access legality.
/// Return true if legal else returns false.
bool LoopVersioningLICM::isLegalForVersioning() {
- DEBUG(dbgs() << "Loop: " << *CurLoop);
+ using namespace ore;
+ LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop);
// Make sure not re-visiting same loop again.
if (isLoopAlreadyVisited()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
return false;
}
// Check loop structure legality.
if (!legalLoopStructure()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop structure";
+ });
return false;
}
// Check loop instruction legality.
if (!legalLoopInstructions()) {
- DEBUG(dbgs()
- << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
return false;
}
// Check loop memory access legality.
if (!legalLoopMemoryAccesses()) {
- DEBUG(dbgs()
- << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop memory access";
+ });
return false;
}
// Loop versioning is feasible, return true.
- DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning",
+ CurLoop->getStartLoc(), CurLoop->getHeader())
+ << " Versioned loop for LICM."
+ << " Number of runtime checks we had to insert "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks());
+ });
return true;
}
-/// \brief Update loop with aggressive aliasing assumptions.
+/// Update loop with aggressive aliasing assumptions.
/// It marks no-alias to any pairs of memory operations by assuming
/// loop should not have any must-alias memory accesses pairs.
/// During LoopVersioningLICM legality we ignore loops having must
@@ -542,6 +598,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
LAI = nullptr;
// Set Current Loop
CurLoop = L;
@@ -592,6 +649,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
"Loop Versioning For LICM", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 46f8a3564265..68bfa0030395 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -357,7 +357,7 @@ PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
}
namespace {
-/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+/// Legacy pass for lowering expect intrinsics out of the IR.
///
/// When this pass is run over a function it uses expect intrinsics which feed
/// branches and switches to provide branch weight metadata for those
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9c870b42a747..3b74421a47a0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -25,6 +25,7 @@
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -55,7 +56,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -263,7 +263,7 @@ public:
void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+ addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
}
void addRange(int64_t Start, int64_t Size, Value *Ptr,
@@ -479,10 +479,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
AMemSet =
Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
- DEBUG(dbgs() << "Replace stores:\n";
- for (Instruction *SI : Range.TheStores)
- dbgs() << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
+ LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
+ : Range.TheStores) dbgs()
+ << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
@@ -498,16 +498,25 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
return AMemSet;
}
-static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
- const LoadInst *LI) {
+static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) {
unsigned StoreAlign = SI->getAlignment();
if (!StoreAlign)
StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+ return StoreAlign;
+}
+
+static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) {
unsigned LoadAlign = LI->getAlignment();
if (!LoadAlign)
LoadAlign = DL.getABITypeAlignment(LI->getType());
+ return LoadAlign;
+}
- return std::min(StoreAlign, LoadAlign);
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+ const LoadInst *LI) {
+ unsigned StoreAlign = findStoreAlignment(DL, SI);
+ unsigned LoadAlign = findLoadAlignment(DL, LI);
+ return MinAlign(StoreAlign, LoadAlign);
}
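Switching from std::min to MinAlign changes nothing for the usual power-of-two alignments, but MinAlign (from llvm/Support/MathExtras.h, as I understand it) returns the largest power of two dividing both operands, which is the safer notion of a shared alignment if a non-power-of-two value ever slips in. Assumed behaviour:

  // MinAlign(A, B) == lowest set bit of (A | B)
  //   MinAlign(16, 4) == 4
  //   MinAlign(8, 8)  == 8
  //   MinAlign(12, 8) == 4   // still a power of two dividing both inputs
  unsigned Common = MinAlign(findStoreAlignment(DL, SI), findLoadAlignment(DL, LI));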
// This method try to lift a store instruction before position P.
@@ -522,7 +531,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
return false;
// Keep track of the arguments of all instructions we plan to lift
- // so we can make sure to lift them as well if apropriate.
+ // so we can make sure to lift them as well if appropriate.
DenseSet<Instruction*> Args;
if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
if (Ptr->getParent() == SI->getParent())
@@ -594,7 +603,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
// We made it, we need to lift
for (auto *I : llvm::reverse(ToLift)) {
- DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
}
@@ -656,22 +665,23 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
UseMemMove = true;
- unsigned Align = findCommonAlignment(DL, SI, LI);
uint64_t Size = DL.getTypeStoreSize(T);
IRBuilder<> Builder(P);
Instruction *M;
if (UseMemMove)
- M = Builder.CreateMemMove(SI->getPointerOperand(),
- LI->getPointerOperand(), Size,
- Align, SI->isVolatile());
+ M = Builder.CreateMemMove(
+ SI->getPointerOperand(), findStoreAlignment(DL, SI),
+ LI->getPointerOperand(), findLoadAlignment(DL, LI), Size,
+ SI->isVolatile());
else
- M = Builder.CreateMemCpy(SI->getPointerOperand(),
- LI->getPointerOperand(), Size,
- Align, SI->isVolatile());
+ M = Builder.CreateMemCpy(
+ SI->getPointerOperand(), findStoreAlignment(DL, SI),
+ LI->getPointerOperand(), findLoadAlignment(DL, LI), Size,
+ SI->isVolatile());
- DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
- << " => " << *M << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
+ << *M << "\n");
MD->removeInstruction(SI);
SI->eraseFromParent();
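These hunks assume the updated IRBuilder interface in which CreateMemCpy and CreateMemMove take the destination and source alignments as separate arguments instead of one pre-merged minimum, so each side keeps the best alignment that can be proven for it. Roughly, with M, findStoreAlignment and findLoadAlignment as introduced above:

  M = Builder.CreateMemCpy(SI->getPointerOperand(), findStoreAlignment(DL, SI),
                           LI->getPointerOperand(), findLoadAlignment(DL, LI),
                           Size, SI->isVolatile());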
@@ -760,7 +770,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
Size, Align, SI->isVolatile());
- DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
MD->removeInstruction(SI);
SI->eraseFromParent();
@@ -1047,20 +1057,17 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// If all checks passed, then we can transform M.
- // Make sure to use the lesser of the alignment of the source and the dest
- // since we're changing where we're reading from, but don't want to increase
- // the alignment past what can be read from or written to.
// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.
- unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
-
IRBuilder<> Builder(M);
if (UseMemMove)
- Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
- Align, M->isVolatile());
+ Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(),
+ MDep->getRawSource(), MDep->getSourceAlignment(),
+ M->getLength(), M->isVolatile());
else
- Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
- Align, M->isVolatile());
+ Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(),
+ MDep->getRawSource(), MDep->getSourceAlignment(),
+ M->getLength(), M->isVolatile());
// Remove the instruction we're replacing.
MD->removeInstruction(M);
@@ -1106,7 +1113,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
// If Dest is aligned, and SrcSize is constant, use the minimum alignment
// of the sum.
const unsigned DestAlign =
- std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+ std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
if (DestAlign > 1)
if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
@@ -1166,7 +1173,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
IRBuilder<> Builder(MemCpy);
Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
- CopySize, MemCpy->getAlignment());
+ CopySize, MemCpy->getDestAlignment());
return true;
}
@@ -1192,7 +1199,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
IRBuilder<> Builder(M);
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
- M->getAlignment(), false);
+ M->getDestAlignment(), false);
MD->removeInstruction(M);
M->eraseFromParent();
++NumCpyToSet;
@@ -1221,8 +1228,11 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
// d) memcpy from a just-memset'd source can be turned into memset.
if (DepInfo.isClobber()) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+ // FIXME: Can we pass in either of dest/src alignment here instead
+ // of conservatively taking the minimum?
+ unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment());
if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), M->getAlignment(),
+ CopySize->getZExtValue(), Align,
C)) {
MD->removeInstruction(M);
M->eraseFromParent();
@@ -1284,8 +1294,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
MemoryLocation::getForSource(M)))
return false;
- DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
- << "\n");
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
// If not, then we know we can transform this.
Type *ArgTys[3] = { M->getRawDest()->getType(),
@@ -1337,7 +1347,7 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
// source of the memcpy to the alignment we need. If we fail, we bail out.
AssumptionCache &AC = LookupAssumptionCache();
DominatorTree &DT = LookupDomTree();
- if (MDep->getAlignment() < ByValAlign &&
+ if (MDep->getSourceAlignment() < ByValAlign &&
getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
@@ -1367,9 +1377,9 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
"tmpcast", CS.getInstruction());
- DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
- << " " << *MDep << "\n"
- << " " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << *CS.getInstruction() << "\n");
// Otherwise we're good! Update the byval argument.
CS.setArgument(ArgNo, TmpCast);
@@ -1381,10 +1391,19 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
bool MemCpyOptPass::iterateOnFunction(Function &F) {
bool MadeChange = false;
+ DominatorTree &DT = LookupDomTree();
+
// Walk all instructions in the function.
for (BasicBlock &BB : F) {
+ // Skip unreachable blocks. For example processStore assumes that an
+ // instruction in a BB can't be dominated by a later instruction in the
+ // same BB (which is a scenario that can happen for an unreachable BB that
+ // has itself as a predecessor).
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+
for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- // Avoid invalidating the iterator.
+ // Avoid invalidating the iterator.
Instruction *I = &*BI++;
bool RepeatInstruction = false;
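The new reachability guard exists because dominance is not meaningful inside blocks that cannot be reached from the entry block; such a block can even be its own predecessor, making a later instruction appear to dominate an earlier one, which processStore does not expect. A sketch of the guard in context, using the pass's own helpers:

  DominatorTree &DT = LookupDomTree();
  for (BasicBlock &BB : F) {
    // Ignore unreachable (possibly self-looping) blocks entirely.
    if (!DT.isReachableFromEntry(&BB))
      continue;
    // ... scan the instructions of reachable blocks as before ...
  }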
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index 9869a3fb96fa..ff0183a8ea2d 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -71,30 +71,30 @@ struct BCEAtom {
};
// If this value is a load from a constant offset w.r.t. a base address, and
-// there are no othe rusers of the load or address, returns the base address and
+// there are no other users of the load or address, returns the base address and
// the offset.
BCEAtom visitICmpLoadOperand(Value *const Val) {
BCEAtom Result;
if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
- DEBUG(dbgs() << "load\n");
+ LLVM_DEBUG(dbgs() << "load\n");
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- DEBUG(dbgs() << "used outside of block\n");
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
return {};
}
if (LoadI->isVolatile()) {
- DEBUG(dbgs() << "volatile\n");
+ LLVM_DEBUG(dbgs() << "volatile\n");
return {};
}
Value *const Addr = LoadI->getOperand(0);
if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
- DEBUG(dbgs() << "GEP\n");
+ LLVM_DEBUG(dbgs() << "GEP\n");
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- DEBUG(dbgs() << "used outside of block\n");
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
return {};
}
const auto &DL = GEP->getModule()->getDataLayout();
if (!isDereferenceablePointer(GEP, DL)) {
- DEBUG(dbgs() << "not dereferenceable\n");
+ LLVM_DEBUG(dbgs() << "not dereferenceable\n");
// We need to make sure that we can do comparison in any order, so we
// require memory to be unconditionally dereferenceable.
return {};
@@ -110,6 +110,10 @@ BCEAtom visitICmpLoadOperand(Value *const Val) {
}
// A basic block with a comparison between two BCE atoms.
+// The block might do extra work besides the atom comparison, in which case
+// doesOtherWork() returns true. Under some conditions, the block can be
+// split into the atom comparison part and the "other work" part
+// (see canSplit()).
// Note: the terminology is misleading: the comparison is symmetric, so there
// is no real {l/r}hs. What we want though is to have the same base on the
// left (resp. right), so that we can detect consecutive loads. To ensure this
@@ -127,7 +131,7 @@ class BCECmpBlock {
return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
}
- // Assert the the block is consistent: If valid, it should also have
+ // Assert the block is consistent: If valid, it should also have
// non-null members besides Lhs_ and Rhs_.
void AssertConsistent() const {
if (IsValid()) {
@@ -144,37 +148,95 @@ class BCECmpBlock {
// Returns true if the block does other work besides the comparison.
bool doesOtherWork() const;
+ // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
+ // instructions in the block.
+ bool canSplit() const;
+
+ // Return true if all the relevant instructions in the BCE-cmp-block can
+ // be sunk below this instruction. By doing this, we know we can separate the
+ // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
+ // block.
+ bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const;
+
+ // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
+ // instructions. Split the old block and move all non-BCE-cmp-insts into the
+ // new parent block.
+ void split(BasicBlock *NewParent) const;
+
// The basic block where this comparison happens.
BasicBlock *BB = nullptr;
// The ICMP for this comparison.
ICmpInst *CmpI = nullptr;
// The terminating branch.
BranchInst *BranchI = nullptr;
+ // The block requires splitting.
+ bool RequireSplit = false;
- private:
+private:
BCEAtom Lhs_;
BCEAtom Rhs_;
int SizeBits_ = 0;
};
+bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
+ DenseSet<Instruction *> &BlockInsts) const {
+ // If this instruction has side effects and it's in the middle of the BCE cmp block
+ // instructions, then bail for now.
+ // TODO: use alias analysis to tell whether there is real interference.
+ if (Inst->mayHaveSideEffects())
+ return false;
+ // Make sure this instruction does not use any of the BCE cmp block
+ // instructions as operand.
+ for (auto BI : BlockInsts) {
+ if (is_contained(Inst->operands(), BI))
+ return false;
+ }
+ return true;
+}
+
+void BCECmpBlock::split(BasicBlock *NewParent) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ llvm::SmallVector<Instruction *, 4> OtherInsts;
+ for (Instruction &Inst : *BB) {
+ if (BlockInsts.count(&Inst))
+ continue;
+ assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block");
+ // This is a non-BCE-cmp-block instruction. And it can be separated
+ // from the BCE-cmp-block instruction.
+ OtherInsts.push_back(&Inst);
+ }
+
+ // Do the actual splitting.
+ for (Instruction *Inst : reverse(OtherInsts)) {
+ Inst->moveBefore(&*NewParent->begin());
+ }
+}
+
+bool BCECmpBlock::canSplit() const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ for (Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst)) {
+ if (!canSinkBCECmpInst(&Inst, BlockInsts))
+ return false;
+ }
+ }
+ return true;
+}
+
bool BCECmpBlock::doesOtherWork() const {
AssertConsistent();
+ // All the instructions we care about in the BCE cmp block.
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
// TODO(courbet): Can we allow some other things ? This is very conservative.
- // We might be able to get away with anything does does not have any side
+ // We might be able to get away with anything that does not have any side
// effects outside of the basic block.
// Note: The GEPs and/or loads are not necessarily in the same block.
for (const Instruction &Inst : *BB) {
- if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
- if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true;
- } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
- if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true;
- } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
- if (C != CmpI) return true;
- } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
- if (Br != BranchI) return true;
- } else {
+ if (!BlockInsts.count(&Inst))
return true;
- }
}
return false;
}
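doesOtherWork, canSplit and split now all work from the same whitelist of "relevant" instructions: the two GEPs, the two loads, the compare and the branch. Anything else counts as extra work, and for the first block of a chain that extra work can be hoisted wholesale into the new entry block before the chain is merged. A condensed sketch of the hoisting step under those assumptions:

  DenseSet<Instruction *> BlockInsts(
      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
  llvm::SmallVector<Instruction *, 4> OtherInsts;
  for (Instruction &Inst : *BB)
    if (!BlockInsts.count(&Inst))          // not part of the BCE comparison itself
      OtherInsts.push_back(&Inst);
  for (Instruction *Inst : reverse(OtherInsts))
    Inst->moveBefore(&*NewParent->begin()); // hoist, preserving relative order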
@@ -183,10 +245,19 @@ bool BCECmpBlock::doesOtherWork() const {
// BCE atoms, returns the comparison.
BCECmpBlock visitICmp(const ICmpInst *const CmpI,
const ICmpInst::Predicate ExpectedPredicate) {
+ // The comparison can only be used once:
+ // - For intermediate blocks, as a branch condition.
+ // - For the final block, as an incoming value for the Phi.
+ // If there are any other uses of the comparison, we cannot merge it with
+ // other comparisons as we would create an orphan use of the value.
+ if (!CmpI->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << "cmp has several uses\n");
+ return {};
+ }
if (CmpI->getPredicate() == ExpectedPredicate) {
- DEBUG(dbgs() << "cmp "
- << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
- << "\n");
+ LLVM_DEBUG(dbgs() << "cmp "
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+ << "\n");
auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
if (!Lhs.Base()) return {};
auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
@@ -204,7 +275,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
if (Block->empty()) return {};
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
if (!BranchI) return {};
- DEBUG(dbgs() << "branch\n");
+ LLVM_DEBUG(dbgs() << "branch\n");
if (BranchI->isUnconditional()) {
// In this case, we expect an incoming value which is the result of the
// comparison. This is the last link in the chain of comparisons (note
@@ -212,7 +283,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
// can be reordered).
auto *const CmpI = dyn_cast<ICmpInst>(Val);
if (!CmpI) return {};
- DEBUG(dbgs() << "icmp\n");
+ LLVM_DEBUG(dbgs() << "icmp\n");
auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
Result.CmpI = CmpI;
Result.BranchI = BranchI;
@@ -221,12 +292,12 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
// In this case, we expect a constant incoming value (the comparison is
// chained).
const auto *const Const = dyn_cast<ConstantInt>(Val);
- DEBUG(dbgs() << "const\n");
+ LLVM_DEBUG(dbgs() << "const\n");
if (!Const->isZero()) return {};
- DEBUG(dbgs() << "false\n");
+ LLVM_DEBUG(dbgs() << "false\n");
auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
if (!CmpI) return {};
- DEBUG(dbgs() << "icmp\n");
+ LLVM_DEBUG(dbgs() << "icmp\n");
assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
auto Result = visitICmp(
@@ -238,6 +309,18 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
return {};
}
+static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
+ BCECmpBlock &Comparison) {
+ LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
+ << "': Found cmp of " << Comparison.SizeBits()
+ << " bits between " << Comparison.Lhs().Base() << " + "
+ << Comparison.Lhs().Offset << " and "
+ << Comparison.Rhs().Base() << " + "
+ << Comparison.Rhs().Offset << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
+ Comparisons.push_back(Comparison);
+}
+
// A chain of comparisons.
class BCECmpChain {
public:
@@ -263,9 +346,9 @@ class BCECmpChain {
// Merges the given comparison blocks into one memcmp block and update
// branches. Comparisons are assumed to be contiguous. If NextBBInChain is
// null, the merged block will link to the phi block.
- static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
- BasicBlock *const NextBBInChain, PHINode &Phi,
- const TargetLibraryInfo *const TLI);
+ void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain, PHINode &Phi,
+ const TargetLibraryInfo *const TLI);
PHINode &Phi_;
std::vector<BCECmpBlock> Comparisons_;
@@ -275,24 +358,47 @@ class BCECmpChain {
BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
: Phi_(Phi) {
+ assert(!Blocks.empty() && "a chain should have at least one block");
// Now look inside blocks to check for BCE comparisons.
std::vector<BCECmpBlock> Comparisons;
- for (BasicBlock *Block : Blocks) {
+ for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
+ BasicBlock *const Block = Blocks[BlockIdx];
+ assert(Block && "invalid block");
BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
Block, Phi.getParent());
Comparison.BB = Block;
if (!Comparison.IsValid()) {
- DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+ LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
return;
}
if (Comparison.doesOtherWork()) {
- DEBUG(dbgs() << "block does extra work besides compare\n");
- if (Comparisons.empty()) { // First block.
- // TODO(courbet): The first block can do other things, and we should
- // split them apart in a separate block before the comparison chain.
- // Right now we just discard it and make the chain shorter.
- DEBUG(dbgs()
- << "ignoring first block that does extra work besides compare\n");
+ LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
+ << "' does extra work besides compare\n");
+ if (Comparisons.empty()) {
+ // This is the initial block in the chain, in case this block does other
+ // work, we can try to split the block and move the irrelevant
+ // instructions to the predecessor.
+ //
+ // If this is not the initial block in the chain, splitting it won't
+ // work.
+ //
+ // As once split, there will still be instructions before the BCE cmp
+ // instructions that do other work in program order, i.e. within the
+ // chain before sorting. Unless we can abort the chain at this point
+ // and start anew.
+ //
+ // NOTE: we only handle block with single predecessor for now.
+ if (Comparison.canSplit()) {
+ LLVM_DEBUG(dbgs()
+ << "Split initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ Comparison.RequireSplit = true;
+ enqueueBlock(Comparisons, Comparison);
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "ignoring initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ }
continue;
}
// TODO(courbet): Right now we abort the whole chain. We could be
@@ -320,13 +426,13 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
// We could still merge bb1 and bb2 though.
return;
}
- DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
- << " bits between " << Comparison.Lhs().Base() << " + "
- << Comparison.Lhs().Offset << " and "
- << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
- << "\n");
- DEBUG(dbgs() << "\n");
- Comparisons.push_back(Comparison);
+ enqueueBlock(Comparisons, Comparison);
+ }
+
+ // It is possible we have no suitable comparison to merge.
+ if (Comparisons.empty()) {
+ LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
+ return;
}
EntryBlock_ = Comparisons[0].BB;
Comparisons_ = std::move(Comparisons);
@@ -336,10 +442,10 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
#endif // MERGEICMPS_DOT_ON
// Reorder blocks by LHS. We can do that without changing the
// semantics because we are only accessing dereferenceable memory.
- std::sort(Comparisons_.begin(), Comparisons_.end(),
- [](const BCECmpBlock &a, const BCECmpBlock &b) {
- return a.Lhs() < b.Lhs();
- });
+ llvm::sort(Comparisons_.begin(), Comparisons_.end(),
+ [](const BCECmpBlock &a, const BCECmpBlock &b) {
+ return a.Lhs() < b.Lhs();
+ });
#ifdef MERGEICMPS_DOT_ON
errs() << "AFTER REORDERING:\n\n";
dump();
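The switch from std::sort to llvm::sort is behaviour-preserving here; as I understand it, llvm::sort is LLVM's wrapper that, in expensive-checks builds, shuffles the range before sorting, so results that accidentally depend on the initial order of equivalent elements (or on a comparator that is not a strict weak ordering) show up as nondeterminism in tests. Usage is otherwise identical:

  llvm::sort(Comparisons_.begin(), Comparisons_.end(),
             [](const BCECmpBlock &a, const BCECmpBlock &b) {
               return a.Lhs() < b.Lhs();
             });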
@@ -389,10 +495,24 @@ bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
Phi_.removeIncomingValue(Comparison.BB, false);
}
+ // If entry block is part of the chain, we need to make the first block
+ // of the chain the new entry block of the function.
+ BasicBlock *Entry = &Comparisons_[0].BB->getParent()->getEntryBlock();
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (Entry == Comparisons_[I].BB) {
+ BasicBlock *NEntryBB = BasicBlock::Create(Entry->getContext(), "",
+ Entry->getParent(), Entry);
+ BranchInst::Create(Entry, NEntryBB);
+ break;
+ }
+ }
+
// Point the predecessors of the chain to the first comparison block (which is
- // the new entry point).
- if (EntryBlock_ != Comparisons_[0].BB)
+ // the new entry point) and update the entry block of the chain.
+ if (EntryBlock_ != Comparisons_[0].BB) {
EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+ EntryBlock_ = Comparisons_[0].BB;
+ }
// Effectively merge blocks.
int NumMerged = 1;
@@ -424,7 +544,15 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
LLVMContext &Context = BB->getContext();
if (Comparisons.size() >= 2) {
- DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
+ // If there is one block that requires splitting, we do it now, i.e.
+ // at the point where we know the chain will be collapsed. The split-off
+ // instructions can be executed before any of the instructions in the
+ // chain.
+ auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
+ [](const BCECmpBlock &B) { return B.RequireSplit; });
+ if (C != Comparisons.end())
+ C->split(EntryBlock_);
+
+ LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
const auto TotalSize =
std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
[](int Size, const BCECmpBlock &C) {
@@ -445,7 +573,8 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
IRBuilder<> Builder(BB);
const auto &DL = Phi.getModule()->getDataLayout();
Value *const MemCmpCall = emitMemCmp(
- FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
+ FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
+ ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
Builder, DL, TLI);
Value *const MemCmpIsZero = Builder.CreateICmpEQ(
MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
@@ -468,17 +597,17 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
} else {
assert(Comparisons.size() == 1);
// There are no blocks to merge, but we still need to update the branches.
- DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
if (NextBBInChain) {
if (FirstComparison.BranchI->isConditional()) {
- DEBUG(dbgs() << "conditional -> conditional\n");
+ LLVM_DEBUG(dbgs() << "conditional -> conditional\n");
// Just update the "true" target, the "false" target should already be
// the phi block.
assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
Phi.addIncoming(ConstantInt::getFalse(Context), BB);
} else {
- DEBUG(dbgs() << "unconditional -> conditional\n");
+ LLVM_DEBUG(dbgs() << "unconditional -> conditional\n");
// Replace the unconditional branch by a conditional one.
FirstComparison.BranchI->eraseFromParent();
IRBuilder<> Builder(BB);
@@ -488,14 +617,14 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
}
} else {
if (FirstComparison.BranchI->isConditional()) {
- DEBUG(dbgs() << "conditional -> unconditional\n");
+ LLVM_DEBUG(dbgs() << "conditional -> unconditional\n");
// Replace the conditional branch by an unconditional one.
FirstComparison.BranchI->eraseFromParent();
IRBuilder<> Builder(BB);
Builder.CreateBr(Phi.getParent());
Phi.addIncoming(FirstComparison.CmpI, BB);
} else {
- DEBUG(dbgs() << "unconditional -> unconditional\n");
+ LLVM_DEBUG(dbgs() << "unconditional -> unconditional\n");
Phi.addIncoming(FirstComparison.CmpI, BB);
}
}
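A minimal sketch of the "unconditional -> conditional" rewrite handled above (the standalone helper and its signature are assumptions, not part of this patch): drop the block's unconditional terminator, branch on the merged comparison, and feed false into the phi from this block.

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Sketch only: continue the chain on success, fall through to the phi
    // block (contributing 'false') on failure.
    static void rewriteToConditional(BasicBlock *BB, Value *CmpResult,
                                     BasicBlock *NextBBInChain, PHINode &Phi) {
      LLVMContext &Ctx = BB->getContext();
      BB->getTerminator()->eraseFromParent(); // remove the unconditional branch
      IRBuilder<> Builder(BB);
      Builder.CreateCondBr(CmpResult, NextBBInChain, Phi.getParent());
      Phi.addIncoming(ConstantInt::getFalse(Ctx), BB);
    }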
@@ -507,27 +636,28 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
int NumBlocks) {
// Walk up from the last block to find other blocks.
std::vector<BasicBlock *> Blocks(NumBlocks);
+ assert(LastBlock && "invalid last block");
BasicBlock *CurBlock = LastBlock;
for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
if (CurBlock->hasAddressTaken()) {
// Somebody is jumping to the block through an address, all bets are
// off.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has its address taken\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has its address taken\n");
return {};
}
Blocks[BlockIndex] = CurBlock;
auto *SinglePredecessor = CurBlock->getSinglePredecessor();
if (!SinglePredecessor) {
// The block has two or more predecessors.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has two or more predecessors\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has two or more predecessors\n");
return {};
}
if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
// The block does not link back to the phi.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " does not link back to the phi\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " does not link back to the phi\n");
return {};
}
CurBlock = SinglePredecessor;
@@ -537,9 +667,9 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
}
bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
- DEBUG(dbgs() << "processPhi()\n");
+ LLVM_DEBUG(dbgs() << "processPhi()\n");
if (Phi.getNumIncomingValues() <= 1) {
- DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+ LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
return false;
}
// We are looking for something that has the following structure:
@@ -552,7 +682,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
// - The last basic block (bb4 here) must branch unconditionally to bb_phi.
// It's the only block that contributes a non-constant value to the Phi.
// - All other blocks (b1, b2, b3) must have exactly two successors, one of
- // them being the the phi block.
+ // them being the phi block.
// - All intermediate blocks (bb2, bb3) must have only one predecessor.
// - Blocks cannot do other work besides the comparison, see doesOtherWork()
@@ -563,18 +693,31 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
if (LastBlock) {
// There are several non-constant values.
- DEBUG(dbgs() << "skip: several non-constant values\n");
+ LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
+ return false;
+ }
+ if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
+ cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
+ Phi.getIncomingBlock(I)) {
+ // Non-constant incoming value is not from a cmp instruction or not
+ // produced by the last block. We could end up processing the value
+ // producing block more than once.
+ //
+ // This is an uncommon case, so we bail.
+ LLVM_DEBUG(
+ dbgs()
+ << "skip: non-constant value not from cmp or not from last block.\n");
return false;
}
LastBlock = Phi.getIncomingBlock(I);
}
if (!LastBlock) {
// There is no non-constant block.
- DEBUG(dbgs() << "skip: no non-constant block\n");
+ LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
return false;
}
if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
- DEBUG(dbgs() << "skip: last block non-phi successor\n");
+ LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
return false;
}
@@ -584,7 +727,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
BCECmpChain CmpChain(Blocks, Phi);
if (CmpChain.size() < 2) {
- DEBUG(dbgs() << "skip: only one compare block\n");
+ LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
return false;
}
@@ -619,12 +762,16 @@ class MergeICmps : public FunctionPass {
PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI) {
- DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
// We only try merging comparisons if the target wants to expand memcmp later.
// The rationale is to avoid turning small chains into memcmp calls.
if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all();
+ // If we don't have memcmp available, we can't emit calls to it.
+ if (!TLI->has(LibFunc_memcmp))
+ return PreservedAnalyses::all();
+
bool MadeChange = false;
for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
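Much of the churn in these files is the mechanical DEBUG -> LLVM_DEBUG rename. A self-contained sketch of the idiom (the DEBUG_TYPE string and the helper are placeholders): the statement is compiled out in NDEBUG builds and otherwise prints only under -debug or -debug-only=<DEBUG_TYPE>.

    #define DEBUG_TYPE "mergeicmps"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Sketch only: debug output guarded by LLVM_DEBUG.
    static void reportSkippedBlock(int BlockIndex) {
      LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
                        << " has its address taken\n");
    }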
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index f2f615cb9b0f..3464b759280f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
//! \file
-//! \brief This pass performs merges of loads and stores on both sides of a
+//! This pass performs merges of loads and stores on both sides of a
// diamond (hammock). It hoists the loads and sinks the stores.
//
// The algorithm iteratively hoists two loads to the same address out of a
@@ -80,7 +80,6 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Debug.h"
@@ -97,7 +96,6 @@ namespace {
// MergedLoadStoreMotion Pass
//===----------------------------------------------------------------------===//
class MergedLoadStoreMotion {
- MemoryDependenceResults *MD = nullptr;
AliasAnalysis *AA = nullptr;
// The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
@@ -107,14 +105,9 @@ class MergedLoadStoreMotion {
const int MagicCompileTimeControl = 250;
public:
- bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA);
+ bool run(Function &F, AliasAnalysis &AA);
private:
- ///
- /// \brief Remove instruction from parent and update memory dependence
- /// analysis.
- ///
- void removeInstruction(Instruction *Inst);
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
// Routines for sinking stores
@@ -128,23 +121,7 @@ private:
} // end anonymous namespace
///
-/// \brief Remove instruction from parent and update memory dependence analysis.
-///
-void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
- // Notify the memory dependence analysis.
- if (MD) {
- MD->removeInstruction(Inst);
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- MD->invalidateCachedPointerInfo(LI->getPointerOperand());
- if (Inst->getType()->isPtrOrPtrVectorTy()) {
- MD->invalidateCachedPointerInfo(Inst);
- }
- }
- Inst->eraseFromParent();
-}
-
-///
-/// \brief Return tail block of a diamond.
+/// Return tail block of a diamond.
///
BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
@@ -152,7 +129,7 @@ BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
}
///
-/// \brief True when BB is the head of a diamond (hammock)
+/// True when BB is the head of a diamond (hammock)
///
bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
if (!BB)
@@ -179,7 +156,7 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
///
-/// \brief True when instruction is a sink barrier for a store
+/// True when instruction is a sink barrier for a store
/// located in Loc
///
/// Whenever an instruction could possibly read or modify the
@@ -197,13 +174,13 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
}
///
-/// \brief Check if \p BB contains a store to the same address as \p SI
+/// Check if \p BB contains a store to the same address as \p SI
///
/// \return The store in \p BB1 when it is safe to sink. Otherwise return null.
///
StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
StoreInst *Store0) {
- DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
BasicBlock *BB0 = Store0->getParent();
for (Instruction &Inst : reverse(*BB1)) {
auto *Store1 = dyn_cast<StoreInst>(&Inst);
@@ -222,7 +199,7 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
}
///
-/// \brief Create a PHI node in BB for the operands of S0 and S1
+/// Create a PHI node in BB for the operands of S0 and S1
///
PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
StoreInst *S1) {
@@ -236,13 +213,11 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
&BB->front());
NewPN->addIncoming(Opd1, S0->getParent());
NewPN->addIncoming(Opd2, S1->getParent());
- if (MD && NewPN->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(NewPN);
return NewPN;
}
///
-/// \brief Merge two stores to same address and sink into \p BB
+/// Merge two stores to same address and sink into \p BB
///
/// Also sinks GEP instruction computing the store address
///
@@ -254,9 +229,9 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
(A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
(A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
- DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
// Sink the instruction.
BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
// Intersect optional metadata.
@@ -275,19 +250,19 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
// New PHI operand? Use it.
if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
SNew->setOperand(0, NewPN);
- removeInstruction(S0);
- removeInstruction(S1);
+ S0->eraseFromParent();
+ S1->eraseFromParent();
A0->replaceAllUsesWith(ANew);
- removeInstruction(A0);
+ A0->eraseFromParent();
A1->replaceAllUsesWith(ANew);
- removeInstruction(A1);
+ A1->eraseFromParent();
return true;
}
return false;
}
///
-/// \brief True when two stores are equivalent and can sink into the footer
+/// True when two stores are equivalent and can sink into the footer
///
/// Starting from a diamond tail block, iterate over the instructions in one
/// predecessor block and try to match a store in the second predecessor.
@@ -310,7 +285,8 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
return false; // No. More than 2 predecessors.
// #Instructions in Succ1 for Compile Time Control
- int Size1 = Pred1->size();
+ auto InstsNoDbg = Pred1->instructionsWithoutDebug();
+ int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
int NStores = 0;
for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
@@ -338,19 +314,17 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
break;
RBI = Pred0->rbegin();
RBE = Pred0->rend();
- DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
}
}
return MergedStores;
}
-bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
- AliasAnalysis &AA) {
- this->MD = MD;
+bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
this->AA = &AA;
bool Changed = false;
- DEBUG(dbgs() << "Instruction Merger\n");
+ LLVM_DEBUG(dbgs() << "Instruction Merger\n");
// Merge unconditional branches, allowing PRE to catch more
// optimization opportunities.
@@ -376,15 +350,13 @@ public:
}
///
- /// \brief Run the transformation for each function
+ /// Run the transformation for each function
///
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
MergedLoadStoreMotion Impl;
- auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
- return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr,
- getAnalysis<AAResultsWrapperPass>().getAAResults());
+ return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
}
private:
@@ -392,7 +364,6 @@ private:
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
}
};
@@ -400,7 +371,7 @@ char MergedLoadStoreMotionLegacyPass::ID = 0;
} // anonymous namespace
///
-/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+/// createMergedLoadStoreMotionPass - The public interface to this file.
///
FunctionPass *llvm::createMergedLoadStoreMotionPass() {
return new MergedLoadStoreMotionLegacyPass();
@@ -408,7 +379,6 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() {
INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
"MergedLoadStoreMotion", false, false)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
"MergedLoadStoreMotion", false, false)
@@ -416,14 +386,12 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
PreservedAnalyses
MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
MergedLoadStoreMotion Impl;
- auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
- if (!Impl.run(F, MD, AA))
+ if (!Impl.run(F, AA))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
- PA.preserve<MemoryDependenceAnalysis>();
return PA;
}
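For readability of the MergedLoadStoreMotion hunks, a hypothetical source pattern the mldst-motion pass targets (names invented): both arms of a diamond store to the same address, so the store can be sunk into the tail with a phi selecting the stored value.

    // Hypothetical diamond (hammock): both branches store to *Out.
    void selectStore(bool Cond, int A, int B, int *Out) {
      if (Cond)
        *Out = A + 1;   // store in the 'then' block
      else
        *Out = B + 2;   // store in the 'else' block
      // After mldst-motion, a single store of phi(A + 1, B + 2) sits in the
      // diamond's tail block.
    }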
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index b026c8d692c3..7106ea216ad6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -83,6 +83,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -105,7 +106,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
@@ -240,10 +240,17 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
Changed = true;
SE->forgetValue(&*I);
I->replaceAllUsesWith(NewI);
- // If SeenExprs constains I's WeakTrackingVH, that entry will be
- // replaced with
- // nullptr.
+ WeakVH NewIExist = NewI;
+ // If SeenExprs/NewIExist contains I's WeakTrackingVH/WeakVH, that
+ // entry will be replaced with nullptr if deleted.
RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
+ if (!NewIExist) {
+ // Rare occasion where the new instruction (NewI) has been removed,
+ // probably because parts of the input code were dead from the
+ // beginning; reset the iterator and start over from the beginning.
+ I = BB->begin();
+ continue;
+ }
I = NewI->getIterator();
}
// Add the rewritten instruction to SeenExprs; the original instruction
@@ -429,6 +436,9 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ // There is no need to reassociate 0.
+ if (SE->getSCEV(I)->isZero())
+ return nullptr;
if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
return NewI;
if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
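The NaryReassociate hunk above guards against the replacement instruction itself being cleaned up. A minimal sketch of that guard (the helper and its signature are assumptions): hold a WeakVH on the replacement; if the recursive dead-code cleanup deletes it too, the handle goes null and the caller must not keep using iterators derived from it.

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/ValueHandle.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Sketch only: returns true when the replacement survived the cleanup.
    static bool replaceAndCleanUp(Instruction *Old, Instruction *New,
                                  const TargetLibraryInfo *TLI) {
      WeakVH NewExists = New;           // nulled out if New gets deleted
      Old->replaceAllUsesWith(New);
      RecursivelyDeleteTriviallyDeadInstructions(Old, TLI);
      return NewExists != nullptr;
    }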
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 9ebf2d769356..2eb887c986be 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -77,6 +77,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -105,7 +106,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
#include <algorithm>
@@ -221,13 +221,13 @@ private:
Components.resize(Components.size() + 1);
auto &Component = Components.back();
Component.insert(I);
- DEBUG(dbgs() << "Component root is " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n");
InComponent.insert(I);
ValueToComponent[I] = ComponentID;
// Pop a component off the stack and label it.
while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
auto *Member = Stack.back();
- DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n");
Component.insert(Member);
InComponent.insert(Member);
ValueToComponent[Member] = ComponentID;
@@ -366,9 +366,8 @@ public:
// True if this class has no memory members.
bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
- // Return true if two congruence classes are equivalent to each other. This
- // means
- // that every field but the ID number and the dead field are equivalent.
+ // Return true if two congruence classes are equivalent to each other. This
+ // means that every field but the ID number and the dead field are equivalent.
bool isEquivalentTo(const CongruenceClass *Other) const {
if (!Other)
return false;
@@ -383,10 +382,12 @@ public:
if (!DefiningExpr || !Other->DefiningExpr ||
*DefiningExpr != *Other->DefiningExpr)
return false;
- // We need some ordered set
- std::set<Value *> AMembers(Members.begin(), Members.end());
- std::set<Value *> BMembers(Members.begin(), Members.end());
- return AMembers == BMembers;
+
+ if (Members.size() != Other->Members.size())
+ return false;
+
+ return all_of(Members,
+ [&](const Value *V) { return Other->Members.count(V); });
}
private:
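The isEquivalentTo fix above replaces a comparison that built both ordered sets from the same Members list with a size check plus membership test. A standalone sketch of that shape (container and element types assumed):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Sketch only: two unordered member sets are equivalent iff they have
    // the same size and every member of one is contained in the other.
    static bool sameMembers(const SmallPtrSetImpl<const Value *> &A,
                            const SmallPtrSetImpl<const Value *> &B) {
      if (A.size() != B.size())
        return false;
      return all_of(A, [&](const Value *V) { return B.count(V); });
    }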
@@ -860,7 +861,7 @@ private:
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
- std::pair<int, int> StartingVNCounter;
+ int64_t StartingVNCounter;
};
} // end anonymous namespace
@@ -958,7 +959,8 @@ static bool isCopyOfAPHI(const Value *V) {
// order. The BlockInstRange numbers are generated in an RPO walk of the basic
// blocks.
void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
- std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) {
+ llvm::sort(Ops.begin(), Ops.end(),
+ [&](const ValPair &P1, const ValPair &P2) {
return BlockInstRange.lookup(P1.second).first <
BlockInstRange.lookup(P2.second).first;
});
@@ -1067,8 +1069,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " constant " << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
NumGVNOpsSimplified++;
assert(isa<BasicExpression>(E) &&
"We should always have had a basic expression here");
@@ -1076,8 +1078,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return createConstantExpression(C);
} else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " variable " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
deleteExpression(E);
return createVariableExpression(V);
}
@@ -1100,8 +1102,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
}
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *CC->getDefiningExpr() << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
NumGVNOpsSimplified++;
deleteExpression(E);
return CC->getDefiningExpr();
@@ -1257,7 +1259,7 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst,
// This must be an instruction because we are only called from phi nodes
// in the case that the value it needs to check against is an instruction.
- // The most likely candiates for dominance are the leader and the next leader.
+ // The most likely candidates for dominance are the leader and the next leader.
// The leader or nextleader will dominate in all cases where there is an
// equivalent that is higher up in the dom tree.
// We can't *only* check them, however, because the
@@ -1421,8 +1423,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (Offset >= 0) {
if (auto *C = dyn_cast<Constant>(
lookupOperandLeader(DepSI->getValueOperand()))) {
- DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
- << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI
+ << " to constant " << *C << "\n");
return createConstantExpression(
getConstantStoreValueForLoad(C, Offset, LoadType, DL));
}
@@ -1437,8 +1439,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
if (auto *PossibleConstant =
getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
- DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
- << *PossibleConstant << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
+ << " to constant " << *PossibleConstant << "\n");
return createConstantExpression(PossibleConstant);
}
}
@@ -1447,8 +1449,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (Offset >= 0) {
if (auto *PossibleConstant =
getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
- DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
- << " to constant " << *PossibleConstant << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
return createConstantExpression(PossibleConstant);
}
}
@@ -1529,7 +1531,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
if (!PI)
return nullptr;
- DEBUG(dbgs() << "Found predicate info from instruction !\n");
+ LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
auto *PWC = dyn_cast<PredicateWithCondition>(PI);
if (!PWC)
@@ -1569,7 +1571,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
return nullptr;
if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- DEBUG(dbgs() << "Copy is not of any condition operands!\n");
+ LLVM_DEBUG(dbgs() << "Copy is not of any condition operands!\n");
return nullptr;
}
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
@@ -1584,11 +1586,11 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
if (isa<PredicateAssume>(PI)) {
- // If the comparison is true when the operands are equal, then we know the
- // operands are equal, because assumes must always be true.
- if (CmpInst::isTrueWhenEqual(Predicate)) {
+ // If we assume the operands are equal, then they are equal.
+ if (Predicate == CmpInst::ICMP_EQ) {
addPredicateUsers(PI, I);
- addAdditionalUsers(Cmp->getOperand(0), I);
+ addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
+ I);
return createVariableOrConstant(FirstOp);
}
}
@@ -1622,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- // Instrinsics with the returned attribute are copies of arguments.
+ // Intrinsics with the returned attribute are copies of arguments.
if (auto *ReturnedValue = II->getReturnedArgOperand()) {
if (II->getIntrinsicID() == Intrinsic::ssa_copy)
if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
@@ -1652,10 +1654,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
CongruenceClass *NewClass) {
assert(NewClass &&
"Every MemoryAccess should be getting mapped to a non-null class");
- DEBUG(dbgs() << "Setting " << *From);
- DEBUG(dbgs() << " equivalent to congruence class ");
- DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
- DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+ LLVM_DEBUG(dbgs() << "Setting " << *From);
+ LLVM_DEBUG(dbgs() << " equivalent to congruence class ");
+ LLVM_DEBUG(dbgs() << NewClass->getID()
+ << " with current MemoryAccess leader ");
+ LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
@@ -1673,11 +1676,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
OldClass->setMemoryLeader(nullptr);
} else {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of a memory member " << *From
- << "\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
markMemoryLeaderChangeTouched(OldClass);
}
}
@@ -1705,7 +1708,7 @@ bool NewGVN::isCycleFree(const Instruction *I) const {
if (ICS == ICS_Unknown) {
SCCFinder.Start(I);
auto &SCC = SCCFinder.getComponentFor(I);
- // It's cycle free if it's size 1 or or the SCC is *only* phi nodes.
+ // It's cycle free if it's size 1 or the SCC is *only* phi nodes.
if (SCC.size() == 1)
InstCycleState.insert({I, ICS_CycleFree});
else {
@@ -1753,12 +1756,13 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
// If it has undef at this point, it means there are no non-undef arguments,
// and thus, the value of the phi node must be undef.
if (HasUndef) {
- DEBUG(dbgs() << "PHI Node " << *I
- << " has no non-undef arguments, valuing it as undef\n");
+ LLVM_DEBUG(
+ dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
return createConstantExpression(UndefValue::get(I->getType()));
}
- DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
deleteExpression(E);
return createDeadExpression();
}
@@ -1797,8 +1801,8 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
return E;
NumGVNPhisAllSame++;
- DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
- << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
deleteExpression(E);
return createVariableOrConstant(AllSameValue);
}
@@ -2091,7 +2095,7 @@ void NewGVN::markUsersTouched(Value *V) {
}
void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
- DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
MemoryToUsers[To].insert(U);
}
@@ -2207,13 +2211,13 @@ Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
//
// - I must be moving to NewClass from OldClass
// - The StoreCount of OldClass and NewClass is expected to have been updated
-// for I already if it is is a store.
+// for I already if it is a store.
// - The OldClass memory leader has not been updated yet if I was the leader.
void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
MemoryAccess *InstMA,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
- // If the leader is I, and we had a represenative MemoryAccess, it should
+ // If the leader is I, and we had a representative MemoryAccess, it should
// be the MemoryAccess of OldClass.
assert((!InstMA || !OldClass->getMemoryLeader() ||
OldClass->getLeader() != I ||
@@ -2227,8 +2231,9 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
(isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
NewClass->setMemoryLeader(InstMA);
// Mark it touched if we didn't just create a singleton
- DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
- << " due to new memory instruction becoming leader\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
markMemoryLeaderChangeTouched(NewClass);
}
setMemoryClass(InstMA, NewClass);
@@ -2236,10 +2241,10 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
if (OldClass->getMemoryLeader() == InstMA) {
if (!OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of old leader " << *InstMA << "\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
markMemoryLeaderChangeTouched(OldClass);
} else
OldClass->setMemoryLeader(nullptr);
@@ -2276,9 +2281,10 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
- DEBUG(dbgs() << "Changing leader of congruence class "
- << NewClass->getID() << " from " << *NewClass->getLeader()
- << " to " << *SI << " because store joined class\n");
+ LLVM_DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from "
+ << *NewClass->getLeader() << " to " << *SI
+ << " because store joined class\n");
// If we changed the leader, we have to mark it changed because we don't
// know what it will do to symbolic evaluation.
NewClass->setLeader(SI);
@@ -2298,8 +2304,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// See if we destroyed the class or need to swap leaders.
if (OldClass->empty() && OldClass != TOPClass) {
if (OldClass->getDefiningExpr()) {
- DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
- << " from table\n");
+ LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
+ << " from table\n");
// We erase it as an exact expression to make sure we don't just erase an
// equivalent one.
auto Iter = ExpressionToClass.find_as(
@@ -2316,8 +2322,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
- DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Value class leader change for class "
+ << OldClass->getID() << "\n");
++NumGVNLeaderChanges;
// Destroy the stored value if there are no more stores to represent it.
// Note that this is basically clean up for the expression removal that
@@ -2380,12 +2386,14 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
"VariableExpression should have been handled already");
EClass = NewClass;
- DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at " << NewClass->getID()
- << " and leader " << *(NewClass->getLeader()));
+ LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at "
+ << NewClass->getID() << " and leader "
+ << *(NewClass->getLeader()));
if (NewClass->getStoredValue())
- DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " and stored value "
+ << *(NewClass->getStoredValue()));
+ LLVM_DEBUG(dbgs() << "\n");
} else {
EClass = lookupResult.first->second;
if (isa<ConstantExpression>(E))
@@ -2403,8 +2411,8 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
- DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
- << "\n");
+ LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression "
+ << *E << "\n");
if (ClassChanged) {
moveValueToNewCongruenceClass(I, E, IClass, EClass);
markPhiOfOpsChanged(E);
@@ -2442,13 +2450,15 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
if (ReachableEdges.insert({From, To}).second) {
// If this block wasn't reachable before, all instructions are touched.
if (ReachableBlocks.insert(To).second) {
- DEBUG(dbgs() << "Block " << getBlockName(To) << " marked reachable\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " marked reachable\n");
const auto &InstRange = BlockInstRange.lookup(To);
TouchedInstructions.set(InstRange.first, InstRange.second);
} else {
- DEBUG(dbgs() << "Block " << getBlockName(To)
- << " was reachable, but new edge {" << getBlockName(From)
- << "," << getBlockName(To) << "} to it found\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {"
+ << getBlockName(From) << "," << getBlockName(To)
+ << "} to it found\n");
// We've made an edge reachable to an existing block, which may
// impact predicates. Otherwise, only mark the phi nodes as touched, as
@@ -2495,12 +2505,12 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
BasicBlock *FalseSucc = BR->getSuccessor(1);
if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
if (CI->isOne()) {
- DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to true\n");
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
updateReachableEdge(B, TrueSucc);
} else if (CI->isZero()) {
- DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to false\n");
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
updateReachableEdge(B, FalseSucc);
}
} else {
@@ -2685,8 +2695,8 @@ Value *NewGVN::findLeaderForInst(Instruction *TransInst,
auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
if (!FoundVal) {
ExpressionToPhiOfOps[E].insert(OrigInst);
- DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
- << " in block " << getBlockName(PredBB) << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
+ << " in block " << getBlockName(PredBB) << "\n");
return nullptr;
}
if (auto *SI = dyn_cast<StoreInst>(FoundVal))
@@ -2723,116 +2733,143 @@ NewGVN::makePossiblePHIOfOps(Instruction *I,
MemAccess->getDefiningAccess()->getBlock() == I->getParent())
return nullptr;
- SmallPtrSet<const Value *, 10> VisitedOps;
// Convert op of phis to phi of ops
- for (auto *Op : I->operand_values()) {
+ SmallPtrSet<const Value *, 10> VisitedOps;
+ SmallVector<Value *, 4> Ops(I->operand_values());
+ BasicBlock *SamePHIBlock = nullptr;
+ PHINode *OpPHI = nullptr;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ for (auto *Op : Ops) {
if (!isa<PHINode>(Op)) {
auto *ValuePHI = RealToTemp.lookup(Op);
if (!ValuePHI)
continue;
- DEBUG(dbgs() << "Found possible dependent phi of ops\n");
+ LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n");
Op = ValuePHI;
}
- auto *OpPHI = cast<PHINode>(Op);
+ OpPHI = cast<PHINode>(Op);
+ if (!SamePHIBlock) {
+ SamePHIBlock = getBlockForValue(OpPHI);
+ } else if (SamePHIBlock != getBlockForValue(OpPHI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "PHIs for operands are not all in the same block, aborting\n");
+ return nullptr;
+ }
// No point in doing this for one-operand phis.
- if (OpPHI->getNumOperands() == 1)
+ if (OpPHI->getNumOperands() == 1) {
+ OpPHI = nullptr;
continue;
- if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
- return nullptr;
- SmallVector<ValPair, 4> Ops;
- SmallPtrSet<Value *, 4> Deps;
- auto *PHIBlock = getBlockForValue(OpPHI);
- RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
- for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
- auto *PredBB = OpPHI->getIncomingBlock(PredNum);
- Value *FoundVal = nullptr;
- // We could just skip unreachable edges entirely but it's tricky to do
- // with rewriting existing phi nodes.
- if (ReachableEdges.count({PredBB, PHIBlock})) {
- // Clone the instruction, create an expression from it that is
- // translated back into the predecessor, and see if we have a leader.
- Instruction *ValueOp = I->clone();
- if (MemAccess)
- TempToMemory.insert({ValueOp, MemAccess});
- bool SafeForPHIOfOps = true;
- VisitedOps.clear();
- for (auto &Op : ValueOp->operands()) {
- auto *OrigOp = &*Op;
- // When these operand changes, it could change whether there is a
- // leader for us or not, so we have to add additional users.
- if (isa<PHINode>(Op)) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- if (Op != OrigOp && Op != I)
- Deps.insert(Op);
- } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
- if (getBlockForValue(ValuePHI) == PHIBlock)
- Op = ValuePHI->getIncomingValueForBlock(PredBB);
- }
- // If we phi-translated the op, it must be safe.
- SafeForPHIOfOps =
- SafeForPHIOfOps &&
- (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
+ }
+ }
+
+ if (!OpPHI)
+ return nullptr;
+
+ SmallVector<ValPair, 4> PHIOps;
+ SmallPtrSet<Value *, 4> Deps;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
+ for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
+ auto *PredBB = OpPHI->getIncomingBlock(PredNum);
+ Value *FoundVal = nullptr;
+ SmallPtrSet<Value *, 4> CurrentDeps;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it that is
+ // translated back into the predecessor, and see if we have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
+ for (auto &Op : ValueOp->operands()) {
+ auto *OrigOp = &*Op;
+ // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ CurrentDeps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
}
- // FIXME: For those things that are not safe we could generate
- // expressions all the way down, and see if this comes out to a
- // constant. For anything where that is true, and unsafe, we should
- // have made a phi-of-ops (or value numbered it equivalent to something)
- // for the pieces already.
- FoundVal = !SafeForPHIOfOps ? nullptr
- : findLeaderForInst(ValueOp, Visited,
- MemAccess, I, PredBB);
- ValueOp->deleteValue();
- if (!FoundVal)
- return nullptr;
- } else {
- DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
- << getBlockName(PredBB)
- << " because the block is unreachable\n");
- FoundVal = UndefValue::get(I->getType());
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
}
-
- Ops.push_back({FoundVal, PredBB});
- DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
- << getBlockName(PredBB) << "\n");
- }
- for (auto Dep : Deps)
- addAdditionalUsers(Dep, I);
- sortPHIOps(Ops);
- auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock);
- if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
- DEBUG(dbgs()
- << "Not creating real PHI of ops because it simplified to existing "
- "value or constant\n");
- return E;
- }
- auto *ValuePHI = RealToTemp.lookup(I);
- bool NewPHI = false;
- if (!ValuePHI) {
- ValuePHI =
- PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
- addPhiOfOps(ValuePHI, PHIBlock, I);
- NewPHI = true;
- NumGVNPHIOfOpsCreated++;
- }
- if (NewPHI) {
- for (auto PHIOp : Ops)
- ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
- } else {
- unsigned int i = 0;
- for (auto PHIOp : Ops) {
- ValuePHI->setIncomingValue(i, PHIOp.first);
- ValuePHI->setIncomingBlock(i, PHIOp.second);
- ++i;
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
+ ValueOp->deleteValue();
+ if (!FoundVal) {
+ // We failed to find a leader for the current ValueOp, but this might
+ // change if the translated operands change.
+ if (SafeForPHIOfOps)
+ for (auto Dep : CurrentDeps)
+ addAdditionalUsers(Dep, I);
+
+ return nullptr;
}
+ Deps.insert(CurrentDeps.begin(), CurrentDeps.end());
+ } else {
+ LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
- DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
- << "\n");
+ PHIOps.push_back({FoundVal, PredBB});
+ LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(PHIOps);
+ auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
return E;
}
- return nullptr;
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : PHIOps)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ TempToBlock[ValuePHI] = PHIBlock;
+ unsigned int i = 0;
+ for (auto PHIOp : PHIOps) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+
+ return E;
}
// The algorithm initially places the values of the routine in the TOP
@@ -2902,8 +2939,9 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
void NewGVN::cleanupTables() {
for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
- << " has " << CongruenceClasses[i]->size() << " members\n");
+ LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size()
+ << " members\n");
// Make sure we delete the congruence class (probably worth switching to
// a unique_ptr at some point).
delete CongruenceClasses[i];
@@ -2973,7 +3011,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
// we change its DFS number so that it doesn't get value numbered.
if (isInstructionTriviallyDead(&I, TLI)) {
InstrDFS[&I] = 0;
- DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
markInstructionForDeletion(&I);
continue;
}
@@ -3039,9 +3077,10 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
[&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
if (AllEqual)
- DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue
+ << "\n");
else
- DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
// If it's equal to something, it's in that class. Otherwise, it has to be in
// a class where it is the leader (other things may be equivalent to it, but
// it needs to start off in its own class, which means it must have been the
@@ -3060,7 +3099,7 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// Value number a single instruction, symbolically evaluating, performing
// congruence finding, and updating mappings.
void NewGVN::valueNumberInstruction(Instruction *I) {
- DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n");
if (!I->isTerminator()) {
const Expression *Symbolized = nullptr;
SmallPtrSet<Value *, 2> Visited;
@@ -3246,7 +3285,7 @@ void NewGVN::verifyMemoryCongruency() const {
// and redoing the iteration to see if anything changed.
void NewGVN::verifyIterationSettled(Function &F) {
#ifndef NDEBUG
- DEBUG(dbgs() << "Beginning iteration verification\n");
+ LLVM_DEBUG(dbgs() << "Beginning iteration verification\n");
if (DebugCounter::isCounterSet(VNCounter))
DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
@@ -3364,9 +3403,9 @@ void NewGVN::iterateTouchedInstructions() {
// If it's not reachable, erase any touched instructions and move on.
if (!BlockReachable) {
TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
+ LLVM_DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
continue;
}
updateProcessedCount(CurrBlock);
@@ -3376,7 +3415,7 @@ void NewGVN::iterateTouchedInstructions() {
TouchedInstructions.reset(InstrNum);
if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
valueNumberMemoryPhi(MP);
} else if (auto *I = dyn_cast<Instruction>(V)) {
valueNumberInstruction(I);
@@ -3422,10 +3461,10 @@ bool NewGVN::runGVN() {
for (auto &B : RPOT) {
auto *Node = DT->getNode(B);
if (Node->getChildren().size() > 1)
- std::sort(Node->begin(), Node->end(),
- [&](const DomTreeNode *A, const DomTreeNode *B) {
- return RPOOrdering[A] < RPOOrdering[B];
- });
+ llvm::sort(Node->begin(), Node->end(),
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
+ return RPOOrdering[A] < RPOOrdering[B];
+ });
}
// Now a standard depth first ordering of the domtree is equivalent to RPO.
@@ -3446,8 +3485,8 @@ bool NewGVN::runGVN() {
// Initialize the touched instructions to include the entry block.
const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
TouchedInstructions.set(InstRange.first, InstRange.second);
- DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
- << " marked reachable\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
ReachableBlocks.insert(&F.getEntryBlock());
iterateTouchedInstructions();
@@ -3472,8 +3511,8 @@ bool NewGVN::runGVN() {
};
for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
- DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
- << " is unreachable\n");
+ LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
deleteInstructionsInBlock(&BB);
Changed = true;
}
@@ -3695,7 +3734,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
}
void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
- DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumGVNBlocksDeleted;
// Delete the instructions backwards, as it has a reduced likelihood of having
@@ -3722,12 +3761,12 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
}
void NewGVN::markInstructionForDeletion(Instruction *I) {
- DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
InstructionsToErase.insert(I);
}
void NewGVN::replaceInstruction(Instruction *I, Value *V) {
- DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
patchAndReplaceAllUsesWith(I, V);
// We save the actual erasing to avoid invalidating memory
// dependencies until we are done with everything.
@@ -3853,9 +3892,10 @@ bool NewGVN::eliminateInstructions(Function &F) {
auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
for (auto &Operand : PHI->incoming_values())
if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
- DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
- << getBlockName(PHI->getIncomingBlock(Operand))
- << " with undef due to it being unreachable\n");
+ LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
+ << " for block "
+ << getBlockName(PHI->getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
Operand.set(UndefValue::get(PHI->getType()));
}
};
@@ -3887,7 +3927,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Map to store the use counts
DenseMap<const Value *, unsigned int> UseCounts;
for (auto *CC : reverse(CongruenceClasses)) {
- DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
+ LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+ << "\n");
// Track the equivalent store info so we can decide whether to try
// dead store elimination.
SmallVector<ValueDFS, 8> PossibleDeadStores;
@@ -3925,8 +3966,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
MembersLeft.insert(Member);
continue;
}
- DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for "
+ << *Member << "\n");
auto *I = cast<Instruction>(Member);
assert(Leader != I && "About to accidentally remove our leader");
replaceInstruction(I, Leader);
@@ -3947,7 +3988,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
// Sort the whole thing.
- std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+ llvm::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
for (auto &VD : DFSOrderedSet) {
int MemberDFSIn = VD.DFSIn;
int MemberDFSOut = VD.DFSOut;
@@ -3966,24 +4007,24 @@ bool NewGVN::eliminateInstructions(Function &F) {
// remove from temp instruction list.
AllTempInstructions.erase(PN);
auto *DefBlock = getBlockForValue(Def);
- DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
- << " into block "
- << getBlockName(getBlockForValue(Def)) << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
PN->insertBefore(&DefBlock->front());
Def = PN;
NumGVNPHIOfOpsEliminations++;
}
if (EliminationStack.empty()) {
- DEBUG(dbgs() << "Elimination Stack is empty\n");
+ LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n");
} else {
- DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
- << EliminationStack.dfs_back().first << ","
- << EliminationStack.dfs_back().second << ")\n");
+ LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
}
- DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
- << MemberDFSOut << ")\n");
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
// First, we see if we are out of scope or empty. If so,
// and there are equivalences, we try to replace the top of
// stack with equivalences (if it's on the stack, it must
@@ -4058,14 +4099,16 @@ bool NewGVN::eliminateInstructions(Function &F) {
Value *DominatingLeader = EliminationStack.back();
auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
- if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
+ if (isSSACopy)
DominatingLeader = II->getOperand(0);
// Don't replace our existing users with ourselves.
if (U->get() == DominatingLeader)
continue;
- DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
- << *U->get() << " in " << *(U->getUser()) << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
// If we replaced something in an instruction, handle the patching of
// metadata. Skip this if we are replacing predicateinfo with its
@@ -4081,7 +4124,9 @@ bool NewGVN::eliminateInstructions(Function &F) {
// It's about to be alive again.
if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
ProbablyDead.erase(cast<Instruction>(DominatingLeader));
- if (LeaderUseCount == 0 && II)
+ // Copy instructions, however, are still dead because we use their
+ // operand as the leader.
+ if (LeaderUseCount == 0 && isSSACopy)
ProbablyDead.insert(II);
++LeaderUseCount;
AnythingReplaced = true;
@@ -4106,7 +4151,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
// If we have possible dead stores to look at, try to eliminate them.
if (CC->getStoreCount() > 0) {
convertClassToLoadsAndStores(*CC, PossibleDeadStores);
- std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ llvm::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
ValueDFSStack EliminationStack;
for (auto &VD : PossibleDeadStores) {
int MemberDFSIn = VD.DFSIn;
@@ -4129,8 +4174,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
(void)Leader;
assert(DT->dominates(Leader->getParent(), Member->getParent()));
// Member is dominated by Leader, and thus dead
- DEBUG(dbgs() << "Marking dead store " << *Member
- << " that is dominated by " << *Leader << "\n");
+ LLVM_DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
markInstructionForDeletion(Member);
CC->erase(Member);
++NumGVNDeadStores;
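
The dominant change in the hunks above (and throughout this import) is the rename of the DEBUG macro to LLVM_DEBUG. A minimal sketch of how the guarded output is used, with a hypothetical pass name; output appears only in asserts-enabled builds and only under -debug or -debug-only=newgvn-example:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical debug type for illustration only.
#define DEBUG_TYPE "newgvn-example"

static void traceIteration(unsigned Iteration) {
  // Compiled out entirely in release builds without asserts.
  LLVM_DEBUG(dbgs() << "Starting iteration " << Iteration << "\n");
}
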
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 2d0cb6fbf211..8f30bccf48f1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -55,6 +55,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -65,7 +66,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "safepoint-placement"
@@ -323,7 +323,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// avoiding the runtime cost of the actual safepoint.
if (!AllBackedges) {
if (mustBeFiniteCountedLoop(L, SE, Pred)) {
- DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
+ LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
FiniteExecution++;
continue;
}
@@ -332,7 +332,9 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// Note: This is only semantically legal since we won't do any further
// IPO or inlining before the actual call insertion. If we hadn't, we
// might later lose this call safepoint.
- DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "skipping safepoint placement due to unconditional call\n");
CallInLoop++;
continue;
}
@@ -348,7 +350,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// variables) and branches to the true header
TerminatorInst *Term = Pred->getTerminator();
- DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
+ LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
PollLocations.push_back(Term);
}
@@ -522,7 +524,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
};
// We need the order of the list to be stable so that naming ends up stable
// when we split edges. This makes test cases much easier to write.
- std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
+ llvm::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
// We can sometimes end up with duplicate poll locations. This happens if
// a single loop is visited more than once. The fact this happens seems
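
The std::sort -> llvm::sort replacements here and below are part of the same cleanup; llvm::sort behaves like std::sort but, when LLVM is built with EXPENSIVE_CHECKS, shuffles the range first so comparators that do not produce a deterministic order get caught by the tests. A small sketch under that assumption (the data and comparator are made up for illustration):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <utility>
using namespace llvm;

static void sortPairsByKey(SmallVectorImpl<std::pair<int, int>> &Items) {
  // Drop-in replacement for std::sort; with EXPENSIVE_CHECKS the input is
  // shuffled first, exposing comparators that tie-break nondeterministically.
  llvm::sort(Items.begin(), Items.end(),
             [](const std::pair<int, int> &L, const std::pair<int, int> &R) {
               return L.first < R.first;
             });
}
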
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 88dcaf0f8a36..c81ac70d99e6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -31,6 +31,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -42,6 +43,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
@@ -55,7 +57,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <utility>
@@ -168,8 +169,8 @@ void ReassociatePass::BuildRankMap(Function &F,
// Assign distinct ranks to function arguments.
for (auto &Arg : F.args()) {
ValueRankMap[&Arg] = ++Rank;
- DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
- << "\n");
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
+ << "\n");
}
// Traverse basic blocks in ReversePostOrder
@@ -200,17 +201,17 @@ unsigned ReassociatePass::getRank(Value *V) {
// for PHI nodes, we cannot have infinite recursion here, because there
// cannot be loops in the value graph that do not go through PHI nodes.
unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
- for (unsigned i = 0, e = I->getNumOperands();
- i != e && Rank != MaxRank; ++i)
+ for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
Rank = std::max(Rank, getRank(I->getOperand(i)));
// If this is a not or neg instruction, do not count it for rank. This
// assures us that X and ~X will have the same rank.
- if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
- !BinaryOperator::isFNeg(I))
+ if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+ !BinaryOperator::isFNeg(I))
++Rank;
- DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
+ << "\n");
return ValueRankMap[I] = Rank;
}
@@ -445,7 +446,7 @@ using RepeatedValue = std::pair<Value*, APInt>;
/// type and thus make the expression bigger.
static bool LinearizeExprTree(BinaryOperator *I,
SmallVectorImpl<RepeatedValue> &Ops) {
- DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
assert(I->isAssociative() && I->isCommutative() &&
@@ -494,14 +495,14 @@ static bool LinearizeExprTree(BinaryOperator *I,
for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
Value *Op = I->getOperand(OpIdx);
APInt Weight = P.second; // Number of paths to this operand.
- DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
assert(!Op->use_empty() && "No uses, so how did we get to it?!");
// If this is a binary operation of the right kind with only one use then
// add its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
assert(Visited.insert(Op).second && "Not first visit!");
- DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
Worklist.push_back(std::make_pair(BO, Weight));
continue;
}
@@ -514,7 +515,8 @@ static bool LinearizeExprTree(BinaryOperator *I,
if (!Op->hasOneUse()) {
// This value has uses not accounted for by the expression, so it is
// not safe to modify. Mark it as being a leaf.
- DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs()
+ << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
continue;
@@ -540,7 +542,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// to the expression, then no longer consider it to be a leaf and add
// its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
Worklist.push_back(std::make_pair(BO, It->second));
Leaves.erase(It);
continue;
@@ -573,9 +575,10 @@ static bool LinearizeExprTree(BinaryOperator *I,
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
(Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
- DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ LLVM_DEBUG(dbgs()
+ << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
BO = LowerNegateToMultiply(BO);
- DEBUG(dbgs() << *BO << '\n');
+ LLVM_DEBUG(dbgs() << *BO << '\n');
Worklist.push_back(std::make_pair(BO, Weight));
Changed = true;
continue;
@@ -583,7 +586,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// Failed to morph into an expression of the right type. This really is
// a leaf.
- DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
@@ -675,9 +678,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
if (NewLHS == OldRHS && NewRHS == OldLHS) {
// The order of the operands was reversed. Swap them.
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->swapOperands();
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
break;
@@ -685,7 +688,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
// The new operation differs non-trivially from the original. Overwrite
// the old operands with the new ones.
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewLHS != OldLHS) {
BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
if (BO && !NotRewritable.count(BO))
@@ -698,7 +701,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
}
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
@@ -711,7 +714,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
// while the right-hand side will be the current element of Ops.
Value *NewRHS = Ops[i].Op;
if (NewRHS != Op->getOperand(1)) {
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewRHS == Op->getOperand(0)) {
// The new right-hand side was already present as the left operand. If
// we are lucky then swapping the operands will sort out both of them.
@@ -724,7 +727,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
Op->setOperand(1, NewRHS);
ExpressionChanged = Op;
}
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
}
@@ -756,9 +759,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
NewOp = NodesToRewrite.pop_back_val();
}
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->setOperand(0, NewOp);
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
@@ -781,6 +784,18 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
if (ExpressionChanged == I)
break;
+
+ // Discard any debug info related to the expressions that have changed (we
+ // can leave debug info related to the root, since the result of the
+ // expression tree should be the same even after reassociation).
+ SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, ExpressionChanged);
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(ExpressionChanged->getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
+
ExpressionChanged->moveBefore(I);
ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
} while (true);
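
The new block above drops stale debug-value users of a rewritten expression by rewiring them to undef. Roughly the same pattern in isolation, assuming an Instruction *I whose dbg.value users should be invalidated:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void invalidateDebugUsers(Instruction *I) {
  SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
  findDbgUsers(DbgUsers, I); // collects dbg.value/dbg.declare users of I
  for (DbgInfoIntrinsic *DII : DbgUsers) {
    // Point the intrinsic at undef so the debugger reports the variable as
    // optimized out instead of showing a stale value.
    Value *Undef = UndefValue::get(I->getType());
    DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
                                            ValueAsMetadata::get(Undef)));
  }
}
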
@@ -798,7 +813,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
/// pushing the negates through adds. These will be revisited to see if
/// additional opportunities have been exposed.
static Value *NegateValue(Value *V, Instruction *BI,
- SetVector<AssertingVH<Instruction>> &ToRedo) {
+ ReassociatePass::OrderedSet &ToRedo) {
if (auto *C = dyn_cast<Constant>(V))
return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
ConstantExpr::getNeg(C);
@@ -912,8 +927,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {
/// If we have (X-Y), and if either X is an add, or if this is only used by an
/// add, transform this into (X+(0-Y)) to promote better reassociation.
-static BinaryOperator *
-BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
+static BinaryOperator *BreakUpSubtract(Instruction *Sub,
+ ReassociatePass::OrderedSet &ToRedo) {
// Convert a subtract into an add and a neg instruction. This allows sub
// instructions to be commuted with other add instructions.
//
@@ -929,7 +944,7 @@ BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
Sub->replaceAllUsesWith(New);
New->setDebugLoc(Sub->getDebugLoc());
- DEBUG(dbgs() << "Negated: " << *New << '\n');
+ LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n');
return New;
}
@@ -1415,7 +1430,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
- DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp
+ << '\n');
++NumFactor;
// Insert a new multiply.
@@ -1553,7 +1569,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
- DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal
+ << '\n');
++NumFactor;
// Create a new instruction that uses the MaxOccVal twice. If we don't do
@@ -1622,7 +1639,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
return nullptr;
}
-/// \brief Build up a vector of value/power pairs factoring a product.
+/// Build up a vector of value/power pairs factoring a product.
///
/// Given a series of multiplication operands, build a vector of factors and
/// the powers each is raised to when forming the final product. Sort them in
@@ -1687,7 +1704,7 @@ static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
return true;
}
-/// \brief Build a tree of multiplies, computing the product of Ops.
+/// Build a tree of multiplies, computing the product of Ops.
static Value *buildMultiplyTree(IRBuilder<> &Builder,
SmallVectorImpl<Value*> &Ops) {
if (Ops.size() == 1)
@@ -1704,7 +1721,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder,
return LHS;
}
-/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
///
/// Given a vector of values raised to various powers, where no two values are
/// equal and the powers are sorted in decreasing order, compute the minimal
@@ -1859,8 +1876,8 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
// Remove dead instructions and if any operands are trivially dead add them to
// Insts so they will be removed as well.
-void ReassociatePass::RecursivelyEraseDeadInsts(
- Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
+ OrderedSet &Insts) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
ValueRankMap.erase(I);
@@ -1876,7 +1893,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(
/// Zap the given instruction, adding interesting operands to the work list.
void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
- DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+ LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
// Erase the dead instruction.
@@ -1893,7 +1910,14 @@ void ReassociatePass::EraseInst(Instruction *I) {
while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
Visited.insert(Op).second)
Op = Op->user_back();
- RedoInsts.insert(Op);
+
+ // The instruction we're going to push may be coming from a
+ // dead block, and Reassociate skips the processing of unreachable
+ // blocks because it's a waste of time and also because it can
+ // lead to an infinite loop due to LLVM's non-standard definition
+ // of dominance.
+ if (ValueRankMap.find(Op) != ValueRankMap.end())
+ RedoInsts.insert(Op);
}
MadeChange = true;
@@ -2120,7 +2144,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
ValueEntry(getRank(E.first), E.first));
}
- DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
// Now that we have linearized the tree to a list and have gathered all of
// the operands and their ranks, sort the operands by their rank. Use a
@@ -2138,7 +2162,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
- DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
I->replaceAllUsesWith(V);
if (Instruction *VI = dyn_cast<Instruction>(V))
if (I->getDebugLoc())
@@ -2169,7 +2193,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
}
}
- DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
if (Ops.size() == 1) {
if (Ops[0].Op == I)
@@ -2321,7 +2345,7 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Make a copy of all the instructions to be redone so we can remove dead
// instructions.
- SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+ OrderedSet ToRedo(RedoInsts);
// Iterate over all instructions to be reevaluated and remove trivially dead
// instructions. If any operand of the trivially dead instruction becomes
// dead mark it for deletion as well. Continue this process until all
@@ -2337,7 +2361,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Now that we have removed dead instructions, we can reoptimize the
// remaining instructions.
while (!RedoInsts.empty()) {
- Instruction *I = RedoInsts.pop_back_val();
+ Instruction *I = RedoInsts.front();
+ RedoInsts.erase(RedoInsts.begin());
if (isInstructionTriviallyDead(I))
EraseInst(I);
else
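
One behavioral change worth illustrating: RedoInsts is now drained front-to-back instead of via pop_back_val, i.e. FIFO rather than LIFO. A tiny sketch of that draining pattern with a generic SetVector (the element type is chosen only for illustration):

#include "llvm/ADT/SetVector.h"
using namespace llvm;

static void drainInOrder(SetVector<int> &Worklist) {
  while (!Worklist.empty()) {
    int Item = Worklist.front();      // oldest entry first (FIFO)
    Worklist.erase(Worklist.begin());
    // ... process Item; newly discovered work is appended at the back ...
    (void)Item;
  }
}
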
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 96295683314c..018feb035a4f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
@@ -25,7 +26,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
#include <list>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index c44edbed8ed9..391e43f79121 100644
--- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -64,7 +65,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -476,6 +476,12 @@ findBaseDefiningValueOfVector(Value *I) {
if (auto *BC = dyn_cast<BitCastInst>(I))
return findBaseDefiningValue(BC->getOperand(0));
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -610,8 +616,8 @@ static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
Value *&Cached = Cache[I];
if (!Cached) {
Cached = findBaseDefiningValue(I).BDV;
- DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
- << Cached->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
}
assert(Cache[I] != nullptr);
return Cached;
@@ -842,9 +848,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
}
#ifndef NDEBUG
- DEBUG(dbgs() << "States after initialization:\n");
+ LLVM_DEBUG(dbgs() << "States after initialization:\n");
for (auto Pair : States) {
- DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -917,9 +923,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
}
#ifndef NDEBUG
- DEBUG(dbgs() << "States after meet iteration:\n");
+ LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
for (auto Pair : States) {
- DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -960,7 +966,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
if (isa<PHINode>(I)) {
BasicBlock *BB = I->getParent();
- int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+ int NumPreds = pred_size(BB);
assert(NumPreds > 0 && "how did we reach here");
std::string Name = suffixed_name_or(I, ".base", "base_phi");
return PHINode::Create(I->getType(), NumPreds, Name, I);
@@ -1118,10 +1124,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
assert(BDV && Base);
assert(!isKnownBaseResult(BDV) && "why did it get added?");
- DEBUG(dbgs() << "Updating base value cache"
- << " for: " << BDV->getName() << " from: "
- << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
- << " to: " << Base->getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
if (Cache.count(BDV)) {
assert(isKnownBaseResult(Base) &&
@@ -1369,7 +1376,7 @@ public:
assert(OldI != NewI && "Disallowed at construction?!");
assert((!IsDeoptimize || !New) &&
- "Deoptimize instrinsics are not replaced!");
+ "Deoptimize intrinsics are not replaced!");
Old = nullptr;
New = nullptr;
@@ -1379,7 +1386,7 @@ public:
if (IsDeoptimize) {
// Note: we've inserted instructions, so the call to llvm.deoptimize may
- // not necessarilly be followed by the matching return.
+ // not necessarily be followed by the matching return.
auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
new UnreachableInst(RI->getContext(), RI);
RI->eraseFromParent();
@@ -1805,7 +1812,7 @@ static void relocationViaAlloca(
SmallVector<Instruction *, 20> Uses;
// PERF: trade a linear scan for repeated reallocation
- Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+ Uses.reserve(Def->getNumUses());
for (User *U : Def->users()) {
if (!isa<ConstantExpr>(U)) {
// If the def has a ConstantExpr use, then the def is either a
@@ -1817,7 +1824,7 @@ static void relocationViaAlloca(
}
}
- std::sort(Uses.begin(), Uses.end());
+ llvm::sort(Uses.begin(), Uses.end());
auto Last = std::unique(Uses.begin(), Uses.end());
Uses.erase(Last, Uses.end());
@@ -1977,7 +1984,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
Cost += 2;
} else {
- llvm_unreachable("unsupported instruciton type during rematerialization");
+ llvm_unreachable("unsupported instruction type during rematerialization");
}
}
@@ -2024,7 +2031,7 @@ static void rematerializeLiveValues(CallSite CS,
SmallVector<Value *, 32> LiveValuesToBeDeleted;
for (Value *LiveValue: Info.LiveSet) {
- // For each live pointer find it's defining chain
+ // For each live pointer find its defining chain
SmallVector<Instruction *, 3> ChainToBase;
assert(Info.PointerToBase.count(LiveValue));
Value *RootOfChain =
@@ -2461,22 +2468,8 @@ static void stripNonValidDataFromBody(Function &F) {
continue;
}
- if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
- assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
- bool IsImmutableTBAA =
- MD->getNumOperands() == 4 &&
- mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1;
-
- if (!IsImmutableTBAA)
- continue; // no work to do, MD_tbaa is already marked mutable
-
- MDNode *Base = cast<MDNode>(MD->getOperand(0));
- MDNode *Access = cast<MDNode>(MD->getOperand(1));
- uint64_t Offset =
- mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
-
- MDNode *MutableTBAA =
- Builder.createTBAAStructTagNode(Base, Access, Offset);
+ if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) {
+ MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag);
I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
}
@@ -2537,30 +2530,31 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
return false;
};
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers.
+ DeferredDominance DD(DT);
+ bool MadeChange = removeUnreachableBlocks(F, nullptr, &DD);
+ DD.flush();
+
// Gather all the statepoints which need to be rewritten. Be careful to only
// consider those in reachable code since we need to ask dominance queries
// when rewriting. We'll delete the unreachable ones in a moment.
SmallVector<CallSite, 64> ParsePointNeeded;
- bool HasUnreachableStatepoint = false;
for (Instruction &I : instructions(F)) {
// TODO: only the ones with the flag set!
if (NeedsRewrite(I)) {
- if (DT.isReachableFromEntry(I.getParent()))
- ParsePointNeeded.push_back(CallSite(&I));
- else
- HasUnreachableStatepoint = true;
+ // NOTE: removeUnreachableBlocks() is stronger than
+ // DominatorTree::isReachableFromEntry(). In other words,
+ // removeUnreachableBlocks() can remove some blocks for which
+ // isReachableFromEntry() returns true.
+ assert(DT.isReachableFromEntry(I.getParent()) &&
+ "no unreachable blocks expected");
+ ParsePointNeeded.push_back(CallSite(&I));
}
}
- bool MadeChange = false;
-
- // Delete any unreachable statepoints so that we don't have unrewritten
- // statepoints surviving this pass. This makes testing easier and the
- // resulting IR less confusing to human readers. Rather than be fancy, we
- // just reuse a utility function which removes the unreachable blocks.
- if (HasUnreachableStatepoint)
- MadeChange |= removeUnreachableBlocks(F);
-
// Return early if no work to do.
if (ParsePointNeeded.empty())
return MadeChange;
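
The restructured entry point above now prunes unreachable blocks unconditionally before gathering parse points, batching the dominator-tree updates. A hedged sketch of that up-front pattern, assuming the DeferredDominance helper available in this revision:

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static bool pruneUnreachable(Function &F, DominatorTree &DT) {
  DeferredDominance DD(DT);
  // Remove blocks not reachable from the entry, queueing CFG updates so the
  // dominator tree is refreshed once when flushed.
  bool Changed = removeUnreachableBlocks(F, /*LVI=*/nullptr, &DD);
  DD.flush();
  return Changed;
}
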
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index 3e12649ddedc..5e3ddeda2d49 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -17,7 +17,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -30,6 +29,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/IR/BasicBlock.h"
@@ -54,9 +54,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
#include <vector>
@@ -71,8 +69,6 @@ STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
-STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by"
- "IPSCCP");
namespace {
@@ -261,7 +257,7 @@ public:
bool MarkBlockExecutable(BasicBlock *BB) {
if (!BBExecutable.insert(BB).second)
return false;
- DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
BBWorkList.push_back(BB); // Add the block to the work list!
return true;
}
@@ -329,6 +325,10 @@ public:
return BBExecutable.count(BB);
}
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
std::vector<LatticeVal> StructValues;
auto *STy = dyn_cast<StructType>(V->getType());
@@ -341,20 +341,13 @@ public:
return StructValues;
}
- ValueLatticeElement getLatticeValueFor(Value *V) {
+ const LatticeVal &getLatticeValueFor(Value *V) const {
assert(!V->getType()->isStructTy() &&
"Should use getStructLatticeValueFor");
- std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
- PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
- ValueLatticeElement &LV = PI.first->second;
- if (PI.second) {
- DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
- assert(I != ValueState.end() &&
- "V not found in ValueState nor Paramstate map!");
- LV = I->second.toValueLattice();
- }
-
- return LV;
+ DenseMap<Value *, LatticeVal>::const_iterator I = ValueState.find(V);
+ assert(I != ValueState.end() &&
+ "V not found in ValueState nor Paramstate map!");
+ return I->second;
}
/// getTrackedRetVals - Get the inferred return value map.
@@ -415,55 +408,57 @@ private:
// markConstant - Make a value be marked as "constant". If the value
// is not already a constant, add it to the instruction work list so that
// the users of the instruction are updated later.
- void markConstant(LatticeVal &IV, Value *V, Constant *C) {
- if (!IV.markConstant(C)) return;
- DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+ bool markConstant(LatticeVal &IV, Value *V, Constant *C) {
+ if (!IV.markConstant(C)) return false;
+ LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
pushToWorkList(IV, V);
+ return true;
}
- void markConstant(Value *V, Constant *C) {
+ bool markConstant(Value *V, Constant *C) {
assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
- markConstant(ValueState[V], V, C);
+ return markConstant(ValueState[V], V, C);
}
void markForcedConstant(Value *V, Constant *C) {
assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
LatticeVal &IV = ValueState[V];
IV.markForcedConstant(C);
- DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
pushToWorkList(IV, V);
}
// markOverdefined - Make a value be marked as "overdefined". If the
// value is not already overdefined, add it to the overdefined instruction
// work list so that the users of the instruction are updated later.
- void markOverdefined(LatticeVal &IV, Value *V) {
- if (!IV.markOverdefined()) return;
-
- DEBUG(dbgs() << "markOverdefined: ";
- if (auto *F = dyn_cast<Function>(V))
- dbgs() << "Function '" << F->getName() << "'\n";
- else
- dbgs() << *V << '\n');
+ bool markOverdefined(LatticeVal &IV, Value *V) {
+ if (!IV.markOverdefined()) return false;
+
+ LLVM_DEBUG(dbgs() << "markOverdefined: ";
+ if (auto *F = dyn_cast<Function>(V)) dbgs()
+ << "Function '" << F->getName() << "'\n";
+ else dbgs() << *V << '\n');
// Only instructions go on the work list
pushToWorkList(IV, V);
+ return true;
}
- void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+ bool mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
if (IV.isOverdefined() || MergeWithV.isUnknown())
- return; // Noop.
+ return false; // Noop.
if (MergeWithV.isOverdefined())
return markOverdefined(IV, V);
if (IV.isUnknown())
return markConstant(IV, V, MergeWithV.getConstant());
if (IV.getConstant() != MergeWithV.getConstant())
return markOverdefined(IV, V);
+ return false;
}
- void mergeInValue(Value *V, LatticeVal MergeWithV) {
+ bool mergeInValue(Value *V, LatticeVal MergeWithV) {
assert(!V->getType()->isStructTy() &&
"non-structs should use markConstant");
- mergeInValue(ValueState[V], V, MergeWithV);
+ return mergeInValue(ValueState[V], V, MergeWithV);
}
/// getValueState - Return the LatticeVal object that corresponds to the
@@ -534,30 +529,27 @@ private:
/// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
/// work list if it is not already executable.
- void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
- return; // This edge is already known to be executable!
+ return false; // This edge is already known to be executable!
if (!MarkBlockExecutable(Dest)) {
// If the destination is already executable, we just made an *edge*
// feasible that wasn't before. Revisit the PHI nodes in the block
// because they have potentially new operands.
- DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
- << " -> " << Dest->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << '\n');
for (PHINode &PN : Dest->phis())
visitPHINode(PN);
}
+ return true;
}
// getFeasibleSuccessors - Return a vector of booleans to indicate which
// successors are reachable from a given terminator instruction.
void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
- // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
- // block to the 'To' basic block is currently feasible.
- bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
-
// OperandChangedState - This method is invoked on all of the users of an
// instruction that was just changed state somehow. Based on this
// information, we need to update the specified user of this instruction.
@@ -614,7 +606,7 @@ private:
void visitInstruction(Instruction &I) {
// All the instructions we don't do any special handling for just
// go to overdefined.
- DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
markOverdefined(&I);
}
};
@@ -701,68 +693,17 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
+ LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
}
// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
// block to the 'To' basic block is currently feasible.
bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
- assert(BBExecutable.count(To) && "Dest should always be alive!");
-
- // Make sure the source basic block is executable!!
- if (!BBExecutable.count(From)) return false;
-
- // Check to make sure this edge itself is actually feasible now.
- TerminatorInst *TI = From->getTerminator();
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isUnconditional())
- return true;
-
- LatticeVal BCValue = getValueState(BI->getCondition());
-
- // Overdefined condition variables mean the branch could go either way,
- // undef conditions mean that neither edge is feasible yet.
- ConstantInt *CI = BCValue.getConstantInt();
- if (!CI)
- return !BCValue.isUnknown();
-
- // Constant condition variables mean the branch can only go a single way.
- return BI->getSuccessor(CI->isZero()) == To;
- }
-
- // Unwinding instructions successors are always executable.
- if (TI->isExceptional())
- return true;
-
- if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- if (SI->getNumCases() < 1)
- return true;
-
- LatticeVal SCValue = getValueState(SI->getCondition());
- ConstantInt *CI = SCValue.getConstantInt();
-
- if (!CI)
- return !SCValue.isUnknown();
-
- return SI->findCaseValue(CI)->getCaseSuccessor() == To;
- }
-
- // In case of indirect branch and its address is a blockaddress, we mark
- // the target as executable.
- if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
- LatticeVal IBRValue = getValueState(IBR->getAddress());
- BlockAddress *Addr = IBRValue.getBlockAddress();
-
- if (!Addr)
- return !IBRValue.isUnknown();
-
- // At this point, the indirectbr is branching on a blockaddress.
- return Addr->getBasicBlock() == To;
- }
-
- DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
- llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+ // Check if we've called markEdgeExecutable on the edge yet. (We could
+ // be more aggressive and try to consider edges which haven't been marked
+ // yet, but there isn't any need.)
+ return KnownFeasibleEdges.count(Edge(From, To));
}
// visit Implementations - Something changed in this instruction, either an
@@ -786,7 +727,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (PN.getType()->isStructTy())
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
return; // Quick exit
@@ -794,7 +735,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
@@ -810,7 +751,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
continue;
if (IV.isOverdefined()) // PHI node becomes overdefined!
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
if (!OperandVal) { // Grab the first value.
OperandVal = IV.getConstant();
@@ -824,7 +765,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// Check to see if there are two different constants merging, if so, the PHI
// node is overdefined.
if (IV.getConstant() != OperandVal)
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
}
// If we exited the loop, this means that the PHI node only has constant
@@ -892,11 +833,11 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
// If this returns a struct, mark all elements overdefined; we don't track
// structs in structs.
if (EVI.getType()->isStructTy())
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
Value *AggVal = EVI.getAggregateOperand();
if (AggVal->getType()->isStructTy()) {
@@ -905,19 +846,19 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
mergeInValue(getValueState(&EVI), &EVI, EltVal);
} else {
// Otherwise, must be extracting from an array.
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
}
}
void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
auto *STy = dyn_cast<StructType>(IVI.getType());
if (!STy)
- return markOverdefined(&IVI);
+ return (void)markOverdefined(&IVI);
// If this has more than one index, we can't handle it, drive all results to
// undef.
if (IVI.getNumIndices() != 1)
- return markOverdefined(&IVI);
+ return (void)markOverdefined(&IVI);
Value *Aggr = IVI.getAggregateOperand();
unsigned Idx = *IVI.idx_begin();
@@ -946,7 +887,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// If this select returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (I.getType()->isStructTy())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
if (CondValue.isUnknown())
@@ -967,12 +908,12 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// select ?, C, C -> C.
if (TVal.isConstant() && FVal.isConstant() &&
TVal.getConstant() == FVal.getConstant())
- return markConstant(&I, FVal.getConstant());
+ return (void)markConstant(&I, FVal.getConstant());
if (TVal.isUnknown()) // select ?, undef, X -> X.
- return mergeInValue(&I, FVal);
+ return (void)mergeInValue(&I, FVal);
if (FVal.isUnknown()) // select ?, X, undef -> X.
- return mergeInValue(&I, TVal);
+ return (void)mergeInValue(&I, TVal);
markOverdefined(&I);
}
@@ -990,7 +931,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// X op Y -> undef.
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ return (void)markConstant(IV, &I, C);
}
// If something is undef, wait for it to resolve.
@@ -1003,7 +944,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// overdefined, and we can replace it with zero.
if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
if (V1State.isConstant() && V1State.getConstant()->isNullValue())
- return markConstant(IV, &I, V1State.getConstant());
+ return (void)markConstant(IV, &I, V1State.getConstant());
// If this is:
// -> AND/MUL with 0
@@ -1026,12 +967,12 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// X and 0 = 0
// X * 0 = 0
if (NonOverdefVal->getConstant()->isNullValue())
- return markConstant(IV, &I, NonOverdefVal->getConstant());
+ return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
} else {
// X or -1 = -1
if (ConstantInt *CI = NonOverdefVal->getConstantInt())
if (CI->isMinusOne())
- return markConstant(IV, &I, NonOverdefVal->getConstant());
+ return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
}
}
}
@@ -1041,22 +982,36 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Handle ICmpInst instruction.
void SCCPSolver::visitCmpInst(CmpInst &I) {
- LatticeVal V1State = getValueState(I.getOperand(0));
- LatticeVal V2State = getValueState(I.getOperand(1));
-
LatticeVal &IV = ValueState[&I];
if (IV.isOverdefined()) return;
- if (V1State.isConstant() && V2State.isConstant()) {
- Constant *C = ConstantExpr::getCompare(
- I.getPredicate(), V1State.getConstant(), V2State.getConstant());
+ Value *Op1 = I.getOperand(0);
+ Value *Op2 = I.getOperand(1);
+
+ // For parameters, use ParamState which includes constant range info if
+ // available.
+ auto V1Param = ParamState.find(Op1);
+ ValueLatticeElement V1State = (V1Param != ParamState.end())
+ ? V1Param->second
+ : getValueState(Op1).toValueLattice();
+
+ auto V2Param = ParamState.find(Op2);
+ ValueLatticeElement V2State = V2Param != ParamState.end()
+ ? V2Param->second
+ : getValueState(Op2).toValueLattice();
+
+ Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
+ if (C) {
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ LatticeVal CV;
+ CV.markConstant(C);
+ mergeInValue(&I, CV);
+ return;
}
// If operands are still unknown, wait for it to resolve.
- if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ if (!V1State.isOverdefined() && !V2State.isOverdefined() && !IV.isConstant())
return;
markOverdefined(&I);
@@ -1076,7 +1031,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
return; // Operands are not resolved yet.
if (State.isOverdefined())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
assert(State.isConstant() && "Unknown state!");
Operands.push_back(State.getConstant());
@@ -1114,7 +1069,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
void SCCPSolver::visitLoadInst(LoadInst &I) {
// If this load is of a struct, just mark the result overdefined.
if (I.getType()->isStructTy())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
@@ -1123,13 +1078,17 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
if (IV.isOverdefined()) return;
if (!PtrVal.isConstant() || I.isVolatile())
- return markOverdefined(IV, &I);
+ return (void)markOverdefined(IV, &I);
Constant *Ptr = PtrVal.getConstant();
// load null is undefined.
- if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
- return;
+ if (isa<ConstantPointerNull>(Ptr)) {
+ if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
+ return (void)markOverdefined(IV, &I);
+ else
+ return;
+ }
// Transform load (constant global) into the value loaded.
if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
@@ -1148,7 +1107,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ return (void)markConstant(IV, &I, C);
}
// Otherwise we cannot say for certain what value this load will produce.
@@ -1180,7 +1139,7 @@ CallOverdefined:
if (State.isUnknown())
return; // Operands are not resolved yet.
if (State.isOverdefined())
- return markOverdefined(I);
+ return (void)markOverdefined(I);
assert(State.isConstant() && "Unknown state!");
Operands.push_back(State.getConstant());
}
@@ -1194,12 +1153,12 @@ CallOverdefined:
// call -> undef.
if (isa<UndefValue>(C))
return;
- return markConstant(I, C);
+ return (void)markConstant(I, C);
}
}
// Otherwise, we don't know anything about this call, mark it overdefined.
- return markOverdefined(I);
+ return (void)markOverdefined(I);
}
// If this is a local function that doesn't have its address taken, mark its
@@ -1227,8 +1186,16 @@ CallOverdefined:
} else {
// Most other parts of the Solver still only use the simpler value
// lattice, so we propagate changes for parameters to both lattices.
- getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL);
- mergeInValue(&*AI, getValueState(*CAI));
+ LatticeVal ConcreteArgument = getValueState(*CAI);
+ bool ParamChanged =
+ getParamState(&*AI).mergeIn(ConcreteArgument.toValueLattice(), DL);
+ bool ValueChanged = mergeInValue(&*AI, ConcreteArgument);
+ // Add the argument to the work list if the state of a parameter changes but
+ // ValueState does not change (because it is already overdefined there).
+ // We have to take changes in ParamState into account, as it is used
+ // when evaluating Cmp instructions.
+ if (!ValueChanged && ParamChanged)
+ pushToWorkList(ValueState[&*AI], &*AI);
}
}
}
@@ -1262,7 +1229,7 @@ void SCCPSolver::Solve() {
while (!OverdefinedInstWorkList.empty()) {
Value *I = OverdefinedInstWorkList.pop_back_val();
- DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
// "I" got into the work list because it either made the transition from
// bottom to constant, or to overdefined.
@@ -1280,7 +1247,7 @@ void SCCPSolver::Solve() {
while (!InstWorkList.empty()) {
Value *I = InstWorkList.pop_back_val();
- DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
// "I" got into the work list because it made the transition from undef to
// constant.
@@ -1300,7 +1267,7 @@ void SCCPSolver::Solve() {
BasicBlock *BB = BBWorkList.back();
BBWorkList.pop_back();
- DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
// Notify all instructions in this basic block that they are newly
// executable.
@@ -1521,7 +1488,11 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
break;
case Instruction::ICmp:
// X == undef -> undef. Other comparisons get more complicated.
- if (cast<ICmpInst>(&I)->isEquality())
+ Op0LV = getValueState(I.getOperand(0));
+ Op1LV = getValueState(I.getOperand(1));
+
+ if ((Op0LV.isUnknown() || Op1LV.isUnknown()) &&
+ cast<ICmpInst>(&I)->isEquality())
break;
markOverdefined(&I);
return true;
@@ -1566,11 +1537,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
// Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Handle this by forcing the input value to the
- // branch to false.
- markForcedConstant(BI->getCondition(),
- ConstantInt::getFalse(TI->getContext()));
- return true;
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
+
+ continue;
}
if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
@@ -1591,11 +1565,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
// Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Handle this by forcing the input value to the
- // branch to the first successor.
- markForcedConstant(IBR->getAddress(),
- BlockAddress::get(IBR->getSuccessor(0)));
- return true;
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere:
+ // we can assume the branch has undefined behavior instead.
+ BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
+
+ continue;
}
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
@@ -1610,56 +1588,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
- markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
- return true;
- }
- }
-
- return false;
-}
-
-static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) {
- bool Changed = false;
-
- // Currently we only use range information for integer values.
- if (!V->getType()->isIntegerTy())
- return false;
-
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
- if (!IV.isConstantRange())
- return false;
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
- for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) {
- const Use &U = *UI++;
- auto *Icmp = dyn_cast<ICmpInst>(U.getUser());
- if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent()))
continue;
-
- auto getIcmpLatticeValue = [&](Value *Op) {
- if (auto *C = dyn_cast<Constant>(Op))
- return ValueLatticeElement::get(C);
- return Solver.getLatticeValueFor(Op);
- };
-
- ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0));
- ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1));
-
- Constant *C = nullptr;
- if (A.satisfiesPredicate(Icmp->getPredicate(), B))
- C = ConstantInt::getTrue(Icmp->getType());
- else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B))
- C = ConstantInt::getFalse(Icmp->getType());
-
- if (C) {
- Icmp->replaceAllUsesWith(C);
- DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C
- << ", because of range information " << A << " " << B
- << "\n");
- Icmp->eraseFromParent();
- Changed = true;
}
}
- return Changed;
+
+ return false;
}
static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
@@ -1679,26 +1620,18 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
}
Const = ConstantStruct::get(ST, ConstVals);
} else {
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ const LatticeVal &IV = Solver.getLatticeValueFor(V);
if (IV.isOverdefined())
return false;
- if (IV.isConstantRange()) {
- if (IV.getConstantRange().isSingleElement())
- Const =
- ConstantInt::get(V->getType(), IV.asConstantInteger().getValue());
- else
- return false;
- } else
- Const =
- IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+ Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
}
assert(Const && "Constant is nullptr here!");
// Replacing `musttail` instructions with constant breaks `musttail` invariant
// unless the call itself can be removed
CallInst *CI = dyn_cast<CallInst>(V);
- if (CI && CI->isMustTailCall() && !isInstructionTriviallyDead(CI)) {
+ if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
CallSite CS(CI);
Function *F = CS.getCalledFunction();
@@ -1706,12 +1639,12 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
if (F)
Solver.AddMustTailCallee(F);
- DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
- << " as a constant\n");
+ LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
+ << " as a constant\n");
return false;
}
- DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
+ LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
// Replaces all of the uses of a variable with uses of the constant.
V->replaceAllUsesWith(Const);
@@ -1722,7 +1655,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
// and return true if the function was modified.
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
SCCPSolver Solver(DL, TLI);
// Mark the first block of the function as being executable.
@@ -1736,7 +1669,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
bool ResolvedUndefs = true;
while (ResolvedUndefs) {
Solver.Solve();
- DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n");
ResolvedUndefs = Solver.ResolvedUndefsIn(F);
}
@@ -1748,7 +1681,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
for (BasicBlock &BB : F) {
if (!Solver.isBlockExecutable(&BB)) {
- DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
++NumDeadBlocks;
NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);
@@ -1785,6 +1718,7 @@ PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
@@ -1807,6 +1741,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
}
// runOnFunction - Run the Sparse Conditional Constant Propagation
@@ -1844,15 +1779,15 @@ static void findReturnsToZap(Function &F,
// There is a non-removable musttail call site of this function. Zapping
// returns is not allowed.
if (Solver.isMustTailCallee(&F)) {
- DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
- << " due to present musttail call of it\n");
+ LLVM_DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
+ << " due to present musttail call of it\n");
return;
}
for (BasicBlock &BB : F) {
if (CallInst *CI = BB.getTerminatingMustTailCall()) {
- DEBUG(dbgs() << "Can't zap return of the block due to present "
- << "musttail call : " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present "
+ << "musttail call : " << *CI << "\n");
(void)CI;
return;
}
@@ -1863,8 +1798,8 @@ static void findReturnsToZap(Function &F,
}
}
-static bool runIPSCCP(Module &M, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+bool llvm::runIPSCCP(Module &M, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
SCCPSolver Solver(DL, TLI);
// Loop over all functions, marking arguments to those with their addresses
@@ -1904,13 +1839,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Solve for constants.
bool ResolvedUndefs = true;
+ Solver.Solve();
while (ResolvedUndefs) {
- Solver.Solve();
-
- DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
ResolvedUndefs = false;
for (Function &F : M)
- ResolvedUndefs |= Solver.ResolvedUndefsIn(F);
+ if (Solver.ResolvedUndefsIn(F)) {
+ // We run Solve() after we resolved an undef in a function, because
+ // we might deduce a fact that eliminates an undef in another function.
+ Solver.Solve();
+ ResolvedUndefs = true;
+ }
}
bool MadeChanges = false;
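The reshaped loop above re-runs the solver as soon as any one function has an undef resolved, since a fact deduced there may eliminate an undef in another function (per the comment in the hunk). A self-contained sketch of the same fixed-point shape, using hypothetical ToySolver::solve()/resolveUndefsIn() stand-ins rather than the real SCCPSolver interface:

    #include <vector>

    struct Fn {};  // stand-in for a function under analysis

    struct ToySolver {
      void solve() { /* propagate lattice facts to a fixed point */ }
      bool resolveUndefsIn(Fn &) { return false; /* true if an undef was forced */ }
    };

    void solveModule(ToySolver &S, std::vector<Fn> &Fns) {
      S.solve();                      // initial propagation
      bool Resolved = true;
      while (Resolved) {
        Resolved = false;
        for (Fn &F : Fns)
          if (S.resolveUndefsIn(F)) { // forcing an undef here may enable
            S.solve();                // new deductions elsewhere, so re-solve
            Resolved = true;          // and take another full pass
          }
      }
    }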
@@ -1930,18 +1869,12 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
++IPNumArgsElimed;
continue;
}
-
- if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI))
- ++IPNumRangeInfoUsed;
}
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (!Solver.isBlockExecutable(&*BB)) {
- DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
-
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumDeadBlocks;
- NumInstRemoved +=
- changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
MadeChanges = true;
@@ -1955,7 +1888,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (Inst->getType()->isVoidTy())
continue;
if (tryToReplaceWithConstant(Solver, Inst)) {
- if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+ if (Inst->isSafeToRemove())
Inst->eraseFromParent();
// Hey, we just changed something!
MadeChanges = true;
@@ -1964,6 +1897,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
}
}
+ // Change dead blocks to unreachable. We do it after replacing constants in
+ // all executable blocks, because changeToUnreachable may remove PHI nodes
+ // in executable blocks we found values for. The function's entry block is
+ // not part of BlocksToErase, so we have to handle it separately.
+ for (BasicBlock *BB : BlocksToErase)
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
+ if (!Solver.isBlockExecutable(&F.front()))
+ NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
+ /*UseLLVMTrap=*/false);
+
// Now that all instructions in the function are constant folded, erase dead
// blocks, because we can now use ConstantFoldTerminator to get rid of
// in-edges.
@@ -1983,31 +1927,33 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
bool Folded = ConstantFoldTerminator(I->getParent());
if (!Folded) {
- // The constant folder may not have been able to fold the terminator
- // if this is a branch or switch on undef. Fold it manually as a
- // branch to the first successor.
-#ifndef NDEBUG
- if (auto *BI = dyn_cast<BranchInst>(I)) {
- assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
- "Branch should be foldable!");
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+ // If the branch can't be folded, we must have forced an edge
+ // for an indeterminate value. Force the terminator to fold
+ // to that edge.
+ Constant *C;
+ BasicBlock *Dest;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Dest = SI->case_begin()->getCaseSuccessor();
+ C = SI->case_begin()->getCaseValue();
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ Dest = BI->getSuccessor(1);
+ C = ConstantInt::getFalse(BI->getContext());
+ } else if (IndirectBrInst *IBR = dyn_cast<IndirectBrInst>(I)) {
+ Dest = IBR->getSuccessor(0);
+ C = BlockAddress::get(IBR->getSuccessor(0));
} else {
- llvm_unreachable("Didn't fold away reference to block!");
+ llvm_unreachable("Unexpected terminator instruction");
}
-#endif
-
- // Make this an uncond branch to the first successor.
- TerminatorInst *TI = I->getParent()->getTerminator();
- BranchInst::Create(TI->getSuccessor(0), TI);
+ assert(Solver.isEdgeFeasible(I->getParent(), Dest) &&
+ "Didn't find feasible edge?");
+ (void)Dest;
- // Remove entries in successor phi nodes to remove edges.
- for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
- TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
- // Remove the old terminator.
- TI->eraseFromParent();
+ I->setOperand(0, C);
+ Folded = ConstantFoldTerminator(I->getParent());
}
+ assert(Folded &&
+ "Expect TermInst on constantint or blockaddress to be folded");
+ (void) Folded;
}
// Finally, delete the basic block.
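When ConstantFoldTerminator cannot fold a terminator whose condition the solver left indeterminate, the rewritten block above forces the fold: it picks the successor edge the solver already treats as feasible, writes the matching constant into the condition operand, and folds again. A condensed sketch of just the conditional-branch case, reusing the hunk's I, Solver, Folded and ConstantFoldTerminator:

    if (auto *BI = dyn_cast<BranchInst>(I)) {
      // Successor 1 is the false destination; the solver marked that edge
      // feasible, so a constant-false condition folds the branch onto it.
      assert(Solver.isEdgeFeasible(BI->getParent(), BI->getSuccessor(1)));
      BI->setCondition(ConstantInt::getFalse(BI->getContext()));
      Folded = ConstantFoldTerminator(BI->getParent());
    }

The switch and indirectbr cases in the hunk follow the same recipe with a case value and a blockaddress, respectively.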
@@ -2058,7 +2004,8 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
GlobalVariable *GV = I->first;
assert(!I->second.isOverdefined() &&
"Overdefined values should have been taken out of the map!");
- DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+ LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
+ << "' is constant!\n");
while (!GV->use_empty()) {
StoreInst *SI = cast<StoreInst>(GV->user_back());
SI->eraseFromParent();
@@ -2069,55 +2016,3 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
return MadeChanges;
}
-
-PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
- const DataLayout &DL = M.getDataLayout();
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
- if (!runIPSCCP(M, DL, &TLI))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-//===--------------------------------------------------------------------===//
-//
-/// IPSCCP Class - This class implements interprocedural Sparse Conditional
-/// Constant Propagation.
-///
-class IPSCCPLegacyPass : public ModulePass {
-public:
- static char ID;
-
- IPSCCPLegacyPass() : ModulePass(ID) {
- initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- const DataLayout &DL = M.getDataLayout();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return runIPSCCP(M, DL, TLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char IPSCCPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-
-// createIPSCCPPass - This is the public interface to this file.
-ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index bfe3754f0769..6c3f012c6280 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -42,6 +42,8 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantFolder.h"
@@ -79,7 +81,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -124,14 +125,9 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
cl::Hidden);
-/// Hidden option to allow more aggressive splitting.
-static cl::opt<bool>
-SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices",
- cl::init(false), cl::Hidden);
-
namespace {
-/// \brief A custom IRBuilder inserter which prefixes all names, but only in
+/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
std::string Prefix;
@@ -151,23 +147,23 @@ protected:
}
};
-/// \brief Provide a type for IRBuilder that drops names in release builds.
+/// Provide a type for IRBuilder that drops names in release builds.
using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-/// \brief A used slice of an alloca.
+/// A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
- /// \brief The beginning offset of the range.
+ /// The beginning offset of the range.
uint64_t BeginOffset = 0;
- /// \brief The ending offset, not included in the range.
+ /// The ending offset, not included in the range.
uint64_t EndOffset = 0;
- /// \brief Storage for both the use of this slice and whether it can be
+ /// Storage for both the use of this slice and whether it can be
/// split.
PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
@@ -189,7 +185,7 @@ public:
bool isDead() const { return getUse() == nullptr; }
void kill() { UseAndIsSplittable.setPointer(nullptr); }
- /// \brief Support for ordering ranges.
+ /// Support for ordering ranges.
///
/// This provides an ordering over ranges such that start offsets are
/// always increasing, and within equal start offsets, the end offsets are
@@ -207,7 +203,7 @@ public:
return false;
}
- /// \brief Support comparison with a single offset to allow binary searches.
+ /// Support comparison with a single offset to allow binary searches.
friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
uint64_t RHSOffset) {
return LHS.beginOffset() < RHSOffset;
@@ -233,7 +229,7 @@ template <> struct isPodLike<Slice> { static const bool value = true; };
} // end namespace llvm
-/// \brief Representation of the alloca slices.
+/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
@@ -242,16 +238,16 @@ template <> struct isPodLike<Slice> { static const bool value = true; };
/// starting at a particular offset before splittable slices.
class llvm::sroa::AllocaSlices {
public:
- /// \brief Construct the slices of a particular alloca.
+ /// Construct the slices of a particular alloca.
AllocaSlices(const DataLayout &DL, AllocaInst &AI);
- /// \brief Test whether a pointer to the allocation escapes our analysis.
+ /// Test whether a pointer to the allocation escapes our analysis.
///
/// If this is true, the slices are never fully built and should be
/// ignored.
bool isEscaped() const { return PointerEscapingInstr; }
- /// \brief Support for iterating over the slices.
+ /// Support for iterating over the slices.
/// @{
using iterator = SmallVectorImpl<Slice>::iterator;
using range = iterator_range<iterator>;
@@ -266,10 +262,10 @@ public:
const_iterator end() const { return Slices.end(); }
/// @}
- /// \brief Erase a range of slices.
+ /// Erase a range of slices.
void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
- /// \brief Insert new slices for this alloca.
+ /// Insert new slices for this alloca.
///
/// This moves the slices into the alloca's slices collection, and re-sorts
/// everything so that the usual ordering properties of the alloca's slices
@@ -278,7 +274,7 @@ public:
int OldSize = Slices.size();
Slices.append(NewSlices.begin(), NewSlices.end());
auto SliceI = Slices.begin() + OldSize;
- std::sort(SliceI, Slices.end());
+ llvm::sort(SliceI, Slices.end());
std::inplace_merge(Slices.begin(), SliceI, Slices.end());
}
@@ -287,10 +283,10 @@ public:
class partition_iterator;
iterator_range<partition_iterator> partitions();
- /// \brief Access the dead users for this alloca.
+ /// Access the dead users for this alloca.
ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
- /// \brief Access the dead operands referring to this alloca.
+ /// Access the dead operands referring to this alloca.
///
/// These are operands which cannot actually be used to refer to the
/// alloca as they are outside its range and the user doesn't correct for
@@ -316,11 +312,11 @@ private:
friend class AllocaSlices::SliceBuilder;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// \brief Handle to alloca instruction to simplify method interfaces.
+ /// Handle to alloca instruction to simplify method interfaces.
AllocaInst &AI;
#endif
- /// \brief The instruction responsible for this alloca not having a known set
+ /// The instruction responsible for this alloca not having a known set
/// of slices.
///
/// When an instruction (potentially) escapes the pointer to the alloca, we
@@ -328,7 +324,7 @@ private:
/// alloca. This will be null if the alloca slices are analyzed successfully.
Instruction *PointerEscapingInstr;
- /// \brief The slices of the alloca.
+ /// The slices of the alloca.
///
/// We store a vector of the slices formed by uses of the alloca here. This
/// vector is sorted by increasing begin offset, and then the unsplittable
@@ -336,7 +332,7 @@ private:
/// details.
SmallVector<Slice, 8> Slices;
- /// \brief Instructions which will become dead if we rewrite the alloca.
+ /// Instructions which will become dead if we rewrite the alloca.
///
/// Note that these are not separated by slice. This is because we expect an
/// alloca to be completely rewritten or not rewritten at all. If rewritten,
@@ -344,7 +340,7 @@ private:
/// they come from outside of the allocated space.
SmallVector<Instruction *, 8> DeadUsers;
- /// \brief Operands which will become dead if we rewrite the alloca.
+ /// Operands which will become dead if we rewrite the alloca.
///
/// These are operands that in their particular use can be replaced with
/// undef when we rewrite the alloca. These show up in out-of-bounds inputs
@@ -355,7 +351,7 @@ private:
SmallVector<Use *, 8> DeadOperands;
};
-/// \brief A partition of the slices.
+/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
@@ -371,32 +367,32 @@ private:
using iterator = AllocaSlices::iterator;
- /// \brief The beginning and ending offsets of the alloca for this
+ /// The beginning and ending offsets of the alloca for this
/// partition.
uint64_t BeginOffset, EndOffset;
- /// \brief The start and end iterators of this partition.
+ /// The start and end iterators of this partition.
iterator SI, SJ;
- /// \brief A collection of split slice tails overlapping the partition.
+ /// A collection of split slice tails overlapping the partition.
SmallVector<Slice *, 4> SplitTails;
- /// \brief Raw constructor builds an empty partition starting and ending at
+ /// Raw constructor builds an empty partition starting and ending at
/// the given iterator.
Partition(iterator SI) : SI(SI), SJ(SI) {}
public:
- /// \brief The start offset of this partition.
+ /// The start offset of this partition.
///
/// All of the contained slices start at or after this offset.
uint64_t beginOffset() const { return BeginOffset; }
- /// \brief The end offset of this partition.
+ /// The end offset of this partition.
///
/// All of the contained slices end at or before this offset.
uint64_t endOffset() const { return EndOffset; }
- /// \brief The size of the partition.
+ /// The size of the partition.
///
/// Note that this can never be zero.
uint64_t size() const {
@@ -404,7 +400,7 @@ public:
return EndOffset - BeginOffset;
}
- /// \brief Test whether this partition contains no slices, and merely spans
+ /// Test whether this partition contains no slices, and merely spans
/// a region occupied by split slices.
bool empty() const { return SI == SJ; }
@@ -421,7 +417,7 @@ public:
iterator end() const { return SJ; }
/// @}
- /// \brief Get the sequence of split slice tails.
+ /// Get the sequence of split slice tails.
///
/// These tails are of slices which start before this partition but are
/// split and overlap into the partition. We accumulate these while forming
@@ -429,7 +425,7 @@ public:
ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};
-/// \brief An iterator over partitions of the alloca's slices.
+/// An iterator over partitions of the alloca's slices.
///
/// This iterator implements the core algorithm for partitioning the alloca's
/// slices. It is a forward iterator as we don't support backtracking for
@@ -443,18 +439,18 @@ class AllocaSlices::partition_iterator
Partition> {
friend class AllocaSlices;
- /// \brief Most of the state for walking the partitions is held in a class
+ /// Most of the state for walking the partitions is held in a class
/// with a nice interface for examining them.
Partition P;
- /// \brief We need to keep the end of the slices to know when to stop.
+ /// We need to keep the end of the slices to know when to stop.
AllocaSlices::iterator SE;
- /// \brief We also need to keep track of the maximum split end offset seen.
+ /// We also need to keep track of the maximum split end offset seen.
/// FIXME: Do we really?
uint64_t MaxSplitSliceEndOffset = 0;
- /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// Sets the partition to be empty at given iterator, and sets the
/// end iterator.
partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
: P(SI), SE(SE) {
@@ -464,7 +460,7 @@ class AllocaSlices::partition_iterator
advance();
}
- /// \brief Advance the iterator to the next partition.
+ /// Advance the iterator to the next partition.
///
/// Requires that the iterator not be at the end of the slices.
void advance() {
@@ -619,7 +615,7 @@ public:
Partition &operator*() { return P; }
};
-/// \brief A forward range over the partitions of the alloca's slices.
+/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
@@ -643,7 +639,7 @@ static Value *foldSelectInst(SelectInst &SI) {
return nullptr;
}
-/// \brief A helper that folds a PHI node or a select.
+/// A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
if (PHINode *PN = dyn_cast<PHINode>(&I)) {
// If PN merges together the same value, return that value.
@@ -652,7 +648,7 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}
-/// \brief Builder for the alloca slices.
+/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
@@ -668,7 +664,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
- /// \brief Set to de-duplicate dead instructions found in the use walk.
+ /// Set to de-duplicate dead instructions found in the use walk.
SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
public:
@@ -687,11 +683,12 @@ private:
// Completely skip uses which have a zero size or start either before or
// past the end of the allocation.
if (Size == 0 || Offset.uge(AllocSize)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
- << " which has zero size or starts outside of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
+ << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
return markAsDead(I);
}
@@ -706,10 +703,11 @@ private:
// them, and so have to record at least the information here.
assert(AllocSize >= BeginOffset); // Established above.
if (Size > AllocSize - BeginOffset) {
- DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
- << " to remain within the " << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
+ << Offset << " to remain within the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
EndOffset = AllocSize;
}
@@ -802,18 +800,18 @@ private:
uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
// If this memory access can be shown to *statically* extend outside the
- // bounds of of the allocation, it's behavior is undefined, so simply
+ // bounds of the allocation, its behavior is undefined, so simply
// ignore it. Note that this is more strict than the generic clamping
// behavior of insertUse. We also try to handle cases which might run the
// risk of overflow.
// FIXME: We should instead consider the pointer to have escaped if this
// function is being instrumented for addressing bugs or race conditions.
if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
- << " which extends past the end of the " << AllocSize
- << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
+ << Offset << " which extends past the end of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
return markAsDead(SI);
}
@@ -1027,7 +1025,7 @@ private:
void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
- /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ /// Disable SROA entirely if there are unhandled users of the alloca.
void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
@@ -1062,7 +1060,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
// Sort the uses. This arranges for the offsets to be in ascending order,
// and the sizes to be in descending order.
- std::sort(Slices.begin(), Slices.end());
+ llvm::sort(Slices.begin(), Slices.end());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1240,7 +1238,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
}
static void speculatePHINodeLoads(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
IRBuilderTy PHIBuilder(&PN);
@@ -1263,10 +1261,21 @@ static void speculatePHINodeLoads(PHINode &PN) {
}
// Inject loads into all of the pred blocks.
+ DenseMap<BasicBlock*, Value*> InjectedLoads;
for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
BasicBlock *Pred = PN.getIncomingBlock(Idx);
- TerminatorInst *TI = Pred->getTerminator();
Value *InVal = PN.getIncomingValue(Idx);
+
+ // A PHI node is allowed to have multiple (duplicated) entries for the same
+ // basic block, as long as the value is the same. So if we already injected
+ // a load in the predecessor, then we should reuse the same load for all
+ // duplicated entries.
+ if (Value* V = InjectedLoads.lookup(Pred)) {
+ NewPN->addIncoming(V, Pred);
+ continue;
+ }
+
+ TerminatorInst *TI = Pred->getTerminator();
IRBuilderTy PredBuilder(TI);
LoadInst *Load = PredBuilder.CreateLoad(
@@ -1276,9 +1285,10 @@ static void speculatePHINodeLoads(PHINode &PN) {
if (AATags)
Load->setAAMetadata(AATags);
NewPN->addIncoming(Load, Pred);
+ InjectedLoads[Pred] = Load;
}
- DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
PN.eraseFromParent();
}
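LLVM IR lets a PHI list the same predecessor block more than once (for instance when a switch sends several cases to one destination), provided each duplicate carries the same incoming value; the InjectedLoads map above therefore creates at most one speculated load per predecessor and reuses it for the duplicates. A self-contained sketch of that lookup-before-create pattern, with plain strings standing in for the real block and load types:

    #include <string>
    #include <unordered_map>
    #include <vector>

    struct PhiEntry { std::string Pred; std::string Value; };

    // Collapse duplicated predecessor entries onto a single injected value,
    // mirroring the InjectedLoads map in the hunk above.
    std::vector<PhiEntry> injectOnce(const std::vector<std::string> &Preds) {
      std::unordered_map<std::string, std::string> Injected;
      std::vector<PhiEntry> Out;
      for (const std::string &P : Preds) {
        auto It = Injected.find(P);
        if (It == Injected.end())                       // first visit to P:
          It = Injected.emplace(P, P + ".load").first;  // "inject" a load there
        Out.push_back({P, It->second});                 // duplicates reuse it
      }
      return Out;
    }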
@@ -1318,7 +1328,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
}
static void speculateSelectInstLoads(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
IRBuilderTy IRB(&SI);
Value *TV = SI.getTrueValue();
@@ -1349,14 +1359,14 @@ static void speculateSelectInstLoads(SelectInst &SI) {
Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
LI->getName() + ".sroa.speculated");
- DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
LI->replaceAllUsesWith(V);
LI->eraseFromParent();
}
SI.eraseFromParent();
}
-/// \brief Build a GEP out of a base pointer and indices.
+/// Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
@@ -1374,7 +1384,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
NamePrefix + "sroa_idx");
}
-/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// Get a natural GEP off of the BasePtr walking through Ty toward
/// TargetTy without changing the offset of the pointer.
///
/// This routine assumes we've already established a properly offset GEP with
@@ -1423,7 +1433,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
return buildGEP(IRB, BasePtr, Indices, NamePrefix);
}
-/// \brief Recursively compute indices for a natural GEP.
+/// Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
@@ -1491,7 +1501,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
Indices, NamePrefix);
}
-/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// Get a natural GEP from a base pointer to a particular offset and
/// resulting in a particular type.
///
/// The goal is to produce a "natural" looking GEP that works with the existing
@@ -1526,7 +1536,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
Indices, NamePrefix);
}
-/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
///
/// This tries very hard to compute a "natural" GEP which arrives at the offset
@@ -1635,7 +1645,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
return Ptr;
}
-/// \brief Compute the adjusted alignment for a load or store from an offset.
+/// Compute the adjusted alignment for a load or store from an offset.
static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
const DataLayout &DL) {
unsigned Alignment;
@@ -1656,7 +1666,7 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
return MinAlign(Alignment, Offset);
}
-/// \brief Test whether we can convert a value from the old to the new type.
+/// Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
@@ -1707,7 +1717,7 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
return true;
}
-/// \brief Generic routine to convert an SSA value to a value of a different
+/// Generic routine to convert an SSA value to a value of a different
/// type.
///
/// This will try various different casting techniques, such as bitcasts,
@@ -1759,7 +1769,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
return IRB.CreateBitCast(V, NewTy);
}
-/// \brief Test whether the given slice use can be promoted to a vector.
+/// Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
@@ -1830,7 +1840,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
return true;
}
-/// \brief Test whether the given alloca partitioning and range of slices can be
+/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
@@ -1896,7 +1906,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
"All non-integer types eliminated!");
return RHSTy->getNumElements() < LHSTy->getNumElements();
};
- std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+ llvm::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
CandidateTys.erase(
std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
CandidateTys.end());
@@ -1943,7 +1953,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
return nullptr;
}
-/// \brief Test whether a slice of an alloca is valid for integer widening.
+/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
@@ -1970,6 +1980,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
// We can't handle loads that extend past the allocated memory.
if (DL.getTypeStoreSize(LI->getType()) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerLoad.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@@ -1991,6 +2005,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
// We can't handle stores that extend past the allocated memory.
if (DL.getTypeStoreSize(ValueTy) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerStore.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@@ -2021,7 +2039,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
return true;
}
-/// \brief Test whether the given alloca partition's integer operations can be
+/// Test whether the given alloca partition's integer operations can be
/// widened to promotable ones.
///
/// This is a quick test to check whether we can rewrite the integer loads and
@@ -2072,7 +2090,7 @@ static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
IntegerType *Ty, uint64_t Offset,
const Twine &Name) {
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
IntegerType *IntTy = cast<IntegerType>(V->getType());
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element extends past full value");
@@ -2081,13 +2099,13 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot extract to a larger integer!");
if (Ty != IntTy) {
V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
- DEBUG(dbgs() << " trunced: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
}
return V;
}
@@ -2098,10 +2116,10 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
IntegerType *Ty = cast<IntegerType>(V->getType());
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot insert a larger integer!");
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
if (Ty != IntTy) {
V = IRB.CreateZExt(V, IntTy, Name + ".ext");
- DEBUG(dbgs() << " extended: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
}
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element store outside of alloca store");
@@ -2110,15 +2128,15 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateShl(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
- DEBUG(dbgs() << " masked: " << *Old << "\n");
+ LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
V = IRB.CreateOr(Old, V, Name + ".insert");
- DEBUG(dbgs() << " inserted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
}
return V;
}
@@ -2135,7 +2153,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
if (NumElements == 1) {
V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
Name + ".extract");
- DEBUG(dbgs() << " extract: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
return V;
}
@@ -2145,7 +2163,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
Mask.push_back(IRB.getInt32(i));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".extract");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
@@ -2159,7 +2177,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// Single element to insert.
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
Name + ".insert");
- DEBUG(dbgs() << " insert: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
return V;
}
@@ -2184,7 +2202,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".expand");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
Mask.clear();
for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
@@ -2192,11 +2210,11 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
- DEBUG(dbgs() << " blend: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
return V;
}
-/// \brief Visitor to rewrite instructions using p particular slice of an alloca
+/// Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
///
/// Also implements the rewriting to vector-based accesses when the partition
@@ -2295,9 +2313,9 @@ public:
IsSplittable = I->isSplittable();
IsSplit =
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
- DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
- DEBUG(AS.printSlice(dbgs(), I, ""));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
+ LLVM_DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
@@ -2327,7 +2345,7 @@ private:
// Every instruction which can end up as a user must have a rewrite rule.
bool visitInstruction(Instruction &I) {
- DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
llvm_unreachable("No rewrite rule for this instruction!");
}
@@ -2369,7 +2387,7 @@ private:
);
}
- /// \brief Compute suitable alignment to access this slice of the *new*
+ /// Compute suitable alignment to access this slice of the *new*
/// alloca.
///
/// You can optionally pass a type to this routine and if that type's ABI
@@ -2431,10 +2449,13 @@ private:
}
bool visitLoadInst(LoadInst &LI) {
- DEBUG(dbgs() << " original: " << LI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+
unsigned AS = LI.getPointerAddressSpace();
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
@@ -2453,6 +2474,8 @@ private:
TargetTy->isIntegerTy()))) {
LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
@@ -2488,6 +2511,8 @@ private:
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
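Several hunks in this file thread alias-analysis metadata from the original memory operation onto whatever instruction replaces it, so TBAA and alias-scope information survives the rewrite. The pattern, as a fragment that assumes the rewriter context of the hunk above (the old LoadInst LI and its replacement NewLI):

    AAMDNodes AATags;
    LI.getAAMetadata(AATags);          // collect !tbaa, !alias.scope, !noalias
    // ... build the replacement instruction ...
    if (AATags)
      NewLI->setAAMetadata(AATags);    // re-attach the tags to the new load

The same get/set pair is applied below to the rewritten stores, memsets, memory transfers, and the FCA load/store splitters.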
@@ -2524,11 +2549,12 @@ private:
Pass.DeadInsts.insert(&LI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
return !LI.isVolatile() && !IsPtrAdjusted;
}
- bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
+ AAMDNodes AATags) {
if (V->getType() != VecTy) {
unsigned BeginIndex = getIndex(NewBeginOffset);
unsigned EndIndex = getIndex(NewEndOffset);
@@ -2546,14 +2572,15 @@ private:
V = insertVector(IRB, Old, V, BeginIndex, "vec");
}
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
- bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+ bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
@@ -2567,16 +2594,21 @@ private:
V = convertValue(DL, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
bool visitStoreInst(StoreInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
Value *OldOp = SI.getOperand(1);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+
Value *V = SI.getValueOperand();
// Strip all inbounds GEPs and pointer casts to try to dig out any root
@@ -2598,9 +2630,9 @@ private:
}
if (VecTy)
- return rewriteVectorizedStoreInst(V, SI, OldOp);
+ return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
if (IntTy && V->getType()->isIntegerTy())
- return rewriteIntegerStore(V, SI);
+ return rewriteIntegerStore(V, SI, AATags);
const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
StoreInst *NewSI;
@@ -2631,16 +2663,18 @@ private:
SI.isVolatile());
}
NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
if (SI.isVolatile())
NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
}
- /// \brief Compute an integer value from splatting an i8 across the given
+ /// Compute an integer value from splatting an i8 across the given
/// number of bytes.
///
/// Note that this routine assumes an i8 is a byte. If that isn't true, don't
@@ -2667,25 +2701,27 @@ private:
return V;
}
- /// \brief Compute a vector splat for a given element value.
+ /// Compute a vector splat for a given element value.
Value *getVectorSplat(Value *V, unsigned NumElements) {
V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
- DEBUG(dbgs() << " splat: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
return V;
}
bool visitMemSetInst(MemSetInst &II) {
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getRawDest() == OldPtr);
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
// If the memset has a variable size, it cannot be split, just adjust the
// pointer to the new alloca.
if (!isa<Constant>(II.getLength())) {
assert(!IsSplit);
assert(NewBeginOffset == BeginOffset);
II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+ II.setDestAlignment(getSliceAlign());
deleteIfTriviallyDead(OldPtr);
return false;
@@ -2710,8 +2746,9 @@ private:
CallInst *New = IRB.CreateMemSet(
getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
getSliceAlign(), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
@@ -2773,10 +2810,11 @@ private:
V = convertValue(DL, IRB, V, AllocaTy);
}
- Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
- II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ StoreInst *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return !II.isVolatile();
}
@@ -2784,7 +2822,10 @@ private:
// Rewriting of memory transfer instructions can be a bit tricky. We break
// them into two categories: split intrinsics and unsplit intrinsics.
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
bool IsDest = &II.getRawDestUse() == OldUse;
assert((IsDest && II.getRawDest() == OldPtr) ||
@@ -2801,18 +2842,16 @@ private:
// update both source and dest of a single call.
if (!IsSplittable) {
Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- if (IsDest)
+ if (IsDest) {
II.setDest(AdjustedPtr);
- else
+ II.setDestAlignment(SliceAlign);
+ }
+ else {
II.setSource(AdjustedPtr);
-
- if (II.getAlignment() > SliceAlign) {
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(
- ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+ II.setSourceAlignment(SliceAlign);
}
- DEBUG(dbgs() << " to: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << II << "\n");
deleteIfTriviallyDead(OldPtr);
return false;
}
@@ -2862,8 +2901,10 @@ private:
// Compute the relative offset for the other pointer within the transfer.
unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
- unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
- OtherOffset.zextOrTrunc(64).getZExtValue());
+ unsigned OtherAlign =
+ IsDest ? II.getSourceAlignment() : II.getDestAlignment();
+ OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1,
+ OtherOffset.zextOrTrunc(64).getZExtValue());
if (EmitMemCpy) {
// Compute the other pointer, folding as much as possible to produce
@@ -2875,11 +2916,25 @@ private:
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
- CallInst *New = IRB.CreateMemCpy(
- IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
- MinAlign(SliceAlign, OtherAlign), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ Value *DestPtr, *SrcPtr;
+ unsigned DestAlign, SrcAlign;
+ // Note: IsDest is true iff we're copying into the new alloca slice
+ if (IsDest) {
+ DestPtr = OurPtr;
+ DestAlign = SliceAlign;
+ SrcPtr = OtherPtr;
+ SrcAlign = OtherAlign;
+ } else {
+ DestPtr = OtherPtr;
+ DestAlign = OtherAlign;
+ SrcPtr = OurPtr;
+ SrcAlign = SliceAlign;
+ }
+ CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
+ Size, II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
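The memcpy emission above switches to the IRBuilder overload that takes separate destination and source alignments, in line with the earlier hunks in this function that call setDestAlignment and setSourceAlignment instead of a single shared alignment. As a fragment under the same assumptions as the hunk (IRB and the pointers, alignments and size already computed):

    // The intrinsic now carries one alignment per pointer operand, so the two
    // sides no longer need to be collapsed with MinAlign().
    CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
                                     Size, II.isVolatile());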
@@ -2927,8 +2982,11 @@ private:
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {
- Src =
- IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+ LoadInst *Load = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
+ "copyload");
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ Src = Load;
}
if (VecTy && !IsWholeAlloca && IsDest) {
@@ -2946,15 +3004,16 @@ private:
StoreInst *Store = cast<StoreInst>(
IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return !II.isVolatile();
}
bool visitIntrinsicInst(IntrinsicInst &II) {
assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
II.getIntrinsicID() == Intrinsic::lifetime_end);
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getArgOperand(1) == OldPtr);
// Record this instruction for deletion.
@@ -2982,13 +3041,13 @@ private:
New = IRB.CreateLifetimeEnd(Ptr, Size);
(void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return true;
}
bool visitPHINode(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
@@ -3007,7 +3066,7 @@ private:
// Replace the operands which were using the old pointer.
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
- DEBUG(dbgs() << " to: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
deleteIfTriviallyDead(OldPtr);
// PHIs can't be promoted on their own, but often can be speculated. We
@@ -3018,7 +3077,7 @@ private:
}
bool visitSelectInst(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
"Pointer isn't an operand!");
assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
@@ -3031,7 +3090,7 @@ private:
if (SI.getOperand(2) == OldPtr)
SI.setOperand(2, NewPtr);
- DEBUG(dbgs() << " to: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
deleteIfTriviallyDead(OldPtr);
// Selects can't be promoted on their own, but often can be speculated. We
@@ -3044,7 +3103,7 @@ private:
namespace {
-/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+/// Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
@@ -3067,7 +3126,7 @@ public:
/// Rewrite loads and stores through a pointer and all pointers derived from
/// it.
bool rewrite(Instruction &I) {
- DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
enqueueUsers(I);
bool Changed = false;
while (!Queue.empty()) {
@@ -3089,7 +3148,7 @@ private:
// Conservative default is to not rewrite anything.
bool visitInstruction(Instruction &I) { return false; }
- /// \brief Generic recursive split emission class.
+ /// Generic recursive split emission class.
template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
@@ -3113,7 +3172,7 @@ private:
: IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
public:
- /// \brief Generic recursive split emission routine.
+ /// Generic recursive split emission routine.
///
/// This method recursively splits an aggregate op (load or store) into
/// scalar or vector ops. It splits recursively until it hits a single value
@@ -3165,8 +3224,10 @@ private:
};
struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
- LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+ AAMDNodes AATags;
+
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
/// recursive emission to actually load values.
@@ -3175,9 +3236,11 @@ private:
// Load the single value and insert it using the indices.
Value *GEP =
IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
- Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+ LoadInst *Load = IRB.CreateLoad(GEP, Name + ".load");
+ if (AATags)
+ Load->setAAMetadata(AATags);
Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
- DEBUG(dbgs() << " to: " << *Load << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
}
};
@@ -3187,8 +3250,10 @@ private:
return false;
// We have an aggregate being loaded, split it apart.
- DEBUG(dbgs() << " original: " << LI << "\n");
- LoadOpSplitter Splitter(&LI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+ LoadOpSplitter Splitter(&LI, *U, AATags);
Value *V = UndefValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
LI.replaceAllUsesWith(V);
@@ -3197,8 +3262,9 @@ private:
}
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
- StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
+ AAMDNodes AATags;
/// Emit a leaf store of a single value. This is called at the leaves of the
/// recursive emission to actually produce stores.
@@ -3212,9 +3278,10 @@ private:
IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
Value *InBoundsGEP =
IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
- Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ StoreInst *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
}
};
@@ -3226,8 +3293,10 @@ private:
return false;
// We have an aggregate being stored, split it apart.
- DEBUG(dbgs() << " original: " << SI << "\n");
- StoreOpSplitter Splitter(&SI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ StoreOpSplitter Splitter(&SI, *U, AATags);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
SI.eraseFromParent();
return true;
@@ -3256,7 +3325,7 @@ private:
} // end anonymous namespace
-/// \brief Strip aggregate type wrapping.
+/// Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
@@ -3286,7 +3355,7 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
return stripAggregateTypeWrapping(DL, InnerTy);
}
-/// \brief Try to find a partition of the aggregate type passed in for a given
+/// Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
@@ -3392,7 +3461,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return SubTy;
}
-/// \brief Pre-split loads and stores to simplify rewriting.
+/// Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
@@ -3423,7 +3492,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
///
/// \returns true if any changes are made.
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
- DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+ LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
// Track the loads and stores which are candidates for pre-splitting here, in
// the order they first appear during the partition scan. These give stable
@@ -3455,7 +3524,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// maybe it would make it more principled?
SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
- DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
for (auto &P : AS.partitions()) {
for (Slice &S : P) {
Instruction *I = cast<Instruction>(S.getUse()->getUser());
@@ -3510,7 +3579,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
// Record the initial split.
- DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
auto &Offsets = SplitOffsetsMap[I];
assert(Offsets.Splits.empty() &&
"Should not have splits the first time we see an instruction!");
@@ -3570,10 +3639,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
if (LoadOffsets.Splits == StoreOffsets.Splits)
return false;
- DEBUG(dbgs()
- << " Mismatched splits for load and store:\n"
- << " " << *LI << "\n"
- << " " << *SI << "\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
// We've found a store and load that we need to split
// with mismatched relative splits. Just give up on them
@@ -3646,7 +3716,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
IRB.SetInsertPoint(LI);
- DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
int Idx = 0, Size = Offsets.Splits.size();
@@ -3656,7 +3726,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto *PartPtrTy = PartTy->getPointerTo(AS);
LoadInst *PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
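The presplitting hunks size the GEP offset APInt with DL.getIndexSizeInBits(AS) rather than DL.getPointerSizeInBits(AS). The index width is the width that actually participates in address arithmetic for an address space, and on targets with fat or otherwise non-integral pointers it can be narrower than the pointer's storage width. A fragment under the hunk's assumptions (DL, AS, BasePtr, PartPtrTy and PartOffset as above):

    // Size the offset by the address space's index width, not by the width of
    // the pointer representation; the two can differ on some targets.
    APInt Offset(DL.getIndexSizeInBits(AS), PartOffset);
    Value *Ptr = getAdjustedPtr(IRB, DL, BasePtr, Offset, PartPtrTy,
                                BasePtr->getName() + ".");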
@@ -3671,9 +3741,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PLoad
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PLoad << "\n");
// See if we've handled all the splits.
if (Idx >= Size)
@@ -3693,14 +3763,15 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *SI = cast<StoreInst>(LU);
if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
DeferredStores = true;
- DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
+ << "\n");
continue;
}
Value *StoreBasePtr = SI->getPointerOperand();
IRB.SetInsertPoint(SI);
- DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
LoadInst *PLoad = SplitLoads[Idx];
@@ -3712,11 +3783,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *PStore = IRB.CreateAlignedStore(
PLoad,
getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
- DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
}
// We want to immediately iterate on any allocas impacted by splitting
@@ -3765,7 +3836,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Value *LoadBasePtr = LI->getPointerOperand();
Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
- DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
// Check whether we have an already split load.
auto SplitLoadsMapI = SplitLoadsMap.find(LI);
@@ -3775,7 +3846,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
"Too few split loads for the number of splits in the store!");
} else {
- DEBUG(dbgs() << " of load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
}
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
@@ -3794,7 +3865,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto AS = LI->getPointerAddressSpace();
PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
LoadPartPtrTy, LoadBasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
@@ -3806,7 +3877,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *PStore = IRB.CreateAlignedStore(
PLoad,
getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
StorePartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
@@ -3815,11 +3886,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PStore->getOperandUse(PStore->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PStore
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PStore << "\n");
if (!SplitLoads) {
- DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
}
// See if we've finished all the splits.
@@ -3874,10 +3945,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// sequence.
AS.insert(NewSlices);
- DEBUG(dbgs() << " Pre-split slices:\n");
+ LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
#ifndef NDEBUG
for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
- DEBUG(AS.print(dbgs(), I, " "));
+ LLVM_DEBUG(AS.print(dbgs(), I, " "));
#endif
// Finally, don't try to promote any allocas that new require re-splitting.
@@ -3891,7 +3962,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
return true;
}
-/// \brief Rewrite an alloca partition's users.
+/// Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
@@ -3934,10 +4005,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// exact same type as the original, and with the same access offsets. In that
// case, re-use the existing alloca, but still run through the rewriter to
// perform phi and select speculation.
+ // P.beginOffset() can be non-zero even with the same type in a case with
+ // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
AllocaInst *NewAI;
- if (SliceTy == AI.getAllocatedType()) {
- assert(P.beginOffset() == 0 &&
- "Non-zero begin offset but same alloca type");
+ if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
// FIXME: We might want to defer PHI speculation until after here.
@@ -3958,12 +4029,14 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
NewAI = new AllocaInst(
SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ // Copy the old AI debug location over to the new one.
+ NewAI->setDebugLoc(AI.getDebugLoc());
++NumNewAllocas;
}
- DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << P.beginOffset() << "," << P.endOffset()
- << ") to: " << *NewAI << "\n");
+ LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
// Track the high watermark on the worklist as it is only relevant for
// promoted allocas. We will reset it to this point if the alloca is not in
@@ -4040,7 +4113,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return NewAI;
}
-/// \brief Walks the slices of an alloca and form partitions based on them,
+/// Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (AS.begin() == AS.end())
@@ -4063,7 +4136,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
const uint64_t MaxBitVectorSize = 1024;
- if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) {
+ if (AllocaSize <= MaxBitVectorSize) {
// If a byte boundary is included in any load or store, a slice starting or
// ending at the boundary is not splittable.
SmallBitVector SplittableOffset(AllocaSize + 1, true);
@@ -4106,7 +4179,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
if (!IsSorted)
- std::sort(AS.begin(), AS.end());
+ llvm::sort(AS.begin(), AS.end());
/// Describes the allocas introduced by rewritePartition in order to migrate
/// the debug info.
@@ -4201,7 +4274,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
return Changed;
}
-/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+/// Clobber a use with undef, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
Value *OldV = U;
// Replace the use with an undef value.
@@ -4216,13 +4289,13 @@ void SROA::clobberUse(Use &U) {
}
}
-/// \brief Analyze an alloca for SROA.
+/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
- DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
++NumAllocasAnalyzed;
// Special case dead allocas, as they're trivial.
@@ -4246,7 +4319,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
// Build the slices using a recursive instruction-visiting builder.
AllocaSlices AS(DL, AI);
- DEBUG(AS.print(dbgs()));
+ LLVM_DEBUG(AS.print(dbgs()));
if (AS.isEscaped())
return Changed;
@@ -4274,18 +4347,18 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
Changed |= splitAlloca(AI, AS);
- DEBUG(dbgs() << " Speculating PHIs\n");
+ LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
while (!SpeculatablePHIs.empty())
speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
- DEBUG(dbgs() << " Speculating Selects\n");
+ LLVM_DEBUG(dbgs() << " Speculating Selects\n");
while (!SpeculatableSelects.empty())
speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
return Changed;
}
-/// \brief Delete the dead instructions accumulated in this run.
+/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
@@ -4299,7 +4372,7 @@ bool SROA::deleteDeadInstructions(
bool Changed = false;
while (!DeadInsts.empty()) {
Instruction *I = DeadInsts.pop_back_val();
- DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
// If the instruction is an alloca, find the possible dbg.declare connected
// to it, and remove it too. We must do this before calling RAUW or we will
@@ -4327,7 +4400,7 @@ bool SROA::deleteDeadInstructions(
return Changed;
}
-/// \brief Promote the allocas, using the best available technique.
+/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
@@ -4338,7 +4411,7 @@ bool SROA::promoteAllocas(Function &F) {
NumPromoted += PromotableAllocas.size();
- DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
@@ -4346,7 +4419,7 @@ bool SROA::promoteAllocas(Function &F) {
PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
AssumptionCache &RunAC) {
- DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
C = &F.getContext();
DT = &RunDT;
AC = &RunAC;
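
The SROA.cpp hunks above are dominated by the mechanical DEBUG -> LLVM_DEBUG migration. As a minimal illustrative sketch (not part of the patch; the pass name and message below are placeholders), the macro is used like this and is gated both at compile time (NDEBUG) and at run time via -debug / -debug-only:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "sroa"

static void emitExampleDebugMessage(int Count) {
  // Expands to nothing in NDEBUG builds; otherwise the body runs only when
  // debug output is enabled, e.g. `opt -debug-only=sroa ...`.
  LLVM_DEBUG(llvm::dbgs() << "processed " << Count << " slices\n");
}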
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
index 3b99ddff2e06..526487d3477e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -45,6 +45,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeScalarizerPass(Registry);
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
+ initializeLoopGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
initializeNewGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
@@ -52,9 +53,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeGVNHoistLegacyPassPass(Registry);
initializeGVNSinkLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
- initializeInductiveRangeCheckEliminationPass(Registry);
+ initializeIRCELegacyPassPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
initializeInferAddressSpacesPass(Registry);
+ initializeInstSimplifyLegacyPassPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
@@ -68,6 +70,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
initializeLoopUnrollPass(Registry);
+ initializeLoopUnrollAndJamPass(Registry);
initializeLoopUnswitchPass(Registry);
initializeLoopVersioningLICMPass(Registry);
initializeLoopIdiomRecognizeLegacyPassPass(Registry);
@@ -83,7 +86,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeRegToMemPass(Registry);
initializeRewriteStatepointsForGCLegacyPassPass(Registry);
initializeSCCPLegacyPassPass(Registry);
- initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
@@ -104,6 +106,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializePostInlineEntryExitInstrumenterPass(Registry);
}
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
initializeScalarOpts(*unwrap(R));
}
@@ -148,10 +154,6 @@ void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createIndVarSimplifyPass());
}
-void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createInstructionCombiningPass());
-}
-
void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createJumpThreadingPass());
}
@@ -180,14 +182,14 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopRerollPass());
}
-void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopSimplifyCFGPass());
-}
-
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnrollPass());
}
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollAndJamPass());
+}
+
void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnswitchPass());
}
@@ -200,14 +202,6 @@ void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createPartiallyInlineLibCallsPass());
}
-void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerSwitchPass());
-}
-
-void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPromoteMemoryToRegisterPass());
-}
-
void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createReassociatePass());
}
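
The Scalar.cpp hunks above reshuffle the legacy pass-manager C bindings (some wrappers move to other translation units) and add new ones such as LLVMAddLoopUnrollAndJamPass. A minimal client-side sketch, assuming the corresponding declarations from llvm-c/Transforms/Scalar.h and an already-created LLVMModuleRef M:

#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Scalar.h"

void runScalarPasses(LLVMModuleRef M) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddLoopUnrollPass(PM);        // long-standing binding
  LLVMAddLoopUnrollAndJamPass(PM);  // binding added by this change
  LLVMRunPassManager(PM, M);        // run the scheduled passes over the module
  LLVMDisposePassManager(PM);
}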
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 4a96e0ddca16..967f4a42a8fb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -165,8 +165,8 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -190,7 +190,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>
@@ -213,7 +212,7 @@ static cl::opt<bool>
namespace {
-/// \brief A helper class for separating a constant offset from a GEP index.
+/// A helper class for separating a constant offset from a GEP index.
///
/// In real programs, a GEP index may be more complicated than a simple addition
/// of something and a constant integer which can be trivially splitted. For
@@ -340,16 +339,15 @@ private:
const DominatorTree *DT;
};
-/// \brief A pass that tries to split every GEP in the function into a variadic
+/// A pass that tries to split every GEP in the function into a variadic
/// base and a constant offset. It is a FunctionPass because searching for the
/// constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEP : public FunctionPass {
public:
static char ID;
- SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
- bool LowerGEP = false)
- : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
+ SeparateConstOffsetFromGEP(bool LowerGEP = false)
+ : FunctionPass(ID), LowerGEP(LowerGEP) {
initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
}
@@ -450,7 +448,6 @@ private:
const DataLayout *DL = nullptr;
DominatorTree *DT = nullptr;
ScalarEvolution *SE;
- const TargetMachine *TM;
LoopInfo *LI;
TargetLibraryInfo *TLI;
@@ -480,10 +477,8 @@ INITIALIZE_PASS_END(
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
-FunctionPass *
-llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
- bool LowerGEP) {
- return new SeparateConstOffsetFromGEP(TM, LowerGEP);
+FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
+ return new SeparateConstOffsetFromGEP(LowerGEP);
}
bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -502,6 +497,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
// Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
// don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ // FIXME: this does not appear to be covered by any tests
+ // (with x86/aarch64 backends at least)
if (BO->getOpcode() == Instruction::Or &&
!haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
return false;
@@ -590,6 +587,10 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
// Trace into subexpressions for more hoisting opportunities.
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<TruncInst>(V)) {
+ ConstantOffset =
+ find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
+ .trunc(BitWidth);
} else if (isa<SExtInst>(V)) {
ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
ZeroExtended, NonNegative).sext(BitWidth);
@@ -654,8 +655,9 @@ ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
}
if (CastInst *Cast = dyn_cast<CastInst>(U)) {
- assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
- "We only traced into two types of CastInst: sext and zext");
+ assert(
+ (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
+ "Only following instructions can be traced: sext, zext & trunc");
ExtInsts.push_back(Cast);
UserChain[ChainIndex] = nullptr;
return distributeExtsAndCloneChain(ChainIndex - 1);
@@ -706,7 +708,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
if (BO->getOpcode() == Instruction::Or) {
// Rebuild "or" as "add", because "or" may be invalid for the new
- // epxression.
+ // expression.
//
// For instance, given
// a | (b + 5) where a and b + 5 have no common bits,
@@ -943,6 +945,10 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (!NeedsExtraction)
return Changed;
+
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*GEP->getFunction());
+
// If LowerGEP is disabled, before really splitting the GEP, check whether the
// backend supports the addressing mode we are about to produce. If no, this
// splitting probably won't be beneficial.
@@ -951,9 +957,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// of variable indices. Therefore, we don't check for addressing modes in that
// case.
if (!LowerGEP) {
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *GEP->getParent()->getParent());
unsigned AddrSpace = GEP->getPointerAddressSpace();
if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
/*BaseGV=*/nullptr, AccumulativeByteOffset,
@@ -1016,7 +1019,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (LowerGEP) {
// As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
// arithmetic operations if the target uses alias analysis in codegen.
- if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
+ if (TTI.useAA())
lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
else
lowerToArithmetics(GEP, AccumulativeByteOffset);
@@ -1065,7 +1068,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
DL->getTypeAllocSize(GEP->getResultElementType()));
Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
- // Very likely. As long as %gep is natually aligned, the byte offset we
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
// extracted should be a multiple of sizeof(*%gep).
int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
@@ -1295,7 +1298,7 @@ void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
// We changed p+o+c to p+c+o, p+c may not be inbound anymore.
const DataLayout &DAL = First->getModule()->getDataLayout();
- APInt Offset(DAL.getPointerSizeInBits(
+ APInt Offset(DAL.getIndexSizeInBits(
cast<PointerType>(First->getType())->getAddressSpace()),
0);
Value *NewBase =
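
Several hunks in SROA.cpp and SeparateConstOffsetFromGEP.cpp above switch offset construction from DataLayout::getPointerSizeInBits to DataLayout::getIndexSizeInBits. A hypothetical helper sketching the distinction (the function name is illustrative, not from the patch):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DataLayout.h"

// Builds an address-arithmetic offset sized to the index width of the given
// address space. On most targets this equals the pointer width, but it can be
// narrower (e.g. fat or tagged pointers), which is why GEP-style offset math
// uses the index width rather than the full pointer width.
llvm::APInt makeGEPOffset(const llvm::DataLayout &DL, unsigned AddrSpace,
                          uint64_t Offset) {
  return llvm::APInt(DL.getIndexSizeInBits(AddrSpace), Offset);
}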
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index aba732bc413f..34510cb40732 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1,4 +1,4 @@
-//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,10 +17,14 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -66,180 +70,65 @@ static cl::opt<int>
UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
cl::desc("The cost threshold for unswitching a loop."));
-static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
- Constant &Replacement) {
- assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
-
- // Replace uses of LIC in the loop with the given constant.
- for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
- // Grab the use and walk past it so we can clobber it in the use list.
- Use *U = &*UI++;
- Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- if (!UserI || !L.contains(UserI))
- continue;
-
- // Replace this use within the loop body.
- *U = &Replacement;
- }
-}
-
-/// Update the IDom for a basic block whose predecessor set has changed.
-///
-/// This routine is designed to work when the domtree update is relatively
-/// localized by leveraging a known common dominator, often a loop header.
+/// Collect all of the loop invariant input values transitively used by the
+/// homogeneous instruction graph from a given root.
///
-/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS
-/// approach here as we can do that easily by persisting the candidate IDom's
-/// dominating set between each predecessor.
-///
-/// FIXME: Longer term, many uses of this can be replaced by an incremental
-/// domtree update strategy that starts from a known dominating block and
-/// rebuilds that subtree.
-static bool updateIDomWithKnownCommonDominator(BasicBlock *BB,
- BasicBlock *KnownDominatingBB,
- DominatorTree &DT) {
- assert(pred_begin(BB) != pred_end(BB) &&
- "This routine does not handle unreachable blocks!");
-
- BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock();
-
- BasicBlock *IDom = *pred_begin(BB);
- assert(DT.dominates(KnownDominatingBB, IDom) &&
- "Bad known dominating block!");
-
- // Walk all of the other predecessors finding the nearest common dominator
- // until all predecessors are covered or we reach the loop header. The loop
- // header necessarily dominates all loop exit blocks in loop simplified form
- // so we can early-exit the moment we hit that block.
- for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB);
- PI != PE && IDom != KnownDominatingBB; ++PI) {
- assert(DT.dominates(KnownDominatingBB, *PI) &&
- "Bad known dominating block!");
- IDom = DT.findNearestCommonDominator(IDom, *PI);
- }
+/// This essentially walks from a root recursively through loop variant operands
+/// which have the exact same opcode and finds all inputs which are loop
+/// invariant. For some operations these can be re-associated and unswitched out
+/// of the loop entirely.
+static TinyPtrVector<Value *>
+collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
+ LoopInfo &LI) {
+ assert(!L.isLoopInvariant(&Root) &&
+ "Only need to walk the graph if root itself is not invariant.");
+ TinyPtrVector<Value *> Invariants;
+
+ // Build a worklist and recurse through operators collecting invariants.
+ SmallVector<Instruction *, 4> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(&Root);
+ Visited.insert(&Root);
+ do {
+ Instruction &I = *Worklist.pop_back_val();
+ for (Value *OpV : I.operand_values()) {
+ // Skip constants as unswitching isn't interesting for them.
+ if (isa<Constant>(OpV))
+ continue;
- if (IDom == OrigIDom)
- return false;
+ // Add it to our result if loop invariant.
+ if (L.isLoopInvariant(OpV)) {
+ Invariants.push_back(OpV);
+ continue;
+ }
- DT.changeImmediateDominator(BB, IDom);
- return true;
-}
+ // If not an instruction with the same opcode, nothing we can do.
+ Instruction *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI || OpI->getOpcode() != Root.getOpcode())
+ continue;
-// Note that we don't currently use the IDFCalculator here for two reasons:
-// 1) It computes dominator tree levels for the entire function on each run
-// of 'compute'. While this isn't terrible, given that we expect to update
-// relatively small subtrees of the domtree, it isn't necessarily the right
-// tradeoff.
-// 2) The interface doesn't fit this usage well. It doesn't operate in
-// append-only, and builds several sets that we don't need.
-//
-// FIXME: Neither of these issues are a big deal and could be addressed with
-// some amount of refactoring of IDFCalculator. That would allow us to share
-// the core logic here (which is solving the same core problem).
-static void appendDomFrontier(DomTreeNode *Node,
- SmallSetVector<BasicBlock *, 4> &Worklist,
- SmallVectorImpl<DomTreeNode *> &DomNodes,
- SmallPtrSetImpl<BasicBlock *> &DomSet) {
- assert(DomNodes.empty() && "Must start with no dominator nodes.");
- assert(DomSet.empty() && "Must start with an empty dominator set.");
-
- // First flatten this subtree into sequence of nodes by doing a pre-order
- // walk.
- DomNodes.push_back(Node);
- // We intentionally re-evaluate the size as each node can add new children.
- // Because this is a tree walk, this cannot add any duplicates.
- for (int i = 0; i < (int)DomNodes.size(); ++i)
- DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
-
- // Now create a set of the basic blocks so we can quickly test for
- // dominated successors. We could in theory use the DFS numbers of the
- // dominator tree for this, but we want this to remain predictably fast
- // even while we mutate the dominator tree in ways that would invalidate
- // the DFS numbering.
- for (DomTreeNode *InnerN : DomNodes)
- DomSet.insert(InnerN->getBlock());
-
- // Now re-walk the nodes, appending every successor of every node that isn't
- // in the set. Note that we don't append the node itself, even though if it
- // is a successor it does not strictly dominate itself and thus it would be
- // part of the dominance frontier. The reason we don't append it is that
- // the node passed in came *from* the worklist and so it has already been
- // processed.
- for (DomTreeNode *InnerN : DomNodes)
- for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
- if (!DomSet.count(SuccBB))
- Worklist.insert(SuccBB);
-
- DomNodes.clear();
- DomSet.clear();
-}
+ // Visit this operand.
+ if (Visited.insert(OpI).second)
+ Worklist.push_back(OpI);
+ }
+ } while (!Worklist.empty());
-/// Update the dominator tree after unswitching a particular former exit block.
-///
-/// This handles the full update of the dominator tree after hoisting a block
-/// that previously was an exit block (or split off of an exit block) up to be
-/// reached from the new immediate dominator of the preheader.
-///
-/// The common case is simple -- we just move the unswitched block to have an
-/// immediate dominator of the old preheader. But in complex cases, there may
-/// be other blocks reachable from the unswitched block that are immediately
-/// dominated by some node between the unswitched one and the old preheader.
-/// All of these also need to be hoisted in the dominator tree. We also want to
-/// minimize queries to the dominator tree because each step of this
-/// invalidates any DFS numbers that would make queries fast.
-static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
- DominatorTree &DT) {
- DomTreeNode *OldPHNode = DT[OldPH];
- DomTreeNode *UnswitchedNode = DT[UnswitchedBB];
- // If the dominator tree has already been updated for this unswitched node,
- // we're done. This makes it easier to use this routine if there are multiple
- // paths to the same unswitched destination.
- if (UnswitchedNode->getIDom() == OldPHNode)
- return;
+ return Invariants;
+}
- // First collect the domtree nodes that we are hoisting over. These are the
- // set of nodes which may have children that need to be hoisted as well.
- SmallPtrSet<DomTreeNode *, 4> DomChain;
- for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode;
- IDom = IDom->getIDom())
- DomChain.insert(IDom);
-
- // The unswitched block ends up immediately dominated by the old preheader --
- // regardless of whether it is the loop exit block or split off of the loop
- // exit block.
- DT.changeImmediateDominator(UnswitchedNode, OldPHNode);
-
- // For everything that moves up the dominator tree, we need to examine the
- // dominator frontier to see if it additionally should move up the dominator
- // tree. This lambda appends the dominator frontier for a node on the
- // worklist.
- SmallSetVector<BasicBlock *, 4> Worklist;
-
- // Scratch data structures reused by domfrontier finding.
- SmallVector<DomTreeNode *, 4> DomNodes;
- SmallPtrSet<BasicBlock *, 4> DomSet;
-
- // Append the initial dom frontier nodes.
- appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet);
-
- // Walk the worklist. We grow the list in the loop and so must recompute size.
- for (int i = 0; i < (int)Worklist.size(); ++i) {
- auto *BB = Worklist[i];
-
- DomTreeNode *Node = DT[BB];
- assert(!DomChain.count(Node) &&
- "Cannot be dominated by a block you can reach!");
-
- // If this block had an immediate dominator somewhere in the chain
- // we hoisted over, then its position in the domtree needs to move as it is
- // reachable from a node hoisted over this chain.
- if (!DomChain.count(Node->getIDom()))
- continue;
+static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
+ Constant &Replacement) {
+ assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
- DT.changeImmediateDominator(Node, OldPHNode);
+ // Replace uses of LIC in the loop with the given constant.
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- // Now add this node's dominator frontier to the worklist as well.
- appendDomFrontier(Node, Worklist, DomNodes, DomSet);
+ // Replace this use within the loop body.
+ if (UserI && L.contains(UserI))
+ U->set(&Replacement);
}
}
@@ -261,6 +150,26 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
llvm_unreachable("Basic blocks should never be empty!");
}
+/// Insert code to test a set of loop invariant values, and conditionally branch
+/// on them.
+static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
+ ArrayRef<Value *> Invariants,
+ bool Direction,
+ BasicBlock &UnswitchedSucc,
+ BasicBlock &NormalSucc) {
+ IRBuilder<> IRB(&BB);
+ Value *Cond = Invariants.front();
+ for (Value *Invariant :
+ make_range(std::next(Invariants.begin()), Invariants.end()))
+ if (Direction)
+ Cond = IRB.CreateOr(Cond, Invariant);
+ else
+ Cond = IRB.CreateAnd(Cond, Invariant);
+
+ IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+ Direction ? &NormalSucc : &UnswitchedSucc);
+}
+
/// Rewrite the PHI nodes in an unswitched loop exit basic block.
///
/// Requires that the loop exit and unswitched basic block are the same, and
@@ -293,7 +202,8 @@ static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
BasicBlock &UnswitchedBB,
BasicBlock &OldExitingBB,
- BasicBlock &OldPH) {
+ BasicBlock &OldPH,
+ bool FullUnswitch) {
assert(&ExitBB != &UnswitchedBB &&
"Must have different loop exit and unswitched blocks!");
Instruction *InsertPt = &*UnswitchedBB.begin();
@@ -314,7 +224,11 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
if (PN.getIncomingBlock(i) != &OldExitingBB)
continue;
- Value *Incoming = PN.removeIncomingValue(i);
+ Value *Incoming = PN.getIncomingValue(i);
+ if (FullUnswitch)
+ // No more edge from the old exiting block to the exit block.
+ PN.removeIncomingValue(i);
+
NewPN->addIncoming(Incoming, &OldPH);
}
@@ -325,6 +239,76 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
}
}
+/// Hoist the current loop up to the innermost loop containing a remaining exit.
+///
+/// Because we've removed an exit from the loop, we may have changed the set of
+/// loops reachable and need to move the current loop up the loop nest or even
+/// to an entirely separate nest.
+static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
+ DominatorTree &DT, LoopInfo &LI) {
+ // If the loop is already at the top level, we can't hoist it anywhere.
+ Loop *OldParentL = L.getParentLoop();
+ if (!OldParentL)
+ return;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L.getExitBlocks(Exits);
+ Loop *NewParentL = nullptr;
+ for (auto *ExitBB : Exits)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB))
+ if (!NewParentL || NewParentL->contains(ExitL))
+ NewParentL = ExitL;
+
+ if (NewParentL == OldParentL)
+ return;
+
+ // The new parent loop (if different) should always contain the old one.
+ if (NewParentL)
+ assert(NewParentL->contains(OldParentL) &&
+ "Can only hoist this loop up the nest!");
+
+ // The preheader will need to move with the body of this loop. However,
+ // because it isn't in this loop we also need to update the primary loop map.
+ assert(OldParentL == LI.getLoopFor(&Preheader) &&
+ "Parent loop of this loop should contain this loop's preheader!");
+ LI.changeLoopFor(&Preheader, NewParentL);
+
+ // Remove this loop from its old parent.
+ OldParentL->removeChildLoop(&L);
+
+ // Add the loop either to the new parent or as a top-level loop.
+ if (NewParentL)
+ NewParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+
+ // Remove this loops blocks from the old parent and every other loop up the
+ // nest until reaching the new parent. Also update all of these
+ // no-longer-containing loops to reflect the nesting change.
+ for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
+ OldContainingL = OldContainingL->getParentLoop()) {
+ llvm::erase_if(OldContainingL->getBlocksVector(),
+ [&](const BasicBlock *BB) {
+ return BB == &Preheader || L.contains(BB);
+ });
+
+ OldContainingL->getBlocksSet().erase(&Preheader);
+ for (BasicBlock *BB : L.blocks())
+ OldContainingL->getBlocksSet().erase(BB);
+
+ // Because we just hoisted a loop out of this one, we have essentially
+ // created new exit paths from it. That means we need to form LCSSA PHI
+ // nodes for values used in the no-longer-nested loop.
+ formLCSSA(*OldContainingL, DT, &LI, nullptr);
+
+ // We shouldn't need to form dedicated exits because the exit introduced
+ // here is the (just split by unswitching) preheader. As such, it is
+ // necessarily dedicated.
+ assert(OldContainingL->hasDedicatedExits() &&
+ "Unexpected predecessor of hoisted loop preheader!");
+ }
+}
+
/// Unswitch a trivial branch if the condition is loop invariant.
///
/// This routine should only be called when loop code leading to the branch has
@@ -339,48 +323,83 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
/// (splitting the exit block as necessary). It simplifies the branch within
/// the loop to an unconditional branch but doesn't remove it entirely. Further
/// cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
- LoopInfo &LI) {
+ LoopInfo &LI, ScalarEvolution *SE) {
assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+ LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
- Value *LoopCond = BI.getCondition();
+ // The loop invariant values that we want to unswitch.
+ TinyPtrVector<Value *> Invariants;
- // Need a trivial loop condition to unswitch.
- if (!L.isLoopInvariant(LoopCond))
- return false;
+ // When true, we're fully unswitching the branch rather than just unswitching
+ // some input conditions to the branch.
+ bool FullUnswitch = false;
+
+ if (L.isLoopInvariant(BI.getCondition())) {
+ Invariants.push_back(BI.getCondition());
+ FullUnswitch = true;
+ } else {
+ if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
+ Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
+ if (Invariants.empty())
+ // Couldn't find invariant inputs!
+ return false;
+ }
- // FIXME: We should compute this once at the start and update it!
- SmallVector<BasicBlock *, 16> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
- // Check to see if a successor of the branch is guaranteed to
- // exit through a unique exit block without having any
- // side-effects. If so, determine the value of Cond that causes
- // it to do this.
- ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
- ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
+ // Check that one of the branch's successors exits, and which one.
+ bool ExitDirection = true;
int LoopExitSuccIdx = 0;
auto *LoopExitBB = BI.getSuccessor(0);
- if (!ExitBlockSet.count(LoopExitBB)) {
- std::swap(CondVal, Replacement);
+ if (L.contains(LoopExitBB)) {
+ ExitDirection = false;
LoopExitSuccIdx = 1;
LoopExitBB = BI.getSuccessor(1);
- if (!ExitBlockSet.count(LoopExitBB))
+ if (L.contains(LoopExitBB))
return false;
}
auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
- assert(L.contains(ContinueBB) &&
- "Cannot have both successors exit and still be in the loop!");
-
auto *ParentBB = BI.getParent();
if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
return false;
- DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
- << " == " << LoopCond << "\n");
+ // When unswitching only part of the branch's condition, we need the exit
+ // block to be reached directly from the partially unswitched input. This can
+ // be done when the exit block is along the true edge and the branch condition
+ // is a graph of `or` operations, or the exit block is along the false edge
+ // and the condition is a graph of `and` operations.
+ if (!FullUnswitch) {
+ if (ExitDirection) {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or)
+ return false;
+ } else {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And)
+ return false;
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << " unswitching trivial invariant conditions for: " << BI
+ << "\n";
+ for (Value *Invariant : Invariants) {
+ dbgs() << " " << *Invariant << " == true";
+ if (Invariant != Invariants.back())
+ dbgs() << " ||";
+ dbgs() << "\n";
+ }
+ });
+
+ // If we have scalar evolutions, we need to invalidate them including this
+ // loop and the loop containing the exit block.
+ if (SE) {
+ if (Loop *ExitL = LI.getLoopFor(LoopExitBB))
+ SE->forgetLoop(ExitL);
+ else
+ // Forget the entire nest as this exits the entire nest.
+ SE->forgetTopmostLoop(&L);
+ }
// Split the preheader, so that we know that there is a safe place to insert
// the conditional branch. We will change the preheader to have a conditional
@@ -393,45 +412,73 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// unswitching. We need to split this if there are other loop predecessors.
// Because the loop is in simplified form, *any* other predecessor is enough.
BasicBlock *UnswitchedBB;
- if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
- (void)PredBB;
- assert(PredBB == BI.getParent() &&
+ if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
+ assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
"A branch's parent isn't a predecessor!");
UnswitchedBB = LoopExitBB;
} else {
UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
}
- // Now splice the branch to gate reaching the new preheader and re-point its
- // successors.
- OldPH->getInstList().splice(std::prev(OldPH->end()),
- BI.getParent()->getInstList(), BI);
+ // Actually move the invariant uses into the unswitched position. If possible,
+ // we do this by moving the instructions, but when doing partial unswitching
+ // we do it by building a new merge of the values in the unswitched position.
OldPH->getTerminator()->eraseFromParent();
- BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
- BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
-
- // Create a new unconditional branch that will continue the loop as a new
- // terminator.
- BranchInst::Create(ContinueBB, ParentBB);
+ if (FullUnswitch) {
+ // If fully unswitching, we can use the existing branch instruction.
+ // Splice it into the old PH to gate reaching the new preheader and re-point
+ // its successors.
+ OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
+ BI);
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+ } else {
+ // Only unswitching a subset of inputs to the condition, so we will need to
+ // build a new branch that merges the invariant inputs.
+ if (ExitDirection)
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Must have an `or` of `i1`s for the condition!");
+ else
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Must have an `and` of `i1`s for the condition!");
+ buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
+ *UnswitchedBB, *NewPH);
+ }
// Rewrite the relevant PHI nodes.
if (UnswitchedBB == LoopExitBB)
rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
else
rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
- *ParentBB, *OldPH);
+ *ParentBB, *OldPH, FullUnswitch);
// Now we need to update the dominator tree.
- updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
- // But if we split something off of the loop exit block then we also removed
- // one of the predecessors for the loop exit block and may need to update its
- // idom.
- if (UnswitchedBB != LoopExitBB)
- updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT);
+ DT.insertEdge(OldPH, UnswitchedBB);
+ if (FullUnswitch)
+ DT.deleteEdge(ParentBB, UnswitchedBB);
+
+ // The constant we can replace all of our invariants with inside the loop
+ // body. If any of the invariants have a value other than this the loop won't
+ // be entered.
+ ConstantInt *Replacement = ExitDirection
+ ? ConstantInt::getFalse(BI.getContext())
+ : ConstantInt::getTrue(BI.getContext());
// Since this is an i1 condition we can also trivially replace uses of it
// within the loop with a constant.
- replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
+ for (Value *Invariant : Invariants)
+ replaceLoopInvariantUses(L, Invariant, *Replacement);
+
+ // If this was full unswitching, we may have changed the nesting relationship
+ // for this loop so hoist it to its correct parent if needed.
+ if (FullUnswitch)
+ hoistLoopToNewParent(L, *NewPH, DT, LI);
++NumTrivial;
++NumBranches;
@@ -461,9 +508,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
/// switch will not be revisited. If after unswitching there is only a single
/// in-loop successor, the switch is further simplified to an unconditional
/// branch. Still more cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
- LoopInfo &LI) {
- DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ LoopInfo &LI, ScalarEvolution *SE) {
+ LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
Value *LoopCond = SI.getCondition();
// If this isn't switching on an invariant condition, we can't unswitch it.
@@ -472,41 +522,62 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
auto *ParentBB = SI.getParent();
- // FIXME: We should compute this once at the start and update it!
- SmallVector<BasicBlock *, 16> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
SmallVector<int, 4> ExitCaseIndices;
for (auto Case : SI.cases()) {
auto *SuccBB = Case.getCaseSuccessor();
- if (ExitBlockSet.count(SuccBB) &&
+ if (!L.contains(SuccBB) &&
areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
ExitCaseIndices.push_back(Case.getCaseIndex());
}
BasicBlock *DefaultExitBB = nullptr;
- if (ExitBlockSet.count(SI.getDefaultDest()) &&
+ if (!L.contains(SI.getDefaultDest()) &&
areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
!isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
DefaultExitBB = SI.getDefaultDest();
else if (ExitCaseIndices.empty())
return false;
- DEBUG(dbgs() << " unswitching trivial cases...\n");
+ LLVM_DEBUG(dbgs() << " unswitching trivial cases...\n");
+
+ // We may need to invalidate SCEVs for the outermost loop reached by any of
+ // the exits.
+ Loop *OuterL = &L;
+ if (DefaultExitBB) {
+ // Clear out the default destination temporarily to allow accurate
+ // predecessor lists to be examined below.
+ SI.setDefaultDest(nullptr);
+ // Check the loop containing this exit.
+ Loop *ExitL = LI.getLoopFor(DefaultExitBB);
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
+ }
+
+ // Store the exit cases into a separate data structure and remove them from
+ // the switch.
SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
ExitCases.reserve(ExitCaseIndices.size());
// We walk the case indices backwards so that we remove the last case first
// and don't disrupt the earlier indices.
for (unsigned Index : reverse(ExitCaseIndices)) {
auto CaseI = SI.case_begin() + Index;
+ // Compute the outer loop from this exit.
+ Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
// Save the value of this case.
ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
// Delete the unswitched cases.
SI.removeCase(CaseI);
}
+ if (SE) {
+ if (OuterL)
+ SE->forgetLoop(OuterL);
+ else
+ SE->forgetTopmostLoop(&L);
+ }
+
// Check if after this all of the remaining cases point at the same
// successor.
BasicBlock *CommonSuccBB = nullptr;
@@ -517,23 +588,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SI.case_begin()->getCaseSuccessor();
}))
CommonSuccBB = SI.case_begin()->getCaseSuccessor();
-
- if (DefaultExitBB) {
- // We can't remove the default edge so replace it with an edge to either
- // the single common remaining successor (if we have one) or an unreachable
- // block.
- if (CommonSuccBB) {
- SI.setDefaultDest(CommonSuccBB);
- } else {
- BasicBlock *UnreachableBB = BasicBlock::Create(
- ParentBB->getContext(),
- Twine(ParentBB->getName()) + ".unreachable_default",
- ParentBB->getParent());
- new UnreachableInst(ParentBB->getContext(), UnreachableBB);
- SI.setDefaultDest(UnreachableBB);
- DT.addNewBlock(UnreachableBB, ParentBB);
- }
- } else {
+ if (!DefaultExitBB) {
// If we're not unswitching the default, we need it to match any cases to
// have a common successor or if we have no cases it is the common
// successor.
@@ -570,9 +625,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
} else {
auto *SplitBB =
SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
- rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
- *ParentBB, *OldPH);
- updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT);
+ rewritePHINodesForExitAndUnswitchedBlocks(
+ *DefaultExitBB, *SplitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
}
}
@@ -597,9 +651,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!SplitExitBB) {
// If this is the first time we see this, do the split and remember it.
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
- rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
- *ParentBB, *OldPH);
- updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT);
+ rewritePHINodesForExitAndUnswitchedBlocks(
+ *ExitBB, *SplitExitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
}
// Update the case pair to point to the split block.
CasePair.second = SplitExitBB;
@@ -612,14 +665,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
BasicBlock *UnswitchedBB = CasePair.second;
NewSI->addCase(CaseVal, UnswitchedBB);
- updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
}
// If the default was unswitched, re-point it and add explicit cases for
// entering the loop.
if (DefaultExitBB) {
NewSI->setDefaultDest(DefaultExitBB);
- updateDTAfterUnswitch(DefaultExitBB, OldPH, DT);
// We removed all the exit cases, so we just copy the cases to the
// unswitched switch.
@@ -633,11 +684,57 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// pointing at unreachable and other complexity.
if (CommonSuccBB) {
BasicBlock *BB = SI.getParent();
+ // We may have had multiple edges to this common successor block, so remove
+ // them as predecessors. We skip the first one, either the default or the
+ // actual first case.
+ bool SkippedFirst = DefaultExitBB == nullptr;
+ for (auto Case : SI.cases()) {
+ assert(Case.getCaseSuccessor() == CommonSuccBB &&
+ "Non-common successor!");
+ (void)Case;
+ if (!SkippedFirst) {
+ SkippedFirst = true;
+ continue;
+ }
+ CommonSuccBB->removePredecessor(BB,
+ /*DontDeleteUselessPHIs*/ true);
+ }
+ // Now nuke the switch and replace it with a direct branch.
SI.eraseFromParent();
BranchInst::Create(CommonSuccBB, BB);
+ } else if (DefaultExitBB) {
+ assert(SI.getNumCases() > 0 &&
+ "If we had no cases we'd have a common successor!");
+ // Move the last case to the default successor. This is valid as if the
+ // default got unswitched it cannot be reached. This has the advantage of
+ // being simple and keeping the number of edges from this switch to
+ // successors the same, and avoiding any PHI update complexity.
+ auto LastCaseI = std::prev(SI.case_end());
+ SI.setDefaultDest(LastCaseI->getCaseSuccessor());
+ SI.removeCase(LastCaseI);
}
- DT.verifyDomTree();
+ // Walk the unswitched exit blocks and the unswitched split blocks and update
+ // the dominator tree based on the CFG edits. While we are walking unordered
+ // containers here, the API for applyUpdates takes an unordered list of
+ // updates and requires them to not contain duplicates.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
+ }
+ for (auto SplitUnswitchedPair : SplitExitBBMap) {
+ auto *UnswitchedBB = SplitUnswitchedPair.second;
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB});
+ }
+ DT.applyUpdates(DTUpdates);
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ // We may have changed the nesting relationship for this loop so hoist it to
+ // its correct parent if needed.
+ hoistLoopToNewParent(L, *NewPH, DT, LI);
+
++NumTrivial;
++NumSwitches;
return true;
@@ -652,8 +749,11 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
///
/// The return value indicates whether anything was unswitched (and therefore
/// changed).
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
- LoopInfo &LI) {
+ LoopInfo &LI, ScalarEvolution *SE) {
bool Changed = false;
// If loop header has only one reachable successor we should keep looking for
@@ -687,8 +787,8 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
if (isa<Constant>(SI->getCondition()))
return Changed;
- if (!unswitchTrivialSwitch(L, *SI, DT, LI))
- // Coludn't unswitch this one so we're done.
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE))
+ // Couldn't unswitch this one so we're done.
return Changed;
// Mark that we managed to unswitch something.
@@ -719,17 +819,19 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
// Found a trivial condition candidate: non-foldable conditional branch. If
// we fail to unswitch this, we can't do anything else that is trivial.
- if (!unswitchTrivialBranch(L, *BI, DT, LI))
+ if (!unswitchTrivialBranch(L, *BI, DT, LI, SE))
return Changed;
// Mark that we managed to unswitch something.
Changed = true;
- // We unswitched the branch. This should always leave us with an
- // unconditional branch that we can follow now.
+ // If we only unswitched some of the conditions feeding the branch, we won't
+ // have collapsed it to a single successor.
BI = cast<BranchInst>(CurrentBB->getTerminator());
- assert(!BI->isConditional() &&
- "Cannot form a conditional branch by unswitching1");
+ if (BI->isConditional())
+ return Changed;
+
+ // Follow the newly unconditional branch into its successor.
CurrentBB = BI->getSuccessor(0);
// When continuing, if we exit the loop or reach a previous visited block,
@@ -748,8 +850,12 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
///
/// This routine handles cloning all of the necessary loop blocks and exit
/// blocks including rewriting their instructions and the relevant PHI nodes.
-/// It skips loop and exit blocks that are not necessary based on the provided
-/// set. It also correctly creates the unconditional branch in the cloned
+/// Any loop blocks or exit blocks which are dominated by a different successor
+/// than the one for this clone of the loop blocks can be trivially skipped. We
+/// use the `DominatingSucc` map to determine whether a block satisfies that
+/// property with a simple map lookup.
+///
+/// It also correctly creates the unconditional branch in the cloned
/// unswitched parent block to only point at the unswitched successor.
///
/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
@@ -763,9 +869,10 @@ static BasicBlock *buildClonedLoopBlocks(
Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
- const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks,
- ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT,
- LoopInfo &LI) {
+ const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
+ ValueToValueMapTy &VMap,
+ SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
+ DominatorTree &DT, LoopInfo &LI) {
SmallVector<BasicBlock *, 4> NewBlocks;
NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
@@ -780,26 +887,29 @@ static BasicBlock *buildClonedLoopBlocks(
NewBlocks.push_back(NewBB);
VMap[OldBB] = NewBB;
- // Add the block to the domtree. We'll move it to the correct position
- // below.
- DT.addNewBlock(NewBB, SplitBB);
-
return NewBB;
};
+ // We skip cloning blocks when they have a dominating succ that is not the
+ // succ we are cloning for.
+ auto SkipBlock = [&](BasicBlock *BB) {
+ auto It = DominatingSucc.find(BB);
+ return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
+ };
+
// First, clone the preheader.
auto *ClonedPH = CloneBlock(LoopPH);
// Then clone all the loop blocks, skipping the ones that aren't necessary.
for (auto *LoopBB : L.blocks())
- if (!SkippedLoopAndExitBlocks.count(LoopBB))
+ if (!SkipBlock(LoopBB))
CloneBlock(LoopBB);
// Split all the loop exit edges so that when we clone the exit blocks, if
// any of the exit blocks are *also* a preheader for some other loop, we
// don't create multiple predecessors entering the loop header.
for (auto *ExitBB : ExitBlocks) {
- if (SkippedLoopAndExitBlocks.count(ExitBB))
+ if (SkipBlock(ExitBB))
continue;
// When we are going to clone an exit, we don't need to clone all the
@@ -822,17 +932,6 @@ static BasicBlock *buildClonedLoopBlocks(
assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
"Cloned exit block has the wrong successor!");
- // Move the merge block's idom to be the split point as one exit is
- // dominated by one header, and the other by another, so we know the split
- // point dominates both. While the dominator tree isn't fully accurate, we
- // want sub-trees within the original loop to be correctly reflect
- // dominance within that original loop (at least) and that requires moving
- // the merge block out of that subtree.
- // FIXME: This is very brittle as we essentially have a partial contract on
- // the dominator tree. We really need to instead update it and keep it
- // valid or stop relying on it.
- DT.changeImmediateDominator(MergeBB, SplitBB);
-
// Remap any cloned instructions and create a merge phi node for them.
for (auto ZippedInsts : llvm::zip_first(
llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
@@ -872,28 +971,63 @@ static BasicBlock *buildClonedLoopBlocks(
AC.registerAssumption(II);
}
- // Remove the cloned parent as a predecessor of the cloned continue successor
- // if we did in fact clone it.
- auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
- if (auto *ClonedContinueSuccBB =
- cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
- ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
- /*DontDeleteUselessPHIs*/ true);
- // Replace the cloned branch with an unconditional branch to the cloneed
- // unswitched successor.
- auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
- ClonedParentBB->getTerminator()->eraseFromParent();
- BranchInst::Create(ClonedSuccBB, ClonedParentBB);
-
// Update any PHI nodes in the cloned successors of the skipped blocks to not
// have spurious incoming values.
for (auto *LoopBB : L.blocks())
- if (SkippedLoopAndExitBlocks.count(LoopBB))
+ if (SkipBlock(LoopBB))
for (auto *SuccBB : successors(LoopBB))
if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
for (PHINode &PN : ClonedSuccBB->phis())
PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+ // Remove the cloned parent as a predecessor of any successor we ended up
+ // cloning other than the unswitched one.
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+ for (auto *SuccBB : successors(ParentBB)) {
+ if (SuccBB == UnswitchedSuccBB)
+ continue;
+
+ auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
+ if (!ClonedSuccBB)
+ continue;
+
+ ClonedSuccBB->removePredecessor(ClonedParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ }
+
+ // Replace the cloned branch with an unconditional branch to the cloned
+ // unswitched successor.
+ auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+ ClonedParentBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
+ // If there are duplicate entries in the PHI nodes because of multiple edges
+ // to the unswitched successor, we need to nuke all but one as we replaced it
+ // with a direct branch.
+ for (PHINode &PN : ClonedSuccBB->phis()) {
+ bool Found = false;
+ // Loop over the incoming operands backwards so we can easily delete as we
+ // go without invalidating the index.
+ for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != ClonedParentBB)
+ continue;
+ if (!Found) {
+ Found = true;
+ continue;
+ }
+ PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
+ }
+ }
+
+ // Record the domtree updates for the new blocks.
+ SmallPtrSet<BasicBlock *, 4> SuccSet;
+ for (auto *ClonedBB : NewBlocks) {
+ for (auto *SuccBB : successors(ClonedBB))
+ if (SuccSet.insert(SuccBB).second)
+ DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
+ SuccSet.clear();
+ }
+
return ClonedPH;
}
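
The backwards walk over the PHI operands above is the usual way to erase entries while iterating by index: deleting element i only shifts elements above i, which have already been visited. The same idiom on a plain vector, as a self-contained sketch (toy data, not LLVM's PHINode interface):

#include <cassert>
#include <vector>

// Keep a single entry for Pred in an incoming-block list and erase the rest,
// walking backwards so each erase only shifts already-visited elements.
void removeDuplicateIncoming(std::vector<int> &Incoming, int Pred) {
  bool Found = false;
  for (int i = static_cast<int>(Incoming.size()) - 1; i >= 0; --i) {
    if (Incoming[i] != Pred)
      continue;
    if (!Found) {
      Found = true;  // retain exactly one entry for Pred
      continue;
    }
    Incoming.erase(Incoming.begin() + i);
  }
}

int main() {
  std::vector<int> Incoming = {1, 2, 1, 3, 1};
  removeDuplicateIncoming(Incoming, 1);
  assert(Incoming == (std::vector<int>{2, 3, 1}));
}
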
@@ -911,11 +1045,8 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
for (auto *BB : OrigL.blocks()) {
auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
ClonedL.addBlockEntry(ClonedBB);
- if (LI.getLoopFor(BB) == &OrigL) {
- assert(!LI.getLoopFor(ClonedBB) &&
- "Should not have an existing loop for this block!");
+ if (LI.getLoopFor(BB) == &OrigL)
LI.changeLoopFor(ClonedBB, &ClonedL);
- }
}
};
@@ -965,9 +1096,9 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
/// original loop, multiple cloned sibling loops may be created. All of them
/// are returned so that the newly introduced loop nest roots can be
/// identified.
-static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
- const ValueToValueMapTy &VMap, LoopInfo &LI,
- SmallVectorImpl<Loop *> &NonChildClonedLoops) {
+static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
+ const ValueToValueMapTy &VMap, LoopInfo &LI,
+ SmallVectorImpl<Loop *> &NonChildClonedLoops) {
Loop *ClonedL = nullptr;
auto *OrigPH = OrigL.getLoopPreheader();
@@ -1060,6 +1191,7 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
} else {
LI.addTopLevelLoop(ClonedL);
}
+ NonChildClonedLoops.push_back(ClonedL);
ClonedL->reserveBlocks(BlocksInClonedLoop.size());
// We don't want to just add the cloned loop blocks based on how we
@@ -1128,11 +1260,11 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
// matter as we're just trying to build up the map from inside-out; we use
// the map in a more stably ordered way below.
auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
- std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
- [&](BasicBlock *LHS, BasicBlock *RHS) {
- return ExitLoopMap.lookup(LHS)->getLoopDepth() <
- ExitLoopMap.lookup(RHS)->getLoopDepth();
- });
+ llvm::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
+ [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+ ExitLoopMap.lookup(RHS)->getLoopDepth();
+ });
// Populate the existing ExitLoopMap with everything reachable from each
// exit, starting from the inner most exit.
@@ -1212,60 +1344,69 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
NonChildClonedLoops.push_back(cloneLoopNest(
*ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
}
+}
+
+static void
+deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
+ DominatorTree &DT) {
+ // Find all the dead clones, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ for (auto &VMap : VMaps)
+ if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
+ if (!DT.isReachableFromEntry(ClonedBB)) {
+ for (BasicBlock *SuccBB : successors(ClonedBB))
+ SuccBB->removePredecessor(ClonedBB);
+ DeadBlocks.push_back(ClonedBB);
+ }
- // Return the main cloned loop if any.
- return ClonedL;
+ // Drop any remaining references to break cycles.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->dropAllReferences();
+ // Erase them from the IR.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->eraseFromParent();
}
-static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI) {
- // Walk the dominator tree to build up the set of blocks we will delete here.
- // The order is designed to allow us to always delete bottom-up and avoid any
- // dangling uses.
- SmallSetVector<BasicBlock *, 16> DeadBlocks;
- DeadBlocks.insert(DeadSubtreeRoot);
- for (int i = 0; i < (int)DeadBlocks.size(); ++i)
- for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) {
- // FIXME: This assert should pass and that means we don't change nearly
- // as much below! Consider rewriting all of this to avoid deleting
- // blocks. They are always cloned before being deleted, and so instead
- // could just be moved.
- // FIXME: This in turn means that we might actually be more able to
- // update the domtree.
- assert((L.contains(ChildN->getBlock()) ||
- llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) &&
- "Should never reach beyond the loop and exits when deleting!");
- DeadBlocks.insert(ChildN->getBlock());
+static void
+deleteDeadBlocksFromLoop(Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI) {
+ // Find all the dead blocks, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ if (!DT.isReachableFromEntry(BB)) {
+ for (BasicBlock *SuccBB : successors(BB))
+ SuccBB->removePredecessor(BB);
+ DeadBlocks.push_back(BB);
}
+ SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(),
+ DeadBlocks.end());
+
// Filter out the dead blocks from the exit blocks list so that it can be
// used in the caller.
llvm::erase_if(ExitBlocks,
- [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
-
- // Remove these blocks from their successors.
- for (auto *BB : DeadBlocks)
- for (BasicBlock *SuccBB : successors(BB))
- SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
// Walk from this loop up through its parents removing all of the dead blocks.
for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
for (auto *BB : DeadBlocks)
ParentL->getBlocksSet().erase(BB);
llvm::erase_if(ParentL->getBlocksVector(),
- [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
}
// Now delete the dead child loops. This raw delete will clear them
// recursively.
llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
- if (!DeadBlocks.count(ChildL->getHeader()))
+ if (!DeadBlockSet.count(ChildL->getHeader()))
return false;
assert(llvm::all_of(ChildL->blocks(),
[&](BasicBlock *ChildBB) {
- return DeadBlocks.count(ChildBB);
+ return DeadBlockSet.count(ChildBB);
}) &&
"If the child loop header is dead all blocks in the child loop must "
"be dead as well!");
@@ -1273,19 +1414,20 @@ static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
return true;
});
- // Remove the mappings for the dead blocks.
- for (auto *BB : DeadBlocks)
+ // Remove the loop mappings for the dead blocks and drop all the references
+ // from these blocks to others to handle cyclic references as we start
+ // deleting the blocks themselves.
+ for (auto *BB : DeadBlocks) {
+ // Check that the dominator tree has already been updated.
+ assert(!DT.getNode(BB) && "Should already have cleared domtree!");
LI.changeLoopFor(BB, nullptr);
-
- // Drop all the references from these blocks to others to handle cyclic
- // references as we start deleting the blocks themselves.
- for (auto *BB : DeadBlocks)
BB->dropAllReferences();
+ }
- for (auto *BB : llvm::reverse(DeadBlocks)) {
- DT.eraseNode(BB);
+ // Actually delete the blocks now that they've been fully unhooked from the
+ // IR.
+ for (auto *BB : DeadBlocks)
BB->eraseFromParent();
- }
}
/// Recompute the set of blocks in a loop after unswitching.
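
The deletion order used in both routines above (unhook the block from its successors, drop every reference it holds, and only then erase it) exists because dead blocks can reference each other in cycles; destroying one while another still uses it would leave dangling uses. A toy version of the same two-phase teardown (hypothetical Node type, not LLVM IR):

#include <memory>
#include <vector>

struct Node {
  std::vector<Node *> Succs;                  // raw edges; may form cycles
  void dropAllReferences() { Succs.clear(); } // sever outgoing edges
};

// Destroy a set of dead nodes that may point at one another: first break all
// references between them, then release the nodes themselves.
void deleteDeadNodes(std::vector<std::unique_ptr<Node>> &Dead) {
  for (auto &N : Dead)
    N->dropAllReferences(); // phase 1: no dead node references another
  Dead.clear();             // phase 2: actually destroy them
}
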
@@ -1333,14 +1475,15 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
if (LoopBlockSet.empty())
return LoopBlockSet;
- // Add the loop header to the set.
- LoopBlockSet.insert(Header);
-
// We found backedges, recurse through them to identify the loop blocks.
while (!Worklist.empty()) {
BasicBlock *BB = Worklist.pop_back_val();
assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+ // No need to walk past the header.
+ if (BB == Header)
+ continue;
+
// Because we know the inner loop structure remains valid we can use the
// loop structure to jump immediately across the entire nested loop.
// Further, because it is in loop simplified form, we can directly jump
@@ -1361,9 +1504,10 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
continue;
// Insert all of the blocks (other than those already present) into
- // the loop set. The only block we expect to already be in the set is
- // the one we used to find this loop as we immediately handle the
- // others the first time we encounter the loop.
+ // the loop set. We expect at least the block that led us to find the
+ // inner loop to be in the block set, but we may also have other loop
+ // blocks if they were already enqueued as predecessors of some other
+ // outer loop block.
for (auto *InnerBB : InnerL->blocks()) {
if (InnerBB == BB) {
assert(LoopBlockSet.count(InnerBB) &&
@@ -1371,9 +1515,7 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
continue;
}
- bool Inserted = LoopBlockSet.insert(InnerBB).second;
- (void)Inserted;
- assert(Inserted && "Should only insert an inner loop once!");
+ LoopBlockSet.insert(InnerBB);
}
// Add the preheader to the worklist so we will continue past the
@@ -1389,6 +1531,8 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
Worklist.push_back(Pred);
}
+ assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
+
// We've found all the blocks participating in the loop, return our completed
// set.
return LoopBlockSet;
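
The worklist above recomputes a natural loop body in the textbook way: start from the latch blocks, walk predecessor edges backwards, and never walk past the header. A standalone sketch of that traversal over a toy CFG (plain containers instead of LLVM's block and loop types):

#include <unordered_set>
#include <vector>

struct Block {
  std::vector<Block *> Preds;
};

// Collect every block that can reach one of the latches without passing
// through the header, i.e. the body of the natural loop defined by the
// backedges Latch -> Header.
std::unordered_set<Block *> collectLoopBlocks(Block *Header,
                                              const std::vector<Block *> &Latches) {
  std::unordered_set<Block *> LoopBlocks;
  std::vector<Block *> Worklist;
  for (Block *Latch : Latches)
    if (LoopBlocks.insert(Latch).second)
      Worklist.push_back(Latch);
  while (!Worklist.empty()) {
    Block *BB = Worklist.back();
    Worklist.pop_back();
    if (BB == Header)
      continue; // the header bounds the walk; don't look at its predecessors
    for (Block *Pred : BB->Preds)
      if (LoopBlocks.insert(Pred).second)
        Worklist.push_back(Pred);
  }
  // The header dominates every latch, so the backwards walk always reaches it
  // (mirroring the assert in the hunk above).
  return LoopBlocks;
}
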
@@ -1636,32 +1780,58 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
} while (!DomWorklist.empty());
}
-/// Take an invariant branch that has been determined to be safe and worthwhile
-/// to unswitch despite being non-trivial to do so and perform the unswitch.
-///
-/// This directly updates the CFG to hoist the predicate out of the loop, and
-/// clone the necessary parts of the loop to maintain behavior.
-///
-/// It also updates both dominator tree and loopinfo based on the unswitching.
-///
-/// Once unswitching has been performed it runs the provided callback to report
-/// the new loops and no-longer valid loops to the caller.
-static bool unswitchInvariantBranch(
- Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC,
- function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
- assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- assert(L.isLoopInvariant(BI.getCondition()) &&
- "Can only unswitch an invariant branch condition!");
+static bool unswitchNontrivialInvariants(
+ Loop &L, TerminatorInst &TI, ArrayRef<Value *> Invariants,
+ DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ auto *ParentBB = TI.getParent();
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
+
+ // We can only unswitch switches, conditional branches with an invariant
+ // condition, or combining invariant conditions with an instruction.
+ assert((SI || BI->isConditional()) &&
+ "Can only unswitch switches and conditional branch!");
+ bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
+ if (FullUnswitch)
+ assert(Invariants.size() == 1 &&
+ "Cannot have other invariants with full unswitching!");
+ else
+ assert(isa<Instruction>(BI->getCondition()) &&
+ "Partial unswitching requires an instruction as the condition!");
+
+ // Constant and BBs tracking the cloned and continuing successor. When we are
+ // unswitching the entire condition, this can just be trivially chosen to
+ // unswitch towards `true`. However, when we are unswitching a set of
+ // invariants combined with `and` or `or`, the combining operation determines
+ // the best direction to unswitch: we want to unswitch the direction that will
+ // collapse the branch.
+ bool Direction = true;
+ int ClonedSucc = 0;
+ if (!FullUnswitch) {
+ if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) {
+ assert(cast<Instruction>(BI->getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Only `or` and `and` instructions can combine invariants being "
+ "unswitched.");
+ Direction = false;
+ ClonedSucc = 1;
+ }
+ }
- // Constant and BBs tracking the cloned and continuing successor.
- const int ClonedSucc = 0;
- auto *ParentBB = BI.getParent();
- auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc);
- auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc);
+ BasicBlock *RetainedSuccBB =
+ BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
+ SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
+ if (BI)
+ UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
+ else
+ for (auto Case : SI->cases())
+ if (Case.getCaseSuccessor() != RetainedSuccBB)
+ UnswitchedSuccBBs.insert(Case.getCaseSuccessor());
- assert(UnswitchedSuccBB != ContinueSuccBB &&
- "Should not unswitch a branch that always goes to the same place!");
+ assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
+ "Should not unswitch the same successor we are retaining!");
// The branch should be in this exact loop. Any inner loop's invariant branch
// should be handled by unswitching that inner loop. The caller of this
@@ -1680,9 +1850,6 @@ static bool unswitchInvariantBranch(
if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
return false;
- SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
// Compute the parent loop now before we start hacking on things.
Loop *ParentL = L.getParentLoop();
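
The direction logic above can be read off from how `and`/`or` collapse: if any invariant input of an `or` is true, the branch always takes its true (0) successor, so that is the side to clone and unswitch toward; dually, a false invariant input of an `and` forces the false (1) successor. A tiny self-contained sketch of that decision (hypothetical enum, not the LLVM instruction classes):

#include <cassert>
#include <utility>

enum class Combiner { And, Or };

// Returns {direction, cloned successor index}: the constant value to unswitch
// the invariants toward, and the branch successor the cloned loop will then
// take unconditionally.
std::pair<bool, int> chooseUnswitchDirection(Combiner Op) {
  if (Op == Combiner::Or)
    return {true, 0}; // a true invariant forces the true successor
  return {false, 1};  // a false invariant forces the false successor
}

int main() {
  assert(chooseUnswitchDirection(Combiner::Or) == std::make_pair(true, 0));
  assert(chooseUnswitchDirection(Combiner::And) == std::make_pair(false, 1));
}
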
@@ -1701,27 +1868,31 @@ static bool unswitchInvariantBranch(
OuterExitL = NewOuterExitL;
}
- // If the edge we *aren't* cloning in the unswitch (the continuing edge)
- // dominates its target, we can skip cloning the dominated region of the loop
- // and its exits. We compute this as a set of nodes to be skipped.
- SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks;
- if (ContinueSuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB);
- })) {
- visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) {
- SkippedLoopAndExitBlocks.insert(BB);
- return true;
- });
+ // At this point, we're definitely going to unswitch something so invalidate
+ // any cached information in ScalarEvolution for the outer most loop
+ // containing an exit block and all nested loops.
+ if (SE) {
+ if (OuterExitL)
+ SE->forgetLoop(OuterExitL);
+ else
+ SE->forgetTopmostLoop(&L);
}
- // Similarly, if the edge we *are* cloning in the unswitch (the unswitched
- // edge) dominates its target, we will end up with dead nodes in the original
- // loop and its exits that will need to be deleted. Here, we just retain that
- // the property holds and will compute the deleted set later.
- bool DeleteUnswitchedSucc =
- UnswitchedSuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB);
+
+ // If the edge from this terminator to a successor dominates that successor,
+ // store a map from each block in its dominator subtree to it. This lets us
+ // tell when cloning for a particular successor if a block is dominated by
+ // some *other* successor with a single data structure. We use this to
+ // significantly reduce cloning.
+ SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
+ for (auto *SuccBB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
+ }))
+ visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
+ DominatingSucc[BB] = SuccBB;
+ return true;
});
// Split the preheader, so that we know that there is a safe place to insert
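
The predecessor test above is how the pass decides that an edge from the terminator dominates its target: every other predecessor of the successor must itself be dominated by that successor. A standalone sketch of the check, with the dominance oracle passed in (plain C++; in the pass this role is played by `DT.dominates`):

#include <functional>
#include <vector>

struct Block {
  std::vector<Block *> Preds;
};

// The edge ParentBB -> SuccBB dominates SuccBB when every other path into
// SuccBB already goes through SuccBB itself. Blocks in SuccBB's dominator
// subtree then only need to be cloned for SuccBB, not for the other
// successors.
bool edgeDominatesSuccessor(
    const Block *ParentBB, const Block *SuccBB,
    const std::function<bool(const Block *, const Block *)> &Dominates) {
  for (const Block *Pred : SuccBB->Preds)
    if (Pred != ParentBB && !Dominates(SuccBB, Pred))
      return false;
  return true;
}
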
@@ -1732,52 +1903,162 @@ static bool unswitchInvariantBranch(
BasicBlock *SplitBB = L.getLoopPreheader();
BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
- // Keep a mapping for the cloned values.
- ValueToValueMapTy VMap;
+ // Keep track of the dominator tree updates needed.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+
+ // Clone the loop for each unswitched successor.
+ SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+ VMaps.reserve(UnswitchedSuccBBs.size());
+ SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
+ for (auto *SuccBB : UnswitchedSuccBBs) {
+ VMaps.emplace_back(new ValueToValueMapTy());
+ ClonedPHs[SuccBB] = buildClonedLoopBlocks(
+ L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
+ DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI);
+ }
- // Build the cloned blocks from the loop.
- auto *ClonedPH = buildClonedLoopBlocks(
- L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB,
- ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI);
+ // The stitching of the branched code back together depends on whether we're
+ // doing full unswitching or not with the exception that we always want to
+ // nuke the initial terminator placed in the split block.
+ SplitBB->getTerminator()->eraseFromParent();
+ if (FullUnswitch) {
+ // First we need to unhook the successor relationship as we'll be replacing
+ // the terminator with a direct branch. This is much simpler for branches
+ // than switches so we handle those first.
+ if (BI) {
+ // Remove the parent as a predecessor of the unswitched successor.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
+ UnswitchedSuccBB->removePredecessor(ParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
+ } else {
+ // Note that we actually want to remove the parent block as a predecessor
+ // of *every* case successor. The case successor is either unswitched,
+ // completely eliminating an edge from the parent to that successor, or it
+ // is a duplicate edge to the retained successor as the retained successor
+ // is always the default successor and as we'll replace this with a direct
+ // branch we no longer need the duplicate entries in the PHI nodes.
+ assert(SI->getDefaultDest() == RetainedSuccBB &&
+ "Not retaining default successor!");
+ for (auto &Case : SI->cases())
+ Case.getCaseSuccessor()->removePredecessor(
+ ParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
+ }
- // Build the cloned loop structure itself. This may be substantially
- // different from the original structure due to the simplified CFG. This also
- // handles inserting all the cloned blocks into the correct loops.
- SmallVector<Loop *, 4> NonChildClonedLoops;
- Loop *ClonedL =
- buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops);
+ // Now that we've unhooked the successor relationship, splice the terminator
+ // from the original loop to the split.
+ SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
- // Remove the parent as a predecessor of the unswitched successor.
- UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true);
+ // Now wire up the terminator to the preheaders.
+ if (BI) {
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ BI->setSuccessor(ClonedSucc, ClonedPH);
+ BI->setSuccessor(1 - ClonedSucc, LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ } else {
+ assert(SI && "Must either be a branch or switch!");
+
+ // Walk the cases and directly update their successors.
+ SI->setDefaultDest(LoopPH);
+ for (auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == RetainedSuccBB)
+ Case.setSuccessor(LoopPH);
+ else
+ Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back(
+ {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
+ }
- // Now splice the branch from the original loop and use it to select between
- // the two loops.
- SplitBB->getTerminator()->eraseFromParent();
- SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI);
- BI.setSuccessor(ClonedSucc, ClonedPH);
- BI.setSuccessor(1 - ClonedSucc, LoopPH);
+ // Create a new unconditional branch to the continuing block (as opposed to
+ // the one cloned).
+ BranchInst::Create(RetainedSuccBB, ParentBB);
+ } else {
+ assert(BI && "Only branches have partial unswitching.");
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ // When doing a partial unswitch, we have to do a bit more work to build up
+ // the branch in the split block.
+ buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
+ *ClonedPH, *LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ }
- // Create a new unconditional branch to the continuing block (as opposed to
- // the one cloned).
- BranchInst::Create(ContinueSuccBB, ParentBB);
+ // Apply the updates accumulated above to get an up-to-date dominator tree.
+ DT.applyUpdates(DTUpdates);
- // Delete anything that was made dead in the original loop due to
- // unswitching.
- if (DeleteUnswitchedSucc)
- deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI);
+ // Now that we have an accurate dominator tree, first delete the dead cloned
+ // blocks so that we can accurately build any cloned loops. It is important to
+ // not delete the blocks from the original loop yet because we still want to
+ // reference the original loop to understand the cloned loop's structure.
+ deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT);
+
+ // Build the cloned loop structure itself. This may be substantially
+ // different from the original structure due to the simplified CFG. This also
+ // handles inserting all the cloned blocks into the correct loops.
+ SmallVector<Loop *, 4> NonChildClonedLoops;
+ for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
+ buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);
+ // Now that our cloned loops have been built, we can update the original loop.
+ // First we delete the dead blocks from it and then we rebuild the loop
+ // structure taking these deletions into account.
+ deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI);
SmallVector<Loop *, 4> HoistedLoops;
bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
- // This will have completely invalidated the dominator tree. We can't easily
- // bound how much is invalid because in some cases we will refine the
- // predecessor set of exit blocks of the loop which can move large unrelated
- // regions of code into a new subtree.
- //
- // FIXME: Eventually, we should use an incremental update utility that
- // leverages the existing information in the dominator tree (and potentially
- // the nature of the change) to more efficiently update things.
- DT.recalculate(*SplitBB->getParent());
+ // This transformation has a high risk of corrupting the dominator tree, and
+ // the below steps to rebuild loop structures will result in hard to debug
+ // errors in that case so verify that the dominator tree is sane first.
+ // FIXME: Remove this when the bugs stop showing up and rely on existing
+ // verification steps.
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ if (BI) {
+ // If we unswitched a branch which collapses the condition to a known
+ // constant we want to replace all the uses of the invariants within both
+ // the original and cloned blocks. We do this here so that we can use the
+ // now updated dominator tree to identify which side the users are on.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ ConstantInt *UnswitchedReplacement =
+ Direction ? ConstantInt::getTrue(BI->getContext())
+ : ConstantInt::getFalse(BI->getContext());
+ ConstantInt *ContinueReplacement =
+ Direction ? ConstantInt::getFalse(BI->getContext())
+ : ConstantInt::getTrue(BI->getContext());
+ for (Value *Invariant : Invariants)
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
+ UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI)
+ continue;
+
+ // Replace it with the 'continue' side if in the main loop body, and the
+ // unswitched if in the cloned blocks.
+ if (DT.dominates(LoopPH, UserI->getParent()))
+ U->set(ContinueReplacement);
+ else if (DT.dominates(ClonedPH, UserI->getParent()))
+ U->set(UnswitchedReplacement);
+ }
+ }
// We can change which blocks are exit blocks of all the cloned sibling
// loops, the current loop, and any parent loops which shared exit blocks
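
Two details make the use-rewriting loop above safe: the use iterator is advanced past the current use before that use is overwritten (overwriting a use unlinks it from the value's use list), and dominance by the original or cloned preheader decides which constant each user should see. A simplified sketch of the same shape over a toy use list (hypothetical types, not LLVM's Use/Value machinery):

#include <list>

struct User { bool InClonedLoop; };
struct Use { User *U; int Value; };

// Rewrite every use of an invariant to the constant its side of the unswitch
// implies, advancing the iterator before touching the element so the rewrite
// cannot disturb the traversal.
void rewriteInvariantUses(std::list<Use> &Uses, int ContinueVal,
                          int UnswitchedVal) {
  for (auto It = Uses.begin(), End = Uses.end(); It != End;) {
    Use &U = *It++; // grab the use and walk past it before clobbering it
    U.Value = U.U->InClonedLoop ? UnswitchedVal : ContinueVal;
  }
}
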
@@ -1791,57 +2072,50 @@ static bool unswitchInvariantBranch(
// also need to cover any intervening loops. We add all of these loops to
// a list and sort them by loop depth to achieve this without updating
// unnecessary loops.
- auto UpdateLCSSA = [&](Loop &UpdateL) {
+ auto UpdateLoop = [&](Loop &UpdateL) {
#ifndef NDEBUG
- for (Loop *ChildL : UpdateL)
+ UpdateL.verifyLoop();
+ for (Loop *ChildL : UpdateL) {
+ ChildL->verifyLoop();
assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
"Perturbed a child loop's LCSSA form!");
+ }
#endif
+ // First build LCSSA for this loop so that we can preserve it when
+ // forming dedicated exits. We don't want to perturb some other loop's
+ // LCSSA while doing that CFG edit.
formLCSSA(UpdateL, DT, &LI, nullptr);
+
+ // For loops reached by this loop's original exit blocks we may
+ // For loops reached by this loop's original exit blocks we may have
+ // introduced new, non-dedicated exits. At least try to re-form dedicated
+ // exits for these loops. This may fail if they couldn't have dedicated
+ // exits to start with.
+ formDedicatedExitBlocks(&UpdateL, &DT, &LI, /*PreserveLCSSA*/ true);
};
// For non-child cloned loops and hoisted loops, we just need to update LCSSA
// and we can do it in any order as they don't nest relative to each other.
- for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
- UpdateLCSSA(*UpdatedL);
+ //
+ // Also check if any of the loops we have updated have become top-level loops
+ // as that will necessitate widening the outer loop scope.
+ for (Loop *UpdatedL :
+ llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
+ UpdateLoop(*UpdatedL);
+ if (!UpdatedL->getParentLoop())
+ OuterExitL = nullptr;
+ }
+ if (IsStillLoop) {
+ UpdateLoop(L);
+ if (!L.getParentLoop())
+ OuterExitL = nullptr;
+ }
// If the original loop had exit blocks, walk up through the outer most loop
// of those exit blocks to update LCSSA and form updated dedicated exits.
- if (OuterExitL != &L) {
- SmallVector<Loop *, 4> OuterLoops;
- // We start with the cloned loop and the current loop if they are loops and
- // move toward OuterExitL. Also, if either the cloned loop or the current
- // loop have become top level loops we need to walk all the way out.
- if (ClonedL) {
- OuterLoops.push_back(ClonedL);
- if (!ClonedL->getParentLoop())
- OuterExitL = nullptr;
- }
- if (IsStillLoop) {
- OuterLoops.push_back(&L);
- if (!L.getParentLoop())
- OuterExitL = nullptr;
- }
- // Grab all of the enclosing loops now.
+ if (OuterExitL != &L)
for (Loop *OuterL = ParentL; OuterL != OuterExitL;
OuterL = OuterL->getParentLoop())
- OuterLoops.push_back(OuterL);
-
- // Finally, update our list of outer loops. This is nicely ordered to work
- // inside-out.
- for (Loop *OuterL : OuterLoops) {
- // First build LCSSA for this loop so that we can preserve it when
- // forming dedicated exits. We don't want to perturb some other loop's
- // LCSSA while doing that CFG edit.
- UpdateLCSSA(*OuterL);
-
- // For loops reached by this loop's original exit blocks we may
- // introduced new, non-dedicated exits. At least try to re-form dedicated
- // exits for these loops. This may fail if they couldn't have dedicated
- // exits to start with.
- formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
- }
- }
+ UpdateLoop(*OuterL);
#ifndef NDEBUG
// Verify the entire loop structure to catch any incorrect updates before we
@@ -1856,7 +2130,7 @@ static bool unswitchInvariantBranch(
for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
if (UpdatedL->getParentLoop() == ParentL)
SibLoops.push_back(UpdatedL);
- NonTrivialUnswitchCB(IsStillLoop, SibLoops);
+ UnswitchCB(IsStillLoop, SibLoops);
++NumBranches;
return true;
@@ -1895,50 +2169,69 @@ computeDomSubtreeCost(DomTreeNode &N,
return Cost;
}
-/// Unswitch control flow predicated on loop invariant conditions.
-///
-/// This first hoists all branches or switches which are trivial (IE, do not
-/// require duplicating any part of the loop) out of the loop body. It then
-/// looks at other loop invariant control flows and tries to unswitch those as
-/// well by cloning the loop if the result is small enough.
static bool
-unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
- TargetTransformInfo &TTI, bool NonTrivial,
- function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
- assert(L.isRecursivelyLCSSAForm(DT, LI) &&
- "Loops must be in LCSSA form before unswitching.");
- bool Changed = false;
+unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ // Collect all invariant conditions within this loop (as opposed to an inner
+ // loop which would be handled when visiting that inner loop).
+ SmallVector<std::pair<TerminatorInst *, TinyPtrVector<Value *>>, 4>
+ UnswitchCandidates;
+ for (auto *BB : L.blocks()) {
+ if (LI.getLoopFor(BB) != &L)
+ continue;
- // Must be in loop simplified form: we need a preheader and dedicated exits.
- if (!L.isLoopSimplifyForm())
- return false;
+ if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ // We can only consider fully loop-invariant switch conditions as we need
+ // to completely eliminate the switch after unswitching.
+ if (!isa<Constant>(SI->getCondition()) &&
+ L.isLoopInvariant(SI->getCondition()))
+ UnswitchCandidates.push_back({SI, {SI->getCondition()}});
+ continue;
+ }
- // Try trivial unswitch first before loop over other basic blocks in the loop.
- Changed |= unswitchAllTrivialConditions(L, DT, LI);
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
+ BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
- // If we're not doing non-trivial unswitching, we're done. We both accept
- // a parameter but also check a local flag that can be used for testing
- // a debugging.
- if (!NonTrivial && !EnableNonTrivialUnswitch)
- return Changed;
-
- // Collect all remaining invariant branch conditions within this loop (as
- // opposed to an inner loop which would be handled when visiting that inner
- // loop).
- SmallVector<TerminatorInst *, 4> UnswitchCandidates;
- for (auto *BB : L.blocks())
- if (LI.getLoopFor(BB) == &L)
- if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
- if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
- BI->getSuccessor(0) != BI->getSuccessor(1))
- UnswitchCandidates.push_back(BI);
+ if (L.isLoopInvariant(BI->getCondition())) {
+ UnswitchCandidates.push_back({BI, {BI->getCondition()}});
+ continue;
+ }
+
+ Instruction &CondI = *cast<Instruction>(BI->getCondition());
+ if (CondI.getOpcode() != Instruction::And &&
+ CondI.getOpcode() != Instruction::Or)
+ continue;
+
+ TinyPtrVector<Value *> Invariants =
+ collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
+ if (Invariants.empty())
+ continue;
+
+ UnswitchCandidates.push_back({BI, std::move(Invariants)});
+ }
// If we didn't find any candidates, we're done.
if (UnswitchCandidates.empty())
- return Changed;
+ return false;
- DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
- << " non-trivial loop invariant conditions for unswitching.\n");
+ // Check if there are irreducible CFG cycles in this loop. If so, we cannot
+ // easily unswitch non-trivial edges out of the loop. Doing so might turn the
+ // irreducible control flow into reducible control flow and introduce new
+ // loops "out of thin air". If we ever discover important use cases for doing
+ // this, we can add support to loop unswitch, but it is a lot of complexity
+ // for what seems little or no real world benefit.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
+ if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << "Considering " << UnswitchCandidates.size()
+ << " non-trivial loop invariant conditions for unswitching.\n");
// Given that unswitching these terminators will require duplicating parts of
// the loop, so we need to be able to model that cost. Compute the ephemeral
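
For the `and`/`or` candidates collected above, the invariants are gathered by `collectHomogenousInstGraphLoopInvariants` (defined outside this hunk): roughly, the loop-invariant operands reachable through a chain of instructions that all share the branch condition's opcode. A hedged, self-contained sketch of that style of walk over a toy expression tree (not the real routine, which operates on LLVM instructions):

#include <vector>

enum class Op { And, Or, Leaf };

struct Expr {
  Op Kind;
  bool LoopInvariant = false;    // meaningful for leaves
  std::vector<const Expr *> Ops; // operands of And/Or nodes
};

// Collect loop-invariant leaves reachable through nodes that use the same
// combining opcode as the root; any sub-expression with a different opcode is
// treated as opaque and not descended into.
void collectInvariantLeaves(const Expr &E, Op Combiner,
                            std::vector<const Expr *> &Out) {
  if (E.Kind == Op::Leaf) {
    if (E.LoopInvariant)
      Out.push_back(&E);
    return;
  }
  if (E.Kind != Combiner)
    return;
  for (const Expr *Operand : E.Ops)
    collectInvariantLeaves(*Operand, Combiner, Out);
}
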
@@ -1962,10 +2255,10 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
continue;
if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
- return Changed;
+ return false;
if (auto CS = CallSite(&I))
if (CS.isConvergent() || CS.cannotDuplicate())
- return Changed;
+ return false;
Cost += TTI.getUserCost(&I);
}
@@ -1974,7 +2267,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
assert(LoopCost >= 0 && "Must not have negative loop costs!");
BBCostMap[BB] = Cost;
}
- DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
+ LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
// Now we find the best candidate by searching for the one with the following
// properties in order:
@@ -1993,8 +2286,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
// Given a terminator which might be unswitched, computes the non-duplicated
// cost for that terminator.
- auto ComputeUnswitchedCost = [&](TerminatorInst *TI) {
- BasicBlock &BB = *TI->getParent();
+ auto ComputeUnswitchedCost = [&](TerminatorInst &TI, bool FullUnswitch) {
+ BasicBlock &BB = *TI.getParent();
SmallPtrSet<BasicBlock *, 4> Visited;
int Cost = LoopCost;
@@ -2003,6 +2296,26 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
if (!Visited.insert(SuccBB).second)
continue;
+ // If this is a partial unswitch candidate, then it must be a conditional
+ // branch with a condition of either `or` or `and`. In that case, one of
+ // the successors is necessarily duplicated, so don't even try to remove
+ // its cost.
+ if (!FullUnswitch) {
+ auto &BI = cast<BranchInst>(TI);
+ if (cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And) {
+ if (SuccBB == BI.getSuccessor(1))
+ continue;
+ } else {
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Only `and` and `or` conditions can result in a partial "
+ "unswitch!");
+ if (SuccBB == BI.getSuccessor(0))
+ continue;
+ }
+ }
+
// This successor's domtree will not need to be duplicated after
// unswitching if the edge to the successor dominates it (and thus the
// entire tree). This essentially means there is no other path into this
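
Putting the cost rules above together: a candidate's estimate starts at the full loop cost and is discounted by each successor subtree that provably will not be duplicated, except that the retained side of a partial unswitch is always charged. A worked sketch with invented numbers (not the TTI-derived costs the pass actually uses):

#include <cassert>
#include <vector>

struct SuccInfo {
  int SubtreeCost;       // cost of the blocks this successor dominates
  bool EdgeDominates;    // the edge from the terminator dominates the successor
  bool AlwaysDuplicated; // e.g. the retained side of a partial unswitch
};

// Estimated cost of unswitching one candidate terminator.
int unswitchedCost(int LoopCost, const std::vector<SuccInfo> &Succs) {
  int Cost = LoopCost;
  for (const SuccInfo &S : Succs)
    if (S.EdgeDominates && !S.AlwaysDuplicated)
      Cost -= S.SubtreeCost;
  return Cost;
}

int main() {
  // Loop cost 100. The unswitched successor's subtree (30) is not duplicated;
  // the retained successor (20) stays charged for a partial unswitch.
  assert(unswitchedCost(100, {{30, true, false}, {20, true, true}}) == 70);
}
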
@@ -2026,27 +2339,95 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
};
TerminatorInst *BestUnswitchTI = nullptr;
int BestUnswitchCost;
- for (TerminatorInst *CandidateTI : UnswitchCandidates) {
- int CandidateCost = ComputeUnswitchedCost(CandidateTI);
- DEBUG(dbgs() << " Computed cost of " << CandidateCost
- << " for unswitch candidate: " << *CandidateTI << "\n");
+ ArrayRef<Value *> BestUnswitchInvariants;
+ for (auto &TerminatorAndInvariants : UnswitchCandidates) {
+ TerminatorInst &TI = *TerminatorAndInvariants.first;
+ ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ int CandidateCost = ComputeUnswitchedCost(
+ TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
+ Invariants[0] == BI->getCondition()));
+ LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " for unswitch candidate: " << TI << "\n");
if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
- BestUnswitchTI = CandidateTI;
+ BestUnswitchTI = &TI;
BestUnswitchCost = CandidateCost;
+ BestUnswitchInvariants = Invariants;
}
}
- if (BestUnswitchCost < UnswitchThreshold) {
- DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
- << BestUnswitchCost << ") branch: " << *BestUnswitchTI
- << "\n");
- Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT,
- LI, AC, NonTrivialUnswitchCB);
- } else {
- DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost
- << "\n");
+ if (BestUnswitchCost >= UnswitchThreshold) {
+ LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
+ << BestUnswitchCost << "\n");
+ return false;
}
+ LLVM_DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
+ << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
+ << "\n");
+ return unswitchNontrivialInvariants(
+ L, *BestUnswitchTI, BestUnswitchInvariants, DT, LI, AC, UnswitchCB, SE);
+}
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (IE, do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+///
+/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
+/// updated based on the unswitch.
+///
+/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
+/// true, we will attempt to do non-trivial unswitching as well as trivial
+/// unswitching.
+///
+/// The `UnswitchCB` callback provided will be run after unswitching is
+/// complete, with the first parameter set to `true` if the provided loop
+/// remains a loop, and a list of new sibling loops created.
+///
+/// If `SE` is non-null, we will update that analysis based on the unswitching
+/// done.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ bool NonTrivial,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ assert(L.isRecursivelyLCSSAForm(DT, LI) &&
+ "Loops must be in LCSSA form before unswitching.");
+ bool Changed = false;
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitch first before loop over other basic blocks in the loop.
+ if (unswitchAllTrivialConditions(L, DT, LI, SE)) {
+ // If we unswitched successfully we will want to clean up the loop before
+ // processing it further so just mark it as unswitched and return.
+ UnswitchCB(/*CurrentLoopValid*/ true, {});
+ return true;
+ }
+
+ // If we're not doing non-trivial unswitching, we're done. We both accept
+ // a parameter but also check a local flag that can be used for testing
+ // and debugging.
+ if (!NonTrivial && !EnableNonTrivialUnswitch)
+ return false;
+
+ // For non-trivial unswitching, because it often creates new loops, we rely on
+ // the pass manager to iterate on the loops rather than trying to immediately
+ // reach a fixed point. There is no substantial advantage to iterating
+ // internally, and if any of the new loops are simplified enough to contain
+ // trivial unswitching we want to prefer those.
+
+ // Try to unswitch the best invariant condition. We prefer this full unswitch to
+ // a partial unswitch when possible below the threshold.
+ if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE))
+ return true;
+
+ // No other opportunities to unswitch.
return Changed;
}
@@ -2056,16 +2437,18 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
Function &F = *L.getHeader()->getParent();
(void)F;
- DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
+ << "\n");
// Save the current loop name in a variable so that we can report it even
// after it has been deleted.
std::string LoopName = L.getName();
- auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
+ auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
// If we did a non-trivial unswitch, we have added new (cloned) loops.
- U.addSiblingLoops(NewLoops);
+ if (!NewLoops.empty())
+ U.addSiblingLoops(NewLoops);
// If the current loop remains valid, we should revisit it to catch any
// other unswitch opportunities. Otherwise, we need to mark it as deleted.
@@ -2075,15 +2458,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
U.markLoopAsDeleted(L, LoopName);
};
- if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial,
- NonTrivialUnswitchCB))
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
+ &AR.SE))
return PreservedAnalyses::all();
-#ifndef NDEBUG
// Historically this pass has had issues with the dominator tree so verify it
// in asserts builds.
- AR.DT.verifyDomTree();
-#endif
+ assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
return getLoopPassPreservedAnalyses();
}
@@ -2118,15 +2499,19 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
Function &F = *L->getHeader()->getParent();
- DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
+ << "\n");
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
// If we did a non-trivial unswitch, we have added new (cloned) loops.
for (auto *NewL : NewLoops)
LPM.addLoop(*NewL);
@@ -2140,18 +2525,16 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
LPM.markLoopAsDeleted(*L);
};
- bool Changed =
- unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB);
+ bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE);
// If anything was unswitched, also clear any cached information about this
// loop.
LPM.deleteSimpleAnalysisLoop(L);
-#ifndef NDEBUG
// Historically this pass has had issues with the dominator tree so verify it
// in asserts builds.
- DT.verifyDomTree();
-#endif
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 1522170dc3b9..b7b1db76b492 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -39,7 +40,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <utility>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index cfb8a062299f..ca6b93e0b4a9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -114,7 +114,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (isa<LoadInst>(Inst))
+ if (Inst->mayReadFromMemory())
return false;
// We don't want to sink across a critical edge if we don't dominate the
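
The widened check in this hunk applies the same reasoning the comment gives for loads to anything that may read memory: if the target block has other predecessors, a store on one of those paths could change what the sunk read observes. The guard in isolation, as a sketch (hypothetical flags, not the LLVM Instruction API):

// Sinking a memory-reading instruction into a block reached from multiple
// predecessors is unsafe: another path may contain a store to the same
// location.
bool safeToSinkInto(bool InstMayReadFromMemory, bool SuccHasUniquePred) {
  return SuccHasUniquePred || !InstMayReadFromMemory;
}
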
@@ -187,11 +187,9 @@ static bool SinkInstruction(Instruction *Inst,
if (!SuccToSinkTo)
return false;
- DEBUG(dbgs() << "Sink" << *Inst << " (";
- Inst->getParent()->printAsOperand(dbgs(), false);
- dbgs() << " -> ";
- SuccToSinkTo->printAsOperand(dbgs(), false);
- dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << "Sink" << *Inst << " (";
+ Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> ";
+ SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n");
// Move the instruction.
Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
@@ -244,7 +242,7 @@ static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
do {
MadeChange = false;
- DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
// Process all basic blocks.
for (BasicBlock &I : F)
MadeChange |= ProcessBlock(I, DT, LI, AA);
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index 23156d5a4d83..6743e19a7c92 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -64,7 +64,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// block. We should consider using actual post-dominance here in the
// future.
if (UI->getParent() != PhiBB) {
- DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
return false;
}
@@ -75,7 +75,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// probably change this to do at least a limited scan of the intervening
// instructions and allow handling stores in easily proven safe cases.
if (mayBeMemoryDependent(*UI)) {
- DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
return false;
}
@@ -126,8 +126,8 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// If when we directly test whether this is safe it fails, bail.
if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
mayBeMemoryDependent(*OpI)) {
- DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI
- << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
+ << *OpI << "\n");
// Record the stack of instructions which reach this node as unsafe
// so we prune subsequent searches.
UnsafeSet.insert(OpI);
@@ -229,7 +229,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
NonFreeMat |= MatCost != TTI.TCC_Free;
}
if (!NonFreeMat) {
- DEBUG(dbgs() << " Free: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
// No profit in free materialization.
return false;
}
@@ -237,7 +237,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// Now check that the uses of this PHI can actually be speculated,
// otherwise we'll still have to materialize the PHI value.
if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
- DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
return false;
}
@@ -266,7 +266,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// Assume we will commute the constant to the RHS to be canonical.
Idx = 1;
- // Get the intrinsic ID if this user is an instrinsic.
+ // Get the intrinsic ID if this user is an intrinsic.
Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
IID = UserII->getIntrinsicID();
@@ -288,9 +288,13 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// just bail. We're only interested in cases where folding the incoming
// constants is at least break-even on all paths.
if (FoldedCost > MatCost) {
- DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n"
- " Materializing cost: " << MatCost << "\n"
- " Accumulated folded cost: " << FoldedCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
+ << "\n"
+ " Materializing cost: "
+ << MatCost
+ << "\n"
+ " Accumulated folded cost: "
+ << FoldedCost << "\n");
return false;
}
}
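
The rejection above implements a per-constant break-even rule: keep the PHI materialization unless folding the incoming constant into every user costs no more than materializing it once. A toy version of that comparison (invented costs, not the TTI cost model):

#include <cassert>
#include <vector>

// Folding an incoming constant into all of its users is acceptable only when
// the accumulated folded cost does not exceed the one-off materialization
// cost of the PHI value.
bool profitableToFold(int MatCost, const std::vector<int> &PerUserFoldedCost) {
  int FoldedCost = 0;
  for (int C : PerUserFoldedCost)
    FoldedCost += C;
  return FoldedCost <= MatCost;
}

int main() {
  assert(profitableToFold(2, {1, 1}));   // break-even: fold the constant
  assert(!profitableToFold(1, {1, 1}));  // folding costs more: keep the PHI
}
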
@@ -310,8 +314,8 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
"less that its materialized cost, "
"the sum must be as well.");
- DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
- << ": " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+ << ": " << PN << "\n");
CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
return true;
}
@@ -489,9 +493,13 @@ findProfitablePHIs(ArrayRef<PHINode *> PNs,
// and zero out the cost of everything it depends on.
int CostSavings = CostSavingsMap.find(PN)->second;
if (SpecCost > CostSavings) {
- DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n"
- " Cost savings: " << CostSavings << "\n"
- " Speculation cost: " << SpecCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
+ << "\n"
+ " Cost savings: "
+ << CostSavings
+ << "\n"
+ " Speculation cost: "
+ << SpecCost << "\n");
continue;
}
@@ -545,7 +553,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
SmallSetVector<BasicBlock *, 16> &PredSet,
DominatorTree &DT) {
- DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+ LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
NumPHIsSpeculated += SpecPNs.size();
// Split any critical edges so that we have a block to hoist into.
@@ -558,8 +566,8 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
if (NewPredBB) {
++NumEdgesSplit;
- DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+ << "\n");
SpecPreds.push_back(NewPredBB);
} else {
assert(PredBB->getSingleSuccessor() == ParentBB &&
@@ -593,14 +601,15 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
int NumSpecInsts = SpecList.size() * SpecPreds.size();
int NumRedundantInsts = NumSpecInsts - SpecList.size();
- DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, "
- << NumRedundantInsts << " redundancies\n");
+ LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
+ << " speculated instructions, " << NumRedundantInsts
+ << " redundancies\n");
NumSpeculatedInstructions += NumSpecInsts;
NumNewRedundantInstructions += NumRedundantInsts;
// Each predecessor is numbered by its index in `SpecPreds`, so for each
// instruction we speculate, the speculated instruction is stored in that
- // index of the vector asosciated with the original instruction. We also
+ // index of the vector associated with the original instruction. We also
// store the incoming values for each predecessor from any PHIs used.
SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
@@ -716,7 +725,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
/// true when at least some speculation occurs.
static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
DominatorTree &DT, TargetTransformInfo &TTI) {
- DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+ LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
// Savings in cost from speculating around a PHI node.
SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
@@ -745,7 +754,7 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
PNs.end());
// If no PHIs were profitable, skip.
if (PNs.empty()) {
- DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+ LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
return false;
}
@@ -763,13 +772,13 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
// differently.
if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
isa<InvokeInst>(PredBB->getTerminator())) {
- DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
+ << PredBB->getName() << "\n");
return false;
}
}
if (PredSet.size() < 2) {
- DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+ LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
return false;
}
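
The hunks above (and most of the hunks that follow) migrate the old DEBUG macro to LLVM_DEBUG. A minimal sketch of the idiom, assuming only the standard llvm/Support/Debug.h machinery; the pass name "my-pass" and the helper reportSavings are illustrative, not part of the change:

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    #define DEBUG_TYPE "my-pass"

    static void reportSavings(int TotalMatCost, int TotalFoldedCost) {
      // Compiled out in release builds; printed only under
      // `opt -debug` or `opt -debug-only=my-pass` in asserts builds.
      LLVM_DEBUG(dbgs() << "  Cost savings "
                        << (TotalMatCost - TotalFoldedCost) << "\n");
    }
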
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a7c308b59877..f5e1dd6ed850 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -62,7 +62,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
@@ -137,6 +137,7 @@ INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
}
bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
@@ -151,8 +152,8 @@ namespace llvm {
bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
- DEBUG(dbgs() << "Not running SpeculativeExecution because "
- "TTI->hasBranchDivergence() is false.\n");
+ LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
return false;
}
@@ -251,7 +252,7 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
- SmallSet<const Instruction *, 8> NotHoisted;
+ SmallPtrSet<const Instruction *, 8> NotHoisted;
const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
for (Value* V : U->operand_values()) {
if (Instruction *I = dyn_cast<Instruction>(V)) {
@@ -314,6 +315,7 @@ PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
} // namespace llvm
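
The SpeculativeExecution hunks above declare CFG preservation in both pass managers: AU.setPreservesCFG() for the legacy pass and PA.preserveSet<CFGAnalyses>() for the new one. A sketch of the new-pass-manager half, assuming a hypothetical pass (MyPass) that only rewrites instructions and never touches blocks or edges:

    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    struct MyPass : PassInfoMixin<MyPass> {
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
        bool Changed = false;
        // ... per-instruction rewrites only; the CFG is left intact ...
        if (!Changed)
          return PreservedAnalyses::all();
        PreservedAnalyses PA;
        PA.preserveSet<CFGAnalyses>(); // analogue of AU.setPreservesCFG()
        return PA;
      }
    };
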
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index ce40af1223f6..2061db13639a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -61,6 +61,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -80,7 +81,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <limits>
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index b8fb80b6cc26..d650264176aa 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -40,6 +40,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
@@ -55,6 +56,12 @@ static const char *const FlowBlockName = "Flow";
namespace {
+static cl::opt<bool> ForceSkipUniformRegions(
+ "structurizecfg-skip-uniform-regions",
+ cl::Hidden,
+ cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
+ cl::init(false));
+
// Definition of the complex types used in this pass.
using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -120,7 +127,7 @@ public:
bool resultIsRememberedBlock() { return ResultIsRemembered; }
};
-/// @brief Transforms the control flow graph on one single entry/exit region
+/// Transforms the control flow graph on one single entry/exit region
/// at a time.
///
/// After the transform all "If"/"Then"/"Else" style control flow looks like
@@ -176,6 +183,7 @@ class StructurizeCFG : public RegionPass {
Function *Func;
Region *ParentRegion;
+ DivergenceAnalysis *DA;
DominatorTree *DT;
LoopInfo *LI;
@@ -196,6 +204,9 @@ class StructurizeCFG : public RegionPass {
void orderNodes();
+ Loop *getAdjustedLoop(RegionNode *RN);
+ unsigned getAdjustedLoopDepth(RegionNode *RN);
+
void analyzeLoops(RegionNode *N);
Value *invert(Value *Condition);
@@ -242,8 +253,11 @@ class StructurizeCFG : public RegionPass {
public:
static char ID;
- explicit StructurizeCFG(bool SkipUniformRegions = false)
- : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+ explicit StructurizeCFG(bool SkipUniformRegions_ = false)
+ : RegionPass(ID),
+ SkipUniformRegions(SkipUniformRegions_) {
+ if (ForceSkipUniformRegions.getNumOccurrences())
+ SkipUniformRegions = ForceSkipUniformRegions.getValue();
initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
}
@@ -278,7 +292,7 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
false, false)
-/// \brief Initialize the types and constants used in the pass
+/// Initialize the types and constants used in the pass
bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
LLVMContext &Context = R->getEntry()->getContext();
@@ -290,7 +304,27 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
return false;
}
-/// \brief Build up the general order of nodes
+/// Use the exit block to determine the loop if RN is a SubRegion.
+Loop *StructurizeCFG::getAdjustedLoop(RegionNode *RN) {
+ if (RN->isSubRegion()) {
+ Region *SubRegion = RN->getNodeAs<Region>();
+ return LI->getLoopFor(SubRegion->getExit());
+ }
+
+ return LI->getLoopFor(RN->getEntry());
+}
+
+/// Use the exit block to determine the loop depth if RN is a SubRegion.
+unsigned StructurizeCFG::getAdjustedLoopDepth(RegionNode *RN) {
+ if (RN->isSubRegion()) {
+ Region *SubR = RN->getNodeAs<Region>();
+ return LI->getLoopDepth(SubR->getExit());
+ }
+
+ return LI->getLoopDepth(RN->getEntry());
+}
+
+/// Build up the general order of nodes
void StructurizeCFG::orderNodes() {
ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
@@ -299,16 +333,15 @@ void StructurizeCFG::orderNodes() {
// to what we want. The only problem with it is that sometimes backedges
// for outer loops will be visited before backedges for inner loops.
for (RegionNode *RN : RPOT) {
- BasicBlock *BB = RN->getEntry();
- Loop *Loop = LI->getLoopFor(BB);
+ Loop *Loop = getAdjustedLoop(RN);
++LoopBlocks[Loop];
}
unsigned CurrentLoopDepth = 0;
Loop *CurrentLoop = nullptr;
for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
- BasicBlock *BB = (*I)->getEntry();
- unsigned LoopDepth = LI->getLoopDepth(BB);
+ RegionNode *RN = cast<RegionNode>(*I);
+ unsigned LoopDepth = getAdjustedLoopDepth(RN);
if (is_contained(Order, *I))
continue;
@@ -320,15 +353,14 @@ void StructurizeCFG::orderNodes() {
auto LoopI = I;
while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
LoopI++;
- BasicBlock *LoopBB = (*LoopI)->getEntry();
- if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+ if (getAdjustedLoop(cast<RegionNode>(*LoopI)) == CurrentLoop) {
--BlockCount;
Order.push_back(*LoopI);
}
}
}
- CurrentLoop = LI->getLoopFor(BB);
+ CurrentLoop = getAdjustedLoop(RN);
if (CurrentLoop)
LoopBlocks[CurrentLoop]--;
@@ -343,7 +375,7 @@ void StructurizeCFG::orderNodes() {
std::reverse(Order.begin(), Order.end());
}
-/// \brief Determine the end of the loops
+/// Determine the end of the loops
void StructurizeCFG::analyzeLoops(RegionNode *N) {
if (N->isSubRegion()) {
// Test for exit as back edge
@@ -362,15 +394,16 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
}
}
-/// \brief Invert the given condition
+/// Invert the given condition
Value *StructurizeCFG::invert(Value *Condition) {
// First: Check if it's a constant
if (Constant *C = dyn_cast<Constant>(Condition))
return ConstantExpr::getNot(C);
// Second: If the condition is already inverted, return the original value
- if (match(Condition, m_Not(m_Value(Condition))))
- return Condition;
+ Value *NotCondition;
+ if (match(Condition, m_Not(m_Value(NotCondition))))
+ return NotCondition;
if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
// Third: Check all the users for an invert
@@ -394,7 +427,7 @@ Value *StructurizeCFG::invert(Value *Condition) {
llvm_unreachable("Unhandled condition to invert");
}
-/// \brief Build the condition for one edge
+/// Build the condition for one edge
Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
bool Invert) {
Value *Cond = Invert ? BoolFalse : BoolTrue;
@@ -407,7 +440,7 @@ Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
return Cond;
}
-/// \brief Analyze the predecessors of each block and build up predicates
+/// Analyze the predecessors of each block and build up predicates
void StructurizeCFG::gatherPredicates(RegionNode *N) {
RegionInfo *RI = ParentRegion->getRegionInfo();
BasicBlock *BB = N->getEntry();
@@ -465,7 +498,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
}
}
-/// \brief Collect various loop and predicate infos
+/// Collect various loop and predicate infos
void StructurizeCFG::collectInfos() {
// Reset predicate
Predicates.clear();
@@ -478,10 +511,10 @@ void StructurizeCFG::collectInfos() {
Visited.clear();
for (RegionNode *RN : reverse(Order)) {
- DEBUG(dbgs() << "Visiting: "
- << (RN->isSubRegion() ? "SubRegion with entry: " : "")
- << RN->getEntry()->getName() << " Loop Depth: "
- << LI->getLoopDepth(RN->getEntry()) << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << " Loop Depth: "
+ << LI->getLoopDepth(RN->getEntry()) << "\n");
// Analyze all the conditions leading to a node
gatherPredicates(RN);
@@ -494,7 +527,7 @@ void StructurizeCFG::collectInfos() {
}
}
-/// \brief Insert the missing branch conditions
+/// Insert the missing branch conditions
void StructurizeCFG::insertConditions(bool Loops) {
BranchVector &Conds = Loops ? LoopConds : Conditions;
Value *Default = Loops ? BoolTrue : BoolFalse;
@@ -540,7 +573,7 @@ void StructurizeCFG::insertConditions(bool Loops) {
}
}
-/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// Remove all PHI values coming from "From" into "To" and remember
/// them in DeletedPhis
void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
PhiMap &Map = DeletedPhis[To];
@@ -552,7 +585,7 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
}
}
-/// \brief Add a dummy PHI value as soon as we knew the new predecessor
+/// Add a dummy PHI value as soon as we knew the new predecessor
void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
for (PHINode &Phi : To->phis()) {
Value *Undef = UndefValue::get(Phi.getType());
@@ -561,7 +594,7 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
AddedPhis[To].push_back(From);
}
-/// \brief Add the real PHI value as soon as everything is set up
+/// Add the real PHI value as soon as everything is set up
void StructurizeCFG::setPhiValues() {
SSAUpdater Updater;
for (const auto &AddedPhi : AddedPhis) {
@@ -601,7 +634,7 @@ void StructurizeCFG::setPhiValues() {
assert(DeletedPhis.empty());
}
-/// \brief Remove phi values from all successors and then remove the terminator.
+/// Remove phi values from all successors and then remove the terminator.
void StructurizeCFG::killTerminator(BasicBlock *BB) {
TerminatorInst *Term = BB->getTerminator();
if (!Term)
@@ -611,10 +644,12 @@ void StructurizeCFG::killTerminator(BasicBlock *BB) {
SI != SE; ++SI)
delPhiValues(BB, *SI);
+ if (DA)
+ DA->removeValue(Term);
Term->eraseFromParent();
}
-/// \brief Let node exit(s) point to NewExit
+/// Let node exit(s) point to NewExit
void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
bool IncludeDominator) {
if (Node->isSubRegion()) {
@@ -660,7 +695,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
}
}
-/// \brief Create a new flow node and update dominator tree and region info
+/// Create a new flow node and update dominator tree and region info
BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
LLVMContext &Context = Func->getContext();
BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
@@ -672,7 +707,7 @@ BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
return Flow;
}
-/// \brief Create a new or reuse the previous node as flow node
+/// Create a new or reuse the previous node as flow node
BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
BasicBlock *Entry = PrevNode->getEntry();
@@ -691,7 +726,7 @@ BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
return Flow;
}
-/// \brief Returns the region exit if possible, otherwise just a new flow node
+/// Returns the region exit if possible, otherwise just a new flow node
BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
bool ExitUseAllowed) {
if (!Order.empty() || !ExitUseAllowed)
@@ -703,13 +738,13 @@ BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
return Exit;
}
-/// \brief Set the previous node
+/// Set the previous node
void StructurizeCFG::setPrevNode(BasicBlock *BB) {
PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
: nullptr;
}
-/// \brief Does BB dominate all the predicates of Node?
+/// Does BB dominate all the predicates of Node?
bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
@@ -717,7 +752,7 @@ bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
});
}
-/// \brief Can we predict that this node will always be called?
+/// Can we predict that this node will always be called?
bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
bool Dominated = false;
@@ -845,7 +880,7 @@ void StructurizeCFG::createFlow() {
}
/// Handle a rare case where the disintegrated nodes instructions
-/// no longer dominate all their uses. Not sure if this is really nessasary
+/// no longer dominate all their uses. Not sure if this is really necessary
void StructurizeCFG::rebuildSSA() {
SSAUpdater Updater;
for (BasicBlock *BB : ParentRegion->blocks())
@@ -878,30 +913,60 @@ void StructurizeCFG::rebuildSSA() {
}
}
-static bool hasOnlyUniformBranches(const Region *R,
+static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
const DivergenceAnalysis &DA) {
- for (const BasicBlock *BB : R->blocks()) {
- const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br || !Br->isConditional())
- continue;
+ for (auto E : R->elements()) {
+ if (!E->isSubRegion()) {
+ auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
- if (!DA.isUniform(Br->getCondition()))
- return false;
- DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+ if (!DA.isUniform(Br))
+ return false;
+ LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
+ << " has uniform terminator\n");
+ } else {
+ // Explicitly refuse to treat regions as uniform if they have non-uniform
+ // subregions. We cannot rely on DivergenceAnalysis for branches in
+ // subregions because those branches may have been removed and re-created,
+ // so we look for our metadata instead.
+ //
+ // Warning: It would be nice to treat regions as uniform based only on
+ // their direct child basic blocks' terminators, regardless of whether
+ // subregions are uniform or not. However, this requires a very careful
+ // look at SIAnnotateControlFlow to make sure nothing breaks there.
+ for (auto BB : E->getNodeAs<Region>()->blocks()) {
+ auto Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!Br->getMetadata(UniformMDKindID))
+ return false;
+ }
+ }
}
return true;
}
-/// \brief Run the transformation for each region found
+/// Run the transformation for each region found
bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
if (R->isTopLevelRegion())
return false;
+ DA = nullptr;
+
if (SkipUniformRegions) {
// TODO: We could probably be smarter here with how we handle sub-regions.
- auto &DA = getAnalysis<DivergenceAnalysis>();
- if (hasOnlyUniformBranches(R, DA)) {
- DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
+ // We currently rely on the fact that metadata is set by earlier invocations
+ // of the pass on sub-regions, and that this metadata doesn't get lost --
+ // but we shouldn't rely on metadata for correctness!
+ unsigned UniformMDKindID =
+ R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
+ DA = &getAnalysis<DivergenceAnalysis>();
+
+ if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
+ LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
+ << '\n');
// Mark all direct child block terminators as having been treated as
// uniform. To account for a possible future in which non-uniform
@@ -913,7 +978,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
continue;
if (Instruction *Term = E->getEntry()->getTerminator())
- Term->setMetadata("structurizecfg.uniform", MD);
+ Term->setMetadata(UniformMDKindID, MD);
}
return false;
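
The StructurizeCFG hunks above cache the "structurizecfg.uniform" metadata kind ID instead of looking the string up at every setMetadata call, and consult the same marker when deciding whether a sub-region can be skipped. A sketch of the kind-ID idiom, assuming an empty MDNode as the marker (as the pass itself uses); markUniform and isMarkedUniform are illustrative names:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // KindID is obtained once per context:
    //   unsigned KindID = Ctx.getMDKindID("structurizecfg.uniform");

    static void markUniform(BasicBlock &BB, unsigned KindID) {
      if (Instruction *Term = BB.getTerminator())
        Term->setMetadata(KindID, MDNode::get(BB.getContext(), {}));
    }

    static bool isMarkedUniform(const BasicBlock &BB, unsigned KindID) {
      const Instruction *Term = BB.getTerminator();
      return Term && Term->getMetadata(KindID);
    }
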
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 2a1106b41de2..f8cd6c17a5a6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -87,7 +87,7 @@ STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of return duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");
-/// \brief Scan the specified function for alloca instructions.
+/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
// Because of PR962, we don't TRE dynamic allocas.
@@ -302,7 +302,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
if (Visited[CI->getParent()] != ESCAPED) {
// If the escape point was part way through the block, calls after the
// escape point wouldn't have been put into DeferredTails.
- DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
CI->setTailCall();
Modified = true;
} else {
@@ -699,8 +699,8 @@ static bool foldReturnAndProcessPred(
BranchInst *BI = UncondBranchPreds.pop_back_val();
BasicBlock *Pred = BI->getParent();
if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
- DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
// Cleanup: if all predecessors of BB have been eliminated by
diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
index 0f0668f24db5..e3ef42362223 100644
--- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -69,7 +69,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include <utility>
using namespace llvm;
@@ -114,7 +114,7 @@ static bool shouldHaveDiscriminator(const Instruction *I) {
return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
}
-/// \brief Assign DWARF discriminators.
+/// Assign DWARF discriminators.
///
/// To assign discriminators, we examine the boundaries of every
/// basic block and its successors. Suppose there is a basic block B1
@@ -210,9 +210,9 @@ static bool addDiscriminators(Function &F) {
// it in 1 byte ULEB128 representation.
unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
I.setDebugLoc(DIL->setBaseDiscriminator(Discriminator));
- DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
- << DIL->getColumn() << ":" << Discriminator << " " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " " << I
+ << "\n");
Changed = true;
}
}
diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 8f59913e14bb..516a785dce1e 100644
--- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -36,7 +37,6 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>
@@ -45,16 +45,22 @@
using namespace llvm;
-void llvm::DeleteDeadBlock(BasicBlock *BB) {
+void llvm::DeleteDeadBlock(BasicBlock *BB, DeferredDominance *DDT) {
assert((pred_begin(BB) == pred_end(BB) ||
// Can delete self loop.
BB->getSinglePredecessor() == BB) && "Block is not dead!");
TerminatorInst *BBTerm = BB->getTerminator();
+ std::vector<DominatorTree::UpdateType> Updates;
// Loop through all of our successors and make sure they know that one
// of their predecessors is going away.
- for (BasicBlock *Succ : BBTerm->successors())
+ if (DDT)
+ Updates.reserve(BBTerm->getNumSuccessors());
+ for (BasicBlock *Succ : BBTerm->successors()) {
Succ->removePredecessor(BB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ }
// Zap all the instructions in the block.
while (!BB->empty()) {
@@ -69,8 +75,12 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
BB->getInstList().pop_back();
}
- // Zap the block!
- BB->eraseFromParent();
+ if (DDT) {
+ DDT->applyUpdates(Updates);
+ DDT->deleteBB(BB); // Deferred deletion of BB.
+ } else {
+ BB->eraseFromParent(); // Zap the block!
+ }
}
void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
@@ -107,9 +117,12 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
LoopInfo *LI,
- MemoryDependenceResults *MemDep) {
- // Don't merge away blocks who have their address taken.
- if (BB->hasAddressTaken()) return false;
+ MemoryDependenceResults *MemDep,
+ DeferredDominance *DDT) {
+ assert(!(DT && DDT) && "Cannot call with both DT and DDT.");
+
+ if (BB->hasAddressTaken())
+ return false;
// Can't merge if there are multiple predecessors, or no predecessors.
BasicBlock *PredBB = BB->getUniquePredecessor();
@@ -121,16 +134,9 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
if (PredBB->getTerminator()->isExceptional())
return false;
- succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
- BasicBlock *OnlySucc = BB;
- for (; SI != SE; ++SI)
- if (*SI != OnlySucc) {
- OnlySucc = nullptr; // There are multiple distinct successors!
- break;
- }
-
- // Can't merge if there are multiple successors.
- if (!OnlySucc) return false;
+ // Can't merge if there are multiple distinct successors.
+ if (PredBB->getUniqueSuccessor() != BB)
+ return false;
// Can't merge if there is PHI loop.
for (PHINode &PN : BB->phis())
@@ -139,14 +145,27 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
return false;
// Begin by getting rid of unneeded PHIs.
- SmallVector<Value *, 4> IncomingValues;
+ SmallVector<AssertingVH<Value>, 4> IncomingValues;
if (isa<PHINode>(BB->front())) {
for (PHINode &PN : BB->phis())
- if (PN.getIncomingValue(0) != &PN)
+ if (!isa<PHINode>(PN.getIncomingValue(0)) ||
+ cast<PHINode>(PN.getIncomingValue(0))->getParent() != BB)
IncomingValues.push_back(PN.getIncomingValue(0));
FoldSingleEntryPHINodes(BB, MemDep);
}
+ // Deferred DT update: Collect all the edges that exit BB. These
+ // dominator edges will be redirected from Pred.
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DDT) {
+ Updates.reserve(1 + (2 * succ_size(BB)));
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
+ for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ Updates.push_back({DominatorTree::Delete, BB, *I});
+ Updates.push_back({DominatorTree::Insert, PredBB, *I});
+ }
+ }
+
// Delete the unconditional branch from the predecessor...
PredBB->getInstList().pop_back();
@@ -158,8 +177,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
// Eliminate duplicate dbg.values describing the entry PHI node post-splice.
- for (auto *Incoming : IncomingValues) {
- if (isa<Instruction>(Incoming)) {
+ for (auto Incoming : IncomingValues) {
+ if (isa<Instruction>(*Incoming)) {
SmallVector<DbgValueInst *, 2> DbgValues;
SmallDenseSet<std::pair<DILocalVariable *, DIExpression *>, 2>
DbgValueSet;
@@ -193,7 +212,12 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
if (MemDep)
MemDep->invalidateCachedPredecessors();
- BB->eraseFromParent();
+ if (DDT) {
+ DDT->deleteBB(BB); // Deferred deletion of BB.
+ DDT->applyUpdates(Updates);
+ } else {
+ BB->eraseFromParent(); // Nuke BB.
+ }
return true;
}
@@ -309,13 +333,21 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
DominatorTree *DT, LoopInfo *LI,
bool PreserveLCSSA, bool &HasLoopExit) {
// Update dominator tree if available.
- if (DT)
- DT->splitBlock(NewBB);
+ if (DT) {
+ if (OldBB == DT->getRootNode()->getBlock()) {
+ assert(NewBB == &NewBB->getParent()->getEntryBlock());
+ DT->setNewRoot(NewBB);
+ } else {
+ // Split block expects NewBB to have a non-empty set of predecessors.
+ DT->splitBlock(NewBB);
+ }
+ }
// The rest of the logic is only relevant for updating the loop structures.
if (!LI)
return;
+ assert(DT && "DT should be available to update LoopInfo!");
Loop *L = LI->getLoopFor(OldBB);
// If we need to preserve loop analyses, collect some information about how
@@ -493,7 +525,6 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
// Insert dummy values as the incoming value.
for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
- return NewBB;
}
// Update DominatorTree, LoopInfo, and LCCSA analysis information.
@@ -501,8 +532,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
UpdateAnalysisInformation(BB, NewBB, Preds, DT, LI, PreserveLCSSA,
HasLoopExit);
- // Update the PHI nodes in BB with the values coming from NewBB.
- UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
+ if (!Preds.empty()) {
+ // Update the PHI nodes in BB with the values coming from NewBB.
+ UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
+ }
+
return NewBB;
}
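
The BasicBlockUtils hunks above thread an optional DeferredDominance through DeleteDeadBlock and MergeBlockIntoPredecessor so that dominator updates are batched and block deletion is deferred. A sketch of the same update-vector pattern for a simple edge retarget, assuming DeferredDominance as declared alongside DominatorTree at this revision (later renamed DomTreeUpdater); redirectEdge is a hypothetical caller and PHI maintenance in the successors is elided:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Dominators.h"
    #include <vector>
    using namespace llvm;

    static void redirectEdge(BasicBlock *BB, BasicBlock *OldSucc,
                             BasicBlock *NewSucc, DeferredDominance *DDT) {
      BB->getTerminator()->replaceUsesOfWith(OldSucc, NewSucc);
      if (!DDT)
        return;
      std::vector<DominatorTree::UpdateType> Updates;
      Updates.push_back({DominatorTree::Delete, BB, OldSucc});
      Updates.push_back({DominatorTree::Insert, BB, NewSucc});
      DDT->applyUpdates(Updates); // queued; the tree is fixed up lazily
    }
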
diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 464d1a34f518..3e30c27a9f33 100644
--- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -28,7 +28,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index b60dfb4f3541..5f5c4150d3bb 100644
--- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -105,12 +105,23 @@ static bool setRetNonNull(Function &F) {
return true;
}
+static bool setNonLazyBind(Function &F) {
+ if (F.hasFnAttribute(Attribute::NonLazyBind))
+ return false;
+ F.addFnAttr(Attribute::NonLazyBind);
+ return true;
+}
+
bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
LibFunc TheLibFunc;
if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
return false;
bool Changed = false;
+
+ if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT())
+ Changed |= setNonLazyBind(F);
+
switch (TheLibFunc) {
case LibFunc_strlen:
case LibFunc_wcslen:
@@ -375,6 +386,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_fseek:
case LibFunc_ftell:
case LibFunc_fgetc:
+ case LibFunc_fgetc_unlocked:
case LibFunc_fseeko:
case LibFunc_ftello:
case LibFunc_fileno:
@@ -393,6 +405,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F);
return Changed;
case LibFunc_fputc:
+ case LibFunc_fputc_unlocked:
case LibFunc_fstat:
case LibFunc_frexp:
case LibFunc_frexpf:
@@ -402,21 +415,25 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
return Changed;
case LibFunc_fgets:
+ case LibFunc_fgets_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
case LibFunc_fread:
+ case LibFunc_fread_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 3);
return Changed;
case LibFunc_fwrite:
+ case LibFunc_fwrite_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 3);
// FIXME: readonly #1?
return Changed;
case LibFunc_fputs:
+ case LibFunc_fputs_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
@@ -447,6 +464,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
return Changed;
case LibFunc_gets:
case LibFunc_getchar:
+ case LibFunc_getchar_unlocked:
Changed |= setDoesNotThrow(F);
return Changed;
case LibFunc_getitimer:
@@ -485,6 +503,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
case LibFunc_putc:
+ case LibFunc_putc_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
@@ -505,6 +524,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
case LibFunc_putchar:
+ case LibFunc_putchar_unlocked:
Changed |= setDoesNotThrow(F);
return Changed;
case LibFunc_popen:
@@ -687,9 +707,9 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setRetNonNull(F);
Changed |= setRetDoesNotAlias(F);
return Changed;
- //TODO: add LibFunc entries for:
- //case LibFunc_memset_pattern4:
- //case LibFunc_memset_pattern8:
+ // TODO: add LibFunc entries for:
+ // case LibFunc_memset_pattern4:
+ // case LibFunc_memset_pattern8:
case LibFunc_memset_pattern16:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setDoesNotCapture(F, 0);
@@ -709,6 +729,19 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
}
}
+bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn) {
+ switch (Ty->getTypeID()) {
+ case Type::FloatTyID:
+ return TLI->has(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->has(DoubleFn);
+ default:
+ return TLI->has(LongDoubleFn);
+ }
+}
+
//- Emit LibCalls ------------------------------------------------------------//
Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
@@ -973,6 +1006,24 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
return CI;
}
+Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputc_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F = M->getOrInsertFunction("fputc_unlocked", B.getInt32Ty(),
+ B.getInt32Ty(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction("fputc_unlocked"), *TLI);
+ Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari");
+ CallInst *CI = B.CreateCall(F, {Char, File}, "fputc_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_fputs))
@@ -991,6 +1042,24 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
return CI;
}
+Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputs_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FPutsUnlockedName = TLI->getName(LibFunc_fputs_unlocked);
+ Constant *F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(),
+ B.getInt8PtrTy(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FPutsUnlockedName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_fwrite))
@@ -1013,3 +1082,119 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
CI->setCallingConv(Fn->getCallingConv());
return CI;
}
+
+Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_malloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Value *Malloc = M->getOrInsertFunction("malloc", B.getInt8PtrTy(),
+ DL.getIntPtrType(Context));
+ inferLibFuncAttributes(*M->getFunction("malloc"), *TLI);
+ CallInst *CI = B.CreateCall(Malloc, Num, "malloc");
+
+ if (const Function *F = dyn_cast<Function>(Malloc->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
+ IRBuilder<> &B, const TargetLibraryInfo &TLI) {
+ if (!TLI.has(LibFunc_calloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
+ Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
+ PtrType, PtrType);
+ inferLibFuncAttributes(*M->getFunction("calloc"), TLI);
+ CallInst *CI = B.CreateCall(Calloc, {Num, Size}, "calloc");
+
+ if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
+ IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fwrite_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ StringRef FWriteUnlockedName = TLI->getName(LibFunc_fwrite_unlocked);
+ Constant *F = M->getOrInsertFunction(
+ FWriteUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FWriteUnlockedName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fgetc_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F =
+ M->getOrInsertFunction("fgetc_unlocked", B.getInt32Ty(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction("fgetc_unlocked"), *TLI);
+ CallInst *CI = B.CreateCall(F, File, "fgetc_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File,
+ IRBuilder<> &B, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fgets_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F =
+ M->getOrInsertFunction("fgets_unlocked", B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt32Ty(), File->getType());
+ inferLibFuncAttributes(*M->getFunction("fgets_unlocked"), *TLI);
+ CallInst *CI =
+ B.CreateCall(F, {castToCStr(Str, B), Size, File}, "fgets_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
+ IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fread_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ StringRef FReadUnlockedName = TLI->getName(LibFunc_fread_unlocked);
+ Constant *F = M->getOrInsertFunction(
+ FReadUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FReadUnlockedName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
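
The BuildLibCalls hunks above add emitters for the *_unlocked stdio variants (and mark inferred library functions nonlazybind when the module sets RtLibUseGOT). A sketch of calling one of the new emitters from a transform, assuming the declarations sit in BuildLibCalls.h next to the existing ones; lowerToUnlockedPutC and the surrounding call-site handling are illustrative:

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/BuildLibCalls.h"
    using namespace llvm;

    // Rewrites a putc(C, F) call site to fputc_unlocked(C, F).
    static Value *lowerToUnlockedPutC(CallInst *CI,
                                      const TargetLibraryInfo *TLI) {
      IRBuilder<> B(CI);
      // The emitter itself returns nullptr when fputc_unlocked is unavailable.
      return emitFPutCUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B,
                               TLI);
    }
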
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index f711b192f604..05512a6dff3e 100644
--- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -34,7 +35,6 @@
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
@@ -173,7 +173,7 @@ Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
return isDivisionOp() ? Value.Quotient : Value.Remainder;
}
-/// \brief Check if a value looks like a hash.
+/// Check if a value looks like a hash.
///
/// The routine is expected to detect values computed using the most common hash
/// algorithms. Typically, hash computations end with one of the following
diff --git a/contrib/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 5dc6068d4a0b..4d9c22e57a68 100644
--- a/contrib/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -389,12 +389,14 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
// Inspect the arguments of the call site. If an argument's type doesn't
// match the corresponding formal argument's type in the callee, bitcast it
// to the correct type.
- for (Use &U : CS.args()) {
- unsigned ArgNo = CS.getArgumentNo(&U);
- Type *FormalTy = Callee->getFunctionType()->getParamType(ArgNo);
- Type *ActualTy = U.get()->getType();
+ auto CalleeType = Callee->getFunctionType();
+ auto CalleeParamNum = CalleeType->getNumParams();
+ for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
+ auto *Arg = CS.getArgument(ArgNo);
+ Type *FormalTy = CalleeType->getParamType(ArgNo);
+ Type *ActualTy = Arg->getType();
if (FormalTy != ActualTy) {
- auto *Cast = CastInst::Create(Instruction::BitCast, U.get(), FormalTy, "",
+ auto *Cast = CastInst::Create(Instruction::BitCast, Arg, FormalTy, "",
CS.getInstruction());
CS.setArgument(ArgNo, Cast);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 16af2c7b808b..61448e9acb57 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -31,7 +32,6 @@
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
using namespace llvm;
@@ -43,44 +43,36 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
DebugInfoFinder *DIFinder) {
DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
- if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+ if (BB->hasName())
+ NewBB->setName(BB->getName() + NameSuffix);
bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
Module *TheModule = F ? F->getParent() : nullptr;
// Loop over all instructions, and copy them over.
- for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
- II != IE; ++II) {
-
- if (DIFinder && TheModule) {
- if (auto *DDI = dyn_cast<DbgDeclareInst>(II))
- DIFinder->processDeclare(*TheModule, DDI);
- else if (auto *DVI = dyn_cast<DbgValueInst>(II))
- DIFinder->processValue(*TheModule, DVI);
+ for (const Instruction &I : *BB) {
+ if (DIFinder && TheModule)
+ DIFinder->processInstruction(*TheModule, I);
- if (auto DbgLoc = II->getDebugLoc())
- DIFinder->processLocation(*TheModule, DbgLoc.get());
- }
-
- Instruction *NewInst = II->clone();
- if (II->hasName())
- NewInst->setName(II->getName()+NameSuffix);
+ Instruction *NewInst = I.clone();
+ if (I.hasName())
+ NewInst->setName(I.getName() + NameSuffix);
NewBB->getInstList().push_back(NewInst);
- VMap[&*II] = NewInst; // Add instruction map to value.
+ VMap[&I] = NewInst; // Add instruction map to value.
- hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
if (isa<ConstantInt>(AI->getArraySize()))
hasStaticAllocas = true;
else
hasDynamicAllocas = true;
}
}
-
+
if (CodeInfo) {
CodeInfo->ContainsCalls |= hasCalls;
CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
- CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
BB != &BB->getParent()->getEntryBlock();
}
return NewBB;
@@ -175,7 +167,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// Create a new basic block and copy instructions into it!
BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
- SP ? &DIFinder : nullptr);
+ ModuleLevelChanges ? &DIFinder : nullptr);
// Add basic block mapping.
VMap[&BB] = CBB;
@@ -197,15 +189,15 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
Returns.push_back(RI);
}
- for (DISubprogram *ISP : DIFinder.subprograms()) {
- if (ISP != SP) {
+ for (DISubprogram *ISP : DIFinder.subprograms())
+ if (ISP != SP)
VMap.MD()[ISP].reset(ISP);
- }
- }
- for (auto *Type : DIFinder.types()) {
+ for (DICompileUnit *CU : DIFinder.compile_units())
+ VMap.MD()[CU].reset(CU);
+
+ for (DIType *Type : DIFinder.types())
VMap.MD()[Type].reset(Type);
- }
// Loop over all of the instructions in the function, fixing up operand
// references as we go. This uses VMap to do all the hard work.
@@ -283,7 +275,7 @@ namespace {
/// The specified block is found to be reachable, clone it and
/// anything that it can reach.
- void CloneBlock(const BasicBlock *BB,
+ void CloneBlock(const BasicBlock *BB,
BasicBlock::const_iterator StartingInst,
std::vector<const BasicBlock*> &ToClone);
};
@@ -546,7 +538,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// phi nodes will have invalid entries. Update the PHI nodes in this
// case.
PHINode *PN = cast<PHINode>(NewBB->begin());
- NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
+ NumPreds = pred_size(NewBB);
if (NumPreds != PN->getNumIncomingValues()) {
assert(NumPreds < PN->getNumIncomingValues());
// Count how many times each predecessor comes to this block.
@@ -718,7 +710,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
}
-/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
+/// Remaps instructions in \p Blocks using the mapping in \p VMap.
void llvm::remapInstructionsInBlocks(
const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
// Rewrite the code to refer to itself.
@@ -728,7 +720,7 @@ void llvm::remapInstructionsInBlocks(
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
}
-/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
/// Blocks.
///
/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
@@ -792,12 +784,13 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
return NewLoop;
}
-/// \brief Duplicate non-Phi instructions from the beginning of block up to
+/// Duplicate non-Phi instructions from the beginning of block up to
/// StopAt instruction into a split block between BB and its predecessor.
BasicBlock *
llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
Instruction *StopAt,
- ValueToValueMapTy &ValueMapping) {
+ ValueToValueMapTy &ValueMapping,
+ DominatorTree *DT) {
// We are going to have to map operands from the original BB block to the new
// copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
// account for entry from PredBB.
@@ -805,13 +798,15 @@ llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
- BasicBlock *NewBB = SplitEdge(PredBB, BB);
+ BasicBlock *NewBB = SplitEdge(PredBB, BB, DT);
NewBB->setName(PredBB->getName() + ".split");
Instruction *NewTerm = NewBB->getTerminator();
// Clone the non-phi instructions of BB into NewBB, keeping track of the
// mapping and using it to remap operands in the cloned instructions.
- for (; StopAt != &*BI; ++BI) {
+ // Stop once we see the terminator too. This covers the case where BB's
+ // terminator gets replaced and StopAt == BB's terminator.
+ for (; StopAt != &*BI && BB->getTerminator() != &*BI; ++BI) {
Instruction *New = BI->clone();
New->setName(BI->getName());
New->insertBefore(NewTerm);
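
The CloneFunction hunks above teach DuplicateInstructionsInSplitBetween to forward a DominatorTree to SplitEdge and to stop cloning at BB's terminator even when StopAt has been replaced. A usage sketch under the assumption that the updated declaration lives in Cloning.h; duplicatePrefix is an illustrative wrapper:

    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/Cloning.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"
    using namespace llvm;

    // Clone the non-PHI instructions of BB that precede StopAt into a block
    // split out of the PredBB -> BB edge, keeping the dominator tree current.
    static BasicBlock *duplicatePrefix(BasicBlock *BB, BasicBlock *PredBB,
                                       Instruction *StopAt,
                                       DominatorTree &DT) {
      ValueToValueMapTy Mapping;
      return DuplicateInstructionsInSplitBetween(BB, PredBB, StopAt, Mapping,
                                                 &DT);
    }
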
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
index 8fee10854229..35c7511a24b9 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -32,33 +32,34 @@ static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
/// copies of global variables and functions, and making their (initializers and
/// references, respectively) refer to the right globals.
///
-std::unique_ptr<Module> llvm::CloneModule(const Module *M) {
+std::unique_ptr<Module> llvm::CloneModule(const Module &M) {
// Create the value map that maps things from the old module over to the new
// module.
ValueToValueMapTy VMap;
return CloneModule(M, VMap);
}
-std::unique_ptr<Module> llvm::CloneModule(const Module *M,
+std::unique_ptr<Module> llvm::CloneModule(const Module &M,
ValueToValueMapTy &VMap) {
return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });
}
std::unique_ptr<Module> llvm::CloneModule(
- const Module *M, ValueToValueMapTy &VMap,
+ const Module &M, ValueToValueMapTy &VMap,
function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
// First off, we need to create the new module.
std::unique_ptr<Module> New =
- llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext());
- New->setDataLayout(M->getDataLayout());
- New->setTargetTriple(M->getTargetTriple());
- New->setModuleInlineAsm(M->getModuleInlineAsm());
-
+ llvm::make_unique<Module>(M.getModuleIdentifier(), M.getContext());
+ New->setSourceFileName(M.getSourceFileName());
+ New->setDataLayout(M.getDataLayout());
+ New->setTargetTriple(M.getTargetTriple());
+ New->setModuleInlineAsm(M.getModuleInlineAsm());
+
// Loop over all of the global variables, making corresponding globals in the
// new module. Here we add them to the VMap and to the new Module. We
// don't worry about attributes or initializers, they will come later.
//
- for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I) {
GlobalVariable *GV = new GlobalVariable(*New,
I->getValueType(),
@@ -72,7 +73,7 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// Loop over the functions in the module, making external functions as before
- for (const Function &I : *M) {
+ for (const Function &I : M) {
Function *NF = Function::Create(cast<FunctionType>(I.getValueType()),
I.getLinkage(), I.getName(), New.get());
NF->copyAttributesFrom(&I);
@@ -80,7 +81,7 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// Loop over the aliases in the module
- for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
I != E; ++I) {
if (!ShouldCloneDefinition(&*I)) {
// An alias cannot act as an external reference, so we need to create
@@ -114,7 +115,7 @@ std::unique_ptr<Module> llvm::CloneModule(
// have been created, loop through and copy the global variable referrers
// over... We also set the attributes on the global now.
//
- for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I) {
if (I->isDeclaration())
continue;
@@ -139,7 +140,7 @@ std::unique_ptr<Module> llvm::CloneModule(
// Similarly, copy over function bodies now...
//
- for (const Function &I : *M) {
+ for (const Function &I : M) {
if (I.isDeclaration())
continue;
@@ -169,7 +170,7 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// And aliases
- for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
I != E; ++I) {
// We already dealt with undefined aliases above.
if (!ShouldCloneDefinition(&*I))
@@ -180,8 +181,9 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// And named metadata....
- for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
- E = M->named_metadata_end(); I != E; ++I) {
+ for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end();
+ I != E; ++I) {
const NamedMDNode &NMD = *I;
NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
@@ -194,7 +196,7 @@ std::unique_ptr<Module> llvm::CloneModule(
extern "C" {
LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
- return wrap(CloneModule(unwrap(M)).release());
+ return wrap(CloneModule(*unwrap(M)).release());
}
}
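
The CloneModule hunks above change the interface from a const Module * to a const Module &, add copying of the source file name, and adjust the C binding accordingly. A sketch of what a caller looks like after the change; copyModule is an illustrative wrapper:

    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/Cloning.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"
    #include <memory>
    using namespace llvm;

    static std::unique_ptr<Module> copyModule(const Module &M) {
      ValueToValueMapTy VMap;
      return CloneModule(M, VMap); // was CloneModule(&M, VMap)
    }
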
diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 7a404241cb14..f31dab9f96af 100644
--- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -66,6 +66,7 @@
#include <vector>
using namespace llvm;
+using ProfileCount = Function::ProfileCount;
#define DEBUG_TYPE "code-extractor"
@@ -77,12 +78,10 @@ static cl::opt<bool>
AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
cl::desc("Aggregate arguments to code-extracted functions"));
-/// \brief Test whether a block is valid for extraction.
-bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,
- bool AllowVarArgs) {
- // Landing pads must be in the function where they were inserted for cleanup.
- if (BB.isEHPad())
- return false;
+/// Test whether a block is valid for extraction.
+static bool isBlockValidForExtraction(const BasicBlock &BB,
+ const SetVector<BasicBlock *> &Result,
+ bool AllowVarArgs, bool AllowAlloca) {
// taking the address of a basic block moved to another function is illegal
if (BB.hasAddressTaken())
return false;
@@ -111,11 +110,63 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,
}
}
- // Don't hoist code containing allocas or invokes. If explicitly requested,
- // allow vastart.
+ // If explicitly requested, allow vastart and alloca. For invoke instructions
+ // verify that extraction is valid.
for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
- if (isa<AllocaInst>(I) || isa<InvokeInst>(I))
- return false;
+ if (isa<AllocaInst>(I)) {
+ if (!AllowAlloca)
+ return false;
+ continue;
+ }
+
+ if (const auto *II = dyn_cast<InvokeInst>(I)) {
+ // Unwind destination (either a landingpad, catchswitch, or cleanuppad)
+ // must be a part of the subgraph which is being extracted.
+ if (auto *UBB = II->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
+ // All catch handlers of a catchswitch instruction as well as the unwind
+ // destination must be in the subgraph.
+ if (const auto *CSI = dyn_cast<CatchSwitchInst>(I)) {
+ if (auto *UBB = CSI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ for (auto *HBB : CSI->handlers())
+ if (!Result.count(const_cast<BasicBlock*>(HBB)))
+ return false;
+ continue;
+ }
+
+ // Make sure that entire catch handler is within subgraph. It is sufficient
+ // to check that catch return's block is in the list.
+ if (const auto *CPI = dyn_cast<CatchPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CatchReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+
+ // And do similar checks for cleanup handler - the entire handler must be
+ // in subgraph which is going to be extracted. For cleanup return should
+ // additionally check that the unwind destination is also in the subgraph.
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(I)) {
+ if (auto *UBB = CRI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
if (const CallInst *CI = dyn_cast<CallInst>(I))
if (const Function *F = CI->getCalledFunction())
if (F->getIntrinsicID() == Intrinsic::vastart) {
@@ -129,10 +180,10 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,
return true;
}
-/// \brief Build a set of blocks to extract if the input blocks are viable.
+/// Build a set of blocks to extract if the input blocks are viable.
static SetVector<BasicBlock *>
buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
- bool AllowVarArgs) {
+ bool AllowVarArgs, bool AllowAlloca) {
assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
SetVector<BasicBlock *> Result;
@@ -145,32 +196,42 @@ buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
if (!Result.insert(BB))
llvm_unreachable("Repeated basic blocks in extraction input");
- if (!CodeExtractor::isBlockValidForExtraction(*BB, AllowVarArgs)) {
- Result.clear();
- return Result;
- }
}
-#ifndef NDEBUG
- for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()),
- E = Result.end();
- I != E; ++I)
- for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I);
- PI != PE; ++PI)
- assert(Result.count(*PI) &&
- "No blocks in this region may have entries from outside the region"
- " except for the first block!");
-#endif
+ for (auto *BB : Result) {
+ if (!isBlockValidForExtraction(*BB, Result, AllowVarArgs, AllowAlloca))
+ return {};
+
+ // Make sure that the first block is not a landing pad.
+ if (BB == Result.front()) {
+ if (BB->isEHPad()) {
+ LLVM_DEBUG(dbgs() << "The first block cannot be an unwind block\n");
+ return {};
+ }
+ continue;
+ }
+
+ // All blocks other than the first must not have predecessors outside of
+ // the subgraph which is being extracted.
+ for (auto *PBB : predecessors(BB))
+ if (!Result.count(PBB)) {
+ LLVM_DEBUG(
+ dbgs() << "No blocks in this region may have entries from "
+ "outside the region except for the first block!\n");
+ return {};
+ }
+ }
return Result;
}
CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
bool AggregateArgs, BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI, bool AllowVarArgs)
+ BranchProbabilityInfo *BPI, bool AllowVarArgs,
+ bool AllowAlloca)
: DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
BPI(BPI), AllowVarArgs(AllowVarArgs),
- Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs)) {}
+ Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)) {}
CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
BlockFrequencyInfo *BFI,
@@ -178,7 +239,8 @@ CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
: DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
BPI(BPI), AllowVarArgs(false),
Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
- /* AllowVarArgs */ false)) {}
+ /* AllowVarArgs */ false,
+ /* AllowAlloca */ false)) {}
/// definedInRegion - Return true if the specified value is defined in the
/// extracted region.
@@ -562,8 +624,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
BasicBlock *newHeader,
Function *oldFunction,
Module *M) {
- DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
- DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
+ LLVM_DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
+ LLVM_DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
// This function returns unsigned, outputs will go back by reference.
switch (NumExitBlocks) {
@@ -577,20 +639,20 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
// Add the types of the input values to the function's argument list
for (Value *value : inputs) {
- DEBUG(dbgs() << "value used in func: " << *value << "\n");
+ LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
paramTy.push_back(value->getType());
}
// Add the types of the output values to the function's argument list.
for (Value *output : outputs) {
- DEBUG(dbgs() << "instr used in func: " << *output << "\n");
+ LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
if (AggregateArgs)
paramTy.push_back(output->getType());
else
paramTy.push_back(PointerType::getUnqual(output->getType()));
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Function type: " << *RetTy << " f(";
for (Type *i : paramTy)
dbgs() << *i << ", ";
@@ -620,16 +682,89 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
if (oldFunction->hasUWTable())
newFunction->setHasUWTable();
- // Inherit all of the target dependent attributes.
+ // Inherit all of the target dependent attributes and white-listed
+ // target independent attributes.
// (e.g. If the extracted region contains a call to an x86.sse
// instruction we need to make sure that the extracted region has the
// "target-features" attribute allowing it to be lowered.
// FIXME: This should be changed to check to see if a specific
// attribute can not be inherited.
- AttrBuilder AB(oldFunction->getAttributes().getFnAttributes());
- for (const auto &Attr : AB.td_attrs())
- newFunction->addFnAttr(Attr.first, Attr.second);
+ for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) {
+ if (Attr.isStringAttribute()) {
+ if (Attr.getKindAsString() == "thunk")
+ continue;
+ } else
+ switch (Attr.getKindAsEnum()) {
+ // Those attributes cannot be propagated safely. Explicitly list them
+ // here so we get a warning if new attributes are added. This list also
+ // includes non-function attributes.
+ case Attribute::Alignment:
+ case Attribute::AllocSize:
+ case Attribute::ArgMemOnly:
+ case Attribute::Builtin:
+ case Attribute::ByVal:
+ case Attribute::Convergent:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::InAlloca:
+ case Attribute::InReg:
+ case Attribute::InaccessibleMemOnly:
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ case Attribute::JumpTable:
+ case Attribute::Naked:
+ case Attribute::Nest:
+ case Attribute::NoAlias:
+ case Attribute::NoBuiltin:
+ case Attribute::NoCapture:
+ case Attribute::NoReturn:
+ case Attribute::None:
+ case Attribute::NonNull:
+ case Attribute::ReadNone:
+ case Attribute::ReadOnly:
+ case Attribute::Returned:
+ case Attribute::ReturnsTwice:
+ case Attribute::SExt:
+ case Attribute::Speculatable:
+ case Attribute::StackAlignment:
+ case Attribute::StructRet:
+ case Attribute::SwiftError:
+ case Attribute::SwiftSelf:
+ case Attribute::WriteOnly:
+ case Attribute::ZExt:
+ case Attribute::EndAttrKinds:
+ continue;
+ // Those attributes should be safe to propagate to the extracted function.
+ case Attribute::AlwaysInline:
+ case Attribute::Cold:
+ case Attribute::NoRecurse:
+ case Attribute::InlineHint:
+ case Attribute::MinSize:
+ case Attribute::NoDuplicate:
+ case Attribute::NoImplicitFloat:
+ case Attribute::NoInline:
+ case Attribute::NonLazyBind:
+ case Attribute::NoRedZone:
+ case Attribute::NoUnwind:
+ case Attribute::OptForFuzzing:
+ case Attribute::OptimizeNone:
+ case Attribute::OptimizeForSize:
+ case Attribute::SafeStack:
+ case Attribute::ShadowCallStack:
+ case Attribute::SanitizeAddress:
+ case Attribute::SanitizeMemory:
+ case Attribute::SanitizeThread:
+ case Attribute::SanitizeHWAddress:
+ case Attribute::StackProtect:
+ case Attribute::StackProtectReq:
+ case Attribute::StackProtectStrong:
+ case Attribute::StrictFP:
+ case Attribute::UWTable:
+ case Attribute::NoCfCheck:
+ break;
+ }
+ newFunction->addFnAttr(Attr);
+ }
newFunction->getBasicBlockList().push_back(newRootNode);
// Create an iterator to name all of the arguments we inserted.
@@ -1093,10 +1228,10 @@ Function *CodeExtractor::extractCodeRegion() {
// Update the entry count of the function.
if (BFI) {
- Optional<uint64_t> EntryCount =
- BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
- if (EntryCount.hasValue())
- newFunction->setEntryCount(EntryCount.getValue());
+ auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
+ if (Count.hasValue())
+ newFunction->setEntryCount(
+ ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME
BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
}
@@ -1104,6 +1239,10 @@ Function *CodeExtractor::extractCodeRegion() {
moveCodeToFunction(newFunction);
+ // Propagate personality info to the new function if there is one.
+ if (oldFunction->hasPersonalityFn())
+ newFunction->setPersonalityFn(oldFunction->getPersonalityFn());
+
// Update the branch weights for the exit block.
if (BFI && NumExitBlocks > 1)
calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
@@ -1139,7 +1278,7 @@ Function *CodeExtractor::extractCodeRegion() {
}
}
- DEBUG(if (verifyFunction(*newFunction))
- report_fatal_error("verifyFunction failed!"));
+ LLVM_DEBUG(if (verifyFunction(*newFunction))
+ report_fatal_error("verifyFunction failed!"));
return newFunction;
}
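For reference, a minimal sketch of how a client pass might drive the extended CodeExtractor interface above; the Region and DT variables and the surrounding pass are hypothetical, only the constructor parameters and extractCodeRegion come from this diff.

  // Outline Region into its own function, now accepting blocks that contain
  // allocas (AllowAlloca = true) while still rejecting vararg handling.
  CodeExtractor CE(Region, &DT, /*AggregateArgs=*/false, /*BFI=*/nullptr,
                   /*BPI=*/nullptr, /*AllowVarArgs=*/false,
                   /*AllowAlloca=*/true);
  if (Function *Outlined = CE.extractCodeRegion())
    Outlined->setName("outlined.region"); // nullptr if the blocks were rejected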
diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
index 82b67c293102..9a0240144d08 100644
--- a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
@@ -138,7 +138,7 @@ bool optimizeGlobalCtorsList(Module &M,
if (!F)
continue;
- DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
+ LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
// We cannot simplify external ctor functions.
if (F->empty())
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
index 6d3d287defdb..56ff03c7f5e1 100644
--- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -9,11 +9,11 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
/// DemoteRegToStack - This function takes a virtual register computed by an
diff --git a/contrib/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/contrib/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 421663f82565..569ea58a3047 100644
--- a/contrib/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -9,14 +9,13 @@
#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
static void insertCall(Function &CurFn, StringRef Func,
@@ -92,17 +91,27 @@ static bool runOnFunction(Function &F, bool PostInlining) {
if (!ExitFunc.empty()) {
for (BasicBlock &BB : F) {
- TerminatorInst *T = BB.getTerminator();
+ Instruction *T = BB.getTerminator();
+ if (!isa<ReturnInst>(T))
+ continue;
+
+ // If T is preceded by a musttail call, that's the real terminator.
+ Instruction *Prev = T->getPrevNode();
+ if (BitCastInst *BCI = dyn_cast_or_null<BitCastInst>(Prev))
+ Prev = BCI->getPrevNode();
+ if (CallInst *CI = dyn_cast_or_null<CallInst>(Prev)) {
+ if (CI->isMustTailCall())
+ T = CI;
+ }
+
DebugLoc DL;
if (DebugLoc TerminatorDL = T->getDebugLoc())
DL = TerminatorDL;
else if (auto SP = F.getSubprogram())
DL = DebugLoc::get(0, 0, SP);
- if (isa<ReturnInst>(T)) {
- insertCall(F, ExitFunc, T, DL);
- Changed = true;
- }
+ insertCall(F, ExitFunc, T, DL);
+ Changed = true;
}
F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
index 78d7474e5b95..c9c96fbe5da0 100644
--- a/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -14,9 +14,9 @@
#include "llvm/Transforms/Utils/EscapeEnumerator.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
static Constant *getDefaultPersonalityFn(Module *M) {
@@ -73,8 +73,8 @@ IRBuilder<> *EscapeEnumerator::Next() {
F.setPersonalityFn(PersFn);
}
- if (isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
- report_fatal_error("Funclet EH not supported");
+ if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
+ report_fatal_error("Scoped EH not supported");
}
LandingPadInst *LPad =
diff --git a/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
index 3c5e299fae98..7fd9425efed3 100644
--- a/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -24,6 +24,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
@@ -174,6 +175,11 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {
return false;
}
+static Constant *getInitializer(Constant *C) {
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
+}
+
/// Return the value that would be computed by a load from P after the stores
/// reflected by 'memory' have been performed. If we can't decide, return null.
Constant *Evaluator::ComputeLoadResult(Constant *P) {
@@ -189,18 +195,96 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) {
return nullptr;
}
- // Handle a constantexpr getelementptr.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- if (GV->hasDefinitiveInitializer())
- return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
+ switch (CE->getOpcode()) {
+ // Handle a constantexpr getelementptr.
+ case Instruction::GetElementPtr:
+ if (auto *I = getInitializer(CE->getOperand(0)))
+ return ConstantFoldLoadThroughGEPConstantExpr(I, CE);
+ break;
+ // Handle a constantexpr bitcast.
+ case Instruction::BitCast:
+ Constant *Val = getVal(CE->getOperand(0));
+ auto MM = MutatedMemory.find(Val);
+ auto *I = (MM != MutatedMemory.end()) ? MM->second
+ : getInitializer(CE->getOperand(0));
+ if (I)
+ return ConstantFoldLoadThroughBitcast(
+ I, P->getType()->getPointerElementType(), DL);
+ break;
}
+ }
return nullptr; // don't know how to evaluate.
}
+static Function *getFunction(Constant *C) {
+ if (auto *Fn = dyn_cast<Function>(C))
+ return Fn;
+
+ if (auto *Alias = dyn_cast<GlobalAlias>(C))
+ if (auto *Fn = dyn_cast<Function>(Alias->getAliasee()))
+ return Fn;
+ return nullptr;
+}
+
+Function *
+Evaluator::getCalleeWithFormalArgs(CallSite &CS,
+ SmallVector<Constant *, 8> &Formals) {
+ auto *V = CS.getCalledValue();
+ if (auto *Fn = getFunction(getVal(V)))
+ return getFormalParams(CS, Fn, Formals) ? Fn : nullptr;
+
+ auto *CE = dyn_cast<ConstantExpr>(V);
+ if (!CE || CE->getOpcode() != Instruction::BitCast ||
+ !getFormalParams(CS, getFunction(CE->getOperand(0)), Formals))
+ return nullptr;
+
+ return dyn_cast<Function>(
+ ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
+}
+
+bool Evaluator::getFormalParams(CallSite &CS, Function *F,
+ SmallVector<Constant *, 8> &Formals) {
+ if (!F)
+ return false;
+
+ auto *FTy = F->getFunctionType();
+ if (FTy->getNumParams() > CS.getNumArgOperands()) {
+ LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
+ return false;
+ }
+
+ auto ArgI = CS.arg_begin();
+ for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE;
+ ++ParI) {
+ auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL);
+ if (!ArgC) {
+ LLVM_DEBUG(dbgs() << "Can not convert function argument.\n");
+ return false;
+ }
+ Formals.push_back(ArgC);
+ ++ArgI;
+ }
+ return true;
+}
+
+/// If the call expression contains a bitcast, we may need to cast the
+/// evaluated return value to the type of the call expression.
+Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
+ if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
+ return RV;
+
+ if (auto *FT =
+ dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
+ RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
+ if (!RV)
+ LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
+ }
+ return RV;
+}
+
/// Evaluate all instructions in block BB, returning true if successful, false
/// if we can't evaluate it. NewBB returns the next BB that control flows into,
/// or null upon return.
@@ -210,22 +294,23 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
while (true) {
Constant *InstResult = nullptr;
- DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+ LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
if (!SI->isSimple()) {
- DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
return false; // no volatile/atomic accesses.
}
Constant *Ptr = getVal(SI->getOperand(1));
if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
- DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
+ LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
Ptr = FoldedPtr;
- DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
}
if (!isSimpleEnoughPointerToCommit(Ptr)) {
// If this is too complex for us to commit, reject it.
- DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
+ LLVM_DEBUG(
+ dbgs() << "Pointer is too complex for us to evaluate store.");
return false;
}
@@ -234,14 +319,15 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If this might be too difficult for the backend to handle (e.g. the addr
// of one global variable divided by another) then we can't commit it.
if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
- DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
- << "\n");
+ LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
+ << *Val << "\n");
return false;
}
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
if (CE->getOpcode() == Instruction::BitCast) {
- DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
+ LLVM_DEBUG(dbgs()
+ << "Attempting to resolve bitcast on constant ptr.\n");
// If we're evaluating a store through a bitcast, then we need
// to pull the bitcast off the pointer type and push it onto the
// stored value.
@@ -252,7 +338,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// In order to push the bitcast onto the stored value, a bitcast
// from NewTy to Val's type must be legal. If it's not, we can try
// introspecting NewTy to find a legal conversion.
- while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) {
+ Constant *NewVal;
+ while (!(NewVal = ConstantFoldLoadThroughBitcast(Val, NewTy, DL))) {
// If NewTy is a struct, we can convert the pointer to the struct
// into a pointer to its first member.
// FIXME: This could be extended to support arrays as well.
@@ -270,17 +357,14 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If we can't improve the situation by introspecting NewTy,
// we have to give up.
} else {
- DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
- "evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+ "evaluate.\n");
return false;
}
}
- // If we found compatible types, go ahead and push the bitcast
- // onto the stored value.
- Val = ConstantExpr::getBitCast(Val, NewTy);
-
- DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
+ Val = NewVal;
+ LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
}
}
@@ -289,37 +373,37 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
InstResult = ConstantExpr::get(BO->getOpcode(),
getVal(BO->getOperand(0)),
getVal(BO->getOperand(1)));
- DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: "
+ << *InstResult << "\n");
} else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
InstResult = ConstantExpr::getCompare(CI->getPredicate(),
getVal(CI->getOperand(0)),
getVal(CI->getOperand(1)));
- DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+ << "\n");
} else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
InstResult = ConstantExpr::getCast(CI->getOpcode(),
getVal(CI->getOperand(0)),
CI->getType());
- DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+ << "\n");
} else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
getVal(SI->getOperand(1)),
getVal(SI->getOperand(2)));
- DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+ << "\n");
} else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
InstResult = ConstantExpr::getExtractValue(
getVal(EVI->getAggregateOperand()), EVI->getIndices());
- DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: "
+ << *InstResult << "\n");
} else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
InstResult = ConstantExpr::getInsertValue(
getVal(IVI->getAggregateOperand()),
getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
- DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: "
+ << *InstResult << "\n");
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
Constant *P = getVal(GEP->getOperand(0));
SmallVector<Constant*, 8> GEPOps;
@@ -329,60 +413,63 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
InstResult =
ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
cast<GEPOperator>(GEP)->isInBounds());
- DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n");
} else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
if (!LI->isSimple()) {
- DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+ LLVM_DEBUG(
+ dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
return false; // no volatile/atomic accesses.
}
Constant *Ptr = getVal(LI->getOperand(0));
if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
Ptr = FoldedPtr;
- DEBUG(dbgs() << "Found a constant pointer expression, constant "
- "folding: " << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant "
+ "folding: "
+ << *Ptr << "\n");
}
InstResult = ComputeLoadResult(Ptr);
if (!InstResult) {
- DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
- "\n");
+ LLVM_DEBUG(
+ dbgs() << "Failed to compute load result. Can not evaluate load."
+ "\n");
return false; // Could not evaluate load.
}
- DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
+ LLVM_DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
} else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
if (AI->isArrayAllocation()) {
- DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
return false; // Cannot handle array allocs.
}
Type *Ty = AI->getAllocatedType();
AllocaTmps.push_back(llvm::make_unique<GlobalVariable>(
Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty),
- AI->getName()));
+ AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal,
+ AI->getType()->getPointerAddressSpace()));
InstResult = AllocaTmps.back().get();
- DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
+ LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
} else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
CallSite CS(&*CurInst);
// Debug info can safely be ignored here.
if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
- DEBUG(dbgs() << "Ignoring debug info.\n");
+ LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
++CurInst;
continue;
}
// Cannot handle inline asm.
if (isa<InlineAsm>(CS.getCalledValue())) {
- DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
return false;
}
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
if (MSI->isVolatile()) {
- DEBUG(dbgs() << "Can not optimize a volatile memset " <<
- "intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset "
+ << "intrinsic.\n");
return false;
}
Constant *Ptr = getVal(MSI->getDest());
@@ -390,7 +477,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Constant *DestVal = ComputeLoadResult(getVal(Ptr));
if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
// This memset is a no-op.
- DEBUG(dbgs() << "Ignoring no-op memset.\n");
+ LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n");
++CurInst;
continue;
}
@@ -398,7 +485,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
II->getIntrinsicID() == Intrinsic::lifetime_end) {
- DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
++CurInst;
continue;
}
@@ -407,7 +494,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// We don't insert an entry into Values, as it doesn't have a
// meaningful return value.
if (!II->use_empty()) {
- DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n");
+ LLVM_DEBUG(dbgs()
+ << "Found unused invariant_start. Can't evaluate.\n");
return false;
}
ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
@@ -419,54 +507,54 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Size->getValue().getLimitedValue() >=
DL.getTypeStoreSize(ElemTy)) {
Invariants.insert(GV);
- DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a global var that is an invariant: "
+ << *GV << "\n");
} else {
- DEBUG(dbgs() << "Found a global var, but can not treat it as an "
- "invariant.\n");
+ LLVM_DEBUG(dbgs()
+ << "Found a global var, but can not treat it as an "
+ "invariant.\n");
}
}
// Continue even if we do nothing.
++CurInst;
continue;
} else if (II->getIntrinsicID() == Intrinsic::assume) {
- DEBUG(dbgs() << "Skipping assume intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Skipping assume intrinsic.\n");
++CurInst;
continue;
} else if (II->getIntrinsicID() == Intrinsic::sideeffect) {
- DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
++CurInst;
continue;
}
- DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
return false;
}
// Resolve function pointers.
- Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
+ SmallVector<Constant *, 8> Formals;
+ Function *Callee = getCalleeWithFormalArgs(CS, Formals);
if (!Callee || Callee->isInterposable()) {
- DEBUG(dbgs() << "Can not resolve function pointer.\n");
+ LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");
return false; // Cannot resolve.
}
- SmallVector<Constant*, 8> Formals;
- for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
- Formals.push_back(getVal(*i));
-
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
if (Constant *C = ConstantFoldCall(CS, Callee, Formals, TLI)) {
- InstResult = C;
- DEBUG(dbgs() << "Constant folded function call. Result: " <<
- *InstResult << "\n");
+ InstResult = castCallResultIfNeeded(CS.getCalledValue(), C);
+ if (!InstResult)
+ return false;
+ LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
+ << *InstResult << "\n");
} else {
- DEBUG(dbgs() << "Can not constant fold function call.\n");
+ LLVM_DEBUG(dbgs() << "Can not constant fold function call.\n");
return false;
}
} else {
if (Callee->getFunctionType()->isVarArg()) {
- DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
+ LLVM_DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
return false;
}
@@ -474,21 +562,24 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// Execute the call, if successful, use the return value.
ValueStack.emplace_back();
if (!EvaluateFunction(Callee, RetVal, Formals)) {
- DEBUG(dbgs() << "Failed to evaluate function.\n");
+ LLVM_DEBUG(dbgs() << "Failed to evaluate function.\n");
return false;
}
ValueStack.pop_back();
- InstResult = RetVal;
+ InstResult = castCallResultIfNeeded(CS.getCalledValue(), RetVal);
+ if (RetVal && !InstResult)
+ return false;
if (InstResult) {
- DEBUG(dbgs() << "Successfully evaluated function. Result: "
- << *InstResult << "\n\n");
+ LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: "
+ << *InstResult << "\n\n");
} else {
- DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
+ LLVM_DEBUG(dbgs()
+ << "Successfully evaluated function. Result: 0\n\n");
}
}
} else if (isa<TerminatorInst>(CurInst)) {
- DEBUG(dbgs() << "Found a terminator instruction.\n");
+ LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");
if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
if (BI->isUnconditional()) {
@@ -515,17 +606,18 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
NextBB = nullptr;
} else {
// invoke, unwind, resume, unreachable.
- DEBUG(dbgs() << "Can not handle terminator.");
+ LLVM_DEBUG(dbgs() << "Can not handle terminator.");
return false; // Cannot handle this terminator.
}
// We succeeded at evaluating this block!
- DEBUG(dbgs() << "Successfully evaluated block.\n");
+ LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n");
return true;
} else {
// Did not know how to evaluate this!
- DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
- "\n");
+ LLVM_DEBUG(
+ dbgs() << "Failed to evaluate block due to unhandled instruction."
+ "\n");
return false;
}
@@ -539,7 +631,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If we just processed an invoke, we finished evaluating the block.
if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
NextBB = II->getNormalDest();
- DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
+ LLVM_DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
return true;
}
@@ -578,7 +670,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
while (true) {
BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
- DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
if (!EvaluateBlock(CurInst, NextBB))
return false;
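As a usage sketch (assumed caller, not part of this diff), the Evaluator is typically driven from GlobalOpt roughly as follows; the Ctor variable and the constructor arguments are assumptions, only the EvaluateFunction interface appears above.

  // Attempt to fold a static constructor at compile time.
  Evaluator Eval(DL, TLI);
  Constant *RetVal = nullptr;
  SmallVector<Constant *, 0> NoArgs; // static constructors take no arguments
  if (Eval.EvaluateFunction(Ctor, RetVal, NoArgs)) {
    // Evaluation succeeded: the stores recorded during evaluation can be
    // committed to the corresponding global initializers.
  }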
diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index 5fdcc6d1d727..3c6c9c9a5df4 100644
--- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,7 +25,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
using namespace llvm;
@@ -36,16 +36,16 @@ namespace {
class FlattenCFGOpt {
AliasAnalysis *AA;
- /// \brief Use parallel-and or parallel-or to generate conditions for
+ /// Use parallel-and or parallel-or to generate conditions for
/// conditional branches.
bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);
- /// \brief If \param BB is the merge block of an if-region, attempt to merge
+ /// If \param BB is the merge block of an if-region, attempt to merge
/// the if-region with an adjacent if-region upstream if two if-regions
/// contain identical instructions.
bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
- /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which
+ /// Compare a pair of blocks: \p Block1 and \p Block2, which
/// are from two if-regions whose entry blocks are \p Head1 and \p
/// Head2. \returns true if \p Block1 and \p Block2 contain identical
/// instructions, and have no memory reference alias with \p Head2.
@@ -312,7 +312,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
new UnreachableInst(CB->getContext(), CB);
} while (Iteration);
- DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
+ LLVM_DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
return true;
}
@@ -469,7 +469,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
// Remove \param SecondEntryBlock
SecondEntryBlock->dropAllReferences();
SecondEntryBlock->eraseFromParent();
- DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
+ LLVM_DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 75539428b688..69203f9f2485 100644
--- a/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -377,7 +376,7 @@ int FunctionComparator::cmpConstants(const Constant *L,
}
}
default: // Unknown constant, abort.
- DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
+ LLVM_DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
llvm_unreachable("Constant ValueID not recognized.");
return -1;
}
@@ -925,7 +924,7 @@ FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
H.add(F.arg_size());
SmallVector<const BasicBlock *, 8> BBs;
- SmallSet<const BasicBlock *, 16> VisitedBBs;
+ SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
// Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
// accumulating the hash of the function "structure." (BB and opcode sequence)
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 6b5f593073b4..479816a339d0 100644
--- a/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -206,15 +206,10 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
// definition.
if (GV.hasName()) {
ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID());
- if (VI) {
- // Need to check all summaries are local in case of hash collisions.
- bool IsLocal = VI.getSummaryList().size() &&
- llvm::all_of(VI.getSummaryList(),
- [](const std::unique_ptr<GlobalValueSummary> &Summary) {
- return Summary->isDSOLocal();
- });
- if (IsLocal)
- GV.setDSOLocal(true);
+ if (VI && VI.isDSOLocal()) {
+ GV.setDSOLocal(true);
+ if (GV.hasDLLImportStorageClass())
+ GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
}
}
diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index 245fefb38ee8..ff6970db47da 100644
--- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -60,7 +60,7 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) {
}
static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
- SmallPtrSetImpl<const PHINode *> &PhiUsers) {
+ SmallPtrSetImpl<const Value *> &VisitedUsers) {
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
if (GV->isExternallyInitialized())
GS.StoredType = GlobalStatus::StoredOnce;
@@ -75,7 +75,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
if (!isa<PointerType>(CE->getType()))
return true;
- if (analyzeGlobalAux(CE, GS, PhiUsers))
+ // FIXME: Do we need to add constexpr selects to VisitedUsers?
+ if (analyzeGlobalAux(CE, GS, VisitedUsers))
return true;
} else if (const Instruction *I = dyn_cast<Instruction>(UR)) {
if (!GS.HasMultipleAccessingFunctions) {
@@ -137,20 +138,18 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
GS.StoredType = GlobalStatus::Stored;
}
}
- } else if (isa<BitCastInst>(I)) {
- if (analyzeGlobalAux(I, GS, PhiUsers))
+ } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+ // Skip over bitcasts and GEPs; we don't care about the type or offset
+ // of the pointer.
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
return true;
- } else if (isa<GetElementPtrInst>(I)) {
- if (analyzeGlobalAux(I, GS, PhiUsers))
- return true;
- } else if (isa<SelectInst>(I)) {
- if (analyzeGlobalAux(I, GS, PhiUsers))
- return true;
- } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
- // PHI nodes we can check just like select or GEP instructions, but we
- // have to be careful about infinite recursion.
- if (PhiUsers.insert(PN).second) // Not already visited.
- if (analyzeGlobalAux(I, GS, PhiUsers))
+ } else if (isa<SelectInst>(I) || isa<PHINode>(I)) {
+ // Look through selects and PHIs to find if the pointer is
+ // conditionally accessed. Make sure we only visit an instruction
+ // once; otherwise, we can get infinite recursion or exponential
+ // compile time.
+ if (VisitedUsers.insert(I).second)
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
return true;
} else if (isa<CmpInst>(I)) {
GS.IsCompared = true;
@@ -191,6 +190,6 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
GlobalStatus::GlobalStatus() = default;
bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
- SmallPtrSet<const PHINode *, 16> PhiUsers;
- return analyzeGlobalAux(V, GS, PhiUsers);
+ SmallPtrSet<const Value *, 16> VisitedUsers;
+ return analyzeGlobalAux(V, GS, VisitedUsers);
}
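For context, a sketch of how this analysis is consumed; the GV variable is hypothetical, while the analyzeGlobal entry point, the VisitedUsers recursion, and the StoredOnce state are shown above.

  // Summarize every use of a global before deciding whether it can be
  // optimized. analyzeGlobal returns true when it hits a use it cannot
  // reason about (the early 'return true' paths in analyzeGlobalAux).
  GlobalStatus GS;
  if (!GlobalStatus::analyzeGlobal(GV, GS) &&
      GS.StoredType == GlobalStatus::StoredOnce) {
    // Safe to consider store-once based transformations here.
  }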
diff --git a/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
index b8c12ad5ea84..8382220fc9e1 100644
--- a/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
@@ -161,7 +161,7 @@ void ImportedFunctionsInliningStatistics::dump(const bool Verbose) {
void ImportedFunctionsInliningStatistics::calculateRealInlines() {
// Removing duplicated Callers.
- std::sort(NonImportedCallers.begin(), NonImportedCallers.end());
+ llvm::sort(NonImportedCallers.begin(), NonImportedCallers.end());
NonImportedCallers.erase(
std::unique(NonImportedCallers.begin(), NonImportedCallers.end()),
NonImportedCallers.end());
@@ -190,13 +190,14 @@ ImportedFunctionsInliningStatistics::getSortedNodes() {
for (const NodesMapTy::value_type& Node : NodesMap)
SortedNodes.push_back(&Node);
- std::sort(
+ llvm::sort(
SortedNodes.begin(), SortedNodes.end(),
[&](const SortedNodesTy::value_type &Lhs,
const SortedNodesTy::value_type &Rhs) {
if (Lhs->second->NumberOfInlines != Rhs->second->NumberOfInlines)
return Lhs->second->NumberOfInlines > Rhs->second->NumberOfInlines;
- if (Lhs->second->NumberOfRealInlines != Rhs->second->NumberOfRealInlines)
+ if (Lhs->second->NumberOfRealInlines !=
+ Rhs->second->NumberOfRealInlines)
return Lhs->second->NumberOfRealInlines >
Rhs->second->NumberOfRealInlines;
return Lhs->first() < Rhs->first();
diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
index fedf6e100d6c..0315aac1cf84 100644
--- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -60,7 +61,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
@@ -72,6 +72,7 @@
#include <vector>
using namespace llvm;
+using ProfileCount = Function::ProfileCount;
static cl::opt<bool>
EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
@@ -1247,7 +1248,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
// Always generate a memcpy of alignment 1 here because we don't know
// the alignment of the src pointer. Other optimizations can infer
// better alignment.
- Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1);
+ Builder.CreateMemCpy(Dst, /*DstAlign*/1, Src, /*SrcAlign*/1, Size);
}
/// When inlining a call site that has a byval argument,
@@ -1431,29 +1432,29 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
/// Update the branch metadata for cloned call instructions.
static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
- const Optional<uint64_t> &CalleeEntryCount,
+ const ProfileCount &CalleeEntryCount,
const Instruction *TheCall,
ProfileSummaryInfo *PSI,
BlockFrequencyInfo *CallerBFI) {
- if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
+ if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
+ CalleeEntryCount.getCount() < 1)
return;
- Optional<uint64_t> CallSiteCount =
- PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
+ auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
uint64_t CallCount =
std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
- CalleeEntryCount.getValue());
+ CalleeEntryCount.getCount());
for (auto const &Entry : VMap)
if (isa<CallInst>(Entry.first))
if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
- CI->updateProfWeight(CallCount, CalleeEntryCount.getValue());
+ CI->updateProfWeight(CallCount, CalleeEntryCount.getCount());
for (BasicBlock &BB : *Callee)
// No need to update the callsite if it is pruned during inlining.
if (VMap.count(&BB))
for (Instruction &I : BB)
if (CallInst *CI = dyn_cast<CallInst>(&I))
- CI->updateProfWeight(CalleeEntryCount.getValue() - CallCount,
- CalleeEntryCount.getValue());
+ CI->updateProfWeight(CalleeEntryCount.getCount() - CallCount,
+ CalleeEntryCount.getCount());
}
/// Update the entry count of callee after inlining.
@@ -1467,18 +1468,19 @@ static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
// callsite is M, the new callee count is set to N - M. M is estimated from
// the caller's entry count, its entry block frequency and the block frequency
// of the callsite.
- Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+ auto CalleeCount = Callee->getEntryCount();
if (!CalleeCount.hasValue() || !PSI)
return;
- Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
+ auto CallCount = PSI->getProfileCount(CallInst, CallerBFI);
if (!CallCount.hasValue())
return;
// Since CallSiteCount is an estimate, it could exceed the original callee
// count and has to be set to 0.
- if (CallCount.getValue() > CalleeCount.getValue())
- Callee->setEntryCount(0);
+ if (CallCount.getValue() > CalleeCount.getCount())
+ CalleeCount.setCount(0);
else
- Callee->setEntryCount(CalleeCount.getValue() - CallCount.getValue());
+ CalleeCount.setCount(CalleeCount.getCount() - CallCount.getValue());
+ Callee->setEntryCount(CalleeCount);
}
/// This function inlines the called function into the basic block of the
@@ -1500,10 +1502,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
IFI.reset();
Function *CalledFunc = CS.getCalledFunction();
- if (!CalledFunc || // Can't inline external function or indirect
- CalledFunc->isDeclaration() ||
- (!ForwardVarArgsTo && CalledFunc->isVarArg())) // call, or call to a vararg function!
- return false;
+ if (!CalledFunc || // Can't inline external function or indirect
+ CalledFunc->isDeclaration()) // call!
+ return false;
// The inliner does not know how to inline through calls with operand bundles
// in general ...
@@ -1568,7 +1569,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
Instruction *CallSiteEHPad = nullptr;
if (CallerPersonality) {
EHPersonality Personality = classifyEHPersonality(CallerPersonality);
- if (isFuncletEHPersonality(Personality)) {
+ if (isScopedEHPersonality(Personality)) {
Optional<OperandBundleUse> ParentFunclet =
CS.getOperandBundle(LLVMContext::OB_funclet);
if (ParentFunclet)
@@ -1630,9 +1631,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
auto &DL = Caller->getParent()->getDataLayout();
- assert((CalledFunc->arg_size() == CS.arg_size() || ForwardVarArgsTo) &&
- "Varargs calls can only be inlined if the Varargs are forwarded!");
-
// Calculate the vector of arguments to pass into the function cloner, which
// matches up the formal to the actual argument values.
CallSite::arg_iterator AI = CS.arg_begin();
@@ -1815,9 +1813,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
}
SmallVector<Value*,4> VarArgsToForward;
+ SmallVector<AttributeSet, 4> VarArgsAttrs;
for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
- i < CS.getNumArgOperands(); i++)
+ i < CS.getNumArgOperands(); i++) {
VarArgsToForward.push_back(CS.getArgOperand(i));
+ VarArgsAttrs.push_back(CS.getAttributes().getParamAttributes(i));
+ }
bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
if (InlinedFunctionInfo.ContainsCalls) {
@@ -1825,6 +1826,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (CallInst *CI = dyn_cast<CallInst>(TheCall))
CallSiteTailKind = CI->getTailCallKind();
+ // For inlining purposes, the "notail" marker is the same as no marker.
+ if (CallSiteTailKind == CallInst::TCK_NoTail)
+ CallSiteTailKind = CallInst::TCK_None;
+
for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
++BB) {
for (auto II = BB->begin(); II != BB->end();) {
@@ -1833,6 +1838,40 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (!CI)
continue;
+ // Forward varargs from inlined call site to calls to the
+ // ForwardVarArgsTo function, if requested, and to musttail calls.
+ if (!VarArgsToForward.empty() &&
+ ((ForwardVarArgsTo &&
+ CI->getCalledFunction() == ForwardVarArgsTo) ||
+ CI->isMustTailCall())) {
+ // Collect attributes for non-vararg parameters.
+ AttributeList Attrs = CI->getAttributes();
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) {
+ for (unsigned ArgNo = 0;
+ ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo)
+ ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+ }
+
+ // Add VarArg attributes.
+ ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end());
+ Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), ArgAttrs);
+ // Add VarArgs to existing parameters.
+ SmallVector<Value *, 6> Params(CI->arg_operands());
+ Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
+ CallInst *NewCI =
+ CallInst::Create(CI->getCalledFunction() ? CI->getCalledFunction()
+ : CI->getCalledValue(),
+ Params, "", CI);
+ NewCI->setDebugLoc(CI->getDebugLoc());
+ NewCI->setAttributes(Attrs);
+ NewCI->setCallingConv(CI->getCallingConv());
+ CI->replaceAllUsesWith(NewCI);
+ CI->eraseFromParent();
+ CI = NewCI;
+ }
+
if (Function *F = CI->getCalledFunction())
InlinedDeoptimizeCalls |=
F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
@@ -1850,6 +1889,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// f -> musttail g -> tail f ==> f -> tail f
// f -> g -> musttail f ==> f -> f
// f -> g -> tail f ==> f -> f
+ //
+ // Inlined notail calls should remain notail calls.
CallInst::TailCallKind ChildTCK = CI->getTailCallKind();
if (ChildTCK != CallInst::TCK_NoTail)
ChildTCK = std::min(CallSiteTailKind, ChildTCK);
@@ -1860,16 +1901,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// 'nounwind'.
if (MarkNoUnwind)
CI->setDoesNotThrow();
-
- if (ForwardVarArgsTo && !VarArgsToForward.empty() &&
- CI->getCalledFunction() == ForwardVarArgsTo) {
- SmallVector<Value*, 6> Params(CI->arg_operands());
- Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
- CallInst *Call = CallInst::Create(CI->getCalledFunction(), Params, "", CI);
- Call->setDebugLoc(CI->getDebugLoc());
- CI->replaceAllUsesWith(Call);
- CI->eraseFromParent();
- }
}
}
}
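A minimal sketch of the entry point this change extends, assuming the overload with a trailing ForwardVarArgsTo parameter; the loop variable I and ForwardTo are hypothetical, while the forwarding of trailing varargs to ForwardVarArgsTo calls and to musttail calls is taken from the diff above.

  // Inline a call to a vararg callee and forward the trailing actual
  // arguments to calls to ForwardTo inside the inlined body (and, per the
  // change above, to any inlined musttail calls as well).
  InlineFunctionInfo IFI;
  if (auto *CI = dyn_cast<CallInst>(&I))
    InlineFunction(CallSite(CI), IFI, /*CalleeAAR=*/nullptr,
                   /*InsertLifetime=*/true, /*ForwardVarArgsTo=*/ForwardTo);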
diff --git a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
index 23ec45edb3ef..003721f2b939 100644
--- a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -17,7 +17,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
index 5a90dcb033b2..3fbb3487884b 100644
--- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
@@ -372,7 +372,7 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
/// information about the operands are known. Implements both 32bit and 64bit
/// scalar division.
///
-/// @brief Replace Rem with generated code.
+/// Replace Rem with generated code.
bool llvm::expandRemainder(BinaryOperator *Rem) {
assert((Rem->getOpcode() == Instruction::SRem ||
Rem->getOpcode() == Instruction::URem) &&
@@ -430,7 +430,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {
/// when more information about the operands are known. Implements both
/// 32bit and 64bit scalar division.
///
-/// @brief Replace Div with generated code.
+/// Replace Div with generated code.
bool llvm::expandDivision(BinaryOperator *Div) {
assert((Div->getOpcode() == Instruction::SDiv ||
Div->getOpcode() == Instruction::UDiv) &&
@@ -482,7 +482,7 @@ bool llvm::expandDivision(BinaryOperator *Div) {
/// that have no or very little support for smaller than 32 bit integer
/// arithmetic.
///
-/// @brief Replace Rem with emulation code.
+/// Replace Rem with emulation code.
bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
assert((Rem->getOpcode() == Instruction::SRem ||
Rem->getOpcode() == Instruction::URem) &&
@@ -531,7 +531,7 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
/// 64 bits. Uses the above routines and extends the inputs/truncates the
/// outputs to operate in 64 bits.
///
-/// @brief Replace Rem with emulation code.
+/// Replace Rem with emulation code.
bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
assert((Rem->getOpcode() == Instruction::SRem ||
Rem->getOpcode() == Instruction::URem) &&
@@ -580,7 +580,7 @@ bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
/// in 32 bits; that is, these routines are good for targets that have no
/// or very little support for smaller than 32 bit integer arithmetic.
///
-/// @brief Replace Div with emulation code.
+/// Replace Div with emulation code.
bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
assert((Div->getOpcode() == Instruction::SDiv ||
Div->getOpcode() == Instruction::UDiv) &&
@@ -628,7 +628,7 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
/// above routines and extends the inputs/truncates the outputs to operate
/// in 64 bits.
///
-/// @brief Replace Div with emulation code.
+/// Replace Div with emulation code.
bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {
assert((Div->getOpcode() == Instruction::SDiv ||
Div->getOpcode() == Instruction::UDiv) &&
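As a usage sketch (assumed caller, not from this diff), these expansion helpers are applied to an existing division instruction and replace it in place:

  // Expand a udiv/sdiv the target cannot lower natively; per the doc comments
  // above, the instruction is replaced with the generated shift/subtract code.
  if (auto *Div = dyn_cast<BinaryOperator>(&I))
    if (Div->getOpcode() == Instruction::UDiv ||
        Div->getOpcode() == Instruction::SDiv)
      expandDivision(Div);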
diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
index ae0e2bb6c280..956d0387c7a8 100644
--- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -36,13 +36,14 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -214,18 +215,27 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
Worklist.push_back(PostProcessPN);
// Keep track of PHI nodes that we want to remove because they did not have
- // any uses rewritten.
+ // any uses rewritten. If the new PHI is used, store it so that we can
+ // try to propagate dbg.value intrinsics to it.
+ SmallVector<PHINode *, 2> NeedDbgValues;
for (PHINode *PN : AddedPHIs)
if (PN->use_empty())
PHIsToRemove.insert(PN);
-
+ else
+ NeedDbgValues.push_back(PN);
+ insertDebugValuesForPHIs(InstBB, NeedDbgValues);
Changed = true;
}
- // Remove PHI nodes that did not have any uses rewritten.
- for (PHINode *PN : PHIsToRemove) {
- assert (PN->use_empty() && "Trying to remove a phi with uses.");
- PN->eraseFromParent();
- }
+ // Remove PHI nodes that did not have any uses rewritten. We need to redo the
+ // use_empty() check here, because even if the PHI node wasn't used when added
+ // to PHIsToRemove, later added PHI nodes can be using it. This cleanup is
+ // not guaranteed to handle trees/cycles of PHI nodes that are only used by
+ // each other. Such situations have only been noticed when the input IR
+ // contains unreachable code, and leaving some extra redundant PHI nodes in
+ // such situations is considered a minor problem.
+ for (PHINode *PN : PHIsToRemove)
+ if (PN->use_empty())
+ PN->eraseFromParent();
return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 42aca757c2af..9832a6f24e1f 100644
--- a/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -79,11 +79,11 @@ public:
bool perform() {
bool Changed = false;
for (auto &CI : WorkList) {
- DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
+ << "\n");
if (perform(CI)) {
Changed = true;
- DEBUG(dbgs() << "Transformed\n");
+ LLVM_DEBUG(dbgs() << "Transformed\n");
}
}
return Changed;
@@ -421,7 +421,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
const LibFunc &Func) {
// FIXME: LibFunc_powf and powl TBD.
if (Func != LibFunc_pow) {
- DEBUG(dbgs() << "Not handled powf() and powl()\n");
+ LLVM_DEBUG(dbgs() << "Not handled powf() and powl()\n");
return nullptr;
}
@@ -433,7 +433,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
double D = CF->getValueAPF().convertToDouble();
if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
- DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
return nullptr;
}
@@ -447,7 +447,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
// If the Base value is coming from an integer type.
Instruction *I = dyn_cast<Instruction>(Base);
if (!I) {
- DEBUG(dbgs() << "Not handled pow(): FP type base\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): FP type base\n");
return nullptr;
}
unsigned Opcode = I->getOpcode();
@@ -461,7 +461,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
else if (BW == 32)
UpperV = 32.0f;
else {
- DEBUG(dbgs() << "Not handled pow(): type too wide\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): type too wide\n");
return nullptr;
}
@@ -477,7 +477,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
return BBBuilder.CreateOr(Cond0, Cond);
}
- DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
return nullptr;
}
@@ -496,9 +496,9 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
SuccBB->setName("cdce.end");
CI->removeFromParent();
CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
- DEBUG(dbgs() << "== Basic Block After ==");
- DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
- << *CallBB->getSingleSuccessor() << "\n");
+ LLVM_DEBUG(dbgs() << "== Basic Block After ==");
+ LLVM_DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
+ << *CallBB->getSingleSuccessor() << "\n");
}
// Perform the transformation to a single candidate.
@@ -529,10 +529,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
bool Changed = CCDCE.perform();
// Verify the dominator after we've updated it locally.
-#ifndef NDEBUG
- if (DT)
- DT->verifyDomTree();
-#endif
+ assert(!DT || DT->verify(DominatorTree::VerificationLevel::Fast));
return Changed;
}
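
For orientation, the LibCallsShrinkWrap hunks above only modernize the debug output and the dominator verification; the transformation itself splits the block and guards the errno-writing libcall with a domain condition, so the call runs only when it can actually fail. A rough standalone C++ sketch of that shape (the bounds and names are illustrative, not the pass's generated IR):

#include <cmath>
#include <cstdio>

// The pow() result is unused; its only observable effect is errno.
// Shrink wrapping guards the call so it executes only when it may fail.
void shrink_wrapped_pow(double Base, double Exp) {
  bool MayFail = (Base <= 1.0) || (Exp > 127.0); // cdce condition (illustrative)
  if (MayFail)
    (void)std::pow(Base, Exp); // cdce.call: keep the errno-setting call
  // cdce.end: in the common case the call is skipped entirely
}

int main() {
  shrink_wrapped_pow(2.0, 10.0);
  std::printf("done\n");
  return 0;
}
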
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
index acccf7abf808..ae3cb077a3af 100644
--- a/contrib/llvm/lib/Transforms/Utils/Local.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -73,6 +73,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <climits>
@@ -100,7 +101,8 @@ STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
/// conditions and indirectbr addresses this might make dead if
/// DeleteDeadConditions is true.
bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ DeferredDominance *DDT) {
TerminatorInst *T = BB->getTerminator();
IRBuilder<> Builder(T);
@@ -123,6 +125,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// Replace the conditional branch with an unconditional one.
Builder.CreateBr(Destination);
BI->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, OldDest);
return true;
}
@@ -193,9 +197,12 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
createBranchWeights(Weights));
}
// Remove this entry.
- DefaultDest->removePredecessor(SI->getParent());
+ BasicBlock *ParentBB = SI->getParent();
+ DefaultDest->removePredecessor(ParentBB);
i = SI->removeCase(i);
e = SI->case_end();
+ if (DDT)
+ DDT->deleteEdge(ParentBB, DefaultDest);
continue;
}
@@ -221,14 +228,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// Insert the new branch.
Builder.CreateBr(TheOnlyDest);
BasicBlock *BB = SI->getParent();
+ std::vector <DominatorTree::UpdateType> Updates;
+ if (DDT)
+ Updates.reserve(SI->getNumSuccessors() - 1);
// Remove entries from PHI nodes which we no longer branch to...
for (BasicBlock *Succ : SI->successors()) {
// Found case matching a constant operand?
- if (Succ == TheOnlyDest)
+ if (Succ == TheOnlyDest) {
TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest
- else
+ } else {
Succ->removePredecessor(BB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ }
}
// Delete the old switch.
@@ -236,6 +249,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
SI->eraseFromParent();
if (DeleteDeadConditions)
RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ if (DDT)
+ DDT->applyUpdates(Updates);
return true;
}
@@ -281,14 +296,23 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
if (auto *BA =
dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
BasicBlock *TheOnlyDest = BA->getBasicBlock();
+ std::vector <DominatorTree::UpdateType> Updates;
+ if (DDT)
+ Updates.reserve(IBI->getNumDestinations() - 1);
+
// Insert the new branch.
Builder.CreateBr(TheOnlyDest);
for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
- if (IBI->getDestination(i) == TheOnlyDest)
+ if (IBI->getDestination(i) == TheOnlyDest) {
TheOnlyDest = nullptr;
- else
- IBI->getDestination(i)->removePredecessor(IBI->getParent());
+ } else {
+ BasicBlock *ParentBB = IBI->getParent();
+ BasicBlock *DestBB = IBI->getDestination(i);
+ DestBB->removePredecessor(ParentBB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, ParentBB, DestBB});
+ }
}
Value *Address = IBI->getAddress();
IBI->eraseFromParent();
@@ -303,6 +327,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
new UnreachableInst(BB->getContext(), BB);
}
+ if (DDT)
+ DDT->applyUpdates(Updates);
return true;
}
}
@@ -346,6 +372,11 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
return false;
return true;
}
+ if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(I)) {
+ if (DLI->getLabel())
+ return false;
+ return true;
+ }
if (!I->mayHaveSideEffects())
return true;
@@ -353,8 +384,9 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
// Special case intrinsics that "may have side effects" but can be deleted
// when dead.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- // Safe to delete llvm.stacksave if dead.
- if (II->getIntrinsicID() == Intrinsic::stacksave)
+ // Safe to delete llvm.stacksave and launder.invariant.group if dead.
+ if (II->getIntrinsicID() == Intrinsic::stacksave ||
+ II->getIntrinsicID() == Intrinsic::launder_invariant_group)
return true;
// Lifetime intrinsics are dead when their right-hand is undef.
@@ -402,17 +434,31 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
SmallVector<Instruction*, 16> DeadInsts;
DeadInsts.push_back(I);
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
- do {
- I = DeadInsts.pop_back_val();
+ return true;
+}
+
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(
+ SmallVectorImpl<Instruction *> &DeadInsts, const TargetLibraryInfo *TLI) {
+ // Process the dead instruction list until empty.
+ while (!DeadInsts.empty()) {
+ Instruction &I = *DeadInsts.pop_back_val();
+ assert(I.use_empty() && "Instructions with uses are not dead.");
+ assert(isInstructionTriviallyDead(&I, TLI) &&
+ "Live instruction found in dead worklist!");
+
+ // Don't lose the debug info while deleting the instructions.
+ salvageDebugInfo(I);
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *OpV = I->getOperand(i);
- I->setOperand(i, nullptr);
+ for (Use &OpU : I.operands()) {
+ Value *OpV = OpU.get();
+ OpU.set(nullptr);
- if (!OpV->use_empty()) continue;
+ if (!OpV->use_empty())
+ continue;
// If the operand is an instruction that became dead as we nulled out the
// operand, and if it is 'trivially' dead, delete it in a future loop
@@ -422,10 +468,8 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
DeadInsts.push_back(OpI);
}
- I->eraseFromParent();
- } while (!DeadInsts.empty());
-
- return true;
+ I.eraseFromParent();
+ }
}
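
The rewritten RecursivelyDeleteTriviallyDeadInstructions above works off an explicit worklist: erase a value, drop the uses it held on its operands, and queue any operand whose use count just reached zero. A self-contained C++ sketch of that pattern with toy types (Val and its use counts are hypothetical, not LLVM classes):

#include <cstdio>
#include <vector>

// Toy value with operands and a use count, standing in for an Instruction.
struct Val {
  std::vector<Val *> Ops;
  int Uses = 0;
  bool Erased = false;
};

static void eraseDeadTransitively(std::vector<Val *> Dead) {
  while (!Dead.empty()) {
    Val *V = Dead.back();
    Dead.pop_back();
    for (Val *Op : V->Ops)
      if (--Op->Uses == 0)
        Dead.push_back(Op); // operand became dead as we dropped the use
    V->Ops.clear();
    V->Erased = true;
  }
}

int main() {
  Val A, B, C; // C uses B, B uses A; only C starts out dead.
  B.Ops = {&A}; A.Uses = 1;
  C.Ops = {&B}; B.Uses = 1;
  eraseDeadTransitively({&C});
  std::printf("A erased: %d, B erased: %d\n", A.Erased, B.Erased);
  return 0;
}
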
/// areAllUsesEqual - Check whether the uses of a value are all the same.
@@ -477,6 +521,8 @@ simplifyAndDCEInstruction(Instruction *I,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
@@ -579,7 +625,8 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
///
/// .. and delete the predecessor corresponding to the '1', this will attempt to
/// recursively fold the and to 0.
-void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {
+void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
+ DeferredDominance *DDT) {
// This only adjusts blocks with PHI nodes.
if (!isa<PHINode>(BB->begin()))
return;
@@ -602,13 +649,18 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {
// of the block.
if (PhiIt != OldPhiIt) PhiIt = &BB->front();
}
+ if (DDT)
+ DDT->deleteEdge(Pred, BB);
}
/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
/// predecessor is known to have one successor (DestBB!). Eliminate the edge
/// between them, moving the instructions in the predecessor into DestBB and
/// deleting the predecessor block.
-void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT,
+ DeferredDominance *DDT) {
+ assert(!(DT && DDT) && "Cannot call with both DT and DDT.");
+
// If BB has single-entry PHI nodes, fold them.
while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
Value *NewVal = PN->getIncomingValue(0);
@@ -621,6 +673,24 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
BasicBlock *PredBB = DestBB->getSinglePredecessor();
assert(PredBB && "Block doesn't have a single predecessor!");
+ bool ReplaceEntryBB = false;
+ if (PredBB == &DestBB->getParent()->getEntryBlock())
+ ReplaceEntryBB = true;
+
+ // Deferred DT update: Collect all the edges that enter PredBB. These
+ // dominator edges will be redirected to DestBB.
+ std::vector <DominatorTree::UpdateType> Updates;
+ if (DDT && !ReplaceEntryBB) {
+ Updates.reserve(1 + (2 * pred_size(PredBB)));
+ Updates.push_back({DominatorTree::Delete, PredBB, DestBB});
+ for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) {
+ Updates.push_back({DominatorTree::Delete, *I, PredBB});
+ // This predecessor of PredBB may already have DestBB as a successor.
+ if (llvm::find(successors(*I), DestBB) == succ_end(*I))
+ Updates.push_back({DominatorTree::Insert, *I, DestBB});
+ }
+ }
+
// Zap anything that took the address of DestBB. Not doing this will give the
// address an invalid value.
if (DestBB->hasAddressTaken()) {
@@ -641,7 +711,7 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
// If the PredBB is the entry block of the function, move DestBB up to
// become the entry block after we erase PredBB.
- if (PredBB == &DestBB->getParent()->getEntryBlock())
+ if (ReplaceEntryBB)
DestBB->moveAfter(PredBB);
if (DT) {
@@ -653,8 +723,19 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
DT->eraseNode(PredBB);
}
}
- // Nuke BB.
- PredBB->eraseFromParent();
+
+ if (DDT) {
+ DDT->deleteBB(PredBB); // Deferred deletion of BB.
+ if (ReplaceEntryBB)
+ // The entry block was removed and there is no external interface for the
+ // dominator tree to be notified of this change. In this corner-case we
+ // recalculate the entire tree.
+ DDT->recalculate(*(DestBB->getParent()));
+ else
+ DDT->applyUpdates(Updates);
+ } else {
+ PredBB->eraseFromParent(); // Nuke BB.
+ }
}
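
The DeferredDominance plumbing added above follows one pattern throughout this file: while the CFG is being edited, record {Insert|Delete, From, To} pairs, then hand the whole batch to the analysis afterwards. A minimal standalone C++ sketch of that idea, operating on a plain edge set rather than a real dominator tree (all names here are hypothetical):

#include <cstdio>
#include <set>
#include <string>
#include <utility>
#include <vector>

enum class Kind { Insert, Delete };
struct Update { Kind K; std::string From, To; };

struct EdgeSet {
  std::set<std::pair<std::string, std::string>> Edges;
  void applyUpdates(const std::vector<Update> &Us) {
    for (const Update &U : Us) {
      if (U.K == Kind::Insert)
        Edges.insert({U.From, U.To});
      else
        Edges.erase({U.From, U.To});
    }
  }
};

int main() {
  EdgeSet CFG;
  CFG.Edges = {{"pred", "PredBB"}, {"PredBB", "DestBB"}};
  std::vector<Update> Updates;
  // Mirror the merge above: PredBB disappears, its predecessor now reaches DestBB.
  Updates.push_back({Kind::Delete, "PredBB", "DestBB"});
  Updates.push_back({Kind::Delete, "pred", "PredBB"});
  Updates.push_back({Kind::Insert, "pred", "DestBB"});
  CFG.applyUpdates(Updates);
  std::printf("edges after merge: %zu\n", CFG.Edges.size());
  return 0;
}

Batching also explains the duplicate-edge check in the hunk: an Insert is only queued when the predecessor does not already reach DestBB.
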
/// CanMergeValues - Return true if we can choose one of these values to use
@@ -671,8 +752,8 @@ static bool CanMergeValues(Value *First, Value *Second) {
static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
- DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
- << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
+ << Succ->getName() << "\n");
// Shortcut, if there is only a single predecessor it must be BB and merging
// is always safe
if (Succ->getSinglePredecessor()) return true;
@@ -695,10 +776,11 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
if (BBPreds.count(IBB) &&
!CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
PN->getIncomingValue(PI))) {
- DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with "
- << BBPN->getName() << " with regard to common predecessor "
- << IBB->getName() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with "
+ << BBPN->getName() << " with regard to common predecessor "
+ << IBB->getName() << "\n");
return false;
}
}
@@ -711,9 +793,10 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
BasicBlock *IBB = PN->getIncomingBlock(PI);
if (BBPreds.count(IBB) &&
!CanMergeValues(Val, PN->getIncomingValue(PI))) {
- DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with regard to common "
- << "predecessor " << IBB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Can't fold, phi node " << PN->getName()
+ << " in " << Succ->getName()
+ << " is conflicting with regard to common "
+ << "predecessor " << IBB->getName() << "\n");
return false;
}
}
@@ -726,7 +809,7 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
using PredBlockVector = SmallVector<BasicBlock *, 16>;
using IncomingValueMap = DenseMap<BasicBlock *, Value *>;
-/// \brief Determines the value to use as the phi node input for a block.
+/// Determines the value to use as the phi node input for a block.
///
/// Select between \p OldVal any value that we know flows from \p BB
/// to a particular phi on the basis of which one (if either) is not
@@ -755,7 +838,7 @@ static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
return OldVal;
}
-/// \brief Create a map from block to value for the operands of a
+/// Create a map from block to value for the operands of a
/// given phi.
///
/// Create a map from block to value for each non-undef value flowing
@@ -774,7 +857,7 @@ static void gatherIncomingValuesToPhi(PHINode *PN,
}
}
-/// \brief Replace the incoming undef values to a phi with the values
+/// Replace the incoming undef values to a phi with the values
/// from a block-to-value map.
///
/// \param PN The phi we are replacing the undefs in.
@@ -794,7 +877,7 @@ static void replaceUndefValuesInPhi(PHINode *PN,
}
}
-/// \brief Replace a value flowing from a block to a phi with
+/// Replace a value flowing from a block to a phi with
/// potentially multiple instances of that value flowing from the
/// block's predecessors to the phi.
///
@@ -861,7 +944,8 @@ static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
/// potential side-effect free intrinsics and the branch. If possible,
/// eliminate BB by rewriting all the predecessors to branch to the successor
/// block and return true. If we can't transform, return false.
-bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
+ DeferredDominance *DDT) {
assert(BB != &BB->getParent()->getEntryBlock() &&
"TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
@@ -900,7 +984,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
}
}
- DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+ LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DDT) {
+ Updates.reserve(1 + (2 * pred_size(BB)));
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ // All predecessors of BB will be moved to Succ.
+ for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ Updates.push_back({DominatorTree::Delete, *I, BB});
+ // This predecessor of BB may already have Succ as a successor.
+ if (llvm::find(successors(*I), Succ) == succ_end(*I))
+ Updates.push_back({DominatorTree::Insert, *I, Succ});
+ }
+ }
if (isa<PHINode>(Succ->begin())) {
// If there is more than one pred of succ, and there are PHI nodes in
@@ -946,7 +1043,13 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
// Everything that jumped to BB now goes to Succ.
BB->replaceAllUsesWith(Succ);
if (!Succ->hasName()) Succ->takeName(BB);
- BB->eraseFromParent(); // Delete the old basic block.
+
+ if (DDT) {
+ DDT->deleteBB(BB); // Deferred deletion of the old basic block.
+ DDT->applyUpdates(Updates);
+ } else {
+ BB->eraseFromParent(); // Delete the old basic block.
+ }
return true;
}
@@ -1125,6 +1228,31 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
return false;
}
+/// Check if the alloc size of \p ValTy is large enough to cover the variable
+/// (or fragment of the variable) described by \p DII.
+///
+/// This is primarily intended as a helper for the different
+/// ConvertDebugDeclareToDebugValue functions. The dbg.declare/dbg.addr that is
+/// converted describes an alloca'd variable, so we need to use the
+/// alloc size of the value when doing the comparison. E.g. an i1 value will be
+/// identified as covering an n-bit fragment, if the store size of i1 is at
+/// least n bits.
+static bool valueCoversEntireFragment(Type *ValTy, DbgInfoIntrinsic *DII) {
+ const DataLayout &DL = DII->getModule()->getDataLayout();
+ uint64_t ValueSize = DL.getTypeAllocSizeInBits(ValTy);
+ if (auto FragmentSize = DII->getFragmentSizeInBits())
+ return ValueSize >= *FragmentSize;
+ // We can't always calculate the size of the DI variable (e.g. if it is a
+ // VLA). Try to use the size of the alloca that the dbg intrinsic describes
+  // instead.
+ if (DII->isAddressOfVariable())
+ if (auto *AI = dyn_cast_or_null<AllocaInst>(DII->getVariableLocation()))
+ if (auto FragmentSize = AI->getAllocationSizeInBits(DL))
+ return ValueSize >= *FragmentSize;
+ // Could not determine size of variable. Conservatively return false.
+ return false;
+}
+
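
valueCoversEntireFragment above compares the alloc size of the stored value against the size of the described fragment (or, failing that, of the alloca). A small sketch of just that comparison, with the sizes passed in directly since there is no DataLayout here; the i1 example assumes a typical layout where an allocated i1 occupies one byte:

#include <cstdint>
#include <cstdio>
#include <optional>

static bool valueCoversFragment(uint64_t ValueAllocBits,
                                std::optional<uint64_t> FragmentBits) {
  if (FragmentBits)
    return ValueAllocBits >= *FragmentBits;
  return false; // variable size unknown: be conservative
}

int main() {
  // An i1 store: the store size is 1 bit, but the alloc size is 8 bits, so an
  // 8-bit fragment counts as covered while a 32-bit one does not.
  std::printf("%d\n", valueCoversFragment(8, std::optional<uint64_t>(8)));
  std::printf("%d\n", valueCoversFragment(8, std::optional<uint64_t>(32)));
  return 0;
}
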
/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
@@ -1135,6 +1263,21 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
auto *DIExpr = DII->getExpression();
Value *DV = SI->getOperand(0);
+ if (!valueCoversEntireFragment(SI->getValueOperand()->getType(), DII)) {
+ // FIXME: If storing to a part of the variable described by the dbg.declare,
+ // then we want to insert a dbg.value for the corresponding fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ // For now, when there is a store to parts of the variable (but we do not
+    // know which part) we insert a dbg.value intrinsic to indicate that we
+ // know nothing about the variable's content.
+ DV = UndefValue::get(DV->getType());
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, DII->getDebugLoc(),
+ SI);
+ return;
+ }
+
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
Argument *ExtendedArg = nullptr;
@@ -1178,6 +1321,15 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
if (LdStHasDebugValue(DIVar, DIExpr, LI))
return;
+ if (!valueCoversEntireFragment(LI->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
// We are now tracking the loaded value instead of the address. In the
// future if multi-location support is added to the IR, it might be
// preferable to keep tracking both the loaded value and the original
@@ -1198,6 +1350,15 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
if (PhiHasDebugValue(DIVar, DIExpr, APN))
return;
+ if (!valueCoversEntireFragment(APN->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
BasicBlock *BB = APN->getParent();
auto InsertionPt = BB->getFirstInsertionPt();
@@ -1237,33 +1398,91 @@ bool llvm::LowerDbgDeclare(Function &F) {
// stored on the stack, while the dbg.declare can only describe
// the stack slot (and at a lexical-scope granularity). Later
// passes will attempt to elide the stack slot.
- if (AI && !isArray(AI)) {
- for (auto &AIUse : AI->uses()) {
- User *U = AIUse.getUser();
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (AIUse.getOperandNo() == 1)
- ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
- } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
- } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
- // This is a call by-value or some other instruction that
- // takes a pointer to the variable. Insert a *value*
- // intrinsic that describes the alloca.
- DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(),
- DDI->getExpression(), DDI->getDebugLoc(),
- CI);
- }
+ if (!AI || isArray(AI))
+ continue;
+
+ // A volatile load/store means that the alloca can't be elided anyway.
+ if (llvm::any_of(AI->users(), [](User *U) -> bool {
+ if (LoadInst *LI = dyn_cast<LoadInst>(U))
+ return LI->isVolatile();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U))
+ return SI->isVolatile();
+ return false;
+ }))
+ continue;
+
+ for (auto &AIUse : AI->uses()) {
+ User *U = AIUse.getUser();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (AIUse.getOperandNo() == 1)
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+ } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
+ // This is a call by-value or some other instruction that takes a
+ // pointer to the variable. Insert a *value* intrinsic that describes
+ // the variable by dereferencing the alloca.
+ auto *DerefExpr =
+ DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
+ DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr,
+ DDI->getDebugLoc(), CI);
}
- DDI->eraseFromParent();
}
+ DDI->eraseFromParent();
}
return true;
}
+/// Propagate dbg.value intrinsics through the newly inserted PHIs.
+void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
+ SmallVectorImpl<PHINode *> &InsertedPHIs) {
+ assert(BB && "No BasicBlock to clone dbg.value(s) from.");
+ if (InsertedPHIs.size() == 0)
+ return;
+
+ // Map existing PHI nodes to their dbg.values.
+ ValueToValueMapTy DbgValueMap;
+ for (auto &I : *BB) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+ if (DbgValueMap.size() == 0)
+ return;
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = BB->getContext();
+ for (auto PHI : InsertedPHIs) {
+ BasicBlock *Parent = PHI->getParent();
+ // Avoid inserting an intrinsic into an EH block.
+ if (Parent->getFirstNonPHI()->isEHPad())
+ continue;
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ NewDbgII->setOperand(0, PhiMAV);
+ auto InsertionPt = Parent->getFirstInsertionPt();
+ assert(InsertionPt != Parent->end() && "Ill-formed basic block");
+ NewDbgII->insertBefore(&*InsertionPt);
+ }
+ }
+ }
+}
+
/// Finds all intrinsics declaring local variables as living in the memory that
/// 'V' points to. This may include a mix of dbg.declare and
/// dbg.addr intrinsics.
TinyPtrVector<DbgInfoIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return {};
auto *L = LocalAsMetadata::getIfExists(V);
if (!L)
return {};
@@ -1282,6 +1501,10 @@ TinyPtrVector<DbgInfoIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
}
void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
if (auto *L = LocalAsMetadata::getIfExists(V))
if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
for (User *U : MDV->users())
@@ -1289,8 +1512,12 @@ void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
DbgValues.push_back(DVI);
}
-static void findDbgUsers(SmallVectorImpl<DbgInfoIntrinsic *> &DbgUsers,
- Value *V) {
+void llvm::findDbgUsers(SmallVectorImpl<DbgInfoIntrinsic *> &DbgUsers,
+ Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
if (auto *L = LocalAsMetadata::getIfExists(V))
if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
for (User *U : MDV->users())
@@ -1308,11 +1535,11 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
auto *DIExpr = DII->getExpression();
assert(DIVar && "Missing variable");
DIExpr = DIExpression::prepend(DIExpr, DerefBefore, Offset, DerefAfter);
- // Insert llvm.dbg.declare immediately after InsertBefore, and remove old
+ // Insert llvm.dbg.declare immediately before InsertBefore, and remove old
// llvm.dbg.declare.
Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);
if (DII == InsertBefore)
- InsertBefore = &*std::next(InsertBefore->getIterator());
+ InsertBefore = InsertBefore->getNextNode();
DII->eraseFromParent();
}
return !DbgAddrs.empty();
@@ -1364,66 +1591,293 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
}
}
-void llvm::salvageDebugInfo(Instruction &I) {
- SmallVector<DbgValueInst *, 1> DbgValues;
+/// Wrap \p V in a ValueAsMetadata instance.
+static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) {
+ return MetadataAsValue::get(C, ValueAsMetadata::get(V));
+}
+
+bool llvm::salvageDebugInfo(Instruction &I) {
+ SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ if (DbgUsers.empty())
+ return false;
+
auto &M = *I.getModule();
+ auto &DL = M.getDataLayout();
+ auto &Ctx = I.getContext();
+ auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); };
- auto wrapMD = [&](Value *V) {
- return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
+ auto doSalvage = [&](DbgInfoIntrinsic *DII, SmallVectorImpl<uint64_t> &Ops) {
+ auto *DIExpr = DII->getExpression();
+ if (!Ops.empty()) {
+ // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
+ // are implicitly pointing out the value as a DWARF memory location
+ // description.
+ bool WithStackValue = isa<DbgValueInst>(DII);
+ DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
+ }
+ DII->setOperand(0, wrapMD(I.getOperand(0)));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
};
- auto applyOffset = [&](DbgValueInst *DVI, uint64_t Offset) {
- auto *DIExpr = DVI->getExpression();
- DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset,
- DIExpression::NoDeref,
- DIExpression::WithStackValue);
- DVI->setOperand(0, wrapMD(I.getOperand(0)));
- DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
- DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ auto applyOffset = [&](DbgInfoIntrinsic *DII, uint64_t Offset) {
+ SmallVector<uint64_t, 8> Ops;
+ DIExpression::appendOffset(Ops, Offset);
+ doSalvage(DII, Ops);
};
- if (isa<BitCastInst>(&I) || isa<IntToPtrInst>(&I)) {
- // Bitcasts are entirely irrelevant for debug info. Rewrite dbg.value,
- // dbg.addr, and dbg.declare to use the cast's source.
- SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, &I);
+ auto applyOps = [&](DbgInfoIntrinsic *DII,
+ std::initializer_list<uint64_t> Opcodes) {
+ SmallVector<uint64_t, 8> Ops(Opcodes);
+ doSalvage(DII, Ops);
+ };
+
+ if (auto *CI = dyn_cast<CastInst>(&I)) {
+ if (!CI->isNoopCast(DL))
+ return false;
+
+ // No-op casts are irrelevant for debug info.
+ MetadataAsValue *CastSrc = wrapMD(I.getOperand(0));
for (auto *DII : DbgUsers) {
- DII->setOperand(0, wrapMD(I.getOperand(0)));
- DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
+ DII->setOperand(0, CastSrc);
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
}
+ return true;
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- findDbgValues(DbgValues, &I);
- for (auto *DVI : DbgValues) {
- unsigned BitWidth =
- M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace());
- APInt Offset(BitWidth, 0);
- // Rewrite a constant GEP into a DIExpression. Since we are performing
- // arithmetic to compute the variable's *value* in the DIExpression, we
- // need to mark the expression with a DW_OP_stack_value.
- if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset))
- // GEP offsets are i32 and thus always fit into an int64_t.
- applyOffset(DVI, Offset.getSExtValue());
- }
+ unsigned BitWidth =
+ M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace());
+ // Rewrite a constant GEP into a DIExpression. Since we are performing
+ // arithmetic to compute the variable's *value* in the DIExpression, we
+ // need to mark the expression with a DW_OP_stack_value.
+ APInt Offset(BitWidth, 0);
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset))
+ for (auto *DII : DbgUsers)
+ applyOffset(DII, Offset.getSExtValue());
+ return true;
} else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
- if (BI->getOpcode() == Instruction::Add)
- if (auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1)))
- if (ConstInt->getBitWidth() <= 64) {
- APInt Offset = ConstInt->getValue();
- findDbgValues(DbgValues, &I);
- for (auto *DVI : DbgValues)
- applyOffset(DVI, Offset.getSExtValue());
- }
+ // Rewrite binary operations with constant integer operands.
+ auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!ConstInt || ConstInt->getBitWidth() > 64)
+ return false;
+
+ uint64_t Val = ConstInt->getSExtValue();
+ for (auto *DII : DbgUsers) {
+ switch (BI->getOpcode()) {
+ case Instruction::Add:
+ applyOffset(DII, Val);
+ break;
+ case Instruction::Sub:
+ applyOffset(DII, -int64_t(Val));
+ break;
+ case Instruction::Mul:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_mul});
+ break;
+ case Instruction::SDiv:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_div});
+ break;
+ case Instruction::SRem:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_mod});
+ break;
+ case Instruction::Or:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_or});
+ break;
+ case Instruction::And:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_and});
+ break;
+ case Instruction::Xor:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_xor});
+ break;
+ case Instruction::Shl:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shl});
+ break;
+ case Instruction::LShr:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shr});
+ break;
+ case Instruction::AShr:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shra});
+ break;
+ default:
+ // TODO: Salvage constants from each kind of binop we know about.
+ return false;
+ }
+ }
+ return true;
} else if (isa<LoadInst>(&I)) {
- findDbgValues(DbgValues, &I);
- for (auto *DVI : DbgValues) {
+ MetadataAsValue *AddrMD = wrapMD(I.getOperand(0));
+ for (auto *DII : DbgUsers) {
// Rewrite the load into DW_OP_deref.
- auto *DIExpr = DVI->getExpression();
+ auto *DIExpr = DII->getExpression();
DIExpr = DIExpression::prepend(DIExpr, DIExpression::WithDeref);
- DVI->setOperand(0, wrapMD(I.getOperand(0)));
- DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
- DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ DII->setOperand(0, AddrMD);
+ DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
+ }
+ return true;
+ }
+ return false;
+}
+
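
The binary-operator cases in salvageDebugInfo above recover a dying instruction's value by appending DWARF-style opcodes to the user's DIExpression: the salvaged operand is the implicit top of stack, and {DW_OP_constu, Val, DW_OP_<op>} recomputes the result. A tiny standalone evaluator for that stack discipline (opcode names and values here are illustrative, not the real DWARF constants):

#include <cstdint>
#include <cstdio>
#include <vector>

enum Op : uint64_t { OP_constu, OP_plus, OP_mul };

static int64_t eval(int64_t Base, const std::vector<uint64_t> &Expr) {
  std::vector<int64_t> Stack = {Base}; // salvaged operand starts on the stack
  for (size_t I = 0; I < Expr.size(); ++I) {
    switch (Expr[I]) {
    case OP_constu: Stack.push_back((int64_t)Expr[++I]); break;
    case OP_plus: { int64_t B = Stack.back(); Stack.pop_back(); Stack.back() += B; break; }
    case OP_mul:  { int64_t B = Stack.back(); Stack.pop_back(); Stack.back() *= B; break; }
    }
  }
  return Stack.back();
}

int main() {
  // "%y = mul %x, 4": point the dbg.value at %x and append {constu 4, mul}
  // (plus DW_OP_stack_value for dbg.value users) to its expression.
  int64_t X = 10;
  std::printf("salvaged value = %lld\n",
              (long long)eval(X, {OP_constu, 4, OP_mul}));
  return 0;
}
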
+/// A replacement for a dbg.value expression.
+using DbgValReplacement = Optional<DIExpression *>;
+
+/// Point debug users of \p From to \p To using exprs given by \p RewriteExpr,
+/// possibly moving/deleting users to prevent use-before-def. Returns true if
+/// changes are made.
+static bool rewriteDebugUsers(
+ Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
+ function_ref<DbgValReplacement(DbgInfoIntrinsic &DII)> RewriteExpr) {
+ // Find debug users of From.
+ SmallVector<DbgInfoIntrinsic *, 1> Users;
+ findDbgUsers(Users, &From);
+ if (Users.empty())
+ return false;
+
+ // Prevent use-before-def of To.
+ bool Changed = false;
+ SmallPtrSet<DbgInfoIntrinsic *, 1> DeleteOrSalvage;
+ if (isa<Instruction>(&To)) {
+ bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
+
+ for (auto *DII : Users) {
+ // It's common to see a debug user between From and DomPoint. Move it
+ // after DomPoint to preserve the variable update without any reordering.
+ if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) {
+ LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n');
+ DII->moveAfter(&DomPoint);
+ Changed = true;
+
+ // Users which otherwise aren't dominated by the replacement value must
+ // be salvaged or deleted.
+ } else if (!DT.dominates(&DomPoint, DII)) {
+ DeleteOrSalvage.insert(DII);
+ }
}
}
+
+ // Update debug users without use-before-def risk.
+ for (auto *DII : Users) {
+ if (DeleteOrSalvage.count(DII))
+ continue;
+
+ LLVMContext &Ctx = DII->getContext();
+ DbgValReplacement DVR = RewriteExpr(*DII);
+ if (!DVR)
+ continue;
+
+ DII->setOperand(0, wrapValueInMetadata(Ctx, &To));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, *DVR));
+ LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n');
+ Changed = true;
+ }
+
+ if (!DeleteOrSalvage.empty()) {
+ // Try to salvage the remaining debug users.
+ Changed |= salvageDebugInfo(From);
+
+ // Delete the debug users which weren't salvaged.
+ for (auto *DII : DeleteOrSalvage) {
+ if (DII->getVariableLocation() == &From) {
+ LLVM_DEBUG(dbgs() << "Erased UseBeforeDef: " << *DII << '\n');
+ DII->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+/// Check if a bitcast from a value of type \p FromTy to type \p ToTy would
+/// losslessly preserve the bits and semantics of the value. This predicate is
+/// symmetric, i.e. swapping \p FromTy and \p ToTy should give the same result.
+///
+/// Note that Type::canLosslesslyBitCastTo is not suitable here because it
+/// allows semantically inequivalent bitcasts, such as <2 x i64> -> <4 x i32>,
+/// and also does not allow lossless pointer <-> integer conversions.
+static bool isBitCastSemanticsPreserving(const DataLayout &DL, Type *FromTy,
+ Type *ToTy) {
+ // Trivially compatible types.
+ if (FromTy == ToTy)
+ return true;
+
+ // Handle compatible pointer <-> integer conversions.
+ if (FromTy->isIntOrPtrTy() && ToTy->isIntOrPtrTy()) {
+ bool SameSize = DL.getTypeSizeInBits(FromTy) == DL.getTypeSizeInBits(ToTy);
+ bool LosslessConversion = !DL.isNonIntegralPointerType(FromTy) &&
+ !DL.isNonIntegralPointerType(ToTy);
+ return SameSize && LosslessConversion;
+ }
+
+ // TODO: This is not exhaustive.
+ return false;
+}
+
+bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
+ Instruction &DomPoint, DominatorTree &DT) {
+ // Exit early if From has no debug users.
+ if (!From.isUsedByMetadata())
+ return false;
+
+ assert(&From != &To && "Can't replace something with itself");
+
+ Type *FromTy = From.getType();
+ Type *ToTy = To.getType();
+
+ auto Identity = [&](DbgInfoIntrinsic &DII) -> DbgValReplacement {
+ return DII.getExpression();
+ };
+
+ // Handle no-op conversions.
+ Module &M = *From.getModule();
+ const DataLayout &DL = M.getDataLayout();
+ if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // Handle integer-to-integer widening and narrowing.
+ // FIXME: Use DW_OP_convert when it's available everywhere.
+ if (FromTy->isIntegerTy() && ToTy->isIntegerTy()) {
+ uint64_t FromBits = FromTy->getPrimitiveSizeInBits();
+ uint64_t ToBits = ToTy->getPrimitiveSizeInBits();
+ assert(FromBits != ToBits && "Unexpected no-op conversion");
+
+ // When the width of the result grows, assume that a debugger will only
+ // access the low `FromBits` bits when inspecting the source variable.
+ if (FromBits < ToBits)
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // The width of the result has shrunk. Use sign/zero extension to describe
+ // the source variable's high bits.
+ auto SignOrZeroExt = [&](DbgInfoIntrinsic &DII) -> DbgValReplacement {
+ DILocalVariable *Var = DII.getVariable();
+
+ // Without knowing signedness, sign/zero extension isn't possible.
+ auto Signedness = Var->getSignedness();
+ if (!Signedness)
+ return None;
+
+ bool Signed = *Signedness == DIBasicType::Signedness::Signed;
+
+ if (!Signed) {
+ // In the unsigned case, assume that a debugger will initialize the
+ // high bits to 0 and do a no-op conversion.
+ return Identity(DII);
+ } else {
+ // In the signed case, the high bits are given by sign extension, i.e:
+ // (To >> (ToBits - 1)) * ((2 ^ FromBits) - 1)
+ // Calculate the high bits and OR them together with the low bits.
+ SmallVector<uint64_t, 8> Ops({dwarf::DW_OP_dup, dwarf::DW_OP_constu,
+ (ToBits - 1), dwarf::DW_OP_shr,
+ dwarf::DW_OP_lit0, dwarf::DW_OP_not,
+ dwarf::DW_OP_mul, dwarf::DW_OP_or});
+ return DIExpression::appendToStack(DII.getExpression(), Ops);
+ }
+ };
+ return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt);
+ }
+
+ // TODO: Floating-point conversions, vectors.
+ return false;
}
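
replaceAllDbgUsesWith above splits the integer case three ways, and only the signed-narrowing case needs extra expression ops. A small decision sketch of that logic (standalone C++ with hypothetical names; the actual expression rewriting is in the hunk above):

#include <cstdint>
#include <cstdio>
#include <optional>

enum class Signedness { Signed, Unsigned };

static const char *rewriteKind(uint64_t FromBits, uint64_t ToBits,
                               std::optional<Signedness> Sign) {
  if (FromBits < ToBits)
    return "identity (debugger reads only the low FromBits bits)";
  if (!Sign)
    return "skip the user (signedness of the variable is unknown)";
  return *Sign == Signedness::Unsigned
             ? "identity (high bits assumed zero)"
             : "append sign-extension ops to the DIExpression";
}

int main() {
  std::printf("widen:           %s\n", rewriteKind(32, 64, std::nullopt));
  std::printf("narrow, signed:  %s\n", rewriteKind(64, 32, Signedness::Signed));
  std::printf("narrow, unknown: %s\n", rewriteKind(64, 32, std::nullopt));
  return 0;
}
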
unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
@@ -1448,13 +1902,19 @@ unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
}
unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
- bool PreserveLCSSA) {
+ bool PreserveLCSSA, DeferredDominance *DDT) {
BasicBlock *BB = I->getParent();
+ std::vector <DominatorTree::UpdateType> Updates;
+
// Loop over all of the successors, removing BB's entry from any PHI
// nodes.
- for (BasicBlock *Successor : successors(BB))
+ if (DDT)
+ Updates.reserve(BB->getTerminator()->getNumSuccessors());
+ for (BasicBlock *Successor : successors(BB)) {
Successor->removePredecessor(BB, PreserveLCSSA);
-
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Successor});
+ }
// Insert a call to llvm.trap right before this. This turns the undefined
// behavior into a hard fail instead of falling through into random code.
if (UseLLVMTrap) {
@@ -1474,11 +1934,13 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
BB->getInstList().erase(BBI++);
++NumInstrsRemoved;
}
+ if (DDT)
+ DDT->applyUpdates(Updates);
return NumInstrsRemoved;
}
/// changeToCall - Convert the specified invoke into a normal call.
-static void changeToCall(InvokeInst *II) {
+static void changeToCall(InvokeInst *II, DeferredDominance *DDT = nullptr) {
SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end());
SmallVector<OperandBundleDef, 1> OpBundles;
II->getOperandBundlesAsDefs(OpBundles);
@@ -1491,11 +1953,16 @@ static void changeToCall(InvokeInst *II) {
II->replaceAllUsesWith(NewCall);
// Follow the call by a branch to the normal destination.
- BranchInst::Create(II->getNormalDest(), II);
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BranchInst::Create(NormalDestBB, II);
// Update PHI nodes in the unwind destination
- II->getUnwindDest()->removePredecessor(II->getParent());
+ BasicBlock *BB = II->getParent();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ UnwindDestBB->removePredecessor(BB);
II->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, UnwindDestBB);
}
BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
@@ -1536,7 +2003,8 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
}
static bool markAliveBlocks(Function &F,
- SmallPtrSetImpl<BasicBlock*> &Reachable) {
+ SmallPtrSetImpl<BasicBlock*> &Reachable,
+ DeferredDominance *DDT = nullptr) {
SmallVector<BasicBlock*, 128> Worklist;
BasicBlock *BB = &F.front();
Worklist.push_back(BB);
@@ -1549,41 +2017,44 @@ static bool markAliveBlocks(Function &F,
// instructions into LLVM unreachable insts. The instruction combining pass
// canonicalizes unreachable insts into stores to null or undef.
for (Instruction &I : *BB) {
- // Assumptions that are known to be false are equivalent to unreachable.
- // Also, if the condition is undefined, then we make the choice most
- // beneficial to the optimizer, and choose that to also be unreachable.
- if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
- if (II->getIntrinsicID() == Intrinsic::assume) {
- if (match(II->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
- // Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(II, false);
- Changed = true;
- break;
- }
- }
-
- if (II->getIntrinsicID() == Intrinsic::experimental_guard) {
- // A call to the guard intrinsic bails out of the current compilation
- // unit if the predicate passed to it is false. If the predicate is a
- // constant false, then we know the guard will bail out of the current
- // compile unconditionally, so all code following it is dead.
- //
- // Note: unlike in llvm.assume, it is not "obviously profitable" for
- // guards to treat `undef` as `false` since a guard on `undef` can
- // still be useful for widening.
- if (match(II->getArgOperand(0), m_Zero()))
- if (!isa<UnreachableInst>(II->getNextNode())) {
- changeToUnreachable(II->getNextNode(), /*UseLLVMTrap=*/ false);
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ Value *Callee = CI->getCalledValue();
+ // Handle intrinsic calls.
+ if (Function *F = dyn_cast<Function>(Callee)) {
+ auto IntrinsicID = F->getIntrinsicID();
+ // Assumptions that are known to be false are equivalent to
+ // unreachable. Also, if the condition is undefined, then we make the
+ // choice most beneficial to the optimizer, and choose that to also be
+ // unreachable.
+ if (IntrinsicID == Intrinsic::assume) {
+ if (match(CI->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(CI, false, false, DDT);
Changed = true;
break;
}
- }
- }
-
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- Value *Callee = CI->getCalledValue();
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- changeToUnreachable(CI, /*UseLLVMTrap=*/false);
+ } else if (IntrinsicID == Intrinsic::experimental_guard) {
+ // A call to the guard intrinsic bails out of the current
+ // compilation unit if the predicate passed to it is false. If the
+ // predicate is a constant false, then we know the guard will bail
+ // out of the current compile unconditionally, so all code following
+ // it is dead.
+ //
+ // Note: unlike in llvm.assume, it is not "obviously profitable" for
+ // guards to treat `undef` as `false` since a guard on `undef` can
+ // still be useful for widening.
+ if (match(CI->getArgOperand(0), m_Zero()))
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
+ changeToUnreachable(CI->getNextNode(), /*UseLLVMTrap=*/false,
+ false, DDT);
+ Changed = true;
+ break;
+ }
+ }
+ } else if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(CI->getFunction())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(CI, /*UseLLVMTrap=*/false, false, DDT);
Changed = true;
break;
}
@@ -1593,17 +2064,16 @@ static bool markAliveBlocks(Function &F,
// though.
if (!isa<UnreachableInst>(CI->getNextNode())) {
// Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(CI->getNextNode(), false);
+ changeToUnreachable(CI->getNextNode(), false, false, DDT);
Changed = true;
}
break;
}
- }
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ // Store to undef and store to null are undefined and used to signal
+ // that they should be changed to unreachable by passes that can't
+ // modify the CFG.
- // Store to undef and store to null are undefined and used to signal that
- // they should be changed to unreachable by passes that can't modify the
- // CFG.
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
// Don't touch volatile stores.
if (SI->isVolatile()) continue;
@@ -1611,8 +2081,9 @@ static bool markAliveBlocks(Function &F,
if (isa<UndefValue>(Ptr) ||
(isa<ConstantPointerNull>(Ptr) &&
- SI->getPointerAddressSpace() == 0)) {
- changeToUnreachable(SI, true);
+ !NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace()))) {
+ changeToUnreachable(SI, true, false, DDT);
Changed = true;
break;
}
@@ -1623,17 +2094,23 @@ static bool markAliveBlocks(Function &F,
if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
// Turn invokes that call 'nounwind' functions into ordinary calls.
Value *Callee = II->getCalledValue();
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- changeToUnreachable(II, true);
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(BB->getParent())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(II, true, false, DDT);
Changed = true;
} else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {
if (II->use_empty() && II->onlyReadsMemory()) {
// jump to the normal destination branch.
- BranchInst::Create(II->getNormalDest(), II);
- II->getUnwindDest()->removePredecessor(II->getParent());
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ BranchInst::Create(NormalDestBB, II);
+ UnwindDestBB->removePredecessor(II->getParent());
II->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, UnwindDestBB);
} else
- changeToCall(II);
+ changeToCall(II, DDT);
Changed = true;
}
} else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
@@ -1679,7 +2156,7 @@ static bool markAliveBlocks(Function &F,
}
}
- Changed |= ConstantFoldTerminator(BB, true);
+ Changed |= ConstantFoldTerminator(BB, true, nullptr, DDT);
for (BasicBlock *Successor : successors(BB))
if (Reachable.insert(Successor).second)
Worklist.push_back(Successor);
@@ -1687,11 +2164,11 @@ static bool markAliveBlocks(Function &F,
return Changed;
}
-void llvm::removeUnwindEdge(BasicBlock *BB) {
+void llvm::removeUnwindEdge(BasicBlock *BB, DeferredDominance *DDT) {
TerminatorInst *TI = BB->getTerminator();
if (auto *II = dyn_cast<InvokeInst>(TI)) {
- changeToCall(II);
+ changeToCall(II, DDT);
return;
}
@@ -1719,15 +2196,18 @@ void llvm::removeUnwindEdge(BasicBlock *BB) {
UnwindDest->removePredecessor(BB);
TI->replaceAllUsesWith(NewTI);
TI->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, UnwindDest);
}
/// removeUnreachableBlocks - Remove blocks that are not reachable, even
/// if they are in a dead cycle. Return true if a change was made, false
/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo
/// after modifying the CFG.
-bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
+bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI,
+ DeferredDominance *DDT) {
SmallPtrSet<BasicBlock*, 16> Reachable;
- bool Changed = markAliveBlocks(F, Reachable);
+ bool Changed = markAliveBlocks(F, Reachable, DDT);
// If there are unreachable blocks in the CFG...
if (Reachable.size() == F.size())
@@ -1737,25 +2217,39 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
NumRemoved += F.size()-Reachable.size();
// Loop over all of the basic blocks that are not reachable, dropping all of
- // their internal references...
- for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
- if (Reachable.count(&*BB))
+ // their internal references. Update DDT and LVI if available.
+ std::vector <DominatorTree::UpdateType> Updates;
+ for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ++I) {
+ auto *BB = &*I;
+ if (Reachable.count(BB))
continue;
-
- for (BasicBlock *Successor : successors(&*BB))
+ for (BasicBlock *Successor : successors(BB)) {
if (Reachable.count(Successor))
- Successor->removePredecessor(&*BB);
+ Successor->removePredecessor(BB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Successor});
+ }
if (LVI)
- LVI->eraseBlock(&*BB);
+ LVI->eraseBlock(BB);
BB->dropAllReferences();
}
- for (Function::iterator I = ++F.begin(); I != F.end();)
- if (!Reachable.count(&*I))
- I = F.getBasicBlockList().erase(I);
- else
+ for (Function::iterator I = ++F.begin(); I != F.end();) {
+ auto *BB = &*I;
+ if (Reachable.count(BB)) {
++I;
+ continue;
+ }
+ if (DDT) {
+ DDT->deleteBB(BB); // deferred deletion of BB.
+ ++I;
+ } else {
+ I = F.getBasicBlockList().erase(I);
+ }
+ }
+ if (DDT)
+ DDT->applyUpdates(Updates);
return true;
}
@@ -1848,8 +2342,8 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To,
if (!Dominates(Root, U))
continue;
U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
- << *To << " in " << *U << "\n");
+ LLVM_DEBUG(dbgs() << "Replace dominated use of '" << From->getName()
+ << "' as " << *To << " in " << *U << "\n");
++Count;
}
return Count;
@@ -1953,7 +2447,7 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
if (!NewTy->isPointerTy())
return;
- unsigned BitWidth = DL.getTypeSizeInBits(NewTy);
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(NewTy);
if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
MDNode *NN = MDNode::get(OldLI.getContext(), None);
NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
@@ -2265,7 +2759,7 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
// Static allocas (constant size in the entry block) are handled by
// prologue/epilogue insertion so they're free anyway. We definitely don't
// want to make them non-constant.
- return !dyn_cast<AllocaInst>(I)->isStaticAlloca();
+ return !cast<AllocaInst>(I)->isStaticAlloca();
case Instruction::GetElementPtr:
if (OpIdx == 0)
return true;
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
new file mode 100644
index 000000000000..6e92e679f999
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -0,0 +1,645 @@
+//===----------------- LoopRotationUtils.cpp -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utilities to convert a loop into a loop with bottom test.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+STATISTIC(NumRotated, "Number of loops rotated");
+
+namespace {
+/// A simple loop rotation transformation.
+class LoopRotate {
+ const unsigned MaxHeaderSize;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ const SimplifyQuery &SQ;
+ bool RotationOnly;
+ bool IsUtilMode;
+
+public:
+ LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ,
+ bool RotationOnly, bool IsUtilMode)
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ SQ(SQ), RotationOnly(RotationOnly), IsUtilMode(IsUtilMode) {}
+ bool processLoop(Loop *L);
+
+private:
+ bool rotateLoop(Loop *L, bool SimplifiedLatch);
+ bool simplifyLoopLatch(Loop *L);
+};
+} // end anonymous namespace
+
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader. If there were uses of the values produced by
+/// these instruction that were outside of the loop, we have to insert PHI nodes
+/// to merge the two values. Do this now.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+ BasicBlock *OrigPreheader,
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
+ // Remove PHI node entries that are no longer live.
+ BasicBlock::iterator I, E = OrigHeader->end();
+ for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+ // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+ // as necessary.
+ SSAUpdater SSA(InsertedPHIs);
+ for (I = OrigHeader->begin(); I != E; ++I) {
+ Value *OrigHeaderVal = &*I;
+
+ // If there are no uses of the value (e.g. because it returns void), there
+ // is nothing to rewrite.
+ if (OrigHeaderVal->use_empty())
+ continue;
+
+ Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
+
+    // The value now exists in two versions: the initial value in the preheader
+ // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+ SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+ // Visit each use of the OrigHeader instruction.
+ for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+ UE = OrigHeaderVal->use_end();
+ UI != UE;) {
+ // Grab the use before incrementing the iterator.
+ Use &U = *UI;
+
+ // Increment the iterator before removing the use from the list.
+ ++UI;
+
+ // SSAUpdater can't handle a non-PHI use in the same block as an
+ // earlier def. We can easily handle those cases manually.
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ if (!isa<PHINode>(UserInst)) {
+ BasicBlock *UserBB = UserInst->getParent();
+
+ // The original users in the OrigHeader are already using the
+ // original definitions.
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped.
+ if (UserBB == OrigPreheader) {
+ U = OrigPreHeaderVal;
+ continue;
+ }
+ }
+
+ // Anything else can be handled by SSAUpdater.
+ SSA.RewriteUse(U);
+ }
+
+ // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
+ // intrinsics.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigHeaderVal);
+ for (auto &DbgValue : DbgValues) {
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = DbgValue->getParent();
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB, if not substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ DbgValue->setOperand(0,
+ MetadataAsValue::get(OrigHeaderVal->getContext(),
+ ValueAsMetadata::get(NewVal)));
+ }
+ }
+}
+
+// Look for a phi which is only used outside the loop (via a LCSSA phi)
+// in the exit from the header. This means that rotating the loop can
+// remove the phi.
+static bool shouldRotateLoopExitingLatch(Loop *L) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *HeaderExit = Header->getTerminator()->getSuccessor(0);
+ if (L->contains(HeaderExit))
+ HeaderExit = Header->getTerminator()->getSuccessor(1);
+
+ for (auto &Phi : Header->phis()) {
+ // Look for uses of this phi in the loop/via exits other than the header.
+ if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) {
+ return cast<Instruction>(U)->getParent() != HeaderExit;
+ }))
+ continue;
+ return true;
+ }
+
+ return false;
+}
+
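
shouldRotateLoopExitingLatch above treats rotation as profitable when some header phi is used only in the header's exit block, since that phi disappears once the loop is rotated. A standalone sketch of the per-phi predicate (toy types; the real code expresses the same test as an any_of over users whose parent is not the exit block):

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for a use of a header phi: just the block the user lives in.
struct PhiUse { std::string ParentBlock; };

static bool phiOnlyUsedInExit(const std::vector<PhiUse> &Users,
                              const std::string &ExitBlock) {
  return std::all_of(Users.begin(), Users.end(), [&](const PhiUse &U) {
    return U.ParentBlock == ExitBlock;
  });
}

int main() {
  std::vector<PhiUse> Users = {{"loop.exit"}, {"loop.exit"}};
  std::printf("rotation removes this phi: %d\n",
              phiOnlyUsedInExit(Users, "loop.exit"));
  return 0;
}
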
+/// Rotate loop LP. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+ // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return false;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return false;
+
+ // Rotate if either the loop latch does *not* exit the loop, the loop latch
+ // was just simplified, or we think rotation will be profitable.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
+ !shouldRotateLoopExitingLatch(L))
+ return false;
+
+ // Check the size of the original header and reject the loop if it is very
+ // big or we can't duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(
+ dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.convergent) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize)
+ return false;
+ }
+
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it; just give up.
+ if (!OrigPreheader || !L->hasDedicatedExits())
+ return false;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated. We should also invalidate
+ // all outer loops because insertion and deletion of blocks that happens
+ // during the rotation may violate invariants related to backedge taken
+ // infos in them.
+ if (SE)
+ SE->forgetTopmostLoop(L);
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
+ // Find the new loop header. NewHeader is the header's one and only successor
+ // that is inside the loop; the header's other successor is outside the
+ // loop. Otherwise the loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap;
+
+ // For PHI nodes, the value available in the OrigPreheader is just the
+ // incoming value from the OrigPreheader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+
+ // For the rest of the instructions, either hoist them to the OrigPreheader
+ // if possible or create a clone in the OrigPreheader if not.
+ TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
+ using DbgIntrinsicHash =
+ std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash {
+ return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+ };
+ SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+ for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+ I != E; ++I) {
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I))
+ DbgIntrinsics.insert(makeHash(DII));
+ else
+ break;
+ }
+
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
+
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Avoid inserting the same intrinsic twice.
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
+ if (DbgIntrinsics.count(makeHash(DII))) {
+ C->deleteValue();
+ continue;
+ }
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = SimplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ ValueMap[Inst] = V;
+ if (!C->mayHaveSideEffects()) {
+ C->deleteValue();
+ C = nullptr;
+ }
+ } else {
+ ValueMap[Inst] = C;
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ TerminatorInst *TI = OrigHeader->getTerminator();
+ for (BasicBlock *SuccBB : TI->successors())
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required.
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+ // The OrigPreheader now branches to the NewHeader and Exit. Inform the DT
+ // about the two new edges and about the edge from OrigPreheader to
+ // OrigHeader that was removed.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+ Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+ DT->applyUpdates(Updates);
+ }
+
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit))
+ continue;
+ if (isa<IndirectBrInst>(ExitPred->getTerminator()))
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) DT->deleteEdge(OrigPreheader, Exit);
+ }
+
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+ ++NumRotated;
+ return true;
+}
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+ BasicBlock::iterator End, Loop *L) {
+ bool seenIncrement = false;
+ bool MultiExitLoop = false;
+
+ if (!L->getExitingBlock())
+ MultiExitLoop = true;
+
+ for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+ if (!isSafeToSpeculativelyExecute(&*I))
+ return false;
+
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::GetElementPtr:
+ // GEPs are cheap if all indices are constant.
+ if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+ return false;
+ // fall-thru to increment case
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr: {
+ Value *IVOpnd =
+ !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
+ if (!IVOpnd)
+ return false;
+
+ // If increment operand is used outside of the loop, this speculation
+ // could cause extra live range interference.
+ if (MultiExitLoop) {
+ for (User *UseI : IVOpnd->users()) {
+ auto *UserInst = cast<Instruction>(UseI);
+ if (!L->contains(UserInst))
+ return false;
+ }
+ }
+
+ if (seenIncrement)
+ return false;
+ seenIncrement = true;
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // ignore type conversions
+ break;
+ }
+ }
+ return true;
+}
+
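+// A hypothetical example of the folding performed below (illustrative only):
+// in a simple two-block loop such as
+//
+//   header:                                  ; exiting block
+//     %cmp = icmp slt i32 %i, %n
+//     br i1 %cmp, label %latch, label %exit
+//   latch:
+//     %i.next = add i32 %i, 1               ; the loop "tail"
+//     br label %header
+//
+// the post-increment is speculated into the exiting block (so it also runs on
+// the final trip), the now-empty latch is removed, and the exiting block
+// becomes the new latch, giving the loop a bottom test without duplicating
+// the header.
+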
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || Latch->hasAddressTaken())
+ return false;
+
+ BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!Jmp || !Jmp->isUnconditional())
+ return false;
+
+ BasicBlock *LastExit = Latch->getSinglePredecessor();
+ if (!LastExit || !L->isLoopExiting(LastExit))
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+ if (!BI)
+ return false;
+
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+ << LastExit->getName() << "\n");
+
+ // Hoist the instructions from Latch into LastExit.
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
+ Latch->begin(), Jmp->getIterator());
+
+ unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+ BasicBlock *Header = Jmp->getSuccessor(0);
+ assert(Header == L->getHeader() && "expected a backward branch");
+
+ // Remove Latch from the CFG so that LastExit becomes the new Latch.
+ BI->setSuccessor(FallThruPath, Header);
+ Latch->replaceSuccessorsPhiUsesWith(LastExit);
+ Jmp->eraseFromParent();
+
+ // Nuke the Latch block.
+ assert(Latch->empty() && "unable to evacuate Latch");
+ LI->removeBlock(Latch);
+ if (DT)
+ DT->eraseNode(Latch);
+ Latch->eraseFromParent();
+ return true;
+}
+
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
+ // Save the loop metadata.
+ MDNode *LoopMD = L->getLoopID();
+
+ bool SimplifiedLatch = false;
+
+ // Simplify the loop latch before attempting to rotate the header
+ // upward. Rotation may not be needed if the loop tail can be folded into the
+ // loop exit.
+ if (!RotationOnly)
+ SimplifiedLatch = simplifyLoopLatch(L);
+
+ bool MadeChange = rotateLoop(L, SimplifiedLatch);
+ assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+ "Loop latch should be exiting after loop-rotate.");
+
+ // Restore the loop metadata.
+ // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+ if ((MadeChange || SimplifiedLatch) && LoopMD)
+ L->setLoopID(LoopMD);
+
+ return MadeChange || SimplifiedLatch;
+}
+
+
+/// A utility to convert a loop into a loop with a bottom test.
+bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
+ AssumptionCache *AC, DominatorTree *DT,
+ ScalarEvolution *SE, const SimplifyQuery &SQ,
+ bool RotationOnly = true,
+ unsigned Threshold = unsigned(-1),
+ bool IsUtilMode = true) {
+ LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, SQ, RotationOnly, IsUtilMode);
+
+ return LR.processLoop(L);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index f43af9772771..970494eb4704 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -52,6 +52,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -64,9 +65,8 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -141,8 +141,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
if (!PreheaderBB)
return nullptr;
- DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
- << PreheaderBB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
+ << PreheaderBB->getName() << "\n");
// Make sure that NewBB is put someplace intelligent, which doesn't mess up
// code layout too horribly.
@@ -170,7 +170,7 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
} while (!Worklist.empty());
}
-/// \brief The first part of loop-nestification is to find a PHI node that tells
+/// The first part of loop-nestification is to find a PHI node that tells
/// us how to partition the loops.
static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
AssumptionCache *AC) {
@@ -195,7 +195,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
return nullptr;
}
-/// \brief If this loop has multiple backedges, try to pull one of them out into
+/// If this loop has multiple backedges, try to pull one of them out into
/// a nested loop.
///
/// This is important for code that looks like
@@ -242,7 +242,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
OuterLoopPreds.push_back(PN->getIncomingBlock(i));
}
}
- DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
// If ScalarEvolution is around and knows anything about values in
// this loop, tell it to forget them, because we're about to
@@ -332,7 +332,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
return NewOuter;
}
-/// \brief This method is called when the specified loop has more than one
+/// This method is called when the specified loop has more than one
/// backedge in it.
///
/// If this occurs, revector all of these backedges to target a new basic block
@@ -371,8 +371,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
- DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
- << BEBlock->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
+ << BEBlock->getName() << "\n");
// Move the new backedge block to right after the last backedge block.
Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
@@ -457,7 +457,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
return BEBlock;
}
-/// \brief Simplify one loop and queue further loops for simplification.
+/// Simplify one loop and queue further loops for simplification.
static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
@@ -484,8 +484,8 @@ ReprocessLoop:
// Delete each unique out-of-loop (and thus dead) predecessor.
for (BasicBlock *P : BadPreds) {
- DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
- << P->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
+ << P->getName() << "\n");
// Zap the dead pred's terminator and replace it with unreachable.
TerminatorInst *TI = P->getTerminator();
@@ -504,16 +504,13 @@ ReprocessLoop:
if (BI->isConditional()) {
if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
- DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
- << ExitingBlock->getName() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+ << ExitingBlock->getName() << "\n");
BI->setCondition(ConstantInt::get(Cond->getType(),
!L->contains(BI->getSuccessor(0))));
- // This may make the loop analyzable, force SCEV recomputation.
- if (SE)
- SE->forgetLoop(L);
-
Changed = true;
}
}
@@ -617,11 +614,8 @@ ReprocessLoop:
// comparison and the branch.
bool AllInvariant = true;
bool AnyInvariant = false;
- for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) {
+ for (auto I = ExitingBlock->instructionsWithoutDebug().begin(); &*I != BI; ) {
Instruction *Inst = &*I++;
- // Skip debug info intrinsics.
- if (isa<DbgInfoIntrinsic>(Inst))
- continue;
if (Inst == CI)
continue;
if (!L->makeLoopInvariant(Inst, AnyInvariant,
@@ -648,15 +642,8 @@ ReprocessLoop:
// Success. The block is now dead, so remove it from the loop,
// update the dominator tree and delete it.
- DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
- << ExitingBlock->getName() << "\n");
-
- // Notify ScalarEvolution before deleting this block. Currently assume the
- // parent loop doesn't change (spliting edges doesn't count). If blocks,
- // CFG edges, or other values in the parent loop change, then we need call
- // to forgetLoop() for the parent instead.
- if (SE)
- SE->forgetLoop(L);
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
+ << ExitingBlock->getName() << "\n");
assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock));
Changed = true;
@@ -679,6 +666,12 @@ ReprocessLoop:
}
}
+ // Changing exit conditions for blocks may affect exit counts of this loop and
+ // any of its parents, so we must invalidate the entire subtree if we've made
+ // any changes.
+ if (Changed && SE)
+ SE->forgetTopmostLoop(L);
+
return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 92dfb1c7204d..04b8c1417e0a 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -33,7 +34,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
@@ -63,8 +63,7 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
-static inline void remapInstruction(Instruction *I,
- ValueToValueMapTy &VMap) {
+void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
Value *Op = I->getOperand(op);
@@ -97,16 +96,10 @@ static inline void remapInstruction(Instruction *I,
/// Folds a basic block into its predecessor if it only has one predecessor, and
/// that predecessor only has one successor.
-/// The LoopInfo Analysis that is passed will be kept consistent. If folding is
-/// successful references to the containing loop must be removed from
-/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
-/// references to the eliminated BB. The argument ForgottenLoops contains a set
-/// of loops that have already been forgotten to prevent redundant, expensive
-/// calls to ScalarEvolution::forgetLoop. Returns the new combined block.
-static BasicBlock *
-foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
- SmallPtrSetImpl<Loop *> &ForgottenLoops,
- DominatorTree *DT) {
+/// The LoopInfo Analysis that is passed will be kept consistent.
+BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
+ ScalarEvolution *SE,
+ DominatorTree *DT) {
// Merge basic blocks into their predecessor if there is only one distinct
// pred, and if there is only one distinct successor of the predecessor, and
// if there are no PHI nodes.
@@ -116,7 +109,8 @@ foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
return nullptr;
- DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
+ LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
+ << OnlyPred->getName() << "\n");
// Resolve any PHI nodes at the start of the block. They are all
// guaranteed to have exactly one entry if they exist, unless there are
@@ -149,13 +143,6 @@ foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
DT->eraseNode(BB);
}
- // ScalarEvolution holds references to loop exit blocks.
- if (SE) {
- if (Loop *L = LI->getLoopFor(BB)) {
- if (ForgottenLoops.insert(L).second)
- SE->forgetLoop(L);
- }
- }
LI->removeBlock(BB);
// Inherit predecessor's name if it exists...
@@ -265,6 +252,48 @@ static bool isEpilogProfitable(Loop *L) {
return false;
}
+/// Perform some cleanup and simplifications on loops after unrolling. It is
+/// useful to simplify the IVs in the new loop, as well as to do a quick
+/// simplify/DCE pass over the instructions.
+void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC) {
+ // Simplify any new induction variables in the partially unrolled loop.
+ if (SE && SimplifyIVs) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
+
+ // Aggressively clean up dead instructions that simplifyLoopIVs already
+ // identified. Any remaining should be cleaned up below.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ const std::vector<BasicBlock *> &NewLoopBlocks = L->getBlocks();
+ for (BasicBlock *BB : NewLoopBlocks) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+
+ if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
+ if (LI->replacementPreservesLCSSAForm(Inst, V))
+ Inst->replaceAllUsesWith(V);
+ if (isInstructionTriviallyDead(Inst))
+ BB->getInstList().erase(Inst);
+ }
+ }
+
+ // TODO: after peeling or unrolling, previously loop-variant conditions are
+ // likely to fold to constants; eagerly propagating those here will require
+ // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
+ // appropriate.
+}
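+
+// Illustrative recap of how the helper above is used (this mirrors the call
+// site in UnrollLoop below; it is not additional code): IV simplification is
+// only requested when a loop body remains after unrolling or peeling, e.g.
+//
+//   simplifyLoopAfterUnroll(L, !CompletelyUnroll && (Count > 1 || Peeled),
+//                           LI, SE, DT, AC);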
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
@@ -310,19 +339,19 @@ LoopUnrollResult llvm::UnrollLoop(
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
- DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
return LoopUnrollResult::Unmodified;
}
BasicBlock *LatchBlock = L->getLoopLatch();
if (!LatchBlock) {
- DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
return LoopUnrollResult::Unmodified;
}
// Loops with indirectbr cannot be cloned.
if (!L->isSafeToClone()) {
- DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
+ LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
return LoopUnrollResult::Unmodified;
}
@@ -335,8 +364,9 @@ LoopUnrollResult llvm::UnrollLoop(
if (!BI || BI->isUnconditional()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
- DEBUG(dbgs() <<
- " Can't unroll; loop not terminated by a conditional branch.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Can't unroll; loop not terminated by a conditional branch.\n");
return LoopUnrollResult::Unmodified;
}
@@ -345,22 +375,22 @@ LoopUnrollResult llvm::UnrollLoop(
};
if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) {
- DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
- " exiting the loop can be unrolled\n");
+ LLVM_DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
+ " exiting the loop can be unrolled\n");
return LoopUnrollResult::Unmodified;
}
if (Header->hasAddressTaken()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
- DEBUG(dbgs() <<
- " Won't unroll loop: address of header block is taken.\n");
+ LLVM_DEBUG(
+ dbgs() << " Won't unroll loop: address of header block is taken.\n");
return LoopUnrollResult::Unmodified;
}
if (TripCount != 0)
- DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
+ LLVM_DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
if (TripMultiple != 1)
- DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");
+ LLVM_DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");
// Effectively "DCE" unrolled iterations that are beyond the tripcount
// and will never be executed.
@@ -369,7 +399,7 @@ LoopUnrollResult llvm::UnrollLoop(
// Don't enter the unroll code if there is nothing to do.
if (TripCount == 0 && Count < 2 && PeelCount == 0) {
- DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+ LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
return LoopUnrollResult::Unmodified;
}
@@ -403,8 +433,9 @@ LoopUnrollResult llvm::UnrollLoop(
"Did not expect runtime trip-count unrolling "
"and peeling for the same loop");
+ bool Peeled = false;
if (PeelCount) {
- bool Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
+ Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
// Successful peeling may result in a change in the loop preheader/trip
// counts. If we later unroll the loop, we want these to be updated.
@@ -419,7 +450,7 @@ LoopUnrollResult llvm::UnrollLoop(
// Loops containing convergent instructions must have a count that divides
// their TripMultiple.
- DEBUG(
+ LLVM_DEBUG(
{
bool HasConvergent = false;
for (auto &BB : L->blocks())
@@ -442,18 +473,12 @@ LoopUnrollResult llvm::UnrollLoop(
if (Force)
RuntimeTripCount = false;
else {
- DEBUG(
- dbgs() << "Wont unroll; remainder loop could not be generated"
- "when assuming runtime trip count\n");
+ LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
return LoopUnrollResult::Unmodified;
}
}
- // Notify ScalarEvolution that the loop will be substantially changed,
- // if not outright eliminated.
- if (SE)
- SE->forgetLoop(L);
-
// If we know the trip count, we know the multiple...
unsigned BreakoutTrip = 0;
if (TripCount != 0) {
@@ -468,8 +493,8 @@ LoopUnrollResult llvm::UnrollLoop(
using namespace ore;
// Report the unrolling decision.
if (CompletelyUnroll) {
- DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
- << " with trip count " << TripCount << "!\n");
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
+ << " with trip count " << TripCount << "!\n");
if (ORE)
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
@@ -478,8 +503,8 @@ LoopUnrollResult llvm::UnrollLoop(
<< NV("UnrollCount", TripCount) << " iterations";
});
} else if (PeelCount) {
- DEBUG(dbgs() << "PEELING loop %" << Header->getName()
- << " with iteration count " << PeelCount << "!\n");
+ LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
+ << " with iteration count " << PeelCount << "!\n");
if (ORE)
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
@@ -495,31 +520,42 @@ LoopUnrollResult llvm::UnrollLoop(
<< NV("UnrollCount", Count);
};
- DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
- << " by " << Count);
+ LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
+ << Count);
if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
- DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
+ LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
if (ORE)
ORE->emit([&]() {
return DiagBuilder() << " with a breakout at trip "
<< NV("BreakoutTrip", BreakoutTrip);
});
} else if (TripMultiple != 1) {
- DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
if (ORE)
ORE->emit([&]() {
return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
<< " trips per branch";
});
} else if (RuntimeTripCount) {
- DEBUG(dbgs() << " with run-time trip count");
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
if (ORE)
ORE->emit(
[&]() { return DiagBuilder() << " with run-time trip count"; });
}
- DEBUG(dbgs() << "!\n");
+ LLVM_DEBUG(dbgs() << "!\n");
}
+ // We are going to make changes to this loop. SCEV may be keeping cached info
+ // about it, in particular about backedge taken count. The changes we make
+ // are guaranteed to invalidate this information for our loop. It is tempting
+ // to only invalidate the loop being unrolled, but it is incorrect as long as
+ // all exiting branches from all inner loops have impact on the outer loops,
+ // and if something changes inside them then any of outer loops may also
+ // change. When we forget outermost loop, we also forget all contained loops
+ // and this is what we need here.
+ if (SE)
+ SE->forgetTopmostLoop(L);
+
bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
@@ -577,14 +613,9 @@ LoopUnrollResult llvm::UnrollLoop(
"Header should not be in a sub-loop");
// Tell LI about New.
const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
- if (OldLoop) {
+ if (OldLoop)
LoopsToSimplify.insert(NewLoops[OldLoop]);
- // Forget the old loop, since its inputs may have changed.
- if (SE)
- SE->forgetLoop(OldLoop);
- }
-
if (*BB == Header)
// Loop over all of the PHI nodes in the block, changing them to use
// the incoming values from the previous block.
@@ -769,17 +800,15 @@ LoopUnrollResult llvm::UnrollLoop(
}
}
- if (DT && UnrollVerifyDomtree)
- DT->verifyDomTree();
+ assert(!DT || !UnrollVerifyDomtree ||
+ DT->verify(DominatorTree::VerificationLevel::Fast));
// Merge adjacent basic blocks, if possible.
- SmallPtrSet<Loop *, 4> ForgottenLoops;
for (BasicBlock *Latch : Latches) {
BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
if (Term->isUnconditional()) {
BasicBlock *Dest = Term->getSuccessor(0);
- if (BasicBlock *Fold =
- foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) {
+ if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
// Dest has been folded into Fold. Update our worklists accordingly.
std::replace(Latches.begin(), Latches.end(), Dest, Fold);
UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
@@ -789,40 +818,10 @@ LoopUnrollResult llvm::UnrollLoop(
}
}
- // Simplify any new induction variables in the partially unrolled loop.
- if (SE && !CompletelyUnroll && Count > 1) {
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
-
- // Aggressively clean up dead instructions that simplifyLoopIVs already
- // identified. Any remaining should be cleaned up below.
- while (!DeadInsts.empty())
- if (Instruction *Inst =
- dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
- }
-
- // At this point, the code is well formed. We now do a quick sweep over the
- // inserted code, doing constant propagation and dead code elimination as we
- // go.
- const DataLayout &DL = Header->getModule()->getDataLayout();
- const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
- for (BasicBlock *BB : NewLoopBlocks) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
- Instruction *Inst = &*I++;
-
- if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
- if (LI->replacementPreservesLCSSAForm(Inst, V))
- Inst->replaceAllUsesWith(V);
- if (isInstructionTriviallyDead(Inst))
- BB->getInstList().erase(Inst);
- }
- }
-
- // TODO: after peeling or unrolling, previously loop variant conditions are
- // likely to fold to constants, eagerly propagating those here will require
- // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
- // appropriate.
+ // At this point, the code is well formed. We now simplify the unrolled loop,
+ // doing constant propagation and dead code elimination as we go.
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && (Count > 1 || Peeled), LI, SE,
+ DT, AC);
NumCompletelyUnrolled += CompletelyUnroll;
++NumUnrolled;
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
new file mode 100644
index 000000000000..b919f73c3817
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -0,0 +1,785 @@
+//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements loop unroll and jam as a routine, much like
+// LoopUnroll.cpp implements loop unroll.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
+STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops fully unroll and jammed");
+
+typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
+
+// Partition blocks in an outer/inner loop pair into blocks before and after
+// the loop
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+ BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks,
+ DominatorTree *DT) {
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+
+ for (BasicBlock *BB : L->blocks()) {
+ if (!SubLoop->contains(BB)) {
+ if (DT->dominates(SubLoopLatch, BB))
+ AftBlocks.insert(BB);
+ else
+ ForeBlocks.insert(BB);
+ }
+ }
+
+ // Check that all blocks in ForeBlocks together dominate the subloop.
+ // TODO: This might ideally be done better with dominator/postdominator
+ // information.
+ BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
+ for (BasicBlock *BB : ForeBlocks) {
+ if (BB == SubLoopPreHeader)
+ continue;
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (!ForeBlocks.count(TI->getSuccessor(i)))
+ return false;
+ }
+
+ return true;
+}
+
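+// Illustrative sketch of the partitioning above (hypothetical block names):
+// for an outer/inner pair laid out as
+//
+//   outer.header -> fore.bb -> inner.preheader
+//     -> inner.header <-> inner.latch            ; the subloop
+//     -> inner.exit -> aft.bb -> outer.latch
+//
+// the subloop's own blocks form SubLoopBlocks, every outer-loop block
+// dominated by inner.latch (inner.exit, aft.bb, outer.latch) lands in
+// AftBlocks, and the remaining outer-loop blocks (outer.header, fore.bb,
+// inner.preheader) land in ForeBlocks.
+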
+// Looks at the phi nodes in Header for values coming from Latch. For these
+// instructions and all their operands, calls Visit on them, continuing
+// through any operands defined in AftBlocks. Returns false if Visit returns
+// false, otherwise returns true. This is used to process the instructions in
+// the Aft blocks that need to be moved before the subloop. It is used in two
+// places: once to check that the required set of instructions can be moved
+// before the loop, and then to collect the instructions to actually move in
+// moveHeaderPhiOperandsToForeBlocks.
+template <typename T>
+static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
+ BasicBlockSet &AftBlocks, T Visit) {
+ SmallVector<Instruction *, 8> Worklist;
+ for (auto &Phi : Header->phis()) {
+ Value *V = Phi.getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+ }
+
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+ if (!Visit(I))
+ return false;
+
+ if (AftBlocks.count(I->getParent()))
+ for (auto &U : I->operands())
+ if (Instruction *II = dyn_cast<Instruction>(U))
+ Worklist.push_back(II);
+ }
+
+ return true;
+}
+
+// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
+static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
+ BasicBlock *Latch,
+ Instruction *InsertLoc,
+ BasicBlockSet &AftBlocks) {
+ // We need to ensure we move the instructions in the correct order,
+ // starting with the earliest required instruction and moving forward.
+ std::vector<Instruction *> Visited;
+ processHeaderPhiOperands(Header, Latch, AftBlocks,
+ [&Visited, &AftBlocks](Instruction *I) {
+ if (AftBlocks.count(I->getParent()))
+ Visited.push_back(I);
+ return true;
+ });
+
+ // Move all instructions in program order to before the InsertLoc
+ BasicBlock *InsertLocBB = InsertLoc->getParent();
+ for (Instruction *I : reverse(Visited)) {
+ if (I->getParent() != InsertLocBB)
+ I->moveBefore(InsertLoc);
+ }
+}
+
+/*
+ This method performs Unroll and Jam. For a simple loop like:
+ for (i = ..)
+ Fore(i)
+ for (j = ..)
+ SubLoop(i, j)
+ Aft(i)
+
+ Instead of doing normal inner or outer unrolling, we do:
+ for (i = .., i+=2)
+ Fore(i)
+ Fore(i+1)
+ for (j = ..)
+ SubLoop(i, j)
+ SubLoop(i+1, j)
+ Aft(i)
+ Aft(i+1)
+
+ So the outer loop is essentially unrolled and then the inner loops are fused
+ ("jammed") together into a single loop. This can increase speed when there
+ are loads in SubLoop that are invariant to i, as they become shared between
+ the now jammed inner loops.
+
+ We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
+ Fore blocks are those before the inner loop, Aft are those after. Normal
+ Unroll code is used to copy each of these sets of blocks and the results are
+ combined together into the final form above.
+
+ isSafeToUnrollAndJam should be used prior to calling this to make sure the
+ unrolling will be valid. Checking profitablility is also advisable.
+*/
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, bool UnrollRemainder,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
+
+ // When we enter here we should have already checked that it is safe
+ BasicBlock *Header = L->getHeader();
+ assert(L->getSubLoops().size() == 1);
+ Loop *SubLoop = *L->begin();
+
+ // Don't enter the unroll code if there is nothing to do.
+ if (TripCount == 0 && Count < 2) {
+ LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ assert(Count > 0);
+ assert(TripMultiple > 0);
+ assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = (Count == TripCount);
+
+ // We use the runtime remainder in cases where we don't know the trip multiple.
+ if (TripMultiple == 1 || TripMultiple % Count != 0) {
+ if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
+ /*UseEpilogRemainder*/ true,
+ UnrollRemainder, LI, SE, DT, AC, true)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ }
+
+ // Notify ScalarEvolution that the loop will be substantially changed,
+ // if not outright eliminated.
+ if (SE) {
+ SE->forgetLoop(L);
+ SE->forgetLoop(SubLoop);
+ }
+
+ using namespace ore;
+ // Report the unrolling decision.
+ if (CompletelyUnroll) {
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
+ << Header->getName() << " with trip count " << TripCount
+ << "!\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unroll and jammed loop with "
+ << NV("UnrollCount", TripCount) << " iterations");
+ } else {
+ auto DiagBuilder = [&]() {
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ return Diag << "unroll and jammed loop by a factor of "
+ << NV("UnrollCount", Count);
+ };
+
+ LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
+ << " by " << Count);
+ if (TripMultiple != 1) {
+ LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ ORE->emit([&]() {
+ return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
+ << " trips per branch";
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
+ ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
+ }
+ LLVM_DEBUG(dbgs() << "!\n");
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+ assert(Preheader && LatchBlock && Header);
+ assert(BI && !BI->isUnconditional());
+ bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+ BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+ bool SubLoopContinueOnTrue = SubLoop->contains(
+ SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
+
+ // Partition blocks in an outer/inner loop pair into blocks before and after
+ // the loop
+ BasicBlockSet SubLoopBlocks;
+ BasicBlockSet ForeBlocks;
+ BasicBlockSet AftBlocks;
+ partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
+ DT);
+
+ // We keep track of the entering/first and exiting/last block of each of
+ // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
+ // blocks easier.
+ std::vector<BasicBlock *> ForeBlocksFirst;
+ std::vector<BasicBlock *> ForeBlocksLast;
+ std::vector<BasicBlock *> SubLoopBlocksFirst;
+ std::vector<BasicBlock *> SubLoopBlocksLast;
+ std::vector<BasicBlock *> AftBlocksFirst;
+ std::vector<BasicBlock *> AftBlocksLast;
+ ForeBlocksFirst.push_back(Header);
+ ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
+ SubLoopBlocksFirst.push_back(SubLoop->getHeader());
+ SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
+ AftBlocksFirst.push_back(SubLoop->getExitBlock());
+ AftBlocksLast.push_back(L->getExitingBlock());
+ // Maps Blocks[0] -> Blocks[It]
+ ValueToValueMapTy LastValueMap;
+
+ // Move any instructions that the Fore-block header phis depend on out of
+ // AftBlocks and into Fore.
+ moveHeaderPhiOperandsToForeBlocks(
+ Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
+ AftBlocks);
+
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (!isa<DbgInfoIntrinsic>(&I))
+ if (const DILocation *DIL = I.getDebugLoc())
+ I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
+
+ // Copy all blocks
+ for (unsigned It = 1; It != Count; ++It) {
+ std::vector<BasicBlock *> NewBlocks;
+ // Maps Blocks[It] -> Blocks[It-1]
+ DenseMap<Value *, Value *> PrevItValueMap;
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ ValueToValueMapTy VMap;
+ BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ if (ForeBlocks.count(*BB)) {
+ L->addBasicBlockToLoop(New, *LI);
+
+ if (*BB == ForeBlocksFirst[0])
+ ForeBlocksFirst.push_back(New);
+ if (*BB == ForeBlocksLast[0])
+ ForeBlocksLast.push_back(New);
+ } else if (SubLoopBlocks.count(*BB)) {
+ SubLoop->addBasicBlockToLoop(New, *LI);
+
+ if (*BB == SubLoopBlocksFirst[0])
+ SubLoopBlocksFirst.push_back(New);
+ if (*BB == SubLoopBlocksLast[0])
+ SubLoopBlocksLast.push_back(New);
+ } else if (AftBlocks.count(*BB)) {
+ L->addBasicBlockToLoop(New, *LI);
+
+ if (*BB == AftBlocksFirst[0])
+ AftBlocksFirst.push_back(New);
+ if (*BB == AftBlocksLast[0])
+ AftBlocksLast.push_back(New);
+ } else {
+ llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
+ }
+
+ // Update our running maps of newest clones
+ PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
+ LastValueMap[*BB] = New;
+ for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+ VI != VE; ++VI) {
+ PrevItValueMap[VI->second] =
+ const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
+ LastValueMap[VI->first] = VI->second;
+ }
+
+ NewBlocks.push_back(New);
+
+ // Update DomTree:
+ if (*BB == ForeBlocksFirst[0])
+ DT->addNewBlock(New, ForeBlocksLast[It - 1]);
+ else if (*BB == SubLoopBlocksFirst[0])
+ DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
+ else if (*BB == AftBlocksFirst[0])
+ DT->addNewBlock(New, AftBlocksLast[It - 1]);
+ else {
+ // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
+ // structure.
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ assert(OriginalBBIDom);
+ assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
+
+ // Remap all instructions in the most recent iteration
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
+ ::remapInstruction(&I, LastValueMap);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Alter the ForeBlocks phis, pointing them at the latest version of the
+ // value from the previous iteration's phis.
+ for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
+ Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
+ assert(OldValue && "should have incoming edge from Aft[It]");
+ Value *NewValue = OldValue;
+ if (Value *PrevValue = PrevItValueMap[OldValue])
+ NewValue = PrevValue;
+
+ assert(Phi.getNumOperands() == 2);
+ Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
+ Phi.setIncomingValue(0, NewValue);
+ Phi.removeIncomingValue(1);
+ }
+ }
+
+ // Now that all the basic blocks for the unrolled iterations are in place,
+ // finish up connecting the blocks and phi nodes. At this point LastValueMap
+ // holds the values of the last unrolled iteration.
+
+ // Update Phis in BB from OldBB to point to NewBB
+ auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
+ BasicBlock *NewBB) {
+ for (PHINode &Phi : BB->phis()) {
+ int I = Phi.getBasicBlockIndex(OldBB);
+ Phi.setIncomingBlock(I, NewBB);
+ }
+ };
+ // Update Phis in BB from OldBB to point to NewBB and use the latest value
+ // from LastValueMap
+ auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ ValueToValueMapTy &LastValueMap) {
+ for (PHINode &Phi : BB->phis()) {
+ for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
+ if (Phi.getIncomingBlock(b) == OldBB) {
+ Value *OldValue = Phi.getIncomingValue(b);
+ if (Value *LastValue = LastValueMap[OldValue])
+ Phi.setIncomingValue(b, LastValue);
+ Phi.setIncomingBlock(b, NewBB);
+ break;
+ }
+ }
+ }
+ };
+ // Move all the phis from Src into Dest
+ auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
+ Instruction *insertPoint = Dest->getFirstNonPHI();
+ while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
+ Phi->moveBefore(insertPoint);
+ };
+
+ // Update the PHI values outside the loop to point to the last block
+ updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
+ LastValueMap);
+
+ // Update ForeBlocks successors and phi nodes
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
+ BasicBlock *Dest = SubLoopBlocksFirst[0];
+ ForeTerm->setSuccessor(0, Dest);
+
+ if (CompletelyUnroll) {
+ while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
+ Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
+ Phi->getParent()->getInstList().erase(Phi);
+ }
+ } else {
+ // Update the PHI values to point to the last aft block
+ updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
+ AftBlocksLast.back(), LastValueMap);
+ }
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Remap ForeBlock successors from previous iteration to this
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
+ BasicBlock *Dest = ForeBlocksFirst[It];
+ ForeTerm->setSuccessor(0, Dest);
+ }
+
+ // Subloop successors and phis
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
+ SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
+ SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
+ updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
+ ForeBlocksLast.back());
+ updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
+ SubLoopBlocksLast.back());
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Replace the conditional branch of the previous iteration subloop with an
+ // unconditional one to this one
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
+ SubTerm->eraseFromParent();
+
+ updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
+ ForeBlocksLast.back());
+ updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
+ SubLoopBlocksLast.back());
+ movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
+ }
+
+ // Aft blocks successors and phis
+ BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+ if (CompletelyUnroll) {
+ BranchInst::Create(LoopExit, Term);
+ Term->eraseFromParent();
+ } else {
+ Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ }
+ updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
+ SubLoopBlocksLast.back());
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Replace the conditional branch of the previous iteration subloop with an
+ // unconditional one to this one
+ BranchInst *AftTerm =
+ cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(AftBlocksFirst[It], AftTerm);
+ AftTerm->eraseFromParent();
+
+ updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
+ SubLoopBlocksLast.back());
+ movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
+ }
+
+ // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
+ // new ones required.
+ if (Count != 1) {
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
+ SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
+ SubLoopBlocksLast[0], AftBlocksFirst[0]);
+
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ SubLoopBlocksLast.back(), AftBlocksFirst[0]);
+ DT->applyUpdates(DTUpdates);
+ }
+
+ // Merge adjacent basic blocks, if possible.
+ SmallPtrSet<BasicBlock *, 16> MergeBlocks;
+ MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
+ MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
+ MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
+ while (!MergeBlocks.empty()) {
+ BasicBlock *BB = *MergeBlocks.begin();
+ BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
+ BasicBlock *Dest = Term->getSuccessor(0);
+ if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
+ // Don't remove BB and add Fold as they are the same BB
+ assert(Fold == BB);
+ (void)Fold;
+ MergeBlocks.erase(Dest);
+ } else
+ MergeBlocks.erase(BB);
+ } else
+ MergeBlocks.erase(BB);
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
+
+ NumCompletelyUnrolledAndJammed += CompletelyUnroll;
+ ++NumUnrolledAndJammed;
+
+#ifndef NDEBUG
+ // We shouldn't have done anything to break loop simplify form or LCSSA.
+ Loop *OuterL = L->getParentLoop();
+ Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
+ assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
+ if (!CompletelyUnroll)
+ assert(L->isLoopSimplifyForm());
+ assert(SubLoop->isLoopSimplifyForm());
+ assert(DT->verify());
+#endif
+
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
+ return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
+ : LoopUnrollResult::PartiallyUnrolled;
+}
+
+static bool getLoadsAndStores(BasicBlockSet &Blocks,
+ SmallVector<Value *, 4> &MemInstr) {
+ // Scan the BBs and collect legal loads and stores.
+ // Returns false if non-simple loads/stores are found.
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (I.mayReadOrWriteMemory()) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
+ SmallVector<Value *, 4> &Later,
+ unsigned LoopDepth, bool InnerLoop,
+ DependenceInfo &DI) {
+ // Use DA to check for dependencies between loads and stores that make unroll
+ // and jam invalid
+ for (Value *I : Earlier) {
+ for (Value *J : Later) {
+ Instruction *Src = cast<Instruction>(I);
+ Instruction *Dst = cast<Instruction>(J);
+ if (Src == Dst)
+ continue;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ continue;
+
+ // Track dependencies, and if we find them take a conservative approach
+ // by allowing only = or < (not >), although some > would be safe
+ // (depending upon unroll width).
+ // For the inner loop, we need to disallow any (> <) dependencies
+ // FIXME: Allow > so long as distance is less than unroll width
+ if (auto D = DI.depends(Src, Dst, true)) {
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+ if (D->isConfused())
+ return false;
+ if (!InnerLoop) {
+ if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
+ return false;
+ } else {
+ assert(LoopDepth + 1 <= D->getLevels());
+ if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
+ D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
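As a rough illustration of what the direction check above guards against (a hypothetical C kernel, not part of this patch), consider an outer loop whose Aft block feeds the next iteration's Fore block:

// Unroll and jam reorders F1 A1 F2 A2 ... into F1 F2 ... A1 A2 ..., so a
// value stored by Aft in iteration i and loaded by Fore in iteration i+1
// would be read before it is written after the transform.
void bad_candidate(int n, int m, int a[], int b[][8]) {
  for (int i = 1; i < n; i++) {
    int t = a[i - 1];             // Fore: loads what the previous Aft stored
    int sum = 0;
    for (int j = 0; j < m; j++)   // SubLoop
      sum += b[i][j] * t;
    a[i] = sum;                   // Aft: store consumed by the next Fore
  }
}
// DependenceInfo should report a '>' component at the outer loop depth for
// this load/store pair, which is what the GT test above conservatively rejects.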
+
+static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks, DependenceInfo &DI) {
+ // Get all loads and stores for each of the block sets
+ SmallVector<Value *, 4> ForeMemInstr;
+ SmallVector<Value *, 4> SubLoopMemInstr;
+ SmallVector<Value *, 4> AftMemInstr;
+ if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
+ !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
+ !getLoadsAndStores(AftBlocks, AftMemInstr))
+ return false;
+
+ // Check for dependencies between any blocks that may change order
+ unsigned LoopDepth = L->getLoopDepth();
+ return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
+ DI) &&
+ checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
+ checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
+ DI) &&
+ checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
+ DI);
+}
+
+bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+ DependenceInfo &DI) {
+ /* We currently handle outer loops like this:
+        |
+    ForeFirst    <----\       }
+     Blocks           |       } ForeBlocks
+    ForeLast          |       }
+        |             |
+    SubLoopFirst  <\  |       }
+     Blocks        |  |       } SubLoopBlocks
+    SubLoopLast   -/  |       }
+        |             |
+    AftFirst          |       }
+     Blocks           |       } AftBlocks
+    AftLast     ------/       }
+        |
+
+ There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
+ and AftBlocks, provided that there is one edge from Fores to SubLoops,
+ one edge from SubLoops to Afts and a single outer loop exit (from Afts).
+ In practice we currently limit Aft blocks to a single block, and limit
+ things further in the profitability checks of the unroll and jam pass.
+
+ Because of the way we rearrange basic blocks, we also require that
+ the Fore blocks on all unrolled iterations are safe to move before the
+ SubLoop blocks of all iterations. So we require that the phi node looping
+ operands of ForeHeader can be moved to at least the end of ForeEnd, so that
+ we can arrange cloned Fore Blocks before the subloop and match up Phi's
+ correctly.
+
+ i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
+ It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+
+ There are then a number of checks along the lines of no calls, no
+ exceptions, inner loop IV is consistent, etc. Note that for loops requiring
+ runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
+ UnrollAndJamLoop if the trip count cannot be easily calculated.
+ */
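For reference, a source-level sketch of that reordering with a count of 2 (hypothetical C, remainder iterations ignored; F, S and A stand for the Fore, SubLoop and Aft bodies):

extern void F(int i);
extern void S(int i, int j);
extern void A(int i);

// Before:
void original(int n, int m) {
  for (int i = 0; i < n; i++) {
    F(i);                          // ForeBlocks
    for (int j = 0; j < m; j++)
      S(i, j);                     // SubLoopBlocks
    A(i);                          // AftBlocks
  }
}

// After unroll and jam by 2 (assuming n is even):
void unrolled_and_jammed(int n, int m) {
  for (int i = 0; i < n; i += 2) {
    F(i); F(i + 1);                // all Fore copies first
    for (int j = 0; j < m; j++) {
      S(i, j); S(i + 1, j);        // inner bodies jammed together
    }
    A(i); A(i + 1);                // all Aft copies last
  }
}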
+
+ if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
+ return false;
+ Loop *SubLoop = L->getSubLoops()[0];
+ if (!SubLoop->isLoopSimplifyForm())
+ return false;
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Exit = L->getExitingBlock();
+ BasicBlock *SubLoopHeader = SubLoop->getHeader();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
+
+ if (Latch != Exit)
+ return false;
+ if (SubLoopLatch != SubLoopExit)
+ return false;
+
+ if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
+ return false;
+
+ // Split blocks into Fore/SubLoop/Aft based on dominators
+ BasicBlockSet SubLoopBlocks;
+ BasicBlockSet ForeBlocks;
+ BasicBlockSet AftBlocks;
+ if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
+ AftBlocks, &DT))
+ return false;
+
+ // Aft blocks may need to move instructions to fore blocks, which becomes more
+ // difficult if there are multiple (potentially conditionally executed)
+ // blocks. For now we just exclude loops with multiple aft blocks.
+ if (AftBlocks.size() != 1)
+ return false;
+
+ // Check inner loop IV is consistent between all iterations
+ const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
+ if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
+ !SubLoopBECountSC->getType()->isIntegerTy())
+ return false;
+ ScalarEvolution::LoopDisposition LD =
+ SE.getLoopDisposition(SubLoopBECountSC, L);
+ if (LD != ScalarEvolution::LoopInvariant)
+ return false;
+
+ // Check the loop safety info for exceptions.
+ LoopSafetyInfo LSI;
+ computeLoopSafetyInfo(&LSI, L);
+ if (LSI.MayThrow)
+ return false;
+
+ // We've ruled out the easy stuff and now need to check that there are no
+ // interdependencies which may prevent us from moving the:
+ // ForeBlocks before Subloop and AftBlocks.
+ // Subloop before AftBlocks.
+ // ForeBlock phi operands before the subloop
+
+ // Make sure we can move all instructions we need to before the subloop
+ if (!processHeaderPhiOperands(
+ Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
+ if (SubLoop->contains(I->getParent()))
+ return false;
+ if (AftBlocks.count(I->getParent())) {
+ // If we hit a phi node in afts we know we are done (probably
+ // LCSSA)
+ if (isa<PHINode>(I))
+ return false;
+ // Can't move instructions with side effects or memory
+ // reads/writes
+ if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
+ return false;
+ }
+ // Keep going
+ return true;
+ }))
+ return false;
+
+ // Check for memory dependencies which prohibit the unrolling we are doing.
+ // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
+ // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
+ if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
+ return false;
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
index c84ae7d693d7..13794c53f24b 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
@@ -30,6 +31,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -46,6 +48,7 @@
#include <limits>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "loop-unroll"
@@ -66,7 +69,7 @@ static const unsigned InfiniteIterationsToInvariance =
std::numeric_limits<unsigned>::max();
// Check whether we are capable of peeling this loop.
-static bool canPeel(Loop *L) {
+bool llvm::canPeel(Loop *L) {
// Make sure the loop is in simplified form
if (!L->isLoopSimplifyForm())
return false;
@@ -136,11 +139,109 @@ static unsigned calculateIterationsToInvariance(
return ToInvariance;
}
+// Return the number of iterations to peel off that make conditions in the
+// body true/false. For example, if we peel 2 iterations off the loop below,
+// the condition i < 2 can be evaluated at compile time.
+//   for (i = 0; i < n; i++) {
+//     if (i < 2)
+//       ..
+//     else
+//       ..
+//   }
+static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
+ ScalarEvolution &SE) {
+ assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
+ unsigned DesiredPeelCount = 0;
+
+ for (auto *BB : L.blocks()) {
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ continue;
+
+ // Ignore loop exit condition.
+ if (L.getLoopLatch() == BB)
+ continue;
+
+ Value *Condition = BI->getCondition();
+ Value *LeftVal, *RightVal;
+ CmpInst::Predicate Pred;
+ if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))
+ continue;
+
+ const SCEV *LeftSCEV = SE.getSCEV(LeftVal);
+ const SCEV *RightSCEV = SE.getSCEV(RightVal);
+
+ // Do not consider predicates that are known to be true or false
+ // independently of the loop iteration.
+ if (SE.isKnownPredicate(Pred, LeftSCEV, RightSCEV) ||
+ SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), LeftSCEV,
+ RightSCEV))
+ continue;
+
+ // Check if we have a condition with one AddRec and one non AddRec
+ // expression. Normalize LeftSCEV to be the AddRec.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else
+ continue;
+ }
+
+ const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);
+
+ // Avoid huge SCEV computations in the loop below, make sure we only
+ // consider AddRecs of the loop we are trying to peel and avoid
+ // non-monotonic predicates, as we will not be able to simplify the loop
+ // body.
+ // FIXME: For the non-monotonic predicates ICMP_EQ and ICMP_NE we can
+ // simplify the loop, if we peel 1 additional iteration, if there
+ // is no wrapping.
+ bool Increasing;
+ if (!LeftAR->isAffine() || LeftAR->getLoop() != &L ||
+ !SE.isMonotonicPredicate(LeftAR, Pred, Increasing))
+ continue;
+ (void)Increasing;
+
+ // Check if extending the current DesiredPeelCount lets us evaluate Pred
+ // or !Pred in the loop body statically.
+ unsigned NewPeelCount = DesiredPeelCount;
+
+ const SCEV *IterVal = LeftAR->evaluateAtIteration(
+ SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);
+
+ // If the original condition is not known, get the negated predicate
+ // (which holds on the else branch) and check if it is known. This allows
+ // us to peel off iterations that make the original condition false.
+ if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))
+ Pred = ICmpInst::getInversePredicate(Pred);
+
+ const SCEV *Step = LeftAR->getStepRecurrence(SE);
+ while (NewPeelCount < MaxPeelCount &&
+ SE.isKnownPredicate(Pred, IterVal, RightSCEV)) {
+ IterVal = SE.getAddExpr(IterVal, Step);
+ NewPeelCount++;
+ }
+
+ // Only peel the loop if the monotonic predicate !Pred becomes known in the
+ // first iteration of the loop body after peeling.
+ if (NewPeelCount > DesiredPeelCount &&
+ SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
+ RightSCEV))
+ DesiredPeelCount = NewPeelCount;
+ }
+
+ return DesiredPeelCount;
+}
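A short worked trace of the function above on the example from its header comment (hedged: it assumes MaxPeelCount is at least 2 and that SCEV can prove the comparisons):

// Condition: i < 2, with i the canonical IV starting at 0.
//   LeftSCEV  = {0,+,1}<loop>   RightSCEV = 2   Pred = slt
//   NewPeelCount = 0: IterVal = 0, 0 < 2 known true  -> peel, count = 1
//   NewPeelCount = 1: IterVal = 1, 1 < 2 known true  -> peel, count = 2
//   NewPeelCount = 2: IterVal = 2, 2 < 2 known false -> stop
// The inverse predicate (i >= 2) is known for the first remaining
// iteration, so DesiredPeelCount becomes 2 and the branch can be folded
// away in the loop body that is left behind.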
+
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP,
- unsigned &TripCount) {
+ unsigned &TripCount, ScalarEvolution &SE) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
+ // Save the UP.PeelCount value set by the target in
+ // TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
+ unsigned TargetPeelCount = UP.PeelCount;
UP.PeelCount = 0;
if (!canPeel(L))
return;
@@ -149,6 +250,19 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (!L->empty())
return;
+ // If the user provided a peel count, use that.
+ bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
+ if (UserPeelCount) {
+ LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
+ << " iterations.\n");
+ UP.PeelCount = UnrollForcePeelCount;
+ return;
+ }
+
+ // Skip peeling if it's disabled.
+ if (!UP.AllowPeeling)
+ return;
+
// Here we try to get rid of Phis which become invariants after 1, 2, ..., N
// iterations of the loop. For this we compute the number for iterations after
// which every Phi is guaranteed to become an invariant, and try to peel the
@@ -160,7 +274,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;
// Now go through all Phis to calculate their the number of iterations they
// need to become invariants.
- unsigned DesiredPeelCount = 0;
+ // Start the max computation with the UP.PeelCount value set by the target
+ // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
+ unsigned DesiredPeelCount = TargetPeelCount;
BasicBlock *BackEdge = L->getLoopLatch();
assert(BackEdge && "Loop is not in simplified form?");
for (auto BI = L->getHeader()->begin(); isa<PHINode>(&*BI); ++BI) {
@@ -170,15 +286,21 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (ToInvariance != InfiniteIterationsToInvariance)
DesiredPeelCount = std::max(DesiredPeelCount, ToInvariance);
}
+
+ // Pay respect to limitations implied by loop size and the max peel count.
+ unsigned MaxPeelCount = UnrollPeelMaxCount;
+ MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);
+
+ DesiredPeelCount = std::max(DesiredPeelCount,
+ countToEliminateCompares(*L, MaxPeelCount, SE));
+
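With hypothetical numbers, the size cap computed just above works out as follows:

// UP.Threshold = 150, LoopSize = 12  ->  150 / 12 - 1 = 11 (integer math),
// so MaxPeelCount = min(UnrollPeelMaxCount, 11). Peeling more than that
// would exceed the threshold once the peeled copies are emitted, since
// (PeelCount + 1) * LoopSize must stay within UP.Threshold.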
if (DesiredPeelCount > 0) {
- // Pay respect to limitations implied by loop size and the max peel count.
- unsigned MaxPeelCount = UnrollPeelMaxCount;
- MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);
DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
// Consider max peel count limitation.
assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
- DEBUG(dbgs() << "Peel " << DesiredPeelCount << " iteration(s) to turn"
- << " some Phis into invariants.\n");
+ LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
+ << " iteration(s) to turn"
+ << " some Phis into invariants.\n");
UP.PeelCount = DesiredPeelCount;
return;
}
@@ -189,44 +311,37 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (TripCount)
return;
- // If the user provided a peel count, use that.
- bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
- if (UserPeelCount) {
- DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
- << " iterations.\n");
- UP.PeelCount = UnrollForcePeelCount;
- return;
- }
-
// If we don't know the trip count, but have reason to believe the average
// trip count is low, peeling should be beneficial, since we will usually
// hit the peeled section.
// We only do this in the presence of profile information, since otherwise
// our estimates of the trip count are not reliable enough.
- if (UP.AllowPeeling && L->getHeader()->getParent()->hasProfileData()) {
+ if (L->getHeader()->getParent()->hasProfileData()) {
Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
if (!PeelCount)
return;
- DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
- << "\n");
+ LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
+ << "\n");
if (*PeelCount) {
if ((*PeelCount <= UnrollPeelMaxCount) &&
(LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
- DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n");
+ LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
+ << " iterations.\n");
UP.PeelCount = *PeelCount;
return;
}
- DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
- DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
- DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n");
- DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
+ LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
+ LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
+ LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
}
}
}
-/// \brief Update the branch weights of the latch of a peeled-off loop
+/// Update the branch weights of the latch of a peeled-off loop
/// iteration.
/// This sets the branch weights for the latch of the recently peeled off loop
/// iteration correctly.
@@ -267,12 +382,12 @@ static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
}
}
-/// \brief Clones the body of the loop L, putting it between \p InsertTop and \p
+/// Clones the body of the loop L, putting it between \p InsertTop and \p
/// InsertBot.
/// \param IterNumber The serial number of the iteration currently being
/// peeled off.
/// \param Exit The exit block of the original loop.
-/// \param[out] NewBlocks A list of the the blocks in the newly created clone
+/// \param[out] NewBlocks A list of the blocks in the newly created clone
/// \param[out] VMap The value map between the loop and the new clone.
/// \param LoopBlocks A helper for DFS-traversal of the loop.
/// \param LVMap A value-map that maps instructions from the original loop to
@@ -376,7 +491,7 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
LVMap[KV.first] = KV.second;
}
-/// \brief Peel off the first \p PeelCount iterations of loop \p L.
+/// Peel off the first \p PeelCount iterations of loop \p L.
///
/// Note that this does not peel them off as a single straight-line block.
/// Rather, each iteration is peeled off separately, and needs to check the
@@ -388,8 +503,8 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC, bool PreserveLCSSA) {
- if (!canPeel(L))
- return false;
+ assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
+ assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
LoopBlocksDFS LoopBlocks(L);
LoopBlocks.perform(LI);
@@ -500,10 +615,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
// the original loop body.
if (Iter == 0)
DT->changeImmediateDominator(Exit, cast<BasicBlock>(LVMap[Latch]));
-#ifndef NDEBUG
- if (VerifyDomInfo)
- DT->verifyDomTree();
-#endif
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter,
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index f79f423ce019..0057b4ba7ce1 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -21,8 +21,8 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -33,7 +33,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -418,8 +418,9 @@ canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits,
// UnrollRuntimeMultiExit is true. This will need updating the logic in
// connectEpilog/connectProlog.
if (!LatchExit->getSinglePredecessor()) {
- DEBUG(dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
- "predecessor.\n");
+ LLVM_DEBUG(
+ dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
+ "predecessor.\n");
return false;
}
// FIXME: We bail out of multi-exit unrolling when epilog loop is generated
@@ -528,14 +529,14 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC,
bool PreserveLCSSA) {
- DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
- DEBUG(L->dump());
- DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" :
- dbgs() << "Using prolog remainder.\n");
+ LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
+ LLVM_DEBUG(L->dump());
+ LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
+ : dbgs() << "Using prolog remainder.\n");
// Make sure the loop is in canonical form.
if (!L->isLoopSimplifyForm()) {
- DEBUG(dbgs() << "Not in simplify form!\n");
+ LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
return false;
}
@@ -561,7 +562,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
if (!isMultiExitUnrollingEnabled &&
(!L->getExitingBlock() || OtherExits.size())) {
- DEBUG(
+ LLVM_DEBUG(
dbgs()
<< "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
"enabled!\n");
@@ -581,7 +582,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
const SCEV *BECountSC = SE->getExitCount(L, Latch);
if (isa<SCEVCouldNotCompute>(BECountSC) ||
!BECountSC->getType()->isIntegerTy()) {
- DEBUG(dbgs() << "Could not compute exit block SCEV\n");
+ LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
return false;
}
@@ -591,7 +592,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
const SCEV *TripCountSC =
SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
if (isa<SCEVCouldNotCompute>(TripCountSC)) {
- DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
+ LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
return false;
}
@@ -601,15 +602,16 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
SCEVExpander Expander(*SE, DL, "loop-unroll");
if (!AllowExpensiveTripCount &&
Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
- DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
+ LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
return false;
}
// This constraint lets us deal with an overflowing trip count easily; see the
// comment on ModVal below.
if (Log2_32(Count) > BEWidth) {
- DEBUG(dbgs()
- << "Count failed constraint on overflow trip count calculation.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Count failed constraint on overflow trip count calculation.\n");
return false;
}
@@ -763,7 +765,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// values from the cloned region. Also update the dominator info for
// OtherExits and their immediate successors, since we have new edges into
// OtherExits.
- SmallSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;
+ SmallPtrSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;
for (auto *BB : OtherExits) {
for (auto &II : *BB) {
@@ -878,10 +880,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
NewPreHeader, VMap, DT, LI, PreserveLCSSA);
}
- // If this loop is nested, then the loop unroller changes the code in the
- // parent loop, so the Scalar Evolution pass needs to be run again.
- if (Loop *ParentLoop = L->getParentLoop())
- SE->forgetLoop(ParentLoop);
+ // If this loop is nested, then the loop unroller changes the code in any
+ // of its parent loops, so the Scalar Evolution pass needs to be run again.
+ SE->forgetTopmostLoop(L);
// Canonicalize to LoopSimplifyForm both original and remainder loops. We
// cannot rely on the LoopUnrollPass to do this because it only does
@@ -897,7 +898,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
}
if (remainderLoop && UnrollRemainder) {
- DEBUG(dbgs() << "Unrolling remainder loop\n");
+ LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
UnrollLoop(remainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1,
/*Force*/ false, /*AllowRuntime*/ false,
/*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 0a357f4b5004..46af120a428b 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -16,8 +16,10 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -553,47 +555,48 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, RedDes,
DB, AC, DT)) {
- DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes, DB,
AC, DT)) {
- DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi << "\n");
+ LLVM_DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi
+ << "\n");
return true;
}
// Not a reduction of known type.
@@ -921,13 +924,13 @@ bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop,
}
/// This function is called when we suspect that the update-chain of a phi node
-/// (whose symbolic SCEV expression sin \p PhiScev) contains redundant casts,
-/// that can be ignored. (This can happen when the PSCEV rewriter adds a runtime
-/// predicate P under which the SCEV expression for the phi can be the
-/// AddRecurrence \p AR; See createAddRecFromPHIWithCast). We want to find the
-/// cast instructions that are involved in the update-chain of this induction.
-/// A caller that adds the required runtime predicate can be free to drop these
-/// cast instructions, and compute the phi using \p AR (instead of some scev
+/// (whose symbolic SCEV expression sin \p PhiScev) contains redundant casts,
+/// that can be ignored. (This can happen when the PSCEV rewriter adds a runtime
+/// predicate P under which the SCEV expression for the phi can be the
+/// AddRecurrence \p AR; See createAddRecFromPHIWithCast). We want to find the
+/// cast instructions that are involved in the update-chain of this induction.
+/// A caller that adds the required runtime predicate can be free to drop these
+/// cast instructions, and compute the phi using \p AR (instead of some scev
/// expression with casts).
///
/// For example, without a predicate the scev expression can take the following
@@ -962,7 +965,7 @@ static bool getCastsForInductionPHI(PredicatedScalarEvolution &PSE,
assert(PSE.getSCEV(PN) == AR && "Unexpected phi node SCEV expression");
const Loop *L = AR->getLoop();
- // Find any cast instructions that participate in the def-use chain of
+ // Find any cast instructions that participate in the def-use chain of
// PhiScev in the loop.
// FORNOW/TODO: We currently expect the def-use chain to include only
// two-operand instructions, where one of the operands is an invariant.
@@ -1050,7 +1053,7 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
AR = PSE.getAsAddRec(Phi);
if (!AR) {
- DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ LLVM_DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
}
@@ -1084,14 +1087,15 @@ bool InductionDescriptor::isInductionPHI(
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
- DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ LLVM_DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
}
if (AR->getLoop() != TheLoop) {
// FIXME: We should treat this as a uniform. Unfortunately, we
// don't currently know how to handled uniform PHIs.
- DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");
return false;
}
@@ -1172,11 +1176,12 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA);
if (!NewExitBB)
- DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
- << *L << "\n");
+ LLVM_DEBUG(
+ dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
+ << *L << "\n");
else
- DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
- << NewExitBB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
+ << NewExitBB->getName() << "\n");
return true;
};
@@ -1199,7 +1204,7 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
return Changed;
}
-/// \brief Returns the instructions that use values defined in the loop.
+/// Returns the instructions that use values defined in the loop.
SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
SmallVector<Instruction *, 8> UsedOutside;
@@ -1276,7 +1281,7 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) {
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
}
-/// \brief Find string metadata for loop
+/// Find string metadata for loop
///
/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
/// operand or null otherwise. If the string metadata is not found return
@@ -1428,6 +1433,32 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
DT->deleteEdge(Preheader, L->getHeader());
}
+ // Given LCSSA form is satisfied, we should not have users of instructions
+ // within the dead loop outside of the loop. However, LCSSA doesn't take
+ // unreachable uses into account. We handle them here.
+ // We could do it after dropping all references (in that case all users in
+ // the loop would already be eliminated and we would have less work to do),
+ // but according to the API doc of User::dropAllReferences the only valid
+ // operation after dropping references is deletion. So let's substitute all
+ // usages of instructions from the loop with undef values of the
+ // corresponding type first.
+ for (auto *Block : L->blocks())
+ for (Instruction &I : *Block) {
+ auto *Undef = UndefValue::get(I.getType());
+ for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E;) {
+ Use &U = *UI;
+ ++UI;
+ if (auto *Usr = dyn_cast<Instruction>(U.getUser()))
+ if (L->contains(Usr->getParent()))
+ continue;
+ // If we have a DT then we can check that uses outside the loop occur only
+ // in unreachable blocks.
+ if (DT)
+ assert(!DT->isReachableFromEntry(U) &&
+ "Unexpected user in reachable block");
+ U.set(Undef);
+ }
+ }
+
// Remove the block from the reference counting scheme, so that we can
// delete it freely later.
for (auto *Block : L->blocks())
@@ -1455,54 +1486,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
}
}
-/// Returns true if the instruction in a loop is guaranteed to execute at least
-/// once.
-bool llvm::isGuaranteedToExecute(const Instruction &Inst,
- const DominatorTree *DT, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo) {
- // We have to check to make sure that the instruction dominates all
- // of the exit blocks. If it doesn't, then there is a path out of the loop
- // which does not execute this instruction, so we can't hoist it.
-
- // If the instruction is in the header block for the loop (which is very
- // common), it is always guaranteed to dominate the exit blocks. Since this
- // is a common case, and can save some work, check it now.
- if (Inst.getParent() == CurLoop->getHeader())
- // If there's a throw in the header block, we can't guarantee we'll reach
- // Inst.
- return !SafetyInfo->HeaderMayThrow;
-
- // Somewhere in this loop there is an instruction which may throw and make us
- // exit the loop.
- if (SafetyInfo->MayThrow)
- return false;
-
- // Get the exit blocks for the current loop.
- SmallVector<BasicBlock *, 8> ExitBlocks;
- CurLoop->getExitBlocks(ExitBlocks);
-
- // Verify that the block dominates each of the exit blocks of the loop.
- for (BasicBlock *ExitBlock : ExitBlocks)
- if (!DT->dominates(Inst.getParent(), ExitBlock))
- return false;
-
- // As a degenerate case, if the loop is statically infinite then we haven't
- // proven anything since there are no exit blocks.
- if (ExitBlocks.empty())
- return false;
-
- // FIXME: In general, we have to prove that the loop isn't an infinite loop.
- // See http::llvm.org/PR24078 . (The "ExitBlocks.empty()" check above is
- // just a special case of this.)
- return true;
-}
-
Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
// Only support loops with a unique exiting block, and a latch.
if (!L->getExitingBlock())
return None;
- // Get the branch weights for the the loop's backedge.
+ // Get the branch weights for the loop's backedge.
BranchInst *LatchBR =
dyn_cast<BranchInst>(L->getLoopLatch()->getTerminator());
if (!LatchBR || LatchBR->getNumSuccessors() != 2)
@@ -1530,7 +1519,7 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
-/// \brief Adds a 'fast' flag to floating point operations.
+/// Adds a 'fast' flag to floating point operations.
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
@@ -1540,6 +1529,38 @@ static Value *addFastMathFlag(Value *V) {
return V;
}
+// Helper to generate an ordered reduction.
+Value *
+llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+ unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+
+ // Extract and apply reduction ops in ascending order:
+ // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
+ Value *Result = Acc;
+ for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+ Value *Ext =
+ Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+ "bin.rdx");
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
+ Ext);
+ }
+
+ if (!RedOps.empty())
+ propagateIRFlags(Result, RedOps);
+ }
+
+ return Result;
+}
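For intuition, this is the scalar sequence the helper above builds for an FAdd reduction with VF = 4 (illustrative only; min/max recurrences go through createMinMaxOp instead):

// Result = (((Acc + Src[0]) + Src[1]) + Src[2]) + Src[3]
// One extractelement plus one fadd per lane, folded strictly left to right,
// so the rounding order of the original sequential loop is preserved.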
+
// Helper to generate a log2 shuffle reduction.
Value *
llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 29756d9dab7f..abbcd5f9e3b8 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -140,9 +140,12 @@ void LoopVersioning::addPHINodes(
if (!PN) {
PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
&PHIBlock->front());
- for (auto *User : Inst->users())
- if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))
- User->replaceUsesOfWith(Inst, PN);
+ SmallVector<User*, 8> UsersToUpdate;
+ for (User *U : Inst->users())
+ if (!VersionedLoop->contains(cast<Instruction>(U)->getParent()))
+ UsersToUpdate.push_back(U);
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(Inst, PN);
PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
}
}
@@ -248,7 +251,7 @@ void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
}
namespace {
-/// \brief Also expose this is a pass. Currently this is only used for
+/// Also expose this is a pass. Currently this is only used for
/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
/// array accesses from the loop.
class LoopVersioningPass : public FunctionPass {
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index ee84541e526d..c852d538b0d1 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -21,7 +21,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
#define DEBUG_TYPE "lowerinvoke"
@@ -48,10 +48,12 @@ static bool runImpl(Function &F) {
bool Changed = false;
for (BasicBlock &BB : F)
if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
- SmallVector<Value *, 16> CallArgs(II->op_begin(), II->op_end() - 3);
+ SmallVector<Value *, 16> CallArgs(II->arg_begin(), II->arg_end());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
// Insert a normal call instruction...
CallInst *NewCall =
- CallInst::Create(II->getCalledValue(), CallArgs, "", II);
+ CallInst::Create(II->getCalledValue(), CallArgs, OpBundles, "", II);
NewCall->takeName(II);
NewCall->setCallingConv(II->getCallingConv());
NewCall->setAttributes(II->getAttributes());
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 57dc225e9dab..03006ef3a2d3 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -409,8 +409,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ CI,
- /* SrcAlign */ Memcpy->getAlignment(),
- /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcAlign */ Memcpy->getSourceAlignment(),
+ /* DestAlign */ Memcpy->getDestAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* TargetTransformInfo */ TTI);
@@ -419,8 +419,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ Memcpy->getLength(),
- /* SrcAlign */ Memcpy->getAlignment(),
- /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcAlign */ Memcpy->getSourceAlignment(),
+ /* DestAlign */ Memcpy->getDestAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* TargetTransfomrInfo */ TTI);
@@ -432,8 +432,8 @@ void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
/* SrcAddr */ Memmove->getRawSource(),
/* DstAddr */ Memmove->getRawDest(),
/* CopyLen */ Memmove->getLength(),
- /* SrcAlign */ Memmove->getAlignment(),
- /* DestAlign */ Memmove->getAlignment(),
+ /* SrcAlign */ Memmove->getSourceAlignment(),
+ /* DestAlign */ Memmove->getDestAlignment(),
/* SrcIsVolatile */ Memmove->isVolatile(),
/* DstIsVolatile */ Memmove->isVolatile());
}
@@ -443,6 +443,6 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
/* DstAddr */ Memset->getRawDest(),
/* CopyLen */ Memset->getLength(),
/* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getAlignment(),
+ /* Alignment */ Memset->getDestAlignment(),
Memset->isVolatile());
}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index 344cb35df986..e99ecfef19cd 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -29,7 +29,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <cassert>
@@ -74,7 +74,7 @@ namespace {
LowerSwitch() : FunctionPass(ID) {
initializeLowerSwitchPass(*PassRegistry::getPassRegistry());
- }
+ }
bool runOnFunction(Function &F) override;
@@ -155,11 +155,8 @@ bool LowerSwitch::runOnFunction(Function &F) {
}
/// Used for debugging purposes.
-static raw_ostream& operator<<(raw_ostream &O,
- const LowerSwitch::CaseVector &C)
- LLVM_ATTRIBUTE_USED;
-
-static raw_ostream& operator<<(raw_ostream &O,
+LLVM_ATTRIBUTE_USED
+static raw_ostream &operator<<(raw_ostream &O,
const LowerSwitch::CaseVector &C) {
O << "[";
@@ -172,7 +169,7 @@ static raw_ostream& operator<<(raw_ostream &O,
return O << "]";
}
-/// \brief Update the first occurrence of the "switch statement" BB in the PHI
+/// Update the first occurrence of the "switch statement" BB in the PHI
/// node with the "new" BB. The other occurrences will:
///
/// 1) Be updated by subsequent calls to this function. Switch statements may
@@ -245,14 +242,13 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
unsigned Mid = Size / 2;
std::vector<CaseRange> LHS(Begin, Begin + Mid);
- DEBUG(dbgs() << "LHS: " << LHS << "\n");
+ LLVM_DEBUG(dbgs() << "LHS: " << LHS << "\n");
std::vector<CaseRange> RHS(Begin + Mid, End);
- DEBUG(dbgs() << "RHS: " << RHS << "\n");
+ LLVM_DEBUG(dbgs() << "RHS: " << RHS << "\n");
CaseRange &Pivot = *(Begin + Mid);
- DEBUG(dbgs() << "Pivot ==> "
- << Pivot.Low->getValue()
- << " -" << Pivot.High->getValue() << "\n");
+ LLVM_DEBUG(dbgs() << "Pivot ==> " << Pivot.Low->getValue() << " -"
+ << Pivot.High->getValue() << "\n");
// NewLowerBound here should never be the integer minimal value.
// This is because it is computed from a case range that is never
@@ -274,20 +270,14 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
NewUpperBound = LHS.back().High;
}
- DEBUG(dbgs() << "LHS Bounds ==> ";
- if (LowerBound) {
- dbgs() << LowerBound->getSExtValue();
- } else {
- dbgs() << "NONE";
- }
- dbgs() << " - " << NewUpperBound->getSExtValue() << "\n";
- dbgs() << "RHS Bounds ==> ";
- dbgs() << NewLowerBound->getSExtValue() << " - ";
- if (UpperBound) {
- dbgs() << UpperBound->getSExtValue() << "\n";
- } else {
- dbgs() << "NONE\n";
- });
+ LLVM_DEBUG(dbgs() << "LHS Bounds ==> "; if (LowerBound) {
+ dbgs() << LowerBound->getSExtValue();
+ } else { dbgs() << "NONE"; } dbgs() << " - "
+ << NewUpperBound->getSExtValue() << "\n";
+ dbgs() << "RHS Bounds ==> ";
+ dbgs() << NewLowerBound->getSExtValue() << " - "; if (UpperBound) {
+ dbgs() << UpperBound->getSExtValue() << "\n";
+ } else { dbgs() << "NONE\n"; });
// Create a new node that checks if the value is < pivot. Go to the
// left branch if it is and right branch if not.
@@ -337,7 +327,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
} else if (Leaf.Low->isZero()) {
// Val >= 0 && Val <= Hi --> Val <=u Hi
Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
- "SwitchLeaf");
+ "SwitchLeaf");
} else {
// Emit V-Lo <=u Hi-Lo
Constant* NegLo = ConstantExpr::getNeg(Leaf.Low);
@@ -364,7 +354,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
for (uint64_t j = 0; j < Range; ++j) {
PN->removeIncomingValue(OrigBlock);
}
-
+
int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
assert(BlockIdx != -1 && "Switch didn't go to this successor??");
PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf);
@@ -382,7 +372,7 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
Case.getCaseSuccessor()));
- std::sort(Cases.begin(), Cases.end(), CaseCmp());
+ llvm::sort(Cases.begin(), Cases.end(), CaseCmp());
// Merge case into clusters
if (Cases.size() >= 2) {
@@ -443,9 +433,9 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
// Prepare cases vector.
CaseVector Cases;
unsigned numCmps = Clusterify(Cases, SI);
- DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
- << ". Total compares: " << numCmps << "\n");
- DEBUG(dbgs() << "Cases: " << Cases << "\n");
+ LLVM_DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+ << ". Total compares: " << numCmps << "\n");
+ LLVM_DEBUG(dbgs() << "Cases: " << Cases << "\n");
(void)numCmps;
ConstantInt *LowerBound = nullptr;
@@ -505,6 +495,10 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
}
#endif
+ // As the default block in the switch is unreachable, update the PHI nodes
+ // (remove the entry to the default block) to reflect this.
+ Default->removePredecessor(OrigBlock);
+
// Use the most popular block as the new default, reducing the number of
// cases.
assert(MaxPop > 0 && PopSucc);
@@ -518,29 +512,33 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
if (Cases.empty()) {
BranchInst::Create(Default, CurBlock);
SI->eraseFromParent();
+ // As all the cases have been replaced with a single branch, only keep
+ // one entry in the PHI nodes.
+ for (unsigned I = 0 ; I < (MaxPop - 1) ; ++I)
+ PopSucc->removePredecessor(OrigBlock);
return;
}
}
+ unsigned NrOfDefaults = (SI->getDefaultDest() == Default) ? 1 : 0;
+ for (const auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == Default)
+ NrOfDefaults++;
+
// Create a new, empty default block so that the new hierarchy of
// if-then statements go to this and the PHI nodes are happy.
BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
BranchInst::Create(Default, NewDefault);
- // If there is an entry in any PHI nodes for the default edge, make sure
- // to update them as well.
- for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
- assert(BlockIdx != -1 && "Switch didn't go to this successor??");
- PN->setIncomingBlock((unsigned)BlockIdx, NewDefault);
- }
-
BasicBlock *SwitchBlock =
switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
+ // If there are entries in any PHI nodes for the default edge, make sure
+ // to update them as well.
+ fixPhis(Default, OrigBlock, NewDefault, NrOfDefaults);
+
// Branch to our shiny new if-then stuff...
BranchInst::Create(SwitchBlock, OrigBlock);
diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
index 29f289b62da0..23145e584751 100644
--- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
@@ -22,7 +22,7 @@
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>
diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
index 0f7bd76c03ca..323f2552ca80 100644
--- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
@@ -29,7 +29,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp
index dc780542ce68..6d0b96f6aa8a 100644
--- a/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp
@@ -14,19 +14,38 @@
#include "llvm/Transforms/Utils/OrderedInstructions.h"
using namespace llvm;
+bool OrderedInstructions::localDominates(const Instruction *InstA,
+ const Instruction *InstB) const {
+ assert(InstA->getParent() == InstB->getParent() &&
+ "Instructions must be in the same basic block");
+
+ const BasicBlock *IBB = InstA->getParent();
+ auto OBB = OBBMap.find(IBB);
+ if (OBB == OBBMap.end())
+ OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first;
+ return OBB->second->dominates(InstA, InstB);
+}
+
/// Given 2 instructions, use OrderedBasicBlock to check for dominance relation
/// if the instructions are in the same basic block, Otherwise, use dominator
/// tree.
bool OrderedInstructions::dominates(const Instruction *InstA,
const Instruction *InstB) const {
- const BasicBlock *IBB = InstA->getParent();
// Use ordered basic block to do dominance check in case the 2 instructions
// are in the same basic block.
- if (IBB == InstB->getParent()) {
- auto OBB = OBBMap.find(IBB);
- if (OBB == OBBMap.end())
- OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first;
- return OBB->second->dominates(InstA, InstB);
- }
+ if (InstA->getParent() == InstB->getParent())
+ return localDominates(InstA, InstB);
return DT->dominates(InstA->getParent(), InstB->getParent());
}
+
+bool OrderedInstructions::dfsBefore(const Instruction *InstA,
+ const Instruction *InstB) const {
+ // Use ordered basic block in case the 2 instructions are in the same basic
+ // block.
+ if (InstA->getParent() == InstB->getParent())
+ return localDominates(InstA, InstB);
+
+ DomTreeNode *DA = DT->getNode(InstA->getParent());
+ DomTreeNode *DB = DT->getNode(InstB->getParent());
+ return DA->getDFSNumIn() < DB->getDFSNumIn();
+}
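A minimal usage sketch for the new helper (a hypothetical caller, not part of this change):

#include "llvm/Transforms/Utils/OrderedInstructions.h"
using namespace llvm;

// Hypothetical helper: order two instructions for a stable sort.
static bool comesFirst(OrderedInstructions &OI, const Instruction *A,
                       const Instruction *B) {
  // dfsBefore compares DFS-in numbers of the parent blocks (or falls back to
  // the local order within a shared block); it is an ordering for sorting,
  // not a dominance guarantee, which is why valueComesBefore in PredicateInfo
  // below switches from dominates() to dfsBefore().
  return OI.dfsBefore(A, B);
}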
diff --git a/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index d47be6ea566b..2923977b791a 100644
--- a/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -24,6 +25,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@@ -32,7 +34,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/OrderedInstructions.h"
#include <algorithm>
#define DEBUG_TYPE "predicateinfo"
@@ -118,7 +120,7 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A,
return false;
if (ArgA && ArgB)
return ArgA->getArgNo() < ArgB->getArgNo();
- return OI.dominates(cast<Instruction>(A), cast<Instruction>(B));
+ return OI.dfsBefore(cast<Instruction>(A), cast<Instruction>(B));
}
// This compares ValueDFS structures, creating OrderedBasicBlocks where
@@ -479,6 +481,19 @@ void PredicateInfo::buildPredicateInfo() {
renameUses(OpsToRename);
}
+// Create a ssa_copy declaration with custom mangling, because
+// Intrinsic::getDeclaration does not handle overloaded unnamed types properly:
+// all unnamed types get mangled to the same string. We use the pointer
+// to the type as name here, as it guarantees unique names for different
+// types and we remove the declarations when destroying PredicateInfo.
+// It is a workaround for PR38117, because solving it in a fully general way is
+// tricky (FIXME).
+static Function *getCopyDeclaration(Module *M, Type *Ty) {
+ std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty);
+ return cast<Function>(M->getOrInsertFunction(
+ Name, getType(M->getContext(), Intrinsic::ssa_copy, Ty)));
+}
+
// Given the renaming stack, make all the operands currently on the stack real
// by inserting them into the IR. Return the last operation's value.
Value *PredicateInfo::materializeStack(unsigned int &Counter,
@@ -507,8 +522,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
// order in the case of multiple predicateinfo in the same block.
if (isa<PredicateWithEdge>(ValInfo)) {
IRBuilder<> B(getBranchTerminator(ValInfo));
- Function *IF = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->user_begin() == IF->user_end())
+ CreatedDeclarations.insert(IF);
CallInst *PIC =
B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
PredicateMap.insert({PIC, ValInfo});
@@ -518,8 +534,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
assert(PAssume &&
"Should not have gotten here without it being an assume");
IRBuilder<> B(PAssume->AssumeInst);
- Function *IF = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->user_begin() == IF->user_end())
+ CreatedDeclarations.insert(IF);
CallInst *PIC = B.CreateCall(IF, Op);
PredicateMap.insert({PIC, ValInfo});
Result.Def = PIC;
@@ -553,10 +570,11 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
auto Comparator = [&](const Value *A, const Value *B) {
return valueComesBefore(OI, A, B);
};
- std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);
+ llvm::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);
ValueDFS_Compare Compare(OI);
// Compute liveness, and rename in O(uses) per Op.
for (auto *Op : OpsToRename) {
+ LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");
unsigned Counter = 0;
SmallVector<ValueDFS, 16> OrderedUses;
const auto &ValueInfo = getValueInfo(Op);
@@ -625,15 +643,15 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
// we want to.
bool PossibleCopy = VD.PInfo != nullptr;
if (RenameStack.empty()) {
- DEBUG(dbgs() << "Rename Stack is empty\n");
+ LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");
} else {
- DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
- << RenameStack.back().DFSIn << ","
- << RenameStack.back().DFSOut << ")\n");
+ LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+ << RenameStack.back().DFSIn << ","
+ << RenameStack.back().DFSOut << ")\n");
}
- DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
- << VD.DFSOut << ")\n");
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+ << VD.DFSOut << ")\n");
bool ShouldPush = (VD.Def || PossibleCopy);
bool OutOfScope = !stackIsInScope(RenameStack, VD);
@@ -652,7 +670,7 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
if (VD.Def || PossibleCopy)
continue;
if (!DebugCounter::shouldExecute(RenameCounter)) {
- DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+ LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
continue;
}
ValueDFS &Result = RenameStack.back();
@@ -663,8 +681,9 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
if (!Result.Def)
Result.Def = materializeStack(Counter, RenameStack, Op);
- DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
- << *VD.U->get() << " in " << *(VD.U->getUser()) << "\n");
+ LLVM_DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+ << *VD.U->get() << " in " << *(VD.U->getUser())
+ << "\n");
assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
"Predicateinfo def should have dominated this use");
VD.U->set(Result.Def);
@@ -702,7 +721,22 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
buildPredicateInfo();
}
-PredicateInfo::~PredicateInfo() {}
+// Remove all declarations we created. The PredicateInfo consumers are
+// responsible for removing the ssa_copy calls created.
+PredicateInfo::~PredicateInfo() {
+ // Collect the function pointers in a set first, as SmallSet uses a SmallVector
+ // internally and we have to remove the asserting value handles first.
+ SmallPtrSet<Function *, 20> FunctionPtrs;
+ for (auto &F : CreatedDeclarations)
+ FunctionPtrs.insert(&*F);
+ CreatedDeclarations.clear();
+
+ for (Function *F : FunctionPtrs) {
+ assert(F->user_begin() == F->user_end() &&
+ "PredicateInfo consumer did not remove all SSA copies.");
+ F->eraseFromParent();
+ }
+}
void PredicateInfo::verifyPredicateInfo() const {}
@@ -720,6 +754,20 @@ void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AssumptionCacheTracker>();
}
+// Replace ssa_copy calls created by PredicateInfo with their operand.
+static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
+ for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
+ Instruction *Inst = &*I++;
+ const auto *PI = PredInfo.getPredicateInfoFor(Inst);
+ auto *II = dyn_cast<IntrinsicInst>(Inst);
+ if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
+ continue;
+
+ Inst->replaceAllUsesWith(II->getOperand(0));
+ Inst->eraseFromParent();
+ }
+}
+
bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -727,6 +775,8 @@ bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
PredInfo->print(dbgs());
if (VerifyPredicateInfo)
PredInfo->verifyPredicateInfo();
+
+ replaceCreatedSSACopys(*PredInfo, F);
return false;
}
@@ -735,12 +785,14 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
OS << "PredicateInfo for function: " << F.getName() << "\n";
- make_unique<PredicateInfo>(F, DT, AC)->print(OS);
+ auto PredInfo = make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(OS);
+ replaceCreatedSSACopys(*PredInfo, F);
return PreservedAnalyses::all();
}
-/// \brief An assembly annotator class to print PredicateInfo information in
+/// An assembly annotator class to print PredicateInfo information in
/// comments.
class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
friend class PredicateInfo;
diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index fcd3bd08482a..86e15bbd7f22 100644
--- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -26,6 +26,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -45,7 +46,6 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -164,26 +164,27 @@ struct AllocaInfo {
}
};
-// Data package used by RenamePass()
-class RenamePassData {
-public:
+/// Data package used by RenamePass().
+struct RenamePassData {
using ValVector = std::vector<Value *>;
+ using LocationVector = std::vector<DebugLoc>;
- RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V)
- : BB(B), Pred(P), Values(std::move(V)) {}
+ RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V, LocationVector L)
+ : BB(B), Pred(P), Values(std::move(V)), Locations(std::move(L)) {}
BasicBlock *BB;
BasicBlock *Pred;
ValVector Values;
+ LocationVector Locations;
};
-/// \brief This assigns and keeps a per-bb relative ordering of load/store
+/// This assigns and keeps a per-bb relative ordering of load/store
/// instructions in the block that directly load or store an alloca.
///
/// This functionality is important because it avoids scanning large basic
/// blocks multiple times when promoting many allocas in the same block.
class LargeBlockInfo {
- /// \brief For each instruction that we track, keep the index of the
+ /// For each instruction that we track, keep the index of the
/// instruction.
///
/// The index starts out as the number of the instruction from the start of
@@ -242,7 +243,7 @@ struct PromoteMem2Reg {
/// Reverse mapping of Allocas.
DenseMap<AllocaInst *, unsigned> AllocaLookup;
- /// \brief The PhiNodes we're adding.
+ /// The PhiNodes we're adding.
///
/// That map is used to simplify some Phi nodes as we iterate over it, so
/// it should have deterministic iterators. We could use a MapVector, but
@@ -294,7 +295,7 @@ private:
unsigned getNumPreds(const BasicBlock *BB) {
unsigned &NP = BBNumPreds[BB];
if (NP == 0)
- NP = std::distance(pred_begin(BB), pred_end(BB)) + 1;
+ NP = pred_size(BB) + 1;
return NP - 1;
}
@@ -303,6 +304,7 @@ private:
SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
void RenamePass(BasicBlock *BB, BasicBlock *Pred,
RenamePassData::ValVector &IncVals,
+ RenamePassData::LocationVector &IncLocs,
std::vector<RenamePassData> &Worklist);
bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
};
@@ -345,7 +347,7 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
}
}
-/// \brief Rewrite as many loads as possible given a single store.
+/// Rewrite as many loads as possible given a single store.
///
/// When there is only a single store, we can use the domtree to trivially
/// replace all of the dominated loads with the stored value. Do so, and return
@@ -475,7 +477,7 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
// Sort the stores by their index, making it efficient to do a lookup with a
// binary search.
- std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
+ llvm::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
// Walk all of the loads from this alloca, replacing them with the nearest
// store above them, if any.
@@ -509,6 +511,11 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
!isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
addAssumeNonNull(AC, LI);
+ // If the replacement value is the load, this must occur in unreachable
+ // code.
+ if (ReplVal == LI)
+ ReplVal = UndefValue::get(LI->getType());
+
LI->replaceAllUsesWith(ReplVal);
}
@@ -631,10 +638,10 @@ void PromoteMem2Reg::run() {
SmallVector<BasicBlock *, 32> PHIBlocks;
IDF.calculate(PHIBlocks);
if (PHIBlocks.size() > 1)
- std::sort(PHIBlocks.begin(), PHIBlocks.end(),
- [this](BasicBlock *A, BasicBlock *B) {
- return BBNumbers.lookup(A) < BBNumbers.lookup(B);
- });
+ llvm::sort(PHIBlocks.begin(), PHIBlocks.end(),
+ [this](BasicBlock *A, BasicBlock *B) {
+ return BBNumbers.lookup(A) < BBNumbers.lookup(B);
+ });
unsigned CurrentVersion = 0;
for (BasicBlock *BB : PHIBlocks)
@@ -653,15 +660,20 @@ void PromoteMem2Reg::run() {
for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+ // When handling debug info, treat all incoming values as if they have unknown
+ // locations until proven otherwise.
+ RenamePassData::LocationVector Locations(Allocas.size());
+
// Walks all basic blocks in the function performing the SSA rename algorithm
// and inserting the phi nodes we marked as necessary
std::vector<RenamePassData> RenamePassWorkList;
- RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values));
+ RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
+ std::move(Locations));
do {
RenamePassData RPD = std::move(RenamePassWorkList.back());
RenamePassWorkList.pop_back();
// RenamePass may add new worklist entries.
- RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
+ RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);
} while (!RenamePassWorkList.empty());
// The renamer uses the Visited set to avoid infinite loops. Clear it now.
@@ -740,7 +752,7 @@ void PromoteMem2Reg::run() {
// Ok, now we know that all of the PHI nodes are missing entries for some
// basic blocks. Start by sorting the incoming predecessors for efficient
// access.
- std::sort(Preds.begin(), Preds.end());
+ llvm::sort(Preds.begin(), Preds.end());
// Now we loop through all BB's which have entries in SomePHI and remove
// them from the Preds list.
@@ -772,7 +784,7 @@ void PromoteMem2Reg::run() {
NewPhiNodes.clear();
}
-/// \brief Determine which blocks the value is live in.
+/// Determine which blocks the value is live in.
///
/// These are blocks which lead to uses. Knowing this allows us to avoid
/// inserting PHI nodes into blocks which don't lead to uses (thus, the
@@ -846,7 +858,7 @@ void PromoteMem2Reg::ComputeLiveInBlocks(
}
}
-/// \brief Queue a phi-node to be added to a basic-block for a specific Alloca.
+/// Queue a phi-node to be added to a basic-block for a specific Alloca.
///
/// Returns true if there wasn't already a phi-node for that variable
bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
@@ -868,13 +880,24 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
return true;
}
-/// \brief Recursively traverse the CFG of the function, renaming loads and
+/// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
+/// create a merged location incorporating \p DL, or to set \p DL directly.
+static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
+ bool ApplyMergedLoc) {
+ if (ApplyMergedLoc)
+ PN->applyMergedLocation(PN->getDebugLoc(), DL);
+ else
+ PN->setDebugLoc(DL);
+}
+
+/// Recursively traverse the CFG of the function, renaming loads and
/// stores to the allocas which we are promoting.
///
/// IncomingVals indicates what value each Alloca contains on exit from the
/// predecessor block Pred.
void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
RenamePassData::ValVector &IncomingVals,
+ RenamePassData::LocationVector &IncomingLocs,
std::vector<RenamePassData> &Worklist) {
NextIteration:
// If we are inserting any phi nodes into this BB, they will already be in the
@@ -899,6 +922,10 @@ NextIteration:
do {
unsigned AllocaNo = PhiToAllocaMap[APN];
+ // Update the location of the phi node.
+ updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo],
+ APN->getNumIncomingValues() > 0);
+
// Add N incoming values to the PHI node.
for (unsigned i = 0; i != NumEdges; ++i)
APN->addIncoming(IncomingVals[AllocaNo], Pred);
@@ -960,8 +987,11 @@ NextIteration:
continue;
// what value were we writing?
- IncomingVals[ai->second] = SI->getOperand(0);
+ unsigned AllocaNo = ai->second;
+ IncomingVals[AllocaNo] = SI->getOperand(0);
+
// Record debuginfo for the store before removing it.
+ IncomingLocs[AllocaNo] = SI->getDebugLoc();
for (DbgInfoIntrinsic *DII : AllocaDbgDeclares[ai->second])
ConvertDebugDeclareToDebugValue(DII, SI, DIB);
BB->getInstList().erase(SI);
@@ -984,7 +1014,7 @@ NextIteration:
for (; I != E; ++I)
if (VisitedSuccs.insert(*I).second)
- Worklist.emplace_back(*I, Pred, IncomingVals);
+ Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);
goto NextIteration;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index b2231d68a301..ca184ed7c4e3 100644
--- a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -178,7 +178,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
// If the client wants to know about all new instructions, tell it.
if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
- DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
return InsertedPHI;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
new file mode 100644
index 000000000000..397bac2940a4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -0,0 +1,191 @@
+//===- SSAUpdaterBulk.cpp - Unstructured SSA Update Tool ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdaterBulk class.
+//
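+// Typical usage, as an illustrative sketch (the client-side names Updater,
+// DT, DefBB, SomeValue and SomeUse below are assumed, not defined here):
+//
+//   SSAUpdaterBulk Updater;
+//   unsigned Var = Updater.AddVariable("x", SomeValue->getType());
+//   Updater.AddAvailableValue(Var, DefBB, SomeValue);
+//   Updater.AddUse(Var, &SomeUse);
+//   Updater.RewriteAllUses(&DT, /*InsertedPHIs=*/nullptr);
+//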
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ssaupdaterbulk"
+
+/// Helper function for finding a block which should have a value for the given
+/// user. For PHI-nodes this block is the corresponding predecessor, for other
+/// instructions it's their parent block.
+static BasicBlock *getUserBB(Use *U) {
+ auto *User = cast<Instruction>(U->getUser());
+
+ if (auto *UserPN = dyn_cast<PHINode>(User))
+ return UserPN->getIncomingBlock(*U);
+ else
+ return User->getParent();
+}
+
+/// Add a new variable to the SSA rewriter. This needs to be called before
+/// AddAvailableValue or AddUse calls.
+unsigned SSAUpdaterBulk::AddVariable(StringRef Name, Type *Ty) {
+ unsigned Var = Rewrites.size();
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": initialized with Ty = "
+ << *Ty << ", Name = " << Name << "\n");
+ RewriteInfo RI(Name, Ty);
+ Rewrites.push_back(RI);
+ return Var;
+}
+
+/// Indicate that a rewritten value is available in the specified block with the
+/// specified value.
+void SSAUpdaterBulk::AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var
+ << ": added new available value" << *V << " in "
+ << BB->getName() << "\n");
+ Rewrites[Var].Defines[BB] = V;
+}
+
+/// Record a use of the symbolic value. This use will be updated with a
+/// rewritten value when RewriteAllUses is called.
+void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": added a use" << *U->get()
+ << " in " << getUserBB(U)->getName() << "\n");
+ Rewrites[Var].Uses.push_back(U);
+}
+
+/// Return true if the SSAUpdater already has a value for the specified variable
+/// in the specified block.
+bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) {
+ return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false;
+}
+
+// Compute the value at the given block BB. We should either already know it,
+// or be able to reach it recursively by walking up the dominator tree.
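+// Values computed along the walk are memoized in R.Defines, so later queries
+// for the same block do not repeat the traversal.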
+Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R,
+ DominatorTree *DT) {
+ if (!R.Defines.count(BB)) {
+ if (DT->isReachableFromEntry(BB) && PredCache.get(BB).size()) {
+ BasicBlock *IDom = DT->getNode(BB)->getIDom()->getBlock();
+ Value *V = computeValueAt(IDom, R, DT);
+ R.Defines[BB] = V;
+ } else
+ R.Defines[BB] = UndefValue::get(R.Ty);
+ }
+ return R.Defines[BB];
+}
+
+/// Given sets of UsingBlocks and DefBlocks, compute the set of LiveInBlocks.
+/// This is basically a subgraph limited by DefBlocks and UsingBlocks.
+static void
+ComputeLiveInBlocks(const SmallPtrSetImpl<BasicBlock *> &UsingBlocks,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks,
+ PredIteratorCache &PredCache) {
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(UsingBlocks.begin(),
+ UsingBlocks.end());
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it too. Add the preds to the worklist unless they are
+ // defining blocks.
+ for (BasicBlock *P : PredCache.get(BB)) {
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// Perform all the necessary updates: insert new PHI-nodes where needed and
+/// rewrite the requested uses.
+void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
+ SmallVectorImpl<PHINode *> *InsertedPHIs) {
+ for (auto &R : Rewrites) {
+ // Compute locations for new phi-nodes.
+ // For that we need to initialize DefBlocks from definitions in R.Defines,
+ // UsingBlocks from uses in R.Uses, then compute LiveInBlocks, and then use
+ // this set for computing iterated dominance frontier (IDF).
+ // The IDF blocks are the blocks where we need to insert new phi-nodes.
+ ForwardIDFCalculator IDF(*DT);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: rewriting " << R.Uses.size()
+ << " use(s)\n");
+
+ SmallPtrSet<BasicBlock *, 2> DefBlocks;
+ for (auto &Def : R.Defines)
+ DefBlocks.insert(Def.first);
+ IDF.setDefiningBlocks(DefBlocks);
+
+ SmallPtrSet<BasicBlock *, 2> UsingBlocks;
+ for (Use *U : R.Uses)
+ UsingBlocks.insert(getUserBB(U));
+
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ ComputeLiveInBlocks(UsingBlocks, DefBlocks, LiveInBlocks, PredCache);
+ IDF.resetLiveInBlocks();
+ IDF.setLiveInBlocks(LiveInBlocks);
+ IDF.calculate(IDFBlocks);
+
+ // We've computed IDF, now insert new phi-nodes there.
+ SmallVector<PHINode *, 4> InsertedPHIsForVar;
+ for (auto *FrontierBB : IDFBlocks) {
+ IRBuilder<> B(FrontierBB, FrontierBB->begin());
+ PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
+ R.Defines[FrontierBB] = PN;
+ InsertedPHIsForVar.push_back(PN);
+ if (InsertedPHIs)
+ InsertedPHIs->push_back(PN);
+ }
+
+ // Fill in arguments of the inserted PHIs.
+ for (auto *PN : InsertedPHIsForVar) {
+ BasicBlock *PBB = PN->getParent();
+ for (BasicBlock *Pred : PredCache.get(PBB))
+ PN->addIncoming(computeValueAt(Pred, R, DT), Pred);
+ }
+
+ // Rewrite actual uses with the inserted definitions.
+ SmallPtrSet<Use *, 4> ProcessedUses;
+ for (Use *U : R.Uses) {
+ if (!ProcessedUses.insert(U).second)
+ continue;
+ Value *V = computeValueAt(getUserBB(U), R, DT);
+ Value *OldVal = U->get();
+ assert(OldVal && "Invalid use!");
+ // Notify the users of the existing value that it is being replaced.
+ if (OldVal != V && OldVal->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(OldVal, V);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: replacing " << *OldVal << " with " << *V
+ << "\n");
+ U->set(V);
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 7c195788e416..c87b5c16ffce 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -28,6 +27,7 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -66,7 +66,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
@@ -688,9 +687,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
// Do not permit merging of large switch instructions into their
// predecessors unless there is only one predecessor.
- if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()),
- pred_end(SI->getParent())) <=
- 128)
+ if (SI->getNumSuccessors() * pred_size(SI->getParent()) <= 128)
CV = SI->getCondition();
} else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
if (BI->isConditional() && BI->getCondition()->hasOneUse())
@@ -847,9 +844,9 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
// Remove PHI node entries for the dead edge.
ThisCases[0].Dest->removePredecessor(TI->getParent());
- DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
EraseTerminatorInstAndDCECond(TI);
return true;
@@ -861,8 +858,8 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
DeadCases.insert(PredCases[i].Value);
- DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI);
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI);
// Collect branch weights into a vector.
SmallVector<uint32_t, 8> Weights;
@@ -888,7 +885,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
if (HasWeight && Weights.size() >= 2)
setBranchWeights(SI, Weights);
- DEBUG(dbgs() << "Leaving: " << *TI << "\n");
+ LLVM_DEBUG(dbgs() << "Leaving: " << *TI << "\n");
return true;
}
@@ -929,9 +926,9 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
Instruction *NI = Builder.CreateBr(TheRealDest);
(void)NI;
- DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
EraseTerminatorInstAndDCECond(TI);
return true;
@@ -1290,31 +1287,44 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
return Changed;
- // For a normal instruction, we just move one to right before the branch,
- // then replace all uses of the other with the first. Finally, we remove
- // the now redundant second instruction.
- BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1);
- if (!I2->use_empty())
- I2->replaceAllUsesWith(I1);
- I1->andIRFlags(I2);
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_fpmath,
- LLVMContext::MD_invariant_load,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_mem_parallel_loop_access};
- combineMetadata(I1, I2, KnownIDs);
-
- // I1 and I2 are being combined into a single instruction. Its debug
- // location is the merged locations of the original instructions.
- I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-
- I2->eraseFromParent();
- Changed = true;
+ if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
+ assert(isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
+ // The debug location is an integral part of a debug info intrinsic
+ // and can't be separated from it or replaced. Instead of attempting
+ // to merge locations, simply hoist both copies of the intrinsic.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB2->getInstList(), I2);
+ Changed = true;
+ } else {
+ // For a normal instruction, we just move one to right before the branch,
+ // then replace all uses of the other with the first. Finally, we remove
+ // the now redundant second instruction.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ I1->andIRFlags(I2);
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_mem_parallel_loop_access};
+ combineMetadata(I1, I2, KnownIDs);
+
+ // I1 and I2 are being combined into a single instruction. Its debug
+ // location is the merged locations of the original instructions.
+ I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
+ I2->eraseFromParent();
+ Changed = true;
+ }
I1 = &*BB1_Itr++;
I2 = &*BB2_Itr++;
@@ -1728,7 +1738,8 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
LockstepReverseIterator LRI(UnconditionalPreds);
while (LRI.isValid() &&
canSinkInstructions(*LRI, PHIOperands)) {
- DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0] << "\n");
+ LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0]
+ << "\n");
InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
++ScanIdx;
--LRI;
@@ -1740,7 +1751,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
for (auto *V : PHIOperands[I])
if (InstructionsToSink.count(V) == 0)
++NumPHIdValues;
- DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
NumPHIInsts++;
@@ -1768,7 +1779,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
if (!Profitable)
return false;
- DEBUG(dbgs() << "SINK: Splitting edge\n");
+ LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
// We have a conditional edge and we're going to sink some instructions.
// Insert a new block postdominating all blocks we're going to sink from.
if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split"))
@@ -1790,16 +1801,17 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
// and never actually sink it which means we produce more PHIs than intended.
// This is unlikely in practice though.
for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) {
- DEBUG(dbgs() << "SINK: Sink: "
- << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
- << "\n");
+ LLVM_DEBUG(dbgs() << "SINK: Sink: "
+ << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
+ << "\n");
// Because we've sunk every instruction in turn, the current instruction to
// sink is always at index 0.
LRI.reset();
if (!ProfitableToSinkInstruction(LRI)) {
// Too many PHIs would be created.
- DEBUG(dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ LLVM_DEBUG(
+ dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
break;
}
@@ -1811,7 +1823,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
return Changed;
}
-/// \brief Determine if we can hoist sink a sole store instruction out of a
+/// Determine if we can hoist or sink a sole store instruction out of a
/// conditional block.
///
/// We are looking for code like the following:
@@ -1851,12 +1863,9 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
// Look for a store to the same pointer in BrBB.
unsigned MaxNumInstToLookAt = 9;
- for (Instruction &CurI : reverse(*BrBB)) {
+ for (Instruction &CurI : reverse(BrBB->instructionsWithoutDebug())) {
if (!MaxNumInstToLookAt)
break;
- // Skip debug info.
- if (isa<DbgInfoIntrinsic>(CurI))
- continue;
--MaxNumInstToLookAt;
// Could be calling an instruction that affects memory like free().
@@ -1875,7 +1884,7 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
return nullptr;
}
-/// \brief Speculate a conditional basic block flattening the CFG.
+/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
/// instructions like this is most often not desirable. Instead, there is an MI
@@ -2045,7 +2054,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
return false;
// If we get here, we can hoist the instruction and if-convert.
- DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
+ LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
// Insert a select of the value of the speculated store.
if (SpeculatedStoreValue) {
@@ -2106,19 +2115,16 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
/// Return true if we can thread a branch across this block.
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
- BranchInst *BI = cast<BranchInst>(BB->getTerminator());
unsigned Size = 0;
- for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
- if (isa<DbgInfoIntrinsic>(BBI))
- continue;
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
if (Size > 10)
return false; // Don't clone large BB's.
++Size;
// We can only support instructions that do not define values that are
// live outside of the current basic block.
- for (User *U : BBI->users()) {
+ for (User *U : I.users()) {
Instruction *UI = cast<Instruction>(U);
if (UI->getParent() != BB || isa<PHINode>(UI))
return false;
@@ -2260,6 +2266,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// dependence information for this check, but simplifycfg can't keep it up
// to date, and this catches most of the cases we care about anyway.
BasicBlock *BB = PN->getParent();
+ const Function *Fn = BB->getParent();
+ if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing))
+ return false;
+
BasicBlock *IfTrue, *IfFalse;
Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
if (!IfCond ||
@@ -2350,8 +2360,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
}
}
- DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
- << IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
+ << " T: " << IfTrue->getName()
+ << " F: " << IfFalse->getName() << "\n");
// If we can still promote the PHI nodes after this gauntlet of tests,
// do all of the PHI's now.
@@ -2475,9 +2486,9 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
(void)RI;
- DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
- << "\n " << *BI << "NewRet = " << *RI
- << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
+ LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+ << "\n " << *BI << "NewRet = " << *RI << "TRUEBLOCK: "
+ << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
EraseTerminatorInstAndDCECond(BI);
@@ -2486,7 +2497,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
/// Return true if the given instruction is available
/// in its predecessor block. If yes, the instruction will be removed.
-static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
+static bool tryCSEWithPredecessor(Instruction *Inst, BasicBlock *PB) {
if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))
return false;
for (Instruction &I : *PB) {
@@ -2543,14 +2554,16 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
if (PBI->isConditional() &&
(BI->getSuccessor(0) == PBI->getSuccessor(0) ||
BI->getSuccessor(0) == PBI->getSuccessor(1))) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ for (auto I = BB->instructionsWithoutDebug().begin(),
+ E = BB->instructionsWithoutDebug().end();
+ I != E;) {
Instruction *Curr = &*I++;
if (isa<CmpInst>(Curr)) {
Cond = Curr;
break;
}
// Quit if we can't remove this instruction.
- if (!checkCSEInPredecessor(Curr, PB))
+ if (!tryCSEWithPredecessor(Curr, PB))
return false;
}
}
@@ -2650,7 +2663,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
continue;
}
- DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
+ LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
IRBuilder<> Builder(PBI);
// If we need to invert the condition in the pred block to match, do so now.
@@ -2860,7 +2873,7 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
if (!AlternativeV)
break;
- assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2);
+ assert(pred_size(Succ) == 2);
auto PredI = pred_begin(Succ);
BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
@@ -2903,14 +2916,13 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
// instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
// thread this store.
unsigned N = 0;
- for (auto &I : *BB) {
+ for (auto &I : BB->instructionsWithoutDebug()) {
// Cheap instructions viable for folding.
if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) ||
isa<StoreInst>(I))
++N;
// Free instructions.
- else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
- IsaBitcastOfPointerType(I))
+ else if (isa<TerminatorInst>(I) || IsaBitcastOfPointerType(I))
continue;
else
return false;
@@ -2965,6 +2977,21 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
if (&*I != PStore && I->mayReadOrWriteMemory())
return false;
+ // If PostBB has more than two predecessors, we need to split it so we can
+ // sink the store.
+ if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) {
+ // We know that QFB's only successor is PostBB. And QFB has a single
+ // predecessor. If QTB exists, then its only successor is also PostBB.
+ // If QTB does not exist, then QFB's only predecessor has a conditional
+ // branch to QFB and PostBB.
+ BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor();
+ BasicBlock *NewBB = SplitBlockPredecessors(PostBB, {QFB, TruePred},
+ "condstore.split");
+ if (!NewBB)
+ return false;
+ PostBB = NewBB;
+ }
+
// OK, we're going to sink the stores to PostBB. The store has to be
// conditional though, so first create the predicate.
Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
@@ -3100,7 +3127,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
(QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
return false;
- if (!PostBB->hasNUses(2) || !QBI->getParent()->hasNUses(2))
+ if (!QBI->getParent()->hasNUses(2))
return false;
// OK, this is a sequence of two diamonds or triangles.
@@ -3200,11 +3227,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// If this is a conditional branch in an empty block, and if any
// predecessors are a conditional branch to one of our destinations,
// fold the conditions into logical ops and one cond br.
- BasicBlock::iterator BBI = BB->begin();
+
// Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(BBI))
- ++BBI;
- if (&*BBI != BI)
+ if (&*BB->instructionsWithoutDebug().begin() != BI)
return false;
int PBIOp, BIOp;
@@ -3261,8 +3286,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// Finally, if everything is ok, fold the branches to logical ops.
BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
- DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
- << "AND: " << *BI->getParent());
+ LLVM_DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
+ << "AND: " << *BI->getParent());
// If OtherDest *is* BB, then BB is a basic block with a single conditional
// branch in it, where one edge (OtherDest) goes back to itself but the other
@@ -3280,7 +3305,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
OtherDest = InfLoopBlock;
}
- DEBUG(dbgs() << *PBI->getParent()->getParent());
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
// BI may have other predecessors. Because of this, we leave
// it alone, but modify PBI.
@@ -3364,8 +3389,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
}
}
- DEBUG(dbgs() << "INTO: " << *PBI->getParent());
- DEBUG(dbgs() << *PBI->getParent()->getParent());
+ LLVM_DEBUG(dbgs() << "INTO: " << *PBI->getParent());
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
// This basic block is probably dead. We know it has at least
// one fewer predecessor.
@@ -3665,9 +3690,9 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
BasicBlock *BB = BI->getParent();
- DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
- << " cases into SWITCH. BB is:\n"
- << *BB);
+ LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
+ << " cases into SWITCH. BB is:\n"
+ << *BB);
// If there are any extra values that couldn't be folded into the switch
// then we evaluate them with an explicit branch first. Split the block
@@ -3690,8 +3715,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
// for the edge we just added.
AddPredecessorToBlock(EdgeBB, BB, NewBB);
- DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
- << "\nEXTRABB = " << *BB);
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
+ << "\nEXTRABB = " << *BB);
BB = NewBB;
}
@@ -3722,7 +3747,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
// Erase the old branch instruction.
EraseTerminatorInstAndDCECond(BI);
- DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
return true;
}
@@ -3873,6 +3898,7 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) {
switch (IntrinsicID) {
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
+ case Intrinsic::dbg_label:
case Intrinsic::lifetime_end:
break;
default:
@@ -4049,8 +4075,8 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
if (!UncondBranchPreds.empty() && DupRet) {
while (!UncondBranchPreds.empty()) {
BasicBlock *Pred = UncondBranchPreds.pop_back_val();
- DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
(void)FoldReturnIntoUncondBranch(RI, BB, Pred);
}
@@ -4374,7 +4400,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
(CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
DeadCases.push_back(Case.getCaseValue());
- DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n");
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
+ << " is dead.\n");
}
}
@@ -4390,7 +4417,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
if (HasDefault && DeadCases.empty() &&
NumUnknownBits < 64 /* avoid overflow */ &&
SI->getNumCases() == (1ULL << NumUnknownBits)) {
- DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
BasicBlock *NewDefault =
SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), "");
SI->setDefaultDest(&*NewDefault);
@@ -4607,24 +4634,20 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
// which we can constant-propagate the CaseVal, continue to its successor.
SmallDenseMap<Value *, Constant *> ConstantPool;
ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
- for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E;
- ++I) {
- if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) {
+ for (Instruction &I : CaseDest->instructionsWithoutDebug()) {
+ if (TerminatorInst *T = dyn_cast<TerminatorInst>(&I)) {
// If the terminator is a simple branch, continue to the next block.
if (T->getNumSuccessors() != 1 || T->isExceptional())
return false;
Pred = CaseDest;
CaseDest = T->getSuccessor(0);
- } else if (isa<DbgInfoIntrinsic>(I)) {
- // Skip debug intrinsic.
- continue;
- } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) {
+ } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
// Instruction is side-effect free and constant.
// If the instruction has uses outside this block or a phi node slot for
// the block, it is not safe to bypass the instruction since it would then
// no longer dominate all its uses.
- for (auto &Use : I->uses()) {
+ for (auto &Use : I.uses()) {
User *User = Use.getUser();
if (Instruction *I = dyn_cast<Instruction>(User))
if (I->getParent() == CaseDest)
@@ -4635,7 +4658,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
return false;
}
- ConstantPool.insert(std::make_pair(&*I, C));
+ ConstantPool.insert(std::make_pair(&I, C));
} else {
break;
}
@@ -4670,30 +4693,31 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
}
// Helper function used to add CaseVal to the list of cases that generate
-// Result.
-static void MapCaseToResult(ConstantInt *CaseVal,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *Result) {
+// Result. Returns the updated number of cases that generate this result.
+static uintptr_t MapCaseToResult(ConstantInt *CaseVal,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *Result) {
for (auto &I : UniqueResults) {
if (I.first == Result) {
I.second.push_back(CaseVal);
- return;
+ return I.second.size();
}
}
UniqueResults.push_back(
std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
+ return 1;
}
// Helper function that initializes a map containing
// results for the PHI node of the common destination block for a switch
// instruction. Returns false if multiple PHI nodes have been found or if
// there is not a common destination block for the switch.
-static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
- BasicBlock *&CommonDest,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *&DefaultResult,
- const DataLayout &DL,
- const TargetTransformInfo &TTI) {
+static bool
+InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *&DefaultResult, const DataLayout &DL,
+ const TargetTransformInfo &TTI,
+ uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) {
for (auto &I : SI->cases()) {
ConstantInt *CaseVal = I.getCaseValue();
@@ -4703,10 +4727,21 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
DL, TTI))
return false;
- // Only one value per case is permitted
+ // Only one value per case is permitted.
if (Results.size() > 1)
return false;
- MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+ // Add the case->result mapping to UniqueResults.
+ const uintptr_t NumCasesForResult =
+ MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+ // Early out if there are too many cases for this result.
+ if (NumCasesForResult > MaxCasesPerResult)
+ return false;
+
+ // Early out if there are too many unique results.
+ if (UniqueResults.size() > MaxUniqueResults)
+ return false;
// Check the PHI consistency.
if (!PHI)
@@ -4806,7 +4841,7 @@ static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
SwitchCaseResultVectorTy UniqueResults;
// Collect all the cases that will deliver the same value from the switch.
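+  // Pass the limits explicitly: at most 2 unique results (a select chooses
+  // between two values) and at most 1 case per result.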
if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
- DL, TTI))
+ DL, TTI, 2, 1))
return false;
// Selects choose between maximum two values.
if (UniqueResults.size() != 2)
@@ -5384,8 +5419,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
}
bool ReturnedEarly = false;
- for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
- PHINode *PHI = PHIs[I];
+ for (PHINode *PHI : PHIs) {
const ResultListTy &ResultList = ResultLists[PHI];
// If using a bitmask, use any value to fill the lookup table holes.
@@ -5475,7 +5509,7 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
SmallVector<int64_t,4> Values;
for (auto &C : SI->cases())
Values.push_back(C.getCaseValue()->getValue().getSExtValue());
- std::sort(Values.begin(), Values.end());
+ llvm::sort(Values.begin(), Values.end());
// If the switch is already dense, there's nothing useful to do here.
if (isSwitchDense(Values))
@@ -5558,11 +5592,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
// If the block only contains the switch, see if we can fold the block
// away into any preds.
- BasicBlock::iterator BBI = BB->begin();
- // Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(BBI))
- ++BBI;
- if (SI == &*BBI)
+ if (SI == &*BB->instructionsWithoutDebug().begin())
if (FoldValueComparisonIntoPredecessors(SI, Builder))
return simplifyCFG(BB, TTI, Options) | true;
}
@@ -5649,7 +5679,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
/// any transform which might inhibit optimization (such as our ability to
/// specialize a particular handler via tail commoning). We do this by not
/// merging any blocks which require us to introduce a phi. Since the same
-/// values are flowing through both blocks, we don't loose any ability to
+/// values are flowing through both blocks, we don't lose any ability to
/// specialize. If anything, we make such specialization more likely.
///
/// TODO - This transformation could remove entries from a phi in the target
@@ -5679,7 +5709,7 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
// We've found an identical block. Update our predecessors to take that
// path instead and make ourselves dead.
- SmallSet<BasicBlock *, 16> Preds;
+ SmallPtrSet<BasicBlock *, 16> Preds;
Preds.insert(pred_begin(BB), pred_end(BB));
for (BasicBlock *Pred : Preds) {
InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
@@ -5697,7 +5727,7 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
Inst.eraseFromParent();
}
- SmallSet<BasicBlock *, 16> Succs;
+ SmallPtrSet<BasicBlock *, 16> Succs;
Succs.insert(succ_begin(BB), succ_end(BB));
for (BasicBlock *Succ : Succs) {
Succ->removePredecessor(BB);
@@ -5721,9 +5751,12 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
// header. (This is for early invocations before loop simplify and
// vectorization to keep canonical loop forms for nested loops. These blocks
// can be eliminated when the pass is invoked later in the back-end.)
+ // Note that if BB has only one predecessor then we do not introduce a new
+ // backedge, so we can eliminate BB.
bool NeedCanonicalLoop =
Options.NeedCanonicalLoop &&
- (LoopHeaders && (LoopHeaders->count(BB) || LoopHeaders->count(Succ)));
+ (LoopHeaders && pred_size(BB) > 1 &&
+ (LoopHeaders->count(BB) || LoopHeaders->count(Succ)));
BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
!NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB))
@@ -5771,6 +5804,9 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
+ const Function *Fn = BB->getParent();
+ if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing))
+ return false;
// Conditional branch
if (isValueEqualityComparison(BI)) {
@@ -5783,18 +5819,12 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// This block must be empty, except for the setcond inst, if it exists.
// Ignore dbg intrinsics.
- BasicBlock::iterator I = BB->begin();
- // Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(I))
- ++I;
+ auto I = BB->instructionsWithoutDebug().begin();
if (&*I == BI) {
if (FoldValueComparisonIntoPredecessors(BI, Builder))
return simplifyCFG(BB, TTI, Options) | true;
} else if (&*I == cast<Instruction>(BI->getCondition())) {
++I;
- // Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(I))
- ++I;
if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
return simplifyCFG(BB, TTI, Options) | true;
}
@@ -5920,17 +5950,20 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
// Load from null is undefined.
if (LoadInst *LI = dyn_cast<LoadInst>(Use))
if (!LI->isVolatile())
- return LI->getPointerAddressSpace() == 0;
+ return !NullPointerIsDefined(LI->getFunction(),
+ LI->getPointerAddressSpace());
// Store to null is undefined.
if (StoreInst *SI = dyn_cast<StoreInst>(Use))
if (!SI->isVolatile())
- return SI->getPointerAddressSpace() == 0 &&
+ return (!NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace())) &&
SI->getPointerOperand() == I;
// A call to null is undefined.
if (auto CS = CallSite(Use))
- return CS.getCalledValue() == I;
+ return !NullPointerIsDefined(CS->getFunction()) &&
+ CS.getCalledValue() == I;
}
return false;
}
@@ -5971,7 +6004,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
// or that just have themselves as a predecessor. These are unreachable.
if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
BB->getSinglePredecessor() == BB) {
- DEBUG(dbgs() << "Removing BB: \n" << *BB);
+ LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB);
DeleteDeadBlock(BB);
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index ad1faea0a7ae..e381fbc34ab4 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -80,6 +81,7 @@ namespace {
bool replaceIVUserWithLoopInvariant(Instruction *UseInst);
bool eliminateOverflowIntrinsic(CallInst *CI);
+ bool eliminateTrunc(TruncInst *TI);
bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand);
void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
@@ -147,8 +149,8 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
if (SE->getSCEV(UseInst) != FoldedExpr)
return nullptr;
- DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
- << " -> " << *UseInst << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
+ << " -> " << *UseInst << '\n');
UseInst->setOperand(OperIdx, IVSrc);
assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
@@ -221,7 +223,7 @@ bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
// for now.
return false;
- DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
ICmp->setPredicate(InvariantPredicate);
ICmp->setOperand(0, NewLHS);
ICmp->setOperand(1, NewRHS);
@@ -252,11 +254,11 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
if (SE->isKnownPredicate(Pred, S, X)) {
ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
DeadInsts.emplace_back(ICmp);
- DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
} else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {
ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
DeadInsts.emplace_back(ICmp);
- DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
} else if (makeIVComparisonInvariant(ICmp, IVOperand)) {
// fallthrough to end of function
} else if (ICmpInst::isSigned(OriginalPred) &&
@@ -267,7 +269,8 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
// we turn the instruction's predicate to its unsigned version. Note that
// we cannot rely on Pred here unless we check if we have swapped it.
assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
- DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp
+ << '\n');
ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
} else
return;
@@ -293,7 +296,7 @@ bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
SDiv->getName() + ".udiv", SDiv);
UDiv->setIsExact(SDiv->isExact());
SDiv->replaceAllUsesWith(UDiv);
- DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
++NumSimplifiedSDiv;
Changed = true;
DeadInsts.push_back(SDiv);
@@ -309,7 +312,7 @@ void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
Rem->getName() + ".urem", Rem);
Rem->replaceAllUsesWith(URem);
- DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
++NumSimplifiedSRem;
Changed = true;
DeadInsts.emplace_back(Rem);
@@ -318,7 +321,7 @@ void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
// i % n --> i if i is in [0,n).
void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
Rem->replaceAllUsesWith(Rem->getOperand(0));
- DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
++NumElimRem;
Changed = true;
DeadInsts.emplace_back(Rem);
@@ -332,7 +335,7 @@ void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
SelectInst *Sel =
SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
Rem->replaceAllUsesWith(Sel);
- DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
++NumElimRem;
Changed = true;
DeadInsts.emplace_back(Rem);
@@ -492,6 +495,118 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
return true;
}
+bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) {
+ // It is always legal to replace
+ // icmp <pred> i32 trunc(iv), n
+ // with
+ // icmp <pred> i64 sext(trunc(iv)), sext(n), if pred is signed predicate.
+ // Or with
+ // icmp <pred> i64 zext(trunc(iv)), zext(n), if pred is unsigned predicate.
+ // Or with either of these if pred is an equality predicate.
+ //
+ // If we can prove that iv == sext(trunc(iv)) or iv == zext(trunc(iv)) for
+ // every comparison which uses trunc, it means that we can replace each of
+ // them with comparison of iv against sext/zext(n). We no longer need trunc
+ // after that.
+ //
+ // TODO: Should we do this if we can widen *some* comparisons, but not all
+ // of them? Sometimes it is enough to enable other optimizations, but the
+ // trunc instruction will stay in the loop.
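+  //
+  // Editor's illustration (not from the change itself; names are made up):
+  //   %iv = phi i64 ...            ; with an SCEV-provable range, e.g. [0, 100)
+  //   %t  = trunc i64 %iv to i32
+  //   %c  = icmp slt i32 %t, %n
+  // Here sext(trunc(iv)) == iv, so the compare can be rewritten as
+  //   %n.sext = sext i32 %n to i64
+  //   %c      = icmp slt i64 %iv, %n.sext
+  // and the trunc becomes dead.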
+ Value *IV = TI->getOperand(0);
+ Type *IVTy = IV->getType();
+ const SCEV *IVSCEV = SE->getSCEV(IV);
+ const SCEV *TISCEV = SE->getSCEV(TI);
+
+ // Check if iv == zext(trunc(iv)) and if iv == sext(trunc(iv)). If so, we can
+ // get rid of trunc
+ bool DoesSExtCollapse = false;
+ bool DoesZExtCollapse = false;
+ if (IVSCEV == SE->getSignExtendExpr(TISCEV, IVTy))
+ DoesSExtCollapse = true;
+ if (IVSCEV == SE->getZeroExtendExpr(TISCEV, IVTy))
+ DoesZExtCollapse = true;
+
+  // If neither sext nor zext collapses, it is not profitable to do any
+ // transform. Bail.
+ if (!DoesSExtCollapse && !DoesZExtCollapse)
+ return false;
+
+ // Collect users of the trunc that look like comparisons against invariants.
+ // Bail if we find something different.
+ SmallVector<ICmpInst *, 4> ICmpUsers;
+ for (auto *U : TI->users()) {
+ // We don't care about users in unreachable blocks.
+ if (isa<Instruction>(U) &&
+ !DT->isReachableFromEntry(cast<Instruction>(U)->getParent()))
+ continue;
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) {
+ if (ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) {
+ assert(L->contains(ICI->getParent()) && "LCSSA form broken?");
+ // If we cannot get rid of trunc, bail.
+ if (ICI->isSigned() && !DoesSExtCollapse)
+ return false;
+ if (ICI->isUnsigned() && !DoesZExtCollapse)
+ return false;
+ // For equality, either signed or unsigned works.
+ ICmpUsers.push_back(ICI);
+ } else
+ return false;
+ } else
+ return false;
+ }
+
+ auto CanUseZExt = [&](ICmpInst *ICI) {
+ // Unsigned comparison can be widened as unsigned.
+ if (ICI->isUnsigned())
+ return true;
+ // Is it profitable to do zext?
+ if (!DoesZExtCollapse)
+ return false;
+ // For equality, we can safely zext both parts.
+ if (ICI->isEquality())
+ return true;
+ // Otherwise we can only use zext when comparing two non-negative or two
+ // negative values. But in practice, we will never pass DoesZExtCollapse
+ // check for a negative value, because zext(trunc(x)) is non-negative. So
+    // it only makes sense to check for non-negativity here.
+ const SCEV *SCEVOP1 = SE->getSCEV(ICI->getOperand(0));
+ const SCEV *SCEVOP2 = SE->getSCEV(ICI->getOperand(1));
+ return SE->isKnownNonNegative(SCEVOP1) && SE->isKnownNonNegative(SCEVOP2);
+ };
+ // Replace all comparisons against trunc with comparisons against IV.
+ for (auto *ICI : ICmpUsers) {
+ auto *Op1 = ICI->getOperand(1);
+ Instruction *Ext = nullptr;
+ // For signed/unsigned predicate, replace the old comparison with comparison
+ // of immediate IV against sext/zext of the invariant argument. If we can
+ // use either sext or zext (i.e. we are dealing with equality predicate),
+ // then prefer zext as a more canonical form.
+ // TODO: If we see a signed comparison which can be turned into unsigned,
+ // we can do it here for canonicalization purposes.
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (CanUseZExt(ICI)) {
+ assert(DoesZExtCollapse && "Unprofitable zext?");
+ Ext = new ZExtInst(Op1, IVTy, "zext", ICI);
+ Pred = ICmpInst::getUnsignedPredicate(Pred);
+ } else {
+ assert(DoesSExtCollapse && "Unprofitable sext?");
+ Ext = new SExtInst(Op1, IVTy, "sext", ICI);
+ assert(Pred == ICmpInst::getSignedPredicate(Pred) && "Must be signed!");
+ }
+ bool Changed;
+ L->makeLoopInvariant(Ext, Changed);
+ (void)Changed;
+ ICmpInst *NewICI = new ICmpInst(ICI, Pred, IV, Ext);
+ ICI->replaceAllUsesWith(NewICI);
+ DeadInsts.emplace_back(ICI);
+ }
+
+ // Trunc no longer needed.
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ DeadInsts.emplace_back(TI);
+ return true;
+}
+
/// Eliminate an operation that consumes a simple IV and has no observable
/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
/// but UseInst may not be.
@@ -516,6 +631,10 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
if (eliminateOverflowIntrinsic(CI))
return true;
+ if (auto *TI = dyn_cast<TruncInst>(UseInst))
+ if (eliminateTrunc(TI))
+ return true;
+
if (eliminateIdentitySCEV(UseInst, IVOperand))
return true;
@@ -548,8 +667,8 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);
I->replaceAllUsesWith(Invariant);
- DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
- << " with loop invariant: " << *S << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
+ << " with loop invariant: " << *S << '\n');
++NumFoldedUser;
Changed = true;
DeadInsts.emplace_back(I);
@@ -589,7 +708,7 @@ bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,
if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))
return false;
- DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
UseInst->replaceAllUsesWith(IVOperand);
++NumElimIdentity;
@@ -771,6 +890,15 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
SimpleIVUsers.pop_back_val();
Instruction *UseInst = UseOper.first;
+ // If a user of the IndVar is trivially dead, we prefer just to mark it dead
+ // rather than try to do some complex analysis or transformation (such as
+    // widening) based on it.
+ // TODO: Propagate TLI and pass it here to handle more cases.
+ if (isInstructionTriviallyDead(UseInst, /* TLI */ nullptr)) {
+ DeadInsts.emplace_back(UseInst);
+ continue;
+ }
+
// Bypass back edges to avoid extra work.
if (UseInst == CurrIV) continue;
@@ -783,7 +911,7 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
for (unsigned N = 0; IVOperand; ++N) {
assert(N <= Simplified.size() && "runaway iteration");
- Value *NewOper = foldIVUser(UseOper.first, IVOperand);
+ Value *NewOper = foldIVUser(UseInst, IVOperand);
if (!NewOper)
break; // done folding
IVOperand = dyn_cast<Instruction>(NewOper);
@@ -791,12 +919,12 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
if (!IVOperand)
continue;
- if (eliminateIVUser(UseOper.first, IVOperand)) {
+ if (eliminateIVUser(UseInst, IVOperand)) {
pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
continue;
}
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
if ((isa<OverflowingBinaryOperator>(BO) &&
strengthenOverflowingOperation(BO, IVOperand)) ||
(isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
@@ -806,13 +934,13 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
}
}
- CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
+ CastInst *Cast = dyn_cast<CastInst>(UseInst);
if (V && Cast) {
V->visitCast(Cast);
continue;
}
- if (isSimpleIVUser(UseOper.first, L, SE)) {
- pushIVUsers(UseOper.first, L, Simplified, SimpleIVUsers);
+ if (isSimpleIVUser(UseInst, L, SE)) {
+ pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
}
}
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 03a1d55ddc30..8c48597fc2e4 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -7,10 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This is a utility pass used for testing the InstructionSimplify analysis.
-// The analysis is applied to every instruction, and if it simplifies then the
-// instruction is replaced by the simplification. If you are looking for a pass
-// that performs serious instruction folding, use the instcombine pass instead.
+// This file implements the library calls simplifier. It does not implement
+// any pass, but can be used by other passes to do simplifications.
//
//===----------------------------------------------------------------------===//
@@ -21,7 +19,9 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -33,7 +33,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@@ -104,19 +103,51 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
});
}
-/// \brief Check whether the overloaded unary floating point function
-/// corresponding to \a Ty is available.
-static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn) {
- switch (Ty->getTypeID()) {
- case Type::FloatTyID:
- return TLI->has(FloatFn);
- case Type::DoubleTyID:
- return TLI->has(DoubleFn);
- default:
- return TLI->has(LongDoubleFn);
- }
+static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) {
+ if (Base < 2 || Base > 36)
+ // handle special zero base
+ if (Base != 0)
+ return nullptr;
+
+ char *End;
+ std::string nptr = Str.str();
+ errno = 0;
+ long long int Result = strtoll(nptr.c_str(), &End, Base);
+ if (errno)
+ return nullptr;
+
+ // if we assume all possible target locales are ASCII supersets,
+ // then if strtoll successfully parses a number on the host,
+ // it will also successfully parse the same way on the target
+ if (*End != '\0')
+ return nullptr;
+
+ if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result))
+ return nullptr;
+
+ return ConstantInt::get(CI->getType(), Result);
+}
+
+static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ CallInst *FOpen = dyn_cast<CallInst>(File);
+ if (!FOpen)
+ return false;
+
+ Function *InnerCallee = FOpen->getCalledFunction();
+ if (!InnerCallee)
+ return false;
+
+ LibFunc Func;
+ if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
+ Func != LibFunc_fopen)
+ return false;
+
+ inferLibFuncAttributes(*CI->getCalledFunction(), *TLI);
+ if (PointerMayBeCaptured(File, true, true))
+ return false;
+
+ return true;
}
//===----------------------------------------------------------------------===//
@@ -156,9 +187,8 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
// We have enough information to now generate the memcpy call to do the
// concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(CpyDst, Src,
- ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1),
- 1);
+ B.CreateMemCpy(CpyDst, 1, Src, 1,
+ ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
return Dst;
}
@@ -346,8 +376,8 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(Dst, Src,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), 1);
+ B.CreateMemCpy(Dst, 1, Src, 1,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
return Dst;
}
@@ -371,7 +401,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(Dst, Src, LenV, 1);
+ B.CreateMemCpy(Dst, 1, Src, 1, LenV);
return DstEnd;
}
@@ -388,7 +418,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
--SrcLen;
if (SrcLen == 0) {
- // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+ // strncpy(x, "", y) -> memset(align 1 x, '\0', y)
B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
return Dst;
}
@@ -407,8 +437,8 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
return nullptr;
Type *PT = Callee->getFunctionType()->getParamType(0);
- // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
- B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1);
+ // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant]
+ B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len));
return Dst;
}
@@ -508,7 +538,7 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) {
- Module &M = *CI->getParent()->getParent()->getParent();
+ Module &M = *CI->getModule();
unsigned WCharSize = TLI->getWCharSize(M) * 8;
// We cannot perform this optimization without wchar_size metadata.
if (WCharSize == 0)
@@ -816,40 +846,19 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
- // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n)
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
- // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
- B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n)
+ B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
-// TODO: Does this belong in BuildLibCalls or should all of those similar
-// functions be moved here?
-static Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
- IRBuilder<> &B, const TargetLibraryInfo &TLI) {
- LibFunc Func;
- if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- const DataLayout &DL = M->getDataLayout();
- IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
- Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
- PtrType, PtrType);
- CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc");
-
- if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
const TargetLibraryInfo &TLI) {
@@ -901,12 +910,19 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
if (auto *Calloc = foldMallocMemset(CI, B, *TLI))
return Calloc;
- // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+ // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
return CI->getArgOperand(0);
}
+Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilder<> &B) {
+ if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
+ return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
+
+ return nullptr;
+}
+
//===----------------------------------------------------------------------===//
// Math Library Optimizations
//===----------------------------------------------------------------------===//
@@ -1666,12 +1682,12 @@ Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
- // abs(x) -> x >s -1 ? x : -x
- Value *Op = CI->getArgOperand(0);
- Value *Pos =
- B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos");
- Value *Neg = B.CreateNeg(Op, "neg");
- return B.CreateSelect(Pos, Op, Neg);
+ // abs(x) -> x <s 0 ? -x : x
+ // The negation has 'nsw' because abs of INT_MIN is undefined.
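+  // Editor's illustration (not part of the change): for an i32 argument this
+  // emits roughly
+  //   %isneg = icmp slt i32 %x, 0
+  //   %neg   = sub nsw i32 0, %x
+  //   %abs   = select i1 %isneg, i32 %neg, i32 %x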
+ Value *X = CI->getArgOperand(0);
+ Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType()));
+ Value *NegX = B.CreateNSWNeg(X, "neg");
+ return B.CreateSelect(IsNeg, NegX, X);
}
Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
@@ -1695,6 +1711,29 @@ Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) {
ConstantInt::get(CI->getType(), 0x7F));
}
+Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilder<> &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ return convertStrToNumber(CI, Str, 10);
+}
+
+Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilder<> &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ if (!isa<ConstantPointerNull>(CI->getArgOperand(1)))
+ return nullptr;
+
+ if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) {
+ return convertStrToNumber(CI, Str, CInt->getSExtValue());
+ }
+
+ return nullptr;
+}
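+
+// Editor's note (illustrative, not part of the change): with a constant
+// string argument, convertStrToNumber folds e.g. atoi("42") to the constant
+// 42 in the call's return type, and optimizeStrtol does the same for
+// strtol("42", null, 10); both bail out (return nullptr) if the string does
+// not parse completely or the value does not fit the result type.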
+
//===----------------------------------------------------------------------===//
// Formatting and IO Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -1826,15 +1865,13 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
if (CI->getNumArgOperands() == 2) {
// Make sure there's no % in the constant array. We could try to handle
// %% -> % in the future if we cared.
- for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
- if (FormatStr[i] == '%')
- return nullptr; // we found a format specifier, bail out.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
- // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1),
- 1); // Copy the null byte.
+ FormatStr.size() + 1)); // Copy the null byte.
return ConstantInt::get(CI->getType(), FormatStr.size());
}
@@ -1868,7 +1905,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
return nullptr;
Value *IncLen =
B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, IncLen);
// The sprintf result is the unincremented number of bytes in the string.
return B.CreateIntCast(Len, CI->getType(), false);
@@ -1897,6 +1934,93 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
+Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr))
+ return nullptr;
+
+  // Check for a constant size argument.
+ ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!Size)
+ return nullptr;
+
+ uint64_t N = Size->getZExtValue();
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumArgOperands() == 3) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ else if (N < FormatStr.size() + 1)
+ return nullptr;
+
+    // snprintf(str, size, fmt) -> llvm.memcpy(align 1 str, align 1 fmt,
+ // strlen(fmt)+1)
+ B.CreateMemCpy(
+ CI->getArgOperand(0), 1, CI->getArgOperand(2), 1,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1)); // Copy the null byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() == 2 && FormatStr[0] == '%' &&
+ CI->getNumArgOperands() == 4) {
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), 1);
+ else if (N == 1)
+ return nullptr;
+
+ // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(3)->getType()->isIntegerTy())
+ return nullptr;
+ Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char");
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+      // snprintf(dest, size, "%s", str) ->
+      //   llvm.memcpy(align 1 dest, align 1 str, len+1)
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(3), Str))
+ return nullptr;
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), Str.size());
+ else if (N < Str.size() + 1)
+ return nullptr;
+
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(3), 1,
+ ConstantInt::get(CI->getType(), Str.size() + 1));
+
+ // The snprintf result is the unincremented number of bytes in the string.
+ return ConstantInt::get(CI->getType(), Str.size());
+ }
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) {
+ if (Value *V = optimizeSnPrintFString(CI, B)) {
+ return V;
+ }
+
+ return nullptr;
+}
+
Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
optimizeErrorReporting(CI, B, 0);
@@ -1913,9 +2037,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
// fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
if (CI->getNumArgOperands() == 2) {
- for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
- if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
- return nullptr; // We found a format specifier.
+ // Could handle %% -> % if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // We found a format specifier.
return emitFWrite(
CI->getArgOperand(1),
@@ -1973,22 +2097,27 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
// Get the element size and count.
ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (!SizeC || !CountC)
- return nullptr;
- uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
-
- // If this is writing zero records, remove the call (it's a noop).
- if (Bytes == 0)
- return ConstantInt::get(CI->getType(), 0);
-
- // If this is writing one byte, turn it into fputc.
- // This optimisation is only valid, if the return value is unused.
- if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
- Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char");
- Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
- return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+ if (SizeC && CountC) {
+ uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
+
+ // If this is writing zero records, remove the call (it's a noop).
+ if (Bytes == 0)
+ return ConstantInt::get(CI->getType(), 0);
+
+ // If this is writing one byte, turn it into fputc.
+    // This optimisation is only valid if the return value is unused.
+ if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+ Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char");
+ Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+ }
}
+ if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI))
+ return emitFWriteUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3), B, DL,
+ TLI);
+
return nullptr;
}
@@ -1997,12 +2126,18 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
// Don't rewrite fputs to fwrite when optimising for size because fwrite
// requires more arguments and thus extra MOVs are required.
- if (CI->getParent()->getParent()->optForSize())
+ if (CI->getFunction()->optForSize())
return nullptr;
- // We can't optimize if return value is used.
- if (!CI->use_empty())
- return nullptr;
+  // Check whether the return value has any use.
+ if (!CI->use_empty()) {
+ if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI))
+ return emitFPutSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI);
+ else
+ // We can't optimize if return value is used.
+ return nullptr;
+ }
// fputs(s,F) --> fwrite(s,1,strlen(s),F)
uint64_t Len = GetStringLength(CI->getArgOperand(0));
@@ -2016,6 +2151,40 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
CI->getArgOperand(1), B, DL, TLI);
}
+Value *LibCallSimplifier::optimizeFPutc(CallInst *CI, IRBuilder<> &B) {
+ optimizeErrorReporting(CI, B, 1);
+
+ if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI))
+ return emitFPutCUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFGetc(CallInst *CI, IRBuilder<> &B) {
+ if (isLocallyOpenedFile(CI->getArgOperand(0), CI, B, TLI))
+ return emitFGetCUnlocked(CI->getArgOperand(0), B, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFGets(CallInst *CI, IRBuilder<> &B) {
+ if (isLocallyOpenedFile(CI->getArgOperand(2), CI, B, TLI))
+ return emitFGetSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) {
+ if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI))
+ return emitFReadUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3), B, DL,
+ TLI);
+
+ return nullptr;
+}
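+
+// Editor's sketch (illustrative only): the fgetc/fputc/fgets/fputs/fread/
+// fwrite folds above rely on isLocallyOpenedFile, so for something like
+//   FILE *F = fopen("x.log", "w");
+//   fputc('a', F);   // F provably never escapes the enclosing function
+// the call can be turned into the corresponding unlocked variant (e.g.
+// fputc_unlocked), since no other thread can be holding the FILE lock.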
+
Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
// Check for a constant string.
StringRef Str;
@@ -2099,6 +2268,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
return optimizeMemMove(CI, Builder);
case LibFunc_memset:
return optimizeMemSet(CI, Builder);
+ case LibFunc_realloc:
+ return optimizeRealloc(CI, Builder);
case LibFunc_wcslen:
return optimizeWcslen(CI, Builder);
default:
@@ -2290,16 +2461,33 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return optimizeIsAscii(CI, Builder);
case LibFunc_toascii:
return optimizeToAscii(CI, Builder);
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atoll:
+ return optimizeAtoi(CI, Builder);
+ case LibFunc_strtol:
+ case LibFunc_strtoll:
+ return optimizeStrtol(CI, Builder);
case LibFunc_printf:
return optimizePrintF(CI, Builder);
case LibFunc_sprintf:
return optimizeSPrintF(CI, Builder);
+ case LibFunc_snprintf:
+ return optimizeSnPrintF(CI, Builder);
case LibFunc_fprintf:
return optimizeFPrintF(CI, Builder);
case LibFunc_fwrite:
return optimizeFWrite(CI, Builder);
+ case LibFunc_fread:
+ return optimizeFRead(CI, Builder);
case LibFunc_fputs:
return optimizeFPuts(CI, Builder);
+ case LibFunc_fgets:
+ return optimizeFGets(CI, Builder);
+ case LibFunc_fputc:
+ return optimizeFPutc(CI, Builder);
+ case LibFunc_fgetc:
+ return optimizeFGetc(CI, Builder);
case LibFunc_puts:
return optimizePuts(CI, Builder);
case LibFunc_perror:
@@ -2307,8 +2495,6 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
case LibFunc_vfprintf:
case LibFunc_fiprintf:
return optimizeErrorReporting(CI, Builder, 0);
- case LibFunc_fputc:
- return optimizeErrorReporting(CI, Builder, 1);
default:
return nullptr;
}
@@ -2393,8 +2579,8 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
IRBuilder<> &B) {
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
return nullptr;
@@ -2403,8 +2589,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
IRBuilder<> &B) {
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
- B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
return nullptr;
diff --git a/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp
index 968eb0208f43..f8d758c54983 100644
--- a/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp
@@ -101,7 +101,8 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
// At this point module should have the proper mix of globals and locals.
// As we attempt to partition this module, we must not change any
// locals to globals.
- DEBUG(dbgs() << "Partition module with (" << M->size() << ")functions\n");
+ LLVM_DEBUG(dbgs() << "Partition module with (" << M->size()
+ << ")functions\n");
ClusterMapType GVtoClusterMap;
ComdatMembersType ComdatMembers;
@@ -180,28 +181,31 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
GVtoClusterMap.member_end()), I));
- std::sort(Sets.begin(), Sets.end(), [](const SortType &a, const SortType &b) {
- if (a.first == b.first)
- return a.second->getData()->getName() > b.second->getData()->getName();
- else
- return a.first > b.first;
- });
+ llvm::sort(Sets.begin(), Sets.end(),
+ [](const SortType &a, const SortType &b) {
+ if (a.first == b.first)
+ return a.second->getData()->getName() >
+ b.second->getData()->getName();
+ else
+ return a.first > b.first;
+ });
for (auto &I : Sets) {
unsigned CurrentClusterID = BalancinQueue.top().first;
unsigned CurrentClusterSize = BalancinQueue.top().second;
BalancinQueue.pop();
- DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size(" << I.first
- << ") ----> " << I.second->getData()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size("
+ << I.first << ") ----> " << I.second->getData()->getName()
+ << "\n");
for (ClusterMapType::member_iterator MI =
GVtoClusterMap.findLeader(I.second);
MI != GVtoClusterMap.member_end(); ++MI) {
if (!Visited.insert(*MI).second)
continue;
- DEBUG(dbgs() << "----> " << (*MI)->getName()
- << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
+ LLVM_DEBUG(dbgs() << "----> " << (*MI)->getName()
+ << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
Visited.insert(*MI);
ClusterIDMap[*MI] = CurrentClusterID;
CurrentClusterSize++;
@@ -270,7 +274,7 @@ void llvm::SplitModule(
for (unsigned I = 0; I < N; ++I) {
ValueToValueMapTy VMap;
std::unique_ptr<Module> MPart(
- CloneModule(M.get(), VMap, [&](const GlobalValue *GV) {
+ CloneModule(*M, VMap, [&](const GlobalValue *GV) {
if (ClusterIDMap.count(GV))
return (ClusterIDMap[GV] == I);
else
diff --git a/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
index 49dc15cf5e7c..ac0b519f4a77 100644
--- a/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -21,7 +21,6 @@
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -75,6 +74,3 @@ bool StripGCRelocates::runOnFunction(Function &F) {
INITIALIZE_PASS(StripGCRelocates, "strip-gc-relocates",
"Strip gc.relocates inserted through RewriteStatepointsForGC",
true, false)
-FunctionPass *llvm::createStripGCRelocatesPass() {
- return new StripGCRelocates();
-}
diff --git a/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
index cd0378e0140c..8956a089a99c 100644
--- a/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
@@ -9,7 +9,7 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index ed444e4cf43c..e633ac0c874d 100644
--- a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -19,7 +19,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
char UnifyFunctionExitNodes::ID = 0;
diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
index f6c7d1c4989e..afd842f59911 100644
--- a/contrib/llvm/lib/Transforms/Utils/Utils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
@@ -12,7 +12,10 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils.h"
#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Utils.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"
@@ -33,7 +36,6 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializePromoteLegacyPassPass(Registry);
initializeStripNonLineTableDebugInfoPass(Registry);
initializeUnifyFunctionExitNodesPass(Registry);
- initializeInstSimplifierPass(Registry);
initializeMetaRenamerPass(Registry);
initializeStripGCRelocatesPass(Registry);
initializePredicateInfoPrinterLegacyPassPass(Registry);
@@ -43,3 +45,12 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
initializeTransformUtils(*unwrap(R));
}
+
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSwitchPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
diff --git a/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp b/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp
index c3feea6a0a41..948d9bd5baad 100644
--- a/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -20,8 +20,14 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
return false;
+ uint64_t StoreSize = DL.getTypeSizeInBits(StoredVal->getType());
+
+ // The store size must be byte-aligned to support future type casts.
+ if (llvm::alignTo(StoreSize, 8) != StoreSize)
+ return false;
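+  // Editor's note (illustrative only): an i1 store has StoreSize == 1, which
+  // is not a whole number of bytes, so it is rejected by this check.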
+
// The store has to be at least as big as the load.
- if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy))
+ if (StoreSize < DL.getTypeSizeInBits(LoadTy))
return false;
// Don't coerce non-integral pointers to integers or vice versa.
@@ -389,8 +395,8 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
NewLoad->takeName(SrcVal);
NewLoad->setAlignment(SrcVal->getAlignment());
- DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+ LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
// Replace uses of the original load with the wider load. On a big endian
// system, we need to shift down to get the relevant bits.
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index dc83b6d4d292..5f3d127202ad 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -6,6 +6,38 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
+// This pass merges loads/stores to/from sequential memory addresses into vector
+// loads/stores. Although there's nothing GPU-specific in here, this pass is
+// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
+//
+// (For simplicity below we talk about loads only, but everything also applies
+// to stores.)
+//
+// This pass is intended to be run late in the pipeline, after other
+// vectorization opportunities have been exploited. So the assumption here is
+// that immediately following our new vector load we'll need to extract out the
+// individual elements of the load, so we can operate on them individually.
+//
+// On CPUs this transformation is usually not beneficial, because extracting the
+// elements of a vector register is expensive on most architectures. It's
+// usually better just to load each element individually into its own scalar
+// register.
+//
+// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
+// "vector load" loads directly into a series of scalar registers. In effect,
+// extracting the elements of the vector is free. It's therefore always
+// beneficial to vectorize a sequence of loads on these architectures.
+//
+// Vectorizing (perhaps a better name might be "coalescing") loads can have
+// large performance impacts on GPU kernels, and opportunities for vectorizing
+// are common in GPU code. This pass tries very hard to find such
+// opportunities; its runtime is quadratic in the number of loads in a BB.
+//
+// Some CPU architectures, such as ARM, have instructions that load into
+// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
+// could use this pass (with some modifications), but currently it implements
+// its own pass to do something similar to what we do here.
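+//
+// A rough sketch of the effect (editor's example, not from the pass itself):
+//   %a = load i32, i32* %p
+//   %b = load i32, i32* %q     ; %q = getelementptr i32, i32* %p, i64 1
+// becomes, roughly,
+//   %pv = bitcast i32* %p to <2 x i32>*
+//   %v  = load <2 x i32>, <2 x i32>* %pv
+//   %a  = extractelement <2 x i32> %v, i32 0
+//   %b  = extractelement <2 x i32> %v, i32 1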
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -21,6 +53,7 @@
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
@@ -45,7 +78,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
@@ -65,8 +97,16 @@ static const unsigned StackAdjustedAlignment = 4;
namespace {
+/// ChainID is an arbitrary token that is allowed to be different only for the
+/// accesses that are guaranteed to be considered non-consecutive by
+/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
+/// together and reducing the number of instructions the main search operates on
+/// at a time; this is purely to reduce compile time, since the main
+/// search has O(n^2) time complexity. The underlying type of ChainID should not
+/// be relied upon.
+using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<Value *, InstrList>;
+using InstrListMap = MapVector<ChainID, InstrList>;
class Vectorizer {
Function &F;
@@ -86,10 +126,6 @@ public:
bool run();
private:
- Value *getPointerOperand(Value *I) const;
-
- GetElementPtrInst *getSourceGEP(Value *Src) const;
-
unsigned getPointerAddressSpace(Value *I);
unsigned getAlignment(LoadInst *LI) const {
@@ -108,7 +144,15 @@ private:
return DL.getABITypeAlignment(SI->getValueOperand()->getType());
}
+ static const unsigned MaxDepth = 3;
+
bool isConsecutiveAccess(Value *A, Value *B);
+ bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth = 0) const;
+ bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth) const;
+ bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth) const;
/// After vectorization, reorder the instructions that I depends on
/// (the instructions defining its operands), to ensure they dominate I.
@@ -239,14 +283,6 @@ bool Vectorizer::run() {
return Changed;
}
-Value *Vectorizer::getPointerOperand(Value *I) const {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
if (LoadInst *L = dyn_cast<LoadInst>(I))
return L->getPointerAddressSpace();
@@ -255,23 +291,10 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
return -1;
}
-GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
- // First strip pointer bitcasts. Make sure pointee size is the same with
- // and without casts.
- // TODO: a stride set by the add instruction below can match the difference
- // in pointee type size here. Currently it will not be vectorized.
- Value *SrcPtr = getPointerOperand(Src);
- Value *SrcBase = SrcPtr->stripPointerCasts();
- if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
- DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
- SrcPtr = SrcBase;
- return dyn_cast<GetElementPtrInst>(SrcPtr);
-}
-
// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
- Value *PtrA = getPointerOperand(A);
- Value *PtrB = getPointerOperand(B);
+ Value *PtrA = getLoadStorePointerOperand(A);
+ Value *PtrB = getLoadStorePointerOperand(B);
unsigned ASA = getPointerAddressSpace(A);
unsigned ASB = getPointerAddressSpace(B);
@@ -280,18 +303,27 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
return false;
// Make sure that A and B are different pointers of the same size type.
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
Type *PtrATy = PtrA->getType()->getPointerElementType();
Type *PtrBTy = PtrB->getType()->getPointerElementType();
if (PtrA == PtrB ||
+ PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
DL.getTypeStoreSize(PtrATy->getScalarType()) !=
DL.getTypeStoreSize(PtrBTy->getScalarType()))
return false;
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
- APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(PtrBitWidth, 0);
+ APInt OffsetB(PtrBitWidth, 0);
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
@@ -300,11 +332,11 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
// Check if they are based on the same pointer. That makes the offsets
// sufficient.
if (PtrA == PtrB)
- return OffsetDelta == Size;
+ return OffsetDelta == PtrDelta;
// Compute the necessary base pointer delta to have the necessary final delta
- // equal to the size.
- APInt BaseDelta = Size - OffsetDelta;
+ // equal to the pointer delta requested.
+ APInt BaseDelta = PtrDelta - OffsetDelta;
// Compute the distance with SCEV between the base pointers.
const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
@@ -314,71 +346,127 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (X == PtrSCEVB)
return true;
+ // The above check will not catch the cases where one of the pointers is
+ // factorized but the other one is not, such as (C + (S * (A + B))) vs
+  // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
+ // and getting the simplified difference.
+ const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+ if (C == Dist)
+ return true;
+
// Sometimes even this doesn't work, because SCEV can't always see through
// patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
// things the hard way.
+ return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+}
+
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+ APInt PtrDelta,
+ unsigned Depth) const {
+ auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+ auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+ if (!GEPA || !GEPB)
+ return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
// Look through GEPs after checking they're the same except for the last
// index.
- GetElementPtrInst *GEPA = getSourceGEP(A);
- GetElementPtrInst *GEPB = getSourceGEP(B);
- if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+ if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+ GEPA->getPointerOperand() != GEPB->getPointerOperand())
return false;
- unsigned FinalIndex = GEPA->getNumOperands() - 1;
- for (unsigned i = 0; i < FinalIndex; i++)
- if (GEPA->getOperand(i) != GEPB->getOperand(i))
+ gep_type_iterator GTIA = gep_type_begin(GEPA);
+ gep_type_iterator GTIB = gep_type_begin(GEPB);
+ for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+ if (GTIA.getOperand() != GTIB.getOperand())
return false;
+ ++GTIA;
+ ++GTIB;
+ }
- Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
- Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+ Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+ Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
OpA->getType() != OpB->getType())
return false;
+ if (PtrDelta.isNegative()) {
+ if (PtrDelta.isMinSignedValue())
+ return false;
+ PtrDelta.negate();
+ std::swap(OpA, OpB);
+ }
+ uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ if (PtrDelta.urem(Stride) != 0)
+ return false;
+ unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+ APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
return false;
bool Signed = isa<SExtInst>(OpA);
- OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+ // At this point A could be a function parameter, i.e. not an instruction
+ Value *ValA = OpA->getOperand(0);
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
- if (!OpA || !OpB || OpA->getType() != OpB->getType())
+ if (!OpB || ValA->getType() != OpB->getType())
return false;
- // Now we need to prove that adding 1 to OpA won't overflow.
+ // Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
- // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
- // we're okay.
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+ // ValA, we're okay.
if (OpB->getOpcode() == Instruction::Add &&
isa<ConstantInt>(OpB->getOperand(1)) &&
- cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
if (Signed)
Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
else
Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
}
- unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+ unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
// Second attempt:
- // If any bits are known to be zero other than the sign bit in OpA, we can
- // add 1 to it while guaranteeing no overflow of any sort.
+  // If every bit that is set in IdxDiff, and every higher-order bit other than
+  // the sign bit, is known to be zero in ValA, we can add IdxDiff to ValA while
+  // guaranteeing no overflow of any sort.
if (!Safe) {
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!OpA)
+ return false;
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.countMaxTrailingOnes() < (BitWidth - 1))
- Safe = true;
+ APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
+ if (Signed)
+ BitsAllowedToBeSet.clearBit(BitWidth - 1);
+ if (BitsAllowedToBeSet.ult(IdxDiff))
+ return false;
}
- if (!Safe)
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+ const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+ return X == OffsetSCEVB;
+}
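+
+// Editor's sketch (illustrative only): lookThroughComplexAddresses lets the
+// pass treat the two addresses below, where %j = add nsw i32 %i, 1,
+//   getelementptr i32, i32* %base, i64 sext(%i)
+//   getelementptr i32, i32* %base, i64 sext(%j)
+// as being one i32 apart: IdxDiff is 1, and the nsw on the add (or known-zero
+// bits of %i) guarantees that adding IdxDiff cannot overflow the narrow type.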
+
+bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ if (Depth++ == MaxDepth)
return false;
- const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
- const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
- const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
- return X2 == OffsetSCEVB;
+ if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
+ if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
+ return SelectA->getCondition() == SelectB->getCondition() &&
+ areConsecutivePointers(SelectA->getTrueValue(),
+ SelectB->getTrueValue(), PtrDelta, Depth) &&
+ areConsecutivePointers(SelectA->getFalseValue(),
+ SelectB->getFalseValue(), PtrDelta, Depth);
+ }
+ }
+ return false;
}
void Vectorizer::reorder(Instruction *I) {
@@ -448,7 +536,7 @@ Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> Instrs;
for (Instruction *I : Chain) {
- Value *PtrOperand = getPointerOperand(I);
+ Value *PtrOperand = getLoadStorePointerOperand(I);
assert(PtrOperand && "Instruction must have a pointer operand.");
Instrs.push_back(I);
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
@@ -484,7 +572,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> ChainInstrs;
bool IsLoadChain = isa<LoadInst>(Chain[0]);
- DEBUG({
+ LLVM_DEBUG({
for (Instruction *I : Chain) {
if (IsLoadChain)
assert(isa<LoadInst>(I) &&
@@ -506,11 +594,12 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
Intrinsic::sideeffect) {
// Ignore llvm.sideeffect calls.
} else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
- DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
+ << '\n');
break;
} else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
- DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
- << '\n');
+ LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
break;
}
}
@@ -536,32 +625,40 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
break;
- if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
+ auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
+ auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
+ if (MemLoad && ChainLoad)
continue;
+      // We can ignore the alias if we have a load/store pair and the load
+ // is known to be invariant. The load cannot be clobbered by the store.
+ auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
+ return LI->getMetadata(LLVMContext::MD_invariant_load);
+ };
+
// We can ignore the alias as long as the load comes before the store,
// because that means we won't be moving the load past the store to
// vectorize it (the vectorized load is inserted at the location of the
// first load in the chain).
- if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
- OBB.dominates(ChainInstr, MemInstr))
+ if (isa<StoreInst>(MemInstr) && ChainLoad &&
+ (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
continue;
// Same case, but in reverse.
- if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
- OBB.dominates(MemInstr, ChainInstr))
+ if (MemLoad && isa<StoreInst>(ChainInstr) &&
+ (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
continue;
if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
MemoryLocation::get(ChainInstr))) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Found alias:\n"
" Aliasing instruction and pointer:\n"
<< " " << *MemInstr << '\n'
- << " " << *getPointerOperand(MemInstr) << '\n'
+ << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
<< " Aliased instruction and pointer:\n"
<< " " << *ChainInstr << '\n'
- << " " << *getPointerOperand(ChainInstr) << '\n';
+ << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
});
// Save this aliasing memory instruction as a barrier, but allow other
// instructions that precede the barrier to be vectorized with this one.
@@ -594,6 +691,20 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
return Chain.slice(0, ChainIdx);
}
+static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
+ const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+    // The selects themselves are distinct instructions even if they share the
+    // same condition and evaluate to consecutive pointers for the true and
+    // false values of the condition. Therefore, using the selects themselves
+    // for grouping instructions would put consecutive accesses into different
+    // lists; they would then never even be checked for being consecutive, and
+    // would not be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+}
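+
+// Editor's sketch (illustrative only): for two accesses through
+//   %p = select i1 %c, i32* %a,  i32* %b
+//   %q = select i1 %c, i32* %a1, i32* %b1
+// the underlying objects are the two distinct selects, but both map to the
+// chain ID %c, so isConsecutiveAccess (via lookThroughSelects) still gets a
+// chance to prove %p and %q consecutive when %a1/%b1 are %a/%b plus one
+// element.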
+
std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
InstrListMap LoadRefs;
@@ -632,8 +743,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
// No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2)
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
// Make sure all the users of a vector are constant-index extracts.
@@ -644,8 +759,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save the load locations.
- Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
- LoadRefs[ObjPtr].push_back(LI);
+ const ChainID ID = getChainID(Ptr, DL);
+ LoadRefs[ID].push_back(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
@@ -675,8 +790,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
// No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2)
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
@@ -686,8 +805,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save store location.
- Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
- StoreRefs[ObjPtr].push_back(SI);
+ const ChainID ID = getChainID(Ptr, DL);
+ StoreRefs[ID].push_back(SI);
}
}
@@ -697,12 +816,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
bool Changed = false;
- for (const std::pair<Value *, InstrList> &Chain : Map) {
+ for (const std::pair<ChainID, InstrList> &Chain : Map) {
unsigned Size = Chain.second.size();
if (Size < 2)
continue;
- DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+ LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
// Process the stores in chunks of 64.
for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
@@ -716,7 +835,8 @@ bool Vectorizer::vectorizeChains(InstrListMap &Map) {
}
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
- DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
+ << " instructions.\n");
SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
@@ -852,14 +972,14 @@ bool Vectorizer::vectorizeStoreChain(
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
return vectorizeStoreChain(Chain.slice(0, TargetVF),
InstructionsProcessed) |
vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Stores to vectorize:\n";
for (Instruction *I : Chain)
dbgs() << " " << *I << "\n";
@@ -1000,8 +1120,8 @@ bool Vectorizer::vectorizeLoadChain(
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
}
@@ -1024,7 +1144,7 @@ bool Vectorizer::vectorizeLoadChain(
Alignment = NewAlign;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";
for (Instruction *I : Chain)
I->dump();
@@ -1107,7 +1227,7 @@ bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
SzInBytes * 8, AddressSpace,
Alignment, &Fast);
- DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
- << " and fast? " << Fast << "\n";);
+ LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
return !Allows || !Fast;
}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
new file mode 100644
index 000000000000..697bc1b448d7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -0,0 +1,1072 @@
+//===- LoopVectorizationLegality.cpp --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides loop vectorization legality analysis. Original code
+// resided in LoopVectorize.cpp for a long time.
+//
+// At this point, it is implemented as a utility class, not as an analysis
+// pass. It should be easy to create an analysis pass around it if there
+// is a need (but D45420 needs to happen first).
+//
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+static cl::opt<bool>
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+namespace llvm {
+
+OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,
+ StringRef RemarkName,
+ Loop *TheLoop,
+ Instruction *I) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+    // If there is no debug location attached to the instruction, fall back to
+    // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
+
+bool LoopVectorizeHints::Hint::validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ case HK_ISVECTORIZED:
+ return (Val == 0 || Val == 1);
+ }
+ return false;
+}
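As a side note, the power-of-two constraint used by validate() is the usual single-bit test; a tiny standalone check (not from the patch) makes the accepted values concrete:

#include <cassert>
#include <cstdint>

// The usual single-bit test behind a power-of-two check: a power of two has
// exactly one bit set, so clearing the lowest set bit yields zero.
static bool isPow2(uint32_t V) { return V != 0 && (V & (V - 1)) == 0; }

int main() {
  assert(isPow2(1) && isPow2(4) && isPow2(8));
  assert(!isPow2(0) && !isPow2(6));
  // A hint such as vectorize.width=6 would therefore fail validate().
  return 0;
}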
+
+LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+ OptimizationRemarkEmitter &ORE)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
+ Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
+ IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
+ LLVM_DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+}
+
+bool LoopVectorizeHints::allowVectorization(Function *F, Loop *L,
+ bool AlwaysVectorize) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (getIsVectorized() == 1) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass and errors
+ // to differentiate between disabled vectorization and a width of 1.
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+void LoopVectorizeHints::emitRemarkWithHints() const {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
+ }
+ });
+}
+
+const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
+ if (getWidth() == 1)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ return LV_NAME;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
+}
+
+void LoopVectorizeHints::getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+    // The expected hint is either an MDString or an MDNode whose first
+    // operand is an MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+}
+
+void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C)
+ return;
+ unsigned Val = C->getZExtValue();
+
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+}
+
+MDNode *LoopVectorizeHints::createHintMetadata(StringRef Name,
+ unsigned V) const {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+bool LoopVectorizeHints::matchesHintMetadataName(MDNode *Node,
+ ArrayRef<Hint> HintTypes) {
+ MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
+ if (!Name)
+ return false;
+
+ for (auto H : HintTypes)
+ if (Name->getString().endswith(H.Name))
+ return true;
+ return false;
+}
+
+void LoopVectorizeHints::writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+ if (HintTypes.empty())
+ return;
+
+ // Reserve the first element to LoopID (see below).
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, then ignore the existing operands.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If node in update list, ignore old value.
+ if (!matchesHintMetadataName(Node, HintTypes))
+ MDs.push_back(Node);
+ }
+ }
+
+ // Now, add the missing hints.
+ for (auto H : HintTypes)
+ MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ TheLoop->setLoopID(NewLoopID);
+}
+
+bool LoopVectorizationRequirements::doesNotMeet(
+ Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *PassName = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+}
+
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+// NOTE: We decided to keep all these checks and their associated documentation
+// together so that we can easily have a picture of the current supported loop
+// nests. However, some of the current checks don't depend on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
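A source-level picture of a nest these checks accept may help; the example below is hypothetical and only restates the three conditions above:

// Hypothetical nest accepted by the checks above: j is a canonical IV
// (starts at 0, steps by 1), the latch terminator is a conditional branch,
// and its compare uses the IV update and M, which is invariant in the outer
// loop.
void zeroRows(float *A, int N, int M) {
  for (int i = 0; i < N; ++i)     // candidate outer loop
    for (int j = 0; j < M; ++j)   // inner loop, uniform w.r.t. the outer one
      A[i * M + j] = 0.0f;
}
// By contrast, an inner bound that depends on i (e.g. "j < i") makes the latch
// condition non-invariant in the outer loop, so the nest would be rejected.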
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+/// Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to
+/// if-convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (PHINode &Phi : BB->phis()) {
+ for (Value *V : Phi.incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+  // It is possible that chars or shorts overflow when we ask for the loop's
+  // trip count; work around this by changing the type size.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
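A small standalone sketch of the promotion rule implemented by convertPointerToIntegerType (illustrative only; no LLVM types involved):

#include <cstdio>

// Mimics the rule above: types narrower than 32 bits are widened to 32 bits
// so that, e.g., a 16-bit counter cannot overflow while the trip count is
// computed; wider types are left unchanged.
static unsigned promotedBits(unsigned Bits) { return Bits < 32 ? 32 : Bits; }

int main() {
  std::printf("i8 -> i%u, i16 -> i%u, i64 -> i%u\n",
              promotedBits(8), promotedBits(16), promotedBits(64));
  return 0;
}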
+
+/// Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ // Reduction and Induction instructions are allowed to have exit users. All
+ // other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the loop are inside the BB.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
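For reference, a hypothetical C++ example of the situation hasOutsideLoopUser() detects:

// 's' is defined inside the loop and read after it, so it has an outside
// user; unless it is a recognized induction or reduction exit value, the
// check above reports it and the loop is not vectorized.
float lastElement(const float *a, int n) {
  float s = 0.0f;
  for (int i = 0; i < n; ++i)
    s = a[i];   // defined in the loop
  return s;     // used outside the loop
}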
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ const ValueToValueMap &Strides =
+ getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
+ return 0;
+}
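Concretely, the three possible return values correspond to access patterns like the following (illustrative sketch, assuming the arrays are large enough):

// Illustrative access patterns and the classification isConsecutivePtr()
// would give their pointer operands (1, -1, or 0).
void strideExamples(float *a, float *b, int n) {
  for (int i = 0; i < n; ++i) b[i] = a[i];          // stride +1 -> returns  1
  for (int i = 0; i < n; ++i) b[i] = a[n - 1 - i];  // stride -1 -> returns -1
  for (int i = 0; i < n; ++i) b[i] = a[2 * i];      // stride  2 -> returns  0
}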
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ if (Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported conditional branch.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+  // simple outer-loop scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not vectorizing: Outer loop contains divergent loops.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+  // here for ignoring, but it suffices to record only the first (as it is the
+  // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for hazards.
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "loop control flow is not understood by vectorizer");
+ LLVM_DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
+ continue;
+ ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
+ << "value could not be identified as "
+ "an induction or reduction variable");
+ return false;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "control flow not understood by vectorizer");
+ LLVM_DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
+ DT)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+        // As a last resort, coerce the PHI to an AddRec expression
+        // and retry classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
+ << "value that could not be identified as "
+ "reduction is used outside the loop");
+ LLVM_DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+ ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
+ << "call instruction cannot be vectorized");
+ LLVM_DEBUG(
+ dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
+ return false;
+ }
+
+      // Intrinsics such as powi, cttz, and ctlz are legal to vectorize if the
+      // second argument is the same (i.e., loop invariant).
+ if (CI && hasVectorInstrinsicScalarOpd(
+ getVectorIntrinsicIDForCall(CI, TLI), 1)) {
+ auto *SE = PSE.getSE();
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
+ << "intrinsic instruction cannot be vectorized");
+ LLVM_DEBUG(dbgs()
+ << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+ return false;
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
+ << "instruction return type cannot be vectorized");
+ LLVM_DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
+ << "store instruction cannot be vectorized");
+ return false;
+ }
+
+ // FP instructions can allow unsafe algebra, thus vectorizable by
+ // non-IEEE-754 compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.isFast()) {
+ LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
+ << "value cannot be used outside the loop");
+ return false;
+ }
+ } // next instr.
+ }
+
+ if (!PrimaryInduction) {
+ LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ if (Inductions.empty()) {
+ ORE->emit(createMissedAnalysis("NoInductionVariable")
+ << "loop induction variable could not be identified");
+ return false;
+ }
+ }
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
+
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &(*GetLAA)(*TheLoop);
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
+ }
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasStoreToLoopInvariantAddress()) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
+ << "write to a loop invariant address could not be vectorized");
+ LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+
+ return true;
+}
+
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+ Value *In0 = const_cast<Value *>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(
+ BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
+ for (Instruction &I : *BB) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
+ if (C->canTrap())
+ return false;
+ }
+ // We might be able to hoist the load.
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
+ // !llvm.mem.parallel_loop_access implies if-conversion safety.
+ // Otherwise, record that the load needs (real or emulated) masking
+ // and let the cost model decide.
+ if (!IsAnnotatedParallel)
+ MaskedOp.insert(LI);
+ continue;
+ }
+ }
+
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ return false;
+ // Predicated store requires some form of masking:
+ // 1) masked store HW instruction,
+      //    2) emulation via load-blend-store (only if safe and legal to do so;
+      //       be aware of the race conditions), or
+ // 3) element-by-element predicate check and scalar store.
+ MaskedOp.insert(SI);
+ continue;
+ }
+ if (I.mayThrow())
+ return false;
+ }
+
+ return true;
+}
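A source-level example of the predicated-store case recorded above (illustrative only; whether it is profitable to vectorize is left to the cost model):

// The store to a[i] is control-dependent on cond[i], so after if-conversion
// it needs one of the masking strategies listed above (masked store,
// load-blend-store, or an element-by-element predicate check).
void conditionalStore(float *a, const float *b, const int *cond, int n) {
  for (int i = 0; i < n; ++i)
    if (cond[i])
      a[i] = b[i];
}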
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ ORE->emit(createMissedAnalysis("IfConversionDisabled")
+ << "if-conversion is disabled");
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+ // A list of pointers that we can safely read and write to.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+ // Collect safe addresses.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (blockNeedsPredication(BB))
+ continue;
+
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getLoadStorePointerOperand(&I))
+        SafePointers.insert(Ptr);
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
+ << "loop contains a switch statement");
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
+      if (!blockCanBePredicated(BB, SafePointers)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
+
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
+ bool UseVPlanNativePath) {
+ assert((UseVPlanNativePath || Lp->empty()) &&
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!Lp->getLoopPreheader()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (Lp->getNumBackEdges() != 1) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single exiting block.
+ if (!Lp->getExitingBlock()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We only handle bottom-tested loops, i.e. loop in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
+ Loop *Lp, bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ // Check whether the loop-related control flow in the loop nest is expected by
+ // vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We need to have a loop header.
+ LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
+
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
+ if (!TheLoop->empty()) {
+ assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n");
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
+ assert(TheLoop->empty() && "Inner loop expected.");
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
+ << "Too many SCEV assumptions need to be made and checked "
+ << "at runtime");
+ LLVM_DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return Result;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
new file mode 100644
index 000000000000..2aa219064299
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -0,0 +1,282 @@
+//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides the LoopVectorizationPlanner class.
+/// InnerLoopVectorizer vectorizes loops which contain only one basic block.
+/// LoopVectorizationPlanner drives the vectorization process after having
+/// passed the Legality checks.
+/// The planner builds and optimizes the Vectorization Plans, which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaving of access groups.
+///
+/// Also provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+
+#include "VPlan.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+/// VPlan-based builder utility analogous to IRBuilder.
+class VPBuilder {
+private:
+ VPBasicBlock *BB = nullptr;
+ VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ ArrayRef<VPValue *> Operands) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ if (BB)
+ BB->insert(Instr, InsertPt);
+ return Instr;
+ }
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ }
+
+public:
+ VPBuilder() {}
+
+ /// Clear the insertion point: created instructions will not be inserted into
+ /// a block.
+ void clearInsertionPoint() {
+ BB = nullptr;
+ InsertPt = VPBasicBlock::iterator();
+ }
+
+ VPBasicBlock *getInsertBlock() const { return BB; }
+ VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+
+ /// InsertPoint - A saved insertion point.
+ class VPInsertPoint {
+ VPBasicBlock *Block = nullptr;
+ VPBasicBlock::iterator Point;
+
+ public:
+ /// Creates a new insertion point which doesn't point to anything.
+ VPInsertPoint() = default;
+
+ /// Creates a new insertion point at the given location.
+ VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
+ : Block(InsertBlock), Point(InsertPoint) {}
+
+ /// Returns true if this insert point is set.
+ bool isSet() const { return Block != nullptr; }
+
+ VPBasicBlock *getBlock() const { return Block; }
+ VPBasicBlock::iterator getPoint() const { return Point; }
+ };
+
+ /// Sets the current insert point to a previously-saved location.
+ void restoreIP(VPInsertPoint IP) {
+ if (IP.isSet())
+ setInsertPoint(IP.getBlock(), IP.getPoint());
+ else
+ clearInsertionPoint();
+ }
+
+ /// This specifies that created VPInstructions should be appended to the end
+ /// of the specified block.
+ void setInsertPoint(VPBasicBlock *TheBB) {
+ assert(TheBB && "Attempting to set a null insert point");
+ BB = TheBB;
+ InsertPt = BB->end();
+ }
+
+ /// This specifies that created instructions should be inserted at the
+ /// specified point.
+ void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
+ BB = TheBB;
+ InsertPt = IP;
+ }
+
+ /// Insert and return the specified instruction.
+ VPInstruction *insert(VPInstruction *I) const {
+ BB->insert(I, InsertPt);
+ return I;
+ }
+
+ /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
+ /// its underlying Instruction.
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ NewVPInst->setUnderlyingValue(Inst);
+ return NewVPInst;
+ }
+ VPValue *createNaryOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ }
+
+ VPValue *createNot(VPValue *Operand) {
+ return createInstruction(VPInstruction::Not, {Operand});
+ }
+
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ }
+
+ VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ }
+
+ //===--------------------------------------------------------------------===//
+ // RAII helpers.
+ //===--------------------------------------------------------------------===//
+
+ /// RAII object that stores the current insertion point and restores it when
+ /// the object is destroyed.
+ class InsertPointGuard {
+ VPBuilder &Builder;
+ VPBasicBlock *Block;
+ VPBasicBlock::iterator Point;
+
+ public:
+ InsertPointGuard(VPBuilder &B)
+ : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
+
+ InsertPointGuard(const InsertPointGuard &) = delete;
+ InsertPointGuard &operator=(const InsertPointGuard &) = delete;
+
+ ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
+ };
+};
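A minimal usage sketch of the builder as declared above; the block and operand values are assumed to already exist, and the surrounding function is hypothetical:

// Given an existing VPBasicBlock *VPBB and two mask values, append
// "not Cond" and "BlockMask & (not Cond)" at the end of VPBB.
void appendEdgeMask(VPBasicBlock *VPBB, VPValue *Cond, VPValue *BlockMask) {
  VPBuilder Builder;
  Builder.setInsertPoint(VPBB);                       // append at block end
  VPValue *NotCond = Builder.createNot(Cond);         // VPInstruction::Not
  VPValue *EdgeMask = Builder.createAnd(BlockMask, NotCond);
  (void)EdgeMask;                                     // would feed a recipe
}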
+
+/// TODO: The following VectorizationFactor was pulled out of
+/// LoopVectorizationCostModel class. LV also deals with
+/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// We need to streamline them.
+
+/// Information about vectorization costs
+struct VectorizationFactor {
+ // Vector width with best cost
+ unsigned Width;
+ // Cost of the loop with that width
+ unsigned Cost;
+};
+
+/// Planner drives the vectorization process after having passed
+/// Legality checks.
+class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ using VPlanPtr = std::unique_ptr<VPlan>;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
+ unsigned BestVF = 0;
+ unsigned BestUF = 0;
+
+public:
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
+
+ /// Plan how to best vectorize, return the best VF and its cost.
+ VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
+ VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+
+ /// Finalize the best decision and dispose of all other VPlans.
+ void setBestPlan(unsigned VF, unsigned UF);
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
+
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ static bool
+ getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ VFRange &Range);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
+ void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range);
+
+  /// Build a VPlan using VPRecipes according to the information gathered by
+ /// Legal. This method is only used for the legacy inner loop vectorizer.
+ VPlanPtr
+ buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+ void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
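Judging only from the interface declared above, the intended call sequence is roughly the following sketch; the interleave count and the fallback paths of the real driver are omitted:

// Plan, pick the best factor, then generate code for it.
void driveVectorization(LoopVectorizationPlanner &LVP, InnerLoopVectorizer &LB,
                        DominatorTree *DT, bool OptForSize, unsigned UserVF) {
  VectorizationFactor VF = LVP.plan(OptForSize, UserVF); // best width + cost
  unsigned IC = 1;                  // interleave count, chosen elsewhere
  LVP.setBestPlan(VF.Width, IC);    // keep only the winning VPlan
  LVP.executePlan(LB, DT);          // emit the vectorized loop
}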
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5bcf0c0a7ba6..3c693f5d5ee0 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
@@ -47,8 +55,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "VPlan.h"
-#include "VPlanBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
+#include "VPlanHCFGBuilder.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -57,11 +66,9 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -70,6 +77,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -124,6 +132,7 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -145,10 +154,6 @@ using namespace llvm;
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
-static cl::opt<bool>
- EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
- cl::desc("Enable if-conversion during vectorization."));
-
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
@@ -184,9 +189,6 @@ static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
-/// Maximum vectorization interleave count.
-static const unsigned MaxInterleaveFactor = 16;
-
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
@@ -209,7 +211,7 @@ static cl::opt<unsigned> SmallLoopCost(
"The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
- "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+ "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
@@ -238,71 +240,21 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
-
-static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
- "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed."));
-
-static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
- "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed with a "
- "vectorize(enable) pragma"));
-
-/// Create an analysis remark that explains why vectorization failed
-///
-/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
-/// RemarkName is the identifier for the remark. If \p I is passed it is an
-/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis
-createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
- Instruction *I = nullptr) {
- Value *CodeRegion = TheLoop->getHeader();
- DebugLoc DL = TheLoop->getStartLoc();
-
- if (I) {
- CodeRegion = I->getParent();
- // If there is no debug location attached to the instruction, revert back to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- }
-
- OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
- R << "loop not vectorized: ";
- return R;
-}
-
-namespace {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class LoopVectorizationRequirements;
-
-} // end anonymous namespace
-
-/// Returns true if the given loop body has a cycle, excluding the loop
-/// itself.
-static bool hasCyclesInLoopBody(const Loop &L) {
- if (!L.empty())
- return true;
-
- for (const auto &SCC :
- make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
- scc_iterator<Loop, LoopBodyTraits>::end(L))) {
- if (SCC.size() > 1) {
- DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
- DEBUG(L.dump());
- return true;
- }
- }
- return false;
-}
+static cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
+// This flag enables the stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
+// verification of the H-CFGs built.
+static cl::opt<bool> VPlanBuildStressTest(
+ "vplan-build-stress-test", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Build VPlan for every supported loop nest in the function and bail "
+ "out right after the build (stress test the VPlan H-CFG construction "
+ "in the VPlan-native vectorization path)."));
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
@@ -317,16 +269,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
// in the project. They can be effectively organized in a common Load/Store
// utilities unit.
-/// A helper function that returns the pointer operand of a load or store
-/// instruction.
-static Value *getPointerOperand(Value *I) {
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (auto *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -373,7 +315,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for for every X iterations of the loop header.
+/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
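The helper documented by this comment falls outside the hunk; under the fixed 50% assumption described above it degenerates to a constant. A minimal sketch of that idea:

    // Sketch only: reciprocal of the assumed block probability. With a 50%
    // chance of execution, a predicated block runs once every 2 iterations.
    static unsigned getReciprocalPredBlockProb() { return 2; }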
@@ -502,7 +444,7 @@ public:
void vectorizeMemoryInstruction(Instruction *Instr,
VectorParts *BlockInMask = nullptr);
- /// \brief Set the debug location in the builder using the debug location in
+ /// Set the debug location in the builder using the debug location in
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
@@ -538,7 +480,7 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes with some
+ /// The Loop exit block may have single value PHI nodes with some
/// incoming value. While vectorizing we only handled real values
/// that were defined inside the loop and we should have one value for
/// each predecessor of its parent basic block. See PR14725.
@@ -573,9 +515,9 @@ protected:
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
- /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
- /// can be a truncate instruction).
- void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+ /// Note that \p EntryVal doesn't have to be an induction variable - it
+ /// can also be a truncate instruction.
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
const InductionDescriptor &ID);
/// Create a vector induction phi node based on an existing scalar one. \p
@@ -602,10 +544,20 @@ protected:
/// vector loop for both the Phi and the cast.
/// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
/// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID,
- Value *VectorLoopValue,
- unsigned Part,
- unsigned Lane = UINT_MAX);
+ ///
+ /// \p EntryVal is the value from the original loop that maps to the vector
+ /// phi node and is used to distinguish which IV is currently being
+ /// processed: the original one (if \p EntryVal is a phi corresponding to the
+ /// original IV) or the "newly-created" one based on the proof mentioned above
+ /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
+ /// latter case \p EntryVal is a TruncInst and we must not record anything for
+ /// that IV, but it's error-prone to expect callers of this routine to care
+ /// about that, hence this explicit parameter.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ const Instruction *EntryVal,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -646,7 +598,7 @@ protected:
/// loop.
void addMetadata(Instruction *To, Instruction *From);
- /// \brief Similar to the previous function but it adds the metadata to a
+ /// Similar to the previous function but it adds the metadata to a
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
@@ -679,7 +631,7 @@ protected:
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
+ /// LoopVersioning. It's only set up (non-null) if memchecks were
/// used.
///
/// This is currently only used to add no-alias metadata based on the
@@ -777,7 +729,7 @@ private:
} // end namespace llvm
-/// \brief Look for a meaningful debug location on the instruction or it's
+/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
@@ -849,7 +801,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
namespace llvm {
-/// \brief The group of interleaved loads/stores sharing the same stride and
+/// The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
@@ -893,7 +845,7 @@ public:
unsigned getAlignment() const { return Align; }
unsigned getNumMembers() const { return Members.size(); }
- /// \brief Try to insert a new member \p Instr with index \p Index and
+ /// Try to insert a new member \p Instr with index \p Index and
/// alignment \p NewAlign. The index is related to the leader and it could be
/// negative if it is the new leader.
///
@@ -927,7 +879,7 @@ public:
return true;
}
- /// \brief Get the member with the given index \p Index
+ /// Get the member with the given index \p Index
///
/// \returns nullptr if contains no such member.
Instruction *getMember(unsigned Index) const {
@@ -938,7 +890,7 @@ public:
return Members.find(Key)->second;
}
- /// \brief Get the index for the given member. Unlike the key in the member
+ /// Get the index for the given member. Unlike the key in the member
/// map, the index starts from 0.
unsigned getIndex(Instruction *Instr) const {
for (auto I : Members)
@@ -989,7 +941,7 @@ private:
namespace {
-/// \brief Drive the analysis of interleaved memory accesses in the loop.
+/// Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
@@ -1000,11 +952,12 @@ namespace {
class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
- DominatorTree *DT, LoopInfo *LI)
- : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
+ DominatorTree *DT, LoopInfo *LI,
+ const LoopAccessInfo *LAI)
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
~InterleavedAccessInfo() {
- SmallSet<InterleaveGroup *, 4> DelSet;
+ SmallPtrSet<InterleaveGroup *, 4> DelSet;
// Avoid releasing a pointer twice.
for (auto &I : InterleaveGroupMap)
DelSet.insert(I.second);
@@ -1012,16 +965,16 @@ public:
delete Ptr;
}
- /// \brief Analyze the interleaved accesses and collect them in interleave
+ /// Analyze the interleaved accesses and collect them in interleave
/// groups. Substitute symbolic strides using \p Strides.
- void analyzeInterleaving(const ValueToValueMap &Strides);
+ void analyzeInterleaving();
- /// \brief Check if \p Instr belongs to any interleave group.
+ /// Check if \p Instr belongs to any interleave group.
bool isInterleaved(Instruction *Instr) const {
return InterleaveGroupMap.count(Instr);
}
- /// \brief Get the interleave group that \p Instr belongs to.
+ /// Get the interleave group that \p Instr belongs to.
///
/// \returns nullptr if doesn't have such group.
InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
@@ -1030,13 +983,10 @@ public:
return nullptr;
}
- /// \brief Returns true if an interleaved group that may access memory
+ /// Returns true if an interleaved group that may access memory
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
- /// \brief Initialize the LoopAccessInfo used for dependence checking.
- void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
-
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -1047,7 +997,7 @@ private:
Loop *TheLoop;
DominatorTree *DT;
LoopInfo *LI;
- const LoopAccessInfo *LAI = nullptr;
+ const LoopAccessInfo *LAI;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -1061,7 +1011,7 @@ private:
/// access to a set of dependent sink accesses.
DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
- /// \brief The descriptor for a strided memory access.
+ /// The descriptor for a strided memory access.
struct StrideDescriptor {
StrideDescriptor() = default;
StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
@@ -1081,10 +1031,10 @@ private:
unsigned Align = 0;
};
- /// \brief A type for holding instructions and their stride descriptors.
+ /// A type for holding instructions and their stride descriptors.
using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
- /// \brief Create a new interleave group with the given instruction \p Instr,
+ /// Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
/// \returns the newly created interleave group.
@@ -1096,7 +1046,7 @@ private:
return InterleaveGroupMap[Instr];
}
- /// \brief Release the group and remove all the relationships.
+ /// Release the group and remove all the relationships.
void releaseGroup(InterleaveGroup *Group) {
for (unsigned i = 0; i < Group->getFactor(); i++)
if (Instruction *Member = Group->getMember(i))
@@ -1105,28 +1055,28 @@ private:
delete Group;
}
- /// \brief Collect all the accesses with a constant stride in program order.
+ /// Collect all the accesses with a constant stride in program order.
void collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides);
- /// \brief Returns true if \p Stride is allowed in an interleaved group.
+ /// Returns true if \p Stride is allowed in an interleaved group.
static bool isStrided(int Stride) {
unsigned Factor = std::abs(Stride);
return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}
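As a worked example (assuming the default cap MaxInterleaveGroupFactor is at least 3), the classic interleaved R,G,B access pattern below has stride 3 for every member, so isStrided(3) holds and the three loads are candidates for a single interleave group of factor 3:

    // Hypothetical input loop: Pix holds interleaved R,G,B components.
    for (int i = 0; i < n; ++i)
      Sum += Pix[3 * i] + Pix[3 * i + 1] + Pix[3 * i + 2];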
- /// \brief Returns true if \p BB is a predicated block.
+ /// Returns true if \p BB is a predicated block.
bool isPredicated(BasicBlock *BB) const {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
- /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+ /// Returns true if LoopAccessInfo can be used for dependence queries.
bool areDependencesValid() const {
return LAI && LAI->getDepChecker().getDependences();
}
- /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
+ /// Returns true if memory accesses \p A and \p B can be reordered, if
/// necessary, when constructing interleaved groups.
///
/// \p A must precede \p B in program order. We return false if reordering is
@@ -1174,7 +1124,7 @@ private:
return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
}
- /// \brief Collect the dependences from LoopAccessInfo.
+ /// Collect the dependences from LoopAccessInfo.
///
/// We process the dependences once during the interleaved access analysis to
/// enable constant-time dependence queries.
@@ -1187,315 +1137,6 @@ private:
}
};
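With this change the LoopAccessInfo is supplied at construction time (the setLAI() hook is gone) and analyzeInterleaving() no longer takes the stride map. A minimal usage sketch, assuming PSE, L, DT, LI and LAI are already provided by the surrounding analyses and MemInstr is some memory instruction in the loop:

    // Sketch only: run the interleaving analysis and query one instruction.
    InterleavedAccessInfo IAI(PSE, L, DT, LI, LAI);
    IAI.analyzeInterleaving();
    if (IAI.isInterleaved(MemInstr)) {
      const InterleaveGroup *IG = IAI.getInterleaveGroup(MemInstr);
      unsigned Factor = IG->getFactor(); // e.g. 3 for an R,G,B pattern
      (void)Factor;
    }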
-/// Utility class for getting and setting loop vectorizer hints in the form
-/// of loop metadata.
-/// This class keeps a number of loop annotations locally (as member variables)
-/// and can, upon request, write them back as metadata on the loop. It will
-/// initially scan the loop for existing metadata, and will update the local
-/// values based on information in the loop.
-/// We cannot write all values to metadata, as the mere presence of some info,
-/// for example 'force', means a decision has been made. So, we need to be
-/// careful NOT to add them if the user hasn't specifically asked so.
-class LoopVectorizeHints {
- enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
-
- /// Hint - associates name and validation with the hint value.
- struct Hint {
- const char *Name;
- unsigned Value; // This may have to change for non-numeric values.
- HintKind Kind;
-
- Hint(const char *Name, unsigned Value, HintKind Kind)
- : Name(Name), Value(Value), Kind(Kind) {}
-
- bool validate(unsigned Val) {
- switch (Kind) {
- case HK_WIDTH:
- return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
- case HK_UNROLL:
- return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
- case HK_FORCE:
- return (Val <= 1);
- case HK_ISVECTORIZED:
- return (Val==0 || Val==1);
- }
- return false;
- }
- };
-
- /// Vectorization width.
- Hint Width;
-
- /// Vectorization interleave factor.
- Hint Interleave;
-
- /// Vectorization forced
- Hint Force;
-
- /// Already Vectorized
- Hint IsVectorized;
-
- /// Return the loop metadata prefix.
- static StringRef Prefix() { return "llvm.loop."; }
-
- /// True if there is any unsafe math in the loop.
- bool PotentiallyUnsafe = false;
-
-public:
- enum ForceKind {
- FK_Undefined = -1, ///< Not selected.
- FK_Disabled = 0, ///< Forcing disabled.
- FK_Enabled = 1, ///< Forcing enabled.
- };
-
- LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
- OptimizationRemarkEmitter &ORE)
- : Width("vectorize.width", VectorizerParams::VectorizationFactor,
- HK_WIDTH),
- Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
- Force("vectorize.enable", FK_Undefined, HK_FORCE),
- IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
- // Populate values with existing loop metadata.
- getHintsFromMetadata();
-
- // force-vector-interleave overrides DisableInterleaving.
- if (VectorizerParams::isInterleaveForced())
- Interleave.Value = VectorizerParams::VectorizationInterleave;
-
- if (IsVectorized.Value != 1)
- // If the vectorization width and interleaving count are both 1 then
- // consider the loop to have been already vectorized because there's
- // nothing more that we can do.
- IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
- DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
- << "LV: Interleaving disabled by the pass manager\n");
- }
-
- /// Mark the loop L as already vectorized by setting the width to 1.
- void setAlreadyVectorized() {
- IsVectorized.Value = 1;
- Hint Hints[] = {IsVectorized};
- writeHintsToMetadata(Hints);
- }
-
- bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
- if (getForce() == LoopVectorizeHints::FK_Disabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (getIsVectorized() == 1) {
- DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
- // FIXME: Add interleave.disable metadata. This will allow
- // vectorize.disable to be used without disabling the pass and errors
- // to differentiate between disabled vectorization and a width of 1.
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
- "AllDisabled", L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: vectorization and interleaving are "
- "explicitly disabled, or the loop has already been "
- "vectorized";
- });
- return false;
- }
-
- return true;
- }
-
- /// Dumps all the hint information.
- void emitRemarkWithHints() const {
- using namespace ore;
-
- ORE.emit([&]() {
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "loop not vectorized: vectorization is explicitly disabled";
- else {
- OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
- TheLoop->getStartLoc(),
- TheLoop->getHeader());
- R << "loop not vectorized";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=" << NV("Force", true);
- if (Width.Value != 0)
- R << ", Vector Width=" << NV("VectorWidth", Width.Value);
- if (Interleave.Value != 0)
- R << ", Interleave Count="
- << NV("InterleaveCount", Interleave.Value);
- R << ")";
- }
- return R;
- }
- });
- }
-
- unsigned getWidth() const { return Width.Value; }
- unsigned getInterleave() const { return Interleave.Value; }
- unsigned getIsVectorized() const { return IsVectorized.Value; }
- enum ForceKind getForce() const { return (ForceKind)Force.Value; }
-
- /// \brief If hints are provided that force vectorization, use the AlwaysPrint
- /// pass name to force the frontend to print the diagnostic.
- const char *vectorizeAnalysisPassName() const {
- if (getWidth() == 1)
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Disabled)
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
- return LV_NAME;
- return OptimizationRemarkAnalysis::AlwaysPrint;
- }
-
- bool allowReordering() const {
- // When enabling loop hints are provided we allow the vectorizer to change
- // the order of operations that is given by the scalar loop. This is not
- // enabled by default because can be unsafe or inefficient. For example,
- // reordering floating-point operations will change the way round-off
- // error accumulates in the loop.
- return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
- }
-
- bool isPotentiallyUnsafe() const {
- // Avoid FP vectorization if the target is unsure about proper support.
- // This may be related to the SIMD unit in the target not handling
- // IEEE 754 FP ops properly, or bad single-to-double promotions.
- // Otherwise, a sequence of vectorized loops, even without reduction,
- // could lead to different end results on the destination vectors.
- return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
- }
-
- void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
-
-private:
- /// Find hints specified in the loop metadata and update local values.
- void getHintsFromMetadata() {
- MDNode *LoopID = TheLoop->getLoopID();
- if (!LoopID)
- return;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- const MDString *S = nullptr;
- SmallVector<Metadata *, 4> Args;
-
- // The expected hint is either a MDString or a MDNode with the first
- // operand a MDString.
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
- if (!MD || MD->getNumOperands() == 0)
- continue;
- S = dyn_cast<MDString>(MD->getOperand(0));
- for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
- Args.push_back(MD->getOperand(i));
- } else {
- S = dyn_cast<MDString>(LoopID->getOperand(i));
- assert(Args.size() == 0 && "too many arguments for MDString");
- }
-
- if (!S)
- continue;
-
- // Check if the hint starts with the loop metadata prefix.
- StringRef Name = S->getString();
- if (Args.size() == 1)
- setHint(Name, Args[0]);
- }
- }
-
- /// Checks string hint with one operand and set value if valid.
- void setHint(StringRef Name, Metadata *Arg) {
- if (!Name.startswith(Prefix()))
- return;
- Name = Name.substr(Prefix().size(), StringRef::npos);
-
- const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
- if (!C)
- return;
- unsigned Val = C->getZExtValue();
-
- Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
- for (auto H : Hints) {
- if (Name == H->Name) {
- if (H->validate(Val))
- H->Value = Val;
- else
- DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
- break;
- }
- }
- }
-
- /// Create a new hint from name / value pair.
- MDNode *createHintMetadata(StringRef Name, unsigned V) const {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {MDString::get(Context, Name),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
- }
-
- /// Matches metadata with hint name.
- bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
- MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
- if (!Name)
- return false;
-
- for (auto H : HintTypes)
- if (Name->getString().endswith(H.Name))
- return true;
- return false;
- }
-
- /// Sets current hints into loop metadata, keeping other values intact.
- void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
- if (HintTypes.empty())
- return;
-
- // Reserve the first element to LoopID (see below).
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, then ignore the existing operands.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- // If node in update list, ignore old value.
- if (!matchesHintMetadataName(Node, HintTypes))
- MDs.push_back(Node);
- }
- }
-
- // Now, add the missing hints.
- for (auto H : HintTypes)
- MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
-
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
-
- TheLoop->setLoopID(NewLoopID);
- }
-
- /// The loop these hints belong to.
- const Loop *TheLoop;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter &ORE;
-};
-
} // end anonymous namespace
static void emitMissedWarning(Function *F, Loop *L,
@@ -1519,333 +1160,7 @@ static void emitMissedWarning(Function *F, Loop *L,
}
}
-namespace {
-
-/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
-/// to what vectorization factor.
-/// This class does not look at the profitability of vectorization, only the
-/// legality. This class has two main kinds of checks:
-/// * Memory checks - The code in canVectorizeMemory checks if vectorization
-/// will change the order of memory accesses in a way that will change the
-/// correctness of the program.
-/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
-/// checks for a number of different conditions, such as the availability of a
-/// single induction variable, that all types are supported and vectorize-able,
-/// etc. This code reflects the capabilities of InnerLoopVectorizer.
-/// This class is also used by InnerLoopVectorizer for identifying
-/// induction variable and the different reduction variables.
-class LoopVectorizationLegality {
-public:
- LoopVectorizationLegality(
- Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
- const TargetTransformInfo *TTI,
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
- LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
- : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
- ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H),
- DB(DB), AC(AC) {}
-
- /// ReductionList contains the reduction descriptors for all
- /// of the reductions that were found in the loop.
- using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
-
- /// InductionList saves induction variables and maps them to the
- /// induction descriptor.
- using InductionList = MapVector<PHINode *, InductionDescriptor>;
-
- /// RecurrenceSet contains the phi nodes that are recurrences other than
- /// inductions and reductions.
- using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
-
- /// Returns true if it is legal to vectorize this loop.
- /// This does not mean that it is profitable to vectorize this
- /// loop, only that it is legal to do so.
- bool canVectorize();
-
- /// Returns the primary induction variable.
- PHINode *getPrimaryInduction() { return PrimaryInduction; }
-
- /// Returns the reduction variables found in the loop.
- ReductionList *getReductionVars() { return &Reductions; }
-
- /// Returns the induction variables found in the loop.
- InductionList *getInductionVars() { return &Inductions; }
-
- /// Return the first-order recurrences found in the loop.
- RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
-
- /// Return the set of instructions to sink to handle first-order recurrences.
- DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
-
- /// Returns the widest induction type.
- Type *getWidestInductionType() { return WidestIndTy; }
-
- /// Returns True if V is a Phi node of an induction variable in this loop.
- bool isInductionPhi(const Value *V);
-
- /// Returns True if V is a cast that is part of an induction def-use chain,
- /// and had been proven to be redundant under a runtime guard (in other
- /// words, the cast has the same SCEV expression as the induction phi).
- bool isCastedInductionVariable(const Value *V);
-
- /// Returns True if V can be considered as an induction variable in this
- /// loop. V can be the induction phi, or some redundant cast in the def-use
- /// chain of the inducion phi.
- bool isInductionVariable(const Value *V);
-
- /// Returns True if PN is a reduction variable in this loop.
- bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
-
- /// Returns True if Phi is a first-order recurrence in this loop.
- bool isFirstOrderRecurrence(const PHINode *Phi);
-
- /// Return true if the block BB needs to be predicated in order for the loop
- /// to be vectorized.
- bool blockNeedsPredication(BasicBlock *BB);
-
- /// Check if this pointer is consecutive when vectorizing. This happens
- /// when the last index of the GEP is the induction variable, or that the
- /// pointer itself is an induction variable.
- /// This check allows us to vectorize A[idx] into a wide load/store.
- /// Returns:
- /// 0 - Stride is unknown or non-consecutive.
- /// 1 - Address is consecutive.
- /// -1 - Address is consecutive, and decreasing.
- /// NOTE: This method must only be used before modifying the original scalar
- /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
- int isConsecutivePtr(Value *Ptr);
-
- /// Returns true if the value V is uniform within the loop.
- bool isUniform(Value *V);
-
- /// Returns the information that we collected about runtime memory check.
- const RuntimePointerChecking *getRuntimePointerChecking() const {
- return LAI->getRuntimePointerChecking();
- }
-
- const LoopAccessInfo *getLAI() const { return LAI; }
-
- /// \brief Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
- return InterleaveInfo.isInterleaved(Instr);
- }
-
- /// \brief Get the interleaved access group that \p Instr belongs to.
- const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
- return InterleaveInfo.getInterleaveGroup(Instr);
- }
-
- /// \brief Returns true if an interleaved group requires a scalar iteration
- /// to handle accesses with gaps.
- bool requiresScalarEpilogue() const {
- return InterleaveInfo.requiresScalarEpilogue();
- }
-
- unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
-
- uint64_t getMaxSafeRegisterWidth() const {
- return LAI->getDepChecker().getMaxSafeRegisterWidth();
- }
-
- bool hasStride(Value *V) { return LAI->hasStride(V); }
-
- /// Returns true if the target machine supports masked store operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
- }
-
- /// Returns true if the target machine supports masked load operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
- }
-
- /// Returns true if the target machine supports masked scatter operation
- /// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType) {
- return TTI->isLegalMaskedScatter(DataType);
- }
-
- /// Returns true if the target machine supports masked gather operation
- /// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType) {
- return TTI->isLegalMaskedGather(DataType);
- }
-
- /// Returns true if the target machine can represent \p V as a masked gather
- /// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
- auto *LI = dyn_cast<LoadInst>(V);
- auto *SI = dyn_cast<StoreInst>(V);
- if (!LI && !SI)
- return false;
- auto *Ptr = getPointerOperand(V);
- auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
- return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
- }
-
- /// Returns true if vector representation of the instruction \p I
- /// requires mask.
- bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
-
- unsigned getNumStores() const { return LAI->getNumStores(); }
- unsigned getNumLoads() const { return LAI->getNumLoads(); }
- unsigned getNumPredStores() const { return NumPredStores; }
-
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- bool isScalarWithPredication(Instruction *I);
-
- /// Returns true if \p I is a memory instruction with consecutive memory
- /// access that can be widened.
- bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
-
- // Returns true if the NoNaN attribute is set on the function.
- bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
-
-private:
- /// Check if a single basic block loop is vectorizable.
- /// At this point we know that this is a loop with a constant trip count
- /// and we only need to check individual instructions.
- bool canVectorizeInstrs();
-
- /// When we vectorize loops we may change the order in which
- /// we read and write from memory. This method checks if it is
- /// legal to vectorize the code, considering only memory constrains.
- /// Returns true if the loop is vectorizable
- bool canVectorizeMemory();
-
- /// Return true if we can vectorize this loop using the IF-conversion
- /// transformation.
- bool canVectorizeWithIfConvert();
-
- /// Return true if all of the instructions in the block can be speculatively
- /// executed. \p SafePtrs is a list of addresses that are known to be legal
- /// and we know that we can read from them without segfault.
- bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
-
- /// Updates the vectorization state by adding \p Phi to the inductions list.
- /// This can set \p Phi as the main induction of the loop if \p Phi is a
- /// better choice for the main induction than the existing one.
- void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit);
-
- /// Create an analysis remark that explains why vectorization failed
- ///
- /// \p RemarkName is the identifier for the remark. If \p I is passed it is
- /// an instruction that prevents vectorization. Otherwise the loop is used
- /// for the location of the remark. \return the remark object that can be
- /// streamed to.
- OptimizationRemarkAnalysis
- createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
- return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
- RemarkName, TheLoop, I);
- }
-
- /// \brief If an access has a symbolic strides, this maps the pointer value to
- /// the stride symbol.
- const ValueToValueMap *getSymbolicStrides() {
- // FIXME: Currently, the set of symbolic strides is sometimes queried before
- // it's collected. This happens from canVectorizeWithIfConvert, when the
- // pointer is checked to reference consecutive elements suitable for a
- // masked access.
- return LAI ? &LAI->getSymbolicStrides() : nullptr;
- }
-
- unsigned NumPredStores = 0;
-
- /// The loop that we evaluate.
- Loop *TheLoop;
-
- /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
- /// Applies dynamic knowledge to simplify SCEV expressions in the context
- /// of existing SCEV assumptions. The analysis will also add a minimal set
- /// of new predicates if this is required to enable vectorization and
- /// unrolling.
- PredicatedScalarEvolution &PSE;
-
- /// Target Library Info.
- TargetLibraryInfo *TLI;
-
- /// Target Transform Info
- const TargetTransformInfo *TTI;
-
- /// Dominator Tree.
- DominatorTree *DT;
-
- // LoopAccess analysis.
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
-
- // And the loop-accesses info corresponding to this loop. This pointer is
- // null until canVectorizeMemory sets it up.
- const LoopAccessInfo *LAI = nullptr;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- /// The interleave access information contains groups of interleaved accesses
- /// with the same stride and close to each other.
- InterleavedAccessInfo InterleaveInfo;
-
- // --- vectorization state --- //
-
- /// Holds the primary induction variable. This is the counter of the
- /// loop.
- PHINode *PrimaryInduction = nullptr;
-
- /// Holds the reduction variables.
- ReductionList Reductions;
-
- /// Holds all of the induction variables that we found in the loop.
- /// Notice that inductions don't need to start at zero and that induction
- /// variables can be pointers.
- InductionList Inductions;
-
- /// Holds all the casts that participate in the update chain of the induction
- /// variables, and that have been proven to be redundant (possibly under a
- /// runtime guard). These casts can be ignored when creating the vectorized
- /// loop body.
- SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
-
- /// Holds the phi nodes that are first-order recurrences.
- RecurrenceSet FirstOrderRecurrences;
-
- /// Holds instructions that need to sink past other instructions to handle
- /// first-order recurrences.
- DenseMap<Instruction *, Instruction *> SinkAfter;
-
- /// Holds the widest induction type encountered.
- Type *WidestIndTy = nullptr;
-
- /// Allowed outside users. This holds the induction and reduction
- /// vars which can be accessed from outside the loop.
- SmallPtrSet<Value *, 4> AllowedExit;
-
- /// Can we assume the absence of NaNs.
- bool HasFunNoNaNAttr = false;
-
- /// Vectorization requirements that will go through late-evaluation.
- LoopVectorizationRequirements *Requirements;
-
- /// Used to emit an analysis of any legality issues.
- LoopVectorizeHints *Hints;
-
- /// The demanded bits analsyis is used to compute the minimum type size in
- /// which a reduction can be computed.
- DemandedBits *DB;
-
- /// The assumption cache analysis is used to compute the minimum type size in
- /// which a reduction can be computed.
- AssumptionCache *AC;
-
- /// While vectorizing these instructions we have to generate a
- /// call to the appropriate masked intrinsic
- SmallPtrSet<const Instruction *, 8> MaskedOp;
-};
+namespace llvm {
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
@@ -1862,23 +1177,15 @@ public:
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints)
+ const LoopVectorizeHints *Hints,
+ InterleavedAccessInfo &IAI)
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
- AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization should be avoided up front.
Optional<unsigned> computeMaxVF(bool OptForSize);
- /// Information about vectorization costs
- struct VectorizationFactor {
- // Vector width with best cost
- unsigned Width;
-
- // Cost of the loop with that width
- unsigned Cost;
- };
-
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
@@ -1912,7 +1219,7 @@ public:
/// avoid redundant calculations.
void setCostBasedWideningDecision(unsigned VF);
- /// \brief A struct that represents some properties of the register usage
+ /// A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
@@ -1920,9 +1227,6 @@ public:
/// Holds the maximum number of concurrent live intervals in the loop.
unsigned MaxLocalUsers;
-
- /// Holds the number of instructions in the loop.
- unsigned NumInstructions;
};
/// \return Returns information about the register usages of the loop for the
@@ -2072,7 +1376,69 @@ public:
collectLoopScalars(VF);
}
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+ }
+
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+ }
+
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType) {
+ return TTI.isLegalMaskedScatter(DataType);
+ }
+
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType) {
+ return TTI.isLegalMaskedGather(DataType);
+ }
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getMemInstValueType(V);
+ return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ bool isScalarWithPredication(Instruction *I);
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
+ /// Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
+ /// Returns true if an interleaved group requires a scalar iteration
+ /// to handle accesses with gaps.
+ bool requiresScalarEpilogue() const {
+ return InterleaveInfo.requiresScalarEpilogue();
+ }
+
private:
+ unsigned NumPredStores = 0;
+
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
@@ -2124,12 +1490,16 @@ private:
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I);
+
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. \return the remark object
/// that can be streamed to.
OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
- return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
RemarkName, TheLoop);
}
@@ -2231,6 +1601,10 @@ public:
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo &InterleaveInfo;
+
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
@@ -2238,271 +1612,78 @@ public:
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
-} // end anonymous namespace
-
-namespace llvm {
-
-/// InnerLoopVectorizer vectorizes loops which contain only one basic
-/// LoopVectorizationPlanner - drives the vectorization process after having
-/// passed Legality checks.
-/// The planner builds and optimizes the Vectorization Plans which record the
-/// decisions how to vectorize the given loop. In particular, represent the
-/// control-flow of the vectorized version, the replication of instructions that
-/// are to be scalarized, and interleave access groups.
-class LoopVectorizationPlanner {
- /// The loop that we evaluate.
- Loop *OrigLoop;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Target Transform Info.
- const TargetTransformInfo *TTI;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
- /// The profitablity analysis.
- LoopVectorizationCostModel &CM;
-
- using VPlanPtr = std::unique_ptr<VPlan>;
-
- SmallVector<VPlanPtr, 4> VPlans;
-
- /// This class is used to enable the VPlan to invoke a method of ILV. This is
- /// needed until the method is refactored out of ILV and becomes reusable.
- struct VPCallbackILV : public VPCallback {
- InnerLoopVectorizer &ILV;
-
- VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
-
- Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
- return ILV.getOrCreateVectorValue(V, Part);
- }
- };
-
- /// A builder used to construct the current plan.
- VPBuilder Builder;
-
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR. Note that
- /// if-conversion currently takes place during VPlan-construction, so these
- /// caches are only used at that stage.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
- using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
-
- unsigned BestVF = 0;
- unsigned BestUF = 0;
-
-public:
- LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
-
- /// Plan how to best vectorize, return the best VF and its cost.
- LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
- unsigned UserVF);
-
- /// Finalize the best decision and dispose of all other VPlans.
- void setBestPlan(unsigned VF, unsigned UF);
-
- /// Generate the IR code for the body of the vectorized loop according to the
- /// best selected VPlan.
- void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
-
- void printPlans(raw_ostream &O) {
- for (const auto &Plan : VPlans)
- O << *Plan;
- }
-
-protected:
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
- /// A range of powers-of-2 vectorization factors with fixed start and
- /// adjustable end. The range includes start and excludes end, e.g.,:
- /// [1, 9) = {1, 2, 4, 8}
- struct VFRange {
- // A power of 2.
- const unsigned Start;
-
- // Need not be a power of 2. If End <= Start range is empty.
- unsigned End;
- };
-
- /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
- /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
- /// returned value holds for the entire \p Range.
- bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
- VFRange &Range);
-
- /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
- /// according to the information gathered by Legal when it checked if it is
- /// legal to vectorize the loop.
- void buildVPlans(unsigned MinVF, unsigned MaxVF);
-
-private:
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
-
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
-
- /// Check if \I belongs to an Interleave Group within the given VF \p Range,
- /// \return true in the first returned value if so and false otherwise.
- /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG
- /// for \p Range.Start, and provide it as the second returned value.
- /// Note that if \I is an adjunct member of an IG for \p Range.Start, the
- /// \return value is <true, nullptr>, as it is handled by another recipe.
- /// \p Range.End may be decreased to ensure same decision from \p Range.Start
- /// to \p Range.End.
- VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
-
- // Check if \I is a memory instruction to be widened for \p Range.Start and
- // potentially masked. Such instructions are handled by a recipe that takes an
- // additional VPInstruction for the mask.
- VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
- VFRange &Range,
- VPlanPtr &Plan);
-
- /// Check if an induction recipe should be constructed for \I within the given
- /// VF \p Range. If so build and return it. If not, return null. \p Range.End
- /// may be decreased to ensure same decision from \p Range.Start to
- /// \p Range.End.
- VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
- VFRange &Range);
-
- /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
- /// a sequence of select instructions as the vectorizer currently performs
- /// full if-conversion.
- VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
-
- /// Check if \p I can be widened within the given VF \p Range. If \p I can be
- /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
- /// extended to include \p I or else build a new VPWidenRecipe for it and
- /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
- /// false otherwise. Range.End may be decreased to ensure same decision from
- /// \p Range.Start to \p Range.End.
- bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
-
- /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
- /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
- /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
- /// Region. Update the packing decision of predicated instructions if they
- /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
- /// \p Range.Start to \p Range.End.
- VPBasicBlock *handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
- VPlanPtr &Plan);
-
- /// Create a replicating region for instruction \p I that requires
- /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
- VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
- VPlanPtr &Plan);
-
- /// Build a VPlan according to the information gathered by Legal. \return a
- /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
- /// exclusive, possibly decreasing \p Range.End.
- VPlanPtr buildVPlan(VFRange &Range,
- const SmallPtrSetImpl<Value *> &NeedDef);
-};
-
} // end namespace llvm
-namespace {
-
-/// \brief This holds vectorization requirements that must be verified late in
-/// the process. The requirements are set by legalize and costmodel. Once
-/// vectorization has been determined to be possible and profitable the
-/// requirements can be verified by looking for metadata or compiler options.
-/// For example, some loops require FP commutativity which is only allowed if
-/// vectorization is explicitly specified or if the fast-math compiler option
-/// has been provided.
-/// Late evaluation of these requirements allows helpful diagnostics to be
-/// composed that tells the user what need to be done to vectorize the loop. For
-/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
-/// evaluation should be used only when diagnostics can generated that can be
-/// followed by a non-expert user.
-class LoopVectorizationRequirements {
-public:
- LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
-
- void addUnsafeAlgebraInst(Instruction *I) {
- // First unsafe algebra instruction.
- if (!UnsafeAlgebraInst)
- UnsafeAlgebraInst = I;
- }
-
- void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
-
- bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
- const char *PassName = Hints.vectorizeAnalysisPassName();
- bool Failed = false;
- if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisFPCommute(
- PassName, "CantReorderFPOps",
- UnsafeAlgebraInst->getDebugLoc(),
- UnsafeAlgebraInst->getParent())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "floating-point operations";
- });
- Failed = true;
- }
-
- // Test if runtime memcheck thresholds are exceeded.
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
- L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Failed = true;
- }
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
+ assert(!OuterLp->empty() && "This is not an outer loop");
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
- return Failed;
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
}
-private:
- unsigned NumRuntimePointerChecks = 0;
- Instruction *UnsafeAlgebraInst = nullptr;
+ if (!Hints.getWidth()) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter &ORE;
-};
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
-} // end anonymous namespace
+ return true;
+}
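For illustration, a source-level loop nest that passes these checks could be annotated as below (a hypothetical example; in Clang the hint is spelled '#pragma clang loop', and an explicit vector width must be given for the annotation to count as explicit vectorization here):

    // Hypothetical annotated outer loop; only the outer loop carries the hint.
    #pragma clang loop vectorize(enable) vectorize_width(4)
    for (int i = 0; i < N; ++i)
      for (int j = 0; j < M; ++j)
        A[i][j] = B[i][j] + C[i][j];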
-static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
- if (L.empty()) {
- if (!hasCyclesInLoopBody(L))
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints. If we
+ // are stress testing the VPlan H-CFG construction, we collect the outermost
+ // loop of every loop nest.
+ if (L.empty() || VPlanBuildStressTest ||
+ (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
V.push_back(&L);
- return;
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
}
for (Loop *InnerL : L)
- addAcyclicInnerLoop(*InnerL, V);
+ collectSupportedLoops(*InnerL, LI, ORE, V);
}
namespace {
@@ -2571,14 +1752,16 @@ struct LoopVectorize : public FunctionPass {
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // We need to place the broadcast of invariant variables outside the loop.
+ // We need to place the broadcast of invariant variables outside the loop,
+ // but only if it's proven safe to do so. Otherwise, the broadcast will be
+ // placed inside the vector loop body.
Instruction *Instr = dyn_cast<Instruction>(V);
- bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
- bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
-
+ bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
+ (!Instr ||
+ DT->dominates(Instr->getParent(), LoopVectorPreHeader));
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
- if (Invariant)
+ if (SafeToHoist)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
// Broadcast the scalar into all locations in the vector.
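The rest of getBroadcastInstrs lies outside this hunk; conceptually it boils down to a single IRBuilder splat, roughly:

    // Sketch of the elided tail: splat the scalar V across a VF-wide vector.
    Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
    return Shuf;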
@@ -2589,6 +1772,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
Value *Start = II.getStartValue();
// Construct the initial value of the vector IV in the vector loop preheader
@@ -2636,17 +1821,18 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
&*LoopVectorBody->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
- else
- recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+ recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
LastInduction = cast<Instruction>(addFastMathFlag(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
}
// Move the last step to the end of the latch block. This ensures consistent
@@ -2677,8 +1863,20 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
- unsigned Lane) {
+ const InductionDescriptor &ID, const Instruction *EntryVal,
+ Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // This induction variable is not the phi from the original loop but the
+ // newly-created IV, based on the proof that the casted Phi is equal to the
+ // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
+ // re-uses the same InductionDescriptor that the original IV uses, but we
+ // don't have to do any recording in this case - that is done when the
+ // original IV is processed.
+ if (isa<TruncInst>(EntryVal))
+ return;
+
const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
if (Casts.empty())
return;
@@ -2775,8 +1973,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
if (Trunc)
addMetadata(EntryPart, Trunc);
- else
- recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
}
}
@@ -2847,7 +2044,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Value *EntryVal,
+ Instruction *EntryVal,
const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
@@ -2882,25 +2079,11 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
- recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
}
}
}
-int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
- const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
- ValueToValueMap();
-
- int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
- if (Stride == 1 || Stride == -1)
- return Stride;
- return 0;
-}
-
-bool LoopVectorizationLegality::isUniform(Value *V) {
- return LAI->isUniform(V);
-}
-
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
@@ -3060,7 +2243,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
- const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+ const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.");
// Skip if current instruction is not the insert position.
@@ -3068,7 +2251,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
return;
const DataLayout &DL = Instr->getModule()->getDataLayout();
- Value *Ptr = getPointerOperand(Instr);
+ Value *Ptr = getLoadStorePointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
@@ -3090,6 +2273,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Group->isReverse())
Index += (VF - 1) * Group->getFactor();
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
for (unsigned Part = 0; Part < UF; Part++) {
Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
@@ -3105,6 +2292,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// A[i+2] = c; // Member of index 2 (Current instruction)
// Current pointer is pointed to A[i+2], adjust it to A[i].
NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+ if (InBounds)
+ cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
// Cast to the vector pointer type.
NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
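For reference, a small standalone sketch (illustrative values only, not part of the patch) of the index adjustment performed above: for a hypothetical interleave group of factor 3 accessed in reverse with VF = 4, the member-2 pointer is moved back 2 + (4 - 1) * 3 = 11 elements so the wide access starts at the lowest address of the 12-element block.

#include <cstdio>

int main() {
  // Hypothetical values, not taken from the patch.
  unsigned Factor = 3;    // members per interleave group (e.g. R,G,B)
  unsigned VF = 4;        // vectorization factor
  unsigned MemberIdx = 2; // index of the current instruction in the group
  bool Reverse = true;

  // Mirrors the logic above: Index is the element distance from the current
  // member pointer back to the start of the wide block.
  int Index = MemberIdx;
  if (Reverse)
    Index += (VF - 1) * Factor;

  // The wide load/store covers VF * Factor consecutive elements.
  std::printf("GEP offset applied to the member pointer: %d\n", -Index);
  std::printf("wide access covers elements [%d, %d] relative to it\n",
              -Index, -Index + int(VF * Factor) - 1);
  return 0;
}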
@@ -3210,7 +2399,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
- Value *Ptr = getPointerOperand(Instr);
+ Value *Ptr = getLoadStorePointerOperand(Instr);
unsigned Alignment = getMemInstAlignment(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
@@ -3241,10 +2430,37 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (isMaskRequired)
Mask = *BlockInMask;
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(Instr)->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
+ } else {
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ }
+
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
+
// Handle Stores:
if (SI) {
- assert(!Legal->isUniform(SI->getPointerOperand()) &&
- "We do not allow storing to uniform addresses");
setDebugLocFromInst(Builder, SI);
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -3256,30 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
} else {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
-
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = reverseVector(StoredVal);
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
-
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
- PartPtr =
- Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- Mask[Part] = reverseVector(Mask[Part]);
}
-
- Value *VecPtr =
- Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
-
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
Mask[Part]);
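A minimal standalone sketch (hypothetical VF/UF values, not part of the patch) of the per-part pointer arithmetic the CreateVecPtr lambda performs: a forward consecutive access starts Part * VF elements ahead of the base pointer, while a reversed consecutive access starts at -Part * VF + (1 - VF), the lowest address of that part's reversed VF-element window.

#include <cstdio>

int main() {
  // Hypothetical values, purely illustrative.
  const int VF = 4; // vectorization factor
  const int UF = 2; // unroll factor (number of parts)

  for (int Part = 0; Part < UF; ++Part) {
    // Forward, consecutive access: one GEP by Part * VF.
    int Fwd = Part * VF;
    // Reversed, consecutive access: two GEPs, by -Part * VF and by 1 - VF,
    // so the wide access starts at the last element and is then reversed.
    int Rev = -Part * VF + (1 - VF);
    std::printf("part %d: forward start %+d, reverse start %+d "
                "(covers [%+d, %+d])\n",
                Part, Fwd, Rev, Rev, Rev + VF - 1);
  }
  return 0;
}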
@@ -3303,21 +2503,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
-
- if (Reverse) {
- // If the address is consecutive but reversed, then the
- // wide load needs to start at the last vector element.
- PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
- PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- Mask[Part] = reverseVector(Mask[Part]);
- }
-
- Value *VecPtr =
- Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
UndefValue::get(DataTy),
@@ -3471,7 +2657,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// does not evenly divide the trip count, no adjustment is necessary since
// there will already be scalar iterations. Note that the minimum iterations
// check ensures that N >= Step.
- if (VF > 1 && Legal->requiresScalarEpilogue()) {
+ if (VF > 1 && Cost->requiresScalarEpilogue()) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
@@ -3522,8 +2708,8 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
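A small standalone sketch (assumed VF * UF = 8; values are illustrative, not from the patch) of the predicate choice above: when a scalar epilogue is required, a trip count exactly equal to VF * UF must still take the scalar path, so the check uses ULE rather than ULT.

#include <cstdio>

// Returns true when the scalar loop must be taken instead of the vector loop.
static bool mustTakeScalarLoop(unsigned Count, unsigned Step,
                               bool RequiresScalarEpilogue) {
  // Mirrors the predicate selection above: ICMP_ULE when a scalar epilogue
  // is required (at least one scalar iteration must remain), ICMP_ULT
  // otherwise.
  return RequiresScalarEpilogue ? (Count <= Step) : (Count < Step);
}

int main() {
  const unsigned Step = 8; // hypothetical VF * UF
  for (unsigned Count : {7u, 8u, 9u})
    std::printf("trip count %u: epilogue-required -> %s, otherwise -> %s\n",
                Count,
                mustTakeScalarLoop(Count, Step, true) ? "scalar" : "vector",
                mustTakeScalarLoop(Count, Step, false) ? "scalar" : "vector");
  return 0;
}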
@@ -3728,6 +2914,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(
OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
if (OrigPhi == OldInduction) {
// We know what the end value is.
@@ -3885,7 +3073,7 @@ struct CSEDenseMapInfo {
} // end anonymous namespace
-///\brief Perform cse of induction variable instructions.
+///Perform cse of induction variable instructions.
static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
@@ -3907,7 +3095,7 @@ static void cse(BasicBlock *BB) {
}
}
-/// \brief Estimate the overhead of scalarizing an instruction. This is a
+/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
const TargetTransformInfo &TTI) {
@@ -4088,7 +3276,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
- } else if (isa<LoadInst>(I)) {
+ } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
@@ -4103,7 +3291,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
NewI = B.CreateExtractElement(O0, EE->getOperand(2));
} else {
- llvm_unreachable("Unhandled instruction type!");
+ // If we don't know what to do, be conservative and don't do anything.
+ continue;
}
// Lastly, extend the result.
@@ -4346,12 +3535,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Finally, fix users of the recurrence outside the loop. The users will need
// either the last value of the scalar recurrence or the last value of the
// vector recurrence we extracted in the middle block. Since the loop is in
- // LCSSA form, we just need to find the phi node for the original scalar
+ // LCSSA form, we just need to find all the phi nodes for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getIncomingValue(0) == Phi) {
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- break;
}
}
}
@@ -4954,7 +4142,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
default:
// This instruction is not vectorized by simple widening.
- DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
llvm_unreachable("Unhandled instruction!");
} // end of switch.
}
@@ -4972,465 +4160,7 @@ void InnerLoopVectorizer::updateAnalysis() {
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
- DEBUG(DT->verifyDomTree());
-}
-
-/// \brief Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to if
-/// convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
- for (PHINode &Phi : BB->phis()) {
- for (Value *V : Phi.incoming_values())
- if (auto *C = dyn_cast<Constant>(V))
- if (C->canTrap())
- return false;
- }
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
- if (!EnableIfConversion) {
- ORE->emit(createMissedAnalysis("IfConversionDisabled")
- << "if-conversion is disabled");
- return false;
- }
-
- assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-
- // A list of pointers that we can safely read and write to.
- SmallPtrSet<Value *, 8> SafePointes;
-
- // Collect safe addresses.
- for (BasicBlock *BB : TheLoop->blocks()) {
- if (blockNeedsPredication(BB))
- continue;
-
- for (Instruction &I : *BB)
- if (auto *Ptr = getPointerOperand(&I))
- SafePointes.insert(Ptr);
- }
-
- // Collect the blocks that need predication.
- BasicBlock *Header = TheLoop->getHeader();
- for (BasicBlock *BB : TheLoop->blocks()) {
- // We don't support switch statements inside loops.
- if (!isa<BranchInst>(BB->getTerminator())) {
- ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
- << "loop contains a switch statement");
- return false;
- }
-
- // We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointes)) {
- ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
- << "control flow cannot be substituted for a select");
- return false;
- }
- } else if (BB != Header && !canIfConvertPHINodes(BB)) {
- ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
- << "control flow cannot be substituted for a select");
- return false;
- }
- }
-
- // We can if-convert this loop.
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorize() {
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
-
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- // We must have a loop in canonical form. Loops with indirectbr in them cannot
- // be canonicalized.
- if (!TheLoop->getLoopPreheader()) {
- DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // FIXME: The code is currently dead, since the loop gets sent to
- // LoopVectorizationLegality is already an innermost loop.
- //
- // We can only vectorize innermost loops.
- if (!TheLoop->empty()) {
- ORE->emit(createMissedAnalysis("NotInnermostLoop")
- << "loop is not the innermost loop");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single exiting block.
- if (!TheLoop->getExitingBlock()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We only handle bottom-tested loops, i.e. loop in which the condition is
- // checked at the end of each iteration. With that we can assume that all
- // instructions in the loop are executed the same number of times.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We need to have a loop header.
- DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
- << '\n');
-
- // Check if we can if-convert non-single-bb loops.
- unsigned NumBlocks = TheLoop->getNumBlocks();
- if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
- DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check if we can vectorize the instructions and CFG in this loop.
- if (!canVectorizeInstrs()) {
- DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Go over each instruction and look at memory deps.
- if (!canVectorizeMemory()) {
- DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- DEBUG(dbgs() << "LV: We can vectorize this loop"
- << (LAI->getRuntimePointerChecking()->Need
- ? " (with a runtime bound check)"
- : "")
- << "!\n");
-
- bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
-
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = EnableInterleavedMemAccesses;
-
- // Analyze interleaved memory accesses.
- if (UseInterleaved)
- InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
-
- unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
- if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
- SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
-
- if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
- ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
- << "Too many SCEV assumptions need to be made and checked "
- << "at runtime");
- DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Okay! We've done all the tests. If any have failed, return false. Otherwise
- // we can vectorize, and at this point we don't have any other mem analysis
- // which may limit our maximum vectorization factor, so just return true with
- // no restrictions.
- return Result;
-}
-
-static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
- if (Ty->isPointerTy())
- return DL.getIntPtrType(Ty);
-
- // It is possible that char's or short's overflow when we ask for the loop's
- // trip count, work around this by changing the type size.
- if (Ty->getScalarSizeInBits() < 32)
- return Type::getInt32Ty(Ty->getContext());
-
- return Ty;
-}
-
-static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
- Ty0 = convertPointerToIntegerType(DL, Ty0);
- Ty1 = convertPointerToIntegerType(DL, Ty1);
- if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
- return Ty0;
- return Ty1;
-}
-
-/// \brief Check that the instruction has outside loop users and is not an
-/// identified reduction variable.
-static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- // Reduction and Induction instructions are allowed to have exit users. All
- // other instructions must not have external users.
- if (!AllowedExit.count(Inst))
- // Check that all of the users of the loop are inside the BB.
- for (User *U : Inst->users()) {
- Instruction *UI = cast<Instruction>(U);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(UI)) {
- DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
- return true;
- }
- }
- return false;
-}
-
-void LoopVectorizationLegality::addInductionPhi(
- PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- Inductions[Phi] = ID;
-
- // In case this induction also comes with casts that we know we can ignore
- // in the vectorized loop body, record them here. All casts could be recorded
- // here for ignoring, but suffices to record only the first (as it is the
- // only one that may bw used outside the cast sequence).
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (!Casts.empty())
- InductionCastsToIgnore.insert(*Casts.begin());
-
- Type *PhiTy = Phi->getType();
- const DataLayout &DL = Phi->getModule()->getDataLayout();
-
- // Get the widest type.
- if (!PhiTy->isFloatingPointTy()) {
- if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
- else
- WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
- }
-
- // Int inductions are special because we only allow one IV.
- if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
- ID.getConstIntStepValue() &&
- ID.getConstIntStepValue()->isOne() &&
- isa<Constant>(ID.getStartValue()) &&
- cast<Constant>(ID.getStartValue())->isNullValue()) {
-
- // Use the phi node with the widest type as induction. Use the last
- // one if there are multiple (no good reason for doing this other
- // than it is expedient). We've checked that it begins at zero and
- // steps by one, so this is a canonical induction variable.
- if (!PrimaryInduction || PhiTy == WidestIndTy)
- PrimaryInduction = Phi;
- }
-
- // Both the PHI node itself, and the "post-increment" value feeding
- // back into the PHI node may have external users.
- // We can allow those uses, except if the SCEVs we have for them rely
- // on predicates that only hold within the loop, since allowing the exit
- // currently means re-using this SCEV outside the loop.
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
- AllowedExit.insert(Phi);
- AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
- }
-
- DEBUG(dbgs() << "LV: Found an induction variable.\n");
-}
-
-bool LoopVectorizationLegality::canVectorizeInstrs() {
- BasicBlock *Header = TheLoop->getHeader();
-
- // Look for the attribute signaling the absence of NaNs.
- Function &F = *Header->getParent();
- HasFunNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-
- // For each block in the loop.
- for (BasicBlock *BB : TheLoop->blocks()) {
- // Scan the instructions in the block and look for hazards.
- for (Instruction &I : *BB) {
- if (auto *Phi = dyn_cast<PHINode>(&I)) {
- Type *PhiTy = Phi->getType();
- // Check that this PHI type is allowed.
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
- << "loop control flow is not understood by vectorizer");
- DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
- return false;
- }
-
- // If this PHINode is not in the header block, then we know that we
- // can convert it to select during if-conversion. No need to check if
- // the PHIs in this block are induction or reduction variables.
- if (BB != Header) {
- // Check that this instruction has no outside users or is an
- // identified reduction value with an outside user.
- if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
- continue;
- ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
- << "value could not be identified as "
- "an induction or reduction variable");
- return false;
- }
-
- // We only allow if-converted PHIs with exactly two incoming values.
- if (Phi->getNumIncomingValues() != 2) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
- << "control flow not understood by vectorizer");
- DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
- return false;
- }
-
- RecurrenceDescriptor RedDes;
- if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT)) {
- if (RedDes.hasUnsafeAlgebra())
- Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
- AllowedExit.insert(RedDes.getLoopExitInstr());
- Reductions[Phi] = RedDes;
- continue;
- }
-
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
- addInductionPhi(Phi, ID, AllowedExit);
- if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
- Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
- continue;
- }
-
- if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
- SinkAfter, DT)) {
- FirstOrderRecurrences.insert(Phi);
- continue;
- }
-
- // As a last resort, coerce the PHI to a AddRec expression
- // and re-try classifying it a an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
- addInductionPhi(Phi, ID, AllowedExit);
- continue;
- }
-
- ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
- << "value that could not be identified as "
- "reduction is used outside the loop");
- DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
- return false;
- } // end of PHI handling
-
- // We handle calls that:
- // * Are debug info intrinsics.
- // * Have a mapping to an IR intrinsic.
- // * Have a vector version available.
- auto *CI = dyn_cast<CallInst>(&I);
- if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
- !isa<DbgInfoIntrinsic>(CI) &&
- !(CI->getCalledFunction() && TLI &&
- TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
- ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
- << "call instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
- return false;
- }
-
- // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
- // second argument is the same (i.e. loop invariant)
- if (CI && hasVectorInstrinsicScalarOpd(
- getVectorIntrinsicIDForCall(CI, TLI), 1)) {
- auto *SE = PSE.getSE();
- if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
- ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
- << "intrinsic instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
- return false;
- }
- }
-
- // Check that the instruction return type is vectorizable.
- // Also, we can't vectorize extractelement instructions.
- if ((!VectorType::isValidElementType(I.getType()) &&
- !I.getType()->isVoidTy()) ||
- isa<ExtractElementInst>(I)) {
- ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
- << "instruction return type cannot be vectorized");
- DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
- return false;
- }
-
- // Check that the stored type is vectorizable.
- if (auto *ST = dyn_cast<StoreInst>(&I)) {
- Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T)) {
- ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
- << "store instruction cannot be vectorized");
- return false;
- }
-
- // FP instructions can allow unsafe algebra, thus vectorizable by
- // non-IEEE-754 compliant SIMD units.
- // This applies to floating-point math operations and calls, not memory
- // operations, shuffles, or casts, as they don't change precision or
- // semantics.
- } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.isFast()) {
- DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
- Hints->setPotentiallyUnsafe();
- }
-
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
- ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
- << "value cannot be used outside the loop");
- return false;
- }
- } // next instr.
- }
-
- if (!PrimaryInduction) {
- DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- if (Inductions.empty()) {
- ORE->emit(createMissedAnalysis("NoInductionVariable")
- << "loop induction variable could not be identified");
- return false;
- }
- }
-
- // Now we know the widest induction type, check if our found induction
- // is the same size. If it's not, unset it here and InnerLoopVectorizer
- // will create another.
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
- PrimaryInduction = nullptr;
-
- return true;
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
@@ -5458,7 +4188,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (auto *Store = dyn_cast<StoreInst>(MemAccess))
if (Ptr == Store->getValueOperand())
return WideningDecision == CM_Scalarize;
- assert(Ptr == getPointerOperand(MemAccess) &&
+ assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
"Ptr is neither a value or pointer operand");
return WideningDecision != CM_GatherScatter;
};
@@ -5524,7 +4254,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
}
for (auto *I : ScalarPtrs)
if (!PossibleNonScalarPtrs.count(I)) {
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
Worklist.insert(I);
}
@@ -5541,8 +4271,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
continue;
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
}
// Insert the forced scalars.
@@ -5569,7 +4300,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
isScalarUse(J, Src));
})) {
Worklist.insert(Src);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
}
}
@@ -5609,21 +4340,30 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// The induction variable and its update instruction will remain scalar.
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
}
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+ if (!Legal->blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
default:
break;
- case Instruction::Store:
- return !isMaskRequired(I);
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ auto *Ptr = getLoadStorePointerOperand(I);
+ auto *Ty = getMemInstValueType(I);
+ return isa<LoadInst>(I) ?
+ !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
+ : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+ }
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
@@ -5633,17 +4373,17 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
return false;
}
-bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction");
- auto *Ptr = getPointerOperand(I);
+ auto *Ptr = getLoadStorePointerOperand(I);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!isConsecutivePtr(Ptr))
+ if (!Legal->isConsecutivePtr(Ptr))
return false;
// If the instruction is a store located in a predicated block, it will be
@@ -5694,7 +4434,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
Worklist.insert(Cmp);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
}
// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
@@ -5726,7 +4466,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
// If there's no pointer operand, there's nothing to do.
- auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
if (!Ptr)
continue;
@@ -5734,7 +4474,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// pointer operand.
auto UsersAreMemAccesses =
llvm::all_of(Ptr->users(), [&](User *U) -> bool {
- return getPointerOperand(U) == Ptr;
+ return getLoadStorePointerOperand(U) == Ptr;
});
// Ensure the memory instruction will not be scalarized or used by
@@ -5755,7 +4495,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
if (!PossibleNonUniformPtrs.count(V)) {
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
Worklist.insert(V);
}
@@ -5774,10 +4514,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
return !TheLoop->contains(J) || Worklist.count(J) ||
- (OI == getPointerOperand(J) && isUniformDecision(J, VF));
+ (OI == getLoadStorePointerOperand(J) &&
+ isUniformDecision(J, VF));
})) {
Worklist.insert(OI);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
}
}
}
@@ -5785,7 +4526,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
};
// For an instruction to be added into Worklist above, all its users inside
@@ -5822,123 +4563,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// The induction variable and its update instruction will remain uniform.
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
+ << "\n");
}
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationLegality::canVectorizeMemory() {
- LAI = &(*GetLAA)(*TheLoop);
- InterleaveInfo.setLAI(LAI);
- const OptimizationRemarkAnalysis *LAR = LAI->getReport();
- if (LAR) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
- "loop not vectorized: ", *LAR);
- });
- }
- if (!LAI->canVectorizeMemory())
- return false;
-
- if (LAI->hasStoreToLoopInvariantAddress()) {
- ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
- << "write to a loop invariant address could not be vectorized");
- DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
- return false;
- }
-
- Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
- PSE.addPredicate(LAI->getPSE().getUnionPredicate());
-
- return true;
-}
-
-bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
- Value *In0 = const_cast<Value *>(V);
- PHINode *PN = dyn_cast_or_null<PHINode>(In0);
- if (!PN)
- return false;
-
- return Inductions.count(PN);
-}
-
-bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
- auto *Inst = dyn_cast<Instruction>(V);
- return (Inst && InductionCastsToIgnore.count(Inst));
-}
-
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
- return isInductionPhi(V) || isCastedInductionVariable(V);
-}
-
-bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
- return FirstOrderRecurrences.count(Phi);
-}
-
-bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
- return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-}
-
-bool LoopVectorizationLegality::blockCanBePredicated(
- BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
- const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-
- for (Instruction &I : *BB) {
- // Check that we don't have a constant expression that can trap as operand.
- for (Value *Operand : I.operands()) {
- if (auto *C = dyn_cast<Constant>(Operand))
- if (C->canTrap())
- return false;
- }
- // We might be able to hoist the load.
- if (I.mayReadFromMemory()) {
- auto *LI = dyn_cast<LoadInst>(&I);
- if (!LI)
- return false;
- if (!SafePtrs.count(LI->getPointerOperand())) {
- if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
- isLegalMaskedGather(LI->getType())) {
- MaskedOp.insert(LI);
- continue;
- }
- // !llvm.mem.parallel_loop_access implies if-conversion safety.
- if (IsAnnotatedParallel)
- continue;
- return false;
- }
- }
-
- if (I.mayWriteToMemory()) {
- auto *SI = dyn_cast<StoreInst>(&I);
- // We only support predication of stores in basic blocks with one
- // predecessor.
- if (!SI)
- return false;
-
- // Build a masked store if it is legal for the target.
- if (isLegalMaskedStore(SI->getValueOperand()->getType(),
- SI->getPointerOperand()) ||
- isLegalMaskedScatter(SI->getValueOperand()->getType())) {
- MaskedOp.insert(SI);
- continue;
- }
-
- bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
- bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-
- if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
- !isSinglePredecessor)
- return false;
- }
- if (I.mayThrow())
- return false;
- }
-
- return true;
-}
-
void InterleavedAccessInfo::collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides) {
@@ -5959,7 +4591,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
if (!LI && !SI)
continue;
- Value *Ptr = getPointerOperand(&I);
+ Value *Ptr = getLoadStorePointerOperand(&I);
// We don't check wrapping here because we don't know yet if Ptr will be
// part of a full group or a group with gaps. Checking wrapping for all
// pointers (even those that end up in groups with no gaps) will be overly
@@ -6019,9 +4651,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving(
- const ValueToValueMap &Strides) {
- DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+void InterleavedAccessInfo::analyzeInterleaving() {
+ LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+ const ValueToValueMap &Strides = LAI->getSymbolicStrides();
// Holds all accesses with a constant stride.
MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
@@ -6062,7 +4694,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (isStrided(DesB.Stride)) {
Group = getInterleaveGroup(B);
if (!Group) {
- DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
+ << '\n');
Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
}
if (B->mayWriteToMemory())
@@ -6121,7 +4754,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Ignore A if it's already in a group or isn't the same kind of memory
// operation as B.
- if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
+ // Note that mayReadFromMemory() isn't mutually exclusive with
+ // mayWriteToMemory() in the case of atomic loads. We shouldn't see those
+ // here; canVectorizeMemory() should have returned false - except when we
+ // asked for optimization remarks.
+ if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory())
+ || (A->mayWriteToMemory() != B->mayWriteToMemory()))
continue;
// Check rules 1 and 2. Ignore A if its stride or size is different from
@@ -6160,8 +4798,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Try to insert A into B's group.
if (Group->insertMember(A, IndexA, DesA.Align)) {
- DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
- << " into the interleave group with" << *B << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
+ << " into the interleave group with" << *B
+ << '\n');
InterleaveGroupMap[A] = Group;
// Set the first load in program order as the insert position.
@@ -6174,8 +4813,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
if (Group->getNumMembers() != Group->getFactor()) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
- "to gaps.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved store group due "
+ "to gaps.\n");
releaseGroup(Group);
}
// Remove interleaved groups with gaps (currently only loads) whose memory
@@ -6204,21 +4844,23 @@ void InterleavedAccessInfo::analyzeInterleaving(
// So we check only group member 0 (which is always guaranteed to exist),
// and group member Factor - 1; if the latter doesn't exist we rely on
// peeling (if it is a non-reversed access -- see Case 3).
- Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
+ Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "first group member potentially pointer-wrapping.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "first group member potentially pointer-wrapping.\n");
releaseGroup(Group);
continue;
}
Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
if (LastMember) {
- Value *LastMemberPtr = getPointerOperand(LastMember);
+ Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "last group member potentially pointer-wrapping.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "last group member potentially pointer-wrapping.\n");
releaseGroup(Group);
}
} else {
@@ -6228,29 +4870,25 @@ void InterleavedAccessInfo::analyzeInterleaving(
// to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "a reverse access with gaps.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "a reverse access with gaps.\n");
releaseGroup(Group);
continue;
}
- DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
RequiresScalarEpilogue = true;
}
}
}
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
- if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
- return None;
- }
-
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may be useful to do since it's still likely to be dynamically
// uniform if the target can skip.
- DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+ LLVM_DEBUG(
+ dbgs() << "LV: Not inserting runtime ptr check for divergent target");
ORE->emit(
createMissedAnalysis("CantVersionLoopWithDivergentTarget")
@@ -6268,20 +4906,22 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
<< "runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
- DEBUG(dbgs()
- << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
return None;
}
// If we optimize the program for size, avoid creating the tail loop.
- DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
// If we don't know the precise trip count, don't try to vectorize.
if (TC < 2) {
ORE->emit(
createMissedAnalysis("UnknownLoopCountComplexCFG")
<< "unable to calculate the loop count due to complex control flow");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return None;
}
@@ -6299,7 +4939,8 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
"same time. Enable vectorization of this loop "
"with '#pragma clang loop vectorize(enable)' "
"when compiling with -Os/-Oz");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return None;
}
@@ -6324,29 +4965,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
unsigned MaxVectorSize = WidestRegister / WidestType;
- DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
- << WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
- << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+ << WidestRegister << " bits.\n");
- assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
- " into one vector!");
+ assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
+ " into one vector!");
if (MaxVectorSize == 0) {
- DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
return MaxVectorSize;
} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
// choosing a higher viable VF as done in the loop below.
- DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
- << ConstTripCount << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
MaxVectorSize = ConstTripCount;
return MaxVectorSize;
}
unsigned MaxVF = MaxVectorSize;
- if (MaximizeBandwidth && !OptForSize) {
+ if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
+ (MaximizeBandwidth && !OptForSize)) {
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
SmallVector<unsigned, 8> VFs;
@@ -6366,24 +5008,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
break;
}
}
+ if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
+ if (MaxVF < MinVF) {
+ LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
+ << ") with target's minimum: " << MinVF << '\n');
+ MaxVF = MinVF;
+ }
+ }
}
return MaxVF;
}
-LoopVectorizationCostModel::VectorizationFactor
+VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
-#ifndef NDEBUG
const float ScalarCost = Cost;
-#endif /* NDEBUG */
unsigned Width = 1;
- DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
- // Ignore scalar width, because the user explicitly wants vectorization.
if (ForceVectorization && MaxVF > 1) {
- Width = 2;
- Cost = expectedCost(Width).first / (float)Width;
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize cost to max so that at least VF = 2 is chosen during cost
+ // evaluation.
+ Cost = std::numeric_limits<float>::max();
}
for (unsigned i = 2; i <= MaxVF; i *= 2) {
@@ -6392,10 +5040,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
// the vector elements.
VectorizationCostTy C = expectedCost(i);
float VectorCost = C.first / (float)i;
- DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (int)VectorCost << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
if (!C.second && !ForceVectorization) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "LV: Not considering vector loop of width " << i
<< " because it will not generate any vector instructions.\n");
continue;
@@ -6406,10 +5054,19 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
}
}
- DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
- << "LV: Vectorization seems to be not beneficial, "
- << "but was forced by a user.\n");
- DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
+ if (!EnableCondStoresVectorization && NumPredStores) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ LLVM_DEBUG(
+ dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ Width = 1;
+ Cost = ScalarCost;
+ }
+
+ LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization seems to be not beneficial, "
+ << "but was forced by a user.\n");
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
}
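A rough standalone sketch (made-up cost table, not from the patch) of the selection loop above: each candidate width's cost is divided by the width, and the smallest per-lane cost wins; when vectorization is forced, the scalar cost is effectively ignored by starting from an infinite best cost.

#include <cstdio>
#include <limits>

int main() {
  // Hypothetical per-iteration costs indexed by VF in {1, 2, 4, 8}.
  const unsigned VFs[]   = {1, 2, 4, 8};
  const float    Costs[] = {10.0f, 14.0f, 22.0f, 60.0f};
  const bool ForceVectorization = false;

  unsigned Width = 1;
  float Best = Costs[0]; // scalar cost per lane
  if (ForceVectorization)
    Best = std::numeric_limits<float>::max(); // ignore the scalar cost

  for (int i = 1; i < 4; ++i) {
    float PerLane = Costs[i] / float(VFs[i]);
    std::printf("VF %u: cost per lane %.2f\n", VFs[i], PerLane);
    if (PerLane < Best) {
      Best = PerLane;
      Width = VFs[i];
    }
  }
  std::printf("selected VF: %u\n", Width);
  return 0;
}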
@@ -6457,7 +5114,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
// optimization to non-pointer types.
//
if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+ !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -6501,8 +5158,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
- DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
- << " registers\n");
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers\n");
if (VF == 1) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
@@ -6516,7 +5173,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
- R.NumInstructions = std::max(R.NumInstructions, 1U);
// We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
@@ -6561,7 +5217,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
if (VF > 1 && !Legal->getReductionVars()->empty()) {
- DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -6572,7 +5228,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
- DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
@@ -6600,11 +5256,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
if (EnableLoadStoreRuntimeInterleave &&
std::max(StoresIC, LoadsIC) > SmallIC) {
- DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving to saturate store or load ports.\n");
return std::max(StoresIC, LoadsIC);
}
- DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
return SmallIC;
}
@@ -6612,11 +5269,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// this point) that could benefit from interleaving.
bool HasReductions = !Legal->getReductionVars()->empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
- DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
}
- DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
return 1;
}
@@ -6643,7 +5300,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
DFS.perform(LI);
RegisterUsage RU;
- RU.NumInstructions = 0;
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
@@ -6655,14 +5311,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
- SmallSet<Instruction *, 8> Ends;
+ SmallPtrSet<Instruction *, 8> Ends;
// Saves the list of values that are used in the loop but are
// defined outside the loop, such as arguments and constants.
SmallPtrSet<Value *, 8> LoopInvariants;
unsigned Index = 0;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- RU.NumInstructions += BB->size();
for (Instruction &I : *BB) {
IdxToInstr[Index++] = &I;
@@ -6695,7 +5350,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
for (auto &Interval : EndPoint)
TransposeEnds[Interval.second].push_back(Interval.first);
- SmallSet<Instruction *, 8> OpenIntervals;
+ SmallPtrSet<Instruction *, 8> OpenIntervals;
// Get the size of the widest register.
unsigned MaxSafeDepDist = -1U;
@@ -6708,7 +5363,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
- DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
// A lambda that gets the register usage for the given type and VF.
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
@@ -6753,8 +5408,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}
- DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
- << OpenIntervals.size() << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
@@ -6769,10 +5424,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
Invariant += GetRegUsage(Inst->getType(), VFs[i]);
}
- DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
- DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
- DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
- DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
+ << '\n');
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[i];
@@ -6782,6 +5437,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
return RUs;
}
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+ // TODO: Cost model for emulated masked load/store is completely
+ // broken. This hack guides the cost model to use an artificially
+ // high enough value to practically disable vectorization with such
+ // operations, except where previously deployed legality hack allowed
+ // using very low cost values. This is to avoid regressions coming simply
+ // from moving "masked load/store" check from legality to cost model.
+ // Masked Load/Gather emulation was previously never allowed.
+ // Limited number of Masked Store/Scatter emulation was allowed.
+ assert(isScalarWithPredication(I) &&
+ "Expecting a scalar emulated instruction");
+ return isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ NumPredStores > NumberOfStoresToPredicate);
+}
+
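Editor's note, not part of the patch: the new useEmulatedMaskMemRefHack above flags every emulated masked load, and flags emulated masked stores only once the number of predicated stores exceeds the NumberOfStoresToPredicate threshold. A minimal standalone sketch of that decision, where MemKind, needsArtificiallyHighCost and the threshold value are illustrative stand-ins for the cost-model state:

#include <cassert>

// Editor's sketch: the shape of the decision made by the new
// useEmulatedMaskMemRefHack, with the cost-model state reduced to two
// integers. Names and the threshold value are illustrative, not upstream.
enum class MemKind { Load, Store };

static bool needsArtificiallyHighCost(MemKind Kind, unsigned NumPredStores,
                                      unsigned StoresToPredicateThreshold) {
  // Emulated masked loads/gathers were never allowed before, so they always
  // get the prohibitive cost.
  if (Kind == MemKind::Load)
    return true;
  // A limited number of emulated masked stores/scatters was allowed; only
  // penalize once that budget is exceeded.
  return NumPredStores > StoresToPredicateThreshold;
}

int main() {
  assert(needsArtificiallyHighCost(MemKind::Load, 0, 8));
  assert(!needsArtificiallyHighCost(MemKind::Store, 3, 8));
  assert(needsArtificiallyHighCost(MemKind::Store, 9, 8));
  return 0;
}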
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
@@ -6802,11 +5473,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
if (!Legal->blockNeedsPredication(BB))
continue;
for (Instruction &I : *BB)
- if (Legal->isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I)) {
ScalarCostsTy ScalarCosts;
- if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ // Do not apply discount logic if hacked cost is needed
+ // for emulated masked memrefs.
+ if (!useEmulatedMaskMemRefHack(&I) &&
+ computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
-
// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization.insert(BB);
}
@@ -6841,7 +5514,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6895,7 +5568,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
@@ -6937,11 +5610,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy BlockCost;
// For each instruction in the old loop.
- for (Instruction &I : *BB) {
- // Skip dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
if (ValuesToIgnore.count(&I) ||
(VF > 1 && VecValuesToIgnore.count(&I)))
@@ -6955,8 +5624,9 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
BlockCost.first += C.first;
BlockCost.second |= C.second;
- DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
- << VF << " For instruction: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
+ << " for VF " << VF << " For instruction: " << I
+ << '\n');
}
// If we are vectorizing a predicated block, it will have been
@@ -6975,7 +5645,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
return Cost;
}
-/// \brief Gets Address Access SCEV after verifying that the access pattern
+/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
@@ -7017,7 +5687,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned Alignment = getMemInstAlignment(I);
unsigned AS = getMemInstAddressSpace(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
// Figure out whether the access is strided and get the stride value
@@ -7038,9 +5708,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated store, it may not be executed for each vector
// lane. Scale the cost by the probability of executing the predicated
// block.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I)) {
Cost /= getReciprocalPredBlockProb();
+ if (useEmulatedMaskMemRefHack(I))
+ // Artificially setting to a high enough value to practically disable
+ // vectorization with such operations.
+ Cost = 3000000;
+ }
+
return Cost;
}
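Editor's note, not part of the patch: a worked sketch of the scaling in getMemInstScalarizationCost above. The divisor of 2 is an assumption (the predicated block modeled as executing for roughly half the lanes), 3000000 is the literal used in the hunk, and scalarizationCost is a hypothetical stand-in:

#include <cstdio>

// Editor's sketch: how the predicated-block scaling and the emulated
// masked-memref override interact. The divisor of 2 is assumed here.
static unsigned scalarizationCost(unsigned RawCost, bool ScalarWithPredication,
                                  bool UseEmulatedMaskMemRefHack) {
  unsigned Cost = RawCost;
  if (ScalarWithPredication) {
    Cost /= 2;        // scale by the probability of executing the block
    if (UseEmulatedMaskMemRefHack)
      Cost = 3000000; // practically disable vectorization
  }
  return Cost;
}

int main() {
  std::printf("%u\n", scalarizationCost(40, true, false)); // prints 20
  std::printf("%u\n", scalarizationCost(40, true, true));  // prints 3000000
  return 0;
}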
@@ -7049,7 +5725,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getMemInstAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
@@ -7085,7 +5761,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
@@ -7098,7 +5774,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned AS = getMemInstAddressSpace(I);
- auto Group = Legal->getInterleavedAccessGroup(I);
+ auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
@@ -7165,13 +5841,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (VF == 1)
return;
+ NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
- Value *Ptr = getPointerOperand(&I);
+ Value *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
continue;
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ NumPredStores++;
if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
// Scalar load + broadcast
unsigned Cost = getUniformMemOpCost(&I, VF);
@@ -7180,9 +5859,10 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
// We assume that widening is the best solution when possible.
- if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ if (memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
- int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
+ int ConsecutiveStride =
+ Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.");
InstWidening Decision =
@@ -7194,8 +5874,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Choose between Interleaving, Gather/Scatter or Scalarization.
unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
unsigned NumAccesses = 1;
- if (Legal->isAccessInterleaved(&I)) {
- auto Group = Legal->getInterleavedAccessGroup(&I);
+ if (isAccessInterleaved(&I)) {
+ auto Group = getInterleavedAccessGroup(&I);
assert(Group && "Fail to get an interleaved access group.");
// Make one decision for the whole group.
@@ -7207,7 +5887,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
unsigned GatherScatterCost =
- Legal->isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
: std::numeric_limits<unsigned>::max();
@@ -7232,7 +5912,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// If the instructions belongs to an interleave group, the whole group
// receives the same decision. The whole group receives the cost, but
// the cost will actually be assigned to one instruction.
- if (auto Group = Legal->getInterleavedAccessGroup(&I))
+ if (auto Group = getInterleavedAccessGroup(&I))
setWideningDecision(Group, VF, Decision, Cost);
else
setWideningDecision(&I, VF, Decision, Cost);
@@ -7252,7 +5932,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
Instruction *PtrDef =
- dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
if (PtrDef && TheLoop->contains(PtrDef) &&
getWideningDecision(&I, VF) != CM_GatherScatter)
AddrDefs.insert(PtrDef);
@@ -7282,7 +5962,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Scalarize a widened load of address.
setWideningDecision(I, VF, CM_Scalarize,
(VF * getMemoryInstructionCost(I, 1)));
- else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+ else if (auto Group = getInterleavedAccessGroup(I)) {
// Scalarize an interleave group of address loads.
for (unsigned I = 0; I < Group->getFactor(); ++I) {
if (Instruction *Member = Group->getMember(I))
@@ -7368,7 +6048,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF > 1 && Legal->isScalarWithPredication(I)) {
+ if (VF > 1 && isScalarWithPredication(I)) {
unsigned Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -7566,7 +6246,7 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
- if (auto *Ptr = getPointerOperand(Inst))
+ if (auto *Ptr = getLoadStorePointerOperand(Inst))
return Legal->isConsecutivePtr(Ptr);
return false;
}
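Editor's note, not part of the patch: the patch renames getPointerOperand to getLoadStorePointerOperand throughout this file. A rough approximation of what such a helper is expected to do; loadStorePointerOperand below is an illustrative stand-in, not the upstream definition:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Editor's sketch: return the pointer operand of a load or store and null
// for anything else, which is how the renamed helper is used in the hunks.
static Value *loadStorePointerOperand(Value *V) {
  if (auto *LI = dyn_cast<LoadInst>(V))
    return LI->getPointerOperand();
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getPointerOperand();
  return nullptr;
}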
@@ -7591,23 +6271,59 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
}
-LoopVectorizationCostModel::VectorizationFactor
+VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
+ unsigned UserVF) {
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
+
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
+ // This won't be necessary when UserVF is not required in the VPlan-native
+ // path.
+ if (VPlanBuildStressTest && !UserVF)
+ UserVF = 4;
+
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ assert(UserVF && "Expected UserVF for outer loop vectorization.");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ buildVPlans(UserVF, UserVF);
+
+ // For VPlan build stress testing, we bail out after VPlan construction.
+ if (VPlanBuildStressTest)
+ return NoVectorization;
+
+ return {UserVF, 0};
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return NoVectorization;
+}
+
+VectorizationFactor
LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
- // Width 1 means no vectorize, cost 0 means uncomputed cost.
- const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
- 0U};
+ assert(OrigLoop->empty() && "Inner loop expected.");
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
return NoVectorization;
if (UserVF) {
- DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
- buildVPlans(UserVF, UserVF);
- DEBUG(printPlans(dbgs()));
+ buildVPlansWithVPRecipes(UserVF, UserVF);
+ LLVM_DEBUG(printPlans(dbgs()));
return {UserVF, 0};
}
@@ -7624,8 +6340,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
CM.collectInstsToScalarize(VF);
}
- buildVPlans(1, MaxVF);
- DEBUG(printPlans(dbgs()));
+ buildVPlansWithVPRecipes(1, MaxVF);
+ LLVM_DEBUG(printPlans(dbgs()));
if (MaxVF == 1)
return NoVectorization;
@@ -7634,7 +6350,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
}
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
- DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
+ LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
+ << '\n');
BestVF = VF;
BestUF = UF;
@@ -7784,30 +6501,15 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
-
- // Collect conditions feeding internal conditional branches; they need to be
- // represented in VPlan for it to model masking.
- SmallPtrSet<Value *, 1> NeedDef;
-
- auto *Latch = OrigLoop->getLoopLatch();
- for (BasicBlock *BB : OrigLoop->blocks()) {
- if (BB == Latch)
- continue;
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (Branch && Branch->isConditional())
- NeedDef.insert(Branch->getCondition());
- }
-
for (unsigned VF = MinVF; VF < MaxVF + 1;) {
VFRange SubRange = {VF, MaxVF + 1};
- VPlans.push_back(buildVPlan(SubRange, NeedDef));
+ VPlans.push_back(buildVPlan(SubRange));
VF = SubRange.End;
}
}
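Editor's note, not part of the patch: buildVPlans (and buildVPlansWithVPRecipes below) walk [MinVF, MaxVF] in sub-ranges; each plan-building call may shrink the open end of its range and the next iteration resumes from there. A self-contained sketch of that iteration, where the one-VF-per-plan clamping in buildOnePlan is purely illustrative:

#include <algorithm>
#include <cstdio>

// Editor's sketch: the sub-range iteration used by the VPlan builders.
struct Range { unsigned Start; unsigned End; }; // half-open [Start, End)

static void buildOnePlan(Range &R) {
  R.End = std::min(R.Start * 2, R.End); // pretend each plan covers one VF
  std::printf("plan for VFs [%u, %u)\n", R.Start, R.End);
}

int main() {
  unsigned MinVF = 1, MaxVF = 8;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    Range SubRange = {VF, MaxVF + 1};
    buildOnePlan(SubRange);
    VF = SubRange.End; // resume where the last plan stopped
  }
  return 0;
}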
-VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
- BasicBlock *Dst,
- VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
+ VPlanPtr &Plan) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
@@ -7837,8 +6539,7 @@ VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
return EdgeMaskCache[Edge] = EdgeMask;
}
-VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
- VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
// Look for cached value.
@@ -7871,10 +6572,9 @@ VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
return BlockMaskCache[BB] = BlockMask;
}
-VPInterleaveRecipe *
-LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
- VFRange &Range) {
- const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
+VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
+ VFRange &Range) {
+ const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
if (!IG)
return nullptr;
@@ -7886,7 +6586,7 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
LoopVectorizationCostModel::CM_Interleave);
};
};
- if (!getDecisionAndClampRange(isIGMember(I), Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
return nullptr;
// I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
@@ -7899,8 +6599,8 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
}
VPWidenMemoryInstructionRecipe *
-LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
- VPlanPtr &Plan) {
+VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
return nullptr;
@@ -7919,7 +6619,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
return Decision != LoopVectorizationCostModel::CM_Scalarize;
};
- if (!getDecisionAndClampRange(willWiden, Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return nullptr;
VPValue *Mask = nullptr;
@@ -7930,8 +6630,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
}
VPWidenIntOrFpInductionRecipe *
-LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
- VFRange &Range) {
+VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
if (PHINode *Phi = dyn_cast<PHINode>(I)) {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
@@ -7956,15 +6655,14 @@ LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
[=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
};
- if (isa<TruncInst>(I) &&
- getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
+ if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
+ isOptimizableIVTruncate(I), Range))
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
cast<TruncInst>(I));
return nullptr;
}
-VPBlendRecipe *
-LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
PHINode *Phi = dyn_cast<PHINode>(I);
if (!Phi || Phi->getParent() == OrigLoop->getHeader())
return nullptr;
@@ -7988,9 +6686,9 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
return new VPBlendRecipe(Phi, Masks);
}
-bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
- VFRange &Range) {
- if (Legal->isScalarWithPredication(I))
+bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
+ VFRange &Range) {
+ if (CM.isScalarWithPredication(I))
return false;
auto IsVectorizableOpcode = [](unsigned Opcode) {
@@ -8074,7 +6772,7 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
return true;
};
- if (!getDecisionAndClampRange(willWiden, Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return false;
// Success: widen this instruction. We optimize the common case where
@@ -8089,15 +6787,15 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
return true;
}
-VPBasicBlock *LoopVectorizationPlanner::handleReplication(
+VPBasicBlock *VPRecipeBuilder::handleReplication(
Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
VPlanPtr &Plan) {
- bool IsUniform = getDecisionAndClampRange(
+ bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
- bool IsPredicated = Legal->isScalarWithPredication(I);
+ bool IsPredicated = CM.isScalarWithPredication(I);
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
// Find if I uses a predicated instruction. If so, it will use its scalar
@@ -8110,24 +6808,25 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication(
// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
- DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
VPBB->appendRecipe(Recipe);
return VPBB;
}
- DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
assert(VPBB->getSuccessors().empty() &&
"VPBB has successors when handling predicated replication.");
// Record predicated instructions for above packing optimizations.
PredInst2Recipe[I] = Recipe;
- VPBlockBase *Region =
- VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan));
- return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
+ VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockUtils::insertBlockAfter(Region, VPBB);
+ auto *RegSucc = new VPBasicBlock();
+ VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ return RegSucc;
}
-VPRegionBlock *
-LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
- VPRecipeBase *PredRecipe,
- VPlanPtr &Plan) {
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.
@@ -8147,19 +6846,67 @@ LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
- Entry->setTwoSuccessors(Pred, Exit);
- Pred->setOneSuccessor(Exit);
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exit);
return Region;
}
-LoopVectorizationPlanner::VPlanPtr
-LoopVectorizationPlanner::buildVPlan(VFRange &Range,
- const SmallPtrSetImpl<Value *> &NeedDef) {
- EdgeMaskCache.clear();
- BlockMaskCache.clear();
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan, VPBasicBlock *VPBB) {
+ VPRecipeBase *Recipe = nullptr;
+ // Check if Instr should belong to an interleave memory recipe, or already
+ // does. In the latter case Instr is irrelevant.
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr is a memory operation that should be widened.
+ if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr should form some PHI recipe.
+ if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if ((Recipe = tryToBlend(Instr, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+ VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+ return true;
+ }
+
+ // Check if Instr is to be widened by a general VPWidenRecipe, after
+ // having first checked for specific widening recipes that deal with
+ // Interleave Groups, Inductions and Phi nodes.
+ if (tryToWiden(Instr, VPBB, Range))
+ return true;
+
+ return false;
+}
+
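Editor's note, not part of the patch: tryToCreateRecipe above is a first-match dispatch. The specific recipe builders are tried in a fixed priority order (interleave group, widened memory op, induction, blend, generic widening) and the first success wins; on failure the caller falls back to replication. A generic sketch of that pattern with stand-in handlers:

#include <cstdio>
#include <functional>
#include <vector>

// Editor's sketch: first-match dispatch over a priority-ordered handler list.
static bool dispatch(const std::vector<std::function<bool()>> &Handlers) {
  for (const auto &TryHandler : Handlers)
    if (TryHandler())
      return true;
  return false; // caller falls back to replication
}

int main() {
  bool Handled = dispatch({
      [] { return false; }, // tryToInterleaveMemory
      [] { return false; }, // tryToWidenMemory
      [] { return true; },  // tryToOptimizeInduction
      [] { return false; }, // tryToBlend / tryToWiden, never reached here
  });
  std::printf("handled: %d\n", Handled); // prints handled: 1
  return 0;
}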
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
+ unsigned MaxVF) {
+ assert(OrigLoop->empty() && "Inner loop expected.");
+
+ // Collect conditions feeding internal conditional branches; they need to be
+ // represented in VPlan for it to model masking.
+ SmallPtrSet<Value *, 1> NeedDef;
+
+ auto *Latch = OrigLoop->getLoopLatch();
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == Latch)
+ continue;
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Branch && Branch->isConditional())
+ NeedDef.insert(Branch->getCondition());
+ }
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
@@ -8170,15 +6917,31 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlans.push_back(
+ buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
+ VF = SubRange.End;
+ }
+}
+
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
auto Plan = llvm::make_unique<VPlan>(VPBB);
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
Plan->addVPValue(V);
@@ -8193,7 +6956,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- VPBB->setOneSuccessor(FirstVPBBForBB);
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
VPBB = FirstVPBBForBB;
Builder.setInsertPoint(VPBB);
@@ -8201,18 +6964,17 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// Organize the ingredients to vectorize from current basic block in the
// right order.
- for (Instruction &I : *BB) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
// First filter out irrelevant instructions, to ensure no recipes are
// built for them.
- if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
- DeadInstructions.count(Instr))
+ if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
continue;
// I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
// member of the IG, do not construct any Recipe for it.
- const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
+ const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
if (IG && Instr != IG->getInsertPos() &&
Range.Start >= 2 && // Query is illegal for VF == 1
CM.getWideningDecision(Instr, Range.Start) ==
@@ -8227,8 +6989,9 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// should follow.
auto SAIt = SinkAfter.find(Instr);
if (SAIt != SinkAfter.end()) {
- DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
- << " to vectorize a 1st order recurrence.\n");
+ LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
+ << *SAIt->second
+ << " to vectorize a 1st order recurrence.\n");
SinkAfterInverse[SAIt->second] = Instr;
continue;
}
@@ -8244,45 +7007,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// Introduce each ingredient into VPlan.
for (Instruction *Instr : Ingredients) {
- VPRecipeBase *Recipe = nullptr;
-
- // Check if Instr should belong to an interleave memory recipe, or already
- // does. In the latter case Instr is irrelevant.
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Check if Instr is a memory operation that should be widened.
- if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Check if Instr should form some PHI recipe.
- if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
- if ((Recipe = tryToBlend(Instr, Plan))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
- if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
- VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
- continue;
- }
-
- // Check if Instr is to be widened by a general VPWidenRecipe, after
- // having first checked for specific widening recipes that deal with
- // Interleave Groups, Inductions and Phi nodes.
- if (tryToWiden(Instr, VPBB, Range))
+ if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
continue;
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB =
- handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
+ VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
+ Instr, Range, VPBB, PredInst2Recipe, Plan);
if (NextVPBB != VPBB) {
VPBB = NextVPBB;
VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
@@ -8297,7 +7028,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
assert(PreEntry->empty() && "Expecting empty pre-entry block.");
VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
- PreEntry->disconnectSuccessor(Entry);
+ VPBlockUtils::disconnectBlocks(PreEntry, Entry);
delete PreEntry;
std::string PlanName;
@@ -8316,6 +7047,30 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
return Plan;
}
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ assert(!OrigLoop->empty());
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = llvm::make_unique<VPlan>();
+
+ // Build hierarchical CFG
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI);
+ HCFGBuilder.buildHierarchicalCFG(*Plan.get());
+
+ return Plan;
+}
+
+Value* LoopVectorizationPlanner::VPCallbackILV::
+getOrCreateVectorValues(Value *V, unsigned Part) {
+ return ILV.getOrCreateVectorValue(V, Part);
+}
+
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
O << " +\n"
<< Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
@@ -8480,28 +7235,66 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows to apply
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LVP.planInVPlanNativePath(OptForSize, UserVF);
+
+ // Returning false. We are currently not generating vector code in the VPlan
+ // native path.
+ return false;
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
- assert(L->empty() && "Only process inner loops.");
+ assert((EnableVPlanNativePath || L->empty()) &&
+ "VPlan-native path is not enabled. Only process inner loops.");
#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */
- DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
- << DebugLocStr << "\n");
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
- DEBUG(dbgs() << "LV: Loop hints:"
- << " force="
- << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
- ? "disabled"
- : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
- ? "enabled"
- : "?"))
- << " width=" << Hints.getWidth()
- << " unroll=" << Hints.getInterleave() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
// Function containing loop
Function *F = L->getHeader()->getParent();
@@ -8515,7 +7308,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// benefit from vectorization, respectively.
if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
- DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
return false;
}
@@ -8523,10 +7316,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC);
- if (!LVL.canVectorize()) {
- DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ if (!LVL.canVectorize(EnableVPlanNativePath)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
@@ -8536,11 +7329,33 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
+ if (!L->empty())
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, Hints);
+
+ assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
- unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
- bool HasExpectedTC = (ExpectedTC > 0);
-
+ // Prefer constant trip counts over profile data, over upper bound estimate.
+ unsigned ExpectedTC = 0;
+ bool HasExpectedTC = false;
+ if (const SCEVConstant *ConstExits =
+ dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
+ const APInt &ExitsCount = ConstExits->getAPInt();
+ // We are interested in small values for ExpectedTC. Skip over those that
+ // can't fit an unsigned.
+ if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
+ ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
+ HasExpectedTC = true;
+ }
+ }
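Editor's note, not part of the patch: the arithmetic behind the new constant trip-count estimate above. A constant backedge-taken count N corresponds to a trip count of N + 1, and counts that do not fit an unsigned are skipped; estimateTripCount is a hypothetical stand-in for the SCEV-based code:

#include <cstdint>
#include <cstdio>
#include <limits>

// Editor's sketch: constant backedge-taken count to trip count.
static bool estimateTripCount(uint64_t BackedgeTakenCount, unsigned &TC) {
  if (BackedgeTakenCount >= std::numeric_limits<unsigned>::max())
    return false; // too large to be an interesting small trip count
  TC = static_cast<unsigned>(BackedgeTakenCount) + 1;
  return true;
}

int main() {
  unsigned TC = 0;
  if (estimateTripCount(7, TC))
    std::printf("expected trip count: %u\n", TC); // prints 8
  return 0;
}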
+ // ExpectedTC may be large because it's bound by a variable. Check
+ // profiling information to validate we should vectorize.
if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
auto EstimatedTC = getLoopEstimatedTripCount(L);
if (EstimatedTC) {
@@ -8548,15 +7363,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
HasExpectedTC = true;
}
}
+ if (!HasExpectedTC) {
+ ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+ HasExpectedTC = (ExpectedTC > 0);
+ }
if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
- DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is worth vectorizing only if no scalar "
- << "iteration overheads are incurred.");
+ LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
- DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
// Loops with a very small trip count are considered for vectorization
// under OptForSize, thereby making sure the cost of their loop body is
// dominant, free of runtime guards and scalar iteration overheads.
@@ -8569,10 +7388,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// an integer loop and the vector instructions selected are purely integer
// vector instructions?
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
- "attribute is used.\n");
- ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NoImplicitFloat", L)
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+ "attribute is used.\n");
+ ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
+ "NoImplicitFloat", L)
<< "loop not vectorized due to NoImplicitFloat attribute");
emitMissedWarning(F, L, Hints, ORE);
return false;
@@ -8584,17 +7403,30 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// additional fp-math flags can help.
if (Hints.isPotentiallyUnsafe() &&
TTI->isFPVectorizationPotentiallyUnsafe()) {
- DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
ORE->emit(
- createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
+ createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
<< "loop not vectorized due to unsafe FP support.");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved) {
+ IAI.analyzeInterleaving();
+ }
+
// Use the cost model.
LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints);
+ &Hints, IAI);
CM.collectValuesToIgnore();
// Use the planner for vectorization.
@@ -8604,8 +7436,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned UserVF = Hints.getWidth();
// Plan how to best vectorize, return the best VF and its cost.
- LoopVectorizationCostModel::VectorizationFactor VF =
- LVP.plan(OptForSize, UserVF);
+ VectorizationFactor VF = LVP.plan(OptForSize, UserVF);
// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
@@ -8617,14 +7448,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
if (Requirements.doesNotMeet(F, L, Hints)) {
- DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg = std::make_pair(
"VectorizationNotBeneficial",
"the cost-model indicates that vectorization is not beneficial");
@@ -8633,7 +7464,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (IC == 1 && UserIC <= 1) {
// Tell the user interleaving is not beneficial.
- DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
IntDiagMsg = std::make_pair(
"InterleavingNotBeneficial",
"the cost-model indicates that interleaving is not beneficial");
@@ -8645,8 +7476,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
} else if (IC > 1 && UserIC == 1) {
// Tell the user interleaving is beneficial, but it explicitly disabled.
- DEBUG(dbgs()
- << "LV: Interleaving is beneficial but is explicitly disabled.");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
IntDiagMsg = std::make_pair(
"InterleavingBeneficialButDisabled",
"the cost-model indicates that interleaving is beneficial "
@@ -8673,24 +7504,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
return false;
} else if (!VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< VecDiagMsg.second;
});
} else if (VectorizeLoop && !InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< IntDiagMsg.second;
});
} else if (VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
LVP.setBestPlan(VF.Width, IC);
@@ -8737,7 +7568,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
- DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
@@ -8785,7 +7616,7 @@ bool LoopVectorizePass::runImpl(
SmallVector<Loop *, 8> Worklist;
for (Loop *L : *LI)
- addAcyclicInnerLoop(*L, Worklist);
+ collectSupportedLoops(*L, LI, ORE, Worklist);
LoopsAnalyzed += Worklist.size();
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f301fc361abc..ac8c4f046c6f 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -161,7 +161,7 @@ static const unsigned MaxMemDepDistance = 160;
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
-/// \brief Predicate for the element types that the SLP vectorizer supports.
+/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
@@ -246,13 +246,15 @@ static bool isSplat(ArrayRef<Value *> VL) {
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
+/// TODO: Can we split off and reuse the shuffle mask detection from
+/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
- enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
+ enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
auto *EI = cast<ExtractElementInst>(VL[I]);
@@ -272,7 +274,11 @@ isShuffle(ArrayRef<Value *> VL) {
continue;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
- if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+ if (!Vec1 || Vec1 == Vec)
+ Vec1 = Vec;
+ else if (!Vec2 || Vec2 == Vec)
+ Vec2 = Vec;
+ else
return None;
if (CommonShuffleMode == Permute)
continue;
@@ -282,119 +288,17 @@ isShuffle(ArrayRef<Value *> VL) {
CommonShuffleMode = Permute;
continue;
}
- // Check the shuffle mode for the current operation.
- if (!Vec1)
- Vec1 = Vec;
- else if (Vec != Vec1)
- Vec2 = Vec;
- // Example: shufflevector A, B, <0,5,2,7>
- // I is odd and IntIdx for A == I - FirstAlternate shuffle.
- // I is even and IntIdx for B == I - FirstAlternate shuffle.
- // Example: shufflevector A, B, <4,1,6,3>
- // I is even and IntIdx for A == I - SecondAlternate shuffle.
- // I is odd and IntIdx for B == I - SecondAlternate shuffle.
- const bool IIsEven = I & 1;
- const bool CurrVecIsA = Vec == Vec1;
- const bool IIsOdd = !IIsEven;
- const bool CurrVecIsB = !CurrVecIsA;
- ShuffleMode CurrentShuffleMode =
- ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
- : SecondAlternate;
- // Common mode is not set or the same as the shuffle mode of the current
- // operation - alternate.
- if (CommonShuffleMode == Unknown)
- CommonShuffleMode = CurrentShuffleMode;
- // Common shuffle mode is not the same as the shuffle mode of the current
- // operation - permutation.
- if (CommonShuffleMode != CurrentShuffleMode)
- CommonShuffleMode = Permute;
+ CommonShuffleMode = Select;
}
// If we're not crossing lanes in different vectors, consider it as blending.
- if ((CommonShuffleMode == FirstAlternate ||
- CommonShuffleMode == SecondAlternate) &&
- Vec2)
- return TargetTransformInfo::SK_Alternate;
+ if (CommonShuffleMode == Select && Vec2)
+ return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector, otherwise
// we have permutation of 2 vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
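Editor's note, not part of the patch: the rewritten isShuffle classifies an extractelement sequence as a select/blend when lane I is always taken from lane I of one of two sources, as a two-source permute when lanes cross, and as a single-source permute when only one vector is used. A reduced sketch over (source id, lane) pairs, with classify and Kind as illustrative stand-ins for the TTI shuffle kinds:

#include <cstdio>
#include <utility>
#include <vector>

// Editor's sketch: shuffle-kind classification reduced to index pairs.
enum Kind { Select, PermuteSingleSrc, PermuteTwoSrc };

static Kind classify(const std::vector<std::pair<int, unsigned>> &Extracts) {
  bool TwoSources = false, Crossing = false;
  int FirstSrc = Extracts[0].first;
  for (unsigned I = 0; I < Extracts.size(); ++I) {
    if (Extracts[I].first != FirstSrc)
      TwoSources = true; // a second vector operand appears
    if (Extracts[I].second != I)
      Crossing = true;   // element does not stay in its lane
  }
  if (!TwoSources)
    return PermuteSingleSrc;
  return Crossing ? PermuteTwoSrc : Select;
}

int main() {
  // <A[0], B[1], A[2], B[3]>: blend of two sources, no lane crossing.
  std::printf("%d\n", classify({{0, 0}, {1, 1}, {0, 2}, {1, 3}})); // Select
  // <A[2], B[1], A[0], B[3]>: lanes move, so it is a two-source permute.
  std::printf("%d\n", classify({{0, 2}, {1, 1}, {0, 0}, {1, 3}}));
  return 0;
}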
-///\returns Opcode that can be clubbed with \p Op to create an alternate
-/// sequence which can later be merged as a ShuffleVector instruction.
-static unsigned getAltOpcode(unsigned Op) {
- switch (Op) {
- case Instruction::FAdd:
- return Instruction::FSub;
- case Instruction::FSub:
- return Instruction::FAdd;
- case Instruction::Add:
- return Instruction::Sub;
- case Instruction::Sub:
- return Instruction::Add;
- default:
- return 0;
- }
-}
-
-static bool isOdd(unsigned Value) {
- return Value & 1;
-}
-
-static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
- unsigned CheckedOpcode) {
- return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
-}
-
-/// Chooses the correct key for scheduling data. If \p Op has the same (or
-/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
-/// OpValue.
-static Value *isOneOf(Value *OpValue, Value *Op) {
- auto *I = dyn_cast<Instruction>(Op);
- if (!I)
- return OpValue;
- auto *OpInst = cast<Instruction>(OpValue);
- unsigned OpInstOpcode = OpInst->getOpcode();
- unsigned IOpcode = I->getOpcode();
- if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
- return Op;
- return OpValue;
-}
-
-namespace {
-
-/// Contains data for the instructions going to be vectorized.
-struct RawInstructionsData {
- /// Main Opcode of the instructions going to be vectorized.
- unsigned Opcode = 0;
-
- /// The list of instructions have some instructions with alternate opcodes.
- bool HasAltOpcodes = false;
-};
-
-} // end anonymous namespace
-
-/// Checks the list of the vectorized instructions \p VL and returns info about
-/// this list.
-static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
- auto *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
- return {};
- RawInstructionsData Res;
- unsigned Opcode = I0->getOpcode();
- // Walk through the list of the vectorized instructions
- // in order to check its structure described by RawInstructionsData.
- for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
- auto *I = dyn_cast<Instruction>(VL[Cnt]);
- if (!I)
- return {};
- if (Opcode != I->getOpcode())
- Res.HasAltOpcodes = true;
- }
- Res.Opcode = Opcode;
- return Res;
-}
-
namespace {
/// Main data required for vectorization of instructions.
@@ -402,42 +306,90 @@ struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
- /// The main opcode for the list of instructions.
- unsigned Opcode = 0;
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
/// Some of the instructions in the list have alternate opcodes.
- bool IsAltShuffle = false;
+ bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+ }
- InstructionsState() = default;
- InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
- : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+ InstructionsState() = delete;
+ InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+ : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.OpValue;
+}
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
- auto Res = getMainOpcode(VL);
- unsigned Opcode = Res.Opcode;
- if (!Res.HasAltOpcodes)
- return InstructionsState(VL[0], Opcode, false);
- auto *OpInst = cast<Instruction>(VL[0]);
- unsigned AltOpcode = getAltOpcode(Opcode);
- // Examine each element in the list instructions VL to determine
- // if some operations there could be considered as an alternative
- // (for example as subtraction relates to addition operation).
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0) {
+ // Make sure these are all Instructions.
+ if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+
+ bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
+ bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+ unsigned AltOpcode = Opcode;
+ unsigned AltIndex = BaseIndex;
+
+ // Check for one alternate opcode from another BinaryOperator.
+ // TODO - generalize to support all operators (types, calls etc.).
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- auto *I = cast<Instruction>(VL[Cnt]);
- unsigned InstOpcode = I->getOpcode();
- if ((Res.HasAltOpcodes &&
- InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
- (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
- return InstructionsState(OpInst, 0, false);
- }
+ unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
+ Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
+ Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ }
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
- return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
+
+ return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+ cast<Instruction>(VL[AltIndex]));
}
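Editor's note, not part of the patch: the rewritten getSameOpcode tolerates at most two distinct opcodes in a bundle; the first value supplies the main opcode, the first differing value supplies the alternate, and any third opcode invalidates the bundle. A reduced sketch over opcode names, where sameOpcode and State are illustrative stand-ins and the real code additionally restricts which instruction kinds may mix:

#include <cstdio>
#include <string>
#include <vector>

// Editor's sketch: main/alternate opcode search reduced to name strings.
struct State { std::string Main, Alt; bool Valid; };

static State sameOpcode(const std::vector<std::string> &Opcodes) {
  std::string Main = Opcodes[0], Alt = Opcodes[0];
  for (const std::string &Op : Opcodes) {
    if (Op == Main || Op == Alt)
      continue;
    if (Main == Alt) { // no alternate chosen yet, adopt this one
      Alt = Op;
      continue;
    }
    return {"", "", false}; // a third opcode invalidates the bundle
  }
  return {Main, Alt, true};
}

int main() {
  State S = sameOpcode({"add", "sub", "add", "sub"});
  std::printf("%s/%s alt=%d\n", S.Main.c_str(), S.Alt.c_str(),
              S.Main != S.Alt);                                   // add/sub alt=1
  std::printf("valid=%d\n", sameOpcode({"add", "sub", "mul"}).Valid); // valid=0
  return 0;
}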
/// \returns true if all of the values in \p VL have the same type or false
@@ -452,16 +404,21 @@ static bool allSameType(ArrayRef<Value *> VL) {
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
- assert(Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue);
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
if (Opcode == Instruction::ExtractElement) {
- ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
- return CI && CI->getZExtValue() == Idx;
- } else {
- ExtractValueInst *EI = cast<ExtractValueInst>(E);
- return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return None;
+ return CI->getZExtValue();
}
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return None;
+ return *EI->idx_begin();
}
/// \returns True if in-tree use also needs extract. This refers to
@@ -549,7 +506,7 @@ public:
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
- /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
@@ -585,8 +542,8 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- NumLoadsWantToKeepOrder = 0;
- NumLoadsWantToChangeOrder = 0;
+ NumOpsWantToKeepOrder.clear();
+ NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -596,12 +553,22 @@ public:
unsigned getTreeSize() const { return VectorizableTree.size(); }
- /// \brief Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence(Function &F);
+ /// Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns The best order of instructions for vectorization.
+ Optional<ArrayRef<unsigned>> bestOrder() const {
+ auto I = std::max_element(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+ [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+ return D1.second < D2.second;
+ });
+ if (I == NumOpsWantToKeepOrder.end() ||
+ I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+ return None;
- /// \returns true if it is beneficial to reverse the vector order.
- bool shouldReorder() const {
- return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ return makeArrayRef(I->getFirst());
}
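Editor's note, not part of the patch: the selection rule behind the new bestOrder(). The most frequently requested order wins only if its count strictly exceeds the number of operations that prefer the original order; otherwise no reordering is reported. A reduced sketch with a plain std::map standing in for the planner's container:

#include <cstdio>
#include <map>
#include <vector>

// Editor's sketch: pick the most requested order, or none at all.
static const std::vector<unsigned> *
bestOrder(const std::map<std::vector<unsigned>, unsigned> &Counts,
          unsigned KeepOriginal) {
  const std::vector<unsigned> *Best = nullptr;
  unsigned BestCount = KeepOriginal; // must be beaten strictly
  for (const auto &Entry : Counts)
    if (Entry.second > BestCount) {
      Best = &Entry.first;
      BestCount = Entry.second;
    }
  return Best;
}

int main() {
  std::map<std::vector<unsigned>, unsigned> Counts = {{{1, 0, 3, 2}, 5}};
  std::printf("%s\n", bestOrder(Counts, 4) ? "reorder" : "keep"); // reorder
  std::printf("%s\n", bestOrder(Counts, 6) ? "reorder" : "keep"); // keep
  return 0;
}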
/// \return The vector element size in bits to use when vectorizing the
@@ -625,7 +592,7 @@ public:
return MinVecRegSize;
}
- /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ /// Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -648,9 +615,13 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
- /// \returns True if the ExtractElement/ExtractValue instructions in VL can
- /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
- bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
+ /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
+ /// returns false, setting \p CurrentOrder to either an empty vector or a
+ /// non-identity permutation that allows to reuse extract instructions.
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -658,22 +629,19 @@ private:
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
- /// \returns the pointer to the vectorized value if \p VL is already
- /// vectorized, or NULL. They may happen in cycles.
- Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
-
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty);
+ int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
- /// \brief Set the Builder insert point to one after the last instruction in
+ /// Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
@@ -684,7 +652,8 @@ private:
/// \reorder commutative operands in alt shuffle if they result in
/// vectorized code.
- void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+ void reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
@@ -698,8 +667,12 @@ private:
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
- assert(VL.size() == Scalars.size() && "Invalid size");
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ if (VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == ReuseShuffleIndices.size() &&
+ std::equal(
+ VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
+ [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
@@ -711,6 +684,12 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather = false;
+ /// Does this sequence require some shuffling?
+ SmallVector<unsigned, 4> ReuseShuffleIndices;
+
+ /// Does this entry require reordering?
+ ArrayRef<unsigned> ReorderIndices;
+
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -725,13 +704,17 @@ private:
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
- int &UserTreeIdx) {
+ void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
+ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
+ ReuseShuffleIndices.end());
+ Last->ReorderIndices = ReorderIndices;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -744,7 +727,6 @@ private:
if (UserTreeIdx >= 0)
Last->UserTreeIndices.push_back(UserTreeIdx);
UserTreeIdx = idx;
- return Last;
}
/// -- Vectorization State --
@@ -758,13 +740,6 @@ private:
return nullptr;
}
- const TreeEntry *getTreeEntry(Value *V) const {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return &VectorizableTree[I->second];
- return nullptr;
- }
-
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
@@ -1038,7 +1013,7 @@ private:
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
- DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
@@ -1061,8 +1036,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
}
@@ -1075,8 +1050,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
- << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
@@ -1101,7 +1076,8 @@ private:
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
- DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *I << "\n");
}
});
}
@@ -1110,7 +1086,8 @@ private:
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1120,7 +1097,7 @@ private:
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V, Value *OpValue);
+ bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -1201,11 +1178,38 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- // Number of load bundles that contain consecutive loads.
- int NumLoadsWantToKeepOrder = 0;
+ using OrdersType = SmallVector<unsigned, 4>;
+ /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+ /// sorted SmallVectors of unsigned.
+ struct OrdersTypeDenseMapInfo {
+ static OrdersType getEmptyKey() {
+ OrdersType V;
+ V.push_back(~1U);
+ return V;
+ }
+
+ static OrdersType getTombstoneKey() {
+ OrdersType V;
+ V.push_back(~2U);
+ return V;
+ }
+
+ static unsigned getHashValue(const OrdersType &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+ return LHS == RHS;
+ }
+ };
- // Number of load bundles that contain consecutive loads in reversed order.
- int NumLoadsWantToChangeOrder = 0;
+ /// Contains orders of operations along with the number of bundles that have
+ /// operations in this order. It stores only those orders that require
+ /// reordering; if reordering is not required, it is counted using \a
+ /// NumOpsWantToKeepOriginalOrder.
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+ /// Number of bundles that do not require reordering.
+ unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
@@ -1242,7 +1246,7 @@ template <> struct GraphTraits<BoUpSLP *> {
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
- /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<ChildIteratorType,
@@ -1340,16 +1344,22 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ int FoundLane = Lane;
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(Entry->ReuseShuffleIndices.begin(),
+ llvm::find(Entry->ReuseShuffleIndices, FoundLane));
+ }
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
- DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.emplace_back(Scalar, nullptr, Lane);
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
@@ -1363,8 +1373,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
- DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
@@ -1374,9 +1384,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (is_contained(UserIgnoreList, UserInst))
continue;
- DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
}
}
}
@@ -1388,28 +1398,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
- DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
- DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1420,8 +1430,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is ephemeral.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is ephemeral.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1429,18 +1439,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
- if (E->Scalars[i] != VL[i]) {
- DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (!E->isSame(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
}
// Record the reuse of the tree node. FIXME: currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
return;
}
@@ -1450,8 +1459,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!I)
continue;
if (getTreeEntry(I)) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is already in tree.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is already in tree.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1461,7 +1470,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
- DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1475,19 +1484,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
- DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Check that every instruction appears once in this bundle.
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- for (unsigned j = i + 1; j < e; ++j)
- if (VL[i] == VL[j]) {
- DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ DenseMap<Value *, unsigned> UniquePositions;
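+ // E.g. for VL = {A, B, A, B} this yields UniqueValues = {A, B} and
+ // ReuseShuffleIndicies = {0, 1, 0, 1}.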
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ if (UniqueValues.size() == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ VL = UniqueValues;
+ }
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
@@ -1495,18 +1517,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
- DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ if (!BS.tryScheduleBundle(VL, this, S)) {
+ LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
- DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+ LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1517,15 +1539,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TerminatorInst *Term = dyn_cast<TerminatorInst>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
- DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -1540,13 +1564,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = canReuseExtract(VL, VL0);
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
- DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
- } else {
- BS.cancelScheduling(VL, VL0);
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
}
- newTreeEntry(VL, Reuse, UserTreeIdx);
+ if (!CurrentOrder.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ auto StoredCurrentOrderAndNum =
+ NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++StoredCurrentOrderAndNum->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
+ StoredCurrentOrderAndNum->getFirst());
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
return;
}
case Instruction::Load: {
@@ -1561,62 +1607,67 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- LoadInst *L = cast<LoadInst>(VL[i]);
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ auto POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
+ *POIter = L->getPointerOperand();
+ ++POIter;
}
- // Check if the loads are consecutive, reversed, or neither.
- // TODO: What we really want is to sort the loads, but for now, check
- // the two likely directions.
- bool Consecutive = true;
- bool ReverseConsecutive = true;
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
- Consecutive = false;
- break;
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
} else {
- ReverseConsecutive = false;
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
}
- }
-
- if (Consecutive) {
- ++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- return;
- }
-
- // If none of the load pairs were consecutive when checked in order,
- // check the reverse order.
- if (ReverseConsecutive)
- for (unsigned i = VL.size() - 1; i > 0; --i)
- if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
- ReverseConsecutive = false;
- break;
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted loads are consecutive.
+ if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+ // Original loads are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ } else {
+ // Need to reorder.
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++I->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies, I->getFirst());
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
+ return;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
-
- if (ReverseConsecutive) {
- ++NumLoadsWantToChangeOrder;
- DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
- } else {
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- }
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
case Instruction::ZExt:
@@ -1636,13 +1687,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1664,14 +1716,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1702,14 +1755,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
+ reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1729,9 +1782,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1742,9 +1795,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1753,16 +1807,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
- DEBUG(
- dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1778,13 +1832,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
@@ -1801,8 +1855,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
@@ -1815,9 +1869,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
+ << "\n");
return;
}
// ctlz, cttz and powi are special intrinsics whose second argument
@@ -1826,10 +1880,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument "<< A1I<<"!=" << A1J
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << A1I << "!=" << A1J << "\n");
return;
}
}
@@ -1839,14 +1892,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
- << *VL[i] << '\n');
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
+ << *CI << "!=" << *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1861,19 +1914,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::ShuffleVector:
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
- if (!S.IsAltShuffle) {
+ if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
+ reorderAltShuffleOperands(S, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1891,8 +1944,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
@@ -1922,15 +1975,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return N;
}
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const {
Instruction *E0 = cast<Instruction>(OpValue);
assert(E0->getOpcode() == Instruction::ExtractElement ||
E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
+ assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
+ CurrentOrder.clear();
+
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
@@ -1950,15 +2006,40 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
return false;
// Check that all of the indices extract from the correct offset.
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- Instruction *Inst = cast<Instruction>(VL[I]);
- if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
- return false;
+ bool ShouldKeepOrder = true;
+ unsigned E = VL.size();
+ // Assign to all items the initial value E + 1 so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E + 1.
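+ // E.g. if lanes 0..3 extract elements {2, 0, 1, 3}, CurrentOrder ends up
+ // as {1, 2, 0, 3} and ShouldKeepOrder is false.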
+ CurrentOrder.assign(E, E + 1);
+ unsigned I = 0;
+ for (; I < E; ++I) {
+ auto *Inst = cast<Instruction>(VL[I]);
if (Inst->getOperand(0) != Vec)
- return false;
+ break;
+ Optional<unsigned> Idx = getExtractIndex(Inst);
+ if (!Idx)
+ break;
+ const unsigned ExtIdx = *Idx;
+ if (ExtIdx != I) {
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ break;
+ ShouldKeepOrder = false;
+ CurrentOrder[ExtIdx] = I;
+ } else {
+ if (CurrentOrder[I] != E + 1)
+ break;
+ CurrentOrder[I] = I;
+ }
+ }
+ if (I < E) {
+ CurrentOrder.clear();
+ return false;
}
- return true;
+ return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
@@ -1984,13 +2065,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ int ReuseShuffleCost = 0;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
- if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
+ if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
+ allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
@@ -2007,37 +2097,86 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
IO->getZExtValue());
}
}
- return Cost;
+ return ReuseShuffleCost + Cost;
}
}
- return getGatherCost(E->Scalars);
+ return ReuseShuffleCost + getGatherCost(VL);
}
InstructionsState S = getSameOpcode(VL);
- assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(S.OpValue);
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
case Instruction::ExtractValue:
case Instruction::ExtractElement:
- if (canReuseExtract(VL, S.OpValue)) {
- int DeadCost = 0;
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(VL[I])->getIndexOperand());
+ Idx = IO->getZExtValue();
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ } else {
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
+ }
+ }
+ Idx = ReuseShuffleNumbers;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Idx = IO->getZExtValue();
+ } else {
+ --Idx;
+ }
+ ReuseShuffleCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
+ if (!E->NeedToGather) {
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, the instruction can be
// considered as dead.
// The same applies if it has only one user; it will be vectorized for sure.
- if (areAllUsersVectorized(E))
+ if (areAllUsersVectorized(E)) {
// Take credit for instruction that will become dead.
- DeadCost +=
+ if (E->hasOneUse()) {
+ Instruction *Ext = E->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, i);
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
+ continue;
+ }
+ }
+ DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
}
- return -DeadCost;
+ return DeadCost;
}
- return getGatherCost(VecTy);
+ return ReuseShuffleCost + getGatherCost(VL);
case Instruction::ZExt:
case Instruction::SExt:
@@ -2052,24 +2191,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
+ int ScalarEltCost =
+ TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
// Calculate the cost of this instruction.
- int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy, VL0);
+ int ScalarCost = VL.size() * ScalarEltCost;
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
+ int VecCost = 0;
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ VecCost = ReuseShuffleCost +
+ TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
+ }
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
+ int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
+ Builder.getInt1Ty(), VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
- int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
- int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
- return VecCost - ScalarCost;
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Add:
case Instruction::FAdd:
@@ -2098,42 +2250,43 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_None;
+ TargetTransformInfo::OP_PowerOf2;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt = nullptr;
- for (unsigned i = 0; i < VL.size(); ++i) {
+ ConstantInt *CInt0 = nullptr;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
- if (!isa<ConstantInt>(I->getOperand(1))) {
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
+ Op2VP = TargetTransformInfo::OP_None;
break;
}
+ if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
+ !CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_None;
if (i == 0) {
- CInt = cast<ConstantInt>(I->getOperand(1));
+ CInt0 = CInt;
continue;
}
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
- CInt != cast<ConstantInt>(I->getOperand(1)))
+ if (CInt0 != CInt)
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
- // FIXME: Currently cost of model modification for division by power of
- // 2 is handled for X86 and AArch64. Add support for other targets.
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
- CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
SmallVector<const Value *, 4> Operands(VL0->operand_values());
- int ScalarCost =
- VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
- Op2VP, Operands);
- int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP, Operands);
- return VecCost - ScalarCost;
+ int ScalarEltCost = TTI->getArithmeticInstrCost(
+ S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
@@ -2141,83 +2294,119 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
- int ScalarCost =
- VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
- return VecCost - ScalarCost;
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
- int ScalarLdCost = VecTy->getNumElements() *
+ unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0, VL0);
- return VecLdCost - ScalarLdCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecLdCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecLdCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ return ReuseShuffleCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
- unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
- int ScalarStCost = VecTy->getNumElements() *
+ unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0, VL0);
- return VecStCost - ScalarStCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecStCost =
+ TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+ return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
+ SmallVector<Type *, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- int ScalarCallCost = VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
SmallVector<Value *, 4> Args(CI->arg_operands());
int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
VecTy->getNumElements());
- DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
- << " (" << VecCallCost << "-" << ScalarCallCost << ")"
- << " for " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
- return VecCallCost - ScalarCallCost;
+ return ReuseShuffleCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
int ScalarCost = 0;
- int VecCost = 0;
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ ReuseShuffleCost -= TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ ReuseShuffleCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ }
for (Value *i : VL) {
Instruction *I = cast<Instruction>(i);
- if (!I)
- break;
- ScalarCost +=
- TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
}
// VecCost is equal to the sum of the cost of creating 2 vectors
// and the cost of creating the shuffle.
- Instruction *I0 = cast<Instruction>(VL[0]);
- VecCost =
- TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
- Instruction *I1 = cast<Instruction>(VL[1]);
- VecCost +=
- TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
- return VecCost - ScalarCost;
+ int VecCost = 0;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
+ VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
+ } else {
+ Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
+ Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
+ VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
+ VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
+ VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
+ }
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
@@ -2225,8 +2414,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
bool BoUpSLP::isFullyVectorizableTinyTree() {
- DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
- VectorizableTree.size() << " is fully vectorizable .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+ << VectorizableTree.size() << " is fully vectorizable .\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
@@ -2296,13 +2485,13 @@ int BoUpSLP::getSpillCost() {
LiveValues.insert(cast<Instruction>(&*J));
}
- DEBUG(
+ LLVM_DEBUG({
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
Inst->dump();
- );
+ });
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
@@ -2314,7 +2503,10 @@ int BoUpSLP::getSpillCost() {
continue;
}
- if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+ // Debug information doesn't impact spill cost.
+ if ((isa<CallInst>(&*PrevInstIt) &&
+ !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ &*PrevInstIt != PrevInst) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(VectorType::get(II->getType(), BundleWidth));
@@ -2332,19 +2524,41 @@ int BoUpSLP::getSpillCost() {
int BoUpSLP::getTreeCost() {
int Cost = 0;
- DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
- VectorizableTree.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+ << VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
- for (TreeEntry &TE : VectorizableTree) {
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = VectorizableTree[I];
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(), [TE](TreeEntry &Entry) {
+ return Entry.NeedToGather && Entry.isSame(TE.Scalars);
+ }))
+ continue;
+
int C = getEntryCost(&TE);
- DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
- << *TE.Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for bundle that starts with " << *TE.Scalars[0]
+ << ".\n");
Cost += C;
}
- SmallSet<Value *, 16> ExtractCostCalculated;
+ SmallPtrSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
@@ -2385,7 +2599,7 @@ int BoUpSLP::getTreeCost() {
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
- DEBUG(dbgs() << Str);
+ LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
@@ -2393,10 +2607,14 @@ int BoUpSLP::getTreeCost() {
return Cost;
}
-int BoUpSLP::getGatherCost(Type *Ty) {
+int BoUpSLP::getGatherCost(Type *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) {
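+  // Duplicated elements listed in ShuffledIndices are not inserted
+  // individually; they are covered by the single permute added below.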
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
- Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.count(i))
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.empty())
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
@@ -2407,7 +2625,17 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
- return getGatherCost(VecTy);
+ // Check if the same elements are inserted several times and count them as
+ // shuffle candidates.
+ DenseSet<unsigned> ShuffledElements;
+ DenseSet<Value *> UniqueElements;
+ // Iterate in reverse order to consider the insert elements with the highest cost.
+ for (unsigned I = VL.size(); I > 0; --I) {
+ unsigned Idx = I - 1;
+ if (!UniqueElements.insert(VL[Idx]).second)
+ ShuffledElements.insert(Idx);
+ }
+ return getGatherCost(VecTy, ShuffledElements);
}
// Reorder commutative operations in alternate shuffle if the resulting vectors
@@ -2419,16 +2647,14 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// load a[3] + load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize this
// code.
-void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
- unsigned AltOpcode = getAltOpcode(Opcode);
- (void)AltOpcode;
for (Value *V : VL) {
auto *I = cast<Instruction>(V);
- assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
- "Incorrect instruction in vector");
+ assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
Left.push_back(I->getOperand(0));
Right.push_back(I->getOperand(1));
}
@@ -2608,7 +2834,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
// add a[1],c[2] load b[1]
// b[2] load b[2]
// add a[3],c[3] load b[3]
- for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
@@ -2629,17 +2855,15 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
}
}
-void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
- auto *Front = cast<Instruction>(OpValue);
+ auto *Front = cast<Instruction>(S.OpValue);
auto *BB = Front->getParent();
- const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
- const unsigned AltOpcode = getAltOpcode(Opcode);
assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
- return !sameOpcodeOrAlt(Opcode, AltOpcode,
- cast<Instruction>(V)->getOpcode()) ||
- cast<Instruction>(V)->getParent() == BB;
+ auto *I = cast<Instruction>(V);
+ return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
}));
// The last instruction in the bundle in program order.
@@ -2651,7 +2875,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
+ BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
@@ -2679,7 +2903,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
+ if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
LastInst = &I;
if (Bundle.empty())
break;
@@ -2705,7 +2929,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
- for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+ for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for ?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
@@ -2713,6 +2937,11 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
}
}
assert(FoundLane >= 0 && "Could not find the correct lane");
+ if (!E->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(E->ReuseShuffleIndices.begin(),
+ llvm::find(E->ReuseShuffleIndices, FoundLane));
+ }
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
}
}
@@ -2721,66 +2950,128 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec;
}
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
- if (const TreeEntry *En = getTreeEntry(OpValue)) {
- if (En->isSame(VL) && En->VectorizedValue)
- return En->VectorizedValue;
- }
- return nullptr;
-}
-
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL);
- if (S.Opcode) {
+ if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- if (E->isSame(VL))
- return vectorizeTree(E);
+ if (E->isSame(VL)) {
+ Value *V = vectorizeTree(E);
+ if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
+ // We need to get the vectorized value but without the shuffle.
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ V = SV->getOperand(0);
+ } else {
+ // Reshuffle to get only unique values.
+ SmallVector<unsigned, 4> UniqueIdxs;
+ SmallSet<unsigned, 4> UsedIdxs;
+ for(unsigned Idx : E->ReuseShuffleIndices)
+ if (UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Idx);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ UniqueIdxs);
+ }
+ }
+ return V;
+ }
}
}
Type *ScalarTy = S.OpValue->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ if (VL.size() > 2) {
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second || isa<Constant>(V))
+ UniqueValues.emplace_back(V);
+ }
+ // Do not shuffle single element or if number of unique values is not power
+ // of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
+ !llvm::isPowerOf2_32(UniqueValues.size()))
+ ReuseShuffleIndicies.clear();
+ else
+ VL = UniqueValues;
+ }
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
- return Gather(VL, VecTy);
+ Value *V = Gather(VL, VecTy);
+ if (!ReuseShuffleIndicies.empty()) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ ReuseShuffleIndicies, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ return V;
+}
+
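A small sketch of the deduplication step above, using std::string values and std::map as stand-ins for LLVM values and DenseMap; the Constant special case is omitted. It shows how the reuse mask and the narrowed bundle are derived before gathering.

#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  // A bundle with repeated scalars, written as strings purely for illustration.
  std::vector<std::string> VL = {"a", "b", "a", "b"};
  std::vector<std::string> UniqueValues;
  std::vector<unsigned> ReuseShuffleIndices;
  std::map<std::string, unsigned> UniquePositions; // stand-in for DenseMap
  for (const std::string &V : VL) {
    auto Res = UniquePositions.emplace(V, UniqueValues.size());
    ReuseShuffleIndices.push_back(Res.first->second);
    if (Res.second)
      UniqueValues.push_back(V);
  }
  // Two unique values out of four, and two is a power of two, so the pass
  // gathers the narrow bundle {a, b} and widens it with the reuse shuffle.
  assert(UniqueValues.size() == 2);
  assert((ReuseShuffleIndices == std::vector<unsigned>{0, 1, 0, 1}));
  return 0;
}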
+static void inversePermutation(ArrayRef<unsigned> Indices,
+ SmallVectorImpl<unsigned> &Mask) {
+ Mask.clear();
+ const unsigned E = Indices.size();
+ Mask.resize(E);
+ for (unsigned I = 0; I < E; ++I)
+ Mask[Indices[I]] = I;
}
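For reference, a standalone rendering of the inverse-permutation helper above, written against std::vector instead of SmallVectorImpl. The key property is Mask[Indices[I]] == I, so applying Indices and then Mask returns every lane to where it started; the sample indices are illustrative only.

#include <cassert>
#include <vector>

// Same computation as inversePermutation above, on std::vector.
static std::vector<unsigned> invert(const std::vector<unsigned> &Indices) {
  std::vector<unsigned> Mask(Indices.size());
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
  return Mask;
}

int main() {
  std::vector<unsigned> Mask = invert({2, 0, 1});
  // Mask[2] = 0, Mask[0] = 1, Mask[1] = 2.
  assert((Mask == std::vector<unsigned>{1, 2, 0}));
  return 0;
}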
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
- DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
InstructionsState S = getSameOpcode(E->Scalars);
- Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ Instruction *VL0 = cast<Instruction>(S.OpValue);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+
if (E->NeedToGather) {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- E->VectorizedValue = NewPhi;
+ Value *V = NewPhi;
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
- SmallSet<BasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -2803,32 +3094,74 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
- return NewPhi;
+ return V;
}
case Instruction::ExtractElement: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
Value *V = VL0->getOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (!E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
return V;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
- E->VectorizedValue = V;
- return propagateMetadata(V, E->Scalars);
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(
+ NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = NewV;
+ return NewV;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
@@ -2848,15 +3181,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *InVec = vectorizeTree(INVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2869,23 +3208,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
- if (S.Opcode == Instruction::FCmp)
+ if (S.getOpcode() == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
@@ -2897,16 +3242,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *Cond = vectorizeTree(CondVec);
Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateSelect(Cond, True, False);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2931,7 +3282,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
- reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
+ reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL,
RHSVL);
else
for (Value *V : E->Scalars) {
@@ -2940,29 +3291,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSVL.push_back(I->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- setInsertPointAfterBundle(E->Scalars, VL0);
+ bool IsReorder = !E->ReorderIndices.empty();
+ if (IsReorder) {
+ S = getSameOpcode(E->Scalars, E->ReorderIndices.front());
+ VL0 = cast<Instruction>(S.OpValue);
+ }
+ setInsertPointAfterBundle(E->Scalars, S);
LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
@@ -2984,9 +3346,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
- E->VectorizedValue = LI;
+ Value *V = propagateMetadata(LI, E->Scalars);
+ if (IsReorder) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ Mask, "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(LI, E->Scalars);
+ return V;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
@@ -2997,12 +3371,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *VecValue = vectorizeTree(ScalarStoreValues);
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
- StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+ StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
@@ -3013,13 +3387,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
- S->setAlignment(Alignment);
- E->VectorizedValue = S;
+ ST->setAlignment(Alignment);
+ Value *V = propagateMetadata(ST, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(S, E->Scalars);
+ return V;
}
case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
ValueList Op0VL;
for (Value *V : E->Scalars)
@@ -3040,17 +3419,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
@@ -3074,7 +3457,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *OpVec = vectorizeTree(OpVL);
- DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -3092,58 +3475,87 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- assert(Instruction::isBinaryOp(S.Opcode) &&
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
- reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
- setInsertPointAfterBundle(E->Scalars, VL0);
-
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ Value *LHS, *RHS;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(LHSVL);
+ RHS = vectorizeTree(RHSVL);
+ } else {
+ ValueList INVL;
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(INVL);
+ }
- // Create a vector of LHS op1 RHS
- Value *V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
- unsigned AltOpcode = getAltOpcode(S.Opcode);
- // Create a vector of LHS op2 RHS
- Value *V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS);
+ } else {
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy);
+ }
// Create shuffle to take alternate operations from the vector.
- // Also, gather up odd and even scalar ops to propagate IR flags to
+ // Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
- ValueList OddScalars, EvenScalars;
+ ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (isOdd(i)) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == S.getAltOpcode()) {
Mask[i] = Builder.getInt32(e + i);
- OddScalars.push_back(E->Scalars[i]);
+ AltScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
- EvenScalars.push_back(E->Scalars[i]);
+ OpScalars.push_back(E->Scalars[i]);
}
}
Value *ShuffleMask = ConstantVector::get(Mask);
- propagateIRFlags(V0, EvenScalars);
- propagateIRFlags(V1, OddScalars);
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
return V;
}
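A worked example of the blend mask built in the ShuffleVector case above, under the assumption of a four-wide add/sub bundle with alternating opcodes: indices below the vector width select from V0 (the main-opcode vector), indices at e + i select from V1 (the alternate-opcode vector).

#include <cassert>
#include <vector>

int main() {
  // 'true' marks lanes whose scalar uses the alternate opcode (e.g. sub).
  std::vector<bool> IsAltLane = {false, true, false, true};
  const unsigned e = IsAltLane.size();
  std::vector<unsigned> Mask(e);
  for (unsigned i = 0; i < e; ++i)
    Mask[i] = IsAltLane[i] ? e + i : i; // e + i picks lane i of V1, i picks V0
  // shufflevector(V0, V1, <0, 5, 2, 7>) takes add lanes from V0, sub from V1.
  assert((Mask == std::vector<unsigned>{0, 5, 2, 7}));
  return 0;
}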
@@ -3182,7 +3594,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
VectorizableTree[0].VectorizedValue = Trunc;
}
- DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+ << " values .\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
@@ -3259,7 +3672,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
- }
+ }
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
@@ -3268,7 +3681,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
User->replaceUsesOfWith(Scalar, Ex);
}
- DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
// For each vectorized value:
@@ -3289,7 +3702,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
if (!Ty->isVoidTy()) {
#ifndef NDEBUG
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to replace users in the ignorelist by undef.
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
@@ -3299,7 +3712,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Value *Undef = UndefValue::get(Ty);
Scalar->replaceAllUsesWith(Undef);
}
- DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
}
@@ -3309,18 +3722,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
return VectorizableTree[0].VectorizedValue;
}
-void BoUpSLP::optimizeGatherSequence(Function &F) {
- DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
- << " gather sequences instructions.\n");
+void BoUpSLP::optimizeGatherSequence() {
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *it : GatherSeq) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
-
- if (!Insert)
+ for (Instruction *I : GatherSeq) {
+ if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
continue;
// Check if this block is inside a loop.
- Loop *L = LI->getLoopFor(Insert->getParent());
+ Loop *L = LI->getLoopFor(I->getParent());
if (!L)
continue;
@@ -3332,27 +3743,41 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
- Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
- Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
- if (CurrVec && L->contains(CurrVec))
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (Op0 && L->contains(Op0))
continue;
- if (NewElem && L->contains(NewElem))
+ if (Op1 && L->contains(Op1))
continue;
// We can hoist this instruction. Move it to the pre-header.
- Insert->moveBefore(PreHeader->getTerminator());
+ I->moveBefore(PreHeader->getTerminator());
}
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
+
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (auto BB : RPOT) {
- // Traverse CSEBlocks by RPOT order.
- if (!CSEBlocks.count(BB))
- continue;
-
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
@@ -3383,8 +3808,9 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
- BoUpSLP *SLP, Value *OpValue) {
- if (isa<PHINode>(OpValue))
+ BoUpSLP *SLP,
+ const InstructionsState &S) {
+ if (isa<PHINode>(S.OpValue))
return true;
// Initialize the instruction bundle.
@@ -3392,12 +3818,12 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
- DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (!extendSchedulingRegion(V, OpValue))
+ if (!extendSchedulingRegion(V, S))
return false;
}
@@ -3409,8 +3835,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
- DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
ReSchedule = true;
}
assert(BundleMember->isSchedulingEntity() &&
@@ -3445,8 +3871,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
initialFillReadyList(ReadyInsts);
}
- DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
@@ -3464,7 +3890,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
}
}
if (!Bundle->isReady()) {
- cancelScheduling(VL, OpValue);
+ cancelScheduling(VL, S.OpValue);
return false;
}
return true;
@@ -3476,7 +3902,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
return;
ScheduleData *Bundle = getScheduleData(OpValue);
- DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
@@ -3507,13 +3933,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
- Value *OpValue) {
- if (getScheduleData(V, isOneOf(OpValue, V)))
+ const InstructionsState &S) {
+ if (getScheduleData(V, isOneOf(S, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
- auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
+ auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
@@ -3521,8 +3947,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
"ScheduleData not in scheduling region");
ScheduleData *SD = allocateScheduleDataChunks();
SD->Inst = I;
- SD->init(SchedulingRegionID, OpValue);
- ExtraScheduleDataMap[I][OpValue] = SD;
+ SD->init(SchedulingRegionID, S.OpValue);
+ ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
};
if (CheckSheduleForI(I))
@@ -3532,10 +3958,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
}
// Search up and down at the same time, because we don't know if the new
@@ -3547,7 +3973,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
BasicBlock::iterator LowerEnd = BB->end();
while (true) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
- DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
@@ -3555,9 +3981,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
- DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
return true;
}
UpIter++;
@@ -3567,10 +3994,11 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
+ << "\n");
return true;
}
DownIter++;
@@ -3634,7 +4062,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
assert(isInSchedulingRegion(BundleMember));
if (!BundleMember->hasValidDependencies()) {
- DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
@@ -3726,7 +4155,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
+ break;
DistToSrc++;
}
}
@@ -3735,7 +4164,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
- DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+ << "\n");
}
}
}
@@ -3758,7 +4188,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
- DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
BS->resetSchedule();
@@ -4024,7 +4454,11 @@ void BoUpSLP::computeMinimumValueSizes() {
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ llvm::all_of(TreeRoot, [](Value *R) {
+ assert(R->hasOneUse() && "Root should have only one use!");
+ return isa<GetElementPtrInst>(R->user_back());
+ })) {
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
@@ -4187,7 +4621,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
- DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
@@ -4202,8 +4636,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Vectorize trees that end at stores.
if (!Stores.empty()) {
- DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
}
@@ -4214,21 +4648,21 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
- DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
}
}
if (Changed) {
- R.optimizeGatherSequence(F);
- DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- DEBUG(verifyFunction(F));
+ R.optimizeGatherSequence();
+ LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ LLVM_DEBUG(verifyFunction(F));
}
return Changed;
}
-/// \brief Check that the Values in the slice in VL array are still existent in
+/// Check that the Values in the slice in VL array are still existent in
/// the WeakTrackingVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakTrackingVH
@@ -4243,30 +4677,28 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
- unsigned ChainLen = Chain.size();
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
- << "\n");
- unsigned Sz = R.getVectorElementSize(Chain[0]);
- unsigned VF = VecRegSize / Sz;
+ const unsigned ChainLen = Chain.size();
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ const unsigned Sz = R.getVectorElementSize(Chain[0]);
+ const unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
// Keep track of values that were deleted by vectorizing in the loop below.
- SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
+ const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
- for (unsigned i = 0, e = ChainLen; i < e; ++i) {
- if (i + VF > e)
- break;
+ for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
@@ -4277,9 +4709,10 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
int Cost = R.getTreeCost();
- DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+ << "\n");
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
using namespace ore;
@@ -4416,64 +4849,48 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, true);
+ return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool AllowReorder) {
+ int UserCost, bool AllowReorder) {
if (VL.size() < 2)
return false;
- DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+ << VL.size() << ".\n");
- // Check that all of the parts are scalar instructions of the same type.
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
+ // Check that all of the parts are scalar instructions of the same type,
+ // we permit an alternate opcode via InstructionsState.
+ InstructionsState S = getSameOpcode(VL);
+ if (!S.getOpcode())
return false;
- unsigned Opcode0 = I0->getOpcode();
-
+ Instruction *I0 = cast<Instruction>(S.OpValue);
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
}
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
- // NOTE: the following will give user internal llvm type name, which may not be useful
- R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
- Ty->print(rso);
- return OptimizationRemarkMissed(
- SV_NAME, "UnsupportedType", I0)
- << "Cannot SLP vectorize list: type "
- << rso.str() + " is unsupported by vectorizer";
- });
- return false;
- }
- Instruction *Inst = dyn_cast<Instruction>(V);
-
- if (!Inst)
- return false;
- if (Inst->getOpcode() != Opcode0) {
+ // NOTE: the following will give user internal llvm type name, which may
+ // not be useful.
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "InequableTypes", I0)
- << "Cannot SLP vectorize list: not all of the "
- << "parts of scalar instructions are of the same type: "
- << ore::NV("Instruction1Opcode", I0) << " and "
- << ore::NV("Instruction2Opcode", Inst);
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
});
return false;
}
@@ -4510,13 +4927,15 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
R.buildTree(Ops);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: check if we can allow reordering for more cases.
- if (AllowReorder && R.shouldReorder()) {
+ if (AllowReorder && Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here when
@@ -4529,12 +4948,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ int Cost = R.getTreeCost() - UserCost;
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
cast<Instruction>(Ops[0]))
<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
@@ -4552,18 +4971,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!Changed && CandidateFound) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotBeneficial", I0)
- << "List vectorization was possible but not beneficial with cost "
- << ore::NV("Cost", MinCost) << " >= "
- << ore::NV("Treshold", -SLPCostThreshold);
+ return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
});
} else if (!Changed) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotPossible", I0)
- << "Cannot SLP vectorize list: vectorization was impossible"
- << " with available vectorization factors";
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
});
}
return Changed;
@@ -4612,7 +5029,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
return false;
}
-/// \brief Generate a shuffle mask to be used in a reduction tree.
+/// Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
@@ -5095,6 +5512,77 @@ class HorizontalReduction {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else {
+ // Try harder: look for min/max pattern based on instructions producing
+ // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+ // During the intermediate stages of SLP, it's very common to have
+ // pattern like this (since optimizeGatherSequence is run only once
+ // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+
+ LHS = Select->getTrueValue();
+ RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
+
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return OperationData(V);
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return OperationData(V);
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ }
+ switch (Pred) {
+ default:
+ return OperationData(V);
+
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Cond)->hasNoNaNs());
+
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Cond)->hasNoNaNs());
+ }
}
}
return OperationData(V);
@@ -5103,7 +5591,7 @@ class HorizontalReduction {
public:
HorizontalReduction() = default;
- /// \brief Try to find a reduction tree.
+ /// Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
@@ -5131,6 +5619,8 @@ public:
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
+ if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+ return false;
ReducedValueData.clear();
ReductionRoot = B;
@@ -5229,7 +5719,7 @@ public:
return true;
}
- /// \brief Attempt to vectorize the tree found by
+ /// Attempt to vectorize the tree found by
/// matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
@@ -5262,9 +5752,14 @@ public:
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- if (V.shouldReorder()) {
- SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
+ Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+ // TODO: Handle orders of size less than number of elements in the vector.
+ if (Order && Order->size() == VL.size()) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(VL.size());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
+ V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
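A minimal STL rendering of the reordering step a few lines above, with made-up value names: the order returned by bestOrder is applied by indexing the original reduced values, and the tree is then rebuilt on the reordered operands.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  // Hypothetical reduced values and a best order reported for them.
  std::vector<std::string> VL = {"v0", "v1", "v2", "v3"};
  std::vector<unsigned> Order = {2, 0, 3, 1};
  std::vector<std::string> ReorderedOps(VL.size());
  // Same transform as above: position I of the new bundle takes VL[Order[I]].
  std::transform(Order.begin(), Order.end(), ReorderedOps.begin(),
                 [&VL](unsigned Idx) { return VL[Idx]; });
  assert((ReorderedOps ==
          std::vector<std::string>{"v2", "v0", "v3", "v1"}));
  return 0;
}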
@@ -5272,8 +5767,9 @@ public:
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost =
- V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int TreeCost = V.getTreeCost();
+ int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int Cost = TreeCost + ReductionCost;
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(
@@ -5286,8 +5782,8 @@ public:
break;
}
- DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
- << ". (HorRdx)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
return OptimizationRemark(
SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
@@ -5349,7 +5845,7 @@ public:
}
private:
- /// \brief Calculate the cost of a reduction.
+ /// Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
@@ -5408,16 +5904,16 @@ private:
}
ScalarReduxCost *= (ReduxWidth - 1);
- DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
- << " for reduction that starts with " << *FirstReducedVal
- << " (It is a "
- << (IsPairwiseReduction ? "pairwise" : "splitting")
- << " reduction)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
return VecReduxCost - ScalarReduxCost;
}
- /// \brief Emit a horizontal reduction of the vectorized value.
+ /// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
@@ -5453,7 +5949,7 @@ private:
} // end anonymous namespace
-/// \brief Recognize construction of vectors like
+/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
@@ -5462,9 +5958,17 @@ private:
///
/// Returns true if it matches
static bool findBuildVector(InsertElementInst *LastInsertElem,
- SmallVectorImpl<Value *> &BuildVectorOpds) {
+ TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ int &UserCost) {
+ UserCost = 0;
Value *V = nullptr;
do {
+ if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
+ UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+ LastInsertElem->getType(),
+ CI->getZExtValue());
+ }
BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
V = LastInsertElem->getOperand(0);
if (isa<UndefValue>(V))
@@ -5477,7 +5981,7 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
return true;
}
-/// \brief Like findBuildVector, but looks for construction of aggregate.
+/// Like findBuildVector, but looks for construction of aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
@@ -5500,7 +6004,7 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
-/// \brief Try and get a reduction value from a phi node.
+/// Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
@@ -5513,9 +6017,8 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
- return (
- dyn_cast<Instruction>(R) &&
- DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
Value *Rdx = nullptr;
@@ -5585,7 +6088,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
- SmallSet<Value *, 8> VisitedInstrs;
+ SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
Value *V;
@@ -5671,7 +6174,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!findBuildAggregate(IVI, BuildVectorOpds))
return false;
- DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
return tryToVectorizeList(BuildVectorOpds, R);
@@ -5679,13 +6182,17 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
+ int UserCost;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(IEI, BuildVectorOpds))
+ if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
return false;
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R);
+ return tryToVectorizeList(BuildVectorOpds, R, UserCost);
}
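To illustrate how the new UserCost parameter interacts with the cost model, a short arithmetic sketch with assumed numbers: findBuildVector accumulates the TTI cost of each insertelement it walks, and tryToVectorizeList subtracts that sum from the tree cost, the apparent intent being that the insertelement chain is no longer needed once its operands are vectorized. The figures below are illustrative, not measured.

#include <cassert>

int main() {
  int UserCost = 4 * 1;     // four insertelement users at an assumed 1 unit each
  int TreeCost = 2;         // assumed R.getTreeCost() for the gathered operands
  int SLPCostThreshold = 0; // threshold assumed at its default of 0
  int Cost = TreeCost - UserCost; // as computed in tryToVectorizeList
  // Cost = -2 < -SLPCostThreshold, so the build vector is worth vectorizing
  // even though the tree on its own was not profitable.
  assert(Cost < -SLPCostThreshold);
  return 0;
}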
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5722,7 +6229,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
- SmallSet<Value *, 16> VisitedInstrs;
+ SmallPtrSet<Value *, 16> VisitedInstrs;
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -5757,14 +6264,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
- DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+ << NumElts << ")\n");
// The order in which the phi nodes appear in the program does not matter.
// So allow tryToVectorizeList to reorder them if it is beneficial. This
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
- if (NumElts > 1 &&
- tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
+ if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+ /*UserCost=*/0, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -5844,7 +6352,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
isa<InsertValueInst>(it))
PostProcessInstructions.push_back(&*it);
-
}
return Changed;
@@ -5858,8 +6365,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
if (Entry.second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
- << Entry.second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
// We process the getelementptr list in chunks of 16 (like we do for
// stores) to minimize compile-time.
@@ -5941,14 +6448,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (it->second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
- << it->second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
// Process the stores in chunks of 16.
// TODO: The limit of 16 inhibits greater vectorization factors.
// For example, AVX2 supports v32i8. Increasing this limit, however,
// may cause a significant compile-time increase.
- for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
new file mode 100644
index 000000000000..f43a8bb123b1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -0,0 +1,131 @@
+//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class TargetTransformInfo;
+class TargetLibraryInfo;
+
+/// Helper class to create VPRecipes from IR instructions.
+class VPRecipeBuilder {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ VPBuilder &Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+public:
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Check if \p I belongs to an Interleave Group within the given VF \p Range,
+ /// \return true in the first returned value if so and false otherwise.
+ /// Build a new VPInterleaveRecipe if \p I is the primary member of an IG
+ /// for \p Range.Start, and provide it as the second returned value.
+ /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
+ /// \return value is <true, nullptr>, as it is handled by another recipe.
+ /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+ /// to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+ /// Check if \p I is a memory instruction to be widened for \p Range.Start and
+ /// potentially masked. Such instructions are handled by a recipe that takes
+ /// an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *
+ tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+
+ /// Check if an induction recipe should be constructed for \p I within the given
+ /// VF \p Range. If so build and return it. If not, return null. \p Range.End
+ /// may be decreased to ensure same decision from \p Range.Start to
+ /// \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+ /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+ /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+ /// extended to include \p I or else build a new VPWidenRecipe for it and
+ /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
+ /// false otherwise. Range.End may be decreased to ensure same decision from
+ /// \p Range.Start to \p Range.End.
+ bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+ Builder(Builder) {}
+
+ /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, it adds it to \p VPBB.
+ bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan,
+ VPBasicBlock *VPBB);
+
+ /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+};
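+
+// Hypothetical driver sketch (not the planner's exact code; names are
+// illustrative): the planner walks the instructions of each basic block and
+// asks the builder for a recipe, falling back to replication when no widening
+// recipe applies:
+//
+//   for (Instruction &I : *BB) {
+//     if (RecipeBuilder.tryToCreateRecipe(&I, Range, Plan, VPBB))
+//       continue;
+//     VPBB = RecipeBuilder.handleReplication(&I, Range, VPBB, PredInst2Recipe,
+//                                            Plan);
+//   }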
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4e54fc6db2a5..f7b07b722bb1 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -116,7 +116,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PrevBB = CFG.PrevBB;
BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
PrevBB->getParent(), CFG.LastBB);
- DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
// Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -125,7 +125,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
- DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
if (isa<UnreachableInst>(PredBBTerminator)) {
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
@@ -175,8 +175,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
}
// 2. Fill the IR basic block with IR instructions.
- DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
- << " in BB:" << NewBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << NewBB->getName() << '\n');
State->CFG.VPBB2IRBB[this] = NewBB;
State->CFG.PrevVPBB = this;
@@ -184,7 +184,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
for (VPRecipeBase &Recipe : Recipes)
Recipe.execute(*State);
- DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+ LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
}
void VPRegionBlock::execute(VPTransformState *State) {
@@ -193,7 +193,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
if (!isReplicator()) {
// Visit the VPBlocks connected to "this", starting from it.
for (VPBlockBase *Block : RPOT) {
- DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
return;
@@ -210,7 +210,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Instance->Lane = Lane;
// Visit the VPBlocks connected to \p this, starting from it.
for (VPBlockBase *Block : RPOT) {
- DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
}
@@ -220,6 +220,15 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Instance.reset();
}
+void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+}
+
+iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
+ return getParent()->getRecipeList().erase(getIterator());
+}
+
void VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilder<> &Builder = State.Builder;
@@ -356,7 +365,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
"One successor of a basic block does not lead to the other.");
assert(InterimSucc->getSinglePredecessor() &&
"Interim successor has more than one predecessor.");
- assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 &&
+ assert(pred_size(PostDomSucc) == 2 &&
"PostDom successor has more than two predecessors.");
DT->addNewBlock(InterimSucc, BB);
DT->addNewBlock(PostDomSucc, BB);
@@ -448,6 +457,18 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
bumpIndent(1);
for (const VPRecipeBase &Recipe : *BasicBlock)
Recipe.print(OS, Indent);
+
+ // Dump the condition bit.
+ const VPValue *CBV = BasicBlock->getCondBit();
+ if (CBV) {
+ OS << " +\n" << Indent << " \"CondBit: ";
+ if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
+ CBI->printAsOperand(OS);
+ OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
+ } else
+ CBV->printAsOperand(OS);
+ }
+
bumpIndent(-2);
OS << "\n" << Indent << "]\n";
dumpEdges(BasicBlock);
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm/lib/Transforms/Vectorize/VPlan.h
index 2ccabfd6af25..866951cb79a4 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -30,6 +30,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
@@ -42,15 +43,10 @@
#include <map>
#include <string>
-// The (re)use of existing LoopVectorize classes is subject to future VPlan
-// refactoring.
-namespace {
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-} // namespace
-
namespace llvm {
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
@@ -60,6 +56,20 @@ class raw_ostream;
class Value;
class VPBasicBlock;
class VPRegionBlock;
+class VPlan;
+
+/// A range of powers-of-2 vectorization factors with fixed start and
+/// adjustable end. The range includes start and excludes end, e.g.,:
+/// [1, 9) = {1, 2, 4, 8}
+struct VFRange {
+ // A power of 2.
+ const unsigned Start;
+
+ // Need not be a power of 2. If End <= Start range is empty.
+ unsigned End;
+};
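+
+// A minimal sketch (illustrative only; the function name is hypothetical) of
+// how "Range.End may be decreased to ensure same decision from Range.Start to
+// Range.End" is typically achieved: evaluate a predicate for each power-of-2
+// VF and clamp End at the first VF whose decision differs from Start's.
+//
+//   bool clampVFRange(function_ref<bool(unsigned)> Predicate, VFRange &Range) {
+//     bool DecisionAtStart = Predicate(Range.Start);
+//     for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
+//       if (Predicate(VF) != DecisionAtStart) {
+//         Range.End = VF; // All VFs in [Start, End) now share one decision.
+//         break;
+//       }
+//     return DecisionAtStart;
+//   }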
+
+using VPlanPtr = std::unique_ptr<VPlan>;
/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
@@ -311,6 +321,8 @@ struct VPTransformState {
/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
class VPBlockBase {
+ friend class VPBlockUtils;
+
private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -327,6 +339,9 @@ private:
/// List of successor blocks.
SmallVector<VPBlockBase *, 1> Successors;
+ /// Successor selector, null for zero or single successor blocks.
+ VPValue *CondBit = nullptr;
+
/// Add \p Successor as the last successor to this block.
void appendSuccessor(VPBlockBase *Successor) {
assert(Successor && "Cannot add nullptr successor!");
@@ -377,6 +392,7 @@ public:
/// for any other purpose, as the values may change as LLVM evolves.
unsigned getVPBlockID() const { return SubclassID; }
+ VPRegionBlock *getParent() { return Parent; }
const VPRegionBlock *getParent() const { return Parent; }
void setParent(VPRegionBlock *P) { Parent = P; }
@@ -411,6 +427,9 @@ public:
return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
}
+ size_t getNumSuccessors() const { return Successors.size(); }
+ size_t getNumPredecessors() const { return Predecessors.size(); }
+
/// An Enclosing Block of a block B is any block containing B, including B
/// itself. \return the closest enclosing block starting from "this", which
/// has successors. \return the root enclosing block if all enclosing blocks
@@ -454,34 +473,41 @@ public:
return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
}
- /// Sets a given VPBlockBase \p Successor as the single successor and \return
- /// \p Successor. The parent of this Block is copied to be the parent of
- /// \p Successor.
- VPBlockBase *setOneSuccessor(VPBlockBase *Successor) {
+ /// \return the condition bit selecting the successor.
+ VPValue *getCondBit() { return CondBit; }
+
+ const VPValue *getCondBit() const { return CondBit; }
+
+ void setCondBit(VPValue *CV) { CondBit = CV; }
+
+ /// Set a given VPBlockBase \p Successor as the single successor of this
+ /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
+ /// This VPBlockBase must have no successors.
+ void setOneSuccessor(VPBlockBase *Successor) {
assert(Successors.empty() && "Setting one successor when others exist.");
appendSuccessor(Successor);
- Successor->appendPredecessor(this);
- Successor->Parent = Parent;
- return Successor;
}
- /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two
- /// successors. The parent of this Block is copied to be the parent of both
- /// \p IfTrue and \p IfFalse.
- void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
+ /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors of this VPBlockBase. \p Condition is set as the successor
+ /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
+ /// IfFalse. This VPBlockBase must have no successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition) {
assert(Successors.empty() && "Setting two successors when others exist.");
+ assert(Condition && "Setting two successors without condition!");
+ CondBit = Condition;
appendSuccessor(IfTrue);
appendSuccessor(IfFalse);
- IfTrue->appendPredecessor(this);
- IfFalse->appendPredecessor(this);
- IfTrue->Parent = Parent;
- IfFalse->Parent = Parent;
}
- void disconnectSuccessor(VPBlockBase *Successor) {
- assert(Successor && "Successor to disconnect is null.");
- removeSuccessor(Successor);
- Successor->removePredecessor(this);
+ /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
+ /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
+ /// as successor of any VPBasicBlock in \p NewPreds.
+ void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
+ assert(Predecessors.empty() && "Block predecessors already set.");
+ for (auto *Pred : NewPreds)
+ appendPredecessor(Pred);
}
/// The method which generates the output IR that correspond to this
@@ -539,6 +565,15 @@ public:
/// Each recipe prints itself.
virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+
+ /// Insert an unlinked recipe into a basic block immediately before
+ /// the specified recipe.
+ void insertBefore(VPRecipeBase *InsertPos);
+
+ /// This method unlinks 'this' from the containing basic block and deletes it.
+ ///
+ /// \returns an iterator pointing to the element after the erased one
+ iplist<VPRecipeBase>::iterator eraseFromParent();
};
/// This is a concrete Recipe that models a single VPlan-level instruction.
@@ -546,6 +581,8 @@ public:
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPUser, public VPRecipeBase {
+ friend class VPlanHCFGTransforms;
+
public:
/// VPlan opcodes, extending LLVM IR with idiomatic instructions.
enum { Not = Instruction::OtherOpsEnd + 1 };
@@ -559,10 +596,13 @@ private:
void generateInstruction(VPTransformState &State, unsigned Part);
public:
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
: VPUser(VPValue::VPInstructionSC, Operands),
VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPInstructionSC;
@@ -907,7 +947,10 @@ public:
inline const VPRecipeBase &back() const { return Recipes.back(); }
inline VPRecipeBase &back() { return Recipes.back(); }
- /// \brief Returns a pointer to a member of the recipe list.
+ /// Returns a reference to the list of recipes.
+ RecipeListTy &getRecipeList() { return Recipes; }
+
+ /// Returns a pointer to a member of the recipe list.
static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
return &VPBasicBlock::Recipes;
}
@@ -968,6 +1011,9 @@ public:
Entry->setParent(this);
Exit->setParent(this);
}
+ VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
+ IsReplicator(IsReplicator) {}
~VPRegionBlock() override {
if (Entry)
@@ -982,9 +1028,27 @@ public:
const VPBlockBase *getEntry() const { return Entry; }
VPBlockBase *getEntry() { return Entry; }
+ /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+ /// EntryBlock must have no predecessors.
+ void setEntry(VPBlockBase *EntryBlock) {
+ assert(EntryBlock->getPredecessors().empty() &&
+ "Entry block cannot have predecessors.");
+ Entry = EntryBlock;
+ EntryBlock->setParent(this);
+ }
+
const VPBlockBase *getExit() const { return Exit; }
VPBlockBase *getExit() { return Exit; }
+ /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
+ /// ExitBlock must have no successors.
+ void setExit(VPBlockBase *ExitBlock) {
+ assert(ExitBlock->getSuccessors().empty() &&
+ "Exit block cannot have successors.");
+ Exit = ExitBlock;
+ ExitBlock->setParent(this);
+ }
+
/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
bool isReplicator() const { return IsReplicator; }
@@ -1012,6 +1076,13 @@ private:
/// Holds the name of the VPlan, for printing.
std::string Name;
+ /// Holds all the external definitions created for this VPlan.
+ // TODO: Introduce a specific representation for external definitions in
+ // VPlan. External definitions must be immutable and hold a pointer to its
+ // underlying IR that will be used to implement its structural comparison
+ // (operators '==' and '<').
+ SmallPtrSet<VPValue *, 16> VPExternalDefs;
+
/// Holds a mapping between Values and their corresponding VPValue inside
/// VPlan.
Value2VPValueTy Value2VPValue;
@@ -1024,6 +1095,8 @@ public:
VPBlockBase::deleteCFG(Entry);
for (auto &MapEntry : Value2VPValue)
delete MapEntry.second;
+ for (VPValue *Def : VPExternalDefs)
+ delete Def;
}
/// Generate the IR code for this VPlan.
@@ -1042,6 +1115,12 @@ public:
void setName(const Twine &newName) { Name = newName.str(); }
+ /// Add \p VPVal to the pool of external definitions if it's not already
+ /// in the pool.
+ void addExternalDef(VPValue *VPVal) {
+ VPExternalDefs.insert(VPVal);
+ }
+
void addVPValue(Value *V) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
@@ -1189,6 +1268,72 @@ template <> struct GraphTraits<Inverse<VPBlockBase *>> {
}
};
+//===----------------------------------------------------------------------===//
+// VPlan Utilities
+//===----------------------------------------------------------------------===//
+
+/// Class that provides utilities for VPBlockBases in VPlan.
+class VPBlockUtils {
+public:
+ VPBlockUtils() = delete;
+
+ /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
+ /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
+ /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
+ /// has more than one successor, its conditional bit is propagated to \p
+ /// NewBlock. \p NewBlock must have neither successors nor predecessors.
+ static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
+ assert(NewBlock->getSuccessors().empty() &&
+ "Can't insert new block with successors.");
+ // TODO: move successors from BlockPtr to NewBlock when this functionality
+ // is necessary. For now, setOneSuccessor will assert if BlockPtr
+ // already has successors.
+ BlockPtr->setOneSuccessor(NewBlock);
+ NewBlock->setPredecessors({BlockPtr});
+ NewBlock->setParent(BlockPtr->getParent());
+ }
+
+ /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
+ /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
+ /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
+ /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
+ /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
+ /// must have neither successors nor predecessors.
+ static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition, VPBlockBase *BlockPtr) {
+ assert(IfTrue->getSuccessors().empty() &&
+ "Can't insert IfTrue with successors.");
+ assert(IfFalse->getSuccessors().empty() &&
+ "Can't insert IfFalse with successors.");
+ BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
+ IfTrue->setPredecessors({BlockPtr});
+ IfFalse->setPredecessors({BlockPtr});
+ IfTrue->setParent(BlockPtr->getParent());
+ IfFalse->setParent(BlockPtr->getParent());
+ }
+
+ /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
+ /// the successors of \p From and \p From to the predecessors of \p To. Both
+ /// VPBlockBases must have the same parent, which can be null. Both
+ /// VPBlockBases can be already connected to other VPBlockBases.
+ static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert((From->getParent() == To->getParent()) &&
+ "Can't connect two block with different parents");
+ assert(From->getNumSuccessors() < 2 &&
+ "Blocks can't have more than two successors.");
+ From->appendSuccessor(To);
+ To->appendPredecessor(From);
+ }
+
+ /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
+ /// from the successors of \p From and \p From from the predecessors of \p To.
+ static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert(To && "Successor to disconnect is null.");
+ From->removeSuccessor(To);
+ To->removePredecessor(From);
+ }
+};
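+
+// Hypothetical usage sketch (block and value names are illustrative, and Pred
+// is assumed to have no successors yet): carving an if/then/else diamond below
+// an existing block Pred with the utilities above.
+//
+//   VPBasicBlock *Then = new VPBasicBlock("if.then");
+//   VPBasicBlock *Else = new VPBasicBlock("if.else");
+//   VPBasicBlock *Merge = new VPBasicBlock("if.end");
+//   Merge->setParent(Pred->getParent()); // connectBlocks requires equal parents.
+//   VPBlockUtils::insertTwoBlocksAfter(Then, Else, CondBit, Pred);
+//   VPBlockUtils::connectBlocks(Then, Merge);
+//   VPBlockUtils::connectBlocks(Else, Merge);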
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanBuilder.h b/contrib/llvm/lib/Transforms/Vectorize/VPlanBuilder.h
deleted file mode 100644
index d6eb3397d044..000000000000
--- a/contrib/llvm/lib/Transforms/Vectorize/VPlanBuilder.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===- VPlanBuilder.h - A VPlan utility for constructing VPInstructions ---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides a VPlan-based builder utility analogous to IRBuilder.
-/// It provides an instruction-level API for generating VPInstructions while
-/// abstracting away the Recipe manipulation details.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
-
-#include "VPlan.h"
-
-namespace llvm {
-
-class VPBuilder {
-private:
- VPBasicBlock *BB = nullptr;
- VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
-
- VPInstruction *createInstruction(unsigned Opcode,
- std::initializer_list<VPValue *> Operands) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands);
- BB->insert(Instr, InsertPt);
- return Instr;
- }
-
-public:
- VPBuilder() {}
-
- /// \brief This specifies that created VPInstructions should be appended to
- /// the end of the specified block.
- void setInsertPoint(VPBasicBlock *TheBB) {
- assert(TheBB && "Attempting to set a null insert point");
- BB = TheBB;
- InsertPt = BB->end();
- }
-
- VPValue *createNot(VPValue *Operand) {
- return createInstruction(VPInstruction::Not, {Operand});
- }
-
- VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
- }
-
- VPValue *createOr(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
- }
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
new file mode 100644
index 000000000000..08129b74cddf
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -0,0 +1,336 @@
+//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the construction of a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR. This construction comprises the following
+/// components and steps:
+//
+/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
+/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
+/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
+/// in the plain CFG.
+/// NOTE: At this point, there is a direct correspondence between all the
+/// VPBasicBlocks created for the initial plain CFG and the incoming
+/// BasicBlocks. However, this might change in the future.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "llvm/Analysis/LoopIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+namespace {
+// Class that is used to build the plain CFG for the incoming IR.
+class PlainCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // Vectorization plan that we are working on.
+ VPlan &Plan;
+
+ // Output Top Region.
+ VPRegionBlock *TopRegion = nullptr;
+
+ // Builder of the VPlan instruction-level representation.
+ VPBuilder VPIRBuilder;
+
+ // NOTE: The following maps are intentionally destroyed after the plain CFG
+ // construction because subsequent VPlan-to-VPlan transformations may
+ // invalidate them.
+ // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
+ DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+ // Map incoming Value definitions to their newly-created VPValues.
+ DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+ // Holds phi nodes that need to be fixed once the plain CFG has been built.
+ SmallVector<PHINode *, 8> PhisToFix;
+
+ // Utility functions.
+ void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void fixPhiNodes();
+ VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+ bool isExternalDef(Value *Val);
+ VPValue *getOrCreateVPOperand(Value *IRVal);
+ void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ // Build the plain CFG and return its Top Region.
+ VPRegionBlock *buildPlainCFG();
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+ SmallVector<VPBlockBase *, 8> VPBBPreds;
+ // Collect VPBB predecessors.
+ for (BasicBlock *Pred : predecessors(BB))
+ VPBBPreds.push_back(getOrCreateVPBB(Pred));
+
+ VPBB->setPredecessors(VPBBPreds);
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixPhiNodes() {
+ for (auto *Phi : PhisToFix) {
+ assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+ VPValue *VPVal = IRDef2VPValue[Phi];
+ assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
+ auto *VPPhi = cast<VPInstruction>(VPVal);
+ assert(VPPhi->getNumOperands() == 0 &&
+ "Expected VPInstruction with no operands.");
+
+ for (Value *Op : Phi->operands())
+ VPPhi->addOperand(getOrCreateVPOperand(Op));
+ }
+}
+
+// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
+// existing one if it was already created.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+ auto BlockIt = BB2VPBB.find(BB);
+ if (BlockIt != BB2VPBB.end())
+ // Retrieve existing VPBB.
+ return BlockIt->second;
+
+ // Create new VPBB.
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ BB2VPBB[BB] = VPBB;
+ VPBB->setParent(TopRegion);
+ return VPBB;
+}
+
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) the outermost loop PH, and c) the
+// outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+ // All the Values that are not Instructions are considered external
+ // definitions for now.
+ Instruction *Inst = dyn_cast<Instruction>(Val);
+ if (!Inst)
+ return true;
+
+ BasicBlock *InstParent = Inst->getParent();
+ assert(InstParent && "Expected instruction parent.");
+
+ // Check whether Instruction definition is in loop PH.
+ BasicBlock *PH = TheLoop->getLoopPreheader();
+ assert(PH && "Expected loop pre-header.");
+
+ if (InstParent == PH)
+ // Instruction definition is in outermost loop PH.
+ return false;
+
+ // Check whether Instruction definition is in the loop exit.
+ BasicBlock *Exit = TheLoop->getUniqueExitBlock();
+ assert(Exit && "Expected loop with single exit.");
+ if (InstParent == Exit) {
+ // Instruction definition is in outermost loop exit.
+ return false;
+ }
+
+ // Check whether Instruction definition is in loop body.
+ return !TheLoop->contains(Inst);
+}
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstructions. For
+// the latter, please look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+ auto VPValIt = IRDef2VPValue.find(IRVal);
+ if (VPValIt != IRDef2VPValue.end())
+ // Operand has an associated VPInstruction or VPValue that was previously
+ // created.
+ return VPValIt->second;
+
+ // Operand doesn't have a previously created VPInstruction/VPValue. This
+ // means that operand is:
+ // A) a definition external to VPlan,
+ // B) any other Value without specific representation in VPlan.
+ // For now, we use VPValue to represent A and B and classify both as external
+ // definitions. We may introduce specific VPValue subclasses for them in the
+ // future.
+ assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+ // A and B: Create VPValue and add it to the pool of external definitions and
+ // to the Value->VPValue map.
+ VPValue *NewVPVal = new VPValue(IRVal);
+ Plan.addExternalDef(NewVPVal);
+ IRDef2VPValue[IRVal] = NewVPVal;
+ return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ VPIRBuilder.setInsertPoint(VPBB);
+ for (Instruction &InstRef : *BB) {
+ Instruction *Inst = &InstRef;
+
+ // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+ // visited Inst when we shouldn't, breaking the RPO traversal order.
+ assert(!IRDef2VPValue.count(Inst) &&
+ "Instruction shouldn't have been visited.");
+
+ if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+ // Branch instruction is not explicitly represented in VPlan but we need
+ // to represent its condition bit when it's conditional.
+ if (Br->isConditional())
+ getOrCreateVPOperand(Br->getCondition());
+
+ // Skip the rest of the Instruction processing for Branch instructions.
+ continue;
+ }
+
+ VPInstruction *NewVPInst;
+ if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+ // Phi node's operands may not have been visited at this point. We create
+ // an empty VPInstruction that we will fix once the whole plain CFG has
+ // been built.
+ NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+ Inst->getOpcode(), {} /*No operands*/, Inst));
+ PhisToFix.push_back(Phi);
+ } else {
+ // Translate LLVM-IR operands into VPValue operands and set them in the
+ // new VPInstruction.
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : Inst->operands())
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+
+ // Build VPInstruction for any arbitrary Instruction without specific
+ // representation in VPlan.
+ NewVPInst = cast<VPInstruction>(
+ VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+ }
+
+ IRDef2VPValue[Inst] = NewVPInst;
+ }
+}
+
+// Main interface to build the plain CFG.
+VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Create the Top Region. It will be the parent of all VPBBs.
+ TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
+
+ // 2. Scan the body of the loop in a topological order to visit each basic
+ // block after having visited its predecessor basic blocks. Create a VPBB for
+ // each BB and link it to its successor and predecessor VPBBs. Note that
+ // predecessors must be set in the same order as they are in the incoming IR.
+ // Otherwise, there might be problems with existing phi nodes and algorithms
+ // based on predecessor traversal.
+
+ // Loop PH needs to be explicitly visited since it's not taken into account by
+ // LoopBlocksDFS.
+ BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
+ assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
+ createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
+ // Create empty VPBB for Loop H so that we can link PH->H.
+ VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
+ // The header's predecessors will be set during the loop RPO traversal below.
+ PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+
+ LoopBlocksRPO RPO(TheLoop);
+ RPO.perform(LI);
+
+ for (BasicBlock *BB : RPO) {
+ // Create or retrieve the VPBasicBlock for this BB and create its
+ // VPInstructions.
+ VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ createVPInstructionsForVPBB(VPBB, BB);
+
+ // Set VPBB successors. We create empty VPBBs for successors if they don't
+ // exist already. Recipes will be created when the successor is visited
+ // during the RPO traversal.
+ TerminatorInst *TI = BB->getTerminator();
+ assert(TI && "Terminator expected.");
+ unsigned NumSuccs = TI->getNumSuccessors();
+
+ if (NumSuccs == 1) {
+ VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB && "VPBB Successor not found.");
+ VPBB->setOneSuccessor(SuccVPBB);
+ } else if (NumSuccs == 2) {
+ VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB0 && "Successor 0 not found.");
+ VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
+ assert(SuccVPBB1 && "Successor 1 not found.");
+
+ // Get VPBB's condition bit.
+ assert(isa<BranchInst>(TI) && "Unsupported terminator!");
+ auto *Br = cast<BranchInst>(TI);
+ Value *BrCond = Br->getCondition();
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.count(BrCond) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPValue *VPCondBit = IRDef2VPValue[BrCond];
+
+ // Link successors using condition bit.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ } else
+ llvm_unreachable("Number of successors not supported.");
+
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ setVPBBPredsFromBB(VPBB, BB);
+ }
+
+ // 3. Process outermost loop exit. We created an empty VPBB for the loop
+ // single exit BB during the RPO traversal of the loop body but Instructions
+ // weren't visited because it's not part of the loop.
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
+ createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
+ // Loop exit was already set as successor of the loop exiting BB.
+ // We only set its predecessor VPBB now.
+ setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+
+ // 4. The whole CFG has been built at this point so all the input Values must
+ // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
+ // VPlan operands.
+ fixPhiNodes();
+
+ // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
+ // Top Region entry and exit.
+ TopRegion->setEntry(PreheaderVPBB);
+ TopRegion->setExit(LoopExitVPBB);
+ return TopRegion;
+}
+
+// Public interface to build a H-CFG.
+void VPlanHCFGBuilder::buildHierarchicalCFG(VPlan &Plan) {
+ // Build Top Region enclosing the plain CFG and set it as VPlan entry.
+ PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
+ VPRegionBlock *TopRegion = PCFGBuilder.buildPlainCFG();
+ Plan.setEntry(TopRegion);
+ LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
+
+ Verifier.verifyHierarchicalCFG(TopRegion);
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
new file mode 100644
index 000000000000..c4e69843615a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -0,0 +1,55 @@
+//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanHCFGBuilder class which contains the public
+/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR.
+///
+/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
+/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
+/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
+/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
+/// other than the Top Region will have a parent VPRegionBlock and allows us
+/// to easily add more nodes before/after the main vector loop (such as the
+/// reduction epilogue).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+
+#include "VPlan.h"
+#include "VPlanVerifier.h"
+
+namespace llvm {
+
+class Loop;
+
+/// Main class to build the VPlan H-CFG for an incoming IR.
+class VPlanHCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // VPlan verifier utility.
+ VPlanVerifier Verifier;
+
+public:
+ VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI) : TheLoop(Lp), LI(LI) {}
+
+ /// Build H-CFG for TheLoop and update \p Plan accordingly.
+ void buildHierarchicalCFG(VPlan &Plan);
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
new file mode 100644
index 000000000000..e3cbab077e61
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -0,0 +1,73 @@
+//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a set of utility VPlan to VPlan transformations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGTransforms.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+void VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
+ VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+ for (VPBlockBase *Base : RPOT) {
+ // Do not widen instructions in pre-header and exit blocks.
+ if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
+ continue;
+
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ VPRecipeBase *LastRecipe = nullptr;
+ // Introduce each ingredient into VPlan.
+ for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
+ VPRecipeBase *Ingredient = &*I++;
+ // Can only handle VPInstructions.
+ VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
+ Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ if (DeadInstructions.count(Inst)) {
+ Ingredient->eraseFromParent();
+ continue;
+ }
+
+ VPRecipeBase *NewRecipe = nullptr;
+ // Create VPWidenMemoryInstructionRecipe for loads and stores.
+ if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/);
+ else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+ InductionDescriptor II = Inductions->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
+ } else
+ NewRecipe = new VPWidenPHIRecipe(Phi);
+ } else {
+ // If the last recipe is a VPWidenRecipe, add Inst to it instead of
+ // creating a new recipe.
+ if (VPWidenRecipe *WidenRecipe =
+ dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) {
+ WidenRecipe->appendInstruction(Inst);
+ Ingredient->eraseFromParent();
+ continue;
+ }
+ NewRecipe = new VPWidenRecipe(Inst);
+ }
+
+ NewRecipe->insertBefore(Ingredient);
+ LastRecipe = NewRecipe;
+ Ingredient->eraseFromParent();
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
new file mode 100644
index 000000000000..ae549c6871b3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
@@ -0,0 +1,36 @@
+//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility VPlan to VPlan transformations.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+
+#include "VPlan.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+
+namespace llvm {
+
+class VPlanHCFGTransforms {
+
+public:
+ /// Replaces the VPInstructions in \p Plan with corresponding
+ /// widen recipes.
+ static void VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 50966891e0eb..08f142915b49 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -37,13 +37,34 @@ class VPUser;
// coming from the input IR, instructions which VPlan will generate if executed
// and live-outs which the VPlan will need to fix accordingly.
class VPValue {
+ friend class VPBuilder;
private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
SmallVector<VPUser *, 1> Users;
protected:
- VPValue(const unsigned char SC) : SubclassID(SC) {}
+ // Hold the underlying Value, if any, attached to this VPValue.
+ Value *UnderlyingVal;
+
+ VPValue(const unsigned char SC, Value *UV = nullptr)
+ : SubclassID(SC), UnderlyingVal(UV) {}
+
+ // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
+ // the front-end and back-end of VPlan so that the middle-end is as
+ // independent as possible of the underlying IR. We grant access to the
+ // underlying IR using friendship. In that way, we should be able to use VPlan
+ // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
+ // back-end and analysis information for the new IR.
+
+ /// Return the underlying Value attached to this VPValue.
+ Value *getUnderlyingValue() { return UnderlyingVal; }
+
+ // Set \p Val as the underlying Value of this VPValue.
+ void setUnderlyingValue(Value *Val) {
+ assert(!UnderlyingVal && "Underlying Value is already set.");
+ UnderlyingVal = Val;
+ }
public:
/// An enumeration for keeping track of the concrete subclass of VPValue that
@@ -52,7 +73,7 @@ public:
/// type identification.
enum { VPValueSC, VPUserSC, VPInstructionSC };
- VPValue() : SubclassID(VPValueSC) {}
+ VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {}
VPValue(const VPValue &) = delete;
VPValue &operator=(const VPValue &) = delete;
@@ -94,11 +115,6 @@ class VPUser : public VPValue {
private:
SmallVector<VPValue *, 2> Operands;
- void addOperand(VPValue *Operand) {
- Operands.push_back(Operand);
- Operand->addUser(*this);
- }
-
protected:
VPUser(const unsigned char SC) : VPValue(SC) {}
VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) {
@@ -120,6 +136,11 @@ public:
V->getVPValueID() <= VPInstructionSC;
}
+ void addOperand(VPValue *Operand) {
+ Operands.push_back(Operand);
+ Operand->addUser(*this);
+ }
+
unsigned getNumOperands() const { return Operands.size(); }
inline VPValue *getOperand(unsigned N) const {
assert(N < Operands.size() && "Operand index out of bounds");
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
new file mode 100644
index 000000000000..054bed4e177f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -0,0 +1,133 @@
+//===-- VPlanVerifier.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class VPlanVerifier, which contains utility functions
+/// to check the consistency and invariants of a VPlan.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanVerifier.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
+ cl::Hidden,
+ cl::desc("Verify VPlan H-CFG."));
+
+#ifndef NDEBUG
+/// Utility function that checks whether \p VPBlockVec has duplicate
+/// VPBlockBases.
+static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
+ SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
+ for (const auto *Block : VPBlockVec) {
+ if (VPBlockSet.count(Block))
+ return true;
+ VPBlockSet.insert(Block);
+ }
+ return false;
+}
+#endif
+
+/// Helper function that verifies the CFG invariants of the VPBlockBases within
+/// \p Region. Checks in this function are generic for VPBlockBases. They are
+/// not specific for VPBasicBlocks or VPRegionBlocks.
+static void verifyBlocksInRegion(const VPRegionBlock *Region) {
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ // Check block's parent.
+ assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+
+ // Check block's condition bit.
+ if (VPB->getNumSuccessors() > 1)
+ assert(VPB->getCondBit() && "Missing condition bit!");
+ else
+ assert(!VPB->getCondBit() && "Unexpected condition bit!");
+
+ // Check block's successors.
+ const auto &Successors = VPB->getSuccessors();
+ // There must be only one instance of a successor in block's successor list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Successors) &&
+ "Multiple instances of the same successor.");
+
+ for (const VPBlockBase *Succ : Successors) {
+ // There must be a bi-directional link between block and successor.
+ const auto &SuccPreds = Succ->getPredecessors();
+ assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) !=
+ SuccPreds.end() &&
+ "Missing predecessor link.");
+ (void)SuccPreds;
+ }
+
+ // Check block's predecessors.
+ const auto &Predecessors = VPB->getPredecessors();
+ // There must be only one instance of a predecessor in block's predecessor
+ // list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Predecessors) &&
+ "Multiple instances of the same predecessor.");
+
+ for (const VPBlockBase *Pred : Predecessors) {
+ // Block and predecessor must be inside the same region.
+ assert(Pred->getParent() == VPB->getParent() &&
+ "Predecessor is not in the same region.");
+
+ // There must be a bi-directional link between block and predecessor.
+ const auto &PredSuccs = Pred->getSuccessors();
+ assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) !=
+ PredSuccs.end() &&
+ "Missing successor link.");
+ (void)PredSuccs;
+ }
+ }
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+static void verifyRegion(const VPRegionBlock *Region) {
+ const VPBlockBase *Entry = Region->getEntry();
+ const VPBlockBase *Exit = Region->getExit();
+
+ // Entry and Exit shouldn't have any predecessor/successor, respectively.
+ assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
+ assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+ (void)Entry;
+ (void)Exit;
+
+ verifyBlocksInRegion(Region);
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Recurse inside nested VPRegionBlocks.
+static void verifyRegionRec(const VPRegionBlock *Region) {
+ verifyRegion(Region);
+
+ // Recurse inside nested regions.
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
+ verifyRegionRec(SubRegion);
+ }
+}
+
+void VPlanVerifier::verifyHierarchicalCFG(
+ const VPRegionBlock *TopRegion) const {
+ if (!EnableHCFGVerifier)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
+ assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
+ verifyRegionRec(TopRegion);
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
new file mode 100644
index 000000000000..d2f99d006a66
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -0,0 +1,44 @@
+//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the class VPlanVerifier, which contains utility functions
+/// to check the consistency of a VPlan. This includes the following kinds of
+/// invariants:
+///
+/// 1. Region/Block invariants:
+/// - Region's entry/exit block must have no predecessors/successors,
+/// respectively.
+/// - Block's parent must be the region immediately containing the block.
+/// - Linked blocks must have a bi-directional link (successor/predecessor).
+/// - All predecessors/successors of a block must belong to the same region.
+/// - Blocks must have no duplicated successor/predecessor.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+
+#include "VPlan.h"
+
+namespace llvm {
+
+/// Class with utility functions that can be used to check the consistency and
+/// invariants of a VPlan, including the components of its H-CFG.
+class VPlanVerifier {
+public:
+ /// Verify the invariants of the H-CFG starting from \p TopRegion. The
+ /// verification process comprises the following steps:
+ /// 1. Region/Block verification: Check the Region/Block verification
+ /// invariants for every region in the H-CFG.
+ void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+};
+} // namespace llvm
+
+#endif //LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index b04905bfc6fa..f62a88558328 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -34,10 +34,6 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
initializeVectorization(*unwrap(R));
}
-// DEPRECATED: Remove after the LLVM 5 release.
-void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
-}
-
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopVectorizePass());
}
diff --git a/contrib/llvm/lib/XRay/Trace.cpp b/contrib/llvm/lib/XRay/Trace.cpp
index d1fcf1c35b36..a8764b25483c 100644
--- a/contrib/llvm/lib/XRay/Trace.cpp
+++ b/contrib/llvm/lib/XRay/Trace.cpp
@@ -48,7 +48,8 @@ Error readBinaryFormatHeader(StringRef Data, XRayFileHeader &FileHeader) {
FileHeader.NonstopTSC = Bitfield & 1uL << 1;
FileHeader.CycleFrequency = HeaderExtractor.getU64(&OffsetPtr);
std::memcpy(&FileHeader.FreeFormData, Data.bytes_begin() + OffsetPtr, 16);
- if (FileHeader.Version != 1 && FileHeader.Version != 2)
+ if (FileHeader.Version != 1 && FileHeader.Version != 2 &&
+ FileHeader.Version != 3)
return make_error<StringError>(
Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version),
std::make_error_code(std::errc::invalid_argument));
@@ -78,7 +79,8 @@ Error loadNaiveFormatLog(StringRef Data, XRayFileHeader &FileHeader,
// (4) sint32 : function id
// (8) uint64 : tsc
// (4) uint32 : thread id
- // (12) - : padding
+ // (4) uint32 : process id
+ // (8) - : padding
for (auto S = Data.drop_front(32); !S.empty(); S = S.drop_front(32)) {
DataExtractor RecordExtractor(S, true, 8);
uint32_t OffsetPtr = 0;
@@ -110,6 +112,7 @@ Error loadNaiveFormatLog(StringRef Data, XRayFileHeader &FileHeader,
Record.FuncId = RecordExtractor.getSigned(&OffsetPtr, sizeof(int32_t));
Record.TSC = RecordExtractor.getU64(&OffsetPtr);
Record.TId = RecordExtractor.getU32(&OffsetPtr);
+ Record.PId = RecordExtractor.getU32(&OffsetPtr);
break;
}
case 1: { // Arg payload record.
@@ -118,14 +121,18 @@ Error loadNaiveFormatLog(StringRef Data, XRayFileHeader &FileHeader,
OffsetPtr += 2;
int32_t FuncId = RecordExtractor.getSigned(&OffsetPtr, sizeof(int32_t));
auto TId = RecordExtractor.getU32(&OffsetPtr);
- if (Record.FuncId != FuncId || Record.TId != TId)
+ auto PId = RecordExtractor.getU32(&OffsetPtr);
+
+ // Only check the Pid field for version 3 and above.
+ if (Record.FuncId != FuncId || Record.TId != TId ||
+ (FileHeader.Version >= 3 ? Record.PId != PId : false))
return make_error<StringError>(
- Twine("Corrupted log, found payload following non-matching "
- "function + thread record. Record for ") +
- Twine(Record.FuncId) + " != " + Twine(FuncId),
+ Twine("Corrupted log, found arg payload following non-matching "
+ "function + thread record. Record for function ") +
+ Twine(Record.FuncId) + " != " + Twine(FuncId) + "; offset: " +
+ Twine(S.data() - Data.data()),
std::make_error_code(std::errc::executable_format_error));
- // Advance another four bytes to avoid padding.
- OffsetPtr += 4;
+
auto Arg = RecordExtractor.getU64(&OffsetPtr);
Record.CallArgs.push_back(Arg);
break;
@@ -147,6 +154,7 @@ Error loadNaiveFormatLog(StringRef Data, XRayFileHeader &FileHeader,
struct FDRState {
uint16_t CPUId;
uint16_t ThreadId;
+ int32_t ProcessId;
uint64_t BaseTSC;
/// Encode some of the state transitions for the FDR log reader as explicit
@@ -160,6 +168,7 @@ struct FDRState {
CUSTOM_EVENT_DATA,
CALL_ARGUMENT,
BUFFER_EXTENTS,
+ PID_RECORD,
};
Token Expects;
@@ -187,6 +196,8 @@ const char *fdrStateToTwine(const FDRState::Token &state) {
return "CALL_ARGUMENT";
case FDRState::Token::BUFFER_EXTENTS:
return "BUFFER_EXTENTS";
+ case FDRState::Token::PID_RECORD:
+ return "PID_RECORD";
}
return "UNKNOWN";
}
@@ -267,6 +278,23 @@ Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
return Error::success();
}
+/// State transition when a PidRecord is encountered.
+Error processFDRPidRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+
+ if (State.Expects != FDRState::Token::PID_RECORD)
+ return make_error<StringError>(
+ Twine("Malformed log. Read Pid record kind out of sequence; "
+ "expected: ") +
+ fdrStateToTwine(State.Expects),
+ std::make_error_code(std::errc::executable_format_error));
+
+ uint32_t OffsetPtr = 1; // Read starting after the first byte.
+ State.ProcessId = RecordExtractor.getU32(&OffsetPtr);
+ State.Expects = FDRState::Token::NEW_CPU_ID_RECORD;
+ return Error::success();
+}
+
/// State transition when a CustomEventMarker is encountered.
Error processCustomEventMarker(FDRState &State, uint8_t RecordFirstByte,
DataExtractor &RecordExtractor,
@@ -324,6 +352,9 @@ Error processFDRCallArgumentRecord(FDRState &State, uint8_t RecordFirstByte,
/// Beginning with Version 2 of the FDR log, we do not depend on the size of the
/// buffer, but rather use the extents to determine how far to read in the log
/// for this particular buffer.
+///
+/// In Version 3, the FDR log includes a pid metadata record after the
+/// WallTimeMarker.
Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
DataExtractor &RecordExtractor,
size_t &RecordSize,
@@ -360,6 +391,9 @@ Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
if (auto E =
processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
return E;
+ // In Version 3 and above, a PidRecord is expected after WallTimeRecord.
+ if (Version >= 3)
+ State.Expects = FDRState::Token::PID_RECORD;
break;
case 5: // CustomEventMarker
if (auto E = processCustomEventMarker(State, RecordFirstByte,
@@ -375,6 +409,10 @@ Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
if (auto E = processBufferExtents(State, RecordFirstByte, RecordExtractor))
return E;
break;
+ case 9: // Pid
+ if (auto E = processFDRPidRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
default:
// Widen the record type to uint16_t to prevent conversion to char.
return make_error<StringError>(
@@ -404,6 +442,10 @@ Error processFDRFunctionRecord(FDRState &State, uint8_t RecordFirstByte,
return make_error<StringError>(
"Malformed log. Received Function Record when expecting wallclock.",
std::make_error_code(std::errc::executable_format_error));
+ case FDRState::Token::PID_RECORD:
+ return make_error<StringError>(
+ "Malformed log. Received Function Record when expecting pid.",
+ std::make_error_code(std::errc::executable_format_error));
case FDRState::Token::NEW_CPU_ID_RECORD:
return make_error<StringError>(
"Malformed log. Received Function Record before first CPU record.",
@@ -433,6 +475,7 @@ Error processFDRFunctionRecord(FDRState &State, uint8_t RecordFirstByte,
}
Record.CPU = State.CPUId;
Record.TId = State.ThreadId;
+ Record.PId = State.ProcessId;
// Back up to read first 32 bits, including the 4 we pulled RecordType
// and RecordKind out of. The remaining 28 are FunctionId.
uint32_t OffsetPtr = 0;
@@ -476,6 +519,7 @@ Error processFDRFunctionRecord(FDRState &State, uint8_t RecordFirstByte,
/// BufSize: 8 byte unsigned integer indicating how large the buffer is.
/// NewBuffer: 16 byte metadata record with Thread Id.
/// WallClockTime: 16 byte metadata record with human readable time.
+/// Pid: 16 byte metadata record with the process id.
/// NewCPUId: 16 byte metadata record with CPUId and a 64 bit TSC reading.
/// EOB: 16 byte record in a thread buffer plus mem garbage to fill BufSize.
/// FunctionSequence: NewCPUId | TSCWrap | FunctionRecord
@@ -489,6 +533,11 @@ Error processFDRFunctionRecord(FDRState &State, uint8_t RecordFirstByte,
/// BufferExtents: 16 byte metadata record describing how many usable bytes are
/// in the buffer. This is measured from the start of the buffer
/// and must always be at least 48 (bytes).
+///
+/// In Version 3, we make the following changes:
+///
+/// ThreadBuffer: BufferExtents NewBuffer WallClockTime Pid NewCPUId
+/// FunctionSequence
/// EOB: *deprecated*
Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
std::vector<XRayRecord> &Records) {
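
Putting the grammar comment and the state transitions above together, the leading records of a version-3 thread buffer are expected in this order (an informal summary, not text from the patch):

    // Version-3 FDR thread buffer, leading records:
    //   BufferExtents  - initial expectation for v2/v3 logs (see loadFDRLog)
    //   NewBuffer      - thread id
    //   WallClockTime  - after this, the reader now expects PID_RECORD
    //   Pid            - handled by processFDRPidRecord; next expects NewCPUId
    //   NewCPUId       - CPU id plus TSC, then the function/TSC records follow
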
@@ -522,6 +571,7 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
InitialExpectation = FDRState::Token::NEW_BUFFER_RECORD_OR_EOF;
break;
case 2:
+ case 3:
InitialExpectation = FDRState::Token::BUFFER_EXTENTS;
break;
default:
@@ -529,7 +579,7 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
Twine("Unsupported version '") + Twine(FileHeader.Version) + "'",
std::make_error_code(std::errc::executable_format_error));
}
- FDRState State{0, 0, 0, InitialExpectation, BufferSize, 0};
+ FDRState State{0, 0, 0, 0, InitialExpectation, BufferSize, 0};
// RecordSize will tell the loop how far to seek ahead based on the record
// type that we have just read.
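
The extra zero in the braced initializer above is the new ProcessId member; mapping positions to fields (the last two member names are inferred from their later uses in the loop, since their declarations are not shown in this hunk):

    // FDRState State{0, 0, 0, 0, InitialExpectation, BufferSize, 0};
    //   CPUId = 0, ThreadId = 0, ProcessId = 0, BaseTSC = 0,
    //   Expects = InitialExpectation,
    //   CurrentBufferSize = BufferSize, CurrentBufferConsumed = 0
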
@@ -571,7 +621,7 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
if (!isBufferExtents)
State.CurrentBufferConsumed += RecordSize;
assert(State.CurrentBufferConsumed <= State.CurrentBufferSize);
- if (FileHeader.Version == 2 &&
+ if ((FileHeader.Version == 2 || FileHeader.Version == 3) &&
State.CurrentBufferSize == State.CurrentBufferConsumed) {
// In Version 2 of the log, we don't need to scan to the end of the thread
// buffer if we've already consumed all the bytes we need to.
@@ -620,8 +670,8 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
Records.clear();
std::transform(Trace.Records.begin(), Trace.Records.end(),
std::back_inserter(Records), [&](const YAMLXRayRecord &R) {
- return XRayRecord{R.RecordType, R.CPU, R.Type, R.FuncId,
- R.TSC, R.TId, R.CallArgs};
+ return XRayRecord{R.RecordType, R.CPU, R.Type, R.FuncId,
+ R.TSC, R.TId, R.PId, R.CallArgs};
});
return Error::success();
}
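
For reference, the braced initializer in the transform above fixes the order in which the YAML path now fills XRayRecord; member names are taken from the initializer, not from a header declaration:

    // XRayRecord fields as populated by loadYAMLLog in this patch:
    //   RecordType, CPU, Type, FuncId, TSC, TId, PId, CallArgs
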
@@ -680,7 +730,7 @@ Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
Trace T;
switch (Type) {
case NAIVE_FORMAT:
- if (Version == 1 || Version == 2) {
+ if (Version == 1 || Version == 2 || Version == 3) {
if (auto E = loadNaiveFormatLog(Data, T.FileHeader, T.Records))
return std::move(E);
} else {
@@ -691,7 +741,7 @@ Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
}
break;
case FLIGHT_DATA_RECORDER_FORMAT:
- if (Version == 1 || Version == 2) {
+ if (Version == 1 || Version == 2 || Version == 3) {
if (auto E = loadFDRLog(Data, T.FileHeader, T.Records))
return std::move(E);
} else {
diff --git a/contrib/llvm/tools/bugpoint/BugDriver.cpp b/contrib/llvm/tools/bugpoint/BugDriver.cpp
index 37bdb7bc96b6..3832e075a693 100644
--- a/contrib/llvm/tools/bugpoint/BugDriver.cpp
+++ b/contrib/llvm/tools/bugpoint/BugDriver.cpp
@@ -55,12 +55,11 @@ cl::opt<std::string> OutputFile("output",
"(for miscompilation detection)"));
}
-/// setNewProgram - If we reduce or update the program somehow, call this method
-/// to update bugdriver with it. This deletes the old module and sets the
-/// specified one as the current program.
-void BugDriver::setNewProgram(Module *M) {
- delete Program;
- Program = M;
+/// If we reduce or update the program somehow, call this method to update
+/// bugdriver with it. This deletes the old module and sets the specified one
+/// as the current program.
+void BugDriver::setNewProgram(std::unique_ptr<Module> M) {
+ Program = std::move(M);
}
/// getPassesString - Turn a list of passes into a string which indicates the
@@ -85,7 +84,6 @@ BugDriver::BugDriver(const char *toolname, bool find_bugs, unsigned timeout,
MemoryLimit(memlimit), UseValgrind(use_valgrind) {}
BugDriver::~BugDriver() {
- delete Program;
if (Interpreter != SafeInterpreter)
delete Interpreter;
delete SafeInterpreter;
@@ -121,6 +119,12 @@ std::unique_ptr<Module> llvm::parseInputFile(StringRef Filename,
return Result;
}
+std::unique_ptr<Module> BugDriver::swapProgramIn(std::unique_ptr<Module> M) {
+ std::unique_ptr<Module> OldProgram = std::move(Program);
+ Program = std::move(M);
+ return OldProgram;
+}
+
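
swapProgramIn mirrors setNewProgram but hands the previous module back to the caller; the reducers later in this patch use the pair roughly like this (a condensed sketch of the pattern, not a verbatim excerpt):

    // Swap a cloned module in, test it, then restore the original.
    // Ownership is explicit throughout; no manual delete is needed.
    ValueToValueMapTy VMap;
    std::unique_ptr<Module> Clone = CloneModule(BD.getProgram(), VMap);
    std::unique_ptr<Module> Orig = BD.swapProgramIn(std::move(Clone));
    // ... run the test against BD.getProgram() ...
    BD.setNewProgram(std::move(Orig)); // restore; the clone is destroyed here
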
// This method takes the specified list of LLVM input files, attempts to load
// them, either as assembly or bitcode, then link them together. It returns
// true on failure (if, for example, an input bitcode file could not be
@@ -131,7 +135,7 @@ bool BugDriver::addSources(const std::vector<std::string> &Filenames) {
assert(!Filenames.empty() && "Must specify at least one input filename!");
// Load the first input file.
- Program = parseInputFile(Filenames[0], Context).release();
+ Program = parseInputFile(Filenames[0], Context);
if (!Program)
return true;
@@ -172,7 +176,7 @@ Error BugDriver::run() {
// miscompilation.
if (!PassesToRun.empty()) {
outs() << "Running selected passes on program to test for crash: ";
- if (runPasses(Program, PassesToRun))
+ if (runPasses(*Program, PassesToRun))
return debugOptimizerCrash();
}
@@ -182,7 +186,7 @@ Error BugDriver::run() {
// Test to see if we have a code generator crash.
outs() << "Running the code generator to test for a crash: ";
- if (Error E = compileProgram(Program)) {
+ if (Error E = compileProgram(*Program)) {
outs() << toString(std::move(E));
return debugCodeGeneratorCrash();
}
@@ -195,7 +199,7 @@ Error BugDriver::run() {
bool CreatedOutput = false;
if (ReferenceOutputFile.empty()) {
outs() << "Generating reference output from raw program: ";
- if (Error E = createReferenceFile(Program)) {
+ if (Error E = createReferenceFile(*Program)) {
errs() << toString(std::move(E));
return debugCodeGeneratorCrash();
}
@@ -211,7 +215,7 @@ Error BugDriver::run() {
// matches, then we assume there is a miscompilation bug and try to
// diagnose it.
outs() << "*** Checking the code generator...\n";
- Expected<bool> Diff = diffProgram(Program, "", "", false);
+ Expected<bool> Diff = diffProgram(*Program, "", "", false);
if (Error E = Diff.takeError()) {
errs() << toString(std::move(E));
return debugCodeGeneratorCrash();
diff --git a/contrib/llvm/tools/bugpoint/BugDriver.h b/contrib/llvm/tools/bugpoint/BugDriver.h
index 0e6a9b4f2f38..bc60ae753548 100644
--- a/contrib/llvm/tools/bugpoint/BugDriver.h
+++ b/contrib/llvm/tools/bugpoint/BugDriver.h
@@ -50,7 +50,7 @@ class BugDriver {
LLVMContext &Context;
const char *ToolName; // argv[0] of bugpoint
std::string ReferenceOutputFile; // Name of `good' output file
- Module *Program; // The raw program, linked together
+ std::unique_ptr<Module> Program; // The raw program, linked together
std::vector<std::string> PassesToRun;
AbstractInterpreter *Interpreter; // How to run the program
AbstractInterpreter *SafeInterpreter; // To generate reference output, etc.
@@ -128,15 +128,10 @@ public:
///
bool isExecutingJIT();
- Module *getProgram() const { return Program; }
+ Module &getProgram() const { return *Program; }
- /// swapProgramIn - Set the current module to the specified module, returning
- /// the old one.
- Module *swapProgramIn(Module *M) {
- Module *OldProgram = Program;
- Program = M;
- return OldProgram;
- }
+ /// Set the current module to the specified module, returning the old one.
+ std::unique_ptr<Module> swapProgramIn(std::unique_ptr<Module> M);
AbstractInterpreter *switchToSafeInterpreter() {
AbstractInterpreter *Old = Interpreter;
@@ -146,55 +141,47 @@ public:
void switchToInterpreter(AbstractInterpreter *AI) { Interpreter = AI; }
- /// setNewProgram - If we reduce or update the program somehow, call this
- /// method to update bugdriver with it. This deletes the old module and sets
- /// the specified one as the current program.
- void setNewProgram(Module *M);
+ /// If we reduce or update the program somehow, call this method to update
+ /// bugdriver with it. This deletes the old module and sets the specified one
+ /// as the current program.
+ void setNewProgram(std::unique_ptr<Module> M);
/// Try to compile the specified module. This is used for code generation
/// crash testing.
- Error compileProgram(Module *M) const;
+ Error compileProgram(Module &M) const;
- /// executeProgram - This method runs "Program", capturing the output of the
- /// program to a file. A recommended filename may be optionally specified.
- ///
- Expected<std::string> executeProgram(const Module *Program,
+ /// This method runs "Program", capturing the output of the program to a file.
+ /// A recommended filename may be optionally specified.
+ Expected<std::string> executeProgram(const Module &Program,
std::string OutputFilename,
std::string Bitcode,
const std::string &SharedObjects,
AbstractInterpreter *AI) const;
- /// executeProgramSafely - Used to create reference output with the "safe"
- /// backend, if reference output is not provided. If there is a problem with
- /// the code generator (e.g., llc crashes), this will return false and set
- /// Error.
- ///
+ /// Used to create reference output with the "safe" backend, if reference
+ /// output is not provided. If there is a problem with the code generator
+ /// (e.g., llc crashes), this will return false and set Error.
Expected<std::string>
- executeProgramSafely(const Module *Program,
+ executeProgramSafely(const Module &Program,
const std::string &OutputFile) const;
- /// createReferenceFile - calls compileProgram and then records the output
- /// into ReferenceOutputFile. Returns true if reference file created, false
- /// otherwise. Note: initializeExecutionEnvironment should be called BEFORE
- /// this function.
- ///
- Error createReferenceFile(Module *M, const std::string &Filename =
+ /// Calls compileProgram and then records the output into ReferenceOutputFile.
+ /// Returns true if reference file created, false otherwise. Note:
+ /// initializeExecutionEnvironment should be called BEFORE this function.
+ Error createReferenceFile(Module &M, const std::string &Filename =
"bugpoint.reference.out-%%%%%%%");
- /// diffProgram - This method executes the specified module and diffs the
- /// output against the file specified by ReferenceOutputFile. If the output
- /// is different, 1 is returned. If there is a problem with the code
- /// generator (e.g., llc crashes), this will return -1 and set Error.
- ///
- Expected<bool> diffProgram(const Module *Program,
+ /// This method executes the specified module and diffs the output against the
+ /// file specified by ReferenceOutputFile. If the output is different, 1 is
+ /// returned. If there is a problem with the code generator (e.g., llc
+ /// crashes), this will return -1 and set Error.
+ Expected<bool> diffProgram(const Module &Program,
const std::string &BitcodeFile = "",
const std::string &SharedObj = "",
bool RemoveBitcode = false) const;
- /// EmitProgressBitcode - This function is used to output M to a file named
- /// "bugpoint-ID.bc".
- ///
- void EmitProgressBitcode(const Module *M, const std::string &ID,
+ /// This function is used to output M to a file named "bugpoint-ID.bc".
+ void EmitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer = false) const;
/// This method clones the current Program and deletes the specified
@@ -210,7 +197,7 @@ public:
/// MayModifySemantics argument is true, then the cleanups is allowed to
/// modify how the code behaves.
///
- std::unique_ptr<Module> performFinalCleanups(Module *M,
+ std::unique_ptr<Module> performFinalCleanups(std::unique_ptr<Module> M,
bool MayModifySemantics = false);
/// Given a module, extract up to one loop from it into a new function. This
@@ -243,7 +230,7 @@ public:
/// or failed, unless Quiet is set. ExtraArgs specifies additional arguments
/// to pass to the child bugpoint instance.
///
- bool runPasses(Module *Program, const std::vector<std::string> &PassesToRun,
+ bool runPasses(Module &Program, const std::vector<std::string> &PassesToRun,
std::string &OutputFilename, bool DeleteOutput = false,
bool Quiet = false, unsigned NumExtraArgs = 0,
const char *const *ExtraArgs = nullptr) const;
@@ -252,7 +239,7 @@ public:
/// false indicating whether or not the optimizer crashed on the specified
/// input (true = crashed). Does not produce any output.
///
- bool runPasses(Module *M, const std::vector<std::string> &PassesToRun) const {
+ bool runPasses(Module &M, const std::vector<std::string> &PassesToRun) const {
std::string Filename;
return runPasses(M, PassesToRun, Filename, true);
}
@@ -265,13 +252,12 @@ public:
/// failure.
Error runManyPasses(const std::vector<std::string> &AllPasses);
- /// writeProgramToFile - This writes the current "Program" to the named
- /// bitcode file. If an error occurs, true is returned.
- ///
- bool writeProgramToFile(const std::string &Filename, const Module *M) const;
+ /// This writes the current "Program" to the named bitcode file. If an error
+ /// occurs, true is returned.
+ bool writeProgramToFile(const std::string &Filename, const Module &M) const;
bool writeProgramToFile(const std::string &Filename, int FD,
- const Module *M) const;
- bool writeProgramToFile(int FD, const Module *M) const;
+ const Module &M) const;
+ bool writeProgramToFile(int FD, const Module &M) const;
private:
/// initializeExecutionEnvironment - This method is used to set up the
diff --git a/contrib/llvm/tools/bugpoint/CrashDebugger.cpp b/contrib/llvm/tools/bugpoint/CrashDebugger.cpp
index 9097917d5fef..a5b31e1ab321 100644
--- a/contrib/llvm/tools/bugpoint/CrashDebugger.cpp
+++ b/contrib/llvm/tools/bugpoint/CrashDebugger.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -32,7 +33,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <set>
using namespace llvm;
@@ -85,16 +85,16 @@ Expected<ReducePassList::TestResult>
ReducePassList::doTest(std::vector<std::string> &Prefix,
std::vector<std::string> &Suffix) {
std::string PrefixOutput;
- Module *OrigProgram = nullptr;
+ std::unique_ptr<Module> OrigProgram;
if (!Prefix.empty()) {
outs() << "Checking to see if these passes crash: "
<< getPassesString(Prefix) << ": ";
if (BD.runPasses(BD.getProgram(), Prefix, PrefixOutput))
return KeepPrefix;
- OrigProgram = BD.Program;
+ OrigProgram = std::move(BD.Program);
- BD.Program = parseInputFile(PrefixOutput, BD.getContext()).release();
+ BD.Program = parseInputFile(PrefixOutput, BD.getContext());
if (BD.Program == nullptr) {
errs() << BD.getToolName() << ": Error reading bitcode file '"
<< PrefixOutput << "'!\n";
@@ -106,31 +106,27 @@ ReducePassList::doTest(std::vector<std::string> &Prefix,
outs() << "Checking to see if these passes crash: " << getPassesString(Suffix)
<< ": ";
- if (BD.runPasses(BD.getProgram(), Suffix)) {
- delete OrigProgram; // The suffix crashes alone...
- return KeepSuffix;
- }
+ if (BD.runPasses(BD.getProgram(), Suffix))
+ return KeepSuffix; // The suffix crashes alone...
// Nothing failed, restore state...
- if (OrigProgram) {
- delete BD.Program;
- BD.Program = OrigProgram;
- }
+ if (OrigProgram)
+ BD.Program = std::move(OrigProgram);
return NoFailure;
}
+using BugTester = bool (*)(const BugDriver &, Module *);
+
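
The BugTester alias introduced above names the predicate type that every reducer in this file takes; TestForOptimizerCrash near the end of the file is one such predicate, and a reducer is driven with it roughly as follows (illustrative):

    // One BugTester predicate (its updated form appears later in this diff):
    static bool TestForOptimizerCrash(const BugDriver &BD, Module *M) {
      return BD.runPasses(*M, BD.getPassesToRun());
    }
    // A reducer then combines the predicate with a work list, e.g.:
    //   ReduceCrashingFunctions(BD, TestForOptimizerCrash).reduceList(Functions);
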
namespace {
-/// ReduceCrashingGlobalVariables - This works by removing the global
-/// variable's initializer and seeing if the program still crashes. If it
-/// does, then we keep that program and try again.
-///
-class ReduceCrashingGlobalVariables : public ListReducer<GlobalVariable *> {
+/// ReduceCrashingGlobalInitializers - This works by removing global variable
+/// initializers and seeing if the program still crashes. If it does, then we
+/// keep that program and try again.
+class ReduceCrashingGlobalInitializers : public ListReducer<GlobalVariable *> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
public:
- ReduceCrashingGlobalVariables(BugDriver &bd,
- bool (*testFn)(const BugDriver &, Module *))
+ ReduceCrashingGlobalInitializers(BugDriver &bd, BugTester testFn)
: BD(bd), TestFn(testFn) {}
Expected<TestResult> doTest(std::vector<GlobalVariable *> &Prefix,
@@ -146,11 +142,11 @@ public:
};
}
-bool ReduceCrashingGlobalVariables::TestGlobalVariables(
+bool ReduceCrashingGlobalInitializers::TestGlobalVariables(
std::vector<GlobalVariable *> &GVs) {
// Clone the program to try hacking it apart...
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
// Convert list to set for fast lookup...
std::set<GlobalVariable *> GVSet;
@@ -175,8 +171,8 @@ bool ReduceCrashingGlobalVariables::TestGlobalVariables(
}
// Try running the hacked up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ if (TestFn(BD, M.get())) {
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
// Make sure to use global variable pointers that point into the now-current
// module.
@@ -184,7 +180,6 @@ bool ReduceCrashingGlobalVariables::TestGlobalVariables(
return true;
}
- delete M;
return false;
}
@@ -195,11 +190,10 @@ namespace {
///
class ReduceCrashingFunctions : public ListReducer<Function *> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
public:
- ReduceCrashingFunctions(BugDriver &bd,
- bool (*testFn)(const BugDriver &, Module *))
+ ReduceCrashingFunctions(BugDriver &bd, BugTester testFn)
: BD(bd), TestFn(testFn) {}
Expected<TestResult> doTest(std::vector<Function *> &Prefix,
@@ -241,12 +235,12 @@ static void RemoveFunctionReferences(Module *M, const char *Name) {
bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
// If main isn't present, claim there is no problem.
- if (KeepMain && !is_contained(Funcs, BD.getProgram()->getFunction("main")))
+ if (KeepMain && !is_contained(Funcs, BD.getProgram().getFunction("main")))
return false;
// Clone the program to try hacking it apart...
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
// Convert list to set for fast lookup...
std::set<Function *> Functions;
@@ -305,19 +299,18 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
}
// Finally, remove any null members from any global intrinsic.
- RemoveFunctionReferences(M, "llvm.used");
- RemoveFunctionReferences(M, "llvm.compiler.used");
+ RemoveFunctionReferences(M.get(), "llvm.used");
+ RemoveFunctionReferences(M.get(), "llvm.compiler.used");
}
// Try running the hacked up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ if (TestFn(BD, M.get())) {
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
// Make sure to use function pointers that point into the now-current
// module.
Funcs.assign(Functions.begin(), Functions.end());
return true;
}
- delete M;
return false;
}
@@ -368,11 +361,10 @@ void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
///
class ReduceCrashingBlocks : public ListReducer<const BasicBlock *> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
public:
- ReduceCrashingBlocks(BugDriver &BD,
- bool (*testFn)(const BugDriver &, Module *))
+ ReduceCrashingBlocks(BugDriver &BD, BugTester testFn)
: BD(BD), TestFn(testFn) {}
Expected<TestResult> doTest(std::vector<const BasicBlock *> &Prefix,
@@ -391,7 +383,7 @@ public:
bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
// Convert list to set for fast lookup...
SmallPtrSet<BasicBlock *, 8> Blocks;
@@ -409,31 +401,32 @@ bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
outs() << ": ";
// Loop over and hack up any blocks that are not listed...
- for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
- for (Function::iterator BB = I->begin(), E = I->end(); BB != E; ++BB)
- if (!Blocks.count(&*BB) && BB->getTerminator()->getNumSuccessors()) {
+ for (Function &F : M->functions()) {
+ for (BasicBlock &BB : F) {
+ if (!Blocks.count(&BB) && BB.getTerminator()->getNumSuccessors()) {
// Loop over all of the successors of this block, deleting any PHI nodes
// that might include it.
- for (succ_iterator SI = succ_begin(&*BB), E = succ_end(&*BB); SI != E;
- ++SI)
- (*SI)->removePredecessor(&*BB);
+ for (BasicBlock *Succ : successors(&BB))
+ Succ->removePredecessor(&BB);
- TerminatorInst *BBTerm = BB->getTerminator();
+ TerminatorInst *BBTerm = BB.getTerminator();
if (BBTerm->isEHPad() || BBTerm->getType()->isTokenTy())
continue;
if (!BBTerm->getType()->isVoidTy())
BBTerm->replaceAllUsesWith(Constant::getNullValue(BBTerm->getType()));
// Replace the old terminator instruction.
- BB->getInstList().pop_back();
- new UnreachableInst(BB->getContext(), &*BB);
+ BB.getInstList().pop_back();
+ new UnreachableInst(BB.getContext(), &BB);
}
+ }
+ }
// The CFG Simplifier pass may delete one of the basic blocks we are
// interested in. If it does we need to take the block out of the list. Make
// a "persistent mapping" by turning basic blocks into <function, name> pairs.
// This won't work well if blocks are unnamed, but that is just the risk we
- // have to take.
+ // have to take. FIXME: Can we just name the blocks?
std::vector<std::pair<std::string, std::string>> BlockInfo;
for (BasicBlock *BB : Blocks)
@@ -450,31 +443,30 @@ bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Verify we didn't break anything
std::vector<std::string> Passes;
Passes.push_back("verify");
- std::unique_ptr<Module> New = BD.runPassesOn(M, Passes);
- delete M;
+ std::unique_ptr<Module> New = BD.runPassesOn(M.get(), Passes);
if (!New) {
errs() << "verify failed!\n";
exit(1);
}
- M = New.release();
+ M = std::move(New);
// Try running on the hacked up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ if (TestFn(BD, M.get())) {
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
// Make sure to use basic block pointers that point into the now-current
// module, and that they don't include any deleted blocks.
BBs.clear();
- const ValueSymbolTable &GST = M->getValueSymbolTable();
- for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {
- Function *F = cast<Function>(GST.lookup(BlockInfo[i].first));
- Value *V = F->getValueSymbolTable()->lookup(BlockInfo[i].second);
+ const ValueSymbolTable &GST = BD.getProgram().getValueSymbolTable();
+ for (const auto &BI : BlockInfo) {
+ Function *F = cast<Function>(GST.lookup(BI.first));
+ Value *V = F->getValueSymbolTable()->lookup(BI.second);
if (V && V->getType() == Type::getLabelTy(V->getContext()))
BBs.push_back(cast<BasicBlock>(V));
}
return true;
}
- delete M; // It didn't crash, try something else.
+ // It didn't crash, try something else.
return false;
}
@@ -486,13 +478,11 @@ namespace {
///
class ReduceCrashingConditionals : public ListReducer<const BasicBlock *> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
bool Direction;
public:
- ReduceCrashingConditionals(BugDriver &bd,
- bool (*testFn)(const BugDriver &, Module *),
- bool Direction)
+ ReduceCrashingConditionals(BugDriver &bd, BugTester testFn, bool Direction)
: BD(bd), TestFn(testFn), Direction(Direction) {}
Expected<TestResult> doTest(std::vector<const BasicBlock *> &Prefix,
@@ -512,7 +502,7 @@ bool ReduceCrashingConditionals::TestBlocks(
std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
// Convert list to set for fast lookup...
SmallPtrSet<const BasicBlock *, 8> Blocks;
@@ -560,22 +550,21 @@ bool ReduceCrashingConditionals::TestBlocks(
// Verify we didn't break anything
std::vector<std::string> Passes;
Passes.push_back("verify");
- std::unique_ptr<Module> New = BD.runPassesOn(M, Passes);
- delete M;
+ std::unique_ptr<Module> New = BD.runPassesOn(M.get(), Passes);
if (!New) {
errs() << "verify failed!\n";
exit(1);
}
- M = New.release();
+ M = std::move(New);
// Try running on the hacked up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ if (TestFn(BD, M.get())) {
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
// Make sure to use basic block pointers that point into the now-current
// module, and that they don't include any deleted blocks.
BBs.clear();
- const ValueSymbolTable &GST = M->getValueSymbolTable();
+ const ValueSymbolTable &GST = BD.getProgram().getValueSymbolTable();
for (auto &BI : BlockInfo) {
auto *F = cast<Function>(GST.lookup(BI.first));
Value *V = F->getValueSymbolTable()->lookup(BI.second);
@@ -584,7 +573,7 @@ bool ReduceCrashingConditionals::TestBlocks(
}
return true;
}
- delete M; // It didn't crash, try something else.
+ // It didn't crash, try something else.
return false;
}
@@ -594,12 +583,12 @@ namespace {
class ReduceSimplifyCFG : public ListReducer<const BasicBlock *> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
TargetTransformInfo TTI;
public:
- ReduceSimplifyCFG(BugDriver &bd, bool (*testFn)(const BugDriver &, Module *))
- : BD(bd), TestFn(testFn), TTI(bd.getProgram()->getDataLayout()) {}
+ ReduceSimplifyCFG(BugDriver &bd, BugTester testFn)
+ : BD(bd), TestFn(testFn), TTI(bd.getProgram().getDataLayout()) {}
Expected<TestResult> doTest(std::vector<const BasicBlock *> &Prefix,
std::vector<const BasicBlock *> &Kept) override {
@@ -617,7 +606,7 @@ public:
bool ReduceSimplifyCFG::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
// Convert list to set for fast lookup...
SmallPtrSet<const BasicBlock *, 8> Blocks;
@@ -653,22 +642,21 @@ bool ReduceSimplifyCFG::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Verify we didn't break anything
std::vector<std::string> Passes;
Passes.push_back("verify");
- std::unique_ptr<Module> New = BD.runPassesOn(M, Passes);
- delete M;
+ std::unique_ptr<Module> New = BD.runPassesOn(M.get(), Passes);
if (!New) {
errs() << "verify failed!\n";
exit(1);
}
- M = New.release();
+ M = std::move(New);
// Try running on the hacked up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ if (TestFn(BD, M.get())) {
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
// Make sure to use basic block pointers that point into the now-current
// module, and that they don't include any deleted blocks.
BBs.clear();
- const ValueSymbolTable &GST = M->getValueSymbolTable();
+ const ValueSymbolTable &GST = BD.getProgram().getValueSymbolTable();
for (auto &BI : BlockInfo) {
auto *F = cast<Function>(GST.lookup(BI.first));
Value *V = F->getValueSymbolTable()->lookup(BI.second);
@@ -677,7 +665,7 @@ bool ReduceSimplifyCFG::TestBlocks(std::vector<const BasicBlock *> &BBs) {
}
return true;
}
- delete M; // It didn't crash, try something else.
+ // It didn't crash, try something else.
return false;
}
@@ -687,11 +675,10 @@ namespace {
///
class ReduceCrashingInstructions : public ListReducer<const Instruction *> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
public:
- ReduceCrashingInstructions(BugDriver &bd,
- bool (*testFn)(const BugDriver &, Module *))
+ ReduceCrashingInstructions(BugDriver &bd, BugTester testFn)
: BD(bd), TestFn(testFn) {}
Expected<TestResult> doTest(std::vector<const Instruction *> &Prefix,
@@ -711,7 +698,7 @@ bool ReduceCrashingInstructions::TestInsts(
std::vector<const Instruction *> &Insts) {
// Clone the program to try hacking it apart...
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
// Convert list to set for fast lookup...
SmallPtrSet<Instruction *, 32> Instructions;
@@ -745,8 +732,8 @@ bool ReduceCrashingInstructions::TestInsts(
Passes.run(*M);
// Try running on the hacked up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ if (TestFn(BD, M.get())) {
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
// Make sure to use instruction pointers that point into the now-current
// module, and that they don't include any deleted blocks.
@@ -755,7 +742,7 @@ bool ReduceCrashingInstructions::TestInsts(
Insts.push_back(Inst);
return true;
}
- delete M; // It didn't crash, try something else.
+ // It didn't crash, try something else.
return false;
}
@@ -764,11 +751,10 @@ namespace {
// names to avoid having to convert back and forth every time.
class ReduceCrashingNamedMD : public ListReducer<std::string> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
public:
- ReduceCrashingNamedMD(BugDriver &bd,
- bool (*testFn)(const BugDriver &, Module *))
+ ReduceCrashingNamedMD(BugDriver &bd, BugTester testFn)
: BD(bd), TestFn(testFn) {}
Expected<TestResult> doTest(std::vector<std::string> &Prefix,
@@ -787,7 +773,7 @@ public:
bool ReduceCrashingNamedMD::TestNamedMDs(std::vector<std::string> &NamedMDs) {
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
outs() << "Checking for crash with only these named metadata nodes:";
unsigned NumPrint = std::min<size_t>(NamedMDs.size(), 10);
@@ -821,11 +807,10 @@ bool ReduceCrashingNamedMD::TestNamedMDs(std::vector<std::string> &NamedMDs) {
Passes.run(*M);
// Try running on the hacked up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ if (TestFn(BD, M.get())) {
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
return true;
}
- delete M; // It didn't crash, try something else.
return false;
}
@@ -833,11 +818,10 @@ namespace {
// Reduce the list of operands to named metadata nodes
class ReduceCrashingNamedMDOps : public ListReducer<const MDNode *> {
BugDriver &BD;
- bool (*TestFn)(const BugDriver &, Module *);
+ BugTester TestFn;
public:
- ReduceCrashingNamedMDOps(BugDriver &bd,
- bool (*testFn)(const BugDriver &, Module *))
+ ReduceCrashingNamedMDOps(BugDriver &bd, BugTester testFn)
: BD(bd), TestFn(testFn) {}
Expected<TestResult> doTest(std::vector<const MDNode *> &Prefix,
@@ -868,11 +852,11 @@ bool ReduceCrashingNamedMDOps::TestNamedMDOps(
outs() << " named metadata operands: ";
ValueToValueMapTy VMap;
- Module *M = CloneModule(BD.getProgram(), VMap).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram(), VMap);
// This is a little wasteful. In the future it might be good if we could have
// these dropped during cloning.
- for (auto &NamedMD : BD.getProgram()->named_metadata()) {
+ for (auto &NamedMD : BD.getProgram().named_metadata()) {
// Drop the old one and create a new one
M->eraseNamedMetadata(M->getNamedMetadata(NamedMD.getName()));
NamedMDNode *NewNamedMDNode =
@@ -888,85 +872,82 @@ bool ReduceCrashingNamedMDOps::TestNamedMDOps(
Passes.run(*M);
// Try running on the hacked up program...
- if (TestFn(BD, M)) {
+ if (TestFn(BD, M.get())) {
// Make sure to use instruction pointers that point into the now-current
// module, and that they don't include any deleted blocks.
NamedMDOps.clear();
for (const MDNode *Node : OldMDNodeOps)
NamedMDOps.push_back(cast<MDNode>(*VMap.getMappedMD(Node)));
- BD.setNewProgram(M); // It crashed, keep the trimmed version...
+ BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
return true;
}
- delete M; // It didn't crash, try something else.
+ // It didn't crash, try something else.
return false;
}
-static Error ReduceGlobalInitializers(BugDriver &BD,
- bool (*TestFn)(const BugDriver &,
- Module *)) {
- if (BD.getProgram()->global_begin() != BD.getProgram()->global_end()) {
- // Now try to reduce the number of global variable initializers in the
- // module to something small.
- Module *M = CloneModule(BD.getProgram()).release();
- bool DeletedInit = false;
-
- for (Module::global_iterator I = M->global_begin(), E = M->global_end();
- I != E; ++I)
- if (I->hasInitializer()) {
- DeleteGlobalInitializer(&*I);
- I->setLinkage(GlobalValue::ExternalLinkage);
- I->setComdat(nullptr);
- DeletedInit = true;
- }
-
- if (!DeletedInit) {
- delete M; // No change made...
- } else {
- // See if the program still causes a crash...
- outs() << "\nChecking to see if we can delete global inits: ";
-
- if (TestFn(BD, M)) { // Still crashes?
- BD.setNewProgram(M);
- outs() << "\n*** Able to remove all global initializers!\n";
- } else { // No longer crashes?
- outs() << " - Removing all global inits hides problem!\n";
- delete M;
-
- std::vector<GlobalVariable *> GVs;
-
- for (Module::global_iterator I = BD.getProgram()->global_begin(),
- E = BD.getProgram()->global_end();
- I != E; ++I)
- if (I->hasInitializer())
- GVs.push_back(&*I);
-
- if (GVs.size() > 1 && !BugpointIsInterrupted) {
- outs() << "\n*** Attempting to reduce the number of global "
- << "variables in the testcase\n";
-
- unsigned OldSize = GVs.size();
- Expected<bool> Result =
- ReduceCrashingGlobalVariables(BD, TestFn).reduceList(GVs);
- if (Error E = Result.takeError())
- return E;
-
- if (GVs.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-global-variables");
- }
- }
+/// Attempt to eliminate as many global initializers as possible.
+static Error ReduceGlobalInitializers(BugDriver &BD, BugTester TestFn) {
+ Module &OrigM = BD.getProgram();
+ if (OrigM.global_empty())
+ return Error::success();
+
+ // Now try to reduce the number of global variable initializers in the
+ // module to something small.
+ std::unique_ptr<Module> M = CloneModule(OrigM);
+ bool DeletedInit = false;
+
+ for (GlobalVariable &GV : M->globals()) {
+ if (GV.hasInitializer()) {
+ DeleteGlobalInitializer(&GV);
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ GV.setComdat(nullptr);
+ DeletedInit = true;
}
}
+
+ if (!DeletedInit)
+ return Error::success();
+
+ // See if the program still causes a crash...
+ outs() << "\nChecking to see if we can delete global inits: ";
+
+ if (TestFn(BD, M.get())) { // Still crashes?
+ BD.setNewProgram(std::move(M));
+ outs() << "\n*** Able to remove all global initializers!\n";
+ return Error::success();
+ }
+
+ // No longer crashes.
+ outs() << " - Removing all global inits hides problem!\n";
+
+ std::vector<GlobalVariable *> GVs;
+ for (GlobalVariable &GV : OrigM.globals())
+ if (GV.hasInitializer())
+ GVs.push_back(&GV);
+
+ if (GVs.size() > 1 && !BugpointIsInterrupted) {
+ outs() << "\n*** Attempting to reduce the number of global initializers "
+ << "in the testcase\n";
+
+ unsigned OldSize = GVs.size();
+ Expected<bool> Result =
+ ReduceCrashingGlobalInitializers(BD, TestFn).reduceList(GVs);
+ if (Error E = Result.takeError())
+ return E;
+
+ if (GVs.size() < OldSize)
+ BD.EmitProgressBitcode(BD.getProgram(), "reduced-global-variables");
+ }
return Error::success();
}
-static Error ReduceInsts(BugDriver &BD,
- bool (*TestFn)(const BugDriver &, Module *)) {
+static Error ReduceInsts(BugDriver &BD, BugTester TestFn) {
// Attempt to delete instructions using bisection. This should help out nasty
// cases with large basic blocks where the problem is at one end.
if (!BugpointIsInterrupted) {
std::vector<const Instruction *> Insts;
- for (const Function &F : *BD.getProgram())
+ for (const Function &F : BD.getProgram())
for (const BasicBlock &BB : F)
for (const Instruction &I : BB)
if (!isa<TerminatorInst>(&I))
@@ -1001,8 +982,8 @@ static Error ReduceInsts(BugDriver &BD,
// Loop over all of the (non-terminator) instructions remaining in the
// function, attempting to delete them.
unsigned CurInstructionNum = 0;
- for (Module::const_iterator FI = BD.getProgram()->begin(),
- E = BD.getProgram()->end();
+ for (Module::const_iterator FI = BD.getProgram().begin(),
+ E = BD.getProgram().end();
FI != E; ++FI)
if (!FI->isDeclaration())
for (Function::const_iterator BI = FI->begin(), E = FI->end(); BI != E;
@@ -1028,7 +1009,7 @@ static Error ReduceInsts(BugDriver &BD,
if (TestFn(BD, M.get())) {
// Yup, it does, we delete the old module, and continue trying
// to reduce the testcase...
- BD.setNewProgram(M.release());
+ BD.setNewProgram(std::move(M));
InstructionsToSkipBeforeDeleting = CurInstructionNum;
goto TryAgain; // I wish I had a multi-level break here!
}
@@ -1048,8 +1029,7 @@ static Error ReduceInsts(BugDriver &BD,
/// DebugACrash - Given a predicate that determines whether a component crashes
/// on a program, try to destructively reduce the program while still keeping
/// the predicate true.
-static Error DebugACrash(BugDriver &BD,
- bool (*TestFn)(const BugDriver &, Module *)) {
+static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
// See if we can get away with nuking some of the global variable initializers
// in the program...
if (!NoGlobalRM)
@@ -1058,7 +1038,7 @@ static Error DebugACrash(BugDriver &BD,
// Now try to reduce the number of functions in the module to something small.
std::vector<Function *> Functions;
- for (Function &F : *BD.getProgram())
+ for (Function &F : BD.getProgram())
if (!F.isDeclaration())
Functions.push_back(&F);
@@ -1080,7 +1060,7 @@ static Error DebugACrash(BugDriver &BD,
// eliminate blocks.
if (!DisableSimplifyCFG && !BugpointIsInterrupted) {
std::vector<const BasicBlock *> Blocks;
- for (Function &F : *BD.getProgram())
+ for (Function &F : BD.getProgram())
for (BasicBlock &BB : F)
Blocks.push_back(&BB);
unsigned OldSize = Blocks.size();
@@ -1102,7 +1082,7 @@ static Error DebugACrash(BugDriver &BD,
//
if (!DisableSimplifyCFG && !BugpointIsInterrupted) {
std::vector<const BasicBlock *> Blocks;
- for (Function &F : *BD.getProgram())
+ for (Function &F : BD.getProgram())
for (BasicBlock &BB : F)
Blocks.push_back(&BB);
unsigned OldSize = Blocks.size();
@@ -1115,7 +1095,7 @@ static Error DebugACrash(BugDriver &BD,
if (!DisableSimplifyCFG && !BugpointIsInterrupted) {
std::vector<const BasicBlock *> Blocks;
- for (Function &F : *BD.getProgram())
+ for (Function &F : BD.getProgram())
for (BasicBlock &BB : F)
Blocks.push_back(&BB);
unsigned OldSize = Blocks.size();
@@ -1137,7 +1117,7 @@ static Error DebugACrash(BugDriver &BD,
std::unique_ptr<Module> M = CloneModule(BD.getProgram());
strip(*M);
if (TestFn(BD, M.get()))
- BD.setNewProgram(M.release());
+ BD.setNewProgram(std::move(M));
};
if (!NoStripDebugInfo && !BugpointIsInterrupted) {
outs() << "\n*** Attempting to strip the debug info: ";
@@ -1154,7 +1134,7 @@ static Error DebugACrash(BugDriver &BD,
// by dropping global named metadata that anchors them
outs() << "\n*** Attempting to remove named metadata: ";
std::vector<std::string> NamedMDNames;
- for (auto &NamedMD : BD.getProgram()->named_metadata())
+ for (auto &NamedMD : BD.getProgram().named_metadata())
NamedMDNames.push_back(NamedMD.getName().str());
Expected<bool> Result =
ReduceCrashingNamedMD(BD, TestFn).reduceList(NamedMDNames);
@@ -1166,7 +1146,7 @@ static Error DebugACrash(BugDriver &BD,
// Now that we quickly dropped all the named metadata that doesn't
// contribute to the crash, bisect the operands of the remaining ones
std::vector<const MDNode *> NamedMDOps;
- for (auto &NamedMD : BD.getProgram()->named_metadata())
+ for (auto &NamedMD : BD.getProgram().named_metadata())
for (auto op : NamedMD.operands())
NamedMDOps.push_back(op);
Expected<bool> Result =
@@ -1180,15 +1160,13 @@ static Error DebugACrash(BugDriver &BD,
// Try to clean up the testcase by running funcresolve and globaldce...
if (!BugpointIsInterrupted) {
outs() << "\n*** Attempting to perform final cleanups: ";
- Module *M = CloneModule(BD.getProgram()).release();
- M = BD.performFinalCleanups(M, true).release();
+ std::unique_ptr<Module> M = CloneModule(BD.getProgram());
+ M = BD.performFinalCleanups(std::move(M), true);
// Find out if the pass still crashes on the cleaned up program...
- if (TestFn(BD, M)) {
- BD.setNewProgram(M); // Yup, it does, keep the reduced version...
- } else {
- delete M;
- }
+ if (M && TestFn(BD, M.get()))
+ BD.setNewProgram(
+ std::move(M)); // Yup, it does, keep the reduced version...
}
BD.EmitProgressBitcode(BD.getProgram(), "reduced-simplified");
@@ -1197,7 +1175,7 @@ static Error DebugACrash(BugDriver &BD,
}
static bool TestForOptimizerCrash(const BugDriver &BD, Module *M) {
- return BD.runPasses(M, BD.getPassesToRun());
+ return BD.runPasses(*M, BD.getPassesToRun());
}
/// debugOptimizerCrash - This method is called when some pass crashes on input.
@@ -1218,13 +1196,13 @@ Error BugDriver::debugOptimizerCrash(const std::string &ID) {
<< (PassesToRun.size() == 1 ? ": " : "es: ")
<< getPassesString(PassesToRun) << '\n';
- EmitProgressBitcode(Program, ID);
+ EmitProgressBitcode(*Program, ID);
return DebugACrash(*this, TestForOptimizerCrash);
}
static bool TestForCodeGenCrash(const BugDriver &BD, Module *M) {
- if (Error E = BD.compileProgram(M)) {
+ if (Error E = BD.compileProgram(*M)) {
if (VerboseErrors)
errs() << toString(std::move(E)) << "\n";
else {
diff --git a/contrib/llvm/tools/bugpoint/ExecutionDriver.cpp b/contrib/llvm/tools/bugpoint/ExecutionDriver.cpp
index 7562aa603bbb..773bad69fae0 100644
--- a/contrib/llvm/tools/bugpoint/ExecutionDriver.cpp
+++ b/contrib/llvm/tools/bugpoint/ExecutionDriver.cpp
@@ -265,11 +265,9 @@ Error BugDriver::initializeExecutionEnvironment() {
return Error::success();
}
-/// compileProgram - Try to compile the specified module, returning false and
-/// setting Error if an error occurs. This is used for code generation
-/// crash testing.
-///
-Error BugDriver::compileProgram(Module *M) const {
+/// Try to compile the specified module, returning false and setting Error if an
+/// error occurs. This is used for code generation crash testing.
+Error BugDriver::compileProgram(Module &M) const {
// Emit the program to a bitcode file...
auto Temp =
sys::fs::TempFile::create(OutputPrefix + "-test-program-%%%%%%%.bc");
@@ -290,11 +288,10 @@ Error BugDriver::compileProgram(Module *M) const {
return Interpreter->compileProgram(Temp->TmpName, Timeout, MemoryLimit);
}
-/// executeProgram - This method runs "Program", capturing the output of the
-/// program to a file, returning the filename of the file. A recommended
-/// filename may be optionally specified.
-///
-Expected<std::string> BugDriver::executeProgram(const Module *Program,
+/// This method runs "Program", capturing the output of the program to a file,
+/// returning the filename of the file. A recommended filename may be
+/// optionally specified.
+Expected<std::string> BugDriver::executeProgram(const Module &Program,
std::string OutputFile,
std::string BitcodeFile,
const std::string &SharedObj,
@@ -373,11 +370,10 @@ Expected<std::string> BugDriver::executeProgram(const Module *Program,
return OutputFile;
}
-/// executeProgramSafely - Used to create reference output with the "safe"
-/// backend, if reference output is not provided.
-///
+/// Used to create reference output with the "safe" backend, if reference output
+/// is not provided.
Expected<std::string>
-BugDriver::executeProgramSafely(const Module *Program,
+BugDriver::executeProgramSafely(const Module &Program,
const std::string &OutputFile) const {
return executeProgram(Program, OutputFile, "", "", SafeInterpreter);
}
@@ -404,16 +400,14 @@ BugDriver::compileSharedObject(const std::string &BitcodeFile) {
return SharedObjectFile;
}
-/// createReferenceFile - calls compileProgram and then records the output
-/// into ReferenceOutputFile. Returns true if reference file created, false
-/// otherwise. Note: initializeExecutionEnvironment should be called BEFORE
-/// this function.
-///
-Error BugDriver::createReferenceFile(Module *M, const std::string &Filename) {
- if (Error E = compileProgram(Program))
+/// Calls compileProgram and then records the output into ReferenceOutputFile.
+/// Returns true if reference file created, false otherwise. Note:
+/// initializeExecutionEnvironment should be called BEFORE this function.
+Error BugDriver::createReferenceFile(Module &M, const std::string &Filename) {
+ if (Error E = compileProgram(*Program))
return E;
- Expected<std::string> Result = executeProgramSafely(Program, Filename);
+ Expected<std::string> Result = executeProgramSafely(*Program, Filename);
if (Error E = Result.takeError()) {
if (Interpreter != SafeInterpreter) {
E = joinErrors(
@@ -432,12 +426,11 @@ Error BugDriver::createReferenceFile(Module *M, const std::string &Filename) {
return Error::success();
}
-/// diffProgram - This method executes the specified module and diffs the
-/// output against the file specified by ReferenceOutputFile. If the output
-/// is different, 1 is returned. If there is a problem with the code
-/// generator (e.g., llc crashes), this will set ErrMsg.
-///
-Expected<bool> BugDriver::diffProgram(const Module *Program,
+/// This method executes the specified module and diffs the output against the
+/// file specified by ReferenceOutputFile. If the output is different, 1 is
+/// returned. If there is a problem with the code generator (e.g., llc
+/// crashes), this will set ErrMsg.
+Expected<bool> BugDriver::diffProgram(const Module &Program,
const std::string &BitcodeFile,
const std::string &SharedObject,
bool RemoveBitcode) const {
diff --git a/contrib/llvm/tools/bugpoint/ExtractFunction.cpp b/contrib/llvm/tools/bugpoint/ExtractFunction.cpp
index 431dcedfe203..48f1575c25eb 100644
--- a/contrib/llvm/tools/bugpoint/ExtractFunction.cpp
+++ b/contrib/llvm/tools/bugpoint/ExtractFunction.cpp
@@ -85,7 +85,7 @@ std::unique_ptr<Module>
BugDriver::deleteInstructionFromProgram(const Instruction *I,
unsigned Simplification) {
// FIXME, use vmap?
- Module *Clone = CloneModule(Program).release();
+ std::unique_ptr<Module> Clone = CloneModule(*Program);
const BasicBlock *PBB = I->getParent();
const Function *PF = PBB->getParent();
@@ -118,8 +118,7 @@ BugDriver::deleteInstructionFromProgram(const Instruction *I,
Passes.push_back("simplifycfg"); // Delete dead control flow
Passes.push_back("verify");
- std::unique_ptr<Module> New = runPassesOn(Clone, Passes);
- delete Clone;
+ std::unique_ptr<Module> New = runPassesOn(Clone.get(), Passes);
if (!New) {
errs() << "Instruction removal failed. Sorry. :( Please report a bug!\n";
exit(1);
@@ -128,7 +127,8 @@ BugDriver::deleteInstructionFromProgram(const Instruction *I,
}
std::unique_ptr<Module>
-BugDriver::performFinalCleanups(Module *M, bool MayModifySemantics) {
+BugDriver::performFinalCleanups(std::unique_ptr<Module> M,
+ bool MayModifySemantics) {
// Make all functions external, so GlobalDCE doesn't delete them...
for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
I->setLinkage(GlobalValue::ExternalLinkage);
@@ -141,12 +141,11 @@ BugDriver::performFinalCleanups(Module *M, bool MayModifySemantics) {
else
CleanupPasses.push_back("deadargelim");
- std::unique_ptr<Module> New = runPassesOn(M, CleanupPasses);
+ std::unique_ptr<Module> New = runPassesOn(M.get(), CleanupPasses);
if (!New) {
errs() << "Final cleanups failed. Sorry. :( Please report a bug!\n";
return nullptr;
}
- delete M;
return New;
}
@@ -157,7 +156,7 @@ std::unique_ptr<Module> BugDriver::extractLoop(Module *M) {
std::unique_ptr<Module> NewM = runPassesOn(M, LoopExtractPasses);
if (!NewM) {
outs() << "*** Loop extraction failed: ";
- EmitProgressBitcode(M, "loopextraction", true);
+ EmitProgressBitcode(*M, "loopextraction", true);
outs() << "*** Sorry. :( Please report a bug!\n";
return nullptr;
}
@@ -319,15 +318,15 @@ llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
}
ValueToValueMapTy NewVMap;
- std::unique_ptr<Module> New = CloneModule(M, NewVMap);
+ std::unique_ptr<Module> New = CloneModule(*M, NewVMap);
// Remove the Test functions from the Safe module
std::set<Function *> TestFunctions;
for (unsigned i = 0, e = F.size(); i != e; ++i) {
Function *TNOF = cast<Function>(VMap[F[i]]);
- DEBUG(errs() << "Removing function ");
- DEBUG(TNOF->printAsOperand(errs(), false));
- DEBUG(errs() << "\n");
+ LLVM_DEBUG(errs() << "Removing function ");
+ LLVM_DEBUG(TNOF->printAsOperand(errs(), false));
+ LLVM_DEBUG(errs() << "\n");
TestFunctions.insert(cast<Function>(NewVMap[TNOF]));
DeleteFunctionBody(TNOF); // Function is now external in this module!
}
@@ -378,15 +377,21 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
outs() << "*** Basic Block extraction failed!\n";
errs() << "Error creating temporary file: " << toString(Temp.takeError())
<< "\n";
- EmitProgressBitcode(M, "basicblockextractfail", true);
+ EmitProgressBitcode(*M, "basicblockextractfail", true);
return nullptr;
}
DiscardTemp Discard{*Temp};
+ // Extract all of the blocks except the ones in BBs.
+ SmallVector<BasicBlock *, 32> BlocksToExtract;
+ for (Function &F : *M)
+ for (BasicBlock &BB : F)
+ // Check if this block is going to be extracted.
+ if (std::find(BBs.begin(), BBs.end(), &BB) == BBs.end())
+ BlocksToExtract.push_back(&BB);
+
raw_fd_ostream OS(Temp->FD, /*shouldClose*/ false);
- for (std::vector<BasicBlock *>::const_iterator I = BBs.begin(), E = BBs.end();
- I != E; ++I) {
- BasicBlock *BB = *I;
+ for (BasicBlock *BB : BBs) {
// If the BB doesn't have a name, give it one so we have something to key
// off of.
if (!BB->hasName())
@@ -396,7 +401,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
OS.flush();
if (OS.has_error()) {
errs() << "Error writing list of blocks to not extract\n";
- EmitProgressBitcode(M, "basicblockextractfail", true);
+ EmitProgressBitcode(*M, "basicblockextractfail", true);
OS.clear_error();
return nullptr;
}
@@ -411,7 +416,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
if (!Ret) {
outs() << "*** Basic Block extraction failed, please report a bug!\n";
- EmitProgressBitcode(M, "basicblockextractfail", true);
+ EmitProgressBitcode(*M, "basicblockextractfail", true);
}
return Ret;
}
diff --git a/contrib/llvm/tools/bugpoint/FindBugs.cpp b/contrib/llvm/tools/bugpoint/FindBugs.cpp
index 40502cbf9495..a695e875b787 100644
--- a/contrib/llvm/tools/bugpoint/FindBugs.cpp
+++ b/contrib/llvm/tools/bugpoint/FindBugs.cpp
@@ -32,7 +32,7 @@ BugDriver::runManyPasses(const std::vector<std::string> &AllPasses) {
outs() << "\n";
if (ReferenceOutputFile.empty()) {
outs() << "Generating reference output from raw program: \n";
- if (Error E = createReferenceFile(Program))
+ if (Error E = createReferenceFile(*Program))
return E;
}
@@ -53,7 +53,7 @@ BugDriver::runManyPasses(const std::vector<std::string> &AllPasses) {
}
std::string Filename;
- if (runPasses(Program, PassesToRun, Filename, false)) {
+ if (runPasses(*Program, PassesToRun, Filename, false)) {
outs() << "\n";
outs() << "Optimizer passes caused failure!\n\n";
return debugOptimizerCrash();
@@ -65,7 +65,7 @@ BugDriver::runManyPasses(const std::vector<std::string> &AllPasses) {
// Step 3: Compile the optimized code.
//
outs() << "Running the code generator to test for a crash: ";
- if (Error E = compileProgram(Program)) {
+ if (Error E = compileProgram(*Program)) {
outs() << "\n*** compileProgram threw an exception: ";
outs() << toString(std::move(E));
return debugCodeGeneratorCrash();
@@ -77,7 +77,7 @@ BugDriver::runManyPasses(const std::vector<std::string> &AllPasses) {
// output (created above).
//
outs() << "*** Checking if passes caused miscompliation:\n";
- Expected<bool> Diff = diffProgram(Program, Filename, "", false);
+ Expected<bool> Diff = diffProgram(*Program, Filename, "", false);
if (Error E = Diff.takeError()) {
errs() << toString(std::move(E));
return debugCodeGeneratorCrash();
diff --git a/contrib/llvm/tools/bugpoint/Miscompilation.cpp b/contrib/llvm/tools/bugpoint/Miscompilation.cpp
index 80f4cea23481..375bee7a0d50 100644
--- a/contrib/llvm/tools/bugpoint/Miscompilation.cpp
+++ b/contrib/llvm/tools/bugpoint/Miscompilation.cpp
@@ -152,8 +152,8 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
<< "' passes compile correctly after the '" << getPassesString(Prefix)
<< "' passes: ";
- std::unique_ptr<Module> OriginalInput(
- BD.swapProgramIn(PrefixOutput.release()));
+ std::unique_ptr<Module> OriginalInput =
+ BD.swapProgramIn(std::move(PrefixOutput));
if (BD.runPasses(BD.getProgram(), Suffix, BitcodeResult, false /*delete*/,
true /*quiet*/)) {
errs() << " Error running this sequence of passes"
@@ -179,7 +179,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
// Otherwise, we must not be running the bad pass anymore.
outs() << " yup.\n"; // No miscompilation!
// Restore orig program & free test.
- delete BD.swapProgramIn(OriginalInput.release());
+ BD.setNewProgram(std::move(OriginalInput));
return NoFailure;
}
@@ -230,23 +230,22 @@ static Expected<std::unique_ptr<Module>> testMergedProgram(const BugDriver &BD,
const Module &M2,
bool &Broken) {
// Resulting merge of M1 and M2.
- auto Merged = CloneModule(&M1);
- if (Linker::linkModules(*Merged, CloneModule(&M2)))
+ auto Merged = CloneModule(M1);
+ if (Linker::linkModules(*Merged, CloneModule(M2)))
// TODO: Shouldn't we thread the error up instead of exiting?
exit(1);
// Execute the program.
- Expected<bool> Diff = BD.diffProgram(Merged.get(), "", "", false);
+ Expected<bool> Diff = BD.diffProgram(*Merged, "", "", false);
if (Error E = Diff.takeError())
return std::move(E);
Broken = *Diff;
return std::move(Merged);
}
-/// TestFuncs - split functions in a Module into two groups: those that are
-/// under consideration for miscompilation vs. those that are not, and test
+/// split functions in a Module into two groups: those that are under
+/// consideration for miscompilation vs. those that are not, and test
/// accordingly. Each group of functions becomes a separate Module.
-///
Expected<bool>
ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
// Test to see if the function is misoptimized if we ONLY run it on the
@@ -266,8 +265,8 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
// we can conclude that a function triggers the bug when in fact one
// needs a larger set of original functions to do so.
ValueToValueMapTy VMap;
- Module *Clone = CloneModule(BD.getProgram(), VMap).release();
- Module *Orig = BD.swapProgramIn(Clone);
+ std::unique_ptr<Module> Clone = CloneModule(BD.getProgram(), VMap);
+ std::unique_ptr<Module> Orig = BD.swapProgramIn(std::move(Clone));
std::vector<Function *> FuncsOnClone;
for (unsigned i = 0, e = Funcs.size(); i != e; ++i) {
@@ -284,19 +283,18 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
Expected<bool> Broken =
TestFn(BD, std::move(ToOptimize), std::move(ToNotOptimize));
- delete BD.swapProgramIn(Orig);
+ BD.setNewProgram(std::move(Orig));
return Broken;
}
-/// DisambiguateGlobalSymbols - Give anonymous global values names.
-///
-static void DisambiguateGlobalSymbols(Module *M) {
- for (Module::global_iterator I = M->global_begin(), E = M->global_end();
- I != E; ++I)
+/// Give anonymous global values names.
+static void DisambiguateGlobalSymbols(Module &M) {
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
+ ++I)
if (!I->hasName())
I->setName("anon_global");
- for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
if (!I->hasName())
I->setName("anon_fn");
}
@@ -317,17 +315,14 @@ ExtractLoops(BugDriver &BD,
ValueToValueMapTy VMap;
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
- Module *ToOptimize = SplitFunctionsOutOfModule(ToNotOptimize.get(),
- MiscompiledFunctions, VMap)
- .release();
+ std::unique_ptr<Module> ToOptimize = SplitFunctionsOutOfModule(
+ ToNotOptimize.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> ToOptimizeLoopExtracted =
- BD.extractLoop(ToOptimize);
- if (!ToOptimizeLoopExtracted) {
+ BD.extractLoop(ToOptimize.get());
+ if (!ToOptimizeLoopExtracted)
// If the loop extractor crashed or if there were no extractible loops,
// then this chapter of our odyssey is over with.
- delete ToOptimize;
return MadeChange;
- }
errs() << "Extracted a loop from the breaking portion of the program.\n";
@@ -346,10 +341,9 @@ ExtractLoops(BugDriver &BD,
return false;
// Delete the original and set the new program.
- Module *Old = BD.swapProgramIn(New->release());
+ std::unique_ptr<Module> Old = BD.swapProgramIn(std::move(*New));
for (unsigned i = 0, e = MiscompiledFunctions.size(); i != e; ++i)
MiscompiledFunctions[i] = cast<Function>(VMap[MiscompiledFunctions[i]]);
- delete Old;
if (Failure) {
BD.switchToInterpreter(AI);
@@ -360,25 +354,23 @@ ExtractLoops(BugDriver &BD,
errs() << " Continuing on with un-loop-extracted version.\n";
BD.writeProgramToFile(OutputPrefix + "-loop-extract-fail-tno.bc",
- ToNotOptimize.get());
+ *ToNotOptimize);
BD.writeProgramToFile(OutputPrefix + "-loop-extract-fail-to.bc",
- ToOptimize);
+ *ToOptimize);
BD.writeProgramToFile(OutputPrefix + "-loop-extract-fail-to-le.bc",
- ToOptimizeLoopExtracted.get());
+ *ToOptimizeLoopExtracted);
errs() << "Please submit the " << OutputPrefix
<< "-loop-extract-fail-*.bc files.\n";
- delete ToOptimize;
return MadeChange;
}
- delete ToOptimize;
BD.switchToInterpreter(AI);
outs() << " Testing after loop extraction:\n";
// Clone modules, the tester function will free them.
std::unique_ptr<Module> TOLEBackup =
- CloneModule(ToOptimizeLoopExtracted.get(), VMap);
- std::unique_ptr<Module> TNOBackup = CloneModule(ToNotOptimize.get(), VMap);
+ CloneModule(*ToOptimizeLoopExtracted, VMap);
+ std::unique_ptr<Module> TNOBackup = CloneModule(*ToNotOptimize, VMap);
for (unsigned i = 0, e = MiscompiledFunctions.size(); i != e; ++i)
MiscompiledFunctions[i] = cast<Function>(VMap[MiscompiledFunctions[i]]);
@@ -413,7 +405,7 @@ ExtractLoops(BugDriver &BD,
MiscompiledFunctions.push_back(NewF);
}
- BD.setNewProgram(ToNotOptimize.release());
+ BD.setNewProgram(std::move(ToNotOptimize));
return MadeChange;
}
@@ -444,7 +436,7 @@ ExtractLoops(BugDriver &BD,
MiscompiledFunctions.push_back(NewF);
}
- BD.setNewProgram(ToNotOptimize.release());
+ BD.setNewProgram(std::move(ToNotOptimize));
MadeChange = true;
}
}
@@ -508,8 +500,8 @@ ReduceMiscompiledBlocks::TestFuncs(const std::vector<BasicBlock *> &BBs) {
// Split the module into the two halves of the program we want.
ValueToValueMapTy VMap;
- Module *Clone = CloneModule(BD.getProgram(), VMap).release();
- Module *Orig = BD.swapProgramIn(Clone);
+ std::unique_ptr<Module> Clone = CloneModule(BD.getProgram(), VMap);
+ std::unique_ptr<Module> Orig = BD.swapProgramIn(std::move(Clone));
std::vector<Function *> FuncsOnClone;
std::vector<BasicBlock *> BBsOnClone;
for (unsigned i = 0, e = FunctionsBeingTested.size(); i != e; ++i) {
@@ -531,10 +523,10 @@ ReduceMiscompiledBlocks::TestFuncs(const std::vector<BasicBlock *> &BBs) {
if (std::unique_ptr<Module> New =
BD.extractMappedBlocksFromModule(BBsOnClone, ToOptimize.get())) {
Expected<bool> Ret = TestFn(BD, std::move(New), std::move(ToNotOptimize));
- delete BD.swapProgramIn(Orig);
+ BD.setNewProgram(std::move(Orig));
return Ret;
}
- delete BD.swapProgramIn(Orig);
+ BD.setNewProgram(std::move(Orig));
return false;
}
@@ -577,23 +569,19 @@ ExtractBlocks(BugDriver &BD,
}
ValueToValueMapTy VMap;
- Module *ProgClone = CloneModule(BD.getProgram(), VMap).release();
- Module *ToExtract =
- SplitFunctionsOutOfModule(ProgClone, MiscompiledFunctions, VMap)
- .release();
+ std::unique_ptr<Module> ProgClone = CloneModule(BD.getProgram(), VMap);
+ std::unique_ptr<Module> ToExtract =
+ SplitFunctionsOutOfModule(ProgClone.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> Extracted =
- BD.extractMappedBlocksFromModule(Blocks, ToExtract);
+ BD.extractMappedBlocksFromModule(Blocks, ToExtract.get());
if (!Extracted) {
// Weird, extraction should have worked.
errs() << "Nondeterministic problem extracting blocks??\n";
- delete ProgClone;
- delete ToExtract;
return false;
}
// Otherwise, block extraction succeeded. Link the two program fragments back
// together.
- delete ToExtract;
std::vector<std::pair<std::string, FunctionType *>> MisCompFunctions;
for (Module::iterator I = Extracted->begin(), E = Extracted->end(); I != E;
@@ -605,7 +593,7 @@ ExtractBlocks(BugDriver &BD,
exit(1);
// Set the new program and delete the old one.
- BD.setNewProgram(ProgClone);
+ BD.setNewProgram(std::move(ProgClone));
// Update the list of miscompiled functions.
MiscompiledFunctions.clear();
@@ -631,8 +619,8 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
// miscompiled... first build a list of all of the non-external functions in
// the program.
std::vector<Function *> MiscompiledFunctions;
- Module *Prog = BD.getProgram();
- for (Function &F : *Prog)
+ Module &Prog = BD.getProgram();
+ for (Function &F : Prog)
if (!F.isDeclaration())
MiscompiledFunctions.push_back(&F);
@@ -718,8 +706,8 @@ static Expected<bool> TestOptimizer(BugDriver &BD, std::unique_ptr<Module> Test,
if (!Optimized) {
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
- delete BD.swapProgramIn(Test.get());
- BD.EmitProgressBitcode(Test.get(), "pass-error", false);
+ BD.setNewProgram(std::move(Test));
+ BD.EmitProgressBitcode(*Test, "pass-error", false);
if (Error E = BD.debugOptimizerCrash())
return std::move(E);
return false;
@@ -734,7 +722,7 @@ static Expected<bool> TestOptimizer(BugDriver &BD, std::unique_ptr<Module> Test,
if (auto New = std::move(*Result)) {
outs() << (Broken ? " nope.\n" : " yup.\n");
// Delete the original and set the new program.
- delete BD.swapProgramIn(New.release());
+ BD.setNewProgram(std::move(New));
}
return Broken;
}
@@ -760,7 +748,7 @@ Error BugDriver::debugMiscompilation() {
outs() << "\n*** Found miscompiling pass"
<< (getPassesToRun().size() == 1 ? "" : "es") << ": "
<< getPassesString(getPassesToRun()) << '\n';
- EmitProgressBitcode(Program, "passinput");
+ EmitProgressBitcode(*Program, "passinput");
Expected<std::vector<Function *>> MiscompiledFunctions =
DebugAMiscompilation(*this, TestOptimizer);
@@ -776,11 +764,11 @@ Error BugDriver::debugMiscompilation() {
.release();
outs() << " Non-optimized portion: ";
- EmitProgressBitcode(ToNotOptimize, "tonotoptimize", true);
+ EmitProgressBitcode(*ToNotOptimize, "tonotoptimize", true);
delete ToNotOptimize; // Delete hacked module.
outs() << " Portion that is input to optimizer: ";
- EmitProgressBitcode(ToOptimize, "tooptimize");
+ EmitProgressBitcode(*ToOptimize, "tooptimize");
delete ToOptimize; // Delete hacked module.
return Error::success();
@@ -788,15 +776,15 @@ Error BugDriver::debugMiscompilation() {
/// Get the specified modules ready for code generator testing.
///
-static void CleanupAndPrepareModules(BugDriver &BD,
- std::unique_ptr<Module> &Test,
- Module *Safe) {
+static std::unique_ptr<Module>
+CleanupAndPrepareModules(BugDriver &BD, std::unique_ptr<Module> Test,
+ Module *Safe) {
// Clean up the modules, removing extra cruft that we don't need anymore...
- Test = BD.performFinalCleanups(Test.get());
+ Test = BD.performFinalCleanups(std::move(Test));
// If we are executing the JIT, we have several nasty issues to take care of.
if (!BD.isExecutingJIT())
- return;
+ return Test;
// First, if the main function is in the Safe module, we must add a stub to
// the Test module to call into it. Thus, we create a new function `main'
@@ -942,6 +930,8 @@ static void CleanupAndPrepareModules(BugDriver &BD,
errs() << "Bugpoint has a bug, which corrupted a module!!\n";
abort();
}
+
+ return Test;
}
/// This is the predicate function used to check to see if the "Test" portion of
@@ -951,7 +941,7 @@ static void CleanupAndPrepareModules(BugDriver &BD,
static Expected<bool> TestCodeGenerator(BugDriver &BD,
std::unique_ptr<Module> Test,
std::unique_ptr<Module> Safe) {
- CleanupAndPrepareModules(BD, Test, Safe.get());
+ Test = CleanupAndPrepareModules(BD, std::move(Test), Safe.get());
SmallString<128> TestModuleBC;
int TestModuleFD;
@@ -962,7 +952,7 @@ static Expected<bool> TestCodeGenerator(BugDriver &BD,
<< "Error making unique filename: " << EC.message() << "\n";
exit(1);
}
- if (BD.writeProgramToFile(TestModuleBC.str(), TestModuleFD, Test.get())) {
+ if (BD.writeProgramToFile(TestModuleBC.str(), TestModuleFD, *Test)) {
errs() << "Error writing bitcode to `" << TestModuleBC.str()
<< "'\nExiting.";
exit(1);
@@ -981,7 +971,7 @@ static Expected<bool> TestCodeGenerator(BugDriver &BD,
exit(1);
}
- if (BD.writeProgramToFile(SafeModuleBC.str(), SafeModuleFD, Safe.get())) {
+ if (BD.writeProgramToFile(SafeModuleBC.str(), SafeModuleFD, *Safe)) {
errs() << "Error writing bitcode to `" << SafeModuleBC << "'\nExiting.";
exit(1);
}
@@ -1015,7 +1005,7 @@ static Expected<bool> TestCodeGenerator(BugDriver &BD,
Error BugDriver::debugCodeGenerator() {
if ((void *)SafeInterpreter == (void *)Interpreter) {
Expected<std::string> Result =
- executeProgramSafely(Program, "bugpoint.safe.out");
+ executeProgramSafely(*Program, "bugpoint.safe.out");
if (Result) {
outs() << "\n*** The \"safe\" i.e. 'known good' backend cannot match "
<< "the reference diff. This may be due to a\n front-end "
@@ -1028,7 +1018,7 @@ Error BugDriver::debugCodeGenerator() {
return Error::success();
}
- DisambiguateGlobalSymbols(Program);
+ DisambiguateGlobalSymbols(*Program);
Expected<std::vector<Function *>> Funcs =
DebugAMiscompilation(*this, TestCodeGenerator);
@@ -1042,7 +1032,8 @@ Error BugDriver::debugCodeGenerator() {
SplitFunctionsOutOfModule(ToNotCodeGen.get(), *Funcs, VMap);
// Condition the modules
- CleanupAndPrepareModules(*this, ToCodeGen, ToNotCodeGen.get());
+ ToCodeGen =
+ CleanupAndPrepareModules(*this, std::move(ToCodeGen), ToNotCodeGen.get());
SmallString<128> TestModuleBC;
int TestModuleFD;
@@ -1054,7 +1045,7 @@ Error BugDriver::debugCodeGenerator() {
exit(1);
}
- if (writeProgramToFile(TestModuleBC.str(), TestModuleFD, ToCodeGen.get())) {
+ if (writeProgramToFile(TestModuleBC.str(), TestModuleFD, *ToCodeGen)) {
errs() << "Error writing bitcode to `" << TestModuleBC << "'\nExiting.";
exit(1);
}
@@ -1070,8 +1061,7 @@ Error BugDriver::debugCodeGenerator() {
exit(1);
}
- if (writeProgramToFile(SafeModuleBC.str(), SafeModuleFD,
- ToNotCodeGen.get())) {
+ if (writeProgramToFile(SafeModuleBC.str(), SafeModuleFD, *ToNotCodeGen)) {
errs() << "Error writing bitcode to `" << SafeModuleBC << "'\nExiting.";
exit(1);
}
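
Several helpers in Miscompilation.cpp also move from "mutate a unique_ptr passed by reference" to "take ownership by value and return it", as CleanupAndPrepareModules does above. A rough sketch of that signature shape, with performCleanups standing in for the real cleanup work (both functions here are illustrative, not the actual bugpoint code):

    #include "llvm/IR/Module.h"
    #include <memory>
    #include <utility>
    using namespace llvm;

    // Placeholder: consumes the module and returns the (possibly rebuilt) one.
    static std::unique_ptr<Module> performCleanups(std::unique_ptr<Module> M) {
      return M;
    }

    static std::unique_ptr<Module> prepareModule(std::unique_ptr<Module> Test,
                                                 bool ExecutingJIT) {
      Test = performCleanups(std::move(Test));
      if (!ExecutingJIT)
        return Test; // every exit path hands ownership back to the caller
      // ... JIT-specific fix-ups would be applied to *Test here ...
      return Test;
    }

The call sites then read Test = prepareModule(std::move(Test), ...), which keeps the ownership transfer visible at each step.
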
diff --git a/contrib/llvm/tools/bugpoint/OptimizerDriver.cpp b/contrib/llvm/tools/bugpoint/OptimizerDriver.cpp
index ee3f2f0174d2..cbb048db8fe7 100644
--- a/contrib/llvm/tools/bugpoint/OptimizerDriver.cpp
+++ b/contrib/llvm/tools/bugpoint/OptimizerDriver.cpp
@@ -43,18 +43,14 @@ static cl::opt<bool> PreserveBitcodeUseListOrder(
cl::desc("Preserve use-list order when writing LLVM bitcode."),
cl::init(true), cl::Hidden);
-// ChildOutput - This option captures the name of the child output file that
-// is set up by the parent bugpoint process
-static cl::opt<std::string> ChildOutput("child-output", cl::ReallyHidden);
static cl::opt<std::string>
OptCmd("opt-command", cl::init(""),
cl::desc("Path to opt. (default: search path "
"for 'opt'.)"));
-/// writeProgramToFile - This writes the current "Program" to the named bitcode
-/// file. If an error occurs, true is returned.
-///
-static bool writeProgramToFileAux(ToolOutputFile &Out, const Module *M) {
+/// This writes the current "Program" to the named bitcode file. If an error
+/// occurs, true is returned.
+static bool writeProgramToFileAux(ToolOutputFile &Out, const Module &M) {
WriteBitcodeToFile(M, Out.os(), PreserveBitcodeUseListOrder);
Out.os().close();
if (!Out.os().has_error()) {
@@ -65,12 +61,12 @@ static bool writeProgramToFileAux(ToolOutputFile &Out, const Module *M) {
}
bool BugDriver::writeProgramToFile(const std::string &Filename, int FD,
- const Module *M) const {
+ const Module &M) const {
ToolOutputFile Out(Filename, FD);
return writeProgramToFileAux(Out, M);
}
-bool BugDriver::writeProgramToFile(int FD, const Module *M) const {
+bool BugDriver::writeProgramToFile(int FD, const Module &M) const {
raw_fd_ostream OS(FD, /*shouldClose*/ false);
WriteBitcodeToFile(M, OS, PreserveBitcodeUseListOrder);
OS.flush();
@@ -81,7 +77,7 @@ bool BugDriver::writeProgramToFile(int FD, const Module *M) const {
}
bool BugDriver::writeProgramToFile(const std::string &Filename,
- const Module *M) const {
+ const Module &M) const {
std::error_code EC;
ToolOutputFile Out(Filename, EC, sys::fs::F_None);
if (!EC)
@@ -89,10 +85,9 @@ bool BugDriver::writeProgramToFile(const std::string &Filename,
return true;
}
-/// EmitProgressBitcode - This function is used to output the current Program
-/// to a file named "bugpoint-ID.bc".
-///
-void BugDriver::EmitProgressBitcode(const Module *M, const std::string &ID,
+/// This function is used to output the current Program to a file named
+/// "bugpoint-ID.bc".
+void BugDriver::EmitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer) const {
// Output the input to the current pass to a bitcode file, emit a message
// telling the user how to reproduce it: opt -foo blah.bc
@@ -132,7 +127,7 @@ static cl::list<std::string> OptArgs("opt-args", cl::Positional,
/// outs() a single line message indicating whether compilation was successful
/// or failed.
///
-bool BugDriver::runPasses(Module *Program,
+bool BugDriver::runPasses(Module &Program,
const std::vector<std::string> &Passes,
std::string &OutputFilename, bool DeleteOutput,
bool Quiet, unsigned NumExtraArgs,
@@ -180,6 +175,10 @@ bool BugDriver::runPasses(Module *Program,
errs() << "Cannot find `opt' in PATH!\n";
return 1;
}
+ if (!sys::fs::exists(tool)) {
+ errs() << "Specified `opt' binary does not exist: " << tool << "\n";
+ return 1;
+ }
std::string Prog;
if (UseValgrind) {
@@ -195,20 +194,20 @@ bool BugDriver::runPasses(Module *Program,
}
// setup the child process' arguments
- SmallVector<const char *, 8> Args;
+ SmallVector<StringRef, 8> Args;
if (UseValgrind) {
Args.push_back("valgrind");
Args.push_back("--error-exitcode=1");
Args.push_back("-q");
- Args.push_back(tool.c_str());
+ Args.push_back(tool);
} else
- Args.push_back(tool.c_str());
+ Args.push_back(tool);
for (unsigned i = 0, e = OptArgs.size(); i != e; ++i)
- Args.push_back(OptArgs[i].c_str());
+ Args.push_back(OptArgs[i]);
Args.push_back("-disable-symbolication");
Args.push_back("-o");
- Args.push_back(OutputFilename.c_str());
+ Args.push_back(OutputFilename);
std::vector<std::string> pass_args;
for (unsigned i = 0, e = PluginLoader::getNumPlugins(); i != e; ++i) {
pass_args.push_back(std::string("-load"));
@@ -225,12 +224,11 @@ bool BugDriver::runPasses(Module *Program,
Args.push_back(Temp->TmpName.c_str());
for (unsigned i = 0; i < NumExtraArgs; ++i)
Args.push_back(*ExtraArgs);
- Args.push_back(nullptr);
- DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = Args.size() - 1; i != e; ++i) errs()
- << " " << Args[i];
- errs() << "\n";);
+ LLVM_DEBUG(errs() << "\nAbout to run:\t";
+ for (unsigned i = 0, e = Args.size() - 1; i != e; ++i) errs()
+ << " " << Args[i];
+ errs() << "\n";);
Optional<StringRef> Redirects[3] = {None, None, None};
// Redirect stdout and stderr to nowhere if SilencePasses is given.
@@ -240,8 +238,8 @@ bool BugDriver::runPasses(Module *Program,
}
std::string ErrMsg;
- int result = sys::ExecuteAndWait(Prog, Args.data(), nullptr, Redirects,
- Timeout, MemoryLimit, &ErrMsg);
+ int result = sys::ExecuteAndWait(Prog, Args, None, Redirects, Timeout,
+ MemoryLimit, &ErrMsg);
// If we are supposed to delete the bitcode file or if the passes crashed,
// remove it now. This may fail if the file was never created, but that's ok.
@@ -271,7 +269,7 @@ std::unique_ptr<Module>
BugDriver::runPassesOn(Module *M, const std::vector<std::string> &Passes,
unsigned NumExtraArgs, const char *const *ExtraArgs) {
std::string BitcodeResult;
- if (runPasses(M, Passes, BitcodeResult, false /*delete*/, true /*quiet*/,
+ if (runPasses(*M, Passes, BitcodeResult, false /*delete*/, true /*quiet*/,
NumExtraArgs, ExtraArgs)) {
return nullptr;
}
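
The OptimizerDriver.cpp hunks track the LLVM 7 Support/Program.h interface: argument vectors become ArrayRef<StringRef> with no trailing nullptr sentinel, and the environment is passed as an Optional. A self-contained sketch of the new call shape, with a made-up helper name and tool paths:

    #include "llvm/ADT/Optional.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Program.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>
    using namespace llvm;

    static int runOptOnFile(StringRef OptPath, StringRef InputBC,
                            StringRef OutputBC) {
      // Args[0] is the program name; no terminating nullptr entry is needed.
      SmallVector<StringRef, 8> Args = {OptPath, "-O2", "-o", OutputBC, InputBC};

      // Leave stdin/stdout/stderr alone; None means "inherit the environment".
      Optional<StringRef> Redirects[3] = {None, None, None};
      std::string ErrMsg;
      int Result = sys::ExecuteAndWait(OptPath, Args, None, Redirects,
                                       /*SecondsToWait=*/0, /*MemoryLimit=*/0,
                                       &ErrMsg);
      if (Result != 0)
        errs() << "opt failed: " << ErrMsg << "\n";
      return Result;
    }
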
diff --git a/contrib/llvm/tools/bugpoint/ToolRunner.cpp b/contrib/llvm/tools/bugpoint/ToolRunner.cpp
index 8094dfdd78fa..812e8e3bbae5 100644
--- a/contrib/llvm/tools/bugpoint/ToolRunner.cpp
+++ b/contrib/llvm/tools/bugpoint/ToolRunner.cpp
@@ -53,13 +53,14 @@ cl::opt<std::string>
/// RunProgramWithTimeout - This function provides an alternate interface
/// to the sys::Program::ExecuteAndWait interface.
/// @see sys::Program::ExecuteAndWait
-static int RunProgramWithTimeout(StringRef ProgramPath, const char **Args,
- StringRef StdInFile, StringRef StdOutFile,
- StringRef StdErrFile, unsigned NumSeconds = 0,
+static int RunProgramWithTimeout(StringRef ProgramPath,
+ ArrayRef<StringRef> Args, StringRef StdInFile,
+ StringRef StdOutFile, StringRef StdErrFile,
+ unsigned NumSeconds = 0,
unsigned MemoryLimit = 0,
std::string *ErrMsg = nullptr) {
Optional<StringRef> Redirects[3] = {StdInFile, StdOutFile, StdErrFile};
- return sys::ExecuteAndWait(ProgramPath, Args, nullptr, Redirects, NumSeconds,
+ return sys::ExecuteAndWait(ProgramPath, Args, None, Redirects, NumSeconds,
MemoryLimit, ErrMsg);
}
@@ -69,24 +70,22 @@ static int RunProgramWithTimeout(StringRef ProgramPath, const char **Args,
/// fails. Remote client is required to return 255 if it failed or program exit
/// code otherwise.
/// @see sys::Program::ExecuteAndWait
-static int RunProgramRemotelyWithTimeout(StringRef RemoteClientPath,
- const char **Args, StringRef StdInFile,
- StringRef StdOutFile,
- StringRef StdErrFile,
- unsigned NumSeconds = 0,
- unsigned MemoryLimit = 0) {
+static int RunProgramRemotelyWithTimeout(
+ StringRef RemoteClientPath, ArrayRef<StringRef> Args, StringRef StdInFile,
+ StringRef StdOutFile, StringRef StdErrFile, unsigned NumSeconds = 0,
+ unsigned MemoryLimit = 0) {
Optional<StringRef> Redirects[3] = {StdInFile, StdOutFile, StdErrFile};
// Run the program remotely with the remote client
- int ReturnCode = sys::ExecuteAndWait(RemoteClientPath, Args, nullptr,
- Redirects, NumSeconds, MemoryLimit);
+ int ReturnCode = sys::ExecuteAndWait(RemoteClientPath, Args, None, Redirects,
+ NumSeconds, MemoryLimit);
// Did the remote client fail?
if (255 == ReturnCode) {
std::ostringstream OS;
OS << "\nError running remote client:\n ";
- for (const char **Arg = Args; *Arg; ++Arg)
- OS << " " << *Arg;
+ for (StringRef Arg : Args)
+ OS << " " << Arg.str();
OS << "\n";
// The error message is in the output file, let's print it out from there.
@@ -105,12 +104,12 @@ static int RunProgramRemotelyWithTimeout(StringRef RemoteClientPath,
return ReturnCode;
}
-static Error ProcessFailure(StringRef ProgPath, const char **Args,
+static Error ProcessFailure(StringRef ProgPath, ArrayRef<StringRef> Args,
unsigned Timeout = 0, unsigned MemoryLimit = 0) {
std::ostringstream OS;
OS << "\nError running tool:\n ";
- for (const char **Arg = Args; *Arg; ++Arg)
- OS << " " << *Arg;
+ for (StringRef Arg : Args)
+ OS << " " << Arg.str();
OS << "\n";
// Rerun the compiler, capturing any error messages to print them.
@@ -171,7 +170,7 @@ Expected<int> LLI::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &CCArgs,
const std::vector<std::string> &SharedLibs,
unsigned Timeout, unsigned MemoryLimit) {
- std::vector<const char *> LLIArgs;
+ std::vector<StringRef> LLIArgs;
LLIArgs.push_back(LLIPath.c_str());
LLIArgs.push_back("-force-interpreter=true");
@@ -179,26 +178,25 @@ Expected<int> LLI::ExecuteProgram(const std::string &Bitcode,
e = SharedLibs.end();
i != e; ++i) {
LLIArgs.push_back("-load");
- LLIArgs.push_back((*i).c_str());
+ LLIArgs.push_back(*i);
}
// Add any extra LLI args.
for (unsigned i = 0, e = ToolArgs.size(); i != e; ++i)
- LLIArgs.push_back(ToolArgs[i].c_str());
+ LLIArgs.push_back(ToolArgs[i]);
- LLIArgs.push_back(Bitcode.c_str());
+ LLIArgs.push_back(Bitcode);
// Add optional parameters to the running program from Argv
for (unsigned i = 0, e = Args.size(); i != e; ++i)
- LLIArgs.push_back(Args[i].c_str());
- LLIArgs.push_back(nullptr);
+ LLIArgs.push_back(Args[i]);
outs() << "<lli>";
outs().flush();
- DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = LLIArgs.size() - 1; i != e; ++i) errs()
- << " " << LLIArgs[i];
- errs() << "\n";);
- return RunProgramWithTimeout(LLIPath, &LLIArgs[0], InputFile, OutputFile,
+ LLVM_DEBUG(errs() << "\nAbout to run:\t";
+ for (unsigned i = 0, e = LLIArgs.size() - 1; i != e; ++i) errs()
+ << " " << LLIArgs[i];
+ errs() << "\n";);
+ return RunProgramWithTimeout(LLIPath, LLIArgs, InputFile, OutputFile,
OutputFile, Timeout, MemoryLimit);
}
@@ -206,7 +204,7 @@ void AbstractInterpreter::anchor() {}
#if defined(LLVM_ON_UNIX)
const char EXESuffix[] = "";
-#elif defined(LLVM_ON_WIN32)
+#elif defined(_WIN32)
const char EXESuffix[] = "exe";
#endif
@@ -215,7 +213,7 @@ const char EXESuffix[] = "exe";
/// itself. This allows us to find another LLVM tool if it is built in the same
/// directory. An empty string is returned on error; note that this function
/// just manipulates the path and doesn't check for executability.
-/// @brief Find a named executable.
+/// Find a named executable.
static std::string PrependMainExecutablePath(const std::string &ExeName,
const char *Argv0,
void *MainAddr) {
@@ -285,22 +283,20 @@ public:
Error CustomCompiler::compileProgram(const std::string &Bitcode,
unsigned Timeout, unsigned MemoryLimit) {
- std::vector<const char *> ProgramArgs;
+ std::vector<StringRef> ProgramArgs;
ProgramArgs.push_back(CompilerCommand.c_str());
for (std::size_t i = 0; i < CompilerArgs.size(); ++i)
ProgramArgs.push_back(CompilerArgs.at(i).c_str());
- ProgramArgs.push_back(Bitcode.c_str());
- ProgramArgs.push_back(nullptr);
+ ProgramArgs.push_back(Bitcode);
// Add optional parameters to the running program from Argv
for (unsigned i = 0, e = CompilerArgs.size(); i != e; ++i)
ProgramArgs.push_back(CompilerArgs[i].c_str());
- if (RunProgramWithTimeout(CompilerCommand, &ProgramArgs[0], "", "", "",
- Timeout, MemoryLimit))
- return ProcessFailure(CompilerCommand, &ProgramArgs[0], Timeout,
- MemoryLimit);
+ if (RunProgramWithTimeout(CompilerCommand, ProgramArgs, "", "", "", Timeout,
+ MemoryLimit))
+ return ProcessFailure(CompilerCommand, ProgramArgs, Timeout, MemoryLimit);
return Error::success();
}
@@ -336,19 +332,18 @@ Expected<int> CustomExecutor::ExecuteProgram(
const std::vector<std::string> &SharedLibs, unsigned Timeout,
unsigned MemoryLimit) {
- std::vector<const char *> ProgramArgs;
- ProgramArgs.push_back(ExecutionCommand.c_str());
+ std::vector<StringRef> ProgramArgs;
+ ProgramArgs.push_back(ExecutionCommand);
for (std::size_t i = 0; i < ExecutorArgs.size(); ++i)
- ProgramArgs.push_back(ExecutorArgs.at(i).c_str());
- ProgramArgs.push_back(Bitcode.c_str());
- ProgramArgs.push_back(nullptr);
+ ProgramArgs.push_back(ExecutorArgs[i]);
+ ProgramArgs.push_back(Bitcode);
// Add optional parameters to the running program from Argv
for (unsigned i = 0, e = Args.size(); i != e; ++i)
- ProgramArgs.push_back(Args[i].c_str());
+ ProgramArgs.push_back(Args[i]);
- return RunProgramWithTimeout(ExecutionCommand, &ProgramArgs[0], InputFile,
+ return RunProgramWithTimeout(ExecutionCommand, ProgramArgs, InputFile,
OutputFile, OutputFile, Timeout, MemoryLimit);
}
@@ -463,31 +458,28 @@ Expected<CC::FileType> LLC::OutputCode(const std::string &Bitcode,
exit(1);
}
OutputAsmFile = UniqueFile.str();
- std::vector<const char *> LLCArgs;
- LLCArgs.push_back(LLCPath.c_str());
+ std::vector<StringRef> LLCArgs;
+ LLCArgs.push_back(LLCPath);
// Add any extra LLC args.
for (unsigned i = 0, e = ToolArgs.size(); i != e; ++i)
- LLCArgs.push_back(ToolArgs[i].c_str());
+ LLCArgs.push_back(ToolArgs[i]);
LLCArgs.push_back("-o");
- LLCArgs.push_back(OutputAsmFile.c_str()); // Output to the Asm file
- LLCArgs.push_back(Bitcode.c_str()); // This is the input bitcode
+ LLCArgs.push_back(OutputAsmFile); // Output to the Asm file
+ LLCArgs.push_back(Bitcode); // This is the input bitcode
if (UseIntegratedAssembler)
LLCArgs.push_back("-filetype=obj");
- LLCArgs.push_back(nullptr);
-
outs() << (UseIntegratedAssembler ? "<llc-ia>" : "<llc>");
outs().flush();
- DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = LLCArgs.size() - 1; i != e; ++i) errs()
- << " " << LLCArgs[i];
- errs() << "\n";);
- if (RunProgramWithTimeout(LLCPath, &LLCArgs[0], "", "", "", Timeout,
- MemoryLimit))
- return ProcessFailure(LLCPath, &LLCArgs[0], Timeout, MemoryLimit);
+ LLVM_DEBUG(errs() << "\nAbout to run:\t";
+ for (unsigned i = 0, e = LLCArgs.size() - 1; i != e; ++i) errs()
+ << " " << LLCArgs[i];
+ errs() << "\n";);
+ if (RunProgramWithTimeout(LLCPath, LLCArgs, "", "", "", Timeout, MemoryLimit))
+ return ProcessFailure(LLCPath, LLCArgs, Timeout, MemoryLimit);
return UseIntegratedAssembler ? CC::ObjectFile : CC::AsmFile;
}
@@ -581,32 +573,31 @@ Expected<int> JIT::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &SharedLibs,
unsigned Timeout, unsigned MemoryLimit) {
// Construct a vector of parameters, incorporating those from the command-line
- std::vector<const char *> JITArgs;
+ std::vector<StringRef> JITArgs;
JITArgs.push_back(LLIPath.c_str());
JITArgs.push_back("-force-interpreter=false");
// Add any extra LLI args.
for (unsigned i = 0, e = ToolArgs.size(); i != e; ++i)
- JITArgs.push_back(ToolArgs[i].c_str());
+ JITArgs.push_back(ToolArgs[i]);
for (unsigned i = 0, e = SharedLibs.size(); i != e; ++i) {
JITArgs.push_back("-load");
- JITArgs.push_back(SharedLibs[i].c_str());
+ JITArgs.push_back(SharedLibs[i]);
}
JITArgs.push_back(Bitcode.c_str());
// Add optional parameters to the running program from Argv
for (unsigned i = 0, e = Args.size(); i != e; ++i)
- JITArgs.push_back(Args[i].c_str());
- JITArgs.push_back(nullptr);
+ JITArgs.push_back(Args[i]);
outs() << "<jit>";
outs().flush();
- DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = JITArgs.size() - 1; i != e; ++i) errs()
- << " " << JITArgs[i];
- errs() << "\n";);
- DEBUG(errs() << "\nSending output to " << OutputFile << "\n");
- return RunProgramWithTimeout(LLIPath, &JITArgs[0], InputFile, OutputFile,
+ LLVM_DEBUG(errs() << "\nAbout to run:\t";
+ for (unsigned i = 0, e = JITArgs.size() - 1; i != e; ++i) errs()
+ << " " << JITArgs[i];
+ errs() << "\n";);
+ LLVM_DEBUG(errs() << "\nSending output to " << OutputFile << "\n");
+ return RunProgramWithTimeout(LLIPath, JITArgs, InputFile, OutputFile,
OutputFile, Timeout, MemoryLimit);
}
@@ -630,15 +621,15 @@ AbstractInterpreter::createJIT(const char *Argv0, std::string &Message,
// CC abstraction
//
-static bool IsARMArchitecture(std::vector<const char *> Args) {
- for (std::vector<const char *>::const_iterator I = Args.begin(),
- E = Args.end();
- I != E; ++I) {
- if (StringRef(*I).equals_lower("-arch")) {
- ++I;
- if (I != E && StringRef(*I).startswith_lower("arm"))
- return true;
- }
+static bool IsARMArchitecture(std::vector<StringRef> Args) {
+ for (size_t I = 0; I < Args.size(); ++I) {
+ if (!Args[I].equals_lower("-arch"))
+ continue;
+ ++I;
+ if (I == Args.size())
+ break;
+ if (Args[I].startswith_lower("arm"))
+ return true;
}
return false;
@@ -651,9 +642,9 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
const std::string &OutputFile,
const std::vector<std::string> &ArgsForCC,
unsigned Timeout, unsigned MemoryLimit) {
- std::vector<const char *> CCArgs;
+ std::vector<StringRef> CCArgs;
- CCArgs.push_back(CCPath.c_str());
+ CCArgs.push_back(CCPath);
if (TargetTriple.getArch() == Triple::x86)
CCArgs.push_back("-m32");
@@ -661,7 +652,7 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
for (std::vector<std::string>::const_iterator I = ccArgs.begin(),
E = ccArgs.end();
I != E; ++I)
- CCArgs.push_back(I->c_str());
+ CCArgs.push_back(*I);
// Specify -x explicitly in case the extension is wonky
if (fileType != ObjectFile) {
@@ -680,7 +671,7 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
}
}
- CCArgs.push_back(ProgramFile.c_str()); // Specify the input filename.
+ CCArgs.push_back(ProgramFile); // Specify the input filename.
CCArgs.push_back("-x");
CCArgs.push_back("none");
@@ -693,51 +684,50 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
errs() << "Error making unique filename: " << EC.message() << "\n";
exit(1);
}
- CCArgs.push_back(OutputBinary.c_str()); // Output to the right file...
+ CCArgs.push_back(OutputBinary); // Output to the right file...
// Add any arguments intended for CC. We locate them here because this is
// most likely -L and -l options that need to come before other libraries but
// after the source. Other options won't be sensitive to placement on the
// command line, so this should be safe.
for (unsigned i = 0, e = ArgsForCC.size(); i != e; ++i)
- CCArgs.push_back(ArgsForCC[i].c_str());
+ CCArgs.push_back(ArgsForCC[i]);
CCArgs.push_back("-lm"); // Hard-code the math library...
CCArgs.push_back("-O2"); // Optimize the program a bit...
if (TargetTriple.getArch() == Triple::sparc)
CCArgs.push_back("-mcpu=v9");
- CCArgs.push_back(nullptr); // NULL terminator
outs() << "<CC>";
outs().flush();
- DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = CCArgs.size() - 1; i != e; ++i) errs()
- << " " << CCArgs[i];
- errs() << "\n";);
- if (RunProgramWithTimeout(CCPath, &CCArgs[0], "", "", ""))
- return ProcessFailure(CCPath, &CCArgs[0]);
+ LLVM_DEBUG(errs() << "\nAbout to run:\t";
+ for (unsigned i = 0, e = CCArgs.size() - 1; i != e; ++i) errs()
+ << " " << CCArgs[i];
+ errs() << "\n";);
+ if (RunProgramWithTimeout(CCPath, CCArgs, "", "", ""))
+ return ProcessFailure(CCPath, CCArgs);
- std::vector<const char *> ProgramArgs;
+ std::vector<StringRef> ProgramArgs;
// Declared here so that the destructor only runs after
// ProgramArgs is used.
std::string Exec;
if (RemoteClientPath.empty())
- ProgramArgs.push_back(OutputBinary.c_str());
+ ProgramArgs.push_back(OutputBinary);
else {
- ProgramArgs.push_back(RemoteClientPath.c_str());
- ProgramArgs.push_back(RemoteHost.c_str());
+ ProgramArgs.push_back(RemoteClientPath);
+ ProgramArgs.push_back(RemoteHost);
if (!RemoteUser.empty()) {
ProgramArgs.push_back("-l");
- ProgramArgs.push_back(RemoteUser.c_str());
+ ProgramArgs.push_back(RemoteUser);
}
if (!RemotePort.empty()) {
ProgramArgs.push_back("-p");
- ProgramArgs.push_back(RemotePort.c_str());
+ ProgramArgs.push_back(RemotePort);
}
if (!RemoteExtra.empty()) {
- ProgramArgs.push_back(RemoteExtra.c_str());
+ ProgramArgs.push_back(RemoteExtra);
}
// Full path to the binary. We need to cd to the exec directory because
@@ -747,28 +737,28 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
Exec += env_pwd;
Exec += "; ./";
Exec += OutputBinary.c_str();
- ProgramArgs.push_back(Exec.c_str());
+ ProgramArgs.push_back(Exec);
}
// Add optional parameters to the running program from Argv
for (unsigned i = 0, e = Args.size(); i != e; ++i)
- ProgramArgs.push_back(Args[i].c_str());
- ProgramArgs.push_back(nullptr); // NULL terminator
+ ProgramArgs.push_back(Args[i]);
// Now that we have a binary, run it!
outs() << "<program>";
outs().flush();
- DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = ProgramArgs.size() - 1; i != e; ++i) errs()
- << " " << ProgramArgs[i];
- errs() << "\n";);
+ LLVM_DEBUG(
+ errs() << "\nAbout to run:\t";
+ for (unsigned i = 0, e = ProgramArgs.size() - 1; i != e; ++i) errs()
+ << " " << ProgramArgs[i];
+ errs() << "\n";);
FileRemover OutputBinaryRemover(OutputBinary.str(), !SaveTemps);
if (RemoteClientPath.empty()) {
- DEBUG(errs() << "<run locally>");
+ LLVM_DEBUG(errs() << "<run locally>");
std::string Error;
- int ExitCode = RunProgramWithTimeout(OutputBinary.str(), &ProgramArgs[0],
+ int ExitCode = RunProgramWithTimeout(OutputBinary.str(), ProgramArgs,
InputFile, OutputFile, OutputFile,
Timeout, MemoryLimit, &Error);
// Treat a signal (usually SIGSEGV) or timeout as part of the program output
@@ -782,7 +772,7 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
} else {
outs() << "<run remotely>";
outs().flush();
- return RunProgramRemotelyWithTimeout(RemoteClientPath, &ProgramArgs[0],
+ return RunProgramRemotelyWithTimeout(RemoteClientPath, ProgramArgs,
InputFile, OutputFile, OutputFile,
Timeout, MemoryLimit);
}
@@ -800,9 +790,9 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType,
}
OutputFile = UniqueFilename.str();
- std::vector<const char *> CCArgs;
+ std::vector<StringRef> CCArgs;
- CCArgs.push_back(CCPath.c_str());
+ CCArgs.push_back(CCPath);
if (TargetTriple.getArch() == Triple::x86)
CCArgs.push_back("-m32");
@@ -810,7 +800,7 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType,
for (std::vector<std::string>::const_iterator I = ccArgs.begin(),
E = ccArgs.end();
I != E; ++I)
- CCArgs.push_back(I->c_str());
+ CCArgs.push_back(*I);
// Compile the C/asm file into a shared object
if (fileType != ObjectFile) {
@@ -818,7 +808,7 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType,
CCArgs.push_back(fileType == AsmFile ? "assembler" : "c");
}
CCArgs.push_back("-fno-strict-aliasing");
- CCArgs.push_back(InputFile.c_str()); // Specify the input filename.
+ CCArgs.push_back(InputFile); // Specify the input filename.
CCArgs.push_back("-x");
CCArgs.push_back("none");
if (TargetTriple.getArch() == Triple::sparc)
@@ -842,7 +832,7 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType,
CCArgs.push_back("-mcpu=v9");
CCArgs.push_back("-o");
- CCArgs.push_back(OutputFile.c_str()); // Output to the right filename.
+ CCArgs.push_back(OutputFile); // Output to the right filename.
CCArgs.push_back("-O2"); // Optimize the program a bit.
// Add any arguments intended for CC. We locate them here because this is
@@ -850,17 +840,16 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType,
// after the source. Other options won't be sensitive to placement on the
// command line, so this should be safe.
for (unsigned i = 0, e = ArgsForCC.size(); i != e; ++i)
- CCArgs.push_back(ArgsForCC[i].c_str());
- CCArgs.push_back(nullptr); // NULL terminator
+ CCArgs.push_back(ArgsForCC[i]);
outs() << "<CC>";
outs().flush();
- DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = CCArgs.size() - 1; i != e; ++i) errs()
- << " " << CCArgs[i];
- errs() << "\n";);
- if (RunProgramWithTimeout(CCPath, &CCArgs[0], "", "", ""))
- return ProcessFailure(CCPath, &CCArgs[0]);
+ LLVM_DEBUG(errs() << "\nAbout to run:\t";
+ for (unsigned i = 0, e = CCArgs.size() - 1; i != e; ++i) errs()
+ << " " << CCArgs[i];
+ errs() << "\n";);
+ if (RunProgramWithTimeout(CCPath, CCArgs, "", "", ""))
+ return ProcessFailure(CCPath, CCArgs);
return Error::success();
}
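
ToolRunner.cpp also picks up the DEBUG -> LLVM_DEBUG rename applied throughout this import; the body still runs only in assertion-enabled builds when -debug (or -debug-only=<tag>) is given. A small sketch of the macro as now spelled, with an illustrative DEBUG_TYPE tag that is not taken from the diff:

    #define DEBUG_TYPE "toolrunner-example"
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void dumpCommandLine(ArrayRef<StringRef> Args) {
      LLVM_DEBUG(errs() << "About to run:\t";
                 for (StringRef Arg : Args)
                   errs() << " " << Arg;
                 errs() << "\n";);
    }
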
diff --git a/contrib/llvm/tools/bugpoint/bugpoint.cpp b/contrib/llvm/tools/bugpoint/bugpoint.cpp
index ec1ca2e54968..f6b7d08455d4 100644
--- a/contrib/llvm/tools/bugpoint/bugpoint.cpp
+++ b/contrib/llvm/tools/bugpoint/bugpoint.cpp
@@ -15,17 +15,18 @@
#include "BugDriver.h"
#include "ToolRunner.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/LegacyPassNameParser.h"
#include "llvm/LinkAllIR.h"
#include "llvm/LinkAllPasses.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Process.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/Valgrind.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
@@ -117,9 +118,7 @@ void initializePollyPasses(llvm::PassRegistry &Registry);
int main(int argc, char **argv) {
#ifndef DEBUG_BUGPOINT
- llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
- llvm::PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
#endif
// Initialize passes
@@ -132,6 +131,7 @@ int main(int argc, char **argv) {
initializeAnalysis(Registry);
initializeTransformUtils(Registry);
initializeInstCombine(Registry);
+ initializeAggressiveInstCombine(Registry);
initializeInstrumentation(Registry);
initializeTarget(Registry);
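
bugpoint's main() now leans on InitLLVM, which bundles the signal handlers, the pretty-stack-trace frame, and the llvm_shutdown() call that the three removed statements used to set up individually. A minimal, hypothetical tool skeleton showing the same start-up pattern (the option and messages are invented for the example):

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/InitLLVM.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static cl::opt<std::string> InputFilename(cl::Positional,
                                              cl::desc("<input bitcode>"),
                                              cl::init("-"));

    int main(int argc, char **argv) {
      InitLLVM X(argc, argv); // replaces the hand-rolled start-up/shutdown code
      cl::ParseCommandLineOptions(argc, argv, "minimal InitLLVM example\n");
      outs() << "would process: " << InputFilename << "\n";
      return 0;
    }
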
diff --git a/contrib/llvm/tools/llc/llc.cpp b/contrib/llvm/tools/llc/llc.cpp
index a4810890f9b4..2329fb3e87c9 100644
--- a/contrib/llvm/tools/llc/llc.cpp
+++ b/contrib/llvm/tools/llc/llc.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
#include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
#include "llvm/CodeGen/LinkAllCodegenComponents.h"
#include "llvm/CodeGen/MIRParser/MIRParser.h"
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
@@ -40,14 +41,14 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/PluginLoader.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <memory>
@@ -66,6 +67,11 @@ InputLanguage("x", cl::desc("Input language ('ir' or 'mir')"));
static cl::opt<std::string>
OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename"));
+static cl::opt<std::string>
+ SplitDwarfOutputFile("split-dwarf-output",
+ cl::desc(".dwo output filename"),
+ cl::value_desc("filename"));
+
static cl::opt<unsigned>
TimeCompilations("time-compilations", cl::Hidden, cl::init(1u),
cl::value_desc("N"),
@@ -226,7 +232,7 @@ static std::unique_ptr<ToolOutputFile> GetOutputStream(const char *TargetName,
OpenFlags |= sys::fs::F_Text;
auto FDOut = llvm::make_unique<ToolOutputFile>(OutputFilename, EC, OpenFlags);
if (EC) {
- errs() << EC.message() << '\n';
+ WithColor::error() << EC.message() << '\n';
return nullptr;
}
@@ -262,20 +268,18 @@ static void InlineAsmDiagHandler(const SMDiagnostic &SMD, void *Context,
// For testing purposes, we print the LocCookie here.
if (LocCookie)
- errs() << "note: !srcloc = " << LocCookie << "\n";
+ WithColor::note() << "!srcloc = " << LocCookie << "\n";
}
// main - Entry point for the llc compiler.
//
int main(int argc, char **argv) {
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
+ InitLLVM X(argc, argv);
// Enable debug stream buffering.
EnableDebugBuffering = true;
LLVMContext Context;
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
// Initialize targets first, so that --version shows registered targets.
InitializeAllTargets();
@@ -327,7 +331,7 @@ int main(int argc, char **argv) {
YamlFile =
llvm::make_unique<ToolOutputFile>(RemarksFilename, EC, sys::fs::F_None);
if (EC) {
- errs() << EC.message() << '\n';
+ WithColor::error(errs(), argv[0]) << EC.message() << '\n';
return 1;
}
Context.setDiagnosticsOutputFile(
@@ -336,7 +340,8 @@ int main(int argc, char **argv) {
if (InputLanguage != "" && InputLanguage != "ir" &&
InputLanguage != "mir") {
- errs() << argv[0] << "Input language must be '', 'IR' or 'MIR'\n";
+ WithColor::error(errs(), argv[0])
+ << "input language must be '', 'IR' or 'MIR'\n";
return 1;
}
@@ -359,7 +364,8 @@ static bool addPass(PassManagerBase &PM, const char *argv0,
const PassRegistry *PR = PassRegistry::getPassRegistry();
const PassInfo *PI = PR->getPassInfo(PassName);
if (!PI) {
- errs() << argv0 << ": run-pass " << PassName << " is not registered.\n";
+ WithColor::error(errs(), argv0)
+ << "run-pass " << PassName << " is not registered.\n";
return true;
}
@@ -367,7 +373,8 @@ static bool addPass(PassManagerBase &PM, const char *argv0,
if (PI->getNormalCtor())
P = PI->getNormalCtor()();
else {
- errs() << argv0 << ": cannot create pass: " << PI->getPassName() << "\n";
+ WithColor::error(errs(), argv0)
+ << "cannot create pass: " << PI->getPassName() << "\n";
return true;
}
std::string Banner = std::string("After ") + std::string(P->getPassName());
@@ -395,17 +402,9 @@ static int compileModule(char **argv, LLVMContext &Context) {
if (MIR)
M = MIR->parseIRModule();
} else
- M = parseIRFile(InputFilename, Err, Context);
+ M = parseIRFile(InputFilename, Err, Context, false);
if (!M) {
- Err.print(argv[0], errs());
- return 1;
- }
-
- // Verify module immediately to catch problems before doInitialization() is
- // called on any passes.
- if (!NoVerify && verifyModule(*M, &errs())) {
- errs() << argv[0] << ": " << InputFilename
- << ": error: input module is broken!\n";
+ Err.print(argv[0], WithColor::error(errs(), argv[0]));
return 1;
}
@@ -425,7 +424,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
const Target *TheTarget = TargetRegistry::lookupTarget(MArch, TheTriple,
Error);
if (!TheTarget) {
- errs() << argv[0] << ": " << Error;
+ WithColor::error(errs(), argv[0]) << Error;
return 1;
}
@@ -434,7 +433,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
CodeGenOpt::Level OLvl = CodeGenOpt::Default;
switch (OptLevel) {
default:
- errs() << argv[0] << ": invalid optimization level.\n";
+ WithColor::error(errs(), argv[0]) << "invalid optimization level.\n";
return 1;
case ' ': break;
case '0': OLvl = CodeGenOpt::None; break;
@@ -473,6 +472,17 @@ static int compileModule(char **argv, LLVMContext &Context) {
GetOutputStream(TheTarget->getName(), TheTriple.getOS(), argv[0]);
if (!Out) return 1;
+ std::unique_ptr<ToolOutputFile> DwoOut;
+ if (!SplitDwarfOutputFile.empty()) {
+ std::error_code EC;
+ DwoOut = llvm::make_unique<ToolOutputFile>(SplitDwarfOutputFile, EC,
+ sys::fs::F_None);
+ if (EC) {
+ WithColor::error(errs(), argv[0]) << EC.message() << '\n';
+ return 1;
+ }
+ }
+
// Build up all of the passes that we want to do to the module.
legacy::PassManager PM;
@@ -487,14 +497,27 @@ static int compileModule(char **argv, LLVMContext &Context) {
// Add the target data from the target machine, if it exists, or the module.
M->setDataLayout(Target->createDataLayout());
+ // This needs to be done after setting datalayout since it calls verifier
+ // to check debug info whereas verifier relies on correct datalayout.
+ UpgradeDebugInfo(*M);
+
+ // Verify module immediately to catch problems before doInitialization() is
+ // called on any passes.
+ if (!NoVerify && verifyModule(*M, &errs())) {
+ std::string Prefix =
+ (Twine(argv[0]) + Twine(": ") + Twine(InputFilename)).str();
+ WithColor::error(errs(), Prefix) << "input module is broken!\n";
+ return 1;
+ }
+
// Override function attributes based on CPUStr, FeaturesStr, and command line
// flags.
setFunctionAttributes(CPUStr, FeaturesStr, *M);
if (RelaxAll.getNumOccurrences() > 0 &&
FileType != TargetMachine::CGFT_ObjectFile)
- errs() << argv[0]
- << ": warning: ignoring -mc-relax-all because filetype != obj";
+ WithColor::warning(errs(), argv[0])
+ << ": warning: ignoring -mc-relax-all because filetype != obj";
{
raw_pwrite_stream *OS = &Out->os();
@@ -518,13 +541,15 @@ static int compileModule(char **argv, LLVMContext &Context) {
// selection.
if (!RunPassNames->empty()) {
if (!MIR) {
- errs() << argv0 << ": run-pass is for .mir file only.\n";
+ WithColor::warning(errs(), argv[0])
+ << "run-pass is for .mir file only.\n";
return 1;
}
TargetPassConfig &TPC = *LLVMTM.createPassConfig(PM);
if (TPC.hasLimitedCodeGenPipeline()) {
- errs() << argv0 << ": run-pass cannot be used with "
- << TPC.getLimitedCodeGenPipelineReason(" and ") << ".\n";
+ WithColor::warning(errs(), argv[0])
+ << "run-pass cannot be used with "
+ << TPC.getLimitedCodeGenPipelineReason(" and ") << ".\n";
return 1;
}
@@ -539,9 +564,12 @@ static int compileModule(char **argv, LLVMContext &Context) {
TPC.setInitialized();
PM.add(createPrintMIRPass(*OS));
PM.add(createFreeMachineFunctionPass());
- } else if (Target->addPassesToEmitFile(PM, *OS, FileType, NoVerify, MMI)) {
- errs() << argv0 << ": target does not support generation of this"
- << " file type!\n";
+ } else if (Target->addPassesToEmitFile(PM, *OS,
+ DwoOut ? &DwoOut->os() : nullptr,
+ FileType, NoVerify, MMI)) {
+ WithColor::warning(errs(), argv[0])
+ << "target does not support generation of this"
+ << " file type!\n";
return 1;
}
@@ -560,7 +588,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
// in the future.
SmallVector<char, 0> CompileTwiceBuffer;
if (CompileTwice) {
- std::unique_ptr<Module> M2(llvm::CloneModule(M.get()));
+ std::unique_ptr<Module> M2(llvm::CloneModule(*M));
PM.run(*M2);
CompileTwiceBuffer = Buffer;
Buffer.clear();
@@ -596,6 +624,8 @@ static int compileModule(char **argv, LLVMContext &Context) {
// Declare success.
Out->keep();
+ if (DwoOut)
+ DwoOut->keep();
return 0;
}
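
The llc.cpp changes route diagnostics through WithColor instead of writing prefixes to errs() by hand. A short sketch of those helpers with an arbitrary tool name and messages; each call prints a colored "error:"/"warning:"/"note:" prefix when the stream supports color and returns the stream for ordinary chaining:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/WithColor.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void reportOpenFailure(StringRef ToolName, StringRef File) {
      WithColor::error(errs(), ToolName) << "cannot open '" << File << "'\n";
      WithColor::warning(errs(), ToolName) << "continuing without it\n";
      WithColor::note() << "see -help for the expected inputs\n";
    }
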
diff --git a/contrib/llvm/tools/lli/OrcLazyJIT.cpp b/contrib/llvm/tools/lli/OrcLazyJIT.cpp
deleted file mode 100644
index f1a752e0790d..000000000000
--- a/contrib/llvm/tools/lli/OrcLazyJIT.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-//===- OrcLazyJIT.cpp - Basic Orc-based JIT for lazy execution ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "OrcLazyJIT.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <system_error>
-
-using namespace llvm;
-
-namespace {
-
-enum class DumpKind {
- NoDump,
- DumpFuncsToStdOut,
- DumpModsToStdOut,
- DumpModsToDisk
-};
-
-} // end anonymous namespace
-
-static cl::opt<DumpKind> OrcDumpKind(
- "orc-lazy-debug", cl::desc("Debug dumping for the orc-lazy JIT."),
- cl::init(DumpKind::NoDump),
- cl::values(clEnumValN(DumpKind::NoDump, "no-dump", "Don't dump anything."),
- clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout",
- "Dump function names to stdout."),
- clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout",
- "Dump modules to stdout."),
- clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk",
- "Dump modules to the current "
- "working directory. (WARNING: "
- "will overwrite existing files).")),
- cl::Hidden);
-
-static cl::opt<bool> OrcInlineStubs("orc-lazy-inline-stubs",
- cl::desc("Try to inline stubs"),
- cl::init(true), cl::Hidden);
-
-OrcLazyJIT::TransformFtor OrcLazyJIT::createDebugDumper() {
- switch (OrcDumpKind) {
- case DumpKind::NoDump:
- return [](std::shared_ptr<Module> M) { return M; };
-
- case DumpKind::DumpFuncsToStdOut:
- return [](std::shared_ptr<Module> M) {
- printf("[ ");
-
- for (const auto &F : *M) {
- if (F.isDeclaration())
- continue;
-
- if (F.hasName()) {
- std::string Name(F.getName());
- printf("%s ", Name.c_str());
- } else
- printf("<anon> ");
- }
-
- printf("]\n");
- return M;
- };
-
- case DumpKind::DumpModsToStdOut:
- return [](std::shared_ptr<Module> M) {
- outs() << "----- Module Start -----\n" << *M
- << "----- Module End -----\n";
-
- return M;
- };
-
- case DumpKind::DumpModsToDisk:
- return [](std::shared_ptr<Module> M) {
- std::error_code EC;
- raw_fd_ostream Out(M->getModuleIdentifier() + ".ll", EC,
- sys::fs::F_Text);
- if (EC) {
- errs() << "Couldn't open " << M->getModuleIdentifier()
- << " for dumping.\nError:" << EC.message() << "\n";
- exit(1);
- }
- Out << *M;
- return M;
- };
- }
- llvm_unreachable("Unknown DumpKind");
-}
-
-// Defined in lli.cpp.
-CodeGenOpt::Level getOptLevel();
-
-template <typename PtrTy>
-static PtrTy fromTargetAddress(JITTargetAddress Addr) {
- return reinterpret_cast<PtrTy>(static_cast<uintptr_t>(Addr));
-}
-
-int llvm::runOrcLazyJIT(std::vector<std::unique_ptr<Module>> Ms,
- const std::vector<std::string> &Args) {
- // Add the program's symbols into the JIT's search space.
- if (sys::DynamicLibrary::LoadLibraryPermanently(nullptr)) {
- errs() << "Error loading program symbols.\n";
- return 1;
- }
-
- // Grab a target machine and try to build a factory function for the
- // target-specific Orc callback manager.
- EngineBuilder EB;
- EB.setOptLevel(getOptLevel());
- auto TM = std::unique_ptr<TargetMachine>(EB.selectTarget());
- Triple T(TM->getTargetTriple());
- auto CompileCallbackMgr = orc::createLocalCompileCallbackManager(T, 0);
-
- // If we couldn't build the factory function then there must not be a callback
- // manager for this target. Bail out.
- if (!CompileCallbackMgr) {
- errs() << "No callback manager available for target '"
- << TM->getTargetTriple().str() << "'.\n";
- return 1;
- }
-
- auto IndirectStubsMgrBuilder = orc::createLocalIndirectStubsManagerBuilder(T);
-
- // If we couldn't build a stubs-manager-builder for this target then bail out.
- if (!IndirectStubsMgrBuilder) {
- errs() << "No indirect stubs manager available for target '"
- << TM->getTargetTriple().str() << "'.\n";
- return 1;
- }
-
- // Everything looks good. Build the JIT.
- OrcLazyJIT J(std::move(TM), std::move(CompileCallbackMgr),
- std::move(IndirectStubsMgrBuilder),
- OrcInlineStubs);
-
- // Add the module, look up main and run it.
- for (auto &M : Ms)
- cantFail(J.addModule(std::shared_ptr<Module>(std::move(M))));
-
- if (auto MainSym = J.findSymbol("main")) {
- typedef int (*MainFnPtr)(int, const char*[]);
- std::vector<const char *> ArgV;
- for (auto &Arg : Args)
- ArgV.push_back(Arg.c_str());
- auto Main = fromTargetAddress<MainFnPtr>(cantFail(MainSym.getAddress()));
- return Main(ArgV.size(), (const char**)ArgV.data());
- } else if (auto Err = MainSym.takeError())
- logAllUnhandledErrors(std::move(Err), llvm::errs(), "");
- else
- errs() << "Could not find main function.\n";
-
- return 1;
-}
diff --git a/contrib/llvm/tools/lli/OrcLazyJIT.h b/contrib/llvm/tools/lli/OrcLazyJIT.h
deleted file mode 100644
index a5cc804bb045..000000000000
--- a/contrib/llvm/tools/lli/OrcLazyJIT.h
+++ /dev/null
@@ -1,201 +0,0 @@
-//===- OrcLazyJIT.h - Basic Orc-based JIT for lazy execution ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Simple Orc-based JIT. Uses the compile-on-demand layer to break up and
-// lazily compile modules.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLI_ORCLAZYJIT_H
-#define LLVM_TOOLS_LLI_ORCLAZYJIT_H
-
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
-#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
-#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
-#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
-#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
-#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
-#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-namespace llvm {
-
-class OrcLazyJIT {
-public:
-
- using CompileCallbackMgr = orc::JITCompileCallbackManager;
- using ObjLayerT = orc::RTDyldObjectLinkingLayer;
- using CompileLayerT = orc::IRCompileLayer<ObjLayerT, orc::SimpleCompiler>;
- using TransformFtor =
- std::function<std::shared_ptr<Module>(std::shared_ptr<Module>)>;
- using IRDumpLayerT = orc::IRTransformLayer<CompileLayerT, TransformFtor>;
- using CODLayerT = orc::CompileOnDemandLayer<IRDumpLayerT, CompileCallbackMgr>;
- using IndirectStubsManagerBuilder = CODLayerT::IndirectStubsManagerBuilderT;
- using ModuleHandleT = CODLayerT::ModuleHandleT;
-
- OrcLazyJIT(std::unique_ptr<TargetMachine> TM,
- std::unique_ptr<CompileCallbackMgr> CCMgr,
- IndirectStubsManagerBuilder IndirectStubsMgrBuilder,
- bool InlineStubs)
- : TM(std::move(TM)), DL(this->TM->createDataLayout()),
- CCMgr(std::move(CCMgr)),
- ObjectLayer([]() { return std::make_shared<SectionMemoryManager>(); }),
- CompileLayer(ObjectLayer, orc::SimpleCompiler(*this->TM)),
- IRDumpLayer(CompileLayer, createDebugDumper()),
- CODLayer(IRDumpLayer, extractSingleFunction, *this->CCMgr,
- std::move(IndirectStubsMgrBuilder), InlineStubs),
- CXXRuntimeOverrides(
- [this](const std::string &S) { return mangle(S); }) {}
-
- ~OrcLazyJIT() {
- // Run any destructors registered with __cxa_atexit.
- CXXRuntimeOverrides.runDestructors();
- // Run any IR destructors.
- for (auto &DtorRunner : IRStaticDestructorRunners)
- if (auto Err = DtorRunner.runViaLayer(CODLayer)) {
- // FIXME: OrcLazyJIT should probably take a "shutdownError" callback to
- // report these errors on.
- report_fatal_error(std::move(Err));
- }
- }
-
- Error addModule(std::shared_ptr<Module> M) {
- if (M->getDataLayout().isDefault())
- M->setDataLayout(DL);
-
- // Rename, bump linkage and record static constructors and destructors.
- // We have to do this before we hand over ownership of the module to the
- // JIT.
- std::vector<std::string> CtorNames, DtorNames;
- {
- unsigned CtorId = 0, DtorId = 0;
- for (auto Ctor : orc::getConstructors(*M)) {
- std::string NewCtorName = ("$static_ctor." + Twine(CtorId++)).str();
- Ctor.Func->setName(NewCtorName);
- Ctor.Func->setLinkage(GlobalValue::ExternalLinkage);
- Ctor.Func->setVisibility(GlobalValue::HiddenVisibility);
- CtorNames.push_back(mangle(NewCtorName));
- }
- for (auto Dtor : orc::getDestructors(*M)) {
- std::string NewDtorName = ("$static_dtor." + Twine(DtorId++)).str();
- Dtor.Func->setLinkage(GlobalValue::ExternalLinkage);
- Dtor.Func->setVisibility(GlobalValue::HiddenVisibility);
- DtorNames.push_back(mangle(Dtor.Func->getName()));
- Dtor.Func->setName(NewDtorName);
- }
- }
-
- // Symbol resolution order:
- // 1) Search the JIT symbols.
- // 2) Check for C++ runtime overrides.
- // 3) Search the host process (LLI)'s symbol table.
- if (!ModulesHandle) {
- auto Resolver =
- orc::createLambdaResolver(
- [this](const std::string &Name) -> JITSymbol {
- if (auto Sym = CODLayer.findSymbol(Name, true))
- return Sym;
- return CXXRuntimeOverrides.searchOverrides(Name);
- },
- [](const std::string &Name) {
- if (auto Addr =
- RTDyldMemoryManager::getSymbolAddressInProcess(Name))
- return JITSymbol(Addr, JITSymbolFlags::Exported);
- return JITSymbol(nullptr);
- }
- );
-
- // Add the module to the JIT.
- if (auto ModulesHandleOrErr =
- CODLayer.addModule(std::move(M), std::move(Resolver)))
- ModulesHandle = std::move(*ModulesHandleOrErr);
- else
- return ModulesHandleOrErr.takeError();
-
- } else if (auto Err = CODLayer.addExtraModule(*ModulesHandle, std::move(M)))
- return Err;
-
- // Run the static constructors, and save the static destructor runner for
- // execution when the JIT is torn down.
- orc::CtorDtorRunner<CODLayerT> CtorRunner(std::move(CtorNames),
- *ModulesHandle);
- if (auto Err = CtorRunner.runViaLayer(CODLayer))
- return Err;
-
- IRStaticDestructorRunners.emplace_back(std::move(DtorNames),
- *ModulesHandle);
-
- return Error::success();
- }
-
- JITSymbol findSymbol(const std::string &Name) {
- return CODLayer.findSymbol(mangle(Name), true);
- }
-
- JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name) {
- return CODLayer.findSymbolIn(H, mangle(Name), true);
- }
-
-private:
- std::string mangle(const std::string &Name) {
- std::string MangledName;
- {
- raw_string_ostream MangledNameStream(MangledName);
- Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
- }
- return MangledName;
- }
-
- static std::set<Function*> extractSingleFunction(Function &F) {
- std::set<Function*> Partition;
- Partition.insert(&F);
- return Partition;
- }
-
- static TransformFtor createDebugDumper();
-
- std::unique_ptr<TargetMachine> TM;
- DataLayout DL;
- SectionMemoryManager CCMgrMemMgr;
-
- std::unique_ptr<CompileCallbackMgr> CCMgr;
- ObjLayerT ObjectLayer;
- CompileLayerT CompileLayer;
- IRDumpLayerT IRDumpLayer;
- CODLayerT CODLayer;
-
- orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides;
- std::vector<orc::CtorDtorRunner<CODLayerT>> IRStaticDestructorRunners;
- llvm::Optional<CODLayerT::ModuleHandleT> ModulesHandle;
-};
-
-int runOrcLazyJIT(std::vector<std::unique_ptr<Module>> Ms,
- const std::vector<std::string> &Args);
-
-} // end namespace llvm
-
-#endif // LLVM_TOOLS_LLI_ORCLAZYJIT_H
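The deleted OrcLazyJIT.h above mangled lookup names through the module's DataLayout before querying the layer stack. As a minimal standalone sketch of that helper (assuming the LLVM headers from this import and any valid DataLayout, e.g. one taken from a TargetMachine):

// Sketch of the DataLayout-aware name mangling used by the removed header.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Mangler.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

static std::string mangleForJIT(const std::string &Name,
                                const llvm::DataLayout &DL) {
  std::string MangledName;
  {
    llvm::raw_string_ostream MangledNameStream(MangledName);
    // Applies the target's global prefix (e.g. '_' on Darwin) so the name
    // matches what the JIT'd objects actually export.
    llvm::Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
  }
  return MangledName;
}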
diff --git a/contrib/llvm/tools/lli/RemoteJITUtils.h b/contrib/llvm/tools/lli/RemoteJITUtils.h
index 4e948413865c..944881070c70 100644
--- a/contrib/llvm/tools/lli/RemoteJITUtils.h
+++ b/contrib/llvm/tools/lli/RemoteJITUtils.h
@@ -84,7 +84,7 @@ public:
this->MemMgr = std::move(MemMgr);
}
- void setResolver(std::shared_ptr<JITSymbolResolver> Resolver) {
+ void setResolver(std::shared_ptr<LegacyJITSymbolResolver> Resolver) {
this->Resolver = std::move(Resolver);
}
@@ -145,7 +145,7 @@ public:
private:
std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr;
- std::shared_ptr<JITSymbolResolver> Resolver;
+ std::shared_ptr<LegacyJITSymbolResolver> Resolver;
};
}
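The forwarding resolver above now stores a LegacyJITSymbolResolver rather than the new-style JITSymbolResolver. A minimal sketch of such a resolver, answering lookups from the host process only (assumes the JITSymbol.h API at this LLVM revision; the class name InProcessResolver is illustrative, not part of the tree):

// Hypothetical LegacyJITSymbolResolver that searches the lli process itself.
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include <string>

namespace {
class InProcessResolver : public llvm::LegacyJITSymbolResolver {
public:
  llvm::JITSymbol findSymbol(const std::string &Name) override {
    if (auto Addr =
            llvm::RTDyldMemoryManager::getSymbolAddressInProcess(Name))
      return llvm::JITSymbol(Addr, llvm::JITSymbolFlags::Exported);
    return llvm::JITSymbol(nullptr); // not found
  }
  llvm::JITSymbol findSymbolInLogicalDylib(const std::string &Name) override {
    return llvm::JITSymbol(nullptr); // nothing dylib-local in this sketch
  }
};
} // namespace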
diff --git a/contrib/llvm/tools/lli/lli.cpp b/contrib/llvm/tools/lli/lli.cpp
index a33c51d77877..1940dbd848cc 100644
--- a/contrib/llvm/tools/lli/lli.cpp
+++ b/contrib/llvm/tools/lli/lli.cpp
@@ -13,18 +13,20 @@
//
//===----------------------------------------------------------------------===//
-#include "OrcLazyJIT.h"
#include "RemoteJITUtils.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
#include "llvm/CodeGen/LinkAllCodegenComponents.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/ExecutionEngine/Interpreter.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/MCJIT.h"
#include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h"
#include "llvm/ExecutionEngine/OrcMCJITReplacement.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
@@ -33,6 +35,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ObjectFile.h"
@@ -40,18 +43,18 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Memory.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/PluginLoader.h"
-#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
#include <cerrno>
@@ -176,6 +179,28 @@ namespace {
cl::desc("Generate software floating point library calls"),
cl::init(false));
+ enum class DumpKind {
+ NoDump,
+ DumpFuncsToStdOut,
+ DumpModsToStdOut,
+ DumpModsToDisk
+ };
+
+ cl::opt<DumpKind> OrcDumpKind(
+ "orc-lazy-debug", cl::desc("Debug dumping for the orc-lazy JIT."),
+ cl::init(DumpKind::NoDump),
+ cl::values(clEnumValN(DumpKind::NoDump, "no-dump",
+ "Don't dump anything."),
+ clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout",
+ "Dump function names to stdout."),
+ clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout",
+ "Dump modules to stdout."),
+ clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk",
+ "Dump modules to the current "
+ "working directory. (WARNING: "
+ "will overwrite existing files).")),
+ cl::Hidden);
+
ExitOnError ExitOnErr;
}
@@ -295,7 +320,7 @@ static void addCygMingExtraModule(ExecutionEngine &EE, LLVMContext &Context,
CodeGenOpt::Level getOptLevel() {
switch (OptLevel) {
default:
- errs() << "lli: Invalid optimization level.\n";
+ WithColor::error(errs(), "lli") << "invalid optimization level.\n";
exit(1);
case '0': return CodeGenOpt::None;
case '1': return CodeGenOpt::Less;
@@ -312,14 +337,14 @@ static void reportError(SMDiagnostic Err, const char *ProgName) {
exit(1);
}
+int runOrcLazyJIT(LLVMContext &Ctx, std::vector<std::unique_ptr<Module>> Ms,
+ const std::vector<std::string> &Args);
+
//===----------------------------------------------------------------------===//
// main Driver function
//
int main(int argc, char **argv, char * const *envp) {
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
-
- atexit(llvm_shutdown); // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
if (argc > 1)
ExitOnErr.setBanner(std::string(argv[0]) + ": ");
@@ -358,7 +383,7 @@ int main(int argc, char **argv, char * const *envp) {
Args.push_back(InputFile);
for (auto &Arg : InputArgv)
Args.push_back(Arg);
- return runOrcLazyJIT(std::move(Ms), Args);
+ return runOrcLazyJIT(Context, std::move(Ms), Args);
}
if (EnableCacheManager) {
@@ -378,8 +403,8 @@ int main(int argc, char **argv, char * const *envp) {
std::string ErrorMsg;
EngineBuilder builder(std::move(Owner));
builder.setMArch(MArch);
- builder.setMCPU(MCPU);
- builder.setMAttrs(MAttrs);
+ builder.setMCPU(getCPUStr());
+ builder.setMAttrs(getFeatureList());
if (RelocModel.getNumOccurrences())
builder.setRelocationModel(RelocModel);
if (CMModel.getNumOccurrences())
@@ -407,8 +432,8 @@ int main(int argc, char **argv, char * const *envp) {
builder.setMCJITMemoryManager(
std::unique_ptr<RTDyldMemoryManager>(RTDyldMM));
} else if (RemoteMCJIT) {
- errs() << "error: Remote process execution does not work with the "
- "interpreter.\n";
+ WithColor::error(errs(), argv[0])
+ << "remote process execution does not work with the interpreter.\n";
exit(1);
}
@@ -423,9 +448,10 @@ int main(int argc, char **argv, char * const *envp) {
std::unique_ptr<ExecutionEngine> EE(builder.create());
if (!EE) {
if (!ErrorMsg.empty())
- errs() << argv[0] << ": error creating EE: " << ErrorMsg << "\n";
+ WithColor::error(errs(), argv[0])
+ << "error creating EE: " << ErrorMsg << "\n";
else
- errs() << argv[0] << ": unknown error creating EE!\n";
+ WithColor::error(errs(), argv[0]) << "unknown error creating EE!\n";
exit(1);
}
@@ -496,9 +522,13 @@ int main(int argc, char **argv, char * const *envp) {
JITEventListener::createOProfileJITEventListener());
EE->RegisterJITEventListener(
JITEventListener::createIntelJITEventListener());
+ if (!RemoteMCJIT)
+ EE->RegisterJITEventListener(
+ JITEventListener::createPerfJITEventListener());
if (!NoLazyCompilation && RemoteMCJIT) {
- errs() << "warning: remote mcjit does not support lazy compilation\n";
+ WithColor::warning(errs(), argv[0])
+ << "remote mcjit does not support lazy compilation\n";
NoLazyCompilation = true;
}
EE->DisableLazyCompilation(NoLazyCompilation);
@@ -524,7 +554,8 @@ int main(int argc, char **argv, char * const *envp) {
//
Function *EntryFn = Mod->getFunction(EntryFunc);
if (!EntryFn) {
- errs() << '\'' << EntryFunc << "\' function not found in module.\n";
+ WithColor::error(errs(), argv[0])
+ << '\'' << EntryFunc << "\' function not found in module.\n";
return -1;
}
@@ -537,16 +568,19 @@ int main(int argc, char **argv, char * const *envp) {
// remote JIT on Unix platforms.
if (RemoteMCJIT) {
#ifndef LLVM_ON_UNIX
- errs() << "Warning: host does not support external remote targets.\n"
- << " Defaulting to local execution\n";
+ WithColor::warning(errs(), argv[0])
+ << "host does not support external remote targets.\n";
+ WithColor::note() << "defaulting to local execution\n";
return -1;
#else
if (ChildExecPath.empty()) {
- errs() << "-remote-mcjit requires -mcjit-remote-process.\n";
+ WithColor::error(errs(), argv[0])
+ << "-remote-mcjit requires -mcjit-remote-process.\n";
exit(1);
} else if (!sys::fs::can_execute(ChildExecPath)) {
- errs() << "Unable to find usable child executable: '" << ChildExecPath
- << "'\n";
+ WithColor::error(errs(), argv[0])
+ << "unable to find usable child executable: '" << ChildExecPath
+ << "'\n";
return -1;
}
#endif
@@ -586,10 +620,11 @@ int main(int argc, char **argv, char * const *envp) {
ResultGV.IntVal = APInt(32, Result);
Args.push_back(ResultGV);
EE->runFunction(ExitF, Args);
- errs() << "ERROR: exit(" << Result << ") returned!\n";
+ WithColor::error(errs(), argv[0]) << "exit(" << Result << ") returned!\n";
abort();
} else {
- errs() << "ERROR: exit defined with wrong prototype!\n";
+ WithColor::error(errs(), argv[0])
+ << "exit defined with wrong prototype!\n";
abort();
}
} else {
@@ -602,13 +637,15 @@ int main(int argc, char **argv, char * const *envp) {
// Launch the remote process and get a channel to it.
std::unique_ptr<FDRawChannel> C = launchRemote();
if (!C) {
- errs() << "Failed to launch remote JIT.\n";
+ WithColor::error(errs(), argv[0]) << "failed to launch remote JIT.\n";
exit(1);
}
// Create a remote target client running over the channel.
+ llvm::orc::ExecutionSession ES;
+ ES.setErrorReporter([&](Error Err) { ExitOnErr(std::move(Err)); });
typedef orc::remote::OrcRemoteTargetClient MyRemote;
- auto R = ExitOnErr(MyRemote::Create(*C, ExitOnErr));
+ auto R = ExitOnErr(MyRemote::Create(*C, ES));
// Create a remote memory manager.
auto RemoteMM = ExitOnErr(R->createRemoteMemoryManager());
@@ -632,8 +669,8 @@ int main(int argc, char **argv, char * const *envp) {
// FIXME: argv and envp handling.
JITTargetAddress Entry = EE->getFunctionAddress(EntryFn->getName().str());
EE->finalizeObject();
- DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at 0x"
- << format("%llx", Entry) << "\n");
+ LLVM_DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at 0x"
+ << format("%llx", Entry) << "\n");
Result = ExitOnErr(R->callIntVoid(Entry));
// Like static constructors, the remote target MCJIT support doesn't handle
@@ -651,6 +688,130 @@ int main(int argc, char **argv, char * const *envp) {
return Result;
}
+static orc::IRTransformLayer2::TransformFunction createDebugDumper() {
+ switch (OrcDumpKind) {
+ case DumpKind::NoDump:
+ return [](std::unique_ptr<Module> M) { return M; };
+
+ case DumpKind::DumpFuncsToStdOut:
+ return [](std::unique_ptr<Module> M) {
+ printf("[ ");
+
+ for (const auto &F : *M) {
+ if (F.isDeclaration())
+ continue;
+
+ if (F.hasName()) {
+ std::string Name(F.getName());
+ printf("%s ", Name.c_str());
+ } else
+ printf("<anon> ");
+ }
+
+ printf("]\n");
+ return M;
+ };
+
+ case DumpKind::DumpModsToStdOut:
+ return [](std::unique_ptr<Module> M) {
+ outs() << "----- Module Start -----\n"
+ << *M << "----- Module End -----\n";
+
+ return M;
+ };
+
+ case DumpKind::DumpModsToDisk:
+ return [](std::unique_ptr<Module> M) {
+ std::error_code EC;
+ raw_fd_ostream Out(M->getModuleIdentifier() + ".ll", EC, sys::fs::F_Text);
+ if (EC) {
+ errs() << "Couldn't open " << M->getModuleIdentifier()
+ << " for dumping.\nError:" << EC.message() << "\n";
+ exit(1);
+ }
+ Out << *M;
+ return M;
+ };
+ }
+ llvm_unreachable("Unknown DumpKind");
+}
+
+int runOrcLazyJIT(LLVMContext &Ctx, std::vector<std::unique_ptr<Module>> Ms,
+ const std::vector<std::string> &Args) {
+ // Bail out early if no modules loaded.
+ if (Ms.empty())
+ return 0;
+
+ // Add lli's symbols into the JIT's search space.
+ std::string ErrMsg;
+ sys::DynamicLibrary LibLLI =
+ sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg);
+ if (!LibLLI.isValid()) {
+ errs() << "Error loading lli symbols: " << ErrMsg << ".\n";
+ return 1;
+ }
+
+ const auto &TT = Ms.front()->getTargetTriple();
+ orc::JITTargetMachineBuilder TMD =
+ TT.empty() ? ExitOnErr(orc::JITTargetMachineBuilder::detectHost())
+ : orc::JITTargetMachineBuilder(Triple(TT));
+
+ TMD.setArch(MArch)
+ .setCPU(getCPUStr())
+ .addFeatures(getFeatureList())
+ .setRelocationModel(RelocModel.getNumOccurrences()
+ ? Optional<Reloc::Model>(RelocModel)
+ : None)
+ .setCodeModel(CMModel.getNumOccurrences()
+ ? Optional<CodeModel::Model>(CMModel)
+ : None);
+ auto TM = ExitOnErr(TMD.createTargetMachine());
+ auto DL = TM->createDataLayout();
+ auto ES = llvm::make_unique<orc::ExecutionSession>();
+ auto J =
+ ExitOnErr(orc::LLLazyJIT::Create(std::move(ES), std::move(TM), DL, Ctx));
+
+ auto Dump = createDebugDumper();
+
+ J->setLazyCompileTransform(
+ [&](std::unique_ptr<Module> M) {
+ if (verifyModule(*M, &dbgs())) {
+ dbgs() << "Bad module: " << *M << "\n";
+ exit(1);
+ }
+ return Dump(std::move(M));
+ });
+ J->getMainVSO().setFallbackDefinitionGenerator(
+ orc::DynamicLibraryFallbackGenerator(
+ std::move(LibLLI), DL, [](orc::SymbolStringPtr) { return true; }));
+
+ orc::MangleAndInterner Mangle(J->getExecutionSession(), DL);
+ orc::LocalCXXRuntimeOverrides2 CXXRuntimeOverrides;
+ ExitOnErr(CXXRuntimeOverrides.enable(J->getMainVSO(), Mangle));
+
+ for (auto &M : Ms) {
+ orc::makeAllSymbolsExternallyAccessible(*M);
+ ExitOnErr(J->addLazyIRModule(std::move(M)));
+ }
+
+ ExitOnErr(J->runConstructors());
+
+ auto MainSym = ExitOnErr(J->lookup("main"));
+ typedef int (*MainFnPtr)(int, const char *[]);
+ std::vector<const char *> ArgV;
+ for (auto &Arg : Args)
+ ArgV.push_back(Arg.c_str());
+ auto Main =
+ reinterpret_cast<MainFnPtr>(static_cast<uintptr_t>(MainSym.getAddress()));
+ auto Result = Main(ArgV.size(), (const char **)ArgV.data());
+
+ ExitOnErr(J->runDestructors());
+
+ CXXRuntimeOverrides.runDestructors();
+
+ return Result;
+}
+
std::unique_ptr<FDRawChannel> launchRemote() {
#ifndef LLVM_ON_UNIX
llvm_unreachable("launchRemote not supported on non-Unix platforms");
diff --git a/contrib/llvm/tools/llvm-ar/llvm-ar.cpp b/contrib/llvm/tools/llvm-ar/llvm-ar.cpp
index ae7d1a7f1b7a..9023bdd1a0d6 100644
--- a/contrib/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/contrib/llvm/tools/llvm-ar/llvm-ar.cpp
@@ -15,8 +15,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
-#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/MachO.h"
@@ -26,15 +24,17 @@
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/StringSaver.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#if !defined(_MSC_VER) && !defined(__MINGW32__)
#include <unistd.h>
@@ -47,10 +47,76 @@ using namespace llvm;
// The name this program was invoked as.
static StringRef ToolName;
+// The basename of this program.
+static StringRef Stem;
+
+const char RanlibHelp[] = R"(
+OVERVIEW: LLVM Ranlib (llvm-ranlib)
+
+ This program generates an index to speed access to archives
+
+USAGE: llvm-ranlib <archive-file>
+
+OPTIONS:
+ -help - Display available options
+ -version - Display the version of this program
+)";
+
+const char ArHelp[] = R"(
+OVERVIEW: LLVM Archiver (llvm-ar)
+
+ This program archives bitcode files into single libraries
+
+USAGE: llvm-ar [options] [relpos] [count] <archive-file> [members]...
+
+OPTIONS:
+ -M -
+ -format - Archive format to create
+ =default - default
+ =gnu - gnu
+ =darwin - darwin
+ =bsd - bsd
+ -plugin=<string> - plugin (ignored for compatibility
+ -help - Display available options
+ -version - Display the version of this program
+
+OPERATIONS:
+ d[NsS] - delete file(s) from the archive
+ m[abiSs] - move file(s) in the archive
+ p[kN] - print file(s) found in the archive
+ q[ufsS] - quick append file(s) to the archive
+ r[abfiuRsS] - replace or insert file(s) into the archive
+ t - display contents of archive
+ x[No] - extract file(s) from the archive
+
+MODIFIERS (operation specific):
+ [a] - put file(s) after [relpos]
+ [b] - put file(s) before [relpos] (same as [i])
+ [D] - use zero for timestamps and uids/gids (default)
+ [i] - put file(s) before [relpos] (same as [b])
+ [o] - preserve original dates
+ [s] - create an archive index (cf. ranlib)
+ [S] - do not build a symbol table
+ [T] - create a thin archive
+ [u] - update only files newer than archive contents
+ [U] - use actual timestamps and uids/gids
+
+MODIFIERS (generic):
+ [c] - do not warn if the library had to be created
+ [v] - be verbose about actions taken
+)";
+
+void printHelpMessage() {
+ if (Stem.contains_lower("ranlib"))
+ outs() << RanlibHelp;
+ else if (Stem.contains_lower("ar"))
+ outs() << ArHelp;
+}
+
// Show the error message and exit.
LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
errs() << ToolName << ": " << Error << ".\n";
- cl::PrintHelpMessage();
+ printHelpMessage();
exit(1);
}
@@ -76,55 +142,18 @@ static void failIfError(Error E, Twine Context = "") {
});
}
-// llvm-ar/llvm-ranlib remaining positional arguments.
-static cl::list<std::string>
- RestOfArgs(cl::Positional, cl::ZeroOrMore,
- cl::desc("[relpos] [count] <archive-file> [members]..."));
+static SmallVector<const char *, 256> PositionalArgs;
-static cl::opt<bool> MRI("M", cl::desc(""));
-static cl::opt<std::string> Plugin("plugin", cl::desc("plugin (ignored for compatibility"));
+static bool MRI;
namespace {
-enum Format { Default, GNU, BSD, DARWIN };
+enum Format { Default, GNU, BSD, DARWIN, Unknown };
}
-static cl::opt<Format>
- FormatOpt("format", cl::desc("Archive format to create"),
- cl::values(clEnumValN(Default, "default", "default"),
- clEnumValN(GNU, "gnu", "gnu"),
- clEnumValN(DARWIN, "darwin", "darwin"),
- clEnumValN(BSD, "bsd", "bsd")));
+static Format FormatType = Default;
static std::string Options;
-// Provide additional help output explaining the operations and modifiers of
-// llvm-ar. This object instructs the CommandLine library to print the text of
-// the constructor when the --help option is given.
-static cl::extrahelp MoreHelp(
- "\nOPERATIONS:\n"
- " d[NsS] - delete file(s) from the archive\n"
- " m[abiSs] - move file(s) in the archive\n"
- " p[kN] - print file(s) found in the archive\n"
- " q[ufsS] - quick append file(s) to the archive\n"
- " r[abfiuRsS] - replace or insert file(s) into the archive\n"
- " t - display contents of archive\n"
- " x[No] - extract file(s) from the archive\n"
- "\nMODIFIERS (operation specific):\n"
- " [a] - put file(s) after [relpos]\n"
- " [b] - put file(s) before [relpos] (same as [i])\n"
- " [i] - put file(s) before [relpos] (same as [b])\n"
- " [o] - preserve original dates\n"
- " [s] - create an archive index (cf. ranlib)\n"
- " [S] - do not build a symbol table\n"
- " [T] - create a thin archive\n"
- " [u] - update only files newer than archive contents\n"
- "\nMODIFIERS (generic):\n"
- " [c] - do not warn if the library had to be created\n"
- " [v] - be verbose about actions taken\n"
-);
-
-static const char OptionChars[] = "dmpqrtxabiosSTucv";
-
// This enumeration delineates the kinds of operations on an archive
// that are permitted.
enum ArchiveOperation {
@@ -166,30 +195,23 @@ static std::vector<StringRef> Members;
// Extract the member filename from the command line for the [relpos] argument
// associated with a, b, and i modifiers
static void getRelPos() {
- if(RestOfArgs.size() == 0)
+ if (PositionalArgs.size() == 0)
fail("Expected [relpos] for a, b, or i modifier");
- RelPos = RestOfArgs[0];
- RestOfArgs.erase(RestOfArgs.begin());
-}
-
-static void getOptions() {
- if(RestOfArgs.size() == 0)
- fail("Expected options");
- Options = RestOfArgs[0];
- RestOfArgs.erase(RestOfArgs.begin());
+ RelPos = PositionalArgs[0];
+ PositionalArgs.erase(PositionalArgs.begin());
}
// Get the archive file name from the command line
static void getArchive() {
- if(RestOfArgs.size() == 0)
+ if (PositionalArgs.size() == 0)
fail("An archive name must be specified");
- ArchiveName = RestOfArgs[0];
- RestOfArgs.erase(RestOfArgs.begin());
+ ArchiveName = PositionalArgs[0];
+ PositionalArgs.erase(PositionalArgs.begin());
}
-// Copy over remaining items in RestOfArgs to our Members vector
+// Copy over remaining items in PositionalArgs to our Members vector
static void getMembers() {
- for (auto &Arg : RestOfArgs)
+ for (auto &Arg : PositionalArgs)
Members.push_back(Arg);
}
@@ -200,13 +222,11 @@ static void runMRIScript();
// modifier/operation pairs have not been violated.
static ArchiveOperation parseCommandLine() {
if (MRI) {
- if (!RestOfArgs.empty())
+ if (!PositionalArgs.empty() || !Options.empty())
fail("Cannot mix -M and other options");
runMRIScript();
}
- getOptions();
-
// Keep track of number of operations. We can only specify one
// per execution.
unsigned NumOperations = 0;
@@ -370,6 +390,7 @@ static void doExtract(StringRef Name, const object::Archive::Child &C) {
int FD;
failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD,
+ sys::fs::CD_CreateAlways,
sys::fs::F_None, Mode),
Name);
@@ -651,7 +672,7 @@ performWriteOperation(ArchiveOperation Operation,
NewMembers = computeNewArchiveMembers(Operation, OldArchive);
object::Archive::Kind Kind;
- switch (FormatOpt) {
+ switch (FormatType) {
case Default:
if (Thin)
Kind = object::Archive::K_GNU;
@@ -677,6 +698,8 @@ performWriteOperation(ArchiveOperation Operation,
fail("Only the gnu format has a thin mode");
Kind = object::Archive::K_DARWIN;
break;
+ case Unknown:
+ llvm_unreachable("");
}
Error E =
@@ -758,7 +781,7 @@ static int performOperation(ArchiveOperation Operation,
}
static void runMRIScript() {
- enum class MRICommand { AddLib, AddMod, Create, Save, End, Invalid };
+ enum class MRICommand { AddLib, AddMod, Create, Delete, Save, End, Invalid };
ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getSTDIN();
failIfError(Buf.getError());
@@ -779,6 +802,7 @@ static void runMRIScript() {
.Case("addlib", MRICommand::AddLib)
.Case("addmod", MRICommand::AddMod)
.Case("create", MRICommand::Create)
+ .Case("delete", MRICommand::Delete)
.Case("save", MRICommand::Save)
.Case("end", MRICommand::End)
.Default(MRICommand::Invalid);
@@ -813,6 +837,12 @@ static void runMRIScript() {
fail("File already saved");
ArchiveName = Rest;
break;
+ case MRICommand::Delete: {
+ StringRef Name = sys::path::filename(Rest);
+ llvm::erase_if(NewMembers,
+ [=](NewArchiveMember &M) { return M.MemberName == Name; });
+ break;
+ }
case MRICommand::Save:
Saved = true;
break;
@@ -829,67 +859,113 @@ static void runMRIScript() {
exit(0);
}
-static int ar_main() {
- // Do our own parsing of the command line because the CommandLine utility
- // can't handle the grouped positional parameters without a dash.
+static bool handleGenericOption(StringRef arg) {
+ if (arg == "-help" || arg == "--help") {
+ printHelpMessage();
+ return true;
+ }
+ if (arg == "-version" || arg == "--version") {
+ cl::PrintVersionMessage();
+ return true;
+ }
+ return false;
+}
+
+static int ar_main(int argc, char **argv) {
+ SmallVector<const char *, 0> Argv(argv, argv + argc);
+ BumpPtrAllocator Alloc;
+ StringSaver Saver(Alloc);
+ cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv);
+ for(size_t i = 1; i < Argv.size(); ++i) {
+ StringRef Arg = Argv[i];
+ const char *match;
+ auto MatchFlagWithArg = [&](const char *expected) {
+ size_t len = strlen(expected);
+ if (Arg == expected) {
+ if (++i >= Argv.size())
+ fail(std::string(expected) + " requires an argument");
+ match = Argv[i];
+ return true;
+ }
+ if (Arg.startswith(expected) && Arg.size() > len &&
+ Arg[len] == '=') {
+ match = Arg.data() + len + 1;
+ return true;
+ }
+ return false;
+ };
+ if (handleGenericOption(Argv[i]))
+ return 0;
+ if (Arg == "--") {
+ for(; i < Argv.size(); ++i)
+ PositionalArgs.push_back(Argv[i]);
+ break;
+ }
+ if (Arg[0] == '-') {
+ if (Arg.startswith("--"))
+ Arg = Argv[i] + 2;
+ else
+ Arg = Argv[i] + 1;
+ if (Arg == "M") {
+ MRI = true;
+ } else if (MatchFlagWithArg("format")) {
+ FormatType = StringSwitch<Format>(match)
+ .Case("default", Default)
+ .Case("gnu", GNU)
+ .Case("darwin", DARWIN)
+ .Case("bsd", BSD)
+ .Default(Unknown);
+ if (FormatType == Unknown)
+ fail(std::string("Invalid format ") + match);
+ } else if (MatchFlagWithArg("plugin")) {
+ // Ignored.
+ } else {
+ Options += Argv[i] + 1;
+ }
+ } else if (Options.empty()) {
+ Options += Argv[i];
+ } else {
+ PositionalArgs.push_back(Argv[i]);
+ }
+ }
ArchiveOperation Operation = parseCommandLine();
return performOperation(Operation, nullptr);
}
-static int ranlib_main() {
- if (RestOfArgs.size() != 1)
- fail(ToolName + " takes just one archive as an argument");
- ArchiveName = RestOfArgs[0];
+static int ranlib_main(int argc, char **argv) {
+ bool ArchiveSpecified = false;
+ for(int i = 1; i < argc; ++i) {
+ if (handleGenericOption(argv[i])) {
+ return 0;
+ } else {
+ if (ArchiveSpecified)
+ fail("Exactly one archive should be specified");
+ ArchiveSpecified = true;
+ ArchiveName = argv[i];
+ }
+ }
return performOperation(CreateSymTab, nullptr);
}
int main(int argc, char **argv) {
+ InitLLVM X(argc, argv);
ToolName = argv[0];
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
llvm::InitializeAllTargetInfos();
llvm::InitializeAllTargetMCs();
llvm::InitializeAllAsmParsers();
- StringRef Stem = sys::path::stem(ToolName);
- if (Stem.find("dlltool") != StringRef::npos)
+ Stem = sys::path::stem(ToolName);
+ if (Stem.contains_lower("dlltool"))
return dlltoolDriverMain(makeArrayRef(argv, argc));
- if (Stem.find("ranlib") == StringRef::npos &&
- Stem.find("lib") != StringRef::npos)
- return libDriverMain(makeArrayRef(argv, argc));
+ if (Stem.contains_lower("ranlib"))
+ return ranlib_main(argc, argv);
- for (int i = 1; i < argc; i++) {
- // If an argument starts with a dash and only contains chars
- // that belong to the options chars set, remove the dash.
- // We can't handle it after the command line options parsing
- // is done, since it will error out on an unrecognized string
- // starting with a dash.
- // Make sure this doesn't match the actual llvm-ar specific options
- // that start with a dash.
- StringRef S = argv[i];
- if (S.startswith("-") &&
- S.find_first_not_of(OptionChars, 1) == StringRef::npos) {
- argv[i]++;
- break;
- }
- if (S == "--")
- break;
- }
+ if (Stem.contains_lower("lib"))
+ return libDriverMain(makeArrayRef(argv, argc));
- // Have the command line options parsed and handle things
- // like --help and --version.
- cl::ParseCommandLineOptions(argc, argv,
- "LLVM Archiver (llvm-ar)\n\n"
- " This program archives bitcode files into single libraries\n"
- );
-
- if (Stem.find("ranlib") != StringRef::npos)
- return ranlib_main();
- if (Stem.find("ar") != StringRef::npos)
- return ar_main();
+ if (Stem.contains_lower("ar"))
+ return ar_main(argc, argv);
fail("Not ranlib, ar, lib or dlltool!");
}
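The rewritten ar_main above parses its options by hand; the MatchFlagWithArg lambda accepts both the "-format gnu" and "-format=gnu" spellings. A self-contained sketch of that matching trick in plain C++ (the names here are illustrative, not LLVM API):

// Returns true if Args[i] matches Flag, storing the flag's value in Value
// and advancing i when a separate value argument was consumed.
#include <string>
#include <vector>

static bool matchFlagWithArg(const std::vector<std::string> &Args, size_t &i,
                             const std::string &Flag, std::string &Value) {
  const std::string &Arg = Args[i];
  if (Arg == Flag) {                              // "-format gnu" style
    if (i + 1 >= Args.size())
      return false;                               // missing value; caller reports it
    Value = Args[++i];
    return true;
  }
  if (Arg.size() > Flag.size() && Arg.compare(0, Flag.size(), Flag) == 0 &&
      Arg[Flag.size()] == '=') {                  // "-format=gnu" style
    Value = Arg.substr(Flag.size() + 1);
    return true;
  }
  return false;
}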
diff --git a/contrib/llvm/tools/llvm-as/llvm-as.cpp b/contrib/llvm/tools/llvm-as/llvm-as.cpp
index 9f0f162b74f8..bb4233aa9ba0 100644
--- a/contrib/llvm/tools/llvm-as/llvm-as.cpp
+++ b/contrib/llvm/tools/llvm-as/llvm-as.cpp
@@ -19,12 +19,12 @@
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/SystemUtils.h"
#include "llvm/Support/ToolOutputFile.h"
@@ -59,7 +59,12 @@ static cl::opt<bool> PreserveBitcodeUseListOrder(
cl::desc("Preserve use-list order when writing LLVM bitcode."),
cl::init(true), cl::Hidden);
-static void WriteOutputFile(const Module *M) {
+static cl::opt<std::string> ClDataLayout("data-layout",
+ cl::desc("data layout string to use"),
+ cl::value_desc("layout-string"),
+ cl::init(""));
+
+static void WriteOutputFile(const Module *M, const ModuleSummaryIndex *Index) {
// Infer the output filename if needed.
if (OutputFilename.empty()) {
if (InputFilename == "-") {
@@ -79,30 +84,44 @@ static void WriteOutputFile(const Module *M) {
exit(1);
}
- if (Force || !CheckBitcodeOutputToConsole(Out->os(), true))
- WriteBitcodeToFile(M, Out->os(), PreserveBitcodeUseListOrder, nullptr,
- EmitModuleHash);
+ if (Force || !CheckBitcodeOutputToConsole(Out->os(), true)) {
+ const ModuleSummaryIndex *IndexToWrite = nullptr;
+ // Don't attempt to write a summary index unless it contains any entries.
+ // Otherwise we get an empty summary section.
+ if (Index && Index->begin() != Index->end())
+ IndexToWrite = Index;
+ if (!IndexToWrite || (M && (!M->empty() || !M->global_empty())))
+ // If we have a non-empty Module, then we write the Module plus
+ // any non-null Index along with it as a per-module Index.
+ // If both are empty, this will give an empty module block, which is
+ // the expected behavior.
+ WriteBitcodeToFile(*M, Out->os(), PreserveBitcodeUseListOrder,
+ IndexToWrite, EmitModuleHash);
+ else
+ // Otherwise, with an empty Module but non-empty Index, we write a
+ // combined index.
+ WriteIndexToFile(*IndexToWrite, Out->os());
+ }
// Declare success.
Out->keep();
}
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
+ InitLLVM X(argc, argv);
LLVMContext Context;
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
cl::ParseCommandLineOptions(argc, argv, "llvm .ll -> .bc assembler\n");
// Parse the file now...
SMDiagnostic Err;
- std::unique_ptr<Module> M =
- parseAssemblyFile(InputFilename, Err, Context, nullptr, !DisableVerify);
+ auto ModuleAndIndex = parseAssemblyFileWithIndex(
+ InputFilename, Err, Context, nullptr, !DisableVerify, ClDataLayout);
+ std::unique_ptr<Module> M = std::move(ModuleAndIndex.Mod);
if (!M.get()) {
Err.print(argv[0], errs());
return 1;
}
+ std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
if (!DisableVerify) {
std::string ErrorStr;
@@ -113,13 +132,17 @@ int main(int argc, char **argv) {
errs() << OS.str();
return 1;
}
+ // TODO: Implement and call summary index verifier.
}
- if (DumpAsm)
+ if (DumpAsm) {
errs() << "Here's the assembly:\n" << *M.get();
+ if (Index.get() && Index->begin() != Index->end())
+ Index->print(errs());
+ }
if (!DisableOutput)
- WriteOutputFile(M.get());
+ WriteOutputFile(M.get(), Index.get());
return 0;
}
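The new WriteOutputFile above chooses between emitting a module block (optionally carrying a per-module summary) and emitting a combined summary index. A condensed sketch of that decision, assuming the BitcodeWriter API at this LLVM revision and a non-null (possibly empty) module:

// Write either a module block with an attached per-module index, or a
// standalone combined index when the module itself is empty.
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/raw_ostream.h"

static void writeModuleOrIndex(const llvm::Module &M,
                               const llvm::ModuleSummaryIndex *Index,
                               llvm::raw_ostream &OS) {
  // Only write a summary section when the index actually has entries.
  const llvm::ModuleSummaryIndex *IndexToWrite =
      (Index && Index->begin() != Index->end()) ? Index : nullptr;
  if (!IndexToWrite || !M.empty() || !M.global_empty())
    llvm::WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false,
                             IndexToWrite, /*GenerateHash=*/false);
  else
    llvm::WriteIndexToFile(*IndexToWrite, OS);
}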
diff --git a/contrib/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/contrib/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index 7f20e136eefd..1939dc6440fe 100644
--- a/contrib/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/contrib/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -33,11 +33,11 @@
#include "llvm/Bitcode/LLVMBitCodes.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/SHA1.h"
-#include "llvm/Support/Signals.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -75,7 +75,9 @@ namespace {
/// CurStreamTypeType - A type for CurStreamType
enum CurStreamTypeType {
UnknownBitstream,
- LLVMIRBitstream
+ LLVMIRBitstream,
+ ClangSerializedASTBitstream,
+ ClangSerializedDiagnosticsBitstream,
};
}
@@ -306,6 +308,7 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
return nullptr;
STRINGIFY_CODE(FS, PERMODULE)
STRINGIFY_CODE(FS, PERMODULE_PROFILE)
+ STRINGIFY_CODE(FS, PERMODULE_RELBF)
STRINGIFY_CODE(FS, PERMODULE_GLOBALVAR_INIT_REFS)
STRINGIFY_CODE(FS, COMBINED)
STRINGIFY_CODE(FS, COMBINED_PROFILE)
@@ -314,6 +317,7 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
STRINGIFY_CODE(FS, COMBINED_ALIAS)
STRINGIFY_CODE(FS, COMBINED_ORIGINAL_NAME)
STRINGIFY_CODE(FS, VERSION)
+ STRINGIFY_CODE(FS, FLAGS)
STRINGIFY_CODE(FS, TYPE_TESTS)
STRINGIFY_CODE(FS, TYPE_TEST_ASSUME_VCALLS)
STRINGIFY_CODE(FS, TYPE_CHECKED_LOAD_VCALLS)
@@ -322,6 +326,7 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
STRINGIFY_CODE(FS, VALUE_GUID)
STRINGIFY_CODE(FS, CFI_FUNCTION_DEFS)
STRINGIFY_CODE(FS, CFI_FUNCTION_DECLS)
+ STRINGIFY_CODE(FS, TYPE_ID)
}
case bitc::METADATA_ATTACHMENT_ID:
switch(CodeID) {
@@ -442,7 +447,7 @@ static std::map<unsigned, PerBlockIDStats> BlockIDStats;
/// ReportError - All bitcode analysis errors go through this function, making this a
/// good place to breakpoint if debugging.
static bool ReportError(const Twine &Err) {
- errs() << Err << "\n";
+ WithColor::error() << Err << "\n";
return true;
}
@@ -597,7 +602,7 @@ static bool ParseBlock(BitstreamCursor &Stream, BitstreamBlockInfo &BlockInfo,
++BlockStats.NumRecords;
StringRef Blob;
- unsigned CurrentRecordPos = Stream.GetCurrentBitNo();
+ uint64_t CurrentRecordPos = Stream.GetCurrentBitNo();
unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob);
// Increment the # occurrences of this code.
@@ -694,7 +699,7 @@ static bool ParseBlock(BitstreamCursor &Stream, BitstreamBlockInfo &BlockInfo,
std::string Str;
bool ArrayIsPrintable = true;
for (unsigned j = i - 1, je = Record.size(); j != je; ++j) {
- if (!isprint(static_cast<unsigned char>(Record[j]))) {
+ if (!isPrint(static_cast<unsigned char>(Record[j]))) {
ArrayIsPrintable = false;
break;
}
@@ -714,7 +719,7 @@ static bool ParseBlock(BitstreamCursor &Stream, BitstreamBlockInfo &BlockInfo,
} else {
bool BlobIsPrintable = true;
for (unsigned i = 0, e = Blob.size(); i != e; ++i)
- if (!isprint(static_cast<unsigned char>(Blob[i]))) {
+ if (!isPrint(static_cast<unsigned char>(Blob[i]))) {
BlobIsPrintable = false;
break;
}
@@ -743,6 +748,35 @@ static void PrintSize(uint64_t Bits) {
(double)Bits/8, (unsigned long)(Bits/32));
}
+static CurStreamTypeType ReadSignature(BitstreamCursor &Stream) {
+ char Signature[6];
+ Signature[0] = Stream.Read(8);
+ Signature[1] = Stream.Read(8);
+
+ // Autodetect the file contents, if it is one we know.
+ if (Signature[0] == 'C' && Signature[1] == 'P') {
+ Signature[2] = Stream.Read(8);
+ Signature[3] = Stream.Read(8);
+ if (Signature[2] == 'C' && Signature[3] == 'H')
+ return ClangSerializedASTBitstream;
+ } else if (Signature[0] == 'D' && Signature[1] == 'I') {
+ Signature[2] = Stream.Read(8);
+ Signature[3] = Stream.Read(8);
+ if (Signature[2] == 'A' && Signature[3] == 'G')
+ return ClangSerializedDiagnosticsBitstream;
+ } else {
+ Signature[2] = Stream.Read(4);
+ Signature[3] = Stream.Read(4);
+ Signature[4] = Stream.Read(4);
+ Signature[5] = Stream.Read(4);
+ if (Signature[0] == 'B' && Signature[1] == 'C' &&
+ Signature[2] == 0x0 && Signature[3] == 0xC &&
+ Signature[4] == 0xE && Signature[5] == 0xD)
+ return LLVMIRBitstream;
+ }
+ return UnknownBitstream;
+}
+
static bool openBitcodeFile(StringRef Path,
std::unique_ptr<MemoryBuffer> &MemBuf,
BitstreamCursor &Stream,
@@ -786,22 +820,7 @@ static bool openBitcodeFile(StringRef Path,
}
Stream = BitstreamCursor(ArrayRef<uint8_t>(BufPtr, EndBufPtr));
-
- // Read the stream signature.
- char Signature[6];
- Signature[0] = Stream.Read(8);
- Signature[1] = Stream.Read(8);
- Signature[2] = Stream.Read(4);
- Signature[3] = Stream.Read(4);
- Signature[4] = Stream.Read(4);
- Signature[5] = Stream.Read(4);
-
- // Autodetect the file contents, if it is one we know.
- CurStreamType = UnknownBitstream;
- if (Signature[0] == 'B' && Signature[1] == 'C' &&
- Signature[2] == 0x0 && Signature[3] == 0xC &&
- Signature[4] == 0xE && Signature[5] == 0xD)
- CurStreamType = LLVMIRBitstream;
+ CurStreamType = ReadSignature(Stream);
return false;
}
@@ -870,8 +889,18 @@ static int AnalyzeBitcode() {
outs() << "\n";
outs() << " Stream type: ";
switch (CurStreamType) {
- case UnknownBitstream: outs() << "unknown\n"; break;
- case LLVMIRBitstream: outs() << "LLVM IR\n"; break;
+ case UnknownBitstream:
+ outs() << "unknown\n";
+ break;
+ case LLVMIRBitstream:
+ outs() << "LLVM IR\n";
+ break;
+ case ClangSerializedASTBitstream:
+ outs() << "Clang Serialized AST\n";
+ break;
+ case ClangSerializedDiagnosticsBitstream:
+ outs() << "Clang Serialized Diagnostics\n";
+ break;
}
outs() << " # Toplevel Blocks: " << NumTopBlocks << "\n";
outs() << "\n";
@@ -961,11 +990,7 @@ static int AnalyzeBitcode() {
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
cl::ParseCommandLineOptions(argc, argv, "llvm-bcanalyzer file analyzer\n");
-
return AnalyzeBitcode();
}
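The new ReadSignature above reads the trailing nibbles low-nibble first, so the raw bytes it accepts as LLVM IR are 'B' 'C' 0xC0 0xDE, alongside the 'CPCH' (Clang serialized AST) and 'DIAG' (Clang serialized diagnostics) magics. A plain-buffer sketch of the same detection, independent of BitstreamCursor:

#include <cstddef>
#include <cstdint>
#include <cstring>

enum class DetectedStream { Unknown, LLVMIR, ClangAST, ClangDiagnostics };

static DetectedStream detectBitstream(const uint8_t *Buf, size_t Size) {
  if (Size >= 4 && std::memcmp(Buf, "CPCH", 4) == 0)
    return DetectedStream::ClangAST;           // Clang serialized AST
  if (Size >= 4 && std::memcmp(Buf, "DIAG", 4) == 0)
    return DetectedStream::ClangDiagnostics;   // Clang serialized diagnostics
  if (Size >= 4 && Buf[0] == 'B' && Buf[1] == 'C' && Buf[2] == 0xC0 &&
      Buf[3] == 0xDE)
    return DetectedStream::LLVMIR;             // LLVM IR bitcode
  return DetectedStream::Unknown;
}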
diff --git a/contrib/llvm/tools/llvm-cov/CodeCoverage.cpp b/contrib/llvm/tools/llvm-cov/CodeCoverage.cpp
index c5ea50bff273..e93b63d388e0 100644
--- a/contrib/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/contrib/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -13,6 +13,7 @@
//
//===----------------------------------------------------------------------===//
+#include "CoverageExporterJson.h"
#include "CoverageFilters.h"
#include "CoverageReport.h"
#include "CoverageSummaryInfo.h"
@@ -32,8 +33,8 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/Threading.h"
#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/ToolOutputFile.h"
#include <functional>
@@ -48,83 +49,84 @@ void exportCoverageDataToJson(const coverage::CoverageMapping &CoverageMapping,
raw_ostream &OS);
namespace {
-/// \brief The implementation of the coverage tool.
+/// The implementation of the coverage tool.
class CodeCoverageTool {
public:
enum Command {
- /// \brief The show command.
+ /// The show command.
Show,
- /// \brief The report command.
+ /// The report command.
Report,
- /// \brief The export command.
+ /// The export command.
Export
};
int run(Command Cmd, int argc, const char **argv);
private:
- /// \brief Print the error message to the error output stream.
+ /// Print the error message to the error output stream.
void error(const Twine &Message, StringRef Whence = "");
- /// \brief Print the warning message to the error output stream.
+ /// Print the warning message to the error output stream.
void warning(const Twine &Message, StringRef Whence = "");
- /// \brief Convert \p Path into an absolute path and append it to the list
+ /// Convert \p Path into an absolute path and append it to the list
/// of collected paths.
void addCollectedPath(const std::string &Path);
- /// \brief If \p Path is a regular file, collect the path. If it's a
+ /// If \p Path is a regular file, collect the path. If it's a
/// directory, recursively collect all of the paths within the directory.
void collectPaths(const std::string &Path);
- /// \brief Return a memory buffer for the given source file.
+ /// Return a memory buffer for the given source file.
ErrorOr<const MemoryBuffer &> getSourceFile(StringRef SourceFile);
- /// \brief Create source views for the expansions of the view.
+ /// Create source views for the expansions of the view.
void attachExpansionSubViews(SourceCoverageView &View,
ArrayRef<ExpansionRecord> Expansions,
const CoverageMapping &Coverage);
- /// \brief Create the source view of a particular function.
+ /// Create the source view of a particular function.
std::unique_ptr<SourceCoverageView>
createFunctionView(const FunctionRecord &Function,
const CoverageMapping &Coverage);
- /// \brief Create the main source view of a particular source file.
+ /// Create the main source view of a particular source file.
std::unique_ptr<SourceCoverageView>
createSourceFileView(StringRef SourceFile, const CoverageMapping &Coverage);
- /// \brief Load the coverage mapping data. Return nullptr if an error occurred.
+ /// Load the coverage mapping data. Return nullptr if an error occurred.
std::unique_ptr<CoverageMapping> load();
- /// \brief Create a mapping from files in the Coverage data to local copies
+ /// Create a mapping from files in the Coverage data to local copies
/// (path-equivalence).
void remapPathNames(const CoverageMapping &Coverage);
- /// \brief Remove input source files which aren't mapped by \p Coverage.
+ /// Remove input source files which aren't mapped by \p Coverage.
void removeUnmappedInputs(const CoverageMapping &Coverage);
- /// \brief If a demangler is available, demangle all symbol names.
+ /// If a demangler is available, demangle all symbol names.
void demangleSymbols(const CoverageMapping &Coverage);
- /// \brief Write out a source file view to the filesystem.
+ /// Write out a source file view to the filesystem.
void writeSourceFileView(StringRef SourceFile, CoverageMapping *Coverage,
CoveragePrinter *Printer, bool ShowFilenames);
typedef llvm::function_ref<int(int, const char **)> CommandLineParserType;
- int show(int argc, const char **argv,
- CommandLineParserType commandLineParser);
-
- int report(int argc, const char **argv,
+ int doShow(int argc, const char **argv,
CommandLineParserType commandLineParser);
- int export_(int argc, const char **argv,
- CommandLineParserType commandLineParser);
+ int doReport(int argc, const char **argv,
+ CommandLineParserType commandLineParser);
+
+ int doExport(int argc, const char **argv,
+ CommandLineParserType commandLineParser);
std::vector<StringRef> ObjectFilenames;
CoverageViewOptions ViewOpts;
CoverageFiltersMatchAll Filters;
+ CoverageFilters IgnoreFilenameFilters;
/// The path to the indexed profile.
std::string PGOFilename;
@@ -188,7 +190,8 @@ void CodeCoverageTool::addCollectedPath(const std::string &Path) {
return;
}
sys::path::remove_dots(EffectivePath, /*remove_dot_dots=*/true);
- SourceFiles.emplace_back(EffectivePath.str());
+ if (!IgnoreFilenameFilters.matchesFilename(EffectivePath))
+ SourceFiles.emplace_back(EffectivePath.str());
}
void CodeCoverageTool::collectPaths(const std::string &Path) {
@@ -198,7 +201,7 @@ void CodeCoverageTool::collectPaths(const std::string &Path) {
if (PathRemapping)
addCollectedPath(Path);
else
- error("Missing source file", Path);
+ warning("Source file doesn't exist, proceeded by ignoring it.", Path);
return;
}
@@ -210,12 +213,16 @@ void CodeCoverageTool::collectPaths(const std::string &Path) {
if (llvm::sys::fs::is_directory(Status)) {
std::error_code EC;
for (llvm::sys::fs::recursive_directory_iterator F(Path, EC), E;
- F != E && !EC; F.increment(EC)) {
+ F != E; F.increment(EC)) {
+
+ if (EC) {
+ warning(EC.message(), F->path());
+ continue;
+ }
+
if (llvm::sys::fs::is_regular_file(F->path()))
addCollectedPath(F->path());
}
- if (EC)
- warning(EC.message(), Path);
}
}
@@ -471,14 +478,13 @@ void CodeCoverageTool::demangleSymbols(const CoverageMapping &Coverage) {
OutputTOF.os().close();
// Invoke the demangler.
- std::vector<const char *> ArgsV;
- for (const std::string &Arg : ViewOpts.DemanglerOpts)
- ArgsV.push_back(Arg.c_str());
- ArgsV.push_back(nullptr);
+ std::vector<StringRef> ArgsV;
+ for (StringRef Arg : ViewOpts.DemanglerOpts)
+ ArgsV.push_back(Arg);
Optional<StringRef> Redirects[] = {InputPath.str(), OutputPath.str(), {""}};
std::string ErrMsg;
- int RC = sys::ExecuteAndWait(ViewOpts.DemanglerOpts[0], ArgsV.data(),
- /*env=*/nullptr, Redirects, /*secondsToWait=*/0,
+ int RC = sys::ExecuteAndWait(ViewOpts.DemanglerOpts[0], ArgsV,
+ /*env=*/None, Redirects, /*secondsToWait=*/0,
/*memoryLimit=*/0, &ErrMsg);
if (RC) {
error(ErrMsg, ViewOpts.DemanglerOpts[0]);
@@ -592,6 +598,12 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
"regular expression"),
cl::ZeroOrMore, cl::cat(FilteringCategory));
+ cl::list<std::string> IgnoreFilenameRegexFilters(
+ "ignore-filename-regex", cl::Optional,
+ cl::desc("Skip source code files with file paths that match the given "
+ "regular expression"),
+ cl::ZeroOrMore, cl::cat(FilteringCategory));
+
cl::opt<double> RegionCoverageLtFilter(
"region-coverage-lt", cl::Optional,
cl::desc("Show code coverage only for functions with region coverage "
@@ -636,6 +648,12 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
"summary-only", cl::Optional,
cl::desc("Export only summary information for each source file"));
+ cl::opt<unsigned> NumThreads(
+ "num-threads", cl::init(0),
+ cl::desc("Number of merge threads to use (default: autodetect)"));
+ cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"),
+ cl::aliasopt(NumThreads));
+
auto commandLineParser = [&, this](int argc, const char **argv) -> int {
cl::ParseCommandLineOptions(argc, argv, "LLVM code coverage tool\n");
ViewOpts.Debug = DebugDump;
@@ -703,6 +721,7 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
llvm::make_unique<NameRegexCoverageFilter>(Regex));
Filters.push_back(std::move(NameFilterer));
}
+
if (RegionCoverageLtFilter.getNumOccurrences() ||
RegionCoverageGtFilter.getNumOccurrences() ||
LineCoverageLtFilter.getNumOccurrences() ||
@@ -723,6 +742,11 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
Filters.push_back(std::move(StatFilterer));
}
+ // Create the ignore filename filters.
+ for (const auto &RE : IgnoreFilenameRegexFilters)
+ IgnoreFilenameFilters.push_back(
+ llvm::make_unique<NameRegexCoverageFilter>(RE));
+
if (!Arches.empty()) {
for (const std::string &Arch : Arches) {
if (Triple(Arch).getArch() == llvm::Triple::ArchType::UnknownArch) {
@@ -737,6 +761,7 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
}
}
+ // IgnoreFilenameFilters are applied even when InputSourceFiles specified.
for (const std::string &File : InputSourceFiles)
collectPaths(File);
@@ -749,23 +774,24 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
ViewOpts.ShowRegionSummary = RegionSummary;
ViewOpts.ShowInstantiationSummary = InstantiationSummary;
ViewOpts.ExportSummaryOnly = SummaryOnly;
+ ViewOpts.NumThreads = NumThreads;
return 0;
};
switch (Cmd) {
case Show:
- return show(argc, argv, commandLineParser);
+ return doShow(argc, argv, commandLineParser);
case Report:
- return report(argc, argv, commandLineParser);
+ return doReport(argc, argv, commandLineParser);
case Export:
- return export_(argc, argv, commandLineParser);
+ return doExport(argc, argv, commandLineParser);
}
return 0;
}
-int CodeCoverageTool::show(int argc, const char **argv,
- CommandLineParserType commandLineParser) {
+int CodeCoverageTool::doShow(int argc, const char **argv,
+ CommandLineParserType commandLineParser) {
cl::OptionCategory ViewCategory("Viewing options");
@@ -808,12 +834,6 @@ int CodeCoverageTool::show(int argc, const char **argv,
"project-title", cl::Optional,
cl::desc("Set project title for the coverage report"));
- cl::opt<unsigned> NumThreads(
- "num-threads", cl::init(0),
- cl::desc("Number of merge threads to use (default: autodetect)"));
- cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"),
- cl::aliasopt(NumThreads));
-
auto Err = commandLineParser(argc, argv);
if (Err)
return Err;
@@ -856,8 +876,10 @@ int CodeCoverageTool::show(int argc, const char **argv,
if (SourceFiles.empty())
// Get the source files from the function coverage mapping.
- for (StringRef Filename : Coverage->getUniqueSourceFiles())
- SourceFiles.push_back(Filename);
+ for (StringRef Filename : Coverage->getUniqueSourceFiles()) {
+ if (!IgnoreFilenameFilters.matchesFilename(Filename))
+ SourceFiles.push_back(Filename);
+ }
// Create an index out of the source files.
if (ViewOpts.hasOutputDirectory()) {
@@ -910,6 +932,8 @@ int CodeCoverageTool::show(int argc, const char **argv,
(SourceFiles.size() != 1) || ViewOpts.hasOutputDirectory() ||
(ViewOpts.Format == CoverageViewOptions::OutputFormat::HTML);
+ auto NumThreads = ViewOpts.NumThreads;
+
// If NumThreads is not specified, auto-detect a good default.
if (NumThreads == 0)
NumThreads =
@@ -932,8 +956,8 @@ int CodeCoverageTool::show(int argc, const char **argv,
return 0;
}
-int CodeCoverageTool::report(int argc, const char **argv,
- CommandLineParserType commandLineParser) {
+int CodeCoverageTool::doReport(int argc, const char **argv,
+ CommandLineParserType commandLineParser) {
cl::opt<bool> ShowFunctionSummaries(
"show-functions", cl::Optional, cl::init(false),
cl::desc("Show coverage summaries for each function"));
@@ -954,7 +978,7 @@ int CodeCoverageTool::report(int argc, const char **argv,
CoverageReport Report(ViewOpts, *Coverage.get());
if (!ShowFunctionSummaries) {
if (SourceFiles.empty())
- Report.renderFileReports(llvm::outs());
+ Report.renderFileReports(llvm::outs(), IgnoreFilenameFilters);
else
Report.renderFileReports(llvm::outs(), SourceFiles);
} else {
@@ -969,8 +993,8 @@ int CodeCoverageTool::report(int argc, const char **argv,
return 0;
}
-int CodeCoverageTool::export_(int argc, const char **argv,
- CommandLineParserType commandLineParser) {
+int CodeCoverageTool::doExport(int argc, const char **argv,
+ CommandLineParserType commandLineParser) {
auto Err = commandLineParser(argc, argv);
if (Err)
@@ -987,7 +1011,12 @@ int CodeCoverageTool::export_(int argc, const char **argv,
return 1;
}
- exportCoverageDataToJson(*Coverage.get(), ViewOpts, outs());
+ auto Exporter = CoverageExporterJson(*Coverage.get(), ViewOpts, outs());
+
+ if (SourceFiles.empty())
+ Exporter.renderRoot(IgnoreFilenameFilters);
+ else
+ Exporter.renderRoot(SourceFiles);
return 0;
}
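The new -ignore-filename-regex option above drops matching paths both when collecting inputs and when enumerating the coverage mapping's unique source files. As a rough standalone sketch of the per-path check (the real implementation reuses NameRegexCoverageFilter from CoverageFilters.h; this version only illustrates the idea and rebuilds the Regex on every call):

// Returns true if Filename matches any ignore pattern, i.e. the file should
// be excluded from show/report/export output.
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Regex.h"
#include <string>
#include <vector>

static bool isIgnoredFilename(llvm::StringRef Filename,
                              const std::vector<std::string> &IgnorePatterns) {
  for (const auto &Pattern : IgnorePatterns)
    if (llvm::Regex(Pattern).match(Filename))
      return true;
  return false;
}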
diff --git a/contrib/llvm/tools/llvm-cov/CoverageExporter.h b/contrib/llvm/tools/llvm-cov/CoverageExporter.h
new file mode 100644
index 000000000000..884fba96d618
--- /dev/null
+++ b/contrib/llvm/tools/llvm-cov/CoverageExporter.h
@@ -0,0 +1,52 @@
+//===- CoverageExporter.h - Code coverage exporter ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class defines a code coverage exporter interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGEEXPORTER_H
+#define LLVM_COV_COVERAGEEXPORTER_H
+
+#include "CoverageFilters.h"
+#include "CoverageSummaryInfo.h"
+#include "CoverageViewOptions.h"
+#include "llvm/ProfileData/Coverage/CoverageMapping.h"
+
+namespace llvm {
+
+/// Exports the code coverage information.
+class CoverageExporter {
+protected:
+ /// The full CoverageMapping object to export.
+ const coverage::CoverageMapping &Coverage;
+
+ /// The options passed to the tool.
+ const CoverageViewOptions &Options;
+
+ /// Output stream to print JSON to.
+ raw_ostream &OS;
+
+ CoverageExporter(const coverage::CoverageMapping &CoverageMapping,
+ const CoverageViewOptions &Options, raw_ostream &OS)
+ : Coverage(CoverageMapping), Options(Options), OS(OS) {}
+
+public:
+ virtual ~CoverageExporter(){};
+
+ /// Render the CoverageMapping object.
+ virtual void renderRoot(const CoverageFilters &IgnoreFilenameFilters) = 0;
+
+ /// Render the CoverageMapping object for specified source files.
+ virtual void renderRoot(const std::vector<std::string> &SourceFiles) = 0;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_COV_COVERAGEEXPORTER_H
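The new CoverageExporter base class above leaves both renderRoot overloads pure virtual. A hedged sketch of how a concrete exporter would plug in (CoverageExporterText here is hypothetical; in this import the only real subclass is CoverageExporterJson, shown next):

#include "CoverageExporter.h"
#include "llvm/ADT/StringRef.h"
#include <string>
#include <vector>

namespace llvm {

class CoverageExporterText : public CoverageExporter {
public:
  CoverageExporterText(const coverage::CoverageMapping &CoverageMapping,
                       const CoverageViewOptions &Options, raw_ostream &OS)
      : CoverageExporter(CoverageMapping, Options, OS) {}

  void renderRoot(const CoverageFilters &IgnoreFilenameFilters) override {
    // Reduce to the filename-list overload, mirroring the JSON exporter.
    std::vector<std::string> SourceFiles;
    for (StringRef SF : Coverage.getUniqueSourceFiles())
      if (!IgnoreFilenameFilters.matchesFilename(SF))
        SourceFiles.emplace_back(SF);
    renderRoot(SourceFiles);
  }

  void renderRoot(const std::vector<std::string> &SourceFiles) override {
    for (const auto &SF : SourceFiles)
      OS << SF << "\n"; // placeholder: one line per covered source file
  }
};

} // end namespace llvm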
diff --git a/contrib/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/contrib/llvm/tools/llvm-cov/CoverageExporterJson.cpp
index 7b700908968d..56c3a0003b02 100644
--- a/contrib/llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ b/contrib/llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -41,394 +41,346 @@
//
//===----------------------------------------------------------------------===//
+#include "CoverageExporterJson.h"
#include "CoverageReport.h"
-#include "CoverageSummaryInfo.h"
-#include "CoverageViewOptions.h"
-#include "llvm/ProfileData/Coverage/CoverageMapping.h"
-#include <stack>
-/// \brief The semantic version combined as a string.
+/// The semantic version combined as a string.
#define LLVM_COVERAGE_EXPORT_JSON_STR "2.0.0"
-/// \brief Unique type identifier for JSON coverage export.
+/// Unique type identifier for JSON coverage export.
#define LLVM_COVERAGE_EXPORT_JSON_TYPE_STR "llvm.coverage.json.export"
using namespace llvm;
-using namespace coverage;
-class CoverageExporterJson {
- const CoverageViewOptions &Options;
-
- /// \brief Output stream to print JSON to.
- raw_ostream &OS;
-
- /// \brief The full CoverageMapping object to export.
- const CoverageMapping &Coverage;
-
- /// \brief States that the JSON rendering machine can be in.
- enum JsonState { None, NonEmptyElement, EmptyElement };
-
- /// \brief Tracks state of the JSON output.
- std::stack<JsonState> State;
-
- /// \brief Emit a serialized scalar.
- void emitSerialized(const int64_t Value) { OS << Value; }
-
- /// \brief Emit a serialized string.
- void emitSerialized(const std::string &Value) {
- OS << "\"";
- for (char C : Value) {
- if (C != '\\')
- OS << C;
- else
- OS << "\\\\";
- }
- OS << "\"";
- }
-
- /// \brief Emit a comma if there is a previous element to delimit.
- void emitComma() {
- if (State.top() == JsonState::NonEmptyElement) {
- OS << ",";
- } else if (State.top() == JsonState::EmptyElement) {
- State.pop();
- assert((State.size() >= 1) && "Closed too many JSON elements");
- State.push(JsonState::NonEmptyElement);
- }
- }
-
- /// \brief Emit a starting dictionary/object character.
- void emitDictStart() {
- emitComma();
- State.push(JsonState::EmptyElement);
- OS << "{";
- }
-
- /// \brief Emit a dictionary/object key but no value.
- void emitDictKey(const std::string &Key) {
- emitComma();
- emitSerialized(Key);
- OS << ":";
- State.pop();
- assert((State.size() >= 1) && "Closed too many JSON elements");
+CoverageExporterJson::CoverageExporterJson(
+ const coverage::CoverageMapping &CoverageMapping,
+ const CoverageViewOptions &Options, raw_ostream &OS)
+ : CoverageExporter(CoverageMapping, Options, OS) {
+ State.push(JsonState::None);
+}
- // We do not want to emit a comma after this key.
- State.push(JsonState::EmptyElement);
- }
+void CoverageExporterJson::emitSerialized(const int64_t Value) { OS << Value; }
- /// \brief Emit a dictionary/object key/value pair.
- template <typename V>
- void emitDictElement(const std::string &Key, const V &Value) {
- emitComma();
- emitSerialized(Key);
- OS << ":";
- emitSerialized(Value);
+void CoverageExporterJson::emitSerialized(const std::string &Value) {
+ OS << "\"";
+ for (char C : Value) {
+ if (C != '\\')
+ OS << C;
+ else
+ OS << "\\\\";
}
+ OS << "\"";
+}
- /// \brief Emit a closing dictionary/object character.
- void emitDictEnd() {
+void CoverageExporterJson::emitComma() {
+ if (State.top() == JsonState::NonEmptyElement) {
+ OS << ",";
+ } else if (State.top() == JsonState::EmptyElement) {
State.pop();
assert((State.size() >= 1) && "Closed too many JSON elements");
- OS << "}";
- }
-
- /// \brief Emit a starting array character.
- void emitArrayStart() {
- emitComma();
- State.push(JsonState::EmptyElement);
- OS << "[";
- }
-
- /// \brief Emit an array element.
- template <typename V> void emitArrayElement(const V &Value) {
- emitComma();
- emitSerialized(Value);
+ State.push(JsonState::NonEmptyElement);
}
+}
- /// \brief emit a closing array character.
- void emitArrayEnd() {
- State.pop();
- assert((State.size() >= 1) && "Closed too many JSON elements");
- OS << "]";
- }
+void CoverageExporterJson::emitDictStart() {
+ emitComma();
+ State.push(JsonState::EmptyElement);
+ OS << "{";
+}
- /// \brief Render the CoverageMapping object.
- void renderRoot() {
- // Start Root of JSON object.
- emitDictStart();
+void CoverageExporterJson::emitDictKey(const std::string &Key) {
+ emitComma();
+ emitSerialized(Key);
+ OS << ":";
+ State.pop();
+ assert((State.size() >= 1) && "Closed too many JSON elements");
- emitDictElement("version", LLVM_COVERAGE_EXPORT_JSON_STR);
- emitDictElement("type", LLVM_COVERAGE_EXPORT_JSON_TYPE_STR);
- emitDictKey("data");
+ // We do not want to emit a comma after this key.
+ State.push(JsonState::EmptyElement);
+}
- // Start List of Exports.
- emitArrayStart();
+void CoverageExporterJson::emitDictEnd() {
+ State.pop();
+ assert((State.size() >= 1) && "Closed too many JSON elements");
+ OS << "}";
+}
- // Start Export.
- emitDictStart();
+void CoverageExporterJson::emitArrayStart() {
+ emitComma();
+ State.push(JsonState::EmptyElement);
+ OS << "[";
+}
- emitDictKey("files");
+void CoverageExporterJson::emitArrayEnd() {
+ State.pop();
+ assert((State.size() >= 1) && "Closed too many JSON elements");
+ OS << "]";
+}
- FileCoverageSummary Totals = FileCoverageSummary("Totals");
- std::vector<std::string> SourceFiles;
- for (StringRef SF : Coverage.getUniqueSourceFiles())
+void CoverageExporterJson::renderRoot(
+ const CoverageFilters &IgnoreFilenameFilters) {
+ std::vector<std::string> SourceFiles;
+ for (StringRef SF : Coverage.getUniqueSourceFiles()) {
+ if (!IgnoreFilenameFilters.matchesFilename(SF))
SourceFiles.emplace_back(SF);
- auto FileReports = CoverageReport::prepareFileReports(Coverage, Totals,
- SourceFiles, Options);
- renderFiles(SourceFiles, FileReports);
-
- // Skip functions-level information for summary-only export mode.
- if (!Options.ExportSummaryOnly) {
- emitDictKey("functions");
- renderFunctions(Coverage.getCoveredFunctions());
- }
-
- emitDictKey("totals");
- renderSummary(Totals);
-
- // End Export.
- emitDictEnd();
-
- // End List of Exports.
- emitArrayEnd();
-
- // End Root of JSON Object.
- emitDictEnd();
-
- assert((State.top() == JsonState::None) &&
- "All Elements In JSON were Closed");
}
+ renderRoot(SourceFiles);
+}
- /// \brief Render an array of all the given functions.
- void
- renderFunctions(const iterator_range<FunctionRecordIterator> &Functions) {
- // Start List of Functions.
- emitArrayStart();
-
- for (const auto &Function : Functions) {
- // Start Function.
- emitDictStart();
-
- emitDictElement("name", Function.Name);
- emitDictElement("count", Function.ExecutionCount);
- emitDictKey("regions");
-
- renderRegions(Function.CountedRegions);
+void CoverageExporterJson::renderRoot(
+ const std::vector<std::string> &SourceFiles) {
+ // Start Root of JSON object.
+ emitDictStart();
- emitDictKey("filenames");
+ emitDictElement("version", LLVM_COVERAGE_EXPORT_JSON_STR);
+ emitDictElement("type", LLVM_COVERAGE_EXPORT_JSON_TYPE_STR);
+ emitDictKey("data");
- // Start Filenames for Function.
- emitArrayStart();
+ // Start List of Exports.
+ emitArrayStart();
- for (const auto &FileName : Function.Filenames)
- emitArrayElement(FileName);
+ // Start Export.
+ emitDictStart();
- // End Filenames for Function.
- emitArrayEnd();
+ emitDictKey("files");
- // End Function.
- emitDictEnd();
- }
+ FileCoverageSummary Totals = FileCoverageSummary("Totals");
+ auto FileReports = CoverageReport::prepareFileReports(Coverage, Totals,
+ SourceFiles, Options);
+ renderFiles(SourceFiles, FileReports);
- // End List of Functions.
- emitArrayEnd();
+ // Skip functions-level information for summary-only export mode.
+ if (!Options.ExportSummaryOnly) {
+ emitDictKey("functions");
+ renderFunctions(Coverage.getCoveredFunctions());
}
- /// \brief Render an array of all the source files, also pass back a Summary.
- void renderFiles(ArrayRef<std::string> SourceFiles,
- ArrayRef<FileCoverageSummary> FileReports) {
- // Start List of Files.
- emitArrayStart();
-
- for (unsigned I = 0, E = SourceFiles.size(); I < E; ++I) {
- // Render the file.
- auto FileCoverage = Coverage.getCoverageForFile(SourceFiles[I]);
- renderFile(FileCoverage, FileReports[I]);
- }
+ emitDictKey("totals");
+ renderSummary(Totals);
- // End List of Files.
- emitArrayEnd();
- }
+ // End Export.
+ emitDictEnd();
- /// \brief Render a single file.
- void renderFile(const CoverageData &FileCoverage,
- const FileCoverageSummary &FileReport) {
- // Start File.
- emitDictStart();
+ // End List of Exports.
+ emitArrayEnd();
- emitDictElement("filename", FileCoverage.getFilename());
+ // End Root of JSON Object.
+ emitDictEnd();
- // Skip segments and expansions for summary-only export mode.
- if (!Options.ExportSummaryOnly) {
- emitDictKey("segments");
+ assert((State.top() == JsonState::None) &&
+ "All Elements In JSON were Closed");
+}
- // Start List of Segments.
- emitArrayStart();
+void CoverageExporterJson::renderFunctions(
+ const iterator_range<coverage::FunctionRecordIterator> &Functions) {
+ // Start List of Functions.
+ emitArrayStart();
- for (const auto &Segment : FileCoverage)
- renderSegment(Segment);
+ for (const auto &Function : Functions) {
+ // Start Function.
+ emitDictStart();
- // End List of Segments.
- emitArrayEnd();
+ emitDictElement("name", Function.Name);
+ emitDictElement("count", Function.ExecutionCount);
+ emitDictKey("regions");
- emitDictKey("expansions");
+ renderRegions(Function.CountedRegions);
- // Start List of Expansions.
- emitArrayStart();
+ emitDictKey("filenames");
- for (const auto &Expansion : FileCoverage.getExpansions())
- renderExpansion(Expansion);
+ // Start Filenames for Function.
+ emitArrayStart();
- // End List of Expansions.
- emitArrayEnd();
- }
+ for (const auto &FileName : Function.Filenames)
+ emitArrayElement(FileName);
- emitDictKey("summary");
- renderSummary(FileReport);
+ // End Filenames for Function.
+ emitArrayEnd();
- // End File.
+ // End Function.
emitDictEnd();
}
- /// \brief Render a CoverageSegment.
- void renderSegment(const CoverageSegment &Segment) {
- // Start Segment.
- emitArrayStart();
+ // End List of Functions.
+ emitArrayEnd();
+}
- emitArrayElement(Segment.Line);
- emitArrayElement(Segment.Col);
- emitArrayElement(Segment.Count);
- emitArrayElement(Segment.HasCount);
- emitArrayElement(Segment.IsRegionEntry);
+void CoverageExporterJson::renderFiles(
+ ArrayRef<std::string> SourceFiles,
+ ArrayRef<FileCoverageSummary> FileReports) {
+ // Start List of Files.
+ emitArrayStart();
- // End Segment.
- emitArrayEnd();
+ for (unsigned I = 0, E = SourceFiles.size(); I < E; ++I) {
+ renderFile(SourceFiles[I], FileReports[I]);
}
- /// \brief Render an ExpansionRecord.
- void renderExpansion(const ExpansionRecord &Expansion) {
- // Start Expansion.
- emitDictStart();
-
- // Mark the beginning and end of this expansion in the source file.
- emitDictKey("source_region");
- renderRegion(Expansion.Region);
+ // End List of Files.
+ emitArrayEnd();
+}
- // Enumerate the coverage information for the expansion.
- emitDictKey("target_regions");
- renderRegions(Expansion.Function.CountedRegions);
+void CoverageExporterJson::renderFile(const std::string &Filename,
+ const FileCoverageSummary &FileReport) {
+ // Start File.
+ emitDictStart();
- emitDictKey("filenames");
- // Start List of Filenames to map the fileIDs.
- emitArrayStart();
- for (const auto &Filename : Expansion.Function.Filenames)
- emitArrayElement(Filename);
- // End List of Filenames.
- emitArrayEnd();
+ emitDictElement("filename", Filename);
- // End Expansion.
- emitDictEnd();
+ if (!Options.ExportSummaryOnly) {
+ // Calculate and render detailed coverage information for the given file.
+ auto FileCoverage = Coverage.getCoverageForFile(Filename);
+ renderFileCoverage(FileCoverage, FileReport);
}
- /// \brief Render a list of CountedRegions.
- void renderRegions(ArrayRef<CountedRegion> Regions) {
- // Start List of Regions.
- emitArrayStart();
+ emitDictKey("summary");
+ renderSummary(FileReport);
- for (const auto &Region : Regions)
- renderRegion(Region);
+ // End File.
+ emitDictEnd();
+}
- // End List of Regions.
- emitArrayEnd();
- }
- /// \brief Render a single CountedRegion.
- void renderRegion(const CountedRegion &Region) {
- // Start CountedRegion.
- emitArrayStart();
+void CoverageExporterJson::renderFileCoverage(
+ const coverage::CoverageData &FileCoverage,
+ const FileCoverageSummary &FileReport) {
+ emitDictKey("segments");
- emitArrayElement(Region.LineStart);
- emitArrayElement(Region.ColumnStart);
- emitArrayElement(Region.LineEnd);
- emitArrayElement(Region.ColumnEnd);
- emitArrayElement(Region.ExecutionCount);
- emitArrayElement(Region.FileID);
- emitArrayElement(Region.ExpandedFileID);
- emitArrayElement(Region.Kind);
+ // Start List of Segments.
+ emitArrayStart();
- // End CountedRegion.
- emitArrayEnd();
- }
+ for (const auto &Segment : FileCoverage)
+ renderSegment(Segment);
- /// \brief Render a FileCoverageSummary.
- void renderSummary(const FileCoverageSummary &Summary) {
- // Start Summary for the file.
- emitDictStart();
+ // End List of Segments.
+ emitArrayEnd();
- emitDictKey("lines");
+ emitDictKey("expansions");
- // Start Line Coverage Summary.
- emitDictStart();
- emitDictElement("count", Summary.LineCoverage.getNumLines());
- emitDictElement("covered", Summary.LineCoverage.getCovered());
- emitDictElement("percent", Summary.LineCoverage.getPercentCovered());
- // End Line Coverage Summary.
- emitDictEnd();
+ // Start List of Expansions.
+ emitArrayStart();
- emitDictKey("functions");
+ for (const auto &Expansion : FileCoverage.getExpansions())
+ renderExpansion(Expansion);
- // Start Function Coverage Summary.
- emitDictStart();
- emitDictElement("count", Summary.FunctionCoverage.getNumFunctions());
- emitDictElement("covered", Summary.FunctionCoverage.getExecuted());
- emitDictElement("percent", Summary.FunctionCoverage.getPercentCovered());
- // End Function Coverage Summary.
- emitDictEnd();
+ // End List of Expansions.
+ emitArrayEnd();
+}
- emitDictKey("instantiations");
+void CoverageExporterJson::renderSegment(
+ const coverage::CoverageSegment &Segment) {
+ // Start Segment.
+ emitArrayStart();
- // Start Instantiation Coverage Summary.
- emitDictStart();
- emitDictElement("count", Summary.InstantiationCoverage.getNumFunctions());
- emitDictElement("covered", Summary.InstantiationCoverage.getExecuted());
- emitDictElement("percent",
- Summary.InstantiationCoverage.getPercentCovered());
- // End Function Coverage Summary.
- emitDictEnd();
+ emitArrayElement(Segment.Line);
+ emitArrayElement(Segment.Col);
+ emitArrayElement(Segment.Count);
+ emitArrayElement(Segment.HasCount);
+ emitArrayElement(Segment.IsRegionEntry);
- emitDictKey("regions");
+ // End Segment.
+ emitArrayEnd();
+}
- // Start Region Coverage Summary.
- emitDictStart();
- emitDictElement("count", Summary.RegionCoverage.getNumRegions());
- emitDictElement("covered", Summary.RegionCoverage.getCovered());
- emitDictElement("notcovered",
- Summary.RegionCoverage.getNumRegions() -
- Summary.RegionCoverage.getCovered());
- emitDictElement("percent", Summary.RegionCoverage.getPercentCovered());
- // End Region Coverage Summary.
- emitDictEnd();
+void CoverageExporterJson::renderExpansion(
+ const coverage::ExpansionRecord &Expansion) {
+ // Start Expansion.
+ emitDictStart();
+
+ // Mark the beginning and end of this expansion in the source file.
+ emitDictKey("source_region");
+ renderRegion(Expansion.Region);
+
+ // Enumerate the coverage information for the expansion.
+ emitDictKey("target_regions");
+ renderRegions(Expansion.Function.CountedRegions);
+
+ emitDictKey("filenames");
+ // Start List of Filenames to map the fileIDs.
+ emitArrayStart();
+ for (const auto &Filename : Expansion.Function.Filenames)
+ emitArrayElement(Filename);
+ // End List of Filenames.
+ emitArrayEnd();
+
+ // End Expansion.
+ emitDictEnd();
+}
- // End Summary for the file.
- emitDictEnd();
- }
+void CoverageExporterJson::renderRegions(
+ ArrayRef<coverage::CountedRegion> Regions) {
+ // Start List of Regions.
+ emitArrayStart();
-public:
- CoverageExporterJson(const CoverageMapping &CoverageMapping,
- const CoverageViewOptions &Options, raw_ostream &OS)
- : Options(Options), OS(OS), Coverage(CoverageMapping) {
- State.push(JsonState::None);
- }
+ for (const auto &Region : Regions)
+ renderRegion(Region);
- /// \brief Print the CoverageMapping.
- void print() { renderRoot(); }
-};
+ // End List of Regions.
+ emitArrayEnd();
+}
-/// \brief Export the given CoverageMapping to a JSON Format.
-void exportCoverageDataToJson(const CoverageMapping &CoverageMapping,
- const CoverageViewOptions &Options,
- raw_ostream &OS) {
- auto Exporter = CoverageExporterJson(CoverageMapping, Options, OS);
+void CoverageExporterJson::renderRegion(const coverage::CountedRegion &Region) {
+ // Start CountedRegion.
+ emitArrayStart();
+
+ emitArrayElement(Region.LineStart);
+ emitArrayElement(Region.ColumnStart);
+ emitArrayElement(Region.LineEnd);
+ emitArrayElement(Region.ColumnEnd);
+ emitArrayElement(Region.ExecutionCount);
+ emitArrayElement(Region.FileID);
+ emitArrayElement(Region.ExpandedFileID);
+ emitArrayElement(Region.Kind);
+
+ // End CountedRegion.
+ emitArrayEnd();
+}
- Exporter.print();
+void CoverageExporterJson::renderSummary(const FileCoverageSummary &Summary) {
+ // Start Summary for the file.
+ emitDictStart();
+
+ emitDictKey("lines");
+
+ // Start Line Coverage Summary.
+ emitDictStart();
+ emitDictElement("count", Summary.LineCoverage.getNumLines());
+ emitDictElement("covered", Summary.LineCoverage.getCovered());
+ emitDictElement("percent", Summary.LineCoverage.getPercentCovered());
+ // End Line Coverage Summary.
+ emitDictEnd();
+
+ emitDictKey("functions");
+
+ // Start Function Coverage Summary.
+ emitDictStart();
+ emitDictElement("count", Summary.FunctionCoverage.getNumFunctions());
+ emitDictElement("covered", Summary.FunctionCoverage.getExecuted());
+ emitDictElement("percent", Summary.FunctionCoverage.getPercentCovered());
+ // End Function Coverage Summary.
+ emitDictEnd();
+
+ emitDictKey("instantiations");
+
+ // Start Instantiation Coverage Summary.
+ emitDictStart();
+ emitDictElement("count", Summary.InstantiationCoverage.getNumFunctions());
+ emitDictElement("covered", Summary.InstantiationCoverage.getExecuted());
+ emitDictElement("percent", Summary.InstantiationCoverage.getPercentCovered());
+ // End Instantiation Coverage Summary.
+ emitDictEnd();
+
+ emitDictKey("regions");
+
+ // Start Region Coverage Summary.
+ emitDictStart();
+ emitDictElement("count", Summary.RegionCoverage.getNumRegions());
+ emitDictElement("covered", Summary.RegionCoverage.getCovered());
+ emitDictElement("notcovered", Summary.RegionCoverage.getNumRegions() -
+ Summary.RegionCoverage.getCovered());
+ emitDictElement("percent", Summary.RegionCoverage.getPercentCovered());
+ // End Region Coverage Summary.
+ emitDictEnd();
+
+ // End Summary for the file.
+ emitDictEnd();
}
diff --git a/contrib/llvm/tools/llvm-cov/CoverageExporterJson.h b/contrib/llvm/tools/llvm-cov/CoverageExporterJson.h
new file mode 100644
index 000000000000..f88dffa0ebea
--- /dev/null
+++ b/contrib/llvm/tools/llvm-cov/CoverageExporterJson.h
@@ -0,0 +1,112 @@
+//===- CoverageExporterJson.h - Code coverage JSON exporter ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements a code coverage exporter for JSON format.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGEEXPORTERJSON_H
+#define LLVM_COV_COVERAGEEXPORTERJSON_H
+
+#include "CoverageExporter.h"
+#include <stack>
+
+namespace llvm {
+
+class CoverageExporterJson : public CoverageExporter {
+ /// States that the JSON rendering machine can be in.
+ enum JsonState { None, NonEmptyElement, EmptyElement };
+
+ /// Tracks state of the JSON output.
+ std::stack<JsonState> State;
+
+ /// Emit a serialized scalar.
+ void emitSerialized(const int64_t Value);
+
+ /// Emit a serialized string.
+ void emitSerialized(const std::string &Value);
+
+ /// Emit a comma if there is a previous element to delimit.
+ void emitComma();
+
+ /// Emit a starting dictionary/object character.
+ void emitDictStart();
+
+ /// Emit a dictionary/object key but no value.
+ void emitDictKey(const std::string &Key);
+
+ /// Emit a dictionary/object key/value pair.
+ template <typename V>
+ void emitDictElement(const std::string &Key, const V &Value) {
+ emitComma();
+ emitSerialized(Key);
+ OS << ":";
+ emitSerialized(Value);
+ }
+
+ /// Emit a closing dictionary/object character.
+ void emitDictEnd();
+
+ /// Emit a starting array character.
+ void emitArrayStart();
+
+ /// Emit an array element.
+ template <typename V> void emitArrayElement(const V &Value) {
+ emitComma();
+ emitSerialized(Value);
+ }
+
+ /// Emit a closing array character.
+ void emitArrayEnd();
+
+ /// Render an array of all the given functions.
+ void renderFunctions(
+ const iterator_range<coverage::FunctionRecordIterator> &Functions);
+
+ /// Render an array of all the source files, also pass back a Summary.
+ void renderFiles(ArrayRef<std::string> SourceFiles,
+ ArrayRef<FileCoverageSummary> FileReports);
+
+ /// Render a single file.
+ void renderFile(const std::string &Filename,
+ const FileCoverageSummary &FileReport);
+
+ /// Render detailed coverage information for a single file.
+ void renderFileCoverage(const coverage::CoverageData &FileCoverage,
+ const FileCoverageSummary &FileReport);
+
+ /// Render a CoverageSegment.
+ void renderSegment(const coverage::CoverageSegment &Segment);
+
+ /// Render an ExpansionRecord.
+ void renderExpansion(const coverage::ExpansionRecord &Expansion);
+
+ /// Render a list of CountedRegions.
+ void renderRegions(ArrayRef<coverage::CountedRegion> Regions);
+
+ /// Render a single CountedRegion.
+ void renderRegion(const coverage::CountedRegion &Region);
+
+ /// Render a FileCoverageSummary.
+ void renderSummary(const FileCoverageSummary &Summary);
+
+public:
+ CoverageExporterJson(const coverage::CoverageMapping &CoverageMapping,
+ const CoverageViewOptions &Options, raw_ostream &OS);
+
+ /// Render the CoverageMapping object.
+ void renderRoot(const CoverageFilters &IgnoreFilenameFilters) override;
+
+ /// Render the CoverageMapping object for specified source files.
+ void renderRoot(const std::vector<std::string> &SourceFiles) override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_COV_COVERAGEEXPORTERJSON_H
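The header and implementation above keep the exporter's original comma-placement state machine: every open container pushes EmptyElement, the first child emitted inside it flips the top of the stack to NonEmptyElement, and emitComma() consults that top entry to decide whether a "," is due. Below is a minimal stand-alone sketch of the same pattern using only the standard library; the TinyJsonEmitter name and its members are illustrative and not part of this patch.

#include <iostream>
#include <stack>
#include <string>

enum class JsonState { None, NonEmptyElement, EmptyElement };

struct TinyJsonEmitter {
  std::ostream &OS;
  std::stack<JsonState> State;

  explicit TinyJsonEmitter(std::ostream &OS) : OS(OS) {
    State.push(JsonState::None);
  }

  // Emit a comma only if the enclosing container already holds an element.
  void emitComma() {
    if (State.top() == JsonState::NonEmptyElement) {
      OS << ",";
    } else if (State.top() == JsonState::EmptyElement) {
      State.pop();
      State.push(JsonState::NonEmptyElement);
    }
  }

  void dictStart() { emitComma(); State.push(JsonState::EmptyElement); OS << "{"; }
  void dictEnd() { State.pop(); OS << "}"; }

  // A key leaves EmptyElement on top so its value is not preceded by a comma.
  void key(const std::string &K) {
    emitComma();
    OS << '"' << K << "\":";
    State.pop();
    State.push(JsonState::EmptyElement);
  }

  void value(int64_t V) { emitComma(); OS << V; }
};

int main() {
  TinyJsonEmitter J(std::cout);
  J.dictStart();
  J.key("count");
  J.value(2);
  J.key("covered");
  J.value(1);
  J.dictEnd(); // prints {"count":2,"covered":1}
  std::cout << "\n";
}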
diff --git a/contrib/llvm/tools/llvm-cov/CoverageFilters.cpp b/contrib/llvm/tools/llvm-cov/CoverageFilters.cpp
index 441179601dcc..4dd0f552c7e0 100644
--- a/contrib/llvm/tools/llvm-cov/CoverageFilters.cpp
+++ b/contrib/llvm/tools/llvm-cov/CoverageFilters.cpp
@@ -30,6 +30,10 @@ bool NameRegexCoverageFilter::matches(
return llvm::Regex(Regex).match(Function.Name);
}
+bool NameRegexCoverageFilter::matchesFilename(StringRef Filename) const {
+ return llvm::Regex(Regex).match(Filename);
+}
+
bool NameWhitelistCoverageFilter::matches(
const coverage::CoverageMapping &,
const coverage::FunctionRecord &Function) const {
@@ -63,6 +67,14 @@ bool CoverageFilters::matches(const coverage::CoverageMapping &CM,
return false;
}
+bool CoverageFilters::matchesFilename(StringRef Filename) const {
+ for (const auto &Filter : Filters) {
+ if (Filter->matchesFilename(Filename))
+ return true;
+ }
+ return false;
+}
+
bool CoverageFiltersMatchAll::matches(
const coverage::CoverageMapping &CM,
const coverage::FunctionRecord &Function) const {
diff --git a/contrib/llvm/tools/llvm-cov/CoverageFilters.h b/contrib/llvm/tools/llvm-cov/CoverageFilters.h
index aeaf61de1730..6424ca5a8081 100644
--- a/contrib/llvm/tools/llvm-cov/CoverageFilters.h
+++ b/contrib/llvm/tools/llvm-cov/CoverageFilters.h
@@ -22,19 +22,24 @@
namespace llvm {
-/// \brief Matches specific functions that pass the requirement of this filter.
+/// Matches specific functions that pass the requirement of this filter.
class CoverageFilter {
public:
virtual ~CoverageFilter() {}
- /// \brief Return true if the function passes the requirements of this filter.
+ /// Return true if the function passes the requirements of this filter.
virtual bool matches(const coverage::CoverageMapping &CM,
const coverage::FunctionRecord &Function) const {
return true;
}
+
+ /// Return true if the filename passes the requirements of this filter.
+ virtual bool matchesFilename(StringRef Filename) const {
+ return true;
+ }
};
-/// \brief Matches functions that contain a specific string in their name.
+/// Matches functions that contain a specific string in their name.
class NameCoverageFilter : public CoverageFilter {
StringRef Name;
@@ -45,7 +50,7 @@ public:
const coverage::FunctionRecord &Function) const override;
};
-/// \brief Matches functions whose name matches a certain regular expression.
+/// Matches functions whose name matches a certain regular expression.
class NameRegexCoverageFilter : public CoverageFilter {
StringRef Regex;
@@ -54,9 +59,11 @@ public:
bool matches(const coverage::CoverageMapping &CM,
const coverage::FunctionRecord &Function) const override;
+
+ bool matchesFilename(StringRef Filename) const override;
};
-/// \brief Matches functions whose name appears in a SpecialCaseList in the
+/// Matches functions whose name appears in a SpecialCaseList in the
/// whitelist_fun section.
class NameWhitelistCoverageFilter : public CoverageFilter {
const SpecialCaseList &Whitelist;
@@ -69,7 +76,7 @@ public:
const coverage::FunctionRecord &Function) const override;
};
-/// \brief Matches numbers that pass a certain threshold.
+/// Matches numbers that pass a certain threshold.
template <typename T> class StatisticThresholdFilter {
public:
enum Operation { LessThan, GreaterThan };
@@ -81,7 +88,7 @@ protected:
StatisticThresholdFilter(Operation Op, T Threshold)
: Op(Op), Threshold(Threshold) {}
- /// \brief Return true if the given number is less than
+ /// Return true if the given number is less than
/// or greater than the certain threshold.
bool PassesThreshold(T Value) const {
switch (Op) {
@@ -94,7 +101,7 @@ protected:
}
};
-/// \brief Matches functions whose region coverage percentage
+/// Matches functions whose region coverage percentage
/// is above/below a certain percentage.
class RegionCoverageFilter : public CoverageFilter,
public StatisticThresholdFilter<double> {
@@ -106,7 +113,7 @@ public:
const coverage::FunctionRecord &Function) const override;
};
-/// \brief Matches functions whose line coverage percentage
+/// Matches functions whose line coverage percentage
/// is above/below a certain percentage.
class LineCoverageFilter : public CoverageFilter,
public StatisticThresholdFilter<double> {
@@ -118,7 +125,7 @@ public:
const coverage::FunctionRecord &Function) const override;
};
-/// \brief A collection of filters.
+/// A collection of filters.
/// Matches functions that match any filters contained
/// in an instance of this class.
class CoverageFilters : public CoverageFilter {
@@ -126,16 +133,18 @@ protected:
std::vector<std::unique_ptr<CoverageFilter>> Filters;
public:
- /// \brief Append a filter to this collection.
+ /// Append a filter to this collection.
void push_back(std::unique_ptr<CoverageFilter> Filter);
bool empty() const { return Filters.empty(); }
bool matches(const coverage::CoverageMapping &CM,
const coverage::FunctionRecord &Function) const override;
+
+ bool matchesFilename(StringRef Filename) const override;
};
-/// \brief A collection of filters.
+/// A collection of filters.
/// Matches functions that match all of the filters contained
/// in an instance of this class.
class CoverageFiltersMatchAll : public CoverageFilters {
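The new matchesFilename() hook mirrors the existing matches() design: the CoverageFilter base class accepts every filename, NameRegexCoverageFilter reuses its regular expression against the filename, and the CoverageFilters collection reports a match if any contained filter does. Here is a compact stand-alone sketch of that any-match composition; the type names are placeholders rather than llvm-cov's.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Regex.h"
#include <memory>
#include <string>
#include <vector>

// Base filter: by itself it lets every filename through.
struct FilenameFilter {
  virtual ~FilenameFilter() = default;
  virtual bool matchesFilename(llvm::StringRef Filename) const { return true; }
};

// Regex-based filter; constructs the llvm::Regex per query, as the patch does.
struct RegexFilenameFilter : FilenameFilter {
  std::string Pattern;
  explicit RegexFilenameFilter(std::string P) : Pattern(std::move(P)) {}
  bool matchesFilename(llvm::StringRef Filename) const override {
    return llvm::Regex(Pattern).match(Filename);
  }
};

// Collection: any contained filter matching is enough. Note the asymmetry with
// the base class: an empty collection matches nothing, so an empty ignore list
// ignores no files.
struct FilenameFilters : FilenameFilter {
  std::vector<std::unique_ptr<FilenameFilter>> Filters;
  bool matchesFilename(llvm::StringRef Filename) const override {
    for (const auto &F : Filters)
      if (F->matchesFilename(Filename))
        return true;
    return false;
  }
};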
diff --git a/contrib/llvm/tools/llvm-cov/CoverageReport.cpp b/contrib/llvm/tools/llvm-cov/CoverageReport.cpp
index 9c553a7f64c7..607a3ceb30cb 100644
--- a/contrib/llvm/tools/llvm-cov/CoverageReport.cpp
+++ b/contrib/llvm/tools/llvm-cov/CoverageReport.cpp
@@ -16,13 +16,15 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Threading.h"
#include <numeric>
using namespace llvm;
namespace {
-/// \brief Helper struct which prints trimmed and aligned columns.
+/// Helper struct which prints trimmed and aligned columns.
struct Column {
enum TrimKind { NoTrim, WidthTrim, RightTrim };
@@ -89,7 +91,7 @@ size_t FileReportColumns[] = {25, 12, 18, 10, 12, 18, 10,
16, 16, 10, 12, 18, 10};
size_t FunctionReportColumns[] = {25, 10, 8, 8, 10, 8, 8};
-/// \brief Adjust column widths to fit long file paths and function names.
+/// Adjust column widths to fit long file paths and function names.
void adjustColumnWidths(ArrayRef<StringRef> Files,
ArrayRef<StringRef> Functions) {
for (StringRef Filename : Files)
@@ -99,7 +101,7 @@ void adjustColumnWidths(ArrayRef<StringRef> Files,
std::max(FunctionReportColumns[0], Funcname.size());
}
-/// \brief Prints a horizontal divider long enough to cover the given column
+/// Prints a horizontal divider long enough to cover the given column
/// widths.
void renderDivider(ArrayRef<size_t> ColumnWidths, raw_ostream &OS) {
size_t Length = std::accumulate(ColumnWidths.begin(), ColumnWidths.end(), 0);
@@ -107,7 +109,7 @@ void renderDivider(ArrayRef<size_t> ColumnWidths, raw_ostream &OS) {
OS << '-';
}
-/// \brief Return the color which correponds to the coverage percentage of a
+/// Return the color which corresponds to the coverage percentage of a
/// certain metric.
template <typename T>
raw_ostream::Colors determineCoveragePercentageColor(const T &Info) {
@@ -117,7 +119,7 @@ raw_ostream::Colors determineCoveragePercentageColor(const T &Info) {
: raw_ostream::RED;
}
-/// \brief Get the number of redundant path components in each path in \p Paths.
+/// Get the number of redundant path components in each path in \p Paths.
unsigned getNumRedundantPathComponents(ArrayRef<std::string> Paths) {
// To start, set the number of redundant path components to the maximum
// possible value.
@@ -146,7 +148,7 @@ unsigned getNumRedundantPathComponents(ArrayRef<std::string> Paths) {
return NumRedundant;
}
-/// \brief Determine the length of the longest redundant prefix of the paths in
+/// Determine the length of the longest redundant prefix of the paths in
/// \p Paths.
unsigned getRedundantPrefixLen(ArrayRef<std::string> Paths) {
// If there's at most one path, no path components are redundant.
@@ -319,50 +321,72 @@ void CoverageReport::renderFunctionReports(ArrayRef<std::string> Files,
}
}
+void CoverageReport::prepareSingleFileReport(const StringRef Filename,
+ const coverage::CoverageMapping *Coverage,
+ const CoverageViewOptions &Options, const unsigned LCP,
+ FileCoverageSummary *FileReport, const CoverageFilter *Filters) {
+ for (const auto &Group : Coverage->getInstantiationGroups(Filename)) {
+ std::vector<FunctionCoverageSummary> InstantiationSummaries;
+ for (const coverage::FunctionRecord *F : Group.getInstantiations()) {
+ if (!Filters->matches(*Coverage, *F))
+ continue;
+ auto InstantiationSummary = FunctionCoverageSummary::get(*Coverage, *F);
+ FileReport->addInstantiation(InstantiationSummary);
+ InstantiationSummaries.push_back(InstantiationSummary);
+ }
+ if (InstantiationSummaries.empty())
+ continue;
+
+ auto GroupSummary =
+ FunctionCoverageSummary::get(Group, InstantiationSummaries);
+
+ if (Options.Debug)
+ outs() << "InstantiationGroup: " << GroupSummary.Name << " with "
+ << "size = " << Group.size() << "\n";
+
+ FileReport->addFunction(GroupSummary);
+ }
+}
+
std::vector<FileCoverageSummary> CoverageReport::prepareFileReports(
const coverage::CoverageMapping &Coverage, FileCoverageSummary &Totals,
ArrayRef<std::string> Files, const CoverageViewOptions &Options,
const CoverageFilter &Filters) {
- std::vector<FileCoverageSummary> FileReports;
unsigned LCP = getRedundantPrefixLen(Files);
+ auto NumThreads = Options.NumThreads;
- for (StringRef Filename : Files) {
- FileCoverageSummary Summary(Filename.drop_front(LCP));
-
- for (const auto &Group : Coverage.getInstantiationGroups(Filename)) {
- std::vector<FunctionCoverageSummary> InstantiationSummaries;
- for (const coverage::FunctionRecord *F : Group.getInstantiations()) {
- if (!Filters.matches(Coverage, *F))
- continue;
- auto InstantiationSummary = FunctionCoverageSummary::get(Coverage, *F);
- Summary.addInstantiation(InstantiationSummary);
- Totals.addInstantiation(InstantiationSummary);
- InstantiationSummaries.push_back(InstantiationSummary);
- }
- if (InstantiationSummaries.empty())
- continue;
-
- auto GroupSummary =
- FunctionCoverageSummary::get(Group, InstantiationSummaries);
+ // If NumThreads is not specified, auto-detect a good default.
+ if (NumThreads == 0)
+ NumThreads =
+ std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
+ unsigned(Files.size())));
- if (Options.Debug)
- outs() << "InstantiationGroup: " << GroupSummary.Name << " with "
- << "size = " << Group.size() << "\n";
+ ThreadPool Pool(NumThreads);
- Summary.addFunction(GroupSummary);
- Totals.addFunction(GroupSummary);
- }
+ std::vector<FileCoverageSummary> FileReports;
+ FileReports.reserve(Files.size());
- FileReports.push_back(Summary);
+ for (StringRef Filename : Files) {
+ FileReports.emplace_back(Filename.drop_front(LCP));
+ Pool.async(&CoverageReport::prepareSingleFileReport, Filename,
+ &Coverage, Options, LCP, &FileReports.back(), &Filters);
}
+ Pool.wait();
+
+ for (const auto &FileReport : FileReports)
+ Totals += FileReport;
return FileReports;
}
-void CoverageReport::renderFileReports(raw_ostream &OS) const {
+void CoverageReport::renderFileReports(
+ raw_ostream &OS, const CoverageFilters &IgnoreFilenameFilters) const {
std::vector<std::string> UniqueSourceFiles;
- for (StringRef SF : Coverage.getUniqueSourceFiles())
- UniqueSourceFiles.emplace_back(SF.str());
+ for (StringRef SF : Coverage.getUniqueSourceFiles()) {
+ // Apply ignore source files filters.
+ if (!IgnoreFilenameFilters.matchesFilename(SF))
+ UniqueSourceFiles.emplace_back(SF.str());
+ }
renderFileReports(OS, UniqueSourceFiles);
}
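With this change prepareFileReports hands each file to a ThreadPool worker through the static prepareSingleFileReport and only folds the per-file summaries into Totals after Pool.wait(), using the operator+= overloads added in CoverageSummaryInfo.h; that is why the loop body no longer updates Totals directly. A minimal sketch of that fan-out/fan-in shape with llvm::ThreadPool follows, using the same LLVM 7-era API the patch itself calls; the Summary type and summarizeOneFile are placeholders, not llvm-cov's.

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include <algorithm>
#include <string>
#include <vector>

// Placeholder per-file summary; llvm-cov's FileCoverageSummary plays this role.
struct Summary {
  unsigned Covered = 0, Total = 0;
  Summary &operator+=(const Summary &RHS) {
    Covered += RHS.Covered;
    Total += RHS.Total;
    return *this;
  }
};

// Hypothetical leaf work; stands in for prepareSingleFileReport.
static void summarizeOneFile(const std::string &Filename, Summary *Out) {
  Out->Total = Filename.size(); // dummy computation
  Out->Covered = Out->Total / 2;
}

Summary summarizeAll(const std::vector<std::string> &Files, unsigned NumThreads) {
  // Same auto-detection the patch uses when no thread count is specified.
  if (NumThreads == 0)
    NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
                                       unsigned(Files.size())));
  llvm::ThreadPool Pool(NumThreads);

  // Reserve up front so the pointers handed to workers stay valid.
  std::vector<Summary> PerFile;
  PerFile.reserve(Files.size());
  for (const std::string &F : Files) {
    PerFile.emplace_back();
    Pool.async(summarizeOneFile, F, &PerFile.back());
  }
  Pool.wait();

  // Fold per-file results only after all workers have finished.
  Summary Totals;
  for (const Summary &S : PerFile)
    Totals += S;
  return Totals;
}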
diff --git a/contrib/llvm/tools/llvm-cov/CoverageReport.h b/contrib/llvm/tools/llvm-cov/CoverageReport.h
index 1c9e68e832f3..4a6527e9fe5d 100644
--- a/contrib/llvm/tools/llvm-cov/CoverageReport.h
+++ b/contrib/llvm/tools/llvm-cov/CoverageReport.h
@@ -1,4 +1,4 @@
-//===- CoverageReport.h - Code coverage report ---------------------------===//
+//===- CoverageReport.h - Code coverage report ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -20,7 +20,7 @@
namespace llvm {
-/// \brief Displays the code coverage report.
+/// Displays the code coverage report.
class CoverageReport {
const CoverageViewOptions &Options;
const coverage::CoverageMapping &Coverage;
@@ -44,8 +44,17 @@ public:
const CoverageViewOptions &Options,
const CoverageFilter &Filters = CoverageFiltersMatchAll());
+ static void
+ prepareSingleFileReport(const StringRef Filename,
+ const coverage::CoverageMapping *Coverage,
+ const CoverageViewOptions &Options,
+ const unsigned LCP,
+ FileCoverageSummary *FileReport,
+ const CoverageFilter *Filters);
+
/// Render file reports for every unique file in the coverage mapping.
- void renderFileReports(raw_ostream &OS) const;
+ void renderFileReports(raw_ostream &OS,
+ const CoverageFilters &IgnoreFilenameFilters) const;
/// Render file reports for the files specified in \p Files.
void renderFileReports(raw_ostream &OS, ArrayRef<std::string> Files) const;
diff --git a/contrib/llvm/tools/llvm-cov/CoverageSummaryInfo.h b/contrib/llvm/tools/llvm-cov/CoverageSummaryInfo.h
index 8eae0b7fec97..0845e2ce2e77 100644
--- a/contrib/llvm/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/contrib/llvm/tools/llvm-cov/CoverageSummaryInfo.h
@@ -20,12 +20,12 @@
namespace llvm {
-/// \brief Provides information about region coverage for a function/file.
+/// Provides information about region coverage for a function/file.
class RegionCoverageInfo {
- /// \brief The number of regions that were executed at least once.
+ /// The number of regions that were executed at least once.
size_t Covered;
- /// \brief The total number of regions in a function/file.
+ /// The total number of regions in a function/file.
size_t NumRegions;
public:
@@ -61,12 +61,12 @@ public:
}
};
-/// \brief Provides information about line coverage for a function/file.
+/// Provides information about line coverage for a function/file.
class LineCoverageInfo {
- /// \brief The number of lines that were executed at least once.
+ /// The number of lines that were executed at least once.
size_t Covered;
- /// \brief The total number of lines in a function/file.
+ /// The total number of lines in a function/file.
size_t NumLines;
public:
@@ -102,12 +102,12 @@ public:
}
};
-/// \brief Provides information about function coverage for a file.
+/// Provides information about function coverage for a file.
class FunctionCoverageInfo {
- /// \brief The number of functions that were executed.
+ /// The number of functions that were executed.
size_t Executed;
- /// \brief The total number of functions in this file.
+ /// The total number of functions in this file.
size_t NumFunctions;
public:
@@ -116,6 +116,12 @@ public:
FunctionCoverageInfo(size_t Executed, size_t NumFunctions)
: Executed(Executed), NumFunctions(NumFunctions) {}
+ FunctionCoverageInfo &operator+=(const FunctionCoverageInfo &RHS) {
+ Executed += RHS.Executed;
+ NumFunctions += RHS.NumFunctions;
+ return *this;
+ }
+
void addFunction(bool Covered) {
if (Covered)
++Executed;
@@ -136,7 +142,7 @@ public:
}
};
-/// \brief A summary of function's code coverage.
+/// A summary of a function's code coverage.
struct FunctionCoverageSummary {
std::string Name;
uint64_t ExecutionCount;
@@ -152,7 +158,7 @@ struct FunctionCoverageSummary {
: Name(Name), ExecutionCount(ExecutionCount),
RegionCoverage(RegionCoverage), LineCoverage(LineCoverage) {}
- /// \brief Compute the code coverage summary for the given function coverage
+ /// Compute the code coverage summary for the given function coverage
/// mapping record.
static FunctionCoverageSummary get(const coverage::CoverageMapping &CM,
const coverage::FunctionRecord &Function);
@@ -164,7 +170,7 @@ struct FunctionCoverageSummary {
ArrayRef<FunctionCoverageSummary> Summaries);
};
-/// \brief A summary of file's code coverage.
+/// A summary of a file's code coverage.
struct FileCoverageSummary {
StringRef Name;
RegionCoverageInfo RegionCoverage;
@@ -176,6 +182,14 @@ struct FileCoverageSummary {
: Name(Name), RegionCoverage(), LineCoverage(), FunctionCoverage(),
InstantiationCoverage() {}
+ FileCoverageSummary &operator+=(const FileCoverageSummary &RHS) {
+ RegionCoverage += RHS.RegionCoverage;
+ LineCoverage += RHS.LineCoverage;
+ FunctionCoverage += RHS.FunctionCoverage;
+ InstantiationCoverage += RHS.InstantiationCoverage;
+ return *this;
+ }
+
void addFunction(const FunctionCoverageSummary &Function) {
RegionCoverage += Function.RegionCoverage;
LineCoverage += Function.LineCoverage;
@@ -187,11 +201,11 @@ struct FileCoverageSummary {
}
};
-/// \brief A cache for demangled symbols.
+/// A cache for demangled symbols.
struct DemangleCache {
StringMap<std::string> DemangledNames;
- /// \brief Demangle \p Sym if possible. Otherwise, just return \p Sym.
+ /// Demangle \p Sym if possible. Otherwise, just return \p Sym.
StringRef demangle(StringRef Sym) const {
const auto DemangledName = DemangledNames.find(Sym);
if (DemangledName == DemangledNames.end())
diff --git a/contrib/llvm/tools/llvm-cov/CoverageViewOptions.h b/contrib/llvm/tools/llvm-cov/CoverageViewOptions.h
index 17614c4e9ba2..20085a957bb5 100644
--- a/contrib/llvm/tools/llvm-cov/CoverageViewOptions.h
+++ b/contrib/llvm/tools/llvm-cov/CoverageViewOptions.h
@@ -10,12 +10,13 @@
#ifndef LLVM_COV_COVERAGEVIEWOPTIONS_H
#define LLVM_COV_COVERAGEVIEWOPTIONS_H
+#include "llvm/Config/llvm-config.h"
#include "RenderingSupport.h"
#include <vector>
namespace llvm {
-/// \brief The options for displaying the code coverage information.
+/// The options for displaying the code coverage information.
struct CoverageViewOptions {
enum class OutputFormat {
Text,
@@ -39,26 +40,27 @@ struct CoverageViewOptions {
uint32_t TabSize;
std::string ProjectTitle;
std::string CreatedTimeStr;
+ unsigned NumThreads;
- /// \brief Change the output's stream color if the colors are enabled.
+ /// Change the output's stream color if the colors are enabled.
ColoredRawOstream colored_ostream(raw_ostream &OS,
raw_ostream::Colors Color) const {
return llvm::colored_ostream(OS, Color, Colors);
}
- /// \brief Check if an output directory has been specified.
+ /// Check if an output directory has been specified.
bool hasOutputDirectory() const { return !ShowOutputDirectory.empty(); }
- /// \brief Check if a demangler has been specified.
+ /// Check if a demangler has been specified.
bool hasDemangler() const { return !DemanglerOpts.empty(); }
- /// \brief Check if a project title has been specified.
+ /// Check if a project title has been specified.
bool hasProjectTitle() const { return !ProjectTitle.empty(); }
- /// \brief Check if the created time of the profile data file is available.
+ /// Check if the created time of the profile data file is available.
bool hasCreatedTime() const { return !CreatedTimeStr.empty(); }
- /// \brief Get the LLVM version string.
+ /// Get the LLVM version string.
std::string getLLVMVersionString() const {
std::string VersionString = "Generated by llvm-cov -- llvm version ";
VersionString += LLVM_VERSION_STRING;
diff --git a/contrib/llvm/tools/llvm-cov/RenderingSupport.h b/contrib/llvm/tools/llvm-cov/RenderingSupport.h
index aa70fbc23e3c..2cfe24919142 100644
--- a/contrib/llvm/tools/llvm-cov/RenderingSupport.h
+++ b/contrib/llvm/tools/llvm-cov/RenderingSupport.h
@@ -15,7 +15,7 @@
namespace llvm {
-/// \brief A helper class that resets the output stream's color if needed
+/// A helper class that resets the output stream's color if needed
/// when destroyed.
class ColoredRawOstream {
ColoredRawOstream(const ColoredRawOstream &OS) = delete;
@@ -45,7 +45,7 @@ inline raw_ostream &operator<<(const ColoredRawOstream &OS, T &&Value) {
return OS.OS << std::forward<T>(Value);
}
-/// \brief Change the color of the output stream if the `IsColorUsed` flag
+/// Change the color of the output stream if the `IsColorUsed` flag
/// is true. Returns an object that resets the color when destroyed.
inline ColoredRawOstream colored_ostream(raw_ostream &OS,
raw_ostream::Colors Color,
diff --git a/contrib/llvm/tools/llvm-cov/SourceCoverageView.cpp b/contrib/llvm/tools/llvm-cov/SourceCoverageView.cpp
index 8c39dab580de..a5a8fa9a4814 100644
--- a/contrib/llvm/tools/llvm-cov/SourceCoverageView.cpp
+++ b/contrib/llvm/tools/llvm-cov/SourceCoverageView.cpp
@@ -65,7 +65,8 @@ CoveragePrinter::createOutputStream(StringRef Path, StringRef Extension,
return errorCodeToError(E);
std::error_code E;
- raw_ostream *RawStream = new raw_fd_ostream(FullPath, E, sys::fs::F_RW);
+ raw_ostream *RawStream =
+ new raw_fd_ostream(FullPath, E, sys::fs::FA_Read | sys::fs::FA_Write);
auto OS = CoveragePrinter::OwnedStream(RawStream);
if (E)
return errorCodeToError(E);
diff --git a/contrib/llvm/tools/llvm-cov/SourceCoverageView.h b/contrib/llvm/tools/llvm-cov/SourceCoverageView.h
index 7f58ea5d7be8..e3a2f9e5c0b4 100644
--- a/contrib/llvm/tools/llvm-cov/SourceCoverageView.h
+++ b/contrib/llvm/tools/llvm-cov/SourceCoverageView.h
@@ -27,7 +27,7 @@ using namespace coverage;
class CoverageFiltersMatchAll;
class SourceCoverageView;
-/// \brief A view that represents a macro or include expansion.
+/// A view that represents a macro or include expansion.
struct ExpansionView {
CounterMappingRegion Region;
std::unique_ptr<SourceCoverageView> View;
@@ -52,7 +52,7 @@ struct ExpansionView {
}
};
-/// \brief A view that represents a function instantiation.
+/// A view that represents a function instantiation.
struct InstantiationView {
StringRef FunctionName;
unsigned Line;
@@ -68,7 +68,7 @@ struct InstantiationView {
}
};
-/// \brief A file manager that handles format-aware file creation.
+/// A file manager that handles format-aware file creation.
class CoveragePrinter {
public:
struct StreamDestructor {
@@ -82,18 +82,18 @@ protected:
CoveragePrinter(const CoverageViewOptions &Opts) : Opts(Opts) {}
- /// \brief Return `OutputDir/ToplevelDir/Path.Extension`. If \p InToplevel is
+ /// Return `OutputDir/ToplevelDir/Path.Extension`. If \p InToplevel is
/// false, skip the ToplevelDir component. If \p Relative is false, skip the
/// OutputDir component.
std::string getOutputPath(StringRef Path, StringRef Extension,
bool InToplevel, bool Relative = true) const;
- /// \brief If directory output is enabled, create a file in that directory
+ /// If directory output is enabled, create a file in that directory
/// at the path given by getOutputPath(). Otherwise, return stdout.
Expected<OwnedStream> createOutputStream(StringRef Path, StringRef Extension,
bool InToplevel) const;
- /// \brief Return the sub-directory name for file coverage reports.
+ /// Return the sub-directory name for file coverage reports.
static StringRef getCoverageDir() { return "coverage"; }
public:
@@ -105,14 +105,14 @@ public:
/// @name File Creation Interface
/// @{
- /// \brief Create a file to print a coverage view into.
+ /// Create a file to print a coverage view into.
virtual Expected<OwnedStream> createViewFile(StringRef Path,
bool InToplevel) = 0;
- /// \brief Close a file which has been used to print a coverage view.
+ /// Close a file which has been used to print a coverage view.
virtual void closeViewFile(OwnedStream OS) = 0;
- /// \brief Create an index which lists reports for the given source files.
+ /// Create an index which lists reports for the given source files.
virtual Error createIndexFile(ArrayRef<std::string> SourceFiles,
const CoverageMapping &Coverage,
const CoverageFiltersMatchAll &Filters) = 0;
@@ -120,7 +120,7 @@ public:
/// @}
};
-/// \brief A code coverage view of a source file or function.
+/// A code coverage view of a source file or function.
///
/// A source coverage view and its nested sub-views form a file-oriented
/// representation of code coverage data. This view can be printed out by a
@@ -161,73 +161,73 @@ protected:
/// @name Rendering Interface
/// @{
- /// \brief Render a header for the view.
+ /// Render a header for the view.
virtual void renderViewHeader(raw_ostream &OS) = 0;
- /// \brief Render a footer for the view.
+ /// Render a footer for the view.
virtual void renderViewFooter(raw_ostream &OS) = 0;
- /// \brief Render the source name for the view.
+ /// Render the source name for the view.
virtual void renderSourceName(raw_ostream &OS, bool WholeFile) = 0;
- /// \brief Render the line prefix at the given \p ViewDepth.
+ /// Render the line prefix at the given \p ViewDepth.
virtual void renderLinePrefix(raw_ostream &OS, unsigned ViewDepth) = 0;
- /// \brief Render the line suffix at the given \p ViewDepth.
+ /// Render the line suffix at the given \p ViewDepth.
virtual void renderLineSuffix(raw_ostream &OS, unsigned ViewDepth) = 0;
- /// \brief Render a view divider at the given \p ViewDepth.
+ /// Render a view divider at the given \p ViewDepth.
virtual void renderViewDivider(raw_ostream &OS, unsigned ViewDepth) = 0;
- /// \brief Render a source line with highlighting.
+ /// Render a source line with highlighting.
virtual void renderLine(raw_ostream &OS, LineRef L,
const LineCoverageStats &LCS, unsigned ExpansionCol,
unsigned ViewDepth) = 0;
- /// \brief Render the line's execution count column.
+ /// Render the line's execution count column.
virtual void renderLineCoverageColumn(raw_ostream &OS,
const LineCoverageStats &Line) = 0;
- /// \brief Render the line number column.
+ /// Render the line number column.
virtual void renderLineNumberColumn(raw_ostream &OS, unsigned LineNo) = 0;
- /// \brief Render all the region's execution counts on a line.
+ /// Render all the region's execution counts on a line.
virtual void renderRegionMarkers(raw_ostream &OS,
const LineCoverageStats &Line,
unsigned ViewDepth) = 0;
- /// \brief Render the site of an expansion.
+ /// Render the site of an expansion.
virtual void renderExpansionSite(raw_ostream &OS, LineRef L,
const LineCoverageStats &LCS,
unsigned ExpansionCol,
unsigned ViewDepth) = 0;
- /// \brief Render an expansion view and any nested views.
+ /// Render an expansion view and any nested views.
virtual void renderExpansionView(raw_ostream &OS, ExpansionView &ESV,
unsigned ViewDepth) = 0;
- /// \brief Render an instantiation view and any nested views.
+ /// Render an instantiation view and any nested views.
virtual void renderInstantiationView(raw_ostream &OS, InstantiationView &ISV,
unsigned ViewDepth) = 0;
- /// \brief Render \p Title, a project title if one is available, and the
+ /// Render \p Title, a project title if one is available, and the
/// created time.
virtual void renderTitle(raw_ostream &OS, StringRef CellText) = 0;
- /// \brief Render the table header for a given source file.
+ /// Render the table header for a given source file.
virtual void renderTableHeader(raw_ostream &OS, unsigned FirstUncoveredLineNo,
unsigned IndentLevel) = 0;
/// @}
- /// \brief Format a count using engineering notation with 3 significant
+ /// Format a count using engineering notation with 3 significant
/// digits.
static std::string formatCount(uint64_t N);
- /// \brief Check if region marker output is expected for a line.
+ /// Check if region marker output is expected for a line.
bool shouldRenderRegionMarkers(const LineCoverageStats &LCS) const;
- /// \brief Check if there are any sub-views attached to this view.
+ /// Check if there are any sub-views attached to this view.
bool hasSubViews() const;
SourceCoverageView(StringRef SourceName, const MemoryBuffer &File,
@@ -243,20 +243,20 @@ public:
virtual ~SourceCoverageView() {}
- /// \brief Return the source name formatted for the host OS.
+ /// Return the source name formatted for the host OS.
std::string getSourceName() const;
const CoverageViewOptions &getOptions() const { return Options; }
- /// \brief Add an expansion subview to this view.
+ /// Add an expansion subview to this view.
void addExpansion(const CounterMappingRegion &Region,
std::unique_ptr<SourceCoverageView> View);
- /// \brief Add a function instantiation subview to this view.
+ /// Add a function instantiation subview to this view.
void addInstantiation(StringRef FunctionName, unsigned Line,
std::unique_ptr<SourceCoverageView> View);
- /// \brief Print the code coverage information for a specific portion of a
+ /// Print the code coverage information for a specific portion of a
/// source file to the output stream.
void print(raw_ostream &OS, bool WholeFile, bool ShowSourceName,
bool ShowTitle, unsigned ViewDepth = 0);
diff --git a/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
index e45c6f4cb473..acb67aa5cfc7 100644
--- a/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
+++ b/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
@@ -25,31 +25,29 @@ namespace {
// Return a string with the special characters in \p Str escaped.
std::string escape(StringRef Str, const CoverageViewOptions &Opts) {
- std::string Result;
+ std::string TabExpandedResult;
unsigned ColNum = 0; // Record the column number.
for (char C : Str) {
- ++ColNum;
- if (C == '&')
- Result += "&amp;";
- else if (C == '<')
- Result += "&lt;";
- else if (C == '>')
- Result += "&gt;";
- else if (C == '\"')
- Result += "&quot;";
- else if (C == '\n' || C == '\r') {
- Result += C;
- ColNum = 0;
- } else if (C == '\t') {
- // Replace '\t' with TabSize spaces.
- unsigned NumSpaces = Opts.TabSize - (--ColNum % Opts.TabSize);
+ if (C == '\t') {
+ // Replace '\t' with up to TabSize spaces.
+ unsigned NumSpaces = Opts.TabSize - (ColNum % Opts.TabSize);
for (unsigned I = 0; I < NumSpaces; ++I)
- Result += "&nbsp;";
+ TabExpandedResult += ' ';
ColNum += NumSpaces;
- } else
- Result += C;
+ } else {
+ TabExpandedResult += C;
+ if (C == '\n' || C == '\r')
+ ColNum = 0;
+ else
+ ++ColNum;
+ }
+ }
+ std::string EscapedHTML;
+ {
+ raw_string_ostream OS{EscapedHTML};
+ printHTMLEscaped(TabExpandedResult, OS);
}
- return Result;
+ return EscapedHTML;
}
// Create a \p Name tag around \p Str, and optionally set its \p ClassName.
@@ -116,24 +114,39 @@ table {
background: #ffffff;
border: 1px solid #dbdbdb;
}
+.light-row-bold {
+ background: #ffffff;
+ border: 1px solid #dbdbdb;
+ font-weight: bold;
+}
.column-entry {
- text-align: right;
+ text-align: left;
}
-.column-entry-left {
+.column-entry-bold {
+ font-weight: bold;
text-align: left;
}
.column-entry-yellow {
- text-align: right;
+ text-align: left;
background-color: #ffffd0;
}
+.column-entry-yellow:hover {
+ background-color: #fffff0;
+}
.column-entry-red {
- text-align: right;
+ text-align: left;
background-color: #ffd0d0;
}
+.column-entry-red:hover {
+ background-color: #fff0f0;
+}
.column-entry-green {
- text-align: right;
+ text-align: left;
background-color: #d0ffd0;
}
+.column-entry-green:hover {
+ background-color: #f0fff0;
+}
.line-number {
text-align: right;
color: #aaa;
@@ -184,10 +197,14 @@ table {
}
th, td {
vertical-align: top;
- padding: 2px 5px;
+ padding: 2px 8px;
border-collapse: collapse;
border-right: solid 1px #eee;
border-left: solid 1px #eee;
+ text-align: left;
+}
+td pre {
+ display: inline-block;
}
td:first-child {
border-left: none;
@@ -195,6 +212,9 @@ td:first-child {
td:last-child {
border-right: none;
}
+tr:hover {
+ background-color: #f0f0f0;
+}
)";
const char *EndHeader = "</head>";
@@ -287,13 +307,14 @@ void CoveragePrinterHTML::closeViewFile(OwnedStream OS) {
static void emitColumnLabelsForIndex(raw_ostream &OS,
const CoverageViewOptions &Opts) {
SmallVector<std::string, 4> Columns;
- Columns.emplace_back(tag("td", "Filename", "column-entry-left"));
- Columns.emplace_back(tag("td", "Function Coverage", "column-entry"));
+ Columns.emplace_back(tag("td", "Filename", "column-entry-bold"));
+ Columns.emplace_back(tag("td", "Function Coverage", "column-entry-bold"));
if (Opts.ShowInstantiationSummary)
- Columns.emplace_back(tag("td", "Instantiation Coverage", "column-entry"));
- Columns.emplace_back(tag("td", "Line Coverage", "column-entry"));
+ Columns.emplace_back(
+ tag("td", "Instantiation Coverage", "column-entry-bold"));
+ Columns.emplace_back(tag("td", "Line Coverage", "column-entry-bold"));
if (Opts.ShowRegionSummary)
- Columns.emplace_back(tag("td", "Region Coverage", "column-entry"));
+ Columns.emplace_back(tag("td", "Region Coverage", "column-entry-bold"));
OS << tag("tr", join(Columns.begin(), Columns.end(), ""));
}
@@ -339,7 +360,7 @@ void CoveragePrinterHTML::emitFileSummary(raw_ostream &OS, StringRef SF,
// Simplify the display file path, and wrap it in a link if requested.
std::string Filename;
if (IsTotals) {
- Filename = "TOTALS";
+ Filename = SF;
} else {
Filename = buildLinkToFile(SF, FCS);
}
@@ -360,7 +381,10 @@ void CoveragePrinterHTML::emitFileSummary(raw_ostream &OS, StringRef SF,
FCS.RegionCoverage.getNumRegions(),
FCS.RegionCoverage.getPercentCovered());
- OS << tag("tr", join(Columns.begin(), Columns.end(), ""), "light-row");
+ if (IsTotals)
+ OS << tag("tr", join(Columns.begin(), Columns.end(), ""), "light-row-bold");
+ else
+ OS << tag("tr", join(Columns.begin(), Columns.end(), ""), "light-row");
}
Error CoveragePrinterHTML::createIndexFile(
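The rewritten escape() above now works in two passes: it first expands tabs to spaces while tracking the current column, then routes the whole expanded string through printHTMLEscaped() via a raw_string_ostream rather than escaping characters one at a time. A cut-down sketch of that shape is shown here, assuming printHTMLEscaped from llvm/ADT/StringExtras.h and a plain TabSize parameter standing in for CoverageViewOptions.

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

// Sketch of the two-pass escape: expand tabs, then HTML-escape the result.
static std::string escapeForHTML(llvm::StringRef Str, unsigned TabSize) {
  std::string TabExpanded;
  unsigned ColNum = 0;
  for (char C : Str) {
    if (C == '\t') {
      // Pad to the next tab stop rather than emitting a fixed-width run.
      unsigned NumSpaces = TabSize - (ColNum % TabSize);
      TabExpanded.append(NumSpaces, ' ');
      ColNum += NumSpaces;
    } else {
      TabExpanded += C;
      ColNum = (C == '\n' || C == '\r') ? 0 : ColNum + 1;
    }
  }
  std::string Escaped;
  llvm::raw_string_ostream OS(Escaped);
  llvm::printHTMLEscaped(TabExpanded, OS);
  return OS.str(); // flush the stream and hand back the escaped text
}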
diff --git a/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.h b/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.h
index 91b4ad4e220c..cb41fcaf37b9 100644
--- a/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.h
+++ b/contrib/llvm/tools/llvm-cov/SourceCoverageViewHTML.h
@@ -22,7 +22,7 @@ using namespace coverage;
struct FileCoverageSummary;
-/// \brief A coverage printer for html output.
+/// A coverage printer for html output.
class CoveragePrinterHTML : public CoveragePrinter {
public:
Expected<OwnedStream> createViewFile(StringRef Path,
@@ -45,7 +45,7 @@ private:
const FileCoverageSummary &FCS) const;
};
-/// \brief A code coverage view which supports html-based rendering.
+/// A code coverage view which supports html-based rendering.
class SourceCoverageViewHTML : public SourceCoverageView {
void renderViewHeader(raw_ostream &OS) override;
diff --git a/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
index 2480ee9f416a..aac70baed613 100644
--- a/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
+++ b/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
@@ -51,13 +51,13 @@ namespace {
static const unsigned LineCoverageColumnWidth = 7;
static const unsigned LineNumberColumnWidth = 5;
-/// \brief Get the width of the leading columns.
+/// Get the width of the leading columns.
unsigned getCombinedColumnWidth(const CoverageViewOptions &Opts) {
return (Opts.ShowLineStats ? LineCoverageColumnWidth + 1 : 0) +
(Opts.ShowLineNumbers ? LineNumberColumnWidth + 1 : 0);
}
-/// \brief The width of the line that is used to divide between the view and
+/// The width of the line that is used to divide between the view and
/// the subviews.
unsigned getDividerWidth(const CoverageViewOptions &Opts) {
return getCombinedColumnWidth(Opts) + 4;
diff --git a/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.h b/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.h
index cabf91975df3..a46f35cc6495 100644
--- a/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.h
+++ b/contrib/llvm/tools/llvm-cov/SourceCoverageViewText.h
@@ -20,7 +20,7 @@ namespace llvm {
using namespace coverage;
-/// \brief A coverage printer for text output.
+/// A coverage printer for text output.
class CoveragePrinterText : public CoveragePrinter {
public:
Expected<OwnedStream> createViewFile(StringRef Path,
@@ -36,7 +36,7 @@ public:
: CoveragePrinter(Opts) {}
};
-/// \brief A code coverage view which supports text-based rendering.
+/// A code coverage view which supports text-based rendering.
class SourceCoverageViewText : public SourceCoverageView {
void renderViewHeader(raw_ostream &OS) override;
diff --git a/contrib/llvm/tools/llvm-cov/TestingSupport.cpp b/contrib/llvm/tools/llvm-cov/TestingSupport.cpp
index 4713d75f17dd..e07abdbd17f1 100644
--- a/contrib/llvm/tools/llvm-cov/TestingSupport.cpp
+++ b/contrib/llvm/tools/llvm-cov/TestingSupport.cpp
@@ -75,8 +75,7 @@ int convertForTestingMain(int argc, const char *argv[]) {
return 1;
int FD;
- if (auto Err =
- sys::fs::openFileForWrite(OutputFilename, FD, sys::fs::F_None)) {
+ if (auto Err = sys::fs::openFileForWrite(OutputFilename, FD)) {
errs() << "error: " << Err.message() << "\n";
return 1;
}
diff --git a/contrib/llvm/tools/llvm-cov/llvm-cov.cpp b/contrib/llvm/tools/llvm-cov/llvm-cov.cpp
index 158415870250..4c3b574451c3 100644
--- a/contrib/llvm/tools/llvm-cov/llvm-cov.cpp
+++ b/contrib/llvm/tools/llvm-cov/llvm-cov.cpp
@@ -14,32 +14,31 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Process.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
using namespace llvm;
-/// \brief The main entry point for the 'show' subcommand.
+/// The main entry point for the 'show' subcommand.
int showMain(int argc, const char *argv[]);
-/// \brief The main entry point for the 'report' subcommand.
+/// The main entry point for the 'report' subcommand.
int reportMain(int argc, const char *argv[]);
-/// \brief The main entry point for the 'export' subcommand.
+/// The main entry point for the 'export' subcommand.
int exportMain(int argc, const char *argv[]);
-/// \brief The main entry point for the 'convert-for-testing' subcommand.
+/// The main entry point for the 'convert-for-testing' subcommand.
int convertForTestingMain(int argc, const char *argv[]);
-/// \brief The main entry point for the gcov compatible coverage tool.
+/// The main entry point for the gcov compatible coverage tool.
int gcovMain(int argc, const char *argv[]);
-/// \brief Top level help.
+/// Top level help.
static int helpMain(int argc, const char *argv[]) {
errs() << "Usage: llvm-cov {export|gcov|report|show} [OPTION]...\n\n"
<< "Shows code coverage information.\n\n"
@@ -52,17 +51,14 @@ static int helpMain(int argc, const char *argv[]) {
return 0;
}
-/// \brief Top level version information.
+/// Top level version information.
static int versionMain(int argc, const char *argv[]) {
cl::PrintVersionMessage();
return 0;
}
int main(int argc, const char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
// If argv[0] is or ends with 'gcov', always be gcov compatible
if (sys::path::stem(argv[0]).endswith_lower("gcov"))
diff --git a/contrib/llvm/tools/llvm-cxxdump/Error.cpp b/contrib/llvm/tools/llvm-cxxdump/Error.cpp
index d59547e3a2ce..54207fad32af 100644
--- a/contrib/llvm/tools/llvm-cxxdump/Error.cpp
+++ b/contrib/llvm/tools/llvm-cxxdump/Error.cpp
@@ -1,4 +1,4 @@
-//===- Error.cxx - system_error extensions for llvm-cxxdump -----*- C++ -*-===//
+//===- Error.cpp - system_error extensions for llvm-cxxdump -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp b/contrib/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
index 9b687e4fbe22..09e40d9b0db7 100644
--- a/contrib/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/contrib/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -20,9 +20,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
@@ -533,9 +531,7 @@ static void dumpInput(StringRef File) {
}
int main(int argc, const char *argv[]) {
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y;
+ InitLLVM X(argc, argv);
// Initialize targets.
llvm::InitializeAllTargetInfos();
diff --git a/contrib/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/contrib/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
index 9c6a1612fa08..afc1e4a8d128 100644
--- a/contrib/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
+++ b/contrib/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
@@ -9,8 +9,7 @@
#include "llvm/Demangle/Demangle.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdlib>
#include <iostream>
@@ -81,8 +80,7 @@ static void demangle(llvm::raw_ostream &OS, const std::string &Mangled) {
}
int main(int argc, char **argv) {
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
+ InitLLVM X(argc, argv);
cl::ParseCommandLineOptions(argc, argv, "llvm symbol undecoration tool\n");
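The InitLLVM changes repeated in the tools above and below replace the per-tool signal-handler, pretty-stack-trace, and llvm_shutdown boilerplate with a single RAII object. A minimal tool skeleton using it, as a sketch; the option banner is a placeholder:

  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/InitLLVM.h"
  #include "llvm/Support/raw_ostream.h"

  int main(int argc, char **argv) {
    // Installs the crash/stack-trace handlers and calls llvm_shutdown() on exit.
    llvm::InitLLVM X(argc, argv);
    llvm::cl::ParseCommandLineOptions(argc, argv, "example tool\n");
    llvm::outs() << "ok\n";
    return 0;
  }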
diff --git a/contrib/llvm/tools/llvm-diff/DifferenceEngine.cpp b/contrib/llvm/tools/llvm-diff/DifferenceEngine.cpp
index 95a63d7f9c83..af0a055ea21f 100644
--- a/contrib/llvm/tools/llvm-diff/DifferenceEngine.cpp
+++ b/contrib/llvm/tools/llvm-diff/DifferenceEngine.cpp
@@ -303,6 +303,26 @@ class FunctionDifferenceEngine {
if (TryUnify) tryUnify(LI->getSuccessor(0), RI->getSuccessor(0));
return false;
+ } else if (isa<IndirectBrInst>(L)) {
+ IndirectBrInst *LI = cast<IndirectBrInst>(L);
+ IndirectBrInst *RI = cast<IndirectBrInst>(R);
+ if (LI->getNumDestinations() != RI->getNumDestinations()) {
+ if (Complain) Engine.log("indirectbr # of destinations differ");
+ return true;
+ }
+
+ if (!equivalentAsOperands(LI->getAddress(), RI->getAddress())) {
+ if (Complain) Engine.log("indirectbr addresses differ");
+ return true;
+ }
+
+ if (TryUnify) {
+ for (unsigned i = 0; i < LI->getNumDestinations(); i++) {
+ tryUnify(LI->getDestination(i), RI->getDestination(i));
+ }
+ }
+ return false;
+
} else if (isa<SwitchInst>(L)) {
SwitchInst *LI = cast<SwitchInst>(L);
SwitchInst *RI = cast<SwitchInst>(R);
@@ -377,9 +397,9 @@ class FunctionDifferenceEngine {
return equivalentAsOperands(cast<ConstantExpr>(L),
cast<ConstantExpr>(R));
- // Nulls of the "same type" don't always actually have the same
+ // Constants of the "same type" don't always actually have the same
// type; I don't know why. Just white-list them.
- if (isa<ConstantPointerNull>(L))
+ if (isa<ConstantPointerNull>(L) || isa<UndefValue>(L) || isa<ConstantAggregateZero>(L))
return true;
// Block addresses only match if we've already encountered the
@@ -388,6 +408,19 @@ class FunctionDifferenceEngine {
return Blocks[cast<BlockAddress>(L)->getBasicBlock()]
== cast<BlockAddress>(R)->getBasicBlock();
+ // If L and R are ConstantVectors, compare each element
+ if (isa<ConstantVector>(L)) {
+ ConstantVector *CVL = cast<ConstantVector>(L);
+ ConstantVector *CVR = cast<ConstantVector>(R);
+ if (CVL->getType()->getNumElements() != CVR->getType()->getNumElements())
+ return false;
+ for (unsigned i = 0; i < CVL->getType()->getNumElements(); i++) {
+ if (!equivalentAsOperands(CVL->getOperand(i), CVR->getOperand(i)))
+ return false;
+ }
+ return true;
+ }
+
return false;
}
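In effect, the new branches let llvm-diff unify, for example, two copies of indirectbr i8* %addr, [label %bb1, label %bb2] destination by destination after checking the destination count and the address operand, and they widen the constant whitelist so undef, zeroinitializer, and ConstantVectors with element-wise equivalent operands match instead of producing a spurious difference.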
diff --git a/contrib/llvm/tools/llvm-dis/llvm-dis.cpp b/contrib/llvm/tools/llvm-dis/llvm-dis.cpp
index c91aa1c71a15..8143a2a5a934 100644
--- a/contrib/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/contrib/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -16,24 +16,23 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/LLVMContext.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/WithColor.h"
#include <system_error>
using namespace llvm;
@@ -129,10 +128,10 @@ struct LLVMDisDiagnosticHandler : public DiagnosticHandler {
raw_ostream &OS = errs();
OS << Prefix << ": ";
switch (DI.getSeverity()) {
- case DS_Error: OS << "error: "; break;
- case DS_Warning: OS << "warning: "; break;
+ case DS_Error: WithColor::error(OS); break;
+ case DS_Warning: WithColor::warning(OS); break;
case DS_Remark: OS << "remark: "; break;
- case DS_Note: OS << "note: "; break;
+ case DS_Note: WithColor::note(OS); break;
}
DiagnosticPrinterRawOStream DP(OS);
@@ -148,33 +147,29 @@ struct LLVMDisDiagnosticHandler : public DiagnosticHandler {
static ExitOnError ExitOnErr;
-static std::unique_ptr<Module> openInputFile(LLVMContext &Context) {
- std::unique_ptr<MemoryBuffer> MB =
- ExitOnErr(errorOrToExpected(MemoryBuffer::getFileOrSTDIN(InputFilename)));
- std::unique_ptr<Module> M = ExitOnErr(getOwningLazyBitcodeModule(
- std::move(MB), Context,
- /*ShouldLazyLoadMetadata=*/true, SetImporting));
- if (MaterializeMetadata)
- ExitOnErr(M->materializeMetadata());
- else
- ExitOnErr(M->materializeAll());
- return M;
-}
-
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
+ InitLLVM X(argc, argv);
ExitOnErr.setBanner(std::string(argv[0]) + ": error: ");
LLVMContext Context;
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
Context.setDiagnosticHandler(
llvm::make_unique<LLVMDisDiagnosticHandler>(argv[0]));
cl::ParseCommandLineOptions(argc, argv, "llvm .bc -> .ll disassembler\n");
- std::unique_ptr<Module> M = openInputFile(Context);
+ std::unique_ptr<MemoryBuffer> MB =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFileOrSTDIN(InputFilename)));
+ std::unique_ptr<Module> M = ExitOnErr(getLazyBitcodeModule(
+ *MB, Context, /*ShouldLazyLoadMetadata=*/true, SetImporting));
+ if (MaterializeMetadata)
+ ExitOnErr(M->materializeMetadata());
+ else
+ ExitOnErr(M->materializeAll());
+
+ BitcodeLTOInfo LTOInfo = ExitOnErr(getBitcodeLTOInfo(*MB));
+ std::unique_ptr<ModuleSummaryIndex> Index;
+ if (LTOInfo.HasSummary)
+ Index = ExitOnErr(getModuleSummaryIndex(*MB));
// Just use stdout. We won't actually print anything on it.
if (DontPrint)
@@ -203,8 +198,11 @@ int main(int argc, char **argv) {
Annotator.reset(new CommentWriter());
// All that llvm-dis does is write the assembly to a file.
- if (!DontPrint)
+ if (!DontPrint) {
M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
+ if (Index)
+ Index->print(Out->os());
+ }
// Declare success.
Out->keep();
diff --git a/contrib/llvm/tools/llvm-dwarfdump/Statistics.cpp b/contrib/llvm/tools/llvm-dwarfdump/Statistics.cpp
index 9a7454a52624..5af853d4ef28 100644
--- a/contrib/llvm/tools/llvm-dwarfdump/Statistics.cpp
+++ b/contrib/llvm/tools/llvm-dwarfdump/Statistics.cpp
@@ -34,8 +34,14 @@ struct GlobalStats {
/// Extract the low pc from a Die.
static uint64_t getLowPC(DWARFDie Die) {
- if (Die.getAddressRanges().size())
- return Die.getAddressRanges()[0].LowPC;
+ auto RangesOrError = Die.getAddressRanges();
+ DWARFAddressRangesVector Ranges;
+ if (RangesOrError)
+ Ranges = RangesOrError.get();
+ else
+ llvm::consumeError(RangesOrError.takeError());
+ if (Ranges.size())
+ return Ranges[0].LowPC;
return dwarf::toAddress(Die.find(dwarf::DW_AT_low_pc), 0);
}
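DWARFDie::getAddressRanges now returns an llvm::Expected, so the error must be consumed even on the path that falls back to DW_AT_low_pc. A standalone sketch of the same check-or-consume pattern, using a hypothetical computeRanges() producer rather than the DWARF API:

  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"
  #include <vector>

  // Hypothetical producer, for illustration only.
  static llvm::Expected<std::vector<int>> computeRanges(bool Ok) {
    if (!Ok)
      return llvm::make_error<llvm::StringError>("no ranges",
                                                 llvm::inconvertibleErrorCode());
    return std::vector<int>{1, 2, 3};
  }

  static int firstOrZero(bool Ok) {
    auto RangesOrErr = computeRanges(Ok);
    if (!RangesOrErr) {
      // Unchecked Errors abort in assertion-enabled builds; consumeError discards this one.
      llvm::consumeError(RangesOrErr.takeError());
      return 0;
    }
    return RangesOrErr->empty() ? 0 : RangesOrErr->front();
  }

  int main() {
    llvm::outs() << firstOrZero(true) << ' ' << firstOrZero(false) << '\n';
    return 0;
  }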
@@ -137,7 +143,13 @@ static void collectStatsRecursive(DWARFDie Die, std::string Prefix,
}
// PC Ranges.
- auto Ranges = Die.getAddressRanges();
+ auto RangesOrError = Die.getAddressRanges();
+ if (!RangesOrError) {
+ llvm::consumeError(RangesOrError.takeError());
+ return;
+ }
+
+ auto Ranges = RangesOrError.get();
uint64_t BytesInThisScope = 0;
for (auto Range : Ranges)
BytesInThisScope += Range.HighPC - Range.LowPC;
@@ -165,11 +177,11 @@ static void collectStatsRecursive(DWARFDie Die, std::string Prefix,
/// \{
static void printDatum(raw_ostream &OS, const char *Key, StringRef Value) {
OS << ",\"" << Key << "\":\"" << Value << '"';
- DEBUG(llvm::dbgs() << Key << ": " << Value << '\n');
+ LLVM_DEBUG(llvm::dbgs() << Key << ": " << Value << '\n');
}
static void printDatum(raw_ostream &OS, const char *Key, uint64_t Value) {
OS << ",\"" << Key << "\":" << Value;
- DEBUG(llvm::dbgs() << Key << ": " << Value << '\n');
+ LLVM_DEBUG(llvm::dbgs() << Key << ": " << Value << '\n');
}
/// \}
@@ -206,8 +218,9 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
VarWithLoc += Stats.TotalVarWithLoc + Constants;
VarTotal += TotalVars + Constants;
VarUnique += Stats.VarsInFunction.size();
- DEBUG(for (auto V : Stats.VarsInFunction)
- llvm::dbgs() << Entry.getKey() << ": " << V << "\n");
+ LLVM_DEBUG(for (auto V
+ : Stats.VarsInFunction) llvm::dbgs()
+ << Entry.getKey() << ": " << V << "\n");
NumFunctions += Stats.IsFunction;
NumInlinedFunctions += Stats.IsFunction * Stats.NumFnInlined;
}
@@ -215,8 +228,8 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
// Print summary.
OS.SetBufferSize(1024);
OS << "{\"version\":\"" << Version << '"';
- DEBUG(llvm::dbgs() << "Variable location quality metrics\n";
- llvm::dbgs() << "---------------------------------\n");
+ LLVM_DEBUG(llvm::dbgs() << "Variable location quality metrics\n";
+ llvm::dbgs() << "---------------------------------\n");
printDatum(OS, "file", Filename.str());
printDatum(OS, "format", FormatName);
printDatum(OS, "source functions", NumFunctions);
@@ -228,7 +241,7 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
GlobalStats.ScopeBytesFromFirstDefinition);
printDatum(OS, "scope bytes covered", GlobalStats.ScopeBytesCovered);
OS << "}\n";
- DEBUG(
+ LLVM_DEBUG(
llvm::dbgs() << "Total Availability: "
<< (int)std::round((VarWithLoc * 100.0) / VarTotal) << "%\n";
llvm::dbgs() << "PC Ranges covered: "
diff --git a/contrib/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/contrib/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 12c005de6005..d75f33906098 100644
--- a/contrib/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/contrib/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -22,14 +22,13 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Regex.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -310,6 +309,62 @@ static void filterByName(const StringSet<> &Names,
}
+static void getDies(DWARFContext &DICtx, const AppleAcceleratorTable &Accel,
+ StringRef Name, SmallVectorImpl<DWARFDie> &Dies) {
+ for (const auto &Entry : Accel.equal_range(Name)) {
+ if (llvm::Optional<uint64_t> Off = Entry.getDIESectionOffset()) {
+ if (DWARFDie Die = DICtx.getDIEForOffset(*Off))
+ Dies.push_back(Die);
+ }
+ }
+}
+
+static DWARFDie toDie(const DWARFDebugNames::Entry &Entry,
+ DWARFContext &DICtx) {
+ llvm::Optional<uint64_t> CUOff = Entry.getCUOffset();
+ llvm::Optional<uint64_t> Off = Entry.getDIEUnitOffset();
+ if (!CUOff || !Off)
+ return DWARFDie();
+
+ DWARFCompileUnit *CU = DICtx.getCompileUnitForOffset(*CUOff);
+ if (!CU)
+ return DWARFDie();
+
+ if (llvm::Optional<uint64_t> DWOId = CU->getDWOId()) {
+ // This is a skeleton unit. Look up the DIE in the DWO unit.
+ CU = DICtx.getDWOCompileUnitForHash(*DWOId);
+ if (!CU)
+ return DWARFDie();
+ }
+
+ return CU->getDIEForOffset(CU->getOffset() + *Off);
+}
+
+static void getDies(DWARFContext &DICtx, const DWARFDebugNames &Accel,
+ StringRef Name, SmallVectorImpl<DWARFDie> &Dies) {
+ for (const auto &Entry : Accel.equal_range(Name)) {
+ if (DWARFDie Die = toDie(Entry, DICtx))
+ Dies.push_back(Die);
+ }
+}
+
+/// Print only DIEs that have a certain name.
+static void filterByAccelName(ArrayRef<std::string> Names, DWARFContext &DICtx,
+ raw_ostream &OS) {
+ SmallVector<DWARFDie, 4> Dies;
+ for (const auto &Name : Names) {
+ getDies(DICtx, DICtx.getAppleNames(), Name, Dies);
+ getDies(DICtx, DICtx.getAppleTypes(), Name, Dies);
+ getDies(DICtx, DICtx.getAppleNamespaces(), Name, Dies);
+ getDies(DICtx, DICtx.getDebugNames(), Name, Dies);
+ }
+ llvm::sort(Dies.begin(), Dies.end());
+ Dies.erase(std::unique(Dies.begin(), Dies.end()), Dies.end());
+
+ for (DWARFDie Die : Dies)
+ Die.dump(OS, 0, getDumpOpts());
+}
+
/// Handle the --lookup option and dump the DIEs and line info for the given
/// address.
static bool lookup(DWARFContext &DICtx, uint64_t Address, raw_ostream &OS) {
@@ -361,28 +416,8 @@ static bool dumpObjectFile(ObjectFile &Obj, DWARFContext &DICtx, Twine Filename,
// Handle the --find option and lower it to --debug-info=<offset>.
if (!Find.empty()) {
- DumpOffsets[DIDT_ID_DebugInfo] = [&]() -> llvm::Optional<uint64_t> {
- for (auto Name : Find) {
- auto find = [&](const DWARFAcceleratorTable &Accel)
- -> llvm::Optional<uint64_t> {
- for (auto Entry : Accel.equal_range(Name))
- for (auto Atom : Entry)
- if (auto Offset = Atom.getAsSectionOffset())
- return Offset;
- return None;
- };
- if (auto Offset = find(DICtx.getAppleNames()))
- return DumpOffsets[DIDT_ID_DebugInfo] = *Offset;
- if (auto Offset = find(DICtx.getAppleTypes()))
- return DumpOffsets[DIDT_ID_DebugInfo] = *Offset;
- if (auto Offset = find(DICtx.getAppleNamespaces()))
- return DumpOffsets[DIDT_ID_DebugInfo] = *Offset;
- }
- return None;
- }();
- // Early exit if --find was specified but the current file doesn't have it.
- if (!DumpOffsets[DIDT_ID_DebugInfo])
- return true;
+ filterByAccelName(Find, DICtx, OS);
+ return true;
}
// Dump the complete DWARF structure.
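With this change --find is no longer lowered to a single --debug-info=&lt;offset&gt;; filterByAccelName looks each name up in the Apple accelerator tables and in .debug_names and dumps every matching DIE, so for example llvm-dwarfdump --find=main a.out prints each DIE named main rather than only the first hit.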
@@ -477,6 +512,8 @@ static bool handleFile(StringRef Filename, HandlerFn HandleObj,
static std::vector<std::string> expandBundle(const std::string &InputPath) {
std::vector<std::string> BundlePaths;
SmallString<256> BundlePath(InputPath);
+ // Normalize input path. This is necessary to accept `bundle.dSYM/`.
+ sys::path::remove_dots(BundlePath);
// Manually open up the bundle to avoid introducing additional dependencies.
if (sys::fs::is_directory(BundlePath) &&
sys::path::extension(BundlePath) == ".dSYM") {
@@ -505,15 +542,12 @@ static std::vector<std::string> expandBundle(const std::string &InputPath) {
}
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
llvm::InitializeAllTargetInfos();
llvm::InitializeAllTargetMCs();
- HideUnrelatedOptions({&DwarfDumpCategory, &SectionCategory});
+ HideUnrelatedOptions({&DwarfDumpCategory, &SectionCategory, &ColorCategory});
cl::ParseCommandLineOptions(
argc, argv,
"pretty-print DWARF debug information in object files"
@@ -565,7 +599,7 @@ int main(int argc, char **argv) {
ShowChildren = true;
// Defaults to a.out if no filenames specified.
- if (InputFilenames.size() == 0)
+ if (InputFilenames.empty())
InputFilenames.push_back("a.out");
// Expand any .dSYM bundles to the individual object files contained therein.
diff --git a/contrib/llvm/tools/llvm-extract/llvm-extract.cpp b/contrib/llvm/tools/llvm-extract/llvm-extract.cpp
index c39ffa58fbf7..94aaa2f52eb5 100644
--- a/contrib/llvm/tools/llvm-extract/llvm-extract.cpp
+++ b/contrib/llvm/tools/llvm-extract/llvm-extract.cpp
@@ -25,10 +25,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Regex.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/SystemUtils.h"
#include "llvm/Support/ToolOutputFile.h"
@@ -67,6 +65,12 @@ ExtractRegExpFuncs("rfunc", cl::desc("Specify function(s) to extract using a "
"regular expression"),
cl::ZeroOrMore, cl::value_desc("rfunction"));
+// ExtractBlocks - The blocks to extract from the module.
+static cl::list<std::string>
+ ExtractBlocks("bb",
+ cl::desc("Specify <function, basic block> pairs to extract"),
+ cl::ZeroOrMore, cl::value_desc("function:bb"));
+
// ExtractAlias - The alias to extract from the module.
static cl::list<std::string>
ExtractAliases("alias", cl::desc("Specify alias to extract"),
@@ -107,12 +111,9 @@ static cl::opt<bool> PreserveAssemblyUseListOrder(
cl::init(false), cl::Hidden);
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
+ InitLLVM X(argc, argv);
LLVMContext Context;
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
cl::ParseCommandLineOptions(argc, argv, "llvm extractor\n");
// Use lazy loading, since we only care about selected global values.
@@ -228,6 +229,32 @@ int main(int argc, char **argv) {
}
}
+ // Figure out which BasicBlocks we should extract.
+ SmallVector<BasicBlock *, 4> BBs;
+ for (StringRef StrPair : ExtractBlocks) {
+ auto BBInfo = StrPair.split(':');
+ // Get the function.
+ Function *F = M->getFunction(BBInfo.first);
+ if (!F) {
+ errs() << argv[0] << ": program doesn't contain a function named '"
+ << BBInfo.first << "'!\n";
+ return 1;
+ }
+ // Do not materialize this function.
+ GVs.insert(F);
+ // Get the basic block.
+ auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
+ return BB.getName().equals(BBInfo.second);
+ });
+ if (Res == F->end()) {
+ errs() << argv[0] << ": function " << F->getName()
+ << " doesn't contain a basic block named '" << BBInfo.second
+ << "'!\n";
+ return 1;
+ }
+ BBs.push_back(&*Res);
+ }
+
// Use *argv instead of argv[0] to work around a wrong GCC warning.
ExitOnError ExitOnErr(std::string(*argv) + ": error reading input: ");
@@ -286,6 +313,14 @@ int main(int argc, char **argv) {
ExitOnErr(M->materializeAll());
}
+ // Extract the specified basic blocks from the module and erase the existing
+ // functions.
+ if (!ExtractBlocks.empty()) {
+ legacy::PassManager PM;
+ PM.add(createBlockExtractorPass(BBs, true));
+ PM.run(*M);
+ }
+
// In addition to deleting all other functions, we also want to spiff it
// up a little bit. Do this now.
legacy::PassManager Passes;
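The -bb handling above resolves each function:block pair with llvm::find_if before handing the blocks to the extractor pass (the pairs are whatever the user passes, e.g. -bb foo:entry). A sketch of that lookup in isolation:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Function.h"

  // Returns the basic block named Name inside F, or nullptr if there is none.
  static llvm::BasicBlock *findBlockByName(llvm::Function &F, llvm::StringRef Name) {
    auto It = llvm::find_if(F, [&](const llvm::BasicBlock &BB) {
      return BB.getName().equals(Name);
    });
    return It == F.end() ? nullptr : &*It;
  }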
diff --git a/contrib/llvm/tools/llvm-link/llvm-link.cpp b/contrib/llvm/tools/llvm-link/llvm-link.cpp
index 50f506aeaae9..b7a888375b3d 100644
--- a/contrib/llvm/tools/llvm-link/llvm-link.cpp
+++ b/contrib/llvm/tools/llvm-link/llvm-link.cpp
@@ -26,13 +26,12 @@
#include "llvm/Linker/Linker.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/SystemUtils.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
@@ -120,7 +119,8 @@ static std::unique_ptr<Module> loadFile(const char *argv0,
LLVMContext &Context,
bool MaterializeMetadata = true) {
SMDiagnostic Err;
- if (Verbose) errs() << "Loading '" << FN << "'\n";
+ if (Verbose)
+ errs() << "Loading '" << FN << "'\n";
std::unique_ptr<Module> Result;
if (DisableLazyLoad)
Result = parseIRFile(FN, Err, Context);
@@ -188,12 +188,12 @@ struct LLVMLinkDiagnosticHandler : public DiagnosticHandler {
unsigned Severity = DI.getSeverity();
switch (Severity) {
case DS_Error:
- errs() << "ERROR: ";
+ WithColor::error();
break;
case DS_Warning:
if (SuppressWarnings)
return true;
- errs() << "WARNING: ";
+ WithColor::warning();
break;
case DS_Remark:
case DS_Note:
@@ -238,8 +238,8 @@ static bool importFunctions(const char *argv0, Module &DestModule) {
auto &SrcModule = ModuleLoaderCache(argv0, FileName);
if (verifyModule(SrcModule, &errs())) {
- errs() << argv0 << ": " << FileName
- << ": error: input module is broken!\n";
+ errs() << argv0 << ": " << FileName;
+ WithColor::error() << "input module is broken!\n";
return false;
}
@@ -262,7 +262,7 @@ static bool importFunctions(const char *argv0, Module &DestModule) {
errs() << "Importing " << FunctionName << " from " << FileName << "\n";
auto &Entry = ImportList[FileName];
- Entry.insert(std::make_pair(F->getGUID(), /* (Unused) threshold */ 1.0));
+ Entry.insert(F->getGUID());
}
auto CachedModuleLoader = [&](StringRef Identifier) {
return ModuleLoaderCache.takeModule(Identifier);
@@ -283,7 +283,8 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
for (const auto &File : Files) {
std::unique_ptr<Module> M = loadFile(argv0, File, Context);
if (!M.get()) {
- errs() << argv0 << ": error loading file '" << File << "'\n";
+ errs() << argv0 << ": ";
+ WithColor::error() << " loading file '" << File << "'\n";
return false;
}
@@ -291,7 +292,8 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
// doing that debug metadata in the src module might already be pointing to
// the destination.
if (DisableDITypeMap && verifyModule(*M, &errs())) {
- errs() << argv0 << ": " << File << ": error: input module is broken!\n";
+ errs() << argv0 << ": " << File << ": ";
+ WithColor::error() << "input module is broken!\n";
return false;
}
@@ -345,16 +347,12 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
}
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
-
+ InitLLVM X(argc, argv);
ExitOnErr.setBanner(std::string(argv[0]) + ": ");
LLVMContext Context;
Context.setDiagnosticHandler(
llvm::make_unique<LLVMLinkDiagnosticHandler>(), true);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
cl::ParseCommandLineOptions(argc, argv, "llvm linker\n");
if (!DisableDITypeMap)
@@ -380,25 +378,28 @@ int main(int argc, char **argv) {
if (!importFunctions(argv[0], *Composite))
return 1;
- if (DumpAsm) errs() << "Here's the assembly:\n" << *Composite;
+ if (DumpAsm)
+ errs() << "Here's the assembly:\n" << *Composite;
std::error_code EC;
ToolOutputFile Out(OutputFilename, EC, sys::fs::F_None);
if (EC) {
- errs() << EC.message() << '\n';
+ WithColor::error() << EC.message() << '\n';
return 1;
}
if (verifyModule(*Composite, &errs())) {
- errs() << argv[0] << ": error: linked module is broken!\n";
+ errs() << argv[0] << ": ";
+ WithColor::error() << "linked module is broken!\n";
return 1;
}
- if (Verbose) errs() << "Writing bitcode...\n";
+ if (Verbose)
+ errs() << "Writing bitcode...\n";
if (OutputAssembly) {
Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
} else if (Force || !CheckBitcodeOutputToConsole(Out.os(), true))
- WriteBitcodeToFile(Composite.get(), Out.os(), PreserveBitcodeUseListOrder);
+ WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
// Declare success.
Out.keep();
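WithColor::error() and WithColor::warning(), used throughout this hunk, write a colored "error: "/"warning: " prefix to the stream (errs() by default) before the message. A minimal sketch with placeholder messages:

  #include "llvm/Support/WithColor.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    llvm::WithColor::error() << "linked module is broken!\n";
    llvm::WithColor::warning() << "something looks suspicious\n";
    // The two-argument form prepends a tool name, as llvm-mc does below.
    llvm::WithColor::error(llvm::errs(), "mytool") << "no such file\n";
    return 0;
  }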
diff --git a/contrib/llvm/tools/llvm-lto/llvm-lto.cpp b/contrib/llvm/tools/llvm-lto/llvm-lto.cpp
index 7d71a3e8dfe3..75668a9dd8b6 100644
--- a/contrib/llvm/tools/llvm-lto/llvm-lto.cpp
+++ b/contrib/llvm/tools/llvm-lto/llvm-lto.cpp
@@ -22,7 +22,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
@@ -40,11 +40,9 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
@@ -157,7 +155,16 @@ static cl::opt<std::string>
ThinLTOCacheDir("thinlto-cache-dir", cl::desc("Enable ThinLTO caching."));
static cl::opt<int>
- ThinLTOCachePruningInterval("thinlto-cache-pruning-interval", cl::desc("Set ThinLTO cache pruning interval."));
+ ThinLTOCachePruningInterval("thinlto-cache-pruning-interval",
+ cl::init(1200), cl::desc("Set ThinLTO cache pruning interval."));
+
+static cl::opt<int>
+ ThinLTOCacheMaxSizeBytes("thinlto-cache-max-size-bytes",
+ cl::desc("Set ThinLTO cache pruning directory maximum size in bytes."));
+
+static cl::opt<int>
+ ThinLTOCacheMaxSizeFiles("thinlto-cache-max-size-files", cl::init(1000000),
+ cl::desc("Set ThinLTO cache pruning directory maximum number of files."));
static cl::opt<std::string> ThinLTOSaveTempsPrefix(
"thinlto-save-temps",
@@ -343,7 +350,7 @@ void printIndexStats() {
}
}
-/// \brief List symbols in each IR file.
+/// List symbols in each IR file.
///
/// The main point here is to provide lit-testable coverage for the LTOModule
/// functionality that's exposed by the C API to list symbols. Moreover, this
@@ -367,13 +374,13 @@ static void listSymbols(const TargetOptions &Options) {
/// This is meant to enable testing of ThinLTO combined index generation,
/// currently available via the gold plugin via -thinlto.
static void createCombinedModuleSummaryIndex() {
- ModuleSummaryIndex CombinedIndex;
+ ModuleSummaryIndex CombinedIndex(/*HaveGVs=*/false);
uint64_t NextModuleId = 0;
for (auto &Filename : InputFilenames) {
ExitOnError ExitOnErr("llvm-lto: error loading file '" + Filename + "': ");
std::unique_ptr<MemoryBuffer> MB =
ExitOnErr(errorOrToExpected(MemoryBuffer::getFileOrSTDIN(Filename)));
- ExitOnErr(readModuleSummaryIndex(*MB, CombinedIndex, ++NextModuleId));
+ ExitOnErr(readModuleSummaryIndex(*MB, CombinedIndex, NextModuleId++));
}
std::error_code EC;
assert(!OutputFilename.empty());
@@ -462,7 +469,7 @@ static void writeModuleToFile(Module &TheModule, StringRef Filename) {
raw_fd_ostream OS(Filename, EC, sys::fs::OpenFlags::F_None);
error(EC, "error opening the file '" + Filename + "'");
maybeVerifyModule(TheModule);
- WriteBitcodeToFile(&TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+ WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
}
class ThinLTOProcessing {
@@ -474,6 +481,8 @@ public:
ThinGenerator.setTargetOptions(Options);
ThinGenerator.setCacheDir(ThinLTOCacheDir);
ThinGenerator.setCachePruningInterval(ThinLTOCachePruningInterval);
+ ThinGenerator.setCacheMaxSizeFiles(ThinLTOCacheMaxSizeFiles);
+ ThinGenerator.setCacheMaxSizeBytes(ThinLTOCacheMaxSizeBytes);
ThinGenerator.setFreestanding(EnableFreestanding);
// Add all the exported symbols to the table of symbols to preserve.
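The two new llvm-lto flags map directly onto ThinLTOCodeGenerator's cache-pruning policy. A sketch of configuring a generator by hand, assuming the legacy ThinLTOCodeGenerator API; the numbers are example values only:

  #include "llvm/LTO/legacy/ThinLTOCodeGenerator.h"

  void configureCache(llvm::ThinLTOCodeGenerator &Gen) {
    Gen.setCacheDir("thinlto.cache");      // -thinlto-cache-dir
    Gen.setCachePruningInterval(1200);     // -thinlto-cache-pruning-interval (seconds)
    Gen.setCacheMaxSizeFiles(1000000);     // -thinlto-cache-max-size-files
    Gen.setCacheMaxSizeBytes(1u << 30);    // -thinlto-cache-max-size-bytes
  }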
@@ -788,11 +797,7 @@ private:
} // end namespace thinlto
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
-
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
cl::ParseCommandLineOptions(argc, argv, "llvm LTO linker\n");
if (OptLevel < '0' || OptLevel > '3')
diff --git a/contrib/llvm/tools/llvm-lto2/llvm-lto2.cpp b/contrib/llvm/tools/llvm-lto2/llvm-lto2.cpp
index 70aae0f41507..442973f90209 100644
--- a/contrib/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/contrib/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -17,7 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/LTO/Caching.h"
#include "llvm/LTO/LTO.h"
@@ -113,6 +113,9 @@ static cl::opt<bool>
DebugPassManager("debug-pass-manager", cl::init(false), cl::Hidden,
cl::desc("Print pass management debugging information"));
+static cl::opt<std::string>
+ StatsFile("stats-file", cl::desc("Filename to write statistics to"));
+
static void check(Error E, std::string Msg) {
if (!E)
return;
@@ -189,7 +192,8 @@ static int run(int argc, char **argv) {
DiagnosticPrinterRawOStream DP(errs());
DI.print(DP);
errs() << '\n';
- exit(1);
+ if (DI.getSeverity() == DS_Error)
+ exit(1);
};
Conf.CPU = MCPU;
@@ -240,10 +244,15 @@ static int run(int argc, char **argv) {
Conf.OverrideTriple = OverrideTriple;
Conf.DefaultTriple = DefaultTriple;
+ Conf.StatsFile = StatsFile;
ThinBackend Backend;
if (ThinLTODistributedIndexes)
- Backend = createWriteIndexesThinBackend("", "", true, "");
+ Backend = createWriteIndexesThinBackend(/* OldPrefix */ "",
+ /* NewPrefix */ "",
+ /* ShouldEmitImportsFiles */ true,
+ /* LinkedObjectsFile */ nullptr,
+ /* OnWrite */ {});
else
Backend = createInProcessThinBackend(Threads);
LTO Lto(std::move(Conf), std::move(Backend));
@@ -296,8 +305,7 @@ static int run(int argc, char **argv) {
return llvm::make_unique<lto::NativeObjectStream>(std::move(S));
};
- auto AddBuffer = [&](size_t Task, std::unique_ptr<MemoryBuffer> MB,
- StringRef Path) {
+ auto AddBuffer = [&](size_t Task, std::unique_ptr<MemoryBuffer> MB) {
*AddStream(Task)->OS << MB->getBuffer();
};
diff --git a/contrib/llvm/tools/llvm-mc/llvm-mc.cpp b/contrib/llvm/tools/llvm-mc/llvm-mc.cpp
index 3987be2bd688..f494d02f3bca 100644
--- a/contrib/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/contrib/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -20,34 +20,38 @@
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetOptionsCommandFlags.def"
+#include "llvm/MC/MCTargetOptionsCommandFlags.inc"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/WithColor.h"
using namespace llvm;
static cl::opt<std::string>
InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
-static cl::opt<std::string>
-OutputFilename("o", cl::desc("Output filename"),
- cl::value_desc("filename"));
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
+ cl::value_desc("filename"),
+ cl::init("-"));
+
+static cl::opt<std::string> SplitDwarfFile("split-dwarf-file",
+ cl::desc("DWO output filename"),
+ cl::value_desc("filename"));
static cl::opt<bool>
ShowEncoding("show-encoding", cl::desc("Show instruction encodings"));
@@ -148,6 +152,11 @@ static cl::opt<std::string>
DebugCompilationDir("fdebug-compilation-dir",
cl::desc("Specifies the debug info's compilation dir"));
+static cl::list<std::string>
+DebugPrefixMap("fdebug-prefix-map",
+ cl::desc("Map file source paths in debug info"),
+ cl::value_desc("= separated key-value pairs"));
+
static cl::opt<std::string>
MainFileName("main-file-name",
cl::desc("Specifies the name we should consider the input file"));
@@ -188,7 +197,7 @@ static const Target *GetTarget(const char *ProgName) {
const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple,
Error);
if (!TheTarget) {
- errs() << ProgName << ": " << Error;
+ WithColor::error(errs(), ProgName) << Error;
return nullptr;
}
@@ -197,15 +206,11 @@ static const Target *GetTarget(const char *ProgName) {
return TheTarget;
}
-static std::unique_ptr<ToolOutputFile> GetOutputStream() {
- if (OutputFilename == "")
- OutputFilename = "-";
-
+static std::unique_ptr<ToolOutputFile> GetOutputStream(StringRef Path) {
std::error_code EC;
- auto Out =
- llvm::make_unique<ToolOutputFile>(OutputFilename, EC, sys::fs::F_None);
+ auto Out = llvm::make_unique<ToolOutputFile>(Path, EC, sys::fs::F_None);
if (EC) {
- errs() << EC.message() << '\n';
+ WithColor::error() << EC.message() << '\n';
return nullptr;
}
@@ -238,144 +243,10 @@ static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI,
bool Error = false;
while (Lexer.Lex().isNot(AsmToken::Eof)) {
- const AsmToken &Tok = Lexer.getTok();
-
- switch (Tok.getKind()) {
- default:
- SrcMgr.PrintMessage(Lexer.getLoc(), SourceMgr::DK_Warning,
- "unknown token");
+ Lexer.getTok().dump(OS);
+ OS << "\n";
+ if (Lexer.getTok().getKind() == AsmToken::Error)
Error = true;
- break;
- case AsmToken::Error:
- Error = true; // error already printed.
- break;
- case AsmToken::Identifier:
- OS << "identifier: " << Lexer.getTok().getString();
- break;
- case AsmToken::Integer:
- OS << "int: " << Lexer.getTok().getString();
- break;
- case AsmToken::Real:
- OS << "real: " << Lexer.getTok().getString();
- break;
- case AsmToken::String:
- OS << "string: " << Lexer.getTok().getString();
- break;
-
- case AsmToken::Amp: OS << "Amp"; break;
- case AsmToken::AmpAmp: OS << "AmpAmp"; break;
- case AsmToken::At: OS << "At"; break;
- case AsmToken::Caret: OS << "Caret"; break;
- case AsmToken::Colon: OS << "Colon"; break;
- case AsmToken::Comma: OS << "Comma"; break;
- case AsmToken::Dollar: OS << "Dollar"; break;
- case AsmToken::Dot: OS << "Dot"; break;
- case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
- case AsmToken::Eof: OS << "Eof"; break;
- case AsmToken::Equal: OS << "Equal"; break;
- case AsmToken::EqualEqual: OS << "EqualEqual"; break;
- case AsmToken::Exclaim: OS << "Exclaim"; break;
- case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
- case AsmToken::Greater: OS << "Greater"; break;
- case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
- case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
- case AsmToken::Hash: OS << "Hash"; break;
- case AsmToken::LBrac: OS << "LBrac"; break;
- case AsmToken::LCurly: OS << "LCurly"; break;
- case AsmToken::LParen: OS << "LParen"; break;
- case AsmToken::Less: OS << "Less"; break;
- case AsmToken::LessEqual: OS << "LessEqual"; break;
- case AsmToken::LessGreater: OS << "LessGreater"; break;
- case AsmToken::LessLess: OS << "LessLess"; break;
- case AsmToken::Minus: OS << "Minus"; break;
- case AsmToken::Percent: OS << "Percent"; break;
- case AsmToken::Pipe: OS << "Pipe"; break;
- case AsmToken::PipePipe: OS << "PipePipe"; break;
- case AsmToken::Plus: OS << "Plus"; break;
- case AsmToken::RBrac: OS << "RBrac"; break;
- case AsmToken::RCurly: OS << "RCurly"; break;
- case AsmToken::RParen: OS << "RParen"; break;
- case AsmToken::Slash: OS << "Slash"; break;
- case AsmToken::Star: OS << "Star"; break;
- case AsmToken::Tilde: OS << "Tilde"; break;
- case AsmToken::PercentCall16:
- OS << "PercentCall16";
- break;
- case AsmToken::PercentCall_Hi:
- OS << "PercentCall_Hi";
- break;
- case AsmToken::PercentCall_Lo:
- OS << "PercentCall_Lo";
- break;
- case AsmToken::PercentDtprel_Hi:
- OS << "PercentDtprel_Hi";
- break;
- case AsmToken::PercentDtprel_Lo:
- OS << "PercentDtprel_Lo";
- break;
- case AsmToken::PercentGot:
- OS << "PercentGot";
- break;
- case AsmToken::PercentGot_Disp:
- OS << "PercentGot_Disp";
- break;
- case AsmToken::PercentGot_Hi:
- OS << "PercentGot_Hi";
- break;
- case AsmToken::PercentGot_Lo:
- OS << "PercentGot_Lo";
- break;
- case AsmToken::PercentGot_Ofst:
- OS << "PercentGot_Ofst";
- break;
- case AsmToken::PercentGot_Page:
- OS << "PercentGot_Page";
- break;
- case AsmToken::PercentGottprel:
- OS << "PercentGottprel";
- break;
- case AsmToken::PercentGp_Rel:
- OS << "PercentGp_Rel";
- break;
- case AsmToken::PercentHi:
- OS << "PercentHi";
- break;
- case AsmToken::PercentHigher:
- OS << "PercentHigher";
- break;
- case AsmToken::PercentHighest:
- OS << "PercentHighest";
- break;
- case AsmToken::PercentLo:
- OS << "PercentLo";
- break;
- case AsmToken::PercentNeg:
- OS << "PercentNeg";
- break;
- case AsmToken::PercentPcrel_Hi:
- OS << "PercentPcrel_Hi";
- break;
- case AsmToken::PercentPcrel_Lo:
- OS << "PercentPcrel_Lo";
- break;
- case AsmToken::PercentTlsgd:
- OS << "PercentTlsgd";
- break;
- case AsmToken::PercentTlsldm:
- OS << "PercentTlsldm";
- break;
- case AsmToken::PercentTprel_Hi:
- OS << "PercentTprel_Hi";
- break;
- case AsmToken::PercentTprel_Lo:
- OS << "PercentTprel_Lo";
- break;
- }
-
- // Print the token string.
- OS << " (\"";
- OS.write_escaped(Tok.getString());
- OS << "\")\n";
}
return Error;
@@ -388,12 +259,13 @@ static int fillCommandLineSymbols(MCAsmParser &Parser) {
auto Val = Pair.second;
if (Sym.empty() || Val.empty()) {
- errs() << "error: defsym must be of the form: sym=value: " << I << "\n";
+ WithColor::error() << "defsym must be of the form: sym=value: " << I
+ << "\n";
return 1;
}
int64_t Value;
if (Val.getAsInteger(0, Value)) {
- errs() << "error: Value is not an integer: " << Val << "\n";
+ WithColor::error() << "value is not an integer: " << Val << "\n";
return 1;
}
Parser.getContext().setSymbolValue(Parser.getStreamer(), Sym, Value);
@@ -411,8 +283,8 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
TheTarget->createMCAsmParser(STI, *Parser, MCII, MCOptions));
if (!TAP) {
- errs() << ProgName
- << ": error: this target does not support assembly parsing.\n";
+ WithColor::error(errs(), ProgName)
+ << "this target does not support assembly parsing.\n";
return 1;
}
@@ -428,10 +300,7 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
}
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
// Initialize targets and assembly printers/parsers.
llvm::InitializeAllTargetInfos();
@@ -460,7 +329,8 @@ int main(int argc, char **argv) {
ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
MemoryBuffer::getFileOrSTDIN(InputFilename);
if (std::error_code EC = BufferPtr.getError()) {
- errs() << InputFilename << ": " << EC.message() << '\n';
+ WithColor::error(errs(), ProgName)
+ << InputFilename << ": " << EC.message() << '\n';
return 1;
}
MemoryBuffer *Buffer = BufferPtr->get();
@@ -484,8 +354,8 @@ int main(int argc, char **argv) {
if (CompressDebugSections != DebugCompressionType::None) {
if (!zlib::isAvailable()) {
- errs() << ProgName
- << ": build tools with zlib to enable -compress-debug-sections";
+ WithColor::error(errs(), ProgName)
+ << "build tools with zlib to enable -compress-debug-sections";
return 1;
}
MAI->setCompressDebugSections(CompressDebugSections);
@@ -522,8 +392,24 @@ int main(int argc, char **argv) {
if (!sys::fs::current_path(CWD))
Ctx.setCompilationDir(CWD);
}
+ for (const auto &Arg : DebugPrefixMap) {
+ const auto &KV = StringRef(Arg).split('=');
+ Ctx.addDebugPrefixMapEntry(KV.first, KV.second);
+ }
if (!MainFileName.empty())
Ctx.setMainFileName(MainFileName);
+ if (GenDwarfForAssembly && DwarfVersion >= 5) {
+ // DWARF v5 needs the root file as well as the compilation directory.
+ // If we find a '.file 0' directive that will supersede these values.
+ MD5 Hash;
+ MD5::MD5Result *Cksum =
+ (MD5::MD5Result *)Ctx.allocate(sizeof(MD5::MD5Result), 1);
+ Hash.update(Buffer->getBuffer());
+ Hash.final(*Cksum);
+ Ctx.setMCLineTableRootFile(
+ /*CUID=*/0, Ctx.getCompilationDir(),
+ !MainFileName.empty() ? MainFileName : InputFilename, Cksum, None);
+ }
// Package up features to be passed to target/subtarget
std::string FeaturesStr;
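For DWARF v5 the assembler now seeds the line table's root file with an MD5 checksum of the input buffer, which a later '.file 0' directive may supersede. A sketch of hashing a buffer the same way, assuming the Support MD5 API:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/MD5.h"

  static llvm::MD5::MD5Result hashBuffer(llvm::StringRef Data) {
    llvm::MD5 Hash;
    Hash.update(Data);
    llvm::MD5::MD5Result Result;
    Hash.final(Result);
    return Result;
  }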
@@ -534,10 +420,21 @@ int main(int argc, char **argv) {
FeaturesStr = Features.getString();
}
- std::unique_ptr<ToolOutputFile> Out = GetOutputStream();
+ std::unique_ptr<ToolOutputFile> Out = GetOutputStream(OutputFilename);
if (!Out)
return 1;
+ std::unique_ptr<ToolOutputFile> DwoOut;
+ if (!SplitDwarfFile.empty()) {
+ if (FileType != OFT_ObjectFile) {
+ WithColor::error() << "dwo output only supported with object files\n";
+ return 1;
+ }
+ DwoOut = GetOutputStream(SplitDwarfFile);
+ if (!DwoOut)
+ return 1;
+ }
+
std::unique_ptr<buffer_ostream> BOS;
raw_pwrite_stream *OS = &Out->os();
std::unique_ptr<MCStreamer> Str;
@@ -552,8 +449,8 @@ int main(int argc, char **argv) {
*MAI, *MCII, *MRI);
if (!IP) {
- errs()
- << "error: unable to create instruction printer for target triple '"
+ WithColor::error()
+ << "unable to create instruction printer for target triple '"
<< TheTriple.normalize() << "' with assembly variant "
<< OutputAsmVariant << ".\n";
return 1;
@@ -563,16 +460,17 @@ int main(int argc, char **argv) {
IP->setPrintImmHex(PrintImmHex);
// Set up the AsmStreamer.
- MCCodeEmitter *CE = nullptr;
- MCAsmBackend *MAB = nullptr;
- if (ShowEncoding) {
- CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
- MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions);
- }
+ std::unique_ptr<MCCodeEmitter> CE;
+ if (ShowEncoding)
+ CE.reset(TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx));
+
+ std::unique_ptr<MCAsmBackend> MAB(
+ TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions));
auto FOut = llvm::make_unique<formatted_raw_ostream>(*OS);
- Str.reset(TheTarget->createAsmStreamer(
- Ctx, std::move(FOut), /*asmverbose*/ true,
- /*useDwarfDirectory*/ true, IP, CE, MAB, ShowInst));
+ Str.reset(
+ TheTarget->createAsmStreamer(Ctx, std::move(FOut), /*asmverbose*/ true,
+ /*useDwarfDirectory*/ true, IP,
+ std::move(CE), std::move(MAB), ShowInst));
} else if (FileType == OFT_Null) {
Str.reset(TheTarget->createNullStreamer(Ctx));
@@ -590,7 +488,9 @@ int main(int argc, char **argv) {
MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions);
Str.reset(TheTarget->createMCObjectStreamer(
- TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB), *OS,
+ TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB),
+ DwoOut ? MAB->createDwoObjectWriter(*OS, DwoOut->os())
+ : MAB->createObjectWriter(*OS),
std::unique_ptr<MCCodeEmitter>(CE), *STI, MCOptions.MCRelaxAll,
MCOptions.MCIncrementalLinkerCompatible,
/*DWARFMustBeAtTheEnd*/ false));
@@ -598,6 +498,9 @@ int main(int argc, char **argv) {
Str->InitSections(true);
}
+ // Use Assembler information for parsing.
+ Str->setUseAssemblerInfoForParsing(true);
+
int Res = 1;
bool disassemble = false;
switch (Action) {
@@ -622,6 +525,10 @@ int main(int argc, char **argv) {
*Buffer, SrcMgr, Out->os());
// Keep output if no errors.
- if (Res == 0) Out->keep();
+ if (Res == 0) {
+ Out->keep();
+ if (DwoOut)
+ DwoOut->keep();
+ }
return Res;
}
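Both output files are ToolOutputFiles, which delete themselves unless keep() is called, so the object and the new DWO output survive only when assembly succeeded. A minimal sketch of that contract, with a placeholder file name:

  #include "llvm/Support/FileSystem.h"
  #include "llvm/Support/ToolOutputFile.h"

  int main() {
    std::error_code EC;
    llvm::ToolOutputFile Out("demo.o", EC, llvm::sys::fs::F_None);
    if (EC)
      return 1;
    Out.os() << "...";
    Out.keep(); // without keep(), the file is removed when Out is destroyed
    return 0;
  }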
diff --git a/contrib/llvm/tools/llvm-mca/CodeRegion.cpp b/contrib/llvm/tools/llvm-mca/CodeRegion.cpp
new file mode 100644
index 000000000000..896865996504
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/CodeRegion.cpp
@@ -0,0 +1,66 @@
+//===-------------------------- CodeRegion.cpp -----------------*- C++ -* -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the CodeRegions interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "CodeRegion.h"
+
+using namespace llvm;
+
+namespace mca {
+
+bool CodeRegion::isLocInRange(SMLoc Loc) const {
+ if (RangeEnd.isValid() && Loc.getPointer() > RangeEnd.getPointer())
+ return false;
+ if (RangeStart.isValid() && Loc.getPointer() < RangeStart.getPointer())
+ return false;
+ return true;
+}
+
+void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) {
+ assert(!Regions.empty() && "Missing Default region");
+ const CodeRegion &CurrentRegion = *Regions.back();
+ if (CurrentRegion.startLoc().isValid() && !CurrentRegion.endLoc().isValid()) {
+ SM.PrintMessage(Loc, SourceMgr::DK_Warning,
+ "Ignoring invalid region start");
+ return;
+ }
+
+ // Remove the default region if there are user defined regions.
+ if (!CurrentRegion.startLoc().isValid())
+ Regions.erase(Regions.begin());
+ addRegion(Description, Loc);
+}
+
+void CodeRegions::endRegion(SMLoc Loc) {
+ assert(!Regions.empty() && "Missing Default region");
+ CodeRegion &CurrentRegion = *Regions.back();
+ if (CurrentRegion.endLoc().isValid()) {
+ SM.PrintMessage(Loc, SourceMgr::DK_Warning, "Ignoring invalid region end");
+ return;
+ }
+
+ CurrentRegion.setEndLocation(Loc);
+}
+
+void CodeRegions::addInstruction(std::unique_ptr<const MCInst> Instruction) {
+ const SMLoc &Loc = Instruction->getLoc();
+ const auto It =
+ std::find_if(Regions.rbegin(), Regions.rend(),
+ [Loc](const std::unique_ptr<CodeRegion> &Region) {
+ return Region->isLocInRange(Loc);
+ });
+ if (It != Regions.rend())
+ (*It)->addInstruction(std::move(Instruction));
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/CodeRegion.h b/contrib/llvm/tools/llvm-mca/CodeRegion.h
new file mode 100644
index 000000000000..7f0025e4884c
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/CodeRegion.h
@@ -0,0 +1,131 @@
+//===-------------------------- CodeRegion.h -------------------*- C++ -* -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements class CodeRegion and CodeRegions.
+///
+/// A CodeRegion describes a region of assembly code guarded by special LLVM-MCA
+/// comment directives.
+///
+/// # LLVM-MCA-BEGIN foo
+/// ... ## asm
+/// # LLVM-MCA-END
+///
+/// A comment starting with substring LLVM-MCA-BEGIN marks the beginning of a
+/// new region of code.
+/// A comment starting with substring LLVM-MCA-END marks the end of the
+/// last-seen region of code.
+///
+/// Code regions are not allowed to overlap. Each region can have an optional
+/// description; internally, regions are described by a range of source
+/// locations (SMLoc objects).
+///
+/// An instruction (a MCInst) is added to a region R only if its location is in
+/// range [R.RangeStart, R.RangeEnd].
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_CODEREGION_H
+#define LLVM_TOOLS_LLVM_MCA_CODEREGION_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
+#include <vector>
+
+namespace mca {
+
+/// A region of assembly code.
+///
+/// It identifies a sequence of machine instructions.
+class CodeRegion {
+ // An optional descriptor for this region.
+ llvm::StringRef Description;
+ // Instructions that form this region.
+ std::vector<std::unique_ptr<const llvm::MCInst>> Instructions;
+ // Source location range.
+ llvm::SMLoc RangeStart;
+ llvm::SMLoc RangeEnd;
+
+ CodeRegion(const CodeRegion &) = delete;
+ CodeRegion &operator=(const CodeRegion &) = delete;
+
+public:
+ CodeRegion(llvm::StringRef Desc, llvm::SMLoc Start)
+ : Description(Desc), RangeStart(Start), RangeEnd() {}
+
+ void addInstruction(std::unique_ptr<const llvm::MCInst> Instruction) {
+ Instructions.emplace_back(std::move(Instruction));
+ }
+
+ llvm::SMLoc startLoc() const { return RangeStart; }
+ llvm::SMLoc endLoc() const { return RangeEnd; }
+
+ void setEndLocation(llvm::SMLoc End) { RangeEnd = End; }
+ bool empty() const { return Instructions.empty(); }
+ bool isLocInRange(llvm::SMLoc Loc) const;
+
+ const std::vector<std::unique_ptr<const llvm::MCInst>> &
+ getInstructions() const {
+ return Instructions;
+ }
+
+ llvm::StringRef getDescription() const { return Description; }
+};
+
+class CodeRegions {
+ // A source manager. Used by the tool to generate meaningful warnings.
+ llvm::SourceMgr &SM;
+
+ std::vector<std::unique_ptr<CodeRegion>> Regions;
+
+ // Construct a new region of code guarded by LLVM-MCA comments.
+ void addRegion(llvm::StringRef Description, llvm::SMLoc Loc) {
+ Regions.emplace_back(llvm::make_unique<CodeRegion>(Description, Loc));
+ }
+
+ CodeRegions(const CodeRegions &) = delete;
+ CodeRegions &operator=(const CodeRegions &) = delete;
+
+public:
+ typedef std::vector<std::unique_ptr<CodeRegion>>::iterator iterator;
+ typedef std::vector<std::unique_ptr<CodeRegion>>::const_iterator
+ const_iterator;
+
+ iterator begin() { return Regions.begin(); }
+ iterator end() { return Regions.end(); }
+ const_iterator begin() const { return Regions.cbegin(); }
+ const_iterator end() const { return Regions.cend(); }
+
+ void beginRegion(llvm::StringRef Description, llvm::SMLoc Loc);
+ void endRegion(llvm::SMLoc Loc);
+ void addInstruction(std::unique_ptr<const llvm::MCInst> Instruction);
+
+ CodeRegions(llvm::SourceMgr &S) : SM(S) {
+ // Create a default region for the input code sequence.
+ addRegion("Default", llvm::SMLoc());
+ }
+
+ const std::vector<std::unique_ptr<const llvm::MCInst>> &
+ getInstructionSequence(unsigned Idx) const {
+ return Regions[Idx]->getInstructions();
+ }
+
+ bool empty() const {
+ return std::all_of(Regions.begin(), Regions.end(),
+ [](const std::unique_ptr<CodeRegion> &Region) {
+ return Region->empty();
+ });
+ }
+};
+
+} // namespace mca
+
+#endif
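A sketch of how the tool is expected to drive this interface while parsing: one CodeRegions object per input, beginRegion/endRegion on the LLVM-MCA markers, and addInstruction for every parsed MCInst (the locations and the instruction here are assumed to come from the assembler):

  #include "CodeRegion.h"
  #include <memory>

  void onInput(llvm::SourceMgr &SM, llvm::SMLoc Begin, llvm::SMLoc End,
               std::unique_ptr<const llvm::MCInst> Inst) {
    mca::CodeRegions Regions(SM);
    Regions.beginRegion("foo", Begin);        // "# LLVM-MCA-BEGIN foo"
    Regions.addInstruction(std::move(Inst));  // added to the region whose range contains its location
    Regions.endRegion(End);                   // "# LLVM-MCA-END"
  }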
diff --git a/contrib/llvm/tools/llvm-mca/Context.cpp b/contrib/llvm/tools/llvm-mca/Context.cpp
new file mode 100644
index 000000000000..685714e64b92
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Context.cpp
@@ -0,0 +1,63 @@
+//===---------------------------- Context.cpp -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a class for holding ownership of various simulated
+/// hardware units. A Context also provides a utility routine for constructing
+/// a default out-of-order pipeline with fetch, dispatch, execute, and retire
+/// stages).
+///
+//===----------------------------------------------------------------------===//
+
+#include "Context.h"
+#include "DispatchStage.h"
+#include "ExecuteStage.h"
+#include "FetchStage.h"
+#include "RegisterFile.h"
+#include "RetireControlUnit.h"
+#include "RetireStage.h"
+#include "Scheduler.h"
+
+namespace mca {
+
+using namespace llvm;
+
+std::unique_ptr<Pipeline>
+Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
+ SourceMgr &SrcMgr) {
+ const MCSchedModel &SM = STI.getSchedModel();
+
+ // Create the hardware units defining the backend.
+ auto RCU = llvm::make_unique<RetireControlUnit>(SM);
+ auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
+ auto HWS = llvm::make_unique<Scheduler>(
+ SM, Opts.LoadQueueSize, Opts.StoreQueueSize, Opts.AssumeNoAlias);
+
+ // Create the pipeline and its stages.
+ auto P = llvm::make_unique<Pipeline>();
+ auto F = llvm::make_unique<FetchStage>(IB, SrcMgr);
+ auto D = llvm::make_unique<DispatchStage>(
+ STI, MRI, Opts.RegisterFileSize, Opts.DispatchWidth, *RCU, *PRF, *HWS);
+ auto R = llvm::make_unique<RetireStage>(*RCU, *PRF);
+ auto E = llvm::make_unique<ExecuteStage>(*RCU, *HWS);
+
+ // Add the hardware to the context.
+ addHardwareUnit(std::move(RCU));
+ addHardwareUnit(std::move(PRF));
+ addHardwareUnit(std::move(HWS));
+
+ // Build the pipeline.
+ P->appendStage(std::move(F));
+ P->appendStage(std::move(D));
+ P->appendStage(std::move(R));
+ P->appendStage(std::move(E));
+ return P;
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/Context.h b/contrib/llvm/tools/llvm-mca/Context.h
new file mode 100644
index 000000000000..cf483fa7b37d
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Context.h
@@ -0,0 +1,68 @@
+//===---------------------------- Context.h ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a class for holding ownership of various simulated
+/// hardware units. A Context also provides a utility routine for constructing
+/// a default out-of-order pipeline with fetch, dispatch, execute, and retire
+/// stages).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_CONTEXT_H
+#define LLVM_TOOLS_LLVM_MCA_CONTEXT_H
+#include "HardwareUnit.h"
+#include "InstrBuilder.h"
+#include "Pipeline.h"
+#include "SourceMgr.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <memory>
+
+namespace mca {
+
+/// This is a convenience struct to hold the parameters necessary for creating
+/// the pre-built "default" out-of-order pipeline.
+struct PipelineOptions {
+ PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
+ bool NoAlias)
+ : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
+ StoreQueueSize(SQS), AssumeNoAlias(NoAlias) {}
+ unsigned DispatchWidth;
+ unsigned RegisterFileSize;
+ unsigned LoadQueueSize;
+ unsigned StoreQueueSize;
+ bool AssumeNoAlias;
+};
+
+class Context {
+ llvm::SmallVector<std::unique_ptr<HardwareUnit>, 4> Hardware;
+ const llvm::MCRegisterInfo &MRI;
+ const llvm::MCSubtargetInfo &STI;
+
+public:
+ Context(const llvm::MCRegisterInfo &R, const llvm::MCSubtargetInfo &S)
+ : MRI(R), STI(S) {}
+ Context(const Context &C) = delete;
+ Context &operator=(const Context &C) = delete;
+
+ void addHardwareUnit(std::unique_ptr<HardwareUnit> H) {
+ Hardware.push_back(std::move(H));
+ }
+
+ /// Construct a basic pipeline for simulating an out-of-order pipeline.
+ /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages.
+ std::unique_ptr<Pipeline> createDefaultPipeline(const PipelineOptions &Opts,
+ InstrBuilder &IB,
+ SourceMgr &SrcMgr);
+};
+
+} // namespace mca
+#endif // LLVM_TOOLS_LLVM_MCA_CONTEXT_H
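A sketch of how a driver would use this: build a Context over the target's register and subtarget info, describe the simulated hardware with PipelineOptions, and let createDefaultPipeline wire the stages (the option values below are examples, not the tool's defaults):

  #include "Context.h"

  std::unique_ptr<mca::Pipeline>
  buildPipeline(const llvm::MCRegisterInfo &MRI, const llvm::MCSubtargetInfo &STI,
                mca::InstrBuilder &IB, mca::SourceMgr &Src) {
    mca::Context Ctx(MRI, STI);
    // DispatchWidth, RegisterFileSize, LoadQueueSize, StoreQueueSize, AssumeNoAlias.
    mca::PipelineOptions Opts(4, 0, 16, 16, /*NoAlias=*/true);
    return Ctx.createDefaultPipeline(Opts, IB, Src);
  }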
diff --git a/contrib/llvm/tools/llvm-mca/DispatchStage.cpp b/contrib/llvm/tools/llvm-mca/DispatchStage.cpp
new file mode 100644
index 000000000000..be6f1f89be5c
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/DispatchStage.cpp
@@ -0,0 +1,149 @@
+//===--------------------- DispatchStage.cpp --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file models the dispatch component of an instruction pipeline.
+///
+/// The DispatchStage is responsible for updating instruction dependencies
+/// and communicating to the simulated instruction scheduler that an instruction
+/// is ready to be scheduled for execution.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DispatchStage.h"
+#include "HWEventListener.h"
+#include "Scheduler.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+void DispatchStage::notifyInstructionDispatched(const InstRef &IR,
+ ArrayRef<unsigned> UsedRegs) {
+ LLVM_DEBUG(dbgs() << "[E] Instruction Dispatched: #" << IR << '\n');
+ notifyEvent<HWInstructionEvent>(HWInstructionDispatchedEvent(IR, UsedRegs));
+}
+
+bool DispatchStage::checkPRF(const InstRef &IR) {
+ SmallVector<unsigned, 4> RegDefs;
+ for (const std::unique_ptr<WriteState> &RegDef :
+ IR.getInstruction()->getDefs())
+ RegDefs.emplace_back(RegDef->getRegisterID());
+
+ const unsigned RegisterMask = PRF.isAvailable(RegDefs);
+ // A mask with all zeroes means that all register files are available.
+ if (RegisterMask) {
+ notifyEvent<HWStallEvent>(
+ HWStallEvent(HWStallEvent::RegisterFileStall, IR));
+ return false;
+ }
+
+ return true;
+}
+
+bool DispatchStage::checkRCU(const InstRef &IR) {
+ const unsigned NumMicroOps = IR.getInstruction()->getDesc().NumMicroOps;
+ if (RCU.isAvailable(NumMicroOps))
+ return true;
+ notifyEvent<HWStallEvent>(
+ HWStallEvent(HWStallEvent::RetireControlUnitStall, IR));
+ return false;
+}
+
+bool DispatchStage::checkScheduler(const InstRef &IR) {
+ HWStallEvent::GenericEventType Event;
+ const bool Ready = SC.canBeDispatched(IR, Event);
+ if (!Ready)
+ notifyEvent<HWStallEvent>(HWStallEvent(Event, IR));
+ return Ready;
+}
+
+void DispatchStage::updateRAWDependencies(ReadState &RS,
+ const MCSubtargetInfo &STI) {
+ SmallVector<WriteRef, 4> DependentWrites;
+
+ collectWrites(DependentWrites, RS.getRegisterID());
+ RS.setDependentWrites(DependentWrites.size());
+ // We know that this read depends on all the writes in DependentWrites.
+ // For each write, check if we have ReadAdvance information, and use it
+ // to figure out in how many cycles this read becomes available.
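+ // For example (illustrative): if a dependent write has a latency of 5 cycles
+ // and the ReadAdvance value for this use is 2, then the read only has to
+ // wait 3 cycles after the write is issued (see how ReadAdvance is applied in
+ // WriteState::addUser and WriteState::onInstructionIssued).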
+ const ReadDescriptor &RD = RS.getDescriptor();
+ const MCSchedModel &SM = STI.getSchedModel();
+ const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
+ for (WriteRef &WR : DependentWrites) {
+ WriteState &WS = *WR.getWriteState();
+ unsigned WriteResID = WS.getWriteResourceID();
+ int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID);
+ WS.addUser(&RS, ReadAdvance);
+ }
+}
+
+void DispatchStage::dispatch(InstRef IR) {
+ assert(!CarryOver && "Cannot dispatch another instruction!");
+ Instruction &IS = *IR.getInstruction();
+ const InstrDesc &Desc = IS.getDesc();
+ const unsigned NumMicroOps = Desc.NumMicroOps;
+ if (NumMicroOps > DispatchWidth) {
+ assert(AvailableEntries == DispatchWidth);
+ AvailableEntries = 0;
+ CarryOver = NumMicroOps - DispatchWidth;
+ } else {
+ assert(AvailableEntries >= NumMicroOps);
+ AvailableEntries -= NumMicroOps;
+ }
+
+ // A dependency-breaking instruction doesn't have to wait on its register
+ // input operands, and it is often optimized at the register renaming stage.
+ // Only update RAW dependencies if this instruction is not a
+ // dependency-breaking instruction. A dependency-breaking instruction is a
+ // zero-latency instruction that doesn't consume hardware resources.
+ // An example of a dependency-breaking instruction on X86 is a zero-idiom XOR.
+ if (!Desc.isZeroLatency())
+ for (std::unique_ptr<ReadState> &RS : IS.getUses())
+ updateRAWDependencies(*RS, STI);
+
+ // By default, a dependency-breaking zero-latency instruction is expected to
+ // be optimized at the register renaming stage. That means that no physical
+ // register is allocated to the instruction.
+ SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
+ for (std::unique_ptr<WriteState> &WS : IS.getDefs())
+ PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles,
+ !Desc.isZeroLatency());
+
+ // Reserve slots in the RCU, and notify the instruction that it has been
+ // dispatched to the schedulers for execution.
+ IS.dispatch(RCU.reserveSlot(IR, NumMicroOps));
+
+ // Notify listeners of the "instruction dispatched" event.
+ notifyInstructionDispatched(IR, RegisterFiles);
+}
+
+void DispatchStage::cycleStart() {
+ AvailableEntries = CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
+ CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U;
+}
+
+bool DispatchStage::execute(InstRef &IR) {
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ if (!isAvailable(Desc.NumMicroOps) || !canDispatch(IR))
+ return false;
+ dispatch(IR);
+ return true;
+}
+
+#ifndef NDEBUG
+void DispatchStage::dump() const {
+ PRF.dump();
+ RCU.dump();
+}
+#endif
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/DispatchStage.h b/contrib/llvm/tools/llvm-mca/DispatchStage.h
new file mode 100644
index 000000000000..f21789a29c50
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/DispatchStage.h
@@ -0,0 +1,106 @@
+//===----------------------- DispatchStage.h --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file models the dispatch component of an instruction pipeline.
+///
+/// The DispatchStage is responsible for updating instruction dependencies
+/// and communicating to the simulated instruction scheduler that an instruction
+/// is ready to be scheduled for execution.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
+#define LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
+
+#include "HWEventListener.h"
+#include "Instruction.h"
+#include "RegisterFile.h"
+#include "RetireControlUnit.h"
+#include "Stage.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace mca {
+
+class Scheduler;
+
+// Implements the hardware dispatch logic.
+//
+// This class is responsible for the dispatch stage, in which instructions are
+// dispatched in groups to the Scheduler. An instruction can be dispatched if
+// the following conditions are met:
+// 1) There are enough entries in the reorder buffer (see class
+// RetireControlUnit) to write the opcodes associated with the instruction.
+// 2) There are enough temporaries to rename output register operands.
+// 3) There are enough entries available in the used buffered resource(s).
+//
+// The number of micro opcodes that can be dispatched in one cycle is limited by
+// the value of field 'DispatchWidth'. A "dynamic dispatch stall" occurs when
+// processor resources are not available. Dispatch stall events are counted
+// during the entire execution of the code, and displayed by the performance
+// report when flag '-dispatch-stats' is specified.
+//
+// If the number of micro opcodes exceeds DispatchWidth, then the instruction
+// is dispatched in multiple cycles.
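+//
+// For example (illustrative): with DispatchWidth = 4, an instruction made of
+// 6 micro opcodes consumes the whole dispatch group in the first cycle
+// (CarryOver = 2); the remaining 2 micro opcodes are dispatched at the start
+// of the next cycle, leaving 2 entries available for other instructions.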
+class DispatchStage : public Stage {
+ unsigned DispatchWidth;
+ unsigned AvailableEntries;
+ unsigned CarryOver;
+ const llvm::MCSubtargetInfo &STI;
+ RetireControlUnit &RCU;
+ RegisterFile &PRF;
+ Scheduler &SC;
+
+ bool checkRCU(const InstRef &IR);
+ bool checkPRF(const InstRef &IR);
+ bool checkScheduler(const InstRef &IR);
+ void dispatch(InstRef IR);
+ void updateRAWDependencies(ReadState &RS, const llvm::MCSubtargetInfo &STI);
+
+ void notifyInstructionDispatched(const InstRef &IR,
+ llvm::ArrayRef<unsigned> UsedPhysRegs);
+
+ bool isAvailable(unsigned NumEntries) const {
+ return NumEntries <= AvailableEntries || AvailableEntries == DispatchWidth;
+ }
+
+ bool canDispatch(const InstRef &IR) {
+ assert(isAvailable(IR.getInstruction()->getDesc().NumMicroOps));
+ return checkRCU(IR) && checkPRF(IR) && checkScheduler(IR);
+ }
+
+ void collectWrites(llvm::SmallVectorImpl<WriteRef> &Vec,
+ unsigned RegID) const {
+ return PRF.collectWrites(Vec, RegID);
+ }
+
+public:
+ DispatchStage(const llvm::MCSubtargetInfo &Subtarget,
+ const llvm::MCRegisterInfo &MRI, unsigned RegisterFileSize,
+ unsigned MaxDispatchWidth, RetireControlUnit &R,
+ RegisterFile &F, Scheduler &Sched)
+ : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
+ CarryOver(0U), STI(Subtarget), RCU(R), PRF(F), SC(Sched) {}
+
+ // We can always try to dispatch, so returning false is okay in this case.
+ // The retire stage, which controls the RCU, might have items to complete but
+ // RetireStage::hasWorkToComplete will check for that case.
+ virtual bool hasWorkToComplete() const override final { return false; }
+ virtual void cycleStart() override final;
+ virtual bool execute(InstRef &IR) override final;
+ void notifyDispatchStall(const InstRef &IR, unsigned EventType);
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
diff --git a/contrib/llvm/tools/llvm-mca/DispatchStatistics.cpp b/contrib/llvm/tools/llvm-mca/DispatchStatistics.cpp
new file mode 100644
index 000000000000..4bddbef9a0c8
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/DispatchStatistics.cpp
@@ -0,0 +1,71 @@
+//===--------------------- DispatchStatistics.cpp ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the DispatchStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DispatchStatistics.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void DispatchStatistics::onEvent(const HWStallEvent &Event) {
+ if (Event.Type < HWStallEvent::LastGenericEvent)
+ HWStalls[Event.Type]++;
+}
+
+void DispatchStatistics::onEvent(const HWInstructionEvent &Event) {
+ if (Event.Type == HWInstructionEvent::Dispatched)
+ ++NumDispatched;
+}
+
+void DispatchStatistics::printDispatchHistogram(llvm::raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ TempStream << "\n\nDispatch Logic - "
+ << "number of cycles where we saw N instructions dispatched:\n";
+ TempStream << "[# dispatched], [# cycles]\n";
+ for (const std::pair<unsigned, unsigned> &Entry : DispatchGroupSizePerCycle) {
+ TempStream << " " << Entry.first << ", " << Entry.second
+ << " ("
+ << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+ << "%)\n";
+ }
+
+ TempStream.flush();
+ OS << Buffer;
+}
+
+void DispatchStatistics::printDispatchStalls(raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ TempStream << "\n\nDynamic Dispatch Stall Cycles:\n";
+ TempStream << "RAT - Register unavailable: "
+ << HWStalls[HWStallEvent::RegisterFileStall];
+ TempStream << "\nRCU - Retire tokens unavailable: "
+ << HWStalls[HWStallEvent::RetireControlUnitStall];
+ TempStream << "\nSCHEDQ - Scheduler full: "
+ << HWStalls[HWStallEvent::SchedulerQueueFull];
+ TempStream << "\nLQ - Load queue full: "
+ << HWStalls[HWStallEvent::LoadQueueFull];
+ TempStream << "\nSQ - Store queue full: "
+ << HWStalls[HWStallEvent::StoreQueueFull];
+ TempStream << "\nGROUP - Static restrictions on the dispatch group: "
+ << HWStalls[HWStallEvent::DispatchGroupStall];
+ TempStream << '\n';
+ TempStream.flush();
+ OS << Buffer;
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/DispatchStatistics.h b/contrib/llvm/tools/llvm-mca/DispatchStatistics.h
new file mode 100644
index 000000000000..1e389d54766b
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/DispatchStatistics.h
@@ -0,0 +1,84 @@
+//===--------------------- DispatchStatistics.h -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a view that prints a few statistics related to the
+/// dispatch logic. It collects and analyzes instruction dispatch events as
+/// well as static/dynamic dispatch stall events.
+///
+/// Example:
+/// ========
+///
+/// Dynamic Dispatch Stall Cycles:
+/// RAT - Register unavailable: 0
+/// RCU - Retire tokens unavailable: 0
+/// SCHEDQ - Scheduler full: 42
+/// LQ - Load queue full: 0
+/// SQ - Store queue full: 0
+/// GROUP - Static restrictions on the dispatch group: 0
+///
+///
+/// Dispatch Logic - number of cycles where we saw N instructions dispatched:
+/// [# dispatched], [# cycles]
+/// 0, 15 (11.5%)
+/// 2, 4 (3.1%)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H
+
+#include "View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+class DispatchStatistics : public View {
+ unsigned NumDispatched;
+ unsigned NumCycles;
+
+ // Counts dispatch stall events caused by unavailability of resources. There
+ // is one counter for every generic stall kind (see class HWStallEvent).
+ llvm::SmallVector<unsigned, 8> HWStalls;
+
+ using Histogram = std::map<unsigned, unsigned>;
+ Histogram DispatchGroupSizePerCycle;
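+ // For example, DispatchGroupSizePerCycle[2] == 4 means that there were 4
+ // cycles where exactly 2 instructions were dispatched; this corresponds to
+ // the "2, 4 (3.1%)" row of the example output in the file comment above.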
+
+ void updateHistograms() {
+ DispatchGroupSizePerCycle[NumDispatched]++;
+ NumDispatched = 0;
+ }
+
+ void printDispatchHistogram(llvm::raw_ostream &OS) const;
+
+ void printDispatchStalls(llvm::raw_ostream &OS) const;
+
+public:
+ DispatchStatistics()
+ : NumDispatched(0), NumCycles(0),
+ HWStalls(HWStallEvent::LastGenericEvent) {}
+
+ void onEvent(const HWStallEvent &Event) override;
+
+ void onEvent(const HWInstructionEvent &Event) override;
+
+ void onCycleBegin() override { NumCycles++; }
+
+ void onCycleEnd() override { updateHistograms(); }
+
+ void printView(llvm::raw_ostream &OS) const override {
+ printDispatchStalls(OS);
+ printDispatchHistogram(OS);
+ }
+};
+} // namespace mca
+
+#endif
diff --git a/contrib/llvm/tools/llvm-mca/ExecuteStage.cpp b/contrib/llvm/tools/llvm-mca/ExecuteStage.cpp
new file mode 100644
index 000000000000..437f864b072c
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/ExecuteStage.cpp
@@ -0,0 +1,210 @@
+//===---------------------- ExecuteStage.cpp --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the execution stage of an instruction pipeline.
+///
+/// The ExecuteStage is responsible for managing the hardware scheduler
+/// and issuing notifications that an instruction has been executed.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ExecuteStage.h"
+#include "Scheduler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+using namespace llvm;
+
+// Reclaim the simulated resources used by the scheduler.
+void ExecuteStage::reclaimSchedulerResources() {
+ SmallVector<ResourceRef, 8> ResourcesFreed;
+ HWS.reclaimSimulatedResources(ResourcesFreed);
+ for (const ResourceRef &RR : ResourcesFreed)
+ notifyResourceAvailable(RR);
+}
+
+// Update the scheduler's instruction queues.
+void ExecuteStage::updateSchedulerQueues() {
+ SmallVector<InstRef, 4> InstructionIDs;
+ HWS.updateIssuedQueue(InstructionIDs);
+ for (const InstRef &IR : InstructionIDs)
+ notifyInstructionExecuted(IR);
+ InstructionIDs.clear();
+
+ HWS.updatePendingQueue(InstructionIDs);
+ for (const InstRef &IR : InstructionIDs)
+ notifyInstructionReady(IR);
+}
+
+// Issue instructions that are waiting in the scheduler's ready queue.
+void ExecuteStage::issueReadyInstructions() {
+ SmallVector<InstRef, 4> InstructionIDs;
+ InstRef IR = HWS.select();
+ while (IR.isValid()) {
+ SmallVector<std::pair<ResourceRef, double>, 4> Used;
+ HWS.issueInstruction(IR, Used);
+
+ // Reclaim instruction resources and perform notifications.
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ notifyReleasedBuffers(Desc.Buffers);
+ notifyInstructionIssued(IR, Used);
+ if (IR.getInstruction()->isExecuted())
+ notifyInstructionExecuted(IR);
+
+ // Instructions that have been issued during this cycle might have unblocked
+ // other dependent instructions. Dependent instructions may be issued during
+ // this same cycle if operands have ReadAdvance entries. Promote those
+ // instructions to the ReadyQueue and tell the caller that we need
+ // another round of 'issue()'.
+ HWS.promoteToReadyQueue(InstructionIDs);
+ for (const InstRef &I : InstructionIDs)
+ notifyInstructionReady(I);
+ InstructionIDs.clear();
+
+ // Select the next instruction to issue.
+ IR = HWS.select();
+ }
+}
+
+// The following routine is the maintenance routine of the ExecuteStage.
+// It is responsible for updating the hardware scheduler (HWS), including
+// reclaiming the HWS's simulated hardware resources, as well as updating the
+// HWS's queues.
+//
+// This routine also processes the instructions that are ready for issuance.
+// These instructions are managed by the HWS's ready queue and can be accessed
+// via the Scheduler::select() routine.
+//
+// Notifications are issued to this stage's listeners when instructions are
+// moved between the HWS's queues. In particular, when an instruction becomes
+// ready or executed.
+void ExecuteStage::cycleStart() {
+ reclaimSchedulerResources();
+ updateSchedulerQueues();
+ issueReadyInstructions();
+}
+
+// Schedule the instruction for execution on the hardware.
+bool ExecuteStage::execute(InstRef &IR) {
+#ifndef NDEBUG
+ // Ensure that the HWS has not stored this instruction in its queues.
+ HWS.sanityCheck(IR);
+#endif
+ // Reserve a slot in each buffered resource. Also, mark units with
+ // BufferSize=0 as reserved. Resources with a buffer size of zero will only
+ // be released after the instruction has been issued, and all the
+ // ResourceCycles for those units have been consumed.
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ HWS.reserveBuffers(Desc.Buffers);
+ notifyReservedBuffers(Desc.Buffers);
+
+ // Obtain a slot in the LSU. If we cannot reserve resources, return true, so
+ // that succeeding stages can make progress.
+ if (!HWS.reserveResources(IR))
+ return true;
+
+ // If we did not return early, then the scheduler is ready for execution.
+ notifyInstructionReady(IR);
+
+ // Don't add a zero-latency instruction to the Wait or Ready queue.
+ // A zero-latency instruction doesn't consume any scheduler resources. That is
+ // because it doesn't need to be executed, and it is often removed at the
+ // register renaming stage. For example, register-register moves are often
+ // optimized at the register renaming stage by simply updating register
+ // aliases. On some targets, zero-idiom instructions (for example: an XOR that
+ // clears the value of a register) are treated specially, and are often
+ // eliminated at the register renaming stage.
+ //
+ // Instructions that use an in-order dispatch/issue processor resource must be
+ // issued immediately to the pipeline(s). Any other in-order buffered
+ // resource (i.e. BufferSize=1) is consumed.
+ //
+ // If we cannot issue immediately, the HWS will add IR to its ready queue for
+ // execution later, so we must return early here.
+ if (!HWS.issueImmediately(IR))
+ return true;
+
+ LLVM_DEBUG(dbgs() << "[SCHEDULER] Instruction #" << IR
+ << " issued immediately\n");
+
+ // Issue IR. The resources for this issuance will be placed in 'Used.'
+ SmallVector<std::pair<ResourceRef, double>, 4> Used;
+ HWS.issueInstruction(IR, Used);
+
+ // Perform notifications.
+ notifyReleasedBuffers(Desc.Buffers);
+ notifyInstructionIssued(IR, Used);
+ if (IR.getInstruction()->isExecuted())
+ notifyInstructionExecuted(IR);
+
+ return true;
+}
+
+void ExecuteStage::notifyInstructionExecuted(const InstRef &IR) {
+ HWS.onInstructionExecuted(IR);
+ LLVM_DEBUG(dbgs() << "[E] Instruction Executed: #" << IR << '\n');
+ notifyEvent<HWInstructionEvent>(
+ HWInstructionEvent(HWInstructionEvent::Executed, IR));
+ RCU.onInstructionExecuted(IR.getInstruction()->getRCUTokenID());
+}
+
+void ExecuteStage::notifyInstructionReady(const InstRef &IR) {
+ LLVM_DEBUG(dbgs() << "[E] Instruction Ready: #" << IR << '\n');
+ notifyEvent<HWInstructionEvent>(
+ HWInstructionEvent(HWInstructionEvent::Ready, IR));
+}
+
+void ExecuteStage::notifyResourceAvailable(const ResourceRef &RR) {
+ LLVM_DEBUG(dbgs() << "[E] Resource Available: [" << RR.first << '.'
+ << RR.second << "]\n");
+ for (HWEventListener *Listener : getListeners())
+ Listener->onResourceAvailable(RR);
+}
+
+void ExecuteStage::notifyInstructionIssued(
+ const InstRef &IR, ArrayRef<std::pair<ResourceRef, double>> Used) {
+ LLVM_DEBUG({
+ dbgs() << "[E] Instruction Issued: #" << IR << '\n';
+ for (const std::pair<ResourceRef, unsigned> &Resource : Used) {
+ dbgs() << "[E] Resource Used: [" << Resource.first.first << '.'
+ << Resource.first.second << "], ";
+ dbgs() << "cycles: " << Resource.second << '\n';
+ }
+ });
+ notifyEvent<HWInstructionEvent>(HWInstructionIssuedEvent(IR, Used));
+}
+
+void ExecuteStage::notifyReservedBuffers(ArrayRef<uint64_t> Buffers) {
+ if (Buffers.empty())
+ return;
+
+ SmallVector<unsigned, 4> BufferIDs(Buffers.begin(), Buffers.end());
+ std::transform(Buffers.begin(), Buffers.end(), BufferIDs.begin(),
+ [&](uint64_t Op) { return HWS.getResourceID(Op); });
+ for (HWEventListener *Listener : getListeners())
+ Listener->onReservedBuffers(BufferIDs);
+}
+
+void ExecuteStage::notifyReleasedBuffers(ArrayRef<uint64_t> Buffers) {
+ if (Buffers.empty())
+ return;
+
+ SmallVector<unsigned, 4> BufferIDs(Buffers.begin(), Buffers.end());
+ std::transform(Buffers.begin(), Buffers.end(), BufferIDs.begin(),
+ [&](uint64_t Op) { return HWS.getResourceID(Op); });
+ for (HWEventListener *Listener : getListeners())
+ Listener->onReleasedBuffers(BufferIDs);
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/ExecuteStage.h b/contrib/llvm/tools/llvm-mca/ExecuteStage.h
new file mode 100644
index 000000000000..4914a9373e7c
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/ExecuteStage.h
@@ -0,0 +1,67 @@
+//===---------------------- ExecuteStage.h ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the execution stage of an instruction pipeline.
+///
+/// The ExecuteStage is responsible for managing the hardware scheduler
+/// and issuing notifications that an instruction has been executed.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
+#define LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
+
+#include "Instruction.h"
+#include "RetireControlUnit.h"
+#include "Scheduler.h"
+#include "Stage.h"
+#include "llvm/ADT/ArrayRef.h"
+
+namespace mca {
+
+class ExecuteStage : public Stage {
+ // Owner will go away when we move listeners/eventing to the stages.
+ RetireControlUnit &RCU;
+ Scheduler &HWS;
+
+ // The following routines are used to maintain the HWS.
+ void reclaimSchedulerResources();
+ void updateSchedulerQueues();
+ void issueReadyInstructions();
+
+public:
+ ExecuteStage(RetireControlUnit &R, Scheduler &S) : Stage(), RCU(R), HWS(S) {}
+ ExecuteStage(const ExecuteStage &Other) = delete;
+ ExecuteStage &operator=(const ExecuteStage &Other) = delete;
+
+ // The ExecuteStage will always complete all of its work per call to
+ // execute(), so it is never left in a 'to-be-processed' state.
+ virtual bool hasWorkToComplete() const override final { return false; }
+
+ virtual void cycleStart() override final;
+ virtual bool execute(InstRef &IR) override final;
+
+ void
+ notifyInstructionIssued(const InstRef &IR,
+ llvm::ArrayRef<std::pair<ResourceRef, double>> Used);
+ void notifyInstructionExecuted(const InstRef &IR);
+ void notifyInstructionReady(const InstRef &IR);
+ void notifyResourceAvailable(const ResourceRef &RR);
+
+ // Notify listeners that buffered resources were consumed.
+ void notifyReservedBuffers(llvm::ArrayRef<uint64_t> Buffers);
+
+ // Notify listeners that buffered resources were freed.
+ void notifyReleasedBuffers(llvm::ArrayRef<uint64_t> Buffers);
+};
+
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
diff --git a/contrib/llvm/tools/llvm-mca/FetchStage.cpp b/contrib/llvm/tools/llvm-mca/FetchStage.cpp
new file mode 100644
index 000000000000..3da117c0abc1
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/FetchStage.cpp
@@ -0,0 +1,46 @@
+//===---------------------- FetchStage.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the Fetch stage of an instruction pipeline. Its sole
+/// purpose in life is to produce instructions for the rest of the pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "FetchStage.h"
+
+namespace mca {
+
+bool FetchStage::hasWorkToComplete() const { return SM.hasNext(); }
+
+bool FetchStage::execute(InstRef &IR) {
+ if (!SM.hasNext())
+ return false;
+ const SourceRef SR = SM.peekNext();
+ std::unique_ptr<Instruction> I = IB.createInstruction(*SR.second);
+ IR = InstRef(SR.first, I.get());
+ Instructions[IR.getSourceIndex()] = std::move(I);
+ return true;
+}
+
+void FetchStage::postExecute() { SM.updateNext(); }
+
+void FetchStage::cycleEnd() {
+ // Find the first instruction which hasn't been retired.
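+ // Instructions retire in program order (the RCU models a reorder buffer), so
+ // the entries before the first non-retired instruction can be safely
+ // reclaimed.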
+ const InstMap::iterator It =
+ llvm::find_if(Instructions, [](const InstMap::value_type &KeyValuePair) {
+ return !KeyValuePair.second->isRetired();
+ });
+
+ // Erase instructions up to the first that hasn't been retired.
+ if (It != Instructions.begin())
+ Instructions.erase(Instructions.begin(), It);
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/FetchStage.h b/contrib/llvm/tools/llvm-mca/FetchStage.h
new file mode 100644
index 000000000000..620075d24fea
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/FetchStage.h
@@ -0,0 +1,45 @@
+//===---------------------- FetchStage.h ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the Fetch stage of an instruction pipeline. Its sole
+/// purpose in life is to produce instructions for the rest of the pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
+#define LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
+
+#include "InstrBuilder.h"
+#include "SourceMgr.h"
+#include "Stage.h"
+#include <map>
+
+namespace mca {
+
+class FetchStage : public Stage {
+ using InstMap = std::map<unsigned, std::unique_ptr<Instruction>>;
+ InstMap Instructions;
+ InstrBuilder &IB;
+ SourceMgr &SM;
+
+public:
+ FetchStage(InstrBuilder &IB, SourceMgr &SM) : IB(IB), SM(SM) {}
+ FetchStage(const FetchStage &Other) = delete;
+ FetchStage &operator=(const FetchStage &Other) = delete;
+
+ bool hasWorkToComplete() const override final;
+ bool execute(InstRef &IR) override final;
+ void postExecute() override final;
+ void cycleEnd() override final;
+};
+
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
diff --git a/contrib/llvm/tools/llvm-mca/HWEventListener.cpp b/contrib/llvm/tools/llvm-mca/HWEventListener.cpp
new file mode 100644
index 000000000000..f27a04a9a980
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/HWEventListener.cpp
@@ -0,0 +1,21 @@
+//===----------------------- HWEventListener.cpp ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a vtable anchor for class HWEventListener.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HWEventListener.h"
+
+namespace mca {
+
+// Anchor the vtable here.
+void HWEventListener::anchor() {}
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/HWEventListener.h b/contrib/llvm/tools/llvm-mca/HWEventListener.h
new file mode 100644
index 000000000000..aa3e6dcf19a0
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/HWEventListener.h
@@ -0,0 +1,141 @@
+//===----------------------- HWEventListener.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the main interface for hardware event listeners.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+
+#include "Instruction.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <utility>
+
+namespace mca {
+
+// An HWInstructionEvent represents state changes of instructions that
+// listeners might be interested in. Listeners can choose to ignore any event
+// they are not interested in.
+class HWInstructionEvent {
+public:
+ // This is the list of event types that are shared by all targets, and that
+ // generic subtarget-agnostic classes (e.g., Pipeline, HWInstructionEvent,
+ // ...) and generic Views can manipulate.
+ // Subtargets are free to define additional event types that are going to be
+ // handled by generic components as opaque values, but can still be
+ // emitted by subtarget-specific pipeline stages (e.g., ExecuteStage,
+ // DispatchStage, ...) and interpreted by subtarget-specific EventListener
+ // implementations.
+ enum GenericEventType {
+ Invalid = 0,
+ // Events generated by the Retire Control Unit.
+ Retired,
+ // Events generated by the Scheduler.
+ Ready,
+ Issued,
+ Executed,
+ // Events generated by the Dispatch logic.
+ Dispatched,
+
+ LastGenericEventType,
+ };
+
+ HWInstructionEvent(unsigned type, const InstRef &Inst)
+ : Type(type), IR(Inst) {}
+
+ // The event type. The exact meaning depends on the subtarget.
+ const unsigned Type;
+
+ // The instruction this event was generated for.
+ const InstRef &IR;
+};
+
+class HWInstructionIssuedEvent : public HWInstructionEvent {
+public:
+ using ResourceRef = std::pair<uint64_t, uint64_t>;
+ HWInstructionIssuedEvent(const InstRef &IR,
+ llvm::ArrayRef<std::pair<ResourceRef, double>> UR)
+ : HWInstructionEvent(HWInstructionEvent::Issued, IR), UsedResources(UR) {}
+
+ llvm::ArrayRef<std::pair<ResourceRef, double>> UsedResources;
+};
+
+class HWInstructionDispatchedEvent : public HWInstructionEvent {
+public:
+ HWInstructionDispatchedEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs)
+ : HWInstructionEvent(HWInstructionEvent::Dispatched, IR),
+ UsedPhysRegs(Regs) {}
+ // Number of physical registers allocated for this instruction. There is one
+ // entry per register file.
+ llvm::ArrayRef<unsigned> UsedPhysRegs;
+};
+
+class HWInstructionRetiredEvent : public HWInstructionEvent {
+public:
+ HWInstructionRetiredEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs)
+ : HWInstructionEvent(HWInstructionEvent::Retired, IR),
+ FreedPhysRegs(Regs) {}
+ // Number of register writes that have been architecturally committed. There
+ // is one entry per register file.
+ llvm::ArrayRef<unsigned> FreedPhysRegs;
+};
+
+// A HWStallEvent represents a pipeline stall caused by the lack of hardware
+// resources.
+class HWStallEvent {
+public:
+ enum GenericEventType {
+ Invalid = 0,
+ // Generic stall events generated by the DispatchStage.
+ RegisterFileStall,
+ RetireControlUnitStall,
+ // Generic stall events generated by the Scheduler.
+ DispatchGroupStall,
+ SchedulerQueueFull,
+ LoadQueueFull,
+ StoreQueueFull,
+ LastGenericEvent
+ };
+
+ HWStallEvent(unsigned type, const InstRef &Inst) : Type(type), IR(Inst) {}
+
+ // The exact meaning of the stall event type depends on the subtarget.
+ const unsigned Type;
+
+ // The instruction this event was generated for.
+ const InstRef &IR;
+};
+
+class HWEventListener {
+public:
+ // Generic events generated by the pipeline.
+ virtual void onCycleBegin() {}
+ virtual void onCycleEnd() {}
+
+ virtual void onEvent(const HWInstructionEvent &Event) {}
+ virtual void onEvent(const HWStallEvent &Event) {}
+
+ using ResourceRef = std::pair<uint64_t, uint64_t>;
+ virtual void onResourceAvailable(const ResourceRef &RRef) {}
+
+ // Events generated by the Scheduler when buffered resources are
+ // consumed/freed.
+ virtual void onReservedBuffers(llvm::ArrayRef<unsigned> Buffers) {}
+ virtual void onReleasedBuffers(llvm::ArrayRef<unsigned> Buffers) {}
+
+ virtual ~HWEventListener() {}
+
+private:
+ virtual void anchor();
+};
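+
+// Example (illustrative): a minimal listener that only counts dispatched
+// instructions and ignores every other event.
+//
+//   struct DispatchCounter : public HWEventListener {
+//     unsigned NumDispatched = 0;
+//     void onEvent(const HWInstructionEvent &Event) override {
+//       if (Event.Type == HWInstructionEvent::Dispatched)
+//         ++NumDispatched;
+//     }
+//   };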
+} // namespace mca
+
+#endif
diff --git a/contrib/llvm/tools/llvm-mca/HardwareUnit.cpp b/contrib/llvm/tools/llvm-mca/HardwareUnit.cpp
new file mode 100644
index 000000000000..103cde9afcc8
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/HardwareUnit.cpp
@@ -0,0 +1,23 @@
+//===------------------------- HardwareUnit.cpp -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the anchor for the base class that describes
+/// simulated hardware units.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HardwareUnit.h"
+
+namespace mca {
+
+// Pin the vtable with this method.
+HardwareUnit::~HardwareUnit() = default;
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/HardwareUnit.h b/contrib/llvm/tools/llvm-mca/HardwareUnit.h
new file mode 100644
index 000000000000..e8c496ab967a
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/HardwareUnit.h
@@ -0,0 +1,31 @@
+//===-------------------------- HardwareUnit.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a base class for describing a simulated hardware
+/// unit. These units are used to construct a simulated backend.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
+#define LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
+
+namespace mca {
+
+class HardwareUnit {
+ HardwareUnit(const HardwareUnit &H) = delete;
+ HardwareUnit &operator=(const HardwareUnit &H) = delete;
+
+public:
+ HardwareUnit() = default;
+ virtual ~HardwareUnit();
+};
+
+} // namespace mca
+#endif // LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
diff --git a/contrib/llvm/tools/llvm-mca/InstrBuilder.cpp b/contrib/llvm/tools/llvm-mca/InstrBuilder.cpp
new file mode 100644
index 000000000000..dbd457196f9d
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/InstrBuilder.cpp
@@ -0,0 +1,465 @@
+//===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the InstrBuilder interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstrBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+using namespace llvm;
+
+static void initializeUsedResources(InstrDesc &ID,
+ const MCSchedClassDesc &SCDesc,
+ const MCSubtargetInfo &STI,
+ ArrayRef<uint64_t> ProcResourceMasks) {
+ const MCSchedModel &SM = STI.getSchedModel();
+
+ // Populate resources consumed.
+ using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
+ std::vector<ResourcePlusCycles> Worklist;
+
+ // Track cycles contributed by resources that are in a "Super" relationship.
+ // This is required if we want to correctly match the behavior of method
+ // SubtargetEmitter::ExpandProcResource() in Tablegen. When computing the set
+ // of "consumed" processor resources and resource cycles, the logic in
+ // ExpandProcResource() doesn't update the number of resource cycles
+ // contributed by a "Super" resource to a group.
+ // We need to take this into account when we find that a processor resource is
+ // part of a group, and it is also used as the "Super" of other resources.
+ // This map stores the number of cycles contributed by sub-resources that are
+ // part of a "Super" resource. The key value is the "Super" resource mask ID.
+ DenseMap<uint64_t, unsigned> SuperResources;
+
+ for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) {
+ const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I;
+ const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
+ uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
+ if (PR.BufferSize != -1)
+ ID.Buffers.push_back(Mask);
+ CycleSegment RCy(0, PRE->Cycles, false);
+ Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
+ if (PR.SuperIdx) {
+ uint64_t Super = ProcResourceMasks[PR.SuperIdx];
+ SuperResources[Super] += PRE->Cycles;
+ }
+ }
+
+ // Sort elements by mask popcount, so that we prioritize resource units over
+ // resource groups, and smaller groups over larger groups.
+ llvm::sort(Worklist.begin(), Worklist.end(),
+ [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) {
+ unsigned popcntA = countPopulation(A.first);
+ unsigned popcntB = countPopulation(B.first);
+ if (popcntA < popcntB)
+ return true;
+ if (popcntA > popcntB)
+ return false;
+ return A.first < B.first;
+ });
+
+ uint64_t UsedResourceUnits = 0;
+
+ // Remove cycles contributed by smaller resources.
+ for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
+ ResourcePlusCycles &A = Worklist[I];
+ if (!A.second.size()) {
+ A.second.NumUnits = 0;
+ A.second.setReserved();
+ ID.Resources.emplace_back(A);
+ continue;
+ }
+
+ ID.Resources.emplace_back(A);
+ uint64_t NormalizedMask = A.first;
+ if (countPopulation(A.first) == 1) {
+ UsedResourceUnits |= A.first;
+ } else {
+ // Remove the leading 1 from the resource group mask.
+ NormalizedMask ^= PowerOf2Floor(NormalizedMask);
+ }
+
+ for (unsigned J = I + 1; J < E; ++J) {
+ ResourcePlusCycles &B = Worklist[J];
+ if ((NormalizedMask & B.first) == NormalizedMask) {
+ B.second.CS.Subtract(A.second.size() - SuperResources[A.first]);
+ if (countPopulation(B.first) > 1)
+ B.second.NumUnits++;
+ }
+ }
+ }
+
+ // A SchedWrite may specify a number of cycles in which a resource group
+ // is reserved. For example (on target x86; cpu Haswell):
+ //
+ // SchedWriteRes<[HWPort0, HWPort1, HWPort01]> {
+ // let ResourceCycles = [2, 2, 3];
+ // }
+ //
+ // This means:
+ // Resource units HWPort0 and HWPort1 are both used for 2cy.
+ // Resource group HWPort01 is the union of HWPort0 and HWPort1.
+ // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01
+ // will not be usable for 2 entire cycles from instruction issue.
+ //
+ // On top of those 2cy, SchedWriteRes explicitly specifies an extra latency
+ // of 3 cycles for HWPort01. This tool assumes that the 3cy latency is an
+ // extra delay on top of the 2 cycles latency.
+ // During those extra cycles, HWPort01 is not usable by other instructions.
+ for (ResourcePlusCycles &RPC : ID.Resources) {
+ if (countPopulation(RPC.first) > 1 && !RPC.second.isReserved()) {
+ // Remove the leading 1 from the resource group mask.
+ uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
+ if ((Mask & UsedResourceUnits) == Mask)
+ RPC.second.setReserved();
+ }
+ }
+
+ LLVM_DEBUG({
+ for (const std::pair<uint64_t, ResourceUsage> &R : ID.Resources)
+ dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
+ for (const uint64_t R : ID.Buffers)
+ dbgs() << "\t\tBuffer Mask=" << R << '\n';
+ });
+}
+
+static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
+ const MCSchedClassDesc &SCDesc,
+ const MCSubtargetInfo &STI) {
+ if (MCDesc.isCall()) {
+ // We cannot estimate how long this call will take.
+ // Artificially set an arbitrarily high latency (100cy).
+ ID.MaxLatency = 100U;
+ return;
+ }
+
+ int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+ // If latency is unknown, then conservatively assume a MaxLatency of 100cy.
+ ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
+}
+
+void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
+ unsigned SchedClassID) {
+ const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
+ const MCSchedModel &SM = STI.getSchedModel();
+ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+
+ // These are for now the (strong) assumptions made by this algorithm:
+ // * The number of explicit and implicit register definitions in a MCInst
+ // matches the number of explicit and implicit definitions according to
+ // the opcode descriptor (MCInstrDesc).
+ // * Register definitions take precedence over register uses in the operands
+ // list.
+ // * If an opcode specifies an optional definition, then the optional
+ // definition is always the last operand in the sequence, and it can be
+ // set to zero (i.e. "no register").
+ //
+ // These assumptions work quite well for most out-of-order in-tree targets
+ // like x86. This is mainly because the vast majority of instructions are
+ // expanded to MCInst using a straightforward lowering logic that preserves
+ // the ordering of the operands.
+ unsigned NumExplicitDefs = MCDesc.getNumDefs();
+ unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
+ unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
+ unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
+ if (MCDesc.hasOptionalDef())
+ TotalDefs++;
+ ID.Writes.resize(TotalDefs);
+ // Iterate over the operands list, and skip non-register operands.
+ // The first NumExplicitDefs register operands are expected to be register
+ // definitions.
+ unsigned CurrentDef = 0;
+ unsigned i = 0;
+ for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) {
+ const MCOperand &Op = MCI.getOperand(i);
+ if (!Op.isReg())
+ continue;
+
+ WriteDescriptor &Write = ID.Writes[CurrentDef];
+ Write.OpIndex = i;
+ if (CurrentDef < NumWriteLatencyEntries) {
+ const MCWriteLatencyEntry &WLE =
+ *STI.getWriteLatencyEntry(&SCDesc, CurrentDef);
+ // Conservatively default to MaxLatency.
+ Write.Latency =
+ WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
+ Write.SClassOrWriteResourceID = WLE.WriteResourceID;
+ } else {
+ // Assign a default latency for this write.
+ Write.Latency = ID.MaxLatency;
+ Write.SClassOrWriteResourceID = 0;
+ }
+ Write.IsOptionalDef = false;
+ LLVM_DEBUG({
+ dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
+ << ", Latency=" << Write.Latency
+ << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
+ });
+ CurrentDef++;
+ }
+
+ if (CurrentDef != NumExplicitDefs)
+ llvm::report_fatal_error(
+ "error: Expected more register operand definitions. ");
+
+ CurrentDef = 0;
+ for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) {
+ unsigned Index = NumExplicitDefs + CurrentDef;
+ WriteDescriptor &Write = ID.Writes[Index];
+ Write.OpIndex = ~CurrentDef;
+ Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef];
+ if (Index < NumWriteLatencyEntries) {
+ const MCWriteLatencyEntry &WLE =
+ *STI.getWriteLatencyEntry(&SCDesc, Index);
+ // Conservatively default to MaxLatency.
+ Write.Latency =
+ WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
+ Write.SClassOrWriteResourceID = WLE.WriteResourceID;
+ } else {
+ // Assign a default latency for this write.
+ Write.Latency = ID.MaxLatency;
+ Write.SClassOrWriteResourceID = 0;
+ }
+
+ Write.IsOptionalDef = false;
+ assert(Write.RegisterID != 0 && "Expected a valid phys register!");
+ LLVM_DEBUG({
+ dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
+ << ", PhysReg=" << MRI.getName(Write.RegisterID)
+ << ", Latency=" << Write.Latency
+ << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
+ });
+ }
+
+ if (MCDesc.hasOptionalDef()) {
+ // Always assume that the optional definition is the last operand of the
+ // MCInst sequence.
+ const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
+ if (i == MCI.getNumOperands() || !Op.isReg())
+ llvm::report_fatal_error(
+ "error: expected a register operand for an optional "
+ "definition. Instruction has not be correctly analyzed.\n",
+ false);
+
+ WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
+ Write.OpIndex = MCI.getNumOperands() - 1;
+ // Assign a default latency for this write.
+ Write.Latency = ID.MaxLatency;
+ Write.SClassOrWriteResourceID = 0;
+ Write.IsOptionalDef = true;
+ }
+}
+
+void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
+ unsigned SchedClassID) {
+ const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
+ unsigned NumExplicitDefs = MCDesc.getNumDefs();
+
+ // Skip explicit definitions.
+ unsigned i = 0;
+ for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) {
+ const MCOperand &Op = MCI.getOperand(i);
+ if (Op.isReg())
+ NumExplicitDefs--;
+ }
+
+ if (NumExplicitDefs)
+ llvm::report_fatal_error(
+ "error: Expected more register operand definitions. ", false);
+
+ unsigned NumExplicitUses = MCI.getNumOperands() - i;
+ unsigned NumImplicitUses = MCDesc.getNumImplicitUses();
+ if (MCDesc.hasOptionalDef()) {
+ assert(NumExplicitUses);
+ NumExplicitUses--;
+ }
+ unsigned TotalUses = NumExplicitUses + NumImplicitUses;
+ if (!TotalUses)
+ return;
+
+ ID.Reads.resize(TotalUses);
+ for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) {
+ ReadDescriptor &Read = ID.Reads[CurrentUse];
+ Read.OpIndex = i + CurrentUse;
+ Read.UseIndex = CurrentUse;
+ Read.SchedClassID = SchedClassID;
+ LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex
+ << ", UseIndex=" << Read.UseIndex << '\n');
+ }
+
+ for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) {
+ ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse];
+ Read.OpIndex = ~CurrentUse;
+ Read.UseIndex = NumExplicitUses + CurrentUse;
+ Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse];
+ Read.SchedClassID = SchedClassID;
+ LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex << ", RegisterID="
+ << MRI.getName(Read.RegisterID) << '\n');
+ }
+}
+
+const InstrDesc &InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
+ assert(STI.getSchedModel().hasInstrSchedModel() &&
+ "Itineraries are not yet supported!");
+
+ // Obtain the instruction descriptor from the opcode.
+ unsigned short Opcode = MCI.getOpcode();
+ const MCInstrDesc &MCDesc = MCII.get(Opcode);
+ const MCSchedModel &SM = STI.getSchedModel();
+
+ // Then obtain the scheduling class information from the instruction.
+ unsigned SchedClassID = MCDesc.getSchedClass();
+ unsigned CPUID = SM.getProcessorID();
+
+ // Try to solve variant scheduling classes.
+ if (SchedClassID) {
+ while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
+ SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
+
+ if (!SchedClassID)
+ llvm::report_fatal_error("unable to resolve this variant class.");
+ }
+
+ // Check if this instruction is supported. Otherwise, report a fatal error.
+ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+ if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
+ std::string ToString;
+ llvm::raw_string_ostream OS(ToString);
+ WithColor::error() << "found an unsupported instruction in the input"
+ << " assembly sequence.\n";
+ MCIP.printInst(&MCI, OS, "", STI);
+ OS.flush();
+
+ WithColor::note() << "instruction: " << ToString << '\n';
+ llvm::report_fatal_error(
+ "Don't know how to analyze unsupported instructions.");
+ }
+
+ // Create a new empty descriptor.
+ std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
+ ID->NumMicroOps = SCDesc.NumMicroOps;
+
+ if (MCDesc.isCall()) {
+ // We don't correctly model calls.
+ WithColor::warning() << "found a call in the input assembly sequence.\n";
+ WithColor::note() << "call instructions are not correctly modeled. "
+ << "Assume a latency of 100cy.\n";
+ }
+
+ if (MCDesc.isReturn()) {
+ WithColor::warning() << "found a return instruction in the input"
+ << " assembly sequence.\n";
+ WithColor::note() << "program counter updates are ignored.\n";
+ }
+
+ ID->MayLoad = MCDesc.mayLoad();
+ ID->MayStore = MCDesc.mayStore();
+ ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
+
+ initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
+ computeMaxLatency(*ID, MCDesc, SCDesc, STI);
+ populateWrites(*ID, MCI, SchedClassID);
+ populateReads(*ID, MCI, SchedClassID);
+
+ LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
+ LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
+
+ // Now add the new descriptor.
+ SchedClassID = MCDesc.getSchedClass();
+ if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) {
+ Descriptors[MCI.getOpcode()] = std::move(ID);
+ return *Descriptors[MCI.getOpcode()];
+ }
+
+ VariantDescriptors[&MCI] = std::move(ID);
+ return *VariantDescriptors[&MCI];
+}
+
+const InstrDesc &InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI) {
+ if (Descriptors.find_as(MCI.getOpcode()) != Descriptors.end())
+ return *Descriptors[MCI.getOpcode()];
+
+ if (VariantDescriptors.find(&MCI) != VariantDescriptors.end())
+ return *VariantDescriptors[&MCI];
+
+ return createInstrDescImpl(MCI);
+}
+
+std::unique_ptr<Instruction>
+InstrBuilder::createInstruction(const MCInst &MCI) {
+ const InstrDesc &D = getOrCreateInstrDesc(MCI);
+ std::unique_ptr<Instruction> NewIS = llvm::make_unique<Instruction>(D);
+
+ // Initialize Reads first.
+ for (const ReadDescriptor &RD : D.Reads) {
+ int RegID = -1;
+ if (!RD.isImplicitRead()) {
+ // Explicit read.
+ const MCOperand &Op = MCI.getOperand(RD.OpIndex);
+ // Skip non-register operands.
+ if (!Op.isReg())
+ continue;
+ RegID = Op.getReg();
+ } else {
+ // Implicit read.
+ RegID = RD.RegisterID;
+ }
+
+ // Skip invalid register operands.
+ if (!RegID)
+ continue;
+
+ // Okay, this is a register operand. Create a ReadState for it.
+ assert(RegID > 0 && "Invalid register ID found!");
+ NewIS->getUses().emplace_back(llvm::make_unique<ReadState>(RD, RegID));
+ }
+
+ // Early exit if there are no writes.
+ if (D.Writes.empty())
+ return NewIS;
+
+ // Track register writes that implicitly clear the upper portion of the
+ // underlying super-registers using an APInt.
+ APInt WriteMask(D.Writes.size(), 0);
+
+ // Now query the MCInstrAnalysis object to obtain information about which
+ // register writes implicitly clear the upper portion of a super-register.
+ MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
+
+ // Initialize writes.
+ unsigned WriteIndex = 0;
+ for (const WriteDescriptor &WD : D.Writes) {
+ unsigned RegID = WD.isImplicitWrite() ? WD.RegisterID
+ : MCI.getOperand(WD.OpIndex).getReg();
+ // Check if this is an optional definition that references NoReg.
+ if (WD.IsOptionalDef && !RegID) {
+ ++WriteIndex;
+ continue;
+ }
+
+ assert(RegID && "Expected a valid register ID!");
+ NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(
+ WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex]));
+ ++WriteIndex;
+ }
+
+ return NewIS;
+}
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/InstrBuilder.h b/contrib/llvm/tools/llvm-mca/InstrBuilder.h
new file mode 100644
index 000000000000..69a53b6fec21
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/InstrBuilder.h
@@ -0,0 +1,85 @@
+//===--------------------- InstrBuilder.h -----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A builder class for instructions that are statically analyzed by llvm-mca.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
+
+#include "Instruction.h"
+#include "Support.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace mca {
+
+class DispatchUnit;
+
+/// A builder class that knows how to construct Instruction objects.
+///
+/// Every llvm-mca Instruction is described by an object of class InstrDesc.
+/// An InstrDesc describes which registers are read/written by the instruction,
+/// as well as the instruction latency and hardware resources consumed.
+///
+/// This class is used by the tool to construct Instructions and instruction
+/// descriptors (i.e. InstrDesc objects).
+/// Information from the machine scheduling model is used to identify processor
+/// resources that are consumed by an instruction.
+class InstrBuilder {
+ const llvm::MCSubtargetInfo &STI;
+ const llvm::MCInstrInfo &MCII;
+ const llvm::MCRegisterInfo &MRI;
+ const llvm::MCInstrAnalysis &MCIA;
+ llvm::MCInstPrinter &MCIP;
+ llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
+
+ llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
+ llvm::DenseMap<const llvm::MCInst *, std::unique_ptr<const InstrDesc>>
+ VariantDescriptors;
+
+ const InstrDesc &createInstrDescImpl(const llvm::MCInst &MCI);
+ InstrBuilder(const InstrBuilder &) = delete;
+ InstrBuilder &operator=(const InstrBuilder &) = delete;
+
+ void populateWrites(InstrDesc &ID, const llvm::MCInst &MCI,
+ unsigned SchedClassID);
+ void populateReads(InstrDesc &ID, const llvm::MCInst &MCI,
+ unsigned SchedClassID);
+
+public:
+ InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
+ const llvm::MCRegisterInfo &mri,
+ const llvm::MCInstrAnalysis &mcia, llvm::MCInstPrinter &mcip)
+ : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), MCIP(mcip),
+ ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) {
+ computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
+ }
+
+ const InstrDesc &getOrCreateInstrDesc(const llvm::MCInst &MCI);
+ // Returns an array of processor resource masks.
+ // Masks are computed by function mca::computeProcResourceMasks. See
+ // Support.h for a description of how masks are computed and how they can be
+ // used to solve set membership problems.
+ llvm::ArrayRef<uint64_t> getProcResourceMasks() const {
+ return ProcResourceMasks;
+ }
+
+ void clear() { VariantDescriptors.shrink_and_clear(); }
+
+ std::unique_ptr<Instruction> createInstruction(const llvm::MCInst &MCI);
+};
+} // namespace mca
+
+#endif
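
A minimal usage sketch for the builder (assuming the MC objects have already been
created by the tool's driver, and that MCI is a parsed llvm::MCInst; error handling
is omitted):

  // Sketch: STI, MCII, MRI, MCIA and MCIP are assumed to come from the usual
  // Target/TargetRegistry setup performed by the llvm-mca driver.
  mca::InstrBuilder IB(STI, MCII, MRI, MCIA, MCIP);

  // Instruction descriptors are created lazily and cached.
  const mca::InstrDesc &Desc = IB.getOrCreateInstrDesc(MCI);
  (void)Desc;

  // Create a simulated instruction that can be dispatched to the pipeline.
  std::unique_ptr<mca::Instruction> Inst = IB.createInstruction(MCI);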
diff --git a/contrib/llvm/tools/llvm-mca/Instruction.cpp b/contrib/llvm/tools/llvm-mca/Instruction.cpp
new file mode 100644
index 000000000000..0c8476705572
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Instruction.cpp
@@ -0,0 +1,177 @@
+//===--------------------- Instruction.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines abstractions used by the Pipeline to model register reads,
+// register writes and instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void ReadState::writeStartEvent(unsigned Cycles) {
+ assert(DependentWrites);
+ assert(CyclesLeft == UNKNOWN_CYCLES);
+
+ // This read may be dependent on more than one write. This typically occurs
+ // when a definition is the result of multiple writes where at least one
+ // write does a partial register update.
+ // The HW is forced to do some extra bookkeeping to keep track of all the
+ // dependent writes, and implement a merging scheme for the partial writes.
+ --DependentWrites;
+ TotalCycles = std::max(TotalCycles, Cycles);
+
+ if (!DependentWrites) {
+ CyclesLeft = TotalCycles;
+ IsReady = !CyclesLeft;
+ }
+}
+
+void WriteState::onInstructionIssued() {
+ assert(CyclesLeft == UNKNOWN_CYCLES);
+ // Update the number of cycles left based on the WriteDescriptor info.
+ CyclesLeft = getLatency();
+
+ // Now that the time left before write-back is known, notify
+ // all the users.
+ for (const std::pair<ReadState *, int> &User : Users) {
+ ReadState *RS = User.first;
+ unsigned ReadCycles = std::max(0, CyclesLeft - User.second);
+ RS->writeStartEvent(ReadCycles);
+ }
+}
+
+void WriteState::addUser(ReadState *User, int ReadAdvance) {
+ // If CyclesLeft is not UNKNOWN_CYCLES, then we don't need to
+ // update the list of users. We can just notify the user with
+ // the actual number of cycles left (which may be zero).
+ if (CyclesLeft != UNKNOWN_CYCLES) {
+ unsigned ReadCycles = std::max(0, CyclesLeft - ReadAdvance);
+ User->writeStartEvent(ReadCycles);
+ return;
+ }
+
+ std::pair<ReadState *, int> NewPair(User, ReadAdvance);
+ Users.insert(NewPair);
+}
+
+void WriteState::cycleEvent() {
+ // Note: CyclesLeft can be a negative number. It is an error to
+ // make it an unsigned quantity because users of this write may
+ // specify a negative ReadAdvance.
+ if (CyclesLeft != UNKNOWN_CYCLES)
+ CyclesLeft--;
+}
+
+void ReadState::cycleEvent() {
+ // Update the total number of cycles.
+ if (DependentWrites && TotalCycles) {
+ --TotalCycles;
+ return;
+ }
+
+ // Bail out immediately if we don't know how many cycles are left.
+ if (CyclesLeft == UNKNOWN_CYCLES)
+ return;
+
+ if (CyclesLeft) {
+ --CyclesLeft;
+ IsReady = !CyclesLeft;
+ }
+}
+
+#ifndef NDEBUG
+void WriteState::dump() const {
+ dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << getLatency() << ", RegID "
+ << getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }";
+}
+
+void WriteRef::dump() const {
+ dbgs() << "IID=" << getSourceIndex() << ' ';
+ if (isValid())
+ getWriteState()->dump();
+ else
+ dbgs() << "(null)";
+}
+#endif
+
+void Instruction::dispatch(unsigned RCUToken) {
+ assert(Stage == IS_INVALID);
+ Stage = IS_AVAILABLE;
+ RCUTokenID = RCUToken;
+
+ // Check if input operands are already available.
+ update();
+}
+
+void Instruction::execute() {
+ assert(Stage == IS_READY);
+ Stage = IS_EXECUTING;
+
+ // Set the cycles left before the write-back stage.
+ CyclesLeft = Desc.MaxLatency;
+
+ for (UniqueDef &Def : Defs)
+ Def->onInstructionIssued();
+
+ // Transition to the "executed" stage if this is a zero-latency instruction.
+ if (!CyclesLeft)
+ Stage = IS_EXECUTED;
+}
+
+void Instruction::update() {
+ assert(isDispatched() && "Unexpected instruction stage found!");
+
+ if (!llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+ return;
+
+ // A partial register write cannot complete before a dependent write.
+ auto IsDefReady = [&](const UniqueDef &Def) {
+ if (const WriteState *Write = Def->getDependentWrite()) {
+ int WriteLatency = Write->getCyclesLeft();
+ if (WriteLatency == UNKNOWN_CYCLES)
+ return false;
+ return static_cast<unsigned>(WriteLatency) < Desc.MaxLatency;
+ }
+ return true;
+ };
+
+ if (llvm::all_of(Defs, IsDefReady))
+ Stage = IS_READY;
+}
+
+void Instruction::cycleEvent() {
+ if (isReady())
+ return;
+
+ if (isDispatched()) {
+ for (UniqueUse &Use : Uses)
+ Use->cycleEvent();
+
+ update();
+ return;
+ }
+
+ assert(isExecuting() && "Instruction not in-flight?");
+ assert(CyclesLeft && "Instruction already executed?");
+ for (UniqueDef &Def : Defs)
+ Def->cycleEvent();
+ CyclesLeft--;
+ if (!CyclesLeft)
+ Stage = IS_EXECUTED;
+}
+
+const unsigned WriteRef::INVALID_IID = std::numeric_limits<unsigned>::max();
+
+} // namespace mca
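
The write/read notification protocol implemented above can be summarized with a
small sketch that only uses the WriteState and ReadState interfaces declared in
Instruction.h (WD and RD are assumed WriteDescriptor/ReadDescriptor objects; the
register IDs and read-advance value are illustrative):

  mca::WriteState Write(WD, /* RegID */ 1);
  mca::ReadState Read(RD, /* RegID */ 1);

  // The read depends on one in-flight write whose latency is still unknown.
  Read.setDependentWrites(1);
  Write.addUser(&Read, /* ReadAdvance */ 0); // Deferred: CyclesLeft is unknown.

  // Once the owning instruction is issued, the latency becomes known and every
  // registered user is notified through writeStartEvent().
  Write.onInstructionIssued();

  // Both sides then count down on every simulated cycle.
  while (!Read.isReady()) {
    Write.cycleEvent();
    Read.cycleEvent();
  }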
diff --git a/contrib/llvm/tools/llvm-mca/Instruction.h b/contrib/llvm/tools/llvm-mca/Instruction.h
new file mode 100644
index 000000000000..ddf5c3a5e33f
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Instruction.h
@@ -0,0 +1,427 @@
+//===--------------------- Instruction.h ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines abstractions used by the Pipeline to model register reads,
+/// register writes and instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+
+#include "llvm/Support/MathExtras.h"
+
+#ifndef NDEBUG
+#include "llvm/Support/raw_ostream.h"
+#endif
+
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace mca {
+
+constexpr int UNKNOWN_CYCLES = -512;
+
+/// A register write descriptor.
+struct WriteDescriptor {
+ // Operand index. The index is negative for implicit writes only.
+ // For implicit writes, the actual operand index is computed by performing
+ // a bitwise not of the OpIndex.
+ int OpIndex;
+ // Write latency. Number of cycles before write-back stage.
+ unsigned Latency;
+ // This field is set to a value different than zero only if this
+ // is an implicit definition.
+ unsigned RegisterID;
+ // Instruction itineraries would set this field to the SchedClass ID.
+ // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
+ // element associated to this write.
+ // When computing read latencies, this value is matched against the
+ // "ReadAdvance" information. The hardware backend may implement
+ // dedicated forwarding paths to quickly propagate write results to dependent
+ // instructions waiting in the reservation station (effectively bypassing the
+ // write-back stage).
+ unsigned SClassOrWriteResourceID;
+ // True only if this is a write obtained from an optional definition.
+ // Optional definitions are allowed to reference regID zero (i.e. "no
+ // register").
+ bool IsOptionalDef;
+
+ bool isImplicitWrite() const { return OpIndex < 0; }
+};
+
+/// A register read descriptor.
+struct ReadDescriptor {
+ // An MCOperand index. This is used by the Dispatch logic to identify register
+ // reads. Implicit reads have negative indices. The actual operand index of an
+ // implicit read is the bitwise not of field OpIndex.
+ int OpIndex;
+ // The actual "UseIdx". This is used to query the ReadAdvance table. Explicit
+ // uses always come first in the sequence of uses.
+ unsigned UseIndex;
+ // This field is only set if this is an implicit read.
+ unsigned RegisterID;
+ // Scheduling Class Index. It is used to query the scheduling model for the
+ // MCSchedClassDesc object.
+ unsigned SchedClassID;
+
+ bool isImplicitRead() const { return OpIndex < 0; }
+};
+
+class ReadState;
+
+/// Tracks uses of a register definition (i.e., a register write).
+///
+/// Each implicit/explicit register write is associated with an instance of
+/// this class. A WriteState object tracks the dependent users of a
+/// register write. It also tracks how many cycles are left before the write
+/// back stage.
+class WriteState {
+ const WriteDescriptor &WD;
+ // On instruction issue, this field is set equal to the write latency.
+ // Before instruction issue, this field defaults to -512, a special
+ // value that represents an "unknown" number of cycles.
+ int CyclesLeft;
+
+ // Actual register defined by this write. This field is only used
+ // to speedup queries on the register file.
+ // For implicit writes, this field always matches the value of
+ // field RegisterID from WD.
+ unsigned RegisterID;
+
+ // True if this write implicitly clears the upper portion of RegisterID's
+ // super-registers.
+ bool ClearsSuperRegs;
+
+ // This field is set if this is a partial register write, and it has a false
+ // dependency on any previous write of the same register (or a portion of it).
+ // DependentWrite must be able to complete before this write completes, so
+ // that we don't break the WAW dependency, and the two writes can be merged.
+ const WriteState *DependentWrite;
+
+ // Users is a set of dependent reads. A dependent read is added to the set
+ // only if CyclesLeft is unknown. As soon as CyclesLeft becomes known, each
+ // user in the set gets notified with the actual CyclesLeft.
+ //
+ // The 'second' element of each pair is the "ReadAdvance" number of cycles.
+ std::set<std::pair<ReadState *, int>> Users;
+
+public:
+ WriteState(const WriteDescriptor &Desc, unsigned RegID,
+ bool clearsSuperRegs = false)
+ : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
+ ClearsSuperRegs(clearsSuperRegs), DependentWrite(nullptr) {}
+ WriteState(const WriteState &Other) = delete;
+ WriteState &operator=(const WriteState &Other) = delete;
+
+ int getCyclesLeft() const { return CyclesLeft; }
+ unsigned getWriteResourceID() const { return WD.SClassOrWriteResourceID; }
+ unsigned getRegisterID() const { return RegisterID; }
+ unsigned getLatency() const { return WD.Latency; }
+
+ void addUser(ReadState *Use, int ReadAdvance);
+ unsigned getNumUsers() const { return Users.size(); }
+ bool clearsSuperRegisters() const { return ClearsSuperRegs; }
+
+ const WriteState *getDependentWrite() const { return DependentWrite; }
+ void setDependentWrite(const WriteState *Write) { DependentWrite = Write; }
+
+ // On every cycle, update CyclesLeft and notify dependent users.
+ void cycleEvent();
+ void onInstructionIssued();
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+/// Tracks register operand latency in cycles.
+///
+/// A read may be dependent on more than one write. This occurs when some
+/// writes only partially update the register associated to this read.
+class ReadState {
+ const ReadDescriptor &RD;
+ // Physical register identifier associated with this read.
+ unsigned RegisterID;
+ // Number of writes that contribute to the definition of RegisterID.
+ // In the absence of partial register updates, the number of DependentWrites
+ // cannot be more than one.
+ unsigned DependentWrites;
+ // Number of cycles left before RegisterID can be read. This value depends on
+ // the latency of all the dependent writes. It defaults to UNKNOWN_CYCLES.
+ // It gets set to the value of field TotalCycles only when the 'CyclesLeft' of
+ // every dependent write is known.
+ int CyclesLeft;
+ // This field is updated on every writeStartEvent(). When the number of
+ // dependent writes (i.e., field DependentWrites) is zero, this value is
+ // propagated to field CyclesLeft.
+ unsigned TotalCycles;
+ // This field is set to true only if there are no dependent writes, and
+ // there are no 'CyclesLeft' to wait for.
+ bool IsReady;
+
+public:
+ bool isReady() const { return IsReady; }
+
+ ReadState(const ReadDescriptor &Desc, unsigned RegID)
+ : RD(Desc), RegisterID(RegID), DependentWrites(0),
+ CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true) {}
+ ReadState(const ReadState &Other) = delete;
+ ReadState &operator=(const ReadState &Other) = delete;
+
+ const ReadDescriptor &getDescriptor() const { return RD; }
+ unsigned getSchedClass() const { return RD.SchedClassID; }
+ unsigned getRegisterID() const { return RegisterID; }
+
+ void cycleEvent();
+ void writeStartEvent(unsigned Cycles);
+ void setDependentWrites(unsigned Writes) {
+ DependentWrites = Writes;
+ IsReady = !Writes;
+ }
+};
+
+/// A sequence of cycles.
+///
+/// This class can be used as a building block to construct ranges of cycles.
+class CycleSegment {
+ unsigned Begin; // Inclusive.
+ unsigned End; // Exclusive.
+ bool Reserved; // Resources associated to this segment must be reserved.
+
+public:
+ CycleSegment(unsigned StartCycle, unsigned EndCycle, bool IsReserved = false)
+ : Begin(StartCycle), End(EndCycle), Reserved(IsReserved) {}
+
+ bool contains(unsigned Cycle) const { return Cycle >= Begin && Cycle < End; }
+ bool startsAfter(const CycleSegment &CS) const { return End <= CS.Begin; }
+ bool endsBefore(const CycleSegment &CS) const { return Begin >= CS.End; }
+ bool overlaps(const CycleSegment &CS) const {
+ return !startsAfter(CS) && !endsBefore(CS);
+ }
+ bool isExecuting() const { return Begin == 0 && End != 0; }
+ bool isExecuted() const { return End == 0; }
+ bool operator<(const CycleSegment &Other) const {
+ return Begin < Other.Begin;
+ }
+ CycleSegment &operator--(void) {
+ if (Begin)
+ Begin--;
+ if (End)
+ End--;
+ return *this;
+ }
+
+ bool isValid() const { return Begin <= End; }
+ unsigned size() const { return End - Begin; }
+ void Subtract(unsigned Cycles) {
+ assert(End >= Cycles);
+ End -= Cycles;
+ }
+
+ unsigned begin() const { return Begin; }
+ unsigned end() const { return End; }
+ void setEnd(unsigned NewEnd) { End = NewEnd; }
+ bool isReserved() const { return Reserved; }
+ void setReserved() { Reserved = true; }
+};
+
+/// Helper used by class InstrDesc to describe how hardware resources
+/// are used.
+///
+/// This class describes how many resource units of a specific resource kind
+/// (and how many cycles) are "used" by an instruction.
+struct ResourceUsage {
+ CycleSegment CS;
+ unsigned NumUnits;
+ ResourceUsage(CycleSegment Cycles, unsigned Units = 1)
+ : CS(Cycles), NumUnits(Units) {}
+ unsigned size() const { return CS.size(); }
+ bool isReserved() const { return CS.isReserved(); }
+ void setReserved() { CS.setReserved(); }
+};
+
+/// An instruction descriptor.
+struct InstrDesc {
+ std::vector<WriteDescriptor> Writes; // Implicit writes are at the end.
+ std::vector<ReadDescriptor> Reads; // Implicit reads are at the end.
+
+ // For every resource used by an instruction of this kind, this vector
+ // reports the number of "consumed cycles".
+ std::vector<std::pair<uint64_t, ResourceUsage>> Resources;
+
+ // A list of buffered resources consumed by this instruction.
+ std::vector<uint64_t> Buffers;
+ unsigned MaxLatency;
+ // Number of MicroOps for this instruction.
+ unsigned NumMicroOps;
+
+ bool MayLoad;
+ bool MayStore;
+ bool HasSideEffects;
+
+ // A zero latency instruction doesn't consume any scheduler resources.
+ bool isZeroLatency() const { return !MaxLatency && Resources.empty(); }
+};
+
+/// An instruction propagated through the simulated instruction pipeline.
+///
+/// This class is used to monitor changes to the internal state of instructions
+/// that are sent to the various components of the simulated hardware pipeline.
+class Instruction {
+ const InstrDesc &Desc;
+
+ enum InstrStage {
+ IS_INVALID, // Instruction in an invalid state.
+ IS_AVAILABLE, // Instruction dispatched but operands are not ready.
+ IS_READY, // Instruction dispatched and operands ready.
+ IS_EXECUTING, // Instruction issued.
+ IS_EXECUTED, // Instruction executed. Values are written back.
+ IS_RETIRED // Instruction retired.
+ };
+
+ // The current instruction stage.
+ enum InstrStage Stage;
+
+ // This value is set to the instruction latency when the instruction is
+ // issued (see method execute()). The instruction is considered executed
+ // when field CyclesLeft goes to zero.
+ int CyclesLeft;
+
+ // Retire Unit token ID for this instruction.
+ unsigned RCUTokenID;
+
+ using UniqueDef = std::unique_ptr<WriteState>;
+ using UniqueUse = std::unique_ptr<ReadState>;
+ using VecDefs = std::vector<UniqueDef>;
+ using VecUses = std::vector<UniqueUse>;
+
+ // Output dependencies.
+ // One entry for each implicit and explicit register definition.
+ VecDefs Defs;
+
+ // Input dependencies.
+ // One entry for each implicit and explicit register use.
+ VecUses Uses;
+
+public:
+ Instruction(const InstrDesc &D)
+ : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES) {}
+ Instruction(const Instruction &Other) = delete;
+ Instruction &operator=(const Instruction &Other) = delete;
+
+ VecDefs &getDefs() { return Defs; }
+ const VecDefs &getDefs() const { return Defs; }
+ VecUses &getUses() { return Uses; }
+ const VecUses &getUses() const { return Uses; }
+ const InstrDesc &getDesc() const { return Desc; }
+ unsigned getRCUTokenID() const { return RCUTokenID; }
+ int getCyclesLeft() const { return CyclesLeft; }
+
+ unsigned getNumUsers() const {
+ unsigned NumUsers = 0;
+ for (const UniqueDef &Def : Defs)
+ NumUsers += Def->getNumUsers();
+ return NumUsers;
+ }
+
+ // Transition to the dispatch stage, and assign an RCUToken to this
+ // instruction. The RCUToken is used to track the completion of every
+ // register write performed by this instruction.
+ void dispatch(unsigned RCUTokenID);
+
+ // Instruction issued. Transition to the IS_EXECUTING state, and update
+ // all the definitions.
+ void execute();
+
+ // Force a transition from the IS_AVAILABLE state to the IS_READY state if
+ // input operands are all ready. State transitions normally occur at the
+ // beginning of a new cycle (see method cycleEvent()). However, the scheduler
+ // may decide to promote instructions from the wait queue to the ready queue
+ // as the result of another issue event. This method is called every time the
+ // instruction's state might have changed.
+ void update();
+
+ bool isDispatched() const { return Stage == IS_AVAILABLE; }
+ bool isReady() const { return Stage == IS_READY; }
+ bool isExecuting() const { return Stage == IS_EXECUTING; }
+ bool isExecuted() const { return Stage == IS_EXECUTED; }
+ bool isRetired() const { return Stage == IS_RETIRED; }
+
+ void retire() {
+ assert(isExecuted() && "Instruction is in an invalid state!");
+ Stage = IS_RETIRED;
+ }
+
+ void cycleEvent();
+};
+
+/// An InstRef is a pair of a SourceMgr index and an Instruction pointer. The
+/// index uniquely identifies the instruction, and llvm-mca uses it as a key
+/// throughout the tool.
+class InstRef : public std::pair<unsigned, Instruction *> {
+public:
+ InstRef() : std::pair<unsigned, Instruction *>(0, nullptr) {}
+ InstRef(unsigned Index, Instruction *I)
+ : std::pair<unsigned, Instruction *>(Index, I) {}
+
+ unsigned getSourceIndex() const { return first; }
+ Instruction *getInstruction() { return second; }
+ const Instruction *getInstruction() const { return second; }
+
+ /// Returns true if this InstRef has been populated.
+ bool isValid() const { return second != nullptr; }
+
+#ifndef NDEBUG
+ void print(llvm::raw_ostream &OS) const { OS << getSourceIndex(); }
+#endif
+};
+
+#ifndef NDEBUG
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const InstRef &IR) {
+ IR.print(OS);
+ return OS;
+}
+#endif
+
+/// A reference to a register write.
+///
+/// This class is mainly used by the register file to describe register
+/// mappings. It correlates a register write to the source index of the
+/// defining instruction.
+class WriteRef {
+ std::pair<unsigned, WriteState *> Data;
+ static const unsigned INVALID_IID;
+
+public:
+ WriteRef() : Data(INVALID_IID, nullptr) {}
+ WriteRef(unsigned SourceIndex, WriteState *WS) : Data(SourceIndex, WS) {}
+
+ unsigned getSourceIndex() const { return Data.first; }
+ const WriteState *getWriteState() const { return Data.second; }
+ WriteState *getWriteState() { return Data.second; }
+ void invalidate() { Data = std::make_pair(INVALID_IID, nullptr); }
+
+ bool isValid() const {
+ return Data.first != INVALID_IID && Data.second != nullptr;
+ }
+ bool operator==(const WriteRef &Other) const {
+ return Data == Other.Data;
+ }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+} // namespace mca
+
+#endif
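
A sketch of the state transitions described above, as they would be driven by the
dispatch, execute and retire logic (Desc is assumed to be an InstrDesc obtained
from InstrBuilder; an instruction with no register operands is a degenerate but
legal case for this illustration):

  mca::Instruction Inst(Desc);

  Inst.dispatch(/* RCUTokenID */ 0); // IS_INVALID -> IS_AVAILABLE (update() may
                                     // immediately promote it to IS_READY).
  if (Inst.isReady())
    Inst.execute();                  // IS_READY -> IS_EXECUTING.

  while (Inst.isExecuting())
    Inst.cycleEvent();               // Counts down Desc.MaxLatency cycles.

  if (Inst.isExecuted())
    Inst.retire();                   // IS_EXECUTED -> IS_RETIRED.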
diff --git a/contrib/llvm/tools/llvm-mca/InstructionInfoView.cpp b/contrib/llvm/tools/llvm-mca/InstructionInfoView.cpp
new file mode 100644
index 000000000000..0e50a96d19c1
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/InstructionInfoView.cpp
@@ -0,0 +1,91 @@
+//===--------------------- InstructionInfoView.cpp --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the InstructionInfoView API.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstructionInfoView.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void InstructionInfoView::printView(raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ const MCSchedModel &SM = STI.getSchedModel();
+ unsigned Instructions = Source.size();
+
+ std::string Instruction;
+ raw_string_ostream InstrStream(Instruction);
+
+ TempStream << "\n\nInstruction Info:\n";
+ TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n"
+ << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n\n";
+
+ TempStream << "[1] [2] [3] [4] [5] [6] Instructions:\n";
+ for (unsigned I = 0, E = Instructions; I < E; ++I) {
+ const MCInst &Inst = Source.getMCInstFromIndex(I);
+ const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode());
+
+ // Obtain the scheduling class information from the instruction.
+ unsigned SchedClassID = MCDesc.getSchedClass();
+ unsigned CPUID = SM.getProcessorID();
+
+ // Try to solve variant scheduling classes.
+ while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
+ SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &Inst, CPUID);
+
+ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+ unsigned NumMicroOpcodes = SCDesc.NumMicroOps;
+ unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+ Optional<double> RThroughput =
+ MCSchedModel::getReciprocalThroughput(STI, SCDesc);
+
+ TempStream << ' ' << NumMicroOpcodes << " ";
+ if (NumMicroOpcodes < 10)
+ TempStream << " ";
+ else if (NumMicroOpcodes < 100)
+ TempStream << ' ';
+ TempStream << Latency << " ";
+ if (Latency < 10)
+ TempStream << " ";
+ else if (Latency < 100)
+ TempStream << ' ';
+
+ if (RThroughput.hasValue()) {
+ double RT = RThroughput.getValue();
+ TempStream << format("%.2f", RT) << ' ';
+ if (RT < 10.0)
+ TempStream << " ";
+ else if (RT < 100.0)
+ TempStream << ' ';
+ } else {
+ TempStream << " - ";
+ }
+ TempStream << (MCDesc.mayLoad() ? " * " : " ");
+ TempStream << (MCDesc.mayStore() ? " * " : " ");
+ TempStream << (MCDesc.hasUnmodeledSideEffects() ? " U " : " ");
+
+ MCIP.printInst(&Inst, InstrStream, "", STI);
+ InstrStream.flush();
+
+ // Consume any tabs or spaces at the beginning of the string.
+ StringRef Str(Instruction);
+ Str = Str.ltrim();
+ TempStream << " " << Str << '\n';
+ Instruction = "";
+ }
+
+ TempStream.flush();
+ OS << Buffer;
+}
+} // namespace mca.
diff --git a/contrib/llvm/tools/llvm-mca/InstructionInfoView.h b/contrib/llvm/tools/llvm-mca/InstructionInfoView.h
new file mode 100644
index 000000000000..0770ae3d2b57
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/InstructionInfoView.h
@@ -0,0 +1,66 @@
+//===--------------------- InstructionInfoView.h ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the instruction info view.
+///
+/// The goal of the instruction info view is to print the latency and reciprocal
+/// throughput information for every instruction in the input sequence.
+/// This section also reports extra information related to the number of micro
+/// opcodes, and opcode properties (i.e., 'MayLoad', 'MayStore', and
+/// 'HasSideEffects').
+///
+/// Example:
+///
+/// Instruction Info:
+/// [1]: #uOps
+/// [2]: Latency
+/// [3]: RThroughput
+/// [4]: MayLoad
+/// [5]: MayStore
+/// [6]: HasSideEffects
+///
+/// [1] [2] [3] [4] [5] [6] Instructions:
+/// 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2
+/// 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3
+/// 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
+
+#include "SourceMgr.h"
+#include "View.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+/// A view that prints out generic instruction information.
+class InstructionInfoView : public View {
+ const llvm::MCSubtargetInfo &STI;
+ const llvm::MCInstrInfo &MCII;
+ const SourceMgr &Source;
+ llvm::MCInstPrinter &MCIP;
+
+public:
+ InstructionInfoView(const llvm::MCSubtargetInfo &sti,
+ const llvm::MCInstrInfo &mcii, const SourceMgr &S,
+ llvm::MCInstPrinter &IP)
+ : STI(sti), MCII(mcii), Source(S), MCIP(IP) {}
+
+ void printView(llvm::raw_ostream &OS) const override;
+};
+} // namespace mca
+
+#endif
diff --git a/contrib/llvm/tools/llvm-mca/InstructionTables.cpp b/contrib/llvm/tools/llvm-mca/InstructionTables.cpp
new file mode 100644
index 000000000000..9b9dbc37fbdb
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/InstructionTables.cpp
@@ -0,0 +1,70 @@
+//===--------------------- InstructionTables.cpp ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the method InstructionTables::execute().
+/// Method execute() prints a theoretical resource pressure distribution based
+/// on the information available in the scheduling model, and without running
+/// the pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstructionTables.h"
+
+namespace mca {
+
+using namespace llvm;
+
+bool InstructionTables::execute(InstRef &IR) {
+ ArrayRef<uint64_t> Masks = IB.getProcResourceMasks();
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ UsedResources.clear();
+
+ // Identify the resources consumed by this instruction.
+ for (const std::pair<uint64_t, ResourceUsage> &Resource : Desc.Resources) {
+ // Skip zero-cycle resources (i.e., unused resources).
+ if (!Resource.second.size())
+ continue;
+ double Cycles = static_cast<double>(Resource.second.size());
+ unsigned Index = std::distance(
+ Masks.begin(), std::find(Masks.begin(), Masks.end(), Resource.first));
+ const MCProcResourceDesc &ProcResource = *SM.getProcResource(Index);
+ unsigned NumUnits = ProcResource.NumUnits;
+ if (!ProcResource.SubUnitsIdxBegin) {
+ // The number of cycles consumed by each unit.
+ Cycles /= NumUnits;
+ for (unsigned I = 0, E = NumUnits; I < E; ++I) {
+ ResourceRef ResourceUnit = std::make_pair(Index, 1U << I);
+ UsedResources.emplace_back(std::make_pair(ResourceUnit, Cycles));
+ }
+ continue;
+ }
+
+ // This is a group. Obtain the set of resources contained in this
+ // group. Some of these resources may implement multiple units.
+ // Uniformly distribute Cycles across all of the units.
+ for (unsigned I1 = 0; I1 < NumUnits; ++I1) {
+ unsigned SubUnitIdx = ProcResource.SubUnitsIdxBegin[I1];
+ const MCProcResourceDesc &SubUnit = *SM.getProcResource(SubUnitIdx);
+ // Compute the number of cycles consumed by each resource unit.
+ double RUCycles = Cycles / (NumUnits * SubUnit.NumUnits);
+ for (unsigned I2 = 0, E2 = SubUnit.NumUnits; I2 < E2; ++I2) {
+ ResourceRef ResourceUnit = std::make_pair(SubUnitIdx, 1U << I2);
+ UsedResources.emplace_back(std::make_pair(ResourceUnit, RUCycles));
+ }
+ }
+ }
+
+ // Send a fake instruction issued event to all the views.
+ HWInstructionIssuedEvent Event(IR, UsedResources);
+ notifyEvent<HWInstructionIssuedEvent>(Event);
+ return true;
+}
+
+} // namespace mca
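
As a worked example of the distribution logic above (with illustrative numbers,
not tied to any specific scheduling model): for a standalone resource with
NumUnits = 2 and 2 consumed cycles, each of the two units is charged 2 / 2 = 1.0
cycles; for a resource group with NumUnits = 2 whose sub-resources each implement
a single unit, the same 2 cycles are spread as 2 / (2 * 1) = 1.0 cycles per
resource unit.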
diff --git a/contrib/llvm/tools/llvm-mca/InstructionTables.h b/contrib/llvm/tools/llvm-mca/InstructionTables.h
new file mode 100644
index 000000000000..18e019988430
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/InstructionTables.h
@@ -0,0 +1,43 @@
+//===--------------------- InstructionTables.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a custom stage to generate instruction tables.
+/// See the description of command-line flag -instruction-tables in
+/// docs/CommandGuide/llvm-mca.rst
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H
+
+#include "InstrBuilder.h"
+#include "Scheduler.h"
+#include "Stage.h"
+#include "View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSchedule.h"
+
+namespace mca {
+
+class InstructionTables : public Stage {
+ const llvm::MCSchedModel &SM;
+ InstrBuilder &IB;
+ llvm::SmallVector<std::pair<ResourceRef, double>, 4> UsedResources;
+
+public:
+ InstructionTables(const llvm::MCSchedModel &Model, InstrBuilder &Builder)
+ : Stage(), SM(Model), IB(Builder) {}
+
+ bool hasWorkToComplete() const override final { return false; }
+ bool execute(InstRef &IR) override final;
+};
+} // namespace mca
+
+#endif
diff --git a/contrib/llvm/tools/llvm-mca/LSUnit.cpp b/contrib/llvm/tools/llvm-mca/LSUnit.cpp
new file mode 100644
index 000000000000..9ee3b6171893
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/LSUnit.cpp
@@ -0,0 +1,148 @@
+//===----------------------- LSUnit.cpp --------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load-Store Unit for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#include "LSUnit.h"
+#include "Instruction.h"
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+#ifndef NDEBUG
+void LSUnit::dump() const {
+ dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
+ dbgs() << "[LSUnit] SQ_Size = " << SQ_Size << '\n';
+ dbgs() << "[LSUnit] NextLQSlotIdx = " << LoadQueue.size() << '\n';
+ dbgs() << "[LSUnit] NextSQSlotIdx = " << StoreQueue.size() << '\n';
+}
+#endif
+
+void LSUnit::assignLQSlot(unsigned Index) {
+ assert(!isLQFull());
+ assert(LoadQueue.count(Index) == 0);
+
+ LLVM_DEBUG(dbgs() << "[LSUnit] - AssignLQSlot <Idx=" << Index
+ << ",slot=" << LoadQueue.size() << ">\n");
+ LoadQueue.insert(Index);
+}
+
+void LSUnit::assignSQSlot(unsigned Index) {
+ assert(!isSQFull());
+ assert(StoreQueue.count(Index) == 0);
+
+ LLVM_DEBUG(dbgs() << "[LSUnit] - AssignSQSlot <Idx=" << Index
+ << ",slot=" << StoreQueue.size() << ">\n");
+ StoreQueue.insert(Index);
+}
+
+bool LSUnit::reserve(const InstRef &IR) {
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ unsigned MayLoad = Desc.MayLoad;
+ unsigned MayStore = Desc.MayStore;
+ unsigned IsMemBarrier = Desc.HasSideEffects;
+ if (!MayLoad && !MayStore)
+ return false;
+
+ const unsigned Index = IR.getSourceIndex();
+ if (MayLoad) {
+ if (IsMemBarrier)
+ LoadBarriers.insert(Index);
+ assignLQSlot(Index);
+ }
+ if (MayStore) {
+ if (IsMemBarrier)
+ StoreBarriers.insert(Index);
+ assignSQSlot(Index);
+ }
+ return true;
+}
+
+bool LSUnit::isReady(const InstRef &IR) const {
+ const unsigned Index = IR.getSourceIndex();
+ bool IsALoad = LoadQueue.count(Index) != 0;
+ bool IsAStore = StoreQueue.count(Index) != 0;
+ assert((IsALoad || IsAStore) && "Instruction is not in queue!");
+
+ if (IsALoad && !LoadBarriers.empty()) {
+ unsigned LoadBarrierIndex = *LoadBarriers.begin();
+ if (Index > LoadBarrierIndex)
+ return false;
+ if (Index == LoadBarrierIndex && Index != *LoadQueue.begin())
+ return false;
+ }
+
+ if (IsAStore && !StoreBarriers.empty()) {
+ unsigned StoreBarrierIndex = *StoreBarriers.begin();
+ if (Index > StoreBarrierIndex)
+ return false;
+ if (Index == StoreBarrierIndex && Index != *StoreQueue.begin())
+ return false;
+ }
+
+ if (NoAlias && IsALoad)
+ return true;
+
+ if (StoreQueue.size()) {
+ // Check if this memory operation is younger than the oldest store in the queue.
+ if (Index > *StoreQueue.begin())
+ return false;
+ }
+
+ // Okay, we are older than the oldest store in the queue.
+ // If there are no pending loads, then we can say for sure that this
+ // instruction is ready.
+ if (isLQEmpty())
+ return true;
+
+ // Check if there are no older loads.
+ if (Index <= *LoadQueue.begin())
+ return true;
+
+ // There is at least one older load in the queue.
+ return !IsAStore;
+}
+
+void LSUnit::onInstructionExecuted(const InstRef &IR) {
+ const unsigned Index = IR.getSourceIndex();
+ std::set<unsigned>::iterator it = LoadQueue.find(Index);
+ if (it != LoadQueue.end()) {
+ LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+ << " has been removed from the load queue.\n");
+ LoadQueue.erase(it);
+ }
+
+ it = StoreQueue.find(Index);
+ if (it != StoreQueue.end()) {
+ LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+ << " has been removed from the store queue.\n");
+ StoreQueue.erase(it);
+ }
+
+ if (!StoreBarriers.empty() && Index == *StoreBarriers.begin()) {
+ LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+ << " has been removed from the set of store barriers.\n");
+ StoreBarriers.erase(StoreBarriers.begin());
+ }
+ if (!LoadBarriers.empty() && Index == *LoadBarriers.begin()) {
+ LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+ << " has been removed from the set of load barriers.\n");
+ LoadBarriers.erase(LoadBarriers.begin());
+ }
+}
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/LSUnit.h b/contrib/llvm/tools/llvm-mca/LSUnit.h
new file mode 100644
index 000000000000..817522190589
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/LSUnit.h
@@ -0,0 +1,147 @@
+//===------------------------- LSUnit.h --------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load/Store unit class that models load/store queues and that implements
+/// a simple weak memory consistency model.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+
+#include <set>
+
+namespace mca {
+
+class InstRef;
+struct InstrDesc;
+
+/// A Load/Store Unit implementing load and store queues.
+///
+/// This class implements a load queue and a store queue to emulate the
+/// out-of-order execution of memory operations.
+/// Each load (or store) consumes an entry in the load (or store) queue.
+///
+/// Rules are:
+/// 1) A younger load is allowed to pass an older load only if there are no
+/// stores nor barriers in between the two loads.
+/// 2) A younger store is not allowed to pass an older store.
+/// 3) A younger store is not allowed to pass an older load.
+/// 4) A younger load is allowed to pass an older store only if the load does
+/// not alias with the store.
+///
+/// This class optimistically assumes that loads don't alias store operations.
+/// Under this assumption, younger loads are always allowed to pass older
+/// stores (this only affects rule 4).
+/// Essentially, this LSUnit doesn't attempt to run any sort of alias analysis
+/// to predict when loads and stores don't alias with each other.
+///
+/// To model the possibility that loads alias with stores, flag `AssumeNoAlias`
+/// must be set to `false` when constructing the LSUnit.
+///
+/// In the case of write-combining memory, rule 2 could be relaxed to allow
+/// reordering of non-aliasing store operations. At the moment, this is not
+/// allowed.
+/// To put it in another way, there is no option to specify a different memory
+/// type for memory operations (example: write-through, write-combining, etc.).
+/// Also, there is no way to weaken the memory model, and this unit currently
+/// doesn't support write-combining behavior.
+///
+/// No assumptions are made on the size of the store buffer.
+/// As mentioned before, this class doesn't perform alias analysis.
+/// Consequently, LSUnit doesn't know how to identify cases where
+/// store-to-load forwarding may occur.
+///
+/// LSUnit doesn't attempt to predict whether a load or store hits or misses
+/// the L1 cache. To be more specific, LSUnit doesn't know anything about
+/// the cache hierarchy and memory types.
+/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
+/// scheduling model provides an "optimistic" load-to-use latency (which usually
+/// matches the load-to-use latency for when there is a hit in the L1D).
+///
+/// Class MCInstrDesc in LLVM doesn't know about serializing operations or
+/// memory-barrier-like instructions.
+/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
+/// `unmodeled side effects` behaves like a "soft" load-barrier. That means, it
+/// serializes loads without forcing a flush of the load queue.
+/// Similarly, instructions that both `mayStore` and have `unmodeled side
+/// effects` are treated like store barriers. A full memory
+/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
+/// effects. This is obviously inaccurate, but this is the best that we can do
+/// at the moment.
+///
+/// Each load/store barrier consumes one entry in the load/store queue. A
+/// load/store barrier enforces ordering of loads/stores:
+/// - A younger load cannot pass a load barrier.
+/// - A younger store cannot pass a store barrier.
+///
+/// A younger load has to wait for the memory load barrier to execute.
+/// A load/store barrier is "executed" when it becomes the oldest entry in
+/// the load/store queue(s). That also means, all the older loads/stores have
+/// already been executed.
+class LSUnit {
+ // Load queue size.
+ // LQ_Size == 0 means that there are infinite slots in the load queue.
+ unsigned LQ_Size;
+
+ // Store queue size.
+ // SQ_Size == 0 means that there are infinite slots in the store queue.
+ unsigned SQ_Size;
+
+ // If true, loads will never alias with stores. This is the default.
+ bool NoAlias;
+
+ std::set<unsigned> LoadQueue;
+ std::set<unsigned> StoreQueue;
+
+ void assignLQSlot(unsigned Index);
+ void assignSQSlot(unsigned Index);
+ bool isReadyNoAlias(unsigned Index) const;
+
+ // An instruction that may store and has unmodeled side effects is
+ // conservatively treated as a store barrier. It forces older stores to be
+ // executed before newer stores are issued.
+ std::set<unsigned> StoreBarriers;
+
+ // An instruction that may load and has unmodeled side effects is
+ // conservatively treated as a load barrier. It forces older loads to execute
+ // before newer loads are issued.
+ std::set<unsigned> LoadBarriers;
+
+public:
+ LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
+ : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+
+ bool isSQEmpty() const { return StoreQueue.empty(); }
+ bool isLQEmpty() const { return LoadQueue.empty(); }
+ bool isSQFull() const { return SQ_Size != 0 && StoreQueue.size() == SQ_Size; }
+ bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
+
+ // Returns true if this instruction has been successfully enqueued.
+ bool reserve(const InstRef &IR);
+
+ // The rules are:
+ // 1. A store may not pass a previous store.
+ // 2. A load may not pass a previous store unless flag 'NoAlias' is set.
+ // 3. A load may pass a previous load.
+ // 4. A store may not pass a previous load (regardless of flag 'NoAlias').
+ // 5. A load has to wait until an older load barrier is fully executed.
+ // 6. A store has to wait until an older store barrier is fully executed.
+ bool isReady(const InstRef &IR) const;
+ void onInstructionExecuted(const InstRef &IR);
+};
+
+} // namespace mca
+
+#endif
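
A minimal sketch of how a scheduler might drive the unit (IR is assumed to be a
valid InstRef whose InstrDesc has MayLoad/MayStore set; the queue sizes are
illustrative):

  mca::LSUnit LSU(/* LQ */ 16, /* SQ */ 20, /* AssumeNoAlias */ false);

  if (LSU.reserve(IR)) {            // Consumes a load and/or store queue entry.
    if (LSU.isReady(IR)) {
      // The memory operation can be issued according to rules 1-6 above.
    }
    LSU.onInstructionExecuted(IR);  // Releases the queue entries (and barriers).
  }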
diff --git a/contrib/llvm/tools/llvm-mca/Pipeline.cpp b/contrib/llvm/tools/llvm-mca/Pipeline.cpp
new file mode 100644
index 000000000000..7c937e7b48b5
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Pipeline.cpp
@@ -0,0 +1,99 @@
+//===--------------------- Pipeline.cpp -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements an ordered container of stages that simulate the
+/// pipeline of a hardware backend.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Pipeline.h"
+#include "HWEventListener.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+using namespace llvm;
+
+void Pipeline::addEventListener(HWEventListener *Listener) {
+ if (Listener)
+ Listeners.insert(Listener);
+ for (auto &S : Stages)
+ S->addListener(Listener);
+}
+
+bool Pipeline::hasWorkToProcess() {
+ const auto It = llvm::find_if(Stages, [](const std::unique_ptr<Stage> &S) {
+ return S->hasWorkToComplete();
+ });
+ return It != Stages.end();
+}
+
+// This routine returns early if any stage returns 'false' after execute() is
+// called on it.
+bool Pipeline::executeStages(InstRef &IR) {
+ for (const std::unique_ptr<Stage> &S : Stages)
+ if (!S->execute(IR))
+ return false;
+ return true;
+}
+
+void Pipeline::preExecuteStages() {
+ for (const std::unique_ptr<Stage> &S : Stages)
+ S->preExecute();
+}
+
+void Pipeline::postExecuteStages() {
+ for (const std::unique_ptr<Stage> &S : Stages)
+ S->postExecute();
+}
+
+void Pipeline::run() {
+ while (hasWorkToProcess()) {
+ notifyCycleBegin();
+ runCycle();
+ notifyCycleEnd();
+ ++Cycles;
+ }
+}
+
+void Pipeline::runCycle() {
+ // Update the stages before we do any processing for this cycle.
+ InstRef IR;
+ for (auto &S : Stages)
+ S->cycleStart();
+
+ // Continue executing this cycle until any stage claims it cannot make
+ // progress.
+ while (true) {
+ preExecuteStages();
+ if (!executeStages(IR))
+ break;
+ postExecuteStages();
+ }
+
+ for (auto &S : Stages)
+ S->cycleEnd();
+}
+
+void Pipeline::notifyCycleBegin() {
+ LLVM_DEBUG(dbgs() << "[E] Cycle begin: " << Cycles << '\n');
+ for (HWEventListener *Listener : Listeners)
+ Listener->onCycleBegin();
+}
+
+void Pipeline::notifyCycleEnd() {
+ LLVM_DEBUG(dbgs() << "[E] Cycle end: " << Cycles << "\n\n");
+ for (HWEventListener *Listener : Listeners)
+ Listener->onCycleEnd();
+}
+} // namespace mca.
diff --git a/contrib/llvm/tools/llvm-mca/Pipeline.h b/contrib/llvm/tools/llvm-mca/Pipeline.h
new file mode 100644
index 000000000000..6916e422be39
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Pipeline.h
@@ -0,0 +1,79 @@
+//===--------------------- Pipeline.h ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements an ordered container of stages that simulate the
+/// pipeline of a hardware backend.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_PIPELINE_H
+#define LLVM_TOOLS_LLVM_MCA_PIPELINE_H
+
+#include "Scheduler.h"
+#include "Stage.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace mca {
+
+class HWEventListener;
+class HWInstructionEvent;
+class HWStallEvent;
+
+/// A pipeline for a specific subtarget.
+///
+/// It emulates an out-of-order execution of instructions. Instructions are
+/// fetched from a MCInst sequence managed by an initial 'Fetch' stage.
+/// Instructions are firstly fetched, then dispatched to the schedulers, and
+/// then executed.
+///
+/// This class tracks the lifetime of an instruction from the moment where
+/// it gets dispatched to the schedulers, to the moment where it finishes
+/// executing and register writes are architecturally committed.
+/// In particular, it monitors changes in the state of every instruction
+/// in flight.
+///
+/// Instructions are executed in a loop of iterations. The number of iterations
+/// is defined by the SourceMgr object, which is managed by the initial stage
+/// of the instruction pipeline.
+///
+/// The Pipeline entry point is method 'run()', which executes cycles in a loop
+/// while there are still instructions to dispatch, or instructions in flight
+/// that have not yet been retired.
+///
+/// Internally, the Pipeline collects statistical information in the form of
+/// histograms. For example, it tracks how the dispatch group size changes
+/// over time.
+class Pipeline {
+ Pipeline(const Pipeline &P) = delete;
+ Pipeline &operator=(const Pipeline &P) = delete;
+
+ /// An ordered list of stages that define this instruction pipeline.
+ llvm::SmallVector<std::unique_ptr<Stage>, 8> Stages;
+ std::set<HWEventListener *> Listeners;
+ unsigned Cycles;
+
+ void preExecuteStages();
+ bool executeStages(InstRef &IR);
+ void postExecuteStages();
+ void runCycle();
+
+ bool hasWorkToProcess();
+ void notifyCycleBegin();
+ void notifyCycleEnd();
+
+public:
+ Pipeline() : Cycles(0) {}
+ void appendStage(std::unique_ptr<Stage> S) { Stages.push_back(std::move(S)); }
+ void run();
+ void addEventListener(HWEventListener *Listener);
+};
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_PIPELINE_H
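
A sketch of how a pipeline is assembled and run (the concrete stage names below
are hypothetical placeholders for Stage subclasses provided elsewhere by the
tool; only the Pipeline API shown above is used):

  mca::Pipeline P;
  P.appendStage(llvm::make_unique<SomeFetchStage>(/* ... */));    // hypothetical
  P.appendStage(llvm::make_unique<SomeExecuteStage>(/* ... */));  // hypothetical
  P.addEventListener(&Listener);  // Listener: any HWEventListener implementation.
  P.run();                        // Executes cycles until no stage has work left.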
diff --git a/contrib/llvm/tools/llvm-mca/PipelinePrinter.cpp b/contrib/llvm/tools/llvm-mca/PipelinePrinter.cpp
new file mode 100644
index 000000000000..c5b1a12b792f
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/PipelinePrinter.cpp
@@ -0,0 +1,26 @@
+//===--------------------- PipelinePrinter.cpp ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the PipelinePrinter interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "PipelinePrinter.h"
+#include "View.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void PipelinePrinter::printReport(llvm::raw_ostream &OS) const {
+ for (const auto &V : Views)
+ V->printView(OS);
+}
+} // namespace mca.
diff --git a/contrib/llvm/tools/llvm-mca/PipelinePrinter.h b/contrib/llvm/tools/llvm-mca/PipelinePrinter.h
new file mode 100644
index 000000000000..fe871414418f
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/PipelinePrinter.h
@@ -0,0 +1,52 @@
+//===--------------------- PipelinePrinter.h --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements class PipelinePrinter.
+///
+/// PipelinePrinter allows the customization of the performance report.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
+#define LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
+
+#include "Pipeline.h"
+#include "View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+/// A printer class that knows how to collect statistics on the
+/// code analyzed by the llvm-mca tool.
+///
+/// This class knows how to print out the analysis information collected
+/// during the execution of the code. Internally, it delegates to other
+/// classes the task of printing out timeline information as well as
+/// resource pressure.
+class PipelinePrinter {
+ Pipeline &P;
+ llvm::SmallVector<std::unique_ptr<View>, 8> Views;
+
+public:
+ PipelinePrinter(Pipeline &pipeline) : P(pipeline) {}
+
+ void addView(std::unique_ptr<View> V) {
+ P.addEventListener(V.get());
+ Views.emplace_back(std::move(V));
+ }
+
+ void printReport(llvm::raw_ostream &OS) const;
+};
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
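
A sketch of how the printer ties the pipeline and its views together (P is an
mca::Pipeline; STI, MCII, S and IP are assumed to be the objects required by the
InstructionInfoView constructor shown earlier):

  mca::PipelinePrinter Printer(P);
  Printer.addView(
      llvm::make_unique<mca::InstructionInfoView>(STI, MCII, S, IP));
  P.run();                           // Run the simulation; listener views observe events.
  Printer.printReport(llvm::outs()); // Print every registered view's output.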
diff --git a/contrib/llvm/tools/llvm-mca/README.txt b/contrib/llvm/tools/llvm-mca/README.txt
new file mode 100644
index 000000000000..8b1670db0fca
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/README.txt
@@ -0,0 +1,865 @@
+llvm-mca - LLVM Machine Code Analyzer
+-------------------------------------
+
+llvm-mca is a performance analysis tool that uses information which is already
+available in LLVM (e.g., scheduling models) to statically measure the
+performance of machine code on a specific CPU.
+
+Performance is measured in terms of throughput as well as processor resource
+consumption. The tool currently works for processors with an out-of-order
+backend, for which there is a scheduling model available in LLVM.
+
+The main goal of this tool is not just to predict the performance of the code
+when run on the target, but also to help diagnose potential performance
+issues.
+
+Given an assembly code sequence, llvm-mca estimates the IPC (Instructions Per
+Cycle), as well as hardware resource pressure. The analysis and reporting style
+were inspired by the IACA tool from Intel.
+
+The presence of long data dependency chains, as well as poor usage of hardware
+resources may lead to bottlenecks in the backend. The tool is able to generate
+a detailed report which should help with identifying and analyzing sources of
+bottlenecks.
+
+Scheduling models are mostly used to compute instruction latencies, to obtain
+read-advance information, and to understand how processor resources are used by
+instructions. By design, the quality of the performance analysis conducted by
+the tool is inevitably affected by the quality of the target scheduling models.
+However, scheduling models intentionally do not describe all processor details,
+since the goal is just to enable the scheduling of machine instructions during
+compilation. As a result, there are processor details which are not important
+for the purpose of scheduling instructions (and are therefore not described by
+the scheduling model), but which are very important for this tool.
+
+A few examples of details that are missing in scheduling models are:
+ - Actual dispatch width (it often differs from the issue width).
+ - Number of read/write ports in the register file(s).
+ - Length of the load/store queue in the LSUnit.
+
+It is also very difficult to find a "good" abstract model to describe the
+behavior of out-of-order processors. So, we have to keep in mind that all of
+these aspects are going to affect the quality of the static analysis performed
+by the tool.
+
+An extensive list of known limitations is reported in one of the last sections
+of this document. There is also a section related to design problems which must
+be addressed (hopefully with the help of the community). At the moment, the
+tool has been mostly tested for x86 targets, but there are still several
+limitations, some of which could be overcome by integrating extra information
+into the scheduling models.
+
+How the tool works
+------------------
+
+The tool takes assembly code as input. Assembly code is parsed into a sequence
+of MCInst with the help of the existing LLVM target assembly parsers. The parsed
+sequence of MCInst is then analyzed by a 'Pipeline' module to generate a
+performance report.
+
+The Pipeline module internally emulates the execution of the machine code
+sequence in a loop of iterations (which by default is 100). At the end of this
+process, the pipeline collects a number of statistics which are then printed out
+in the form of a report.
+
+Here is an example of performance report generated by the tool for a dot-product
+of two packed float vectors of four elements. The analysis is conducted for
+target x86, cpu btver2:
+
+///////////////////
+
+Iterations: 300
+Instructions: 900
+Total Cycles: 610
+Dispatch Width: 2
+IPC: 1.48
+
+
+Resources:
+[0] - JALU0
+[1] - JALU1
+[2] - JDiv
+[3] - JFPM
+[4] - JFPU0
+[5] - JFPU1
+[6] - JLAGU
+[7] - JSAGU
+[8] - JSTC
+[9] - JVIMUL
+
+
+Resource pressure per iteration:
+[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
+ - - - - 2.00 1.00 - - - -
+
+Resource pressure by instruction:
+[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+ - - - - - 1.00 - - - - vmulps %xmm0, %xmm1, %xmm2
+ - - - - 1.00 - - - - - vhaddps %xmm2, %xmm2, %xmm3
+ - - - - 1.00 - - - - - vhaddps %xmm3, %xmm3, %xmm4
+
+
+Instruction Info:
+[1]: #uOps
+[2]: Latency
+[3]: RThroughput
+[4]: MayLoad
+[5]: MayStore
+[6]: HasSideEffects
+
+[1] [2] [3] [4] [5] [6] Instructions:
+ 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2
+ 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3
+ 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4
+
+///////////////////
+
+According to this report, the dot-product kernel has been executed 300 times,
+for a total of 900 instructions dynamically executed.
+
+The report is structured in three main sections. A first section collects a few
+performance numbers; the goal of this section is to give a very quick overview
+of the performance throughput. In this example, the two important performance
+indicators are a) the predicted total number of cycles, and b) the IPC.
+IPC is probably the most important throughput indicator. A big delta between the
+Dispatch Width and the computed IPC is an indicator of potential performance
+issues.
+
+The second section is the so-called "resource pressure view". This view reports
+the average number of resource cycles consumed every iteration by instructions
+for every processor resource unit available on the target. Information is
+structured in two tables. The first table reports the number of resource cycles
+spent on average every iteration. The second table correlates the resource
+cycles to the machine instructions in the sequence. For example, on every
+iteration of the dot-product, instruction 'vmulps' always executes on resource unit [5]
+(JFPU1 - floating point pipeline #1), consuming an average of 1 resource cycle
+per iteration. Note that on Jaguar, vector FP multiply can only be issued to
+pipeline JFPU1, while horizontal FP adds can only be issued to pipeline JFPU0.
+
+The third (and last) section of the report shows the latency and reciprocal
+throughput of every instruction in the sequence. That section also reports extra
+information related to the number of micro opcodes, and opcode properties (i.e.,
+'MayLoad', 'MayStore', and 'UnmodeledSideEffects').
+
+The resource pressure view helps with identifying bottlenecks caused by high
+usage of specific hardware resources. Situations with resource pressure mainly
+concentrated on a few resources should, in general, be avoided. Ideally,
+pressure should be uniformly distributed between multiple resources.
+
+Timeline View
+-------------
+
+A detailed report of each instruction's state transitions over time can be
+enabled using the command line flag '-timeline'. This prints an extra section
+in the report which contains the so-called "timeline view". Below is the
+timeline view for the dot-product example from the previous section.
+
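+Assuming the same input file as before, the timeline below can be obtained by
+adding flag -timeline to the invocation and reducing the iteration count so
+that the view stays readable (only -timeline is documented here; the other
+flags are assumptions):
+
+  $ llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline dot-product.s
+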
+///////////////
+Timeline view:
+ 012345
+Index 0123456789
+
+[0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2
+[0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3
+[0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
+
+[1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
+[1,1] . D=eeeE---R . vhaddps %xmm2, %xmm2, %xmm3
+[1,2] . D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
+
+[2,0] . DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
+[2,1] . D====eeeER . vhaddps %xmm2, %xmm2, %xmm3
+[2,2] . D======eeeER vhaddps %xmm3, %xmm3, %xmm4
+
+
+Average Wait times (based on the timeline view):
+[0]: Executions
+[1]: Average time spent waiting in a scheduler's queue
+[2]: Average time spent waiting in a scheduler's queue while ready
+[3]: Average time elapsed from WB until retire stage
+
+ [0] [1] [2] [3]
+0. 3 1.0 1.0 3.3 vmulps %xmm0, %xmm1, %xmm2
+1. 3 3.3 0.7 1.0 vhaddps %xmm2, %xmm2, %xmm3
+2. 3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
+///////////////
+
+The timeline view is interesting because it shows how instructions change state
+during execution. It also gives an idea of how the tool "sees" instructions
+executed on the target.
+
+The timeline view is structured in two tables. The first table shows how
+instructions change in state over time (measured in cycles); the second table
+(named "Average Wait times") reports useful timing statistics which should help
+diagnose performance bottlenecks caused by long data dependencies and
+sub-optimal usage of hardware resources.
+
+An instruction in the timeline view is identified by a pair of indices, where
+the 'first' index identifies an iteration, and the 'second' index is the actual
+instruction index (i.e., where it appears in the code sequence).
+
+Excluding the first and last column, the remaining columns are in cycles.
+Cycles are numbered sequentially starting from 0. The following characters are
+used to describe the state of an instruction:
+
+ D : Instruction dispatched.
+ e : Instruction executing.
+ E : Instruction executed.
+ R : Instruction retired.
+ = : Instruction already dispatched, waiting to be executed.
+ - : Instruction executed, waiting to be retired.
+
+Based on the timeline view from the example, we know that:
+ - Instruction [1, 0] was dispatched at cycle 1.
+ - Instruction [1, 0] started executing at cycle 2.
+ - Instruction [1, 0] reached the write back stage at cycle 4.
+ - Instruction [1, 0] was retired at cycle 10.
+
+Instruction [1, 0] (i.e., the vmulps from iteration #1) doesn't have to wait in
+the scheduler's queue for the operands to become available. By the time the
+vmulps is dispatched, operands are already available, and pipeline JFPU1 is
+ready to serve another instruction. So the instruction can be immediately
+issued on the JFPU1 pipeline. That is demonstrated by the fact that the
+instruction only spent 1cy in the scheduler's queue.
+
+There is a gap of 5 cycles between the write-back stage and the retire event.
+That is because instructions must retire in program order, so [1,0] has to wait
+for [0, 2] to be retired first (i.e., it has to wait until cycle 10).
+
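+As a cross-check, the value 3.3 reported in column [3] for the vmulps is
+consistent with the timeline above: across the three visible iterations, the
+vmulps spends 0, 5 and 5 cycles in the '-' state (executed, waiting to be
+retired), and (0 + 5 + 5) / 3 ~= 3.3.
+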
+In the dot-product example, all instructions are in a RAW (Read After Write)
+dependency chain. Register %xmm2 written by the vmulps is immediately used by
+the first vhaddps, and register %xmm3 written by the first vhaddps is used by
+the second vhaddps. Long data dependencies negatively affect the ILP
+(Instruction Level Parallelism).
+
+In the dot-product example, there are anti-dependencies introduced by
+instructions from different iterations. However, those dependencies can be
+removed at register renaming stage (at the cost of allocating register aliases,
+and therefore consuming temporary registers).
+
+Table "Average Wait times" helps diagnose performance issues that are caused by
+the presence of long latency instructions and potentially long data dependencies
+which may limit the ILP. Note that the tool by default assumes at least 1cy
+between the dispatch event and the issue event.
+
+When the performance is limited by data dependencies and/or long latency
+instructions, the number of cycles spent while in the "ready" state is expected
+to be very small when compared with the total number of cycles spent in the
+scheduler's queue. So the difference between the two counters is a good
+indicator of how big of an impact data dependencies had on the execution of
+instructions. When performance is mostly limited by the lack of hardware
+resources, the delta between the two counters is small. However, the number of
+cycles spent in the queue tends to be bigger (i.e., more than 1-3cy) especially
+when compared with other low latency instructions.
+
+Extra statistics to further diagnose performance issues.
+--------------------------------------------------------
+
+Flag '-verbose' enables extra statistics and performance counters for the
+dispatch logic, the reorder buffer, the retire control unit and the register
+file.
+
+Below is an example of verbose output generated by the tool for the dot-product
+example discussed in the previous sections.
+
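+Assuming the same input file and flags used for the first report (those flags
+are assumptions; only -verbose is documented here), the output below can be
+obtained with:
+
+  $ llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=300 -verbose dot-product.s
+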
+///////////////////
+Iterations: 300
+Instructions: 900
+Total Cycles: 610
+Dispatch Width: 2
+IPC: 1.48
+
+
+Dynamic Dispatch Stall Cycles:
+RAT - Register unavailable: 0
+RCU - Retire tokens unavailable: 0
+SCHEDQ - Scheduler full: 272
+LQ - Load queue full: 0
+SQ - Store queue full: 0
+GROUP - Static restrictions on the dispatch group: 0
+
+
+Register Alias Table:
+Total number of mappings created: 900
+Max number of mappings used: 35
+
+
+Dispatch Logic - number of cycles where we saw N instructions dispatched:
+[# dispatched], [# cycles]
+ 0, 24 (3.9%)
+ 1, 272 (44.6%)
+ 2, 314 (51.5%)
+
+
+Schedulers - number of cycles where we saw N instructions issued:
+[# issued], [# cycles]
+ 0, 7 (1.1%)
+ 1, 306 (50.2%)
+ 2, 297 (48.7%)
+
+
+Retire Control Unit - number of cycles where we saw N instructions retired:
+[# retired], [# cycles]
+ 0, 109 (17.9%)
+ 1, 102 (16.7%)
+ 2, 399 (65.4%)
+
+
+Scheduler's queue usage:
+JALU01, 0/20
+JFPU01, 18/18
+JLSAGU, 0/12
+///////////////////
+
+Based on the verbose report, the pipeline was only able to dispatch two
+instructions 51.5% of the time. The dispatch group was limited to one
+instruction 44.6% of the cycles, which corresponds to 272 cycles.
+
+If we look at section "Dynamic Dispatch Stall Cycles", we can see how counter
+SCHEDQ reports 272 cycles. Counter SCHEDQ is incremented every time the
+dispatch logic is unable to dispatch a full group of two instructions because
+the scheduler's queue is full.
+
+Section "Scheduler's queue usage" shows how the maximum number of buffer entries
+(i.e., scheduler's queue entries) used at runtime for resource JFPU01 reached
+its maximum. Note that AMD Jaguar implements three schedulers:
+ * JALU01 - A scheduler for ALU instructions
+ * JLSAGU - A scheduler for address generation
+ * JFPU01 - A scheduler floating point operations.
+
+The dot-product is a kernel of three floating point instructions (a vector
+multiply followed by two horizontal adds). That explains why only the floating
+point scheduler appears to be used according to section "Scheduler's queue
+usage".
+
+A full scheduler's queue is either caused by data dependency chains, or by a
+sub-optimal usage of hardware resources. Sometimes, resource pressure can be
+mitigated by rewriting the kernel using different instructions that consume
+different scheduler resources. Schedulers with a small queue are less resilient
+to bottlenecks caused by the presence of long data dependencies.
+
+In this example, we can conclude that the IPC is mostly limited by data
+dependencies, and not by resource pressure.
+
+LLVM-MCA instruction flow
+-------------------------
+
+This section describes the instruction flow through the out-of-order backend,
+as well as the functional units involved in the process.
+
+An instruction goes through a default sequence of stages:
+ - Dispatch (Instruction is dispatched to the schedulers).
+ - Issue (Instruction is issued to the processor pipelines).
+ - Write Back (Instruction is executed, and results are written back).
+ - Retire (Instruction is retired; writes are architecturally committed).
+
+The tool only models the out-of-order portion of a processor. Therefore, the
+instruction fetch and decode stages are not modeled. Performance bottlenecks in
+the frontend are not diagnosed by this tool. The tool assumes that instructions
+have all been decoded and placed in a queue. Also, the tool doesn't know
+anything about branch prediction.
+
+The long term plan is to make this sequence of stages customizable, so that
+processors can define their own. This is future work.
+
+Instruction Dispatch
+--------------------
+
+During the Dispatch stage, instructions are picked in program order from a queue
+of already decoded instructions, and dispatched in groups to the hardware
+schedulers. The dispatch logic is implemented by class DispatchStage in file
+DispatchStage.h.
+
+The size of a dispatch group depends on the availability of hardware resources,
+and it cannot exceed the value of field 'DispatchWidth' in class DispatchStage.
+Note that field DispatchWidth defaults to the value of field 'IssueWidth' from
+the scheduling model.
+
+Users can override the DispatchWidth value with flag "-dispatch=<N>" (where 'N'
+is an unsigned quantity).
+
+An instruction can be dispatched if:
+ - The size of the dispatch group is smaller than DispatchWidth
+ - There are enough entries in the reorder buffer
+ - There are enough temporary registers to do register renaming
+ - Schedulers are not full.
+
+Since r329067, scheduling models can optionally specify which register files
+are available on the processor. Class DispatchStage (see DispatchStage.h) uses
+that information to initialize register file descriptors.
+
+By default, if the model doesn't describe register files, the tool
+(optimistically) assumes a single register file with an unbounded number of
+temporary registers. Users can limit the number of temporary registers that
+are globally available for register renaming using flag
+`-register-file-size=<N>`, where N is the number of temporaries. A value of
+zero for N means 'unbounded'. Knowing how many temporaries are available for
+register renaming, the tool can predict dispatch stalls caused by the lack of
+temporaries.
+
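+For example, the following hypothetical invocation analyzes the dot-product
+kernel on a machine that can dispatch up to four instructions per cycle, with
+only 32 temporaries available for register renaming (the values 4 and 32 are
+made-up):
+
+  $ llvm-mca -mcpu=btver2 -dispatch=4 -register-file-size=32 dot-product.s
+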
+The number of reorder buffer entries consumed by an instruction depends on the
+number of micro-opcodes it specifies in the target scheduling model (see field
+'NumMicroOps' of TableGen class ProcWriteResources and its derived classes;
+TargetSchedule.td).
+
+The reorder buffer is implemented by class RetireControlUnit (see
+DispatchStage.h). Its goal is to track the progress of instructions that are
+"in-flight", and retire instructions in program order. The number of entries
+in the reorder buffer defaults to the value of field 'MicroOpBufferSize' from
+the target scheduling model.
+
+Instructions that are dispatched to the schedulers consume scheduler buffer
+entries. The tool queries the scheduling model to figure out the set of
+buffered resources consumed by an instruction. Buffered resources are treated
+like "scheduler" resources, and the field 'BufferSize' (from the processor
+resource TableGen definition) defines the size of the scheduler's queue.
+
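+As a reference, here is a minimal tablegen sketch (a made-up model, not an
+actual in-tree definition) showing where the fields mentioned above are
+declared:
+
+  def MyFPU : ProcResource<2> {
+    let BufferSize = 18;         // Size of the scheduler's queue.
+  }
+
+  def MyModel : SchedMachineModel {
+    let IssueWidth = 2;          // Default value for DispatchWidth.
+    let MicroOpBufferSize = 64;  // Number of reorder buffer entries.
+  }
+
+  def MyWriteFPMul : SchedWriteRes<[MyFPU]> {
+    let Latency = 2;
+    let NumMicroOps = 1;         // Reorder buffer entries consumed.
+  }
+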
+Zero latency instructions (for example NOP instructions) don't consume scheduler
+resources. However, those instructions still reserve a number of slots in the
+reorder buffer.
+
+Instruction Issue
+-----------------
+
+As mentioned in the previous section, each scheduler resource implements a queue
+of instructions. An instruction has to wait in the scheduler's queue until
+input register operands become available. Only at that point does the
+instruction become eligible for execution and may be issued (potentially
+out-of-order) to a pipeline for execution.
+
+Instruction latencies can be computed by the tool with the help of the
+scheduling model; latency values are defined by the scheduling model through
+ProcWriteResources objects.
+
+Class Scheduler (see file Scheduler.h) knows how to emulate multiple processor
+schedulers. A Scheduler is responsible for tracking data dependencies, and for
+dynamically selecting which processor resources are consumed by instructions.
+
+Internally, the Scheduler class delegates the management of processor resource
+units and resource groups to the ResourceManager class. ResourceManager is also
+responsible for selecting resource units that are effectively consumed by
+instructions. For example, if an instruction consumes 1cy of a resource group,
+the ResourceManager object selects one of the available units from the group; by
+default, it uses a round-robin selector to guarantee that resource usage is
+uniformly distributed between all units of a group.
+
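+As an illustration (this is not the actual ResourceManager code), a round-robin
+selector over the units of a group can be as simple as:
+
+  // 'NumUnits' is the number of units in the group; 'Next' is state that
+  // persists across calls and identifies the unit to use next.
+  unsigned selectNextUnit(unsigned NumUnits, unsigned &Next) {
+    unsigned Selected = Next;
+    Next = (Next + 1) % NumUnits;
+    return Selected;
+  }
+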
+Internally, class Scheduler implements three instruction queues:
+ - WaitQueue: a queue of instructions whose operands are not ready yet.
+ - ReadyQueue: a queue of instructions ready to execute.
+ - IssuedQueue: a queue of instructions executing.
+
+Depending on operand availability, instructions that are dispatched to the
+Scheduler are either placed into the WaitQueue or into the ReadyQueue.
+
+Every cycle, class Scheduler checks if instructions can be moved from the
+WaitQueue to the ReadyQueue, and if instructions from the ReadyQueue can be
+issued to the underlying pipelines. The algorithm prioritizes older
+instructions over younger instructions.
+
+Objects of class ResourceState (see Scheduler.h) describe processor resources.
+There is an instance of class ResourceState for each single processor resource
+specified by the scheduling model. A ResourceState object for a processor
+resource with multiple units dynamically tracks the availability of every single
+unit. For example, the ResourceState of a resource group tracks the
+availability of every resource in that group. Internally, ResourceState
+implements a round-robin selector to dynamically pick the next unit to use from
+the group.
+
+Write-Back and Retire Stage
+---------------------------
+
+Issued instructions are moved from the ReadyQueue to the IssuedQueue. There,
+instructions wait until they reach the write-back stage. At that point, they
+get removed from the queue and the retire control unit is notified.
+
+Upon an "instruction executed" event, the retire control unit flags the
+instruction as "ready to retire".
+
+Instructions are retired in program order; an "instruction retired" event is sent
+to the register file which frees the temporary registers allocated for the
+instruction at register renaming stage.
+
+Load/Store Unit and Memory Consistency Model
+--------------------------------------------
+
+The tool attempts to emulate out-of-order execution of memory operations. Class
+LSUnit (see file LSUnit.h) emulates a load/store unit implementing queues for
+speculative execution of loads and stores.
+
+Each load (or store) consumes an entry in the load (or store) queue. The number
+of slots in the load/store queues is unknown by the tool, since there is no
+mention of it in the scheduling model. In practice, users can specify flag
+`-lqueue=N` (and, similarly, `-squeue=N` for the store queue) to limit the
+number of entries in the queue to exactly N (an unsigned value). If N is zero,
+then the tool assumes an
+unbounded queue (this is the default).
+
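+For example, the following hypothetical invocation models a 32-entry load queue
+and a 20-entry store queue (the queue sizes are made-up values):
+
+  $ llvm-mca -mcpu=btver2 -lqueue=32 -squeue=20 dot-product.s
+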
+LSUnit implements a relaxed consistency model for memory loads and stores. The
+rules are:
+1) A younger load is allowed to pass an older load only if there is no
+ intervening store in between the two loads.
+2) A younger store is not allowed to pass an older store.
+3) A younger store is not allowed to pass an older load.
+4) A younger load is allowed to pass an older store provided that the load does
+ not alias with the store.
+
+By default, this class conservatively (i.e., pessimistically) assumes that loads
+always may-alias store operations. Essentially, this LSUnit doesn't perform
+any sort of alias analysis to rule out cases where loads and stores don't
+overlap with each other. The downside of this approach however is that younger
+loads are never allowed to pass older stores. To make it possible for a
+younger load to pass an older store, users can use the command line flag
+-noalias. Under 'noalias', a younger load is always allowed to pass an older
+store.
+
+Note that, in the case of write-combining memory, rule 2. could be relaxed a bit
+to allow reordering of non-aliasing store operations. That being said, at the
+moment, there is no way to further relax the memory model (flag -noalias is the
+only option). Essentially, there is no option to specify a different memory
+type (for example: write-back, write-combining, write-through; etc.) and
+consequently to weaken or strengthen the memory model.
+
+Other limitations are:
+ * LSUnit doesn't know when store-to-load forwarding may occur.
+ * LSUnit doesn't know anything about the cache hierarchy and memory types.
+ * LSUnit doesn't know how to identify serializing operations and memory fences.
+
+No assumption is made on the store buffer size. As mentioned before, LSUnit
+conservatively assumes a may-alias relation between loads and stores, and it
+doesn't attempt to identify cases where store-to-load forwarding would occur in
+practice.
+
+LSUnit doesn't attempt to predict whether a load or store hits or misses the L1
+cache. It only knows if an instruction "MayLoad" and/or "MayStore". For loads,
+the scheduling model provides an "optimistic" load-to-use latency (which usually
+matches the load-to-use latency for when there is a hit in the L1D).
+
+Class MCInstrDesc in LLVM doesn't know about serializing operations, nor about
+memory-barrier-like instructions. LSUnit conservatively assumes that an
+instruction which has both 'MayLoad' and 'UnmodeledSideEffects' behaves like a
+"soft" load-barrier. That means, it serializes loads without forcing a flush of
+the load queue. Similarly, instructions flagged with both 'MayStore' and
+'UnmodeledSideEffects' are treated like store barriers. A full memory barrier
+is a 'MayLoad' and 'MayStore' instruction with 'UnmodeledSideEffects'. This is
+inaccurate, but it is the best that we can do at the moment with the current
+information available in LLVM.
+
+A load/store barrier consumes one entry of the load/store queue. A load/store
+barrier enforces ordering of loads/stores. A younger load cannot pass a load
+barrier. Also, a younger store cannot pass a store barrier. A younger load has
+to wait for the memory/load barrier to execute. A load/store barrier is
+"executed" when it becomes the oldest entry in the load/store queue(s). That
+also means, by construction, all the older loads/stores have been executed.
+
+In conclusion, the full set of rules is:
+ 1. A store may not pass a previous store.
+ 2. A load may not pass a previous store unless flag 'NoAlias' is set.
+ 3. A load may pass a previous load.
+ 4. A store may not pass a previous load (regardless of flag 'NoAlias').
+ 5. A load has to wait until an older load barrier is fully executed.
+ 6. A store has to wait until an older store barrier is fully executed.
+
+Known limitations
+-----------------
+Previous sections described cases where the tool is missing information to give
+an accurate report. For example, the first sections of this document explained
+how the lack of knowledge about the processor negatively affects the performance
+analysis. The lack of knowledge is often a consequence of how scheduling models
+are defined; as mentioned before, scheduling models intentionally don't describe
+processors in fine detail. That being said, the LLVM machine model can be
+extended to expose more details, as long as they are opt-in for targets.
+
+The accuracy of the performance analysis is also affected by assumptions made by
+the processor model used by the tool.
+
+Most recent Intel and AMD processors implement dedicated LoopBuffer/OpCache in
+the hardware frontend to speed up the throughput in the presence of tight loops.
+The presence of these buffers complicates the decoding logic, and requires
+knowledge of the branch predictor too. Class 'SchedMachineModel' in TableGen
+provides a field named 'LoopMicroOpBufferSize' which is used to describe loop
+buffers. However, the purpose of that field is to enable loop unrolling of
+tight loops; essentially, it affects the cost model used by pass loop-unroll.
+
+In its current state, the tool only describes the out-of-order portion of a
+processor, and consequently doesn't try to predict the frontend throughput. That
+being said, this tool could be definitely extended in future to also account for
+the hardware frontend when doing performance analysis. This would inevitably
+require extra (extensive) processor knowledge related to all the available
+decoding paths in the hardware frontend, as well as branch prediction.
+
+Currently, the tool assumes a zero-latency "perfect" fetch&decode
+stage; the full sequence of decoded instructions is immediately visible to the
+dispatch logic from the start.
+
+The tool doesn't know about simultaneous multithreading. According to the tool,
+processor resources are not statically/dynamically partitioned. Processor
+resources are fully available to the hardware thread executing the
+microbenchmark.
+
+The execution model implemented by this tool assumes that instructions are
+first dispatched in groups to hardware schedulers, and then issued to
+pipelines for execution. The model assumes dynamic scheduling of instructions.
+Instructions are placed in a queue and potentially executed out-of-order (based
+on operand availability). The dispatch stage is distinct from the issue stage.
+This will change in the future; as mentioned earlier, the end goal is to let
+processors customize the process.
+
+This model doesn't correctly describe processors where the dispatch/issue is a
+single stage. This is what happens for example in VLIW processors, where
+instructions are packaged and statically scheduled at compile time; it is up to
+the compiler to predict the latency of instructions and package issue groups
+accordingly. For such targets, there is no dynamic scheduling done by the
+hardware.
+
+Existing classes (DispatchStage, Scheduler, etc.) could be extended/adapted to
+support processors with a single dispatch/issue stage. The execution flow would
+require some changes in the way existing components (i.e., DispatchStage,
+Scheduler, etc.) interact. This can be a future development.
+
+The following sections describe other known limitations. The goal is not to
+provide an extensive list of limitations; we want to report what we believe are
+the most important limitations, and suggest possible methods to overcome them.
+
+Load/Store barrier instructions and serializing operations
+----------------------------------------------------------
+Section "Load/Store Unit and Memory Consistency Model" already mentioned how
+LLVM doesn't know about serializing operations and memory barriers. Most of it
+boils down to the fact that class MCInstrDesc (intentionally) doesn't expose
+those properties. Instead, both serializing operations and memory barriers
+"have side-effects" according to MCInstrDesc. That is because, at least for
+scheduling purposes, knowing that an instruction has unmodeled side effects is
+often enough to treat the instruction like a compiler scheduling barrier.
+
+A performance analysis tool could use the extra knowledge on barriers and
+serializing operations to generate a more accurate performance report. One way
+to improve this is by reserving a couple of bits in field 'Flags' from class
+MCInstrDesc: one bit for barrier operations, and another bit to mark
+instructions as serializing operations.
+
+Lack of support for instruction itineraries
+-------------------------------------------
+The current version of the tool doesn't know how to process instruction
+itineraries. This is probably one of the most important limitations, since it
+affects a few out-of-order processors in LLVM.
+
+As mentioned in section 'Instruction Issue', class Scheduler delegates to an
+instance of class ResourceManager the handling of processor resources.
+ResourceManager is where most of the scheduling logic is implemented.
+
+Adding support for instruction itineraries requires that we teach
+ResourceManager how to handle functional units and instruction stages. This
+development can be a future extension, and it would probably require a few
+changes to the ResourceManager interface.
+
+Instructions that affect control flow are not correctly modeled
+---------------------------------------------------------------
+Examples of instructions that affect the control flow are: return, indirect
+branches, calls, etc. The tool doesn't try to predict/evaluate branch targets.
+In particular, the tool doesn't model any sort of branch prediction, nor does it
+attempt to track changes to the program counter. The tool always assumes that
+the input assembly sequence is the body of a microbenchmark (a simple loop
+executed for a number of iterations). The "next" instruction in sequence is
+always the next instruction to dispatch.
+
+Call instructions default to an arbitrary high latency of 100cy. A warning is
+generated if the tool encounters a call instruction in the sequence. Return
+instructions are not evaluated, and therefore control flow is not affected.
+However, the tool still queries the processor scheduling model to obtain latency
+information for instructions that affect the control flow.
+
+Known limitations on X86 processors
+-----------------------------------
+
+1) Partial register updates versus full register updates.
+
+On x86-64, a 32-bit GPR write fully updates the super-register. Example:
+ add %edi, %eax ## eax += edi
+
+Here, register %eax aliases the lower half of 64-bit register %rax. On x86-64,
+register %rax is fully updated by the 'add' (the upper half of %rax is zeroed).
+Essentially, it "kills" any previous definition of (the upper half of) register
+%rax.
+
+On the other hand, 8/16 bit register writes only perform a so-called "partial
+register update". Example:
+ add %di, %ax ## ax += di
+
+Here, register %eax is only partially updated. To be more specific, the lower
+half of %eax is set, and the upper half is left unchanged. There is also no
+change in the upper 48 bits of register %rax.
+
+To get accurate performance analysis, the tool has to know which instructions
+perform a partial register update, and which instructions fully update the
+destination's super-register.
+
+One way to expose this information is (again) via TableGen. For example, we
+could add a flag in the TableGen instruction class to tag instructions that
+perform partial register updates. Something like this: 'bit
+hasPartialRegisterUpdate = 1'. However, this would force a `let
+hasPartialRegisterUpdate = 0` on several instruction definitions.
+
+Another approach is to have a MCSubtargetInfo hook similar to this:
+ virtual bool updatesSuperRegisters(unsigned short opcode) { return false; }
+
+Targets will be able to override this method if needed. Again, this is just an
+idea; the plan is to address this as a future development.
+
+2) Macro Op fusion.
+
+The tool doesn't know about macro-op fusion. On modern x86 processors, a
+'cmp/test' followed by a conditional jump is fused into a single macro
+operation. The advantage is that the fused pair only consumes a single slot in
+the dispatch group.
+
+As a future development, the tool should be extended to address macro-fusion.
+Ideally, we could have LLVM generate a table enumerating all the opcode pairs
+that can be fused together. That table could be exposed to the tool via the
+MCSubtargetInfo interface. This is just an idea; there may be better ways to
+implement this.
+
+3) Intel processors: mixing legacy SSE with AVX instructions.
+
+On modern Intel processors with AVX, mixing legacy SSE code with AVX code
+negatively impacts the performance. The tool is not aware of this issue, and
+the performance penalty is not accounted for in the analysis. This is
+something that we would like to improve in the future.
+
+4) Zero-latency register moves and Zero-idioms.
+
+Most modern AMD/Intel processors know how to optimize out register-register
+moves and zero idioms at register renaming stage. The tool doesn't know
+about these patterns, and this may negatively impact the performance analysis.
+
+Known design problems
+---------------------
+This section describes two design issues that are currently affecting the tool.
+The long term plan is to "fix" these issues.
+Both limitations would be easily fixed if we taught the tool how to directly
+manipulate MachineInstr objects (instead of MCInst objects).
+
+1) Variant instructions not correctly modeled.
+
+The tool doesn't know how to analyze instructions with a "variant" scheduling
+class descriptor. A variant scheduling class needs to be resolved dynamically.
+The "actual" scheduling class often depends on the subtarget, as well as
+properties of the specific MachineInstr object.
+
+Unfortunately, the tool manipulates MCInst, and it doesn't know anything about
+MachineInstr. As a consequence, the tool cannot use the existing machine
+subtarget hooks that are normally used to resolve the variant scheduling class.
+This is a major design issue which mostly affects ARM/AArch64 targets. It
+mostly boils down to the fact that the existing scheduling framework was meant
+to work for MachineInstr.
+
+When the tool encounters a "variant" instruction, it assumes a generic 1cy
+latency. However, the tool would not be able to tell which processor resources
+are effectively consumed by the variant instruction.
+
+2) MCInst and MCInstrDesc.
+
+Performance analysis tools require data dependency information to correctly
+predict the runtime performance of the code. This tool must always be able to
+obtain the set of implicit/explicit register defs/uses for every instruction of
+the input assembly sequence.
+
+In the first section of this document, it was mentioned how the tool takes as
+input an assembly sequence. That sequence is parsed into a MCInst sequence with
+the help of assembly parsers available from the targets.
+
+An MCInst is a very low-level instruction representation. The tool can inspect
+the MCOperand sequence of an MCInst to identify register operands. However,
+there is no way to distinguish register operands that are definitions from
+register operands that are uses.
+
+In LLVM, class MCInstrDesc is used to fully describe target instructions and
+their operands. The opcode of a machine instruction (a MachineInstr object) can
+be used to query the instruction set through method `MCInstrInfo::get' to obtain
+the associated MCInstrDesc object.
+
+However, class MCInstrDesc describes properties and operands of MachineInstr
+objects. Essentially, MCInstrDesc is not meant to be used to describe MCInst
+objects. To be more specific, MCInstrDesc objects are automatically generated
+via TableGen from the instruction set description in the target .td files. For
+example, field `MCInstrDesc::NumDefs' is always equal to the cardinality of the
+`(outs)` set from the TableGen instruction definition.
+
+By construction, register definitions always appear at the beginning of the
+MachineOperands list in MachineInstr. Basically, the (outs) are the first
+operands of a MachineInstr, and the (ins) will come after in the machine operand
+list. Knowing the number of register definitions is enough to identify
+all the register operands that are definitions.
+
+In a normal compilation process, MCInst objects are generated from MachineInstr
+objects through a lowering step. By default the lowering logic simply iterates
+over the machine operands of a MachineInstr, and converts/expands them into
+equivalent MCOperand objects.
+
+The default lowering strategy has the advantage of preserving all of the above
+mentioned assumptions on the machine operand sequence. That means, register
+definitions would still be at the beginning of the MCOperand sequence, and
+register uses would come after.
+
+Targets may still define custom lowering routines for specific opcodes. Some of
+these routines may lower operands in a way that potentially breaks (some of) the
+assumptions on the machine operand sequence which were valid for MachineInstr.
+Luckily, this is not the most common form of lowering done by the targets, and
+the vast majority of the MachineInstr are lowered based on the default strategy
+which preserves the original machine operand sequence. This is especially true
+for x86, where the custom lowering logic always preserves the original (i.e.,
+from the MachineInstr) operand sequence.
+
+This tool currently works under the strong (and potentially incorrect)
+assumption that register def/uses in a MCInst can always be identified by
+querying the machine instruction descriptor for the opcode. This assumption made
+it possible to develop this tool and get good numbers at least for the
+processors available in the x86 backend.
+
+That being said, the analysis is still potentially incorrect for other targets.
+So we plan (with the help of the community) to find a proper mechanism to map
+when possible MCOperand indices back to MachineOperand indices of the equivalent
+MachineInstr. This would be equivalent to describing changes made by the
+lowering step which affected the operand sequence. For example, we could have an
+index for every register MCOperand (or -1, if the operand didn't exist in the
+original MachineInstr). The mapping could look like this: <0,1,3,2>. Here,
+MCOperand #2 was obtained from the lowering of MachineOperand #3, and so on.
+
+This information could be automatically generated via TableGen for all the
+instructions whose custom lowering step breaks assumptions made by the tool on
+the register operand sequence (in general, these instructions should be a
+minority of a target's instruction set). Unfortunately, we don't have that
+information now. As a consequence, we assume that the number of explicit
+register definitions is the same number specified in MCInstrDesc. We also
+assume that register definitions always come first in the operand sequence.
+
+In conclusion, these are the strong assumptions currently made by the tool:
+ * The number of explicit and implicit register definitions in a MCInst
+ matches the number of explicit and implicit definitions specified by the
+ MCInstrDesc object.
+ * Register uses always come after register definitions.
+ * If an opcode specifies an optional definition, then the optional
+ definition is always the last register operand in the sequence.
+
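+The following C++ sketch (an illustration only, not code from the tool; the
+MCII and Inst names are placeholders) shows how, under those assumptions, the
+explicit register definitions of an MCInst can be recovered from the
+MCInstrDesc associated with its opcode:
+
+  // Requires llvm/MC/MCInst.h, llvm/MC/MCInstrInfo.h, llvm/ADT/SmallVector.h.
+  // Defs come first in the operand sequence, so the first 'getNumDefs()'
+  // register operands are treated as explicit definitions.
+  void collectExplicitDefs(const llvm::MCInstrInfo &MCII,
+                           const llvm::MCInst &Inst,
+                           llvm::SmallVectorImpl<unsigned> &Defs) {
+    const llvm::MCInstrDesc &Desc = MCII.get(Inst.getOpcode());
+    for (unsigned I = 0, E = Desc.getNumDefs(); I < E; ++I) {
+      const llvm::MCOperand &Op = Inst.getOperand(I);
+      if (Op.isReg())
+        Defs.push_back(Op.getReg());
+    }
+  }
+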
+Note that some of the information accessible from the MCInstrDesc is always
+valid for MCInst. For example: implicit register defs, implicit register uses
+and 'MayLoad/MayStore/HasUnmodeledSideEffects' opcode properties still apply to
+MCInst. The tool knows about this, and uses that information during its
+analysis.
+
+Future work
+-----------
+ * Address limitations (described in section "Known limitations").
+ * Let processors specify the selection strategy for processor resource groups
+ and resources with multiple units. The tool currently uses a round-robin
+ selector to pick the next resource to use.
+ * Address limitations specifically described in section "Known limitations on
+ X86 processors".
+ * Address design issues identified in section "Known design problems".
+ * Define a standard interface for "Views". This would let users customize the
+ performance report generated by the tool.
+
+When interfaces are mature/stable:
+ * Move the logic into a library. This will enable a number of other
+ interesting use cases.
+
+Work is currently tracked on https://bugs.llvm.org. llvm-mca bugs are tagged
+with prefix [llvm-mca]. You can easily find the full list of open bugs if you
+search for that tag.
diff --git a/contrib/llvm/tools/llvm-mca/RegisterFile.cpp b/contrib/llvm/tools/llvm-mca/RegisterFile.cpp
new file mode 100644
index 000000000000..44de105b8996
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RegisterFile.cpp
@@ -0,0 +1,343 @@
+//===--------------------- RegisterFile.cpp ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a register mapping file class. This class is responsible
+/// for managing hardware register files and the tracking of data dependencies
+/// between registers.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RegisterFile.h"
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+RegisterFile::RegisterFile(const llvm::MCSchedModel &SM,
+ const llvm::MCRegisterInfo &mri, unsigned NumRegs)
+ : MRI(mri), RegisterMappings(mri.getNumRegs(),
+ {WriteRef(), {IndexPlusCostPairTy(0, 1), 0}}) {
+ initialize(SM, NumRegs);
+}
+
+void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) {
+ // Create a default register file that "sees" all the machine registers
+ // declared by the target. The number of physical registers in the default
+ // register file is set equal to `NumRegs`. A value of zero for `NumRegs`
+ // means: this register file has an unbounded number of physical registers.
+ addRegisterFile({} /* all registers */, NumRegs);
+ if (!SM.hasExtraProcessorInfo())
+ return;
+
+ // For each user defined register file, allocate a RegisterMappingTracker
+ // object. The size of every register file, as well as the mapping between
+ // register files and register classes is specified via tablegen.
+ const MCExtraProcessorInfo &Info = SM.getExtraProcessorInfo();
+ for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) {
+ const MCRegisterFileDesc &RF = Info.RegisterFiles[I];
+ // Skip invalid register files with zero physical registers.
+ unsigned Length = RF.NumRegisterCostEntries;
+ if (!RF.NumPhysRegs)
+ continue;
+ // The cost of a register definition is equivalent to the number of
+ // physical registers that are allocated at register renaming stage.
+ const MCRegisterCostEntry *FirstElt =
+ &Info.RegisterCostTable[RF.RegisterCostEntryIdx];
+ addRegisterFile(ArrayRef<MCRegisterCostEntry>(FirstElt, Length),
+ RF.NumPhysRegs);
+ }
+}
+
+void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
+ unsigned NumPhysRegs) {
+ // A default register file is always allocated at index #0. That register file
+ // is mainly used to count the total number of mappings created by all
+ // register files at runtime. Users can limit the number of available physical
+ // registers in register file #0 through the command line flag
+ // `-register-file-size`.
+ unsigned RegisterFileIndex = RegisterFiles.size();
+ RegisterFiles.emplace_back(NumPhysRegs);
+
+ // Special case where there is no register class identifier in the set.
+ // An empty set of register classes means: this register file contains all
+ // the physical registers specified by the target.
+ // We optimistically assume that a register can be renamed at the cost of a
+ // single physical register. The constructor of RegisterFile ensures that
+ // a RegisterMapping exists for each logical register defined by the Target.
+ if (Entries.empty())
+ return;
+
+ // Now update the cost of individual registers.
+ for (const MCRegisterCostEntry &RCE : Entries) {
+ const MCRegisterClass &RC = MRI.getRegClass(RCE.RegisterClassID);
+ for (const MCPhysReg Reg : RC) {
+ RegisterRenamingInfo &Entry = RegisterMappings[Reg].second;
+ IndexPlusCostPairTy &IPC = Entry.IndexPlusCost;
+ if (IPC.first && IPC.first != RegisterFileIndex) {
+ // The only register file that is allowed to overlap is the default
+ // register file at index #0. The analysis is inaccurate if register
+ // files overlap.
+ errs() << "warning: register " << MRI.getName(Reg)
+ << " defined in multiple register files.";
+ }
+ IPC = std::make_pair(RegisterFileIndex, RCE.Cost);
+ Entry.RenameAs = Reg;
+
+ // Assume the same cost for each sub-register.
+ for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) {
+ RegisterRenamingInfo &OtherEntry = RegisterMappings[*I].second;
+ if (!OtherEntry.IndexPlusCost.first &&
+ (!OtherEntry.RenameAs ||
+ MRI.isSuperRegister(*I, OtherEntry.RenameAs))) {
+ OtherEntry.IndexPlusCost = IPC;
+ OtherEntry.RenameAs = Reg;
+ }
+ }
+ }
+ }
+}
+
+void RegisterFile::allocatePhysRegs(const RegisterRenamingInfo &Entry,
+ MutableArrayRef<unsigned> UsedPhysRegs) {
+ unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+ unsigned Cost = Entry.IndexPlusCost.second;
+ if (RegisterFileIndex) {
+ RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
+ RMT.NumUsedPhysRegs += Cost;
+ UsedPhysRegs[RegisterFileIndex] += Cost;
+ }
+
+ // Now update the default register mapping tracker.
+ RegisterFiles[0].NumUsedPhysRegs += Cost;
+ UsedPhysRegs[0] += Cost;
+}
+
+void RegisterFile::freePhysRegs(const RegisterRenamingInfo &Entry,
+ MutableArrayRef<unsigned> FreedPhysRegs) {
+ unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+ unsigned Cost = Entry.IndexPlusCost.second;
+ if (RegisterFileIndex) {
+ RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
+ RMT.NumUsedPhysRegs -= Cost;
+ FreedPhysRegs[RegisterFileIndex] += Cost;
+ }
+
+ // Now update the default register mapping tracker.
+ RegisterFiles[0].NumUsedPhysRegs -= Cost;
+ FreedPhysRegs[0] += Cost;
+}
+
+void RegisterFile::addRegisterWrite(WriteRef Write,
+ MutableArrayRef<unsigned> UsedPhysRegs,
+ bool ShouldAllocatePhysRegs) {
+ WriteState &WS = *Write.getWriteState();
+ unsigned RegID = WS.getRegisterID();
+ assert(RegID && "Adding an invalid register definition?");
+
+ LLVM_DEBUG({
+ dbgs() << "RegisterFile: addRegisterWrite [ " << Write.getSourceIndex()
+ << ", " << MRI.getName(RegID) << "]\n";
+ });
+
+ // If RenameAs is equal to RegID, then RegID is subject to register renaming
+ // and false dependencies on RegID are all eliminated.
+
+ // If RenameAs references the invalid register, then we optimistically assume
+ // that it can be renamed. In the absence of tablegen descriptors for register
+ // files, RenameAs is always set to the invalid register ID. In all other
+ // cases, RenameAs must be either equal to RegID, or it must reference a
+ // super-register of RegID.
+
+ // If RenameAs is a super-register of RegID, then a write to RegID has always
+ // a false dependency on RenameAs. The only exception is for when the write
+ // implicitly clears the upper portion of the underlying register.
+ // If a write clears its super-registers, then it is renamed as `RenameAs`.
+ const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+ if (RRI.RenameAs && RRI.RenameAs != RegID) {
+ RegID = RRI.RenameAs;
+ const WriteRef &OtherWrite = RegisterMappings[RegID].first;
+
+ if (!WS.clearsSuperRegisters()) {
+ // The processor keeps the definition of `RegID` together with register
+ // `RenameAs`. Since this partial write is not renamed, no physical
+ // register is allocated.
+ ShouldAllocatePhysRegs = false;
+
+ if (OtherWrite.getSourceIndex() != Write.getSourceIndex()) {
+ // This partial write has a false dependency on RenameAs.
+ WS.setDependentWrite(OtherWrite.getWriteState());
+ }
+ }
+ }
+
+ // Update the mapping for register RegID including its sub-registers.
+ RegisterMappings[RegID].first = Write;
+ for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+ RegisterMappings[*I].first = Write;
+
+ // No physical registers are allocated for instructions that are optimized in
+ // hardware. For example, zero-latency data-dependency breaking instructions
+ // don't consume physical registers.
+ if (ShouldAllocatePhysRegs)
+ allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
+
+ if (!WS.clearsSuperRegisters())
+ return;
+
+ for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+ RegisterMappings[*I].first = Write;
+}
+
+void RegisterFile::removeRegisterWrite(const WriteState &WS,
+ MutableArrayRef<unsigned> FreedPhysRegs,
+ bool ShouldFreePhysRegs) {
+ unsigned RegID = WS.getRegisterID();
+
+ assert(RegID != 0 && "Invalidating an already invalid register?");
+ assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
+ "Invalidating a write of unknown cycles!");
+ assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
+
+ unsigned RenameAs = RegisterMappings[RegID].second.RenameAs;
+ if (RenameAs && RenameAs != RegID) {
+ RegID = RenameAs;
+
+ if (!WS.clearsSuperRegisters()) {
+ // Keep the definition of `RegID` together with register `RenameAs`.
+ ShouldFreePhysRegs = false;
+ }
+ }
+
+ if (ShouldFreePhysRegs)
+ freePhysRegs(RegisterMappings[RegID].second, FreedPhysRegs);
+
+ WriteRef &WR = RegisterMappings[RegID].first;
+ if (WR.getWriteState() == &WS)
+ WR.invalidate();
+
+ for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+ WriteRef &OtherWR = RegisterMappings[*I].first;
+ if (OtherWR.getWriteState() == &WS)
+ OtherWR.invalidate();
+ }
+
+ if (!WS.clearsSuperRegisters())
+ return;
+
+ for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+ WriteRef &OtherWR = RegisterMappings[*I].first;
+ if (OtherWR.getWriteState() == &WS)
+ OtherWR.invalidate();
+ }
+}
+
+void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
+ unsigned RegID) const {
+ assert(RegID && RegID < RegisterMappings.size());
+ LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
+ << MRI.getName(RegID) << '\n');
+ const WriteRef &WR = RegisterMappings[RegID].first;
+ if (WR.isValid())
+ Writes.push_back(WR);
+
+ // Handle potential partial register updates.
+ for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+ const WriteRef &WR = RegisterMappings[*I].first;
+ if (WR.isValid())
+ Writes.push_back(WR);
+ }
+
+ // Remove duplicate entries and resize the input vector.
+ llvm::sort(Writes.begin(), Writes.end(),
+ [](const WriteRef &Lhs, const WriteRef &Rhs) {
+ return Lhs.getWriteState() < Rhs.getWriteState();
+ });
+ auto It = std::unique(Writes.begin(), Writes.end());
+ Writes.resize(std::distance(Writes.begin(), It));
+
+ LLVM_DEBUG({
+ for (const WriteRef &WR : Writes) {
+ const WriteState &WS = *WR.getWriteState();
+ dbgs() << "[PRF] Found a dependent use of Register "
+             << MRI.getName(WS.getRegisterID()) << " (defined by instruction #"
+ << WR.getSourceIndex() << ")\n";
+ }
+ });
+}
+
+unsigned RegisterFile::isAvailable(ArrayRef<unsigned> Regs) const {
+ SmallVector<unsigned, 4> NumPhysRegs(getNumRegisterFiles());
+
+ // Find how many new mappings must be created for each register file.
+ for (const unsigned RegID : Regs) {
+ const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+ const IndexPlusCostPairTy &Entry = RRI.IndexPlusCost;
+ if (Entry.first)
+ NumPhysRegs[Entry.first] += Entry.second;
+ NumPhysRegs[0] += Entry.second;
+ }
+
+ unsigned Response = 0;
+ for (unsigned I = 0, E = getNumRegisterFiles(); I < E; ++I) {
+ unsigned NumRegs = NumPhysRegs[I];
+ if (!NumRegs)
+ continue;
+
+ const RegisterMappingTracker &RMT = RegisterFiles[I];
+ if (!RMT.NumPhysRegs) {
+ // The register file has an unbounded number of microarchitectural
+ // registers.
+ continue;
+ }
+
+ if (RMT.NumPhysRegs < NumRegs) {
+ // The current register file is too small. This may occur if the number of
+ // microarchitectural registers in register file #0 was changed by the
+      // users via flag -register-file-size. Alternatively, the scheduling model
+ // specified a too small number of registers for this register file.
+ report_fatal_error(
+ "Not enough microarchitectural registers in the register file");
+ }
+
+ if (RMT.NumPhysRegs < (RMT.NumUsedPhysRegs + NumRegs))
+ Response |= (1U << I);
+ }
+
+ return Response;
+}
+
+#ifndef NDEBUG
+void RegisterFile::dump() const {
+ for (unsigned I = 0, E = MRI.getNumRegs(); I < E; ++I) {
+ const RegisterMapping &RM = RegisterMappings[I];
+ if (!RM.first.getWriteState())
+ continue;
+ const RegisterRenamingInfo &RRI = RM.second;
+ dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << RRI.IndexPlusCost.first
+ << ", Cost=" << RRI.IndexPlusCost.second
+ << ", RenameAs=" << RRI.RenameAs << ", ";
+ RM.first.dump();
+ dbgs() << '\n';
+ }
+
+ for (unsigned I = 0, E = getNumRegisterFiles(); I < E; ++I) {
+ dbgs() << "Register File #" << I;
+ const RegisterMappingTracker &RMT = RegisterFiles[I];
+ dbgs() << "\n TotalMappings: " << RMT.NumPhysRegs
+ << "\n NumUsedMappings: " << RMT.NumUsedPhysRegs << '\n';
+ }
+}
+#endif
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/RegisterFile.h b/contrib/llvm/tools/llvm-mca/RegisterFile.h
new file mode 100644
index 000000000000..349e9789b6ee
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RegisterFile.h
@@ -0,0 +1,172 @@
+//===--------------------- RegisterFile.h -----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a register mapping file class. This class is responsible
+/// for managing hardware register files and the tracking of data dependencies
+/// between registers.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
+#define LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
+
+#include "HardwareUnit.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
+
+namespace mca {
+
+class ReadState;
+class WriteState;
+class WriteRef;
+
+/// Manages hardware register files, and tracks register definitions for
+/// register renaming purposes.
+class RegisterFile : public HardwareUnit {
+ const llvm::MCRegisterInfo &MRI;
+
+ // Each register file is associated with an instance of
+ // RegisterMappingTracker.
+ // A RegisterMappingTracker keeps track of the number of physical registers
+ // which have been dynamically allocated by the simulator.
+ struct RegisterMappingTracker {
+ // The total number of physical registers that are available in this
+  // register file for register renaming purposes. A value of zero for this
+ // field means: this register file has an unbounded number of physical
+ // registers.
+ const unsigned NumPhysRegs;
+ // Number of physical registers that are currently in use.
+ unsigned NumUsedPhysRegs;
+
+ RegisterMappingTracker(unsigned NumPhysRegisters)
+ : NumPhysRegs(NumPhysRegisters), NumUsedPhysRegs(0) {}
+ };
+
+ // A vector of register file descriptors. This set always contains at least
+ // one entry. Entry at index #0 is reserved. That entry describes a register
+ // file with an unbounded number of physical registers that "sees" all the
+ // hardware registers declared by the target (i.e. all the register
+ // definitions in the target specific `XYZRegisterInfo.td` - where `XYZ` is
+ // the target name).
+ //
+ // Users can limit the number of physical registers that are available in
+  // register file #0 by specifying command line flag `-register-file-size=<uint>`.
+ llvm::SmallVector<RegisterMappingTracker, 4> RegisterFiles;
+
+ // This type is used to propagate information about the owner of a register,
+ // and the cost of allocating it in the PRF. Register cost is defined as the
+ // number of physical registers consumed by the PRF to allocate a user
+ // register.
+ //
+ // For example: on X86 BtVer2, a YMM register consumes 2 128-bit physical
+ // registers. So, the cost of allocating a YMM register in BtVer2 is 2.
+ using IndexPlusCostPairTy = std::pair<unsigned, unsigned>;
+
+ // Struct RegisterRenamingInfo maps registers to register files.
+ // There is a RegisterRenamingInfo object for every register defined by
+  // the target. RegisterRenamingInfo objects are stored into vector
+ // RegisterMappings, and register IDs can be used to reference them.
+ struct RegisterRenamingInfo {
+ IndexPlusCostPairTy IndexPlusCost;
+ llvm::MCPhysReg RenameAs;
+ };
+
+ // RegisterMapping objects are mainly used to track physical register
+ // definitions. There is a RegisterMapping for every register defined by the
+ // Target. For each register, a RegisterMapping pair contains a descriptor of
+ // the last register write (in the form of a WriteRef object), as well as a
+ // RegisterRenamingInfo to quickly identify owning register files.
+ //
+ // This implementation does not allow overlapping register files. The only
+ // register file that is allowed to overlap with other register files is
+ // register file #0. If we exclude register #0, every register is "owned" by
+ // at most one register file.
+ using RegisterMapping = std::pair<WriteRef, RegisterRenamingInfo>;
+
+ // This map contains one entry for each register defined by the target.
+ std::vector<RegisterMapping> RegisterMappings;
+
+ // This method creates a new register file descriptor.
+ // The new register file owns all of the registers declared by register
+ // classes in the 'RegisterClasses' set.
+ //
+ // Processor models allow the definition of RegisterFile(s) via tablegen. For
+ // example, this is a tablegen definition for a x86 register file for
+ // XMM[0-15] and YMM[0-15], that allows up to 60 renames (each rename costs 1
+ // physical register).
+ //
+ // def FPRegisterFile : RegisterFile<60, [VR128RegClass, VR256RegClass]>
+ //
+ // Here FPRegisterFile contains all the registers defined by register class
+ // VR128RegClass and VR256RegClass. FPRegisterFile implements 60
+ // registers which can be used for register renaming purpose.
+ void
+ addRegisterFile(llvm::ArrayRef<llvm::MCRegisterCostEntry> RegisterClasses,
+ unsigned NumPhysRegs);
+
+ // Consumes physical registers in each register file specified by the
+  // `IndexPlusCostPairTy`. This method is called from `addRegisterWrite()`.
+ void allocatePhysRegs(const RegisterRenamingInfo &Entry,
+ llvm::MutableArrayRef<unsigned> UsedPhysRegs);
+
+ // Releases previously allocated physical registers from the register file(s).
+  // This method is called from `removeRegisterWrite()`.
+ void freePhysRegs(const RegisterRenamingInfo &Entry,
+ llvm::MutableArrayRef<unsigned> FreedPhysRegs);
+
+ // Create an instance of RegisterMappingTracker for every register file
+ // specified by the processor model.
+ // If no register file is specified, then this method creates a default
+ // register file with an unbounded number of physical registers.
+ void initialize(const llvm::MCSchedModel &SM, unsigned NumRegs);
+
+public:
+ RegisterFile(const llvm::MCSchedModel &SM, const llvm::MCRegisterInfo &mri,
+ unsigned NumRegs = 0);
+
+ // This method updates the register mappings inserting a new register
+ // definition. This method is also responsible for updating the number of
+ // allocated physical registers in each register file modified by the write.
+  // No physical register is allocated when flag ShouldAllocatePhysRegs is false.
+ void addRegisterWrite(WriteRef Write,
+ llvm::MutableArrayRef<unsigned> UsedPhysRegs,
+ bool ShouldAllocatePhysRegs = true);
+
+ // Removes write \param WS from the register mappings.
+ // Physical registers may be released to reflect this update.
+ void removeRegisterWrite(const WriteState &WS,
+ llvm::MutableArrayRef<unsigned> FreedPhysRegs,
+ bool ShouldFreePhysRegs = true);
+
+ // Checks if there are enough physical registers in the register files.
+ // Returns a "response mask" where each bit represents the response from a
+ // different register file. A mask of all zeroes means that all register
+ // files are available. Otherwise, the mask can be used to identify which
+  // register file was busy. This semantic allows us to classify dispatch
+ // stalls caused by the lack of register file resources.
+ //
+ // Current implementation can simulate up to 32 register files (including the
+ // special register file at index #0).
+ unsigned isAvailable(llvm::ArrayRef<unsigned> Regs) const;
+ void collectWrites(llvm::SmallVectorImpl<WriteRef> &Writes,
+ unsigned RegID) const;
+ void updateOnRead(ReadState &RS, unsigned RegID);
+
+ unsigned getNumRegisterFiles() const { return RegisterFiles.size(); }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
diff --git a/contrib/llvm/tools/llvm-mca/RegisterFileStatistics.cpp b/contrib/llvm/tools/llvm-mca/RegisterFileStatistics.cpp
new file mode 100644
index 000000000000..1b07bf9a3b33
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RegisterFileStatistics.cpp
@@ -0,0 +1,107 @@
+//===--------------------- RegisterFileStatistics.cpp -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the RegisterFileStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RegisterFileStatistics.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void RegisterFileStatistics::initializeRegisterFileInfo() {
+ const MCSchedModel &SM = STI.getSchedModel();
+ RegisterFileUsage Empty = {0, 0, 0};
+ if (!SM.hasExtraProcessorInfo()) {
+ // Assume a single register file.
+ RegisterFiles.emplace_back(Empty);
+ return;
+ }
+
+ // Initialize a RegisterFileUsage for every user defined register file, plus
+ // the default register file which is always at index #0.
+ const MCExtraProcessorInfo &PI = SM.getExtraProcessorInfo();
+ // There is always an "InvalidRegisterFile" entry in tablegen. That entry can
+ // be skipped. If there are no user defined register files, then reserve a
+ // single entry for the default register file at index #0.
+ unsigned NumRegFiles = std::max(PI.NumRegisterFiles, 1U);
+ RegisterFiles.resize(NumRegFiles);
+ std::fill(RegisterFiles.begin(), RegisterFiles.end(), Empty);
+}
+
+void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) {
+ switch (Event.Type) {
+ default:
+ break;
+ case HWInstructionEvent::Retired: {
+ const auto &RE = static_cast<const HWInstructionRetiredEvent &>(Event);
+ for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I)
+ RegisterFiles[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I];
+ break;
+ }
+ case HWInstructionEvent::Dispatched: {
+ const auto &DE = static_cast<const HWInstructionDispatchedEvent &>(Event);
+ for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I) {
+ RegisterFileUsage &RFU = RegisterFiles[I];
+ unsigned NumUsedPhysRegs = DE.UsedPhysRegs[I];
+ RFU.CurrentlyUsedMappings += NumUsedPhysRegs;
+ RFU.TotalMappings += NumUsedPhysRegs;
+ RFU.MaxUsedMappings =
+ std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings);
+ }
+ }
+ }
+}
+
+void RegisterFileStatistics::printView(raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+
+ TempStream << "\n\nRegister File statistics:";
+ const RegisterFileUsage &GlobalUsage = RegisterFiles[0];
+ TempStream << "\nTotal number of mappings created: "
+ << GlobalUsage.TotalMappings;
+ TempStream << "\nMax number of mappings used: "
+ << GlobalUsage.MaxUsedMappings << '\n';
+
+ for (unsigned I = 1, E = RegisterFiles.size(); I < E; ++I) {
+ const RegisterFileUsage &RFU = RegisterFiles[I];
+ // Obtain the register file descriptor from the scheduling model.
+ assert(STI.getSchedModel().hasExtraProcessorInfo() &&
+ "Unable to find register file info!");
+ const MCExtraProcessorInfo &PI =
+ STI.getSchedModel().getExtraProcessorInfo();
+ assert(I <= PI.NumRegisterFiles && "Unexpected register file index!");
+ const MCRegisterFileDesc &RFDesc = PI.RegisterFiles[I];
+ // Skip invalid register files.
+ if (!RFDesc.NumPhysRegs)
+ continue;
+
+ TempStream << "\n* Register File #" << I;
+ TempStream << " -- " << StringRef(RFDesc.Name) << ':';
+ TempStream << "\n Number of physical registers: ";
+ if (!RFDesc.NumPhysRegs)
+ TempStream << "unbounded";
+ else
+ TempStream << RFDesc.NumPhysRegs;
+ TempStream << "\n Total number of mappings created: "
+ << RFU.TotalMappings;
+ TempStream << "\n Max number of mappings used: "
+ << RFU.MaxUsedMappings << '\n';
+ }
+
+ TempStream.flush();
+ OS << Buffer;
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/RegisterFileStatistics.h b/contrib/llvm/tools/llvm-mca/RegisterFileStatistics.h
new file mode 100644
index 000000000000..cbe816cd3332
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RegisterFileStatistics.h
@@ -0,0 +1,67 @@
+//===--------------------- RegisterFileStatistics.h -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This view collects and prints register file usage statistics.
+///
+/// Example (-mcpu=btver2):
+/// ========================
+///
+/// Register File statistics:
+/// Total number of mappings created: 6
+/// Max number of mappings used: 3
+///
+/// * Register File #1 -- FpuPRF:
+/// Number of physical registers: 72
+/// Total number of mappings created: 0
+/// Max number of mappings used: 0
+///
+/// * Register File #2 -- IntegerPRF:
+/// Number of physical registers: 64
+/// Total number of mappings created: 6
+/// Max number of mappings used: 3
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H
+
+#include "View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace mca {
+
+class RegisterFileStatistics : public View {
+ const llvm::MCSubtargetInfo &STI;
+
+ // Used to track the number of physical registers used in a register file.
+ struct RegisterFileUsage {
+ unsigned TotalMappings;
+ unsigned MaxUsedMappings;
+ unsigned CurrentlyUsedMappings;
+ };
+
+ // There is one entry for each register file implemented by the processor.
+ llvm::SmallVector<RegisterFileUsage, 4> RegisterFiles;
+
+ void initializeRegisterFileInfo();
+
+public:
+ RegisterFileStatistics(const llvm::MCSubtargetInfo &sti) : STI(sti) {
+ initializeRegisterFileInfo();
+ }
+
+ void onEvent(const HWInstructionEvent &Event) override;
+
+ void printView(llvm::raw_ostream &OS) const override;
+};
+} // namespace mca
+
+#endif
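
A minimal, hypothetical usage sketch. How views are wired into the tool's pipeline is not visible in this hunk, so the driver function below only exercises the constructor and printView() shown in the header; the helper name is an assumption.

#include "RegisterFileStatistics.h"
#include "llvm/Support/raw_ostream.h"

// Construct the view from the subtarget and print whatever it has collected.
// In the real tool the view would also receive HWInstructionEvent callbacks
// while the simulation runs; only the surface API is shown here.
static void printRegisterFileStats(const llvm::MCSubtargetInfo &STI) {
  mca::RegisterFileStatistics RFS(STI);
  RFS.printView(llvm::outs());
}
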
diff --git a/contrib/llvm/tools/llvm-mca/ResourcePressureView.cpp b/contrib/llvm/tools/llvm-mca/ResourcePressureView.cpp
new file mode 100644
index 000000000000..fe9d5b7fabc8
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/ResourcePressureView.cpp
@@ -0,0 +1,171 @@
+//===--------------------- ResourcePressureView.cpp -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods in the ResourcePressureView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ResourcePressureView.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void ResourcePressureView::initialize() {
+ // Populate the map of resource descriptors.
+ unsigned R2VIndex = 0;
+ const MCSchedModel &SM = STI.getSchedModel();
+ for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+ unsigned NumUnits = ProcResource.NumUnits;
+ // Skip groups and invalid resources with zero units.
+ if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+ continue;
+
+ Resource2VecIndex.insert(std::pair<unsigned, unsigned>(I, R2VIndex));
+ R2VIndex += ProcResource.NumUnits;
+ }
+
+ NumResourceUnits = R2VIndex;
+ ResourceUsage.resize(NumResourceUnits * (Source.size() + 1));
+ std::fill(ResourceUsage.begin(), ResourceUsage.end(), 0.0);
+}
+
+void ResourcePressureView::onEvent(const HWInstructionEvent &Event) {
+ // We're only interested in Issue events.
+ if (Event.Type != HWInstructionEvent::Issued)
+ return;
+ const auto &IssueEvent = static_cast<const HWInstructionIssuedEvent &>(Event);
+ const unsigned SourceIdx = Event.IR.getSourceIndex() % Source.size();
+ for (const std::pair<ResourceRef, double> &Use : IssueEvent.UsedResources) {
+ const ResourceRef &RR = Use.first;
+ assert(Resource2VecIndex.find(RR.first) != Resource2VecIndex.end());
+ unsigned R2VIndex = Resource2VecIndex[RR.first];
+ R2VIndex += countTrailingZeros(RR.second);
+ ResourceUsage[R2VIndex + NumResourceUnits * SourceIdx] += Use.second;
+ ResourceUsage[R2VIndex + NumResourceUnits * Source.size()] += Use.second;
+ }
+}
+
+static void printColumnNames(formatted_raw_ostream &OS,
+ const MCSchedModel &SM) {
+ unsigned Column = OS.getColumn();
+ for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds();
+ I < E; ++I) {
+ const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+ unsigned NumUnits = ProcResource.NumUnits;
+ // Skip groups and invalid resources with zero units.
+ if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+ continue;
+
+ for (unsigned J = 0; J < NumUnits; ++J) {
+ Column += 7;
+ OS << "[" << ResourceIndex;
+ if (NumUnits > 1)
+ OS << '.' << J;
+ OS << ']';
+ OS.PadToColumn(Column);
+ }
+
+ ResourceIndex++;
+ }
+}
+
+static void printResourcePressure(formatted_raw_ostream &OS, double Pressure,
+ unsigned Col) {
+ if (!Pressure || Pressure < 0.005) {
+ OS << " - ";
+ } else {
+ // Round the value to the nearest hundredth and then print it.
+ OS << format("%.2f", floor((Pressure * 100) + 0.5) / 100);
+ }
+ OS.PadToColumn(Col);
+}
+
+void ResourcePressureView::printResourcePressurePerIteration(
+ raw_ostream &OS, unsigned Executions) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ formatted_raw_ostream FOS(TempStream);
+
+ FOS << "\n\nResources:\n";
+ const MCSchedModel &SM = STI.getSchedModel();
+ for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds();
+ I < E; ++I) {
+ const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+ unsigned NumUnits = ProcResource.NumUnits;
+ // Skip groups and invalid resources with zero units.
+ if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+ continue;
+
+ for (unsigned J = 0; J < NumUnits; ++J) {
+ FOS << '[' << ResourceIndex;
+ if (NumUnits > 1)
+ FOS << '.' << J;
+ FOS << ']';
+ FOS.PadToColumn(6);
+ FOS << "- " << ProcResource.Name << '\n';
+ }
+
+ ResourceIndex++;
+ }
+
+ FOS << "\n\nResource pressure per iteration:\n";
+ FOS.flush();
+ printColumnNames(FOS, SM);
+ FOS << '\n';
+ FOS.flush();
+
+ for (unsigned I = 0, E = NumResourceUnits; I < E; ++I) {
+ double Usage = ResourceUsage[I + Source.size() * E];
+ printResourcePressure(FOS, Usage / Executions, (I + 1) * 7);
+ }
+
+ FOS.flush();
+ OS << Buffer;
+}
+
+void ResourcePressureView::printResourcePressurePerInstruction(
+ raw_ostream &OS, unsigned Executions) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ formatted_raw_ostream FOS(TempStream);
+
+ FOS << "\n\nResource pressure by instruction:\n";
+ printColumnNames(FOS, STI.getSchedModel());
+ FOS << "Instructions:\n";
+
+ std::string Instruction;
+ raw_string_ostream InstrStream(Instruction);
+
+ for (unsigned I = 0, E = Source.size(); I < E; ++I) {
+ for (unsigned J = 0; J < NumResourceUnits; ++J) {
+ double Usage = ResourceUsage[J + I * NumResourceUnits];
+ printResourcePressure(FOS, Usage / Executions, (J + 1) * 7);
+ }
+
+ MCIP.printInst(&Source.getMCInstFromIndex(I), InstrStream, "", STI);
+ InstrStream.flush();
+ StringRef Str(Instruction);
+
+ // Remove any tabs or spaces at the beginning of the instruction.
+ Str = Str.ltrim();
+
+ FOS << Str << '\n';
+ Instruction = "";
+
+ FOS.flush();
+ OS << Buffer;
+ Buffer = "";
+ }
+}
+} // namespace mca
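
An illustrative helper, assuming the same flat layout used by ResourceUsage above: the one-dimensional vector is addressed as a matrix with one row per instruction plus a final row of per-iteration totals. The helper itself is not part of the patch.

#include <vector>

// Index into a flat usage matrix: Row is the instruction index (with row
// 'NumInstructions' reserved for the cumulative totals) and Unit is the
// resource unit index. Mirrors the 'Unit + Row * NumUnits' addressing above.
static double &usageCell(std::vector<double> &Usage, unsigned Unit,
                         unsigned Row, unsigned NumUnits) {
  return Usage[Unit + Row * NumUnits];
}
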
diff --git a/contrib/llvm/tools/llvm-mca/ResourcePressureView.h b/contrib/llvm/tools/llvm-mca/ResourcePressureView.h
new file mode 100644
index 000000000000..fe1c6af5e6f6
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/ResourcePressureView.h
@@ -0,0 +1,109 @@
+//===--------------------- ResourcePressureView.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines class ResourcePressureView.
+/// Class ResourcePressureView observes hardware events generated by
+/// the Pipeline object and collects statistics related to resource usage at
+/// instruction granularity.
+/// Resource pressure information is then printed out to a stream in the
+/// form of a table like the one from the example below:
+///
+/// Resources:
+/// [0] - JALU0
+/// [1] - JALU1
+/// [2] - JDiv
+/// [3] - JFPM
+/// [4] - JFPU0
+/// [5] - JFPU1
+/// [6] - JLAGU
+/// [7] - JSAGU
+/// [8] - JSTC
+/// [9] - JVIMUL
+///
+/// Resource pressure per iteration:
+/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
+/// 0.00 0.00 0.00 0.00 2.00 2.00 0.00 0.00 0.00 0.00
+///
+/// Resource pressure by instruction:
+/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+/// - - - - - 1.00 - - - - vpermilpd $1, %xmm0,
+/// %xmm1
+/// - - - - 1.00 - - - - - vaddps %xmm0, %xmm1,
+/// %xmm2
+/// - - - - - 1.00 - - - - vmovshdup %xmm2, %xmm3
+/// - - - - 1.00 - - - - - vaddss %xmm2, %xmm3,
+/// %xmm4
+///
+/// In this example, we have AVX code executed on AMD Jaguar (btver2).
+/// Both shuffles and vector floating point add operations on XMM registers have
+/// a reciprocal throughput of 1cy.
+/// Each add is issued to pipeline JFPU0, while each shuffle is issued to
+/// pipeline JFPU1. The overall pressure per iteration is reported by two
+/// tables: the first smaller table is the resource pressure per iteration;
+/// the second table reports resource pressure per instruction. Values are the
+/// average resource cycles consumed by an instruction.
+/// Every vector add from the example uses resource JFPU0 for an average of 1cy
+/// per iteration. Consequently, the resource pressure on JFPU0 is 2cy per
+/// iteration.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+
+#include "SourceMgr.h"
+#include "View.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+/// This class collects resource pressure statistics and it is able to print
+/// out all the collected information as a table to an output stream.
+class ResourcePressureView : public View {
+ const llvm::MCSubtargetInfo &STI;
+ llvm::MCInstPrinter &MCIP;
+ const SourceMgr &Source;
+
+ // Map to quickly obtain the ResourceUsage column index from a processor
+ // resource ID.
+ llvm::DenseMap<unsigned, unsigned> Resource2VecIndex;
+
+ // Table of resources used by instructions.
+ std::vector<double> ResourceUsage;
+ unsigned NumResourceUnits;
+
+ const llvm::MCInst &GetMCInstFromIndex(unsigned Index) const;
+ void printResourcePressurePerIteration(llvm::raw_ostream &OS,
+ unsigned Executions) const;
+ void printResourcePressurePerInstruction(llvm::raw_ostream &OS,
+ unsigned Executions) const;
+ void initialize();
+
+public:
+ ResourcePressureView(const llvm::MCSubtargetInfo &sti,
+ llvm::MCInstPrinter &Printer, const SourceMgr &SM)
+ : STI(sti), MCIP(Printer), Source(SM) {
+ initialize();
+ }
+
+ void onEvent(const HWInstructionEvent &Event) override;
+
+ void printView(llvm::raw_ostream &OS) const override {
+ unsigned Executions = Source.getNumIterations();
+ printResourcePressurePerIteration(OS, Executions);
+ printResourcePressurePerInstruction(OS, Executions);
+ }
+};
+} // namespace mca
+
+#endif
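
A toy calculation (numbers invented) of how a cell in the per-iteration table is obtained: the accumulated resource cycles for one unit are divided by the number of simulated iterations before printing.

// Hypothetical numbers: 200 accumulated resource cycles over 100 iterations
// yield an average pressure of 2.00, which is what the table prints after
// the rounding done in printResourcePressure().
static double averagePressure(double AccumulatedCycles, unsigned Iterations) {
  return AccumulatedCycles / Iterations; // e.g. 200.0 / 100 == 2.00
}
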
diff --git a/contrib/llvm/tools/llvm-mca/RetireControlUnit.cpp b/contrib/llvm/tools/llvm-mca/RetireControlUnit.cpp
new file mode 100644
index 000000000000..123058541f28
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RetireControlUnit.cpp
@@ -0,0 +1,87 @@
+//===---------------------- RetireControlUnit.cpp ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file simulates the hardware responsible for retiring instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RetireControlUnit.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+RetireControlUnit::RetireControlUnit(const llvm::MCSchedModel &SM)
+ : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0),
+ AvailableSlots(SM.MicroOpBufferSize), MaxRetirePerCycle(0) {
+ // Check if the scheduling model provides extra information about the machine
+ // processor. If so, then use that information to set the reorder buffer size
+ // and the maximum number of instructions retired per cycle.
+ if (SM.hasExtraProcessorInfo()) {
+ const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+ if (EPI.ReorderBufferSize)
+ AvailableSlots = EPI.ReorderBufferSize;
+ MaxRetirePerCycle = EPI.MaxRetirePerCycle;
+ }
+
+ assert(AvailableSlots && "Invalid reorder buffer size!");
+ Queue.resize(AvailableSlots);
+}
+
+// Reserves a number of slots, and returns a new token.
+unsigned RetireControlUnit::reserveSlot(const InstRef &IR,
+ unsigned NumMicroOps) {
+ assert(isAvailable(NumMicroOps));
+ unsigned NormalizedQuantity =
+ std::min(NumMicroOps, static_cast<unsigned>(Queue.size()));
+ // Zero latency instructions may have zero mOps. Artificially bump this
+ // value to 1. Although zero latency instructions don't consume scheduler
+ // resources, they still consume one slot in the retire queue.
+ NormalizedQuantity = std::max(NormalizedQuantity, 1U);
+ unsigned TokenID = NextAvailableSlotIdx;
+ Queue[NextAvailableSlotIdx] = {IR, NormalizedQuantity, false};
+ NextAvailableSlotIdx += NormalizedQuantity;
+ NextAvailableSlotIdx %= Queue.size();
+ AvailableSlots -= NormalizedQuantity;
+ return TokenID;
+}
+
+const RetireControlUnit::RUToken &RetireControlUnit::peekCurrentToken() const {
+ return Queue[CurrentInstructionSlotIdx];
+}
+
+void RetireControlUnit::consumeCurrentToken() {
+ const RetireControlUnit::RUToken &Current = peekCurrentToken();
+ assert(Current.NumSlots && "Reserved zero slots?");
+ assert(Current.IR.isValid() && "Invalid RUToken in the RCU queue.");
+
+ // Update the slot index to be the next item in the circular queue.
+ CurrentInstructionSlotIdx += Current.NumSlots;
+ CurrentInstructionSlotIdx %= Queue.size();
+ AvailableSlots += Current.NumSlots;
+}
+
+void RetireControlUnit::onInstructionExecuted(unsigned TokenID) {
+ assert(Queue.size() > TokenID);
+ assert(Queue[TokenID].Executed == false && Queue[TokenID].IR.isValid());
+ Queue[TokenID].Executed = true;
+}
+
+#ifndef NDEBUG
+void RetireControlUnit::dump() const {
+ dbgs() << "Retire Unit: { Total Slots=" << Queue.size()
+ << ", Available Slots=" << AvailableSlots << " }\n";
+}
+#endif
+
+} // namespace mca
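
A toy walk-through (values invented) of the modular index arithmetic used by reserveSlot() and consumeCurrentToken(): with a 4-entry queue, reserving three micro-ops, retiring them, and then reserving two more wraps the next-available index around the buffer.

// Circular-buffer bookkeeping in isolation; the real RetireControlUnit also
// tracks AvailableSlots and the per-token Executed flag.
static unsigned nextSlotAfterReserving(unsigned Next, unsigned NumSlots,
                                       unsigned QueueSize) {
  return (Next + NumSlots) % QueueSize;
}
// Example: QueueSize = 4, start at index 0, reserve 3 -> index 3; after those
// slots are freed by retirement, reserving 2 more wraps the index to 1.
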
diff --git a/contrib/llvm/tools/llvm-mca/RetireControlUnit.h b/contrib/llvm/tools/llvm-mca/RetireControlUnit.h
new file mode 100644
index 000000000000..3530ff21ba0d
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RetireControlUnit.h
@@ -0,0 +1,98 @@
+//===---------------------- RetireControlUnit.h -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file simulates the hardware responsible for retiring instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
+#define LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
+
+#include "HardwareUnit.h"
+#include "Instruction.h"
+#include "llvm/MC/MCSchedule.h"
+#include <vector>
+
+namespace mca {
+
+/// This class tracks which instructions are in-flight (i.e., dispatched but not
+/// retired) in the OoO backend.
+//
+/// This class checks on every cycle if/which instructions can be retired.
+/// Instructions are retired in program order.
+/// In the event of an instruction being retired, the pipeline that owns
+/// this RetireControlUnit (RCU) gets notified.
+///
+/// On instruction retired, register updates are all architecturally
+/// committed, and any temporary registers originally allocated for the
+/// retired instruction are freed.
+struct RetireControlUnit : public HardwareUnit {
+ // A RUToken is created by the RCU for every instruction dispatched to the
+ // schedulers. These "tokens" are managed by the RCU in its token Queue.
+ //
+ // On every cycle ('cycleEvent'), the RCU iterates through the token queue
+ // looking for any token with its 'Executed' flag set. If a token has that
+ // flag set, then the instruction has reached the write-back stage and will
+ // be retired by the RCU.
+ //
+ // 'NumSlots' represents the number of entries consumed by the instruction in
+ // the reorder buffer. Those entries will become available again once the
+ // instruction is retired.
+ //
+ // Note that the size of the reorder buffer is defined by the scheduling
+ // model via field 'MicroOpBufferSize'.
+ struct RUToken {
+ InstRef IR;
+ unsigned NumSlots; // Slots reserved for this instruction.
+ bool Executed; // True if the instruction is past the WB stage.
+ };
+
+private:
+ unsigned NextAvailableSlotIdx;
+ unsigned CurrentInstructionSlotIdx;
+ unsigned AvailableSlots;
+ unsigned MaxRetirePerCycle; // 0 means no limit.
+ std::vector<RUToken> Queue;
+
+public:
+ RetireControlUnit(const llvm::MCSchedModel &SM);
+
+ bool isFull() const { return !AvailableSlots; }
+ bool isEmpty() const { return AvailableSlots == Queue.size(); }
+ bool isAvailable(unsigned Quantity = 1) const {
+ // Some instructions may declare a number of uOps which exceeds the size
+ // of the reorder buffer. To avoid problems, cap the amount of slots to
+ // the size of the reorder buffer.
+ Quantity = std::min(Quantity, static_cast<unsigned>(Queue.size()));
+ return AvailableSlots >= Quantity;
+ }
+
+ unsigned getMaxRetirePerCycle() const { return MaxRetirePerCycle; }
+
+ // Reserves a number of slots, and returns a new token.
+ unsigned reserveSlot(const InstRef &IS, unsigned NumMicroOps);
+
+ // Return the current token from the RCU's circular token queue.
+ const RUToken &peekCurrentToken() const;
+
+ // Advance the pointer to the next token in the circular token queue.
+ void consumeCurrentToken();
+
+ // Update the RCU token to represent the executed state.
+ void onInstructionExecuted(unsigned TokenID);
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
diff --git a/contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.cpp b/contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.cpp
new file mode 100644
index 000000000000..edb855e11e84
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.cpp
@@ -0,0 +1,49 @@
+//===--------------------- RetireControlUnitStatistics.cpp ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the RetireControlUnitStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RetireControlUnitStatistics.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) {
+ if (Event.Type == HWInstructionEvent::Retired)
+ ++NumRetired;
+}
+
+void RetireControlUnitStatistics::printView(llvm::raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ TempStream << "\n\nRetire Control Unit - "
+ << "number of cycles where we saw N instructions retired:\n";
+ TempStream << "[# retired], [# cycles]\n";
+
+ for (const std::pair<unsigned, unsigned> &Entry : RetiredPerCycle) {
+ TempStream << " " << Entry.first;
+ if (Entry.first < 10)
+ TempStream << ", ";
+ else
+ TempStream << ", ";
+ TempStream << Entry.second << " ("
+ << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+ << "%)\n";
+ }
+
+ TempStream.flush();
+ OS << Buffer;
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.h b/contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.h
new file mode 100644
index 000000000000..1f03e7efe889
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RetireControlUnitStatistics.h
@@ -0,0 +1,60 @@
+//===--------------------- RetireControlUnitStatistics.h --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines class RetireControlUnitStatistics: a view that knows how
+/// to print general statistics related to the retire control unit.
+///
+/// Example:
+/// ========
+///
+/// Retire Control Unit - number of cycles where we saw N instructions retired:
+/// [# retired], [# cycles]
+/// 0, 9 (6.9%)
+/// 1, 6 (4.6%)
+/// 2, 1 (0.8%)
+/// 4, 3 (2.3%)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H
+
+#include "View.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+class RetireControlUnitStatistics : public View {
+ using Histogram = std::map<unsigned, unsigned>;
+ Histogram RetiredPerCycle;
+
+ unsigned NumRetired;
+ unsigned NumCycles;
+
+ void updateHistograms() {
+ RetiredPerCycle[NumRetired]++;
+ NumRetired = 0;
+ }
+
+public:
+ RetireControlUnitStatistics() : NumRetired(0), NumCycles(0) {}
+
+ void onEvent(const HWInstructionEvent &Event) override;
+
+ void onCycleBegin() override { NumCycles++; }
+
+ void onCycleEnd() override { updateHistograms(); }
+
+ void printView(llvm::raw_ostream &OS) const override;
+};
+} // namespace mca
+
+#endif
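
A back-of-the-envelope check of the percentage printed by printView(); the total cycle count is inferred from the example output in the header above, so treat it as an assumption.

// If 9 of roughly 130 simulated cycles retired zero instructions, the view
// prints "0, 9 (6.9%)"; likewise 6 such cycles give the 4.6% line.
static double retirePercentage(unsigned CyclesWithCount, unsigned NumCycles) {
  return static_cast<double>(CyclesWithCount) / NumCycles * 100.0;
}
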
diff --git a/contrib/llvm/tools/llvm-mca/RetireStage.cpp b/contrib/llvm/tools/llvm-mca/RetireStage.cpp
new file mode 100644
index 000000000000..386ec54d7ba3
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RetireStage.cpp
@@ -0,0 +1,55 @@
+//===---------------------- RetireStage.cpp ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the retire stage of an instruction pipeline.
+/// The RetireStage represents the process logic that interacts with the
+/// simulated RetireControlUnit hardware.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RetireStage.h"
+#include "HWEventListener.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+void RetireStage::cycleStart() {
+ if (RCU.isEmpty())
+ return;
+
+ const unsigned MaxRetirePerCycle = RCU.getMaxRetirePerCycle();
+ unsigned NumRetired = 0;
+ while (!RCU.isEmpty()) {
+ if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle)
+ break;
+ const RetireControlUnit::RUToken &Current = RCU.peekCurrentToken();
+ if (!Current.Executed)
+ break;
+ RCU.consumeCurrentToken();
+ notifyInstructionRetired(Current.IR);
+ NumRetired++;
+ }
+}
+
+void RetireStage::notifyInstructionRetired(const InstRef &IR) {
+ LLVM_DEBUG(dbgs() << "[E] Instruction Retired: #" << IR << '\n');
+ SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+
+ for (const std::unique_ptr<WriteState> &WS : IR.getInstruction()->getDefs())
+ PRF.removeRegisterWrite(*WS.get(), FreedRegs, !Desc.isZeroLatency());
+ notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/RetireStage.h b/contrib/llvm/tools/llvm-mca/RetireStage.h
new file mode 100644
index 000000000000..8cf672d92c6e
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/RetireStage.h
@@ -0,0 +1,48 @@
+//===---------------------- RetireStage.h -----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the retire stage of an instruction pipeline.
+/// The RetireStage represents the process logic that interacts with the
+/// simulated RetireControlUnit hardware.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
+#define LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
+
+#include "RegisterFile.h"
+#include "RetireControlUnit.h"
+#include "Stage.h"
+
+namespace mca {
+
+class RetireStage : public Stage {
+ // Owner will go away when we move listeners/eventing to the stages.
+ RetireControlUnit &RCU;
+ RegisterFile &PRF;
+
+public:
+ RetireStage(RetireControlUnit &R, RegisterFile &F)
+ : Stage(), RCU(R), PRF(F) {}
+ RetireStage(const RetireStage &Other) = delete;
+ RetireStage &operator=(const RetireStage &Other) = delete;
+
+ virtual bool hasWorkToComplete() const override final {
+ return !RCU.isEmpty();
+ }
+ virtual void cycleStart() override final;
+ virtual bool execute(InstRef &IR) override final { return true; }
+ void notifyInstructionRetired(const InstRef &IR);
+ void onInstructionExecuted(unsigned TokenID);
+};
+
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
diff --git a/contrib/llvm/tools/llvm-mca/Scheduler.cpp b/contrib/llvm/tools/llvm-mca/Scheduler.cpp
new file mode 100644
index 000000000000..975a50e4b638
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Scheduler.cpp
@@ -0,0 +1,403 @@
+//===--------------------- Scheduler.cpp ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A scheduler for processor resource units and processor resource groups.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Scheduler.h"
+#include "Support.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+uint64_t ResourceState::selectNextInSequence() {
+ assert(isReady());
+ uint64_t Next = getNextInSequence();
+ while (!isSubResourceReady(Next)) {
+ updateNextInSequence();
+ Next = getNextInSequence();
+ }
+ return Next;
+}
+
+#ifndef NDEBUG
+void ResourceState::dump() const {
+ dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask
+ << ", NEXT: " << NextInSequenceMask << ", RDYMASK: " << ReadyMask
+ << ", BufferSize=" << BufferSize
+ << ", AvailableSlots=" << AvailableSlots
+ << ", Reserved=" << Unavailable << '\n';
+}
+#endif
+
+void ResourceManager::initialize(const llvm::MCSchedModel &SM) {
+ computeProcResourceMasks(SM, ProcResID2Mask);
+ for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I)
+ addResource(*SM.getProcResource(I), I, ProcResID2Mask[I]);
+}
+
+// Adds a new resource state in Resources, as well as a new descriptor in
+// ResourceDescriptor. Map 'Resources' allows us to quickly obtain ResourceState
+// objects from resource mask identifiers.
+void ResourceManager::addResource(const MCProcResourceDesc &Desc,
+ unsigned Index, uint64_t Mask) {
+ assert(Resources.find(Mask) == Resources.end() && "Resource already added!");
+ Resources[Mask] = llvm::make_unique<ResourceState>(Desc, Index, Mask);
+}
+
+// Returns the actual resource consumed by this Use.
+// The first element is the primary resource ID; the second is the specific
+// sub-resource ID.
+std::pair<uint64_t, uint64_t> ResourceManager::selectPipe(uint64_t ResourceID) {
+ ResourceState &RS = *Resources[ResourceID];
+ uint64_t SubResourceID = RS.selectNextInSequence();
+ if (RS.isAResourceGroup())
+ return selectPipe(SubResourceID);
+ return std::pair<uint64_t, uint64_t>(ResourceID, SubResourceID);
+}
+
+void ResourceState::removeFromNextInSequence(uint64_t ID) {
+ assert(NextInSequenceMask);
+ assert(countPopulation(ID) == 1);
+ if (ID > getNextInSequence())
+ RemovedFromNextInSequence |= ID;
+ NextInSequenceMask = NextInSequenceMask & (~ID);
+ if (!NextInSequenceMask) {
+ NextInSequenceMask = ResourceSizeMask;
+ assert(NextInSequenceMask != RemovedFromNextInSequence);
+ NextInSequenceMask ^= RemovedFromNextInSequence;
+ RemovedFromNextInSequence = 0;
+ }
+}
+
+void ResourceManager::use(ResourceRef RR) {
+ // Mark the sub-resource referenced by RR as used.
+ ResourceState &RS = *Resources[RR.first];
+ RS.markSubResourceAsUsed(RR.second);
+ // If there are still available units in RR.first,
+ // then we are done.
+ if (RS.isReady())
+ return;
+
+ // Notify other resources that RR.first is no longer available.
+ for (const std::pair<uint64_t, UniqueResourceState> &Res : Resources) {
+ ResourceState &Current = *Res.second.get();
+ if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+ continue;
+
+ if (Current.containsResource(RR.first)) {
+ Current.markSubResourceAsUsed(RR.first);
+ Current.removeFromNextInSequence(RR.first);
+ }
+ }
+}
+
+void ResourceManager::release(ResourceRef RR) {
+ ResourceState &RS = *Resources[RR.first];
+ bool WasFullyUsed = !RS.isReady();
+ RS.releaseSubResource(RR.second);
+ if (!WasFullyUsed)
+ return;
+
+ for (const std::pair<uint64_t, UniqueResourceState> &Res : Resources) {
+ ResourceState &Current = *Res.second.get();
+ if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+ continue;
+
+ if (Current.containsResource(RR.first))
+ Current.releaseSubResource(RR.first);
+ }
+}
+
+ResourceStateEvent
+ResourceManager::canBeDispatched(ArrayRef<uint64_t> Buffers) const {
+ ResourceStateEvent Result = ResourceStateEvent::RS_BUFFER_AVAILABLE;
+ for (uint64_t Buffer : Buffers) {
+ Result = isBufferAvailable(Buffer);
+ if (Result != ResourceStateEvent::RS_BUFFER_AVAILABLE)
+ break;
+ }
+ return Result;
+}
+
+void ResourceManager::reserveBuffers(ArrayRef<uint64_t> Buffers) {
+ for (const uint64_t R : Buffers) {
+ reserveBuffer(R);
+ ResourceState &Resource = *Resources[R];
+ if (Resource.isADispatchHazard()) {
+ assert(!Resource.isReserved());
+ Resource.setReserved();
+ }
+ }
+}
+
+void ResourceManager::releaseBuffers(ArrayRef<uint64_t> Buffers) {
+ for (const uint64_t R : Buffers)
+ releaseBuffer(R);
+}
+
+bool ResourceManager::canBeIssued(const InstrDesc &Desc) const {
+ return std::all_of(Desc.Resources.begin(), Desc.Resources.end(),
+ [&](const std::pair<uint64_t, const ResourceUsage> &E) {
+ unsigned NumUnits =
+ E.second.isReserved() ? 0U : E.second.NumUnits;
+ return isReady(E.first, NumUnits);
+ });
+}
+
+// Returns true if all resources are in-order, and there is at least one
+// resource which is a dispatch hazard (BufferSize = 0).
+bool ResourceManager::mustIssueImmediately(const InstrDesc &Desc) {
+ if (!canBeIssued(Desc))
+ return false;
+ bool AllInOrderResources = all_of(Desc.Buffers, [&](uint64_t BufferMask) {
+ const ResourceState &Resource = *Resources[BufferMask];
+ return Resource.isInOrder() || Resource.isADispatchHazard();
+ });
+ if (!AllInOrderResources)
+ return false;
+
+ return any_of(Desc.Buffers, [&](uint64_t BufferMask) {
+ return Resources[BufferMask]->isADispatchHazard();
+ });
+}
+
+void ResourceManager::issueInstruction(
+ const InstrDesc &Desc,
+ SmallVectorImpl<std::pair<ResourceRef, double>> &Pipes) {
+ for (const std::pair<uint64_t, ResourceUsage> &R : Desc.Resources) {
+ const CycleSegment &CS = R.second.CS;
+ if (!CS.size()) {
+ releaseResource(R.first);
+ continue;
+ }
+
+ assert(CS.begin() == 0 && "Invalid {Start, End} cycles!");
+ if (!R.second.isReserved()) {
+ ResourceRef Pipe = selectPipe(R.first);
+ use(Pipe);
+ BusyResources[Pipe] += CS.size();
+ // Replace the resource mask with a valid processor resource index.
+ const ResourceState &RS = *Resources[Pipe.first];
+ Pipe.first = RS.getProcResourceID();
+ Pipes.emplace_back(
+ std::pair<ResourceRef, double>(Pipe, static_cast<double>(CS.size())));
+ } else {
+ assert((countPopulation(R.first) > 1) && "Expected a group!");
+ // Mark this group as reserved.
+ assert(R.second.isReserved());
+ reserveResource(R.first);
+ BusyResources[ResourceRef(R.first, R.first)] += CS.size();
+ }
+ }
+}
+
+void ResourceManager::cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed) {
+ for (std::pair<ResourceRef, unsigned> &BR : BusyResources) {
+ if (BR.second)
+ BR.second--;
+ if (!BR.second) {
+ // Release this resource.
+ const ResourceRef &RR = BR.first;
+
+ if (countPopulation(RR.first) == 1)
+ release(RR);
+
+ releaseResource(RR.first);
+ ResourcesFreed.push_back(RR);
+ }
+ }
+
+ for (const ResourceRef &RF : ResourcesFreed)
+ BusyResources.erase(RF);
+}
+
+#ifndef NDEBUG
+void Scheduler::dump() const {
+ dbgs() << "[SCHEDULER]: WaitQueue size is: " << WaitQueue.size() << '\n';
+ dbgs() << "[SCHEDULER]: ReadyQueue size is: " << ReadyQueue.size() << '\n';
+ dbgs() << "[SCHEDULER]: IssuedQueue size is: " << IssuedQueue.size() << '\n';
+ Resources->dump();
+}
+#endif
+
+bool Scheduler::canBeDispatched(const InstRef &IR,
+ HWStallEvent::GenericEventType &Event) const {
+ Event = HWStallEvent::Invalid;
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+
+ if (Desc.MayLoad && LSU->isLQFull())
+ Event = HWStallEvent::LoadQueueFull;
+ else if (Desc.MayStore && LSU->isSQFull())
+ Event = HWStallEvent::StoreQueueFull;
+ else {
+ switch (Resources->canBeDispatched(Desc.Buffers)) {
+ default:
+ return true;
+ case ResourceStateEvent::RS_BUFFER_UNAVAILABLE:
+ Event = HWStallEvent::SchedulerQueueFull;
+ break;
+ case ResourceStateEvent::RS_RESERVED:
+ Event = HWStallEvent::DispatchGroupStall;
+ }
+ }
+
+ return false;
+}
+
+void Scheduler::issueInstructionImpl(
+ InstRef &IR,
+ SmallVectorImpl<std::pair<ResourceRef, double>> &UsedResources) {
+ Instruction *IS = IR.getInstruction();
+ const InstrDesc &D = IS->getDesc();
+
+ // Issue the instruction and collect all the consumed resources
+ // into a vector. That vector is then used to notify the listener.
+ Resources->issueInstruction(D, UsedResources);
+
+ // Notify the instruction that it started executing.
+ // This updates the internal state of each write.
+ IS->execute();
+
+ if (IS->isExecuting())
+ IssuedQueue[IR.getSourceIndex()] = IS;
+}
+
+// Release the buffered resources and issue the instruction.
+void Scheduler::issueInstruction(
+ InstRef &IR,
+ SmallVectorImpl<std::pair<ResourceRef, double>> &UsedResources) {
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ releaseBuffers(Desc.Buffers);
+ issueInstructionImpl(IR, UsedResources);
+}
+
+void Scheduler::promoteToReadyQueue(SmallVectorImpl<InstRef> &Ready) {
+ // Scan the set of waiting instructions and promote them to the
+ // ready queue if operands are all ready.
+ for (auto I = WaitQueue.begin(), E = WaitQueue.end(); I != E;) {
+ const unsigned IID = I->first;
+ Instruction *IS = I->second;
+
+ // Check if this instruction is now ready. If not, force a state
+ // transition using method 'update()'.
+ if (!IS->isReady())
+ IS->update();
+
+ const InstrDesc &Desc = IS->getDesc();
+ bool IsMemOp = Desc.MayLoad || Desc.MayStore;
+ if (!IS->isReady() || (IsMemOp && !LSU->isReady({IID, IS}))) {
+ ++I;
+ continue;
+ }
+
+ Ready.emplace_back(IID, IS);
+ ReadyQueue[IID] = IS;
+ auto ToRemove = I;
+ ++I;
+ WaitQueue.erase(ToRemove);
+ }
+}
+
+InstRef Scheduler::select() {
+ // Find the oldest ready-to-issue instruction in the ReadyQueue.
+ auto It = std::find_if(ReadyQueue.begin(), ReadyQueue.end(),
+ [&](const QueueEntryTy &Entry) {
+ const InstrDesc &D = Entry.second->getDesc();
+ return Resources->canBeIssued(D);
+ });
+
+ if (It == ReadyQueue.end())
+ return {0, nullptr};
+
+ // We want to prioritize older instructions over younger instructions to
+ // minimize the pressure on the reorder buffer. We also want to rank
+ // instructions with more users higher, to better expose ILP.
+
+ // Compute a rank value based on the age of an instruction (i.e. its source
+ // index) and its number of users. The lower the rank value, the better.
+ int Rank = It->first - It->second->getNumUsers();
+ for (auto I = It, E = ReadyQueue.end(); I != E; ++I) {
+ int CurrentRank = I->first - I->second->getNumUsers();
+ if (CurrentRank < Rank) {
+ const InstrDesc &D = I->second->getDesc();
+ if (Resources->canBeIssued(D))
+ It = I;
+ }
+ }
+
+ // We found an instruction to issue.
+ InstRef IR(It->first, It->second);
+ ReadyQueue.erase(It);
+ return IR;
+}
+
+void Scheduler::updatePendingQueue(SmallVectorImpl<InstRef> &Ready) {
+ // Notify instructions in the pending queue that a new cycle just
+ // started.
+ for (QueueEntryTy Entry : WaitQueue)
+ Entry.second->cycleEvent();
+ promoteToReadyQueue(Ready);
+}
+
+void Scheduler::updateIssuedQueue(SmallVectorImpl<InstRef> &Executed) {
+ for (auto I = IssuedQueue.begin(), E = IssuedQueue.end(); I != E;) {
+ const QueueEntryTy Entry = *I;
+ Instruction *IS = Entry.second;
+ IS->cycleEvent();
+ if (IS->isExecuted()) {
+ Executed.push_back({Entry.first, Entry.second});
+ auto ToRemove = I;
+ ++I;
+ IssuedQueue.erase(ToRemove);
+ } else {
+ LLVM_DEBUG(dbgs() << "[SCHEDULER]: Instruction #" << Entry.first
+ << " is still executing.\n");
+ ++I;
+ }
+ }
+}
+
+void Scheduler::onInstructionExecuted(const InstRef &IR) {
+ LSU->onInstructionExecuted(IR);
+}
+
+void Scheduler::reclaimSimulatedResources(SmallVectorImpl<ResourceRef> &Freed) {
+ Resources->cycleEvent(Freed);
+}
+
+bool Scheduler::reserveResources(InstRef &IR) {
+ // If necessary, reserve queue entries in the load-store unit (LSU).
+ const bool Reserved = LSU->reserve(IR);
+ if (!IR.getInstruction()->isReady() || (Reserved && !LSU->isReady(IR))) {
+ LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR << " to the Wait Queue\n");
+ WaitQueue[IR.getSourceIndex()] = IR.getInstruction();
+ return false;
+ }
+ return true;
+}
+
+bool Scheduler::issueImmediately(InstRef &IR) {
+ const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ if (!Desc.isZeroLatency() && !Resources->mustIssueImmediately(Desc)) {
+ LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR
+ << " to the Ready Queue\n");
+ ReadyQueue[IR.getSourceIndex()] = IR.getInstruction();
+ return false;
+ }
+ return true;
+}
+
+} // namespace mca
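
A toy illustration (invented values) of the ranking heuristic in Scheduler::select(): the rank is the source index minus the number of users, and the candidate with the lowest rank whose resources can be issued wins.

// Rank = age (source index) - number of users; lower is better. An older
// instruction, or a younger one with many users, gets picked first.
struct Candidate {
  unsigned SourceIndex;
  unsigned NumUsers;
};

static int rankOf(const Candidate &C) {
  return static_cast<int>(C.SourceIndex) - static_cast<int>(C.NumUsers);
}
// Example: rankOf({2, 0}) == 2 and rankOf({5, 4}) == 1, so the instruction at
// source index 5 would be selected, provided its resources are available.
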
diff --git a/contrib/llvm/tools/llvm-mca/Scheduler.h b/contrib/llvm/tools/llvm-mca/Scheduler.h
new file mode 100644
index 000000000000..428fbc01707d
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Scheduler.h
@@ -0,0 +1,515 @@
+//===--------------------- Scheduler.h ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A scheduler for Processor Resource Units and Processor Resource Groups.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+#define LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+
+#include "HWEventListener.h"
+#include "HardwareUnit.h"
+#include "Instruction.h"
+#include "LSUnit.h"
+#include "RetireControlUnit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+/// Used to notify the internal state of a processor resource.
+///
+/// A processor resource is available if it is not reserved, and there are
+/// available slots in the buffer. A processor resource is unavailable if it
+/// is either reserved, or the associated buffer is full. A processor resource
+/// with a buffer size of -1 is always available if it is not reserved.
+///
+/// Values of type ResourceStateEvent are returned by method
+/// ResourceState::isBufferAvailable(), which is used to query the internal
+/// state of a resource.
+///
+/// The naming convention for resource state events is:
+/// * Event names start with prefix RS_
+/// * Prefix RS_ is followed by a string describing the actual resource state.
+enum ResourceStateEvent {
+ RS_BUFFER_AVAILABLE,
+ RS_BUFFER_UNAVAILABLE,
+ RS_RESERVED
+};
+
+/// A descriptor for processor resources.
+///
+/// Each object of class ResourceState is associated to a specific processor
+/// resource. There is an instance of this class for every processor resource
+/// defined by the scheduling model.
+/// A ResourceState dynamically tracks the availability of units of a processor
+/// resource. For example, the ResourceState of a ProcResGroup tracks the
+/// availability of resource units which are part of the group.
+///
+/// Internally, ResourceState uses a round-robin selector to identify
+/// which unit of the group shall be used next.
+class ResourceState {
+ // Index to the MCProcResourceDesc in the processor Model.
+ unsigned ProcResourceDescIndex;
+ // A resource mask. This is generated by the tool with the help of
+ // function `mca::computeProcResourceMasks' (see Support.h).
+ uint64_t ResourceMask;
+
+ // A ProcResource can specify a number of units. For the purpose of dynamic
+ // scheduling, a processor resource with more than one unit behaves like a
+ // group. This field has one bit set for every unit/resource that is part of
+ // the group.
+ // For groups, this field defaults to 'ResourceMask'. For non-group
+ // resources, the number of bits set in this mask is equivalent to the
+ // number of units (i.e. field 'NumUnits' in 'ProcResourceUnits').
+ uint64_t ResourceSizeMask;
+
+ // A simple round-robin selector for processor resources.
+ // Each bit of the mask identifies a sub resource within this group.
+ //
+ // As an example, let's assume that this ResourceState describes a
+ // processor resource group composed of the following three units:
+ // ResourceA -- 0b001
+ // ResourceB -- 0b010
+ // ResourceC -- 0b100
+ //
+ // Each unit is identified by a ResourceMask which always contains a
+ // single bit set. Field NextInSequenceMask is initially set to value
+ // 0b111. That value is obtained by OR'ing the resource masks of the
+ // processor resources that are part of the group.
+ //
+ // NextInSequenceMask -- 0b111
+ //
+ // Field NextInSequenceMask is used by the resource manager (i.e.
+ // an object of class ResourceManager) to select the "next available resource"
+ // from the set. The algorithm would prioritize resources with a bigger
+ // ResourceMask value.
+ //
+ // In this example, there are three resources in the set, and 'ResourceC'
+ // has the highest mask value. The round-robin selector would firstly select
+ // 'ResourceC', then 'ResourceB', and eventually 'ResourceA'.
+ //
+ // When a resource R is used, its corresponding bit is cleared from the set.
+ //
+ // Back to the example:
+ // If 'ResourceC' is selected, then the new value of NextInSequenceMask
+ // becomes 0b011.
+ //
+ // When NextInSequenceMask becomes zero, it is reset to its original value
+ // (in this example, that value would be 0b111).
+ uint64_t NextInSequenceMask;
+
+ // Some instructions can only be issued on very specific pipeline resources.
+ // For those instructions, we know exactly which resource would be consumed
+ // without having to dynamically select it using field 'NextInSequenceMask'.
+ //
+ // The resource mask bit associated to the (statically) selected
+ // processor resource is still cleared from the 'NextInSequenceMask'.
+ // If that bit was already zero in NextInSequenceMask, then we update
+ // mask 'RemovedFromNextInSequence'.
+ //
+ // When NextInSequenceMask is reset back to its initial value, the algorithm
+ // removes any bits which are set in RemovedFromNextInSequence.
+ uint64_t RemovedFromNextInSequence;
+
+ // A mask of ready units.
+ uint64_t ReadyMask;
+
+ // Buffered resources will have this field set to a positive number. A
+ // buffered resource behaves like a separate reservation station
+ // implementing its own buffer for out-of-order execution.
+ // A buffer of 1 is for units that force in-order execution.
+ // A value of 0 is treated specially. In particular, a resource with
+ // a BufferSize = 0 is for an in-order issue/dispatch resource.
+ // That means, this resource is reserved starting from the dispatch event,
+ // until all the "resource cycles" are consumed after the issue event.
+ // While this resource is reserved, no other instruction may be dispatched.
+ int BufferSize;
+
+ // Available slots in the buffer (zero, if this is not a buffered resource).
+ unsigned AvailableSlots;
+
+ // True if this resource is currently unavailable.
+ // An instruction may "reserve" a resource for a number of cycles.
+ // During those cycles, the reserved resource cannot be used for other
+ // instructions, even if the ReadyMask is set.
+ bool Unavailable;
+
+ bool isSubResourceReady(uint64_t ID) const { return ReadyMask & ID; }
+
+ /// Returns the mask identifier of the next available resource in the set.
+ uint64_t getNextInSequence() const {
+ assert(NextInSequenceMask);
+ return llvm::PowerOf2Floor(NextInSequenceMask);
+ }
+
+ /// Returns the mask of the next available resource within the set,
+ /// and updates the resource selector.
+ void updateNextInSequence() {
+ NextInSequenceMask ^= getNextInSequence();
+ if (!NextInSequenceMask)
+ NextInSequenceMask = ResourceSizeMask;
+ }
+
+ uint64_t computeResourceSizeMaskForGroup(uint64_t ResourceMask) {
+ assert(llvm::countPopulation(ResourceMask) > 1);
+ return ResourceMask ^ llvm::PowerOf2Floor(ResourceMask);
+ }
+
+public:
+ ResourceState(const llvm::MCProcResourceDesc &Desc, unsigned Index,
+ uint64_t Mask)
+ : ProcResourceDescIndex(Index), ResourceMask(Mask) {
+ bool IsAGroup = llvm::countPopulation(ResourceMask) > 1;
+ ResourceSizeMask = IsAGroup ? computeResourceSizeMaskForGroup(ResourceMask)
+ : ((1ULL << Desc.NumUnits) - 1);
+ NextInSequenceMask = ResourceSizeMask;
+ RemovedFromNextInSequence = 0;
+ ReadyMask = ResourceSizeMask;
+ BufferSize = Desc.BufferSize;
+ AvailableSlots = BufferSize == -1 ? 0U : static_cast<unsigned>(BufferSize);
+ Unavailable = false;
+ }
+
+ unsigned getProcResourceID() const { return ProcResourceDescIndex; }
+ uint64_t getResourceMask() const { return ResourceMask; }
+ int getBufferSize() const { return BufferSize; }
+
+ bool isBuffered() const { return BufferSize > 0; }
+ bool isInOrder() const { return BufferSize == 1; }
+ bool isADispatchHazard() const { return BufferSize == 0; }
+ bool isReserved() const { return Unavailable; }
+
+ void setReserved() { Unavailable = true; }
+ void clearReserved() { Unavailable = false; }
+
+ // A resource is ready if it is not reserved, and if there are enough
+ // available units.
+ // If a resource is also a dispatch hazard, then we don't check if
+ // it is reserved because that check would always return true.
+ // A resource marked as "dispatch hazard" is always reserved at
+ // dispatch time. When this method is called, the assumption is that
+ // the user of this resource has been already dispatched.
+ bool isReady(unsigned NumUnits = 1) const {
+ return (!isReserved() || isADispatchHazard()) &&
+ llvm::countPopulation(ReadyMask) >= NumUnits;
+ }
+ bool isAResourceGroup() const {
+ return llvm::countPopulation(ResourceMask) > 1;
+ }
+
+ bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
+
+ void markSubResourceAsUsed(uint64_t ID) {
+ assert(isSubResourceReady(ID));
+ ReadyMask ^= ID;
+ }
+
+ void releaseSubResource(uint64_t ID) {
+ assert(!isSubResourceReady(ID));
+ ReadyMask ^= ID;
+ }
+
+ unsigned getNumUnits() const {
+ return isAResourceGroup() ? 1U : llvm::countPopulation(ResourceSizeMask);
+ }
+
+ uint64_t selectNextInSequence();
+ void removeFromNextInSequence(uint64_t ID);
+
+ ResourceStateEvent isBufferAvailable() const {
+ if (isADispatchHazard() && isReserved())
+ return RS_RESERVED;
+ if (!isBuffered() || AvailableSlots)
+ return RS_BUFFER_AVAILABLE;
+ return RS_BUFFER_UNAVAILABLE;
+ }
+
+ void reserveBuffer() {
+ if (AvailableSlots)
+ AvailableSlots--;
+ }
+
+ void releaseBuffer() {
+ if (BufferSize > 0)
+ AvailableSlots++;
+ assert(AvailableSlots <= static_cast<unsigned>(BufferSize));
+ }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+/// A resource unit identifier.
+///
+/// This is used to identify a specific processor resource unit using a pair
+/// of indices where the 'first' index is a processor resource mask, and the
+/// 'second' index is an index for a "sub-resource" (i.e. unit).
+typedef std::pair<uint64_t, uint64_t> ResourceRef;
+
+// First: a MCProcResourceDesc index identifying a buffered resource.
+// Second: max number of buffer entries used in this resource.
+typedef std::pair<unsigned, unsigned> BufferUsageEntry;
+
+/// A resource manager for processor resource units and groups.
+///
+/// This class owns all the ResourceState objects, and it is responsible for
+/// acting on requests from a Scheduler by updating the internal state of
+/// ResourceState objects.
+/// This class doesn't know about instruction itineraries and functional units.
+/// In future, it can be extended to support itineraries too through the same
+/// public interface.
+class ResourceManager {
+ // The resource manager owns all the ResourceState.
+ using UniqueResourceState = std::unique_ptr<ResourceState>;
+ llvm::SmallDenseMap<uint64_t, UniqueResourceState> Resources;
+
+ // Keeps track of which resources are busy, and how many cycles are left
+ // before those become usable again.
+ llvm::SmallDenseMap<ResourceRef, unsigned> BusyResources;
+
+ // A table to map processor resource IDs to processor resource masks.
+ llvm::SmallVector<uint64_t, 8> ProcResID2Mask;
+
+ // Adds a new resource state in Resources, as well as a new descriptor in
+ // ResourceDescriptor.
+ void addResource(const llvm::MCProcResourceDesc &Desc, unsigned Index,
+ uint64_t Mask);
+
+ // Populate resource descriptors.
+ void initialize(const llvm::MCSchedModel &SM);
+
+ // Returns the actual resource unit that will be used.
+ ResourceRef selectPipe(uint64_t ResourceID);
+
+ void use(ResourceRef RR);
+ void release(ResourceRef RR);
+
+ unsigned getNumUnits(uint64_t ResourceID) const {
+ assert(Resources.find(ResourceID) != Resources.end());
+ return Resources.find(ResourceID)->getSecond()->getNumUnits();
+ }
+
+ // Reserve a specific Resource kind.
+ void reserveBuffer(uint64_t ResourceID) {
+ assert(isBufferAvailable(ResourceID) ==
+ ResourceStateEvent::RS_BUFFER_AVAILABLE);
+ ResourceState &Resource = *Resources[ResourceID];
+ Resource.reserveBuffer();
+ }
+
+ void releaseBuffer(uint64_t ResourceID) {
+ Resources[ResourceID]->releaseBuffer();
+ }
+
+ ResourceStateEvent isBufferAvailable(uint64_t ResourceID) const {
+ const ResourceState &Resource = *Resources.find(ResourceID)->second;
+ return Resource.isBufferAvailable();
+ }
+
+ bool isReady(uint64_t ResourceID, unsigned NumUnits) const {
+ const ResourceState &Resource = *Resources.find(ResourceID)->second;
+ return Resource.isReady(NumUnits);
+ }
+
+public:
+ ResourceManager(const llvm::MCSchedModel &SM)
+ : ProcResID2Mask(SM.getNumProcResourceKinds()) {
+ initialize(SM);
+ }
+
+ // Returns RS_BUFFER_AVAILABLE if buffered resources are not reserved, and if
+ // there are enough available slots in the buffers.
+ ResourceStateEvent canBeDispatched(llvm::ArrayRef<uint64_t> Buffers) const;
+
+ // Return the processor resource identifier associated to this Mask.
+ unsigned resolveResourceMask(uint64_t Mask) const {
+ return Resources.find(Mask)->second->getProcResourceID();
+ }
+
+ // Consume a slot in every buffered resource from array 'Buffers'. Resource
+ // units that are dispatch hazards (i.e. BufferSize=0) are marked as reserved.
+ void reserveBuffers(llvm::ArrayRef<uint64_t> Buffers);
+
+ // Release buffer entries previously allocated by method reserveBuffers.
+ void releaseBuffers(llvm::ArrayRef<uint64_t> Buffers);
+
+ void reserveResource(uint64_t ResourceID) {
+ ResourceState &Resource = *Resources[ResourceID];
+ assert(!Resource.isReserved());
+ Resource.setReserved();
+ }
+
+ void releaseResource(uint64_t ResourceID) {
+ ResourceState &Resource = *Resources[ResourceID];
+ Resource.clearReserved();
+ }
+
+ // Returns true if all resources are in-order, and there is at least one
+ // resource which is a dispatch hazard (BufferSize = 0).
+ bool mustIssueImmediately(const InstrDesc &Desc);
+
+ bool canBeIssued(const InstrDesc &Desc) const;
+
+ void issueInstruction(
+ const InstrDesc &Desc,
+ llvm::SmallVectorImpl<std::pair<ResourceRef, double>> &Pipes);
+
+ void cycleEvent(llvm::SmallVectorImpl<ResourceRef> &ResourcesFreed);
+
+#ifndef NDEBUG
+ void dump() const {
+ for (const std::pair<uint64_t, UniqueResourceState> &Resource : Resources)
+ Resource.second->dump();
+ }
+#endif
+};
+
+/// Class Scheduler is responsible for issuing instructions to pipeline
+/// resources. Internally, it delegates to a ResourceManager the management of
+/// processor resources.
+/// This class is also responsible for tracking the progress of instructions
+/// from the dispatch stage, until the write-back stage.
+///
+/// An instruction dispatched to the Scheduler is initially placed into either
+/// the 'WaitQueue' or the 'ReadyQueue' depending on the availability of the
+/// input operands. Instructions in the WaitQueue are ordered by instruction
+/// index. An instruction is moved from the WaitQueue to the ReadyQueue when
+/// register operands become available, and all memory dependencies are met.
+/// Instructions that are moved from the WaitQueue to the ReadyQueue transition
+/// from state 'IS_AVAILABLE' to state 'IS_READY'.
+///
+/// At the beginning of each cycle, the Scheduler checks if there are
+/// instructions in the WaitQueue that can be moved to the ReadyQueue. If the
+/// ReadyQueue is not empty, then older instructions from the queue are issued
+/// to the processor pipelines, and the underlying ResourceManager is updated
+/// accordingly. The ReadyQueue is ordered by instruction index to guarantee
+/// that the first instructions in the set are also the oldest.
+///
+/// An instruction is moved from the ReadyQueue to the IssuedQueue when it is
+/// issued to one or more pipelines. This event also causes an instruction
+/// state transition (i.e. from state IS_READY, to state IS_EXECUTING).
+/// An Instruction leaves the IssuedQueue when it reaches the write-back stage.
+class Scheduler : public HardwareUnit {
+ const llvm::MCSchedModel &SM;
+
+ // Hardware resources that are managed by this scheduler.
+ std::unique_ptr<ResourceManager> Resources;
+ std::unique_ptr<LSUnit> LSU;
+
+ using QueueEntryTy = std::pair<unsigned, Instruction *>;
+ std::map<unsigned, Instruction *> WaitQueue;
+ std::map<unsigned, Instruction *> ReadyQueue;
+ std::map<unsigned, Instruction *> IssuedQueue;
+
+ /// Issue an instruction without updating the ready queue.
+ void issueInstructionImpl(
+ InstRef &IR,
+ llvm::SmallVectorImpl<std::pair<ResourceRef, double>> &Pipes);
+
+public:
+ Scheduler(const llvm::MCSchedModel &Model, unsigned LoadQueueSize,
+ unsigned StoreQueueSize, bool AssumeNoAlias)
+ : SM(Model), Resources(llvm::make_unique<ResourceManager>(SM)),
+ LSU(llvm::make_unique<LSUnit>(LoadQueueSize, StoreQueueSize,
+ AssumeNoAlias)) {}
+
+ /// Check if the instruction in 'IR' can be dispatched.
+ ///
+ /// The DispatchStage is responsible for querying the Scheduler before
+ /// dispatching new instructions. This routine is used for performing such
+ /// a query. If the instruction 'IR' can be dispatched, then true is
+ /// returned, otherwise false is returned with Event set to the stall type.
+ bool canBeDispatched(const InstRef &IR,
+ HWStallEvent::GenericEventType &Event) const;
+
+ /// Returns true if there is availability for IR in the LSU.
+ bool isReady(const InstRef &IR) const { return LSU->isReady(IR); }
+
+ /// Issue an instruction. The Used container is populated with
+ /// the resource objects consumed on behalf of issuing this instruction.
+ void
+ issueInstruction(InstRef &IR,
+ llvm::SmallVectorImpl<std::pair<ResourceRef, double>> &Used);
+
+ /// This routine will attempt to issue an instruction immediately (for
+ /// zero-latency instructions).
+ ///
+ /// Returns true if the instruction is issued immediately. If this does not
+ /// occur, then the instruction will be added to the Scheduler's ReadyQueue.
+ bool issueImmediately(InstRef &IR);
+
+ /// Reserve one entry in each buffered resource.
+ void reserveBuffers(llvm::ArrayRef<uint64_t> Buffers) {
+ Resources->reserveBuffers(Buffers);
+ }
+
+ /// Release buffer entries previously allocated by method reserveBuffers.
+ void releaseBuffers(llvm::ArrayRef<uint64_t> Buffers) {
+ Resources->releaseBuffers(Buffers);
+ }
+
+ /// Update the resources managed by the scheduler.
+ /// This routine is to be called at the start of a new cycle, and is
+ /// responsible for updating scheduler resources. Resources are released
+ /// once they have been fully consumed.
+ void reclaimSimulatedResources(llvm::SmallVectorImpl<ResourceRef> &Freed);
+
+ /// Move instructions from the WaitQueue to the ReadyQueue if input operands
+ /// are all available.
+ void promoteToReadyQueue(llvm::SmallVectorImpl<InstRef> &Ready);
+
+ /// Update the pending queue and collect newly ready instructions in Ready.
+ void updatePendingQueue(llvm::SmallVectorImpl<InstRef> &Ready);
+
+ /// Update the issued queue.
+ void updateIssuedQueue(llvm::SmallVectorImpl<InstRef> &Executed);
+
+ /// Updates the Scheduler's resources to reflect that an instruction has just
+ /// been executed.
+ void onInstructionExecuted(const InstRef &IR);
+
+ /// Obtain the processor's resource identifier for the given
+ /// resource mask.
+ unsigned getResourceID(uint64_t Mask) {
+ return Resources->resolveResourceMask(Mask);
+ }
+
+ /// Reserve resources necessary to issue the instruction.
+ /// Returns true if the resources are ready and the LSU can
+ /// execute the given instruction immediately.
+ bool reserveResources(InstRef &IR);
+
+ /// Select the next instruction to issue from the ReadyQueue.
+ /// This method gives priority to older instructions.
+ InstRef select();
+
+#ifndef NDEBUG
+ // Dump the state of the scheduler's queues.
+ void dump() const;
+
+ // This routine performs a sanity check. This routine should only be called
+ // when we know that 'IR' is not in the scheduler's instruction queues.
+ void sanityCheck(const InstRef &IR) const {
+ const unsigned Idx = IR.getSourceIndex();
+ assert(WaitQueue.find(Idx) == WaitQueue.end());
+ assert(ReadyQueue.find(Idx) == ReadyQueue.end());
+ assert(IssuedQueue.find(Idx) == IssuedQueue.end());
+ }
+#endif // !NDEBUG
+};
+} // namespace mca
+
+#endif // LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
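The queue transitions described in the Scheduler comment above can be illustrated with a small standalone sketch (simplified types, not the actual mca classes): instructions move from a wait queue to a ready queue once their operands are available, and the oldest ready instruction is issued first because the queues are keyed by instruction index.

#include <cstdio>
#include <map>

struct FakeInstr { bool OperandsReady; };

int main() {
  std::map<unsigned, FakeInstr> WaitQueue, ReadyQueue, IssuedQueue;
  WaitQueue[0] = {true};
  WaitQueue[1] = {false};

  // WaitQueue -> ReadyQueue: promote instructions whose operands are ready.
  for (auto It = WaitQueue.begin(); It != WaitQueue.end();) {
    if (It->second.OperandsReady) {
      ReadyQueue[It->first] = It->second;
      It = WaitQueue.erase(It);
    } else {
      ++It;
    }
  }

  // ReadyQueue -> IssuedQueue: issue the oldest entry first; std::map
  // iteration order guarantees the smallest (oldest) index comes first.
  if (!ReadyQueue.empty()) {
    auto Oldest = ReadyQueue.begin();
    IssuedQueue[Oldest->first] = Oldest->second;
    ReadyQueue.erase(Oldest);
  }

  std::printf("wait=%zu ready=%zu issued=%zu\n", WaitQueue.size(),
              ReadyQueue.size(), IssuedQueue.size());
  return 0;
}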
diff --git a/contrib/llvm/tools/llvm-mca/SchedulerStatistics.cpp b/contrib/llvm/tools/llvm-mca/SchedulerStatistics.cpp
new file mode 100644
index 000000000000..5c6d22a71812
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/SchedulerStatistics.cpp
@@ -0,0 +1,94 @@
+//===--------------------- SchedulerStatistics.cpp --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the SchedulerStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "SchedulerStatistics.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
+ if (Event.Type == HWInstructionEvent::Issued)
+ ++NumIssued;
+}
+
+void SchedulerStatistics::onReservedBuffers(ArrayRef<unsigned> Buffers) {
+ for (const unsigned Buffer : Buffers) {
+ if (BufferedResources.find(Buffer) != BufferedResources.end()) {
+ BufferUsage &BU = BufferedResources[Buffer];
+ BU.SlotsInUse++;
+ BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+ continue;
+ }
+
+ BufferedResources.insert(
+ std::pair<unsigned, BufferUsage>(Buffer, {1U, 1U}));
+ }
+}
+
+void SchedulerStatistics::onReleasedBuffers(ArrayRef<unsigned> Buffers) {
+ for (const unsigned Buffer : Buffers) {
+ assert(BufferedResources.find(Buffer) != BufferedResources.end() &&
+ "Buffered resource not in map?");
+ BufferUsage &BU = BufferedResources[Buffer];
+ BU.SlotsInUse--;
+ }
+}
+
+void SchedulerStatistics::printSchedulerStatistics(
+ llvm::raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ TempStream << "\n\nSchedulers - number of cycles where we saw N instructions "
+ "issued:\n";
+ TempStream << "[# issued], [# cycles]\n";
+ for (const std::pair<unsigned, unsigned> &Entry : IssuedPerCycle) {
+ TempStream << " " << Entry.first << ", " << Entry.second << " ("
+ << format("%.1f", ((double)Entry.second / NumCycles) * 100)
+ << "%)\n";
+ }
+
+ TempStream.flush();
+ OS << Buffer;
+}
+
+void SchedulerStatistics::printSchedulerUsage(raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ TempStream << "\n\nScheduler's queue usage:\n";
+ // Early exit if no buffered resources were consumed.
+ if (BufferedResources.empty()) {
+ TempStream << "No scheduler resources used.\n";
+ TempStream.flush();
+ OS << Buffer;
+ return;
+ }
+
+ for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+ if (ProcResource.BufferSize <= 0)
+ continue;
+
+ const auto It = BufferedResources.find(I);
+ unsigned MaxUsedSlots =
+ It == BufferedResources.end() ? 0 : It->second.MaxUsedSlots;
+ TempStream << ProcResource.Name << ", " << MaxUsedSlots << '/'
+ << ProcResource.BufferSize << '\n';
+ }
+
+ TempStream.flush();
+ OS << Buffer;
+}
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/SchedulerStatistics.h b/contrib/llvm/tools/llvm-mca/SchedulerStatistics.h
new file mode 100644
index 000000000000..7383c54a1615
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/SchedulerStatistics.h
@@ -0,0 +1,91 @@
+//===--------------------- SchedulerStatistics.h ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines class SchedulerStatistics. Class SchedulerStatistics is a
+/// View that listens to instruction issue events in order to print general
+/// statistics related to the hardware schedulers.
+///
+/// Example:
+/// ========
+///
+/// Schedulers - number of cycles where we saw N instructions issued:
+/// [# issued], [# cycles]
+/// 0, 7 (5.4%)
+/// 1, 4 (3.1%)
+/// 2, 8 (6.2%)
+///
+/// Scheduler's queue usage:
+/// JALU01, 0/20
+/// JFPU01, 18/18
+/// JLSAGU, 0/12
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H
+
+#include "View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+class SchedulerStatistics : public View {
+ const llvm::MCSchedModel &SM;
+
+ using Histogram = std::map<unsigned, unsigned>;
+ Histogram IssuedPerCycle;
+
+ unsigned NumIssued;
+ unsigned NumCycles;
+
+ // Tracks the usage of a scheduler's queue.
+ struct BufferUsage {
+ unsigned SlotsInUse;
+ unsigned MaxUsedSlots;
+ };
+
+ std::map<unsigned, BufferUsage> BufferedResources;
+
+ void updateHistograms() {
+ IssuedPerCycle[NumIssued]++;
+ NumIssued = 0;
+ }
+
+ void printSchedulerStatistics(llvm::raw_ostream &OS) const;
+ void printSchedulerUsage(llvm::raw_ostream &OS) const;
+
+public:
+ SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+ : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0) {}
+
+ void onEvent(const HWInstructionEvent &Event) override;
+
+ void onCycleBegin() override { NumCycles++; }
+
+ void onCycleEnd() override { updateHistograms(); }
+
+ // Increases the number of used scheduler queue slots of every buffered
+ // resource in the Buffers set.
+ void onReservedBuffers(llvm::ArrayRef<unsigned> Buffers) override;
+
+ // Decreases by one the number of used scheduler queue slots of every
+ // buffered resource in the Buffers set.
+ void onReleasedBuffers(llvm::ArrayRef<unsigned> Buffers) override;
+
+ void printView(llvm::raw_ostream &OS) const override {
+ printSchedulerStatistics(OS);
+ printSchedulerUsage(OS);
+ }
+};
+} // namespace mca
+
+#endif
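A standalone sketch of the buffer-usage bookkeeping performed by onReservedBuffers and onReleasedBuffers (simplified; the real view keys the map by processor resource ID): reserving a slot increments SlotsInUse and updates the high-water mark, while releasing decrements SlotsInUse.

#include <algorithm>
#include <cassert>
#include <map>

struct BufferUsage { unsigned SlotsInUse; unsigned MaxUsedSlots; };

int main() {
  std::map<unsigned, BufferUsage> BufferedResources;
  auto Reserve = [&](unsigned Buffer) {
    BufferUsage &BU = BufferedResources[Buffer]; // value-initialized to {0, 0}
    ++BU.SlotsInUse;
    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
  };
  auto Release = [&](unsigned Buffer) { --BufferedResources[Buffer].SlotsInUse; };

  Reserve(1); Reserve(1); Release(1); Reserve(1);
  assert(BufferedResources[1].SlotsInUse == 2);
  assert(BufferedResources[1].MaxUsedSlots == 2);
  return 0;
}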
diff --git a/contrib/llvm/tools/llvm-mca/SourceMgr.h b/contrib/llvm/tools/llvm-mca/SourceMgr.h
new file mode 100644
index 000000000000..15a85a69569f
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/SourceMgr.h
@@ -0,0 +1,63 @@
+//===--------------------- SourceMgr.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements class SourceMgr. Class SourceMgr abstracts the input
+/// code sequence (a sequence of MCInst), and assigns unique identifiers to
+/// every instruction in the sequence.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+#define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+
+#include "llvm/MC/MCInst.h"
+#include <vector>
+
+namespace mca {
+
+typedef std::pair<unsigned, const llvm::MCInst *> SourceRef;
+
+class SourceMgr {
+ using InstVec = std::vector<std::unique_ptr<const llvm::MCInst>>;
+ const InstVec &Sequence;
+ unsigned Current;
+ unsigned Iterations;
+ static const unsigned DefaultIterations = 100;
+
+public:
+ SourceMgr(const InstVec &MCInstSequence, unsigned NumIterations)
+ : Sequence(MCInstSequence), Current(0),
+ Iterations(NumIterations ? NumIterations : DefaultIterations) {}
+
+ unsigned getCurrentIteration() const { return Current / Sequence.size(); }
+ unsigned getNumIterations() const { return Iterations; }
+ unsigned size() const { return Sequence.size(); }
+ const InstVec &getSequence() const { return Sequence; }
+
+ bool hasNext() const { return Current < (Iterations * size()); }
+ void updateNext() { Current++; }
+
+ const SourceRef peekNext() const {
+ unsigned Index = getCurrentInstructionIndex();
+ return SourceRef(Current, Sequence[Index].get());
+ }
+
+ unsigned getCurrentInstructionIndex() const {
+ return Current % Sequence.size();
+ }
+
+ const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
+ return *Sequence[Index % size()];
+ }
+
+ bool isEmpty() const { return size() == 0; }
+};
+} // namespace mca
+
+#endif
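The index arithmetic SourceMgr relies on can be checked with a tiny standalone sketch (made-up sizes): the running counter encodes both the iteration number and the position within the input sequence.

#include <cassert>

int main() {
  const unsigned SequenceSize = 3, Iterations = 2;
  unsigned Current = 4;                         // fifth instruction overall
  assert(Current / SequenceSize == 1);          // second iteration
  assert(Current % SequenceSize == 1);          // second instruction in block
  assert(Current < Iterations * SequenceSize);  // hasNext() is still true
  return 0;
}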
diff --git a/contrib/llvm/tools/llvm-mca/Stage.cpp b/contrib/llvm/tools/llvm-mca/Stage.cpp
new file mode 100644
index 000000000000..7ead940e63c1
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Stage.cpp
@@ -0,0 +1,27 @@
+//===---------------------- Stage.cpp ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a stage.
+/// A chain of stages composes an instruction pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Stage.h"
+
+namespace mca {
+
+// Pin the vtable here in the implementation file.
+Stage::Stage() {}
+
+void Stage::addListener(HWEventListener *Listener) {
+ Listeners.insert(Listener);
+}
+
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/Stage.h b/contrib/llvm/tools/llvm-mca/Stage.h
new file mode 100644
index 000000000000..9dbdcd89a33b
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Stage.h
@@ -0,0 +1,76 @@
+//===---------------------- Stage.h -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a stage.
+/// A chain of stages composes an instruction pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_STAGE_H
+#define LLVM_TOOLS_LLVM_MCA_STAGE_H
+
+#include "HWEventListener.h"
+#include <set>
+
+namespace mca {
+
+class InstRef;
+
+class Stage {
+ Stage(const Stage &Other) = delete;
+ Stage &operator=(const Stage &Other) = delete;
+ std::set<HWEventListener *> Listeners;
+
+protected:
+ const std::set<HWEventListener *> &getListeners() const { return Listeners; }
+
+public:
+ Stage();
+ virtual ~Stage() = default;
+
+ /// Called prior to preExecute to ensure that the stage has items that it
+ /// is to process. For example, a FetchStage might have more instructions
+ /// that need to be processed, or an RCU might have items that have yet to
+ /// retire.
+ virtual bool hasWorkToComplete() const = 0;
+
+ /// Called once at the start of each cycle. This can be used as a setup
+ /// phase to prepare for the executions during the cycle.
+ virtual void cycleStart() {}
+
+ /// Called once at the end of each cycle.
+ virtual void cycleEnd() {}
+
+ /// Called prior to executing the list of stages.
+ /// This can be called multiple times per cycle.
+ virtual void preExecute() {}
+
+ /// Called as a cleanup and finalization phase after each execution.
+ /// This will only be called if all stages return a success from their
+ /// execute callback. This can be called multiple times per cycle.
+ virtual void postExecute() {}
+
+ /// The primary action that this stage performs.
+ /// Returning false prevents successor stages from having their 'execute'
+ /// routine called. This can be called multiple times during a single cycle.
+ virtual bool execute(InstRef &IR) = 0;
+
+ /// Add a listener to receive callbacks during the execution of this stage.
+ void addListener(HWEventListener *Listener);
+
+ /// Notify listeners of a particular hardware event.
+ template <typename EventT> void notifyEvent(const EventT &Event) {
+ for (HWEventListener *Listener : Listeners)
+ Listener->onEvent(Event);
+ }
+};
+
+} // namespace mca
+#endif // LLVM_TOOLS_LLVM_MCA_STAGE_H
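A standalone sketch of how a chain of stages is typically driven (a simplified stand-in for the mca::Stage interface, not the real Pipeline code): each cycle runs cycleStart, then execute for every stage, then cycleEnd, until no stage reports work to complete.

#include <cstdio>
#include <vector>

struct SimpleStage {
  unsigned Pending; // pretend work items
  bool hasWorkToComplete() const { return Pending != 0; }
  void cycleStart() {}
  bool execute() { if (Pending) --Pending; return true; }
  void cycleEnd() {}
};

int main() {
  std::vector<SimpleStage> Pipeline = {{3}, {2}};
  unsigned Cycle = 0;
  auto AnyWork = [&] {
    for (const SimpleStage &S : Pipeline)
      if (S.hasWorkToComplete()) return true;
    return false;
  };
  while (AnyWork()) {
    for (SimpleStage &S : Pipeline) S.cycleStart();
    for (SimpleStage &S : Pipeline)
      if (!S.execute()) break; // a failing stage prevents successors from running
    for (SimpleStage &S : Pipeline) S.cycleEnd();
    ++Cycle;
  }
  std::printf("finished after %u cycles\n", Cycle);
  return 0;
}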
diff --git a/contrib/llvm/tools/llvm-mca/SummaryView.cpp b/contrib/llvm/tools/llvm-mca/SummaryView.cpp
new file mode 100644
index 000000000000..01399055c4fd
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/SummaryView.cpp
@@ -0,0 +1,85 @@
+//===--------------------- SummaryView.cpp -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the functionalities used by the SummaryView to print
+/// the report information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "SummaryView.h"
+#include "Support.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Format.h"
+
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+using namespace llvm;
+
+SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+ unsigned Width)
+ : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
+ NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
+ ProcResourceMasks(Model.getNumProcResourceKinds(), 0) {
+ computeProcResourceMasks(SM, ProcResourceMasks);
+}
+
+void SummaryView::onEvent(const HWInstructionEvent &Event) {
+ // We are only interested in the "instruction dispatched" events generated by
+ // the dispatch stage for instructions that are part of iteration #0.
+ if (Event.Type != HWInstructionEvent::Dispatched)
+ return;
+
+ if (Event.IR.getSourceIndex() >= Source.size())
+ return;
+
+ // Update the cumulative number of resource cycles based on the processor
+ // resource usage information available from the instruction descriptor. We
+ // need to compute the cumulative number of resource cycles for every
+ // processor resource which is consumed by an instruction of the block.
+ const Instruction &Inst = *Event.IR.getInstruction();
+ const InstrDesc &Desc = Inst.getDesc();
+ NumMicroOps += Desc.NumMicroOps;
+ for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
+ if (RU.second.size()) {
+ const auto It = find(ProcResourceMasks, RU.first);
+ assert(It != ProcResourceMasks.end() &&
+ "Invalid processor resource mask!");
+ ProcResourceUsage[std::distance(ProcResourceMasks.begin(), It)] +=
+ RU.second.size();
+ }
+ }
+}
+
+void SummaryView::printView(raw_ostream &OS) const {
+ unsigned Iterations = Source.getNumIterations();
+ unsigned Instructions = Source.size();
+ unsigned TotalInstructions = Instructions * Iterations;
+ double IPC = (double)TotalInstructions / TotalCycles;
+ double BlockRThroughput = computeBlockRThroughput(
+ SM, DispatchWidth, NumMicroOps, ProcResourceUsage);
+
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ TempStream << "Iterations: " << Iterations;
+ TempStream << "\nInstructions: " << TotalInstructions;
+ TempStream << "\nTotal Cycles: " << TotalCycles;
+ TempStream << "\nDispatch Width: " << DispatchWidth;
+ TempStream << "\nIPC: " << format("%.2f", IPC);
+
+ // Round the block reciprocal throughput to the nearest tenth.
+ TempStream << "\nBlock RThroughput: "
+ << format("%.1f", floor((BlockRThroughput * 10) + 0.5) / 10)
+ << '\n';
+ TempStream.flush();
+ OS << Buffer;
+}
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/SummaryView.h b/contrib/llvm/tools/llvm-mca/SummaryView.h
new file mode 100644
index 000000000000..b799ce3aa747
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/SummaryView.h
@@ -0,0 +1,76 @@
+//===--------------------- SummaryView.h ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the summary view.
+///
+/// The goal of the summary view is to give a very quick overview of the
+/// performance throughput. Below is an example of summary view:
+///
+///
+/// Iterations: 300
+/// Instructions: 900
+/// Total Cycles: 610
+/// Dispatch Width: 2
+/// IPC: 1.48
+/// Block RThroughput: 2.0
+///
+/// The summary view collects a few performance numbers. The two main
+/// performance indicators are 'Total Cycles' and IPC (Instructions Per Cycle).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
+
+#include "SourceMgr.h"
+#include "View.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+/// A view that collects and prints a few performance numbers.
+class SummaryView : public View {
+ const llvm::MCSchedModel &SM;
+ const SourceMgr &Source;
+ const unsigned DispatchWidth;
+ unsigned TotalCycles;
+ // The total number of micro opcodes contributed by a block of instructions.
+ unsigned NumMicroOps;
+ // For each processor resource, this vector stores the cumulative number of
+ // resource cycles consumed by the analyzed code block.
+ llvm::SmallVector<unsigned, 8> ProcResourceUsage;
+
+ // Each processor resource is associated with a so-called processor resource
+ // mask. This vector makes it possible to correlate processor resource IDs
+ // with processor resource masks. There is exactly one element for each
+ // processor resource declared by the scheduling model.
+ llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
+
+ // Compute the reciprocal throughput for the analyzed code block.
+ // The reciprocal block throughput is computed as the MAX between:
+ // - NumMicroOps / DispatchWidth
+ // - Total Resource Cycles / #Units (for every resource consumed).
+ double getBlockRThroughput() const;
+
+public:
+ SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+ unsigned Width);
+
+ void onCycleEnd() override { ++TotalCycles; }
+
+ void onEvent(const HWInstructionEvent &Event) override;
+
+ void printView(llvm::raw_ostream &OS) const override;
+};
+} // namespace mca
+
+#endif
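The IPC figure in the example above follows directly from the totals; a small standalone check with the same numbers (300 iterations of a 3-instruction block retired in 610 cycles):

#include <cstdio>

int main() {
  const unsigned Iterations = 300, Instructions = 3, TotalCycles = 610;
  const unsigned TotalInstructions = Instructions * Iterations;        // 900
  const double IPC = static_cast<double>(TotalInstructions) / TotalCycles;
  std::printf("IPC: %.2f\n", IPC); // prints 1.48
  return 0;
}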
diff --git a/contrib/llvm/tools/llvm-mca/Support.cpp b/contrib/llvm/tools/llvm-mca/Support.cpp
new file mode 100644
index 000000000000..8f6b8a91f38f
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Support.cpp
@@ -0,0 +1,79 @@
+//===--------------------- Support.cpp --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a few helper functions used by various pipeline
+/// components.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Support.h"
+#include "llvm/MC/MCSchedule.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void computeProcResourceMasks(const MCSchedModel &SM,
+ SmallVectorImpl<uint64_t> &Masks) {
+ unsigned ProcResourceID = 0;
+
+ // Create a unique bitmask for every processor resource unit.
+ // Skip resource at index 0, since it always references 'InvalidUnit'.
+ Masks.resize(SM.getNumProcResourceKinds());
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+ if (Desc.SubUnitsIdxBegin)
+ continue;
+ Masks[I] = 1ULL << ProcResourceID;
+ ProcResourceID++;
+ }
+
+ // Create a unique bitmask for every processor resource group.
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+ if (!Desc.SubUnitsIdxBegin)
+ continue;
+ Masks[I] = 1ULL << ProcResourceID;
+ for (unsigned U = 0; U < Desc.NumUnits; ++U) {
+ uint64_t OtherMask = Masks[Desc.SubUnitsIdxBegin[U]];
+ Masks[I] |= OtherMask;
+ }
+ ProcResourceID++;
+ }
+}
+
+double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
+ unsigned NumMicroOps,
+ ArrayRef<unsigned> ProcResourceUsage) {
+ // The block throughput is bounded from above by the hardware dispatch
+ // throughput. That is because the DispatchWidth is an upper bound on the
+ // number of opcodes that can be part of a single dispatch group.
+ double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
+
+ // The block throughput is also limited by the amount of hardware parallelism.
+ // The number of available resource units affects the resource pressure
+ // distribution, as well as how many blocks can be executed every cycle.
+ for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ unsigned ResourceCycles = ProcResourceUsage[I];
+ if (!ResourceCycles)
+ continue;
+
+ const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
+ double Throughput = static_cast<double>(ResourceCycles) / MCDesc.NumUnits;
+ Max = std::max(Max, Throughput);
+ }
+
+ // The block reciprocal throughput is computed as the MAX of:
+ // - (NumMicroOps / DispatchWidth)
+ // - (ResourceCycles / NumUnits) for every consumed processor resource.
+ return Max;
+}
+
+} // namespace mca
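A standalone numeric sketch of the bound computed by computeBlockRThroughput, using made-up inputs (9 micro opcodes, 2-wide dispatch, and one resource with 2 units used for 6 cycles): the dispatch bound of 4.5 dominates the resource bound of 3.0.

#include <algorithm>
#include <cstdio>

int main() {
  const unsigned NumMicroOps = 9, DispatchWidth = 2;
  const unsigned ResourceCycles = 6, NumUnits = 2;

  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;        // 4.5
  Max = std::max(Max, static_cast<double>(ResourceCycles) / NumUnits);  // 3.0
  std::printf("Block RThroughput = %.1f\n", Max); // prints 4.5
  return 0;
}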
diff --git a/contrib/llvm/tools/llvm-mca/Support.h b/contrib/llvm/tools/llvm-mca/Support.h
new file mode 100644
index 000000000000..fd8d8b5a23b3
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/Support.h
@@ -0,0 +1,58 @@
+//===--------------------- Support.h ----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Helper functions used by various pipeline components.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H
+#define LLVM_TOOLS_LLVM_MCA_SUPPORT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSchedule.h"
+
+namespace mca {
+
+/// Populates vector Masks with processor resource masks.
+///
+/// The number of bits set in a mask depends on the processor resource type.
+/// Each processor resource mask has at least one bit set. For groups, the
+/// number of bits set in the mask is equal to the cardinality of the group plus
+/// one. Excluding the most significant bit, the remaining bits in the mask
+/// identify processor resources that are part of the group.
+///
+/// Example:
+///
+/// ResourceA -- Mask: 0b001
+/// ResourceB -- Mask: 0b010
+/// ResourceAB -- Mask: 0b100 U (ResourceA::Mask | ResourceB::Mask) == 0b111
+///
+/// ResourceAB is a processor resource group containing ResourceA and ResourceB.
+/// Each resource mask uniquely identifies a resource; both ResourceA and
+/// ResourceB only have one bit set.
+/// ResourceAB is a group; excluding the most significant bit in the mask, the
+/// remaining bits identify the composition of the group.
+///
+/// Resource masks are used by the ResourceManager to solve set membership
+/// problems with simple bit manipulation operations.
+void computeProcResourceMasks(const llvm::MCSchedModel &SM,
+ llvm::SmallVectorImpl<uint64_t> &Masks);
+
+/// Compute the reciprocal block throughput from a set of processor resource
+/// cycles. The reciprocal block throughput is computed as the MAX between:
+/// - NumMicroOps / DispatchWidth
+/// - ProcResourceCycles / #ProcResourceUnits (for every consumed resource).
+double computeBlockRThroughput(const llvm::MCSchedModel &SM,
+ unsigned DispatchWidth, unsigned NumMicroOps,
+ llvm::ArrayRef<unsigned> ProcResourceUsage);
+} // namespace mca
+
+#endif
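Using the mask layout from the example above, the set-membership test reduces to bit manipulation; a standalone sketch: a unit belongs to a group exactly when its mask is fully contained in the group's mask.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t MaskA  = 0b001;                  // ResourceA
  const uint64_t MaskB  = 0b010;                  // ResourceB
  const uint64_t MaskAB = 0b100 | MaskA | MaskB;  // group ResourceAB == 0b111

  // Membership: every bit of the unit's mask appears in the group's mask.
  assert((MaskAB & MaskA) == MaskA);  // ResourceA is part of ResourceAB.
  assert((MaskAB & MaskB) == MaskB);  // ResourceB is part of ResourceAB.
  assert((MaskA & MaskB) == 0);       // A and B are distinct units.
  return 0;
}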
diff --git a/contrib/llvm/tools/llvm-mca/TimelineView.cpp b/contrib/llvm/tools/llvm-mca/TimelineView.cpp
new file mode 100644
index 000000000000..6e75cac0d432
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/TimelineView.cpp
@@ -0,0 +1,240 @@
+//===--------------------- TimelineView.cpp ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the TimelineView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "TimelineView.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void TimelineView::initialize(unsigned MaxIterations) {
+ unsigned NumInstructions =
+ AsmSequence.getNumIterations() * AsmSequence.size();
+ if (!MaxIterations)
+ MaxIterations = DEFAULT_ITERATIONS;
+ unsigned NumEntries =
+ std::min(NumInstructions, MaxIterations * AsmSequence.size());
+ Timeline.resize(NumEntries);
+ TimelineViewEntry NullTVEntry = {0, 0, 0, 0, 0};
+ std::fill(Timeline.begin(), Timeline.end(), NullTVEntry);
+
+ WaitTime.resize(AsmSequence.size());
+ WaitTimeEntry NullWTEntry = {0, 0, 0, 0};
+ std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry);
+}
+
+void TimelineView::onEvent(const HWInstructionEvent &Event) {
+ const unsigned Index = Event.IR.getSourceIndex();
+ if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+ return;
+ switch (Event.Type) {
+ case HWInstructionEvent::Retired: {
+ TimelineViewEntry &TVEntry = Timeline[Index];
+ TVEntry.CycleRetired = CurrentCycle;
+
+ // Update the WaitTime entry which corresponds to this Index.
+ WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()];
+ WTEntry.Executions++;
+ WTEntry.CyclesSpentInSchedulerQueue +=
+ TVEntry.CycleIssued - TVEntry.CycleDispatched;
+ assert(TVEntry.CycleDispatched <= TVEntry.CycleReady);
+ WTEntry.CyclesSpentInSQWhileReady +=
+ TVEntry.CycleIssued - TVEntry.CycleReady;
+ WTEntry.CyclesSpentAfterWBAndBeforeRetire +=
+ (TVEntry.CycleRetired - 1) - TVEntry.CycleExecuted;
+ break;
+ }
+ case HWInstructionEvent::Ready:
+ Timeline[Index].CycleReady = CurrentCycle;
+ break;
+ case HWInstructionEvent::Issued:
+ Timeline[Index].CycleIssued = CurrentCycle;
+ break;
+ case HWInstructionEvent::Executed:
+ Timeline[Index].CycleExecuted = CurrentCycle;
+ break;
+ case HWInstructionEvent::Dispatched:
+ Timeline[Index].CycleDispatched = CurrentCycle;
+ break;
+ default:
+ return;
+ }
+ LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS,
+ const WaitTimeEntry &Entry,
+ unsigned SourceIndex) const {
+ OS << SourceIndex << '.';
+ OS.PadToColumn(7);
+
+ if (Entry.Executions == 0) {
+ OS << "- - - - ";
+ } else {
+ double AverageTime1, AverageTime2, AverageTime3;
+ unsigned Executions = Entry.Executions;
+ AverageTime1 = (double)Entry.CyclesSpentInSchedulerQueue / Executions;
+ AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / Executions;
+ AverageTime3 = (double)Entry.CyclesSpentAfterWBAndBeforeRetire / Executions;
+
+ OS << Executions;
+ OS.PadToColumn(13);
+
+ OS << format("%.1f", floor((AverageTime1 * 10) + 0.5) / 10);
+ OS.PadToColumn(20);
+ OS << format("%.1f", floor((AverageTime2 * 10) + 0.5) / 10);
+ OS.PadToColumn(27);
+ OS << format("%.1f", floor((AverageTime3 * 10) + 0.5) / 10);
+ OS.PadToColumn(34);
+ }
+}
+
+void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
+ if (WaitTime.empty())
+ return;
+
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+ formatted_raw_ostream FOS(TempStream);
+
+ FOS << "\n\nAverage Wait times (based on the timeline view):\n"
+ << "[0]: Executions\n"
+ << "[1]: Average time spent waiting in a scheduler's queue\n"
+ << "[2]: Average time spent waiting in a scheduler's queue while ready\n"
+ << "[3]: Average time elapsed from WB until retire stage\n\n";
+ FOS << " [0] [1] [2] [3]\n";
+
+ // Use a different string stream for the instruction.
+ std::string Instruction;
+ raw_string_ostream InstrStream(Instruction);
+
+ for (unsigned I = 0, E = WaitTime.size(); I < E; ++I) {
+ printWaitTimeEntry(FOS, WaitTime[I], I);
+ // Append the instruction info at the end of the line.
+ const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
+
+ MCIP.printInst(&Inst, InstrStream, "", STI);
+ InstrStream.flush();
+
+ // Consume any tabs or spaces at the beginning of the string.
+ StringRef Str(Instruction);
+ Str = Str.ltrim();
+ FOS << " " << Str << '\n';
+ FOS.flush();
+ Instruction = "";
+
+ OS << Buffer;
+ Buffer = "";
+ }
+}
+
+void TimelineView::printTimelineViewEntry(formatted_raw_ostream &OS,
+ const TimelineViewEntry &Entry,
+ unsigned Iteration,
+ unsigned SourceIndex) const {
+ if (Iteration == 0 && SourceIndex == 0)
+ OS << '\n';
+ OS << '[' << Iteration << ',' << SourceIndex << ']';
+ OS.PadToColumn(10);
+ for (unsigned I = 0, E = Entry.CycleDispatched; I < E; ++I)
+ OS << ((I % 5 == 0) ? '.' : ' ');
+ OS << TimelineView::DisplayChar::Dispatched;
+ if (Entry.CycleDispatched != Entry.CycleExecuted) {
+ // Zero latency instructions have the same value for CycleDispatched,
+ // CycleIssued and CycleExecuted.
+ for (unsigned I = Entry.CycleDispatched + 1, E = Entry.CycleIssued; I < E;
+ ++I)
+ OS << TimelineView::DisplayChar::Waiting;
+ if (Entry.CycleIssued == Entry.CycleExecuted)
+ OS << TimelineView::DisplayChar::Executed;
+ else {
+ if (Entry.CycleDispatched != Entry.CycleIssued)
+ OS << TimelineView::DisplayChar::Executing;
+ for (unsigned I = Entry.CycleIssued + 1, E = Entry.CycleExecuted; I < E;
+ ++I)
+ OS << TimelineView::DisplayChar::Executing;
+ OS << TimelineView::DisplayChar::Executed;
+ }
+ }
+
+ for (unsigned I = Entry.CycleExecuted + 1, E = Entry.CycleRetired; I < E; ++I)
+ OS << TimelineView::DisplayChar::RetireLag;
+ OS << TimelineView::DisplayChar::Retired;
+
+ // Skip other columns.
+ for (unsigned I = Entry.CycleRetired + 1, E = LastCycle; I <= E; ++I)
+ OS << ((I % 5 == 0 || I == LastCycle) ? '.' : ' ');
+}
+
+static void printTimelineHeader(formatted_raw_ostream &OS, unsigned Cycles) {
+ OS << "\n\nTimeline view:\n";
+ if (Cycles >= 10) {
+ OS.PadToColumn(10);
+ for (unsigned I = 0; I <= Cycles; ++I) {
+ if (((I / 10) & 1) == 0)
+ OS << ' ';
+ else
+ OS << I % 10;
+ }
+ OS << '\n';
+ }
+
+ OS << "Index";
+ OS.PadToColumn(10);
+ for (unsigned I = 0; I <= Cycles; ++I) {
+ if (((I / 10) & 1) == 0)
+ OS << I % 10;
+ else
+ OS << ' ';
+ }
+ OS << '\n';
+}
+
+void TimelineView::printTimeline(raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream StringStream(Buffer);
+ formatted_raw_ostream FOS(StringStream);
+
+ printTimelineHeader(FOS, LastCycle);
+ FOS.flush();
+ OS << Buffer;
+
+ // Use a different string stream for the instruction.
+ std::string Instruction;
+ raw_string_ostream InstrStream(Instruction);
+
+ for (unsigned I = 0, E = Timeline.size(); I < E; ++I) {
+ Buffer = "";
+ const TimelineViewEntry &Entry = Timeline[I];
+ if (Entry.CycleRetired == 0)
+ return;
+
+ unsigned Iteration = I / AsmSequence.size();
+ unsigned SourceIndex = I % AsmSequence.size();
+ printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
+ // Append the instruction info at the end of the line.
+ const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
+ MCIP.printInst(&Inst, InstrStream, "", STI);
+ InstrStream.flush();
+
+ // Consume any tabs or spaces at the beginning of the string.
+ StringRef Str(Instruction);
+ Str = Str.ltrim();
+ FOS << " " << Str << '\n';
+ FOS.flush();
+ Instruction = "";
+ OS << Buffer;
+ }
+}
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/TimelineView.h b/contrib/llvm/tools/llvm-mca/TimelineView.h
new file mode 100644
index 000000000000..e53c23ec1cc2
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/TimelineView.h
@@ -0,0 +1,189 @@
+//===--------------------- TimelineView.h -----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a timeline view for the llvm-mca tool.
+///
+/// Class TimelineView observes events generated by the pipeline. For every
+/// instruction executed by the pipeline, it stores information related to
+/// state transition. It then plots that information in the form of a table
+/// as reported by the example below:
+///
+/// Timeline view:
+/// 0123456
+/// Index 0123456789
+///
+/// [0,0] DeER . . .. vmovshdup %xmm0, %xmm1
+/// [0,1] DeER . . .. vpermilpd $1, %xmm0, %xmm2
+/// [0,2] .DeER. . .. vpermilps $231, %xmm0, %xmm5
+/// [0,3] .DeeeER . .. vaddss %xmm1, %xmm0, %xmm3
+/// [0,4] . D==eeeER. .. vaddss %xmm3, %xmm2, %xmm4
+/// [0,5] . D=====eeeER .. vaddss %xmm4, %xmm5, %xmm6
+///
+/// [1,0] . DeE------R .. vmovshdup %xmm0, %xmm1
+/// [1,1] . DeE------R .. vpermilpd $1, %xmm0, %xmm2
+/// [1,2] . DeE-----R .. vpermilps $231, %xmm0, %xmm5
+/// [1,3] . D=eeeE--R .. vaddss %xmm1, %xmm0, %xmm3
+/// [1,4] . D===eeeER .. vaddss %xmm3, %xmm2, %xmm4
+/// [1,5] . D======eeeER vaddss %xmm4, %xmm5, %xmm6
+///
+/// There is an entry for every instruction in the input assembly sequence.
+/// The first field is a pair of numbers obtained from the instruction index.
+/// The first element of the pair is the iteration index, while the second
+/// element of the pair is a sequence number (i.e. a position in the assembly
+/// sequence).
+/// The second field of the table is the actual timeline information; each
+/// column is the information related to a specific cycle of execution.
+/// The timeline of an instruction is described by a sequence of character
+/// where each character represents the instruction state at a specific cycle.
+///
+/// Possible instruction states are:
+/// D: Instruction Dispatched
+/// e: Instruction Executing
+/// E: Instruction Executed (write-back stage)
+/// R: Instruction retired
+/// =: Instruction waiting in the Scheduler's queue
+/// -: Instruction executed, waiting to retire in order.
+///
+/// dots ('.') and empty spaces are cycles where the instruction is not
+/// in-flight.
+///
+/// The last column is the assembly instruction associated to the entry.
+///
+/// Based on the timeline view information from the example, instruction 0
+/// at iteration 0 was dispatched at cycle 0, and was retired at cycle 3.
+/// Instruction [0,1] was also dispatched at cycle 0, and it retired at
+/// the same cycle than instruction [0,0].
+/// Instruction [0,4] has been dispatched at cycle 2. However, it had to
+/// wait for two cycles before being issued. That is because operands
+/// became ready only at cycle 5.
+///
+/// This view helps to further understand bottlenecks and the impact of
+/// resource pressure on the code.
+///
+/// To better understand why instructions had to wait for multiple cycles in
+/// the scheduler's queue, class TimelineView also reports extra timing info
+/// in another table named "Average Wait times" (see example below).
+///
+///
+/// Average Wait times (based on the timeline view):
+/// [0]: Executions
+/// [1]: Average time spent waiting in a scheduler's queue
+/// [2]: Average time spent waiting in a scheduler's queue while ready
+/// [3]: Average time elapsed from WB until retire stage
+///
+/// [0] [1] [2] [3]
+/// 0. 2 1.0 1.0 3.0 vmovshdup %xmm0, %xmm1
+/// 1. 2 1.0 1.0 3.0 vpermilpd $1, %xmm0, %xmm2
+/// 2. 2 1.0 1.0 2.5 vpermilps $231, %xmm0, %xmm5
+/// 3. 2 1.5 0.5 1.0 vaddss %xmm1, %xmm0, %xmm3
+/// 4. 2 3.5 0.0 0.0 vaddss %xmm3, %xmm2, %xmm4
+/// 5. 2 6.5 0.0 0.0 vaddss %xmm4, %xmm5, %xmm6
+///
+/// By comparing column [2] with column [1], we get an idea about how many
+/// cycles were spent in the scheduler's queue due to data dependencies.
+///
+/// In this example, instruction 5 spent an average of ~6 cycles in the
+/// scheduler's queue. As soon as operands became ready, the instruction
+/// was immediately issued to the pipeline(s).
+/// That is expected because instruction 5 cannot transition to the "ready"
+/// state until %xmm4 is written by instruction 4.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+
+#include "SourceMgr.h"
+#include "View.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace mca {
+
+/// This class listens to instruction state transition events
+/// in order to construct timeline information.
+///
+/// For every instruction executed by the Pipeline, this class constructs
+/// a TimelineViewEntry object. TimelineViewEntry objects are then used
+/// to print the timeline information, as well as the "average wait times"
+/// for every instruction in the input assembly sequence.
+class TimelineView : public View {
+ const llvm::MCSubtargetInfo &STI;
+ llvm::MCInstPrinter &MCIP;
+ const SourceMgr &AsmSequence;
+
+ unsigned CurrentCycle;
+ unsigned MaxCycle;
+ unsigned LastCycle;
+
+ struct TimelineViewEntry {
+ unsigned CycleDispatched;
+ unsigned CycleReady;
+ unsigned CycleIssued;
+ unsigned CycleExecuted;
+ unsigned CycleRetired;
+ };
+ std::vector<TimelineViewEntry> Timeline;
+
+ struct WaitTimeEntry {
+ unsigned Executions;
+ unsigned CyclesSpentInSchedulerQueue;
+ unsigned CyclesSpentInSQWhileReady;
+ unsigned CyclesSpentAfterWBAndBeforeRetire;
+ };
+ std::vector<WaitTimeEntry> WaitTime;
+
+ void printTimelineViewEntry(llvm::formatted_raw_ostream &OS,
+ const TimelineViewEntry &E, unsigned Iteration,
+ unsigned SourceIndex) const;
+ void printWaitTimeEntry(llvm::formatted_raw_ostream &OS,
+ const WaitTimeEntry &E, unsigned Index) const;
+
+ const unsigned DEFAULT_ITERATIONS = 10;
+
+ void initialize(unsigned MaxIterations);
+
+ // Display characters for the TimelineView report output.
+ struct DisplayChar {
+ static const char Dispatched = 'D';
+ static const char Executed = 'E';
+ static const char Retired = 'R';
+ static const char Waiting = '='; // Instruction is waiting in the scheduler.
+ static const char Executing = 'e';
+ static const char RetireLag = '-'; // The instruction is waiting to retire.
+ };
+
+public:
+ TimelineView(const llvm::MCSubtargetInfo &sti, llvm::MCInstPrinter &Printer,
+ const SourceMgr &Sequence, unsigned MaxIterations,
+ unsigned Cycles)
+ : STI(sti), MCIP(Printer), AsmSequence(Sequence), CurrentCycle(0),
+ MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0) {
+ initialize(MaxIterations);
+ }
+
+ // Event handlers.
+ void onCycleEnd() override { ++CurrentCycle; }
+ void onEvent(const HWInstructionEvent &Event) override;
+
+ // Printing routines.
+ void printTimeline(llvm::raw_ostream &OS) const;
+ void printAverageWaitTimes(llvm::raw_ostream &OS) const;
+ void printView(llvm::raw_ostream &OS) const override {
+ printTimeline(OS);
+ printAverageWaitTimes(OS);
+ }
+};
+} // namespace mca
+
+#endif
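A simplified standalone sketch of how the per-cycle state characters are derived from the recorded cycle numbers (it ignores the zero-latency corner cases handled by the real printTimelineViewEntry); with made-up cycle values it roughly reproduces the [0,4] row from the example above.

#include <cstdio>
#include <string>

static std::string renderRow(unsigned Dispatched, unsigned Issued,
                             unsigned Executed, unsigned Retired,
                             unsigned LastCycle) {
  std::string Row;
  for (unsigned C = 0; C <= LastCycle; ++C) {
    if (C < Dispatched)       Row += (C % 5 == 0) ? '.' : ' ';
    else if (C == Dispatched) Row += 'D';
    else if (C < Issued)      Row += '='; // waiting in the scheduler's queue
    else if (C < Executed)    Row += 'e'; // executing
    else if (C == Executed)   Row += 'E'; // write-back
    else if (C < Retired)     Row += '-'; // waiting to retire in order
    else if (C == Retired)    Row += 'R';
    else                      Row += (C % 5 == 0) ? '.' : ' ';
  }
  return Row;
}

int main() {
  // Dispatched at cycle 2, issued at 5, write-back at 8, retired at 9.
  std::printf("%s\n", renderRow(2, 5, 8, 9, 12).c_str()); // ". D==eeeER.  "
  return 0;
}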
diff --git a/contrib/llvm/tools/llvm-mca/View.cpp b/contrib/llvm/tools/llvm-mca/View.cpp
new file mode 100644
index 000000000000..390a7aeb3b9d
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/View.cpp
@@ -0,0 +1,20 @@
+//===----------------------- View.cpp ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the virtual anchor method in View.h to pin the vtable.
+///
+//===----------------------------------------------------------------------===//
+
+#include "View.h"
+
+namespace mca {
+
+void View::anchor() {}
+} // namespace mca
diff --git a/contrib/llvm/tools/llvm-mca/View.h b/contrib/llvm/tools/llvm-mca/View.h
new file mode 100644
index 000000000000..9ba94a5da977
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/View.h
@@ -0,0 +1,32 @@
+//===----------------------- View.h -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the main interface for Views. Each view contributes a
+/// portion of the final report generated by the tool.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_VIEW_H
+#define LLVM_TOOLS_LLVM_MCA_VIEW_H
+
+#include "HWEventListener.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+class View : public HWEventListener {
+public:
+ virtual void printView(llvm::raw_ostream &OS) const = 0;
+ virtual ~View() = default;
+ void anchor() override;
+};
+} // namespace mca
+
+#endif
diff --git a/contrib/llvm/tools/llvm-mca/llvm-mca.cpp b/contrib/llvm/tools/llvm-mca/llvm-mca.cpp
new file mode 100644
index 000000000000..2d292f375e6e
--- /dev/null
+++ b/contrib/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -0,0 +1,552 @@
+//===-- llvm-mca.cpp - Machine Code Analyzer -------------------*- C++ -* -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This utility is a simple driver that allows static performance analysis on
+// machine code similarly to how IACA (Intel Architecture Code Analyzer) works.
+//
+// llvm-mca [options] <file-name>
+// -march <type>
+// -mcpu <cpu>
+// -o <file>
+//
+// The target defaults to the host target.
+// The cpu defaults to the 'native' host cpu.
+// The output defaults to standard output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeRegion.h"
+#include "Context.h"
+#include "DispatchStatistics.h"
+#include "FetchStage.h"
+#include "InstructionInfoView.h"
+#include "InstructionTables.h"
+#include "Pipeline.h"
+#include "PipelinePrinter.h"
+#include "RegisterFileStatistics.h"
+#include "ResourcePressureView.h"
+#include "RetireControlUnitStatistics.h"
+#include "SchedulerStatistics.h"
+#include "SummaryView.h"
+#include "TimelineView.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/WithColor.h"
+
+using namespace llvm;
+
+static cl::OptionCategory ToolOptions("Tool Options");
+static cl::OptionCategory ViewOptions("View Options");
+
+static cl::opt<std::string> InputFilename(cl::Positional,
+ cl::desc("<input file>"),
+ cl::cat(ToolOptions), cl::init("-"));
+
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
+ cl::init("-"), cl::cat(ToolOptions),
+ cl::value_desc("filename"));
+
+static cl::opt<std::string>
+ ArchName("march", cl::desc("Target arch to assemble for, "
+ "see -version for available targets"),
+ cl::cat(ToolOptions));
+
+static cl::opt<std::string>
+ TripleName("mtriple", cl::desc("Target triple to assemble for, "
+ "see -version for available targets"),
+ cl::cat(ToolOptions));
+
+static cl::opt<std::string>
+ MCPU("mcpu",
+ cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+ cl::value_desc("cpu-name"), cl::cat(ToolOptions), cl::init("native"));
+
+static cl::opt<int>
+ OutputAsmVariant("output-asm-variant",
+ cl::desc("Syntax variant to use for output printing"),
+ cl::cat(ToolOptions), cl::init(-1));
+
+static cl::opt<unsigned> Iterations("iterations",
+ cl::desc("Number of iterations to run"),
+ cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+ DispatchWidth("dispatch", cl::desc("Override the processor dispatch width"),
+ cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+ RegisterFileSize("register-file-size",
+ cl::desc("Maximum number of temporary registers which can "
+ "be used for register mappings"),
+ cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<bool>
+ PrintRegisterFileStats("register-file-stats",
+ cl::desc("Print register file statistics"),
+ cl::cat(ViewOptions), cl::init(false));
+
+static cl::opt<bool> PrintDispatchStats("dispatch-stats",
+ cl::desc("Print dispatch statistics"),
+ cl::cat(ViewOptions), cl::init(false));
+
+static cl::opt<bool>
+ PrintSummaryView("summary-view", cl::Hidden,
+ cl::desc("Print summary view (enabled by default)"),
+ cl::cat(ViewOptions), cl::init(true));
+
+static cl::opt<bool> PrintSchedulerStats("scheduler-stats",
+ cl::desc("Print scheduler statistics"),
+ cl::cat(ViewOptions), cl::init(false));
+
+static cl::opt<bool>
+ PrintRetireStats("retire-stats",
+ cl::desc("Print retire control unit statistics"),
+ cl::cat(ViewOptions), cl::init(false));
+
+static cl::opt<bool> PrintResourcePressureView(
+ "resource-pressure",
+ cl::desc("Print the resource pressure view (enabled by default)"),
+ cl::cat(ViewOptions), cl::init(true));
+
+static cl::opt<bool> PrintTimelineView("timeline",
+ cl::desc("Print the timeline view"),
+ cl::cat(ViewOptions), cl::init(false));
+
+static cl::opt<unsigned> TimelineMaxIterations(
+ "timeline-max-iterations",
+ cl::desc("Maximum number of iterations to print in timeline view"),
+ cl::cat(ViewOptions), cl::init(0));
+
+static cl::opt<unsigned> TimelineMaxCycles(
+ "timeline-max-cycles",
+ cl::desc(
+ "Maximum number of cycles in the timeline view. Defaults to 80 cycles"),
+ cl::cat(ViewOptions), cl::init(80));
+
+static cl::opt<bool>
+ AssumeNoAlias("noalias",
+ cl::desc("If set, assume that loads and stores do not alias"),
+ cl::cat(ToolOptions), cl::init(true));
+
+static cl::opt<unsigned>
+ LoadQueueSize("lqueue",
+ cl::desc("Size of the load queue (unbound by default)"),
+ cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+ StoreQueueSize("squeue",
+ cl::desc("Size of the store queue (unbound by default)"),
+ cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<bool>
+ PrintInstructionTables("instruction-tables",
+ cl::desc("Print instruction tables"),
+ cl::cat(ToolOptions), cl::init(false));
+
+static cl::opt<bool> PrintInstructionInfoView(
+ "instruction-info",
+ cl::desc("Print the instruction info view (enabled by default)"),
+ cl::cat(ViewOptions), cl::init(true));
+
+static cl::opt<bool> EnableAllStats("all-stats",
+ cl::desc("Print all hardware statistics"),
+ cl::cat(ViewOptions), cl::init(false));
+
+static cl::opt<bool>
+ EnableAllViews("all-views",
+ cl::desc("Print all views including hardware statistics"),
+ cl::cat(ViewOptions), cl::init(false));
+
+namespace {
+
+const Target *getTarget(const char *ProgName) {
+ TripleName = Triple::normalize(TripleName);
+ if (TripleName.empty())
+ TripleName = Triple::normalize(sys::getDefaultTargetTriple());
+ Triple TheTriple(TripleName);
+
+ // Get the target specific parser.
+ std::string Error;
+ const Target *TheTarget =
+ TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
+ if (!TheTarget) {
+ errs() << ProgName << ": " << Error;
+ return nullptr;
+ }
+
+ // Return the found target.
+ return TheTarget;
+}
+
+// A comment consumer that looks for the LLVM-MCA-BEGIN and LLVM-MCA-END
+// markers used to delimit code regions.
+class MCACommentConsumer : public AsmCommentConsumer {
+public:
+ mca::CodeRegions &Regions;
+
+ MCACommentConsumer(mca::CodeRegions &R) : Regions(R) {}
+ void HandleComment(SMLoc Loc, StringRef CommentText) override {
+ // Skip empty comments.
+ StringRef Comment(CommentText);
+ if (Comment.empty())
+ return;
+
+ // Skip spaces and tabs
+ unsigned Position = Comment.find_first_not_of(" \t");
+ if (Position >= Comment.size())
+ // We reached the end of the comment. Bail out.
+ return;
+
+ Comment = Comment.drop_front(Position);
+ if (Comment.consume_front("LLVM-MCA-END")) {
+ Regions.endRegion(Loc);
+ return;
+ }
+
+ // Now try to parse string LLVM-MCA-BEGIN
+ if (!Comment.consume_front("LLVM-MCA-BEGIN"))
+ return;
+
+ // Skip spaces and tabs
+ Position = Comment.find_first_not_of(" \t");
+ if (Position < Comment.size())
+ Comment = Comment.drop_front(Position);
+ // Use the rest of the string as a descriptor for this code snippet.
+ Regions.beginRegion(Comment, Loc);
+ }
+};
+
+int AssembleInput(const char *ProgName, MCAsmParser &Parser,
+ const Target *TheTarget, MCSubtargetInfo &STI,
+ MCInstrInfo &MCII, MCTargetOptions &MCOptions) {
+ std::unique_ptr<MCTargetAsmParser> TAP(
+ TheTarget->createMCAsmParser(STI, Parser, MCII, MCOptions));
+
+ if (!TAP) {
+ WithColor::error() << "this target does not support assembly parsing.\n";
+ return 1;
+ }
+
+ Parser.setTargetParser(*TAP);
+ return Parser.Run(false);
+}
+
+ErrorOr<std::unique_ptr<ToolOutputFile>> getOutputStream() {
+ if (OutputFilename == "")
+ OutputFilename = "-";
+ std::error_code EC;
+ auto Out =
+ llvm::make_unique<ToolOutputFile>(OutputFilename, EC, sys::fs::F_None);
+ if (!EC)
+ return std::move(Out);
+ return EC;
+}
+
+class MCStreamerWrapper final : public MCStreamer {
+ mca::CodeRegions &Regions;
+
+public:
+ MCStreamerWrapper(MCContext &Context, mca::CodeRegions &R)
+ : MCStreamer(Context), Regions(R) {}
+
+ // We only want to intercept the emission of new instructions.
+ virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool /* unused */) override {
+ Regions.addInstruction(llvm::make_unique<const MCInst>(Inst));
+ }
+
+ bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
+ return true;
+ }
+
+ void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) override {}
+ void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+ uint64_t Size = 0, unsigned ByteAlignment = 0,
+ SMLoc Loc = SMLoc()) override {}
+ void EmitGPRel32Value(const MCExpr *Value) override {}
+ void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+ void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+ void EmitCOFFSymbolType(int Type) override {}
+ void EndCOFFSymbolDef() override {}
+
+ const std::vector<std::unique_ptr<const MCInst>> &
+ GetInstructionSequence(unsigned Index) const {
+ return Regions.getInstructionSequence(Index);
+ }
+};
+} // end of anonymous namespace
+
+static void processOptionImpl(cl::opt<bool> &O, const cl::opt<bool> &Default) {
+ if (!O.getNumOccurrences() || O.getPosition() < Default.getPosition())
+ O = Default.getValue();
+}
+
+static void processViewOptions() {
+ if (!EnableAllViews.getNumOccurrences() &&
+ !EnableAllStats.getNumOccurrences())
+ return;
+
+ if (EnableAllViews.getNumOccurrences()) {
+ processOptionImpl(PrintSummaryView, EnableAllViews);
+ processOptionImpl(PrintResourcePressureView, EnableAllViews);
+ processOptionImpl(PrintTimelineView, EnableAllViews);
+ processOptionImpl(PrintInstructionInfoView, EnableAllViews);
+ }
+
+ const cl::opt<bool> &Default =
+ EnableAllViews.getPosition() < EnableAllStats.getPosition()
+ ? EnableAllStats
+ : EnableAllViews;
+ processOptionImpl(PrintSummaryView, Default);
+ processOptionImpl(PrintRegisterFileStats, Default);
+ processOptionImpl(PrintDispatchStats, Default);
+ processOptionImpl(PrintSchedulerStats, Default);
+ processOptionImpl(PrintRetireStats, Default);
+}
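To make the precedence concrete (hypothetical invocations; the usual cl::opt spelling -flag=false is assumed for boolean flags): with only -all-stats, the first block is skipped and the summary view plus every hardware-statistics view defaults to enabled; when both -all-views and -all-stats are given, whichever of the two appears later on the command line supplies the default for those statistics views, and an explicit per-view flag overrides them only if it appears later still.

    $ llvm-mca -all-stats foo.s                    # summary + all hardware statistics enabled
    $ llvm-mca -all-views -all-stats=false foo.s   # -all-stats is later, so the statistics views stay off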
+
+int main(int argc, char **argv) {
+ InitLLVM X(argc, argv);
+
+ // Initialize targets and assembly parsers.
+ llvm::InitializeAllTargetInfos();
+ llvm::InitializeAllTargetMCs();
+ llvm::InitializeAllAsmParsers();
+
+ // Enable printing of available targets when flag --version is specified.
+ cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
+
+ cl::HideUnrelatedOptions({&ToolOptions, &ViewOptions});
+
+ // Parse flags and initialize target options.
+ cl::ParseCommandLineOptions(argc, argv,
+ "llvm machine code performance analyzer.\n");
+
+ MCTargetOptions MCOptions;
+ MCOptions.PreserveAsmComments = false;
+
+ // Get the target from the triple. If a triple is not specified, then select
+ // the default triple for the host. If the triple doesn't correspond to any
+ // registered target, then exit with an error message.
+ const char *ProgName = argv[0];
+ const Target *TheTarget = getTarget(ProgName);
+ if (!TheTarget)
+ return 1;
+
+ // getTarget() may have replaced TripleName with a default triple.
+ // For safety, reconstruct the Triple object.
+ Triple TheTriple(TripleName);
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
+ MemoryBuffer::getFileOrSTDIN(InputFilename);
+ if (std::error_code EC = BufferPtr.getError()) {
+ WithColor::error() << InputFilename << ": " << EC.message() << '\n';
+ return 1;
+ }
+
+ // Apply overrides to llvm-mca specific options.
+ processViewOptions();
+
+ SourceMgr SrcMgr;
+
+ // Tell SrcMgr about this buffer, which is what the parser will pick up.
+ SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc());
+
+ std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+ assert(MRI && "Unable to create target register info!");
+
+ std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+ assert(MAI && "Unable to create target asm info!");
+
+ MCObjectFileInfo MOFI;
+ MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr);
+ MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx);
+
+ std::unique_ptr<buffer_ostream> BOS;
+
+ mca::CodeRegions Regions(SrcMgr);
+ MCStreamerWrapper Str(Ctx, Regions);
+
+ std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
+
+ std::unique_ptr<MCInstrAnalysis> MCIA(
+ TheTarget->createMCInstrAnalysis(MCII.get()));
+
+ if (!MCPU.compare("native"))
+ MCPU = llvm::sys::getHostCPUName();
+
+ std::unique_ptr<MCSubtargetInfo> STI(
+ TheTarget->createMCSubtargetInfo(TripleName, MCPU, /* FeaturesStr */ ""));
+ if (!STI->isCPUStringValid(MCPU))
+ return 1;
+
+ if (!PrintInstructionTables && !STI->getSchedModel().isOutOfOrder()) {
+ WithColor::error() << "please specify an out-of-order cpu. '" << MCPU
+ << "' is an in-order cpu.\n";
+ return 1;
+ }
+
+ if (!STI->getSchedModel().hasInstrSchedModel()) {
+ WithColor::error()
+ << "unable to find instruction-level scheduling information for"
+ << " target triple '" << TheTriple.normalize() << "' and cpu '" << MCPU
+ << "'.\n";
+
+ if (STI->getSchedModel().InstrItineraries)
+ WithColor::note()
+ << "cpu '" << MCPU << "' provides itineraries. However, "
+ << "instruction itineraries are currently unsupported.\n";
+ return 1;
+ }
+
+ std::unique_ptr<MCAsmParser> P(createMCAsmParser(SrcMgr, Ctx, Str, *MAI));
+ MCAsmLexer &Lexer = P->getLexer();
+ MCACommentConsumer CC(Regions);
+ Lexer.setCommentConsumer(&CC);
+
+ if (AssembleInput(ProgName, *P, TheTarget, *STI, *MCII, MCOptions))
+ return 1;
+
+ if (Regions.empty()) {
+ WithColor::error() << "no assembly instructions found.\n";
+ return 1;
+ }
+
+ // Now initialize the output file.
+ auto OF = getOutputStream();
+ if (std::error_code EC = OF.getError()) {
+ WithColor::error() << EC.message() << '\n';
+ return 1;
+ }
+
+ unsigned AssemblerDialect = P->getAssemblerDialect();
+ if (OutputAsmVariant >= 0)
+ AssemblerDialect = static_cast<unsigned>(OutputAsmVariant);
+ std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
+ Triple(TripleName), AssemblerDialect, *MAI, *MCII, *MRI));
+ if (!IP) {
+ WithColor::error()
+ << "unable to create instruction printer for target triple '"
+ << TheTriple.normalize() << "' with assembly variant "
+ << AssemblerDialect << ".\n";
+ return 1;
+ }
+
+ std::unique_ptr<llvm::ToolOutputFile> TOF = std::move(*OF);
+
+ const MCSchedModel &SM = STI->getSchedModel();
+
+ unsigned Width = SM.IssueWidth;
+ if (DispatchWidth)
+ Width = DispatchWidth;
+
+ // Create an instruction builder.
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA, *IP);
+
+ // Create a context to control ownership of the pipeline hardware.
+ mca::Context MCA(*MRI, *STI);
+
+ mca::PipelineOptions PO(Width, RegisterFileSize, LoadQueueSize,
+ StoreQueueSize, AssumeNoAlias);
+
+ // Number each region in the sequence.
+ unsigned RegionIdx = 0;
+ for (const std::unique_ptr<mca::CodeRegion> &Region : Regions) {
+ // Skip empty code regions.
+ if (Region->empty())
+ continue;
+
+ // Don't print the header of this region if it is the default region, and
+ // it doesn't have an end location.
+ if (Region->startLoc().isValid() || Region->endLoc().isValid()) {
+ TOF->os() << "\n[" << RegionIdx++ << "] Code Region";
+ StringRef Desc = Region->getDescription();
+ if (!Desc.empty())
+ TOF->os() << " - " << Desc;
+ TOF->os() << "\n\n";
+ }
+
+ mca::SourceMgr S(Region->getInstructions(),
+ PrintInstructionTables ? 1 : Iterations);
+
+ if (PrintInstructionTables) {
+ // Create a pipeline, stages, and a printer.
+ auto P = llvm::make_unique<mca::Pipeline>();
+ P->appendStage(llvm::make_unique<mca::FetchStage>(IB, S));
+ P->appendStage(llvm::make_unique<mca::InstructionTables>(SM, IB));
+ mca::PipelinePrinter Printer(*P);
+
+ // Create the views for this pipeline, execute, and emit a report.
+ if (PrintInstructionInfoView) {
+ Printer.addView(
+ llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));
+ }
+ Printer.addView(
+ llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, S));
+ P->run();
+ Printer.printReport(TOF->os());
+ continue;
+ }
+
+ // Create a basic pipeline simulating an out-of-order backend.
+ auto P = MCA.createDefaultPipeline(PO, IB, S);
+ mca::PipelinePrinter Printer(*P);
+
+ if (PrintSummaryView)
+ Printer.addView(llvm::make_unique<mca::SummaryView>(SM, S, Width));
+
+ if (PrintInstructionInfoView)
+ Printer.addView(
+ llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));
+
+ if (PrintDispatchStats)
+ Printer.addView(llvm::make_unique<mca::DispatchStatistics>());
+
+ if (PrintSchedulerStats)
+ Printer.addView(llvm::make_unique<mca::SchedulerStatistics>(*STI));
+
+ if (PrintRetireStats)
+ Printer.addView(llvm::make_unique<mca::RetireControlUnitStatistics>());
+
+ if (PrintRegisterFileStats)
+ Printer.addView(llvm::make_unique<mca::RegisterFileStatistics>(*STI));
+
+ if (PrintResourcePressureView)
+ Printer.addView(
+ llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, S));
+
+ if (PrintTimelineView) {
+ Printer.addView(llvm::make_unique<mca::TimelineView>(
+ *STI, *IP, S, TimelineMaxIterations, TimelineMaxCycles));
+ }
+
+ P->run();
+ Printer.printReport(TOF->os());
+
+ // Clear the InstrBuilder internal state in preparation for another round.
+ IB.clear();
+ }
+
+ TOF->keep();
+ return 0;
+}
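End-to-end, a run of the tool might look like the sketch below; -mcpu and -iterations are assumed to be the flag spellings bound to the MCPU and Iterations options declared earlier in this file, and output goes to stdout because getOutputStream() maps an empty output file name to "-":

    $ llvm-mca -mcpu=btver2 -iterations=200 -all-views dot-product.s
    $ llvm-mca -mcpu=btver2 -instruction-tables dot-product.s

The second form takes the PrintInstructionTables path above: a single iteration through FetchStage and InstructionTables rather than the full out-of-order pipeline simulation.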
diff --git a/contrib/llvm/tools/llvm-modextract/llvm-modextract.cpp b/contrib/llvm/tools/llvm-modextract/llvm-modextract.cpp
index b2d21c23a094..9fd8340505aa 100644
--- a/contrib/llvm/tools/llvm-modextract/llvm-modextract.cpp
+++ b/contrib/llvm/tools/llvm-modextract/llvm-modextract.cpp
@@ -70,7 +70,7 @@ int main(int argc, char **argv) {
}
std::unique_ptr<Module> M = ExitOnErr(Ms[ModuleIndex].parseModule(Context));
- WriteBitcodeToFile(M.get(), Out->os());
+ WriteBitcodeToFile(*M, Out->os());
Out->keep();
return 0;
diff --git a/contrib/llvm/tools/llvm-nm/llvm-nm.cpp b/contrib/llvm/tools/llvm-nm/llvm-nm.cpp
index b6ac9c20a946..37c1bf85809e 100644
--- a/contrib/llvm/tools/llvm-nm/llvm-nm.cpp
+++ b/contrib/llvm/tools/llvm-nm/llvm-nm.cpp
@@ -33,9 +33,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetSelect.h"
@@ -82,6 +81,11 @@ cl::alias ExternalOnly2("g", cl::desc("Alias for --extern-only"),
cl::aliasopt(ExternalOnly), cl::Grouping,
cl::ZeroOrMore);
+cl::opt<bool> NoWeakSymbols("no-weak",
+ cl::desc("Show only non-weak symbols"));
+cl::alias NoWeakSymbols2("W", cl::desc("Alias for --no-weak"),
+ cl::aliasopt(NoWeakSymbols), cl::Grouping);
+
cl::opt<bool> BSDFormat("B", cl::desc("Alias for --format=bsd"),
cl::Grouping);
cl::opt<bool> POSIXFormat("P", cl::desc("Alias for --format=posix"),
@@ -270,8 +274,16 @@ struct NMSymbol {
} // anonymous namespace
static bool compareSymbolAddress(const NMSymbol &A, const NMSymbol &B) {
- bool ADefined = !(A.Sym.getFlags() & SymbolRef::SF_Undefined);
- bool BDefined = !(B.Sym.getFlags() & SymbolRef::SF_Undefined);
+ bool ADefined;
+ if (A.Sym.getRawDataRefImpl().p)
+ ADefined = !(A.Sym.getFlags() & SymbolRef::SF_Undefined);
+ else
+ ADefined = A.TypeChar != 'U';
+ bool BDefined;
+ if (B.Sym.getRawDataRefImpl().p)
+ BDefined = !(B.Sym.getFlags() & SymbolRef::SF_Undefined);
+ else
+ BDefined = B.TypeChar != 'U';
return std::make_tuple(ADefined, A.Address, A.Name, A.Size) <
std::make_tuple(BDefined, B.Address, B.Name, B.Size);
}
@@ -697,7 +709,7 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName,
if (ReverseSort)
Cmp = [=](const NMSymbol &A, const NMSymbol &B) { return Cmp(B, A); };
- std::sort(SymbolList.begin(), SymbolList.end(), Cmp);
+ llvm::sort(SymbolList.begin(), SymbolList.end(), Cmp);
}
if (!PrintFileName) {
@@ -761,8 +773,10 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName,
bool Undefined = SymFlags & SymbolRef::SF_Undefined;
bool Global = SymFlags & SymbolRef::SF_Global;
+ bool Weak = SymFlags & SymbolRef::SF_Weak;
if ((!Undefined && UndefinedOnly) || (Undefined && DefinedOnly) ||
- (!Global && ExternalOnly) || (SizeSort && !PrintAddress))
+ (!Global && ExternalOnly) || (SizeSort && !PrintAddress) ||
+ (Weak && NoWeakSymbols))
continue;
if (PrintFileName) {
if (!ArchitectureName.empty())
@@ -1004,6 +1018,10 @@ static char getSymbolNMTypeChar(MachOObjectFile &Obj, basic_symbol_iterator I) {
StringRef SectionName;
Obj.getSectionName(Ref, SectionName);
StringRef SegmentName = Obj.getSectionFinalSegmentName(Ref);
+ if (Obj.is64Bit() &&
+ Obj.getHeader64().filetype == MachO::MH_KEXT_BUNDLE &&
+ SegmentName == "__TEXT_EXEC" && SectionName == "__text")
+ return 't';
if (SegmentName == "__TEXT" && SectionName == "__text")
return 't';
if (SegmentName == "__DATA" && SectionName == "__data")
@@ -1203,6 +1221,8 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
raw_string_ostream LOS(LazysNameBuffer);
std::string WeaksNameBuffer;
raw_string_ostream WOS(WeaksNameBuffer);
+ std::string FunctionStartsNameBuffer;
+ raw_string_ostream FOS(FunctionStartsNameBuffer);
if (MachO && !NoDyldInfo) {
MachO::mach_header H;
MachO::mach_header_64 H_64;
@@ -1573,6 +1593,93 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
I++;
}
}
+
+ // Try adding symbols from the function starts table and the LC_MAIN entry
+ // point.
+ SmallVector<uint64_t, 8> FoundFns;
+ uint64_t lc_main_offset = UINT64_MAX;
+ for (const auto &Command : MachO->load_commands()) {
+ if (Command.C.cmd == MachO::LC_FUNCTION_STARTS) {
+ // We found a function starts load command; parse the addresses for
+ // consumption.
+ MachO::linkedit_data_command LLC =
+ MachO->getLinkeditDataLoadCommand(Command);
+
+ MachO->ReadULEB128s(LLC.dataoff, FoundFns);
+ } else if (Command.C.cmd == MachO::LC_MAIN) {
+ MachO::entry_point_command LCmain =
+ MachO->getEntryPointCommand(Command);
+ lc_main_offset = LCmain.entryoff;
+ }
+ }
+ // See if these addresses are already in the symbol table.
+ unsigned FunctionStartsAdded = 0;
+ for (uint64_t f = 0; f < FoundFns.size(); f++) {
+ bool found = false;
+ for (unsigned J = 0; J < SymbolList.size() && !found; ++J) {
+ if (SymbolList[J].Address == FoundFns[f] + BaseSegmentAddress)
+ found = true;
+ }
+ // If this address is not already in the symbol table, fake up an
+ // nlist entry for it.
+ if (!found) {
+ NMSymbol F;
+ memset(&F, '\0', sizeof(NMSymbol));
+ F.Name = "<redacted function X>";
+ F.Address = FoundFns[f] + BaseSegmentAddress;
+ F.Size = 0;
+ // There is no symbol in the nlist symbol table for this, so we set
+ // Sym effectively to null; the rest of the code here must test for
+ // it and not do things like Sym.getFlags() on it.
+ F.Sym = BasicSymbolRef();
+ F.SymFlags = 0;
+ F.NType = MachO::N_SECT;
+ F.NSect = 0;
+ StringRef SegmentName = StringRef();
+ StringRef SectionName = StringRef();
+ for (const SectionRef &Section : MachO->sections()) {
+ Section.getName(SectionName);
+ SegmentName = MachO->getSectionFinalSegmentName(
+ Section.getRawDataRefImpl());
+ F.NSect++;
+ if (F.Address >= Section.getAddress() &&
+ F.Address < Section.getAddress() + Section.getSize()) {
+ F.Section = Section;
+ break;
+ }
+ }
+ if (SegmentName == "__TEXT" && SectionName == "__text")
+ F.TypeChar = 't';
+ else if (SegmentName == "__DATA" && SectionName == "__data")
+ F.TypeChar = 'd';
+ else if (SegmentName == "__DATA" && SectionName == "__bss")
+ F.TypeChar = 'b';
+ else
+ F.TypeChar = 's';
+ F.NDesc = 0;
+ F.IndirectName = StringRef();
+ SymbolList.push_back(F);
+ if (FoundFns[f] == lc_main_offset)
+ FOS << "<redacted LC_MAIN>";
+ else
+ FOS << "<redacted function " << f << ">";
+ FOS << '\0';
+ FunctionStartsAdded++;
+ }
+ }
+ if (FunctionStartsAdded) {
+ FOS.flush();
+ const char *Q = FunctionStartsNameBuffer.c_str();
+ for (unsigned K = 0; K < FunctionStartsAdded; K++) {
+ SymbolList[I].Name = Q;
+ Q += strlen(Q) + 1;
+ if (SymbolList[I].TypeChar == 'I') {
+ SymbolList[I].IndirectName = Q;
+ Q += strlen(Q) + 1;
+ }
+ I++;
+ }
+ }
}
}
@@ -1915,11 +2022,7 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
}
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
-
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
cl::ParseCommandLineOptions(argc, argv, "llvm symbol table dumper\n");
// llvm-nm only reads binary files.
diff --git a/contrib/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/contrib/llvm/tools/llvm-objcopy/ObjcopyOpts.td
new file mode 100644
index 000000000000..2af2108d98d3
--- /dev/null
+++ b/contrib/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -0,0 +1,99 @@
+include "llvm/Option/OptParser.td"
+
+multiclass Eq<string name> {
+ def NAME: Separate<["--", "-"], name>;
+ def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>;
+}
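Each defm built from Eq therefore accepts both a separated and a joined spelling, with either one or two leading dashes. For example, the remove_section definition below ends up accepting (".comment" is only an illustrative section name):

    --remove-section .comment
    --remove-section=.comment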
+
+def help : Flag<["-", "--"], "help">;
+defm binary_architecture : Eq<"binary-architecture">,
+ HelpText<"Used when transforming an architecture-less format (such as binary) to another format">;
+def B : JoinedOrSeparate<["-"], "B">,
+ Alias<binary_architecture>;
+defm input_target : Eq<"input-target">,
+ HelpText<"Format of the input file">,
+ Values<"binary">;
+defm output_target : Eq<"output-target">,
+ HelpText<"Format of the output file">,
+ Values<"binary">;
+def O : JoinedOrSeparate<["-"], "O">,
+ Alias<output_target>;
+defm split_dwo : Eq<"split-dwo">,
+ MetaVarName<"dwo-file">,
+ HelpText<"Equivalent to extract-dwo on the input file to <dwo-file>, then strip-dwo on the input file">;
+defm add_gnu_debuglink : Eq<"add-gnu-debuglink">,
+ MetaVarName<"debug-file">,
+ HelpText<"Add a .gnu_debuglink for <debug-file>">;
+defm remove_section : Eq<"remove-section">,
+ MetaVarName<"section">,
+ HelpText<"Remove <section>">;
+defm rename_section : Eq<"rename-section">,
+ MetaVarName<"old=new">,
+ HelpText<"Renames a section from old to new">;
+defm redefine_symbol : Eq<"redefine-sym">,
+ MetaVarName<"old=new">,
+ HelpText<"Change the name of a symbol old to new">;
+def R : JoinedOrSeparate<["-"], "R">,
+ Alias<remove_section>;
+defm keep : Eq<"keep">,
+ MetaVarName<"section">,
+ HelpText<"Keep <section>">;
+defm only_keep : Eq<"only-keep">,
+ MetaVarName<"section">,
+ HelpText<"Remove all but <section>">;
+def j : JoinedOrSeparate<["-"], "j">,
+ Alias<only_keep>;
+defm add_section : Eq<"add-section">,
+ MetaVarName<"section=file">,
+ HelpText<"Make a section named <section> with the contents of <file>.">;
+def strip_all : Flag<["-", "--"], "strip-all">,
+ HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
+def strip_all_gnu : Flag<["-", "--"], "strip-all-gnu">,
+ HelpText<"Compaitable with GNU objcopy's --strip-all">;
+def strip_debug : Flag<["-", "--"], "strip-debug">,
+ HelpText<"Remove all debug information">;
+def strip_dwo : Flag<["-", "--"], "strip-dwo">,
+ HelpText<"Remove all DWARF .dwo sections from file">;
+def strip_sections : Flag<["-", "--"], "strip-sections">,
+ HelpText<"Remove all section headers">;
+def strip_non_alloc : Flag<["-", "--"], "strip-non-alloc">,
+ HelpText<"Remove all non-allocated sections">;
+def extract_dwo : Flag<["-", "--"], "extract-dwo">,
+ HelpText<"Remove all sections that are not DWARF .dwo sections from file">;
+def localize_hidden : Flag<["-", "--"], "localize-hidden">,
+ HelpText<"Mark all symbols that have hidden or internal visibility as local">;
+defm localize_symbol : Eq<"localize-symbol">,
+ MetaVarName<"symbol">,
+ HelpText<"Mark <symbol> as local">;
+def L : JoinedOrSeparate<["-"], "L">,
+ Alias<localize_symbol>;
+defm globalize_symbol : Eq<"globalize-symbol">,
+ MetaVarName<"symbol">,
+ HelpText<"Mark <symbol> as global">;
+defm weaken_symbol : Eq<"weaken-symbol">,
+ MetaVarName<"symbol">,
+ HelpText<"Mark <symbol> as weak">;
+def W : JoinedOrSeparate<["-"], "W">,
+ Alias<weaken_symbol>;
+def weaken : Flag<["-", "--"], "weaken">,
+ HelpText<"Mark all global symbols as weak">;
+def discard_all : Flag<["-", "--"], "discard-all">,
+ HelpText<"Remove all local symbols except file and section symbols">;
+def x : Flag<["-"], "x">,
+ Alias<discard_all>;
+defm strip_symbol : Eq<"strip-symbol">,
+ MetaVarName<"symbol">,
+ HelpText<"Remove symbol <symbol>">;
+def N : JoinedOrSeparate<["-"], "N">,
+ Alias<strip_symbol>;
+defm keep_symbol : Eq<"keep-symbol">,
+ MetaVarName<"symbol">,
+ HelpText<"Do not remove symbol <symbol>">;
+def K : JoinedOrSeparate<["-"], "K">,
+ Alias<keep_symbol>;
+def only_keep_debug : Flag<["-", "--"], "only-keep-debug">,
+ HelpText<"Currently ignored. Only for compaitability with GNU objcopy.">;
+def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
+ HelpText<"Remove all symbols not needed by relocations">;
+def keep_file_symbols : Flag<["-", "--"], "keep-file-symbols">,
+ HelpText<"Do not remove file symbols">;
diff --git a/contrib/llvm/tools/llvm-objcopy/Object.cpp b/contrib/llvm/tools/llvm-objcopy/Object.cpp
index 9e82448187ea..7e88f5263a39 100644
--- a/contrib/llvm/tools/llvm-objcopy/Object.cpp
+++ b/contrib/llvm/tools/llvm-objcopy/Object.cpp
@@ -18,6 +18,7 @@
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/Path.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
@@ -26,64 +27,117 @@
#include <vector>
using namespace llvm;
+using namespace llvm::objcopy;
using namespace object;
using namespace ELF;
-template <class ELFT> void Segment::writeHeader(FileOutputBuffer &Out) const {
- using Elf_Ehdr = typename ELFT::Ehdr;
- using Elf_Phdr = typename ELFT::Phdr;
+Buffer::~Buffer() {}
- uint8_t *Buf = Out.getBufferStart();
- Buf += sizeof(Elf_Ehdr) + Index * sizeof(Elf_Phdr);
- Elf_Phdr &Phdr = *reinterpret_cast<Elf_Phdr *>(Buf);
- Phdr.p_type = Type;
- Phdr.p_flags = Flags;
- Phdr.p_offset = Offset;
- Phdr.p_vaddr = VAddr;
- Phdr.p_paddr = PAddr;
- Phdr.p_filesz = FileSize;
- Phdr.p_memsz = MemSize;
- Phdr.p_align = Align;
+void FileBuffer::allocate(size_t Size) {
+ Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+ FileOutputBuffer::create(getName(), Size, FileOutputBuffer::F_executable);
+ handleAllErrors(BufferOrErr.takeError(), [this](const ErrorInfoBase &E) {
+ error("failed to open " + getName() + ": " + E.message());
+ });
+ Buf = std::move(*BufferOrErr);
+}
+
+Error FileBuffer::commit() { return Buf->commit(); }
+
+uint8_t *FileBuffer::getBufferStart() {
+ return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
}
-void Segment::writeSegment(FileOutputBuffer &Out) const {
- uint8_t *Buf = Out.getBufferStart() + Offset;
- // We want to maintain segments' interstitial data and contents exactly.
- // This lets us just copy segments directly.
- std::copy(std::begin(Contents), std::end(Contents), Buf);
+void MemBuffer::allocate(size_t Size) {
+ Buf = WritableMemoryBuffer::getNewMemBuffer(Size, getName());
+}
+
+Error MemBuffer::commit() { return Error::success(); }
+
+uint8_t *MemBuffer::getBufferStart() {
+ return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+}
+
+std::unique_ptr<WritableMemoryBuffer> MemBuffer::releaseMemoryBuffer() {
+ return std::move(Buf);
+}
+
+template <class ELFT> void ELFWriter<ELFT>::writePhdr(const Segment &Seg) {
+ using Elf_Phdr = typename ELFT::Phdr;
+
+ uint8_t *B = Buf.getBufferStart();
+ B += Obj.ProgramHdrSegment.Offset + Seg.Index * sizeof(Elf_Phdr);
+ Elf_Phdr &Phdr = *reinterpret_cast<Elf_Phdr *>(B);
+ Phdr.p_type = Seg.Type;
+ Phdr.p_flags = Seg.Flags;
+ Phdr.p_offset = Seg.Offset;
+ Phdr.p_vaddr = Seg.VAddr;
+ Phdr.p_paddr = Seg.PAddr;
+ Phdr.p_filesz = Seg.FileSize;
+ Phdr.p_memsz = Seg.MemSize;
+ Phdr.p_align = Seg.Align;
}
void SectionBase::removeSectionReferences(const SectionBase *Sec) {}
+void SectionBase::removeSymbols(function_ref<bool(const Symbol &)> ToRemove) {}
void SectionBase::initialize(SectionTableRef SecTable) {}
void SectionBase::finalize() {}
+void SectionBase::markSymbols() {}
+
+template <class ELFT> void ELFWriter<ELFT>::writeShdr(const SectionBase &Sec) {
+ uint8_t *B = Buf.getBufferStart();
+ B += Sec.HeaderOffset;
+ typename ELFT::Shdr &Shdr = *reinterpret_cast<typename ELFT::Shdr *>(B);
+ Shdr.sh_name = Sec.NameIndex;
+ Shdr.sh_type = Sec.Type;
+ Shdr.sh_flags = Sec.Flags;
+ Shdr.sh_addr = Sec.Addr;
+ Shdr.sh_offset = Sec.Offset;
+ Shdr.sh_size = Sec.Size;
+ Shdr.sh_link = Sec.Link;
+ Shdr.sh_info = Sec.Info;
+ Shdr.sh_addralign = Sec.Align;
+ Shdr.sh_entsize = Sec.EntrySize;
+}
-template <class ELFT>
-void SectionBase::writeHeader(FileOutputBuffer &Out) const {
- uint8_t *Buf = Out.getBufferStart();
- Buf += HeaderOffset;
- typename ELFT::Shdr &Shdr = *reinterpret_cast<typename ELFT::Shdr *>(Buf);
- Shdr.sh_name = NameIndex;
- Shdr.sh_type = Type;
- Shdr.sh_flags = Flags;
- Shdr.sh_addr = Addr;
- Shdr.sh_offset = Offset;
- Shdr.sh_size = Size;
- Shdr.sh_link = Link;
- Shdr.sh_info = Info;
- Shdr.sh_addralign = Align;
- Shdr.sh_entsize = EntrySize;
-}
-
-void Section::writeSection(FileOutputBuffer &Out) const {
- if (Type == SHT_NOBITS)
+SectionVisitor::~SectionVisitor() {}
+
+void BinarySectionWriter::visit(const SectionIndexSection &Sec) {
+ error("Cannot write symbol section index table '" + Sec.Name + "' ");
+}
+
+void BinarySectionWriter::visit(const SymbolTableSection &Sec) {
+ error("Cannot write symbol table '" + Sec.Name + "' out to binary");
+}
+
+void BinarySectionWriter::visit(const RelocationSection &Sec) {
+ error("Cannot write relocation section '" + Sec.Name + "' out to binary");
+}
+
+void BinarySectionWriter::visit(const GnuDebugLinkSection &Sec) {
+ error("Cannot write '" + Sec.Name + "' out to binary");
+}
+
+void BinarySectionWriter::visit(const GroupSection &Sec) {
+ error("Cannot write '" + Sec.Name + "' out to binary");
+}
+
+void SectionWriter::visit(const Section &Sec) {
+ if (Sec.Type == SHT_NOBITS)
return;
- uint8_t *Buf = Out.getBufferStart() + Offset;
- std::copy(std::begin(Contents), std::end(Contents), Buf);
+ uint8_t *Buf = Out.getBufferStart() + Sec.Offset;
+ std::copy(std::begin(Sec.Contents), std::end(Sec.Contents), Buf);
}
-void OwnedDataSection::writeSection(FileOutputBuffer &Out) const {
- uint8_t *Buf = Out.getBufferStart() + Offset;
- std::copy(std::begin(Data), std::end(Data), Buf);
+void Section::accept(SectionVisitor &Visitor) const { Visitor.visit(*this); }
+
+void SectionWriter::visit(const OwnedDataSection &Sec) {
+ uint8_t *Buf = Out.getBufferStart() + Sec.Offset;
+ std::copy(std::begin(Sec.Data), std::end(Sec.Data), Buf);
+}
+
+void OwnedDataSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
}
void StringTableSection::addString(StringRef Name) {
@@ -97,8 +151,35 @@ uint32_t StringTableSection::findIndex(StringRef Name) const {
void StringTableSection::finalize() { StrTabBuilder.finalize(); }
-void StringTableSection::writeSection(FileOutputBuffer &Out) const {
- StrTabBuilder.write(Out.getBufferStart() + Offset);
+void SectionWriter::visit(const StringTableSection &Sec) {
+ Sec.StrTabBuilder.write(Out.getBufferStart() + Sec.Offset);
+}
+
+void StringTableSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
+}
+
+template <class ELFT>
+void ELFSectionWriter<ELFT>::visit(const SectionIndexSection &Sec) {
+ uint8_t *Buf = Out.getBufferStart() + Sec.Offset;
+ auto *IndexesBuffer = reinterpret_cast<typename ELFT::Word *>(Buf);
+ std::copy(std::begin(Sec.Indexes), std::end(Sec.Indexes), IndexesBuffer);
+}
+
+void SectionIndexSection::initialize(SectionTableRef SecTable) {
+ Size = 0;
+ setSymTab(SecTable.getSectionOfType<SymbolTableSection>(
+ Link,
+ "Link field value " + Twine(Link) + " in section " + Name + " is invalid",
+ "Link field value " + Twine(Link) + " in section " + Name +
+ " is not a symbol table"));
+ Symbols->setShndxTable(this);
+}
+
+void SectionIndexSection::finalize() { Link = Symbols->Index; }
+
+void SectionIndexSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
}
static bool isValidReservedSectionIndex(uint16_t Index, uint16_t Machine) {
@@ -119,8 +200,13 @@ static bool isValidReservedSectionIndex(uint16_t Index, uint16_t Machine) {
return false;
}
+// Large indexes force us to clarify exactly what this function should do. This
+// function should return the value that will appear in st_shndx when written
+// out.
uint16_t Symbol::getShndx() const {
if (DefinedIn != nullptr) {
+ if (DefinedIn->Index >= SHN_LORESERVE)
+ return SHN_XINDEX;
return DefinedIn->Index;
}
switch (ShndxType) {
@@ -134,11 +220,18 @@ uint16_t Symbol::getShndx() const {
case SYMBOL_HEXAGON_SCOMMON_2:
case SYMBOL_HEXAGON_SCOMMON_4:
case SYMBOL_HEXAGON_SCOMMON_8:
+ case SYMBOL_XINDEX:
return static_cast<uint16_t>(ShndxType);
}
llvm_unreachable("Symbol with invalid ShndxType encountered");
}
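In concrete terms (hypothetical indices): a symbol whose defining section has Index 0xff30 (>= SHN_LORESERVE, 0xff00) is written with st_shndx = SHN_XINDEX (0xffff), and the real value 0xff30 travels in the section index table that prepareForLayout() fills in further down; a symbol defined in section 0x12 keeps st_shndx = 0x12 and gets an SHN_UNDEF placeholder entry in that table.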
+void SymbolTableSection::assignIndices() {
+ uint32_t Index = 0;
+ for (auto &Sym : Symbols)
+ Sym->Index = Index++;
+}
+
void SymbolTableSection::addSymbol(StringRef Name, uint8_t Bind, uint8_t Type,
SectionBase *DefinedIn, uint64_t Value,
uint8_t Visibility, uint16_t Shndx,
@@ -148,6 +241,8 @@ void SymbolTableSection::addSymbol(StringRef Name, uint8_t Bind, uint8_t Type,
Sym.Binding = Bind;
Sym.Type = Type;
Sym.DefinedIn = DefinedIn;
+ if (DefinedIn != nullptr)
+ DefinedIn->HasSymbol = true;
if (DefinedIn == nullptr) {
if (Shndx >= SHN_LORESERVE)
Sym.ShndxType = static_cast<SymbolShndxType>(Shndx);
@@ -163,16 +258,33 @@ void SymbolTableSection::addSymbol(StringRef Name, uint8_t Bind, uint8_t Type,
}
void SymbolTableSection::removeSectionReferences(const SectionBase *Sec) {
+ if (SectionIndexTable == Sec)
+ SectionIndexTable = nullptr;
if (SymbolNames == Sec) {
error("String table " + SymbolNames->Name +
" cannot be removed because it is referenced by the symbol table " +
this->Name);
}
- auto Iter =
- std::remove_if(std::begin(Symbols), std::end(Symbols),
- [=](const SymPtr &Sym) { return Sym->DefinedIn == Sec; });
- Size -= (std::end(Symbols) - Iter) * this->EntrySize;
- Symbols.erase(Iter, std::end(Symbols));
+ removeSymbols([Sec](const Symbol &Sym) { return Sym.DefinedIn == Sec; });
+}
+
+void SymbolTableSection::updateSymbols(function_ref<void(Symbol &)> Callable) {
+ std::for_each(std::begin(Symbols) + 1, std::end(Symbols),
+ [Callable](SymPtr &Sym) { Callable(*Sym); });
+ std::stable_partition(
+ std::begin(Symbols), std::end(Symbols),
+ [](const SymPtr &Sym) { return Sym->Binding == STB_LOCAL; });
+ assignIndices();
+}
+
+void SymbolTableSection::removeSymbols(
+ function_ref<bool(const Symbol &)> ToRemove) {
+ Symbols.erase(
+ std::remove_if(std::begin(Symbols) + 1, std::end(Symbols),
+ [ToRemove](const SymPtr &Sym) { return ToRemove(*Sym); }),
+ std::end(Symbols));
+ Size = Symbols.size() * EntrySize;
+ assignIndices();
}
void SymbolTableSection::initialize(SectionTableRef SecTable) {
@@ -200,7 +312,17 @@ void SymbolTableSection::finalize() {
Info = MaxLocalIndex + 1;
}
-void SymbolTableSection::addSymbolNames() {
+void SymbolTableSection::prepareForLayout() {
+ // Add all potential section indexes before file layout so that the section
+ // index section has the appropriate size.
+ if (SectionIndexTable != nullptr) {
+ for (const auto &Sym : Symbols) {
+ if (Sym->DefinedIn != nullptr && Sym->DefinedIn->Index >= SHN_LORESERVE)
+ SectionIndexTable->addIndex(Sym->DefinedIn->Index);
+ else
+ SectionIndexTable->addIndex(SHN_UNDEF);
+ }
+ }
// Add all of our strings to SymbolNames so that SymbolNames has the right
// size before layout is decided.
for (auto &Sym : Symbols)
@@ -213,13 +335,18 @@ const Symbol *SymbolTableSection::getSymbolByIndex(uint32_t Index) const {
return Symbols[Index].get();
}
+Symbol *SymbolTableSection::getSymbolByIndex(uint32_t Index) {
+ return const_cast<Symbol *>(
+ static_cast<const SymbolTableSection *>(this)->getSymbolByIndex(Index));
+}
+
template <class ELFT>
-void SymbolTableSectionImpl<ELFT>::writeSection(FileOutputBuffer &Out) const {
+void ELFSectionWriter<ELFT>::visit(const SymbolTableSection &Sec) {
uint8_t *Buf = Out.getBufferStart();
- Buf += Offset;
+ Buf += Sec.Offset;
typename ELFT::Sym *Sym = reinterpret_cast<typename ELFT::Sym *>(Buf);
// Loop though symbols setting each entry of the symbol table.
- for (auto &Symbol : Symbols) {
+ for (auto &Symbol : Sec.Symbols) {
Sym->st_name = Symbol->NameIndex;
Sym->st_value = Symbol->Value;
Sym->st_size = Symbol->Size;
@@ -231,13 +358,18 @@ void SymbolTableSectionImpl<ELFT>::writeSection(FileOutputBuffer &Out) const {
}
}
+void SymbolTableSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
+}
+
template <class SymTabType>
void RelocSectionWithSymtabBase<SymTabType>::removeSectionReferences(
const SectionBase *Sec) {
if (Symbols == Sec) {
- error("Symbol table " + Symbols->Name + " cannot be removed because it is "
- "referenced by the relocation "
- "section " +
+ error("Symbol table " + Symbols->Name +
+ " cannot be removed because it is "
+ "referenced by the relocation "
+ "section " +
this->Name);
}
}
@@ -252,9 +384,9 @@ void RelocSectionWithSymtabBase<SymTabType>::initialize(
" is not a symbol table"));
if (Info != SHN_UNDEF)
- setSection(SecTable.getSection(Info,
- "Info field value " + Twine(Info) +
- " in section " + Name + " is invalid"));
+ setSection(SecTable.getSection(Info, "Info field value " + Twine(Info) +
+ " in section " + Name +
+ " is invalid"));
else
setSection(nullptr);
}
@@ -267,16 +399,15 @@ void RelocSectionWithSymtabBase<SymTabType>::finalize() {
}
template <class ELFT>
-void setAddend(Elf_Rel_Impl<ELFT, false> &Rel, uint64_t Addend) {}
+static void setAddend(Elf_Rel_Impl<ELFT, false> &Rel, uint64_t Addend) {}
template <class ELFT>
-void setAddend(Elf_Rel_Impl<ELFT, true> &Rela, uint64_t Addend) {
+static void setAddend(Elf_Rel_Impl<ELFT, true> &Rela, uint64_t Addend) {
Rela.r_addend = Addend;
}
-template <class ELFT>
-template <class T>
-void RelocationSection<ELFT>::writeRel(T *Buf) const {
+template <class RelRange, class T>
+static void writeRel(const RelRange &Relocations, T *Buf) {
for (const auto &Reloc : Relocations) {
Buf->r_offset = Reloc.Offset;
setAddend(*Buf, Reloc.Addend);
@@ -286,43 +417,138 @@ void RelocationSection<ELFT>::writeRel(T *Buf) const {
}
template <class ELFT>
-void RelocationSection<ELFT>::writeSection(FileOutputBuffer &Out) const {
- uint8_t *Buf = Out.getBufferStart() + Offset;
- if (Type == SHT_REL)
- writeRel(reinterpret_cast<Elf_Rel *>(Buf));
+void ELFSectionWriter<ELFT>::visit(const RelocationSection &Sec) {
+ uint8_t *Buf = Out.getBufferStart() + Sec.Offset;
+ if (Sec.Type == SHT_REL)
+ writeRel(Sec.Relocations, reinterpret_cast<Elf_Rel *>(Buf));
else
- writeRel(reinterpret_cast<Elf_Rela *>(Buf));
+ writeRel(Sec.Relocations, reinterpret_cast<Elf_Rela *>(Buf));
}
-void DynamicRelocationSection::writeSection(FileOutputBuffer &Out) const {
- std::copy(std::begin(Contents), std::end(Contents),
- Out.getBufferStart() + Offset);
+void RelocationSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
}
-void SectionWithStrTab::removeSectionReferences(const SectionBase *Sec) {
- if (StrTab == Sec) {
- error("String table " + StrTab->Name + " cannot be removed because it is "
- "referenced by the section " +
+void RelocationSection::removeSymbols(
+ function_ref<bool(const Symbol &)> ToRemove) {
+ for (const Relocation &Reloc : Relocations)
+ if (ToRemove(*Reloc.RelocSymbol))
+ error("not stripping symbol `" + Reloc.RelocSymbol->Name +
+ "' because it is named in a relocation");
+}
+
+void RelocationSection::markSymbols() {
+ for (const Relocation &Reloc : Relocations)
+ Reloc.RelocSymbol->Referenced = true;
+}
+
+void SectionWriter::visit(const DynamicRelocationSection &Sec) {
+ std::copy(std::begin(Sec.Contents), std::end(Sec.Contents),
+ Out.getBufferStart() + Sec.Offset);
+}
+
+void DynamicRelocationSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
+}
+
+void Section::removeSectionReferences(const SectionBase *Sec) {
+ if (LinkSection == Sec) {
+ error("Section " + LinkSection->Name +
+ " cannot be removed because it is "
+ "referenced by the section " +
this->Name);
}
}
-bool SectionWithStrTab::classof(const SectionBase *S) {
- return isa<DynamicSymbolTableSection>(S) || isa<DynamicSection>(S);
+void GroupSection::finalize() {
+ this->Info = Sym->Index;
+ this->Link = SymTab->Index;
}
-void SectionWithStrTab::initialize(SectionTableRef SecTable) {
- auto StrTab = SecTable.getSection(Link,
- "Link field value " + Twine(Link) +
- " in section " + Name + " is invalid");
- if (StrTab->Type != SHT_STRTAB) {
- error("Link field value " + Twine(Link) + " in section " + Name +
- " is not a string table");
+void GroupSection::removeSymbols(function_ref<bool(const Symbol &)> ToRemove) {
+ if (ToRemove(*Sym)) {
+ error("Symbol " + Sym->Name +
+ " cannot be removed because it is "
+ "referenced by the section " +
+ this->Name + "[" + Twine(this->Index) + "]");
}
- setStrTab(StrTab);
}
-void SectionWithStrTab::finalize() { this->Link = StrTab->Index; }
+void GroupSection::markSymbols() {
+ if (Sym)
+ Sym->Referenced = true;
+}
+
+void Section::initialize(SectionTableRef SecTable) {
+ if (Link != ELF::SHN_UNDEF) {
+ LinkSection =
+ SecTable.getSection(Link, "Link field value " + Twine(Link) +
+ " in section " + Name + " is invalid");
+ if (LinkSection->Type == ELF::SHT_SYMTAB)
+ LinkSection = nullptr;
+ }
+}
+
+void Section::finalize() { this->Link = LinkSection ? LinkSection->Index : 0; }
+
+void GnuDebugLinkSection::init(StringRef File, StringRef Data) {
+ FileName = sys::path::filename(File);
+ // The format for the .gnu_debuglink starts with the file name and is
+ // followed by a null terminator and then the CRC32 of the file. The CRC32
+ // should be 4 byte aligned. So we add the FileName size, a 1 for the null
+ // byte, and then finally push the size to alignment and add 4.
+ Size = alignTo(FileName.size() + 1, 4) + 4;
+ // The CRC32 will only be aligned if we align the whole section.
+ Align = 4;
+ Type = ELF::SHT_PROGBITS;
+ Name = ".gnu_debuglink";
+ // For sections not found in segments, OriginalOffset is only used to
+ // establish the order that sections should go in. By using the maximum
+ // possible offset we cause this section to wind up at the end.
+ OriginalOffset = std::numeric_limits<uint64_t>::max();
+ JamCRC crc;
+ crc.update(ArrayRef<char>(Data.data(), Data.size()));
+ // The CRC32 value needs to be complemented because the JamCRC doesn't
+ // finalize the CRC32 value. It also doesn't negate the initial CRC32 value,
+ // but it starts by default at 0xFFFFFFFF, which is the complement of zero.
+ CRC32 = ~crc.getCRC();
+}
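A worked size computation for a hypothetical FileName of "app.debug" (9 bytes): alignTo(9 + 1, 4) = 12, plus 4 for the CRC, so Size = 16 and the CRC lands on a 4-byte boundary:

    bytes  0..8    "app.debug"
    bytes  9..11   NUL terminator and padding
    bytes 12..15   CRC32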
+
+GnuDebugLinkSection::GnuDebugLinkSection(StringRef File) : FileName(File) {
+ // Read in the file to compute the CRC of it.
+ auto DebugOrErr = MemoryBuffer::getFile(File);
+ if (!DebugOrErr)
+ error("'" + File + "': " + DebugOrErr.getError().message());
+ auto Debug = std::move(*DebugOrErr);
+ init(File, Debug->getBuffer());
+}
+
+template <class ELFT>
+void ELFSectionWriter<ELFT>::visit(const GnuDebugLinkSection &Sec) {
+ auto Buf = Out.getBufferStart() + Sec.Offset;
+ char *File = reinterpret_cast<char *>(Buf);
+ Elf_Word *CRC =
+ reinterpret_cast<Elf_Word *>(Buf + Sec.Size - sizeof(Elf_Word));
+ *CRC = Sec.CRC32;
+ std::copy(std::begin(Sec.FileName), std::end(Sec.FileName), File);
+}
+
+void GnuDebugLinkSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
+}
+
+template <class ELFT>
+void ELFSectionWriter<ELFT>::visit(const GroupSection &Sec) {
+ ELF::Elf32_Word *Buf =
+ reinterpret_cast<ELF::Elf32_Word *>(Out.getBufferStart() + Sec.Offset);
+ *Buf++ = Sec.FlagWord;
+ for (const auto *S : Sec.GroupMembers)
+ support::endian::write32<ELFT::TargetEndianness>(Buf++, S->Index);
+}
+
+void GroupSection::accept(SectionVisitor &Visitor) const {
+ Visitor.visit(*this);
+}
// Returns true IFF a section is wholly inside the range of a segment
static bool sectionWithinSegment(const SectionBase &Section,
@@ -345,7 +571,7 @@ static bool segmentOverlapsSegment(const Segment &Child,
Parent.OriginalOffset + Parent.FileSize > Child.OriginalOffset;
}
-static bool compareSegments(const Segment *A, const Segment *B) {
+static bool compareSegmentsByOffset(const Segment *A, const Segment *B) {
// Any segment without a parent segment should come before a segment
// that has a parent segment.
if (A->OriginalOffset < B->OriginalOffset)
@@ -355,14 +581,36 @@ static bool compareSegments(const Segment *A, const Segment *B) {
return A->Index < B->Index;
}
-template <class ELFT>
-void Object<ELFT>::readProgramHeaders(const ELFFile<ELFT> &ElfFile) {
+static bool compareSegmentsByPAddr(const Segment *A, const Segment *B) {
+ if (A->PAddr < B->PAddr)
+ return true;
+ if (A->PAddr > B->PAddr)
+ return false;
+ return A->Index < B->Index;
+}
+
+template <class ELFT> void ELFBuilder<ELFT>::setParentSegment(Segment &Child) {
+ for (auto &Parent : Obj.segments()) {
+ // Every segment will overlap with itself but we don't want a segment to
+ // be its own parent, so we avoid that situation.
+ if (&Child != &Parent && segmentOverlapsSegment(Child, Parent)) {
+ // We want a canonical "most parental" segment but this requires
+ // inspecting the ParentSegment.
+ if (compareSegmentsByOffset(&Parent, &Child))
+ if (Child.ParentSegment == nullptr ||
+ compareSegmentsByOffset(&Parent, Child.ParentSegment)) {
+ Child.ParentSegment = &Parent;
+ }
+ }
+ }
+}
+
+template <class ELFT> void ELFBuilder<ELFT>::readProgramHeaders() {
uint32_t Index = 0;
for (const auto &Phdr : unwrapOrError(ElfFile.program_headers())) {
ArrayRef<uint8_t> Data{ElfFile.base() + Phdr.p_offset,
(size_t)Phdr.p_filesz};
- Segments.emplace_back(llvm::make_unique<Segment>(Data));
- Segment &Seg = *Segments.back();
+ Segment &Seg = Obj.addSegment(Data);
Seg.Type = Phdr.p_type;
Seg.Flags = Phdr.p_flags;
Seg.OriginalOffset = Phdr.p_offset;
@@ -373,58 +621,124 @@ void Object<ELFT>::readProgramHeaders(const ELFFile<ELFT> &ElfFile) {
Seg.MemSize = Phdr.p_memsz;
Seg.Align = Phdr.p_align;
Seg.Index = Index++;
- for (auto &Section : Sections) {
- if (sectionWithinSegment(*Section, Seg)) {
- Seg.addSection(&*Section);
- if (!Section->ParentSegment ||
- Section->ParentSegment->Offset > Seg.Offset) {
- Section->ParentSegment = &Seg;
+ for (auto &Section : Obj.sections()) {
+ if (sectionWithinSegment(Section, Seg)) {
+ Seg.addSection(&Section);
+ if (!Section.ParentSegment ||
+ Section.ParentSegment->Offset > Seg.Offset) {
+ Section.ParentSegment = &Seg;
}
}
}
}
+
+ auto &ElfHdr = Obj.ElfHdrSegment;
+ // Creating multiple PT_PHDR segments technically is not valid, but PT_LOAD
+ // segments must not overlap, and other types fit even less.
+ ElfHdr.Type = PT_PHDR;
+ ElfHdr.Flags = 0;
+ ElfHdr.OriginalOffset = ElfHdr.Offset = 0;
+ ElfHdr.VAddr = 0;
+ ElfHdr.PAddr = 0;
+ ElfHdr.FileSize = ElfHdr.MemSize = sizeof(Elf_Ehdr);
+ ElfHdr.Align = 0;
+ ElfHdr.Index = Index++;
+
+ const auto &Ehdr = *ElfFile.getHeader();
+ auto &PrHdr = Obj.ProgramHdrSegment;
+ PrHdr.Type = PT_PHDR;
+ PrHdr.Flags = 0;
+ // The spec requires us to have p_vaddr % p_align == p_offset % p_align.
+ // Whereas this works automatically for ElfHdr, here OriginalOffset is
+ // always non-zero, so to satisfy the equation we assign the same value to
+ // VAddr as well.
+ PrHdr.OriginalOffset = PrHdr.Offset = PrHdr.VAddr = Ehdr.e_phoff;
+ PrHdr.PAddr = 0;
+ PrHdr.FileSize = PrHdr.MemSize = Ehdr.e_phentsize * Ehdr.e_phnum;
+ // The spec requires us to naturally align all the fields.
+ PrHdr.Align = sizeof(Elf_Addr);
+ PrHdr.Index = Index++;
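As a worked instance of the congruence (hypothetical 64-bit layout): e_phoff = 64 and Align = sizeof(Elf_Addr) = 8 give p_offset = p_vaddr = 64, so p_vaddr % p_align == p_offset % p_align; the relation in fact holds for any e_phoff precisely because the same value is assigned to both fields.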
+
// Now we do an O(n^2) loop through the segments in order to match up
// segments.
- for (auto &Child : Segments) {
- for (auto &Parent : Segments) {
- // Every segment will overlap with itself but we don't want a segment to
- // be it's own parent so we avoid that situation.
- if (&Child != &Parent && segmentOverlapsSegment(*Child, *Parent)) {
- // We want a canonical "most parental" segment but this requires
- // inspecting the ParentSegment.
- if (compareSegments(Parent.get(), Child.get()))
- if (Child->ParentSegment == nullptr ||
- compareSegments(Parent.get(), Child->ParentSegment)) {
- Child->ParentSegment = Parent.get();
- }
- }
- }
+ for (auto &Child : Obj.segments())
+ setParentSegment(Child);
+ setParentSegment(ElfHdr);
+ setParentSegment(PrHdr);
+}
+
+template <class ELFT>
+void ELFBuilder<ELFT>::initGroupSection(GroupSection *GroupSec) {
+ auto SecTable = Obj.sections();
+ auto SymTab = SecTable.template getSectionOfType<SymbolTableSection>(
+ GroupSec->Link,
+ "Link field value " + Twine(GroupSec->Link) + " in section " +
+ GroupSec->Name + " is invalid",
+ "Link field value " + Twine(GroupSec->Link) + " in section " +
+ GroupSec->Name + " is not a symbol table");
+ auto Sym = SymTab->getSymbolByIndex(GroupSec->Info);
+ if (!Sym)
+ error("Info field value " + Twine(GroupSec->Info) + " in section " +
+ GroupSec->Name + " is not a valid symbol index");
+ GroupSec->setSymTab(SymTab);
+ GroupSec->setSymbol(Sym);
+ if (GroupSec->Contents.size() % sizeof(ELF::Elf32_Word) ||
+ GroupSec->Contents.empty())
+ error("The content of the section " + GroupSec->Name + " is malformed");
+ const ELF::Elf32_Word *Word =
+ reinterpret_cast<const ELF::Elf32_Word *>(GroupSec->Contents.data());
+ const ELF::Elf32_Word *End =
+ Word + GroupSec->Contents.size() / sizeof(ELF::Elf32_Word);
+ GroupSec->setFlagWord(*Word++);
+ for (; Word != End; ++Word) {
+ uint32_t Index = support::endian::read32<ELFT::TargetEndianness>(Word);
+ GroupSec->addMember(SecTable.getSection(
+ Index, "Group member index " + Twine(Index) + " in section " +
+ GroupSec->Name + " is invalid"));
}
}
template <class ELFT>
-void Object<ELFT>::initSymbolTable(const object::ELFFile<ELFT> &ElfFile,
- SymbolTableSection *SymTab,
- SectionTableRef SecTable) {
+void ELFBuilder<ELFT>::initSymbolTable(SymbolTableSection *SymTab) {
const Elf_Shdr &Shdr = *unwrapOrError(ElfFile.getSection(SymTab->Index));
StringRef StrTabData = unwrapOrError(ElfFile.getStringTableForSymtab(Shdr));
+ ArrayRef<Elf_Word> ShndxData;
- for (const auto &Sym : unwrapOrError(ElfFile.symbols(&Shdr))) {
+ auto Symbols = unwrapOrError(ElfFile.symbols(&Shdr));
+ for (const auto &Sym : Symbols) {
SectionBase *DefSection = nullptr;
StringRef Name = unwrapOrError(Sym.getName(StrTabData));
- if (Sym.st_shndx >= SHN_LORESERVE) {
- if (!isValidReservedSectionIndex(Sym.st_shndx, Machine)) {
+ if (Sym.st_shndx == SHN_XINDEX) {
+ if (SymTab->getShndxTable() == nullptr)
+ error("Symbol '" + Name +
+ "' has index SHN_XINDEX but no SHT_SYMTAB_SHNDX section exists.");
+ if (ShndxData.data() == nullptr) {
+ const Elf_Shdr &ShndxSec =
+ *unwrapOrError(ElfFile.getSection(SymTab->getShndxTable()->Index));
+ ShndxData = unwrapOrError(
+ ElfFile.template getSectionContentsAsArray<Elf_Word>(&ShndxSec));
+ if (ShndxData.size() != Symbols.size())
+ error("Symbol section index table does not have the same number of "
+ "entries as the symbol table.");
+ }
+ Elf_Word Index = ShndxData[&Sym - Symbols.begin()];
+ DefSection = Obj.sections().getSection(
+ Index,
+ "Symbol '" + Name + "' has invalid section index " +
+ Twine(Index));
+ } else if (Sym.st_shndx >= SHN_LORESERVE) {
+ if (!isValidReservedSectionIndex(Sym.st_shndx, Obj.Machine)) {
error(
"Symbol '" + Name +
"' has unsupported value greater than or equal to SHN_LORESERVE: " +
Twine(Sym.st_shndx));
}
} else if (Sym.st_shndx != SHN_UNDEF) {
- DefSection = SecTable.getSection(
- Sym.st_shndx,
- "Symbol '" + Name + "' is defined in invalid section with index " +
- Twine(Sym.st_shndx));
+ DefSection = Obj.sections().getSection(
+ Sym.st_shndx, "Symbol '" + Name +
+ "' is defined has invalid section index " +
+ Twine(Sym.st_shndx));
}
SymTab->addSymbol(Name, Sym.getBinding(), Sym.getType(), DefSection,
@@ -440,9 +754,9 @@ static void getAddend(uint64_t &ToSet, const Elf_Rel_Impl<ELFT, true> &Rela) {
ToSet = Rela.r_addend;
}
-template <class ELFT, class T>
-void initRelocations(RelocationSection<ELFT> *Relocs,
- SymbolTableSection *SymbolTable, T RelRange) {
+template <class T>
+static void initRelocations(RelocationSection *Relocs,
+ SymbolTableSection *SymbolTable, T RelRange) {
for (const auto &Rel : RelRange) {
Relocation ToAdd;
ToAdd.Offset = Rel.r_offset;
@@ -453,14 +767,14 @@ void initRelocations(RelocationSection<ELFT> *Relocs,
}
}
-SectionBase *SectionTableRef::getSection(uint16_t Index, Twine ErrMsg) {
+SectionBase *SectionTableRef::getSection(uint32_t Index, Twine ErrMsg) {
if (Index == SHN_UNDEF || Index > Sections.size())
error(ErrMsg);
return Sections[Index - 1].get();
}
template <class T>
-T *SectionTableRef::getSectionOfType(uint16_t Index, Twine IndexErrMsg,
+T *SectionTableRef::getSectionOfType(uint32_t Index, Twine IndexErrMsg,
Twine TypeErrMsg) {
if (T *Sec = dyn_cast<T>(getSection(Index, IndexErrMsg)))
return Sec;
@@ -468,147 +782,221 @@ T *SectionTableRef::getSectionOfType(uint16_t Index, Twine IndexErrMsg,
}
template <class ELFT>
-std::unique_ptr<SectionBase>
-Object<ELFT>::makeSection(const object::ELFFile<ELFT> &ElfFile,
- const Elf_Shdr &Shdr) {
+SectionBase &ELFBuilder<ELFT>::makeSection(const Elf_Shdr &Shdr) {
ArrayRef<uint8_t> Data;
switch (Shdr.sh_type) {
case SHT_REL:
case SHT_RELA:
if (Shdr.sh_flags & SHF_ALLOC) {
Data = unwrapOrError(ElfFile.getSectionContents(&Shdr));
- return llvm::make_unique<DynamicRelocationSection>(Data);
+ return Obj.addSection<DynamicRelocationSection>(Data);
}
- return llvm::make_unique<RelocationSection<ELFT>>();
+ return Obj.addSection<RelocationSection>();
case SHT_STRTAB:
// If a string table is allocated we don't want to mess with it. That would
// mean altering the memory image. There are no special link types or
// anything so we can just use a Section.
if (Shdr.sh_flags & SHF_ALLOC) {
Data = unwrapOrError(ElfFile.getSectionContents(&Shdr));
- return llvm::make_unique<Section>(Data);
+ return Obj.addSection<Section>(Data);
}
- return llvm::make_unique<StringTableSection>();
+ return Obj.addSection<StringTableSection>();
case SHT_HASH:
case SHT_GNU_HASH:
// Hash tables should refer to SHT_DYNSYM which we're not going to change.
// Because of this we don't need to mess with the hash tables either.
Data = unwrapOrError(ElfFile.getSectionContents(&Shdr));
- return llvm::make_unique<Section>(Data);
+ return Obj.addSection<Section>(Data);
+ case SHT_GROUP:
+ Data = unwrapOrError(ElfFile.getSectionContents(&Shdr));
+ return Obj.addSection<GroupSection>(Data);
case SHT_DYNSYM:
Data = unwrapOrError(ElfFile.getSectionContents(&Shdr));
- return llvm::make_unique<DynamicSymbolTableSection>(Data);
+ return Obj.addSection<DynamicSymbolTableSection>(Data);
case SHT_DYNAMIC:
Data = unwrapOrError(ElfFile.getSectionContents(&Shdr));
- return llvm::make_unique<DynamicSection>(Data);
+ return Obj.addSection<DynamicSection>(Data);
case SHT_SYMTAB: {
- auto SymTab = llvm::make_unique<SymbolTableSectionImpl<ELFT>>();
- SymbolTable = SymTab.get();
- return std::move(SymTab);
+ auto &SymTab = Obj.addSection<SymbolTableSection>();
+ Obj.SymbolTable = &SymTab;
+ return SymTab;
+ }
+ case SHT_SYMTAB_SHNDX: {
+ auto &ShndxSection = Obj.addSection<SectionIndexSection>();
+ Obj.SectionIndexTable = &ShndxSection;
+ return ShndxSection;
}
case SHT_NOBITS:
- return llvm::make_unique<Section>(Data);
+ return Obj.addSection<Section>(Data);
default:
Data = unwrapOrError(ElfFile.getSectionContents(&Shdr));
- return llvm::make_unique<Section>(Data);
+ return Obj.addSection<Section>(Data);
}
}
-template <class ELFT>
-SectionTableRef Object<ELFT>::readSectionHeaders(const ELFFile<ELFT> &ElfFile) {
+template <class ELFT> void ELFBuilder<ELFT>::readSectionHeaders() {
uint32_t Index = 0;
for (const auto &Shdr : unwrapOrError(ElfFile.sections())) {
if (Index == 0) {
++Index;
continue;
}
- SecPtr Sec = makeSection(ElfFile, Shdr);
- Sec->Name = unwrapOrError(ElfFile.getSectionName(&Shdr));
- Sec->Type = Shdr.sh_type;
- Sec->Flags = Shdr.sh_flags;
- Sec->Addr = Shdr.sh_addr;
- Sec->Offset = Shdr.sh_offset;
- Sec->OriginalOffset = Shdr.sh_offset;
- Sec->Size = Shdr.sh_size;
- Sec->Link = Shdr.sh_link;
- Sec->Info = Shdr.sh_info;
- Sec->Align = Shdr.sh_addralign;
- Sec->EntrySize = Shdr.sh_entsize;
- Sec->Index = Index++;
- Sections.push_back(std::move(Sec));
- }
-
- SectionTableRef SecTable(Sections);
+ auto &Sec = makeSection(Shdr);
+ Sec.Name = unwrapOrError(ElfFile.getSectionName(&Shdr));
+ Sec.Type = Shdr.sh_type;
+ Sec.Flags = Shdr.sh_flags;
+ Sec.Addr = Shdr.sh_addr;
+ Sec.Offset = Shdr.sh_offset;
+ Sec.OriginalOffset = Shdr.sh_offset;
+ Sec.Size = Shdr.sh_size;
+ Sec.Link = Shdr.sh_link;
+ Sec.Info = Shdr.sh_info;
+ Sec.Align = Shdr.sh_addralign;
+ Sec.EntrySize = Shdr.sh_entsize;
+ Sec.Index = Index++;
+ }
+
+ // If a section index table exists we'll need to initialize it before we
+ // initialize the symbol table because the symbol table might need to
+ // reference it.
+ if (Obj.SectionIndexTable)
+ Obj.SectionIndexTable->initialize(Obj.sections());
// Now that all of the sections have been added we can fill out some extra
// details about symbol tables. We need the symbol table filled out before
// any relocations.
- if (SymbolTable) {
- SymbolTable->initialize(SecTable);
- initSymbolTable(ElfFile, SymbolTable, SecTable);
+ if (Obj.SymbolTable) {
+ Obj.SymbolTable->initialize(Obj.sections());
+ initSymbolTable(Obj.SymbolTable);
}
// Now that all sections and symbols have been added we can add
// relocations that reference symbols and set the link and info fields for
// relocation sections.
- for (auto &Section : Sections) {
- if (Section.get() == SymbolTable)
+ for (auto &Section : Obj.sections()) {
+ if (&Section == Obj.SymbolTable)
continue;
- Section->initialize(SecTable);
- if (auto RelSec = dyn_cast<RelocationSection<ELFT>>(Section.get())) {
+ Section.initialize(Obj.sections());
+ if (auto RelSec = dyn_cast<RelocationSection>(&Section)) {
auto Shdr = unwrapOrError(ElfFile.sections()).begin() + RelSec->Index;
if (RelSec->Type == SHT_REL)
- initRelocations(RelSec, SymbolTable, unwrapOrError(ElfFile.rels(Shdr)));
+ initRelocations(RelSec, Obj.SymbolTable,
+ unwrapOrError(ElfFile.rels(Shdr)));
else
- initRelocations(RelSec, SymbolTable,
+ initRelocations(RelSec, Obj.SymbolTable,
unwrapOrError(ElfFile.relas(Shdr)));
+ } else if (auto GroupSec = dyn_cast<GroupSection>(&Section)) {
+ initGroupSection(GroupSec);
}
}
-
- return SecTable;
}
-template <class ELFT> Object<ELFT>::Object(const ELFObjectFile<ELFT> &Obj) {
- const auto &ElfFile = *Obj.getELFFile();
+template <class ELFT> void ELFBuilder<ELFT>::build() {
const auto &Ehdr = *ElfFile.getHeader();
- std::copy(Ehdr.e_ident, Ehdr.e_ident + 16, Ident);
- Type = Ehdr.e_type;
- Machine = Ehdr.e_machine;
- Version = Ehdr.e_version;
- Entry = Ehdr.e_entry;
- Flags = Ehdr.e_flags;
+ std::copy(Ehdr.e_ident, Ehdr.e_ident + 16, Obj.Ident);
+ Obj.Type = Ehdr.e_type;
+ Obj.Machine = Ehdr.e_machine;
+ Obj.Version = Ehdr.e_version;
+ Obj.Entry = Ehdr.e_entry;
+ Obj.Flags = Ehdr.e_flags;
+
+ readSectionHeaders();
+ readProgramHeaders();
+
+ uint32_t ShstrIndex = Ehdr.e_shstrndx;
+ if (ShstrIndex == SHN_XINDEX)
+ ShstrIndex = unwrapOrError(ElfFile.getSection(0))->sh_link;
+
+ Obj.SectionNames =
+ Obj.sections().template getSectionOfType<StringTableSection>(
+ ShstrIndex,
+ "e_shstrndx field value " + Twine(Ehdr.e_shstrndx) +
+ " in elf header " + " is invalid",
+ "e_shstrndx field value " + Twine(Ehdr.e_shstrndx) +
+ " in elf header " + " is not a string table");
+}
+
+// A generic size function which computes sizes of any random access range.
+template <class R> size_t size(R &&Range) {
+ return static_cast<size_t>(std::end(Range) - std::begin(Range));
+}
+
+Writer::~Writer() {}
- SectionTableRef SecTable = readSectionHeaders(ElfFile);
- readProgramHeaders(ElfFile);
+Reader::~Reader() {}
- SectionNames = SecTable.getSectionOfType<StringTableSection>(
- Ehdr.e_shstrndx,
- "e_shstrndx field value " + Twine(Ehdr.e_shstrndx) + " in elf header " +
- " is invalid",
- "e_shstrndx field value " + Twine(Ehdr.e_shstrndx) + " in elf header " +
- " is not a string table");
+ElfType ELFReader::getElfType() const {
+ if (isa<ELFObjectFile<ELF32LE>>(Bin))
+ return ELFT_ELF32LE;
+ if (isa<ELFObjectFile<ELF64LE>>(Bin))
+ return ELFT_ELF64LE;
+ if (isa<ELFObjectFile<ELF32BE>>(Bin))
+ return ELFT_ELF32BE;
+ if (isa<ELFObjectFile<ELF64BE>>(Bin))
+ return ELFT_ELF64BE;
+ llvm_unreachable("Invalid ELFType");
}
-template <class ELFT>
-void Object<ELFT>::writeHeader(FileOutputBuffer &Out) const {
- uint8_t *Buf = Out.getBufferStart();
- Elf_Ehdr &Ehdr = *reinterpret_cast<Elf_Ehdr *>(Buf);
- std::copy(Ident, Ident + 16, Ehdr.e_ident);
- Ehdr.e_type = Type;
- Ehdr.e_machine = Machine;
- Ehdr.e_version = Version;
- Ehdr.e_entry = Entry;
- Ehdr.e_phoff = sizeof(Elf_Ehdr);
- Ehdr.e_flags = Flags;
+std::unique_ptr<Object> ELFReader::create() const {
+ auto Obj = llvm::make_unique<Object>();
+ if (auto *o = dyn_cast<ELFObjectFile<ELF32LE>>(Bin)) {
+ ELFBuilder<ELF32LE> Builder(*o, *Obj);
+ Builder.build();
+ return Obj;
+ } else if (auto *o = dyn_cast<ELFObjectFile<ELF64LE>>(Bin)) {
+ ELFBuilder<ELF64LE> Builder(*o, *Obj);
+ Builder.build();
+ return Obj;
+ } else if (auto *o = dyn_cast<ELFObjectFile<ELF32BE>>(Bin)) {
+ ELFBuilder<ELF32BE> Builder(*o, *Obj);
+ Builder.build();
+ return Obj;
+ } else if (auto *o = dyn_cast<ELFObjectFile<ELF64BE>>(Bin)) {
+ ELFBuilder<ELF64BE> Builder(*o, *Obj);
+ Builder.build();
+ return Obj;
+ }
+ error("Invalid file type");
+}
+
+template <class ELFT> void ELFWriter<ELFT>::writeEhdr() {
+ uint8_t *B = Buf.getBufferStart();
+ Elf_Ehdr &Ehdr = *reinterpret_cast<Elf_Ehdr *>(B);
+ std::copy(Obj.Ident, Obj.Ident + 16, Ehdr.e_ident);
+ Ehdr.e_type = Obj.Type;
+ Ehdr.e_machine = Obj.Machine;
+ Ehdr.e_version = Obj.Version;
+ Ehdr.e_entry = Obj.Entry;
+ Ehdr.e_phoff = Obj.ProgramHdrSegment.Offset;
+ Ehdr.e_flags = Obj.Flags;
Ehdr.e_ehsize = sizeof(Elf_Ehdr);
Ehdr.e_phentsize = sizeof(Elf_Phdr);
- Ehdr.e_phnum = Segments.size();
+ Ehdr.e_phnum = size(Obj.segments());
Ehdr.e_shentsize = sizeof(Elf_Shdr);
if (WriteSectionHeaders) {
- Ehdr.e_shoff = SHOffset;
- Ehdr.e_shnum = Sections.size() + 1;
- Ehdr.e_shstrndx = SectionNames->Index;
+ Ehdr.e_shoff = Obj.SHOffset;
+ // """
+ // If the number of sections is greater than or equal to
+ // SHN_LORESERVE (0xff00), this member has the value zero and the actual
+ // number of section header table entries is contained in the sh_size field
+ // of the section header at index 0.
+ // """
+ auto Shnum = size(Obj.sections()) + 1;
+ if (Shnum >= SHN_LORESERVE)
+ Ehdr.e_shnum = 0;
+ else
+ Ehdr.e_shnum = Shnum;
+ // """
+ // If the section name string table section index is greater than or equal
+ // to SHN_LORESERVE (0xff00), this member has the value SHN_XINDEX (0xffff)
+ // and the actual index of the section name string table section is
+ // contained in the sh_link field of the section header at index 0.
+ // """
+ if (Obj.SectionNames->Index >= SHN_LORESERVE)
+ Ehdr.e_shstrndx = SHN_XINDEX;
+ else
+ Ehdr.e_shstrndx = Obj.SectionNames->Index;
} else {
Ehdr.e_shoff = 0;
Ehdr.e_shnum = 0;
@@ -616,42 +1004,46 @@ void Object<ELFT>::writeHeader(FileOutputBuffer &Out) const {
}
}
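
An illustrative aside on the extended-numbering escape handled above (not part of the patch): when e_shnum is written as zero and e_shstrndx as SHN_XINDEX, a consumer has to recover the real values from section header 0. A minimal reader-side sketch, assuming a little-endian ELF64 image already in memory and only the standard <elf.h> definitions:

    #include <elf.h>
    #include <cstdint>

    struct SectionCounts { uint64_t Shnum; uint32_t Shstrndx; };

    // Recover the real section count and section-name string table index when
    // the e_shnum/e_shstrndx escape values described above are in effect.
    static SectionCounts readSectionCounts(const uint8_t *Base) {
      auto *Ehdr = reinterpret_cast<const Elf64_Ehdr *>(Base);
      auto *Shdr0 = reinterpret_cast<const Elf64_Shdr *>(Base + Ehdr->e_shoff);
      SectionCounts C;
      // e_shnum == 0 means the count spilled into sh_size of section header 0.
      C.Shnum = Ehdr->e_shnum != 0 ? Ehdr->e_shnum : Shdr0->sh_size;
      // e_shstrndx == SHN_XINDEX means the index spilled into sh_link.
      C.Shstrndx =
          Ehdr->e_shstrndx != SHN_XINDEX ? Ehdr->e_shstrndx : Shdr0->sh_link;
      return C;
    }

The writeEhdr and writeShdrs changes in this patch implement the producer side of the same rule.
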
-template <class ELFT>
-void Object<ELFT>::writeProgramHeaders(FileOutputBuffer &Out) const {
- for (auto &Phdr : Segments)
- Phdr->template writeHeader<ELFT>(Out);
+template <class ELFT> void ELFWriter<ELFT>::writePhdrs() {
+ for (auto &Seg : Obj.segments())
+ writePhdr(Seg);
}
-template <class ELFT>
-void Object<ELFT>::writeSectionHeaders(FileOutputBuffer &Out) const {
- uint8_t *Buf = Out.getBufferStart() + SHOffset;
+template <class ELFT> void ELFWriter<ELFT>::writeShdrs() {
+ uint8_t *B = Buf.getBufferStart() + Obj.SHOffset;
// This reference serves to write the dummy section header at the beginning
// of the file. It is not used for anything else.
- Elf_Shdr &Shdr = *reinterpret_cast<Elf_Shdr *>(Buf);
+ Elf_Shdr &Shdr = *reinterpret_cast<Elf_Shdr *>(B);
Shdr.sh_name = 0;
Shdr.sh_type = SHT_NULL;
Shdr.sh_flags = 0;
Shdr.sh_addr = 0;
Shdr.sh_offset = 0;
- Shdr.sh_size = 0;
- Shdr.sh_link = 0;
+ // See writeEhdr for why we do this.
+ uint64_t Shnum = size(Obj.sections()) + 1;
+ if (Shnum >= SHN_LORESERVE)
+ Shdr.sh_size = Shnum;
+ else
+ Shdr.sh_size = 0;
+ // See writeEhdr for why we do this.
+ if (Obj.SectionNames != nullptr && Obj.SectionNames->Index >= SHN_LORESERVE)
+ Shdr.sh_link = Obj.SectionNames->Index;
+ else
+ Shdr.sh_link = 0;
Shdr.sh_info = 0;
Shdr.sh_addralign = 0;
Shdr.sh_entsize = 0;
- for (auto &Section : Sections)
- Section->template writeHeader<ELFT>(Out);
+ for (auto &Sec : Obj.sections())
+ writeShdr(Sec);
}
-template <class ELFT>
-void Object<ELFT>::writeSectionData(FileOutputBuffer &Out) const {
- for (auto &Section : Sections)
- Section->writeSection(Out);
+template <class ELFT> void ELFWriter<ELFT>::writeSectionData() {
+ for (auto &Sec : Obj.sections())
+ Sec.accept(*SecWriter);
}
-template <class ELFT>
-void Object<ELFT>::removeSections(
- std::function<bool(const SectionBase &)> ToRemove) {
+void Object::removeSections(std::function<bool(const SectionBase &)> ToRemove) {
auto Iter = std::stable_partition(
std::begin(Sections), std::end(Sections), [=](const SecPtr &Sec) {
@@ -665,12 +1057,10 @@ void Object<ELFT>::removeSections(
});
if (SymbolTable != nullptr && ToRemove(*SymbolTable))
SymbolTable = nullptr;
- if (ToRemove(*SectionNames)) {
- if (WriteSectionHeaders)
- error("Cannot remove " + SectionNames->Name +
- " because it is the section header string table.");
+ if (SectionNames != nullptr && ToRemove(*SectionNames))
SectionNames = nullptr;
- }
+ if (SectionIndexTable != nullptr && ToRemove(*SectionIndexTable))
+ SectionIndexTable = nullptr;
// Now make sure there are no remaining references to the sections that will
// be removed. Sometimes it is impossible to remove a reference so we emit
// an error here instead.
@@ -684,14 +1074,15 @@ void Object<ELFT>::removeSections(
Sections.erase(Iter, std::end(Sections));
}
-template <class ELFT>
-void Object<ELFT>::addSection(StringRef SecName, ArrayRef<uint8_t> Data) {
- auto Sec = llvm::make_unique<OwnedDataSection>(SecName, Data);
- Sec->OriginalOffset = ~0ULL;
- Sections.push_back(std::move(Sec));
+void Object::removeSymbols(function_ref<bool(const Symbol &)> ToRemove) {
+ if (!SymbolTable)
+ return;
+
+ for (const SecPtr &Sec : Sections)
+ Sec->removeSymbols(ToRemove);
}
-template <class ELFT> void ELFObject<ELFT>::sortSections() {
+void Object::sortSections() {
// Put all sections in offset order. Maintain the ordering as closely as
// possible while meeting that demand, however.
auto CompareSections = [](const SecPtr &A, const SecPtr &B) {
@@ -716,7 +1107,8 @@ static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) {
// Orders segments such that if x = y->ParentSegment then y comes before x.
static void OrderSegments(std::vector<Segment *> &Segments) {
- std::stable_sort(std::begin(Segments), std::end(Segments), compareSegments);
+ std::stable_sort(std::begin(Segments), std::end(Segments),
+ compareSegmentsByOffset);
}
// This function finds a consistent layout for a list of segments starting from
@@ -725,7 +1117,7 @@ static void OrderSegments(std::vector<Segment *> &Segments) {
static uint64_t LayoutSegments(std::vector<Segment *> &Segments,
uint64_t Offset) {
assert(std::is_sorted(std::begin(Segments), std::end(Segments),
- compareSegments));
+ compareSegmentsByOffset));
// The only way a segment should move is if a section was between two
// segments and that section was removed. If that section isn't in a segment
// then it's acceptable, but not ideal, to simply move it to after the
@@ -755,8 +1147,8 @@ static uint64_t LayoutSegments(std::vector<Segment *> &Segments,
// does not have a ParentSegment. It returns either the offset given if all
// sections had a ParentSegment or an offset one past the last section if there
// was a section that didn't have a ParentSegment.
-template <class SecPtr>
-static uint64_t LayoutSections(std::vector<SecPtr> &Sections, uint64_t Offset) {
+template <class Range>
+static uint64_t LayoutSections(Range Sections, uint64_t Offset) {
// Now that the offset of every segment has been set we can assign the offsets
// of each section. For sections that are covered by a segment we should use
// the segment's original offset and the section's original offset to compute
@@ -765,106 +1157,154 @@ static uint64_t LayoutSections(std::vector<SecPtr> &Sections, uint64_t Offset) {
// covered by segments we can just bump Offset to the next valid location.
uint32_t Index = 1;
for (auto &Section : Sections) {
- Section->Index = Index++;
- if (Section->ParentSegment != nullptr) {
- auto Segment = Section->ParentSegment;
- Section->Offset =
- Segment->Offset + (Section->OriginalOffset - Segment->OriginalOffset);
+ Section.Index = Index++;
+ if (Section.ParentSegment != nullptr) {
+ auto Segment = *Section.ParentSegment;
+ Section.Offset =
+ Segment.Offset + (Section.OriginalOffset - Segment.OriginalOffset);
} else {
- Offset = alignTo(Offset, Section->Align == 0 ? 1 : Section->Align);
- Section->Offset = Offset;
- if (Section->Type != SHT_NOBITS)
- Offset += Section->Size;
+ Offset = alignTo(Offset, Section.Align == 0 ? 1 : Section.Align);
+ Section.Offset = Offset;
+ if (Section.Type != SHT_NOBITS)
+ Offset += Section.Size;
}
}
return Offset;
}
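
A quick worked example of the covered-section rule above, with made-up numbers: a section keeps its distance from the start of its parent segment, so if the segment moved from file offset 0x1000 to 0x800, a member section originally at 0x1040 lands at 0x840. A tiny standalone sketch:

    #include <cassert>
    #include <cstdint>

    // Offset rule for a section covered by a segment: preserve the section's
    // distance from the start of its parent segment.
    static uint64_t layoutCoveredSection(uint64_t SegOffset,
                                         uint64_t SegOriginalOffset,
                                         uint64_t SecOriginalOffset) {
      return SegOffset + (SecOriginalOffset - SegOriginalOffset);
    }

    int main() {
      // Segment moved from 0x1000 to 0x800; the section keeps its 0x40 delta.
      assert(layoutCoveredSection(0x800, 0x1000, 0x1040) == 0x840);
    }
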
-template <class ELFT> void ELFObject<ELFT>::assignOffsets() {
+template <class ELFT> void ELFWriter<ELFT>::assignOffsets() {
// We need a temporary list of segments that has a special order to it
// so that we know that anytime ->ParentSegment is set that segment has
// already had its offset properly set.
std::vector<Segment *> OrderedSegments;
- for (auto &Segment : this->Segments)
- OrderedSegments.push_back(Segment.get());
+ for (auto &Segment : Obj.segments())
+ OrderedSegments.push_back(&Segment);
+ OrderedSegments.push_back(&Obj.ElfHdrSegment);
+ OrderedSegments.push_back(&Obj.ProgramHdrSegment);
OrderSegments(OrderedSegments);
- // The size of ELF + program headers will not change so it is ok to assume
- // that the first offset of the first segment is a good place to start
- // outputting sections. This covers both the standard case and the PT_PHDR
- // case.
- uint64_t Offset;
- if (!OrderedSegments.empty()) {
- Offset = OrderedSegments[0]->Offset;
- } else {
- Offset = sizeof(Elf_Ehdr);
- }
+ // Offset is used as the start offset of the first segment to be laid out.
+ // Since the ELF Header (ElfHdrSegment) must be at the start of the file,
+ // we start at offset 0.
+ uint64_t Offset = 0;
Offset = LayoutSegments(OrderedSegments, Offset);
- Offset = LayoutSections(this->Sections, Offset);
+ Offset = LayoutSections(Obj.sections(), Offset);
// If we need to write the section header table out then we need to align the
// Offset so that SHOffset is valid.
- if (this->WriteSectionHeaders)
+ if (WriteSectionHeaders)
Offset = alignTo(Offset, sizeof(typename ELFT::Addr));
- this->SHOffset = Offset;
+ Obj.SHOffset = Offset;
}
-template <class ELFT> size_t ELFObject<ELFT>::totalSize() const {
+template <class ELFT> size_t ELFWriter<ELFT>::totalSize() const {
// We already have the section header offset so we can calculate the total
// size by just adding up the size of each section header.
- auto NullSectionSize = this->WriteSectionHeaders ? sizeof(Elf_Shdr) : 0;
- return this->SHOffset + this->Sections.size() * sizeof(Elf_Shdr) +
+ auto NullSectionSize = WriteSectionHeaders ? sizeof(Elf_Shdr) : 0;
+ return Obj.SHOffset + size(Obj.sections()) * sizeof(Elf_Shdr) +
NullSectionSize;
}
-template <class ELFT> void ELFObject<ELFT>::write(FileOutputBuffer &Out) const {
- this->writeHeader(Out);
- this->writeProgramHeaders(Out);
- this->writeSectionData(Out);
- if (this->WriteSectionHeaders)
- this->writeSectionHeaders(Out);
+template <class ELFT> void ELFWriter<ELFT>::write() {
+ writeEhdr();
+ writePhdrs();
+ writeSectionData();
+ if (WriteSectionHeaders)
+ writeShdrs();
+ if (auto E = Buf.commit())
+ reportError(Buf.getName(), errorToErrorCode(std::move(E)));
}
-template <class ELFT> void ELFObject<ELFT>::finalize() {
- // Make sure we add the names of all the sections.
- if (this->SectionNames != nullptr)
- for (const auto &Section : this->Sections) {
- this->SectionNames->addString(Section->Name);
+template <class ELFT> void ELFWriter<ELFT>::finalize() {
+ // It could happen that SectionNames has been removed and yet the user wants
+ // a section header table output. We need to throw an error if a user tries
+ // to do that.
+ if (Obj.SectionNames == nullptr && WriteSectionHeaders)
+ error("Cannot write section header table because section header string "
+ "table was removed.");
+
+ Obj.sortSections();
+
+ // We need to assign indexes before we perform layout because we need to know
+ // if we need large indexes or not. We can assign indexes first and check as
+ // we go to see if we will actually need large indexes.
+ bool NeedsLargeIndexes = false;
+ if (size(Obj.sections()) >= SHN_LORESERVE) {
+ auto Sections = Obj.sections();
+ NeedsLargeIndexes =
+ std::any_of(Sections.begin() + SHN_LORESERVE, Sections.end(),
+ [](const SectionBase &Sec) { return Sec.HasSymbol; });
+ // TODO: handle case where only one section needs the large index table but
+ // only needs it because the large index table hasn't been removed yet.
+ }
+
+ if (NeedsLargeIndexes) {
+ // This means we definitely need to have a section index table but if we
+ // already have one then we should use it instead of making a new one.
+ if (Obj.SymbolTable != nullptr && Obj.SectionIndexTable == nullptr) {
+ // Addition of a section to the end does not invalidate the indexes of
+ // other sections and assigns the correct index to the new section.
+ auto &Shndx = Obj.addSection<SectionIndexSection>();
+ Obj.SymbolTable->setShndxTable(&Shndx);
+ Shndx.setSymTab(Obj.SymbolTable);
+ }
+ } else {
+ // Since we don't need SectionIndexTable we should remove it and all
+ // references to it.
+ if (Obj.SectionIndexTable != nullptr) {
+ Obj.removeSections([this](const SectionBase &Sec) {
+ return &Sec == Obj.SectionIndexTable;
+ });
+ }
+ }
+
+ // Make sure we add the names of all the sections. Importantly this must be
+ // done after we decide to add or remove SectionIndexes.
+ if (Obj.SectionNames != nullptr)
+ for (const auto &Section : Obj.sections()) {
+ Obj.SectionNames->addString(Section.Name);
}
- // Make sure we add the names of all the symbols.
- if (this->SymbolTable != nullptr)
- this->SymbolTable->addSymbolNames();
- sortSections();
+ // Before we can prepare for layout the indexes need to be finalized.
+ uint64_t Index = 0;
+ for (auto &Sec : Obj.sections())
+ Sec.Index = Index++;
+
+ // The symbol table does not update all other sections on update. For
+ // instance, symbol names are not added as new symbols are added. This means
+ // that some sections, like .strtab, don't yet have their final size.
+ if (Obj.SymbolTable != nullptr)
+ Obj.SymbolTable->prepareForLayout();
+
assignOffsets();
// Finalize SectionNames first so that we can assign name indexes.
- if (this->SectionNames != nullptr)
- this->SectionNames->finalize();
+ if (Obj.SectionNames != nullptr)
+ Obj.SectionNames->finalize();
// Finally now that all offsets and indexes have been set we can finalize any
// remaining issues.
- uint64_t Offset = this->SHOffset + sizeof(Elf_Shdr);
- for (auto &Section : this->Sections) {
- Section->HeaderOffset = Offset;
+ uint64_t Offset = Obj.SHOffset + sizeof(Elf_Shdr);
+ for (auto &Section : Obj.sections()) {
+ Section.HeaderOffset = Offset;
Offset += sizeof(Elf_Shdr);
- if (this->WriteSectionHeaders)
- Section->NameIndex = this->SectionNames->findIndex(Section->Name);
- Section->finalize();
+ if (WriteSectionHeaders)
+ Section.NameIndex = Obj.SectionNames->findIndex(Section.Name);
+ Section.finalize();
}
-}
-template <class ELFT> size_t BinaryObject<ELFT>::totalSize() const {
- return TotalSize;
+ Buf.allocate(totalSize());
+ SecWriter = llvm::make_unique<ELFSectionWriter<ELFT>>(Buf);
}
-template <class ELFT>
-void BinaryObject<ELFT>::write(FileOutputBuffer &Out) const {
- for (auto &Section : this->Sections) {
- if ((Section->Flags & SHF_ALLOC) == 0)
+void BinaryWriter::write() {
+ for (auto &Section : Obj.sections()) {
+ if ((Section.Flags & SHF_ALLOC) == 0)
continue;
- Section->writeSection(Out);
+ Section.accept(*SecWriter);
}
+ if (auto E = Buf.commit())
+ reportError(Buf.getName(), errorToErrorCode(std::move(E)));
}
-template <class ELFT> void BinaryObject<ELFT>::finalize() {
+void BinaryWriter::finalize() {
// TODO: Create a filter range to construct OrderedSegments from so that this
// code can be deduped with assignOffsets above. This should also solve the
// todo below for LayoutSections.
@@ -873,13 +1313,25 @@ template <class ELFT> void BinaryObject<ELFT>::finalize() {
// already had its offset properly set. We only want to consider the segments
// that will affect layout of allocated sections so we only add those.
std::vector<Segment *> OrderedSegments;
- for (auto &Section : this->Sections) {
- if ((Section->Flags & SHF_ALLOC) != 0 &&
- Section->ParentSegment != nullptr) {
- OrderedSegments.push_back(Section->ParentSegment);
+ for (auto &Section : Obj.sections()) {
+ if ((Section.Flags & SHF_ALLOC) != 0 && Section.ParentSegment != nullptr) {
+ OrderedSegments.push_back(Section.ParentSegment);
}
}
- OrderSegments(OrderedSegments);
+
+ // For binary output, we're going to use physical addresses instead of
+ // virtual addresses, since a binary output is used for cases like ROM
+ // loading and physical addresses are intended for ROM loading.
+ // However, if no segment has a physical address, we'll fall back to using
+ // virtual addresses for all.
+ if (std::all_of(std::begin(OrderedSegments), std::end(OrderedSegments),
+ [](const Segment *Segment) { return Segment->PAddr == 0; }))
+ for (const auto &Segment : OrderedSegments)
+ Segment->PAddr = Segment->VAddr;
+
+ std::stable_sort(std::begin(OrderedSegments), std::end(OrderedSegments),
+ compareSegmentsByPAddr);
+
// Because we add a ParentSegment for each section we might have duplicate
// segments in OrderedSegments. If there were duplicates then LayoutSegments
// would do very strange things.
@@ -887,6 +1339,8 @@ template <class ELFT> void BinaryObject<ELFT>::finalize() {
std::unique(std::begin(OrderedSegments), std::end(OrderedSegments));
OrderedSegments.erase(End, std::end(OrderedSegments));
+ uint64_t Offset = 0;
+
// Modify the first segment so that there is no gap at the start. This allows
// our layout algorithm to proceed as expected while not writing out the
// gap at the start.
@@ -895,30 +1349,29 @@ template <class ELFT> void BinaryObject<ELFT>::finalize() {
auto Sec = Seg->firstSection();
auto Diff = Sec->OriginalOffset - Seg->OriginalOffset;
Seg->OriginalOffset += Diff;
- // The size needs to be shrunk as well
+ // The size needs to be shrunk as well.
Seg->FileSize -= Diff;
- Seg->MemSize -= Diff;
- // The VAddr needs to be adjusted so that the alignment is correct as well
- Seg->VAddr += Diff;
- Seg->PAddr = Seg->VAddr;
- // We don't want this to be shifted by alignment so we need to set the
- // alignment to zero.
- Seg->Align = 0;
+ // The PAddr needs to be increased to remove the gap before the first
+ // section.
+ Seg->PAddr += Diff;
+ uint64_t LowestPAddr = Seg->PAddr;
+ for (auto &Segment : OrderedSegments) {
+ Segment->Offset = Segment->PAddr - LowestPAddr;
+ Offset = std::max(Offset, Segment->Offset + Segment->FileSize);
+ }
}
- uint64_t Offset = LayoutSegments(OrderedSegments, 0);
-
// TODO: generalize LayoutSections to take a range. Pass a special range
// constructed from an iterator that skips values for which a predicate does
// not hold. Then pass such a range to LayoutSections instead of constructing
// AllocatedSections here.
std::vector<SectionBase *> AllocatedSections;
- for (auto &Section : this->Sections) {
- if ((Section->Flags & SHF_ALLOC) == 0)
+ for (auto &Section : Obj.sections()) {
+ if ((Section.Flags & SHF_ALLOC) == 0)
continue;
- AllocatedSections.push_back(Section.get());
+ AllocatedSections.push_back(&Section);
}
- LayoutSections(AllocatedSections, Offset);
+ LayoutSections(make_pointee_range(AllocatedSections), Offset);
// Now that every section has been laid out we just need to compute the total
// file size. This might not be the same as the offset returned by
@@ -929,23 +1382,22 @@ template <class ELFT> void BinaryObject<ELFT>::finalize() {
if (Section->Type != SHT_NOBITS)
TotalSize = std::max(TotalSize, Section->Offset + Section->Size);
}
+
+ Buf.allocate(TotalSize);
+ SecWriter = llvm::make_unique<BinarySectionWriter>(Buf);
}
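
A small worked example of the physical-address layout above, again with made-up numbers: segments at PAddr 0x8000 and 0x9000 get raw-binary offsets 0x0 and 0x1000 relative to the lowest PAddr, and the image size is the end of the furthest-placed segment. A simplified standalone sketch (it takes the minimum PAddr directly rather than relying on the sorted order used in the patch):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Seg { uint64_t PAddr; uint64_t FileSize; uint64_t Offset; };

    // Place each segment at (PAddr - LowestPAddr) and track the end of the
    // image, mirroring the BinaryWriter layout described above (simplified).
    static uint64_t layoutByPAddr(std::vector<Seg> &Segs) {
      uint64_t Lowest = UINT64_MAX;
      for (const Seg &S : Segs)
        Lowest = std::min(Lowest, S.PAddr);
      uint64_t End = 0;
      for (Seg &S : Segs) {
        S.Offset = S.PAddr - Lowest;
        End = std::max(End, S.Offset + S.FileSize);
      }
      return End; // e.g. {0x8000,0x100} and {0x9000,0x200} -> 0x1200
    }
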
namespace llvm {
-
-template class Object<ELF64LE>;
-template class Object<ELF64BE>;
-template class Object<ELF32LE>;
-template class Object<ELF32BE>;
-
-template class ELFObject<ELF64LE>;
-template class ELFObject<ELF64BE>;
-template class ELFObject<ELF32LE>;
-template class ELFObject<ELF32BE>;
-
-template class BinaryObject<ELF64LE>;
-template class BinaryObject<ELF64BE>;
-template class BinaryObject<ELF32LE>;
-template class BinaryObject<ELF32BE>;
-
+namespace objcopy {
+
+template class ELFBuilder<ELF64LE>;
+template class ELFBuilder<ELF64BE>;
+template class ELFBuilder<ELF32LE>;
+template class ELFBuilder<ELF32BE>;
+
+template class ELFWriter<ELF64LE>;
+template class ELFWriter<ELF64BE>;
+template class ELFWriter<ELF32LE>;
+template class ELFWriter<ELF32BE>;
+} // end namespace objcopy
} // end namespace llvm
diff --git a/contrib/llvm/tools/llvm-objcopy/Object.h b/contrib/llvm/tools/llvm-objcopy/Object.h
index 639f0f29ceba..76748d5fc641 100644
--- a/contrib/llvm/tools/llvm-objcopy/Object.h
+++ b/contrib/llvm/tools/llvm-objcopy/Object.h
@@ -16,6 +16,8 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/JamCRC.h"
#include <cstddef>
#include <cstdint>
#include <functional>
@@ -24,24 +26,209 @@
#include <vector>
namespace llvm {
+namespace objcopy {
-class FileOutputBuffer;
+class Buffer;
class SectionBase;
+class Section;
+class OwnedDataSection;
+class StringTableSection;
+class SymbolTableSection;
+class RelocationSection;
+class DynamicRelocationSection;
+class GnuDebugLinkSection;
+class GroupSection;
+class SectionIndexSection;
class Segment;
+class Object;
+struct Symbol;
class SectionTableRef {
-private:
- ArrayRef<std::unique_ptr<SectionBase>> Sections;
+ MutableArrayRef<std::unique_ptr<SectionBase>> Sections;
public:
- SectionTableRef(ArrayRef<std::unique_ptr<SectionBase>> Secs)
+ using iterator = pointee_iterator<std::unique_ptr<SectionBase> *>;
+
+ explicit SectionTableRef(MutableArrayRef<std::unique_ptr<SectionBase>> Secs)
: Sections(Secs) {}
SectionTableRef(const SectionTableRef &) = default;
- SectionBase *getSection(uint16_t Index, Twine ErrMsg);
+ iterator begin() { return iterator(Sections.data()); }
+ iterator end() { return iterator(Sections.data() + Sections.size()); }
+
+ SectionBase *getSection(uint32_t Index, Twine ErrMsg);
template <class T>
- T *getSectionOfType(uint16_t Index, Twine IndexErrMsg, Twine TypeErrMsg);
+ T *getSectionOfType(uint32_t Index, Twine IndexErrMsg, Twine TypeErrMsg);
+};
+
+enum ElfType { ELFT_ELF32LE, ELFT_ELF64LE, ELFT_ELF32BE, ELFT_ELF64BE };
+
+class SectionVisitor {
+public:
+ virtual ~SectionVisitor();
+
+ virtual void visit(const Section &Sec) = 0;
+ virtual void visit(const OwnedDataSection &Sec) = 0;
+ virtual void visit(const StringTableSection &Sec) = 0;
+ virtual void visit(const SymbolTableSection &Sec) = 0;
+ virtual void visit(const RelocationSection &Sec) = 0;
+ virtual void visit(const DynamicRelocationSection &Sec) = 0;
+ virtual void visit(const GnuDebugLinkSection &Sec) = 0;
+ virtual void visit(const GroupSection &Sec) = 0;
+ virtual void visit(const SectionIndexSection &Sec) = 0;
+};
+
+class SectionWriter : public SectionVisitor {
+protected:
+ Buffer &Out;
+
+public:
+ virtual ~SectionWriter(){};
+
+ void visit(const Section &Sec) override;
+ void visit(const OwnedDataSection &Sec) override;
+ void visit(const StringTableSection &Sec) override;
+ void visit(const DynamicRelocationSection &Sec) override;
+ virtual void visit(const SymbolTableSection &Sec) override = 0;
+ virtual void visit(const RelocationSection &Sec) override = 0;
+ virtual void visit(const GnuDebugLinkSection &Sec) override = 0;
+ virtual void visit(const GroupSection &Sec) override = 0;
+ virtual void visit(const SectionIndexSection &Sec) override = 0;
+
+ explicit SectionWriter(Buffer &Buf) : Out(Buf) {}
+};
+
+template <class ELFT> class ELFSectionWriter : public SectionWriter {
+private:
+ using Elf_Word = typename ELFT::Word;
+ using Elf_Rel = typename ELFT::Rel;
+ using Elf_Rela = typename ELFT::Rela;
+
+public:
+ virtual ~ELFSectionWriter() {}
+ void visit(const SymbolTableSection &Sec) override;
+ void visit(const RelocationSection &Sec) override;
+ void visit(const GnuDebugLinkSection &Sec) override;
+ void visit(const GroupSection &Sec) override;
+ void visit(const SectionIndexSection &Sec) override;
+
+ explicit ELFSectionWriter(Buffer &Buf) : SectionWriter(Buf) {}
+};
+
+#define MAKE_SEC_WRITER_FRIEND \
+ friend class SectionWriter; \
+ template <class ELFT> friend class ELFSectionWriter;
+
+class BinarySectionWriter : public SectionWriter {
+public:
+ virtual ~BinarySectionWriter() {}
+
+ void visit(const SymbolTableSection &Sec) override;
+ void visit(const RelocationSection &Sec) override;
+ void visit(const GnuDebugLinkSection &Sec) override;
+ void visit(const GroupSection &Sec) override;
+ void visit(const SectionIndexSection &Sec) override;
+
+ explicit BinarySectionWriter(Buffer &Buf) : SectionWriter(Buf) {}
+};
+
+// The class Buffer abstracts out the common interface of FileOutputBuffer and
+// WritableMemoryBuffer so that the hierarchy of Writers depends on this
+// abstract interface and doesn't depend on a particular implementation.
+// TODO: refactor the buffer classes in LLVM to enable us to use them here
+// directly.
+class Buffer {
+ StringRef Name;
+
+public:
+ virtual ~Buffer();
+ virtual void allocate(size_t Size) = 0;
+ virtual uint8_t *getBufferStart() = 0;
+ virtual Error commit() = 0;
+
+ explicit Buffer(StringRef Name) : Name(Name) {}
+ StringRef getName() const { return Name; }
+};
+
+class FileBuffer : public Buffer {
+ std::unique_ptr<FileOutputBuffer> Buf;
+
+public:
+ void allocate(size_t Size) override;
+ uint8_t *getBufferStart() override;
+ Error commit() override;
+
+ explicit FileBuffer(StringRef FileName) : Buffer(FileName) {}
+};
+
+class MemBuffer : public Buffer {
+ std::unique_ptr<WritableMemoryBuffer> Buf;
+
+public:
+ void allocate(size_t Size) override;
+ uint8_t *getBufferStart() override;
+ Error commit() override;
+
+ explicit MemBuffer(StringRef Name) : Buffer(Name) {}
+
+ std::unique_ptr<WritableMemoryBuffer> releaseMemoryBuffer();
+};
+
+class Writer {
+protected:
+ Object &Obj;
+ Buffer &Buf;
+
+public:
+ virtual ~Writer();
+ virtual void finalize() = 0;
+ virtual void write() = 0;
+
+ Writer(Object &O, Buffer &B) : Obj(O), Buf(B) {}
+};
+
+template <class ELFT> class ELFWriter : public Writer {
+private:
+ using Elf_Shdr = typename ELFT::Shdr;
+ using Elf_Phdr = typename ELFT::Phdr;
+ using Elf_Ehdr = typename ELFT::Ehdr;
+
+ void writeEhdr();
+ void writePhdr(const Segment &Seg);
+ void writeShdr(const SectionBase &Sec);
+
+ void writePhdrs();
+ void writeShdrs();
+ void writeSectionData();
+
+ void assignOffsets();
+
+ std::unique_ptr<ELFSectionWriter<ELFT>> SecWriter;
+
+ size_t totalSize() const;
+
+public:
+ virtual ~ELFWriter() {}
+ bool WriteSectionHeaders = true;
+
+ void finalize() override;
+ void write() override;
+ ELFWriter(Object &Obj, Buffer &Buf, bool WSH)
+ : Writer(Obj, Buf), WriteSectionHeaders(WSH) {}
+};
+
+class BinaryWriter : public Writer {
+private:
+ std::unique_ptr<BinarySectionWriter> SecWriter;
+
+ uint64_t TotalSize;
+
+public:
+ ~BinaryWriter() {}
+ void finalize() override;
+ void write() override;
+ BinaryWriter(Object &Obj, Buffer &Buf) : Writer(Obj, Buf) {}
};
class SectionBase {
@@ -49,8 +236,9 @@ public:
StringRef Name;
Segment *ParentSegment = nullptr;
uint64_t HeaderOffset;
- uint64_t OriginalOffset;
+ uint64_t OriginalOffset = std::numeric_limits<uint64_t>::max();
uint32_t Index;
+ bool HasSymbol = false;
uint64_t Addr = 0;
uint64_t Align = 1;
@@ -68,8 +256,9 @@ public:
virtual void initialize(SectionTableRef SecTable);
virtual void finalize();
virtual void removeSectionReferences(const SectionBase *Sec);
- template <class ELFT> void writeHeader(FileOutputBuffer &Out) const;
- virtual void writeSection(FileOutputBuffer &Out) const = 0;
+ virtual void removeSymbols(function_ref<bool(const Symbol &)> ToRemove);
+ virtual void accept(SectionVisitor &Visitor) const = 0;
+ virtual void markSymbols();
};
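
The SectionVisitor/accept pairing introduced above is plain double dispatch: each concrete section forwards to the visit overload for its own static type, so the writers never have to switch on section kinds. A toy standalone illustration with generic names (not the llvm-objcopy classes):

    #include <iostream>
    #include <memory>
    #include <vector>

    struct PlainSection;
    struct SymTabSection;

    struct Visitor {
      virtual ~Visitor() = default;
      virtual void visit(const PlainSection &) = 0;
      virtual void visit(const SymTabSection &) = 0;
    };

    struct Base {
      virtual ~Base() = default;
      // Each concrete section picks the matching visit() overload.
      virtual void accept(Visitor &V) const = 0;
    };

    struct PlainSection : Base {
      void accept(Visitor &V) const override { V.visit(*this); }
    };
    struct SymTabSection : Base {
      void accept(Visitor &V) const override { V.visit(*this); }
    };

    struct Printer : Visitor {
      void visit(const PlainSection &) override { std::cout << "section\n"; }
      void visit(const SymTabSection &) override { std::cout << "symtab\n"; }
    };

    int main() {
      std::vector<std::unique_ptr<Base>> Secs;
      Secs.push_back(std::make_unique<PlainSection>());
      Secs.push_back(std::make_unique<SymTabSection>());
      Printer P;
      for (const auto &S : Secs)
        S->accept(P); // prints "section" then "symtab"
    }

This is also why adding a new section kind such as SectionIndexSection in this patch touches every writer: each visitor gains a new visit overload.
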
class Segment {
@@ -102,7 +291,8 @@ public:
uint64_t OriginalOffset;
Segment *ParentSegment = nullptr;
- Segment(ArrayRef<uint8_t> Data) : Contents(Data) {}
+ explicit Segment(ArrayRef<uint8_t> Data) : Contents(Data) {}
+ Segment() {}
const SectionBase *firstSection() const {
if (!Sections.empty())
@@ -112,22 +302,26 @@ public:
void removeSection(const SectionBase *Sec) { Sections.erase(Sec); }
void addSection(const SectionBase *Sec) { Sections.insert(Sec); }
- template <class ELFT> void writeHeader(FileOutputBuffer &Out) const;
- void writeSegment(FileOutputBuffer &Out) const;
};
class Section : public SectionBase {
-private:
+ MAKE_SEC_WRITER_FRIEND
+
ArrayRef<uint8_t> Contents;
+ SectionBase *LinkSection = nullptr;
public:
- Section(ArrayRef<uint8_t> Data) : Contents(Data) {}
+ explicit Section(ArrayRef<uint8_t> Data) : Contents(Data) {}
- void writeSection(FileOutputBuffer &Out) const override;
+ void accept(SectionVisitor &Visitor) const override;
+ void removeSectionReferences(const SectionBase *Sec) override;
+ void initialize(SectionTableRef SecTable) override;
+ void finalize() override;
};
class OwnedDataSection : public SectionBase {
-private:
+ MAKE_SEC_WRITER_FRIEND
+
std::vector<uint8_t> Data;
public:
@@ -136,8 +330,10 @@ public:
Name = SecName;
Type = ELF::SHT_PROGBITS;
Size = Data.size();
+ OriginalOffset = std::numeric_limits<uint64_t>::max();
}
- void writeSection(FileOutputBuffer &Out) const override;
+
+ void accept(SectionVisitor &Sec) const override;
};
// There are two types of string tables that can exist, dynamic and not dynamic.
@@ -149,7 +345,8 @@ public:
// classof method checks that the particular instance is not allocated. This
// then agrees with the makeSection method used to construct most sections.
class StringTableSection : public SectionBase {
-private:
+ MAKE_SEC_WRITER_FRIEND
+
StringTableBuilder StrTabBuilder;
public:
@@ -160,7 +357,7 @@ public:
void addString(StringRef Name);
uint32_t findIndex(StringRef Name) const;
void finalize() override;
- void writeSection(FileOutputBuffer &Out) const override;
+ void accept(SectionVisitor &Visitor) const override;
static bool classof(const SectionBase *S) {
if (S->Flags & ELF::SHF_ALLOC)
@@ -181,6 +378,7 @@ enum SymbolShndxType {
SYMBOL_HEXAGON_SCOMMON_2 = ELF::SHN_HEXAGON_SCOMMON_2,
SYMBOL_HEXAGON_SCOMMON_4 = ELF::SHN_HEXAGON_SCOMMON_4,
SYMBOL_HEXAGON_SCOMMON_8 = ELF::SHN_HEXAGON_SCOMMON_8,
+ SYMBOL_XINDEX = ELF::SHN_XINDEX,
};
struct Symbol {
@@ -194,41 +392,79 @@ struct Symbol {
uint8_t Type;
uint64_t Value;
uint8_t Visibility;
+ bool Referenced = false;
uint16_t getShndx() const;
};
+class SectionIndexSection : public SectionBase {
+ MAKE_SEC_WRITER_FRIEND
+
+private:
+ std::vector<uint32_t> Indexes;
+ SymbolTableSection *Symbols = nullptr;
+
+public:
+ virtual ~SectionIndexSection() {}
+ void addIndex(uint32_t Index) {
+ Indexes.push_back(Index);
+ Size += 4;
+ }
+ void setSymTab(SymbolTableSection *SymTab) { Symbols = SymTab; }
+ void initialize(SectionTableRef SecTable) override;
+ void finalize() override;
+ void accept(SectionVisitor &Visitor) const override;
+
+ SectionIndexSection() {
+ Name = ".symtab_shndx";
+ Align = 4;
+ EntrySize = 4;
+ Type = ELF::SHT_SYMTAB_SHNDX;
+ }
+};
+
class SymbolTableSection : public SectionBase {
+ MAKE_SEC_WRITER_FRIEND
+
+ void setStrTab(StringTableSection *StrTab) { SymbolNames = StrTab; }
+ void assignIndices();
+
protected:
std::vector<std::unique_ptr<Symbol>> Symbols;
StringTableSection *SymbolNames = nullptr;
+ SectionIndexSection *SectionIndexTable = nullptr;
using SymPtr = std::unique_ptr<Symbol>;
public:
- void setStrTab(StringTableSection *StrTab) { SymbolNames = StrTab; }
void addSymbol(StringRef Name, uint8_t Bind, uint8_t Type,
SectionBase *DefinedIn, uint64_t Value, uint8_t Visibility,
uint16_t Shndx, uint64_t Sz);
- void addSymbolNames();
+ void prepareForLayout();
+ // An 'empty' symbol table still contains a null symbol.
+ bool empty() const { return Symbols.size() == 1; }
+ void setShndxTable(SectionIndexSection *ShndxTable) {
+ SectionIndexTable = ShndxTable;
+ }
+ const SectionIndexSection *getShndxTable() const { return SectionIndexTable; }
const SectionBase *getStrTab() const { return SymbolNames; }
const Symbol *getSymbolByIndex(uint32_t Index) const;
+ Symbol *getSymbolByIndex(uint32_t Index);
+ void updateSymbols(function_ref<void(Symbol &)> Callable);
+
void removeSectionReferences(const SectionBase *Sec) override;
void initialize(SectionTableRef SecTable) override;
void finalize() override;
+ void accept(SectionVisitor &Visitor) const override;
+ void removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
static bool classof(const SectionBase *S) {
return S->Type == ELF::SHT_SYMTAB;
}
};
-// Only writeSection depends on the ELF type so we implement it in a subclass.
-template <class ELFT> class SymbolTableSectionImpl : public SymbolTableSection {
- void writeSection(FileOutputBuffer &Out) const override;
-};
-
struct Relocation {
- const Symbol *RelocSymbol = nullptr;
+ Symbol *RelocSymbol = nullptr;
uint64_t Offset;
uint64_t Addend;
uint32_t Type;
@@ -260,33 +496,29 @@ public:
// that code between the two symbol table types.
template <class SymTabType>
class RelocSectionWithSymtabBase : public RelocationSectionBase {
-private:
SymTabType *Symbols = nullptr;
+ void setSymTab(SymTabType *SymTab) { Symbols = SymTab; }
protected:
RelocSectionWithSymtabBase() = default;
public:
- void setSymTab(SymTabType *StrTab) { Symbols = StrTab; }
void removeSectionReferences(const SectionBase *Sec) override;
void initialize(SectionTableRef SecTable) override;
void finalize() override;
};
-template <class ELFT>
class RelocationSection
: public RelocSectionWithSymtabBase<SymbolTableSection> {
-private:
- using Elf_Rel = typename ELFT::Rel;
- using Elf_Rela = typename ELFT::Rela;
+ MAKE_SEC_WRITER_FRIEND
std::vector<Relocation> Relocations;
- template <class T> void writeRel(T *Buf) const;
-
public:
void addRelocation(Relocation Rel) { Relocations.push_back(Rel); }
- void writeSection(FileOutputBuffer &Out) const override;
+ void accept(SectionVisitor &Visitor) const override;
+ void removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
+ void markSymbols() override;
static bool classof(const SectionBase *S) {
if (S->Flags & ELF::SHF_ALLOC)
@@ -295,32 +527,51 @@ public:
}
};
-class SectionWithStrTab : public Section {
-private:
- const SectionBase *StrTab = nullptr;
+// TODO: The way stripping and groups interact is complicated
+// and still needs to be worked on.
+
+class GroupSection : public SectionBase {
+ MAKE_SEC_WRITER_FRIEND
+ const SymbolTableSection *SymTab = nullptr;
+ Symbol *Sym = nullptr;
+ ELF::Elf32_Word FlagWord;
+ SmallVector<SectionBase *, 3> GroupMembers;
public:
- SectionWithStrTab(ArrayRef<uint8_t> Data) : Section(Data) {}
+ // TODO: Contents is present in several classes of the hierarchy.
+ // This needs to be refactored to avoid duplication.
+ ArrayRef<uint8_t> Contents;
- void setStrTab(const SectionBase *StringTable) { StrTab = StringTable; }
- void removeSectionReferences(const SectionBase *Sec) override;
- void initialize(SectionTableRef SecTable) override;
+ explicit GroupSection(ArrayRef<uint8_t> Data) : Contents(Data) {}
+
+ void setSymTab(const SymbolTableSection *SymTabSec) { SymTab = SymTabSec; }
+ void setSymbol(Symbol *S) { Sym = S; }
+ void setFlagWord(ELF::Elf32_Word W) { FlagWord = W; }
+ void addMember(SectionBase *Sec) { GroupMembers.push_back(Sec); }
+
+ void initialize(SectionTableRef SecTable) override{};
+ void accept(SectionVisitor &) const override;
void finalize() override;
- static bool classof(const SectionBase *S);
+ void removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
+ void markSymbols() override;
+
+ static bool classof(const SectionBase *S) {
+ return S->Type == ELF::SHT_GROUP;
+ }
};
-class DynamicSymbolTableSection : public SectionWithStrTab {
+class DynamicSymbolTableSection : public Section {
public:
- DynamicSymbolTableSection(ArrayRef<uint8_t> Data) : SectionWithStrTab(Data) {}
+ explicit DynamicSymbolTableSection(ArrayRef<uint8_t> Data) : Section(Data) {}
static bool classof(const SectionBase *S) {
return S->Type == ELF::SHT_DYNSYM;
}
};
-class DynamicSection : public SectionWithStrTab {
+class DynamicSection : public Section {
public:
- DynamicSection(ArrayRef<uint8_t> Data) : SectionWithStrTab(Data) {}
+ explicit DynamicSection(ArrayRef<uint8_t> Data) : Section(Data) {}
static bool classof(const SectionBase *S) {
return S->Type == ELF::SHT_DYNAMIC;
@@ -329,13 +580,15 @@ public:
class DynamicRelocationSection
: public RelocSectionWithSymtabBase<DynamicSymbolTableSection> {
+ MAKE_SEC_WRITER_FRIEND
+
private:
ArrayRef<uint8_t> Contents;
public:
- DynamicRelocationSection(ArrayRef<uint8_t> Data) : Contents(Data) {}
+ explicit DynamicRelocationSection(ArrayRef<uint8_t> Data) : Contents(Data) {}
- void writeSection(FileOutputBuffer &Out) const override;
+ void accept(SectionVisitor &) const override;
static bool classof(const SectionBase *S) {
if (!(S->Flags & ELF::SHF_ALLOC))
@@ -344,90 +597,125 @@ public:
}
};
-template <class ELFT> class Object {
-private:
- using SecPtr = std::unique_ptr<SectionBase>;
- using SegPtr = std::unique_ptr<Segment>;
-
- using Elf_Shdr = typename ELFT::Shdr;
- using Elf_Ehdr = typename ELFT::Ehdr;
- using Elf_Phdr = typename ELFT::Phdr;
-
- void initSymbolTable(const object::ELFFile<ELFT> &ElfFile,
- SymbolTableSection *SymTab, SectionTableRef SecTable);
- SecPtr makeSection(const object::ELFFile<ELFT> &ElfFile,
- const Elf_Shdr &Shdr);
- void readProgramHeaders(const object::ELFFile<ELFT> &ElfFile);
- SectionTableRef readSectionHeaders(const object::ELFFile<ELFT> &ElfFile);
+class GnuDebugLinkSection : public SectionBase {
+ MAKE_SEC_WRITER_FRIEND
-protected:
- StringTableSection *SectionNames = nullptr;
- SymbolTableSection *SymbolTable = nullptr;
- std::vector<SecPtr> Sections;
- std::vector<SegPtr> Segments;
+private:
+ StringRef FileName;
+ uint32_t CRC32;
- void writeHeader(FileOutputBuffer &Out) const;
- void writeProgramHeaders(FileOutputBuffer &Out) const;
- void writeSectionData(FileOutputBuffer &Out) const;
- void writeSectionHeaders(FileOutputBuffer &Out) const;
+ void init(StringRef File, StringRef Data);
public:
- uint8_t Ident[16];
- uint64_t Entry;
- uint64_t SHOffset;
- uint32_t Type;
- uint32_t Machine;
- uint32_t Version;
- uint32_t Flags;
- bool WriteSectionHeaders = true;
-
- Object(const object::ELFObjectFile<ELFT> &Obj);
- virtual ~Object() = default;
+ // If we add this section from an external source we can use this ctor.
+ explicit GnuDebugLinkSection(StringRef File);
+ void accept(SectionVisitor &Visitor) const override;
+};
- const SymbolTableSection *getSymTab() const { return SymbolTable; }
- const SectionBase *getSectionHeaderStrTab() const { return SectionNames; }
- void removeSections(std::function<bool(const SectionBase &)> ToRemove);
- void addSection(StringRef SecName, ArrayRef<uint8_t> Data);
- virtual size_t totalSize() const = 0;
- virtual void finalize() = 0;
- virtual void write(FileOutputBuffer &Out) const = 0;
+class Reader {
+public:
+ virtual ~Reader();
+ virtual std::unique_ptr<Object> create() const = 0;
};
-template <class ELFT> class ELFObject : public Object<ELFT> {
-private:
- using SecPtr = std::unique_ptr<SectionBase>;
- using SegPtr = std::unique_ptr<Segment>;
+using object::Binary;
+using object::ELFFile;
+using object::ELFObjectFile;
+using object::OwningBinary;
+template <class ELFT> class ELFBuilder {
+private:
+ using Elf_Addr = typename ELFT::Addr;
using Elf_Shdr = typename ELFT::Shdr;
using Elf_Ehdr = typename ELFT::Ehdr;
- using Elf_Phdr = typename ELFT::Phdr;
+ using Elf_Word = typename ELFT::Word;
- void sortSections();
- void assignOffsets();
+ const ELFFile<ELFT> &ElfFile;
+ Object &Obj;
+
+ void setParentSegment(Segment &Child);
+ void readProgramHeaders();
+ void initGroupSection(GroupSection *GroupSec);
+ void initSymbolTable(SymbolTableSection *SymTab);
+ void readSectionHeaders();
+ SectionBase &makeSection(const Elf_Shdr &Shdr);
public:
- ELFObject(const object::ELFObjectFile<ELFT> &Obj) : Object<ELFT>(Obj) {}
+ ELFBuilder(const ELFObjectFile<ELFT> &ElfObj, Object &Obj)
+ : ElfFile(*ElfObj.getELFFile()), Obj(Obj) {}
- void finalize() override;
- size_t totalSize() const override;
- void write(FileOutputBuffer &Out) const override;
+ void build();
};
-template <class ELFT> class BinaryObject : public Object<ELFT> {
+class ELFReader : public Reader {
+ Binary *Bin;
+
+public:
+ ElfType getElfType() const;
+ std::unique_ptr<Object> create() const override;
+ explicit ELFReader(Binary *B) : Bin(B){};
+};
+
+class Object {
private:
using SecPtr = std::unique_ptr<SectionBase>;
using SegPtr = std::unique_ptr<Segment>;
- uint64_t TotalSize;
+ std::vector<SecPtr> Sections;
+ std::vector<SegPtr> Segments;
public:
- BinaryObject(const object::ELFObjectFile<ELFT> &Obj) : Object<ELFT>(Obj) {}
+ template <class T>
+ using Range = iterator_range<
+ pointee_iterator<typename std::vector<std::unique_ptr<T>>::iterator>>;
- void finalize() override;
- size_t totalSize() const override;
- void write(FileOutputBuffer &Out) const override;
-};
+ template <class T>
+ using ConstRange = iterator_range<pointee_iterator<
+ typename std::vector<std::unique_ptr<T>>::const_iterator>>;
+
+ // It is often the case that the ELF header and the program header table are
+ // not present in any segment. This could be a problem during file layout,
+ // because other segments may get assigned an offset where either of the
+ // two should reside, which will effectively corrupt the resulting binary.
+ // Other than that we use these segments to track program header offsets
+ // when they may not follow the ELF header.
+ Segment ElfHdrSegment;
+ Segment ProgramHdrSegment;
+ uint8_t Ident[16];
+ uint64_t Entry;
+ uint64_t SHOffset;
+ uint32_t Type;
+ uint32_t Machine;
+ uint32_t Version;
+ uint32_t Flags;
+
+ StringTableSection *SectionNames = nullptr;
+ SymbolTableSection *SymbolTable = nullptr;
+ SectionIndexSection *SectionIndexTable = nullptr;
+
+ void sortSections();
+ SectionTableRef sections() { return SectionTableRef(Sections); }
+ ConstRange<SectionBase> sections() const {
+ return make_pointee_range(Sections);
+ }
+ Range<Segment> segments() { return make_pointee_range(Segments); }
+ ConstRange<Segment> segments() const { return make_pointee_range(Segments); }
+
+ void removeSections(std::function<bool(const SectionBase &)> ToRemove);
+ void removeSymbols(function_ref<bool(const Symbol &)> ToRemove);
+ template <class T, class... Ts> T &addSection(Ts &&... Args) {
+ auto Sec = llvm::make_unique<T>(std::forward<Ts>(Args)...);
+ auto Ptr = Sec.get();
+ Sections.emplace_back(std::move(Sec));
+ return *Ptr;
+ }
+ Segment &addSegment(ArrayRef<uint8_t> Data) {
+ Segments.emplace_back(llvm::make_unique<Segment>(Data));
+ return *Segments.back();
+ }
+};
+} // end namespace objcopy
} // end namespace llvm
#endif // LLVM_TOOLS_OBJCOPY_OBJECT_H
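
The addSection member of the Object class above is an add-and-return-reference factory: the Object keeps ownership in a vector of unique_ptr while the caller gets back a typed reference that stays valid as more sections are added. A generic standalone sketch of the pattern (made-up names, standard C++ only):

    #include <iostream>
    #include <memory>
    #include <string>
    #include <utility>
    #include <vector>

    struct Node {
      std::string Name;
      explicit Node(std::string N) : Name(std::move(N)) {}
      virtual ~Node() = default;
    };
    struct IndexNode : Node { using Node::Node; };

    class Container {
      std::vector<std::unique_ptr<Node>> Items;

    public:
      // Construct a T in place, keep ownership, hand back a typed reference.
      // Appending later never invalidates the reference: it points at the
      // heap object owned by the unique_ptr, not into the vector itself.
      template <class T, class... Ts> T &add(Ts &&... Args) {
        Items.emplace_back(std::make_unique<T>(std::forward<Ts>(Args)...));
        return static_cast<T &>(*Items.back());
      }
    };

    int main() {
      Container C;
      IndexNode &Idx = C.add<IndexNode>(".symtab_shndx");
      C.add<Node>(".text");
      std::cout << Idx.Name << "\n";
    }
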
diff --git a/contrib/llvm/tools/llvm-objcopy/StripOpts.td b/contrib/llvm/tools/llvm-objcopy/StripOpts.td
new file mode 100644
index 000000000000..333b0d288efa
--- /dev/null
+++ b/contrib/llvm/tools/llvm-objcopy/StripOpts.td
@@ -0,0 +1,49 @@
+include "llvm/Option/OptParser.td"
+
+multiclass Eq<string name> {
+ def NAME: Separate<["--", "-"], name>;
+ def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>;
+}
+
+def help : Flag<["-", "--"], "help">;
+
+defm output : Eq<"o">,
+ MetaVarName<"output">,
+ HelpText<"Write output to <file>">;
+
+def strip_all : Flag<["-", "--"], "strip-all">,
+ HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
+
+def strip_debug : Flag<["-", "--"], "strip-debug">,
+ HelpText<"Remove debugging symbols only">;
+
+def d : Flag<["-"], "d">,
+ Alias<strip_debug>;
+
+def g : Flag<["-"], "g">,
+ Alias<strip_debug>;
+
+def S : Flag<["-"], "S">,
+ Alias<strip_debug>;
+
+defm remove_section : Eq<"remove-section">,
+ MetaVarName<"section">,
+ HelpText<"Remove <section>">;
+
+def R : JoinedOrSeparate<["-"], "R">,
+ Alias<remove_section>;
+
+defm keep_symbol : Eq<"keep-symbol">,
+ MetaVarName<"symbol">,
+ HelpText<"Do not remove symbol <symbol>">;
+
+def K : JoinedOrSeparate<["-"], "K">,
+ Alias<keep_symbol>;
+
+def discard_all : Flag<["-", "--"], "discard-all">,
+ HelpText<"Remove all local symbols except file and section symbols">;
+def x : Flag<["-"], "x">,
+ Alias<discard_all>;
+
+def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
+ HelpText<"Remove all symbols not needed by relocations">;
diff --git a/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
index 20ce93bb40e8..4ccc67cc75db 100644
--- a/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -13,10 +13,15 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/ELFTypes.h"
#include "llvm/Object/Error.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
@@ -24,9 +29,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileOutputBuffer.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -39,13 +43,122 @@
#include <utility>
using namespace llvm;
+using namespace llvm::objcopy;
using namespace object;
using namespace ELF;
-// The name this program was invoked as.
-static StringRef ToolName;
+namespace {
+
+enum ObjcopyID {
+ OBJCOPY_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
+ HELPTEXT, METAVAR, VALUES) \
+ OBJCOPY_##ID,
+#include "ObjcopyOpts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE;
+#include "ObjcopyOpts.inc"
+#undef PREFIX
+
+static const opt::OptTable::Info ObjcopyInfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
+ HELPTEXT, METAVAR, VALUES) \
+ {OBJCOPY_##PREFIX, \
+ NAME, \
+ HELPTEXT, \
+ METAVAR, \
+ OBJCOPY_##ID, \
+ opt::Option::KIND##Class, \
+ PARAM, \
+ FLAGS, \
+ OBJCOPY_##GROUP, \
+ OBJCOPY_##ALIAS, \
+ ALIASARGS, \
+ VALUES},
+#include "ObjcopyOpts.inc"
+#undef OPTION
+};
+
+class ObjcopyOptTable : public opt::OptTable {
+public:
+ ObjcopyOptTable() : OptTable(ObjcopyInfoTable, true) {}
+};
+
+enum StripID {
+ STRIP_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
+ HELPTEXT, METAVAR, VALUES) \
+ STRIP_##ID,
+#include "StripOpts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE;
+#include "StripOpts.inc"
+#undef PREFIX
+
+static const opt::OptTable::Info StripInfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
+ HELPTEXT, METAVAR, VALUES) \
+ {STRIP_##PREFIX, NAME, HELPTEXT, \
+ METAVAR, STRIP_##ID, opt::Option::KIND##Class, \
+ PARAM, FLAGS, STRIP_##GROUP, \
+ STRIP_##ALIAS, ALIASARGS, VALUES},
+#include "StripOpts.inc"
+#undef OPTION
+};
+
+class StripOptTable : public opt::OptTable {
+public:
+ StripOptTable() : OptTable(StripInfoTable, true) {}
+};
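
The OPTION/PREFIX blocks above are the usual X-macro pattern around a generated .inc file: the same option list is expanded once per consumer to stamp out an enum, the prefix tables, and the Info array. A toy version of the mechanism, with a hand-written list standing in for the TableGen-generated include:

    #include <iostream>

    // Stand-in for the generated .inc: one X(...) entry per option.
    #define OPTIONS_LIST(X)                                                    \
      X(help, "print usage")                                                   \
      X(strip_all, "remove non-allocated sections")

    // First expansion: an enum with one ID per option.
    enum OptionID {
      OPT_INVALID = 0,
    #define X(NAME, HELP) OPT_##NAME,
      OPTIONS_LIST(X)
    #undef X
    };

    // Second expansion: a parallel table of help strings.
    static const char *const HelpText[] = {
        "",
    #define X(NAME, HELP) HELP,
        OPTIONS_LIST(X)
    #undef X
    };

    int main() { std::cout << HelpText[OPT_strip_all] << "\n"; }
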
+
+struct CopyConfig {
+ StringRef OutputFilename;
+ StringRef InputFilename;
+ StringRef OutputFormat;
+ StringRef InputFormat;
+ StringRef BinaryArch;
+
+ StringRef SplitDWO;
+ StringRef AddGnuDebugLink;
+ std::vector<StringRef> ToRemove;
+ std::vector<StringRef> Keep;
+ std::vector<StringRef> OnlyKeep;
+ std::vector<StringRef> AddSection;
+ std::vector<StringRef> SymbolsToLocalize;
+ std::vector<StringRef> SymbolsToGlobalize;
+ std::vector<StringRef> SymbolsToWeaken;
+ std::vector<StringRef> SymbolsToRemove;
+ std::vector<StringRef> SymbolsToKeep;
+ StringMap<StringRef> SectionsToRename;
+ StringMap<StringRef> SymbolsToRename;
+ bool StripAll = false;
+ bool StripAllGNU = false;
+ bool StripDebug = false;
+ bool StripSections = false;
+ bool StripNonAlloc = false;
+ bool StripDWO = false;
+ bool StripUnneeded = false;
+ bool ExtractDWO = false;
+ bool LocalizeHidden = false;
+ bool Weaken = false;
+ bool DiscardAll = false;
+ bool OnlyKeepDebug = false;
+ bool KeepFileSymbols = false;
+};
+
+using SectionPred = std::function<bool(const SectionBase &Sec)>;
+
+} // namespace
namespace llvm {
+namespace objcopy {
+
+// The name this program was invoked as.
+StringRef ToolName;
LLVM_ATTRIBUTE_NORETURN void error(Twine Message) {
errs() << ToolName << ": " << Message << ".\n";
@@ -69,95 +182,55 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) {
exit(1);
}
+} // end namespace objcopy
} // end namespace llvm
-static cl::opt<std::string> InputFilename(cl::Positional, cl::desc("<input>"));
-static cl::opt<std::string> OutputFilename(cl::Positional, cl::desc("<output>"),
- cl::init("-"));
-static cl::opt<std::string>
- OutputFormat("O", cl::desc("Set output format to one of the following:"
- "\n\tbinary"));
-static cl::list<std::string> ToRemove("remove-section",
- cl::desc("Remove <section>"),
- cl::value_desc("section"));
-static cl::alias ToRemoveA("R", cl::desc("Alias for remove-section"),
- cl::aliasopt(ToRemove));
-static cl::opt<bool> StripAll(
- "strip-all",
- cl::desc(
- "Removes non-allocated sections other than .gnu.warning* sections"));
-static cl::opt<bool>
- StripAllGNU("strip-all-gnu",
- cl::desc("Removes symbol, relocation, and debug information"));
-static cl::list<std::string> Keep("keep", cl::desc("Keep <section>"),
- cl::value_desc("section"));
-static cl::list<std::string> OnlyKeep("only-keep",
- cl::desc("Remove all but <section>"),
- cl::value_desc("section"));
-static cl::alias OnlyKeepA("j", cl::desc("Alias for only-keep"),
- cl::aliasopt(OnlyKeep));
-static cl::opt<bool> StripDebug("strip-debug",
- cl::desc("Removes all debug information"));
-static cl::opt<bool> StripSections("strip-sections",
- cl::desc("Remove all section headers"));
-static cl::opt<bool>
- StripNonAlloc("strip-non-alloc",
- cl::desc("Remove all non-allocated sections"));
-static cl::opt<bool>
- StripDWO("strip-dwo", cl::desc("Remove all DWARF .dwo sections from file"));
-static cl::opt<bool> ExtractDWO(
- "extract-dwo",
- cl::desc("Remove all sections that are not DWARF .dwo sections from file"));
-static cl::opt<std::string>
- SplitDWO("split-dwo",
- cl::desc("Equivalent to extract-dwo on the input file to "
- "<dwo-file>, then strip-dwo on the input file"),
- cl::value_desc("dwo-file"));
-static cl::list<std::string> AddSection(
- "add-section",
- cl::desc("Make a section named <section> with the contents of <file>."),
- cl::value_desc("section=file"));
-
-using SectionPred = std::function<bool(const SectionBase &Sec)>;
-
-bool IsDWOSection(const SectionBase &Sec) { return Sec.Name.endswith(".dwo"); }
+static bool IsDWOSection(const SectionBase &Sec) {
+ return Sec.Name.endswith(".dwo");
+}
-template <class ELFT>
-bool OnlyKeepDWOPred(const Object<ELFT> &Obj, const SectionBase &Sec) {
+static bool OnlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) {
// We can't remove the section header string table.
- if (&Sec == Obj.getSectionHeaderStrTab())
+ if (&Sec == Obj.SectionNames)
return false;
// Short of keeping the string table we want to keep everything that is a DWO
// section and remove everything else.
return !IsDWOSection(Sec);
}
-template <class ELFT>
-void WriteObjectFile(const Object<ELFT> &Obj, StringRef File) {
- std::unique_ptr<FileOutputBuffer> Buffer;
- Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
- FileOutputBuffer::create(File, Obj.totalSize(),
- FileOutputBuffer::F_executable);
- handleAllErrors(BufferOrErr.takeError(), [](const ErrorInfoBase &) {
- error("failed to open " + OutputFilename);
- });
- Buffer = std::move(*BufferOrErr);
-
- Obj.write(*Buffer);
- if (auto E = Buffer->commit())
- reportError(File, errorToErrorCode(std::move(E)));
+static std::unique_ptr<Writer> CreateWriter(const CopyConfig &Config,
+ Object &Obj, Buffer &Buf,
+ ElfType OutputElfType) {
+ if (Config.OutputFormat == "binary") {
+ return llvm::make_unique<BinaryWriter>(Obj, Buf);
+ }
+ // Depending on the initial ELFT and OutputFormat we need a different Writer.
+ switch (OutputElfType) {
+ case ELFT_ELF32LE:
+ return llvm::make_unique<ELFWriter<ELF32LE>>(Obj, Buf,
+ !Config.StripSections);
+ case ELFT_ELF64LE:
+ return llvm::make_unique<ELFWriter<ELF64LE>>(Obj, Buf,
+ !Config.StripSections);
+ case ELFT_ELF32BE:
+ return llvm::make_unique<ELFWriter<ELF32BE>>(Obj, Buf,
+ !Config.StripSections);
+ case ELFT_ELF64BE:
+ return llvm::make_unique<ELFWriter<ELF64BE>>(Obj, Buf,
+ !Config.StripSections);
+ }
+ llvm_unreachable("Invalid output format");
}
-template <class ELFT>
-void SplitDWOToFile(const ELFObjectFile<ELFT> &ObjFile, StringRef File) {
- // Construct a second output file for the DWO sections.
- ELFObject<ELFT> DWOFile(ObjFile);
-
- DWOFile.removeSections([&](const SectionBase &Sec) {
- return OnlyKeepDWOPred<ELFT>(DWOFile, Sec);
- });
- DWOFile.finalize();
- WriteObjectFile(DWOFile, File);
+static void SplitDWOToFile(const CopyConfig &Config, const Reader &Reader,
+ StringRef File, ElfType OutputElfType) {
+ auto DWOFile = Reader.create();
+ DWOFile->removeSections(
+ [&](const SectionBase &Sec) { return OnlyKeepDWOPred(*DWOFile, Sec); });
+ FileBuffer FB(File);
+ auto Writer = CreateWriter(Config, *DWOFile, FB, OutputElfType);
+ Writer->finalize();
+ Writer->write();
}
// This function handles the high level operations of GNU objcopy including
@@ -167,47 +240,104 @@ void SplitDWOToFile(const ELFObjectFile<ELFT> &ObjFile, StringRef File) {
// any previous removals. Lastly whether or not something is removed shouldn't
// depend a) on the order the options occur in or b) on some opaque priority
// system. The only priority is that keeps/copies overrule removes.
-template <class ELFT> void CopyBinary(const ELFObjectFile<ELFT> &ObjFile) {
- std::unique_ptr<Object<ELFT>> Obj;
+static void HandleArgs(const CopyConfig &Config, Object &Obj,
+ const Reader &Reader, ElfType OutputElfType) {
- if (!OutputFormat.empty() && OutputFormat != "binary")
- error("invalid output format '" + OutputFormat + "'");
- if (!OutputFormat.empty() && OutputFormat == "binary")
- Obj = llvm::make_unique<BinaryObject<ELFT>>(ObjFile);
- else
- Obj = llvm::make_unique<ELFObject<ELFT>>(ObjFile);
+ if (!Config.SplitDWO.empty()) {
+ SplitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType);
+ }
+
+ // TODO: update or remove symbols only if there is an option that affects
+ // them.
+ if (Obj.SymbolTable) {
+ Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
+ if ((Config.LocalizeHidden &&
+ (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
+ (!Config.SymbolsToLocalize.empty() &&
+ is_contained(Config.SymbolsToLocalize, Sym.Name)))
+ Sym.Binding = STB_LOCAL;
+
+ if (!Config.SymbolsToGlobalize.empty() &&
+ is_contained(Config.SymbolsToGlobalize, Sym.Name))
+ Sym.Binding = STB_GLOBAL;
+
+ if (!Config.SymbolsToWeaken.empty() &&
+ is_contained(Config.SymbolsToWeaken, Sym.Name) &&
+ Sym.Binding == STB_GLOBAL)
+ Sym.Binding = STB_WEAK;
+
+ if (Config.Weaken && Sym.Binding == STB_GLOBAL &&
+ Sym.getShndx() != SHN_UNDEF)
+ Sym.Binding = STB_WEAK;
+
+ const auto I = Config.SymbolsToRename.find(Sym.Name);
+ if (I != Config.SymbolsToRename.end())
+ Sym.Name = I->getValue();
+ });
+
+ // The purpose of this loop is to mark symbols referenced by sections
+ // (like GroupSection or RelocationSection). This way, we know which
+ // symbols are still 'needed' and which are not.
+ if (Config.StripUnneeded) {
+ for (auto &Section : Obj.sections())
+ Section.markSymbols();
+ }
+
+ Obj.removeSymbols([&](const Symbol &Sym) {
+ if ((!Config.SymbolsToKeep.empty() &&
+ is_contained(Config.SymbolsToKeep, Sym.Name)) ||
+ (Config.KeepFileSymbols && Sym.Type == STT_FILE))
+ return false;
+
+ if (Config.DiscardAll && Sym.Binding == STB_LOCAL &&
+ Sym.getShndx() != SHN_UNDEF && Sym.Type != STT_FILE &&
+ Sym.Type != STT_SECTION)
+ return true;
- if (!SplitDWO.empty())
- SplitDWOToFile<ELFT>(ObjFile, SplitDWO.getValue());
+ if (Config.StripAll || Config.StripAllGNU)
+ return true;
+
+ if (!Config.SymbolsToRemove.empty() &&
+ is_contained(Config.SymbolsToRemove, Sym.Name)) {
+ return true;
+ }
+
+ if (Config.StripUnneeded && !Sym.Referenced &&
+ (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) &&
+ Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
+ return true;
+
+ return false;
+ });
+ }
SectionPred RemovePred = [](const SectionBase &) { return false; };
// Removes:
-
- if (!ToRemove.empty()) {
- RemovePred = [&](const SectionBase &Sec) {
- return std::find(std::begin(ToRemove), std::end(ToRemove), Sec.Name) !=
- std::end(ToRemove);
+ if (!Config.ToRemove.empty()) {
+ RemovePred = [&Config](const SectionBase &Sec) {
+ return std::find(std::begin(Config.ToRemove), std::end(Config.ToRemove),
+ Sec.Name) != std::end(Config.ToRemove);
};
}
- if (StripDWO || !SplitDWO.empty())
+ if (Config.StripDWO || !Config.SplitDWO.empty())
RemovePred = [RemovePred](const SectionBase &Sec) {
return IsDWOSection(Sec) || RemovePred(Sec);
};
- if (ExtractDWO)
+ if (Config.ExtractDWO)
RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
- return OnlyKeepDWOPred(*Obj, Sec) || RemovePred(Sec);
+ return OnlyKeepDWOPred(Obj, Sec) || RemovePred(Sec);
};
- if (StripAllGNU)
+ if (Config.StripAllGNU)
RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
if (RemovePred(Sec))
return true;
if ((Sec.Flags & SHF_ALLOC) != 0)
return false;
- if (&Sec == Obj->getSectionHeaderStrTab())
+ if (&Sec == Obj.SectionNames)
return false;
switch (Sec.Type) {
case SHT_SYMTAB:
@@ -219,33 +349,32 @@ template <class ELFT> void CopyBinary(const ELFObjectFile<ELFT> &ObjFile) {
return Sec.Name.startswith(".debug");
};
- if (StripSections) {
+ if (Config.StripSections) {
RemovePred = [RemovePred](const SectionBase &Sec) {
return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0;
};
- Obj->WriteSectionHeaders = false;
}
- if (StripDebug) {
+ if (Config.StripDebug) {
RemovePred = [RemovePred](const SectionBase &Sec) {
return RemovePred(Sec) || Sec.Name.startswith(".debug");
};
}
- if (StripNonAlloc)
+ if (Config.StripNonAlloc)
RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
if (RemovePred(Sec))
return true;
- if (&Sec == Obj->getSectionHeaderStrTab())
+ if (&Sec == Obj.SectionNames)
return false;
return (Sec.Flags & SHF_ALLOC) == 0;
};
- if (StripAll)
+ if (Config.StripAll)
RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
if (RemovePred(Sec))
return true;
- if (&Sec == Obj->getSectionHeaderStrTab())
+ if (&Sec == Obj.SectionNames)
return false;
if (Sec.Name.startswith(".gnu.warning"))
return false;
@@ -253,47 +382,67 @@ template <class ELFT> void CopyBinary(const ELFObjectFile<ELFT> &ObjFile) {
};
// Explicit copies:
-
- if (!OnlyKeep.empty()) {
- RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+ if (!Config.OnlyKeep.empty()) {
+ RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) {
// Explicitly keep these sections regardless of previous removes.
- if (std::find(std::begin(OnlyKeep), std::end(OnlyKeep), Sec.Name) !=
- std::end(OnlyKeep))
+ if (std::find(std::begin(Config.OnlyKeep), std::end(Config.OnlyKeep),
+ Sec.Name) != std::end(Config.OnlyKeep))
return false;
// Allow all implicit removes.
- if (RemovePred(Sec)) {
+ if (RemovePred(Sec))
return true;
- }
// Keep special sections.
- if (Obj->getSectionHeaderStrTab() == &Sec) {
+ if (Obj.SectionNames == &Sec)
return false;
- }
- if (Obj->getSymTab() == &Sec || Obj->getSymTab()->getStrTab() == &Sec) {
+ if (Obj.SymbolTable == &Sec ||
+ (Obj.SymbolTable && Obj.SymbolTable->getStrTab() == &Sec))
return false;
- }
+
// Remove everything else.
return true;
};
}
- if (!Keep.empty()) {
- RemovePred = [RemovePred](const SectionBase &Sec) {
+ if (!Config.Keep.empty()) {
+ RemovePred = [Config, RemovePred](const SectionBase &Sec) {
// Explicitly keep these sections regardless of previous removes.
- if (std::find(std::begin(Keep), std::end(Keep), Sec.Name) !=
- std::end(Keep))
+ if (std::find(std::begin(Config.Keep), std::end(Config.Keep), Sec.Name) !=
+ std::end(Config.Keep))
return false;
// Otherwise defer to RemovePred.
return RemovePred(Sec);
};
}
- Obj->removeSections(RemovePred);
+ // This has to be the last predicate assignment. If the option
+ // --keep-symbol has been specified and at least one of those symbols is
+ // present (equivalently, the updated symbol table is not empty), the
+ // symbol table and the string table should not be removed.
+ if ((!Config.SymbolsToKeep.empty() || Config.KeepFileSymbols) &&
+ Obj.SymbolTable && !Obj.SymbolTable->empty()) {
+ RemovePred = [&Obj, RemovePred](const SectionBase &Sec) {
+ if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab())
+ return false;
+ return RemovePred(Sec);
+ };
+ }
- if (!AddSection.empty()) {
- for (const auto &Flag : AddSection) {
- auto SecPair = StringRef(Flag).split("=");
+ Obj.removeSections(RemovePred);
+
+ if (!Config.SectionsToRename.empty()) {
+ for (auto &Sec : Obj.sections()) {
+ const auto Iter = Config.SectionsToRename.find(Sec.Name);
+ if (Iter != Config.SectionsToRename.end())
+ Sec.Name = Iter->second;
+ }
+ }
+
+ if (!Config.AddSection.empty()) {
+ for (const auto &Flag : Config.AddSection) {
+ auto SecPair = Flag.split("=");
auto SecName = SecPair.first;
auto File = SecPair.second;
auto BufOrErr = MemoryBuffer::getFile(File);
@@ -302,44 +451,256 @@ template <class ELFT> void CopyBinary(const ELFObjectFile<ELFT> &ObjFile) {
auto Buf = std::move(*BufOrErr);
auto BufPtr = reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
auto BufSize = Buf->getBufferSize();
- Obj->addSection(SecName, ArrayRef<uint8_t>(BufPtr, BufSize));
+ Obj.addSection<OwnedDataSection>(SecName,
+ ArrayRef<uint8_t>(BufPtr, BufSize));
}
}
- Obj->finalize();
- WriteObjectFile(*Obj, OutputFilename.getValue());
+ if (!Config.AddGnuDebugLink.empty())
+ Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink);
}
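// A minimal sketch of the predicate-chaining pattern HandleArgs relies on
// (illustrative only; Section, StripDebug and Keep are simplified stand-ins
// for SectionBase and the CopyConfig fields). Each enabled option wraps the
// previous predicate, and keep-style predicates are layered last so they can
// veto any earlier removal, which is what keeps the options order-independent.
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

namespace sketch {
struct Section { std::string Name; };

std::function<bool(const Section &)>
buildRemovePred(bool StripDebug, std::vector<std::string> Keep) {
  // Start by removing nothing.
  std::function<bool(const Section &)> Pred = [](const Section &) {
    return false;
  };
  if (StripDebug)
    Pred = [Pred](const Section &Sec) {
      // Add .debug* sections on top of whatever is already removed.
      return Pred(Sec) || Sec.Name.rfind(".debug", 0) == 0;
    };
  if (!Keep.empty())
    Pred = [Pred, Keep](const Section &Sec) {
      // Explicit keeps overrule every earlier removal.
      if (std::find(Keep.begin(), Keep.end(), Sec.Name) != Keep.end())
        return false;
      return Pred(Sec);
    };
  return Pred;
}
} // namespace sketch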
-int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
- cl::ParseCommandLineOptions(argc, argv, "llvm objcopy utility\n");
- ToolName = argv[0];
- if (InputFilename.empty()) {
- cl::PrintHelpMessage();
- return 2;
+static void ExecuteElfObjcopyOnBinary(const CopyConfig &Config, Binary &Binary,
+ Buffer &Out) {
+ ELFReader Reader(&Binary);
+ std::unique_ptr<Object> Obj = Reader.create();
+
+ HandleArgs(Config, *Obj, Reader, Reader.getElfType());
+
+ std::unique_ptr<Writer> Writer =
+ CreateWriter(Config, *Obj, Out, Reader.getElfType());
+ Writer->finalize();
+ Writer->write();
+}
+
+// For regular archives this function simply calls llvm::writeArchive.
+// For thin archives it writes the archive file itself as well as its members.
+static Error deepWriteArchive(StringRef ArcName,
+ ArrayRef<NewArchiveMember> NewMembers,
+ bool WriteSymtab, object::Archive::Kind Kind,
+ bool Deterministic, bool Thin) {
+ Error E =
+ writeArchive(ArcName, NewMembers, WriteSymtab, Kind, Deterministic, Thin);
+ if (!Thin || E)
+ return E;
+ for (const NewArchiveMember &Member : NewMembers) {
+ // Internally, FileBuffer will use the buffer created by
+ // FileOutputBuffer::create. For regular files (which is the case for
+ // deepWriteArchive), FileOutputBuffer::create will return an OnDiskBuffer.
+ // OnDiskBuffer writes to a temporary file and then renames it into place,
+ // so in reality there is no inefficiency or duplicated in-memory buffer in
+ // this case. For now, in-memory buffers cannot be completely avoided since
+ // NewArchiveMember still requires them even though writeArchive does not
+ // write them to disk.
+ FileBuffer FB(Member.MemberName);
+ FB.allocate(Member.Buf->getBufferSize());
+ std::copy(Member.Buf->getBufferStart(), Member.Buf->getBufferEnd(),
+ FB.getBufferStart());
+ if (auto E = FB.commit())
+ return E;
+ }
+ return Error::success();
+}
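// A minimal sketch of the create -> fill -> commit pattern that FileBuffer
// wraps for each thin-archive member (hypothetical helper; assumes
// llvm/Support/FileOutputBuffer.h is available). FileOutputBuffer writes into
// a temporary file and renames it into place on commit(), so copying a member
// this way costs no extra on-disk work.
static Error writeMemberSketch(StringRef Path, ArrayRef<uint8_t> Data) {
  Expected<std::unique_ptr<FileOutputBuffer>> BufOrErr =
      FileOutputBuffer::create(Path, Data.size());
  if (!BufOrErr)
    return BufOrErr.takeError();
  std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufOrErr);
  std::copy(Data.begin(), Data.end(), Buf->getBufferStart());
  return Buf->commit();
}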
+
+static void ExecuteElfObjcopyOnArchive(const CopyConfig &Config,
+ const Archive &Ar) {
+ std::vector<NewArchiveMember> NewArchiveMembers;
+ Error Err = Error::success();
+ for (const Archive::Child &Child : Ar.children(Err)) {
+ Expected<std::unique_ptr<Binary>> ChildOrErr = Child.getAsBinary();
+ if (!ChildOrErr)
+ reportError(Ar.getFileName(), ChildOrErr.takeError());
+ Expected<StringRef> ChildNameOrErr = Child.getName();
+ if (!ChildNameOrErr)
+ reportError(Ar.getFileName(), ChildNameOrErr.takeError());
+
+ MemBuffer MB(ChildNameOrErr.get());
+ ExecuteElfObjcopyOnBinary(Config, **ChildOrErr, MB);
+
+ Expected<NewArchiveMember> Member =
+ NewArchiveMember::getOldMember(Child, true);
+ if (!Member)
+ reportError(Ar.getFileName(), Member.takeError());
+ Member->Buf = MB.releaseMemoryBuffer();
+ Member->MemberName = Member->Buf->getBufferIdentifier();
+ NewArchiveMembers.push_back(std::move(*Member));
}
- Expected<OwningBinary<Binary>> BinaryOrErr = createBinary(InputFilename);
+
+ if (Err)
+ reportError(Config.InputFilename, std::move(Err));
+ if (Error E =
+ deepWriteArchive(Config.OutputFilename, NewArchiveMembers,
+ Ar.hasSymbolTable(), Ar.kind(), true, Ar.isThin()))
+ reportError(Config.OutputFilename, std::move(E));
+}
+
+static void ExecuteElfObjcopy(const CopyConfig &Config) {
+ Expected<OwningBinary<llvm::object::Binary>> BinaryOrErr =
+ createBinary(Config.InputFilename);
if (!BinaryOrErr)
- reportError(InputFilename, BinaryOrErr.takeError());
- Binary &Binary = *BinaryOrErr.get().getBinary();
- if (auto *o = dyn_cast<ELFObjectFile<ELF64LE>>(&Binary)) {
- CopyBinary(*o);
- return 0;
+ reportError(Config.InputFilename, BinaryOrErr.takeError());
+
+ if (Archive *Ar = dyn_cast<Archive>(BinaryOrErr.get().getBinary()))
+ return ExecuteElfObjcopyOnArchive(Config, *Ar);
+
+ FileBuffer FB(Config.OutputFilename);
+ ExecuteElfObjcopyOnBinary(Config, *BinaryOrErr.get().getBinary(), FB);
+}
+
+// ParseObjcopyOptions returns the config and sets the input arguments. If a
+// help flag is set, then ParseObjcopyOptions will print the help message and
+// exit.
+static CopyConfig ParseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
+ ObjcopyOptTable T;
+ unsigned MissingArgumentIndex, MissingArgumentCount;
+ llvm::opt::InputArgList InputArgs =
+ T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+
+ if (InputArgs.size() == 0) {
+ T.PrintHelp(errs(), "llvm-objcopy <input> [ <output> ]", "objcopy tool");
+ exit(1);
+ }
+
+ if (InputArgs.hasArg(OBJCOPY_help)) {
+ T.PrintHelp(outs(), "llvm-objcopy <input> [ <output> ]", "objcopy tool");
+ exit(0);
+ }
+
+ SmallVector<const char *, 2> Positional;
+
+ for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN))
+ error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
+
+ for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT))
+ Positional.push_back(Arg->getValue());
+
+ if (Positional.empty())
+ error("No input file specified");
+
+ if (Positional.size() > 2)
+ error("Too many positional arguments");
+
+ CopyConfig Config;
+ Config.InputFilename = Positional[0];
+ Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1];
+ Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target);
+ Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target);
+ Config.BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture);
+
+ Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo);
+ Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
+
+ for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
+ if (!StringRef(Arg->getValue()).contains('='))
+ error("Bad format for --redefine-sym");
+ auto Old2New = StringRef(Arg->getValue()).split('=');
+ if (!Config.SymbolsToRename.insert(Old2New).second)
+ error("Multiple redefinition of symbol " + Old2New.first);
}
- if (auto *o = dyn_cast<ELFObjectFile<ELF32LE>>(&Binary)) {
- CopyBinary(*o);
- return 0;
+
+ for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) {
+ if (!StringRef(Arg->getValue()).contains('='))
+ error("Bad format for --rename-section");
+ auto Old2New = StringRef(Arg->getValue()).split('=');
+ if (!Config.SectionsToRename.insert(Old2New).second)
+ error("Already have a section rename for " + Old2New.first);
}
- if (auto *o = dyn_cast<ELFObjectFile<ELF64BE>>(&Binary)) {
- CopyBinary(*o);
- return 0;
+
+ for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section))
+ Config.ToRemove.push_back(Arg->getValue());
+ for (auto Arg : InputArgs.filtered(OBJCOPY_keep))
+ Config.Keep.push_back(Arg->getValue());
+ for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep))
+ Config.OnlyKeep.push_back(Arg->getValue());
+ for (auto Arg : InputArgs.filtered(OBJCOPY_add_section))
+ Config.AddSection.push_back(Arg->getValue());
+ Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all);
+ Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu);
+ Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug);
+ Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo);
+ Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections);
+ Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc);
+ Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded);
+ Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo);
+ Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden);
+ Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken);
+ Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all);
+ Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug);
+ Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols);
+ for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol))
+ Config.SymbolsToLocalize.push_back(Arg->getValue());
+ for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol))
+ Config.SymbolsToGlobalize.push_back(Arg->getValue());
+ for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol))
+ Config.SymbolsToWeaken.push_back(Arg->getValue());
+ for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol))
+ Config.SymbolsToRemove.push_back(Arg->getValue());
+ for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol))
+ Config.SymbolsToKeep.push_back(Arg->getValue());
+
+ return Config;
+}
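// Example invocation (illustrative) and the CopyConfig it produces:
//
//   llvm-objcopy --redefine-sym old=new --strip-debug in.o out.o
//
//   Config.InputFilename          = "in.o"
//   Config.OutputFilename         = "out.o"
//   Config.SymbolsToRename["old"] = "new"
//   Config.StripDebug             = true
//
// With a single positional argument the object is rewritten in place, since
// OutputFilename falls back to Positional[0].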
+
+// ParseStripOptions returns the config and sets the input arguments. If a
+// help flag is set, then ParseStripOptions will print the help message and
+// exit.
+static CopyConfig ParseStripOptions(ArrayRef<const char *> ArgsArr) {
+ StripOptTable T;
+ unsigned MissingArgumentIndex, MissingArgumentCount;
+ llvm::opt::InputArgList InputArgs =
+ T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+
+ if (InputArgs.size() == 0) {
+ T.PrintHelp(errs(), "llvm-strip <input> [ <output> ]", "strip tool");
+ exit(1);
}
- if (auto *o = dyn_cast<ELFObjectFile<ELF32BE>>(&Binary)) {
- CopyBinary(*o);
- return 0;
+
+ if (InputArgs.hasArg(STRIP_help)) {
+ T.PrintHelp(outs(), "llvm-strip <input> [ <output> ]", "strip tool");
+ exit(0);
}
- reportError(InputFilename, object_error::invalid_file_type);
+
+ SmallVector<const char *, 2> Positional;
+ for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN))
+ error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
+ for (auto Arg : InputArgs.filtered(STRIP_INPUT))
+ Positional.push_back(Arg->getValue());
+
+ if (Positional.empty())
+ error("No input file specified");
+
+ if (Positional.size() > 2)
+ error("Support for multiple input files is not implemented yet");
+
+ CopyConfig Config;
+ Config.InputFilename = Positional[0];
+ Config.OutputFilename =
+ InputArgs.getLastArgValue(STRIP_output, Positional[0]);
+
+ Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug);
+
+ Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all);
+ Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded);
+ Config.StripAll = InputArgs.hasArg(STRIP_strip_all);
+
+ if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll)
+ Config.StripAll = true;
+
+ for (auto Arg : InputArgs.filtered(STRIP_remove_section))
+ Config.ToRemove.push_back(Arg->getValue());
+
+ for (auto Arg : InputArgs.filtered(STRIP_keep_symbol))
+ Config.SymbolsToKeep.push_back(Arg->getValue());
+
+ return Config;
+}
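// Example (illustrative): "llvm-strip in.o" with no explicit flags implies
// --strip-all, whereas "llvm-strip --strip-debug in.o" leaves StripAll false
// and removes only the debug sections.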
+
+int main(int argc, char **argv) {
+ InitLLVM X(argc, argv);
+ ToolName = argv[0];
+ CopyConfig Config;
+ if (sys::path::stem(ToolName).endswith_lower("strip"))
+ Config = ParseStripOptions(makeArrayRef(argv + 1, argc));
+ else
+ Config = ParseObjcopyOptions(makeArrayRef(argv + 1, argc));
+ ExecuteElfObjcopy(Config);
}
diff --git a/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.h b/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.h
index 6732e410d8e0..e222b65dc78f 100644
--- a/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.h
+++ b/contrib/llvm/tools/llvm-objcopy/llvm-objcopy.h
@@ -17,8 +17,12 @@
#include <string>
namespace llvm {
+namespace objcopy {
LLVM_ATTRIBUTE_NORETURN extern void error(Twine Message);
+LLVM_ATTRIBUTE_NORETURN extern void reportError(StringRef File, Error E);
+LLVM_ATTRIBUTE_NORETURN extern void reportError(StringRef File,
+ std::error_code EC);
// This is taken from llvm-readobj.
// [see here](llvm/tools/llvm-readobj/llvm-readobj.h:38)
@@ -32,6 +36,7 @@ template <class T> T unwrapOrError(Expected<T> EO) {
error(Buf);
}
+} // end namespace objcopy
} // end namespace llvm
#endif // LLVM_TOOLS_OBJCOPY_OBJCOPY_H
diff --git a/contrib/llvm/tools/llvm-objdump/COFFDump.cpp b/contrib/llvm/tools/llvm-objdump/COFFDump.cpp
index 780d1e9e6111..7ca5d04593ff 100644
--- a/contrib/llvm/tools/llvm-objdump/COFFDump.cpp
+++ b/contrib/llvm/tools/llvm-objdump/COFFDump.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the COFF-specific dumper for llvm-objdump.
+/// This file implements the COFF-specific dumper for llvm-objdump.
/// It outputs the Win64 EH data structures as plain text.
/// The encoding of the unwind codes is described in MSDN:
/// http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx
@@ -453,7 +453,7 @@ static bool getPDataSection(const COFFObjectFile *Obj,
Rels.push_back(Reloc);
// Sort relocations by address.
- std::sort(Rels.begin(), Rels.end(), RelocAddressLess);
+ llvm::sort(Rels.begin(), Rels.end(), RelocAddressLess);
ArrayRef<uint8_t> Contents;
error(Obj->getSectionContents(Pdata, Contents));
diff --git a/contrib/llvm/tools/llvm-objdump/ELFDump.cpp b/contrib/llvm/tools/llvm-objdump/ELFDump.cpp
index 7f5fe5a9d3b8..f4d36656a6c4 100644
--- a/contrib/llvm/tools/llvm-objdump/ELFDump.cpp
+++ b/contrib/llvm/tools/llvm-objdump/ELFDump.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the ELF-specific dumper for llvm-objdump.
+/// This file implements the ELF-specific dumper for llvm-objdump.
///
//===----------------------------------------------------------------------===//
@@ -21,6 +21,77 @@
using namespace llvm;
using namespace llvm::object;
+template <class ELFT>
+Expected<StringRef> getDynamicStrTab(const ELFFile<ELFT> *Elf) {
+ typedef ELFFile<ELFT> ELFO;
+
+ auto DynamicEntriesOrError = Elf->dynamicEntries();
+ if (!DynamicEntriesOrError)
+ return DynamicEntriesOrError.takeError();
+
+ for (const typename ELFO::Elf_Dyn &Dyn : *DynamicEntriesOrError) {
+ if (Dyn.d_tag == ELF::DT_STRTAB) {
+ auto MappedAddrOrError = Elf->toMappedAddr(Dyn.getPtr());
+ if (!MappedAddrOrError)
+ return MappedAddrOrError.takeError();
+ return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError));
+ }
+ }
+
+ // If the dynamic segment is not present, we fall back on the sections.
+ auto SectionsOrError = Elf->sections();
+ if (!SectionsOrError)
+ return SectionsOrError.takeError();
+
+ for (const typename ELFO::Elf_Shdr &Sec : *SectionsOrError) {
+ if (Sec.sh_type == ELF::SHT_DYNSYM)
+ return Elf->getStringTableForSymtab(Sec);
+ }
+
+ return createError("dynamic string table not found");
+}
+
+template <class ELFT>
+void printDynamicSection(const ELFFile<ELFT> *Elf, StringRef Filename) {
+ auto ProgramHeaderOrError = Elf->program_headers();
+ if (!ProgramHeaderOrError)
+ report_error(Filename, ProgramHeaderOrError.takeError());
+
+ auto DynamicEntriesOrError = Elf->dynamicEntries();
+ if (!DynamicEntriesOrError)
+ report_error(Filename, DynamicEntriesOrError.takeError());
+
+ outs() << "Dynamic Section:\n";
+ for (const auto &Dyn : *DynamicEntriesOrError) {
+ if (Dyn.d_tag == ELF::DT_NULL)
+ continue;
+
+ StringRef Str = StringRef(Elf->getDynamicTagAsString(Dyn.d_tag));
+
+ if (Str.empty()) {
+ std::string HexStr = utohexstr(static_cast<uint64_t>(Dyn.d_tag), true);
+ outs() << format(" 0x%-19s", HexStr.c_str());
+ } else {
+ // We use "-21" in order to match GNU objdump's output.
+ outs() << format(" %-21s", Str.data());
+ }
+
+ const char *Fmt =
+ ELFT::Is64Bits ? "0x%016" PRIx64 "\n" : "0x%08" PRIx64 "\n";
+ if (Dyn.d_tag == ELF::DT_NEEDED) {
+ Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf);
+ if (StrTabOrErr) {
+ const char *Data = StrTabOrErr.get().data();
+ outs() << (Data + Dyn.d_un.d_val) << "\n";
+ continue;
+ }
+ warn(errorToErrorCode(StrTabOrErr.takeError()).message());
+ consumeError(StrTabOrErr.takeError());
+ }
+ outs() << format(Fmt, (uint64_t)Dyn.d_un.d_val);
+ }
+}
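// Example output (illustrative; values made up). DT_NULL entries are skipped,
// DT_NEEDED values are resolved against the dynamic string table, and every
// other tag is printed as a hex value:
//
//   Dynamic Section:
//     NEEDED               libc.so.6
//     INIT                 0x0000000000001000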
+
template <class ELFT> void printProgramHeaders(const ELFFile<ELFT> *o) {
typedef ELFFile<ELFT> ELFO;
outs() << "Program Header:\n";
@@ -103,3 +174,21 @@ void llvm::printELFFileHeader(const object::ObjectFile *Obj) {
if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
printProgramHeaders(ELFObj->getELFFile());
}
+
+void llvm::printELFDynamicSection(const object::ObjectFile *Obj) {
+ // Little-endian 32-bit
+ if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
+ printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
+
+ // Big-endian 32-bit
+ if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
+ printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
+
+ // Little-endian 64-bit
+ if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+ printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
+
+ // Big-endian 64-bit
+ if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
+ printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
+}
diff --git a/contrib/llvm/tools/llvm-objdump/MachODump.cpp b/contrib/llvm/tools/llvm-objdump/MachODump.cpp
index 9908c2f2d016..bdf80c73b999 100644
--- a/contrib/llvm/tools/llvm-objdump/MachODump.cpp
+++ b/contrib/llvm/tools/llvm-objdump/MachODump.cpp
@@ -76,11 +76,6 @@ cl::opt<bool> llvm::UniversalHeaders("universal-headers",
"(requires -macho)"));
cl::opt<bool>
- llvm::ArchiveHeaders("archive-headers",
- cl::desc("Print archive headers for Mach-O archives "
- "(requires -macho)"));
-
-cl::opt<bool>
ArchiveMemberOffsets("archive-member-offsets",
cl::desc("Print the offset to each archive member for "
"Mach-O archives (requires -macho and "
@@ -1284,14 +1279,35 @@ static void DumpLiteralPointerSection(MachOObjectFile *O,
}
}
-static void DumpInitTermPointerSection(MachOObjectFile *O, const char *sect,
+static void DumpInitTermPointerSection(MachOObjectFile *O,
+ const SectionRef &Section,
+ const char *sect,
uint32_t sect_size, uint64_t sect_addr,
SymbolAddressMap *AddrMap,
bool verbose) {
uint32_t stride;
stride = (O->is64Bit()) ? sizeof(uint64_t) : sizeof(uint32_t);
+
+ // Collect the external relocation symbols for the pointers.
+ std::vector<std::pair<uint64_t, SymbolRef>> Relocs;
+ for (const RelocationRef &Reloc : Section.relocations()) {
+ DataRefImpl Rel;
+ MachO::any_relocation_info RE;
+ bool isExtern = false;
+ Rel = Reloc.getRawDataRefImpl();
+ RE = O->getRelocation(Rel);
+ isExtern = O->getPlainRelocationExternal(RE);
+ if (isExtern) {
+ uint64_t RelocOffset = Reloc.getOffset();
+ symbol_iterator RelocSym = Reloc.getSymbol();
+ Relocs.push_back(std::make_pair(RelocOffset, *RelocSym));
+ }
+ }
+ array_pod_sort(Relocs.begin(), Relocs.end());
+
for (uint32_t i = 0; i < sect_size; i += stride) {
const char *SymbolName = nullptr;
+ uint64_t p;
if (O->is64Bit()) {
outs() << format("0x%016" PRIx64, sect_addr + i * stride) << " ";
uint64_t pointer_value;
@@ -1299,8 +1315,7 @@ static void DumpInitTermPointerSection(MachOObjectFile *O, const char *sect,
if (O->isLittleEndian() != sys::IsLittleEndianHost)
sys::swapByteOrder(pointer_value);
outs() << format("0x%016" PRIx64, pointer_value);
- if (verbose)
- SymbolName = GuessSymbolName(pointer_value, AddrMap);
+ p = pointer_value;
} else {
outs() << format("0x%08" PRIx64, sect_addr + i * stride) << " ";
uint32_t pointer_value;
@@ -1308,11 +1323,25 @@ static void DumpInitTermPointerSection(MachOObjectFile *O, const char *sect,
if (O->isLittleEndian() != sys::IsLittleEndianHost)
sys::swapByteOrder(pointer_value);
outs() << format("0x%08" PRIx32, pointer_value);
- if (verbose)
- SymbolName = GuessSymbolName(pointer_value, AddrMap);
+ p = pointer_value;
+ }
+ if (verbose) {
+ // First look for an external relocation entry for this pointer.
+ auto Reloc = find_if(Relocs, [&](const std::pair<uint64_t, SymbolRef> &P) {
+ return P.first == i;
+ });
+ if (Reloc != Relocs.end()) {
+ symbol_iterator RelocSym = Reloc->second;
+ Expected<StringRef> SymName = RelocSym->getName();
+ if (!SymName)
+ report_error(O->getFileName(), SymName.takeError());
+ outs() << " " << *SymName;
+ } else {
+ SymbolName = GuessSymbolName(p, AddrMap);
+ if (SymbolName)
+ outs() << " " << SymbolName;
+ }
}
- if (SymbolName)
- outs() << " " << SymbolName;
outs() << "\n";
}
}
@@ -1463,8 +1492,8 @@ static void DumpSectionContents(StringRef Filename, MachOObjectFile *O,
break;
case MachO::S_MOD_INIT_FUNC_POINTERS:
case MachO::S_MOD_TERM_FUNC_POINTERS:
- DumpInitTermPointerSection(O, sect, sect_size, sect_addr, &AddrMap,
- verbose);
+ DumpInitTermPointerSection(O, Section, sect, sect_size, sect_addr,
+ &AddrMap, verbose);
break;
default:
outs() << "Unknown section type ("
@@ -2149,19 +2178,22 @@ void llvm::ParseInputMachO(StringRef Filename) {
// The block of info used by the Symbolizer call backs.
struct DisassembleInfo {
+ DisassembleInfo(MachOObjectFile *O, SymbolAddressMap *AddrMap,
+ std::vector<SectionRef> *Sections, bool verbose)
+ : verbose(verbose), O(O), AddrMap(AddrMap), Sections(Sections) {}
bool verbose;
MachOObjectFile *O;
SectionRef S;
SymbolAddressMap *AddrMap;
std::vector<SectionRef> *Sections;
- const char *class_name;
- const char *selector_name;
- char *method;
- char *demangled_name;
- uint64_t adrp_addr;
- uint32_t adrp_inst;
+ const char *class_name = nullptr;
+ const char *selector_name = nullptr;
+ std::unique_ptr<char[]> method = nullptr;
+ char *demangled_name = nullptr;
+ uint64_t adrp_addr = 0;
+ uint32_t adrp_inst = 0;
std::unique_ptr<SymbolAddressMap> bindtable;
- uint32_t depth;
+ uint32_t depth = 0;
};
// SymbolizerGetOpInfo() is the operand information call back function.
@@ -2756,32 +2788,33 @@ static void method_reference(struct DisassembleInfo *info,
if (*ReferenceName != nullptr) {
if (strcmp(*ReferenceName, "_objc_msgSend") == 0) {
if (info->selector_name != nullptr) {
- if (info->method != nullptr)
- free(info->method);
if (info->class_name != nullptr) {
- info->method = (char *)malloc(5 + strlen(info->class_name) +
- strlen(info->selector_name));
- if (info->method != nullptr) {
- strcpy(info->method, "+[");
- strcat(info->method, info->class_name);
- strcat(info->method, " ");
- strcat(info->method, info->selector_name);
- strcat(info->method, "]");
- *ReferenceName = info->method;
+ info->method = llvm::make_unique<char[]>(
+ 5 + strlen(info->class_name) + strlen(info->selector_name));
+ char *method = info->method.get();
+ if (method != nullptr) {
+ strcpy(method, "+[");
+ strcat(method, info->class_name);
+ strcat(method, " ");
+ strcat(method, info->selector_name);
+ strcat(method, "]");
+ *ReferenceName = method;
*ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message;
}
} else {
- info->method = (char *)malloc(9 + strlen(info->selector_name));
- if (info->method != nullptr) {
+ info->method =
+ llvm::make_unique<char[]>(9 + strlen(info->selector_name));
+ char *method = info->method.get();
+ if (method != nullptr) {
if (Arch == Triple::x86_64)
- strcpy(info->method, "-[%rdi ");
+ strcpy(method, "-[%rdi ");
else if (Arch == Triple::aarch64)
- strcpy(info->method, "-[x0 ");
+ strcpy(method, "-[x0 ");
else
- strcpy(info->method, "-[r? ");
- strcat(info->method, info->selector_name);
- strcat(info->method, "]");
- *ReferenceName = info->method;
+ strcpy(method, "-[r? ");
+ strcat(method, info->selector_name);
+ strcat(method, "]");
+ *ReferenceName = method;
*ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message;
}
}
@@ -2789,19 +2822,19 @@ static void method_reference(struct DisassembleInfo *info,
}
} else if (strcmp(*ReferenceName, "_objc_msgSendSuper2") == 0) {
if (info->selector_name != nullptr) {
- if (info->method != nullptr)
- free(info->method);
- info->method = (char *)malloc(17 + strlen(info->selector_name));
- if (info->method != nullptr) {
+ info->method =
+ llvm::make_unique<char[]>(17 + strlen(info->selector_name));
+ char *method = info->method.get();
+ if (method != nullptr) {
if (Arch == Triple::x86_64)
- strcpy(info->method, "-[[%rdi super] ");
+ strcpy(method, "-[[%rdi super] ");
else if (Arch == Triple::aarch64)
- strcpy(info->method, "-[[x0 super] ");
+ strcpy(method, "-[[x0 super] ");
else
- strcpy(info->method, "-[[r? super] ");
- strcat(info->method, info->selector_name);
- strcat(info->method, "]");
- *ReferenceName = info->method;
+ strcpy(method, "-[[r? super] ");
+ strcat(method, info->selector_name);
+ strcat(method, "]");
+ *ReferenceName = method;
*ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message;
}
info->class_name = nullptr;
@@ -3196,6 +3229,8 @@ struct imageInfo_t {
/* masks for objc_image_info.flags */
#define OBJC_IMAGE_IS_REPLACEMENT (1 << 0)
#define OBJC_IMAGE_SUPPORTS_GC (1 << 1)
+#define OBJC_IMAGE_IS_SIMULATED (1 << 5)
+#define OBJC_IMAGE_HAS_CATEGORY_CLASS_PROPERTIES (1 << 6)
struct message_ref64 {
uint64_t imp; /* IMP (64-bit pointer) */
@@ -5557,12 +5592,24 @@ static void print_image_info64(SectionRef S, struct DisassembleInfo *info) {
outs() << " OBJC_IMAGE_IS_REPLACEMENT";
if (o.flags & OBJC_IMAGE_SUPPORTS_GC)
outs() << " OBJC_IMAGE_SUPPORTS_GC";
+ if (o.flags & OBJC_IMAGE_IS_SIMULATED)
+ outs() << " OBJC_IMAGE_IS_SIMULATED";
+ if (o.flags & OBJC_IMAGE_HAS_CATEGORY_CLASS_PROPERTIES)
+ outs() << " OBJC_IMAGE_HAS_CATEGORY_CLASS_PROPERTIES";
swift_version = (o.flags >> 8) & 0xff;
if (swift_version != 0) {
if (swift_version == 1)
outs() << " Swift 1.0";
else if (swift_version == 2)
outs() << " Swift 1.1";
+ else if (swift_version == 3)
+ outs() << " Swift 2.0";
+ else if (swift_version == 4)
+ outs() << " Swift 3.0";
+ else if (swift_version == 5)
+ outs() << " Swift 4.0";
+ else if (swift_version == 6)
+ outs() << " Swift 4.1";
else
outs() << " unknown future Swift version (" << swift_version << ")";
}
@@ -5606,6 +5653,14 @@ static void print_image_info32(SectionRef S, struct DisassembleInfo *info) {
outs() << " Swift 1.0";
else if (swift_version == 2)
outs() << " Swift 1.1";
+ else if (swift_version == 3)
+ outs() << " Swift 2.0";
+ else if (swift_version == 4)
+ outs() << " Swift 3.0";
+ else if (swift_version == 5)
+ outs() << " Swift 4.0";
+ else if (swift_version == 6)
+ outs() << " Swift 4.1";
else
outs() << " unknown future Swift version (" << swift_version << ")";
}
@@ -5659,21 +5714,8 @@ static void printObjc2_64bit_MetaData(MachOObjectFile *O, bool verbose) {
Sections.push_back(Section);
}
- struct DisassembleInfo info;
- // Set up the block of info used by the Symbolizer call backs.
- info.verbose = verbose;
- info.O = O;
- info.AddrMap = &AddrMap;
- info.Sections = &Sections;
- info.class_name = nullptr;
- info.selector_name = nullptr;
- info.method = nullptr;
- info.demangled_name = nullptr;
- info.bindtable = nullptr;
- info.adrp_addr = 0;
- info.adrp_inst = 0;
-
- info.depth = 0;
+ struct DisassembleInfo info(O, &AddrMap, &Sections, verbose);
+
SectionRef CL = get_section(O, "__OBJC2", "__class_list");
if (CL == SectionRef())
CL = get_section(O, "__DATA", "__objc_classlist");
@@ -5757,19 +5799,7 @@ static void printObjc2_32bit_MetaData(MachOObjectFile *O, bool verbose) {
Sections.push_back(Section);
}
- struct DisassembleInfo info;
- // Set up the block of info used by the Symbolizer call backs.
- info.verbose = verbose;
- info.O = O;
- info.AddrMap = &AddrMap;
- info.Sections = &Sections;
- info.class_name = nullptr;
- info.selector_name = nullptr;
- info.method = nullptr;
- info.demangled_name = nullptr;
- info.bindtable = nullptr;
- info.adrp_addr = 0;
- info.adrp_inst = 0;
+ struct DisassembleInfo info(O, &AddrMap, &Sections, verbose);
SectionRef CL = get_section(O, "__OBJC2", "__class_list");
if (CL == SectionRef())
@@ -5867,19 +5897,7 @@ static bool printObjc1_32bit_MetaData(MachOObjectFile *O, bool verbose) {
Sections.push_back(Section);
}
- struct DisassembleInfo info;
- // Set up the block of info used by the Symbolizer call backs.
- info.verbose = verbose;
- info.O = O;
- info.AddrMap = &AddrMap;
- info.Sections = &Sections;
- info.class_name = nullptr;
- info.selector_name = nullptr;
- info.method = nullptr;
- info.demangled_name = nullptr;
- info.bindtable = nullptr;
- info.adrp_addr = 0;
- info.adrp_inst = 0;
+ struct DisassembleInfo info(O, &AddrMap, &Sections, verbose);
for (i = 0; i < S.getSize(); i += sizeof(struct objc_module_t)) {
p = S.getAddress() + i;
@@ -6040,19 +6058,7 @@ static void DumpProtocolSection(MachOObjectFile *O, const char *sect,
Sections.push_back(Section);
}
- struct DisassembleInfo info;
- // Set up the block of info used by the Symbolizer call backs.
- info.verbose = true;
- info.O = O;
- info.AddrMap = &AddrMap;
- info.Sections = &Sections;
- info.class_name = nullptr;
- info.selector_name = nullptr;
- info.method = nullptr;
- info.demangled_name = nullptr;
- info.bindtable = nullptr;
- info.adrp_addr = 0;
- info.adrp_inst = 0;
+ struct DisassembleInfo info(O, &AddrMap, &Sections, true);
const char *p;
struct objc_protocol_t protocol;
@@ -6748,7 +6754,7 @@ static const char *SymbolizerSymbolLookUp(void *DisInfo,
return SymbolName;
}
-/// \brief Emits the comments that are stored in the CommentStream.
+/// Emits the comments that are stored in the CommentStream.
/// Each comment in the CommentStream must end with a newline.
static void emitComments(raw_svector_ostream &CommentStream,
SmallString<128> &CommentsToEmit,
@@ -6817,7 +6823,7 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
std::unique_ptr<MCDisassembler> DisAsm(
TheTarget->createMCDisassembler(*STI, Ctx));
std::unique_ptr<MCSymbolizer> Symbolizer;
- struct DisassembleInfo SymbolizerInfo;
+ struct DisassembleInfo SymbolizerInfo(nullptr, nullptr, nullptr, false);
std::unique_ptr<MCRelocationInfo> RelInfo(
TheTarget->createMCRelocationInfo(TripleName, Ctx));
if (RelInfo) {
@@ -6855,7 +6861,7 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
std::unique_ptr<MCInstPrinter> ThumbIP;
std::unique_ptr<MCContext> ThumbCtx;
std::unique_ptr<MCSymbolizer> ThumbSymbolizer;
- struct DisassembleInfo ThumbSymbolizerInfo;
+ struct DisassembleInfo ThumbSymbolizerInfo(nullptr, nullptr, nullptr, false);
std::unique_ptr<MCRelocationInfo> ThumbRelInfo;
if (ThumbTarget) {
ThumbMRI.reset(ThumbTarget->createMCRegInfo(ThumbTripleName));
@@ -6904,7 +6910,7 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
BaseSegmentAddress);
// Sort the symbols by address, just in case they didn't come in that way.
- std::sort(Symbols.begin(), Symbols.end(), SymbolSorter());
+ llvm::sort(Symbols.begin(), Symbols.end(), SymbolSorter());
// Build a data in code table that is sorted on by the address of each entry.
uint64_t BaseAddress = 0;
@@ -6940,10 +6946,12 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
errs() << "llvm-objdump: " << Filename << ": " << EC.message() << '\n';
return;
}
- DbgObj =
- ObjectFile::createMachOObjectFile(BufOrErr.get()->getMemBufferRef())
- .get()
- .release();
+ Expected<std::unique_ptr<MachOObjectFile>> DbgObjCheck =
+ ObjectFile::createMachOObjectFile(BufOrErr.get()->getMemBufferRef());
+
+ if (DbgObjCheck.takeError())
+ report_error(MachOOF->getFileName(), DbgObjCheck.takeError());
+ DbgObj = DbgObjCheck.get().release();
}
// Setup the DIContext
@@ -7003,26 +7011,12 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
SymbolizerInfo.S = Sections[SectIdx];
SymbolizerInfo.AddrMap = &AddrMap;
SymbolizerInfo.Sections = &Sections;
- SymbolizerInfo.class_name = nullptr;
- SymbolizerInfo.selector_name = nullptr;
- SymbolizerInfo.method = nullptr;
- SymbolizerInfo.demangled_name = nullptr;
- SymbolizerInfo.bindtable = nullptr;
- SymbolizerInfo.adrp_addr = 0;
- SymbolizerInfo.adrp_inst = 0;
// Same for the ThumbSymbolizer
ThumbSymbolizerInfo.verbose = !NoSymbolicOperands;
ThumbSymbolizerInfo.O = MachOOF;
ThumbSymbolizerInfo.S = Sections[SectIdx];
ThumbSymbolizerInfo.AddrMap = &AddrMap;
ThumbSymbolizerInfo.Sections = &Sections;
- ThumbSymbolizerInfo.class_name = nullptr;
- ThumbSymbolizerInfo.selector_name = nullptr;
- ThumbSymbolizerInfo.method = nullptr;
- ThumbSymbolizerInfo.demangled_name = nullptr;
- ThumbSymbolizerInfo.bindtable = nullptr;
- ThumbSymbolizerInfo.adrp_addr = 0;
- ThumbSymbolizerInfo.adrp_inst = 0;
unsigned int Arch = MachOOF->getArch();
@@ -7293,12 +7287,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
TripleName = "";
ThumbTripleName = "";
- if (SymbolizerInfo.method != nullptr)
- free(SymbolizerInfo.method);
if (SymbolizerInfo.demangled_name != nullptr)
free(SymbolizerInfo.demangled_name);
- if (ThumbSymbolizerInfo.method != nullptr)
- free(ThumbSymbolizerInfo.method);
if (ThumbSymbolizerInfo.demangled_name != nullptr)
free(ThumbSymbolizerInfo.demangled_name);
}
@@ -7310,12 +7300,25 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
namespace {
-template <typename T> static uint64_t readNext(const char *&Buf) {
+template <typename T>
+static uint64_t read(StringRef Contents, ptrdiff_t Offset) {
using llvm::support::little;
using llvm::support::unaligned;
- uint64_t Val = support::endian::read<T, little, unaligned>(Buf);
- Buf += sizeof(T);
+ if (Offset + sizeof(T) > Contents.size()) {
+ outs() << "warning: attempt to read past end of buffer\n";
+ return T();
+ }
+
+ uint64_t Val =
+ support::endian::read<T, little, unaligned>(Contents.data() + Offset);
+ return Val;
+}
+
+template <typename T>
+static uint64_t readNext(StringRef Contents, ptrdiff_t &Offset) {
+ T Val = read<T>(Contents, Offset);
+ Offset += sizeof(T);
return Val;
}
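// Usage sketch for the helpers above (illustrative): Offset advances through
// Contents, and a read that would run past the end prints a warning and
// yields a zero value instead of reading out of bounds.
//
//   ptrdiff_t Pos = 0;
//   uint32_t Version = readNext<uint32_t>(Contents, Pos); // Pos advances by 4
//   uint32_t Count = readNext<uint32_t>(Contents, Pos);   // Pos advances by 4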
@@ -7335,18 +7338,18 @@ struct CompactUnwindEntry {
CompactUnwindEntry(StringRef Contents, unsigned Offset, bool Is64)
: OffsetInSection(Offset) {
if (Is64)
- read<uint64_t>(Contents.data() + Offset);
+ read<uint64_t>(Contents, Offset);
else
- read<uint32_t>(Contents.data() + Offset);
+ read<uint32_t>(Contents, Offset);
}
private:
- template <typename UIntPtr> void read(const char *Buf) {
- FunctionAddr = readNext<UIntPtr>(Buf);
- Length = readNext<uint32_t>(Buf);
- CompactEncoding = readNext<uint32_t>(Buf);
- PersonalityAddr = readNext<UIntPtr>(Buf);
- LSDAAddr = readNext<UIntPtr>(Buf);
+ template <typename UIntPtr> void read(StringRef Contents, ptrdiff_t Offset) {
+ FunctionAddr = readNext<UIntPtr>(Contents, Offset);
+ Length = readNext<uint32_t>(Contents, Offset);
+ CompactEncoding = readNext<uint32_t>(Contents, Offset);
+ PersonalityAddr = readNext<UIntPtr>(Contents, Offset);
+ LSDAAddr = readNext<UIntPtr>(Contents, Offset);
}
};
}
@@ -7448,7 +7451,7 @@ printMachOCompactUnwindSection(const MachOObjectFile *Obj,
// First populate the initial raw offsets, encodings and so on from the entry.
for (unsigned Offset = 0; Offset < Contents.size(); Offset += EntrySize) {
- CompactUnwindEntry Entry(Contents.data(), Offset, Is64);
+ CompactUnwindEntry Entry(Contents, Offset, Is64);
CompactUnwinds.push_back(Entry);
}
@@ -7515,19 +7518,19 @@ printMachOCompactUnwindSection(const MachOObjectFile *Obj,
// __unwind_info section dumping
//===----------------------------------------------------------------------===//
-static void printRegularSecondLevelUnwindPage(const char *PageStart) {
- const char *Pos = PageStart;
- uint32_t Kind = readNext<uint32_t>(Pos);
+static void printRegularSecondLevelUnwindPage(StringRef PageData) {
+ ptrdiff_t Pos = 0;
+ uint32_t Kind = readNext<uint32_t>(PageData, Pos);
(void)Kind;
assert(Kind == 2 && "kind for a regular 2nd level index should be 2");
- uint16_t EntriesStart = readNext<uint16_t>(Pos);
- uint16_t NumEntries = readNext<uint16_t>(Pos);
+ uint16_t EntriesStart = readNext<uint16_t>(PageData, Pos);
+ uint16_t NumEntries = readNext<uint16_t>(PageData, Pos);
- Pos = PageStart + EntriesStart;
+ Pos = EntriesStart;
for (unsigned i = 0; i < NumEntries; ++i) {
- uint32_t FunctionOffset = readNext<uint32_t>(Pos);
- uint32_t Encoding = readNext<uint32_t>(Pos);
+ uint32_t FunctionOffset = readNext<uint32_t>(PageData, Pos);
+ uint32_t Encoding = readNext<uint32_t>(PageData, Pos);
outs() << " [" << i << "]: "
<< "function offset=" << format("0x%08" PRIx32, FunctionOffset)
@@ -7537,24 +7540,23 @@ static void printRegularSecondLevelUnwindPage(const char *PageStart) {
}
static void printCompressedSecondLevelUnwindPage(
- const char *PageStart, uint32_t FunctionBase,
+ StringRef PageData, uint32_t FunctionBase,
const SmallVectorImpl<uint32_t> &CommonEncodings) {
- const char *Pos = PageStart;
- uint32_t Kind = readNext<uint32_t>(Pos);
+ ptrdiff_t Pos = 0;
+ uint32_t Kind = readNext<uint32_t>(PageData, Pos);
(void)Kind;
assert(Kind == 3 && "kind for a compressed 2nd level index should be 3");
- uint16_t EntriesStart = readNext<uint16_t>(Pos);
- uint16_t NumEntries = readNext<uint16_t>(Pos);
+ uint16_t EntriesStart = readNext<uint16_t>(PageData, Pos);
+ uint16_t NumEntries = readNext<uint16_t>(PageData, Pos);
- uint16_t EncodingsStart = readNext<uint16_t>(Pos);
- readNext<uint16_t>(Pos);
- const auto *PageEncodings = reinterpret_cast<const support::ulittle32_t *>(
- PageStart + EncodingsStart);
+ uint16_t EncodingsStart = readNext<uint16_t>(PageData, Pos);
+ readNext<uint16_t>(PageData, Pos);
+ StringRef PageEncodings = PageData.substr(EncodingsStart, StringRef::npos);
- Pos = PageStart + EntriesStart;
+ Pos = EntriesStart;
for (unsigned i = 0; i < NumEntries; ++i) {
- uint32_t Entry = readNext<uint32_t>(Pos);
+ uint32_t Entry = readNext<uint32_t>(PageData, Pos);
uint32_t FunctionOffset = FunctionBase + (Entry & 0xffffff);
uint32_t EncodingIdx = Entry >> 24;
@@ -7562,7 +7564,9 @@ static void printCompressedSecondLevelUnwindPage(
if (EncodingIdx < CommonEncodings.size())
Encoding = CommonEncodings[EncodingIdx];
else
- Encoding = PageEncodings[EncodingIdx - CommonEncodings.size()];
+ Encoding = read<uint32_t>(PageEncodings,
+ sizeof(uint32_t) *
+ (EncodingIdx - CommonEncodings.size()));
outs() << " [" << i << "]: "
<< "function offset=" << format("0x%08" PRIx32, FunctionOffset)
@@ -7585,13 +7589,13 @@ static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
StringRef Contents;
UnwindInfo.getContents(Contents);
- const char *Pos = Contents.data();
+ ptrdiff_t Pos = 0;
//===----------------------------------
// Section header
//===----------------------------------
- uint32_t Version = readNext<uint32_t>(Pos);
+ uint32_t Version = readNext<uint32_t>(Contents, Pos);
outs() << " Version: "
<< format("0x%" PRIx32, Version) << '\n';
if (Version != 1) {
@@ -7599,24 +7603,24 @@ static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
return;
}
- uint32_t CommonEncodingsStart = readNext<uint32_t>(Pos);
+ uint32_t CommonEncodingsStart = readNext<uint32_t>(Contents, Pos);
outs() << " Common encodings array section offset: "
<< format("0x%" PRIx32, CommonEncodingsStart) << '\n';
- uint32_t NumCommonEncodings = readNext<uint32_t>(Pos);
+ uint32_t NumCommonEncodings = readNext<uint32_t>(Contents, Pos);
outs() << " Number of common encodings in array: "
<< format("0x%" PRIx32, NumCommonEncodings) << '\n';
- uint32_t PersonalitiesStart = readNext<uint32_t>(Pos);
+ uint32_t PersonalitiesStart = readNext<uint32_t>(Contents, Pos);
outs() << " Personality function array section offset: "
<< format("0x%" PRIx32, PersonalitiesStart) << '\n';
- uint32_t NumPersonalities = readNext<uint32_t>(Pos);
+ uint32_t NumPersonalities = readNext<uint32_t>(Contents, Pos);
outs() << " Number of personality functions in array: "
<< format("0x%" PRIx32, NumPersonalities) << '\n';
- uint32_t IndicesStart = readNext<uint32_t>(Pos);
+ uint32_t IndicesStart = readNext<uint32_t>(Contents, Pos);
outs() << " Index array section offset: "
<< format("0x%" PRIx32, IndicesStart) << '\n';
- uint32_t NumIndices = readNext<uint32_t>(Pos);
+ uint32_t NumIndices = readNext<uint32_t>(Contents, Pos);
outs() << " Number of indices in array: "
<< format("0x%" PRIx32, NumIndices) << '\n';
@@ -7631,9 +7635,9 @@ static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
SmallVector<uint32_t, 64> CommonEncodings;
outs() << " Common encodings: (count = " << NumCommonEncodings << ")\n";
- Pos = Contents.data() + CommonEncodingsStart;
+ Pos = CommonEncodingsStart;
for (unsigned i = 0; i < NumCommonEncodings; ++i) {
- uint32_t Encoding = readNext<uint32_t>(Pos);
+ uint32_t Encoding = readNext<uint32_t>(Contents, Pos);
CommonEncodings.push_back(Encoding);
outs() << " encoding[" << i << "]: " << format("0x%08" PRIx32, Encoding)
@@ -7648,9 +7652,9 @@ static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
// roughly). Particularly since they only get 2 bits in the compact encoding.
outs() << " Personality functions: (count = " << NumPersonalities << ")\n";
- Pos = Contents.data() + PersonalitiesStart;
+ Pos = PersonalitiesStart;
for (unsigned i = 0; i < NumPersonalities; ++i) {
- uint32_t PersonalityFn = readNext<uint32_t>(Pos);
+ uint32_t PersonalityFn = readNext<uint32_t>(Contents, Pos);
outs() << " personality[" << i + 1
<< "]: " << format("0x%08" PRIx32, PersonalityFn) << '\n';
}
@@ -7671,13 +7675,13 @@ static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
SmallVector<IndexEntry, 4> IndexEntries;
outs() << " Top level indices: (count = " << NumIndices << ")\n";
- Pos = Contents.data() + IndicesStart;
+ Pos = IndicesStart;
for (unsigned i = 0; i < NumIndices; ++i) {
IndexEntry Entry;
- Entry.FunctionOffset = readNext<uint32_t>(Pos);
- Entry.SecondLevelPageStart = readNext<uint32_t>(Pos);
- Entry.LSDAStart = readNext<uint32_t>(Pos);
+ Entry.FunctionOffset = readNext<uint32_t>(Contents, Pos);
+ Entry.SecondLevelPageStart = readNext<uint32_t>(Contents, Pos);
+ Entry.LSDAStart = readNext<uint32_t>(Contents, Pos);
IndexEntries.push_back(Entry);
outs() << " [" << i << "]: "
@@ -7696,12 +7700,14 @@ static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
// the first top-level index's LSDAOffset to the last (sentinel).
outs() << " LSDA descriptors:\n";
- Pos = Contents.data() + IndexEntries[0].LSDAStart;
- int NumLSDAs = (IndexEntries.back().LSDAStart - IndexEntries[0].LSDAStart) /
- (2 * sizeof(uint32_t));
+ Pos = IndexEntries[0].LSDAStart;
+ const uint32_t LSDASize = 2 * sizeof(uint32_t);
+ int NumLSDAs =
+ (IndexEntries.back().LSDAStart - IndexEntries[0].LSDAStart) / LSDASize;
+
for (int i = 0; i < NumLSDAs; ++i) {
- uint32_t FunctionOffset = readNext<uint32_t>(Pos);
- uint32_t LSDAOffset = readNext<uint32_t>(Pos);
+ uint32_t FunctionOffset = readNext<uint32_t>(Contents, Pos);
+ uint32_t LSDAOffset = readNext<uint32_t>(Contents, Pos);
outs() << " [" << i << "]: "
<< "function offset=" << format("0x%08" PRIx32, FunctionOffset)
<< ", "
@@ -7729,12 +7735,19 @@ static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
<< "base function offset="
<< format("0x%08" PRIx32, IndexEntries[i].FunctionOffset) << '\n';
- Pos = Contents.data() + IndexEntries[i].SecondLevelPageStart;
- uint32_t Kind = *reinterpret_cast<const support::ulittle32_t *>(Pos);
+ Pos = IndexEntries[i].SecondLevelPageStart;
+ if (Pos + sizeof(uint32_t) > Contents.size()) {
+ outs() << "warning: invalid offset for second level page: " << Pos << '\n';
+ continue;
+ }
+
+ uint32_t Kind =
+ *reinterpret_cast<const support::ulittle32_t *>(Contents.data() + Pos);
if (Kind == 2)
- printRegularSecondLevelUnwindPage(Pos);
+ printRegularSecondLevelUnwindPage(Contents.substr(Pos, 4096));
else if (Kind == 3)
- printCompressedSecondLevelUnwindPage(Pos, IndexEntries[i].FunctionOffset,
+ printCompressedSecondLevelUnwindPage(Contents.substr(Pos, 4096),
+ IndexEntries[i].FunctionOffset,
CommonEncodings);
else
outs() << " Skipping 2nd level page with unknown kind " << Kind
@@ -9352,6 +9365,26 @@ static void PrintThreadCommand(MachO::thread_command t, const char *Ptr,
outs() << "\t esh.flavor " << es.esh.flavor << " esh.count "
<< es.esh.count << "\n";
}
+ } else if (flavor == MachO::x86_EXCEPTION_STATE64) {
+ outs() << " flavor x86_EXCEPTION_STATE64\n";
+ if (count == MachO::x86_EXCEPTION_STATE64_COUNT)
+ outs() << " count x86_EXCEPTION_STATE64_COUNT\n";
+ else
+ outs() << " count " << count
+ << " (not x86_EXCEPTION_STATE64_COUNT)\n";
+ struct MachO::x86_exception_state64_t es64;
+ left = end - begin;
+ if (left >= sizeof(MachO::x86_exception_state64_t)) {
+ memcpy(&es64, begin, sizeof(MachO::x86_exception_state64_t));
+ begin += sizeof(MachO::x86_exception_state64_t);
+ } else {
+ memset(&es64, '\0', sizeof(MachO::x86_exception_state64_t));
+ memcpy(&es64, begin, left);
+ begin += left;
+ }
+ if (isLittleEndian != sys::IsLittleEndianHost)
+ swapStruct(es64);
+ Print_x86_exception_state_t(es64);
} else {
outs() << " flavor " << flavor << " (unknown)\n";
outs() << " count " << count << "\n";
diff --git a/contrib/llvm/tools/llvm-objdump/WasmDump.cpp b/contrib/llvm/tools/llvm-objdump/WasmDump.cpp
index 0d8ffba6ba45..045002cd4b34 100644
--- a/contrib/llvm/tools/llvm-objdump/WasmDump.cpp
+++ b/contrib/llvm/tools/llvm-objdump/WasmDump.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the wasm-specific dumper for llvm-objdump.
+/// This file implements the wasm-specific dumper for llvm-objdump.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/tools/llvm-objdump/llvm-objdump.cpp b/contrib/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 3a9112423cff..8041e6f59940 100644
--- a/contrib/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/contrib/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -20,10 +20,12 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
+#include "llvm/Demangle/Demangle.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -50,10 +52,8 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
@@ -68,6 +68,12 @@
using namespace llvm;
using namespace object;
+cl::opt<bool>
+ llvm::AllHeaders("all-headers",
+ cl::desc("Display all available header information"));
+static cl::alias AllHeadersShort("x", cl::desc("Alias for --all-headers"),
+ cl::aliasopt(AllHeaders));
+
static cl::list<std::string>
InputFilenames(cl::Positional, cl::desc("<input object files>"),cl::ZeroOrMore);
@@ -85,10 +91,30 @@ static cl::alias
DisassembleAlld("D", cl::desc("Alias for --disassemble-all"),
cl::aliasopt(DisassembleAll));
+cl::opt<std::string> llvm::Demangle("demangle",
+ cl::desc("Demangle symbols names"),
+ cl::ValueOptional, cl::init("none"));
+
+static cl::alias DemangleShort("C", cl::desc("Alias for --demangle"),
+ cl::aliasopt(Demangle));
+
+static cl::list<std::string>
+DisassembleFunctions("df",
+ cl::CommaSeparated,
+ cl::desc("List of functions to disassemble"));
+static StringSet<> DisasmFuncsSet;
+
cl::opt<bool>
llvm::Relocations("r", cl::desc("Display the relocation entries in the file"));
cl::opt<bool>
+llvm::DynamicRelocations("dynamic-reloc",
+ cl::desc("Display the dynamic relocation entries in the file"));
+static cl::alias
+DynamicRelocationsd("R", cl::desc("Alias for --dynamic-reloc"),
+ cl::aliasopt(DynamicRelocations));
+
+cl::opt<bool>
llvm::SectionContents("s", cl::desc("Display the content of each section"));
cl::opt<bool>
@@ -182,6 +208,21 @@ static cl::alias
PrivateHeadersShort("p", cl::desc("Alias for --private-headers"),
cl::aliasopt(PrivateHeaders));
+cl::opt<bool> llvm::FileHeaders(
+ "file-headers",
+ cl::desc("Display the contents of the overall file header"));
+
+static cl::alias FileHeadersShort("f", cl::desc("Alias for --file-headers"),
+ cl::aliasopt(FileHeaders));
+
+cl::opt<bool>
+ llvm::ArchiveHeaders("archive-headers",
+ cl::desc("Display archive header information"));
+
+cl::alias
+ArchiveHeadersShort("a", cl::desc("Alias for --archive-headers"),
+ cl::aliasopt(ArchiveHeaders));
+
cl::opt<bool>
llvm::PrintImmHex("print-imm-hex",
cl::desc("Use hex format for immediate values"));
@@ -196,7 +237,7 @@ cl::opt<DIDumpType> llvm::DwarfDumpType(
cl::opt<bool> PrintSource(
"source",
cl::desc(
- "Display source inlined with disassembly. Implies disassmble object"));
+ "Display source inlined with disassembly. Implies disassemble object"));
cl::alias PrintSourceShort("S", cl::desc("Alias for -source"),
cl::aliasopt(PrintSource));
@@ -297,6 +338,11 @@ LLVM_ATTRIBUTE_NORETURN void llvm::error(Twine Message) {
exit(1);
}
+void llvm::warn(StringRef Message) {
+ errs() << ToolName << ": warning: " << Message << ".\n";
+ errs().flush();
+}
+
LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef File,
Twine Message) {
errs() << ToolName << ": '" << File << "': " << Message << ".\n";
@@ -396,252 +442,6 @@ bool llvm::RelocAddressLess(RelocationRef a, RelocationRef b) {
return a.getOffset() < b.getOffset();
}
-namespace {
-class SourcePrinter {
-protected:
- DILineInfo OldLineInfo;
- const ObjectFile *Obj = nullptr;
- std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;
- // File name to file contents of source
- std::unordered_map<std::string, std::unique_ptr<MemoryBuffer>> SourceCache;
- // Mark the line endings of the cached source
- std::unordered_map<std::string, std::vector<StringRef>> LineCache;
-
-private:
- bool cacheSource(const std::string& File);
-
-public:
- SourcePrinter() = default;
- SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch) : Obj(Obj) {
- symbolize::LLVMSymbolizer::Options SymbolizerOpts(
- DILineInfoSpecifier::FunctionNameKind::None, true, false, false,
- DefaultArch);
- Symbolizer.reset(new symbolize::LLVMSymbolizer(SymbolizerOpts));
- }
- virtual ~SourcePrinter() = default;
- virtual void printSourceLine(raw_ostream &OS, uint64_t Address,
- StringRef Delimiter = "; ");
-};
-
-bool SourcePrinter::cacheSource(const std::string& File) {
- auto BufferOrError = MemoryBuffer::getFile(File);
- if (!BufferOrError)
- return false;
- // Chomp the file to get lines
- size_t BufferSize = (*BufferOrError)->getBufferSize();
- const char *BufferStart = (*BufferOrError)->getBufferStart();
- for (const char *Start = BufferStart, *End = BufferStart;
- End < BufferStart + BufferSize; End++)
- if (*End == '\n' || End == BufferStart + BufferSize - 1 ||
- (*End == '\r' && *(End + 1) == '\n')) {
- LineCache[File].push_back(StringRef(Start, End - Start));
- if (*End == '\r')
- End++;
- Start = End + 1;
- }
- SourceCache[File] = std::move(*BufferOrError);
- return true;
-}
-
-void SourcePrinter::printSourceLine(raw_ostream &OS, uint64_t Address,
- StringRef Delimiter) {
- if (!Symbolizer)
- return;
- DILineInfo LineInfo = DILineInfo();
- auto ExpectecLineInfo =
- Symbolizer->symbolizeCode(Obj->getFileName(), Address);
- if (!ExpectecLineInfo)
- consumeError(ExpectecLineInfo.takeError());
- else
- LineInfo = *ExpectecLineInfo;
-
- if ((LineInfo.FileName == "<invalid>") || OldLineInfo.Line == LineInfo.Line ||
- LineInfo.Line == 0)
- return;
-
- if (PrintLines)
- OS << Delimiter << LineInfo.FileName << ":" << LineInfo.Line << "\n";
- if (PrintSource) {
- if (SourceCache.find(LineInfo.FileName) == SourceCache.end())
- if (!cacheSource(LineInfo.FileName))
- return;
- auto FileBuffer = SourceCache.find(LineInfo.FileName);
- if (FileBuffer != SourceCache.end()) {
- auto LineBuffer = LineCache.find(LineInfo.FileName);
- if (LineBuffer != LineCache.end()) {
- if (LineInfo.Line > LineBuffer->second.size())
- return;
- // Vector begins at 0, line numbers are non-zero
- OS << Delimiter << LineBuffer->second[LineInfo.Line - 1].ltrim()
- << "\n";
- }
- }
- }
- OldLineInfo = LineInfo;
-}
-
-static bool isArmElf(const ObjectFile *Obj) {
- return (Obj->isELF() &&
- (Obj->getArch() == Triple::aarch64 ||
- Obj->getArch() == Triple::aarch64_be ||
- Obj->getArch() == Triple::arm || Obj->getArch() == Triple::armeb ||
- Obj->getArch() == Triple::thumb ||
- Obj->getArch() == Triple::thumbeb));
-}
-
-class PrettyPrinter {
-public:
- virtual ~PrettyPrinter() = default;
- virtual void printInst(MCInstPrinter &IP, const MCInst *MI,
- ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &OS, StringRef Annot,
- MCSubtargetInfo const &STI, SourcePrinter *SP) {
- if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address);
- if (!NoLeadingAddr)
- OS << format("%8" PRIx64 ":", Address);
- if (!NoShowRawInsn) {
- OS << "\t";
- dumpBytes(Bytes, OS);
- }
- if (MI)
- IP.printInst(MI, OS, "", STI);
- else
- OS << " <unknown>";
- }
-};
-PrettyPrinter PrettyPrinterInst;
-class HexagonPrettyPrinter : public PrettyPrinter {
-public:
- void printLead(ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &OS) {
- uint32_t opcode =
- (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | Bytes[0];
- if (!NoLeadingAddr)
- OS << format("%8" PRIx64 ":", Address);
- if (!NoShowRawInsn) {
- OS << "\t";
- dumpBytes(Bytes.slice(0, 4), OS);
- OS << format("%08" PRIx32, opcode);
- }
- }
- void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
- uint64_t Address, raw_ostream &OS, StringRef Annot,
- MCSubtargetInfo const &STI, SourcePrinter *SP) override {
- if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, "");
- if (!MI) {
- printLead(Bytes, Address, OS);
- OS << " <unknown>";
- return;
- }
- std::string Buffer;
- {
- raw_string_ostream TempStream(Buffer);
- IP.printInst(MI, TempStream, "", STI);
- }
- StringRef Contents(Buffer);
- // Split off bundle attributes
- auto PacketBundle = Contents.rsplit('\n');
- // Split off first instruction from the rest
- auto HeadTail = PacketBundle.first.split('\n');
- auto Preamble = " { ";
- auto Separator = "";
- while(!HeadTail.first.empty()) {
- OS << Separator;
- Separator = "\n";
- if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, "");
- printLead(Bytes, Address, OS);
- OS << Preamble;
- Preamble = " ";
- StringRef Inst;
- auto Duplex = HeadTail.first.split('\v');
- if(!Duplex.second.empty()){
- OS << Duplex.first;
- OS << "; ";
- Inst = Duplex.second;
- }
- else
- Inst = HeadTail.first;
- OS << Inst;
- Bytes = Bytes.slice(4);
- Address += 4;
- HeadTail = HeadTail.second.split('\n');
- }
- OS << " } " << PacketBundle.second;
- }
-};
-HexagonPrettyPrinter HexagonPrettyPrinterInst;
-
-class AMDGCNPrettyPrinter : public PrettyPrinter {
-public:
- void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
- uint64_t Address, raw_ostream &OS, StringRef Annot,
- MCSubtargetInfo const &STI, SourcePrinter *SP) override {
- if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address);
-
- if (!MI) {
- OS << " <unknown>";
- return;
- }
-
- SmallString<40> InstStr;
- raw_svector_ostream IS(InstStr);
-
- IP.printInst(MI, IS, "", STI);
-
- OS << left_justify(IS.str(), 60) << format("// %012" PRIX64 ": ", Address);
- typedef support::ulittle32_t U32;
- for (auto D : makeArrayRef(reinterpret_cast<const U32*>(Bytes.data()),
- Bytes.size() / sizeof(U32)))
- // D should be explicitly casted to uint32_t here as it is passed
- // by format to snprintf as vararg.
- OS << format("%08" PRIX32 " ", static_cast<uint32_t>(D));
-
- if (!Annot.empty())
- OS << "// " << Annot;
- }
-};
-AMDGCNPrettyPrinter AMDGCNPrettyPrinterInst;
-
-class BPFPrettyPrinter : public PrettyPrinter {
-public:
- void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
- uint64_t Address, raw_ostream &OS, StringRef Annot,
- MCSubtargetInfo const &STI, SourcePrinter *SP) override {
- if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address);
- if (!NoLeadingAddr)
- OS << format("%8" PRId64 ":", Address / 8);
- if (!NoShowRawInsn) {
- OS << "\t";
- dumpBytes(Bytes, OS);
- }
- if (MI)
- IP.printInst(MI, OS, "", STI);
- else
- OS << " <unknown>";
- }
-};
-BPFPrettyPrinter BPFPrettyPrinterInst;
-
-PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
- switch(Triple.getArch()) {
- default:
- return PrettyPrinterInst;
- case Triple::hexagon:
- return HexagonPrettyPrinterInst;
- case Triple::amdgcn:
- return AMDGCNPrettyPrinterInst;
- case Triple::bpfel:
- case Triple::bpfeb:
- return BPFPrettyPrinterInst;
- }
-}
-}
-
template <class ELFT>
static std::error_code getRelocationValueString(const ELFObjectFile<ELFT> *Obj,
const RelocationRef &RelRef,
@@ -671,9 +471,11 @@ static std::error_code getRelocationValueString(const ELFObjectFile<ELFT> *Obj,
if (!StrTabOrErr)
return errorToErrorCode(StrTabOrErr.takeError());
StringRef StrTab = *StrTabOrErr;
- uint8_t type = RelRef.getType();
- StringRef res;
int64_t addend = 0;
+ // If there is no Symbol associated with the relocation, we set the undef
+ // boolean value to 'true'. This will prevent us from calling functions that
+ // require the relocation to be associated with a symbol.
+ bool undef = false;
switch (Sec->sh_type) {
default:
return object_error::parse_failed;
@@ -684,97 +486,41 @@ static std::error_code getRelocationValueString(const ELFObjectFile<ELFT> *Obj,
case ELF::SHT_RELA: {
const Elf_Rela *ERela = Obj->getRela(Rel);
addend = ERela->r_addend;
+ undef = ERela->getSymbol(false) == 0;
break;
}
}
- symbol_iterator SI = RelRef.getSymbol();
- const Elf_Sym *symb = Obj->getSymbol(SI->getRawDataRefImpl());
StringRef Target;
- if (symb->getType() == ELF::STT_SECTION) {
- Expected<section_iterator> SymSI = SI->getSection();
- if (!SymSI)
- return errorToErrorCode(SymSI.takeError());
- const Elf_Shdr *SymSec = Obj->getSection((*SymSI)->getRawDataRefImpl());
- auto SecName = EF.getSectionName(SymSec);
- if (!SecName)
- return errorToErrorCode(SecName.takeError());
- Target = *SecName;
- } else {
- Expected<StringRef> SymName = symb->getName(StrTab);
- if (!SymName)
- return errorToErrorCode(SymName.takeError());
- Target = *SymName;
- }
- switch (EF.getHeader()->e_machine) {
- case ELF::EM_X86_64:
- switch (type) {
- case ELF::R_X86_64_PC8:
- case ELF::R_X86_64_PC16:
- case ELF::R_X86_64_PC32: {
- std::string fmtbuf;
- raw_string_ostream fmt(fmtbuf);
- fmt << Target << (addend < 0 ? "" : "+") << addend << "-P";
- fmt.flush();
- Result.append(fmtbuf.begin(), fmtbuf.end());
- } break;
- case ELF::R_X86_64_8:
- case ELF::R_X86_64_16:
- case ELF::R_X86_64_32:
- case ELF::R_X86_64_32S:
- case ELF::R_X86_64_64: {
- std::string fmtbuf;
- raw_string_ostream fmt(fmtbuf);
- fmt << Target << (addend < 0 ? "" : "+") << addend;
- fmt.flush();
- Result.append(fmtbuf.begin(), fmtbuf.end());
- } break;
- default:
- res = "Unknown";
- }
- break;
- case ELF::EM_LANAI:
- case ELF::EM_AVR:
- case ELF::EM_AARCH64: {
- std::string fmtbuf;
- raw_string_ostream fmt(fmtbuf);
- fmt << Target;
- if (addend != 0)
- fmt << (addend < 0 ? "" : "+") << addend;
- fmt.flush();
- Result.append(fmtbuf.begin(), fmtbuf.end());
- break;
- }
- case ELF::EM_386:
- case ELF::EM_IAMCU:
- case ELF::EM_ARM:
- case ELF::EM_HEXAGON:
- case ELF::EM_MIPS:
- case ELF::EM_BPF:
- case ELF::EM_RISCV:
- res = Target;
- break;
- case ELF::EM_WEBASSEMBLY:
- switch (type) {
- case ELF::R_WEBASSEMBLY_DATA: {
- std::string fmtbuf;
- raw_string_ostream fmt(fmtbuf);
- fmt << Target << (addend < 0 ? "" : "+") << addend;
- fmt.flush();
- Result.append(fmtbuf.begin(), fmtbuf.end());
- break;
- }
- case ELF::R_WEBASSEMBLY_FUNCTION:
- res = Target;
- break;
- default:
- res = "Unknown";
+ if (!undef) {
+ symbol_iterator SI = RelRef.getSymbol();
+ const Elf_Sym *symb = Obj->getSymbol(SI->getRawDataRefImpl());
+ if (symb->getType() == ELF::STT_SECTION) {
+ Expected<section_iterator> SymSI = SI->getSection();
+ if (!SymSI)
+ return errorToErrorCode(SymSI.takeError());
+ const Elf_Shdr *SymSec = Obj->getSection((*SymSI)->getRawDataRefImpl());
+ auto SecName = EF.getSectionName(SymSec);
+ if (!SecName)
+ return errorToErrorCode(SecName.takeError());
+ Target = *SecName;
+ } else {
+ Expected<StringRef> SymName = symb->getName(StrTab);
+ if (!SymName)
+ return errorToErrorCode(SymName.takeError());
+ Target = *SymName;
}
- break;
- default:
- res = "Unknown";
- }
- if (Result.empty())
- Result.append(res.begin(), res.end());
+ } else
+ Target = "*ABS*";
+
+ // Default scheme is to print Target, as well as "+ <addend>" for nonzero
+ // addend. Should be acceptable for all normal purposes.
+ std::string fmtbuf;
+ raw_string_ostream fmt(fmtbuf);
+ fmt << Target;
+ if (addend != 0)
+ fmt << (addend < 0 ? "" : "+") << addend;
+ fmt.flush();
+ Result.append(fmtbuf.begin(), fmtbuf.end());
return std::error_code();
}
@@ -887,9 +633,21 @@ static std::error_code getRelocationValueString(const WasmObjectFile *Obj,
const RelocationRef &RelRef,
SmallVectorImpl<char> &Result) {
const wasm::WasmRelocation& Rel = Obj->getWasmRelocation(RelRef);
+ symbol_iterator SI = RelRef.getSymbol();
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
- fmt << Rel.Index << (Rel.Addend < 0 ? "" : "+") << Rel.Addend;
+ if (SI == Obj->symbol_end()) {
+ // Not all wasm relocations have symbols associated with them.
+ // In particular, R_WEBASSEMBLY_TYPE_INDEX_LEB.
+ fmt << Rel.Index;
+ } else {
+ Expected<StringRef> SymNameOrErr = SI->getName();
+ if (!SymNameOrErr)
+ return errorToErrorCode(SymNameOrErr.takeError());
+ StringRef SymName = *SymNameOrErr;
+ Result.append(SymName.begin(), SymName.end());
+ }
+ fmt << (Rel.Addend < 0 ? "" : "+") << Rel.Addend;
fmt.flush();
Result.append(fmtbuf.begin(), fmtbuf.end());
return std::error_code();
@@ -1087,7 +845,7 @@ static std::error_code getRelocationValueString(const RelocationRef &Rel,
llvm_unreachable("unknown object file format");
}
-/// @brief Indicates whether this relocation should hidden when listing
+/// Indicates whether this relocation should be hidden when listing
/// relocations, usually because it is the trailing part of a multipart
/// relocation that will be printed as part of the leading relocation.
static bool getHidden(RelocationRef RelRef) {
@@ -1120,6 +878,304 @@ static bool getHidden(RelocationRef RelRef) {
return false;
}
+namespace {
+class SourcePrinter {
+protected:
+ DILineInfo OldLineInfo;
+ const ObjectFile *Obj = nullptr;
+ std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;
+ // File name to file contents of source
+ std::unordered_map<std::string, std::unique_ptr<MemoryBuffer>> SourceCache;
+ // Mark the line endings of the cached source
+ std::unordered_map<std::string, std::vector<StringRef>> LineCache;
+
+private:
+ bool cacheSource(const DILineInfo& LineInfoFile);
+
+public:
+ SourcePrinter() = default;
+ SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch) : Obj(Obj) {
+ symbolize::LLVMSymbolizer::Options SymbolizerOpts(
+ DILineInfoSpecifier::FunctionNameKind::None, true, false, false,
+ DefaultArch);
+ Symbolizer.reset(new symbolize::LLVMSymbolizer(SymbolizerOpts));
+ }
+ virtual ~SourcePrinter() = default;
+ virtual void printSourceLine(raw_ostream &OS, uint64_t Address,
+ StringRef Delimiter = "; ");
+};
+
+bool SourcePrinter::cacheSource(const DILineInfo &LineInfo) {
+ std::unique_ptr<MemoryBuffer> Buffer;
+ if (LineInfo.Source) {
+ Buffer = MemoryBuffer::getMemBuffer(*LineInfo.Source);
+ } else {
+ auto BufferOrError = MemoryBuffer::getFile(LineInfo.FileName);
+ if (!BufferOrError)
+ return false;
+ Buffer = std::move(*BufferOrError);
+ }
+ // Chomp the file to get lines
+ size_t BufferSize = Buffer->getBufferSize();
+ const char *BufferStart = Buffer->getBufferStart();
+ for (const char *Start = BufferStart, *End = BufferStart;
+ End < BufferStart + BufferSize; End++)
+ if (*End == '\n' || End == BufferStart + BufferSize - 1 ||
+ (*End == '\r' && *(End + 1) == '\n')) {
+ LineCache[LineInfo.FileName].push_back(StringRef(Start, End - Start));
+ if (*End == '\r')
+ End++;
+ Start = End + 1;
+ }
+ SourceCache[LineInfo.FileName] = std::move(Buffer);
+ return true;
+}
+
+void SourcePrinter::printSourceLine(raw_ostream &OS, uint64_t Address,
+ StringRef Delimiter) {
+ if (!Symbolizer)
+ return;
+ DILineInfo LineInfo = DILineInfo();
+ auto ExpectedLineInfo =
+ Symbolizer->symbolizeCode(Obj->getFileName(), Address);
+ if (!ExpectedLineInfo)
+ consumeError(ExpectedLineInfo.takeError());
+ else
+ LineInfo = *ExpectedLineInfo;
+
+ if ((LineInfo.FileName == "<invalid>") || OldLineInfo.Line == LineInfo.Line ||
+ LineInfo.Line == 0)
+ return;
+
+ if (PrintLines)
+ OS << Delimiter << LineInfo.FileName << ":" << LineInfo.Line << "\n";
+ if (PrintSource) {
+ if (SourceCache.find(LineInfo.FileName) == SourceCache.end())
+ if (!cacheSource(LineInfo))
+ return;
+ auto FileBuffer = SourceCache.find(LineInfo.FileName);
+ if (FileBuffer != SourceCache.end()) {
+ auto LineBuffer = LineCache.find(LineInfo.FileName);
+ if (LineBuffer != LineCache.end()) {
+ if (LineInfo.Line > LineBuffer->second.size())
+ return;
+ // Vector begins at 0, line numbers are non-zero
+ OS << Delimiter << LineBuffer->second[LineInfo.Line - 1].ltrim()
+ << "\n";
+ }
+ }
+ }
+ OldLineInfo = LineInfo;
+}
+
+static bool isArmElf(const ObjectFile *Obj) {
+ return (Obj->isELF() &&
+ (Obj->getArch() == Triple::aarch64 ||
+ Obj->getArch() == Triple::aarch64_be ||
+ Obj->getArch() == Triple::arm || Obj->getArch() == Triple::armeb ||
+ Obj->getArch() == Triple::thumb ||
+ Obj->getArch() == Triple::thumbeb));
+}
+
+class PrettyPrinter {
+public:
+ virtual ~PrettyPrinter() = default;
+ virtual void printInst(MCInstPrinter &IP, const MCInst *MI,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &OS, StringRef Annot,
+ MCSubtargetInfo const &STI, SourcePrinter *SP,
+ std::vector<RelocationRef> *Rels = nullptr) {
+ if (SP && (PrintSource || PrintLines))
+ SP->printSourceLine(OS, Address);
+ if (!NoLeadingAddr)
+ OS << format("%8" PRIx64 ":", Address);
+ if (!NoShowRawInsn) {
+ OS << "\t";
+ dumpBytes(Bytes, OS);
+ }
+ if (MI)
+ IP.printInst(MI, OS, "", STI);
+ else
+ OS << " <unknown>";
+ }
+};
+PrettyPrinter PrettyPrinterInst;
+class HexagonPrettyPrinter : public PrettyPrinter {
+public:
+ void printLead(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &OS) {
+ uint32_t opcode =
+ (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | Bytes[0];
+ if (!NoLeadingAddr)
+ OS << format("%8" PRIx64 ":", Address);
+ if (!NoShowRawInsn) {
+ OS << "\t";
+ dumpBytes(Bytes.slice(0, 4), OS);
+ OS << format("%08" PRIx32, opcode);
+ }
+ }
+ void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
+ uint64_t Address, raw_ostream &OS, StringRef Annot,
+ MCSubtargetInfo const &STI, SourcePrinter *SP,
+ std::vector<RelocationRef> *Rels) override {
+ if (SP && (PrintSource || PrintLines))
+ SP->printSourceLine(OS, Address, "");
+ if (!MI) {
+ printLead(Bytes, Address, OS);
+ OS << " <unknown>";
+ return;
+ }
+ std::string Buffer;
+ {
+ raw_string_ostream TempStream(Buffer);
+ IP.printInst(MI, TempStream, "", STI);
+ }
+ StringRef Contents(Buffer);
+ // Split off bundle attributes
+ auto PacketBundle = Contents.rsplit('\n');
+ // Split off first instruction from the rest
+ auto HeadTail = PacketBundle.first.split('\n');
+ auto Preamble = " { ";
+ auto Separator = "";
+ StringRef Fmt = "\t\t\t%08" PRIx64 ": ";
+ std::vector<RelocationRef>::const_iterator rel_cur = Rels->begin();
+ std::vector<RelocationRef>::const_iterator rel_end = Rels->end();
+
+ // Hexagon's packets require relocations to be inline rather than
+ // clustered at the end of the packet.
+ auto PrintReloc = [&]() -> void {
+ while ((rel_cur != rel_end) && (rel_cur->getOffset() <= Address)) {
+ if (rel_cur->getOffset() == Address) {
+ SmallString<16> name;
+ SmallString<32> val;
+ rel_cur->getTypeName(name);
+ error(getRelocationValueString(*rel_cur, val));
+ OS << Separator << format(Fmt.data(), Address) << name << "\t" << val
+ << "\n";
+ return;
+ }
+ rel_cur++;
+ }
+ };
+
+ while(!HeadTail.first.empty()) {
+ OS << Separator;
+ Separator = "\n";
+ if (SP && (PrintSource || PrintLines))
+ SP->printSourceLine(OS, Address, "");
+ printLead(Bytes, Address, OS);
+ OS << Preamble;
+ Preamble = " ";
+ StringRef Inst;
+ auto Duplex = HeadTail.first.split('\v');
+ if(!Duplex.second.empty()){
+ OS << Duplex.first;
+ OS << "; ";
+ Inst = Duplex.second;
+ }
+ else
+ Inst = HeadTail.first;
+ OS << Inst;
+ HeadTail = HeadTail.second.split('\n');
+ if (HeadTail.first.empty())
+ OS << " } " << PacketBundle.second;
+ PrintReloc();
+ Bytes = Bytes.slice(4);
+ Address += 4;
+ }
+ }
+};
+HexagonPrettyPrinter HexagonPrettyPrinterInst;
+
+class AMDGCNPrettyPrinter : public PrettyPrinter {
+public:
+ void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
+ uint64_t Address, raw_ostream &OS, StringRef Annot,
+ MCSubtargetInfo const &STI, SourcePrinter *SP,
+ std::vector<RelocationRef> *Rels) override {
+ if (SP && (PrintSource || PrintLines))
+ SP->printSourceLine(OS, Address);
+
+ typedef support::ulittle32_t U32;
+
+ if (MI) {
+ SmallString<40> InstStr;
+ raw_svector_ostream IS(InstStr);
+
+ IP.printInst(MI, IS, "", STI);
+
+ OS << left_justify(IS.str(), 60);
+ } else {
+ // An unrecognized encoding - this is probably data, so represent it
+ // using the .long directive, or the .byte directive if fewer than 4
+ // bytes remain.
+ if (Bytes.size() >= 4) {
+ OS << format("\t.long 0x%08" PRIx32 " ",
+ static_cast<uint32_t>(*reinterpret_cast<const U32*>(Bytes.data())));
+ OS.indent(42);
+ } else {
+ OS << format("\t.byte 0x%02" PRIx8, Bytes[0]);
+ for (unsigned int i = 1; i < Bytes.size(); i++)
+ OS << format(", 0x%02" PRIx8, Bytes[i]);
+ OS.indent(55 - (6 * Bytes.size()));
+ }
+ }
+
+ OS << format("// %012" PRIX64 ": ", Address);
+ if (Bytes.size() >= 4) {
+ for (auto D : makeArrayRef(reinterpret_cast<const U32*>(Bytes.data()),
+ Bytes.size() / sizeof(U32)))
+ // D should be explicitly cast to uint32_t here as it is passed
+ // by format to snprintf as a vararg.
+ OS << format("%08" PRIX32 " ", static_cast<uint32_t>(D));
+ } else {
+ for (unsigned int i = 0; i < Bytes.size(); i++)
+ OS << format("%02" PRIX8 " ", Bytes[i]);
+ }
+
+ if (!Annot.empty())
+ OS << "// " << Annot;
+ }
+};
+AMDGCNPrettyPrinter AMDGCNPrettyPrinterInst;
+
+class BPFPrettyPrinter : public PrettyPrinter {
+public:
+ void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
+ uint64_t Address, raw_ostream &OS, StringRef Annot,
+ MCSubtargetInfo const &STI, SourcePrinter *SP,
+ std::vector<RelocationRef> *Rels) override {
+ if (SP && (PrintSource || PrintLines))
+ SP->printSourceLine(OS, Address);
+ if (!NoLeadingAddr)
+ OS << format("%8" PRId64 ":", Address / 8);
+ if (!NoShowRawInsn) {
+ OS << "\t";
+ dumpBytes(Bytes, OS);
+ }
+ if (MI)
+ IP.printInst(MI, OS, "", STI);
+ else
+ OS << " <unknown>";
+ }
+};
+BPFPrettyPrinter BPFPrettyPrinterInst;
+
+PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
+ switch(Triple.getArch()) {
+ default:
+ return PrettyPrinterInst;
+ case Triple::hexagon:
+ return HexagonPrettyPrinterInst;
+ case Triple::amdgcn:
+ return AMDGCNPrettyPrinterInst;
+ case Triple::bpfel:
+ case Triple::bpfeb:
+ return BPFPrettyPrinterInst;
+ }
+}
+}
+
static uint8_t getElfSymbolType(const ObjectFile *Obj, const SymbolRef &Sym) {
assert(Obj->isELF());
if (auto *Elf32LEObj = dyn_cast<ELF32LEObjectFile>(Obj))
@@ -1254,6 +1310,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
// Create a mapping from virtual address to symbol name. This is used to
// pretty print the symbols while disassembling.
std::map<SectionRef, SectionSymbolsTy> AllSymbols;
+ SectionSymbolsTy AbsoluteSymbols;
for (const SymbolRef &Symbol : Obj->symbols()) {
Expected<uint64_t> AddressOrErr = Symbol.getAddress();
if (!AddressOrErr)
@@ -1269,15 +1326,17 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
Expected<section_iterator> SectionOrErr = Symbol.getSection();
if (!SectionOrErr)
report_error(Obj->getFileName(), SectionOrErr.takeError());
- section_iterator SecI = *SectionOrErr;
- if (SecI == Obj->section_end())
- continue;
uint8_t SymbolType = ELF::STT_NOTYPE;
if (Obj->isELF())
SymbolType = getElfSymbolType(Obj, Symbol);
- AllSymbols[*SecI].emplace_back(Address, *Name, SymbolType);
+ section_iterator SecI = *SectionOrErr;
+ if (SecI != Obj->section_end())
+ AllSymbols[*SecI].emplace_back(Address, *Name, SymbolType);
+ else
+ AbsoluteSymbols.emplace_back(Address, *Name, SymbolType);
+
}
if (AllSymbols.empty() && Obj->isELF())
@@ -1313,6 +1372,8 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
if (Sec != SectionAddresses.end())
AllSymbols[Sec->second].emplace_back(VA, Name, ELF::STT_NOTYPE);
+ else
+ AbsoluteSymbols.emplace_back(VA, Name, ELF::STT_NOTYPE);
}
}
@@ -1320,6 +1381,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
// a symbol near an address.
for (std::pair<const SectionRef, SectionSymbolsTy> &SecSyms : AllSymbols)
array_pod_sort(SecSyms.second.begin(), SecSyms.second.end());
+ array_pod_sort(AbsoluteSymbols.begin(), AbsoluteSymbols.end());
for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
if (!DisassembleAll && (!Section.isText() || Section.isVirtual()))
@@ -1349,8 +1411,8 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
}
}
- std::sort(DataMappingSymsAddr.begin(), DataMappingSymsAddr.end());
- std::sort(TextMappingSymsAddr.begin(), TextMappingSymsAddr.end());
+ llvm::sort(DataMappingSymsAddr.begin(), DataMappingSymsAddr.end());
+ llvm::sort(TextMappingSymsAddr.begin(), TextMappingSymsAddr.end());
if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) {
// AMDGPU disassembler uses symbolizer for printing labels
@@ -1375,30 +1437,22 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
}
// Sort relocations by address.
- std::sort(Rels.begin(), Rels.end(), RelocAddressLess);
+ llvm::sort(Rels.begin(), Rels.end(), RelocAddressLess);
StringRef SegmentName = "";
if (const MachOObjectFile *MachO = dyn_cast<const MachOObjectFile>(Obj)) {
DataRefImpl DR = Section.getRawDataRefImpl();
SegmentName = MachO->getSectionFinalSegmentName(DR);
}
- StringRef name;
- error(Section.getName(name));
-
- if ((SectionAddr <= StopAddress) &&
- (SectionAddr + SectSize) >= StartAddress) {
- outs() << "Disassembly of section ";
- if (!SegmentName.empty())
- outs() << SegmentName << ",";
- outs() << name << ':';
- }
+ StringRef SectionName;
+ error(Section.getName(SectionName));
// If the section has no symbol at the start, just insert a dummy one.
if (Symbols.empty() || std::get<0>(Symbols[0]) != 0) {
- Symbols.insert(Symbols.begin(),
- std::make_tuple(SectionAddr, name, Section.isText()
- ? ELF::STT_FUNC
- : ELF::STT_OBJECT));
+ Symbols.insert(
+ Symbols.begin(),
+ std::make_tuple(SectionAddr, SectionName,
+ Section.isText() ? ELF::STT_FUNC : ELF::STT_OBJECT));
}
SmallString<40> Comments;
@@ -1411,6 +1465,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
uint64_t Size;
uint64_t Index;
+ bool PrintedSection = false;
std::vector<RelocationRef>::const_iterator rel_cur = Rels.begin();
std::vector<RelocationRef>::const_iterator rel_end = Rels.end();
@@ -1435,13 +1490,24 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
continue;
}
+ // Skip if the user requested specific symbols and this is not in the list.
+ if (!DisasmFuncsSet.empty() &&
+ !DisasmFuncsSet.count(std::get<1>(Symbols[si])))
+ continue;
+
+ if (!PrintedSection) {
+ PrintedSection = true;
+ outs() << "Disassembly of section ";
+ if (!SegmentName.empty())
+ outs() << SegmentName << ",";
+ outs() << SectionName << ':';
+ }
+
// Stop disassembly at the stop address specified
if (End + SectionAddr > StopAddress)
End = StopAddress - SectionAddr;
if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) {
- // make size 4 bytes folded
- End = Start + ((End - Start) & ~0x3ull);
if (std::get<2>(Symbols[si]) == ELF::STT_AMDGPU_HSA_KERNEL) {
// skip amd_kernel_code_t at the beginning of kernel symbol (256 bytes)
Start += 256;
@@ -1458,7 +1524,32 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
}
}
- outs() << '\n' << std::get<1>(Symbols[si]) << ":\n";
+ auto PrintSymbol = [](StringRef Name) {
+ outs() << '\n' << Name << ":\n";
+ };
+ StringRef SymbolName = std::get<1>(Symbols[si]);
+ if (Demangle.getValue() == "" || Demangle.getValue() == "itanium") {
+ char *DemangledSymbol = nullptr;
+ size_t Size = 0;
+ int Status;
+ DemangledSymbol =
+ itaniumDemangle(SymbolName.data(), DemangledSymbol, &Size, &Status);
+ if (Status == 0)
+ PrintSymbol(StringRef(DemangledSymbol));
+ else
+ PrintSymbol(SymbolName);
+
+ if (Size != 0)
+ free(DemangledSymbol);
+ } else
+ PrintSymbol(SymbolName);
+
+ // Don't print raw contents of a virtual section. A virtual section
+ // doesn't have any contents in the file.
+ if (Section.isVirtual()) {
+ outs() << "...\n";
+ continue;
+ }
#ifndef NDEBUG
raw_ostream &DebugOut = DebugFlag ? dbgs() : nulls();
@@ -1560,7 +1651,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
}
Byte = Bytes.slice(Index)[0];
outs() << format(" %02x", Byte);
- AsciiData[NumBytes] = isprint(Byte) ? Byte : '.';
+ AsciiData[NumBytes] = isPrint(Byte) ? Byte : '.';
uint8_t IndentOffset = 0;
NumBytes++;
@@ -1594,7 +1685,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
PIP.printInst(*IP, Disassembled ? &Inst : nullptr,
Bytes.slice(Index, Size), SectionAddr + Index, outs(), "",
- *STI, &SP);
+ *STI, &SP, &Rels);
outs() << CommentStream.str();
Comments.clear();
@@ -1623,55 +1714,65 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
--SectionAddress;
TargetSectionSymbols = &AllSymbols[SectionAddress->second];
} else {
- TargetSectionSymbols = nullptr;
+ TargetSectionSymbols = &AbsoluteSymbols;
}
}
// Find the first symbol in the section whose offset is less than
- // or equal to the target.
- if (TargetSectionSymbols) {
- auto TargetSym = std::upper_bound(
- TargetSectionSymbols->begin(), TargetSectionSymbols->end(),
+ // or equal to the target. If there isn't a section that contains
+ // the target, find the nearest preceding absolute symbol.
+ auto TargetSym = std::upper_bound(
+ TargetSectionSymbols->begin(), TargetSectionSymbols->end(),
+ Target, [](uint64_t LHS,
+ const std::tuple<uint64_t, StringRef, uint8_t> &RHS) {
+ return LHS < std::get<0>(RHS);
+ });
+ if (TargetSym == TargetSectionSymbols->begin()) {
+ TargetSectionSymbols = &AbsoluteSymbols;
+ TargetSym = std::upper_bound(
+ AbsoluteSymbols.begin(), AbsoluteSymbols.end(),
Target, [](uint64_t LHS,
const std::tuple<uint64_t, StringRef, uint8_t> &RHS) {
- return LHS < std::get<0>(RHS);
- });
- if (TargetSym != TargetSectionSymbols->begin()) {
- --TargetSym;
- uint64_t TargetAddress = std::get<0>(*TargetSym);
- StringRef TargetName = std::get<1>(*TargetSym);
- outs() << " <" << TargetName;
- uint64_t Disp = Target - TargetAddress;
- if (Disp)
- outs() << "+0x" << Twine::utohexstr(Disp);
- outs() << '>';
- }
+ return LHS < std::get<0>(RHS);
+ });
+ }
+ if (TargetSym != TargetSectionSymbols->begin()) {
+ --TargetSym;
+ uint64_t TargetAddress = std::get<0>(*TargetSym);
+ StringRef TargetName = std::get<1>(*TargetSym);
+ outs() << " <" << TargetName;
+ uint64_t Disp = Target - TargetAddress;
+ if (Disp)
+ outs() << "+0x" << Twine::utohexstr(Disp);
+ outs() << '>';
}
}
}
outs() << "\n";
- // Print relocation for instruction.
- while (rel_cur != rel_end) {
- bool hidden = getHidden(*rel_cur);
- uint64_t addr = rel_cur->getOffset();
- SmallString<16> name;
- SmallString<32> val;
+ // Hexagon does this in its pretty printer.
+ if (Obj->getArch() != Triple::hexagon)
+ // Print relocation for instruction.
+ while (rel_cur != rel_end) {
+ bool hidden = getHidden(*rel_cur);
+ uint64_t addr = rel_cur->getOffset();
+ SmallString<16> name;
+ SmallString<32> val;
+
+ // If this relocation is hidden, skip it.
+ if (hidden || ((SectionAddr + addr) < StartAddress)) {
+ ++rel_cur;
+ continue;
+ }
- // If this relocation is hidden, skip it.
- if (hidden || ((SectionAddr + addr) < StartAddress)) {
+ // Stop when rel_cur's address is past the current instruction.
+ if (addr >= Index + Size) break;
+ rel_cur->getTypeName(name);
+ error(getRelocationValueString(*rel_cur, val));
+ outs() << format(Fmt.data(), SectionAddr + addr) << name
+ << "\t" << val << "\n";
++rel_cur;
- continue;
}
-
- // Stop when rel_cur's address is past the current instruction.
- if (addr >= Index + Size) break;
- rel_cur->getTypeName(name);
- error(getRelocationValueString(*rel_cur, val));
- outs() << format(Fmt.data(), SectionAddr + addr) << name
- << "\t" << val << "\n";
- ++rel_cur;
- }
}
}
}
@@ -1707,10 +1808,44 @@ void llvm::PrintRelocations(const ObjectFile *Obj) {
}
}
+void llvm::PrintDynamicRelocations(const ObjectFile *Obj) {
+
+ // For the moment, this option is for ELF only
+ if (!Obj->isELF())
+ return;
+
+ const auto *Elf = dyn_cast<ELFObjectFileBase>(Obj);
+
+ if (!Elf || Elf->getEType() != ELF::ET_DYN) {
+ error("not a dynamic object");
+ return;
+ }
+
+ StringRef Fmt = Obj->getBytesInAddress() > 4 ? "%016" PRIx64 : "%08" PRIx64;
+
+ std::vector<SectionRef> DynRelSec = Obj->dynamic_relocation_sections();
+ if (DynRelSec.empty())
+ return;
+
+ outs() << "DYNAMIC RELOCATION RECORDS\n";
+ for (const SectionRef &Section : DynRelSec) {
+ if (Section.relocation_begin() == Section.relocation_end())
+ continue;
+ for (const RelocationRef &Reloc : Section.relocations()) {
+ uint64_t address = Reloc.getOffset();
+ SmallString<32> relocname;
+ SmallString<32> valuestr;
+ Reloc.getTypeName(relocname);
+ error(getRelocationValueString(Reloc, valuestr));
+ outs() << format(Fmt.data(), address) << " " << relocname << " "
+ << valuestr << "\n";
+ }
+ }
+}
+
void llvm::PrintSectionHeaders(const ObjectFile *Obj) {
outs() << "Sections:\n"
"Idx Name Size Address Type\n";
- unsigned i = 0;
for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
StringRef Name;
error(Section.getName(Name));
@@ -1721,9 +1856,9 @@ void llvm::PrintSectionHeaders(const ObjectFile *Obj) {
bool BSS = Section.isBSS();
std::string Type = (std::string(Text ? "TEXT " : "") +
(Data ? "DATA " : "") + (BSS ? "BSS" : ""));
- outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %s\n", i,
- Name.str().c_str(), Size, Address, Type.c_str());
- ++i;
+ outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %s\n",
+ (unsigned)Section.getIndex(), Name.str().c_str(), Size,
+ Address, Type.c_str());
}
}
@@ -1764,7 +1899,7 @@ void llvm::PrintSectionContents(const ObjectFile *Obj) {
// Print ascii.
outs() << " ";
for (std::size_t i = 0; i < 16 && addr + i < end; ++i) {
- if (std::isprint(static_cast<unsigned char>(Contents[addr + i]) & 0xFF))
+ if (isPrint(static_cast<unsigned char>(Contents[addr + i]) & 0xFF))
outs() << Contents[addr + i];
else
outs() << ".";
@@ -2018,8 +2153,10 @@ static void printFaultMaps(const ObjectFile *Obj) {
}
static void printPrivateFileHeaders(const ObjectFile *o, bool onlyFirst) {
- if (o->isELF())
- return printELFFileHeader(o);
+ if (o->isELF()) {
+ printELFFileHeader(o);
+ return printELFDynamicSection(o);
+ }
if (o->isCOFF())
return printCOFFFileHeader(o);
if (o->isWasm())
@@ -2033,7 +2170,86 @@ static void printPrivateFileHeaders(const ObjectFile *o, bool onlyFirst) {
report_error(o->getFileName(), "Invalid/Unsupported object file format");
}
-static void DumpObject(ObjectFile *o, const Archive *a = nullptr) {
+static void printFileHeaders(const ObjectFile *o) {
+ if (!o->isELF() && !o->isCOFF())
+ report_error(o->getFileName(), "Invalid/Unsupported object file format");
+
+ Triple::ArchType AT = o->getArch();
+ outs() << "architecture: " << Triple::getArchTypeName(AT) << "\n";
+ Expected<uint64_t> StartAddrOrErr = o->getStartAddress();
+ if (!StartAddrOrErr)
+ report_error(o->getFileName(), StartAddrOrErr.takeError());
+ outs() << "start address: "
+ << format("0x%0*x", o->getBytesInAddress(), StartAddrOrErr.get())
+ << "\n";
+}
+
+static void printArchiveChild(StringRef Filename, const Archive::Child &C) {
+ Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
+ if (!ModeOrErr) {
+ errs() << "ill-formed archive entry.\n";
+ consumeError(ModeOrErr.takeError());
+ return;
+ }
+ sys::fs::perms Mode = ModeOrErr.get();
+ outs() << ((Mode & sys::fs::owner_read) ? "r" : "-");
+ outs() << ((Mode & sys::fs::owner_write) ? "w" : "-");
+ outs() << ((Mode & sys::fs::owner_exe) ? "x" : "-");
+ outs() << ((Mode & sys::fs::group_read) ? "r" : "-");
+ outs() << ((Mode & sys::fs::group_write) ? "w" : "-");
+ outs() << ((Mode & sys::fs::group_exe) ? "x" : "-");
+ outs() << ((Mode & sys::fs::others_read) ? "r" : "-");
+ outs() << ((Mode & sys::fs::others_write) ? "w" : "-");
+ outs() << ((Mode & sys::fs::others_exe) ? "x" : "-");
+
+ outs() << " ";
+
+ Expected<unsigned> UIDOrErr = C.getUID();
+ if (!UIDOrErr)
+ report_error(Filename, UIDOrErr.takeError());
+ unsigned UID = UIDOrErr.get();
+ outs() << format("%d/", UID);
+
+ Expected<unsigned> GIDOrErr = C.getGID();
+ if (!GIDOrErr)
+ report_error(Filename, GIDOrErr.takeError());
+ unsigned GID = GIDOrErr.get();
+ outs() << format("%-d ", GID);
+
+ Expected<uint64_t> Size = C.getRawSize();
+ if (!Size)
+ report_error(Filename, Size.takeError());
+ outs() << format("%6" PRId64, Size.get()) << " ";
+
+ StringRef RawLastModified = C.getRawLastModified();
+ unsigned Seconds;
+ if (RawLastModified.getAsInteger(10, Seconds))
+ outs() << "(date: \"" << RawLastModified
+ << "\" contains non-decimal chars) ";
+ else {
+ // Since ctime(3) returns a 26 character string of the form:
+ // "Sun Sep 16 01:03:52 1973\n\0"
+ // just print 24 characters.
+ time_t t = Seconds;
+ outs() << format("%.24s ", ctime(&t));
+ }
+
+ StringRef Name = "";
+ Expected<StringRef> NameOrErr = C.getName();
+ if (!NameOrErr) {
+ consumeError(NameOrErr.takeError());
+ Expected<StringRef> RawNameOrErr = C.getRawName();
+ if (!RawNameOrErr)
+ report_error(Filename, RawNameOrErr.takeError());
+ Name = RawNameOrErr.get();
+ } else {
+ Name = NameOrErr.get();
+ }
+ outs() << Name << "\n";
+}
+
+static void DumpObject(ObjectFile *o, const Archive *a = nullptr,
+ const Archive::Child *c = nullptr) {
StringRef ArchiveName = a != nullptr ? a->getFileName() : "";
// Avoid other output when using a raw option.
if (!RawClangAST) {
@@ -2045,10 +2261,14 @@ static void DumpObject(ObjectFile *o, const Archive *a = nullptr) {
outs() << ":\tfile format " << o->getFileFormatName() << "\n\n";
}
+ if (ArchiveHeaders && !MachOOpt)
+ printArchiveChild(a->getFileName(), *c);
if (Disassemble)
DisassembleObject(o, Relocations);
if (Relocations && !Disassemble)
PrintRelocations(o);
+ if (DynamicRelocations)
+ PrintDynamicRelocations(o);
if (SectionHeaders)
PrintSectionHeaders(o);
if (SectionContents)
@@ -2059,6 +2279,8 @@ static void DumpObject(ObjectFile *o, const Archive *a = nullptr) {
PrintUnwindInfo(o);
if (PrivateHeaders || FirstPrivateHeader)
printPrivateFileHeaders(o, FirstPrivateHeader);
+ if (FileHeaders)
+ printFileHeaders(o);
if (ExportsTrie)
printExportsTrie(o);
if (Rebase)
@@ -2082,7 +2304,8 @@ static void DumpObject(ObjectFile *o, const Archive *a = nullptr) {
}
}
-static void DumpObject(const COFFImportFile *I, const Archive *A) {
+static void DumpObject(const COFFImportFile *I, const Archive *A,
+ const Archive::Child *C = nullptr) {
StringRef ArchiveName = A ? A->getFileName() : "";
// Avoid other output when using a raw option.
@@ -2092,11 +2315,13 @@ static void DumpObject(const COFFImportFile *I, const Archive *A) {
<< ":\tfile format COFF-import-file"
<< "\n\n";
+ if (ArchiveHeaders && !MachOOpt)
+ printArchiveChild(A->getFileName(), *C);
if (SymbolTable)
printCOFFSymbolTable(I);
}
-/// @brief Dump each object file in \a a;
+/// Dump each object file in \a a.
static void DumpArchive(const Archive *a) {
Error Err = Error::success();
for (auto &C : a->children(Err)) {
@@ -2107,9 +2332,9 @@ static void DumpArchive(const Archive *a) {
continue;
}
if (ObjectFile *o = dyn_cast<ObjectFile>(&*ChildOrErr.get()))
- DumpObject(o, a);
+ DumpObject(o, a, &C);
else if (COFFImportFile *I = dyn_cast<COFFImportFile>(&*ChildOrErr.get()))
- DumpObject(I, a);
+ DumpObject(I, a, &C);
else
report_error(a->getFileName(), object_error::invalid_file_type);
}
@@ -2117,7 +2342,7 @@ static void DumpArchive(const Archive *a) {
report_error(a->getFileName(), std::move(Err));
}
-/// @brief Open file and figure out how to dump it.
+/// Open file and figure out how to dump it.
static void DumpInput(StringRef file) {
// If we are using the Mach-O specific object file parser, then let it parse
@@ -2143,10 +2368,7 @@ static void DumpInput(StringRef file) {
}
int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
// Initialize targets and assembly printers/parsers.
llvm::InitializeAllTargetInfos();
@@ -2165,15 +2387,25 @@ int main(int argc, char **argv) {
if (InputFilenames.size() == 0)
InputFilenames.push_back("a.out");
+ if (AllHeaders)
+ PrivateHeaders = Relocations = SectionHeaders = SymbolTable = true;
+
if (DisassembleAll || PrintSource || PrintLines)
Disassemble = true;
+
+ if (Demangle.getValue() != "none" && Demangle.getValue() != "" &&
+ Demangle.getValue() != "itanium")
+ warn("Unsupported demangling style");
+
if (!Disassemble
&& !Relocations
+ && !DynamicRelocations
&& !SectionHeaders
&& !SectionContents
&& !SymbolTable
&& !UnwindInfo
&& !PrivateHeaders
+ && !FileHeaders
&& !FirstPrivateHeader
&& !ExportsTrie
&& !Rebase
@@ -2182,7 +2414,7 @@ int main(int argc, char **argv) {
&& !WeakBind
&& !RawClangAST
&& !(UniversalHeaders && MachOOpt)
- && !(ArchiveHeaders && MachOOpt)
+ && !ArchiveHeaders
&& !(IndirectSymbols && MachOOpt)
&& !(DataInCode && MachOOpt)
&& !(LinkOptHints && MachOOpt)
@@ -2197,6 +2429,9 @@ int main(int argc, char **argv) {
return 2;
}
+ DisasmFuncsSet.insert(DisassembleFunctions.begin(),
+ DisassembleFunctions.end());
+
llvm::for_each(InputFilenames, DumpInput);
return EXIT_SUCCESS;
diff --git a/contrib/llvm/tools/llvm-objdump/llvm-objdump.h b/contrib/llvm/tools/llvm-objdump/llvm-objdump.h
index 2fcd506884b1..b2eb6e9d7771 100644
--- a/contrib/llvm/tools/llvm-objdump/llvm-objdump.h
+++ b/contrib/llvm/tools/llvm-objdump/llvm-objdump.h
@@ -30,13 +30,16 @@ namespace object {
extern cl::opt<std::string> TripleName;
extern cl::opt<std::string> ArchName;
extern cl::opt<std::string> MCPU;
+extern cl::opt<std::string> Demangle;
extern cl::list<std::string> MAttrs;
extern cl::list<std::string> FilterSections;
+extern cl::opt<bool> AllHeaders;
extern cl::opt<bool> Disassemble;
extern cl::opt<bool> DisassembleAll;
extern cl::opt<bool> NoShowRawInsn;
extern cl::opt<bool> NoLeadingAddr;
extern cl::opt<bool> PrivateHeaders;
+extern cl::opt<bool> FileHeaders;
extern cl::opt<bool> FirstPrivateHeader;
extern cl::opt<bool> ExportsTrie;
extern cl::opt<bool> Rebase;
@@ -56,6 +59,7 @@ extern cl::opt<bool> ObjcMetaData;
extern cl::opt<std::string> DisSymName;
extern cl::opt<bool> NonVerbose;
extern cl::opt<bool> Relocations;
+extern cl::opt<bool> DynamicRelocations;
extern cl::opt<bool> SectionHeaders;
extern cl::opt<bool> SectionContents;
extern cl::opt<bool> SymbolTable;
@@ -75,6 +79,7 @@ void printMachOBindTable(object::MachOObjectFile* o);
void printMachOLazyBindTable(object::MachOObjectFile* o);
void printMachOWeakBindTable(object::MachOObjectFile* o);
void printELFFileHeader(const object::ObjectFile *o);
+void printELFDynamicSection(const object::ObjectFile *Obj);
void printCOFFFileHeader(const object::ObjectFile *o);
void printCOFFSymbolTable(const object::COFFImportFile *i);
void printCOFFSymbolTable(const object::COFFObjectFile *o);
@@ -88,10 +93,12 @@ void printLazyBindTable(object::ObjectFile *o);
void printWeakBindTable(object::ObjectFile *o);
void printRawClangAST(const object::ObjectFile *o);
void PrintRelocations(const object::ObjectFile *o);
+void PrintDynamicRelocations(const object::ObjectFile *o);
void PrintSectionHeaders(const object::ObjectFile *o);
void PrintSectionContents(const object::ObjectFile *o);
void PrintSymbolTable(const object::ObjectFile *o, StringRef ArchiveName,
StringRef ArchitectureName = StringRef());
+void warn(StringRef Message);
LLVM_ATTRIBUTE_NORETURN void error(Twine Message);
LLVM_ATTRIBUTE_NORETURN void report_error(StringRef File, Twine Message);
LLVM_ATTRIBUTE_NORETURN void report_error(StringRef File, std::error_code EC);
diff --git a/contrib/llvm/tools/llvm-pdbutil/Analyze.cpp b/contrib/llvm/tools/llvm-pdbutil/Analyze.cpp
index 6c603dd8542b..974ab49d9440 100644
--- a/contrib/llvm/tools/llvm-pdbutil/Analyze.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/Analyze.cpp
@@ -125,7 +125,7 @@ Error AnalysisStyle::dump() {
const auto &Collisions = CollisionsIter->second;
outs() << TypeName << "\n";
- outs() << formatv(" [HEAD] {0:x} {1} {2}\n", A.second,
+ outs() << formatv(" [HEAD] {0:x} {1} {2}\n", uint32_t(A.second),
getLeafTypeName(HeadRecord.Type), TypeName);
for (const auto &Chain : Collisions) {
if (Chain.TI == TI)
diff --git a/contrib/llvm/tools/llvm-pdbutil/Diff.cpp b/contrib/llvm/tools/llvm-pdbutil/Diff.cpp
deleted file mode 100644
index 286dc51c29b6..000000000000
--- a/contrib/llvm/tools/llvm-pdbutil/Diff.cpp
+++ /dev/null
@@ -1,644 +0,0 @@
-//===- Diff.cpp - PDB diff utility ------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Diff.h"
-
-#include "DiffPrinter.h"
-#include "FormatUtil.h"
-#include "StreamUtil.h"
-#include "llvm-pdbutil.h"
-
-#include "llvm/ADT/StringSet.h"
-
-#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Native/Formatters.h"
-#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
-#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-
-#include "llvm/Support/FormatAdapters.h"
-#include "llvm/Support/FormatProviders.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Path.h"
-
-using namespace llvm;
-using namespace llvm::pdb;
-
-namespace {
-// Compare and format two stream numbers. Stream numbers are considered
-// identical if they contain the same value, equivalent if they are both
-// the invalid stream or neither is the invalid stream, and different if
-// one is the invalid stream and another isn't.
-struct StreamNumberProvider {
- static DiffResult compare(uint16_t L, uint16_t R) {
- if (L == R)
- return DiffResult::IDENTICAL;
- bool LP = L != kInvalidStreamIndex;
- bool RP = R != kInvalidStreamIndex;
- if (LP != RP)
- return DiffResult::DIFFERENT;
- return DiffResult::EQUIVALENT;
- }
-
- static std::string format(uint16_t SN, bool Right) {
- if (SN == kInvalidStreamIndex)
- return "(not present)";
- return formatv("{0}", SN).str();
- }
-};
-
-// Compares and formats two module indices. Modis are considered identical
-// if they are identical, equivalent if they either both contain a value or
-// both don't contain a value, and different if one contains a value and the
-// other doesn't.
-struct ModiProvider {
- DiffResult compare(Optional<uint32_t> L, Optional<uint32_t> R) {
- if (L == R)
- return DiffResult::IDENTICAL;
- if (L.hasValue() != R.hasValue())
- return DiffResult::DIFFERENT;
- return DiffResult::EQUIVALENT;
- }
-
- std::string format(Optional<uint32_t> Modi, bool Right) {
- if (!Modi.hasValue())
- return "(not present)";
- return formatv("{0}", *Modi).str();
- }
-};
-
-// Compares and formats two paths embedded in the PDB, ignoring the beginning
-// of the path if the user specified it as a "root path" on the command line.
-struct BinaryPathProvider {
- explicit BinaryPathProvider(uint32_t MaxLen) : MaxLen(MaxLen) {}
-
- DiffResult compare(StringRef L, StringRef R) {
- if (L == R)
- return DiffResult::IDENTICAL;
-
- SmallString<64> LN = removeRoot(L, false);
- SmallString<64> RN = removeRoot(R, true);
-
- return (LN.equals_lower(RN)) ? DiffResult::EQUIVALENT
- : DiffResult::DIFFERENT;
- }
-
- std::string format(StringRef S, bool Right) {
- if (S.empty())
- return "(empty)";
-
- SmallString<64> Native = removeRoot(S, Right);
- return truncateStringFront(Native.str(), MaxLen);
- }
-
- SmallString<64> removeRoot(StringRef Path, bool IsRight) const {
- SmallString<64> Native(Path);
- auto &RootOpt = IsRight ? opts::diff::RightRoot : opts::diff::LeftRoot;
- SmallString<64> Root(static_cast<std::string>(RootOpt));
- // pdb paths always use windows syntax, convert slashes to backslashes.
- sys::path::native(Root, sys::path::Style::windows);
- if (sys::path::has_stem(Root, sys::path::Style::windows))
- sys::path::append(Root, sys::path::Style::windows,
- sys::path::get_separator(sys::path::Style::windows));
-
- sys::path::replace_path_prefix(Native, Root, "", sys::path::Style::windows);
- return Native;
- }
- uint32_t MaxLen;
-};
-
-// Compare and format two stream purposes. For general streams, this just
-// compares the description. For module streams it uses the path comparison
-// algorithm taking into consideration the binary root, described above.
-// Formatting stream purposes just prints the stream purpose, except for
-// module streams and named streams, where it prefixes the name / module
-// with an identifier. Example:
-//
-// Named Stream "\names"
-// Module Stream "foo.obj"
-//
-// If a named stream is too long to fit in a column, it is truncated at the
-// end, and if a module is too long to fit in a column, it is truncated at the
-// beginning. Example:
-//
-// Named Stream "\Really Long Str..."
-// Module Stream "...puts\foo.obj"
-//
-struct StreamPurposeProvider {
- explicit StreamPurposeProvider(uint32_t MaxLen) : MaxLen(MaxLen) {}
-
- DiffResult compare(const StreamInfo &L, const StreamInfo &R) {
- if (L.getPurpose() != R.getPurpose())
- return DiffResult::DIFFERENT;
- if (L.getPurpose() == StreamPurpose::ModuleStream) {
- BinaryPathProvider PathProvider(MaxLen);
- return PathProvider.compare(L.getShortName(), R.getShortName());
- }
- return (L.getShortName() == R.getShortName()) ? DiffResult::IDENTICAL
- : DiffResult::DIFFERENT;
- }
-
- std::string format(const StreamInfo &P, bool Right) {
- if (P.getPurpose() == StreamPurpose::Other ||
- P.getPurpose() == StreamPurpose::Symbols)
- return truncateStringBack(P.getShortName(), MaxLen);
- if (P.getPurpose() == StreamPurpose::NamedStream)
- return truncateQuotedNameBack("Named Stream", P.getShortName(), MaxLen);
-
- assert(P.getPurpose() == StreamPurpose::ModuleStream);
- uint32_t ExtraChars = strlen("Module \"\"");
- BinaryPathProvider PathProvider(MaxLen - ExtraChars);
- std::string Result = PathProvider.format(P.getShortName(), Right);
- return formatv("Module \"{0}\"", Result);
- }
-
- uint32_t MaxLen;
-};
-} // namespace
-
-namespace llvm {
-template <> struct format_provider<PdbRaw_FeatureSig> {
- static void format(const PdbRaw_FeatureSig &Sig, raw_ostream &Stream,
- StringRef Style) {
- switch (Sig) {
- case PdbRaw_FeatureSig::MinimalDebugInfo:
- Stream << "MinimalDebugInfo";
- break;
- case PdbRaw_FeatureSig::NoTypeMerge:
- Stream << "NoTypeMerge";
- break;
- case PdbRaw_FeatureSig::VC110:
- Stream << "VC110";
- break;
- case PdbRaw_FeatureSig::VC140:
- Stream << "VC140";
- break;
- }
- }
-};
-}
-
-template <typename R> using ValueOfRange = llvm::detail::ValueOfRange<R>;
-
-DiffStyle::DiffStyle(PDBFile &File1, PDBFile &File2)
- : File1(File1), File2(File2) {}
-
-Error DiffStyle::dump() {
- if (auto EC = diffSuperBlock())
- return EC;
-
- if (auto EC = diffFreePageMap())
- return EC;
-
- if (auto EC = diffStreamDirectory())
- return EC;
-
- if (auto EC = diffStringTable())
- return EC;
-
- if (auto EC = diffInfoStream())
- return EC;
-
- if (auto EC = diffDbiStream())
- return EC;
-
- if (auto EC = diffSectionContribs())
- return EC;
-
- if (auto EC = diffSectionMap())
- return EC;
-
- if (auto EC = diffFpoStream())
- return EC;
-
- if (auto EC = diffTpiStream(StreamTPI))
- return EC;
-
- if (auto EC = diffTpiStream(StreamIPI))
- return EC;
-
- if (auto EC = diffPublics())
- return EC;
-
- if (auto EC = diffGlobals())
- return EC;
-
- return Error::success();
-}
-
-Error DiffStyle::diffSuperBlock() {
- DiffPrinter D(2, "MSF Super Block", 16, 20, opts::diff::PrintResultColumn,
- opts::diff::PrintValueColumns, outs());
- D.printExplicit("File", DiffResult::UNSPECIFIED,
- truncateStringFront(File1.getFilePath(), 18),
- truncateStringFront(File2.getFilePath(), 18));
- D.print("Block Size", File1.getBlockSize(), File2.getBlockSize());
- D.print("Block Count", File1.getBlockCount(), File2.getBlockCount());
- D.print("Unknown 1", File1.getUnknown1(), File2.getUnknown1());
- D.print("Directory Size", File1.getNumDirectoryBytes(),
- File2.getNumDirectoryBytes());
- return Error::success();
-}
-
-Error DiffStyle::diffStreamDirectory() {
- DiffPrinter D(2, "Stream Directory", 30, 20, opts::diff::PrintResultColumn,
- opts::diff::PrintValueColumns, outs());
- D.printExplicit("File", DiffResult::UNSPECIFIED,
- truncateStringFront(File1.getFilePath(), 18),
- truncateStringFront(File2.getFilePath(), 18));
-
- SmallVector<StreamInfo, 32> P;
- SmallVector<StreamInfo, 32> Q;
- discoverStreamPurposes(File1, P);
- discoverStreamPurposes(File2, Q);
- D.print("Stream Count", File1.getNumStreams(), File2.getNumStreams());
- auto PI = to_vector<32>(enumerate(P));
- auto QI = to_vector<32>(enumerate(Q));
-
- // Scan all streams in the left hand side, looking for ones that are also
- // in the right. Each time we find one, remove it. When we're done, Q
- // should contain all the streams that are in the right but not in the left.
- StreamPurposeProvider StreamProvider(28);
- for (const auto &P : PI) {
- typedef decltype(PI) ContainerType;
- typedef typename ContainerType::value_type value_type;
-
- auto Iter = llvm::find_if(QI, [P, &StreamProvider](const value_type &V) {
- DiffResult Result = StreamProvider.compare(P.value(), V.value());
- return Result == DiffResult::EQUIVALENT ||
- Result == DiffResult::IDENTICAL;
- });
-
- if (Iter == QI.end()) {
- D.printExplicit(StreamProvider.format(P.value(), false),
- DiffResult::DIFFERENT, P.index(), "(not present)");
- continue;
- }
-
- D.print<EquivalentDiffProvider>(StreamProvider.format(P.value(), false),
- P.index(), Iter->index());
- QI.erase(Iter);
- }
-
- for (const auto &Q : QI) {
- D.printExplicit(StreamProvider.format(Q.value(), true),
- DiffResult::DIFFERENT, "(not present)", Q.index());
- }
-
- return Error::success();
-}
-
-Error DiffStyle::diffStringTable() {
- DiffPrinter D(2, "String Table", 30, 20, opts::diff::PrintResultColumn,
- opts::diff::PrintValueColumns, outs());
- D.printExplicit("File", DiffResult::UNSPECIFIED,
- truncateStringFront(File1.getFilePath(), 18),
- truncateStringFront(File2.getFilePath(), 18));
-
- auto ExpectedST1 = File1.getStringTable();
- auto ExpectedST2 = File2.getStringTable();
- bool Has1 = !!ExpectedST1;
- bool Has2 = !!ExpectedST2;
- std::string Count1 = Has1 ? llvm::utostr(ExpectedST1->getNameCount())
- : "(string table not present)";
- std::string Count2 = Has2 ? llvm::utostr(ExpectedST2->getNameCount())
- : "(string table not present)";
- D.print("Number of Strings", Count1, Count2);
-
- if (!Has1 || !Has2) {
- consumeError(ExpectedST1.takeError());
- consumeError(ExpectedST2.takeError());
- return Error::success();
- }
-
- auto &ST1 = *ExpectedST1;
- auto &ST2 = *ExpectedST2;
-
- D.print("Hash Version", ST1.getHashVersion(), ST2.getHashVersion());
- D.print("Byte Size", ST1.getByteSize(), ST2.getByteSize());
- D.print("Signature", ST1.getSignature(), ST2.getSignature());
-
- // Both have a valid string table, dive in and compare individual strings.
-
- auto IdList1 = ST1.name_ids();
- auto IdList2 = ST2.name_ids();
- StringSet<> LS;
- StringSet<> RS;
- uint32_t Empty1 = 0;
- uint32_t Empty2 = 0;
- for (auto ID : IdList1) {
- auto S = ST1.getStringForID(ID);
- if (!S)
- return S.takeError();
- if (S->empty())
- ++Empty1;
- else
- LS.insert(*S);
- }
- for (auto ID : IdList2) {
- auto S = ST2.getStringForID(ID);
- if (!S)
- return S.takeError();
- if (S->empty())
- ++Empty2;
- else
- RS.insert(*S);
- }
- D.print("Empty Strings", Empty1, Empty2);
-
- for (const auto &S : LS) {
- auto R = RS.find(S.getKey());
- std::string Truncated = truncateStringMiddle(S.getKey(), 28);
- uint32_t I = cantFail(ST1.getIDForString(S.getKey()));
- if (R == RS.end()) {
- D.printExplicit(Truncated, DiffResult::DIFFERENT, I, "(not present)");
- continue;
- }
-
- uint32_t J = cantFail(ST2.getIDForString(R->getKey()));
- D.print<EquivalentDiffProvider>(Truncated, I, J);
- RS.erase(R);
- }
-
- for (const auto &S : RS) {
- auto L = LS.find(S.getKey());
- std::string Truncated = truncateStringMiddle(S.getKey(), 28);
- uint32_t J = cantFail(ST2.getIDForString(S.getKey()));
- if (L == LS.end()) {
- D.printExplicit(Truncated, DiffResult::DIFFERENT, "(not present)", J);
- continue;
- }
-
- uint32_t I = cantFail(ST1.getIDForString(L->getKey()));
- D.print<EquivalentDiffProvider>(Truncated, I, J);
- }
- return Error::success();
-}
-
-Error DiffStyle::diffFreePageMap() { return Error::success(); }
-
-Error DiffStyle::diffInfoStream() {
- DiffPrinter D(2, "PDB Stream", 22, 40, opts::diff::PrintResultColumn,
- opts::diff::PrintValueColumns, outs());
- D.printExplicit("File", DiffResult::UNSPECIFIED,
- truncateStringFront(File1.getFilePath(), 38),
- truncateStringFront(File2.getFilePath(), 38));
-
- auto ExpectedInfo1 = File1.getPDBInfoStream();
- auto ExpectedInfo2 = File2.getPDBInfoStream();
-
- bool Has1 = !!ExpectedInfo1;
- bool Has2 = !!ExpectedInfo2;
- if (!(Has1 && Has2)) {
- std::string L = Has1 ? "(present)" : "(not present)";
- std::string R = Has2 ? "(present)" : "(not present)";
- D.print("Stream", L, R);
-
- consumeError(ExpectedInfo1.takeError());
- consumeError(ExpectedInfo2.takeError());
- return Error::success();
- }
-
- auto &IS1 = *ExpectedInfo1;
- auto &IS2 = *ExpectedInfo2;
- D.print("Stream Size", IS1.getStreamSize(), IS2.getStreamSize());
- D.print("Age", IS1.getAge(), IS2.getAge());
- D.print("Guid", IS1.getGuid(), IS2.getGuid());
- D.print("Signature", IS1.getSignature(), IS2.getSignature());
- D.print("Version", IS1.getVersion(), IS2.getVersion());
- D.diffUnorderedArray("Feature", IS1.getFeatureSignatures(),
- IS2.getFeatureSignatures());
- D.print("Named Stream Size", IS1.getNamedStreamMapByteSize(),
- IS2.getNamedStreamMapByteSize());
- StringMap<uint32_t> NSL = IS1.getNamedStreams().getStringMap();
- StringMap<uint32_t> NSR = IS2.getNamedStreams().getStringMap();
- D.diffUnorderedMap<EquivalentDiffProvider>("Named Stream", NSL, NSR);
- return Error::success();
-}
-
-typedef std::pair<uint32_t, DbiModuleDescriptor> IndexedModuleDescriptor;
-typedef std::vector<IndexedModuleDescriptor> IndexedModuleDescriptorList;
-
-static IndexedModuleDescriptorList
-getModuleDescriptors(const DbiModuleList &ML) {
- IndexedModuleDescriptorList List;
- List.reserve(ML.getModuleCount());
- for (uint32_t I = 0; I < ML.getModuleCount(); ++I)
- List.emplace_back(I, ML.getModuleDescriptor(I));
- return List;
-}
-
-static IndexedModuleDescriptorList::iterator
-findOverrideEquivalentModule(uint32_t Modi,
- IndexedModuleDescriptorList &OtherList) {
- auto &EqMap = opts::diff::Equivalences;
-
- auto Iter = EqMap.find(Modi);
- if (Iter == EqMap.end())
- return OtherList.end();
-
- uint32_t EqValue = Iter->second;
-
- return llvm::find_if(OtherList,
- [EqValue](const IndexedModuleDescriptor &Desc) {
- return Desc.first == EqValue;
- });
-}
-
-static IndexedModuleDescriptorList::iterator
-findEquivalentModule(const IndexedModuleDescriptor &Item,
- IndexedModuleDescriptorList &OtherList, bool ItemIsRight) {
-
- if (!ItemIsRight) {
- uint32_t Modi = Item.first;
- auto OverrideIter = findOverrideEquivalentModule(Modi, OtherList);
- if (OverrideIter != OtherList.end())
- return OverrideIter;
- }
-
- BinaryPathProvider PathProvider(28);
-
- auto Iter = OtherList.begin();
- auto End = OtherList.end();
- for (; Iter != End; ++Iter) {
- const IndexedModuleDescriptor *Left = &Item;
- const IndexedModuleDescriptor *Right = &*Iter;
- if (ItemIsRight)
- std::swap(Left, Right);
- DiffResult Result = PathProvider.compare(Left->second.getModuleName(),
- Right->second.getModuleName());
- if (Result == DiffResult::EQUIVALENT || Result == DiffResult::IDENTICAL)
- return Iter;
- }
- return OtherList.end();
-}
-
-static void diffOneModule(DiffPrinter &D, const IndexedModuleDescriptor &Item,
- IndexedModuleDescriptorList &Other,
- bool ItemIsRight) {
- StreamPurposeProvider HeaderProvider(70);
- StreamInfo Info = StreamInfo::createModuleStream(
- Item.second.getModuleName(), Item.second.getModuleStreamIndex(),
- Item.first);
- D.printFullRow(HeaderProvider.format(Info, ItemIsRight));
-
- const auto *L = &Item;
-
- auto Iter = findEquivalentModule(Item, Other, ItemIsRight);
- if (Iter == Other.end()) {
- // We didn't find this module at all on the other side. Just print one row
- // and continue.
- if (ItemIsRight)
- D.print<ModiProvider>("- Modi", None, Item.first);
- else
- D.print<ModiProvider>("- Modi", Item.first, None);
- return;
- }
-
- // We did find this module. Go through and compare each field.
- const auto *R = &*Iter;
- if (ItemIsRight)
- std::swap(L, R);
-
- BinaryPathProvider PathProvider(28);
- D.print<ModiProvider>("- Modi", L->first, R->first);
- D.print<BinaryPathProvider>("- Obj File Name", L->second.getObjFileName(),
- R->second.getObjFileName(), PathProvider);
- D.print<StreamNumberProvider>("- Debug Stream",
- L->second.getModuleStreamIndex(),
- R->second.getModuleStreamIndex());
- D.print("- C11 Byte Size", L->second.getC11LineInfoByteSize(),
- R->second.getC11LineInfoByteSize());
- D.print("- C13 Byte Size", L->second.getC13LineInfoByteSize(),
- R->second.getC13LineInfoByteSize());
- D.print("- # of files", L->second.getNumberOfFiles(),
- R->second.getNumberOfFiles());
- D.print("- Pdb File Path Index", L->second.getPdbFilePathNameIndex(),
- R->second.getPdbFilePathNameIndex());
- D.print("- Source File Name Index", L->second.getSourceFileNameIndex(),
- R->second.getSourceFileNameIndex());
- D.print("- Symbol Byte Size", L->second.getSymbolDebugInfoByteSize(),
- R->second.getSymbolDebugInfoByteSize());
- Other.erase(Iter);
-}
-
-Error DiffStyle::diffDbiStream() {
- DiffPrinter D(2, "DBI Stream", 40, 30, opts::diff::PrintResultColumn,
- opts::diff::PrintValueColumns, outs());
- D.printExplicit("File", DiffResult::UNSPECIFIED,
- truncateStringFront(File1.getFilePath(), 28),
- truncateStringFront(File2.getFilePath(), 28));
-
- auto ExpectedDbi1 = File1.getPDBDbiStream();
- auto ExpectedDbi2 = File2.getPDBDbiStream();
-
- bool Has1 = !!ExpectedDbi1;
- bool Has2 = !!ExpectedDbi2;
- if (!(Has1 && Has2)) {
- std::string L = Has1 ? "(present)" : "(not present)";
- std::string R = Has2 ? "(present)" : "(not present)";
- D.print("Stream", L, R);
-
- consumeError(ExpectedDbi1.takeError());
- consumeError(ExpectedDbi2.takeError());
- return Error::success();
- }
-
- auto &DL = *ExpectedDbi1;
- auto &DR = *ExpectedDbi2;
-
- D.print("Dbi Version", (uint32_t)DL.getDbiVersion(),
- (uint32_t)DR.getDbiVersion());
- D.print("Age", DL.getAge(), DR.getAge());
- D.print("Machine", (uint16_t)DL.getMachineType(),
- (uint16_t)DR.getMachineType());
- D.print("Flags", DL.getFlags(), DR.getFlags());
- D.print("Build Major", DL.getBuildMajorVersion(), DR.getBuildMajorVersion());
- D.print("Build Minor", DL.getBuildMinorVersion(), DR.getBuildMinorVersion());
- D.print("Build Number", DL.getBuildNumber(), DR.getBuildNumber());
- D.print("PDB DLL Version", DL.getPdbDllVersion(), DR.getPdbDllVersion());
- D.print("PDB DLL RBLD", DL.getPdbDllRbld(), DR.getPdbDllRbld());
- D.print<StreamNumberProvider>("DBG (FPO)",
- DL.getDebugStreamIndex(DbgHeaderType::FPO),
- DR.getDebugStreamIndex(DbgHeaderType::FPO));
- D.print<StreamNumberProvider>(
- "DBG (Exception)", DL.getDebugStreamIndex(DbgHeaderType::Exception),
- DR.getDebugStreamIndex(DbgHeaderType::Exception));
- D.print<StreamNumberProvider>("DBG (Fixup)",
- DL.getDebugStreamIndex(DbgHeaderType::Fixup),
- DR.getDebugStreamIndex(DbgHeaderType::Fixup));
- D.print<StreamNumberProvider>(
- "DBG (OmapToSrc)", DL.getDebugStreamIndex(DbgHeaderType::OmapToSrc),
- DR.getDebugStreamIndex(DbgHeaderType::OmapToSrc));
- D.print<StreamNumberProvider>(
- "DBG (OmapFromSrc)", DL.getDebugStreamIndex(DbgHeaderType::OmapFromSrc),
- DR.getDebugStreamIndex(DbgHeaderType::OmapFromSrc));
- D.print<StreamNumberProvider>(
- "DBG (SectionHdr)", DL.getDebugStreamIndex(DbgHeaderType::SectionHdr),
- DR.getDebugStreamIndex(DbgHeaderType::SectionHdr));
- D.print<StreamNumberProvider>(
- "DBG (TokenRidMap)", DL.getDebugStreamIndex(DbgHeaderType::TokenRidMap),
- DR.getDebugStreamIndex(DbgHeaderType::TokenRidMap));
- D.print<StreamNumberProvider>("DBG (Xdata)",
- DL.getDebugStreamIndex(DbgHeaderType::Xdata),
- DR.getDebugStreamIndex(DbgHeaderType::Xdata));
- D.print<StreamNumberProvider>("DBG (Pdata)",
- DL.getDebugStreamIndex(DbgHeaderType::Pdata),
- DR.getDebugStreamIndex(DbgHeaderType::Pdata));
- D.print<StreamNumberProvider>("DBG (NewFPO)",
- DL.getDebugStreamIndex(DbgHeaderType::NewFPO),
- DR.getDebugStreamIndex(DbgHeaderType::NewFPO));
- D.print<StreamNumberProvider>(
- "DBG (SectionHdrOrig)",
- DL.getDebugStreamIndex(DbgHeaderType::SectionHdrOrig),
- DR.getDebugStreamIndex(DbgHeaderType::SectionHdrOrig));
- D.print<StreamNumberProvider>("Globals Stream",
- DL.getGlobalSymbolStreamIndex(),
- DR.getGlobalSymbolStreamIndex());
- D.print<StreamNumberProvider>("Publics Stream",
- DL.getPublicSymbolStreamIndex(),
- DR.getPublicSymbolStreamIndex());
- D.print<StreamNumberProvider>("Symbol Records", DL.getSymRecordStreamIndex(),
- DR.getSymRecordStreamIndex());
- D.print("Has CTypes", DL.hasCTypes(), DR.hasCTypes());
- D.print("Is Incrementally Linked", DL.isIncrementallyLinked(),
- DR.isIncrementallyLinked());
- D.print("Is Stripped", DL.isStripped(), DR.isStripped());
- const DbiModuleList &ML = DL.modules();
- const DbiModuleList &MR = DR.modules();
- D.print("Module Count", ML.getModuleCount(), MR.getModuleCount());
- D.print("Source File Count", ML.getSourceFileCount(),
- MR.getSourceFileCount());
- auto MDL = getModuleDescriptors(ML);
- auto MDR = getModuleDescriptors(MR);
- // Scan all module descriptors from the left, and look for corresponding
- // module descriptors on the right.
- for (const auto &L : MDL)
- diffOneModule(D, L, MDR, false);
-
- for (const auto &R : MDR)
- diffOneModule(D, R, MDL, true);
-
- return Error::success();
-}
-
-Error DiffStyle::diffSectionContribs() { return Error::success(); }
-
-Error DiffStyle::diffSectionMap() { return Error::success(); }
-
-Error DiffStyle::diffFpoStream() { return Error::success(); }
-
-Error DiffStyle::diffTpiStream(int Index) { return Error::success(); }
-
-Error DiffStyle::diffModuleInfoStream(int Index) { return Error::success(); }
-
-Error DiffStyle::diffPublics() { return Error::success(); }
-
-Error DiffStyle::diffGlobals() { return Error::success(); }
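Note on the removed Diff.cpp logic above: diffDbiStream() pairs module descriptors from the left PDB with descriptors from the right one, first through an explicit modi-override map and then by comparing module paths; every match is erased from the right-hand list so a second pass only reports right-only modules. A minimal standalone sketch of that two-pass matching scheme, using plain standard C++ (names and types here are illustrative stand-ins, not the removed LLVM code):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using IndexedModule = std::pair<uint32_t, std::string>; // (modi, module name)

    // First pass: for every left module, find an equivalent right module either
    // through an explicit index override or by name, print one row, and erase
    // the match so it cannot be reused.  Second pass: whatever is left on the
    // right side has no counterpart on the left.
    static void diffModuleLists(std::vector<IndexedModule> Left,
                                std::vector<IndexedModule> Right,
                                const std::map<uint32_t, uint32_t> &Overrides) {
      for (const IndexedModule &L : Left) {
        auto It = Right.end();
        auto Ov = Overrides.find(L.first);
        if (Ov != Overrides.end())
          It = std::find_if(Right.begin(), Right.end(),
                            [&](const IndexedModule &R) { return R.first == Ov->second; });
        if (It == Right.end())
          It = std::find_if(Right.begin(), Right.end(),
                            [&](const IndexedModule &R) { return R.second == L.second; });
        if (It == Right.end()) {
          std::cout << "only left : " << L.first << " " << L.second << "\n";
          continue;
        }
        std::cout << "matched   : " << L.first << " <-> " << It->first << "\n";
        Right.erase(It);
      }
      for (const IndexedModule &R : Right)
        std::cout << "only right: " << R.first << " " << R.second << "\n";
    }

    int main() {
      diffModuleLists({{0, "a.obj"}, {1, "b.obj"}}, {{0, "b.obj"}, {1, "c.obj"}},
                      {/* no overrides */});
    }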
diff --git a/contrib/llvm/tools/llvm-pdbutil/Diff.h b/contrib/llvm/tools/llvm-pdbutil/Diff.h
deleted file mode 100644
index 6037576e21bb..000000000000
--- a/contrib/llvm/tools/llvm-pdbutil/Diff.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===- Diff.h - PDB diff utility --------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVMPDBDUMP_DIFF_H
-#define LLVM_TOOLS_LLVMPDBDUMP_DIFF_H
-
-#include "OutputStyle.h"
-
-namespace llvm {
-namespace pdb {
-class PDBFile;
-class DiffStyle : public OutputStyle {
-public:
- explicit DiffStyle(PDBFile &File1, PDBFile &File2);
-
- Error dump() override;
-
-private:
- Error diffSuperBlock();
- Error diffStreamDirectory();
- Error diffStringTable();
- Error diffFreePageMap();
- Error diffInfoStream();
- Error diffDbiStream();
- Error diffSectionContribs();
- Error diffSectionMap();
- Error diffFpoStream();
- Error diffTpiStream(int Index);
- Error diffModuleInfoStream(int Index);
- Error diffPublics();
- Error diffGlobals();
-
- PDBFile &File1;
- PDBFile &File2;
-};
-}
-}
-
-#endif
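The deleted Diff.h declares one fallible step per PDB component; DiffStyle::dump() (not shown in this hunk) presumably runs them in order and stops at the first failure. A tiny sketch of that sequencing shape, with std::optional<std::string> standing in for llvm::Error (an assumption for illustration only):

    #include <functional>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    using Err = std::optional<std::string>; // stand-in for llvm::Error

    // Run each fallible step in order and stop at the first failure.
    static Err runAll(const std::vector<std::function<Err()>> &Steps) {
      for (const auto &Step : Steps)
        if (Err E = Step())
          return E;
      return std::nullopt;
    }

    int main() {
      Err Result = runAll({[] { return Err{}; },
                           [] { return Err{"DBI stream mismatch"}; }});
      if (Result)
        std::cerr << "diff failed: " << *Result << "\n";
    }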
diff --git a/contrib/llvm/tools/llvm-pdbutil/DiffPrinter.cpp b/contrib/llvm/tools/llvm-pdbutil/DiffPrinter.cpp
deleted file mode 100644
index dd61cc182593..000000000000
--- a/contrib/llvm/tools/llvm-pdbutil/DiffPrinter.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-
-#include "DiffPrinter.h"
-
-#include "llvm/Support/FormatAdapters.h"
-
-using namespace llvm;
-using namespace llvm::pdb;
-
-namespace {
-struct Colorize {
- Colorize(raw_ostream &OS, DiffResult Result) : OS(OS) {
- if (!OS.has_colors())
- return;
- switch (Result) {
- case DiffResult::IDENTICAL:
- OS.changeColor(raw_ostream::Colors::GREEN, false);
- break;
- case DiffResult::EQUIVALENT:
- OS.changeColor(raw_ostream::Colors::YELLOW, true);
- break;
- default:
- OS.changeColor(raw_ostream::Colors::RED, false);
- break;
- }
- }
-
- ~Colorize() {
- if (OS.has_colors())
- OS.resetColor();
- }
-
- raw_ostream &OS;
-};
-}
-
-DiffPrinter::DiffPrinter(uint32_t Indent, StringRef Header,
- uint32_t PropertyWidth, uint32_t FieldWidth,
- bool Result, bool Fields, raw_ostream &Stream)
- : PrintResult(Result), PrintValues(Fields), Indent(Indent),
- PropertyWidth(PropertyWidth), FieldWidth(FieldWidth), OS(Stream) {
- printHeaderRow();
- printFullRow(Header);
-}
-
-DiffPrinter::~DiffPrinter() {}
-
-uint32_t DiffPrinter::tableWidth() const {
- // `|`
- uint32_t W = 1;
-
- // `<width>|`
- W += PropertyWidth + 1;
-
- if (PrintResult) {
- // ` I |`
- W += 4;
- }
-
- if (PrintValues) {
- // `<width>|<width>|`
- W += 2 * (FieldWidth + 1);
- }
- return W;
-}
-
-void DiffPrinter::printFullRow(StringRef Text) {
- newLine();
- printValue(Text, DiffResult::UNSPECIFIED, AlignStyle::Center,
- tableWidth() - 2, true);
- printSeparatorRow();
-}
-
-void DiffPrinter::printSeparatorRow() {
- newLine();
- OS << formatv("{0}", fmt_repeat('-', PropertyWidth));
- if (PrintResult) {
- OS << '+';
- OS << formatv("{0}", fmt_repeat('-', 3));
- }
- if (PrintValues) {
- OS << '+';
- OS << formatv("{0}", fmt_repeat('-', FieldWidth));
- OS << '+';
- OS << formatv("{0}", fmt_repeat('-', FieldWidth));
- }
- OS << '|';
-}
-
-void DiffPrinter::printHeaderRow() {
- newLine('-');
- OS << formatv("{0}", fmt_repeat('-', tableWidth() - 1));
-}
-
-void DiffPrinter::newLine(char InitialChar) {
- OS << "\n";
- OS.indent(Indent) << InitialChar;
-}
-
-void DiffPrinter::printExplicit(StringRef Property, DiffResult C,
- StringRef Left, StringRef Right) {
- newLine();
- printValue(Property, DiffResult::UNSPECIFIED, AlignStyle::Right,
- PropertyWidth, true);
- printResult(C);
- printValue(Left, C, AlignStyle::Center, FieldWidth, false);
- printValue(Right, C, AlignStyle::Center, FieldWidth, false);
- printSeparatorRow();
-}
-
-void DiffPrinter::printResult(DiffResult Result) {
- if (!PrintResult)
- return;
- switch (Result) {
- case DiffResult::DIFFERENT:
- printValue("D", Result, AlignStyle::Center, 3, true);
- break;
- case DiffResult::EQUIVALENT:
- printValue("E", Result, AlignStyle::Center, 3, true);
- break;
- case DiffResult::IDENTICAL:
- printValue("I", Result, AlignStyle::Center, 3, true);
- break;
- case DiffResult::UNSPECIFIED:
- printValue(" ", Result, AlignStyle::Center, 3, true);
- break;
- }
-}
-
-void DiffPrinter::printValue(StringRef Value, DiffResult C, AlignStyle Style,
- uint32_t Width, bool Force) {
- if (!Force && !PrintValues)
- return;
-
- if (Style == AlignStyle::Right)
- --Width;
-
- std::string FormattedItem =
- formatv("{0}", fmt_align(Value, Style, Width)).str();
- if (C != DiffResult::UNSPECIFIED) {
- Colorize Color(OS, C);
- OS << FormattedItem;
- } else
- OS << FormattedItem;
- if (Style == AlignStyle::Right)
- OS << ' ';
- OS << '|';
-}
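The removed DiffPrinter computes its total table width from the property column, an optional three-character result column, and two optional value columns, each column followed by a '|'. A standalone sketch reproducing that arithmetic and the separator row it implies (a simplified stand-in, not the deleted class):

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Mirrors the width bookkeeping of the removed DiffPrinter: a leading '|',
    // then "<property>|", optionally " I |", optionally "<value>|<value>|".
    static uint32_t tableWidth(uint32_t PropertyWidth, uint32_t FieldWidth,
                               bool PrintResult, bool PrintValues) {
      uint32_t W = 1 + PropertyWidth + 1;
      if (PrintResult)
        W += 4;
      if (PrintValues)
        W += 2 * (FieldWidth + 1);
      return W;
    }

    static std::string separatorRow(uint32_t PropertyWidth, uint32_t FieldWidth,
                                    bool PrintResult, bool PrintValues) {
      std::string Row = "|" + std::string(PropertyWidth, '-');
      if (PrintResult)
        Row += "+---";
      if (PrintValues)
        Row += "+" + std::string(FieldWidth, '-') + "+" + std::string(FieldWidth, '-');
      Row += "|";
      return Row;
    }

    int main() {
      std::cout << tableWidth(40, 30, true, true) << "\n"   // 108 with both optional columns
                << separatorRow(10, 6, true, true) << "\n"; // |----------+---+------+------|
    }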
diff --git a/contrib/llvm/tools/llvm-pdbutil/DiffPrinter.h b/contrib/llvm/tools/llvm-pdbutil/DiffPrinter.h
deleted file mode 100644
index 475747d8dc11..000000000000
--- a/contrib/llvm/tools/llvm-pdbutil/DiffPrinter.h
+++ /dev/null
@@ -1,172 +0,0 @@
-//===- DiffPrinter.h ------------------------------------------ *- C++ --*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVMPDBDUMP_DIFFPRINTER_H
-#define LLVM_TOOLS_LLVMPDBDUMP_DIFFPRINTER_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <list>
-#include <unordered_set>
-
-namespace std {
-template <> struct hash<llvm::pdb::PdbRaw_FeatureSig> {
- typedef llvm::pdb::PdbRaw_FeatureSig argument_type;
- typedef std::size_t result_type;
- result_type operator()(argument_type Item) const {
- return std::hash<uint32_t>{}(uint32_t(Item));
- }
-};
-} // namespace std
-
-namespace llvm {
-namespace pdb {
-
-class PDBFile;
-
-enum class DiffResult { UNSPECIFIED, IDENTICAL, EQUIVALENT, DIFFERENT };
-
-struct IdenticalDiffProvider {
- template <typename T, typename U>
- DiffResult compare(const T &Left, const U &Right) {
- return (Left == Right) ? DiffResult::IDENTICAL : DiffResult::DIFFERENT;
- }
-
- template <typename T> std::string format(const T &Item, bool Right) {
- return formatv("{0}", Item).str();
- }
-};
-
-struct EquivalentDiffProvider {
- template <typename T, typename U>
- DiffResult compare(const T &Left, const U &Right) {
- return (Left == Right) ? DiffResult::IDENTICAL : DiffResult::EQUIVALENT;
- }
-
- template <typename T> std::string format(const T &Item, bool Right) {
- return formatv("{0}", Item).str();
- }
-};
-
-class DiffPrinter {
-public:
- DiffPrinter(uint32_t Indent, StringRef Header, uint32_t PropertyWidth,
- uint32_t FieldWidth, bool Result, bool Values,
- raw_ostream &Stream);
- ~DiffPrinter();
-
- template <typename T, typename U> struct Identical {};
-
- template <typename Provider = IdenticalDiffProvider, typename T, typename U>
- void print(StringRef Property, const T &Left, const U &Right,
- Provider P = Provider()) {
- std::string L = P.format(Left, false);
- std::string R = P.format(Right, true);
-
- DiffResult Result = P.compare(Left, Right);
- printExplicit(Property, Result, L, R);
- }
-
- void printExplicit(StringRef Property, DiffResult C, StringRef Left,
- StringRef Right);
-
- template <typename T, typename U>
- void printExplicit(StringRef Property, DiffResult C, const T &Left,
- const U &Right) {
- std::string L = formatv("{0}", Left).str();
- std::string R = formatv("{0}", Right).str();
- printExplicit(Property, C, StringRef(L), StringRef(R));
- }
-
- template <typename T, typename U>
- void diffUnorderedArray(StringRef Property, ArrayRef<T> Left,
- ArrayRef<U> Right) {
- std::unordered_set<T> LS(Left.begin(), Left.end());
- std::unordered_set<U> RS(Right.begin(), Right.end());
- std::string Count1 = formatv("{0} element(s)", Left.size());
- std::string Count2 = formatv("{0} element(s)", Right.size());
- print(std::string(Property) + "s (set)", Count1, Count2);
- for (const auto &L : LS) {
- auto Iter = RS.find(L);
- std::string Text = formatv("{0}", L).str();
- if (Iter == RS.end()) {
- print(Property, Text, "(not present)");
- continue;
- }
- print(Property, Text, Text);
- RS.erase(Iter);
- }
- for (const auto &R : RS) {
- auto Iter = LS.find(R);
- std::string Text = formatv("{0}", R).str();
- if (Iter == LS.end()) {
- print(Property, "(not present)", Text);
- continue;
- }
- print(Property, Text, Text);
- }
- }
-
- template <typename ValueProvider = IdenticalDiffProvider, typename T,
- typename U>
- void diffUnorderedMap(StringRef Property, const StringMap<T> &Left,
- const StringMap<U> &Right,
- ValueProvider P = ValueProvider()) {
- StringMap<U> RightCopy(Right);
-
- std::string Count1 = formatv("{0} element(s)", Left.size());
- std::string Count2 = formatv("{0} element(s)", Right.size());
- print(std::string(Property) + "s (map)", Count1, Count2);
-
- for (const auto &L : Left) {
- auto Iter = RightCopy.find(L.getKey());
- if (Iter == RightCopy.end()) {
- printExplicit(L.getKey(), DiffResult::DIFFERENT, L.getValue(),
- "(not present)");
- continue;
- }
-
- print(L.getKey(), L.getValue(), Iter->getValue(), P);
- RightCopy.erase(Iter);
- }
-
- for (const auto &R : RightCopy) {
- printExplicit(R.getKey(), DiffResult::DIFFERENT, "(not present)",
- R.getValue());
- }
- }
-
- void printFullRow(StringRef Text);
-
-private:
- uint32_t tableWidth() const;
-
- void printHeaderRow();
- void printSeparatorRow();
- void newLine(char InitialChar = '|');
- void printValue(StringRef Value, DiffResult C, AlignStyle Style,
- uint32_t Width, bool Force);
- void printResult(DiffResult Result);
-
- bool PrintResult;
- bool PrintValues;
- uint32_t Indent;
- uint32_t PropertyWidth;
- uint32_t FieldWidth;
- raw_ostream &OS;
-};
-} // namespace pdb
-} // namespace llvm
-
-#endif
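diffUnorderedMap above walks the left map, reports keys missing on the right, compares values for shared keys, erases each match from a copy of the right map, and finally reports right-only keys. The same pattern in a standalone sketch built on std::map rather than StringMap (illustrative only):

    #include <iostream>
    #include <map>
    #include <string>

    // Same shape as DiffPrinter::diffUnorderedMap: shared keys are compared,
    // keys present on only one side are reported as "(not present)".
    static void diffMaps(const std::map<std::string, int> &Left,
                         std::map<std::string, int> Right) {
      for (const auto &L : Left) {
        auto It = Right.find(L.first);
        if (It == Right.end()) {
          std::cout << L.first << ": " << L.second << " | (not present)\n";
          continue;
        }
        const char *Tag = (L.second == It->second) ? "identical" : "equivalent?";
        std::cout << L.first << ": " << L.second << " | " << It->second
                  << "  [" << Tag << "]\n";
        Right.erase(It);
      }
      for (const auto &R : Right)
        std::cout << R.first << ": (not present) | " << R.second << "\n";
    }

    int main() {
      diffMaps({{"/names", 13}, {"/src/headerblock", 46}},
               {{"/names", 13}, {"/LinkInfo", 5}});
    }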
diff --git a/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp b/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
index dd8436728baf..9e59adc71967 100644
--- a/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
@@ -90,7 +90,13 @@ Error DumpOutputStyle::dump() {
P.NewLine();
}
- if (opts::dump::DumpStringTable) {
+ if (opts::dump::DumpNamedStreams) {
+ if (auto EC = dumpNamedStreams())
+ return EC;
+ P.NewLine();
+ }
+
+ if (opts::dump::DumpStringTable || opts::dump::DumpStringTableDetails) {
if (auto EC = dumpStringTable())
return EC;
P.NewLine();
@@ -145,6 +151,11 @@ Error DumpOutputStyle::dump() {
}
}
+ if (opts::dump::DumpGSIRecords) {
+ if (auto EC = dumpGSIRecords())
+ return EC;
+ }
+
if (opts::dump::DumpGlobals) {
if (auto EC = dumpGlobals())
return EC;
@@ -434,6 +445,86 @@ static void iterateModuleSubsections(
});
}
+static Expected<std::pair<std::unique_ptr<MappedBlockStream>,
+ ArrayRef<llvm::object::coff_section>>>
+loadSectionHeaders(PDBFile &File, DbgHeaderType Type) {
+ if (!File.hasPDBDbiStream())
+ return make_error<StringError>(
+ "Section headers require a DBI Stream, which could not be loaded",
+ inconvertibleErrorCode());
+
+ auto &Dbi = cantFail(File.getPDBDbiStream());
+ uint32_t SI = Dbi.getDebugStreamIndex(Type);
+
+ if (SI == kInvalidStreamIndex)
+ return make_error<StringError>(
+ "PDB does not contain the requested image section header type",
+ inconvertibleErrorCode());
+
+ auto Stream = File.createIndexedStream(SI);
+ if (!Stream)
+ return make_error<StringError>("Could not load the required stream data",
+ inconvertibleErrorCode());
+
+ ArrayRef<object::coff_section> Headers;
+ if (Stream->getLength() % sizeof(object::coff_section) != 0)
+ return make_error<StringError>(
+ "Section header array size is not a multiple of section header size",
+ inconvertibleErrorCode());
+
+ uint32_t NumHeaders = Stream->getLength() / sizeof(object::coff_section);
+ BinaryStreamReader Reader(*Stream);
+ cantFail(Reader.readArray(Headers, NumHeaders));
+ return std::make_pair(std::move(Stream), Headers);
+}
+
+static std::vector<std::string> getSectionNames(PDBFile &File) {
+ auto ExpectedHeaders = loadSectionHeaders(File, DbgHeaderType::SectionHdr);
+ if (!ExpectedHeaders)
+ return {};
+
+ std::unique_ptr<MappedBlockStream> Stream;
+ ArrayRef<object::coff_section> Headers;
+ std::tie(Stream, Headers) = std::move(*ExpectedHeaders);
+ std::vector<std::string> Names;
+ for (const auto &H : Headers)
+ Names.push_back(H.Name);
+ return Names;
+}
+
+static void dumpSectionContrib(LinePrinter &P, const SectionContrib &SC,
+ ArrayRef<std::string> SectionNames,
+ uint32_t FieldWidth) {
+ std::string NameInsert;
+ if (SC.ISect > 0 && SC.ISect <= SectionNames.size()) {
+ StringRef SectionName = SectionNames[SC.ISect - 1];
+ NameInsert = formatv("[{0}]", SectionName).str();
+ } else
+ NameInsert = "[???]";
+ P.formatLine("SC{5} | mod = {2}, {0}, size = {1}, data crc = {3}, reloc "
+ "crc = {4}",
+ formatSegmentOffset(SC.ISect, SC.Off), fmtle(SC.Size),
+ fmtle(SC.Imod), fmtle(SC.DataCrc), fmtle(SC.RelocCrc),
+ fmt_align(NameInsert, AlignStyle::Left, FieldWidth + 2));
+ AutoIndent Indent(P, FieldWidth + 2);
+ P.formatLine(" {0}",
+ formatSectionCharacteristics(P.getIndentLevel() + 6,
+ SC.Characteristics, 3, " | "));
+}
+
+static void dumpSectionContrib(LinePrinter &P, const SectionContrib2 &SC,
+ ArrayRef<std::string> SectionNames,
+ uint32_t FieldWidth) {
+ P.formatLine("SC2[{6}] | mod = {2}, {0}, size = {1}, data crc = {3}, reloc "
+ "crc = {4}, coff section = {5}",
+ formatSegmentOffset(SC.Base.ISect, SC.Base.Off),
+ fmtle(SC.Base.Size), fmtle(SC.Base.Imod), fmtle(SC.Base.DataCrc),
+ fmtle(SC.Base.RelocCrc), fmtle(SC.ISectCoff));
+ P.formatLine(" {0}",
+ formatSectionCharacteristics(P.getIndentLevel() + 6,
+ SC.Base.Characteristics, 3, " | "));
+}
+
Error DumpOutputStyle::dumpModules() {
printHeader(P, "Modules");
AutoIndent Indent(P);
@@ -456,6 +547,10 @@ Error DumpOutputStyle::dumpModules() {
iterateSymbolGroups(
File, PrintScope{P, 11}, [&](uint32_t Modi, const SymbolGroup &Strings) {
auto Desc = Modules.getModuleDescriptor(Modi);
+ if (opts::dump::DumpSectionContribs) {
+ std::vector<std::string> Sections = getSectionNames(getPdb());
+ dumpSectionContrib(P, Desc.getSectionContrib(), Sections, 0);
+ }
P.formatLine("Obj: `{0}`: ", Desc.getObjFileName());
P.formatLine("debug stream: {0}, # files: {1}, has ec info: {2}",
Desc.getModuleStreamIndex(), Desc.getNumberOfFiles(),
@@ -848,14 +943,7 @@ Error DumpOutputStyle::dumpXme() {
return Error::success();
}
-Error DumpOutputStyle::dumpStringTable() {
- printHeader(P, "String Table");
-
- if (File.isObj()) {
- P.formatLine("Dumping string table is not supported for object files");
- return Error::success();
- }
-
+Error DumpOutputStyle::dumpStringTableFromPdb() {
AutoIndent Indent(P);
auto IS = getPdb().getStringTable();
if (!IS) {
@@ -864,37 +952,121 @@ Error DumpOutputStyle::dumpStringTable() {
return Error::success();
}
- if (IS->name_ids().empty()) {
- P.formatLine("Empty");
- return Error::success();
+ if (opts::dump::DumpStringTable) {
+ if (IS->name_ids().empty())
+ P.formatLine("Empty");
+ else {
+ auto MaxID =
+ std::max_element(IS->name_ids().begin(), IS->name_ids().end());
+ uint32_t Digits = NumDigits(*MaxID);
+
+ P.formatLine("{0} | {1}", fmt_align("ID", AlignStyle::Right, Digits),
+ "String");
+
+ std::vector<uint32_t> SortedIDs(IS->name_ids().begin(),
+ IS->name_ids().end());
+ llvm::sort(SortedIDs.begin(), SortedIDs.end());
+ for (uint32_t I : SortedIDs) {
+ auto ES = IS->getStringForID(I);
+ llvm::SmallString<32> Str;
+ if (!ES) {
+ consumeError(ES.takeError());
+ Str = "Error reading string";
+ } else if (!ES->empty()) {
+ Str.append("'");
+ Str.append(*ES);
+ Str.append("'");
+ }
+
+ if (!Str.empty())
+ P.formatLine("{0} | {1}", fmt_align(I, AlignStyle::Right, Digits),
+ Str);
+ }
+ }
}
- auto MaxID = std::max_element(IS->name_ids().begin(), IS->name_ids().end());
- uint32_t Digits = NumDigits(*MaxID);
-
- P.formatLine("{0} | {1}", fmt_align("ID", AlignStyle::Right, Digits),
- "String");
-
- std::vector<uint32_t> SortedIDs(IS->name_ids().begin(), IS->name_ids().end());
- std::sort(SortedIDs.begin(), SortedIDs.end());
- for (uint32_t I : SortedIDs) {
- auto ES = IS->getStringForID(I);
- llvm::SmallString<32> Str;
- if (!ES) {
- consumeError(ES.takeError());
- Str = "Error reading string";
- } else if (!ES->empty()) {
- Str.append("'");
- Str.append(*ES);
- Str.append("'");
+ if (opts::dump::DumpStringTableDetails) {
+ P.NewLine();
+ {
+ P.printLine("String Table Header:");
+ AutoIndent Indent(P);
+ P.formatLine("Signature: {0}", IS->getSignature());
+ P.formatLine("Hash Version: {0}", IS->getHashVersion());
+ P.formatLine("Name Buffer Size: {0}", IS->getByteSize());
+ P.NewLine();
}
- if (!Str.empty())
- P.formatLine("{0} | {1}", fmt_align(I, AlignStyle::Right, Digits), Str);
+ BinaryStreamRef NameBuffer = IS->getStringTable().getBuffer();
+ ArrayRef<uint8_t> Contents;
+ cantFail(NameBuffer.readBytes(0, NameBuffer.getLength(), Contents));
+ P.formatBinary("Name Buffer", Contents, 0);
+ P.NewLine();
+ {
+ P.printLine("Hash Table:");
+ AutoIndent Indent(P);
+ P.formatLine("Bucket Count: {0}", IS->name_ids().size());
+ for (const auto &Entry : enumerate(IS->name_ids()))
+ P.formatLine("Bucket[{0}] : {1}", Entry.index(),
+ uint32_t(Entry.value()));
+ P.formatLine("Name Count: {0}", IS->getNameCount());
+ }
}
return Error::success();
}
+Error DumpOutputStyle::dumpStringTableFromObj() {
+ iterateModuleSubsections<DebugStringTableSubsectionRef>(
+ File, PrintScope{P, 4},
+ [&](uint32_t Modi, const SymbolGroup &Strings,
+ DebugStringTableSubsectionRef &Strings2) {
+ BinaryStreamRef StringTableBuffer = Strings2.getBuffer();
+ BinaryStreamReader Reader(StringTableBuffer);
+ while (Reader.bytesRemaining() > 0) {
+ StringRef Str;
+ uint32_t Offset = Reader.getOffset();
+ cantFail(Reader.readCString(Str));
+ if (Str.empty())
+ continue;
+
+ P.formatLine("{0} | {1}", fmt_align(Offset, AlignStyle::Right, 4),
+ Str);
+ }
+ });
+ return Error::success();
+}
+
+Error DumpOutputStyle::dumpNamedStreams() {
+ printHeader(P, "Named Streams");
+ AutoIndent Indent(P, 2);
+
+ if (File.isObj()) {
+ P.formatLine("Dumping Named Streams is only supported for PDB files.");
+ return Error::success();
+ }
+ ExitOnError Err("Invalid PDB File: ");
+
+ auto &IS = Err(File.pdb().getPDBInfoStream());
+ const NamedStreamMap &NS = IS.getNamedStreams();
+ for (const auto &Entry : NS.entries()) {
+ P.printLine(Entry.getKey());
+ AutoIndent Indent2(P, 2);
+ P.formatLine("Index: {0}", Entry.getValue());
+ P.formatLine("Size in bytes: {0}",
+ File.pdb().getStreamByteSize(Entry.getValue()));
+ }
+
+ return Error::success();
+}
+
+Error DumpOutputStyle::dumpStringTable() {
+ printHeader(P, "String Table");
+
+ if (File.isPdb())
+ return dumpStringTableFromPdb();
+
+ return dumpStringTableFromObj();
+}
+
static void buildDepSet(LazyRandomTypeCollection &Types,
ArrayRef<TypeIndex> Indices,
std::map<TypeIndex, CVType> &DepSet) {
@@ -975,8 +1147,15 @@ Error DumpOutputStyle::dumpTypesFromObjectFile() {
if (auto EC = S.getName(SectionName))
return errorCodeToError(EC);
- if (SectionName != ".debug$T")
+ // .debug$T is a standard CodeView type section, while .debug$P is the same
+ // format but used for MSVC precompiled header object files.
+ if (SectionName == ".debug$T")
+ printHeader(P, "Types (.debug$T)");
+ else if (SectionName == ".debug$P")
+ printHeader(P, "Precompiled Types (.debug$P)");
+ else
continue;
+
StringRef Contents;
if (auto EC = S.getContents(Contents))
return errorCodeToError(EC);
@@ -1124,6 +1303,7 @@ Error DumpOutputStyle::dumpModuleSymsForObj() {
File, PrintScope{P, 2},
[&](uint32_t Modi, const SymbolGroup &Strings,
DebugSymbolsSubsectionRef &Symbols) {
+ Dumper.setSymbolGroup(&Strings);
for (auto Symbol : Symbols) {
if (auto EC = Visitor.visitSymbolRecord(Symbol)) {
SymbolError = llvm::make_unique<Error>(std::move(EC));
@@ -1165,8 +1345,8 @@ Error DumpOutputStyle::dumpModuleSymsForPdb() {
SymbolVisitorCallbackPipeline Pipeline;
SymbolDeserializer Deserializer(nullptr, CodeViewContainer::Pdb);
- MinimalSymbolDumper Dumper(P, opts::dump::DumpSymRecordBytes, Ids,
- Types);
+ MinimalSymbolDumper Dumper(P, opts::dump::DumpSymRecordBytes, Strings,
+ Ids, Types);
Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(Dumper);
@@ -1182,6 +1362,39 @@ Error DumpOutputStyle::dumpModuleSymsForPdb() {
return Error::success();
}
+Error DumpOutputStyle::dumpGSIRecords() {
+ printHeader(P, "GSI Records");
+ AutoIndent Indent(P);
+
+ if (File.isObj()) {
+ P.formatLine("Dumping Globals is not supported for object files");
+ return Error::success();
+ }
+
+ if (!getPdb().hasPDBSymbolStream()) {
+ P.formatLine("GSI Common Symbol Stream not present");
+ return Error::success();
+ }
+
+ auto &Records = cantFail(getPdb().getPDBSymbolStream());
+ auto &Types = File.types();
+ auto &Ids = File.ids();
+
+ P.printLine("Records");
+ SymbolVisitorCallbackPipeline Pipeline;
+ SymbolDeserializer Deserializer(nullptr, CodeViewContainer::Pdb);
+ MinimalSymbolDumper Dumper(P, opts::dump::DumpSymRecordBytes, Ids, Types);
+
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(Dumper);
+ CVSymbolVisitor Visitor(Pipeline);
+
+ BinaryStreamRef SymStream = Records.getSymbolArray().getUnderlyingStream();
+ if (auto E = Visitor.visitSymbolStream(Records.getSymbolArray(), 0))
+ return E;
+ return Error::success();
+}
+
Error DumpOutputStyle::dumpGlobals() {
printHeader(P, "Global Symbols");
AutoIndent Indent(P);
@@ -1287,6 +1500,7 @@ Error DumpOutputStyle::dumpSymbolsFromGSI(const GSIHashTable &Table,
Pipeline.addCallbackToPipeline(Dumper);
CVSymbolVisitor Visitor(Pipeline);
+
BinaryStreamRef SymStream =
ExpectedSyms->getSymbolArray().getUnderlyingStream();
for (uint32_t PubSymOff : Table) {
@@ -1299,24 +1513,23 @@ Error DumpOutputStyle::dumpSymbolsFromGSI(const GSIHashTable &Table,
}
// Return early if we aren't dumping public hash table and address map info.
- if (!HashExtras)
- return Error::success();
-
- P.formatLine("Hash Entries");
- {
- AutoIndent Indent2(P);
- for (const PSHashRecord &HR : Table.HashRecords)
- P.formatLine("off = {0}, refcnt = {1}", uint32_t(HR.Off),
- uint32_t(HR.CRef));
- }
+ if (HashExtras) {
+ P.formatBinary("Hash Bitmap", Table.HashBitmap, 0);
- // FIXME: Dump the bitmap.
+ P.formatLine("Hash Entries");
+ {
+ AutoIndent Indent2(P);
+ for (const PSHashRecord &HR : Table.HashRecords)
+ P.formatLine("off = {0}, refcnt = {1}", uint32_t(HR.Off),
+ uint32_t(HR.CRef));
+ }
- P.formatLine("Hash Buckets");
- {
- AutoIndent Indent2(P);
- for (uint32_t Hash : Table.HashBuckets)
- P.formatLine("{0:x8}", Hash);
+ P.formatLine("Hash Buckets");
+ {
+ AutoIndent Indent2(P);
+ for (uint32_t Hash : Table.HashBuckets)
+ P.formatLine("{0:x8}", Hash);
+ }
}
return Error::success();
@@ -1344,39 +1557,6 @@ Error DumpOutputStyle::dumpSectionHeaders() {
return Error::success();
}
-static Expected<std::pair<std::unique_ptr<MappedBlockStream>,
- ArrayRef<llvm::object::coff_section>>>
-loadSectionHeaders(PDBFile &File, DbgHeaderType Type) {
- if (!File.hasPDBDbiStream())
- return make_error<StringError>(
- "Section headers require a DBI Stream, which could not be loaded",
- inconvertibleErrorCode());
-
- auto &Dbi = cantFail(File.getPDBDbiStream());
- uint32_t SI = Dbi.getDebugStreamIndex(Type);
-
- if (SI == kInvalidStreamIndex)
- return make_error<StringError>(
- "PDB does not contain the requested image section header type",
- inconvertibleErrorCode());
-
- auto Stream = File.createIndexedStream(SI);
- if (!Stream)
- return make_error<StringError>("Could not load the required stream data",
- inconvertibleErrorCode());
-
- ArrayRef<object::coff_section> Headers;
- if (Stream->getLength() % sizeof(object::coff_section) != 0)
- return make_error<StringError>(
- "Section header array size is not a multiple of section header size",
- inconvertibleErrorCode());
-
- uint32_t NumHeaders = Stream->getLength() / sizeof(object::coff_section);
- BinaryStreamReader Reader(*Stream);
- cantFail(Reader.readArray(Headers, NumHeaders));
- return std::make_pair(std::move(Stream), Headers);
-}
-
void DumpOutputStyle::dumpSectionHeaders(StringRef Label, DbgHeaderType Type) {
printHeader(P, Label);
@@ -1423,20 +1603,6 @@ void DumpOutputStyle::dumpSectionHeaders(StringRef Label, DbgHeaderType Type) {
return;
}
-std::vector<std::string> getSectionNames(PDBFile &File) {
- auto ExpectedHeaders = loadSectionHeaders(File, DbgHeaderType::SectionHdr);
- if (!ExpectedHeaders)
- return {};
-
- std::unique_ptr<MappedBlockStream> Stream;
- ArrayRef<object::coff_section> Headers;
- std::tie(Stream, Headers) = std::move(*ExpectedHeaders);
- std::vector<std::string> Names;
- for (const auto &H : Headers)
- Names.push_back(H.Name);
- return Names;
-}
-
Error DumpOutputStyle::dumpSectionContribs() {
printHeader(P, "Section Contributions");
@@ -1465,33 +1631,10 @@ Error DumpOutputStyle::dumpSectionContribs() {
MaxNameLen = (Max == Names.end() ? 0 : Max->size());
}
void visit(const SectionContrib &SC) override {
- assert(SC.ISect > 0);
- std::string NameInsert;
- if (SC.ISect < Names.size()) {
- StringRef SectionName = Names[SC.ISect - 1];
- NameInsert = formatv("[{0}]", SectionName).str();
- } else
- NameInsert = "[???]";
- P.formatLine("SC{5} | mod = {2}, {0}, size = {1}, data crc = {3}, reloc "
- "crc = {4}",
- formatSegmentOffset(SC.ISect, SC.Off), fmtle(SC.Size),
- fmtle(SC.Imod), fmtle(SC.DataCrc), fmtle(SC.RelocCrc),
- fmt_align(NameInsert, AlignStyle::Left, MaxNameLen + 2));
- AutoIndent Indent(P, MaxNameLen + 2);
- P.formatLine(" {0}",
- formatSectionCharacteristics(P.getIndentLevel() + 6,
- SC.Characteristics, 3, " | "));
+ dumpSectionContrib(P, SC, Names, MaxNameLen);
}
void visit(const SectionContrib2 &SC) override {
- P.formatLine(
- "SC2[{6}] | mod = {2}, {0}, size = {1}, data crc = {3}, reloc "
- "crc = {4}, coff section = {5}",
- formatSegmentOffset(SC.Base.ISect, SC.Base.Off), fmtle(SC.Base.Size),
- fmtle(SC.Base.Imod), fmtle(SC.Base.DataCrc), fmtle(SC.Base.RelocCrc),
- fmtle(SC.ISectCoff));
- P.formatLine(" {0}", formatSectionCharacteristics(
- P.getIndentLevel() + 6,
- SC.Base.Characteristics, 3, " | "));
+ dumpSectionContrib(P, SC, Names, MaxNameLen);
}
private:
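The hoisted loadSectionHeaders() in this hunk refuses to reinterpret a stream as a section-header array unless its byte length is an exact multiple of the record size. That guard in miniature, against a made-up 40-byte record type standing in for object::coff_section:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <vector>

    struct SectionHeaderStub {       // stand-in roughly matching coff_section's 40 bytes
      char     Name[8];
      uint32_t Fields[8];
    };

    // Only report a count when the buffer holds a whole number of records;
    // otherwise the array is considered malformed.
    static std::optional<size_t> recordCount(const std::vector<uint8_t> &Bytes,
                                             size_t RecordSize) {
      if (RecordSize == 0 || Bytes.size() % RecordSize != 0)
        return std::nullopt;                       // partial record at the end
      return Bytes.size() / RecordSize;
    }

    int main() {
      std::vector<uint8_t> Stream(3 * sizeof(SectionHeaderStub));
      if (auto N = recordCount(Stream, sizeof(SectionHeaderStub)))
        std::cout << *N << " section headers\n";
      else
        std::cout << "section header array size is not a multiple of record size\n";
    }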
diff --git a/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.h b/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.h
index 3ce2884b2712..e7e9252f2fa9 100644
--- a/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.h
+++ b/contrib/llvm/tools/llvm-pdbutil/DumpOutputStyle.h
@@ -74,7 +74,10 @@ private:
Error dumpStreamSummary();
Error dumpSymbolStats();
Error dumpUdtStats();
+ Error dumpNamedStreams();
Error dumpStringTable();
+ Error dumpStringTableFromPdb();
+ Error dumpStringTableFromObj();
Error dumpLines();
Error dumpInlineeLines();
Error dumpXmi();
@@ -85,6 +88,7 @@ private:
Error dumpModuleFiles();
Error dumpModuleSymsForPdb();
Error dumpModuleSymsForObj();
+ Error dumpGSIRecords();
Error dumpGlobals();
Error dumpPublics();
Error dumpSymbolsFromGSI(const GSIHashTable &Table, bool HashExtras);
diff --git a/contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp b/contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp
new file mode 100644
index 000000000000..d16bfa480e1d
--- /dev/null
+++ b/contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp
@@ -0,0 +1,469 @@
+//===- ExplainOutputStyle.cpp --------------------------------- *- C++ --*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ExplainOutputStyle.h"
+
+#include "FormatUtil.h"
+#include "InputFile.h"
+#include "StreamUtil.h"
+#include "llvm-pdbutil.h"
+
+#include "llvm/DebugInfo/CodeView/Formatters.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/Error.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+ExplainOutputStyle::ExplainOutputStyle(InputFile &File, uint64_t FileOffset)
+ : File(File), FileOffset(FileOffset), P(2, false, outs()) {}
+
+Error ExplainOutputStyle::dump() {
+ P.formatLine("Explaining file offset {0} of file '{1}'.", FileOffset,
+ File.getFilePath());
+
+ if (File.isPdb())
+ return explainPdbFile();
+
+ return explainBinaryFile();
+}
+
+Error ExplainOutputStyle::explainPdbFile() {
+ bool IsAllocated = explainPdbBlockStatus();
+ if (!IsAllocated)
+ return Error::success();
+
+ AutoIndent Indent(P);
+ if (isPdbSuperBlock())
+ explainPdbSuperBlockOffset();
+ else if (isPdbFpmBlock())
+ explainPdbFpmBlockOffset();
+ else if (isPdbBlockMapBlock())
+ explainPdbBlockMapOffset();
+ else if (isPdbStreamDirectoryBlock())
+ explainPdbStreamDirectoryOffset();
+ else if (auto Index = getPdbBlockStreamIndex())
+ explainPdbStreamOffset(*Index);
+ else
+ explainPdbUnknownBlock();
+ return Error::success();
+}
+
+Error ExplainOutputStyle::explainBinaryFile() {
+ std::unique_ptr<BinaryByteStream> Stream =
+ llvm::make_unique<BinaryByteStream>(File.unknown().getBuffer(),
+ llvm::support::little);
+ switch (opts::explain::InputType) {
+ case opts::explain::InputFileType::DBIStream: {
+ DbiStream Dbi(std::move(Stream));
+ if (auto EC = Dbi.reload(nullptr))
+ return EC;
+ explainStreamOffset(Dbi, FileOffset);
+ break;
+ }
+ case opts::explain::InputFileType::PDBStream: {
+ InfoStream Info(std::move(Stream));
+ if (auto EC = Info.reload())
+ return EC;
+ explainStreamOffset(Info, FileOffset);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid input file type!");
+ }
+ return Error::success();
+}
+
+uint32_t ExplainOutputStyle::pdbBlockIndex() const {
+ return FileOffset / File.pdb().getBlockSize();
+}
+
+uint32_t ExplainOutputStyle::pdbBlockOffset() const {
+ uint64_t BlockStart = pdbBlockIndex() * File.pdb().getBlockSize();
+ assert(FileOffset >= BlockStart);
+ return FileOffset - BlockStart;
+}
+
+bool ExplainOutputStyle::isPdbSuperBlock() const {
+ return pdbBlockIndex() == 0;
+}
+
+bool ExplainOutputStyle::isPdbFpm1() const {
+ return ((pdbBlockIndex() - 1) % File.pdb().getBlockSize() == 0);
+}
+bool ExplainOutputStyle::isPdbFpm2() const {
+ return ((pdbBlockIndex() - 2) % File.pdb().getBlockSize() == 0);
+}
+
+bool ExplainOutputStyle::isPdbFpmBlock() const {
+ return isPdbFpm1() || isPdbFpm2();
+}
+
+bool ExplainOutputStyle::isPdbBlockMapBlock() const {
+ return pdbBlockIndex() == File.pdb().getBlockMapIndex();
+}
+
+bool ExplainOutputStyle::isPdbStreamDirectoryBlock() const {
+ const auto &Layout = File.pdb().getMsfLayout();
+ return llvm::is_contained(Layout.DirectoryBlocks, pdbBlockIndex());
+}
+
+Optional<uint32_t> ExplainOutputStyle::getPdbBlockStreamIndex() const {
+ const auto &Layout = File.pdb().getMsfLayout();
+ for (const auto &Entry : enumerate(Layout.StreamMap)) {
+ if (!llvm::is_contained(Entry.value(), pdbBlockIndex()))
+ continue;
+ return Entry.index();
+ }
+ return None;
+}
+
+bool ExplainOutputStyle::explainPdbBlockStatus() {
+ if (FileOffset >= File.pdb().getFileSize()) {
+ P.formatLine("Address {0} is not in the file (file size = {1}).",
+ FileOffset, File.pdb().getFileSize());
+ return false;
+ }
+ P.formatLine("Block:Offset = {2:X-}:{1:X-4}.", FileOffset, pdbBlockOffset(),
+ pdbBlockIndex());
+
+ bool IsFree = File.pdb().getMsfLayout().FreePageMap[pdbBlockIndex()];
+ P.formatLine("Address is in block {0} ({1}allocated).", pdbBlockIndex(),
+ IsFree ? "un" : "");
+ return !IsFree;
+}
+
+#define endof(Class, Field) (offsetof(Class, Field) + sizeof(Class::Field))
+
+void ExplainOutputStyle::explainPdbSuperBlockOffset() {
+ P.formatLine("This corresponds to offset {0} of the MSF super block, ",
+ pdbBlockOffset());
+ if (pdbBlockOffset() < endof(SuperBlock, MagicBytes))
+ P.printLine("which is part of the MSF file magic.");
+ else if (pdbBlockOffset() < endof(SuperBlock, BlockSize)) {
+ P.printLine("which contains the block size of the file.");
+ P.formatLine("The current value is {0}.",
+ uint32_t(File.pdb().getMsfLayout().SB->BlockSize));
+ } else if (pdbBlockOffset() < endof(SuperBlock, FreeBlockMapBlock)) {
+ P.printLine("which contains the index of the FPM block (e.g. 1 or 2).");
+ P.formatLine("The current value is {0}.",
+ uint32_t(File.pdb().getMsfLayout().SB->FreeBlockMapBlock));
+ } else if (pdbBlockOffset() < endof(SuperBlock, NumBlocks)) {
+ P.printLine("which contains the number of blocks in the file.");
+ P.formatLine("The current value is {0}.",
+ uint32_t(File.pdb().getMsfLayout().SB->NumBlocks));
+ } else if (pdbBlockOffset() < endof(SuperBlock, NumDirectoryBytes)) {
+ P.printLine("which contains the number of bytes in the stream directory.");
+ P.formatLine("The current value is {0}.",
+ uint32_t(File.pdb().getMsfLayout().SB->NumDirectoryBytes));
+ } else if (pdbBlockOffset() < endof(SuperBlock, Unknown1)) {
+ P.printLine("whose purpose is unknown.");
+ P.formatLine("The current value is {0}.",
+ uint32_t(File.pdb().getMsfLayout().SB->Unknown1));
+ } else if (pdbBlockOffset() < endof(SuperBlock, BlockMapAddr)) {
+ P.printLine("which contains the file offset of the block map.");
+ P.formatLine("The current value is {0}.",
+ uint32_t(File.pdb().getMsfLayout().SB->BlockMapAddr));
+ } else {
+ assert(pdbBlockOffset() > sizeof(SuperBlock));
+ P.printLine(
+ "which is outside the range of valid data for the super block.");
+ }
+}
+
+static std::string toBinaryString(uint8_t Byte) {
+ char Result[9] = {0};
+ for (int I = 0; I < 8; ++I) {
+ char C = (Byte & 1) ? '1' : '0';
+ Result[I] = C;
+ Byte >>= 1;
+ }
+ return std::string(Result);
+}
+
+void ExplainOutputStyle::explainPdbFpmBlockOffset() {
+ const MSFLayout &Layout = File.pdb().getMsfLayout();
+ uint32_t MainFpm = Layout.mainFpmBlock();
+ uint32_t AltFpm = Layout.alternateFpmBlock();
+
+ assert(isPdbFpmBlock());
+ uint32_t Fpm = isPdbFpm1() ? 1 : 2;
+ uint32_t FpmChunk = pdbBlockIndex() / File.pdb().getBlockSize();
+ assert((Fpm == MainFpm) || (Fpm == AltFpm));
+ (void)AltFpm;
+ bool IsMain = (Fpm == MainFpm);
+ P.formatLine("Address is in FPM{0} ({1} FPM)", Fpm, IsMain ? "Main" : "Alt");
+ uint32_t DescribedBlockStart =
+ 8 * (FpmChunk * File.pdb().getBlockSize() + pdbBlockOffset());
+ if (DescribedBlockStart > File.pdb().getBlockCount()) {
+ P.printLine("Address is in extraneous FPM space.");
+ return;
+ }
+
+ P.formatLine("Address describes the allocation status of blocks [{0},{1})",
+ DescribedBlockStart, DescribedBlockStart + 8);
+ ArrayRef<uint8_t> Bytes;
+ cantFail(File.pdb().getMsfBuffer().readBytes(FileOffset, 1, Bytes));
+ P.formatLine("Status = {0} (Note: 0 = allocated, 1 = free)",
+ toBinaryString(Bytes[0]));
+}
+
+void ExplainOutputStyle::explainPdbBlockMapOffset() {
+ uint64_t BlockMapOffset = File.pdb().getBlockMapOffset();
+ uint32_t OffsetInBlock = FileOffset - BlockMapOffset;
+ P.formatLine("Address is at offset {0} of the directory block list",
+ OffsetInBlock);
+}
+
+static uint32_t getOffsetInStream(ArrayRef<support::ulittle32_t> StreamBlocks,
+ uint64_t FileOffset, uint32_t BlockSize) {
+ uint32_t BlockIndex = FileOffset / BlockSize;
+ uint32_t OffsetInBlock = FileOffset - BlockIndex * BlockSize;
+
+ auto Iter = llvm::find(StreamBlocks, BlockIndex);
+ assert(Iter != StreamBlocks.end());
+ uint32_t StreamBlockIndex = std::distance(StreamBlocks.begin(), Iter);
+ return StreamBlockIndex * BlockSize + OffsetInBlock;
+}
+
+void ExplainOutputStyle::explainPdbStreamOffset(uint32_t Stream) {
+ SmallVector<StreamInfo, 12> Streams;
+ discoverStreamPurposes(File.pdb(), Streams);
+
+ assert(Stream <= Streams.size());
+ const StreamInfo &S = Streams[Stream];
+ const auto &Layout = File.pdb().getStreamLayout(Stream);
+ uint32_t StreamOff =
+ getOffsetInStream(Layout.Blocks, FileOffset, File.pdb().getBlockSize());
+ P.formatLine("Address is at offset {0}/{1} of Stream {2} ({3}){4}.",
+ StreamOff, Layout.Length, Stream, S.getLongName(),
+ (StreamOff > Layout.Length) ? " in unused space" : "");
+ switch (S.getPurpose()) {
+ case StreamPurpose::DBI: {
+ DbiStream &Dbi = cantFail(File.pdb().getPDBDbiStream());
+ explainStreamOffset(Dbi, StreamOff);
+ break;
+ }
+ case StreamPurpose::PDB: {
+ InfoStream &Info = cantFail(File.pdb().getPDBInfoStream());
+ explainStreamOffset(Info, StreamOff);
+ break;
+ }
+ case StreamPurpose::IPI:
+ case StreamPurpose::TPI:
+ case StreamPurpose::ModuleStream:
+ case StreamPurpose::NamedStream:
+ default:
+ break;
+ }
+}
+
+void ExplainOutputStyle::explainPdbStreamDirectoryOffset() {
+ auto DirectoryBlocks = File.pdb().getDirectoryBlockArray();
+ const auto &Layout = File.pdb().getMsfLayout();
+ uint32_t StreamOff =
+ getOffsetInStream(DirectoryBlocks, FileOffset, File.pdb().getBlockSize());
+ P.formatLine("Address is at offset {0}/{1} of Stream Directory{2}.",
+ StreamOff, uint32_t(Layout.SB->NumDirectoryBytes),
+ uint32_t(StreamOff > Layout.SB->NumDirectoryBytes)
+ ? " in unused space"
+ : "");
+}
+
+void ExplainOutputStyle::explainPdbUnknownBlock() {
+ P.formatLine("Address has unknown purpose.");
+}
+
+template <typename T>
+static void printStructField(LinePrinter &P, StringRef Label, T Value) {
+ P.formatLine("which contains {0}.", Label);
+ P.formatLine("The current value is {0}.", Value);
+}
+
+static void explainDbiHeaderOffset(LinePrinter &P, DbiStream &Dbi,
+ uint32_t Offset) {
+ const DbiStreamHeader *Header = Dbi.getHeader();
+ assert(Header != nullptr);
+
+ if (Offset < endof(DbiStreamHeader, VersionSignature))
+ printStructField(P, "the DBI Stream Version Signature",
+ int32_t(Header->VersionSignature));
+ else if (Offset < endof(DbiStreamHeader, VersionHeader))
+ printStructField(P, "the DBI Stream Version Header",
+ uint32_t(Header->VersionHeader));
+ else if (Offset < endof(DbiStreamHeader, Age))
+ printStructField(P, "the age of the DBI Stream", uint32_t(Header->Age));
+ else if (Offset < endof(DbiStreamHeader, GlobalSymbolStreamIndex))
+ printStructField(P, "the index of the Global Symbol Stream",
+ uint16_t(Header->GlobalSymbolStreamIndex));
+ else if (Offset < endof(DbiStreamHeader, BuildNumber))
+ printStructField(P, "the build number", uint16_t(Header->BuildNumber));
+ else if (Offset < endof(DbiStreamHeader, PublicSymbolStreamIndex))
+ printStructField(P, "the index of the Public Symbol Stream",
+ uint16_t(Header->PublicSymbolStreamIndex));
+ else if (Offset < endof(DbiStreamHeader, PdbDllVersion))
+ printStructField(P, "the version of mspdb.dll",
+ uint16_t(Header->PdbDllVersion));
+ else if (Offset < endof(DbiStreamHeader, SymRecordStreamIndex))
+ printStructField(P, "the index of the Symbol Record Stream",
+ uint16_t(Header->SymRecordStreamIndex));
+ else if (Offset < endof(DbiStreamHeader, PdbDllRbld))
+ printStructField(P, "the rbld of mspdb.dll", uint16_t(Header->PdbDllRbld));
+ else if (Offset < endof(DbiStreamHeader, ModiSubstreamSize))
+ printStructField(P, "the size of the Module Info Substream",
+ int32_t(Header->ModiSubstreamSize));
+ else if (Offset < endof(DbiStreamHeader, SecContrSubstreamSize))
+ printStructField(P, "the size of the Section Contribution Substream",
+ int32_t(Header->SecContrSubstreamSize));
+ else if (Offset < endof(DbiStreamHeader, SectionMapSize))
+ printStructField(P, "the size of the Section Map Substream",
+ int32_t(Header->SectionMapSize));
+ else if (Offset < endof(DbiStreamHeader, FileInfoSize))
+ printStructField(P, "the size of the File Info Substream",
+ int32_t(Header->FileInfoSize));
+ else if (Offset < endof(DbiStreamHeader, TypeServerSize))
+ printStructField(P, "the size of the Type Server Map",
+ int32_t(Header->TypeServerSize));
+ else if (Offset < endof(DbiStreamHeader, MFCTypeServerIndex))
+ printStructField(P, "the index of the MFC Type Server stream",
+ uint32_t(Header->MFCTypeServerIndex));
+ else if (Offset < endof(DbiStreamHeader, OptionalDbgHdrSize))
+ printStructField(P, "the size of the Optional Debug Stream array",
+ int32_t(Header->OptionalDbgHdrSize));
+ else if (Offset < endof(DbiStreamHeader, ECSubstreamSize))
+ printStructField(P, "the size of the Edit & Continue Substream",
+ int32_t(Header->ECSubstreamSize));
+ else if (Offset < endof(DbiStreamHeader, Flags))
+ printStructField(P, "the DBI Stream flags", uint16_t(Header->Flags));
+ else if (Offset < endof(DbiStreamHeader, MachineType))
+ printStructField(P, "the machine type", uint16_t(Header->MachineType));
+ else if (Offset < endof(DbiStreamHeader, Reserved))
+ printStructField(P, "reserved data", uint32_t(Header->Reserved));
+}
+
+static void explainDbiModiSubstreamOffset(LinePrinter &P, DbiStream &Dbi,
+ uint32_t Offset) {
+ VarStreamArray<DbiModuleDescriptor> ModuleDescriptors;
+ BinaryStreamRef ModiSubstreamData = Dbi.getModiSubstreamData().StreamData;
+ BinaryStreamReader Reader(ModiSubstreamData);
+
+ cantFail(Reader.readArray(ModuleDescriptors, ModiSubstreamData.getLength()));
+ auto Prev = ModuleDescriptors.begin();
+ assert(Prev.offset() == 0);
+ auto Current = Prev;
+ uint32_t Index = 0;
+ while (true) {
+ Prev = Current;
+ ++Current;
+ if (Current == ModuleDescriptors.end() || Offset < Current.offset())
+ break;
+ ++Index;
+ }
+
+ DbiModuleDescriptor &Descriptor = *Prev;
+ P.formatLine("which contains the descriptor for module {0} ({1}).", Index,
+ Descriptor.getModuleName());
+}
+
+template <typename T>
+static void dontExplain(LinePrinter &Printer, T &Stream, uint32_t Offset) {}
+
+template <typename T, typename SubstreamRangeT>
+static void explainSubstreamOffset(LinePrinter &P, uint32_t OffsetInStream,
+ T &Stream,
+ const SubstreamRangeT &Substreams) {
+ uint32_t SubOffset = OffsetInStream;
+ for (const auto &Entry : Substreams) {
+ if (Entry.Size <= 0)
+ continue;
+ uint32_t S = static_cast<uint32_t>(Entry.Size);
+ if (SubOffset < S) {
+ P.formatLine("address is at offset {0}/{1} of the {2}.", SubOffset, S,
+ Entry.Label);
+ Entry.Explain(P, Stream, SubOffset);
+ return;
+ }
+ SubOffset -= S;
+ }
+}
+
+void ExplainOutputStyle::explainStreamOffset(DbiStream &Dbi,
+ uint32_t OffsetInStream) {
+ P.printLine("Within the DBI stream:");
+ AutoIndent Indent(P);
+ const DbiStreamHeader *Header = Dbi.getHeader();
+ assert(Header != nullptr);
+
+ struct SubstreamInfo {
+ int32_t Size;
+ StringRef Label;
+ void (*Explain)(LinePrinter &, DbiStream &, uint32_t);
+ } Substreams[] = {
+ {sizeof(DbiStreamHeader), "DBI Stream Header", explainDbiHeaderOffset},
+ {int32_t(Header->ModiSubstreamSize), "Module Info Substream",
+ explainDbiModiSubstreamOffset},
+ {int32_t(Header->SecContrSubstreamSize), "Section Contribution Substream",
+ dontExplain<DbiStream>},
+ {int32_t(Header->SectionMapSize), "Section Map", dontExplain<DbiStream>},
+ {int32_t(Header->FileInfoSize), "File Info Substream",
+ dontExplain<DbiStream>},
+ {int32_t(Header->TypeServerSize), "Type Server Map Substream",
+ dontExplain<DbiStream>},
+ {int32_t(Header->ECSubstreamSize), "Edit & Continue Substream",
+ dontExplain<DbiStream>},
+ {int32_t(Header->OptionalDbgHdrSize), "Optional Debug Stream Array",
+ dontExplain<DbiStream>},
+ };
+
+ explainSubstreamOffset(P, OffsetInStream, Dbi, Substreams);
+}
+
+static void explainPdbStreamHeaderOffset(LinePrinter &P, InfoStream &Info,
+ uint32_t Offset) {
+ const InfoStreamHeader *Header = Info.getHeader();
+ assert(Header != nullptr);
+
+ if (Offset < endof(InfoStreamHeader, Version))
+ printStructField(P, "the PDB Stream Version Signature",
+ uint32_t(Header->Version));
+ else if (Offset < endof(InfoStreamHeader, Signature))
+ printStructField(P, "the signature of the PDB Stream",
+ uint32_t(Header->Signature));
+ else if (Offset < endof(InfoStreamHeader, Age))
+ printStructField(P, "the age of the PDB", uint32_t(Header->Age));
+ else if (Offset < endof(InfoStreamHeader, Guid))
+ printStructField(P, "the guid of the PDB", fmt_guid(Header->Guid.Guid));
+}
+
+void ExplainOutputStyle::explainStreamOffset(InfoStream &Info,
+ uint32_t OffsetInStream) {
+ P.printLine("Within the PDB stream:");
+ AutoIndent Indent(P);
+
+ struct SubstreamInfo {
+ uint32_t Size;
+ StringRef Label;
+ void (*Explain)(LinePrinter &, InfoStream &, uint32_t);
+ } Substreams[] = {{sizeof(InfoStreamHeader), "PDB Stream Header",
+ explainPdbStreamHeaderOffset},
+ {Info.getNamedStreamMapByteSize(), "Named Stream Map",
+ dontExplain<InfoStream>},
+ {Info.getStreamSize(), "PDB Feature Signatures",
+ dontExplain<InfoStream>}};
+
+ explainSubstreamOffset(P, OffsetInStream, Info, Substreams);
+}
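explainPdbStreamOffset() above maps an absolute file offset to a stream-relative offset: the offset is split into a block index plus an offset within that block, and the block's position in the stream's block list gives the stream-relative byte. A standalone sketch of that arithmetic (block list and sizes here are invented for the example):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <vector>

    // Same arithmetic as getOffsetInStream(): (block index, offset in block),
    // then the block's position in the stream's block list.
    static std::optional<uint64_t>
    offsetInStream(const std::vector<uint32_t> &StreamBlocks, uint64_t FileOffset,
                   uint32_t BlockSize) {
      uint32_t BlockIndex = static_cast<uint32_t>(FileOffset / BlockSize);
      uint64_t OffsetInBlock = FileOffset % BlockSize;
      auto It = std::find(StreamBlocks.begin(), StreamBlocks.end(), BlockIndex);
      if (It == StreamBlocks.end())
        return std::nullopt;                 // offset is not part of this stream
      uint64_t StreamBlockIndex = std::distance(StreamBlocks.begin(), It);
      return StreamBlockIndex * BlockSize + OffsetInBlock;
    }

    int main() {
      // A stream that owns (non-contiguous) blocks 9, 4 and 12 of a 4096-byte MSF.
      std::vector<uint32_t> Blocks = {9, 4, 12};
      if (auto Off = offsetInStream(Blocks, 4 * 4096 + 100, 4096))
        std::cout << "offset " << *Off << " within the stream\n"; // prints 4196
    }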
diff --git a/contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h b/contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h
new file mode 100644
index 000000000000..9a497accb812
--- /dev/null
+++ b/contrib/llvm/tools/llvm-pdbutil/ExplainOutputStyle.h
@@ -0,0 +1,68 @@
+//===- ExplainOutputStyle.h ----------------------------------- *- C++ --*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_EXPLAINOUTPUTSTYLE_H
+#define LLVM_TOOLS_LLVMPDBDUMP_EXPLAINOUTPUTSTYLE_H
+
+#include "LinePrinter.h"
+#include "OutputStyle.h"
+
+#include <string>
+
+namespace llvm {
+
+namespace pdb {
+
+class DbiStream;
+class InfoStream;
+class InputFile;
+
+class ExplainOutputStyle : public OutputStyle {
+
+public:
+ ExplainOutputStyle(InputFile &File, uint64_t FileOffset);
+
+ Error dump() override;
+
+private:
+ Error explainPdbFile();
+ Error explainBinaryFile();
+
+ bool explainPdbBlockStatus();
+
+ bool isPdbFpm1() const;
+ bool isPdbFpm2() const;
+
+ bool isPdbSuperBlock() const;
+ bool isPdbFpmBlock() const;
+ bool isPdbBlockMapBlock() const;
+ bool isPdbStreamDirectoryBlock() const;
+ Optional<uint32_t> getPdbBlockStreamIndex() const;
+
+ void explainPdbSuperBlockOffset();
+ void explainPdbFpmBlockOffset();
+ void explainPdbBlockMapOffset();
+ void explainPdbStreamDirectoryOffset();
+ void explainPdbStreamOffset(uint32_t Stream);
+ void explainPdbUnknownBlock();
+
+ void explainStreamOffset(DbiStream &Stream, uint32_t OffsetInStream);
+ void explainStreamOffset(InfoStream &Stream, uint32_t OffsetInStream);
+
+ uint32_t pdbBlockIndex() const;
+ uint32_t pdbBlockOffset() const;
+
+ InputFile &File;
+ const uint64_t FileOffset;
+ LinePrinter P;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif
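The isPdbSuperBlock/isPdbFpm1/isPdbFpm2 predicates declared above reduce to modular arithmetic on the block index: block 0 is the MSF super block, and the two free-page-map blocks recur at positions 1 and 2 of every BlockSize-long run of blocks. A simplified classification sketch under that reading (block map, stream directory, and stream lookups are deliberately omitted):

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Simplified offset classification mirroring the predicates declared in
    // ExplainOutputStyle.h.
    static std::string classifyBlock(uint64_t FileOffset, uint32_t BlockSize) {
      uint64_t BlockIndex = FileOffset / BlockSize;
      if (BlockIndex == 0)
        return "MSF super block";
      if (BlockIndex % BlockSize == 1)
        return "free page map 1";
      if (BlockIndex % BlockSize == 2)
        return "free page map 2";
      return "stream data (or block map / directory)";
    }

    int main() {
      const uint32_t BlockSize = 4096;
      std::cout << classifyBlock(100, BlockSize) << "\n"              // super block
                << classifyBlock(4096ull * 4097, BlockSize) << "\n";  // free page map 1
    }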
diff --git a/contrib/llvm/tools/llvm-pdbutil/InputFile.cpp b/contrib/llvm/tools/llvm-pdbutil/InputFile.cpp
index 8b05381174df..7b5af7e96920 100644
--- a/contrib/llvm/tools/llvm-pdbutil/InputFile.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/InputFile.cpp
@@ -95,7 +95,8 @@ static inline bool isDebugSSection(object::SectionRef Section,
static bool isDebugTSection(SectionRef Section, CVTypeArray &Types) {
BinaryStreamReader Reader;
- if (!isCodeViewDebugSubsection(Section, ".debug$T", Reader))
+ if (!isCodeViewDebugSubsection(Section, ".debug$T", Reader) &&
+ !isCodeViewDebugSubsection(Section, ".debug$P", Reader))
return false;
cantFail(Reader.readArray(Types, Reader.bytesRemaining()));
return true;
@@ -242,7 +243,7 @@ void SymbolGroup::formatFromChecksumsOffset(LinePrinter &Printer,
}
}
-Expected<InputFile> InputFile::open(StringRef Path) {
+Expected<InputFile> InputFile::open(StringRef Path, bool AllowUnknownFile) {
InputFile IF;
if (!llvm::sys::fs::exists(Path))
return make_error<StringError>(formatv("File {0} not found", Path),
@@ -263,7 +264,7 @@ Expected<InputFile> InputFile::open(StringRef Path) {
return std::move(IF);
}
- if (Magic == file_magic::unknown) {
+ if (Magic == file_magic::pdb) {
std::unique_ptr<IPDBSession> Session;
if (auto Err = loadDataForPDB(PDB_ReaderType::Native, Path, Session))
return std::move(Err);
@@ -274,9 +275,19 @@ Expected<InputFile> InputFile::open(StringRef Path) {
return std::move(IF);
}
- return make_error<StringError>(
- formatv("File {0} is not a supported file type", Path),
- inconvertibleErrorCode());
+ if (!AllowUnknownFile)
+ return make_error<StringError>(
+ formatv("File {0} is not a supported file type", Path),
+ inconvertibleErrorCode());
+
+ auto Result = MemoryBuffer::getFile(Path, -1LL, false);
+ if (!Result)
+ return make_error<StringError>(
+ formatv("File {0} could not be opened", Path), Result.getError());
+
+ IF.UnknownFile = std::move(*Result);
+ IF.PdbOrObj = IF.UnknownFile.get();
+ return std::move(IF);
}
PDBFile &InputFile::pdb() {
@@ -299,6 +310,25 @@ const object::COFFObjectFile &InputFile::obj() const {
return *PdbOrObj.get<object::COFFObjectFile *>();
}
+MemoryBuffer &InputFile::unknown() {
+ assert(isUnknown());
+ return *PdbOrObj.get<MemoryBuffer *>();
+}
+
+const MemoryBuffer &InputFile::unknown() const {
+ assert(isUnknown());
+ return *PdbOrObj.get<MemoryBuffer *>();
+}
+
+StringRef InputFile::getFilePath() const {
+ if (isPdb())
+ return pdb().getFilePath();
+ if (isObj())
+ return obj().getFileName();
+ assert(isUnknown());
+ return unknown().getBufferIdentifier();
+}
+
bool InputFile::hasTypes() const {
if (isPdb())
return pdb().hasPDBTpiStream();
@@ -323,6 +353,8 @@ bool InputFile::isObj() const {
return PdbOrObj.is<object::COFFObjectFile *>();
}
+bool InputFile::isUnknown() const { return PdbOrObj.is<MemoryBuffer *>(); }
+
codeview::LazyRandomTypeCollection &
InputFile::getOrCreateTypeCollection(TypeCollectionKind Kind) {
if (Types && Kind == kTypes)
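With the new AllowUnknownFile flag, InputFile::open() above falls back to a raw MemoryBuffer when the magic matches neither a PDB nor a COFF object, and PdbOrObj becomes a three-way pointer union. A standalone sketch of the same dispatch using std::variant in place of PointerUnion3 (all names are illustrative stubs):

    #include <optional>
    #include <string>
    #include <variant>

    enum class FileMagic { Pdb, CoffObject, Unknown };   // stand-in for file_magic

    struct PdbFileStub { std::string Path; };
    struct CoffFileStub { std::string Path; };
    struct RawFileStub { std::string Path; };

    // Plays the role of PointerUnion3<PDBFile*, COFFObjectFile*, MemoryBuffer*>.
    using OpenedFile = std::variant<PdbFileStub, CoffFileStub, RawFileStub>;

    // Dispatch on the detected magic; only fall back to a raw buffer when the
    // caller explicitly allows unknown files.
    static std::optional<OpenedFile> openInput(const std::string &Path,
                                               FileMagic Magic,
                                               bool AllowUnknownFile) {
      switch (Magic) {
      case FileMagic::Pdb:        return OpenedFile{PdbFileStub{Path}};
      case FileMagic::CoffObject: return OpenedFile{CoffFileStub{Path}};
      case FileMagic::Unknown:
        if (!AllowUnknownFile)
          return std::nullopt;                    // "not a supported file type"
        return OpenedFile{RawFileStub{Path}};
      }
      return std::nullopt;
    }

    int main() {
      return openInput("a.bin", FileMagic::Unknown, true).has_value() ? 0 : 1;
    }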
diff --git a/contrib/llvm/tools/llvm-pdbutil/InputFile.h b/contrib/llvm/tools/llvm-pdbutil/InputFile.h
index 8063439133c2..552f3a3b2127 100644
--- a/contrib/llvm/tools/llvm-pdbutil/InputFile.h
+++ b/contrib/llvm/tools/llvm-pdbutil/InputFile.h
@@ -43,7 +43,8 @@ class InputFile {
std::unique_ptr<NativeSession> PdbSession;
object::OwningBinary<object::Binary> CoffObject;
- PointerUnion<PDBFile *, object::COFFObjectFile *> PdbOrObj;
+ std::unique_ptr<MemoryBuffer> UnknownFile;
+ PointerUnion3<PDBFile *, object::COFFObjectFile *, MemoryBuffer *> PdbOrObj;
using TypeCollectionPtr = std::unique_ptr<codeview::LazyRandomTypeCollection>;
@@ -58,12 +59,17 @@ public:
~InputFile();
InputFile(InputFile &&Other) = default;
- static Expected<InputFile> open(StringRef Path);
+ static Expected<InputFile> open(StringRef Path,
+ bool AllowUnknownFile = false);
PDBFile &pdb();
const PDBFile &pdb() const;
object::COFFObjectFile &obj();
const object::COFFObjectFile &obj() const;
+ MemoryBuffer &unknown();
+ const MemoryBuffer &unknown() const;
+
+ StringRef getFilePath() const;
bool hasTypes() const;
bool hasIds() const;
@@ -77,6 +83,7 @@ public:
bool isPdb() const;
bool isObj() const;
+ bool isUnknown() const;
};
class SymbolGroup {
diff --git a/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
index 40a0e46efd48..b454ab345456 100644
--- a/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
@@ -10,6 +10,7 @@
#include "MinimalSymbolDumper.h"
#include "FormatUtil.h"
+#include "InputFile.h"
#include "LinePrinter.h"
#include "llvm/DebugInfo/CodeView/CVRecord.h"
@@ -18,6 +19,7 @@
#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
#include "llvm/Support/FormatVariadic.h"
using namespace llvm;
@@ -450,6 +452,17 @@ Error MinimalSymbolDumper::visitKnownRecord(CVSymbol &CVR,
Error MinimalSymbolDumper::visitKnownRecord(CVSymbol &CVR, FileStaticSym &FS) {
P.format(" `{0}`", FS.Name);
AutoIndent Indent(P, 7);
+ if (SymGroup) {
+ Expected<StringRef> FileName =
+ SymGroup->getNameFromStringTable(FS.ModFilenameOffset);
+ if (FileName) {
+ P.formatLine("type = {0}, file name = {1} ({2}), flags = {3}",
+ typeIndex(FS.Index), FS.ModFilenameOffset, *FileName,
+ formatLocalSymFlags(P.getIndentLevel() + 9, FS.Flags));
+ }
+ return Error::success();
+ }
+
P.formatLine("type = {0}, file name offset = {1}, flags = {2}",
typeIndex(FS.Index), FS.ModFilenameOffset,
formatLocalSymFlags(P.getIndentLevel() + 9, FS.Flags));
diff --git a/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.h b/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.h
index d9e9861d5b30..1c26a85a4eaf 100644
--- a/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.h
+++ b/contrib/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.h
@@ -19,6 +19,7 @@ class LazyRandomTypeCollection;
namespace pdb {
class LinePrinter;
+class SymbolGroup;
class MinimalSymbolDumper : public codeview::SymbolVisitorCallbacks {
public:
@@ -26,11 +27,19 @@ public:
codeview::LazyRandomTypeCollection &Ids,
codeview::LazyRandomTypeCollection &Types)
: P(P), RecordBytes(RecordBytes), Ids(Ids), Types(Types) {}
+ MinimalSymbolDumper(LinePrinter &P, bool RecordBytes,
+ const SymbolGroup &SymGroup,
+ codeview::LazyRandomTypeCollection &Ids,
+ codeview::LazyRandomTypeCollection &Types)
+ : P(P), RecordBytes(RecordBytes), SymGroup(&SymGroup), Ids(Ids),
+ Types(Types) {}
Error visitSymbolBegin(codeview::CVSymbol &Record) override;
Error visitSymbolBegin(codeview::CVSymbol &Record, uint32_t Offset) override;
Error visitSymbolEnd(codeview::CVSymbol &Record) override;
+ void setSymbolGroup(const SymbolGroup *Group) { SymGroup = Group; }
+
#define SYMBOL_RECORD(EnumName, EnumVal, Name) \
virtual Error visitKnownRecord(codeview::CVSymbol &CVR, \
codeview::Name &Record) override;
@@ -45,6 +54,7 @@ private:
LinePrinter &P;
bool RecordBytes;
+ const SymbolGroup *SymGroup = nullptr;
codeview::LazyRandomTypeCollection &Ids;
codeview::LazyRandomTypeCollection &Types;
};
diff --git a/contrib/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp
index fae89920e0b8..569bca7490fa 100644
--- a/contrib/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp
@@ -303,8 +303,9 @@ Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR,
P.formatLine("unique name: `{0}`", Class.UniqueName);
P.formatLine("vtable: {0}, base list: {1}, field list: {2}",
Class.VTableShape, Class.DerivationList, Class.FieldList);
- P.formatLine("options: {0}",
- formatClassOptions(P.getIndentLevel(), Class.Options));
+ P.formatLine("options: {0}, sizeof {1}",
+ formatClassOptions(P.getIndentLevel(), Class.Options),
+ Class.Size);
return Error::success();
}
@@ -314,8 +315,9 @@ Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR,
if (Union.hasUniqueName())
P.formatLine("unique name: `{0}`", Union.UniqueName);
P.formatLine("field list: {0}", Union.FieldList);
- P.formatLine("options: {0}",
- formatClassOptions(P.getIndentLevel(), Union.Options));
+ P.formatLine("options: {0}, sizeof {1}",
+ formatClassOptions(P.getIndentLevel(), Union.Options),
+ Union.Size);
return Error::success();
}
@@ -467,6 +469,21 @@ Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR, LabelRecord &R) {
return Error::success();
}
+Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR,
+ PrecompRecord &Precomp) {
+ P.format(" start index = {0:X+}, types count = {1:X+}, signature = {2:X+},"
+ " precomp path = {3}",
+ Precomp.StartTypeIndex, Precomp.TypesCount, Precomp.Signature,
+ Precomp.PrecompFilePath);
+ return Error::success();
+}
+
+Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR,
+ EndPrecompRecord &EP) {
+ P.format(" signature = {0:X+}", EP.Signature);
+ return Error::success();
+}
+
Error MinimalTypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR,
NestedTypeRecord &Nested) {
P.format(" [name = `{0}`, parent = {1}]", Nested.Name, Nested.Type);
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp
index 10b3d9ee7304..bcdecca81aec 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyBuiltinDumper.cpp
@@ -87,7 +87,12 @@ StringRef BuiltinDumper::getTypeName(const PDBSymbolTypeBuiltin &Symbol) {
return "HRESULT";
case PDB_BuiltinType::BCD:
return "HRESULT";
- default:
- return "void";
+ case PDB_BuiltinType::Char16:
+ return "char16_t";
+ case PDB_BuiltinType::Char32:
+ return "char32_t";
+ case PDB_BuiltinType::None:
+ return "...";
}
+ llvm_unreachable("Unknown PDB_BuiltinType");
}
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp
index 66c29fc5d4ee..a572522c8cd7 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp
@@ -50,12 +50,9 @@ bool PrettyClassLayoutGraphicalDumper::start(const UDTLayoutBase &Layout) {
uint32_t RelativeOffset = Item->getOffsetInParent();
CurrentAbsoluteOffset = ClassOffsetZero + RelativeOffset;
- // Since there is storage there, it should be set! However, this might
- // be an empty base, in which case it could extend outside the bounds of
- // the parent class.
+ // This might be an empty base, in which case it could extend outside the
+ // bounds of the parent class.
if (RelativeOffset < UseMap.size() && (Item->getSize() > 0)) {
- assert(UseMap.test(RelativeOffset));
-
// If there is any remaining padding in this class, and the offset of the
// new item is after the padding, then we must have just jumped over some
// padding. Print a padding row and then look for where the next block
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
index 65e8badbc99a..0d99c9b1245c 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
@@ -55,62 +55,73 @@ void CompilandDumper::start(const PDBSymbolCompiland &Symbol,
if (opts & Flags::Lines) {
const IPDBSession &Session = Symbol.getSession();
- auto Files = Session.getSourceFilesForCompiland(Symbol);
- Printer.Indent();
- while (auto File = Files->getNext()) {
- Printer.NewLine();
- WithColor(Printer, PDB_ColorItem::Path).get() << File->getFileName();
-
- auto Lines = Session.findLineNumbers(Symbol, *File);
+ if (auto Files = Session.getSourceFilesForCompiland(Symbol)) {
Printer.Indent();
- while (auto Line = Lines->getNext()) {
+ while (auto File = Files->getNext()) {
Printer.NewLine();
- uint32_t LineStart = Line->getLineNumber();
- uint32_t LineEnd = Line->getLineNumberEnd();
-
- Printer << "Line ";
- PDB_ColorItem StatementColor = Line->isStatement()
- ? PDB_ColorItem::Keyword
- : PDB_ColorItem::LiteralValue;
- WithColor(Printer, StatementColor).get() << LineStart;
- if (LineStart != LineEnd)
- WithColor(Printer, StatementColor).get() << " - " << LineEnd;
-
- uint32_t ColumnStart = Line->getColumnNumber();
- uint32_t ColumnEnd = Line->getColumnNumberEnd();
- if (ColumnStart != 0 || ColumnEnd != 0) {
- Printer << ", Column: ";
- WithColor(Printer, StatementColor).get() << ColumnStart;
- if (ColumnEnd != ColumnStart)
- WithColor(Printer, StatementColor).get() << " - " << ColumnEnd;
+ WithColor(Printer, PDB_ColorItem::Path).get() << File->getFileName();
+ if (File->getChecksumType() != PDB_Checksum::None) {
+ auto ChecksumType = File->getChecksumType();
+ auto ChecksumHexString = toHex(File->getChecksum());
+ WithColor(Printer, PDB_ColorItem::Comment).get()
+ << " (" << ChecksumType << ": " << ChecksumHexString << ")";
}
- Printer << ", Address: ";
- if (Line->getLength() > 0) {
- uint64_t AddrStart = Line->getVirtualAddress();
- uint64_t AddrEnd = AddrStart + Line->getLength() - 1;
- WithColor(Printer, PDB_ColorItem::Address).get()
+ auto Lines = Session.findLineNumbers(Symbol, *File);
+ if (!Lines)
+ continue;
+
+ Printer.Indent();
+ while (auto Line = Lines->getNext()) {
+ Printer.NewLine();
+ uint32_t LineStart = Line->getLineNumber();
+ uint32_t LineEnd = Line->getLineNumberEnd();
+
+ Printer << "Line ";
+ PDB_ColorItem StatementColor = Line->isStatement()
+ ? PDB_ColorItem::Keyword
+ : PDB_ColorItem::LiteralValue;
+ WithColor(Printer, StatementColor).get() << LineStart;
+ if (LineStart != LineEnd)
+ WithColor(Printer, StatementColor).get() << " - " << LineEnd;
+
+ uint32_t ColumnStart = Line->getColumnNumber();
+ uint32_t ColumnEnd = Line->getColumnNumberEnd();
+ if (ColumnStart != 0 || ColumnEnd != 0) {
+ Printer << ", Column: ";
+ WithColor(Printer, StatementColor).get() << ColumnStart;
+ if (ColumnEnd != ColumnStart)
+ WithColor(Printer, StatementColor).get() << " - " << ColumnEnd;
+ }
+
+ Printer << ", Address: ";
+ if (Line->getLength() > 0) {
+ uint64_t AddrStart = Line->getVirtualAddress();
+ uint64_t AddrEnd = AddrStart + Line->getLength() - 1;
+ WithColor(Printer, PDB_ColorItem::Address).get()
<< "[" << format_hex(AddrStart, 10) << " - "
<< format_hex(AddrEnd, 10) << "]";
- Printer << " (" << Line->getLength() << " bytes)";
- } else {
- uint64_t AddrStart = Line->getVirtualAddress();
- WithColor(Printer, PDB_ColorItem::Address).get()
+ Printer << " (" << Line->getLength() << " bytes)";
+ } else {
+ uint64_t AddrStart = Line->getVirtualAddress();
+ WithColor(Printer, PDB_ColorItem::Address).get()
<< "[" << format_hex(AddrStart, 10) << "] ";
- Printer << "(0 bytes)";
+ Printer << "(0 bytes)";
+ }
}
+ Printer.Unindent();
}
Printer.Unindent();
}
- Printer.Unindent();
}
if (opts & Flags::Children) {
- auto ChildrenEnum = Symbol.findAllChildren();
- Printer.Indent();
- while (auto Child = ChildrenEnum->getNext())
- Child->dump(*this);
- Printer.Unindent();
+ if (auto ChildrenEnum = Symbol.findAllChildren()) {
+ Printer.Indent();
+ while (auto Child = ChildrenEnum->getNext())
+ Child->dump(*this);
+ Printer.Unindent();
+ }
}
}
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
index fc40d90cee96..1270223b1c78 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
@@ -21,9 +21,10 @@ ExternalSymbolDumper::ExternalSymbolDumper(LinePrinter &P)
: PDBSymDumper(true), Printer(P) {}
void ExternalSymbolDumper::start(const PDBSymbolExe &Symbol) {
- auto Vars = Symbol.findAllChildren<PDBSymbolPublicSymbol>();
- while (auto Var = Vars->getNext())
- Var->dump(*this);
+ if (auto Vars = Symbol.findAllChildren<PDBSymbolPublicSymbol>()) {
+ while (auto Var = Vars->getNext())
+ Var->dump(*this);
+ }
}
void ExternalSymbolDumper::dump(const PDBSymbolPublicSymbol &Symbol) {
@@ -34,7 +35,7 @@ void ExternalSymbolDumper::dump(const PDBSymbolPublicSymbol &Symbol) {
Printer.NewLine();
uint64_t Addr = Symbol.getVirtualAddress();
- Printer << "[";
+ Printer << "public [";
WithColor(Printer, PDB_ColorItem::Address).get() << format_hex(Addr, 10);
Printer << "] ";
WithColor(Printer, PDB_ColorItem::Identifier).get() << LinkageName;
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
index 0bffc73f6c74..177d8a009a2b 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
@@ -189,6 +189,8 @@ void FunctionDumper::start(const PDBSymbolFunc &Symbol, PointerType Pointer) {
if (++Index < Arguments->getChildCount())
Printer << ", ";
}
+ if (Signature->isCVarArgs())
+ Printer << ", ...";
}
Printer << ")";
if (Symbol.isConstType())
@@ -250,6 +252,9 @@ void FunctionDumper::dump(const PDBSymbolTypePointer &Symbol) {
WithColor(Printer, PDB_ColorItem::Keyword).get() << "volatile ";
PointeeType->dump(*this);
Printer << (Symbol.isReference() ? "&" : "*");
+
+ if (Symbol.getRawSymbol().isRestrictedType())
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << " __restrict";
}
}
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp
index 0f6086395ad1..663a608fe429 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp
@@ -128,14 +128,13 @@ filterAndSortClassDefs(LinePrinter &Printer, Enumerator &E,
}
if (Comp)
- std::sort(Filtered.begin(), Filtered.end(), Comp);
+ llvm::sort(Filtered.begin(), Filtered.end(), Comp);
return Filtered;
}
TypeDumper::TypeDumper(LinePrinter &P) : PDBSymDumper(true), Printer(P) {}
void TypeDumper::start(const PDBSymbolExe &Exe) {
- auto Children = Exe.findAllChildren();
if (opts::pretty::Enums) {
if (auto Enums = Exe.findAllChildren<PDBSymbolTypeEnum>()) {
Printer.NewLine();
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp
index ba3b4c8035c5..65443d6bca90 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp
@@ -63,6 +63,9 @@ void TypedefDumper::dump(const PDBSymbolTypePointer &Symbol) {
PointeeType->dump(*this);
Printer << ((Symbol.isReference()) ? "&" : "*");
}
+
+ if (Symbol.getRawSymbol().isRestrictedType())
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << " __restrict";
}
void TypedefDumper::dump(const PDBSymbolTypeFunctionSig &Symbol) {
diff --git a/contrib/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp b/contrib/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp
index 4884fc8ee5a4..ddac8cf0da4a 100644
--- a/contrib/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp
@@ -169,6 +169,9 @@ void VariableDumper::dumpRight(const PDBSymbolTypeFunctionSig &Symbol) {
WithColor(Printer, PDB_ColorItem::Keyword).get() << " const";
if (Symbol.isVolatileType())
WithColor(Printer, PDB_ColorItem::Keyword).get() << " volatile";
+
+ if (Symbol.getRawSymbol().isRestrictedType())
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << " __restrict";
}
void VariableDumper::dump(const PDBSymbolTypePointer &Symbol) {
@@ -189,6 +192,9 @@ void VariableDumper::dump(const PDBSymbolTypePointer &Symbol) {
WithColor(Printer, PDB_ColorItem::Keyword).get() << " const ";
if (Symbol.isVolatileType())
WithColor(Printer, PDB_ColorItem::Keyword).get() << " volatile ";
+
+ if (Symbol.getRawSymbol().isRestrictedType())
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << " __restrict ";
}
void VariableDumper::dumpRight(const PDBSymbolTypePointer &Symbol) {
diff --git a/contrib/llvm/tools/llvm-pdbutil/StreamUtil.cpp b/contrib/llvm/tools/llvm-pdbutil/StreamUtil.cpp
index 991c99aa8686..367d947d25ee 100644
--- a/contrib/llvm/tools/llvm-pdbutil/StreamUtil.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/StreamUtil.cpp
@@ -49,16 +49,9 @@ StreamInfo StreamInfo::createModuleStream(StringRef Module,
return Result;
}
-static inline StreamInfo otherStream(StringRef Label, uint32_t Idx) {
- return StreamInfo::createStream(StreamPurpose::Other, Label, Idx);
-}
-
-static inline StreamInfo namedStream(StringRef Label, uint32_t Idx) {
- return StreamInfo::createStream(StreamPurpose::NamedStream, Label, Idx);
-}
-
-static inline StreamInfo symbolStream(StringRef Label, uint32_t Idx) {
- return StreamInfo::createStream(StreamPurpose::Symbols, Label, Idx);
+static inline StreamInfo stream(StreamPurpose Purpose, StringRef Label,
+ uint32_t Idx) {
+ return StreamInfo::createStream(Purpose, Label, Idx);
}
static inline StreamInfo moduleStream(StringRef Label, uint32_t StreamIdx,
@@ -105,60 +98,75 @@ void llvm::pdb::discoverStreamPurposes(PDBFile &File,
Streams.resize(StreamCount);
for (uint16_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) {
if (StreamIdx == OldMSFDirectory)
- Streams[StreamIdx] = otherStream("Old MSF Directory", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "Old MSF Directory", StreamIdx);
else if (StreamIdx == StreamPDB)
- Streams[StreamIdx] = otherStream("PDB Stream", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::PDB, "PDB Stream", StreamIdx);
else if (StreamIdx == StreamDBI)
- Streams[StreamIdx] = otherStream("DBI Stream", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::DBI, "DBI Stream", StreamIdx);
else if (StreamIdx == StreamTPI)
- Streams[StreamIdx] = otherStream("TPI Stream", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::TPI, "TPI Stream", StreamIdx);
else if (StreamIdx == StreamIPI)
- Streams[StreamIdx] = otherStream("IPI Stream", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::IPI, "IPI Stream", StreamIdx);
else if (Dbi && StreamIdx == Dbi->getGlobalSymbolStreamIndex())
- Streams[StreamIdx] = otherStream("Global Symbol Hash", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::GlobalHash, "Global Symbol Hash", StreamIdx);
else if (Dbi && StreamIdx == Dbi->getPublicSymbolStreamIndex())
- Streams[StreamIdx] = otherStream("Public Symbol Hash", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::PublicHash, "Public Symbol Hash", StreamIdx);
else if (Dbi && StreamIdx == Dbi->getSymRecordStreamIndex())
- Streams[StreamIdx] = symbolStream("Symbol Records", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Symbols, "Symbol Records", StreamIdx);
else if (Tpi && StreamIdx == Tpi->getTypeHashStreamIndex())
- Streams[StreamIdx] = otherStream("TPI Hash", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::TpiHash, "TPI Hash", StreamIdx);
else if (Tpi && StreamIdx == Tpi->getTypeHashStreamAuxIndex())
- Streams[StreamIdx] = otherStream("TPI Aux Hash", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "TPI Aux Hash", StreamIdx);
else if (Ipi && StreamIdx == Ipi->getTypeHashStreamIndex())
- Streams[StreamIdx] = otherStream("IPI Hash", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::IpiHash, "IPI Hash", StreamIdx);
else if (Ipi && StreamIdx == Ipi->getTypeHashStreamAuxIndex())
- Streams[StreamIdx] = otherStream("IPI Aux Hash", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "IPI Aux Hash", StreamIdx);
else if (Dbi &&
StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Exception))
- Streams[StreamIdx] = otherStream("Exception Data", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "Exception Data", StreamIdx);
else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Fixup))
- Streams[StreamIdx] = otherStream("Fixup Data", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "Fixup Data", StreamIdx);
else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::FPO))
- Streams[StreamIdx] = otherStream("FPO Data", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::Other, "FPO Data", StreamIdx);
else if (Dbi &&
StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::NewFPO))
- Streams[StreamIdx] = otherStream("New FPO Data", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "New FPO Data", StreamIdx);
else if (Dbi &&
StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::OmapFromSrc))
- Streams[StreamIdx] = otherStream("Omap From Source Data", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "Omap From Source Data", StreamIdx);
else if (Dbi &&
StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::OmapToSrc))
- Streams[StreamIdx] = otherStream("Omap To Source Data", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "Omap To Source Data", StreamIdx);
else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Pdata))
- Streams[StreamIdx] = otherStream("Pdata", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::Other, "Pdata", StreamIdx);
else if (Dbi &&
StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::SectionHdr))
- Streams[StreamIdx] = otherStream("Section Header Data", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "Section Header Data", StreamIdx);
else if (Dbi &&
StreamIdx ==
Dbi->getDebugStreamIndex(DbgHeaderType::SectionHdrOrig))
- Streams[StreamIdx] =
- otherStream("Section Header Original Data", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::Other,
+ "Section Header Original Data", StreamIdx);
else if (Dbi &&
StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::TokenRidMap))
- Streams[StreamIdx] = otherStream("Token Rid Data", StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::Other, "Token Rid Data", StreamIdx);
else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Xdata))
- Streams[StreamIdx] = otherStream("Xdata", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::Other, "Xdata", StreamIdx);
else {
auto ModIter = ModStreams.find(StreamIdx);
auto NSIter = NamedStreams.find(StreamIdx);
@@ -167,9 +175,10 @@ void llvm::pdb::discoverStreamPurposes(PDBFile &File,
moduleStream(ModIter->second.Descriptor.getModuleName(), StreamIdx,
ModIter->second.Modi);
} else if (NSIter != NamedStreams.end()) {
- Streams[StreamIdx] = namedStream(NSIter->second, StreamIdx);
+ Streams[StreamIdx] =
+ stream(StreamPurpose::NamedStream, NSIter->second, StreamIdx);
} else {
- Streams[StreamIdx] = otherStream("???", StreamIdx);
+ Streams[StreamIdx] = stream(StreamPurpose::Other, "???", StreamIdx);
}
}
}
diff --git a/contrib/llvm/tools/llvm-pdbutil/StreamUtil.h b/contrib/llvm/tools/llvm-pdbutil/StreamUtil.h
index 443267ca3290..0e2e80707361 100644
--- a/contrib/llvm/tools/llvm-pdbutil/StreamUtil.h
+++ b/contrib/llvm/tools/llvm-pdbutil/StreamUtil.h
@@ -19,7 +19,20 @@
namespace llvm {
namespace pdb {
class PDBFile;
-enum class StreamPurpose { NamedStream, ModuleStream, Symbols, Other };
+enum class StreamPurpose {
+ NamedStream,
+ ModuleStream,
+ Symbols,
+ PDB,
+ DBI,
+ TPI,
+ IPI,
+ GlobalHash,
+ PublicHash,
+ TpiHash,
+ IpiHash,
+ Other
+};
struct StreamInfo {
public:
diff --git a/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 089f7256536f..5b0d21f83db7 100644
--- a/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -15,15 +15,18 @@
#include "Analyze.h"
#include "BytesOutputStyle.h"
-#include "Diff.h"
#include "DumpOutputStyle.h"
+#include "ExplainOutputStyle.h"
#include "InputFile.h"
#include "LinePrinter.h"
#include "OutputStyle.h"
+#include "PrettyClassDefinitionDumper.h"
#include "PrettyCompilandDumper.h"
+#include "PrettyEnumDumper.h"
#include "PrettyExternalSymbolDumper.h"
#include "PrettyFunctionDumper.h"
#include "PrettyTypeDumper.h"
+#include "PrettyTypedefDumper.h"
#include "PrettyVariableDumper.h"
#include "YAMLOutputStyle.h"
@@ -45,10 +48,12 @@
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/PDB/GenericError.h"
#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
#include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
#include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
@@ -63,7 +68,11 @@
#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/COM.h"
#include "llvm/Support/CommandLine.h"
@@ -71,6 +80,7 @@
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -82,7 +92,6 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
-
using namespace llvm;
using namespace llvm::codeview;
using namespace llvm::msf;
@@ -97,8 +106,6 @@ cl::SubCommand
PrettySubcommand("pretty",
"Dump semantic information about types and symbols");
-cl::SubCommand DiffSubcommand("diff", "Diff the contents of 2 PDB files");
-
cl::SubCommand
YamlToPdbSubcommand("yaml2pdb",
"Generate a PDB file from a YAML description");
@@ -113,6 +120,12 @@ cl::SubCommand
cl::SubCommand MergeSubcommand("merge",
"Merge multiple PDBs into a single PDB");
+cl::SubCommand ExplainSubcommand("explain",
+ "Explain the meaning of a file offset");
+
+cl::SubCommand ExportSubcommand("export",
+ "Write binary data from a stream to a file");
+
cl::OptionCategory TypeCategory("Symbol Type Options");
cl::OptionCategory FilterCategory("Filtering and Sorting Options");
cl::OptionCategory OtherOptions("Other Options");
@@ -147,6 +160,19 @@ cl::list<std::string> InputFilenames(cl::Positional,
cl::desc("<input PDB files>"),
cl::OneOrMore, cl::sub(PrettySubcommand));
+cl::opt<bool> InjectedSources("injected-sources",
+ cl::desc("Display injected sources"),
+ cl::cat(OtherOptions), cl::sub(PrettySubcommand));
+cl::opt<bool> ShowInjectedSourceContent(
+ "injected-source-content",
+ cl::desc("When displaying an injected source, display the file content"),
+ cl::cat(OtherOptions), cl::sub(PrettySubcommand));
+
+cl::list<std::string> WithName(
+ "with-name",
+ cl::desc("Display any symbol or type with the specified exact name"),
+ cl::cat(TypeCategory), cl::ZeroOrMore, cl::sub(PrettySubcommand));
+
cl::opt<bool> Compilands("compilands", cl::desc("Display compilands"),
cl::cat(TypeCategory), cl::sub(PrettySubcommand));
cl::opt<bool> Symbols("module-syms",
@@ -286,44 +312,6 @@ cl::opt<bool> NoEnumDefs("no-enum-definitions",
cl::cat(FilterCategory), cl::sub(PrettySubcommand));
}
-namespace diff {
-cl::opt<bool> PrintValueColumns(
- "values", cl::init(true),
- cl::desc("Print one column for each PDB with the field value"),
- cl::Optional, cl::sub(DiffSubcommand));
-cl::opt<bool>
- PrintResultColumn("result", cl::init(false),
- cl::desc("Print a column with the result status"),
- cl::Optional, cl::sub(DiffSubcommand));
-
-cl::list<std::string>
- RawModiEquivalences("modi-equivalence", cl::ZeroOrMore,
- cl::value_desc("left,right"),
- cl::desc("Modules with the specified indices will be "
- "treated as referring to the same module"),
- cl::sub(DiffSubcommand));
-
-cl::opt<std::string> LeftRoot(
- "left-bin-root", cl::Optional,
- cl::desc("Treats the specified path as the root of the tree containing "
- "binaries referenced by the left PDB. The root is stripped from "
- "embedded paths when doing equality comparisons."),
- cl::sub(DiffSubcommand));
-cl::opt<std::string> RightRoot(
- "right-bin-root", cl::Optional,
- cl::desc("Treats the specified path as the root of the tree containing "
- "binaries referenced by the right PDB. The root is stripped from "
- "embedded paths when doing equality comparisons"),
- cl::sub(DiffSubcommand));
-
-cl::opt<std::string> Left(cl::Positional, cl::desc("<left>"),
- cl::sub(DiffSubcommand));
-cl::opt<std::string> Right(cl::Positional, cl::desc("<right>"),
- cl::sub(DiffSubcommand));
-
-llvm::DenseMap<uint32_t, uint32_t> Equivalences;
-}
-
cl::OptionCategory FileOptions("Module & File Options");
namespace bytes {
@@ -482,6 +470,10 @@ cl::opt<bool> DumpPublics("publics", cl::desc("dump Publics stream data"),
cl::opt<bool> DumpPublicExtras("public-extras",
cl::desc("dump Publics hashes and address maps"),
cl::cat(SymbolOptions), cl::sub(DumpSubcommand));
+cl::opt<bool>
+ DumpGSIRecords("gsi-records",
+ cl::desc("dump public / global common record stream"),
+ cl::cat(SymbolOptions), cl::sub(DumpSubcommand));
cl::opt<bool> DumpSymbols("symbols", cl::desc("dump module symbols"),
cl::cat(SymbolOptions), cl::sub(DumpSubcommand));
@@ -525,8 +517,16 @@ cl::opt<bool> JustMyCode("jmc", cl::Optional,
cl::cat(FileOptions), cl::sub(DumpSubcommand));
// MISCELLANEOUS OPTIONS
+cl::opt<bool> DumpNamedStreams("named-streams",
+ cl::desc("dump PDB named stream table"),
+ cl::cat(MiscOptions), cl::sub(DumpSubcommand));
+
cl::opt<bool> DumpStringTable("string-table", cl::desc("dump PDB String Table"),
cl::cat(MiscOptions), cl::sub(DumpSubcommand));
+cl::opt<bool> DumpStringTableDetails("string-table-details",
+ cl::desc("dump PDB String Table Details"),
+ cl::cat(MiscOptions),
+ cl::sub(DumpSubcommand));
cl::opt<bool> DumpSectionContribs("section-contribs",
cl::desc("dump section contributions"),
@@ -629,6 +629,47 @@ cl::opt<std::string>
PdbOutputFile("pdb", cl::desc("the name of the PDB file to write"),
cl::sub(MergeSubcommand));
}
+
+namespace explain {
+cl::list<std::string> InputFilename(cl::Positional,
+ cl::desc("<input PDB file>"), cl::Required,
+ cl::sub(ExplainSubcommand));
+
+cl::list<uint64_t> Offsets("offset", cl::desc("The file offset to explain"),
+ cl::sub(ExplainSubcommand), cl::OneOrMore);
+
+cl::opt<InputFileType> InputType(
+ "input-type", cl::desc("Specify how to interpret the input file"),
+ cl::init(InputFileType::PDBFile), cl::Optional, cl::sub(ExplainSubcommand),
+ cl::values(clEnumValN(InputFileType::PDBFile, "pdb-file",
+ "Treat input as a PDB file (default)"),
+ clEnumValN(InputFileType::PDBStream, "pdb-stream",
+ "Treat input as raw contents of PDB stream"),
+ clEnumValN(InputFileType::DBIStream, "dbi-stream",
+ "Treat input as raw contents of DBI stream"),
+ clEnumValN(InputFileType::Names, "names-stream",
+ "Treat input as raw contents of /names named stream"),
+ clEnumValN(InputFileType::ModuleStream, "mod-stream",
+ "Treat input as raw contents of a module stream")));
+} // namespace explain
+
+namespace exportstream {
+cl::list<std::string> InputFilename(cl::Positional,
+ cl::desc("<input PDB file>"), cl::Required,
+ cl::sub(ExportSubcommand));
+cl::opt<std::string> OutputFile("out",
+ cl::desc("The file to write the stream to"),
+ cl::Required, cl::sub(ExportSubcommand));
+cl::opt<std::string>
+ Stream("stream", cl::Required,
+ cl::desc("The index or name of the stream whose contents to export"),
+ cl::sub(ExportSubcommand));
+cl::opt<bool> ForceName("name",
+ cl::desc("Force the interpretation of -stream as a "
+ "string, even if it is a valid integer"),
+ cl::sub(ExportSubcommand), cl::Optional,
+ cl::init(false));
+} // namespace exportstream
}
static ExitOnError ExitOnErr;
@@ -761,7 +802,6 @@ static void pdb2Yaml(StringRef Path) {
}
static void dumpRaw(StringRef Path) {
-
InputFile IF = ExitOnErr(InputFile::open(Path));
auto O = llvm::make_unique<DumpOutputStyle>(IF);
@@ -785,18 +825,6 @@ static void dumpAnalysis(StringRef Path) {
ExitOnErr(O->dump());
}
-static void diff(StringRef Path1, StringRef Path2) {
- std::unique_ptr<IPDBSession> Session1;
- std::unique_ptr<IPDBSession> Session2;
-
- auto &File1 = loadPDB(Path1, Session1);
- auto &File2 = loadPDB(Path2, Session2);
-
- auto O = llvm::make_unique<DiffStyle>(File1, File2);
-
- ExitOnErr(O->dump());
-}
-
bool opts::pretty::shouldDumpSymLevel(SymLevel Search) {
if (SymTypes.empty())
return true;
@@ -840,6 +868,62 @@ bool opts::pretty::compareDataSymbols(
return getTypeLength(*F1) > getTypeLength(*F2);
}
+static std::string stringOr(std::string Str, std::string IfEmpty) {
+ return (Str.empty()) ? IfEmpty : Str;
+}
+
+static void dumpInjectedSources(LinePrinter &Printer, IPDBSession &Session) {
+ auto Sources = Session.getInjectedSources();
+ if (0 == Sources->getChildCount()) {
+ Printer.printLine("There are no injected sources.");
+ return;
+ }
+
+ while (auto IS = Sources->getNext()) {
+ Printer.NewLine();
+ std::string File = stringOr(IS->getFileName(), "<null>");
+ uint64_t Size = IS->getCodeByteSize();
+ std::string Obj = stringOr(IS->getObjectFileName(), "<null>");
+ std::string VFName = stringOr(IS->getVirtualFileName(), "<null>");
+ uint32_t CRC = IS->getCrc32();
+
+ std::string CompressionStr;
+ llvm::raw_string_ostream Stream(CompressionStr);
+ Stream << IS->getCompression();
+ WithColor(Printer, PDB_ColorItem::Path).get() << File;
+ Printer << " (";
+ WithColor(Printer, PDB_ColorItem::LiteralValue).get() << Size;
+ Printer << " bytes): ";
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << "obj";
+ Printer << "=";
+ WithColor(Printer, PDB_ColorItem::Path).get() << Obj;
+ Printer << ", ";
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << "vname";
+ Printer << "=";
+ WithColor(Printer, PDB_ColorItem::Path).get() << VFName;
+ Printer << ", ";
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << "crc";
+ Printer << "=";
+ WithColor(Printer, PDB_ColorItem::LiteralValue).get() << CRC;
+ Printer << ", ";
+ WithColor(Printer, PDB_ColorItem::Keyword).get() << "compression";
+ Printer << "=";
+ WithColor(Printer, PDB_ColorItem::LiteralValue).get() << Stream.str();
+
+ if (!opts::pretty::ShowInjectedSourceContent)
+ continue;
+
+ // Set the indent level to 0 when printing file content.
+ int Indent = Printer.getIndentLevel();
+ Printer.Unindent(Indent);
+
+ Printer.printLine(IS->getCode());
+
+ // Re-indent back to the original level.
+ Printer.Indent(Indent);
+ }
+}
+
static void dumpPretty(StringRef Path) {
std::unique_ptr<IPDBSession> Session;
@@ -857,6 +941,8 @@ static void dumpPretty(StringRef Path) {
LinePrinter Printer(2, UseColor, Stream);
auto GlobalScope(Session->getGlobalScope());
+ if (!GlobalScope)
+ return;
std::string FileName(GlobalScope->getSymbolsFileName());
WithColor(Printer, PDB_ColorItem::None).get() << "Summary for ";
@@ -889,19 +975,96 @@ static void dumpPretty(StringRef Path) {
outs() << "HasPrivateSymbols ";
Printer.Unindent();
+ if (!opts::pretty::WithName.empty()) {
+ Printer.NewLine();
+ WithColor(Printer, PDB_ColorItem::SectionHeader).get()
+ << "---SYMBOLS & TYPES BY NAME---";
+
+ for (StringRef Name : opts::pretty::WithName) {
+ auto Symbols = GlobalScope->findChildren(
+ PDB_SymType::None, Name, PDB_NameSearchFlags::NS_CaseSensitive);
+ if (!Symbols || Symbols->getChildCount() == 0) {
+ Printer.formatLine("[not found] - {0}", Name);
+ continue;
+ }
+ Printer.formatLine("[{0} occurrences] - {1}", Symbols->getChildCount(),
+ Name);
+
+ AutoIndent Indent(Printer);
+ Printer.NewLine();
+
+ while (auto Symbol = Symbols->getNext()) {
+ switch (Symbol->getSymTag()) {
+ case PDB_SymType::Typedef: {
+ TypedefDumper TD(Printer);
+ std::unique_ptr<PDBSymbolTypeTypedef> T =
+ llvm::unique_dyn_cast<PDBSymbolTypeTypedef>(std::move(Symbol));
+ TD.start(*T);
+ break;
+ }
+ case PDB_SymType::Enum: {
+ EnumDumper ED(Printer);
+ std::unique_ptr<PDBSymbolTypeEnum> E =
+ llvm::unique_dyn_cast<PDBSymbolTypeEnum>(std::move(Symbol));
+ ED.start(*E);
+ break;
+ }
+ case PDB_SymType::UDT: {
+ ClassDefinitionDumper CD(Printer);
+ std::unique_ptr<PDBSymbolTypeUDT> C =
+ llvm::unique_dyn_cast<PDBSymbolTypeUDT>(std::move(Symbol));
+ CD.start(*C);
+ break;
+ }
+ case PDB_SymType::BaseClass:
+ case PDB_SymType::Friend: {
+ TypeDumper TD(Printer);
+ Symbol->dump(TD);
+ break;
+ }
+ case PDB_SymType::Function: {
+ FunctionDumper FD(Printer);
+ std::unique_ptr<PDBSymbolFunc> F =
+ llvm::unique_dyn_cast<PDBSymbolFunc>(std::move(Symbol));
+ FD.start(*F, FunctionDumper::PointerType::None);
+ break;
+ }
+ case PDB_SymType::Data: {
+ VariableDumper VD(Printer);
+ std::unique_ptr<PDBSymbolData> D =
+ llvm::unique_dyn_cast<PDBSymbolData>(std::move(Symbol));
+ VD.start(*D);
+ break;
+ }
+ case PDB_SymType::PublicSymbol: {
+ ExternalSymbolDumper ED(Printer);
+ std::unique_ptr<PDBSymbolPublicSymbol> PS =
+ llvm::unique_dyn_cast<PDBSymbolPublicSymbol>(std::move(Symbol));
+ ED.dump(*PS);
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected symbol tag!");
+ }
+ }
+ }
+ llvm::outs().flush();
+ }
+
if (opts::pretty::Compilands) {
Printer.NewLine();
WithColor(Printer, PDB_ColorItem::SectionHeader).get()
<< "---COMPILANDS---";
- Printer.Indent();
- auto Compilands = GlobalScope->findAllChildren<PDBSymbolCompiland>();
- CompilandDumper Dumper(Printer);
- CompilandDumpFlags options = CompilandDumper::Flags::None;
- if (opts::pretty::Lines)
- options = options | CompilandDumper::Flags::Lines;
- while (auto Compiland = Compilands->getNext())
- Dumper.start(*Compiland, options);
- Printer.Unindent();
+ if (auto Compilands = GlobalScope->findAllChildren<PDBSymbolCompiland>()) {
+ Printer.Indent();
+ CompilandDumper Dumper(Printer);
+ CompilandDumpFlags options = CompilandDumper::Flags::None;
+ if (opts::pretty::Lines)
+ options = options | CompilandDumper::Flags::Lines;
+ while (auto Compiland = Compilands->getNext())
+ Dumper.start(*Compiland, options);
+ Printer.Unindent();
+ }
}
if (opts::pretty::Classes || opts::pretty::Enums || opts::pretty::Typedefs) {
@@ -916,12 +1079,13 @@ static void dumpPretty(StringRef Path) {
if (opts::pretty::Symbols) {
Printer.NewLine();
WithColor(Printer, PDB_ColorItem::SectionHeader).get() << "---SYMBOLS---";
- Printer.Indent();
- auto Compilands = GlobalScope->findAllChildren<PDBSymbolCompiland>();
- CompilandDumper Dumper(Printer);
- while (auto Compiland = Compilands->getNext())
- Dumper.start(*Compiland, true);
- Printer.Unindent();
+ if (auto Compilands = GlobalScope->findAllChildren<PDBSymbolCompiland>()) {
+ Printer.Indent();
+ CompilandDumper Dumper(Printer);
+ while (auto Compiland = Compilands->getNext())
+ Dumper.start(*Compiland, true);
+ Printer.Unindent();
+ }
}
if (opts::pretty::Globals) {
@@ -929,45 +1093,49 @@ static void dumpPretty(StringRef Path) {
WithColor(Printer, PDB_ColorItem::SectionHeader).get() << "---GLOBALS---";
Printer.Indent();
if (shouldDumpSymLevel(opts::pretty::SymLevel::Functions)) {
- FunctionDumper Dumper(Printer);
- auto Functions = GlobalScope->findAllChildren<PDBSymbolFunc>();
- if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::None) {
- while (auto Function = Functions->getNext()) {
- Printer.NewLine();
- Dumper.start(*Function, FunctionDumper::PointerType::None);
- }
- } else {
- std::vector<std::unique_ptr<PDBSymbolFunc>> Funcs;
- while (auto Func = Functions->getNext())
- Funcs.push_back(std::move(Func));
- std::sort(Funcs.begin(), Funcs.end(),
- opts::pretty::compareFunctionSymbols);
- for (const auto &Func : Funcs) {
- Printer.NewLine();
- Dumper.start(*Func, FunctionDumper::PointerType::None);
+ if (auto Functions = GlobalScope->findAllChildren<PDBSymbolFunc>()) {
+ FunctionDumper Dumper(Printer);
+ if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::None) {
+ while (auto Function = Functions->getNext()) {
+ Printer.NewLine();
+ Dumper.start(*Function, FunctionDumper::PointerType::None);
+ }
+ } else {
+ std::vector<std::unique_ptr<PDBSymbolFunc>> Funcs;
+ while (auto Func = Functions->getNext())
+ Funcs.push_back(std::move(Func));
+ llvm::sort(Funcs.begin(), Funcs.end(),
+ opts::pretty::compareFunctionSymbols);
+ for (const auto &Func : Funcs) {
+ Printer.NewLine();
+ Dumper.start(*Func, FunctionDumper::PointerType::None);
+ }
}
}
}
if (shouldDumpSymLevel(opts::pretty::SymLevel::Data)) {
- auto Vars = GlobalScope->findAllChildren<PDBSymbolData>();
- VariableDumper Dumper(Printer);
- if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::None) {
- while (auto Var = Vars->getNext())
- Dumper.start(*Var);
- } else {
- std::vector<std::unique_ptr<PDBSymbolData>> Datas;
- while (auto Var = Vars->getNext())
- Datas.push_back(std::move(Var));
- std::sort(Datas.begin(), Datas.end(), opts::pretty::compareDataSymbols);
- for (const auto &Var : Datas)
- Dumper.start(*Var);
+ if (auto Vars = GlobalScope->findAllChildren<PDBSymbolData>()) {
+ VariableDumper Dumper(Printer);
+ if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::None) {
+ while (auto Var = Vars->getNext())
+ Dumper.start(*Var);
+ } else {
+ std::vector<std::unique_ptr<PDBSymbolData>> Datas;
+ while (auto Var = Vars->getNext())
+ Datas.push_back(std::move(Var));
+ llvm::sort(Datas.begin(), Datas.end(),
+ opts::pretty::compareDataSymbols);
+ for (const auto &Var : Datas)
+ Dumper.start(*Var);
+ }
}
}
if (shouldDumpSymLevel(opts::pretty::SymLevel::Thunks)) {
- auto Thunks = GlobalScope->findAllChildren<PDBSymbolThunk>();
- CompilandDumper Dumper(Printer);
- while (auto Thunk = Thunks->getNext())
- Dumper.dump(*Thunk);
+ if (auto Thunks = GlobalScope->findAllChildren<PDBSymbolThunk>()) {
+ CompilandDumper Dumper(Printer);
+ while (auto Thunk = Thunks->getNext())
+ Dumper.dump(*Thunk);
+ }
}
Printer.Unindent();
}
@@ -981,6 +1149,19 @@ static void dumpPretty(StringRef Path) {
if (opts::pretty::Lines) {
Printer.NewLine();
}
+ if (opts::pretty::InjectedSources) {
+ Printer.NewLine();
+ WithColor(Printer, PDB_ColorItem::SectionHeader).get()
+ << "---INJECTED SOURCES---";
+ AutoIndent Indent1(Printer);
+
+ if (ReaderType == PDB_ReaderType::Native)
+ Printer.printLine(
+ "Injected sources are not supported with the native reader.");
+ else
+ dumpInjectedSources(Printer, *Session);
+ }
+
outs().flush();
}
@@ -1033,6 +1214,58 @@ static void mergePdbs() {
ExitOnErr(Builder.commit(OutFile));
}
+static void explain() {
+ std::unique_ptr<IPDBSession> Session;
+ InputFile IF =
+ ExitOnErr(InputFile::open(opts::explain::InputFilename.front(), true));
+
+ for (uint64_t Off : opts::explain::Offsets) {
+ auto O = llvm::make_unique<ExplainOutputStyle>(IF, Off);
+
+ ExitOnErr(O->dump());
+ }
+}
+
+static void exportStream() {
+ std::unique_ptr<IPDBSession> Session;
+ PDBFile &File = loadPDB(opts::exportstream::InputFilename.front(), Session);
+
+ std::unique_ptr<MappedBlockStream> SourceStream;
+ uint32_t Index = 0;
+ bool Success = false;
+ std::string OutFileName = opts::exportstream::OutputFile;
+
+ if (!opts::exportstream::ForceName) {
+ // First try to parse it as an integer, if it fails fall back to treating it
+ // as a named stream.
+ if (to_integer(opts::exportstream::Stream, Index)) {
+ if (Index >= File.getNumStreams()) {
+ errs() << "Error: " << Index << " is not a valid stream index.\n";
+ exit(1);
+ }
+ Success = true;
+ outs() << "Dumping contents of stream index " << Index << " to file "
+ << OutFileName << ".\n";
+ }
+ }
+
+ if (!Success) {
+ InfoStream &IS = cantFail(File.getPDBInfoStream());
+ Index = ExitOnErr(IS.getNamedStreamIndex(opts::exportstream::Stream));
+ outs() << "Dumping contents of stream '" << opts::exportstream::Stream
+ << "' (index " << Index << ") to file " << OutFileName << ".\n";
+ }
+
+ SourceStream = MappedBlockStream::createIndexedStream(
+ File.getMsfLayout(), File.getMsfBuffer(), Index, File.getAllocator());
+ auto OutFile = ExitOnErr(
+ FileOutputBuffer::create(OutFileName, SourceStream->getLength()));
+ FileBufferByteStream DestStream(std::move(OutFile), llvm::support::little);
+ BinaryStreamWriter Writer(DestStream);
+ ExitOnErr(Writer.writeStreamRef(*SourceStream));
+ ExitOnErr(DestStream.commit());
+}
+
static bool parseRange(StringRef Str,
Optional<opts::bytes::NumberRange> &Parsed) {
if (Str.empty())
@@ -1064,21 +1297,11 @@ static void simplifyChunkList(llvm::cl::list<opts::ModuleSubsection> &Chunks) {
Chunks.push_back(opts::ModuleSubsection::All);
}
-int main(int argc_, const char *argv_[]) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv_[0]);
- PrettyStackTraceProgram X(argc_, argv_);
-
+int main(int Argc, const char **Argv) {
+ InitLLVM X(Argc, Argv);
ExitOnErr.setBanner("llvm-pdbutil: ");
- SmallVector<const char *, 256> argv;
- SpecificBumpPtrAllocator<char> ArgAllocator;
- ExitOnErr(errorCodeToError(sys::Process::GetArgumentVector(
- argv, makeArrayRef(argv_, argc_), ArgAllocator)));
-
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
-
- cl::ParseCommandLineOptions(argv.size(), argv.data(), "LLVM PDB Dumper\n");
+ cl::ParseCommandLineOptions(Argc, Argv, "LLVM PDB Dumper\n");
if (opts::BytesSubcommand) {
if (!parseRange(opts::bytes::DumpBlockRangeOpt,
@@ -1113,6 +1336,7 @@ int main(int argc_, const char *argv_[]) {
opts::dump::DumpStreams = true;
opts::dump::DumpStreamBlocks = true;
opts::dump::DumpStringTable = true;
+ opts::dump::DumpStringTableDetails = true;
opts::dump::DumpSummary = true;
opts::dump::DumpSymbols = true;
opts::dump::DumpSymbolStats = true;
@@ -1146,11 +1370,6 @@ int main(int argc_, const char *argv_[]) {
if (opts::pdb2yaml::DumpModules)
opts::pdb2yaml::DbiStream = true;
}
- if (opts::DiffSubcommand) {
- if (!opts::diff::PrintResultColumn && !opts::diff::PrintValueColumns) {
- llvm::errs() << "WARNING: No diff columns specified\n";
- }
- }
llvm::sys::InitializeCOMRAII COM(llvm::sys::COMThreadingMode::MultiThreaded);
@@ -1205,27 +1424,16 @@ int main(int argc_, const char *argv_[]) {
llvm::for_each(opts::dump::InputFilenames, dumpRaw);
} else if (opts::BytesSubcommand) {
llvm::for_each(opts::bytes::InputFilenames, dumpBytes);
- } else if (opts::DiffSubcommand) {
- for (StringRef S : opts::diff::RawModiEquivalences) {
- StringRef Left;
- StringRef Right;
- std::tie(Left, Right) = S.split(',');
- uint32_t X, Y;
- if (!to_integer(Left, X) || !to_integer(Right, Y)) {
- errs() << formatv("invalid value {0} specified for modi equivalence\n",
- S);
- exit(1);
- }
- opts::diff::Equivalences[X] = Y;
- }
-
- diff(opts::diff::Left, opts::diff::Right);
} else if (opts::MergeSubcommand) {
if (opts::merge::InputFilenames.size() < 2) {
errs() << "merge subcommand requires at least 2 input files.\n";
exit(1);
}
mergePdbs();
+ } else if (opts::ExplainSubcommand) {
+ explain();
+ } else if (opts::ExportSubcommand) {
+ exportStream();
}
outs().flush();
diff --git a/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.h b/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.h
index 3ce03d5880af..7496adaeb62f 100644
--- a/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.h
+++ b/contrib/llvm/tools/llvm-pdbutil/llvm-pdbutil.h
@@ -75,6 +75,8 @@ bool compareFunctionSymbols(
bool compareDataSymbols(const std::unique_ptr<llvm::pdb::PDBSymbolData> &F1,
const std::unique_ptr<llvm::pdb::PDBSymbolData> &F2);
+extern llvm::cl::list<std::string> WithName;
+
extern llvm::cl::opt<bool> Compilands;
extern llvm::cl::opt<bool> Symbols;
extern llvm::cl::opt<bool> Globals;
@@ -142,7 +144,9 @@ extern llvm::cl::opt<bool> DumpLines;
extern llvm::cl::opt<bool> DumpInlineeLines;
extern llvm::cl::opt<bool> DumpXmi;
extern llvm::cl::opt<bool> DumpXme;
+extern llvm::cl::opt<bool> DumpNamedStreams;
extern llvm::cl::opt<bool> DumpStringTable;
+extern llvm::cl::opt<bool> DumpStringTableDetails;
extern llvm::cl::opt<bool> DumpTypes;
extern llvm::cl::opt<bool> DumpTypeData;
extern llvm::cl::opt<bool> DumpTypeExtras;
@@ -158,6 +162,7 @@ extern llvm::cl::opt<uint32_t> DumpModi;
extern llvm::cl::opt<bool> JustMyCode;
extern llvm::cl::opt<bool> DumpSymbols;
extern llvm::cl::opt<bool> DumpSymRecordBytes;
+extern llvm::cl::opt<bool> DumpGSIRecords;
extern llvm::cl::opt<bool> DumpGlobals;
extern llvm::cl::opt<bool> DumpGlobalExtras;
extern llvm::cl::opt<bool> DumpPublics;
@@ -187,13 +192,19 @@ extern llvm::cl::list<ModuleSubsection> DumpModuleSubsections;
extern llvm::cl::opt<bool> DumpModuleSyms;
} // namespace pdb2yaml
-namespace diff {
-extern llvm::cl::opt<bool> PrintValueColumns;
-extern llvm::cl::opt<bool> PrintResultColumn;
-extern llvm::DenseMap<uint32_t, uint32_t> Equivalences;
-extern llvm::cl::opt<std::string> LeftRoot;
-extern llvm::cl::opt<std::string> RightRoot;
-} // namespace diff
+namespace explain {
+enum class InputFileType { PDBFile, PDBStream, DBIStream, Names, ModuleStream };
+
+extern llvm::cl::list<std::string> InputFilename;
+extern llvm::cl::list<uint64_t> Offsets;
+extern llvm::cl::opt<InputFileType> InputType;
+} // namespace explain
+
+namespace exportstream {
+extern llvm::cl::opt<std::string> OutputFile;
+extern llvm::cl::opt<std::string> Stream;
+extern llvm::cl::opt<bool> ForceName;
+} // namespace exportstream
}
#endif
diff --git a/contrib/llvm/tools/llvm-profdata/llvm-profdata.cpp b/contrib/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 9afd0ae92eae..1a0b9e127bbc 100644
--- a/contrib/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/contrib/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -24,32 +24,42 @@
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
+#include "llvm/Support/WithColor.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
-enum ProfileFormat { PF_None = 0, PF_Text, PF_Binary, PF_GCC };
+enum ProfileFormat {
+ PF_None = 0,
+ PF_Text,
+ PF_Compact_Binary,
+ PF_GCC,
+ PF_Binary
+};
-static void warn(StringRef Prefix, Twine Message, std::string Whence = "",
+static void warn(Twine Message, std::string Whence = "",
std::string Hint = "") {
- errs() << Prefix;
+ WithColor::warning();
if (!Whence.empty())
errs() << Whence << ": ";
errs() << Message << "\n";
if (!Hint.empty())
- errs() << Hint << "\n";
+ WithColor::note() << Hint << "\n";
}
static void exitWithError(Twine Message, std::string Whence = "",
std::string Hint = "") {
- warn("error: ", Message, Whence, Hint);
+ WithColor::error();
+ if (!Whence.empty())
+ errs() << Whence << ": ";
+ errs() << Message << "\n";
+ if (!Hint.empty())
+ WithColor::note() << Hint << "\n";
::exit(1);
}
@@ -232,7 +242,8 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
if (OutputFilename.compare("-") == 0)
exitWithError("Cannot write indexed profdata format to stdout.");
- if (OutputFormat != PF_Binary && OutputFormat != PF_Text)
+ if (OutputFormat != PF_Binary && OutputFormat != PF_Compact_Binary &&
+ OutputFormat != PF_Text)
exitWithError("Unknown format is specified.");
std::error_code EC;
@@ -298,7 +309,7 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
if (isFatalError(IPE))
exitWithError(make_error<InstrProfError>(IPE), WC->ErrWhence);
else
- warn("warning: ", toString(make_error<InstrProfError>(IPE)),
+ warn(toString(make_error<InstrProfError>(IPE)),
WC->ErrWhence);
}
@@ -312,8 +323,8 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
}
static sampleprof::SampleProfileFormat FormatMap[] = {
- sampleprof::SPF_None, sampleprof::SPF_Text, sampleprof::SPF_Binary,
- sampleprof::SPF_GCC};
+ sampleprof::SPF_None, sampleprof::SPF_Text, sampleprof::SPF_Compact_Binary,
+ sampleprof::SPF_GCC, sampleprof::SPF_Binary};
static void mergeSampleProfile(const WeightedFileVector &Inputs,
StringRef OutputFilename,
@@ -462,6 +473,8 @@ static int merge_main(int argc, const char *argv[]) {
cl::opt<ProfileFormat> OutputFormat(
cl::desc("Format of output profile"), cl::init(PF_Binary),
cl::values(clEnumValN(PF_Binary, "binary", "Binary encoding (default)"),
+ clEnumValN(PF_Compact_Binary, "compbinary",
+ "Compact binary encoding"),
clEnumValN(PF_Text, "text", "Text encoding"),
clEnumValN(PF_GCC, "gcc",
"GCC encoding (only meaningful for -sample)")));
@@ -787,7 +800,7 @@ static int show_main(int argc, const char *argv[]) {
exitWithErrorCode(EC, OutputFilename);
if (ShowAllFunctions && !ShowFunction.empty())
- errs() << "warning: -function argument ignored: showing all functions\n";
+ WithColor::warning() << "-function argument ignored: showing all functions\n";
std::vector<uint32_t> Cutoffs(DetailedSummaryCutoffs.begin(),
DetailedSummaryCutoffs.end());
@@ -802,10 +815,7 @@ static int show_main(int argc, const char *argv[]) {
}
int main(int argc, const char *argv[]) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
StringRef ProgName(sys::path::filename(argv[0]));
if (argc > 1) {
diff --git a/contrib/llvm/tools/llvm-readobj/ARMEHABIPrinter.h b/contrib/llvm/tools/llvm-readobj/ARMEHABIPrinter.h
index 4417aa60fe90..51128f113c4c 100644
--- a/contrib/llvm/tools/llvm-readobj/ARMEHABIPrinter.h
+++ b/contrib/llvm/tools/llvm-readobj/ARMEHABIPrinter.h
@@ -323,10 +323,10 @@ inline void OpcodeDecoder::Decode(const uint8_t *Opcodes, off_t Offset,
template <typename ET>
class PrinterContext {
- typedef typename object::ELFFile<ET>::Elf_Sym Elf_Sym;
- typedef typename object::ELFFile<ET>::Elf_Shdr Elf_Shdr;
- typedef typename object::ELFFile<ET>::Elf_Rel Elf_Rel;
- typedef typename object::ELFFile<ET>::Elf_Word Elf_Word;
+ typedef typename ET::Sym Elf_Sym;
+ typedef typename ET::Shdr Elf_Shdr;
+ typedef typename ET::Rel Elf_Rel;
+ typedef typename ET::Word Elf_Word;
ScopedPrinter &SW;
const object::ELFFile<ET> *ELF;
@@ -386,7 +386,7 @@ PrinterContext<ET>::FunctionAtAddress(unsigned Section,
}
template <typename ET>
-const typename object::ELFFile<ET>::Elf_Shdr *
+const typename ET::Shdr *
PrinterContext<ET>::FindExceptionTable(unsigned IndexSectionIndex,
off_t IndexTableOffset) const {
/// Iterate through the sections, searching for the relocation section
@@ -410,7 +410,7 @@ PrinterContext<ET>::FindExceptionTable(unsigned IndexSectionIndex,
if (R.r_offset != static_cast<unsigned>(IndexTableOffset))
continue;
- typename object::ELFFile<ET>::Elf_Rela RelA;
+ typename ET::Rela RelA;
RelA.r_offset = R.r_offset;
RelA.r_info = R.r_info;
RelA.r_addend = 0;
@@ -586,4 +586,3 @@ void PrinterContext<ET>::PrintUnwindInformation() const {
}
#endif
-
diff --git a/contrib/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp b/contrib/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
index 1a033b1eb42e..a90840b22c8d 100644
--- a/contrib/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/contrib/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -33,7 +33,7 @@
// (.pdata) entry.
//
// The exception data contains information about the frame setup, all of the
-// epilouge scopes (for functions for which there are multiple exit points) and
+// epilogue scopes (for functions for which there are multiple exit points) and
// the associated exception handler. Additionally, the entry contains byte-code
// describing how to unwind the function (c.f. Decoder::decodeOpcodes).
//
diff --git a/contrib/llvm/tools/llvm-readobj/COFFDumper.cpp b/contrib/llvm/tools/llvm-readobj/COFFDumper.cpp
index 0e76e75c085d..0ed4ccd09f6f 100644
--- a/contrib/llvm/tools/llvm-readobj/COFFDumper.cpp
+++ b/contrib/llvm/tools/llvm-readobj/COFFDumper.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the COFF-specific dumper for llvm-readobj.
+/// This file implements the COFF-specific dumper for llvm-readobj.
///
//===----------------------------------------------------------------------===//
@@ -67,6 +67,8 @@ struct LoadConfigTables {
uint32_t GuardFlags = 0;
uint64_t GuardFidTableVA = 0;
uint64_t GuardFidTableCount = 0;
+ uint64_t GuardLJmpTableVA = 0;
+ uint64_t GuardLJmpTableCount = 0;
};
class COFFDumper : public ObjDumper {
@@ -242,7 +244,7 @@ std::error_code createCOFFDumper(const object::ObjectFile *Obj,
} // namespace llvm
-// Given a a section and an offset into this section the function returns the
+// Given a section and an offset into this section the function returns the
// symbol used for the relocation at the offset.
std::error_code COFFDumper::resolveSymbol(const coff_section *Section,
uint64_t Offset, SymbolRef &Sym) {
@@ -605,8 +607,8 @@ void COFFDumper::cacheRelocations() {
RelocMap[Section].push_back(Reloc);
// Sort relocations by address.
- std::sort(RelocMap[Section].begin(), RelocMap[Section].end(),
- relocAddressLess);
+ llvm::sort(RelocMap[Section].begin(), RelocMap[Section].end(),
+ relocAddressLess);
}
}
@@ -767,7 +769,7 @@ void COFFDumper::printRVATable(uint64_t TableVA, uint64_t Count,
for (uintptr_t I = TableStart; I < TableEnd; I += EntrySize) {
uint32_t RVA = *reinterpret_cast<const ulittle32_t *>(I);
raw_ostream &OS = W.startLine();
- OS << "0x" << W.hex(Obj->getImageBase() + RVA);
+ OS << W.hex(Obj->getImageBase() + RVA);
if (PrintExtra)
PrintExtra(OS, reinterpret_cast<const uint8_t *>(I));
OS << '\n';
@@ -800,6 +802,11 @@ void COFFDumper::printCOFFLoadConfig() {
printRVATable(Tables.GuardFidTableVA, Tables.GuardFidTableCount, 4);
}
}
+
+ if (Tables.GuardLJmpTableVA) {
+ ListScope LS(W, "GuardLJmpTable");
+ printRVATable(Tables.GuardLJmpTableVA, Tables.GuardLJmpTableCount, 4);
+ }
}
template <typename T>
@@ -879,6 +886,9 @@ void COFFDumper::printCOFFLoadConfig(const T *Conf, LoadConfigTables &Tables) {
W.printHex("GuardRFVerifyStackPointerFunctionPointer",
Conf->GuardRFVerifyStackPointerFunctionPointer);
W.printHex("HotPatchTableOffset", Conf->HotPatchTableOffset);
+
+ Tables.GuardLJmpTableVA = Conf->GuardLongJumpTargetTable;
+ Tables.GuardLJmpTableCount = Conf->GuardLongJumpTargetCount;
}
void COFFDumper::printBaseOfDataField(const pe32_header *Hdr) {
@@ -892,7 +902,9 @@ void COFFDumper::printCodeViewDebugInfo() {
for (const SectionRef &S : Obj->sections()) {
StringRef SectionName;
error(S.getName(SectionName));
- if (SectionName == ".debug$T")
+ // .debug$T is a standard CodeView type section, while .debug$P is the same
+ // format but used for MSVC precompiled header object files.
+ if (SectionName == ".debug$T" || SectionName == ".debug$P")
printCodeViewTypeSection(SectionName, S);
}
for (const SectionRef &S : Obj->sections()) {
@@ -1812,10 +1824,9 @@ void COFFDumper::printStackMap() const {
if (Obj->isLittleEndian())
prettyPrintStackMap(
- llvm::outs(),
- StackMapV2Parser<support::little>(StackMapContentsArray));
+ W, StackMapV2Parser<support::little>(StackMapContentsArray));
else
- prettyPrintStackMap(llvm::outs(),
+ prettyPrintStackMap(W,
StackMapV2Parser<support::big>(StackMapContentsArray));
}
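
The GuardLJmpTable hunks above reuse printRVATable, which treats each load-config guard table as a flat array of 4-byte RVAs rebased against the image base. A minimal sketch of that walk, assuming a little-endian COFF image already mapped into memory (all names below are illustrative, not from the patch):

// Editor's sketch, not part of the diff: dumping a guard-style RVA table.
#include <cstdint>
#include <cstdio>
#include <cstring>

void dumpRVATable(const uint8_t *Table, uint64_t Count, size_t EntrySize,
                  uint64_t ImageBase) {
  for (uint64_t I = 0; I < Count; ++I) {
    uint32_t RVA;
    // Only the first 4 bytes of each entry hold the RVA; EntrySize may be larger.
    std::memcpy(&RVA, Table + I * EntrySize, sizeof(RVA));
    std::printf("0x%llx\n", (unsigned long long)(ImageBase + RVA));
  }
}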
diff --git a/contrib/llvm/tools/llvm-readobj/COFFImportDumper.cpp b/contrib/llvm/tools/llvm-readobj/COFFImportDumper.cpp
index 3b546b3ef508..18010c34f0f3 100644
--- a/contrib/llvm/tools/llvm-readobj/COFFImportDumper.cpp
+++ b/contrib/llvm/tools/llvm-readobj/COFFImportDumper.cpp
@@ -8,41 +8,51 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the COFF import library dumper for llvm-readobj.
+/// This file implements the COFF import library dumper for llvm-readobj.
///
//===----------------------------------------------------------------------===//
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/COFFImportFile.h"
+#include "llvm/Support/ScopedPrinter.h"
using namespace llvm::object;
namespace llvm {
-void dumpCOFFImportFile(const COFFImportFile *File) {
- outs() << '\n';
- outs() << "File: " << File->getFileName() << "\n";
- outs() << "Format: COFF-import-file\n";
+void dumpCOFFImportFile(const COFFImportFile *File, ScopedPrinter &Writer) {
+ Writer.startLine() << '\n';
+ Writer.printString("File", File->getFileName());
+ Writer.printString("Format", "COFF-import-file");
const coff_import_header *H = File->getCOFFImportHeader();
switch (H->getType()) {
- case COFF::IMPORT_CODE: outs() << "Type: code\n"; break;
- case COFF::IMPORT_DATA: outs() << "Type: data\n"; break;
- case COFF::IMPORT_CONST: outs() << "Type: const\n"; break;
+ case COFF::IMPORT_CODE: Writer.printString("Type", "code"); break;
+ case COFF::IMPORT_DATA: Writer.printString("Type", "data"); break;
+ case COFF::IMPORT_CONST: Writer.printString("Type", "const"); break;
}
switch (H->getNameType()) {
- case COFF::IMPORT_ORDINAL: outs() << "Name type: ordinal\n"; break;
- case COFF::IMPORT_NAME: outs() << "Name type: name\n"; break;
- case COFF::IMPORT_NAME_NOPREFIX: outs() << "Name type: noprefix\n"; break;
- case COFF::IMPORT_NAME_UNDECORATE: outs() << "Name type: undecorate\n"; break;
+ case COFF::IMPORT_ORDINAL:
+ Writer.printString("Name type", "ordinal");
+ break;
+ case COFF::IMPORT_NAME:
+ Writer.printString("Name type", "name");
+ break;
+ case COFF::IMPORT_NAME_NOPREFIX:
+ Writer.printString("Name type", "noprefix");
+ break;
+ case COFF::IMPORT_NAME_UNDECORATE:
+ Writer.printString("Name type", "undecorate");
+ break;
}
for (const object::BasicSymbolRef &Sym : File->symbols()) {
- outs() << "Symbol: ";
- Sym.printName(outs());
- outs() << "\n";
+ raw_ostream &OS = Writer.startLine();
+ OS << "Symbol: ";
+ Sym.printName(OS);
+ OS << "\n";
}
}
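
The conversion above from raw outs() calls to ScopedPrinter is what keeps llvm-readobj output uniformly indented and consistently formatted across dumpers. A small usage sketch, restricted to the ScopedPrinter calls that already appear in this patch:

// Editor's sketch, not part of the diff: typical ScopedPrinter usage.
#include "llvm/Support/ScopedPrinter.h"

static void demo(llvm::ScopedPrinter &W) {
  llvm::DictScope D(W, "ImportFile");      // prints "ImportFile {" and indents
  W.printString("Format", "COFF-import-file");
  W.printNumber("Symbols", 2);
  W.printHex("Characteristics", 0x0u);
  W.startLine() << "free-form line at the current indent\n";
}                                          // DictScope prints the closing "}" here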
diff --git a/contrib/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/contrib/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
new file mode 100644
index 000000000000..5a1eef1d007d
--- /dev/null
+++ b/contrib/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -0,0 +1,245 @@
+//===--- DwarfCFIEHPrinter.h - DWARF-based Unwind Information Printer -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_READOBJ_DWARFCFIEHPRINTER_H
+#define LLVM_TOOLS_LLVM_READOBJ_DWARFCFIEHPRINTER_H
+
+#include "Error.h"
+#include "llvm-readobj.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/type_traits.h"
+
+namespace llvm {
+namespace DwarfCFIEH {
+
+template <typename ELFT>
+class PrinterContext {
+ ScopedPrinter &W;
+ const object::ELFFile<ELFT> *Obj;
+
+ void printEHFrameHdr(uint64_t Offset, uint64_t Address, uint64_t Size) const;
+
+ void printEHFrame(const typename ELFT::Shdr *EHFrameShdr) const;
+
+public:
+ PrinterContext(ScopedPrinter &W, const object::ELFFile<ELFT> *Obj)
+ : W(W), Obj(Obj) {}
+
+ void printUnwindInformation() const;
+};
+
+template <class ELFO>
+static const typename ELFO::Elf_Shdr *findSectionByAddress(const ELFO *Obj,
+ uint64_t Addr) {
+ auto Sections = Obj->sections();
+ if (Error E = Sections.takeError())
+ reportError(toString(std::move(E)));
+
+ for (const auto &Shdr : *Sections)
+ if (Shdr.sh_addr == Addr)
+ return &Shdr;
+ return nullptr;
+}
+
+template <typename ELFT>
+void PrinterContext<ELFT>::printUnwindInformation() const {
+ const typename ELFT::Phdr *EHFramePhdr = nullptr;
+
+ auto PHs = Obj->program_headers();
+ if (Error E = PHs.takeError())
+ reportError(toString(std::move(E)));
+
+ for (const auto &Phdr : *PHs) {
+ if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) {
+ EHFramePhdr = &Phdr;
+ if (Phdr.p_memsz != Phdr.p_filesz)
+ reportError("p_memsz does not match p_filesz for GNU_EH_FRAME");
+ break;
+ }
+ }
+
+ if (EHFramePhdr)
+ printEHFrameHdr(EHFramePhdr->p_offset, EHFramePhdr->p_vaddr,
+ EHFramePhdr->p_memsz);
+
+ auto Sections = Obj->sections();
+ if (Error E = Sections.takeError())
+ reportError(toString(std::move(E)));
+
+ for (const auto &Shdr : *Sections) {
+ auto SectionName = Obj->getSectionName(&Shdr);
+ if (Error E = SectionName.takeError())
+ reportError(toString(std::move(E)));
+
+ if (*SectionName == ".eh_frame")
+ printEHFrame(&Shdr);
+ }
+}
+
+template <typename ELFT>
+void PrinterContext<ELFT>::printEHFrameHdr(uint64_t EHFrameHdrOffset,
+ uint64_t EHFrameHdrAddress,
+ uint64_t EHFrameHdrSize) const {
+ ListScope L(W, "EH_FRAME Header");
+ W.startLine() << format("Address: 0x%" PRIx64 "\n", EHFrameHdrAddress);
+ W.startLine() << format("Offset: 0x%" PRIx64 "\n", EHFrameHdrOffset);
+ W.startLine() << format("Size: 0x%" PRIx64 "\n", EHFrameHdrSize);
+
+ const auto *EHFrameHdrShdr = findSectionByAddress(Obj, EHFrameHdrAddress);
+ if (EHFrameHdrShdr) {
+ auto SectionName = Obj->getSectionName(EHFrameHdrShdr);
+ if (Error E = SectionName.takeError())
+ reportError(toString(std::move(E)));
+
+ W.printString("Corresponding Section", *SectionName);
+ }
+
+ DataExtractor DE(
+ StringRef(reinterpret_cast<const char *>(Obj->base()) + EHFrameHdrOffset,
+ EHFrameHdrSize),
+ ELFT::TargetEndianness == support::endianness::little,
+ ELFT::Is64Bits ? 8 : 4);
+
+ DictScope D(W, "Header");
+ uint32_t Offset = 0;
+
+ auto Version = DE.getU8(&Offset);
+ W.printNumber("version", Version);
+ if (Version != 1)
+ reportError("only version 1 of .eh_frame_hdr is supported");
+
+ uint64_t EHFramePtrEnc = DE.getU8(&Offset);
+ W.startLine() << format("eh_frame_ptr_enc: 0x%" PRIx64 "\n", EHFramePtrEnc);
+ if (EHFramePtrEnc != (dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4))
+ reportError("unexpected encoding eh_frame_ptr_enc");
+
+ uint64_t FDECountEnc = DE.getU8(&Offset);
+ W.startLine() << format("fde_count_enc: 0x%" PRIx64 "\n", FDECountEnc);
+ if (FDECountEnc != dwarf::DW_EH_PE_udata4)
+ reportError("unexpected encoding fde_count_enc");
+
+ uint64_t TableEnc = DE.getU8(&Offset);
+ W.startLine() << format("table_enc: 0x%" PRIx64 "\n", TableEnc);
+ if (TableEnc != (dwarf::DW_EH_PE_datarel | dwarf::DW_EH_PE_sdata4))
+ reportError("unexpected encoding table_enc");
+
+ auto EHFramePtr = DE.getSigned(&Offset, 4) + EHFrameHdrAddress + 4;
+ W.startLine() << format("eh_frame_ptr: 0x%" PRIx64 "\n", EHFramePtr);
+
+ auto FDECount = DE.getUnsigned(&Offset, 4);
+ W.printNumber("fde_count", FDECount);
+
+ unsigned NumEntries = 0;
+ uint64_t PrevPC = 0;
+ while (Offset + 8 <= EHFrameHdrSize && NumEntries < FDECount) {
+ DictScope D(W, std::string("entry ") + std::to_string(NumEntries));
+
+ auto InitialPC = DE.getSigned(&Offset, 4) + EHFrameHdrAddress;
+ W.startLine() << format("initial_location: 0x%" PRIx64 "\n", InitialPC);
+ auto Address = DE.getSigned(&Offset, 4) + EHFrameHdrAddress;
+ W.startLine() << format("address: 0x%" PRIx64 "\n", Address);
+
+ if (InitialPC < PrevPC)
+ reportError("initial_location is out of order");
+
+ PrevPC = InitialPC;
+ ++NumEntries;
+ }
+}
+
+template <typename ELFT>
+void PrinterContext<ELFT>::printEHFrame(
+ const typename ELFT::Shdr *EHFrameShdr) const {
+ uint64_t Address = EHFrameShdr->sh_addr;
+ uint64_t ShOffset = EHFrameShdr->sh_offset;
+ W.startLine() << format(".eh_frame section at offset 0x%" PRIx64
+ " address 0x%" PRIx64 ":\n",
+ ShOffset, Address);
+ W.indent();
+
+ auto Result = Obj->getSectionContents(EHFrameShdr);
+ if (Error E = Result.takeError())
+ reportError(toString(std::move(E)));
+
+ auto Contents = Result.get();
+ DWARFDataExtractor DE(
+ StringRef(reinterpret_cast<const char *>(Contents.data()),
+ Contents.size()),
+ ELFT::TargetEndianness == support::endianness::little,
+ ELFT::Is64Bits ? 8 : 4);
+ DWARFDebugFrame EHFrame(/*IsEH=*/true, /*EHFrameAddress=*/Address);
+ EHFrame.parse(DE);
+
+ for (const auto &Entry : EHFrame) {
+ if (const auto *CIE = dyn_cast<dwarf::CIE>(&Entry)) {
+ W.startLine() << format("[0x%" PRIx64 "] CIE length=%" PRIu64 "\n",
+ Address + CIE->getOffset(),
+ CIE->getLength());
+ W.indent();
+
+ W.printNumber("version", CIE->getVersion());
+ W.printString("augmentation", CIE->getAugmentationString());
+ W.printNumber("code_alignment_factor", CIE->getCodeAlignmentFactor());
+ W.printNumber("data_alignment_factor", CIE->getDataAlignmentFactor());
+ W.printNumber("return_address_register", CIE->getReturnAddressRegister());
+
+ W.getOStream() << "\n";
+ W.startLine() << "Program:\n";
+ W.indent();
+ CIE->cfis().dump(W.getOStream(), nullptr, W.getIndentLevel());
+ W.unindent();
+
+ W.unindent();
+ W.getOStream() << "\n";
+
+ } else if (const auto *FDE = dyn_cast<dwarf::FDE>(&Entry)) {
+ W.startLine() << format("[0x%" PRIx64 "] FDE length=%" PRIu64
+ " cie=[0x%" PRIx64 "]\n",
+ Address + FDE->getOffset(),
+ FDE->getLength(),
+ Address + FDE->getLinkedCIE()->getOffset());
+ W.indent();
+
+ W.startLine() << format("initial_location: 0x%" PRIx64 "\n",
+ FDE->getInitialLocation());
+ W.startLine()
+ << format("address_range: 0x%" PRIx64 " (end : 0x%" PRIx64 ")\n",
+ FDE->getAddressRange(),
+ FDE->getInitialLocation() + FDE->getAddressRange());
+
+ W.getOStream() << "\n";
+ W.startLine() << "Program:\n";
+ W.indent();
+ FDE->cfis().dump(W.getOStream(), nullptr, W.getIndentLevel());
+ W.unindent();
+
+ W.unindent();
+ W.getOStream() << "\n";
+ } else {
+ llvm_unreachable("unexpected DWARF frame kind");
+ }
+ }
+
+ W.unindent();
+}
+
+} // namespace DwarfCFIEH
+} // namespace llvm
+
+#endif
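
printEHFrameHdr above hand-decodes the GNU .eh_frame_hdr lookup section and only accepts the encodings produced by common linkers (pcrel|sdata4 frame pointer, udata4 count, datarel|sdata4 table). Under those assumptions the section reduces to the fixed layout sketched below; this is an editorial summary, not code from the patch:

// Editor's sketch, not part of the diff: the .eh_frame_hdr layout that
// printEHFrameHdr expects (multi-byte fields use the file's endianness).
#include <cstdint>

struct EhFrameHdrFixedPart {
  uint8_t version;            // must be 1
  uint8_t eh_frame_ptr_enc;   // DW_EH_PE_pcrel | DW_EH_PE_sdata4
  uint8_t fde_count_enc;      // DW_EH_PE_udata4
  uint8_t table_enc;          // DW_EH_PE_datarel | DW_EH_PE_sdata4
  int32_t eh_frame_ptr;       // signed offset from this field to .eh_frame
  uint32_t fde_count;         // number of search-table entries that follow
  // Followed by fde_count pairs of { int32_t initial_location; int32_t fde; },
  // each value relative to the start of .eh_frame_hdr and sorted by
  // initial_location so unwinders can binary-search the table.
};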
diff --git a/contrib/llvm/tools/llvm-readobj/ELFDumper.cpp b/contrib/llvm/tools/llvm-readobj/ELFDumper.cpp
index 5605eaea7555..645ec2d7e04b 100644
--- a/contrib/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/contrib/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -8,11 +8,12 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements the ELF-specific dumper for llvm-readobj.
+/// This file implements the ELF-specific dumper for llvm-readobj.
///
//===----------------------------------------------------------------------===//
#include "ARMEHABIPrinter.h"
+#include "DwarfCFIEHPrinter.h"
#include "Error.h"
#include "ObjDumper.h"
#include "StackMapPrinter.h"
@@ -43,6 +44,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MipsABIFlags.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -77,28 +79,32 @@ using namespace ELF;
#define TYPEDEF_ELF_TYPES(ELFT) \
using ELFO = ELFFile<ELFT>; \
- using Elf_Addr = typename ELFO::Elf_Addr; \
- using Elf_Shdr = typename ELFO::Elf_Shdr; \
- using Elf_Sym = typename ELFO::Elf_Sym; \
- using Elf_Dyn = typename ELFO::Elf_Dyn; \
- using Elf_Dyn_Range = typename ELFO::Elf_Dyn_Range; \
- using Elf_Rel = typename ELFO::Elf_Rel; \
- using Elf_Rela = typename ELFO::Elf_Rela; \
- using Elf_Rel_Range = typename ELFO::Elf_Rel_Range; \
- using Elf_Rela_Range = typename ELFO::Elf_Rela_Range; \
- using Elf_Phdr = typename ELFO::Elf_Phdr; \
- using Elf_Half = typename ELFO::Elf_Half; \
- using Elf_Ehdr = typename ELFO::Elf_Ehdr; \
- using Elf_Word = typename ELFO::Elf_Word; \
- using Elf_Hash = typename ELFO::Elf_Hash; \
- using Elf_GnuHash = typename ELFO::Elf_GnuHash; \
- using Elf_Sym_Range = typename ELFO::Elf_Sym_Range; \
- using Elf_Versym = typename ELFO::Elf_Versym; \
- using Elf_Verneed = typename ELFO::Elf_Verneed; \
- using Elf_Vernaux = typename ELFO::Elf_Vernaux; \
- using Elf_Verdef = typename ELFO::Elf_Verdef; \
- using Elf_Verdaux = typename ELFO::Elf_Verdaux; \
- using uintX_t = typename ELFO::uintX_t;
+ using Elf_Addr = typename ELFT::Addr; \
+ using Elf_Shdr = typename ELFT::Shdr; \
+ using Elf_Sym = typename ELFT::Sym; \
+ using Elf_Dyn = typename ELFT::Dyn; \
+ using Elf_Dyn_Range = typename ELFT::DynRange; \
+ using Elf_Rel = typename ELFT::Rel; \
+ using Elf_Rela = typename ELFT::Rela; \
+ using Elf_Relr = typename ELFT::Relr; \
+ using Elf_Rel_Range = typename ELFT::RelRange; \
+ using Elf_Rela_Range = typename ELFT::RelaRange; \
+ using Elf_Relr_Range = typename ELFT::RelrRange; \
+ using Elf_Phdr = typename ELFT::Phdr; \
+ using Elf_Half = typename ELFT::Half; \
+ using Elf_Ehdr = typename ELFT::Ehdr; \
+ using Elf_Word = typename ELFT::Word; \
+ using Elf_Hash = typename ELFT::Hash; \
+ using Elf_GnuHash = typename ELFT::GnuHash; \
+ using Elf_Note = typename ELFT::Note; \
+ using Elf_Sym_Range = typename ELFT::SymRange; \
+ using Elf_Versym = typename ELFT::Versym; \
+ using Elf_Verneed = typename ELFT::Verneed; \
+ using Elf_Vernaux = typename ELFT::Vernaux; \
+ using Elf_Verdef = typename ELFT::Verdef; \
+ using Elf_Verdaux = typename ELFT::Verdaux; \
+ using Elf_CGProfile = typename ELFT::CGProfile; \
+ using uintX_t = typename ELFT::uint;
namespace {
@@ -113,11 +119,11 @@ struct DynRegionInfo {
DynRegionInfo(const void *A, uint64_t S, uint64_t ES)
: Addr(A), Size(S), EntSize(ES) {}
- /// \brief Address in current address space.
+ /// Address in current address space.
const void *Addr = nullptr;
- /// \brief Size in bytes of the region.
+ /// Size in bytes of the region.
uint64_t Size = 0;
- /// \brief Size of each entity in the region.
+ /// Size of each entity in the region.
uint64_t EntSize = 0;
template <typename Type> ArrayRef<Type> getAsArrayRef() const {
@@ -162,8 +168,13 @@ public:
void printHashHistogram() override;
+ void printCGProfile() override;
+ void printAddrsig() override;
+
void printNotes() override;
+ void printELFLinkerOptions() override;
+
private:
std::unique_ptr<DumpStyle<ELFT>> ELFDumperStyle;
@@ -198,6 +209,7 @@ private:
const ELFO *Obj;
DynRegionInfo DynRelRegion;
DynRegionInfo DynRelaRegion;
+ DynRegionInfo DynRelrRegion;
DynRegionInfo DynPLTRelRegion;
DynRegionInfo DynSymRegion;
DynRegionInfo DynamicTable;
@@ -206,6 +218,8 @@ private:
const Elf_Hash *HashTable = nullptr;
const Elf_GnuHash *GnuHashTable = nullptr;
const Elf_Shdr *DotSymtabSec = nullptr;
+ const Elf_Shdr *DotCGProfileSec = nullptr;
+ const Elf_Shdr *DotAddrsigSec = nullptr;
StringRef DynSymtabName;
ArrayRef<Elf_Word> ShndxTable;
@@ -248,18 +262,23 @@ public:
Elf_Rel_Range dyn_rels() const;
Elf_Rela_Range dyn_relas() const;
+ Elf_Relr_Range dyn_relrs() const;
std::string getFullSymbolName(const Elf_Sym *Symbol, StringRef StrTable,
bool IsDynamic) const;
void getSectionNameIndex(const Elf_Sym *Symbol, const Elf_Sym *FirstSym,
StringRef &SectionName,
unsigned &SectionIndex) const;
+ StringRef getStaticSymbolName(uint32_t Index) const;
void printSymbolsHelper(bool IsDynamic) const;
const Elf_Shdr *getDotSymtabSec() const { return DotSymtabSec; }
+ const Elf_Shdr *getDotCGProfileSec() const { return DotCGProfileSec; }
+ const Elf_Shdr *getDotAddrsigSec() const { return DotAddrsigSec; }
ArrayRef<Elf_Word> getShndxTable() const { return ShndxTable; }
StringRef getDynamicStringTable() const { return DynamicStringTable; }
const DynRegionInfo &getDynRelRegion() const { return DynRelRegion; }
const DynRegionInfo &getDynRelaRegion() const { return DynRelaRegion; }
+ const DynRegionInfo &getDynRelrRegion() const { return DynRelrRegion; }
const DynRegionInfo &getDynPLTRelRegion() const { return DynPLTRelRegion; }
const Elf_Hash *getHashTable() const { return HashTable; }
const Elf_GnuHash *getGnuHashTable() const { return GnuHashTable; }
@@ -295,8 +314,8 @@ template <class ELFT> class MipsGOTParser;
template <typename ELFT> class DumpStyle {
public:
- using Elf_Shdr = typename ELFFile<ELFT>::Elf_Shdr;
- using Elf_Sym = typename ELFFile<ELFT>::Elf_Sym;
+ using Elf_Shdr = typename ELFT::Shdr;
+ using Elf_Sym = typename ELFT::Sym;
DumpStyle(ELFDumper<ELFT> *Dumper) : Dumper(Dumper) {}
virtual ~DumpStyle() = default;
@@ -315,7 +334,10 @@ public:
bool IsDynamic) = 0;
virtual void printProgramHeaders(const ELFFile<ELFT> *Obj) = 0;
virtual void printHashHistogram(const ELFFile<ELFT> *Obj) = 0;
+ virtual void printCGProfile(const ELFFile<ELFT> *Obj) = 0;
+ virtual void printAddrsig(const ELFFile<ELFT> *Obj) = 0;
virtual void printNotes(const ELFFile<ELFT> *Obj) = 0;
+ virtual void printELFLinkerOptions(const ELFFile<ELFT> *Obj) = 0;
virtual void printMipsGOT(const MipsGOTParser<ELFT> &Parser) = 0;
virtual void printMipsPLT(const MipsGOTParser<ELFT> &Parser) = 0;
const ELFDumper<ELFT> *dumper() const { return Dumper; }
@@ -344,7 +366,10 @@ public:
size_t Offset) override;
void printProgramHeaders(const ELFO *Obj) override;
void printHashHistogram(const ELFFile<ELFT> *Obj) override;
+ void printCGProfile(const ELFFile<ELFT> *Obj) override;
+ void printAddrsig(const ELFFile<ELFT> *Obj) override;
void printNotes(const ELFFile<ELFT> *Obj) override;
+ void printELFLinkerOptions(const ELFFile<ELFT> *Obj) override;
void printMipsGOT(const MipsGOTParser<ELFT> &Parser) override;
void printMipsPLT(const MipsGOTParser<ELFT> &Parser) override;
@@ -374,6 +399,7 @@ private:
}
void printHashedSymbol(const ELFO *Obj, const Elf_Sym *FirstSym, uint32_t Sym,
StringRef StrTable, uint32_t Bucket);
+ void printRelocHeader(unsigned SType);
void printRelocation(const ELFO *Obj, const Elf_Shdr *SymTab,
const Elf_Rela &R, bool IsRela);
void printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, const Elf_Sym *First,
@@ -404,7 +430,10 @@ public:
void printDynamicRelocations(const ELFO *Obj) override;
void printProgramHeaders(const ELFO *Obj) override;
void printHashHistogram(const ELFFile<ELFT> *Obj) override;
+ void printCGProfile(const ELFFile<ELFT> *Obj) override;
+ void printAddrsig(const ELFFile<ELFT> *Obj) override;
void printNotes(const ELFFile<ELFT> *Obj) override;
+ void printELFLinkerOptions(const ELFFile<ELFT> *Obj) override;
void printMipsGOT(const MipsGOTParser<ELFT> &Parser) override;
void printMipsPLT(const MipsGOTParser<ELFT> &Parser) override;
@@ -730,6 +759,16 @@ StringRef ELFDumper<ELFT>::getSymbolVersion(StringRef StrTab,
}
template <typename ELFT>
+StringRef ELFDumper<ELFT>::getStaticSymbolName(uint32_t Index) const {
+ StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*DotSymtabSec));
+ Elf_Sym_Range Syms = unwrapOrError(Obj->symbols(DotSymtabSec));
+ if (Index >= Syms.size())
+ reportError("Invalid symbol index");
+ const Elf_Sym *Sym = &Syms[Index];
+ return unwrapOrError(Sym->getName(StrTable));
+}
+
+template <typename ELFT>
std::string ELFDumper<ELFT>::getFullSymbolName(const Elf_Sym *Symbol,
StringRef StrTable,
bool IsDynamic) const {
@@ -1007,7 +1046,6 @@ static const EnumEntry<unsigned> ElfMachineType[] = {
ENUM_ENT(EM_56800EX, "EM_56800EX"),
ENUM_ENT(EM_AMDGPU, "EM_AMDGPU"),
ENUM_ENT(EM_RISCV, "RISC-V"),
- ENUM_ENT(EM_WEBASSEMBLY, "EM_WEBASSEMBLY"),
ENUM_ENT(EM_LANAI, "EM_LANAI"),
ENUM_ENT(EM_BPF, "EM_BPF"),
};
@@ -1255,9 +1293,39 @@ static const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
};
static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
- LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_ARCH_NONE),
- LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_ARCH_R600),
- LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_ARCH_GCN)
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_NONE),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_R600),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_R630),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_RS880),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_RV670),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_RV710),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_RV730),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_RV770),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_CEDAR),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_CYPRESS),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_JUNIPER),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_REDWOOD),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_SUMO),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_BARTS),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_CAICOS),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_CAYMAN),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_TURKS),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX600),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX601),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX700),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX701),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX702),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX703),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX704),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX801),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX802),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX803),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX810),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX900),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX902),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX904),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906),
+ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK)
};
static const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
@@ -1353,6 +1421,16 @@ ELFDumper<ELFT>::ELFDumper(const ELFFile<ELFT> *Obj, ScopedPrinter &Writer)
reportError("Multiple SHT_GNU_verneed");
dot_gnu_version_r_sec = &Sec;
break;
+ case ELF::SHT_LLVM_CALL_GRAPH_PROFILE:
+ if (DotCGProfileSec != nullptr)
+ reportError("Multiple .note.llvm.cgprofile");
+ DotCGProfileSec = &Sec;
+ break;
+ case ELF::SHT_LLVM_ADDRSIG:
+ if (DotAddrsigSec != nullptr)
+ reportError("Multiple .llvm_addrsig");
+ DotAddrsigSec = &Sec;
+ break;
}
}
@@ -1427,6 +1505,18 @@ void ELFDumper<ELFT>::parseDynamicTable(
case ELF::DT_RELENT:
DynRelRegion.EntSize = Dyn.getVal();
break;
+ case ELF::DT_RELR:
+ case ELF::DT_ANDROID_RELR:
+ DynRelrRegion.Addr = toMappedAddr(Dyn.getPtr());
+ break;
+ case ELF::DT_RELRSZ:
+ case ELF::DT_ANDROID_RELRSZ:
+ DynRelrRegion.Size = Dyn.getVal();
+ break;
+ case ELF::DT_RELRENT:
+ case ELF::DT_ANDROID_RELRENT:
+ DynRelrRegion.EntSize = Dyn.getVal();
+ break;
case ELF::DT_PLTREL:
if (Dyn.getVal() == DT_REL)
DynPLTRelRegion.EntSize = sizeof(Elf_Rel);
@@ -1460,6 +1550,11 @@ typename ELFDumper<ELFT>::Elf_Rela_Range ELFDumper<ELFT>::dyn_relas() const {
return DynRelaRegion.getAsArrayRef<Elf_Rela>();
}
+template <typename ELFT>
+typename ELFDumper<ELFT>::Elf_Relr_Range ELFDumper<ELFT>::dyn_relrs() const {
+ return DynRelrRegion.getAsArrayRef<Elf_Relr>();
+}
+
template<class ELFT>
void ELFDumper<ELFT>::printFileHeaders() {
ELFDumperStyle->printFileHeaders(Obj);
@@ -1497,93 +1592,69 @@ template <class ELFT> void ELFDumper<ELFT>::printHashHistogram() {
ELFDumperStyle->printHashHistogram(Obj);
}
+template <class ELFT> void ELFDumper<ELFT>::printCGProfile() {
+ ELFDumperStyle->printCGProfile(Obj);
+}
+
template <class ELFT> void ELFDumper<ELFT>::printNotes() {
ELFDumperStyle->printNotes(Obj);
}
-#define LLVM_READOBJ_TYPE_CASE(name) \
- case DT_##name: return #name
+template <class ELFT> void ELFDumper<ELFT>::printELFLinkerOptions() {
+ ELFDumperStyle->printELFLinkerOptions(Obj);
+}
static const char *getTypeString(unsigned Arch, uint64_t Type) {
+#define DYNAMIC_TAG(n, v)
switch (Arch) {
case EM_HEXAGON:
switch (Type) {
- LLVM_READOBJ_TYPE_CASE(HEXAGON_SYMSZ);
- LLVM_READOBJ_TYPE_CASE(HEXAGON_VER);
- LLVM_READOBJ_TYPE_CASE(HEXAGON_PLT);
+#define HEXAGON_DYNAMIC_TAG(name, value) \
+ case DT_##name: \
+ return #name;
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef HEXAGON_DYNAMIC_TAG
}
+
case EM_MIPS:
switch (Type) {
- LLVM_READOBJ_TYPE_CASE(MIPS_RLD_MAP_REL);
- LLVM_READOBJ_TYPE_CASE(MIPS_RLD_VERSION);
- LLVM_READOBJ_TYPE_CASE(MIPS_FLAGS);
- LLVM_READOBJ_TYPE_CASE(MIPS_BASE_ADDRESS);
- LLVM_READOBJ_TYPE_CASE(MIPS_LOCAL_GOTNO);
- LLVM_READOBJ_TYPE_CASE(MIPS_SYMTABNO);
- LLVM_READOBJ_TYPE_CASE(MIPS_UNREFEXTNO);
- LLVM_READOBJ_TYPE_CASE(MIPS_GOTSYM);
- LLVM_READOBJ_TYPE_CASE(MIPS_RLD_MAP);
- LLVM_READOBJ_TYPE_CASE(MIPS_PLTGOT);
- LLVM_READOBJ_TYPE_CASE(MIPS_OPTIONS);
+#define MIPS_DYNAMIC_TAG(name, value) \
+ case DT_##name: \
+ return #name;
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef MIPS_DYNAMIC_TAG
+ }
+
+ case EM_PPC64:
+ switch(Type) {
+#define PPC64_DYNAMIC_TAG(name, value) \
+ case DT_##name: \
+ return #name;
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef PPC64_DYNAMIC_TAG
}
}
+#undef DYNAMIC_TAG
switch (Type) {
- LLVM_READOBJ_TYPE_CASE(ANDROID_REL);
- LLVM_READOBJ_TYPE_CASE(ANDROID_RELSZ);
- LLVM_READOBJ_TYPE_CASE(ANDROID_RELA);
- LLVM_READOBJ_TYPE_CASE(ANDROID_RELASZ);
- LLVM_READOBJ_TYPE_CASE(BIND_NOW);
- LLVM_READOBJ_TYPE_CASE(DEBUG);
- LLVM_READOBJ_TYPE_CASE(FINI);
- LLVM_READOBJ_TYPE_CASE(FINI_ARRAY);
- LLVM_READOBJ_TYPE_CASE(FINI_ARRAYSZ);
- LLVM_READOBJ_TYPE_CASE(FLAGS);
- LLVM_READOBJ_TYPE_CASE(FLAGS_1);
- LLVM_READOBJ_TYPE_CASE(HASH);
- LLVM_READOBJ_TYPE_CASE(INIT);
- LLVM_READOBJ_TYPE_CASE(INIT_ARRAY);
- LLVM_READOBJ_TYPE_CASE(INIT_ARRAYSZ);
- LLVM_READOBJ_TYPE_CASE(PREINIT_ARRAY);
- LLVM_READOBJ_TYPE_CASE(PREINIT_ARRAYSZ);
- LLVM_READOBJ_TYPE_CASE(JMPREL);
- LLVM_READOBJ_TYPE_CASE(NEEDED);
- LLVM_READOBJ_TYPE_CASE(NULL);
- LLVM_READOBJ_TYPE_CASE(PLTGOT);
- LLVM_READOBJ_TYPE_CASE(PLTREL);
- LLVM_READOBJ_TYPE_CASE(PLTRELSZ);
- LLVM_READOBJ_TYPE_CASE(REL);
- LLVM_READOBJ_TYPE_CASE(RELA);
- LLVM_READOBJ_TYPE_CASE(RELENT);
- LLVM_READOBJ_TYPE_CASE(RELSZ);
- LLVM_READOBJ_TYPE_CASE(RELAENT);
- LLVM_READOBJ_TYPE_CASE(RELASZ);
- LLVM_READOBJ_TYPE_CASE(RPATH);
- LLVM_READOBJ_TYPE_CASE(RUNPATH);
- LLVM_READOBJ_TYPE_CASE(SONAME);
- LLVM_READOBJ_TYPE_CASE(STRSZ);
- LLVM_READOBJ_TYPE_CASE(STRTAB);
- LLVM_READOBJ_TYPE_CASE(SYMBOLIC);
- LLVM_READOBJ_TYPE_CASE(SYMENT);
- LLVM_READOBJ_TYPE_CASE(SYMTAB);
- LLVM_READOBJ_TYPE_CASE(TEXTREL);
- LLVM_READOBJ_TYPE_CASE(VERDEF);
- LLVM_READOBJ_TYPE_CASE(VERDEFNUM);
- LLVM_READOBJ_TYPE_CASE(VERNEED);
- LLVM_READOBJ_TYPE_CASE(VERNEEDNUM);
- LLVM_READOBJ_TYPE_CASE(VERSYM);
- LLVM_READOBJ_TYPE_CASE(RELACOUNT);
- LLVM_READOBJ_TYPE_CASE(RELCOUNT);
- LLVM_READOBJ_TYPE_CASE(GNU_HASH);
- LLVM_READOBJ_TYPE_CASE(TLSDESC_PLT);
- LLVM_READOBJ_TYPE_CASE(TLSDESC_GOT);
- LLVM_READOBJ_TYPE_CASE(AUXILIARY);
- LLVM_READOBJ_TYPE_CASE(FILTER);
+// Now handle all dynamic tags except the architecture-specific ones.
+#define MIPS_DYNAMIC_TAG(name, value)
+#define HEXAGON_DYNAMIC_TAG(name, value)
+#define PPC64_DYNAMIC_TAG(name, value)
+// Also ignore marker tags such as DT_HIOS (maps to DT_VERNEEDNUM), etc.
+#define DYNAMIC_TAG_MARKER(name, value)
+#define DYNAMIC_TAG(name, value) \
+ case DT_##name: \
+ return #name;
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef DYNAMIC_TAG
+#undef MIPS_DYNAMIC_TAG
+#undef HEXAGON_DYNAMIC_TAG
+#undef PPC64_DYNAMIC_TAG
+#undef DYNAMIC_TAG_MARKER
default: return "unknown";
}
}
-#undef LLVM_READOBJ_TYPE_CASE
-
#define LLVM_READOBJ_DT_FLAG_ENT(prefix, enum) \
{ #enum, prefix##_##enum }
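
getTypeString now pulls its case labels out of llvm/BinaryFormat/DynamicTags.def rather than a hand-maintained LLVM_READOBJ_TYPE_CASE list: each consumer defines only the tag macros it cares about, includes the .def file, and undefines them again, so tags added to the .def file show up here automatically. A toy version of the same X-macro pattern (the .def file name and tags below are invented for illustration):

// Editor's sketch, not part of the diff: the X-macro pattern used by
// getTypeString, with a hypothetical my_tags.def containing lines like
//   MY_TAG(NEEDED, 1)
//   MY_TAG(SONAME, 14)
static const char *tagName(unsigned Tag) {
  switch (Tag) {
#define MY_TAG(name, value)                                                    \
  case value:                                                                  \
    return #name;
#include "my_tags.def"
#undef MY_TAG
  default:
    return "unknown";
  }
}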
@@ -1771,16 +1842,20 @@ void ELFDumper<ELFT>::printValue(uint64_t Type, uint64_t Value) {
template<class ELFT>
void ELFDumper<ELFT>::printUnwindInfo() {
+ const unsigned Machine = Obj->getHeader()->e_machine;
+ if (Machine == EM_386 || Machine == EM_X86_64) {
+ DwarfCFIEH::PrinterContext<ELFT> Ctx(W, Obj);
+ return Ctx.printUnwindInformation();
+ }
W.startLine() << "UnwindInfo not implemented.\n";
}
namespace {
-template <> void ELFDumper<ELFType<support::little, false>>::printUnwindInfo() {
+template <> void ELFDumper<ELF32LE>::printUnwindInfo() {
const unsigned Machine = Obj->getHeader()->e_machine;
if (Machine == EM_ARM) {
- ARM::EHABI::PrinterContext<ELFType<support::little, false>> Ctx(
- W, Obj, DotSymtabSec);
+ ARM::EHABI::PrinterContext<ELF32LE> Ctx(W, Obj, DotSymtabSec);
return Ctx.PrintUnwindInformation();
}
W.startLine() << "UnwindInfo not implemented.\n";
@@ -1841,9 +1916,8 @@ void ELFDumper<ELFT>::printNeededLibraries() {
std::stable_sort(Libs.begin(), Libs.end());
- for (const auto &L : Libs) {
- outs() << " " << L << "\n";
- }
+ for (const auto &L : Libs)
+ W.startLine() << L << "\n";
}
@@ -1877,7 +1951,7 @@ void ELFDumper<ELFT>::printGnuHashTable() {
}
template <typename ELFT> void ELFDumper<ELFT>::printLoadName() {
- outs() << "LoadName: " << SOName << '\n';
+ W.printString("LoadName", SOName);
}
template <class ELFT>
@@ -1887,7 +1961,7 @@ void ELFDumper<ELFT>::printAttributes() {
namespace {
-template <> void ELFDumper<ELFType<support::little, false>>::printAttributes() {
+template <> void ELFDumper<ELF32LE>::printAttributes() {
if (Obj->getHeader()->e_machine != EM_ARM) {
W.startLine() << "Attributes not implemented.\n";
return;
@@ -2219,7 +2293,9 @@ static const EnumEntry<unsigned> ElfMipsASEFlags[] = {
{"MSA", Mips::AFL_ASE_MSA},
{"MIPS16", Mips::AFL_ASE_MIPS16},
{"microMIPS", Mips::AFL_ASE_MICROMIPS},
- {"XPA", Mips::AFL_ASE_XPA}
+ {"XPA", Mips::AFL_ASE_XPA},
+ {"CRC", Mips::AFL_ASE_CRC},
+ {"GINV", Mips::AFL_ASE_GINV},
};
static const EnumEntry<unsigned> ElfMipsFpABIType[] = {
@@ -2361,14 +2437,18 @@ template <class ELFT> void ELFDumper<ELFT>::printStackMap() const {
ArrayRef<uint8_t> StackMapContentsArray =
unwrapOrError(Obj->getSectionContents(StackMapSection));
- prettyPrintStackMap(outs(), StackMapV2Parser<ELFT::TargetEndianness>(
- StackMapContentsArray));
+ prettyPrintStackMap(
+ W, StackMapV2Parser<ELFT::TargetEndianness>(StackMapContentsArray));
}
template <class ELFT> void ELFDumper<ELFT>::printGroupSections() {
ELFDumperStyle->printGroupSections(Obj);
}
+template <class ELFT> void ELFDumper<ELFT>::printAddrsig() {
+ ELFDumperStyle->printAddrsig(Obj);
+}
+
static inline void printFields(formatted_raw_ostream &OS, StringRef Str1,
StringRef Str2) {
OS.PadToColumn(2u);
@@ -2378,6 +2458,30 @@ static inline void printFields(formatted_raw_ostream &OS, StringRef Str1,
OS.flush();
}
+template <class ELFT>
+static std::string getSectionHeadersNumString(const ELFFile<ELFT> *Obj) {
+ const typename ELFT::Ehdr *ElfHeader = Obj->getHeader();
+ if (ElfHeader->e_shnum != 0)
+ return to_string(ElfHeader->e_shnum);
+
+ ArrayRef<typename ELFT::Shdr> Arr = unwrapOrError(Obj->sections());
+ if (Arr.empty())
+ return "0";
+ return "0 (" + to_string(Arr[0].sh_size) + ")";
+}
+
+template <class ELFT>
+static std::string getSectionHeaderTableIndexString(const ELFFile<ELFT> *Obj) {
+ const typename ELFT::Ehdr *ElfHeader = Obj->getHeader();
+ if (ElfHeader->e_shstrndx != SHN_XINDEX)
+ return to_string(ElfHeader->e_shstrndx);
+
+ ArrayRef<typename ELFT::Shdr> Arr = unwrapOrError(Obj->sections());
+ if (Arr.empty())
+ return "65535 (corrupt: out of range)";
+ return to_string(ElfHeader->e_shstrndx) + " (" + to_string(Arr[0].sh_link) + ")";
+}
+
template <class ELFT> void GNUStyle<ELFT>::printFileHeaders(const ELFO *Obj) {
const Elf_Ehdr *e = Obj->getHeader();
OS << "ELF Header:\n";
@@ -2423,9 +2527,9 @@ template <class ELFT> void GNUStyle<ELFT>::printFileHeaders(const ELFO *Obj) {
printFields(OS, "Number of program headers:", Str);
Str = to_string(e->e_shentsize) + " (bytes)";
printFields(OS, "Size of section headers:", Str);
- Str = to_string(e->e_shnum);
+ Str = getSectionHeadersNumString(Obj);
printFields(OS, "Number of section headers:", Str);
- Str = to_string(e->e_shstrndx);
+ Str = getSectionHeaderTableIndexString(Obj);
printFields(OS, "Section header string table index:", Str);
}
@@ -2440,15 +2544,17 @@ struct GroupSection {
StringRef Signature;
uint64_t ShName;
uint64_t Index;
+ uint32_t Link;
+ uint32_t Info;
uint32_t Type;
std::vector<GroupMember> Members;
};
template <class ELFT>
std::vector<GroupSection> getGroups(const ELFFile<ELFT> *Obj) {
- using Elf_Shdr = typename ELFFile<ELFT>::Elf_Shdr;
- using Elf_Sym = typename ELFFile<ELFT>::Elf_Sym;
- using Elf_Word = typename ELFFile<ELFT>::Elf_Word;
+ using Elf_Shdr = typename ELFT::Shdr;
+ using Elf_Sym = typename ELFT::Sym;
+ using Elf_Word = typename ELFT::Word;
std::vector<GroupSection> Ret;
uint64_t I = 0;
@@ -2466,7 +2572,14 @@ std::vector<GroupSection> getGroups(const ELFFile<ELFT> *Obj) {
StringRef Name = unwrapOrError(Obj->getSectionName(&Sec));
StringRef Signature = StrTable.data() + Sym->st_name;
- Ret.push_back({Name, Signature, Sec.sh_name, I - 1, Data[0], {}});
+ Ret.push_back({Name,
+ Signature,
+ Sec.sh_name,
+ I - 1,
+ Sec.sh_link,
+ Sec.sh_info,
+ Data[0],
+ {}});
std::vector<GroupMember> &GM = Ret.back().Members;
for (uint32_t Ndx : Data.slice(1)) {
@@ -2522,7 +2635,6 @@ void GNUStyle<ELFT>::printRelocation(const ELFO *Obj, const Elf_Shdr *SymTab,
const Elf_Rela &R, bool IsRela) {
std::string Offset, Info, Addend, Value;
SmallString<32> RelocName;
- StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*SymTab));
StringRef TargetName;
const Elf_Sym *Sym = nullptr;
unsigned Width = ELFT::Is64Bits ? 16 : 8;
@@ -2538,6 +2650,7 @@ void GNUStyle<ELFT>::printRelocation(const ELFO *Obj, const Elf_Shdr *SymTab,
Obj->getSection(Sym, SymTab, this->dumper()->getShndxTable()));
TargetName = unwrapOrError(Obj->getSectionName(Sec));
} else if (Sym) {
+ StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*SymTab));
TargetName = unwrapOrError(Sym->getName(StrTable));
}
@@ -2569,35 +2682,62 @@ void GNUStyle<ELFT>::printRelocation(const ELFO *Obj, const Elf_Shdr *SymTab,
OS << "\n";
}
-static inline void printRelocHeader(raw_ostream &OS, bool Is64, bool IsRela) {
- if (Is64)
- OS << " Offset Info Type"
+template <class ELFT> void GNUStyle<ELFT>::printRelocHeader(unsigned SType) {
+ bool IsRela = SType == ELF::SHT_RELA || SType == ELF::SHT_ANDROID_RELA;
+ bool IsRelr = SType == ELF::SHT_RELR || SType == ELF::SHT_ANDROID_RELR;
+ if (ELFT::Is64Bits)
+ OS << " ";
+ else
+ OS << " ";
+ if (IsRelr && opts::RawRelr)
+ OS << "Data ";
+ else
+ OS << "Offset";
+ if (ELFT::Is64Bits)
+ OS << " Info Type"
<< " Symbol's Value Symbol's Name";
else
- OS << " Offset Info Type Sym. Value "
- << "Symbol's Name";
+ OS << " Info Type Sym. Value Symbol's Name";
if (IsRela)
- OS << (IsRela ? " + Addend" : "");
+ OS << " + Addend";
OS << "\n";
}
template <class ELFT> void GNUStyle<ELFT>::printRelocations(const ELFO *Obj) {
bool HasRelocSections = false;
for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) {
- if (Sec.sh_type != ELF::SHT_REL && Sec.sh_type != ELF::SHT_RELA &&
+ if (Sec.sh_type != ELF::SHT_REL &&
+ Sec.sh_type != ELF::SHT_RELA &&
+ Sec.sh_type != ELF::SHT_RELR &&
Sec.sh_type != ELF::SHT_ANDROID_REL &&
- Sec.sh_type != ELF::SHT_ANDROID_RELA)
+ Sec.sh_type != ELF::SHT_ANDROID_RELA &&
+ Sec.sh_type != ELF::SHT_ANDROID_RELR)
continue;
HasRelocSections = true;
StringRef Name = unwrapOrError(Obj->getSectionName(&Sec));
unsigned Entries = Sec.getEntityCount();
+ std::vector<Elf_Rela> AndroidRelas;
+ if (Sec.sh_type == ELF::SHT_ANDROID_REL ||
+ Sec.sh_type == ELF::SHT_ANDROID_RELA) {
+ // Android's packed relocation section needs to be unpacked first
+ // to get the actual number of entries.
+ AndroidRelas = unwrapOrError(Obj->android_relas(&Sec));
+ Entries = AndroidRelas.size();
+ }
+ std::vector<Elf_Rela> RelrRelas;
+ if (!opts::RawRelr && (Sec.sh_type == ELF::SHT_RELR ||
+ Sec.sh_type == ELF::SHT_ANDROID_RELR)) {
+ // .relr.dyn relative relocation section needs to be unpacked first
+ // to get the actual number of entries.
+ Elf_Relr_Range Relrs = unwrapOrError(Obj->relrs(&Sec));
+ RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs));
+ Entries = RelrRelas.size();
+ }
uintX_t Offset = Sec.sh_offset;
OS << "\nRelocation section '" << Name << "' at offset 0x"
<< to_hexString(Offset, false) << " contains " << Entries
<< " entries:\n";
- printRelocHeader(OS, ELFT::Is64Bits,
- Sec.sh_type == ELF::SHT_RELA ||
- Sec.sh_type == ELF::SHT_ANDROID_RELA);
+ printRelocHeader(Sec.sh_type);
const Elf_Shdr *SymTab = unwrapOrError(Obj->getSection(Sec.sh_link));
switch (Sec.sh_type) {
case ELF::SHT_REL:
@@ -2613,9 +2753,19 @@ template <class ELFT> void GNUStyle<ELFT>::printRelocations(const ELFO *Obj) {
for (const auto &R : unwrapOrError(Obj->relas(&Sec)))
printRelocation(Obj, SymTab, R, true);
break;
+ case ELF::SHT_RELR:
+ case ELF::SHT_ANDROID_RELR:
+ if (opts::RawRelr)
+ for (const auto &R : unwrapOrError(Obj->relrs(&Sec)))
+ OS << to_string(format_hex_no_prefix(R, ELFT::Is64Bits ? 16 : 8))
+ << "\n";
+ else
+ for (const auto &R : RelrRelas)
+ printRelocation(Obj, SymTab, R, false);
+ break;
case ELF::SHT_ANDROID_REL:
case ELF::SHT_ANDROID_RELA:
- for (const auto &R : unwrapOrError(Obj->android_relas(&Sec)))
+ for (const auto &R : AndroidRelas)
printRelocation(Obj, SymTab, R, Sec.sh_type == ELF::SHT_ANDROID_RELA);
break;
}
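
Both output styles above lean on ELFFile::decode_relrs to expand SHT_RELR/SHT_ANDROID_RELR into ordinary relative relocations. In the RELR encoding every word is either an even address (one relocation, which also becomes the base for what follows) or an odd bitmap whose set bits mark relocations at consecutive word-sized slots after that base. A self-contained sketch of the decode for a 64-bit target:

// Editor's sketch, not part of the diff: expanding RELR entries into the
// offsets that need a relative relocation (ELF64 word size assumed).
#include <cstdint>
#include <vector>

std::vector<uint64_t> decodeRelr(const std::vector<uint64_t> &Entries) {
  const uint64_t Word = sizeof(uint64_t);
  std::vector<uint64_t> Offsets;
  uint64_t Base = 0;
  for (uint64_t Entry : Entries) {
    if ((Entry & 1) == 0) {
      Offsets.push_back(Entry);   // even: a relocation at this address
      Base = Entry + Word;        // bitmaps that follow start at the next word
    } else {
      // odd: bit i (i >= 1) marks a relocation at Base + (i - 1) * Word
      for (uint64_t I = 0; (Entry >>= 1) != 0; ++I)
        if (Entry & 1)
          Offsets.push_back(Base + I * Word);
      Base += 63 * Word;          // 63 payload bits per bitmap word
    }
  }
  return Offsets;
}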
@@ -2694,8 +2844,17 @@ std::string getSectionTypeString(unsigned Arch, unsigned Type) {
return "GROUP";
case SHT_SYMTAB_SHNDX:
return "SYMTAB SECTION INDICES";
+ case SHT_RELR:
+ case SHT_ANDROID_RELR:
+ return "RELR";
case SHT_LLVM_ODRTAB:
return "LLVM_ODRTAB";
+ case SHT_LLVM_LINKER_OPTIONS:
+ return "LLVM_LINKER_OPTIONS";
+ case SHT_LLVM_CALL_GRAPH_PROFILE:
+ return "LLVM_CALL_GRAPH_PROFILE";
+ case SHT_LLVM_ADDRSIG:
+ return "LLVM_ADDRSIG";
// FIXME: Parse processor specific GNU attributes
case SHT_GNU_ATTRIBUTES:
return "ATTRIBUTES";
@@ -2727,7 +2886,9 @@ template <class ELFT> void GNUStyle<ELFT>::printSections(const ELFO *Obj) {
Bias = 8;
Width = 8;
}
- OS << "There are " << to_string(Obj->getHeader()->e_shnum)
+
+ ArrayRef<Elf_Shdr> Sections = unwrapOrError(Obj->sections());
+ OS << "There are " << to_string(Sections.size())
<< " section headers, starting at offset "
<< "0x" << to_hexString(Obj->getHeader()->e_shoff, false) << ":\n\n";
OS << "Section Headers:\n";
@@ -2746,7 +2907,7 @@ template <class ELFT> void GNUStyle<ELFT>::printSections(const ELFO *Obj) {
printField(f);
OS << "\n";
- for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) {
+ for (const Elf_Shdr &Sec : Sections) {
Number = to_string(SectionIndex);
Fields[0].Str = Number;
Fields[1].Str = unwrapOrError(Obj->getSectionName(&Sec));
@@ -3198,13 +3359,14 @@ template <class ELFT>
void GNUStyle<ELFT>::printDynamicRelocations(const ELFO *Obj) {
const DynRegionInfo &DynRelRegion = this->dumper()->getDynRelRegion();
const DynRegionInfo &DynRelaRegion = this->dumper()->getDynRelaRegion();
+ const DynRegionInfo &DynRelrRegion = this->dumper()->getDynRelrRegion();
const DynRegionInfo &DynPLTRelRegion = this->dumper()->getDynPLTRelRegion();
if (DynRelaRegion.Size > 0) {
OS << "\n'RELA' relocation section at offset "
<< format_hex(reinterpret_cast<const uint8_t *>(DynRelaRegion.Addr) -
Obj->base(),
1) << " contains " << DynRelaRegion.Size << " bytes:\n";
- printRelocHeader(OS, ELFT::Is64Bits, true);
+ printRelocHeader(ELF::SHT_RELA);
for (const Elf_Rela &Rela : this->dumper()->dyn_relas())
printDynamicRelocation(Obj, Rela, true);
}
@@ -3213,7 +3375,7 @@ void GNUStyle<ELFT>::printDynamicRelocations(const ELFO *Obj) {
<< format_hex(reinterpret_cast<const uint8_t *>(DynRelRegion.Addr) -
Obj->base(),
1) << " contains " << DynRelRegion.Size << " bytes:\n";
- printRelocHeader(OS, ELFT::Is64Bits, false);
+ printRelocHeader(ELF::SHT_REL);
for (const Elf_Rel &Rel : this->dumper()->dyn_rels()) {
Elf_Rela Rela;
Rela.r_offset = Rel.r_offset;
@@ -3222,6 +3384,18 @@ void GNUStyle<ELFT>::printDynamicRelocations(const ELFO *Obj) {
printDynamicRelocation(Obj, Rela, false);
}
}
+ if (DynRelrRegion.Size > 0) {
+ OS << "\n'RELR' relocation section at offset "
+ << format_hex(reinterpret_cast<const uint8_t *>(DynRelrRegion.Addr) -
+ Obj->base(),
+ 1) << " contains " << DynRelrRegion.Size << " bytes:\n";
+ printRelocHeader(ELF::SHT_REL);
+ Elf_Relr_Range Relrs = this->dumper()->dyn_relrs();
+ std::vector<Elf_Rela> RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs));
+ for (const Elf_Rela &Rela : RelrRelas) {
+ printDynamicRelocation(Obj, Rela, false);
+ }
+ }
if (DynPLTRelRegion.Size) {
OS << "\n'PLT' relocation section at offset "
<< format_hex(reinterpret_cast<const uint8_t *>(DynPLTRelRegion.Addr) -
@@ -3229,11 +3403,11 @@ void GNUStyle<ELFT>::printDynamicRelocations(const ELFO *Obj) {
1) << " contains " << DynPLTRelRegion.Size << " bytes:\n";
}
if (DynPLTRelRegion.EntSize == sizeof(Elf_Rela)) {
- printRelocHeader(OS, ELFT::Is64Bits, true);
+ printRelocHeader(ELF::SHT_RELA);
for (const Elf_Rela &Rela : DynPLTRelRegion.getAsArrayRef<Elf_Rela>())
printDynamicRelocation(Obj, Rela, true);
} else {
- printRelocHeader(OS, ELFT::Is64Bits, false);
+ printRelocHeader(ELF::SHT_REL);
for (const Elf_Rel &Rel : DynPLTRelRegion.getAsArrayRef<Elf_Rel>()) {
Elf_Rela Rela;
Rela.r_offset = Rel.r_offset;
@@ -3349,6 +3523,16 @@ void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
}
}
+template <class ELFT>
+void GNUStyle<ELFT>::printCGProfile(const ELFFile<ELFT> *Obj) {
+ OS << "GNUStyle::printCGProfile not implemented\n";
+}
+
+template <class ELFT>
+void GNUStyle<ELFT>::printAddrsig(const ELFFile<ELFT> *Obj) {
+ OS << "GNUStyle::printAddrsig not implemented\n";
+}
+
static std::string getGNUNoteTypeName(const uint32_t NT) {
static const struct {
uint32_t ID;
@@ -3358,6 +3542,7 @@ static std::string getGNUNoteTypeName(const uint32_t NT) {
{ELF::NT_GNU_HWCAP, "NT_GNU_HWCAP (DSO-supplied software HWCAP info)"},
{ELF::NT_GNU_BUILD_ID, "NT_GNU_BUILD_ID (unique build ID bitstring)"},
{ELF::NT_GNU_GOLD_VERSION, "NT_GNU_GOLD_VERSION (gold version)"},
+ {ELF::NT_GNU_PROPERTY_TYPE_0, "NT_GNU_PROPERTY_TYPE_0 (property note)"},
};
for (const auto &Note : Notes)
@@ -3422,9 +3607,65 @@ static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
}
template <typename ELFT>
+static void printGNUProperty(raw_ostream &OS, uint32_t Type, uint32_t DataSize,
+ ArrayRef<uint8_t> Data) {
+ switch (Type) {
+ default:
+ OS << format(" <application-specific type 0x%x>\n", Type);
+ return;
+ case GNU_PROPERTY_STACK_SIZE: {
+ OS << " stack size: ";
+ if (DataSize == sizeof(typename ELFT::uint))
+ OS << format("0x%llx\n",
+ (uint64_t)(*(const typename ELFT::Addr *)Data.data()));
+ else
+ OS << format("<corrupt length: 0x%x>\n", DataSize);
+ break;
+ }
+ case GNU_PROPERTY_NO_COPY_ON_PROTECTED:
+ OS << " no copy on protected";
+ if (DataSize)
+ OS << format(" <corrupt length: 0x%x>", DataSize);
+ OS << "\n";
+ break;
+ case GNU_PROPERTY_X86_FEATURE_1_AND:
+ OS << " X86 features: ";
+ if (DataSize != 4 && DataSize != 8) {
+ OS << format("<corrupt length: 0x%x>\n", DataSize);
+ break;
+ }
+ uint64_t CFProtection =
+ (DataSize == 4)
+ ? support::endian::read32<ELFT::TargetEndianness>(Data.data())
+ : support::endian::read64<ELFT::TargetEndianness>(Data.data());
+ if (CFProtection == 0) {
+ OS << "none\n";
+ break;
+ }
+ if (CFProtection & GNU_PROPERTY_X86_FEATURE_1_IBT) {
+ OS << "IBT";
+ CFProtection &= ~GNU_PROPERTY_X86_FEATURE_1_IBT;
+ if (CFProtection)
+ OS << ", ";
+ }
+ if (CFProtection & GNU_PROPERTY_X86_FEATURE_1_SHSTK) {
+ OS << "SHSTK";
+ CFProtection &= ~GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+ if (CFProtection)
+ OS << ", ";
+ }
+ if (CFProtection)
+ OS << format("<unknown flags: 0x%llx>", CFProtection);
+ OS << "\n";
+ break;
+ }
+}
+
+template <typename ELFT>
static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
- ArrayRef<typename ELFFile<ELFT>::Elf_Word> Words,
- size_t Size) {
+ ArrayRef<typename ELFT::Word> Words, size_t Size) {
+ using Elf_Word = typename ELFT::Word;
+
switch (NoteType) {
default:
return;
@@ -3456,15 +3697,37 @@ static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
OS << " Version: "
<< StringRef(reinterpret_cast<const char *>(Words.data()), Size);
break;
- }
+ case ELF::NT_GNU_PROPERTY_TYPE_0:
+ OS << " Properties:";
+
+ ArrayRef<uint8_t> Arr(reinterpret_cast<const uint8_t *>(Words.data()),
+ Size);
+ while (Arr.size() >= 8) {
+ uint32_t Type = *reinterpret_cast<const Elf_Word *>(Arr.data());
+ uint32_t DataSize = *reinterpret_cast<const Elf_Word *>(Arr.data() + 4);
+ Arr = Arr.drop_front(8);
+
+ // Take padding size into account if present.
+ uint64_t PaddedSize = alignTo(DataSize, sizeof(typename ELFT::uint));
+ if (Arr.size() < PaddedSize) {
+ OS << format(" <corrupt type (0x%x) datasz: 0x%x>\n", Type,
+ DataSize);
+ break;
+ }
+ printGNUProperty<ELFT>(OS, Type, DataSize, Arr.take_front(PaddedSize));
+ Arr = Arr.drop_front(PaddedSize);
+ }
+ if (!Arr.empty())
+ OS << " <corrupted GNU_PROPERTY_TYPE_0>";
+ break;
+ }
OS << '\n';
}
template <typename ELFT>
static void printAMDGPUNote(raw_ostream &OS, uint32_t NoteType,
- ArrayRef<typename ELFFile<ELFT>::Elf_Word> Words,
- size_t Size) {
+ ArrayRef<typename ELFT::Word> Words, size_t Size) {
switch (NoteType) {
default:
return;
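
printGNUNote above gains support for NT_GNU_PROPERTY_TYPE_0 notes, whose descriptor is a sequence of (pr_type, pr_datasz, pr_data) records with each record's data padded up to the target word size; printGNUProperty then interprets the individual records. A small sketch of the record-stride computation the parsing loop relies on (the helper name is made up):

// Editor's sketch, not part of the diff: size of one GNU property record,
// mirroring the alignTo(DataSize, sizeof(ELFT::uint)) step in the loop above.
#include <cstddef>
#include <cstdint>

size_t propertyRecordSize(uint32_t PrDatasz, size_t WordSize /* 8 for ELF64 */) {
  size_t Padded = (PrDatasz + WordSize - 1) / WordSize * WordSize;
  return 4 /* pr_type */ + 4 /* pr_datasz */ + Padded;
}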
@@ -3499,66 +3762,66 @@ void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
const Elf_Ehdr *e = Obj->getHeader();
bool IsCore = e->e_type == ELF::ET_CORE;
- auto process = [&](const typename ELFFile<ELFT>::Elf_Off Offset,
- const typename ELFFile<ELFT>::Elf_Addr Size) {
- if (Size <= 0)
- return;
-
- const auto *P = static_cast<const uint8_t *>(Obj->base() + Offset);
- const auto *E = P + Size;
-
+ auto PrintHeader = [&](const typename ELFT::Off Offset,
+ const typename ELFT::Addr Size) {
OS << "Displaying notes found at file offset " << format_hex(Offset, 10)
<< " with length " << format_hex(Size, 10) << ":\n"
<< " Owner Data size\tDescription\n";
+ };
- while (P < E) {
- const Elf_Word *Words = reinterpret_cast<const Elf_Word *>(&P[0]);
-
- uint32_t NameSize = Words[0];
- uint32_t DescriptorSize = Words[1];
- uint32_t Type = Words[2];
-
- ArrayRef<Elf_Word> Descriptor(&Words[3 + (alignTo<4>(NameSize) / 4)],
- alignTo<4>(DescriptorSize) / 4);
-
- StringRef Name;
- if (NameSize)
- Name =
- StringRef(reinterpret_cast<const char *>(&Words[3]), NameSize - 1);
-
- OS << " " << Name << std::string(22 - NameSize, ' ')
- << format_hex(DescriptorSize, 10) << '\t';
-
- if (Name == "GNU") {
- OS << getGNUNoteTypeName(Type) << '\n';
- printGNUNote<ELFT>(OS, Type, Descriptor, DescriptorSize);
- } else if (Name == "FreeBSD") {
- OS << getFreeBSDNoteTypeName(Type) << '\n';
- } else if (Name == "AMD") {
- OS << getAMDGPUNoteTypeName(Type) << '\n';
- printAMDGPUNote<ELFT>(OS, Type, Descriptor, DescriptorSize);
- } else {
- OS << "Unknown note type: (" << format_hex(Type, 10) << ')';
- }
- OS << '\n';
-
- P = P + 3 * sizeof(Elf_Word) + alignTo<4>(NameSize) +
- alignTo<4>(DescriptorSize);
+ auto ProcessNote = [&](const Elf_Note &Note) {
+ StringRef Name = Note.getName();
+ ArrayRef<Elf_Word> Descriptor = Note.getDesc();
+ Elf_Word Type = Note.getType();
+
+ OS << " " << Name << std::string(22 - Name.size(), ' ')
+ << format_hex(Descriptor.size(), 10) << '\t';
+
+ if (Name == "GNU") {
+ OS << getGNUNoteTypeName(Type) << '\n';
+ printGNUNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
+ } else if (Name == "FreeBSD") {
+ OS << getFreeBSDNoteTypeName(Type) << '\n';
+ } else if (Name == "AMD") {
+ OS << getAMDGPUNoteTypeName(Type) << '\n';
+ printAMDGPUNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
+ } else {
+ OS << "Unknown note type: (" << format_hex(Type, 10) << ')';
}
+ OS << '\n';
};
if (IsCore) {
- for (const auto &P : unwrapOrError(Obj->program_headers()))
- if (P.p_type == PT_NOTE)
- process(P.p_offset, P.p_filesz);
+ for (const auto &P : unwrapOrError(Obj->program_headers())) {
+ if (P.p_type != PT_NOTE)
+ continue;
+ PrintHeader(P.p_offset, P.p_filesz);
+ Error Err = Error::success();
+ for (const auto &Note : Obj->notes(P, Err))
+ ProcessNote(Note);
+ if (Err)
+ error(std::move(Err));
+ }
} else {
- for (const auto &S : unwrapOrError(Obj->sections()))
- if (S.sh_type == SHT_NOTE)
- process(S.sh_offset, S.sh_size);
+ for (const auto &S : unwrapOrError(Obj->sections())) {
+ if (S.sh_type != SHT_NOTE)
+ continue;
+ PrintHeader(S.sh_offset, S.sh_size);
+ Error Err = Error::success();
+ for (const auto &Note : Obj->notes(S, Err))
+ ProcessNote(Note);
+ if (Err)
+ error(std::move(Err));
+ }
}
}
template <class ELFT>
+void GNUStyle<ELFT>::printELFLinkerOptions(const ELFFile<ELFT> *Obj) {
+ OS << "printELFLinkerOptions not implemented!\n";
+}
+
+template <class ELFT>
void GNUStyle<ELFT>::printMipsGOT(const MipsGOTParser<ELFT> &Parser) {
size_t Bias = ELFT::Is64Bits ? 8 : 0;
auto PrintEntry = [&](const Elf_Addr *E, StringRef Purpose) {
@@ -3715,7 +3978,7 @@ template <class ELFT> void LLVMStyle<ELFT>::printFileHeaders(const ELFO *Obj) {
unsigned(ELF::EF_MIPS_MACH));
else if (e->e_machine == EM_AMDGPU)
W.printFlags("Flags", e->e_flags, makeArrayRef(ElfHeaderAMDGPUFlags),
- unsigned(ELF::EF_AMDGPU_ARCH));
+ unsigned(ELF::EF_AMDGPU_MACH));
else if (e->e_machine == EM_RISCV)
W.printFlags("Flags", e->e_flags, makeArrayRef(ElfHeaderRISCVFlags));
else
@@ -3724,8 +3987,8 @@ template <class ELFT> void LLVMStyle<ELFT>::printFileHeaders(const ELFO *Obj) {
W.printNumber("ProgramHeaderEntrySize", e->e_phentsize);
W.printNumber("ProgramHeaderCount", e->e_phnum);
W.printNumber("SectionHeaderEntrySize", e->e_shentsize);
- W.printNumber("SectionHeaderCount", e->e_shnum);
- W.printNumber("StringTableSectionIndex", e->e_shstrndx);
+ W.printString("SectionHeaderCount", getSectionHeadersNumString(Obj));
+ W.printString("StringTableSectionIndex", getSectionHeaderTableIndexString(Obj));
}
}
@@ -3738,6 +4001,8 @@ void LLVMStyle<ELFT>::printGroupSections(const ELFO *Obj) {
DictScope D(W, "Group");
W.printNumber("Name", G.Name, G.ShName);
W.printNumber("Index", G.Index);
+ W.printNumber("Link", G.Link);
+ W.printNumber("Info", G.Info);
W.printHex("Type", getGroupType(G.Type), G.Type);
W.startLine() << "Signature: " << G.Signature << "\n";
@@ -3768,9 +4033,12 @@ template <class ELFT> void LLVMStyle<ELFT>::printRelocations(const ELFO *Obj) {
for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) {
++SectionNumber;
- if (Sec.sh_type != ELF::SHT_REL && Sec.sh_type != ELF::SHT_RELA &&
+ if (Sec.sh_type != ELF::SHT_REL &&
+ Sec.sh_type != ELF::SHT_RELA &&
+ Sec.sh_type != ELF::SHT_RELR &&
Sec.sh_type != ELF::SHT_ANDROID_REL &&
- Sec.sh_type != ELF::SHT_ANDROID_RELA)
+ Sec.sh_type != ELF::SHT_ANDROID_RELA &&
+ Sec.sh_type != ELF::SHT_ANDROID_RELR)
continue;
StringRef Name = unwrapOrError(Obj->getSectionName(&Sec));
@@ -3803,6 +4071,19 @@ void LLVMStyle<ELFT>::printRelocations(const Elf_Shdr *Sec, const ELFO *Obj) {
for (const Elf_Rela &R : unwrapOrError(Obj->relas(Sec)))
printRelocation(Obj, R, SymTab);
break;
+ case ELF::SHT_RELR:
+ case ELF::SHT_ANDROID_RELR: {
+ Elf_Relr_Range Relrs = unwrapOrError(Obj->relrs(Sec));
+ if (opts::RawRelr) {
+ for (const Elf_Relr &R : Relrs)
+ W.startLine() << W.hex(R) << "\n";
+ } else {
+ std::vector<Elf_Rela> RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs));
+ for (const Elf_Rela &R : RelrRelas)
+ printRelocation(Obj, R, SymTab);
+ }
+ break;
+ }
case ELF::SHT_ANDROID_REL:
case ELF::SHT_ANDROID_RELA:
for (const Elf_Rela &R : unwrapOrError(Obj->android_relas(Sec)))
@@ -3983,6 +4264,7 @@ template <class ELFT>
void LLVMStyle<ELFT>::printDynamicRelocations(const ELFO *Obj) {
const DynRegionInfo &DynRelRegion = this->dumper()->getDynRelRegion();
const DynRegionInfo &DynRelaRegion = this->dumper()->getDynRelaRegion();
+ const DynRegionInfo &DynRelrRegion = this->dumper()->getDynRelrRegion();
const DynRegionInfo &DynPLTRelRegion = this->dumper()->getDynPLTRelRegion();
if (DynRelRegion.Size && DynRelaRegion.Size)
report_fatal_error("There are both REL and RELA dynamic relocations");
@@ -3999,6 +4281,12 @@ void LLVMStyle<ELFT>::printDynamicRelocations(const ELFO *Obj) {
Rela.r_addend = 0;
printDynamicRelocation(Obj, Rela);
}
+ if (DynRelrRegion.Size > 0) {
+ Elf_Relr_Range Relrs = this->dumper()->dyn_relrs();
+ std::vector<Elf_Rela> RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs));
+ for (const Elf_Rela &Rela : RelrRelas)
+ printDynamicRelocation(Obj, Rela);
+ }
if (DynPLTRelRegion.EntSize == sizeof(Elf_Rela))
for (const Elf_Rela &Rela : DynPLTRelRegion.getAsArrayRef<Elf_Rela>())
printDynamicRelocation(Obj, Rela);
@@ -4062,11 +4350,71 @@ void LLVMStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
}
template <class ELFT>
+void LLVMStyle<ELFT>::printCGProfile(const ELFFile<ELFT> *Obj) {
+ ListScope L(W, "CGProfile");
+ if (!this->dumper()->getDotCGProfileSec())
+ return;
+ auto CGProfile =
+ unwrapOrError(Obj->template getSectionContentsAsArray<Elf_CGProfile>(
+ this->dumper()->getDotCGProfileSec()));
+ for (const Elf_CGProfile &CGPE : CGProfile) {
+ DictScope D(W, "CGProfileEntry");
+ W.printNumber("From", this->dumper()->getStaticSymbolName(CGPE.cgp_from),
+ CGPE.cgp_from);
+ W.printNumber("To", this->dumper()->getStaticSymbolName(CGPE.cgp_to),
+ CGPE.cgp_to);
+ W.printNumber("Weight", CGPE.cgp_weight);
+ }
+}
+
+template <class ELFT>
+void LLVMStyle<ELFT>::printAddrsig(const ELFFile<ELFT> *Obj) {
+ ListScope L(W, "Addrsig");
+ if (!this->dumper()->getDotAddrsigSec())
+ return;
+ ArrayRef<uint8_t> Contents = unwrapOrError(
+ Obj->getSectionContents(this->dumper()->getDotAddrsigSec()));
+ const uint8_t *Cur = Contents.begin();
+ const uint8_t *End = Contents.end();
+ while (Cur != End) {
+ unsigned Size;
+ const char *Err;
+ uint64_t SymIndex = decodeULEB128(Cur, &Size, Contents.end(), &Err);
+ if (Err)
+ reportError(Err);
+ W.printNumber("Sym", this->dumper()->getStaticSymbolName(SymIndex),
+ SymIndex);
+ Cur += Size;
+ }
+}
+
+template <class ELFT>
void LLVMStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
W.startLine() << "printNotes not implemented!\n";
}
template <class ELFT>
+void LLVMStyle<ELFT>::printELFLinkerOptions(const ELFFile<ELFT> *Obj) {
+ ListScope L(W, "LinkerOptions");
+
+ for (const Elf_Shdr &Shdr : unwrapOrError(Obj->sections())) {
+ if (Shdr.sh_type != ELF::SHT_LLVM_LINKER_OPTIONS)
+ continue;
+
+ ArrayRef<uint8_t> Contents = unwrapOrError(Obj->getSectionContents(&Shdr));
+ for (const uint8_t *P = Contents.begin(), *E = Contents.end(); P < E; ) {
+ StringRef Key = StringRef(reinterpret_cast<const char *>(P));
+ StringRef Value =
+ StringRef(reinterpret_cast<const char *>(P) + Key.size() + 1);
+
+ W.printString(Key, Value);
+
+ P = P + Key.size() + Value.size() + 2;
+ }
+ }
+}
+
+template <class ELFT>
void LLVMStyle<ELFT>::printMipsGOT(const MipsGOTParser<ELFT> &Parser) {
auto PrintEntry = [&](const Elf_Addr *E) {
W.printHex("Address", Parser.getGotAddress(E));
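
printAddrsig above reads SHT_LLVM_ADDRSIG as a bare run of ULEB128-encoded symbol indices via decodeULEB128 (the reason for the new Support/LEB128.h include earlier in this file). For reference, a self-contained sketch of the same variable-length decoding, with an invented helper name so it is not confused with the library routine:

// Editor's sketch, not part of the diff: minimal ULEB128 decoding
// (7 payload bits per byte, high bit set means "more bytes follow").
#include <cstddef>
#include <cstdint>

uint64_t readULEB128(const uint8_t *P, const uint8_t *End, size_t &Len) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  Len = 0;
  while (P + Len != End) {
    uint8_t Byte = P[Len++];
    Value |= uint64_t(Byte & 0x7f) << Shift;
    if ((Byte & 0x80) == 0)
      return Value;             // last byte of this number
    Shift += 7;
  }
  return Value;                 // truncated input; real code reports an error
}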
diff --git a/contrib/llvm/tools/llvm-readobj/MachODumper.cpp b/contrib/llvm/tools/llvm-readobj/MachODumper.cpp
index 64178d7b33ad..69ef1556f78d 100644
--- a/contrib/llvm/tools/llvm-readobj/MachODumper.cpp
+++ b/contrib/llvm/tools/llvm-readobj/MachODumper.cpp
@@ -669,12 +669,11 @@ void MachODumper::printStackMap() const {
StackMapContents.size());
if (Obj->isLittleEndian())
- prettyPrintStackMap(
- llvm::outs(),
- StackMapV2Parser<support::little>(StackMapContentsArray));
+ prettyPrintStackMap(
+ W, StackMapV2Parser<support::little>(StackMapContentsArray));
else
- prettyPrintStackMap(llvm::outs(),
- StackMapV2Parser<support::big>(StackMapContentsArray));
+ prettyPrintStackMap(W,
+ StackMapV2Parser<support::big>(StackMapContentsArray));
}
void MachODumper::printNeededLibraries() {
diff --git a/contrib/llvm/tools/llvm-readobj/ObjDumper.cpp b/contrib/llvm/tools/llvm-readobj/ObjDumper.cpp
index 2a0a90e5cfd5..a725140c9d33 100644
--- a/contrib/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/contrib/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -8,13 +8,15 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief This file implements ObjDumper.
+/// This file implements ObjDumper.
///
//===----------------------------------------------------------------------===//
#include "ObjDumper.h"
#include "Error.h"
+#include "llvm-readobj.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/raw_ostream.h"
@@ -25,4 +27,122 @@ ObjDumper::ObjDumper(ScopedPrinter &Writer) : W(Writer) {}
ObjDumper::~ObjDumper() {
}
+static void printAsPrintable(raw_ostream &W, const uint8_t *Start, size_t Len) {
+ for (size_t i = 0; i < Len; i++)
+ W << (isPrint(Start[i]) ? static_cast<char>(Start[i]) : '.');
+}
+
+static Expected<object::SectionRef>
+getSecNameOrIndexAsSecRef(const object::ObjectFile *Obj, StringRef SecName) {
+ char *StrPtr;
+ long SectionIndex = strtol(SecName.data(), &StrPtr, 10);
+ object::SectionRef Section;
+ long SecIndex;
+ if (Obj->isELF())
+ SecIndex = 0;
+ else
+ SecIndex = 1;
+ for (object::SectionRef SecRef : Obj->sections()) {
+ if (*StrPtr) {
+ StringRef SectionName;
+
+ if (std::error_code E = SecRef.getName(SectionName))
+ return errorCodeToError(E);
+
+ if (SectionName == SecName)
+ return SecRef;
+ } else if (SecIndex == SectionIndex)
+ return SecRef;
+
+ SecIndex++;
+ }
+ return make_error<StringError>("invalid section reference",
+ object::object_error::parse_failed);
+}
+
+void ObjDumper::printSectionAsString(const object::ObjectFile *Obj,
+ StringRef SecName) {
+ Expected<object::SectionRef> SectionRefOrError =
+ getSecNameOrIndexAsSecRef(Obj, SecName);
+ if (!SectionRefOrError)
+ error(std::move(SectionRefOrError));
+ object::SectionRef Section = *SectionRefOrError;
+ StringRef SectionName;
+
+ if (std::error_code E = Section.getName(SectionName))
+ error(E);
+ W.startLine() << "String dump of section '" << SectionName << "':\n";
+
+ StringRef SectionContent;
+ Section.getContents(SectionContent);
+
+ const uint8_t *SecContent = SectionContent.bytes_begin();
+ const uint8_t *CurrentWord = SecContent;
+ const uint8_t *SecEnd = SectionContent.bytes_end();
+
+ while (CurrentWord <= SecEnd) {
+ size_t WordSize = strnlen(reinterpret_cast<const char *>(CurrentWord),
+ SecEnd - CurrentWord);
+ if (!WordSize) {
+ CurrentWord++;
+ continue;
+ }
+ W.startLine() << format("[%6tx] ", CurrentWord - SecContent);
+ printAsPrintable(W.startLine(), CurrentWord, WordSize);
+ W.startLine() << '\n';
+ CurrentWord += WordSize + 1;
+ }
+}
+
+void ObjDumper::printSectionAsHex(const object::ObjectFile *Obj,
+ StringRef SecName) {
+ Expected<object::SectionRef> SectionRefOrError =
+ getSecNameOrIndexAsSecRef(Obj, SecName);
+ if (!SectionRefOrError)
+ error(std::move(SectionRefOrError));
+ object::SectionRef Section = *SectionRefOrError;
+ StringRef SectionName;
+
+ if (std::error_code E = Section.getName(SectionName))
+ error(E);
+ W.startLine() << "Hex dump of section '" << SectionName << "':\n";
+
+ StringRef SectionContent;
+ Section.getContents(SectionContent);
+ const uint8_t *SecContent = SectionContent.bytes_begin();
+ const uint8_t *SecEnd = SecContent + SectionContent.size();
+
+ for (const uint8_t *SecPtr = SecContent; SecPtr < SecEnd; SecPtr += 16) {
+ const uint8_t *TmpSecPtr = SecPtr;
+ uint8_t i;
+ uint8_t k;
+
+ W.startLine() << format_hex(SecPtr - SecContent, 10);
+ W.startLine() << ' ';
+ for (i = 0; TmpSecPtr < SecEnd && i < 4; ++i) {
+ for (k = 0; TmpSecPtr < SecEnd && k < 4; k++, TmpSecPtr++) {
+ uint8_t Val = *(reinterpret_cast<const uint8_t *>(TmpSecPtr));
+ W.startLine() << format_hex_no_prefix(Val, 2);
+ }
+ W.startLine() << ' ';
+ }
+
+    // Pad the line so the ASCII column stays aligned. Each of the (4 - i)
+    // four-byte groups we did not print takes 8 characters, plus one
+    // separating space per missing group, i.e. (4 - i) more characters.
+    // Finally, if we stopped in the middle of a group, we still owe the
+    // remaining (8 - (k * 2)) characters of that group.
+ if (i < 4)
+ W.startLine() << format("%*c", (4 - i) * 8 + (4 - i) + (8 - (k * 2)),
+ ' ');
+
+ TmpSecPtr = SecPtr;
+ for (i = 0; TmpSecPtr + i < SecEnd && i < 16; ++i)
+ W.startLine() << (isPrint(TmpSecPtr[i]) ? static_cast<char>(TmpSecPtr[i])
+ : '.');
+
+ W.startLine() << '\n';
+ }
+}
+
} // namespace llvm
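ObjDumper::printSectionAsHex above emits sixteen bytes per line as four space-separated groups of four, followed by an ASCII column, padding short final lines so that column stays aligned. A minimal standalone sketch of that layout for a single line (plain printf instead of ScopedPrinter, hypothetical names, same padding arithmetic):

#include <cctype>
#include <cstdint>
#include <cstdio>

// Print one hex-dump line: offset, up to 16 bytes as 4 groups of 4, then
// the printable-ASCII rendering of the same bytes.
static void hexDumpLine(const uint8_t *Data, size_t Len, size_t Offset) {
  std::printf("0x%08zx ", Offset);
  int Groups = 0, BytesInGroup = 0;
  size_t Pos = 0;
  for (Groups = 0; Pos < Len && Groups < 4; ++Groups) {
    for (BytesInGroup = 0; Pos < Len && BytesInGroup < 4; ++BytesInGroup, ++Pos)
      std::printf("%02x", unsigned(Data[Pos]));
    std::printf(" ");
  }
  // Pad a short line: 8 columns per missing group, one space per missing
  // group, plus the unprinted remainder of a partially filled group.
  if (Groups < 4)
    std::printf("%*c",
                (4 - Groups) * 8 + (4 - Groups) + (8 - BytesInGroup * 2), ' ');
  for (size_t I = 0; I < Len; ++I)
    std::putchar(std::isprint(Data[I]) ? Data[I] : '.');
  std::putchar('\n');
}
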
diff --git a/contrib/llvm/tools/llvm-readobj/ObjDumper.h b/contrib/llvm/tools/llvm-readobj/ObjDumper.h
index c5b331d944a2..8c3a7bec73be 100644
--- a/contrib/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/contrib/llvm/tools/llvm-readobj/ObjDumper.h
@@ -13,6 +13,9 @@
#include <memory>
#include <system_error>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/ObjectFile.h"
+
namespace llvm {
namespace object {
class COFFImportFile;
@@ -41,13 +44,17 @@ public:
virtual void printDynamicTable() { }
virtual void printNeededLibraries() { }
virtual void printProgramHeaders() { }
+ virtual void printSectionAsHex(StringRef SectionName) {}
virtual void printHashTable() { }
virtual void printGnuHashTable() { }
virtual void printLoadName() {}
virtual void printVersionInfo() {}
virtual void printGroupSections() {}
virtual void printHashHistogram() {}
+ virtual void printCGProfile() {}
+ virtual void printAddrsig() {}
virtual void printNotes() {}
+ virtual void printELFLinkerOptions() {}
// Only implemented for ARM ELF at this time.
virtual void printAttributes() { }
@@ -81,6 +88,9 @@ public:
virtual void printStackMap() const = 0;
+ void printSectionAsString(const object::ObjectFile *Obj, StringRef SecName);
+ void printSectionAsHex(const object::ObjectFile *Obj, StringRef SecName);
+
protected:
ScopedPrinter &W;
};
@@ -101,7 +111,8 @@ std::error_code createWasmDumper(const object::ObjectFile *Obj,
ScopedPrinter &Writer,
std::unique_ptr<ObjDumper> &Result);
-void dumpCOFFImportFile(const object::COFFImportFile *File);
+void dumpCOFFImportFile(const object::COFFImportFile *File,
+ ScopedPrinter &Writer);
void dumpCodeViewMergedTypes(
ScopedPrinter &Writer, llvm::codeview::MergingTypeTableBuilder &IDTable,
diff --git a/contrib/llvm/tools/llvm-readobj/StackMapPrinter.h b/contrib/llvm/tools/llvm-readobj/StackMapPrinter.h
index f4ed68e92d78..77a054b178a5 100644
--- a/contrib/llvm/tools/llvm-readobj/StackMapPrinter.h
+++ b/contrib/llvm/tools/llvm-readobj/StackMapPrinter.h
@@ -11,69 +11,70 @@
#define LLVM_TOOLS_LLVM_READOBJ_STACKMAPPRINTER_H
#include "llvm/Object/StackMapParser.h"
+#include "llvm/Support/ScopedPrinter.h"
namespace llvm {
// Pretty print a stackmap to the given ScopedPrinter.
-template <typename OStreamT, typename StackMapParserT>
-void prettyPrintStackMap(OStreamT &OS, const StackMapParserT &SMP) {
+template <typename StackMapParserT>
+void prettyPrintStackMap(ScopedPrinter &W, const StackMapParserT &SMP) {
- OS << "LLVM StackMap Version: " << SMP.getVersion()
- << "\nNum Functions: " << SMP.getNumFunctions();
+ W.printNumber("LLVM StackMap Version", SMP.getVersion());
+ W.printNumber("Num Functions", SMP.getNumFunctions());
// Functions:
for (const auto &F : SMP.functions())
- OS << "\n Function address: " << F.getFunctionAddress()
+ W.startLine() << " Function address: " << F.getFunctionAddress()
<< ", stack size: " << F.getStackSize()
- << ", callsite record count: " << F.getRecordCount();
+ << ", callsite record count: " << F.getRecordCount() << "\n";
// Constants:
- OS << "\nNum Constants: " << SMP.getNumConstants();
+ W.printNumber("Num Constants", SMP.getNumConstants());
unsigned ConstantIndex = 0;
for (const auto &C : SMP.constants())
- OS << "\n #" << ++ConstantIndex << ": " << C.getValue();
+ W.startLine() << " #" << ++ConstantIndex << ": " << C.getValue() << "\n";
// Records:
- OS << "\nNum Records: " << SMP.getNumRecords();
+ W.printNumber("Num Records", SMP.getNumRecords());
for (const auto &R : SMP.records()) {
- OS << "\n Record ID: " << R.getID()
- << ", instruction offset: " << R.getInstructionOffset()
- << "\n " << R.getNumLocations() << " locations:";
+ W.startLine() << " Record ID: " << R.getID()
+ << ", instruction offset: " << R.getInstructionOffset()
+ << "\n";
+ W.startLine() << " " << R.getNumLocations() << " locations:\n";
unsigned LocationIndex = 0;
for (const auto &Loc : R.locations()) {
- OS << "\n #" << ++LocationIndex << ": ";
+ raw_ostream &OS = W.startLine();
+ OS << " #" << ++LocationIndex << ": ";
switch (Loc.getKind()) {
case StackMapParserT::LocationKind::Register:
- OS << "Register R#" << Loc.getDwarfRegNum();
+ OS << "Register R#" << Loc.getDwarfRegNum() << "\n";
break;
case StackMapParserT::LocationKind::Direct:
- OS << "Direct R#" << Loc.getDwarfRegNum() << " + "
- << Loc.getOffset();
+ OS << "Direct R#" << Loc.getDwarfRegNum() << " + " << Loc.getOffset()
+ << "\n";
break;
case StackMapParserT::LocationKind::Indirect:
- OS << "Indirect [R#" << Loc.getDwarfRegNum() << " + "
- << Loc.getOffset() << "]";
+ OS << "Indirect [R#" << Loc.getDwarfRegNum() << " + " << Loc.getOffset()
+ << "]\n";
break;
case StackMapParserT::LocationKind::Constant:
- OS << "Constant " << Loc.getSmallConstant();
+ OS << "Constant " << Loc.getSmallConstant() << "\n";
break;
case StackMapParserT::LocationKind::ConstantIndex:
OS << "ConstantIndex #" << Loc.getConstantIndex() << " ("
- << SMP.getConstant(Loc.getConstantIndex()).getValue() << ")";
+ << SMP.getConstant(Loc.getConstantIndex()).getValue() << ")\n";
break;
}
}
- OS << "\n " << R.getNumLiveOuts() << " live-outs: [ ";
+ raw_ostream &OS = W.startLine();
+ OS << " " << R.getNumLiveOuts() << " live-outs: [ ";
for (const auto &LO : R.liveouts())
OS << "R#" << LO.getDwarfRegNum() << " ("
<< LO.getSizeInBytes() << "-bytes) ";
OS << "]\n";
}
-
- OS << "\n";
-
}
}
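The stackmap change above is part of a recurring theme in this patch: output is routed through a shared ScopedPrinter instead of a raw ostream, so every line picks up consistent indentation. As a hypothetical, much-reduced stand-in (not the real llvm::ScopedPrinter interface) to show the shape of that abstraction:

#include <cstdint>
#include <ostream>
#include <string>

// Toy printer: all output goes through startLine(), which applies the
// current indentation, so nested scopes line up automatically.
class MiniScopedPrinter {
  std::ostream &OS;
  int IndentLevel = 0;

public:
  explicit MiniScopedPrinter(std::ostream &OS) : OS(OS) {}
  void indent() { ++IndentLevel; }
  void unindent() { if (IndentLevel > 0) --IndentLevel; }
  std::ostream &startLine() {
    for (int I = 0; I < IndentLevel; ++I)
      OS << "  ";
    return OS;
  }
  void printNumber(const std::string &Label, uint64_t Value) {
    startLine() << Label << ": " << Value << "\n";
  }
};
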
diff --git a/contrib/llvm/tools/llvm-readobj/WasmDumper.cpp b/contrib/llvm/tools/llvm-readobj/WasmDumper.cpp
index 223c1c752469..ce224836225e 100644
--- a/contrib/llvm/tools/llvm-readobj/WasmDumper.cpp
+++ b/contrib/llvm/tools/llvm-readobj/WasmDumper.cpp
@@ -23,12 +23,11 @@ using namespace object;
namespace {
static const EnumEntry<unsigned> WasmSymbolTypes[] = {
-#define ENUM_ENTRY(X) { #X, static_cast<unsigned>(WasmSymbol::SymbolType::X) }
- ENUM_ENTRY(FUNCTION_IMPORT),
- ENUM_ENTRY(FUNCTION_EXPORT),
- ENUM_ENTRY(GLOBAL_IMPORT),
- ENUM_ENTRY(GLOBAL_EXPORT),
- ENUM_ENTRY(DEBUG_FUNCTION_NAME),
+#define ENUM_ENTRY(X) { #X, wasm::WASM_SYMBOL_TYPE_##X }
+ ENUM_ENTRY(FUNCTION),
+ ENUM_ENTRY(DATA),
+ ENUM_ENTRY(GLOBAL),
+ ENUM_ENTRY(SECTION),
#undef ENUM_ENTRY
};
@@ -81,11 +80,18 @@ void WasmDumper::printRelocation(const SectionRef &Section,
Reloc.getTypeName(RelocTypeName);
const wasm::WasmRelocation &WasmReloc = Obj->getWasmRelocation(Reloc);
+ StringRef SymName;
+ symbol_iterator SI = Reloc.getSymbol();
+ if (SI != Obj->symbol_end())
+ SymName = error(SI->getName());
+
bool HasAddend = false;
switch (RelocType) {
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
+ case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32:
+ case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32:
HasAddend = true;
break;
default:
@@ -95,13 +101,19 @@ void WasmDumper::printRelocation(const SectionRef &Section,
DictScope Group(W, "Relocation");
W.printNumber("Type", RelocTypeName, RelocType);
W.printHex("Offset", Reloc.getOffset());
- W.printHex("Index", WasmReloc.Index);
+ if (!SymName.empty())
+ W.printString("Symbol", SymName);
+ else
+ W.printHex("Index", WasmReloc.Index);
if (HasAddend)
W.printNumber("Addend", WasmReloc.Addend);
} else {
raw_ostream& OS = W.startLine();
- OS << W.hex(Reloc.getOffset()) << " " << RelocTypeName << "["
- << WasmReloc.Index << "]";
+ OS << W.hex(Reloc.getOffset()) << " " << RelocTypeName << " ";
+ if (!SymName.empty())
+ OS << SymName;
+ else
+ OS << WasmReloc.Index;
if (HasAddend)
OS << " " << WasmReloc.Addend;
OS << "\n";
@@ -155,12 +167,10 @@ void WasmDumper::printSections() {
W.printString("Name", WasmSec.Name);
if (WasmSec.Name == "linking") {
const wasm::WasmLinkingData &LinkingData = Obj->linkingData();
- W.printNumber("DataSize", LinkingData.DataSize);
if (!LinkingData.InitFunctions.empty()) {
ListScope Group(W, "InitFunctions");
for (const wasm::WasmInitFunc &F: LinkingData.InitFunctions)
- W.startLine() << F.FunctionIndex << " (priority=" << F.Priority
- << ")\n";
+ W.startLine() << F.Symbol << " (priority=" << F.Priority << ")\n";
}
}
break;
@@ -204,9 +214,9 @@ void WasmDumper::printSections() {
void WasmDumper::printSymbol(const SymbolRef &Sym) {
DictScope D(W, "Symbol");
WasmSymbol Symbol = Obj->getWasmSymbol(Sym.getRawDataRefImpl());
- W.printString("Name", Symbol.Name);
- W.printEnum("Type", static_cast<unsigned>(Symbol.Type), makeArrayRef(WasmSymbolTypes));
- W.printHex("Flags", Symbol.Flags);
+ W.printString("Name", Symbol.Info.Name);
+ W.printEnum("Type", Symbol.Info.Kind, makeArrayRef(WasmSymbolTypes));
+ W.printHex("Flags", Symbol.Info.Flags);
}
}
diff --git a/contrib/llvm/tools/llvm-readobj/llvm-readobj.cpp b/contrib/llvm/tools/llvm-readobj/llvm-readobj.cpp
index c076582794fe..a7236c02b8ae 100644
--- a/contrib/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/contrib/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -34,11 +34,10 @@
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -147,6 +146,18 @@ namespace opts {
cl::alias ProgramHeadersShort("l", cl::desc("Alias for --program-headers"),
cl::aliasopt(ProgramHeaders));
+ // -string-dump
+ cl::list<std::string> StringDump("string-dump", cl::desc("<number|name>"),
+ cl::ZeroOrMore);
+ cl::alias StringDumpShort("p", cl::desc("Alias for --string-dump"),
+ cl::aliasopt(StringDump));
+
+ // -hex-dump
+ cl::list<std::string> HexDump("hex-dump", cl::desc("<number|name>"),
+ cl::ZeroOrMore);
+ cl::alias HexDumpShort("x", cl::desc("Alias for --hex-dump"),
+ cl::aliasopt(HexDump));
+
// -hash-table
cl::opt<bool> HashTable("hash-table",
cl::desc("Display ELF hash table"));
@@ -159,6 +170,10 @@ namespace opts {
cl::opt<bool> ExpandRelocs("expand-relocs",
cl::desc("Expand each shown relocation to multiple lines"));
+ // -raw-relr
+ cl::opt<bool> RawRelr("raw-relr",
+ cl::desc("Do not decode relocations in SHT_RELR section, display raw contents"));
+
// -codeview
cl::opt<bool> CodeView("codeview",
cl::desc("Display CodeView debug information"));
@@ -228,6 +243,11 @@ namespace opts {
COFFLoadConfig("coff-load-config",
cl::desc("Display the PE/COFF load config"));
+ // -elf-linker-options
+ cl::opt<bool>
+ ELFLinkerOptions("elf-linker-options",
+ cl::desc("Display the ELF .linker-options section"));
+
// -macho-data-in-code
cl::opt<bool>
MachODataInCode("macho-data-in-code",
@@ -280,6 +300,11 @@ namespace opts {
cl::alias HashHistogramShort("I", cl::desc("Alias for -elf-hash-histogram"),
cl::aliasopt(HashHistogram));
+ cl::opt<bool> CGProfile("elf-cg-profile", cl::desc("Display callgraph profile section"));
+
+ cl::opt<bool> Addrsig("elf-addrsig",
+ cl::desc("Display address-significance table"));
+
cl::opt<OutputStyleTy>
Output("elf-output-style", cl::desc("Specify ELF dump style"),
cl::values(clEnumVal(LLVM, "LLVM default style"),
@@ -355,7 +380,7 @@ struct ReadObjTypeTableBuilder {
}
static ReadObjTypeTableBuilder CVTypes;
-/// @brief Creates an format-specific object file dumper.
+/// Creates a format-specific object file dumper.
static std::error_code createDumper(const ObjectFile *Obj,
ScopedPrinter &Writer,
std::unique_ptr<ObjDumper> &Result) {
@@ -374,20 +399,20 @@ static std::error_code createDumper(const ObjectFile *Obj,
return readobj_error::unsupported_obj_file_format;
}
-/// @brief Dumps the specified object file.
-static void dumpObject(const ObjectFile *Obj) {
- ScopedPrinter Writer(outs());
+/// Dumps the specified object file.
+static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer) {
std::unique_ptr<ObjDumper> Dumper;
if (std::error_code EC = createDumper(Obj, Writer, Dumper))
reportError(Obj->getFileName(), EC);
if (opts::Output == opts::LLVM) {
- outs() << '\n';
- outs() << "File: " << Obj->getFileName() << "\n";
- outs() << "Format: " << Obj->getFileFormatName() << "\n";
- outs() << "Arch: " << Triple::getArchTypeName(
- (llvm::Triple::ArchType)Obj->getArch()) << "\n";
- outs() << "AddressSize: " << (8 * Obj->getBytesInAddress()) << "bit\n";
+ Writer.startLine() << "\n";
+ Writer.printString("File", Obj->getFileName());
+ Writer.printString("Format", Obj->getFileFormatName());
+ Writer.printString("Arch", Triple::getArchTypeName(
+ (llvm::Triple::ArchType)Obj->getArch()));
+ Writer.printString("AddressSize",
+ formatv("{0}bit", 8 * Obj->getBytesInAddress()));
Dumper->printLoadName();
}
@@ -411,6 +436,14 @@ static void dumpObject(const ObjectFile *Obj) {
Dumper->printNeededLibraries();
if (opts::ProgramHeaders)
Dumper->printProgramHeaders();
+ if (!opts::StringDump.empty())
+ llvm::for_each(opts::StringDump, [&Dumper, Obj](StringRef SectionName) {
+ Dumper->printSectionAsString(Obj, SectionName);
+ });
+ if (!opts::HexDump.empty())
+ llvm::for_each(opts::HexDump, [&Dumper, Obj](StringRef SectionName) {
+ Dumper->printSectionAsHex(Obj, SectionName);
+ });
if (opts::HashTable)
Dumper->printHashTable();
if (opts::GnuHashTable)
@@ -418,6 +451,8 @@ static void dumpObject(const ObjectFile *Obj) {
if (opts::VersionInfo)
Dumper->printVersionInfo();
if (Obj->isELF()) {
+ if (opts::ELFLinkerOptions)
+ Dumper->printELFLinkerOptions();
if (Obj->getArch() == llvm::Triple::arm)
if (opts::ARMAttributes)
Dumper->printAttributes();
@@ -435,6 +470,10 @@ static void dumpObject(const ObjectFile *Obj) {
Dumper->printGroupSections();
if (opts::HashHistogram)
Dumper->printHashHistogram();
+ if (opts::CGProfile)
+ Dumper->printCGProfile();
+ if (opts::Addrsig)
+ Dumper->printAddrsig();
if (opts::Notes)
Dumper->printNotes();
}
@@ -476,8 +515,8 @@ static void dumpObject(const ObjectFile *Obj) {
Dumper->printStackMap();
}
-/// @brief Dumps each object file in \a Arc;
-static void dumpArchive(const Archive *Arc) {
+/// Dumps each object file in \a Arc.
+static void dumpArchive(const Archive *Arc, ScopedPrinter &Writer) {
Error Err = Error::success();
for (auto &Child : Arc->children(Err)) {
Expected<std::unique_ptr<Binary>> ChildOrErr = Child.getAsBinary();
@@ -488,9 +527,9 @@ static void dumpArchive(const Archive *Arc) {
continue;
}
if (ObjectFile *Obj = dyn_cast<ObjectFile>(&*ChildOrErr.get()))
- dumpObject(Obj);
+ dumpObject(Obj, Writer);
else if (COFFImportFile *Imp = dyn_cast<COFFImportFile>(&*ChildOrErr.get()))
- dumpCOFFImportFile(Imp);
+ dumpCOFFImportFile(Imp, Writer);
else
reportError(Arc->getFileName(), readobj_error::unrecognized_file_format);
}
@@ -498,21 +537,22 @@ static void dumpArchive(const Archive *Arc) {
reportError(Arc->getFileName(), std::move(Err));
}
-/// @brief Dumps each object file in \a MachO Universal Binary;
-static void dumpMachOUniversalBinary(const MachOUniversalBinary *UBinary) {
+/// Dumps each object file in the MachO Universal Binary \a UBinary.
+static void dumpMachOUniversalBinary(const MachOUniversalBinary *UBinary,
+ ScopedPrinter &Writer) {
for (const MachOUniversalBinary::ObjectForArch &Obj : UBinary->objects()) {
Expected<std::unique_ptr<MachOObjectFile>> ObjOrErr = Obj.getAsObjectFile();
if (ObjOrErr)
- dumpObject(&*ObjOrErr.get());
+ dumpObject(&*ObjOrErr.get(), Writer);
else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) {
reportError(UBinary->getFileName(), ObjOrErr.takeError());
}
else if (Expected<std::unique_ptr<Archive>> AOrErr = Obj.getAsArchive())
- dumpArchive(&*AOrErr.get());
+ dumpArchive(&*AOrErr.get(), Writer);
}
}
-/// @brief Dumps \a WinRes, Windows Resource (.res) file;
+/// Dumps \a WinRes, a Windows Resource (.res) file.
static void dumpWindowsResourceFile(WindowsResource *WinRes) {
ScopedPrinter Printer{outs()};
WindowsRes::Dumper Dumper(WinRes, Printer);
@@ -521,8 +561,9 @@ static void dumpWindowsResourceFile(WindowsResource *WinRes) {
}
-/// @brief Opens \a File and dumps it.
+/// Opens \a File and dumps it.
static void dumpInput(StringRef File) {
+ ScopedPrinter Writer(outs());
// Attempt to open the binary.
Expected<OwningBinary<Binary>> BinaryOrErr = createBinary(File);
@@ -531,14 +572,14 @@ static void dumpInput(StringRef File) {
Binary &Binary = *BinaryOrErr.get().getBinary();
if (Archive *Arc = dyn_cast<Archive>(&Binary))
- dumpArchive(Arc);
+ dumpArchive(Arc, Writer);
else if (MachOUniversalBinary *UBinary =
dyn_cast<MachOUniversalBinary>(&Binary))
- dumpMachOUniversalBinary(UBinary);
+ dumpMachOUniversalBinary(UBinary, Writer);
else if (ObjectFile *Obj = dyn_cast<ObjectFile>(&Binary))
- dumpObject(Obj);
+ dumpObject(Obj, Writer);
else if (COFFImportFile *Import = dyn_cast<COFFImportFile>(&Binary))
- dumpCOFFImportFile(Import);
+ dumpCOFFImportFile(Import, Writer);
else if (WindowsResource *WinRes = dyn_cast<WindowsResource>(&Binary))
dumpWindowsResourceFile(WinRes);
else
@@ -546,17 +587,14 @@ static void dumpInput(StringRef File) {
}
int main(int argc, const char *argv[]) {
- StringRef ToolName = argv[0];
- sys::PrintStackTraceOnErrorSignal(ToolName);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y;
+ InitLLVM X(argc, argv);
// Register the target printer for --version.
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
opts::WideOutput.setHiddenFlag(cl::Hidden);
- if (sys::path::stem(ToolName).find("readelf") != StringRef::npos)
+ if (sys::path::stem(argv[0]).find("readelf") != StringRef::npos)
opts::Output = opts::GNU;
cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n");
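The new -p/--string-dump and -x/--hex-dump arguments accept either a section index or a section name, resolved by getSecNameOrIndexAsSecRef in ObjDumper.cpp: if the argument is entirely numeric it selects by index (ELF counts sections from 0, other formats from 1), otherwise by name. A simplified sketch of that resolution with hypothetical types, independent of the llvm::object API:

#include <cstdlib>
#include <string>
#include <vector>

struct SectionInfo { std::string Name; };  // hypothetical stand-in

// Return the position of the matching section, or -1 if none matches.
static int findSection(const std::vector<SectionInfo> &Sections,
                       const std::string &Arg, bool IndicesStartAtZero) {
  char *Rest = nullptr;
  long Index = std::strtol(Arg.c_str(), &Rest, 10);
  bool ByName = (Rest && *Rest != '\0');   // non-numeric suffix => name lookup
  long Cur = IndicesStartAtZero ? 0 : 1;   // ELF starts at 0, others at 1
  for (size_t I = 0; I < Sections.size(); ++I, ++Cur) {
    if (ByName) {
      if (Sections[I].Name == Arg)
        return int(I);
    } else if (Cur == Index) {
      return int(I);
    }
  }
  return -1;
}
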
diff --git a/contrib/llvm/tools/llvm-readobj/llvm-readobj.h b/contrib/llvm/tools/llvm-readobj/llvm-readobj.h
index 840ddbabdc59..374ffd03e13a 100644
--- a/contrib/llvm/tools/llvm-readobj/llvm-readobj.h
+++ b/contrib/llvm/tools/llvm-readobj/llvm-readobj.h
@@ -60,6 +60,7 @@ namespace opts {
extern llvm::cl::opt<bool> DynamicSymbols;
extern llvm::cl::opt<bool> UnwindInfo;
extern llvm::cl::opt<bool> ExpandRelocs;
+ extern llvm::cl::opt<bool> RawRelr;
extern llvm::cl::opt<bool> CodeView;
extern llvm::cl::opt<bool> CodeViewSubsectionBytes;
extern llvm::cl::opt<bool> ARMAttributes;
diff --git a/contrib/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp b/contrib/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp
index b09594622ca9..54db1ec113fc 100644
--- a/contrib/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/contrib/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -27,11 +27,9 @@
#include "llvm/Object/SymbolSize.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Memory.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
@@ -42,7 +40,7 @@ using namespace llvm::object;
static cl::list<std::string>
InputFileList(cl::Positional, cl::ZeroOrMore,
- cl::desc("<input file>"));
+ cl::desc("<input files>"));
enum ActionType {
AC_Execute,
@@ -736,11 +734,8 @@ static int linkAndVerify() {
}
int main(int argc, char **argv) {
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
-
+ InitLLVM X(argc, argv);
ProgramName = argv[0];
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
llvm::InitializeAllTargetInfos();
llvm::InitializeAllTargetMCs();
diff --git a/contrib/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/contrib/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
index b51ec513f23b..6d40a5403504 100644
--- a/contrib/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ b/contrib/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -22,10 +22,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdio>
#include <cstring>
@@ -103,24 +101,19 @@ static bool error(Expected<T> &ResOrErr) {
static bool parseCommand(StringRef InputString, bool &IsData,
std::string &ModuleName, uint64_t &ModuleOffset) {
- const char *kDataCmd = "DATA ";
- const char *kCodeCmd = "CODE ";
const char kDelimiters[] = " \n\r";
- IsData = false;
ModuleName = "";
- const char *pos = InputString.data();
- if (strncmp(pos, kDataCmd, strlen(kDataCmd)) == 0) {
- IsData = true;
- pos += strlen(kDataCmd);
- } else if (strncmp(pos, kCodeCmd, strlen(kCodeCmd)) == 0) {
+ if (InputString.consume_front("CODE ")) {
IsData = false;
- pos += strlen(kCodeCmd);
+ } else if (InputString.consume_front("DATA ")) {
+ IsData = true;
} else {
// If no cmd, assume it's CODE.
IsData = false;
}
+ const char *pos = InputString.data();
// Skip delimiters and parse input filename (if needed).
- if (ClBinaryName == "") {
+ if (ClBinaryName.empty()) {
pos += strspn(pos, kDelimiters);
if (*pos == '"' || *pos == '\'') {
char quote = *pos;
@@ -145,10 +138,7 @@ static bool parseCommand(StringRef InputString, bool &IsData,
}
int main(int argc, char **argv) {
- // Print stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ InitLLVM X(argc, argv);
llvm::sys::InitializeCOMRAII COM(llvm::sys::COMThreadingMode::MultiThreaded);
@@ -188,7 +178,7 @@ int main(int argc, char **argv) {
if (ClPrintAddress) {
outs() << "0x";
outs().write_hex(ModuleOffset);
- StringRef Delimiter = (ClPrettyPrint == true) ? ": " : "\n";
+ StringRef Delimiter = ClPrettyPrint ? ": " : "\n";
outs() << Delimiter;
}
if (IsData) {
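The symbolizer change replaces manual strncmp/pointer arithmetic with StringRef::consume_front, which tests for a prefix and strips it in one step. A sketch of the same parsing written against std::string_view, with a hypothetical consumeFront helper standing in for the StringRef method:

#include <string_view>

// Strip Prefix from the front of S if present; return whether it matched.
static bool consumeFront(std::string_view &S, std::string_view Prefix) {
  if (S.substr(0, Prefix.size()) != Prefix)
    return false;
  S.remove_prefix(Prefix.size());
  return true;
}

// Classify an input line as a CODE or DATA query, defaulting to CODE.
static void classifyCommand(std::string_view &Input, bool &IsData) {
  if (consumeFront(Input, "CODE "))
    IsData = false;
  else if (consumeFront(Input, "DATA "))
    IsData = true;
  else
    IsData = false;  // no explicit command: assume CODE
}
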
diff --git a/contrib/llvm/tools/llvm-xray/func-id-helper.cc b/contrib/llvm/tools/llvm-xray/func-id-helper.cpp
index 3234010695b2..c2bef6ddfb39 100644
--- a/contrib/llvm/tools/llvm-xray/func-id-helper.cc
+++ b/contrib/llvm/tools/llvm-xray/func-id-helper.cpp
@@ -1,4 +1,4 @@
-//===- xray-fc-account.cc - XRay Function Call Accounting Tool ------------===//
+//===- xray-fc-account.cpp: XRay Function Call Accounting Tool ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -19,6 +19,10 @@ using namespace llvm;
using namespace xray;
std::string FuncIdConversionHelper::SymbolOrNumber(int32_t FuncId) const {
+ auto CacheIt = CachedNames.find(FuncId);
+ if (CacheIt != CachedNames.end())
+ return CacheIt->second;
+
std::ostringstream F;
auto It = FunctionAddresses.find(FuncId);
if (It == FunctionAddresses.end()) {
@@ -37,7 +41,9 @@ std::string FuncIdConversionHelper::SymbolOrNumber(int32_t FuncId) const {
F << "@(" << std::hex << It->second << ")";
});
- return F.str();
+ auto S = F.str();
+ CachedNames[FuncId] = S;
+ return S;
}
std::string FuncIdConversionHelper::FileLineAndColumn(int32_t FuncId) const {
diff --git a/contrib/llvm/tools/llvm-xray/func-id-helper.h b/contrib/llvm/tools/llvm-xray/func-id-helper.h
index 7348a7100b05..3e0780d54f90 100644
--- a/contrib/llvm/tools/llvm-xray/func-id-helper.h
+++ b/contrib/llvm/tools/llvm-xray/func-id-helper.h
@@ -13,6 +13,7 @@
#ifndef LLVM_TOOLS_LLVM_XRAY_FUNC_ID_HELPER_H
#define LLVM_TOOLS_LLVM_XRAY_FUNC_ID_HELPER_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include <unordered_map>
@@ -28,6 +29,7 @@ private:
std::string BinaryInstrMap;
symbolize::LLVMSymbolizer &Symbolizer;
const FunctionAddressMap &FunctionAddresses;
+ mutable llvm::DenseMap<int32_t, std::string> CachedNames;
public:
FuncIdConversionHelper(std::string BinaryInstrMap,
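The CachedNames member added above is the classic memoization-in-a-const-method pattern: a mutable map lets a logically-const lookup remember its expensive symbolization result. A self-contained sketch of the same idea, with std::map standing in for llvm::DenseMap and a placeholder in place of the real symbolizer call:

#include <cstdint>
#include <map>
#include <string>

class NameCache {
  mutable std::map<int32_t, std::string> Cached;

  static std::string expensiveLookup(int32_t FuncId) {
    return "fn" + std::to_string(FuncId);  // placeholder for symbolization
  }

public:
  std::string nameFor(int32_t FuncId) const {
    auto It = Cached.find(FuncId);
    if (It != Cached.end())
      return It->second;          // cache hit: skip the expensive lookup
    std::string Name = expensiveLookup(FuncId);
    Cached[FuncId] = Name;        // mutable member is writable in const method
    return Name;
  }
};
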
diff --git a/contrib/llvm/tools/llvm-xray/llvm-xray.cc b/contrib/llvm/tools/llvm-xray/llvm-xray.cpp
index 17cc9f90dd71..e74628f5025f 100644
--- a/contrib/llvm/tools/llvm-xray/llvm-xray.cc
+++ b/contrib/llvm/tools/llvm-xray/llvm-xray.cpp
@@ -1,4 +1,4 @@
-//===- llvm-xray.cc - XRay Tool Main Program ------------------------------===//
+//===- llvm-xray.cpp: XRay Tool Main Program ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/llvm-xray/xray-account.cc b/contrib/llvm/tools/llvm-xray/xray-account.cpp
index 7b684aad693d..2776a8888481 100644
--- a/contrib/llvm/tools/llvm-xray/xray-account.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-account.cpp
@@ -237,16 +237,19 @@ ResultRow getStats(std::vector<uint64_t> &Timings) {
auto MinMax = std::minmax_element(Timings.begin(), Timings.end());
R.Min = *MinMax.first;
R.Max = *MinMax.second;
+ R.Count = Timings.size();
+
auto MedianOff = Timings.size() / 2;
std::nth_element(Timings.begin(), Timings.begin() + MedianOff, Timings.end());
R.Median = Timings[MedianOff];
+
auto Pct90Off = std::floor(Timings.size() * 0.9);
std::nth_element(Timings.begin(), Timings.begin() + Pct90Off, Timings.end());
R.Pct90 = Timings[Pct90Off];
+
auto Pct99Off = std::floor(Timings.size() * 0.99);
- std::nth_element(Timings.begin(), Timings.begin() + Pct90Off, Timings.end());
+ std::nth_element(Timings.begin(), Timings.begin() + Pct99Off, Timings.end());
R.Pct99 = Timings[Pct99Off];
- R.Count = Timings.size();
return R;
}
@@ -279,79 +282,79 @@ void LatencyAccountant::exportStats(const XRayFileHeader &Header, F Fn) const {
// Sort the data according to user-provided flags.
switch (AccountSortOutput) {
case SortField::FUNCID:
- std::sort(Results.begin(), Results.end(),
- [](const TupleType &L, const TupleType &R) {
- if (AccountSortOrder == SortDirection::ASCENDING)
- return std::get<0>(L) < std::get<0>(R);
- if (AccountSortOrder == SortDirection::DESCENDING)
- return std::get<0>(L) > std::get<0>(R);
- llvm_unreachable("Unknown sort direction");
- });
+ llvm::sort(Results.begin(), Results.end(),
+ [](const TupleType &L, const TupleType &R) {
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return std::get<0>(L) < std::get<0>(R);
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return std::get<0>(L) > std::get<0>(R);
+ llvm_unreachable("Unknown sort direction");
+ });
break;
case SortField::COUNT:
- std::sort(Results.begin(), Results.end(),
- [](const TupleType &L, const TupleType &R) {
- if (AccountSortOrder == SortDirection::ASCENDING)
- return std::get<1>(L) < std::get<1>(R);
- if (AccountSortOrder == SortDirection::DESCENDING)
- return std::get<1>(L) > std::get<1>(R);
- llvm_unreachable("Unknown sort direction");
- });
+ llvm::sort(Results.begin(), Results.end(),
+ [](const TupleType &L, const TupleType &R) {
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return std::get<1>(L) < std::get<1>(R);
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return std::get<1>(L) > std::get<1>(R);
+ llvm_unreachable("Unknown sort direction");
+ });
break;
default:
// Here we need to look into the ResultRow for the rest of the data that
// we want to sort by.
- std::sort(Results.begin(), Results.end(),
- [&](const TupleType &L, const TupleType &R) {
- auto &LR = std::get<2>(L);
- auto &RR = std::get<2>(R);
- switch (AccountSortOutput) {
- case SortField::COUNT:
- if (AccountSortOrder == SortDirection::ASCENDING)
- return LR.Count < RR.Count;
- if (AccountSortOrder == SortDirection::DESCENDING)
- return LR.Count > RR.Count;
- llvm_unreachable("Unknown sort direction");
- case SortField::MIN:
- if (AccountSortOrder == SortDirection::ASCENDING)
- return LR.Min < RR.Min;
- if (AccountSortOrder == SortDirection::DESCENDING)
- return LR.Min > RR.Min;
- llvm_unreachable("Unknown sort direction");
- case SortField::MED:
- if (AccountSortOrder == SortDirection::ASCENDING)
- return LR.Median < RR.Median;
- if (AccountSortOrder == SortDirection::DESCENDING)
- return LR.Median > RR.Median;
- llvm_unreachable("Unknown sort direction");
- case SortField::PCT90:
- if (AccountSortOrder == SortDirection::ASCENDING)
- return LR.Pct90 < RR.Pct90;
- if (AccountSortOrder == SortDirection::DESCENDING)
- return LR.Pct90 > RR.Pct90;
- llvm_unreachable("Unknown sort direction");
- case SortField::PCT99:
- if (AccountSortOrder == SortDirection::ASCENDING)
- return LR.Pct99 < RR.Pct99;
- if (AccountSortOrder == SortDirection::DESCENDING)
- return LR.Pct99 > RR.Pct99;
- llvm_unreachable("Unknown sort direction");
- case SortField::MAX:
- if (AccountSortOrder == SortDirection::ASCENDING)
- return LR.Max < RR.Max;
- if (AccountSortOrder == SortDirection::DESCENDING)
- return LR.Max > RR.Max;
- llvm_unreachable("Unknown sort direction");
- case SortField::SUM:
- if (AccountSortOrder == SortDirection::ASCENDING)
- return LR.Sum < RR.Sum;
- if (AccountSortOrder == SortDirection::DESCENDING)
- return LR.Sum > RR.Sum;
- llvm_unreachable("Unknown sort direction");
- default:
- llvm_unreachable("Unsupported sort order");
- }
- });
+ llvm::sort(Results.begin(), Results.end(),
+ [&](const TupleType &L, const TupleType &R) {
+ auto &LR = std::get<2>(L);
+ auto &RR = std::get<2>(R);
+ switch (AccountSortOutput) {
+ case SortField::COUNT:
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return LR.Count < RR.Count;
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return LR.Count > RR.Count;
+ llvm_unreachable("Unknown sort direction");
+ case SortField::MIN:
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return LR.Min < RR.Min;
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return LR.Min > RR.Min;
+ llvm_unreachable("Unknown sort direction");
+ case SortField::MED:
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return LR.Median < RR.Median;
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return LR.Median > RR.Median;
+ llvm_unreachable("Unknown sort direction");
+ case SortField::PCT90:
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return LR.Pct90 < RR.Pct90;
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return LR.Pct90 > RR.Pct90;
+ llvm_unreachable("Unknown sort direction");
+ case SortField::PCT99:
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return LR.Pct99 < RR.Pct99;
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return LR.Pct99 > RR.Pct99;
+ llvm_unreachable("Unknown sort direction");
+ case SortField::MAX:
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return LR.Max < RR.Max;
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return LR.Max > RR.Max;
+ llvm_unreachable("Unknown sort direction");
+ case SortField::SUM:
+ if (AccountSortOrder == SortDirection::ASCENDING)
+ return LR.Sum < RR.Sum;
+ if (AccountSortOrder == SortDirection::DESCENDING)
+ return LR.Sum > RR.Sum;
+ llvm_unreachable("Unknown sort direction");
+ default:
+ llvm_unreachable("Unsupported sort order");
+ }
+ });
break;
}
@@ -473,9 +476,9 @@ static CommandRegistration Unused(&Account, []() -> Error {
errs()
<< "Error processing record: "
<< llvm::formatv(
- R"({{type: {0}; cpu: {1}; record-type: {2}; function-id: {3}; tsc: {4}; thread-id: {5}}})",
+ R"({{type: {0}; cpu: {1}; record-type: {2}; function-id: {3}; tsc: {4}; thread-id: {5}; process-id: {6}}})",
Record.RecordType, Record.CPU, Record.Type, Record.FuncId,
- Record.TId)
+ Record.TSC, Record.TId, Record.PId)
<< '\n';
for (const auto &ThreadStack : FCA.getPerThreadFunctionStack()) {
errs() << "Thread ID: " << ThreadStack.first << "\n";
diff --git a/contrib/llvm/tools/llvm-xray/xray-account.h b/contrib/llvm/tools/llvm-xray/xray-account.h
index cc9ba897e537..5c457f178166 100644
--- a/contrib/llvm/tools/llvm-xray/xray-account.h
+++ b/contrib/llvm/tools/llvm-xray/xray-account.h
@@ -29,12 +29,11 @@ namespace xray {
class LatencyAccountant {
public:
typedef std::map<int32_t, std::vector<uint64_t>> FunctionLatencyMap;
- typedef std::map<llvm::sys::ProcessInfo::ProcessId,
- std::pair<uint64_t, uint64_t>>
+ typedef std::map<llvm::sys::procid_t, std::pair<uint64_t, uint64_t>>
PerThreadMinMaxTSCMap;
typedef std::map<uint8_t, std::pair<uint64_t, uint64_t>> PerCPUMinMaxTSCMap;
typedef std::vector<std::pair<int32_t, uint64_t>> FunctionStack;
- typedef std::map<llvm::sys::ProcessInfo::ProcessId, FunctionStack>
+ typedef std::map<llvm::sys::procid_t, FunctionStack>
PerThreadFunctionStackMap;
private:
@@ -79,8 +78,7 @@ public:
///
bool accountRecord(const XRayRecord &Record);
- const FunctionStack *
- getThreadFunctionStack(llvm::sys::ProcessInfo::ProcessId TId) const {
+ const FunctionStack *getThreadFunctionStack(llvm::sys::procid_t TId) const {
auto I = PerThreadFunctionStack.find(TId);
if (I == PerThreadFunctionStack.end())
return nullptr;
diff --git a/contrib/llvm/tools/llvm-xray/xray-color-helper.cc b/contrib/llvm/tools/llvm-xray/xray-color-helper.cpp
index 61314d3c766a..78a264b73d8f 100644
--- a/contrib/llvm/tools/llvm-xray/xray-color-helper.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-color-helper.cpp
@@ -1,4 +1,4 @@
-//===-- xray-graph.cc - XRay Function Call Graph Renderer -----------------===//
+//===-- xray-graph.cpp: XRay Function Call Graph Renderer -----------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/llvm-xray/xray-converter.cc b/contrib/llvm/tools/llvm-xray/xray-converter.cpp
index aa0da55207b3..90e14d0d8896 100644
--- a/contrib/llvm/tools/llvm-xray/xray-converter.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-converter.cpp
@@ -1,4 +1,4 @@
-//===- xray-converter.cc - XRay Trace Conversion --------------------------===//
+//===- xray-converter.cpp: XRay Trace Conversion --------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -91,7 +91,7 @@ void TraceConverter::exportAsYAML(const Trace &Records, raw_ostream &OS) {
Trace.Records.push_back({R.RecordType, R.CPU, R.Type, R.FuncId,
Symbolize ? FuncIdHelper.SymbolOrNumber(R.FuncId)
: llvm::to_string(R.FuncId),
- R.TSC, R.TId, R.CallArgs});
+ R.TSC, R.TId, R.PId, R.CallArgs});
}
Output Out(OS, nullptr, 0);
Out << Trace;
@@ -100,7 +100,7 @@ void TraceConverter::exportAsYAML(const Trace &Records, raw_ostream &OS) {
void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) {
  // First write out the file header in the correct endian-appropriate format
  // (XRay currently assumes little endian).
- support::endian::Writer<support::endianness::little> Writer(OS);
+ support::endian::Writer Writer(OS, support::endianness::little);
const auto &FH = Records.getFileHeader();
Writer.write(FH.Version);
Writer.write(FH.Type);
@@ -141,7 +141,12 @@ void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) {
Writer.write(R.FuncId);
Writer.write(R.TSC);
Writer.write(R.TId);
- Writer.write(Padding4B);
+
+ if (FH.Version >= 3)
+ Writer.write(R.PId);
+ else
+ Writer.write(Padding4B);
+
Writer.write(Padding4B);
Writer.write(Padding4B);
}
@@ -229,19 +234,29 @@ StackTrieNode *findOrCreateStackNode(
return CurrentStack;
}
-void writeTraceViewerRecord(raw_ostream &OS, int32_t FuncId, uint32_t TId,
- bool Symbolize,
+void writeTraceViewerRecord(uint16_t Version, raw_ostream &OS, int32_t FuncId,
+ uint32_t TId, uint32_t PId, bool Symbolize,
const FuncIdConversionHelper &FuncIdHelper,
double EventTimestampUs,
const StackTrieNode &StackCursor,
StringRef FunctionPhenotype) {
OS << " ";
- OS << llvm::formatv(
- R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "1", )"
- R"("ts" : "{3:f3}", "sf" : "{4}" })",
- (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId)
- : llvm::to_string(FuncId)),
- FunctionPhenotype, TId, EventTimestampUs, StackCursor.ExtraData.id);
+ if (Version >= 3) {
+ OS << llvm::formatv(
+ R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "{3}", )"
+ R"("ts" : "{4:f4}", "sf" : "{5}" })",
+ (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId)
+ : llvm::to_string(FuncId)),
+ FunctionPhenotype, TId, PId, EventTimestampUs,
+ StackCursor.ExtraData.id);
+ } else {
+ OS << llvm::formatv(
+ R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "1", )"
+ R"("ts" : "{3:f3}", "sf" : "{4}" })",
+ (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId)
+ : llvm::to_string(FuncId)),
+ FunctionPhenotype, TId, EventTimestampUs, StackCursor.ExtraData.id);
+ }
}
} // namespace
@@ -249,6 +264,7 @@ void writeTraceViewerRecord(raw_ostream &OS, int32_t FuncId, uint32_t TId,
void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
raw_ostream &OS) {
const auto &FH = Records.getFileHeader();
+ auto Version = FH.Version;
auto CycleFreq = FH.CycleFrequency;
unsigned id_counter = 0;
@@ -282,11 +298,11 @@ void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
StackRootsByThreadId, StacksByStackId,
&id_counter, NodeStore);
// Each record is represented as a json dictionary with function name,
- // type of B for begin or E for end, thread id, process id (faked),
+ // type of B for begin or E for end, thread id, process id,
// timestamp in microseconds, and a stack frame id. The ids are logged
// in an id dictionary after the events.
- writeTraceViewerRecord(OS, R.FuncId, R.TId, Symbolize, FuncIdHelper,
- EventTimestampUs, *StackCursor, "B");
+ writeTraceViewerRecord(Version, OS, R.FuncId, R.TId, R.PId, Symbolize,
+ FuncIdHelper, EventTimestampUs, *StackCursor, "B");
break;
case RecordTypes::EXIT:
case RecordTypes::TAIL_EXIT:
@@ -297,9 +313,12 @@ void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
// (And/Or in loop termination below)
StackTrieNode *PreviousCursor = nullptr;
do {
- writeTraceViewerRecord(OS, StackCursor->FuncId, R.TId, Symbolize,
- FuncIdHelper, EventTimestampUs, *StackCursor,
- "E");
+ if (PreviousCursor != nullptr) {
+ OS << ",\n";
+ }
+ writeTraceViewerRecord(Version, OS, StackCursor->FuncId, R.TId, R.PId,
+ Symbolize, FuncIdHelper, EventTimestampUs,
+ *StackCursor, "E");
PreviousCursor = StackCursor;
StackCursor = StackCursor->Parent;
} while (PreviousCursor->FuncId != R.FuncId && StackCursor != nullptr);
diff --git a/contrib/llvm/tools/llvm-xray/xray-extract.cc b/contrib/llvm/tools/llvm-xray/xray-extract.cpp
index cd87798d0e60..10fe7d8d6209 100644
--- a/contrib/llvm/tools/llvm-xray/xray-extract.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-extract.cpp
@@ -1,4 +1,4 @@
-//===- xray-extract.cc - XRay Instrumentation Map Extraction --------------===//
+//===- xray-extract.cpp: XRay Instrumentation Map Extraction --------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/llvm-xray/xray-graph-diff.cc b/contrib/llvm/tools/llvm-xray/xray-graph-diff.cpp
index 3c69b3fb0751..a22f2a99811d 100644
--- a/contrib/llvm/tools/llvm-xray/xray-graph-diff.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-graph-diff.cpp
@@ -1,4 +1,4 @@
-//===-- xray-graph-diff.cc - XRay Function Call Graph Renderer ------------===//
+//===-- xray-graph-diff.cpp: XRay Function Call Graph Renderer ------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/llvm-xray/xray-graph.cc b/contrib/llvm/tools/llvm-xray/xray-graph.cpp
index feb676331f89..c619bf86299b 100644
--- a/contrib/llvm/tools/llvm-xray/xray-graph.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-graph.cpp
@@ -1,4 +1,4 @@
-//===-- xray-graph.cc - XRay Function Call Graph Renderer -----------------===//
+//===-- xray-graph.cpp: XRay Function Call Graph Renderer -----------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/llvm-xray/xray-graph.h b/contrib/llvm/tools/llvm-xray/xray-graph.h
index a43df265d0e1..fc7f8bb470f2 100644
--- a/contrib/llvm/tools/llvm-xray/xray-graph.h
+++ b/contrib/llvm/tools/llvm-xray/xray-graph.h
@@ -80,7 +80,7 @@ public:
using FunctionStack = SmallVector<FunctionAttr, 4>;
using PerThreadFunctionStackMap =
- DenseMap<llvm::sys::ProcessInfo::ProcessId, FunctionStack>;
+ DenseMap<llvm::sys::procid_t, FunctionStack>;
class GraphT : public Graph<FunctionStats, CallStats, int32_t> {
public:
diff --git a/contrib/llvm/tools/llvm-xray/xray-registry.cc b/contrib/llvm/tools/llvm-xray/xray-registry.cpp
index 36d3a2e58f97..fe58e4deaa1e 100644
--- a/contrib/llvm/tools/llvm-xray/xray-registry.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-registry.cpp
@@ -1,4 +1,4 @@
-//===- xray-registry.cc - Implement a command registry. -------------------===//
+//===- xray-registry.cpp: Implement a command registry. -------------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/llvm-xray/xray-stacks.cc b/contrib/llvm/tools/llvm-xray/xray-stacks.cpp
index 9474de047990..1a6069780a31 100644
--- a/contrib/llvm/tools/llvm-xray/xray-stacks.cc
+++ b/contrib/llvm/tools/llvm-xray/xray-stacks.cpp
@@ -1,4 +1,4 @@
-//===- xray-stacks.cc - XRay Function Call Stack Accounting ---------------===//
+//===- xray-stacks.cpp: XRay Function Call Stack Accounting ---------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/contrib/llvm/tools/opt/BreakpointPrinter.cpp b/contrib/llvm/tools/opt/BreakpointPrinter.cpp
index e5614ed061e3..d3f54c034f55 100644
--- a/contrib/llvm/tools/opt/BreakpointPrinter.cpp
+++ b/contrib/llvm/tools/opt/BreakpointPrinter.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Breakpoint location printer.
+/// Breakpoint location printer.
///
//===----------------------------------------------------------------------===//
#include "BreakpointPrinter.h"
diff --git a/contrib/llvm/tools/opt/BreakpointPrinter.h b/contrib/llvm/tools/opt/BreakpointPrinter.h
index 81c88e19199e..57670e5ee8d8 100644
--- a/contrib/llvm/tools/opt/BreakpointPrinter.h
+++ b/contrib/llvm/tools/opt/BreakpointPrinter.h
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Breakpoint location printer.
+/// Breakpoint location printer.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_OPT_BREAKPOINTPRINTER_H
diff --git a/contrib/llvm/tools/opt/Debugify.cpp b/contrib/llvm/tools/opt/Debugify.cpp
index 40ee545c098d..6c3cdc75e334 100644
--- a/contrib/llvm/tools/opt/Debugify.cpp
+++ b/contrib/llvm/tools/opt/Debugify.cpp
@@ -12,6 +12,7 @@
///
//===----------------------------------------------------------------------===//
+#include "Debugify.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/BasicBlock.h"
@@ -34,10 +35,37 @@ using namespace llvm;
namespace {
-bool applyDebugifyMetadata(Module &M) {
+cl::opt<bool> Quiet("debugify-quiet",
+ cl::desc("Suppress verbose debugify output"));
+
+raw_ostream &dbg() { return Quiet ? nulls() : errs(); }
+
+uint64_t getAllocSizeInBits(Module &M, Type *Ty) {
+ return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0;
+}
+
+bool isFunctionSkipped(Function &F) {
+ return F.isDeclaration() || !F.hasExactDefinition();
+}
+
+/// Find the basic block's terminating instruction.
+///
+/// Special care is needed to handle musttail and deopt calls, as these behave
+/// like (but are in fact not) terminators.
+Instruction *findTerminatingInstruction(BasicBlock &BB) {
+ if (auto *I = BB.getTerminatingMustTailCall())
+ return I;
+ if (auto *I = BB.getTerminatingDeoptimizeCall())
+ return I;
+ return BB.getTerminator();
+}
+
+bool applyDebugifyMetadata(Module &M,
+ iterator_range<Module::iterator> Functions,
+ StringRef Banner) {
// Skip modules with debug info.
if (M.getNamedMetadata("llvm.dbg.cu")) {
- errs() << "Debugify: Skipping module with debug info\n";
+ dbg() << Banner << "Skipping module with debug info\n";
return false;
}
@@ -47,7 +75,7 @@ bool applyDebugifyMetadata(Module &M) {
// Get a DIType which corresponds to Ty.
DenseMap<uint64_t, DIType *> TypeCache;
auto getCachedDIType = [&](Type *Ty) -> DIType * {
- uint64_t Size = M.getDataLayout().getTypeAllocSizeInBits(Ty);
+ uint64_t Size = getAllocSizeInBits(M, Ty);
DIType *&DTy = TypeCache[Size];
if (!DTy) {
std::string Name = "ty" + utostr(Size);
@@ -59,20 +87,19 @@ bool applyDebugifyMetadata(Module &M) {
unsigned NextLine = 1;
unsigned NextVar = 1;
auto File = DIB.createFile(M.getName(), "/");
- auto CU =
- DIB.createCompileUnit(dwarf::DW_LANG_C, DIB.createFile(M.getName(), "/"),
- "debugify", /*isOptimized=*/true, "", 0);
+ auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify",
+ /*isOptimized=*/true, "", 0);
// Visit each instruction.
- for (Function &F : M) {
- if (F.isDeclaration())
+ for (Function &F : Functions) {
+ if (isFunctionSkipped(F))
continue;
auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
bool IsLocalToUnit = F.hasPrivateLinkage() || F.hasInternalLinkage();
auto SP =
DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine, SPType,
- IsLocalToUnit, F.hasExactDefinition(), NextLine,
+ IsLocalToUnit, /*isDefinition=*/true, NextLine,
DINode::FlagZero, /*isOptimized=*/true);
F.setSubprogram(SP);
for (BasicBlock &BB : F) {
@@ -80,23 +107,39 @@ bool applyDebugifyMetadata(Module &M) {
for (Instruction &I : BB)
I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
+ // Inserting debug values into EH pads can break IR invariants.
+ if (BB.isEHPad())
+ continue;
+
+ // Find the terminating instruction, after which no debug values are
+ // attached.
+ Instruction *LastInst = findTerminatingInstruction(BB);
+ assert(LastInst && "Expected basic block with a terminator");
+
+ // Maintain an insertion point which can't be invalidated when updates
+ // are made.
+ BasicBlock::iterator InsertPt = BB.getFirstInsertionPt();
+ assert(InsertPt != BB.end() && "Expected to find an insertion point");
+ Instruction *InsertBefore = &*InsertPt;
+
// Attach debug values.
- for (Instruction &I : BB) {
+ for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) {
// Skip void-valued instructions.
- if (I.getType()->isVoidTy())
+ if (I->getType()->isVoidTy())
continue;
- // Skip the terminator instruction and any just-inserted intrinsics.
- if (isa<TerminatorInst>(&I) || isa<DbgValueInst>(&I))
- break;
+ // Phis and EH pads must be grouped at the beginning of the block.
+ // Only advance the insertion point when we finish visiting these.
+ if (!isa<PHINode>(I) && !I->isEHPad())
+ InsertBefore = I->getNextNode();
std::string Name = utostr(NextVar++);
- const DILocation *Loc = I.getDebugLoc().get();
+ const DILocation *Loc = I->getDebugLoc().get();
auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(),
- getCachedDIType(I.getType()),
+ getCachedDIType(I->getType()),
/*AlwaysPreserve=*/true);
- DIB.insertDbgValueIntrinsic(&I, LocalVar, DIB.createExpression(), Loc,
- BB.getTerminator());
+ DIB.insertDbgValueIntrinsic(I, LocalVar, DIB.createExpression(), Loc,
+ InsertBefore);
}
}
DIB.finalizeSubprogram(SP);
@@ -112,48 +155,110 @@ bool applyDebugifyMetadata(Module &M) {
};
addDebugifyOperand(NextLine - 1); // Original number of lines.
addDebugifyOperand(NextVar - 1); // Original number of variables.
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.debugify should have exactly 2 operands!");
+
+ // Claim that this synthetic debug info is valid.
+ StringRef DIVersionKey = "Debug Info Version";
+ if (!M.getModuleFlag(DIVersionKey))
+ M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION);
+
return true;
}
-void checkDebugifyMetadata(Module &M) {
+/// Return true if a mis-sized diagnostic is issued for \p DVI.
+bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) {
+ // The size of a dbg.value's value operand should match the size of the
+ // variable it corresponds to.
+ //
+ // TODO: This, along with a check for non-null value operands, should be
+ // promoted to verifier failures.
+ Value *V = DVI->getValue();
+ if (!V)
+ return false;
+
+ // For now, don't try to interpret anything more complicated than an empty
+ // DIExpression. Eventually we should try to handle OP_deref and fragments.
+ if (DVI->getExpression()->getNumElements())
+ return false;
+
+ Type *Ty = V->getType();
+ uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty);
+ Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits();
+ if (!ValueOperandSize || !DbgVarSize)
+ return false;
+
+ bool HasBadSize = false;
+ if (Ty->isIntegerTy()) {
+ auto Signedness = DVI->getVariable()->getSignedness();
+ if (Signedness && *Signedness == DIBasicType::Signedness::Signed)
+ HasBadSize = ValueOperandSize < *DbgVarSize;
+ } else {
+ HasBadSize = ValueOperandSize != *DbgVarSize;
+ }
+
+ if (HasBadSize) {
+ dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize
+ << ", but its variable has size " << *DbgVarSize << ": ";
+ DVI->print(dbg());
+ dbg() << "\n";
+ }
+ return HasBadSize;
+}
+
+bool checkDebugifyMetadata(Module &M,
+ iterator_range<Module::iterator> Functions,
+ StringRef NameOfWrappedPass, StringRef Banner,
+ bool Strip, DebugifyStatsMap *StatsMap) {
// Skip modules without debugify metadata.
NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify");
- if (!NMD)
- return;
+ if (!NMD) {
+ dbg() << Banner << "Skipping module without debugify metadata\n";
+ return false;
+ }
auto getDebugifyOperand = [&](unsigned Idx) -> unsigned {
return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0))
->getZExtValue();
};
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.debugify should have exactly 2 operands!");
unsigned OriginalNumLines = getDebugifyOperand(0);
unsigned OriginalNumVars = getDebugifyOperand(1);
bool HasErrors = false;
- // Find missing lines.
+ // Track debug info loss statistics if able.
+ DebugifyStatistics *Stats = nullptr;
+ if (StatsMap && !NameOfWrappedPass.empty())
+ Stats = &StatsMap->operator[](NameOfWrappedPass);
+
BitVector MissingLines{OriginalNumLines, true};
- for (Function &F : M) {
+ BitVector MissingVars{OriginalNumVars, true};
+ for (Function &F : Functions) {
+ if (isFunctionSkipped(F))
+ continue;
+
+ // Find missing lines.
for (Instruction &I : instructions(F)) {
if (isa<DbgValueInst>(&I))
continue;
auto DL = I.getDebugLoc();
- if (DL) {
+ if (DL && DL.getLine() != 0) {
MissingLines.reset(DL.getLine() - 1);
continue;
}
- outs() << "ERROR: Instruction with empty DebugLoc -- ";
- I.print(outs());
- outs() << "\n";
- HasErrors = true;
+ if (!DL) {
+ dbg() << "ERROR: Instruction with empty DebugLoc in function ";
+ dbg() << F.getName() << " --";
+ I.print(dbg());
+ dbg() << "\n";
+ HasErrors = true;
+ }
}
- }
- for (unsigned Idx : MissingLines.set_bits())
- outs() << "WARNING: Missing line " << Idx + 1 << "\n";
- // Find missing variables.
- BitVector MissingVars{OriginalNumVars, true};
- for (Function &F : M) {
+ // Find missing variables and mis-sized debug values.
for (Instruction &I : instructions(F)) {
auto *DVI = dyn_cast<DbgValueInst>(&I);
if (!DVI)
@@ -162,21 +267,70 @@ void checkDebugifyMetadata(Module &M) {
unsigned Var = ~0U;
(void)to_integer(DVI->getVariable()->getName(), Var, 10);
assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable");
- MissingVars.reset(Var - 1);
+ bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI);
+ if (!HasBadSize)
+ MissingVars.reset(Var - 1);
+ HasErrors |= HasBadSize;
}
}
+
+ // Print the results.
+ for (unsigned Idx : MissingLines.set_bits())
+ dbg() << "WARNING: Missing line " << Idx + 1 << "\n";
+
for (unsigned Idx : MissingVars.set_bits())
- outs() << "ERROR: Missing variable " << Idx + 1 << "\n";
- HasErrors |= MissingVars.count() > 0;
+ dbg() << "WARNING: Missing variable " << Idx + 1 << "\n";
+
+ // Update DI loss statistics.
+ if (Stats) {
+ Stats->NumDbgLocsExpected += OriginalNumLines;
+ Stats->NumDbgLocsMissing += MissingLines.count();
+ Stats->NumDbgValuesExpected += OriginalNumVars;
+ Stats->NumDbgValuesMissing += MissingVars.count();
+ }
+
+ dbg() << Banner;
+ if (!NameOfWrappedPass.empty())
+ dbg() << " [" << NameOfWrappedPass << "]";
+ dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n';
- outs() << "CheckDebugify: " << (HasErrors ? "FAIL" : "PASS") << "\n";
+ // Strip the Debugify Metadata if required.
+ if (Strip) {
+ StripDebugInfo(M);
+ M.eraseNamedMetadata(NMD);
+ return true;
+ }
+
+ return false;
}
-/// Attach synthetic debug info to everything.
-struct DebugifyPass : public ModulePass {
- bool runOnModule(Module &M) override { return applyDebugifyMetadata(M); }
+/// ModulePass for attaching synthetic debug info to everything, used with the
+/// legacy module pass manager.
+struct DebugifyModulePass : public ModulePass {
+ bool runOnModule(Module &M) override {
+ return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: ");
+ }
+
+ DebugifyModulePass() : ModulePass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+};
+
+/// FunctionPass for attaching synthetic debug info to instructions within a
+/// single function, used with the legacy module pass manager.
+struct DebugifyFunctionPass : public FunctionPass {
+ bool runOnFunction(Function &F) override {
+ Module &M = *F.getParent();
+ auto FuncIt = F.getIterator();
+ return applyDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)),
+ "FunctionDebugify: ");
+ }
- DebugifyPass() : ModulePass(ID) {}
+ DebugifyFunctionPass() : FunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
@@ -185,28 +339,125 @@ struct DebugifyPass : public ModulePass {
static char ID; // Pass identification.
};
-/// Check debug info inserted by -debugify for completeness.
-struct CheckDebugifyPass : public ModulePass {
+/// ModulePass for checking debug info inserted by -debugify, used with the
+/// legacy module pass manager.
+struct CheckDebugifyModulePass : public ModulePass {
bool runOnModule(Module &M) override {
- checkDebugifyMetadata(M);
- return false;
+ return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass,
+ "CheckModuleDebugify", Strip, StatsMap);
+ }
+
+ CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr)
+ : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
+ StatsMap(StatsMap) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+
+private:
+ bool Strip;
+ StringRef NameOfWrappedPass;
+ DebugifyStatsMap *StatsMap;
+};
+
+/// FunctionPass for checking debug info inserted by -debugify-function, used
+/// with the legacy module pass manager.
+struct CheckDebugifyFunctionPass : public FunctionPass {
+ bool runOnFunction(Function &F) override {
+ Module &M = *F.getParent();
+ auto FuncIt = F.getIterator();
+ return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)),
+ NameOfWrappedPass, "CheckFunctionDebugify",
+ Strip, StatsMap);
}
- CheckDebugifyPass() : ModulePass(ID) {}
+ CheckDebugifyFunctionPass(bool Strip = false,
+ StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr)
+ : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
+ StatsMap(StatsMap) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
static char ID; // Pass identification.
+
+private:
+ bool Strip;
+ StringRef NameOfWrappedPass;
+ DebugifyStatsMap *StatsMap;
};
} // end anonymous namespace
-char DebugifyPass::ID = 0;
-static RegisterPass<DebugifyPass> X("debugify",
- "Attach debug info to everything");
+void exportDebugifyStats(llvm::StringRef Path, const DebugifyStatsMap &Map) {
+ std::error_code EC;
+ raw_fd_ostream OS{Path, EC};
+ if (EC) {
+ errs() << "Could not open file: " << EC.message() << ", " << Path << '\n';
+ return;
+ }
+
+ OS << "Pass Name" << ',' << "# of missing debug values" << ','
+ << "# of missing locations" << ',' << "Missing/Expected value ratio" << ','
+ << "Missing/Expected location ratio" << '\n';
+ for (const auto &Entry : Map) {
+ StringRef Pass = Entry.first;
+ DebugifyStatistics Stats = Entry.second;
+
+ OS << Pass << ',' << Stats.NumDbgValuesMissing << ','
+ << Stats.NumDbgLocsMissing << ',' << Stats.getMissingValueRatio() << ','
+ << Stats.getEmptyLocationRatio() << '\n';
+ }
+}
+
+ModulePass *createDebugifyModulePass() { return new DebugifyModulePass(); }
+
+FunctionPass *createDebugifyFunctionPass() {
+ return new DebugifyFunctionPass();
+}
+
+PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) {
+ applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: ");
+ return PreservedAnalyses::all();
+}
+
+ModulePass *createCheckDebugifyModulePass(bool Strip,
+ StringRef NameOfWrappedPass,
+ DebugifyStatsMap *StatsMap) {
+ return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap);
+}
+
+FunctionPass *createCheckDebugifyFunctionPass(bool Strip,
+ StringRef NameOfWrappedPass,
+ DebugifyStatsMap *StatsMap) {
+ return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap);
+}
+
+PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false,
+ nullptr);
+ return PreservedAnalyses::all();
+}
+
+char DebugifyModulePass::ID = 0;
+static RegisterPass<DebugifyModulePass> DM("debugify",
+ "Attach debug info to everything");
+
+char CheckDebugifyModulePass::ID = 0;
+static RegisterPass<CheckDebugifyModulePass>
+ CDM("check-debugify", "Check debug info from -debugify");
+
+char DebugifyFunctionPass::ID = 0;
+static RegisterPass<DebugifyFunctionPass> DF("debugify-function",
+ "Attach debug info to a function");
-char CheckDebugifyPass::ID = 0;
-static RegisterPass<CheckDebugifyPass> Y("check-debugify",
- "Check debug info from -debugify");
+char CheckDebugifyFunctionPass::ID = 0;
+static RegisterPass<CheckDebugifyFunctionPass>
+ CDF("check-debugify-function", "Check debug info from -debugify-function");
diff --git a/contrib/llvm/tools/opt/Debugify.h b/contrib/llvm/tools/opt/Debugify.h
new file mode 100644
index 000000000000..d1a60c73e723
--- /dev/null
+++ b/contrib/llvm/tools/opt/Debugify.h
@@ -0,0 +1,75 @@
+//===- Debugify.h - Attach synthetic debug info to everything -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file Interface to the `debugify` synthetic debug info testing utility.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OPT_DEBUGIFY_H
+#define LLVM_TOOLS_OPT_DEBUGIFY_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+
+llvm::ModulePass *createDebugifyModulePass();
+llvm::FunctionPass *createDebugifyFunctionPass();
+
+struct NewPMDebugifyPass : public llvm::PassInfoMixin<NewPMDebugifyPass> {
+ llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+};
+
+/// Track how much `debugify` information has been lost.
+struct DebugifyStatistics {
+ /// Number of missing dbg.values.
+ unsigned NumDbgValuesMissing = 0;
+
+ /// Number of dbg.values expected.
+ unsigned NumDbgValuesExpected = 0;
+
+ /// Number of instructions with empty debug locations.
+ unsigned NumDbgLocsMissing = 0;
+
+ /// Number of instructions expected to have debug locations.
+ unsigned NumDbgLocsExpected = 0;
+
+ /// Get the ratio of missing/expected dbg.values.
+ float getMissingValueRatio() const {
+ return float(NumDbgValuesMissing) / float(NumDbgValuesExpected);
+ }
+
+ /// Get the ratio of missing/expected instructions with locations.
+ float getEmptyLocationRatio() const {
+ return float(NumDbgLocsMissing) / float(NumDbgLocsExpected);
+ }
+};
+
+/// Map pass names to a per-pass DebugifyStatistics instance.
+using DebugifyStatsMap = llvm::MapVector<llvm::StringRef, DebugifyStatistics>;
+
+/// Export per-pass debugify statistics to the file specified by \p Path.
+void exportDebugifyStats(llvm::StringRef Path, const DebugifyStatsMap &Map);
+
+llvm::ModulePass *
+createCheckDebugifyModulePass(bool Strip = false,
+ llvm::StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr);
+
+llvm::FunctionPass *
+createCheckDebugifyFunctionPass(bool Strip = false,
+ llvm::StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr);
+
+struct NewPMCheckDebugifyPass
+ : public llvm::PassInfoMixin<NewPMCheckDebugifyPass> {
+ llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+};
+
+#endif // LLVM_TOOLS_OPT_DEBUGIFY_H
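Note (not part of the patch): a minimal sketch of how the interface declared above is intended to be driven from a tool, mirroring what the opt.cpp changes later in this diff do. The helper name, the pass-under-test variable, and the "debugify-stats.csv" path are hypothetical, for illustration only.

// Sketch only: wrap one legacy pass with debugify / check-debugify and export stats.
#include "Debugify.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"

static void runWithDebugify(llvm::Module &M, llvm::Pass *PassUnderTest) {
  DebugifyStatsMap StatsMap;
  llvm::legacy::PassManager PM;
  PM.add(createDebugifyModulePass());                 // attach synthetic debug info
  PM.add(PassUnderTest);                              // the pass being checked
  PM.add(createCheckDebugifyModulePass(/*Strip=*/true,
                                       PassUnderTest->getPassName(), &StatsMap));
  PM.run(M);
  exportDebugifyStats("debugify-stats.csv", StatsMap); // hypothetical output path
}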
diff --git a/contrib/llvm/tools/opt/NewPMDriver.cpp b/contrib/llvm/tools/opt/NewPMDriver.cpp
index a3f16f2538c4..a91d4cb5f9cd 100644
--- a/contrib/llvm/tools/opt/NewPMDriver.cpp
+++ b/contrib/llvm/tools/opt/NewPMDriver.cpp
@@ -13,12 +13,14 @@
///
//===----------------------------------------------------------------------===//
+#include "Debugify.h"
#include "NewPMDriver.h"
+#include "PassPrinters.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
@@ -26,6 +28,7 @@
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ToolOutputFile.h"
@@ -40,6 +43,10 @@ static cl::opt<bool>
DebugPM("debug-pass-manager", cl::Hidden,
cl::desc("Print pass management debugging information"));
+static cl::list<std::string>
+ PassPlugins("load-pass-plugin",
+ cl::desc("Load passes from plugin library"));
+
// This flag specifies a textual description of the alias analysis pipeline to
// use when querying for aliasing information. It only works in concert with
// the "passes" flag above.
@@ -82,6 +89,11 @@ static cl::opt<std::string> VectorizerStartEPPipeline(
cl::desc("A textual description of the function pass pipeline inserted at "
"the VectorizerStart extension point into default pipelines"),
cl::Hidden);
+static cl::opt<std::string> PipelineStartEPPipeline(
+ "passes-ep-pipeline-start",
+ cl::desc("A textual description of the function pass pipeline inserted at "
+ "the PipelineStart extension point into default pipelines"),
+ cl::Hidden);
enum PGOKind { NoPGO, InstrGen, InstrUse, SampleUse };
static cl::opt<PGOKind> PGOKindFlag(
"pgo-kind", cl::init(NoPGO), cl::Hidden,
@@ -159,6 +171,12 @@ static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
PB.parsePassPipeline(PM, VectorizerStartEPPipeline, VerifyEachPass,
DebugLogging);
});
+ if (tryParsePipelineText<ModulePassManager>(PB, PipelineStartEPPipeline))
+ PB.registerPipelineStartEPCallback(
+ [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM) {
+ PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
+ DebugLogging);
+ });
}
#ifdef LINK_POLLY_INTO_TOOLS
@@ -174,7 +192,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
VerifierKind VK,
bool ShouldPreserveAssemblyUseListOrder,
bool ShouldPreserveBitcodeUseListOrder,
- bool EmitSummaryIndex, bool EmitModuleHash) {
+ bool EmitSummaryIndex, bool EmitModuleHash,
+ bool EnableDebugify) {
bool VerifyEachPass = VK == VK_VerifyEachPass;
Optional<PGOOptions> P;
@@ -197,6 +216,32 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
PassBuilder PB(TM, P);
registerEPCallbacks(PB, VerifyEachPass, DebugPM);
+ // Load requested pass plugins and let them register pass builder callbacks
+ for (auto &PluginFN : PassPlugins) {
+ auto PassPlugin = PassPlugin::Load(PluginFN);
+ if (!PassPlugin) {
+ errs() << "Failed to load passes from '" << PluginFN
+ << "'. Request ignored.\n";
+ continue;
+ }
+
+ PassPlugin->registerPassBuilderCallbacks(PB);
+ }
+
+ // Register a callback that creates the debugify passes as needed.
+ PB.registerPipelineParsingCallback(
+ [](StringRef Name, ModulePassManager &MPM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+ if (Name == "debugify") {
+ MPM.addPass(NewPMDebugifyPass());
+ return true;
+ } else if (Name == "check-debugify") {
+ MPM.addPass(NewPMCheckDebugifyPass());
+ return true;
+ }
+ return false;
+ });
+
#ifdef LINK_POLLY_INTO_TOOLS
polly::RegisterPollyPasses(PB);
#endif
@@ -227,6 +272,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
ModulePassManager MPM(DebugPM);
if (VK > VK_NoVerifier)
MPM.addPass(VerifierPass());
+ if (EnableDebugify)
+ MPM.addPass(NewPMDebugifyPass());
if (!PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
errs() << Arg0 << ": unable to parse pass pipeline description.\n";
@@ -235,6 +282,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
if (VK > VK_NoVerifier)
MPM.addPass(VerifierPass());
+ if (EnableDebugify)
+ MPM.addPass(NewPMCheckDebugifyPass());
// Add any relevant output pass at the end of the pipeline.
switch (OK) {
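Note (not part of the patch): the -load-pass-plugin handling added above goes through the PassPlugin interface, so a plugin only has to export llvmGetPassPluginInfo and register its callbacks with the PassBuilder it is handed. A rough sketch follows, assuming the PassPlugin API of this LLVM version; "MyPlugin" and "my-pass" are made-up names, and VerifierPass merely stands in for a real custom pass.

// Sketch of a plugin loadable with -load-pass-plugin (names are illustrative).
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"
using namespace llvm;

extern "C" ::llvm::PassPluginLibraryInfo LLVM_ATTRIBUTE_WEAK
llvmGetPassPluginInfo() {
  return {LLVM_PLUGIN_API_VERSION, "MyPlugin", "v0.1", [](PassBuilder &PB) {
            PB.registerPipelineParsingCallback(
                [](StringRef Name, ModulePassManager &MPM,
                   ArrayRef<PassBuilder::PipelineElement>) {
                  if (Name == "my-pass") {        // hypothetical pipeline element
                    MPM.addPass(VerifierPass());  // stand-in for the plugin's pass
                    return true;
                  }
                  return false;
                });
          }};
}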
diff --git a/contrib/llvm/tools/opt/NewPMDriver.h b/contrib/llvm/tools/opt/NewPMDriver.h
index e5490deaeaf5..7d74a5777d11 100644
--- a/contrib/llvm/tools/opt/NewPMDriver.h
+++ b/contrib/llvm/tools/opt/NewPMDriver.h
@@ -42,7 +42,7 @@ enum VerifierKind {
};
}
-/// \brief Driver function to run the new pass manager over a module.
+/// Driver function to run the new pass manager over a module.
///
/// This function only exists factored away from opt.cpp in order to prevent
/// inclusion of the new pass manager headers and the old headers into the same
@@ -57,7 +57,8 @@ bool runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
opt_tool::OutputKind OK, opt_tool::VerifierKind VK,
bool ShouldPreserveAssemblyUseListOrder,
bool ShouldPreserveBitcodeUseListOrder,
- bool EmitSummaryIndex, bool EmitModuleHash);
-}
+ bool EmitSummaryIndex, bool EmitModuleHash,
+ bool EnableDebugify);
+} // namespace llvm
#endif
diff --git a/contrib/llvm/tools/opt/PassPrinters.cpp b/contrib/llvm/tools/opt/PassPrinters.cpp
index f52b52080949..310d491c06a5 100644
--- a/contrib/llvm/tools/opt/PassPrinters.cpp
+++ b/contrib/llvm/tools/opt/PassPrinters.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Utilities to print analysis info for various kinds of passes.
+/// Utilities to print analysis info for various kinds of passes.
///
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/tools/opt/PassPrinters.h b/contrib/llvm/tools/opt/PassPrinters.h
index 14b6e43d18e0..e66f3f457b7a 100644
--- a/contrib/llvm/tools/opt/PassPrinters.h
+++ b/contrib/llvm/tools/opt/PassPrinters.h
@@ -8,13 +8,15 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// \brief Utilities to print analysis info for various kinds of passes.
+/// Utilities to print analysis info for various kinds of passes.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_OPT_PASSPRINTERS_H
#define LLVM_TOOLS_OPT_PASSPRINTERS_H
+#include "llvm/IR/PassManager.h"
+
namespace llvm {
class BasicBlockPass;
@@ -25,6 +27,7 @@ class LoopPass;
class PassInfo;
class raw_ostream;
class RegionPass;
+class Module;
FunctionPass *createFunctionPassPrinter(const PassInfo *PI, raw_ostream &out,
bool Quiet);
diff --git a/contrib/llvm/tools/opt/opt.cpp b/contrib/llvm/tools/opt/opt.cpp
index c471e0f2e3ec..6e287b6c0ab6 100644
--- a/contrib/llvm/tools/opt/opt.cpp
+++ b/contrib/llvm/tools/opt/opt.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "BreakpointPrinter.h"
+#include "Debugify.h"
#include "NewPMDriver.h"
#include "PassPrinters.h"
#include "llvm/ADT/Triple.h"
@@ -23,8 +24,9 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRPrintingPasses.h"
@@ -41,10 +43,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/PluginLoader.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/SystemUtils.h"
#include "llvm/Support/TargetRegistry.h"
@@ -123,7 +123,11 @@ StripDebug("strip-debug",
cl::desc("Strip debugger symbol info from translation unit"));
static cl::opt<bool>
-DisableInline("disable-inlining", cl::desc("Do not run the inliner pass"));
+ StripNamedMetadata("strip-named-metadata",
+ cl::desc("Strip module-level named metadata"));
+
+static cl::opt<bool> DisableInline("disable-inlining",
+ cl::desc("Do not run the inliner pass"));
static cl::opt<bool>
DisableOptimizations("disable-opt",
@@ -203,6 +207,21 @@ QuietA("quiet", cl::desc("Alias for -q"), cl::aliasopt(Quiet));
static cl::opt<bool>
AnalyzeOnly("analyze", cl::desc("Only perform analysis, no optimization"));
+static cl::opt<bool> EnableDebugify(
+ "enable-debugify",
+ cl::desc(
+ "Start the pipeline with debugify and end it with check-debugify"));
+
+static cl::opt<bool> DebugifyEach(
+ "debugify-each",
+ cl::desc(
+ "Start each pass with debugify and end it with check-debugify"));
+
+static cl::opt<std::string>
+ DebugifyExport("debugify-export",
+ cl::desc("Export per-pass debugify statistics to this file"),
+ cl::value_desc("filename"), cl::init(""));
+
static cl::opt<bool>
PrintBreakpoints("print-breakpoints-for-testing",
cl::desc("Print select breakpoints location for testing"));
@@ -252,6 +271,48 @@ static cl::opt<std::string>
cl::desc("YAML output filename for pass remarks"),
cl::value_desc("filename"));
+class OptCustomPassManager : public legacy::PassManager {
+ DebugifyStatsMap DIStatsMap;
+
+public:
+ using super = legacy::PassManager;
+
+ void add(Pass *P) override {
+ // Wrap each pass with (-check)-debugify passes if requested, making
+ // exceptions for passes which shouldn't see -debugify instrumentation.
+ bool WrapWithDebugify = DebugifyEach && !P->getAsImmutablePass() &&
+ !isIRPrintingPass(P) && !isBitcodeWriterPass(P);
+ if (!WrapWithDebugify) {
+ super::add(P);
+ return;
+ }
+
+ // Apply -debugify/-check-debugify before/after each pass and collect
+ // debug info loss statistics.
+ PassKind Kind = P->getPassKind();
+ StringRef Name = P->getPassName();
+
+ // TODO: Implement Debugify for BasicBlockPass, LoopPass.
+ switch (Kind) {
+ case PT_Function:
+ super::add(createDebugifyFunctionPass());
+ super::add(P);
+ super::add(createCheckDebugifyFunctionPass(true, Name, &DIStatsMap));
+ break;
+ case PT_Module:
+ super::add(createDebugifyModulePass());
+ super::add(P);
+ super::add(createCheckDebugifyModulePass(true, Name, &DIStatsMap));
+ break;
+ default:
+ super::add(P);
+ break;
+ }
+ }
+
+ const DebugifyStatsMap &getDebugifyStatsMap() const { return DIStatsMap; }
+};
+
static inline void addPass(legacy::PassManagerBase &PM, Pass *P) {
// Add the pass to the pass manager...
PM.add(P);
@@ -362,13 +423,11 @@ void initializePollyPasses(llvm::PassRegistry &Registry);
// main for opt
//
int main(int argc, char **argv) {
- sys::PrintStackTraceOnErrorSignal(argv[0]);
- llvm::PrettyStackTraceProgram X(argc, argv);
+ InitLLVM X(argc, argv);
// Enable debug stream buffering.
EnableDebugBuffering = true;
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
LLVMContext Context;
InitializeAllTargets();
@@ -387,6 +446,7 @@ int main(int argc, char **argv) {
initializeAnalysis(Registry);
initializeTransformUtils(Registry);
initializeInstCombine(Registry);
+ initializeAggressiveInstCombine(Registry);
initializeInstrumentation(Registry);
initializeTarget(Registry);
// For codegen passes, only passes that do IR to IR transformation are
@@ -408,6 +468,7 @@ int main(int argc, char **argv) {
initializePostInlineEntryExitInstrumenterPass(Registry);
initializeUnreachableBlockElimLegacyPassPass(Registry);
initializeExpandReductionsPass(Registry);
+ initializeWasmEHPreparePass(Registry);
initializeWriteBitcodePassPass(Registry);
#ifdef LINK_POLLY_INTO_TOOLS
@@ -449,7 +510,7 @@ int main(int argc, char **argv) {
// Load the input module...
std::unique_ptr<Module> M =
- parseIRFile(InputFilename, Err, Context, !NoVerify);
+ parseIRFile(InputFilename, Err, Context, !NoVerify, ClDataLayout);
if (!M) {
Err.print(argv[0], errs());
@@ -460,6 +521,18 @@ int main(int argc, char **argv) {
if (StripDebug)
StripDebugInfo(*M);
+ // Erase module-level named metadata, if requested.
+ if (StripNamedMetadata) {
+ while (!M->named_metadata_empty()) {
+ NamedMDNode *NMD = &*M->named_metadata_begin();
+ M->eraseNamedMetadata(NMD);
+ }
+ }
+
+ // If we are supposed to override the target triple or data layout, do so now.
+ if (!TargetTriple.empty())
+ M->setTargetTriple(Triple::normalize(TargetTriple));
+
// Immediately run the verifier to catch any problems before starting up the
// pass pipelines. Otherwise we can crash on broken code during
// doInitialization().
@@ -469,12 +542,6 @@ int main(int argc, char **argv) {
return 1;
}
- // If we are supposed to override the target triple or data layout, do so now.
- if (!TargetTriple.empty())
- M->setTargetTriple(Triple::normalize(TargetTriple));
- if (!ClDataLayout.empty())
- M->setDataLayout(ClDataLayout);
-
// Figure out what stream we are supposed to write to...
std::unique_ptr<ToolOutputFile> Out;
std::unique_ptr<ToolOutputFile> ThinLinkOut;
@@ -548,15 +615,15 @@ int main(int argc, char **argv) {
OptRemarkFile.get(), PassPipeline, OK, VK,
PreserveAssemblyUseListOrder,
PreserveBitcodeUseListOrder, EmitSummaryIndex,
- EmitModuleHash)
+ EmitModuleHash, EnableDebugify)
? 0
: 1;
}
// Create a PassManager to hold and optimize the collection of passes we are
// about to build.
- //
- legacy::PassManager Passes;
+ OptCustomPassManager Passes;
+ bool AddOneTimeDebugifyPasses = EnableDebugify && !DebugifyEach;
// Add an appropriate TargetLibraryInfo pass for the module's triple.
TargetLibraryInfoImpl TLII(ModuleTriple);
@@ -570,6 +637,9 @@ int main(int argc, char **argv) {
Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis()
: TargetIRAnalysis()));
+ if (AddOneTimeDebugifyPasses)
+ Passes.add(createDebugifyModulePass());
+
std::unique_ptr<legacy::FunctionPassManager> FPasses;
if (OptLevelO0 || OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz ||
OptLevelO3) {
@@ -715,12 +785,15 @@ int main(int argc, char **argv) {
if (!NoVerify && !VerifyEach)
Passes.add(createVerifierPass());
+ if (AddOneTimeDebugifyPasses)
+ Passes.add(createCheckDebugifyModulePass(false));
+
// In run twice mode, we want to make sure the output is bit-by-bit
// equivalent if we run the pass manager again, so setup two buffers and
// a stream to write to them. Note that llc does something similar and it
// may be worth to abstract this out in the future.
SmallVector<char, 0> Buffer;
- SmallVector<char, 0> CompileTwiceBuffer;
+ SmallVector<char, 0> FirstRunBuffer;
std::unique_ptr<raw_svector_ostream> BOS;
raw_ostream *OS = nullptr;
@@ -749,28 +822,30 @@ int main(int argc, char **argv) {
// Before executing passes, print the final values of the LLVM options.
cl::PrintOptionValues();
- // If requested, run all passes again with the same pass manager to catch
- // bugs caused by persistent state in the passes
- if (RunTwice) {
- std::unique_ptr<Module> M2(CloneModule(M.get()));
- Passes.run(*M2);
- CompileTwiceBuffer = Buffer;
- Buffer.clear();
- }
-
- // Now that we have all of the passes ready, run them.
- Passes.run(*M);
-
- // Compare the two outputs and make sure they're the same
- if (RunTwice) {
+ if (!RunTwice) {
+ // Now that we have all of the passes ready, run them.
+ Passes.run(*M);
+ } else {
+ // If requested, run all passes twice with the same pass manager to catch
+ // bugs caused by persistent state in the passes.
+ std::unique_ptr<Module> M2(CloneModule(*M));
+ // Run all passes on the original module first, so the second run processes
+ // the clone to catch CloneModule bugs.
+ Passes.run(*M);
+ FirstRunBuffer = Buffer;
+ Buffer.clear();
+
+ Passes.run(*M2);
+
+ // Compare the two outputs and make sure they're the same
assert(Out);
- if (Buffer.size() != CompileTwiceBuffer.size() ||
- (memcmp(Buffer.data(), CompileTwiceBuffer.data(), Buffer.size()) !=
- 0)) {
- errs() << "Running the pass manager twice changed the output.\n"
- "Writing the result of the second run to the specified output.\n"
- "To generate the one-run comparison binary, just run without\n"
- "the compile-twice option\n";
+ if (Buffer.size() != FirstRunBuffer.size() ||
+ (memcmp(Buffer.data(), FirstRunBuffer.data(), Buffer.size()) != 0)) {
+ errs()
+ << "Running the pass manager twice changed the output.\n"
+ "Writing the result of the second run to the specified output.\n"
+ "To generate the one-run comparison binary, just run without\n"
+ "the compile-twice option\n";
Out->os() << BOS->str();
Out->keep();
if (OptRemarkFile)
@@ -780,6 +855,9 @@ int main(int argc, char **argv) {
Out->os() << BOS->str();
}
+ if (DebugifyEach && !DebugifyExport.empty())
+ exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap());
+
// Declare success.
if (!NoOutput || PrintBreakpoints)
Out->keep();
diff --git a/contrib/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/contrib/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index f2d304bfcf5b..e808661b7a51 100644
--- a/contrib/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -105,6 +105,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -272,9 +273,17 @@ public:
return true;
// ... or if any of its super classes are a subset of RHS.
- for (const ClassInfo *CI : SuperClasses)
- if (CI->isSubsetOf(RHS))
+ SmallVector<const ClassInfo *, 16> Worklist(SuperClasses.begin(),
+ SuperClasses.end());
+ SmallPtrSet<const ClassInfo *, 16> Visited;
+ while (!Worklist.empty()) {
+ auto *CI = Worklist.pop_back_val();
+ if (CI == &RHS)
return true;
+ for (auto *Super : CI->SuperClasses)
+ if (Visited.insert(Super).second)
+ Worklist.push_back(Super);
+ }
return false;
}
@@ -378,6 +387,9 @@ struct MatchableInfo {
/// The operand name this is, if anything.
StringRef SrcOpName;
+ /// The operand name this is, before renaming for tied operands.
+ StringRef OrigSrcOpName;
+
/// The suboperand index within SrcOpName, or -1 for the entire operand.
int SubOpIdx;
@@ -416,14 +428,22 @@ struct MatchableInfo {
RegOperand
} Kind;
+ /// Tuple containing the index of the (earlier) result operand that should
+ /// be copied from, as well as the indices of the corresponding (parsed)
+ /// operands in the asm string.
+ struct TiedOperandsTuple {
+ unsigned ResOpnd;
+ unsigned SrcOpnd1Idx;
+ unsigned SrcOpnd2Idx;
+ };
+
union {
/// This is the operand # in the AsmOperands list that this should be
/// copied from.
unsigned AsmOperandNum;
- /// TiedOperandNum - This is the (earlier) result operand that should be
- /// copied from.
- unsigned TiedOperandNum;
+ /// Description of tied operands.
+ TiedOperandsTuple TiedOperands;
/// ImmVal - This is the immediate value added to the instruction.
int64_t ImmVal;
@@ -444,10 +464,11 @@ struct MatchableInfo {
return X;
}
- static ResOperand getTiedOp(unsigned TiedOperandNum) {
+ static ResOperand getTiedOp(unsigned TiedOperandNum, unsigned SrcOperand1,
+ unsigned SrcOperand2) {
ResOperand X;
X.Kind = TiedOperand;
- X.TiedOperandNum = TiedOperandNum;
+ X.TiedOperands = { TiedOperandNum, SrcOperand1, SrcOperand2 };
X.MINumOperands = 1;
return X;
}
@@ -560,7 +581,7 @@ struct MatchableInfo {
/// validate - Return true if this matchable is a valid thing to match against
/// and perform a bunch of validity checking.
- bool validate(StringRef CommentDelimiter, bool Hack) const;
+ bool validate(StringRef CommentDelimiter, bool IsAlias) const;
/// findAsmOperand - Find the AsmOperand with the specified name and
/// suboperand index.
@@ -573,14 +594,21 @@ struct MatchableInfo {
/// findAsmOperandNamed - Find the first AsmOperand with the specified name.
/// This does not check the suboperand index.
- int findAsmOperandNamed(StringRef N) const {
- auto I = find_if(AsmOperands,
+ int findAsmOperandNamed(StringRef N, int LastIdx = -1) const {
+ auto I = std::find_if(AsmOperands.begin() + LastIdx + 1, AsmOperands.end(),
[&](const AsmOperand &Op) { return Op.SrcOpName == N; });
return (I != AsmOperands.end()) ? I - AsmOperands.begin() : -1;
}
+ int findAsmOperandOriginallyNamed(StringRef N) const {
+ auto I =
+ find_if(AsmOperands,
+ [&](const AsmOperand &Op) { return Op.OrigSrcOpName == N; });
+ return (I != AsmOperands.end()) ? I - AsmOperands.begin() : -1;
+ }
+
void buildInstructionResultOperands();
- void buildAliasResultOperands();
+ void buildAliasResultOperands(bool AliasConstraintsAreChecked);
/// operator< - Compare two matchables.
bool operator<(const MatchableInfo &RHS) const {
@@ -620,6 +648,10 @@ struct MatchableInfo {
if (Mnemonic != RHS.Mnemonic)
return false;
+ // Different variants can't conflict.
+ if (AsmVariantID != RHS.AsmVariantID)
+ return false;
+
// The number of operands is unambiguous.
if (AsmOperands.size() != RHS.AsmOperands.size())
return false;
@@ -770,6 +802,8 @@ public:
LLVM_DUMP_METHOD void MatchableInfo::dump() const {
errs() << TheDef->getName() << " -- " << "flattened:\"" << AsmString <<"\"\n";
+ errs() << " variant: " << AsmVariantID << "\n";
+
for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) {
const AsmOperand &Op = AsmOperands[i];
errs() << " op[" << i << "] = " << Op.Class->ClassName << " - ";
@@ -840,10 +874,6 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) {
if (Op.AsmOperandNum > (unsigned)SrcAsmOperand)
--Op.AsmOperandNum;
break;
- case ResOperand::TiedOperand:
- if (Op.TiedOperandNum > (unsigned)SrcAsmOperand)
- --Op.TiedOperandNum;
- break;
}
}
}
@@ -1019,7 +1049,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info,
addAsmOperand(String.substr(Prev), IsIsolatedToken);
}
-bool MatchableInfo::validate(StringRef CommentDelimiter, bool Hack) const {
+bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const {
// Reject matchables with no .s string.
if (AsmString.empty())
PrintFatalError(TheDef->getLoc(), "instruction with empty asm string");
@@ -1052,17 +1082,10 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool Hack) const {
PrintFatalError(TheDef->getLoc(),
"matchable with operand modifier '" + Tok +
"' not supported by asm matcher. Mark isCodeGenOnly!");
-
// Verify that any operand is only mentioned once.
// We reject aliases and ignore instructions for now.
- if (Tok[0] == '$' && !OperandNames.insert(Tok).second) {
- if (!Hack)
- PrintFatalError(TheDef->getLoc(),
- "ERROR: matchable with tied operand '" + Tok +
- "' can never be matched!");
- // FIXME: Should reject these. The ARM backend hits this with $lane in a
- // bunch of instructions. It is unclear what the right answer is.
- DEBUG({
+ if (!IsAlias && Tok[0] == '$' && !OperandNames.insert(Tok).second) {
+ LLVM_DEBUG({
errs() << "warning: '" << TheDef->getName() << "': "
<< "ignoring instruction with tied operand '"
<< Tok << "'\n";
@@ -1448,11 +1471,13 @@ void AsmMatcherInfo::buildInfo() {
SubtargetFeaturePairs.end());
#ifndef NDEBUG
for (const auto &Pair : SubtargetFeatures)
- DEBUG(Pair.second.dump());
+ LLVM_DEBUG(Pair.second.dump());
#endif // NDEBUG
assert(SubtargetFeatures.size() <= 64 && "Too many subtarget features!");
bool HasMnemonicFirst = AsmParser->getValueAsBit("HasMnemonicFirst");
+ bool ReportMultipleNearMisses =
+ AsmParser->getValueAsBit("ReportMultipleNearMisses");
// Parse the instructions; we need to do this first so that we can gather the
// singleton register classes.
@@ -1495,7 +1520,7 @@ void AsmMatcherInfo::buildInfo() {
// Ignore instructions which shouldn't be matched and diagnose invalid
// instruction definitions with an error.
- if (!II->validate(CommentDelimiter, true))
+ if (!II->validate(CommentDelimiter, false))
continue;
Matchables.push_back(std::move(II));
@@ -1507,7 +1532,6 @@ void AsmMatcherInfo::buildInfo() {
Records.getAllDerivedDefinitions("InstAlias");
for (unsigned i = 0, e = AllInstAliases.size(); i != e; ++i) {
auto Alias = llvm::make_unique<CodeGenInstAlias>(AllInstAliases[i],
- Variant.AsmVariantNo,
Target);
// If the tblgen -match-prefix option is specified (for tblgen hackers),
@@ -1526,7 +1550,7 @@ void AsmMatcherInfo::buildInfo() {
II->initialize(*this, SingletonRegisters, Variant, HasMnemonicFirst);
// Validate the alias definitions.
- II->validate(CommentDelimiter, false);
+ II->validate(CommentDelimiter, true);
Matchables.push_back(std::move(II));
}
@@ -1599,7 +1623,12 @@ void AsmMatcherInfo::buildInfo() {
NewMatchables.push_back(std::move(AliasII));
}
} else
- II->buildAliasResultOperands();
+ // FIXME: The tied operands checking is not yet integrated with the
+ // framework for reporting multiple near misses. To prevent invalid
+ // formats from being matched with an alias if a tied-operands check
+ // would otherwise have disallowed it, we just disallow such constructs
+ // in TableGen completely.
+ II->buildAliasResultOperands(!ReportMultipleNearMisses);
}
if (!NewMatchables.empty())
Matchables.insert(Matchables.end(),
@@ -1672,6 +1701,7 @@ buildInstructionOperandReference(MatchableInfo *II,
// Set up the operand class.
Op->Class = getOperandClass(Operands[Idx], Op->SubOpIdx);
+ Op->OrigSrcOpName = OperandName;
// If the named operand is tied, canonicalize it to the untied operand.
// For example, something like:
@@ -1716,6 +1746,7 @@ void AsmMatcherInfo::buildAliasOperandReference(MatchableInfo *II,
Op.Class = getOperandClass(CGA.ResultOperands[i].getRecord(),
Op.SubOpIdx);
Op.SrcOpName = OperandName;
+ Op.OrigSrcOpName = OperandName;
return;
}
@@ -1734,11 +1765,16 @@ void MatchableInfo::buildInstructionResultOperands() {
if (OpInfo.MINumOperands == 1)
TiedOp = OpInfo.getTiedRegister();
if (TiedOp != -1) {
- ResOperands.push_back(ResOperand::getTiedOp(TiedOp));
+ int TiedSrcOperand = findAsmOperandOriginallyNamed(OpInfo.Name);
+ if (TiedSrcOperand != -1 &&
+ ResOperands[TiedOp].Kind == ResOperand::RenderAsmOperand)
+ ResOperands.push_back(ResOperand::getTiedOp(
+ TiedOp, ResOperands[TiedOp].AsmOperandNum, TiedSrcOperand));
+ else
+ ResOperands.push_back(ResOperand::getTiedOp(TiedOp, 0, 0));
continue;
}
- // Find out what operand from the asmparser this MCInst operand comes from.
int SrcOperand = findAsmOperandNamed(OpInfo.Name);
if (OpInfo.Name.empty() || SrcOperand == -1) {
// This may happen for operands that are tied to a suboperand of a
@@ -1767,10 +1803,16 @@ void MatchableInfo::buildInstructionResultOperands() {
}
}
-void MatchableInfo::buildAliasResultOperands() {
+void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) {
const CodeGenInstAlias &CGA = *DefRec.get<const CodeGenInstAlias*>();
const CodeGenInstruction *ResultInst = getResultInst();
+ // Map of: $reg -> #lastref
+ // where $reg is the name of the operand in the asm string
+ // where #lastref is the last processed index where $reg was referenced in
+ // the asm string.
+ SmallDenseMap<StringRef, int> OperandRefs;
+
// Loop over all operands of the result instruction, determining how to
// populate them.
unsigned AliasOpNo = 0;
@@ -1783,8 +1825,46 @@ void MatchableInfo::buildAliasResultOperands() {
if (OpInfo->MINumOperands == 1)
TiedOp = OpInfo->getTiedRegister();
if (TiedOp != -1) {
- ResOperands.push_back(ResOperand::getTiedOp(TiedOp));
- continue;
+ unsigned SrcOp1 = 0;
+ unsigned SrcOp2 = 0;
+
+ // If an operand has been specified twice in the asm string,
+ // add the two source operands' indices to the TiedOp so that
+ // at runtime the 'tied' constraint is checked.
+ if (ResOperands[TiedOp].Kind == ResOperand::RenderAsmOperand) {
+ SrcOp1 = ResOperands[TiedOp].AsmOperandNum;
+
+ // Find the next operand (similarly named operand) in the string.
+ StringRef Name = AsmOperands[SrcOp1].SrcOpName;
+ auto Insert = OperandRefs.try_emplace(Name, SrcOp1);
+ SrcOp2 = findAsmOperandNamed(Name, Insert.first->second);
+
+ // Not updating the record in OperandRefs will cause TableGen
+ // to fail with an error at the end of this function.
+ if (AliasConstraintsAreChecked)
+ Insert.first->second = SrcOp2;
+
+ // In case it only has one reference in the asm string,
+ // it doesn't need to be checked for tied constraints.
+ SrcOp2 = (SrcOp2 == (unsigned)-1) ? SrcOp1 : SrcOp2;
+ }
+
+ // If the alias operand is of a different operand class, we only want
+ // to benefit from the tied-operands check and just match the operand
+ // as a normal, but not copy the original (TiedOp) to the result
+ // instruction. We do this by passing -1 as the tied operand to copy.
+ if (ResultInst->Operands[i].Rec->getName() !=
+ ResultInst->Operands[TiedOp].Rec->getName()) {
+ SrcOp1 = ResOperands[TiedOp].AsmOperandNum;
+ int SubIdx = CGA.ResultInstOperandIndex[AliasOpNo].second;
+ StringRef Name = CGA.ResultOperands[AliasOpNo].getName();
+ SrcOp2 = findAsmOperand(Name, SubIdx);
+ ResOperands.push_back(
+ ResOperand::getTiedOp((unsigned)-1, SrcOp1, SrcOp2));
+ } else {
+ ResOperands.push_back(ResOperand::getTiedOp(TiedOp, SrcOp1, SrcOp2));
+ continue;
+ }
}
// Handle all the suboperands for this operand.
@@ -1803,6 +1883,11 @@ void MatchableInfo::buildAliasResultOperands() {
PrintFatalError(TheDef->getLoc(), "Instruction '" +
TheDef->getName() + "' has operand '" + OpName +
"' that doesn't appear in asm string!");
+
+ // Add it to the operand references. If it is added a second time, the
+ // record won't be updated and it will fail later on.
+ OperandRefs.try_emplace(Name, SrcOperand);
+
unsigned NumOperands = (SubIdx == -1 ? OpInfo->MINumOperands : 1);
ResOperands.push_back(ResOperand::getRenderedOp(SrcOperand,
NumOperands));
@@ -1821,6 +1906,13 @@ void MatchableInfo::buildAliasResultOperands() {
}
}
}
+
+ // Check that operands are not repeated more times than is supported.
+ for (auto &T : OperandRefs) {
+ if (T.second != -1 && findAsmOperandNamed(T.first, T.second) != -1)
+ PrintFatalError(TheDef->getLoc(),
+ "Operand '" + T.first + "' can never be matched");
+ }
}
static unsigned
@@ -1897,9 +1989,15 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
CvtOS << " static_cast<" << TargetOperandClass
<< "&>(*Operands[OpIdx]).addRegOperands(Inst, 1);\n";
CvtOS << " break;\n";
- CvtOS << " case CVT_Tied:\n";
- CvtOS << " Inst.addOperand(Inst.getOperand(OpIdx));\n";
+ CvtOS << " case CVT_Tied: {\n";
+ CvtOS << " assert(OpIdx < (size_t)(std::end(TiedAsmOperandTable) -\n";
+ CvtOS << " std::begin(TiedAsmOperandTable)) &&\n";
+ CvtOS << " \"Tied operand not found\");\n";
+ CvtOS << " unsigned TiedResOpnd = TiedAsmOperandTable[OpIdx][0];\n";
+ CvtOS << " if (TiedResOpnd != (uint8_t) -1)\n";
+ CvtOS << " Inst.addOperand(Inst.getOperand(TiedResOpnd));\n";
CvtOS << " break;\n";
+ CvtOS << " }\n";
std::string OperandFnBody;
raw_string_ostream OpOS(OperandFnBody);
@@ -1930,6 +2028,10 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
OperandConversionKinds.insert(CachedHashString("CVT_Tied"));
enum { CVT_Done, CVT_Reg, CVT_Tied };
+ // Map of e.g. <0, 2, 3> -> "Tie_0_2_3" enum label.
+ std::map<std::tuple<uint8_t, uint8_t, uint8_t>, std::string>
+ TiedOperandsEnumMap;
+
for (auto &II : Infos) {
// Check if we have a custom match function.
StringRef AsmMatchConverter =
@@ -2050,11 +2152,24 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
// If this operand is tied to a previous one, just copy the MCInst
// operand from the earlier one. We can only tie single MCOperand values.
assert(OpInfo.MINumOperands == 1 && "Not a singular MCOperand");
- unsigned TiedOp = OpInfo.TiedOperandNum;
- assert(i > TiedOp && "Tied operand precedes its target!");
- Signature += "__Tie" + utostr(TiedOp);
+ uint8_t TiedOp = OpInfo.TiedOperands.ResOpnd;
+ uint8_t SrcOp1 =
+ OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst;
+ uint8_t SrcOp2 =
+ OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst;
+ assert((i > TiedOp || TiedOp == (uint8_t)-1) &&
+ "Tied operand precedes its target!");
+ auto TiedTupleName = std::string("Tie") + utostr(TiedOp) + '_' +
+ utostr(SrcOp1) + '_' + utostr(SrcOp2);
+ Signature += "__" + TiedTupleName;
ConversionRow.push_back(CVT_Tied);
ConversionRow.push_back(TiedOp);
+ ConversionRow.push_back(SrcOp1);
+ ConversionRow.push_back(SrcOp2);
+
+ // Also create an 'enum' for this combination of tied operands.
+ auto Key = std::make_tuple(TiedOp, SrcOp1, SrcOp2);
+ TiedOperandsEnumMap.emplace(Key, TiedTupleName);
break;
}
case MatchableInfo::ResOperand::ImmOperand: {
@@ -2139,6 +2254,33 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
// Finish up the operand number lookup function.
OpOS << " }\n }\n}\n\n";
+ // Output a static table for tied operands.
+ if (TiedOperandsEnumMap.size()) {
+ // The number of tied operand combinations will be small in practice,
+ // but just add the assert to be sure.
+ assert(TiedOperandsEnumMap.size() <= 254 &&
+ "Too many tied-operand combinations to reference with "
+ "an 8bit offset from the conversion table, where index "
+ "'255' is reserved as operand not to be copied.");
+
+ OS << "enum {\n";
+ for (auto &KV : TiedOperandsEnumMap) {
+ OS << " " << KV.second << ",\n";
+ }
+ OS << "};\n\n";
+
+ OS << "static const uint8_t TiedAsmOperandTable[][3] = {\n";
+ for (auto &KV : TiedOperandsEnumMap) {
+ OS << " /* " << KV.second << " */ { "
+ << utostr(std::get<0>(KV.first)) << ", "
+ << utostr(std::get<1>(KV.first)) << ", "
+ << utostr(std::get<2>(KV.first)) << " },\n";
+ }
+ OS << "};\n\n";
+ } else
+ OS << "static const uint8_t TiedAsmOperandTable[][3] = "
+ "{ /* empty */ {0, 0, 0} };\n\n";
+
OS << "namespace {\n";
// Output the operand conversion kind enum.
@@ -2165,9 +2307,26 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
assert(ConversionTable[Row].size() % 2 == 0 && "bad conversion row!");
OS << " // " << InstructionConversionKinds[Row] << "\n";
OS << " { ";
- for (unsigned i = 0, e = ConversionTable[Row].size(); i != e; i += 2)
- OS << OperandConversionKinds[ConversionTable[Row][i]] << ", "
- << (unsigned)(ConversionTable[Row][i + 1]) << ", ";
+ for (unsigned i = 0, e = ConversionTable[Row].size(); i != e; i += 2) {
+ OS << OperandConversionKinds[ConversionTable[Row][i]] << ", ";
+ if (OperandConversionKinds[ConversionTable[Row][i]] !=
+ CachedHashString("CVT_Tied")) {
+ OS << (unsigned)(ConversionTable[Row][i + 1]) << ", ";
+ continue;
+ }
+
+ // For a tied operand, emit a reference to the TiedAsmOperandTable
+ // that contains the operand to copy, and the parsed operands to
+ // check for their tied constraints.
+ auto Key = std::make_tuple((uint8_t)ConversionTable[Row][i + 1],
+ (uint8_t)ConversionTable[Row][i + 2],
+ (uint8_t)ConversionTable[Row][i + 3]);
+ auto TiedOpndEnum = TiedOperandsEnumMap.find(Key);
+ assert(TiedOpndEnum != TiedOperandsEnumMap.end() &&
+ "No record for tied operand pair");
+ OS << TiedOpndEnum->second << ", ";
+ i += 2;
+ }
OS << "CVT_Done },\n";
}
@@ -2307,14 +2466,20 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
continue;
OS << " // '" << CI.ClassName << "' class\n";
- OS << " case " << CI.Name << ":\n";
- OS << " if (Operand." << CI.PredicateMethod << "())\n";
+ OS << " case " << CI.Name << ": {\n";
+ OS << " DiagnosticPredicate DP(Operand." << CI.PredicateMethod
+ << "());\n";
+ OS << " if (DP.isMatch())\n";
OS << " return MCTargetAsmParser::Match_Success;\n";
- if (!CI.DiagnosticType.empty())
- OS << " return " << Info.Target.getName() << "AsmParser::Match_"
+ if (!CI.DiagnosticType.empty()) {
+ OS << " if (DP.isNearMatch())\n";
+ OS << " return " << Info.Target.getName() << "AsmParser::Match_"
<< CI.DiagnosticType << ";\n";
+ OS << " break;\n";
+ }
else
OS << " break;\n";
+ OS << " }\n";
}
OS << " } // end switch (Kind)\n\n";
@@ -2825,6 +2990,48 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
OS << "}\n\n";
}
+static void emitAsmTiedOperandConstraints(CodeGenTarget &Target,
+ AsmMatcherInfo &Info,
+ raw_ostream &OS) {
+ std::string AsmParserName =
+ Info.AsmParser->getValueAsString("AsmParserClassName");
+ OS << "static bool ";
+ OS << "checkAsmTiedOperandConstraints(const " << Target.getName()
+ << AsmParserName << "&AsmParser,\n";
+ OS << " unsigned Kind,\n";
+ OS << " const OperandVector &Operands,\n";
+ OS << " uint64_t &ErrorInfo) {\n";
+ OS << " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n";
+ OS << " const uint8_t *Converter = ConversionTable[Kind];\n";
+ OS << " for (const uint8_t *p = Converter; *p; p+= 2) {\n";
+ OS << " switch (*p) {\n";
+ OS << " case CVT_Tied: {\n";
+ OS << " unsigned OpIdx = *(p+1);\n";
+ OS << " assert(OpIdx < (size_t)(std::end(TiedAsmOperandTable) -\n";
+ OS << " std::begin(TiedAsmOperandTable)) &&\n";
+ OS << " \"Tied operand not found\");\n";
+ OS << " unsigned OpndNum1 = TiedAsmOperandTable[OpIdx][1];\n";
+ OS << " unsigned OpndNum2 = TiedAsmOperandTable[OpIdx][2];\n";
+ OS << " if (OpndNum1 != OpndNum2) {\n";
+ OS << " auto &SrcOp1 = Operands[OpndNum1];\n";
+ OS << " auto &SrcOp2 = Operands[OpndNum2];\n";
+ OS << " if (SrcOp1->isReg() && SrcOp2->isReg()) {\n";
+ OS << " if (!AsmParser.regsEqual(*SrcOp1, *SrcOp2)) {\n";
+ OS << " ErrorInfo = OpndNum2;\n";
+ OS << " return false;\n";
+ OS << " }\n";
+ OS << " }\n";
+ OS << " }\n";
+ OS << " break;\n";
+ OS << " }\n";
+ OS << " default:\n";
+ OS << " break;\n";
+ OS << " }\n";
+ OS << " }\n";
+ OS << " return true;\n";
+ OS << "}\n\n";
+}
+
static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target,
unsigned VariantCount) {
OS << "static std::string " << Target.getName()
@@ -3072,6 +3279,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
Info.Target.getName(), ClassName, "ComputeAvailableFeatures",
Info.SubtargetFeatures, OS);
+ if (!ReportMultipleNearMisses)
+ emitAsmTiedOperandConstraints(Target, Info, OS);
+
StringToOffsetTable StringTable;
size_t MaxNumOperands = 0;
@@ -3495,6 +3705,12 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " if (matchingInlineAsm) {\n";
OS << " convertToMapAndConstraints(it->ConvertFn, Operands);\n";
+ if (!ReportMultipleNearMisses) {
+ OS << " if (!checkAsmTiedOperandConstraints(*this, it->ConvertFn, "
+ "Operands, ErrorInfo))\n";
+ OS << " return Match_InvalidTiedOperand;\n";
+ OS << "\n";
+ }
OS << " return Match_Success;\n";
OS << " }\n\n";
OS << " // We have selected a definite instruction, convert the parsed\n"
@@ -3569,6 +3785,13 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " }\n";
}
+ if (!ReportMultipleNearMisses) {
+ OS << " if (!checkAsmTiedOperandConstraints(*this, it->ConvertFn, "
+ "Operands, ErrorInfo))\n";
+ OS << " return Match_InvalidTiedOperand;\n";
+ OS << "\n";
+ }
+
OS << " DEBUG_WITH_TYPE(\n";
OS << " \"asm-matcher\",\n";
OS << " dbgs() << \"Opcode result: complete match, selecting this opcode\\n\");\n";
diff --git a/contrib/llvm/utils/TableGen/AsmWriterEmitter.cpp b/contrib/llvm/utils/TableGen/AsmWriterEmitter.cpp
index 723c0cd773f7..3c4c9c8e5c6e 100644
--- a/contrib/llvm/utils/TableGen/AsmWriterEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/AsmWriterEmitter.cpp
@@ -351,8 +351,8 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
// If we don't have enough bits for this operand, don't include it.
if (NumBits > BitsLeft) {
- DEBUG(errs() << "Not enough bits to densely encode " << NumBits
- << " more bits\n");
+ LLVM_DEBUG(errs() << "Not enough bits to densely encode " << NumBits
+ << " more bits\n");
break;
}
@@ -727,10 +727,6 @@ public:
} // end anonymous namespace
static unsigned CountNumOperands(StringRef AsmString, unsigned Variant) {
- std::string FlatAsmString =
- CodeGenInstruction::FlattenAsmStringVariants(AsmString, Variant);
- AsmString = FlatAsmString;
-
return AsmString.count(' ') + AsmString.count('\t');
}
@@ -782,7 +778,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
const DagInit *DI = R->getValueAsDag("ResultInst");
const DefInit *Op = cast<DefInit>(DI->getOperator());
AliasMap[getQualifiedName(Op->getDef())].insert(
- std::make_pair(CodeGenInstAlias(R, Variant, Target), Priority));
+ std::make_pair(CodeGenInstAlias(R, Target), Priority));
}
// A map of which conditions need to be met for each instruction operand
@@ -799,14 +795,20 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
for (auto &Alias : Aliases.second) {
const CodeGenInstAlias &CGA = Alias.first;
unsigned LastOpNo = CGA.ResultInstOperandIndex.size();
- unsigned NumResultOps =
- CountNumOperands(CGA.ResultInst->AsmString, Variant);
+ std::string FlatInstAsmString =
+ CodeGenInstruction::FlattenAsmStringVariants(CGA.ResultInst->AsmString,
+ Variant);
+ unsigned NumResultOps = CountNumOperands(FlatInstAsmString, Variant);
+
+ std::string FlatAliasAsmString =
+ CodeGenInstruction::FlattenAsmStringVariants(CGA.AsmString,
+ Variant);
// Don't emit the alias if it has more operands than what it's aliasing.
- if (NumResultOps < CountNumOperands(CGA.AsmString, Variant))
+ if (NumResultOps < CountNumOperands(FlatAliasAsmString, Variant))
continue;
- IAPrinter IAP(CGA.Result->getAsString(), CGA.AsmString);
+ IAPrinter IAP(CGA.Result->getAsString(), FlatAliasAsmString);
StringRef Namespace = Target.getName();
std::vector<Record *> ReqFeatures;
diff --git a/contrib/llvm/utils/TableGen/CTagsEmitter.cpp b/contrib/llvm/utils/TableGen/CTagsEmitter.cpp
index 5213cd904462..a0f83f1c9910 100644
--- a/contrib/llvm/utils/TableGen/CTagsEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/CTagsEmitter.cpp
@@ -73,7 +73,7 @@ void CTagsEmitter::run(raw_ostream &OS) {
for (const auto &D : Defs)
Tags.push_back(Tag(D.first, locate(D.second.get())));
// Emit tags.
- std::sort(Tags.begin(), Tags.end());
+ llvm::sort(Tags.begin(), Tags.end());
OS << "!_TAG_FILE_FORMAT\t1\t/original ctags format/\n";
OS << "!_TAG_FILE_SORTED\t1\t/0=unsorted, 1=sorted, 2=foldcase/\n";
for (const Tag &T : Tags)
diff --git a/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
index 493066ec234b..1abe3a88bfbf 100644
--- a/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -808,7 +808,7 @@ TypeSetByHwMode TypeInfer::getLegalTypes() {
#ifndef NDEBUG
TypeInfer::ValidateOnExit::~ValidateOnExit() {
- if (!VTS.validate()) {
+ if (Infer.Validate && !VTS.validate()) {
dbgs() << "Type set is empty for each HW mode:\n"
"possible type contradiction in the pattern below "
"(use -print-records with llvm-tblgen to see all "
@@ -1134,6 +1134,14 @@ Record *TreePredicateFn::getScalarMemoryVT() const {
return nullptr;
return R->getValueAsDef("ScalarMemoryVT");
}
+bool TreePredicateFn::hasGISelPredicateCode() const {
+ return !PatFragRec->getRecord()
+ ->getValueAsString("GISelPredicateCode")
+ .empty();
+}
+std::string TreePredicateFn::getGISelPredicateCode() const {
+ return PatFragRec->getRecord()->getValueAsString("GISelPredicateCode");
+}
StringRef TreePredicateFn::getImmType() const {
if (immCodeUsesAPInt())
@@ -1305,7 +1313,7 @@ std::string PatternToMatch::getPredicateCheck() const {
SmallVector<const Predicate*,4> PredList;
for (const Predicate &P : Predicates)
PredList.push_back(&P);
- std::sort(PredList.begin(), PredList.end(), deref<llvm::less>());
+ llvm::sort(PredList.begin(), PredList.end(), deref<llvm::less>());
std::string Check;
for (unsigned i = 0, e = PredList.size(); i != e; ++i) {
@@ -1564,7 +1572,7 @@ bool TreePatternNode::hasProperTypeByHwMode() const {
for (const TypeSetByHwMode &S : Types)
if (!S.isDefaultOnly())
return true;
- for (TreePatternNode *C : Children)
+ for (const TreePatternNodePtr &C : Children)
if (C->hasProperTypeByHwMode())
return true;
return false;
@@ -1574,7 +1582,7 @@ bool TreePatternNode::hasPossibleType() const {
for (const TypeSetByHwMode &S : Types)
if (!S.isPossible())
return false;
- for (TreePatternNode *C : Children)
+ for (const TreePatternNodePtr &C : Children)
if (!C->hasPossibleType())
return false;
return true;
@@ -1587,7 +1595,7 @@ bool TreePatternNode::setDefaultMode(unsigned Mode) {
if (S.get(DefaultMode).empty())
return false;
}
- for (TreePatternNode *C : Children)
+ for (const TreePatternNodePtr &C : Children)
if (!C->setDefaultMode(Mode))
return false;
return true;
@@ -1644,13 +1652,6 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const {
// TreePatternNode implementation
//
-TreePatternNode::~TreePatternNode() {
-#if 0 // FIXME: implement refcounted tree nodes!
- for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- delete getChild(i);
-#endif
-}
-
static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
if (Operator->getName() == "set" ||
Operator->getName() == "implicit")
@@ -1662,21 +1663,31 @@ static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
if (Operator->isSubClassOf("SDNode"))
return CDP.getSDNodeInfo(Operator).getNumResults();
- if (Operator->isSubClassOf("PatFrag")) {
+ if (Operator->isSubClassOf("PatFrags")) {
// If we've already parsed this pattern fragment, get it. Otherwise, handle
// the forward reference case where one pattern fragment references another
// before it is processed.
- if (TreePattern *PFRec = CDP.getPatternFragmentIfRead(Operator))
- return PFRec->getOnlyTree()->getNumTypes();
+ if (TreePattern *PFRec = CDP.getPatternFragmentIfRead(Operator)) {
+ // The number of results of a fragment with alternative records is the
+ // maximum number of results across all alternatives.
+ unsigned NumResults = 0;
+ for (auto T : PFRec->getTrees())
+ NumResults = std::max(NumResults, T->getNumTypes());
+ return NumResults;
+ }
- // Get the result tree.
- DagInit *Tree = Operator->getValueAsDag("Fragment");
- Record *Op = nullptr;
- if (Tree)
- if (DefInit *DI = dyn_cast<DefInit>(Tree->getOperator()))
- Op = DI->getDef();
- assert(Op && "Invalid Fragment");
- return GetNumNodeResults(Op, CDP);
+ ListInit *LI = Operator->getValueAsListInit("Fragments");
+ assert(LI && "Invalid Fragment");
+ unsigned NumResults = 0;
+ for (Init *I : LI->getValues()) {
+ Record *Op = nullptr;
+ if (DagInit *Dag = dyn_cast<DagInit>(I))
+ if (DefInit *DI = dyn_cast<DefInit>(Dag->getOperator()))
+ Op = DI->getDef();
+ assert(Op && "Invalid Fragment");
+ NumResults = std::max(NumResults, GetNumNodeResults(Op, CDP));
+ }
+ return NumResults;
}
if (Operator->isSubClassOf("Instruction")) {
@@ -1783,16 +1794,17 @@ bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N,
/// clone - Make a copy of this tree and all of its children.
///
-TreePatternNode *TreePatternNode::clone() const {
- TreePatternNode *New;
+TreePatternNodePtr TreePatternNode::clone() const {
+ TreePatternNodePtr New;
if (isLeaf()) {
- New = new TreePatternNode(getLeafValue(), getNumTypes());
+ New = std::make_shared<TreePatternNode>(getLeafValue(), getNumTypes());
} else {
- std::vector<TreePatternNode*> CChildren;
+ std::vector<TreePatternNodePtr> CChildren;
CChildren.reserve(Children.size());
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
CChildren.push_back(getChild(i)->clone());
- New = new TreePatternNode(getOperator(), CChildren, getNumTypes());
+ New = std::make_shared<TreePatternNode>(getOperator(), std::move(CChildren),
+ getNumTypes());
}
New->setName(getName());
New->Types = Types;
@@ -1813,8 +1825,8 @@ void TreePatternNode::RemoveAllTypes() {
/// SubstituteFormalArguments - Replace the formal arguments in this tree
/// with actual values specified by ArgMap.
-void TreePatternNode::
-SubstituteFormalArguments(std::map<std::string, TreePatternNode*> &ArgMap) {
+void TreePatternNode::SubstituteFormalArguments(
+ std::map<std::string, TreePatternNodePtr> &ArgMap) {
if (isLeaf()) return;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i) {
@@ -1826,12 +1838,12 @@ SubstituteFormalArguments(std::map<std::string, TreePatternNode*> &ArgMap) {
if (isa<UnsetInit>(Val) || (isa<DefInit>(Val) &&
cast<DefInit>(Val)->getDef()->getName() == "node")) {
// We found a use of a formal argument, replace it with its value.
- TreePatternNode *NewChild = ArgMap[Child->getName()];
+ TreePatternNodePtr NewChild = ArgMap[Child->getName()];
assert(NewChild && "Couldn't find formal argument!");
assert((Child->getPredicateFns().empty() ||
NewChild->getPredicateFns() == Child->getPredicateFns()) &&
"Non-empty child predicate clobbered!");
- setChild(i, NewChild);
+ setChild(i, std::move(NewChild));
}
} else {
getChild(i)->SubstituteFormalArguments(ArgMap);
@@ -1841,29 +1853,81 @@ SubstituteFormalArguments(std::map<std::string, TreePatternNode*> &ArgMap) {
/// InlinePatternFragments - If this pattern refers to any pattern
-/// fragments, inline them into place, giving us a pattern without any
-/// PatFrag references.
-TreePatternNode *TreePatternNode::InlinePatternFragments(TreePattern &TP) {
+/// fragments, return the set of inlined versions (this can be more than
+/// one if a PatFrags record has multiple alternatives).
+void TreePatternNode::InlinePatternFragments(
+ TreePatternNodePtr T, TreePattern &TP,
+ std::vector<TreePatternNodePtr> &OutAlternatives) {
+
if (TP.hasError())
- return nullptr;
+ return;
+
+ if (isLeaf()) {
+ OutAlternatives.push_back(T); // nothing to do.
+ return;
+ }
- if (isLeaf())
- return this; // nothing to do.
Record *Op = getOperator();
- if (!Op->isSubClassOf("PatFrag")) {
- // Just recursively inline children nodes.
- for (unsigned i = 0, e = getNumChildren(); i != e; ++i) {
- TreePatternNode *Child = getChild(i);
- TreePatternNode *NewChild = Child->InlinePatternFragments(TP);
+ if (!Op->isSubClassOf("PatFrags")) {
+ if (getNumChildren() == 0) {
+ OutAlternatives.push_back(T);
+ return;
+ }
- assert((Child->getPredicateFns().empty() ||
- NewChild->getPredicateFns() == Child->getPredicateFns()) &&
- "Non-empty child predicate clobbered!");
+ // Recursively inline children nodes.
+ std::vector<std::vector<TreePatternNodePtr> > ChildAlternatives;
+ ChildAlternatives.resize(getNumChildren());
+ for (unsigned i = 0, e = getNumChildren(); i != e; ++i) {
+ TreePatternNodePtr Child = getChildShared(i);
+ Child->InlinePatternFragments(Child, TP, ChildAlternatives[i]);
+ // If there are no alternatives for any child, there are no
+ // alternatives for this expression as whole.
+ if (ChildAlternatives[i].empty())
+ return;
- setChild(i, NewChild);
+ for (auto NewChild : ChildAlternatives[i])
+ assert((Child->getPredicateFns().empty() ||
+ NewChild->getPredicateFns() == Child->getPredicateFns()) &&
+ "Non-empty child predicate clobbered!");
}
- return this;
+
+ // The end result is an all-pairs construction of the resultant pattern.
+ std::vector<unsigned> Idxs;
+ Idxs.resize(ChildAlternatives.size());
+ bool NotDone;
+ do {
+ // Create the variant and add it to the output list.
+ std::vector<TreePatternNodePtr> NewChildren;
+ for (unsigned i = 0, e = ChildAlternatives.size(); i != e; ++i)
+ NewChildren.push_back(ChildAlternatives[i][Idxs[i]]);
+ TreePatternNodePtr R = std::make_shared<TreePatternNode>(
+ getOperator(), std::move(NewChildren), getNumTypes());
+
+ // Copy over properties.
+ R->setName(getName());
+ R->setPredicateFns(getPredicateFns());
+ R->setTransformFn(getTransformFn());
+ for (unsigned i = 0, e = getNumTypes(); i != e; ++i)
+ R->setType(i, getExtType(i));
+
+ // Register alternative.
+ OutAlternatives.push_back(R);
+
+ // Increment indices to the next permutation by incrementing the
+ // indices from last index backward, e.g., generate the sequence
+ // [0, 0], [0, 1], [1, 0], [1, 1].
+ int IdxsIdx;
+ for (IdxsIdx = Idxs.size() - 1; IdxsIdx >= 0; --IdxsIdx) {
+ if (++Idxs[IdxsIdx] == ChildAlternatives[IdxsIdx].size())
+ Idxs[IdxsIdx] = 0;
+ else
+ break;
+ }
+ NotDone = (IdxsIdx >= 0);
+ } while (NotDone);
+
+ return;
}
// Otherwise, we found a reference to a fragment. First, look up its
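
Seen on its own, the indexed do/while loop above is just a cross-product enumeration over the per-child alternative lists; the same odometer structure already exists in CombineChildVariants further down. A self-contained sketch, assuming nothing from LLVM (forEachCombination and its parameters are invented names):

    #include <cstddef>
    #include <vector>

    template <typename T, typename Fn>
    void forEachCombination(const std::vector<std::vector<T>> &Alternatives,
                            Fn Visit) {
      for (const auto &A : Alternatives)
        if (A.empty())
          return;                    // an empty factor kills every combination
      std::vector<std::size_t> Idxs(Alternatives.size(), 0);
      bool NotDone;
      do {
        std::vector<T> Current;
        for (std::size_t i = 0, e = Alternatives.size(); i != e; ++i)
          Current.push_back(Alternatives[i][Idxs[i]]);
        Visit(Current);
        // Advance like an odometer: bump the last index and carry leftward on
        // wrap-around, e.g. [0,0] -> [0,1] -> [1,0] -> [1,1].
        int IdxsIdx;
        for (IdxsIdx = static_cast<int>(Idxs.size()) - 1; IdxsIdx >= 0; --IdxsIdx) {
          if (++Idxs[IdxsIdx] == Alternatives[IdxsIdx].size())
            Idxs[IdxsIdx] = 0;
          else
            break;
        }
        NotDone = (IdxsIdx >= 0);
      } while (NotDone);
    }

With two children that each have two alternatives, Visit fires four times, matching the [0, 0], [0, 1], [1, 0], [1, 1] sequence the comment in the hunk describes.
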
@@ -1874,39 +1938,42 @@ TreePatternNode *TreePatternNode::InlinePatternFragments(TreePattern &TP) {
if (Frag->getNumArgs() != Children.size()) {
TP.error("'" + Op->getName() + "' fragment requires " +
Twine(Frag->getNumArgs()) + " operands!");
- return nullptr;
+ return;
}
- TreePatternNode *FragTree = Frag->getOnlyTree()->clone();
-
- TreePredicateFn PredFn(Frag);
- if (!PredFn.isAlwaysTrue())
- FragTree->addPredicateFn(PredFn);
+ // Compute the map of formal to actual arguments.
+ std::map<std::string, TreePatternNodePtr> ArgMap;
+ for (unsigned i = 0, e = Frag->getNumArgs(); i != e; ++i) {
+ const TreePatternNodePtr &Child = getChildShared(i);
+ ArgMap[Frag->getArgName(i)] = Child;
+ }
- // Resolve formal arguments to their actual value.
- if (Frag->getNumArgs()) {
- // Compute the map of formal to actual arguments.
- std::map<std::string, TreePatternNode*> ArgMap;
- for (unsigned i = 0, e = Frag->getNumArgs(); i != e; ++i)
- ArgMap[Frag->getArgName(i)] = getChild(i)->InlinePatternFragments(TP);
+ // Loop over all fragment alternatives.
+ for (auto Alternative : Frag->getTrees()) {
+ TreePatternNodePtr FragTree = Alternative->clone();
- FragTree->SubstituteFormalArguments(ArgMap);
- }
+ TreePredicateFn PredFn(Frag);
+ if (!PredFn.isAlwaysTrue())
+ FragTree->addPredicateFn(PredFn);
- FragTree->setName(getName());
- for (unsigned i = 0, e = Types.size(); i != e; ++i)
- FragTree->UpdateNodeType(i, getExtType(i), TP);
+ // Resolve formal arguments to their actual value.
+ if (Frag->getNumArgs())
+ FragTree->SubstituteFormalArguments(ArgMap);
- // Transfer in the old predicates.
- for (const TreePredicateFn &Pred : getPredicateFns())
- FragTree->addPredicateFn(Pred);
+ // Transfer types. Note that the resolved alternative may have fewer
+ // (but not more) results than the PatFrags node.
+ FragTree->setName(getName());
+ for (unsigned i = 0, e = FragTree->getNumTypes(); i != e; ++i)
+ FragTree->UpdateNodeType(i, getExtType(i), TP);
- // Get a new copy of this fragment to stitch into here.
- //delete this; // FIXME: implement refcounting!
+ // Transfer in the old predicates.
+ for (const TreePredicateFn &Pred : getPredicateFns())
+ FragTree->addPredicateFn(Pred);
- // The fragment we inlined could have recursive inlining that is needed. See
- // if there are any pattern fragments in it and inline them as needed.
- return FragTree->InlinePatternFragments(TP);
+ // The fragment we inlined could have recursive inlining that is needed. See
+ // if there are any pattern fragments in it and inline them as needed.
+ FragTree->InlinePatternFragments(FragTree, TP, OutAlternatives);
+ }
}
/// getImplicitType - Check to see if the specified record has an implicit
@@ -1953,7 +2020,7 @@ static TypeSetByHwMode getImplicitType(Record *R, unsigned ResNo,
return TypeSetByHwMode(T.getRegisterClass(R).getValueTypes());
}
- if (R->isSubClassOf("PatFrag")) {
+ if (R->isSubClassOf("PatFrags")) {
assert(ResNo == 0 && "FIXME: PatFrag with multiple results?");
// Pattern fragment types will be resolved when they are inlined.
return TypeSetByHwMode(); // Unknown.
@@ -2205,35 +2272,6 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
return false;
}
- // special handling for set, which isn't really an SDNode.
- if (getOperator()->getName() == "set") {
- assert(getNumTypes() == 0 && "Set doesn't produce a value");
- assert(getNumChildren() >= 2 && "Missing RHS of a set?");
- unsigned NC = getNumChildren();
-
- TreePatternNode *SetVal = getChild(NC-1);
- bool MadeChange = SetVal->ApplyTypeConstraints(TP, NotRegisters);
-
- for (unsigned i = 0; i < NC-1; ++i) {
- TreePatternNode *Child = getChild(i);
- MadeChange |= Child->ApplyTypeConstraints(TP, NotRegisters);
-
- // Types of operands must match.
- MadeChange |= Child->UpdateNodeType(0, SetVal->getExtType(i), TP);
- MadeChange |= SetVal->UpdateNodeType(i, Child->getExtType(0), TP);
- }
- return MadeChange;
- }
-
- if (getOperator()->getName() == "implicit") {
- assert(getNumTypes() == 0 && "Node doesn't produce a value");
-
- bool MadeChange = false;
- for (unsigned i = 0; i < getNumChildren(); ++i)
- MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters);
- return MadeChange;
- }
-
if (const CodeGenIntrinsic *Int = getIntrinsicInfo(CDP)) {
bool MadeChange = false;
@@ -2508,10 +2546,10 @@ TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
Trees.push_back(ParseTreePattern(Pat, ""));
}
-TreePattern::TreePattern(Record *TheRec, TreePatternNode *Pat, bool isInput,
- CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp),
- isInputPattern(isInput), HasError(false),
- Infer(*this) {
+TreePattern::TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput,
+ CodeGenDAGPatterns &cdp)
+ : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
+ Infer(*this) {
Trees.push_back(Pat);
}
@@ -2524,8 +2562,8 @@ void TreePattern::error(const Twine &Msg) {
}
void TreePattern::ComputeNamedNodes() {
- for (TreePatternNode *Tree : Trees)
- ComputeNamedNodes(Tree);
+ for (TreePatternNodePtr &Tree : Trees)
+ ComputeNamedNodes(Tree.get());
}
void TreePattern::ComputeNamedNodes(TreePatternNode *N) {
@@ -2536,22 +2574,22 @@ void TreePattern::ComputeNamedNodes(TreePatternNode *N) {
ComputeNamedNodes(N->getChild(i));
}
-
-TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
+TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit,
+ StringRef OpName) {
if (DefInit *DI = dyn_cast<DefInit>(TheInit)) {
Record *R = DI->getDef();
// Direct reference to a leaf DagNode or PatFrag? Turn it into a
// TreePatternNode of its own. For example:
/// (foo GPR, imm) -> (foo GPR, (imm))
- if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrag"))
+ if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrags"))
return ParseTreePattern(
DagInit::get(DI, nullptr,
std::vector<std::pair<Init*, StringInit*> >()),
OpName);
// Input argument?
- TreePatternNode *Res = new TreePatternNode(DI, 1);
+ TreePatternNodePtr Res = std::make_shared<TreePatternNode>(DI, 1);
if (R->getName() == "node" && !OpName.empty()) {
if (OpName.empty())
error("'node' argument requires a name to match with operand list");
@@ -2566,16 +2604,18 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
if (isa<UnsetInit>(TheInit)) {
if (OpName.empty())
error("'?' argument requires a name to match with operand list");
- TreePatternNode *Res = new TreePatternNode(TheInit, 1);
+ TreePatternNodePtr Res = std::make_shared<TreePatternNode>(TheInit, 1);
Args.push_back(OpName);
Res->setName(OpName);
return Res;
}
- if (IntInit *II = dyn_cast<IntInit>(TheInit)) {
+ if (isa<IntInit>(TheInit) || isa<BitInit>(TheInit)) {
if (!OpName.empty())
- error("Constant int argument should not have a name!");
- return new TreePatternNode(II, 1);
+ error("Constant int or bit argument should not have a name!");
+ if (isa<BitInit>(TheInit))
+ TheInit = TheInit->convertInitializerTo(IntRecTy::get());
+ return std::make_shared<TreePatternNode>(TheInit, 1);
}
if (BitsInit *BI = dyn_cast<BitsInit>(TheInit)) {
@@ -2601,8 +2641,8 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
if (Dag->getNumArgs() != 1)
error("Type cast only takes one operand!");
- TreePatternNode *New = ParseTreePattern(Dag->getArg(0),
- Dag->getArgNameStr(0));
+ TreePatternNodePtr New =
+ ParseTreePattern(Dag->getArg(0), Dag->getArgNameStr(0));
// Apply the type cast.
assert(New->getNumTypes() == 1 && "FIXME: Unhandled");
@@ -2615,7 +2655,7 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
}
// Verify that this is something that makes sense for an operator.
- if (!Operator->isSubClassOf("PatFrag") &&
+ if (!Operator->isSubClassOf("PatFrags") &&
!Operator->isSubClassOf("SDNode") &&
!Operator->isSubClassOf("Instruction") &&
!Operator->isSubClassOf("SDNodeXForm") &&
@@ -2650,7 +2690,7 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
error("Cannot use '" + Operator->getName() + "' in an output pattern!");
}
- std::vector<TreePatternNode*> Children;
+ std::vector<TreePatternNodePtr> Children;
// Parse all the operands.
for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i)
@@ -2660,7 +2700,7 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
// node (which is hard-coded to have either zero or one result).
unsigned NumResults = GetNumNodeResults(Operator, CDP);
- // If the operator is an intrinsic, then this is just syntactic sugar for for
+ // If the operator is an intrinsic, then this is just syntactic sugar for
// (intrinsic_* <number>, ..children..). Pick the right intrinsic node, and
// convert the intrinsic name to a number.
if (Operator->isSubClassOf("Intrinsic")) {
@@ -2677,13 +2717,13 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
else // Otherwise, no chain.
Operator = getDAGPatterns().get_intrinsic_wo_chain_sdnode();
- TreePatternNode *IIDNode = new TreePatternNode(IntInit::get(IID), 1);
- Children.insert(Children.begin(), IIDNode);
+ Children.insert(Children.begin(),
+ std::make_shared<TreePatternNode>(IntInit::get(IID), 1));
}
if (Operator->isSubClassOf("ComplexPattern")) {
for (unsigned i = 0; i < Children.size(); ++i) {
- TreePatternNode *Child = Children[i];
+ TreePatternNodePtr Child = Children[i];
if (Child->getName().empty())
error("All arguments to a ComplexPattern must be named");
@@ -2702,7 +2742,9 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
}
}
- TreePatternNode *Result = new TreePatternNode(Operator, Children, NumResults);
+ TreePatternNodePtr Result =
+ std::make_shared<TreePatternNode>(Operator, std::move(Children),
+ NumResults);
Result->setName(OpName);
if (Dag->getName()) {
@@ -2718,7 +2760,7 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
/// more type generic things and have useless type casts fold away.
///
/// This returns true if any change is made.
-static bool SimplifyTree(TreePatternNode *&N) {
+static bool SimplifyTree(TreePatternNodePtr &N) {
if (N->isLeaf())
return false;
@@ -2728,7 +2770,7 @@ static bool SimplifyTree(TreePatternNode *&N) {
N->getExtType(0).isValueTypeByHwMode(false) &&
N->getExtType(0) == N->getChild(0)->getExtType(0) &&
N->getName().empty()) {
- N = N->getChild(0);
+ N = N->getChildShared(0);
SimplifyTree(N);
return true;
}
@@ -2736,9 +2778,9 @@ static bool SimplifyTree(TreePatternNode *&N) {
// Walk all children.
bool MadeChange = false;
for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) {
- TreePatternNode *Child = N->getChild(i);
+ TreePatternNodePtr Child = N->getChildShared(i);
MadeChange |= SimplifyTree(Child);
- N->setChild(i, Child);
+ N->setChild(i, std::move(Child));
}
return MadeChange;
}
@@ -2756,7 +2798,7 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
bool MadeChange = true;
while (MadeChange) {
MadeChange = false;
- for (TreePatternNode *&Tree : Trees) {
+ for (TreePatternNodePtr &Tree : Trees) {
MadeChange |= Tree->ApplyTypeConstraints(*this, false);
MadeChange |= SimplifyTree(Tree);
}
@@ -2784,7 +2826,7 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
// changing the type of the input register in this case. This allows
// us to match things like:
// def : Pat<(v1i64 (bitconvert(v2i32 DPR:$src))), (v1i64 DPR:$src)>;
- if (Node == Trees[0] && Node->isLeaf()) {
+ if (Node == Trees[0].get() && Node->isLeaf()) {
DefInit *DI = dyn_cast<DefInit>(Node->getLeafValue());
if (DI && (DI->getDef()->isSubClassOf("RegisterClass") ||
DI->getDef()->isSubClassOf("RegisterOperand")))
@@ -2815,7 +2857,7 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
}
bool HasUnresolvedTypes = false;
- for (const TreePatternNode *Tree : Trees)
+ for (const TreePatternNodePtr &Tree : Trees)
HasUnresolvedTypes |= Tree->ContainsUnresolvedType(*this);
return !HasUnresolvedTypes;
}
@@ -2832,7 +2874,7 @@ void TreePattern::print(raw_ostream &OS) const {
if (Trees.size() > 1)
OS << "[\n";
- for (const TreePatternNode *Tree : Trees) {
+ for (const TreePatternNodePtr &Tree : Trees) {
OS << "\t";
Tree->print(OS);
OS << "\n";
@@ -2936,17 +2978,17 @@ void CodeGenDAGPatterns::ParseComplexPatterns() {
/// inside a pattern fragment to a pattern fragment.
///
void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
- std::vector<Record*> Fragments = Records.getAllDerivedDefinitions("PatFrag");
+ std::vector<Record*> Fragments = Records.getAllDerivedDefinitions("PatFrags");
// First step, parse all of the fragments.
for (Record *Frag : Fragments) {
if (OutFrags != Frag->isSubClassOf("OutPatFrag"))
continue;
- DagInit *Tree = Frag->getValueAsDag("Fragment");
+ ListInit *LI = Frag->getValueAsListInit("Fragments");
TreePattern *P =
(PatternFragments[Frag] = llvm::make_unique<TreePattern>(
- Frag, Tree, !Frag->isSubClassOf("OutPatFrag"),
+ Frag, LI, !Frag->isSubClassOf("OutPatFrag"),
*this)).get();
// Validate the argument list, converting it to set, to discard duplicates.
@@ -2994,13 +3036,15 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
// this fragment uses it.
TreePredicateFn PredFn(P);
if (!PredFn.isAlwaysTrue())
- P->getOnlyTree()->addPredicateFn(PredFn);
+ for (auto T : P->getTrees())
+ T->addPredicateFn(PredFn);
// If there is a node transformation corresponding to this, keep track of
// it.
Record *Transform = Frag->getValueAsDef("OperandTransform");
if (!getSDNodeTransform(Transform).second.empty()) // not noop xform?
- P->getOnlyTree()->setTransformFn(Transform);
+ for (auto T : P->getTrees())
+ T->setTransformFn(Transform);
}
// Now that we've parsed all of the tree fragments, do a closure on them so
@@ -3013,12 +3057,18 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
ThePat.InlinePatternFragments();
// Infer as many types as possible. Don't worry about it if we don't infer
- // all of them, some may depend on the inputs of the pattern.
- ThePat.InferAllTypes();
- ThePat.resetError();
+ // all of them, some may depend on the inputs of the pattern. Also, don't
+ // validate type sets; validation may cause spurious failures e.g. if a
+ // fragment needs floating-point types but the current target does not have
+ // any (this is only an error if that fragment is ever used!).
+ {
+ TypeInfer::SuppressValidation SV(ThePat.getInfer());
+ ThePat.InferAllTypes();
+ ThePat.resetError();
+ }
// If debugging, print out the pattern fragment result.
- DEBUG(ThePat.dump());
+ LLVM_DEBUG(ThePat.dump());
}
}
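
The SuppressValidation scope introduced above lets fragment parsing run type inference without rejecting fragments whose types do not exist on the current target; validation only has to happen once a fragment is actually used. The guard class itself is not part of this hunk; a hypothetical sketch of the usual RAII shape for such a guard (the Inferrer name and its members are invented here, and the real TypeInfer::SuppressValidation may differ in detail):

    class Inferrer {
      bool Validate = true;

    public:
      // Turns validation off for exactly one lexical scope and restores the
      // previous state on destruction, even on early returns.
      class SuppressValidation {
        Inferrer &Infer;
        bool Saved;

      public:
        explicit SuppressValidation(Inferrer &I) : Infer(I), Saved(I.Validate) {
          Infer.Validate = false;
        }
        ~SuppressValidation() { Infer.Validate = Saved; }
      };

      bool validating() const { return Validate; }
    };

In the hunk this is used as "TypeInfer::SuppressValidation SV(ThePat.getInfer());", so everything inside the braces, including InferAllTypes(), runs with validation disabled and the prior state comes back automatically.
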
@@ -3048,9 +3098,9 @@ void CodeGenDAGPatterns::ParseDefaultOperands() {
// Copy the operands over into a DAGDefaultOperand.
DAGDefaultOperand DefaultOpInfo;
- TreePatternNode *T = P.getTree(0);
+ const TreePatternNodePtr &T = P.getTree(0);
for (unsigned op = 0, e = T->getNumChildren(); op != e; ++op) {
- TreePatternNode *TPN = T->getChild(op);
+ TreePatternNodePtr TPN = T->getChildShared(op);
while (TPN->ApplyTypeConstraints(P, false))
/* Resolve all types */;
@@ -3059,7 +3109,7 @@ void CodeGenDAGPatterns::ParseDefaultOperands() {
DefaultOps[i]->getName() +
"' doesn't have a concrete type!");
}
- DefaultOpInfo.DefaultOps.push_back(TPN);
+ DefaultOpInfo.DefaultOps.push_back(std::move(TPN));
}
// Insert it into the DefaultOperands map so we can find it later.
@@ -3069,15 +3119,15 @@ void CodeGenDAGPatterns::ParseDefaultOperands() {
/// HandleUse - Given "Pat" a leaf in the pattern, check to see if it is an
/// instruction input. Return true if this is a real use.
-static bool HandleUse(TreePattern *I, TreePatternNode *Pat,
- std::map<std::string, TreePatternNode*> &InstInputs) {
+static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat,
+ std::map<std::string, TreePatternNodePtr> &InstInputs) {
// No name -> not interesting.
if (Pat->getName().empty()) {
if (Pat->isLeaf()) {
DefInit *DI = dyn_cast<DefInit>(Pat->getLeafValue());
if (DI && (DI->getDef()->isSubClassOf("RegisterClass") ||
DI->getDef()->isSubClassOf("RegisterOperand")))
- I->error("Input " + DI->getDef()->getName() + " must be named!");
+ I.error("Input " + DI->getDef()->getName() + " must be named!");
}
return false;
}
@@ -3085,7 +3135,8 @@ static bool HandleUse(TreePattern *I, TreePatternNode *Pat,
Record *Rec;
if (Pat->isLeaf()) {
DefInit *DI = dyn_cast<DefInit>(Pat->getLeafValue());
- if (!DI) I->error("Input $" + Pat->getName() + " must be an identifier!");
+ if (!DI)
+ I.error("Input $" + Pat->getName() + " must be an identifier!");
Rec = DI->getDef();
} else {
Rec = Pat->getOperator();
@@ -3095,7 +3146,7 @@ static bool HandleUse(TreePattern *I, TreePatternNode *Pat,
if (Rec->getName() == "srcvalue")
return false;
- TreePatternNode *&Slot = InstInputs[Pat->getName()];
+ TreePatternNodePtr &Slot = InstInputs[Pat->getName()];
if (!Slot) {
Slot = Pat;
return true;
@@ -3110,24 +3161,38 @@ static bool HandleUse(TreePattern *I, TreePatternNode *Pat,
// Ensure that the inputs agree if we've already seen this input.
if (Rec != SlotRec)
- I->error("All $" + Pat->getName() + " inputs must agree with each other");
+ I.error("All $" + Pat->getName() + " inputs must agree with each other");
+ // Ensure that the types can agree as well.
+ Slot->UpdateNodeType(0, Pat->getExtType(0), I);
+ Pat->UpdateNodeType(0, Slot->getExtType(0), I);
if (Slot->getExtTypes() != Pat->getExtTypes())
- I->error("All $" + Pat->getName() + " inputs must agree with each other");
+ I.error("All $" + Pat->getName() + " inputs must agree with each other");
return true;
}
/// FindPatternInputsAndOutputs - Scan the specified TreePatternNode (which is
/// part of "I", the instruction), computing the set of inputs and outputs of
/// the pattern. Report errors if we see anything naughty.
-void CodeGenDAGPatterns::
-FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
- std::map<std::string, TreePatternNode*> &InstInputs,
- std::map<std::string, TreePatternNode*>&InstResults,
- std::vector<Record*> &InstImpResults) {
+void CodeGenDAGPatterns::FindPatternInputsAndOutputs(
+ TreePattern &I, TreePatternNodePtr Pat,
+ std::map<std::string, TreePatternNodePtr> &InstInputs,
+ std::map<std::string, TreePatternNodePtr> &InstResults,
+ std::vector<Record *> &InstImpResults) {
+
+ // The instruction pattern still has unresolved fragments. For *named*
+ // nodes we must resolve those here. This may not result in multiple
+ // alternatives.
+ if (!Pat->getName().empty()) {
+ TreePattern SrcPattern(I.getRecord(), Pat, true, *this);
+ SrcPattern.InlinePatternFragments();
+ SrcPattern.InferAllTypes();
+ Pat = SrcPattern.getOnlyTree();
+ }
+
if (Pat->isLeaf()) {
bool isUse = HandleUse(I, Pat, InstInputs);
if (!isUse && Pat->getTransformFn())
- I->error("Cannot specify a transform function for a non-input value!");
+ I.error("Cannot specify a transform function for a non-input value!");
return;
}
@@ -3135,11 +3200,11 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) {
TreePatternNode *Dest = Pat->getChild(i);
if (!Dest->isLeaf())
- I->error("implicitly defined value should be a register!");
+ I.error("implicitly defined value should be a register!");
DefInit *Val = dyn_cast<DefInit>(Dest->getLeafValue());
if (!Val || !Val->getDef()->isSubClassOf("Register"))
- I->error("implicitly defined value should be a register!");
+ I.error("implicitly defined value should be a register!");
InstImpResults.push_back(Val->getDef());
}
return;
@@ -3150,9 +3215,9 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
// and recurse.
for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) {
if (Pat->getChild(i)->getNumTypes() == 0)
- I->error("Cannot have void nodes inside of patterns!");
- FindPatternInputsAndOutputs(I, Pat->getChild(i), InstInputs, InstResults,
- InstImpResults);
+ I.error("Cannot have void nodes inside of patterns!");
+ FindPatternInputsAndOutputs(I, Pat->getChildShared(i), InstInputs,
+ InstResults, InstImpResults);
}
// If this is a non-leaf node with no children, treat it basically as if
@@ -3160,27 +3225,33 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
bool isUse = HandleUse(I, Pat, InstInputs);
if (!isUse && Pat->getTransformFn())
- I->error("Cannot specify a transform function for a non-input value!");
+ I.error("Cannot specify a transform function for a non-input value!");
return;
}
// Otherwise, this is a set, validate and collect instruction results.
if (Pat->getNumChildren() == 0)
- I->error("set requires operands!");
+ I.error("set requires operands!");
if (Pat->getTransformFn())
- I->error("Cannot specify a transform function on a set node!");
+ I.error("Cannot specify a transform function on a set node!");
// Check the set destinations.
unsigned NumDests = Pat->getNumChildren()-1;
for (unsigned i = 0; i != NumDests; ++i) {
- TreePatternNode *Dest = Pat->getChild(i);
+ TreePatternNodePtr Dest = Pat->getChildShared(i);
+ // For set destinations we also must resolve fragments here.
+ TreePattern DestPattern(I.getRecord(), Dest, false, *this);
+ DestPattern.InlinePatternFragments();
+ DestPattern.InferAllTypes();
+ Dest = DestPattern.getOnlyTree();
+
if (!Dest->isLeaf())
- I->error("set destination should be a register!");
+ I.error("set destination should be a register!");
DefInit *Val = dyn_cast<DefInit>(Dest->getLeafValue());
if (!Val) {
- I->error("set destination should be a register!");
+ I.error("set destination should be a register!");
continue;
}
@@ -3189,20 +3260,20 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
Val->getDef()->isSubClassOf("RegisterOperand") ||
Val->getDef()->isSubClassOf("PointerLikeRegClass")) {
if (Dest->getName().empty())
- I->error("set destination must have a name!");
+ I.error("set destination must have a name!");
if (InstResults.count(Dest->getName()))
- I->error("cannot set '" + Dest->getName() +"' multiple times");
+ I.error("cannot set '" + Dest->getName() + "' multiple times");
InstResults[Dest->getName()] = Dest;
} else if (Val->getDef()->isSubClassOf("Register")) {
InstImpResults.push_back(Val->getDef());
} else {
- I->error("set destination should be a register!");
+ I.error("set destination should be a register!");
}
}
// Verify and collect info from the computation.
- FindPatternInputsAndOutputs(I, Pat->getChild(NumDests),
- InstInputs, InstResults, InstImpResults);
+ FindPatternInputsAndOutputs(I, Pat->getChildShared(NumDests), InstInputs,
+ InstResults, InstImpResults);
}
//===----------------------------------------------------------------------===//
@@ -3217,18 +3288,17 @@ public:
bool mayLoad;
bool isBitcast;
bool isVariadic;
+ bool hasChain;
InstAnalyzer(const CodeGenDAGPatterns &cdp)
: CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false),
- isBitcast(false), isVariadic(false) {}
-
- void Analyze(const TreePattern *Pat) {
- // Assume only the first tree is the pattern. The others are clobber nodes.
- AnalyzeNode(Pat->getTree(0));
- }
+ isBitcast(false), isVariadic(false), hasChain(false) {}
void Analyze(const PatternToMatch &Pat) {
- AnalyzeNode(Pat.getSrcPattern());
+ const TreePatternNode *N = Pat.getSrcPattern();
+ AnalyzeNode(N);
+ // These properties are detected only on the root node.
+ isBitcast = IsNodeBitcast(N);
}
private:
@@ -3236,20 +3306,12 @@ private:
if (hasSideEffects || mayLoad || mayStore || isVariadic)
return false;
- if (N->getNumChildren() != 2)
- return false;
-
- const TreePatternNode *N0 = N->getChild(0);
- if (!N0->isLeaf() || !isa<DefInit>(N0->getLeafValue()))
+ if (N->isLeaf())
return false;
-
- const TreePatternNode *N1 = N->getChild(1);
- if (N1->isLeaf())
- return false;
- if (N1->getNumChildren() != 1 || !N1->getChild(0)->isLeaf())
+ if (N->getNumChildren() != 1 || !N->getChild(0)->isLeaf())
return false;
- const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N1->getOperator());
+ const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N->getOperator());
if (OpInfo.getNumResults() != 1 || OpInfo.getNumOperands() != 1)
return false;
return OpInfo.getEnumName() == "ISD::BITCAST";
@@ -3275,17 +3337,12 @@ public:
for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
AnalyzeNode(N->getChild(i));
- // Ignore set nodes, which are not SDNodes.
- if (N->getOperator()->getName() == "set") {
- isBitcast = IsNodeBitcast(N);
- return;
- }
-
// Notice properties of the node.
if (N->NodeHasProperty(SDNPMayStore, CDP)) mayStore = true;
if (N->NodeHasProperty(SDNPMayLoad, CDP)) mayLoad = true;
if (N->NodeHasProperty(SDNPSideEffect, CDP)) hasSideEffects = true;
if (N->NodeHasProperty(SDNPVariadic, CDP)) isVariadic = true;
+ if (N->NodeHasProperty(SDNPHasChain, CDP)) hasChain = true;
if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) {
// If this is an intrinsic, analyze it.
@@ -3348,7 +3405,13 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo,
InstInfo.mayLoad |= PatInfo.mayLoad;
// These flags are silently added without any verification.
- InstInfo.isBitcast |= PatInfo.isBitcast;
+ // FIXME: To match historical behavior of TableGen, for now add those flags
+ // only when we're inferring from the primary instruction pattern.
+ if (PatDef->isSubClassOf("Instruction")) {
+ InstInfo.isBitcast |= PatInfo.isBitcast;
+ InstInfo.hasChain |= PatInfo.hasChain;
+ InstInfo.hasChain_Inferred = true;
+ }
// Don't infer isVariadic. This flag means something different on SDNodes and
// instructions. For example, a CALL SDNode is variadic because it has the
@@ -3419,37 +3482,30 @@ static bool checkOperandClass(CGIOperandList::OperandInfo &OI,
return false;
}
-const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
+void CodeGenDAGPatterns::parseInstructionPattern(
CodeGenInstruction &CGI, ListInit *Pat, DAGInstMap &DAGInsts) {
assert(!DAGInsts.count(CGI.TheDef) && "Instruction already parsed!");
// Parse the instruction.
- TreePattern *I = new TreePattern(CGI.TheDef, Pat, true, *this);
- // Inline pattern fragments into it.
- I->InlinePatternFragments();
-
- // Infer as many types as possible. If we cannot infer all of them, we can
- // never do anything with this instruction pattern: report it to the user.
- if (!I->InferAllTypes())
- I->error("Could not infer all types in pattern!");
+ TreePattern I(CGI.TheDef, Pat, true, *this);
// InstInputs - Keep track of all of the inputs of the instruction, along
// with the record they are declared as.
- std::map<std::string, TreePatternNode*> InstInputs;
+ std::map<std::string, TreePatternNodePtr> InstInputs;
// InstResults - Keep track of all the virtual registers that are 'set'
// in the instruction, including what reg class they are.
- std::map<std::string, TreePatternNode*> InstResults;
+ std::map<std::string, TreePatternNodePtr> InstResults;
std::vector<Record*> InstImpResults;
// Verify that the top-level forms in the instruction are of void type, and
// fill in the InstResults map.
SmallString<32> TypesString;
- for (unsigned j = 0, e = I->getNumTrees(); j != e; ++j) {
+ for (unsigned j = 0, e = I.getNumTrees(); j != e; ++j) {
TypesString.clear();
- TreePatternNode *Pat = I->getTree(j);
+ TreePatternNodePtr Pat = I.getTree(j);
if (Pat->getNumTypes() != 0) {
raw_svector_ostream OS(TypesString);
for (unsigned k = 0, ke = Pat->getNumTypes(); k != ke; ++k) {
@@ -3457,7 +3513,7 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
OS << ", ";
Pat->getExtType(k).writeToStream(OS);
}
- I->error("Top-level forms in instruction pattern should have"
+ I.error("Top-level forms in instruction pattern should have"
" void types, has types " +
OS.str());
}
@@ -3473,31 +3529,31 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
unsigned NumResults = InstResults.size();
// Parse the operands list from the (ops) list, validating it.
- assert(I->getArgList().empty() && "Args list should still be empty here!");
+ assert(I.getArgList().empty() && "Args list should still be empty here!");
// Check that all of the results occur first in the list.
std::vector<Record*> Results;
- SmallVector<TreePatternNode *, 2> ResNodes;
+ SmallVector<TreePatternNodePtr, 2> ResNodes;
for (unsigned i = 0; i != NumResults; ++i) {
if (i == CGI.Operands.size())
- I->error("'" + InstResults.begin()->first +
+ I.error("'" + InstResults.begin()->first +
"' set but does not appear in operand list!");
const std::string &OpName = CGI.Operands[i].Name;
// Check that it exists in InstResults.
- TreePatternNode *RNode = InstResults[OpName];
+ TreePatternNodePtr RNode = InstResults[OpName];
if (!RNode)
- I->error("Operand $" + OpName + " does not exist in operand list!");
+ I.error("Operand $" + OpName + " does not exist in operand list!");
- ResNodes.push_back(RNode);
Record *R = cast<DefInit>(RNode->getLeafValue())->getDef();
+ ResNodes.push_back(std::move(RNode));
if (!R)
- I->error("Operand $" + OpName + " should be a set destination: all "
+ I.error("Operand $" + OpName + " should be a set destination: all "
"outputs must occur before inputs in operand list!");
if (!checkOperandClass(CGI.Operands[i], R))
- I->error("Operand $" + OpName + " class mismatch!");
+ I.error("Operand $" + OpName + " class mismatch!");
// Remember the return type.
Results.push_back(CGI.Operands[i].Rec);
@@ -3506,19 +3562,16 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
InstResults.erase(OpName);
}
- // Loop over the inputs next. Make a copy of InstInputs so we can destroy
- // the copy while we're checking the inputs.
- std::map<std::string, TreePatternNode*> InstInputsCheck(InstInputs);
-
- std::vector<TreePatternNode*> ResultNodeOperands;
+ // Loop over the inputs next.
+ std::vector<TreePatternNodePtr> ResultNodeOperands;
std::vector<Record*> Operands;
for (unsigned i = NumResults, e = CGI.Operands.size(); i != e; ++i) {
CGIOperandList::OperandInfo &Op = CGI.Operands[i];
const std::string &OpName = Op.Name;
if (OpName.empty())
- I->error("Operand #" + Twine(i) + " in operands list has no name!");
+ I.error("Operand #" + Twine(i) + " in operands list has no name!");
- if (!InstInputsCheck.count(OpName)) {
+ if (!InstInputs.count(OpName)) {
// If this is an operand with a DefaultOps set filled in, we can ignore
// this. When we codegen it, we will do so as always executed.
if (Op.Rec->isSubClassOf("OperandWithDefaultOps")) {
@@ -3527,22 +3580,22 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
if (!getDefaultOperand(Op.Rec).DefaultOps.empty())
continue;
}
- I->error("Operand $" + OpName +
+ I.error("Operand $" + OpName +
" does not appear in the instruction pattern");
}
- TreePatternNode *InVal = InstInputsCheck[OpName];
- InstInputsCheck.erase(OpName); // It occurred, remove from map.
+ TreePatternNodePtr InVal = InstInputs[OpName];
+ InstInputs.erase(OpName); // It occurred, remove from map.
if (InVal->isLeaf() && isa<DefInit>(InVal->getLeafValue())) {
Record *InRec = static_cast<DefInit*>(InVal->getLeafValue())->getDef();
if (!checkOperandClass(Op, InRec))
- I->error("Operand $" + OpName + "'s register class disagrees"
+ I.error("Operand $" + OpName + "'s register class disagrees"
" between the operand and pattern");
}
Operands.push_back(Op.Rec);
// Construct the result for the dest-pattern operand list.
- TreePatternNode *OpNode = InVal->clone();
+ TreePatternNodePtr OpNode = InVal->clone();
// No predicate is useful on the result.
OpNode->clearPredicateFns();
@@ -3550,42 +3603,47 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
// Promote the xform function to be an explicit node if set.
if (Record *Xform = OpNode->getTransformFn()) {
OpNode->setTransformFn(nullptr);
- std::vector<TreePatternNode*> Children;
+ std::vector<TreePatternNodePtr> Children;
Children.push_back(OpNode);
- OpNode = new TreePatternNode(Xform, Children, OpNode->getNumTypes());
+ OpNode = std::make_shared<TreePatternNode>(Xform, std::move(Children),
+ OpNode->getNumTypes());
}
- ResultNodeOperands.push_back(OpNode);
+ ResultNodeOperands.push_back(std::move(OpNode));
}
- if (!InstInputsCheck.empty())
- I->error("Input operand $" + InstInputsCheck.begin()->first +
- " occurs in pattern but not in operands list!");
+ if (!InstInputs.empty())
+ I.error("Input operand $" + InstInputs.begin()->first +
+ " occurs in pattern but not in operands list!");
- TreePatternNode *ResultPattern =
- new TreePatternNode(I->getRecord(), ResultNodeOperands,
- GetNumNodeResults(I->getRecord(), *this));
+ TreePatternNodePtr ResultPattern = std::make_shared<TreePatternNode>(
+ I.getRecord(), std::move(ResultNodeOperands),
+ GetNumNodeResults(I.getRecord(), *this));
// Copy fully inferred output node types to instruction result pattern.
for (unsigned i = 0; i != NumResults; ++i) {
assert(ResNodes[i]->getNumTypes() == 1 && "FIXME: Unhandled");
ResultPattern->setType(i, ResNodes[i]->getExtType(0));
}
+ // FIXME: Assume only the first tree is the pattern. The others are clobber
+ // nodes.
+ TreePatternNodePtr Pattern = I.getTree(0);
+ TreePatternNodePtr SrcPattern;
+ if (Pattern->getOperator()->getName() == "set") {
+ SrcPattern = Pattern->getChild(Pattern->getNumChildren()-1)->clone();
+ } else{
+ // Not a set (store or something?)
+ SrcPattern = Pattern;
+ }
+
// Create and insert the instruction.
// FIXME: InstImpResults should not be part of DAGInstruction.
- DAGInstruction TheInst(I, Results, Operands, InstImpResults);
- DAGInsts.insert(std::make_pair(I->getRecord(), TheInst));
-
- // Use a temporary tree pattern to infer all types and make sure that the
- // constructed result is correct. This depends on the instruction already
- // being inserted into the DAGInsts map.
- TreePattern Temp(I->getRecord(), ResultPattern, false, *this);
- Temp.InferAllTypes(&I->getNamedNodesMap());
-
- DAGInstruction &TheInsertedInst = DAGInsts.find(I->getRecord())->second;
- TheInsertedInst.setResultPattern(Temp.getOnlyTree());
+ Record *R = I.getRecord();
+ DAGInsts.emplace(std::piecewise_construct, std::forward_as_tuple(R),
+ std::forward_as_tuple(Results, Operands, InstImpResults,
+ SrcPattern, ResultPattern));
- return TheInsertedInst;
+ LLVM_DEBUG(I.dump());
}
/// ParseInstructions - Parse all of the instructions, inlining and resolving
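
parseInstructionPattern now builds the DAGInstruction directly inside the DAGInsts map via std::piecewise_construct, instead of constructing a temporary and copying it in. The idiom, shown on a stand-alone map with a multi-argument value type (Widget and its fields are invented for the sketch):

    #include <map>
    #include <string>
    #include <tuple>
    #include <utility>
    #include <vector>

    struct Widget {
      std::vector<int> Results;
      std::string Label;
      Widget(std::vector<int> R, std::string L)
          : Results(std::move(R)), Label(std::move(L)) {}
    };

    int main() {
      std::map<std::string, Widget> Widgets;
      // Key and value are each constructed in place from their own argument
      // pack; no temporary Widget is built and then copied into the map.
      Widgets.emplace(std::piecewise_construct,
                      std::forward_as_tuple("ADDrr"),
                      std::forward_as_tuple(std::vector<int>{1, 2}, "add"));
      return Widgets.count("ADDrr") == 1 ? 0 : 1;
    }

std::forward_as_tuple keeps the arguments as references, so the map's pair constructor can forward them straight into the key and the value without an intermediate object.
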
@@ -3625,51 +3683,32 @@ void CodeGenDAGPatterns::ParseInstructions() {
// Create and insert the instruction.
std::vector<Record*> ImpResults;
Instructions.insert(std::make_pair(Instr,
- DAGInstruction(nullptr, Results, Operands, ImpResults)));
+ DAGInstruction(Results, Operands, ImpResults)));
continue; // no pattern.
}
CodeGenInstruction &CGI = Target.getInstruction(Instr);
- const DAGInstruction &DI = parseInstructionPattern(CGI, LI, Instructions);
-
- (void)DI;
- DEBUG(DI.getPattern()->dump());
+ parseInstructionPattern(CGI, LI, Instructions);
}
// If we can, convert the instructions to be patterns that are matched!
for (auto &Entry : Instructions) {
+ Record *Instr = Entry.first;
DAGInstruction &TheInst = Entry.second;
- TreePattern *I = TheInst.getPattern();
- if (!I) continue; // No pattern.
-
- if (PatternRewriter)
- PatternRewriter(I);
- // FIXME: Assume only the first tree is the pattern. The others are clobber
- // nodes.
- TreePatternNode *Pattern = I->getTree(0);
- TreePatternNode *SrcPattern;
- if (Pattern->getOperator()->getName() == "set") {
- SrcPattern = Pattern->getChild(Pattern->getNumChildren()-1)->clone();
- } else{
- // Not a set (store or something?)
- SrcPattern = Pattern;
- }
+ TreePatternNodePtr SrcPattern = TheInst.getSrcPattern();
+ TreePatternNodePtr ResultPattern = TheInst.getResultPattern();
- Record *Instr = Entry.first;
- ListInit *Preds = Instr->getValueAsListInit("Predicates");
- int Complexity = Instr->getValueAsInt("AddedComplexity");
- AddPatternToMatch(
- I,
- PatternToMatch(Instr, makePredList(Preds), SrcPattern,
- TheInst.getResultPattern(), TheInst.getImpResults(),
- Complexity, Instr->getID()));
+ if (SrcPattern && ResultPattern) {
+ TreePattern Pattern(Instr, SrcPattern, true, *this);
+ TreePattern Result(Instr, ResultPattern, false, *this);
+ ParseOnePattern(Instr, Pattern, Result, TheInst.getImpResults());
+ }
}
}
+typedef std::pair<TreePatternNode *, unsigned> NameRecord;
-typedef std::pair<const TreePatternNode*, unsigned> NameRecord;
-
-static void FindNames(const TreePatternNode *P,
+static void FindNames(TreePatternNode *P,
std::map<std::string, NameRecord> &Names,
TreePattern *PatternTop) {
if (!P->getName().empty()) {
@@ -3698,7 +3737,7 @@ std::vector<Predicate> CodeGenDAGPatterns::makePredList(ListInit *L) {
}
// Sort so that different orders get canonicalized to the same string.
- std::sort(Preds.begin(), Preds.end());
+ llvm::sort(Preds.begin(), Preds.end());
return Preds;
}
@@ -3742,34 +3781,18 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern,
SrcNames[Entry.first].second == 1)
Pattern->error("Pattern has dead named input: $" + Entry.first);
- PatternsToMatch.push_back(std::move(PTM));
+ PatternsToMatch.push_back(PTM);
}
void CodeGenDAGPatterns::InferInstructionFlags() {
ArrayRef<const CodeGenInstruction*> Instructions =
Target.getInstructionsByEnumValue();
- // First try to infer flags from the primary instruction pattern, if any.
- SmallVector<CodeGenInstruction*, 8> Revisit;
unsigned Errors = 0;
- for (unsigned i = 0, e = Instructions.size(); i != e; ++i) {
- CodeGenInstruction &InstInfo =
- const_cast<CodeGenInstruction &>(*Instructions[i]);
-
- // Get the primary instruction pattern.
- const TreePattern *Pattern = getInstruction(InstInfo.TheDef).getPattern();
- if (!Pattern) {
- if (InstInfo.hasUndefFlags())
- Revisit.push_back(&InstInfo);
- continue;
- }
- InstAnalyzer PatInfo(*this);
- PatInfo.Analyze(Pattern);
- Errors += InferFromPattern(InstInfo, PatInfo, InstInfo.TheDef);
- }
- // Second, look for single-instruction patterns defined outside the
- // instruction.
+ // Try to infer flags from all patterns in PatternToMatch. These include
+ // both the primary instruction patterns (which always come first) and
+ // patterns defined outside the instruction.
for (const PatternToMatch &PTM : ptms()) {
// We can only infer from single-instruction patterns, otherwise we won't
// know which instruction should get the flags.
@@ -3793,9 +3816,11 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
if (Errors)
PrintFatalError("pattern conflicts");
- // Revisit instructions with undefined flags and no pattern.
+ // If requested by the target, guess any undefined properties.
if (Target.guessInstructionProperties()) {
- for (CodeGenInstruction *InstInfo : Revisit) {
+ for (unsigned i = 0, e = Instructions.size(); i != e; ++i) {
+ CodeGenInstruction *InstInfo =
+ const_cast<CodeGenInstruction *>(Instructions[i]);
if (InstInfo->InferredFrom)
continue;
// The mayLoad and mayStore flags default to false.
@@ -3807,7 +3832,9 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
}
// Complain about any flags that are still undefined.
- for (CodeGenInstruction *InstInfo : Revisit) {
+ for (unsigned i = 0, e = Instructions.size(); i != e; ++i) {
+ CodeGenInstruction *InstInfo =
+ const_cast<CodeGenInstruction *>(Instructions[i]);
if (InstInfo->InferredFrom)
continue;
if (InstInfo->hasSideEffects_Unset)
@@ -3919,6 +3946,122 @@ static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) {
return false;
}
+void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef,
+ TreePattern &Pattern, TreePattern &Result,
+ const std::vector<Record *> &InstImpResults) {
+
+ // Inline pattern fragments and expand multiple alternatives.
+ Pattern.InlinePatternFragments();
+ Result.InlinePatternFragments();
+
+ if (Result.getNumTrees() != 1)
+ Result.error("Cannot use multi-alternative fragments in result pattern!");
+
+ // Infer types.
+ bool IterateInference;
+ bool InferredAllPatternTypes, InferredAllResultTypes;
+ do {
+ // Infer as many types as possible. If we cannot infer all of them, we
+ // can never do anything with this pattern: report it to the user.
+ InferredAllPatternTypes =
+ Pattern.InferAllTypes(&Pattern.getNamedNodesMap());
+
+ // Infer as many types as possible. If we cannot infer all of them, we
+ // can never do anything with this pattern: report it to the user.
+ InferredAllResultTypes =
+ Result.InferAllTypes(&Pattern.getNamedNodesMap());
+
+ IterateInference = false;
+
+ // Apply the type of the result to the source pattern. This helps us
+ // resolve cases where the input type is known to be a pointer type (which
+ // is considered resolved), but the result knows it needs to be 32- or
+ // 64-bits. Infer the other way for good measure.
+ for (auto T : Pattern.getTrees())
+ for (unsigned i = 0, e = std::min(Result.getOnlyTree()->getNumTypes(),
+ T->getNumTypes());
+ i != e; ++i) {
+ IterateInference |= T->UpdateNodeType(
+ i, Result.getOnlyTree()->getExtType(i), Result);
+ IterateInference |= Result.getOnlyTree()->UpdateNodeType(
+ i, T->getExtType(i), Result);
+ }
+
+ // If our iteration has converged and the input pattern's types are fully
+ // resolved but the result pattern is not fully resolved, we may have a
+ // situation where we have two instructions in the result pattern and
+ // the instructions require a common register class, but don't care about
+ // what actual MVT is used. This is actually a bug in our modelling:
+ // output patterns should have register classes, not MVTs.
+ //
+ // In any case, to handle this, we just go through and disambiguate some
+ // arbitrary types to the result pattern's nodes.
+ if (!IterateInference && InferredAllPatternTypes &&
+ !InferredAllResultTypes)
+ IterateInference =
+ ForceArbitraryInstResultType(Result.getTree(0).get(), Result);
+ } while (IterateInference);
+
+ // Verify that we inferred enough types that we can do something with the
+ // pattern and result. If these fire the user has to add type casts.
+ if (!InferredAllPatternTypes)
+ Pattern.error("Could not infer all types in pattern!");
+ if (!InferredAllResultTypes) {
+ Pattern.dump();
+ Result.error("Could not infer all types in pattern result!");
+ }
+
+ // Promote the xform function to be an explicit node if set.
+ const TreePatternNodePtr &DstPattern = Result.getOnlyTree();
+ std::vector<TreePatternNodePtr> ResultNodeOperands;
+ for (unsigned ii = 0, ee = DstPattern->getNumChildren(); ii != ee; ++ii) {
+ TreePatternNodePtr OpNode = DstPattern->getChildShared(ii);
+ if (Record *Xform = OpNode->getTransformFn()) {
+ OpNode->setTransformFn(nullptr);
+ std::vector<TreePatternNodePtr> Children;
+ Children.push_back(OpNode);
+ OpNode = std::make_shared<TreePatternNode>(Xform, std::move(Children),
+ OpNode->getNumTypes());
+ }
+ ResultNodeOperands.push_back(OpNode);
+ }
+
+ TreePatternNodePtr DstShared =
+ DstPattern->isLeaf()
+ ? DstPattern
+ : std::make_shared<TreePatternNode>(DstPattern->getOperator(),
+ std::move(ResultNodeOperands),
+ DstPattern->getNumTypes());
+
+ for (unsigned i = 0, e = Result.getOnlyTree()->getNumTypes(); i != e; ++i)
+ DstShared->setType(i, Result.getOnlyTree()->getExtType(i));
+
+ TreePattern Temp(Result.getRecord(), DstShared, false, *this);
+ Temp.InferAllTypes();
+
+ ListInit *Preds = TheDef->getValueAsListInit("Predicates");
+ int Complexity = TheDef->getValueAsInt("AddedComplexity");
+
+ if (PatternRewriter)
+ PatternRewriter(&Pattern);
+
+ // A pattern may end up with an "impossible" type, i.e. a situation
+ // where all types have been eliminated for some node in this pattern.
+ // This could occur for intrinsics that only make sense for a specific
+ // value type, and use a specific register class. If, for some mode,
+ // that register class does not accept that type, the type inference
+ // will lead to a contradiction, which is not an error however, but
+ // a sign that this pattern will simply never match.
+ if (Temp.getOnlyTree()->hasPossibleType())
+ for (auto T : Pattern.getTrees())
+ if (T->hasPossibleType())
+ AddPatternToMatch(&Pattern,
+ PatternToMatch(TheDef, makePredList(Preds),
+ T, Temp.getOnlyTree(),
+ InstImpResults, Complexity,
+ TheDef->getID()));
+}
+
void CodeGenDAGPatterns::ParsePatterns() {
std::vector<Record*> Patterns = Records.getAllDerivedDefinitions("Pattern");
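
The inference loop inside the new ParseOnePattern (factored out of the old ParsePatterns body, whose removal follows in the next hunks) is a fixed-point iteration: types flow from the source pattern to the result pattern and back until neither direction learns anything new, and only then are arbitrary result types forced as a last resort. Stripped of the TableGen machinery, the control flow reduces to a skeleton like this (the function and parameter names are placeholders, not LLVM APIs):

    #include <functional>

    // Illustrative skeleton only; the real loop calls InferAllTypes on both
    // patterns, cross-copies node types with UpdateNodeType, and falls back to
    // ForceArbitraryInstResultType when propagation alone stalls.
    void inferToFixedPoint(const std::function<bool()> &PropagateForward,
                           const std::function<bool()> &PropagateBackward,
                           const std::function<bool()> &ForceFallback) {
      bool Changed;
      do {
        Changed = false;
        Changed |= PropagateForward();    // source pattern -> result pattern
        Changed |= PropagateBackward();   // result pattern -> source pattern
        // Nothing moved: pick arbitrary types once and give propagation
        // another chance to react before giving up.
        if (!Changed)
          Changed = ForceFallback();
      } while (Changed);
    }
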
@@ -3929,10 +4072,7 @@ void CodeGenDAGPatterns::ParsePatterns() {
if (hasNullFragReference(Tree))
continue;
- TreePattern *Pattern = new TreePattern(CurPattern, Tree, true, *this);
-
- // Inline pattern fragments into it.
- Pattern->InlinePatternFragments();
+ TreePattern Pattern(CurPattern, Tree, true, *this);
ListInit *LI = CurPattern->getValueAsListInit("ResultInstrs");
if (LI->empty()) continue; // no pattern.
@@ -3940,119 +4080,19 @@ void CodeGenDAGPatterns::ParsePatterns() {
// Parse the instruction.
TreePattern Result(CurPattern, LI, false, *this);
- // Inline pattern fragments into it.
- Result.InlinePatternFragments();
-
if (Result.getNumTrees() != 1)
Result.error("Cannot handle instructions producing instructions "
"with temporaries yet!");
- bool IterateInference;
- bool InferredAllPatternTypes, InferredAllResultTypes;
- do {
- // Infer as many types as possible. If we cannot infer all of them, we
- // can never do anything with this pattern: report it to the user.
- InferredAllPatternTypes =
- Pattern->InferAllTypes(&Pattern->getNamedNodesMap());
-
- // Infer as many types as possible. If we cannot infer all of them, we
- // can never do anything with this pattern: report it to the user.
- InferredAllResultTypes =
- Result.InferAllTypes(&Pattern->getNamedNodesMap());
-
- IterateInference = false;
-
- // Apply the type of the result to the source pattern. This helps us
- // resolve cases where the input type is known to be a pointer type (which
- // is considered resolved), but the result knows it needs to be 32- or
- // 64-bits. Infer the other way for good measure.
- for (unsigned i = 0, e = std::min(Result.getTree(0)->getNumTypes(),
- Pattern->getTree(0)->getNumTypes());
- i != e; ++i) {
- IterateInference = Pattern->getTree(0)->UpdateNodeType(
- i, Result.getTree(0)->getExtType(i), Result);
- IterateInference |= Result.getTree(0)->UpdateNodeType(
- i, Pattern->getTree(0)->getExtType(i), Result);
- }
-
- // If our iteration has converged and the input pattern's types are fully
- // resolved but the result pattern is not fully resolved, we may have a
- // situation where we have two instructions in the result pattern and
- // the instructions require a common register class, but don't care about
- // what actual MVT is used. This is actually a bug in our modelling:
- // output patterns should have register classes, not MVTs.
- //
- // In any case, to handle this, we just go through and disambiguate some
- // arbitrary types to the result pattern's nodes.
- if (!IterateInference && InferredAllPatternTypes &&
- !InferredAllResultTypes)
- IterateInference =
- ForceArbitraryInstResultType(Result.getTree(0), Result);
- } while (IterateInference);
-
- // Verify that we inferred enough types that we can do something with the
- // pattern and result. If these fire the user has to add type casts.
- if (!InferredAllPatternTypes)
- Pattern->error("Could not infer all types in pattern!");
- if (!InferredAllResultTypes) {
- Pattern->dump();
- Result.error("Could not infer all types in pattern result!");
- }
-
// Validate that the input pattern is correct.
- std::map<std::string, TreePatternNode*> InstInputs;
- std::map<std::string, TreePatternNode*> InstResults;
+ std::map<std::string, TreePatternNodePtr> InstInputs;
+ std::map<std::string, TreePatternNodePtr> InstResults;
std::vector<Record*> InstImpResults;
- for (unsigned j = 0, ee = Pattern->getNumTrees(); j != ee; ++j)
- FindPatternInputsAndOutputs(Pattern, Pattern->getTree(j),
- InstInputs, InstResults,
- InstImpResults);
+ for (unsigned j = 0, ee = Pattern.getNumTrees(); j != ee; ++j)
+ FindPatternInputsAndOutputs(Pattern, Pattern.getTree(j), InstInputs,
+ InstResults, InstImpResults);
- // Promote the xform function to be an explicit node if set.
- TreePatternNode *DstPattern = Result.getOnlyTree();
- std::vector<TreePatternNode*> ResultNodeOperands;
- for (unsigned ii = 0, ee = DstPattern->getNumChildren(); ii != ee; ++ii) {
- TreePatternNode *OpNode = DstPattern->getChild(ii);
- if (Record *Xform = OpNode->getTransformFn()) {
- OpNode->setTransformFn(nullptr);
- std::vector<TreePatternNode*> Children;
- Children.push_back(OpNode);
- OpNode = new TreePatternNode(Xform, Children, OpNode->getNumTypes());
- }
- ResultNodeOperands.push_back(OpNode);
- }
- DstPattern = Result.getOnlyTree();
- if (!DstPattern->isLeaf())
- DstPattern = new TreePatternNode(DstPattern->getOperator(),
- ResultNodeOperands,
- DstPattern->getNumTypes());
-
- for (unsigned i = 0, e = Result.getOnlyTree()->getNumTypes(); i != e; ++i)
- DstPattern->setType(i, Result.getOnlyTree()->getExtType(i));
-
- TreePattern Temp(Result.getRecord(), DstPattern, false, *this);
- Temp.InferAllTypes();
-
- // A pattern may end up with an "impossible" type, i.e. a situation
- // where all types have been eliminated for some node in this pattern.
- // This could occur for intrinsics that only make sense for a specific
- // value type, and use a specific register class. If, for some mode,
- // that register class does not accept that type, the type inference
- // will lead to a contradiction, which is not an error however, but
- // a sign that this pattern will simply never match.
- if (Pattern->getTree(0)->hasPossibleType() &&
- Temp.getOnlyTree()->hasPossibleType()) {
- ListInit *Preds = CurPattern->getValueAsListInit("Predicates");
- int Complexity = CurPattern->getValueAsInt("AddedComplexity");
- if (PatternRewriter)
- PatternRewriter(Pattern);
- AddPatternToMatch(
- Pattern,
- PatternToMatch(
- CurPattern, makePredList(Preds), Pattern->getTree(0),
- Temp.getOnlyTree(), std::move(InstImpResults), Complexity,
- CurPattern->getID()));
- }
+ ParseOnePattern(CurPattern, Pattern, Result, InstImpResults);
}
}
@@ -4071,25 +4111,24 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
std::vector<PatternToMatch> Copy = PatternsToMatch;
PatternsToMatch.clear();
- auto AppendPattern = [this,&ModeChecks](PatternToMatch &P, unsigned Mode) {
- TreePatternNode *NewSrc = P.SrcPattern->clone();
- TreePatternNode *NewDst = P.DstPattern->clone();
+ auto AppendPattern = [this, &ModeChecks](PatternToMatch &P, unsigned Mode) {
+ TreePatternNodePtr NewSrc = P.SrcPattern->clone();
+ TreePatternNodePtr NewDst = P.DstPattern->clone();
if (!NewSrc->setDefaultMode(Mode) || !NewDst->setDefaultMode(Mode)) {
- delete NewSrc;
- delete NewDst;
return;
}
std::vector<Predicate> Preds = P.Predicates;
const std::vector<Predicate> &MC = ModeChecks[Mode];
Preds.insert(Preds.end(), MC.begin(), MC.end());
- PatternsToMatch.emplace_back(P.getSrcRecord(), Preds, NewSrc, NewDst,
- P.getDstRegs(), P.getAddedComplexity(),
- Record::getNewUID(), Mode);
+ PatternsToMatch.emplace_back(P.getSrcRecord(), Preds, std::move(NewSrc),
+ std::move(NewDst), P.getDstRegs(),
+ P.getAddedComplexity(), Record::getNewUID(),
+ Mode);
};
for (PatternToMatch &P : Copy) {
- TreePatternNode *SrcP = nullptr, *DstP = nullptr;
+ TreePatternNodePtr SrcP = nullptr, DstP = nullptr;
if (P.SrcPattern->hasProperTypeByHwMode())
SrcP = P.SrcPattern;
if (P.DstPattern->hasProperTypeByHwMode())
@@ -4101,9 +4140,9 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
std::set<unsigned> Modes;
if (SrcP)
- collectModes(Modes, SrcP);
+ collectModes(Modes, SrcP.get());
if (DstP)
- collectModes(Modes, DstP);
+ collectModes(Modes, DstP.get());
// The predicate for the default mode needs to be constructed for each
// pattern separately.
@@ -4171,13 +4210,13 @@ static void FindDepVars(TreePatternNode *N, MultipleUseVarSet &DepVars) {
/// Dump the dependent variable set:
static void DumpDepVars(MultipleUseVarSet &DepVars) {
if (DepVars.empty()) {
- DEBUG(errs() << "<empty set>");
+ LLVM_DEBUG(errs() << "<empty set>");
} else {
- DEBUG(errs() << "[ ");
+ LLVM_DEBUG(errs() << "[ ");
for (const auto &DepVar : DepVars) {
- DEBUG(errs() << DepVar.getKey() << " ");
+ LLVM_DEBUG(errs() << DepVar.getKey() << " ");
}
- DEBUG(errs() << "]");
+ LLVM_DEBUG(errs() << "]");
}
}
#endif
@@ -4185,11 +4224,11 @@ static void DumpDepVars(MultipleUseVarSet &DepVars) {
/// CombineChildVariants - Given a bunch of permutations of each child of the
/// 'operator' node, put them together in all possible ways.
-static void CombineChildVariants(TreePatternNode *Orig,
- const std::vector<std::vector<TreePatternNode*> > &ChildVariants,
- std::vector<TreePatternNode*> &OutVariants,
- CodeGenDAGPatterns &CDP,
- const MultipleUseVarSet &DepVars) {
+static void CombineChildVariants(
+ TreePatternNodePtr Orig,
+ const std::vector<std::vector<TreePatternNodePtr>> &ChildVariants,
+ std::vector<TreePatternNodePtr> &OutVariants, CodeGenDAGPatterns &CDP,
+ const MultipleUseVarSet &DepVars) {
// Make sure that each operand has at least one variant to choose from.
for (const auto &Variants : ChildVariants)
if (Variants.empty())
@@ -4201,20 +4240,20 @@ static void CombineChildVariants(TreePatternNode *Orig,
bool NotDone;
do {
#ifndef NDEBUG
- DEBUG(if (!Idxs.empty()) {
- errs() << Orig->getOperator()->getName() << ": Idxs = [ ";
- for (unsigned Idx : Idxs) {
- errs() << Idx << " ";
- }
- errs() << "]\n";
- });
+ LLVM_DEBUG(if (!Idxs.empty()) {
+ errs() << Orig->getOperator()->getName() << ": Idxs = [ ";
+ for (unsigned Idx : Idxs) {
+ errs() << Idx << " ";
+ }
+ errs() << "]\n";
+ });
#endif
// Create the variant and add it to the output list.
- std::vector<TreePatternNode*> NewChildren;
+ std::vector<TreePatternNodePtr> NewChildren;
for (unsigned i = 0, e = ChildVariants.size(); i != e; ++i)
NewChildren.push_back(ChildVariants[i][Idxs[i]]);
- auto R = llvm::make_unique<TreePatternNode>(
- Orig->getOperator(), NewChildren, Orig->getNumTypes());
+ TreePatternNodePtr R = std::make_shared<TreePatternNode>(
+ Orig->getOperator(), std::move(NewChildren), Orig->getNumTypes());
// Copy over properties.
R->setName(Orig->getName());
@@ -4230,10 +4269,10 @@ static void CombineChildVariants(TreePatternNode *Orig,
// (and GPRC:$a, GPRC:$b) -> (and GPRC:$b, GPRC:$a)
// which are the same pattern. Ignore the dups.
if (R->canPatternMatch(ErrString, CDP) &&
- none_of(OutVariants, [&](TreePatternNode *Variant) {
- return R->isIsomorphicTo(Variant, DepVars);
+ none_of(OutVariants, [&](TreePatternNodePtr Variant) {
+ return R->isIsomorphicTo(Variant.get(), DepVars);
}))
- OutVariants.push_back(R.release());
+ OutVariants.push_back(R);
// Increment indices to the next permutation by incrementing the
// indices from last index backward, e.g., generate the sequence
@@ -4251,21 +4290,21 @@ static void CombineChildVariants(TreePatternNode *Orig,
/// CombineChildVariants - A helper function for binary operators.
///
-static void CombineChildVariants(TreePatternNode *Orig,
- const std::vector<TreePatternNode*> &LHS,
- const std::vector<TreePatternNode*> &RHS,
- std::vector<TreePatternNode*> &OutVariants,
+static void CombineChildVariants(TreePatternNodePtr Orig,
+ const std::vector<TreePatternNodePtr> &LHS,
+ const std::vector<TreePatternNodePtr> &RHS,
+ std::vector<TreePatternNodePtr> &OutVariants,
CodeGenDAGPatterns &CDP,
const MultipleUseVarSet &DepVars) {
- std::vector<std::vector<TreePatternNode*> > ChildVariants;
+ std::vector<std::vector<TreePatternNodePtr>> ChildVariants;
ChildVariants.push_back(LHS);
ChildVariants.push_back(RHS);
CombineChildVariants(Orig, ChildVariants, OutVariants, CDP, DepVars);
}
-
-static void GatherChildrenOfAssociativeOpcode(TreePatternNode *N,
- std::vector<TreePatternNode *> &Children) {
+static void
+GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N,
+ std::vector<TreePatternNodePtr> &Children) {
assert(N->getNumChildren()==2 &&"Associative but doesn't have 2 children!");
Record *Operator = N->getOperator();
@@ -4277,21 +4316,21 @@ static void GatherChildrenOfAssociativeOpcode(TreePatternNode *N,
}
if (N->getChild(0)->isLeaf() || N->getChild(0)->getOperator() != Operator)
- Children.push_back(N->getChild(0));
+ Children.push_back(N->getChildShared(0));
else
- GatherChildrenOfAssociativeOpcode(N->getChild(0), Children);
+ GatherChildrenOfAssociativeOpcode(N->getChildShared(0), Children);
if (N->getChild(1)->isLeaf() || N->getChild(1)->getOperator() != Operator)
- Children.push_back(N->getChild(1));
+ Children.push_back(N->getChildShared(1));
else
- GatherChildrenOfAssociativeOpcode(N->getChild(1), Children);
+ GatherChildrenOfAssociativeOpcode(N->getChildShared(1), Children);
}
/// GenerateVariantsOf - Given a pattern N, generate all permutations we can of
/// the (potentially recursive) pattern by using algebraic laws.
///
-static void GenerateVariantsOf(TreePatternNode *N,
- std::vector<TreePatternNode*> &OutVariants,
+static void GenerateVariantsOf(TreePatternNodePtr N,
+ std::vector<TreePatternNodePtr> &OutVariants,
CodeGenDAGPatterns &CDP,
const MultipleUseVarSet &DepVars) {
// We cannot permute leaves or ComplexPattern uses.
@@ -4306,14 +4345,14 @@ static void GenerateVariantsOf(TreePatternNode *N,
// If this node is associative, re-associate.
if (NodeInfo.hasProperty(SDNPAssociative)) {
// Re-associate by pulling together all of the linked operators
- std::vector<TreePatternNode*> MaximalChildren;
+ std::vector<TreePatternNodePtr> MaximalChildren;
GatherChildrenOfAssociativeOpcode(N, MaximalChildren);
// Only handle child sizes of 3. Otherwise we'll end up trying too many
// permutations.
if (MaximalChildren.size() == 3) {
// Find the variants of all of our maximal children.
- std::vector<TreePatternNode*> AVariants, BVariants, CVariants;
+ std::vector<TreePatternNodePtr> AVariants, BVariants, CVariants;
GenerateVariantsOf(MaximalChildren[0], AVariants, CDP, DepVars);
GenerateVariantsOf(MaximalChildren[1], BVariants, CDP, DepVars);
GenerateVariantsOf(MaximalChildren[2], CVariants, CDP, DepVars);
@@ -4323,12 +4362,12 @@ static void GenerateVariantsOf(TreePatternNode *N,
// Within these forms, we can also permute A/B/C.
// Generate legal pair permutations of A/B/C.
- std::vector<TreePatternNode*> ABVariants;
- std::vector<TreePatternNode*> BAVariants;
- std::vector<TreePatternNode*> ACVariants;
- std::vector<TreePatternNode*> CAVariants;
- std::vector<TreePatternNode*> BCVariants;
- std::vector<TreePatternNode*> CBVariants;
+ std::vector<TreePatternNodePtr> ABVariants;
+ std::vector<TreePatternNodePtr> BAVariants;
+ std::vector<TreePatternNodePtr> ACVariants;
+ std::vector<TreePatternNodePtr> CAVariants;
+ std::vector<TreePatternNodePtr> BCVariants;
+ std::vector<TreePatternNodePtr> CBVariants;
CombineChildVariants(N, AVariants, BVariants, ABVariants, CDP, DepVars);
CombineChildVariants(N, BVariants, AVariants, BAVariants, CDP, DepVars);
CombineChildVariants(N, AVariants, CVariants, ACVariants, CDP, DepVars);
@@ -4356,10 +4395,10 @@ static void GenerateVariantsOf(TreePatternNode *N,
}
// Compute permutations of all children.
- std::vector<std::vector<TreePatternNode*> > ChildVariants;
+ std::vector<std::vector<TreePatternNodePtr>> ChildVariants;
ChildVariants.resize(N->getNumChildren());
for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
- GenerateVariantsOf(N->getChild(i), ChildVariants[i], CDP, DepVars);
+ GenerateVariantsOf(N->getChildShared(i), ChildVariants[i], CDP, DepVars);
// Build all permutations based on how the children were formed.
CombineChildVariants(N, ChildVariants, OutVariants, CDP, DepVars);
@@ -4388,19 +4427,19 @@ static void GenerateVariantsOf(TreePatternNode *N,
// after those.
assert(NC >= 3 &&
"Commutative intrinsic should have at least 3 children!");
- std::vector<std::vector<TreePatternNode*> > Variants;
- Variants.push_back(ChildVariants[0]); // Intrinsic id.
- Variants.push_back(ChildVariants[2]);
- Variants.push_back(ChildVariants[1]);
+ std::vector<std::vector<TreePatternNodePtr>> Variants;
+ Variants.push_back(std::move(ChildVariants[0])); // Intrinsic id.
+ Variants.push_back(std::move(ChildVariants[2]));
+ Variants.push_back(std::move(ChildVariants[1]));
for (unsigned i = 3; i != NC; ++i)
- Variants.push_back(ChildVariants[i]);
+ Variants.push_back(std::move(ChildVariants[i]));
CombineChildVariants(N, Variants, OutVariants, CDP, DepVars);
} else if (NC == N->getNumChildren()) {
- std::vector<std::vector<TreePatternNode*> > Variants;
- Variants.push_back(ChildVariants[1]);
- Variants.push_back(ChildVariants[0]);
+ std::vector<std::vector<TreePatternNodePtr>> Variants;
+ Variants.push_back(std::move(ChildVariants[1]));
+ Variants.push_back(std::move(ChildVariants[0]));
for (unsigned i = 2; i != NC; ++i)
- Variants.push_back(ChildVariants[i]);
+ Variants.push_back(std::move(ChildVariants[i]));
CombineChildVariants(N, Variants, OutVariants, CDP, DepVars);
}
}
@@ -4410,7 +4449,7 @@ static void GenerateVariantsOf(TreePatternNode *N,
// GenerateVariants - Generate variants. For example, commutative patterns can
// match multiple ways. Add them to PatternsToMatch as well.
void CodeGenDAGPatterns::GenerateVariants() {
- DEBUG(errs() << "Generating instruction variants.\n");
+ LLVM_DEBUG(errs() << "Generating instruction variants.\n");
// Loop over all of the patterns we've collected, checking to see if we can
// generate variants of the instruction, through the exploitation of
@@ -4423,28 +4462,26 @@ void CodeGenDAGPatterns::GenerateVariants() {
//
for (unsigned i = 0, e = PatternsToMatch.size(); i != e; ++i) {
MultipleUseVarSet DepVars;
- std::vector<TreePatternNode*> Variants;
+ std::vector<TreePatternNodePtr> Variants;
FindDepVars(PatternsToMatch[i].getSrcPattern(), DepVars);
- DEBUG(errs() << "Dependent/multiply used variables: ");
- DEBUG(DumpDepVars(DepVars));
- DEBUG(errs() << "\n");
- GenerateVariantsOf(PatternsToMatch[i].getSrcPattern(), Variants, *this,
- DepVars);
+ LLVM_DEBUG(errs() << "Dependent/multiply used variables: ");
+ LLVM_DEBUG(DumpDepVars(DepVars));
+ LLVM_DEBUG(errs() << "\n");
+ GenerateVariantsOf(PatternsToMatch[i].getSrcPatternShared(), Variants,
+ *this, DepVars);
assert(!Variants.empty() && "Must create at least original variant!");
if (Variants.size() == 1) // No additional variants for this pattern.
continue;
- DEBUG(errs() << "FOUND VARIANTS OF: ";
- PatternsToMatch[i].getSrcPattern()->dump();
- errs() << "\n");
+ LLVM_DEBUG(errs() << "FOUND VARIANTS OF: ";
+ PatternsToMatch[i].getSrcPattern()->dump(); errs() << "\n");
for (unsigned v = 0, e = Variants.size(); v != e; ++v) {
- TreePatternNode *Variant = Variants[v];
+ TreePatternNodePtr Variant = Variants[v];
- DEBUG(errs() << " VAR#" << v << ": ";
- Variant->dump();
- errs() << "\n");
+ LLVM_DEBUG(errs() << " VAR#" << v << ": "; Variant->dump();
+ errs() << "\n");
// Scan to see if an instruction or explicit pattern already matches this.
bool AlreadyExists = false;
@@ -4456,7 +4493,7 @@ void CodeGenDAGPatterns::GenerateVariants() {
// Check to see if this variant already exists.
if (Variant->isIsomorphicTo(PatternsToMatch[p].getSrcPattern(),
DepVars)) {
- DEBUG(errs() << " *** ALREADY EXISTS, ignoring variant.\n");
+ LLVM_DEBUG(errs() << " *** ALREADY EXISTS, ignoring variant.\n");
AlreadyExists = true;
break;
}
@@ -4467,11 +4504,11 @@ void CodeGenDAGPatterns::GenerateVariants() {
// Otherwise, add it to the list of patterns we have.
PatternsToMatch.push_back(PatternToMatch(
PatternsToMatch[i].getSrcRecord(), PatternsToMatch[i].getPredicates(),
- Variant, PatternsToMatch[i].getDstPattern(),
+ Variant, PatternsToMatch[i].getDstPatternShared(),
PatternsToMatch[i].getDstRegs(),
PatternsToMatch[i].getAddedComplexity(), Record::getNewUID()));
}
- DEBUG(errs() << "\n");
+ LLVM_DEBUG(errs() << "\n");
}
}
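The CodeGenDAGPatterns.cpp hunks above replace raw TreePatternNode* ownership, where the bail-out path had to delete the clones by hand, with shared pointers (TreePatternNodePtr). A minimal, self-contained sketch of that pattern, using illustrative names rather than the real LLVM types, shows why the early return no longer needs any cleanup:

    #include <memory>
    #include <utility>
    #include <vector>

    struct Node {
      bool setDefaultMode(unsigned Mode) { return Mode != ~0u; } // stand-in
      std::shared_ptr<Node> clone() const { return std::make_shared<Node>(*this); }
    };
    using NodePtr = std::shared_ptr<Node>;

    void appendVariant(std::vector<std::pair<NodePtr, NodePtr>> &Out,
                       const Node &Src, const Node &Dst, unsigned Mode) {
      NodePtr NewSrc = Src.clone();
      NodePtr NewDst = Dst.clone();
      if (!NewSrc->setDefaultMode(Mode) || !NewDst->setDefaultMode(Mode))
        return; // no delete needed: the shared_ptrs release the clones
      Out.emplace_back(std::move(NewSrc), std::move(NewDst));
    }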
diff --git a/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.h b/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.h
index 8a8132c7f894..9be3816cc7fc 100644
--- a/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.h
+++ b/contrib/llvm/utils/TableGen/CodeGenDAGPatterns.h
@@ -43,6 +43,9 @@ class TreePatternNode;
class CodeGenDAGPatterns;
class ComplexPattern;
+/// Shared pointer for TreePatternNode.
+using TreePatternNodePtr = std::shared_ptr<TreePatternNode>;
+
/// This represents a set of MVTs. Since the underlying type for the MVT
/// is uint8_t, there are at most 256 values. To reduce the number of memory
/// allocations and deallocations, represent the set as a sequence of bits.
@@ -330,9 +333,21 @@ struct TypeInfer {
TypeSetByHwMode &VTS;
};
+ struct SuppressValidation {
+ SuppressValidation(TypeInfer &TI) : Infer(TI), SavedValidate(TI.Validate) {
+ Infer.Validate = false;
+ }
+ ~SuppressValidation() {
+ Infer.Validate = SavedValidate;
+ }
+ TypeInfer &Infer;
+ bool SavedValidate;
+ };
+
TreePattern &TP;
unsigned ForceMode; // Mode to use when set.
bool CodeGen = false; // Set during generation of matcher code.
+ bool Validate = true; // Indicate whether to validate types.
private:
TypeSetByHwMode getLegalTypes();
@@ -540,6 +555,10 @@ public:
/// ValueType record for the memory VT.
Record *getScalarMemoryVT() const;
+ // If true, indicates that GlobalISel-based C++ code was supplied.
+ bool hasGISelPredicateCode() const;
+ std::string getGISelPredicateCode() const;
+
private:
bool hasPredCode() const;
bool hasImmCode() const;
@@ -552,9 +571,6 @@ private:
};
-/// FIXME: TreePatternNode's can be shared in some cases (due to dag-shaped
-/// patterns), and as such should be ref counted. We currently just leak all
-/// TreePatternNode objects!
class TreePatternNode {
/// The type of each node result. Before and during type inference, each
/// result may be a set of possible types. After (successful) type inference,
@@ -581,18 +597,19 @@ class TreePatternNode {
/// it can be substituted into the resulting instruction on a pattern match.
Record *TransformFn;
- std::vector<TreePatternNode*> Children;
+ std::vector<TreePatternNodePtr> Children;
+
public:
- TreePatternNode(Record *Op, const std::vector<TreePatternNode*> &Ch,
+ TreePatternNode(Record *Op, std::vector<TreePatternNodePtr> Ch,
unsigned NumResults)
- : Operator(Op), Val(nullptr), TransformFn(nullptr), Children(Ch) {
+ : Operator(Op), Val(nullptr), TransformFn(nullptr),
+ Children(std::move(Ch)) {
Types.resize(NumResults);
}
TreePatternNode(Init *val, unsigned NumResults) // leaf ctor
: Operator(nullptr), Val(val), TransformFn(nullptr) {
Types.resize(NumResults);
}
- ~TreePatternNode();
bool hasName() const { return !Name.empty(); }
const std::string &getName() const { return Name; }
@@ -626,15 +643,17 @@ public:
Record *getOperator() const { assert(!isLeaf()); return Operator; }
unsigned getNumChildren() const { return Children.size(); }
- TreePatternNode *getChild(unsigned N) const { return Children[N]; }
- void setChild(unsigned i, TreePatternNode *N) {
- Children[i] = N;
+ TreePatternNode *getChild(unsigned N) const { return Children[N].get(); }
+ const TreePatternNodePtr &getChildShared(unsigned N) const {
+ return Children[N];
}
+ void setChild(unsigned i, TreePatternNodePtr N) { Children[i] = N; }
/// hasChild - Return true if N is any of our children.
bool hasChild(const TreePatternNode *N) const {
for (unsigned i = 0, e = Children.size(); i != e; ++i)
- if (Children[i] == N) return true;
+ if (Children[i].get() == N)
+ return true;
return false;
}
@@ -694,7 +713,7 @@ public: // Higher level manipulation routines.
/// clone - Return a new copy of this tree.
///
- TreePatternNode *clone() const;
+ TreePatternNodePtr clone() const;
/// RemoveAllTypes - Recursively strip all the types of this tree.
void RemoveAllTypes();
@@ -708,13 +727,15 @@ public: // Higher level manipulation routines.
/// SubstituteFormalArguments - Replace the formal arguments in this tree
/// with actual values specified by ArgMap.
- void SubstituteFormalArguments(std::map<std::string,
- TreePatternNode*> &ArgMap);
+ void
+ SubstituteFormalArguments(std::map<std::string, TreePatternNodePtr> &ArgMap);
/// InlinePatternFragments - If this pattern refers to any pattern
- /// fragments, inline them into place, giving us a pattern without any
- /// PatFrag references.
- TreePatternNode *InlinePatternFragments(TreePattern &TP);
+ /// fragments, return the set of inlined versions (this can be more than
+ /// one if a PatFrags record has multiple alternatives).
+ void InlinePatternFragments(TreePatternNodePtr T,
+ TreePattern &TP,
+ std::vector<TreePatternNodePtr> &OutAlternatives);
/// ApplyTypeConstraints - Apply all of the type constraints relevant to
/// this node and its children in the tree. This returns true if it makes a
@@ -759,11 +780,11 @@ class TreePattern {
/// Trees - The list of pattern trees which corresponds to this pattern.
/// Note that PatFrag's only have a single tree.
///
- std::vector<TreePatternNode*> Trees;
+ std::vector<TreePatternNodePtr> Trees;
/// NamedNodes - This is all of the nodes that have names in the trees in this
/// pattern.
- StringMap<SmallVector<TreePatternNode*,1> > NamedNodes;
+ StringMap<SmallVector<TreePatternNode *, 1>> NamedNodes;
/// TheRecord - The actual TableGen record corresponding to this pattern.
///
@@ -802,21 +823,21 @@ public:
CodeGenDAGPatterns &ise);
TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
CodeGenDAGPatterns &ise);
- TreePattern(Record *TheRec, TreePatternNode *Pat, bool isInput,
+ TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput,
CodeGenDAGPatterns &ise);
/// getTrees - Return the tree patterns which corresponds to this pattern.
///
- const std::vector<TreePatternNode*> &getTrees() const { return Trees; }
+ const std::vector<TreePatternNodePtr> &getTrees() const { return Trees; }
unsigned getNumTrees() const { return Trees.size(); }
- TreePatternNode *getTree(unsigned i) const { return Trees[i]; }
- void setTree(unsigned i, TreePatternNode *Tree) { Trees[i] = Tree; }
- TreePatternNode *getOnlyTree() const {
+ const TreePatternNodePtr &getTree(unsigned i) const { return Trees[i]; }
+ void setTree(unsigned i, TreePatternNodePtr Tree) { Trees[i] = Tree; }
+ const TreePatternNodePtr &getOnlyTree() const {
assert(Trees.size() == 1 && "Doesn't have exactly one pattern!");
return Trees[0];
}
- const StringMap<SmallVector<TreePatternNode*,1> > &getNamedNodesMap() {
+ const StringMap<SmallVector<TreePatternNode *, 1>> &getNamedNodesMap() {
if (NamedNodes.empty())
ComputeNamedNodes();
return NamedNodes;
@@ -838,17 +859,20 @@ public:
/// InlinePatternFragments - If this pattern refers to any pattern
/// fragments, inline them into place, giving us a pattern without any
- /// PatFrag references.
+ /// PatFrags references. This may increase the number of trees in the
+ /// pattern if a PatFrags has multiple alternatives.
void InlinePatternFragments() {
- for (unsigned i = 0, e = Trees.size(); i != e; ++i)
- Trees[i] = Trees[i]->InlinePatternFragments(*this);
+ std::vector<TreePatternNodePtr> Copy = Trees;
+ Trees.clear();
+ for (unsigned i = 0, e = Copy.size(); i != e; ++i)
+ Copy[i]->InlinePatternFragments(Copy[i], *this, Trees);
}
/// InferAllTypes - Infer/propagate as many types throughout the expression
/// patterns as possible. Return true if all types are inferred, false
/// otherwise. Bail out if a type contradiction is found.
- bool InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> >
- *NamedTypes=nullptr);
+ bool InferAllTypes(
+ const StringMap<SmallVector<TreePatternNode *, 1>> *NamedTypes = nullptr);
/// error - If this is the first error in the current resolution step,
/// print it and set the error flag. Otherwise, continue silently.
@@ -866,7 +890,7 @@ public:
void dump() const;
private:
- TreePatternNode *ParseTreePattern(Init *DI, StringRef OpName);
+ TreePatternNodePtr ParseTreePattern(Init *DI, StringRef OpName);
void ComputeNamedNodes();
void ComputeNamedNodes(TreePatternNode *N);
};
@@ -900,31 +924,30 @@ inline bool TreePatternNode::UpdateNodeType(unsigned ResNo,
/// DAGDefaultOperand - One of these is created for each OperandWithDefaultOps
/// that has a set ExecuteAlways / DefaultOps field.
struct DAGDefaultOperand {
- std::vector<TreePatternNode*> DefaultOps;
+ std::vector<TreePatternNodePtr> DefaultOps;
};
class DAGInstruction {
- TreePattern *Pattern;
std::vector<Record*> Results;
std::vector<Record*> Operands;
std::vector<Record*> ImpResults;
- TreePatternNode *ResultPattern;
+ TreePatternNodePtr SrcPattern;
+ TreePatternNodePtr ResultPattern;
+
public:
- DAGInstruction(TreePattern *TP,
- const std::vector<Record*> &results,
+ DAGInstruction(const std::vector<Record*> &results,
const std::vector<Record*> &operands,
- const std::vector<Record*> &impresults)
- : Pattern(TP), Results(results), Operands(operands),
- ImpResults(impresults), ResultPattern(nullptr) {}
+ const std::vector<Record*> &impresults,
+ TreePatternNodePtr srcpattern = nullptr,
+ TreePatternNodePtr resultpattern = nullptr)
+ : Results(results), Operands(operands), ImpResults(impresults),
+ SrcPattern(srcpattern), ResultPattern(resultpattern) {}
- TreePattern *getPattern() const { return Pattern; }
unsigned getNumResults() const { return Results.size(); }
unsigned getNumOperands() const { return Operands.size(); }
unsigned getNumImpResults() const { return ImpResults.size(); }
const std::vector<Record*>& getImpResults() const { return ImpResults; }
- void setResultPattern(TreePatternNode *R) { ResultPattern = R; }
-
Record *getResult(unsigned RN) const {
assert(RN < Results.size());
return Results[RN];
@@ -940,7 +963,8 @@ public:
return ImpResults[RN];
}
- TreePatternNode *getResultPattern() const { return ResultPattern; }
+ TreePatternNodePtr getSrcPattern() const { return SrcPattern; }
+ TreePatternNodePtr getResultPattern() const { return ResultPattern; }
};
/// This class represents a condition that has to be satisfied for a pattern
@@ -994,25 +1018,17 @@ public:
/// processed to produce isel.
class PatternToMatch {
public:
- PatternToMatch(Record *srcrecord, const std::vector<Predicate> &preds,
- TreePatternNode *src, TreePatternNode *dst,
- const std::vector<Record*> &dstregs,
- int complexity, unsigned uid, unsigned setmode = 0)
- : SrcRecord(srcrecord), SrcPattern(src), DstPattern(dst),
- Predicates(preds), Dstregs(std::move(dstregs)),
- AddedComplexity(complexity), ID(uid), ForceMode(setmode) {}
-
- PatternToMatch(Record *srcrecord, std::vector<Predicate> &&preds,
- TreePatternNode *src, TreePatternNode *dst,
- std::vector<Record*> &&dstregs,
- int complexity, unsigned uid, unsigned setmode = 0)
- : SrcRecord(srcrecord), SrcPattern(src), DstPattern(dst),
- Predicates(preds), Dstregs(std::move(dstregs)),
- AddedComplexity(complexity), ID(uid), ForceMode(setmode) {}
+ PatternToMatch(Record *srcrecord, std::vector<Predicate> preds,
+ TreePatternNodePtr src, TreePatternNodePtr dst,
+ std::vector<Record *> dstregs, int complexity,
+ unsigned uid, unsigned setmode = 0)
+ : SrcRecord(srcrecord), SrcPattern(src), DstPattern(dst),
+ Predicates(std::move(preds)), Dstregs(std::move(dstregs)),
+ AddedComplexity(complexity), ID(uid), ForceMode(setmode) {}
Record *SrcRecord; // Originating Record for the pattern.
- TreePatternNode *SrcPattern; // Source pattern to match.
- TreePatternNode *DstPattern; // Resulting pattern.
+ TreePatternNodePtr SrcPattern; // Source pattern to match.
+ TreePatternNodePtr DstPattern; // Resulting pattern.
std::vector<Predicate> Predicates; // Top level predicate conditions
// to match.
std::vector<Record*> Dstregs; // Physical register defs being matched.
@@ -1021,8 +1037,10 @@ public:
unsigned ForceMode; // Force this mode in type inference when set.
Record *getSrcRecord() const { return SrcRecord; }
- TreePatternNode *getSrcPattern() const { return SrcPattern; }
- TreePatternNode *getDstPattern() const { return DstPattern; }
+ TreePatternNode *getSrcPattern() const { return SrcPattern.get(); }
+ TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; }
+ TreePatternNode *getDstPattern() const { return DstPattern.get(); }
+ TreePatternNodePtr getDstPatternShared() const { return DstPattern; }
const std::vector<Record*> &getDstRegs() const { return Dstregs; }
int getAddedComplexity() const { return AddedComplexity; }
const std::vector<Predicate> &getPredicates() const { return Predicates; }
@@ -1156,7 +1174,7 @@ public:
/// Parse the Pattern for an instruction, and insert the result in DAGInsts.
typedef std::map<Record*, DAGInstruction, LessRecordByID> DAGInstMap;
- const DAGInstruction &parseInstructionPattern(
+ void parseInstructionPattern(
CodeGenInstruction &CGI, ListInit *Pattern,
DAGInstMap &DAGInsts);
@@ -1193,13 +1211,15 @@ private:
std::vector<Predicate> makePredList(ListInit *L);
+ void ParseOnePattern(Record *TheDef,
+ TreePattern &Pattern, TreePattern &Result,
+ const std::vector<Record *> &InstImpResults);
void AddPatternToMatch(TreePattern *Pattern, PatternToMatch &&PTM);
- void FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
- std::map<std::string,
- TreePatternNode*> &InstInputs,
- std::map<std::string,
- TreePatternNode*> &InstResults,
- std::vector<Record*> &InstImpResults);
+ void FindPatternInputsAndOutputs(
+ TreePattern &I, TreePatternNodePtr Pat,
+ std::map<std::string, TreePatternNodePtr> &InstInputs,
+ std::map<std::string, TreePatternNodePtr> &InstResults,
+ std::vector<Record *> &InstImpResults);
};
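The SuppressValidation helper added to TypeInfer in the header above is a small RAII guard: it clears Validate for one scope and restores the saved value when the guard is destroyed. A rough usage sketch, with the guard as declared above and a hypothetical calling function:

    void inferWithoutValidation(TypeInfer &TI) {
      TypeInfer::SuppressValidation Guard(TI); // TI.Validate is false here
      // ... run inference steps that must not validate the resulting types ...
    } // ~SuppressValidation() restores the saved Validate value on any exit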
diff --git a/contrib/llvm/utils/TableGen/CodeGenInstruction.cpp b/contrib/llvm/utils/TableGen/CodeGenInstruction.cpp
index 44ee16f6fd74..eb35020d3d3a 100644
--- a/contrib/llvm/utils/TableGen/CodeGenInstruction.cpp
+++ b/contrib/llvm/utils/TableGen/CodeGenInstruction.cpp
@@ -306,11 +306,13 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
isIndirectBranch = R->getValueAsBit("isIndirectBranch");
isCompare = R->getValueAsBit("isCompare");
isMoveImm = R->getValueAsBit("isMoveImm");
+ isMoveReg = R->getValueAsBit("isMoveReg");
isBitcast = R->getValueAsBit("isBitcast");
isSelect = R->getValueAsBit("isSelect");
isBarrier = R->getValueAsBit("isBarrier");
isCall = R->getValueAsBit("isCall");
isAdd = R->getValueAsBit("isAdd");
+ isTrap = R->getValueAsBit("isTrap");
canFoldAsLoad = R->getValueAsBit("canFoldAsLoad");
isPredicable = Operands.isPredicable || R->getValueAsBit("isPredicable");
isConvertibleToThreeAddress = R->getValueAsBit("isConvertibleToThreeAddress");
@@ -327,6 +329,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
isInsertSubreg = R->getValueAsBit("isInsertSubreg");
isConvergent = R->getValueAsBit("isConvergent");
hasNoSchedulingInfo = R->getValueAsBit("hasNoSchedulingInfo");
+ FastISelShouldIgnore = R->getValueAsBit("FastISelShouldIgnore");
bool Unset;
mayLoad = R->getValueAsBitOrUnset("mayLoad", Unset);
@@ -344,6 +347,10 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
ImplicitDefs = R->getValueAsListOfDefs("Defs");
ImplicitUses = R->getValueAsListOfDefs("Uses");
+ // This flag is only inferred from the pattern.
+ hasChain = false;
+ hasChain_Inferred = false;
+
// Parse Constraints.
ParseConstraints(R->getValueAsString("Constraints"), Operands);
@@ -588,12 +595,10 @@ unsigned CodeGenInstAlias::ResultOperand::getMINumOperands() const {
return MIOpInfo->getNumArgs();
}
-CodeGenInstAlias::CodeGenInstAlias(Record *R, unsigned Variant,
- CodeGenTarget &T)
+CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T)
: TheDef(R) {
Result = R->getValueAsDag("ResultInst");
AsmString = R->getValueAsString("AsmString");
- AsmString = CodeGenInstruction::FlattenAsmStringVariants(AsmString, Variant);
// Verify that the root of the result is an instruction.
@@ -630,8 +635,14 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, unsigned Variant,
// of a complex operand, in which case we include them anyways, as we
// don't have any other way to specify the whole operand.
if (ResultInst->Operands[i].MINumOperands == 1 &&
- ResultInst->Operands[i].getTiedRegister() != -1)
- continue;
+ ResultInst->Operands[i].getTiedRegister() != -1) {
+ // Tied operands of different RegisterClass should be explicit within an
+ // instruction's syntax and so cannot be skipped.
+ int TiedOpNum = ResultInst->Operands[i].getTiedRegister();
+ if (ResultInst->Operands[i].Rec->getName() ==
+ ResultInst->Operands[TiedOpNum].Rec->getName())
+ continue;
+ }
if (AliasOpNo >= Result->getNumArgs())
PrintFatalError(R->getLoc(), "not enough arguments for instruction!");
diff --git a/contrib/llvm/utils/TableGen/CodeGenInstruction.h b/contrib/llvm/utils/TableGen/CodeGenInstruction.h
index 9cff95b1247f..a50c3e60e6e7 100644
--- a/contrib/llvm/utils/TableGen/CodeGenInstruction.h
+++ b/contrib/llvm/utils/TableGen/CodeGenInstruction.h
@@ -15,7 +15,7 @@
#define LLVM_UTILS_TABLEGEN_CODEGENINSTRUCTION_H
#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/SMLoc.h"
#include <string>
#include <utility>
@@ -226,11 +226,13 @@ template <typename T> class ArrayRef;
bool isIndirectBranch : 1;
bool isCompare : 1;
bool isMoveImm : 1;
+ bool isMoveReg : 1;
bool isBitcast : 1;
bool isSelect : 1;
bool isBarrier : 1;
bool isCall : 1;
bool isAdd : 1;
+ bool isTrap : 1;
bool canFoldAsLoad : 1;
bool mayLoad : 1;
bool mayLoad_Unset : 1;
@@ -258,6 +260,9 @@ template <typename T> class ArrayRef;
bool isInsertSubreg : 1;
bool isConvergent : 1;
bool hasNoSchedulingInfo : 1;
+ bool FastISelShouldIgnore : 1;
+ bool hasChain : 1;
+ bool hasChain_Inferred : 1;
std::string DeprecatedReason;
bool HasComplexDeprecationPredicate;
@@ -350,7 +355,7 @@ template <typename T> class ArrayRef;
/// of them are matched by the operand, the second value should be -1.
std::vector<std::pair<unsigned, int> > ResultInstOperandIndex;
- CodeGenInstAlias(Record *R, unsigned Variant, CodeGenTarget &T);
+ CodeGenInstAlias(Record *R, CodeGenTarget &T);
bool tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
Record *InstOpRec, bool hasSubOps, ArrayRef<SMLoc> Loc,
diff --git a/contrib/llvm/utils/TableGen/CodeGenIntrinsics.h b/contrib/llvm/utils/TableGen/CodeGenIntrinsics.h
index 91305034dc24..5d0715959120 100644
--- a/contrib/llvm/utils/TableGen/CodeGenIntrinsics.h
+++ b/contrib/llvm/utils/TableGen/CodeGenIntrinsics.h
@@ -15,7 +15,7 @@
#define LLVM_UTILS_TABLEGEN_CODEGENINTRINSICS_H
#include "SDNodeProperties.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/MachineValueType.h"
#include <string>
#include <vector>
diff --git a/contrib/llvm/utils/TableGen/CodeGenMapTable.cpp b/contrib/llvm/utils/TableGen/CodeGenMapTable.cpp
index 43348b622a74..e5b0426cdcc3 100644
--- a/contrib/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/contrib/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -243,7 +243,12 @@ void MapTableEmitter::buildRowInstrMap() {
std::vector<Init*> KeyValue;
ListInit *RowFields = InstrMapDesc.getRowFields();
for (Init *RowField : RowFields->getValues()) {
- Init *CurInstrVal = CurInstr->getValue(RowField)->getValue();
+ RecordVal *RecVal = CurInstr->getValue(RowField);
+ if (RecVal == nullptr)
+ PrintFatalError(CurInstr->getLoc(), "No value " +
+ RowField->getAsString() + " found in \"" +
+ CurInstr->getName() + "\" instruction description.");
+ Init *CurInstrVal = RecVal->getValue();
KeyValue.push_back(CurInstrVal);
}
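The CodeGenMapTable.cpp change above is a defensive lookup: getValue can return null when the row field named by the instruction map is missing from an instruction record, and dereferencing that null was the old failure mode. A self-contained sketch of the same look-up-check-use shape, with a plain map standing in for the TableGen Record API:

    #include <cstdlib>
    #include <iostream>
    #include <map>
    #include <string>

    // Hypothetical stand-in for a Record's named values; not the TableGen API.
    using FieldMap = std::map<std::string, std::string>;

    const std::string &getFieldOrDie(const FieldMap &Instr, const std::string &Field,
                                     const std::string &InstrName) {
      auto It = Instr.find(Field);                      // may be absent
      if (It == Instr.end()) {
        std::cerr << "No value " << Field << " found in \"" << InstrName
                  << "\" instruction description.\n";
        std::exit(1);
      }
      return It->second;                                // safe after the check
    }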
diff --git a/contrib/llvm/utils/TableGen/CodeGenRegisters.cpp b/contrib/llvm/utils/TableGen/CodeGenRegisters.cpp
index a6b0a4beb8ea..b0d13b7d38f3 100644
--- a/contrib/llvm/utils/TableGen/CodeGenRegisters.cpp
+++ b/contrib/llvm/utils/TableGen/CodeGenRegisters.cpp
@@ -21,7 +21,6 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -52,7 +51,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum)
- : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true) {
+ : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) {
Name = R->getName();
if (R->getValue("Namespace"))
Namespace = R->getValueAsString("Namespace");
@@ -63,7 +62,7 @@ CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum)
CodeGenSubRegIndex::CodeGenSubRegIndex(StringRef N, StringRef Nspace,
unsigned Enum)
: TheDef(nullptr), Name(N), Namespace(Nspace), Size(-1), Offset(-1),
- EnumValue(Enum), AllSuperRegsCovered(true) {
+ EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) {
}
std::string CodeGenSubRegIndex::getQualifiedName() const {
@@ -162,8 +161,9 @@ CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum)
HasDisjunctSubRegs(false),
SubRegsComplete(false),
SuperRegsComplete(false),
- TopoSig(~0u)
-{}
+ TopoSig(~0u) {
+ Artificial = R->getValueAsBit("isArtificial");
+}
void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
std::vector<Record*> SRIs = TheDef->getValueAsListOfDefs("SubRegIndices");
@@ -276,6 +276,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
CodeGenRegister *SR = ExplicitSubRegs[i];
CodeGenSubRegIndex *Idx = ExplicitSubRegIndices[i];
+ if (!SR->Artificial)
+ Idx->Artificial = false;
if (!SubRegs.insert(std::make_pair(Idx, SR)).second)
PrintFatalError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() +
" appears twice in Register " + getName());
@@ -386,13 +388,17 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
// user already specified.
for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
CodeGenRegister *SR = ExplicitSubRegs[i];
- if (!SR->CoveredBySubRegs || SR->ExplicitSubRegs.size() <= 1)
+ if (!SR->CoveredBySubRegs || SR->ExplicitSubRegs.size() <= 1 ||
+ SR->Artificial)
continue;
// SR is composed of multiple sub-regs. Find their names in this register.
SmallVector<CodeGenSubRegIndex*, 8> Parts;
- for (unsigned j = 0, e = SR->ExplicitSubRegs.size(); j != e; ++j)
- Parts.push_back(getSubRegIndex(SR->ExplicitSubRegs[j]));
+ for (unsigned j = 0, e = SR->ExplicitSubRegs.size(); j != e; ++j) {
+ CodeGenSubRegIndex &I = *SR->ExplicitSubRegIndices[j];
+ if (!I.Artificial)
+ Parts.push_back(getSubRegIndex(SR->ExplicitSubRegs[j]));
+ }
// Offer this as an existing spelling for the concatenation of Parts.
CodeGenSubRegIndex &Idx = *ExplicitSubRegIndices[i];
@@ -602,6 +608,13 @@ unsigned CodeGenRegister::getWeight(const CodeGenRegBank &RegBank) const {
namespace {
struct TupleExpander : SetTheory::Expander {
+ // Reference to SynthDefs in the containing CodeGenRegBank, to keep track of
+ // the synthesized definitions for their lifetime.
+ std::vector<std::unique_ptr<Record>> &SynthDefs;
+
+ TupleExpander(std::vector<std::unique_ptr<Record>> &SynthDefs)
+ : SynthDefs(SynthDefs) {}
+
void expand(SetTheory &ST, Record *Def, SetTheory::RecSet &Elts) override {
std::vector<Record*> Indices = Def->getValueAsListOfDefs("SubRegIndices");
unsigned Dim = Indices.size();
@@ -646,7 +659,9 @@ struct TupleExpander : SetTheory::Expander {
// Create a new Record representing the synthesized register. This record
// is only for consumption by CodeGenRegister, it is not added to the
// RecordKeeper.
- Record *NewReg = new Record(Name, Def->getLoc(), Def->getRecords());
+ SynthDefs.emplace_back(
+ llvm::make_unique<Record>(Name, Def->getLoc(), Def->getRecords()));
+ Record *NewReg = SynthDefs.back().get();
Elts.insert(NewReg);
// Copy Proto super-classes.
@@ -710,7 +725,7 @@ struct TupleExpander : SetTheory::Expander {
//===----------------------------------------------------------------------===//
static void sortAndUniqueRegisters(CodeGenRegister::Vec &M) {
- std::sort(M.begin(), M.end(), deref<llvm::less>());
+ llvm::sort(M.begin(), M.end(), deref<llvm::less>());
M.erase(std::unique(M.begin(), M.end(), deref<llvm::equal>()), M.end());
}
@@ -736,10 +751,12 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
Orders.resize(1 + AltOrders->size());
// Default allocation order always contains all registers.
+ Artificial = true;
for (unsigned i = 0, e = Elements->size(); i != e; ++i) {
Orders[0].push_back((*Elements)[i]);
const CodeGenRegister *Reg = RegBank.getReg((*Elements)[i]);
Members.push_back(Reg);
+ Artificial &= Reg->Artificial;
TopoSigs.set(Reg->getTopoSig());
}
sortAndUniqueRegisters(Members);
@@ -798,8 +815,11 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank,
CopyCost(0),
Allocatable(true),
AllocationPriority(0) {
- for (const auto R : Members)
+ Artificial = true;
+ for (const auto R : Members) {
TopoSigs.set(R->getTopoSig());
+ Artificial &= R->Artificial;
+ }
}
// Compute inherited properties for a synthesized register class.
@@ -915,6 +935,8 @@ void CodeGenRegisterClass::computeSubClasses(CodeGenRegBank &RegBank) {
CodeGenRegisterClass &RC = *I;
RC.SubClasses.resize(RegClasses.size());
RC.SubClasses.set(RC.EnumValue);
+ if (RC.Artificial)
+ continue;
// Normally, all subclasses have IDs >= rci, unless RC is part of a clique.
for (auto I2 = I.base(), E2 = RegClasses.end(); I2 != E2; ++I2) {
@@ -975,7 +997,7 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs(
for (auto &RC : RegClasses)
if (SuperRegRCsBV[RC.EnumValue])
SuperRegRCs.emplace_back(&RC);
- std::sort(SuperRegRCs.begin(), SuperRegRCs.end(), SizeOrder);
+ llvm::sort(SuperRegRCs.begin(), SuperRegRCs.end(), SizeOrder);
assert(SuperRegRCs.front() == BiggestSuperRegRC && "Biggest class wasn't first");
// Find all the subreg classes and order them by size too.
@@ -986,11 +1008,11 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs(
if (SuperRegClassesBV.any())
SuperRegClasses.push_back(std::make_pair(&RC, SuperRegClassesBV));
}
- std::sort(SuperRegClasses.begin(), SuperRegClasses.end(),
- [&](const std::pair<CodeGenRegisterClass *, BitVector> &A,
- const std::pair<CodeGenRegisterClass *, BitVector> &B) {
- return SizeOrder(A.first, B.first);
- });
+ llvm::sort(SuperRegClasses.begin(), SuperRegClasses.end(),
+ [&](const std::pair<CodeGenRegisterClass *, BitVector> &A,
+ const std::pair<CodeGenRegisterClass *, BitVector> &B) {
+ return SizeOrder(A.first, B.first);
+ });
// Find the biggest subclass and subreg class such that R:subidx is in the
// subreg class for all R in subclass.
@@ -1043,12 +1065,15 @@ void CodeGenRegisterClass::getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
}
// Populate a unique sorted list of units from a register set.
-void CodeGenRegisterClass::buildRegUnitSet(
+void CodeGenRegisterClass::buildRegUnitSet(const CodeGenRegBank &RegBank,
std::vector<unsigned> &RegUnits) const {
std::vector<unsigned> TmpUnits;
- for (RegUnitIterator UnitI(Members); UnitI.isValid(); ++UnitI)
- TmpUnits.push_back(*UnitI);
- std::sort(TmpUnits.begin(), TmpUnits.end());
+ for (RegUnitIterator UnitI(Members); UnitI.isValid(); ++UnitI) {
+ const RegUnit &RU = RegBank.getRegUnit(*UnitI);
+ if (!RU.Artificial)
+ TmpUnits.push_back(*UnitI);
+ }
+ llvm::sort(TmpUnits.begin(), TmpUnits.end());
std::unique_copy(TmpUnits.begin(), TmpUnits.end(),
std::back_inserter(RegUnits));
}
@@ -1062,12 +1087,13 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
// Configure register Sets to understand register classes and tuples.
Sets.addFieldExpander("RegisterClass", "MemberList");
Sets.addFieldExpander("CalleeSavedRegs", "SaveList");
- Sets.addExpander("RegisterTuples", llvm::make_unique<TupleExpander>());
+ Sets.addExpander("RegisterTuples",
+ llvm::make_unique<TupleExpander>(SynthDefs));
// Read in the user-defined (named) sub-register indices.
// More indices will be synthesized later.
std::vector<Record*> SRIs = Records.getAllDerivedDefinitions("SubRegIndex");
- std::sort(SRIs.begin(), SRIs.end(), LessRecord());
+ llvm::sort(SRIs.begin(), SRIs.end(), LessRecord());
for (unsigned i = 0, e = SRIs.size(); i != e; ++i)
getSubRegIdx(SRIs[i]);
// Build composite maps from ComposedOf fields.
@@ -1076,7 +1102,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
// Read in the register definitions.
std::vector<Record*> Regs = Records.getAllDerivedDefinitions("Register");
- std::sort(Regs.begin(), Regs.end(), LessRecordRegister());
+ llvm::sort(Regs.begin(), Regs.end(), LessRecordRegister());
// Assign the enumeration values.
for (unsigned i = 0, e = Regs.size(); i != e; ++i)
getReg(Regs[i]);
@@ -1087,7 +1113,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
for (Record *R : Tups) {
std::vector<Record *> TupRegs = *Sets.expand(R);
- std::sort(TupRegs.begin(), TupRegs.end(), LessRecordRegister());
+ llvm::sort(TupRegs.begin(), TupRegs.end(), LessRecordRegister());
for (Record *RC : TupRegs)
getReg(RC);
}
@@ -1131,6 +1157,18 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
for (auto &Reg : Registers)
Reg.computeSuperRegs(*this);
+ // For each pair of Reg:SR, if both are non-artificial, mark the
+ // corresponding sub-register index as non-artificial.
+ for (auto &Reg : Registers) {
+ if (Reg.Artificial)
+ continue;
+ for (auto P : Reg.getSubRegs()) {
+ const CodeGenRegister *SR = P.second;
+ if (!SR->Artificial)
+ P.first->Artificial = false;
+ }
+ }
+
// Native register units are associated with a leaf register. They've all been
// discovered now.
NumNativeRegUnits = RegUnits.size();
@@ -1141,9 +1179,11 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
PrintFatalError("No 'RegisterClass' subclasses defined!");
// Allocate user-defined register classes.
- for (auto *RC : RCs) {
- RegClasses.emplace_back(*this, RC);
- addToMaps(&RegClasses.back());
+ for (auto *R : RCs) {
+ RegClasses.emplace_back(*this, R);
+ CodeGenRegisterClass &RC = RegClasses.back();
+ if (!RC.Artificial)
+ addToMaps(&RC);
}
// Infer missing classes to create a full algebra.
@@ -1554,21 +1594,24 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
Reg = UnitI.getReg();
Weight = 0;
}
- unsigned UWeight = RegBank.getRegUnit(*UnitI).Weight;
- if (!UWeight) {
- UWeight = 1;
- RegBank.increaseRegUnitWeight(*UnitI, UWeight);
+ if (!RegBank.getRegUnit(*UnitI).Artificial) {
+ unsigned UWeight = RegBank.getRegUnit(*UnitI).Weight;
+ if (!UWeight) {
+ UWeight = 1;
+ RegBank.increaseRegUnitWeight(*UnitI, UWeight);
+ }
+ Weight += UWeight;
}
- Weight += UWeight;
}
if (Weight > MaxWeight)
MaxWeight = Weight;
if (I->Weight != MaxWeight) {
- DEBUG(
- dbgs() << "UberSet " << I - UberSets.begin() << " Weight " << MaxWeight;
- for (auto &Unit : I->Regs)
- dbgs() << " " << Unit->getName();
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "UberSet " << I - UberSets.begin() << " Weight "
+ << MaxWeight;
+ for (auto &Unit
+ : I->Regs) dbgs()
+ << " " << Unit->getName();
+ dbgs() << "\n");
// Update the set weight.
I->Weight = MaxWeight;
}
@@ -1595,9 +1638,10 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
static bool normalizeWeight(CodeGenRegister *Reg,
std::vector<UberRegSet> &UberSets,
std::vector<UberRegSet*> &RegSets,
- SparseBitVector<> &NormalRegs,
+ BitVector &NormalRegs,
CodeGenRegister::RegUnitList &NormalUnits,
CodeGenRegBank &RegBank) {
+ NormalRegs.resize(std::max(Reg->EnumValue + 1, NormalRegs.size()));
if (NormalRegs.test(Reg->EnumValue))
return false;
NormalRegs.set(Reg->EnumValue);
@@ -1637,7 +1681,8 @@ static bool normalizeWeight(CodeGenRegister *Reg,
}
else {
// Adjust the existing single unit.
- RegBank.increaseRegUnitWeight(AdjustUnit, UberSet->Weight - RegWeight);
+ if (!RegBank.getRegUnit(AdjustUnit).Artificial)
+ RegBank.increaseRegUnitWeight(AdjustUnit, UberSet->Weight - RegWeight);
// The unit may be shared among sets and registers within this set.
computeUberWeights(UberSets, RegBank);
}
@@ -1670,7 +1715,7 @@ void CodeGenRegBank::computeRegUnitWeights() {
Changed = false;
for (auto &Reg : Registers) {
CodeGenRegister::RegUnitList NormalUnits;
- SparseBitVector<> NormalRegs;
+ BitVector NormalRegs;
Changed |= normalizeWeight(&Reg, UberSets, RegSets, NormalRegs,
NormalUnits, *this);
}
@@ -1734,8 +1779,8 @@ void CodeGenRegBank::pruneUnitSets() {
&& (SubSet.Units.size() + 3 > SuperSet.Units.size())
&& UnitWeight == RegUnits[SuperSet.Units[0]].Weight
&& UnitWeight == RegUnits[SuperSet.Units.back()].Weight) {
- DEBUG(dbgs() << "UnitSet " << SubIdx << " subsumed by " << SuperIdx
- << "\n");
+ LLVM_DEBUG(dbgs() << "UnitSet " << SubIdx << " subsumed by " << SuperIdx
+ << "\n");
// We can pick any of the set names for the merged set. Go for the
// shortest one to avoid picking the name of one of the classes that are
// artificially created by tablegen. So "FPR128_lo" instead of
@@ -1771,7 +1816,7 @@ void CodeGenRegBank::computeRegUnitSets() {
// Compute a unique RegUnitSet for each RegClass.
auto &RegClasses = getRegClasses();
for (auto &RC : RegClasses) {
- if (!RC.Allocatable)
+ if (!RC.Allocatable || RC.Artificial)
continue;
// Speculatively grow the RegUnitSets to hold the new set.
@@ -1779,7 +1824,7 @@ void CodeGenRegBank::computeRegUnitSets() {
RegUnitSets.back().Name = RC.getName();
// Compute a sorted list of units in this class.
- RC.buildRegUnitSet(RegUnitSets.back().Units);
+ RC.buildRegUnitSet(*this, RegUnitSets.back().Units);
// Find an existing RegUnitSet.
std::vector<RegUnitSet>::const_iterator SetI =
@@ -1788,29 +1833,26 @@ void CodeGenRegBank::computeRegUnitSets() {
RegUnitSets.pop_back();
}
- DEBUG(dbgs() << "\nBefore pruning:\n";
- for (unsigned USIdx = 0, USEnd = RegUnitSets.size();
- USIdx < USEnd; ++USIdx) {
- dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
- << ":";
- for (auto &U : RegUnitSets[USIdx].Units)
- printRegUnitName(U);
- dbgs() << "\n";
- });
+ LLVM_DEBUG(dbgs() << "\nBefore pruning:\n"; for (unsigned USIdx = 0,
+ USEnd = RegUnitSets.size();
+ USIdx < USEnd; ++USIdx) {
+ dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name << ":";
+ for (auto &U : RegUnitSets[USIdx].Units)
+ printRegUnitName(U);
+ dbgs() << "\n";
+ });
// Iteratively prune unit sets.
pruneUnitSets();
- DEBUG(dbgs() << "\nBefore union:\n";
- for (unsigned USIdx = 0, USEnd = RegUnitSets.size();
- USIdx < USEnd; ++USIdx) {
- dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
- << ":";
- for (auto &U : RegUnitSets[USIdx].Units)
- printRegUnitName(U);
- dbgs() << "\n";
- }
- dbgs() << "\nUnion sets:\n");
+ LLVM_DEBUG(dbgs() << "\nBefore union:\n"; for (unsigned USIdx = 0,
+ USEnd = RegUnitSets.size();
+ USIdx < USEnd; ++USIdx) {
+ dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name << ":";
+ for (auto &U : RegUnitSets[USIdx].Units)
+ printRegUnitName(U);
+ dbgs() << "\n";
+ } dbgs() << "\nUnion sets:\n");
// Iterate over all unit sets, including new ones added by this loop.
unsigned NumRegUnitSubSets = RegUnitSets.size();
@@ -1850,11 +1892,11 @@ void CodeGenRegBank::computeRegUnitSets() {
if (SetI != std::prev(RegUnitSets.end()))
RegUnitSets.pop_back();
else {
- DEBUG(dbgs() << "UnitSet " << RegUnitSets.size()-1
- << " " << RegUnitSets.back().Name << ":";
- for (auto &U : RegUnitSets.back().Units)
- printRegUnitName(U);
- dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "UnitSet " << RegUnitSets.size() - 1 << " "
+ << RegUnitSets.back().Name << ":";
+ for (auto &U
+ : RegUnitSets.back().Units) printRegUnitName(U);
+ dbgs() << "\n";);
}
}
}
@@ -1862,15 +1904,14 @@ void CodeGenRegBank::computeRegUnitSets() {
// Iteratively prune unit sets after inferring supersets.
pruneUnitSets();
- DEBUG(dbgs() << "\n";
- for (unsigned USIdx = 0, USEnd = RegUnitSets.size();
- USIdx < USEnd; ++USIdx) {
- dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
- << ":";
- for (auto &U : RegUnitSets[USIdx].Units)
- printRegUnitName(U);
- dbgs() << "\n";
- });
+ LLVM_DEBUG(
+ dbgs() << "\n"; for (unsigned USIdx = 0, USEnd = RegUnitSets.size();
+ USIdx < USEnd; ++USIdx) {
+ dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name << ":";
+ for (auto &U : RegUnitSets[USIdx].Units)
+ printRegUnitName(U);
+ dbgs() << "\n";
+ });
// For each register class, list the UnitSets that are supersets.
RegClassUnitSets.resize(RegClasses.size());
@@ -1882,26 +1923,26 @@ void CodeGenRegBank::computeRegUnitSets() {
// Recompute the sorted list of units in this class.
std::vector<unsigned> RCRegUnits;
- RC.buildRegUnitSet(RCRegUnits);
+ RC.buildRegUnitSet(*this, RCRegUnits);
// Don't increase pressure for unallocatable regclasses.
if (RCRegUnits.empty())
continue;
- DEBUG(dbgs() << "RC " << RC.getName() << " Units: \n";
- for (auto U : RCRegUnits)
- printRegUnitName(U);
- dbgs() << "\n UnitSetIDs:");
+ LLVM_DEBUG(dbgs() << "RC " << RC.getName() << " Units: \n";
+ for (auto U
+ : RCRegUnits) printRegUnitName(U);
+ dbgs() << "\n UnitSetIDs:");
// Find all supersets.
for (unsigned USIdx = 0, USEnd = RegUnitSets.size();
USIdx != USEnd; ++USIdx) {
if (isRegUnitSubSet(RCRegUnits, RegUnitSets[USIdx].Units)) {
- DEBUG(dbgs() << " " << USIdx);
+ LLVM_DEBUG(dbgs() << " " << USIdx);
RegClassUnitSets[RCIdx].push_back(USIdx);
}
}
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
assert(!RegClassUnitSets[RCIdx].empty() && "missing unit set for regclass");
}
@@ -2069,10 +2110,14 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) {
// Compute the set of registers supporting each SubRegIndex.
SubReg2SetMap SRSets;
for (const auto R : RC->getMembers()) {
+ if (R->Artificial)
+ continue;
const CodeGenRegister::SubRegMap &SRM = R->getSubRegs();
for (CodeGenRegister::SubRegMap::const_iterator I = SRM.begin(),
- E = SRM.end(); I != E; ++I)
- SRSets[I->first].push_back(R);
+ E = SRM.end(); I != E; ++I) {
+ if (!I->first->Artificial)
+ SRSets[I->first].push_back(R);
+ }
}
for (auto I : SRSets)
@@ -2081,6 +2126,8 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) {
// Find matching classes for all SRSets entries. Iterate in SubRegIndex
// numerical order to visit synthetic indices last.
for (const auto &SubIdx : SubRegIndices) {
+ if (SubIdx.Artificial)
+ continue;
SubReg2SetMap::const_iterator I = SRSets.find(&SubIdx);
// Unsupported SubRegIndex. Skip it.
if (I == SRSets.end())
@@ -2137,6 +2184,8 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
for (auto I = FirstSubRegRC, E = std::prev(RegClasses.end());
I != std::next(E); ++I) {
CodeGenRegisterClass &SubRC = *I;
+ if (SubRC.Artificial)
+ continue;
// Topological shortcut: SubRC members have the wrong shape.
if (!TopoSigs.anyCommon(SubRC.getTopoSigs()))
continue;
@@ -2182,6 +2231,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() {
// Watch out for iterator invalidation here.
for (auto I = RegClasses.begin(), E = RegClasses.end(); I != E; ++I) {
CodeGenRegisterClass *RC = &*I;
+ if (RC->Artificial)
+ continue;
// Synthesize answers for getSubClassWithSubReg().
inferSubClassWithSubReg(RC);
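Among the CodeGenRegisters.cpp hunks above, TupleExpander previously created synthesized Records with plain new and never freed them; it now appends them to SynthDefs, a vector of unique_ptr owned by CodeGenRegBank, and hands out only borrowed pointers. A compact sketch of that ownership split, with illustrative types in place of Record and the register bank:

    #include <memory>
    #include <string>
    #include <vector>

    struct Def { std::string Name; };

    struct Expander {
      // Borrowed storage; the owner keeps the unique_ptrs alive for as long
      // as the raw pointers handed out below remain in use.
      std::vector<std::unique_ptr<Def>> &Storage;
      explicit Expander(std::vector<std::unique_ptr<Def>> &S) : Storage(S) {}

      Def *synthesize(std::string Name) {
        Storage.push_back(std::make_unique<Def>(Def{std::move(Name)}));
        return Storage.back().get(); // non-owning pointer into Storage
      }
    };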
diff --git a/contrib/llvm/utils/TableGen/CodeGenRegisters.h b/contrib/llvm/utils/TableGen/CodeGenRegisters.h
index f2f1e6971af9..32aa33c80b3a 100644
--- a/contrib/llvm/utils/TableGen/CodeGenRegisters.h
+++ b/contrib/llvm/utils/TableGen/CodeGenRegisters.h
@@ -19,16 +19,16 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/SetTheory.h"
#include <cassert>
@@ -80,6 +80,10 @@ namespace llvm {
// Are all super-registers containing this SubRegIndex covered by their
// sub-registers?
bool AllSuperRegsCovered;
+ // A subregister index is "artificial" if every subregister obtained
+ // from applying this index is artificial. Artificial subregister
+ // indexes are not used to create new register classes.
+ bool Artificial;
CodeGenSubRegIndex(Record *R, unsigned Enum);
CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum);
@@ -150,6 +154,7 @@ namespace llvm {
unsigned CostPerUse;
bool CoveredBySubRegs;
bool HasDisjunctSubRegs;
+ bool Artificial;
// Map SubRegIndex -> Register.
typedef std::map<CodeGenSubRegIndex *, CodeGenRegister *, deref<llvm::less>>
@@ -331,6 +336,8 @@ namespace llvm {
/// True if there are at least 2 subregisters which do not interfere.
bool HasDisjunctSubRegs;
bool CoveredBySubRegs;
+ /// A register class is artificial if all its members are artificial.
+ bool Artificial;
// Return the Record that defined this class, or NULL if the class was
// created by TableGen.
@@ -427,7 +434,8 @@ namespace llvm {
const BitVector &getTopoSigs() const { return TopoSigs; }
// Populate a unique sorted list of units from a register set.
- void buildRegUnitSet(std::vector<unsigned> &RegUnits) const;
+ void buildRegUnitSet(const CodeGenRegBank &RegBank,
+ std::vector<unsigned> &RegUnits) const;
CodeGenRegisterClass(CodeGenRegBank&, Record *R);
@@ -475,8 +483,11 @@ namespace llvm {
// Index into RegClassUnitSets where we can find the list of UnitSets that
// contain this unit.
unsigned RegClassUnitSetsIdx;
+ // A register unit is artificial if at least one of its roots is
+ // artificial.
+ bool Artificial;
- RegUnit() : Weight(0), RegClassUnitSetsIdx(0) {
+ RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) {
Roots[0] = Roots[1] = nullptr;
}
@@ -551,6 +562,9 @@ namespace llvm {
// Give each register unit set an order based on sorting criteria.
std::vector<unsigned> RegUnitSetOrder;
+ // Keep track of synthesized definitions generated in TupleExpander.
+ std::vector<std::unique_ptr<Record>> SynthDefs;
+
// Add RC to *2RC maps.
void addToMaps(CodeGenRegisterClass*);
@@ -648,8 +662,12 @@ namespace llvm {
// registers.
unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) {
RegUnits.resize(RegUnits.size() + 1);
- RegUnits.back().Roots[0] = R0;
- RegUnits.back().Roots[1] = R1;
+ RegUnit &RU = RegUnits.back();
+ RU.Roots[0] = R0;
+ RU.Roots[1] = R1;
+ RU.Artificial = R0->Artificial;
+ if (R1)
+ RU.Artificial |= R1->Artificial;
return RegUnits.size() - 1;
}
diff --git a/contrib/llvm/utils/TableGen/CodeGenSchedule.cpp b/contrib/llvm/utils/TableGen/CodeGenSchedule.cpp
index b753e19a5443..9331fadf4099 100644
--- a/contrib/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/contrib/llvm/utils/TableGen/CodeGenSchedule.cpp
@@ -12,17 +12,18 @@
//
//===----------------------------------------------------------------------===//
-#include "CodeGenInstruction.h"
#include "CodeGenSchedule.h"
+#include "CodeGenInstruction.h"
#include "CodeGenTarget.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
#include <algorithm>
#include <iterator>
@@ -50,38 +51,113 @@ struct InstrsOp : public SetTheory::Operator {
};
// (instregex "OpcPat",...) Find all instructions matching an opcode pattern.
-//
-// TODO: Since this is a prefix match, perform a binary search over the
-// instruction names using lower_bound. Note that the predefined instrs must be
-// scanned linearly first. However, this is only safe if the regex pattern has
-// no top-level bars. The DAG already has a list of patterns, so there's no
-// reason to use top-level bars, but we need a way to verify they don't exist
-// before implementing the optimization.
struct InstRegexOp : public SetTheory::Operator {
const CodeGenTarget &Target;
InstRegexOp(const CodeGenTarget &t): Target(t) {}
+ /// Remove any text inside of parentheses from S.
+ static std::string removeParens(llvm::StringRef S) {
+ std::string Result;
+ unsigned Paren = 0;
+ // NB: We don't care about escaped parens here.
+ for (char C : S) {
+ switch (C) {
+ case '(':
+ ++Paren;
+ break;
+ case ')':
+ --Paren;
+ break;
+ default:
+ if (Paren == 0)
+ Result += C;
+ }
+ }
+ return Result;
+ }
+
void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
ArrayRef<SMLoc> Loc) override {
- SmallVector<Regex, 4> RegexList;
+ ArrayRef<const CodeGenInstruction *> Instructions =
+ Target.getInstructionsByEnumValue();
+
+ unsigned NumGeneric = Target.getNumFixedInstructions();
+ unsigned NumPseudos = Target.getNumPseudoInstructions();
+ auto Generics = Instructions.slice(0, NumGeneric);
+ auto Pseudos = Instructions.slice(NumGeneric, NumPseudos);
+ auto NonPseudos = Instructions.slice(NumGeneric + NumPseudos);
+
for (Init *Arg : make_range(Expr->arg_begin(), Expr->arg_end())) {
StringInit *SI = dyn_cast<StringInit>(Arg);
if (!SI)
- PrintFatalError(Loc, "instregex requires pattern string: "
- + Expr->getAsString());
- std::string pat = SI->getValue();
- // Implement a python-style prefix match.
- if (pat[0] != '^') {
- pat.insert(0, "^(");
- pat.insert(pat.end(), ')');
+ PrintFatalError(Loc, "instregex requires pattern string: " +
+ Expr->getAsString());
+ StringRef Original = SI->getValue();
+
+ // Extract a prefix that we can binary search on.
+ static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
+ auto FirstMeta = Original.find_first_of(RegexMetachars);
+
+ // Look for top-level | or ?. We cannot optimize them to binary search.
+ if (removeParens(Original).find_first_of("|?") != std::string::npos)
+ FirstMeta = 0;
+
+ Optional<Regex> Regexpr = None;
+ StringRef Prefix = Original.substr(0, FirstMeta);
+ StringRef PatStr = Original.substr(FirstMeta);
+ if (!PatStr.empty()) {
+ // For the rest use a python-style prefix match.
+ std::string pat = PatStr;
+ if (pat[0] != '^') {
+ pat.insert(0, "^(");
+ pat.insert(pat.end(), ')');
+ }
+ Regexpr = Regex(pat);
}
- RegexList.push_back(Regex(pat));
- }
- for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
- for (auto &R : RegexList) {
- if (R.match(Inst->TheDef->getName()))
+
+ int NumMatches = 0;
+
+ // The generic opcodes are unsorted, handle them manually.
+ for (auto *Inst : Generics) {
+ StringRef InstName = Inst->TheDef->getName();
+ if (InstName.startswith(Prefix) &&
+ (!Regexpr || Regexpr->match(InstName.substr(Prefix.size())))) {
Elts.insert(Inst->TheDef);
+ NumMatches++;
+ }
}
+
+ // Target instructions are split into two ranges: pseudo instructions
+ // first, then non-pseudos. Each range is in lexicographical order
+ // sorted by name. Find the sub-ranges that start with our prefix.
+ struct Comp {
+ bool operator()(const CodeGenInstruction *LHS, StringRef RHS) {
+ return LHS->TheDef->getName() < RHS;
+ }
+ bool operator()(StringRef LHS, const CodeGenInstruction *RHS) {
+ return LHS < RHS->TheDef->getName() &&
+ !RHS->TheDef->getName().startswith(LHS);
+ }
+ };
+ auto Range1 =
+ std::equal_range(Pseudos.begin(), Pseudos.end(), Prefix, Comp());
+ auto Range2 = std::equal_range(NonPseudos.begin(), NonPseudos.end(),
+ Prefix, Comp());
+
+ // For these ranges we know that instruction names start with the prefix.
+ // Apply the remaining regex, if any, to the part after the prefix.
+ const auto HandleNonGeneric = [&](const CodeGenInstruction *Inst) {
+ StringRef InstName = Inst->TheDef->getName();
+ if (!Regexpr || Regexpr->match(InstName.substr(Prefix.size()))) {
+ Elts.insert(Inst->TheDef);
+ NumMatches++;
+ }
+ };
+ std::for_each(Range1.first, Range1.second, HandleNonGeneric);
+ std::for_each(Range2.first, Range2.second, HandleNonGeneric);
+
+ if (0 == NumMatches)
+ PrintFatalError(Loc, "instregex has no matches: " + Original);
}
}
};
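The prefix/suffix split and the sub-range lookup above can be hard to picture from the diff alone. The following standalone sketch is an editorial illustration only (not part of the patch, and the opcode names are invented): it reproduces the same idea with plain std::string and the standard library, narrowing a sorted name list to the block that shares the literal prefix (the effect of the two-overload Comp comparator) and running only the remainder of each name through the regex, relying on the sorted ordering that getInstructionsByEnumValue() guarantees for the pseudo and non-pseudo sub-ranges.

#include <algorithm>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  // Sorted by name, as the pseudo and non-pseudo sub-ranges are.
  std::vector<std::string> Names = {"ADC32rr", "ADD32ri", "ADD32rr", "AND32rr"};

  // Suppose the pattern was (instregex "ADD32r.*"):
  // Prefix = "ADD32r", remaining regex = "^(.*)".
  std::string Prefix = "ADD32r";
  std::regex Rest("^(.*)");

  // lower_bound: first name not lexicographically less than the prefix.
  auto First = std::lower_bound(Names.begin(), Names.end(), Prefix);
  // upper_bound: a name that merely starts with the prefix does not count as
  // "greater", so the range extends over every name sharing the prefix.
  auto Last = std::upper_bound(
      Names.begin(), Names.end(), Prefix,
      [](const std::string &P, const std::string &Name) {
        return P < Name && Name.compare(0, P.size(), P) != 0;
      });

  for (auto I = First; I != Last; ++I)
    if (std::regex_search(I->substr(Prefix.size()), Rest))
      std::cout << *I << "\n"; // prints ADD32ri and ADD32rr
}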
@@ -139,16 +215,49 @@ CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
// Populate each CodeGenProcModel's WriteResDefs, ReadAdvanceDefs, and
// ProcResourceDefs.
- DEBUG(dbgs() << "\n+++ RESOURCE DEFINITIONS (collectProcResources) +++\n");
+ LLVM_DEBUG(
+ dbgs() << "\n+++ RESOURCE DEFINITIONS (collectProcResources) +++\n");
collectProcResources();
+ // Collect optional processor information.
+ collectOptionalProcessorInfo();
+
+ checkCompleteness();
+}
+
+void CodeGenSchedModels::collectRetireControlUnits() {
+ RecVec Units = Records.getAllDerivedDefinitions("RetireControlUnit");
+
+ for (Record *RCU : Units) {
+ CodeGenProcModel &PM = getProcModel(RCU->getValueAsDef("SchedModel"));
+ if (PM.RetireControlUnit) {
+ PrintError(RCU->getLoc(),
+ "Expected a single RetireControlUnit definition");
+ PrintNote(PM.RetireControlUnit->getLoc(),
+ "Previous definition of RetireControlUnit was here");
+ }
+ PM.RetireControlUnit = RCU;
+ }
+}
+
+/// Collect optional processor information.
+void CodeGenSchedModels::collectOptionalProcessorInfo() {
+ // Find register file definitions for each processor.
+ collectRegisterFiles();
+
+ // Collect processor RetireControlUnit descriptors if available.
+ collectRetireControlUnits();
+
+ // Find pfm counter definitions for each processor.
+ collectPfmCounters();
+
checkCompleteness();
}
/// Gather all processor models.
void CodeGenSchedModels::collectProcModels() {
RecVec ProcRecords = Records.getAllDerivedDefinitions("Processor");
- std::sort(ProcRecords.begin(), ProcRecords.end(), LessRecordFieldName());
+ llvm::sort(ProcRecords.begin(), ProcRecords.end(), LessRecordFieldName());
// Reserve space because we can. Reallocation would be ok.
ProcModels.reserve(ProcRecords.size()+1);
@@ -160,7 +269,7 @@ void CodeGenSchedModels::collectProcModels() {
ProcModelMap[NoModelDef] = 0;
// For each processor, find a unique machine model.
- DEBUG(dbgs() << "+++ PROCESSOR MODELs (addProcModel) +++\n");
+ LLVM_DEBUG(dbgs() << "+++ PROCESSOR MODELs (addProcModel) +++\n");
for (Record *ProcRecord : ProcRecords)
addProcModel(ProcRecord);
}
@@ -184,7 +293,7 @@ void CodeGenSchedModels::addProcModel(Record *ProcDef) {
ProcModels.emplace_back(ProcModels.size(), Name,
ProcDef->getValueAsDef("SchedModel"), ModelKey);
}
- DEBUG(ProcModels.back().dump());
+ LLVM_DEBUG(ProcModels.back().dump());
}
// Recursively find all reachable SchedReadWrite records.
@@ -267,7 +376,7 @@ void CodeGenSchedModels::collectSchedRW() {
// Find all ReadWrites referenced by SchedAlias. AliasDefs needs to be sorted
// for the loop below that initializes Alias vectors.
RecVec AliasDefs = Records.getAllDerivedDefinitions("SchedAlias");
- std::sort(AliasDefs.begin(), AliasDefs.end(), LessRecord());
+ llvm::sort(AliasDefs.begin(), AliasDefs.end(), LessRecord());
for (Record *ADef : AliasDefs) {
Record *MatchDef = ADef->getValueAsDef("MatchRW");
Record *AliasDef = ADef->getValueAsDef("AliasRW");
@@ -285,12 +394,12 @@ void CodeGenSchedModels::collectSchedRW() {
}
// Sort and add the SchedReadWrites directly referenced by instructions or
// itinerary resources. Index reads and writes in separate domains.
- std::sort(SWDefs.begin(), SWDefs.end(), LessRecord());
+ llvm::sort(SWDefs.begin(), SWDefs.end(), LessRecord());
for (Record *SWDef : SWDefs) {
assert(!getSchedRWIdx(SWDef, /*IsRead=*/false) && "duplicate SchedWrite");
SchedWrites.emplace_back(SchedWrites.size(), SWDef);
}
- std::sort(SRDefs.begin(), SRDefs.end(), LessRecord());
+ llvm::sort(SRDefs.begin(), SRDefs.end(), LessRecord());
for (Record *SRDef : SRDefs) {
 assert(!getSchedRWIdx(SRDef, /*IsRead=*/true) && "duplicate SchedWrite");
SchedReads.emplace_back(SchedReads.size(), SRDef);
@@ -312,26 +421,26 @@ void CodeGenSchedModels::collectSchedRW() {
PrintFatalError(ADef->getLoc(), "Cannot Alias an Alias");
RW.Aliases.push_back(ADef);
}
- DEBUG(
- dbgs() << "\n+++ SCHED READS and WRITES (collectSchedRW) +++\n";
- for (unsigned WIdx = 0, WEnd = SchedWrites.size(); WIdx != WEnd; ++WIdx) {
- dbgs() << WIdx << ": ";
- SchedWrites[WIdx].dump();
- dbgs() << '\n';
- }
- for (unsigned RIdx = 0, REnd = SchedReads.size(); RIdx != REnd; ++RIdx) {
- dbgs() << RIdx << ": ";
- SchedReads[RIdx].dump();
- dbgs() << '\n';
- }
- RecVec RWDefs = Records.getAllDerivedDefinitions("SchedReadWrite");
- for (Record *RWDef : RWDefs) {
- if (!getSchedRWIdx(RWDef, RWDef->isSubClassOf("SchedRead"))) {
- const std::string &Name = RWDef->getName();
- if (Name != "NoWrite" && Name != "ReadDefault")
- dbgs() << "Unused SchedReadWrite " << RWDef->getName() << '\n';
- }
- });
+ LLVM_DEBUG(
+ dbgs() << "\n+++ SCHED READS and WRITES (collectSchedRW) +++\n";
+ for (unsigned WIdx = 0, WEnd = SchedWrites.size(); WIdx != WEnd; ++WIdx) {
+ dbgs() << WIdx << ": ";
+ SchedWrites[WIdx].dump();
+ dbgs() << '\n';
+ } for (unsigned RIdx = 0, REnd = SchedReads.size(); RIdx != REnd;
+ ++RIdx) {
+ dbgs() << RIdx << ": ";
+ SchedReads[RIdx].dump();
+ dbgs() << '\n';
+ } RecVec RWDefs = Records.getAllDerivedDefinitions("SchedReadWrite");
+ for (Record *RWDef
+ : RWDefs) {
+ if (!getSchedRWIdx(RWDef, RWDef->isSubClassOf("SchedRead"))) {
+ StringRef Name = RWDef->getName();
+ if (Name != "NoWrite" && Name != "ReadDefault")
+ dbgs() << "Unused SchedReadWrite " << Name << '\n';
+ }
+ });
}
/// Compute a SchedWrite name from a sequence of writes.
@@ -346,16 +455,12 @@ std::string CodeGenSchedModels::genRWName(ArrayRef<unsigned> Seq, bool IsRead) {
return Name;
}
-unsigned CodeGenSchedModels::getSchedRWIdx(Record *Def, bool IsRead,
- unsigned After) const {
+unsigned CodeGenSchedModels::getSchedRWIdx(const Record *Def,
+ bool IsRead) const {
const std::vector<CodeGenSchedRW> &RWVec = IsRead ? SchedReads : SchedWrites;
- assert(After < RWVec.size() && "start position out of bounds");
- for (std::vector<CodeGenSchedRW>::const_iterator I = RWVec.begin() + After,
- E = RWVec.end(); I != E; ++I) {
- if (I->TheDef == Def)
- return I - RWVec.begin();
- }
- return 0;
+ const auto I = find_if(
+ RWVec, [Def](const CodeGenSchedRW &RW) { return RW.TheDef == Def; });
+ return I == RWVec.end() ? 0 : std::distance(RWVec.begin(), I);
}
bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const {
@@ -372,10 +477,8 @@ bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const {
return false;
}
-namespace llvm {
-
-void splitSchedReadWrites(const RecVec &RWDefs,
- RecVec &WriteDefs, RecVec &ReadDefs) {
+static void splitSchedReadWrites(const RecVec &RWDefs,
+ RecVec &WriteDefs, RecVec &ReadDefs) {
for (Record *RWDef : RWDefs) {
if (RWDef->isSubClassOf("SchedWrite"))
WriteDefs.push_back(RWDef);
@@ -386,16 +489,14 @@ void splitSchedReadWrites(const RecVec &RWDefs,
}
}
-} // end namespace llvm
-
// Split the SchedReadWrites defs and call findRWs for each list.
void CodeGenSchedModels::findRWs(const RecVec &RWDefs,
IdxVec &Writes, IdxVec &Reads) const {
- RecVec WriteDefs;
- RecVec ReadDefs;
- splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs);
- findRWs(WriteDefs, Writes, false);
- findRWs(ReadDefs, Reads, true);
+ RecVec WriteDefs;
+ RecVec ReadDefs;
+ splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs);
+ findRWs(WriteDefs, Writes, false);
+ findRWs(ReadDefs, Reads, true);
}
// Call getSchedRWIdx for all elements in a sequence of SchedRW defs.
@@ -432,11 +533,10 @@ void CodeGenSchedModels::expandRWSeqForProc(
const CodeGenSchedRW &SchedWrite = getSchedRW(RWIdx, IsRead);
Record *AliasDef = nullptr;
- for (RecIter AI = SchedWrite.Aliases.begin(), AE = SchedWrite.Aliases.end();
- AI != AE; ++AI) {
- const CodeGenSchedRW &AliasRW = getSchedRW((*AI)->getValueAsDef("AliasRW"));
- if ((*AI)->getValueInit("SchedModel")->isComplete()) {
- Record *ModelDef = (*AI)->getValueAsDef("SchedModel");
+ for (const Record *Rec : SchedWrite.Aliases) {
+ const CodeGenSchedRW &AliasRW = getSchedRW(Rec->getValueAsDef("AliasRW"));
+ if (Rec->getValueInit("SchedModel")->isComplete()) {
+ Record *ModelDef = Rec->getValueAsDef("SchedModel");
if (&getProcModel(ModelDef) != &ProcModel)
continue;
}
@@ -457,9 +557,9 @@ void CodeGenSchedModels::expandRWSeqForProc(
}
int Repeat =
SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1;
- for (int i = 0; i < Repeat; ++i) {
- for (unsigned I : SchedWrite.Sequence) {
- expandRWSeqForProc(I, RWSeq, IsRead, ProcModel);
+ for (int I = 0, E = Repeat; I < E; ++I) {
+ for (unsigned Idx : SchedWrite.Sequence) {
+ expandRWSeqForProc(Idx, RWSeq, IsRead, ProcModel);
}
}
}
@@ -469,13 +569,11 @@ unsigned CodeGenSchedModels::findRWForSequence(ArrayRef<unsigned> Seq,
bool IsRead) {
std::vector<CodeGenSchedRW> &RWVec = IsRead ? SchedReads : SchedWrites;
- for (std::vector<CodeGenSchedRW>::iterator I = RWVec.begin(), E = RWVec.end();
- I != E; ++I) {
- if (makeArrayRef(I->Sequence) == Seq)
- return I - RWVec.begin();
- }
+ auto I = find_if(RWVec, [Seq](CodeGenSchedRW &RW) {
+ return makeArrayRef(RW.Sequence) == Seq;
+ });
// Index zero reserved for invalid RW.
- return 0;
+ return I == RWVec.end() ? 0 : std::distance(RWVec.begin(), I);
}
/// Add this ReadWrite if it doesn't already exist.
@@ -489,12 +587,10 @@ unsigned CodeGenSchedModels::findOrInsertRW(ArrayRef<unsigned> Seq,
if (Idx)
return Idx;
- unsigned RWIdx = IsRead ? SchedReads.size() : SchedWrites.size();
+ std::vector<CodeGenSchedRW> &RWVec = IsRead ? SchedReads : SchedWrites;
+ unsigned RWIdx = RWVec.size();
CodeGenSchedRW SchedRW(RWIdx, IsRead, Seq, genRWName(Seq, IsRead));
- if (IsRead)
- SchedReads.push_back(SchedRW);
- else
- SchedWrites.push_back(SchedRW);
+ RWVec.push_back(SchedRW);
return RWIdx;
}
@@ -504,10 +600,9 @@ unsigned CodeGenSchedModels::findOrInsertRW(ArrayRef<unsigned> Seq,
void CodeGenSchedModels::collectSchedClasses() {
// NoItinerary is always the first class at Idx=0
- SchedClasses.resize(1);
- SchedClasses.back().Index = 0;
- SchedClasses.back().Name = "NoInstrModel";
- SchedClasses.back().ItinClassDef = Records.getDef("NoItinerary");
+ assert(SchedClasses.empty() && "Expected empty sched class");
+ SchedClasses.emplace_back(0, "NoInstrModel",
+ Records.getDef("NoItinerary"));
SchedClasses.back().ProcIndices.push_back(0);
// Create a SchedClass for each unique combination of itinerary class and
@@ -519,32 +614,34 @@ void CodeGenSchedModels::collectSchedClasses() {
findRWs(Inst->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
// ProcIdx == 0 indicates the class applies to all processors.
- IdxVec ProcIndices(1, 0);
-
- unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, ProcIndices);
+ unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/{0});
InstrClassMap[Inst->TheDef] = SCIdx;
}
// Create classes for InstRW defs.
RecVec InstRWDefs = Records.getAllDerivedDefinitions("InstRW");
- std::sort(InstRWDefs.begin(), InstRWDefs.end(), LessRecord());
- DEBUG(dbgs() << "\n+++ SCHED CLASSES (createInstRWClass) +++\n");
+ llvm::sort(InstRWDefs.begin(), InstRWDefs.end(), LessRecord());
+ LLVM_DEBUG(dbgs() << "\n+++ SCHED CLASSES (createInstRWClass) +++\n");
for (Record *RWDef : InstRWDefs)
createInstRWClass(RWDef);
NumInstrSchedClasses = SchedClasses.size();
bool EnableDump = false;
- DEBUG(EnableDump = true);
+ LLVM_DEBUG(EnableDump = true);
if (!EnableDump)
return;
- dbgs() << "\n+++ ITINERARIES and/or MACHINE MODELS (collectSchedClasses) +++\n";
+ LLVM_DEBUG(
+ dbgs()
+ << "\n+++ ITINERARIES and/or MACHINE MODELS (collectSchedClasses) +++\n");
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
StringRef InstName = Inst->TheDef->getName();
- unsigned SCIdx = InstrClassMap.lookup(Inst->TheDef);
+ unsigned SCIdx = getSchedClassIdx(*Inst);
if (!SCIdx) {
- if (!Inst->hasNoSchedulingInfo)
- dbgs() << "No machine model for " << Inst->TheDef->getName() << '\n';
+ LLVM_DEBUG({
+ if (!Inst->hasNoSchedulingInfo)
+ dbgs() << "No machine model for " << Inst->TheDef->getName() << '\n';
+ });
continue;
}
CodeGenSchedClass &SC = getSchedClass(SCIdx);
@@ -560,58 +657,51 @@ void CodeGenSchedModels::collectSchedClasses() {
}
if (!SC.Writes.empty()) {
ProcIndices.push_back(0);
- dbgs() << "SchedRW machine model for " << InstName;
- for (IdxIter WI = SC.Writes.begin(), WE = SC.Writes.end(); WI != WE; ++WI)
- dbgs() << " " << SchedWrites[*WI].Name;
- for (IdxIter RI = SC.Reads.begin(), RE = SC.Reads.end(); RI != RE; ++RI)
- dbgs() << " " << SchedReads[*RI].Name;
- dbgs() << '\n';
+ LLVM_DEBUG({
+ dbgs() << "SchedRW machine model for " << InstName;
+ for (IdxIter WI = SC.Writes.begin(), WE = SC.Writes.end(); WI != WE;
+ ++WI)
+ dbgs() << " " << SchedWrites[*WI].Name;
+ for (IdxIter RI = SC.Reads.begin(), RE = SC.Reads.end(); RI != RE; ++RI)
+ dbgs() << " " << SchedReads[*RI].Name;
+ dbgs() << '\n';
+ });
}
const RecVec &RWDefs = SchedClasses[SCIdx].InstRWs;
for (Record *RWDef : RWDefs) {
const CodeGenProcModel &ProcModel =
- getProcModel(RWDef->getValueAsDef("SchedModel"));
+ getProcModel(RWDef->getValueAsDef("SchedModel"));
ProcIndices.push_back(ProcModel.Index);
- dbgs() << "InstRW on " << ProcModel.ModelName << " for " << InstName;
+ LLVM_DEBUG(dbgs() << "InstRW on " << ProcModel.ModelName << " for "
+ << InstName);
IdxVec Writes;
IdxVec Reads;
findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"),
Writes, Reads);
- for (unsigned WIdx : Writes)
- dbgs() << " " << SchedWrites[WIdx].Name;
- for (unsigned RIdx : Reads)
- dbgs() << " " << SchedReads[RIdx].Name;
- dbgs() << '\n';
+ LLVM_DEBUG({
+ for (unsigned WIdx : Writes)
+ dbgs() << " " << SchedWrites[WIdx].Name;
+ for (unsigned RIdx : Reads)
+ dbgs() << " " << SchedReads[RIdx].Name;
+ dbgs() << '\n';
+ });
}
// If ProcIndices contains zero, the class applies to all processors.
- if (!std::count(ProcIndices.begin(), ProcIndices.end(), 0)) {
- for (const CodeGenProcModel &PM : ProcModels) {
- if (!std::count(ProcIndices.begin(), ProcIndices.end(), PM.Index))
- dbgs() << "No machine model for " << Inst->TheDef->getName()
- << " on processor " << PM.ModelName << '\n';
+ LLVM_DEBUG({
+ if (!std::count(ProcIndices.begin(), ProcIndices.end(), 0)) {
+ for (const CodeGenProcModel &PM : ProcModels) {
+ if (!std::count(ProcIndices.begin(), ProcIndices.end(), PM.Index))
+ dbgs() << "No machine model for " << Inst->TheDef->getName()
+ << " on processor " << PM.ModelName << '\n';
+ }
}
- }
- }
-}
-
-/// Find an SchedClass that has been inferred from a per-operand list of
-/// SchedWrites and SchedReads.
-unsigned CodeGenSchedModels::findSchedClassIdx(Record *ItinClassDef,
- ArrayRef<unsigned> Writes,
- ArrayRef<unsigned> Reads) const {
- for (SchedClassIter I = schedClassBegin(), E = schedClassEnd(); I != E; ++I) {
- if (I->ItinClassDef == ItinClassDef && makeArrayRef(I->Writes) == Writes &&
- makeArrayRef(I->Reads) == Reads) {
- return I - schedClassBegin();
- }
+ });
}
- return 0;
}
// Get the SchedClass index for an instruction.
-unsigned CodeGenSchedModels::getSchedClassIdx(
- const CodeGenInstruction &Inst) const {
-
+unsigned
+CodeGenSchedModels::getSchedClassIdx(const CodeGenInstruction &Inst) const {
return InstrClassMap.lookup(Inst.TheDef);
}
@@ -655,22 +745,27 @@ unsigned CodeGenSchedModels::addSchedClass(Record *ItinClassDef,
ArrayRef<unsigned> ProcIndices) {
assert(!ProcIndices.empty() && "expect at least one ProcIdx");
- unsigned Idx = findSchedClassIdx(ItinClassDef, OperWrites, OperReads);
+ auto IsKeyEqual = [=](const CodeGenSchedClass &SC) {
+ return SC.isKeyEqual(ItinClassDef, OperWrites, OperReads);
+ };
+
+ auto I = find_if(make_range(schedClassBegin(), schedClassEnd()), IsKeyEqual);
+ unsigned Idx = I == schedClassEnd() ? 0 : std::distance(schedClassBegin(), I);
if (Idx || SchedClasses[0].isKeyEqual(ItinClassDef, OperWrites, OperReads)) {
IdxVec PI;
std::set_union(SchedClasses[Idx].ProcIndices.begin(),
SchedClasses[Idx].ProcIndices.end(),
ProcIndices.begin(), ProcIndices.end(),
std::back_inserter(PI));
- SchedClasses[Idx].ProcIndices.swap(PI);
+ SchedClasses[Idx].ProcIndices = std::move(PI);
return Idx;
}
Idx = SchedClasses.size();
- SchedClasses.resize(Idx+1);
+ SchedClasses.emplace_back(Idx,
+ createSchedClassName(ItinClassDef, OperWrites,
+ OperReads),
+ ItinClassDef);
CodeGenSchedClass &SC = SchedClasses.back();
- SC.Index = Idx;
- SC.Name = createSchedClassName(ItinClassDef, OperWrites, OperReads);
- SC.ItinClassDef = ItinClassDef;
SC.Writes = OperWrites;
SC.Reads = OperReads;
SC.ProcIndices = ProcIndices;
@@ -685,106 +780,104 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
// intersects with an existing class via a previous InstRWDef. Instrs that do
// not intersect with an existing class refer back to their former class as
// determined from ItinDef or SchedRW.
- SmallVector<std::pair<unsigned, SmallVector<Record *, 8>>, 4> ClassInstrs;
+ SmallMapVector<unsigned, SmallVector<Record *, 8>, 4> ClassInstrs;
// Sort Instrs into sets.
const RecVec *InstDefs = Sets.expand(InstRWDef);
if (InstDefs->empty())
PrintFatalError(InstRWDef->getLoc(), "No matching instruction opcodes");
- for (Record *InstDef : make_range(InstDefs->begin(), InstDefs->end())) {
+ for (Record *InstDef : *InstDefs) {
InstClassMapTy::const_iterator Pos = InstrClassMap.find(InstDef);
if (Pos == InstrClassMap.end())
PrintFatalError(InstDef->getLoc(), "No sched class for instruction.");
unsigned SCIdx = Pos->second;
- unsigned CIdx = 0, CEnd = ClassInstrs.size();
- for (; CIdx != CEnd; ++CIdx) {
- if (ClassInstrs[CIdx].first == SCIdx)
- break;
- }
- if (CIdx == CEnd) {
- ClassInstrs.resize(CEnd + 1);
- ClassInstrs[CIdx].first = SCIdx;
- }
- ClassInstrs[CIdx].second.push_back(InstDef);
+ ClassInstrs[SCIdx].push_back(InstDef);
}
// For each set of Instrs, create a new class if necessary, and map or remap
// the Instrs to it.
- unsigned CIdx = 0, CEnd = ClassInstrs.size();
- for (; CIdx != CEnd; ++CIdx) {
- unsigned OldSCIdx = ClassInstrs[CIdx].first;
- ArrayRef<Record*> InstDefs = ClassInstrs[CIdx].second;
+ for (auto &Entry : ClassInstrs) {
+ unsigned OldSCIdx = Entry.first;
+ ArrayRef<Record*> InstDefs = Entry.second;
 // If all the instrs in the current class are accounted for, then leave
// them mapped to their old class.
if (OldSCIdx) {
const RecVec &RWDefs = SchedClasses[OldSCIdx].InstRWs;
if (!RWDefs.empty()) {
const RecVec *OrigInstDefs = Sets.expand(RWDefs[0]);
- unsigned OrigNumInstrs = 0;
- for (Record *OIDef : make_range(OrigInstDefs->begin(), OrigInstDefs->end())) {
- if (InstrClassMap[OIDef] == OldSCIdx)
- ++OrigNumInstrs;
- }
+ unsigned OrigNumInstrs =
+ count_if(*OrigInstDefs, [&](Record *OIDef) {
+ return InstrClassMap[OIDef] == OldSCIdx;
+ });
if (OrigNumInstrs == InstDefs.size()) {
assert(SchedClasses[OldSCIdx].ProcIndices[0] == 0 &&
"expected a generic SchedClass");
- DEBUG(dbgs() << "InstRW: Reuse SC " << OldSCIdx << ":"
- << SchedClasses[OldSCIdx].Name << " on "
- << InstRWDef->getValueAsDef("SchedModel")->getName() << "\n");
+ Record *RWModelDef = InstRWDef->getValueAsDef("SchedModel");
+ // Make sure we didn't already have an InstRW containing this
+ // instruction on this model.
+ for (Record *RWD : RWDefs) {
+ if (RWD->getValueAsDef("SchedModel") == RWModelDef &&
+ RWModelDef->getValueAsBit("FullInstRWOverlapCheck")) {
+ for (Record *Inst : InstDefs) {
+ PrintFatalError(InstRWDef->getLoc(), "Overlapping InstRW def " +
+ Inst->getName() + " also matches " +
+ RWD->getValue("Instrs")->getValue()->getAsString());
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "InstRW: Reuse SC " << OldSCIdx << ":"
+ << SchedClasses[OldSCIdx].Name << " on "
+ << RWModelDef->getName() << "\n");
SchedClasses[OldSCIdx].InstRWs.push_back(InstRWDef);
continue;
}
}
}
unsigned SCIdx = SchedClasses.size();
- SchedClasses.resize(SCIdx+1);
+ SchedClasses.emplace_back(SCIdx, createSchedClassName(InstDefs), nullptr);
CodeGenSchedClass &SC = SchedClasses.back();
- SC.Index = SCIdx;
- SC.Name = createSchedClassName(InstDefs);
- DEBUG(dbgs() << "InstRW: New SC " << SCIdx << ":" << SC.Name << " on "
- << InstRWDef->getValueAsDef("SchedModel")->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "InstRW: New SC " << SCIdx << ":" << SC.Name << " on "
+ << InstRWDef->getValueAsDef("SchedModel")->getName()
+ << "\n");
// Preserve ItinDef and Writes/Reads for processors without an InstRW entry.
SC.ItinClassDef = SchedClasses[OldSCIdx].ItinClassDef;
SC.Writes = SchedClasses[OldSCIdx].Writes;
SC.Reads = SchedClasses[OldSCIdx].Reads;
SC.ProcIndices.push_back(0);
- // Map each Instr to this new class.
- // Note that InstDefs may be a smaller list than InstRWDef's "Instrs".
- Record *RWModelDef = InstRWDef->getValueAsDef("SchedModel");
- SmallSet<unsigned, 4> RemappedClassIDs;
- for (ArrayRef<Record*>::const_iterator
- II = InstDefs.begin(), IE = InstDefs.end(); II != IE; ++II) {
- unsigned OldSCIdx = InstrClassMap[*II];
- if (OldSCIdx && RemappedClassIDs.insert(OldSCIdx).second) {
- for (RecIter RI = SchedClasses[OldSCIdx].InstRWs.begin(),
- RE = SchedClasses[OldSCIdx].InstRWs.end(); RI != RE; ++RI) {
- if ((*RI)->getValueAsDef("SchedModel") == RWModelDef) {
- PrintFatalError(InstRWDef->getLoc(), "Overlapping InstRW def " +
- (*II)->getName() + " also matches " +
- (*RI)->getValue("Instrs")->getValue()->getAsString());
+ // If we had an old class, copy its InstRWs to this new class.
+ if (OldSCIdx) {
+ Record *RWModelDef = InstRWDef->getValueAsDef("SchedModel");
+ for (Record *OldRWDef : SchedClasses[OldSCIdx].InstRWs) {
+ if (OldRWDef->getValueAsDef("SchedModel") == RWModelDef) {
+ for (Record *InstDef : InstDefs) {
+ PrintFatalError(OldRWDef->getLoc(), "Overlapping InstRW def " +
+ InstDef->getName() + " also matches " +
+ OldRWDef->getValue("Instrs")->getValue()->getAsString());
}
- assert(*RI != InstRWDef && "SchedClass has duplicate InstRW def");
- SC.InstRWs.push_back(*RI);
}
+ assert(OldRWDef != InstRWDef &&
+ "SchedClass has duplicate InstRW def");
+ SC.InstRWs.push_back(OldRWDef);
}
- InstrClassMap[*II] = SCIdx;
}
+ // Map each Instr to this new class.
+ for (Record *InstDef : InstDefs)
+ InstrClassMap[InstDef] = SCIdx;
SC.InstRWs.push_back(InstRWDef);
}
}
// True if collectProcItins found anything.
bool CodeGenSchedModels::hasItineraries() const {
- for (const CodeGenProcModel &PM : make_range(procModelBegin(),procModelEnd())) {
+ for (const CodeGenProcModel &PM : make_range(procModelBegin(),procModelEnd()))
if (PM.hasItineraries())
return true;
- }
return false;
}
// Gather the processor itineraries.
void CodeGenSchedModels::collectProcItins() {
- DEBUG(dbgs() << "\n+++ PROBLEM ITINERARIES (collectProcItins) +++\n");
+ LLVM_DEBUG(dbgs() << "\n+++ PROBLEM ITINERARIES (collectProcItins) +++\n");
for (CodeGenProcModel &ProcModel : ProcModels) {
if (!ProcModel.hasItineraries())
continue;
@@ -798,37 +891,39 @@ void CodeGenSchedModels::collectProcItins() {
// Insert each itinerary data record in the correct position within
// the processor model's ItinDefList.
for (Record *ItinData : ItinRecords) {
- Record *ItinDef = ItinData->getValueAsDef("TheClass");
+ const Record *ItinDef = ItinData->getValueAsDef("TheClass");
bool FoundClass = false;
- for (SchedClassIter SCI = schedClassBegin(), SCE = schedClassEnd();
- SCI != SCE; ++SCI) {
+
+ for (const CodeGenSchedClass &SC :
+ make_range(schedClassBegin(), schedClassEnd())) {
// Multiple SchedClasses may share an itinerary. Update all of them.
- if (SCI->ItinClassDef == ItinDef) {
- ProcModel.ItinDefList[SCI->Index] = ItinData;
+ if (SC.ItinClassDef == ItinDef) {
+ ProcModel.ItinDefList[SC.Index] = ItinData;
FoundClass = true;
}
}
if (!FoundClass) {
- DEBUG(dbgs() << ProcModel.ItinsDef->getName()
- << " missing class for itinerary " << ItinDef->getName() << '\n');
+ LLVM_DEBUG(dbgs() << ProcModel.ItinsDef->getName()
+ << " missing class for itinerary "
+ << ItinDef->getName() << '\n');
}
}
// Check for missing itinerary entries.
assert(!ProcModel.ItinDefList[0] && "NoItinerary class can't have rec");
- DEBUG(
- for (unsigned i = 1, N = ProcModel.ItinDefList.size(); i < N; ++i) {
- if (!ProcModel.ItinDefList[i])
- dbgs() << ProcModel.ItinsDef->getName()
- << " missing itinerary for class "
- << SchedClasses[i].Name << '\n';
- });
+ LLVM_DEBUG(
+ for (unsigned i = 1, N = ProcModel.ItinDefList.size(); i < N; ++i) {
+ if (!ProcModel.ItinDefList[i])
+ dbgs() << ProcModel.ItinsDef->getName()
+ << " missing itinerary for class " << SchedClasses[i].Name
+ << '\n';
+ });
}
}
// Gather the read/write types for each itinerary class.
void CodeGenSchedModels::collectProcItinRW() {
RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW");
- std::sort(ItinRWDefs.begin(), ItinRWDefs.end(), LessRecord());
+ llvm::sort(ItinRWDefs.begin(), ItinRWDefs.end(), LessRecord());
for (Record *RWDef : ItinRWDefs) {
if (!RWDef->getValueInit("SchedModel")->isComplete())
PrintFatalError(RWDef->getLoc(), "SchedModel is undefined");
@@ -854,8 +949,9 @@ void CodeGenSchedModels::collectProcUnsupportedFeatures() {
/// Infer new classes from existing classes. In the process, this may create new
/// SchedWrites from sequences of existing SchedWrites.
void CodeGenSchedModels::inferSchedClasses() {
- DEBUG(dbgs() << "\n+++ INFERRING SCHED CLASSES (inferSchedClasses) +++\n");
- DEBUG(dbgs() << NumInstrSchedClasses << " instr sched classes.\n");
+ LLVM_DEBUG(
+ dbgs() << "\n+++ INFERRING SCHED CLASSES (inferSchedClasses) +++\n");
+ LLVM_DEBUG(dbgs() << NumInstrSchedClasses << " instr sched classes.\n");
// Visit all existing classes and newly created classes.
for (unsigned Idx = 0; Idx != SchedClasses.size(); ++Idx) {
@@ -881,20 +977,18 @@ void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef,
const CodeGenProcModel &PM = ProcModels[PIdx];
// For all ItinRW entries.
bool HasMatch = false;
- for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end();
- II != IE; ++II) {
- RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
+ for (const Record *Rec : PM.ItinRWDefs) {
+ RecVec Matched = Rec->getValueAsListOfDefs("MatchedItinClasses");
if (!std::count(Matched.begin(), Matched.end(), ItinClassDef))
continue;
if (HasMatch)
- PrintFatalError((*II)->getLoc(), "Duplicate itinerary class "
+ PrintFatalError(Rec->getLoc(), "Duplicate itinerary class "
+ ItinClassDef->getName()
+ " in ItinResources for " + PM.ModelName);
HasMatch = true;
IdxVec Writes, Reads;
- findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
- IdxVec ProcIndices(1, PIdx);
- inferFromRW(Writes, Reads, FromClassIdx, ProcIndices);
+ findRWs(Rec->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ inferFromRW(Writes, Reads, FromClassIdx, PIdx);
}
}
}
@@ -917,8 +1011,7 @@ void CodeGenSchedModels::inferFromInstRWs(unsigned SCIdx) {
IdxVec Writes, Reads;
findRWs(Rec->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
unsigned PIdx = getProcModel(Rec->getValueAsDef("SchedModel")).Index;
- IdxVec ProcIndices(1, PIdx);
- inferFromRW(Writes, Reads, SCIdx, ProcIndices); // May mutate SchedClasses.
+ inferFromRW(Writes, Reads, SCIdx, PIdx); // May mutate SchedClasses.
}
}
@@ -1001,10 +1094,10 @@ bool PredTransitions::mutuallyExclusive(Record *PredDef,
const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(PC.RWIdx, PC.IsRead);
assert(SchedRW.HasVariants && "PredCheck must refer to a SchedVariant");
RecVec Variants = SchedRW.TheDef->getValueAsListOfDefs("Variants");
- for (RecIter VI = Variants.begin(), VE = Variants.end(); VI != VE; ++VI) {
- if ((*VI)->getValueAsDef("Predicate") == PredDef)
- return true;
- }
+ if (any_of(Variants, [PredDef](const Record *R) {
+ return R->getValueAsDef("Predicate") == PredDef;
+ }))
+ return true;
}
return false;
}
@@ -1022,12 +1115,10 @@ static bool hasAliasedVariants(const CodeGenSchedRW &RW,
if (AliasRW.IsSequence) {
IdxVec ExpandedRWs;
SchedModels.expandRWSequence(AliasRW.Index, ExpandedRWs, AliasRW.IsRead);
- for (IdxIter SI = ExpandedRWs.begin(), SE = ExpandedRWs.end();
- SI != SE; ++SI) {
- if (hasAliasedVariants(SchedModels.getSchedRW(*SI, AliasRW.IsRead),
- SchedModels)) {
+ for (unsigned SI : ExpandedRWs) {
+ if (hasAliasedVariants(SchedModels.getSchedRW(SI, AliasRW.IsRead),
+ SchedModels))
return true;
- }
}
}
}
@@ -1036,27 +1127,16 @@ static bool hasAliasedVariants(const CodeGenSchedRW &RW,
static bool hasVariant(ArrayRef<PredTransition> Transitions,
CodeGenSchedModels &SchedModels) {
- for (ArrayRef<PredTransition>::iterator
- PTI = Transitions.begin(), PTE = Transitions.end();
- PTI != PTE; ++PTI) {
- for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
- WSI = PTI->WriteSequences.begin(), WSE = PTI->WriteSequences.end();
- WSI != WSE; ++WSI) {
- for (SmallVectorImpl<unsigned>::const_iterator
- WI = WSI->begin(), WE = WSI->end(); WI != WE; ++WI) {
- if (hasAliasedVariants(SchedModels.getSchedWrite(*WI), SchedModels))
+ for (const PredTransition &PTI : Transitions) {
+ for (const SmallVectorImpl<unsigned> &WSI : PTI.WriteSequences)
+ for (unsigned WI : WSI)
+ if (hasAliasedVariants(SchedModels.getSchedWrite(WI), SchedModels))
return true;
- }
- }
- for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
- RSI = PTI->ReadSequences.begin(), RSE = PTI->ReadSequences.end();
- RSI != RSE; ++RSI) {
- for (SmallVectorImpl<unsigned>::const_iterator
- RI = RSI->begin(), RE = RSI->end(); RI != RE; ++RI) {
- if (hasAliasedVariants(SchedModels.getSchedRead(*RI), SchedModels))
+
+ for (const SmallVectorImpl<unsigned> &RSI : PTI.ReadSequences)
+ for (unsigned RI : RSI)
+ if (hasAliasedVariants(SchedModels.getSchedRead(RI), SchedModels))
return true;
- }
- }
}
return false;
}
@@ -1080,7 +1160,7 @@ void PredTransitions::getIntersectingVariants(
// Push each variant. Assign TransVecIdx later.
const RecVec VarDefs = SchedRW.TheDef->getValueAsListOfDefs("Variants");
for (Record *VarDef : VarDefs)
- Variants.push_back(TransVariant(VarDef, SchedRW.Index, VarProcIdx, 0));
+ Variants.emplace_back(VarDef, SchedRW.Index, VarProcIdx, 0);
if (VarProcIdx == 0)
GenericRW = true;
}
@@ -1100,12 +1180,10 @@ void PredTransitions::getIntersectingVariants(
if (AliasRW.HasVariants) {
const RecVec VarDefs = AliasRW.TheDef->getValueAsListOfDefs("Variants");
for (Record *VD : VarDefs)
- Variants.push_back(TransVariant(VD, AliasRW.Index, AliasProcIdx, 0));
- }
- if (AliasRW.IsSequence) {
- Variants.push_back(
- TransVariant(AliasRW.TheDef, SchedRW.Index, AliasProcIdx, 0));
+ Variants.emplace_back(VD, AliasRW.Index, AliasProcIdx, 0);
}
+ if (AliasRW.IsSequence)
+ Variants.emplace_back(AliasRW.TheDef, SchedRW.Index, AliasProcIdx, 0);
if (AliasProcIdx == 0)
GenericRW = true;
}
@@ -1164,7 +1242,7 @@ pushVariant(const TransVariant &VInfo, bool IsRead) {
IdxVec SelectedRWs;
if (VInfo.VarOrSeqDef->isSubClassOf("SchedVar")) {
Record *PredDef = VInfo.VarOrSeqDef->getValueAsDef("Predicate");
- Trans.PredTerm.push_back(PredCheck(IsRead, VInfo.RWIdx,PredDef));
+ Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx,PredDef);
RecVec SelectedDefs = VInfo.VarOrSeqDef->getValueAsListOfDefs("Selected");
SchedModels.findRWs(SelectedDefs, SelectedRWs, IsRead);
}
@@ -1181,11 +1259,8 @@ pushVariant(const TransVariant &VInfo, bool IsRead) {
if (SchedRW.IsVariadic) {
unsigned OperIdx = RWSequences.size()-1;
// Make N-1 copies of this transition's last sequence.
- for (unsigned i = 1, e = SelectedRWs.size(); i != e; ++i) {
- // Create a temporary copy the vector could reallocate.
- RWSequences.reserve(RWSequences.size() + 1);
- RWSequences.push_back(RWSequences[OperIdx]);
- }
+ RWSequences.insert(RWSequences.end(), SelectedRWs.size() - 1,
+ RWSequences[OperIdx]);
// Push each of the N elements of the SelectedRWs onto a copy of the last
// sequence (split the current operand into N operands).
// Note that write sequences should be expanded within this loop--the entire
@@ -1267,7 +1342,7 @@ void PredTransitions::substituteVariants(const PredTransition &Trans) {
// Build up a set of partial results starting at the back of
// PredTransitions. Remember the first new transition.
unsigned StartIdx = TransVec.size();
- TransVec.resize(TransVec.size() + 1);
+ TransVec.emplace_back();
TransVec.back().PredTerm = Trans.PredTerm;
TransVec.back().ProcIndices = Trans.ProcIndices;
@@ -1278,7 +1353,7 @@ void PredTransitions::substituteVariants(const PredTransition &Trans) {
// Push a new (empty) write sequence onto all partial Transitions.
for (std::vector<PredTransition>::iterator I =
TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
- I->WriteSequences.resize(I->WriteSequences.size() + 1);
+ I->WriteSequences.emplace_back();
}
substituteVariantOperand(*WSI, /*IsRead=*/false, StartIdx);
}
@@ -1289,7 +1364,7 @@ void PredTransitions::substituteVariants(const PredTransition &Trans) {
// Push a new (empty) read sequence onto all partial Transitions.
for (std::vector<PredTransition>::iterator I =
TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
- I->ReadSequences.resize(I->ReadSequences.size() + 1);
+ I->ReadSequences.emplace_back();
}
substituteVariantOperand(*RSI, /*IsRead=*/true, StartIdx);
}
@@ -1304,37 +1379,30 @@ static void inferFromTransitions(ArrayRef<PredTransition> LastTransitions,
for (ArrayRef<PredTransition>::iterator
I = LastTransitions.begin(), E = LastTransitions.end(); I != E; ++I) {
IdxVec OperWritesVariant;
- for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
- WSI = I->WriteSequences.begin(), WSE = I->WriteSequences.end();
- WSI != WSE; ++WSI) {
- // Create a new write representing the expanded sequence.
- OperWritesVariant.push_back(
- SchedModels.findOrInsertRW(*WSI, /*IsRead=*/false));
- }
+ transform(I->WriteSequences, std::back_inserter(OperWritesVariant),
+ [&SchedModels](ArrayRef<unsigned> WS) {
+ return SchedModels.findOrInsertRW(WS, /*IsRead=*/false);
+ });
IdxVec OperReadsVariant;
- for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
- RSI = I->ReadSequences.begin(), RSE = I->ReadSequences.end();
- RSI != RSE; ++RSI) {
- // Create a new read representing the expanded sequence.
- OperReadsVariant.push_back(
- SchedModels.findOrInsertRW(*RSI, /*IsRead=*/true));
- }
- IdxVec ProcIndices(I->ProcIndices.begin(), I->ProcIndices.end());
+ transform(I->ReadSequences, std::back_inserter(OperReadsVariant),
+ [&SchedModels](ArrayRef<unsigned> RS) {
+ return SchedModels.findOrInsertRW(RS, /*IsRead=*/true);
+ });
CodeGenSchedTransition SCTrans;
SCTrans.ToClassIdx =
SchedModels.addSchedClass(/*ItinClassDef=*/nullptr, OperWritesVariant,
- OperReadsVariant, ProcIndices);
- SCTrans.ProcIndices = ProcIndices;
+ OperReadsVariant, I->ProcIndices);
+ SCTrans.ProcIndices.assign(I->ProcIndices.begin(), I->ProcIndices.end());
// The final PredTerm is unique set of predicates guarding the transition.
RecVec Preds;
- for (SmallVectorImpl<PredCheck>::const_iterator
- PI = I->PredTerm.begin(), PE = I->PredTerm.end(); PI != PE; ++PI) {
- Preds.push_back(PI->Predicate);
- }
- RecIter PredsEnd = std::unique(Preds.begin(), Preds.end());
- Preds.resize(PredsEnd - Preds.begin());
- SCTrans.PredTerm = Preds;
- SchedModels.getSchedClass(FromClassIdx).Transitions.push_back(SCTrans);
+ transform(I->PredTerm, std::back_inserter(Preds),
+ [](const PredCheck &P) {
+ return P.Predicate;
+ });
+ Preds.erase(std::unique(Preds.begin(), Preds.end()), Preds.end());
+ SCTrans.PredTerm = std::move(Preds);
+ SchedModels.getSchedClass(FromClassIdx)
+ .Transitions.push_back(std::move(SCTrans));
}
}
@@ -1345,48 +1413,42 @@ void CodeGenSchedModels::inferFromRW(ArrayRef<unsigned> OperWrites,
ArrayRef<unsigned> OperReads,
unsigned FromClassIdx,
ArrayRef<unsigned> ProcIndices) {
- DEBUG(dbgs() << "INFER RW proc("; dumpIdxVec(ProcIndices); dbgs() << ") ");
+ LLVM_DEBUG(dbgs() << "INFER RW proc("; dumpIdxVec(ProcIndices);
+ dbgs() << ") ");
// Create a seed transition with an empty PredTerm and the expanded sequences
// of SchedWrites for the current SchedClass.
std::vector<PredTransition> LastTransitions;
- LastTransitions.resize(1);
+ LastTransitions.emplace_back();
LastTransitions.back().ProcIndices.append(ProcIndices.begin(),
ProcIndices.end());
for (unsigned WriteIdx : OperWrites) {
IdxVec WriteSeq;
expandRWSequence(WriteIdx, WriteSeq, /*IsRead=*/false);
- unsigned Idx = LastTransitions[0].WriteSequences.size();
- LastTransitions[0].WriteSequences.resize(Idx + 1);
- SmallVectorImpl<unsigned> &Seq = LastTransitions[0].WriteSequences[Idx];
- for (IdxIter WI = WriteSeq.begin(), WE = WriteSeq.end(); WI != WE; ++WI)
- Seq.push_back(*WI);
- DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
- }
- DEBUG(dbgs() << " Reads: ");
+ LastTransitions[0].WriteSequences.emplace_back();
+ SmallVectorImpl<unsigned> &Seq = LastTransitions[0].WriteSequences.back();
+ Seq.append(WriteSeq.begin(), WriteSeq.end());
+ LLVM_DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
+ }
+ LLVM_DEBUG(dbgs() << " Reads: ");
for (unsigned ReadIdx : OperReads) {
IdxVec ReadSeq;
expandRWSequence(ReadIdx, ReadSeq, /*IsRead=*/true);
- unsigned Idx = LastTransitions[0].ReadSequences.size();
- LastTransitions[0].ReadSequences.resize(Idx + 1);
- SmallVectorImpl<unsigned> &Seq = LastTransitions[0].ReadSequences[Idx];
- for (IdxIter RI = ReadSeq.begin(), RE = ReadSeq.end(); RI != RE; ++RI)
- Seq.push_back(*RI);
- DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
+ LastTransitions[0].ReadSequences.emplace_back();
+ SmallVectorImpl<unsigned> &Seq = LastTransitions[0].ReadSequences.back();
+ Seq.append(ReadSeq.begin(), ReadSeq.end());
+ LLVM_DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
}
- DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << '\n');
// Collect all PredTransitions for individual operands.
// Iterate until no variant writes remain.
while (hasVariant(LastTransitions, *this)) {
PredTransitions Transitions(*this);
- for (std::vector<PredTransition>::const_iterator
- I = LastTransitions.begin(), E = LastTransitions.end();
- I != E; ++I) {
- Transitions.substituteVariants(*I);
- }
- DEBUG(Transitions.dump());
+ for (const PredTransition &Trans : LastTransitions)
+ Transitions.substituteVariants(Trans);
+ LLVM_DEBUG(Transitions.dump());
LastTransitions.swap(Transitions.TransVec);
}
// If the first transition has no variants, nothing to do.
@@ -1447,6 +1509,47 @@ void CodeGenSchedModels::verifyProcResourceGroups(CodeGenProcModel &PM) {
}
}
+// Collect all the RegisterFile definitions available in this target.
+void CodeGenSchedModels::collectRegisterFiles() {
+ RecVec RegisterFileDefs = Records.getAllDerivedDefinitions("RegisterFile");
+
+ // RegisterFiles is the vector of CodeGenRegisterFile.
+ for (Record *RF : RegisterFileDefs) {
+ // For each register file definition, construct a CodeGenRegisterFile object
+ // and add it to the appropriate scheduling model.
+ CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel"));
+ PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF));
+ CodeGenRegisterFile &CGRF = PM.RegisterFiles.back();
+
+ // Now set the number of physical registers as well as the cost of registers
+ // in each register class.
+ CGRF.NumPhysRegs = RF->getValueAsInt("NumPhysRegs");
+ RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses");
+ std::vector<int64_t> RegisterCosts = RF->getValueAsListOfInts("RegCosts");
+ for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) {
+ int Cost = RegisterCosts.size() > I ? RegisterCosts[I] : 1;
+ CGRF.Costs.emplace_back(RegisterClasses[I], Cost);
+ }
+ }
+}
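A small standalone illustration (editor's sketch, with made-up register class names) of the pairing rule in the loop above: the I-th register class takes the I-th entry of RegCosts, and classes beyond the end of the cost list fall back to a cost of 1.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> RegClasses = {"GPR32", "GPR64", "VecReg"};
  std::vector<int64_t> RegCosts = {1, 2}; // deliberately shorter than RegClasses
  for (unsigned I = 0, E = RegClasses.size(); I < E; ++I) {
    int64_t Cost = RegCosts.size() > I ? RegCosts[I] : 1; // default cost is 1
    std::cout << RegClasses[I] << " -> " << Cost << "\n";
  }
  // Prints: GPR32 -> 1, GPR64 -> 2, VecReg -> 1
}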
+
+// Collect all the PfmCounter definitions available in this target.
+void CodeGenSchedModels::collectPfmCounters() {
+ for (Record *Def : Records.getAllDerivedDefinitions("PfmIssueCounter")) {
+ CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
+ PM.PfmIssueCounterDefs.emplace_back(Def);
+ }
+ for (Record *Def : Records.getAllDerivedDefinitions("PfmCycleCounter")) {
+ CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
+ if (PM.PfmCycleCounterDef) {
+ PrintFatalError(Def->getLoc(),
+ "multiple cycle counters for " +
+ Def->getValueAsDef("SchedModel")->getName());
+ }
+ PM.PfmCycleCounterDef = Def;
+ }
+}
+
// Collect and sort WriteRes, ReadAdvance, and ProcResources.
void CodeGenSchedModels::collectProcResources() {
ProcResourceDefs = Records.getAllDerivedDefinitions("ProcResourceUnits");
@@ -1455,26 +1558,24 @@ void CodeGenSchedModels::collectProcResources() {
// Add any subtarget-specific SchedReadWrites that are directly associated
// with processor resources. Refer to the parent SchedClass's ProcIndices to
// determine which processors they apply to.
- for (SchedClassIter SCI = schedClassBegin(), SCE = schedClassEnd();
- SCI != SCE; ++SCI) {
- if (SCI->ItinClassDef)
- collectItinProcResources(SCI->ItinClassDef);
- else {
- // This class may have a default ReadWrite list which can be overriden by
- // InstRW definitions.
- if (!SCI->InstRWs.empty()) {
- for (RecIter RWI = SCI->InstRWs.begin(), RWE = SCI->InstRWs.end();
- RWI != RWE; ++RWI) {
- Record *RWModelDef = (*RWI)->getValueAsDef("SchedModel");
- IdxVec ProcIndices(1, getProcModel(RWModelDef).Index);
- IdxVec Writes, Reads;
- findRWs((*RWI)->getValueAsListOfDefs("OperandReadWrites"),
- Writes, Reads);
- collectRWResources(Writes, Reads, ProcIndices);
- }
- }
- collectRWResources(SCI->Writes, SCI->Reads, SCI->ProcIndices);
+ for (const CodeGenSchedClass &SC :
+ make_range(schedClassBegin(), schedClassEnd())) {
+ if (SC.ItinClassDef) {
+ collectItinProcResources(SC.ItinClassDef);
+ continue;
+ }
+
+ // This class may have a default ReadWrite list which can be overridden by
+ // InstRW definitions.
+ for (Record *RW : SC.InstRWs) {
+ Record *RWModelDef = RW->getValueAsDef("SchedModel");
+ unsigned PIdx = getProcModel(RWModelDef).Index;
+ IdxVec Writes, Reads;
+ findRWs(RW->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ collectRWResources(Writes, Reads, PIdx);
}
+
+ collectRWResources(SC.Writes, SC.Reads, SC.ProcIndices);
}
// Add resources separately defined by each subtarget.
RecVec WRDefs = Records.getAllDerivedDefinitions("WriteRes");
@@ -1509,38 +1610,45 @@ void CodeGenSchedModels::collectProcResources() {
if (!is_contained(PM.ProcResourceDefs, PRG))
PM.ProcResourceDefs.push_back(PRG);
}
+ // Add ProcResourceUnits unconditionally.
+ for (Record *PRU : Records.getAllDerivedDefinitions("ProcResourceUnits")) {
+ if (!PRU->getValueInit("SchedModel")->isComplete())
+ continue;
+ CodeGenProcModel &PM = getProcModel(PRU->getValueAsDef("SchedModel"));
+ if (!is_contained(PM.ProcResourceDefs, PRU))
+ PM.ProcResourceDefs.push_back(PRU);
+ }
// Finalize each ProcModel by sorting the record arrays.
for (CodeGenProcModel &PM : ProcModels) {
- std::sort(PM.WriteResDefs.begin(), PM.WriteResDefs.end(),
- LessRecord());
- std::sort(PM.ReadAdvanceDefs.begin(), PM.ReadAdvanceDefs.end(),
- LessRecord());
- std::sort(PM.ProcResourceDefs.begin(), PM.ProcResourceDefs.end(),
- LessRecord());
- DEBUG(
- PM.dump();
- dbgs() << "WriteResDefs: ";
- for (RecIter RI = PM.WriteResDefs.begin(),
- RE = PM.WriteResDefs.end(); RI != RE; ++RI) {
- if ((*RI)->isSubClassOf("WriteRes"))
- dbgs() << (*RI)->getValueAsDef("WriteType")->getName() << " ";
- else
- dbgs() << (*RI)->getName() << " ";
- }
- dbgs() << "\nReadAdvanceDefs: ";
- for (RecIter RI = PM.ReadAdvanceDefs.begin(),
- RE = PM.ReadAdvanceDefs.end(); RI != RE; ++RI) {
- if ((*RI)->isSubClassOf("ReadAdvance"))
- dbgs() << (*RI)->getValueAsDef("ReadType")->getName() << " ";
- else
- dbgs() << (*RI)->getName() << " ";
- }
- dbgs() << "\nProcResourceDefs: ";
- for (RecIter RI = PM.ProcResourceDefs.begin(),
- RE = PM.ProcResourceDefs.end(); RI != RE; ++RI) {
- dbgs() << (*RI)->getName() << " ";
- }
- dbgs() << '\n');
+ llvm::sort(PM.WriteResDefs.begin(), PM.WriteResDefs.end(),
+ LessRecord());
+ llvm::sort(PM.ReadAdvanceDefs.begin(), PM.ReadAdvanceDefs.end(),
+ LessRecord());
+ llvm::sort(PM.ProcResourceDefs.begin(), PM.ProcResourceDefs.end(),
+ LessRecord());
+ LLVM_DEBUG(
+ PM.dump();
+ dbgs() << "WriteResDefs: "; for (RecIter RI = PM.WriteResDefs.begin(),
+ RE = PM.WriteResDefs.end();
+ RI != RE; ++RI) {
+ if ((*RI)->isSubClassOf("WriteRes"))
+ dbgs() << (*RI)->getValueAsDef("WriteType")->getName() << " ";
+ else
+ dbgs() << (*RI)->getName() << " ";
+ } dbgs() << "\nReadAdvanceDefs: ";
+ for (RecIter RI = PM.ReadAdvanceDefs.begin(),
+ RE = PM.ReadAdvanceDefs.end();
+ RI != RE; ++RI) {
+ if ((*RI)->isSubClassOf("ReadAdvance"))
+ dbgs() << (*RI)->getValueAsDef("ReadType")->getName() << " ";
+ else
+ dbgs() << (*RI)->getName() << " ";
+ } dbgs()
+ << "\nProcResourceDefs: ";
+ for (RecIter RI = PM.ProcResourceDefs.begin(),
+ RE = PM.ProcResourceDefs.end();
+ RI != RE; ++RI) { dbgs() << (*RI)->getName() << " "; } dbgs()
+ << '\n');
verifyProcResourceGroups(PM);
}
@@ -1552,6 +1660,7 @@ void CodeGenSchedModels::checkCompleteness() {
bool Complete = true;
bool HadCompleteModel = false;
for (const CodeGenProcModel &ProcModel : procModels()) {
+ const bool HasItineraries = ProcModel.hasItineraries();
if (!ProcModel.ModelDef->getValueAsBit("CompleteModel"))
continue;
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
@@ -1572,7 +1681,7 @@ void CodeGenSchedModels::checkCompleteness() {
const CodeGenSchedClass &SC = getSchedClass(SCIdx);
if (!SC.Writes.empty())
continue;
- if (SC.ItinClassDef != nullptr &&
+ if (HasItineraries && SC.ItinClassDef != nullptr &&
SC.ItinClassDef->getName() != "NoItinerary")
continue;
@@ -1619,8 +1728,7 @@ void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) {
HasMatch = true;
IdxVec Writes, Reads;
findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
- IdxVec ProcIndices(1, PIdx);
- collectRWResources(Writes, Reads, ProcIndices);
+ collectRWResources(Writes, Reads, PIdx);
}
}
}
diff --git a/contrib/llvm/utils/TableGen/CodeGenSchedule.h b/contrib/llvm/utils/TableGen/CodeGenSchedule.h
index 46e22cd12810..07c11596adee 100644
--- a/contrib/llvm/utils/TableGen/CodeGenSchedule.h
+++ b/contrib/llvm/utils/TableGen/CodeGenSchedule.h
@@ -26,6 +26,7 @@ namespace llvm {
class CodeGenTarget;
class CodeGenSchedModels;
class CodeGenInstruction;
+class CodeGenRegisterClass;
using RecVec = std::vector<Record*>;
using RecIter = std::vector<Record*>::const_iterator;
@@ -33,9 +34,6 @@ using RecIter = std::vector<Record*>::const_iterator;
using IdxVec = std::vector<unsigned>;
using IdxIter = std::vector<unsigned>::const_iterator;
-void splitSchedReadWrites(const RecVec &RWDefs,
- RecVec &WriteDefs, RecVec &ReadDefs);
-
/// We have two kinds of SchedReadWrites. Explicitly defined and inferred
/// sequences. TheDef is nonnull for explicit SchedWrites, but Sequence may or
/// may not be empty. TheDef is null for inferred sequences, and Sequence must
@@ -142,9 +140,11 @@ struct CodeGenSchedClass {
// off to join another inferred class.
RecVec InstRWs;
- CodeGenSchedClass(): Index(0), ItinClassDef(nullptr) {}
+ CodeGenSchedClass(unsigned Index, std::string Name, Record *ItinClassDef)
+ : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {}
- bool isKeyEqual(Record *IC, ArrayRef<unsigned> W, ArrayRef<unsigned> R) {
+ bool isKeyEqual(Record *IC, ArrayRef<unsigned> W,
+ ArrayRef<unsigned> R) const {
return ItinClassDef == IC && makeArrayRef(Writes) == W &&
makeArrayRef(Reads) == R;
}
@@ -158,6 +158,38 @@ struct CodeGenSchedClass {
#endif
};
+/// Represent the cost of allocating a register of register class RCDef.
+///
+/// The cost of allocating a register is equivalent to the number of physical
+/// registers used by the register renamer. Register costs are defined at
+/// register class granularity.
+struct CodeGenRegisterCost {
+ Record *RCDef;
+ unsigned Cost;
+ CodeGenRegisterCost(Record *RC, unsigned RegisterCost)
+ : RCDef(RC), Cost(RegisterCost) {}
+ CodeGenRegisterCost(const CodeGenRegisterCost &) = default;
+ CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete;
+};
+
+/// A processor register file.
+///
+/// This class describes a processor register file. Register file information is
+/// currently consumed by external tools like llvm-mca to predict dispatch
+/// stalls due to register pressure.
+struct CodeGenRegisterFile {
+ std::string Name;
+ Record *RegisterFileDef;
+
+ unsigned NumPhysRegs;
+ std::vector<CodeGenRegisterCost> Costs;
+
+ CodeGenRegisterFile(StringRef name, Record *def)
+ : Name(name), RegisterFileDef(def), NumPhysRegs(0) {}
+
+ bool hasDefaultCosts() const { return Costs.empty(); }
+};
+
// Processor model.
//
// ModelName is a unique name used to name an instantiation of MCSchedModel.
@@ -199,11 +231,21 @@ struct CodeGenProcModel {
// Per-operand machine model resources associated with this processor.
RecVec ProcResourceDefs;
- RecVec ProcResGroupDefs;
- CodeGenProcModel(unsigned Idx, const std::string &Name, Record *MDef,
+ // List of Register Files.
+ std::vector<CodeGenRegisterFile> RegisterFiles;
+
+ // Optional Retire Control Unit definition.
+ Record *RetireControlUnit;
+
+ // List of PfmCounters.
+ RecVec PfmIssueCounterDefs;
+ Record *PfmCycleCounterDef = nullptr;
+
+ CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
Record *IDef) :
- Index(Idx), ModelName(Name), ModelDef(MDef), ItinsDef(IDef) {}
+ Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
+ RetireControlUnit(nullptr) {}
bool hasItineraries() const {
return !ItinsDef->getValueAsListOfDefs("IID").empty();
@@ -213,6 +255,12 @@ struct CodeGenProcModel {
return !WriteResDefs.empty() || !ItinRWDefs.empty();
}
+ bool hasExtraProcessorInfo() const {
+ return RetireControlUnit || !RegisterFiles.empty() ||
+ !PfmIssueCounterDefs.empty() ||
+ PfmCycleCounterDef != nullptr;
+ }
+
unsigned getProcResourceIdx(Record *PRDef) const;
bool isUnsupported(const CodeGenInstruction &Inst) const;
@@ -336,11 +384,11 @@ public:
return const_cast<CodeGenSchedRW&>(
IsRead ? getSchedRead(Idx) : getSchedWrite(Idx));
}
- const CodeGenSchedRW &getSchedRW(Record*Def) const {
+ const CodeGenSchedRW &getSchedRW(Record *Def) const {
return const_cast<CodeGenSchedModels&>(*this).getSchedRW(Def);
}
- unsigned getSchedRWIdx(Record *Def, bool IsRead, unsigned After = 0) const;
+ unsigned getSchedRWIdx(const Record *Def, bool IsRead) const;
// Return true if the given write record is referenced by a ReadAdvance.
bool hasReadOfWrite(Record *WriteDef) const;
@@ -379,9 +427,6 @@ public:
unsigned findOrInsertRW(ArrayRef<unsigned> Seq, bool IsRead);
- unsigned findSchedClassIdx(Record *ItinClassDef, ArrayRef<unsigned> Writes,
- ArrayRef<unsigned> Reads) const;
-
Record *findProcResUnits(Record *ProcResKind, const CodeGenProcModel &PM,
ArrayRef<SMLoc> Loc) const;
@@ -398,6 +443,14 @@ private:
void collectSchedClasses();
+ void collectRetireControlUnits();
+
+ void collectRegisterFiles();
+
+ void collectPfmCounters();
+
+ void collectOptionalProcessorInfo();
+
std::string createSchedClassName(Record *ItinClassDef,
ArrayRef<unsigned> OperWrites,
ArrayRef<unsigned> OperReads);
diff --git a/contrib/llvm/utils/TableGen/CodeGenTarget.cpp b/contrib/llvm/utils/TableGen/CodeGenTarget.cpp
index 168bd690831f..cb73ca83c9bb 100644
--- a/contrib/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/contrib/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -174,6 +174,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
case MVT::iPTR: return "MVT::iPTR";
case MVT::iPTRAny: return "MVT::iPTRAny";
case MVT::Untyped: return "MVT::Untyped";
+ case MVT::ExceptRef: return "MVT::ExceptRef";
default: llvm_unreachable("ILLEGAL VALUE TYPE!");
}
}
@@ -224,6 +225,9 @@ Record *CodeGenTarget::getInstructionSet() const {
return TargetRec->getValueAsDef("InstructionSet");
}
+bool CodeGenTarget::getAllowRegisterRenaming() const {
+ return TargetRec->getValueAsInt("AllowRegisterRenaming");
+}
/// getAsmParser - Return the AssemblyParser definition for this target.
///
@@ -274,7 +278,7 @@ CodeGenRegBank &CodeGenTarget::getRegBank() const {
void CodeGenTarget::ReadRegAltNameIndices() const {
RegAltNameIndices = Records.getAllDerivedDefinitions("RegAltNameIndex");
- std::sort(RegAltNameIndices.begin(), RegAltNameIndices.end(), LessRecord());
+ llvm::sort(RegAltNameIndices.begin(), RegAltNameIndices.end(), LessRecord());
}
/// getRegisterByName - If there is a register with the specific AsmName,
@@ -299,7 +303,7 @@ std::vector<ValueTypeByHwMode> CodeGenTarget::getRegisterVTs(Record *R)
}
// Remove duplicates.
- std::sort(Result.begin(), Result.end());
+ llvm::sort(Result.begin(), Result.end());
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
return Result;
}
@@ -310,7 +314,7 @@ void CodeGenTarget::ReadLegalValueTypes() const {
LegalValueTypes.insert(LegalValueTypes.end(), RC.VTs.begin(), RC.VTs.end());
// Remove duplicates.
- std::sort(LegalValueTypes.begin(), LegalValueTypes.end());
+ llvm::sort(LegalValueTypes.begin(), LegalValueTypes.end());
LegalValueTypes.erase(std::unique(LegalValueTypes.begin(),
LegalValueTypes.end()),
LegalValueTypes.end());
@@ -345,13 +349,18 @@ GetInstByName(const char *Name,
return I->second.get();
}
-/// \brief Return all of the instructions defined by the target, ordered by
+static const char *const FixedInstrs[] = {
+#define HANDLE_TARGET_OPCODE(OPC) #OPC,
+#include "llvm/Support/TargetOpcodes.def"
+ nullptr};
+
+unsigned CodeGenTarget::getNumFixedInstructions() {
+ return array_lengthof(FixedInstrs) - 1;
+}
+
+/// Return all of the instructions defined by the target, ordered by
/// their enum value.
void CodeGenTarget::ComputeInstrsByEnum() const {
- static const char *const FixedInstrs[] = {
-#define HANDLE_TARGET_OPCODE(OPC) #OPC,
-#include "llvm/CodeGen/TargetOpcodes.def"
- nullptr};
const auto &Insts = getInstructions();
for (const char *const *p = FixedInstrs; *p; ++p) {
const CodeGenInstruction *Instr = GetInstByName(*p, Insts, Records);
@@ -360,21 +369,29 @@ void CodeGenTarget::ComputeInstrsByEnum() const {
InstrsByEnum.push_back(Instr);
}
unsigned EndOfPredefines = InstrsByEnum.size();
+ assert(EndOfPredefines == getNumFixedInstructions() &&
+ "Missing generic opcode");
for (const auto &I : Insts) {
const CodeGenInstruction *CGI = I.second.get();
- if (CGI->Namespace != "TargetOpcode")
+ if (CGI->Namespace != "TargetOpcode") {
InstrsByEnum.push_back(CGI);
+ if (CGI->TheDef->getValueAsBit("isPseudo"))
+ ++NumPseudoInstructions;
+ }
}
assert(InstrsByEnum.size() == Insts.size() && "Missing predefined instr");
// All of the instructions are now in random order based on the map iteration.
- // Sort them by name.
- std::sort(InstrsByEnum.begin() + EndOfPredefines, InstrsByEnum.end(),
- [](const CodeGenInstruction *Rec1, const CodeGenInstruction *Rec2) {
- return Rec1->TheDef->getName() < Rec2->TheDef->getName();
- });
+ llvm::sort(
+ InstrsByEnum.begin() + EndOfPredefines, InstrsByEnum.end(),
+ [](const CodeGenInstruction *Rec1, const CodeGenInstruction *Rec2) {
+ const auto &D1 = *Rec1->TheDef;
+ const auto &D2 = *Rec2->TheDef;
+ return std::make_tuple(!D1.getValueAsBit("isPseudo"), D1.getName()) <
+ std::make_tuple(!D2.getValueAsBit("isPseudo"), D2.getName());
+ });
}
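The tuple comparison above is what produces the enum layout documented in CodeGenTarget.h (generic opcodes first, then pseudos sorted by name, then the remaining instructions sorted by name). A standalone sketch of just the sort key, using invented instruction names (editorial illustration, not part of the patch):

#include <algorithm>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

struct Inst {
  std::string Name;
  bool IsPseudo;
};

int main() {
  std::vector<Inst> Insts = {{"ADD32rr", false},
                             {"COPY_TO_SUBREG", true},
                             {"ABS_PSEUDO", true},
                             {"AND32rr", false}};
  // !IsPseudo is false (0) for pseudos, so they sort before non-pseudos;
  // ties are broken by name.
  std::sort(Insts.begin(), Insts.end(), [](const Inst &A, const Inst &B) {
    return std::make_tuple(!A.IsPseudo, A.Name) <
           std::make_tuple(!B.IsPseudo, B.Name);
  });
  for (const Inst &I : Insts)
    std::cout << I.Name << (I.IsPseudo ? " (pseudo)\n" : "\n");
  // Prints ABS_PSEUDO and COPY_TO_SUBREG first, then ADD32rr and AND32rr,
  // which is the ordering the instregex binary search relies on.
}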
@@ -496,11 +513,11 @@ CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC,
if (isTarget == TargetOnly)
Intrinsics.push_back(CodeGenIntrinsic(Defs[I]));
}
- std::sort(Intrinsics.begin(), Intrinsics.end(),
- [](const CodeGenIntrinsic &LHS, const CodeGenIntrinsic &RHS) {
- return std::tie(LHS.TargetPrefix, LHS.Name) <
- std::tie(RHS.TargetPrefix, RHS.Name);
- });
+ llvm::sort(Intrinsics.begin(), Intrinsics.end(),
+ [](const CodeGenIntrinsic &LHS, const CodeGenIntrinsic &RHS) {
+ return std::tie(LHS.TargetPrefix, LHS.Name) <
+ std::tie(RHS.TargetPrefix, RHS.Name);
+ });
Targets.push_back({"", 0, 0});
for (size_t I = 0, E = Intrinsics.size(); I < E; ++I)
if (Intrinsics[I].TargetPrefix != Targets.back().Name) {
@@ -604,8 +621,12 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
MVT::SimpleValueType VT;
if (TyEl->isSubClassOf("LLVMMatchType")) {
unsigned MatchTy = TyEl->getValueAsInt("Number");
- assert(MatchTy < OverloadedVTs.size() &&
- "Invalid matching number!");
+ if (MatchTy >= OverloadedVTs.size()) {
+ PrintError(R->getLoc(),
+ "Parameter #" + Twine(i) + " has out of bounds matching "
+ "number " + Twine(MatchTy));
+ PrintFatalError(Twine("ParamTypes is ") + TypeList->getAsString());
+ }
VT = OverloadedVTs[MatchTy];
// It only makes sense to use the extended and truncated vector element
// variants with iAny types; otherwise, if the intrinsic is not
@@ -688,6 +709,6 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
Properties = parseSDPatternOperatorProperties(R);
// Sort the argument attributes for later benefit.
- std::sort(ArgumentAttributes.begin(), ArgumentAttributes.end());
+ llvm::sort(ArgumentAttributes.begin(), ArgumentAttributes.end());
}
diff --git a/contrib/llvm/utils/TableGen/CodeGenTarget.h b/contrib/llvm/utils/TableGen/CodeGenTarget.h
index 7280d707fba6..d2833d5b6a92 100644
--- a/contrib/llvm/utils/TableGen/CodeGenTarget.h
+++ b/contrib/llvm/utils/TableGen/CodeGenTarget.h
@@ -62,6 +62,7 @@ class CodeGenTarget {
mutable std::unique_ptr<CodeGenSchedModels> SchedModels;
mutable std::vector<const CodeGenInstruction*> InstrsByEnum;
+ mutable unsigned NumPseudoInstructions = 0;
public:
CodeGenTarget(RecordKeeper &Records);
~CodeGenTarget();
@@ -77,6 +78,11 @@ public:
///
Record *getInstructionSet() const;
+ /// getAllowRegisterRenaming - Return the AllowRegisterRenaming flag value for
+ /// this target.
+ ///
+ bool getAllowRegisterRenaming() const;
+
/// getAsmParser - Return the AssemblyParser definition for this target.
///
Record *getAsmParser() const;
@@ -140,11 +146,25 @@ public:
return *I->second;
}
- /// getInstructionsByEnumValue - Return all of the instructions defined by the
- /// target, ordered by their enum value.
- ArrayRef<const CodeGenInstruction *>
- getInstructionsByEnumValue() const {
- if (InstrsByEnum.empty()) ComputeInstrsByEnum();
+ /// Returns the number of predefined instructions.
+ static unsigned getNumFixedInstructions();
+
+ /// Returns the number of pseudo instructions.
+ unsigned getNumPseudoInstructions() const {
+ if (InstrsByEnum.empty())
+ ComputeInstrsByEnum();
+ return NumPseudoInstructions;
+ }
+
+ /// Return all of the instructions defined by the target, ordered by their
+ /// enum value.
+ /// The following order of instructions is also guaranteed:
+ /// - fixed / generic instructions as declared in TargetOpcodes.def, in order;
+ /// - pseudo instructions in lexicographical order sorted by name;
+ /// - other instructions in lexicographical order sorted by name.
+ ArrayRef<const CodeGenInstruction *> getInstructionsByEnumValue() const {
+ if (InstrsByEnum.empty())
+ ComputeInstrsByEnum();
return InstrsByEnum;
}
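
A hypothetical consumer of the ordering guarantee documented above could derive enum ranges directly from the two counters. The helper below is purely illustrative; it only mirrors the accessor names and uses made-up counts:

// Sketch: with fixed/generic opcodes first, pseudos second, and the rest last,
// the non-pseudo target instructions form a contiguous tail of the enum.
#include <cstdio>

struct EnumLayout {
  unsigned NumFixed;  // would come from getNumFixedInstructions()
  unsigned NumPseudo; // would come from getNumPseudoInstructions()
  unsigned NumTotal;  // would be getInstructionsByEnumValue().size()
};

static void targetInstrRange(const EnumLayout &L, unsigned &Begin, unsigned &End) {
  Begin = L.NumFixed + L.NumPseudo; // first non-pseudo target instruction
  End = L.NumTotal;                 // one past the last instruction
}

int main() {
  EnumLayout L{/*NumFixed=*/24, /*NumPseudo=*/3, /*NumTotal=*/1500}; // made-up counts
  unsigned Begin, End;
  targetInstrRange(L, Begin, End);
  std::printf("non-pseudo target instructions occupy enum values [%u, %u)\n", Begin, End);
}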
diff --git a/contrib/llvm/utils/TableGen/DAGISelEmitter.cpp b/contrib/llvm/utils/TableGen/DAGISelEmitter.cpp
index 9592ab7052f4..62a0ff700725 100644
--- a/contrib/llvm/utils/TableGen/DAGISelEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/DAGISelEmitter.cpp
@@ -110,9 +110,11 @@ struct PatternSortingPredicate {
if (LHSPatSize < RHSPatSize) return true;
if (LHSPatSize > RHSPatSize) return false;
- // Sort based on the UID of the pattern, giving us a deterministic ordering
- // if all other sorting conditions fail.
- assert(LHS == RHS || LHS->ID != RHS->ID);
+ // Sort based on the UID of the pattern, to reflect source order.
+ // Note that this is not guaranteed to be unique, since a single source
+ // pattern may have been resolved into multiple match patterns due to
+ // alternative fragments. To ensure deterministic output, always use
+ // std::stable_sort with this predicate.
return LHS->ID < RHS->ID;
}
};
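
The comment above notes that pattern IDs can repeat once a source pattern has been expanded into several match patterns, which is why the caller now uses std::stable_sort. A standalone illustration of the difference (the Pattern struct and data are stand-ins, not the real PatternToMatch):

// With stable_sort, entries that compare equal (same ID here) keep their
// original relative order, so the emitted output does not depend on the
// sort implementation.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct Pattern {
  unsigned ID;      // source-order UID; may repeat after alternative expansion
  std::string Note; // stands in for the rest of the pattern state
};

int main() {
  std::vector<Pattern> Patterns = {
      {2, "variant A of pattern 2"}, {1, "pattern 1"}, {2, "variant B of pattern 2"}};

  std::stable_sort(Patterns.begin(), Patterns.end(),
                   [](const Pattern &L, const Pattern &R) { return L.ID < R.ID; });

  // Guaranteed output: pattern 1, then variant A, then variant B -- the two
  // equal-ID entries keep the order in which they were generated.
  for (const Pattern &P : Patterns)
    std::cout << P.ID << ": " << P.Note << "\n";
}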

@@ -137,13 +139,16 @@ void DAGISelEmitter::run(raw_ostream &OS) {
"// When neither of the GET_DAGISEL* macros is defined, the functions\n"
"// are emitted inline.\n\n";
- DEBUG(errs() << "\n\nALL PATTERNS TO MATCH:\n\n";
- for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(),
- E = CGP.ptm_end(); I != E; ++I) {
- errs() << "PATTERN: "; I->getSrcPattern()->dump();
- errs() << "\nRESULT: "; I->getDstPattern()->dump();
- errs() << "\n";
- });
+ LLVM_DEBUG(errs() << "\n\nALL PATTERNS TO MATCH:\n\n";
+ for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(),
+ E = CGP.ptm_end();
+ I != E; ++I) {
+ errs() << "PATTERN: ";
+ I->getSrcPattern()->dump();
+ errs() << "\nRESULT: ";
+ I->getDstPattern()->dump();
+ errs() << "\n";
+ });
// Add all the patterns to a temporary list so we can sort them.
std::vector<const PatternToMatch*> Patterns;
@@ -153,7 +158,8 @@ void DAGISelEmitter::run(raw_ostream &OS) {
// We want to process the matches in order of minimal cost. Sort the patterns
// so the least cost one is at the start.
- std::sort(Patterns.begin(), Patterns.end(), PatternSortingPredicate(CGP));
+ std::stable_sort(Patterns.begin(), Patterns.end(),
+ PatternSortingPredicate(CGP));
// Convert each variant of each pattern into a Matcher.
diff --git a/contrib/llvm/utils/TableGen/DAGISelMatcher.h b/contrib/llvm/utils/TableGen/DAGISelMatcher.h
index c672b0acac9f..ecc1f1dd094a 100644
--- a/contrib/llvm/utils/TableGen/DAGISelMatcher.h
+++ b/contrib/llvm/utils/TableGen/DAGISelMatcher.h
@@ -13,8 +13,8 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
namespace llvm {
struct CodeGenRegister;
diff --git a/contrib/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/contrib/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index a19b9e4b95c7..ce23651b9682 100644
--- a/contrib/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/contrib/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -58,7 +58,7 @@ namespace {
/// PatWithNoTypes - This is a clone of Pattern.getSrcPattern() that starts
/// out with all of the types removed. This allows us to insert type checks
/// as we scan the tree.
- TreePatternNode *PatWithNoTypes;
+ TreePatternNodePtr PatWithNoTypes;
/// VariableMap - A map from variable names ('$dst') to the recorded operand
/// number that they were captured as. These are biased by 1 to make
@@ -101,10 +101,6 @@ namespace {
public:
MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp);
- ~MatcherGen() {
- delete PatWithNoTypes;
- }
-
bool EmitMatcherCode(unsigned Variant);
void EmitResultCode();
@@ -134,10 +130,6 @@ namespace {
return VarMapEntry-1;
}
- /// GetInstPatternNode - Get the pattern for an instruction.
- const TreePatternNode *GetInstPatternNode(const DAGInstruction &Ins,
- const TreePatternNode *N);
-
void EmitResultOperand(const TreePatternNode *N,
SmallVectorImpl<unsigned> &ResultOps);
void EmitResultOfNamedOperand(const TreePatternNode *N,
@@ -521,7 +513,8 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) {
}
// Emit the matcher for the pattern structure and types.
- EmitMatchCode(Pattern.getSrcPattern(), PatWithNoTypes, Pattern.ForceMode);
+ EmitMatchCode(Pattern.getSrcPattern(), PatWithNoTypes.get(),
+ Pattern.ForceMode);
// If the pattern has a predicate on it (e.g. only enabled when a subtarget
// feature is around, do the check).
@@ -533,7 +526,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) {
// because they are generally more expensive to evaluate and more difficult to
// factor.
for (unsigned i = 0, e = MatchedComplexPatterns.size(); i != e; ++i) {
- const TreePatternNode *N = MatchedComplexPatterns[i].first;
+ auto N = MatchedComplexPatterns[i].first;
// Remember where the results of this match get stuck.
if (N->isLeaf()) {
@@ -664,28 +657,6 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N,
N->dump();
}
-/// GetInstPatternNode - Get the pattern for an instruction.
-///
-const TreePatternNode *MatcherGen::
-GetInstPatternNode(const DAGInstruction &Inst, const TreePatternNode *N) {
- const TreePattern *InstPat = Inst.getPattern();
-
- // FIXME2?: Assume actual pattern comes before "implicit".
- TreePatternNode *InstPatNode;
- if (InstPat)
- InstPatNode = InstPat->getTree(0);
- else if (/*isRoot*/ N == Pattern.getDstPattern())
- InstPatNode = Pattern.getSrcPattern();
- else
- return nullptr;
-
- if (InstPatNode && !InstPatNode->isLeaf() &&
- InstPatNode->getOperator()->getName() == "set")
- InstPatNode = InstPatNode->getChild(InstPatNode->getNumChildren()-1);
-
- return InstPatNode;
-}
-
static bool
mayInstNodeLoadOrStore(const TreePatternNode *N,
const CodeGenDAGPatterns &CGP) {
@@ -723,25 +694,6 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
CodeGenInstruction &II = CGT.getInstruction(Op);
const DAGInstruction &Inst = CGP.getInstruction(Op);
- // If we can, get the pattern for the instruction we're generating. We derive
- // a variety of information from this pattern, such as whether it has a chain.
- //
- // FIXME2: This is extremely dubious for several reasons, not the least of
- // which it gives special status to instructions with patterns that Pat<>
- // nodes can't duplicate.
- const TreePatternNode *InstPatNode = GetInstPatternNode(Inst, N);
-
- // NodeHasChain - Whether the instruction node we're creating takes chains.
- bool NodeHasChain = InstPatNode &&
- InstPatNode->TreeHasProperty(SDNPHasChain, CGP);
-
- // Instructions which load and store from memory should have a chain,
- // regardless of whether they happen to have an internal pattern saying so.
- if (Pattern.getSrcPattern()->TreeHasProperty(SDNPHasChain, CGP)
- && (II.hasCtrlDep || II.mayLoad || II.mayStore || II.canFoldAsLoad ||
- II.hasSideEffects))
- NodeHasChain = true;
-
bool isRoot = N == Pattern.getDstPattern();
// TreeHasOutGlue - True if this tree has glue.
@@ -784,7 +736,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
const DAGDefaultOperand &DefaultOp
= CGP.getDefaultOperand(OperandNode);
for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i)
- EmitResultOperand(DefaultOp.DefaultOps[i], InstOps);
+ EmitResultOperand(DefaultOp.DefaultOps[i].get(), InstOps);
continue;
}
@@ -895,6 +847,26 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
NumNodesThatLoadOrStore != 1));
}
+ // Determine whether we need to attach a chain to this node.
+ bool NodeHasChain = false;
+ if (Pattern.getSrcPattern()->TreeHasProperty(SDNPHasChain, CGP)) {
+ // For some instructions, we were able to infer from the pattern whether
+ // they should have a chain. Otherwise, attach the chain to the root.
+ //
+ // FIXME2: This is extremely dubious for several reasons, not the least of
+ // which is that it gives special status to instructions with patterns
+ // that Pat<> nodes can't duplicate.
+ if (II.hasChain_Inferred)
+ NodeHasChain = II.hasChain;
+ else
+ NodeHasChain = isRoot;
+ // Instructions which load and store from memory should have a chain,
+ // regardless of whether they happen to have a pattern saying so.
+ if (II.hasCtrlDep || II.mayLoad || II.mayStore || II.canFoldAsLoad ||
+ II.hasSideEffects)
+ NodeHasChain = true;
+ }
+
assert((!ResultVTs.empty() || TreeHasOutGlue || NodeHasChain) &&
"Node has no result");
diff --git a/contrib/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/contrib/llvm/utils/TableGen/DAGISelMatcherOpt.cpp
index 0bb656826fbd..554c7438ce3d 100644
--- a/contrib/llvm/utils/TableGen/DAGISelMatcherOpt.cpp
+++ b/contrib/llvm/utils/TableGen/DAGISelMatcherOpt.cpp
@@ -293,15 +293,12 @@ static void FactorNodes(std::unique_ptr<Matcher> &InputMatcherPtr) {
if (Scan != e &&
// Don't print if it's obvious nothing extra could be merged anyway.
Scan+1 != e) {
- DEBUG(errs() << "Couldn't merge this:\n";
- Optn->print(errs(), 4);
- errs() << "into this:\n";
- OptionsToMatch[Scan]->print(errs(), 4);
- if (Scan+1 != e)
- OptionsToMatch[Scan+1]->printOne(errs());
- if (Scan+2 < e)
- OptionsToMatch[Scan+2]->printOne(errs());
- errs() << "\n");
+ LLVM_DEBUG(errs() << "Couldn't merge this:\n"; Optn->print(errs(), 4);
+ errs() << "into this:\n";
+ OptionsToMatch[Scan]->print(errs(), 4);
+ if (Scan + 1 != e) OptionsToMatch[Scan + 1]->printOne(errs());
+ if (Scan + 2 < e) OptionsToMatch[Scan + 2]->printOne(errs());
+ errs() << "\n");
}
// If we only found one option starting with this matcher, no factoring is
diff --git a/contrib/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/contrib/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
index 1c1932a0144a..0db0f55f5ed6 100644
--- a/contrib/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
@@ -278,30 +278,30 @@ public:
// dbgsInsnClass - When debugging, print instruction class stages.
//
void dbgsInsnClass(const std::vector<unsigned> &InsnClass) {
- DEBUG(dbgs() << "InsnClass: ");
+ LLVM_DEBUG(dbgs() << "InsnClass: ");
for (unsigned i = 0; i < InsnClass.size(); ++i) {
if (i > 0) {
- DEBUG(dbgs() << ", ");
+ LLVM_DEBUG(dbgs() << ", ");
}
- DEBUG(dbgs() << "0x" << Twine::utohexstr(InsnClass[i]));
+ LLVM_DEBUG(dbgs() << "0x" << Twine::utohexstr(InsnClass[i]));
}
DFAInput InsnInput = getDFAInsnInput(InsnClass);
- DEBUG(dbgs() << " (input: 0x" << Twine::utohexstr(InsnInput) << ")");
+ LLVM_DEBUG(dbgs() << " (input: 0x" << Twine::utohexstr(InsnInput) << ")");
}
//
// dbgsStateInfo - When debugging, print the set of state info.
//
void dbgsStateInfo(const std::set<unsigned> &stateInfo) {
- DEBUG(dbgs() << "StateInfo: ");
+ LLVM_DEBUG(dbgs() << "StateInfo: ");
unsigned i = 0;
for (std::set<unsigned>::iterator SI = stateInfo.begin();
SI != stateInfo.end(); ++SI, ++i) {
unsigned thisState = *SI;
if (i > 0) {
- DEBUG(dbgs() << ", ");
+ LLVM_DEBUG(dbgs() << ", ");
}
- DEBUG(dbgs() << "0x" << Twine::utohexstr(thisState));
+ LLVM_DEBUG(dbgs() << "0x" << Twine::utohexstr(thisState));
}
}
@@ -310,7 +310,7 @@ void dbgsStateInfo(const std::set<unsigned> &stateInfo) {
//
void dbgsIndent(unsigned indent) {
for (unsigned i = 0; i < indent; ++i) {
- DEBUG(dbgs() << " ");
+ LLVM_DEBUG(dbgs() << " ");
}
}
#endif // NDEBUG
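
The pervasive DEBUG → LLVM_DEBUG change in this and the following hunks tracks the renamed macro from llvm/Support/Debug.h: it compiles away entirely in release builds and, in asserts-enabled builds, only prints when -debug (or -debug-only=<DEBUG_TYPE>) is given. A sketch of the usual pattern, assuming it is built inside the LLVM tree (the function and tag are illustrative):

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "dfa-emitter" // matched against -debug-only=

static void reportStates(unsigned NumStates) {
  // Compiled out when NDEBUG is defined; otherwise gated on -debug/-debug-only.
  LLVM_DEBUG(dbgs() << "Total states: " << NumStates << "\n");
}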
@@ -361,7 +361,8 @@ void State::AddInsnClass(std::vector<unsigned> &InsnClass,
DenseSet<unsigned> VisitedResourceStates;
- DEBUG(dbgs() << " thisState: 0x" << Twine::utohexstr(thisState) << "\n");
+ LLVM_DEBUG(dbgs() << " thisState: 0x" << Twine::utohexstr(thisState)
+ << "\n");
AddInsnClassStages(InsnClass, ComboBitToBitsMap,
numstages - 1, numstages,
thisState, thisState,
@@ -378,7 +379,7 @@ void State::AddInsnClassStages(std::vector<unsigned> &InsnClass,
assert((chkstage < numstages) && "AddInsnClassStages: stage out of range");
unsigned thisStage = InsnClass[chkstage];
- DEBUG({
+ LLVM_DEBUG({
dbgsIndent((1 + numstages - chkstage) << 1);
dbgs() << "AddInsnClassStages " << chkstage << " (0x"
<< Twine::utohexstr(thisStage) << ") from ";
@@ -395,10 +396,10 @@ void State::AddInsnClassStages(std::vector<unsigned> &InsnClass,
if (resourceMask & thisStage) {
unsigned combo = ComboBitToBitsMap[resourceMask];
if (combo && ((~prevState & combo) != combo)) {
- DEBUG(dbgs() << "\tSkipped Add 0x" << Twine::utohexstr(prevState)
- << " - combo op 0x" << Twine::utohexstr(resourceMask)
- << " (0x" << Twine::utohexstr(combo)
- << ") cannot be scheduled\n");
+ LLVM_DEBUG(dbgs() << "\tSkipped Add 0x" << Twine::utohexstr(prevState)
+ << " - combo op 0x" << Twine::utohexstr(resourceMask)
+ << " (0x" << Twine::utohexstr(combo)
+ << ") cannot be scheduled\n");
continue;
}
//
@@ -406,7 +407,7 @@ void State::AddInsnClassStages(std::vector<unsigned> &InsnClass,
// resource state if that resource was used.
//
unsigned ResultingResourceState = prevState | resourceMask | combo;
- DEBUG({
+ LLVM_DEBUG({
dbgsIndent((2 + numstages - chkstage) << 1);
dbgs() << "0x" << Twine::utohexstr(prevState) << " | 0x"
<< Twine::utohexstr(resourceMask);
@@ -433,13 +434,15 @@ void State::AddInsnClassStages(std::vector<unsigned> &InsnClass,
if (VisitedResourceStates.count(ResultingResourceState) == 0) {
VisitedResourceStates.insert(ResultingResourceState);
PossibleStates.insert(ResultingResourceState);
- DEBUG(dbgs() << "\tResultingResourceState: 0x"
- << Twine::utohexstr(ResultingResourceState) << "\n");
+ LLVM_DEBUG(dbgs()
+ << "\tResultingResourceState: 0x"
+ << Twine::utohexstr(ResultingResourceState) << "\n");
} else {
- DEBUG(dbgs() << "\tSkipped Add - state already seen\n");
+ LLVM_DEBUG(dbgs() << "\tSkipped Add - state already seen\n");
}
} else {
- DEBUG(dbgs() << "\tSkipped Add - no final resources available\n");
+ LLVM_DEBUG(dbgs()
+ << "\tSkipped Add - no final resources available\n");
}
} else {
//
@@ -447,13 +450,13 @@ void State::AddInsnClassStages(std::vector<unsigned> &InsnClass,
// stage in InsnClass for available resources.
//
if (ResultingResourceState != prevState) {
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
AddInsnClassStages(InsnClass, ComboBitToBitsMap,
chkstage - 1, numstages,
ResultingResourceState, origState,
VisitedResourceStates, PossibleStates);
} else {
- DEBUG(dbgs() << "\tSkipped Add - no resources available\n");
+ LLVM_DEBUG(dbgs() << "\tSkipped Add - no resources available\n");
}
}
}
@@ -494,10 +497,11 @@ bool State::canMaybeAddInsnClass(std::vector<unsigned> &InsnClass,
// These cases are caught later in AddInsnClass.
unsigned combo = ComboBitToBitsMap[InsnClass[i]];
if (combo && ((~resources & combo) != combo)) {
- DEBUG(dbgs() << "\tSkipped canMaybeAdd 0x"
- << Twine::utohexstr(resources) << " - combo op 0x"
- << Twine::utohexstr(InsnClass[i]) << " (0x"
- << Twine::utohexstr(combo) << ") cannot be scheduled\n");
+ LLVM_DEBUG(dbgs() << "\tSkipped canMaybeAdd 0x"
+ << Twine::utohexstr(resources) << " - combo op 0x"
+ << Twine::utohexstr(InsnClass[i]) << " (0x"
+ << Twine::utohexstr(combo)
+ << ") cannot be scheduled\n");
available = false;
break;
}
@@ -537,9 +541,10 @@ void DFA::writeTableAndAPI(raw_ostream &OS, const std::string &TargetName,
int maxResources, int numCombos, int maxStages) {
unsigned numStates = states.size();
- DEBUG(dbgs() << "-----------------------------------------------------------------------------\n");
- DEBUG(dbgs() << "writeTableAndAPI\n");
- DEBUG(dbgs() << "Total states: " << numStates << "\n");
+ LLVM_DEBUG(dbgs() << "-------------------------------------------------------"
+ "----------------------\n");
+ LLVM_DEBUG(dbgs() << "writeTableAndAPI\n");
+ LLVM_DEBUG(dbgs() << "Total states: " << numStates << "\n");
OS << "namespace llvm {\n";
@@ -647,9 +652,10 @@ int DFAPacketizerEmitter::collectAllFuncUnits(
std::map<std::string, unsigned> &FUNameToBitsMap,
int &maxFUs,
raw_ostream &OS) {
- DEBUG(dbgs() << "-----------------------------------------------------------------------------\n");
- DEBUG(dbgs() << "collectAllFuncUnits");
- DEBUG(dbgs() << " (" << ProcItinList.size() << " itineraries)\n");
+ LLVM_DEBUG(dbgs() << "-------------------------------------------------------"
+ "----------------------\n");
+ LLVM_DEBUG(dbgs() << "collectAllFuncUnits");
+ LLVM_DEBUG(dbgs() << " (" << ProcItinList.size() << " itineraries)\n");
int totalFUs = 0;
// Parse functional units for all the itineraries.
@@ -657,10 +663,8 @@ int DFAPacketizerEmitter::collectAllFuncUnits(
Record *Proc = ProcItinList[i];
std::vector<Record*> FUs = Proc->getValueAsListOfDefs("FU");
- DEBUG(dbgs() << " FU:" << i
- << " (" << FUs.size() << " FUs) "
- << Proc->getName());
-
+ LLVM_DEBUG(dbgs() << " FU:" << i << " (" << FUs.size() << " FUs) "
+ << Proc->getName());
// Convert macros to bits for each stage.
unsigned numFUs = FUs.size();
@@ -669,14 +673,14 @@ int DFAPacketizerEmitter::collectAllFuncUnits(
"Exceeded maximum number of representable resources");
unsigned FuncResources = (unsigned) (1U << j);
FUNameToBitsMap[FUs[j]->getName()] = FuncResources;
- DEBUG(dbgs() << " " << FUs[j]->getName() << ":0x"
- << Twine::utohexstr(FuncResources));
+ LLVM_DEBUG(dbgs() << " " << FUs[j]->getName() << ":0x"
+ << Twine::utohexstr(FuncResources));
}
if (((int) numFUs) > maxFUs) {
maxFUs = numFUs;
}
totalFUs += numFUs;
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
return totalFUs;
}
@@ -690,18 +694,18 @@ int DFAPacketizerEmitter::collectAllComboFuncs(
std::map<std::string, unsigned> &FUNameToBitsMap,
std::map<unsigned, unsigned> &ComboBitToBitsMap,
raw_ostream &OS) {
- DEBUG(dbgs() << "-----------------------------------------------------------------------------\n");
- DEBUG(dbgs() << "collectAllComboFuncs");
- DEBUG(dbgs() << " (" << ComboFuncList.size() << " sets)\n");
+ LLVM_DEBUG(dbgs() << "-------------------------------------------------------"
+ "----------------------\n");
+ LLVM_DEBUG(dbgs() << "collectAllComboFuncs");
+ LLVM_DEBUG(dbgs() << " (" << ComboFuncList.size() << " sets)\n");
int numCombos = 0;
for (unsigned i = 0, N = ComboFuncList.size(); i < N; ++i) {
Record *Func = ComboFuncList[i];
std::vector<Record*> FUs = Func->getValueAsListOfDefs("CFD");
- DEBUG(dbgs() << " CFD:" << i
- << " (" << FUs.size() << " combo FUs) "
- << Func->getName() << "\n");
+ LLVM_DEBUG(dbgs() << " CFD:" << i << " (" << FUs.size() << " combo FUs) "
+ << Func->getName() << "\n");
// Convert macros to bits for each stage.
for (unsigned j = 0, N = FUs.size(); j < N; ++j) {
@@ -714,20 +718,20 @@ int DFAPacketizerEmitter::collectAllComboFuncs(
const std::string &ComboFuncName = ComboFunc->getName();
unsigned ComboBit = FUNameToBitsMap[ComboFuncName];
unsigned ComboResources = ComboBit;
- DEBUG(dbgs() << " combo: " << ComboFuncName << ":0x"
- << Twine::utohexstr(ComboResources) << "\n");
+ LLVM_DEBUG(dbgs() << " combo: " << ComboFuncName << ":0x"
+ << Twine::utohexstr(ComboResources) << "\n");
for (unsigned k = 0, M = FuncList.size(); k < M; ++k) {
std::string FuncName = FuncList[k]->getName();
unsigned FuncResources = FUNameToBitsMap[FuncName];
- DEBUG(dbgs() << " " << FuncName << ":0x"
- << Twine::utohexstr(FuncResources) << "\n");
+ LLVM_DEBUG(dbgs() << " " << FuncName << ":0x"
+ << Twine::utohexstr(FuncResources) << "\n");
ComboResources |= FuncResources;
}
ComboBitToBitsMap[ComboBit] = ComboResources;
numCombos++;
- DEBUG(dbgs() << " => combo bits: " << ComboFuncName << ":0x"
- << Twine::utohexstr(ComboBit) << " = 0x"
- << Twine::utohexstr(ComboResources) << "\n");
+ LLVM_DEBUG(dbgs() << " => combo bits: " << ComboFuncName << ":0x"
+ << Twine::utohexstr(ComboBit) << " = 0x"
+ << Twine::utohexstr(ComboResources) << "\n");
}
}
return numCombos;
@@ -747,8 +751,8 @@ int DFAPacketizerEmitter::collectOneInsnClass(const std::string &ProcName,
// The number of stages.
unsigned NStages = StageList.size();
- DEBUG(dbgs() << " " << ItinData->getValueAsDef("TheClass")->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " " << ItinData->getValueAsDef("TheClass")->getName()
+ << "\n");
std::vector<unsigned> UnitBits;
@@ -760,8 +764,8 @@ int DFAPacketizerEmitter::collectOneInsnClass(const std::string &ProcName,
const std::vector<Record*> &UnitList =
Stage->getValueAsListOfDefs("Units");
- DEBUG(dbgs() << " stage:" << i
- << " [" << UnitList.size() << " units]:");
+ LLVM_DEBUG(dbgs() << " stage:" << i << " [" << UnitList.size()
+ << " units]:");
unsigned dbglen = 26; // cursor after stage dbgs
// Compute the bitwise or of each unit used in this stage.
@@ -769,7 +773,7 @@ int DFAPacketizerEmitter::collectOneInsnClass(const std::string &ProcName,
for (unsigned j = 0, M = UnitList.size(); j < M; ++j) {
// Conduct bitwise or.
std::string UnitName = UnitList[j]->getName();
- DEBUG(dbgs() << " " << j << ":" << UnitName);
+ LLVM_DEBUG(dbgs() << " " << j << ":" << UnitName);
dbglen += 3 + UnitName.length();
assert(FUNameToBitsMap.count(UnitName));
UnitBitValue |= FUNameToBitsMap[UnitName];
@@ -780,15 +784,16 @@ int DFAPacketizerEmitter::collectOneInsnClass(const std::string &ProcName,
while (dbglen <= 64) { // line up bits dbgs
dbglen += 8;
- DEBUG(dbgs() << "\t");
+ LLVM_DEBUG(dbgs() << "\t");
}
- DEBUG(dbgs() << " (bits: 0x" << Twine::utohexstr(UnitBitValue) << ")\n");
+ LLVM_DEBUG(dbgs() << " (bits: 0x" << Twine::utohexstr(UnitBitValue)
+ << ")\n");
}
if (!UnitBits.empty())
allInsnClasses.push_back(UnitBits);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << " ";
dbgsInsnClass(UnitBits);
dbgs() << "\n";
@@ -811,10 +816,10 @@ int DFAPacketizerEmitter::collectAllInsnClasses(const std::string &ProcName,
unsigned M = ItinDataList.size();
int numInsnClasses = 0;
- DEBUG(dbgs() << "-----------------------------------------------------------------------------\n"
- << "collectAllInsnClasses "
- << ProcName
- << " (" << M << " classes)\n");
+ LLVM_DEBUG(dbgs() << "-------------------------------------------------------"
+ "----------------------\n"
+ << "collectAllInsnClasses " << ProcName << " (" << M
+ << " classes)\n");
// Collect stages for each instruction class for all itinerary data
for (unsigned j = 0; j < M; j++) {
@@ -914,7 +919,7 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
//
while (!WorkList.empty()) {
const State *current = WorkList.pop_back_val();
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "---------------------\n";
dbgs() << "Processing state: " << current->stateNum << " - ";
dbgsStateInfo(current->stateInfo);
@@ -922,7 +927,7 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
});
for (unsigned i = 0; i < allInsnClasses.size(); i++) {
std::vector<unsigned> InsnClass = allInsnClasses[i];
- DEBUG({
+ LLVM_DEBUG({
dbgs() << i << " ";
dbgsInsnClass(InsnClass);
dbgs() << "\n";
@@ -938,11 +943,11 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
const State *NewState = nullptr;
current->AddInsnClass(InsnClass, ComboBitToBitsMap, NewStateResources);
if (NewStateResources.empty()) {
- DEBUG(dbgs() << " Skipped - no new states generated\n");
+ LLVM_DEBUG(dbgs() << " Skipped - no new states generated\n");
continue;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\t";
dbgsStateInfo(NewStateResources);
dbgs() << "\n";
@@ -954,7 +959,7 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
auto VI = Visited.find(NewStateResources);
if (VI != Visited.end()) {
NewState = VI->second;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\tFound existing state: " << NewState->stateNum
<< " - ";
dbgsStateInfo(NewState->stateInfo);
@@ -965,7 +970,7 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
NewState->stateInfo = NewStateResources;
Visited[NewStateResources] = NewState;
WorkList.push_back(NewState);
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "\tAccepted new state: " << NewState->stateNum << " - ";
dbgsStateInfo(NewState->stateInfo);
dbgs() << "\n";
diff --git a/contrib/llvm/utils/TableGen/DisassemblerEmitter.cpp b/contrib/llvm/utils/TableGen/DisassemblerEmitter.cpp
index 6e1d8dde981c..b99a0a973a2c 100644
--- a/contrib/llvm/utils/TableGen/DisassemblerEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/DisassemblerEmitter.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "CodeGenTarget.h"
+#include "WebAssemblyDisassemblerEmitter.h"
#include "X86DisassemblerTables.h"
#include "X86RecognizableInstr.h"
#include "llvm/TableGen/Error.h"
@@ -74,8 +75,8 @@ using namespace llvm::X86Disassembler;
/// accurate. Sometimes they are not.
/// (3) to fix the tables to reflect the actual context (for example, required
/// prefixes), and possibly to add a new context by editing
-/// lib/Target/X86/X86DisassemblerDecoderCommon.h. This is unlikely to be
-/// the cause.
+/// include/llvm/Support/X86DisassemblerDecoderCommon.h. This is unlikely
+/// to be the cause.
///
/// DisassemblerEmitter.cpp contains the implementation for the emitter,
/// which simply pulls out instructions from the CodeGenTarget and pushes them
@@ -125,6 +126,14 @@ void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) {
return;
}
+ // WebAssembly has variable-length opcodes, so it can't use EmitFixedLenDecoder
+ // below (which depends on a Size table-gen Record), and also uses a custom
+ // disassembler.
+ if (Target.getName() == "WebAssembly") {
+ emitWebAssemblyDisassemblerTables(OS, Target.getInstructionsByEnumValue());
+ return;
+ }
+
// ARM and Thumb have a CHECK() macro to deal with DecodeStatuses.
if (Target.getName() == "ARM" || Target.getName() == "Thumb" ||
Target.getName() == "AArch64" || Target.getName() == "ARM64") {
diff --git a/contrib/llvm/utils/TableGen/FastISelEmitter.cpp b/contrib/llvm/utils/TableGen/FastISelEmitter.cpp
index 610f4d21bf2d..c0902e4c6f1a 100644
--- a/contrib/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -36,8 +36,18 @@ struct InstructionMemo {
std::string Name;
const CodeGenRegisterClass *RC;
std::string SubRegNo;
- std::vector<std::string>* PhysRegs;
+ std::vector<std::string> PhysRegs;
std::string PredicateCheck;
+
+ InstructionMemo(std::string Name, const CodeGenRegisterClass *RC,
+ std::string SubRegNo, std::vector<std::string> PhysRegs,
+ std::string PredicateCheck)
+ : Name(Name), RC(RC), SubRegNo(SubRegNo), PhysRegs(PhysRegs),
+ PredicateCheck(PredicateCheck) {}
+
+ // Make sure we do not copy InstructionMemo.
+ InstructionMemo(const InstructionMemo &Other) = delete;
+ InstructionMemo(InstructionMemo &&Other) = default;
};
} // End anonymous namespace
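
The InstructionMemo change above replaces the heap-allocated PhysRegs vector with a by-value member, deletes the copy constructor, and keeps only the move constructor, so each memo is moved into its container exactly once. A minimal standalone sketch of that idiom (simplified stand-in names, not the real types):

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct Memo {
  std::string Name;
  std::vector<std::string> PhysRegs;

  Memo(std::string Name, std::vector<std::string> PhysRegs)
      : Name(std::move(Name)), PhysRegs(std::move(PhysRegs)) {}

  Memo(const Memo &) = delete; // copying would silently duplicate PhysRegs
  Memo(Memo &&) = default;     // moving is cheap and allowed
};

int main() {
  std::multimap<int, Memo> ByComplexity;

  Memo M("AND32rr", {"EAX", "ECX"});
  // emplace + std::move constructs the entry in place; no copy is ever made.
  ByComplexity.emplace(3, std::move(M));

  for (const auto &Entry : ByComplexity)
    std::cout << Entry.first << " -> " << Entry.second.Name << " ("
              << Entry.second.PhysRegs.size() << " phys regs)\n";
}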
@@ -453,6 +463,13 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
if (II.Operands.empty())
continue;
+ // Allow instructions to be marked as unavailable for FastISel for
+ // certain cases, e.g. an ISA that has two 'and' instructions which differ
+ // by what registers they can use but are otherwise identical for
+ // codegen purposes.
+ if (II.FastISelShouldIgnore)
+ continue;
+
// For now, ignore multi-instruction patterns.
bool MultiInsts = false;
for (unsigned i = 0, e = Dst->getNumChildren(); i != e; ++i) {
@@ -520,10 +537,10 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
DstRC))
continue;
- std::vector<std::string>* PhysRegInputs = new std::vector<std::string>();
+ std::vector<std::string> PhysRegInputs;
if (InstPatNode->getOperator()->getName() == "imm" ||
InstPatNode->getOperator()->getName() == "fpimm")
- PhysRegInputs->push_back("");
+ PhysRegInputs.push_back("");
else {
// Compute the PhysRegs used by the given pattern, and check that
// the mapping from the src to dst patterns is simple.
@@ -541,7 +558,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
++DstIndex;
}
- PhysRegInputs->push_back(PhysReg);
+ PhysRegInputs.push_back(PhysReg);
}
if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst->getNumChildren())
@@ -565,13 +582,13 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
std::string PredicateCheck = Pattern.getPredicateCheck();
// Ok, we found a pattern that we can handle. Remember it.
- InstructionMemo Memo = {
+ InstructionMemo Memo(
Pattern.getDstPattern()->getOperator()->getName(),
DstRC,
SubRegNo,
PhysRegInputs,
PredicateCheck
- };
+ );
int complexity = Pattern.getPatternComplexity(CGP);
@@ -585,8 +602,8 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
// Note: Instructions with the same complexity will appear in the order
// that they are encountered.
- SimplePatterns[Operands][OpcodeName][VT][RetVT].insert(
- std::make_pair(complexity, Memo));
+ SimplePatterns[Operands][OpcodeName][VT][RetVT].emplace(complexity,
+ std::move(Memo));
// If any of the operands were immediates with predicates on them, strip
// them down to a signature that doesn't have predicates so that we can
@@ -641,22 +658,22 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS,
OS << " ";
}
- for (unsigned i = 0; i < Memo.PhysRegs->size(); ++i) {
- if ((*Memo.PhysRegs)[i] != "")
+ for (unsigned i = 0; i < Memo.PhysRegs.size(); ++i) {
+ if (Memo.PhysRegs[i] != "")
OS << " BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, "
- << "TII.get(TargetOpcode::COPY), "
- << (*Memo.PhysRegs)[i] << ").addReg(Op" << i << ");\n";
+ << "TII.get(TargetOpcode::COPY), " << Memo.PhysRegs[i]
+ << ").addReg(Op" << i << ");\n";
}
OS << " return fastEmitInst_";
if (Memo.SubRegNo.empty()) {
- Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
- ImmediatePredicates, true);
+ Operands.PrintManglingSuffix(OS, Memo.PhysRegs, ImmediatePredicates,
+ true);
OS << "(" << InstNS << "::" << Memo.Name << ", ";
OS << "&" << InstNS << "::" << Memo.RC->getName() << "RegClass";
if (!Operands.empty())
OS << ", ";
- Operands.PrintArguments(OS, *Memo.PhysRegs);
+ Operands.PrintArguments(OS, Memo.PhysRegs);
OS << ");\n";
} else {
OS << "extractsubreg(" << RetVTName
@@ -811,7 +828,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
= SignaturesWithConstantForms.find(Operands);
if (MI != SignaturesWithConstantForms.end()) {
// Unique any duplicates out of the list.
- std::sort(MI->second.begin(), MI->second.end());
+ llvm::sort(MI->second.begin(), MI->second.end());
MI->second.erase(std::unique(MI->second.begin(), MI->second.end()),
MI->second.end());
diff --git a/contrib/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp b/contrib/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp
index 03930d7132df..76ba1c001092 100644
--- a/contrib/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -606,12 +606,13 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups,
// NumToSkip entry itself, so subtract three from the displacement here
// to account for that.
uint32_t FixupIdx = *I;
- uint32_t Delta = DestIdx - FixupIdx - 2;
- // Our NumToSkip entries are 16-bits. Make sure our table isn't too
+ uint32_t Delta = DestIdx - FixupIdx - 3;
+ // Our NumToSkip entries are 24-bits. Make sure our table isn't too
// big.
- assert(Delta < 65536U && "disassembler decoding table too large!");
+ assert(Delta < (1u << 24));
Table[FixupIdx] = (uint8_t)Delta;
Table[FixupIdx + 1] = (uint8_t)(Delta >> 8);
+ Table[FixupIdx + 2] = (uint8_t)(Delta >> 16);
}
}
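
This hunk widens the backpatched NumToSkip fixups from 16 to 24 bits, stored as three little-endian bytes; the generated decoder reassembles them in the same order. A standalone sketch of both sides of that encoding (purely illustrative, not the emitter's own helpers):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Backpatch a 24-bit displacement at position Idx, as resolveTableFixups does.
static void patchNumToSkip24(std::vector<uint8_t> &Table, size_t Idx, uint32_t Delta) {
  assert(Delta < (1u << 24) && "decoding table too large for a 24-bit skip");
  Table[Idx] = static_cast<uint8_t>(Delta);
  Table[Idx + 1] = static_cast<uint8_t>(Delta >> 8);
  Table[Idx + 2] = static_cast<uint8_t>(Delta >> 16);
}

// Reader side, mirroring the generated decodeInstruction() loop.
static uint32_t readNumToSkip24(const uint8_t *&Ptr) {
  uint32_t NumToSkip = *Ptr++;
  NumToSkip |= static_cast<uint32_t>(*Ptr++) << 8;
  NumToSkip |= static_cast<uint32_t>(*Ptr++) << 16;
  return NumToSkip;
}

int main() {
  std::vector<uint8_t> Table(8, 0);
  patchNumToSkip24(Table, 2, 0x12345); // an arbitrary 24-bit displacement
  const uint8_t *Ptr = Table.data() + 2;
  std::cout << std::hex << "read back 0x" << readNumToSkip24(Ptr) << "\n"; // 0x12345
}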
@@ -646,7 +647,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
} else {
Table.push_back(MCD::OPC_FilterValue);
// Encode and emit the value to filter against.
- uint8_t Buffer[8];
+ uint8_t Buffer[16];
unsigned Len = encodeULEB128(Filter.first, Buffer);
Table.insert(Table.end(), Buffer, Buffer + Len);
// Reserve space for the NumToSkip entry. We'll backpatch the value
@@ -654,6 +655,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
PrevFilter = Table.size();
Table.push_back(0);
Table.push_back(0);
+ Table.push_back(0);
}
// We arrive at a category of instructions with the same segment value.
@@ -666,10 +668,11 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
// of the filter itself to be able to skip forward when false. Subtract
// three to account for the width of the NumToSkip field itself.
if (PrevFilter) {
- uint32_t NumToSkip = Table.size() - PrevFilter - 2;
- assert(NumToSkip < 65536U && "disassembler decoding table too large!");
+ uint32_t NumToSkip = Table.size() - PrevFilter - 3;
+ assert(NumToSkip < (1u << 24) && "disassembler decoding table too large!");
Table[PrevFilter] = (uint8_t)NumToSkip;
Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8);
+ Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16);
}
}
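
The filter values and opcodes in these tables are ULEB128-encoded; a 64-bit value needs at most ten ULEB128 bytes, which is presumably why the on-stack buffers grow from 8 to 16 bytes in this patch. A standalone re-implementation of the encoding for illustration (not the llvm/Support/LEB128.h helper itself):

#include <cstdint>
#include <cstdio>

static unsigned encodeULEB128(uint64_t Value, uint8_t *Buffer) {
  unsigned Len = 0;
  do {
    uint8_t Byte = Value & 0x7f; // low seven payload bits
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // continuation bit: more bytes follow
    Buffer[Len++] = Byte;
  } while (Value != 0);
  return Len;
}

int main() {
  uint8_t Buffer[16];
  unsigned Len = encodeULEB128(0xFFFFFFFFFFFFFFFFull, Buffer); // worst case: 10 bytes
  std::printf("encoded in %u bytes:", Len);
  for (unsigned i = 0; i < Len; ++i)
    std::printf(" %02x", Buffer[i]);
  std::printf("\n");
}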
@@ -745,13 +748,16 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
OS << (unsigned)*I++ << ", ";
OS << (unsigned)*I++ << ", ";
- // 16-bit numtoskip value.
+ // 24-bit numtoskip value.
uint8_t Byte = *I++;
uint32_t NumToSkip = Byte;
OS << (unsigned)Byte << ", ";
Byte = *I++;
OS << (unsigned)Byte << ", ";
NumToSkip |= Byte << 8;
+ Byte = *I++;
+ OS << utostr(Byte) << ", ";
+ NumToSkip |= Byte << 16;
OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n";
break;
}
@@ -765,13 +771,16 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
for (; *I >= 128; ++I)
OS << (unsigned)*I << ", ";
OS << (unsigned)*I++ << ", ";
- // 16-bit numtoskip value.
+ // 24-bit numtoskip value.
uint8_t Byte = *I++;
uint32_t NumToSkip = Byte;
OS << (unsigned)Byte << ", ";
Byte = *I++;
OS << (unsigned)Byte << ", ";
NumToSkip |= Byte << 8;
+ Byte = *I++;
+ OS << utostr(Byte) << ", ";
+ NumToSkip |= Byte << 16;
OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n";
break;
}
@@ -782,13 +791,16 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
OS << (unsigned)*I << ", ";
OS << (unsigned)*I++ << ", ";
- // 16-bit numtoskip value.
+ // 24-bit numtoskip value.
uint8_t Byte = *I++;
uint32_t NumToSkip = Byte;
OS << (unsigned)Byte << ", ";
Byte = *I++;
OS << (unsigned)Byte << ", ";
NumToSkip |= Byte << 8;
+ Byte = *I++;
+ OS << utostr(Byte) << ", ";
+ NumToSkip |= Byte << 16;
OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n";
break;
}
@@ -797,7 +809,7 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
bool IsTry = *I == MCD::OPC_TryDecode;
++I;
// Extract the ULEB128 encoded Opcode to a buffer.
- uint8_t Buffer[8], *p = Buffer;
+ uint8_t Buffer[16], *p = Buffer;
while ((*p++ = *I++) >= 128)
assert((p - Buffer) <= (ptrdiff_t)sizeof(Buffer)
&& "ULEB128 value too large!");
@@ -822,13 +834,16 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
// Fallthrough for OPC_TryDecode.
- // 16-bit numtoskip value.
+ // 24-bit numtoskip value.
uint8_t Byte = *I++;
uint32_t NumToSkip = Byte;
OS << (unsigned)Byte << ", ";
Byte = *I++;
OS << (unsigned)Byte << ", ";
NumToSkip |= Byte << 8;
+ Byte = *I++;
+ OS << utostr(Byte) << ", ";
+ NumToSkip |= Byte << 16;
OS << "// Opcode: "
<< NumberedInstructions[Opc]->TheDef->getName()
@@ -1226,6 +1241,7 @@ void FilterChooser::emitPredicateTableEntry(DecoderTableInfo &TableInfo,
TableInfo.FixupStack.back().push_back(TableInfo.Table.size());
TableInfo.Table.push_back(0);
TableInfo.Table.push_back(0);
+ TableInfo.Table.push_back(0);
}
void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo,
@@ -1311,18 +1327,19 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo,
TableInfo.Table.push_back(MCD::OPC_CheckField);
TableInfo.Table.push_back(StartBits[I-1]);
TableInfo.Table.push_back(NumBits);
- uint8_t Buffer[8], *p;
+ uint8_t Buffer[16], *p;
encodeULEB128(FieldVals[I-1], Buffer);
for (p = Buffer; *p >= 128 ; ++p)
TableInfo.Table.push_back(*p);
TableInfo.Table.push_back(*p);
// Push location for NumToSkip backpatching.
TableInfo.FixupStack.back().push_back(TableInfo.Table.size());
- // The fixup is always 16-bits, so go ahead and allocate the space
+ // The fixup is always 24-bits, so go ahead and allocate the space
// in the table so all our relative position calculations work OK even
// before we fully resolve the real value here.
TableInfo.Table.push_back(0);
TableInfo.Table.push_back(0);
+ TableInfo.Table.push_back(0);
}
// Check for soft failure of the match.
@@ -1342,7 +1359,7 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo,
// can decode it.
TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode :
MCD::OPC_TryDecode);
- uint8_t Buffer[8], *p;
+ uint8_t Buffer[16], *p;
encodeULEB128(Opc, Buffer);
for (p = Buffer; *p >= 128 ; ++p)
TableInfo.Table.push_back(*p);
@@ -1362,6 +1379,7 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo,
// Allocate the space for the fixup.
TableInfo.Table.push_back(0);
TableInfo.Table.push_back(0);
+ TableInfo.Table.push_back(0);
}
}
@@ -1701,10 +1719,9 @@ void FilterChooser::emitTableEntries(DecoderTableInfo &TableInfo) const {
static std::string findOperandDecoderMethod(TypedInit *TI) {
std::string Decoder;
- RecordRecTy *Type = cast<RecordRecTy>(TI->getType());
- Record *TypeRecord = Type->getRecord();
+ Record *Record = cast<DefInit>(TI)->getDef();
- RecordVal *DecoderString = TypeRecord->getValue("DecoderMethod");
+ RecordVal *DecoderString = Record->getValue("DecoderMethod");
StringInit *String = DecoderString ?
dyn_cast<StringInit>(DecoderString->getValue()) : nullptr;
if (String) {
@@ -1713,14 +1730,14 @@ static std::string findOperandDecoderMethod(TypedInit *TI) {
return Decoder;
}
- if (TypeRecord->isSubClassOf("RegisterOperand"))
- TypeRecord = TypeRecord->getValueAsDef("RegClass");
+ if (Record->isSubClassOf("RegisterOperand"))
+ Record = Record->getValueAsDef("RegClass");
- if (TypeRecord->isSubClassOf("RegisterClass")) {
- Decoder = "Decode" + TypeRecord->getName().str() + "RegisterClass";
- } else if (TypeRecord->isSubClassOf("PointerLikeRegClass")) {
+ if (Record->isSubClassOf("RegisterClass")) {
+ Decoder = "Decode" + Record->getName().str() + "RegisterClass";
+ } else if (Record->isSubClassOf("PointerLikeRegClass")) {
Decoder = "DecodePointerLikeRegClass" +
- utostr(TypeRecord->getValueAsInt("RegClassKind"));
+ utostr(Record->getValueAsInt("RegClassKind"));
}
return Decoder;
@@ -1860,9 +1877,9 @@ static bool populateInstruction(CodeGenTarget &Target,
CGI.Operands.getSubOperandNumber(OpIdx);
const std::string &Name = CGI.Operands[SO.first].Name;
- DEBUG(dbgs() << "Numbered operand mapping for " << Def.getName() << ": " <<
- Name << "(" << SO.first << ", " << SO.second << ") => " <<
- Vals[i].getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Numbered operand mapping for " << Def.getName()
+ << ": " << Name << "(" << SO.first << ", " << SO.second
+ << ") => " << Vals[i].getName() << "\n");
std::string Decoder;
Record *TypeRecord = CGI.Operands[SO.first].Rec;
@@ -1878,10 +1895,8 @@ static bool populateInstruction(CodeGenTarget &Target,
CGI.Operands[SO.first].MIOperandInfo->getNumArgs()) {
Init *Arg = CGI.Operands[SO.first].MIOperandInfo->
getArg(SO.second);
- if (TypedInit *TI = cast<TypedInit>(Arg)) {
- RecordRecTy *Type = cast<RecordRecTy>(TI->getType());
- TypeRecord = Type->getRecord();
- }
+ if (DefInit *DI = cast<DefInit>(Arg))
+ TypeRecord = DI->getDef();
}
bool isReg = false;
@@ -1959,7 +1974,7 @@ static bool populateInstruction(CodeGenTarget &Target,
// to interpret it. As a first step, require the target to provide
// callbacks for decoding register classes.
std::string Decoder = findOperandDecoderMethod(TI);
- Record *TypeRecord = cast<RecordRecTy>(TI->getType())->getRecord();
+ Record *TypeRecord = cast<DefInit>(TI)->getDef();
RecordVal *HasCompleteDecoderVal =
TypeRecord->getValue("hasCompleteDecoder");
@@ -2026,7 +2041,7 @@ static bool populateInstruction(CodeGenTarget &Target,
Operands[Opc] = InsnOperands;
#if 0
- DEBUG({
+ LLVM_DEBUG({
// Dumps the instruction encoding bits.
dumpBits(errs(), Bits);
@@ -2048,10 +2063,16 @@ static bool populateInstruction(CodeGenTarget &Target,
// emitFieldFromInstruction - Emit the templated helper function
// fieldFromInstruction().
+// On Windows we make sure that this function is not inlined when
+// using the VS compiler, which has a bug that causes the function
+// to be optimized out in some circumstances. See llvm.org/pr38292
static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
OS << "// Helper function for extracting fields from encoded instructions.\n"
<< "template<typename InsnType>\n"
- << "static InsnType fieldFromInstruction(InsnType insn, unsigned startBit,\n"
+ << "#if defined(_MSC_VER) && !defined(__clang__)\n"
+ << "__declspec(noinline)\n"
+ << "#endif\n"
+ << "static InsnType fieldFromInstruction(InsnType insn, unsigned startBit,\n"
<< " unsigned numBits) {\n"
<< " assert(startBit + numBits <= (sizeof(InsnType)*8) &&\n"
<< " \"Instruction field out of bounds!\");\n"
@@ -2068,8 +2089,10 @@ static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
// decodeInstruction().
static void emitDecodeInstruction(formatted_raw_ostream &OS) {
OS << "template<typename InsnType>\n"
- << "static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,\n"
- << " InsnType insn, uint64_t Address,\n"
+ << "static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], "
+ "MCInst &MI,\n"
+ << " InsnType insn, uint64_t "
+ "Address,\n"
<< " const void *DisAsm,\n"
<< " const MCSubtargetInfo &STI) {\n"
<< " const FeatureBitset& Bits = STI.getFeatureBits();\n"
@@ -2088,7 +2111,8 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " unsigned Len = *++Ptr;\n"
<< " ++Ptr;\n"
<< " CurFieldValue = fieldFromInstruction(insn, Start, Len);\n"
- << " DEBUG(dbgs() << Loc << \": OPC_ExtractField(\" << Start << \", \"\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_ExtractField(\" << Start << "
+ "\", \"\n"
<< " << Len << \"): \" << CurFieldValue << \"\\n\");\n"
<< " break;\n"
<< " }\n"
@@ -2097,16 +2121,20 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " unsigned Len;\n"
<< " InsnType Val = decodeULEB128(++Ptr, &Len);\n"
<< " Ptr += Len;\n"
- << " // NumToSkip is a plain 16-bit integer.\n"
+ << " // NumToSkip is a plain 24-bit integer.\n"
<< " unsigned NumToSkip = *Ptr++;\n"
<< " NumToSkip |= (*Ptr++) << 8;\n"
+ << " NumToSkip |= (*Ptr++) << 16;\n"
<< "\n"
<< " // Perform the filter operation.\n"
<< " if (Val != CurFieldValue)\n"
<< " Ptr += NumToSkip;\n"
- << " DEBUG(dbgs() << Loc << \": OPC_FilterValue(\" << Val << \", \" << NumToSkip\n"
- << " << \"): \" << ((Val != CurFieldValue) ? \"FAIL:\" : \"PASS:\")\n"
- << " << \" continuing at \" << (Ptr - DecodeTable) << \"\\n\");\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_FilterValue(\" << Val << "
+ "\", \" << NumToSkip\n"
+ << " << \"): \" << ((Val != CurFieldValue) ? \"FAIL:\" "
+ ": \"PASS:\")\n"
+ << " << \" continuing at \" << (Ptr - DecodeTable) << "
+ "\"\\n\");\n"
<< "\n"
<< " break;\n"
<< " }\n"
@@ -2117,18 +2145,23 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " // Decode the field value.\n"
<< " uint32_t ExpectedValue = decodeULEB128(++Ptr, &Len);\n"
<< " Ptr += Len;\n"
- << " // NumToSkip is a plain 16-bit integer.\n"
+ << " // NumToSkip is a plain 24-bit integer.\n"
<< " unsigned NumToSkip = *Ptr++;\n"
<< " NumToSkip |= (*Ptr++) << 8;\n"
+ << " NumToSkip |= (*Ptr++) << 16;\n"
<< "\n"
<< " // If the actual and expected values don't match, skip.\n"
<< " if (ExpectedValue != FieldValue)\n"
<< " Ptr += NumToSkip;\n"
- << " DEBUG(dbgs() << Loc << \": OPC_CheckField(\" << Start << \", \"\n"
- << " << Len << \", \" << ExpectedValue << \", \" << NumToSkip\n"
- << " << \"): FieldValue = \" << FieldValue << \", ExpectedValue = \"\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_CheckField(\" << Start << "
+ "\", \"\n"
+ << " << Len << \", \" << ExpectedValue << \", \" << "
+ "NumToSkip\n"
+ << " << \"): FieldValue = \" << FieldValue << \", "
+ "ExpectedValue = \"\n"
<< " << ExpectedValue << \": \"\n"
- << " << ((ExpectedValue == FieldValue) ? \"PASS\\n\" : \"FAIL\\n\"));\n"
+ << " << ((ExpectedValue == FieldValue) ? \"PASS\\n\" : "
+ "\"FAIL\\n\"));\n"
<< " break;\n"
<< " }\n"
<< " case MCD::OPC_CheckPredicate: {\n"
@@ -2136,15 +2169,17 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " // Decode the Predicate Index value.\n"
<< " unsigned PIdx = decodeULEB128(++Ptr, &Len);\n"
<< " Ptr += Len;\n"
- << " // NumToSkip is a plain 16-bit integer.\n"
+ << " // NumToSkip is a plain 24-bit integer.\n"
<< " unsigned NumToSkip = *Ptr++;\n"
<< " NumToSkip |= (*Ptr++) << 8;\n"
+ << " NumToSkip |= (*Ptr++) << 16;\n"
<< " // Check the predicate.\n"
<< " bool Pred;\n"
<< " if (!(Pred = checkDecoderPredicate(PIdx, Bits)))\n"
<< " Ptr += NumToSkip;\n"
<< " (void)Pred;\n"
- << " DEBUG(dbgs() << Loc << \": OPC_CheckPredicate(\" << PIdx << \"): \"\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_CheckPredicate(\" << PIdx "
+ "<< \"): \"\n"
<< " << (Pred ? \"PASS\\n\" : \"FAIL\\n\"));\n"
<< "\n"
<< " break;\n"
@@ -2160,12 +2195,14 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " MI.clear();\n"
<< " MI.setOpcode(Opc);\n"
<< " bool DecodeComplete;\n"
- << " S = decodeToMCInst(S, DecodeIdx, insn, MI, Address, DisAsm, DecodeComplete);\n"
+ << " S = decodeToMCInst(S, DecodeIdx, insn, MI, Address, DisAsm, "
+ "DecodeComplete);\n"
<< " assert(DecodeComplete);\n"
<< "\n"
- << " DEBUG(dbgs() << Loc << \": OPC_Decode: opcode \" << Opc\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_Decode: opcode \" << Opc\n"
<< " << \", using decoder \" << DecodeIdx << \": \"\n"
- << " << (S != MCDisassembler::Fail ? \"PASS\" : \"FAIL\") << \"\\n\");\n"
+ << " << (S != MCDisassembler::Fail ? \"PASS\" : "
+ "\"FAIL\") << \"\\n\");\n"
<< " return S;\n"
<< " }\n"
<< " case MCD::OPC_TryDecode: {\n"
@@ -2175,29 +2212,35 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " Ptr += Len;\n"
<< " unsigned DecodeIdx = decodeULEB128(Ptr, &Len);\n"
<< " Ptr += Len;\n"
- << " // NumToSkip is a plain 16-bit integer.\n"
+ << " // NumToSkip is a plain 24-bit integer.\n"
<< " unsigned NumToSkip = *Ptr++;\n"
<< " NumToSkip |= (*Ptr++) << 8;\n"
+ << " NumToSkip |= (*Ptr++) << 16;\n"
<< "\n"
<< " // Perform the decode operation.\n"
<< " MCInst TmpMI;\n"
<< " TmpMI.setOpcode(Opc);\n"
<< " bool DecodeComplete;\n"
- << " S = decodeToMCInst(S, DecodeIdx, insn, TmpMI, Address, DisAsm, DecodeComplete);\n"
- << " DEBUG(dbgs() << Loc << \": OPC_TryDecode: opcode \" << Opc\n"
+ << " S = decodeToMCInst(S, DecodeIdx, insn, TmpMI, Address, DisAsm, "
+ "DecodeComplete);\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_TryDecode: opcode \" << "
+ "Opc\n"
<< " << \", using decoder \" << DecodeIdx << \": \");\n"
<< "\n"
<< " if (DecodeComplete) {\n"
<< " // Decoding complete.\n"
- << " DEBUG(dbgs() << (S != MCDisassembler::Fail ? \"PASS\" : \"FAIL\") << \"\\n\");\n"
+ << " LLVM_DEBUG(dbgs() << (S != MCDisassembler::Fail ? \"PASS\" : "
+ "\"FAIL\") << \"\\n\");\n"
<< " MI = TmpMI;\n"
<< " return S;\n"
<< " } else {\n"
<< " assert(S == MCDisassembler::Fail);\n"
<< " // If the decoding was incomplete, skip.\n"
<< " Ptr += NumToSkip;\n"
- << " DEBUG(dbgs() << \"FAIL: continuing at \" << (Ptr - DecodeTable) << \"\\n\");\n"
- << " // Reset decode status. This also drops a SoftFail status that could be\n"
+ << " LLVM_DEBUG(dbgs() << \"FAIL: continuing at \" << (Ptr - "
+ "DecodeTable) << \"\\n\");\n"
+ << " // Reset decode status. This also drops a SoftFail status "
+ "that could be\n"
<< " // set before the decode attempt.\n"
<< " S = MCDisassembler::Success;\n"
<< " }\n"
@@ -2213,16 +2256,18 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " bool Fail = (insn & PositiveMask) || (~insn & NegativeMask);\n"
<< " if (Fail)\n"
<< " S = MCDisassembler::SoftFail;\n"
- << " DEBUG(dbgs() << Loc << \": OPC_SoftFail: \" << (Fail ? \"FAIL\\n\":\"PASS\\n\"));\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_SoftFail: \" << (Fail ? "
+ "\"FAIL\\n\":\"PASS\\n\"));\n"
<< " break;\n"
<< " }\n"
<< " case MCD::OPC_Fail: {\n"
- << " DEBUG(dbgs() << Loc << \": OPC_Fail\\n\");\n"
+ << " LLVM_DEBUG(dbgs() << Loc << \": OPC_Fail\\n\");\n"
<< " return MCDisassembler::Fail;\n"
<< " }\n"
<< " }\n"
<< " }\n"
- << " llvm_unreachable(\"bogosity detected in disassembler state machine!\");\n"
+ << " llvm_unreachable(\"bogosity detected in disassembler state "
+ "machine!\");\n"
<< "}\n\n";
}
diff --git a/contrib/llvm/utils/TableGen/GlobalISelEmitter.cpp b/contrib/llvm/utils/TableGen/GlobalISelEmitter.cpp
index c7d662db5a2f..69726cc9f257 100644
--- a/contrib/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -35,11 +35,11 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Support/CodeGenCoverage.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
@@ -84,6 +84,8 @@ namespace {
/// Get the name of the enum value used to number the predicate function.
std::string getEnumNameForPredicate(const TreePredicateFn &Predicate) {
+ if (Predicate.hasGISelPredicateCode())
+ return "GIPFP_MI_" + Predicate.getFnName();
return "GIPFP_" + Predicate.getImmTypeIdentifier().str() + "_" +
Predicate.getFnName();
}
@@ -100,6 +102,7 @@ private:
LLT Ty;
public:
+ LLTCodeGen() = default;
LLTCodeGen(const LLT &Ty) : Ty(Ty) {}
std::string getCxxEnumValue() const {
@@ -148,7 +151,7 @@ public:
const LLT &get() const { return Ty; }
- /// This ordering is used for std::unique() and std::sort(). There's no
+ /// This ordering is used for std::unique() and llvm::sort(). There's no
/// particular logic behind the order but either A < B or B < A must be
/// true if A != B.
bool operator<(const LLTCodeGen &Other) const {
@@ -176,6 +179,9 @@ public:
bool operator==(const LLTCodeGen &B) const { return Ty == B.Ty; }
};
+// Track all types that are used so we can emit the corresponding enum.
+std::set<LLTCodeGen> KnownTypes;
+
class InstructionMatcher;
/// Convert an MVT to an equivalent LLT if possible, or the invalid LLT() for
/// MVTs that don't map cleanly to an LLT (e.g., iPTR, *any, ...).
@@ -260,6 +266,11 @@ std::string explainOperator(Record *Operator) {
")")
.str();
+ if (Operator->isSubClassOf("SDNodeXForm"))
+ return (" (Operator is an unmapped SDNodeXForm, " + Operator->getName() +
+ ")")
+ .str();
+
return (" (Operator " + Operator->getName() + " not understood)").str();
}
@@ -280,12 +291,16 @@ static Error isTrivialOperatorNode(const TreePatternNode *N) {
if (Predicate.isImmediatePattern())
continue;
- if (Predicate.isNonExtLoad())
+ if (Predicate.isNonExtLoad() || Predicate.isAnyExtLoad() ||
+ Predicate.isSignExtLoad() || Predicate.isZeroExtLoad())
continue;
if (Predicate.isNonTruncStore())
continue;
+ if (Predicate.isLoad() && Predicate.getMemoryVT())
+ continue;
+
if (Predicate.isLoad() || Predicate.isStore()) {
if (Predicate.isUnindexed())
continue;
@@ -306,6 +321,9 @@ static Error isTrivialOperatorNode(const TreePatternNode *N) {
Predicate.isAtomicOrderingWeakerThanRelease()))
continue;
+ if (Predicate.hasGISelPredicateCode())
+ continue;
+
HasUnsupportedPredicate = true;
Explanation = Separator + "Has a predicate (" + explainPredicates(N) + ")";
Separator = ", ";
@@ -315,12 +333,7 @@ static Error isTrivialOperatorNode(const TreePatternNode *N) {
break;
}
- if (N->getTransformFn()) {
- Explanation += Separator + "Has a transform function";
- Separator = ", ";
- }
-
- if (!HasUnsupportedPredicate && !N->getTransformFn())
+ if (!HasUnsupportedPredicate)
return Error::success();
return failedImport(Explanation);
@@ -394,19 +407,42 @@ public:
/// A bitfield of RecordFlagsBits flags.
unsigned Flags;
+ /// The actual run-time value, if known
+ int64_t RawValue;
+
MatchTableRecord(Optional<unsigned> LabelID_, StringRef EmitStr,
- unsigned NumElements, unsigned Flags)
+ unsigned NumElements, unsigned Flags,
+ int64_t RawValue = std::numeric_limits<int64_t>::min())
: LabelID(LabelID_.hasValue() ? LabelID_.getValue() : ~0u),
- EmitStr(EmitStr), NumElements(NumElements), Flags(Flags) {
+ EmitStr(EmitStr), NumElements(NumElements), Flags(Flags),
+ RawValue(RawValue) {
+
assert((!LabelID_.hasValue() || LabelID != ~0u) &&
"This value is reserved for non-labels");
}
+ MatchTableRecord(const MatchTableRecord &Other) = default;
+ MatchTableRecord(MatchTableRecord &&Other) = default;
+
+ /// Useful if a Match Table Record gets optimized out
+ void turnIntoComment() {
+ Flags |= MTRF_Comment;
+ Flags &= ~MTRF_CommaFollows;
+ NumElements = 0;
+ }
+
+ /// For Jump Table generation purposes
+ bool operator<(const MatchTableRecord &Other) const {
+ return RawValue < Other.RawValue;
+ }
+ int64_t getRawValue() const { return RawValue; }
void emit(raw_ostream &OS, bool LineBreakNextAfterThis,
const MatchTable &Table) const;
unsigned size() const { return NumElements; }
};
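
The new RawValue member and operator< let a MatchTableRecord carry the numeric value it stands for alongside its printable form, so candidate cases can be ordered numerically when the emitter considers turning them into a jump table. A standalone sketch of that pairing (names and opcode numbers are made up for illustration):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct CaseRecord {
  std::string EmitStr; // what gets printed into the table
  int64_t RawValue;    // the run-time value it represents, if known

  bool operator<(const CaseRecord &Other) const { return RawValue < Other.RawValue; }
};

int main() {
  std::vector<CaseRecord> Cases = {
      {"TargetOpcode::G_STORE", 83}, {"TargetOpcode::G_LOAD", 82}, {"TargetOpcode::G_ADD", 60}};

  std::sort(Cases.begin(), Cases.end()); // numeric order, independent of the strings

  for (const CaseRecord &C : Cases)
    std::cout << C.RawValue << " -> " << C.EmitStr << "\n";
}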
+class Matcher;
+
/// Holds the contents of a generated MatchTable to enable formatting and the
/// necessary index tracking needed to support GIM_Try.
class MatchTable {
@@ -419,10 +455,11 @@ class MatchTable {
/// The currently defined labels.
DenseMap<unsigned, unsigned> LabelMap;
/// Tracks the sum of MatchTableRecord::NumElements as the table is built.
- unsigned CurrentSize;
-
+ unsigned CurrentSize = 0;
/// A unique identifier for a MatchTable label.
- static unsigned CurrentLabelID;
+ unsigned CurrentLabelID = 0;
+ /// Determines if the table should be instrumented for rule coverage tracking.
+ bool IsWithCoverage;
public:
static MatchTableRecord LineBreak;
@@ -443,11 +480,20 @@ public:
return MatchTableRecord(None, NamedValue, 1,
MatchTableRecord::MTRF_CommaFollows);
}
+ static MatchTableRecord NamedValue(StringRef NamedValue, int64_t RawValue) {
+ return MatchTableRecord(None, NamedValue, 1,
+ MatchTableRecord::MTRF_CommaFollows, RawValue);
+ }
static MatchTableRecord NamedValue(StringRef Namespace,
StringRef NamedValue) {
return MatchTableRecord(None, (Namespace + "::" + NamedValue).str(), 1,
MatchTableRecord::MTRF_CommaFollows);
}
+ static MatchTableRecord NamedValue(StringRef Namespace, StringRef NamedValue,
+ int64_t RawValue) {
+ return MatchTableRecord(None, (Namespace + "::" + NamedValue).str(), 1,
+ MatchTableRecord::MTRF_CommaFollows, RawValue);
+ }
static MatchTableRecord IntValue(int64_t IntValue) {
return MatchTableRecord(None, llvm::to_string(IntValue), 1,
MatchTableRecord::MTRF_CommaFollows);
@@ -465,7 +511,12 @@ public:
MatchTableRecord::MTRF_CommaFollows);
}
- MatchTable(unsigned ID) : ID(ID), CurrentSize(0) {}
+ static MatchTable buildTable(ArrayRef<Matcher *> Rules, bool WithCoverage);
+
+ MatchTable(bool WithCoverage, unsigned ID = 0)
+ : ID(ID), IsWithCoverage(WithCoverage) {}
+
+ bool isWithCoverage() const { return IsWithCoverage; }
void push_back(const MatchTableRecord &Value) {
if (Value.Flags & MatchTableRecord::MTRF_Label)
@@ -474,7 +525,7 @@ public:
CurrentSize += Value.size();
}
- unsigned allocateLabelID() const { return CurrentLabelID++; }
+ unsigned allocateLabelID() { return CurrentLabelID++; }
void defineLabel(unsigned LabelID) {
LabelMap.insert(std::make_pair(LabelID, CurrentSize));
@@ -519,8 +570,6 @@ public:
}
};
-unsigned MatchTable::CurrentLabelID = 0;
-
MatchTableRecord MatchTable::LineBreak = {
None, "" /* Emit String */, 0 /* Elements */,
MatchTableRecord::MTRF_LineBreakFollows};
@@ -573,65 +622,172 @@ class RuleMatcher;
class Matcher {
public:
virtual ~Matcher() = default;
+ virtual void optimize() {}
virtual void emit(MatchTable &Table) = 0;
- virtual std::unique_ptr<PredicateMatcher> forgetFirstCondition() = 0;
+
+ virtual bool hasFirstCondition() const = 0;
+ virtual const PredicateMatcher &getFirstCondition() const = 0;
+ virtual std::unique_ptr<PredicateMatcher> popFirstCondition() = 0;
};
-class GroupMatcher : public Matcher {
- SmallVector<std::unique_ptr<PredicateMatcher>, 8> Conditions;
- SmallVector<Matcher *, 8> Rules;
+MatchTable MatchTable::buildTable(ArrayRef<Matcher *> Rules,
+ bool WithCoverage) {
+ MatchTable Table(WithCoverage);
+ for (Matcher *Rule : Rules)
+ Rule->emit(Table);
+
+ return Table << MatchTable::Opcode("GIM_Reject") << MatchTable::LineBreak;
+}
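
A hypothetical caller (for instance the emitter's top-level run routine) would build the
whole table in one go once the rules have been optimized. Illustrative sketch only; the
names `OptimizedRules` (an ArrayRef<Matcher *>) and `GenerateCoverage` are assumptions
made for this sketch:

    MatchTable Table =
        MatchTable::buildTable(OptimizedRules, /*WithCoverage=*/GenerateCoverage);
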
+
+class GroupMatcher final : public Matcher {
+ /// Conditions that form a common prefix of all the matchers contained.
+ SmallVector<std::unique_ptr<PredicateMatcher>, 1> Conditions;
+
+ /// All the nested matchers, sharing a common prefix.
+ std::vector<Matcher *> Matchers;
+
+ /// An owning collection for any auxiliary matchers created while optimizing
+  /// the nested matchers it contains.
+ std::vector<std::unique_ptr<Matcher>> MatcherStorage;
public:
- void addCondition(std::unique_ptr<PredicateMatcher> &&Predicate) {
- Conditions.emplace_back(std::move(Predicate));
+ /// Add a matcher to the collection of nested matchers if it meets the
+ /// requirements, and return true. If it doesn't, do nothing and return false.
+ ///
+ /// Expected to preserve its argument, so it could be moved out later on.
+ bool addMatcher(Matcher &Candidate);
+
+  /// Mark the matcher as fully built, establishing any invariants expected by
+  /// both the optimize() and emit(...) methods. Generally, both of the following
+  /// sequences of calls
+ /// are expected to lead to a sensible result:
+ ///
+ /// addMatcher(...)*; finalize(); optimize(); emit(...); and
+ /// addMatcher(...)*; finalize(); emit(...);
+ ///
+ /// or generally
+ ///
+ /// addMatcher(...)*; finalize(); { optimize()*; emit(...); }*
+ ///
+ /// Multiple calls to optimize() are expected to be handled gracefully, though
+ /// optimize() is not expected to be idempotent. Multiple calls to finalize()
+  /// aren't generally supported. emit(...) is expected to be non-mutating and
+  /// to produce exactly the same results upon repeated calls (see the
+  /// illustrative call sequence following this class).
+ ///
+ /// addMatcher() calls after the finalize() call are not supported.
+ ///
+ /// finalize() and optimize() are both allowed to mutate the contained
+ /// matchers, so moving them out after finalize() is not supported.
+ void finalize();
+ void optimize() override;
+ void emit(MatchTable &Table) override;
+
+ /// Could be used to move out the matchers added previously, unless finalize()
+  /// has already been called. If any of the matchers are moved out, the group
+ /// becomes safe to destroy, but not safe to re-use for anything else.
+ iterator_range<std::vector<Matcher *>::iterator> matchers() {
+ return make_range(Matchers.begin(), Matchers.end());
}
- void addRule(Matcher &Rule) { Rules.push_back(&Rule); }
- const std::unique_ptr<PredicateMatcher> &conditions_back() const {
- return Conditions.back();
+ size_t size() const { return Matchers.size(); }
+ bool empty() const { return Matchers.empty(); }
+
+ std::unique_ptr<PredicateMatcher> popFirstCondition() override {
+ assert(!Conditions.empty() &&
+ "Trying to pop a condition from a condition-less group");
+ std::unique_ptr<PredicateMatcher> P = std::move(Conditions.front());
+ Conditions.erase(Conditions.begin());
+ return P;
}
- bool lastConditionMatches(const PredicateMatcher &Predicate) const;
- bool conditions_empty() const { return Conditions.empty(); }
- void clear() {
- Conditions.clear();
- Rules.clear();
+ const PredicateMatcher &getFirstCondition() const override {
+ assert(!Conditions.empty() &&
+ "Trying to get a condition from a condition-less group");
+ return *Conditions.front();
}
+ bool hasFirstCondition() const override { return !Conditions.empty(); }
+
+private:
+ /// See if a candidate matcher could be added to this group solely by
+ /// analyzing its first condition.
+ bool candidateConditionMatches(const PredicateMatcher &Predicate) const;
+};
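
The addMatcher()/finalize()/optimize()/emit() contract documented on GroupMatcher above
is easiest to read as a call sequence. The sketch below is illustrative only and not part
of the patch; it assumes the surrounding definitions plus a few already-built objects
(hypothetical names: `Rule1` and `Rule2` for Matcher subclass instances, `Table` for a
MatchTable):

    GroupMatcher Group;
    if (Group.addMatcher(Rule1))   // false would mean Rule1 doesn't fit this group
      Group.addMatcher(Rule2);
    Group.finalize();              // no addMatcher() calls past this point
    Group.optimize();              // optional; may be repeated
    Group.emit(Table);             // non-mutating; may be repeated

In the emitter itself this sequence is presumably driven by the optimizeRules() template
declared further down in this patch.
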
+
+class SwitchMatcher : public Matcher {
+ /// All the nested matchers, representing distinct switch-cases. The first
+ /// conditions (as Matcher::getFirstCondition() reports) of all the nested
+  /// matchers must share the same type and path to the value they check; in
+  /// other words, they must be isIdenticalDownToValue, but check against
+  /// different values (see the illustrative sketch following this class).
+ std::vector<Matcher *> Matchers;
+
+ /// The representative condition, with a type and a path (InsnVarID and OpIdx
+ /// in most cases) shared by all the matchers contained.
+ std::unique_ptr<PredicateMatcher> Condition = nullptr;
+
+ /// Temporary set used to check that the case values don't repeat within the
+ /// same switch.
+ std::set<MatchTableRecord> Values;
+
+ /// An owning collection for any auxiliary matchers created while optimizing
+  /// the nested matchers it contains.
+ std::vector<std::unique_ptr<Matcher>> MatcherStorage;
+
+public:
+ bool addMatcher(Matcher &Candidate);
+
+ void finalize();
void emit(MatchTable &Table) override;
- std::unique_ptr<PredicateMatcher> forgetFirstCondition() override {
- // We shouldn't need to mess up with groups, since we
- // should have merged everything shareable upfront.
- // If we start to look into reordering predicates,
- // we may want to reconsider this.
- assert(0 && "Groups should be formed maximal for now");
- llvm_unreachable("No need for this for now");
+ iterator_range<std::vector<Matcher *>::iterator> matchers() {
+ return make_range(Matchers.begin(), Matchers.end());
}
+ size_t size() const { return Matchers.size(); }
+ bool empty() const { return Matchers.empty(); }
+
+ std::unique_ptr<PredicateMatcher> popFirstCondition() override {
+ // SwitchMatcher doesn't have a common first condition for its cases, as all
+    // the cases only share a kind of value (a type and a path to it) they
+ // match, but deliberately differ in the actual value they match.
+ llvm_unreachable("Trying to pop a condition from a condition-less group");
+ }
+ const PredicateMatcher &getFirstCondition() const override {
+ llvm_unreachable("Trying to pop a condition from a condition-less group");
+ }
+ bool hasFirstCondition() const override { return false; }
+
+private:
+  /// See if the predicate type has a switch implementation.
+ static bool isSupportedPredicateType(const PredicateMatcher &Predicate);
+
+ bool candidateConditionMatches(const PredicateMatcher &Predicate) const;
+
+ /// emit()-helper
+ static void emitPredicateSpecificOpcodes(const PredicateMatcher &P,
+ MatchTable &Table);
};
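
A SwitchMatcher is driven through the same addMatcher()/finalize()/emit() steps.
Illustrative sketch only, not part of the patch; `RuleA` and `RuleB` are assumed to be
matchers whose first conditions test the same instruction and operand but against
different values (for instance, two different opcodes), and `Table` an existing
MatchTable:

    SwitchMatcher Switch;
    if (Switch.addMatcher(RuleA) && Switch.addMatcher(RuleB)) {
      Switch.finalize();
      Switch.emit(Table);   // one value-dispatching check covers all the cases
    }
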
/// Generates code to check that a match rule matches.
class RuleMatcher : public Matcher {
public:
- using ActionVec = std::vector<std::unique_ptr<MatchAction>>;
- using action_iterator = ActionVec::iterator;
+ using ActionList = std::list<std::unique_ptr<MatchAction>>;
+ using action_iterator = ActionList::iterator;
protected:
/// A list of matchers that all need to succeed for the current rule to match.
/// FIXME: This currently supports a single match position but could be
/// extended to support multiple positions to support div/rem fusion or
/// load-multiple instructions.
- std::vector<std::unique_ptr<InstructionMatcher>> Matchers;
+  using MatchersTy = std::vector<std::unique_ptr<InstructionMatcher>>;
+ MatchersTy Matchers;
/// A list of actions that need to be taken when all predicates in this rule
/// have succeeded.
- ActionVec Actions;
+ ActionList Actions;
- using DefinedInsnVariablesMap =
- std::map<const InstructionMatcher *, unsigned>;
+ using DefinedInsnVariablesMap = std::map<InstructionMatcher *, unsigned>;
- /// A map of instruction matchers to the local variables created by
- /// emitCaptureOpcodes().
+  /// A map of instruction matchers to their local instruction variable IDs.
DefinedInsnVariablesMap InsnVariableIDs;
- using MutatableInsnSet = SmallPtrSet<const InstructionMatcher *, 4>;
+ using MutatableInsnSet = SmallPtrSet<InstructionMatcher *, 4>;
// The set of instruction matchers that have not yet been claimed for mutation
// by a BuildMI.
@@ -641,7 +797,7 @@ protected:
/// the renderers.
StringMap<OperandMatcher *> DefinedOperands;
- /// ID for the next instruction variable defined with defineInsnVar()
+ /// ID for the next instruction variable defined with implicitlyDefineInsnVar()
unsigned NextInsnVarID;
/// ID for the next output instruction allocated with allocateOutputInsnID()
@@ -651,6 +807,7 @@ protected:
unsigned NextTempRegID;
std::vector<Record *> RequiredFeatures;
+ std::vector<std::unique_ptr<PredicateMatcher>> EpilogueMatchers;
ArrayRef<SMLoc> SrcLoc;
@@ -684,16 +841,9 @@ public:
action_iterator insertAction(action_iterator InsertPt, Args &&... args);
/// Define an instruction without emitting any code to do so.
- /// This is used for the root of the match.
- unsigned implicitlyDefineInsnVar(const InstructionMatcher &Matcher);
- void clearImplicitMap() {
- NextInsnVarID = 0;
- InsnVariableIDs.clear();
- };
- /// Define an instruction and emit corresponding state-machine opcodes.
- unsigned defineInsnVar(MatchTable &Table, const InstructionMatcher &Matcher,
- unsigned InsnVarID, unsigned OpIdx);
- unsigned getInsnVarID(const InstructionMatcher &InsnMatcher) const;
+ unsigned implicitlyDefineInsnVar(InstructionMatcher &Matcher);
+
+ unsigned getInsnVarID(InstructionMatcher &InsnMatcher) const;
DefinedInsnVariablesMap::const_iterator defined_insn_vars_begin() const {
return InsnVariableIDs.begin();
}
@@ -715,7 +865,7 @@ public:
mutatable_insns() const {
return make_range(mutatable_insns_begin(), mutatable_insns_end());
}
- void reserveInsnMatcherForMutation(const InstructionMatcher *InsnMatcher) {
+ void reserveInsnMatcherForMutation(InstructionMatcher *InsnMatcher) {
bool R = MutatableInsns.erase(InsnMatcher);
assert(R && "Reserving a mutatable insn that isn't available");
(void)R;
@@ -743,11 +893,10 @@ public:
return I->second;
}
- const InstructionMatcher &getInstructionMatcher(StringRef SymbolicName) const;
+ InstructionMatcher &getInstructionMatcher(StringRef SymbolicName) const;
const OperandMatcher &getOperandMatcher(StringRef Name) const;
- void emitCaptureOpcodes(MatchTable &Table);
-
+ void optimize() override;
void emit(MatchTable &Table) override;
/// Compare the priority of this object and B.
@@ -759,7 +908,12 @@ public:
/// matcher.
unsigned countRendererFns() const;
- std::unique_ptr<PredicateMatcher> forgetFirstCondition() override;
+ std::unique_ptr<PredicateMatcher> popFirstCondition() override;
+ const PredicateMatcher &getFirstCondition() const override;
+ LLTCodeGen getFirstConditionAsRootType();
+ bool hasFirstCondition() const override;
+ unsigned getNumOperands() const;
+ StringRef getOpcode() const;
// FIXME: Remove this as soon as possible
InstructionMatcher &insnmatchers_front() const { return *Matchers.front(); }
@@ -767,6 +921,9 @@ public:
unsigned allocateOutputInsnID() { return NextOutputInsnID++; }
unsigned allocateTempRegID() { return NextTempRegID++; }
+ iterator_range<MatchersTy::iterator> insnmatchers() {
+ return make_range(Matchers.begin(), Matchers.end());
+ }
bool insnmatchers_empty() const { return Matchers.empty(); }
void insnmatchers_pop_front() { Matchers.erase(Matchers.begin()); }
};
@@ -777,58 +934,69 @@ using action_iterator = RuleMatcher::action_iterator;
template <class PredicateTy> class PredicateListMatcher {
private:
- typedef std::vector<std::unique_ptr<PredicateTy>> PredicateVec;
- PredicateVec Predicates;
-
/// Template instantiations should specialize this to return a string to use
/// for the comment emitted when there are no predicates.
std::string getNoPredicateComment() const;
+protected:
+ using PredicatesTy = std::deque<std::unique_ptr<PredicateTy>>;
+ PredicatesTy Predicates;
+
+ /// Track if the list of predicates was manipulated by one of the optimization
+ /// methods.
+ bool Optimized = false;
+
public:
- /// Construct a new operand predicate and add it to the matcher.
+ /// Construct a new predicate and add it to the matcher.
template <class Kind, class... Args>
- Optional<Kind *> addPredicate(Args&&... args) {
- Predicates.emplace_back(
- llvm::make_unique<Kind>(std::forward<Args>(args)...));
- return static_cast<Kind *>(Predicates.back().get());
- }
+ Optional<Kind *> addPredicate(Args &&... args);
- typename PredicateVec::const_iterator predicates_begin() const {
+ typename PredicatesTy::iterator predicates_begin() {
return Predicates.begin();
}
- typename PredicateVec::const_iterator predicates_end() const {
+ typename PredicatesTy::iterator predicates_end() {
return Predicates.end();
}
- iterator_range<typename PredicateVec::const_iterator> predicates() const {
+ iterator_range<typename PredicatesTy::iterator> predicates() {
return make_range(predicates_begin(), predicates_end());
}
- typename PredicateVec::size_type predicates_size() const {
+ typename PredicatesTy::size_type predicates_size() const {
return Predicates.size();
}
bool predicates_empty() const { return Predicates.empty(); }
std::unique_ptr<PredicateTy> predicates_pop_front() {
std::unique_ptr<PredicateTy> Front = std::move(Predicates.front());
- Predicates.erase(Predicates.begin());
+ Predicates.pop_front();
+ Optimized = true;
return Front;
}
+ void prependPredicate(std::unique_ptr<PredicateTy> &&Predicate) {
+ Predicates.push_front(std::move(Predicate));
+ }
+
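+  /// Remove any predicates that are now null (e.g. because they were moved out
+  /// or reset by an optimization), keeping the remaining predicates in their
+  /// original relative order.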
+ void eraseNullPredicates() {
+ const auto NewEnd =
+ std::stable_partition(Predicates.begin(), Predicates.end(),
+ std::logical_not<std::unique_ptr<PredicateTy>>());
+ if (NewEnd != Predicates.begin()) {
+ Predicates.erase(Predicates.begin(), NewEnd);
+ Optimized = true;
+ }
+ }
+
  /// Emit MatchTable opcodes that test whether all the predicates are met.
template <class... Args>
- void emitPredicateListOpcodes(MatchTable &Table, Args &&... args) const {
- if (Predicates.empty()) {
+ void emitPredicateListOpcodes(MatchTable &Table, Args &&... args) {
+ if (Predicates.empty() && !Optimized) {
Table << MatchTable::Comment(getNoPredicateComment())
<< MatchTable::LineBreak;
return;
}
- unsigned OpIdx = (*predicates_begin())->getOpIdx();
- (void)OpIdx;
- for (const auto &Predicate : predicates()) {
- assert(Predicate->getOpIdx() == OpIdx &&
- "Checks touch different operands?");
+ for (const auto &Predicate : predicates())
Predicate->emitPredicateOpcodes(Table, std::forward<Args>(args)...);
- }
}
};
@@ -846,8 +1014,12 @@ public:
/// are currently not compared between each other.
enum PredicateKind {
IPM_Opcode,
+ IPM_NumOperands,
IPM_ImmPredicate,
IPM_AtomicOrderingMMO,
+ IPM_MemoryLLTSize,
+ IPM_MemoryVsLLTSize,
+ IPM_GenericPredicate,
OPM_SameOperand,
OPM_ComplexPattern,
OPM_IntrinsicID,
@@ -869,7 +1041,9 @@ public:
PredicateMatcher(PredicateKind Kind, unsigned InsnVarID, unsigned OpIdx = ~0)
: Kind(Kind), InsnVarID(InsnVarID), OpIdx(OpIdx) {}
+ unsigned getInsnVarID() const { return InsnVarID; }
unsigned getOpIdx() const { return OpIdx; }
+
virtual ~PredicateMatcher() = default;
/// Emit MatchTable opcodes that check the predicate for the given operand.
virtual void emitPredicateOpcodes(MatchTable &Table,
@@ -878,16 +1052,23 @@ public:
PredicateKind getKind() const { return Kind; }
virtual bool isIdentical(const PredicateMatcher &B) const {
- if (InsnVarID != 0 || OpIdx != (unsigned)~0) {
- // We currently don't hoist the record of instruction properly.
- // Therefore we can only work on the orig instruction (InsnVarID
- // == 0).
- DEBUG(dbgs() << "Non-zero instr ID not supported yet\n");
- return false;
- }
return B.getKind() == getKind() && InsnVarID == B.InsnVarID &&
OpIdx == B.OpIdx;
}
+
+ virtual bool isIdenticalDownToValue(const PredicateMatcher &B) const {
+ return hasValue() && PredicateMatcher::isIdentical(B);
+ }
+
+ virtual MatchTableRecord getValue() const {
+ assert(hasValue() && "Can not get a value of a value-less predicate!");
+ llvm_unreachable("Not implemented yet");
+ }
+ virtual bool hasValue() const { return false; }
+
+ /// Report the maximum number of temporary operands needed by the predicate
+ /// matcher.
+ virtual unsigned countRendererFns() const { return 0; }
};
/// Generates code to check a predicate of an operand.
@@ -903,20 +1084,10 @@ public:
: PredicateMatcher(Kind, InsnVarID, OpIdx) {}
virtual ~OperandPredicateMatcher() {}
- /// Emit MatchTable opcodes to capture instructions into the MIs table.
- ///
- /// Only InstructionOperandMatcher needs to do anything for this method the
- /// rest just walk the tree.
- virtual void emitCaptureOpcodes(MatchTable &Table, RuleMatcher &Rule) const {}
-
/// Compare the priority of this object and B.
///
/// Returns true if this object is more important than B.
virtual bool isHigherPriorityThan(const OperandPredicateMatcher &B) const;
-
- /// Report the maximum number of temporary operands needed by the predicate
- /// matcher.
- virtual unsigned countRendererFns() const { return 0; }
};
template <>
@@ -935,12 +1106,17 @@ public:
: OperandPredicateMatcher(OPM_SameOperand, InsnVarID, OpIdx),
MatchingName(MatchingName) {}
- static bool classof(const OperandPredicateMatcher *P) {
+ static bool classof(const PredicateMatcher *P) {
return P->getKind() == OPM_SameOperand;
}
void emitPredicateOpcodes(MatchTable &Table,
RuleMatcher &Rule) const override;
+
+ bool isIdentical(const PredicateMatcher &B) const override {
+ return OperandPredicateMatcher::isIdentical(B) &&
+ MatchingName == cast<SameOperandMatcher>(&B)->MatchingName;
+ }
};
/// Generates code to check that an operand is a particular LLT.
@@ -949,7 +1125,15 @@ protected:
LLTCodeGen Ty;
public:
- static std::set<LLTCodeGen> KnownTypes;
+ static std::map<LLTCodeGen, unsigned> TypeIDValues;
+
+ static void initTypeIDValuesMap() {
+ TypeIDValues.clear();
+
+ unsigned ID = 0;
+ for (const LLTCodeGen LLTy : KnownTypes)
+ TypeIDValues[LLTy] = ID++;
+ }
LLTOperandMatcher(unsigned InsnVarID, unsigned OpIdx, const LLTCodeGen &Ty)
: OperandPredicateMatcher(OPM_LLT, InsnVarID, OpIdx), Ty(Ty) {
@@ -963,18 +1147,30 @@ public:
return OperandPredicateMatcher::isIdentical(B) &&
Ty == cast<LLTOperandMatcher>(&B)->Ty;
}
+ MatchTableRecord getValue() const override {
+ const auto VI = TypeIDValues.find(Ty);
+ if (VI == TypeIDValues.end())
+ return MatchTable::NamedValue(getTy().getCxxEnumValue());
+ return MatchTable::NamedValue(getTy().getCxxEnumValue(), VI->second);
+ }
+ bool hasValue() const override {
+ if (TypeIDValues.size() != KnownTypes.size())
+ initTypeIDValuesMap();
+ return TypeIDValues.count(Ty);
+ }
+
+ LLTCodeGen getTy() const { return Ty; }
void emitPredicateOpcodes(MatchTable &Table,
RuleMatcher &Rule) const override {
Table << MatchTable::Opcode("GIM_CheckType") << MatchTable::Comment("MI")
<< MatchTable::IntValue(InsnVarID) << MatchTable::Comment("Op")
<< MatchTable::IntValue(OpIdx) << MatchTable::Comment("Type")
- << MatchTable::NamedValue(Ty.getCxxEnumValue())
- << MatchTable::LineBreak;
+ << getValue() << MatchTable::LineBreak;
}
};
-std::set<LLTCodeGen> LLTOperandMatcher::KnownTypes;
+std::map<LLTCodeGen, unsigned> LLTOperandMatcher::TypeIDValues;
/// Generates code to check that an operand is a pointer to any address space.
///
@@ -1207,7 +1403,18 @@ public:
assert(SymbolicName.empty() && "Operand already has a symbolic name");
SymbolicName = Name;
}
- unsigned getOperandIndex() const { return OpIdx; }
+
+ /// Construct a new operand predicate and add it to the matcher.
+ template <class Kind, class... Args>
+ Optional<Kind *> addPredicate(Args &&... args) {
+ if (isSameAsAnotherOperand())
+ return None;
+ Predicates.emplace_back(llvm::make_unique<Kind>(
+ getInsnVarID(), getOpIdx(), std::forward<Args>(args)...));
+ return static_cast<Kind *>(Predicates.back().get());
+ }
+
+ unsigned getOpIdx() const { return OpIdx; }
unsigned getInsnVarID() const;
std::string getOperandExpr(unsigned InsnVarID) const {
@@ -1220,23 +1427,19 @@ public:
Error addTypeCheckPredicate(const TypeSetByHwMode &VTy,
bool OperandIsAPointer);
- /// Emit MatchTable opcodes to capture instructions into the MIs table.
- void emitCaptureOpcodes(MatchTable &Table, RuleMatcher &Rule) const {
- for (const auto &Predicate : predicates())
- Predicate->emitCaptureOpcodes(Table, Rule);
- }
-
/// Emit MatchTable opcodes that test whether the instruction named in
/// InsnVarID matches all the predicates and all the operands.
- void emitPredicateOpcodes(MatchTable &Table, RuleMatcher &Rule) const {
- std::string Comment;
- raw_string_ostream CommentOS(Comment);
- CommentOS << "MIs[" << getInsnVarID() << "] ";
- if (SymbolicName.empty())
- CommentOS << "Operand " << OpIdx;
- else
- CommentOS << SymbolicName;
- Table << MatchTable::Comment(CommentOS.str()) << MatchTable::LineBreak;
+ void emitPredicateOpcodes(MatchTable &Table, RuleMatcher &Rule) {
+ if (!Optimized) {
+ std::string Comment;
+ raw_string_ostream CommentOS(Comment);
+ CommentOS << "MIs[" << getInsnVarID() << "] ";
+ if (SymbolicName.empty())
+ CommentOS << "Operand " << OpIdx;
+ else
+ CommentOS << SymbolicName;
+ Table << MatchTable::Comment(CommentOS.str()) << MatchTable::LineBreak;
+ }
emitPredicateListOpcodes(Table, Rule);
}
@@ -1244,7 +1447,7 @@ public:
/// Compare the priority of this object and B.
///
/// Returns true if this object is more important than B.
- bool isHigherPriorityThan(const OperandMatcher &B) const {
+ bool isHigherPriorityThan(OperandMatcher &B) {
// Operand matchers involving more predicates have higher priority.
if (predicates_size() > B.predicates_size())
return true;
@@ -1252,7 +1455,7 @@ public:
return false;
// This assumes that predicates are added in a consistent order.
- for (const auto &Predicate : zip(predicates(), B.predicates())) {
+ for (auto &&Predicate : zip(predicates(), B.predicates())) {
if (std::get<0>(Predicate)->isHigherPriorityThan(*std::get<1>(Predicate)))
return true;
if (std::get<1>(Predicate)->isHigherPriorityThan(*std::get<0>(Predicate)))
@@ -1264,7 +1467,7 @@ public:
/// Report the maximum number of temporary operands needed by the operand
/// matcher.
- unsigned countRendererFns() const {
+ unsigned countRendererFns() {
return std::accumulate(
predicates().begin(), predicates().end(), 0,
[](unsigned A,
@@ -1277,7 +1480,7 @@ public:
return AllocatedTemporariesBaseID;
}
- bool isSameAsAnotherOperand() const {
+ bool isSameAsAnotherOperand() {
for (const auto &Predicate : predicates())
if (isa<SameOperandMatcher>(Predicate))
return true;
@@ -1285,21 +1488,6 @@ public:
}
};
-// Specialize OperandMatcher::addPredicate() to refrain from adding redundant
-// predicates.
-template <>
-template <class Kind, class... Args>
-Optional<Kind *>
-PredicateListMatcher<OperandPredicateMatcher>::addPredicate(Args &&... args) {
- auto *OpMatcher = static_cast<OperandMatcher *>(this);
- if (static_cast<OperandMatcher *>(this)->isSameAsAnotherOperand())
- return None;
- Predicates.emplace_back(llvm::make_unique<Kind>(OpMatcher->getInsnVarID(),
- OpMatcher->getOperandIndex(),
- std::forward<Args>(args)...));
- return static_cast<Kind *>(Predicates.back().get());
-}
-
Error OperandMatcher::addTypeCheckPredicate(const TypeSetByHwMode &VTy,
bool OperandIsAPointer) {
if (!VTy.isMachineValueType())
@@ -1343,15 +1531,11 @@ public:
isHigherPriorityThan(const InstructionPredicateMatcher &B) const {
return Kind < B.Kind;
};
-
- /// Report the maximum number of temporary operands needed by the predicate
- /// matcher.
- virtual unsigned countRendererFns() const { return 0; }
};
template <>
std::string
-PredicateListMatcher<InstructionPredicateMatcher>::getNoPredicateComment() const {
+PredicateListMatcher<PredicateMatcher>::getNoPredicateComment() const {
return "No instruction predicates";
}
@@ -1360,7 +1544,17 @@ class InstructionOpcodeMatcher : public InstructionPredicateMatcher {
protected:
const CodeGenInstruction *I;
+ static DenseMap<const CodeGenInstruction *, unsigned> OpcodeValues;
+
public:
+ static void initOpcodeValuesMap(const CodeGenTarget &Target) {
+ OpcodeValues.clear();
+
+ unsigned OpcodeValue = 0;
+ for (const CodeGenInstruction *I : Target.getInstructionsByEnumValue())
+ OpcodeValues[I] = OpcodeValue++;
+ }
+
InstructionOpcodeMatcher(unsigned InsnVarID, const CodeGenInstruction *I)
: InstructionPredicateMatcher(IPM_Opcode, InsnVarID), I(I) {}
@@ -1372,12 +1566,19 @@ public:
return InstructionPredicateMatcher::isIdentical(B) &&
I == cast<InstructionOpcodeMatcher>(&B)->I;
}
+ MatchTableRecord getValue() const override {
+ const auto VI = OpcodeValues.find(I);
+ if (VI != OpcodeValues.end())
+ return MatchTable::NamedValue(I->Namespace, I->TheDef->getName(),
+ VI->second);
+ return MatchTable::NamedValue(I->Namespace, I->TheDef->getName());
+ }
+ bool hasValue() const override { return OpcodeValues.count(I); }
void emitPredicateOpcodes(MatchTable &Table,
RuleMatcher &Rule) const override {
Table << MatchTable::Opcode("GIM_CheckOpcode") << MatchTable::Comment("MI")
- << MatchTable::IntValue(InsnVarID)
- << MatchTable::NamedValue(I->Namespace, I->TheDef->getName())
+ << MatchTable::IntValue(InsnVarID) << getValue()
<< MatchTable::LineBreak;
}
@@ -1404,6 +1605,42 @@ public:
bool isConstantInstruction() const {
return I->TheDef->getName() == "G_CONSTANT";
}
+
+ StringRef getOpcode() const { return I->TheDef->getName(); }
+ unsigned getNumOperands() const { return I->Operands.size(); }
+
+ StringRef getOperandType(unsigned OpIdx) const {
+ return I->Operands[OpIdx].OperandType;
+ }
+};
+
+DenseMap<const CodeGenInstruction *, unsigned>
+ InstructionOpcodeMatcher::OpcodeValues;
+
+class InstructionNumOperandsMatcher final : public InstructionPredicateMatcher {
+ unsigned NumOperands = 0;
+
+public:
+ InstructionNumOperandsMatcher(unsigned InsnVarID, unsigned NumOperands)
+ : InstructionPredicateMatcher(IPM_NumOperands, InsnVarID),
+ NumOperands(NumOperands) {}
+
+ static bool classof(const PredicateMatcher *P) {
+ return P->getKind() == IPM_NumOperands;
+ }
+
+ bool isIdentical(const PredicateMatcher &B) const override {
+ return InstructionPredicateMatcher::isIdentical(B) &&
+ NumOperands == cast<InstructionNumOperandsMatcher>(&B)->NumOperands;
+ }
+
+ void emitPredicateOpcodes(MatchTable &Table,
+ RuleMatcher &Rule) const override {
+ Table << MatchTable::Opcode("GIM_CheckNumOperands")
+ << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID)
+ << MatchTable::Comment("Expected")
+ << MatchTable::IntValue(NumOperands) << MatchTable::LineBreak;
+ }
};
/// Generates code to check that this instruction is a constant whose value
@@ -1483,10 +1720,17 @@ public:
: InstructionPredicateMatcher(IPM_AtomicOrderingMMO, InsnVarID),
Order(Order), Comparator(Comparator) {}
- static bool classof(const InstructionPredicateMatcher *P) {
+ static bool classof(const PredicateMatcher *P) {
return P->getKind() == IPM_AtomicOrderingMMO;
}
+ bool isIdentical(const PredicateMatcher &B) const override {
+ if (!InstructionPredicateMatcher::isIdentical(B))
+ return false;
+ const auto &R = *cast<AtomicOrderingMMOPredicateMatcher>(&B);
+ return Order == R.Order && Comparator == R.Comparator;
+ }
+
void emitPredicateOpcodes(MatchTable &Table,
RuleMatcher &Rule) const override {
StringRef Opcode = "GIM_CheckAtomicOrdering";
@@ -1503,14 +1747,113 @@ public:
}
};
+/// Generates code to check that the size of an MMO is exactly N bytes.
+class MemorySizePredicateMatcher : public InstructionPredicateMatcher {
+protected:
+ unsigned MMOIdx;
+ uint64_t Size;
+
+public:
+ MemorySizePredicateMatcher(unsigned InsnVarID, unsigned MMOIdx, unsigned Size)
+ : InstructionPredicateMatcher(IPM_MemoryLLTSize, InsnVarID),
+ MMOIdx(MMOIdx), Size(Size) {}
+
+ static bool classof(const PredicateMatcher *P) {
+ return P->getKind() == IPM_MemoryLLTSize;
+ }
+ bool isIdentical(const PredicateMatcher &B) const override {
+ return InstructionPredicateMatcher::isIdentical(B) &&
+ MMOIdx == cast<MemorySizePredicateMatcher>(&B)->MMOIdx &&
+ Size == cast<MemorySizePredicateMatcher>(&B)->Size;
+ }
+
+ void emitPredicateOpcodes(MatchTable &Table,
+ RuleMatcher &Rule) const override {
+ Table << MatchTable::Opcode("GIM_CheckMemorySizeEqualTo")
+ << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID)
+ << MatchTable::Comment("MMO") << MatchTable::IntValue(MMOIdx)
+ << MatchTable::Comment("Size") << MatchTable::IntValue(Size)
+ << MatchTable::LineBreak;
+ }
+};
+
+/// Generates code to check that the size of an MMO is less-than, equal-to, or
+/// greater than a given LLT.
+class MemoryVsLLTSizePredicateMatcher : public InstructionPredicateMatcher {
+public:
+ enum RelationKind {
+ GreaterThan,
+ EqualTo,
+ LessThan,
+ };
+
+protected:
+ unsigned MMOIdx;
+ RelationKind Relation;
+ unsigned OpIdx;
+
+public:
+ MemoryVsLLTSizePredicateMatcher(unsigned InsnVarID, unsigned MMOIdx,
+ enum RelationKind Relation,
+ unsigned OpIdx)
+ : InstructionPredicateMatcher(IPM_MemoryVsLLTSize, InsnVarID),
+ MMOIdx(MMOIdx), Relation(Relation), OpIdx(OpIdx) {}
+
+ static bool classof(const PredicateMatcher *P) {
+ return P->getKind() == IPM_MemoryVsLLTSize;
+ }
+ bool isIdentical(const PredicateMatcher &B) const override {
+ return InstructionPredicateMatcher::isIdentical(B) &&
+ MMOIdx == cast<MemoryVsLLTSizePredicateMatcher>(&B)->MMOIdx &&
+ Relation == cast<MemoryVsLLTSizePredicateMatcher>(&B)->Relation &&
+ OpIdx == cast<MemoryVsLLTSizePredicateMatcher>(&B)->OpIdx;
+ }
+
+ void emitPredicateOpcodes(MatchTable &Table,
+ RuleMatcher &Rule) const override {
+ Table << MatchTable::Opcode(Relation == EqualTo
+ ? "GIM_CheckMemorySizeEqualToLLT"
+ : Relation == GreaterThan
+ ? "GIM_CheckMemorySizeGreaterThanLLT"
+ : "GIM_CheckMemorySizeLessThanLLT")
+ << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID)
+ << MatchTable::Comment("MMO") << MatchTable::IntValue(MMOIdx)
+ << MatchTable::Comment("OpIdx") << MatchTable::IntValue(OpIdx)
+ << MatchTable::LineBreak;
+ }
+};
+
+/// Generates code to check an arbitrary C++ instruction predicate.
+class GenericInstructionPredicateMatcher : public InstructionPredicateMatcher {
+protected:
+ TreePredicateFn Predicate;
+
+public:
+ GenericInstructionPredicateMatcher(unsigned InsnVarID,
+ TreePredicateFn Predicate)
+ : InstructionPredicateMatcher(IPM_GenericPredicate, InsnVarID),
+ Predicate(Predicate) {}
+
+ static bool classof(const InstructionPredicateMatcher *P) {
+ return P->getKind() == IPM_GenericPredicate;
+ }
+ void emitPredicateOpcodes(MatchTable &Table,
+ RuleMatcher &Rule) const override {
+ Table << MatchTable::Opcode("GIM_CheckCxxInsnPredicate")
+ << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID)
+ << MatchTable::Comment("FnId")
+ << MatchTable::NamedValue(getEnumNameForPredicate(Predicate))
+ << MatchTable::LineBreak;
+ }
+};
+
/// Generates code to check that a set of predicates and operands match for a
/// particular instruction.
///
/// Typical predicates include:
/// * Has a specific opcode.
/// * Has an nsw/nuw flag or doesn't.
-class InstructionMatcher
- : public PredicateListMatcher<InstructionPredicateMatcher> {
+class InstructionMatcher final : public PredicateListMatcher<PredicateMatcher> {
protected:
typedef std::vector<std::unique_ptr<OperandMatcher>> OperandVec;
@@ -1519,6 +1862,7 @@ protected:
/// The operands to match. All rendered operands must be present even if the
/// condition is always true.
OperandVec Operands;
+ bool NumOperandsCheck = true;
std::string SymbolicName;
unsigned InsnVarID;
@@ -1531,9 +1875,17 @@ public:
InsnVarID = Rule.implicitlyDefineInsnVar(*this);
}
+ /// Construct a new instruction predicate and add it to the matcher.
+ template <class Kind, class... Args>
+ Optional<Kind *> addPredicate(Args &&... args) {
+ Predicates.emplace_back(
+ llvm::make_unique<Kind>(getInsnVarID(), std::forward<Args>(args)...));
+ return static_cast<Kind *>(Predicates.back().get());
+ }
+
RuleMatcher &getRuleMatcher() const { return Rule; }
- unsigned getVarID() const { return InsnVarID; }
+ unsigned getInsnVarID() const { return InsnVarID; }
/// Add an operand to the matcher.
OperandMatcher &addOperand(unsigned OpIdx, const std::string &SymbolicName,
@@ -1549,7 +1901,7 @@ public:
OperandMatcher &getOperand(unsigned OpIdx) {
auto I = std::find_if(Operands.begin(), Operands.end(),
[&OpIdx](const std::unique_ptr<OperandMatcher> &X) {
- return X->getOperandIndex() == OpIdx;
+ return X->getOpIdx() == OpIdx;
});
if (I != Operands.end())
return **I;
@@ -1572,21 +1924,17 @@ public:
void pop_front() { Operands.erase(Operands.begin()); }
- /// Emit MatchTable opcodes to check the shape of the match and capture
- /// instructions into the MIs table.
- void emitCaptureOpcodes(MatchTable &Table, RuleMatcher &Rule) {
- Table << MatchTable::Opcode("GIM_CheckNumOperands")
- << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID)
- << MatchTable::Comment("Expected")
- << MatchTable::IntValue(getNumOperands()) << MatchTable::LineBreak;
- for (const auto &Operand : Operands)
- Operand->emitCaptureOpcodes(Table, Rule);
- }
+ void optimize();
/// Emit MatchTable opcodes that test whether the instruction named in
/// InsnVarName matches all the predicates and all the operands.
- void emitPredicateOpcodes(MatchTable &Table, RuleMatcher &Rule) const {
+ void emitPredicateOpcodes(MatchTable &Table, RuleMatcher &Rule) {
+ if (NumOperandsCheck)
+ InstructionNumOperandsMatcher(InsnVarID, getNumOperands())
+ .emitPredicateOpcodes(Table, Rule);
+
emitPredicateListOpcodes(Table, Rule);
+
for (const auto &Operand : Operands)
Operand->emitPredicateOpcodes(Table, Rule);
}
@@ -1594,17 +1942,19 @@ public:
/// Compare the priority of this object and B.
///
/// Returns true if this object is more important than B.
- bool isHigherPriorityThan(const InstructionMatcher &B) const {
+ bool isHigherPriorityThan(InstructionMatcher &B) {
// Instruction matchers involving more operands have higher priority.
if (Operands.size() > B.Operands.size())
return true;
if (Operands.size() < B.Operands.size())
return false;
- for (const auto &Predicate : zip(predicates(), B.predicates())) {
- if (std::get<0>(Predicate)->isHigherPriorityThan(*std::get<1>(Predicate)))
+ for (auto &&P : zip(predicates(), B.predicates())) {
+ auto L = static_cast<InstructionPredicateMatcher *>(std::get<0>(P).get());
+ auto R = static_cast<InstructionPredicateMatcher *>(std::get<1>(P).get());
+ if (L->isHigherPriorityThan(*R))
return true;
- if (std::get<1>(Predicate)->isHigherPriorityThan(*std::get<0>(Predicate)))
+ if (R->isHigherPriorityThan(*L))
return false;
}
@@ -1620,13 +1970,13 @@ public:
/// Report the maximum number of temporary operands needed by the instruction
/// matcher.
- unsigned countRendererFns() const {
- return std::accumulate(predicates().begin(), predicates().end(), 0,
- [](unsigned A,
- const std::unique_ptr<InstructionPredicateMatcher>
- &Predicate) {
- return A + Predicate->countRendererFns();
- }) +
+ unsigned countRendererFns() {
+ return std::accumulate(
+ predicates().begin(), predicates().end(), 0,
+ [](unsigned A,
+ const std::unique_ptr<PredicateMatcher> &Predicate) {
+ return A + Predicate->countRendererFns();
+ }) +
std::accumulate(
Operands.begin(), Operands.end(), 0,
[](unsigned A, const std::unique_ptr<OperandMatcher> &Operand) {
@@ -1634,24 +1984,36 @@ public:
});
}
- bool isConstantInstruction() const {
- for (const auto &P : predicates())
- if (const InstructionOpcodeMatcher *Opcode =
- dyn_cast<InstructionOpcodeMatcher>(P.get()))
- return Opcode->isConstantInstruction();
- return false;
+ InstructionOpcodeMatcher &getOpcodeMatcher() {
+ for (auto &P : predicates())
+ if (auto *OpMatcher = dyn_cast<InstructionOpcodeMatcher>(P.get()))
+ return *OpMatcher;
+ llvm_unreachable("Didn't find an opcode matcher");
+ }
+
+ bool isConstantInstruction() {
+ return getOpcodeMatcher().isConstantInstruction();
}
+
+ StringRef getOpcode() { return getOpcodeMatcher().getOpcode(); }
};
-template <>
-template <class Kind, class... Args>
-Optional<Kind *>
-PredicateListMatcher<InstructionPredicateMatcher>::addPredicate(
- Args &&... args) {
- InstructionMatcher *InstMatcher = static_cast<InstructionMatcher *>(this);
- Predicates.emplace_back(llvm::make_unique<Kind>(InstMatcher->getVarID(),
- std::forward<Args>(args)...));
- return static_cast<Kind *>(Predicates.back().get());
+StringRef RuleMatcher::getOpcode() const {
+ return Matchers.front()->getOpcode();
+}
+
+unsigned RuleMatcher::getNumOperands() const {
+ return Matchers.front()->getNumOperands();
+}
+
+LLTCodeGen RuleMatcher::getFirstConditionAsRootType() {
+ InstructionMatcher &InsnMatcher = *Matchers.front();
+ if (!InsnMatcher.predicates_empty())
+ if (const auto *TM =
+ dyn_cast<LLTOperandMatcher>(&**InsnMatcher.predicates_begin()))
+ if (TM->getInsnVarID() == 0 && TM->getOpIdx() == 0)
+ return TM->getTy();
+ return {};
}
/// Generates code to check that the operand is a register defined by an
@@ -1679,21 +2041,73 @@ public:
InstructionMatcher &getInsnMatcher() const { return *InsnMatcher; }
- void emitCaptureOpcodes(MatchTable &Table, RuleMatcher &Rule) const override {
- unsigned InsnID =
- Rule.defineInsnVar(Table, *InsnMatcher, InsnVarID, getOpIdx());
- (void)InsnID;
- assert(InsnMatcher->getVarID() == InsnID &&
- "Mismatch between build and emit");
- InsnMatcher->emitCaptureOpcodes(Table, Rule);
+ void emitCaptureOpcodes(MatchTable &Table, RuleMatcher &Rule) const {
+ const unsigned NewInsnVarID = InsnMatcher->getInsnVarID();
+ Table << MatchTable::Opcode("GIM_RecordInsn")
+ << MatchTable::Comment("DefineMI")
+ << MatchTable::IntValue(NewInsnVarID) << MatchTable::Comment("MI")
+ << MatchTable::IntValue(getInsnVarID())
+ << MatchTable::Comment("OpIdx") << MatchTable::IntValue(getOpIdx())
+ << MatchTable::Comment("MIs[" + llvm::to_string(NewInsnVarID) + "]")
+ << MatchTable::LineBreak;
}
void emitPredicateOpcodes(MatchTable &Table,
RuleMatcher &Rule) const override {
+ emitCaptureOpcodes(Table, Rule);
InsnMatcher->emitPredicateOpcodes(Table, Rule);
}
+
+ bool isHigherPriorityThan(const OperandPredicateMatcher &B) const override {
+ if (OperandPredicateMatcher::isHigherPriorityThan(B))
+ return true;
+ if (B.OperandPredicateMatcher::isHigherPriorityThan(*this))
+ return false;
+
+ if (const InstructionOperandMatcher *BP =
+ dyn_cast<InstructionOperandMatcher>(&B))
+ if (InsnMatcher->isHigherPriorityThan(*BP->InsnMatcher))
+ return true;
+ return false;
+ }
};
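+/// Reorder and rewrite this matcher's predicate list: the opcode check, an
+/// explicit number-of-operands check (when the opcode alone does not imply the
+/// operand count), intrinsic ID checks and LLT checks are hoisted to the front
+/// so that GroupMatcher and SwitchMatcher above can find shareable first
+/// conditions, and the predicates on the def operand of nested instructions
+/// are dropped.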
+void InstructionMatcher::optimize() {
+ SmallVector<std::unique_ptr<PredicateMatcher>, 8> Stash;
+ const auto &OpcMatcher = getOpcodeMatcher();
+
+ Stash.push_back(predicates_pop_front());
+ if (Stash.back().get() == &OpcMatcher) {
+ if (NumOperandsCheck && OpcMatcher.getNumOperands() < getNumOperands())
+ Stash.emplace_back(
+ new InstructionNumOperandsMatcher(InsnVarID, getNumOperands()));
+ NumOperandsCheck = false;
+
+ for (auto &OM : Operands)
+ for (auto &OP : OM->predicates())
+ if (isa<IntrinsicIDOperandMatcher>(OP)) {
+ Stash.push_back(std::move(OP));
+ OM->eraseNullPredicates();
+ break;
+ }
+ }
+
+ if (InsnVarID > 0) {
+ assert(!Operands.empty() && "Nested instruction is expected to def a vreg");
+ for (auto &OP : Operands[0]->predicates())
+ OP.reset();
+ Operands[0]->eraseNullPredicates();
+ }
+ for (auto &OM : Operands) {
+ for (auto &OP : OM->predicates())
+ if (isa<LLTOperandMatcher>(OP))
+ Stash.push_back(std::move(OP));
+ OM->eraseNullPredicates();
+ }
+ while (!Stash.empty())
+ prependPredicate(Stash.pop_back_val());
+}
+
//===- Actions ------------------------------------------------------------===//
class OperandRenderer {
public:
@@ -1706,7 +2120,8 @@ public:
OR_Imm,
OR_Register,
OR_TempRegister,
- OR_ComplexPattern
+ OR_ComplexPattern,
+ OR_Custom
};
protected:
@@ -1749,7 +2164,7 @@ public:
Table << MatchTable::Opcode("GIR_Copy") << MatchTable::Comment("NewInsnID")
<< MatchTable::IntValue(NewInsnID) << MatchTable::Comment("OldInsnID")
<< MatchTable::IntValue(OldInsnVarID) << MatchTable::Comment("OpIdx")
- << MatchTable::IntValue(Operand.getOperandIndex())
+ << MatchTable::IntValue(Operand.getOpIdx())
<< MatchTable::Comment(SymbolicName) << MatchTable::LineBreak;
}
};
@@ -1785,7 +2200,7 @@ public:
<< MatchTable::Comment("NewInsnID") << MatchTable::IntValue(NewInsnID)
<< MatchTable::Comment("OldInsnID")
<< MatchTable::IntValue(OldInsnVarID) << MatchTable::Comment("OpIdx")
- << MatchTable::IntValue(Operand.getOperandIndex())
+ << MatchTable::IntValue(Operand.getOpIdx())
<< MatchTable::NamedValue(
(ZeroRegisterDef->getValue("Namespace")
? ZeroRegisterDef->getValueAsString("Namespace")
@@ -1816,7 +2231,7 @@ public:
const StringRef getSymbolicName() const { return SymbolicName; }
void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override {
- const InstructionMatcher &InsnMatcher = Rule.getInstructionMatcher(SymbolicName);
+ InstructionMatcher &InsnMatcher = Rule.getInstructionMatcher(SymbolicName);
unsigned OldInsnVarID = Rule.getInsnVarID(InsnMatcher);
Table << MatchTable::Opcode(Signed ? "GIR_CopyConstantAsSImm"
: "GIR_CopyConstantAsUImm")
@@ -1847,7 +2262,7 @@ public:
const StringRef getSymbolicName() const { return SymbolicName; }
void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override {
- const InstructionMatcher &InsnMatcher = Rule.getInstructionMatcher(SymbolicName);
+ InstructionMatcher &InsnMatcher = Rule.getInstructionMatcher(SymbolicName);
unsigned OldInsnVarID = Rule.getInsnVarID(InsnMatcher);
Table << MatchTable::Opcode("GIR_CopyFConstantAsFPImm")
<< MatchTable::Comment("NewInsnID") << MatchTable::IntValue(NewInsnID)
@@ -1887,7 +2302,7 @@ public:
<< MatchTable::Comment("NewInsnID") << MatchTable::IntValue(NewInsnID)
<< MatchTable::Comment("OldInsnID")
<< MatchTable::IntValue(OldInsnVarID) << MatchTable::Comment("OpIdx")
- << MatchTable::IntValue(Operand.getOperandIndex())
+ << MatchTable::IntValue(Operand.getOpIdx())
<< MatchTable::Comment("SubRegIdx")
<< MatchTable::IntValue(SubReg->EnumValue)
<< MatchTable::Comment(SymbolicName) << MatchTable::LineBreak;
@@ -2018,6 +2433,37 @@ public:
}
};
+class CustomRenderer : public OperandRenderer {
+protected:
+ unsigned InsnID;
+ const Record &Renderer;
+ /// The name of the operand.
+ const std::string SymbolicName;
+
+public:
+ CustomRenderer(unsigned InsnID, const Record &Renderer,
+ StringRef SymbolicName)
+ : OperandRenderer(OR_Custom), InsnID(InsnID), Renderer(Renderer),
+ SymbolicName(SymbolicName) {}
+
+ static bool classof(const OperandRenderer *R) {
+ return R->getKind() == OR_Custom;
+ }
+
+ void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override {
+ InstructionMatcher &InsnMatcher = Rule.getInstructionMatcher(SymbolicName);
+ unsigned OldInsnVarID = Rule.getInsnVarID(InsnMatcher);
+ Table << MatchTable::Opcode("GIR_CustomRenderer")
+ << MatchTable::Comment("InsnID") << MatchTable::IntValue(InsnID)
+ << MatchTable::Comment("OldInsnID")
+ << MatchTable::IntValue(OldInsnVarID)
+ << MatchTable::Comment("Renderer")
+ << MatchTable::NamedValue(
+ "GICR_" + Renderer.getValueAsString("RendererFn").str())
+ << MatchTable::Comment(SymbolicName) << MatchTable::LineBreak;
+ }
+};
+
/// An action taken when all Matcher predicates succeeded for a parent rule.
///
/// Typical actions include:
@@ -2051,7 +2497,7 @@ class BuildMIAction : public MatchAction {
private:
unsigned InsnID;
const CodeGenInstruction *I;
- const InstructionMatcher *Matched;
+ InstructionMatcher *Matched;
std::vector<std::unique_ptr<OperandRenderer>> OperandRenderers;
/// True if the instruction can be built solely by mutating the opcode.
@@ -2066,7 +2512,7 @@ private:
if (const auto *Copy = dyn_cast<CopyRenderer>(&*Renderer.value())) {
const OperandMatcher &OM = Rule.getOperandMatcher(Copy->getSymbolicName());
if (Insn != &OM.getInstructionMatcher() ||
- OM.getOperandIndex() != Renderer.index())
+ OM.getOpIdx() != Renderer.index())
return false;
} else
return false;
@@ -2079,10 +2525,11 @@ public:
BuildMIAction(unsigned InsnID, const CodeGenInstruction *I)
: InsnID(InsnID), I(I), Matched(nullptr) {}
+ unsigned getInsnID() const { return InsnID; }
const CodeGenInstruction *getCGI() const { return I; }
void chooseInsnToMutate(RuleMatcher &Rule) {
- for (const auto *MutateCandidate : Rule.mutatable_insns()) {
+ for (auto *MutateCandidate : Rule.mutatable_insns()) {
if (canMutate(Rule, MutateCandidate)) {
// Take the first one we're offered that we're able to mutate.
Rule.reserveInsnMatcherForMutation(MutateCandidate);
@@ -2160,7 +2607,7 @@ public:
std::vector<unsigned> MergeInsnIDs;
for (const auto &IDMatcherPair : Rule.defined_insn_vars())
MergeInsnIDs.push_back(IDMatcherPair.second);
- std::sort(MergeInsnIDs.begin(), MergeInsnIDs.end());
+ llvm::sort(MergeInsnIDs.begin(), MergeInsnIDs.end());
for (const auto &MergeInsnID : MergeInsnIDs)
Table << MatchTable::IntValue(MergeInsnID);
Table << MatchTable::NamedValue("GIU_MergeMemOperands_EndOfList")
@@ -2274,27 +2721,13 @@ action_iterator RuleMatcher::insertAction(action_iterator InsertPt,
llvm::make_unique<Kind>(std::forward<Args>(args)...));
}
-unsigned
-RuleMatcher::implicitlyDefineInsnVar(const InstructionMatcher &Matcher) {
+unsigned RuleMatcher::implicitlyDefineInsnVar(InstructionMatcher &Matcher) {
unsigned NewInsnVarID = NextInsnVarID++;
InsnVariableIDs[&Matcher] = NewInsnVarID;
return NewInsnVarID;
}
-unsigned RuleMatcher::defineInsnVar(MatchTable &Table,
- const InstructionMatcher &Matcher,
- unsigned InsnID, unsigned OpIdx) {
- unsigned NewInsnVarID = implicitlyDefineInsnVar(Matcher);
- Table << MatchTable::Opcode("GIM_RecordInsn")
- << MatchTable::Comment("DefineMI") << MatchTable::IntValue(NewInsnVarID)
- << MatchTable::Comment("MI") << MatchTable::IntValue(InsnID)
- << MatchTable::Comment("OpIdx") << MatchTable::IntValue(OpIdx)
- << MatchTable::Comment("MIs[" + llvm::to_string(NewInsnVarID) + "]")
- << MatchTable::LineBreak;
- return NewInsnVarID;
-}
-
-unsigned RuleMatcher::getInsnVarID(const InstructionMatcher &InsnMatcher) const {
+unsigned RuleMatcher::getInsnVarID(InstructionMatcher &InsnMatcher) const {
const auto &I = InsnVariableIDs.find(&InsnMatcher);
if (I != InsnVariableIDs.end())
return I->second;
@@ -2312,7 +2745,7 @@ void RuleMatcher::defineOperand(StringRef SymbolicName, OperandMatcher &OM) {
OM.addPredicate<SameOperandMatcher>(OM.getSymbolicName());
}
-const InstructionMatcher &
+InstructionMatcher &
RuleMatcher::getInstructionMatcher(StringRef SymbolicName) const {
for (const auto &I : InsnVariableIDs)
if (I.first->getSymbolicName() == SymbolicName)
@@ -2331,25 +2764,10 @@ RuleMatcher::getOperandMatcher(StringRef Name) const {
return *I->second;
}
-/// Emit MatchTable opcodes to check the shape of the match and capture
-/// instructions into local variables.
-void RuleMatcher::emitCaptureOpcodes(MatchTable &Table) {
- assert(Matchers.size() == 1 && "Cannot handle multi-root matchers yet");
- unsigned InsnVarID = implicitlyDefineInsnVar(*Matchers.front());
- (void)InsnVarID;
- assert(Matchers.front()->getVarID() == InsnVarID &&
- "IDs differ between build and emit");
- Matchers.front()->emitCaptureOpcodes(Table, *this);
-}
-
void RuleMatcher::emit(MatchTable &Table) {
if (Matchers.empty())
llvm_unreachable("Unexpected empty matcher!");
- // Reset the ID generation so that the emitted IDs match the ones
- // we set while building the InstructionMatcher and such.
- clearImplicitMap();
-
// The representation supports rules that require multiple roots such as:
// %ptr(p0) = ...
// %elt0(s32) = G_LOAD %ptr
@@ -2363,7 +2781,9 @@ void RuleMatcher::emit(MatchTable &Table) {
unsigned LabelID = Table.allocateLabelID();
Table << MatchTable::Opcode("GIM_Try", +1)
- << MatchTable::Comment("On fail goto") << MatchTable::JumpTarget(LabelID)
+ << MatchTable::Comment("On fail goto")
+ << MatchTable::JumpTarget(LabelID)
+ << MatchTable::Comment(("Rule ID " + Twine(RuleID) + " //").str())
<< MatchTable::LineBreak;
if (!RequiredFeatures.empty()) {
@@ -2372,8 +2792,6 @@ void RuleMatcher::emit(MatchTable &Table) {
<< MatchTable::LineBreak;
}
- emitCaptureOpcodes(Table);
-
Matchers.front()->emitPredicateOpcodes(Table, *this);
// We must also check if it's safe to fold the matched instructions.
@@ -2388,7 +2806,7 @@ void RuleMatcher::emit(MatchTable &Table) {
InsnIDs.push_back(Pair.second);
}
- std::sort(InsnIDs.begin(), InsnIDs.end());
+ llvm::sort(InsnIDs.begin(), InsnIDs.end());
for (const auto &InsnID : InsnIDs) {
// Reject the difficult cases until we have a more accurate check.
@@ -2433,15 +2851,22 @@ void RuleMatcher::emit(MatchTable &Table) {
}
}
+ for (const auto &PM : EpilogueMatchers)
+ PM->emitPredicateOpcodes(Table, *this);
+
for (const auto &MA : Actions)
MA->emitActionOpcodes(Table, *this);
- if (GenerateCoverage)
+ if (Table.isWithCoverage())
Table << MatchTable::Opcode("GIR_Coverage") << MatchTable::IntValue(RuleID)
<< MatchTable::LineBreak;
+ else
+ Table << MatchTable::Comment(("GIR_Coverage, " + Twine(RuleID) + ",").str())
+ << MatchTable::LineBreak;
Table << MatchTable::Opcode("GIR_Done", -1) << MatchTable::LineBreak
<< MatchTable::Label(LabelID);
+ ++NumPatternEmitted;
}
bool RuleMatcher::isHigherPriorityThan(const RuleMatcher &B) const {
@@ -2505,7 +2930,7 @@ void SameOperandMatcher::emitPredicateOpcodes(MatchTable &Table,
RuleMatcher &Rule) const {
const OperandMatcher &OtherOM = Rule.getOperandMatcher(MatchingName);
unsigned OtherInsnVarID = Rule.getInsnVarID(OtherOM.getInstructionMatcher());
- assert(OtherInsnVarID == OtherOM.getInstructionMatcher().getVarID());
+ assert(OtherInsnVarID == OtherOM.getInstructionMatcher().getInsnVarID());
Table << MatchTable::Opcode("GIM_CheckIsSameOperand")
<< MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID)
@@ -2513,7 +2938,7 @@ void SameOperandMatcher::emitPredicateOpcodes(MatchTable &Table,
<< MatchTable::Comment("OtherMI")
<< MatchTable::IntValue(OtherInsnVarID)
<< MatchTable::Comment("OtherOpIdx")
- << MatchTable::IntValue(OtherOM.getOperandIndex())
+ << MatchTable::IntValue(OtherOM.getOpIdx())
<< MatchTable::LineBreak;
}
@@ -2541,25 +2966,43 @@ private:
/// GIComplexPatternEquiv.
DenseMap<const Record *, const Record *> ComplexPatternEquivs;
+ /// Keep track of the equivalence between SDNodeXForm's and
+ /// GICustomOperandRenderer. Map entries are specified by subclassing
+ /// GISDNodeXFormEquiv.
+ DenseMap<const Record *, const Record *> SDNodeXFormEquivs;
+
+  /// Keep track of the scores of PatternsToMatch, similar to what the DAG
+  /// instruction selector does, so that RuleMatchers can use them when
+  /// ordering rules.
+ DenseMap<uint64_t, int> RuleMatcherScores;
+
// Map of predicates to their subtarget features.
SubtargetFeatureInfoMap SubtargetFeatures;
// Rule coverage information.
Optional<CodeGenCoverage> RuleCoverage;
+ void gatherOpcodeValues();
+ void gatherTypeIDValues();
void gatherNodeEquivs();
+ // Instruction predicate code that will be emitted in generated functions.
+ SmallVector<std::string, 2> InstructionPredicateCodes;
+ unsigned getOrCreateInstructionPredicateFnId(StringRef Code);
+
Record *findNodeEquiv(Record *N) const;
+ const CodeGenInstruction *getEquivNode(Record &Equiv,
+ const TreePatternNode *N) const;
Error importRulePredicates(RuleMatcher &M, ArrayRef<Predicate> Predicates);
- Expected<InstructionMatcher &> createAndImportSelDAGMatcher(
- RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
- const TreePatternNode *Src, unsigned &TempOpIdx) const;
+ Expected<InstructionMatcher &>
+ createAndImportSelDAGMatcher(RuleMatcher &Rule,
+ InstructionMatcher &InsnMatcher,
+ const TreePatternNode *Src, unsigned &TempOpIdx);
Error importComplexPatternOperandMatcher(OperandMatcher &OM, Record *R,
unsigned &TempOpIdx) const;
Error importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
const TreePatternNode *SrcChild,
bool OperandIsAPointer, unsigned OpIdx,
- unsigned &TempOpIdx) const;
+ unsigned &TempOpIdx);
Expected<BuildMIAction &>
createAndImportInstructionRenderer(RuleMatcher &M,
@@ -2585,9 +3028,14 @@ private:
importImplicitDefRenderers(BuildMIAction &DstMIBuilder,
const std::vector<Record *> &ImplicitDefs) const;
- void emitImmPredicates(raw_ostream &OS, StringRef TypeIdentifier,
- StringRef Type,
- std::function<bool(const Record *R)> Filter);
+ void emitCxxPredicateFns(raw_ostream &OS, StringRef CodeFieldName,
+ StringRef TypeIdentifier, StringRef ArgType,
+ StringRef ArgName, StringRef AdditionalDeclarations,
+ std::function<bool(const Record *R)> Filter);
+ void emitImmPredicateFns(raw_ostream &OS, StringRef TypeIdentifier,
+ StringRef ArgType,
+ std::function<bool(const Record *R)> Filter);
+ void emitMIPredicateFns(raw_ostream &OS);
/// Analyze pattern \p P, returning a matcher for it if possible.
/// Otherwise, return an Error explaining why we don't support it.
@@ -2595,19 +3043,15 @@ private:
void declareSubtargetFeature(Record *Predicate);
- TreePatternNode *fixupPatternNode(TreePatternNode *N);
- void fixupPatternTrees(TreePattern *P);
+ MatchTable buildMatchTable(MutableArrayRef<RuleMatcher> Rules, bool Optimize,
+ bool WithCoverage);
+public:
/// Takes a sequence of \p Rules and group them based on the predicates
- /// they share. \p StorageGroupMatcher is used as a memory container
- /// for the the group that are created as part of this process.
- /// The optimization process does not change the relative order of
- /// the rules. In particular, we don't try to share predicates if
- /// that means reordering the rules (e.g., we won't group R1 and R3
- /// in the following example as it would imply reordering R2 and R3
- /// => R1 p1, R2 p2, R3 p1).
+ /// they share. \p MatcherStorage is used as a memory container
+  /// for the groups that are created as part of this process.
///
- /// What this optimization does looks like:
+  /// Here is what this optimization does when GroupT = GroupMatcher:
/// Output without optimization:
/// \verbatim
/// # R1
@@ -2628,11 +3072,34 @@ private:
/// # R2
/// # predicate C
/// \endverbatim
- std::vector<Matcher *> optimizeRules(
- const std::vector<Matcher *> &Rules,
- std::vector<std::unique_ptr<GroupMatcher>> &StorageGroupMatcher);
+ template <class GroupT>
+ static std::vector<Matcher *> optimizeRules(
+ ArrayRef<Matcher *> Rules,
+ std::vector<std::unique_ptr<Matcher>> &MatcherStorage);
};
+void GlobalISelEmitter::gatherOpcodeValues() {
+ InstructionOpcodeMatcher::initOpcodeValuesMap(Target);
+}
+
+void GlobalISelEmitter::gatherTypeIDValues() {
+ LLTOperandMatcher::initTypeIDValuesMap();
+}
+unsigned GlobalISelEmitter::getOrCreateInstructionPredicateFnId(StringRef Code) {
+  // There aren't very many predicates that need to be here at the moment, so we
+  // just maintain a simple set-like vector. If it grows, we'll need something
+  // more efficient.
+ const auto &I = std::find(InstructionPredicateCodes.begin(),
+ InstructionPredicateCodes.end(),
+ Code);
+ if (I == InstructionPredicateCodes.end()) {
+ unsigned ID = InstructionPredicateCodes.size();
+ InstructionPredicateCodes.push_back(Code);
+ return ID;
+ }
+ return std::distance(InstructionPredicateCodes.begin(), I);
+}
+
void GlobalISelEmitter::gatherNodeEquivs() {
assert(NodeEquivs.empty());
for (Record *Equiv : RK.getAllDerivedDefinitions("GINodeEquiv"))
@@ -2645,15 +3112,36 @@ void GlobalISelEmitter::gatherNodeEquivs() {
continue;
ComplexPatternEquivs[SelDAGEquiv] = Equiv;
}
+
+ assert(SDNodeXFormEquivs.empty());
+ for (Record *Equiv : RK.getAllDerivedDefinitions("GISDNodeXFormEquiv")) {
+ Record *SelDAGEquiv = Equiv->getValueAsDef("SelDAGEquivalent");
+ if (!SelDAGEquiv)
+ continue;
+ SDNodeXFormEquivs[SelDAGEquiv] = Equiv;
+ }
}
Record *GlobalISelEmitter::findNodeEquiv(Record *N) const {
return NodeEquivs.lookup(N);
}
+const CodeGenInstruction *
+GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode *N) const {
+ for (const auto &Predicate : N->getPredicateFns()) {
+ if (!Equiv.isValueUnset("IfSignExtend") && Predicate.isLoad() &&
+ Predicate.isSignExtLoad())
+ return &Target.getInstruction(Equiv.getValueAsDef("IfSignExtend"));
+ if (!Equiv.isValueUnset("IfZeroExtend") && Predicate.isLoad() &&
+ Predicate.isZeroExtLoad())
+ return &Target.getInstruction(Equiv.getValueAsDef("IfZeroExtend"));
+ }
+ return &Target.getInstruction(Equiv.getValueAsDef("I"));
+}
+
GlobalISelEmitter::GlobalISelEmitter(RecordKeeper &RK)
- : RK(RK), CGP(RK, [&](TreePattern *P) { fixupPatternTrees(P); }),
- Target(CGP.getTargetInfo()), CGRegs(RK, Target.getHwModes()) {}
+ : RK(RK), CGP(RK), Target(CGP.getTargetInfo()),
+ CGRegs(RK, Target.getHwModes()) {}
//===- Emitter ------------------------------------------------------------===//
@@ -2672,7 +3160,7 @@ GlobalISelEmitter::importRulePredicates(RuleMatcher &M,
Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
- const TreePatternNode *Src, unsigned &TempOpIdx) const {
+ const TreePatternNode *Src, unsigned &TempOpIdx) {
Record *SrcGIEquivOrNull = nullptr;
const CodeGenInstruction *SrcGIOrNull = nullptr;
@@ -2693,7 +3181,7 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
if (!SrcGIEquivOrNull)
return failedImport("Pattern operator lacks an equivalent Instruction" +
explainOperator(Src->getOperator()));
- SrcGIOrNull = &Target.getInstruction(SrcGIEquivOrNull->getValueAsDef("I"));
+ SrcGIOrNull = getEquivNode(*SrcGIEquivOrNull, Src);
// The operators look good: match the opcode
InsnMatcher.addPredicate<InstructionOpcodeMatcher>(SrcGIOrNull);
@@ -2718,8 +3206,26 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
continue;
}
- // No check required. G_LOAD by itself is a non-extending load.
- if (Predicate.isNonExtLoad())
+ // G_LOAD is used for both non-extending and any-extending loads.
+ if (Predicate.isLoad() && Predicate.isNonExtLoad()) {
+ InsnMatcher.addPredicate<MemoryVsLLTSizePredicateMatcher>(
+ 0, MemoryVsLLTSizePredicateMatcher::EqualTo, 0);
+ continue;
+ }
+ if (Predicate.isLoad() && Predicate.isAnyExtLoad()) {
+ InsnMatcher.addPredicate<MemoryVsLLTSizePredicateMatcher>(
+ 0, MemoryVsLLTSizePredicateMatcher::LessThan, 0);
+ continue;
+ }
+
+ // No check required. We already did it by swapping the opcode.
+ if (!SrcGIEquivOrNull->isValueUnset("IfSignExtend") &&
+ Predicate.isSignExtLoad())
+ continue;
+
+ // No check required. We already did it by swapping the opcode.
+ if (!SrcGIEquivOrNull->isValueUnset("IfZeroExtend") &&
+ Predicate.isZeroExtLoad())
continue;
// No check required. G_STORE by itself is a non-extending store.
@@ -2734,8 +3240,13 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
if (!MemTyOrNone)
return failedImport("MemVT could not be converted to LLT");
- OperandMatcher &OM = InsnMatcher.getOperand(0);
- OM.addPredicate<LLTOperandMatcher>(MemTyOrNone.getValue());
+ // MMOs work in bytes, so round the size up to a whole byte first so that
+ // unusual types like i1 don't round down to a zero-byte access.
+ unsigned MemSizeInBits =
+ llvm::alignTo(MemTyOrNone->get().getSizeInBits(), 8);
+
+ InsnMatcher.addPredicate<MemorySizePredicateMatcher>(
+ 0, MemSizeInBits / 8);
continue;
}
}
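The round-up matters for sub-byte types: an i1 access still occupies one byte in the MachineMemOperand, and dividing its bit width by 8 without rounding first would give a zero-byte size. A small worked example of the same computation, assuming llvm::alignTo from llvm/Support/MathExtras.h:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    void memSizeInBytesExamples() {
      // i1: 1 bit rounds up to 8 bits -> a 1-byte memory access.
      assert(llvm::alignTo(1, 8) / 8 == 1);
      // i16: already byte-sized, 16 bits -> 2 bytes.
      assert(llvm::alignTo(16, 8) / 8 == 2);
      // i20: 20 bits rounds up to 24 bits -> 3 bytes.
      assert(llvm::alignTo(20, 8) / 8 == 3);
    }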
@@ -2794,6 +3305,11 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
}
}
+ if (Predicate.hasGISelPredicateCode()) {
+ InsnMatcher.addPredicate<GenericInstructionPredicateMatcher>(Predicate);
+ continue;
+ }
+
return failedImport("Src pattern child has predicate (" +
explainPredicates(Src) + ")");
}
@@ -2872,7 +3388,7 @@ Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule,
const TreePatternNode *SrcChild,
bool OperandIsAPointer,
unsigned OpIdx,
- unsigned &TempOpIdx) const {
+ unsigned &TempOpIdx) {
OperandMatcher &OM =
InsnMatcher.addOperand(OpIdx, SrcChild->getName(), TempOpIdx);
if (OM.isSameAsAnotherOperand())
@@ -2986,10 +3502,6 @@ Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule,
Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder,
TreePatternNode *DstChild) {
- if (DstChild->getTransformFn() != nullptr) {
- return failedImport("Dst pattern child has transform fn " +
- DstChild->getTransformFn()->getName());
- }
const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName());
if (SubOperand.hasValue()) {
@@ -3000,6 +3512,18 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
}
if (!DstChild->isLeaf()) {
+
+ if (DstChild->getOperator()->isSubClassOf("SDNodeXForm")) {
+ auto Child = DstChild->getChild(0);
+ auto I = SDNodeXFormEquivs.find(DstChild->getOperator());
+ if (I != SDNodeXFormEquivs.end()) {
+ DstMIBuilder.addRenderer<CustomRenderer>(*I->second, Child->getName());
+ return InsertPt;
+ }
+ return failedImport("SDNodeXForm " + Child->getName() +
+ " has no custom renderer");
+ }
+
// We accept 'bb' here. It's an operator because BasicBlockSDNode isn't
// inline, but in MI it's just another operand.
if (DstChild->getOperator()->isSubClassOf("SDNode")) {
@@ -3104,10 +3628,6 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
return InsertPt;
}
- if (ChildRec->isSubClassOf("SDNodeXForm"))
- return failedImport("Dst pattern child def is an unsupported tablegen "
- "class (SDNodeXForm)");
-
return failedImport(
"Dst pattern child def is an unsupported tablegen class");
}
@@ -3135,7 +3655,7 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
Expected<action_iterator>
GlobalISelEmitter::createAndImportSubInstructionRenderer(
- action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst,
+ const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst,
unsigned TempRegID) {
auto InsertPtOrError = createInstructionRenderer(InsertPt, M, Dst);
@@ -3143,7 +3663,6 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
if (auto Error = InsertPtOrError.takeError())
return std::move(Error);
- InsertPt = InsertPtOrError.get();
BuildMIAction &DstMIBuilder =
*static_cast<BuildMIAction *>(InsertPtOrError.get()->get());
@@ -3151,10 +3670,13 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
// Assign the result to TempReg.
DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID, true);
- InsertPtOrError = importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst);
+ InsertPtOrError =
+ importExplicitUseRenderers(InsertPtOrError.get(), M, DstMIBuilder, Dst);
if (auto Error = InsertPtOrError.takeError())
return std::move(Error);
+ M.insertAction<ConstrainOperandsToDefinitionAction>(InsertPt,
+ DstMIBuilder.getInsnID());
return InsertPtOrError.get();
}
@@ -3311,7 +3833,9 @@ Error GlobalISelEmitter::importImplicitDefRenderers(
Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
// Keep track of the matchers and actions to emit.
+ int Score = P.getPatternComplexity(CGP);
RuleMatcher M(P.getSrcRecord()->getLoc());
+ RuleMatcherScores[M.getRuleID()] = Score;
M.addAction<DebugCommentAction>(llvm::to_string(*P.getSrcPattern()) +
" => " +
llvm::to_string(*P.getDstPattern()));
@@ -3526,14 +4050,15 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
// Emit imm predicate table and an enum to reference them with.
// The 'Predicate_' part of the name is redundant but eliminating it is more
// trouble than it's worth.
-void GlobalISelEmitter::emitImmPredicates(
- raw_ostream &OS, StringRef TypeIdentifier, StringRef Type,
+void GlobalISelEmitter::emitCxxPredicateFns(
+ raw_ostream &OS, StringRef CodeFieldName, StringRef TypeIdentifier,
+ StringRef ArgType, StringRef ArgName, StringRef AdditionalDeclarations,
std::function<bool(const Record *R)> Filter) {
std::vector<const Record *> MatchedRecords;
const auto &Defs = RK.getAllDerivedDefinitions("PatFrag");
std::copy_if(Defs.begin(), Defs.end(), std::back_inserter(MatchedRecords),
[&](Record *Record) {
- return !Record->getValueAsString("ImmediateCode").empty() &&
+ return !Record->getValueAsString(CodeFieldName).empty() &&
Filter(Record);
});
@@ -3550,16 +4075,20 @@ void GlobalISelEmitter::emitImmPredicates(
OS << "};\n";
}
- OS << "bool " << Target.getName() << "InstructionSelector::testImmPredicate_"
- << TypeIdentifier << "(unsigned PredicateID, " << Type
- << " Imm) const {\n";
+ OS << "bool " << Target.getName() << "InstructionSelector::test" << ArgName
+ << "Predicate_" << TypeIdentifier << "(unsigned PredicateID, " << ArgType << " "
+ << ArgName << ") const {\n"
+ << AdditionalDeclarations;
+ if (!AdditionalDeclarations.empty())
+ OS << "\n";
if (!MatchedRecords.empty())
OS << " switch (PredicateID) {\n";
for (const auto *Record : MatchedRecords) {
OS << " case GIPFP_" << TypeIdentifier << "_Predicate_"
<< Record->getName() << ": {\n"
- << " " << Record->getValueAsString("ImmediateCode") << "\n"
- << " llvm_unreachable(\"ImmediateCode should have returned\");\n"
+ << " " << Record->getValueAsString(CodeFieldName) << "\n"
+ << " llvm_unreachable(\"" << CodeFieldName
+ << " should have returned\");\n"
<< " return false;\n"
<< " }\n";
}
@@ -3570,38 +4099,144 @@ void GlobalISelEmitter::emitImmPredicates(
<< "}\n";
}
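For reference, the code emitted by the routine above takes roughly the following shape for an I64 immediate predicate; the target name, case label, and body below are hypothetical and would come from the PatFrag records:

    bool MyTargetInstructionSelector::testImmPredicate_I64(unsigned PredicateID,
                                                           int64_t Imm) const {
      switch (PredicateID) {
      case GIPFP_I64_Predicate_simm8: {
        return isInt<8>(Imm); // the record's ImmediateCode, pasted verbatim
        llvm_unreachable("ImmediateCode should have returned");
        return false;
      }
      }
      // ... handling of unknown PredicateIDs elided ...
    }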
+void GlobalISelEmitter::emitImmPredicateFns(
+ raw_ostream &OS, StringRef TypeIdentifier, StringRef ArgType,
+ std::function<bool(const Record *R)> Filter) {
+ return emitCxxPredicateFns(OS, "ImmediateCode", TypeIdentifier, ArgType,
+ "Imm", "", Filter);
+}
+
+void GlobalISelEmitter::emitMIPredicateFns(raw_ostream &OS) {
+ return emitCxxPredicateFns(
+ OS, "GISelPredicateCode", "MI", "const MachineInstr &", "MI",
+ " const MachineFunction &MF = *MI.getParent()->getParent();\n"
+ " const MachineRegisterInfo &MRI = MF.getRegInfo();\n"
+ " (void)MRI;",
+ [](const Record *R) { return true; });
+}
+
+template <class GroupT>
std::vector<Matcher *> GlobalISelEmitter::optimizeRules(
- const std::vector<Matcher *> &Rules,
- std::vector<std::unique_ptr<GroupMatcher>> &StorageGroupMatcher) {
+ ArrayRef<Matcher *> Rules,
+ std::vector<std::unique_ptr<Matcher>> &MatcherStorage) {
+
std::vector<Matcher *> OptRules;
- // Start with a stupid grouping for now.
- std::unique_ptr<GroupMatcher> CurrentGroup = make_unique<GroupMatcher>();
- assert(CurrentGroup->conditions_empty());
- unsigned NbGroup = 0;
- for (Matcher *Rule : Rules) {
- std::unique_ptr<PredicateMatcher> Predicate = Rule->forgetFirstCondition();
- if (!CurrentGroup->conditions_empty() &&
- !CurrentGroup->lastConditionMatches(*Predicate)) {
- // Start a new group.
- ++NbGroup;
+ std::unique_ptr<GroupT> CurrentGroup = make_unique<GroupT>();
+ assert(CurrentGroup->empty() && "Newly created group isn't empty!");
+ unsigned NumGroups = 0;
+
+ auto ProcessCurrentGroup = [&]() {
+ if (CurrentGroup->empty())
+ // An empty group can simply be reused:
+ return;
+
+ // If the group isn't large enough to provide any benefit, move all the
+ // added rules out of it and make sure to re-create the group to properly
+ // re-initialize it:
+ if (CurrentGroup->size() < 2)
+ for (Matcher *M : CurrentGroup->matchers())
+ OptRules.push_back(M);
+ else {
+ CurrentGroup->finalize();
OptRules.push_back(CurrentGroup.get());
- StorageGroupMatcher.emplace_back(std::move(CurrentGroup));
- CurrentGroup = make_unique<GroupMatcher>();
- assert(CurrentGroup->conditions_empty());
+ MatcherStorage.emplace_back(std::move(CurrentGroup));
+ ++NumGroups;
}
- if (CurrentGroup->conditions_empty())
- CurrentGroup->addCondition(std::move(Predicate));
- CurrentGroup->addRule(*Rule);
- }
- if (!CurrentGroup->conditions_empty()) {
- ++NbGroup;
- OptRules.push_back(CurrentGroup.get());
- StorageGroupMatcher.emplace_back(std::move(CurrentGroup));
+ CurrentGroup = make_unique<GroupT>();
+ };
+ for (Matcher *Rule : Rules) {
+ // Greedily add as many matchers as possible to the current group:
+ if (CurrentGroup->addMatcher(*Rule))
+ continue;
+
+ ProcessCurrentGroup();
+ assert(CurrentGroup->empty() && "A group wasn't properly re-initialized");
+
+ // Try to add the pending matcher to a newly created empty group:
+ if (!CurrentGroup->addMatcher(*Rule))
+ // If we couldn't add the matcher to an empty group, that group type
+ // doesn't support that kind of matcher at all, so just skip it:
+ OptRules.push_back(Rule);
}
- DEBUG(dbgs() << "NbGroup: " << NbGroup << "\n");
+ ProcessCurrentGroup();
+
+ LLVM_DEBUG(dbgs() << "NumGroups: " << NumGroups << "\n");
+ assert(CurrentGroup->empty() && "The last group wasn't properly processed");
return OptRules;
}
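In other words, the rules are scanned once and greedily packed into runs that the current group accepts; whenever a rule is rejected, the run is flushed, and runs of fewer than two rules are spilled back as plain rules because a group would not pay off. A minimal sketch of the same flush-and-restart structure over plain integers, purely for illustration:

    #include <vector>

    // Packs consecutive equal values into runs; a run of length one is emitted
    // as a bare value, mirroring the "group too small" case in optimizeRules().
    std::vector<std::vector<int>> packRuns(const std::vector<int> &Values) {
      std::vector<std::vector<int>> Out;
      std::vector<int> Run;
      auto Flush = [&] {
        if (Run.empty())
          return;
        if (Run.size() < 2)
          for (int V : Run)
            Out.push_back({V});   // spill singletons individually
        else
          Out.push_back(Run);     // keep the whole run as one group
        Run.clear();
      };
      for (int V : Values) {
        if (!Run.empty() && Run.front() != V)
          Flush();
        Run.push_back(V);
      }
      Flush();
      return Out;
    }
    // packRuns({1, 1, 2, 3, 3, 3}) == {{1, 1}, {2}, {3, 3, 3}}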
+MatchTable
+GlobalISelEmitter::buildMatchTable(MutableArrayRef<RuleMatcher> Rules,
+ bool Optimize, bool WithCoverage) {
+ std::vector<Matcher *> InputRules;
+ for (Matcher &Rule : Rules)
+ InputRules.push_back(&Rule);
+
+ if (!Optimize)
+ return MatchTable::buildTable(InputRules, WithCoverage);
+
+ unsigned CurrentOrdering = 0;
+ StringMap<unsigned> OpcodeOrder;
+ for (RuleMatcher &Rule : Rules) {
+ const StringRef Opcode = Rule.getOpcode();
+ assert(!Opcode.empty() && "Didn't expect an undefined opcode");
+ if (OpcodeOrder.count(Opcode) == 0)
+ OpcodeOrder[Opcode] = CurrentOrdering++;
+ }
+
+ std::stable_sort(InputRules.begin(), InputRules.end(),
+ [&OpcodeOrder](const Matcher *A, const Matcher *B) {
+ auto *L = static_cast<const RuleMatcher *>(A);
+ auto *R = static_cast<const RuleMatcher *>(B);
+ return std::make_tuple(OpcodeOrder[L->getOpcode()],
+ L->getNumOperands()) <
+ std::make_tuple(OpcodeOrder[R->getOpcode()],
+ R->getNumOperands());
+ });
+
+ for (Matcher *Rule : InputRules)
+ Rule->optimize();
+
+ std::vector<std::unique_ptr<Matcher>> MatcherStorage;
+ std::vector<Matcher *> OptRules =
+ optimizeRules<GroupMatcher>(InputRules, MatcherStorage);
+
+ for (Matcher *Rule : OptRules)
+ Rule->optimize();
+
+ OptRules = optimizeRules<SwitchMatcher>(OptRules, MatcherStorage);
+
+ return MatchTable::buildTable(OptRules, WithCoverage);
+}
+
+void GroupMatcher::optimize() {
+ // Make sure we only sort by a specific predicate within a range of rules that
+ // all have that predicate checked against a specific value (not a wildcard):
+ auto F = Matchers.begin();
+ auto T = F;
+ auto E = Matchers.end();
+ while (T != E) {
+ while (T != E) {
+ auto *R = static_cast<RuleMatcher *>(*T);
+ if (!R->getFirstConditionAsRootType().get().isValid())
+ break;
+ ++T;
+ }
+ std::stable_sort(F, T, [](Matcher *A, Matcher *B) {
+ auto *L = static_cast<RuleMatcher *>(A);
+ auto *R = static_cast<RuleMatcher *>(B);
+ return L->getFirstConditionAsRootType() <
+ R->getFirstConditionAsRootType();
+ });
+ if (T != E)
+ F = ++T;
+ }
+ GlobalISelEmitter::optimizeRules<GroupMatcher>(Matchers, MatcherStorage)
+ .swap(Matchers);
+ GlobalISelEmitter::optimizeRules<SwitchMatcher>(Matchers, MatcherStorage)
+ .swap(Matchers);
+}
+
void GlobalISelEmitter::run(raw_ostream &OS) {
if (!UseCoverageFile.empty()) {
RuleCoverage = CodeGenCoverage();
@@ -3617,6 +4252,11 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
}
}
+ // Track the run-time opcode values
+ gatherOpcodeValues();
+ // Track the run-time LLT ID values
+ gatherTypeIDValues();
+
// Track the GINodeEquiv definitions.
gatherNodeEquivs();
@@ -3652,14 +4292,19 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
Rules.push_back(std::move(MatcherOrErr.get()));
}
+ // Comparison function to order records by name.
+ auto orderByName = [](const Record *A, const Record *B) {
+ return A->getName() < B->getName();
+ };
+
std::vector<Record *> ComplexPredicates =
RK.getAllDerivedDefinitions("GIComplexOperandMatcher");
- std::sort(ComplexPredicates.begin(), ComplexPredicates.end(),
- [](const Record *A, const Record *B) {
- if (A->getName() < B->getName())
- return true;
- return false;
- });
+ llvm::sort(ComplexPredicates.begin(), ComplexPredicates.end(), orderByName);
+
+ std::vector<Record *> CustomRendererFns =
+ RK.getAllDerivedDefinitions("GICustomOperandRenderer");
+ llvm::sort(CustomRendererFns.begin(), CustomRendererFns.end(), orderByName);
+
unsigned MaxTemporaries = 0;
for (const auto &Rule : Rules)
MaxTemporaries = std::max(MaxTemporaries, Rule.countRendererFns());
@@ -3677,21 +4322,33 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
"ComplexRendererFns("
<< Target.getName()
<< "InstructionSelector::*ComplexMatcherMemFn)(MachineOperand &) const;\n"
- << " const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> "
- "MatcherInfo;\n"
- << " static " << Target.getName()
+
+ << " typedef void(" << Target.getName()
+ << "InstructionSelector::*CustomRendererFn)(MachineInstrBuilder &, const "
+ "MachineInstr&) "
+ "const;\n"
+ << " const ISelInfoTy<PredicateBitset, ComplexMatcherMemFn, "
+ "CustomRendererFn> "
+ "ISelInfo;\n";
+ OS << " static " << Target.getName()
<< "InstructionSelector::ComplexMatcherMemFn ComplexPredicateFns[];\n"
- << "bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const "
+ << " static " << Target.getName()
+ << "InstructionSelector::CustomRendererFn CustomRenderers[];\n"
+ << " bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const "
"override;\n"
- << "bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) "
+ << " bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) "
"const override;\n"
- << "bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat "
+ << " bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat "
"&Imm) const override;\n"
+ << " const int64_t *getMatchTable() const override;\n"
+ << " bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI) "
+ "const override;\n"
<< "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL\n\n";
OS << "#ifdef GET_GLOBALISEL_TEMPORARIES_INIT\n"
<< ", State(" << MaxTemporaries << "),\n"
- << "MatcherInfo({TypeObjects, FeatureBitsets, ComplexPredicateFns})\n"
+ << "ISelInfo(TypeObjects, NumTypeObjects, FeatureBitsets"
+ << ", ComplexPredicateFns, CustomRenderers)\n"
<< "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT\n\n";
OS << "#ifdef GET_GLOBALISEL_IMPL\n";
@@ -3723,9 +4380,9 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
// Emit a table containing the LLT objects needed by the matcher and an enum
// for the matcher to reference them with.
std::vector<LLTCodeGen> TypeObjects;
- for (const auto &Ty : LLTOperandMatcher::KnownTypes)
+ for (const auto &Ty : KnownTypes)
TypeObjects.push_back(Ty);
- std::sort(TypeObjects.begin(), TypeObjects.end());
+ llvm::sort(TypeObjects.begin(), TypeObjects.end());
OS << "// LLT Objects.\n"
<< "enum {\n";
for (const auto &TypeObject : TypeObjects) {
@@ -3733,7 +4390,8 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
TypeObject.emitCxxEnumValue(OS);
OS << ",\n";
}
- OS << "};\n"
+ OS << "};\n";
+ OS << "const static size_t NumTypeObjects = " << TypeObjects.size() << ";\n"
<< "const static LLT TypeObjects[] = {\n";
for (const auto &TypeObject : TypeObjects) {
OS << " ";
@@ -3747,7 +4405,7 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
std::vector<std::vector<Record *>> FeatureBitsets;
for (auto &Rule : Rules)
FeatureBitsets.push_back(Rule.getRequiredFeatures());
- std::sort(
+ llvm::sort(
FeatureBitsets.begin(), FeatureBitsets.end(),
[&](const std::vector<Record *> &A, const std::vector<Record *> &B) {
if (A.size() < B.size())
@@ -3798,18 +4456,19 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
OS << "};\n"
<< "// See constructor for table contents\n\n";
- emitImmPredicates(OS, "I64", "int64_t", [](const Record *R) {
+ emitImmPredicateFns(OS, "I64", "int64_t", [](const Record *R) {
bool Unset;
return !R->getValueAsBitOrUnset("IsAPFloat", Unset) &&
!R->getValueAsBit("IsAPInt");
});
- emitImmPredicates(OS, "APFloat", "const APFloat &", [](const Record *R) {
+ emitImmPredicateFns(OS, "APFloat", "const APFloat &", [](const Record *R) {
bool Unset;
return R->getValueAsBitOrUnset("IsAPFloat", Unset);
});
- emitImmPredicates(OS, "APInt", "const APInt &", [](const Record *R) {
+ emitImmPredicateFns(OS, "APInt", "const APInt &", [](const Record *R) {
return R->getValueAsBit("IsAPInt");
});
+ emitMIPredicateFns(OS);
OS << "\n";
OS << Target.getName() << "InstructionSelector::ComplexMatcherMemFn\n"
@@ -3821,22 +4480,30 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
<< ", // " << Record->getName() << "\n";
OS << "};\n\n";
- OS << "bool " << Target.getName()
- << "InstructionSelector::selectImpl(MachineInstr &I, CodeGenCoverage "
- "&CoverageInfo) const {\n"
- << " MachineFunction &MF = *I.getParent()->getParent();\n"
- << " MachineRegisterInfo &MRI = MF.getRegInfo();\n"
- << " // FIXME: This should be computed on a per-function basis rather "
- "than per-insn.\n"
- << " AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, "
- "&MF);\n"
- << " const PredicateBitset AvailableFeatures = getAvailableFeatures();\n"
- << " NewMIVector OutMIs;\n"
- << " State.MIs.clear();\n"
- << " State.MIs.push_back(&I);\n\n";
+ OS << "// Custom renderers.\n"
+ << "enum {\n"
+ << " GICR_Invalid,\n";
+ for (const auto &Record : CustomRendererFns)
+ OS << " GICR_" << Record->getValueAsString("RendererFn") << ", \n";
+ OS << "};\n";
+
+ OS << Target.getName() << "InstructionSelector::CustomRendererFn\n"
+ << Target.getName() << "InstructionSelector::CustomRenderers[] = {\n"
+ << " nullptr, // GICP_Invalid\n";
+ for (const auto &Record : CustomRendererFns)
+ OS << " &" << Target.getName()
+ << "InstructionSelector::" << Record->getValueAsString("RendererFn")
+ << ", // " << Record->getName() << "\n";
+ OS << "};\n\n";
std::stable_sort(Rules.begin(), Rules.end(), [&](const RuleMatcher &A,
const RuleMatcher &B) {
+ int ScoreA = RuleMatcherScores[A.getRuleID()];
+ int ScoreB = RuleMatcherScores[B.getRuleID()];
+ if (ScoreA > ScoreB)
+ return true;
+ if (ScoreB > ScoreA)
+ return false;
if (A.isHigherPriorityThan(B)) {
assert(!B.isHigherPriorityThan(A) && "Cannot be more important "
"and less important at "
@@ -3845,32 +4512,37 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
}
return false;
});
- std::vector<std::unique_ptr<GroupMatcher>> StorageGroupMatcher;
- std::vector<Matcher *> InputRules;
- for (Matcher &Rule : Rules)
- InputRules.push_back(&Rule);
-
- std::vector<Matcher *> OptRules =
- OptimizeMatchTable ? optimizeRules(InputRules, StorageGroupMatcher)
- : InputRules;
+ OS << "bool " << Target.getName()
+ << "InstructionSelector::selectImpl(MachineInstr &I, CodeGenCoverage "
+ "&CoverageInfo) const {\n"
+ << " MachineFunction &MF = *I.getParent()->getParent();\n"
+ << " MachineRegisterInfo &MRI = MF.getRegInfo();\n"
+ << " // FIXME: This should be computed on a per-function basis rather "
+ "than per-insn.\n"
+ << " AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, "
+ "&MF);\n"
+ << " const PredicateBitset AvailableFeatures = getAvailableFeatures();\n"
+ << " NewMIVector OutMIs;\n"
+ << " State.MIs.clear();\n"
+ << " State.MIs.push_back(&I);\n\n"
+ << " if (executeMatchTable(*this, OutMIs, State, ISelInfo"
+ << ", getMatchTable(), TII, MRI, TRI, RBI, AvailableFeatures"
+ << ", CoverageInfo)) {\n"
+ << " return true;\n"
+ << " }\n\n"
+ << " return false;\n"
+ << "}\n\n";
- MatchTable Table(0);
- for (Matcher *Rule : OptRules) {
- Rule->emit(Table);
- ++NumPatternEmitted;
- }
- Table << MatchTable::Opcode("GIM_Reject") << MatchTable::LineBreak;
+ const MatchTable Table =
+ buildMatchTable(Rules, OptimizeMatchTable, GenerateCoverage);
+ OS << "const int64_t *" << Target.getName()
+ << "InstructionSelector::getMatchTable() const {\n";
Table.emitDeclaration(OS);
- OS << " if (executeMatchTable(*this, OutMIs, State, MatcherInfo, ";
+ OS << " return ";
Table.emitUse(OS);
- OS << ", TII, MRI, TRI, RBI, AvailableFeatures, CoverageInfo)) {\n"
- << " return true;\n"
- << " }\n\n";
-
- OS << " return false;\n"
- << "}\n"
- << "#endif // ifdef GET_GLOBALISEL_IMPL\n";
+ OS << ";\n}\n";
+ OS << "#endif // ifdef GET_GLOBALISEL_IMPL\n";
OS << "#ifdef GET_GLOBALISEL_PREDICATES_DECL\n"
<< "PredicateBitset AvailableModuleFeatures;\n"
@@ -3899,137 +4571,290 @@ void GlobalISelEmitter::declareSubtargetFeature(Record *Predicate) {
Predicate, SubtargetFeatureInfo(Predicate, SubtargetFeatures.size()));
}
-TreePatternNode *GlobalISelEmitter::fixupPatternNode(TreePatternNode *N) {
- if (!N->isLeaf()) {
- for (unsigned I = 0, E = N->getNumChildren(); I < E; ++I) {
- TreePatternNode *OrigChild = N->getChild(I);
- TreePatternNode *NewChild = fixupPatternNode(OrigChild);
- if (OrigChild != NewChild)
- N->setChild(I, NewChild);
+void RuleMatcher::optimize() {
+ for (auto &Item : InsnVariableIDs) {
+ InstructionMatcher &InsnMatcher = *Item.first;
+ for (auto &OM : InsnMatcher.operands()) {
+ // Complex Patterns are usually expensive and they relatively rarely fail
+ // on their own: more often we end up throwing away all the work done by a
+ // matching part of a complex pattern because some other part of the
+ // enclosing pattern didn't match. All of this makes it beneficial to
+ // delay complex patterns until the very end of the rule matching,
+ // especially for targets having lots of complex patterns.
+ for (auto &OP : OM->predicates())
+ if (isa<ComplexPatternOperandMatcher>(OP))
+ EpilogueMatchers.emplace_back(std::move(OP));
+ OM->eraseNullPredicates();
}
+ InsnMatcher.optimize();
+ }
+ llvm::sort(
+ EpilogueMatchers.begin(), EpilogueMatchers.end(),
+ [](const std::unique_ptr<PredicateMatcher> &L,
+ const std::unique_ptr<PredicateMatcher> &R) {
+ return std::make_tuple(L->getKind(), L->getInsnVarID(), L->getOpIdx()) <
+ std::make_tuple(R->getKind(), R->getInsnVarID(), R->getOpIdx());
+ });
+}
- if (N->getOperator()->getName() == "ld") {
- // If it's a signext-load we need to adapt the pattern slightly. We need
- // to split the node into (sext (ld ...)), remove the <<signext>> predicate,
- // and then apply the <<signextTY>> predicate by updating the result type
- // of the load.
- //
- // For example:
- // (ld:[i32] [iPTR])<<unindexed>><<signext>><<signexti16>>
- // must be transformed into:
- // (sext:[i32] (ld:[i16] [iPTR])<<unindexed>>)
- //
- // Likewise for zeroext-load and anyext-load.
-
- std::vector<TreePredicateFn> Predicates;
- bool IsSignExtLoad = false;
- bool IsZeroExtLoad = false;
- bool IsAnyExtLoad = false;
- Record *MemVT = nullptr;
- for (const auto &P : N->getPredicateFns()) {
- if (P.isLoad() && P.isSignExtLoad()) {
- IsSignExtLoad = true;
- continue;
- }
- if (P.isLoad() && P.isZeroExtLoad()) {
- IsZeroExtLoad = true;
- continue;
- }
- if (P.isLoad() && P.isAnyExtLoad()) {
- IsAnyExtLoad = true;
- continue;
- }
- if (P.isLoad() && P.getMemoryVT()) {
- MemVT = P.getMemoryVT();
- continue;
- }
- Predicates.push_back(P);
- }
-
- if ((IsSignExtLoad || IsZeroExtLoad || IsAnyExtLoad) && MemVT) {
- assert((IsSignExtLoad + IsZeroExtLoad + IsAnyExtLoad) == 1 &&
- "IsSignExtLoad, IsZeroExtLoad, IsAnyExtLoad are mutually exclusive");
- TreePatternNode *Ext = new TreePatternNode(
- RK.getDef(IsSignExtLoad ? "sext"
- : IsZeroExtLoad ? "zext" : "anyext"),
- {N}, 1);
- Ext->setType(0, N->getType(0));
- N->clearPredicateFns();
- N->setPredicateFns(Predicates);
- N->setType(0, getValueType(MemVT));
- return Ext;
- }
- }
- }
-
- return N;
+bool RuleMatcher::hasFirstCondition() const {
+ if (insnmatchers_empty())
+ return false;
+ InstructionMatcher &Matcher = insnmatchers_front();
+ if (!Matcher.predicates_empty())
+ return true;
+ for (auto &OM : Matcher.operands())
+ for (auto &OP : OM->predicates())
+ if (!isa<InstructionOperandMatcher>(OP))
+ return true;
+ return false;
}
-void GlobalISelEmitter::fixupPatternTrees(TreePattern *P) {
- for (unsigned I = 0, E = P->getNumTrees(); I < E; ++I) {
- TreePatternNode *OrigTree = P->getTree(I);
- TreePatternNode *NewTree = fixupPatternNode(OrigTree);
- if (OrigTree != NewTree)
- P->setTree(I, NewTree);
- }
+const PredicateMatcher &RuleMatcher::getFirstCondition() const {
+ assert(!insnmatchers_empty() &&
+ "Trying to get a condition from an empty RuleMatcher");
+
+ InstructionMatcher &Matcher = insnmatchers_front();
+ if (!Matcher.predicates_empty())
+ return **Matcher.predicates_begin();
+ // If there is no more predicate on the instruction itself, look at its
+ // operands.
+ for (auto &OM : Matcher.operands())
+ for (auto &OP : OM->predicates())
+ if (!isa<InstructionOperandMatcher>(OP))
+ return *OP;
+
+ llvm_unreachable("Trying to get a condition from an InstructionMatcher with "
+ "no conditions");
}
-std::unique_ptr<PredicateMatcher> RuleMatcher::forgetFirstCondition() {
+std::unique_ptr<PredicateMatcher> RuleMatcher::popFirstCondition() {
assert(!insnmatchers_empty() &&
- "Trying to forget something that does not exist");
+ "Trying to pop a condition from an empty RuleMatcher");
InstructionMatcher &Matcher = insnmatchers_front();
- std::unique_ptr<PredicateMatcher> Condition;
if (!Matcher.predicates_empty())
- Condition = Matcher.predicates_pop_front();
- if (!Condition) {
- // If there is no more predicate on the instruction itself, look at its
- // operands.
- assert(!Matcher.operands_empty() &&
- "Empty instruction should have been discarded");
- OperandMatcher &OpMatcher = **Matcher.operands_begin();
- assert(!OpMatcher.predicates_empty() && "no operand constraint");
- Condition = OpMatcher.predicates_pop_front();
- // If this operand is free of constraints, rip it off.
- if (OpMatcher.predicates_empty())
- Matcher.pop_front();
- }
- // Rip the instruction off when it is empty.
- if (Matcher.operands_empty() && Matcher.predicates_empty())
- insnmatchers_pop_front();
- return Condition;
-}
-
-bool GroupMatcher::lastConditionMatches(
+ return Matcher.predicates_pop_front();
+ // If there is no more predicate on the instruction itself, look at its
+ // operands.
+ for (auto &OM : Matcher.operands())
+ for (auto &OP : OM->predicates())
+ if (!isa<InstructionOperandMatcher>(OP)) {
+ std::unique_ptr<PredicateMatcher> Result = std::move(OP);
+ OM->eraseNullPredicates();
+ return Result;
+ }
+
+ llvm_unreachable("Trying to pop a condition from an InstructionMatcher with "
+ "no conditions");
+}
+
+bool GroupMatcher::candidateConditionMatches(
const PredicateMatcher &Predicate) const {
- const auto &LastCondition = conditions_back();
- return Predicate.isIdentical(*LastCondition);
+
+ if (empty()) {
+ // Sharing predicates for nested instructions is not supported yet as we
+ // currently don't hoist the GIM_RecordInsns properly; therefore we can
+ // only work on the original root instruction (InsnVarID == 0):
+ if (Predicate.getInsnVarID() != 0)
+ return false;
+ // ... otherwise an empty group can handle any predicate with no specific
+ // requirements:
+ return true;
+ }
+
+ const Matcher &Representative = **Matchers.begin();
+ const auto &RepresentativeCondition = Representative.getFirstCondition();
+ // ... if not empty, the group can only accommodate matchers with the exact
+ // same first condition:
+ return Predicate.isIdentical(RepresentativeCondition);
+}
+
+bool GroupMatcher::addMatcher(Matcher &Candidate) {
+ if (!Candidate.hasFirstCondition())
+ return false;
+
+ const PredicateMatcher &Predicate = Candidate.getFirstCondition();
+ if (!candidateConditionMatches(Predicate))
+ return false;
+
+ Matchers.push_back(&Candidate);
+ return true;
+}
+
+void GroupMatcher::finalize() {
+ assert(Conditions.empty() && "Already finalized?");
+ if (empty())
+ return;
+
+ Matcher &FirstRule = **Matchers.begin();
+ for (;;) {
+ // All the checks are expected to succeed during the first iteration:
+ for (const auto &Rule : Matchers)
+ if (!Rule->hasFirstCondition())
+ return;
+ const auto &FirstCondition = FirstRule.getFirstCondition();
+ for (unsigned I = 1, E = Matchers.size(); I < E; ++I)
+ if (!Matchers[I]->getFirstCondition().isIdentical(FirstCondition))
+ return;
+
+ Conditions.push_back(FirstRule.popFirstCondition());
+ for (unsigned I = 1, E = Matchers.size(); I < E; ++I)
+ Matchers[I]->popFirstCondition();
+ }
}
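The loop above therefore hoists the longest run of leading conditions that every member rule shares, and those conditions end up emitted once for the whole group. The same idea on plain integer sequences (an illustrative analogue, not emitter code):

    #include <vector>

    // Removes the longest common prefix from all sequences and returns it.
    std::vector<int> hoistCommonPrefix(std::vector<std::vector<int>> &Rules) {
      std::vector<int> Shared;
      if (Rules.empty())
        return Shared;
      for (;;) {
        for (const auto &R : Rules)
          if (R.empty())
            return Shared;
        const int First = Rules.front().front();
        for (const auto &R : Rules)
          if (R.front() != First)
            return Shared;
        Shared.push_back(First);
        for (auto &R : Rules)
          R.erase(R.begin());
      }
    }
    // {{1, 2, 3}, {1, 2, 4}} -> returns {1, 2}; the rules become {{3}, {4}}.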
void GroupMatcher::emit(MatchTable &Table) {
- unsigned LabelID = Table.allocateLabelID();
- if (!conditions_empty()) {
+ unsigned LabelID = ~0U;
+ if (!Conditions.empty()) {
+ LabelID = Table.allocateLabelID();
Table << MatchTable::Opcode("GIM_Try", +1)
<< MatchTable::Comment("On fail goto")
<< MatchTable::JumpTarget(LabelID) << MatchTable::LineBreak;
- for (auto &Condition : Conditions)
- Condition->emitPredicateOpcodes(
- Table, *static_cast<RuleMatcher *>(*Rules.begin()));
}
- // Emit the conditions.
- // Then checks apply the rules.
- for (const auto &Rule : Rules)
- Rule->emit(Table);
- // If we don't succeeded for that block, that means we are not going to select
- // this instruction.
- if (!conditions_empty()) {
- Table << MatchTable::Opcode("GIM_Reject") << MatchTable::LineBreak;
- Table << MatchTable::Opcode("GIR_Done", -1) << MatchTable::LineBreak
+ for (auto &Condition : Conditions)
+ Condition->emitPredicateOpcodes(
+ Table, *static_cast<RuleMatcher *>(*Matchers.begin()));
+
+ for (const auto &M : Matchers)
+ M->emit(Table);
+
+ // Exit the group
+ if (!Conditions.empty())
+ Table << MatchTable::Opcode("GIM_Reject", -1) << MatchTable::LineBreak
<< MatchTable::Label(LabelID);
+}
+
+bool SwitchMatcher::isSupportedPredicateType(const PredicateMatcher &P) {
+ return isa<InstructionOpcodeMatcher>(P) || isa<LLTOperandMatcher>(P);
+}
+
+bool SwitchMatcher::candidateConditionMatches(
+ const PredicateMatcher &Predicate) const {
+
+ if (empty()) {
+ // Sharing predicates for nested instructions is not supported yet as we
+ // currently don't hoist the GIM_RecordInsn's properly, therefore we can
+ // only work on the original root instruction (InsnVarID == 0):
+ if (Predicate.getInsnVarID() != 0)
+ return false;
+ // ... while an attempt to add even a root matcher to an empty SwitchMatcher
+ // could fail as not all the types of conditions are supported:
+ if (!isSupportedPredicateType(Predicate))
+ return false;
+ // ... or the condition might not have a proper implementation of
+ // getValue() / isIdenticalDownToValue() yet:
+ if (!Predicate.hasValue())
+ return false;
+ // ... otherwise an empty Switch can accommodate the condition with no
+ // further requirements:
+ return true;
+ }
+
+ const Matcher &CaseRepresentative = **Matchers.begin();
+ const auto &RepresentativeCondition = CaseRepresentative.getFirstCondition();
+ // Switch-cases must share the same kind of condition and path to the value it
+ // checks:
+ if (!Predicate.isIdenticalDownToValue(RepresentativeCondition))
+ return false;
+
+ const auto Value = Predicate.getValue();
+ // ... but be unique with respect to the actual value they check:
+ return Values.count(Value) == 0;
+}
+
+bool SwitchMatcher::addMatcher(Matcher &Candidate) {
+ if (!Candidate.hasFirstCondition())
+ return false;
+
+ const PredicateMatcher &Predicate = Candidate.getFirstCondition();
+ if (!candidateConditionMatches(Predicate))
+ return false;
+ const auto Value = Predicate.getValue();
+ Values.insert(Value);
+
+ Matchers.push_back(&Candidate);
+ return true;
+}
+
+void SwitchMatcher::finalize() {
+ assert(Condition == nullptr && "Already finalized");
+ assert(Values.size() == Matchers.size() && "Broken SwitchMatcher");
+ if (empty())
+ return;
+
+ std::stable_sort(Matchers.begin(), Matchers.end(),
+ [](const Matcher *L, const Matcher *R) {
+ return L->getFirstCondition().getValue() <
+ R->getFirstCondition().getValue();
+ });
+ Condition = Matchers[0]->popFirstCondition();
+ for (unsigned I = 1, E = Values.size(); I < E; ++I)
+ Matchers[I]->popFirstCondition();
+}
+
+void SwitchMatcher::emitPredicateSpecificOpcodes(const PredicateMatcher &P,
+ MatchTable &Table) {
+ assert(isSupportedPredicateType(P) && "Predicate type is not supported");
+
+ if (const auto *Condition = dyn_cast<InstructionOpcodeMatcher>(&P)) {
+ Table << MatchTable::Opcode("GIM_SwitchOpcode") << MatchTable::Comment("MI")
+ << MatchTable::IntValue(Condition->getInsnVarID());
+ return;
+ }
+ if (const auto *Condition = dyn_cast<LLTOperandMatcher>(&P)) {
+ Table << MatchTable::Opcode("GIM_SwitchType") << MatchTable::Comment("MI")
+ << MatchTable::IntValue(Condition->getInsnVarID())
+ << MatchTable::Comment("Op")
+ << MatchTable::IntValue(Condition->getOpIdx());
+ return;
+ }
+
+ llvm_unreachable("emitPredicateSpecificOpcodes is broken: can not handle a "
+ "predicate type that is claimed to be supported");
+}
+
+void SwitchMatcher::emit(MatchTable &Table) {
+ assert(Values.size() == Matchers.size() && "Broken SwitchMatcher");
+ if (empty())
+ return;
+ assert(Condition != nullptr &&
+ "Broken SwitchMatcher, hasn't been finalized?");
+
+ std::vector<unsigned> LabelIDs(Values.size());
+ std::generate(LabelIDs.begin(), LabelIDs.end(),
+ [&Table]() { return Table.allocateLabelID(); });
+ const unsigned Default = Table.allocateLabelID();
+
+ const int64_t LowerBound = Values.begin()->getRawValue();
+ const int64_t UpperBound = Values.rbegin()->getRawValue() + 1;
+
+ emitPredicateSpecificOpcodes(*Condition, Table);
+
+ Table << MatchTable::Comment("[") << MatchTable::IntValue(LowerBound)
+ << MatchTable::IntValue(UpperBound) << MatchTable::Comment(")")
+ << MatchTable::Comment("default:") << MatchTable::JumpTarget(Default);
+
+ int64_t J = LowerBound;
+ auto VI = Values.begin();
+ for (unsigned I = 0, E = Values.size(); I < E; ++I) {
+ auto V = *VI++;
+ while (J++ < V.getRawValue())
+ Table << MatchTable::IntValue(0);
+ V.turnIntoComment();
+ Table << MatchTable::LineBreak << V << MatchTable::JumpTarget(LabelIDs[I]);
+ }
+ Table << MatchTable::LineBreak;
+
+ for (unsigned I = 0, E = Values.size(); I < E; ++I) {
+ Table << MatchTable::Label(LabelIDs[I]);
+ Matchers[I]->emit(Table);
+ Table << MatchTable::Opcode("GIM_Reject") << MatchTable::LineBreak;
}
+ Table << MatchTable::Label(Default);
}
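The switch body is thus a dense jump table covering the half-open range [LowerBound, UpperBound): case values that are absent get a 0 entry that falls through to the default label. A small sketch of the same hole-filling layout (label IDs are arbitrary unsigned values here):

    #include <cstdint>
    #include <map>
    #include <vector>

    // Builds a dense table over [min, max + 1); entry 0 stands for "no case".
    // Assumes Cases is non-empty, as SwitchMatcher::emit() does.
    std::vector<unsigned> buildDenseTable(const std::map<int64_t, unsigned> &Cases) {
      const int64_t Lower = Cases.begin()->first;
      const int64_t Upper = Cases.rbegin()->first + 1;
      std::vector<unsigned> Table(Upper - Lower, 0);
      for (const auto &C : Cases)
        Table[C.first - Lower] = C.second; // jump target for this case value
      return Table;
    }
    // {21 -> L0, 23 -> L1, 24 -> L2} yields [L0, 0, L1, L2] plus a default label.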
-unsigned OperandMatcher::getInsnVarID() const { return Insn.getVarID(); }
+unsigned OperandMatcher::getInsnVarID() const { return Insn.getInsnVarID(); }
} // end anonymous namespace
diff --git a/contrib/llvm/utils/TableGen/InfoByHwMode.cpp b/contrib/llvm/utils/TableGen/InfoByHwMode.cpp
index d5a181e130a5..7d1f71cc2647 100644
--- a/contrib/llvm/utils/TableGen/InfoByHwMode.cpp
+++ b/contrib/llvm/utils/TableGen/InfoByHwMode.cpp
@@ -84,7 +84,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
std::vector<const PairType*> Pairs;
for (const auto &P : Map)
Pairs.push_back(&P);
- std::sort(Pairs.begin(), Pairs.end(), deref<std::less<PairType>>());
+ llvm::sort(Pairs.begin(), Pairs.end(), deref<std::less<PairType>>());
OS << '{';
for (unsigned i = 0, e = Pairs.size(); i != e; ++i) {
@@ -176,7 +176,7 @@ void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
std::vector<const PairType*> Pairs;
for (const auto &P : Map)
Pairs.push_back(&P);
- std::sort(Pairs.begin(), Pairs.end(), deref<std::less<PairType>>());
+ llvm::sort(Pairs.begin(), Pairs.end(), deref<std::less<PairType>>());
OS << '{';
for (unsigned i = 0, e = Pairs.size(); i != e; ++i) {
diff --git a/contrib/llvm/utils/TableGen/InfoByHwMode.h b/contrib/llvm/utils/TableGen/InfoByHwMode.h
index b2e217498888..4838198e704d 100644
--- a/contrib/llvm/utils/TableGen/InfoByHwMode.h
+++ b/contrib/llvm/utils/TableGen/InfoByHwMode.h
@@ -16,7 +16,7 @@
#define LLVM_UTILS_TABLEGEN_INFOBYHWMODE_H
#include "CodeGenHwModes.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/MachineValueType.h"
#include <map>
#include <set>
diff --git a/contrib/llvm/utils/TableGen/InstrDocsEmitter.cpp b/contrib/llvm/utils/TableGen/InstrDocsEmitter.cpp
index fa9ee9569427..65cb28cd17a3 100644
--- a/contrib/llvm/utils/TableGen/InstrDocsEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/InstrDocsEmitter.cpp
@@ -109,6 +109,7 @@ void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
FLAG(isBarrier)
FLAG(isCall)
FLAG(isAdd)
+ FLAG(isTrap)
FLAG(canFoldAsLoad)
FLAG(mayLoad)
//FLAG(mayLoad_Unset) // Deliberately omitted.
diff --git a/contrib/llvm/utils/TableGen/InstrInfoEmitter.cpp b/contrib/llvm/utils/TableGen/InstrInfoEmitter.cpp
index 379e3245d066..a492daac0d09 100644
--- a/contrib/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -16,6 +16,7 @@
#include "CodeGenInstruction.h"
#include "CodeGenSchedule.h"
#include "CodeGenTarget.h"
+#include "PredicateExpander.h"
#include "SequenceToOffsetTable.h"
#include "TableGenBackends.h"
#include "llvm/ADT/ArrayRef.h"
@@ -59,6 +60,17 @@ private:
typedef std::map<std::map<unsigned, unsigned>,
std::vector<std::string>> OpNameMapTy;
typedef std::map<std::string, unsigned>::iterator StrUintMapIter;
+
+ /// Generate member functions in the target-specific GenInstrInfo class.
+ ///
+ /// This method is used to custom expand TIIPredicate definitions.
+ /// See file llvm/Target/TargetInstPredicates.td for a description of what is
+ /// a TIIPredicate and how to use it.
+ void emitTIIHelperMethods(raw_ostream &OS);
+
+ /// Expand TIIPredicate definitions to functions that accept a const MCInst
+ /// reference.
+ void emitMCIIHelperMethods(raw_ostream &OS);
void emitRecord(const CodeGenInstruction &Inst, unsigned Num,
Record *InstrInfo,
std::map<std::vector<Record*>, unsigned> &EL,
@@ -339,6 +351,74 @@ void InstrInfoEmitter::emitOperandTypesEnum(raw_ostream &OS,
OS << "#endif // GET_INSTRINFO_OPERAND_TYPES_ENUM\n\n";
}
+void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS) {
+ RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate");
+ if (TIIPredicates.empty())
+ return;
+
+ CodeGenTarget &Target = CDP.getTargetInfo();
+ const StringRef TargetName = Target.getName();
+ formatted_raw_ostream FOS(OS);
+
+ FOS << "#ifdef GET_GENINSTRINFO_MC_DECL\n";
+ FOS << "#undef GET_GENINSTRINFO_MC_DECL\n\n";
+
+ FOS << "namespace llvm {\n";
+ FOS << "class MCInst;\n\n";
+
+ FOS << "namespace " << TargetName << "_MC {\n\n";
+
+ for (const Record *Rec : TIIPredicates) {
+ FOS << "bool " << Rec->getValueAsString("FunctionName")
+ << "(const MCInst &MI);\n";
+ }
+
+ FOS << "\n} // end " << TargetName << "_MC namespace\n";
+ FOS << "} // end llvm namespace\n\n";
+
+ FOS << "#endif // GET_GENINSTRINFO_MC_DECL\n\n";
+
+ FOS << "#ifdef GET_GENINSTRINFO_MC_HELPERS\n";
+ FOS << "#undef GET_GENINSTRINFO_MC_HELPERS\n\n";
+
+ FOS << "namespace llvm {\n";
+ FOS << "namespace " << TargetName << "_MC {\n\n";
+
+ PredicateExpander PE;
+ PE.setExpandForMC(true);
+ for (const Record *Rec : TIIPredicates) {
+ FOS << "bool " << Rec->getValueAsString("FunctionName");
+ FOS << "(const MCInst &MI) {\n";
+ FOS << " return ";
+ PE.expandPredicate(FOS, Rec->getValueAsDef("Pred"));
+ FOS << ";\n}\n";
+ }
+
+ FOS << "\n} // end " << TargetName << "_MC namespace\n";
+ FOS << "} // end llvm namespace\n\n";
+
+ FOS << "#endif // GET_GENISTRINFO_MC_HELPERS\n";
+}
+
+void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS) {
+ RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate");
+ if (TIIPredicates.empty())
+ return;
+
+ formatted_raw_ostream FOS(OS);
+ PredicateExpander PE;
+ PE.setExpandForMC(false);
+ PE.setIndentLevel(2);
+
+ for (const Record *Rec : TIIPredicates) {
+ FOS << "\n static bool " << Rec->getValueAsString("FunctionName");
+ FOS << "(const MachineInstr &MI) {\n";
+ FOS << " return ";
+ PE.expandPredicate(FOS, Rec->getValueAsDef("Pred"));
+ FOS << ";\n }\n";
+ }
+}
+
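To make the two flavours concrete: for a hypothetical record with FunctionName "isZeroIdiom" on a target called MyTarget, the emitters above would produce code along these lines (an illustrative excerpt, not a complete translation unit; the body depends entirely on the Pred record):

    // GET_GENINSTRINFO_MC_HELPERS, from emitMCIIHelperMethods():
    namespace llvm {
    namespace MyTarget_MC {
    bool isZeroIdiom(const MCInst &MI) {
      return MI.getOpcode() == MyTarget::XOR32rr &&
             MI.getOperand(1).getReg() == MI.getOperand(2).getReg();
    }
    } // end MyTarget_MC namespace
    } // end llvm namespace

    // GET_INSTRINFO_HEADER, from emitTIIHelperMethods(), as a member of
    // MyTargetGenInstrInfo:
    //   static bool isZeroIdiom(const MachineInstr &MI) {
    //     return MI.getOpcode() == MyTarget::XOR32rr &&
    //            MI.getOperand(1).getReg() == MI.getOperand(2).getReg();
    //   }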
//===----------------------------------------------------------------------===//
// Main Output.
//===----------------------------------------------------------------------===//
@@ -435,9 +515,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
OS << "struct " << ClassName << " : public TargetInstrInfo {\n"
<< " explicit " << ClassName
<< "(int CFSetupOpcode = -1, int CFDestroyOpcode = -1, int CatchRetOpcode = -1, int ReturnOpcode = -1);\n"
- << " ~" << ClassName << "() override = default;\n"
- << "};\n";
- OS << "} // end llvm namespace\n";
+ << " ~" << ClassName << "() override = default;\n";
+
+ emitTIIHelperMethods(OS);
+
+ OS << "\n};\n} // end llvm namespace\n";
OS << "#endif // GET_INSTRINFO_HEADER\n\n";
@@ -461,6 +543,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
emitOperandNameMappings(OS, Target, NumberedInstructions);
emitOperandTypesEnum(OS, Target);
+
+ emitMCIIHelperMethods(OS);
}
void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
@@ -480,6 +564,8 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
<< Inst.TheDef->getValueAsInt("Size") << ",\t"
<< SchedModels.getSchedClassIdx(Inst) << ",\t0";
+ CodeGenTarget &Target = CDP.getTargetInfo();
+
// Emit all of the target independent flags...
if (Inst.isPseudo) OS << "|(1ULL<<MCID::Pseudo)";
if (Inst.isReturn) OS << "|(1ULL<<MCID::Return)";
@@ -487,8 +573,10 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
if (Inst.isIndirectBranch) OS << "|(1ULL<<MCID::IndirectBranch)";
if (Inst.isCompare) OS << "|(1ULL<<MCID::Compare)";
if (Inst.isMoveImm) OS << "|(1ULL<<MCID::MoveImm)";
+ if (Inst.isMoveReg) OS << "|(1ULL<<MCID::MoveReg)";
if (Inst.isBitcast) OS << "|(1ULL<<MCID::Bitcast)";
if (Inst.isAdd) OS << "|(1ULL<<MCID::Add)";
+ if (Inst.isTrap) OS << "|(1ULL<<MCID::Trap)";
if (Inst.isSelect) OS << "|(1ULL<<MCID::Select)";
if (Inst.isBarrier) OS << "|(1ULL<<MCID::Barrier)";
if (Inst.hasDelaySlot) OS << "|(1ULL<<MCID::DelaySlot)";
@@ -508,8 +596,10 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
if (Inst.Operands.isVariadic)OS << "|(1ULL<<MCID::Variadic)";
if (Inst.hasSideEffects) OS << "|(1ULL<<MCID::UnmodeledSideEffects)";
if (Inst.isAsCheapAsAMove) OS << "|(1ULL<<MCID::CheapAsAMove)";
- if (Inst.hasExtraSrcRegAllocReq) OS << "|(1ULL<<MCID::ExtraSrcRegAllocReq)";
- if (Inst.hasExtraDefRegAllocReq) OS << "|(1ULL<<MCID::ExtraDefRegAllocReq)";
+ if (!Target.getAllowRegisterRenaming() || Inst.hasExtraSrcRegAllocReq)
+ OS << "|(1ULL<<MCID::ExtraSrcRegAllocReq)";
+ if (!Target.getAllowRegisterRenaming() || Inst.hasExtraDefRegAllocReq)
+ OS << "|(1ULL<<MCID::ExtraDefRegAllocReq)";
if (Inst.isRegSequence) OS << "|(1ULL<<MCID::RegSequence)";
if (Inst.isExtractSubreg) OS << "|(1ULL<<MCID::ExtractSubreg)";
if (Inst.isInsertSubreg) OS << "|(1ULL<<MCID::InsertSubreg)";
@@ -550,7 +640,6 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
else
OS << "OperandInfo" << OpInfo.find(OperandInfo)->second;
- CodeGenTarget &Target = CDP.getTargetInfo();
if (Inst.HasComplexDeprecationPredicate)
// Emit a function pointer to the complex predicate method.
OS << ", -1 "
diff --git a/contrib/llvm/utils/TableGen/IntrinsicEmitter.cpp b/contrib/llvm/utils/TableGen/IntrinsicEmitter.cpp
index ba793ad9b938..06e44e3b57c1 100644
--- a/contrib/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -34,7 +34,7 @@ public:
IntrinsicEmitter(RecordKeeper &R, bool T)
: Records(R), TargetOnly(T) {}
- void run(raw_ostream &OS);
+ void run(raw_ostream &OS, bool Enums);
void EmitPrefix(raw_ostream &OS);
@@ -56,7 +56,7 @@ public:
// IntrinsicEmitter Implementation
//===----------------------------------------------------------------------===//
-void IntrinsicEmitter::run(raw_ostream &OS) {
+void IntrinsicEmitter::run(raw_ostream &OS, bool Enums) {
emitSourceFileHeader("Intrinsic Function Source Fragment", OS);
CodeGenIntrinsicTable Ints(Records, TargetOnly);
@@ -66,29 +66,31 @@ void IntrinsicEmitter::run(raw_ostream &OS) {
EmitPrefix(OS);
- // Emit the enum information.
- EmitEnumInfo(Ints, OS);
-
- // Emit the target metadata.
- EmitTargetInfo(Ints, OS);
+ if (Enums) {
+ // Emit the enum information.
+ EmitEnumInfo(Ints, OS);
+ } else {
+ // Emit the target metadata.
+ EmitTargetInfo(Ints, OS);
- // Emit the intrinsic ID -> name table.
- EmitIntrinsicToNameTable(Ints, OS);
+ // Emit the intrinsic ID -> name table.
+ EmitIntrinsicToNameTable(Ints, OS);
- // Emit the intrinsic ID -> overload table.
- EmitIntrinsicToOverloadTable(Ints, OS);
+ // Emit the intrinsic ID -> overload table.
+ EmitIntrinsicToOverloadTable(Ints, OS);
- // Emit the intrinsic declaration generator.
- EmitGenerator(Ints, OS);
+ // Emit the intrinsic declaration generator.
+ EmitGenerator(Ints, OS);
- // Emit the intrinsic parameter attributes.
- EmitAttributes(Ints, OS);
+ // Emit the intrinsic parameter attributes.
+ EmitAttributes(Ints, OS);
- // Emit code to translate GCC builtins into LLVM intrinsics.
- EmitIntrinsicToBuiltinMap(Ints, true, OS);
+ // Emit code to translate GCC builtins into LLVM intrinsics.
+ EmitIntrinsicToBuiltinMap(Ints, true, OS);
- // Emit code to translate MS builtins into LLVM intrinsics.
- EmitIntrinsicToBuiltinMap(Ints, false, OS);
+ // Emit code to translate MS builtins into LLVM intrinsics.
+ EmitIntrinsicToBuiltinMap(Ints, false, OS);
+ }
EmitSuffix(OS);
}
@@ -172,7 +174,7 @@ void IntrinsicEmitter::EmitIntrinsicToOverloadTable(
}
-// NOTE: This must be kept in synch with the copy in lib/VMCore/Function.cpp!
+// NOTE: This must be kept in synch with the copy in lib/IR/Function.cpp!
enum IIT_Info {
// Common values should be encoded with 0-15.
IIT_Done = 0,
@@ -217,7 +219,8 @@ enum IIT_Info {
IIT_V1024 = 37,
IIT_STRUCT6 = 38,
IIT_STRUCT7 = 39,
- IIT_STRUCT8 = 40
+ IIT_STRUCT8 = 40,
+ IIT_F128 = 41
};
static void EncodeFixedValueType(MVT::SimpleValueType VT,
@@ -240,6 +243,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
case MVT::f16: return Sig.push_back(IIT_F16);
case MVT::f32: return Sig.push_back(IIT_F32);
case MVT::f64: return Sig.push_back(IIT_F64);
+ case MVT::f128: return Sig.push_back(IIT_F128);
case MVT::token: return Sig.push_back(IIT_TOKEN);
case MVT::Metadata: return Sig.push_back(IIT_METADATA);
case MVT::x86mmx: return Sig.push_back(IIT_MMX);
@@ -839,6 +843,12 @@ void IntrinsicEmitter::EmitIntrinsicToBuiltinMap(
OS << "#endif\n\n";
}
-void llvm::EmitIntrinsics(RecordKeeper &RK, raw_ostream &OS, bool TargetOnly) {
- IntrinsicEmitter(RK, TargetOnly).run(OS);
+void llvm::EmitIntrinsicEnums(RecordKeeper &RK, raw_ostream &OS,
+ bool TargetOnly) {
+ IntrinsicEmitter(RK, TargetOnly).run(OS, /*Enums=*/true);
+}
+
+void llvm::EmitIntrinsicImpl(RecordKeeper &RK, raw_ostream &OS,
+ bool TargetOnly) {
+ IntrinsicEmitter(RK, TargetOnly).run(OS, /*Enums=*/false);
}
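A sketch of how a backend driver would now request the two halves separately (assuming these entry points are declared in TableGenBackends.h alongside the other emitters):

    #include "TableGenBackends.h"
    #include "llvm/Support/raw_ostream.h"
    #include "llvm/TableGen/Record.h"

    // The enum values and the implementation tables now go to separate outputs.
    void emitIntrinsicFiles(llvm::RecordKeeper &RK, llvm::raw_ostream &EnumOS,
                            llvm::raw_ostream &ImplOS) {
      llvm::EmitIntrinsicEnums(RK, EnumOS, /*TargetOnly=*/false);
      llvm::EmitIntrinsicImpl(RK, ImplOS, /*TargetOnly=*/false);
    }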
diff --git a/contrib/llvm/utils/TableGen/PredicateExpander.cpp b/contrib/llvm/utils/TableGen/PredicateExpander.cpp
new file mode 100644
index 000000000000..68eb32794a02
--- /dev/null
+++ b/contrib/llvm/utils/TableGen/PredicateExpander.cpp
@@ -0,0 +1,262 @@
+//===--------------------- PredicateExpander.cpp --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Functionalities used by the Tablegen backends to expand machine predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PredicateExpander.h"
+
+namespace llvm {
+
+void PredicateExpander::expandTrue(formatted_raw_ostream &OS) { OS << "true"; }
+void PredicateExpander::expandFalse(formatted_raw_ostream &OS) {
+ OS << "false";
+}
+
+void PredicateExpander::expandCheckImmOperand(formatted_raw_ostream &OS,
+ int OpIndex, int ImmVal) {
+ OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
+ << ").getImm() " << (shouldNegate() ? "!= " : "== ") << ImmVal;
+}
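As a quick illustration of the text this writes (a sketch assuming it is built alongside the TableGen utilities):

    #include "PredicateExpander.h"
    #include "llvm/Support/FormattedStream.h"

    void demoExpandImm(llvm::formatted_raw_ostream &OS) {
      llvm::PredicateExpander PE; // defaults: by-ref calls, not negated
      PE.expandCheckImmOperand(OS, /*OpIndex=*/1, /*ImmVal=*/0);
      // writes: MI.getOperand(1).getImm() == 0
      PE.flipNegatePredicate();
      PE.expandCheckImmOperand(OS, /*OpIndex=*/1, /*ImmVal=*/0);
      // writes: MI.getOperand(1).getImm() != 0
    }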
+
+void PredicateExpander::expandCheckImmOperand(formatted_raw_ostream &OS,
+ int OpIndex, StringRef ImmVal) {
+ OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
+ << ").getImm() " << (shouldNegate() ? "!= " : "== ") << ImmVal;
+}
+
+void PredicateExpander::expandCheckRegOperand(formatted_raw_ostream &OS,
+ int OpIndex, const Record *Reg) {
+ assert(Reg->isSubClassOf("Register") && "Expected a register Record!");
+
+ OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
+ << ").getReg() " << (shouldNegate() ? "!= " : "== ");
+ const StringRef Str = Reg->getValueAsString("Namespace");
+ if (!Str.empty())
+ OS << Str << "::";
+ OS << Reg->getName();
+}
+
+void PredicateExpander::expandCheckInvalidRegOperand(formatted_raw_ostream &OS,
+ int OpIndex) {
+ OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
+ << ").getReg() " << (shouldNegate() ? "!= " : "== ") << "0";
+}
+
+void PredicateExpander::expandCheckSameRegOperand(formatted_raw_ostream &OS,
+ int First, int Second) {
+ OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << First
+ << ").getReg() " << (shouldNegate() ? "!=" : "==") << " MI"
+ << (isByRef() ? "." : "->") << "getOperand(" << Second << ").getReg()";
+}
+
+void PredicateExpander::expandCheckNumOperands(formatted_raw_ostream &OS,
+ int NumOps) {
+ OS << "MI" << (isByRef() ? "." : "->") << "getNumOperands() "
+ << (shouldNegate() ? "!= " : "== ") << NumOps;
+}
+
+void PredicateExpander::expandCheckOpcode(formatted_raw_ostream &OS,
+ const Record *Inst) {
+ OS << "MI" << (isByRef() ? "." : "->") << "getOpcode() "
+ << (shouldNegate() ? "!= " : "== ") << Inst->getValueAsString("Namespace")
+ << "::" << Inst->getName();
+}
+
+void PredicateExpander::expandCheckOpcode(formatted_raw_ostream &OS,
+ const RecVec &Opcodes) {
+ assert(!Opcodes.empty() && "Expected at least one opcode to check!");
+ bool First = true;
+
+ if (Opcodes.size() == 1) {
+ OS << "( ";
+ expandCheckOpcode(OS, Opcodes[0]);
+ OS << " )";
+ return;
+ }
+
+ OS << '(';
+ increaseIndentLevel();
+ for (const Record *Rec : Opcodes) {
+ OS << '\n';
+ OS.PadToColumn(getIndentLevel() * 2);
+ if (!First)
+ OS << (shouldNegate() ? "&& " : "|| ");
+
+ expandCheckOpcode(OS, Rec);
+ First = false;
+ }
+
+ OS << '\n';
+ decreaseIndentLevel();
+ OS.PadToColumn(getIndentLevel() * 2);
+ OS << ')';
+}
+
+void PredicateExpander::expandCheckPseudo(formatted_raw_ostream &OS,
+ const RecVec &Opcodes) {
+ if (shouldExpandForMC())
+ expandFalse(OS);
+ else
+ expandCheckOpcode(OS, Opcodes);
+}
+
+void PredicateExpander::expandPredicateSequence(formatted_raw_ostream &OS,
+ const RecVec &Sequence,
+ bool IsCheckAll) {
+ assert(!Sequence.empty() && "Found an invalid empty predicate set!");
+ if (Sequence.size() == 1)
+ return expandPredicate(OS, Sequence[0]);
+
+ // Okay, there is more than one predicate in the set.
+ bool First = true;
+ OS << (shouldNegate() ? "!(" : "(");
+ increaseIndentLevel();
+
+ bool OldValue = shouldNegate();
+ setNegatePredicate(false);
+ for (const Record *Rec : Sequence) {
+ OS << '\n';
+ OS.PadToColumn(getIndentLevel() * 2);
+ if (!First)
+ OS << (IsCheckAll ? "&& " : "|| ");
+ expandPredicate(OS, Rec);
+ First = false;
+ }
+ OS << '\n';
+ decreaseIndentLevel();
+ OS.PadToColumn(getIndentLevel() * 2);
+ OS << ')';
+ setNegatePredicate(OldValue);
+}
+
+void PredicateExpander::expandTIIFunctionCall(formatted_raw_ostream &OS,
+ StringRef TargetName,
+ StringRef MethodName) {
+ OS << (shouldNegate() ? "!" : "");
+ if (shouldExpandForMC())
+ OS << TargetName << "_MC::";
+ else
+ OS << TargetName << "Gen"
+ << "InstrInfo::";
+ OS << MethodName << (isByRef() ? "(MI)" : "(*MI)");
+}
+
+void PredicateExpander::expandCheckIsRegOperand(formatted_raw_ostream &OS,
+ int OpIndex) {
+ OS << (shouldNegate() ? "!" : "") << "MI" << (isByRef() ? "." : "->")
+ << "getOperand(" << OpIndex << ").isReg() ";
+}
+
+void PredicateExpander::expandCheckIsImmOperand(formatted_raw_ostream &OS,
+ int OpIndex) {
+ OS << (shouldNegate() ? "!" : "") << "MI" << (isByRef() ? "." : "->")
+ << "getOperand(" << OpIndex << ").isImm() ";
+}
+
+void PredicateExpander::expandCheckFunctionPredicate(formatted_raw_ostream &OS,
+ StringRef MCInstFn,
+ StringRef MachineInstrFn) {
+ OS << (shouldExpandForMC() ? MCInstFn : MachineInstrFn)
+ << (isByRef() ? "(MI)" : "(*MI)");
+}
+
+void PredicateExpander::expandCheckNonPortable(formatted_raw_ostream &OS,
+ StringRef Code) {
+ if (shouldExpandForMC())
+ return expandFalse(OS);
+
+ OS << '(' << Code << ')';
+}
+
+void PredicateExpander::expandPredicate(formatted_raw_ostream &OS,
+ const Record *Rec) {
+ OS.flush();
+ unsigned ColNum = getIndentLevel() * 2;
+ if (OS.getColumn() < ColNum)
+ OS.PadToColumn(ColNum);
+
+ if (Rec->isSubClassOf("MCTrue")) {
+ if (shouldNegate())
+ return expandFalse(OS);
+ return expandTrue(OS);
+ }
+
+ if (Rec->isSubClassOf("MCFalse")) {
+ if (shouldNegate())
+ return expandTrue(OS);
+ return expandFalse(OS);
+ }
+
+ if (Rec->isSubClassOf("CheckNot")) {
+ flipNegatePredicate();
+ expandPredicate(OS, Rec->getValueAsDef("Pred"));
+ flipNegatePredicate();
+ return;
+ }
+
+ if (Rec->isSubClassOf("CheckIsRegOperand"))
+ return expandCheckIsRegOperand(OS, Rec->getValueAsInt("OpIndex"));
+
+ if (Rec->isSubClassOf("CheckIsImmOperand"))
+ return expandCheckIsImmOperand(OS, Rec->getValueAsInt("OpIndex"));
+
+ if (Rec->isSubClassOf("CheckRegOperand"))
+ return expandCheckRegOperand(OS, Rec->getValueAsInt("OpIndex"),
+ Rec->getValueAsDef("Reg"));
+
+ if (Rec->isSubClassOf("CheckInvalidRegOperand"))
+ return expandCheckInvalidRegOperand(OS, Rec->getValueAsInt("OpIndex"));
+
+ if (Rec->isSubClassOf("CheckImmOperand"))
+ return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"),
+ Rec->getValueAsInt("ImmVal"));
+
+ if (Rec->isSubClassOf("CheckImmOperand_s"))
+ return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"),
+ Rec->getValueAsString("ImmVal"));
+
+ if (Rec->isSubClassOf("CheckSameRegOperand"))
+ return expandCheckSameRegOperand(OS, Rec->getValueAsInt("FirstIndex"),
+ Rec->getValueAsInt("SecondIndex"));
+
+ if (Rec->isSubClassOf("CheckNumOperands"))
+ return expandCheckNumOperands(OS, Rec->getValueAsInt("NumOps"));
+
+ if (Rec->isSubClassOf("CheckPseudo"))
+ return expandCheckPseudo(OS, Rec->getValueAsListOfDefs("ValidOpcodes"));
+
+ if (Rec->isSubClassOf("CheckOpcode"))
+ return expandCheckOpcode(OS, Rec->getValueAsListOfDefs("ValidOpcodes"));
+
+ if (Rec->isSubClassOf("CheckAll"))
+ return expandPredicateSequence(OS, Rec->getValueAsListOfDefs("Predicates"),
+ /* AllOf */ true);
+
+ if (Rec->isSubClassOf("CheckAny"))
+ return expandPredicateSequence(OS, Rec->getValueAsListOfDefs("Predicates"),
+ /* IsCheckAll */ false);
+
+ if (Rec->isSubClassOf("CheckFunctionPredicate"))
+ return expandCheckFunctionPredicate(
+ OS, Rec->getValueAsString("MCInstFnName"),
+ Rec->getValueAsString("MachineInstrFnName"));
+
+ if (Rec->isSubClassOf("CheckNonPortable"))
+ return expandCheckNonPortable(OS, Rec->getValueAsString("CodeBlock"));
+
+ if (Rec->isSubClassOf("TIIPredicate"))
+ return expandTIIFunctionCall(OS, Rec->getValueAsString("TargetName"),
+ Rec->getValueAsString("FunctionName"));
+
+ llvm_unreachable("No known rules to expand this MCInstPredicate");
+}
+
+} // namespace llvm
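For illustration (not part of the patch), a predicate equivalent to CheckAll<[CheckIsRegOperand<0>, CheckIsImmOperand<2>]>, using the class names declared in llvm/Target/TargetInstrPredicate.td, would be expanded by expandPredicate with the default by-reference settings into roughly:

  (
    MI.getOperand(0).isReg()
    && MI.getOperand(2).isImm()
  )

A CheckNot wrapper is handled by flipping the negate flag before expanding its operand, as shown in expandPredicate above.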
diff --git a/contrib/llvm/utils/TableGen/PredicateExpander.h b/contrib/llvm/utils/TableGen/PredicateExpander.h
new file mode 100644
index 000000000000..398b376f7a83
--- /dev/null
+++ b/contrib/llvm/utils/TableGen/PredicateExpander.h
@@ -0,0 +1,86 @@
+//===--------------------- PredicateExpander.h ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Functionality used by the TableGen backends to expand machine predicates.
+///
+/// See file llvm/Target/TargetInstrPredicate.td for a full list and description
+/// of all the supported MCInstPredicate classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_PREDICATEEXPANDER_H
+#define LLVM_UTILS_TABLEGEN_PREDICATEEXPANDER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/TableGen/Record.h"
+
+namespace llvm {
+
+class formatted_raw_ostream;
+
+class PredicateExpander {
+ bool EmitCallsByRef;
+ bool NegatePredicate;
+ bool ExpandForMC;
+ unsigned IndentLevel;
+
+ PredicateExpander(const PredicateExpander &) = delete;
+ PredicateExpander &operator=(const PredicateExpander &) = delete;
+
+public:
+ PredicateExpander()
+ : EmitCallsByRef(true), NegatePredicate(false), ExpandForMC(false),
+ IndentLevel(1U) {}
+ bool isByRef() const { return EmitCallsByRef; }
+ bool shouldNegate() const { return NegatePredicate; }
+ bool shouldExpandForMC() const { return ExpandForMC; }
+ unsigned getIndentLevel() const { return IndentLevel; }
+
+ void setByRef(bool Value) { EmitCallsByRef = Value; }
+ void flipNegatePredicate() { NegatePredicate = !NegatePredicate; }
+ void setNegatePredicate(bool Value) { NegatePredicate = Value; }
+ void setExpandForMC(bool Value) { ExpandForMC = Value; }
+ void increaseIndentLevel() { ++IndentLevel; }
+ void decreaseIndentLevel() { --IndentLevel; }
+ void setIndentLevel(unsigned Level) { IndentLevel = Level; }
+
+ using RecVec = std::vector<Record *>;
+ void expandTrue(formatted_raw_ostream &OS);
+ void expandFalse(formatted_raw_ostream &OS);
+ void expandCheckImmOperand(formatted_raw_ostream &OS, int OpIndex,
+ int ImmVal);
+ void expandCheckImmOperand(formatted_raw_ostream &OS, int OpIndex,
+ StringRef ImmVal);
+ void expandCheckRegOperand(formatted_raw_ostream &OS, int OpIndex,
+ const Record *Reg);
+ void expandCheckSameRegOperand(formatted_raw_ostream &OS, int First,
+ int Second);
+ void expandCheckNumOperands(formatted_raw_ostream &OS, int NumOps);
+ void expandCheckOpcode(formatted_raw_ostream &OS, const Record *Inst);
+
+ void expandCheckPseudo(formatted_raw_ostream &OS, const RecVec &Opcodes);
+ void expandCheckOpcode(formatted_raw_ostream &OS, const RecVec &Opcodes);
+ void expandPredicateSequence(formatted_raw_ostream &OS,
+ const RecVec &Sequence, bool IsCheckAll);
+ void expandTIIFunctionCall(formatted_raw_ostream &OS, StringRef TargetName,
+ StringRef MethodName);
+ void expandCheckIsRegOperand(formatted_raw_ostream &OS, int OpIndex);
+ void expandCheckIsImmOperand(formatted_raw_ostream &OS, int OpIndex);
+ void expandCheckInvalidRegOperand(formatted_raw_ostream &OS, int OpIndex);
+ void expandCheckFunctionPredicate(formatted_raw_ostream &OS,
+ StringRef MCInstFn,
+ StringRef MachineInstrFn);
+ void expandCheckNonPortable(formatted_raw_ostream &OS, StringRef CodeBlock);
+ void expandPredicate(formatted_raw_ostream &OS, const Record *Rec);
+};
+
+} // namespace llvm
+
+#endif
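A minimal usage sketch for a TableGen backend driving this class (the output stream OS and the MCInstPredicate record PredRec are assumed to be supplied by the caller; this is illustrative only):

  PredicateExpander PE;
  PE.setByRef(false);      // expand to MI->getOperand(...) rather than MI.getOperand(...)
  PE.setExpandForMC(true); // route TIIPredicate calls through the <Target>_MC:: helpers
  PE.setIndentLevel(2);
  PE.expandPredicate(OS, PredRec);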
diff --git a/contrib/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/contrib/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
index 63bdd36235a0..a363015730f3 100644
--- a/contrib/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -120,13 +120,13 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
}
void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
- DEBUG(dbgs() << "Pseudo definition: " << Rec->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Pseudo definition: " << Rec->getName() << "\n");
// Validate that the result pattern has the correct number and types
// of arguments for the instruction it references.
DagInit *Dag = Rec->getValueAsDag("ResultInst");
assert(Dag && "Missing result instruction in pseudo expansion!");
- DEBUG(dbgs() << " Result: " << *Dag << "\n");
+ LLVM_DEBUG(dbgs() << " Result: " << *Dag << "\n");
DefInit *OpDef = dyn_cast<DefInit>(Dag->getOperator());
if (!OpDef)
@@ -170,7 +170,7 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
for (unsigned i = 0, e = SourceInsn.Operands.size(); i != e; ++i)
SourceOperands[SourceInsn.Operands[i].Name] = i;
- DEBUG(dbgs() << " Operand mapping:\n");
+ LLVM_DEBUG(dbgs() << " Operand mapping:\n");
for (unsigned i = 0, e = Insn.Operands.size(); i != e; ++i) {
// We've already handled constant values. Just map instruction operands
// here.
@@ -188,7 +188,8 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
OperandMap[Insn.Operands[i].MIOperandNo + I].Data.Operand =
SourceOp->getValue();
- DEBUG(dbgs() << " " << SourceOp->getValue() << " ==> " << i << "\n");
+ LLVM_DEBUG(dbgs() << " " << SourceOp->getValue() << " ==> " << i
+ << "\n");
}
Expansions.push_back(PseudoExpansion(SourceInsn, Insn, OperandMap));
diff --git a/contrib/llvm/utils/TableGen/RISCVCompressInstEmitter.cpp b/contrib/llvm/utils/TableGen/RISCVCompressInstEmitter.cpp
new file mode 100644
index 000000000000..e03663b40f8a
--- /dev/null
+++ b/contrib/llvm/utils/TableGen/RISCVCompressInstEmitter.cpp
@@ -0,0 +1,810 @@
+//===- RISCVCompressInstEmitter.cpp - Generator for RISCV Compression -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// RISCVCompressInstEmitter implements a tablegen-driven CompressPat based
+// RISCV Instruction Compression mechanism.
+//
+//===--------------------------------------------------------------===//
+//
+// RISCVCompressInstEmitter implements a tablegen-driven CompressPat Instruction
+// Compression mechanism for generating RISCV compressed instructions
+// (C ISA Extension) from the expanded instruction form.
+
+// This TableGen backend processes CompressPat declarations in a
+// .td file and generates all the checks required to validate the pattern
+// declarations and to validate the input and output operands, so that the
+// correct compressed instructions are generated. The checks cover the
+// different operand types: register operands, immediate operands, fixed
+// register inputs and fixed immediate inputs.
+//
+// Example:
+// class CompressPat<dag input, dag output> {
+// dag Input = input;
+// dag Output = output;
+// list<Predicate> Predicates = [];
+// }
+//
+// let Predicates = [HasStdExtC] in {
+// def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs1, GPRNoX0:$rs2),
+// (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+// }
+//
+// The result is an auto-generated header file
+// 'RISCVGenCompressInstEmitter.inc' which exports two functions for
+// compressing/uncompressing MCInst instructions, plus
+// some helper functions:
+//
+// bool compressInst(MCInst& OutInst, const MCInst &MI,
+// const MCSubtargetInfo &STI,
+// MCContext &Context);
+//
+// bool uncompressInst(MCInst& OutInst, const MCInst &MI,
+// const MCRegisterInfo &MRI,
+// const MCSubtargetInfo &STI);
+//
+// Clients that include this auto-generated header file and
+// invoke these functions can compress an instruction before emitting
+// it in the target-specific ASM or ELF streamer, or can uncompress
+// an instruction before printing it when the expanded instruction
+// format is favored.
+
+//===----------------------------------------------------------------------===//
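As an illustration of the intended use (the wrapper function and streamer calls below are assumptions, not part of the generated file), a client could include the generated code and try compression before emitting:

  #define GEN_COMPRESS_INSTR
  #include "RISCVGenCompressInstEmitter.inc"

  static void emitMaybeCompressed(MCStreamer &Out, const MCInst &MI,
                                  const MCSubtargetInfo &STI, MCContext &Ctx) {
    MCInst CInst;
    if (compressInst(CInst, MI, STI, Ctx))
      Out.EmitInstruction(CInst, STI); // emit the compressed form
    else
      Out.EmitInstruction(MI, STI);    // fall back to the expanded form
  }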
+
+#include "CodeGenInstruction.h"
+#include "CodeGenTarget.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <vector>
+using namespace llvm;
+
+#define DEBUG_TYPE "compress-inst-emitter"
+
+namespace {
+class RISCVCompressInstEmitter {
+ struct OpData {
+ enum MapKind { Operand, Imm, Reg };
+ MapKind Kind;
+ union {
+ unsigned Operand; // Operand number mapped to.
+ uint64_t Imm; // Integer immediate value.
+ Record *Reg; // Physical register.
+ } Data;
+ int TiedOpIdx = -1; // Tied operand index within the instruction.
+ };
+ struct CompressPat {
+ CodeGenInstruction Source; // The source instruction definition.
+ CodeGenInstruction Dest; // The destination instruction to transform to.
+ std::vector<Record *>
+ PatReqFeatures; // Required target features to enable pattern.
+ IndexedMap<OpData>
+ SourceOperandMap; // Maps operands in the Source Instruction to
+ // the corresponding Dest instruction operand.
+ IndexedMap<OpData>
+ DestOperandMap; // Maps operands in the Dest Instruction
+ // to the corresponding Source instruction operand.
+ CompressPat(CodeGenInstruction &S, CodeGenInstruction &D,
+ std::vector<Record *> RF, IndexedMap<OpData> &SourceMap,
+ IndexedMap<OpData> &DestMap)
+ : Source(S), Dest(D), PatReqFeatures(RF), SourceOperandMap(SourceMap),
+ DestOperandMap(DestMap) {}
+ };
+
+ RecordKeeper &Records;
+ CodeGenTarget Target;
+ SmallVector<CompressPat, 4> CompressPatterns;
+
+ void addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Inst,
+ IndexedMap<OpData> &OperandMap, bool IsSourceInst);
+ void evaluateCompressPat(Record *Compress);
+ void emitCompressInstEmitter(raw_ostream &o, bool Compress);
+ bool validateTypes(Record *SubType, Record *Type, bool IsSourceInst);
+ bool validateRegister(Record *Reg, Record *RegClass);
+ void createDagOperandMapping(Record *Rec, StringMap<unsigned> &SourceOperands,
+ StringMap<unsigned> &DestOperands,
+ DagInit *SourceDag, DagInit *DestDag,
+ IndexedMap<OpData> &SourceOperandMap);
+
+ void createInstOperandMapping(Record *Rec, DagInit *SourceDag,
+ DagInit *DestDag,
+ IndexedMap<OpData> &SourceOperandMap,
+ IndexedMap<OpData> &DestOperandMap,
+ StringMap<unsigned> &SourceOperands,
+ CodeGenInstruction &DestInst);
+
+public:
+ RISCVCompressInstEmitter(RecordKeeper &R) : Records(R), Target(R) {}
+
+ void run(raw_ostream &o);
+};
+} // End anonymous namespace.
+
+bool RISCVCompressInstEmitter::validateRegister(Record *Reg, Record *RegClass) {
+ assert(Reg->isSubClassOf("Register") && "Reg record should be a Register\n");
+ assert(RegClass->isSubClassOf("RegisterClass") && "RegClass record should be"
+ " a RegisterClass\n");
+ CodeGenRegisterClass RC = Target.getRegisterClass(RegClass);
+ const CodeGenRegister *R = Target.getRegisterByName(Reg->getName().lower());
+ assert((R != nullptr) &&
+ ("Register" + Reg->getName().str() + " not defined!!\n").c_str());
+ return RC.contains(R);
+}
+
+bool RISCVCompressInstEmitter::validateTypes(Record *DagOpType,
+ Record *InstOpType,
+ bool IsSourceInst) {
+ if (DagOpType == InstOpType)
+ return true;
+ // Only source instruction operands are allowed to not match Input Dag
+ // operands.
+ if (!IsSourceInst)
+ return false;
+
+ if (DagOpType->isSubClassOf("RegisterClass") &&
+ InstOpType->isSubClassOf("RegisterClass")) {
+ CodeGenRegisterClass RC = Target.getRegisterClass(InstOpType);
+ CodeGenRegisterClass SubRC = Target.getRegisterClass(DagOpType);
+ return RC.hasSubClass(&SubRC);
+ }
+
+ // At this point either or both types are not registers, reject the pattern.
+ if (DagOpType->isSubClassOf("RegisterClass") ||
+ InstOpType->isSubClassOf("RegisterClass"))
+ return false;
+
+ // Let further validation happen when compress()/uncompress() functions are
+ // invoked.
+ LLVM_DEBUG(dbgs() << (IsSourceInst ? "Input" : "Output")
+ << " Dag Operand Type: '" << DagOpType->getName()
+ << "' and "
+ << "Instruction Operand Type: '" << InstOpType->getName()
+ << "' can't be checked at pattern validation time!\n");
+ return true;
+}
+
+/// The patterns in the Dag contain different types of operands:
+/// register operands, e.g.: GPRC:$rs1; fixed registers, e.g.: X1; immediate
+/// operands, e.g.: simm6:$imm; and fixed immediate operands, e.g.: 0. This
+/// function maps Dag operands to their corresponding instruction operands. For
+/// register operands and fixed registers it expects the Dag operand type to be
+/// contained in the instantiated instruction operand type. For immediate
+/// operands and fixed immediates no validation checks are enforced at pattern
+/// validation time.
+void RISCVCompressInstEmitter::addDagOperandMapping(
+ Record *Rec, DagInit *Dag, CodeGenInstruction &Inst,
+ IndexedMap<OpData> &OperandMap, bool IsSourceInst) {
+ // TiedCount keeps track of the number of operands skipped in Inst
+ // operands list to get to the corresponding Dag operand. This is
+ // necessary because the number of operands in Inst might be greater
+ // than number of operands in the Dag due to how tied operands
+ // are represented.
+ unsigned TiedCount = 0;
+ for (unsigned i = 0, e = Inst.Operands.size(); i != e; ++i) {
+ int TiedOpIdx = Inst.Operands[i].getTiedRegister();
+ if (-1 != TiedOpIdx) {
+ // Set the entry in OperandMap for the tied operand we're skipping.
+ OperandMap[i].Kind = OperandMap[TiedOpIdx].Kind;
+ OperandMap[i].Data = OperandMap[TiedOpIdx].Data;
+ TiedCount++;
+ continue;
+ }
+ if (DefInit *DI = dyn_cast<DefInit>(Dag->getArg(i - TiedCount))) {
+ if (DI->getDef()->isSubClassOf("Register")) {
+ // Check if the fixed register belongs to the Register class.
+ if (!validateRegister(DI->getDef(), Inst.Operands[i].Rec))
+ PrintFatalError(Rec->getLoc(),
+ "Error in Dag '" + Dag->getAsString() +
+ "'Register: '" + DI->getDef()->getName() +
+ "' is not in register class '" +
+ Inst.Operands[i].Rec->getName() + "'");
+ OperandMap[i].Kind = OpData::Reg;
+ OperandMap[i].Data.Reg = DI->getDef();
+ continue;
+ }
+ // Validate that Dag operand type matches the type defined in the
+ // corresponding instruction. Operands in the input Dag pattern are
+ // allowed to be a subclass of the type specified in corresponding
+ // instruction operand instead of being an exact match.
+ if (!validateTypes(DI->getDef(), Inst.Operands[i].Rec, IsSourceInst))
+ PrintFatalError(Rec->getLoc(),
+ "Error in Dag '" + Dag->getAsString() + "'. Operand '" +
+ Dag->getArgNameStr(i - TiedCount) + "' has type '" +
+ DI->getDef()->getName() +
+ "' which does not match the type '" +
+ Inst.Operands[i].Rec->getName() +
+ "' in the corresponding instruction operand!");
+
+ OperandMap[i].Kind = OpData::Operand;
+ } else if (IntInit *II = dyn_cast<IntInit>(Dag->getArg(i - TiedCount))) {
+ // Validate that corresponding instruction operand expects an immediate.
+ if (Inst.Operands[i].Rec->isSubClassOf("RegisterClass"))
+ PrintFatalError(
+ Rec->getLoc(),
+ ("Error in Dag '" + Dag->getAsString() + "' Found immediate: '" +
+ II->getAsString() +
+ "' but corresponding instruction operand expected a register!"));
+ // No pattern validation check possible for values of fixed immediate.
+ OperandMap[i].Kind = OpData::Imm;
+ OperandMap[i].Data.Imm = II->getValue();
+ LLVM_DEBUG(
+ dbgs() << " Found immediate '" << II->getValue() << "' at "
+ << (IsSourceInst ? "input " : "output ")
+ << "Dag. No validation time check possible for values of "
+ "fixed immediate.\n");
+ } else
+ llvm_unreachable("Unhandled CompressPat argument type!");
+ }
+}
+
+// Verify the Dag operand count is enough to build an instruction.
+static bool verifyDagOpCount(CodeGenInstruction &Inst, DagInit *Dag,
+ bool IsSource) {
+ if (Dag->getNumArgs() == Inst.Operands.size())
+ return true;
+ // Source instructions are non-compressed instructions and don't have tied
+ // operands.
+ if (IsSource)
+ PrintFatalError("Input operands for Inst '" + Inst.TheDef->getName() +
+ "' and input Dag operand count mismatch");
+ // The Dag can't have more arguments than the Instruction.
+ if (Dag->getNumArgs() > Inst.Operands.size())
+ PrintFatalError("Inst '" + Inst.TheDef->getName() +
+ "' and Dag operand count mismatch");
+
+ // The Instruction might have tied operands, so the Dag might have
+ // fewer operands than the Instruction.
+ unsigned RealCount = Inst.Operands.size();
+ for (unsigned i = 0; i < Inst.Operands.size(); i++)
+ if (Inst.Operands[i].getTiedRegister() != -1)
+ --RealCount;
+
+ if (Dag->getNumArgs() != RealCount)
+ PrintFatalError("Inst '" + Inst.TheDef->getName() +
+ "' and Dag operand count mismatch");
+ return true;
+}
+
+static bool validateArgsTypes(Init *Arg1, Init *Arg2) {
+ DefInit *Type1 = dyn_cast<DefInit>(Arg1);
+ DefInit *Type2 = dyn_cast<DefInit>(Arg2);
+ assert(Type1 && ("Arg1 type not found\n"));
+ assert(Type2 && ("Arg2 type not found\n"));
+ return Type1->getDef() == Type2->getDef();
+}
+
+// Creates a mapping between the operand name in the Dag (e.g. $rs1) and
+// its index in the list of Dag operands, and checks that operands with the
+// same name have the same type. For example, in 'C_ADD $rs1, $rs2' we generate
+// the mapping $rs1 --> 0, $rs2 --> 1. If an operand appears twice in the same
+// Dag (i.e. it is tied), we use the last occurrence for indexing.
+void RISCVCompressInstEmitter::createDagOperandMapping(
+ Record *Rec, StringMap<unsigned> &SourceOperands,
+ StringMap<unsigned> &DestOperands, DagInit *SourceDag, DagInit *DestDag,
+ IndexedMap<OpData> &SourceOperandMap) {
+ for (unsigned i = 0; i < DestDag->getNumArgs(); ++i) {
+ // Skip fixed immediates and registers, they were handled in
+ // addDagOperandMapping.
+ if ("" == DestDag->getArgNameStr(i))
+ continue;
+ DestOperands[DestDag->getArgNameStr(i)] = i;
+ }
+
+ for (unsigned i = 0; i < SourceDag->getNumArgs(); ++i) {
+ // Skip fixed immediates and registers, they were handled in
+ // addDagOperandMapping.
+ if ("" == SourceDag->getArgNameStr(i))
+ continue;
+
+ StringMap<unsigned>::iterator it =
+ SourceOperands.find(SourceDag->getArgNameStr(i));
+ if (it != SourceOperands.end()) {
+ // Operand sharing the same name in the Dag should be mapped as tied.
+ SourceOperandMap[i].TiedOpIdx = it->getValue();
+ if (!validateArgsTypes(SourceDag->getArg(it->getValue()),
+ SourceDag->getArg(i)))
+ PrintFatalError(Rec->getLoc(),
+ "Input Operand '" + SourceDag->getArgNameStr(i) +
+ "' has a mismatched tied operand!\n");
+ }
+ it = DestOperands.find(SourceDag->getArgNameStr(i));
+ if (it == DestOperands.end())
+ PrintFatalError(Rec->getLoc(), "Operand " + SourceDag->getArgNameStr(i) +
+ " defined in Input Dag but not used in"
+ " Output Dag!\n");
+ // Input Dag operand types must match output Dag operand type.
+ if (!validateArgsTypes(DestDag->getArg(it->getValue()),
+ SourceDag->getArg(i)))
+ PrintFatalError(Rec->getLoc(), "Type mismatch between Input and "
+ "Output Dag operand '" +
+ SourceDag->getArgNameStr(i) + "'!");
+ SourceOperands[SourceDag->getArgNameStr(i)] = i;
+ }
+}
+
+/// Map operand names in the Dag to their index in both corresponding input and
+/// output instructions. Validate that operands defined in the input are
+/// used in the output pattern while populating the maps.
+void RISCVCompressInstEmitter::createInstOperandMapping(
+ Record *Rec, DagInit *SourceDag, DagInit *DestDag,
+ IndexedMap<OpData> &SourceOperandMap, IndexedMap<OpData> &DestOperandMap,
+ StringMap<unsigned> &SourceOperands, CodeGenInstruction &DestInst) {
+ // TiedCount keeps track of the number of operands skipped in Inst
+ // operands list to get to the corresponding Dag operand.
+ unsigned TiedCount = 0;
+ LLVM_DEBUG(dbgs() << " Operand mapping:\n Source Dest\n");
+ for (unsigned i = 0, e = DestInst.Operands.size(); i != e; ++i) {
+ int TiedInstOpIdx = DestInst.Operands[i].getTiedRegister();
+ if (TiedInstOpIdx != -1) {
+ ++TiedCount;
+ DestOperandMap[i].Data = DestOperandMap[TiedInstOpIdx].Data;
+ DestOperandMap[i].Kind = DestOperandMap[TiedInstOpIdx].Kind;
+ if (DestOperandMap[i].Kind == OpData::Operand)
+ // No need to fill the SourceOperandMap here since it was mapped to
+ // destination operand 'TiedInstOpIdx' in a previous iteration.
+ LLVM_DEBUG(dbgs() << " " << DestOperandMap[i].Data.Operand
+ << " ====> " << i
+ << " Dest operand tied with operand '"
+ << TiedInstOpIdx << "'\n");
+ continue;
+ }
+ // Skip fixed immediates and registers, they were handled in
+ // addDagOperandMapping.
+ if (DestOperandMap[i].Kind != OpData::Operand)
+ continue;
+
+ unsigned DagArgIdx = i - TiedCount;
+ StringMap<unsigned>::iterator SourceOp =
+ SourceOperands.find(DestDag->getArgNameStr(DagArgIdx));
+ if (SourceOp == SourceOperands.end())
+ PrintFatalError(Rec->getLoc(),
+ "Output Dag operand '" +
+ DestDag->getArgNameStr(DagArgIdx) +
+ "' has no matching input Dag operand.");
+
+ assert(DestDag->getArgNameStr(DagArgIdx) ==
+ SourceDag->getArgNameStr(SourceOp->getValue()) &&
+ "Incorrect operand mapping detected!\n");
+ DestOperandMap[i].Data.Operand = SourceOp->getValue();
+ SourceOperandMap[SourceOp->getValue()].Data.Operand = i;
+ LLVM_DEBUG(dbgs() << " " << SourceOp->getValue() << " ====> " << i
+ << "\n");
+ }
+}
+
+/// Validates the CompressPattern and creates the operand mapping.
+/// These are the checks performed to validate a CompressPat pattern
+/// declaration. Error out with a message if any of these fail:
+/// - The Dag Input opcode must be an expanded instruction and the Dag Output
+///   opcode must be a compressed instruction.
+/// - All operands in the Dag Input must be used in the Dag Output.
+/// - Register operand types in the Dag Input must be contained in the
+///   corresponding Source Instruction operand type.
+/// - Register operand types in the Dag Input must be the same as in the
+///   Dag Output.
+/// - Register operand types in the Dag Output must be the same as the
+///   corresponding Destination Instruction operand type.
+/// - Immediate operand types in the Dag Input must be the same as in the
+///   Dag Output.
+/// - Immediate operand types in the Dag Output must be the same as the
+///   corresponding Destination Instruction operand type.
+/// - Fixed registers must be contained in the corresponding Source
+///   Instruction operand type.
+/// - Fixed registers must be contained in the corresponding Destination
+///   Instruction operand type.
+/// A warning is printed under these conditions:
+/// - Fixed immediates in the Dag Input or Dag Output cannot be checked at
+///   this time.
+/// - An immediate operand type in the Dag Input differs from the
+///   corresponding Source Instruction operand type.
+void RISCVCompressInstEmitter::evaluateCompressPat(Record *Rec) {
+ // Validate input Dag operands.
+ DagInit *SourceDag = Rec->getValueAsDag("Input");
+ assert(SourceDag && "Missing 'Input' in compress pattern!");
+ LLVM_DEBUG(dbgs() << "Input: " << *SourceDag << "\n");
+
+ DefInit *OpDef = dyn_cast<DefInit>(SourceDag->getOperator());
+ if (!OpDef)
+ PrintFatalError(Rec->getLoc(),
+ Rec->getName() + " has unexpected operator type!");
+ // Check that the source of the transformation is an uncompressed (32 bit
+ // wide) instruction.
+ Record *Operator = OpDef->getDef();
+ if (!Operator->isSubClassOf("RVInst"))
+ PrintFatalError(Rec->getLoc(), "Input instruction '" + Operator->getName() +
+ "' is not a 32 bit wide instruction!");
+ CodeGenInstruction SourceInst(Operator);
+ verifyDagOpCount(SourceInst, SourceDag, true);
+
+ // Validate output Dag operands.
+ DagInit *DestDag = Rec->getValueAsDag("Output");
+ assert(DestDag && "Missing 'Output' in compress pattern!");
+ LLVM_DEBUG(dbgs() << "Output: " << *DestDag << "\n");
+
+ DefInit *DestOpDef = dyn_cast<DefInit>(DestDag->getOperator());
+ if (!DestOpDef)
+ PrintFatalError(Rec->getLoc(),
+ Rec->getName() + " has unexpected operator type!");
+
+ Record *DestOperator = DestOpDef->getDef();
+ if (!DestOperator->isSubClassOf("RVInst16"))
+ PrintFatalError(Rec->getLoc(), "Output instruction '" +
+ DestOperator->getName() +
+ "' is not a 16 bit wide instruction!");
+ CodeGenInstruction DestInst(DestOperator);
+ verifyDagOpCount(DestInst, DestDag, false);
+
+ // Fill the mapping from the source to destination instructions.
+
+ IndexedMap<OpData> SourceOperandMap;
+ SourceOperandMap.grow(SourceInst.Operands.size());
+ // Create a mapping between source Dag operands and source Inst operands.
+ addDagOperandMapping(Rec, SourceDag, SourceInst, SourceOperandMap,
+ /*IsSourceInst*/ true);
+
+ IndexedMap<OpData> DestOperandMap;
+ DestOperandMap.grow(DestInst.Operands.size());
+ // Create a mapping between destination Dag operands and destination Inst
+ // operands.
+ addDagOperandMapping(Rec, DestDag, DestInst, DestOperandMap,
+ /*IsSourceInst*/ false);
+
+ StringMap<unsigned> SourceOperands;
+ StringMap<unsigned> DestOperands;
+ createDagOperandMapping(Rec, SourceOperands, DestOperands, SourceDag, DestDag,
+ SourceOperandMap);
+ // Create operand mapping between the source and destination instructions.
+ createInstOperandMapping(Rec, SourceDag, DestDag, SourceOperandMap,
+ DestOperandMap, SourceOperands, DestInst);
+
+ // Get the target features for the CompressPat.
+ std::vector<Record *> PatReqFeatures;
+ std::vector<Record *> RF = Rec->getValueAsListOfDefs("Predicates");
+ copy_if(RF, std::back_inserter(PatReqFeatures), [](Record *R) {
+ return R->getValueAsBit("AssemblerMatcherPredicate");
+ });
+
+ CompressPatterns.push_back(CompressPat(SourceInst, DestInst, PatReqFeatures,
+ SourceOperandMap, DestOperandMap));
+}
+
+static void getReqFeatures(std::map<StringRef, int> &FeaturesMap,
+ const std::vector<Record *> &ReqFeatures) {
+ for (auto &R : ReqFeatures) {
+ StringRef AsmCondString = R->getValueAsString("AssemblerCondString");
+
+ // AsmCondString has syntax [!]F(,[!]F)*
+ SmallVector<StringRef, 4> Ops;
+ SplitString(AsmCondString, Ops, ",");
+ assert(!Ops.empty() && "AssemblerCondString cannot be empty");
+
+ for (auto &Op : Ops) {
+ assert(!Op.empty() && "Empty operator");
+ if (FeaturesMap.find(Op) == FeaturesMap.end())
+ FeaturesMap[Op] = FeaturesMap.size();
+ }
+ }
+}
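For example (feature names purely illustrative), an AssemblerCondString of "FeatureStdExtC,!Feature64Bit" is split into the ops "FeatureStdExtC" and "!Feature64Bit"; each op becomes a key in FeaturesMap, and the emitter below renders them as a conjunction along the lines of:

  STI.getFeatureBits()[RISCV::FeatureStdExtC] &&
  !STI.getFeatureBits()[RISCV::Feature64Bit] &&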
+
+unsigned getMCOpPredicate(DenseMap<const Record *, unsigned> &MCOpPredicateMap,
+ std::vector<const Record *> &MCOpPredicates,
+ Record *Rec) {
+ unsigned Entry = MCOpPredicateMap[Rec];
+ if (Entry)
+ return Entry;
+
+ if (!Rec->isValueUnset("MCOperandPredicate")) {
+ MCOpPredicates.push_back(Rec);
+ Entry = MCOpPredicates.size();
+ MCOpPredicateMap[Rec] = Entry;
+ return Entry;
+ }
+
+ PrintFatalError(Rec->getLoc(),
+ "No MCOperandPredicate on this operand at all: " +
+ Rec->getName().str() + "'");
+ return 0;
+}
+
+static std::string mergeCondAndCode(raw_string_ostream &CondStream,
+ raw_string_ostream &CodeStream) {
+ std::string S;
+ raw_string_ostream CombinedStream(S);
+ CombinedStream.indent(4)
+ << "if ("
+ << CondStream.str().substr(
+ 6, CondStream.str().length() -
+ 10) // remove first indentation and last '&&'.
+ << ") {\n";
+ CombinedStream << CodeStream.str();
+ CombinedStream.indent(4) << " return true;\n";
+ CombinedStream.indent(4) << "} // if\n";
+ return CombinedStream.str();
+}
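To make the substr() arithmetic above concrete (Cond1/Cond2 and the statements are placeholders): a CondStream holding

        Cond1 &&
        Cond2 &&

merged with a CodeStream of operand-building statements yields roughly

      if (Cond1 &&
        Cond2) {
        OutInst.setOpcode(...);
        return true;
      } // if

i.e. the leading indentation of the first condition and the trailing " &&" of the last one are stripped before the whole thing is wrapped in an if statement.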
+
+void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
+ bool Compress) {
+ Record *AsmWriter = Target.getAsmWriter();
+ if (!AsmWriter->getValueAsInt("PassSubtarget"))
+ PrintFatalError("'PassSubtarget' is false. SubTargetInfo object is needed "
+ "for target features.\n");
+
+ std::string Namespace = Target.getName();
+
+ // Sort entries in CompressPatterns to handle instructions that can have more
+ // than one candidate for compression/uncompression, e.g. ADD can be
+ // transformed to either C_ADD or C_MV. When emitting the 'uncompressInst()'
+ // function the source and destination are flipped and the sort key needs to
+ // change accordingly.
+ std::stable_sort(CompressPatterns.begin(), CompressPatterns.end(),
+ [Compress](const CompressPat &LHS, const CompressPat &RHS) {
+ if (Compress)
+ return (LHS.Source.TheDef->getName().str() <
+ RHS.Source.TheDef->getName().str());
+ else
+ return (LHS.Dest.TheDef->getName().str() <
+ RHS.Dest.TheDef->getName().str());
+ });
+
+ // A list of MCOperandPredicates for all operands in use, and the reverse map.
+ std::vector<const Record *> MCOpPredicates;
+ DenseMap<const Record *, unsigned> MCOpPredicateMap;
+
+ std::string F;
+ std::string FH;
+ raw_string_ostream Func(F);
+ raw_string_ostream FuncH(FH);
+ bool NeedMRI = false;
+
+ if (Compress)
+ o << "\n#ifdef GEN_COMPRESS_INSTR\n"
+ << "#undef GEN_COMPRESS_INSTR\n\n";
+ else
+ o << "\n#ifdef GEN_UNCOMPRESS_INSTR\n"
+ << "#undef GEN_UNCOMPRESS_INSTR\n\n";
+
+ if (Compress) {
+ FuncH << "static bool compressInst(MCInst& OutInst,\n";
+ FuncH.indent(25) << "const MCInst &MI,\n";
+ FuncH.indent(25) << "const MCSubtargetInfo &STI,\n";
+ FuncH.indent(25) << "MCContext &Context) {\n";
+ } else {
+ FuncH << "static bool uncompressInst(MCInst& OutInst,\n";
+ FuncH.indent(27) << "const MCInst &MI,\n";
+ FuncH.indent(27) << "const MCRegisterInfo &MRI,\n";
+ FuncH.indent(27) << "const MCSubtargetInfo &STI) {\n";
+ }
+
+ if (CompressPatterns.empty()) {
+ o << FuncH.str();
+ o.indent(2) << "return false;\n}\n";
+ if (Compress)
+ o << "\n#endif //GEN_COMPRESS_INSTR\n";
+ else
+ o << "\n#endif //GEN_UNCOMPRESS_INSTR\n\n";
+ return;
+ }
+
+ std::string CaseString("");
+ raw_string_ostream CaseStream(CaseString);
+ std::string PrevOp("");
+ std::string CurOp("");
+ CaseStream << " switch (MI.getOpcode()) {\n";
+ CaseStream << " default: return false;\n";
+
+ for (auto &CompressPat : CompressPatterns) {
+ std::string CondString;
+ std::string CodeString;
+ raw_string_ostream CondStream(CondString);
+ raw_string_ostream CodeStream(CodeString);
+ CodeGenInstruction &Source =
+ Compress ? CompressPat.Source : CompressPat.Dest;
+ CodeGenInstruction &Dest = Compress ? CompressPat.Dest : CompressPat.Source;
+ IndexedMap<OpData> SourceOperandMap =
+ Compress ? CompressPat.SourceOperandMap : CompressPat.DestOperandMap;
+ IndexedMap<OpData> &DestOperandMap =
+ Compress ? CompressPat.DestOperandMap : CompressPat.SourceOperandMap;
+
+ CurOp = Source.TheDef->getName().str();
+ // Check current and previous opcode to decide to continue or end a case.
+ if (CurOp != PrevOp) {
+ if (PrevOp != "")
+ CaseStream.indent(6) << "break;\n } // case " + PrevOp + "\n";
+ CaseStream.indent(4) << "case " + Namespace + "::" + CurOp + ": {\n";
+ }
+
+ std::map<StringRef, int> FeaturesMap;
+ // Add CompressPat required features.
+ getReqFeatures(FeaturesMap, CompressPat.PatReqFeatures);
+
+ // Add Dest instruction required features.
+ std::vector<Record *> ReqFeatures;
+ std::vector<Record *> RF = Dest.TheDef->getValueAsListOfDefs("Predicates");
+ copy_if(RF, std::back_inserter(ReqFeatures), [](Record *R) {
+ return R->getValueAsBit("AssemblerMatcherPredicate");
+ });
+ getReqFeatures(FeaturesMap, ReqFeatures);
+
+ // Emit checks for all required features.
+ for (auto &F : FeaturesMap) {
+ StringRef Op = F.first;
+ if (Op[0] == '!')
+ CondStream.indent(6) << ("!STI.getFeatureBits()[" + Namespace +
+ "::" + Op.substr(1) + "]")
+ .str() +
+ " &&\n";
+ else
+ CondStream.indent(6)
+ << ("STI.getFeatureBits()[" + Namespace + "::" + Op + "]").str() +
+ " &&\n";
+ }
+
+ // Start Source Inst operands validation.
+ unsigned OpNo = 0;
+ for (OpNo = 0; OpNo < Source.Operands.size(); ++OpNo) {
+ if (SourceOperandMap[OpNo].TiedOpIdx != -1) {
+ if (Source.Operands[OpNo].Rec->isSubClassOf("RegisterClass"))
+ CondStream.indent(6)
+ << "(MI.getOperand("
+ << std::to_string(OpNo) + ").getReg() == MI.getOperand("
+ << std::to_string(SourceOperandMap[OpNo].TiedOpIdx)
+ << ").getReg()) &&\n";
+ else
+ PrintFatalError("Unexpected tied operand types!\n");
+ }
+ // Check for fixed immediates/registers in the source instruction.
+ switch (SourceOperandMap[OpNo].Kind) {
+ case OpData::Operand:
+ // We don't need to do anything for source instruction operand checks.
+ break;
+ case OpData::Imm:
+ CondStream.indent(6)
+ << "(MI.getOperand(" + std::to_string(OpNo) + ").isImm()) &&\n" +
+ " (MI.getOperand(" + std::to_string(OpNo) +
+ ").getImm() == " +
+ std::to_string(SourceOperandMap[OpNo].Data.Imm) + ") &&\n";
+ break;
+ case OpData::Reg: {
+ Record *Reg = SourceOperandMap[OpNo].Data.Reg;
+ CondStream.indent(6) << "(MI.getOperand(" + std::to_string(OpNo) +
+ ").getReg() == " + Namespace +
+ "::" + Reg->getName().str() + ") &&\n";
+ break;
+ }
+ }
+ }
+ CodeStream.indent(6) << "// " + Dest.AsmString + "\n";
+ CodeStream.indent(6) << "OutInst.setOpcode(" + Namespace +
+ "::" + Dest.TheDef->getName().str() + ");\n";
+ OpNo = 0;
+ for (const auto &DestOperand : Dest.Operands) {
+ CodeStream.indent(6) << "// Operand: " + DestOperand.Name + "\n";
+ switch (DestOperandMap[OpNo].Kind) {
+ case OpData::Operand: {
+ unsigned OpIdx = DestOperandMap[OpNo].Data.Operand;
+ // Check that the operand in the Source instruction fits
+ // the type for the Dest instruction.
+ if (DestOperand.Rec->isSubClassOf("RegisterClass")) {
+ NeedMRI = true;
+ // This is a register operand. Check the register class.
+ // Don't check the register class if this is a tied operand; it was
+ // done for the operand it's tied to.
+ if (DestOperand.getTiedRegister() == -1)
+ CondStream.indent(6)
+ << "(MRI.getRegClass(" + Namespace +
+ "::" + DestOperand.Rec->getName().str() +
+ "RegClassID).contains(" + "MI.getOperand(" +
+ std::to_string(OpIdx) + ").getReg())) &&\n";
+
+ CodeStream.indent(6) << "OutInst.addOperand(MI.getOperand(" +
+ std::to_string(OpIdx) + "));\n";
+ } else {
+ // Handling immediate operands.
+ unsigned Entry = getMCOpPredicate(MCOpPredicateMap, MCOpPredicates,
+ DestOperand.Rec);
+ CondStream.indent(6) << Namespace + "ValidateMCOperand(" +
+ "MI.getOperand(" + std::to_string(OpIdx) +
+ "), STI, " + std::to_string(Entry) +
+ ") &&\n";
+ CodeStream.indent(6) << "OutInst.addOperand(MI.getOperand(" +
+ std::to_string(OpIdx) + "));\n";
+ }
+ break;
+ }
+ case OpData::Imm: {
+ unsigned Entry =
+ getMCOpPredicate(MCOpPredicateMap, MCOpPredicates, DestOperand.Rec);
+ CondStream.indent(6)
+ << Namespace + "ValidateMCOperand(" + "MCOperand::createImm(" +
+ std::to_string(DestOperandMap[OpNo].Data.Imm) + "), STI, " +
+ std::to_string(Entry) + ") &&\n";
+ CodeStream.indent(6)
+ << "OutInst.addOperand(MCOperand::createImm(" +
+ std::to_string(DestOperandMap[OpNo].Data.Imm) + "));\n";
+ } break;
+ case OpData::Reg: {
+ // Fixed register has been validated at pattern validation time.
+ Record *Reg = DestOperandMap[OpNo].Data.Reg;
+ CodeStream.indent(6) << "OutInst.addOperand(MCOperand::createReg(" +
+ Namespace + "::" + Reg->getName().str() +
+ "));\n";
+ } break;
+ }
+ ++OpNo;
+ }
+ CaseStream << mergeCondAndCode(CondStream, CodeStream);
+ PrevOp = CurOp;
+ }
+ Func << CaseStream.str() << "\n";
+ // Close brace for the last case.
+ Func.indent(4) << "} // case " + CurOp + "\n";
+ Func.indent(2) << "} // switch\n";
+ Func.indent(2) << "return false;\n}\n";
+
+ if (!MCOpPredicates.empty()) {
+ o << "static bool " << Namespace
+ << "ValidateMCOperand(const MCOperand &MCOp,\n"
+ << " const MCSubtargetInfo &STI,\n"
+ << " unsigned PredicateIndex) {\n"
+ << " switch (PredicateIndex) {\n"
+ << " default:\n"
+ << " llvm_unreachable(\"Unknown MCOperandPredicate kind\");\n"
+ << " break;\n";
+
+ for (unsigned i = 0; i < MCOpPredicates.size(); ++i) {
+ Init *MCOpPred = MCOpPredicates[i]->getValueInit("MCOperandPredicate");
+ if (CodeInit *SI = dyn_cast<CodeInit>(MCOpPred))
+ o << " case " << i + 1 << ": {\n"
+ << " // " << MCOpPredicates[i]->getName().str() << SI->getValue()
+ << "\n"
+ << " }\n";
+ else
+ llvm_unreachable("Unexpected MCOperandPredicate field!");
+ }
+ o << " }\n"
+ << "}\n\n";
+ }
+
+ o << FuncH.str();
+ if (NeedMRI && Compress)
+ o.indent(2) << "const MCRegisterInfo &MRI = *Context.getRegisterInfo();\n";
+ o << Func.str();
+
+ if (Compress)
+ o << "\n#endif //GEN_COMPRESS_INSTR\n";
+ else
+ o << "\n#endif //GEN_UNCOMPRESS_INSTR\n\n";
+}
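Putting the pieces together, the compressInst() generated for the ADD/C_ADD pattern shown in the file header would look roughly like the following sketch (wrapped in the GEN_COMPRESS_INSTR guard; the feature, opcode and register-class names depend on the RISCV .td files and are only illustrative here):

  static bool compressInst(MCInst& OutInst,
                           const MCInst &MI,
                           const MCSubtargetInfo &STI,
                           MCContext &Context) {
    const MCRegisterInfo &MRI = *Context.getRegisterInfo();
    switch (MI.getOpcode()) {
      default: return false;
      case RISCV::ADD: {
        if (STI.getFeatureBits()[RISCV::FeatureStdExtC] &&
            (MI.getOperand(1).getReg() == MI.getOperand(0).getReg()) &&
            (MRI.getRegClass(RISCV::GPRNoX0RegClassID).contains(MI.getOperand(1).getReg())) &&
            (MRI.getRegClass(RISCV::GPRNoX0RegClassID).contains(MI.getOperand(2).getReg()))) {
          // c.add
          OutInst.setOpcode(RISCV::C_ADD);
          OutInst.addOperand(MI.getOperand(1));
          OutInst.addOperand(MI.getOperand(2));
          return true;
        } // if
        break;
      } // case ADD
    } // switch
    return false;
  }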
+
+void RISCVCompressInstEmitter::run(raw_ostream &o) {
+ Record *CompressClass = Records.getClass("CompressPat");
+ assert(CompressClass && "Compress class definition missing!");
+ std::vector<Record *> Insts;
+ for (const auto &D : Records.getDefs()) {
+ if (D.second->isSubClassOf(CompressClass))
+ Insts.push_back(D.second.get());
+ }
+
+ // Process the CompressPat definitions, validating them as we do so.
+ for (unsigned i = 0, e = Insts.size(); i != e; ++i)
+ evaluateCompressPat(Insts[i]);
+
+ // Emit file header.
+ emitSourceFileHeader("Compress instruction Source Fragment", o);
+ // Generate compressInst() function.
+ emitCompressInstEmitter(o, true);
+ // Generate uncompressInst() function.
+ emitCompressInstEmitter(o, false);
+}
+
+namespace llvm {
+
+void EmitCompressInst(RecordKeeper &RK, raw_ostream &OS) {
+ RISCVCompressInstEmitter(RK).run(OS);
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/utils/TableGen/RegisterBankEmitter.cpp b/contrib/llvm/utils/TableGen/RegisterBankEmitter.cpp
index 5c6471688044..879b4162d629 100644
--- a/contrib/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -291,9 +291,11 @@ void RegisterBankEmitter::run(raw_ostream &OS) {
visitRegisterBankClasses(
RegisterClassHierarchy, RC, "explicit",
[&Bank](const CodeGenRegisterClass *RC, StringRef Kind) {
- DEBUG(dbgs() << "Added " << RC->getName() << "(" << Kind << ")\n");
+ LLVM_DEBUG(dbgs()
+ << "Added " << RC->getName() << "(" << Kind << ")\n");
Bank.addRegisterClass(RC);
- }, VisitedRCs);
+ },
+ VisitedRCs);
}
Banks.push_back(Bank);
diff --git a/contrib/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/contrib/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index 7eef2337c140..49016cca799e 100644
--- a/contrib/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -15,19 +15,19 @@
#include "CodeGenRegisters.h"
#include "CodeGenTarget.h"
-#include "Types.h"
#include "SequenceToOffsetTable.h"
+#include "Types.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
@@ -203,11 +203,11 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
<< " static const RegClassWeight RCWeightTable[] = {\n";
for (const auto &RC : RegBank.getRegClasses()) {
const CodeGenRegister::Vec &Regs = RC.getMembers();
- if (Regs.empty())
+ if (Regs.empty() || RC.Artificial)
OS << " {0, 0";
else {
std::vector<unsigned> RegUnits;
- RC.buildRegUnitSet(RegUnits);
+ RC.buildRegUnitSet(RegBank, RegUnits);
OS << " {" << (*Regs.begin())->getWeight(RegBank)
<< ", " << RegBank.getRegUnitSetWeight(RegUnits);
}
@@ -296,7 +296,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
PSetE = PSetIDs.end(); PSetI != PSetE; ++PSetI) {
PSets[i].push_back(RegBank.getRegPressureSet(*PSetI).Order);
}
- std::sort(PSets[i].begin(), PSets[i].end());
+ llvm::sort(PSets[i].begin(), PSets[i].end());
PSetsSeqs.add(PSets[i]);
}
diff --git a/contrib/llvm/utils/TableGen/SearchableTableEmitter.cpp b/contrib/llvm/utils/TableGen/SearchableTableEmitter.cpp
index 63252e8c0391..664de2217e94 100644
--- a/contrib/llvm/utils/TableGen/SearchableTableEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/SearchableTableEmitter.cpp
@@ -13,23 +13,85 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
+#include "CodeGenIntrinsics.h"
#include <algorithm>
+#include <set>
#include <string>
#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "searchable-table-emitter"
namespace {
+struct GenericTable;
+
+int getAsInt(Init *B) {
+ return cast<IntInit>(B->convertInitializerTo(IntRecTy::get()))->getValue();
+}
+int getInt(Record *R, StringRef Field) {
+ return getAsInt(R->getValueInit(Field));
+}
+
+struct GenericEnum {
+ using Entry = std::pair<StringRef, int64_t>;
+
+ std::string Name;
+ Record *Class;
+ std::string PreprocessorGuard;
+ std::vector<std::unique_ptr<Entry>> Entries;
+ DenseMap<Record *, Entry *> EntryMap;
+};
+
+struct GenericField {
+ std::string Name;
+ RecTy *RecType = nullptr;
+ bool IsIntrinsic = false;
+ bool IsInstruction = false;
+ GenericEnum *Enum = nullptr;
+
+ GenericField(StringRef Name) : Name(Name) {}
+};
+
+struct SearchIndex {
+ std::string Name;
+ SmallVector<GenericField, 1> Fields;
+ bool EarlyOut;
+};
+
+struct GenericTable {
+ std::string Name;
+ std::string PreprocessorGuard;
+ std::string CppTypeName;
+ SmallVector<GenericField, 2> Fields;
+ std::vector<Record *> Entries;
+
+ std::unique_ptr<SearchIndex> PrimaryKey;
+ SmallVector<std::unique_ptr<SearchIndex>, 2> Indices;
+
+ const GenericField *getFieldByName(StringRef Name) const {
+ for (const auto &Field : Fields) {
+ if (Name == Field.Name)
+ return &Field;
+ }
+ return nullptr;
+ }
+};
+
class SearchableTableEmitter {
RecordKeeper &Records;
+ DenseMap<Init *, std::unique_ptr<CodeGenIntrinsic>> Intrinsics;
+ std::vector<std::unique_ptr<GenericEnum>> Enums;
+ DenseMap<Record *, GenericEnum *> EnumMap;
+ std::set<std::string> PreprocessorGuards;
public:
SearchableTableEmitter(RecordKeeper &R) : Records(R) {}
@@ -39,38 +101,58 @@ public:
private:
typedef std::pair<Init *, int> SearchTableEntry;
- int getAsInt(BitsInit *B) {
- return cast<IntInit>(B->convertInitializerTo(IntRecTy::get()))->getValue();
- }
- int getInt(Record *R, StringRef Field) {
- return getAsInt(R->getValueAsBitsInit(Field));
- }
+ enum TypeContext {
+ TypeInStaticStruct,
+ TypeInTempStruct,
+ TypeInArgument,
+ };
- std::string primaryRepresentation(Init *I) {
+ std::string primaryRepresentation(const GenericField &Field, Init *I) {
if (StringInit *SI = dyn_cast<StringInit>(I))
return SI->getAsString();
else if (BitsInit *BI = dyn_cast<BitsInit>(I))
return "0x" + utohexstr(getAsInt(BI));
else if (BitInit *BI = dyn_cast<BitInit>(I))
return BI->getValue() ? "true" : "false";
- else if (CodeInit *CI = dyn_cast<CodeInit>(I)) {
+ else if (CodeInit *CI = dyn_cast<CodeInit>(I))
return CI->getValue();
- }
- PrintFatalError(SMLoc(),
- "invalid field type, expected: string, bits, bit or code");
+ else if (Field.IsIntrinsic)
+ return "Intrinsic::" + getIntrinsic(I).EnumName;
+ else if (Field.IsInstruction)
+ return I->getAsString();
+ else if (Field.Enum)
+ return Field.Enum->EntryMap[cast<DefInit>(I)->getDef()]->first;
+ PrintFatalError(Twine("invalid field type for field '") + Field.Name +
+ "', expected: string, bits, bit or code");
+ }
+
+ bool isIntrinsic(Init *I) {
+ if (DefInit *DI = dyn_cast<DefInit>(I))
+ return DI->getDef()->isSubClassOf("Intrinsic");
+ return false;
+ }
+
+ CodeGenIntrinsic &getIntrinsic(Init *I) {
+ std::unique_ptr<CodeGenIntrinsic> &Intr = Intrinsics[I];
+ if (!Intr)
+ Intr = make_unique<CodeGenIntrinsic>(cast<DefInit>(I)->getDef());
+ return *Intr;
}
- std::string searchRepresentation(Init *I) {
- std::string PrimaryRep = primaryRepresentation(I);
- if (!isa<StringInit>(I))
- return PrimaryRep;
- return StringRef(PrimaryRep).upper();
+ bool compareBy(Record *LHS, Record *RHS, const SearchIndex &Index);
+
+ bool isIntegral(Init *I) {
+ return isa<BitsInit>(I) || isIntrinsic(I);
}
- std::string searchableFieldType(Init *I) {
- if (isa<StringInit>(I))
- return "const char *";
- else if (BitsInit *BI = dyn_cast<BitsInit>(I)) {
+ std::string searchableFieldType(const GenericField &Field, TypeContext Ctx) {
+ if (isa<StringRecTy>(Field.RecType)) {
+ if (Ctx == TypeInStaticStruct)
+ return "const char *";
+ if (Ctx == TypeInTempStruct)
+ return "std::string";
+ return "StringRef";
+ } else if (BitsRecTy *BI = dyn_cast<BitsRecTy>(Field.RecType)) {
unsigned NumBits = BI->getNumBits();
if (NumBits <= 8)
NumBits = 8;
@@ -81,233 +163,617 @@ private:
else if (NumBits <= 64)
NumBits = 64;
else
- PrintFatalError(SMLoc(), "bitfield too large to search");
+ PrintFatalError(Twine("bitfield '") + Field.Name +
+ "' too large to search");
return "uint" + utostr(NumBits) + "_t";
- }
- PrintFatalError(SMLoc(), "Unknown type to search by");
+ } else if (Field.Enum || Field.IsIntrinsic || Field.IsInstruction)
+ return "unsigned";
+ PrintFatalError(Twine("Field '") + Field.Name + "' has unknown type '" +
+ Field.RecType->getAsString() + "' to search by");
}
- void emitMapping(Record *MappingDesc, raw_ostream &OS);
- void emitMappingEnum(std::vector<Record *> &Items, Record *InstanceClass,
- raw_ostream &OS);
- void
- emitPrimaryTable(StringRef Name, std::vector<std::string> &FieldNames,
- std::vector<std::string> &SearchFieldNames,
- std::vector<std::vector<SearchTableEntry>> &SearchTables,
- std::vector<Record *> &Items, raw_ostream &OS);
- void emitSearchTable(StringRef Name, StringRef Field,
- std::vector<SearchTableEntry> &SearchTable,
- raw_ostream &OS);
- void emitLookupDeclaration(StringRef Name, StringRef Field, Init *I,
- raw_ostream &OS);
- void emitLookupFunction(StringRef Name, StringRef Field, Init *I,
- raw_ostream &OS);
+ void emitGenericTable(const GenericTable &Table, raw_ostream &OS);
+ void emitGenericEnum(const GenericEnum &Enum, raw_ostream &OS);
+ void emitLookupDeclaration(const GenericTable &Table,
+ const SearchIndex &Index, raw_ostream &OS);
+ void emitLookupFunction(const GenericTable &Table, const SearchIndex &Index,
+ bool IsPrimary, raw_ostream &OS);
+ void emitIfdef(StringRef Guard, raw_ostream &OS);
+
+ bool parseFieldType(GenericField &Field, Init *II);
+ std::unique_ptr<SearchIndex>
+ parseSearchIndex(GenericTable &Table, StringRef Name,
+ const std::vector<StringRef> &Key, bool EarlyOut);
+ void collectEnumEntries(GenericEnum &Enum, StringRef NameField,
+ StringRef ValueField,
+ const std::vector<Record *> &Items);
+ void collectTableEntries(GenericTable &Table,
+ const std::vector<Record *> &Items);
};
} // End anonymous namespace.
-/// Emit an enum providing symbolic access to some preferred field from
-/// C++.
-void SearchableTableEmitter::emitMappingEnum(std::vector<Record *> &Items,
- Record *InstanceClass,
- raw_ostream &OS) {
- StringRef EnumNameField = InstanceClass->getValueAsString("EnumNameField");
- StringRef EnumValueField;
- if (!InstanceClass->isValueUnset("EnumValueField"))
- EnumValueField = InstanceClass->getValueAsString("EnumValueField");
-
- OS << "enum " << InstanceClass->getName() << "Values {\n";
- for (auto Item : Items) {
- OS << " " << Item->getValueAsString(EnumNameField);
- if (EnumValueField != StringRef())
- OS << " = " << getInt(Item, EnumValueField);
- OS << ",\n";
+// For search indices that consist of a single field whose numeric value is
+// known, return that numeric value.
+static int64_t getNumericKey(const SearchIndex &Index, Record *Rec) {
+ assert(Index.Fields.size() == 1);
+
+ if (Index.Fields[0].Enum) {
+ Record *EnumEntry = Rec->getValueAsDef(Index.Fields[0].Name);
+ return Index.Fields[0].Enum->EntryMap[EnumEntry]->second;
}
- OS << "};\n\n";
-}
-void SearchableTableEmitter::emitPrimaryTable(
- StringRef Name, std::vector<std::string> &FieldNames,
- std::vector<std::string> &SearchFieldNames,
- std::vector<std::vector<SearchTableEntry>> &SearchTables,
- std::vector<Record *> &Items, raw_ostream &OS) {
- OS << "const " << Name << " " << Name << "sList[] = {\n";
+ return getInt(Rec, Index.Fields[0].Name);
+}
- for (auto Item : Items) {
- OS << " { ";
- for (unsigned i = 0; i < FieldNames.size(); ++i) {
- OS << primaryRepresentation(Item->getValueInit(FieldNames[i]));
- if (i != FieldNames.size() - 1)
- OS << ", ";
+/// Less-than style comparison between \p LHS and \p RHS according to the
+/// key of \p Index.
+bool SearchableTableEmitter::compareBy(Record *LHS, Record *RHS,
+ const SearchIndex &Index) {
+ for (const auto &Field : Index.Fields) {
+ Init *LHSI = LHS->getValueInit(Field.Name);
+ Init *RHSI = RHS->getValueInit(Field.Name);
+
+ if (isa<BitsRecTy>(Field.RecType) || isa<IntRecTy>(Field.RecType)) {
+ int64_t LHSi = getAsInt(LHSI);
+ int64_t RHSi = getAsInt(RHSI);
+ if (LHSi < RHSi)
+ return true;
+ if (LHSi > RHSi)
+ return false;
+ } else if (Field.IsIntrinsic) {
+ CodeGenIntrinsic &LHSi = getIntrinsic(LHSI);
+ CodeGenIntrinsic &RHSi = getIntrinsic(RHSI);
+ if (std::tie(LHSi.TargetPrefix, LHSi.Name) <
+ std::tie(RHSi.TargetPrefix, RHSi.Name))
+ return true;
+ if (std::tie(LHSi.TargetPrefix, LHSi.Name) >
+ std::tie(RHSi.TargetPrefix, RHSi.Name))
+ return false;
+ } else if (Field.IsInstruction) {
+ // This does not correctly compare the predefined instructions!
+ Record *LHSr = cast<DefInit>(LHSI)->getDef();
+ Record *RHSr = cast<DefInit>(RHSI)->getDef();
+
+ bool LHSpseudo = LHSr->getValueAsBit("isPseudo");
+ bool RHSpseudo = RHSr->getValueAsBit("isPseudo");
+ if (LHSpseudo && !RHSpseudo)
+ return true;
+ if (!LHSpseudo && RHSpseudo)
+ return false;
+
+ int comp = LHSr->getName().compare(RHSr->getName());
+ if (comp < 0)
+ return true;
+ if (comp > 0)
+ return false;
+ } else if (Field.Enum) {
+ auto LHSr = cast<DefInit>(LHSI)->getDef();
+ auto RHSr = cast<DefInit>(RHSI)->getDef();
+ int64_t LHSv = Field.Enum->EntryMap[LHSr]->second;
+ int64_t RHSv = Field.Enum->EntryMap[RHSr]->second;
+ if (LHSv < RHSv)
+ return true;
+ if (LHSv > RHSv)
+ return false;
+ } else {
+ std::string LHSs = primaryRepresentation(Field, LHSI);
+ std::string RHSs = primaryRepresentation(Field, RHSI);
+
+ if (isa<StringRecTy>(Field.RecType)) {
+ LHSs = StringRef(LHSs).upper();
+ RHSs = StringRef(RHSs).upper();
+ }
+
+ int comp = LHSs.compare(RHSs);
+ if (comp < 0)
+ return true;
+ if (comp > 0)
+ return false;
}
- OS << "},\n";
}
- OS << "};\n\n";
+ return false;
}
-void SearchableTableEmitter::emitSearchTable(
- StringRef Name, StringRef Field, std::vector<SearchTableEntry> &SearchTable,
- raw_ostream &OS) {
- OS << "const std::pair<" << searchableFieldType(SearchTable[0].first)
- << ", int> " << Name << "sBy" << Field << "[] = {\n";
-
- if (isa<BitsInit>(SearchTable[0].first)) {
- std::stable_sort(SearchTable.begin(), SearchTable.end(),
- [this](const SearchTableEntry &LHS,
- const SearchTableEntry &RHS) {
- return getAsInt(cast<BitsInit>(LHS.first)) <
- getAsInt(cast<BitsInit>(RHS.first));
- });
+void SearchableTableEmitter::emitIfdef(StringRef Guard, raw_ostream &OS) {
+ OS << "#ifdef " << Guard << "\n";
+ PreprocessorGuards.insert(Guard);
+}
+
+/// Emit a generic enum.
+void SearchableTableEmitter::emitGenericEnum(const GenericEnum &Enum,
+ raw_ostream &OS) {
+ emitIfdef((Twine("GET_") + Enum.PreprocessorGuard + "_DECL").str(), OS);
+
+ OS << "enum " << Enum.Name << " {\n";
+ for (const auto &Entry : Enum.Entries)
+ OS << " " << Entry->first << " = " << Entry->second << ",\n";
+ OS << "};\n";
+
+ OS << "#endif\n\n";
+}
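For instance, an enum collected from two records (all names and values illustrative) is printed inside its declaration guard as:

  #ifdef GET_MyTargetSysRegsEnum_DECL
  enum MyTargetSysRegsEnum {
    SYSREG_FOO = 0,
    SYSREG_BAR = 1,
  };
  #endif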
+
+void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table,
+ const SearchIndex &Index,
+ bool IsPrimary,
+ raw_ostream &OS) {
+ OS << "\n";
+ emitLookupDeclaration(Table, Index, OS);
+ OS << " {\n";
+
+ std::vector<Record *> IndexRowsStorage;
+ ArrayRef<Record *> IndexRows;
+ StringRef IndexTypeName;
+ StringRef IndexName;
+
+ if (IsPrimary) {
+ IndexTypeName = Table.CppTypeName;
+ IndexName = Table.Name;
+ IndexRows = Table.Entries;
} else {
- std::stable_sort(SearchTable.begin(), SearchTable.end(),
- [this](const SearchTableEntry &LHS,
- const SearchTableEntry &RHS) {
- return searchRepresentation(LHS.first) <
- searchRepresentation(RHS.first);
+ OS << " struct IndexType {\n";
+ for (const auto &Field : Index.Fields) {
+ OS << " " << searchableFieldType(Field, TypeInStaticStruct) << " "
+ << Field.Name << ";\n";
+ }
+ OS << " unsigned _index;\n";
+ OS << " };\n";
+
+ OS << " static const struct IndexType Index[] = {\n";
+
+ std::vector<std::pair<Record *, unsigned>> Entries;
+ Entries.reserve(Table.Entries.size());
+ for (unsigned i = 0; i < Table.Entries.size(); ++i)
+ Entries.emplace_back(Table.Entries[i], i);
+
+ std::stable_sort(Entries.begin(), Entries.end(),
+ [&](const std::pair<Record *, unsigned> &LHS,
+ const std::pair<Record *, unsigned> &RHS) {
+ return compareBy(LHS.first, RHS.first, Index);
});
+
+ IndexRowsStorage.reserve(Entries.size());
+ for (const auto &Entry : Entries) {
+ IndexRowsStorage.push_back(Entry.first);
+
+ OS << " { ";
+ bool NeedComma = false;
+ for (const auto &Field : Index.Fields) {
+ if (NeedComma)
+ OS << ", ";
+ NeedComma = true;
+
+ std::string Repr =
+ primaryRepresentation(Field, Entry.first->getValueInit(Field.Name));
+ if (isa<StringRecTy>(Field.RecType))
+ Repr = StringRef(Repr).upper();
+ OS << Repr;
+ }
+ OS << ", " << Entry.second << " },\n";
+ }
+
+ OS << " };\n\n";
+
+ IndexTypeName = "IndexType";
+ IndexName = "Index";
+ IndexRows = IndexRowsStorage;
+ }
+
+ bool IsContiguous = false;
+
+ if (Index.Fields.size() == 1 &&
+ (Index.Fields[0].Enum || isa<BitsRecTy>(Index.Fields[0].RecType))) {
+ IsContiguous = true;
+ for (unsigned i = 0; i < IndexRows.size(); ++i) {
+ if (getNumericKey(Index, IndexRows[i]) != i) {
+ IsContiguous = false;
+ break;
+ }
+ }
}
- for (auto Entry : SearchTable) {
- OS << " { " << searchRepresentation(Entry.first) << ", " << Entry.second
- << " },\n";
+ if (IsContiguous) {
+ OS << " auto Table = makeArrayRef(" << IndexName << ");\n";
+ OS << " size_t Idx = " << Index.Fields[0].Name << ";\n";
+ OS << " return Idx >= Table.size() ? nullptr : ";
+ if (IsPrimary)
+ OS << "&Table[Idx]";
+ else
+ OS << "&" << Table.Name << "[Table[Idx]._index]";
+ OS << ";\n";
+ OS << "}\n";
+ return;
}
- OS << "};\n\n";
-}
-void SearchableTableEmitter::emitLookupFunction(StringRef Name, StringRef Field,
- Init *I, raw_ostream &OS) {
- bool IsIntegral = isa<BitsInit>(I);
- std::string FieldType = searchableFieldType(I);
- std::string PairType = "std::pair<" + FieldType + ", int>";
-
- // const SysRegs *lookupSysRegByName(const char *Name) {
- OS << "const " << Name << " *"
- << "lookup" << Name << "By" << Field;
- OS << "(" << (IsIntegral ? FieldType : "StringRef") << " " << Field
- << ") {\n";
-
- if (IsIntegral) {
- OS << " auto CanonicalVal = " << Field << ";\n";
- OS << " " << PairType << " Val = {CanonicalVal, 0};\n";
- } else {
- // Make sure the result is null terminated because it's going via "char *".
- OS << " std::string CanonicalVal = " << Field << ".upper();\n";
- OS << " " << PairType << " Val = {CanonicalVal.c_str(), 0};\n";
+ if (Index.EarlyOut) {
+ const GenericField &Field = Index.Fields[0];
+ std::string FirstRepr =
+ primaryRepresentation(Field, IndexRows[0]->getValueInit(Field.Name));
+ std::string LastRepr = primaryRepresentation(
+ Field, IndexRows.back()->getValueInit(Field.Name));
+ OS << " if ((" << Field.Name << " < " << FirstRepr << ") ||\n";
+ OS << " (" << Field.Name << " > " << LastRepr << "))\n";
+ OS << " return nullptr;\n\n";
}
- OS << " ArrayRef<" << PairType << "> Table(" << Name << "sBy" << Field
- << ");\n";
- OS << " auto Idx = std::lower_bound(Table.begin(), Table.end(), Val";
-
- if (IsIntegral)
- OS << ");\n";
- else {
- OS << ",\n ";
- OS << "[](const " << PairType << " &LHS, const " << PairType
- << " &RHS) {\n";
- OS << " return std::strcmp(LHS.first, RHS.first) < 0;\n";
- OS << " });\n\n";
+ OS << " struct KeyType {\n";
+ for (const auto &Field : Index.Fields) {
+ OS << " " << searchableFieldType(Field, TypeInTempStruct) << " "
+ << Field.Name << ";\n";
+ }
+ OS << " };\n";
+ OS << " KeyType Key = { ";
+ bool NeedComma = false;
+ for (const auto &Field : Index.Fields) {
+ if (NeedComma)
+ OS << ", ";
+ NeedComma = true;
+
+ OS << Field.Name;
+ if (isa<StringRecTy>(Field.RecType)) {
+ OS << ".upper()";
+ if (IsPrimary)
+ PrintFatalError(Twine("Use a secondary index for case-insensitive "
+ "comparison of field '") +
+ Field.Name + "' in table '" + Table.Name + "'");
+ }
+ }
+ OS << " };\n";
+
+ OS << " auto Table = makeArrayRef(" << IndexName << ");\n";
+ OS << " auto Idx = std::lower_bound(Table.begin(), Table.end(), Key,\n";
+ OS << " [](const " << IndexTypeName << " &LHS, const KeyType &RHS) {\n";
+
+ for (const auto &Field : Index.Fields) {
+ if (isa<StringRecTy>(Field.RecType)) {
+ OS << " int Cmp" << Field.Name << " = StringRef(LHS." << Field.Name
+ << ").compare(RHS." << Field.Name << ");\n";
+ OS << " if (Cmp" << Field.Name << " < 0) return true;\n";
+ OS << " if (Cmp" << Field.Name << " > 0) return false;\n";
+ } else {
+ OS << " if (LHS." << Field.Name << " < RHS." << Field.Name << ")\n";
+ OS << " return true;\n";
+ OS << " if (LHS." << Field.Name << " > RHS." << Field.Name << ")\n";
+ OS << " return false;\n";
+ }
}
- OS << " if (Idx == Table.end() || CanonicalVal != Idx->first)\n";
- OS << " return nullptr;\n";
+ OS << " return false;\n";
+ OS << " });\n\n";
- OS << " return &" << Name << "sList[Idx->second];\n";
- OS << "}\n\n";
+ OS << " if (Idx == Table.end()";
+
+ for (const auto &Field : Index.Fields)
+ OS << " ||\n Key." << Field.Name << " != Idx->" << Field.Name;
+ OS << ")\n return nullptr;\n";
+
+ if (IsPrimary)
+ OS << " return &*Idx;\n";
+ else
+ OS << " return &" << Table.Name << "[Idx->_index];\n";
+
+ OS << "}\n";
}
-void SearchableTableEmitter::emitLookupDeclaration(StringRef Name,
- StringRef Field, Init *I,
+void SearchableTableEmitter::emitLookupDeclaration(const GenericTable &Table,
+ const SearchIndex &Index,
raw_ostream &OS) {
- bool IsIntegral = isa<BitsInit>(I);
- std::string FieldType = searchableFieldType(I);
- OS << "const " << Name << " *"
- << "lookup" << Name << "By" << Field;
- OS << "(" << (IsIntegral ? FieldType : "StringRef") << " " << Field
- << ");\n\n";
+ OS << "const " << Table.CppTypeName << " *" << Index.Name << "(";
+
+ bool NeedComma = false;
+ for (const auto &Field : Index.Fields) {
+ if (NeedComma)
+ OS << ", ";
+ NeedComma = true;
+
+ OS << searchableFieldType(Field, TypeInArgument) << " " << Field.Name;
+ }
+ OS << ")";
}
-void SearchableTableEmitter::emitMapping(Record *InstanceClass,
- raw_ostream &OS) {
- StringRef TableName = InstanceClass->getName();
- std::vector<Record *> Items = Records.getAllDerivedDefinitions(TableName);
+void SearchableTableEmitter::emitGenericTable(const GenericTable &Table,
+ raw_ostream &OS) {
+ emitIfdef((Twine("GET_") + Table.PreprocessorGuard + "_DECL").str(), OS);
- // Gather all the records we're going to need for this particular mapping.
- std::vector<std::vector<SearchTableEntry>> SearchTables;
- std::vector<std::string> SearchFieldNames;
+ // Emit the declarations for the functions that will perform lookup.
+ if (Table.PrimaryKey) {
+ emitLookupDeclaration(Table, *Table.PrimaryKey, OS);
+ OS << ";\n";
+ }
+ for (const auto &Index : Table.Indices) {
+ emitLookupDeclaration(Table, *Index, OS);
+ OS << ";\n";
+ }
- std::vector<std::string> FieldNames;
- for (const RecordVal &Field : InstanceClass->getValues()) {
- std::string FieldName = Field.getName();
+ OS << "#endif\n\n";
- // Skip uninteresting fields: either built-in, special to us, or injected
- // template parameters (if they contain a ':').
- if (FieldName.find(':') != std::string::npos || FieldName == "NAME" ||
- FieldName == "SearchableFields" || FieldName == "EnumNameField" ||
- FieldName == "EnumValueField")
- continue;
+ emitIfdef((Twine("GET_") + Table.PreprocessorGuard + "_IMPL").str(), OS);
- FieldNames.push_back(FieldName);
- }
+ // The primary data table contains all the fields defined for this map.
+ OS << "const " << Table.CppTypeName << " " << Table.Name << "[] = {\n";
+ for (unsigned i = 0; i < Table.Entries.size(); ++i) {
+ Record *Entry = Table.Entries[i];
+ OS << " { ";
+
+ bool NeedComma = false;
+ for (const auto &Field : Table.Fields) {
+ if (NeedComma)
+ OS << ", ";
+ NeedComma = true;
+
+ OS << primaryRepresentation(Field, Entry->getValueInit(Field.Name));
+ }
- for (auto *Field : *InstanceClass->getValueAsListInit("SearchableFields")) {
- SearchTables.emplace_back();
- SearchFieldNames.push_back(Field->getAsUnquotedString());
+ OS << " }, // " << i << "\n";
}
+ OS << " };\n";
+
+ // Indexes are sorted "{ Thing, PrimaryIdx }" arrays, so that a binary
+ // search can be performed by "Thing".
+ if (Table.PrimaryKey)
+ emitLookupFunction(Table, *Table.PrimaryKey, true, OS);
+ for (const auto &Index : Table.Indices)
+ emitLookupFunction(Table, *Index, false, OS);
- int Idx = 0;
- for (Record *Item : Items) {
- for (unsigned i = 0; i < SearchFieldNames.size(); ++i) {
- Init *SearchVal = Item->getValueInit(SearchFieldNames[i]);
- SearchTables[i].emplace_back(SearchVal, Idx);
+ OS << "#endif\n\n";
+}
+
+bool SearchableTableEmitter::parseFieldType(GenericField &Field, Init *II) {
+ if (auto DI = dyn_cast<DefInit>(II)) {
+ Record *TypeRec = DI->getDef();
+ if (TypeRec->isSubClassOf("GenericEnum")) {
+ Field.Enum = EnumMap[TypeRec];
+ Field.RecType = RecordRecTy::get(Field.Enum->Class);
+ return true;
}
- ++Idx;
}
- OS << "#ifdef GET_" << TableName.upper() << "_DECL\n";
- OS << "#undef GET_" << TableName.upper() << "_DECL\n";
+ return false;
+}
+
+std::unique_ptr<SearchIndex>
+SearchableTableEmitter::parseSearchIndex(GenericTable &Table, StringRef Name,
+ const std::vector<StringRef> &Key,
+ bool EarlyOut) {
+ auto Index = llvm::make_unique<SearchIndex>();
+ Index->Name = Name;
+ Index->EarlyOut = EarlyOut;
+
+ for (const auto &FieldName : Key) {
+ const GenericField *Field = Table.getFieldByName(FieldName);
+ if (!Field)
+ PrintFatalError(Twine("Search index '") + Name +
+ "' refers to non-existing field '" + FieldName +
+ "' in table '" + Table.Name + "'");
+ Index->Fields.push_back(*Field);
+ }
- // Next emit the enum containing the top-level names for use in C++ code if
- // requested
- if (!InstanceClass->isValueUnset("EnumNameField")) {
- emitMappingEnum(Items, InstanceClass, OS);
+ if (EarlyOut && isa<StringRecTy>(Index->Fields[0].RecType)) {
+ PrintFatalError(
+ "Early-out is not supported for string types (in search index '" +
+        Twine(Name) + "')");
}
- // And the declarations for the functions that will perform lookup.
- for (unsigned i = 0; i < SearchFieldNames.size(); ++i)
- emitLookupDeclaration(TableName, SearchFieldNames[i],
- SearchTables[i][0].first, OS);
+ return Index;
+}
- OS << "#endif\n\n";
+void SearchableTableEmitter::collectEnumEntries(
+ GenericEnum &Enum, StringRef NameField, StringRef ValueField,
+ const std::vector<Record *> &Items) {
+ for (auto EntryRec : Items) {
+ StringRef Name;
+ if (NameField.empty())
+ Name = EntryRec->getName();
+ else
+ Name = EntryRec->getValueAsString(NameField);
+
+ int64_t Value = 0;
+ if (!ValueField.empty())
+ Value = getInt(EntryRec, ValueField);
+
+ Enum.Entries.push_back(llvm::make_unique<GenericEnum::Entry>(Name, Value));
+ Enum.EntryMap.insert(std::make_pair(EntryRec, Enum.Entries.back().get()));
+ }
- OS << "#ifdef GET_" << TableName.upper() << "_IMPL\n";
- OS << "#undef GET_" << TableName.upper() << "_IMPL\n";
+ if (ValueField.empty()) {
+ std::stable_sort(Enum.Entries.begin(), Enum.Entries.end(),
+ [](const std::unique_ptr<GenericEnum::Entry> &LHS,
+ const std::unique_ptr<GenericEnum::Entry> &RHS) {
+ return LHS->first < RHS->first;
+ });
- // The primary data table contains all the fields defined for this map.
- emitPrimaryTable(TableName, FieldNames, SearchFieldNames, SearchTables, Items,
- OS);
+ for (size_t i = 0; i < Enum.Entries.size(); ++i)
+ Enum.Entries[i]->second = i;
+ }
+}
- // Indexes are sorted "{ Thing, PrimaryIdx }" arrays, so that a binary
- // search can be performed by "Thing".
- for (unsigned i = 0; i < SearchTables.size(); ++i) {
- emitSearchTable(TableName, SearchFieldNames[i], SearchTables[i], OS);
- emitLookupFunction(TableName, SearchFieldNames[i], SearchTables[i][0].first,
- OS);
+void SearchableTableEmitter::collectTableEntries(
+ GenericTable &Table, const std::vector<Record *> &Items) {
+ for (auto EntryRec : Items) {
+ for (auto &Field : Table.Fields) {
+ auto TI = dyn_cast<TypedInit>(EntryRec->getValueInit(Field.Name));
+ if (!TI) {
+ PrintFatalError(Twine("Record '") + EntryRec->getName() +
+ "' in table '" + Table.Name + "' is missing field '" +
+ Field.Name + "'");
+ }
+ if (!Field.RecType) {
+ Field.RecType = TI->getType();
+ } else {
+ RecTy *Ty = resolveTypes(Field.RecType, TI->getType());
+ if (!Ty)
+ PrintFatalError(Twine("Field '") + Field.Name + "' of table '" +
+ Table.Name + "' has incompatible type: " +
+                          Field.RecType->getAsString() + " vs. " +
+ TI->getType()->getAsString());
+ Field.RecType = Ty;
+ }
+ }
+
+ Table.Entries.push_back(EntryRec);
}
- OS << "#endif\n";
+ Record *IntrinsicClass = Records.getClass("Intrinsic");
+ Record *InstructionClass = Records.getClass("Instruction");
+ for (auto &Field : Table.Fields) {
+ if (auto RecordTy = dyn_cast<RecordRecTy>(Field.RecType)) {
+ if (IntrinsicClass && RecordTy->isSubClassOf(IntrinsicClass))
+ Field.IsIntrinsic = true;
+ else if (InstructionClass && RecordTy->isSubClassOf(InstructionClass))
+ Field.IsInstruction = true;
+ }
+ }
}
void SearchableTableEmitter::run(raw_ostream &OS) {
- // Tables are defined to be the direct descendents of "SearchableEntry".
+ // Emit tables in a deterministic order to avoid needless rebuilds.
+ SmallVector<std::unique_ptr<GenericTable>, 4> Tables;
+ DenseMap<Record *, GenericTable *> TableMap;
+
+ // Collect all definitions first.
+ for (auto EnumRec : Records.getAllDerivedDefinitions("GenericEnum")) {
+ StringRef NameField;
+ if (!EnumRec->isValueUnset("NameField"))
+ NameField = EnumRec->getValueAsString("NameField");
+
+ StringRef ValueField;
+ if (!EnumRec->isValueUnset("ValueField"))
+ ValueField = EnumRec->getValueAsString("ValueField");
+
+ auto Enum = llvm::make_unique<GenericEnum>();
+ Enum->Name = EnumRec->getName();
+ Enum->PreprocessorGuard = EnumRec->getName();
+
+ StringRef FilterClass = EnumRec->getValueAsString("FilterClass");
+ Enum->Class = Records.getClass(FilterClass);
+ if (!Enum->Class)
+ PrintFatalError(Twine("Enum FilterClass '") + FilterClass +
+ "' does not exist");
+
+ collectEnumEntries(*Enum, NameField, ValueField,
+ Records.getAllDerivedDefinitions(FilterClass));
+ EnumMap.insert(std::make_pair(EnumRec, Enum.get()));
+ Enums.emplace_back(std::move(Enum));
+ }
+
+ for (auto TableRec : Records.getAllDerivedDefinitions("GenericTable")) {
+ auto Table = llvm::make_unique<GenericTable>();
+ Table->Name = TableRec->getName();
+ Table->PreprocessorGuard = TableRec->getName();
+ Table->CppTypeName = TableRec->getValueAsString("CppTypeName");
+
+ std::vector<StringRef> Fields = TableRec->getValueAsListOfStrings("Fields");
+ for (const auto &FieldName : Fields) {
+ Table->Fields.emplace_back(FieldName);
+
+ if (auto TypeOfVal = TableRec->getValue(("TypeOf_" + FieldName).str())) {
+ if (!parseFieldType(Table->Fields.back(), TypeOfVal->getValue())) {
+ PrintFatalError(Twine("Table '") + Table->Name +
+ "' has bad 'TypeOf_" + FieldName + "': " +
+ TypeOfVal->getValue()->getAsString());
+ }
+ }
+ }
+
+ collectTableEntries(*Table, Records.getAllDerivedDefinitions(
+ TableRec->getValueAsString("FilterClass")));
+
+ if (!TableRec->isValueUnset("PrimaryKey")) {
+ Table->PrimaryKey =
+ parseSearchIndex(*Table, TableRec->getValueAsString("PrimaryKeyName"),
+ TableRec->getValueAsListOfStrings("PrimaryKey"),
+ TableRec->getValueAsBit("PrimaryKeyEarlyOut"));
+
+ std::stable_sort(Table->Entries.begin(), Table->Entries.end(),
+ [&](Record *LHS, Record *RHS) {
+ return compareBy(LHS, RHS, *Table->PrimaryKey);
+ });
+ }
+
+ TableMap.insert(std::make_pair(TableRec, Table.get()));
+ Tables.emplace_back(std::move(Table));
+ }
+
+ for (Record *IndexRec : Records.getAllDerivedDefinitions("SearchIndex")) {
+ Record *TableRec = IndexRec->getValueAsDef("Table");
+ auto It = TableMap.find(TableRec);
+ if (It == TableMap.end())
+ PrintFatalError(Twine("SearchIndex '") + IndexRec->getName() +
+                      "' refers to non-existing table '" + TableRec->getName() + "'");
+
+ GenericTable &Table = *It->second;
+ Table.Indices.push_back(parseSearchIndex(
+ Table, IndexRec->getName(), IndexRec->getValueAsListOfStrings("Key"),
+ IndexRec->getValueAsBit("EarlyOut")));
+ }
+
+ // Translate legacy tables.
Record *SearchableTable = Records.getClass("SearchableTable");
for (auto &NameRec : Records.getClasses()) {
Record *Class = NameRec.second.get();
if (Class->getSuperClasses().size() != 1 ||
!Class->isSubClassOf(SearchableTable))
continue;
- emitMapping(Class, OS);
+
+ StringRef TableName = Class->getName();
+ std::vector<Record *> Items = Records.getAllDerivedDefinitions(TableName);
+ if (!Class->isValueUnset("EnumNameField")) {
+ StringRef NameField = Class->getValueAsString("EnumNameField");
+ StringRef ValueField;
+ if (!Class->isValueUnset("EnumValueField"))
+ ValueField = Class->getValueAsString("EnumValueField");
+
+ auto Enum = llvm::make_unique<GenericEnum>();
+ Enum->Name = (Twine(Class->getName()) + "Values").str();
+ Enum->PreprocessorGuard = Class->getName().upper();
+ Enum->Class = Class;
+
+ collectEnumEntries(*Enum, NameField, ValueField, Items);
+
+ Enums.emplace_back(std::move(Enum));
+ }
+
+ auto Table = llvm::make_unique<GenericTable>();
+ Table->Name = (Twine(Class->getName()) + "sList").str();
+ Table->PreprocessorGuard = Class->getName().upper();
+ Table->CppTypeName = Class->getName();
+
+ for (const RecordVal &Field : Class->getValues()) {
+ std::string FieldName = Field.getName();
+
+ // Skip uninteresting fields: either special to us, or injected
+ // template parameters (if they contain a ':').
+ if (FieldName.find(':') != std::string::npos ||
+ FieldName == "SearchableFields" || FieldName == "EnumNameField" ||
+ FieldName == "EnumValueField")
+ continue;
+
+ Table->Fields.emplace_back(FieldName);
+ }
+
+ collectTableEntries(*Table, Items);
+
+ for (const auto &Field :
+ Class->getValueAsListOfStrings("SearchableFields")) {
+ std::string Name =
+ (Twine("lookup") + Table->CppTypeName + "By" + Field).str();
+ Table->Indices.push_back(parseSearchIndex(*Table, Name, {Field}, false));
+ }
+
+ Tables.emplace_back(std::move(Table));
}
+
+ // Emit everything.
+ for (const auto &Enum : Enums)
+ emitGenericEnum(*Enum, OS);
+
+ for (const auto &Table : Tables)
+ emitGenericTable(*Table, OS);
+
+ // Put all #undefs last, to allow multiple sections guarded by the same
+ // define.
+ for (const auto &Guard : PreprocessorGuards)
+ OS << "#undef " << Guard << "\n";
}
namespace llvm {
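For orientation: every lookup function emitted by the new SearchableTableEmitter code above follows the same shape, a static array sorted by the key plus std::lower_bound and an exact-match check. The following is a minimal, self-contained sketch of that pattern only; the SysReg table, its contents, and lookupSysRegByName are hypothetical names, and the real generated code additionally upper-cases string keys and, for secondary indices, goes through an index array whose _index field points back into the primary table.

// Standalone sketch of the sorted-array lookup pattern the emitter generates.
// All names and table contents here are placeholders, not generated output.
#include <algorithm>
#include <cstring>
#include <iterator>

struct SysReg {
  const char *Name;
  unsigned Encoding;
};

// Kept sorted by Name, mirroring the std::stable_sort over the primary key.
static const SysReg SysRegsList[] = {
    {"APSR", 0x0}, {"CPSR", 0x1}, {"SPSR", 0x2},
};

const SysReg *lookupSysRegByName(const char *Name) {
  auto Begin = std::begin(SysRegsList), End = std::end(SysRegsList);
  auto It = std::lower_bound(Begin, End, Name,
                             [](const SysReg &LHS, const char *RHS) {
                               return std::strcmp(LHS.Name, RHS) < 0;
                             });
  if (It == End || std::strcmp(It->Name, Name) != 0)
    return nullptr;
  return &*It;
}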
diff --git a/contrib/llvm/utils/TableGen/SubtargetEmitter.cpp b/contrib/llvm/utils/TableGen/SubtargetEmitter.cpp
index 2c5658f8ce75..c5da8d8142ff 100644
--- a/contrib/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -13,6 +13,7 @@
#include "CodeGenTarget.h"
#include "CodeGenSchedule.h"
+#include "PredicateExpander.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
@@ -90,8 +91,14 @@ class SubtargetEmitter {
void EmitItineraries(raw_ostream &OS,
std::vector<std::vector<InstrItinerary>>
&ProcItinLists);
+ unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS);
+ void EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS);
void EmitProcessorProp(raw_ostream &OS, const Record *R, StringRef Name,
char Separator);
+ void EmitProcessorResourceSubUnits(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS);
void EmitProcessorResources(const CodeGenProcModel &ProcModel,
raw_ostream &OS);
Record *FindWriteResources(const CodeGenSchedRW &SchedWrite,
@@ -106,6 +113,10 @@ class SubtargetEmitter {
void EmitProcessorModels(raw_ostream &OS);
void EmitProcessorLookup(raw_ostream &OS);
void EmitSchedModelHelpers(const std::string &ClassName, raw_ostream &OS);
+ void emitSchedModelHelpersImpl(raw_ostream &OS,
+ bool OnlyExpandMCInstPredicates = false);
+ void emitGenMCSubtargetInfo(raw_ostream &OS);
+
void EmitSchedModel(raw_ostream &OS);
void EmitHwModeCheck(const std::string &ClassName, raw_ostream &OS);
void ParseFeaturesFunction(raw_ostream &OS, unsigned NumFeatures,
@@ -128,7 +139,7 @@ void SubtargetEmitter::Enumeration(raw_ostream &OS) {
// Get all records of class and sort
std::vector<Record*> DefList =
Records.getAllDerivedDefinitions("SubtargetFeature");
- std::sort(DefList.begin(), DefList.end(), LessRecord());
+ llvm::sort(DefList.begin(), DefList.end(), LessRecord());
unsigned N = DefList.size();
if (N == 0)
@@ -167,7 +178,7 @@ unsigned SubtargetEmitter::FeatureKeyValues(raw_ostream &OS) {
if (FeatureList.empty())
return 0;
- std::sort(FeatureList.begin(), FeatureList.end(), LessRecordFieldName());
+ llvm::sort(FeatureList.begin(), FeatureList.end(), LessRecordFieldName());
// Begin feature table
OS << "// Sorted (by key) array of values for CPU features.\n"
@@ -192,8 +203,7 @@ unsigned SubtargetEmitter::FeatureKeyValues(raw_ostream &OS) {
<< "\"" << Desc << "\", "
<< "{ " << Target << "::" << Name << " }, ";
- const std::vector<Record*> &ImpliesList =
- Feature->getValueAsListOfDefs("Implies");
+ RecVec ImpliesList = Feature->getValueAsListOfDefs("Implies");
OS << "{";
for (unsigned j = 0, M = ImpliesList.size(); j < M;) {
@@ -218,7 +228,7 @@ unsigned SubtargetEmitter::CPUKeyValues(raw_ostream &OS) {
// Gather and sort processor information
std::vector<Record*> ProcessorList =
Records.getAllDerivedDefinitions("Processor");
- std::sort(ProcessorList.begin(), ProcessorList.end(), LessRecordFieldName());
+ llvm::sort(ProcessorList.begin(), ProcessorList.end(), LessRecordFieldName());
// Begin processor table
OS << "// Sorted (by key) array of values for CPU subtype.\n"
@@ -228,8 +238,7 @@ unsigned SubtargetEmitter::CPUKeyValues(raw_ostream &OS) {
// For each processor
for (Record *Processor : ProcessorList) {
StringRef Name = Processor->getValueAsString("Name");
- const std::vector<Record*> &FeatureList =
- Processor->getValueAsListOfDefs("Features");
+ RecVec FeatureList = Processor->getValueAsListOfDefs("Features");
// Emit as { "cpu", "description", { f1 , f2 , ... fn } },
OS << " { "
@@ -261,8 +270,7 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name,
std::string &ItinString,
unsigned &NStages) {
// Get states list
- const std::vector<Record*> &StageList =
- ItinData->getValueAsListOfDefs("Stages");
+ RecVec StageList = ItinData->getValueAsListOfDefs("Stages");
// For each stage
unsigned N = NStages = StageList.size();
@@ -275,7 +283,7 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name,
ItinString += " { " + itostr(Cycles) + ", ";
// Get unit list
- const std::vector<Record*> &UnitList = Stage->getValueAsListOfDefs("Units");
+ RecVec UnitList = Stage->getValueAsListOfDefs("Units");
// For each unit
for (unsigned j = 0, M = UnitList.size(); j < M;) {
@@ -304,7 +312,7 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name,
void SubtargetEmitter::FormItineraryOperandCycleString(Record *ItinData,
std::string &ItinString, unsigned &NOperandCycles) {
// Get operand cycle list
- const std::vector<int64_t> &OperandCycleList =
+ std::vector<int64_t> OperandCycleList =
ItinData->getValueAsListOfInts("OperandCycles");
// For each operand cycle
@@ -322,8 +330,7 @@ void SubtargetEmitter::FormItineraryBypassString(const std::string &Name,
Record *ItinData,
std::string &ItinString,
unsigned NOperandCycles) {
- const std::vector<Record*> &BypassList =
- ItinData->getValueAsListOfDefs("Bypasses");
+ RecVec BypassList = ItinData->getValueAsListOfDefs("Bypasses");
unsigned N = BypassList.size();
unsigned i = 0;
for (; i < N;) {
@@ -354,7 +361,7 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
if (!ItinsDefSet.insert(ProcModel.ItinsDef).second)
continue;
- std::vector<Record*> FUs = ProcModel.ItinsDef->getValueAsListOfDefs("FU");
+ RecVec FUs = ProcModel.ItinsDef->getValueAsListOfDefs("FU");
if (FUs.empty())
continue;
@@ -368,9 +375,9 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
OS << "} // end namespace " << Name << "FU\n";
- std::vector<Record*> BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP");
+ RecVec BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP");
if (!BPs.empty()) {
- OS << "\n// Pipeline forwarding pathes for itineraries \"" << Name
+ OS << "\n// Pipeline forwarding paths for itineraries \"" << Name
<< "\"\n" << "namespace " << Name << "Bypass {\n";
OS << " const unsigned NoBypass = 0;\n";
@@ -442,7 +449,7 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
}
// Check to see if stage already exists and create if it doesn't
- unsigned FindStage = 0;
+ uint16_t FindStage = 0;
if (NStages > 0) {
FindStage = ItinStageMap[ItinStageString];
if (FindStage == 0) {
@@ -458,7 +465,7 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
}
// Check to see if operand cycle already exists and create if it doesn't
- unsigned FindOperandCycle = 0;
+ uint16_t FindOperandCycle = 0;
if (NOperandCycles > 0) {
std::string ItinOperandString = ItinOperandCycleString+ItinBypassString;
FindOperandCycle = ItinOperandMap[ItinOperandString];
@@ -480,10 +487,14 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
}
// Set up itinerary as location and location + stage count
- int NumUOps = ItinData ? ItinData->getValueAsInt("NumMicroOps") : 0;
- InstrItinerary Intinerary = { NumUOps, FindStage, FindStage + NStages,
- FindOperandCycle,
- FindOperandCycle + NOperandCycles };
+ int16_t NumUOps = ItinData ? ItinData->getValueAsInt("NumMicroOps") : 0;
+ InstrItinerary Intinerary = {
+ NumUOps,
+ FindStage,
+ uint16_t(FindStage + NStages),
+ FindOperandCycle,
+ uint16_t(FindOperandCycle + NOperandCycles),
+ };
// Inject - empty slots will be 0, 0
ItinList[SchedClassIdx] = Intinerary;
@@ -559,7 +570,8 @@ EmitItineraries(raw_ostream &OS,
", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n";
}
// End processor itinerary table
- OS << " { 0, ~0U, ~0U, ~0U, ~0U } // end marker\n";
+ OS << " { 0, uint16_t(~0U), uint16_t(~0U), uint16_t(~0U), uint16_t(~0U) }"
+ "// end marker\n";
OS << "};\n";
}
}
@@ -578,24 +590,216 @@ void SubtargetEmitter::EmitProcessorProp(raw_ostream &OS, const Record *R,
OS << '\n';
}
+void SubtargetEmitter::EmitProcessorResourceSubUnits(
+ const CodeGenProcModel &ProcModel, raw_ostream &OS) {
+ OS << "\nstatic const unsigned " << ProcModel.ModelName
+ << "ProcResourceSubUnits[] = {\n"
+ << " 0, // Invalid\n";
+
+ for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) {
+ Record *PRDef = ProcModel.ProcResourceDefs[i];
+ if (!PRDef->isSubClassOf("ProcResGroup"))
+ continue;
+ RecVec ResUnits = PRDef->getValueAsListOfDefs("Resources");
+ for (Record *RUDef : ResUnits) {
+ Record *const RU =
+ SchedModels.findProcResUnits(RUDef, ProcModel, PRDef->getLoc());
+ for (unsigned J = 0; J < RU->getValueAsInt("NumUnits"); ++J) {
+ OS << " " << ProcModel.getProcResourceIdx(RU) << ", ";
+ }
+ }
+ OS << " // " << PRDef->getName() << "\n";
+ }
+ OS << "};\n";
+}
+
+static void EmitRetireControlUnitInfo(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+ int64_t ReorderBufferSize = 0, MaxRetirePerCycle = 0;
+ if (Record *RCU = ProcModel.RetireControlUnit) {
+ ReorderBufferSize =
+ std::max(ReorderBufferSize, RCU->getValueAsInt("ReorderBufferSize"));
+ MaxRetirePerCycle =
+ std::max(MaxRetirePerCycle, RCU->getValueAsInt("MaxRetirePerCycle"));
+ }
+
+ OS << ReorderBufferSize << ", // ReorderBufferSize\n ";
+ OS << MaxRetirePerCycle << ", // MaxRetirePerCycle\n ";
+}
+
+static void EmitRegisterFileInfo(const CodeGenProcModel &ProcModel,
+ unsigned NumRegisterFiles,
+ unsigned NumCostEntries, raw_ostream &OS) {
+ if (NumRegisterFiles)
+ OS << ProcModel.ModelName << "RegisterFiles,\n " << (1 + NumRegisterFiles);
+ else
+ OS << "nullptr,\n 0";
+
+ OS << ", // Number of register files.\n ";
+ if (NumCostEntries)
+ OS << ProcModel.ModelName << "RegisterCosts,\n ";
+ else
+ OS << "nullptr,\n ";
+ OS << NumCostEntries << ", // Number of register cost entries.\n";
+}
+
+unsigned
+SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+ if (llvm::all_of(ProcModel.RegisterFiles, [](const CodeGenRegisterFile &RF) {
+ return RF.hasDefaultCosts();
+ }))
+ return 0;
+
+ // Print the RegisterCost table first.
+ OS << "\n// {RegisterClassID, Register Cost}\n";
+ OS << "static const llvm::MCRegisterCostEntry " << ProcModel.ModelName
+ << "RegisterCosts"
+ << "[] = {\n";
+
+ for (const CodeGenRegisterFile &RF : ProcModel.RegisterFiles) {
+ // Skip register files with a default cost table.
+ if (RF.hasDefaultCosts())
+ continue;
+ // Add entries to the cost table.
+ for (const CodeGenRegisterCost &RC : RF.Costs) {
+ OS << " { ";
+ Record *Rec = RC.RCDef;
+ if (Rec->getValue("Namespace"))
+ OS << Rec->getValueAsString("Namespace") << "::";
+ OS << Rec->getName() << "RegClassID, " << RC.Cost << "},\n";
+ }
+ }
+ OS << "};\n";
+
+ // Now generate a table with register file info.
+ OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}\n";
+ OS << "static const llvm::MCRegisterFileDesc " << ProcModel.ModelName
+ << "RegisterFiles"
+ << "[] = {\n"
+ << " { \"InvalidRegisterFile\", 0, 0, 0 },\n";
+ unsigned CostTblIndex = 0;
+
+ for (const CodeGenRegisterFile &RD : ProcModel.RegisterFiles) {
+ OS << " { ";
+ OS << '"' << RD.Name << '"' << ", " << RD.NumPhysRegs << ", ";
+ unsigned NumCostEntries = RD.Costs.size();
+ OS << NumCostEntries << ", " << CostTblIndex << "},\n";
+ CostTblIndex += NumCostEntries;
+ }
+ OS << "};\n";
+
+ return CostTblIndex;
+}
+
+static bool EmitPfmIssueCountersTable(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+ unsigned NumCounterDefs = 1 + ProcModel.ProcResourceDefs.size();
+ std::vector<const Record *> CounterDefs(NumCounterDefs);
+ bool HasCounters = false;
+ for (const Record *CounterDef : ProcModel.PfmIssueCounterDefs) {
+ const Record *&CD = CounterDefs[ProcModel.getProcResourceIdx(
+ CounterDef->getValueAsDef("Resource"))];
+ if (CD) {
+ PrintFatalError(CounterDef->getLoc(),
+ "multiple issue counters for " +
+ CounterDef->getValueAsDef("Resource")->getName());
+ }
+ CD = CounterDef;
+ HasCounters = true;
+ }
+ if (!HasCounters) {
+ return false;
+ }
+ OS << "\nstatic const char* " << ProcModel.ModelName
+ << "PfmIssueCounters[] = {\n";
+ for (unsigned i = 0; i != NumCounterDefs; ++i) {
+ const Record *CounterDef = CounterDefs[i];
+ if (CounterDef) {
+ const auto PfmCounters = CounterDef->getValueAsListOfStrings("Counters");
+ if (PfmCounters.empty())
+ PrintFatalError(CounterDef->getLoc(), "empty counter list");
+ OS << " \"" << PfmCounters[0];
+ for (unsigned p = 1, e = PfmCounters.size(); p != e; ++p)
+ OS << ",\" \"" << PfmCounters[p];
+ OS << "\", // #" << i << " = ";
+ OS << CounterDef->getValueAsDef("Resource")->getName() << "\n";
+ } else {
+ OS << " nullptr, // #" << i << "\n";
+ }
+ }
+ OS << "};\n";
+ return true;
+}
+
+static void EmitPfmCounters(const CodeGenProcModel &ProcModel,
+ const bool HasPfmIssueCounters, raw_ostream &OS) {
+ OS << " {\n";
+ // Emit the cycle counter.
+ if (ProcModel.PfmCycleCounterDef)
+ OS << " \"" << ProcModel.PfmCycleCounterDef->getValueAsString("Counter")
+ << "\", // Cycle counter.\n";
+ else
+ OS << " nullptr, // No cycle counter.\n";
+
+ // Emit a reference to issue counters table.
+ if (HasPfmIssueCounters)
+ OS << " " << ProcModel.ModelName << "PfmIssueCounters\n";
+ else
+ OS << " nullptr // No issue counters.\n";
+ OS << " }\n";
+}
+
+void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+  // Generate a table of register file descriptors (one entry for each
+  // user-defined register file), and a table of register costs.
+ unsigned NumCostEntries = EmitRegisterFileTables(ProcModel, OS);
+
+ // Generate a table of ProcRes counter names.
+ const bool HasPfmIssueCounters = EmitPfmIssueCountersTable(ProcModel, OS);
+
+ // Now generate a table for the extra processor info.
+ OS << "\nstatic const llvm::MCExtraProcessorInfo " << ProcModel.ModelName
+ << "ExtraInfo = {\n ";
+
+ // Add information related to the retire control unit.
+ EmitRetireControlUnitInfo(ProcModel, OS);
+
+ // Add information related to the register files (i.e. where to find register
+ // file descriptors and register costs).
+ EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
+ NumCostEntries, OS);
+
+ EmitPfmCounters(ProcModel, HasPfmIssueCounters, OS);
+
+ OS << "};\n";
+}
+
void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,
raw_ostream &OS) {
- OS << "\n// {Name, NumUnits, SuperIdx, IsBuffered}\n";
- OS << "static const llvm::MCProcResourceDesc "
- << ProcModel.ModelName << "ProcResources" << "[] = {\n"
- << " {DBGFIELD(\"InvalidUnit\") 0, 0, 0},\n";
+ EmitProcessorResourceSubUnits(ProcModel, OS);
+ OS << "\n// {Name, NumUnits, SuperIdx, IsBuffered, SubUnitsIdxBegin}\n";
+ OS << "static const llvm::MCProcResourceDesc " << ProcModel.ModelName
+ << "ProcResources"
+ << "[] = {\n"
+ << " {\"InvalidUnit\", 0, 0, 0, 0},\n";
+
+ unsigned SubUnitsOffset = 1;
for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) {
Record *PRDef = ProcModel.ProcResourceDefs[i];
Record *SuperDef = nullptr;
unsigned SuperIdx = 0;
unsigned NumUnits = 0;
+ const unsigned SubUnitsBeginOffset = SubUnitsOffset;
int BufferSize = PRDef->getValueAsInt("BufferSize");
if (PRDef->isSubClassOf("ProcResGroup")) {
RecVec ResUnits = PRDef->getValueAsListOfDefs("Resources");
for (Record *RU : ResUnits) {
NumUnits += RU->getValueAsInt("NumUnits");
+ SubUnitsOffset += RU->getValueAsInt("NumUnits");
}
}
else {
@@ -609,11 +813,17 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,
NumUnits = PRDef->getValueAsInt("NumUnits");
}
// Emit the ProcResourceDesc
- OS << " {DBGFIELD(\"" << PRDef->getName() << "\") ";
+ OS << " {\"" << PRDef->getName() << "\", ";
if (PRDef->getName().size() < 15)
OS.indent(15 - PRDef->getName().size());
- OS << NumUnits << ", " << SuperIdx << ", "
- << BufferSize << "}, // #" << i+1;
+ OS << NumUnits << ", " << SuperIdx << ", " << BufferSize << ", ";
+ if (SubUnitsBeginOffset != SubUnitsOffset) {
+ OS << ProcModel.ModelName << "ProcResourceSubUnits + "
+ << SubUnitsBeginOffset;
+ } else {
+ OS << "nullptr";
+ }
+ OS << "}, // #" << i+1;
if (SuperDef)
OS << ", Super=" << SuperDef->getName();
OS << "\n";
@@ -731,8 +941,7 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead,
void SubtargetEmitter::ExpandProcResources(RecVec &PRVec,
std::vector<int64_t> &Cycles,
const CodeGenProcModel &PM) {
- // Default to 1 resource cycle.
- Cycles.resize(PRVec.size(), 1);
+ assert(PRVec.size() == Cycles.size() && "failed precondition");
for (unsigned i = 0, e = PRVec.size(); i != e; ++i) {
Record *PRDef = PRVec[i];
RecVec SubResources;
@@ -783,9 +992,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
return;
std::vector<MCSchedClassDesc> &SCTab = SchedTables.ProcSchedClasses.back();
- DEBUG(dbgs() << "\n+++ SCHED CLASSES (GenSchedClassTables) +++\n");
+ LLVM_DEBUG(dbgs() << "\n+++ SCHED CLASSES (GenSchedClassTables) +++\n");
for (const CodeGenSchedClass &SC : SchedModels.schedClasses()) {
- DEBUG(SC.dump(&SchedModels));
+ LLVM_DEBUG(SC.dump(&SchedModels));
SCTab.resize(SCTab.size() + 1);
MCSchedClassDesc &SCDesc = SCTab.back();
@@ -823,7 +1032,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
IdxVec Writes = SC.Writes;
IdxVec Reads = SC.Reads;
if (!SC.InstRWs.empty()) {
- // This class has a default ReadWrite list which can be overriden by
+ // This class has a default ReadWrite list which can be overridden by
// InstRW definitions.
Record *RWDef = nullptr;
for (Record *RW : SC.InstRWs) {
@@ -851,8 +1060,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
}
}
if (Writes.empty()) {
- DEBUG(dbgs() << ProcModel.ModelName
- << " does not have resources for class " << SC.Name << '\n');
+ LLVM_DEBUG(dbgs() << ProcModel.ModelName
+ << " does not have resources for class " << SC.Name
+ << '\n');
}
}
// Sum resources across all operand writes.
@@ -900,6 +1110,21 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
std::vector<int64_t> Cycles =
WriteRes->getValueAsListOfInts("ResourceCycles");
+ if (Cycles.empty()) {
+ // If ResourceCycles is not provided, default to one cycle per
+ // resource.
+ Cycles.resize(PRVec.size(), 1);
+ } else if (Cycles.size() != PRVec.size()) {
+ // If ResourceCycles is provided, check consistency.
+ PrintFatalError(
+ WriteRes->getLoc(),
+ Twine("Inconsistent resource cycles: !size(ResourceCycles) != "
+ "!size(ProcResources): ")
+ .concat(Twine(PRVec.size()))
+ .concat(" vs ")
+ .concat(Twine(Cycles.size())));
+ }
+
ExpandProcResources(PRVec, Cycles, ProcModel);
for (unsigned PRIdx = 0, PREnd = PRVec.size();
@@ -949,7 +1174,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
WriteIDs.push_back(SchedModels.getSchedRWIdx(VW, /*IsRead=*/false));
}
}
- std::sort(WriteIDs.begin(), WriteIDs.end());
+ llvm::sort(WriteIDs.begin(), WriteIDs.end());
for(unsigned W : WriteIDs) {
MCReadAdvanceEntry RAEntry;
RAEntry.UseIdx = UseIdx;
@@ -967,8 +1192,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
// compression.
//
// WritePrecRes entries are sorted by ProcResIdx.
- std::sort(WriteProcResources.begin(), WriteProcResources.end(),
- LessWriteProcResources());
+ llvm::sort(WriteProcResources.begin(), WriteProcResources.end(),
+ LessWriteProcResources());
SCDesc.NumWriteProcResEntries = WriteProcResources.size();
std::vector<MCWriteProcResEntry>::iterator WPRPos =
@@ -1119,6 +1344,9 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables,
void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
// For each processor model.
for (const CodeGenProcModel &PM : SchedModels.procModels()) {
+ // Emit extra processor info if available.
+ if (PM.hasExtraProcessorInfo())
+ EmitExtraProcessorInfo(PM, OS);
// Emit processor resource table.
if (PM.hasInstrSchedModel())
EmitProcessorResources(PM, OS);
@@ -1159,9 +1387,13 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
OS << " nullptr, nullptr, 0, 0,"
<< " // No instruction-level machine model.\n";
if (PM.hasItineraries())
- OS << " " << PM.ItinsDef->getName() << "\n";
+ OS << " " << PM.ItinsDef->getName() << ",\n";
+ else
+ OS << " nullptr, // No Itinerary\n";
+ if (PM.hasExtraProcessorInfo())
+ OS << " &" << PM.ModelName << "ExtraInfo,\n";
else
- OS << " nullptr // No Itinerary\n";
+ OS << " nullptr // No extra processor descriptor\n";
OS << "};\n";
}
}
@@ -1173,7 +1405,7 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
// Gather and sort processor information
std::vector<Record*> ProcessorList =
Records.getAllDerivedDefinitions("Processor");
- std::sort(ProcessorList.begin(), ProcessorList.end(), LessRecordFieldName());
+ llvm::sort(ProcessorList.begin(), ProcessorList.end(), LessRecordFieldName());
// Begin processor table
OS << "\n";
@@ -1231,58 +1463,111 @@ void SubtargetEmitter::EmitSchedModel(raw_ostream &OS) {
OS << "\n#undef DBGFIELD";
}
-void SubtargetEmitter::EmitSchedModelHelpers(const std::string &ClassName,
- raw_ostream &OS) {
- OS << "unsigned " << ClassName
- << "\n::resolveSchedClass(unsigned SchedClass, const MachineInstr *MI,"
- << " const TargetSchedModel *SchedModel) const {\n";
+static void emitPredicateProlog(const RecordKeeper &Records, raw_ostream &OS) {
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+
+ // Collect all the PredicateProlog records and print them to the output
+ // stream.
+ std::vector<Record *> Prologs =
+ Records.getAllDerivedDefinitions("PredicateProlog");
+ llvm::sort(Prologs.begin(), Prologs.end(), LessRecord());
+ for (Record *P : Prologs)
+ Stream << P->getValueAsString("Code") << '\n';
- std::vector<Record*> Prologs = Records.getAllDerivedDefinitions("PredicateProlog");
- std::sort(Prologs.begin(), Prologs.end(), LessRecord());
- for (Record *P : Prologs) {
- OS << P->getValueAsString("Code") << '\n';
+ Stream.flush();
+ OS << Buffer;
+}
+
+static void emitPredicates(const CodeGenSchedTransition &T,
+ const CodeGenSchedClass &SC,
+ PredicateExpander &PE,
+ raw_ostream &OS) {
+ std::string Buffer;
+ raw_string_ostream StringStream(Buffer);
+ formatted_raw_ostream FOS(StringStream);
+
+ FOS.PadToColumn(6);
+ FOS << "if (";
+ for (RecIter RI = T.PredTerm.begin(), RE = T.PredTerm.end(); RI != RE; ++RI) {
+ if (RI != T.PredTerm.begin()) {
+ FOS << "\n";
+ FOS.PadToColumn(8);
+ FOS << "&& ";
+ }
+ const Record *Rec = *RI;
+ if (Rec->isSubClassOf("MCSchedPredicate"))
+ PE.expandPredicate(FOS, Rec->getValueAsDef("Pred"));
+ else
+ FOS << "(" << Rec->getValueAsString("Predicate") << ")";
}
+
+ FOS << ")\n";
+ FOS.PadToColumn(8);
+ FOS << "return " << T.ToClassIdx << "; // " << SC.Name << '\n';
+ FOS.flush();
+ OS << Buffer;
+}
+
+void SubtargetEmitter::emitSchedModelHelpersImpl(
+ raw_ostream &OS, bool OnlyExpandMCInstPredicates) {
+ // Collect Variant Classes.
IdxVec VariantClasses;
for (const CodeGenSchedClass &SC : SchedModels.schedClasses()) {
if (SC.Transitions.empty())
continue;
VariantClasses.push_back(SC.Index);
}
+
if (!VariantClasses.empty()) {
- OS << " switch (SchedClass) {\n";
+ bool FoundPredicates = false;
for (unsigned VC : VariantClasses) {
+ // Emit code for each variant scheduling class.
const CodeGenSchedClass &SC = SchedModels.getSchedClass(VC);
- OS << " case " << VC << ": // " << SC.Name << '\n';
IdxVec ProcIndices;
for (const CodeGenSchedTransition &T : SC.Transitions) {
+ if (OnlyExpandMCInstPredicates &&
+ !all_of(T.PredTerm, [](const Record *Rec) {
+ return Rec->isSubClassOf("MCSchedPredicate");
+ }))
+ continue;
+
IdxVec PI;
std::set_union(T.ProcIndices.begin(), T.ProcIndices.end(),
ProcIndices.begin(), ProcIndices.end(),
std::back_inserter(PI));
ProcIndices.swap(PI);
}
+ if (ProcIndices.empty())
+ continue;
+
+ // Emit a switch statement only if there are predicates to expand.
+ if (!FoundPredicates) {
+ OS << " switch (SchedClass) {\n";
+ FoundPredicates = true;
+ }
+
+ OS << " case " << VC << ": // " << SC.Name << '\n';
+ PredicateExpander PE;
+ PE.setByRef(false);
+ PE.setExpandForMC(OnlyExpandMCInstPredicates);
for (unsigned PI : ProcIndices) {
OS << " ";
- if (PI != 0)
- OS << "if (SchedModel->getProcessorID() == " << PI << ") ";
- OS << "{ // " << (SchedModels.procModelBegin() + PI)->ModelName
- << '\n';
+ if (PI != 0) {
+ OS << (OnlyExpandMCInstPredicates
+ ? "if (CPUID == "
+ : "if (SchedModel->getProcessorID() == ");
+ OS << PI << ") ";
+ }
+ OS << "{ // " << (SchedModels.procModelBegin() + PI)->ModelName << '\n';
+
for (const CodeGenSchedTransition &T : SC.Transitions) {
- if (PI != 0 && !std::count(T.ProcIndices.begin(),
- T.ProcIndices.end(), PI)) {
- continue;
- }
- OS << " if (";
- for (RecIter RI = T.PredTerm.begin(), RE = T.PredTerm.end();
- RI != RE; ++RI) {
- if (RI != T.PredTerm.begin())
- OS << "\n && ";
- OS << "(" << (*RI)->getValueAsString("Predicate") << ")";
- }
- OS << ")\n"
- << " return " << T.ToClassIdx << "; // "
- << SchedModels.getSchedClass(T.ToClassIdx).Name << '\n';
+ if (PI != 0 && !count(T.ProcIndices, PI))
+ continue;
+ PE.setIndentLevel(4);
+ emitPredicates(T, SchedModels.getSchedClass(T.ToClassIdx), PE, OS);
}
+
OS << " }\n";
if (PI == 0)
break;
@@ -1291,10 +1576,40 @@ void SubtargetEmitter::EmitSchedModelHelpers(const std::string &ClassName,
OS << " return " << SC.Index << ";\n";
OS << " break;\n";
}
- OS << " };\n";
+
+ if (FoundPredicates)
+ OS << " };\n";
}
- OS << " report_fatal_error(\"Expected a variant SchedClass\");\n"
- << "} // " << ClassName << "::resolveSchedClass\n";
+
+ if (OnlyExpandMCInstPredicates) {
+ OS << " // Don't know how to resolve this scheduling class.\n"
+ << " return 0;\n";
+ return;
+ }
+
+ OS << " report_fatal_error(\"Expected a variant SchedClass\");\n";
+}
+
+void SubtargetEmitter::EmitSchedModelHelpers(const std::string &ClassName,
+ raw_ostream &OS) {
+ OS << "unsigned " << ClassName
+ << "\n::resolveSchedClass(unsigned SchedClass, const MachineInstr *MI,"
+ << " const TargetSchedModel *SchedModel) const {\n";
+
+ // Emit the predicate prolog code.
+ emitPredicateProlog(Records, OS);
+
+ // Emit target predicates.
+ emitSchedModelHelpersImpl(OS);
+
+ OS << "} // " << ClassName << "::resolveSchedClass\n\n";
+
+ OS << "unsigned " << ClassName
+ << "\n::resolveVariantSchedClass(unsigned SchedClass, const MCInst *MI,"
+ << " unsigned CPUID) const {\n"
+ << " return " << Target << "_MC"
+ << "::resolveVariantSchedClassImpl(SchedClass, MI, CPUID);\n"
+ << "} // " << ClassName << "::resolveVariantSchedClass\n";
}
void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName,
@@ -1322,15 +1637,15 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS,
unsigned NumProcs) {
std::vector<Record*> Features =
Records.getAllDerivedDefinitions("SubtargetFeature");
- std::sort(Features.begin(), Features.end(), LessRecord());
+ llvm::sort(Features.begin(), Features.end(), LessRecord());
OS << "// ParseSubtargetFeatures - Parses features string setting specified\n"
<< "// subtarget options.\n"
<< "void llvm::";
OS << Target;
OS << "Subtarget::ParseSubtargetFeatures(StringRef CPU, StringRef FS) {\n"
- << " DEBUG(dbgs() << \"\\nFeatures:\" << FS);\n"
- << " DEBUG(dbgs() << \"\\nCPU:\" << CPU << \"\\n\\n\");\n";
+ << " LLVM_DEBUG(dbgs() << \"\\nFeatures:\" << FS);\n"
+ << " LLVM_DEBUG(dbgs() << \"\\nCPU:\" << CPU << \"\\n\\n\");\n";
if (Features.empty()) {
OS << "}\n";
@@ -1360,6 +1675,34 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS,
OS << "}\n";
}
+void SubtargetEmitter::emitGenMCSubtargetInfo(raw_ostream &OS) {
+ OS << "namespace " << Target << "_MC {\n"
+ << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass,\n"
+ << " const MCInst *MI, unsigned CPUID) {\n";
+ emitSchedModelHelpersImpl(OS, /* OnlyExpandMCPredicates */ true);
+ OS << "}\n";
+ OS << "} // end of namespace " << Target << "_MC\n\n";
+
+ OS << "struct " << Target
+ << "GenMCSubtargetInfo : public MCSubtargetInfo {\n";
+ OS << " " << Target << "GenMCSubtargetInfo(const Triple &TT, \n"
+ << " StringRef CPU, StringRef FS, ArrayRef<SubtargetFeatureKV> PF,\n"
+ << " ArrayRef<SubtargetFeatureKV> PD,\n"
+ << " const SubtargetInfoKV *ProcSched,\n"
+ << " const MCWriteProcResEntry *WPR,\n"
+ << " const MCWriteLatencyEntry *WL,\n"
+ << " const MCReadAdvanceEntry *RA, const InstrStage *IS,\n"
+ << " const unsigned *OC, const unsigned *FP) :\n"
+ << " MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched,\n"
+ << " WPR, WL, RA, IS, OC, FP) { }\n\n"
+ << " unsigned resolveVariantSchedClass(unsigned SchedClass,\n"
+ << " const MCInst *MI, unsigned CPUID) const override {\n"
+ << " return " << Target << "_MC"
+ << "::resolveVariantSchedClassImpl(SchedClass, MI, CPUID); \n";
+ OS << " }\n";
+ OS << "};\n";
+}
+
//
// SubtargetEmitter::run - Main subtarget enumeration emitter.
//
@@ -1392,10 +1735,12 @@ void SubtargetEmitter::run(raw_ostream &OS) {
#endif
// MCInstrInfo initialization routine.
+ emitGenMCSubtargetInfo(OS);
+
OS << "\nstatic inline MCSubtargetInfo *create" << Target
<< "MCSubtargetInfoImpl("
<< "const Triple &TT, StringRef CPU, StringRef FS) {\n";
- OS << " return new MCSubtargetInfo(TT, CPU, FS, ";
+ OS << " return new " << Target << "GenMCSubtargetInfo(TT, CPU, FS, ";
if (NumFeatures)
OS << Target << "FeatureKV, ";
else
@@ -1438,6 +1783,10 @@ void SubtargetEmitter::run(raw_ostream &OS) {
std::string ClassName = Target + "GenSubtargetInfo";
OS << "namespace llvm {\n";
OS << "class DFAPacketizer;\n";
+ OS << "namespace " << Target << "_MC {\n"
+ << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass,"
+ << " const MCInst *MI, unsigned CPUID);\n"
+ << "}\n\n";
OS << "struct " << ClassName << " : public TargetSubtargetInfo {\n"
<< " explicit " << ClassName << "(const Triple &TT, StringRef CPU, "
<< "StringRef FS);\n"
@@ -1445,6 +1794,8 @@ void SubtargetEmitter::run(raw_ostream &OS) {
<< " unsigned resolveSchedClass(unsigned SchedClass, "
<< " const MachineInstr *DefMI,"
<< " const TargetSchedModel *SchedModel) const override;\n"
+ << " unsigned resolveVariantSchedClass(unsigned SchedClass,"
+ << " const MCInst *MI, unsigned CPUID) const override;\n"
<< " DFAPacketizer *createDFAPacketizer(const InstrItineraryData *IID)"
<< " const;\n";
if (TGT.getHwModes().getNumModeIds() > 1)
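As a reading aid, the emitGenMCSubtargetInfo routine added above prints a per-target MCSubtargetInfo subclass. Reconstructed from its stream statements for a hypothetical target named Foo, the generated code has roughly the shape below; treat it as an approximation rather than verbatim output, and note that it assumes the usual generated-file context (llvm/MC headers included, using namespace llvm in effect).

// Approximate shape of the output of emitGenMCSubtargetInfo for a
// hypothetical target "Foo", pieced together from the OS << statements above.
namespace Foo_MC {
unsigned resolveVariantSchedClassImpl(unsigned SchedClass, const MCInst *MI,
                                      unsigned CPUID) {
  // Body produced by emitSchedModelHelpersImpl(OS, /*OnlyExpandMCInstPredicates=*/true):
  // a switch over variant scheduling classes, guarded by "if (CPUID == N)" and
  // expanding only MCSchedPredicate terms.
  return 0; // Classes that cannot be resolved from MC alone fall through to 0.
}
} // end of namespace Foo_MC

struct FooGenMCSubtargetInfo : public MCSubtargetInfo {
  FooGenMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS,
                        ArrayRef<SubtargetFeatureKV> PF,
                        ArrayRef<SubtargetFeatureKV> PD,
                        const SubtargetInfoKV *ProcSched,
                        const MCWriteProcResEntry *WPR,
                        const MCWriteLatencyEntry *WL,
                        const MCReadAdvanceEntry *RA, const InstrStage *IS,
                        const unsigned *OC, const unsigned *FP)
      : MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC,
                        FP) {}

  unsigned resolveVariantSchedClass(unsigned SchedClass, const MCInst *MI,
                                    unsigned CPUID) const override {
    return Foo_MC::resolveVariantSchedClassImpl(SchedClass, MI, CPUID);
  }
};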
diff --git a/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
index 5153c35b1261..f9b8853cc117 100644
--- a/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
+++ b/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
@@ -10,6 +10,7 @@
#include "SubtargetFeatureInfo.h"
#include "Types.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/TableGen/Record.h"
#include <map>
diff --git a/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.h b/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.h
index c55c16a4031e..71e6748c863f 100644
--- a/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.h
+++ b/contrib/llvm/utils/TableGen/SubtargetFeatureInfo.h
@@ -27,20 +27,20 @@ using SubtargetFeatureInfoMap = std::map<Record *, SubtargetFeatureInfo, LessRec
/// Helper class for storing information on a subtarget feature which
/// participates in instruction matching.
struct SubtargetFeatureInfo {
- /// \brief The predicate record for this feature.
+ /// The predicate record for this feature.
Record *TheDef;
- /// \brief An unique index assigned to represent this feature.
+  /// A unique index assigned to represent this feature.
uint64_t Index;
SubtargetFeatureInfo(Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {}
- /// \brief The name of the enumerated constant identifying this feature.
+ /// The name of the enumerated constant identifying this feature.
std::string getEnumName() const {
return "Feature_" + TheDef->getName().str();
}
- /// \brief The name of the enumerated constant identifying the bitnumber for
+ /// The name of the enumerated constant identifying the bitnumber for
/// this feature.
std::string getEnumBitName() const {
return "Feature_" + TheDef->getName().str() + "Bit";
diff --git a/contrib/llvm/utils/TableGen/TableGen.cpp b/contrib/llvm/utils/TableGen/TableGen.cpp
index b0e0385a45c7..b78260625cb2 100644
--- a/contrib/llvm/utils/TableGen/TableGen.cpp
+++ b/contrib/llvm/utils/TableGen/TableGen.cpp
@@ -24,6 +24,7 @@ using namespace llvm;
enum ActionType {
PrintRecords,
+ DumpJSON,
GenEmitter,
GenRegisterInfo,
GenInstrInfo,
@@ -32,13 +33,16 @@ enum ActionType {
GenAsmMatcher,
GenDisassembler,
GenPseudoLowering,
+ GenCompressInst,
GenCallingConv,
GenDAGISel,
GenDFAPacketizer,
GenFastISel,
GenSubtarget,
- GenIntrinsic,
- GenTgtIntrinsic,
+ GenIntrinsicEnums,
+ GenIntrinsicImpl,
+ GenTgtIntrinsicEnums,
+ GenTgtIntrinsicImpl,
PrintEnums,
PrintSets,
GenOptParserDefs,
@@ -56,6 +60,8 @@ namespace {
Action(cl::desc("Action to perform:"),
cl::values(clEnumValN(PrintRecords, "print-records",
"Print all records to stdout (default)"),
+ clEnumValN(DumpJSON, "dump-json",
+ "Dump all records as machine-readable JSON"),
clEnumValN(GenEmitter, "gen-emitter",
"Generate machine code emitter"),
clEnumValN(GenRegisterInfo, "gen-register-info",
@@ -72,6 +78,8 @@ namespace {
"Generate disassembler"),
clEnumValN(GenPseudoLowering, "gen-pseudo-lowering",
"Generate pseudo instruction lowering"),
+ clEnumValN(GenCompressInst, "gen-compress-inst-emitter",
+ "Generate RISCV compressed instructions."),
clEnumValN(GenAsmMatcher, "gen-asm-matcher",
"Generate assembly instruction matcher"),
clEnumValN(GenDAGISel, "gen-dag-isel",
@@ -82,9 +90,13 @@ namespace {
"Generate a \"fast\" instruction selector"),
clEnumValN(GenSubtarget, "gen-subtarget",
"Generate subtarget enumerations"),
- clEnumValN(GenIntrinsic, "gen-intrinsic",
+ clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums",
+ "Generate intrinsic enums"),
+ clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl",
"Generate intrinsic information"),
- clEnumValN(GenTgtIntrinsic, "gen-tgt-intrinsic",
+ clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums",
+ "Generate target intrinsic enums"),
+ clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl",
"Generate target intrinsic information"),
clEnumValN(PrintEnums, "print-enums",
"Print enum values for a class"),
@@ -117,6 +129,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case PrintRecords:
OS << Records; // No argument, dump all contents
break;
+ case DumpJSON:
+ EmitJSON(Records, OS);
+ break;
case GenEmitter:
EmitCodeEmitter(Records, OS);
break;
@@ -144,6 +159,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case GenPseudoLowering:
EmitPseudoLowering(Records, OS);
break;
+ case GenCompressInst:
+ EmitCompressInst(Records, OS);
+ break;
case GenDAGISel:
EmitDAGISel(Records, OS);
break;
@@ -156,11 +174,17 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case GenSubtarget:
EmitSubtarget(Records, OS);
break;
- case GenIntrinsic:
- EmitIntrinsics(Records, OS);
+ case GenIntrinsicEnums:
+ EmitIntrinsicEnums(Records, OS);
+ break;
+ case GenIntrinsicImpl:
+ EmitIntrinsicImpl(Records, OS);
+ break;
+ case GenTgtIntrinsicEnums:
+ EmitIntrinsicEnums(Records, OS, true);
break;
- case GenTgtIntrinsic:
- EmitIntrinsics(Records, OS, true);
+ case GenTgtIntrinsicImpl:
+ EmitIntrinsicImpl(Records, OS, true);
break;
case GenOptParserDefs:
EmitOptParser(Records, OS);
diff --git a/contrib/llvm/utils/TableGen/TableGenBackends.h b/contrib/llvm/utils/TableGen/TableGenBackends.h
index 914cd5a1fc9b..1329a6d833f4 100644
--- a/contrib/llvm/utils/TableGen/TableGenBackends.h
+++ b/contrib/llvm/utils/TableGen/TableGenBackends.h
@@ -62,7 +62,10 @@ namespace llvm {
class raw_ostream;
class RecordKeeper;
-void EmitIntrinsics(RecordKeeper &RK, raw_ostream &OS, bool TargetOnly = false);
+void EmitIntrinsicEnums(RecordKeeper &RK, raw_ostream &OS,
+ bool TargetOnly = false);
+void EmitIntrinsicImpl(RecordKeeper &RK, raw_ostream &OS,
+ bool TargetOnly = false);
void EmitAsmMatcher(RecordKeeper &RK, raw_ostream &OS);
void EmitAsmWriter(RecordKeeper &RK, raw_ostream &OS);
void EmitCallingConv(RecordKeeper &RK, raw_ostream &OS);
@@ -74,6 +77,7 @@ void EmitFastISel(RecordKeeper &RK, raw_ostream &OS);
void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS);
void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS);
void EmitPseudoLowering(RecordKeeper &RK, raw_ostream &OS);
+void EmitCompressInst(RecordKeeper &RK, raw_ostream &OS);
void EmitRegisterInfo(RecordKeeper &RK, raw_ostream &OS);
void EmitSubtarget(RecordKeeper &RK, raw_ostream &OS);
void EmitMapTable(RecordKeeper &RK, raw_ostream &OS);
diff --git a/contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
new file mode 100644
index 000000000000..df63337d5637
--- /dev/null
+++ b/contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -0,0 +1,116 @@
+//===- WebAssemblyDisassemblerEmitter.cpp - Disassembler tables -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the WebAssembly Disassembler Emitter.
+// It contains the implementation of the disassembler tables.
+// Documentation for the disassembler emitter in general can be found in
+// WebAssemblyDisassemblerEmitter.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyDisassemblerEmitter.h"
+#include "llvm/TableGen/Record.h"
+
+namespace llvm {
+
+void emitWebAssemblyDisassemblerTables(
+ raw_ostream &OS,
+ const ArrayRef<const CodeGenInstruction *> &NumberedInstructions) {
+  // First let's organize all opcodes by (prefix) byte. Prefix 0 is the
+ // starting table.
+ std::map<unsigned,
+ std::map<unsigned, std::pair<unsigned, const CodeGenInstruction *>>>
+ OpcodeTable;
+ for (unsigned I = 0; I != NumberedInstructions.size(); ++I) {
+ auto &CGI = *NumberedInstructions[I];
+ auto &Def = *CGI.TheDef;
+ if (!Def.getValue("Inst"))
+ continue;
+ auto &Inst = *Def.getValueAsBitsInit("Inst");
+ auto Opc = static_cast<unsigned>(
+ reinterpret_cast<IntInit *>(Inst.convertInitializerTo(IntRecTy::get()))
+ ->getValue());
+ if (Opc == 0xFFFFFFFF)
+ continue; // No opcode defined.
+ assert(Opc <= 0xFFFF);
+ auto Prefix = Opc >> 8;
+ Opc = Opc & 0xFF;
+ auto &CGIP = OpcodeTable[Prefix][Opc];
+ if (!CGIP.second ||
+        // Make sure we store the variant with the fewest operands,
+        // which is the one without explicit registers. Only a few instructions
+        // have these currently; it would be good to have them for all.
+ // FIXME: this picks the first of many typed variants, which is
+ // currently the except_ref one, though this shouldn't matter for
+ // disassembly purposes.
+ CGIP.second->Operands.OperandList.size() >
+ CGI.Operands.OperandList.size()) {
+ CGIP = std::make_pair(I, &CGI);
+ }
+ }
+ OS << "#include \"MCTargetDesc/WebAssemblyMCTargetDesc.h\"\n";
+ OS << "\n";
+ OS << "namespace llvm {\n\n";
+ OS << "enum EntryType : uint8_t { ";
+ OS << "ET_Unused, ET_Prefix, ET_Instruction };\n\n";
+ OS << "struct WebAssemblyInstruction {\n";
+ OS << " uint16_t Opcode;\n";
+ OS << " EntryType ET;\n";
+ OS << " uint8_t NumOperands;\n";
+ OS << " uint8_t Operands[4];\n";
+ OS << "};\n\n";
+ // Output one table per prefix.
+ for (auto &PrefixPair : OpcodeTable) {
+ if (PrefixPair.second.empty())
+ continue;
+ OS << "WebAssemblyInstruction InstructionTable" << PrefixPair.first;
+ OS << "[] = {\n";
+ for (unsigned I = 0; I <= 0xFF; I++) {
+ auto InstIt = PrefixPair.second.find(I);
+ if (InstIt != PrefixPair.second.end()) {
+ // Regular instruction.
+ assert(InstIt->second.second);
+ auto &CGI = *InstIt->second.second;
+ OS << " // 0x";
+ OS.write_hex(static_cast<unsigned long long>(I));
+ OS << ": " << CGI.AsmString << "\n";
+ OS << " { " << InstIt->second.first << ", ET_Instruction, ";
+ OS << CGI.Operands.OperandList.size() << ", {\n";
+ for (auto &Op : CGI.Operands.OperandList) {
+ OS << " " << Op.OperandType << ",\n";
+ }
+ OS << " }\n";
+ } else {
+ auto PrefixIt = OpcodeTable.find(I);
+        // If this non-zero byte has an opcode table of its own and we are
+        // emitting the root (prefix 0) table, the byte acts as a prefix.
+ if (PrefixIt != OpcodeTable.end() && I && !PrefixPair.first) {
+ OS << " { 0, ET_Prefix, 0, {}";
+ } else {
+ OS << " { 0, ET_Unused, 0, {}";
+ }
+ }
+ OS << " },\n";
+ }
+ OS << "};\n\n";
+ }
+ // Create a table of all extension tables:
+ OS << "struct { uint8_t Prefix; const WebAssemblyInstruction *Table; }\n";
+ OS << "PrefixTable[] = {\n";
+ for (auto &PrefixPair : OpcodeTable) {
+ if (PrefixPair.second.empty() || !PrefixPair.first)
+ continue;
+ OS << " { " << PrefixPair.first << ", InstructionTable"
+ << PrefixPair.first;
+ OS << " },\n";
+ }
+ OS << " { 0, nullptr }\n};\n\n";
+ OS << "} // End llvm namespace\n";
+}
+
+} // namespace llvm
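To make the layout of the emitted tables concrete, here is a small self-contained sketch of how a consumer could resolve a (prefix, opcode) byte pair against them. The table contents below are dummy placeholders and the resolve function is illustrative only; the actual WebAssembly disassembler that consumes the generated tables lives elsewhere and may differ.

// Illustrative consumer of the generated table layout; dummy data only.
#include <cstddef>
#include <cstdint>

enum EntryType : uint8_t { ET_Unused, ET_Prefix, ET_Instruction };

struct WebAssemblyInstruction {
  uint16_t Opcode;     // Index into the target's numbered instruction list.
  EntryType ET;
  uint8_t NumOperands;
  uint8_t Operands[4];
};

// Dummy stand-ins for the generated InstructionTable0 / InstructionTable254.
static const WebAssemblyInstruction InstructionTable0[256] = {};
static const WebAssemblyInstruction InstructionTable254[256] = {};

static const struct {
  uint8_t Prefix;
  const WebAssemblyInstruction *Table;
} PrefixTable[] = {{254, InstructionTable254}, {0, nullptr}};

// Given a prefix byte (0 for none) and an opcode byte, pick the per-prefix
// table via PrefixTable and accept only ET_Instruction entries.
const WebAssemblyInstruction *resolve(uint8_t Prefix, uint8_t Opc) {
  const WebAssemblyInstruction *Table = InstructionTable0;
  if (Prefix != 0) {
    Table = nullptr;
    for (size_t I = 0; PrefixTable[I].Table; ++I)
      if (PrefixTable[I].Prefix == Prefix)
        Table = PrefixTable[I].Table;
    if (!Table)
      return nullptr;
  }
  const WebAssemblyInstruction &Entry = Table[Opc];
  return Entry.ET == ET_Instruction ? &Entry : nullptr;
}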
diff --git a/contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h b/contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h
new file mode 100644
index 000000000000..91f820f120a2
--- /dev/null
+++ b/contrib/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h
@@ -0,0 +1,30 @@
+//===- WebAssemblyDisassemblerEmitter.h - Disassembler tables ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the WebAssembly Disassembler Emitter.
+// It contains the interface of the disassembler tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_WEBASSEMBLYDISASSEMBLEREMITTER_H
+#define LLVM_UTILS_TABLEGEN_WEBASSEMBLYDISASSEMBLEREMITTER_H
+
+#include "CodeGenInstruction.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+void emitWebAssemblyDisassemblerTables(
+ raw_ostream &OS,
+ const ArrayRef<const CodeGenInstruction *> &NumberedInstructions);
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/utils/TableGen/X86DisassemblerShared.h b/contrib/llvm/utils/TableGen/X86DisassemblerShared.h
index e5889e92415d..220765f72410 100644
--- a/contrib/llvm/utils/TableGen/X86DisassemblerShared.h
+++ b/contrib/llvm/utils/TableGen/X86DisassemblerShared.h
@@ -13,7 +13,7 @@
#include <cstring>
#include <string>
-#include "../../lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h"
+#include "llvm/Support/X86DisassemblerDecoderCommon.h"
struct InstructionSpecifier {
llvm::X86Disassembler::OperandSpecifier
@@ -49,6 +49,10 @@ struct OpcodeDecision {
/// entries in this table, rather than 2^(ATTR_max).
struct ContextDecision {
OpcodeDecision opcodeDecisions[llvm::X86Disassembler::IC_max];
+
+ ContextDecision() {
+ memset(opcodeDecisions, 0, sizeof(opcodeDecisions));
+ }
};
#endif
diff --git a/contrib/llvm/utils/TableGen/X86DisassemblerTables.cpp b/contrib/llvm/utils/TableGen/X86DisassemblerTables.cpp
index fce41f7a2cc2..2b5cc1279605 100644
--- a/contrib/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/contrib/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -112,6 +112,10 @@ static inline bool inheritsFrom(InstructionContext child,
return inheritsFrom(child, IC_64BIT_XD_OPSIZE);
case IC_XS_OPSIZE:
return inheritsFrom(child, IC_64BIT_XS_OPSIZE);
+ case IC_XD_ADSIZE:
+ return inheritsFrom(child, IC_64BIT_XD_ADSIZE);
+ case IC_XS_ADSIZE:
+ return inheritsFrom(child, IC_64BIT_XS_ADSIZE);
case IC_64BIT_REXW:
return((noPrefix && inheritsFrom(child, IC_64BIT_REXW_XS, noPrefix)) ||
(noPrefix && inheritsFrom(child, IC_64BIT_REXW_XD, noPrefix)) ||
@@ -122,12 +126,17 @@ static inline bool inheritsFrom(InstructionContext child,
(!AdSize64 && inheritsFrom(child, IC_64BIT_OPSIZE_ADSIZE)) ||
(!AdSize64 && inheritsFrom(child, IC_64BIT_REXW_ADSIZE));
case IC_64BIT_XD:
- return(inheritsFrom(child, IC_64BIT_REXW_XD));
+ return(inheritsFrom(child, IC_64BIT_REXW_XD) ||
+ (!AdSize64 && inheritsFrom(child, IC_64BIT_XD_ADSIZE)));
case IC_64BIT_XS:
- return(inheritsFrom(child, IC_64BIT_REXW_XS));
+ return(inheritsFrom(child, IC_64BIT_REXW_XS) ||
+ (!AdSize64 && inheritsFrom(child, IC_64BIT_XS_ADSIZE)));
case IC_64BIT_XD_OPSIZE:
case IC_64BIT_XS_OPSIZE:
return false;
+ case IC_64BIT_XD_ADSIZE:
+ case IC_64BIT_XS_ADSIZE:
+ return false;
case IC_64BIT_REXW_XD:
case IC_64BIT_REXW_XS:
case IC_64BIT_REXW_OPSIZE:
@@ -642,21 +651,13 @@ static const char* stringForDecisionType(ModRMDecisionType dt) {
}
DisassemblerTables::DisassemblerTables() {
- unsigned i;
-
- for (i = 0; i < array_lengthof(Tables); i++) {
- Tables[i] = new ContextDecision;
- memset(Tables[i], 0, sizeof(ContextDecision));
- }
+ for (unsigned i = 0; i < array_lengthof(Tables); i++)
+ Tables[i] = llvm::make_unique<ContextDecision>();
HasConflicts = false;
}
DisassemblerTables::~DisassemblerTables() {
- unsigned i;
-
- for (i = 0; i < array_lengthof(Tables); i++)
- delete Tables[i];
}
void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
@@ -961,8 +962,12 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
o << "IC_64BIT_REXW_ADSIZE";
else if ((index & ATTR_64BIT) && (index & ATTR_XD) && (index & ATTR_OPSIZE))
o << "IC_64BIT_XD_OPSIZE";
+ else if ((index & ATTR_64BIT) && (index & ATTR_XD) && (index & ATTR_ADSIZE))
+ o << "IC_64BIT_XD_ADSIZE";
else if ((index & ATTR_64BIT) && (index & ATTR_XS) && (index & ATTR_OPSIZE))
o << "IC_64BIT_XS_OPSIZE";
+ else if ((index & ATTR_64BIT) && (index & ATTR_XS) && (index & ATTR_ADSIZE))
+ o << "IC_64BIT_XS_ADSIZE";
else if ((index & ATTR_64BIT) && (index & ATTR_XS))
o << "IC_64BIT_XS";
else if ((index & ATTR_64BIT) && (index & ATTR_XD))
@@ -982,6 +987,10 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
o << "IC_XS_OPSIZE";
else if ((index & ATTR_XD) && (index & ATTR_OPSIZE))
o << "IC_XD_OPSIZE";
+ else if ((index & ATTR_XS) && (index & ATTR_ADSIZE))
+ o << "IC_XS_ADSIZE";
+ else if ((index & ATTR_XD) && (index & ATTR_ADSIZE))
+ o << "IC_XD_ADSIZE";
else if (index & ATTR_XS)
o << "IC_XS";
else if (index & ATTR_XD)
@@ -1019,6 +1028,7 @@ void DisassemblerTables::emitContextDecisions(raw_ostream &o1, raw_ostream &o2,
emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[4], XOP8_MAP_STR);
emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[5], XOP9_MAP_STR);
emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[6], XOPA_MAP_STR);
+ emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[7], THREEDNOW_MAP_STR);
}
void DisassemblerTables::emit(raw_ostream &o) const {
@@ -1075,14 +1085,9 @@ void DisassemblerTables::setTableFields(ModRMDecision &decision,
if(previousInfo.name == "NOOP" && (newInfo.name == "XCHG16ar" ||
newInfo.name == "XCHG32ar" ||
- newInfo.name == "XCHG32ar64" ||
newInfo.name == "XCHG64ar"))
continue; // special case for XCHG*ar and NOOP
- if (previousInfo.name == "DATA16_PREFIX" &&
- newInfo.name == "DATA32_PREFIX")
- continue; // special case for data16 and data32
-
if (outranks(previousInfo.insnContext, newInfo.insnContext))
continue;
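
The emitContextTable() chain above maps an attribute bitmask to a context name, and the new ADSIZE cases have to be tested before the bare XS/XD fallbacks so the more specific context wins. A minimal sketch of that ordering, with placeholder flag values (the real ATTR_* constants live in the X86 disassembler headers):

#include <cstdint>
#include <cstdio>

enum : uint32_t { ATTR_XS = 1, ATTR_XD = 2, ATTR_ADSIZE = 4 };

static const char *contextFor(uint32_t Index) {
  if ((Index & ATTR_XS) && (Index & ATTR_ADSIZE))
    return "IC_XS_ADSIZE";          // specific combinations first
  if ((Index & ATTR_XD) && (Index & ATTR_ADSIZE))
    return "IC_XD_ADSIZE";
  if (Index & ATTR_XS)
    return "IC_XS";                 // general fallbacks last
  if (Index & ATTR_XD)
    return "IC_XD";
  return "IC";
}

int main() {
  std::printf("%s\n", contextFor(ATTR_XS | ATTR_ADSIZE)); // IC_XS_ADSIZE
}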
diff --git a/contrib/llvm/utils/TableGen/X86DisassemblerTables.h b/contrib/llvm/utils/TableGen/X86DisassemblerTables.h
index 552bbe95f7cd..b0ea9c2e8625 100644
--- a/contrib/llvm/utils/TableGen/X86DisassemblerTables.h
+++ b/contrib/llvm/utils/TableGen/X86DisassemblerTables.h
@@ -41,7 +41,8 @@ private:
/// [4] XOP8 map opcode
/// [5] XOP9 map opcode
/// [6] XOPA map opcode
- ContextDecision* Tables[7];
+ /// [7] 3dnow map opcode
+ std::unique_ptr<ContextDecision> Tables[8];
// Table of ModRM encodings.
typedef std::map<std::vector<unsigned>, unsigned> ModRMMapTy;
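
With Tables now an array of unique_ptr, the explicit delete loop in ~DisassemblerTables() goes away. A minimal sketch of that ownership shape; the enum is only one illustrative way to keep the [0]..[7] map indices readable, the real code uses literal indices that match the comment block above:

#include <memory>

struct ContextDecisionSketch { int Dummy[4]; ContextDecisionSketch() : Dummy{} {} };

enum MapIndex { ONEBYTE, TWOBYTE, THREEBYTE_38, THREEBYTE_3A,
                XOP8_MAP, XOP9_MAP, XOPA_MAP, THREEDNOW_MAP, NUM_MAPS };

struct TablesSketch {
  std::unique_ptr<ContextDecisionSketch> Tables[NUM_MAPS];
  TablesSketch() {
    for (auto &T : Tables)
      T = std::make_unique<ContextDecisionSketch>();
  }
  // No destructor needed: each unique_ptr releases its table automatically.
};

int main() { TablesSketch S; return S.Tables[THREEDNOW_MAP]->Dummy[0]; }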
diff --git a/contrib/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/contrib/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
index 05f30facd547..d5dc10ecad25 100644
--- a/contrib/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
@@ -12,7 +12,6 @@
///
//===----------------------------------------------------------------------===//
-#include "CodeGenDAGPatterns.h"
#include "CodeGenTarget.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/TableGenBackend.h"
@@ -22,6 +21,7 @@ using namespace llvm;
namespace {
class X86EVEX2VEXTablesEmitter {
+ RecordKeeper &Records;
CodeGenTarget Target;
// Hold all non-masked & non-broadcasted EVEX encoded instructions
@@ -36,15 +36,8 @@ class X86EVEX2VEXTablesEmitter {
std::vector<Entry> EVEX2VEX128;
std::vector<Entry> EVEX2VEX256;
- // Represents a manually added entry to the tables
- struct ManualEntry {
- const char *EVEXInstStr;
- const char *VEXInstStr;
- bool Is128Bit;
- };
-
public:
- X86EVEX2VEXTablesEmitter(RecordKeeper &R) : Target(R) {}
+ X86EVEX2VEXTablesEmitter(RecordKeeper &R) : Records(R), Target(R) {}
// run - Output X86 EVEX2VEX tables.
void run(raw_ostream &OS);
@@ -53,36 +46,11 @@ private:
// Prints the given table as a C++ array of type
// X86EvexToVexCompressTableEntry
void printTable(const std::vector<Entry> &Table, raw_ostream &OS);
-
- bool inExceptionList(const CodeGenInstruction *Inst) {
- // List of EVEX instructions that match VEX instructions by the encoding
- // but do not perform the same operation.
- static constexpr const char *ExceptionList[] = {
- "VCVTQQ2PD",
- "VCVTQQ2PS",
- "VPMAXSQ",
- "VPMAXUQ",
- "VPMINSQ",
- "VPMINUQ",
- "VPMULLQ",
- "VPSRAQ",
- "VDBPSADBW",
- "VRNDSCALE",
- "VSCALEFPS"
- };
- // Instruction's name starts with one of the entries in the exception list
- for (StringRef InstStr : ExceptionList) {
- if (Inst->TheDef->getName().startswith(InstStr))
- return true;
- }
- return false;
- }
-
};
void X86EVEX2VEXTablesEmitter::printTable(const std::vector<Entry> &Table,
raw_ostream &OS) {
- std::string Size = (Table == EVEX2VEX128) ? "128" : "256";
+ StringRef Size = (Table == EVEX2VEX128) ? "128" : "256";
OS << "// X86 EVEX encoded instructions that have a VEX " << Size
<< " encoding\n"
@@ -97,83 +65,6 @@ void X86EVEX2VEXTablesEmitter::printTable(const std::vector<Entry> &Table,
<< ", X86::" << Pair.second->TheDef->getName() << " },\n";
}
- // Some VEX instructions were duplicated to multiple EVEX versions due the
- // introduction of mask variants, and thus some of the EVEX versions have
- // different encoding than the VEX instruction. In order to maximize the
- // compression we add these entries manually.
- static constexpr ManualEntry ManuallyAddedEntries[] = {
- // EVEX-Inst VEX-Inst Is128-bit
- {"VMOVDQU8Z128mr", "VMOVDQUmr", true},
- {"VMOVDQU8Z128rm", "VMOVDQUrm", true},
- {"VMOVDQU8Z128rr", "VMOVDQUrr", true},
- {"VMOVDQU8Z128rr_REV", "VMOVDQUrr_REV", true},
- {"VMOVDQU16Z128mr", "VMOVDQUmr", true},
- {"VMOVDQU16Z128rm", "VMOVDQUrm", true},
- {"VMOVDQU16Z128rr", "VMOVDQUrr", true},
- {"VMOVDQU16Z128rr_REV", "VMOVDQUrr_REV", true},
- {"VMOVDQU8Z256mr", "VMOVDQUYmr", false},
- {"VMOVDQU8Z256rm", "VMOVDQUYrm", false},
- {"VMOVDQU8Z256rr", "VMOVDQUYrr", false},
- {"VMOVDQU8Z256rr_REV", "VMOVDQUYrr_REV", false},
- {"VMOVDQU16Z256mr", "VMOVDQUYmr", false},
- {"VMOVDQU16Z256rm", "VMOVDQUYrm", false},
- {"VMOVDQU16Z256rr", "VMOVDQUYrr", false},
- {"VMOVDQU16Z256rr_REV", "VMOVDQUYrr_REV", false},
-
- {"VPERMILPDZ128mi", "VPERMILPDmi", true},
- {"VPERMILPDZ128ri", "VPERMILPDri", true},
- {"VPERMILPDZ128rm", "VPERMILPDrm", true},
- {"VPERMILPDZ128rr", "VPERMILPDrr", true},
- {"VPERMILPDZ256mi", "VPERMILPDYmi", false},
- {"VPERMILPDZ256ri", "VPERMILPDYri", false},
- {"VPERMILPDZ256rm", "VPERMILPDYrm", false},
- {"VPERMILPDZ256rr", "VPERMILPDYrr", false},
-
- {"VPBROADCASTQZ128m", "VPBROADCASTQrm", true},
- {"VPBROADCASTQZ128r", "VPBROADCASTQrr", true},
- {"VPBROADCASTQZ256m", "VPBROADCASTQYrm", false},
- {"VPBROADCASTQZ256r", "VPBROADCASTQYrr", false},
-
- {"VBROADCASTSDZ256m", "VBROADCASTSDYrm", false},
- {"VBROADCASTSDZ256r", "VBROADCASTSDYrr", false},
-
- {"VBROADCASTF64X2Z128rm", "VBROADCASTF128", false},
- {"VBROADCASTI64X2Z128rm", "VBROADCASTI128", false},
-
- {"VEXTRACTF64x2Z256mr", "VEXTRACTF128mr", false},
- {"VEXTRACTF64x2Z256rr", "VEXTRACTF128rr", false},
- {"VEXTRACTI64x2Z256mr", "VEXTRACTI128mr", false},
- {"VEXTRACTI64x2Z256rr", "VEXTRACTI128rr", false},
-
- {"VINSERTF64x2Z256rm", "VINSERTF128rm", false},
- {"VINSERTF64x2Z256rr", "VINSERTF128rr", false},
- {"VINSERTI64x2Z256rm", "VINSERTI128rm", false},
- {"VINSERTI64x2Z256rr", "VINSERTI128rr", false},
-
- // These will require some custom adjustment in the conversion pass.
- {"VALIGNDZ128rri", "VPALIGNRrri", true},
- {"VALIGNQZ128rri", "VPALIGNRrri", true},
- {"VALIGNDZ128rmi", "VPALIGNRrmi", true},
- {"VALIGNQZ128rmi", "VPALIGNRrmi", true},
- {"VSHUFF32X4Z256rmi", "VPERM2F128rm", false},
- {"VSHUFF32X4Z256rri", "VPERM2F128rr", false},
- {"VSHUFF64X2Z256rmi", "VPERM2F128rm", false},
- {"VSHUFF64X2Z256rri", "VPERM2F128rr", false},
- {"VSHUFI32X4Z256rmi", "VPERM2I128rm", false},
- {"VSHUFI32X4Z256rri", "VPERM2I128rr", false},
- {"VSHUFI64X2Z256rmi", "VPERM2I128rm", false},
- {"VSHUFI64X2Z256rri", "VPERM2I128rr", false},
- };
-
- // Print the manually added entries
- for (const ManualEntry &Entry : ManuallyAddedEntries) {
- if ((Table == EVEX2VEX128 && Entry.Is128Bit) ||
- (Table == EVEX2VEX256 && !Entry.Is128Bit)) {
- OS << " { X86::" << Entry.EVEXInstStr << ", X86::" << Entry.VEXInstStr
- << " },\n";
- }
- }
-
OS << "};\n\n";
}
@@ -210,31 +101,34 @@ static inline uint64_t getValueFromBitsInit(const BitsInit *B) {
// Function object - Operator() returns true if the given VEX instruction
// matches the EVEX instruction of this object.
class IsMatch {
- const CodeGenInstruction *Inst;
+ const CodeGenInstruction *EVEXInst;
public:
- IsMatch(const CodeGenInstruction *Inst) : Inst(Inst) {}
+ IsMatch(const CodeGenInstruction *EVEXInst) : EVEXInst(EVEXInst) {}
- bool operator()(const CodeGenInstruction *Inst2) {
- Record *Rec1 = Inst->TheDef;
- Record *Rec2 = Inst2->TheDef;
- uint64_t Rec1WVEX =
- getValueFromBitsInit(Rec1->getValueAsBitsInit("VEX_WPrefix"));
- uint64_t Rec2WVEX =
- getValueFromBitsInit(Rec2->getValueAsBitsInit("VEX_WPrefix"));
+ bool operator()(const CodeGenInstruction *VEXInst) {
+ Record *RecE = EVEXInst->TheDef;
+ Record *RecV = VEXInst->TheDef;
+ uint64_t EVEX_W =
+ getValueFromBitsInit(RecE->getValueAsBitsInit("VEX_WPrefix"));
+ uint64_t VEX_W =
+ getValueFromBitsInit(RecV->getValueAsBitsInit("VEX_WPrefix"));
- if (Rec2->getValueAsDef("OpEnc")->getName().str() != "EncVEX" ||
+ if (RecV->getValueAsDef("OpEnc")->getName().str() != "EncVEX" ||
// VEX/EVEX fields
- Rec2->getValueAsDef("OpPrefix") != Rec1->getValueAsDef("OpPrefix") ||
- Rec2->getValueAsDef("OpMap") != Rec1->getValueAsDef("OpMap") ||
- Rec2->getValueAsBit("hasVEX_4V") != Rec1->getValueAsBit("hasVEX_4V") ||
- !equalBitsInits(Rec2->getValueAsBitsInit("EVEX_LL"),
- Rec1->getValueAsBitsInit("EVEX_LL")) ||
- (Rec1WVEX != 2 && Rec2WVEX != 2 && Rec1WVEX != Rec2WVEX) ||
+ RecV->getValueAsDef("OpPrefix") != RecE->getValueAsDef("OpPrefix") ||
+ RecV->getValueAsDef("OpMap") != RecE->getValueAsDef("OpMap") ||
+ RecV->getValueAsBit("hasVEX_4V") != RecE->getValueAsBit("hasVEX_4V") ||
+ !equalBitsInits(RecV->getValueAsBitsInit("EVEX_LL"),
+ RecE->getValueAsBitsInit("EVEX_LL")) ||
+ // Match is allowed if either is VEX_WIG, or they match, or EVEX
+ // is VEX_W1X and VEX is VEX_W0.
+ (!(EVEX_W == 2 || VEX_W == 2 || EVEX_W == VEX_W ||
+ (EVEX_W == 3 && VEX_W == 0))) ||
// Instruction's format
- Rec2->getValueAsDef("Form") != Rec1->getValueAsDef("Form") ||
- Rec2->getValueAsBit("isAsmParserOnly") !=
- Rec1->getValueAsBit("isAsmParserOnly"))
+ RecV->getValueAsDef("Form") != RecE->getValueAsDef("Form") ||
+ RecV->getValueAsBit("isAsmParserOnly") !=
+ RecE->getValueAsBit("isAsmParserOnly"))
return false;
// This is needed for instructions with intrinsic version (_Int).
@@ -243,9 +137,9 @@ public:
// Also for instructions that their EVEX version was upgraded to work with
// k-registers. For example VPCMPEQBrm (xmm output register) and
// VPCMPEQBZ128rm (k register output register).
- for (unsigned i = 0; i < Inst->Operands.size(); i++) {
- Record *OpRec1 = Inst->Operands[i].Rec;
- Record *OpRec2 = Inst2->Operands[i].Rec;
+ for (unsigned i = 0, e = EVEXInst->Operands.size(); i < e; i++) {
+ Record *OpRec1 = EVEXInst->Operands[i].Rec;
+ Record *OpRec2 = VEXInst->Operands[i].Rec;
if (OpRec1 == OpRec2)
continue;
@@ -315,7 +209,7 @@ void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) {
!Inst->TheDef->getValueAsBit("hasEVEX_B") &&
getValueFromBitsInit(Inst->TheDef->
getValueAsBitsInit("EVEX_LL")) != 2 &&
- !inExceptionList(Inst))
+ !Inst->TheDef->getValueAsBit("notEVEX2VEXConvertible"))
EVEXInsts.push_back(Inst);
}
@@ -324,22 +218,34 @@ void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) {
getValueAsBitsInit("Opcode"));
// For each EVEX instruction look for a VEX match in the appropriate vector
// (instructions with the same opcode) using function object IsMatch.
- auto Match = llvm::find_if(VEXInsts[Opcode], IsMatch(EVEXInst));
- if (Match != VEXInsts[Opcode].end()) {
- const CodeGenInstruction *VEXInst = *Match;
-
- // In case a match is found add new entry to the appropriate table
- switch (getValueFromBitsInit(
- EVEXInst->TheDef->getValueAsBitsInit("EVEX_LL"))) {
- case 0:
- EVEX2VEX128.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,0}
- break;
- case 1:
- EVEX2VEX256.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,1}
- break;
- default:
- llvm_unreachable("Instruction's size not fit for the mapping!");
- }
+ // Allow EVEX2VEXOverride to explicitly specify a match.
+ const CodeGenInstruction *VEXInst = nullptr;
+ if (!EVEXInst->TheDef->isValueUnset("EVEX2VEXOverride")) {
+ StringRef AltInstStr =
+ EVEXInst->TheDef->getValueAsString("EVEX2VEXOverride");
+ Record *AltInstRec = Records.getDef(AltInstStr);
+ assert(AltInstRec && "EVEX2VEXOverride instruction not found!");
+ VEXInst = &Target.getInstruction(AltInstRec);
+ } else {
+ auto Match = llvm::find_if(VEXInsts[Opcode], IsMatch(EVEXInst));
+ if (Match != VEXInsts[Opcode].end())
+ VEXInst = *Match;
+ }
+
+ if (!VEXInst)
+ continue;
+
+ // In case a match is found add new entry to the appropriate table
+ switch (getValueFromBitsInit(
+ EVEXInst->TheDef->getValueAsBitsInit("EVEX_LL"))) {
+ case 0:
+ EVEX2VEX128.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,0}
+ break;
+ case 1:
+ EVEX2VEX256.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,1}
+ break;
+ default:
+ llvm_unreachable("Instruction's size not fit for the mapping!");
}
}
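
The rewritten loop above checks for an explicit EVEX2VEXOverride on the record first and only falls back to the IsMatch predicate search when no override is given. A minimal sketch of that lookup order; Inst, Overrides and the matching criterion here are placeholders, the real code reads the override string from the TableGen record and uses llvm::find_if with the IsMatch functor:

#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Inst { std::string Name; unsigned W; };

static const Inst *findMatch(const Inst &EVEX,
                             const std::vector<Inst> &Candidates,
                             const std::map<std::string, std::string> &Overrides) {
  auto Ov = Overrides.find(EVEX.Name);
  if (Ov != Overrides.end()) {
    // Explicit override: look the named instruction up directly.
    for (const Inst &C : Candidates)
      if (C.Name == Ov->second)
        return &C;
    return nullptr;
  }
  // Otherwise fall back to the predicate search (IsMatch in the real code).
  auto It = std::find_if(Candidates.begin(), Candidates.end(),
                         [&](const Inst &C) { return C.W == EVEX.W; });
  return It == Candidates.end() ? nullptr : &*It;
}

int main() {
  std::vector<Inst> VEX = {{"VPALIGNRrri", 0}, {"VADDPSrr", 1}};
  std::map<std::string, std::string> Ov = {{"VALIGNDZ128rri", "VPALIGNRrri"}};
  const Inst *M = findMatch({"VALIGNDZ128rri", 0}, VEX, Ov);
  std::printf("%s\n", M ? M->Name.c_str() : "none");
}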
diff --git a/contrib/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/contrib/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index ff1afa89efc8..1ea668643575 100644
--- a/contrib/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/contrib/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -47,7 +47,9 @@ const char *ExplicitAlign[] = {"MOVDQA", "MOVAPS", "MOVAPD", "MOVNTPS",
"MOVNTPD", "MOVNTDQ", "MOVNTDQA"};
// List of instructions NOT requiring explicit memory alignment.
-const char *ExplicitUnalign[] = {"MOVDQU", "MOVUPS", "MOVUPD"};
+const char *ExplicitUnalign[] = {"MOVDQU", "MOVUPS", "MOVUPD",
+ "PCMPESTRM", "PCMPESTRI",
+ "PCMPISTRM", "PCMPISTRI" };
// For manually mapping instructions that do not match by their encoding.
const ManualMapEntry ManualMapSet[] = {
@@ -63,9 +65,9 @@ const ManualMapEntry ManualMapSet[] = {
{ "ADD16rr_DB", "ADD16rm", NO_UNFOLD },
{ "ADD32rr_DB", "ADD32rm", NO_UNFOLD },
{ "ADD64rr_DB", "ADD64rm", NO_UNFOLD },
- { "PUSH16r", "PUSH16rmm", NO_UNFOLD },
- { "PUSH32r", "PUSH32rmm", NO_UNFOLD },
- { "PUSH64r", "PUSH64rmm", NO_UNFOLD },
+ { "PUSH16r", "PUSH16rmm", UNFOLD },
+ { "PUSH32r", "PUSH32rmm", UNFOLD },
+ { "PUSH64r", "PUSH64rmm", UNFOLD },
{ "TAILJMPr", "TAILJMPm", UNFOLD },
{ "TAILJMPr64", "TAILJMPm64", UNFOLD },
{ "TAILJMPr64_REX", "TAILJMPm64_REX", UNFOLD },
@@ -106,8 +108,8 @@ class X86FoldTablesEmitter {
friend raw_ostream &operator<<(raw_ostream &OS,
const X86FoldTableEntry &E) {
- OS << "{ X86::" << E.RegInst->TheDef->getName().str()
- << ", X86::" << E.MemInst->TheDef->getName().str() << ", ";
+ OS << "{ X86::" << E.RegInst->TheDef->getName()
+ << ", X86::" << E.MemInst->TheDef->getName() << ", ";
if (E.IsLoad)
OS << "TB_FOLDED_LOAD | ";
@@ -157,7 +159,7 @@ private:
// Print the given table as a static const C++ array of type
// X86MemoryFoldTableEntry.
- void printTable(const FoldTable &Table, std::string TableName,
+ void printTable(const FoldTable &Table, StringRef TableName,
raw_ostream &OS) {
OS << "static const X86MemoryFoldTableEntry MemoryFold" << TableName
<< "[] = {\n";
@@ -251,16 +253,6 @@ getMemOperandSize(const Record *MemRec, const bool IntrinsicSensitive = false) {
llvm_unreachable("Memory operand's size not known!");
}
-// Returns true if the record's list of defs includes the given def.
-static inline bool hasDefInList(const Record *Rec, const StringRef List,
- const StringRef Def) {
- if (!Rec->isValueUnset(List)) {
- return any_of(*(Rec->getValueAsListInit(List)),
- [Def](const Init *I) { return I->getAsString() == Def; });
- }
- return false;
-}
-
// Return true if the instruction defined as a register flavor.
static inline bool hasRegisterFormat(const Record *Inst) {
const BitsInit *FormBits = Inst->getValueAsBitsInit("FormBits");
@@ -335,20 +327,24 @@ public:
MemRec->getValueAsDef("OpPrefix") ||
RegRec->getValueAsDef("OpMap") != MemRec->getValueAsDef("OpMap") ||
RegRec->getValueAsDef("OpSize") != MemRec->getValueAsDef("OpSize") ||
+ RegRec->getValueAsDef("AdSize") != MemRec->getValueAsDef("AdSize") ||
RegRec->getValueAsBit("hasVEX_4V") !=
MemRec->getValueAsBit("hasVEX_4V") ||
RegRec->getValueAsBit("hasEVEX_K") !=
MemRec->getValueAsBit("hasEVEX_K") ||
RegRec->getValueAsBit("hasEVEX_Z") !=
MemRec->getValueAsBit("hasEVEX_Z") ||
- RegRec->getValueAsBit("hasEVEX_B") !=
- MemRec->getValueAsBit("hasEVEX_B") ||
+ // EVEX_B means different things for memory and register forms.
+ RegRec->getValueAsBit("hasEVEX_B") != 0 ||
+ MemRec->getValueAsBit("hasEVEX_B") != 0 ||
RegRec->getValueAsBit("hasEVEX_RC") !=
MemRec->getValueAsBit("hasEVEX_RC") ||
RegRec->getValueAsBit("hasREX_WPrefix") !=
MemRec->getValueAsBit("hasREX_WPrefix") ||
RegRec->getValueAsBit("hasLockPrefix") !=
MemRec->getValueAsBit("hasLockPrefix") ||
+ RegRec->getValueAsBit("hasNoTrackPrefix") !=
+ MemRec->getValueAsBit("hasNoTrackPrefix") ||
!equalBitsInits(RegRec->getValueAsBitsInit("EVEX_LL"),
MemRec->getValueAsBitsInit("EVEX_LL")) ||
!equalBitsInits(RegRec->getValueAsBitsInit("VEX_WPrefix"),
@@ -511,10 +507,8 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInstr,
unsigned MemInSize = MemRec->getValueAsDag("InOperandList")->getNumArgs();
unsigned RegInSize = RegRec->getValueAsDag("InOperandList")->getNumArgs();
- // Instructions which have the WriteRMW value (Read-Modify-Write) should be
- // added to Table2Addr.
- if (hasDefInList(MemRec, "SchedRW", "WriteRMW") && MemOutSize != RegOutSize &&
- MemInSize == RegInSize) {
+ // Instructions which Read-Modify-Write should be added to Table2Addr.
+ if (MemOutSize != RegOutSize && MemInSize == RegInSize) {
addEntryWithFlags(Table2Addr, RegInstr, MemInstr, S, 0);
return;
}
@@ -548,7 +542,7 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInstr,
}
} else if (MemInSize == RegInSize + 1 && MemOutSize + 1 == RegOutSize) {
// Store-Folding cases.
- // If the memory form instruction performs performs a store, the *output*
+ // If the memory form instruction performs a store, the *output*
// register of the register form instructions disappear and instead a
// memory *input* operand appears in the memory form instruction.
// For example:
@@ -556,7 +550,8 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInstr,
// MOVAPSmr => (outs), (ins f128mem:$dst, VR128:$src)
Record *RegOpRec = RegInstr->Operands[RegOutSize - 1].Rec;
Record *MemOpRec = MemInstr->Operands[RegOutSize - 1].Rec;
- if (isRegisterOperand(RegOpRec) && isMemoryOperand(MemOpRec))
+ if (isRegisterOperand(RegOpRec) && isMemoryOperand(MemOpRec) &&
+ getRegOperandSize(RegOpRec) == getMemOperandSize(MemOpRec))
addEntryWithFlags(Table0, RegInstr, MemInstr, S, 0);
}
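
The store-folding change above tightens the check: the register operand of the register form must correspond to a memory operand of the memory form and the two widths must match before an entry is added to Table0. A minimal sketch of the shape of that predicate, with toy operand records standing in for the TableGen queries:

#include <cstdio>

struct Operand { bool IsReg; bool IsMem; unsigned Bits; };

static bool canFoldStore(const Operand &RegOp, const Operand &MemOp) {
  return RegOp.IsReg && MemOp.IsMem && RegOp.Bits == MemOp.Bits;
}

int main() {
  Operand XmmSrc = {true, false, 128};   // e.g. VR128:$src in MOVAPSrr
  Operand Mem128 = {false, true, 128};   // e.g. f128mem:$dst in MOVAPSmr
  Operand Mem256 = {false, true, 256};   // mismatched width: no fold entry
  std::printf("%d %d\n", canFoldStore(XmmSrc, Mem128),
              canFoldStore(XmmSrc, Mem256));
}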
diff --git a/contrib/llvm/utils/TableGen/X86RecognizableInstr.cpp b/contrib/llvm/utils/TableGen/X86RecognizableInstr.cpp
index 9afdd7e09638..efd5c195d02b 100644
--- a/contrib/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/contrib/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -40,7 +40,7 @@ static uint8_t byteFromBitsInit(BitsInit &init) {
uint8_t ret = 0;
for (index = 0; index < width; index++) {
- if (static_cast<BitInit*>(init.getBit(index))->getValue())
+ if (cast<BitInit>(init.getBit(index))->getValue())
ret |= mask;
mask <<= 1;
@@ -80,19 +80,19 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
Form = byteFromRec(Rec, "FormBits");
Encoding = byteFromRec(Rec, "OpEncBits");
- OpSize = byteFromRec(Rec, "OpSizeBits");
- AdSize = byteFromRec(Rec, "AdSizeBits");
- HasREX_WPrefix = Rec->getValueAsBit("hasREX_WPrefix");
- HasVEX_4V = Rec->getValueAsBit("hasVEX_4V");
- VEX_WPrefix = byteFromRec(Rec,"VEX_WPrefix");
- IgnoresVEX_L = Rec->getValueAsBit("ignoresVEX_L");
- HasEVEX_L2Prefix = Rec->getValueAsBit("hasEVEX_L2");
- HasEVEX_K = Rec->getValueAsBit("hasEVEX_K");
- HasEVEX_KZ = Rec->getValueAsBit("hasEVEX_Z");
- HasEVEX_B = Rec->getValueAsBit("hasEVEX_B");
- IsCodeGenOnly = Rec->getValueAsBit("isCodeGenOnly");
- ForceDisassemble = Rec->getValueAsBit("ForceDisassemble");
- CD8_Scale = byteFromRec(Rec, "CD8_Scale");
+ OpSize = byteFromRec(Rec, "OpSizeBits");
+ AdSize = byteFromRec(Rec, "AdSizeBits");
+ HasREX_WPrefix = Rec->getValueAsBit("hasREX_WPrefix");
+ HasVEX_4V = Rec->getValueAsBit("hasVEX_4V");
+ VEX_WPrefix = byteFromRec(Rec,"VEX_WPrefix");
+ IgnoresVEX_L = Rec->getValueAsBit("ignoresVEX_L");
+ HasEVEX_L2Prefix = Rec->getValueAsBit("hasEVEX_L2");
+ HasEVEX_K = Rec->getValueAsBit("hasEVEX_K");
+ HasEVEX_KZ = Rec->getValueAsBit("hasEVEX_Z");
+ HasEVEX_B = Rec->getValueAsBit("hasEVEX_B");
+ IsCodeGenOnly = Rec->getValueAsBit("isCodeGenOnly");
+ ForceDisassemble = Rec->getValueAsBit("ForceDisassemble");
+ CD8_Scale = byteFromRec(Rec, "CD8_Scale");
Name = Rec->getName();
@@ -164,7 +164,8 @@ InstructionContext RecognizableInstr::insnContext() const {
llvm_unreachable("Don't support VEX.L if EVEX_L2 is enabled");
}
// VEX_L & VEX_W
- if (!EncodeRC && HasVEX_LPrefix && VEX_WPrefix == X86Local::VEX_W1) {
+ if (!EncodeRC && HasVEX_LPrefix && (VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X)) {
if (OpPrefix == X86Local::PD)
insnContext = EVEX_KB(IC_EVEX_L_W_OPSIZE);
else if (OpPrefix == X86Local::XS)
@@ -192,7 +193,8 @@ InstructionContext RecognizableInstr::insnContext() const {
llvm_unreachable("Invalid prefix");
}
} else if (!EncodeRC && HasEVEX_L2Prefix &&
- VEX_WPrefix == X86Local::VEX_W1) {
+ (VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X)) {
// EVEX_L2 & VEX_W
if (OpPrefix == X86Local::PD)
insnContext = EVEX_KB(IC_EVEX_L2_W_OPSIZE);
@@ -221,7 +223,8 @@ InstructionContext RecognizableInstr::insnContext() const {
llvm_unreachable("Invalid prefix");
}
}
- else if (VEX_WPrefix == X86Local::VEX_W1) {
+ else if (VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X) {
// VEX_W
if (OpPrefix == X86Local::PD)
insnContext = EVEX_KB(IC_EVEX_W_OPSIZE);
@@ -243,11 +246,16 @@ InstructionContext RecognizableInstr::insnContext() const {
insnContext = EVEX_KB(IC_EVEX_XD);
else if (OpPrefix == X86Local::XS)
insnContext = EVEX_KB(IC_EVEX_XS);
- else
+ else if (OpPrefix == X86Local::PS)
insnContext = EVEX_KB(IC_EVEX);
+ else {
+ errs() << "Instruction does not use a prefix: " << Name << "\n";
+ llvm_unreachable("Invalid prefix");
+ }
/// eof EVEX
} else if (Encoding == X86Local::VEX || Encoding == X86Local::XOP) {
- if (HasVEX_LPrefix && VEX_WPrefix == X86Local::VEX_W1) {
+ if (HasVEX_LPrefix && (VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X)) {
if (OpPrefix == X86Local::PD)
insnContext = IC_VEX_L_W_OPSIZE;
else if (OpPrefix == X86Local::XS)
@@ -262,7 +270,8 @@ InstructionContext RecognizableInstr::insnContext() const {
}
} else if (OpPrefix == X86Local::PD && HasVEX_LPrefix)
insnContext = IC_VEX_L_OPSIZE;
- else if (OpPrefix == X86Local::PD && VEX_WPrefix == X86Local::VEX_W1)
+ else if (OpPrefix == X86Local::PD && (VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X))
insnContext = IC_VEX_W_OPSIZE;
else if (OpPrefix == X86Local::PD)
insnContext = IC_VEX_OPSIZE;
@@ -270,11 +279,14 @@ InstructionContext RecognizableInstr::insnContext() const {
insnContext = IC_VEX_L_XS;
else if (HasVEX_LPrefix && OpPrefix == X86Local::XD)
insnContext = IC_VEX_L_XD;
- else if (VEX_WPrefix == X86Local::VEX_W1 && OpPrefix == X86Local::XS)
+ else if ((VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X) && OpPrefix == X86Local::XS)
insnContext = IC_VEX_W_XS;
- else if (VEX_WPrefix == X86Local::VEX_W1 && OpPrefix == X86Local::XD)
+ else if ((VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X) && OpPrefix == X86Local::XD)
insnContext = IC_VEX_W_XD;
- else if (VEX_WPrefix == X86Local::VEX_W1 && OpPrefix == X86Local::PS)
+ else if ((VEX_WPrefix == X86Local::VEX_W1 ||
+ VEX_WPrefix == X86Local::VEX_W1X) && OpPrefix == X86Local::PS)
insnContext = IC_VEX_W;
else if (HasVEX_LPrefix && OpPrefix == X86Local::PS)
insnContext = IC_VEX_L;
@@ -297,6 +309,8 @@ InstructionContext RecognizableInstr::insnContext() const {
insnContext = IC_64BIT_XD_OPSIZE;
else if (OpSize == X86Local::OpSize16 && OpPrefix == X86Local::XS)
insnContext = IC_64BIT_XS_OPSIZE;
+ else if (AdSize == X86Local::AdSize32 && OpPrefix == X86Local::PD)
+ insnContext = IC_64BIT_OPSIZE_ADSIZE;
else if (OpSize == X86Local::OpSize16 && AdSize == X86Local::AdSize32)
insnContext = IC_64BIT_OPSIZE_ADSIZE;
else if (OpSize == X86Local::OpSize16 || OpPrefix == X86Local::PD)
@@ -320,6 +334,12 @@ InstructionContext RecognizableInstr::insnContext() const {
insnContext = IC_XD_OPSIZE;
else if (OpSize == X86Local::OpSize16 && OpPrefix == X86Local::XS)
insnContext = IC_XS_OPSIZE;
+ else if (AdSize == X86Local::AdSize16 && OpPrefix == X86Local::XD)
+ insnContext = IC_XD_ADSIZE;
+ else if (AdSize == X86Local::AdSize16 && OpPrefix == X86Local::XS)
+ insnContext = IC_XS_ADSIZE;
+ else if (AdSize == X86Local::AdSize16 && OpPrefix == X86Local::PD)
+ insnContext = IC_OPSIZE_ADSIZE;
else if (OpSize == X86Local::OpSize16 && AdSize == X86Local::AdSize16)
insnContext = IC_OPSIZE_ADSIZE;
else if (OpSize == X86Local::OpSize16 || OpPrefix == X86Local::PD)
@@ -544,7 +564,6 @@ void RecognizableInstr::emitInstructionSpecifier() {
HANDLE_OPERAND(rmRegister)
HANDLE_OPTIONAL(immediate)
HANDLE_OPTIONAL(immediate) // above might be a register in 7:4
- HANDLE_OPTIONAL(immediate)
break;
case X86Local::MRMSrcReg4VOp3:
assert(numPhysicalOperands == 3 &&
@@ -663,41 +682,15 @@ void RecognizableInstr::emitInstructionSpecifier() {
HANDLE_OPERAND(immediate)
HANDLE_OPERAND(immediate)
break;
- case X86Local::MRM_F8:
- if (Opcode == 0xc6) {
- assert(numPhysicalOperands == 1 &&
- "Unexpected number of operands for X86Local::MRM_F8");
- HANDLE_OPERAND(immediate)
- } else if (Opcode == 0xc7) {
- assert(numPhysicalOperands == 1 &&
- "Unexpected number of operands for X86Local::MRM_F8");
- HANDLE_OPERAND(relocation)
- }
- break;
- case X86Local::MRM_C0: case X86Local::MRM_C1: case X86Local::MRM_C2:
- case X86Local::MRM_C3: case X86Local::MRM_C4: case X86Local::MRM_C8:
- case X86Local::MRM_C9: case X86Local::MRM_CA: case X86Local::MRM_CB:
- case X86Local::MRM_CF: case X86Local::MRM_D0: case X86Local::MRM_D1:
- case X86Local::MRM_D4: case X86Local::MRM_D5: case X86Local::MRM_D6:
- case X86Local::MRM_D7: case X86Local::MRM_D8: case X86Local::MRM_D9:
- case X86Local::MRM_DA: case X86Local::MRM_DB: case X86Local::MRM_DC:
- case X86Local::MRM_DD: case X86Local::MRM_DE: case X86Local::MRM_DF:
- case X86Local::MRM_E0: case X86Local::MRM_E1: case X86Local::MRM_E2:
- case X86Local::MRM_E3: case X86Local::MRM_E4: case X86Local::MRM_E5:
- case X86Local::MRM_E8: case X86Local::MRM_E9: case X86Local::MRM_EA:
- case X86Local::MRM_EB: case X86Local::MRM_EC: case X86Local::MRM_ED:
- case X86Local::MRM_EE: case X86Local::MRM_EF: case X86Local::MRM_F0:
- case X86Local::MRM_F1: case X86Local::MRM_F2: case X86Local::MRM_F3:
- case X86Local::MRM_F4: case X86Local::MRM_F5: case X86Local::MRM_F6:
- case X86Local::MRM_F7: case X86Local::MRM_F9: case X86Local::MRM_FA:
- case X86Local::MRM_FB: case X86Local::MRM_FC: case X86Local::MRM_FD:
- case X86Local::MRM_FE: case X86Local::MRM_FF:
- // Ignored.
+#define MAP(from, to) case X86Local::MRM_##from:
+ X86_INSTR_MRM_MAPPING
+#undef MAP
+ HANDLE_OPTIONAL(relocation)
break;
}
- #undef HANDLE_OPERAND
- #undef HANDLE_OPTIONAL
+#undef HANDLE_OPERAND
+#undef HANDLE_OPTIONAL
}
void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
@@ -707,77 +700,64 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
case X86Local::MRM_##from:
llvm::Optional<OpcodeType> opcodeType;
-
- ModRMFilter* filter = nullptr;
- uint8_t opcodeToSet = 0;
-
switch (OpMap) {
default: llvm_unreachable("Invalid map!");
- case X86Local::OB:
- case X86Local::TB:
- case X86Local::T8:
- case X86Local::TA:
- case X86Local::XOP8:
- case X86Local::XOP9:
- case X86Local::XOPA:
- switch (OpMap) {
- default: llvm_unreachable("Unexpected map!");
- case X86Local::OB: opcodeType = ONEBYTE; break;
- case X86Local::TB: opcodeType = TWOBYTE; break;
- case X86Local::T8: opcodeType = THREEBYTE_38; break;
- case X86Local::TA: opcodeType = THREEBYTE_3A; break;
- case X86Local::XOP8: opcodeType = XOP8_MAP; break;
- case X86Local::XOP9: opcodeType = XOP9_MAP; break;
- case X86Local::XOPA: opcodeType = XOPA_MAP; break;
- }
-
- switch (Form) {
- default: llvm_unreachable("Invalid form!");
- case X86Local::Pseudo: llvm_unreachable("Pseudo should not be emitted!");
- case X86Local::RawFrm:
- case X86Local::AddRegFrm:
- case X86Local::RawFrmMemOffs:
- case X86Local::RawFrmSrc:
- case X86Local::RawFrmDst:
- case X86Local::RawFrmDstSrc:
- case X86Local::RawFrmImm8:
- case X86Local::RawFrmImm16:
- filter = new DumbFilter();
- break;
- case X86Local::MRMDestReg:
- case X86Local::MRMSrcReg:
- case X86Local::MRMSrcReg4VOp3:
- case X86Local::MRMSrcRegOp4:
- case X86Local::MRMXr:
- filter = new ModFilter(true);
- break;
- case X86Local::MRMDestMem:
- case X86Local::MRMSrcMem:
- case X86Local::MRMSrcMem4VOp3:
- case X86Local::MRMSrcMemOp4:
- case X86Local::MRMXm:
- filter = new ModFilter(false);
- break;
- case X86Local::MRM0r: case X86Local::MRM1r:
- case X86Local::MRM2r: case X86Local::MRM3r:
- case X86Local::MRM4r: case X86Local::MRM5r:
- case X86Local::MRM6r: case X86Local::MRM7r:
- filter = new ExtendedFilter(true, Form - X86Local::MRM0r);
- break;
- case X86Local::MRM0m: case X86Local::MRM1m:
- case X86Local::MRM2m: case X86Local::MRM3m:
- case X86Local::MRM4m: case X86Local::MRM5m:
- case X86Local::MRM6m: case X86Local::MRM7m:
- filter = new ExtendedFilter(false, Form - X86Local::MRM0m);
- break;
- X86_INSTR_MRM_MAPPING
- filter = new ExactFilter(0xC0 + Form - X86Local::MRM_C0); \
- break;
- } // switch (Form)
+ case X86Local::OB: opcodeType = ONEBYTE; break;
+ case X86Local::TB: opcodeType = TWOBYTE; break;
+ case X86Local::T8: opcodeType = THREEBYTE_38; break;
+ case X86Local::TA: opcodeType = THREEBYTE_3A; break;
+ case X86Local::XOP8: opcodeType = XOP8_MAP; break;
+ case X86Local::XOP9: opcodeType = XOP9_MAP; break;
+ case X86Local::XOPA: opcodeType = XOPA_MAP; break;
+ case X86Local::ThreeDNow: opcodeType = THREEDNOW_MAP; break;
+ }
- opcodeToSet = Opcode;
+ std::unique_ptr<ModRMFilter> filter;
+ switch (Form) {
+ default: llvm_unreachable("Invalid form!");
+ case X86Local::Pseudo: llvm_unreachable("Pseudo should not be emitted!");
+ case X86Local::RawFrm:
+ case X86Local::AddRegFrm:
+ case X86Local::RawFrmMemOffs:
+ case X86Local::RawFrmSrc:
+ case X86Local::RawFrmDst:
+ case X86Local::RawFrmDstSrc:
+ case X86Local::RawFrmImm8:
+ case X86Local::RawFrmImm16:
+ filter = llvm::make_unique<DumbFilter>();
break;
- } // switch (OpMap)
+ case X86Local::MRMDestReg:
+ case X86Local::MRMSrcReg:
+ case X86Local::MRMSrcReg4VOp3:
+ case X86Local::MRMSrcRegOp4:
+ case X86Local::MRMXr:
+ filter = llvm::make_unique<ModFilter>(true);
+ break;
+ case X86Local::MRMDestMem:
+ case X86Local::MRMSrcMem:
+ case X86Local::MRMSrcMem4VOp3:
+ case X86Local::MRMSrcMemOp4:
+ case X86Local::MRMXm:
+ filter = llvm::make_unique<ModFilter>(false);
+ break;
+ case X86Local::MRM0r: case X86Local::MRM1r:
+ case X86Local::MRM2r: case X86Local::MRM3r:
+ case X86Local::MRM4r: case X86Local::MRM5r:
+ case X86Local::MRM6r: case X86Local::MRM7r:
+ filter = llvm::make_unique<ExtendedFilter>(true, Form - X86Local::MRM0r);
+ break;
+ case X86Local::MRM0m: case X86Local::MRM1m:
+ case X86Local::MRM2m: case X86Local::MRM3m:
+ case X86Local::MRM4m: case X86Local::MRM5m:
+ case X86Local::MRM6m: case X86Local::MRM7m:
+ filter = llvm::make_unique<ExtendedFilter>(false, Form - X86Local::MRM0m);
+ break;
+ X86_INSTR_MRM_MAPPING
+ filter = llvm::make_unique<ExactFilter>(0xC0 + Form - X86Local::MRM_C0);
+ break;
+ } // switch (Form)
+
+ uint8_t opcodeToSet = Opcode;
unsigned AddressSize = 0;
switch (AdSize) {
@@ -808,8 +788,6 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
VEX_WPrefix == X86Local::VEX_WIG, AddressSize);
}
- delete filter;
-
#undef MAP
}
@@ -884,10 +862,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
TYPE("VR64", TYPE_MM64)
TYPE("i64imm", TYPE_IMM)
TYPE("anymem", TYPE_M)
- TYPE("opaque32mem", TYPE_M)
- TYPE("opaque48mem", TYPE_M)
- TYPE("opaque80mem", TYPE_M)
- TYPE("opaque512mem", TYPE_M)
+ TYPE("opaquemem", TYPE_M)
TYPE("SEGMENT_REG", TYPE_SEGMENTREG)
TYPE("DEBUG_REG", TYPE_DEBUGREG)
TYPE("CONTROL_REG", TYPE_CONTROLREG)
@@ -927,7 +902,6 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
TYPE("VK32WM", TYPE_VK)
TYPE("VK64", TYPE_VK)
TYPE("VK64WM", TYPE_VK)
- TYPE("GR32_NOAX", TYPE_Rv)
TYPE("vx64mem", TYPE_MVSIBX)
TYPE("vx128mem", TYPE_MVSIBX)
TYPE("vx256mem", TYPE_MVSIBX)
@@ -938,8 +912,8 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
TYPE("vx256xmem", TYPE_MVSIBX)
TYPE("vy128xmem", TYPE_MVSIBY)
TYPE("vy256xmem", TYPE_MVSIBY)
- TYPE("vy512mem", TYPE_MVSIBY)
- TYPE("vz256xmem", TYPE_MVSIBZ)
+ TYPE("vy512xmem", TYPE_MVSIBY)
+ TYPE("vz256mem", TYPE_MVSIBZ)
TYPE("vz512mem", TYPE_MVSIBZ)
TYPE("BNDR", TYPE_BNDR)
errs() << "Unhandled type string " << s << "\n";
@@ -1120,10 +1094,7 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s,
ENCODING("lea64_32mem", ENCODING_RM)
ENCODING("lea64mem", ENCODING_RM)
ENCODING("anymem", ENCODING_RM)
- ENCODING("opaque32mem", ENCODING_RM)
- ENCODING("opaque48mem", ENCODING_RM)
- ENCODING("opaque80mem", ENCODING_RM)
- ENCODING("opaque512mem", ENCODING_RM)
+ ENCODING("opaquemem", ENCODING_RM)
ENCODING("vx64mem", ENCODING_VSIB)
ENCODING("vx128mem", ENCODING_VSIB)
ENCODING("vx256mem", ENCODING_VSIB)
@@ -1134,8 +1105,8 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s,
ENCODING("vx256xmem", ENCODING_VSIB)
ENCODING("vy128xmem", ENCODING_VSIB)
ENCODING("vy256xmem", ENCODING_VSIB)
- ENCODING("vy512mem", ENCODING_VSIB)
- ENCODING("vz256xmem", ENCODING_VSIB)
+ ENCODING("vy512xmem", ENCODING_VSIB)
+ ENCODING("vz256mem", ENCODING_VSIB)
ENCODING("vz512mem", ENCODING_VSIB)
errs() << "Unhandled memory encoding " << s << "\n";
llvm_unreachable("Unhandled memory encoding");
@@ -1195,7 +1166,6 @@ RecognizableInstr::opcodeModifierEncodingFromString(const std::string &s,
ENCODING("GR64", ENCODING_RO)
ENCODING("GR16", ENCODING_Rv)
ENCODING("GR8", ENCODING_RB)
- ENCODING("GR32_NOAX", ENCODING_Rv)
errs() << "Unhandled opcode modifier encoding " << s << "\n";
llvm_unreachable("Unhandled opcode modifier encoding");
}
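
Besides switching the ModRM filters to unique_ptr, the hunk above collapses the long hand-written run of "case X86Local::MRM_C0: ..." labels by expanding the existing X86_INSTR_MRM_MAPPING X-macro with a local MAP() definition. A minimal, self-contained sketch of that idiom, using a toy mapping list in place of the real one:

#include <cstdio>

#define TOY_MRM_MAPPING \
  MAP(C0, 0xC0)         \
  MAP(C8, 0xC8)         \
  MAP(F8, 0xF8)

enum Form {
#define MAP(from, to) MRM_##from,
  TOY_MRM_MAPPING
#undef MAP
  OtherForm
};

static int exactByte(Form F) {
  switch (F) {
#define MAP(from, to) case MRM_##from: return to;
    TOY_MRM_MAPPING
#undef MAP
  default:
    return -1;
  }
}

int main() { std::printf("0x%x\n", exactByte(MRM_F8)); }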
diff --git a/contrib/llvm/utils/TableGen/X86RecognizableInstr.h b/contrib/llvm/utils/TableGen/X86RecognizableInstr.h
index 24509d16d638..c4d34ee6c80c 100644
--- a/contrib/llvm/utils/TableGen/X86RecognizableInstr.h
+++ b/contrib/llvm/utils/TableGen/X86RecognizableInstr.h
@@ -122,11 +122,11 @@ namespace X86Local {
};
enum {
- OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6
+ OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6, ThreeDNow = 7
};
enum {
- PS = 1, PD = 2, XS = 3, XD = 4
+ PD = 1, XS = 2, XD = 3, PS = 4
};
enum {
@@ -142,7 +142,7 @@ namespace X86Local {
};
enum {
- VEX_W0 = 0, VEX_W1 = 1, VEX_WIG = 2
+ VEX_W0 = 0, VEX_W1 = 1, VEX_WIG = 2, VEX_W1X = 3
};
}
@@ -210,12 +210,12 @@ private:
/// Indicates whether the instruction should be emitted into the decode
/// tables; regardless, it will be emitted into the instruction info table
bool ShouldBeEmitted;
-
+
/// The operands of the instruction, as listed in the CodeGenInstruction.
/// They are not one-to-one with operands listed in the MCInst; for example,
/// memory operands expand to 5 operands in the MCInst
const std::vector<CGIOperandList::OperandInfo>* Operands;
-
+
/// The description of the instruction that is emitted into the instruction
/// info table
InstructionSpecifier* Spec;
@@ -272,7 +272,7 @@ private:
static OperandEncoding writemaskRegisterEncodingFromString(const std::string &s,
uint8_t OpSize);
- /// \brief Adjust the encoding type for an operand based on the instruction.
+ /// Adjust the encoding type for an operand based on the instruction.
void adjustOperandEncoding(OperandEncoding &encoding);
/// handleOperand - Converts a single operand from the LLVM table format to
@@ -283,7 +283,7 @@ private:
/// operand exists.
/// @param operandIndex - The index into the generated operand table.
/// Incremented by this function one or more
- /// times to reflect possible duplicate
+ /// times to reflect possible duplicate
/// operands).
/// @param physicalOperandIndex - The index of the current operand into the
/// set of non-duplicate ('physical') operands.
@@ -314,12 +314,12 @@ private:
bool shouldBeEmitted() const {
return ShouldBeEmitted;
}
-
+
/// emitInstructionSpecifier - Loads the instruction specifier for the current
/// instruction into a DisassemblerTables.
///
void emitInstructionSpecifier();
-
+
/// emitDecodePath - Populates the proper fields in the decode tables
/// corresponding to the decode paths for this instruction.
///
@@ -349,7 +349,7 @@ public:
const CodeGenInstruction &insn,
InstrUID uid);
};
-
+
} // namespace X86Disassembler
} // namespace llvm
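
The new VEX_W1X value is treated like VEX_W1 at every use site in insnContext() above, which is why the same two-way comparison repeats throughout that function; judging from the matcher change earlier in this diff, W1X additionally allows an EVEX form to pair with a VEX_W0 VEX form. A minimal sketch of the repeated check factored into one predicate; the enum mirrors the values added to X86Local, and the helper name is made up here:

#include <cstdio>

enum VEXWBit { VEX_W0 = 0, VEX_W1 = 1, VEX_WIG = 2, VEX_W1X = 3 };

static bool setsVEXW(VEXWBit W) {
  // For decode-context selection, W1X counts as "W set", same as VEX_W1;
  // the EVEX-to-VEX matcher separately permits pairing W1X with a W0 form.
  return W == VEX_W1 || W == VEX_W1X;
}

int main() {
  std::printf("%d %d %d\n", setsVEXW(VEX_W1), setsVEXW(VEX_W1X), setsVEXW(VEX_WIG));
}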